End of ML

- fixes for clustering
- fixes in general
This commit is contained in:
2024-08-21 19:45:42 +02:00
parent bba07c0b49
commit 9ea43beace
5 changed files with 71 additions and 84 deletions

View File

@@ -2,12 +2,14 @@ import numpy as np
import sklearn
import sklearn.cluster
import sklearn.linear_model
import sklearn.metrics
import sklearn.model_selection
import sklearn.neural_network
from typing import Any
from learning.functions import print_metrics
from learning.ml import MLAlgorithm
from learning.data import ConfusionMatrix, Dataset, TargetType
from learning.data import Dataset, TargetType
from learning.supervised import LinearRegression, LogisticRegression, MultiLayerPerceptron
from learning.unsupervised import KMeans
@@ -68,15 +70,15 @@ def electrical_grid_mlp() -> tuple[Dataset, MLAlgorithm, Any]:
ds.factorize(["stabf"])
ds.standardize()
size = [4, 3]
return (ds, MultiLayerPerceptron(ds, size, 0.08), sklearn.neural_network.MLPClassifier(size, 'relu'))
return (ds, MultiLayerPerceptron(ds, size, 0.05), sklearn.neural_network.MLPClassifier(size, 'relu'))
def frogs() -> tuple[Dataset, MLAlgorithm, Any]:
ds = Dataset(CLASSIFICATION + "frogs.csv", "Species", TargetType.MultiClassification)
ds.remove(["Family", "Genus", "RecordID"])
ds.factorize(["Species"])
ds = Dataset(CLASSIFICATION + "frogs.csv", "Family", TargetType.MultiClassification)
ds.remove(["Species", "Genus", "RecordID"])
ds.factorize(["Family"])
ds.standardize()
size = [18, 15, 12]
return (ds, MultiLayerPerceptron(ds, size, 0.047), sklearn.neural_network.MLPClassifier(size, 'relu'))
size = [18, 12, 8]
return (ds, MultiLayerPerceptron(ds, size, 0.02), sklearn.neural_network.MLPClassifier(size, 'relu'))
def iris() -> tuple[Dataset, MLAlgorithm, Any]:
ds = Dataset(CLASSIFICATION + "iris.csv", "Class", TargetType.MultiClassification)
@@ -90,15 +92,14 @@ def iris() -> tuple[Dataset, MLAlgorithm, Any]:
# ********************
def frogs_no_target() -> tuple[Dataset, MLAlgorithm, Any]:
ds = Dataset(CLASSIFICATION + "frogs.csv", "Species", TargetType.NoTarget)
ds = Dataset(CLASSIFICATION + "frogs.csv", "Family", TargetType.NoTarget)
ds.remove(["Family", "Genus", "RecordID", "Species"])
clusters = 10
clusters = 4
return (ds, KMeans(ds, clusters), sklearn.cluster.KMeans(clusters))
def iris_no_target() -> tuple[Dataset, MLAlgorithm, Any]:
    """Iris dataset prepared for unsupervised clustering (class labels dropped)."""
    ds = Dataset(CLASSIFICATION + "iris.csv", "Class", TargetType.NoTarget)
    ds.remove(["Class"])
    ds.standardize()
    n_clusters = 3  # fixed cluster count; matches the number of iris classes used elsewhere
    return (ds, KMeans(ds, n_clusters), sklearn.cluster.KMeans(n_clusters))
@@ -113,8 +114,8 @@ if __name__ == "__main__":
#rand = 347617386 # LoR for electrical_grid
#rand = 834535453 # LoR for heart
#rand = 1793295160 # MLP for iris
#rand = 2914000170 # MLP for frogs
#rand = 885416001 # KMe for frogs_no_target
#rand = 629702080 # MLP for frogs
#rand = 1038336550 # KMe for frogs_no_target
np.random.seed(rand)
print(f"Using seed: {rand}")
@@ -129,9 +130,6 @@ if __name__ == "__main__":
sk.set_params(max_iter=epochs)
sk.fit(learn.x, learn.y)
print(f"Sklearn : {abs(sk.score(test.x, test.y)):0.5f}")
if ds.target_type == TargetType.Classification or ds.target_type == TargetType.MultiClassification:
conf = ConfusionMatrix(test.y, sk.predict(test.x))
conf.print()
print("========================")
print_metrics(ml._target_type, test, sk.predict(test.x))
ml.plot()

View File

@@ -40,8 +40,9 @@ class Dataset:
self.target_type = target_type
# move target to the start
col_target = self.data.pop(target)
self.data.insert(0, target, col_target)
if target_type != TargetType.NoTarget:
col_target = self.data.pop(target)
self.data.insert(0, target, col_target)
def remove(self, columns:list[str]) -> Self:
for col in columns:
@@ -85,17 +86,14 @@ class Dataset:
data = []
for x in splitted:
total = total_each - x.shape[0]
data.append(x)
if total > 0:
samples = np.random.choice(x, size=total, replace=True)
data.append(samples)
samples = np.random.choice(len(x), size=total_each, replace=True)
data.append(x[samples])
return np.concatenate(data, axis=0)
def split_data_target(self, data:np.ndarray) -> Data:
def split_dataset_target(self, data:np.ndarray) -> Data:
target = data[:, 0] if self.target_type != TargetType.NoTarget else None
data = data[:, 1:]
data = data[:, 1:] if self.target_type != TargetType.NoTarget else data
if self.target_type == TargetType.MultiClassification:
target = target.astype(int)
uniques = np.unique(target).shape[0]
@@ -122,14 +120,15 @@ class Dataset:
np.random.shuffle(data)
learn, valid, test = self.split_dataset(data, valid_frac, test_frac)
if self.target_type == TargetType.Regression or self.target_type == TargetType.NoTarget:
if self.target_type == TargetType.Classification\
or self.target_type == TargetType.MultiClassification:
learn = self.prepare_classification(learn)
valid = self.prepare_classification(valid)
test = self.prepare_classification(test)
learn = self.split_data_target(learn)
valid = self.split_data_target(valid)
test = self.split_data_target(test)
learn = self.split_dataset_target(learn)
valid = self.split_dataset_target(valid)
test = self.split_dataset_target(test)
return (learn, valid, test)
except:
if max_iter == 0:

View File

@@ -1,5 +1,8 @@
import numpy as np
from learning.data import ConfusionMatrix, Data, Dataset, TargetType
from sklearn.metrics import silhouette_score, r2_score
# Tiny epsilon to keep values strictly away from zero
# (presumably guards log()/division in the loss functions below — confirm).
NOT_ZERO = 1e-15
# Negative-slope coefficient for the leaky-ReLU activation — TODO confirm usage.
LEAKY_RELU = 0.2
@@ -45,20 +48,34 @@ def cross_entropy_loss(h0:np.ndarray, y:np.ndarray) -> float:
# Randoms
# **********
def pearson(h0:np.ndarray, y:np.ndarray) -> float:
    """Pearson correlation coefficient between predictions h0 and targets y."""
    centered_h = h0 - np.mean(h0)
    centered_y = y - np.mean(y)
    covariance = np.sum(centered_h * centered_y)
    scale = np.linalg.norm(centered_h) * np.linalg.norm(centered_y)
    return covariance / scale
def r_squared(h0:np.ndarray, y:np.ndarray) -> float:
    """Coefficient of determination (R^2) of predictions h0 against targets y."""
    residual = np.sum(np.square(y - h0))
    total = np.sum(np.square(y - np.mean(y)))
    return 1 - residual / total
def with_bias(x:np.ndarray) -> np.ndarray:
    """Prepend a bias term to x: a single 1 for a vector, a column of ones for a matrix."""
    if x.ndim == 1:
        return np.hstack([np.ones((1,)), x])
    return np.hstack([np.ones((x.shape[0], 1)), x])
def print_metrics(target:TargetType, dataset:Data, h0:np.ndarray) -> None:
    """Print evaluation metrics for predictions h0, chosen by target type:
    R^2/Pearson for regression, a confusion matrix for (multi-)classification,
    silhouette score for clustering (NoTarget)."""
    if target == TargetType.NoTarget:
        # clustering: no labels available, score cluster cohesion/separation
        print(f"Silhouette : {silhouette_score(dataset.x, h0):0.5f}")
    elif target == TargetType.Regression:
        print(f"R^2 : {r2_score(dataset.y, h0):0.5f}")
        print(f"Pearson : {np.corrcoef(dataset.y, h0)[0, 1]:0.5f}")
    else:
        # binary classifiers emit a 1-D probability vector; threshold at 0.5
        predictions = np.where(h0 > 0.5, 1, 0) if h0.ndim == 1 else h0
        ConfusionMatrix(dataset.y, predictions).print()
    print("========================")
def print_silhouette_weka(ds:Dataset, file_weka:str):
    """Print the silhouette score of a clustering exported by Weka
    (file_weka), evaluated on the test split of ds.

    NOTE(review): assumes file_weka is a CSV whose cluster-label column is
    named "cluster" — confirm against the Weka export format in use.
    """
    # Test split of the reference dataset; rounded to 6 decimals so rows can
    # be matched by exact equality against the Weka file (presumably the
    # Weka CSV stores at most 6 decimals — confirm).
    test, _, _, _ = ds.get_dataset()[2].as_tuple()
    test = np.round(test, 6)
    # Load the Weka output with no target column and encode its cluster labels.
    weka = Dataset(file_weka, "", TargetType.NoTarget)
    weka.factorize(["cluster"])
    # No validation/test split: take the whole file as one array.
    weka, _, _, _ = weka.get_dataset(test_frac=0, valid_frac=0)[0].as_tuple()
    # Last column is the cluster assignment, the rest are features.
    weka_x, weka_y = weka[:, :-1], weka[:, -1:]
    # For each test row, index of the identical feature row in the Weka data
    # (raises IndexError if a row has no exact match).
    bau = [np.where((weka_x == x).all(axis=1))[0][0] for x in test]
    weka_x, weka_y = weka_x[bau], weka_y[bau].ravel()
    score = silhouette_score(weka_x, weka_y)
    print(score)

View File

@@ -2,10 +2,11 @@ import sys
import numpy as np
from abc import ABC, abstractmethod
from plot import Plot
from tqdm import tqdm
from learning.data import ConfusionMatrix, Dataset, Data, TargetType
from learning.functions import pearson, r_squared
from learning.data import Dataset, Data, TargetType
from learning.functions import print_metrics
class MLAlgorithm(ABC):
""" Classe generica per gli algoritmi di Machine Learning """
@@ -84,36 +85,7 @@ class MLAlgorithm(ABC):
print(f"Loss valid : {self.validation_loss():0.5f}")
print(f"Loss test : {self.test_loss():0.5f}")
print("========================")
if self._target_type == TargetType.Regression:
print(f"Pearson : {self.test_pearson():0.5f}")
print(f"R^2 : {self.test_r_squared():0.5f}")
print("========================")
elif self._target_type != TargetType.NoTarget:
conf = self.test_confusion_matrix()
conf.print()
print("========================")
def test_confusion_matrix(self) -> ConfusionMatrix:
    """Confusion matrix of the model on the test set.

    Returns None when the target type is not a classification task."""
    classification_types = (TargetType.Classification, TargetType.MultiClassification)
    if self._target_type not in classification_types:
        return None
    predictions = self._h0(self._testset.x)
    if predictions.ndim == 1:
        # binary case: single probability column, threshold at 0.5
        predictions = np.where(predictions > 0.5, 1, 0)
    return ConfusionMatrix(self._testset.y, predictions)
def test_pearson(self) -> float:
    """Pearson correlation on the test set; 0 for non-regression targets."""
    if self._target_type == TargetType.Regression:
        return pearson(self._h0(self._testset.x), self._testset.y)
    return 0
def test_r_squared(self) -> float:
    """R^2 on the test set; 0 for non-regression targets."""
    if self._target_type == TargetType.Regression:
        return r_squared(self._h0(self._testset.x), self._testset.y)
    return 0
print_metrics(self._target_type, self._testset, self._h0(self._testset.x))
@abstractmethod
def _h0(self, x:np.ndarray) -> np.ndarray: pass

View File

@@ -16,6 +16,17 @@ class KMeans(MLAlgorithm):
distances = np.linalg.norm(diff, axis=2)
return np.argmin(distances, axis=1)
def _predict_loss(self, dataset:Data) -> float:
    """Return the k-means objective for dataset: the within-cluster sum of
    squared Euclidean distances to the current centroids."""
    labels = self._h0(dataset.x)
    total_loss = 0.0
    for idx in range(self.total):
        members = dataset.x[labels == idx]
        if len(members) > 0:
            # sum of squared distances equals the sum of all squared components
            total_loss += np.sum((members - self.centroids[idx]) ** 2)
    return total_loss
def _learning_step(self) -> float:
assignments = self._h0(self._learnset.x)
centroids = []
@@ -32,16 +43,6 @@ class KMeans(MLAlgorithm):
self.centroids = np.array(centroids)
return self._predict_loss(self._learnset)
def _predict_loss(self, dataset:Data) -> float:
    """Return the k-means objective for dataset: the within-cluster sum of
    squared Euclidean distances to the current centroids."""
    assignments = self._h0(dataset.x)
    loss = 0.0
    for k in range(self.total):
        # points currently assigned to centroid k
        assigned_points = dataset.x[assignments == k]
        if len(assigned_points) > 0:
            diff = assigned_points - self.centroids[k]
            # squared Euclidean distance of each member to its centroid
            loss += np.sum(np.linalg.norm(diff, axis=1) ** 2)
    return loss
def _get_parameters(self):
return self.centroids.copy()