From 8e8e0b2d516dfff82b26467ce15a7f28407bd519 Mon Sep 17 00:00:00 2001 From: Berack96 Date: Mon, 12 Aug 2024 22:09:41 +0200 Subject: [PATCH] KMeans - implemented KMeans - fixed non seeded rng - fixed display exception with NoTargets - added basic test cases to app --- src/app.py | 32 +++++++++++++++++++--- src/learning/data.py | 3 ++- src/learning/ml.py | 2 +- src/learning/unsupervised.py | 51 ++++++++++++++++++++++++++++++++++++ 4 files changed, 83 insertions(+), 5 deletions(-) create mode 100644 src/learning/unsupervised.py diff --git a/src/app.py b/src/app.py index f19ea58..3fac393 100644 --- a/src/app.py +++ b/src/app.py @@ -2,12 +2,14 @@ import random from typing import Any import numpy as np import sklearn +import sklearn.cluster import sklearn.linear_model import sklearn.model_selection import sklearn.neural_network from learning.data import Dataset, TargetType from learning.supervised import LinearRegression, LogisticRegression, MultiLayerPerceptron from learning.ml import MLAlgorithm +from learning.unsupervised import KMeans DATASET = "datasets/" REGRESSION = DATASET + "regression/" @@ -75,6 +77,23 @@ def iris() -> tuple[Dataset, MLAlgorithm, Any]: size = [4, 3] return (ds, MultiLayerPerceptron(ds, size), sklearn.neural_network.MLPClassifier(size, 'relu')) +# ******************** +# KMeans +# ******************** + +def frogs_no_target() -> tuple[Dataset, MLAlgorithm, Any]: + ds = Dataset(CLASSIFICATION + "frogs.csv", "Species", TargetType.NoTarget) + ds.remove(["Family", "Genus", "RecordID", "Species"]) + clusters = 10 + return (ds, KMeans(ds, clusters), sklearn.cluster.KMeans(clusters)) + +def iris_no_target() -> tuple[Dataset, MLAlgorithm, Any]: + ds = Dataset(CLASSIFICATION + "iris.csv", "Class", TargetType.NoTarget) + ds.remove(["Class"]) + ds.normalize() + clusters = 3 + return (ds, KMeans(ds, clusters), sklearn.cluster.KMeans(clusters)) + # ******************** # Main & random # ******************** @@ -82,17 +101,24 @@ def iris() ->
tuple[Dataset, MLAlgorithm, Any]: if __name__ == "__main__": np.set_printoptions(linewidth=np.inf, formatter={'float': '{:>10.5f}'.format}) rand = random.randint(0, 4294967295) + #rand = 1997847910 # LiR for power_plant + #rand = 347617386 # LoR for electrical_grid + #rand = 1793295160 # MLP for iris + #rand = 885416001 # KMe for frogs_no_target + np.random.seed(rand) print(f"Using seed: {rand}") - ds, ml, sk = electrical_grid() - ml.learn(10000, verbose=True) + ds, ml, sk = iris() + + epochs, _, _ = ml.learn(1000, verbose=True) ml.display_results() np.random.seed(rand) learn, test, valid = ds.get_dataset() + sk.set_params(max_iter=epochs) sk.fit(learn.x, learn.y) - print(f"Sklearn : {sk.score(test.x, test.y):0.5f}") + print(f"Sklearn : {abs(sk.score(test.x, test.y)):0.5f}") print("========================") ml.plot() diff --git a/src/learning/data.py b/src/learning/data.py index f6d8b2e..7678b21 100644 --- a/src/learning/data.py +++ b/src/learning/data.py @@ -83,7 +83,8 @@ class Dataset: splitted = [data[ data[:,0] == k ] for k in classes ] total_each = np.average([len(x) for x in splitted]).astype(int) - rng = np.random.default_rng() + seed = np.random.randint(0, 4294967295) + rng = np.random.default_rng(seed) data = [] for x in splitted: samples = rng.choice(x, size=total_each, replace=True, shuffle=False) diff --git a/src/learning/ml.py b/src/learning/ml.py index e39143e..e1a4cb0 100644 --- a/src/learning/ml.py +++ b/src/learning/ml.py @@ -79,7 +79,7 @@ class MLAlgorithm(ABC): print(f"Loss test : {self.test_loss():0.5f}") if self._target_type == TargetType.Regression: print(f"R^2 : {self.test_r_squared():0.5f}") - else: + elif self._target_type != TargetType.NoTarget: conf = self.test_confusion_matrix() print(f"Accuracy : {conf.accuracy():0.5f} - classes {conf.accuracy_per_class()}") print(f"Precision : {conf.precision():0.5f} - classes {conf.precision_per_class()}") diff --git a/src/learning/unsupervised.py b/src/learning/unsupervised.py new file mode 100644 
import numpy as np

from learning.ml import MLAlgorithm
from learning.data import Dataset, Data


class KMeans(MLAlgorithm):
    """Unsupervised k-means clustering built on the MLAlgorithm interface.

    Centroids are refined with Lloyd's algorithm: each learning step assigns
    every sample to its nearest centroid, then moves each centroid to the
    mean of its assigned samples.  Clusters that end up empty are dropped,
    so the final number of clusters may be smaller than requested.
    """

    def __init__(self, dataset: Dataset, clusters: int) -> None:
        """Create a k-means model with at most `clusters` centroids.

        Centroids are initialized by sampling distinct points from the
        learning set (Forgy initialization).  Sampling real data points
        keeps the centroids inside the data's range even when features are
        not normalized; uniform random centroids in [0, 1) could start far
        from all samples and immediately leave clusters empty.
        Uses np.random, so the caller's np.random.seed(...) makes it
        reproducible.
        """
        super().__init__(dataset)
        points = self._learnset.x
        # Can't pick more distinct starting points than there are samples.
        count = min(clusters, len(points))
        chosen = np.random.choice(len(points), size=count, replace=False)
        self.total = count                      # current number of clusters
        self.centroids = np.array(points[chosen], dtype=float)

    def _h0(self, x: np.ndarray) -> np.ndarray:
        """Return, for each row of `x`, the index of its nearest centroid."""
        diff = x[:, np.newaxis] - self.centroids    # shape (n, k, dim)
        distances = np.linalg.norm(diff, axis=2)
        return np.argmin(distances, axis=1)

    def _learning_step(self) -> float:
        """One Lloyd iteration: reassign samples, move centroids, return loss.

        Empty clusters are removed rather than re-seeded, so `total` can
        only decrease over the course of learning.
        """
        x = self._learnset.x
        assignments = self._h0(x)

        # New centroid = mean of assigned points; skip clusters with none.
        means = [np.mean(x[assignments == k], axis=0)
                 for k in range(len(self.centroids))
                 if np.any(assignments == k)]

        self.centroids = np.array(means)
        self.total = len(self.centroids)    # keep count in sync with array
        return self._predict_loss(self._learnset)

    def _predict_loss(self, dataset: Data) -> float:
        """Within-cluster sum of squared distances (k-means inertia)."""
        assignments = self._h0(dataset.x)
        # Vectorized: squared distance of every sample to its own centroid.
        diff = dataset.x - self.centroids[assignments]
        return float(np.sum(diff * diff))

    def _get_parameters(self) -> np.ndarray:
        """Snapshot of the centroids (copied so later steps can't mutate it)."""
        return self.centroids.copy()

    def _set_parameters(self, parameters: np.ndarray) -> None:
        """Restore centroids from a snapshot, keeping `total` consistent.

        `total` is re-derived from the array so that restoring a snapshot
        taken before empty clusters were dropped cannot leave the counter
        out of sync with the centroid array (which would break the loops
        that index `self.centroids`).
        """
        self.centroids = np.array(parameters)
        self.total = len(self.centroids)