From 8e8e0b2d516dfff82b26467ce15a7f28407bd519 Mon Sep 17 00:00:00 2001 From: Berack96 Date: Mon, 12 Aug 2024 22:09:41 +0200 Subject: [PATCH] KMeans - implemented KMeans - fixed non seeded rng - fixed display exception with NoTargets - added basic test cases to app --- src/app.py | 32 +++++++++++++++++++--- src/learning/data.py | 3 ++- src/learning/ml.py | 2 +- src/learning/unsupervised.py | 51 ++++++++++++++++++++++++++++++++++++ 4 files changed, 83 insertions(+), 5 deletions(-) create mode 100644 src/learning/unsupervised.py diff --git a/src/app.py b/src/app.py index f19ea58..3fac393 100644 --- a/src/app.py +++ b/src/app.py @@ -2,12 +2,14 @@ import random from typing import Any import numpy as np import sklearn +import sklearn.cluster import sklearn.linear_model import sklearn.model_selection import sklearn.neural_network from learning.data import Dataset, TargetType from learning.supervised import LinearRegression, LogisticRegression, MultiLayerPerceptron from learning.ml import MLAlgorithm +from learning.unsupervised import KMeans DATASET = "datasets/" REGRESSION = DATASET + "regression/" @@ -75,6 +77,23 @@ def iris() -> tuple[Dataset, MLAlgorithm, Any]: size = [4, 3] return (ds, MultiLayerPerceptron(ds, size), sklearn.neural_network.MLPClassifier(size, 'relu')) +# ******************** +# KMeans +# ******************** + +def frogs_no_target() -> tuple[Dataset, MLAlgorithm, Any]: + ds = Dataset(CLASSIFICATION + "frogs.csv", "Species", TargetType.NoTarget) + ds.remove(["Family", "Genus", "RecordID", "Species"]) + clusters = 10 + return (ds, KMeans(ds, clusters), sklearn.cluster.KMeans(clusters)) + +def iris_no_target() -> tuple[Dataset, MLAlgorithm, Any]: + ds = Dataset(CLASSIFICATION + "iris.csv", "Class", TargetType.NoTarget) + ds.remove(["Class"]) + ds.normalize() + clusters = 3 + return (ds, KMeans(ds, clusters), sklearn.cluster.KMeans(clusters)) + # ******************** # Main & random # ******************** @@ -82,17 +101,24 @@ def iris() ->
tuple[Dataset, MLAlgorithm, Any]: if __name__ == "__main__": np.set_printoptions(linewidth=np.inf, formatter={'float': '{:>10.5f}'.format}) rand = random.randint(0, 4294967295) + #rand = 1997847910 # LiR for power_plant + #rand = 347617386 # LoR for electrical_grid + #rand = 1793295160 # MLP for iris + #rand = 885416001 # KMe for frogs_no_target + np.random.seed(rand) print(f"Using seed: {rand}") - ds, ml, sk = electrical_grid() - ml.learn(10000, verbose=True) + ds, ml, sk = iris() + + epochs, _, _ = ml.learn(1000, verbose=True) ml.display_results() np.random.seed(rand) learn, test, valid = ds.get_dataset() + sk.set_params(max_iter=epochs) sk.fit(learn.x, learn.y) - print(f"Sklearn : {sk.score(test.x, test.y):0.5f}") + print(f"Sklearn : {abs(sk.score(test.x, test.y)):0.5f}") print("========================") ml.plot() diff --git a/src/learning/data.py b/src/learning/data.py index f6d8b2e..7678b21 100644 --- a/src/learning/data.py +++ b/src/learning/data.py @@ -83,7 +83,8 @@ class Dataset: splitted = [data[ data[:,0] == k ] for k in classes ] total_each = np.average([len(x) for x in splitted]).astype(int) - rng = np.random.default_rng() + seed = np.random.randint(0, 4294967295) + rng = np.random.default_rng(seed) data = [] for x in splitted: samples = rng.choice(x, size=total_each, replace=True, shuffle=False) diff --git a/src/learning/ml.py b/src/learning/ml.py index e39143e..e1a4cb0 100644 --- a/src/learning/ml.py +++ b/src/learning/ml.py @@ -79,7 +79,7 @@ class MLAlgorithm(ABC): print(f"Loss test : {self.test_loss():0.5f}") if self._target_type == TargetType.Regression: print(f"R^2 : {self.test_r_squared():0.5f}") - else: + elif self._target_type != TargetType.NoTarget: conf = self.test_confusion_matrix() print(f"Accuracy : {conf.accuracy():0.5f} - classes {conf.accuracy_per_class()}") print(f"Precision : {conf.precision():0.5f} - classes {conf.precision_per_class()}") diff --git a/src/learning/unsupervised.py b/src/learning/unsupervised.py new file mode 100644 
import numpy as np

from learning.ml import MLAlgorithm
from learning.data import Dataset, Data


class KMeans(MLAlgorithm):
    """Unsupervised k-means clustering built on the MLAlgorithm interface.

    Centroids are refined with Lloyd's algorithm: each learning step assigns
    every sample to its nearest centroid, then moves each centroid to the
    mean of its assigned samples.  Clusters that end up empty are dropped,
    so the final number of clusters may be smaller than requested.
    """

    def __init__(self, dataset: Dataset, clusters: int) -> None:
        """Create a k-means model with at most `clusters` centroids.

        Centroids are initialized by sampling distinct points from the
        learning set (Forgy initialization).  Sampling real data points
        keeps the centroids inside the data's range even when features are
        not normalized; uniform random centroids in [0, 1) could start far
        from all samples and immediately leave clusters empty.
        Uses np.random, so the caller's np.random.seed(...) makes it
        reproducible.
        """
        super().__init__(dataset)
        points = self._learnset.x
        # Can't pick more distinct starting points than there are samples.
        count = min(clusters, len(points))
        chosen = np.random.choice(len(points), size=count, replace=False)
        self.total = count                      # current number of clusters
        self.centroids = np.array(points[chosen], dtype=float)

    def _h0(self, x: np.ndarray) -> np.ndarray:
        """Return, for each row of `x`, the index of its nearest centroid."""
        diff = x[:, np.newaxis] - self.centroids    # shape (n, k, dim)
        distances = np.linalg.norm(diff, axis=2)
        return np.argmin(distances, axis=1)

    def _learning_step(self) -> float:
        """One Lloyd iteration: reassign samples, move centroids, return loss.

        Empty clusters are removed rather than re-seeded, so `total` can
        only decrease over the course of learning.
        """
        x = self._learnset.x
        assignments = self._h0(x)

        # New centroid = mean of assigned points; skip clusters with none.
        means = [np.mean(x[assignments == k], axis=0)
                 for k in range(len(self.centroids))
                 if np.any(assignments == k)]

        self.centroids = np.array(means)
        self.total = len(self.centroids)    # keep count in sync with array
        return self._predict_loss(self._learnset)

    def _predict_loss(self, dataset: Data) -> float:
        """Within-cluster sum of squared distances (k-means inertia)."""
        assignments = self._h0(dataset.x)
        # Vectorized: squared distance of every sample to its own centroid.
        diff = dataset.x - self.centroids[assignments]
        return float(np.sum(diff * diff))

    def _get_parameters(self) -> np.ndarray:
        """Snapshot of the centroids (copied so later steps can't mutate it)."""
        return self.centroids.copy()

    def _set_parameters(self, parameters: np.ndarray) -> None:
        """Restore centroids from a snapshot, keeping `total` consistent.

        `total` is re-derived from the array so that restoring a snapshot
        taken before empty clusters were dropped cannot leave the counter
        out of sync with the centroid array (which would break the loops
        that index `self.centroids`).
        """
        self.centroids = np.array(parameters)
        self.total = len(self.centroids)