diff --git a/src/app.py b/src/app.py index 019a0d1..6c4ceb7 100644 --- a/src/app.py +++ b/src/app.py @@ -2,12 +2,14 @@ import numpy as np import sklearn import sklearn.cluster import sklearn.linear_model +import sklearn.metrics import sklearn.model_selection import sklearn.neural_network from typing import Any +from learning.functions import print_metrics from learning.ml import MLAlgorithm -from learning.data import ConfusionMatrix, Dataset, TargetType +from learning.data import Dataset, TargetType from learning.supervised import LinearRegression, LogisticRegression, MultiLayerPerceptron from learning.unsupervised import KMeans @@ -68,15 +70,15 @@ def electrical_grid_mlp() -> tuple[Dataset, MLAlgorithm, Any]: ds.factorize(["stabf"]) ds.standardize() size = [4, 3] - return (ds, MultiLayerPerceptron(ds, size, 0.08), sklearn.neural_network.MLPClassifier(size, 'relu')) + return (ds, MultiLayerPerceptron(ds, size, 0.05), sklearn.neural_network.MLPClassifier(size, 'relu')) def frogs() -> tuple[Dataset, MLAlgorithm, Any]: - ds = Dataset(CLASSIFICATION + "frogs.csv", "Species", TargetType.MultiClassification) - ds.remove(["Family", "Genus", "RecordID"]) - ds.factorize(["Species"]) + ds = Dataset(CLASSIFICATION + "frogs.csv", "Family", TargetType.MultiClassification) + ds.remove(["Species", "Genus", "RecordID"]) + ds.factorize(["Family"]) ds.standardize() - size = [18, 15, 12] - return (ds, MultiLayerPerceptron(ds, size, 0.047), sklearn.neural_network.MLPClassifier(size, 'relu')) + size = [18, 12, 8] + return (ds, MultiLayerPerceptron(ds, size, 0.02), sklearn.neural_network.MLPClassifier(size, 'relu')) def iris() -> tuple[Dataset, MLAlgorithm, Any]: ds = Dataset(CLASSIFICATION + "iris.csv", "Class", TargetType.MultiClassification) @@ -90,15 +92,14 @@ def iris() -> tuple[Dataset, MLAlgorithm, Any]: # ******************** def frogs_no_target() -> tuple[Dataset, MLAlgorithm, Any]: - ds = Dataset(CLASSIFICATION + "frogs.csv", "Species", TargetType.NoTarget) + ds = 
Dataset(CLASSIFICATION + "frogs.csv", "Family", TargetType.NoTarget) ds.remove(["Family", "Genus", "RecordID", "Species"]) - clusters = 10 + clusters = 4 return (ds, KMeans(ds, clusters), sklearn.cluster.KMeans(clusters)) def iris_no_target() -> tuple[Dataset, MLAlgorithm, Any]: ds = Dataset(CLASSIFICATION + "iris.csv", "Class", TargetType.NoTarget) ds.remove(["Class"]) - ds.standardize() clusters = 3 return (ds, KMeans(ds, clusters), sklearn.cluster.KMeans(clusters)) @@ -113,8 +114,8 @@ if __name__ == "__main__": #rand = 347617386 # LoR for electrical_grid #rand = 834535453 # LoR for heart #rand = 1793295160 # MLP for iris - #rand = 2914000170 # MLP for frogs - #rand = 885416001 # KMe for frogs_no_target + #rand = 629702080 # MLP for frogs + #rand = 1038336550 # KMe for frogs_no_target np.random.seed(rand) print(f"Using seed: {rand}") @@ -129,9 +130,6 @@ if __name__ == "__main__": sk.set_params(max_iter=epochs) sk.fit(learn.x, learn.y) print(f"Sklearn : {abs(sk.score(test.x, test.y)):0.5f}") - if ds.target_type == TargetType.Classification or ds.target_type == TargetType.MultiClassification: - conf = ConfusionMatrix(test.y, sk.predict(test.x)) - conf.print() - print("========================") + print_metrics(ml._target_type, test, sk.predict(test.x)) ml.plot() diff --git a/src/learning/data.py b/src/learning/data.py index f51346a..f140b61 100644 --- a/src/learning/data.py +++ b/src/learning/data.py @@ -40,8 +40,9 @@ class Dataset: self.target_type = target_type # move target to the start - col_target = self.data.pop(target) - self.data.insert(0, target, col_target) + if target_type != TargetType.NoTarget: + col_target = self.data.pop(target) + self.data.insert(0, target, col_target) def remove(self, columns:list[str]) -> Self: for col in columns: @@ -85,17 +86,14 @@ class Dataset: data = [] for x in splitted: - total = total_each - x.shape[0] - data.append(x) - if total > 0: - samples = np.random.choice(x, size=total, replace=True) - data.append(samples) + 
samples = np.random.choice(len(x), size=total_each, replace=True) + data.append(x[samples]) return np.concatenate(data, axis=0) - def split_data_target(self, data:np.ndarray) -> Data: + def split_dataset_target(self, data:np.ndarray) -> Data: target = data[:, 0] if self.target_type != TargetType.NoTarget else None - data = data[:, 1:] + data = data[:, 1:] if self.target_type != TargetType.NoTarget else data if self.target_type == TargetType.MultiClassification: target = target.astype(int) uniques = np.unique(target).shape[0] @@ -122,14 +120,15 @@ class Dataset: np.random.shuffle(data) learn, valid, test = self.split_dataset(data, valid_frac, test_frac) - if self.target_type == TargetType.Regression or self.target_type == TargetType.NoTarget: + if self.target_type == TargetType.Classification\ + or self.target_type == TargetType.MultiClassification: learn = self.prepare_classification(learn) valid = self.prepare_classification(valid) test = self.prepare_classification(test) - learn = self.split_data_target(learn) - valid = self.split_data_target(valid) - test = self.split_data_target(test) + learn = self.split_dataset_target(learn) + valid = self.split_dataset_target(valid) + test = self.split_dataset_target(test) return (learn, valid, test) except: if max_iter == 0: diff --git a/src/learning/functions.py b/src/learning/functions.py index adaa46f..6408742 100644 --- a/src/learning/functions.py +++ b/src/learning/functions.py @@ -1,5 +1,8 @@ import numpy as np +from learning.data import ConfusionMatrix, Data, Dataset, TargetType +from sklearn.metrics import silhouette_score, r2_score + NOT_ZERO = 1e-15 LEAKY_RELU = 0.2 @@ -45,20 +48,34 @@ def cross_entropy_loss(h0:np.ndarray, y:np.ndarray) -> float: # Randoms # ********** -def pearson(h0:np.ndarray, y:np.ndarray) -> float: - diff1 = h0 - h0.mean() - diff2 = y - y.mean() - num = np.sum(diff1 * diff2) - den = np.sqrt(np.sum(diff1**2)) * np.sqrt(np.sum(diff2**2)) - return num / den - -def r_squared(h0:np.ndarray, 
y:np.ndarray) -> float: - y_mean = np.mean(y) - ss_resid = np.sum((y - h0) ** 2) - ss_total = np.sum((y - y_mean) ** 2) - return 1 - (ss_resid / ss_total) - def with_bias(x:np.ndarray) -> np.ndarray: shape = (x.shape[0], 1) if len(x.shape) != 1 else (1,) ones = np.ones(shape) return np.hstack([ones, x]) + +def print_metrics(target:TargetType, dataset:Data, h0:np.ndarray) -> None: + if target == TargetType.Regression: + print(f"R^2 : {r2_score(dataset.y, h0):0.5f}") + print(f"Pearson : {np.corrcoef(dataset.y, h0)[0, 1]:0.5f}") + elif target != TargetType.NoTarget: + if target == TargetType.Classification and h0.ndim == 1: h0 = np.where(h0 > 0.5, 1, 0) + ConfusionMatrix(dataset.y, h0).print() + else: + print(f"Silhouette : {silhouette_score(dataset.x, h0):0.5f}") + print("========================") + +def print_silhouette_weka(ds:Dataset, file_weka:str): + test, _, _, _ = ds.get_dataset()[2].as_tuple() + test = np.round(test, 6) + + weka = Dataset(file_weka, "", TargetType.NoTarget) + weka.factorize(["cluster"]) + + weka, _, _, _ = weka.get_dataset(test_frac=0, valid_frac=0)[0].as_tuple() + weka_x, weka_y = weka[:, :-1], weka[:, -1:] + + bau = [np.where((weka_x == x).all(axis=1))[0][0] for x in test] + weka_x, weka_y = weka_x[bau], weka_y[bau].ravel() + + score = silhouette_score(weka_x, weka_y) + print(score) diff --git a/src/learning/ml.py b/src/learning/ml.py index 16221e4..7418716 100644 --- a/src/learning/ml.py +++ b/src/learning/ml.py @@ -2,10 +2,11 @@ import sys import numpy as np from abc import ABC, abstractmethod + from plot import Plot from tqdm import tqdm -from learning.data import ConfusionMatrix, Dataset, Data, TargetType -from learning.functions import pearson, r_squared +from learning.data import Dataset, Data, TargetType +from learning.functions import print_metrics class MLAlgorithm(ABC): """ Classe generica per gli algoritmi di Machine Learning """ @@ -84,36 +85,7 @@ class MLAlgorithm(ABC): print(f"Loss valid : {self.validation_loss():0.5f}")  print(f"Loss test : 
{self.test_loss():0.5f}") print("========================") - if self._target_type == TargetType.Regression: - print(f"Pearson : {self.test_pearson():0.5f}") - print(f"R^2 : {self.test_r_squared():0.5f}") - print("========================") - elif self._target_type != TargetType.NoTarget: - conf = self.test_confusion_matrix() - conf.print() - print("========================") - - def test_confusion_matrix(self) -> ConfusionMatrix: - if self._target_type != TargetType.Classification\ - and self._target_type != TargetType.MultiClassification: - return None - - h0 = self._h0(self._testset.x) - y = self._testset.y - if h0.ndim == 1: - h0 = np.where(h0 > 0.5, 1, 0) - - return ConfusionMatrix(y, h0) - - def test_pearson(self) -> float: - if self._target_type != TargetType.Regression: - return 0 - return pearson(self._h0(self._testset.x), self._testset.y) - - def test_r_squared(self) -> float: - if self._target_type != TargetType.Regression: - return 0 - return r_squared(self._h0(self._testset.x), self._testset.y) + print_metrics(self._target_type, self._testset, self._h0(self._testset.x)) @abstractmethod def _h0(self, x:np.ndarray) -> np.ndarray: pass diff --git a/src/learning/unsupervised.py b/src/learning/unsupervised.py index 0e7aee1..8462200 100644 --- a/src/learning/unsupervised.py +++ b/src/learning/unsupervised.py @@ -16,6 +16,17 @@ class KMeans(MLAlgorithm): distances = np.linalg.norm(diff, axis=2) return np.argmin(distances, axis=1) + def _predict_loss(self, dataset:Data) -> float: + assignments = self._h0(dataset.x) + loss = 0.0 + + for k in range(self.total): + assigned_points = dataset.x[assignments == k] + if len(assigned_points) > 0: + diff = assigned_points - self.centroids[k] + loss += np.sum(np.linalg.norm(diff, axis=1) ** 2) + return loss + def _learning_step(self) -> float: assignments = self._h0(self._learnset.x) centroids = [] @@ -32,16 +43,6 @@ class KMeans(MLAlgorithm): self.centroids = np.array(centroids) return self._predict_loss(self._learnset) 
- def _predict_loss(self, dataset:Data) -> float: - assignments = self._h0(dataset.x) - loss = 0.0 - - for k in range(self.total): - assigned_points = dataset.x[assignments == k] - if len(assigned_points) > 0: - diff = assigned_points - self.centroids[k] - loss += np.sum(np.linalg.norm(diff, axis=1) ** 2) - return loss def _get_parameters(self): return self.centroids.copy()