From bba07c0b493a6246f949e02963543724011b4a3c Mon Sep 17 00:00:00 2001 From: Berack96 Date: Wed, 21 Aug 2024 14:52:25 +0200 Subject: [PATCH] Fixes - fixed bias - fixed confusion matrix - added print of confusion matrix - fixed dataset incorrectly splitting sharing classes with train and test - fixed mlp not correctly initializing layers - better print results - break if nan in learning --- src/app.py | 31 ++++++++++----- src/learning/data.py | 77 +++++++++++++++++++++++++------------- src/learning/functions.py | 10 +++-- src/learning/ml.py | 15 +++----- src/learning/supervised.py | 12 +++--- 5 files changed, 89 insertions(+), 56 deletions(-) diff --git a/src/app.py b/src/app.py index 873c10e..019a0d1 100644 --- a/src/app.py +++ b/src/app.py @@ -7,7 +7,7 @@ import sklearn.neural_network from typing import Any from learning.ml import MLAlgorithm -from learning.data import Dataset, TargetType +from learning.data import ConfusionMatrix, Dataset, TargetType from learning.supervised import LinearRegression, LogisticRegression, MultiLayerPerceptron from learning.unsupervised import KMeans @@ -24,7 +24,7 @@ def auto_mpg() -> tuple[Dataset, MLAlgorithm, Any]: ds.numbers(["HP"]) ds.handle_na() - ds.normalize(excepts=["Cylinders","Year","Origin"]) + ds.standardize(excepts=["Cylinders","Year","Origin"]) return (ds, LinearRegression(ds, learning_rate=0.0001), sklearn.linear_model.SGDRegressor()) def automobile() -> tuple[Dataset, MLAlgorithm, Any]: @@ -34,12 +34,12 @@ def automobile() -> tuple[Dataset, MLAlgorithm, Any]: ds.factorize(attributes_to_modify) ds.numbers(["normalized-losses", "bore", "stroke", "horsepower", "peak-rpm", "price"]) ds.handle_na() - ds.normalize(excepts=attributes_to_modify) + ds.standardize(excepts=attributes_to_modify) return (ds, LinearRegression(ds, learning_rate=0.003), sklearn.linear_model.SGDRegressor()) def power_plant() -> tuple[Dataset, MLAlgorithm, Any]: ds = Dataset(REGRESSION + "power-plant.csv", "energy-output", TargetType.Regression) - ds.normalize(excepts=None) + ds.standardize(excepts=None) return (ds, LinearRegression(ds, learning_rate=0.1), sklearn.linear_model.SGDRegressor()) # ******************** @@ -49,31 +49,39 @@ def power_plant() -> tuple[Dataset, MLAlgorithm, Any]: def electrical_grid() -> tuple[Dataset, MLAlgorithm, Any]: ds = Dataset(CLASSIFICATION + "electrical_grid.csv", "stabf", TargetType.Classification) ds.factorize(["stabf"]) - ds.normalize() + ds.standardize() return (ds, LogisticRegression(ds, learning_rate=100), sklearn.linear_model.LogisticRegression()) def heart() -> tuple[Dataset, MLAlgorithm, Any]: ds = Dataset(CLASSIFICATION + "heart.csv", "Disease", TargetType.Classification) attributes_to_modify = ["Disease", "Sex", "ChestPainType"] ds.factorize(attributes_to_modify) - ds.normalize(excepts=attributes_to_modify) + ds.standardize(excepts=attributes_to_modify) return (ds, LogisticRegression(ds, learning_rate=0.01), sklearn.linear_model.LogisticRegression()) # ******************** # MultiLayerPerceptron # ******************** +def electrical_grid_mlp() -> tuple[Dataset, MLAlgorithm, Any]: + ds = Dataset(CLASSIFICATION + "electrical_grid.csv", "stabf", TargetType.MultiClassification) + ds.factorize(["stabf"]) + ds.standardize() + size = [4, 3] + return (ds, MultiLayerPerceptron(ds, size, 0.08), sklearn.neural_network.MLPClassifier(size, 'relu')) + def frogs() -> tuple[Dataset, MLAlgorithm, Any]: ds = Dataset(CLASSIFICATION + "frogs.csv", "Species", TargetType.MultiClassification) ds.remove(["Family", "Genus", "RecordID"]) ds.factorize(["Species"]) - size = [18, 15, 12, 10, 8] - return (ds, MultiLayerPerceptron(ds, size), sklearn.neural_network.MLPClassifier(size, 'relu')) + ds.standardize() + size = [18, 15, 12] + return (ds, MultiLayerPerceptron(ds, size, 0.047), sklearn.neural_network.MLPClassifier(size, 'relu')) def iris() -> tuple[Dataset, MLAlgorithm, Any]: ds = Dataset(CLASSIFICATION + "iris.csv", "Class", TargetType.MultiClassification) ds.factorize(["Class"]) - ds.normalize() + ds.standardize() size = [4, 3] return (ds, MultiLayerPerceptron(ds, size), sklearn.neural_network.MLPClassifier(size, 'relu')) @@ -90,7 +98,7 @@ def frogs_no_target() -> tuple[Dataset, MLAlgorithm, Any]: def iris_no_target() -> tuple[Dataset, MLAlgorithm, Any]: ds = Dataset(CLASSIFICATION + "iris.csv", "Class", TargetType.NoTarget) ds.remove(["Class"]) - ds.normalize() + ds.standardize() clusters = 3 return (ds, KMeans(ds, clusters), sklearn.cluster.KMeans(clusters)) @@ -121,6 +129,9 @@ if __name__ == "__main__": sk.set_params(max_iter=epochs) sk.fit(learn.x, learn.y) print(f"Sklearn : {abs(sk.score(test.x, test.y)):0.5f}") + if ds.target_type == TargetType.Classification or ds.target_type == TargetType.MultiClassification: + conf = ConfusionMatrix(test.y, sk.predict(test.x)) + conf.print() print("========================") ml.plot() diff --git a/src/learning/data.py b/src/learning/data.py index 8633fba..f51346a 100644 --- a/src/learning/data.py +++ b/src/learning/data.py @@ -2,6 +2,8 @@ import numpy as np import pandas as pd from enum import Enum +import sklearn +import sklearn.metrics from typing_extensions import Self class TargetType(Enum): @@ -46,7 +48,7 @@ class Dataset: self.data.pop(col) return self - def normalize(self, excepts:list[str]=[]) -> Self: + def standardize(self, excepts:list[str]=[]) -> Self: if excepts is None: excepts = [] else: excepts.append(self.target) @@ -81,48 +83,67 @@ class Dataset: splitted = [data[ data[:,0] == k ] for k in classes ] total_each = np.average([len(x) for x in splitted]).astype(int) - seed = np.random.randint(0, 4294967295) - rng = np.random.default_rng(seed) data = [] for x in splitted: - samples = rng.choice(x, size=total_each, replace=True, shuffle=False) - data.append(samples) + total = total_each - x.shape[0] + data.append(x) + if total > 0: + samples = np.random.choice(x, size=total, replace=True) + data.append(samples) return np.concatenate(data, axis=0) - def split_data_target(self, data:np.ndarray) -> tuple[np.ndarray, np.ndarray]: + def split_data_target(self, data:np.ndarray) -> Data: target = data[:, 0] if self.target_type != TargetType.NoTarget else None data = data[:, 1:] if self.target_type == TargetType.MultiClassification: target = target.astype(int) uniques = np.unique(target).shape[0] target = np.eye(uniques)[target] - return (data, target) - - def get_dataset(self, test_frac:float=0.2, valid_frac:float=0.2) -> tuple[Data, Data, Data]: - data = self.data.to_numpy() - data = self.prepare_classification(data) - - np.random.shuffle(data) + return Data(data, target) + def split_dataset(self, data:np.ndarray, valid_frac:float, test_frac:float) -> tuple[np.ndarray, np.ndarray, np.ndarray]: total = data.shape[0] valid_cutoff = int(total * valid_frac) test_cutoff = int(total * test_frac) + valid_cutoff + learn = data[test_cutoff:] valid = data[:valid_cutoff] test = data[valid_cutoff:test_cutoff] - learn = data[test_cutoff:] + return (learn, valid, test) - l = [] - for data in [learn, test, valid]: - data, target = self.split_data_target(data) - l.append(Data(data, target)) - return l + + def get_dataset(self, test_frac:float=0.2, valid_frac:float=0.2) -> tuple[Data, Data, Data]: + data = self.data.to_numpy() + max_iter = 10 + while max_iter > 0: + max_iter -= 1 + try: + np.random.shuffle(data) + learn, valid, test = self.split_dataset(data, valid_frac, test_frac) + + if self.target_type == TargetType.Regression or self.target_type == TargetType.NoTarget: + learn = self.prepare_classification(learn) + valid = self.prepare_classification(valid) + test = self.prepare_classification(test) + + learn = self.split_data_target(learn) + valid = self.split_data_target(valid) + test = self.split_data_target(test) + return (learn, valid, test) + except: + if max_iter == 0: + raise Exception("Could not split dataset evenly for the classes, try again with another seed or add more cases in the dataset") class ConfusionMatrix: matrix:np.ndarray def __init__(self, dataset_y: np.ndarray, predictions_y:np.ndarray) -> None: + if len(dataset_y.shape) > 1: + dataset_y = np.argmax(dataset_y, axis=1) + if len(predictions_y.shape) > 1: + predictions_y = np.argmax(predictions_y, axis=1) + classes = len(np.unique(dataset_y)) conf_matrix = np.zeros((classes, classes), dtype=int) @@ -137,6 +158,7 @@ class ConfusionMatrix: self.fp = np.sum(conf_matrix, axis=0) - self.tp self.fn = np.sum(conf_matrix, axis=1) - self.tp self.tn = self.total - (self.tp + self.fp + self.fn) + self.kappa = sklearn.metrics.cohen_kappa_score(dataset_y, predictions_y) def divide_ignore_zero(self, a:np.ndarray, b:np.ndarray) -> np.ndarray: with np.errstate(divide='ignore', invalid='ignore'): @@ -156,12 +178,6 @@ class ConfusionMatrix: def specificity_per_class(self) -> np.ndarray: return self.divide_ignore_zero(self.tn, self.tn + self.fp) - def cohen_kappa_per_class(self) -> np.ndarray: - p_pl = (self.tp + self.fn) * (self.tp + self.fp) / (self.total ** 2) - p_ne = (self.tn + self.fp) * (self.tn + self.fn) / (self.total ** 2) - p = p_pl + p_ne - return (self.accuracy_per_class() - p) / (1 - p) - def f1_score_per_class(self) -> np.ndarray: prec = self.precision_per_class() rec = self.recall_per_class() @@ -187,5 +203,12 @@ class ConfusionMatrix: return np.average(f1_per_class, weights=self.weights) def cohen_kappa(self) -> float: - kappa_per_class = self.cohen_kappa_per_class() - return np.average(kappa_per_class, weights=self.weights) + return self.kappa + + def print(self)-> None: + print(f"Cohen Kappa: {self.cohen_kappa():0.5f}") + print(f"Accuracy : {self.accuracy():0.5f} - classes {self.accuracy_per_class()}") + print(f"Precision : {self.precision():0.5f} - classes {self.precision_per_class()}") + print(f"Recall : {self.recall():0.5f} - classes {self.recall_per_class()}") + print(f"Specificity: {self.specificity():0.5f} - classes {self.specificity_per_class()}") + print(f"F1 score : {self.f1_score():0.5f} - classes {self.f1_score_per_class()}") diff --git a/src/learning/functions.py b/src/learning/functions.py index cb66138..adaa46f 100644 --- a/src/learning/functions.py +++ b/src/learning/functions.py @@ -19,9 +19,10 @@ def lrelu_derivative(x:np.ndarray) -> np.ndarray: return np.where(x < 0, LEAKY_RELU, 1) def softmax(x:np.ndarray) -> np.ndarray: - x = x - np.max(x, axis=1, keepdims=True) # for overflow + axis = 1 if len(x.shape) != 1 else 0 + x = x - np.max(x, axis=axis, keepdims=True) # for overflow exp_x = np.exp(x) - sum_x = np.sum(exp_x, axis=1, keepdims=True) + sum_x = np.sum(exp_x, axis=axis, keepdims=True) return exp_x / sum_x def softmax_derivative(h0:np.ndarray, y:np.ndarray) -> np.ndarray: return h0 - y @@ -58,5 +59,6 @@ def r_squared(h0:np.ndarray, y:np.ndarray) -> float: return 1 - (ss_resid / ss_total) def with_bias(x:np.ndarray) -> np.ndarray: - ones = np.ones(shape=(x.shape[0], 1)) - return np.hstack([x, ones]) + shape = (x.shape[0], 1) if len(x.shape) != 1 else (1,) + ones = np.ones(shape) + return np.hstack([ones, x]) diff --git a/src/learning/ml.py b/src/learning/ml.py index 6d005f8..16221e4 100644 --- a/src/learning/ml.py +++ b/src/learning/ml.py @@ -50,6 +50,7 @@ class MLAlgorithm(ABC): if valid_loss < best[0]: best = (valid_loss, self._get_parameters()) + if np.isnan(learn_loss) or np.isnan(valid_loss): break learn.append(learn_loss) valid.append(valid_loss) if verbose: trange.set_postfix({"learn": f"{learn[-1]:2.5f}", "validation": f"{valid[-1]:2.5f}"}) @@ -82,17 +83,15 @@ class MLAlgorithm(ABC): print(f"Loss learn : {self.learning_loss():0.5f}") print(f"Loss valid : {self.validation_loss():0.5f}") print(f"Loss test : {self.test_loss():0.5f}") + print("========================") if self._target_type == TargetType.Regression: print(f"Pearson : {self.test_pearson():0.5f}") print(f"R^2 : {self.test_r_squared():0.5f}") + print("========================") elif self._target_type != TargetType.NoTarget: conf = self.test_confusion_matrix() - print(f"Accuracy : {conf.accuracy():0.5f} - classes {conf.accuracy_per_class()}") - print(f"Precision : {conf.precision():0.5f} - classes {conf.precision_per_class()}") - print(f"Recall : {conf.recall():0.5f} - classes {conf.recall_per_class()}") - print(f"Specificity: {conf.specificity():0.5f} - classes {conf.specificity_per_class()}") - print(f"Cohen Kappa: {conf.cohen_kappa():0.5f} - classes {conf.cohen_kappa_per_class()}") - print(f"F1 score : {conf.f1_score():0.5f} - classes {conf.f1_score_per_class()}") + conf.print() + print("========================") def test_confusion_matrix(self) -> ConfusionMatrix: if self._target_type != TargetType.Classification\ @@ -103,9 +102,7 @@ class MLAlgorithm(ABC): y = self._testset.y if h0.ndim == 1: h0 = np.where(h0 > 0.5, 1, 0) - else: - h0 = np.argmax(h0, axis=1) - y = np.argmax(y, axis=1) + return ConfusionMatrix(y, h0) def test_pearson(self) -> float: diff --git a/src/learning/supervised.py b/src/learning/supervised.py index d95334a..1e7541c 100644 --- a/src/learning/supervised.py +++ b/src/learning/supervised.py @@ -66,8 +66,8 @@ class MultiLayerPerceptron(MLAlgorithm): input = self._learnset.x.shape[1] output = self._learnset.y.shape[1] - if type(layers) is not list[int]: - layers = [4, 3, output] + if not all(isinstance(x, int) for x in layers): + raise Exception("The list of layers must oly be of integers!") else: layers.append(output) self.layers = [] @@ -90,6 +90,9 @@ class MultiLayerPerceptron(MLAlgorithm): self.activations.append(x) # saving activation result return softmax(x) + def _predict_loss(self, dataset:Data) -> float: + return cross_entropy_loss(self._h0(dataset.x), dataset.y) + def _learning_step(self) -> float: x, y, m, _ = self._learnset.as_tuple() delta = softmax_derivative(self._h0(x), y) @@ -100,7 +103,7 @@ class MultiLayerPerceptron(MLAlgorithm): deltaW *= self.learning_rate deltaW += self.momentum * self.previous_delta[l] - delta = np.dot(delta, self.layers[l][:-1].T) # ignoring bias + delta = np.dot(delta, self.layers[l][1:].T) # ignoring bias delta *= lrelu_derivative(activation) self.layers[l] -= deltaW @@ -108,9 +111,6 @@ class MultiLayerPerceptron(MLAlgorithm): return self._predict_loss(self._learnset) - def _predict_loss(self, dataset:Data) -> float: - return cross_entropy_loss(self._h0(dataset.x), dataset.y) - def _get_parameters(self): parameters = { 'layers': [], 'previous_delta': [] } for x in range(len(self.layers)):