From 969338196bb7b6060f94d65fc7489c39a19b0cf0 Mon Sep 17 00:00:00 2001 From: Berack96 Date: Sun, 28 Apr 2024 19:32:43 +0200 Subject: [PATCH] LogisticRegression - implemented LR - changed classes tree --- .vscode/launch.json | 15 ++++++++++ requirements | 3 ++ src/app.py | 60 +++++++++++++++++++++++--------------- src/learning/data.py | 12 ++++++-- src/learning/ml.py | 59 ++++++++++++++++++++----------------- src/learning/supervised.py | 43 +++++++++++++++++---------- 6 files changed, 124 insertions(+), 68 deletions(-) create mode 100644 .vscode/launch.json create mode 100644 requirements diff --git a/.vscode/launch.json b/.vscode/launch.json new file mode 100644 index 0000000..e05779f --- /dev/null +++ b/.vscode/launch.json @@ -0,0 +1,15 @@ +{ + // Use IntelliSense to learn about possible attributes. + // Hover to view descriptions of existing attributes. + // For more information, visit: https://go.microsoft.com/fwlink/?linkid=830387 + "version": "0.2.0", + "configurations": [ + { + "name": "Python Debugger: App", + "type": "debugpy", + "request": "launch", + "program": "src\\app.py", + "console": "integratedTerminal" + } + ] +} \ No newline at end of file diff --git a/requirements b/requirements new file mode 100644 index 0000000..bf93cc4 --- /dev/null +++ b/requirements @@ -0,0 +1,3 @@ +matplotlib +pandas +tqdm diff --git a/src/app.py b/src/app.py index 3bca1f9..9ed22c3 100644 --- a/src/app.py +++ b/src/app.py @@ -1,34 +1,48 @@ from learning.data import Dataset -from learning.supervised import LinearRegression -from learning.ml import MLRegression +from learning.supervised import LinearRegression, LogisticRegression, MultiLogisticRegression +from learning.ml import MLAlgorithm from typing import Callable -def auto_mpg() -> tuple[int, MLRegression]: - df = Dataset("datasets\\auto-mpg.csv", "MPG") +def auto_mpg() -> tuple[int, MLAlgorithm]: + ds = Dataset("datasets\\auto-mpg.csv", "MPG") - df.to_numbers(["HP"]) - df.handle_na() - df.regularize(excepts=["Cylinders","Year","Origin"]) - return (1000, LinearRegression(df, learning_rate=0.0001)) + ds.to_numbers(["HP"]) + ds.handle_na() + ds.regularize(excepts=["Cylinders","Year","Origin"]) + return (1000, LinearRegression(ds.as_ndarray(), learning_rate=0.0001)) -def automobile() -> tuple[int, MLRegression]: - df = Dataset("datasets\\regression\\automobile.csv", "symboling") +def automobile() -> tuple[int, MLAlgorithm]: + ds = Dataset("datasets\\regression\\automobile.csv", "symboling") attributes_to_modify = ["fuel-system", "engine-type", "drive-wheels", "body-style", "make", "engine-location", "aspiration", "fuel-type", "num-of-cylinders", "num-of-doors"] - df.factorize(attributes_to_modify) - df.to_numbers(["normalized-losses", "bore", "stroke", "horsepower", "peak-rpm", "price"]) - df.handle_na() - df.regularize(excepts=attributes_to_modify) - return (1000, LinearRegression(df, learning_rate=0.004)) + ds.factorize(attributes_to_modify) + ds.to_numbers(["normalized-losses", "bore", "stroke", "horsepower", "peak-rpm", "price"]) + ds.handle_na() + ds.regularize(excepts=attributes_to_modify) + return (1000, LinearRegression(ds.as_ndarray(), learning_rate=0.004)) -def power_plant() -> tuple[int, MLRegression]: - df = Dataset("datasets\\regression\\power-plant.csv", "energy-output") - df.regularize() - return (80, LinearRegression(df, learning_rate=0.1)) +def power_plant() -> tuple[int, MLAlgorithm]: + ds = Dataset("datasets\\regression\\power-plant.csv", "energy-output") + ds.regularize() + return (80, LinearRegression(ds.as_ndarray(), 
learning_rate=0.1)) + + +def electrical_grid() -> tuple[int, MLAlgorithm]: + ds = Dataset("datasets\\classification\\electrical_grid.csv", "stabf") + ds.factorize(["stabf"]) + ds.regularize() + return (1000, LogisticRegression(ds.as_ndarray(), learning_rate=0.08)) + +def frogs() -> tuple[int, MLAlgorithm]: + ds = Dataset("datasets\\classification\\frogs.csv", "Species") + ds.remove(["Family", "Genus", "RecordID"]) + ds.factorize(["Species"]) + return (1000, MultiLogisticRegression(ds.as_ndarray(), learning_rate=0.08)) -def learn_dataset(function:Callable[..., tuple[int, MLRegression]], epochs:int=100000, verbose=True)-> None: + +def learn_dataset(function:Callable[..., tuple[int, MLAlgorithm]], epochs:int=10000, verbose=True)-> MLAlgorithm: skip, ml = function() ml.learn(epochs, verbose=verbose) @@ -38,8 +52,8 @@ def learn_dataset(function:Callable[..., tuple[int, MLRegression]], epochs:int=1 print(f"Loss value: tests={err_tests:1.5f}, valid={err_valid:1.5f}, learn={err_learn:1.5f}") ml.plot(skip=skip) - - + return ml if __name__ == "__main__": - learn_dataset(automobile) + ml = learn_dataset(electrical_grid) + print(ml.accuracy(ml.testset)) diff --git a/src/learning/data.py b/src/learning/data.py index e151186..4a4ed08 100644 --- a/src/learning/data.py +++ b/src/learning/data.py @@ -4,7 +4,7 @@ import numpy as np from typing_extensions import Self class Dataset: - def __init__(self, csv:str, target:str) -> None: + def __init__(self, csv:str, target:str, classification:bool=None) -> None: data = pd.read_csv(csv) # move target to the start @@ -12,10 +12,18 @@ class Dataset: data.insert(0, target, col_target) data.insert(1, "Bias", 1.0) + if classification == None: + classification = (data[target].dtype == object) + self.original = data self.data = data self.target = target - self.classification = (data[target].dtype == object) + self.classification = classification + + def remove(self, columns:list[str]) -> Self: + for col in columns: + self.data.pop(col) + return self def regularize(self, excepts:list[str]=[]) -> Self: excepts.append(self.target) diff --git a/src/learning/ml.py b/src/learning/ml.py index 7617645..d1cf836 100644 --- a/src/learning/ml.py +++ b/src/learning/ml.py @@ -1,29 +1,27 @@ from abc import ABC, abstractmethod -from learning.data import Dataset from plot import Plot from tqdm import tqdm +import pandas as pd import numpy as np class MLAlgorithm(ABC): """ Classe generica per gli algoritmi di Machine Learning """ - dataset: Dataset testset: np.ndarray learnset: np.ndarray _valid_loss: list[float] _train_loss: list[float] - def _set_dataset(self, dataset:Dataset, split:float=0.2): - ndarray = dataset.shuffle().as_ndarray() - splitT = int(ndarray.shape[0] * split) + def _set_dataset(self, dataset:np.ndarray, split:float=0.2): + splitT = int(dataset.shape[0] * split) splitV = int(splitT / 2) - self.dataset = dataset - self.validset = ndarray[:splitV] - self.testset = ndarray[splitV:splitT] - self.learnset = ndarray[splitT:] + np.random.shuffle(dataset) + self.validset = dataset[:splitV] + self.testset = dataset[splitV:splitT] + self.learnset = dataset[splitT:] def _split_data_target(self, dset:np.ndarray) -> tuple[np.ndarray, np.ndarray, int]: x = np.delete(dset, 0, 1) @@ -71,7 +69,32 @@ class MLAlgorithm(ABC): def test_loss(self) -> float: return self.predict_loss(self.testset) + def plot(self, skip:int=1000) -> None: + skip = skip if len(self._train_loss) > skip else 0 + plot = Plot("Loss", "Time", "Mean Loss") + plot.line("training", "blue", data=self._train_loss[skip:]) 
+ plot.line("validation", "red", data=self._valid_loss[skip:]) + plot.wait() + def confusion_matrix(self, dataset:np.ndarray) -> np.ndarray: + x, y, _ = self._split_data_target(dataset) + h0 = np.where(self._h0(x) > 0.5, 1, 0) + + classes = len(np.unique(y)) + conf_matrix = np.zeros((classes, classes), dtype=int) + + for actual, prediction in zip(y, h0): + conf_matrix[int(actual), int(prediction)] += 1 + return conf_matrix + + def accuracy(self, dataset:np.ndarray) -> np.ndarray: + conf = self.confusion_matrix(dataset) + correct = np.sum(np.diagonal(conf)) + total = np.sum(conf) + return correct / total + + @abstractmethod + def _h0(self, x:np.ndarray) -> np.ndarray: pass @abstractmethod def learning_step(self) -> float: pass @abstractmethod @@ -80,21 +103,3 @@ class MLAlgorithm(ABC): def get_parameters(self): pass @abstractmethod def set_parameters(self, parameters): pass - - @abstractmethod - def plot(self, skip:int=1000) -> None: - skip = skip if len(self._train_loss) > skip else 0 - plot = Plot("Loss", "Time", "Mean Loss") - plot.line("training", "blue", data=self._train_loss[skip:]) - plot.line("validation", "red", data=self._valid_loss[skip:]) - plot.wait() - - - -class MLRegression(MLAlgorithm): - def plot(self, skip: int = 1000) -> None: - return super().plot(skip) - -class MLClassification(MLAlgorithm): - def plot(self, skip: int = 1000) -> None: - return super().plot(skip) \ No newline at end of file diff --git a/src/learning/supervised.py b/src/learning/supervised.py index 947b154..d8cd4df 100644 --- a/src/learning/supervised.py +++ b/src/learning/supervised.py @@ -1,35 +1,29 @@ import math as math import numpy as np -from learning.ml import MLRegression, MLClassification -from learning.data import Dataset +from abc import abstractmethod +from learning.ml import MLAlgorithm -class LinearRegression(MLRegression): +class GradientDescent(MLAlgorithm): theta:np.ndarray alpha:float - def __init__(self, dataset:Dataset, learning_rate:float=0.1) -> None: + def __init__(self, dataset:np.ndarray, learning_rate:float=0.1) -> None: self._set_dataset(dataset) - parameters = dataset.data.shape[1] - 1 #removing the result + parameters = dataset.shape[1] - 1 #removing the result self.theta = np.random.rand(parameters) self.alpha = max(0, learning_rate) def learning_step(self) -> float: - theta = self.theta - alpha = self.alpha x, y, m = self._split_data_target(self.learnset) - self.theta -= alpha * (1/m) * np.sum((x.dot(theta) - y) * x.T, axis=1) - return self._error(x, y, m) + self.theta -= self.alpha * (1/m) * np.sum((self._h0(x) - y) * x.T, axis=1) + return self._loss(x, y, m) def predict_loss(self, dataset:np.ndarray) -> float: x, y, m = self._split_data_target(dataset) - return self._error(x, y, m) - - def _error(self, x:np.ndarray, y:np.ndarray, m:int) -> float: - diff = (x.dot(self.theta) - y) - return 1/(2*m) * np.sum(diff ** 2) + return self._loss(x, y, m) def get_parameters(self): return self.theta.copy() @@ -37,6 +31,23 @@ class LinearRegression(MLRegression): def set_parameters(self, parameters): self.theta = parameters + @abstractmethod + def _loss(self, x:np.ndarray, y:np.ndarray, m:int) -> float: pass -class LogisticRegression(MLClassification): - pass + +class LinearRegression(GradientDescent): + def _h0(self, x: np.ndarray) -> np.ndarray: + return self.theta.dot(x.T) + + def _loss(self, x:np.ndarray, y:np.ndarray, m:int) -> float: + diff = (x.dot(self.theta) - y) + return 1/(2*m) * np.sum(diff ** 2) + +class LogisticRegression(GradientDescent): + def _h0(self, x: np.ndarray) 
-> np.ndarray: + return 1 / (1 + np.exp(-self.theta.dot(x.T))) + + def _loss(self, x:np.ndarray, y:np.ndarray, m:int) -> float: + h0 = self._h0(x) + diff = -y*np.log(h0) -(1-y)*np.log(1-h0) + return 1/m * np.sum(diff)
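
Note: src/app.py now imports MultiLogisticRegression from learning.supervised (it is used by frogs()), but the definition of that class is not visible anywhere in this diff. Below is a minimal sketch of what a softmax-based implementation on top of the new GradientDescent base class could look like; the class body, the _one_hot helper, and the exact gradient/loss formulation are illustrative assumptions, not code from this commit.

import numpy as np
from learning.supervised import GradientDescent

class MultiLogisticRegression(GradientDescent):
    """Hypothetical multi-class (softmax) logistic regression sketch."""

    def __init__(self, dataset: np.ndarray, learning_rate: float = 0.1) -> None:
        self._set_dataset(dataset)
        features = dataset.shape[1] - 1                 # drop the target column
        classes = len(np.unique(dataset[:, 0]))         # target is stored in column 0
        self.theta = np.random.rand(features, classes)  # one weight vector per class
        self.alpha = max(0, learning_rate)

    def _h0(self, x: np.ndarray) -> np.ndarray:
        # softmax over the class scores, one row of probabilities per sample
        z = x.dot(self.theta)
        z -= z.max(axis=1, keepdims=True)               # numerical stability
        e = np.exp(z)
        return e / e.sum(axis=1, keepdims=True)

    def _one_hot(self, y: np.ndarray) -> np.ndarray:
        onehot = np.zeros((y.shape[0], self.theta.shape[1]))
        onehot[np.arange(y.shape[0]), y.astype(int)] = 1
        return onehot

    def learning_step(self) -> float:
        # the base-class step assumes a 1-D theta, so the multi-class
        # gradient is recomputed here against the one-hot targets
        x, y, m = self._split_data_target(self.learnset)
        self.theta -= self.alpha * (1 / m) * x.T.dot(self._h0(x) - self._one_hot(y))
        return self._loss(x, y, m)

    def _loss(self, x: np.ndarray, y: np.ndarray, m: int) -> float:
        # cross-entropy of the probability assigned to the true class
        h0 = self._h0(x)
        return -1 / m * np.sum(np.log(h0[np.arange(m), y.astype(int)] + 1e-12))

Two related caveats, stated as observations rather than fixes to this patch: MLAlgorithm.confusion_matrix thresholds _h0 at 0.5, which only fits the binary case, so a multi-class model along these lines would presumably predict with np.argmax over the softmax probabilities instead; and LogisticRegression._loss calls np.log(h0) directly, so clipping h0 away from exactly 0 and 1 (for example np.clip(h0, 1e-12, 1 - 1e-12)) is a common safeguard against inf/nan when the sigmoid saturates.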