From 969338196bb7b6060f94d65fc7489c39a19b0cf0 Mon Sep 17 00:00:00 2001 From: Berack96 Date: Sun, 28 Apr 2024 19:32:43 +0200 Subject: [PATCH] LogisticRegression - implemented LR - changed classes tree --- .vscode/launch.json | 15 ++++++++++ requirements | 3 ++ src/app.py | 60 +++++++++++++++++++++++--------------- src/learning/data.py | 12 ++++++-- src/learning/ml.py | 59 ++++++++++++++++++++----------------- src/learning/supervised.py | 43 +++++++++++++++++---------- 6 files changed, 124 insertions(+), 68 deletions(-) create mode 100644 .vscode/launch.json create mode 100644 requirements diff --git a/.vscode/launch.json b/.vscode/launch.json new file mode 100644 index 0000000..e05779f --- /dev/null +++ b/.vscode/launch.json @@ -0,0 +1,15 @@ +{ + // Use IntelliSense to learn about possible attributes. + // Hover to view descriptions of existing attributes. + // For more information, visit: https://go.microsoft.com/fwlink/?linkid=830387 + "version": "0.2.0", + "configurations": [ + { + "name": "Python Debugger: App", + "type": "debugpy", + "request": "launch", + "program": "src\\app.py", + "console": "integratedTerminal" + } + ] +} \ No newline at end of file diff --git a/requirements b/requirements new file mode 100644 index 0000000..bf93cc4 --- /dev/null +++ b/requirements @@ -0,0 +1,3 @@ +matplotlib +pandas +tqdm diff --git a/src/app.py b/src/app.py index 3bca1f9..9ed22c3 100644 --- a/src/app.py +++ b/src/app.py @@ -1,34 +1,48 @@ from learning.data import Dataset -from learning.supervised import LinearRegression -from learning.ml import MLRegression +from learning.supervised import LinearRegression, LogisticRegression, MultiLogisticRegression +from learning.ml import MLAlgorithm from typing import Callable -def auto_mpg() -> tuple[int, MLRegression]: - df = Dataset("datasets\\auto-mpg.csv", "MPG") +def auto_mpg() -> tuple[int, MLAlgorithm]: + ds = Dataset("datasets\\auto-mpg.csv", "MPG") - df.to_numbers(["HP"]) - df.handle_na() - df.regularize(excepts=["Cylinders","Year","Origin"]) - return (1000, LinearRegression(df, learning_rate=0.0001)) + ds.to_numbers(["HP"]) + ds.handle_na() + ds.regularize(excepts=["Cylinders","Year","Origin"]) + return (1000, LinearRegression(ds.as_ndarray(), learning_rate=0.0001)) -def automobile() -> tuple[int, MLRegression]: - df = Dataset("datasets\\regression\\automobile.csv", "symboling") +def automobile() -> tuple[int, MLAlgorithm]: + ds = Dataset("datasets\\regression\\automobile.csv", "symboling") attributes_to_modify = ["fuel-system", "engine-type", "drive-wheels", "body-style", "make", "engine-location", "aspiration", "fuel-type", "num-of-cylinders", "num-of-doors"] - df.factorize(attributes_to_modify) - df.to_numbers(["normalized-losses", "bore", "stroke", "horsepower", "peak-rpm", "price"]) - df.handle_na() - df.regularize(excepts=attributes_to_modify) - return (1000, LinearRegression(df, learning_rate=0.004)) + ds.factorize(attributes_to_modify) + ds.to_numbers(["normalized-losses", "bore", "stroke", "horsepower", "peak-rpm", "price"]) + ds.handle_na() + ds.regularize(excepts=attributes_to_modify) + return (1000, LinearRegression(ds.as_ndarray(), learning_rate=0.004)) -def power_plant() -> tuple[int, MLRegression]: - df = Dataset("datasets\\regression\\power-plant.csv", "energy-output") - df.regularize() - return (80, LinearRegression(df, learning_rate=0.1)) +def power_plant() -> tuple[int, MLAlgorithm]: + ds = Dataset("datasets\\regression\\power-plant.csv", "energy-output") + ds.regularize() + return (80, LinearRegression(ds.as_ndarray(), 
learning_rate=0.1)) + + +def electrical_grid() -> tuple[int, MLAlgorithm]: + ds = Dataset("datasets\\classification\\electrical_grid.csv", "stabf") + ds.factorize(["stabf"]) + ds.regularize() + return (1000, LogisticRegression(ds.as_ndarray(), learning_rate=0.08)) + +def frogs() -> tuple[int, MLAlgorithm]: + ds = Dataset("datasets\\classification\\frogs.csv", "Species") + ds.remove(["Family", "Genus", "RecordID"]) + ds.factorize(["Species"]) + return (1000, MultiLogisticRegression(ds.as_ndarray(), learning_rate=0.08)) -def learn_dataset(function:Callable[..., tuple[int, MLRegression]], epochs:int=100000, verbose=True)-> None: + +def learn_dataset(function:Callable[..., tuple[int, MLAlgorithm]], epochs:int=10000, verbose=True)-> MLAlgorithm: skip, ml = function() ml.learn(epochs, verbose=verbose) @@ -38,8 +52,8 @@ def learn_dataset(function:Callable[..., tuple[int, MLRegression]], epochs:int=1 print(f"Loss value: tests={err_tests:1.5f}, valid={err_valid:1.5f}, learn={err_learn:1.5f}") ml.plot(skip=skip) - - + return ml if __name__ == "__main__": - learn_dataset(automobile) + ml = learn_dataset(electrical_grid) + print(ml.accuracy(ml.testset)) diff --git a/src/learning/data.py b/src/learning/data.py index e151186..4a4ed08 100644 --- a/src/learning/data.py +++ b/src/learning/data.py @@ -4,7 +4,7 @@ import numpy as np from typing_extensions import Self class Dataset: - def __init__(self, csv:str, target:str) -> None: + def __init__(self, csv:str, target:str, classification:bool=None) -> None: data = pd.read_csv(csv) # move target to the start @@ -12,10 +12,18 @@ class Dataset: data.insert(0, target, col_target) data.insert(1, "Bias", 1.0) + if classification == None: + classification = (data[target].dtype == object) + self.original = data self.data = data self.target = target - self.classification = (data[target].dtype == object) + self.classification = classification + + def remove(self, columns:list[str]) -> Self: + for col in columns: + self.data.pop(col) + return self def regularize(self, excepts:list[str]=[]) -> Self: excepts.append(self.target) diff --git a/src/learning/ml.py b/src/learning/ml.py index 7617645..d1cf836 100644 --- a/src/learning/ml.py +++ b/src/learning/ml.py @@ -1,29 +1,27 @@ from abc import ABC, abstractmethod -from learning.data import Dataset from plot import Plot from tqdm import tqdm +import pandas as pd import numpy as np class MLAlgorithm(ABC): """ Classe generica per gli algoritmi di Machine Learning """ - dataset: Dataset testset: np.ndarray learnset: np.ndarray _valid_loss: list[float] _train_loss: list[float] - def _set_dataset(self, dataset:Dataset, split:float=0.2): - ndarray = dataset.shuffle().as_ndarray() - splitT = int(ndarray.shape[0] * split) + def _set_dataset(self, dataset:np.ndarray, split:float=0.2): + splitT = int(dataset.shape[0] * split) splitV = int(splitT / 2) - self.dataset = dataset - self.validset = ndarray[:splitV] - self.testset = ndarray[splitV:splitT] - self.learnset = ndarray[splitT:] + np.random.shuffle(dataset) + self.validset = dataset[:splitV] + self.testset = dataset[splitV:splitT] + self.learnset = dataset[splitT:] def _split_data_target(self, dset:np.ndarray) -> tuple[np.ndarray, np.ndarray, int]: x = np.delete(dset, 0, 1) @@ -71,7 +69,32 @@ class MLAlgorithm(ABC): def test_loss(self) -> float: return self.predict_loss(self.testset) + def plot(self, skip:int=1000) -> None: + skip = skip if len(self._train_loss) > skip else 0 + plot = Plot("Loss", "Time", "Mean Loss") + plot.line("training", "blue", data=self._train_loss[skip:]) 
+ plot.line("validation", "red", data=self._valid_loss[skip:]) + plot.wait() + def confusion_matrix(self, dataset:np.ndarray) -> np.ndarray: + x, y, _ = self._split_data_target(dataset) + h0 = np.where(self._h0(x) > 0.5, 1, 0) + + classes = len(np.unique(y)) + conf_matrix = np.zeros((classes, classes), dtype=int) + + for actual, prediction in zip(y, h0): + conf_matrix[int(actual), int(prediction)] += 1 + return conf_matrix + + def accuracy(self, dataset:np.ndarray) -> np.ndarray: + conf = self.confusion_matrix(dataset) + correct = np.sum(np.diagonal(conf)) + total = np.sum(conf) + return correct / total + + @abstractmethod + def _h0(self, x:np.ndarray) -> np.ndarray: pass @abstractmethod def learning_step(self) -> float: pass @abstractmethod @@ -80,21 +103,3 @@ class MLAlgorithm(ABC): def get_parameters(self): pass @abstractmethod def set_parameters(self, parameters): pass - - @abstractmethod - def plot(self, skip:int=1000) -> None: - skip = skip if len(self._train_loss) > skip else 0 - plot = Plot("Loss", "Time", "Mean Loss") - plot.line("training", "blue", data=self._train_loss[skip:]) - plot.line("validation", "red", data=self._valid_loss[skip:]) - plot.wait() - - - -class MLRegression(MLAlgorithm): - def plot(self, skip: int = 1000) -> None: - return super().plot(skip) - -class MLClassification(MLAlgorithm): - def plot(self, skip: int = 1000) -> None: - return super().plot(skip) \ No newline at end of file diff --git a/src/learning/supervised.py b/src/learning/supervised.py index 947b154..d8cd4df 100644 --- a/src/learning/supervised.py +++ b/src/learning/supervised.py @@ -1,35 +1,29 @@ import math as math import numpy as np -from learning.ml import MLRegression, MLClassification -from learning.data import Dataset +from abc import abstractmethod +from learning.ml import MLAlgorithm -class LinearRegression(MLRegression): +class GradientDescent(MLAlgorithm): theta:np.ndarray alpha:float - def __init__(self, dataset:Dataset, learning_rate:float=0.1) -> None: + def __init__(self, dataset:np.ndarray, learning_rate:float=0.1) -> None: self._set_dataset(dataset) - parameters = dataset.data.shape[1] - 1 #removing the result + parameters = dataset.shape[1] - 1 #removing the result self.theta = np.random.rand(parameters) self.alpha = max(0, learning_rate) def learning_step(self) -> float: - theta = self.theta - alpha = self.alpha x, y, m = self._split_data_target(self.learnset) - self.theta -= alpha * (1/m) * np.sum((x.dot(theta) - y) * x.T, axis=1) - return self._error(x, y, m) + self.theta -= self.alpha * (1/m) * np.sum((self._h0(x) - y) * x.T, axis=1) + return self._loss(x, y, m) def predict_loss(self, dataset:np.ndarray) -> float: x, y, m = self._split_data_target(dataset) - return self._error(x, y, m) - - def _error(self, x:np.ndarray, y:np.ndarray, m:int) -> float: - diff = (x.dot(self.theta) - y) - return 1/(2*m) * np.sum(diff ** 2) + return self._loss(x, y, m) def get_parameters(self): return self.theta.copy() @@ -37,6 +31,23 @@ class LinearRegression(MLRegression): def set_parameters(self, parameters): self.theta = parameters + @abstractmethod + def _loss(self, x:np.ndarray, y:np.ndarray, m:int) -> float: pass -class LogisticRegression(MLClassification): - pass + +class LinearRegression(GradientDescent): + def _h0(self, x: np.ndarray) -> np.ndarray: + return self.theta.dot(x.T) + + def _loss(self, x:np.ndarray, y:np.ndarray, m:int) -> float: + diff = (x.dot(self.theta) - y) + return 1/(2*m) * np.sum(diff ** 2) + +class LogisticRegression(GradientDescent): + def _h0(self, x: np.ndarray) 
-> np.ndarray: + return 1 / (1 + np.exp(-self.theta.dot(x.T))) + + def _loss(self, x:np.ndarray, y:np.ndarray, m:int) -> float: + h0 = self._h0(x) + diff = -y*np.log(h0) -(1-y)*np.log(1-h0) + return 1/m * np.sum(diff)
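
Note: src/app.py now imports MultiLogisticRegression from learning.supervised (it is used by frogs()), but the definition of that class is not visible anywhere in this diff. Below is a minimal sketch of what a softmax-based implementation on top of the new GradientDescent base class could look like; the class body, the _one_hot helper, and the exact gradient/loss formulation are illustrative assumptions, not code from this commit.

import numpy as np
from learning.supervised import GradientDescent

class MultiLogisticRegression(GradientDescent):
    """Hypothetical multi-class (softmax) logistic regression sketch."""

    def __init__(self, dataset: np.ndarray, learning_rate: float = 0.1) -> None:
        self._set_dataset(dataset)
        features = dataset.shape[1] - 1                 # drop the target column
        classes = len(np.unique(dataset[:, 0]))         # target is stored in column 0
        self.theta = np.random.rand(features, classes)  # one weight vector per class
        self.alpha = max(0, learning_rate)

    def _h0(self, x: np.ndarray) -> np.ndarray:
        # softmax over the class scores, one row of probabilities per sample
        z = x.dot(self.theta)
        z -= z.max(axis=1, keepdims=True)               # numerical stability
        e = np.exp(z)
        return e / e.sum(axis=1, keepdims=True)

    def _one_hot(self, y: np.ndarray) -> np.ndarray:
        onehot = np.zeros((y.shape[0], self.theta.shape[1]))
        onehot[np.arange(y.shape[0]), y.astype(int)] = 1
        return onehot

    def learning_step(self) -> float:
        # the base-class step assumes a 1-D theta, so the multi-class
        # gradient is recomputed here against the one-hot targets
        x, y, m = self._split_data_target(self.learnset)
        self.theta -= self.alpha * (1 / m) * x.T.dot(self._h0(x) - self._one_hot(y))
        return self._loss(x, y, m)

    def _loss(self, x: np.ndarray, y: np.ndarray, m: int) -> float:
        # cross-entropy of the probability assigned to the true class
        h0 = self._h0(x)
        return -1 / m * np.sum(np.log(h0[np.arange(m), y.astype(int)] + 1e-12))

Two related caveats, stated as observations rather than fixes to this patch: MLAlgorithm.confusion_matrix thresholds _h0 at 0.5, which only fits the binary case, so a multi-class model along these lines would presumably predict with np.argmax over the softmax probabilities instead; and LogisticRegression._loss calls np.log(h0) directly, so clipping h0 away from exactly 0 and 1 (for example np.clip(h0, 1e-12, 1 - 1e-12)) is a common safeguard against inf/nan when the sigmoid saturates.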