LogisticRegression

- implemented LR
- changed classes tree
This commit is contained in:
2024-04-28 19:32:43 +02:00
parent ed0cfb3aa2
commit 969338196b
6 changed files with 124 additions and 68 deletions

15
.vscode/launch.json vendored Normal file
View File

@@ -0,0 +1,15 @@
{
// Use IntelliSense to learn about possible attributes.
// Hover to view descriptions of existing attributes.
// For more information, visit: https://go.microsoft.com/fwlink/?linkid=830387
"version": "0.2.0",
"configurations": [
{
"name": "Python Debugger: App",
"type": "debugpy",
"request": "launch",
"program": "src\\app.py",
"console": "integratedTerminal"
}
]
}

3
requirements Normal file
View File

@@ -0,0 +1,3 @@
matplotlib
pandas
tqdm

View File

@@ -1,34 +1,48 @@
from learning.data import Dataset from learning.data import Dataset
from learning.supervised import LinearRegression from learning.supervised import LinearRegression, LogisticRegression, MultiLogisticRegression
from learning.ml import MLRegression from learning.ml import MLAlgorithm
from typing import Callable from typing import Callable
def auto_mpg() -> tuple[int, MLRegression]: def auto_mpg() -> tuple[int, MLAlgorithm]:
df = Dataset("datasets\\auto-mpg.csv", "MPG") ds = Dataset("datasets\\auto-mpg.csv", "MPG")
df.to_numbers(["HP"]) ds.to_numbers(["HP"])
df.handle_na() ds.handle_na()
df.regularize(excepts=["Cylinders","Year","Origin"]) ds.regularize(excepts=["Cylinders","Year","Origin"])
return (1000, LinearRegression(df, learning_rate=0.0001)) return (1000, LinearRegression(ds.as_ndarray(), learning_rate=0.0001))
def automobile() -> tuple[int, MLRegression]: def automobile() -> tuple[int, MLAlgorithm]:
df = Dataset("datasets\\regression\\automobile.csv", "symboling") ds = Dataset("datasets\\regression\\automobile.csv", "symboling")
attributes_to_modify = ["fuel-system", "engine-type", "drive-wheels", "body-style", "make", "engine-location", "aspiration", "fuel-type", "num-of-cylinders", "num-of-doors"] attributes_to_modify = ["fuel-system", "engine-type", "drive-wheels", "body-style", "make", "engine-location", "aspiration", "fuel-type", "num-of-cylinders", "num-of-doors"]
df.factorize(attributes_to_modify) ds.factorize(attributes_to_modify)
df.to_numbers(["normalized-losses", "bore", "stroke", "horsepower", "peak-rpm", "price"]) ds.to_numbers(["normalized-losses", "bore", "stroke", "horsepower", "peak-rpm", "price"])
df.handle_na() ds.handle_na()
df.regularize(excepts=attributes_to_modify) ds.regularize(excepts=attributes_to_modify)
return (1000, LinearRegression(df, learning_rate=0.004)) return (1000, LinearRegression(ds.as_ndarray(), learning_rate=0.004))
def power_plant() -> tuple[int, MLRegression]: def power_plant() -> tuple[int, MLAlgorithm]:
df = Dataset("datasets\\regression\\power-plant.csv", "energy-output") ds = Dataset("datasets\\regression\\power-plant.csv", "energy-output")
df.regularize() ds.regularize()
return (80, LinearRegression(df, learning_rate=0.1)) return (80, LinearRegression(ds.as_ndarray(), learning_rate=0.1))
def electrical_grid() -> tuple[int, MLAlgorithm]:
    """ Prepare the electrical-grid dataset and a LogisticRegression for it.

    Loads the CSV with "stabf" as target column, factorizes that column,
    regularizes the dataset and hands its ndarray form to a
    LogisticRegression with a learning rate of 0.08.

    Returns:
        The pair (1000, model). learn_dataset unpacks it as (skip, ml) and
        forwards the first item to ml.plot(skip=...).
    """
    import os  # function-scope import: keeps the fix local to this block

    # Join the path components so the separator matches the host OS; a
    # hard-coded "\\" only resolves on Windows.
    csv_path = os.path.join("datasets", "classification", "electrical_grid.csv")
    ds = Dataset(csv_path, "stabf")
    ds.factorize(["stabf"])
    ds.regularize()
    return (1000, LogisticRegression(ds.as_ndarray(), learning_rate=0.08))
def frogs() -> tuple[int, MLAlgorithm]:
    """ Prepare the frogs dataset and a MultiLogisticRegression for it.

    Loads the CSV with "Species" as target column, removes the "Family",
    "Genus" and "RecordID" columns, factorizes the target and hands the
    ndarray form of the dataset to a MultiLogisticRegression with a learning
    rate of 0.08.

    Returns:
        The pair (1000, model). learn_dataset unpacks it as (skip, ml) and
        forwards the first item to ml.plot(skip=...).
    """
    import os  # function-scope import: keeps the fix local to this block

    # Join the path components so the separator matches the host OS; a
    # hard-coded "\\" only resolves on Windows.
    csv_path = os.path.join("datasets", "classification", "frogs.csv")
    ds = Dataset(csv_path, "Species")
    ds.remove(["Family", "Genus", "RecordID"])
    ds.factorize(["Species"])
    return (1000, MultiLogisticRegression(ds.as_ndarray(), learning_rate=0.08))
def learn_dataset(function:Callable[..., tuple[int, MLRegression]], epochs:int=100000, verbose=True)-> None:
def learn_dataset(function:Callable[..., tuple[int, MLAlgorithm]], epochs:int=10000, verbose=True)-> MLAlgorithm:
skip, ml = function() skip, ml = function()
ml.learn(epochs, verbose=verbose) ml.learn(epochs, verbose=verbose)
@@ -38,8 +52,8 @@ def learn_dataset(function:Callable[..., tuple[int, MLRegression]], epochs:int=1
print(f"Loss value: tests={err_tests:1.5f}, valid={err_valid:1.5f}, learn={err_learn:1.5f}") print(f"Loss value: tests={err_tests:1.5f}, valid={err_valid:1.5f}, learn={err_learn:1.5f}")
ml.plot(skip=skip) ml.plot(skip=skip)
return ml
if __name__ == "__main__": if __name__ == "__main__":
learn_dataset(automobile) ml = learn_dataset(electrical_grid)
print(ml.accuracy(ml.testset))

View File

@@ -4,7 +4,7 @@ import numpy as np
from typing_extensions import Self from typing_extensions import Self
class Dataset: class Dataset:
def __init__(self, csv:str, target:str) -> None: def __init__(self, csv:str, target:str, classification:bool=None) -> None:
data = pd.read_csv(csv) data = pd.read_csv(csv)
# move target to the start # move target to the start
@@ -12,10 +12,18 @@ class Dataset:
data.insert(0, target, col_target) data.insert(0, target, col_target)
data.insert(1, "Bias", 1.0) data.insert(1, "Bias", 1.0)
if classification == None:
classification = (data[target].dtype == object)
self.original = data self.original = data
self.data = data self.data = data
self.target = target self.target = target
self.classification = (data[target].dtype == object) self.classification = classification
def remove(self, columns:list[str]) -> Self:
for col in columns:
self.data.pop(col)
return self
def regularize(self, excepts:list[str]=[]) -> Self: def regularize(self, excepts:list[str]=[]) -> Self:
excepts.append(self.target) excepts.append(self.target)

View File

@@ -1,29 +1,27 @@
from abc import ABC, abstractmethod from abc import ABC, abstractmethod
from learning.data import Dataset
from plot import Plot from plot import Plot
from tqdm import tqdm from tqdm import tqdm
import pandas as pd
import numpy as np import numpy as np
class MLAlgorithm(ABC): class MLAlgorithm(ABC):
""" Classe generica per gli algoritmi di Machine Learning """ """ Classe generica per gli algoritmi di Machine Learning """
dataset: Dataset
testset: np.ndarray testset: np.ndarray
learnset: np.ndarray learnset: np.ndarray
_valid_loss: list[float] _valid_loss: list[float]
_train_loss: list[float] _train_loss: list[float]
def _set_dataset(self, dataset:Dataset, split:float=0.2): def _set_dataset(self, dataset:np.ndarray, split:float=0.2):
ndarray = dataset.shuffle().as_ndarray() splitT = int(dataset.shape[0] * split)
splitT = int(ndarray.shape[0] * split)
splitV = int(splitT / 2) splitV = int(splitT / 2)
self.dataset = dataset np.random.shuffle(dataset)
self.validset = ndarray[:splitV] self.validset = dataset[:splitV]
self.testset = ndarray[splitV:splitT] self.testset = dataset[splitV:splitT]
self.learnset = ndarray[splitT:] self.learnset = dataset[splitT:]
def _split_data_target(self, dset:np.ndarray) -> tuple[np.ndarray, np.ndarray, int]: def _split_data_target(self, dset:np.ndarray) -> tuple[np.ndarray, np.ndarray, int]:
x = np.delete(dset, 0, 1) x = np.delete(dset, 0, 1)
@@ -71,7 +69,32 @@ class MLAlgorithm(ABC):
def test_loss(self) -> float: def test_loss(self) -> float:
return self.predict_loss(self.testset) return self.predict_loss(self.testset)
def plot(self, skip:int=1000) -> None:
    """ Draw the recorded training and validation loss curves.

    Args:
        skip: how many leading loss values to leave out of both curves.
            It is only honoured when the training history holds more than
            `skip` values; otherwise both curves are drawn from the start.
    """
    start = 0 if len(self._train_loss) <= skip else skip
    chart = Plot("Loss", "Time", "Mean Loss")
    chart.line("training", "blue", data=self._train_loss[start:])
    chart.line("validation", "red", data=self._valid_loss[start:])
    # hand control to the plot object once both curves have been added
    chart.wait()
def confusion_matrix(self, dataset:np.ndarray) -> np.ndarray:
    """ Count actual-vs-predicted class pairs over a dataset.

    Predictions are obtained by thresholding self._h0(x) at 0.5, so the
    predicted class is always 0 or 1.

    Args:
        dataset: samples in the layout expected by _split_data_target.

    Returns:
        A square integer matrix whose entry [actual, predicted] is the number
        of samples with that pair of classes. It has one row/column per
        class index up to the largest label found, and never fewer than two
        (both possible predictions always have a column, even when the
        dataset contains a single class or is empty).
    """
    x, y, _ = self._split_data_target(dataset)
    predicted = np.ravel(np.where(self._h0(x) > 0.5, 1, 0))
    actual = np.ravel(y).astype(int)
    # Size by the largest label rather than by the number of distinct labels:
    # a set holding only class 0 (or labels with gaps) would otherwise be
    # indexed out of bounds.
    classes = max(2, int(actual.max()) + 1) if actual.size else 2
    conf_matrix = np.zeros((classes, classes), dtype=int)
    # unbuffered add, so repeated (actual, predicted) pairs are all counted
    np.add.at(conf_matrix, (actual, predicted), 1)
    return conf_matrix
def accuracy(self, dataset:np.ndarray) -> float:
    """ Fraction of samples of a dataset that are classified correctly.

    Computed from self.confusion_matrix(dataset) as the sum of its diagonal
    divided by the sum of all its entries.

    Args:
        dataset: samples to evaluate, passed unchanged to confusion_matrix.

    Returns:
        A float between 0 and 1; 0.0 when the confusion matrix holds no
        samples (instead of the nan produced by a 0/0 division).
    """
    conf = self.confusion_matrix(dataset)
    total = np.sum(conf)
    if total == 0:
        return 0.0
    # the diagonal holds the samples whose predicted class equals the actual one
    return float(np.trace(conf) / total)
@abstractmethod
def _h0(self, x:np.ndarray) -> np.ndarray:
    """ Hypothesis of the model for the samples in x; supplied by subclasses. """
@abstractmethod @abstractmethod
def learning_step(self) -> float: pass def learning_step(self) -> float: pass
@abstractmethod @abstractmethod
@@ -80,21 +103,3 @@ class MLAlgorithm(ABC):
def get_parameters(self): pass def get_parameters(self): pass
@abstractmethod @abstractmethod
def set_parameters(self, parameters): pass def set_parameters(self, parameters): pass
@abstractmethod
def plot(self, skip:int=1000) -> None:
skip = skip if len(self._train_loss) > skip else 0
plot = Plot("Loss", "Time", "Mean Loss")
plot.line("training", "blue", data=self._train_loss[skip:])
plot.line("validation", "red", data=self._valid_loss[skip:])
plot.wait()
class MLRegression(MLAlgorithm):
def plot(self, skip: int = 1000) -> None:
return super().plot(skip)
class MLClassification(MLAlgorithm):
def plot(self, skip: int = 1000) -> None:
return super().plot(skip)

View File

@@ -1,35 +1,29 @@
import math as math import math as math
import numpy as np import numpy as np
from learning.ml import MLRegression, MLClassification from abc import abstractmethod
from learning.data import Dataset from learning.ml import MLAlgorithm
class LinearRegression(MLRegression): class GradientDescent(MLAlgorithm):
theta:np.ndarray theta:np.ndarray
alpha:float alpha:float
def __init__(self, dataset:Dataset, learning_rate:float=0.1) -> None: def __init__(self, dataset:np.ndarray, learning_rate:float=0.1) -> None:
self._set_dataset(dataset) self._set_dataset(dataset)
parameters = dataset.data.shape[1] - 1 #removing the result parameters = dataset.shape[1] - 1 #removing the result
self.theta = np.random.rand(parameters) self.theta = np.random.rand(parameters)
self.alpha = max(0, learning_rate) self.alpha = max(0, learning_rate)
def learning_step(self) -> float: def learning_step(self) -> float:
theta = self.theta
alpha = self.alpha
x, y, m = self._split_data_target(self.learnset) x, y, m = self._split_data_target(self.learnset)
self.theta -= alpha * (1/m) * np.sum((x.dot(theta) - y) * x.T, axis=1) self.theta -= self.alpha * (1/m) * np.sum((self._h0(x) - y) * x.T, axis=1)
return self._error(x, y, m) return self._loss(x, y, m)
def predict_loss(self, dataset:np.ndarray) -> float: def predict_loss(self, dataset:np.ndarray) -> float:
x, y, m = self._split_data_target(dataset) x, y, m = self._split_data_target(dataset)
return self._error(x, y, m) return self._loss(x, y, m)
def _error(self, x:np.ndarray, y:np.ndarray, m:int) -> float:
diff = (x.dot(self.theta) - y)
return 1/(2*m) * np.sum(diff ** 2)
def get_parameters(self): def get_parameters(self):
return self.theta.copy() return self.theta.copy()
@@ -37,6 +31,23 @@ class LinearRegression(MLRegression):
def set_parameters(self, parameters): def set_parameters(self, parameters):
self.theta = parameters self.theta = parameters
@abstractmethod
def _loss(self, x:np.ndarray, y:np.ndarray, m:int) -> float: pass
class LogisticRegression(MLClassification):
pass class LinearRegression(GradientDescent):
def _h0(self, x: np.ndarray) -> np.ndarray:
return self.theta.dot(x.T)
def _loss(self, x:np.ndarray, y:np.ndarray, m:int) -> float:
diff = (x.dot(self.theta) - y)
return 1/(2*m) * np.sum(diff ** 2)
class LogisticRegression(GradientDescent):
    """ Logistic regression: sigmoid hypothesis with a cross-entropy loss. """

    def _h0(self, x: np.ndarray) -> np.ndarray:
        """ Sigmoid of the linear score theta . x for every row of x. """
        # Clamp the score so np.exp cannot overflow float64. The sigmoid is
        # fully saturated at these bounds, so results inside them are unchanged.
        z = np.clip(self.theta.dot(x.T), -700, 700)
        return 1 / (1 + np.exp(-z))

    def _loss(self, x:np.ndarray, y:np.ndarray, m:int) -> float:
        """ Mean cross-entropy between the targets y and the hypothesis on x.

        Args:
            x: samples to evaluate.
            y: target of each sample.
            m: number of samples, used as the divisor of the mean.
        """
        # Keep the probabilities strictly inside (0, 1): a saturated sigmoid
        # returns exactly 0 or 1, and log(0) would make the loss inf or nan.
        eps = np.finfo(float).eps
        h0 = np.clip(self._h0(x), eps, 1 - eps)
        diff = -y*np.log(h0) -(1-y)*np.log(1-h0)
        return 1/m * np.sum(diff)