LogisticRegression

- implemented logistic regression (LR)
- changed the class hierarchy
This commit is contained in:
2024-04-28 19:32:43 +02:00
parent ed0cfb3aa2
commit 969338196b
6 changed files with 124 additions and 68 deletions

View File

@@ -4,7 +4,7 @@ import numpy as np
from typing_extensions import Self
class Dataset:
def __init__(self, csv:str, target:str) -> None:
def __init__(self, csv:str, target:str, classification:bool=None) -> None:
data = pd.read_csv(csv)
# move target to the start
@@ -12,10 +12,18 @@ class Dataset:
data.insert(0, target, col_target)
data.insert(1, "Bias", 1.0)
if classification == None:
classification = (data[target].dtype == object)
self.original = data
self.data = data
self.target = target
self.classification = (data[target].dtype == object)
self.classification = classification
def remove(self, columns:list[str]) -> Self:
for col in columns:
self.data.pop(col)
return self
def regularize(self, excepts:list[str]=[]) -> Self:
excepts.append(self.target)

View File

@@ -1,29 +1,27 @@
from abc import ABC, abstractmethod
from learning.data import Dataset
from plot import Plot
from tqdm import tqdm
import pandas as pd
import numpy as np
class MLAlgorithm(ABC):
""" Generic base class for Machine Learning algorithms """
# Data set stored by _set_dataset.
# NOTE(review): annotated as Dataset, but the _set_dataset signature shown in
# this diff takes an np.ndarray and assigns it here - confirm the annotation.
dataset: Dataset
# Slices of the shuffled data assigned in _set_dataset.
testset: np.ndarray
learnset: np.ndarray
# Loss histories; plot() draws them as the "validation" and "training" lines.
_valid_loss: list[float]
_train_loss: list[float]
def _set_dataset(self, dataset:Dataset, split:float=0.2):
ndarray = dataset.shuffle().as_ndarray()
splitT = int(ndarray.shape[0] * split)
def _set_dataset(self, dataset:np.ndarray, split:float=0.2):
splitT = int(dataset.shape[0] * split)
splitV = int(splitT / 2)
self.dataset = dataset
self.validset = ndarray[:splitV]
self.testset = ndarray[splitV:splitT]
self.learnset = ndarray[splitT:]
np.random.shuffle(dataset)
self.validset = dataset[:splitV]
self.testset = dataset[splitV:splitT]
self.learnset = dataset[splitT:]
def _split_data_target(self, dset:np.ndarray) -> tuple[np.ndarray, np.ndarray, int]:
x = np.delete(dset, 0, 1)
@@ -71,7 +69,32 @@ class MLAlgorithm(ABC):
def test_loss(self) -> float:
return self.predict_loss(self.testset)
def plot(self, skip:int=1000) -> None:
    """Draw the training and validation loss histories on one chart.

    Args:
        skip: number of leading loss values to leave out of the chart. It is
            only applied when the training history holds more than ``skip``
            entries; otherwise the full histories are drawn.
    """
    # Shorter histories are shown in full instead of being emptied by the slice.
    start = 0 if len(self._train_loss) <= skip else skip
    chart = Plot("Loss", "Time", "Mean Loss")
    chart.line("training", "blue", data=self._train_loss[start:])
    chart.line("validation", "red", data=self._valid_loss[start:])
    chart.wait()
def confusion_matrix(self, dataset:np.ndarray) -> np.ndarray:
    """Build the confusion matrix of the thresholded hypothesis on ``dataset``.

    The hypothesis ``_h0`` is evaluated on the features returned by
    ``_split_data_target`` and turned into 0/1 predictions with a 0.5
    threshold.

    Args:
        dataset: array accepted by ``_split_data_target``; its targets are
            truncated to integer class indices.

    Returns:
        A square integer matrix where entry ``[actual, predicted]`` counts the
        samples with that target and that prediction.
    """
    x, y, _ = self._split_data_target(dataset)
    predicted = np.where(self._h0(x) > 0.5, 1, 0)
    actual = np.asarray(y).astype(int)
    # Size the matrix by the largest label rather than by the number of distinct
    # labels: a split that happens to miss a class would otherwise produce a
    # matrix too small for the indices. Predictions are always 0 or 1, so the
    # matrix is never smaller than 2x2.
    classes = max(2, int(actual.max()) + 1) if actual.size else 2
    conf_matrix = np.zeros((classes, classes), dtype=int)
    # np.add.at accumulates repeated (actual, predicted) pairs correctly.
    np.add.at(conf_matrix, (actual, predicted), 1)
    return conf_matrix
def accuracy(self, dataset:np.ndarray) -> float:
    """Return the fraction of samples in ``dataset`` that are classified correctly.

    The value is the sum of the confusion-matrix diagonal divided by the sum
    of all its entries.

    Args:
        dataset: array forwarded unchanged to ``confusion_matrix``.

    Returns:
        The accuracy as a float, or ``0.0`` when the confusion matrix holds no
        samples (avoids a 0/0 division yielding ``nan``).
    """
    conf = self.confusion_matrix(dataset)
    total = np.sum(conf)
    if total == 0:
        return 0.0
    correct = np.sum(np.diagonal(conf))
    return float(correct / total)
@abstractmethod
def _h0(self, x:np.ndarray) -> np.ndarray: pass
@abstractmethod
def learning_step(self) -> float:
    """Training hook: subclasses perform one learning step and return a float."""
@abstractmethod
@@ -80,21 +103,3 @@ class MLAlgorithm(ABC):
def get_parameters(self): pass
@abstractmethod
def set_parameters(self, parameters):
    """Parameter hook: subclasses install ``parameters`` as the model state."""
@abstractmethod
def plot(self, skip:int=1000) -> None:
    """Draw the training and validation loss histories on one chart.

    Args:
        skip: number of leading loss values to omit; ignored (treated as 0)
            unless the training history is longer than ``skip``.
    """
    history_len = len(self._train_loss)
    if history_len <= skip:
        skip = 0
    chart = Plot("Loss", "Time", "Mean Loss")
    chart.line("training", "blue", data=self._train_loss[skip:])
    chart.line("validation", "red", data=self._valid_loss[skip:])
    chart.wait()
class MLRegression(MLAlgorithm):
    """MLAlgorithm subclass whose only member forwards ``plot`` to the parent class."""

    def plot(self, skip: int = 1000) -> None:
        """Delegate to the parent ``plot`` with the same ``skip`` value."""
        outcome = super().plot(skip)
        return outcome
class MLClassification(MLAlgorithm):
    """MLAlgorithm subclass whose only member forwards ``plot`` to the parent class."""

    def plot(self, skip: int = 1000) -> None:
        """Delegate to the parent ``plot`` with the same ``skip`` value."""
        outcome = super().plot(skip)
        return outcome

View File

@@ -1,35 +1,29 @@
import math as math
import numpy as np
from learning.ml import MLRegression, MLClassification
from learning.data import Dataset
from abc import abstractmethod
from learning.ml import MLAlgorithm
class LinearRegression(MLRegression):
class GradientDescent(MLAlgorithm):
theta:np.ndarray
alpha:float
def __init__(self, dataset:Dataset, learning_rate:float=0.1) -> None:
def __init__(self, dataset:np.ndarray, learning_rate:float=0.1) -> None:
self._set_dataset(dataset)
parameters = dataset.data.shape[1] - 1 #removing the result
parameters = dataset.shape[1] - 1 #removing the result
self.theta = np.random.rand(parameters)
self.alpha = max(0, learning_rate)
def learning_step(self) -> float:
theta = self.theta
alpha = self.alpha
x, y, m = self._split_data_target(self.learnset)
self.theta -= alpha * (1/m) * np.sum((x.dot(theta) - y) * x.T, axis=1)
return self._error(x, y, m)
self.theta -= self.alpha * (1/m) * np.sum((self._h0(x) - y) * x.T, axis=1)
return self._loss(x, y, m)
def predict_loss(self, dataset:np.ndarray) -> float:
x, y, m = self._split_data_target(dataset)
return self._error(x, y, m)
def _error(self, x:np.ndarray, y:np.ndarray, m:int) -> float:
diff = (x.dot(self.theta) - y)
return 1/(2*m) * np.sum(diff ** 2)
return self._loss(x, y, m)
def get_parameters(self):
    """Return a copy of the current parameter vector ``theta``."""
    snapshot = self.theta.copy()
    return snapshot
@@ -37,6 +31,23 @@ class LinearRegression(MLRegression):
def set_parameters(self, parameters):
    """Install ``parameters`` as the model's parameter vector ``theta``.

    A float copy is stored rather than the caller's object: ``learning_step``
    updates ``self.theta`` in place with ``-=``, so keeping a reference would
    silently overwrite an array the caller saved earlier (for example a
    snapshot obtained from ``get_parameters``).

    Args:
        parameters: array-like of parameter values.
    """
    self.theta = np.array(parameters, dtype=float)
@abstractmethod
def _loss(self, x:np.ndarray, y:np.ndarray, m:int) -> float: pass
class LogisticRegression(MLClassification):
    """Empty placeholder subclass of MLClassification; it defines no members of its own."""
class LinearRegression(GradientDescent):
    """Gradient-descent model with a linear hypothesis and a halved mean squared error loss."""

    def _h0(self, x: np.ndarray) -> np.ndarray:
        """Return the linear prediction ``theta . x^T`` for the rows of ``x``."""
        weights = self.theta
        return weights.dot(x.T)

    def _loss(self, x:np.ndarray, y:np.ndarray, m:int) -> float:
        """Return ``1/(2m)`` times the sum of squared residuals of ``x . theta`` against ``y``."""
        residual = x.dot(self.theta) - y
        squared_total = np.sum(residual ** 2)
        return 1/(2*m) * squared_total
class LogisticRegression(GradientDescent):
    """Gradient-descent classifier with a sigmoid hypothesis and cross-entropy loss."""

    def _h0(self, x: np.ndarray) -> np.ndarray:
        """Return the sigmoid of the score ``theta . x^T`` for the rows of ``x``.

        The sigmoid ``1 / (1 + exp(-z))`` is evaluated as
        ``exp(-log(1 + exp(-z)))`` through ``np.logaddexp`` so that strongly
        negative scores do not overflow ``np.exp``.
        """
        z = self.theta.dot(x.T)
        return np.exp(-np.logaddexp(0, -z))

    def _loss(self, x:np.ndarray, y:np.ndarray, m:int) -> float:
        """Return the mean cross-entropy of the sigmoid hypothesis against ``y``.

        With ``h = sigmoid(z)``, the per-sample term
        ``-y*log(h) - (1-y)*log(1-h)`` equals ``log(1 + exp(z)) - y*z``.
        Using that form avoids ``log(0)`` (and the resulting inf/nan loss)
        when the sigmoid saturates at exactly 0 or 1.

        Args:
            x: feature matrix, one sample per row.
            y: target values, one per sample.
            m: number of samples used to average the loss.
        """
        z = self.theta.dot(x.T)
        per_sample = np.logaddexp(0, z) - y * z
        return 1/m * np.sum(per_sample)