Refactor Dataset
- better finalize function
- support for one-hot-encoding
This commit is contained in:
38
src/app.py
38
src/app.py
@@ -1,43 +1,43 @@
|
|||||||
from learning.data import Dataset
|
from learning.data import Dataset, TargetType
|
||||||
from learning.supervised import LinearRegression, LogisticRegression, MultiLogisticRegression
|
from learning.supervised import LinearRegression, LogisticRegression, MultiLayerPerceptron
|
||||||
from learning.ml import MLAlgorithm
|
from learning.ml import MLAlgorithm
|
||||||
from typing import Callable
|
from typing import Callable
|
||||||
|
|
||||||
def auto_mpg() -> tuple[int, MLAlgorithm]:
|
def auto_mpg() -> tuple[int, MLAlgorithm]:
|
||||||
ds = Dataset("datasets\\auto-mpg.csv", "MPG")
|
ds = Dataset("datasets\\auto-mpg.csv", "MPG", TargetType.Regression)
|
||||||
|
|
||||||
ds.to_numbers(["HP"])
|
ds.numbers(["HP"])
|
||||||
ds.handle_na()
|
ds.handle_na()
|
||||||
ds.regularize(excepts=["Cylinders","Year","Origin"])
|
ds.normalize(excepts=["Cylinders","Year","Origin"])
|
||||||
return (1000, LinearRegression(ds.as_ndarray(), learning_rate=0.0001))
|
return (1000, LinearRegression(ds, learning_rate=0.0001))
|
||||||
|
|
||||||
def automobile() -> tuple[int, MLAlgorithm]:
|
def automobile() -> tuple[int, MLAlgorithm]:
|
||||||
ds = Dataset("datasets\\regression\\automobile.csv", "symboling")
|
ds = Dataset("datasets\\regression\\automobile.csv", "symboling", TargetType.Regression)
|
||||||
|
|
||||||
attributes_to_modify = ["fuel-system", "engine-type", "drive-wheels", "body-style", "make", "engine-location", "aspiration", "fuel-type", "num-of-cylinders", "num-of-doors"]
|
attributes_to_modify = ["fuel-system", "engine-type", "drive-wheels", "body-style", "make", "engine-location", "aspiration", "fuel-type", "num-of-cylinders", "num-of-doors"]
|
||||||
ds.factorize(attributes_to_modify)
|
ds.factorize(attributes_to_modify)
|
||||||
ds.to_numbers(["normalized-losses", "bore", "stroke", "horsepower", "peak-rpm", "price"])
|
ds.numbers(["normalized-losses", "bore", "stroke", "horsepower", "peak-rpm", "price"])
|
||||||
ds.handle_na()
|
ds.handle_na()
|
||||||
ds.regularize(excepts=attributes_to_modify)
|
ds.normalize(excepts=attributes_to_modify)
|
||||||
return (1000, LinearRegression(ds.as_ndarray(), learning_rate=0.004))
|
return (1000, LinearRegression(ds, learning_rate=0.004))
|
||||||
|
|
||||||
def power_plant() -> tuple[int, MLAlgorithm]:
|
def power_plant() -> tuple[int, MLAlgorithm]:
|
||||||
ds = Dataset("datasets\\regression\\power-plant.csv", "energy-output")
|
ds = Dataset("datasets\\regression\\power-plant.csv", "energy-output", TargetType.Regression)
|
||||||
ds.regularize()
|
ds.normalize()
|
||||||
return (80, LinearRegression(ds.as_ndarray(), learning_rate=0.1))
|
return (80, LinearRegression(ds, learning_rate=0.1))
|
||||||
|
|
||||||
|
|
||||||
def electrical_grid() -> tuple[int, MLAlgorithm]:
|
def electrical_grid() -> tuple[int, MLAlgorithm]:
|
||||||
ds = Dataset("datasets\\classification\\electrical_grid.csv", "stabf")
|
ds = Dataset("datasets\\classification\\electrical_grid.csv", "stabf", TargetType.Classification)
|
||||||
ds.factorize(["stabf"])
|
ds.factorize(["stabf"])
|
||||||
ds.regularize()
|
ds.normalize()
|
||||||
return (1000, LogisticRegression(ds.as_ndarray(), learning_rate=0.08))
|
return (1000, LogisticRegression(ds, learning_rate=0.08))
|
||||||
|
|
||||||
def frogs() -> tuple[int, MLAlgorithm]:
|
def frogs() -> tuple[int, MLAlgorithm]:
|
||||||
ds = Dataset("datasets\\classification\\frogs.csv", "Species")
|
ds = Dataset("datasets\\classification\\frogs.csv", "Species", TargetType.MultiClassification)
|
||||||
ds.remove(["Family", "Genus", "RecordID"])
|
ds.remove(["Family", "Genus", "RecordID"])
|
||||||
ds.factorize(["Species"])
|
ds.factorize(["Species"])
|
||||||
return (1000, MultiLogisticRegression(ds.as_ndarray(), learning_rate=0.08))
|
return (1000, MultiLayerPerceptron(ds, learning_rate=0.08))
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
@@ -55,5 +55,5 @@ def learn_dataset(function:Callable[..., tuple[int, MLAlgorithm]], epochs:int=10
|
|||||||
return ml
|
return ml
|
||||||
|
|
||||||
if __name__ == "__main__":
|
if __name__ == "__main__":
|
||||||
ml = learn_dataset(electrical_grid)
|
ml = learn_dataset(automobile)
|
||||||
print(ml.accuracy(ml.testset))
|
print(ml.accuracy(ml.testset))
|
||||||
|
|||||||
@@ -1,33 +1,53 @@
|
|||||||
import pandas as pd
|
import pandas as pd
|
||||||
import numpy as np
|
import numpy as np
|
||||||
|
|
||||||
|
from enum import Enum
|
||||||
from typing_extensions import Self
|
from typing_extensions import Self
|
||||||
|
|
||||||
|
class TargetType(Enum):
|
||||||
|
Regression = 1
|
||||||
|
Classification = 2
|
||||||
|
MultiClassification = 3
|
||||||
|
NoTarget = 4
|
||||||
|
|
||||||
|
class Data:
|
||||||
|
x: np.ndarray
|
||||||
|
y: np.ndarray
|
||||||
|
size: int
|
||||||
|
param: int
|
||||||
|
|
||||||
|
def __init__(self, x:np.ndarray, y:np.ndarray) -> None:
|
||||||
|
self.x = x
|
||||||
|
self.y = y
|
||||||
|
self.size = x.shape[0]
|
||||||
|
self.param = x.shape[1]
|
||||||
|
def __str__(self) -> str:
|
||||||
|
return "X: " + str(self.x) + "\nY: " + str(self.y)
|
||||||
|
def as_tuple(self) -> tuple[np.ndarray, np.ndarray, int, int]:
|
||||||
|
return (self.x, self.y, self.size, self.param)
|
||||||
|
|
||||||
class Dataset:
|
class Dataset:
|
||||||
def __init__(self, csv:str, target:str, classification:bool=None) -> None:
|
data: pd.DataFrame
|
||||||
data = pd.read_csv(csv)
|
target: str
|
||||||
|
target_type: TargetType
|
||||||
|
|
||||||
|
def __init__(self, csv:str, target:str, target_type:TargetType) -> None:
|
||||||
|
self.original = pd.read_csv(csv)
|
||||||
|
self.data = self.original
|
||||||
|
self.target = target
|
||||||
|
self.target_type = target_type
|
||||||
|
|
||||||
# move target to the start
|
# move target to the start
|
||||||
col_target = data.pop(target)
|
col_target = self.data.pop(target)
|
||||||
data.insert(0, target, col_target)
|
self.data.insert(0, target, col_target)
|
||||||
data.insert(1, "Bias", 1.0)
|
|
||||||
|
|
||||||
if classification == None:
|
|
||||||
classification = (data[target].dtype == object)
|
|
||||||
|
|
||||||
self.original = data
|
|
||||||
self.data = data
|
|
||||||
self.target = target
|
|
||||||
self.classification = classification
|
|
||||||
|
|
||||||
def remove(self, columns:list[str]) -> Self:
|
def remove(self, columns:list[str]) -> Self:
|
||||||
for col in columns:
|
for col in columns:
|
||||||
self.data.pop(col)
|
self.data.pop(col)
|
||||||
return self
|
return self
|
||||||
|
|
||||||
def regularize(self, excepts:list[str]=[]) -> Self:
|
def normalize(self, excepts:list[str]=[]) -> Self:
|
||||||
excepts.append(self.target)
|
excepts.append(self.target)
|
||||||
excepts.append("Bias")
|
|
||||||
for col in self.data:
|
for col in self.data:
|
||||||
if col not in excepts:
|
if col not in excepts:
|
||||||
index = self.data.columns.get_loc(col)
|
index = self.data.columns.get_loc(col)
|
||||||
@@ -42,7 +62,7 @@ class Dataset:
|
|||||||
data[col] = pd.factorize(data[col])[0]
|
data[col] = pd.factorize(data[col])[0]
|
||||||
return self
|
return self
|
||||||
|
|
||||||
def to_numbers(self, columns:list[str]=[]) -> Self:
|
def numbers(self, columns:list[str]=[]) -> Self:
|
||||||
data = self.data
|
data = self.data
|
||||||
for col in columns:
|
for col in columns:
|
||||||
if data[col].dtype == object:
|
if data[col].dtype == object:
|
||||||
@@ -53,34 +73,38 @@ class Dataset:
|
|||||||
self.data = self.data.dropna()
|
self.data = self.data.dropna()
|
||||||
return self
|
return self
|
||||||
|
|
||||||
def shuffle(self) -> Self:
|
def get_dataset(self, test_frac:float=0.15, valid_frac:float=0.15) -> tuple[Data, Data, Data]:
|
||||||
self.data = self.data.sample(frac=1)
|
data = self.data.to_numpy()
|
||||||
return self
|
data = np.insert(data, 1, 1, axis=1) # adding bias
|
||||||
|
np.random.shuffle(data)
|
||||||
|
|
||||||
def as_ndarray(self) -> np.ndarray:
|
total = data.shape[0]
|
||||||
return self.data.to_numpy()
|
valid_cutoff = int(total * valid_frac)
|
||||||
|
test_cutoff = int(total * test_frac) + valid_cutoff
|
||||||
def get_index(self, column:str) -> int:
|
|
||||||
return self.data.columns.get_loc(column)
|
|
||||||
|
|
||||||
class PrincipalComponentAnalisys:
|
|
||||||
def __init__(self, data:np.ndarray) -> None:
|
|
||||||
self.data = data
|
|
||||||
|
|
||||||
def reduce(self, total:int=0, threshold:float=1) -> Self:
|
|
||||||
columns = self.data.shape[1]
|
|
||||||
if total > columns or total <= 0:
|
|
||||||
total = columns
|
|
||||||
if threshold <= 0 or threshold > 1:
|
|
||||||
threshold = 1
|
|
||||||
|
|
||||||
|
valid = data[:valid_cutoff]
|
||||||
|
test = data[valid_cutoff:test_cutoff]
|
||||||
|
learn = data[test_cutoff:]
|
||||||
|
|
||||||
|
l = []
|
||||||
|
for ds in [learn, test, valid]:
|
||||||
|
target = ds[:, 0] if self.target_type != TargetType.NoTarget else None
|
||||||
|
ds = ds[:, 1:]
|
||||||
|
if self.target_type == TargetType.MultiClassification:
|
||||||
|
target = target.astype(int)
|
||||||
|
uniques = np.unique(target).shape[0]
|
||||||
|
target = np.eye(uniques)[target]
|
||||||
|
l.append(Data(ds, target))
|
||||||
|
return l
|
||||||
|
|
||||||
if __name__ == "__main__":
|
if __name__ == "__main__":
|
||||||
df = Dataset("datasets\\regression\\automobile.csv", "symboling")
|
ds = Dataset("datasets\\classification\\frogs.csv", "Species", TargetType.MultiClassification)
|
||||||
attributes_to_modify = ["fuel-system", "engine-type", "drive-wheels", "body-style", "make", "engine-location", "aspiration", "fuel-type", "num-of-cylinders", "num-of-doors"]
|
ds.remove(["Family", "Genus", "RecordID"])
|
||||||
df.factorize(attributes_to_modify)
|
ds.factorize(["Species"])
|
||||||
df.to_numbers(["normalized-losses", "bore", "stroke", "horsepower", "peak-rpm", "price"])
|
|
||||||
df.handle_na()
|
np.random.seed(0)
|
||||||
df.regularize(excepts=attributes_to_modify)
|
learn, test, valid = ds.get_dataset()
|
||||||
print(df.data.dtypes)
|
print(learn)
|
||||||
|
print(test)
|
||||||
|
print(valid)
|
||||||
|
|
||||||
|
|||||||
@@ -1,33 +1,25 @@
|
|||||||
from abc import ABC, abstractmethod
|
from abc import ABC, abstractmethod
|
||||||
from plot import Plot
|
from plot import Plot
|
||||||
from tqdm import tqdm
|
from tqdm import tqdm
|
||||||
|
from learning.data import Dataset, Data
|
||||||
|
|
||||||
import pandas as pd
|
|
||||||
import numpy as np
|
import numpy as np
|
||||||
|
|
||||||
|
|
||||||
class MLAlgorithm(ABC):
|
class MLAlgorithm(ABC):
|
||||||
""" Classe generica per gli algoritmi di Machine Learning """
|
""" Classe generica per gli algoritmi di Machine Learning """
|
||||||
|
|
||||||
testset: np.ndarray
|
learnset: Data
|
||||||
learnset: np.ndarray
|
validset: Data
|
||||||
|
testset: Data
|
||||||
|
_learn_loss: list[float]
|
||||||
_valid_loss: list[float]
|
_valid_loss: list[float]
|
||||||
_train_loss: list[float]
|
|
||||||
|
|
||||||
def _set_dataset(self, dataset:np.ndarray, split:float=0.2):
|
def __init__(self, dataset:Dataset) -> None:
|
||||||
splitT = int(dataset.shape[0] * split)
|
learn, test, valid = dataset.get_dataset(0.2, 0.2)
|
||||||
splitV = int(splitT / 2)
|
self.learnset = learn
|
||||||
|
self.validset = valid
|
||||||
np.random.shuffle(dataset)
|
self.testset = test
|
||||||
self.validset = dataset[:splitV]
|
|
||||||
self.testset = dataset[splitV:splitT]
|
|
||||||
self.learnset = dataset[splitT:]
|
|
||||||
|
|
||||||
def _split_data_target(self, dset:np.ndarray) -> tuple[np.ndarray, np.ndarray, int]:
|
|
||||||
x = np.delete(dset, 0, 1)
|
|
||||||
y = dset[:, 0]
|
|
||||||
m = dset.shape[0]
|
|
||||||
return (x, y, m)
|
|
||||||
|
|
||||||
def learn(self, epochs:int, early_stop:float=0.0000001, max_patience:int=10, verbose:bool=False) -> tuple[int, list, list]:
|
def learn(self, epochs:int, early_stop:float=0.0000001, max_patience:int=10, verbose:bool=False) -> tuple[int, list, list]:
|
||||||
learn = []
|
learn = []
|
||||||
@@ -56,7 +48,7 @@ class MLAlgorithm(ABC):
|
|||||||
except KeyboardInterrupt: pass
|
except KeyboardInterrupt: pass
|
||||||
if verbose: print(f"Loop ended after {count} epochs")
|
if verbose: print(f"Loop ended after {count} epochs")
|
||||||
|
|
||||||
self._train_loss = learn
|
self._learn_loss = learn
|
||||||
self._valid_loss = valid
|
self._valid_loss = valid
|
||||||
return (count, learn, valid)
|
return (count, learn, valid)
|
||||||
|
|
||||||
@@ -70,24 +62,23 @@ class MLAlgorithm(ABC):
|
|||||||
return self.predict_loss(self.testset)
|
return self.predict_loss(self.testset)
|
||||||
|
|
||||||
def plot(self, skip:int=1000) -> None:
|
def plot(self, skip:int=1000) -> None:
|
||||||
skip = skip if len(self._train_loss) > skip else 0
|
skip = skip if len(self._learn_loss) > skip else 0
|
||||||
plot = Plot("Loss", "Time", "Mean Loss")
|
plot = Plot("Loss", "Time", "Mean Loss")
|
||||||
plot.line("training", "blue", data=self._train_loss[skip:])
|
plot.line("training", "blue", data=self._learn_loss[skip:])
|
||||||
plot.line("validation", "red", data=self._valid_loss[skip:])
|
plot.line("validation", "red", data=self._valid_loss[skip:])
|
||||||
plot.wait()
|
plot.wait()
|
||||||
|
|
||||||
def confusion_matrix(self, dataset:np.ndarray) -> np.ndarray:
|
def confusion_matrix(self, dataset:Data) -> np.ndarray:
|
||||||
x, y, _ = self._split_data_target(dataset)
|
h0 = np.where(self._h0(dataset.x) > 0.5, 1, 0)
|
||||||
h0 = np.where(self._h0(x) > 0.5, 1, 0)
|
|
||||||
|
|
||||||
classes = len(np.unique(y))
|
classes = len(np.unique(dataset.y))
|
||||||
conf_matrix = np.zeros((classes, classes), dtype=int)
|
conf_matrix = np.zeros((classes, classes), dtype=int)
|
||||||
|
|
||||||
for actual, prediction in zip(y, h0):
|
for actual, prediction in zip(dataset.y, h0):
|
||||||
conf_matrix[int(actual), int(prediction)] += 1
|
conf_matrix[int(actual), int(prediction)] += 1
|
||||||
return conf_matrix
|
return conf_matrix
|
||||||
|
|
||||||
def accuracy(self, dataset:np.ndarray) -> np.ndarray:
|
def accuracy(self, dataset:Data) -> float:
|
||||||
conf = self.confusion_matrix(dataset)
|
conf = self.confusion_matrix(dataset)
|
||||||
correct = np.sum(np.diagonal(conf))
|
correct = np.sum(np.diagonal(conf))
|
||||||
total = np.sum(conf)
|
total = np.sum(conf)
|
||||||
@@ -98,7 +89,7 @@ class MLAlgorithm(ABC):
|
|||||||
@abstractmethod
|
@abstractmethod
|
||||||
def learning_step(self) -> float: pass
|
def learning_step(self) -> float: pass
|
||||||
@abstractmethod
|
@abstractmethod
|
||||||
def predict_loss(self, dataset:np.ndarray) -> float: pass
|
def predict_loss(self, dataset:Data) -> float: pass
|
||||||
@abstractmethod
|
@abstractmethod
|
||||||
def get_parameters(self): pass
|
def get_parameters(self): pass
|
||||||
@abstractmethod
|
@abstractmethod
|
||||||
|
|||||||
@@ -3,27 +3,25 @@ import numpy as np
|
|||||||
|
|
||||||
from abc import abstractmethod
|
from abc import abstractmethod
|
||||||
from learning.ml import MLAlgorithm
|
from learning.ml import MLAlgorithm
|
||||||
|
from learning.data import Dataset, Data
|
||||||
|
|
||||||
class GradientDescent(MLAlgorithm):
|
class GradientDescent(MLAlgorithm):
|
||||||
theta:np.ndarray
|
theta:np.ndarray
|
||||||
alpha:float
|
alpha:float
|
||||||
|
|
||||||
def __init__(self, dataset:np.ndarray, learning_rate:float=0.1) -> None:
|
def __init__(self, dataset:Dataset, learning_rate:float=0.1) -> None:
|
||||||
self._set_dataset(dataset)
|
self.__init__(dataset)
|
||||||
|
self.theta = np.random.rand(self.learnset.param)
|
||||||
parameters = dataset.shape[1] - 1 #removing the result
|
|
||||||
self.theta = np.random.rand(parameters)
|
|
||||||
self.alpha = max(0, learning_rate)
|
self.alpha = max(0, learning_rate)
|
||||||
|
|
||||||
def learning_step(self) -> float:
|
def learning_step(self) -> float:
|
||||||
x, y, m = self._split_data_target(self.learnset)
|
x, y, m, _ = self.learnset.as_tuple()
|
||||||
|
|
||||||
self.theta -= self.alpha * (1/m) * np.sum((self._h0(x) - y) * x.T, axis=1)
|
self.theta -= self.alpha * (1/m) * np.sum((self._h0(x) - y) * x.T, axis=1)
|
||||||
return self._loss(x, y, m)
|
return self._loss(x, y, m)
|
||||||
|
|
||||||
def predict_loss(self, dataset:np.ndarray) -> float:
|
def predict_loss(self, dataset:Data) -> float:
|
||||||
x, y, m = self._split_data_target(dataset)
|
return self._loss(dataset.x, dataset.y, dataset.size)
|
||||||
return self._loss(x, y, m)
|
|
||||||
|
|
||||||
def get_parameters(self):
|
def get_parameters(self):
|
||||||
return self.theta.copy()
|
return self.theta.copy()
|
||||||
@@ -51,3 +49,10 @@ class LogisticRegression(GradientDescent):
|
|||||||
h0 = self._h0(x)
|
h0 = self._h0(x)
|
||||||
diff = -y*np.log(h0) -(1-y)*np.log(1-h0)
|
diff = -y*np.log(h0) -(1-y)*np.log(1-h0)
|
||||||
return 1/m * np.sum(diff)
|
return 1/m * np.sum(diff)
|
||||||
|
|
||||||
|
class MultiLayerPerceptron(MLAlgorithm):
|
||||||
|
neurons: list[np.ndarray]
|
||||||
|
|
||||||
|
def __init__(self, dataset:Dataset, layers:list[int]=[4,3]) -> None:
|
||||||
|
self.__init__(dataset)
|
||||||
|
|
||||||
|
|||||||
Reference in New Issue
Block a user