Project struct

This commit is contained in:
2024-04-18 18:53:57 +02:00
parent e362cff0c7
commit 18e390d34b
6 changed files with 192 additions and 87 deletions

51
src/app.py Normal file
View File

@@ -0,0 +1,51 @@
from learning.data import Dataset
from learning.supervised import LinearRegression
from learning.ml import MLAlgorithm
from plot import Plot
def auto_mpg() -> MLAlgorithm:
    """Prepare the auto-mpg dataset (target column "MPG") and wrap it in a model.

    Returns:
        A LinearRegression built on the prepared dataset with a learning
        rate of 0.0001. No learning step has been run on it yet.
    """
    import os  # local import: only needed to assemble the dataset path

    # os.path.join uses the platform separator. The previous hard-coded
    # backslash path only resolved on Windows (where this yields the same string).
    df = Dataset(os.path.join("datasets", "auto-mpg.csv"), "MPG")
    df.to_numbers(["HP"])
    df.handle_na()
    # These columns are left out of the regularize() pass.
    df.regularize(excepts=["Cylinders","Year","Origin"])
    return LinearRegression(df, learning_rate=0.0001)
def automobile() -> MLAlgorithm:
    """Prepare the automobile dataset (target column "symboling") and wrap it in a model.

    Returns:
        A LinearRegression built on the prepared dataset with a learning
        rate of 0.001. No learning step has been run on it yet.
    """
    import os  # local import: only needed to assemble the dataset path

    # os.path.join uses the platform separator. The previous hard-coded
    # backslash path only resolved on Windows (where this yields the same string).
    df = Dataset(os.path.join("datasets", "regression", "automobile.csv"), "symboling")
    attributes_to_modify = [
        "fuel-system", "engine-type", "drive-wheels", "body-style", "make",
        "engine-location", "aspiration", "fuel-type", "num-of-cylinders",
        "num-of-doors",
    ]
    df.factorize(attributes_to_modify)
    df.to_numbers()
    df.handle_na()
    # The factorized columns are left out of the regularize() pass.
    df.regularize(excepts=attributes_to_modify)
    return LinearRegression(df, learning_rate=0.001)
# Number of learning iterations requested from the model.
epoch = 50000
# Number of leading error samples left out of the plotted curves.
skip = 1000
# Build the model for the automobile dataset and train it; learn() returns
# the per-iteration training errors and test errors as two lists.
lr = automobile()
train, test = lr.learn(epoch)
# Draw both error curves, dropping the first `skip` samples of each.
plot = Plot("Error", "Time", "Mean Error")
plot.line("training", "red", data=train[skip:])
plot.line("test", "blue", data=test[skip:])
# NOTE: the string literal below is disabled code (a variant that feeds the
# plot one learning step at a time); as a bare expression it does nothing.
"""
for _ in range(0, epoch):
train_err = lr.learning_step()
test_err = lr.test_error()
plot.update("training", train_err)
plot.update("test", test_err)
plot.update_limits()
"""
# Switch interactive mode off and show the figure.
plot.wait()

View File

@@ -10,13 +10,16 @@ class Dataset:
# move target to the start
col_target = data.pop(target)
data.insert(0, target, col_target)
data.insert(1, "Bias", 1.0)
self.original = data
self.data = data
self.target = target
self.classification = (data[target].dtype == object)
def regularize(self, excepts:list=[]) -> Self:
excepts.append(self.target)
excepts.append("Bias")
for col in self.data:
if col not in excepts:
dt = self.data[col]
@@ -44,10 +47,11 @@ class Dataset:
self.data = self.data.sample(frac=1)
return self
def as_ndarray(self, bias=True):
    """Return a NumPy array built from a copy of ``self.data``.

    When ``bias`` is truthy, a constant "Bias" column of 1.0 is inserted at
    position 1 of the copy before conversion; ``self.data`` is not modified.
    """
    snapshot = self.data.copy()
    if not bias:
        return snapshot.to_numpy()
    snapshot.insert(1, "Bias", 1.0)
    return snapshot.to_numpy()
def as_ndarray(self) -> np.ndarray:
    """Return ``self.data`` converted with its ``to_numpy()`` method."""
    frame = self.data
    return frame.to_numpy()
def get_index(self, column:str) -> int:
    """Return the position of ``column`` among the columns of ``self.data``."""
    columns = self.data.columns
    return columns.get_loc(column)
class PrincipalComponentAnalisys:
def __init__(self, data:np.ndarray) -> None:

41
src/learning/ml.py Normal file
View File

@@ -0,0 +1,41 @@
from abc import ABC, abstractmethod
from learning.data import Dataset
import numpy as np
class MLAlgorithm(ABC):
    """Base class for iterative learners working on a row-wise NumPy dataset.

    Subclasses implement ``learning_step`` and ``test_error``; ``learn``
    drives them and records the error reported by each call.
    Column 0 of every row is treated as the target (see ``_split_data_target``).
    """

    dataset: "Dataset"
    testset: np.ndarray
    learnset: np.ndarray

    def _set_dataset(self, dataset:"Dataset", split:float=0.2):
        """Store ``dataset`` and split its rows into test and learning sets.

        Args:
            dataset: object whose ``shuffle().as_ndarray()`` yields the rows.
            split: fraction of the rows held out as the test set.
        """
        ndarray = dataset.shuffle().as_ndarray()
        test_rows = int(ndarray.shape[0] * split)

        self.dataset = dataset
        # The first `split` fraction is held out for testing and the rest is
        # used for learning. The two slices used to be swapped, which trained
        # on only 20% of the rows with the default split.
        self.testset = ndarray[:test_rows]
        self.learnset = ndarray[test_rows:]

    def _split_data_target(self, dset:np.ndarray) -> tuple[np.ndarray, np.ndarray, int]:
        """Split rows into ``(x, y, m)``.

        ``y`` is column 0, ``x`` is every other column and ``m`` is the
        number of rows.
        """
        x = np.delete(dset, 0, 1)
        y = dset[:, 0]
        m = dset.shape[0]
        return (x, y, m)

    def learn(self, times:int) -> tuple[list, list]:
        """Run ``learning_step`` repeatedly, recording both errors each time.

        Args:
            times: number of iterations; at least one is always performed.

        Returns:
            ``(train, test)``: the values returned by ``learning_step`` and
            ``test_error`` at every iteration, in order.
        """
        train = []
        test = []
        for _ in range(max(1, times)):
            train.append(self.learning_step())
            test.append(self.test_error())
        return (train, test)

    @abstractmethod
    def learning_step(self) -> float:
        """Perform one learning iteration and return its error value."""
        pass

    @abstractmethod
    def test_error(self) -> float:
        """Return the error value measured on the test set."""
        pass

View File

@@ -0,0 +1,29 @@
import math as math
import numpy as np
from ml import MLAlgorithm
from learning.data import Dataset
class LinearRegression(MLAlgorithm):
    """Linear model whose weights ``theta`` are updated one gradient step at a time."""

    def __init__(self, dataset:Dataset, learning_rate:float=0.1) -> None:
        """Split ``dataset`` and start from random weights.

        Args:
            dataset: data handed to ``_set_dataset``.
            learning_rate: step size; negative values are clamped to 0.
        """
        self._set_dataset(dataset)

        # One weight per column, not counting the result column.
        weight_count = dataset.data.shape[1] - 1
        self.theta = np.random.rand(weight_count)
        self.alpha = max(0, learning_rate)

    def learning_step(self) -> float:
        """Apply one in-place update to ``theta`` and return the learning-set error."""
        features, targets, rows = self._split_data_target(self.learnset)

        residuals = features.dot(self.theta) - targets
        gradient_sum = np.sum(residuals * features.T, axis=1)
        self.theta -= self.alpha * (1/rows) * gradient_sum
        return self._error(features, targets, rows)

    def test_error(self) -> float:
        """Return the error of the current ``theta`` on the test set."""
        features, targets, rows = self._split_data_target(self.testset)
        return self._error(features, targets, rows)

    def _error(self, x:np.ndarray, y:np.ndarray, m:int) -> float:
        """Return the sum of squared residuals scaled by ``1/(2*m)``."""
        residuals = x.dot(self.theta) - y
        squared_total = np.sum(residuals ** 2)
        return 1/(2*m) * squared_total

View File

@@ -1,83 +0,0 @@
import math as math
import numpy as np
import matplotlib.pyplot as plt
from data import Dataset
class LinearRegression:
    """Linear regression trained by repeated gradient steps.

    Column 0 of every dataset row is the target; the remaining columns are
    the inputs multiplied by ``theta``.
    """

    def __init__(self, dataset:"Dataset", learning_rate:float=0.1) -> None:
        """Split the shuffled dataset and start from random weights.

        Args:
            dataset: object whose ``shuffle().as_ndarray()`` yields the rows.
            learning_rate: step size; negative values are clamped to 0.
        """
        ndarray = dataset.shuffle().as_ndarray()
        parameters = ndarray.shape[1] - 1 #removing the result
        split = int(ndarray.shape[0] * 0.2)

        # 20% of the rows are held out for testing and the other 80% are used
        # for training. The two slices used to be swapped.
        self.testset = ndarray[:split]
        self.trainingset = ndarray[split:]
        self.theta = np.random.rand(parameters)
        self.alpha = max(0, learning_rate)

    def learn(self, times:int) -> tuple[list, list]:
        """Run ``times`` learning steps (at least one).

        Returns:
            ``(train, test)``: the training and test error after every step.
        """
        train = []
        test = []
        for _ in range(max(1, times)):
            train.append(self.learning_step())
            test.append(self.test_error())
        return (train, test)

    def learning_step(self) -> float:
        """Update ``theta`` in place with one gradient step; return the training error."""
        x, y, m = self._split(self.trainingset)

        diff = (x.dot(self.theta) - y)
        gradient = np.sum(diff * x.T, axis=1)
        self.theta -= self.alpha * (1/m) * gradient
        return self._error(x, y, m)

    def test_error(self) -> float:
        """Return the error of the current ``theta`` on the test set."""
        x, y, m = self._split(self.testset)
        return self._error(x, y, m)

    def _split(self, dset:np.ndarray) -> tuple[np.ndarray, np.ndarray, int]:
        """Return (inputs, target column 0, number of rows) for ``dset``."""
        return (np.delete(dset, 0, 1), dset[:, 0], dset.shape[0])

    def _error(self, x:np.ndarray, y:np.ndarray, m:int) -> float:
        """Return the sum of squared residuals scaled by ``1/(2*m)``."""
        diff = (x.dot(self.theta) - y)
        return 1/(2*m) * np.sum(diff ** 2)
def auto_mpg(epoch:int):
    """Train a LinearRegression on the auto-mpg dataset (target column "MPG").

    Args:
        epoch: number of learning steps passed to ``LinearRegression.learn``.

    Returns:
        The result of ``learn``: the training errors and the test errors.
    """
    import os  # local import: only needed to assemble the dataset path

    # os.path.join uses the platform separator. The previous hard-coded
    # backslash path only resolved on Windows (where this yields the same string).
    df = Dataset(os.path.join("datasets", "auto-mpg.csv"), "MPG")
    df.to_numbers(["HP"])
    df.handle_na()
    # These columns are left out of the regularize() pass.
    df.regularize(excepts=["Cylinders","Year","Origin"])

    lr = LinearRegression(df, learning_rate=0.0001)
    return lr.learn(epoch)
def automobile(epoch:int):
    """Train a LinearRegression on the automobile dataset (target column "symboling").

    Args:
        epoch: number of learning steps passed to ``LinearRegression.learn``.

    Returns:
        The result of ``learn``: the training errors and the test errors.
    """
    import os  # local import: only needed to assemble the dataset path

    # os.path.join uses the platform separator. The previous hard-coded
    # backslash path only resolved on Windows (where this yields the same string).
    df = Dataset(os.path.join("datasets", "regression", "automobile.csv"), "symboling")
    attributes_to_modify = [
        "fuel-system", "engine-type", "drive-wheels", "body-style", "make",
        "engine-location", "aspiration", "fuel-type", "num-of-cylinders",
        "num-of-doors",
    ]
    df.factorize(attributes_to_modify)
    df.to_numbers()
    df.handle_na()
    # The factorized columns are left out of the regularize() pass.
    df.regularize(excepts=attributes_to_modify)

    lr = LinearRegression(df, learning_rate=0.001)
    return lr.learn(epoch)
if __name__ == '__main__':
    # Train on auto-mpg, then chart the training (red) and test (blue) errors.
    epoch = 10000
    # Negative start index: the slices below begin 90% of `epoch` from the end.
    skip = - int(epoch * 0.9)
    err_train, err_test = auto_mpg(epoch)

    plt.title("Error")
    plt.xlabel("Time")
    plt.ylabel("Mean Error")
    # Both curves use the same window; the final sample is left out by `:-1`.
    for errors, color in ((err_train, "red"), (err_test, "blue")):
        plt.plot(errors[skip:-1], color=color)
    plt.show()

63
src/plot.py Normal file
View File

@@ -0,0 +1,63 @@
import matplotlib.pyplot as plt
from typing_extensions import Self
class Plot:
    """Thin wrapper around the global pyplot figure with updatable lines.

    Every line added through ``line`` is tracked in ``self.data`` as
    ``label -> (line, datay, datax, max_length)`` so ``update`` can append
    points and ``update_limits`` can rescale the axes.
    """

    def __init__(self, title:str, labelx:str, labely:str) -> None:
        """Set title and axis labels, then show the figure in interactive mode."""
        plt.title(title)
        plt.xlabel(labelx)
        plt.ylabel(labely)
        plt.ion()
        plt.show(block=False)
        self.data = dict()

    def wait(self) -> Self:
        """Switch interactive mode off and show the figure."""
        plt.ioff()
        plt.show()
        return self

    def scatter(self, label:str, datax:list[float], datay:list[float], color:str) -> Self:
        """Draw a scatter series; it is not tracked for later updates."""
        plt.scatter(datax, datay, color=color, label=label)
        return self

    def line(self, label:str, color:str, data:"list[float] | None"=None, max_length:int=100) -> Self:
        """Add a tracked line.

        Args:
            label: legend label and key used by ``update``.
            color: line color.
            data: initial y values; x values are their indexes. Omitted or
                empty means the line starts with no points.
            max_length: number of points kept once ``update`` is called.
        """
        # Always work on a private list. A shared ``[]`` default made every
        # line created without data append into the same y-list, and a list
        # passed by the caller was truncated in place by ``update``.
        datay = [] if data is None else list(data)
        datax = list(range(len(datay)))

        # With no points yet, a single placeholder value is drawn.
        line, = plt.plot(datay if datay else [0], label=label, color=color)
        self.data[label] = (line, datay, datax, max_length)
        plt.legend()
        return self

    def update(self, label:str, newdata:float) -> Self:
        """Append ``newdata`` to the line ``label`` and drop the oldest points
        beyond its ``max_length``.

        Raises:
            KeyError: if no line was registered under ``label``.
        """
        line, datay, datax, max_length = self.data[label]

        last_x = datax[-1] if datax else 0
        datax.append(last_x + 1)
        datay.append(newdata)

        overflow = len(datax) - max_length
        if overflow > 0:
            del datax[:overflow]
            del datay[:overflow]

        line.set_data((datax, datay))
        return self

    def update_limits(self) -> Self:
        """Rescale both axes to the tracked data and redraw.

        The y axis spans 0 to the largest tracked y value (at least 0.1).
        Lines that have no points yet are ignored.

        Raises:
            Exception: if no figure is open any more.
        """
        if not plt.get_fignums():
            raise Exception("plot closed!")

        limy_top = 0.1
        limx_top = 0
        limx_bot = None
        for _, datay, datax, _ in self.data.values():
            # max()/min() raise on empty sequences, so skip empty series.
            if datay:
                limy_top = max(max(datay), limy_top)
            if datax:
                limx_top = max(max(datax), limx_top)
                lowest = min(datax)
                limx_bot = lowest if limx_bot is None else min(lowest, limx_bot)

        if limx_bot is None:
            limx_bot = 0
        # Avoid a zero-width x range.
        if limx_top == limx_bot:
            limx_top += 1

        plt.xlim(limx_bot, limx_top)
        plt.ylim(0, limy_top)
        plt.draw()
        plt.pause(0.0000000001)
        return self