Project structure
This commit is contained in:
51
src/app.py
Normal file
51
src/app.py
Normal file
@@ -0,0 +1,51 @@
|
|||||||
|
from learning.data import Dataset
|
||||||
|
from learning.supervised import LinearRegression
|
||||||
|
from learning.ml import MLAlgorithm
|
||||||
|
from plot import Plot
|
||||||
|
|
||||||
|
def auto_mpg() -> MLAlgorithm:
    """Prepare the auto-mpg dataset and return a linear-regression model for it.

    The HP column is coerced to numeric, missing values are handled, and every
    column except the listed discrete ones is regularized.
    """
    dataset = Dataset("datasets\\auto-mpg.csv", "MPG")

    dataset.to_numbers(["HP"])
    dataset.handle_na()
    dataset.regularize(excepts=["Cylinders", "Year", "Origin"])

    return LinearRegression(dataset, learning_rate=0.0001)
|
||||||
|
|
||||||
|
def automobile() -> MLAlgorithm:
    """Prepare the automobile dataset and return a linear-regression model for it."""
    dataset = Dataset("datasets\\regression\\automobile.csv", "symboling")

    # Categorical columns: factorized to integer codes and excluded from regularization.
    categorical = ["fuel-system", "engine-type", "drive-wheels", "body-style", "make", "engine-location", "aspiration", "fuel-type", "num-of-cylinders", "num-of-doors"]
    dataset.factorize(categorical)
    dataset.to_numbers()
    dataset.handle_na()
    dataset.regularize(excepts=categorical)

    return LinearRegression(dataset, learning_rate=0.001)
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
def _main() -> None:
    """Train the automobile model and plot the train/test error curves."""
    epoch = 50000
    skip = 1000  # drop the first (very noisy) part of the error curves

    lr = automobile()
    train, test = lr.learn(epoch)

    plot = Plot("Error", "Time", "Mean Error")
    plot.line("training", "red", data=train[skip:])
    plot.line("test", "blue", data=test[skip:])
    # NOTE: the old live-update loop (plot.update per step) was dead,
    # commented-out code and has been removed.
    plot.wait()


# Guard so importing this module does not kick off a 50000-epoch training run.
if __name__ == "__main__":
    _main()
|
||||||
|
|
||||||
|
|
||||||
@@ -10,13 +10,16 @@ class Dataset:
|
|||||||
# move target to the start
|
# move target to the start
|
||||||
col_target = data.pop(target)
|
col_target = data.pop(target)
|
||||||
data.insert(0, target, col_target)
|
data.insert(0, target, col_target)
|
||||||
|
data.insert(1, "Bias", 1.0)
|
||||||
|
|
||||||
|
self.original = data
|
||||||
self.data = data
|
self.data = data
|
||||||
self.target = target
|
self.target = target
|
||||||
self.classification = (data[target].dtype == object)
|
self.classification = (data[target].dtype == object)
|
||||||
|
|
||||||
def regularize(self, excepts:list=[]) -> Self:
|
def regularize(self, excepts:list=[]) -> Self:
|
||||||
excepts.append(self.target)
|
excepts.append(self.target)
|
||||||
|
excepts.append("Bias")
|
||||||
for col in self.data:
|
for col in self.data:
|
||||||
if col not in excepts:
|
if col not in excepts:
|
||||||
dt = self.data[col]
|
dt = self.data[col]
|
||||||
@@ -44,10 +47,11 @@ class Dataset:
|
|||||||
self.data = self.data.sample(frac=1)
|
self.data = self.data.sample(frac=1)
|
||||||
return self
|
return self
|
||||||
|
|
||||||
def as_ndarray(self) -> np.ndarray:
    """Return the underlying DataFrame as a plain numpy array (rows x columns)."""
    return self.data.to_numpy()
|
def get_index(self, column:str) -> int:
    """Return the positional index of *column* among the dataset's columns."""
    columns = self.data.columns
    return columns.get_loc(column)
|
||||||
|
|
||||||
class PrincipalComponentAnalisys:
|
class PrincipalComponentAnalisys:
|
||||||
def __init__(self, data:np.ndarray) -> None:
|
def __init__(self, data:np.ndarray) -> None:
|
||||||
41
src/learning/ml.py
Normal file
41
src/learning/ml.py
Normal file
@@ -0,0 +1,41 @@
|
|||||||
|
from abc import ABC, abstractmethod
|
||||||
|
from learning.data import Dataset
|
||||||
|
|
||||||
|
import numpy as np
|
||||||
|
|
||||||
|
|
||||||
|
class MLAlgorithm(ABC):
    """Base class for supervised learning algorithms.

    Stores the dataset plus its train/test split as raw numpy arrays and
    drives the generic learning loop; concrete subclasses implement one
    optimization step and a test-error measure.
    """

    dataset: Dataset      # source dataset wrapper
    testset: np.ndarray   # rows used to measure generalization error
    learnset: np.ndarray  # rows used for parameter updates

    def _set_dataset(self, dataset:Dataset, split:float=0.2):
        """Shuffle *dataset* and split its rows into learnset and testset.

        NOTE(review): with the default split=0.2 the learnset gets 20% of
        the rows and the testset the remaining 80% — this matches the code
        being replaced but looks inverted; confirm it is intended.
        """
        rows = dataset.shuffle().as_ndarray()
        cut = int(rows.shape[0] * split)

        self.dataset = dataset
        self.testset = rows[cut:]
        self.learnset = rows[:cut]

    def _split_data_target(self, dset:np.ndarray) -> tuple[np.ndarray, np.ndarray, int]:
        """Split *dset* into (features, target, row count); target is column 0."""
        features = np.delete(dset, 0, 1)
        target = dset[:, 0]
        return (features, target, dset.shape[0])

    def learn(self, times:int) -> tuple[list, list]:
        """Run at least one learning step, up to *times*; return the
        (training, test) error histories as two parallel lists."""
        train_errors = []
        test_errors = []
        for _ in range(max(1, times)):
            train_errors.append(self.learning_step())
            test_errors.append(self.test_error())
        return (train_errors, test_errors)

    @abstractmethod
    def learning_step(self) -> float:
        """Perform one optimization step and return the current training error."""

    @abstractmethod
    def test_error(self) -> float:
        """Return the current error on the test set."""
|
||||||
29
src/learning/supervised.py
Normal file
29
src/learning/supervised.py
Normal file
@@ -0,0 +1,29 @@
|
|||||||
|
import math as math
|
||||||
|
import numpy as np
|
||||||
|
|
||||||
|
from ml import MLAlgorithm
|
||||||
|
from learning.data import Dataset
|
||||||
|
|
||||||
|
class LinearRegression(MLAlgorithm):
    """Multivariate linear regression trained by batch gradient descent."""

    def __init__(self, dataset:Dataset, learning_rate:float=0.1) -> None:
        """Split *dataset* and initialise random parameters.

        The learning rate is clamped to be non-negative.
        """
        self._set_dataset(dataset)

        # One theta per feature column; the target column is excluded.
        n_params = dataset.data.shape[1] - 1
        self.theta = np.random.rand(n_params)
        self.alpha = max(0, learning_rate)

    def learning_step(self) -> float:
        """Apply one batch gradient-descent update; return the training error."""
        x, y, m = self._split_data_target(self.learnset)
        gradient = np.sum((x.dot(self.theta) - y) * x.T, axis=1)
        self.theta -= self.alpha * (1/m) * gradient
        return self._error(x, y, m)

    def test_error(self) -> float:
        """Return the error on the held-out test rows."""
        x, y, m = self._split_data_target(self.testset)
        return self._error(x, y, m)

    def _error(self, x:np.ndarray, y:np.ndarray, m:int) -> float:
        """Half mean squared error of predictions x·theta against y."""
        residual = x.dot(self.theta) - y
        return 1/(2*m) * np.sum(residual ** 2)
|
||||||
@@ -1,83 +0,0 @@
|
|||||||
import math as math
|
|
||||||
import numpy as np
|
|
||||||
import matplotlib.pyplot as plt
|
|
||||||
|
|
||||||
from data import Dataset
|
|
||||||
|
|
||||||
|
|
||||||
class LinearRegression:
|
|
||||||
def __init__(self, dataset:Dataset, learning_rate:float=0.1) -> None:
|
|
||||||
ndarray = dataset.shuffle().as_ndarray()
|
|
||||||
parameters = ndarray.shape[1] - 1 #removing the result
|
|
||||||
|
|
||||||
split = int(ndarray.shape[0] * 0.2)
|
|
||||||
self.testset = ndarray[split:]
|
|
||||||
self.trainingset = ndarray[:split]
|
|
||||||
|
|
||||||
self.theta = np.random.rand(parameters)
|
|
||||||
self.alpha = max(0, learning_rate)
|
|
||||||
|
|
||||||
def learn(self, times:int) -> list:
|
|
||||||
train = []
|
|
||||||
test = []
|
|
||||||
for _ in range(0, max(1, times)):
|
|
||||||
train.append(self.learning_step())
|
|
||||||
test.append(self.test_error())
|
|
||||||
return (train, test)
|
|
||||||
|
|
||||||
def learning_step(self) -> float:
|
|
||||||
theta = self.theta
|
|
||||||
alpha = self.alpha
|
|
||||||
x = np.delete(self.trainingset, 0, 1)
|
|
||||||
y = self.trainingset[:, 0]
|
|
||||||
m = self.trainingset.shape[0]
|
|
||||||
|
|
||||||
diff = (x.dot(theta) - y)
|
|
||||||
sum = np.sum(diff * x.T, axis=1)
|
|
||||||
theta -= alpha * (1/m) * sum
|
|
||||||
self.theta = theta
|
|
||||||
return self._error(x, y, m)
|
|
||||||
|
|
||||||
def test_error(self) -> float:
|
|
||||||
x = np.delete(self.testset, 0, 1)
|
|
||||||
y = self.testset[:, 0]
|
|
||||||
m = self.testset.shape[0]
|
|
||||||
return self._error(x, y, m)
|
|
||||||
|
|
||||||
def _error(self, x:np.ndarray, y:np.ndarray, m:int) -> float:
|
|
||||||
diff = (x.dot(self.theta) - y)
|
|
||||||
return 1/(2*m) * np.sum(diff ** 2)
|
|
||||||
|
|
||||||
def auto_mpg(epoch:int):
|
|
||||||
df = Dataset("datasets\\auto-mpg.csv", "MPG")
|
|
||||||
|
|
||||||
df.to_numbers(["HP"])
|
|
||||||
df.handle_na()
|
|
||||||
df.regularize(excepts=["Cylinders","Year","Origin"])
|
|
||||||
|
|
||||||
lr = LinearRegression(df, learning_rate=0.0001)
|
|
||||||
return lr.learn(epoch)
|
|
||||||
|
|
||||||
def automobile(epoch:int):
|
|
||||||
df = Dataset("datasets\\regression\\automobile.csv", "symboling")
|
|
||||||
|
|
||||||
attributes_to_modify = ["fuel-system", "engine-type", "drive-wheels", "body-style", "make", "engine-location", "aspiration", "fuel-type", "num-of-cylinders", "num-of-doors"]
|
|
||||||
df.factorize(attributes_to_modify)
|
|
||||||
df.to_numbers()
|
|
||||||
df.handle_na()
|
|
||||||
df.regularize(excepts=attributes_to_modify)
|
|
||||||
|
|
||||||
lr = LinearRegression(df, learning_rate=0.001)
|
|
||||||
return lr.learn(epoch)
|
|
||||||
|
|
||||||
|
|
||||||
if __name__ == '__main__':
|
|
||||||
epoch = 10000
|
|
||||||
skip = - int(epoch * 0.9)
|
|
||||||
err_train, err_test = auto_mpg(epoch)
|
|
||||||
plt.title("Error")
|
|
||||||
plt.xlabel("Time")
|
|
||||||
plt.ylabel("Mean Error")
|
|
||||||
plt.plot(err_train[skip:-1], color="red")
|
|
||||||
plt.plot(err_test[skip:-1], color="blue")
|
|
||||||
plt.show()
|
|
||||||
63
src/plot.py
Normal file
63
src/plot.py
Normal file
@@ -0,0 +1,63 @@
|
|||||||
|
import matplotlib.pyplot as plt
|
||||||
|
from typing_extensions import Self
|
||||||
|
|
||||||
|
class Plot:
    """Thin wrapper over a live matplotlib figure holding named line series."""

    def __init__(self, title:str, labelx:str, labely:str) -> None:
        """Create an interactive (non-blocking) figure with the given labels."""
        plt.title(title)
        plt.xlabel(labelx)
        plt.ylabel(labely)
        plt.ion()
        plt.show(block=False)

        # label -> (line artist, y data, x data, max points kept)
        self.data = dict()

    def wait(self) -> Self:
        """Switch to blocking mode and keep the window open until closed."""
        plt.ioff()
        plt.show()
        return self

    def scatter(self, label:str, datax:list[float], datay:list[float], color:str) -> Self:
        """Draw a labelled scatter series."""
        plt.scatter(datax, datay, color=color, label=label)
        return self

    def line(self, label:str, color:str, data:list[float]|None=None, max_length:int=100) -> Self:
        """Register a named line series, optionally pre-filled with *data*.

        BUG FIX: the default used to be the mutable `data=[]`; that single
        list was stored in self.data and mutated by update(), so it was
        shared across every call that omitted *data*. `None` now means a
        fresh empty list per call (backward-compatible for all callers).
        """
        if data is None:
            data = []
        line, = plt.plot(data if len(data) > 0 else [0], label=label, color=color)
        x = [] if len(data) == 0 else [*range(len(data))]

        self.data[label] = (line, data, x, max_length)
        plt.legend()
        return self

    def update(self, label:str, newdata:float) -> Self:
        """Append one point to the named series, trimming it to its max length."""
        # renamed from `max` to avoid shadowing the builtin
        line, datay, datax, max_length = self.data[label]

        x = 0 if len(datax) == 0 else datax[-1]
        datax.append(x + 1)
        datay.append(newdata)

        overflow = len(datax) - max_length
        if overflow > 0:
            del datax[:overflow]
            del datay[:overflow]

        line.set_data((datax, datay))
        return self

    def update_limits(self) -> Self:
        """Rescale both axes to the current data and flush a redraw.

        Raises:
            Exception: if the figure window has been closed.
        """
        if not plt.get_fignums():
            raise Exception("plot closed!")
        limy_top = 0.1
        limx_top, limx_bot = (0, 100000000000000000)

        for label in self.data:
            _, datay, datax, _ = self.data[label]
            limy_top = max(max(datay), limy_top)
            limx_top = max(max(datax), limx_top)
            limx_bot = min(min(datax), limx_bot)
        if limx_top == limx_bot:
            limx_top += 1

        plt.xlim(limx_bot, limx_top)
        plt.ylim(0, limy_top)
        plt.draw()
        plt.pause(0.0000000001)
        return self
|
||||||
|
|
||||||
Reference in New Issue
Block a user