From 1fb277bc702076512343997c2ef2a9b64087c617 Mon Sep 17 00:00:00 2001
From: Berack96
Date: Mon, 22 Apr 2024 15:41:13 +0200
Subject: [PATCH] Progress Bar

- added progress bar
- divided dataset into validation, test, learning
- added patience for learning
---
 src/app.py                 | 39 +++++++++--------
 src/learning/ml.py         | 86 ++++++++++++++++++++++++--------------
 src/learning/supervised.py | 13 +++++-
 3 files changed, 88 insertions(+), 50 deletions(-)

diff --git a/src/app.py b/src/app.py
index e1e26c7..5f7d241 100644
--- a/src/app.py
+++ b/src/app.py
@@ -1,16 +1,17 @@
 from learning.data import Dataset
 from learning.supervised import LinearRegression
 from learning.ml import MLRegression
+from typing import Callable
 
-def auto_mpg() -> tuple[int, int, MLRegression]:
+def auto_mpg() -> tuple[int, MLRegression]:
     df = Dataset("datasets\\auto-mpg.csv", "MPG")
 
     df.to_numbers(["HP"])
     df.handle_na()
     df.regularize(excepts=["Cylinders","Year","Origin"])
-    return (5000, 1000, LinearRegression(df, learning_rate=0.0001))
+    return (1000, LinearRegression(df, learning_rate=0.0001))
 
-def automobile() -> tuple[int, int, MLRegression]:
+def automobile() -> tuple[int, MLRegression]:
     df = Dataset("datasets\\regression\\automobile.csv", "symboling")
 
     attributes_to_modify = ["fuel-system", "engine-type", "drive-wheels", "body-style", "make", "engine-location", "aspiration", "fuel-type", "num-of-cylinders", "num-of-doors"]
@@ -18,23 +19,27 @@ def automobile() -> tuple[int, int, MLRegression]:
     df.to_numbers(["normalized-losses", "bore", "stroke", "horsepower", "peak-rpm", "price"])
     df.handle_na()
     df.regularize(excepts=attributes_to_modify)
-    return (5000, 1000, LinearRegression(df, learning_rate=0.002))
+    return (1000, LinearRegression(df, learning_rate=0.004))
 
-def power_plant() -> tuple[int, int, MLRegression]:
+def power_plant() -> tuple[int, MLRegression]:
     df = Dataset("datasets\\regression\\power-plant.csv", "energy-output")
     df.regularize()
-    return (1000, 80, LinearRegression(df, learning_rate=0.1))
+    return (80, LinearRegression(df, learning_rate=0.1))
 
-epoch, skip, ml = automobile()
-ml.learn(epoch)
-ml.plot(skip=skip)
-"""
-for _ in range(0, epoch):
-    train_err = lr.learning_step()
-    test_err = lr.test_error()
-    plot.update("training", train_err)
-    plot.update("test", test_err)
-    plot.update_limits()
-"""
+def learn_dataset(function:Callable[..., tuple[int, MLRegression]], epochs:int=100000, verbose=True)-> None:
+    skip, ml = function()
+    ml.learn(epochs, verbose=verbose)
+
+    err_tests = ml.test_loss()
+    err_valid = ml.validation_loss()
+    err_learn = ml.learning_loss()
+    print(f"Loss value: tests={err_tests:1.5f}, valid={err_valid:1.5f}, learn={err_learn:1.5f}")
+
+    ml.plot(skip=skip)
+
+
+
+if __name__ == "__main__":
+    learn_dataset(auto_mpg)
diff --git a/src/learning/ml.py b/src/learning/ml.py
index 8ce3ce7..2b21fe1 100644
--- a/src/learning/ml.py
+++ b/src/learning/ml.py
@@ -1,6 +1,7 @@
 from abc import ABC, abstractmethod
 from learning.data import Dataset
 from plot import Plot
+from tqdm import tqdm
 
 import numpy as np
 
@@ -11,16 +12,18 @@ class MLAlgorithm(ABC):
     dataset: Dataset
     testset: np.ndarray
     learnset: np.ndarray
-    test_error: list[float]
-    train_error: list[float]
+    _valid_loss: list[float]
+    _train_loss: list[float]
 
     def _set_dataset(self, dataset:Dataset, split:float=0.2):
         ndarray = dataset.shuffle().as_ndarray()
-        split = int(ndarray.shape[0] * split)
+        splitT = int(ndarray.shape[0] * split)
+        splitV = int(splitT / 2)
 
         self.dataset = dataset
-        self.testset = ndarray[split:]
-        self.learnset = ndarray[:split]
+        self.validset = ndarray[:splitV]
+        self.testset = ndarray[splitV:splitT]
+        self.learnset = ndarray[splitT:]
 
     def _split_data_target(self, dset:np.ndarray) -> tuple[np.ndarray, np.ndarray, int]:
         x = np.delete(dset, 0, 1)
@@ -28,43 +31,64 @@ class MLAlgorithm(ABC):
         m = dset.shape[0]
         return (x, y, m)
 
-    def learn(self, times:int) -> tuple[list, list]:
-        _, train, test = self.learn_until(times)
-        return (train, test)
-
-    def learn_until(self, max_iter:int=1000000, delta:float=0.0) -> tuple[int, list, list]:
-        train = []
-        test = []
-        prev = None
+    def learn(self, epochs:int, early_stop:float=0.0000001, max_patience:int=10, verbose:bool=False) -> tuple[int, list, list]:
+        learn = []
+        valid = []
         count = 0
+        patience = 0
+        trange = range(epochs)
+        if verbose: trange = tqdm(trange, bar_format="Epochs {percentage:3.0f}% [{bar}] {elapsed}{postfix}")
 
-        while count < max_iter and (prev == None or prev - train[-1] > delta):
-            count += 1
-            prev = train[-1] if len(train) > 0 else None
+        try:
+            for _ in trange:
+                if count > 1 and valid[-2] - valid[-1] < early_stop:
+                    if patience >= max_patience:
+                        self.set_parameters(backup)
+                        break
+                    patience += 1
+                else:
+                    backup = self.get_parameters()
+                    patience = 0
 
-            train.append(self.learning_step())
-            test.append(self.test_error())
+                count += 1
+                learn.append(self.learning_step())
+                valid.append(self.validation_loss())
+
+                if verbose: trange.set_postfix({"learn": f"{learn[-1]:2.5f}", "validation": f"{valid[-1]:2.5f}"})
+        except KeyboardInterrupt: pass
+        if verbose: print(f"Loop ended after {count} epochs")
+
+        self._train_loss = learn
+        self._valid_loss = valid
+        return (count, learn, valid)
+
+    def learning_loss(self) -> float:
+        return self.predict_loss(self.learnset)
+
+    def validation_loss(self) -> float:
+        return self.predict_loss(self.validset)
+
+    def test_loss(self) -> float:
+        return self.predict_loss(self.testset)
 
-        self.train_error = train
-        self.test_error = test
-        return (count, train, test)
 
     @abstractmethod
-    def learning_step(self) -> float:
-        pass
-
+    def learning_step(self) -> float: pass
     @abstractmethod
-    def test_error(self) -> float:
-        pass
-
+    def predict_loss(self, dataset:np.ndarray) -> float: pass
     @abstractmethod
-    def plot(self, skip:int=1000) -> None:
-        pass
+    def plot(self, skip:int=1000) -> None: pass
+    @abstractmethod
+    def get_parameters(self): pass
+    @abstractmethod
+    def set_parameters(self, parameters): pass
+
 
 class MLRegression(MLAlgorithm):
     def plot(self, skip:int=1000) -> None:
+        skip = skip if len(self._train_loss) > skip else 0
         plot = Plot("Error", "Time", "Mean Error")
-        plot.line("training", "blue", data=self.train_error[skip:])
-        plot.line("test", "red", data=self.test_error[skip:])
+        plot.line("training", "blue", data=self._train_loss[skip:])
+        plot.line("validation", "red", data=self._valid_loss[skip:])
         plot.wait()
diff --git a/src/learning/supervised.py b/src/learning/supervised.py
index e34c87f..76c0c9b 100644
--- a/src/learning/supervised.py
+++ b/src/learning/supervised.py
@@ -5,6 +5,9 @@ from learning.ml import MLRegression
 from learning.data import Dataset
 
 class LinearRegression(MLRegression):
+    theta:np.ndarray
+    alpha:float
+
     def __init__(self, dataset:Dataset, learning_rate:float=0.1) -> None:
         self._set_dataset(dataset)
 
@@ -20,10 +23,16 @@ class LinearRegression(MLRegression):
         self.theta -= alpha * (1/m) * np.sum((x.dot(theta) - y) * x.T, axis=1)
         return self._error(x, y, m)
 
-    def test_error(self) -> float:
-        x, y, m = self._split_data_target(self.testset)
+    def predict_loss(self, dataset:np.ndarray) -> float:
+        x, y, m = self._split_data_target(dataset)
         return self._error(x, y, m)
 
     def _error(self, x:np.ndarray, y:np.ndarray, m:int) -> float:
         diff = (x.dot(self.theta) - y)
         return 1/(2*m) * np.sum(diff ** 2)
+
+    def get_parameters(self):
+        return self.theta.copy()
+
+    def set_parameters(self, parameters):
+        self.theta = parameters
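
Note on the new entry point in src/app.py (not part of the patch): with the if __name__ == "__main__" guard, learn_dataset can also be driven from another script or a REPL instead of editing app.py. A minimal usage sketch, assuming src/ is on the import path; the epoch count and verbosity below are illustrative values, not taken from the patch:

    from app import learn_dataset, automobile, power_plant

    # quiet run: no tqdm bar, fewer epochs than the 100000 default
    learn_dataset(power_plant, epochs=50000, verbose=False)

    # default run: tqdm bar showing the running learn/validation losses as postfix
    learn_dataset(automobile)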
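
Note on the new split in MLAlgorithm._set_dataset (not part of the patch): the split fraction is applied to the whole shuffled array and then halved, so the default split=0.2 yields roughly 10% validation, 10% test and 80% learning data. A worked example with an illustrative row count:

    N = 400 rows, split = 0.2
    splitT = int(400 * 0.2) = 80
    splitV = int(80 / 2)    = 40
    validset = rows[0:40]     (10%)
    testset  = rows[40:80]    (10%)
    learnset = rows[80:400]   (80%)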
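
Note on the update rule visible in the LinearRegression hunk (not part of the patch): the context lines are plain batch gradient descent on the half mean-squared-error cost J(theta) = 1/(2m) * sum((x.theta - y)^2), which is what _error returns. A standalone sketch of the same step, assuming x already carries a bias column; the function and variable names here are illustrative only:

    import numpy as np

    def gradient_descent_step(theta, x, y, alpha):
        # theta <- theta - alpha/m * X^T (X theta - y); x.T.dot(diff) is the same
        # quantity as the np.sum((x.dot(theta) - y) * x.T, axis=1) form in the patch
        m = x.shape[0]
        diff = x.dot(theta) - y
        theta = theta - alpha * (1 / m) * x.T.dot(diff)
        cost = 1 / (2 * m) * np.sum(diff ** 2)  # value reported by _error
        return theta, cost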