Linear Regression

2024-04-17 20:37:41 +02:00
parent d7e7664c37
commit e362cff0c7
9 changed files with 10746 additions and 2 deletions
--- a/src/data.py
+++ b/src/data.py
@@ -0,0 +1,62 @@
+import pandas as pd
+import numpy as np
+
+from typing_extensions import Self
+
+class Dataset:
+    """Tabular dataset read from a CSV file, with the target column kept first.
+
+    The preprocessing methods change ``self.data`` and return ``self`` so that
+    calls can be chained.
+    """
+
+    def __init__(self, csv:str, target:str) -> None:
+        """Read ``csv`` with pandas and move the ``target`` column to position 0.
+
+        ``classification`` is True when the target column has ``object`` dtype.
+        """
+        data = pd.read_csv(csv)
+
+        # move target to the start
+        col_target = data.pop(target)
+        data.insert(0, target, col_target)
+
+        self.data = data
+        self.target = target
+        self.classification = (data[target].dtype == object)
+
+    def regularize(self, excepts:"list | None"=None) -> Self:
+        """Standardise columns to zero mean and unit standard deviation.
+
+        The target column and every column named in ``excepts`` are left
+        untouched. ``excepts`` itself is never modified.
+        """
+        # Build a private list: appending the target to the argument would
+        # alter the caller's list (and a shared default) between calls.
+        skipped = [*(excepts or ()), self.target]
+        for col in self.data:
+            if col in skipped:
+                continue
+            values = self.data[col]
+            spread = values.std()
+            # A constant (or single-row) column has a zero/undefined spread;
+            # dividing by it would fill the column with NaN, so only centre it.
+            if not spread > 0:
+                spread = 1.0
+            self.data[col] = (values - values.mean()) / spread
+        return self
+
+    def factorize(self, columns:"list | None"=None) -> Self:
+        """Replace each listed column with the integer codes from ``pd.factorize``."""
+        data = self.data
+        for col in columns or ():
+            data[col] = pd.factorize(data[col])[0]
+        return self
+
+    def to_numbers(self, columns:"list | None"=None) -> Self:
+        """Convert ``object`` columns to numbers; unparsable values become NaN.
+
+        When ``columns`` is given and non-empty only those columns are
+        considered, otherwise every column of the frame is.
+        """
+        data = self.data
+        if columns is None or len(columns) == 0:
+            candidates = data.columns
+        else:
+            candidates = columns
+        for col in candidates:
+            if data[col].dtype == object:
+                data[col] = pd.to_numeric(data[col], errors='coerce')
+        return self
+
+    def handle_na(self) -> Self:
+        """Drop every row that contains a missing value."""
+        self.data = self.data.dropna()
+        return self
+
+    def shuffle(self) -> Self:
+        """Put the rows in a random order."""
+        self.data = self.data.sample(frac=1)
+        return self
+
+    def as_ndarray(self, bias=True):
+        """Return a copy of the data as a NumPy array.
+
+        With ``bias`` set, a constant column "Bias" of 1.0 is inserted at
+        position 1, directly after the target column.
+        """
+        data = self.data.copy()
+        if bias:
+            data.insert(1, "Bias", 1.0)
+        return data.to_numpy()
+
+class PrincipalComponentAnalisys:
+    """Wrapper around an array on which a component reduction is to be run."""
+
+    def __init__(self, data:np.ndarray) -> None:
+        """Keep a reference to ``data``; the array is not copied."""
+        self.data = data
+
+    def reduce(self, total:int=0, threshold:float=1) -> Self:
+        """Sanitise the reduction settings.
+
+        ``total`` falls back to the number of columns of the data when it is
+        not in ``1..columns``; ``threshold`` falls back to 1 when it is not in
+        ``(0, 1]``.
+
+        NOTE(review): the body stops after sanitising its arguments - nothing
+        is computed and the method implicitly returns None despite the
+        ``Self`` annotation.
+        """
+        n_columns = self.data.shape[1]
+        if not 0 < total <= n_columns:
+            total = n_columns
+        if not 0 < threshold <= 1:
+            threshold = 1
+
--- a/src/linear_regression.py
+++ b/src/linear_regression.py
@@ -0,0 +1,83 @@
+import math as math
+import numpy as np
+import matplotlib.pyplot as plt
+
+from data import Dataset
+
+
+class LinearRegression:
+    """Linear model fitted by batch gradient descent.
+
+    Column 0 of the dataset array is the value to predict; every remaining
+    column (including the bias column added by the dataset) is an input.
+    """
+
+    def __init__(self, dataset:"Dataset", learning_rate:float=0.1) -> None:
+        """Shuffle ``dataset``, split it 80/20 into training/test rows and
+        start from random weights.
+
+        A negative ``learning_rate`` is clamped to 0.
+
+        Raises:
+            ValueError: if the dataset has no rows.
+        """
+        ndarray = dataset.shuffle().as_ndarray()
+        if ndarray.shape[0] == 0:
+            raise ValueError("cannot fit a linear regression on an empty dataset")
+        parameters = ndarray.shape[1] - 1 #removing the result
+
+        # The first 20% of the shuffled rows are held out for testing and the
+        # remaining 80% are used for training (the two slices used to be
+        # swapped, which trained the model on the small part only).
+        split = int(ndarray.shape[0] * 0.2)
+        self.testset = ndarray[:split]
+        self.trainingset = ndarray[split:]
+
+        self.theta = np.random.rand(parameters)
+        self.alpha = max(0, learning_rate)
+
+    def learn(self, times:int) -> tuple:
+        """Run ``times`` gradient steps (at least one).
+
+        Returns a ``(train_errors, test_errors)`` tuple of lists holding the
+        error measured after every step.
+        """
+        train = []
+        test = []
+        for _ in range(max(1, times)):
+            train.append(self.learning_step())
+            test.append(self.test_error())
+        return (train, test)
+
+    def learning_step(self) -> float:
+        """Apply one gradient-descent update to ``theta`` in place and return
+        the training error measured after the update."""
+        x = self.trainingset[:, 1:]
+        y = self.trainingset[:, 0]
+        m = self.trainingset.shape[0]
+
+        residual = x.dot(self.theta) - y
+        # x.T @ residual sums residual * feature over the rows for each weight
+        gradient = x.T.dot(residual) / m
+        self.theta -= self.alpha * gradient
+        return self._error(x, y, m)
+
+    def test_error(self) -> float:
+        """Return the error of the current weights on the held-out rows."""
+        x = self.testset[:, 1:]
+        y = self.testset[:, 0]
+        m = self.testset.shape[0]
+        return self._error(x, y, m)
+
+    def _error(self, x:np.ndarray, y:np.ndarray, m:int) -> float:
+        """Half the mean squared error of ``x . theta`` against ``y``.
+
+        Returns NaN when ``m`` is 0, since the mean of no rows is undefined.
+        """
+        if m == 0:
+            return float("nan")
+        diff = (x.dot(self.theta) - y)
+        return np.sum(diff ** 2) / (2 * m)
+
+def auto_mpg(epoch:int):
+    """Train a linear regression predicting "MPG" from the auto-mpg CSV.
+
+    Runs ``epoch`` learning steps and returns what ``LinearRegression.learn``
+    returns (the training and test error histories).
+    """
+    # Forward slashes resolve on Windows as well as on POSIX systems, whereas
+    # a backslash-separated path only works on Windows.
+    df = Dataset("datasets/auto-mpg.csv", "MPG")
+
+    df.to_numbers(["HP"])
+    df.handle_na()
+    # these three columns are left unscaled
+    df.regularize(excepts=["Cylinders","Year","Origin"])
+
+    lr = LinearRegression(df, learning_rate=0.0001)
+    return lr.learn(epoch)
+
+def automobile(epoch:int):
+    """Train a linear regression predicting "symboling" from the automobile CSV.
+
+    Runs ``epoch`` learning steps and returns what ``LinearRegression.learn``
+    returns (the training and test error histories).
+    """
+    # Forward slashes resolve on Windows as well as on POSIX systems, whereas
+    # a backslash-separated path only works on Windows.
+    df = Dataset("datasets/regression/automobile.csv", "symboling")
+
+    # columns that are integer-encoded and then excluded from scaling
+    attributes_to_modify = ["fuel-system", "engine-type", "drive-wheels", "body-style", "make", "engine-location", "aspiration", "fuel-type", "num-of-cylinders", "num-of-doors"]
+    df.factorize(attributes_to_modify)
+    df.to_numbers()
+    df.handle_na()
+    df.regularize(excepts=attributes_to_modify)
+
+    lr = LinearRegression(df, learning_rate=0.001)
+    return lr.learn(epoch)
+
+
+if __name__ == '__main__':
+    # Train on the auto-mpg data and plot both error curves.
+    epoch = 10000
+    # Hide the first tenth of the run so the steep initial drop does not
+    # flatten the rest of the curve. Slicing to the end keeps the final epoch,
+    # which a "[skip:-1]" slice would leave out.
+    first_shown = epoch // 10
+    err_train, err_test = auto_mpg(epoch)
+    plt.title("Error")
+    plt.xlabel("Time")
+    plt.ylabel("Mean Error")
+    plt.plot(err_train[first_shown:], color="red")   # training error
+    plt.plot(err_test[first_shown:], color="blue")   # test error
+    plt.show()