Linear Regression
This commit is contained in:
62
src/data.py
Normal file
62
src/data.py
Normal file
@@ -0,0 +1,62 @@
|
||||
import pandas as pd
|
||||
import numpy as np
|
||||
|
||||
from typing_extensions import Self
|
||||
|
||||
class Dataset:
    """Tabular dataset read from a CSV file, with the target column moved first.

    The preprocessing methods modify the wrapped ``DataFrame`` and return
    ``self`` so that calls can be chained.
    """

    def __init__(self, csv:str, target:str) -> None:
        """Load ``csv`` with ``pd.read_csv`` and move ``target`` to column 0.

        Args:
            csv: Location of the CSV file, passed straight to ``pd.read_csv``.
            target: Name of the column that holds the value to predict.
        """
        data = pd.read_csv(csv)

        # move target to the start
        col_target = data.pop(target)
        data.insert(0, target, col_target)

        self.data = data
        self.target = target
        # True when the target column has object dtype.
        self.classification = (data[target].dtype == object)

    def regularize(self, excepts:list=[]) -> "Self":
        """Standardize columns: subtract the column mean, divide by its std.

        Args:
            excepts: Column names to leave untouched. The target column is
                always left untouched as well. The list is never modified.

        Returns:
            This dataset, for chaining.
        """
        # Work on a private list: appending the target to ``excepts`` itself
        # would mutate the caller's list and the shared default argument,
        # leaking skipped columns from one call into the next.
        skipped = [*excepts, self.target]
        for col in self.data:
            if col not in skipped:
                dt = self.data[col]
                self.data[col] = (dt - dt.mean()) / dt.std()
        return self

    def factorize(self, columns:list=[]) -> "Self":
        """Replace each listed column with the integer codes from ``pd.factorize``.

        Args:
            columns: Names of the columns to encode.

        Returns:
            This dataset, for chaining.
        """
        data = self.data
        for col in columns:
            # pd.factorize returns (codes, uniques); only the codes are kept.
            data[col] = pd.factorize(data[col])[0]
        return self

    def to_numbers(self, columns:list=[]) -> "Self":
        """Convert every object-dtype column to numbers.

        Values that cannot be parsed become NaN (``errors='coerce'``).

        Args:
            columns: Accepted for API compatibility.
                NOTE(review): this argument is not consulted; all object-dtype
                columns are converted regardless of its content.

        Returns:
            This dataset, for chaining.
        """
        data = self.data
        for col in self.data.columns:
            if data[col].dtype == object:
                data[col] = pd.to_numeric(data[col], errors='coerce')
        return self

    def handle_na(self) -> "Self":
        """Drop every row that contains a missing value and return ``self``."""
        self.data = self.data.dropna()
        return self

    def shuffle(self) -> "Self":
        """Put the rows in random order (``sample(frac=1)``) and return ``self``."""
        self.data = self.data.sample(frac=1)
        return self

    def as_ndarray(self, bias=True):
        """Return the data as a NumPy array, target in column 0.

        Args:
            bias: When true, a constant column named "Bias" filled with 1.0
                is inserted at position 1, right after the target.

        Returns:
            The array produced by ``DataFrame.to_numpy()``. The stored frame
            is not modified because the bias is added to a copy.
        """
        data = self.data.copy()
        if bias:
            data.insert(1, "Bias", 1.0)
        return data.to_numpy()
|
||||
|
||||
class PrincipalComponentAnalisys:
|
||||
def __init__(self, data:np.ndarray) -> None:
|
||||
self.data = data
|
||||
|
||||
def reduce(self, total:int=0, threshold:float=1) -> Self:
|
||||
columns = self.data.shape[1]
|
||||
if total > columns or total <= 0:
|
||||
total = columns
|
||||
if threshold <= 0 or threshold > 1:
|
||||
threshold = 1
|
||||
|
||||
83
src/linear_regression.py
Normal file
83
src/linear_regression.py
Normal file
@@ -0,0 +1,83 @@
|
||||
import math as math
|
||||
import numpy as np
|
||||
import matplotlib.pyplot as plt
|
||||
|
||||
from data import Dataset
|
||||
|
||||
|
||||
class LinearRegression:
    """Linear regression fitted by batch gradient descent.

    The data matrix is read as: column 0 is the value to predict, every
    remaining column is an input of the model.
    """

    def __init__(self, dataset:"Dataset", learning_rate:float=0.1) -> None:
        """Shuffle the dataset, split it, and draw random starting weights.

        Args:
            dataset: Source of the data; ``shuffle().as_ndarray()`` is called
                on it once.
            learning_rate: Gradient-descent step size; negative values are
                clamped to 0.
        """
        ndarray = dataset.shuffle().as_ndarray()
        parameters = ndarray.shape[1] - 1 #removing the result

        # Hold out the first 20% of the shuffled rows for testing and train
        # on the remaining 80% (the two slices used to be swapped, leaving
        # only 20% of the rows for training).
        split = int(ndarray.shape[0] * 0.2)
        self.testset = ndarray[:split]
        self.trainingset = ndarray[split:]

        self.theta = np.random.rand(parameters)
        self.alpha = max(0, learning_rate)

    def learn(self, times:int) -> tuple:
        """Run gradient descent and record the error after every step.

        Args:
            times: Number of steps; at least one step is always performed.

        Returns:
            A ``(train, test)`` tuple of lists holding the training-set and
            test-set error measured after each step.
        """
        train = []
        test = []
        for _ in range(max(1, times)):
            train.append(self.learning_step())
            test.append(self.test_error())
        return (train, test)

    def learning_step(self) -> float:
        """Apply one gradient-descent update to ``theta``.

        Returns:
            The training-set error computed with the updated weights.
        """
        x = self.trainingset[:, 1:]
        y = self.trainingset[:, 0]
        m = self.trainingset.shape[0]

        diff = (x.dot(self.theta) - y)
        # One entry per weight: sum over the rows of residual * input.
        gradient = np.sum(diff * x.T, axis=1)
        self.theta -= self.alpha * (1/m) * gradient
        return self._error(x, y, m)

    def test_error(self) -> float:
        """Return the error of the current weights on the test set."""
        x = self.testset[:, 1:]
        y = self.testset[:, 0]
        m = self.testset.shape[0]
        return self._error(x, y, m)

    def _error(self, x:np.ndarray, y:np.ndarray, m:int) -> float:
        """Return ``1/(2m)`` times the sum of squared residuals of ``x·theta - y``."""
        diff = (x.dot(self.theta) - y)
        return 1/(2*m) * np.sum(diff ** 2)
|
||||
|
||||
def auto_mpg(epoch:int):
    """Train a linear regression on the auto-mpg CSV, predicting "MPG".

    Object-dtype columns are coerced to numbers, rows with missing values are
    dropped, and every column except the target, "Cylinders", "Year" and
    "Origin" is standardized before training.

    Args:
        epoch: Number of gradient-descent steps to run.

    Returns:
        Whatever ``LinearRegression.learn`` returns for ``epoch`` steps.
    """
    # Forward slashes keep this relative path valid on POSIX systems and on
    # Windows; a backslash-separated literal only resolves on Windows.
    df = Dataset("datasets/auto-mpg.csv", "MPG")

    df.to_numbers(["HP"])
    df.handle_na()
    df.regularize(excepts=["Cylinders","Year","Origin"])

    lr = LinearRegression(df, learning_rate=0.0001)
    return lr.learn(epoch)
|
||||
|
||||
def automobile(epoch:int):
    """Train a linear regression on the automobile CSV, predicting "symboling".

    The categorical columns listed below are replaced by integer codes,
    remaining object-dtype columns are coerced to numbers, rows with missing
    values are dropped, and the columns that were not factorized (target
    excluded) are standardized before training.

    Args:
        epoch: Number of gradient-descent steps to run.

    Returns:
        Whatever ``LinearRegression.learn`` returns for ``epoch`` steps.
    """
    # Forward slashes keep this relative path valid on POSIX systems and on
    # Windows; a backslash-separated literal only resolves on Windows.
    df = Dataset("datasets/regression/automobile.csv", "symboling")

    attributes_to_modify = ["fuel-system", "engine-type", "drive-wheels", "body-style", "make", "engine-location", "aspiration", "fuel-type", "num-of-cylinders", "num-of-doors"]
    df.factorize(attributes_to_modify)
    df.to_numbers()
    df.handle_na()
    df.regularize(excepts=attributes_to_modify)

    lr = LinearRegression(df, learning_rate=0.001)
    return lr.learn(epoch)
|
||||
|
||||
|
||||
if __name__ == '__main__':
    # Train on auto-mpg, then plot the training (red) and test (blue) error.
    epochs = 10000
    # Negative start index: the plotted window begins 90% of `epochs` entries
    # before the end of each error list.
    tail_start = - int(epochs * 0.9)
    train_errors, test_errors = auto_mpg(epochs)

    plt.title("Error")
    plt.xlabel("Time")
    plt.ylabel("Mean Error")
    for errors, colour in ((train_errors, "red"), (test_errors, "blue")):
        # The slice stops before the last recorded value.
        plt.plot(errors[tail_start:-1], color=colour)
    plt.show()
|
||||
Reference in New Issue
Block a user