diff --git a/.gitignore b/.gitignore
index ed8ebf5..9f7550b 100644
--- a/.gitignore
+++ b/.gitignore
@@ -1 +1,2 @@
-__pycache__
\ No newline at end of file
+__pycache__
+.venv
diff --git a/requirements b/requirements
index bf93cc4..7174011 100644
--- a/requirements
+++ b/requirements
@@ -1,3 +1,4 @@
 matplotlib
 pandas
-tqdm
+tqdm
+scikit-learn
diff --git a/src/app.py b/src/app.py
index 9e3c932..cef3321 100644
--- a/src/app.py
+++ b/src/app.py
@@ -25,7 +25,7 @@ def auto_mpg() -> tuple[Dataset, MLAlgorithm, Any]:
     ds.numbers(["HP"])
     ds.handle_na()
     ds.normalize(excepts=["Cylinders","Year","Origin"])
-    return (ds, LinearRegression(ds, learning_rate=0.0001), sklearn.linear_model.LinearRegression())
+    return (ds, LinearRegression(ds, learning_rate=0.0001), sklearn.linear_model.SGDRegressor())
 
 def automobile() -> tuple[Dataset, MLAlgorithm, Any]:
     ds = Dataset(REGRESSION + "automobile.csv", "symboling", TargetType.Regression)
@@ -35,12 +35,12 @@ def automobile() -> tuple[Dataset, MLAlgorithm, Any]:
     ds.numbers(["normalized-losses", "bore", "stroke", "horsepower", "peak-rpm", "price"])
     ds.handle_na()
     ds.normalize(excepts=attributes_to_modify)
-    return (ds, LinearRegression(ds, learning_rate=0.004), sklearn.linear_model.LinearRegression())
+    return (ds, LinearRegression(ds, learning_rate=0.003), sklearn.linear_model.SGDRegressor())
 
 def power_plant() -> tuple[Dataset, MLAlgorithm, Any]:
     ds = Dataset(REGRESSION + "power-plant.csv", "energy-output", TargetType.Regression)
-    ds.normalize()
-    return (ds, LinearRegression(ds, learning_rate=0.1), sklearn.linear_model.LinearRegression())
+    ds.normalize(excepts=None)
+    return (ds, LinearRegression(ds, learning_rate=0.1), sklearn.linear_model.SGDRegressor())
 
 # ********************
 # Logistic Regression
@@ -101,7 +101,7 @@ def iris_no_target() -> tuple[Dataset, MLAlgorithm, Any]:
 if __name__ == "__main__":
     np.set_printoptions(linewidth=np.inf, formatter={'float': '{:>10.5f}'.format})
     rand = np.random.randint(0, 4294967295)
-    #rand = 1997847910 # LiR for power_plant
+    #rand = 2205910060 # LiR for power_plant
     #rand = 347617386 # LoR for electrical_grid
     #rand = 1793295160 # MLP for iris
     #rand = 2914000170 # MLP for frogs
@@ -110,7 +110,7 @@ if __name__ == "__main__":
     np.random.seed(rand)
     print(f"Using seed: {rand}")
 
-    ds, ml, sk = frogs()
+    ds, ml, sk = power_plant()
     epochs, _, _ = ml.learn(1000, verbose=True)
     ml.display_results()
 
diff --git a/src/learning/data.py b/src/learning/data.py
index 03b3fd8..8383032 100644
--- a/src/learning/data.py
+++ b/src/learning/data.py
@@ -52,10 +52,8 @@ class Dataset:
 
         for col in self.data:
             if col not in excepts:
-                index = self.data.columns.get_loc(col)
-                datacol = self.data.pop(col)
-                datacol = (datacol - datacol.mean()) / datacol.std()
-                self.data.insert(index, col, datacol)
+                datacol = self.data[col]
+                self.data[col] = (datacol - datacol.mean()) / datacol.std()
         return self
 
     def factorize(self, columns:list[str]=[]) -> Self:
diff --git a/src/learning/functions.py b/src/learning/functions.py
index d5b6068..cb66138 100644
--- a/src/learning/functions.py
+++ b/src/learning/functions.py
@@ -44,6 +44,13 @@ def cross_entropy_loss(h0:np.ndarray, y:np.ndarray) -> float:
 # **********
 # Randoms
 # **********
+def pearson(h0:np.ndarray, y:np.ndarray) -> float:
+    diff1 = h0 - h0.mean()
+    diff2 = y - y.mean()
+    num = np.sum(diff1 * diff2)
+    den = np.sqrt(np.sum(diff1**2)) * np.sqrt(np.sum(diff2**2))
+    return num / den
+
 def r_squared(h0:np.ndarray, y:np.ndarray) -> float:
     y_mean = np.mean(y)
     ss_resid = np.sum((y - h0) ** 2)
diff --git a/src/learning/ml.py b/src/learning/ml.py
index a72f1d4..5899969 100644
--- a/src/learning/ml.py
+++ b/src/learning/ml.py
@@ -5,7 +5,7 @@ from abc import ABC, abstractmethod
 from plot import Plot
 from tqdm import tqdm
 from learning.data import ConfusionMatrix, Dataset, Data, TargetType
-from learning.functions import r_squared
+from learning.functions import pearson, r_squared
 
 class MLAlgorithm(ABC):
     """ Classe generica per gli algoritmi di Machine Learning """
@@ -83,6 +83,7 @@ class MLAlgorithm(ABC):
         print(f"Loss valid : {self.validation_loss():0.5f}")
         print(f"Loss test  : {self.test_loss():0.5f}")
         if self._target_type == TargetType.Regression:
+            print(f"Pearson    : {self.test_pearson():0.5f}")
             print(f"R^2        : {self.test_r_squared():0.5f}")
         elif self._target_type != TargetType.NoTarget:
             conf = self.test_confusion_matrix()
@@ -106,6 +107,11 @@ class MLAlgorithm(ABC):
             y = np.argmax(y, axis=1)
         return ConfusionMatrix(y, h0)
 
+    def test_pearson(self) -> float:
+        if self._target_type != TargetType.Regression:
+            return 0
+        return pearson(self._h0(self._testset.x), self._testset.y)
+
     def test_r_squared(self) -> float:
         if self._target_type != TargetType.Regression:
             return 0
diff --git a/src/learning/supervised.py b/src/learning/supervised.py
index aac983f..d95334a 100644
--- a/src/learning/supervised.py
+++ b/src/learning/supervised.py
@@ -19,11 +19,13 @@ class GradientDescent(MLAlgorithm):
 
     def _learning_step(self) -> float:
         x, y, m, _ = self._learnset.as_tuple()
+        h0 = self._h0(x)
         regularization = (self.lambd / m) * self.theta
         regularization[0] = 0
-        derivative = self.alpha * np.mean((self._h0(x) - y) * with_bias(x).T, axis=1)
-        self.theta -= derivative + regularization
+
+        derivative = np.mean((h0 - y) * with_bias(x).T, axis=1)
+        self.theta -= self.alpha * derivative + regularization
         return self._loss(x, y)
 
     def _predict_loss(self, dataset:Data) -> float:
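
Aside, not part of the patch: written out as an update rule, the reworked `_learning_step` in `src/learning/supervised.py` performs

    grad  = mean((h0 - y) * with_bias(x).T, axis=1)
    theta <- theta - alpha * grad - (lambda / m) * theta    # bias entry of the L2 term zeroed

i.e. `h0` is evaluated once per step and the learning rate `alpha` scales only the data gradient, while the L2 term (with `regularization[0] = 0` excluding the bias) is applied unscaled. The textbook L2 update scales both terms by `alpha`; the form above is exactly what `self.theta -= self.alpha * derivative + regularization` implements.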
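
Aside, not part of the patch: the new `pearson` helper in `src/learning/functions.py` computes the sample Pearson correlation r = sum((h0 - mean(h0)) * (y - mean(y))) / (sqrt(sum((h0 - mean(h0))^2)) * sqrt(sum((y - mean(y))^2))). A minimal sanity check against NumPy's built-in `np.corrcoef` (the test data below is invented for illustration):

    import numpy as np

    def pearson(h0: np.ndarray, y: np.ndarray) -> float:
        # Same formula as the helper added in src/learning/functions.py
        diff1 = h0 - h0.mean()
        diff2 = y - y.mean()
        num = np.sum(diff1 * diff2)
        den = np.sqrt(np.sum(diff1**2)) * np.sqrt(np.sum(diff2**2))
        return num / den

    rng = np.random.default_rng(0)
    y = rng.normal(size=100)
    h0 = 0.8 * y + rng.normal(scale=0.5, size=100)   # noisy "predictions"

    # np.corrcoef returns the 2x2 correlation matrix; entry [0, 1] is r
    assert np.isclose(pearson(h0, y), np.corrcoef(h0, y)[0, 1])

Reporting r alongside R^2 is informative because R^2 = 1 - SS_res/SS_tot can go negative on held-out data, while r always stays within [-1, 1].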
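
Aside, not part of the patch: the sklearn baseline changes from `LinearRegression` (closed-form least squares) to `SGDRegressor`, which fits the same linear model by stochastic gradient descent and is therefore the closer counterpart to the project's own `LinearRegression(ds, learning_rate=...)`. Unlike the closed-form solver, SGD is sensitive to feature scaling, which is consistent with the `ds.normalize(...)` calls throughout `src/app.py`. A hypothetical, self-contained sketch of that behaviour (data and coefficients below are made up):

    import numpy as np
    from sklearn.linear_model import SGDRegressor
    from sklearn.preprocessing import StandardScaler

    rng = np.random.default_rng(0)
    X = rng.normal(size=(200, 3)) * np.array([1.0, 50.0, 1000.0])  # badly scaled features
    y = X @ np.array([2.0, 0.1, 0.001]) + rng.normal(size=200)

    # SGDRegressor minimizes the squared loss by stochastic gradient descent,
    # so it needs standardized inputs to converge reliably.
    X_std = StandardScaler().fit_transform(X)
    model = SGDRegressor(loss="squared_error").fit(X_std, y)
    print(f"R^2: {model.score(X_std, y):0.5f}")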