Fixes for Presentation

This commit is contained in:
2024-08-19 20:58:24 +02:00
parent 142fe5ccdf
commit 8b1c149535
7 changed files with 30 additions and 15 deletions

3
.gitignore vendored
View File

@@ -1 +1,2 @@
__pycache__
__pycache__
.venv

View File

@@ -1,3 +1,4 @@
matplotlib
pandas
tqdm
tqdm
scikit-learn

View File

@@ -25,7 +25,7 @@ def auto_mpg() -> tuple[Dataset, MLAlgorithm, Any]:
ds.numbers(["HP"])
ds.handle_na()
ds.normalize(excepts=["Cylinders","Year","Origin"])
return (ds, LinearRegression(ds, learning_rate=0.0001), sklearn.linear_model.LinearRegression())
return (ds, LinearRegression(ds, learning_rate=0.0001), sklearn.linear_model.SGDRegressor())
def automobile() -> tuple[Dataset, MLAlgorithm, Any]:
ds = Dataset(REGRESSION + "automobile.csv", "symboling", TargetType.Regression)
@@ -35,12 +35,12 @@ def automobile() -> tuple[Dataset, MLAlgorithm, Any]:
ds.numbers(["normalized-losses", "bore", "stroke", "horsepower", "peak-rpm", "price"])
ds.handle_na()
ds.normalize(excepts=attributes_to_modify)
return (ds, LinearRegression(ds, learning_rate=0.004), sklearn.linear_model.LinearRegression())
return (ds, LinearRegression(ds, learning_rate=0.003), sklearn.linear_model.SGDRegressor())
def power_plant() -> tuple[Dataset, MLAlgorithm, Any]:
ds = Dataset(REGRESSION + "power-plant.csv", "energy-output", TargetType.Regression)
ds.normalize()
return (ds, LinearRegression(ds, learning_rate=0.1), sklearn.linear_model.LinearRegression())
ds.normalize(excepts=None)
return (ds, LinearRegression(ds, learning_rate=0.1), sklearn.linear_model.SGDRegressor())
# ********************
# Logistic Regression
@@ -101,7 +101,7 @@ def iris_no_target() -> tuple[Dataset, MLAlgorithm, Any]:
if __name__ == "__main__":
np.set_printoptions(linewidth=np.inf, formatter={'float': '{:>10.5f}'.format})
rand = np.random.randint(0, 4294967295)
#rand = 1997847910 # LiR for power_plant
#rand = 2205910060 # LiR for power_plant
#rand = 347617386 # LoR for electrical_grid
#rand = 1793295160 # MLP for iris
#rand = 2914000170 # MLP for frogs
@@ -110,7 +110,7 @@ if __name__ == "__main__":
np.random.seed(rand)
print(f"Using seed: {rand}")
ds, ml, sk = frogs()
ds, ml, sk = power_plant()
epochs, _, _ = ml.learn(1000, verbose=True)
ml.display_results()

View File

@@ -52,10 +52,8 @@ class Dataset:
for col in self.data:
if col not in excepts:
index = self.data.columns.get_loc(col)
datacol = self.data.pop(col)
datacol = (datacol - datacol.mean()) / datacol.std()
self.data.insert(index, col, datacol)
datacol = self.data[col]
self.data[col] = (datacol - datacol.mean()) / datacol.std()
return self
def factorize(self, columns:list[str]=[]) -> Self:

View File

@@ -44,6 +44,13 @@ def cross_entropy_loss(h0:np.ndarray, y:np.ndarray) -> float:
# Randoms
# **********
def pearson(h0:np.ndarray, y:np.ndarray) -> float:
    """Pearson correlation coefficient between predictions h0 and targets y.

    Computed as cov(h0, y) / (||h0 - mean|| * ||y - mean||); returns a value
    in [-1, 1] (NaN when either input has zero variance).
    """
    centered_h0 = h0 - np.mean(h0)
    centered_y = y - np.mean(y)
    covariance = (centered_h0 * centered_y).sum()
    scale = np.sqrt((centered_h0 ** 2).sum()) * np.sqrt((centered_y ** 2).sum())
    return covariance / scale
def r_squared(h0:np.ndarray, y:np.ndarray) -> float:
y_mean = np.mean(y)
ss_resid = np.sum((y - h0) ** 2)

View File

@@ -5,7 +5,7 @@ from abc import ABC, abstractmethod
from plot import Plot
from tqdm import tqdm
from learning.data import ConfusionMatrix, Dataset, Data, TargetType
from learning.functions import r_squared
from learning.functions import pearson, r_squared
class MLAlgorithm(ABC):
""" Classe generica per gli algoritmi di Machine Learning """
@@ -83,6 +83,7 @@ class MLAlgorithm(ABC):
print(f"Loss valid : {self.validation_loss():0.5f}")
print(f"Loss test : {self.test_loss():0.5f}")
if self._target_type == TargetType.Regression:
print(f"Pearson : {self.test_pearson():0.5f}")
print(f"R^2 : {self.test_r_squared():0.5f}")
elif self._target_type != TargetType.NoTarget:
conf = self.test_confusion_matrix()
@@ -106,6 +107,11 @@ class MLAlgorithm(ABC):
y = np.argmax(y, axis=1)
return ConfusionMatrix(y, h0)
def test_pearson(self) -> float:
    """Pearson correlation on the test set.

    Returns 0 for any non-regression target type, since correlation
    between predictions and targets is only meaningful for regression.
    """
    if self._target_type == TargetType.Regression:
        return pearson(self._h0(self._testset.x), self._testset.y)
    return 0
def test_r_squared(self) -> float:
if self._target_type != TargetType.Regression:
return 0

View File

@@ -19,11 +19,13 @@ class GradientDescent(MLAlgorithm):
def _learning_step(self) -> float:
x, y, m, _ = self._learnset.as_tuple()
h0 = self._h0(x)
regularization = (self.lambd / m) * self.theta
regularization[0] = 0
derivative = self.alpha * np.mean((self._h0(x) - y) * with_bias(x).T, axis=1)
self.theta -= derivative + regularization
derivative = np.mean((h0 - y) * with_bias(x).T, axis=1)
self.theta -= self.alpha * derivative + regularization
return self._loss(x, y)
def _predict_loss(self, dataset:Data) -> float: