End of ML

- fixes for clustering
- fixes in general
2024-08-21 19:45:42 +02:00
parent bba07c0b49
commit 9ea43beace
5 changed files with 71 additions and 84 deletions

View File (main script)

@@ -2,12 +2,14 @@ import numpy as np
 import sklearn
 import sklearn.cluster
 import sklearn.linear_model
+import sklearn.metrics
 import sklearn.model_selection
 import sklearn.neural_network
 from typing import Any
+from learning.functions import print_metrics
 from learning.ml import MLAlgorithm
-from learning.data import ConfusionMatrix, Dataset, TargetType
+from learning.data import Dataset, TargetType
 from learning.supervised import LinearRegression, LogisticRegression, MultiLayerPerceptron
 from learning.unsupervised import KMeans
@@ -68,15 +70,15 @@ def electrical_grid_mlp() -> tuple[Dataset, MLAlgorithm, Any]:
ds.factorize(["stabf"]) ds.factorize(["stabf"])
ds.standardize() ds.standardize()
size = [4, 3] size = [4, 3]
return (ds, MultiLayerPerceptron(ds, size, 0.08), sklearn.neural_network.MLPClassifier(size, 'relu')) return (ds, MultiLayerPerceptron(ds, size, 0.05), sklearn.neural_network.MLPClassifier(size, 'relu'))
def frogs() -> tuple[Dataset, MLAlgorithm, Any]: def frogs() -> tuple[Dataset, MLAlgorithm, Any]:
ds = Dataset(CLASSIFICATION + "frogs.csv", "Species", TargetType.MultiClassification) ds = Dataset(CLASSIFICATION + "frogs.csv", "Family", TargetType.MultiClassification)
ds.remove(["Family", "Genus", "RecordID"]) ds.remove(["Species", "Genus", "RecordID"])
ds.factorize(["Species"]) ds.factorize(["Family"])
ds.standardize() ds.standardize()
size = [18, 15, 12] size = [18, 12, 8]
return (ds, MultiLayerPerceptron(ds, size, 0.047), sklearn.neural_network.MLPClassifier(size, 'relu')) return (ds, MultiLayerPerceptron(ds, size, 0.02), sklearn.neural_network.MLPClassifier(size, 'relu'))
def iris() -> tuple[Dataset, MLAlgorithm, Any]: def iris() -> tuple[Dataset, MLAlgorithm, Any]:
ds = Dataset(CLASSIFICATION + "iris.csv", "Class", TargetType.MultiClassification) ds = Dataset(CLASSIFICATION + "iris.csv", "Class", TargetType.MultiClassification)
@@ -90,15 +92,14 @@ def iris() -> tuple[Dataset, MLAlgorithm, Any]:
 # ********************
 
 def frogs_no_target() -> tuple[Dataset, MLAlgorithm, Any]:
-    ds = Dataset(CLASSIFICATION + "frogs.csv", "Species", TargetType.NoTarget)
+    ds = Dataset(CLASSIFICATION + "frogs.csv", "Family", TargetType.NoTarget)
     ds.remove(["Family", "Genus", "RecordID", "Species"])
-    clusters = 10
+    clusters = 4
     return (ds, KMeans(ds, clusters), sklearn.cluster.KMeans(clusters))
 
 def iris_no_target() -> tuple[Dataset, MLAlgorithm, Any]:
     ds = Dataset(CLASSIFICATION + "iris.csv", "Class", TargetType.NoTarget)
     ds.remove(["Class"])
-    ds.standardize()
     clusters = 3
     return (ds, KMeans(ds, clusters), sklearn.cluster.KMeans(clusters))
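
Note: the drop from clusters = 10 to clusters = 4 tracks the switch from Species to Family as the reference label; the Anuran Calls (frogs) dataset has 4 families, 8 genera, and 10 species. A silhouette sweep is one way to sanity-check a cluster count. A minimal sketch, using only sklearn on synthetic stand-in data rather than the repo's CSV:

import numpy as np
import sklearn.cluster
import sklearn.datasets
import sklearn.metrics

# synthetic blobs standing in for the frogs features
x, _ = sklearn.datasets.make_blobs(n_samples=300, centers=4, random_state=0)
for k in range(2, 8):
    labels = sklearn.cluster.KMeans(k, n_init=10, random_state=0).fit_predict(x)
    # the k with the highest silhouette score is the most separable choice
    print(k, sklearn.metrics.silhouette_score(x, labels))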
@@ -113,8 +114,8 @@ if __name__ == "__main__":
     #rand = 347617386 # LoR for electrical_grid
     #rand = 834535453 # LoR for heart
     #rand = 1793295160 # MLP for iris
-    #rand = 2914000170 # MLP for frogs
-    #rand = 885416001 # KMe for frogs_no_target
+    #rand = 629702080 # MLP for frogs
+    #rand = 1038336550 # KMe for frogs_no_target
 
     np.random.seed(rand)
     print(f"Using seed: {rand}")
@@ -129,9 +130,6 @@ if __name__ == "__main__":
     sk.set_params(max_iter=epochs)
     sk.fit(learn.x, learn.y)
     print(f"Sklearn : {abs(sk.score(test.x, test.y)):0.5f}")
-    if ds.target_type == TargetType.Classification or ds.target_type == TargetType.MultiClassification:
-        conf = ConfusionMatrix(test.y, sk.predict(test.x))
-        conf.print()
-    print("========================")
+    print_metrics(ml._target_type, test, sk.predict(test.x))
 
     ml.plot()

View File: learning/data.py

@@ -40,8 +40,9 @@ class Dataset:
         self.target_type = target_type
 
         # move target to the start
-        col_target = self.data.pop(target)
-        self.data.insert(0, target, col_target)
+        if target_type != TargetType.NoTarget:
+            col_target = self.data.pop(target)
+            self.data.insert(0, target, col_target)
 
     def remove(self, columns:list[str]) -> Self:
         for col in columns:
@@ -85,17 +86,14 @@ class Dataset:
         data = []
         for x in splitted:
-            total = total_each - x.shape[0]
-            data.append(x)
-            if total > 0:
-                samples = np.random.choice(x, size=total, replace=True)
-                data.append(samples)
+            samples = np.random.choice(len(x), size=total_each, replace=True)
+            data.append(x[samples])
         return np.concatenate(data, axis=0)
 
-    def split_data_target(self, data:np.ndarray) -> Data:
+    def split_dataset_target(self, data:np.ndarray) -> Data:
         target = data[:, 0] if self.target_type != TargetType.NoTarget else None
-        data = data[:, 1:]
+        data = data[:, 1:] if self.target_type != TargetType.NoTarget else data
         if self.target_type == TargetType.MultiClassification:
             target = target.astype(int)
             uniques = np.unique(target).shape[0]
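
Note: the rewritten loop resamples every class to exactly total_each rows by drawing row indices. It also sidesteps a latent bug: np.random.choice on a 2-D array raises ValueError (it needs a 1-D array or an int), so the old top-up branch could not have worked on matrix data. A self-contained sketch of the new behavior, with made-up class arrays and sizes:

import numpy as np

np.random.seed(0)
splitted = [np.random.rand(3, 2), np.random.rand(7, 2)]  # two classes, unbalanced
total_each = 5
data = []
for x in splitted:
    samples = np.random.choice(len(x), size=total_each, replace=True)  # row indices
    data.append(x[samples])
print(np.concatenate(data, axis=0).shape)  # (10, 2): each class contributes 5 rows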
@@ -122,14 +120,15 @@ class Dataset:
             np.random.shuffle(data)
             learn, valid, test = self.split_dataset(data, valid_frac, test_frac)
 
-            if self.target_type == TargetType.Regression or self.target_type == TargetType.NoTarget:
+            if self.target_type == TargetType.Classification\
+                or self.target_type == TargetType.MultiClassification:
                 learn = self.prepare_classification(learn)
                 valid = self.prepare_classification(valid)
                 test = self.prepare_classification(test)
 
-            learn = self.split_data_target(learn)
-            valid = self.split_data_target(valid)
-            test = self.split_data_target(test)
+            learn = self.split_dataset_target(learn)
+            valid = self.split_dataset_target(valid)
+            test = self.split_dataset_target(test)
             return (learn, valid, test)
         except:
             if max_iter == 0:
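
Note: the old guard applied prepare_classification to Regression and NoTarget splits, exactly the targets that have no classes to rebalance, while classification data went through unbalanced. Inverting the condition appears to be the core of the "fixes in general" named in the commit message.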

View File: learning/functions.py

@@ -1,5 +1,8 @@
 import numpy as np
+from learning.data import ConfusionMatrix, Data, Dataset, TargetType
+from sklearn.metrics import silhouette_score, r2_score
 
 NOT_ZERO = 1e-15
 LEAKY_RELU = 0.2
@@ -45,20 +48,34 @@ def cross_entropy_loss(h0:np.ndarray, y:np.ndarray) -> float:
 # Randoms
 # **********
 
-def pearson(h0:np.ndarray, y:np.ndarray) -> float:
-    diff1 = h0 - h0.mean()
-    diff2 = y - y.mean()
-    num = np.sum(diff1 * diff2)
-    den = np.sqrt(np.sum(diff1**2)) * np.sqrt(np.sum(diff2**2))
-    return num / den
-
-def r_squared(h0:np.ndarray, y:np.ndarray) -> float:
-    y_mean = np.mean(y)
-    ss_resid = np.sum((y - h0) ** 2)
-    ss_total = np.sum((y - y_mean) ** 2)
-    return 1 - (ss_resid / ss_total)
-
 def with_bias(x:np.ndarray) -> np.ndarray:
     shape = (x.shape[0], 1) if len(x.shape) != 1 else (1,)
     ones = np.ones(shape)
     return np.hstack([ones, x])
+
+def print_metrics(target:TargetType, dataset:Data, h0:np.ndarray) -> None:
+    if target == TargetType.Regression:
+        print(f"R^2 : {r2_score(dataset.y, h0):0.5f}")
+        print(f"Pearson : {np.corrcoef(dataset.y, h0)[0, 1]:0.5f}")
+    elif target != TargetType.NoTarget:
+        if h0.ndim == 1: h0 = np.where(h0 > 0.5, 1, 0)
+        ConfusionMatrix(dataset.y, h0).print()
+    else:
+        print(f"Silhouette : {silhouette_score(dataset.x, h0):0.5f}")
+    print("========================")
+
+def print_silhouette_weka(ds:Dataset, file_weka:str):
+    test, _, _, _ = ds.get_dataset()[2].as_tuple()
+    test = np.round(test, 6)
+    weka = Dataset(file_weka, "", TargetType.NoTarget)
+    weka.factorize(["cluster"])
+    weka, _, _, _ = weka.get_dataset(test_frac=0, valid_frac=0)[0].as_tuple()
+    weka_x, weka_y = weka[:, :-1], weka[:, -1:]
+    bau = [np.where((weka_x == x).all(axis=1))[0][0] for x in test]
+    weka_x, weka_y = weka_x[bau], weka_y[bau].ravel()
+    score = silhouette_score(weka_x, weka_y)
+    print(score)
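
Note: print_metrics delegates to library implementations in place of the hand-rolled pearson and r_squared removed from this file; np.corrcoef and sklearn.metrics.r2_score compute the same quantities. print_silhouette_weka aligns a Weka cluster-assignment export with the local test split by matching rounded feature rows, then scores the alignment. A quick equivalence check for the metric swap, on synthetic data rather than anything from the repo:

import numpy as np
from sklearn.metrics import r2_score

rng = np.random.default_rng(0)
y = rng.normal(size=100)
h0 = y + rng.normal(scale=0.1, size=100)  # stand-in predictions

# Pearson, as the removed helper computed it, vs. np.corrcoef
d1, d2 = h0 - h0.mean(), y - y.mean()
pearson = np.sum(d1 * d2) / (np.sqrt(np.sum(d1**2)) * np.sqrt(np.sum(d2**2)))
assert np.isclose(pearson, np.corrcoef(y, h0)[0, 1])

# R^2, as the removed helper computed it, vs. sklearn
r2 = 1 - np.sum((y - h0)**2) / np.sum((y - y.mean())**2)
assert np.isclose(r2, r2_score(y, h0))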

View File: learning/ml.py

@@ -2,10 +2,11 @@ import sys
 import numpy as np
 from abc import ABC, abstractmethod
 from plot import Plot
 from tqdm import tqdm
-from learning.data import ConfusionMatrix, Dataset, Data, TargetType
-from learning.functions import pearson, r_squared
+from learning.data import Dataset, Data, TargetType
+from learning.functions import print_metrics
 
 class MLAlgorithm(ABC):
     """ Generic base class for Machine Learning algorithms """
@@ -84,36 +85,7 @@ class MLAlgorithm(ABC):
print(f"Loss valid : {self.validation_loss():0.5f}") print(f"Loss valid : {self.validation_loss():0.5f}")
print(f"Loss test : {self.test_loss():0.5f}") print(f"Loss test : {self.test_loss():0.5f}")
print("========================") print("========================")
if self._target_type == TargetType.Regression: print_metrics(self._target_type, self._testset, self._h0(self._testset.x))
print(f"Pearson : {self.test_pearson():0.5f}")
print(f"R^2 : {self.test_r_squared():0.5f}")
print("========================")
elif self._target_type != TargetType.NoTarget:
conf = self.test_confusion_matrix()
conf.print()
print("========================")
def test_confusion_matrix(self) -> ConfusionMatrix:
if self._target_type != TargetType.Classification\
and self._target_type != TargetType.MultiClassification:
return None
h0 = self._h0(self._testset.x)
y = self._testset.y
if h0.ndim == 1:
h0 = np.where(h0 > 0.5, 1, 0)
return ConfusionMatrix(y, h0)
def test_pearson(self) -> float:
if self._target_type != TargetType.Regression:
return 0
return pearson(self._h0(self._testset.x), self._testset.y)
def test_r_squared(self) -> float:
if self._target_type != TargetType.Regression:
return 0
return r_squared(self._h0(self._testset.x), self._testset.y)
@abstractmethod @abstractmethod
def _h0(self, x:np.ndarray) -> np.ndarray: pass def _h0(self, x:np.ndarray) -> np.ndarray: pass

View File: learning/unsupervised.py

@@ -16,6 +16,17 @@ class KMeans(MLAlgorithm):
         distances = np.linalg.norm(diff, axis=2)
         return np.argmin(distances, axis=1)
 
+    def _predict_loss(self, dataset:Data) -> float:
+        assignments = self._h0(dataset.x)
+        loss = 0.0
+        for k in range(self.total):
+            assigned_points = dataset.x[assignments == k]
+            if len(assigned_points) > 0:
+                diff = assigned_points - self.centroids[k]
+                loss += np.sum(np.linalg.norm(diff, axis=1) ** 2)
+        return loss
+
     def _learning_step(self) -> float:
         assignments = self._h0(self._learnset.x)
         centroids = []
@@ -32,16 +43,6 @@ class KMeans(MLAlgorithm):
         self.centroids = np.array(centroids)
         return self._predict_loss(self._learnset)
 
-    def _predict_loss(self, dataset:Data) -> float:
-        assignments = self._h0(dataset.x)
-        loss = 0.0
-        for k in range(self.total):
-            assigned_points = dataset.x[assignments == k]
-            if len(assigned_points) > 0:
-                diff = assigned_points - self.centroids[k]
-                loss += np.sum(np.linalg.norm(diff, axis=1) ** 2)
-        return loss
-
     def _get_parameters(self):
         return self.centroids.copy()
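
Note: _predict_loss moves above _learning_step with its body unchanged. The loss it computes is the within-cluster sum of squared distances, the same quantity sklearn's KMeans exposes as inertia_. A standalone check on synthetic data (all names are local to this sketch):

import numpy as np
import sklearn.cluster

rng = np.random.default_rng(0)
x = rng.normal(size=(200, 2))
sk = sklearn.cluster.KMeans(3, n_init=10, random_state=0).fit(x)
# within-cluster sum of squared distances, as _predict_loss computes it
loss = sum(np.sum(np.linalg.norm(x[sk.labels_ == k] - c, axis=1) ** 2)
           for k, c in enumerate(sk.cluster_centers_))
assert np.isclose(loss, sk.inertia_)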