End of ML
- fixes for clustering - fixes in general
This commit is contained in:
30
src/app.py
30
src/app.py
@@ -2,12 +2,14 @@ import numpy as np
|
|||||||
import sklearn
|
import sklearn
|
||||||
import sklearn.cluster
|
import sklearn.cluster
|
||||||
import sklearn.linear_model
|
import sklearn.linear_model
|
||||||
|
import sklearn.metrics
|
||||||
import sklearn.model_selection
|
import sklearn.model_selection
|
||||||
import sklearn.neural_network
|
import sklearn.neural_network
|
||||||
|
|
||||||
from typing import Any
|
from typing import Any
|
||||||
|
from learning.functions import print_metrics
|
||||||
from learning.ml import MLAlgorithm
|
from learning.ml import MLAlgorithm
|
||||||
from learning.data import ConfusionMatrix, Dataset, TargetType
|
from learning.data import Dataset, TargetType
|
||||||
from learning.supervised import LinearRegression, LogisticRegression, MultiLayerPerceptron
|
from learning.supervised import LinearRegression, LogisticRegression, MultiLayerPerceptron
|
||||||
from learning.unsupervised import KMeans
|
from learning.unsupervised import KMeans
|
||||||
|
|
||||||
@@ -68,15 +70,15 @@ def electrical_grid_mlp() -> tuple[Dataset, MLAlgorithm, Any]:
|
|||||||
ds.factorize(["stabf"])
|
ds.factorize(["stabf"])
|
||||||
ds.standardize()
|
ds.standardize()
|
||||||
size = [4, 3]
|
size = [4, 3]
|
||||||
return (ds, MultiLayerPerceptron(ds, size, 0.08), sklearn.neural_network.MLPClassifier(size, 'relu'))
|
return (ds, MultiLayerPerceptron(ds, size, 0.05), sklearn.neural_network.MLPClassifier(size, 'relu'))
|
||||||
|
|
||||||
def frogs() -> tuple[Dataset, MLAlgorithm, Any]:
|
def frogs() -> tuple[Dataset, MLAlgorithm, Any]:
|
||||||
ds = Dataset(CLASSIFICATION + "frogs.csv", "Species", TargetType.MultiClassification)
|
ds = Dataset(CLASSIFICATION + "frogs.csv", "Family", TargetType.MultiClassification)
|
||||||
ds.remove(["Family", "Genus", "RecordID"])
|
ds.remove(["Species", "Genus", "RecordID"])
|
||||||
ds.factorize(["Species"])
|
ds.factorize(["Family"])
|
||||||
ds.standardize()
|
ds.standardize()
|
||||||
size = [18, 15, 12]
|
size = [18, 12, 8]
|
||||||
return (ds, MultiLayerPerceptron(ds, size, 0.047), sklearn.neural_network.MLPClassifier(size, 'relu'))
|
return (ds, MultiLayerPerceptron(ds, size, 0.02), sklearn.neural_network.MLPClassifier(size, 'relu'))
|
||||||
|
|
||||||
def iris() -> tuple[Dataset, MLAlgorithm, Any]:
|
def iris() -> tuple[Dataset, MLAlgorithm, Any]:
|
||||||
ds = Dataset(CLASSIFICATION + "iris.csv", "Class", TargetType.MultiClassification)
|
ds = Dataset(CLASSIFICATION + "iris.csv", "Class", TargetType.MultiClassification)
|
||||||
@@ -90,15 +92,14 @@ def iris() -> tuple[Dataset, MLAlgorithm, Any]:
|
|||||||
# ********************
|
# ********************
|
||||||
|
|
||||||
def frogs_no_target() -> tuple[Dataset, MLAlgorithm, Any]:
|
def frogs_no_target() -> tuple[Dataset, MLAlgorithm, Any]:
|
||||||
ds = Dataset(CLASSIFICATION + "frogs.csv", "Species", TargetType.NoTarget)
|
ds = Dataset(CLASSIFICATION + "frogs.csv", "Family", TargetType.NoTarget)
|
||||||
ds.remove(["Family", "Genus", "RecordID", "Species"])
|
ds.remove(["Family", "Genus", "RecordID", "Species"])
|
||||||
clusters = 10
|
clusters = 4
|
||||||
return (ds, KMeans(ds, clusters), sklearn.cluster.KMeans(clusters))
|
return (ds, KMeans(ds, clusters), sklearn.cluster.KMeans(clusters))
|
||||||
|
|
||||||
def iris_no_target() -> tuple[Dataset, MLAlgorithm, Any]:
|
def iris_no_target() -> tuple[Dataset, MLAlgorithm, Any]:
|
||||||
ds = Dataset(CLASSIFICATION + "iris.csv", "Class", TargetType.NoTarget)
|
ds = Dataset(CLASSIFICATION + "iris.csv", "Class", TargetType.NoTarget)
|
||||||
ds.remove(["Class"])
|
ds.remove(["Class"])
|
||||||
ds.standardize()
|
|
||||||
clusters = 3
|
clusters = 3
|
||||||
return (ds, KMeans(ds, clusters), sklearn.cluster.KMeans(clusters))
|
return (ds, KMeans(ds, clusters), sklearn.cluster.KMeans(clusters))
|
||||||
|
|
||||||
@@ -113,8 +114,8 @@ if __name__ == "__main__":
|
|||||||
#rand = 347617386 # LoR for electrical_grid
|
#rand = 347617386 # LoR for electrical_grid
|
||||||
#rand = 834535453 # LoR for heart
|
#rand = 834535453 # LoR for heart
|
||||||
#rand = 1793295160 # MLP for iris
|
#rand = 1793295160 # MLP for iris
|
||||||
#rand = 2914000170 # MLP for frogs
|
#rand = 629702080 # MLP for frogs
|
||||||
#rand = 885416001 # KMe for frogs_no_target
|
#rand = 1038336550 # KMe for frogs_no_target
|
||||||
|
|
||||||
np.random.seed(rand)
|
np.random.seed(rand)
|
||||||
print(f"Using seed: {rand}")
|
print(f"Using seed: {rand}")
|
||||||
@@ -129,9 +130,6 @@ if __name__ == "__main__":
|
|||||||
sk.set_params(max_iter=epochs)
|
sk.set_params(max_iter=epochs)
|
||||||
sk.fit(learn.x, learn.y)
|
sk.fit(learn.x, learn.y)
|
||||||
print(f"Sklearn : {abs(sk.score(test.x, test.y)):0.5f}")
|
print(f"Sklearn : {abs(sk.score(test.x, test.y)):0.5f}")
|
||||||
if ds.target_type == TargetType.Classification or ds.target_type == TargetType.MultiClassification:
|
print_metrics(ml._target_type, test, sk.predict(test.x))
|
||||||
conf = ConfusionMatrix(test.y, sk.predict(test.x))
|
|
||||||
conf.print()
|
|
||||||
print("========================")
|
|
||||||
|
|
||||||
ml.plot()
|
ml.plot()
|
||||||
|
|||||||
@@ -40,6 +40,7 @@ class Dataset:
|
|||||||
self.target_type = target_type
|
self.target_type = target_type
|
||||||
|
|
||||||
# move target to the start
|
# move target to the start
|
||||||
|
if target_type != TargetType.NoTarget:
|
||||||
col_target = self.data.pop(target)
|
col_target = self.data.pop(target)
|
||||||
self.data.insert(0, target, col_target)
|
self.data.insert(0, target, col_target)
|
||||||
|
|
||||||
@@ -85,17 +86,14 @@ class Dataset:
|
|||||||
|
|
||||||
data = []
|
data = []
|
||||||
for x in splitted:
|
for x in splitted:
|
||||||
total = total_each - x.shape[0]
|
samples = np.random.choice(len(x), size=total_each, replace=True)
|
||||||
data.append(x)
|
data.append(x[samples])
|
||||||
if total > 0:
|
|
||||||
samples = np.random.choice(x, size=total, replace=True)
|
|
||||||
data.append(samples)
|
|
||||||
|
|
||||||
return np.concatenate(data, axis=0)
|
return np.concatenate(data, axis=0)
|
||||||
|
|
||||||
def split_data_target(self, data:np.ndarray) -> Data:
|
def split_dataset_target(self, data:np.ndarray) -> Data:
|
||||||
target = data[:, 0] if self.target_type != TargetType.NoTarget else None
|
target = data[:, 0] if self.target_type != TargetType.NoTarget else None
|
||||||
data = data[:, 1:]
|
data = data[:, 1:] if self.target_type != TargetType.NoTarget else data
|
||||||
if self.target_type == TargetType.MultiClassification:
|
if self.target_type == TargetType.MultiClassification:
|
||||||
target = target.astype(int)
|
target = target.astype(int)
|
||||||
uniques = np.unique(target).shape[0]
|
uniques = np.unique(target).shape[0]
|
||||||
@@ -122,14 +120,15 @@ class Dataset:
|
|||||||
np.random.shuffle(data)
|
np.random.shuffle(data)
|
||||||
learn, valid, test = self.split_dataset(data, valid_frac, test_frac)
|
learn, valid, test = self.split_dataset(data, valid_frac, test_frac)
|
||||||
|
|
||||||
if self.target_type == TargetType.Regression or self.target_type == TargetType.NoTarget:
|
if self.target_type == TargetType.Classification\
|
||||||
|
or self.target_type == TargetType.MultiClassification:
|
||||||
learn = self.prepare_classification(learn)
|
learn = self.prepare_classification(learn)
|
||||||
valid = self.prepare_classification(valid)
|
valid = self.prepare_classification(valid)
|
||||||
test = self.prepare_classification(test)
|
test = self.prepare_classification(test)
|
||||||
|
|
||||||
learn = self.split_data_target(learn)
|
learn = self.split_dataset_target(learn)
|
||||||
valid = self.split_data_target(valid)
|
valid = self.split_dataset_target(valid)
|
||||||
test = self.split_data_target(test)
|
test = self.split_dataset_target(test)
|
||||||
return (learn, valid, test)
|
return (learn, valid, test)
|
||||||
except:
|
except:
|
||||||
if max_iter == 0:
|
if max_iter == 0:
|
||||||
|
|||||||
@@ -1,5 +1,8 @@
|
|||||||
import numpy as np
|
import numpy as np
|
||||||
|
|
||||||
|
from learning.data import ConfusionMatrix, Data, Dataset, TargetType
|
||||||
|
from sklearn.metrics import silhouette_score, r2_score
|
||||||
|
|
||||||
NOT_ZERO = 1e-15
|
NOT_ZERO = 1e-15
|
||||||
LEAKY_RELU = 0.2
|
LEAKY_RELU = 0.2
|
||||||
|
|
||||||
@@ -45,20 +48,34 @@ def cross_entropy_loss(h0:np.ndarray, y:np.ndarray) -> float:
|
|||||||
# Randoms
|
# Randoms
|
||||||
# **********
|
# **********
|
||||||
|
|
||||||
def pearson(h0:np.ndarray, y:np.ndarray) -> float:
|
|
||||||
diff1 = h0 - h0.mean()
|
|
||||||
diff2 = y - y.mean()
|
|
||||||
num = np.sum(diff1 * diff2)
|
|
||||||
den = np.sqrt(np.sum(diff1**2)) * np.sqrt(np.sum(diff2**2))
|
|
||||||
return num / den
|
|
||||||
|
|
||||||
def r_squared(h0:np.ndarray, y:np.ndarray) -> float:
|
|
||||||
y_mean = np.mean(y)
|
|
||||||
ss_resid = np.sum((y - h0) ** 2)
|
|
||||||
ss_total = np.sum((y - y_mean) ** 2)
|
|
||||||
return 1 - (ss_resid / ss_total)
|
|
||||||
|
|
||||||
def with_bias(x:np.ndarray) -> np.ndarray:
|
def with_bias(x:np.ndarray) -> np.ndarray:
|
||||||
shape = (x.shape[0], 1) if len(x.shape) != 1 else (1,)
|
shape = (x.shape[0], 1) if len(x.shape) != 1 else (1,)
|
||||||
ones = np.ones(shape)
|
ones = np.ones(shape)
|
||||||
return np.hstack([ones, x])
|
return np.hstack([ones, x])
|
||||||
|
|
||||||
|
def print_metrics(target:TargetType, dataset:Data, h0:np.ndarray) -> None:
|
||||||
|
if target == TargetType.Regression:
|
||||||
|
print(f"R^2 : {r2_score(dataset.y, h0):0.5f}")
|
||||||
|
print(f"Pearson : {np.corrcoef(dataset.y, h0)[0, 1]:0.5f}")
|
||||||
|
elif target != TargetType.NoTarget:
|
||||||
|
if h0.ndim == 1: h0 = np.where(h0 > 0.5, 1, 0)
|
||||||
|
ConfusionMatrix(dataset.y, h0).print()
|
||||||
|
else:
|
||||||
|
print(f"Silhouette : {silhouette_score(dataset.x, h0):0.5f}")
|
||||||
|
print("========================")
|
||||||
|
|
||||||
|
def print_silhouette_weka(ds:Dataset, file_weka:str):
|
||||||
|
test, _, _, _ = ds.get_dataset()[2].as_tuple()
|
||||||
|
test = np.round(test, 6)
|
||||||
|
|
||||||
|
weka = Dataset(file_weka, "", TargetType.NoTarget)
|
||||||
|
weka.factorize(["cluster"])
|
||||||
|
|
||||||
|
weka, _, _, _ = weka.get_dataset(test_frac=0, valid_frac=0)[0].as_tuple()
|
||||||
|
weka_x, weka_y = weka[:, :-1], weka[:, -1:]
|
||||||
|
|
||||||
|
bau = [np.where((weka_x == x).all(axis=1))[0][0] for x in test]
|
||||||
|
weka_x, weka_y = weka_x[bau], weka_y[bau].ravel()
|
||||||
|
|
||||||
|
score = silhouette_score(weka_x, weka_y)
|
||||||
|
print(score)
|
||||||
|
|||||||
@@ -2,10 +2,11 @@ import sys
|
|||||||
import numpy as np
|
import numpy as np
|
||||||
|
|
||||||
from abc import ABC, abstractmethod
|
from abc import ABC, abstractmethod
|
||||||
|
|
||||||
from plot import Plot
|
from plot import Plot
|
||||||
from tqdm import tqdm
|
from tqdm import tqdm
|
||||||
from learning.data import ConfusionMatrix, Dataset, Data, TargetType
|
from learning.data import Dataset, Data, TargetType
|
||||||
from learning.functions import pearson, r_squared
|
from learning.functions import print_metrics
|
||||||
|
|
||||||
class MLAlgorithm(ABC):
|
class MLAlgorithm(ABC):
|
||||||
""" Classe generica per gli algoritmi di Machine Learning """
|
""" Classe generica per gli algoritmi di Machine Learning """
|
||||||
@@ -84,36 +85,7 @@ class MLAlgorithm(ABC):
|
|||||||
print(f"Loss valid : {self.validation_loss():0.5f}")
|
print(f"Loss valid : {self.validation_loss():0.5f}")
|
||||||
print(f"Loss test : {self.test_loss():0.5f}")
|
print(f"Loss test : {self.test_loss():0.5f}")
|
||||||
print("========================")
|
print("========================")
|
||||||
if self._target_type == TargetType.Regression:
|
print_metrics(self._target_type, self._testset, self._h0(self._testset.x))
|
||||||
print(f"Pearson : {self.test_pearson():0.5f}")
|
|
||||||
print(f"R^2 : {self.test_r_squared():0.5f}")
|
|
||||||
print("========================")
|
|
||||||
elif self._target_type != TargetType.NoTarget:
|
|
||||||
conf = self.test_confusion_matrix()
|
|
||||||
conf.print()
|
|
||||||
print("========================")
|
|
||||||
|
|
||||||
def test_confusion_matrix(self) -> ConfusionMatrix:
|
|
||||||
if self._target_type != TargetType.Classification\
|
|
||||||
and self._target_type != TargetType.MultiClassification:
|
|
||||||
return None
|
|
||||||
|
|
||||||
h0 = self._h0(self._testset.x)
|
|
||||||
y = self._testset.y
|
|
||||||
if h0.ndim == 1:
|
|
||||||
h0 = np.where(h0 > 0.5, 1, 0)
|
|
||||||
|
|
||||||
return ConfusionMatrix(y, h0)
|
|
||||||
|
|
||||||
def test_pearson(self) -> float:
|
|
||||||
if self._target_type != TargetType.Regression:
|
|
||||||
return 0
|
|
||||||
return pearson(self._h0(self._testset.x), self._testset.y)
|
|
||||||
|
|
||||||
def test_r_squared(self) -> float:
|
|
||||||
if self._target_type != TargetType.Regression:
|
|
||||||
return 0
|
|
||||||
return r_squared(self._h0(self._testset.x), self._testset.y)
|
|
||||||
|
|
||||||
@abstractmethod
|
@abstractmethod
|
||||||
def _h0(self, x:np.ndarray) -> np.ndarray: pass
|
def _h0(self, x:np.ndarray) -> np.ndarray: pass
|
||||||
|
|||||||
@@ -16,6 +16,17 @@ class KMeans(MLAlgorithm):
|
|||||||
distances = np.linalg.norm(diff, axis=2)
|
distances = np.linalg.norm(diff, axis=2)
|
||||||
return np.argmin(distances, axis=1)
|
return np.argmin(distances, axis=1)
|
||||||
|
|
||||||
|
def _predict_loss(self, dataset:Data) -> float:
|
||||||
|
assignments = self._h0(dataset.x)
|
||||||
|
loss = 0.0
|
||||||
|
|
||||||
|
for k in range(self.total):
|
||||||
|
assigned_points = dataset.x[assignments == k]
|
||||||
|
if len(assigned_points) > 0:
|
||||||
|
diff = assigned_points - self.centroids[k]
|
||||||
|
loss += np.sum(np.linalg.norm(diff, axis=1) ** 2)
|
||||||
|
return loss
|
||||||
|
|
||||||
def _learning_step(self) -> float:
|
def _learning_step(self) -> float:
|
||||||
assignments = self._h0(self._learnset.x)
|
assignments = self._h0(self._learnset.x)
|
||||||
centroids = []
|
centroids = []
|
||||||
@@ -32,16 +43,6 @@ class KMeans(MLAlgorithm):
|
|||||||
self.centroids = np.array(centroids)
|
self.centroids = np.array(centroids)
|
||||||
return self._predict_loss(self._learnset)
|
return self._predict_loss(self._learnset)
|
||||||
|
|
||||||
def _predict_loss(self, dataset:Data) -> float:
|
|
||||||
assignments = self._h0(dataset.x)
|
|
||||||
loss = 0.0
|
|
||||||
|
|
||||||
for k in range(self.total):
|
|
||||||
assigned_points = dataset.x[assignments == k]
|
|
||||||
if len(assigned_points) > 0:
|
|
||||||
diff = assigned_points - self.centroids[k]
|
|
||||||
loss += np.sum(np.linalg.norm(diff, axis=1) ** 2)
|
|
||||||
return loss
|
|
||||||
|
|
||||||
def _get_parameters(self):
|
def _get_parameters(self):
|
||||||
return self.centroids.copy()
|
return self.centroids.copy()
|
||||||
|
|||||||
Reference in New Issue
Block a user