KMeans
- Implemented KMeans
- Fixed non-seeded RNG
- Fixed display exception with NoTarget target type
- Added basic test cases to the app
This commit is contained in:
32
src/app.py
32
src/app.py
@@ -2,12 +2,14 @@ import random
|
|||||||
from typing import Any
|
from typing import Any
|
||||||
import numpy as np
|
import numpy as np
|
||||||
import sklearn
|
import sklearn
|
||||||
|
import sklearn.cluster
|
||||||
import sklearn.linear_model
|
import sklearn.linear_model
|
||||||
import sklearn.model_selection
|
import sklearn.model_selection
|
||||||
import sklearn.neural_network
|
import sklearn.neural_network
|
||||||
from learning.data import Dataset, TargetType
|
from learning.data import Dataset, TargetType
|
||||||
from learning.supervised import LinearRegression, LogisticRegression, MultiLayerPerceptron
|
from learning.supervised import LinearRegression, LogisticRegression, MultiLayerPerceptron
|
||||||
from learning.ml import MLAlgorithm
|
from learning.ml import MLAlgorithm
|
||||||
|
from learning.unsupervised import KMeans
|
||||||
|
|
||||||
DATASET = "datasets/"
|
DATASET = "datasets/"
|
||||||
REGRESSION = DATASET + "regression/"
|
REGRESSION = DATASET + "regression/"
|
||||||
@@ -75,6 +77,23 @@ def iris() -> tuple[Dataset, MLAlgorithm, Any]:
|
|||||||
size = [4, 3]
|
size = [4, 3]
|
||||||
return (ds, MultiLayerPerceptron(ds, size), sklearn.neural_network.MLPClassifier(size, 'relu'))
|
return (ds, MultiLayerPerceptron(ds, size), sklearn.neural_network.MLPClassifier(size, 'relu'))
|
||||||
|
|
||||||
|
# ********************
|
||||||
|
# MultiLayerPerceptron
|
||||||
|
# ********************
|
||||||
|
|
||||||
|
def frogs_no_target() -> tuple[Dataset, MLAlgorithm, Any]:
    """Frogs (anuran calls) dataset prepared for unsupervised clustering.

    Loads the CSV with NoTarget semantics, strips the label/metadata
    columns, and returns the dataset together with our KMeans and the
    sklearn reference implementation, both configured for 10 clusters.
    """
    dataset = Dataset(CLASSIFICATION + "frogs.csv", "Species", TargetType.NoTarget)
    dataset.remove(["Family", "Genus", "RecordID", "Species"])
    n_clusters = 10
    return (dataset, KMeans(dataset, n_clusters), sklearn.cluster.KMeans(n_clusters))
|
||||||
|
|
||||||
|
def iris_no_target() -> tuple[Dataset, MLAlgorithm, Any]:
    """Iris dataset with its class column dropped, for clustering.

    Normalizes the features (KMeans centroids start in the unit cube)
    and returns the dataset with our KMeans and the sklearn reference
    implementation, both configured for 3 clusters.
    """
    dataset = Dataset(CLASSIFICATION + "iris.csv", "Class", TargetType.NoTarget)
    dataset.remove(["Class"])
    dataset.normalize()
    n_clusters = 3
    return (dataset, KMeans(dataset, n_clusters), sklearn.cluster.KMeans(n_clusters))
|
||||||
|
|
||||||
# ********************
|
# ********************
|
||||||
# Main & random
|
# Main & random
|
||||||
# ********************
|
# ********************
|
||||||
@@ -82,17 +101,24 @@ def iris() -> tuple[Dataset, MLAlgorithm, Any]:
|
|||||||
if __name__ == "__main__":
|
if __name__ == "__main__":
|
||||||
np.set_printoptions(linewidth=np.inf, formatter={'float': '{:>10.5f}'.format})
|
np.set_printoptions(linewidth=np.inf, formatter={'float': '{:>10.5f}'.format})
|
||||||
rand = random.randint(0, 4294967295)
|
rand = random.randint(0, 4294967295)
|
||||||
|
#rand = 1997847910 # LiR for power_plant
|
||||||
|
#rand = 347617386 # LoR for electrical_grid
|
||||||
|
#rand = 1793295160 # MLP for iris
|
||||||
|
#rand = 885416001 # KMe for frogs_no_target
|
||||||
|
|
||||||
np.random.seed(rand)
|
np.random.seed(rand)
|
||||||
print(f"Using seed: {rand}")
|
print(f"Using seed: {rand}")
|
||||||
|
|
||||||
ds, ml, sk = electrical_grid()
|
ds, ml, sk = iris()
|
||||||
ml.learn(10000, verbose=True)
|
|
||||||
|
epochs, _, _ = ml.learn(1000, verbose=True)
|
||||||
ml.display_results()
|
ml.display_results()
|
||||||
|
|
||||||
np.random.seed(rand)
|
np.random.seed(rand)
|
||||||
learn, test, valid = ds.get_dataset()
|
learn, test, valid = ds.get_dataset()
|
||||||
|
sk.set_params(max_iter=epochs)
|
||||||
sk.fit(learn.x, learn.y)
|
sk.fit(learn.x, learn.y)
|
||||||
print(f"Sklearn : {sk.score(test.x, test.y):0.5f}")
|
print(f"Sklearn : {abs(sk.score(test.x, test.y)):0.5f}")
|
||||||
print("========================")
|
print("========================")
|
||||||
|
|
||||||
ml.plot()
|
ml.plot()
|
||||||
|
|||||||
@@ -83,7 +83,8 @@ class Dataset:
|
|||||||
splitted = [data[ data[:,0] == k ] for k in classes ]
|
splitted = [data[ data[:,0] == k ] for k in classes ]
|
||||||
total_each = np.average([len(x) for x in splitted]).astype(int)
|
total_each = np.average([len(x) for x in splitted]).astype(int)
|
||||||
|
|
||||||
rng = np.random.default_rng()
|
seed = np.random.randint(0, 4294967295)
|
||||||
|
rng = np.random.default_rng(seed)
|
||||||
data = []
|
data = []
|
||||||
for x in splitted:
|
for x in splitted:
|
||||||
samples = rng.choice(x, size=total_each, replace=True, shuffle=False)
|
samples = rng.choice(x, size=total_each, replace=True, shuffle=False)
|
||||||
|
|||||||
@@ -79,7 +79,7 @@ class MLAlgorithm(ABC):
|
|||||||
print(f"Loss test : {self.test_loss():0.5f}")
|
print(f"Loss test : {self.test_loss():0.5f}")
|
||||||
if self._target_type == TargetType.Regression:
|
if self._target_type == TargetType.Regression:
|
||||||
print(f"R^2 : {self.test_r_squared():0.5f}")
|
print(f"R^2 : {self.test_r_squared():0.5f}")
|
||||||
else:
|
elif self._target_type != TargetType.NoTarget:
|
||||||
conf = self.test_confusion_matrix()
|
conf = self.test_confusion_matrix()
|
||||||
print(f"Accuracy : {conf.accuracy():0.5f} - classes {conf.accuracy_per_class()}")
|
print(f"Accuracy : {conf.accuracy():0.5f} - classes {conf.accuracy_per_class()}")
|
||||||
print(f"Precision : {conf.precision():0.5f} - classes {conf.precision_per_class()}")
|
print(f"Precision : {conf.precision():0.5f} - classes {conf.precision_per_class()}")
|
||||||
|
|||||||
51
src/learning/unsupervised.py
Normal file
51
src/learning/unsupervised.py
Normal file
@@ -0,0 +1,51 @@
|
|||||||
|
import math as math
|
||||||
|
import numpy as np
|
||||||
|
|
||||||
|
from abc import abstractmethod
|
||||||
|
from learning.ml import MLAlgorithm
|
||||||
|
from learning.data import Dataset, Data
|
||||||
|
NOT_ZERO = 1e-15
|
||||||
|
|
||||||
|
class KMeans(MLAlgorithm):
    """K-means (Lloyd's algorithm) clustering.

    Partitions the learn set into at most ``clusters`` groups by repeatedly
    assigning each point to its nearest centroid and moving each centroid to
    the mean of its assigned points. Empty clusters are dropped, so the live
    cluster count ``self.total`` can shrink across learning steps.
    """

    def __init__(self, dataset: Dataset, clusters: int) -> None:
        """Create the model with centroids drawn uniformly from [0, 1)^d.

        NOTE(review): uniform unit-cube starts assume normalized features —
        confirm callers normalize, or seed centroids from sampled data points.
        """
        super().__init__(dataset)
        dimensions = self._learnset.x.shape[1]
        # Number of currently live clusters; may decrease when a cluster empties.
        self.total = clusters
        self.centroids = np.random.rand(clusters, dimensions)

    def _h0(self, x: np.ndarray) -> np.ndarray:
        """Return, for each row of ``x``, the index of its nearest centroid."""
        diff = x[:, np.newaxis] - self.centroids      # shape (n, k, d)
        distances = np.linalg.norm(diff, axis=2)      # shape (n, k)
        return np.argmin(distances, axis=1)

    def _learning_step(self) -> float:
        """Run one Lloyd iteration and return the learn-set loss.

        Reassigns every learn-set point, recomputes each centroid as the mean
        of its assigned points, and discards clusters that received none
        (keeping ``self.total`` in sync with the centroid array).
        """
        assignments = self._h0(self._learnset.x)
        centroids = []
        for k in range(self.total):
            assigned_points = self._learnset.x[assignments == k]
            if len(assigned_points) > 0:
                centroids.append(np.mean(assigned_points, axis=0))
        self.centroids = np.array(centroids)
        # Sync the live count once, instead of decrementing mid-loop.
        self.total = len(centroids)
        return self._predict_loss(self._learnset)

    def _predict_loss(self, dataset: Data) -> float:
        """Within-cluster sum of squared distances (inertia) for ``dataset``."""
        assignments = self._h0(dataset.x)
        loss = 0.0
        for k in range(self.total):
            assigned_points = dataset.x[assignments == k]
            if len(assigned_points) > 0:
                diff = assigned_points - self.centroids[k]
                loss += np.sum(np.linalg.norm(diff, axis=1) ** 2)
        return loss

    def _get_parameters(self) -> np.ndarray:
        """Snapshot of the current centroids (defensive copy)."""
        return self.centroids.copy()

    def _set_parameters(self, parameters: np.ndarray) -> None:
        """Restore centroids saved by ``_get_parameters``.

        Bug fix: also resync ``self.total``. Clusters may have been dropped
        since the snapshot was taken; with a stale (smaller) count the loops
        in ``_learning_step``/``_predict_loss`` would silently ignore the
        trailing restored centroids. Copy for symmetry with the getter, so
        later learning steps cannot mutate the caller's saved array.
        """
        self.centroids = parameters.copy()
        self.total = len(self.centroids)
|
||||||
Reference in New Issue
Block a user