- implemented KMeans - fixed non seeded rng - fixed display exception with NoTargets - added basic test cases to app
200 lines
6.5 KiB
Python
200 lines
6.5 KiB
Python
import pandas as pd
|
|
import numpy as np
|
|
|
|
from enum import Enum
|
|
from typing_extensions import Self
|
|
|
|
class TargetType(Enum):
|
|
Regression = 1
|
|
Classification = 2
|
|
MultiClassification = 3
|
|
NoTarget = 4
|
|
|
|
class Data:
|
|
x: np.ndarray
|
|
y: np.ndarray
|
|
size: int
|
|
param: int
|
|
|
|
def __init__(self, x:np.ndarray, y:np.ndarray) -> None:
|
|
self.x = x
|
|
self.y = y
|
|
self.size = x.shape[0]
|
|
self.param = x.shape[1]
|
|
def __str__(self) -> str:
|
|
return "X: " + str(self.x) + "\nY: " + str(self.y)
|
|
def as_tuple(self) -> tuple[np.ndarray, np.ndarray, int, int]:
|
|
return (self.x, self.y, self.size, self.param)
|
|
|
|
class Dataset:
|
|
data: pd.DataFrame
|
|
target: str
|
|
target_type: TargetType
|
|
|
|
def __init__(self, csv:str, target:str, target_type:TargetType) -> None:
|
|
self.original = pd.read_csv(csv)
|
|
self.data = self.original
|
|
self.target = target
|
|
self.target_type = target_type
|
|
|
|
# move target to the start
|
|
col_target = self.data.pop(target)
|
|
self.data.insert(0, target, col_target)
|
|
|
|
def remove(self, columns:list[str]) -> Self:
|
|
for col in columns:
|
|
self.data.pop(col)
|
|
return self
|
|
|
|
def normalize(self, excepts:list[str]=[]) -> Self:
|
|
if excepts is None: excepts = []
|
|
else: excepts.append(self.target)
|
|
|
|
for col in self.data:
|
|
if col not in excepts:
|
|
index = self.data.columns.get_loc(col)
|
|
datacol = self.data.pop(col)
|
|
datacol = (datacol - datacol.mean()) / datacol.std()
|
|
self.data.insert(index, col, datacol)
|
|
return self
|
|
|
|
def factorize(self, columns:list[str]=[]) -> Self:
|
|
data = self.data
|
|
for col in columns:
|
|
data[col] = pd.factorize(data[col])[0]
|
|
return self
|
|
|
|
def numbers(self, columns:list[str]=[]) -> Self:
|
|
data = self.data
|
|
for col in columns:
|
|
if data[col].dtype == object:
|
|
data[col] = pd.to_numeric(data[col], errors='coerce')
|
|
return self
|
|
|
|
def handle_na(self) -> Self:
|
|
self.data = self.data.dropna()
|
|
return self
|
|
|
|
def prepare_classification(self, data:np.ndarray) -> np.ndarray:
|
|
if self.target_type == TargetType.Regression or self.target_type == TargetType.NoTarget:
|
|
return data
|
|
|
|
classes = np.unique(data[:, 0])
|
|
splitted = [data[ data[:,0] == k ] for k in classes ]
|
|
total_each = np.average([len(x) for x in splitted]).astype(int)
|
|
|
|
seed = np.random.randint(0, 4294967295)
|
|
rng = np.random.default_rng(seed)
|
|
data = []
|
|
for x in splitted:
|
|
samples = rng.choice(x, size=total_each, replace=True, shuffle=False)
|
|
data.append(samples)
|
|
|
|
return np.concatenate(data, axis=0)
|
|
|
|
def split_data_target(self, data:np.ndarray) -> tuple[np.ndarray, np.ndarray]:
|
|
target = data[:, 0] if self.target_type != TargetType.NoTarget else None
|
|
data = data[:, 1:]
|
|
if self.target_type == TargetType.MultiClassification:
|
|
target = target.astype(int)
|
|
uniques = np.unique(target).shape[0]
|
|
target = np.eye(uniques)[target]
|
|
return (data, target)
|
|
|
|
def get_dataset(self, test_frac:float=0.2, valid_frac:float=0.2) -> tuple[Data, Data, Data]:
|
|
data = self.data.to_numpy()
|
|
data = self.prepare_classification(data)
|
|
|
|
np.random.shuffle(data)
|
|
|
|
total = data.shape[0]
|
|
valid_cutoff = int(total * valid_frac)
|
|
test_cutoff = int(total * test_frac) + valid_cutoff
|
|
|
|
valid = data[:valid_cutoff]
|
|
test = data[valid_cutoff:test_cutoff]
|
|
learn = data[test_cutoff:]
|
|
|
|
l = []
|
|
for data in [learn, test, valid]:
|
|
data, target = self.split_data_target(data)
|
|
l.append(Data(data, target))
|
|
return l
|
|
|
|
class ConfusionMatrix:
|
|
matrix:np.ndarray
|
|
|
|
def __init__(self, dataset_y: np.ndarray, predictions_y:np.ndarray) -> None:
|
|
classes = len(np.unique(dataset_y))
|
|
conf_matrix = np.zeros((classes, classes), dtype=int)
|
|
|
|
for actual, prediction in zip(dataset_y, predictions_y):
|
|
conf_matrix[int(actual), int(prediction)] += 1
|
|
|
|
self.matrix = conf_matrix
|
|
self.classes = classes
|
|
self.total = dataset_y.shape[0]
|
|
self.tp = np.diagonal(conf_matrix)
|
|
self.fp = np.sum(conf_matrix, axis=0) - self.tp
|
|
self.fn = np.sum(conf_matrix, axis=1) - self.tp
|
|
self.tn = self.total - (self.tp + self.fp + self.fn)
|
|
|
|
def divide_ignore_zero(self, a:np.ndarray, b:np.ndarray) -> np.ndarray:
|
|
with np.errstate(divide='ignore', invalid='ignore'):
|
|
c = np.true_divide(a, b)
|
|
c[c == np.inf] = 0
|
|
return np.nan_to_num(c)
|
|
|
|
def accuracy_per_class(self) -> np.ndarray:
|
|
return self.tp / np.sum(self.matrix, axis=1)
|
|
|
|
def precision_per_class(self) -> np.ndarray:
|
|
return self.divide_ignore_zero(self.tp, self.tp + self.fp)
|
|
|
|
def recall_per_class(self) -> np.ndarray:
|
|
return self.divide_ignore_zero(self.tp, self.tp + self.fn)
|
|
|
|
def f1_score_per_class(self) -> np.ndarray:
|
|
prec = self.precision_per_class()
|
|
rec = self.recall_per_class()
|
|
return self.divide_ignore_zero(2 * prec * rec, prec + rec)
|
|
|
|
def specificity_per_class(self) -> np.ndarray:
|
|
return self.divide_ignore_zero(self.tn, self.tn + self.fp)
|
|
|
|
def accuracy(self) -> float:
|
|
return self.tp.sum() / self.total
|
|
|
|
def precision(self) -> float:
|
|
precision_per_class = self.precision_per_class()
|
|
support = np.sum(self.matrix, axis=1)
|
|
return np.average(precision_per_class, weights=support)
|
|
|
|
def recall(self) -> float:
|
|
recall_per_class = self.recall_per_class()
|
|
support = np.sum(self.matrix, axis=1)
|
|
return np.average(recall_per_class, weights=support)
|
|
|
|
def f1_score(self) -> float:
|
|
f1_per_class = self.f1_score_per_class()
|
|
support = np.sum(self.matrix, axis=1)
|
|
return np.average(f1_per_class, weights=support)
|
|
|
|
def specificity(self) -> float:
|
|
specificity_per_class = self.specificity_per_class()
|
|
support = np.sum(self.matrix, axis=1)
|
|
return np.average(specificity_per_class, weights=support)
|
|
|
|
|
|
if __name__ == "__main__":
|
|
ds = Dataset("datasets\\classification\\frogs.csv", "Species", TargetType.MultiClassification)
|
|
ds.remove(["Family", "Genus", "RecordID"])
|
|
ds.factorize(["Species"])
|
|
|
|
np.random.seed(0)
|
|
learn, test, valid = ds.get_dataset()
|
|
print(learn)
|
|
print(test)
|
|
print(valid)
|
|
|