Files
upo-ml/src/learning/data.py
Berack96 b4bd976a9d Fixes
- cleaner app.py
- fixed pandas Warning
- better learning method
- power-plant csv fixed
2024-04-21 15:01:17 +02:00

79 lines
2.5 KiB
Python

from typing import Optional

import numpy as np
import pandas as pd
from typing_extensions import Self
class Dataset:
    """Tabular dataset loaded from a CSV file and prepared for learning.

    After construction the target column is first, a constant ``Bias``
    column of ones is second, and the remaining CSV columns follow in their
    original order. The transformation methods return ``self`` so that calls
    can be chained.
    """

    def __init__(self, csv: str, target: str) -> None:
        """Load ``csv`` and move the ``target`` column to the front.

        Args:
            csv: location of the CSV file, passed straight to
                ``pandas.read_csv``.
            target: name of the column to predict.

        Raises:
            KeyError: if ``target`` is not a column of the CSV.
        """
        data = pd.read_csv(csv)

        # move target to the start
        col_target = data.pop(target)
        data.insert(0, target, col_target)
        data.insert(1, "Bias", 1.0)

        # NOTE(review): `original` and `data` reference the SAME DataFrame, so
        # the in-place edits done by factorize/to_numbers/regularize are
        # visible through both names until handle_na/shuffle rebind `data`.
        self.original = data
        self.data = data
        self.target = target
        # A non-numeric target is treated as a classification problem. The
        # numeric check (rather than `dtype == object`) also recognises text
        # columns stored with pandas' dedicated string dtype.
        self.classification = not pd.api.types.is_numeric_dtype(data[target])

    def regularize(self, excepts: Optional[list[str]] = None) -> Self:
        """Rescale columns to ``(x - mean) / std``, in place.

        The target column, the ``Bias`` column and every column named in
        ``excepts`` are left untouched.

        Args:
            excepts: extra column names to skip. The list is only read,
                never modified.

        Returns:
            This dataset, to allow chaining.
        """
        # Build a private set instead of appending to the argument: appending
        # would grow a shared default and mutate the caller's list.
        skipped = {self.target, "Bias"}
        if excepts is not None:
            skipped.update(excepts)

        # Snapshot the column names because the frame is edited in the loop.
        for col in list(self.data.columns):
            if col in skipped:
                continue
            # Remove the column and put the rescaled one back at the same
            # position so the column order is preserved.
            index = self.data.columns.get_loc(col)
            datacol = self.data.pop(col)
            datacol = (datacol - datacol.mean()) / datacol.std()
            self.data.insert(index, col, datacol)
        return self

    def factorize(self, columns: Optional[list[str]] = None) -> Self:
        """Replace the values of ``columns`` with integer codes, in place.

        Codes come from ``pandas.factorize``; only the codes are kept, the
        code-to-value mapping is discarded.

        Args:
            columns: names of the columns to encode; nothing is done when
                omitted.

        Returns:
            This dataset, to allow chaining.
        """
        data = self.data
        for col in columns or []:
            data[col] = pd.factorize(data[col])[0]
        return self

    def to_numbers(self, columns: Optional[list[str]] = None) -> Self:
        """Convert the non-numeric columns among ``columns`` to numbers.

        Conversion uses ``pandas.to_numeric`` with ``errors='coerce'``, so
        values that cannot be parsed become NaN. Columns that are already
        numeric are left as they are.

        Args:
            columns: names of the columns to convert; nothing is done when
                omitted.

        Returns:
            This dataset, to allow chaining.
        """
        data = self.data
        for col in columns or []:
            if not pd.api.types.is_numeric_dtype(data[col]):
                data[col] = pd.to_numeric(data[col], errors='coerce')
        return self

    def handle_na(self) -> Self:
        """Drop every row that contains a missing value.

        Returns:
            This dataset, to allow chaining.
        """
        self.data = self.data.dropna()
        return self

    def shuffle(self) -> Self:
        """Put the rows in a random order.

        Returns:
            This dataset, to allow chaining.
        """
        self.data = self.data.sample(frac=1)
        return self

    def as_ndarray(self) -> np.ndarray:
        """Return the current data as a NumPy array."""
        return self.data.to_numpy()

    def get_index(self, column: str) -> int:
        """Return the position of ``column`` among the current columns."""
        return self.data.columns.get_loc(column)
class PrincipalComponentAnalisys:
    """Holder for a data matrix on which a component reduction is requested.

    NOTE(review): only the argument validation of ``reduce`` exists so far;
    no decomposition or projection is performed by this class yet.
    """

    def __init__(self, data: np.ndarray) -> None:
        """Store ``data``; its second axis is read as the column count."""
        self.data = data

    def reduce(self, total: int = 0, threshold: float = 1) -> Self:
        """Validate the reduction parameters and return this object.

        Args:
            total: requested number of columns; values outside
                ``1..number of columns`` fall back to the number of columns.
            threshold: value expected in ``(0, 1]``; anything else falls back
                to ``1``. Presumably the share of variance to keep -
                TODO confirm once the reduction is implemented.

        Returns:
            This object, as promised by the annotation, so calls can be
            chained. ``self.data`` is left unchanged.
        """
        columns = self.data.shape[1]
        if total > columns or total <= 0:
            total = columns
        if threshold <= 0 or threshold > 1:
            threshold = 1
        # TODO: the actual reduction using `total` and `threshold` is not
        # implemented; the clamped values above are currently unused.
        return self
if __name__ == "__main__":
    # Manual smoke test: load the automobile dataset, clean it up and print
    # the resulting column dtypes.
    # Forward slashes are accepted on Windows as well as on POSIX systems,
    # whereas a backslash-separated path only resolves on Windows.
    dataset = Dataset("datasets/regression/automobile.csv", "symboling")

    # Text columns that get encoded as integer codes.
    attributes_to_modify = [
        "fuel-system", "engine-type", "drive-wheels", "body-style", "make",
        "engine-location", "aspiration", "fuel-type", "num-of-cylinders",
        "num-of-doors",
    ]
    # Columns passed to to_numbers for conversion to numeric values.
    numeric_columns = [
        "normalized-losses", "bore", "stroke", "horsepower", "peak-rpm",
        "price",
    ]

    dataset.factorize(attributes_to_modify)
    dataset.to_numbers(numeric_columns)
    dataset.handle_na()
    # The integer-coded columns are excluded from the rescaling.
    dataset.regularize(excepts=attributes_to_modify)
    print(dataset.data.dtypes)