# upo-ml/src/learning/data.py

import pandas as pd
import numpy as np

from typing_extensions import Self

class Dataset:
    """Tabular dataset loaded from a CSV file and prepared for learning.

    After construction the target column is the first column and a constant
    ``"Bias"`` column (all ``1.0``) is the second; the remaining columns keep
    their CSV order. Every preprocessing method returns ``self`` so calls can
    be chained.
    """

    def __init__(self, csv: str, target: str) -> None:
        """Load *csv* and move the *target* column to the front.

        Args:
            csv: Location of the CSV file, forwarded to ``pandas.read_csv``.
            target: Name of the column to predict.

        Raises:
            KeyError: If *target* is not a column of the loaded file.
        """
        data = pd.read_csv(csv)

        # Move the target to the start and add the bias term right after it.
        col_target = data.pop(target)
        data.insert(0, target, col_target)
        data.insert(1, "Bias", 1.0)

        # Snapshot of the data as loaded; the preprocessing methods never
        # touch it.
        self.original = data
        # Working frame. It is a real copy: sharing one object with
        # `original` would let in-place edits leak into the snapshot.
        self.data = data.copy()
        self.target = target
        # A non-numeric target marks a classification problem. Checking
        # "not numeric" (rather than `dtype == object`) also covers pandas
        # versions that load text columns with a dedicated string dtype.
        self.classification = not pd.api.types.is_numeric_dtype(data[target])

    def regularize(self, excepts: "list[str] | None" = None) -> Self:
        """Standardise columns to zero mean and unit (sample) deviation.

        The target column, the ``"Bias"`` column and every column named in
        *excepts* are left untouched. Column order is preserved.

        Args:
            excepts: Extra column names to skip. The list is only read, never
                modified.

        Returns:
            This dataset, to allow chaining.
        """
        skipped = {self.target, "Bias"}
        if excepts:
            skipped.update(excepts)

        # Work on a fresh copy so the frame is never restructured while its
        # columns are being iterated.
        data = self.data.copy()
        for col in list(data.columns):
            if col in skipped:
                continue
            column = data[col]
            centered = column - column.mean()
            std = column.std()
            # A constant column has zero deviation; dividing would turn it
            # into NaN, so it is only centred (it becomes all zeros).
            data[col] = centered / std if std > 0 else centered
        self.data = data
        return self

    def factorize(self, columns: "list[str] | None" = None) -> Self:
        """Replace the values of *columns* with integer category codes.

        Codes are assigned by ``pandas.factorize`` in order of first
        appearance; missing values get the code ``-1``.

        Args:
            columns: Names of the columns to encode; nothing is done when
                omitted or empty.

        Returns:
            This dataset, to allow chaining.
        """
        data = self.data
        for col in columns or ():
            data[col] = pd.factorize(data[col])[0]
        return self

    def to_numbers(self, columns: "list[str] | None" = None) -> Self:
        """Parse non-numeric *columns* as numbers.

        Entries that cannot be parsed become NaN (``errors='coerce'``), which
        ``handle_na`` can later remove. Columns that are already numeric are
        skipped.

        Args:
            columns: Names of the columns to convert; nothing is done when
                omitted or empty.

        Returns:
            This dataset, to allow chaining.
        """
        data = self.data
        for col in columns or ():
            # "Not numeric" instead of `dtype == object`, so text columns
            # loaded with a dedicated string dtype are converted as well.
            if not pd.api.types.is_numeric_dtype(data[col]):
                data[col] = pd.to_numeric(data[col], errors='coerce')
        return self

    def handle_na(self) -> Self:
        """Drop every row that contains at least one missing value."""
        self.data = self.data.dropna()
        return self

    def shuffle(self, seed: "int | None" = None) -> Self:
        """Randomly reorder the rows.

        Args:
            seed: Optional seed forwarded as ``random_state`` to
                ``DataFrame.sample`` for a reproducible order. The default
                (``None``) keeps the order non-deterministic.

        Returns:
            This dataset, to allow chaining.
        """
        self.data = self.data.sample(frac=1, random_state=seed)
        return self

    def as_ndarray(self) -> np.ndarray:
        """Return the working data as a NumPy array (target in column 0)."""
        return self.data.to_numpy()

    def get_index(self, column: str) -> int:
        """Return the positional index of *column* in the working data.

        Raises:
            KeyError: If *column* does not exist.
        """
        return self.data.columns.get_loc(column)

class PrincipalComponentAnalisys:
    """Holder for a data matrix meant to be reduced by PCA.

    NOTE: the reduction itself is not implemented yet; ``reduce`` currently
    only normalises its arguments and leaves ``data`` unchanged.
    """

    def __init__(self, data: np.ndarray) -> None:
        """Store *data*.

        ``reduce`` reads ``data.shape[1]``, so the array must have at least
        two dimensions.
        """
        self.data = data

    def reduce(self, total: int = 0, threshold: float = 1) -> Self:
        """Normalise the reduction parameters and return this object.

        Args:
            total: Presumably the number of components to keep (TODO confirm).
                Values outside ``1..number of columns`` fall back to the
                number of columns.
            threshold: Presumably the fraction of variance to retain
                (TODO confirm). Values outside ``(0, 1]`` fall back to ``1``.

        Returns:
            This object, as promised by the ``Self`` annotation, so the call
            can be chained.
        """
        columns = self.data.shape[1]
        if total > columns or total <= 0:
            total = columns
        if threshold <= 0 or threshold > 1:
            threshold = 1

        # TODO: perform the actual projection using `total` and `threshold`;
        # until then `self.data` is returned untouched.
        return self


if __name__ == "__main__":
    # Manual smoke run: load the automobile data set, apply the whole
    # preprocessing chain and print the resulting column dtypes.
    # Forward slashes keep the relative path valid on every platform (the
    # previous backslash form only resolved on Windows). The path is relative
    # to the current working directory.
    dataset = Dataset("datasets/regression/automobile.csv", "symboling")

    # Categorical text columns: encoded as integer codes and excluded from
    # standardisation below.
    categorical = [
        "fuel-system", "engine-type", "drive-wheels", "body-style", "make",
        "engine-location", "aspiration", "fuel-type", "num-of-cylinders",
        "num-of-doors",
    ]
    # Columns passed to `to_numbers`; values that cannot be parsed become NaN
    # and their rows are removed by `handle_na`.
    numeric_text = [
        "normalized-losses", "bore", "stroke", "horsepower", "peak-rpm",
        "price",
    ]

    dataset.factorize(categorical)
    dataset.to_numbers(numeric_text)
    dataset.handle_na()
    dataset.regularize(excepts=categorical)
    print(dataset.data.dtypes)