Refactor Dataset

- better finalize function
- support for one-hot-encoding
This commit is contained in:
2024-05-02 14:19:23 +02:00
parent 969338196b
commit 3a4e07afc8
4 changed files with 118 additions and 98 deletions

View File

@@ -1,43 +1,43 @@
from learning.data import Dataset
from learning.supervised import LinearRegression, LogisticRegression, MultiLogisticRegression
from learning.data import Dataset, TargetType
from learning.supervised import LinearRegression, LogisticRegression, MultiLayerPerceptron
from learning.ml import MLAlgorithm
from typing import Callable
def auto_mpg() -> tuple[int, MLAlgorithm]:
    """Prepare the auto-mpg dataset and wrap it in a linear regression.

    Only the post-refactor statements are kept: the dataset is built once
    with an explicit ``TargetType.Regression`` and handed to the learner
    directly instead of through ``as_ndarray()``.

    Returns:
        A ``(1000, LinearRegression)`` pair. The integer is consumed by the
        caller (presumably an epoch/iteration count -- confirm against
        ``learn_dataset``).
    """
    # NOTE(review): the path uses Windows-style separators.
    ds = Dataset("datasets\\auto-mpg.csv", "MPG", TargetType.Regression)
    # "HP" goes through Dataset.numbers (presumably a numeric conversion --
    # confirm in learning.data) before missing values are handled.
    ds.numbers(["HP"])
    ds.handle_na()
    # The listed columns are passed as exceptions to normalization.
    ds.normalize(excepts=["Cylinders", "Year", "Origin"])
    return (1000, LinearRegression(ds, learning_rate=0.0001))
def automobile() -> tuple[int, MLAlgorithm]:
    """Prepare the automobile dataset and wrap it in a linear regression.

    Only the post-refactor statements are kept: the dataset is built once
    with an explicit ``TargetType.Regression`` and handed to the learner
    directly instead of through ``as_ndarray()``.

    Returns:
        A ``(1000, LinearRegression)`` pair. The integer is consumed by the
        caller (presumably an epoch/iteration count -- confirm against
        ``learn_dataset``).
    """
    # NOTE(review): the path uses Windows-style separators.
    ds = Dataset(
        "datasets\\regression\\automobile.csv",
        "symboling",
        TargetType.Regression,
    )
    # These columns are factorized and later excluded from normalization.
    attributes_to_modify = [
        "fuel-system",
        "engine-type",
        "drive-wheels",
        "body-style",
        "make",
        "engine-location",
        "aspiration",
        "fuel-type",
        "num-of-cylinders",
        "num-of-doors",
    ]
    ds.factorize(attributes_to_modify)
    # These columns go through Dataset.numbers (presumably a numeric
    # conversion -- confirm in learning.data) before NA handling.
    ds.numbers(
        ["normalized-losses", "bore", "stroke", "horsepower", "peak-rpm", "price"]
    )
    ds.handle_na()
    ds.normalize(excepts=attributes_to_modify)
    return (1000, LinearRegression(ds, learning_rate=0.004))
def power_plant() -> tuple[int, MLAlgorithm]:
    """Prepare the power-plant dataset and wrap it in a linear regression.

    Only the post-refactor statements are kept: the dataset is built once
    with an explicit ``TargetType.Regression``, normalized, and handed to
    the learner directly instead of through ``as_ndarray()``.

    Returns:
        An ``(80, LinearRegression)`` pair. The integer is consumed by the
        caller (presumably an epoch/iteration count -- confirm against
        ``learn_dataset``).
    """
    # NOTE(review): the path uses Windows-style separators.
    ds = Dataset(
        "datasets\\regression\\power-plant.csv",
        "energy-output",
        TargetType.Regression,
    )
    ds.normalize()
    return (80, LinearRegression(ds, learning_rate=0.1))
def electrical_grid() -> tuple[int, MLAlgorithm]:
    """Prepare the electrical-grid dataset and wrap it in a logistic regression.

    Only the post-refactor statements are kept: the dataset is built once
    with an explicit ``TargetType.Classification`` and handed to the learner
    directly instead of through ``as_ndarray()``.

    Returns:
        A ``(1000, LogisticRegression)`` pair. The integer is consumed by
        the caller (presumably an epoch/iteration count -- confirm against
        ``learn_dataset``).
    """
    # NOTE(review): the path uses Windows-style separators.
    ds = Dataset(
        "datasets\\classification\\electrical_grid.csv",
        "stabf",
        TargetType.Classification,
    )
    # The target column itself is factorized before normalization.
    ds.factorize(["stabf"])
    ds.normalize()
    return (1000, LogisticRegression(ds, learning_rate=0.08))
def frogs() -> tuple[int, MLAlgorithm]:
    """Prepare the frogs dataset and wrap it in a multi-layer perceptron.

    Only the post-refactor statements are kept: the dataset is built once
    with an explicit ``TargetType.MultiClassification`` and the learner is
    ``MultiLayerPerceptron`` (the earlier ``MultiLogisticRegression`` return
    made this one unreachable).

    Returns:
        A ``(1000, MultiLayerPerceptron)`` pair. The integer is consumed by
        the caller (presumably an epoch/iteration count -- confirm against
        ``learn_dataset``).
    """
    # NOTE(review): the path uses Windows-style separators.
    ds = Dataset(
        "datasets\\classification\\frogs.csv",
        "Species",
        TargetType.MultiClassification,
    )
    # Drop these three columns, then factorize the target column.
    ds.remove(["Family", "Genus", "RecordID"])
    ds.factorize(["Species"])
    return (1000, MultiLayerPerceptron(ds, learning_rate=0.08))
@@ -55,5 +55,5 @@ def learn_dataset(function:Callable[..., tuple[int, MLAlgorithm]], epochs:int=10
return ml
if __name__ == "__main__":
    # Run a single experiment. The earlier electrical_grid run was discarded
    # as soon as `ml` was reassigned, so only the automobile run is kept.
    ml = learn_dataset(automobile)
    # Print whatever ml.accuracy reports for the learner's own testset.
    print(ml.accuracy(ml.testset))