aboutsummaryrefslogtreecommitdiff
path: root/backend
diff options
context:
space:
mode:
Diffstat (limited to 'backend')
-rw-r--r--backend/microservice/__pycache__/mlservice.cpython-310.pycbin5009 -> 7405 bytes
-rw-r--r--backend/microservice/api.py2
-rw-r--r--backend/microservice/api/ml_service.py127
-rw-r--r--backend/microservice/mlservice.py32
4 files changed, 135 insertions, 26 deletions
diff --git a/backend/microservice/__pycache__/mlservice.cpython-310.pyc b/backend/microservice/__pycache__/mlservice.cpython-310.pyc
index c079459a..ac93f3db 100644
--- a/backend/microservice/__pycache__/mlservice.cpython-310.pyc
+++ b/backend/microservice/__pycache__/mlservice.cpython-310.pyc
Binary files differ
diff --git a/backend/microservice/api.py b/backend/microservice/api.py
index 4768f34c..9a28b159 100644
--- a/backend/microservice/api.py
+++ b/backend/microservice/api.py
@@ -9,7 +9,7 @@ import csv
import json
import mlservice
import h5py
-from mlservice2 import unositok
+from mlservice import unositok
app = flask.Flask(__name__)
diff --git a/backend/microservice/api/ml_service.py b/backend/microservice/api/ml_service.py
index ea562212..c7082454 100644
--- a/backend/microservice/api/ml_service.py
+++ b/backend/microservice/api/ml_service.py
@@ -1,4 +1,5 @@
import pandas as pd
+from sklearn import datasets
import tensorflow as tf
import keras
import numpy as np
@@ -11,12 +12,15 @@ from typing_extensions import Self
from copyreg import constructor
from flask import request, jsonify, render_template
from sklearn.preprocessing import LabelEncoder
+from sklearn.preprocessing import OrdinalEncoder
+import category_encoders as ce
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
from dataclasses import dataclass
+'''
@dataclass
-class TrainingResult:
+class TrainingResultClassification:
accuracy: float
precision: float
recall: float
@@ -26,12 +30,21 @@ class TrainingResult:
tp: float
specificity: float
f1: float
+ logloss: float
+ fpr: float
+ tpr: float
+ metrics: dict
+
+@dataclass
+class TrainingResultRegression:
mse: float
mae: float
mape: float
rmse: float
- fpr: float
- tpr: float
+'''
+@dataclass
+class TrainingResult:
+ metrics: dict
def train(dataset, params, callback):
problem_type = params["type"]
@@ -66,6 +79,7 @@ def train(dataset, params, callback):
data.pop(col)
#
# Enkodiranje
+ # https://www.analyticsvidhya.com/blog/2020/08/types-of-categorical-data-encoding/
#
encoding=params["encoding"]
if(encoding=='label'):
@@ -79,6 +93,32 @@ def train(dataset, params, callback):
if(data[col].dtype==np.object_):
category_columns.append(col)
data=pd.get_dummies(data, columns=category_columns, prefix=category_columns)
+ elif(encoding=='ordinal'):
+ encoder = OrdinalEncoder()
+ for col in data.columns:
+ if(data[col].dtype==np.object_):
+                data[col]=encoder.fit_transform(data[[col]])
+ elif(encoding=='hashing'):
+ category_columns=[]
+ for col in data.columns:
+ if(data[col].dtype==np.object_):
+ category_columns.append(col)
+ encoder=ce.HashingEncoder(cols=category_columns, n_components=len(category_columns))
+        data=encoder.fit_transform(data)
+ elif(encoding=='binary'):
+ category_columns=[]
+ for col in data.columns:
+ if(data[col].dtype==np.object_):
+ category_columns.append(col)
+ encoder=ce.BinaryEncoder(cols=category_columns, return_df=True)
+        data=encoder.fit_transform(data)
+ elif(encoding=='baseN'):
+ category_columns=[]
+ for col in data.columns:
+ if(data[col].dtype==np.object_):
+ category_columns.append(col)
+ encoder=ce.BaseNEncoder(cols=category_columns, return_df=True, base=5)
+        data=encoder.fit_transform(data)
#
# Input - output
#
@@ -94,17 +134,20 @@ def train(dataset, params, callback):
test=params["randomTestSetDistribution"]
randomOrder = params["randomOrder"]
if(randomOrder):
- random=50
+ random=123
else:
random=0
- x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=test, random_state=random)
+ x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=test, shuffle=params["shuffle"], random_state=random)
#
# Skaliranje vrednosti
#
+ '''
scaler=StandardScaler()
scaler.fit(x_train)
x_test=scaler.transform(x_test)
x_train=scaler.transform(x_train)
+ '''
+
#
# Treniranje modela
#
@@ -113,7 +156,10 @@ def train(dataset, params, callback):
for func in params["hiddenLayerActivationFunctions"]:
classifier.add(tf.keras.layers.Dense(units=hidden_layer_neurons,activation=func))
output_func = params["outputLayerActivationFunction"]
- classifier.add(tf.keras.layers.Dense(units=1,activation=output_func))
+ if(problem_type!="regresioni"):
+ classifier.add(tf.keras.layers.Dense(units=1,activation=output_func))
+ else:
+ classifier.add(tf.keras.layers.Dense(units=1))
optimizer = params["optimizer"]
metrics=params['metrics']
loss_func=params["lossFunction"]
@@ -125,11 +171,14 @@ def train(dataset, params, callback):
# Test
#
model_name = params['_id']
- y_pred=classifier.predict(x_test)
+ #y_pred=classifier.predict(x_test)
if(problem_type == "regresioni"):
- classifier.evaluate(x_test, y_test)
- elif(problem_type == "binarni-klasifikacioni"):
+ y_pred=classifier.predict(x_test)
+ print(classifier.evaluate(x_test, y_test))
+ elif(problem_type == "binarni-klasifikacioni"):
+ y_pred=classifier.predict(x_test)
y_pred=(y_pred>=0.5).astype('int')
+
y_pred=y_pred.flatten()
result=pd.DataFrame({"Actual":y_test,"Predicted":y_pred})
classifier.save("temp/"+model_name, save_format='h5')
@@ -139,20 +188,48 @@ def train(dataset, params, callback):
print("HELLO???")
print(result)
print("HELLO???")
- accuracy = float(sm.accuracy_score(y_test,y_pred))
- precision = float(sm.precision_score(y_test,y_pred))
- recall = float(sm.recall_score(y_test,y_pred))
- tn, fp, fn, tp = sm.confusion_matrix(y_test,y_pred).ravel()
- specificity = float(tn / (tn+fp))
- f1 = float(sm.f1_score(y_test,y_pred))
- mse = float(sm.mean_squared_error(y_test,y_pred))
- mae = float(sm.mean_absolute_error(y_test,y_pred))
- mape = float(sm.mean_absolute_percentage_error(y_test,y_pred))
- rmse = float(np.sqrt(sm.mean_squared_error(y_test,y_pred)))
- fpr, tpr, _ = sm.roc_curve(y_test,y_pred)
+ if(problem_type=="binarni-klasifikacioni"):
+ accuracy = float(sm.accuracy_score(y_test,y_pred))
+ precision = float(sm.precision_score(y_test,y_pred))
+ recall = float(sm.recall_score(y_test,y_pred))
+ tn, fp, fn, tp = sm.confusion_matrix(y_test,y_pred).ravel()
+ specificity = float(tn / (tn+fp))
+ f1 = float(sm.f1_score(y_test,y_pred))
+ fpr, tpr, _ = sm.roc_curve(y_test,y_pred)
+ logloss = float(sm.log_loss(y_test, y_pred))
+ metrics= {"accuracy" : accuracy,
+ "precision" : precision,
+ "recall" : recall,
+ "specificity" : specificity,
+ "f1" : f1,
+ "tn" : float(tn),
+ "fp" : float(fp),
+ "fn" : float(fn),
+ "tp" : float(tp),
+ "fpr" : fpr.tolist(),
+ "tpr" : tpr.tolist(),
+ "logloss" : logloss
+ }
+ elif(problem_type=="regresioni"):
+ # https://www.analyticsvidhya.com/blog/2021/05/know-the-best-evaluation-metrics-for-your-regression-model/
+ mse = float(sm.mean_squared_error(y_test,y_pred))
+ mae = float(sm.mean_absolute_error(y_test,y_pred))
+ mape = float(sm.mean_absolute_percentage_error(y_test,y_pred))
+ rmse = float(np.sqrt(sm.mean_squared_error(y_test,y_pred)))
+        rmsle = float(np.sqrt(sm.mean_squared_log_error(y_test, y_pred)))
+ r2 = float(sm.r2_score(y_test, y_pred))
+ # n - num of observations
+ # k - num of independent variables
+        n = len(y_test)
+        k = x_test.shape[1]
+ adj_r2 = float(1 - ((1-r2)*(n-1)/(n-k-1)))
+ metrics= {"mse" : mse,
+ "mae" : mae,
+ "mape" : mape,
+ "rmse" : rmse,
+ "rmsle" : rmsle,
+ "r2" : r2,
+ "adj_r2" : adj_r2
+ }
# TODO upload trenirani model nazad na backend
- return TrainingResult(accuracy, precision, recall, float(tn), float(fp), float(fn), float(tp), specificity, f1, mse, mae, mape, rmse, fpr.tolist(), tpr.tolist())
-
-
-
-
+ return TrainingResult(metrics) \ No newline at end of file
diff --git a/backend/microservice/mlservice.py b/backend/microservice/mlservice.py
index b2eafe9a..8f56fc3f 100644
--- a/backend/microservice/mlservice.py
+++ b/backend/microservice/mlservice.py
@@ -54,6 +54,38 @@ def obuka(dataunos,params,modelunos,dataunosdrugog):
data[zeljenekolone[i]]=dataunos[zeljenekolone[i]]
#print(data.head(10))
+ ### 0.1) Povratne vrednosti statistike za front (za popunjavanje null vrednosti izabranih kolona) PART4
+ datafront=data.copy()
+ svekolone=datafront.columns
+ kategorijskekolone=datafront.select_dtypes(include=['object']).columns
+ #print(kategorijskekolone )
+ #kategorijskekolone=datacategorical.columns
+ #print(svekolone)
+ for i in range(len(svekolone)):
+ nazivkolone=svekolone[i]
+ if(nazivkolone in kategorijskekolone):
+ svekategorije=datafront[nazivkolone].unique()
+ medijana=None
+ srednjavrednost=None
+ frontreturn={'colName':nazivkolone,
+ 'colType':'categorical',
+ 'categoricalValues':svekategorije,
+ 'mean':medijana,
+ 'average':srednjavrednost
+ }
+ else:
+ svekategorije=None
+ medijana=datafront[nazivkolone].mean()
+ srednjavrednost=sum(datafront[nazivkolone])/len(datafront[nazivkolone])
+ frontreturn={'colName':nazivkolone,
+ 'colType':'noncategorical',
+ 'categoricalValues':svekategorije,
+ 'mean':medijana,
+ 'average':srednjavrednost
+ }
+
+ print(frontreturn)
+
#predvidetikol=input("UNETI NAZIV KOLONE ČIJU VREDNOST TREBA PREDVIDETI ")
###sta se cuva od promenjivih broj kolone ili naziv kolone???