Diffstat (limited to 'backend/microservice/api/ml_service.py')
 backend/microservice/api/ml_service.py | 236 ++++++++++++++++++++++++++++----
 1 file changed, 217 insertions(+), 19 deletions(-)
diff --git a/backend/microservice/api/ml_service.py b/backend/microservice/api/ml_service.py
index 7b950bcd..0aed3dc9 100644
--- a/backend/microservice/api/ml_service.py
+++ b/backend/microservice/api/ml_service.py
@@ -13,12 +16,64 @@ from copyreg import constructor
from flask import request, jsonify, render_template
from sklearn.preprocessing import LabelEncoder
from sklearn.preprocessing import OrdinalEncoder
-#import category_encoders as ce
+import category_encoders as ce
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
from dataclasses import dataclass
+import statistics as s
+from sklearn.metrics import roc_auc_score
+
+def returnColumnsInfo(dataset):
+    columnsInfo=[]
+    datafront=dataset.copy()
+    svekolone=datafront.columns
+    kategorijskekolone=datafront.select_dtypes(include=['object']).columns
+    allNullCols=0
+    for kolona in svekolone:
+        nullCount=datafront[kolona].isnull().sum()
+        if(nullCount>0):
+            allNullCols=allNullCols+1
+        if(kolona in kategorijskekolone):
+            uniquevalues=datafront[kolona].unique()
+            frontreturn={'columnName':kolona,
+                        'isNumber':False,
+                        'uniqueValues':uniquevalues.tolist(),
+                        'mean':0.0,
+                        'median':0.0,
+                        'numNulls':float(nullCount),
+                        'min':0,
+                        'max':0
+                        }
+        else:
+            # pandas mean/median/min/max are NaN-safe and avoid shadowing the
+            # min()/max() builtins (statistics.median also breaks on NaN)
+            frontreturn={'columnName':kolona,
+                        'isNumber':True,
+                        'uniqueValues':[],
+                        'mean':float(datafront[kolona].mean()),
+                        'median':float(datafront[kolona].median()),
+                        'numNulls':float(nullCount),
+                        'min':float(datafront[kolona].min()),
+                        'max':float(datafront[kolona].max())
+                        }
+        columnsInfo.append(frontreturn)
+    NullRows = datafront[datafront.isnull().any(axis=1)]
+    allNullRows=len(NullRows)
+
+    return {'columnInfo':columnsInfo,'allNullCols':allNullCols,'allNullRows':allNullRows}
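+# Minimal usage sketch (the DataFrame source here is illustrative; the real
+# callers are the Flask endpoints that receive an uploaded dataset):
+#   info = returnColumnsInfo(pd.read_csv("dataset.csv"))
+#   info['columnInfo']                        # per-column stats for the frontend
+#   info['allNullCols'], info['allNullRows']  # dataset-level null counts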
-'''
@dataclass
class TrainingResultClassification:
accuracy: float
@@ -34,23 +89,25 @@ class TrainingResultClassification:
fpr: float
tpr: float
metrics: dict
-
+'''
@dataclass
class TrainingResultRegression:
mse: float
mae: float
mape: float
rmse: float
-'''
+
@dataclass
class TrainingResult:
metrics: dict
-
+'''
def train(dataset, params, callback):
problem_type = params["type"]
data = pd.DataFrame()
for col in params["inputColumns"]:
data[col]=dataset[col]
+
+ print(data.head())
output_column = params["columnToPredict"]
data[output_column] = dataset[output_column]
#
@@ -98,7 +155,7 @@ def train(dataset, params, callback):
for col in data.columns:
if(data[col].dtype==np.object_):
            data[col]=encoder.fit_transform(data[[col]]).ravel() # OrdinalEncoder expects a 2-D input
- '''
+
elif(encoding=='hashing'):
category_columns=[]
for col in data.columns:
@@ -120,7 +177,7 @@ def train(dataset, params, callback):
if(data[col].dtype==np.object_):
category_columns.append(col)
encoder=ce.BaseNEncoder(cols=category_columns, return_df=True, base=5)
- encoder.fit_transform(data)'''
+            data=encoder.fit_transform(data) # assign back; fit_transform does not modify in place
#
# Input - output
#
@@ -130,6 +187,8 @@ def train(dataset, params, callback):
x_columns.append(col)
x = data[x_columns].values
y = data[output_column].values
+ print(x_columns)
+ print(x)
#
# Train/test split
#
@@ -139,7 +198,7 @@ def train(dataset, params, callback):
random=123
else:
random=0
x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=test, shuffle=params["shuffle"], random_state=random)
#
# Feature scaling
#
@@ -158,21 +217,30 @@ def train(dataset, params, callback):
if(problem_type=='multi-klasifikacioni'):
func=params['hiddenLayerActivationFunctions']
- funcFirst=func.pop(0)
- inputDim = len(data.columns) - 1
- classifier=tf.keras.Sequential(units=hidden_layer_neurons,input_dim=inputDim,activation=funcFirst)
- for f in func:
- classifier.add(tf.keras.layers.Dense(units=hidden_layer_neurons,activation=func))
output_func = params["outputLayerActivationFunction"]
- numberofclasses=len(output_column.unique())
- classifier.add(tf.keras.layers.Dense(units=numberofclasses,activation=output_func))
-
optimizer = params["optimizer"]
metrics=params['metrics']
loss_func=params["lossFunction"]
- classifier.compile(optimizer=optimizer, loss=loss_func,metrics=metrics)
batch_size = params["batchSize"]
epochs = params["epochs"]
+        inputDim = len(data.columns) - 1
+        classifier=tf.keras.Sequential()
+        classifier.add(tf.keras.layers.Dense(units=len(data.columns),input_dim=inputDim)) # input layer
+        for f in func: # hidden layers
+            classifier.add(tf.keras.layers.Dense(hidden_layer_neurons,activation=f))
+        # count classes from the target values, not from the column-name string
+        numberofclasses=data[output_column].nunique()
+        classifier.add(tf.keras.layers.Dense(numberofclasses,activation=output_func)) # output layer
+        classifier.compile(optimizer=optimizer, loss=loss_func,metrics=metrics)
+
history=classifier.fit(x_train, y_train, batch_size=batch_size, epochs=epochs, callbacks=callback(x_test, y_test))
else:
classifier=tf.keras.Sequential()
@@ -180,10 +248,12 @@ def train(dataset, params, callback):
for func in params["hiddenLayerActivationFunctions"]:
classifier.add(tf.keras.layers.Dense(units=hidden_layer_neurons,activation=func))
output_func = params["outputLayerActivationFunction"]
+
if(problem_type!="regresioni"):
classifier.add(tf.keras.layers.Dense(units=1,activation=output_func))
else:
classifier.add(tf.keras.layers.Dense(units=1))
+
optimizer = params["optimizer"]
metrics=params['metrics']
loss_func=params["lossFunction"]
@@ -202,10 +272,33 @@ def train(dataset, params, callback):
elif(problem_type == "binarni-klasifikacioni"):
y_pred=classifier.predict(x_test)
y_pred=(y_pred>=0.5).astype('int')
-
+ elif(problem_type=='multi-klasifikacioni'):
+ y_pred=classifier.predict(x_test)
+ y_pred=np.argmax(y_pred,axis=1)
+
y_pred=y_pred.flatten()
result=pd.DataFrame({"Actual":y_test,"Predicted":y_pred})
classifier.save("temp/"+model_name, save_format='h5')
+    # ROC for the multi-class ('multi-klasifikacioni') case
+ def roc_auc_score_multiclass(actual_class, pred_class, average = "macro"):
+
+ #creating a set of all the unique classes using the actual class list
+ unique_class = set(actual_class)
+ roc_auc_dict = {}
+ for per_class in unique_class:
+
+ #creating a list of all the classes except the current class
+ other_class = [x for x in unique_class if x != per_class]
+
+ #marking the current class as 1 and all other classes as 0
+ new_actual_class = [0 if x in other_class else 1 for x in actual_class]
+ new_pred_class = [0 if x in other_class else 1 for x in pred_class]
+
+ #using the sklearn metrics method to calculate the roc_auc_score
+ roc_auc = roc_auc_score(new_actual_class, new_pred_class, average = average)
+ roc_auc_dict[per_class] = roc_auc
+
+ return roc_auc_dict
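+    # Hand-checked example (one-vs-rest AUC from hard labels), assuming integer
+    # class labels as produced by the argmax above:
+    #   roc_auc_score_multiclass([0,1,2,1], [0,2,2,1])
+    #   -> {0: 1.0, 1: 0.75, 2: 0.8333...}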
#
# Metrics
#
@@ -255,5 +348,110 @@ def train(dataset, params, callback):
"r2" : r2,
"adj_r2" : adj_r2
}
+    elif(problem_type=="multi-klasifikacioni"):
+
+        cr=sm.classification_report(y_test, y_pred)
+        cm=sm.confusion_matrix(y_test,y_pred)
+        # https://www.kaggle.com/code/nkitgupta/evaluation-metrics-for-multi-class-classification/notebook
+        # sklearn.metrics is addressed via its sm alias here: the local name
+        # 'metrics' was rebound to params['metrics'] earlier in train()
+        accuracy=sm.accuracy_score(y_test, y_pred)
+        macro_averaged_precision=sm.precision_score(y_test, y_pred, average = 'macro')
+        micro_averaged_precision=sm.precision_score(y_test, y_pred, average = 'micro')
+        macro_averaged_recall=sm.recall_score(y_test, y_pred, average = 'macro')
+        micro_averaged_recall=sm.recall_score(y_test, y_pred, average = 'micro')
+        macro_averaged_f1=sm.f1_score(y_test, y_pred, average = 'macro')
+        micro_averaged_f1=sm.f1_score(y_test, y_pred, average = 'micro')
+        roc_auc_dict=roc_auc_score_multiclass(y_test, y_pred)
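+        # Sketch: collect the scores into a dict shaped like the metrics built
+        # by the other branches (these key names are assumptions, not an
+        # established contract with the backend):
+        multiclass_metrics = {
+            "accuracy" : accuracy,
+            "macro_precision" : macro_averaged_precision,
+            "micro_precision" : micro_averaged_precision,
+            "macro_recall" : macro_averaged_recall,
+            "micro_recall" : micro_averaged_recall,
+            "macro_f1" : macro_averaged_f1,
+            "micro_f1" : micro_averaged_f1,
+            "roc_auc_per_class" : roc_auc_dict
+        }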
+
+
# TODO upload the trained model back to the backend
-    return TrainingResult(metrics)
\ No newline at end of file
+ #return TrainingResult(metrics)
+
+
+def manageH5(datain,params,h5model):
+ dataset=datain.copy()
+ problem_type = params["type"]
+ data = pd.DataFrame()
+ for col in params["inputColumns"]:
+ data[col]=dataset[col]
+ output_column = params["columnToPredict"]
+ data[output_column] = dataset[output_column]
+ #
+    # Null handling: drop columns / rows, or replace values
+    # Example payload from the frontend:
+    #nullreplace=[
+    # {"column":"Embarked","value":"C","deleteRow":false,"deleteCol":true},
+    # {"column": "Cabin","value":"C123","deleteRow":"0","deleteCol":"0"}]
+
+ null_value_options = params["nullValues"]
+ null_values_replacers = params["nullValuesReplacers"]
+
+ if(null_value_options=='replace'):
+ print("replace null") # TODO
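+        # Sketch for the TODO above, assuming null_values_replacers matches the
+        # frontend shape shown in the comment (column/value/deleteRow/deleteCol):
+        # for rep in null_values_replacers:
+        #     if rep.get("deleteCol") in (True, "1", 1):
+        #         data=data.drop(columns=[rep["column"]])
+        #     elif rep.get("deleteRow") in (True, "1", 1):
+        #         data=data.dropna(subset=[rep["column"]])
+        #     else:
+        #         data[rep["column"]]=data[rep["column"]].fillna(rep["value"])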
+ elif(null_value_options=='delete_rows'):
+ data=data.dropna()
+ elif(null_value_options=='delete_columns'):
+        data=data.dropna(axis=1) # drop null columns here, not rows
+ #
+ #print(data.isnull().any())
+ #
+    # Dropping columns that do not affect the result (unique per row)
+ #
+ num_rows=data.shape[0]
+ for col in data.columns:
+ if((data[col].nunique()==(num_rows)) and (data[col].dtype==np.object_)):
+ data.pop(col)
+ #
+    # Encoding
+ # https://www.analyticsvidhya.com/blog/2020/08/types-of-categorical-data-encoding/
+ #
+ encoding=params["encoding"]
+ if(encoding=='label'):
+ encoder=LabelEncoder()
+ for col in data.columns:
+ if(data[col].dtype==np.object_):
+ data[col]=encoder.fit_transform(data[col])
+ elif(encoding=='onehot'):
+ category_columns=[]
+ for col in data.columns:
+ if(data[col].dtype==np.object_):
+ category_columns.append(col)
+ data=pd.get_dummies(data, columns=category_columns, prefix=category_columns)
+ elif(encoding=='ordinal'):
+ encoder = OrdinalEncoder()
+ for col in data.columns:
+ if(data[col].dtype==np.object_):
+                data[col]=encoder.fit_transform(data[[col]]).ravel() # OrdinalEncoder expects a 2-D input
+
+ elif(encoding=='hashing'):
+ category_columns=[]
+ for col in data.columns:
+ if(data[col].dtype==np.object_):
+ category_columns.append(col)
+ encoder=ce.HashingEncoder(cols=category_columns, n_components=len(category_columns))
+        data=encoder.fit_transform(data) # assign back; fit_transform does not modify in place
+ elif(encoding=='binary'):
+ category_columns=[]
+ for col in data.columns:
+ if(data[col].dtype==np.object_):
+ category_columns.append(col)
+ encoder=ce.BinaryEncoder(cols=category_columns, return_df=True)
+        data=encoder.fit_transform(data) # assign back; fit_transform does not modify in place
+
+ elif(encoding=='baseN'):
+ category_columns=[]
+ for col in data.columns:
+ if(data[col].dtype==np.object_):
+ category_columns.append(col)
+ encoder=ce.BaseNEncoder(cols=category_columns, return_df=True, base=5)
+        data=encoder.fit_transform(data) # assign back; fit_transform does not modify in place
+ #
+ # Input - output
+ #
+ x_columns = []
+ for col in data.columns:
+ if(col!=output_column):
+ x_columns.append(col)
+ x = data[x_columns].values
+ y = data[output_column].values
+
+
+    # predict_classes was removed from tf.keras in newer TF releases; use
+    # predict() and mirror the post-processing train() applies to predictions
+    y_pred=h5model.predict(x)
+    if(problem_type=='multi-klasifikacioni'):
+        y_pred=np.argmax(y_pred,axis=1)
+    elif(problem_type=='binarni-klasifikacioni'):
+        y_pred=(y_pred>=0.5).astype('int')
+    return y_pred.flatten()
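+
+# Rough usage sketch (the model path and params dict are assumptions, not a
+# fixed API): load a model saved by train() and score fresh data with the
+# same preprocessing:
+#   h5model = tf.keras.models.load_model("temp/" + model_name)
+#   predictions = manageH5(pd.read_csv("new_data.csv"), params, h5model)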