Diffstat (limited to 'backend')
-rw-r--r--  backend/api/api/Controllers/FileController.cs               103
-rw-r--r--  backend/api/api/Controllers/PredictorController.cs           10
-rw-r--r--  backend/api/api/Models/FileModel.cs                           1
-rw-r--r--  backend/api/api/Models/PredictorColumns.cs                    8
-rw-r--r--  backend/api/api/Services/PredictorService.cs                  2
-rw-r--r--  backend/api/api/Services/UserService.cs                       2
-rw-r--r--  backend/microservice/__pycache__/mlservice.cpython-310.pyc  bin 5009 -> 7405 bytes
-rw-r--r--  backend/microservice/api.py                                   2
-rw-r--r--  backend/microservice/api/controller.py                       22
-rw-r--r--  backend/microservice/api/ml_service.py                      363
-rw-r--r--  backend/microservice/api/newmlservice.py                    424
-rw-r--r--  backend/microservice/mlservice.py                            32
12 files changed, 924 insertions(+), 45 deletions(-)
diff --git a/backend/api/api/Controllers/FileController.cs b/backend/api/api/Controllers/FileController.cs
index 0fe8415b..d29c5676 100644
--- a/backend/api/api/Controllers/FileController.cs
+++ b/backend/api/api/Controllers/FileController.cs
@@ -4,6 +4,7 @@ using api.Services;
using Microsoft.AspNetCore.Authorization;
using Microsoft.AspNetCore.Mvc;
using Microsoft.Net.Http.Headers;
+
namespace api.Controllers
{
[Route("api/[controller]")]
@@ -11,6 +12,7 @@ namespace api.Controllers
public class FileController : ControllerBase
{
private string[] permittedExtensions = { ".csv" };
+        private string[] permittedExtensionsH5 = { ".h5" };// an array so further extensions (e.g. .h4) can be added later
private readonly IConfiguration _configuration;
private IJwtToken _token;
private IFileService _fileservice;
@@ -22,6 +24,77 @@ namespace api.Controllers
}
+ [HttpPost("h5")]
+ [Authorize(Roles = "User,Guest")]
+ public async Task<ActionResult<string>> H5Upload([FromForm] IFormFile file)
+ {
+
+ //get username from jwtToken
+ string uploaderId;
+ string folderName;
+ var header = Request.Headers[HeaderNames.Authorization];
+ if (AuthenticationHeaderValue.TryParse(header, out var headerValue))
+ {
+
+ var scheme = headerValue.Scheme;
+ var parameter = headerValue.Parameter;
+ uploaderId = _token.TokenToId(parameter);
+ if (uploaderId == null)
+ return null;
+ }
+ else
+ return BadRequest();
+ if (uploaderId == "")
+ {
+ folderName = "TempFiles";
+ }
+ else
+ {
+ folderName = "UploadedFiles";
+ }
+
+
+ //Check filetype
+ var filename = file.FileName;
+ var ext = Path.GetExtension(filename).ToLowerInvariant();
+ var name = Path.GetFileNameWithoutExtension(filename).ToLowerInvariant();
+ if (string.IsNullOrEmpty(ext) || !permittedExtensionsH5.Contains(ext))
+ {
+ return BadRequest("Wrong file type");
+ }
+ var folderPath = Path.Combine(Directory.GetCurrentDirectory(), folderName, uploaderId);
+ //Check Directory
+ if (!Directory.Exists(folderPath))
+ {
+ Directory.CreateDirectory(folderPath);
+ }
+ //Index file if same filename
+ var fullPath = Path.Combine(folderPath, filename);
+ int i = 0;
+
+ while (System.IO.File.Exists(fullPath))
+ {
+ i++;
+ fullPath = Path.Combine(folderPath, name + i.ToString() + ext);
+ }
+
+
+ //Write file
+ using (var stream = new FileStream(fullPath, FileMode.Create))
+ {
+ await file.CopyToAsync(stream);
+ }
+ FileModel fileModel = new FileModel();
+ fileModel.type = "h5";
+ fileModel.path = fullPath;
+ fileModel.uploaderId = uploaderId;
+ fileModel.date = DateTime.Now.ToUniversalTime();
+ fileModel = _fileservice.Create(fileModel);
+
+
+ return Ok(fileModel);
+ }
+
[HttpPost("Csv")]
[Authorize(Roles = "User,Guest")]
@@ -81,6 +154,7 @@ namespace api.Controllers
await file.CopyToAsync(stream);
}
FileModel fileModel= new FileModel();
+ fileModel.type = "csv";
fileModel.path=fullPath;
fileModel.uploaderId= uploaderId;
fileModel.date = DateTime.Now.ToUniversalTime();
@@ -90,6 +164,35 @@ namespace api.Controllers
return Ok(fileModel);
}
+
+        // the generic Download endpoint could probably be used for this as well
+ [HttpGet("downloadh5")]
+ [Authorize(Roles = "User,Guest")]
+ public async Task<ActionResult> DownloadH5(string id)
+ {
+ //Get Username
+ string uploaderId;
+ var header = Request.Headers[HeaderNames.Authorization];
+ if (AuthenticationHeaderValue.TryParse(header, out var headerValue))
+ {
+
+ var scheme = headerValue.Scheme;
+ var parameter = headerValue.Parameter;
+ uploaderId = _token.TokenToId(parameter);
+ if (uploaderId == null)
+ return null;
+ }
+ else
+ return BadRequest();
+
+ string filePath = _fileservice.GetFilePath(id, uploaderId);
+ if (filePath == null)
+ return BadRequest();
+
+ return File(System.IO.File.ReadAllBytes(filePath), "application/octet-stream", Path.GetFileName(filePath));
+
+ }
+
[HttpGet("Download")]
[Authorize(Roles = "User,Guest")]
public async Task<ActionResult> DownloadFile(string id)
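The duplicate-name handling in H5Upload probes FileName.h5, FileName1.h5, FileName2.h5, ... until a free name is found. A minimal Python sketch of the same loop (unique_path is a hypothetical helper, not part of this codebase):

    import os

    def unique_path(folder: str, filename: str) -> str:
        # Append a counter to the stem until the path is free.
        name, ext = os.path.splitext(filename)
        candidate = os.path.join(folder, filename)
        i = 0
        while os.path.exists(candidate):
            i += 1
            candidate = os.path.join(folder, f"{name}{i}{ext}")
        return candidate
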
diff --git a/backend/api/api/Controllers/PredictorController.cs b/backend/api/api/Controllers/PredictorController.cs
index cdc14632..161271e2 100644
--- a/backend/api/api/Controllers/PredictorController.cs
+++ b/backend/api/api/Controllers/PredictorController.cs
@@ -77,7 +77,7 @@ namespace api.Controllers
// GET api/<PredictorController>/getpredictor/{name}
[HttpGet("getpredictor/{id}")]
- [Authorize(Roles = "User")]
+ [Authorize(Roles = "User,Guest")]
public ActionResult<Predictor> GetPredictor(string id)
{
string username;
@@ -188,8 +188,8 @@ namespace api.Controllers
// POST api/<PredictorController>/usepredictor {predictor,inputs}
[HttpPost("usepredictor/{id}")]
- [Authorize(Roles = "User")]
- public ActionResult UsePredictor(String id, [FromBody] String[] inputs)
+ [Authorize(Roles = "User,Guest")]
+ public ActionResult UsePredictor(String id, [FromBody] PredictorColumns[] inputs)
{
string username;
@@ -207,8 +207,8 @@ namespace api.Controllers
Predictor predictor = _predictorService.GetPredictor(username, id);
- foreach(String i in inputs)
- Debug.WriteLine(i);
+ foreach(PredictorColumns i in inputs)
+ Debug.WriteLine(i.value.ToString());
return NoContent();
}
diff --git a/backend/api/api/Models/FileModel.cs b/backend/api/api/Models/FileModel.cs
index 1043309d..47b12110 100644
--- a/backend/api/api/Models/FileModel.cs
+++ b/backend/api/api/Models/FileModel.cs
@@ -8,6 +8,7 @@ namespace api.Models
[BsonId]
[BsonRepresentation(BsonType.ObjectId)]
public string _id { get; set; }
+ public string type { get; set; }
public string uploaderId { get; set; }
public string path { get; set; }
[BsonDateTimeOptions(Kind = DateTimeKind.Utc)]
diff --git a/backend/api/api/Models/PredictorColumns.cs b/backend/api/api/Models/PredictorColumns.cs
new file mode 100644
index 00000000..82f3e979
--- /dev/null
+++ b/backend/api/api/Models/PredictorColumns.cs
@@ -0,0 +1,8 @@
+namespace api.Models
+{
+ public class PredictorColumns
+ {
+ public String name { get; set; }
+ public String value { get; set; }
+ }
+}
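UsePredictor now binds the request body to PredictorColumns[] rather than String[], so every input arrives as a name/value pair. A hedged sketch of the new payload (host, port, token and predictor id are placeholders):

    import requests

    token = "<jwt-with-User-or-Guest-role>"
    body = [
        {"name": "Age", "value": "42"},
        {"name": "Sex", "value": "male"},
    ]
    r = requests.post(
        "http://localhost:5000/api/Predictor/usepredictor/<predictor-id>",
        json=body,
        headers={"Authorization": f"Bearer {token}"},
    )
    print(r.status_code)  # the action currently only logs the values and returns 204 NoContent
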
diff --git a/backend/api/api/Services/PredictorService.cs b/backend/api/api/Services/PredictorService.cs
index 01bc8359..b15255ac 100644
--- a/backend/api/api/Services/PredictorService.cs
+++ b/backend/api/api/Services/PredictorService.cs
@@ -42,7 +42,7 @@ namespace api.Services
}
public Predictor GetPredictor(string username, string id)
{
- return _predictor.Find(predictor => predictor.username == username && predictor._id == id).FirstOrDefault();
+ return _predictor.Find(predictor => predictor._id == id && (predictor.username == username || predictor.isPublic == true)).FirstOrDefault();
}
//last private models
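GetPredictor now matches on id alone and lets the document through when the caller owns it or it is marked public, which is what makes the Guest-enabled endpoints above work. The same rule as a plain predicate (Python sketch; field names mirror the Mongo documents):

    def can_view(predictor: dict, username: str) -> bool:
        # The owner always sees the predictor; everyone else only if it is public.
        return predictor["username"] == username or predictor.get("isPublic", False)
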
diff --git a/backend/api/api/Services/UserService.cs b/backend/api/api/Services/UserService.cs
index 7ec6f4b2..7fc4bdb1 100644
--- a/backend/api/api/Services/UserService.cs
+++ b/backend/api/api/Services/UserService.cs
@@ -50,7 +50,7 @@ namespace api.Services
//username that already exists in the database
using (var session = _client.StartSession())
{
-
+ if(username!=user.Username)
if(_users.Find(u => u.Username == user.Username).FirstOrDefault()!=null)
{
return false;
diff --git a/backend/microservice/__pycache__/mlservice.cpython-310.pyc b/backend/microservice/__pycache__/mlservice.cpython-310.pyc
index c079459a..ac93f3db 100644
--- a/backend/microservice/__pycache__/mlservice.cpython-310.pyc
+++ b/backend/microservice/__pycache__/mlservice.cpython-310.pyc
Binary files differ
diff --git a/backend/microservice/api.py b/backend/microservice/api.py
index 4768f34c..9a28b159 100644
--- a/backend/microservice/api.py
+++ b/backend/microservice/api.py
@@ -9,7 +9,7 @@ import csv
import json
import mlservice
import h5py
-from mlservice2 import unositok
+from mlservice import unositok
app = flask.Flask(__name__)
diff --git a/backend/microservice/api/controller.py b/backend/microservice/api/controller.py
index 059af317..1b17f727 100644
--- a/backend/microservice/api/controller.py
+++ b/backend/microservice/api/controller.py
@@ -1,7 +1,7 @@
import flask
from flask import request, jsonify
import ml_socket
-import ml_service
+import newmlservice
import tensorflow as tf
import pandas as pd
@@ -25,7 +25,7 @@ def train():
f = request.json["dataset"]
dataset = pd.read_csv(f)
#
- result = ml_service.train(dataset, request.json["model"], train_callback)
+ result = newmlservice.train(dataset, request.json["model"], train_callback)
print(result)
return jsonify(result)
@@ -34,10 +34,22 @@ def predict():
f = request.json['filepath']
dataset = pd.read_csv(f)
m = request.json['modelpath']
- #model = tf.keras.models.load_model(m)
- #
- #model.predict?
+ model = tf.keras.models.load_model(m)
+ print("********************************model loaded*******************************")
+ newmlservice.manageH5(dataset,request.json['model'],model)
+ return "done"
+
+@app.route('/preprocess',methods=['POST'])
+def returnColumnsInfo():
+ f=request.json['filepathcolinfo']
+ dataset=pd.read_csv(f)
+
+ result=newmlservice.returnColumnsInfo(dataset)
+
+ return jsonify(result)
+
+
print("App loaded.")
ml_socket.start()
app.run()
\ No newline at end of file
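The new /preprocess route feeds a CSV into returnColumnsInfo (defined in the services below) and returns per-column statistics plus null counts. An example call, assuming the Flask default host/port and a CSV path readable by the microservice:

    import requests

    resp = requests.post(
        "http://127.0.0.1:5000/preprocess",
        json={"filepathcolinfo": "UploadedFiles/example.csv"},
    )
    info = resp.json()
    print(info["allNullColl"], info["allNullRows"])       # column/row null counts
    print([c["columnName"] for c in info["columnInfo"]])  # per-column records
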
diff --git a/backend/microservice/api/ml_service.py b/backend/microservice/api/ml_service.py
index ea562212..0aed3dc9 100644
--- a/backend/microservice/api/ml_service.py
+++ b/backend/microservice/api/ml_service.py
@@ -1,4 +1,8 @@
+from cmath import nan
+from enum import unique
+from itertools import count
import pandas as pd
+from sklearn import datasets
import tensorflow as tf
import keras
import numpy as np
@@ -11,12 +15,67 @@ from typing_extensions import Self
from copyreg import constructor
from flask import request, jsonify, render_template
from sklearn.preprocessing import LabelEncoder
+from sklearn.preprocessing import OrdinalEncoder
+import category_encoders as ce
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
from dataclasses import dataclass
+import statistics as s
+from sklearn.metrics import roc_auc_score
+
+def returnColumnsInfo(dataset):
+ dict=[]
+ datafront=dataset.copy()
+ svekolone=datafront.columns
+ kategorijskekolone=datafront.select_dtypes(include=['object']).columns
+ allNullCols=0
+ for kolona in svekolone:
+ if(kolona in kategorijskekolone):
+ uniquevalues=datafront[kolona].unique()
+ mean=0
+ median=0
+ min=0
+ max=0
+ nullCount=datafront[kolona].isnull().sum()
+ if(nullCount>0):
+ allNullCols=allNullCols+1
+ frontreturn={'columnName':kolona,
+ 'isNumber':False,
+ 'uniqueValues':uniquevalues.tolist(),
+ 'mean':float(mean),
+ 'median':float(median),
+ 'numNulls':float(nullCount),
+ 'min':min,
+ 'max':max
+ }
+ dict.append(frontreturn)
+ else:
+            mean=datafront[kolona].mean()
+            median=datafront[kolona].median()
+            nullCount=datafront[kolona].isnull().sum()
+            min=datafront[kolona].min()
+            max=datafront[kolona].max()
+ if(nullCount>0):
+ allNullCols=allNullCols+1
+ frontreturn={'columnName':kolona,
+                'isNumber':True,
+ 'uniqueValues':[],
+ 'mean':float(mean),
+ 'median':float(median),
+ 'numNulls':float(nullCount),
+ 'min':min,
+ 'max':max
+ }
+ dict.append(frontreturn)
+ NullRows = datafront[datafront.isnull().any(axis=1)]
+ #print(NullRows)
+ #print(len(NullRows))
+ allNullRows=len(NullRows)
+
+ return {'columnInfo':dict,'allNullColl':allNullCols,'allNullRows':allNullRows}
@dataclass
-class TrainingResult:
+class TrainingResultClassification:
accuracy: float
precision: float
recall: float
@@ -26,18 +85,29 @@ class TrainingResult:
tp: float
specificity: float
f1: float
+ logloss: float
+ fpr: float
+ tpr: float
+ metrics: dict
+'''
+@dataclass
+class TrainingResultRegression:
mse: float
mae: float
mape: float
rmse: float
- fpr: float
- tpr: float
+@dataclass
+class TrainingResult:
+ metrics: dict
+'''
def train(dataset, params, callback):
problem_type = params["type"]
data = pd.DataFrame()
for col in params["inputColumns"]:
data[col]=dataset[col]
+
+ print(data.head())
output_column = params["columnToPredict"]
data[output_column] = dataset[output_column]
#
@@ -66,6 +136,7 @@ def train(dataset, params, callback):
data.pop(col)
#
    # Encoding
+ # https://www.analyticsvidhya.com/blog/2020/08/types-of-categorical-data-encoding/
#
encoding=params["encoding"]
if(encoding=='label'):
@@ -79,6 +150,34 @@ def train(dataset, params, callback):
if(data[col].dtype==np.object_):
category_columns.append(col)
data=pd.get_dummies(data, columns=category_columns, prefix=category_columns)
+ elif(encoding=='ordinal'):
+ encoder = OrdinalEncoder()
+ for col in data.columns:
+ if(data[col].dtype==np.object_):
+                data[col]=encoder.fit_transform(data[[col]])# OrdinalEncoder expects a 2D input
+
+ elif(encoding=='hashing'):
+ category_columns=[]
+ for col in data.columns:
+ if(data[col].dtype==np.object_):
+ category_columns.append(col)
+ encoder=ce.HashingEncoder(cols=category_columns, n_components=len(category_columns))
+        data=encoder.fit_transform(data)
+ elif(encoding=='binary'):
+ category_columns=[]
+ for col in data.columns:
+ if(data[col].dtype==np.object_):
+ category_columns.append(col)
+ encoder=ce.BinaryEncoder(cols=category_columns, return_df=True)
+        data=encoder.fit_transform(data)
+
+ elif(encoding=='baseN'):
+ category_columns=[]
+ for col in data.columns:
+ if(data[col].dtype==np.object_):
+ category_columns.append(col)
+ encoder=ce.BaseNEncoder(cols=category_columns, return_df=True, base=5)
+        data=encoder.fit_transform(data)
#
# Input - output
#
@@ -88,71 +187,271 @@ def train(dataset, params, callback):
x_columns.append(col)
x = data[x_columns].values
y = data[output_column].values
+ print(x_columns)
+ print(x)
#
    # Split into training and test sets
#
test=params["randomTestSetDistribution"]
randomOrder = params["randomOrder"]
if(randomOrder):
- random=50
+ random=123
else:
random=0
- x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=test, random_state=random)
+ x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.5,random_state=0)
#
    # Value scaling
#
+ '''
scaler=StandardScaler()
scaler.fit(x_train)
x_test=scaler.transform(x_test)
x_train=scaler.transform(x_train)
+ '''
+
#
    # Model training
#
- classifier=tf.keras.Sequential()
+ #
hidden_layer_neurons = params["hiddenLayerNeurons"]
- for func in params["hiddenLayerActivationFunctions"]:
- classifier.add(tf.keras.layers.Dense(units=hidden_layer_neurons,activation=func))
- output_func = params["outputLayerActivationFunction"]
- classifier.add(tf.keras.layers.Dense(units=1,activation=output_func))
- optimizer = params["optimizer"]
- metrics=params['metrics']
- loss_func=params["lossFunction"]
- classifier.compile(optimizer=optimizer, loss=loss_func,metrics=metrics)
- batch_size = params["batchSize"]
- epochs = params["epochs"]
- history=classifier.fit(x_train, y_train, batch_size=batch_size, epochs=epochs, callbacks=callback(x_test, y_test), validation_split=0.2) # TODO params["validationSplit"]
+
+ if(problem_type=='multi-klasifikacioni'):
+ func=params['hiddenLayerActivationFunctions']
+ output_func = params["outputLayerActivationFunction"]
+ optimizer = params["optimizer"]
+ metrics=params['metrics']
+ loss_func=params["lossFunction"]
+ batch_size = params["batchSize"]
+ epochs = params["epochs"]
+ inputDim = len(data.columns) - 1
+ '''
+ classifier=tf.keras.Sequential()
+
+ classifier.add(tf.keras.layers.Dense(units=len(data.columns),input_dim=inputDim))#input layer
+
+ for f in func:#hidden layers
+ classifier.add(tf.keras.layers.Dense(hidden_layer_neurons,activation=f))
+
+        numberofclasses=data[output_column].nunique()
+ classifier.add(tf.keras.layers.Dense(numberofclasses,activation=output_func))#output layer
+ '''
+        classifier=tf.keras.Sequential()
+        classifier.add(tf.keras.layers.Dense(1,input_dim=x_train.shape[1]))#input layer
+        classifier.add(tf.keras.layers.Dense(1, activation='sigmoid'))
+        classifier.add(tf.keras.layers.Dense(data[output_column].nunique()+1, activation='softmax'))#output layer
+ classifier.compile(optimizer=optimizer, loss=loss_func,metrics=metrics)
+
+ history=classifier.fit(x_train, y_train, batch_size=batch_size, epochs=epochs, callbacks=callback(x_test, y_test))
+ else:
+ classifier=tf.keras.Sequential()
+
+ for func in params["hiddenLayerActivationFunctions"]:
+ classifier.add(tf.keras.layers.Dense(units=hidden_layer_neurons,activation=func))
+ output_func = params["outputLayerActivationFunction"]
+
+ if(problem_type!="regresioni"):
+ classifier.add(tf.keras.layers.Dense(units=1,activation=output_func))
+ else:
+ classifier.add(tf.keras.layers.Dense(units=1))
+
+ optimizer = params["optimizer"]
+ metrics=params['metrics']
+ loss_func=params["lossFunction"]
+ classifier.compile(optimizer=optimizer, loss=loss_func,metrics=metrics)
+ batch_size = params["batchSize"]
+ epochs = params["epochs"]
+ history=classifier.fit(x_train, y_train, batch_size=batch_size, epochs=epochs, callbacks=callback(x_test, y_test), validation_split=0.2) # TODO params["validationSplit"]
#
# Test
#
model_name = params['_id']
- y_pred=classifier.predict(x_test)
+ #y_pred=classifier.predict(x_test)
if(problem_type == "regresioni"):
- classifier.evaluate(x_test, y_test)
- elif(problem_type == "binarni-klasifikacioni"):
+ y_pred=classifier.predict(x_test)
+ print(classifier.evaluate(x_test, y_test))
+ elif(problem_type == "binarni-klasifikacioni"):
+ y_pred=classifier.predict(x_test)
y_pred=(y_pred>=0.5).astype('int')
+ elif(problem_type=='multi-klasifikacioni'):
+ y_pred=classifier.predict(x_test)
+ y_pred=np.argmax(y_pred,axis=1)
+
y_pred=y_pred.flatten()
result=pd.DataFrame({"Actual":y_test,"Predicted":y_pred})
classifier.save("temp/"+model_name, save_format='h5')
+ # ROC multi-klasifikacioni
+ def roc_auc_score_multiclass(actual_class, pred_class, average = "macro"):
+
+ #creating a set of all the unique classes using the actual class list
+ unique_class = set(actual_class)
+ roc_auc_dict = {}
+ for per_class in unique_class:
+
+ #creating a list of all the classes except the current class
+ other_class = [x for x in unique_class if x != per_class]
+
+ #marking the current class as 1 and all other classes as 0
+ new_actual_class = [0 if x in other_class else 1 for x in actual_class]
+ new_pred_class = [0 if x in other_class else 1 for x in pred_class]
+
+ #using the sklearn metrics method to calculate the roc_auc_score
+ roc_auc = roc_auc_score(new_actual_class, new_pred_class, average = average)
+ roc_auc_dict[per_class] = roc_auc
+
+ return roc_auc_dict
#
    # Metrics
#
print("HELLO???")
print(result)
print("HELLO???")
- accuracy = float(sm.accuracy_score(y_test,y_pred))
- precision = float(sm.precision_score(y_test,y_pred))
- recall = float(sm.recall_score(y_test,y_pred))
- tn, fp, fn, tp = sm.confusion_matrix(y_test,y_pred).ravel()
- specificity = float(tn / (tn+fp))
- f1 = float(sm.f1_score(y_test,y_pred))
- mse = float(sm.mean_squared_error(y_test,y_pred))
- mae = float(sm.mean_absolute_error(y_test,y_pred))
- mape = float(sm.mean_absolute_percentage_error(y_test,y_pred))
- rmse = float(np.sqrt(sm.mean_squared_error(y_test,y_pred)))
- fpr, tpr, _ = sm.roc_curve(y_test,y_pred)
+ if(problem_type=="binarni-klasifikacioni"):
+ accuracy = float(sm.accuracy_score(y_test,y_pred))
+ precision = float(sm.precision_score(y_test,y_pred))
+ recall = float(sm.recall_score(y_test,y_pred))
+ tn, fp, fn, tp = sm.confusion_matrix(y_test,y_pred).ravel()
+ specificity = float(tn / (tn+fp))
+ f1 = float(sm.f1_score(y_test,y_pred))
+ fpr, tpr, _ = sm.roc_curve(y_test,y_pred)
+ logloss = float(sm.log_loss(y_test, y_pred))
+ metrics= {"accuracy" : accuracy,
+ "precision" : precision,
+ "recall" : recall,
+ "specificity" : specificity,
+ "f1" : f1,
+ "tn" : float(tn),
+ "fp" : float(fp),
+ "fn" : float(fn),
+ "tp" : float(tp),
+ "fpr" : fpr.tolist(),
+ "tpr" : tpr.tolist(),
+ "logloss" : logloss
+ }
+ elif(problem_type=="regresioni"):
+ # https://www.analyticsvidhya.com/blog/2021/05/know-the-best-evaluation-metrics-for-your-regression-model/
+ mse = float(sm.mean_squared_error(y_test,y_pred))
+ mae = float(sm.mean_absolute_error(y_test,y_pred))
+ mape = float(sm.mean_absolute_percentage_error(y_test,y_pred))
+ rmse = float(np.sqrt(sm.mean_squared_error(y_test,y_pred)))
+ rmsle = float(np.sqrt(sm.mean_squared_error(y_test, y_pred)))
+ r2 = float(sm.r2_score(y_test, y_pred))
+ # n - num of observations
+ # k - num of independent variables
+        n = len(y_test)
+        k = x_test.shape[1]
+ adj_r2 = float(1 - ((1-r2)*(n-1)/(n-k-1)))
+ metrics= {"mse" : mse,
+ "mae" : mae,
+ "mape" : mape,
+ "rmse" : rmse,
+ "rmsle" : rmsle,
+ "r2" : r2,
+ "adj_r2" : adj_r2
+ }
+ elif(problem_type=="multi-klasifikacioni"):
+
+ cr=sm.classification_report(y_test, y_pred)
+ cm=sm.confusion_matrix(y_test,y_pred)
+ # https://www.kaggle.com/code/nkitgupta/evaluation-metrics-for-multi-class-classification/notebook
+        accuracy=sm.accuracy_score(y_test, y_pred)
+        macro_averaged_precision=sm.precision_score(y_test, y_pred, average = 'macro')
+        micro_averaged_precision=sm.precision_score(y_test, y_pred, average = 'micro')
+        macro_averaged_recall=sm.recall_score(y_test, y_pred, average = 'macro')
+        micro_averaged_recall=sm.recall_score(y_test, y_pred, average = 'micro')
+        macro_averaged_f1=sm.f1_score(y_test, y_pred, average = 'macro')
+        micro_averaged_f1=sm.f1_score(y_test, y_pred, average = 'micro')
+ roc_auc_dict=roc_auc_score_multiclass(y_test, y_pred)
+
+
# TODO upload trenirani model nazad na backend
- return TrainingResult(accuracy, precision, recall, float(tn), float(fp), float(fn), float(tp), specificity, f1, mse, mae, mape, rmse, fpr.tolist(), tpr.tolist())
+ #return TrainingResult(metrics)
+def manageH5(datain,params,h5model):
+ dataset=datain.copy()
+ problem_type = params["type"]
+ data = pd.DataFrame()
+ for col in params["inputColumns"]:
+ data[col]=dataset[col]
+ output_column = params["columnToPredict"]
+ data[output_column] = dataset[output_column]
+ #
+    # Drop null columns / rows, or replace values
+ #nullreplace=[
+ # {"column":"Embarked","value":"C","deleteRow":false,"deleteCol":true},
+ # {"column": "Cabin","value":"C123","deleteRow":"0","deleteCol":"0"}]
+
+ null_value_options = params["nullValues"]
+ null_values_replacers = params["nullValuesReplacers"]
+
+ if(null_value_options=='replace'):
+ print("replace null") # TODO
+ elif(null_value_options=='delete_rows'):
+ data=data.dropna()
+    elif(null_value_options=='delete_columns'):
+        data=data.dropna(axis=1)
+ #
+ #print(data.isnull().any())
+ #
+    # Drop columns that do not affect the result
+ #
+ num_rows=data.shape[0]
+ for col in data.columns:
+ if((data[col].nunique()==(num_rows)) and (data[col].dtype==np.object_)):
+ data.pop(col)
+ #
+    # Encoding
+ # https://www.analyticsvidhya.com/blog/2020/08/types-of-categorical-data-encoding/
+ #
+ encoding=params["encoding"]
+ if(encoding=='label'):
+ encoder=LabelEncoder()
+ for col in data.columns:
+ if(data[col].dtype==np.object_):
+ data[col]=encoder.fit_transform(data[col])
+ elif(encoding=='onehot'):
+ category_columns=[]
+ for col in data.columns:
+ if(data[col].dtype==np.object_):
+ category_columns.append(col)
+ data=pd.get_dummies(data, columns=category_columns, prefix=category_columns)
+ elif(encoding=='ordinal'):
+ encoder = OrdinalEncoder()
+ for col in data.columns:
+ if(data[col].dtype==np.object_):
+                data[col]=encoder.fit_transform(data[[col]])# OrdinalEncoder expects a 2D input
+ elif(encoding=='hashing'):
+ category_columns=[]
+ for col in data.columns:
+ if(data[col].dtype==np.object_):
+ category_columns.append(col)
+ encoder=ce.HashingEncoder(cols=category_columns, n_components=len(category_columns))
+        data=encoder.fit_transform(data)
+ elif(encoding=='binary'):
+ category_columns=[]
+ for col in data.columns:
+ if(data[col].dtype==np.object_):
+ category_columns.append(col)
+ encoder=ce.BinaryEncoder(cols=category_columns, return_df=True)
+        data=encoder.fit_transform(data)
+ elif(encoding=='baseN'):
+ category_columns=[]
+ for col in data.columns:
+ if(data[col].dtype==np.object_):
+ category_columns.append(col)
+ encoder=ce.BaseNEncoder(cols=category_columns, return_df=True, base=5)
+        data=encoder.fit_transform(data)
+ #
+ # Input - output
+ #
+ x_columns = []
+ for col in data.columns:
+ if(col!=output_column):
+ x_columns.append(col)
+ x = data[x_columns].values
+ y = data[output_column].values
+
+
+    y_pred=h5model.predict_classes(x)
\ No newline at end of file
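roc_auc_score_multiclass above reduces a multiclass problem to one binary ROC AUC per class: the current class is relabeled 1, every other class 0. A self-contained sketch of that reduction (labels invented for illustration):

    from sklearn.metrics import roc_auc_score

    actual = [0, 1, 2, 2, 1, 0]
    predicted = [0, 2, 2, 1, 1, 0]
    for cls in set(actual):
        y_true = [1 if a == cls else 0 for a in actual]
        y_hat = [1 if p == cls else 0 for p in predicted]
        print(cls, roc_auc_score(y_true, y_hat))  # one-vs-rest AUC for this class
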
diff --git a/backend/microservice/api/newmlservice.py b/backend/microservice/api/newmlservice.py
new file mode 100644
index 00000000..50af15f8
--- /dev/null
+++ b/backend/microservice/api/newmlservice.py
@@ -0,0 +1,424 @@
+from enum import unique
+from itertools import count
+import pandas as pd
+from sklearn import datasets, multiclass
+import tensorflow as tf
+import keras
+import numpy as np
+import csv
+import json
+import h5py
+import sklearn.metrics as sm
+from statistics import mode
+from typing_extensions import Self
+from copyreg import constructor
+from flask import request, jsonify, render_template
+from sklearn.preprocessing import LabelEncoder, MinMaxScaler
+from sklearn.preprocessing import OrdinalEncoder
+import category_encoders as ce
+from sklearn.preprocessing import StandardScaler
+from sklearn.model_selection import train_test_split
+from dataclasses import dataclass
+import statistics as s
+from sklearn.metrics import roc_auc_score
+from ann_visualizer.visualize import ann_viz
+def returnColumnsInfo(dataset):
+ dict=[]
+ datafront=dataset.copy()
+ svekolone=datafront.columns
+ kategorijskekolone=datafront.select_dtypes(include=['object']).columns
+ allNullCols=0
+ for kolona in svekolone:
+ if(kolona in kategorijskekolone):
+ uniquevalues=datafront[kolona].unique()
+ mean=0
+ median=0
+ nullCount=datafront[kolona].isnull().sum()
+ if(nullCount>0):
+ allNullCols=allNullCols+1
+ frontreturn={'columnName':kolona,
+ 'isNumber':False,
+ 'uniqueValues':uniquevalues.tolist(),
+                'mean':float(mean),
+                'median':float(median),
+ 'numNulls':float(nullCount)
+ }
+ dict.append(frontreturn)
+ else:
+ mean=datafront[kolona].mean()
+            median=datafront[kolona].median()
+ nullCount=datafront[kolona].isnull().sum()
+ if(nullCount>0):
+ allNullCols=allNullCols+1
+ frontreturn={'columnName':kolona,
+                'isNumber':True,
+ 'uniqueValues':[],
+ 'mean':float(mean),
+ 'median':float(median),
+ 'numNulls':float(nullCount)
+ }
+ dict.append(frontreturn)
+ NullRows = datafront[datafront.isnull().any(axis=1)]
+ #print(NullRows)
+ #print(len(NullRows))
+ allNullRows=len(NullRows)
+
+ return {'columnInfo':dict,'allNullColl':allNullCols,'allNullRows':allNullRows}
+
+@dataclass
+class TrainingResultClassification:
+ accuracy: float
+ precision: float
+ recall: float
+ tn: float
+ fp: float
+ fn: float
+ tp: float
+ specificity: float
+ f1: float
+ logloss: float
+ fpr: float
+ tpr: float
+ metrics: dict
+'''
+@dataclass
+class TrainingResultRegression:
+ mse: float
+ mae: float
+ mape: float
+ rmse: float
+
+@dataclass
+class TrainingResult:
+ metrics: dict
+'''
+
+def train(dataset, params, callback):
+ problem_type = params["type"]
+ print(problem_type)
+ data = pd.DataFrame()
+ print(data)
+ for col in params["inputColumns"]:
+ print(col)
+ data[col]=dataset[col]
+ output_column = params["columnToPredict"]
+ data[output_column] = dataset[output_column]
+ print(data)
+
+ ###NULL
+ null_value_options = params["nullValues"]
+ null_values_replacers = params["nullValuesReplacers"]
+
+ if(null_value_options=='replace'):
+ print("replace null") # TODO
+ elif(null_value_options=='delete_rows'):
+ data=data.dropna()
+    elif(null_value_options=='delete_columns'):
+        data=data.dropna(axis=1)
+ print(data.shape)
+
+ #
+    # Drop columns that do not affect the result
+ #
+ num_rows=data.shape[0]
+ for col in data.columns:
+ if((data[col].nunique()==(num_rows)) and (data[col].dtype==np.object_)):
+ data.pop(col)
+ #
+    ### Encoding
+ encoding=params["encoding"]
+ if(encoding=='label'):
+ encoder=LabelEncoder()
+ for col in data.columns:
+ if(data[col].dtype==np.object_):
+ data[col]=encoder.fit_transform(data[col])
+
+
+ elif(encoding=='onehot'):
+ category_columns=[]
+ for col in data.columns:
+ if(data[col].dtype==np.object_):
+ category_columns.append(col)
+ data=pd.get_dummies(data, columns=category_columns, prefix=category_columns)
+
+ elif(encoding=='ordinal'):
+ encoder = OrdinalEncoder()
+ for col in data.columns:
+ if(data[col].dtype==np.object_):
+                data[col]=encoder.fit_transform(data[[col]])# OrdinalEncoder expects a 2D input
+
+ elif(encoding=='hashing'):
+ category_columns=[]
+ for col in data.columns:
+ if(data[col].dtype==np.object_):
+ category_columns.append(col)
+ encoder=ce.HashingEncoder(cols=category_columns, n_components=len(category_columns))
+        data=encoder.fit_transform(data)
+ elif(encoding=='binary'):
+ category_columns=[]
+ for col in data.columns:
+ if(data[col].dtype==np.object_):
+ category_columns.append(col)
+ encoder=ce.BinaryEncoder(cols=category_columns, return_df=True)
+        data=encoder.fit_transform(data)
+
+ elif(encoding=='baseN'):
+ category_columns=[]
+ for col in data.columns:
+ if(data[col].dtype==np.object_):
+ category_columns.append(col)
+ encoder=ce.BaseNEncoder(cols=category_columns, return_df=True, base=5)
+        data=encoder.fit_transform(data)
+ #
+ # Input - output
+ #
+ x_columns = []
+ for col in data.columns:
+ if(col!=output_column):
+ x_columns.append(col)
+ print(x_columns)
+ x = data[x_columns].values
+ y = data[output_column].values
+
+ #
+    # Split into training and test sets
+ #
+ test=params["randomTestSetDistribution"]
+ randomOrder = params["randomOrder"]
+ if(randomOrder):
+ random=123
+ else:
+ random=0
+ x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=test, random_state=random)
+ print(x_train,x_test)
+
+ #
+    # Model training
+ #
+ #
+ if(problem_type=='multi-klasifikacioni'):
+ #print('multi')
+ classifier=tf.keras.Sequential()
+
+        classifier.add(tf.keras.layers.Dense(units=params['hiddenLayerNeurons'], activation=params['hiddenLayerActivationFunctions'][0],input_dim=x_train.shape[1]))# first hidden layer + input_dim ties it to the input layer
+        for i in range(params['hiddenLayers']-1):# if there is more than one hidden layer
+            #print(i)
+            classifier.add(tf.keras.layers.Dense(units=params['hiddenLayerNeurons'], activation=params['hiddenLayerActivationFunctions'][i+1]))# i-th hidden layer
+        classifier.add(tf.keras.layers.Dense(units=len(np.unique(y)), activation=params['outputLayerActivationFunction']))# output layer, one unit per class
+
+ classifier.compile(loss =params["lossFunction"] , optimizer = params['optimizer'] , metrics =params['metrics'])
+
+ history=classifier.fit(x_train, y_train, epochs = params['epochs'],batch_size=params['batchSize'])
+
+ y_pred=classifier.predict(x_test)
+ y_pred=np.argmax(y_pred,axis=1)
+ #print(y_pred.flatten())
+ #print(y_test)
+ scores = classifier.evaluate(x_test, y_test)
+ print("\n%s: %.2f%%" % (classifier.metrics_names[1], scores[1]*100))
+ classifier.save("temp/"+params['name'], save_format='h5')
+        # in-Python visualization
+ #from ann_visualizer.visualize import ann_viz;
+ #ann_viz(classifier, title="My neural network")
+
+ elif(problem_type=='binarni-klasifikacioni'):
+ #print('*************************************************************************binarni')
+ classifier=tf.keras.Sequential()
+
+        classifier.add(tf.keras.layers.Dense(units=params['hiddenLayerNeurons'], activation=params['hiddenLayerActivationFunctions'][0],input_dim=x_train.shape[1]))# first hidden layer + input_dim ties it to the input layer
+        for i in range(params['hiddenLayers']-1):# if there is more than one hidden layer
+            #print(i)
+            classifier.add(tf.keras.layers.Dense(units=params['hiddenLayerNeurons'], activation=params['hiddenLayerActivationFunctions'][i+1]))# i-th hidden layer
+        classifier.add(tf.keras.layers.Dense(units=1, activation=params['outputLayerActivationFunction']))# output layer
+
+ classifier.compile(loss =params["lossFunction"] , optimizer = params['optimizer'] , metrics =params['metrics'])
+
+ history=classifier.fit(x_train, y_train, epochs = params['epochs'],batch_size=params['batchSize'])
+
+ y_pred=classifier.predict(x_test)
+ y_pred=(y_pred>=0.5).astype('int')
+
+ print(y_pred.flatten())
+ print(y_test)
+
+ scores = classifier.evaluate(x_test, y_test)
+ print("\n%s: %.2f%%" % (classifier.metrics_names[1], scores[1]*100))
+ #ann_viz(classifier, title="My neural network")
+
+ classifier.save("temp/"+params['name'], save_format='h5')
+
+ elif(problem_type=='regresioni'):
+ classifier=tf.keras.Sequential()
+
+        classifier.add(tf.keras.layers.Dense(units=params['hiddenLayerNeurons'], activation=params['hiddenLayerActivationFunctions'][0],input_dim=x_train.shape[1]))# first hidden layer + input_dim ties it to the input layer
+        for i in range(params['hiddenLayers']-1):# if there is more than one hidden layer
+            #print(i)
+            classifier.add(tf.keras.layers.Dense(units=params['hiddenLayerNeurons'], activation=params['hiddenLayerActivationFunctions'][i+1]))# i-th hidden layer
+ classifier.add(tf.keras.layers.Dense(units=1))
+
+ classifier.compile(loss =params["lossFunction"] , optimizer = params['optimizer'] , metrics =params['metrics'])
+
+ history=classifier.fit(x_train, y_train, epochs = params['epochs'],batch_size=params['batchSize'])
+ y_pred=classifier.predict(x_test)
+ print(classifier.evaluate(x_test, y_test))
+
+ def roc_auc_score_multiclass(actual_class, pred_class, average = "macro"):
+
+ #creating a set of all the unique classes using the actual class list
+ unique_class = set(actual_class)
+ roc_auc_dict = {}
+ for per_class in unique_class:
+
+ #creating a list of all the classes except the current class
+ other_class = [x for x in unique_class if x != per_class]
+
+ #marking the current class as 1 and all other classes as 0
+ new_actual_class = [0 if x in other_class else 1 for x in actual_class]
+ new_pred_class = [0 if x in other_class else 1 for x in pred_class]
+
+ #using the sklearn metrics method to calculate the roc_auc_score
+ roc_auc = roc_auc_score(new_actual_class, new_pred_class, average = average)
+ roc_auc_dict[per_class] = roc_auc
+
+ return roc_auc_dict
+ #
+    # Metrics
+ #
+
+ if(problem_type=="binarni-klasifikacioni"):
+ accuracy = float(sm.accuracy_score(y_test,y_pred))
+ precision = float(sm.precision_score(y_test,y_pred))
+ recall = float(sm.recall_score(y_test,y_pred))
+ tn, fp, fn, tp = sm.confusion_matrix(y_test,y_pred).ravel()
+ specificity = float(tn / (tn+fp))
+ f1 = float(sm.f1_score(y_test,y_pred))
+ fpr, tpr, _ = sm.roc_curve(y_test,y_pred)
+ logloss = float(sm.log_loss(y_test, y_pred))
+ metrics= {"accuracy" : accuracy,
+ "precision" : precision,
+ "recall" : recall,
+ "specificity" : specificity,
+ "f1" : f1,
+ "tn" : float(tn),
+ "fp" : float(fp),
+ "fn" : float(fn),
+ "tp" : float(tp),
+ "fpr" : fpr.tolist(),
+ "tpr" : tpr.tolist(),
+ "logloss" : logloss
+ }
+ elif(problem_type=="regresioni"):
+ # https://www.analyticsvidhya.com/blog/2021/05/know-the-best-evaluation-metrics-for-your-regression-model/
+ mse = float(sm.mean_squared_error(y_test,y_pred))
+ mae = float(sm.mean_absolute_error(y_test,y_pred))
+ mape = float(sm.mean_absolute_percentage_error(y_test,y_pred))
+ rmse = float(np.sqrt(sm.mean_squared_error(y_test,y_pred)))
+ rmsle = float(np.sqrt(sm.mean_squared_error(y_test, y_pred)))
+ r2 = float(sm.r2_score(y_test, y_pred))
+ # n - num of observations
+ # k - num of independent variables
+        n = len(y_test)
+        k = x_test.shape[1]
+ adj_r2 = float(1 - ((1-r2)*(n-1)/(n-k-1)))
+ metrics= {"mse" : mse,
+ "mae" : mae,
+ "mape" : mape,
+ "rmse" : rmse,
+ "rmsle" : rmsle,
+ "r2" : r2,
+ "adj_r2" : adj_r2
+ }
+ '''
+ elif(problem_type=="multi-klasifikacioni"):
+
+ cr=sm.classification_report(y_test, y_pred)
+ cm=sm.confusion_matrix(y_test,y_pred)
+ # https://www.kaggle.com/code/nkitgupta/evaluation-metrics-for-multi-class-classification/notebook
+    accuracy=sm.accuracy_score(y_test, y_pred)
+    macro_averaged_precision=sm.precision_score(y_test, y_pred, average = 'macro')
+    micro_averaged_precision=sm.precision_score(y_test, y_pred, average = 'micro')
+    macro_averaged_recall=sm.recall_score(y_test, y_pred, average = 'macro')
+    micro_averaged_recall=sm.recall_score(y_test, y_pred, average = 'micro')
+    macro_averaged_f1=sm.f1_score(y_test, y_pred, average = 'macro')
+    micro_averaged_f1=sm.f1_score(y_test, y_pred, average = 'micro')
+ roc_auc_dict=roc_auc_score_multiclass(y_test, y_pred)
+ '''
+
+def manageH5(dataset,params,h5model):
+ problem_type = params["type"]
+ print(problem_type)
+ data = pd.DataFrame()
+ #print(data)
+ for col in params["inputColumns"]:
+ print(col)
+ data[col]=dataset[col]
+ output_column = params["columnToPredict"]
+ data[output_column] = dataset[output_column]
+ #print(data)
+
+ ###NULL
+ null_value_options = params["nullValues"]
+ null_values_replacers = params["nullValuesReplacers"]
+
+ if(null_value_options=='replace'):
+ print("replace null") # TODO
+ elif(null_value_options=='delete_rows'):
+ data=data.dropna()
+    elif(null_value_options=='delete_columns'):
+        data=data.dropna(axis=1)
+ print(data.shape)
+
+ #
+    # Drop columns that do not affect the result
+ #
+ num_rows=data.shape[0]
+ for col in data.columns:
+ if((data[col].nunique()==(num_rows)) and (data[col].dtype==np.object_)):
+ data.pop(col)
+ #
+    ### Encoding
+ encoding=params["encoding"]
+ if(encoding=='label'):
+ encoder=LabelEncoder()
+ for col in data.columns:
+ if(data[col].dtype==np.object_):
+ data[col]=encoder.fit_transform(data[col])
+
+
+ elif(encoding=='onehot'):
+ category_columns=[]
+ for col in data.columns:
+ if(data[col].dtype==np.object_):
+ category_columns.append(col)
+ data=pd.get_dummies(data, columns=category_columns, prefix=category_columns)
+ #print(data)
+
+ #
+ # Input - output
+ #
+ x_columns = []
+ for col in data.columns:
+ if(col!=output_column):
+ x_columns.append(col)
+ #print(x_columns)
+ x2 = data[x_columns]
+ print(x2)
+ print(x2.values)
+ x2 = data[x_columns].values
+ print(x2)
+ y2 = data[output_column].values
+ h5model.summary()
+ ann_viz(h5model, title="My neural network")
+
+ h5model.compile(loss=params['lossFunction'], optimizer=params['optimizer'], metrics=params['metrics'])
+
+ history=h5model.fit(x2, y2, epochs = params['epochs'],batch_size=params['batchSize'])
+
+ y_pred2=h5model.predict(x2)
+
+ y_pred2=np.argmax(y_pred2,axis=1)
+ #y_pred=h5model.predict_classes(x)
+    score = h5model.evaluate(x2, y2, verbose=0)# evaluate against the true labels, not the predictions
+ print("%s: %.2f%%" % (h5model.metrics_names[1], score[1]*100))
+ print(y_pred2)
+    print('done')
\ No newline at end of file
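All three training branches in train() above assemble the network the same way: a first Dense layer that fixes input_dim, one Dense layer per remaining entry in hiddenLayerActivationFunctions, then a problem-specific output layer. Factored into one sketch (build_network is a hypothetical helper; the parameter names are the ones used above):

    import tensorflow as tf

    def build_network(params, input_dim, output_units, output_activation=None):
        # Mirrors the layer-stacking loops in train().
        model = tf.keras.Sequential()
        funcs = params["hiddenLayerActivationFunctions"]
        model.add(tf.keras.layers.Dense(units=params["hiddenLayerNeurons"],
                                        activation=funcs[0], input_dim=input_dim))
        for i in range(params["hiddenLayers"] - 1):
            model.add(tf.keras.layers.Dense(units=params["hiddenLayerNeurons"],
                                            activation=funcs[i + 1]))
        model.add(tf.keras.layers.Dense(units=output_units, activation=output_activation))
        model.compile(loss=params["lossFunction"], optimizer=params["optimizer"],
                      metrics=params["metrics"])
        return model
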
diff --git a/backend/microservice/mlservice.py b/backend/microservice/mlservice.py
index b2eafe9a..8f56fc3f 100644
--- a/backend/microservice/mlservice.py
+++ b/backend/microservice/mlservice.py
@@ -54,6 +54,38 @@ def obuka(dataunos,params,modelunos,dataunosdrugog):
data[zeljenekolone[i]]=dataunos[zeljenekolone[i]]
#print(data.head(10))
+    ### 0.1) Statistics returned to the frontend (for filling in null values of selected columns) PART4
+ datafront=data.copy()
+ svekolone=datafront.columns
+ kategorijskekolone=datafront.select_dtypes(include=['object']).columns
+ #print(kategorijskekolone )
+ #kategorijskekolone=datacategorical.columns
+ #print(svekolone)
+ for i in range(len(svekolone)):
+ nazivkolone=svekolone[i]
+ if(nazivkolone in kategorijskekolone):
+ svekategorije=datafront[nazivkolone].unique()
+ medijana=None
+ srednjavrednost=None
+ frontreturn={'colName':nazivkolone,
+ 'colType':'categorical',
+ 'categoricalValues':svekategorije,
+ 'mean':medijana,
+ 'average':srednjavrednost
+ }
+ else:
+ svekategorije=None
+ medijana=datafront[nazivkolone].mean()
+ srednjavrednost=sum(datafront[nazivkolone])/len(datafront[nazivkolone])
+ frontreturn={'colName':nazivkolone,
+ 'colType':'noncategorical',
+ 'categoricalValues':svekategorije,
+ 'mean':medijana,
+ 'average':srednjavrednost
+ }
+
+ print(frontreturn)
+
    #predvidetikol=input("ENTER THE NAME OF THE COLUMN WHOSE VALUE SHOULD BE PREDICTED ")
    ###what gets stored among the variables: the column number or the column name???
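For a categorical column the loop above emits a record with the unique values and None for the numeric statistics; an illustrative record (column name and values invented):

    example = {
        "colName": "Embarked",
        "colType": "categorical",
        "categoricalValues": ["S", "C", "Q"],  # datafront[col].unique() actually yields a numpy array
        "mean": None,       # numeric statistics stay None for categorical columns
        "average": None,
    }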