Diffstat (limited to 'backend')
 backend/api/api/Controllers/FileController.cs              | 103
 backend/api/api/Controllers/PredictorController.cs         |  10
 backend/api/api/Models/FileModel.cs                        |   1
 backend/api/api/Models/PredictorColumns.cs                 |   8
 backend/api/api/Services/PredictorService.cs               |   2
 backend/api/api/Services/UserService.cs                    |   2
 backend/microservice/__pycache__/mlservice.cpython-310.pyc | bin 5009 -> 7405 bytes
 backend/microservice/api.py                                |   2
 backend/microservice/api/controller.py                     |  22
 backend/microservice/api/ml_service.py                     | 363
 backend/microservice/api/newmlservice.py                   | 424
 backend/microservice/mlservice.py                          |  32
 12 files changed, 924 insertions(+), 45 deletions(-)
diff --git a/backend/api/api/Controllers/FileController.cs b/backend/api/api/Controllers/FileController.cs
index 0fe8415b..d29c5676 100644
--- a/backend/api/api/Controllers/FileController.cs
+++ b/backend/api/api/Controllers/FileController.cs
@@ -4,6 +4,7 @@ using api.Services;
 using Microsoft.AspNetCore.Authorization;
 using Microsoft.AspNetCore.Mvc;
 using Microsoft.Net.Http.Headers;
+
 namespace api.Controllers
 {
     [Route("api/[controller]")]
@@ -11,6 +12,7 @@ namespace api.Controllers
     public class FileController : ControllerBase
     {
         private string[] permittedExtensions = { ".csv" };
+        private string[] permittedExtensionsH5 = { ".h5" }; // an array so that further extensions (e.g. .h4) can be added later
         private readonly IConfiguration _configuration;
         private IJwtToken _token;
         private IFileService _fileservice;
@@ -22,6 +24,77 @@ namespace api.Controllers
         }
 
+        [HttpPost("h5")]
+        [Authorize(Roles = "User,Guest")]
+        public async Task<ActionResult<string>> H5Upload([FromForm] IFormFile file)
+        {
+            // Get the username from the JWT token
+            string uploaderId;
+            string folderName;
+            var header = Request.Headers[HeaderNames.Authorization];
+            if (AuthenticationHeaderValue.TryParse(header, out var headerValue))
+            {
+                var scheme = headerValue.Scheme;
+                var parameter = headerValue.Parameter;
+                uploaderId = _token.TokenToId(parameter);
+                if (uploaderId == null)
+                    return Unauthorized();
+            }
+            else
+                return BadRequest();
+            if (uploaderId == "")
+            {
+                folderName = "TempFiles";
+            }
+            else
+            {
+                folderName = "UploadedFiles";
+            }
+
+            // Check the file type
+            var filename = file.FileName;
+            var ext = Path.GetExtension(filename).ToLowerInvariant();
+            var name = Path.GetFileNameWithoutExtension(filename).ToLowerInvariant();
+            if (string.IsNullOrEmpty(ext) || !permittedExtensionsH5.Contains(ext))
+            {
+                return BadRequest("Wrong file type");
+            }
+            var folderPath = Path.Combine(Directory.GetCurrentDirectory(), folderName, uploaderId);
+            // Create the directory if it does not exist yet
+            if (!Directory.Exists(folderPath))
+            {
+                Directory.CreateDirectory(folderPath);
+            }
+            // Index the file if one with the same name already exists
+            var fullPath = Path.Combine(folderPath, filename);
+            int i = 0;
+            while (System.IO.File.Exists(fullPath))
+            {
+                i++;
+                fullPath = Path.Combine(folderPath, name + i.ToString() + ext);
+            }
+
+            // Write the file
+            using (var stream = new FileStream(fullPath, FileMode.Create))
+            {
+                await file.CopyToAsync(stream);
+            }
+            FileModel fileModel = new FileModel();
+            fileModel.type = "h5";
+            fileModel.path = fullPath;
+            fileModel.uploaderId = uploaderId;
+            fileModel.date = DateTime.Now.ToUniversalTime();
+            fileModel = _fileservice.Create(fileModel);
+
+            return Ok(fileModel);
+        }
 
         [HttpPost("Csv")]
         [Authorize(Roles = "User,Guest")]
@@ -81,6 +154,7 @@ namespace api.Controllers
             await file.CopyToAsync(stream);
         }
         FileModel fileModel = new FileModel();
+        fileModel.type = "csv";
         fileModel.path = fullPath;
         fileModel.uploaderId = uploaderId;
         fileModel.date = DateTime.Now.ToUniversalTime();
@@ -90,6 +164,35 @@ namespace api.Controllers
 
         return Ok(fileModel);
     }
+
+        // The generic Download endpoint could probably be used for this as well
+        [HttpGet("downloadh5")]
+        [Authorize(Roles = "User,Guest")]
+        public async Task<ActionResult> DownloadH5(string id)
+        {
+            // Get the username
+            string uploaderId;
+            var header = Request.Headers[HeaderNames.Authorization];
+            if (AuthenticationHeaderValue.TryParse(header, out var headerValue))
+            {
+                var scheme = headerValue.Scheme;
+                var parameter = headerValue.Parameter;
+                uploaderId = _token.TokenToId(parameter);
+                if (uploaderId == null)
+                    return Unauthorized();
+            }
+            else
+                return BadRequest();
+
+            string filePath = _fileservice.GetFilePath(id, uploaderId);
+            if (filePath == null)
+                return BadRequest();
+
+            return File(System.IO.File.ReadAllBytes(filePath), "application/octet-stream", Path.GetFileName(filePath));
+        }
+
     [HttpGet("Download")]
     [Authorize(Roles = "User,Guest")]
     public async Task<ActionResult> DownloadFile(string id)
diff --git a/backend/api/api/Controllers/PredictorController.cs b/backend/api/api/Controllers/PredictorController.cs
index cdc14632..161271e2 100644
--- a/backend/api/api/Controllers/PredictorController.cs
+++ b/backend/api/api/Controllers/PredictorController.cs
@@ -77,7 +77,7 @@ namespace api.Controllers
     // GET api/<PredictorController>/getpredictor/{name}
     [HttpGet("getpredictor/{id}")]
-    [Authorize(Roles = "User")]
+    [Authorize(Roles = "User,Guest")]
     public ActionResult<Predictor> GetPredictor(string id)
     {
         string username;
@@ -188,8 +188,8 @@ namespace api.Controllers
     // POST api/<PredictorController>/usepredictor {predictor,inputs}
     [HttpPost("usepredictor/{id}")]
-    [Authorize(Roles = "User")]
-    public ActionResult UsePredictor(String id, [FromBody] String[] inputs)
+    [Authorize(Roles = "User,Guest")]
+    public ActionResult UsePredictor(String id, [FromBody] PredictorColumns[] inputs)
     {
         string username;
@@ -207,8 +207,8 @@ namespace api.Controllers
         Predictor predictor = _predictorService.GetPredictor(username, id);
 
-        foreach (String i in inputs)
-            Debug.WriteLine(i);
+        foreach (PredictorColumns i in inputs)
+            Debug.WriteLine(i.value.ToString());
 
         return NoContent();
     }
diff --git a/backend/api/api/Models/FileModel.cs b/backend/api/api/Models/FileModel.cs
index 1043309d..47b12110 100644
--- a/backend/api/api/Models/FileModel.cs
+++ b/backend/api/api/Models/FileModel.cs
@@ -8,6 +8,7 @@ namespace api.Models
     [BsonId]
     [BsonRepresentation(BsonType.ObjectId)]
     public string _id { get; set; }
+    public string type { get; set; }
     public string uploaderId { get; set; }
     public string path { get; set; }
     [BsonDateTimeOptions(Kind = DateTimeKind.Utc)]
diff --git a/backend/api/api/Models/PredictorColumns.cs b/backend/api/api/Models/PredictorColumns.cs
new file mode 100644
index 00000000..82f3e979
--- /dev/null
+++ b/backend/api/api/Models/PredictorColumns.cs
@@ -0,0 +1,8 @@
+namespace api.Models
+{
+    public class PredictorColumns
+    {
+        public String name { get; set; }
+        public String value { get; set; }
+    }
+}
diff --git a/backend/api/api/Services/PredictorService.cs b/backend/api/api/Services/PredictorService.cs
index 01bc8359..b15255ac 100644
--- a/backend/api/api/Services/PredictorService.cs
+++ b/backend/api/api/Services/PredictorService.cs
@@ -42,7 +42,7 @@ namespace api.Services
     }
     public Predictor GetPredictor(string username, string id)
     {
-        return _predictor.Find(predictor => predictor.username == username && predictor._id == id).FirstOrDefault();
+        return _predictor.Find(predictor => predictor._id == id && (predictor.username == username || predictor.isPublic == true)).FirstOrDefault();
     }
 
     // last private models
diff --git a/backend/api/api/Services/UserService.cs b/backend/api/api/Services/UserService.cs
index 7ec6f4b2..7fc4bdb1 100644
--- a/backend/api/api/Services/UserService.cs
+++ b/backend/api/api/Services/UserService.cs
@@ -50,7 +50,7 @@ namespace api.Services
     // a username that already exists in the database
     using (var session = _client.StartSession())
     {
-
+        if (username != user.Username)
         if (_users.Find(u => u.Username == user.Username).FirstOrDefault() != null)
         {
             return false;
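With UsePredictor now binding its body to PredictorColumns[], a request carries a JSON array of name/value pairs instead of a bare string array. A minimal sketch of such a call; the host, route prefix, predictor id, token, and column names below are placeholders, not values taken from this change:

    # Hypothetical call to the reworked usepredictor endpoint. The body is a JSON
    # array of {name, value} objects matching the new PredictorColumns model.
    import requests

    resp = requests.post(
        "https://localhost:5001/api/Predictor/usepredictor/<predictor-id>",
        headers={"Authorization": "Bearer <jwt>"},
        json=[{"name": "Age", "value": "29"}, {"name": "Sex", "value": "female"}],
        verify=False,  # local development certificate
    )
    print(resp.status_code)  # the action currently just logs the values and returns 204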
diff --git a/backend/microservice/__pycache__/mlservice.cpython-310.pyc b/backend/microservice/__pycache__/mlservice.cpython-310.pyc
index c079459a..ac93f3db 100644
Binary files differ
diff --git a/backend/microservice/api.py b/backend/microservice/api.py
index 4768f34c..9a28b159 100644
--- a/backend/microservice/api.py
+++ b/backend/microservice/api.py
@@ -9,7 +9,7 @@ import csv
 import json
 import mlservice
 import h5py
-from mlservice2 import unositok
+from mlservice import unositok
 
 app = flask.Flask(__name__)
diff --git a/backend/microservice/api/controller.py b/backend/microservice/api/controller.py
index 059af317..1b17f727 100644
--- a/backend/microservice/api/controller.py
+++ b/backend/microservice/api/controller.py
@@ -1,7 +1,7 @@
 import flask
 from flask import request, jsonify
 import ml_socket
-import ml_service
+import newmlservice
 import tensorflow as tf
 import pandas as pd
 
@@ -25,7 +25,7 @@ def train():
     f = request.json["dataset"]
     dataset = pd.read_csv(f)
     #
-    result = ml_service.train(dataset, request.json["model"], train_callback)
+    result = newmlservice.train(dataset, request.json["model"], train_callback)
     print(result)
     return jsonify(result)
 
@@ -34,10 +34,22 @@ def predict():
     f = request.json['filepath']
     dataset = pd.read_csv(f)
     m = request.json['modelpath']
-    #model = tf.keras.models.load_model(m)
-    #
-    #model.predict?
+    model = tf.keras.models.load_model(m)
+    print("********************************model loaded*******************************")
+    newmlservice.manageH5(dataset, request.json['model'], model)
+    return "done"
+
+@app.route('/preprocess', methods=['POST'])
+def returnColumnsInfo():
+    f = request.json['filepathcolinfo']
+    dataset = pd.read_csv(f)
+
+    result = newmlservice.returnColumnsInfo(dataset)
+
+    return jsonify(result)
+
 
 print("App loaded.")
 ml_socket.start()
 app.run()
\ No newline at end of file
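The new /preprocess route wires returnColumnsInfo up to the frontend. A minimal smoke test, assuming the Flask app above is running on its default port and that the path points at a CSV file readable by the microservice (both assumptions, not part of the diff):

    # Sketch: exercise the new /preprocess route. The JSON key and response
    # fields come from the code above; host, port and CSV path are made up.
    import requests

    resp = requests.post(
        "http://127.0.0.1:5000/preprocess",
        json={"filepathcolinfo": "data/example.csv"},
    )
    info = resp.json()
    print(info["allNullRows"], info["allNullColl"])
    for col in info["columnInfo"]:
        print(col["columnName"], col["isNumber"], col["numNulls"])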
diff --git a/backend/microservice/api/ml_service.py b/backend/microservice/api/ml_service.py
index ea562212..0aed3dc9 100644
--- a/backend/microservice/api/ml_service.py
+++ b/backend/microservice/api/ml_service.py
@@ -1,4 +1,8 @@
+from cmath import nan
+from enum import unique
+from itertools import count
 import pandas as pd
+from sklearn import datasets
 import tensorflow as tf
 import keras
 import numpy as np
@@ -11,12 +15,67 @@ from typing_extensions import Self
 from copyreg import constructor
 from flask import request, jsonify, render_template
 from sklearn.preprocessing import LabelEncoder
+from sklearn.preprocessing import OrdinalEncoder
+import category_encoders as ce
 from sklearn.preprocessing import StandardScaler
 from sklearn.model_selection import train_test_split
 from dataclasses import dataclass
+import statistics as s
+from sklearn.metrics import roc_auc_score
+
+def returnColumnsInfo(dataset):
+    dict=[]
+    datafront=dataset.copy()
+    svekolone=datafront.columns
+    kategorijskekolone=datafront.select_dtypes(include=['object']).columns
+    allNullCols=0
+    for kolona in svekolone:
+        if(kolona in kategorijskekolone):
+            uniquevalues=datafront[kolona].unique()
+            mean=0
+            median=0
+            colmin=0
+            colmax=0
+            nullCount=datafront[kolona].isnull().sum()
+            if(nullCount>0):
+                allNullCols=allNullCols+1
+            frontreturn={'columnName':kolona,
+                        'isNumber':False,
+                        'uniqueValues':uniquevalues.tolist(),
+                        'mean':float(mean),
+                        'median':float(median),
+                        'numNulls':float(nullCount),
+                        'min':colmin,
+                        'max':colmax
+                        }
+            dict.append(frontreturn)
+        else:
+            mean=datafront[kolona].mean()
+            median=s.median(datafront[kolona])
+            nullCount=datafront[kolona].isnull().sum()
+            colmin=datafront[kolona].min()  # local names avoid shadowing the built-in min()/max()
+            colmax=datafront[kolona].max()
+            if(nullCount>0):
+                allNullCols=allNullCols+1
+            frontreturn={'columnName':kolona,
+                        'isNumber':True,
+                        'uniqueValues':[],
+                        'mean':float(mean),
+                        'median':float(median),
+                        'numNulls':float(nullCount),
+                        'min':colmin,
+                        'max':colmax
+                        }
+            dict.append(frontreturn)
+    NullRows = datafront[datafront.isnull().any(axis=1)]
+    #print(NullRows)
+    #print(len(NullRows))
+    allNullRows=len(NullRows)
+
+    return {'columnInfo':dict,'allNullColl':allNullCols,'allNullRows':allNullRows}
 
 @dataclass
-class TrainingResult:
+class TrainingResultClassification:
     accuracy: float
     precision: float
     recall: float
@@ -26,18 +85,29 @@ class TrainingResult:
     tn: float
     fp: float
     fn: float
     tp: float
     specificity: float
     f1: float
+    logloss: float
+    fpr: float
+    tpr: float
+    metrics: dict
+'''
+@dataclass
+class TrainingResultRegression:
     mse: float
     mae: float
     mape: float
     rmse: float
-    fpr: float
-    tpr: float
 
+@dataclass
+class TrainingResult:
+    metrics: dict
+'''
 
 def train(dataset, params, callback):
     problem_type = params["type"]
     data = pd.DataFrame()
     for col in params["inputColumns"]:
         data[col]=dataset[col]
+
+    print(data.head())
     output_column = params["columnToPredict"]
     data[output_column] = dataset[output_column]
     #
@@ -66,6 +136,7 @@ def train(dataset, params, callback):
             data.pop(col)
     #
     # Encoding
+    # https://www.analyticsvidhya.com/blog/2020/08/types-of-categorical-data-encoding/
     #
     encoding=params["encoding"]
     if(encoding=='label'):
         encoder=LabelEncoder()
         for col in data.columns:
             if(data[col].dtype==np.object_):
                 data[col]=encoder.fit_transform(data[col])
     elif(encoding=='onehot'):
         category_columns=[]
         for col in data.columns:
             if(data[col].dtype==np.object_):
                 category_columns.append(col)
         data=pd.get_dummies(data, columns=category_columns, prefix=category_columns)
+    elif(encoding=='ordinal'):
+        encoder = OrdinalEncoder()
+        for col in data.columns:
+            if(data[col].dtype==np.object_):
+                data[col]=encoder.fit_transform(data[[col]])  # OrdinalEncoder expects a 2-D input
+
+    elif(encoding=='hashing'):
+        category_columns=[]
+        for col in data.columns:
+            if(data[col].dtype==np.object_):
+                category_columns.append(col)
+        encoder=ce.HashingEncoder(cols=category_columns, n_components=len(category_columns))
+        data=encoder.fit_transform(data)  # assign the result back, otherwise the encoding is lost
+    elif(encoding=='binary'):
+        category_columns=[]
+        for col in data.columns:
+            if(data[col].dtype==np.object_):
+                category_columns.append(col)
+        encoder=ce.BinaryEncoder(cols=category_columns, return_df=True)
+        data=encoder.fit_transform(data)
+
+    elif(encoding=='baseN'):
+        category_columns=[]
+        for col in data.columns:
+            if(data[col].dtype==np.object_):
+                category_columns.append(col)
+        encoder=ce.BaseNEncoder(cols=category_columns, return_df=True, base=5)
+        data=encoder.fit_transform(data)
     #
     # Input - output
     #
     x_columns = []
     for col in data.columns:
         if(col!=output_column):
             x_columns.append(col)
     x = data[x_columns].values
     y = data[output_column].values
+    print(x_columns)
+    print(x)
     #
     # Train / test split
     #
     test=params["randomTestSetDistribution"]
     randomOrder = params["randomOrder"]
     if(randomOrder):
-        random=50
+        random=123
     else:
         random=0
-    x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=test, random_state=random)
+    x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.5, random_state=0)  # TODO: use the test/random values from params instead of a hardcoded split
     #
     # Value scaling
     #
+    '''
     scaler=StandardScaler()
     scaler.fit(x_train)
     x_test=scaler.transform(x_test)
     x_train=scaler.transform(x_train)
+    '''
+
     #
     # Model training
     #
-    classifier=tf.keras.Sequential()
+
     hidden_layer_neurons = params["hiddenLayerNeurons"]
-    for func in params["hiddenLayerActivationFunctions"]:
-        classifier.add(tf.keras.layers.Dense(units=hidden_layer_neurons,activation=func))
-    output_func = params["outputLayerActivationFunction"]
-    classifier.add(tf.keras.layers.Dense(units=1,activation=output_func))
-    optimizer = params["optimizer"]
-    metrics=params['metrics']
-    loss_func=params["lossFunction"]
-    classifier.compile(optimizer=optimizer, loss=loss_func,metrics=metrics)
-    batch_size = params["batchSize"]
-    epochs = params["epochs"]
-    history=classifier.fit(x_train, y_train, batch_size=batch_size, epochs=epochs, callbacks=callback(x_test, y_test), validation_split=0.2) # TODO params["validationSplit"]
+
+    if(problem_type=='multi-klasifikacioni'):
+        func=params['hiddenLayerActivationFunctions']
+        output_func = params["outputLayerActivationFunction"]
+        optimizer = params["optimizer"]
+        metrics=params['metrics']
+        loss_func=params["lossFunction"]
+        batch_size = params["batchSize"]
+        epochs = params["epochs"]
+        inputDim = len(data.columns) - 1
+        '''
+        classifier=tf.keras.Sequential()
+
+        classifier.add(tf.keras.layers.Dense(units=len(data.columns),input_dim=inputDim)) # input layer
+
+        for f in func: # hidden layers
+            classifier.add(tf.keras.layers.Dense(hidden_layer_neurons,activation=f))
+
+        numberofclasses=data[output_column].nunique()
+        classifier.add(tf.keras.layers.Dense(numberofclasses,activation=output_func)) # output layer
+        '''
+        classifier=tf.keras.Sequential()
+        classifier.add(tf.keras.layers.Dense(1,input_dim=x_train.shape[1])) # input layer
+        classifier.add(tf.keras.layers.Dense(1, activation='sigmoid'))
+        classifier.add(tf.keras.layers.Dense(data[output_column].nunique()+1, activation='softmax'))  # output_column is a column name, so the class count comes from data[output_column]
+        classifier.compile(optimizer=optimizer, loss=loss_func,metrics=metrics)
+
+        history=classifier.fit(x_train, y_train, batch_size=batch_size, epochs=epochs, callbacks=callback(x_test, y_test))
+    else:
+        classifier=tf.keras.Sequential()
+
+        for func in params["hiddenLayerActivationFunctions"]:
+            classifier.add(tf.keras.layers.Dense(units=hidden_layer_neurons,activation=func))
+        output_func = params["outputLayerActivationFunction"]
+
+        if(problem_type!="regresioni"):
+            classifier.add(tf.keras.layers.Dense(units=1,activation=output_func))
+        else:
+            classifier.add(tf.keras.layers.Dense(units=1))
+
+        optimizer = params["optimizer"]
+        metrics=params['metrics']
+        loss_func=params["lossFunction"]
+        classifier.compile(optimizer=optimizer, loss=loss_func,metrics=metrics)
+        batch_size = params["batchSize"]
+        epochs = params["epochs"]
+        history=classifier.fit(x_train, y_train, batch_size=batch_size, epochs=epochs, callbacks=callback(x_test, y_test), validation_split=0.2) # TODO params["validationSplit"]
     #
     # Test
     #
     model_name = params['_id']
-    y_pred=classifier.predict(x_test)
+    #y_pred=classifier.predict(x_test)
     if(problem_type == "regresioni"):
-        classifier.evaluate(x_test, y_test)
+        y_pred=classifier.predict(x_test)
+        print(classifier.evaluate(x_test, y_test))
     elif(problem_type == "binarni-klasifikacioni"):
+        y_pred=classifier.predict(x_test)
         y_pred=(y_pred>=0.5).astype('int')
+    elif(problem_type=='multi-klasifikacioni'):
+        y_pred=classifier.predict(x_test)
+        y_pred=np.argmax(y_pred,axis=1)
+        y_pred=y_pred.flatten()
     result=pd.DataFrame({"Actual":y_test,"Predicted":y_pred})
     classifier.save("temp/"+model_name, save_format='h5')
+    # ROC for the multi-class case
+    def roc_auc_score_multiclass(actual_class, pred_class, average = "macro"):
+
+        # creating a set of all the unique classes using the actual class list
+        unique_class = set(actual_class)
+        roc_auc_dict = {}
+        for per_class in unique_class:
+
+            # creating a list of all the classes except the current class
+            other_class = [x for x in unique_class if x != per_class]
+
+            # marking the current class as 1 and all other classes as 0
+            new_actual_class = [0 if x in other_class else 1 for x in actual_class]
+            new_pred_class = [0 if x in other_class else 1 for x in pred_class]
+
+            # using the sklearn metrics method to calculate the roc_auc_score
+            roc_auc = roc_auc_score(new_actual_class, new_pred_class, average = average)
+            roc_auc_dict[per_class] = roc_auc
+
+        return roc_auc_dict
     #
     # Metrics
     #
     print("HELLO???")
     print(result)
     print("HELLO???")
-    accuracy = float(sm.accuracy_score(y_test,y_pred))
-    precision = float(sm.precision_score(y_test,y_pred))
-    recall = float(sm.recall_score(y_test,y_pred))
-    tn, fp, fn, tp = sm.confusion_matrix(y_test,y_pred).ravel()
-    specificity = float(tn / (tn+fp))
-    f1 = float(sm.f1_score(y_test,y_pred))
-    mse = float(sm.mean_squared_error(y_test,y_pred))
-    mae = float(sm.mean_absolute_error(y_test,y_pred))
-    mape = float(sm.mean_absolute_percentage_error(y_test,y_pred))
-    rmse = float(np.sqrt(sm.mean_squared_error(y_test,y_pred)))
-    fpr, tpr, _ = sm.roc_curve(y_test,y_pred)
+    if(problem_type=="binarni-klasifikacioni"):
+        accuracy = float(sm.accuracy_score(y_test,y_pred))
+        precision = float(sm.precision_score(y_test,y_pred))
+        recall = float(sm.recall_score(y_test,y_pred))
+        tn, fp, fn, tp = sm.confusion_matrix(y_test,y_pred).ravel()
+        specificity = float(tn / (tn+fp))
+        f1 = float(sm.f1_score(y_test,y_pred))
+        fpr, tpr, _ = sm.roc_curve(y_test,y_pred)
+        logloss = float(sm.log_loss(y_test, y_pred))
+        metrics= {"accuracy" : accuracy,
+            "precision" : precision,
+            "recall" : recall,
+            "specificity" : specificity,
+            "f1" : f1,
+            "tn" : float(tn),
+            "fp" : float(fp),
+            "fn" : float(fn),
+            "tp" : float(tp),
+            "fpr" : fpr.tolist(),
+            "tpr" : tpr.tolist(),
+            "logloss" : logloss
+            }
+    elif(problem_type=="regresioni"):
+        # https://www.analyticsvidhya.com/blog/2021/05/know-the-best-evaluation-metrics-for-your-regression-model/
+        mse = float(sm.mean_squared_error(y_test,y_pred))
+        mae = float(sm.mean_absolute_error(y_test,y_pred))
+        mape = float(sm.mean_absolute_percentage_error(y_test,y_pred))
+        rmse = float(np.sqrt(sm.mean_squared_error(y_test,y_pred)))
+        rmsle = float(np.sqrt(sm.mean_squared_log_error(y_test, y_pred)))  # RMSLE is the root of the squared *log* error
+        r2 = float(sm.r2_score(y_test, y_pred))
+        n = len(y_test)      # n - num of observations
+        k = x_test.shape[1]  # k - num of independent variables
+        adj_r2 = float(1 - ((1-r2)*(n-1)/(n-k-1)))
+        metrics= {"mse" : mse,
+            "mae" : mae,
+            "mape" : mape,
+            "rmse" : rmse,
+            "rmsle" : rmsle,
+            "r2" : r2,
+            "adj_r2" : adj_r2
+            }
+    elif(problem_type=="multi-klasifikacioni"):
+
+        cr=sm.classification_report(y_test, y_pred)
+        cm=sm.confusion_matrix(y_test,y_pred)
+        # https://www.kaggle.com/code/nkitgupta/evaluation-metrics-for-multi-class-classification/notebook
+        # (sm is sklearn.metrics; the local variable `metrics` holds params['metrics'])
+        accuracy=sm.accuracy_score(y_test, y_pred)
+        macro_averaged_precision=sm.precision_score(y_test, y_pred, average = 'macro')
+        micro_averaged_precision=sm.precision_score(y_test, y_pred, average = 'micro')
+        macro_averaged_recall=sm.recall_score(y_test, y_pred, average = 'macro')
+        micro_averaged_recall=sm.recall_score(y_test, y_pred, average = 'micro')
+        macro_averaged_f1=sm.f1_score(y_test, y_pred, average = 'macro')
+        micro_averaged_f1=sm.f1_score(y_test, y_pred, average = 'micro')
+        roc_auc_dict=roc_auc_score_multiclass(y_test, y_pred)
+
     # TODO upload the trained model back to the backend
-    return TrainingResult(accuracy, precision, recall, float(tn), float(fp), float(fn), float(tp), specificity, f1, mse, mae, mape, rmse, fpr.tolist(), tpr.tolist())
+    #return TrainingResult(metrics)
+
+def manageH5(datain,params,h5model):
+    dataset=datain.copy()
+    problem_type = params["type"]
+    data = pd.DataFrame()
+    for col in params["inputColumns"]:
+        data[col]=dataset[col]
+    output_column = params["columnToPredict"]
+    data[output_column] = dataset[output_column]
+    #
+    # Dropping / replacing null columns and rows
+    # nullreplace=[
+    #     {"column":"Embarked","value":"C","deleteRow":false,"deleteCol":true},
+    #     {"column": "Cabin","value":"C123","deleteRow":"0","deleteCol":"0"}]
+
+    null_value_options = params["nullValues"]
+    null_values_replacers = params["nullValuesReplacers"]
+
+    if(null_value_options=='replace'):
+        print("replace null") # TODO
+    elif(null_value_options=='delete_rows'):
+        data=data.dropna()
+    elif(null_value_options=='delete_columns'):
+        data=data.dropna(axis=1)  # drop the columns (axis=1) that contain nulls
+    #
+    #print(data.isnull().any())
+    #
+    # Drop columns that do not affect the result
+    #
+    num_rows=data.shape[0]
+    for col in data.columns:
+        if((data[col].nunique()==(num_rows)) and (data[col].dtype==np.object_)):
+            data.pop(col)
+    #
+    # Encoding
+    # https://www.analyticsvidhya.com/blog/2020/08/types-of-categorical-data-encoding/
+    #
+    encoding=params["encoding"]
+    if(encoding=='label'):
+        encoder=LabelEncoder()
+        for col in data.columns:
+            if(data[col].dtype==np.object_):
+                data[col]=encoder.fit_transform(data[col])
+    elif(encoding=='onehot'):
+        category_columns=[]
+        for col in data.columns:
+            if(data[col].dtype==np.object_):
+                category_columns.append(col)
+        data=pd.get_dummies(data, columns=category_columns, prefix=category_columns)
+    elif(encoding=='ordinal'):
+        encoder = OrdinalEncoder()
+        for col in data.columns:
+            if(data[col].dtype==np.object_):
+                data[col]=encoder.fit_transform(data[[col]])  # OrdinalEncoder expects a 2-D input
+    elif(encoding=='hashing'):
+        category_columns=[]
+        for col in data.columns:
+            if(data[col].dtype==np.object_):
+                category_columns.append(col)
+        encoder=ce.HashingEncoder(cols=category_columns, n_components=len(category_columns))
+        data=encoder.fit_transform(data)  # assign the result back, otherwise the encoding is lost
+    elif(encoding=='binary'):
+        category_columns=[]
+        for col in data.columns:
+            if(data[col].dtype==np.object_):
+                category_columns.append(col)
+        encoder=ce.BinaryEncoder(cols=category_columns, return_df=True)
+        data=encoder.fit_transform(data)
+    elif(encoding=='baseN'):
+        category_columns=[]
+        for col in data.columns:
+            if(data[col].dtype==np.object_):
+                category_columns.append(col)
+        encoder=ce.BaseNEncoder(cols=category_columns, return_df=True, base=5)
+        data=encoder.fit_transform(data)
+    #
+    # Input - output
+    #
+    x_columns = []
+    for col in data.columns:
+        if(col!=output_column):
+            x_columns.append(col)
+    x = data[x_columns].values
+    y = data[output_column].values
+
+    y_pred=np.argmax(h5model.predict(x), axis=1)  # predict_classes was removed from tf.keras; argmax over predict() is the replacement
\ No newline at end of file
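The new encoding branches dispatch to sklearn and category_encoders. A side-by-side on a toy column shows what each one produces; the DataFrame is made up, the encoder calls are the same ones used above:

    # Sketch comparing the categorical encodings handled by train().
    import pandas as pd
    import category_encoders as ce
    from sklearn.preprocessing import LabelEncoder, OrdinalEncoder

    df = pd.DataFrame({"embarked": ["S", "C", "Q", "S"]})

    print(LabelEncoder().fit_transform(df["embarked"]))      # label: [2 0 1 2]
    print(pd.get_dummies(df, columns=["embarked"]))          # onehot: one column per category
    print(OrdinalEncoder().fit_transform(df[["embarked"]]))  # ordinal: note the 2-D input
    print(ce.HashingEncoder(cols=["embarked"], n_components=3).fit_transform(df))  # hashing
    print(ce.BinaryEncoder(cols=["embarked"]).fit_transform(df))                   # binary: ~log2(n) columns
    print(ce.BaseNEncoder(cols=["embarked"], base=5).fit_transform(df))            # baseN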
diff --git a/backend/microservice/api/newmlservice.py b/backend/microservice/api/newmlservice.py
new file mode 100644
index 00000000..50af15f8
--- /dev/null
+++ b/backend/microservice/api/newmlservice.py
@@ -0,0 +1,424 @@
+from enum import unique
+from itertools import count
+import pandas as pd
+from sklearn import datasets, multiclass
+import tensorflow as tf
+import keras
+import numpy as np
+import csv
+import json
+import h5py
+import sklearn.metrics as sm
+from statistics import mode
+from typing_extensions import Self
+from copyreg import constructor
+from flask import request, jsonify, render_template
+from sklearn.preprocessing import LabelEncoder, MinMaxScaler
+from sklearn.preprocessing import OrdinalEncoder
+import category_encoders as ce
+from sklearn.preprocessing import StandardScaler
+from sklearn.model_selection import train_test_split
+from dataclasses import dataclass
+import statistics as s
+from sklearn.metrics import roc_auc_score
+from ann_visualizer.visualize import ann_viz
+
+def returnColumnsInfo(dataset):
+    dict=[]
+    datafront=dataset.copy()
+    svekolone=datafront.columns
+    kategorijskekolone=datafront.select_dtypes(include=['object']).columns
+    allNullCols=0
+    for kolona in svekolone:
+        if(kolona in kategorijskekolone):
+            uniquevalues=datafront[kolona].unique()
+            mean=0
+            median=0
+            nullCount=datafront[kolona].isnull().sum()
+            if(nullCount>0):
+                allNullCols=allNullCols+1
+            frontreturn={'columnName':kolona,
+                        'isNumber':False,
+                        'uniqueValues':uniquevalues.tolist(),
+                        'mean':float(mean),
+                        'median':float(median),
+                        'numNulls':float(nullCount)
+                        }
+            dict.append(frontreturn)
+        else:
+            mean=datafront[kolona].mean()
+            median=s.median(datafront[kolona])
+            nullCount=datafront[kolona].isnull().sum()
+            if(nullCount>0):
+                allNullCols=allNullCols+1
+            frontreturn={'columnName':kolona,
+                        'isNumber':True,
+                        'uniqueValues':[],
+                        'mean':float(mean),
+                        'median':float(median),
+                        'numNulls':float(nullCount)
+                        }
+            dict.append(frontreturn)
+    NullRows = datafront[datafront.isnull().any(axis=1)]
+    #print(NullRows)
+    #print(len(NullRows))
+    allNullRows=len(NullRows)
+
+    return {'columnInfo':dict,'allNullColl':allNullCols,'allNullRows':allNullRows}
+
+@dataclass
+class TrainingResultClassification:
+    accuracy: float
+    precision: float
+    recall: float
+    tn: float
+    fp: float
+    fn: float
+    tp: float
+    specificity: float
+    f1: float
+    logloss: float
+    fpr: float
+    tpr: float
+    metrics: dict
+'''
+@dataclass
+class TrainingResultRegression:
+    mse: float
+    mae: float
+    mape: float
+    rmse: float
+
+@dataclass
+class TrainingResult:
+    metrics: dict
+'''
+
+def train(dataset, params, callback):
+    problem_type = params["type"]
+    print(problem_type)
+    data = pd.DataFrame()
+    print(data)
+    for col in params["inputColumns"]:
+        print(col)
+        data[col]=dataset[col]
+    output_column = params["columnToPredict"]
+    data[output_column] = dataset[output_column]
+    print(data)
+
+    ### NULL values
+    null_value_options = params["nullValues"]
+    null_values_replacers = params["nullValuesReplacers"]
+
+    if(null_value_options=='replace'):
+        print("replace null") # TODO
+    elif(null_value_options=='delete_rows'):
+        data=data.dropna()
+    elif(null_value_options=='delete_columns'):
+        data=data.dropna(axis=1)  # drop the columns (axis=1) that contain nulls
+    print(data.shape)
+
+    #
+    # Drop columns that do not affect the result
+    #
+    num_rows=data.shape[0]
+    for col in data.columns:
+        if((data[col].nunique()==(num_rows)) and (data[col].dtype==np.object_)):
+            data.pop(col)
+
+    ### Encoding
+    encoding=params["encoding"]
+    if(encoding=='label'):
+        encoder=LabelEncoder()
+        for col in data.columns:
+            if(data[col].dtype==np.object_):
+                data[col]=encoder.fit_transform(data[col])
+
+    elif(encoding=='onehot'):
+        category_columns=[]
+        for col in data.columns:
+            if(data[col].dtype==np.object_):
+                category_columns.append(col)
+        data=pd.get_dummies(data, columns=category_columns, prefix=category_columns)
+
+    elif(encoding=='ordinal'):
+        encoder = OrdinalEncoder()
+        for col in data.columns:
+            if(data[col].dtype==np.object_):
+                data[col]=encoder.fit_transform(data[[col]])  # OrdinalEncoder expects a 2-D input
+
+    elif(encoding=='hashing'):
+        category_columns=[]
+        for col in data.columns:
+            if(data[col].dtype==np.object_):
+                category_columns.append(col)
+        encoder=ce.HashingEncoder(cols=category_columns, n_components=len(category_columns))
+        data=encoder.fit_transform(data)  # assign the result back, otherwise the encoding is lost
+    elif(encoding=='binary'):
+        category_columns=[]
+        for col in data.columns:
+            if(data[col].dtype==np.object_):
+                category_columns.append(col)
+        encoder=ce.BinaryEncoder(cols=category_columns, return_df=True)
+        data=encoder.fit_transform(data)
+
+    elif(encoding=='baseN'):
+        category_columns=[]
+        for col in data.columns:
+            if(data[col].dtype==np.object_):
+                category_columns.append(col)
+        encoder=ce.BaseNEncoder(cols=category_columns, return_df=True, base=5)
+        data=encoder.fit_transform(data)
+    #
+    # Input - output
+    #
+    x_columns = []
+    for col in data.columns:
+        if(col!=output_column):
+            x_columns.append(col)
+    print(x_columns)
+    x = data[x_columns].values
+    y = data[output_column].values
+
+    #
+    # Train / test split
+    #
+    test=params["randomTestSetDistribution"]
+    randomOrder = params["randomOrder"]
+    if(randomOrder):
+        random=123
+    else:
+        random=0
+    x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=test, random_state=random)
+    print(x_train,x_test)
+
+    #
+    # Model training
+    #
+    if(problem_type=='multi-klasifikacioni'):
+        #print('multi')
+        classifier=tf.keras.Sequential()
+
+        classifier.add(tf.keras.layers.Dense(units=params['hiddenLayerNeurons'], activation=params['hiddenLayerActivationFunctions'][0], input_dim=x_train.shape[1])) # first hidden layer, which also defines the preceding input layer
+        for i in range(params['hiddenLayers']-1): # if there is more than one hidden layer
+            #print(i)
+            classifier.add(tf.keras.layers.Dense(units=params['hiddenLayerNeurons'], activation=params['hiddenLayerActivationFunctions'][i+1])) # i-th hidden layer
+        classifier.add(tf.keras.layers.Dense(units=data[output_column].nunique(), activation=params['outputLayerActivationFunction'])) # output layer: one unit per class
+
+        classifier.compile(loss=params["lossFunction"], optimizer=params['optimizer'], metrics=params['metrics'])
+
+        history=classifier.fit(x_train, y_train, epochs=params['epochs'], batch_size=params['batchSize'])
+
+        y_pred=classifier.predict(x_test)
+        y_pred=np.argmax(y_pred,axis=1)
+        #print(y_pred.flatten())
+        #print(y_test)
+        scores = classifier.evaluate(x_test, y_test)
+        print("\n%s: %.2f%%" % (classifier.metrics_names[1], scores[1]*100))
+        classifier.save("temp/"+params['name'], save_format='h5')
+        # visualization in Python
+        #from ann_visualizer.visualize import ann_viz
+        #ann_viz(classifier, title="My neural network")
+
+    elif(problem_type=='binarni-klasifikacioni'):
+        #print('binary')
+        classifier=tf.keras.Sequential()
+
+        classifier.add(tf.keras.layers.Dense(units=params['hiddenLayerNeurons'], activation=params['hiddenLayerActivationFunctions'][0], input_dim=x_train.shape[1])) # first hidden layer, which also defines the preceding input layer
+        for i in range(params['hiddenLayers']-1): # if there is more than one hidden layer
+            #print(i)
+            classifier.add(tf.keras.layers.Dense(units=params['hiddenLayerNeurons'], activation=params['hiddenLayerActivationFunctions'][i+1])) # i-th hidden layer
+        classifier.add(tf.keras.layers.Dense(units=1, activation=params['outputLayerActivationFunction'])) # output layer
+
+        classifier.compile(loss=params["lossFunction"], optimizer=params['optimizer'], metrics=params['metrics'])
+
+        history=classifier.fit(x_train, y_train, epochs=params['epochs'], batch_size=params['batchSize'])
+
+        y_pred=classifier.predict(x_test)
+        y_pred=(y_pred>=0.5).astype('int')
+
+        print(y_pred.flatten())
+        print(y_test)
+
+        scores = classifier.evaluate(x_test, y_test)
+        print("\n%s: %.2f%%" % (classifier.metrics_names[1], scores[1]*100))
+        #ann_viz(classifier, title="My neural network")
+
+        classifier.save("temp/"+params['name'], save_format='h5')
+
+    elif(problem_type=='regresioni'):
+        classifier=tf.keras.Sequential()
+
+        classifier.add(tf.keras.layers.Dense(units=params['hiddenLayerNeurons'], activation=params['hiddenLayerActivationFunctions'][0], input_dim=x_train.shape[1])) # first hidden layer, which also defines the preceding input layer
+        for i in range(params['hiddenLayers']-1): # if there is more than one hidden layer
+            #print(i)
+            classifier.add(tf.keras.layers.Dense(units=params['hiddenLayerNeurons'], activation=params['hiddenLayerActivationFunctions'][i+1])) # i-th hidden layer
+        classifier.add(tf.keras.layers.Dense(units=1)) # output layer
+
+        classifier.compile(loss=params["lossFunction"], optimizer=params['optimizer'], metrics=params['metrics'])
+
+        history=classifier.fit(x_train, y_train, epochs=params['epochs'], batch_size=params['batchSize'])
+        y_pred=classifier.predict(x_test)
+        print(classifier.evaluate(x_test, y_test))
+
+    def roc_auc_score_multiclass(actual_class, pred_class, average = "macro"):
+
+        # creating a set of all the unique classes using the actual class list
+        unique_class = set(actual_class)
+        roc_auc_dict = {}
+        for per_class in unique_class:
+
+            # creating a list of all the classes except the current class
+            other_class = [x for x in unique_class if x != per_class]
+
+            # marking the current class as 1 and all other classes as 0
+            new_actual_class = [0 if x in other_class else 1 for x in actual_class]
+            new_pred_class = [0 if x in other_class else 1 for x in pred_class]
+
+            # using the sklearn metrics method to calculate the roc_auc_score
+            roc_auc = roc_auc_score(new_actual_class, new_pred_class, average = average)
+            roc_auc_dict[per_class] = roc_auc
+
+        return roc_auc_dict
+    #
+    # Metrics
+    #
+    if(problem_type=="binarni-klasifikacioni"):
+        accuracy = float(sm.accuracy_score(y_test,y_pred))
+        precision = float(sm.precision_score(y_test,y_pred))
+        recall = float(sm.recall_score(y_test,y_pred))
+        tn, fp, fn, tp = sm.confusion_matrix(y_test,y_pred).ravel()
+        specificity = float(tn / (tn+fp))
+        f1 = float(sm.f1_score(y_test,y_pred))
+        fpr, tpr, _ = sm.roc_curve(y_test,y_pred)
+        logloss = float(sm.log_loss(y_test, y_pred))
+        metrics= {"accuracy" : accuracy,
+            "precision" : precision,
+            "recall" : recall,
+            "specificity" : specificity,
+            "f1" : f1,
+            "tn" : float(tn),
+            "fp" : float(fp),
+            "fn" : float(fn),
+            "tp" : float(tp),
+            "fpr" : fpr.tolist(),
+            "tpr" : tpr.tolist(),
+            "logloss" : logloss
+            }
+    elif(problem_type=="regresioni"):
+        # https://www.analyticsvidhya.com/blog/2021/05/know-the-best-evaluation-metrics-for-your-regression-model/
+        mse = float(sm.mean_squared_error(y_test,y_pred))
+        mae = float(sm.mean_absolute_error(y_test,y_pred))
+        mape = float(sm.mean_absolute_percentage_error(y_test,y_pred))
+        rmse = float(np.sqrt(sm.mean_squared_error(y_test,y_pred)))
+        rmsle = float(np.sqrt(sm.mean_squared_log_error(y_test, y_pred)))  # RMSLE is the root of the squared *log* error
+        r2 = float(sm.r2_score(y_test, y_pred))
+        n = len(y_test)      # n - num of observations
+        k = x_test.shape[1]  # k - num of independent variables
+        adj_r2 = float(1 - ((1-r2)*(n-1)/(n-k-1)))
+        metrics= {"mse" : mse,
+            "mae" : mae,
+            "mape" : mape,
+            "rmse" : rmse,
+            "rmsle" : rmsle,
+            "r2" : r2,
+            "adj_r2" : adj_r2
+            }
+    '''
+    elif(problem_type=="multi-klasifikacioni"):
+
+        cr=sm.classification_report(y_test, y_pred)
+        cm=sm.confusion_matrix(y_test,y_pred)
+        # https://www.kaggle.com/code/nkitgupta/evaluation-metrics-for-multi-class-classification/notebook
+        accuracy=sm.accuracy_score(y_test, y_pred)
+        macro_averaged_precision=sm.precision_score(y_test, y_pred, average = 'macro')
+        micro_averaged_precision=sm.precision_score(y_test, y_pred, average = 'micro')
+        macro_averaged_recall=sm.recall_score(y_test, y_pred, average = 'macro')
+        micro_averaged_recall=sm.recall_score(y_test, y_pred, average = 'micro')
+        macro_averaged_f1=sm.f1_score(y_test, y_pred, average = 'macro')
+        micro_averaged_f1=sm.f1_score(y_test, y_pred, average = 'micro')
+        roc_auc_dict=roc_auc_score_multiclass(y_test, y_pred)
+    '''
+
+def manageH5(dataset,params,h5model):
+    problem_type = params["type"]
+    print(problem_type)
+    data = pd.DataFrame()
+    #print(data)
+    for col in params["inputColumns"]:
+        print(col)
+        data[col]=dataset[col]
+    output_column = params["columnToPredict"]
+    data[output_column] = dataset[output_column]
+    #print(data)
+
+    ### NULL values
+    null_value_options = params["nullValues"]
+    null_values_replacers = params["nullValuesReplacers"]
+
+    if(null_value_options=='replace'):
+        print("replace null") # TODO
+    elif(null_value_options=='delete_rows'):
+        data=data.dropna()
+    elif(null_value_options=='delete_columns'):
+        data=data.dropna(axis=1)  # drop the columns (axis=1) that contain nulls
+    print(data.shape)
+
+    #
+    # Drop columns that do not affect the result
+    #
+    num_rows=data.shape[0]
+    for col in data.columns:
+        if((data[col].nunique()==(num_rows)) and (data[col].dtype==np.object_)):
+            data.pop(col)
+
+    ### Encoding
+    encoding=params["encoding"]
+    if(encoding=='label'):
+        encoder=LabelEncoder()
+        for col in data.columns:
+            if(data[col].dtype==np.object_):
+                data[col]=encoder.fit_transform(data[col])
+
+    elif(encoding=='onehot'):
+        category_columns=[]
+        for col in data.columns:
+            if(data[col].dtype==np.object_):
+                category_columns.append(col)
+        data=pd.get_dummies(data, columns=category_columns, prefix=category_columns)
+        #print(data)
+
+    #
+    # Input - output
+    #
+    x_columns = []
+    for col in data.columns:
+        if(col!=output_column):
+            x_columns.append(col)
+    #print(x_columns)
+    x2 = data[x_columns]
+    print(x2)
+    print(x2.values)
+    x2 = data[x_columns].values
+    print(x2)
+    y2 = data[output_column].values
+    h5model.summary()
+    ann_viz(h5model, title="My neural network")
+
+    h5model.compile(loss=params['lossFunction'], optimizer=params['optimizer'], metrics=params['metrics'])
+
+    history=h5model.fit(x2, y2, epochs=params['epochs'], batch_size=params['batchSize'])
+
+    y_pred2=h5model.predict(x2)
+    y_pred2=np.argmax(y_pred2,axis=1)
+    #y_pred=h5model.predict_classes(x)
+    score = h5model.evaluate(x2, y2, verbose=0)  # evaluate against the true labels, not the predictions
+    print("%s: %.2f%%" % (h5model.metrics_names[1], score[1]*100))
+    print(y_pred2)
+    print('done')
\ No newline at end of file
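Both ml_service.py and newmlservice.py define roc_auc_score_multiclass, which computes a one-vs-rest ROC AUC per class by binarizing the labels. A self-contained illustration of the same idea; the label lists are invented for the demo:

    # One-vs-rest ROC AUC per class, mirroring roc_auc_score_multiclass.
    from sklearn.metrics import roc_auc_score

    actual = [0, 1, 2, 2, 1, 0, 2, 1, 0]
    pred   = [0, 1, 2, 1, 1, 0, 2, 2, 0]

    per_class_auc = {}
    for cls in set(actual):
        y_true = [1 if y == cls else 0 for y in actual]  # current class -> 1, rest -> 0
        y_hat  = [1 if y == cls else 0 for y in pred]
        per_class_auc[cls] = roc_auc_score(y_true, y_hat)

    print(per_class_auc)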
diff --git a/backend/microservice/mlservice.py b/backend/microservice/mlservice.py
index b2eafe9a..8f56fc3f 100644
--- a/backend/microservice/mlservice.py
+++ b/backend/microservice/mlservice.py
@@ -54,6 +54,38 @@ def obuka(dataunos,params,modelunos,dataunosdrugog):
         data[zeljenekolone[i]]=dataunos[zeljenekolone[i]]
     #print(data.head(10))
 
+    ### 0.1) Statistics returned to the frontend (for filling in the null values of the selected columns) PART4
+    datafront=data.copy()
+    svekolone=datafront.columns
+    kategorijskekolone=datafront.select_dtypes(include=['object']).columns
+    #print(kategorijskekolone)
+    #kategorijskekolone=datacategorical.columns
+    #print(svekolone)
+    for i in range(len(svekolone)):
+        nazivkolone=svekolone[i]
+        if(nazivkolone in kategorijskekolone):
+            svekategorije=datafront[nazivkolone].unique()
+            medijana=None
+            srednjavrednost=None
+            frontreturn={'colName':nazivkolone,
+                        'colType':'categorical',
+                        'categoricalValues':svekategorije,
+                        'median':medijana,
+                        'average':srednjavrednost
+                        }
+        else:
+            svekategorije=None
+            medijana=datafront[nazivkolone].median()  # median of the numeric column
+            srednjavrednost=sum(datafront[nazivkolone])/len(datafront[nazivkolone])  # mean of the numeric column
+            frontreturn={'colName':nazivkolone,
+                        'colType':'noncategorical',
+                        'categoricalValues':svekategorije,
+                        'median':medijana,
+                        'average':srednjavrednost
+                        }
+
+        print(frontreturn)
+
 #predvidetikol=input("ENTER THE NAME OF THE COLUMN WHOSE VALUE SHOULD BE PREDICTED ")
 ### what gets stored for the variables — the column index or the column name???
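The statistics block added to obuka(), like the two returnColumnsInfo variants, walks each column by hand; pandas can produce the same per-column summary directly. A compact sketch on a made-up frame:

    # Per-column summary equivalent to the loop above.
    import pandas as pd

    df = pd.DataFrame({"age": [22.0, 38.0, None, 35.0], "embarked": ["S", "C", "S", None]})

    for col in df.columns:
        if df[col].dtype == object:  # categorical column
            print(col, "categorical", df[col].dropna().unique().tolist(),
                  "nulls:", int(df[col].isnull().sum()))
        else:                        # numeric column
            print(col, "numeric", "mean:", df[col].mean(),
                  "median:", df[col].median(),
                  "nulls:", int(df[col].isnull().sum()))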