diff options
author | TAMARA JERINIC <tamara.jerinic@gmail.com> | 2022-05-02 00:56:32 +0200 |
---|---|---|
committer | TAMARA JERINIC <tamara.jerinic@gmail.com> | 2022-05-02 18:40:22 +0200 |
commit | 61d8c3e8a88d0787f34b03fcd4fe2b533c571e1b (patch) | |
tree | 940ee9e2c4123ae8e61f7a9a36aafc1adcc1ae06 /backend | |
parent | 7cabfa3d4220d840b47f268ffbc31901cae52167 (diff) |
Dodato računanje novih statističkih podataka u mlservice fajl, usklađeno sa frontom.
Diffstat (limited to 'backend')
-rw-r--r-- | backend/microservice/api/controller.py | 18 | ||||
-rw-r--r-- | backend/microservice/api/newmlservice.py | 49 |
2 files changed, 53 insertions, 14 deletions
diff --git a/backend/microservice/api/controller.py b/backend/microservice/api/controller.py index 9b83b8e7..fad6e181 100644 --- a/backend/microservice/api/controller.py +++ b/backend/microservice/api/controller.py @@ -107,26 +107,32 @@ def predict(): @app.route('/preprocess',methods=['POST']) def returnColumnsInfo(): print("********************************PREPROCESS*******************************") + dataset = json.loads(request.form["dataset"]) file = request.files.get("file") data=pd.read_csv(file) - - #dataset={} + ''' #f = request.json['filepath'] #data=pd.read_csv(f) - + dataset={} + ''' preprocess = newmlservice.returnColumnsInfo(data) #samo 10 jedinstvenih posto ih ima previse, bilo bi dobro da promenimo ovo da to budu 10 najzastupljenijih vrednosti + for col in preprocess["columnInfo"]: - col["uniqueValues"] = col["uniqueValues"][0:10] - col["uniqueValuesCount"] = col["uniqueValuesCount"][0:10] + col["uniqueValues"] = col["uniqueValues"][0:5] + col["uniqueValuesCount"] = col["uniqueValuesCount"][0:5] + col['uniqueValuesPercent']=col['uniqueValuesPercent'][0:5] dataset["columnInfo"] = preprocess["columnInfo"] dataset["nullCols"] = preprocess["allNullColl"] dataset["nullRows"] = preprocess["allNullRows"] dataset["colCount"] = preprocess["colCount"] dataset["rowCount"] = preprocess["rowCount"] dataset["isPreProcess"] = True - print(dataset) + #print(dataset) + + + return jsonify(dataset) print("App loaded.") diff --git a/backend/microservice/api/newmlservice.py b/backend/microservice/api/newmlservice.py index 604e4d3c..f74f8386 100644 --- a/backend/microservice/api/newmlservice.py +++ b/backend/microservice/api/newmlservice.py @@ -1,6 +1,7 @@ from enum import unique from itertools import count import os +from sys import breakpointhook import pandas as pd from sklearn import datasets, multiclass import tensorflow as tf @@ -38,27 +39,38 @@ def returnColumnsInfo(dataset): unique=datafront[kolona].value_counts() uniquevalues=[] uniquevaluescount=[] + uniquevaluespercent=[] for val, count in unique.iteritems(): - uniquevalues.append(val) - uniquevaluescount.append(count) + if(val): + uniquevalues.append(val) + uniquevaluescount.append(count) + percent=count/rowCount + uniquevaluespercent.append(percent) #print(uniquevalues) #print(uniquevaluescount) mean=0 median=0 minimum=0 maximum=0 + q1=0 + q3=0 nullCount=datafront[kolona].isnull().sum() if(nullCount>0): allNullCols=allNullCols+1 - frontreturn={'columnName':kolona, + frontreturn={ + 'columnName':kolona, 'isNumber':False, 'uniqueValues':uniquevalues, 'uniqueValuesCount':uniquevaluescount, - 'median':float(mean), - 'mean':float(median), + 'uniqueValuesPercent':uniquevaluespercent, + 'mean':float(mean), + 'median':float(median), 'numNulls':int(nullCount), 'min':float(minimum), 'max':float(maximum), + 'q1':float(q1), + 'q3':float(q3), + } dict.append(frontreturn) else: @@ -66,18 +78,39 @@ def returnColumnsInfo(dataset): maximum=max(datafront[kolona]) mean=datafront[kolona].mean() median=s.median(datafront[kolona].copy().dropna()) + q1= np.percentile(datafront[kolona].copy().dropna(), 25) + q3= np.percentile(datafront[kolona].copy().dropna(), 75) nullCount=datafront[kolona].isnull().sum() if(nullCount>0): allNullCols=allNullCols+1 - frontreturn={'columnName':kolona, + + #pretvaranje u kategorijsku + datafront = datafront.astype({kolona: str}) + print(datafront.dtypes) + unique=datafront[kolona].value_counts() + uniquevaluesn=[] + uniquevaluescountn=[] + uniquevaluespercentn=[] + for val, count in unique.iteritems(): + if(val): + uniquevaluesn.append(val) + uniquevaluescountn.append(count) + percent=count/rowCount + uniquevaluespercentn.append(percent) + frontreturn={ + 'columnName':kolona, 'isNumber':1, - 'uniqueValues':[], - 'uniqueValuesCount':[], + 'uniqueValues':uniquevaluesn, + 'uniqueValuesCount':uniquevaluescountn, + 'uniqueValuesPercent':uniquevaluespercentn, 'mean':float(mean), 'median':float(median), 'numNulls':int(nullCount), 'min':float(minimum), 'max':float(maximum), + 'q1':float(q1), + 'q3':float(q3), + } dict.append(frontreturn) NullRows = datafront[datafront.isnull().any(axis=1)] |