diff options
Diffstat (limited to 'backend')
-rw-r--r-- | backend/api/api/Models/ColumnInfo.cs | 12 | ||||
-rw-r--r-- | backend/microservice/api/controller.py | 18 | ||||
-rw-r--r-- | backend/microservice/api/newmlservice.py | 49 |
3 files changed, 64 insertions, 15 deletions
diff --git a/backend/api/api/Models/ColumnInfo.cs b/backend/api/api/Models/ColumnInfo.cs index be3c7251..f2cae104 100644 --- a/backend/api/api/Models/ColumnInfo.cs +++ b/backend/api/api/Models/ColumnInfo.cs @@ -4,7 +4,7 @@ { public ColumnInfo() { } - public ColumnInfo(string columnName, string columnType, bool isNumber, int numNulls, float mean, float min, float max, float median, string[] uniqueValues) + public ColumnInfo(string columnName, string columnType, bool isNumber, int numNulls, float mean, float min, float max, float median,float q1,float q3, string[] uniqueValues, int[]uniqueValuesCount, float[] uniqueValuesPercent) { this.columnName = columnName; this.columnType = columnType; @@ -13,8 +13,12 @@ this.mean = mean; this.min = min; this.max = max; + this.q1 = q1; + this.q3 = q3; this.median = median; this.uniqueValues = uniqueValues; + this.uniqueValuesPercent = uniqueValuesPercent; + this.uniqueValuesCount = uniqueValuesCount; } public string columnName { get; set; } @@ -25,7 +29,13 @@ public float min { get; set; } public float max { get; set; } public float median { get; set; } + public float q1 { get; set; } + public float q3 { get; set; } + public string[] uniqueValues { get; set; } + public int[] uniqueValuesCount { get; set; } + public float[] uniqueValuesPercent { get; set; } + } } diff --git a/backend/microservice/api/controller.py b/backend/microservice/api/controller.py index 9b83b8e7..fad6e181 100644 --- a/backend/microservice/api/controller.py +++ b/backend/microservice/api/controller.py @@ -107,26 +107,32 @@ def predict(): @app.route('/preprocess',methods=['POST']) def returnColumnsInfo(): print("********************************PREPROCESS*******************************") + dataset = json.loads(request.form["dataset"]) file = request.files.get("file") data=pd.read_csv(file) - - #dataset={} + ''' #f = request.json['filepath'] #data=pd.read_csv(f) - + dataset={} + ''' preprocess = newmlservice.returnColumnsInfo(data) #samo 10 jedinstvenih posto ih ima previse, bilo bi dobro da promenimo ovo da to budu 10 najzastupljenijih vrednosti + for col in preprocess["columnInfo"]: - col["uniqueValues"] = col["uniqueValues"][0:10] - col["uniqueValuesCount"] = col["uniqueValuesCount"][0:10] + col["uniqueValues"] = col["uniqueValues"][0:5] + col["uniqueValuesCount"] = col["uniqueValuesCount"][0:5] + col['uniqueValuesPercent']=col['uniqueValuesPercent'][0:5] dataset["columnInfo"] = preprocess["columnInfo"] dataset["nullCols"] = preprocess["allNullColl"] dataset["nullRows"] = preprocess["allNullRows"] dataset["colCount"] = preprocess["colCount"] dataset["rowCount"] = preprocess["rowCount"] dataset["isPreProcess"] = True - print(dataset) + #print(dataset) + + + return jsonify(dataset) print("App loaded.") diff --git a/backend/microservice/api/newmlservice.py b/backend/microservice/api/newmlservice.py index 604e4d3c..f74f8386 100644 --- a/backend/microservice/api/newmlservice.py +++ b/backend/microservice/api/newmlservice.py @@ -1,6 +1,7 @@ from enum import unique from itertools import count import os +from sys import breakpointhook import pandas as pd from sklearn import datasets, multiclass import tensorflow as tf @@ -38,27 +39,38 @@ def returnColumnsInfo(dataset): unique=datafront[kolona].value_counts() uniquevalues=[] uniquevaluescount=[] + uniquevaluespercent=[] for val, count in unique.iteritems(): - uniquevalues.append(val) - uniquevaluescount.append(count) + if(val): + uniquevalues.append(val) + uniquevaluescount.append(count) + percent=count/rowCount + uniquevaluespercent.append(percent) #print(uniquevalues) #print(uniquevaluescount) mean=0 median=0 minimum=0 maximum=0 + q1=0 + q3=0 nullCount=datafront[kolona].isnull().sum() if(nullCount>0): allNullCols=allNullCols+1 - frontreturn={'columnName':kolona, + frontreturn={ + 'columnName':kolona, 'isNumber':False, 'uniqueValues':uniquevalues, 'uniqueValuesCount':uniquevaluescount, - 'median':float(mean), - 'mean':float(median), + 'uniqueValuesPercent':uniquevaluespercent, + 'mean':float(mean), + 'median':float(median), 'numNulls':int(nullCount), 'min':float(minimum), 'max':float(maximum), + 'q1':float(q1), + 'q3':float(q3), + } dict.append(frontreturn) else: @@ -66,18 +78,39 @@ def returnColumnsInfo(dataset): maximum=max(datafront[kolona]) mean=datafront[kolona].mean() median=s.median(datafront[kolona].copy().dropna()) + q1= np.percentile(datafront[kolona].copy().dropna(), 25) + q3= np.percentile(datafront[kolona].copy().dropna(), 75) nullCount=datafront[kolona].isnull().sum() if(nullCount>0): allNullCols=allNullCols+1 - frontreturn={'columnName':kolona, + + #pretvaranje u kategorijsku + datafront = datafront.astype({kolona: str}) + print(datafront.dtypes) + unique=datafront[kolona].value_counts() + uniquevaluesn=[] + uniquevaluescountn=[] + uniquevaluespercentn=[] + for val, count in unique.iteritems(): + if(val): + uniquevaluesn.append(val) + uniquevaluescountn.append(count) + percent=count/rowCount + uniquevaluespercentn.append(percent) + frontreturn={ + 'columnName':kolona, 'isNumber':1, - 'uniqueValues':[], - 'uniqueValuesCount':[], + 'uniqueValues':uniquevaluesn, + 'uniqueValuesCount':uniquevaluescountn, + 'uniqueValuesPercent':uniquevaluespercentn, 'mean':float(mean), 'median':float(median), 'numNulls':int(nullCount), 'min':float(minimum), 'max':float(maximum), + 'q1':float(q1), + 'q3':float(q3), + } dict.append(frontreturn) NullRows = datafront[datafront.isnull().any(axis=1)] |