From 84b53024a7e0b2c4a0ea34d559678301ad8ee750 Mon Sep 17 00:00:00 2001 From: Nevena Bojovic Date: Sun, 17 Apr 2022 20:36:34 +0200 Subject: Razlicita enkodiranja za kolone ML. --- backend/microservice/api/newmlservice.py | 85 +++++++++++++++++--------------- 1 file changed, 45 insertions(+), 40 deletions(-) (limited to 'backend/microservice/api/newmlservice.py') diff --git a/backend/microservice/api/newmlservice.py b/backend/microservice/api/newmlservice.py index a9bce3bb..9951c25f 100644 --- a/backend/microservice/api/newmlservice.py +++ b/backend/microservice/api/newmlservice.py @@ -156,48 +156,53 @@ def train(dataset, paramsModel,paramsExperiment,paramsDataset,callback): # ### Enkodiranje encoding=paramsExperiment["encoding"] - if(encoding=='label'): - encoder=LabelEncoder() - for col in data.columns: - if(data[col].dtype==np.object_): - data[col]=encoder.fit_transform(data[col]) + datafront=dataset.copy() + svekolone=datafront.columns + kategorijskekolone=datafront.select_dtypes(include=['object']).columns + for kolona in svekolone: + if(kolona in kategorijskekolone): + if(encoding=='label'): + encoder=LabelEncoder() + for col in data.columns: + if(data[col].dtype==np.object_): + data[col]=encoder.fit_transform(data[col]) - elif(encoding=='onehot'): - category_columns=[] - for col in data.columns: - if(data[col].dtype==np.object_): - category_columns.append(col) - data=pd.get_dummies(data, columns=category_columns, prefix=category_columns) - - elif(encoding=='ordinal'): - encoder = OrdinalEncoder() - for col in data.columns: - if(data[col].dtype==np.object_): - data[col]=encoder.fit_transform(data[col]) - - elif(encoding=='hashing'): - category_columns=[] - for col in data.columns: - if(data[col].dtype==np.object_): - category_columns.append(col) - encoder=ce.HashingEncoder(cols=category_columns, n_components=len(category_columns)) - encoder.fit_transform(data) - elif(encoding=='binary'): - category_columns=[] - for col in data.columns: - if(data[col].dtype==np.object_): - category_columns.append(col) - encoder=ce.BinaryEncoder(cols=category_columns, return_df=True) - encoder.fit_transform(data) - - elif(encoding=='baseN'): - category_columns=[] - for col in data.columns: - if(data[col].dtype==np.object_): - category_columns.append(col) - encoder=ce.BaseNEncoder(cols=category_columns, return_df=True, base=5) - encoder.fit_transform(data) + elif(encoding=='onehot'): + category_columns=[] + for col in data.columns: + if(data[col].dtype==np.object_): + category_columns.append(col) + data=pd.get_dummies(data, columns=category_columns, prefix=category_columns) + + elif(encoding=='ordinal'): + encoder = OrdinalEncoder() + for col in data.columns: + if(data[col].dtype==np.object_): + data[col]=encoder.fit_transform(data[col]) + + elif(encoding=='hashing'): + category_columns=[] + for col in data.columns: + if(data[col].dtype==np.object_): + category_columns.append(col) + encoder=ce.HashingEncoder(cols=category_columns, n_components=len(category_columns)) + encoder.fit_transform(data) + elif(encoding=='binary'): + category_columns=[] + for col in data.columns: + if(data[col].dtype==np.object_): + category_columns.append(col) + encoder=ce.BinaryEncoder(cols=category_columns, return_df=True) + encoder.fit_transform(data) + + elif(encoding=='baseN'): + category_columns=[] + for col in data.columns: + if(data[col].dtype==np.object_): + category_columns.append(col) + encoder=ce.BaseNEncoder(cols=category_columns, return_df=True, base=5) + encoder.fit_transform(data) # # Input - output # -- cgit v1.2.3