diff --git a/machine_learning/helper.py b/machine_learning/helper.py index 01b7327..4742cca 100644 --- a/machine_learning/helper.py +++ b/machine_learning/helper.py @@ -69,8 +69,6 @@ def insert_row(df, row): def run_all_models(input_csv): # Prepare data model_input = pd.read_csv(input_csv) - model_input.dropna(axis=1, how="all", inplace=True) - model_input.dropna(axis=0, how="any", subset=["target"], inplace=True) index_columns = ["local_segment", "local_segment_label", "local_segment_start_datetime", "local_segment_end_datetime"] model_input.set_index(index_columns, inplace=True) @@ -78,6 +76,8 @@ def run_all_models(input_csv): data_x, data_y, data_groups = model_input.drop(["target", "pid"], axis=1), model_input["target"], model_input["pid"] categorical_feature_colnames = ["gender", "startlanguage"] + additional_categorical_features = [col for col in data_x.columns if "mostcommonactivity" in col or "homelabel" in col] + categorical_feature_colnames += additional_categorical_features categorical_features = data_x[categorical_feature_colnames].copy() mode_categorical_features = categorical_features.mode().iloc[0] # fillna with mode @@ -90,8 +90,6 @@ def run_all_models(input_csv): numerical_features = data_x.drop(categorical_feature_colnames, axis=1) train_x = pd.concat([numerical_features, categorical_features], axis=1) - imputer = SimpleImputer(missing_values=np.nan, strategy='mean') - train_x_imputed = imputer.fit_transform(train_x) # Prepare cross validation logo = LeaveOneGroupOut() @@ -106,7 +104,7 @@ def run_all_models(input_csv): lin_reg_rapids = linear_model.LinearRegression() lin_reg_scores = cross_val_score( lin_reg_rapids, - X=train_x_imputed, + X=train_x, y=data_y, groups=data_groups, cv=logo, @@ -120,7 +118,7 @@ def run_all_models(input_csv): ridge_reg = linear_model.Ridge(alpha=.5) ridge_reg_scores = cross_val_score( ridge_reg, - X=train_x_imputed, + X=train_x, y=data_y, groups=data_groups, cv=logo, @@ -134,7 +132,7 @@ def run_all_models(input_csv): lasso_reg = linear_model.Lasso(alpha=0.1) lasso_reg_score = cross_val_score( lasso_reg, - X=train_x_imputed, + X=train_x, y=data_y, groups=data_groups, cv=logo, @@ -148,7 +146,7 @@ def run_all_models(input_csv): bayesian_ridge_reg = linear_model.BayesianRidge() bayesian_ridge_reg_score = cross_val_score( bayesian_ridge_reg, - X=train_x_imputed, + X=train_x, y=data_y, groups=data_groups, cv=logo, @@ -162,7 +160,7 @@ def run_all_models(input_csv): ransac_reg = linear_model.RANSACRegressor() ransac_reg_score = cross_val_score( ransac_reg, - X=train_x_imputed, + X=train_x, y=data_y, groups=data_groups, cv=logo, @@ -176,7 +174,7 @@ def run_all_models(input_csv): svr = svm.SVR() svr_score = cross_val_score( svr, - X=train_x_imputed, + X=train_x, y=data_y, groups=data_groups, cv=logo, @@ -190,7 +188,7 @@ def run_all_models(input_csv): kridge = kernel_ridge.KernelRidge() kridge_score = cross_val_score( kridge, - X=train_x_imputed, + X=train_x, y=data_y, groups=data_groups, cv=logo, @@ -204,7 +202,7 @@ def run_all_models(input_csv): gpr = gaussian_process.GaussianProcessRegressor() gpr_score = cross_val_score( gpr, - X=train_x_imputed, + X=train_x, y=data_y, groups=data_groups, cv=logo, @@ -218,7 +216,7 @@ def run_all_models(input_csv): rfr = ensemble.RandomForestRegressor(max_features=0.3, n_jobs=-1) rfr_score = cross_val_score( rfr, - X=train_x_imputed, + X=train_x, y=data_y, groups=data_groups, cv=logo, @@ -232,7 +230,7 @@ def run_all_models(input_csv): xgb = XGBRegressor() xgb_score = cross_val_score( xgb, - X=train_x_imputed, + X=train_x, y=data_y, groups=data_groups, cv=logo, @@ -246,7 +244,7 @@ def run_all_models(input_csv): ada = ensemble.AdaBoostRegressor() ada_score = cross_val_score( ada, - X=train_x_imputed, + X=train_x, y=data_y, groups=data_groups, cv=logo,