Fix some bugs and set categorical columns to the category dtype.

imputation_and_cleaning
Primoz 2022-11-28 12:44:25 +00:00
parent 99c2fab8f9
commit be0324fd01
3 changed files with 14 additions and 4 deletions


@@ -108,7 +108,7 @@ def straw_cleaning(sensor_data_files, provider, target):
     features[impute_w_sn2] = features[impute_w_sn2].fillna(1) # Special case of imputation - nominal/ordinal value

     impute_w_sn3 = [col for col in features.columns if "loglocationvariance" in col]
-    features[impute_w_sn2] = features[impute_w_sn2].fillna(-1000000) # Special case of imputation - loglocation
+    features[impute_w_sn3] = features[impute_w_sn3].fillna(-1000000) # Special case of imputation - loglocation

     # Impute location features
     impute_locations = [col for col in features \
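
Context for the fix above: the old line overwrote the impute_w_sn2 columns with -1000000 and left the loglocationvariance NaNs untouched; the new line writes the sentinel into the freshly selected impute_w_sn3 columns. A minimal standalone sketch of the intended imputation, with a made-up feature frame (the column names are only illustrative, and -1000000 presumably stands in for "log of zero variance"):

import numpy as np
import pandas as pd

# Hypothetical frame; the selection only requires "loglocationvariance" in the column name.
features = pd.DataFrame({
    "phone_locations_doryab_loglocationvariance": [np.nan, -2.3, np.nan],
    "phone_locations_doryab_locationvariance": [np.nan, 0.1, 0.4],
})

impute_w_sn3 = [col for col in features.columns if "loglocationvariance" in col]
features[impute_w_sn3] = features[impute_w_sn3].fillna(-1000000)  # sentinel for missing log-variance
print(features)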
@@ -218,6 +218,16 @@ def straw_cleaning(sensor_data_files, provider, target):
     graph_bf_af(features, "10correlation_drop")

+    # Transform categorical columns to category dtype
+    cat1 = [col for col in features.columns if "mostcommonactivity" in col]
+    if cat1: # Transform columns to category dtype (mostcommonactivity)
+        features[cat1] = features[cat1].astype(int).astype('category')
+
+    cat2 = [col for col in features.columns if "homelabel" in col]
+    if cat2: # Transform columns to category dtype (homelabel)
+        features[cat2] = features[cat2].astype(int).astype('category')
+
     # (10) VERIFY IF THERE ARE ANY NANS LEFT IN THE DATAFRAME
     if features.isna().any().any():
         raise ValueError("There are still some NaNs present in the dataframe. Please check for implementation errors.")
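
The added block casts the matched columns to int before marking them categorical, so values that were imputed as floats (e.g. 1.0) become clean integer category levels. A minimal sketch of the same idea on a made-up frame (column names are only illustrative):

import pandas as pd

features = pd.DataFrame({
    "phone_activity_recognition_mostcommonactivity": [1.0, 3.0, 1.0],
    "phone_locations_doryab_homelabel": [0.0, 1.0, 1.0],
    "acc_magnitude_mean": [0.2, 0.5, 0.1],
})

for pattern in ("mostcommonactivity", "homelabel"):
    cols = [col for col in features.columns if pattern in col]
    if cols:
        # int first, then category, so the levels are 1, 3, ... rather than 1.0, 3.0, ...
        features[cols] = features[cols].astype(int).astype("category")

print(features.dtypes)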


@@ -9,8 +9,8 @@ def retain_target_column(df_input: pd.DataFrame, target_variable_name: str):
     esm_names = column_names[esm_names_index]
     target_variable_index = esm_names.str.contains(target_variable_name)
     if all(~target_variable_index):
-        warnings.warn(f"The requested target (, {target_variable_name} ,)cannot be found in the dataset. Please check the names of phone_esm_ columns in z_all_sensor_features_cleaned_straw_py.csv")
-        return False
+        warnings.warn(f"The requested target (, {target_variable_name} ,)cannot be found in the dataset. Please check the names of phone_esm_ columns in cleaned python file")
+        return None

     sensor_features_plus_target = df_input.drop(esm_names, axis=1)
     sensor_features_plus_target["target"] = df_input[esm_names[target_variable_index]]
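
Returning None instead of False lets the caller distinguish "no matching target column" from a real result without ever asking a DataFrame for its truth value. A simplified sketch of the function's contract, assuming for illustration that the ESM columns are simply those prefixed with phone_esm_ (the real selection logic sits above this hunk) and taking the first match for brevity:

import warnings
import pandas as pd

def retain_target_column(df_input: pd.DataFrame, target_variable_name: str):
    # Assumption for this sketch: ESM columns share a phone_esm_ prefix; the real code builds esm_names differently.
    esm_names = df_input.columns[df_input.columns.str.startswith("phone_esm_")]
    target_variable_index = esm_names.str.contains(target_variable_name)
    if all(~target_variable_index):
        warnings.warn(f"The requested target ({target_variable_name}) cannot be found in the dataset.")
        return None  # was: return False
    # Drop every ESM column, then add the requested one back as a unified "target" column.
    sensor_features_plus_target = df_input.drop(columns=esm_names)
    sensor_features_plus_target["target"] = df_input[esm_names[target_variable_index][0]]
    return sensor_features_plus_target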


@@ -7,7 +7,7 @@ target_variable_name = snakemake.params["target_variable"]
 model_input = retain_target_column(cleaned_sensor_features, target_variable_name)

-if not model_input:
+if model_input is None:
     pd.DataFrame().to_csv(snakemake.output[0])
 else:
     model_input.to_csv(snakemake.output[0], index=False)
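
The caller-side change matters because a successful call now returns a DataFrame, and pandas refuses to coerce a DataFrame to a boolean, so the old check if not model_input: raised ValueError on the success path (and the failure path now returns None rather than False). A tiny sketch of the failure and the fix, using a made-up frame:

import pandas as pd

model_input = pd.DataFrame({"acc_magnitude_mean": [0.2, 0.5], "target": [1, 0]})

try:
    if not model_input:  # old check: blows up whenever a DataFrame comes back
        print("no target column found")
except ValueError as err:
    print(f"old check fails: {err}")

if model_input is None:  # new check: true only when retain_target_column gave up
    print("no target column found")
else:
    print(f"writing {len(model_input)} rows")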