Fix some bugs and convert categorical columns to the category dtype.
parent
99c2fab8f9
commit
be0324fd01
|
@ -108,7 +108,7 @@ def straw_cleaning(sensor_data_files, provider, target):
|
||||||
features[impute_w_sn2] = features[impute_w_sn2].fillna(1) # Special case of imputation - nominal/ordinal value
|
features[impute_w_sn2] = features[impute_w_sn2].fillna(1) # Special case of imputation - nominal/ordinal value
|
||||||
|
|
||||||
impute_w_sn3 = [col for col in features.columns if "loglocationvariance" in col]
|
impute_w_sn3 = [col for col in features.columns if "loglocationvariance" in col]
|
||||||
features[impute_w_sn2] = features[impute_w_sn2].fillna(-1000000) # Special case of imputation - loglocation
|
features[impute_w_sn3] = features[impute_w_sn3].fillna(-1000000) # Special case of imputation - loglocation
|
||||||
|
|
||||||
# Impute location features
|
# Impute location features
|
||||||
impute_locations = [col for col in features \
|
impute_locations = [col for col in features \
|
||||||
|
@ -218,6 +218,16 @@ def straw_cleaning(sensor_data_files, provider, target):
|
||||||
|
|
||||||
graph_bf_af(features, "10correlation_drop")
|
graph_bf_af(features, "10correlation_drop")
|
||||||
|
|
||||||
|
# Transform categorical columns to category dtype
|
||||||
|
|
||||||
|
cat1 = [col for col in features.columns if "mostcommonactivity" in col]
|
||||||
|
if cat1: # Transform columns to category dtype (mostcommonactivity)
|
||||||
|
features[cat1] = features[cat1].astype(int).astype('category')
|
||||||
|
|
||||||
|
cat2 = [col for col in features.columns if "homelabel" in col]
|
||||||
|
if cat2: # Transform columns to category dtype (homelabel)
|
||||||
|
features[cat2] = features[cat2].astype(int).astype('category')
|
||||||
|
|
||||||
# (10) VERIFY IF THERE ARE ANY NANS LEFT IN THE DATAFRAME
|
# (10) VERIFY IF THERE ARE ANY NANS LEFT IN THE DATAFRAME
|
||||||
if features.isna().any().any():
|
if features.isna().any().any():
|
||||||
raise ValueError("There are still some NaNs present in the dataframe. Please check for implementation errors.")
|
raise ValueError("There are still some NaNs present in the dataframe. Please check for implementation errors.")
|
||||||
|
|
|
@ -9,8 +9,8 @@ def retain_target_column(df_input: pd.DataFrame, target_variable_name: str):
|
||||||
esm_names = column_names[esm_names_index]
|
esm_names = column_names[esm_names_index]
|
||||||
target_variable_index = esm_names.str.contains(target_variable_name)
|
target_variable_index = esm_names.str.contains(target_variable_name)
|
||||||
if all(~target_variable_index):
|
if all(~target_variable_index):
|
||||||
warnings.warn(f"The requested target (, {target_variable_name} ,)cannot be found in the dataset. Please check the names of phone_esm_ columns in z_all_sensor_features_cleaned_straw_py.csv")
|
warnings.warn(f"The requested target (, {target_variable_name} ,)cannot be found in the dataset. Please check the names of phone_esm_ columns in cleaned python file")
|
||||||
return False
|
return None
|
||||||
|
|
||||||
sensor_features_plus_target = df_input.drop(esm_names, axis=1)
|
sensor_features_plus_target = df_input.drop(esm_names, axis=1)
|
||||||
sensor_features_plus_target["target"] = df_input[esm_names[target_variable_index]]
|
sensor_features_plus_target["target"] = df_input[esm_names[target_variable_index]]
|
||||||
|
|
|
@ -7,7 +7,7 @@ target_variable_name = snakemake.params["target_variable"]
|
||||||
|
|
||||||
model_input = retain_target_column(cleaned_sensor_features, target_variable_name)
|
model_input = retain_target_column(cleaned_sensor_features, target_variable_name)
|
||||||
|
|
||||||
if not model_input:
|
if model_input is None:
|
||||||
pd.DataFrame().to_csv(snakemake.output[0])
|
pd.DataFrame().to_csv(snakemake.output[0])
|
||||||
else:
|
else:
|
||||||
model_input.to_csv(snakemake.output[0], index=False)
|
model_input.to_csv(snakemake.output[0], index=False)
|
||||||
|
|
Loading…
Reference in New Issue