rapids/src/models/helper.py

19 lines
1.1 KiB
Python

import pandas as pd
def retain_target_column(df_input: pd.DataFrame, target_variable_name: str):
column_names = df_input.columns
esm_names_index = column_names.str.startswith("phone_esm_straw")
# Find all columns coming from phone_esm, since these are not features for our purposes and we will drop them.
esm_names = column_names[esm_names_index]
target_variable_index = esm_names.str.contains(target_variable_name)
if all(~target_variable_index):
raise ValueError("The requested target (", target_variable_name,
")cannot be found in the dataset.",
"Please check the names of phone_esm_ columns in z_all_sensor_features_cleaned_straw_py.csv")
sensor_features_plus_target = df_input.drop(esm_names, axis=1)
sensor_features_plus_target["target"] = df_input[esm_names[target_variable_index]]
# We will only keep one column related to phone_esm and that will be our target variable.
# Add it back to the very and of the data frame and rename it to target.
return sensor_features_plus_target