diff --git a/src/features/cr_features_helper_methods.py b/src/features/cr_features_helper_methods.py index 7bf02254..40cfec9f 100644 --- a/src/features/cr_features_helper_methods.py +++ b/src/features/cr_features_helper_methods.py @@ -15,13 +15,13 @@ def extract_second_order_features(intraday_features, so_features_names, prefix=" so_features = pd.DataFrame() #print(intraday_features.drop("level_1", axis=1).groupby(["local_segment"]).nsmallest()) if "mean" in so_features_names: - so_features = pd.concat([so_features, intraday_features.drop(prefix+"level_1", axis=1).groupby(groupby_cols).mean().add_suffix("_SO_mean")], axis=1) + so_features = pd.concat([so_features, intraday_features.drop(prefix+"level_1", axis=1).groupby(groupby_cols).mean(numeric_only=True).add_suffix("_SO_mean")], axis=1) if "median" in so_features_names: - so_features = pd.concat([so_features, intraday_features.drop(prefix+"level_1", axis=1).groupby(groupby_cols).median().add_suffix("_SO_median")], axis=1) + so_features = pd.concat([so_features, intraday_features.drop(prefix+"level_1", axis=1).groupby(groupby_cols).median(numeric_only=True).add_suffix("_SO_median")], axis=1) if "sd" in so_features_names: - so_features = pd.concat([so_features, intraday_features.drop(prefix+"level_1", axis=1).groupby(groupby_cols).std().fillna(0).add_suffix("_SO_sd")], axis=1) + so_features = pd.concat([so_features, intraday_features.drop(prefix+"level_1", axis=1).groupby(groupby_cols).std(numeric_only=True).fillna(0).add_suffix("_SO_sd")], axis=1) if "nlargest" in so_features_names: # largest 5 -- maybe there is a faster groupby solution? for column in intraday_features.loc[:, ~intraday_features.columns.isin(groupby_cols+[prefix+"level_1"])]: diff --git a/src/features/empatica_data_yield.py b/src/features/empatica_data_yield.py index 2df2bcd2..97d0255b 100644 --- a/src/features/empatica_data_yield.py +++ b/src/features/empatica_data_yield.py @@ -26,7 +26,7 @@ def calculate_empatica_data_yield(features): # TODO # Assigns 1 to values that are over 1 (in case of windows not being filled fully) features[empatica_data_yield_cols] = features[empatica_data_yield_cols].apply(lambda x: [y if y <= 1 or np.isnan(y) else 1 for y in x]) - features["empatica_data_yield"] = features[empatica_data_yield_cols].mean(axis=1).fillna(0) + features["empatica_data_yield"] = features[empatica_data_yield_cols].mean(axis=1, numeric_only=True).fillna(0) features.drop(empatica_data_yield_cols, axis=1, inplace=True) # In case of if the advanced operations will later not be needed (e.g., weighted average) return features diff --git a/src/features/phone_esm/straw/process_user_event_related_segments.py b/src/features/phone_esm/straw/process_user_event_related_segments.py index 03eeb052..9d845ac1 100644 --- a/src/features/phone_esm/straw/process_user_event_related_segments.py +++ b/src/features/phone_esm/straw/process_user_event_related_segments.py @@ -140,8 +140,8 @@ def extract_ers(esm_df): # Extracted 3 targets that will be transfered in the csv file to the cleaning script. se_stressfulness_event_tg = esm_df[esm_df.questionnaire_id == 87.].set_index(['device_id', 'esm_session'])['esm_user_answer_numeric'].to_frame().rename(columns={'esm_user_answer_numeric': 'appraisal_stressfulness_event'}) - se_threat_tg = esm_df[esm_df.questionnaire_id == 88.].groupby(["device_id", "esm_session"]).mean()['esm_user_answer_numeric'].to_frame().rename(columns={'esm_user_answer_numeric': 'appraisal_threat'}) - se_challenge_tg = esm_df[esm_df.questionnaire_id == 89.].groupby(["device_id", "esm_session"]).mean()['esm_user_answer_numeric'].to_frame().rename(columns={'esm_user_answer_numeric': 'appraisal_challenge'}) + se_threat_tg = esm_df[esm_df.questionnaire_id == 88.].groupby(["device_id", "esm_session"]).mean(numeric_only=True)['esm_user_answer_numeric'].to_frame().rename(columns={'esm_user_answer_numeric': 'appraisal_threat'}) + se_challenge_tg = esm_df[esm_df.questionnaire_id == 89.].groupby(["device_id", "esm_session"]).mean(numeric_only=True)['esm_user_answer_numeric'].to_frame().rename(columns={'esm_user_answer_numeric': 'appraisal_challenge'}) # All relevant features are joined by inner join to remove standalone columns (e.g., stressfulness event target has larger count) extracted_ers = extracted_ers.join(session_start_timestamp, on=['device_id', 'esm_session'], how='inner') \