Merge commit 'c05b047c2d9452151553961928c846c01d7395bc'

2022-06-25 20:06:24 +02:00 · 2022-06-25 20:06:24 +02:00 · ce04394679
parent ed5314aa98 c05b047c2d
commit ce04394679
2 changed files with 9 additions and 6 deletions
--- a/src/data/baseline_features.py
+++ b/src/data/baseline_features.py
@@ -95,7 +95,7 @@ if not participant_info.empty:
                - limesurvey_demand.loc[rows_demand_reverse, "score_original"]
            )
            baseline_interim = pd.concat([baseline_interim, limesurvey_demand], axis=0, ignore_index=True)
-            if "demand" in requested_features:
+            if "limesurvey_demand" in requested_features:
                baseline_features.loc[0, "limesurvey_demand"] = limesurvey_demand[
                    "score"
                ].sum()
@@ -136,9 +136,12 @@ if not participant_info.empty:
                ].sum()

        if "limesurvey_demand_control_ratio" in requested_features:
+            if limesurvey_control["score"].sum():
                limesurvey_demand_control_ratio = (
                        limesurvey_demand["score"].sum() / limesurvey_control["score"].sum()
                )
+            else:
+                limesurvey_demand_control_ratio = 0
            if (
                JCQ_NORMS[participant_info.loc[0, "gender"]][0]
                <= limesurvey_demand_control_ratio
--- a/src/features/all_cleaning_individual/rapids/main.R
+++ b/src/features/all_cleaning_individual/rapids/main.R
@@ -44,11 +44,11 @@ rapids_cleaning <- function(sensor_data_files, provider){

    # Drop columns with a percentage of NA values above cols_nan_threshold
    if(nrow(clean_features))
-        clean_features <- clean_features %>% select_if(~ sum(is.na(.)) / length(.) <= cols_nan_threshold )
+        clean_features <- clean_features %>% select(where(~ sum(is.na(.)) / length(.) <= cols_nan_threshold ), starts_with("phone_esm"))

    # Drop columns with zero variance
    if(drop_zero_variance_columns)
-    clean_features <- clean_features %>% select_if(grepl("pid|local_segment|local_segment_label|local_segment_start_datetime|local_segment_end_datetime",names(.)) | sapply(., n_distinct, na.rm = T) > 1)
+    clean_features <- clean_features %>% select_if(grepl("pid|local_segment|local_segment_label|local_segment_start_datetime|local_segment_end_datetime|phone_esm",names(.)) | sapply(., n_distinct, na.rm = T) > 1)

    # Drop highly correlated features
    if(as.logical(drop_highly_correlated_features$COMPUTE)){