Update data cleaning code

2021-10-22 13:44:27 -04:00 · 2021-10-22 13:44:27 -04:00 · 512355ca01
parent 3e7b9260d2
commit 512355ca01
11 changed files with 362 additions and 121 deletions
--- a/9
+++ b/9
@ -395,9 +395,12 @@ if config["HEATMAP_FEATURE_CORRELATION_MATRIX"]["PLOT"]:
    files_to_compute.append("reports/data_exploration/heatmap_feature_correlation_matrix.html")

 # Data Cleaning
-if config["DATA_CLEANING"]["COMPUTE"]:
-    files_to_compute.extend(expand("data/processed/features/{pid}/all_sensor_features_cleaned.csv", pid=config["PIDS"]))
-    files_to_compute.append("data/processed/features/all_participants/all_sensor_features_cleaned.csv")
+for provider in config["ALL_CLEANING_INDIVIDUAL"]["PROVIDERS"].keys():
+    if config["ALL_CLEANING_INDIVIDUAL"]["PROVIDERS"][provider]["COMPUTE"]:
+        files_to_compute.extend(expand("data/processed/features/{pid}/all_sensor_features_cleaned_" + provider.lower() +".csv", pid=config["PIDS"]))
+for provider in config["ALL_CLEANING_OVERALL"]["PROVIDERS"].keys():
+    if config["ALL_CLEANING_OVERALL"]["PROVIDERS"][provider]["COMPUTE"]:
+        files_to_compute.extend(expand("data/processed/features/all_participants/all_sensor_features_cleaned_" + provider.lower() +".csv"))

 rule all:
    input:
--- a/config.yaml
+++ b/config.yaml
@ -569,12 +569,36 @@ HEATMAP_FEATURE_CORRELATION_MATRIX:
 #                                                    Data Cleaning                                                     #
 ########################################################################################################################

-DATA_CLEANING:
-  COMPUTE: False
-  COLS_NAN_THRESHOLD: 0.3
-  COLS_VAR_THRESHOLD: True
-  ROWS_NAN_THRESHOLD: 0.3
-  DATA_YIELDED_HOURS_RATIO_THRESHOLD: 0.75
-  CORR_VALID_PAIRS_THRESHOLD: 0.5
-  CORR_THRESHOLD: 0.95
+ALL_CLEANING_INDIVIDUAL:
+  PROVIDERS:
+    RAPIDS:
+      COMPUTE: False
+      IMPUTE_SELECTED_EVENT_FEATURES:
+        COMPUTE: True
+        MIN_DATA_YIELDED_MINUTES_TO_IMPUTE: 0.33
+      COLS_NAN_THRESHOLD: 0.3 # set to 1 to disable
+      COLS_VAR_THRESHOLD: True
+      ROWS_NAN_THRESHOLD: 0.3 # set to 1 to disable
+      DATA_YIELDED_HOURS_RATIO_THRESHOLD: 0.5 # set to 0 to disable
+      DROP_HIGHLY_CORRELATED_FEATURES:
+        COMPUTE: True
+        MIN_OVERLAP_FOR_CORR_THRESHOLD: 0.5
+        CORR_THRESHOLD: 0.95
+      SRC_SCRIPT: src/features/all_cleaning_individual/rapids/main.R

+ALL_CLEANING_OVERALL:
+  PROVIDERS:
+    RAPIDS:
+      COMPUTE: False
+      IMPUTE_SELECTED_EVENT_FEATURES:
+        COMPUTE: True
+        MIN_DATA_YIELDED_MINUTES_TO_IMPUTE: 0.33
+      COLS_NAN_THRESHOLD: 0.3 # set to 1 to disable
+      COLS_VAR_THRESHOLD: True
+      ROWS_NAN_THRESHOLD: 0.3 # set to 1 to disable
+      DATA_YIELDED_HOURS_RATIO_THRESHOLD: 0.5 # set to 0 to disable
+      DROP_HIGHLY_CORRELATED_FEATURES:
+        COMPUTE: True
+        MIN_OVERLAP_FOR_CORR_THRESHOLD: 0.5
+        CORR_THRESHOLD: 0.95
+      SRC_SCRIPT: src/features/all_cleaning_overall/rapids/main.R
--- a/rules/features.smk
+++ b/rules/features.smk
@ -961,30 +961,27 @@ rule merge_sensor_features_for_all_participants:

 rule clean_sensor_features_for_individual_participants:
    input:
-        rules.merge_sensor_features_for_individual_participants.output
+        sensor_data = rules.merge_sensor_features_for_individual_participants.output
+    wildcard_constraints:
+        pid = "(" + "|".join(config["PIDS"]) + ")" # constraints are regex strings: restrict {pid} to the configured pids (a raw list would be formatted into the pattern as a character class)
    params:
-        cols_nan_threshold = config["DATA_CLEANING"]["COLS_NAN_THRESHOLD"],
-        cols_var_threshold = config["DATA_CLEANING"]["COLS_VAR_THRESHOLD"],
-        rows_nan_threshold = config["DATA_CLEANING"]["ROWS_NAN_THRESHOLD"],
-        data_yielded_hours_ratio_threshold = config["DATA_CLEANING"]["DATA_YIELDED_HOURS_RATIO_THRESHOLD"],
-        corr_valid_pairs_threshold = config["DATA_CLEANING"]["CORR_VALID_PAIRS_THRESHOLD"],
-        corr_threshold = config["DATA_CLEANING"]["CORR_THRESHOLD"]
+        provider = lambda wildcards: config["ALL_CLEANING_INDIVIDUAL"]["PROVIDERS"][wildcards.provider_key.upper()],
+        provider_key = "{provider_key}",
+        sensor_key = "all_cleaning_individual"
    output:
-        "data/processed/features/{pid}/all_sensor_features_cleaned.csv"
+        "data/processed/features/{pid}/all_sensor_features_cleaned_{provider_key}.csv"
    script:
-        "../src/features/utils/clean_sensor_features.R"
+        "../src/features/entry.R"
 rule clean_sensor_features_for_all_participants:
    input:
-        rules.merge_sensor_features_for_all_participants.output
+        sensor_data = rules.merge_sensor_features_for_all_participants.output
    params:
-        cols_nan_threshold = config["DATA_CLEANING"]["COLS_NAN_THRESHOLD"],
-        cols_var_threshold = config["DATA_CLEANING"]["COLS_VAR_THRESHOLD"],
-        rows_nan_threshold = config["DATA_CLEANING"]["ROWS_NAN_THRESHOLD"],
-        data_yielded_hours_ratio_threshold = config["DATA_CLEANING"]["DATA_YIELDED_HOURS_RATIO_THRESHOLD"],
-        corr_valid_pairs_threshold = config["DATA_CLEANING"]["CORR_VALID_PAIRS_THRESHOLD"],
-        corr_threshold = config["DATA_CLEANING"]["CORR_THRESHOLD"]
+        provider = lambda wildcards: config["ALL_CLEANING_OVERALL"]["PROVIDERS"][wildcards.provider_key.upper()],
+        provider_key = "{provider_key}",
+        sensor_key = "all_cleaning_overall"
    output:
-        "data/processed/features/all_participants/all_sensor_features_cleaned.csv"
+        "data/processed/features/all_participants/all_sensor_features_cleaned_{provider_key}.csv"
    script:
-        "../src/features/utils/clean_sensor_features.R"
+        "../src/features/entry.R"
+
--- a/src/features/all_cleaning_individual/rapids/main.R
+++ b/src/features/all_cleaning_individual/rapids/main.R
@ -0,0 +1,85 @@
+source("renv/activate.R")
+library(tidyr)
+library("dplyr", warn.conflicts = F)
+library(tidyverse)
+library(caret)
+library(corrr)
+
+# Clean the merged sensor features CSV named by sensor_data_files[["sensor_data"]] using the thresholds in `provider`; returns the cleaned data frame.
+rapids_cleaning <- function(sensor_data_files, provider){
+    clean_features <- read.csv(sensor_data_files[["sensor_data"]], stringsAsFactors = FALSE)
+    impute_selected_event_features <- provider[["IMPUTE_SELECTED_EVENT_FEATURES"]]
+    cols_nan_threshold <- as.numeric(provider[["COLS_NAN_THRESHOLD"]])
+    drop_zero_variance_columns <- as.logical(provider[["COLS_VAR_THRESHOLD"]])
+    rows_nan_threshold <- as.numeric(provider[["ROWS_NAN_THRESHOLD"]])
+    data_yielded_hours_ratio_threshold <- as.numeric(provider[["DATA_YIELDED_HOURS_RATIO_THRESHOLD"]])
+    drop_highly_correlated_features <- provider[["DROP_HIGHLY_CORRELATED_FEATURES"]]
+
+    # Impute selected event features: NAs become 0 on rows whose ratiovalidyieldedminutes exceeds the configured minimum
+    if(as.logical(impute_selected_event_features[["COMPUTE"]])){
+        if(!"phone_data_yield_rapids_ratiovalidyieldedminutes" %in% colnames(clean_features)){
+            stop("Error: RAPIDS provider needs to impute the selected event features based on phone_data_yield_rapids_ratiovalidyieldedminutes column, please set config[PHONE_DATA_YIELD][PROVIDERS][RAPIDS][COMPUTE] to True and include 'ratiovalidyieldedminutes' in [FEATURES].")
+        }
+        min_data_yielded_minutes_to_impute <- as.numeric(impute_selected_event_features[["MIN_DATA_YIELDED_MINUTES_TO_IMPUTE"]])
+        column_names <- colnames(clean_features)
+        selected_apps_features <- column_names[grepl("^phone_applications_foreground_rapids_(countevent|countepisode|minduration|maxduration|meanduration|sumduration)", column_names)]
+        selected_battery_features <- column_names[grepl("^phone_battery_rapids_", column_names)]
+        selected_calls_features <- column_names[grepl("^phone_calls_rapids_.*_(count|distinctcontacts|sumduration|minduration|maxduration|meanduration|modeduration)", column_names)]
+        selected_keyboard_features <- column_names[grepl("^phone_keyboard_rapids_(sessioncount|averagesessionlength|changeintextlengthlessthanminusone|changeintextlengthequaltominusone|changeintextlengthequaltoone|changeintextlengthmorethanone|maxtextlength|totalkeyboardtouches)", column_names)]
+        selected_messages_features <- column_names[grepl("^phone_messages_rapids_.*_(count|distinctcontacts)", column_names)]
+        selected_screen_features <- column_names[grepl("^phone_screen_rapids_(sumduration|maxduration|minduration|avgduration|countepisode)", column_names)]
+        selected_wifi_features <- column_names[grepl("^phone_wifi_(connected|visible)_rapids_", column_names)]
+        selected_columns <- c(selected_apps_features, selected_battery_features, selected_calls_features, selected_keyboard_features, selected_messages_features, selected_screen_features, selected_wifi_features)
+        clean_features[selected_columns][is.na(clean_features[selected_columns]) & (clean_features$phone_data_yield_rapids_ratiovalidyieldedminutes > min_data_yielded_minutes_to_impute)] <- 0
+    }
+
+    # Drop rows with the value of "phone_data_yield_rapids_ratiovalidyieldedhours" column less than data_yielded_hours_ratio_threshold
+    if(!"phone_data_yield_rapids_ratiovalidyieldedhours" %in% colnames(clean_features)){
+        stop("Error: RAPIDS provider needs to clean data based on phone_data_yield_rapids_ratiovalidyieldedhours column, please set config[PHONE_DATA_YIELD][PROVIDERS][RAPIDS][COMPUTE] to True and include 'ratiovalidyieldedhours' in [FEATURES].")
+    }
+    clean_features <- clean_features %>%
+        filter(phone_data_yield_rapids_ratiovalidyieldedhours >= data_yielded_hours_ratio_threshold)
+
+    # Drop columns with a percentage of NA values above cols_nan_threshold
+    if(nrow(clean_features) > 0){
+        clean_features <- clean_features %>% select_if(~ sum(is.na(.)) / length(.) <= cols_nan_threshold )
+    }
+
+    # Drop columns with zero variance. The pattern is anchored so only the identifier columns are exempt; unanchored, "pid" also matches every "*_rapids_*" name
+    if(drop_zero_variance_columns){
+        clean_features <- clean_features %>% select_if(grepl("^(pid|local_segment|local_segment_label|local_segment_start_datetime|local_segment_end_datetime)$", names(.)) | vapply(., n_distinct, numeric(1), na.rm = TRUE) > 1)
+    }
+
+    # Drop highly correlated features
+    if(as.logical(drop_highly_correlated_features[["COMPUTE"]])){
+        min_overlap_for_corr_threshold <- as.numeric(drop_highly_correlated_features[["MIN_OVERLAP_FOR_CORR_THRESHOLD"]])
+        corr_threshold <- as.numeric(drop_highly_correlated_features[["CORR_THRESHOLD"]])
+        features_for_corr <- clean_features %>%
+            select_if(is.numeric) %>%
+            select_if(vapply(., n_distinct, numeric(1), na.rm = TRUE) > 1)
+
+        # caret::findCorrelation() stops on a single variable, so at least two features are required
+        if(nrow(features_for_corr) > 0 && ncol(features_for_corr) > 1){
+            # Pairs observed together on too few rows are treated as uncorrelated (set to 0 below)
+            valid_pairs <- crossprod(!is.na(features_for_corr)) >= min_overlap_for_corr_threshold * nrow(features_for_corr)
+            highly_correlated_features <- features_for_corr %>%
+                correlate(use = "pairwise.complete.obs", method = "spearman") %>%
+                column_to_rownames(., var = "term") %>%
+                as.matrix() %>%
+                replace(!valid_pairs | is.na(.), 0) %>%
+                findCorrelation(., cutoff = corr_threshold, verbose = FALSE, names = TRUE)
+
+            # drop = FALSE keeps a data frame even if a single column remains
+            clean_features <- clean_features[, !names(clean_features) %in% highly_correlated_features, drop = FALSE]
+        }
+    }
+
+    # Drop rows with a percentage of NA values above rows_nan_threshold
+    clean_features <- clean_features %>%
+        mutate(percentage_na =  rowSums(is.na(.)) / ncol(.)) %>%
+        filter(percentage_na <= rows_nan_threshold) %>%
+        select(-percentage_na)
+
+    return(clean_features)
+}
+
--- a/src/features/all_cleaning_overall/rapids/main.R
+++ b/src/features/all_cleaning_overall/rapids/main.R
@ -0,0 +1,85 @@
+source("renv/activate.R")
+library(tidyr)
+library("dplyr", warn.conflicts = F)
+library(tidyverse)
+library(caret)
+library(corrr)
+
+# Clean the merged sensor features CSV named by sensor_data_files[["sensor_data"]] using the thresholds in `provider`; returns the cleaned data frame.
+rapids_cleaning <- function(sensor_data_files, provider){
+    clean_features <- read.csv(sensor_data_files[["sensor_data"]], stringsAsFactors = FALSE)
+    impute_selected_event_features <- provider[["IMPUTE_SELECTED_EVENT_FEATURES"]]
+    cols_nan_threshold <- as.numeric(provider[["COLS_NAN_THRESHOLD"]])
+    drop_zero_variance_columns <- as.logical(provider[["COLS_VAR_THRESHOLD"]])
+    rows_nan_threshold <- as.numeric(provider[["ROWS_NAN_THRESHOLD"]])
+    data_yielded_hours_ratio_threshold <- as.numeric(provider[["DATA_YIELDED_HOURS_RATIO_THRESHOLD"]])
+    drop_highly_correlated_features <- provider[["DROP_HIGHLY_CORRELATED_FEATURES"]]
+
+    # Impute selected event features: NAs become 0 on rows whose ratiovalidyieldedminutes exceeds the configured minimum
+    if(as.logical(impute_selected_event_features[["COMPUTE"]])){
+        if(!"phone_data_yield_rapids_ratiovalidyieldedminutes" %in% colnames(clean_features)){
+            stop("Error: RAPIDS provider needs to impute the selected event features based on phone_data_yield_rapids_ratiovalidyieldedminutes column, please set config[PHONE_DATA_YIELD][PROVIDERS][RAPIDS][COMPUTE] to True and include 'ratiovalidyieldedminutes' in [FEATURES].")
+        }
+        min_data_yielded_minutes_to_impute <- as.numeric(impute_selected_event_features[["MIN_DATA_YIELDED_MINUTES_TO_IMPUTE"]])
+        column_names <- colnames(clean_features)
+        selected_apps_features <- column_names[grepl("^phone_applications_foreground_rapids_(countevent|countepisode|minduration|maxduration|meanduration|sumduration)", column_names)]
+        selected_battery_features <- column_names[grepl("^phone_battery_rapids_", column_names)]
+        selected_calls_features <- column_names[grepl("^phone_calls_rapids_.*_(count|distinctcontacts|sumduration|minduration|maxduration|meanduration|modeduration)", column_names)]
+        selected_keyboard_features <- column_names[grepl("^phone_keyboard_rapids_(sessioncount|averagesessionlength|changeintextlengthlessthanminusone|changeintextlengthequaltominusone|changeintextlengthequaltoone|changeintextlengthmorethanone|maxtextlength|totalkeyboardtouches)", column_names)]
+        selected_messages_features <- column_names[grepl("^phone_messages_rapids_.*_(count|distinctcontacts)", column_names)]
+        selected_screen_features <- column_names[grepl("^phone_screen_rapids_(sumduration|maxduration|minduration|avgduration|countepisode)", column_names)]
+        selected_wifi_features <- column_names[grepl("^phone_wifi_(connected|visible)_rapids_", column_names)]
+        selected_columns <- c(selected_apps_features, selected_battery_features, selected_calls_features, selected_keyboard_features, selected_messages_features, selected_screen_features, selected_wifi_features)
+        clean_features[selected_columns][is.na(clean_features[selected_columns]) & (clean_features$phone_data_yield_rapids_ratiovalidyieldedminutes > min_data_yielded_minutes_to_impute)] <- 0
+    }
+
+    # Drop rows with the value of "phone_data_yield_rapids_ratiovalidyieldedhours" column less than data_yielded_hours_ratio_threshold
+    if(!"phone_data_yield_rapids_ratiovalidyieldedhours" %in% colnames(clean_features)){
+        stop("Error: RAPIDS provider needs to clean data based on phone_data_yield_rapids_ratiovalidyieldedhours column, please set config[PHONE_DATA_YIELD][PROVIDERS][RAPIDS][COMPUTE] to True and include 'ratiovalidyieldedhours' in [FEATURES].")
+    }
+    clean_features <- clean_features %>%
+        filter(phone_data_yield_rapids_ratiovalidyieldedhours >= data_yielded_hours_ratio_threshold)
+
+    # Drop columns with a percentage of NA values above cols_nan_threshold
+    if(nrow(clean_features) > 0){
+        clean_features <- clean_features %>% select_if(~ sum(is.na(.)) / length(.) <= cols_nan_threshold )
+    }
+
+    # Drop columns with zero variance. The pattern is anchored so only the identifier columns are exempt; unanchored, "pid" also matches every "*_rapids_*" name
+    if(drop_zero_variance_columns){
+        clean_features <- clean_features %>% select_if(grepl("^(pid|local_segment|local_segment_label|local_segment_start_datetime|local_segment_end_datetime)$", names(.)) | vapply(., n_distinct, numeric(1), na.rm = TRUE) > 1)
+    }
+
+    # Drop highly correlated features
+    if(as.logical(drop_highly_correlated_features[["COMPUTE"]])){
+        min_overlap_for_corr_threshold <- as.numeric(drop_highly_correlated_features[["MIN_OVERLAP_FOR_CORR_THRESHOLD"]])
+        corr_threshold <- as.numeric(drop_highly_correlated_features[["CORR_THRESHOLD"]])
+        features_for_corr <- clean_features %>%
+            select_if(is.numeric) %>%
+            select_if(vapply(., n_distinct, numeric(1), na.rm = TRUE) > 1)
+
+        # caret::findCorrelation() stops on a single variable, so at least two features are required
+        if(nrow(features_for_corr) > 0 && ncol(features_for_corr) > 1){
+            # Pairs observed together on too few rows are treated as uncorrelated (set to 0 below)
+            valid_pairs <- crossprod(!is.na(features_for_corr)) >= min_overlap_for_corr_threshold * nrow(features_for_corr)
+            highly_correlated_features <- features_for_corr %>%
+                correlate(use = "pairwise.complete.obs", method = "spearman") %>%
+                column_to_rownames(., var = "term") %>%
+                as.matrix() %>%
+                replace(!valid_pairs | is.na(.), 0) %>%
+                findCorrelation(., cutoff = corr_threshold, verbose = FALSE, names = TRUE)
+
+            # drop = FALSE keeps a data frame even if a single column remains
+            clean_features <- clean_features[, !names(clean_features) %in% highly_correlated_features, drop = FALSE]
+        }
+    }
+
+    # Drop rows with a percentage of NA values above rows_nan_threshold
+    clean_features <- clean_features %>%
+        mutate(percentage_na =  rowSums(is.na(.)) / ncol(.)) %>%
+        filter(percentage_na <= rows_nan_threshold) %>%
+        select(-percentage_na)
+
+    return(clean_features)
+}
+
--- a/src/features/entry.R
+++ b/src/features/entry.R
@ -4,13 +4,19 @@ library("dplyr",warn.conflicts = F)
 library("tidyr")

 sensor_data_files <- snakemake@input
-sensor_data_files$time_segments_labels <- NULL
-time_segments_file <- snakemake@input[["time_segments_labels"]]

 provider <- snakemake@params["provider"][["provider"]]
 provider_key <- snakemake@params["provider_key"]
 sensor_key <- snakemake@params["sensor_key"]

-sensor_features <- fetch_provider_features(provider, provider_key, sensor_key, sensor_data_files, time_segments_file)
+if("time_segments_labels" %in% names(sensor_data_files)){
+    # Extract sensor features (feature rules pass a time_segments_labels input)
+    sensor_data_files$time_segments_labels <- NULL
+    time_segments_file <- snakemake@input[["time_segments_labels"]]
+    sensor_features <- fetch_provider_features(provider, provider_key, sensor_key, sensor_data_files, time_segments_file)
+}else{
+    # Data cleaning (cleaning rules pass no time_segments_labels input)
+    sensor_features <- run_provider_cleaning_script(provider, provider_key, sensor_key, sensor_data_files)
+}

 write.csv(sensor_features, snakemake@output[[1]], row.names = FALSE)
--- a/src/features/entry.py
+++ b/src/features/entry.py
@ -1,14 +1,19 @@
 import pandas as pd
-from utils.utils import fetch_provider_features
+from utils.utils import fetch_provider_features, run_provider_cleaning_script

 sensor_data_files = dict(snakemake.input)
-del sensor_data_files["time_segments_labels"]
-time_segments_file = snakemake.input["time_segments_labels"]

 provider = snakemake.params["provider"]
 provider_key = snakemake.params["provider_key"]
 sensor_key = snakemake.params["sensor_key"]

-sensor_features = fetch_provider_features(provider, provider_key, sensor_key, sensor_data_files, time_segments_file)
+if "time_segments_labels" in sensor_data_files.keys():
+    # Extract sensor features
+    del sensor_data_files["time_segments_labels"]
+    time_segments_file = snakemake.input["time_segments_labels"]
+    sensor_features = fetch_provider_features(provider, provider_key, sensor_key, sensor_data_files, time_segments_file)
+else:
+    # Data cleaning
+    sensor_features = run_provider_cleaning_script(provider, provider_key, sensor_key, sensor_data_files)

 sensor_features.to_csv(snakemake.output[0], index=False)
--- a/src/features/utils/clean_sensor_features.R
+++ b/src/features/utils/clean_sensor_features.R
@ -1,54 +0,0 @@
-source("renv/activate.R")
-library(tidyr)
-library("dplyr", warn.conflicts = F)
-library(tidyverse)
-library(caret)
-library(corrr)
-
-
-clean_features <- read.csv(snakemake@input[[1]])
-cols_nan_threshold <- as.numeric(snakemake@params[["cols_nan_threshold"]])
-drop_zero_variance_columns <- as.logical(snakemake@params[["cols_var_threshold"]])
-rows_nan_threshold <- as.numeric(snakemake@params[["rows_nan_threshold"]])
-data_yielded_hours_ratio_threshold <- as.numeric(snakemake@params[["data_yielded_hours_ratio_threshold"]])
-corr_valid_pairs_threshold <- as.numeric(snakemake@params[["corr_valid_pairs_threshold"]])
-corr_threshold <- as.numeric(snakemake@params[["corr_threshold"]])
-
-# drop rows with the value of "phone_data_yield_rapids_ratiovalidyieldedhours" column less or equal than data_yielded_hours_ratio_threshold
-clean_features <- clean_features %>% 
-  filter(phone_data_yield_rapids_ratiovalidyieldedhours > data_yielded_hours_ratio_threshold)
-
-# drop columns with a percentage of NA values above cols_nan_threshold
-if(nrow(clean_features))
-    clean_features <- clean_features %>% select_if(~ sum(is.na(.)) / length(.) <= cols_nan_threshold )
-
-if(drop_zero_variance_columns)
-  clean_features <- clean_features %>% select_if(grepl("pid|local_segment|local_segment_label|local_segment_start_datetime|local_segment_end_datetime",names(.)) | sapply(., n_distinct, na.rm = T) > 1)
-
-# drop highly correlated features
-features_for_corr <- clean_features %>% 
-  select_if(is.numeric) %>% 
-  select_if(sapply(., n_distinct, na.rm = T) > 1)
-
-valid_pairs <- crossprod(!is.na(features_for_corr)) >= corr_valid_pairs_threshold * nrow(features_for_corr)
-
-if((dim(features_for_corr)[1] != 0) & (dim(features_for_corr)[2] != 0)){
-
-  highly_correlated_features <- features_for_corr %>% 
-    correlate(use = "pairwise.complete.obs", method = "spearman") %>% 
-    column_to_rownames(., var = "term") %>% 
-    as.matrix() %>% 
-    replace(!valid_pairs | is.na(.), 0) %>% 
-    findCorrelation(., cutoff = corr_threshold, verbose = F, names = T)
-
-  clean_features <- clean_features[, !names(clean_features) %in% highly_correlated_features]
-  
-}
-
-# drop rows with a percentage of NA values above rows_nan_threshold
-clean_features <- clean_features %>% 
-  mutate(percentage_na =  rowSums(is.na(.)) / ncol(.)) %>% 
-  filter(percentage_na <= rows_nan_threshold) %>% 
-  select(-percentage_na)
-
-write.csv(clean_features, snakemake@output[[1]], row.names = FALSE)
--- a/src/features/utils/utils.R
+++ b/src/features/utils/utils.R
@ -83,4 +83,14 @@ fetch_provider_features <- function(provider, provider_key, sensor_key, sensor_d
                                                "(.*)#(.*),(.*)", 
                                                remove = FALSE)
    return(sensor_features)
-}
+}
+
+# Source the provider's SRC_SCRIPT, then call the function named "<lowercased provider_key>_cleaning" and return its result.
+run_provider_cleaning_script <- function(provider, provider_key, sensor_key, sensor_data_files){
+  source(provider[["SRC_SCRIPT"]])
+  print(paste(rapids_log_tag, "Processing", sensor_key, provider_key))
+
+  cleaning_function_name <- paste0(tolower(provider_key), "_cleaning")
+  cleaning_function <- match.fun(cleaning_function_name)
+  cleaning_function(sensor_data_files, provider)
+}
--- a/src/features/utils/utils.py
+++ b/src/features/utils/utils.py
@ -123,3 +123,13 @@ def fetch_provider_features(provider, provider_key, sensor_key, sensor_data_file
            sensor_features.insert(1 + i, segment_colums.columns[i], segment_colums[segment_colums.columns[i]])

    return sensor_features
+
+def run_provider_cleaning_script(provider, provider_key, sensor_key, sensor_data_files):
+    from importlib import import_module, util
+    print("{} Processing {} {}".format(rapids_log_tag, sensor_key, provider_key))
+
+    cleaning_module = import_path(provider["SRC_SCRIPT"])
+    cleaning_function = getattr(cleaning_module,  provider_key.lower() + "_cleaning")
+    sensor_features = cleaning_function(sensor_data_files, provider)
+
+    return sensor_features
--- a/tools/config.schema.yaml
+++ b/tools/config.schema.yaml
@ -36,17 +36,16 @@ required:
  - HEATMAP_SENSORS_PER_MINUTE_PER_TIME_SEGMENT
  - HEATMAP_SENSOR_ROW_COUNT_PER_TIME_SEGMENT
  - HEATMAP_FEATURE_CORRELATION_MATRIX
-  - DATA_CLEANING
+  - ALL_CLEANING_INDIVIDUAL
+  - ALL_CLEANING_OVERALL

 definitions:
  PROVIDER:
    type: object
-    required: [COMPUTE, SRC_SCRIPT, FEATURES]
+    required: [COMPUTE, SRC_SCRIPT]
    properties:
      COMPUTE:
        type: boolean
-      FEATURES:
-        type: [array, object]
      SRC_SCRIPT:
        type: string
        pattern: "^.*\\.(py|R)$"
@ -1258,31 +1257,102 @@ properties:
        type: string
        enum: ["pearson", "kendall", "spearman"]

-  DATA_CLEANING:
+  ALL_CLEANING_INDIVIDUAL:
    type: object
-    required: [COMPUTE, COLS_NAN_THRESHOLD, COLS_VAR_THRESHOLD, ROWS_NAN_THRESHOLD, DATA_YIELDED_HOURS_RATIO_THRESHOLD, CORR_VALID_PAIRS_THRESHOLD, CORR_THRESHOLD]
+    required: [PROVIDERS]
    properties:
-      COMPUTE: 
-        type: boolean
-      COLS_NAN_THRESHOLD:
-        type: number
-        minimum: 0
-        maximum: 1
-      COLS_VAR_THRESHOLD:
-        type: boolean
-      ROWS_NAN_THRESHOLD: 
-        type: number
-        minimum: 0
-        maximum: 1
-      DATA_YIELDED_HOURS_RATIO_THRESHOLD:
-        type: number
-        minimum: 0
-        maximum: 1
-      CORR_VALID_PAIRS_THRESHOLD:
-        type: number
-        minimum: 0
-        maximum: 1
-      CORR_THRESHOLD:
-        type: number
-        minimum: 0
-        maximum: 1
+      PROVIDERS:
+        type: ["null", object]
+        properties:
+          RAPIDS:
+            allOf:
+                - $ref: "#/definitions/PROVIDER"
+                - properties:
+                    IMPUTE_SELECTED_EVENT_FEATURES:
+                      type: object
+                      required: [COMPUTE, MIN_DATA_YIELDED_MINUTES_TO_IMPUTE]
+                      properties:
+                        COMPUTE: 
+                          type: boolean
+                        MIN_DATA_YIELDED_MINUTES_TO_IMPUTE:
+                          type: number
+                          minimum: 0
+                          maximum: 1
+                    COLS_NAN_THRESHOLD:
+                      type: number
+                      minimum: 0
+                      maximum: 1
+                    COLS_VAR_THRESHOLD:
+                      type: boolean
+                    ROWS_NAN_THRESHOLD:
+                      type: number
+                      minimum: 0
+                      maximum: 1
+                    DATA_YIELDED_HOURS_RATIO_THRESHOLD:
+                      type: number
+                      minimum: 0
+                      maximum: 1
+                    DROP_HIGHLY_CORRELATED_FEATURES:
+                      type: object
+                      required: [COMPUTE, MIN_OVERLAP_FOR_CORR_THRESHOLD, CORR_THRESHOLD]
+                      properties:
+                        COMPUTE: 
+                          type: boolean
+                        MIN_OVERLAP_FOR_CORR_THRESHOLD:
+                          type: number
+                          minimum: 0
+                          maximum: 1
+                        CORR_THRESHOLD:
+                          type: number
+                          minimum: 0
+                          maximum: 1
+
+  ALL_CLEANING_OVERALL:
+    type: object
+    required: [PROVIDERS]
+    properties:
+      PROVIDERS:
+        type: ["null", object]
+        properties:
+          RAPIDS:
+            allOf:
+                - $ref: "#/definitions/PROVIDER"
+                - properties:
+                    IMPUTE_SELECTED_EVENT_FEATURES:
+                      type: object
+                      required: [COMPUTE, MIN_DATA_YIELDED_MINUTES_TO_IMPUTE]
+                      properties:
+                        COMPUTE: 
+                          type: boolean
+                        MIN_DATA_YIELDED_MINUTES_TO_IMPUTE:
+                          type: number
+                          minimum: 0
+                          maximum: 1
+                    COLS_NAN_THRESHOLD:
+                      type: number
+                      minimum: 0
+                      maximum: 1
+                    COLS_VAR_THRESHOLD:
+                      type: boolean
+                    ROWS_NAN_THRESHOLD:
+                      type: number
+                      minimum: 0
+                      maximum: 1
+                    DATA_YIELDED_HOURS_RATIO_THRESHOLD:
+                      type: number
+                      minimum: 0
+                      maximum: 1
+                    DROP_HIGHLY_CORRELATED_FEATURES:
+                      type: object
+                      required: [COMPUTE, MIN_OVERLAP_FOR_CORR_THRESHOLD, CORR_THRESHOLD]
+                      properties:
+                        COMPUTE: 
+                          type: boolean
+                        MIN_OVERLAP_FOR_CORR_THRESHOLD:
+                          type: number
+                          minimum: 0
+                          maximum: 1
+                        CORR_THRESHOLD:
+                          type: number
+                          minimum: 0
+                          maximum: 1