Update data cleaning code

2021-10-22 13:44:27 -04:00 · 2021-10-22 13:44:27 -04:00 · 512355ca01
parent 3e7b9260d2
commit 512355ca01
11 changed files with 362 additions and 121 deletions
--- a/9
+++ b/9
@ -395,9 +395,12 @@ if config["HEATMAP_FEATURE_CORRELATION_MATRIX"]["PLOT"]:
    files_to_compute.append("reports/data_exploration/heatmap_feature_correlation_matrix.html")
 # Data Cleaning
-if config["DATA_CLEANING"]["COMPUTE"]:
+for provider in config["ALL_CLEANING_INDIVIDUAL"]["PROVIDERS"].keys():
-    files_to_compute.extend(expand("data/processed/features/{pid}/all_sensor_features_cleaned.csv", pid=config["PIDS"]))
+    if config["ALL_CLEANING_INDIVIDUAL"]["PROVIDERS"][provider]["COMPUTE"]:
-    files_to_compute.append("data/processed/features/all_participants/all_sensor_features_cleaned.csv")
+        files_to_compute.extend(expand("data/processed/features/{pid}/all_sensor_features_cleaned_" + provider.lower() +".csv", pid=config["PIDS"]))
 for provider in config["ALL_CLEANING_OVERALL"]["PROVIDERS"].keys():
    if config["ALL_CLEANING_OVERALL"]["PROVIDERS"][provider]["COMPUTE"]:
        files_to_compute.extend(expand("data/processed/features/all_participants/all_sensor_features_cleaned_" + provider.lower() +".csv"))
 rule all:
    input:
--- a/config.yaml
+++ b/config.yaml
@ -569,12 +569,36 @@ HEATMAP_FEATURE_CORRELATION_MATRIX:
 #                                                    Data Cleaning                                                     #
 ########################################################################################################################
-DATA_CLEANING:
+ALL_CLEANING_INDIVIDUAL:
-  COMPUTE: False
+  PROVIDERS:
-  COLS_NAN_THRESHOLD: 0.3
+    RAPIDS:
-  COLS_VAR_THRESHOLD: True
+      COMPUTE: False
-  ROWS_NAN_THRESHOLD: 0.3
+      IMPUTE_SELECTED_EVENT_FEATURES:
-  DATA_YIELDED_HOURS_RATIO_THRESHOLD: 0.75
+        COMPUTE: True
-  CORR_VALID_PAIRS_THRESHOLD: 0.5
+        MIN_DATA_YIELDED_MINUTES_TO_IMPUTE: 0.33
-  CORR_THRESHOLD: 0.95
+      COLS_NAN_THRESHOLD: 0.3 # set to 1 to disable
      COLS_VAR_THRESHOLD: True
      ROWS_NAN_THRESHOLD: 0.3 # set to 1 to disable
      DATA_YIELDED_HOURS_RATIO_THRESHOLD: 0.5 # set to 0 to disable
      DROP_HIGHLY_CORRELATED_FEATURES:
        COMPUTE: True
        MIN_OVERLAP_FOR_CORR_THRESHOLD: 0.5
        CORR_THRESHOLD: 0.95
      SRC_SCRIPT: src/features/all_cleaning_individual/rapids/main.R
 ALL_CLEANING_OVERALL:
  PROVIDERS:
    RAPIDS:
      COMPUTE: False
      IMPUTE_SELECTED_EVENT_FEATURES:
        COMPUTE: True
        MIN_DATA_YIELDED_MINUTES_TO_IMPUTE: 0.33
      COLS_NAN_THRESHOLD: 0.3 # set to 1 to disable
      COLS_VAR_THRESHOLD: True
      ROWS_NAN_THRESHOLD: 0.3 # set to 1 to disable
      DATA_YIELDED_HOURS_RATIO_THRESHOLD: 0.5 # set to 0 to disable
      DROP_HIGHLY_CORRELATED_FEATURES:
        COMPUTE: True
        MIN_OVERLAP_FOR_CORR_THRESHOLD: 0.5
        CORR_THRESHOLD: 0.95
      SRC_SCRIPT: src/features/all_cleaning_overall/rapids/main.R
--- a/rules/features.smk
+++ b/rules/features.smk
@ -961,30 +961,27 @@ rule merge_sensor_features_for_all_participants:
 rule clean_sensor_features_for_individual_participants:
    input:
-        rules.merge_sensor_features_for_individual_participants.output
+        sensor_data = rules.merge_sensor_features_for_individual_participants.output
    wildcard_constraints:
        pid = config["PIDS"]
    params:
-        cols_nan_threshold = config["DATA_CLEANING"]["COLS_NAN_THRESHOLD"],
+        provider = lambda wildcards: config["ALL_CLEANING_INDIVIDUAL"]["PROVIDERS"][wildcards.provider_key.upper()],
-        cols_var_threshold = config["DATA_CLEANING"]["COLS_VAR_THRESHOLD"],
+        provider_key = "{provider_key}",
-        rows_nan_threshold = config["DATA_CLEANING"]["ROWS_NAN_THRESHOLD"],
+        sensor_key = "all_cleaning_individual"
        data_yielded_hours_ratio_threshold = config["DATA_CLEANING"]["DATA_YIELDED_HOURS_RATIO_THRESHOLD"],
        corr_valid_pairs_threshold = config["DATA_CLEANING"]["CORR_VALID_PAIRS_THRESHOLD"],
        corr_threshold = config["DATA_CLEANING"]["CORR_THRESHOLD"]
    output:
-        "data/processed/features/{pid}/all_sensor_features_cleaned.csv"
+        "data/processed/features/{pid}/all_sensor_features_cleaned_{provider_key}.csv"
    script:
-        "../src/features/utils/clean_sensor_features.R"
+        "../src/features/entry.R"
 rule clean_sensor_features_for_all_participants:
    input:
-        rules.merge_sensor_features_for_all_participants.output
+        sensor_data = rules.merge_sensor_features_for_all_participants.output
    params:
-        cols_nan_threshold = config["DATA_CLEANING"]["COLS_NAN_THRESHOLD"],
+        provider = lambda wildcards: config["ALL_CLEANING_OVERALL"]["PROVIDERS"][wildcards.provider_key.upper()],
-        cols_var_threshold = config["DATA_CLEANING"]["COLS_VAR_THRESHOLD"],
+        provider_key = "{provider_key}",
-        rows_nan_threshold = config["DATA_CLEANING"]["ROWS_NAN_THRESHOLD"],
+        sensor_key = "all_cleaning_overall"
        data_yielded_hours_ratio_threshold = config["DATA_CLEANING"]["DATA_YIELDED_HOURS_RATIO_THRESHOLD"],
        corr_valid_pairs_threshold = config["DATA_CLEANING"]["CORR_VALID_PAIRS_THRESHOLD"],
        corr_threshold = config["DATA_CLEANING"]["CORR_THRESHOLD"]
    output:
-        "data/processed/features/all_participants/all_sensor_features_cleaned.csv"
+        "data/processed/features/all_participants/all_sensor_features_cleaned_{provider_key}.csv"
    script:
-        "../src/features/utils/clean_sensor_features.R"
+        "../src/features/entry.R"
--- a/src/features/all_cleaning_individual/rapids/main.R
+++ b/src/features/all_cleaning_individual/rapids/main.R
@ -0,0 +1,85 @@
 source("renv/activate.R")
 library(tidyr)
 library("dplyr", warn.conflicts = F)
 library(tidyverse)
 library(caret)
 library(corrr)
 #' Clean a merged sensor-feature table with the RAPIDS cleaning rules.
 #'
 #' This is the script referenced by ALL_CLEANING_INDIVIDUAL > PROVIDERS > RAPIDS
 #' (SRC_SCRIPT). Steps run in this order:
 #'   1. optionally impute selected event features with 0,
 #'   2. drop rows whose valid-yielded-hours ratio is below the threshold,
 #'   3. drop columns whose NA ratio is above COLS_NAN_THRESHOLD,
 #'   4. optionally drop zero-variance columns (identifier columns are kept),
 #'   5. optionally drop highly correlated features,
 #'   6. drop rows whose NA ratio is above ROWS_NAN_THRESHOLD.
 #'
 #' @param sensor_data_files List-like input; `[["sensor_data"]]` is the path of the CSV to clean.
 #' @param provider Provider configuration list. Keys read here:
 #'   IMPUTE_SELECTED_EVENT_FEATURES, COLS_NAN_THRESHOLD, COLS_VAR_THRESHOLD,
 #'   ROWS_NAN_THRESHOLD, DATA_YIELDED_HOURS_RATIO_THRESHOLD and
 #'   DROP_HIGHLY_CORRELATED_FEATURES.
 #' @return The cleaned data frame.
 #' Stops with an error when a required phone_data_yield column is missing.
 rapids_cleaning <- function(sensor_data_files, provider){
    clean_features <- read.csv(sensor_data_files[["sensor_data"]], stringsAsFactors = FALSE)
    impute_selected_event_features <- provider[["IMPUTE_SELECTED_EVENT_FEATURES"]]
    cols_nan_threshold <- as.numeric(provider[["COLS_NAN_THRESHOLD"]])
    drop_zero_variance_columns <- as.logical(provider[["COLS_VAR_THRESHOLD"]])
    rows_nan_threshold <- as.numeric(provider[["ROWS_NAN_THRESHOLD"]])
    data_yielded_hours_ratio_threshold <- as.numeric(provider[["DATA_YIELDED_HOURS_RATIO_THRESHOLD"]])
    drop_highly_correlated_features <- provider[["DROP_HIGHLY_CORRELATED_FEATURES"]]
    # Identifier columns that must survive the zero-variance filter. The pattern is
    # anchored on purpose: an unanchored "pid" also matches the substring inside
    # "rapids", which would protect every RAPIDS feature column from being dropped.
    id_columns_pattern <- "^(pid|local_segment|local_segment_label|local_segment_start_datetime|local_segment_end_datetime)$"
    # Impute selected event features
    if(as.logical(impute_selected_event_features$COMPUTE)){
        if(!"phone_data_yield_rapids_ratiovalidyieldedminutes" %in% colnames(clean_features)){
            stop("Error: RAPIDS provider needs to impute the selected event features based on phone_data_yield_rapids_ratiovalidyieldedminutes column, please set config[PHONE_DATA_YIELD][PROVIDERS][RAPIDS][COMPUTE] to True and include 'ratiovalidyieldedminutes' in [FEATURES].")
        }
        min_data_yielded_minutes_to_impute <- as.numeric(impute_selected_event_features$MIN_DATA_YIELDED_MINUTES_TO_IMPUTE)
        column_names <- colnames(clean_features)
        selected_apps_features <- column_names[grepl("^phone_applications_foreground_rapids_(countevent|countepisode|minduration|maxduration|meanduration|sumduration)", column_names)]
        selected_battery_features <- column_names[grepl("^phone_battery_rapids_", column_names)]
        selected_calls_features <- column_names[grepl("^phone_calls_rapids_.*_(count|distinctcontacts|sumduration|minduration|maxduration|meanduration|modeduration)", column_names)]
        selected_keyboard_features <- column_names[grepl("^phone_keyboard_rapids_(sessioncount|averagesessionlength|changeintextlengthlessthanminusone|changeintextlengthequaltominusone|changeintextlengthequaltoone|changeintextlengthmorethanone|maxtextlength|totalkeyboardtouches)", column_names)]
        selected_messages_features <- column_names[grepl("^phone_messages_rapids_.*_(count|distinctcontacts)", column_names)]
        selected_screen_features <- column_names[grepl("^phone_screen_rapids_(sumduration|maxduration|minduration|avgduration|countepisode)", column_names)]
        selected_wifi_features <- column_names[grepl("^phone_wifi_(connected|visible)_rapids_", column_names)]
        selected_columns <- c(selected_apps_features, selected_battery_features, selected_calls_features, selected_keyboard_features, selected_messages_features, selected_screen_features, selected_wifi_features)
        # A cell is set to 0 only when it is NA and its row's yielded-minutes ratio is above the threshold.
        # The row-level condition is recycled down each column of the NA matrix.
        is_imputable <- is.na(clean_features[selected_columns]) & (clean_features$phone_data_yield_rapids_ratiovalidyieldedminutes > min_data_yielded_minutes_to_impute)
        clean_features[selected_columns][is_imputable] <- 0
    }
    # Drop rows with the value of "phone_data_yield_rapids_ratiovalidyieldedhours" column less than data_yielded_hours_ratio_threshold
    if(!"phone_data_yield_rapids_ratiovalidyieldedhours" %in% colnames(clean_features)){
        stop("Error: RAPIDS provider needs to clean data based on phone_data_yield_rapids_ratiovalidyieldedhours column, please set config[PHONE_DATA_YIELD][PROVIDERS][RAPIDS][COMPUTE] to True and include 'ratiovalidyieldedhours' in [FEATURES].")
    }
    clean_features <- clean_features %>%
        filter(phone_data_yield_rapids_ratiovalidyieldedhours >= data_yielded_hours_ratio_threshold)
    # Drop columns with a percentage of NA values above cols_nan_threshold
    # (skipped for an empty table, where the ratio would be 0/0)
    if(nrow(clean_features) > 0){
        clean_features <- clean_features %>% select_if(~ sum(is.na(.)) / length(.) <= cols_nan_threshold )
    }
    # Drop columns with zero variance, always keeping the identifier columns
    if(drop_zero_variance_columns){
        is_id_column <- grepl(id_columns_pattern, names(clean_features))
        has_variance <- vapply(clean_features, n_distinct, numeric(1), na.rm = TRUE) > 1
        clean_features <- clean_features[, is_id_column | has_variance, drop = FALSE]
    }
    # Drop highly correlated features
    if(as.logical(drop_highly_correlated_features$COMPUTE)){
        min_overlap_for_corr_threshold <- as.numeric(drop_highly_correlated_features$MIN_OVERLAP_FOR_CORR_THRESHOLD)
        corr_threshold <- as.numeric(drop_highly_correlated_features$CORR_THRESHOLD)
        features_for_corr <- clean_features %>%
            select_if(is.numeric) %>%
            select_if(vapply(., n_distinct, numeric(1), na.rm = TRUE) > 1)
        if(nrow(features_for_corr) != 0 && ncol(features_for_corr) != 0){
            # A pair of features is only compared when enough rows have both values present
            valid_pairs <- crossprod(!is.na(features_for_corr)) >= min_overlap_for_corr_threshold * nrow(features_for_corr)
            highly_correlated_features <- features_for_corr %>%
                correlate(use = "pairwise.complete.obs", method = "spearman") %>%
                column_to_rownames(., var = "term") %>%
                as.matrix() %>%
                replace(!valid_pairs | is.na(.), 0) %>%
                findCorrelation(., cutoff = corr_threshold, verbose = FALSE, names = TRUE)
            # drop = FALSE keeps a data frame even if a single column survives
            clean_features <- clean_features[, !names(clean_features) %in% highly_correlated_features, drop = FALSE]
        }
    }
    # Drop rows with a percentage of NA values above rows_nan_threshold
    clean_features <- clean_features %>%
        mutate(percentage_na =  rowSums(is.na(.)) / ncol(.)) %>%
        filter(percentage_na <= rows_nan_threshold) %>%
        select(-percentage_na)
    return(clean_features)
 }
--- a/src/features/all_cleaning_overall/rapids/main.R
+++ b/src/features/all_cleaning_overall/rapids/main.R
@ -0,0 +1,85 @@
 source("renv/activate.R")
 library(tidyr)
 library("dplyr", warn.conflicts = F)
 library(tidyverse)
 library(caret)
 library(corrr)
 #' Clean a merged sensor-feature table with the RAPIDS cleaning rules.
 #'
 #' This is the script referenced by ALL_CLEANING_OVERALL > PROVIDERS > RAPIDS
 #' (SRC_SCRIPT). Steps run in this order:
 #'   1. optionally impute selected event features with 0,
 #'   2. drop rows whose valid-yielded-hours ratio is below the threshold,
 #'   3. drop columns whose NA ratio is above COLS_NAN_THRESHOLD,
 #'   4. optionally drop zero-variance columns (identifier columns are kept),
 #'   5. optionally drop highly correlated features,
 #'   6. drop rows whose NA ratio is above ROWS_NAN_THRESHOLD.
 #'
 #' @param sensor_data_files List-like input; `[["sensor_data"]]` is the path of the CSV to clean.
 #' @param provider Provider configuration list. Keys read here:
 #'   IMPUTE_SELECTED_EVENT_FEATURES, COLS_NAN_THRESHOLD, COLS_VAR_THRESHOLD,
 #'   ROWS_NAN_THRESHOLD, DATA_YIELDED_HOURS_RATIO_THRESHOLD and
 #'   DROP_HIGHLY_CORRELATED_FEATURES.
 #' @return The cleaned data frame.
 #' Stops with an error when a required phone_data_yield column is missing.
 rapids_cleaning <- function(sensor_data_files, provider){
    clean_features <- read.csv(sensor_data_files[["sensor_data"]], stringsAsFactors = FALSE)
    impute_selected_event_features <- provider[["IMPUTE_SELECTED_EVENT_FEATURES"]]
    cols_nan_threshold <- as.numeric(provider[["COLS_NAN_THRESHOLD"]])
    drop_zero_variance_columns <- as.logical(provider[["COLS_VAR_THRESHOLD"]])
    rows_nan_threshold <- as.numeric(provider[["ROWS_NAN_THRESHOLD"]])
    data_yielded_hours_ratio_threshold <- as.numeric(provider[["DATA_YIELDED_HOURS_RATIO_THRESHOLD"]])
    drop_highly_correlated_features <- provider[["DROP_HIGHLY_CORRELATED_FEATURES"]]
    # Identifier columns that must survive the zero-variance filter. The pattern is
    # anchored on purpose: an unanchored "pid" also matches the substring inside
    # "rapids", which would protect every RAPIDS feature column from being dropped.
    id_columns_pattern <- "^(pid|local_segment|local_segment_label|local_segment_start_datetime|local_segment_end_datetime)$"
    # Impute selected event features
    if(as.logical(impute_selected_event_features$COMPUTE)){
        if(!"phone_data_yield_rapids_ratiovalidyieldedminutes" %in% colnames(clean_features)){
            stop("Error: RAPIDS provider needs to impute the selected event features based on phone_data_yield_rapids_ratiovalidyieldedminutes column, please set config[PHONE_DATA_YIELD][PROVIDERS][RAPIDS][COMPUTE] to True and include 'ratiovalidyieldedminutes' in [FEATURES].")
        }
        min_data_yielded_minutes_to_impute <- as.numeric(impute_selected_event_features$MIN_DATA_YIELDED_MINUTES_TO_IMPUTE)
        column_names <- colnames(clean_features)
        selected_apps_features <- column_names[grepl("^phone_applications_foreground_rapids_(countevent|countepisode|minduration|maxduration|meanduration|sumduration)", column_names)]
        selected_battery_features <- column_names[grepl("^phone_battery_rapids_", column_names)]
        selected_calls_features <- column_names[grepl("^phone_calls_rapids_.*_(count|distinctcontacts|sumduration|minduration|maxduration|meanduration|modeduration)", column_names)]
        selected_keyboard_features <- column_names[grepl("^phone_keyboard_rapids_(sessioncount|averagesessionlength|changeintextlengthlessthanminusone|changeintextlengthequaltominusone|changeintextlengthequaltoone|changeintextlengthmorethanone|maxtextlength|totalkeyboardtouches)", column_names)]
        selected_messages_features <- column_names[grepl("^phone_messages_rapids_.*_(count|distinctcontacts)", column_names)]
        selected_screen_features <- column_names[grepl("^phone_screen_rapids_(sumduration|maxduration|minduration|avgduration|countepisode)", column_names)]
        selected_wifi_features <- column_names[grepl("^phone_wifi_(connected|visible)_rapids_", column_names)]
        selected_columns <- c(selected_apps_features, selected_battery_features, selected_calls_features, selected_keyboard_features, selected_messages_features, selected_screen_features, selected_wifi_features)
        # A cell is set to 0 only when it is NA and its row's yielded-minutes ratio is above the threshold.
        # The row-level condition is recycled down each column of the NA matrix.
        is_imputable <- is.na(clean_features[selected_columns]) & (clean_features$phone_data_yield_rapids_ratiovalidyieldedminutes > min_data_yielded_minutes_to_impute)
        clean_features[selected_columns][is_imputable] <- 0
    }
    # Drop rows with the value of "phone_data_yield_rapids_ratiovalidyieldedhours" column less than data_yielded_hours_ratio_threshold
    if(!"phone_data_yield_rapids_ratiovalidyieldedhours" %in% colnames(clean_features)){
        stop("Error: RAPIDS provider needs to clean data based on phone_data_yield_rapids_ratiovalidyieldedhours column, please set config[PHONE_DATA_YIELD][PROVIDERS][RAPIDS][COMPUTE] to True and include 'ratiovalidyieldedhours' in [FEATURES].")
    }
    clean_features <- clean_features %>%
        filter(phone_data_yield_rapids_ratiovalidyieldedhours >= data_yielded_hours_ratio_threshold)
    # Drop columns with a percentage of NA values above cols_nan_threshold
    # (skipped for an empty table, where the ratio would be 0/0)
    if(nrow(clean_features) > 0){
        clean_features <- clean_features %>% select_if(~ sum(is.na(.)) / length(.) <= cols_nan_threshold )
    }
    # Drop columns with zero variance, always keeping the identifier columns
    if(drop_zero_variance_columns){
        is_id_column <- grepl(id_columns_pattern, names(clean_features))
        has_variance <- vapply(clean_features, n_distinct, numeric(1), na.rm = TRUE) > 1
        clean_features <- clean_features[, is_id_column | has_variance, drop = FALSE]
    }
    # Drop highly correlated features
    if(as.logical(drop_highly_correlated_features$COMPUTE)){
        min_overlap_for_corr_threshold <- as.numeric(drop_highly_correlated_features$MIN_OVERLAP_FOR_CORR_THRESHOLD)
        corr_threshold <- as.numeric(drop_highly_correlated_features$CORR_THRESHOLD)
        features_for_corr <- clean_features %>%
            select_if(is.numeric) %>%
            select_if(vapply(., n_distinct, numeric(1), na.rm = TRUE) > 1)
        if(nrow(features_for_corr) != 0 && ncol(features_for_corr) != 0){
            # A pair of features is only compared when enough rows have both values present
            valid_pairs <- crossprod(!is.na(features_for_corr)) >= min_overlap_for_corr_threshold * nrow(features_for_corr)
            highly_correlated_features <- features_for_corr %>%
                correlate(use = "pairwise.complete.obs", method = "spearman") %>%
                column_to_rownames(., var = "term") %>%
                as.matrix() %>%
                replace(!valid_pairs | is.na(.), 0) %>%
                findCorrelation(., cutoff = corr_threshold, verbose = FALSE, names = TRUE)
            # drop = FALSE keeps a data frame even if a single column survives
            clean_features <- clean_features[, !names(clean_features) %in% highly_correlated_features, drop = FALSE]
        }
    }
    # Drop rows with a percentage of NA values above rows_nan_threshold
    clean_features <- clean_features %>%
        mutate(percentage_na =  rowSums(is.na(.)) / ncol(.)) %>%
        filter(percentage_na <= rows_nan_threshold) %>%
        select(-percentage_na)
    return(clean_features)
 }
--- a/src/features/entry.R
+++ b/src/features/entry.R
@ -4,13 +4,19 @@ library("dplyr",warn.conflicts = F)
 library("tidyr")
 sensor_data_files <- snakemake@input
 sensor_data_files$time_segments_labels <- NULL
 time_segments_file <- snakemake@input[["time_segments_labels"]]
 provider <- snakemake@params["provider"][["provider"]]
 provider_key <- snakemake@params["provider_key"]
 sensor_key <- snakemake@params["sensor_key"]
-sensor_features <- fetch_provider_features(provider, provider_key, sensor_key, sensor_data_files, time_segments_file)
+if("time_segments_labels" %in% names(sensor_data_files)){
    # Extract sensor features
    sensor_data_files$time_segments_labels <- NULL
    time_segments_file <- snakemake@input[["time_segments_labels"]]
    sensor_features <- fetch_provider_features(provider, provider_key, sensor_key, sensor_data_files, time_segments_file)
 }else{
    # Data cleaning
    sensor_features = run_provider_cleaning_script(provider, provider_key, sensor_key, sensor_data_files)
 }
 write.csv(sensor_features, snakemake@output[[1]], row.names = FALSE)
--- a/src/features/entry.py
+++ b/src/features/entry.py
@ -1,14 +1,19 @@
 import pandas as pd
-from utils.utils import fetch_provider_features
+from utils.utils import fetch_provider_features, run_provider_cleaning_script
 sensor_data_files = dict(snakemake.input)
 del sensor_data_files["time_segments_labels"]
 time_segments_file = snakemake.input["time_segments_labels"]
 provider = snakemake.params["provider"]
 provider_key = snakemake.params["provider_key"]
 sensor_key = snakemake.params["sensor_key"]
-sensor_features = fetch_provider_features(provider, provider_key, sensor_key, sensor_data_files, time_segments_file)
+if "time_segments_labels" in sensor_data_files.keys():
    # Extract sensor features
    del sensor_data_files["time_segments_labels"]
    time_segments_file = snakemake.input["time_segments_labels"]
    sensor_features = fetch_provider_features(provider, provider_key, sensor_key, sensor_data_files, time_segments_file)
 else:
    # Data cleaning
    sensor_features = run_provider_cleaning_script(provider, provider_key, sensor_key, sensor_data_files)
 sensor_features.to_csv(snakemake.output[0], index=False)
--- a/src/features/utils/clean_sensor_features.R
+++ b/src/features/utils/clean_sensor_features.R
@ -1,54 +0,0 @@
 source("renv/activate.R")
 library(tidyr)
 library("dplyr", warn.conflicts = F)
 library(tidyverse)
 library(caret)
 library(corrr)
 clean_features <- read.csv(snakemake@input[[1]])
 cols_nan_threshold <- as.numeric(snakemake@params[["cols_nan_threshold"]])
 drop_zero_variance_columns <- as.logical(snakemake@params[["cols_var_threshold"]])
 rows_nan_threshold <- as.numeric(snakemake@params[["rows_nan_threshold"]])
 data_yielded_hours_ratio_threshold <- as.numeric(snakemake@params[["data_yielded_hours_ratio_threshold"]])
 corr_valid_pairs_threshold <- as.numeric(snakemake@params[["corr_valid_pairs_threshold"]])
 corr_threshold <- as.numeric(snakemake@params[["corr_threshold"]])
 # drop rows with the value of "phone_data_yield_rapids_ratiovalidyieldedhours" column less or equal than data_yielded_hours_ratio_threshold
 clean_features <- clean_features %>% 
  filter(phone_data_yield_rapids_ratiovalidyieldedhours > data_yielded_hours_ratio_threshold)
 # drop columns with a percentage of NA values above cols_nan_threshold
 if(nrow(clean_features))
    clean_features <- clean_features %>% select_if(~ sum(is.na(.)) / length(.) <= cols_nan_threshold )
 if(drop_zero_variance_columns)
  clean_features <- clean_features %>% select_if(grepl("pid|local_segment|local_segment_label|local_segment_start_datetime|local_segment_end_datetime",names(.)) | sapply(., n_distinct, na.rm = T) > 1)
 # drop highly correlated features
 features_for_corr <- clean_features %>% 
  select_if(is.numeric) %>% 
  select_if(sapply(., n_distinct, na.rm = T) > 1)
 valid_pairs <- crossprod(!is.na(features_for_corr)) >= corr_valid_pairs_threshold * nrow(features_for_corr)
 if((dim(features_for_corr)[1] != 0) & (dim(features_for_corr)[2] != 0)){
  highly_correlated_features <- features_for_corr %>% 
    correlate(use = "pairwise.complete.obs", method = "spearman") %>% 
    column_to_rownames(., var = "term") %>% 
    as.matrix() %>% 
    replace(!valid_pairs | is.na(.), 0) %>% 
    findCorrelation(., cutoff = corr_threshold, verbose = F, names = T)
  clean_features <- clean_features[, !names(clean_features) %in% highly_correlated_features]
 }
 # drop rows with a percentage of NA values above rows_nan_threshold
 clean_features <- clean_features %>% 
  mutate(percentage_na =  rowSums(is.na(.)) / ncol(.)) %>% 
  filter(percentage_na <= rows_nan_threshold) %>% 
  select(-percentage_na)
 write.csv(clean_features, snakemake@output[[1]], row.names = FALSE)
--- a/src/features/utils/utils.R
+++ b/src/features/utils/utils.R
@ -84,3 +84,13 @@ fetch_provider_features <- function(provider, provider_key, sensor_key, sensor_d
                                                remove = FALSE)
    return(sensor_features)
 }
 #' Source a provider's cleaning script and run its cleaning function.
 #'
 #' The script at provider[["SRC_SCRIPT"]] is sourced, then the function named
 #' "<lower-cased provider_key>_cleaning" is looked up and called.
 #'
 #' @param provider Provider configuration list; must contain SRC_SCRIPT.
 #' @param provider_key Provider name; lower-cased to build the function name.
 #' @param sensor_key Only used in the log message.
 #' @param sensor_data_files Passed through to the cleaning function.
 #' @return Whatever the provider's cleaning function returns.
 run_provider_cleaning_script <- function(provider, provider_key, sensor_key, sensor_data_files){
  script_path <- provider[["SRC_SCRIPT"]]
  source(script_path)
  log_message <- paste(rapids_log_tag, "Processing", sensor_key, provider_key)
  print(log_message)
  # e.g. provider_key "RAPIDS" resolves to the function rapids_cleaning
  cleaner_name <- paste0(tolower(provider_key), "_cleaning")
  cleaner <- match.fun(cleaner_name)
  return(cleaner(sensor_data_files, provider))
 }
--- a/src/features/utils/utils.py
+++ b/src/features/utils/utils.py
@ -123,3 +123,13 @@ def fetch_provider_features(provider, provider_key, sensor_key, sensor_data_file
            sensor_features.insert(1 + i, segment_colums.columns[i], segment_colums[segment_colums.columns[i]])
    return sensor_features
 def run_provider_cleaning_script(provider, provider_key, sensor_key, sensor_data_files):
    """Load a provider's cleaning script and run its cleaning function.

    The module at provider["SRC_SCRIPT"] is loaded with import_path, and the
    attribute named "<lower-cased provider_key>_cleaning" is called with
    (sensor_data_files, provider).

    provider: provider configuration mapping; must contain "SRC_SCRIPT".
    provider_key: provider name; lower-cased to build the function name.
    sensor_key: only used in the log message.
    sensor_data_files: passed through to the cleaning function.
    Returns whatever the provider's cleaning function returns.
    """
    from importlib import import_module, util
    log_line = "{} Processing {} {}".format(rapids_log_tag, sensor_key, provider_key)
    print(log_line)
    module = import_path(provider["SRC_SCRIPT"])
    # e.g. provider_key "RAPIDS" resolves to the attribute rapids_cleaning
    function_name = provider_key.lower() + "_cleaning"
    cleaner = getattr(module, function_name)
    return cleaner(sensor_data_files, provider)
--- a/tools/config.schema.yaml
+++ b/tools/config.schema.yaml
@ -36,17 +36,16 @@ required:
  - HEATMAP_SENSORS_PER_MINUTE_PER_TIME_SEGMENT
  - HEATMAP_SENSOR_ROW_COUNT_PER_TIME_SEGMENT
  - HEATMAP_FEATURE_CORRELATION_MATRIX
-  - DATA_CLEANING
+  - ALL_CLEANING_INDIVIDUAL
  - ALL_CLEANING_OVERALL
 definitions:
  PROVIDER:
    type: object
-    required: [COMPUTE, SRC_SCRIPT, FEATURES]
+    required: [COMPUTE, SRC_SCRIPT]
    properties:
      COMPUTE:
        type: boolean
      FEATURES:
        type: [array, object]
      SRC_SCRIPT:
        type: string
        pattern: "^.*\\.(py|R)$"
@ -1258,31 +1257,102 @@ properties:
        type: string
        enum: ["pearson", "kendall", "spearman"]
-  DATA_CLEANING:
+  ALL_CLEANING_INDIVIDUAL:
    type: object
-    required: [COMPUTE, COLS_NAN_THRESHOLD, COLS_VAR_THRESHOLD, ROWS_NAN_THRESHOLD, DATA_YIELDED_HOURS_RATIO_THRESHOLD, CORR_VALID_PAIRS_THRESHOLD, CORR_THRESHOLD]
+    required: [PROVIDERS]
    properties:
-      COMPUTE: 
+      PROVIDERS:
-        type: boolean
+        type: ["null", object]
-      COLS_NAN_THRESHOLD:
+        properties:
-        type: number
+          RAPIDS:
-        minimum: 0
+            allOf:
-        maximum: 1
+                - $ref: "#/definitions/PROVIDER"
-      COLS_VAR_THRESHOLD:
+                - properties:
-        type: boolean
+                    IMPUTE_SELECTED_EVENT_FEATURES:
-      ROWS_NAN_THRESHOLD: 
+                      type: object
-        type: number
+                      required: [COMPUTE, MIN_DATA_YIELDED_MINUTES_TO_IMPUTE]
-        minimum: 0
+                      properties:
-        maximum: 1
+                        COMPUTE: 
-      DATA_YIELDED_HOURS_RATIO_THRESHOLD:
+                          type: boolean
-        type: number
+                        MIN_DATA_YIELDED_MINUTES_TO_IMPUTE:
-        minimum: 0
+                          type: number
-        maximum: 1
+                          minimum: 0
-      CORR_VALID_PAIRS_THRESHOLD:
+                          maximum: 1
-        type: number
+                    COLS_NAN_THRESHOLD:
-        minimum: 0
+                      type: number
-        maximum: 1
+                      minimum: 0
-      CORR_THRESHOLD:
+                      maximum: 1
-        type: number
+                    COLS_VAR_THRESHOLD:
-        minimum: 0
+                      type: boolean
-        maximum: 1
+                    ROWS_NAN_THRESHOLD:
                      type: number
                      minimum: 0
                      maximum: 1
                    DATA_YIELDED_HOURS_RATIO_THRESHOLD:
                      type: number
                      minimum: 0
                      maximum: 1
                    DROP_HIGHLY_CORRELATED_FEATURES:
                      type: object
                      required: [COMPUTE, MIN_OVERLAP_FOR_CORR_THRESHOLD, CORR_THRESHOLD]
                      properties:
                        COMPUTE: 
                          type: boolean
                        MIN_OVERLAP_FOR_CORR_THRESHOLD:
                          type: number
                          minimum: 0
                          maximum: 1
                        CORR_THRESHOLD:
                          type: number
                          minimum: 0
                          maximum: 1
  ALL_CLEANING_OVERALL:
    type: object
    required: [PROVIDERS]
    properties:
      PROVIDERS:
        type: ["null", object]
        properties:
          RAPIDS:
            allOf:
                - $ref: "#/definitions/PROVIDER"
                - properties:
                    IMPUTE_SELECTED_EVENT_FEATURES:
                      type: object
                      required: [COMPUTE, MIN_DATA_YIELDED_MINUTES_TO_IMPUTE]
                      properties:
                        COMPUTE: 
                          type: boolean
                        MIN_DATA_YIELDED_MINUTES_TO_IMPUTE:
                          type: number
                          minimum: 0
                          maximum: 1
                    COLS_NAN_THRESHOLD:
                      type: number
                      minimum: 0
                      maximum: 1
                    COLS_VAR_THRESHOLD:
                      type: boolean
                    ROWS_NAN_THRESHOLD:
                      type: number
                      minimum: 0
                      maximum: 1
                    DATA_YIELDED_HOURS_RATIO_THRESHOLD:
                      type: number
                      minimum: 0
                      maximum: 1
                    DROP_HIGHLY_CORRELATED_FEATURES:
                      type: object
                      required: [COMPUTE, MIN_OVERLAP_FOR_CORR_THRESHOLD, CORR_THRESHOLD]
                      properties:
                        COMPUTE: 
                          type: boolean
                        MIN_OVERLAP_FOR_CORR_THRESHOLD:
                          type: number
                          minimum: 0
                          maximum: 1
                        CORR_THRESHOLD:
                          type: number
                          minimum: 0
                          maximum: 1