Update data cleaning code

data_cleaning
Meng Li 2021-10-22 13:44:27 -04:00
parent 3e7b9260d2
commit 512355ca01
11 changed files with 362 additions and 121 deletions

View File

@ -395,9 +395,12 @@ if config["HEATMAP_FEATURE_CORRELATION_MATRIX"]["PLOT"]:
files_to_compute.append("reports/data_exploration/heatmap_feature_correlation_matrix.html") files_to_compute.append("reports/data_exploration/heatmap_feature_correlation_matrix.html")
# Data Cleaning # Data Cleaning
if config["DATA_CLEANING"]["COMPUTE"]: for provider in config["ALL_CLEANING_INDIVIDUAL"]["PROVIDERS"].keys():
files_to_compute.extend(expand("data/processed/features/{pid}/all_sensor_features_cleaned.csv", pid=config["PIDS"])) if config["ALL_CLEANING_INDIVIDUAL"]["PROVIDERS"][provider]["COMPUTE"]:
files_to_compute.append("data/processed/features/all_participants/all_sensor_features_cleaned.csv") files_to_compute.extend(expand("data/processed/features/{pid}/all_sensor_features_cleaned_" + provider.lower() +".csv", pid=config["PIDS"]))
for provider in config["ALL_CLEANING_OVERALL"]["PROVIDERS"].keys():
if config["ALL_CLEANING_OVERALL"]["PROVIDERS"][provider]["COMPUTE"]:
files_to_compute.extend(expand("data/processed/features/all_participants/all_sensor_features_cleaned_" + provider.lower() +".csv"))
rule all: rule all:
input: input:

View File

@ -569,12 +569,36 @@ HEATMAP_FEATURE_CORRELATION_MATRIX:
# Data Cleaning # # Data Cleaning #
######################################################################################################################## ########################################################################################################################
DATA_CLEANING: ALL_CLEANING_INDIVIDUAL:
PROVIDERS:
RAPIDS:
COMPUTE: False COMPUTE: False
COLS_NAN_THRESHOLD: 0.3 IMPUTE_SELECTED_EVENT_FEATURES:
COMPUTE: True
MIN_DATA_YIELDED_MINUTES_TO_IMPUTE: 0.33
COLS_NAN_THRESHOLD: 0.3 # set to 1 to disable
COLS_VAR_THRESHOLD: True COLS_VAR_THRESHOLD: True
ROWS_NAN_THRESHOLD: 0.3 ROWS_NAN_THRESHOLD: 0.3 # set to 1 to disable
DATA_YIELDED_HOURS_RATIO_THRESHOLD: 0.75 DATA_YIELDED_HOURS_RATIO_THRESHOLD: 0.5 # set to 0 to disable
CORR_VALID_PAIRS_THRESHOLD: 0.5 DROP_HIGHLY_CORRELATED_FEATURES:
COMPUTE: True
MIN_OVERLAP_FOR_CORR_THRESHOLD: 0.5
CORR_THRESHOLD: 0.95 CORR_THRESHOLD: 0.95
SRC_SCRIPT: src/features/all_cleaning_individual/rapids/main.R
ALL_CLEANING_OVERALL:
PROVIDERS:
RAPIDS:
COMPUTE: False
IMPUTE_SELECTED_EVENT_FEATURES:
COMPUTE: True
MIN_DATA_YIELDED_MINUTES_TO_IMPUTE: 0.33
COLS_NAN_THRESHOLD: 0.3 # set to 1 to disable
COLS_VAR_THRESHOLD: True
ROWS_NAN_THRESHOLD: 0.3 # set to 1 to disable
DATA_YIELDED_HOURS_RATIO_THRESHOLD: 0.5 # set to 0 to disable
DROP_HIGHLY_CORRELATED_FEATURES:
COMPUTE: True
MIN_OVERLAP_FOR_CORR_THRESHOLD: 0.5
CORR_THRESHOLD: 0.95
SRC_SCRIPT: src/features/all_cleaning_overall/rapids/main.R

View File

@ -961,30 +961,27 @@ rule merge_sensor_features_for_all_participants:
rule clean_sensor_features_for_individual_participants: rule clean_sensor_features_for_individual_participants:
input: input:
rules.merge_sensor_features_for_individual_participants.output sensor_data = rules.merge_sensor_features_for_individual_participants.output
wildcard_constraints:
pid = config["PIDS"]
params: params:
cols_nan_threshold = config["DATA_CLEANING"]["COLS_NAN_THRESHOLD"], provider = lambda wildcards: config["ALL_CLEANING_INDIVIDUAL"]["PROVIDERS"][wildcards.provider_key.upper()],
cols_var_threshold = config["DATA_CLEANING"]["COLS_VAR_THRESHOLD"], provider_key = "{provider_key}",
rows_nan_threshold = config["DATA_CLEANING"]["ROWS_NAN_THRESHOLD"], sensor_key = "all_cleaning_individual"
data_yielded_hours_ratio_threshold = config["DATA_CLEANING"]["DATA_YIELDED_HOURS_RATIO_THRESHOLD"],
corr_valid_pairs_threshold = config["DATA_CLEANING"]["CORR_VALID_PAIRS_THRESHOLD"],
corr_threshold = config["DATA_CLEANING"]["CORR_THRESHOLD"]
output: output:
"data/processed/features/{pid}/all_sensor_features_cleaned.csv" "data/processed/features/{pid}/all_sensor_features_cleaned_{provider_key}.csv"
script: script:
"../src/features/utils/clean_sensor_features.R" "../src/features/entry.R"
rule clean_sensor_features_for_all_participants: rule clean_sensor_features_for_all_participants:
input: input:
rules.merge_sensor_features_for_all_participants.output sensor_data = rules.merge_sensor_features_for_all_participants.output
params: params:
cols_nan_threshold = config["DATA_CLEANING"]["COLS_NAN_THRESHOLD"], provider = lambda wildcards: config["ALL_CLEANING_OVERALL"]["PROVIDERS"][wildcards.provider_key.upper()],
cols_var_threshold = config["DATA_CLEANING"]["COLS_VAR_THRESHOLD"], provider_key = "{provider_key}",
rows_nan_threshold = config["DATA_CLEANING"]["ROWS_NAN_THRESHOLD"], sensor_key = "all_cleaning_overall"
data_yielded_hours_ratio_threshold = config["DATA_CLEANING"]["DATA_YIELDED_HOURS_RATIO_THRESHOLD"],
corr_valid_pairs_threshold = config["DATA_CLEANING"]["CORR_VALID_PAIRS_THRESHOLD"],
corr_threshold = config["DATA_CLEANING"]["CORR_THRESHOLD"]
output: output:
"data/processed/features/all_participants/all_sensor_features_cleaned.csv" "data/processed/features/all_participants/all_sensor_features_cleaned_{provider_key}.csv"
script: script:
"../src/features/utils/clean_sensor_features.R" "../src/features/entry.R"

View File

@ -0,0 +1,85 @@
source("renv/activate.R")
library(tidyr)
library("dplyr", warn.conflicts = F)
library(tidyverse)
library(caret)
library(corrr)
# Clean a merged sensor-feature table with the RAPIDS cleaning provider.
#
# Arguments:
#   sensor_data_files: list-like object; its "sensor_data" entry is the path of the CSV file to clean.
#   provider: provider configuration (list) read with the keys IMPUTE_SELECTED_EVENT_FEATURES,
#             COLS_NAN_THRESHOLD, COLS_VAR_THRESHOLD, ROWS_NAN_THRESHOLD,
#             DATA_YIELDED_HOURS_RATIO_THRESHOLD and DROP_HIGHLY_CORRELATED_FEATURES.
#
# Returns: the cleaned features as a data frame.
#
# The steps run in this order: impute selected event features, drop rows with a low ratio of valid
# yielded hours, drop columns with too many NAs, drop zero variance columns, drop highly correlated
# features, and finally drop rows with too many NAs.
rapids_cleaning <- function(sensor_data_files, provider){

    clean_features <- read.csv(sensor_data_files[["sensor_data"]], stringsAsFactors = FALSE)

    impute_selected_event_features <- provider[["IMPUTE_SELECTED_EVENT_FEATURES"]]
    cols_nan_threshold <- as.numeric(provider[["COLS_NAN_THRESHOLD"]])
    drop_zero_variance_columns <- as.logical(provider[["COLS_VAR_THRESHOLD"]])
    rows_nan_threshold <- as.numeric(provider[["ROWS_NAN_THRESHOLD"]])
    data_yielded_hours_ratio_threshold <- as.numeric(provider[["DATA_YIELDED_HOURS_RATIO_THRESHOLD"]])
    drop_highly_correlated_features <- provider[["DROP_HIGHLY_CORRELATED_FEATURES"]]

    # Number of distinct non-NA values of a column
    count_distinct <- function(column) length(unique(column[!is.na(column)]))

    # Impute selected event features
    if(as.logical(impute_selected_event_features[["COMPUTE"]])){
        if(!"phone_data_yield_rapids_ratiovalidyieldedminutes" %in% colnames(clean_features)){
            stop("Error: RAPIDS provider needs to impute the selected event features based on phone_data_yield_rapids_ratiovalidyieldedminutes column, please set config[PHONE_DATA_YIELD][PROVIDERS][RAPIDS][COMPUTE] to True and include 'ratiovalidyieldedminutes' in [FEATURES].")
        }
        # Coerced like every other threshold read from the provider configuration
        min_data_yielded_minutes_to_impute <- as.numeric(impute_selected_event_features[["MIN_DATA_YIELDED_MINUTES_TO_IMPUTE"]])

        event_feature_patterns <- c(
            "^phone_applications_foreground_rapids_(countevent|countepisode|minduration|maxduration|meanduration|sumduration)",
            "^phone_battery_rapids_",
            "^phone_calls_rapids_.*_(count|distinctcontacts|sumduration|minduration|maxduration|meanduration|modeduration)",
            "^phone_keyboard_rapids_(sessioncount|averagesessionlength|changeintextlengthlessthanminusone|changeintextlengthequaltominusone|changeintextlengthequaltoone|changeintextlengthmorethanone|maxtextlength|totalkeyboardtouches)",
            "^phone_messages_rapids_.*_(count|distinctcontacts)",
            "^phone_screen_rapids_(sumduration|maxduration|minduration|avgduration|countepisode)",
            "^phone_wifi_(connected|visible)_rapids_"
        )
        column_names <- colnames(clean_features)
        selected_columns <- unlist(lapply(event_feature_patterns, function(pattern) column_names[grepl(pattern, column_names)]))

        # Only rows with enough valid yielded minutes are imputed; rows with an NA ratio are left untouched
        yielded_enough_minutes <- clean_features$phone_data_yield_rapids_ratiovalidyieldedminutes > min_data_yielded_minutes_to_impute
        rows_to_impute <- !is.na(yielded_enough_minutes) & yielded_enough_minutes

        # Looping per column also copes with an empty selection
        for(column in selected_columns){
            needs_imputation <- rows_to_impute & is.na(clean_features[[column]])
            if(any(needs_imputation))
                clean_features[[column]][needs_imputation] <- 0
        }
    }

    # Drop rows with the value of "phone_data_yield_rapids_ratiovalidyieldedhours" column less than data_yielded_hours_ratio_threshold
    if(!"phone_data_yield_rapids_ratiovalidyieldedhours" %in% colnames(clean_features)){
        stop("Error: RAPIDS provider needs to clean data based on phone_data_yield_rapids_ratiovalidyieldedhours column, please set config[PHONE_DATA_YIELD][PROVIDERS][RAPIDS][COMPUTE] to True and include 'ratiovalidyieldedhours' in [FEATURES].")
    }
    # which() discards rows whose ratio is NA
    rows_with_enough_hours <- which(clean_features$phone_data_yield_rapids_ratiovalidyieldedhours >= data_yielded_hours_ratio_threshold)
    clean_features <- clean_features[rows_with_enough_hours, , drop = FALSE]
    rownames(clean_features) <- NULL

    # Drop columns with a percentage of NA values above cols_nan_threshold
    if(nrow(clean_features) > 0){
        cols_nan_ratio <- colMeans(is.na(clean_features))
        clean_features <- clean_features[, cols_nan_ratio <= cols_nan_threshold, drop = FALSE]
    }

    # Drop columns with zero variance
    if(drop_zero_variance_columns){
        # The identifier pattern is anchored on purpose: an unanchored "pid" also matches every
        # column name containing "rapids", which would protect all RAPIDS features from this step
        is_identifier <- grepl("^(pid|local_segment)", names(clean_features))
        has_variance <- vapply(clean_features, count_distinct, integer(1)) > 1
        clean_features <- clean_features[, is_identifier | has_variance, drop = FALSE]
    }

    # Drop highly correlated features
    if(as.logical(drop_highly_correlated_features[["COMPUTE"]])){
        min_overlap_for_corr_threshold <- as.numeric(drop_highly_correlated_features[["MIN_OVERLAP_FOR_CORR_THRESHOLD"]])
        corr_threshold <- as.numeric(drop_highly_correlated_features[["CORR_THRESHOLD"]])

        is_corr_candidate <- vapply(clean_features, function(column) is.numeric(column) && count_distinct(column) > 1, logical(1))
        features_for_corr <- clean_features[, is_corr_candidate, drop = FALSE]

        # With fewer than two candidate columns there are no pairs to compare, so the step is skipped
        if(nrow(features_for_corr) > 0 && ncol(features_for_corr) > 1){
            # A pair is valid when enough rows have both features observed
            valid_pairs <- crossprod(!is.na(features_for_corr)) >= min_overlap_for_corr_threshold * nrow(features_for_corr)

            correlation_matrix <- features_for_corr %>%
                correlate(use = "pairwise.complete.obs", method = "spearman") %>%
                column_to_rownames(var = "term") %>%
                as.matrix()
            # Invalid pairs and undefined correlations count as uncorrelated
            correlation_matrix[!valid_pairs | is.na(correlation_matrix)] <- 0

            highly_correlated_features <- findCorrelation(correlation_matrix, cutoff = corr_threshold, verbose = FALSE, names = TRUE)

            # drop = FALSE keeps a data frame even when a single column survives
            clean_features <- clean_features[, !names(clean_features) %in% highly_correlated_features, drop = FALSE]
        }
    }

    # Drop rows with a percentage of NA values above rows_nan_threshold
    rows_nan_ratio <- rowSums(is.na(clean_features)) / ncol(clean_features)
    clean_features <- clean_features[which(rows_nan_ratio <= rows_nan_threshold), , drop = FALSE]
    rownames(clean_features) <- NULL

    return(clean_features)
}

View File

@ -0,0 +1,85 @@
source("renv/activate.R")
library(tidyr)
library("dplyr", warn.conflicts = F)
library(tidyverse)
library(caret)
library(corrr)
# Clean a merged sensor-feature table with the RAPIDS cleaning provider.
#
# Arguments:
#   sensor_data_files: list-like object; its "sensor_data" entry is the path of the CSV file to clean.
#   provider: provider configuration (list) read with the keys IMPUTE_SELECTED_EVENT_FEATURES,
#             COLS_NAN_THRESHOLD, COLS_VAR_THRESHOLD, ROWS_NAN_THRESHOLD,
#             DATA_YIELDED_HOURS_RATIO_THRESHOLD and DROP_HIGHLY_CORRELATED_FEATURES.
#
# Returns: the cleaned features as a data frame.
#
# The steps run in this order: impute selected event features, drop rows with a low ratio of valid
# yielded hours, drop columns with too many NAs, drop zero variance columns, drop highly correlated
# features, and finally drop rows with too many NAs.
rapids_cleaning <- function(sensor_data_files, provider){

    clean_features <- read.csv(sensor_data_files[["sensor_data"]], stringsAsFactors = FALSE)

    impute_selected_event_features <- provider[["IMPUTE_SELECTED_EVENT_FEATURES"]]
    cols_nan_threshold <- as.numeric(provider[["COLS_NAN_THRESHOLD"]])
    drop_zero_variance_columns <- as.logical(provider[["COLS_VAR_THRESHOLD"]])
    rows_nan_threshold <- as.numeric(provider[["ROWS_NAN_THRESHOLD"]])
    data_yielded_hours_ratio_threshold <- as.numeric(provider[["DATA_YIELDED_HOURS_RATIO_THRESHOLD"]])
    drop_highly_correlated_features <- provider[["DROP_HIGHLY_CORRELATED_FEATURES"]]

    # Number of distinct non-NA values of a column
    count_distinct <- function(column) length(unique(column[!is.na(column)]))

    # Impute selected event features
    if(as.logical(impute_selected_event_features[["COMPUTE"]])){
        if(!"phone_data_yield_rapids_ratiovalidyieldedminutes" %in% colnames(clean_features)){
            stop("Error: RAPIDS provider needs to impute the selected event features based on phone_data_yield_rapids_ratiovalidyieldedminutes column, please set config[PHONE_DATA_YIELD][PROVIDERS][RAPIDS][COMPUTE] to True and include 'ratiovalidyieldedminutes' in [FEATURES].")
        }
        # Coerced like every other threshold read from the provider configuration
        min_data_yielded_minutes_to_impute <- as.numeric(impute_selected_event_features[["MIN_DATA_YIELDED_MINUTES_TO_IMPUTE"]])

        event_feature_patterns <- c(
            "^phone_applications_foreground_rapids_(countevent|countepisode|minduration|maxduration|meanduration|sumduration)",
            "^phone_battery_rapids_",
            "^phone_calls_rapids_.*_(count|distinctcontacts|sumduration|minduration|maxduration|meanduration|modeduration)",
            "^phone_keyboard_rapids_(sessioncount|averagesessionlength|changeintextlengthlessthanminusone|changeintextlengthequaltominusone|changeintextlengthequaltoone|changeintextlengthmorethanone|maxtextlength|totalkeyboardtouches)",
            "^phone_messages_rapids_.*_(count|distinctcontacts)",
            "^phone_screen_rapids_(sumduration|maxduration|minduration|avgduration|countepisode)",
            "^phone_wifi_(connected|visible)_rapids_"
        )
        column_names <- colnames(clean_features)
        selected_columns <- unlist(lapply(event_feature_patterns, function(pattern) column_names[grepl(pattern, column_names)]))

        # Only rows with enough valid yielded minutes are imputed; rows with an NA ratio are left untouched
        yielded_enough_minutes <- clean_features$phone_data_yield_rapids_ratiovalidyieldedminutes > min_data_yielded_minutes_to_impute
        rows_to_impute <- !is.na(yielded_enough_minutes) & yielded_enough_minutes

        # Looping per column also copes with an empty selection
        for(column in selected_columns){
            needs_imputation <- rows_to_impute & is.na(clean_features[[column]])
            if(any(needs_imputation))
                clean_features[[column]][needs_imputation] <- 0
        }
    }

    # Drop rows with the value of "phone_data_yield_rapids_ratiovalidyieldedhours" column less than data_yielded_hours_ratio_threshold
    if(!"phone_data_yield_rapids_ratiovalidyieldedhours" %in% colnames(clean_features)){
        stop("Error: RAPIDS provider needs to clean data based on phone_data_yield_rapids_ratiovalidyieldedhours column, please set config[PHONE_DATA_YIELD][PROVIDERS][RAPIDS][COMPUTE] to True and include 'ratiovalidyieldedhours' in [FEATURES].")
    }
    # which() discards rows whose ratio is NA
    rows_with_enough_hours <- which(clean_features$phone_data_yield_rapids_ratiovalidyieldedhours >= data_yielded_hours_ratio_threshold)
    clean_features <- clean_features[rows_with_enough_hours, , drop = FALSE]
    rownames(clean_features) <- NULL

    # Drop columns with a percentage of NA values above cols_nan_threshold
    if(nrow(clean_features) > 0){
        cols_nan_ratio <- colMeans(is.na(clean_features))
        clean_features <- clean_features[, cols_nan_ratio <= cols_nan_threshold, drop = FALSE]
    }

    # Drop columns with zero variance
    if(drop_zero_variance_columns){
        # The identifier pattern is anchored on purpose: an unanchored "pid" also matches every
        # column name containing "rapids", which would protect all RAPIDS features from this step
        is_identifier <- grepl("^(pid|local_segment)", names(clean_features))
        has_variance <- vapply(clean_features, count_distinct, integer(1)) > 1
        clean_features <- clean_features[, is_identifier | has_variance, drop = FALSE]
    }

    # Drop highly correlated features
    if(as.logical(drop_highly_correlated_features[["COMPUTE"]])){
        min_overlap_for_corr_threshold <- as.numeric(drop_highly_correlated_features[["MIN_OVERLAP_FOR_CORR_THRESHOLD"]])
        corr_threshold <- as.numeric(drop_highly_correlated_features[["CORR_THRESHOLD"]])

        is_corr_candidate <- vapply(clean_features, function(column) is.numeric(column) && count_distinct(column) > 1, logical(1))
        features_for_corr <- clean_features[, is_corr_candidate, drop = FALSE]

        # With fewer than two candidate columns there are no pairs to compare, so the step is skipped
        if(nrow(features_for_corr) > 0 && ncol(features_for_corr) > 1){
            # A pair is valid when enough rows have both features observed
            valid_pairs <- crossprod(!is.na(features_for_corr)) >= min_overlap_for_corr_threshold * nrow(features_for_corr)

            correlation_matrix <- features_for_corr %>%
                correlate(use = "pairwise.complete.obs", method = "spearman") %>%
                column_to_rownames(var = "term") %>%
                as.matrix()
            # Invalid pairs and undefined correlations count as uncorrelated
            correlation_matrix[!valid_pairs | is.na(correlation_matrix)] <- 0

            highly_correlated_features <- findCorrelation(correlation_matrix, cutoff = corr_threshold, verbose = FALSE, names = TRUE)

            # drop = FALSE keeps a data frame even when a single column survives
            clean_features <- clean_features[, !names(clean_features) %in% highly_correlated_features, drop = FALSE]
        }
    }

    # Drop rows with a percentage of NA values above rows_nan_threshold
    rows_nan_ratio <- rowSums(is.na(clean_features)) / ncol(clean_features)
    clean_features <- clean_features[which(rows_nan_ratio <= rows_nan_threshold), , drop = FALSE]
    rownames(clean_features) <- NULL

    return(clean_features)
}

View File

@ -4,13 +4,19 @@ library("dplyr",warn.conflicts = F)
library("tidyr") library("tidyr")
sensor_data_files <- snakemake@input sensor_data_files <- snakemake@input
sensor_data_files$time_segments_labels <- NULL
time_segments_file <- snakemake@input[["time_segments_labels"]]
provider <- snakemake@params["provider"][["provider"]] provider <- snakemake@params["provider"][["provider"]]
provider_key <- snakemake@params["provider_key"] provider_key <- snakemake@params["provider_key"]
sensor_key <- snakemake@params["sensor_key"] sensor_key <- snakemake@params["sensor_key"]
if("time_segments_labels" %in% names(sensor_data_files)){
# Extract sensor features
sensor_data_files$time_segments_labels <- NULL
time_segments_file <- snakemake@input[["time_segments_labels"]]
sensor_features <- fetch_provider_features(provider, provider_key, sensor_key, sensor_data_files, time_segments_file) sensor_features <- fetch_provider_features(provider, provider_key, sensor_key, sensor_data_files, time_segments_file)
}else{
# Data cleaning
sensor_features = run_provider_cleaning_script(provider, provider_key, sensor_key, sensor_data_files)
}
write.csv(sensor_features, snakemake@output[[1]], row.names = FALSE) write.csv(sensor_features, snakemake@output[[1]], row.names = FALSE)

View File

@ -1,14 +1,19 @@
import pandas as pd import pandas as pd
from utils.utils import fetch_provider_features from utils.utils import fetch_provider_features, run_provider_cleaning_script
sensor_data_files = dict(snakemake.input) sensor_data_files = dict(snakemake.input)
del sensor_data_files["time_segments_labels"]
time_segments_file = snakemake.input["time_segments_labels"]
provider = snakemake.params["provider"] provider = snakemake.params["provider"]
provider_key = snakemake.params["provider_key"] provider_key = snakemake.params["provider_key"]
sensor_key = snakemake.params["sensor_key"] sensor_key = snakemake.params["sensor_key"]
if "time_segments_labels" in sensor_data_files.keys():
# Extract sensor features
del sensor_data_files["time_segments_labels"]
time_segments_file = snakemake.input["time_segments_labels"]
sensor_features = fetch_provider_features(provider, provider_key, sensor_key, sensor_data_files, time_segments_file) sensor_features = fetch_provider_features(provider, provider_key, sensor_key, sensor_data_files, time_segments_file)
else:
# Data cleaning
sensor_features = run_provider_cleaning_script(provider, provider_key, sensor_key, sensor_data_files)
sensor_features.to_csv(snakemake.output[0], index=False) sensor_features.to_csv(snakemake.output[0], index=False)

View File

@ -1,54 +0,0 @@
source("renv/activate.R")
library(tidyr)
library("dplyr", warn.conflicts = F)
library(tidyverse)
library(caret)
library(corrr)
# Read the merged sensor features and the cleaning thresholds handed over through the snakemake object
clean_features <- read.csv(snakemake@input[[1]])
cols_nan_threshold <- as.numeric(snakemake@params[["cols_nan_threshold"]])
drop_zero_variance_columns <- as.logical(snakemake@params[["cols_var_threshold"]])
rows_nan_threshold <- as.numeric(snakemake@params[["rows_nan_threshold"]])
data_yielded_hours_ratio_threshold <- as.numeric(snakemake@params[["data_yielded_hours_ratio_threshold"]])
corr_valid_pairs_threshold <- as.numeric(snakemake@params[["corr_valid_pairs_threshold"]])
corr_threshold <- as.numeric(snakemake@params[["corr_threshold"]])

# drop rows with the value of "phone_data_yield_rapids_ratiovalidyieldedhours" column less than or equal to data_yielded_hours_ratio_threshold
clean_features <- clean_features %>%
    filter(phone_data_yield_rapids_ratiovalidyieldedhours > data_yielded_hours_ratio_threshold)

# drop columns with a percentage of NA values above cols_nan_threshold
if(nrow(clean_features) > 0){
    clean_features <- clean_features %>% select_if(~ sum(is.na(.)) / length(.) <= cols_nan_threshold)
}

# drop columns with zero variance
# the identifier pattern is anchored on purpose: an unanchored "pid" also matches every column
# name containing "rapids", which would protect all RAPIDS features from this step
if(drop_zero_variance_columns){
    clean_features <- clean_features %>%
        select_if(grepl("^(pid|local_segment)", names(.)) | vapply(., n_distinct, numeric(1), na.rm = TRUE) > 1)
}

# drop highly correlated features
features_for_corr <- clean_features %>%
    select_if(is.numeric) %>%
    select_if(vapply(., n_distinct, numeric(1), na.rm = TRUE) > 1)

# with fewer than two candidate columns there are no pairs to compare, so the step is skipped
if(nrow(features_for_corr) > 0 && ncol(features_for_corr) > 1){

    # a pair is valid when enough rows have both features observed
    valid_pairs <- crossprod(!is.na(features_for_corr)) >= corr_valid_pairs_threshold * nrow(features_for_corr)

    # invalid pairs and undefined correlations count as uncorrelated
    highly_correlated_features <- features_for_corr %>%
        correlate(use = "pairwise.complete.obs", method = "spearman") %>%
        column_to_rownames(., var = "term") %>%
        as.matrix() %>%
        replace(!valid_pairs | is.na(.), 0) %>%
        findCorrelation(., cutoff = corr_threshold, verbose = FALSE, names = TRUE)

    # drop = FALSE keeps a data frame even when a single column survives
    clean_features <- clean_features[, !names(clean_features) %in% highly_correlated_features, drop = FALSE]
}

# drop rows with a percentage of NA values above rows_nan_threshold
clean_features <- clean_features %>%
    mutate(percentage_na = rowSums(is.na(.)) / ncol(.)) %>%
    filter(percentage_na <= rows_nan_threshold) %>%
    select(-percentage_na)

write.csv(clean_features, snakemake@output[[1]], row.names = FALSE)

View File

@ -84,3 +84,13 @@ fetch_provider_features <- function(provider, provider_key, sensor_key, sensor_d
remove = FALSE) remove = FALSE)
return(sensor_features) return(sensor_features)
} }
# Run the cleaning function of a provider.
#
# The provider script named by provider[["SRC_SCRIPT"]] is sourced first; the function called
# "<lowercase provider_key>_cleaning" is then looked up and invoked with the sensor data files
# and the provider configuration. Its result is returned unchanged.
run_provider_cleaning_script <- function(provider, provider_key, sensor_key, sensor_data_files){
    # Sourcing the script makes the provider's cleaning function available for the lookup below
    source(provider[["SRC_SCRIPT"]])

    print(paste(rapids_log_tag, "Processing", sensor_key, provider_key))

    cleaning_function_name <- paste0(tolower(provider_key), "_cleaning")
    provider_cleaner <- match.fun(cleaning_function_name)

    provider_cleaner(sensor_data_files, provider)
}

View File

@ -123,3 +123,13 @@ def fetch_provider_features(provider, provider_key, sensor_key, sensor_data_file
sensor_features.insert(1 + i, segment_colums.columns[i], segment_colums[segment_colums.columns[i]]) sensor_features.insert(1 + i, segment_colums.columns[i], segment_colums[segment_colums.columns[i]])
return sensor_features return sensor_features
def run_provider_cleaning_script(provider, provider_key, sensor_key, sensor_data_files):
    """Run the cleaning function of a provider and return its result.

    The module at ``provider["SRC_SCRIPT"]`` is loaded with ``import_path`` and its
    ``<lowercase provider_key>_cleaning`` attribute is called with
    ``sensor_data_files`` and ``provider``.
    """
    # NOTE(review): neither name is used directly in this function; the import is kept in case
    # import_path relies on importlib.util being loaded - confirm before removing.
    from importlib import import_module, util

    print("{} Processing {} {}".format(rapids_log_tag, sensor_key, provider_key))

    provider_module = import_path(provider["SRC_SCRIPT"])
    cleaning_function_name = provider_key.lower() + "_cleaning"
    clean = getattr(provider_module, cleaning_function_name)
    return clean(sensor_data_files, provider)

View File

@ -36,17 +36,16 @@ required:
- HEATMAP_SENSORS_PER_MINUTE_PER_TIME_SEGMENT - HEATMAP_SENSORS_PER_MINUTE_PER_TIME_SEGMENT
- HEATMAP_SENSOR_ROW_COUNT_PER_TIME_SEGMENT - HEATMAP_SENSOR_ROW_COUNT_PER_TIME_SEGMENT
- HEATMAP_FEATURE_CORRELATION_MATRIX - HEATMAP_FEATURE_CORRELATION_MATRIX
- DATA_CLEANING - ALL_CLEANING_INDIVIDUAL
- ALL_CLEANING_OVERALL
definitions: definitions:
PROVIDER: PROVIDER:
type: object type: object
required: [COMPUTE, SRC_SCRIPT, FEATURES] required: [COMPUTE, SRC_SCRIPT]
properties: properties:
COMPUTE: COMPUTE:
type: boolean type: boolean
FEATURES:
type: [array, object]
SRC_SCRIPT: SRC_SCRIPT:
type: string type: string
pattern: "^.*\\.(py|R)$" pattern: "^.*\\.(py|R)$"
@ -1258,12 +1257,27 @@ properties:
type: string type: string
enum: ["pearson", "kendall", "spearman"] enum: ["pearson", "kendall", "spearman"]
DATA_CLEANING: ALL_CLEANING_INDIVIDUAL:
type: object type: object
required: [COMPUTE, COLS_NAN_THRESHOLD, COLS_VAR_THRESHOLD, ROWS_NAN_THRESHOLD, DATA_YIELDED_HOURS_RATIO_THRESHOLD, CORR_VALID_PAIRS_THRESHOLD, CORR_THRESHOLD] required: [PROVIDERS]
properties:
PROVIDERS:
type: ["null", object]
properties:
RAPIDS:
allOf:
- $ref: "#/definitions/PROVIDER"
- properties:
IMPUTE_SELECTED_EVENT_FEATURES:
type: object
required: [COMPUTE, MIN_DATA_YIELDED_MINUTES_TO_IMPUTE]
properties: properties:
COMPUTE: COMPUTE:
type: boolean type: boolean
MIN_DATA_YIELDED_MINUTES_TO_IMPUTE:
type: number
minimum: 0
maximum: 1
COLS_NAN_THRESHOLD: COLS_NAN_THRESHOLD:
type: number type: number
minimum: 0 minimum: 0
@ -1278,7 +1292,63 @@ properties:
type: number type: number
minimum: 0 minimum: 0
maximum: 1 maximum: 1
CORR_VALID_PAIRS_THRESHOLD: DROP_HIGHLY_CORRELATED_FEATURES:
type: object
required: [COMPUTE, MIN_OVERLAP_FOR_CORR_THRESHOLD, CORR_THRESHOLD]
properties:
COMPUTE:
type: boolean
MIN_OVERLAP_FOR_CORR_THRESHOLD:
type: number
minimum: 0
maximum: 1
CORR_THRESHOLD:
type: number
minimum: 0
maximum: 1
ALL_CLEANING_OVERALL:
type: object
required: [PROVIDERS]
properties:
PROVIDERS:
type: ["null", object]
properties:
RAPIDS:
allOf:
- $ref: "#/definitions/PROVIDER"
- properties:
IMPUTE_SELECTED_EVENT_FEATURES:
type: object
required: [COMPUTE, MIN_DATA_YIELDED_MINUTES_TO_IMPUTE]
properties:
COMPUTE:
type: boolean
MIN_DATA_YIELDED_MINUTES_TO_IMPUTE:
type: number
minimum: 0
maximum: 1
COLS_NAN_THRESHOLD:
type: number
minimum: 0
maximum: 1
COLS_VAR_THRESHOLD:
type: boolean
ROWS_NAN_THRESHOLD:
type: number
minimum: 0
maximum: 1
DATA_YIELDED_HOURS_RATIO_THRESHOLD:
type: number
minimum: 0
maximum: 1
DROP_HIGHLY_CORRELATED_FEATURES:
type: object
required: [COMPUTE, MIN_OVERLAP_FOR_CORR_THRESHOLD, CORR_THRESHOLD]
properties:
COMPUTE:
type: boolean
MIN_OVERLAP_FOR_CORR_THRESHOLD:
type: number type: number
minimum: 0 minimum: 0
maximum: 1 maximum: 1