Update data cleaning code
parent
3e7b9260d2
commit
512355ca01
|
@ -395,9 +395,12 @@ if config["HEATMAP_FEATURE_CORRELATION_MATRIX"]["PLOT"]:
|
|||
files_to_compute.append("reports/data_exploration/heatmap_feature_correlation_matrix.html")
|
||||
|
||||
# Data Cleaning
|
||||
if config["DATA_CLEANING"]["COMPUTE"]:
|
||||
files_to_compute.extend(expand("data/processed/features/{pid}/all_sensor_features_cleaned.csv", pid=config["PIDS"]))
|
||||
files_to_compute.append("data/processed/features/all_participants/all_sensor_features_cleaned.csv")
|
||||
for provider in config["ALL_CLEANING_INDIVIDUAL"]["PROVIDERS"].keys():
|
||||
if config["ALL_CLEANING_INDIVIDUAL"]["PROVIDERS"][provider]["COMPUTE"]:
|
||||
files_to_compute.extend(expand("data/processed/features/{pid}/all_sensor_features_cleaned_" + provider.lower() +".csv", pid=config["PIDS"]))
|
||||
for provider in config["ALL_CLEANING_OVERALL"]["PROVIDERS"].keys():
|
||||
if config["ALL_CLEANING_OVERALL"]["PROVIDERS"][provider]["COMPUTE"]:
|
||||
files_to_compute.extend(expand("data/processed/features/all_participants/all_sensor_features_cleaned_" + provider.lower() +".csv"))
|
||||
|
||||
rule all:
|
||||
input:
|
||||
|
|
34
config.yaml
34
config.yaml
|
@ -569,12 +569,36 @@ HEATMAP_FEATURE_CORRELATION_MATRIX:
|
|||
# Data Cleaning #
|
||||
########################################################################################################################
|
||||
|
||||
DATA_CLEANING:
|
||||
ALL_CLEANING_INDIVIDUAL:
|
||||
PROVIDERS:
|
||||
RAPIDS:
|
||||
COMPUTE: False
|
||||
COLS_NAN_THRESHOLD: 0.3
|
||||
IMPUTE_SELECTED_EVENT_FEATURES:
|
||||
COMPUTE: True
|
||||
MIN_DATA_YIELDED_MINUTES_TO_IMPUTE: 0.33
|
||||
COLS_NAN_THRESHOLD: 0.3 # set to 1 to disable
|
||||
COLS_VAR_THRESHOLD: True
|
||||
ROWS_NAN_THRESHOLD: 0.3
|
||||
DATA_YIELDED_HOURS_RATIO_THRESHOLD: 0.75
|
||||
CORR_VALID_PAIRS_THRESHOLD: 0.5
|
||||
ROWS_NAN_THRESHOLD: 0.3 # set to 1 to disable
|
||||
DATA_YIELDED_HOURS_RATIO_THRESHOLD: 0.5 # set to 0 to disable
|
||||
DROP_HIGHLY_CORRELATED_FEATURES:
|
||||
COMPUTE: True
|
||||
MIN_OVERLAP_FOR_CORR_THRESHOLD: 0.5
|
||||
CORR_THRESHOLD: 0.95
|
||||
SRC_SCRIPT: src/features/all_cleaning_individual/rapids/main.R
|
||||
|
||||
ALL_CLEANING_OVERALL:
|
||||
PROVIDERS:
|
||||
RAPIDS:
|
||||
COMPUTE: False
|
||||
IMPUTE_SELECTED_EVENT_FEATURES:
|
||||
COMPUTE: True
|
||||
MIN_DATA_YIELDED_MINUTES_TO_IMPUTE: 0.33
|
||||
COLS_NAN_THRESHOLD: 0.3 # set to 1 to disable
|
||||
COLS_VAR_THRESHOLD: True
|
||||
ROWS_NAN_THRESHOLD: 0.3 # set to 1 to disable
|
||||
DATA_YIELDED_HOURS_RATIO_THRESHOLD: 0.5 # set to 0 to disable
|
||||
DROP_HIGHLY_CORRELATED_FEATURES:
|
||||
COMPUTE: True
|
||||
MIN_OVERLAP_FOR_CORR_THRESHOLD: 0.5
|
||||
CORR_THRESHOLD: 0.95
|
||||
SRC_SCRIPT: src/features/all_cleaning_overall/rapids/main.R
|
||||
|
|
|
@ -961,30 +961,27 @@ rule merge_sensor_features_for_all_participants:
|
|||
|
||||
rule clean_sensor_features_for_individual_participants:
|
||||
input:
|
||||
rules.merge_sensor_features_for_individual_participants.output
|
||||
sensor_data = rules.merge_sensor_features_for_individual_participants.output
|
||||
wildcard_constraints:
|
||||
pid = config["PIDS"]
|
||||
params:
|
||||
cols_nan_threshold = config["DATA_CLEANING"]["COLS_NAN_THRESHOLD"],
|
||||
cols_var_threshold = config["DATA_CLEANING"]["COLS_VAR_THRESHOLD"],
|
||||
rows_nan_threshold = config["DATA_CLEANING"]["ROWS_NAN_THRESHOLD"],
|
||||
data_yielded_hours_ratio_threshold = config["DATA_CLEANING"]["DATA_YIELDED_HOURS_RATIO_THRESHOLD"],
|
||||
corr_valid_pairs_threshold = config["DATA_CLEANING"]["CORR_VALID_PAIRS_THRESHOLD"],
|
||||
corr_threshold = config["DATA_CLEANING"]["CORR_THRESHOLD"]
|
||||
provider = lambda wildcards: config["ALL_CLEANING_INDIVIDUAL"]["PROVIDERS"][wildcards.provider_key.upper()],
|
||||
provider_key = "{provider_key}",
|
||||
sensor_key = "all_cleaning_individual"
|
||||
output:
|
||||
"data/processed/features/{pid}/all_sensor_features_cleaned.csv"
|
||||
"data/processed/features/{pid}/all_sensor_features_cleaned_{provider_key}.csv"
|
||||
script:
|
||||
"../src/features/utils/clean_sensor_features.R"
|
||||
"../src/features/entry.R"
|
||||
|
||||
rule clean_sensor_features_for_all_participants:
|
||||
input:
|
||||
rules.merge_sensor_features_for_all_participants.output
|
||||
sensor_data = rules.merge_sensor_features_for_all_participants.output
|
||||
params:
|
||||
cols_nan_threshold = config["DATA_CLEANING"]["COLS_NAN_THRESHOLD"],
|
||||
cols_var_threshold = config["DATA_CLEANING"]["COLS_VAR_THRESHOLD"],
|
||||
rows_nan_threshold = config["DATA_CLEANING"]["ROWS_NAN_THRESHOLD"],
|
||||
data_yielded_hours_ratio_threshold = config["DATA_CLEANING"]["DATA_YIELDED_HOURS_RATIO_THRESHOLD"],
|
||||
corr_valid_pairs_threshold = config["DATA_CLEANING"]["CORR_VALID_PAIRS_THRESHOLD"],
|
||||
corr_threshold = config["DATA_CLEANING"]["CORR_THRESHOLD"]
|
||||
provider = lambda wildcards: config["ALL_CLEANING_OVERALL"]["PROVIDERS"][wildcards.provider_key.upper()],
|
||||
provider_key = "{provider_key}",
|
||||
sensor_key = "all_cleaning_overall"
|
||||
output:
|
||||
"data/processed/features/all_participants/all_sensor_features_cleaned.csv"
|
||||
"data/processed/features/all_participants/all_sensor_features_cleaned_{provider_key}.csv"
|
||||
script:
|
||||
"../src/features/utils/clean_sensor_features.R"
|
||||
"../src/features/entry.R"
|
||||
|
||||
|
|
|
@ -0,0 +1,85 @@
|
|||
source("renv/activate.R")
|
||||
library(tidyr)
|
||||
library("dplyr", warn.conflicts = F)
|
||||
library(tidyverse)
|
||||
library(caret)
|
||||
library(corrr)
|
||||
|
||||
#' Clean a merged sensor-feature table with the RAPIDS cleaning provider.
#'
#' The steps run in this order:
#'   1. optionally impute NA values of selected event-based features with 0,
#'   2. drop rows whose data-yield hours ratio is below the threshold,
#'   3. drop columns whose NA ratio is above COLS_NAN_THRESHOLD,
#'   4. optionally drop zero-variance columns (identifier columns are kept),
#'   5. optionally drop highly correlated numeric features,
#'   6. drop rows whose NA ratio is above ROWS_NAN_THRESHOLD.
#'
#' @param sensor_data_files Named list; `sensor_data` is the path of the CSV
#'   file to clean.
#' @param provider Named list of settings. Keys read here:
#'   IMPUTE_SELECTED_EVENT_FEATURES (COMPUTE, MIN_DATA_YIELDED_MINUTES_TO_IMPUTE),
#'   COLS_NAN_THRESHOLD, COLS_VAR_THRESHOLD, ROWS_NAN_THRESHOLD,
#'   DATA_YIELDED_HOURS_RATIO_THRESHOLD and DROP_HIGHLY_CORRELATED_FEATURES
#'   (COMPUTE, MIN_OVERLAP_FOR_CORR_THRESHOLD, CORR_THRESHOLD).
#' @return A data frame with the cleaned features.
rapids_cleaning <- function(sensor_data_files, provider){

  clean_features <- read.csv(sensor_data_files[["sensor_data"]], stringsAsFactors = FALSE)
  impute_selected_event_features <- provider[["IMPUTE_SELECTED_EVENT_FEATURES"]]
  cols_nan_threshold <- as.numeric(provider[["COLS_NAN_THRESHOLD"]])
  drop_zero_variance_columns <- as.logical(provider[["COLS_VAR_THRESHOLD"]])
  rows_nan_threshold <- as.numeric(provider[["ROWS_NAN_THRESHOLD"]])
  data_yielded_hours_ratio_threshold <- as.numeric(provider[["DATA_YIELDED_HOURS_RATIO_THRESHOLD"]])
  drop_highly_correlated_features <- provider[["DROP_HIGHLY_CORRELATED_FEATURES"]]

  # Impute selected event features
  if (as.logical(impute_selected_event_features$COMPUTE)) {
    if (!"phone_data_yield_rapids_ratiovalidyieldedminutes" %in% colnames(clean_features)) {
      stop("Error: RAPIDS provider needs to impute the selected event features based on phone_data_yield_rapids_ratiovalidyieldedminutes column, please set config[PHONE_DATA_YIELD][PROVIDERS][RAPIDS][COMPUTE] to True and include 'ratiovalidyieldedminutes' in [FEATURES].")
    }
    # Coerced like the other thresholds so a value arriving as a string still compares numerically
    min_data_yielded_minutes_to_impute <- as.numeric(impute_selected_event_features$MIN_DATA_YIELDED_MINUTES_TO_IMPUTE)

    column_names <- colnames(clean_features)
    # One pattern per sensor (apps, battery, calls, keyboard, messages, screen, wifi)
    event_feature_patterns <- c(
      "^phone_applications_foreground_rapids_(countevent|countepisode|minduration|maxduration|meanduration|sumduration)",
      "^phone_battery_rapids_",
      "^phone_calls_rapids_.*_(count|distinctcontacts|sumduration|minduration|maxduration|meanduration|modeduration)",
      "^phone_keyboard_rapids_(sessioncount|averagesessionlength|changeintextlengthlessthanminusone|changeintextlengthequaltominusone|changeintextlengthequaltoone|changeintextlengthmorethanone|maxtextlength|totalkeyboardtouches)",
      "^phone_messages_rapids_.*_(count|distinctcontacts)",
      "^phone_screen_rapids_(sumduration|maxduration|minduration|avgduration|countepisode)",
      "^phone_wifi_(connected|visible)_rapids_"
    )
    selected_columns <- unlist(lapply(event_feature_patterns, grep, x = column_names, value = TRUE))

    # Nothing to impute when none of the selected features are present
    if (length(selected_columns) > 0) {
      yielded_minutes <- clean_features$phone_data_yield_rapids_ratiovalidyieldedminutes
      # Rows with an unknown (NA) yield are never imputed
      rows_to_impute <- !is.na(yielded_minutes) & yielded_minutes > min_data_yielded_minutes_to_impute
      # The row mask is recycled down each column of the NA matrix
      cells_to_impute <- is.na(clean_features[selected_columns]) & rows_to_impute
      clean_features[selected_columns][cells_to_impute] <- 0
    }
  }

  # Drop rows with the value of "phone_data_yield_rapids_ratiovalidyieldedhours" column less than data_yielded_hours_ratio_threshold
  if (!"phone_data_yield_rapids_ratiovalidyieldedhours" %in% colnames(clean_features)) {
    stop("Error: RAPIDS provider needs to clean data based on phone_data_yield_rapids_ratiovalidyieldedhours column, please set config[PHONE_DATA_YIELD][PROVIDERS][RAPIDS][COMPUTE] to True and include 'ratiovalidyieldedhours' in [FEATURES].")
  }
  clean_features <- clean_features %>%
    filter(phone_data_yield_rapids_ratiovalidyieldedhours >= data_yielded_hours_ratio_threshold)

  # Drop columns with a percentage of NA values above cols_nan_threshold
  # (skipped for an empty table, where the NA ratio is undefined)
  if (nrow(clean_features) > 0) {
    clean_features <- clean_features %>%
      select_if(~ sum(is.na(.)) / length(.) <= cols_nan_threshold)
  }

  # Drop columns with zero variance, always keeping the identifier/segment columns
  if (drop_zero_variance_columns) {
    is_id_column <- grepl("pid|local_segment|local_segment_label|local_segment_start_datetime|local_segment_end_datetime", names(clean_features))
    has_variance <- vapply(clean_features, n_distinct, numeric(1), na.rm = TRUE) > 1
    clean_features <- clean_features[, is_id_column | has_variance, drop = FALSE]
  }

  # Drop highly correlated features
  if (as.logical(drop_highly_correlated_features$COMPUTE)) {

    min_overlap_for_corr_threshold <- as.numeric(drop_highly_correlated_features$MIN_OVERLAP_FOR_CORR_THRESHOLD)
    corr_threshold <- as.numeric(drop_highly_correlated_features$CORR_THRESHOLD)

    is_numeric_column <- vapply(clean_features, is.numeric, logical(1))
    features_for_corr <- clean_features[, is_numeric_column, drop = FALSE]
    varies <- vapply(features_for_corr, n_distinct, numeric(1), na.rm = TRUE) > 1
    features_for_corr <- features_for_corr[, varies, drop = FALSE]

    # At least two features are needed to form a pair; a single column cannot
    # be "highly correlated" with anything, so there is nothing to drop.
    if (nrow(features_for_corr) > 0 && ncol(features_for_corr) > 1) {

      # A pair is only trusted when enough rows have both features observed
      valid_pairs <- crossprod(!is.na(features_for_corr)) >= min_overlap_for_corr_threshold * nrow(features_for_corr)

      highly_correlated_features <- features_for_corr %>%
        correlate(use = "pairwise.complete.obs", method = "spearman") %>%
        column_to_rownames(., var = "term") %>%
        as.matrix() %>%
        # Untrusted or undefined correlations are treated as "not correlated"
        replace(!valid_pairs | is.na(.), 0) %>%
        findCorrelation(., cutoff = corr_threshold, verbose = FALSE, names = TRUE)

      # drop = FALSE keeps a data frame even if a single column remains
      clean_features <- clean_features[, !names(clean_features) %in% highly_correlated_features, drop = FALSE]
    }
  }

  # Drop rows with a percentage of NA values above rows_nan_threshold
  clean_features <- clean_features %>%
    mutate(percentage_na = rowSums(is.na(.)) / ncol(.)) %>%
    filter(percentage_na <= rows_nan_threshold) %>%
    select(-percentage_na)

  return(clean_features)
}
|
||||
|
|
@ -0,0 +1,85 @@
|
|||
source("renv/activate.R")
|
||||
library(tidyr)
|
||||
library("dplyr", warn.conflicts = F)
|
||||
library(tidyverse)
|
||||
library(caret)
|
||||
library(corrr)
|
||||
|
||||
#' Clean a merged sensor-feature table with the RAPIDS cleaning provider.
#'
#' The steps run in this order:
#'   1. optionally impute NA values of selected event-based features with 0,
#'   2. drop rows whose data-yield hours ratio is below the threshold,
#'   3. drop columns whose NA ratio is above COLS_NAN_THRESHOLD,
#'   4. optionally drop zero-variance columns (identifier columns are kept),
#'   5. optionally drop highly correlated numeric features,
#'   6. drop rows whose NA ratio is above ROWS_NAN_THRESHOLD.
#'
#' @param sensor_data_files Named list; `sensor_data` is the path of the CSV
#'   file to clean.
#' @param provider Named list of settings. Keys read here:
#'   IMPUTE_SELECTED_EVENT_FEATURES (COMPUTE, MIN_DATA_YIELDED_MINUTES_TO_IMPUTE),
#'   COLS_NAN_THRESHOLD, COLS_VAR_THRESHOLD, ROWS_NAN_THRESHOLD,
#'   DATA_YIELDED_HOURS_RATIO_THRESHOLD and DROP_HIGHLY_CORRELATED_FEATURES
#'   (COMPUTE, MIN_OVERLAP_FOR_CORR_THRESHOLD, CORR_THRESHOLD).
#' @return A data frame with the cleaned features.
rapids_cleaning <- function(sensor_data_files, provider){

  clean_features <- read.csv(sensor_data_files[["sensor_data"]], stringsAsFactors = FALSE)
  impute_selected_event_features <- provider[["IMPUTE_SELECTED_EVENT_FEATURES"]]
  cols_nan_threshold <- as.numeric(provider[["COLS_NAN_THRESHOLD"]])
  drop_zero_variance_columns <- as.logical(provider[["COLS_VAR_THRESHOLD"]])
  rows_nan_threshold <- as.numeric(provider[["ROWS_NAN_THRESHOLD"]])
  data_yielded_hours_ratio_threshold <- as.numeric(provider[["DATA_YIELDED_HOURS_RATIO_THRESHOLD"]])
  drop_highly_correlated_features <- provider[["DROP_HIGHLY_CORRELATED_FEATURES"]]

  # Impute selected event features
  if (as.logical(impute_selected_event_features$COMPUTE)) {
    if (!"phone_data_yield_rapids_ratiovalidyieldedminutes" %in% colnames(clean_features)) {
      stop("Error: RAPIDS provider needs to impute the selected event features based on phone_data_yield_rapids_ratiovalidyieldedminutes column, please set config[PHONE_DATA_YIELD][PROVIDERS][RAPIDS][COMPUTE] to True and include 'ratiovalidyieldedminutes' in [FEATURES].")
    }
    # Coerced like the other thresholds so a value arriving as a string still compares numerically
    min_data_yielded_minutes_to_impute <- as.numeric(impute_selected_event_features$MIN_DATA_YIELDED_MINUTES_TO_IMPUTE)

    column_names <- colnames(clean_features)
    # One pattern per sensor (apps, battery, calls, keyboard, messages, screen, wifi)
    event_feature_patterns <- c(
      "^phone_applications_foreground_rapids_(countevent|countepisode|minduration|maxduration|meanduration|sumduration)",
      "^phone_battery_rapids_",
      "^phone_calls_rapids_.*_(count|distinctcontacts|sumduration|minduration|maxduration|meanduration|modeduration)",
      "^phone_keyboard_rapids_(sessioncount|averagesessionlength|changeintextlengthlessthanminusone|changeintextlengthequaltominusone|changeintextlengthequaltoone|changeintextlengthmorethanone|maxtextlength|totalkeyboardtouches)",
      "^phone_messages_rapids_.*_(count|distinctcontacts)",
      "^phone_screen_rapids_(sumduration|maxduration|minduration|avgduration|countepisode)",
      "^phone_wifi_(connected|visible)_rapids_"
    )
    selected_columns <- unlist(lapply(event_feature_patterns, grep, x = column_names, value = TRUE))

    # Nothing to impute when none of the selected features are present
    if (length(selected_columns) > 0) {
      yielded_minutes <- clean_features$phone_data_yield_rapids_ratiovalidyieldedminutes
      # Rows with an unknown (NA) yield are never imputed
      rows_to_impute <- !is.na(yielded_minutes) & yielded_minutes > min_data_yielded_minutes_to_impute
      # The row mask is recycled down each column of the NA matrix
      cells_to_impute <- is.na(clean_features[selected_columns]) & rows_to_impute
      clean_features[selected_columns][cells_to_impute] <- 0
    }
  }

  # Drop rows with the value of "phone_data_yield_rapids_ratiovalidyieldedhours" column less than data_yielded_hours_ratio_threshold
  if (!"phone_data_yield_rapids_ratiovalidyieldedhours" %in% colnames(clean_features)) {
    stop("Error: RAPIDS provider needs to clean data based on phone_data_yield_rapids_ratiovalidyieldedhours column, please set config[PHONE_DATA_YIELD][PROVIDERS][RAPIDS][COMPUTE] to True and include 'ratiovalidyieldedhours' in [FEATURES].")
  }
  clean_features <- clean_features %>%
    filter(phone_data_yield_rapids_ratiovalidyieldedhours >= data_yielded_hours_ratio_threshold)

  # Drop columns with a percentage of NA values above cols_nan_threshold
  # (skipped for an empty table, where the NA ratio is undefined)
  if (nrow(clean_features) > 0) {
    clean_features <- clean_features %>%
      select_if(~ sum(is.na(.)) / length(.) <= cols_nan_threshold)
  }

  # Drop columns with zero variance, always keeping the identifier/segment columns
  if (drop_zero_variance_columns) {
    is_id_column <- grepl("pid|local_segment|local_segment_label|local_segment_start_datetime|local_segment_end_datetime", names(clean_features))
    has_variance <- vapply(clean_features, n_distinct, numeric(1), na.rm = TRUE) > 1
    clean_features <- clean_features[, is_id_column | has_variance, drop = FALSE]
  }

  # Drop highly correlated features
  if (as.logical(drop_highly_correlated_features$COMPUTE)) {

    min_overlap_for_corr_threshold <- as.numeric(drop_highly_correlated_features$MIN_OVERLAP_FOR_CORR_THRESHOLD)
    corr_threshold <- as.numeric(drop_highly_correlated_features$CORR_THRESHOLD)

    is_numeric_column <- vapply(clean_features, is.numeric, logical(1))
    features_for_corr <- clean_features[, is_numeric_column, drop = FALSE]
    varies <- vapply(features_for_corr, n_distinct, numeric(1), na.rm = TRUE) > 1
    features_for_corr <- features_for_corr[, varies, drop = FALSE]

    # At least two features are needed to form a pair; a single column cannot
    # be "highly correlated" with anything, so there is nothing to drop.
    if (nrow(features_for_corr) > 0 && ncol(features_for_corr) > 1) {

      # A pair is only trusted when enough rows have both features observed
      valid_pairs <- crossprod(!is.na(features_for_corr)) >= min_overlap_for_corr_threshold * nrow(features_for_corr)

      highly_correlated_features <- features_for_corr %>%
        correlate(use = "pairwise.complete.obs", method = "spearman") %>%
        column_to_rownames(., var = "term") %>%
        as.matrix() %>%
        # Untrusted or undefined correlations are treated as "not correlated"
        replace(!valid_pairs | is.na(.), 0) %>%
        findCorrelation(., cutoff = corr_threshold, verbose = FALSE, names = TRUE)

      # drop = FALSE keeps a data frame even if a single column remains
      clean_features <- clean_features[, !names(clean_features) %in% highly_correlated_features, drop = FALSE]
    }
  }

  # Drop rows with a percentage of NA values above rows_nan_threshold
  clean_features <- clean_features %>%
    mutate(percentage_na = rowSums(is.na(.)) / ncol(.)) %>%
    filter(percentage_na <= rows_nan_threshold) %>%
    select(-percentage_na)

  return(clean_features)
}
|
||||
|
|
@ -4,13 +4,19 @@ library("dplyr",warn.conflicts = F)
|
|||
library("tidyr")
|
||||
|
||||
sensor_data_files <- snakemake@input
|
||||
sensor_data_files$time_segments_labels <- NULL
|
||||
time_segments_file <- snakemake@input[["time_segments_labels"]]
|
||||
|
||||
provider <- snakemake@params["provider"][["provider"]]
|
||||
provider_key <- snakemake@params["provider_key"]
|
||||
sensor_key <- snakemake@params["sensor_key"]
|
||||
|
||||
sensor_features <- fetch_provider_features(provider, provider_key, sensor_key, sensor_data_files, time_segments_file)
|
||||
if("time_segments_labels" %in% names(sensor_data_files)){
|
||||
# Extract sensor features
|
||||
sensor_data_files$time_segments_labels <- NULL
|
||||
time_segments_file <- snakemake@input[["time_segments_labels"]]
|
||||
sensor_features <- fetch_provider_features(provider, provider_key, sensor_key, sensor_data_files, time_segments_file)
|
||||
}else{
|
||||
# Data cleaning
|
||||
sensor_features = run_provider_cleaning_script(provider, provider_key, sensor_key, sensor_data_files)
|
||||
}
|
||||
|
||||
write.csv(sensor_features, snakemake@output[[1]], row.names = FALSE)
|
|
@ -1,14 +1,19 @@
|
|||
import pandas as pd
|
||||
from utils.utils import fetch_provider_features
|
||||
from utils.utils import fetch_provider_features, run_provider_cleaning_script
|
||||
|
||||
sensor_data_files = dict(snakemake.input)
|
||||
del sensor_data_files["time_segments_labels"]
|
||||
time_segments_file = snakemake.input["time_segments_labels"]
|
||||
|
||||
provider = snakemake.params["provider"]
|
||||
provider_key = snakemake.params["provider_key"]
|
||||
sensor_key = snakemake.params["sensor_key"]
|
||||
|
||||
sensor_features = fetch_provider_features(provider, provider_key, sensor_key, sensor_data_files, time_segments_file)
|
||||
if "time_segments_labels" in sensor_data_files.keys():
|
||||
# Extract sensor features
|
||||
del sensor_data_files["time_segments_labels"]
|
||||
time_segments_file = snakemake.input["time_segments_labels"]
|
||||
sensor_features = fetch_provider_features(provider, provider_key, sensor_key, sensor_data_files, time_segments_file)
|
||||
else:
|
||||
# Data cleaning
|
||||
sensor_features = run_provider_cleaning_script(provider, provider_key, sensor_key, sensor_data_files)
|
||||
|
||||
sensor_features.to_csv(snakemake.output[0], index=False)
|
|
@ -1,54 +0,0 @@
|
|||
source("renv/activate.R")
|
||||
library(tidyr)
|
||||
library("dplyr", warn.conflicts = F)
|
||||
library(tidyverse)
|
||||
library(caret)
|
||||
library(corrr)
|
||||
|
||||
|
||||
clean_features <- read.csv(snakemake@input[[1]])
|
||||
cols_nan_threshold <- as.numeric(snakemake@params[["cols_nan_threshold"]])
|
||||
drop_zero_variance_columns <- as.logical(snakemake@params[["cols_var_threshold"]])
|
||||
rows_nan_threshold <- as.numeric(snakemake@params[["rows_nan_threshold"]])
|
||||
data_yielded_hours_ratio_threshold <- as.numeric(snakemake@params[["data_yielded_hours_ratio_threshold"]])
|
||||
corr_valid_pairs_threshold <- as.numeric(snakemake@params[["corr_valid_pairs_threshold"]])
|
||||
corr_threshold <- as.numeric(snakemake@params[["corr_threshold"]])
|
||||
|
||||
# drop rows with the value of "phone_data_yield_rapids_ratiovalidyieldedhours" column less than or equal to data_yielded_hours_ratio_threshold
|
||||
clean_features <- clean_features %>%
|
||||
filter(phone_data_yield_rapids_ratiovalidyieldedhours > data_yielded_hours_ratio_threshold)
|
||||
|
||||
# drop columns with a percentage of NA values above cols_nan_threshold
|
||||
if(nrow(clean_features))
|
||||
clean_features <- clean_features %>% select_if(~ sum(is.na(.)) / length(.) <= cols_nan_threshold )
|
||||
|
||||
if(drop_zero_variance_columns)
|
||||
clean_features <- clean_features %>% select_if(grepl("pid|local_segment|local_segment_label|local_segment_start_datetime|local_segment_end_datetime",names(.)) | sapply(., n_distinct, na.rm = T) > 1)
|
||||
|
||||
# drop highly correlated features
|
||||
features_for_corr <- clean_features %>%
|
||||
select_if(is.numeric) %>%
|
||||
select_if(sapply(., n_distinct, na.rm = T) > 1)
|
||||
|
||||
valid_pairs <- crossprod(!is.na(features_for_corr)) >= corr_valid_pairs_threshold * nrow(features_for_corr)
|
||||
|
||||
if((dim(features_for_corr)[1] != 0) & (dim(features_for_corr)[2] != 0)){
|
||||
|
||||
highly_correlated_features <- features_for_corr %>%
|
||||
correlate(use = "pairwise.complete.obs", method = "spearman") %>%
|
||||
column_to_rownames(., var = "term") %>%
|
||||
as.matrix() %>%
|
||||
replace(!valid_pairs | is.na(.), 0) %>%
|
||||
findCorrelation(., cutoff = corr_threshold, verbose = F, names = T)
|
||||
|
||||
clean_features <- clean_features[, !names(clean_features) %in% highly_correlated_features]
|
||||
|
||||
}
|
||||
|
||||
# drop rows with a percentage of NA values above rows_nan_threshold
|
||||
clean_features <- clean_features %>%
|
||||
mutate(percentage_na = rowSums(is.na(.)) / ncol(.)) %>%
|
||||
filter(percentage_na <= rows_nan_threshold) %>%
|
||||
select(-percentage_na)
|
||||
|
||||
write.csv(clean_features, snakemake@output[[1]], row.names = FALSE)
|
|
@ -84,3 +84,13 @@ fetch_provider_features <- function(provider, provider_key, sensor_key, sensor_d
|
|||
remove = FALSE)
|
||||
return(sensor_features)
|
||||
}
|
||||
|
||||
#' Run the cleaning function shipped by a provider's script.
#'
#' Sources the file named in provider[["SRC_SCRIPT"]], logs which sensor and
#' provider are being processed, then calls the function named
#' "<lower-cased provider_key>_cleaning" with the input files and the provider
#' settings.
#'
#' @param provider Named list of provider settings; must contain SRC_SCRIPT.
#' @param provider_key Provider name; lower-cased to build the function name.
#' @param sensor_key Sensor identifier, used only in the log line.
#' @param sensor_data_files Input files, forwarded to the cleaning function.
#' @return Whatever the provider's cleaning function returns.
run_provider_cleaning_script <- function(provider, provider_key, sensor_key, sensor_data_files){
  script_path <- provider[["SRC_SCRIPT"]]
  source(script_path)

  log_line <- paste(rapids_log_tag, "Processing", sensor_key, provider_key)
  print(log_line)

  # The sourced script is expected to define this function by naming convention
  cleaning_function_name <- paste0(tolower(provider_key), "_cleaning")
  run_cleaning <- match.fun(cleaning_function_name)

  cleaned <- run_cleaning(sensor_data_files, provider)
  return(cleaned)
}
|
||||
|
|
|
@ -123,3 +123,13 @@ def fetch_provider_features(provider, provider_key, sensor_key, sensor_data_file
|
|||
sensor_features.insert(1 + i, segment_colums.columns[i], segment_colums[segment_colums.columns[i]])
|
||||
|
||||
return sensor_features
|
||||
|
||||
def run_provider_cleaning_script(provider, provider_key, sensor_key, sensor_data_files):
    """Run the cleaning function shipped by a provider's script.

    Loads the module at provider["SRC_SCRIPT"], looks up the function named
    "<lower-cased provider_key>_cleaning" in it, and calls that function with
    the input files and the provider settings.

    Args:
        provider: mapping of provider settings; must contain "SRC_SCRIPT".
        provider_key: provider name; lower-cased to build the function name.
        sensor_key: sensor identifier, used only in the log line.
        sensor_data_files: input files, forwarded to the cleaning function.

    Returns:
        Whatever the provider's cleaning function returns.
    """
    from importlib import import_module, util

    log_line = "{} Processing {} {}".format(rapids_log_tag, sensor_key, provider_key)
    print(log_line)

    script_path = provider["SRC_SCRIPT"]
    module = import_path(script_path)

    # The provider script is expected to define this function by naming convention
    function_name = provider_key.lower() + "_cleaning"
    run_cleaning = getattr(module, function_name)

    return run_cleaning(sensor_data_files, provider)
|
||||
|
|
|
@ -36,17 +36,16 @@ required:
|
|||
- HEATMAP_SENSORS_PER_MINUTE_PER_TIME_SEGMENT
|
||||
- HEATMAP_SENSOR_ROW_COUNT_PER_TIME_SEGMENT
|
||||
- HEATMAP_FEATURE_CORRELATION_MATRIX
|
||||
- DATA_CLEANING
|
||||
- ALL_CLEANING_INDIVIDUAL
|
||||
- ALL_CLEANING_OVERALL
|
||||
|
||||
definitions:
|
||||
PROVIDER:
|
||||
type: object
|
||||
required: [COMPUTE, SRC_SCRIPT, FEATURES]
|
||||
required: [COMPUTE, SRC_SCRIPT]
|
||||
properties:
|
||||
COMPUTE:
|
||||
type: boolean
|
||||
FEATURES:
|
||||
type: [array, object]
|
||||
SRC_SCRIPT:
|
||||
type: string
|
||||
pattern: "^.*\\.(py|R)$"
|
||||
|
@ -1258,12 +1257,27 @@ properties:
|
|||
type: string
|
||||
enum: ["pearson", "kendall", "spearman"]
|
||||
|
||||
DATA_CLEANING:
|
||||
ALL_CLEANING_INDIVIDUAL:
|
||||
type: object
|
||||
required: [COMPUTE, COLS_NAN_THRESHOLD, COLS_VAR_THRESHOLD, ROWS_NAN_THRESHOLD, DATA_YIELDED_HOURS_RATIO_THRESHOLD, CORR_VALID_PAIRS_THRESHOLD, CORR_THRESHOLD]
|
||||
required: [PROVIDERS]
|
||||
properties:
|
||||
PROVIDERS:
|
||||
type: ["null", object]
|
||||
properties:
|
||||
RAPIDS:
|
||||
allOf:
|
||||
- $ref: "#/definitions/PROVIDER"
|
||||
- properties:
|
||||
IMPUTE_SELECTED_EVENT_FEATURES:
|
||||
type: object
|
||||
required: [COMPUTE, MIN_DATA_YIELDED_MINUTES_TO_IMPUTE]
|
||||
properties:
|
||||
COMPUTE:
|
||||
type: boolean
|
||||
MIN_DATA_YIELDED_MINUTES_TO_IMPUTE:
|
||||
type: number
|
||||
minimum: 0
|
||||
maximum: 1
|
||||
COLS_NAN_THRESHOLD:
|
||||
type: number
|
||||
minimum: 0
|
||||
|
@ -1278,7 +1292,63 @@ properties:
|
|||
type: number
|
||||
minimum: 0
|
||||
maximum: 1
|
||||
CORR_VALID_PAIRS_THRESHOLD:
|
||||
DROP_HIGHLY_CORRELATED_FEATURES:
|
||||
type: object
|
||||
required: [COMPUTE, MIN_OVERLAP_FOR_CORR_THRESHOLD, CORR_THRESHOLD]
|
||||
properties:
|
||||
COMPUTE:
|
||||
type: boolean
|
||||
MIN_OVERLAP_FOR_CORR_THRESHOLD:
|
||||
type: number
|
||||
minimum: 0
|
||||
maximum: 1
|
||||
CORR_THRESHOLD:
|
||||
type: number
|
||||
minimum: 0
|
||||
maximum: 1
|
||||
|
||||
ALL_CLEANING_OVERALL:
|
||||
type: object
|
||||
required: [PROVIDERS]
|
||||
properties:
|
||||
PROVIDERS:
|
||||
type: ["null", object]
|
||||
properties:
|
||||
RAPIDS:
|
||||
allOf:
|
||||
- $ref: "#/definitions/PROVIDER"
|
||||
- properties:
|
||||
IMPUTE_SELECTED_EVENT_FEATURES:
|
||||
type: object
|
||||
required: [COMPUTE, MIN_DATA_YIELDED_MINUTES_TO_IMPUTE]
|
||||
properties:
|
||||
COMPUTE:
|
||||
type: boolean
|
||||
MIN_DATA_YIELDED_MINUTES_TO_IMPUTE:
|
||||
type: number
|
||||
minimum: 0
|
||||
maximum: 1
|
||||
COLS_NAN_THRESHOLD:
|
||||
type: number
|
||||
minimum: 0
|
||||
maximum: 1
|
||||
COLS_VAR_THRESHOLD:
|
||||
type: boolean
|
||||
ROWS_NAN_THRESHOLD:
|
||||
type: number
|
||||
minimum: 0
|
||||
maximum: 1
|
||||
DATA_YIELDED_HOURS_RATIO_THRESHOLD:
|
||||
type: number
|
||||
minimum: 0
|
||||
maximum: 1
|
||||
DROP_HIGHLY_CORRELATED_FEATURES:
|
||||
type: object
|
||||
required: [COMPUTE, MIN_OVERLAP_FOR_CORR_THRESHOLD, CORR_THRESHOLD]
|
||||
properties:
|
||||
COMPUTE:
|
||||
type: boolean
|
||||
MIN_OVERLAP_FOR_CORR_THRESHOLD:
|
||||
type: number
|
||||
minimum: 0
|
||||
maximum: 1
|
||||
|
|
Loading…
Reference in New Issue