Update data cleaning code
parent
3e7b9260d2
commit
512355ca01
|
@ -395,9 +395,12 @@ if config["HEATMAP_FEATURE_CORRELATION_MATRIX"]["PLOT"]:
|
||||||
files_to_compute.append("reports/data_exploration/heatmap_feature_correlation_matrix.html")
|
files_to_compute.append("reports/data_exploration/heatmap_feature_correlation_matrix.html")
|
||||||
|
|
||||||
# Data Cleaning
|
# Data Cleaning
|
||||||
if config["DATA_CLEANING"]["COMPUTE"]:
|
for provider in config["ALL_CLEANING_INDIVIDUAL"]["PROVIDERS"].keys():
|
||||||
files_to_compute.extend(expand("data/processed/features/{pid}/all_sensor_features_cleaned.csv", pid=config["PIDS"]))
|
if config["ALL_CLEANING_INDIVIDUAL"]["PROVIDERS"][provider]["COMPUTE"]:
|
||||||
files_to_compute.append("data/processed/features/all_participants/all_sensor_features_cleaned.csv")
|
files_to_compute.extend(expand("data/processed/features/{pid}/all_sensor_features_cleaned_" + provider.lower() +".csv", pid=config["PIDS"]))
|
||||||
|
for provider in config["ALL_CLEANING_OVERALL"]["PROVIDERS"].keys():
|
||||||
|
if config["ALL_CLEANING_OVERALL"]["PROVIDERS"][provider]["COMPUTE"]:
|
||||||
|
files_to_compute.extend(expand("data/processed/features/all_participants/all_sensor_features_cleaned_" + provider.lower() +".csv"))
|
||||||
|
|
||||||
rule all:
|
rule all:
|
||||||
input:
|
input:
|
||||||
|
|
40
config.yaml
40
config.yaml
|
@ -569,12 +569,36 @@ HEATMAP_FEATURE_CORRELATION_MATRIX:
|
||||||
# Data Cleaning #
|
# Data Cleaning #
|
||||||
########################################################################################################################
|
########################################################################################################################
|
||||||
|
|
||||||
DATA_CLEANING:
|
ALL_CLEANING_INDIVIDUAL:
|
||||||
COMPUTE: False
|
PROVIDERS:
|
||||||
COLS_NAN_THRESHOLD: 0.3
|
RAPIDS:
|
||||||
COLS_VAR_THRESHOLD: True
|
COMPUTE: False
|
||||||
ROWS_NAN_THRESHOLD: 0.3
|
IMPUTE_SELECTED_EVENT_FEATURES:
|
||||||
DATA_YIELDED_HOURS_RATIO_THRESHOLD: 0.75
|
COMPUTE: True
|
||||||
CORR_VALID_PAIRS_THRESHOLD: 0.5
|
MIN_DATA_YIELDED_MINUTES_TO_IMPUTE: 0.33
|
||||||
CORR_THRESHOLD: 0.95
|
COLS_NAN_THRESHOLD: 0.3 # set to 1 to disable
|
||||||
|
COLS_VAR_THRESHOLD: True
|
||||||
|
ROWS_NAN_THRESHOLD: 0.3 # set to 1 to disable
|
||||||
|
DATA_YIELDED_HOURS_RATIO_THRESHOLD: 0.5 # set to 0 to disable
|
||||||
|
DROP_HIGHLY_CORRELATED_FEATURES:
|
||||||
|
COMPUTE: True
|
||||||
|
MIN_OVERLAP_FOR_CORR_THRESHOLD: 0.5
|
||||||
|
CORR_THRESHOLD: 0.95
|
||||||
|
SRC_SCRIPT: src/features/all_cleaning_individual/rapids/main.R
|
||||||
|
|
||||||
|
ALL_CLEANING_OVERALL:
|
||||||
|
PROVIDERS:
|
||||||
|
RAPIDS:
|
||||||
|
COMPUTE: False
|
||||||
|
IMPUTE_SELECTED_EVENT_FEATURES:
|
||||||
|
COMPUTE: True
|
||||||
|
MIN_DATA_YIELDED_MINUTES_TO_IMPUTE: 0.33
|
||||||
|
COLS_NAN_THRESHOLD: 0.3 # set to 1 to disable
|
||||||
|
COLS_VAR_THRESHOLD: True
|
||||||
|
ROWS_NAN_THRESHOLD: 0.3 # set to 1 to disable
|
||||||
|
DATA_YIELDED_HOURS_RATIO_THRESHOLD: 0.5 # set to 0 to disable
|
||||||
|
DROP_HIGHLY_CORRELATED_FEATURES:
|
||||||
|
COMPUTE: True
|
||||||
|
MIN_OVERLAP_FOR_CORR_THRESHOLD: 0.5
|
||||||
|
CORR_THRESHOLD: 0.95
|
||||||
|
SRC_SCRIPT: src/features/all_cleaning_overall/rapids/main.R
|
||||||
|
|
|
@ -961,30 +961,27 @@ rule merge_sensor_features_for_all_participants:
|
||||||
|
|
||||||
rule clean_sensor_features_for_individual_participants:
|
rule clean_sensor_features_for_individual_participants:
|
||||||
input:
|
input:
|
||||||
rules.merge_sensor_features_for_individual_participants.output
|
sensor_data = rules.merge_sensor_features_for_individual_participants.output
|
||||||
|
wildcard_constraints:
|
||||||
|
pid = config["PIDS"]
|
||||||
params:
|
params:
|
||||||
cols_nan_threshold = config["DATA_CLEANING"]["COLS_NAN_THRESHOLD"],
|
provider = lambda wildcards: config["ALL_CLEANING_INDIVIDUAL"]["PROVIDERS"][wildcards.provider_key.upper()],
|
||||||
cols_var_threshold = config["DATA_CLEANING"]["COLS_VAR_THRESHOLD"],
|
provider_key = "{provider_key}",
|
||||||
rows_nan_threshold = config["DATA_CLEANING"]["ROWS_NAN_THRESHOLD"],
|
sensor_key = "all_cleaning_individual"
|
||||||
data_yielded_hours_ratio_threshold = config["DATA_CLEANING"]["DATA_YIELDED_HOURS_RATIO_THRESHOLD"],
|
|
||||||
corr_valid_pairs_threshold = config["DATA_CLEANING"]["CORR_VALID_PAIRS_THRESHOLD"],
|
|
||||||
corr_threshold = config["DATA_CLEANING"]["CORR_THRESHOLD"]
|
|
||||||
output:
|
output:
|
||||||
"data/processed/features/{pid}/all_sensor_features_cleaned.csv"
|
"data/processed/features/{pid}/all_sensor_features_cleaned_{provider_key}.csv"
|
||||||
script:
|
script:
|
||||||
"../src/features/utils/clean_sensor_features.R"
|
"../src/features/entry.R"
|
||||||
|
|
||||||
rule clean_sensor_features_for_all_participants:
|
rule clean_sensor_features_for_all_participants:
|
||||||
input:
|
input:
|
||||||
rules.merge_sensor_features_for_all_participants.output
|
sensor_data = rules.merge_sensor_features_for_all_participants.output
|
||||||
params:
|
params:
|
||||||
cols_nan_threshold = config["DATA_CLEANING"]["COLS_NAN_THRESHOLD"],
|
provider = lambda wildcards: config["ALL_CLEANING_OVERALL"]["PROVIDERS"][wildcards.provider_key.upper()],
|
||||||
cols_var_threshold = config["DATA_CLEANING"]["COLS_VAR_THRESHOLD"],
|
provider_key = "{provider_key}",
|
||||||
rows_nan_threshold = config["DATA_CLEANING"]["ROWS_NAN_THRESHOLD"],
|
sensor_key = "all_cleaning_overall"
|
||||||
data_yielded_hours_ratio_threshold = config["DATA_CLEANING"]["DATA_YIELDED_HOURS_RATIO_THRESHOLD"],
|
|
||||||
corr_valid_pairs_threshold = config["DATA_CLEANING"]["CORR_VALID_PAIRS_THRESHOLD"],
|
|
||||||
corr_threshold = config["DATA_CLEANING"]["CORR_THRESHOLD"]
|
|
||||||
output:
|
output:
|
||||||
"data/processed/features/all_participants/all_sensor_features_cleaned.csv"
|
"data/processed/features/all_participants/all_sensor_features_cleaned_{provider_key}.csv"
|
||||||
script:
|
script:
|
||||||
"../src/features/utils/clean_sensor_features.R"
|
"../src/features/entry.R"
|
||||||
|
|
||||||
|
|
|
@ -0,0 +1,85 @@
|
||||||
|
source("renv/activate.R")
|
||||||
|
library(tidyr)
|
||||||
|
library("dplyr", warn.conflicts = F)
|
||||||
|
library(tidyverse)
|
||||||
|
library(caret)
|
||||||
|
library(corrr)
|
||||||
|
|
||||||
|
rapids_cleaning <- function(sensor_data_files, provider){
    # Clean the merged sensor features of one participant with the RAPIDS provider.
    #
    # Arguments:
    #   sensor_data_files: list whose "sensor_data" entry is the path of the CSV to clean.
    #   provider: provider configuration list; the keys read here are
    #       IMPUTE_SELECTED_EVENT_FEATURES (COMPUTE, MIN_DATA_YIELDED_MINUTES_TO_IMPUTE),
    #       COLS_NAN_THRESHOLD, COLS_VAR_THRESHOLD, ROWS_NAN_THRESHOLD,
    #       DATA_YIELDED_HOURS_RATIO_THRESHOLD and
    #       DROP_HIGHLY_CORRELATED_FEATURES (COMPUTE, MIN_OVERLAP_FOR_CORR_THRESHOLD, CORR_THRESHOLD).
    #
    # Returns: the cleaned features as a data frame.
    #
    # Stops with an error when a required phone_data_yield column is missing.
    # Steps run in this order: impute, filter rows by yielded hours, drop NA-heavy
    # columns, drop zero-variance columns, drop highly correlated columns,
    # drop NA-heavy rows.

    clean_features <- read.csv(sensor_data_files[["sensor_data"]], stringsAsFactors = FALSE)
    impute_selected_event_features <- provider[["IMPUTE_SELECTED_EVENT_FEATURES"]]
    cols_nan_threshold <- as.numeric(provider[["COLS_NAN_THRESHOLD"]])
    drop_zero_variance_columns <- as.logical(provider[["COLS_VAR_THRESHOLD"]])
    rows_nan_threshold <- as.numeric(provider[["ROWS_NAN_THRESHOLD"]])
    data_yielded_hours_ratio_threshold <- as.numeric(provider[["DATA_YIELDED_HOURS_RATIO_THRESHOLD"]])
    drop_highly_correlated_features <- provider[["DROP_HIGHLY_CORRELATED_FEATURES"]]

    # Impute selected event features
    if(as.logical(impute_selected_event_features$COMPUTE)){
        if(!"phone_data_yield_rapids_ratiovalidyieldedminutes" %in% colnames(clean_features)){
            stop("Error: RAPIDS provider needs to impute the selected event features based on phone_data_yield_rapids_ratiovalidyieldedminutes column, please set config[PHONE_DATA_YIELD][PROVIDERS][RAPIDS][COMPUTE] to True and include 'ratiovalidyieldedminutes' in [FEATURES].")
        }

        # Coerced like every other threshold so the comparison below is always numeric
        min_data_yielded_minutes_to_impute <- as.numeric(impute_selected_event_features$MIN_DATA_YIELDED_MINUTES_TO_IMPUTE)

        column_names <- colnames(clean_features)
        selected_apps_features <- column_names[grepl("^phone_applications_foreground_rapids_(countevent|countepisode|minduration|maxduration|meanduration|sumduration)", column_names)]
        selected_battery_features <- column_names[grepl("^phone_battery_rapids_", column_names)]
        selected_calls_features <- column_names[grepl("^phone_calls_rapids_.*_(count|distinctcontacts|sumduration|minduration|maxduration|meanduration|modeduration)", column_names)]
        selected_keyboard_features <- column_names[grepl("^phone_keyboard_rapids_(sessioncount|averagesessionlength|changeintextlengthlessthanminusone|changeintextlengthequaltominusone|changeintextlengthequaltoone|changeintextlengthmorethanone|maxtextlength|totalkeyboardtouches)", column_names)]
        selected_messages_features <- column_names[grepl("^phone_messages_rapids_.*_(count|distinctcontacts)", column_names)]
        selected_screen_features <- column_names[grepl("^phone_screen_rapids_(sumduration|maxduration|minduration|avgduration|countepisode)", column_names)]
        selected_wifi_features <- column_names[grepl("^phone_wifi_(connected|visible)_rapids_", column_names)]

        selected_columns <- c(selected_apps_features, selected_battery_features, selected_calls_features, selected_keyboard_features, selected_messages_features, selected_screen_features, selected_wifi_features)

        # Nothing to impute when no column matched; skip the zero-column subassignment
        if(length(selected_columns) > 0){
            # Missing values become 0 only in rows whose yielded-minutes ratio is above the minimum
            clean_features[selected_columns][is.na(clean_features[selected_columns]) & (clean_features$phone_data_yield_rapids_ratiovalidyieldedminutes > min_data_yielded_minutes_to_impute)] <- 0
        }
    }

    # Drop rows with the value of "phone_data_yield_rapids_ratiovalidyieldedhours" column less than data_yielded_hours_ratio_threshold
    if(!"phone_data_yield_rapids_ratiovalidyieldedhours" %in% colnames(clean_features)){
        stop("Error: RAPIDS provider needs to clean data based on phone_data_yield_rapids_ratiovalidyieldedhours column, please set config[PHONE_DATA_YIELD][PROVIDERS][RAPIDS][COMPUTE] to True and include 'ratiovalidyieldedhours' in [FEATURES].")
    }
    clean_features <- clean_features %>%
        filter(phone_data_yield_rapids_ratiovalidyieldedhours >= data_yielded_hours_ratio_threshold)

    # Drop columns with a percentage of NA values above cols_nan_threshold
    # (skipped for an empty table, where the NA ratio would be 0/0)
    if(nrow(clean_features)){
        clean_features <- clean_features %>% select_if(~ sum(is.na(.)) / length(.) <= cols_nan_threshold )
    }

    # Drop columns with zero variance; identifier and segment columns are always kept
    if(drop_zero_variance_columns){
        clean_features <- clean_features %>% select_if(grepl("pid|local_segment|local_segment_label|local_segment_start_datetime|local_segment_end_datetime",names(.)) | sapply(., n_distinct, na.rm = TRUE) > 1)
    }

    # Drop highly correlated features
    if(as.logical(drop_highly_correlated_features$COMPUTE)){

        min_overlap_for_corr_threshold <- as.numeric(drop_highly_correlated_features$MIN_OVERLAP_FOR_CORR_THRESHOLD)
        corr_threshold <- as.numeric(drop_highly_correlated_features$CORR_THRESHOLD)

        features_for_corr <- clean_features %>%
            select_if(is.numeric) %>%
            select_if(sapply(., n_distinct, na.rm = TRUE) > 1)

        # Scalar test, so use && rather than the vectorised &
        if((nrow(features_for_corr) != 0) && (ncol(features_for_corr) != 0)){

            # A pair of features is usable only if enough rows have both values present
            valid_pairs <- crossprod(!is.na(features_for_corr)) >= min_overlap_for_corr_threshold * nrow(features_for_corr)

            highly_correlated_features <- features_for_corr %>%
                correlate(use = "pairwise.complete.obs", method = "spearman") %>%
                column_to_rownames(., var = "term") %>%
                as.matrix() %>%
                # Invalid or undefined correlations are zeroed so they never exceed the cutoff
                replace(!valid_pairs | is.na(.), 0) %>%
                findCorrelation(., cutoff = corr_threshold, verbose = FALSE, names = TRUE)

            # drop = FALSE keeps a data frame even if a single column survives
            clean_features <- clean_features[, !names(clean_features) %in% highly_correlated_features, drop = FALSE]

        }
    }

    # Drop rows with a percentage of NA values above rows_nan_threshold
    clean_features <- clean_features %>%
        mutate(percentage_na = rowSums(is.na(.)) / ncol(.)) %>%
        filter(percentage_na <= rows_nan_threshold) %>%
        select(-percentage_na)

    return(clean_features)
}
|
||||||
|
|
|
@ -0,0 +1,85 @@
|
||||||
|
source("renv/activate.R")
|
||||||
|
library(tidyr)
|
||||||
|
library("dplyr", warn.conflicts = F)
|
||||||
|
library(tidyverse)
|
||||||
|
library(caret)
|
||||||
|
library(corrr)
|
||||||
|
|
||||||
|
rapids_cleaning <- function(sensor_data_files, provider){
    # Clean the merged sensor features of all participants with the RAPIDS provider.
    #
    # Arguments:
    #   sensor_data_files: list whose "sensor_data" entry is the path of the CSV to clean.
    #   provider: provider configuration list; the keys read here are
    #       IMPUTE_SELECTED_EVENT_FEATURES (COMPUTE, MIN_DATA_YIELDED_MINUTES_TO_IMPUTE),
    #       COLS_NAN_THRESHOLD, COLS_VAR_THRESHOLD, ROWS_NAN_THRESHOLD,
    #       DATA_YIELDED_HOURS_RATIO_THRESHOLD and
    #       DROP_HIGHLY_CORRELATED_FEATURES (COMPUTE, MIN_OVERLAP_FOR_CORR_THRESHOLD, CORR_THRESHOLD).
    #
    # Returns: the cleaned features as a data frame.
    #
    # Stops with an error when a required phone_data_yield column is missing.
    # Steps run in this order: impute, filter rows by yielded hours, drop NA-heavy
    # columns, drop zero-variance columns, drop highly correlated columns,
    # drop NA-heavy rows.

    clean_features <- read.csv(sensor_data_files[["sensor_data"]], stringsAsFactors = FALSE)
    impute_selected_event_features <- provider[["IMPUTE_SELECTED_EVENT_FEATURES"]]
    cols_nan_threshold <- as.numeric(provider[["COLS_NAN_THRESHOLD"]])
    drop_zero_variance_columns <- as.logical(provider[["COLS_VAR_THRESHOLD"]])
    rows_nan_threshold <- as.numeric(provider[["ROWS_NAN_THRESHOLD"]])
    data_yielded_hours_ratio_threshold <- as.numeric(provider[["DATA_YIELDED_HOURS_RATIO_THRESHOLD"]])
    drop_highly_correlated_features <- provider[["DROP_HIGHLY_CORRELATED_FEATURES"]]

    # Impute selected event features
    if(as.logical(impute_selected_event_features$COMPUTE)){
        if(!"phone_data_yield_rapids_ratiovalidyieldedminutes" %in% colnames(clean_features)){
            stop("Error: RAPIDS provider needs to impute the selected event features based on phone_data_yield_rapids_ratiovalidyieldedminutes column, please set config[PHONE_DATA_YIELD][PROVIDERS][RAPIDS][COMPUTE] to True and include 'ratiovalidyieldedminutes' in [FEATURES].")
        }

        # Coerced like every other threshold so the comparison below is always numeric
        min_data_yielded_minutes_to_impute <- as.numeric(impute_selected_event_features$MIN_DATA_YIELDED_MINUTES_TO_IMPUTE)

        column_names <- colnames(clean_features)
        selected_apps_features <- column_names[grepl("^phone_applications_foreground_rapids_(countevent|countepisode|minduration|maxduration|meanduration|sumduration)", column_names)]
        selected_battery_features <- column_names[grepl("^phone_battery_rapids_", column_names)]
        selected_calls_features <- column_names[grepl("^phone_calls_rapids_.*_(count|distinctcontacts|sumduration|minduration|maxduration|meanduration|modeduration)", column_names)]
        selected_keyboard_features <- column_names[grepl("^phone_keyboard_rapids_(sessioncount|averagesessionlength|changeintextlengthlessthanminusone|changeintextlengthequaltominusone|changeintextlengthequaltoone|changeintextlengthmorethanone|maxtextlength|totalkeyboardtouches)", column_names)]
        selected_messages_features <- column_names[grepl("^phone_messages_rapids_.*_(count|distinctcontacts)", column_names)]
        selected_screen_features <- column_names[grepl("^phone_screen_rapids_(sumduration|maxduration|minduration|avgduration|countepisode)", column_names)]
        selected_wifi_features <- column_names[grepl("^phone_wifi_(connected|visible)_rapids_", column_names)]

        selected_columns <- c(selected_apps_features, selected_battery_features, selected_calls_features, selected_keyboard_features, selected_messages_features, selected_screen_features, selected_wifi_features)

        # Nothing to impute when no column matched; skip the zero-column subassignment
        if(length(selected_columns) > 0){
            # Missing values become 0 only in rows whose yielded-minutes ratio is above the minimum
            clean_features[selected_columns][is.na(clean_features[selected_columns]) & (clean_features$phone_data_yield_rapids_ratiovalidyieldedminutes > min_data_yielded_minutes_to_impute)] <- 0
        }
    }

    # Drop rows with the value of "phone_data_yield_rapids_ratiovalidyieldedhours" column less than data_yielded_hours_ratio_threshold
    if(!"phone_data_yield_rapids_ratiovalidyieldedhours" %in% colnames(clean_features)){
        stop("Error: RAPIDS provider needs to clean data based on phone_data_yield_rapids_ratiovalidyieldedhours column, please set config[PHONE_DATA_YIELD][PROVIDERS][RAPIDS][COMPUTE] to True and include 'ratiovalidyieldedhours' in [FEATURES].")
    }
    clean_features <- clean_features %>%
        filter(phone_data_yield_rapids_ratiovalidyieldedhours >= data_yielded_hours_ratio_threshold)

    # Drop columns with a percentage of NA values above cols_nan_threshold
    # (skipped for an empty table, where the NA ratio would be 0/0)
    if(nrow(clean_features)){
        clean_features <- clean_features %>% select_if(~ sum(is.na(.)) / length(.) <= cols_nan_threshold )
    }

    # Drop columns with zero variance; identifier and segment columns are always kept
    if(drop_zero_variance_columns){
        clean_features <- clean_features %>% select_if(grepl("pid|local_segment|local_segment_label|local_segment_start_datetime|local_segment_end_datetime",names(.)) | sapply(., n_distinct, na.rm = TRUE) > 1)
    }

    # Drop highly correlated features
    if(as.logical(drop_highly_correlated_features$COMPUTE)){

        min_overlap_for_corr_threshold <- as.numeric(drop_highly_correlated_features$MIN_OVERLAP_FOR_CORR_THRESHOLD)
        corr_threshold <- as.numeric(drop_highly_correlated_features$CORR_THRESHOLD)

        features_for_corr <- clean_features %>%
            select_if(is.numeric) %>%
            select_if(sapply(., n_distinct, na.rm = TRUE) > 1)

        # Scalar test, so use && rather than the vectorised &
        if((nrow(features_for_corr) != 0) && (ncol(features_for_corr) != 0)){

            # A pair of features is usable only if enough rows have both values present
            valid_pairs <- crossprod(!is.na(features_for_corr)) >= min_overlap_for_corr_threshold * nrow(features_for_corr)

            highly_correlated_features <- features_for_corr %>%
                correlate(use = "pairwise.complete.obs", method = "spearman") %>%
                column_to_rownames(., var = "term") %>%
                as.matrix() %>%
                # Invalid or undefined correlations are zeroed so they never exceed the cutoff
                replace(!valid_pairs | is.na(.), 0) %>%
                findCorrelation(., cutoff = corr_threshold, verbose = FALSE, names = TRUE)

            # drop = FALSE keeps a data frame even if a single column survives
            clean_features <- clean_features[, !names(clean_features) %in% highly_correlated_features, drop = FALSE]

        }
    }

    # Drop rows with a percentage of NA values above rows_nan_threshold
    clean_features <- clean_features %>%
        mutate(percentage_na = rowSums(is.na(.)) / ncol(.)) %>%
        filter(percentage_na <= rows_nan_threshold) %>%
        select(-percentage_na)

    return(clean_features)
}
|
||||||
|
|
|
@ -4,13 +4,19 @@ library("dplyr",warn.conflicts = F)
|
||||||
library("tidyr")
|
library("tidyr")
|
||||||
|
|
||||||
sensor_data_files <- snakemake@input
|
sensor_data_files <- snakemake@input
|
||||||
sensor_data_files$time_segments_labels <- NULL
|
|
||||||
time_segments_file <- snakemake@input[["time_segments_labels"]]
|
|
||||||
|
|
||||||
provider <- snakemake@params["provider"][["provider"]]
|
provider <- snakemake@params["provider"][["provider"]]
|
||||||
provider_key <- snakemake@params["provider_key"]
|
provider_key <- snakemake@params["provider_key"]
|
||||||
sensor_key <- snakemake@params["sensor_key"]
|
sensor_key <- snakemake@params["sensor_key"]
|
||||||
|
|
||||||
sensor_features <- fetch_provider_features(provider, provider_key, sensor_key, sensor_data_files, time_segments_file)
|
if("time_segments_labels" %in% names(sensor_data_files)){
|
||||||
|
# Extract sensor features
|
||||||
|
sensor_data_files$time_segments_labels <- NULL
|
||||||
|
time_segments_file <- snakemake@input[["time_segments_labels"]]
|
||||||
|
sensor_features <- fetch_provider_features(provider, provider_key, sensor_key, sensor_data_files, time_segments_file)
|
||||||
|
}else{
|
||||||
|
# Data cleaning
|
||||||
|
sensor_features = run_provider_cleaning_script(provider, provider_key, sensor_key, sensor_data_files)
|
||||||
|
}
|
||||||
|
|
||||||
write.csv(sensor_features, snakemake@output[[1]], row.names = FALSE)
|
write.csv(sensor_features, snakemake@output[[1]], row.names = FALSE)
|
|
@ -1,14 +1,19 @@
|
||||||
import pandas as pd
|
import pandas as pd
|
||||||
from utils.utils import fetch_provider_features
|
from utils.utils import fetch_provider_features, run_provider_cleaning_script
|
||||||
|
|
||||||
sensor_data_files = dict(snakemake.input)
|
sensor_data_files = dict(snakemake.input)
|
||||||
del sensor_data_files["time_segments_labels"]
|
|
||||||
time_segments_file = snakemake.input["time_segments_labels"]
|
|
||||||
|
|
||||||
provider = snakemake.params["provider"]
|
provider = snakemake.params["provider"]
|
||||||
provider_key = snakemake.params["provider_key"]
|
provider_key = snakemake.params["provider_key"]
|
||||||
sensor_key = snakemake.params["sensor_key"]
|
sensor_key = snakemake.params["sensor_key"]
|
||||||
|
|
||||||
sensor_features = fetch_provider_features(provider, provider_key, sensor_key, sensor_data_files, time_segments_file)
|
if "time_segments_labels" in sensor_data_files.keys():
|
||||||
|
# Extract sensor features
|
||||||
|
del sensor_data_files["time_segments_labels"]
|
||||||
|
time_segments_file = snakemake.input["time_segments_labels"]
|
||||||
|
sensor_features = fetch_provider_features(provider, provider_key, sensor_key, sensor_data_files, time_segments_file)
|
||||||
|
else:
|
||||||
|
# Data cleaning
|
||||||
|
sensor_features = run_provider_cleaning_script(provider, provider_key, sensor_key, sensor_data_files)
|
||||||
|
|
||||||
sensor_features.to_csv(snakemake.output[0], index=False)
|
sensor_features.to_csv(snakemake.output[0], index=False)
|
|
@ -1,54 +0,0 @@
|
||||||
source("renv/activate.R")
|
|
||||||
library(tidyr)
|
|
||||||
library("dplyr", warn.conflicts = F)
|
|
||||||
library(tidyverse)
|
|
||||||
library(caret)
|
|
||||||
library(corrr)
|
|
||||||
|
|
||||||
|
|
||||||
clean_features <- read.csv(snakemake@input[[1]])
|
|
||||||
cols_nan_threshold <- as.numeric(snakemake@params[["cols_nan_threshold"]])
|
|
||||||
drop_zero_variance_columns <- as.logical(snakemake@params[["cols_var_threshold"]])
|
|
||||||
rows_nan_threshold <- as.numeric(snakemake@params[["rows_nan_threshold"]])
|
|
||||||
data_yielded_hours_ratio_threshold <- as.numeric(snakemake@params[["data_yielded_hours_ratio_threshold"]])
|
|
||||||
corr_valid_pairs_threshold <- as.numeric(snakemake@params[["corr_valid_pairs_threshold"]])
|
|
||||||
corr_threshold <- as.numeric(snakemake@params[["corr_threshold"]])
|
|
||||||
|
|
||||||
# drop rows with the value of "phone_data_yield_rapids_ratiovalidyieldedhours" column less or equal than data_yielded_hours_ratio_threshold
|
|
||||||
clean_features <- clean_features %>%
|
|
||||||
filter(phone_data_yield_rapids_ratiovalidyieldedhours > data_yielded_hours_ratio_threshold)
|
|
||||||
|
|
||||||
# drop columns with a percentage of NA values above cols_nan_threshold
|
|
||||||
if(nrow(clean_features))
|
|
||||||
clean_features <- clean_features %>% select_if(~ sum(is.na(.)) / length(.) <= cols_nan_threshold )
|
|
||||||
|
|
||||||
if(drop_zero_variance_columns)
|
|
||||||
clean_features <- clean_features %>% select_if(grepl("pid|local_segment|local_segment_label|local_segment_start_datetime|local_segment_end_datetime",names(.)) | sapply(., n_distinct, na.rm = T) > 1)
|
|
||||||
|
|
||||||
# drop highly correlated features
|
|
||||||
features_for_corr <- clean_features %>%
|
|
||||||
select_if(is.numeric) %>%
|
|
||||||
select_if(sapply(., n_distinct, na.rm = T) > 1)
|
|
||||||
|
|
||||||
valid_pairs <- crossprod(!is.na(features_for_corr)) >= corr_valid_pairs_threshold * nrow(features_for_corr)
|
|
||||||
|
|
||||||
if((dim(features_for_corr)[1] != 0) & (dim(features_for_corr)[2] != 0)){
|
|
||||||
|
|
||||||
highly_correlated_features <- features_for_corr %>%
|
|
||||||
correlate(use = "pairwise.complete.obs", method = "spearman") %>%
|
|
||||||
column_to_rownames(., var = "term") %>%
|
|
||||||
as.matrix() %>%
|
|
||||||
replace(!valid_pairs | is.na(.), 0) %>%
|
|
||||||
findCorrelation(., cutoff = corr_threshold, verbose = F, names = T)
|
|
||||||
|
|
||||||
clean_features <- clean_features[, !names(clean_features) %in% highly_correlated_features]
|
|
||||||
|
|
||||||
}
|
|
||||||
|
|
||||||
# drop rows with a percentage of NA values above rows_nan_threshold
|
|
||||||
clean_features <- clean_features %>%
|
|
||||||
mutate(percentage_na = rowSums(is.na(.)) / ncol(.)) %>%
|
|
||||||
filter(percentage_na <= rows_nan_threshold) %>%
|
|
||||||
select(-percentage_na)
|
|
||||||
|
|
||||||
write.csv(clean_features, snakemake@output[[1]], row.names = FALSE)
|
|
|
@ -84,3 +84,13 @@ fetch_provider_features <- function(provider, provider_key, sensor_key, sensor_d
|
||||||
remove = FALSE)
|
remove = FALSE)
|
||||||
return(sensor_features)
|
return(sensor_features)
|
||||||
}
|
}
|
||||||
|
|
||||||
|
run_provider_cleaning_script <- function(provider, provider_key, sensor_key, sensor_data_files){
    # Source a provider's cleaning script and run its "<provider_key>_cleaning" function.
    #
    # Arguments:
    #   provider: provider configuration list; provider[["SRC_SCRIPT"]] is the script that gets sourced.
    #   provider_key: provider name; lower-cased to build the cleaning function's name.
    #   sensor_key: only used in the log line.
    #   sensor_data_files: passed through unchanged to the cleaning function.
    #
    # Returns: whatever the provider's cleaning function returns.

    source(provider[["SRC_SCRIPT"]])
    print(paste(rapids_log_tag, "Processing", sensor_key, provider_key))

    # e.g. provider_key "RAPIDS" resolves to the function rapids_cleaning
    cleaner_name <- paste0(tolower(provider_key), "_cleaning")
    run_cleaning <- match.fun(cleaner_name)

    return(run_cleaning(sensor_data_files, provider))
}
|
||||||
|
|
|
@ -123,3 +123,13 @@ def fetch_provider_features(provider, provider_key, sensor_key, sensor_data_file
|
||||||
sensor_features.insert(1 + i, segment_colums.columns[i], segment_colums[segment_colums.columns[i]])
|
sensor_features.insert(1 + i, segment_colums.columns[i], segment_colums[segment_colums.columns[i]])
|
||||||
|
|
||||||
return sensor_features
|
return sensor_features
|
||||||
|
|
||||||
|
def run_provider_cleaning_script(provider, provider_key, sensor_key, sensor_data_files):
    """Load a provider's cleaning script and run its ``<provider_key>_cleaning`` function.

    Args:
        provider: Provider configuration mapping; ``provider["SRC_SCRIPT"]``
            is the path handed to ``import_path``.
        provider_key: Provider name; lower-cased to build the function name.
        sensor_key: Only used in the log line.
        sensor_data_files: Passed through unchanged to the cleaning function.

    Returns:
        Whatever the provider's cleaning function returns.

    Raises:
        AttributeError: If the loaded module has no ``<provider_key>_cleaning``
            attribute.
    """
    # NOTE(review): neither name is referenced in this function. The import is
    # kept because import_path may rely on importlib.util being loaded -
    # confirm before removing.
    from importlib import import_module, util

    print("{} Processing {} {}".format(rapids_log_tag, sensor_key, provider_key))

    module = import_path(provider["SRC_SCRIPT"])
    # e.g. provider_key "RAPIDS" resolves to module.rapids_cleaning
    function_name = provider_key.lower() + "_cleaning"
    return getattr(module, function_name)(sensor_data_files, provider)
|
||||||
|
|
|
@ -36,17 +36,16 @@ required:
|
||||||
- HEATMAP_SENSORS_PER_MINUTE_PER_TIME_SEGMENT
|
- HEATMAP_SENSORS_PER_MINUTE_PER_TIME_SEGMENT
|
||||||
- HEATMAP_SENSOR_ROW_COUNT_PER_TIME_SEGMENT
|
- HEATMAP_SENSOR_ROW_COUNT_PER_TIME_SEGMENT
|
||||||
- HEATMAP_FEATURE_CORRELATION_MATRIX
|
- HEATMAP_FEATURE_CORRELATION_MATRIX
|
||||||
- DATA_CLEANING
|
- ALL_CLEANING_INDIVIDUAL
|
||||||
|
- ALL_CLEANING_OVERALL
|
||||||
|
|
||||||
definitions:
|
definitions:
|
||||||
PROVIDER:
|
PROVIDER:
|
||||||
type: object
|
type: object
|
||||||
required: [COMPUTE, SRC_SCRIPT, FEATURES]
|
required: [COMPUTE, SRC_SCRIPT]
|
||||||
properties:
|
properties:
|
||||||
COMPUTE:
|
COMPUTE:
|
||||||
type: boolean
|
type: boolean
|
||||||
FEATURES:
|
|
||||||
type: [array, object]
|
|
||||||
SRC_SCRIPT:
|
SRC_SCRIPT:
|
||||||
type: string
|
type: string
|
||||||
pattern: "^.*\\.(py|R)$"
|
pattern: "^.*\\.(py|R)$"
|
||||||
|
@ -1258,31 +1257,102 @@ properties:
|
||||||
type: string
|
type: string
|
||||||
enum: ["pearson", "kendall", "spearman"]
|
enum: ["pearson", "kendall", "spearman"]
|
||||||
|
|
||||||
DATA_CLEANING:
|
ALL_CLEANING_INDIVIDUAL:
|
||||||
type: object
|
type: object
|
||||||
required: [COMPUTE, COLS_NAN_THRESHOLD, COLS_VAR_THRESHOLD, ROWS_NAN_THRESHOLD, DATA_YIELDED_HOURS_RATIO_THRESHOLD, CORR_VALID_PAIRS_THRESHOLD, CORR_THRESHOLD]
|
required: [PROVIDERS]
|
||||||
properties:
|
properties:
|
||||||
COMPUTE:
|
PROVIDERS:
|
||||||
type: boolean
|
type: ["null", object]
|
||||||
COLS_NAN_THRESHOLD:
|
properties:
|
||||||
type: number
|
RAPIDS:
|
||||||
minimum: 0
|
allOf:
|
||||||
maximum: 1
|
- $ref: "#/definitions/PROVIDER"
|
||||||
COLS_VAR_THRESHOLD:
|
- properties:
|
||||||
type: boolean
|
IMPUTE_SELECTED_EVENT_FEATURES:
|
||||||
ROWS_NAN_THRESHOLD:
|
type: object
|
||||||
type: number
|
required: [COMPUTE, MIN_DATA_YIELDED_MINUTES_TO_IMPUTE]
|
||||||
minimum: 0
|
properties:
|
||||||
maximum: 1
|
COMPUTE:
|
||||||
DATA_YIELDED_HOURS_RATIO_THRESHOLD:
|
type: boolean
|
||||||
type: number
|
MIN_DATA_YIELDED_MINUTES_TO_IMPUTE:
|
||||||
minimum: 0
|
type: number
|
||||||
maximum: 1
|
minimum: 0
|
||||||
CORR_VALID_PAIRS_THRESHOLD:
|
maximum: 1
|
||||||
type: number
|
COLS_NAN_THRESHOLD:
|
||||||
minimum: 0
|
type: number
|
||||||
maximum: 1
|
minimum: 0
|
||||||
CORR_THRESHOLD:
|
maximum: 1
|
||||||
type: number
|
COLS_VAR_THRESHOLD:
|
||||||
minimum: 0
|
type: boolean
|
||||||
maximum: 1
|
ROWS_NAN_THRESHOLD:
|
||||||
|
type: number
|
||||||
|
minimum: 0
|
||||||
|
maximum: 1
|
||||||
|
DATA_YIELDED_HOURS_RATIO_THRESHOLD:
|
||||||
|
type: number
|
||||||
|
minimum: 0
|
||||||
|
maximum: 1
|
||||||
|
DROP_HIGHLY_CORRELATED_FEATURES:
|
||||||
|
type: object
|
||||||
|
required: [COMPUTE, MIN_OVERLAP_FOR_CORR_THRESHOLD, CORR_THRESHOLD]
|
||||||
|
properties:
|
||||||
|
COMPUTE:
|
||||||
|
type: boolean
|
||||||
|
MIN_OVERLAP_FOR_CORR_THRESHOLD:
|
||||||
|
type: number
|
||||||
|
minimum: 0
|
||||||
|
maximum: 1
|
||||||
|
CORR_THRESHOLD:
|
||||||
|
type: number
|
||||||
|
minimum: 0
|
||||||
|
maximum: 1
|
||||||
|
|
||||||
|
ALL_CLEANING_OVERALL:
|
||||||
|
type: object
|
||||||
|
required: [PROVIDERS]
|
||||||
|
properties:
|
||||||
|
PROVIDERS:
|
||||||
|
type: ["null", object]
|
||||||
|
properties:
|
||||||
|
RAPIDS:
|
||||||
|
allOf:
|
||||||
|
- $ref: "#/definitions/PROVIDER"
|
||||||
|
- properties:
|
||||||
|
IMPUTE_SELECTED_EVENT_FEATURES:
|
||||||
|
type: object
|
||||||
|
required: [COMPUTE, MIN_DATA_YIELDED_MINUTES_TO_IMPUTE]
|
||||||
|
properties:
|
||||||
|
COMPUTE:
|
||||||
|
type: boolean
|
||||||
|
MIN_DATA_YIELDED_MINUTES_TO_IMPUTE:
|
||||||
|
type: number
|
||||||
|
minimum: 0
|
||||||
|
maximum: 1
|
||||||
|
COLS_NAN_THRESHOLD:
|
||||||
|
type: number
|
||||||
|
minimum: 0
|
||||||
|
maximum: 1
|
||||||
|
COLS_VAR_THRESHOLD:
|
||||||
|
type: boolean
|
||||||
|
ROWS_NAN_THRESHOLD:
|
||||||
|
type: number
|
||||||
|
minimum: 0
|
||||||
|
maximum: 1
|
||||||
|
DATA_YIELDED_HOURS_RATIO_THRESHOLD:
|
||||||
|
type: number
|
||||||
|
minimum: 0
|
||||||
|
maximum: 1
|
||||||
|
DROP_HIGHLY_CORRELATED_FEATURES:
|
||||||
|
type: object
|
||||||
|
required: [COMPUTE, MIN_OVERLAP_FOR_CORR_THRESHOLD, CORR_THRESHOLD]
|
||||||
|
properties:
|
||||||
|
COMPUTE:
|
||||||
|
type: boolean
|
||||||
|
MIN_OVERLAP_FOR_CORR_THRESHOLD:
|
||||||
|
type: number
|
||||||
|
minimum: 0
|
||||||
|
maximum: 1
|
||||||
|
CORR_THRESHOLD:
|
||||||
|
type: number
|
||||||
|
minimum: 0
|
||||||
|
maximum: 1
|
||||||
|
|
Loading…
Reference in New Issue