Move the data cleaning module from example workflow to main directory

data_cleaning
Meng Li 2021-10-03 11:16:47 -04:00
parent 8e3d5eb98c
commit 4a7989c058
8 changed files with 118 additions and 62 deletions

View File

@ -394,6 +394,10 @@ if config["HEATMAP_PHONE_DATA_YIELD_PER_PARTICIPANT_PER_TIME_SEGMENT"]["PLOT"]:
if config["HEATMAP_FEATURE_CORRELATION_MATRIX"]["PLOT"]: if config["HEATMAP_FEATURE_CORRELATION_MATRIX"]["PLOT"]:
files_to_compute.append("reports/data_exploration/heatmap_feature_correlation_matrix.html") files_to_compute.append("reports/data_exploration/heatmap_feature_correlation_matrix.html")
# Data Cleaning
# Request the cleaned feature tables only when the DATA_CLEANING module is enabled.
# NOTE(review): indentation is not visible in this view; both targets are assumed
# to be guarded by the COMPUTE flag - confirm against the real Snakefile.
if config["DATA_CLEANING"]["COMPUTE"]:
    # one cleaned feature file per participant id listed in PIDS
    files_to_compute.extend(expand("data/processed/features/{pid}/all_sensor_features_cleaned.csv", pid=config["PIDS"]))
    # a single cleaned feature file covering all participants
    files_to_compute.append("data/processed/features/all_participants/all_sensor_features_cleaned.csv")
rule all: rule all:
input: input:

View File

@ -564,3 +564,17 @@ HEATMAP_FEATURE_CORRELATION_MATRIX:
CORR_THRESHOLD: 0.1 CORR_THRESHOLD: 0.1
CORR_METHOD: "pearson" # choose from {"pearson", "kendall", "spearman"} CORR_METHOD: "pearson" # choose from {"pearson", "kendall", "spearman"}
########################################################################################################################
# Data Cleaning #
########################################################################################################################
DATA_CLEANING:
  # When true, the Snakefile adds the *_cleaned.csv feature files to its targets.
  COMPUTE: False
  # Number in [0, 1]. Passed to clean_sensor_features.R as cols_nan_threshold.
  # Presumably the NA ratio at which a column is dropped - confirm in the script.
  COLS_NAN_THRESHOLD: 0.3
  # Boolean. Passed to the script as cols_var_threshold.
  # Presumably toggles dropping of zero-variance columns - confirm in the script.
  COLS_VAR_THRESHOLD: True
  # Number in [0, 1]. The script keeps only rows whose NA ratio is below this value.
  ROWS_NAN_THRESHOLD: 0.3
  # Number in [0, 1]. Passed to the script as data_yielded_hours_ratio_threshold;
  # its exact use is not visible here - TODO confirm.
  DATA_YIELDED_HOURS_RATIO_THRESHOLD: 0.75
  # Numbers in [0, 1] used by the script's "drop highly correlated features" step;
  # the exact rule applied to each is not visible here - TODO confirm.
  CORR_VALID_PAIRS_THRESHOLD: 0.5
  CORR_THRESHOLD: 0.95

View File

@ -384,6 +384,11 @@ if config["HEATMAP_PHONE_DATA_YIELD_PER_PARTICIPANT_PER_TIME_SEGMENT"]["PLOT"]:
if config["HEATMAP_FEATURE_CORRELATION_MATRIX"]["PLOT"]: if config["HEATMAP_FEATURE_CORRELATION_MATRIX"]["PLOT"]:
files_to_compute.append("reports/data_exploration/heatmap_feature_correlation_matrix.html") files_to_compute.append("reports/data_exploration/heatmap_feature_correlation_matrix.html")
# Data Cleaning
# Request the cleaned feature tables only when the DATA_CLEANING module is enabled.
# NOTE(review): indentation is not visible in this view; both targets are assumed
# to be guarded by the COMPUTE flag - confirm against the real Snakefile.
if config["DATA_CLEANING"]["COMPUTE"]:
    # one cleaned feature file per participant id listed in PIDS
    files_to_compute.extend(expand("data/processed/features/{pid}/all_sensor_features_cleaned.csv", pid=config["PIDS"]))
    # a single cleaned feature file covering all participants
    files_to_compute.append("data/processed/features/all_participants/all_sensor_features_cleaned.csv")
# Analysis Workflow Example # Analysis Workflow Example
models, scalers = [], [] models, scalers = [], []
for model_name in config["PARAMS_FOR_ANALYSIS"]["MODEL_NAMES"]: for model_name in config["PARAMS_FOR_ANALYSIS"]["MODEL_NAMES"]:
@ -401,7 +406,6 @@ files_to_compute.extend(expand("data/raw/{pid}/participant_target_with_datetime.
files_to_compute.extend(expand("data/processed/targets/{pid}/parsed_targets.csv", pid=config["PIDS"])) files_to_compute.extend(expand("data/processed/targets/{pid}/parsed_targets.csv", pid=config["PIDS"]))
# Individual model # Individual model
files_to_compute.extend(expand("data/processed/features/{pid}/all_sensor_features_cleaned.csv", pid=config["PIDS"]))
files_to_compute.extend(expand("data/processed/models/individual_model/{pid}/input.csv", pid=config["PIDS"])) files_to_compute.extend(expand("data/processed/models/individual_model/{pid}/input.csv", pid=config["PIDS"]))
files_to_compute.extend(expand("data/processed/models/individual_model/{pid}/output_{cv_method}/baselines.csv", pid=config["PIDS"], cv_method=config["PARAMS_FOR_ANALYSIS"]["CV_METHODS"])) files_to_compute.extend(expand("data/processed/models/individual_model/{pid}/output_{cv_method}/baselines.csv", pid=config["PIDS"], cv_method=config["PARAMS_FOR_ANALYSIS"]["CV_METHODS"]))
files_to_compute.extend(expand( files_to_compute.extend(expand(
@ -414,7 +418,6 @@ files_to_compute.extend(expand(
scaler=scalers)) scaler=scalers))
# Population model # Population model
files_to_compute.append("data/processed/features/all_participants/all_sensor_features_cleaned.csv")
files_to_compute.append("data/processed/models/population_model/input.csv") files_to_compute.append("data/processed/models/population_model/input.csv")
files_to_compute.extend(expand("data/processed/models/population_model/output_{cv_method}/baselines.csv", cv_method=config["PARAMS_FOR_ANALYSIS"]["CV_METHODS"])) files_to_compute.extend(expand("data/processed/models/population_model/output_{cv_method}/baselines.csv", cv_method=config["PARAMS_FOR_ANALYSIS"]["CV_METHODS"]))
files_to_compute.extend(expand( files_to_compute.extend(expand(

View File

@ -534,6 +534,19 @@ HEATMAP_FEATURE_CORRELATION_MATRIX:
CORR_METHOD: "pearson" # choose from {"pearson", "kendall", "spearman"} CORR_METHOD: "pearson" # choose from {"pearson", "kendall", "spearman"}
########################################################################################################################
# Data Cleaning #
########################################################################################################################
DATA_CLEANING:
  # When true, the Snakefile adds the *_cleaned.csv feature files to its targets.
  COMPUTE: True
  # Number in [0, 1]. Passed to clean_sensor_features.R as cols_nan_threshold.
  # Presumably the NA ratio at which a column is dropped - confirm in the script.
  COLS_NAN_THRESHOLD: 0.3
  # Boolean. Passed to the script as cols_var_threshold.
  # Presumably toggles dropping of zero-variance columns - confirm in the script.
  COLS_VAR_THRESHOLD: True
  # Number in [0, 1]. The script keeps only rows whose NA ratio is below this value.
  ROWS_NAN_THRESHOLD: 0.3
  # Number in [0, 1]. Passed to the script as data_yielded_hours_ratio_threshold;
  # its exact use is not visible here - TODO confirm.
  DATA_YIELDED_HOURS_RATIO_THRESHOLD: 0.75
  # Numbers in [0, 1] used by the script's "drop highly correlated features" step;
  # the exact rule applied to each is not visible here - TODO confirm.
  CORR_VALID_PAIRS_THRESHOLD: 0.5
  CORR_THRESHOLD: 0.95
######################################################################################################################## ########################################################################################################################
# Analysis Workflow Example # # Analysis Workflow Example #
@ -552,14 +565,6 @@ PARAMS_FOR_ANALYSIS:
FOLDER: data/external/example_workflow FOLDER: data/external/example_workflow
CONTAINER: participant_target.csv CONTAINER: participant_target.csv
# Cleaning Parameters
COLS_NAN_THRESHOLD: 0.3
COLS_VAR_THRESHOLD: True
ROWS_NAN_THRESHOLD: 0.3
DATA_YIELDED_HOURS_RATIO_THRESHOLD: 0.75
CORR_VALID_PAIRS_THRESHOLD: 0.5
CORR_THRESHOLD: 0.95
MODEL_NAMES: [LogReg, kNN , SVM, DT, RF, GB, XGBoost, LightGBM] MODEL_NAMES: [LogReg, kNN , SVM, DT, RF, GB, XGBoost, LightGBM]
CV_METHODS: [LeaveOneOut] CV_METHODS: [LeaveOneOut]
RESULT_COMPONENTS: [fold_predictions, fold_metrics, overall_results, fold_feature_importances] RESULT_COMPONENTS: [fold_predictions, fold_metrics, overall_results, fold_feature_importances]

View File

@ -761,22 +761,6 @@ rule fitbit_sleep_intraday_r_features:
script: script:
"../src/features/entry.R" "../src/features/entry.R"
rule merge_sensor_features_for_individual_participants:
input:
feature_files = input_merge_sensor_features_for_individual_participants
output:
"data/processed/features/{pid}/all_sensor_features.csv"
script:
"../src/features/utils/merge_sensor_features_for_individual_participants.R"
rule merge_sensor_features_for_all_participants:
input:
feature_files = expand("data/processed/features/{pid}/all_sensor_features.csv", pid=config["PIDS"])
output:
"data/processed/features/all_participants/all_sensor_features.csv"
script:
"../src/features/utils/merge_sensor_features_for_all_participants.R"
rule empatica_accelerometer_python_features: rule empatica_accelerometer_python_features:
input: input:
sensor_data = "data/raw/{pid}/empatica_accelerometer_with_datetime.csv", sensor_data = "data/raw/{pid}/empatica_accelerometer_with_datetime.csv",
@ -958,3 +942,49 @@ rule empatica_tags_r_features:
"data/interim/{pid}/empatica_tags_features/empatica_tags_r_{provider_key}.csv" "data/interim/{pid}/empatica_tags_features/empatica_tags_r_{provider_key}.csv"
script: script:
"../src/features/entry.R" "../src/features/entry.R"
# Produces one all_sensor_features.csv per participant ({pid} wildcard).
# The list of feature files to merge is resolved by the input function
# input_merge_sensor_features_for_individual_participants (defined elsewhere);
# the merge itself is done by the R script.
rule merge_sensor_features_for_individual_participants:
    input:
        feature_files = input_merge_sensor_features_for_individual_participants
    output:
        "data/processed/features/{pid}/all_sensor_features.csv"
    script:
        "../src/features/utils/merge_sensor_features_for_individual_participants.R"
# Produces a single all_sensor_features.csv for the whole study.
# Its inputs are the per-participant all_sensor_features.csv files of every
# participant id in config["PIDS"]; the merge itself is done by the R script.
rule merge_sensor_features_for_all_participants:
    input:
        feature_files = expand("data/processed/features/{pid}/all_sensor_features.csv", pid=config["PIDS"])
    output:
        "data/processed/features/all_participants/all_sensor_features.csv"
    script:
        "../src/features/utils/merge_sensor_features_for_all_participants.R"
# Cleans one participant's merged feature table ({pid} wildcard).
# Input is the output of merge_sensor_features_for_individual_participants.
# Every threshold comes from the top-level DATA_CLEANING config section and is
# handed to clean_sensor_features.R as a named param; the filtering logic lives
# in that script.
rule clean_sensor_features_for_individual_participants:
    input:
        rules.merge_sensor_features_for_individual_participants.output
    output:
        "data/processed/features/{pid}/all_sensor_features_cleaned.csv"
    params:
        cols_nan_threshold = config["DATA_CLEANING"]["COLS_NAN_THRESHOLD"],
        cols_var_threshold = config["DATA_CLEANING"]["COLS_VAR_THRESHOLD"],
        rows_nan_threshold = config["DATA_CLEANING"]["ROWS_NAN_THRESHOLD"],
        data_yielded_hours_ratio_threshold = config["DATA_CLEANING"]["DATA_YIELDED_HOURS_RATIO_THRESHOLD"],
        corr_valid_pairs_threshold = config["DATA_CLEANING"]["CORR_VALID_PAIRS_THRESHOLD"],
        corr_threshold = config["DATA_CLEANING"]["CORR_THRESHOLD"]
    script:
        "../src/features/utils/clean_sensor_features.R"
# Cleans the merged feature table that covers all participants.
# Input is the output of merge_sensor_features_for_all_participants.
# Uses the same script and the same DATA_CLEANING thresholds as the
# per-participant cleaning rule; only the input and output paths differ.
rule clean_sensor_features_for_all_participants:
    input:
        rules.merge_sensor_features_for_all_participants.output
    output:
        "data/processed/features/all_participants/all_sensor_features_cleaned.csv"
    params:
        cols_nan_threshold = config["DATA_CLEANING"]["COLS_NAN_THRESHOLD"],
        cols_var_threshold = config["DATA_CLEANING"]["COLS_VAR_THRESHOLD"],
        rows_nan_threshold = config["DATA_CLEANING"]["ROWS_NAN_THRESHOLD"],
        data_yielded_hours_ratio_threshold = config["DATA_CLEANING"]["DATA_YIELDED_HOURS_RATIO_THRESHOLD"],
        corr_valid_pairs_threshold = config["DATA_CLEANING"]["CORR_VALID_PAIRS_THRESHOLD"],
        corr_threshold = config["DATA_CLEANING"]["CORR_THRESHOLD"]
    script:
        "../src/features/utils/clean_sensor_features.R"

View File

@ -53,36 +53,6 @@ rule parse_targets:
script: script:
"../src/models/workflow_example/parse_targets.py" "../src/models/workflow_example/parse_targets.py"
rule clean_sensor_features_for_individual_participants:
input:
rules.merge_sensor_features_for_individual_participants.output
params:
cols_nan_threshold = config["PARAMS_FOR_ANALYSIS"]["COLS_NAN_THRESHOLD"],
cols_var_threshold = config["PARAMS_FOR_ANALYSIS"]["COLS_VAR_THRESHOLD"],
rows_nan_threshold = config["PARAMS_FOR_ANALYSIS"]["ROWS_NAN_THRESHOLD"],
data_yielded_hours_ratio_threshold = config["PARAMS_FOR_ANALYSIS"]["DATA_YIELDED_HOURS_RATIO_THRESHOLD"],
corr_valid_pairs_threshold = config["PARAMS_FOR_ANALYSIS"]["CORR_VALID_PAIRS_THRESHOLD"],
corr_threshold = config["PARAMS_FOR_ANALYSIS"]["CORR_THRESHOLD"]
output:
"data/processed/features/{pid}/all_sensor_features_cleaned.csv"
script:
"../src/models/workflow_example/clean_sensor_features.R"
rule clean_sensor_features_for_all_participants:
input:
rules.merge_sensor_features_for_all_participants.output
params:
cols_nan_threshold = config["PARAMS_FOR_ANALYSIS"]["COLS_NAN_THRESHOLD"],
cols_var_threshold = config["PARAMS_FOR_ANALYSIS"]["COLS_VAR_THRESHOLD"],
rows_nan_threshold = config["PARAMS_FOR_ANALYSIS"]["ROWS_NAN_THRESHOLD"],
data_yielded_hours_ratio_threshold = config["PARAMS_FOR_ANALYSIS"]["DATA_YIELDED_HOURS_RATIO_THRESHOLD"],
corr_valid_pairs_threshold = config["PARAMS_FOR_ANALYSIS"]["CORR_VALID_PAIRS_THRESHOLD"],
corr_threshold = config["PARAMS_FOR_ANALYSIS"]["CORR_THRESHOLD"]
output:
"data/processed/features/all_participants/all_sensor_features_cleaned.csv"
script:
"../src/models/workflow_example/clean_sensor_features.R"
rule merge_features_and_targets_for_individual_model: rule merge_features_and_targets_for_individual_model:
input: input:
cleaned_sensor_features = "data/processed/features/{pid}/all_sensor_features_cleaned.csv", cleaned_sensor_features = "data/processed/features/{pid}/all_sensor_features_cleaned.csv",

View File

@ -25,12 +25,6 @@ if(nrow(clean_features))
if(drop_zero_variance_columns) if(drop_zero_variance_columns)
clean_features <- clean_features %>% select_if(grepl("pid|local_segment|local_segment_label|local_segment_start_datetime|local_segment_end_datetime",names(.)) | sapply(., n_distinct, na.rm = T) > 1) clean_features <- clean_features %>% select_if(grepl("pid|local_segment|local_segment_label|local_segment_start_datetime|local_segment_end_datetime",names(.)) | sapply(., n_distinct, na.rm = T) > 1)
# drop rows with a percentage of NA values above rows_nan_threshold
clean_features <- clean_features %>%
mutate(percentage_na = rowSums(is.na(.)) / ncol(.)) %>%
filter(percentage_na < rows_nan_threshold) %>%
select(-percentage_na)
# drop highly correlated features # drop highly correlated features
features_for_corr <- clean_features %>% features_for_corr <- clean_features %>%
select_if(is.numeric) %>% select_if(is.numeric) %>%
@ -47,4 +41,10 @@ highly_correlated_features <- features_for_corr %>%
clean_features <- clean_features[, !names(clean_features) %in% highly_correlated_features] clean_features <- clean_features[, !names(clean_features) %in% highly_correlated_features]
# Keep only rows whose ratio of NA values is strictly below rows_nan_threshold;
# rows at or above the threshold are dropped.
# The ratio is NA cells divided by the number of columns currently in
# clean_features (all columns, not only feature columns). This step runs after
# the highly correlated features were removed above, so those columns no longer
# count towards a row's NA ratio.
# percentage_na is a temporary helper column (a 0-1 ratio) and is removed again.
clean_features <- clean_features %>%
mutate(percentage_na = rowSums(is.na(.)) / ncol(.)) %>%
filter(percentage_na < rows_nan_threshold) %>%
select(-percentage_na)
write.csv(clean_features, snakemake@output[[1]], row.names = FALSE) write.csv(clean_features, snakemake@output[[1]], row.names = FALSE)

View File

@ -36,6 +36,7 @@ required:
- HEATMAP_SENSORS_PER_MINUTE_PER_TIME_SEGMENT - HEATMAP_SENSORS_PER_MINUTE_PER_TIME_SEGMENT
- HEATMAP_SENSOR_ROW_COUNT_PER_TIME_SEGMENT - HEATMAP_SENSOR_ROW_COUNT_PER_TIME_SEGMENT
- HEATMAP_FEATURE_CORRELATION_MATRIX - HEATMAP_FEATURE_CORRELATION_MATRIX
- DATA_CLEANING
definitions: definitions:
PROVIDER: PROVIDER:
@ -1256,3 +1257,32 @@ properties:
CORR_METHOD: CORR_METHOD:
type: string type: string
enum: ["pearson", "kendall", "spearman"] enum: ["pearson", "kendall", "spearman"]
# Schema of the DATA_CLEANING config section.
# All seven keys are mandatory. COMPUTE and COLS_VAR_THRESHOLD are booleans;
# every other key is a number restricted to the closed range [0, 1].
DATA_CLEANING:
  type: object
  required:
    - COMPUTE
    - COLS_NAN_THRESHOLD
    - COLS_VAR_THRESHOLD
    - ROWS_NAN_THRESHOLD
    - DATA_YIELDED_HOURS_RATIO_THRESHOLD
    - CORR_VALID_PAIRS_THRESHOLD
    - CORR_THRESHOLD
  properties:
    COMPUTE:
      type: boolean
    COLS_NAN_THRESHOLD:
      type: number
      minimum: 0
      maximum: 1
    COLS_VAR_THRESHOLD:
      type: boolean
    ROWS_NAN_THRESHOLD:
      type: number
      minimum: 0
      maximum: 1
    DATA_YIELDED_HOURS_RATIO_THRESHOLD:
      type: number
      minimum: 0
      maximum: 1
    CORR_VALID_PAIRS_THRESHOLD:
      type: number
      minimum: 0
      maximum: 1
    CORR_THRESHOLD:
      type: number
      minimum: 0
      maximum: 1