Move the data cleaning module from example workflow to main directory
parent
8e3d5eb98c
commit
4a7989c058
|
@ -394,6 +394,10 @@ if config["HEATMAP_PHONE_DATA_YIELD_PER_PARTICIPANT_PER_TIME_SEGMENT"]["PLOT"]:
|
|||
if config["HEATMAP_FEATURE_CORRELATION_MATRIX"]["PLOT"]:
|
||||
files_to_compute.append("reports/data_exploration/heatmap_feature_correlation_matrix.html")
|
||||
|
||||
# Data Cleaning
|
||||
if config["DATA_CLEANING"]["COMPUTE"]:
|
||||
files_to_compute.extend(expand("data/processed/features/{pid}/all_sensor_features_cleaned.csv", pid=config["PIDS"]))
|
||||
files_to_compute.append("data/processed/features/all_participants/all_sensor_features_cleaned.csv")
|
||||
|
||||
rule all:
|
||||
input:
|
||||
|
|
14
config.yaml
14
config.yaml
|
@ -564,3 +564,17 @@ HEATMAP_FEATURE_CORRELATION_MATRIX:
|
|||
CORR_THRESHOLD: 0.1
|
||||
CORR_METHOD: "pearson" # choose from {"pearson", "kendall", "spearman"}
|
||||
|
||||
|
||||
########################################################################################################################
|
||||
# Data Cleaning #
|
||||
########################################################################################################################
|
||||
|
||||
DATA_CLEANING:
|
||||
COMPUTE: False
|
||||
COLS_NAN_THRESHOLD: 0.3
|
||||
COLS_VAR_THRESHOLD: True
|
||||
ROWS_NAN_THRESHOLD: 0.3
|
||||
DATA_YIELDED_HOURS_RATIO_THRESHOLD: 0.75
|
||||
CORR_VALID_PAIRS_THRESHOLD: 0.5
|
||||
CORR_THRESHOLD: 0.95
|
||||
|
||||
|
|
|
@ -384,6 +384,11 @@ if config["HEATMAP_PHONE_DATA_YIELD_PER_PARTICIPANT_PER_TIME_SEGMENT"]["PLOT"]:
|
|||
if config["HEATMAP_FEATURE_CORRELATION_MATRIX"]["PLOT"]:
|
||||
files_to_compute.append("reports/data_exploration/heatmap_feature_correlation_matrix.html")
|
||||
|
||||
# Data Cleaning
|
||||
if config["DATA_CLEANING"]["COMPUTE"]:
|
||||
files_to_compute.extend(expand("data/processed/features/{pid}/all_sensor_features_cleaned.csv", pid=config["PIDS"]))
|
||||
files_to_compute.append("data/processed/features/all_participants/all_sensor_features_cleaned.csv")
|
||||
|
||||
# Analysis Workflow Example
|
||||
models, scalers = [], []
|
||||
for model_name in config["PARAMS_FOR_ANALYSIS"]["MODEL_NAMES"]:
|
||||
|
@ -401,7 +406,6 @@ files_to_compute.extend(expand("data/raw/{pid}/participant_target_with_datetime.
|
|||
files_to_compute.extend(expand("data/processed/targets/{pid}/parsed_targets.csv", pid=config["PIDS"]))
|
||||
|
||||
# Individual model
|
||||
files_to_compute.extend(expand("data/processed/features/{pid}/all_sensor_features_cleaned.csv", pid=config["PIDS"]))
|
||||
files_to_compute.extend(expand("data/processed/models/individual_model/{pid}/input.csv", pid=config["PIDS"]))
|
||||
files_to_compute.extend(expand("data/processed/models/individual_model/{pid}/output_{cv_method}/baselines.csv", pid=config["PIDS"], cv_method=config["PARAMS_FOR_ANALYSIS"]["CV_METHODS"]))
|
||||
files_to_compute.extend(expand(
|
||||
|
@ -414,7 +418,6 @@ files_to_compute.extend(expand(
|
|||
scaler=scalers))
|
||||
|
||||
# Population model
|
||||
files_to_compute.append("data/processed/features/all_participants/all_sensor_features_cleaned.csv")
|
||||
files_to_compute.append("data/processed/models/population_model/input.csv")
|
||||
files_to_compute.extend(expand("data/processed/models/population_model/output_{cv_method}/baselines.csv", cv_method=config["PARAMS_FOR_ANALYSIS"]["CV_METHODS"]))
|
||||
files_to_compute.extend(expand(
|
||||
|
|
|
@ -534,6 +534,19 @@ HEATMAP_FEATURE_CORRELATION_MATRIX:
|
|||
CORR_METHOD: "pearson" # choose from {"pearson", "kendall", "spearman"}
|
||||
|
||||
|
||||
########################################################################################################################
|
||||
# Data Cleaning #
|
||||
########################################################################################################################
|
||||
|
||||
DATA_CLEANING:
|
||||
COMPUTE: True
|
||||
COLS_NAN_THRESHOLD: 0.3
|
||||
COLS_VAR_THRESHOLD: True
|
||||
ROWS_NAN_THRESHOLD: 0.3
|
||||
DATA_YIELDED_HOURS_RATIO_THRESHOLD: 0.75
|
||||
CORR_VALID_PAIRS_THRESHOLD: 0.5
|
||||
CORR_THRESHOLD: 0.95
|
||||
|
||||
|
||||
########################################################################################################################
|
||||
# Analysis Workflow Example #
|
||||
|
@ -552,14 +565,6 @@ PARAMS_FOR_ANALYSIS:
|
|||
FOLDER: data/external/example_workflow
|
||||
CONTAINER: participant_target.csv
|
||||
|
||||
# Cleaning Parameters
|
||||
COLS_NAN_THRESHOLD: 0.3
|
||||
COLS_VAR_THRESHOLD: True
|
||||
ROWS_NAN_THRESHOLD: 0.3
|
||||
DATA_YIELDED_HOURS_RATIO_THRESHOLD: 0.75
|
||||
CORR_VALID_PAIRS_THRESHOLD: 0.5
|
||||
CORR_THRESHOLD: 0.95
|
||||
|
||||
MODEL_NAMES: [LogReg, kNN , SVM, DT, RF, GB, XGBoost, LightGBM]
|
||||
CV_METHODS: [LeaveOneOut]
|
||||
RESULT_COMPONENTS: [fold_predictions, fold_metrics, overall_results, fold_feature_importances]
|
||||
|
|
|
@ -761,22 +761,6 @@ rule fitbit_sleep_intraday_r_features:
|
|||
script:
|
||||
"../src/features/entry.R"
|
||||
|
||||
# Runs merge_sensor_features_for_individual_participants.R to produce
# data/processed/features/{pid}/all_sensor_features.csv for one participant.
# The input feature files come from
# input_merge_sensor_features_for_individual_participants, which is defined
# outside this view.
rule merge_sensor_features_for_individual_participants:
|
||||
input:
|
||||
feature_files = input_merge_sensor_features_for_individual_participants
|
||||
output:
|
||||
"data/processed/features/{pid}/all_sensor_features.csv"
|
||||
script:
|
||||
"../src/features/utils/merge_sensor_features_for_individual_participants.R"
|
||||
|
||||
# Runs merge_sensor_features_for_all_participants.R to produce
# data/processed/features/all_participants/all_sensor_features.csv.
# The inputs are the per-participant all_sensor_features.csv files, one for
# every pid listed in config["PIDS"].
rule merge_sensor_features_for_all_participants:
|
||||
input:
|
||||
feature_files = expand("data/processed/features/{pid}/all_sensor_features.csv", pid=config["PIDS"])
|
||||
output:
|
||||
"data/processed/features/all_participants/all_sensor_features.csv"
|
||||
script:
|
||||
"../src/features/utils/merge_sensor_features_for_all_participants.R"
|
||||
|
||||
rule empatica_accelerometer_python_features:
|
||||
input:
|
||||
sensor_data = "data/raw/{pid}/empatica_accelerometer_with_datetime.csv",
|
||||
|
@ -958,3 +942,49 @@ rule empatica_tags_r_features:
|
|||
"data/interim/{pid}/empatica_tags_features/empatica_tags_r_{provider_key}.csv"
|
||||
script:
|
||||
"../src/features/entry.R"
|
||||
|
||||
# Runs merge_sensor_features_for_individual_participants.R to produce
# data/processed/features/{pid}/all_sensor_features.csv for one participant.
# The input feature files come from
# input_merge_sensor_features_for_individual_participants, which is defined
# outside this view.
rule merge_sensor_features_for_individual_participants:
|
||||
input:
|
||||
feature_files = input_merge_sensor_features_for_individual_participants
|
||||
output:
|
||||
"data/processed/features/{pid}/all_sensor_features.csv"
|
||||
script:
|
||||
"../src/features/utils/merge_sensor_features_for_individual_participants.R"
|
||||
|
||||
# Runs merge_sensor_features_for_all_participants.R to produce
# data/processed/features/all_participants/all_sensor_features.csv.
# The inputs are the per-participant all_sensor_features.csv files, one for
# every pid listed in config["PIDS"].
rule merge_sensor_features_for_all_participants:
|
||||
input:
|
||||
feature_files = expand("data/processed/features/{pid}/all_sensor_features.csv", pid=config["PIDS"])
|
||||
output:
|
||||
"data/processed/features/all_participants/all_sensor_features.csv"
|
||||
script:
|
||||
"../src/features/utils/merge_sensor_features_for_all_participants.R"
|
||||
|
||||
# Runs clean_sensor_features.R (under src/features/utils) on the output of
# merge_sensor_features_for_individual_participants and writes
# data/processed/features/{pid}/all_sensor_features_cleaned.csv.
# Every cleaning threshold passed to the script is read from the top-level
# DATA_CLEANING section of the config.
rule clean_sensor_features_for_individual_participants:
|
||||
input:
|
||||
rules.merge_sensor_features_for_individual_participants.output
|
||||
params:
|
||||
cols_nan_threshold = config["DATA_CLEANING"]["COLS_NAN_THRESHOLD"],
|
||||
cols_var_threshold = config["DATA_CLEANING"]["COLS_VAR_THRESHOLD"],
|
||||
rows_nan_threshold = config["DATA_CLEANING"]["ROWS_NAN_THRESHOLD"],
|
||||
data_yielded_hours_ratio_threshold = config["DATA_CLEANING"]["DATA_YIELDED_HOURS_RATIO_THRESHOLD"],
|
||||
corr_valid_pairs_threshold = config["DATA_CLEANING"]["CORR_VALID_PAIRS_THRESHOLD"],
|
||||
corr_threshold = config["DATA_CLEANING"]["CORR_THRESHOLD"]
|
||||
output:
|
||||
"data/processed/features/{pid}/all_sensor_features_cleaned.csv"
|
||||
script:
|
||||
"../src/features/utils/clean_sensor_features.R"
|
||||
|
||||
# Runs clean_sensor_features.R (under src/features/utils) on the output of
# merge_sensor_features_for_all_participants and writes
# data/processed/features/all_participants/all_sensor_features_cleaned.csv.
# It uses the same script and the same DATA_CLEANING config thresholds as the
# per-participant cleaning rule.
rule clean_sensor_features_for_all_participants:
|
||||
input:
|
||||
rules.merge_sensor_features_for_all_participants.output
|
||||
params:
|
||||
cols_nan_threshold = config["DATA_CLEANING"]["COLS_NAN_THRESHOLD"],
|
||||
cols_var_threshold = config["DATA_CLEANING"]["COLS_VAR_THRESHOLD"],
|
||||
rows_nan_threshold = config["DATA_CLEANING"]["ROWS_NAN_THRESHOLD"],
|
||||
data_yielded_hours_ratio_threshold = config["DATA_CLEANING"]["DATA_YIELDED_HOURS_RATIO_THRESHOLD"],
|
||||
corr_valid_pairs_threshold = config["DATA_CLEANING"]["CORR_VALID_PAIRS_THRESHOLD"],
|
||||
corr_threshold = config["DATA_CLEANING"]["CORR_THRESHOLD"]
|
||||
output:
|
||||
"data/processed/features/all_participants/all_sensor_features_cleaned.csv"
|
||||
script:
|
||||
"../src/features/utils/clean_sensor_features.R"
|
||||
|
|
|
@ -53,36 +53,6 @@ rule parse_targets:
|
|||
script:
|
||||
"../src/models/workflow_example/parse_targets.py"
|
||||
|
||||
# Runs clean_sensor_features.R (under src/models/workflow_example) on the
# output of merge_sensor_features_for_individual_participants and writes
# data/processed/features/{pid}/all_sensor_features_cleaned.csv.
# Every cleaning threshold passed to the script is read from the
# PARAMS_FOR_ANALYSIS section of the config.
rule clean_sensor_features_for_individual_participants:
|
||||
input:
|
||||
rules.merge_sensor_features_for_individual_participants.output
|
||||
params:
|
||||
cols_nan_threshold = config["PARAMS_FOR_ANALYSIS"]["COLS_NAN_THRESHOLD"],
|
||||
cols_var_threshold = config["PARAMS_FOR_ANALYSIS"]["COLS_VAR_THRESHOLD"],
|
||||
rows_nan_threshold = config["PARAMS_FOR_ANALYSIS"]["ROWS_NAN_THRESHOLD"],
|
||||
data_yielded_hours_ratio_threshold = config["PARAMS_FOR_ANALYSIS"]["DATA_YIELDED_HOURS_RATIO_THRESHOLD"],
|
||||
corr_valid_pairs_threshold = config["PARAMS_FOR_ANALYSIS"]["CORR_VALID_PAIRS_THRESHOLD"],
|
||||
corr_threshold = config["PARAMS_FOR_ANALYSIS"]["CORR_THRESHOLD"]
|
||||
output:
|
||||
"data/processed/features/{pid}/all_sensor_features_cleaned.csv"
|
||||
script:
|
||||
"../src/models/workflow_example/clean_sensor_features.R"
|
||||
|
||||
# Runs clean_sensor_features.R (under src/models/workflow_example) on the
# output of merge_sensor_features_for_all_participants and writes
# data/processed/features/all_participants/all_sensor_features_cleaned.csv.
# Every cleaning threshold passed to the script is read from the
# PARAMS_FOR_ANALYSIS section of the config.
rule clean_sensor_features_for_all_participants:
|
||||
input:
|
||||
rules.merge_sensor_features_for_all_participants.output
|
||||
params:
|
||||
cols_nan_threshold = config["PARAMS_FOR_ANALYSIS"]["COLS_NAN_THRESHOLD"],
|
||||
cols_var_threshold = config["PARAMS_FOR_ANALYSIS"]["COLS_VAR_THRESHOLD"],
|
||||
rows_nan_threshold = config["PARAMS_FOR_ANALYSIS"]["ROWS_NAN_THRESHOLD"],
|
||||
data_yielded_hours_ratio_threshold = config["PARAMS_FOR_ANALYSIS"]["DATA_YIELDED_HOURS_RATIO_THRESHOLD"],
|
||||
corr_valid_pairs_threshold = config["PARAMS_FOR_ANALYSIS"]["CORR_VALID_PAIRS_THRESHOLD"],
|
||||
corr_threshold = config["PARAMS_FOR_ANALYSIS"]["CORR_THRESHOLD"]
|
||||
output:
|
||||
"data/processed/features/all_participants/all_sensor_features_cleaned.csv"
|
||||
script:
|
||||
"../src/models/workflow_example/clean_sensor_features.R"
|
||||
|
||||
rule merge_features_and_targets_for_individual_model:
|
||||
input:
|
||||
cleaned_sensor_features = "data/processed/features/{pid}/all_sensor_features_cleaned.csv",
|
||||
|
|
|
@ -25,12 +25,6 @@ if(nrow(clean_features))
|
|||
if(drop_zero_variance_columns)
|
||||
clean_features <- clean_features %>% select_if(grepl("pid|local_segment|local_segment_label|local_segment_start_datetime|local_segment_end_datetime",names(.)) | sapply(., n_distinct, na.rm = T) > 1)
|
||||
|
||||
# drop rows whose fraction of NA values is equal to or above rows_nan_threshold
|
||||
clean_features <- clean_features %>%
|
||||
mutate(percentage_na = rowSums(is.na(.)) / ncol(.)) %>%
|
||||
filter(percentage_na < rows_nan_threshold) %>%
|
||||
select(-percentage_na)
|
||||
|
||||
# drop highly correlated features
|
||||
features_for_corr <- clean_features %>%
|
||||
select_if(is.numeric) %>%
|
||||
|
@ -47,4 +41,10 @@ highly_correlated_features <- features_for_corr %>%
|
|||
|
||||
clean_features <- clean_features[, !names(clean_features) %in% highly_correlated_features]
|
||||
|
||||
# drop rows whose fraction of NA values is equal to or above rows_nan_threshold
|
||||
clean_features <- clean_features %>%
|
||||
mutate(percentage_na = rowSums(is.na(.)) / ncol(.)) %>%
|
||||
filter(percentage_na < rows_nan_threshold) %>%
|
||||
select(-percentage_na)
|
||||
|
||||
write.csv(clean_features, snakemake@output[[1]], row.names = FALSE)
|
|
@ -36,6 +36,7 @@ required:
|
|||
- HEATMAP_SENSORS_PER_MINUTE_PER_TIME_SEGMENT
|
||||
- HEATMAP_SENSOR_ROW_COUNT_PER_TIME_SEGMENT
|
||||
- HEATMAP_FEATURE_CORRELATION_MATRIX
|
||||
- DATA_CLEANING
|
||||
|
||||
definitions:
|
||||
PROVIDER:
|
||||
|
@ -1256,3 +1257,32 @@ properties:
|
|||
CORR_METHOD:
|
||||
type: string
|
||||
enum: ["pearson", "kendall", "spearman"]
|
||||
|
||||
DATA_CLEANING:
|
||||
type: object
|
||||
required: [COMPUTE, COLS_NAN_THRESHOLD, COLS_VAR_THRESHOLD, ROWS_NAN_THRESHOLD, DATA_YIELDED_HOURS_RATIO_THRESHOLD, CORR_VALID_PAIRS_THRESHOLD, CORR_THRESHOLD]
|
||||
properties:
|
||||
COMPUTE:
|
||||
type: boolean
|
||||
COLS_NAN_THRESHOLD:
|
||||
type: number
|
||||
minimum: 0
|
||||
maximum: 1
|
||||
COLS_VAR_THRESHOLD:
|
||||
type: boolean
|
||||
ROWS_NAN_THRESHOLD:
|
||||
type: number
|
||||
minimum: 0
|
||||
maximum: 1
|
||||
DATA_YIELDED_HOURS_RATIO_THRESHOLD:
|
||||
type: number
|
||||
minimum: 0
|
||||
maximum: 1
|
||||
CORR_VALID_PAIRS_THRESHOLD:
|
||||
type: number
|
||||
minimum: 0
|
||||
maximum: 1
|
||||
CORR_THRESHOLD:
|
||||
type: number
|
||||
minimum: 0
|
||||
maximum: 1
|
||||
|
|
Loading…
Reference in New Issue