Move the data cleaning module from example workflow to main directory
parent
8e3d5eb98c
commit
4a7989c058
|
@ -394,6 +394,10 @@ if config["HEATMAP_PHONE_DATA_YIELD_PER_PARTICIPANT_PER_TIME_SEGMENT"]["PLOT"]:
|
||||||
if config["HEATMAP_FEATURE_CORRELATION_MATRIX"]["PLOT"]:
|
if config["HEATMAP_FEATURE_CORRELATION_MATRIX"]["PLOT"]:
|
||||||
files_to_compute.append("reports/data_exploration/heatmap_feature_correlation_matrix.html")
|
files_to_compute.append("reports/data_exploration/heatmap_feature_correlation_matrix.html")
|
||||||
|
|
||||||
|
# Data Cleaning
|
||||||
|
if config["DATA_CLEANING"]["COMPUTE"]:
|
||||||
|
files_to_compute.extend(expand("data/processed/features/{pid}/all_sensor_features_cleaned.csv", pid=config["PIDS"]))
|
||||||
|
files_to_compute.append("data/processed/features/all_participants/all_sensor_features_cleaned.csv")
|
||||||
|
|
||||||
rule all:
|
rule all:
|
||||||
input:
|
input:
|
||||||
|
|
14
config.yaml
14
config.yaml
|
@ -564,3 +564,17 @@ HEATMAP_FEATURE_CORRELATION_MATRIX:
|
||||||
CORR_THRESHOLD: 0.1
|
CORR_THRESHOLD: 0.1
|
||||||
CORR_METHOD: "pearson" # choose from {"pearson", "kendall", "spearman"}
|
CORR_METHOD: "pearson" # choose from {"pearson", "kendall", "spearman"}
|
||||||
|
|
||||||
|
|
||||||
|
########################################################################################################################
|
||||||
|
# Data Cleaning #
|
||||||
|
########################################################################################################################
|
||||||
|
|
||||||
|
DATA_CLEANING:
|
||||||
|
COMPUTE: False
|
||||||
|
COLS_NAN_THRESHOLD: 0.3
|
||||||
|
COLS_VAR_THRESHOLD: True
|
||||||
|
ROWS_NAN_THRESHOLD: 0.3
|
||||||
|
DATA_YIELDED_HOURS_RATIO_THRESHOLD: 0.75
|
||||||
|
CORR_VALID_PAIRS_THRESHOLD: 0.5
|
||||||
|
CORR_THRESHOLD: 0.95
|
||||||
|
|
||||||
|
|
|
@ -384,6 +384,11 @@ if config["HEATMAP_PHONE_DATA_YIELD_PER_PARTICIPANT_PER_TIME_SEGMENT"]["PLOT"]:
|
||||||
if config["HEATMAP_FEATURE_CORRELATION_MATRIX"]["PLOT"]:
|
if config["HEATMAP_FEATURE_CORRELATION_MATRIX"]["PLOT"]:
|
||||||
files_to_compute.append("reports/data_exploration/heatmap_feature_correlation_matrix.html")
|
files_to_compute.append("reports/data_exploration/heatmap_feature_correlation_matrix.html")
|
||||||
|
|
||||||
|
# Data Cleaning
|
||||||
|
if config["DATA_CLEANING"]["COMPUTE"]:
|
||||||
|
files_to_compute.extend(expand("data/processed/features/{pid}/all_sensor_features_cleaned.csv", pid=config["PIDS"]))
|
||||||
|
files_to_compute.append("data/processed/features/all_participants/all_sensor_features_cleaned.csv")
|
||||||
|
|
||||||
# Analysis Workflow Example
|
# Analysis Workflow Example
|
||||||
models, scalers = [], []
|
models, scalers = [], []
|
||||||
for model_name in config["PARAMS_FOR_ANALYSIS"]["MODEL_NAMES"]:
|
for model_name in config["PARAMS_FOR_ANALYSIS"]["MODEL_NAMES"]:
|
||||||
|
@ -401,7 +406,6 @@ files_to_compute.extend(expand("data/raw/{pid}/participant_target_with_datetime.
|
||||||
files_to_compute.extend(expand("data/processed/targets/{pid}/parsed_targets.csv", pid=config["PIDS"]))
|
files_to_compute.extend(expand("data/processed/targets/{pid}/parsed_targets.csv", pid=config["PIDS"]))
|
||||||
|
|
||||||
# Individual model
|
# Individual model
|
||||||
files_to_compute.extend(expand("data/processed/features/{pid}/all_sensor_features_cleaned.csv", pid=config["PIDS"]))
|
|
||||||
files_to_compute.extend(expand("data/processed/models/individual_model/{pid}/input.csv", pid=config["PIDS"]))
|
files_to_compute.extend(expand("data/processed/models/individual_model/{pid}/input.csv", pid=config["PIDS"]))
|
||||||
files_to_compute.extend(expand("data/processed/models/individual_model/{pid}/output_{cv_method}/baselines.csv", pid=config["PIDS"], cv_method=config["PARAMS_FOR_ANALYSIS"]["CV_METHODS"]))
|
files_to_compute.extend(expand("data/processed/models/individual_model/{pid}/output_{cv_method}/baselines.csv", pid=config["PIDS"], cv_method=config["PARAMS_FOR_ANALYSIS"]["CV_METHODS"]))
|
||||||
files_to_compute.extend(expand(
|
files_to_compute.extend(expand(
|
||||||
|
@ -414,7 +418,6 @@ files_to_compute.extend(expand(
|
||||||
scaler=scalers))
|
scaler=scalers))
|
||||||
|
|
||||||
# Population model
|
# Population model
|
||||||
files_to_compute.append("data/processed/features/all_participants/all_sensor_features_cleaned.csv")
|
|
||||||
files_to_compute.append("data/processed/models/population_model/input.csv")
|
files_to_compute.append("data/processed/models/population_model/input.csv")
|
||||||
files_to_compute.extend(expand("data/processed/models/population_model/output_{cv_method}/baselines.csv", cv_method=config["PARAMS_FOR_ANALYSIS"]["CV_METHODS"]))
|
files_to_compute.extend(expand("data/processed/models/population_model/output_{cv_method}/baselines.csv", cv_method=config["PARAMS_FOR_ANALYSIS"]["CV_METHODS"]))
|
||||||
files_to_compute.extend(expand(
|
files_to_compute.extend(expand(
|
||||||
|
|
|
@ -534,6 +534,19 @@ HEATMAP_FEATURE_CORRELATION_MATRIX:
|
||||||
CORR_METHOD: "pearson" # choose from {"pearson", "kendall", "spearman"}
|
CORR_METHOD: "pearson" # choose from {"pearson", "kendall", "spearman"}
|
||||||
|
|
||||||
|
|
||||||
|
########################################################################################################################
|
||||||
|
# Data Cleaning #
|
||||||
|
########################################################################################################################
|
||||||
|
|
||||||
|
DATA_CLEANING:
|
||||||
|
COMPUTE: True
|
||||||
|
COLS_NAN_THRESHOLD: 0.3
|
||||||
|
COLS_VAR_THRESHOLD: True
|
||||||
|
ROWS_NAN_THRESHOLD: 0.3
|
||||||
|
DATA_YIELDED_HOURS_RATIO_THRESHOLD: 0.75
|
||||||
|
CORR_VALID_PAIRS_THRESHOLD: 0.5
|
||||||
|
CORR_THRESHOLD: 0.95
|
||||||
|
|
||||||
|
|
||||||
########################################################################################################################
|
########################################################################################################################
|
||||||
# Analysis Workflow Example #
|
# Analysis Workflow Example #
|
||||||
|
@ -552,14 +565,6 @@ PARAMS_FOR_ANALYSIS:
|
||||||
FOLDER: data/external/example_workflow
|
FOLDER: data/external/example_workflow
|
||||||
CONTAINER: participant_target.csv
|
CONTAINER: participant_target.csv
|
||||||
|
|
||||||
# Cleaning Parameters
|
|
||||||
COLS_NAN_THRESHOLD: 0.3
|
|
||||||
COLS_VAR_THRESHOLD: True
|
|
||||||
ROWS_NAN_THRESHOLD: 0.3
|
|
||||||
DATA_YIELDED_HOURS_RATIO_THRESHOLD: 0.75
|
|
||||||
CORR_VALID_PAIRS_THRESHOLD: 0.5
|
|
||||||
CORR_THRESHOLD: 0.95
|
|
||||||
|
|
||||||
MODEL_NAMES: [LogReg, kNN , SVM, DT, RF, GB, XGBoost, LightGBM]
|
MODEL_NAMES: [LogReg, kNN , SVM, DT, RF, GB, XGBoost, LightGBM]
|
||||||
CV_METHODS: [LeaveOneOut]
|
CV_METHODS: [LeaveOneOut]
|
||||||
RESULT_COMPONENTS: [fold_predictions, fold_metrics, overall_results, fold_feature_importances]
|
RESULT_COMPONENTS: [fold_predictions, fold_metrics, overall_results, fold_feature_importances]
|
||||||
|
|
|
@ -761,22 +761,6 @@ rule fitbit_sleep_intraday_r_features:
|
||||||
script:
|
script:
|
||||||
"../src/features/entry.R"
|
"../src/features/entry.R"
|
||||||
|
|
||||||
rule merge_sensor_features_for_individual_participants:
|
|
||||||
input:
|
|
||||||
feature_files = input_merge_sensor_features_for_individual_participants
|
|
||||||
output:
|
|
||||||
"data/processed/features/{pid}/all_sensor_features.csv"
|
|
||||||
script:
|
|
||||||
"../src/features/utils/merge_sensor_features_for_individual_participants.R"
|
|
||||||
|
|
||||||
rule merge_sensor_features_for_all_participants:
|
|
||||||
input:
|
|
||||||
feature_files = expand("data/processed/features/{pid}/all_sensor_features.csv", pid=config["PIDS"])
|
|
||||||
output:
|
|
||||||
"data/processed/features/all_participants/all_sensor_features.csv"
|
|
||||||
script:
|
|
||||||
"../src/features/utils/merge_sensor_features_for_all_participants.R"
|
|
||||||
|
|
||||||
rule empatica_accelerometer_python_features:
|
rule empatica_accelerometer_python_features:
|
||||||
input:
|
input:
|
||||||
sensor_data = "data/raw/{pid}/empatica_accelerometer_with_datetime.csv",
|
sensor_data = "data/raw/{pid}/empatica_accelerometer_with_datetime.csv",
|
||||||
|
@ -958,3 +942,49 @@ rule empatica_tags_r_features:
|
||||||
"data/interim/{pid}/empatica_tags_features/empatica_tags_r_{provider_key}.csv"
|
"data/interim/{pid}/empatica_tags_features/empatica_tags_r_{provider_key}.csv"
|
||||||
script:
|
script:
|
||||||
"../src/features/entry.R"
|
"../src/features/entry.R"
|
||||||
|
|
||||||
|
rule merge_sensor_features_for_individual_participants:
|
||||||
|
input:
|
||||||
|
feature_files = input_merge_sensor_features_for_individual_participants
|
||||||
|
output:
|
||||||
|
"data/processed/features/{pid}/all_sensor_features.csv"
|
||||||
|
script:
|
||||||
|
"../src/features/utils/merge_sensor_features_for_individual_participants.R"
|
||||||
|
|
||||||
|
rule merge_sensor_features_for_all_participants:
|
||||||
|
input:
|
||||||
|
feature_files = expand("data/processed/features/{pid}/all_sensor_features.csv", pid=config["PIDS"])
|
||||||
|
output:
|
||||||
|
"data/processed/features/all_participants/all_sensor_features.csv"
|
||||||
|
script:
|
||||||
|
"../src/features/utils/merge_sensor_features_for_all_participants.R"
|
||||||
|
|
||||||
|
rule clean_sensor_features_for_individual_participants:
|
||||||
|
input:
|
||||||
|
rules.merge_sensor_features_for_individual_participants.output
|
||||||
|
params:
|
||||||
|
cols_nan_threshold = config["DATA_CLEANING"]["COLS_NAN_THRESHOLD"],
|
||||||
|
cols_var_threshold = config["DATA_CLEANING"]["COLS_VAR_THRESHOLD"],
|
||||||
|
rows_nan_threshold = config["DATA_CLEANING"]["ROWS_NAN_THRESHOLD"],
|
||||||
|
data_yielded_hours_ratio_threshold = config["DATA_CLEANING"]["DATA_YIELDED_HOURS_RATIO_THRESHOLD"],
|
||||||
|
corr_valid_pairs_threshold = config["DATA_CLEANING"]["CORR_VALID_PAIRS_THRESHOLD"],
|
||||||
|
corr_threshold = config["DATA_CLEANING"]["CORR_THRESHOLD"]
|
||||||
|
output:
|
||||||
|
"data/processed/features/{pid}/all_sensor_features_cleaned.csv"
|
||||||
|
script:
|
||||||
|
"../src/features/utils/clean_sensor_features.R"
|
||||||
|
|
||||||
|
rule clean_sensor_features_for_all_participants:
|
||||||
|
input:
|
||||||
|
rules.merge_sensor_features_for_all_participants.output
|
||||||
|
params:
|
||||||
|
cols_nan_threshold = config["DATA_CLEANING"]["COLS_NAN_THRESHOLD"],
|
||||||
|
cols_var_threshold = config["DATA_CLEANING"]["COLS_VAR_THRESHOLD"],
|
||||||
|
rows_nan_threshold = config["DATA_CLEANING"]["ROWS_NAN_THRESHOLD"],
|
||||||
|
data_yielded_hours_ratio_threshold = config["DATA_CLEANING"]["DATA_YIELDED_HOURS_RATIO_THRESHOLD"],
|
||||||
|
corr_valid_pairs_threshold = config["DATA_CLEANING"]["CORR_VALID_PAIRS_THRESHOLD"],
|
||||||
|
corr_threshold = config["DATA_CLEANING"]["CORR_THRESHOLD"]
|
||||||
|
output:
|
||||||
|
"data/processed/features/all_participants/all_sensor_features_cleaned.csv"
|
||||||
|
script:
|
||||||
|
"../src/features/utils/clean_sensor_features.R"
|
||||||
|
|
|
@ -53,36 +53,6 @@ rule parse_targets:
|
||||||
script:
|
script:
|
||||||
"../src/models/workflow_example/parse_targets.py"
|
"../src/models/workflow_example/parse_targets.py"
|
||||||
|
|
||||||
rule clean_sensor_features_for_individual_participants:
|
|
||||||
input:
|
|
||||||
rules.merge_sensor_features_for_individual_participants.output
|
|
||||||
params:
|
|
||||||
cols_nan_threshold = config["PARAMS_FOR_ANALYSIS"]["COLS_NAN_THRESHOLD"],
|
|
||||||
cols_var_threshold = config["PARAMS_FOR_ANALYSIS"]["COLS_VAR_THRESHOLD"],
|
|
||||||
rows_nan_threshold = config["PARAMS_FOR_ANALYSIS"]["ROWS_NAN_THRESHOLD"],
|
|
||||||
data_yielded_hours_ratio_threshold = config["PARAMS_FOR_ANALYSIS"]["DATA_YIELDED_HOURS_RATIO_THRESHOLD"],
|
|
||||||
corr_valid_pairs_threshold = config["PARAMS_FOR_ANALYSIS"]["CORR_VALID_PAIRS_THRESHOLD"],
|
|
||||||
corr_threshold = config["PARAMS_FOR_ANALYSIS"]["CORR_THRESHOLD"]
|
|
||||||
output:
|
|
||||||
"data/processed/features/{pid}/all_sensor_features_cleaned.csv"
|
|
||||||
script:
|
|
||||||
"../src/models/workflow_example/clean_sensor_features.R"
|
|
||||||
|
|
||||||
rule clean_sensor_features_for_all_participants:
|
|
||||||
input:
|
|
||||||
rules.merge_sensor_features_for_all_participants.output
|
|
||||||
params:
|
|
||||||
cols_nan_threshold = config["PARAMS_FOR_ANALYSIS"]["COLS_NAN_THRESHOLD"],
|
|
||||||
cols_var_threshold = config["PARAMS_FOR_ANALYSIS"]["COLS_VAR_THRESHOLD"],
|
|
||||||
rows_nan_threshold = config["PARAMS_FOR_ANALYSIS"]["ROWS_NAN_THRESHOLD"],
|
|
||||||
data_yielded_hours_ratio_threshold = config["PARAMS_FOR_ANALYSIS"]["DATA_YIELDED_HOURS_RATIO_THRESHOLD"],
|
|
||||||
corr_valid_pairs_threshold = config["PARAMS_FOR_ANALYSIS"]["CORR_VALID_PAIRS_THRESHOLD"],
|
|
||||||
corr_threshold = config["PARAMS_FOR_ANALYSIS"]["CORR_THRESHOLD"]
|
|
||||||
output:
|
|
||||||
"data/processed/features/all_participants/all_sensor_features_cleaned.csv"
|
|
||||||
script:
|
|
||||||
"../src/models/workflow_example/clean_sensor_features.R"
|
|
||||||
|
|
||||||
rule merge_features_and_targets_for_individual_model:
|
rule merge_features_and_targets_for_individual_model:
|
||||||
input:
|
input:
|
||||||
cleaned_sensor_features = "data/processed/features/{pid}/all_sensor_features_cleaned.csv",
|
cleaned_sensor_features = "data/processed/features/{pid}/all_sensor_features_cleaned.csv",
|
||||||
|
|
|
@ -25,12 +25,6 @@ if(nrow(clean_features))
|
||||||
if(drop_zero_variance_columns)
|
if(drop_zero_variance_columns)
|
||||||
clean_features <- clean_features %>% select_if(grepl("pid|local_segment|local_segment_label|local_segment_start_datetime|local_segment_end_datetime",names(.)) | sapply(., n_distinct, na.rm = T) > 1)
|
clean_features <- clean_features %>% select_if(grepl("pid|local_segment|local_segment_label|local_segment_start_datetime|local_segment_end_datetime",names(.)) | sapply(., n_distinct, na.rm = T) > 1)
|
||||||
|
|
||||||
# drop rows with a percentage of NA values above rows_nan_threshold
|
|
||||||
clean_features <- clean_features %>%
|
|
||||||
mutate(percentage_na = rowSums(is.na(.)) / ncol(.)) %>%
|
|
||||||
filter(percentage_na < rows_nan_threshold) %>%
|
|
||||||
select(-percentage_na)
|
|
||||||
|
|
||||||
# drop highly correlated features
|
# drop highly correlated features
|
||||||
features_for_corr <- clean_features %>%
|
features_for_corr <- clean_features %>%
|
||||||
select_if(is.numeric) %>%
|
select_if(is.numeric) %>%
|
||||||
|
@ -47,4 +41,10 @@ highly_correlated_features <- features_for_corr %>%
|
||||||
|
|
||||||
clean_features <- clean_features[, !names(clean_features) %in% highly_correlated_features]
|
clean_features <- clean_features[, !names(clean_features) %in% highly_correlated_features]
|
||||||
|
|
||||||
|
# drop rows with a percentage of NA values above rows_nan_threshold
|
||||||
|
clean_features <- clean_features %>%
|
||||||
|
mutate(percentage_na = rowSums(is.na(.)) / ncol(.)) %>%
|
||||||
|
filter(percentage_na < rows_nan_threshold) %>%
|
||||||
|
select(-percentage_na)
|
||||||
|
|
||||||
write.csv(clean_features, snakemake@output[[1]], row.names = FALSE)
|
write.csv(clean_features, snakemake@output[[1]], row.names = FALSE)
|
|
@ -36,6 +36,7 @@ required:
|
||||||
- HEATMAP_SENSORS_PER_MINUTE_PER_TIME_SEGMENT
|
- HEATMAP_SENSORS_PER_MINUTE_PER_TIME_SEGMENT
|
||||||
- HEATMAP_SENSOR_ROW_COUNT_PER_TIME_SEGMENT
|
- HEATMAP_SENSOR_ROW_COUNT_PER_TIME_SEGMENT
|
||||||
- HEATMAP_FEATURE_CORRELATION_MATRIX
|
- HEATMAP_FEATURE_CORRELATION_MATRIX
|
||||||
|
- DATA_CLEANING
|
||||||
|
|
||||||
definitions:
|
definitions:
|
||||||
PROVIDER:
|
PROVIDER:
|
||||||
|
@ -1256,3 +1257,32 @@ properties:
|
||||||
CORR_METHOD:
|
CORR_METHOD:
|
||||||
type: string
|
type: string
|
||||||
enum: ["pearson", "kendall", "spearman"]
|
enum: ["pearson", "kendall", "spearman"]
|
||||||
|
|
||||||
|
DATA_CLEANING:
|
||||||
|
type: object
|
||||||
|
required: [COMPUTE, COLS_NAN_THRESHOLD, COLS_VAR_THRESHOLD, ROWS_NAN_THRESHOLD, DATA_YIELDED_HOURS_RATIO_THRESHOLD, CORR_VALID_PAIRS_THRESHOLD, CORR_THRESHOLD]
|
||||||
|
properties:
|
||||||
|
COMPUTE:
|
||||||
|
type: boolean
|
||||||
|
COLS_NAN_THRESHOLD:
|
||||||
|
type: number
|
||||||
|
minimum: 0
|
||||||
|
maximum: 1
|
||||||
|
COLS_VAR_THRESHOLD:
|
||||||
|
type: boolean
|
||||||
|
ROWS_NAN_THRESHOLD:
|
||||||
|
type: number
|
||||||
|
minimum: 0
|
||||||
|
maximum: 1
|
||||||
|
DATA_YIELDED_HOURS_RATIO_THRESHOLD:
|
||||||
|
type: number
|
||||||
|
minimum: 0
|
||||||
|
maximum: 1
|
||||||
|
CORR_VALID_PAIRS_THRESHOLD:
|
||||||
|
type: number
|
||||||
|
minimum: 0
|
||||||
|
maximum: 1
|
||||||
|
CORR_THRESHOLD:
|
||||||
|
type: number
|
||||||
|
minimum: 0
|
||||||
|
maximum: 1
|
||||||
|
|
Loading…
Reference in New Issue