From e13b89b125a4dd342c6404d67e1cb39af4f7b7f5 Mon Sep 17 00:00:00 2001 From: Meng Li <34143965+Meng6@users.noreply.github.com> Date: Mon, 3 Aug 2020 13:09:16 -0400 Subject: [PATCH] Add restore_sql_file rule; notsummarised module; diff platforms for heatmap_days_by_sensors --- config.yaml | 7 ++--- reports/figures/.gitkeep | 0 rules/models.snakefile | 5 ++-- rules/preprocessing.snakefile | 12 +++++++++ rules/reports.snakefile | 23 +++++++++++++++- src/data/restore_sql_file.py | 21 +++++++++++++++ src/models/merge_features_and_targets.py | 28 +++++++++++++++----- src/models/modeling.py | 8 +++++- src/models/targets.py | 9 ++++++- src/visualization/heatmap_days_by_sensors.py | 11 +++++--- 10 files changed, 107 insertions(+), 17 deletions(-) delete mode 100644 reports/figures/.gitkeep create mode 100644 src/data/restore_sql_file.py diff --git a/config.yaml b/config.yaml index 558af929..18d3eda5 100644 --- a/config.yaml +++ b/config.yaml @@ -148,7 +148,7 @@ APPLICATIONS_FOREGROUND: social: ["socialnetworks", "socialmediatools"] entertainment: ["entertainment", "gamingknowledge", "gamingcasual", "gamingadventure", "gamingstrategy", "gamingtoolscommunity", "gamingroleplaying", "gamingaction", "gaminglogic", "gamingsports", "gamingsimulation"] SINGLE_APPS: ["top1global", "com.facebook.moments", "com.google.android.youtube", "com.twitter.android"] # There's no entropy for single apps - EXCLUDED_CATEGORIES: ["system_apps", "tvvideoapps"] + EXCLUDED_CATEGORIES: ["system_apps"] EXCLUDED_APPS: ["com.fitbit.FitbitMobile", "com.aware.plugin.upmc.cancer"] FEATURES: ["count", "timeoffirstuse", "timeoflastuse", "frequencyentropy"] @@ -226,7 +226,7 @@ HEATMAP_DAYS_BY_SENSORS: MIN_VALID_HOURS_PER_DAY: *min_valid_hours_per_day MIN_VALID_BINS_PER_HOUR: *min_valid_bins_per_hour EXPECTED_NUM_OF_DAYS: -1 - PHONE_SENSORS_TABLES: ["accelerometer", "applications_foreground", "battery", "calls", "light", "locations", "messages", "screen", "plugin_google_activity_recognition", 
"plugin_studentlife_audio_android"] + SENSORS: [accelerometer, activity_recognition, applications_foreground, conversation, battery, bluetooth, calls, light, locations, messages, screen] HEATMAP_SENSED_BINS: PLOT: False @@ -244,9 +244,10 @@ OVERALL_COMPLIANCE_HEATMAP: PARAMS_FOR_ANALYSIS: COMPUTE: False GROUNDTRUTH_TABLE: participant_info + TARGET_TABLE: participant_target SOURCES: &sources ["phone_features", "fitbit_features", "phone_fitbit_features"] DAY_SEGMENTS: *day_segments - PHONE_FEATURES: [accelerometer, activity_recognition, applications_foreground, battery, calls_incoming, calls_missed, calls_outgoing, conversation, light, location_doryab, messages_received, messages_sent, screen] + PHONE_FEATURES: [accelerometer, activity_recognition, applications_foreground, battery, bluetooth, calls_incoming, calls_missed, calls_outgoing, conversation, light, location_doryab, messages_received, messages_sent, screen] FITBIT_FEATURES: [fitbit_heartrate, fitbit_step, fitbit_sleep] PHONE_FITBIT_FEATURES: "" # This array is merged in the input_merge_features_of_single_participant function in models.snakefile DEMOGRAPHIC_FEATURES: [age, gender, inpatientdays] diff --git a/reports/figures/.gitkeep b/reports/figures/.gitkeep deleted file mode 100644 index e69de29b..00000000 diff --git a/rules/models.snakefile b/rules/models.snakefile index e8cd6dc6..adc6e5d5 100644 --- a/rules/models.snakefile +++ b/rules/models.snakefile @@ -14,7 +14,7 @@ rule days_to_analyse: rule targets: input: - participant_info = "data/raw/{pid}/" + config["PARAMS_FOR_ANALYSIS"]["GROUNDTRUTH_TABLE"] + "_raw.csv" + participant_info = "data/raw/{pid}/" + config["PARAMS_FOR_ANALYSIS"]["TARGET_TABLE"] + "_raw.csv" params: pid = "{pid}", summarised = "{summarised}", @@ -142,7 +142,8 @@ rule merge_features_and_targets: summarised = "{summarised}", cols_var_threshold = "{cols_var_threshold}", numerical_operators = config["PARAMS_FOR_ANALYSIS"]["NUMERICAL_OPERATORS"], - categorical_operators = 
config["PARAMS_FOR_ANALYSIS"]["CATEGORICAL_OPERATORS"] + categorical_operators = config["PARAMS_FOR_ANALYSIS"]["CATEGORICAL_OPERATORS"], + features_exclude_day_idx = config["PARAMS_FOR_ANALYSIS"]["FEATURES_EXCLUDE_DAY_IDX"], output: "data/processed/data_for_population_model/{min_valid_hours_per_day}hours_{min_valid_bins_per_hour}bins/{rows_nan_threshold}|{cols_nan_threshold}_{days_before_threshold}|{days_after_threshold}_{cols_var_threshold}/{source}_{day_segment}_{summarised}.csv" script: diff --git a/rules/preprocessing.snakefile b/rules/preprocessing.snakefile index 25338202..c8669190 100644 --- a/rules/preprocessing.snakefile +++ b/rules/preprocessing.snakefile @@ -1,3 +1,14 @@ +rule restore_sql_file: + input: + sql_file = "data/external/rapids_example.sql", + db_credentials = ".env" + params: + group = config["DOWNLOAD_PARTICIPANTS"]["GROUP"] + output: + touch("data/interim/restore_sql_file.done") + script: + "../src/data/restore_sql_file.py" + rule download_participants: params: group = config["DOWNLOAD_PARTICIPANTS"]["GROUP"], @@ -23,6 +34,7 @@ rule download_dataset: PHONE_SENSORS = [] PHONE_SENSORS.extend([config["MESSAGES"]["DB_TABLE"], config["CALLS"]["DB_TABLE"], config["BARNETT_LOCATION"]["DB_TABLE"], config["DORYAB_LOCATION"]["DB_TABLE"], config["BLUETOOTH"]["DB_TABLE"], config["BATTERY"]["DB_TABLE"], config["SCREEN"]["DB_TABLE"], config["LIGHT"]["DB_TABLE"], config["ACCELEROMETER"]["DB_TABLE"], config["APPLICATIONS_FOREGROUND"]["DB_TABLE"], config["CONVERSATION"]["DB_TABLE"]["ANDROID"], config["CONVERSATION"]["DB_TABLE"]["IOS"], config["ACTIVITY_RECOGNITION"]["DB_TABLE"]["ANDROID"], config["ACTIVITY_RECOGNITION"]["DB_TABLE"]["IOS"]]) +PHONE_SENSORS.extend(config["PHONE_VALID_SENSED_BINS"]["TABLES"]) if len(config["WIFI"]["DB_TABLE"]["VISIBLE_ACCESS_POINTS"]) > 0: PHONE_SENSORS.append(config["WIFI"]["DB_TABLE"]["VISIBLE_ACCESS_POINTS"]) diff --git a/rules/reports.snakefile b/rules/reports.snakefile index 7229a95e..07a198af 100644 --- 
a/rules/reports.snakefile +++ b/rules/reports.snakefile @@ -1,3 +1,24 @@ +def optional_heatmap_days_by_sensors_input(wildcards): + with open("data/external/"+wildcards.pid, encoding="ISO-8859-1") as external_file: + external_file_content = external_file.readlines() + platforms = external_file_content[1].strip().split(",") + if platforms[0] == "multiple" or (len(platforms) > 1 and "android" in platforms and "ios" in platforms): + platform = "android" + else: + platform = platforms[0] + + input_for_heatmap_days_by_sensors = [] + for sensor in config["HEATMAP_DAYS_BY_SENSORS"]["SENSORS"]: + if sensor == "activity_recognition" or sensor == "conversation": + if sensor.upper() not in config: + raise ValueError("Please check SENSORS parameter in HEATMAP_DAYS_BY_SENSORS section of config.yaml") + if platform not in ["android", "ios"]: + raise ValueError("Platform (line 2) in a participant file should be 'android', 'ios', or 'multiple'. You typed '" + str(platforms) + "'") + input_for_heatmap_days_by_sensors.append("data/raw/{pid}/" + config[sensor.upper()]["DB_TABLE"][platform.upper()] + "_with_datetime.csv") + else: + input_for_heatmap_days_by_sensors.append("data/raw/{pid}/" + sensor + "_with_datetime.csv") + return input_for_heatmap_days_by_sensors + rule heatmap_features_correlations: input: features = expand("data/processed/{pid}/{sensor}_{day_segment}.csv", pid=config["PIDS"], sensor=config["HEATMAP_FEATURES_CORRELATIONS"]["PHONE_FEATURES"]+config["HEATMAP_FEATURES_CORRELATIONS"]["FITBIT_FEATURES"], day_segment=config["DAY_SEGMENTS"]), @@ -21,7 +42,7 @@ rule histogram_valid_sensed_hours: rule heatmap_days_by_sensors: input: - sensors = expand("data/raw/{{pid}}/{sensor}_with_datetime.csv", sensor=config["HEATMAP_DAYS_BY_SENSORS"]["PHONE_SENSORS_TABLES"]), + sensors = optional_heatmap_days_by_sensors_input, phone_valid_sensed_days = "data/interim/{pid}/phone_valid_sensed_days_{min_valid_hours_per_day}hours_{min_valid_bins_per_hour}bins.csv" params: pid = "{pid}", diff --git 
a/src/data/restore_sql_file.py b/src/data/restore_sql_file.py new file mode 100644 index 00000000..dfa14011 --- /dev/null +++ b/src/data/restore_sql_file.py @@ -0,0 +1,21 @@ +# restore the example MySQL dump into the database configured in .env +import configparser +import os + +# read database credentials +group = snakemake.params["group"] +config = configparser.ConfigParser() +config.read(snakemake.input["db_credentials"]) + +# bash command to create table and restore tables from sql file +checkdb_cmd = "mysql -h " + config[group]["host"] + " -u " + config[group]["user"] + " -p" + config[group]["password"] + " -e \"use " + config[group]["database"] + "\"" +create_cmd = "mysql -h " + config[group]["host"] + " -u " + config[group]["user"] + " -p" + config[group]["password"] + " -e \"CREATE DATABASE IF NOT EXISTS " + config[group]["database"] + ";\"" +restore_cmd = "mysql -h " + config[group]["host"] + " -u " + config[group]["user"] + " -p" + config[group]["password"] + " " + config[group]["database"] + " < data/external/" + config[group]["database"] + ".sql" + +# NOTE: os.system does not raise on a failed command; it returns the exit status +retcode = os.system(checkdb_cmd) +if retcode != 0: + os.system(create_cmd) + os.system(restore_cmd) +else: + print(config[group]["database"] + " DB already exists.") diff --git a/src/models/merge_features_and_targets.py b/src/models/merge_features_and_targets.py index 92860679..6a8cac97 100644 --- a/src/models/merge_features_and_targets.py +++ b/src/models/merge_features_and_targets.py @@ -32,19 +32,35 @@ summarised = snakemake.params["summarised"] cols_var_threshold = snakemake.params["cols_var_threshold"] numerical_operators = snakemake.params["numerical_operators"] categorical_operators = snakemake.params["categorical_operators"] - - -features = pd.read_csv(snakemake.input["cleaned_features"], parse_dates=["local_date"]) -demographic_features = pd.read_csv(snakemake.input["demographic_features"], index_col=["pid"]) -targets = pd.read_csv(snakemake.input["targets"], index_col=["pid"]) +features_exclude_day_idx = snakemake.params["features_exclude_day_idx"] # Extract 
summarised features based on daily features: # for categorical features: calculate variance across all days # for numerical features: calculate mode across all days if summarised == "summarised": + + features = pd.read_csv(snakemake.input["cleaned_features"], parse_dates=["local_date"]) + demographic_features = pd.read_csv(snakemake.input["demographic_features"], index_col=["pid"]) + targets = pd.read_csv(snakemake.input["targets"], index_col=["pid"]) + features = summariseFeatures(features, numerical_operators, categorical_operators, cols_var_threshold) -data = pd.concat([features, demographic_features, targets], axis=1, join="inner") + data = pd.concat([features, demographic_features, targets], axis=1, join="inner") + +elif summarised == "notsummarised": + + features = pd.read_csv(snakemake.input["cleaned_features"]) + demographic_features = pd.read_csv(snakemake.input["demographic_features"]) + + features = features.merge(demographic_features, on="pid", how="left").set_index(["pid", "local_date"]) + targets = pd.read_csv(snakemake.input["targets"], index_col=["pid", "local_date"]) + data = pd.concat([features, targets], axis=1, join="inner") + +else: + raise ValueError("SUMMARISED parameter in config.yaml can only be 'summarised' or 'notsummarised'") + +if features_exclude_day_idx and ("day_idx" in data.columns): + del data["day_idx"] data.to_csv(snakemake.output[0], index=True) diff --git a/src/models/modeling.py b/src/models/modeling.py index bea4275b..6271dee4 100644 --- a/src/models/modeling.py +++ b/src/models/modeling.py @@ -65,7 +65,13 @@ rowsnan_colsnan_days_colsvar_threshold = snakemake.params["rowsnan_colsnan_days_ # Read data and split -data = pd.read_csv(snakemake.input["data"], index_col=["pid"]) +if summarised == "summarised": + data = pd.read_csv(snakemake.input["data"], index_col=["pid"]) +elif summarised == "notsummarised": + data = pd.read_csv(snakemake.input["data"], index_col=["pid", "local_date"]) +else: + raise ValueError("SUMMARISED 
parameter in config.yaml can only be 'summarised' or 'notsummarised'") + data_x, data_y = data.drop("target", axis=1), data[["target"]] categorical_feature_colnames = categorical_colnames_demographic_features + getMatchingColNames(categorical_operators, data_x) diff --git a/src/models/targets.py b/src/models/targets.py index b7c4f771..e3ebcdd8 100644 --- a/src/models/targets.py +++ b/src/models/targets.py @@ -5,10 +5,10 @@ pid = snakemake.params["pid"] summarised = snakemake.params["summarised"] targets_ratio_threshold = snakemake.params["targets_ratio_threshold"] targets_value_threshold = snakemake.params["targets_value_threshold"] +participant_info = pd.read_csv(snakemake.input["participant_info"]) if summarised == "summarised": targets = pd.DataFrame(columns=["pid", "target"]) - participant_info = pd.read_csv(snakemake.input["participant_info"]) if not participant_info.empty: cesds = participant_info.loc[0, ["preop_cesd_total", "inpatient_cesd_total", "postop_cesd_total", "3month_cesd_total"]] @@ -17,4 +17,11 @@ if summarised == "summarised": target = 1 if cesds.apply(lambda x : 1 if x >= targets_value_threshold else 0).sum() >= num_threshold else 0 targets.loc[0, :] = [pid, target] +elif summarised == "notsummarised": + targets = participant_info[["local_date", "target"]] + targets.insert(0, "pid", pid) + +else: + raise ValueError("SUMMARISED parameter in config.yaml can only be 'summarised' or 'notsummarised'") + targets.to_csv(snakemake.output[0], index=False) diff --git a/src/visualization/heatmap_days_by_sensors.py b/src/visualization/heatmap_days_by_sensors.py index a6918766..90793e38 100644 --- a/src/visualization/heatmap_days_by_sensors.py +++ b/src/visualization/heatmap_days_by_sensors.py @@ -1,3 +1,4 @@ +import numpy as np import pandas as pd import plotly.io as pio import plotly.graph_objects as go @@ -20,8 +21,12 @@ phone_valid_sensed_days = phone_valid_sensed_days[phone_valid_sensed_days["is_va row_count_sensors = pd.DataFrame() for sensor_path in 
snakemake.input["sensors"]: - # plugin_studentlife_audio_android => conversion; plugin_google_activity_recognition => AR; applications_foreground => apps - sensor_name = sensor_path.split("/")[-1].replace("_with_datetime.csv", "").replace("plugin_studentlife_audio_android", "conversion").replace("plugin_google_activity_recognition", "AR").replace("applications_foreground", "apps") + sensor_name = sensor_path.split("/")[-1].replace("_with_datetime.csv", "") + # plugin_studentlife_audio_android or plugin_studentlife_audio => conversation; plugin_google_activity_recognition or plugin_ios_activity_recognition => AR; applications_foreground => apps + sensor_name = sensor_name.replace("plugin_studentlife_audio_android", "conversation").replace("plugin_studentlife_audio", "conversation") \ + .replace("plugin_google_activity_recognition", "AR").replace("plugin_ios_activity_recognition", "AR") \ + .replace("applications_foreground", "apps") + sensor_data = pd.read_csv(sensor_path, encoding="ISO-8859-1", parse_dates=["local_date"], dtype={"label": str}) if sensor_data.empty: row_count_sensor = pd.DataFrame(columns=[sensor_name]) @@ -56,7 +61,7 @@ row_count_sensors = row_count_sensors.merge(all_dates, left_index=True, right_in # normalize each sensor (column) if row_count_sensors.count().max() > 1: - row_count_sensors_normalized = (row_count_sensors-row_count_sensors.min())/(row_count_sensors.max()-row_count_sensors.min()) + row_count_sensors_normalized = row_count_sensors.fillna(np.nan).apply(lambda x: (x - np.nanmin(x)) / (np.nanmax(x) - np.nanmin(x)) if np.nanmax(x) != np.nanmin(x) else (x / np.nanmin(x)), axis=0) else: row_count_sensors_normalized = row_count_sensors