From 3a80f9377123072657dd1089afc91ff653908f33 Mon Sep 17 00:00:00 2001 From: JulioV Date: Wed, 6 Jan 2021 11:12:10 -0500 Subject: [PATCH 1/4] Fix segment error when device ids is empty --- src/data/compute_time_segments.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/data/compute_time_segments.py b/src/data/compute_time_segments.py index adf50958..6f48a5fc 100644 --- a/src/data/compute_time_segments.py +++ b/src/data/compute_time_segments.py @@ -204,7 +204,7 @@ def parse_time_segments(time_segments_file, segments_type, device_ids): participant_file = yaml.load(open(snakemake.input[1], 'r'), Loader=yaml.FullLoader) device_ids = [] for key in participant_file.keys(): - if "DEVICE_IDS" in participant_file[key]: + if "DEVICE_IDS" in participant_file[key] and isinstance(participant_file[key]["DEVICE_IDS"], list): device_ids = device_ids + participant_file[key]["DEVICE_IDS"] final_time_segments = parse_time_segments(snakemake.input[0], snakemake.params["time_segments_type"], device_ids) From 5203aa60d148b8eafb26580d84bbeffca3ea6cac Mon Sep 17 00:00:00 2001 From: JulioV Date: Wed, 6 Jan 2021 11:14:15 -0500 Subject: [PATCH 2/4] Fix bugs in create participants files script - The PHONE and FITBIT flags were mixed up - The start/end dates from the CSV file weren't being parsed correctly --- src/data/create_participants_files.R | 13 ++++++++----- 1 file changed, 8 insertions(+), 5 deletions(-) diff --git a/src/data/create_participants_files.R b/src/data/create_participants_files.R index 0aba554f..7042c971 100644 --- a/src/data/create_participants_files.R +++ b/src/data/create_participants_files.R @@ -11,8 +11,8 @@ group <- config$SOURCE$DATABASE_GROUP timezone <- config$SOURCE$TIMEZONE phone_device_id_column = config$PHONE_SECTION$DEVICE_ID_COLUMN fitbit_device_id_column = config$FITBIT_SECTION$DEVICE_ID_COLUMN -add_fitbit_section = config$PHONE_SECTION$ADD -add_phone_section = config$FITBIT_SECTION$ADD +add_phone_section = config$PHONE_SECTION$ADD 
+add_fitbit_section = config$FITBIT_SECTION$ADD phone_ignored = config$PHONE_SECTION$IGNORED_DEVICE_IDS fitbit_ignored = config$FITBIT_SECTION$IGNORED_DEVICE_IDS @@ -39,7 +39,8 @@ if(config$SOURCE$TYPE == "AWARE_DEVICE_TABLE"){ } else if(config$SOURCE$TYPE == "CSV_FILE"){ participants <- read_csv(config$SOURCE$CSV_FILE_PATH, col_types=cols_only(device_id="c",pid="c",label="c",platform="c", - start_date=col_date(format = "%Y-%m-%d"),end_date=col_date(format = "%Y-%m-%d"),fitbit_id="c")) + start_date=col_date(format = "%Y-%m-%d"),end_date=col_date(format = "%Y-%m-%d"),fitbit_id="c")) %>% + mutate(start_date = as.character(start_date), end_date = as.character(end_date)) # we read as date to validate format participants <- participants %>% mutate(!!phone_device_id_column := str_replace(!!rlang::sym(phone_device_id_column), ";",","), platform = str_replace(platform, ";",","), @@ -55,16 +56,18 @@ participants %>% empty_fitbit <- c("FITBIT:", " DEVICE_IDS:", " LABEL:", " START_DATE:", " END_DATE:") row <- tibble(...) 
lines <- c() + start_date = if_else(is.na(row$start_date), "", row$start_date) + end_date = if_else(is.na(row$end_date), "", row$end_date) if(add_phone_section == TRUE && !is.na(row[phone_device_id_column])){ lines <- append(lines, c("PHONE:", paste0(" DEVICE_IDS: [",row[phone_device_id_column],"]"), paste0(" PLATFORMS: [",row$platform,"]"), - paste(" LABEL:",row$label), paste(" START_DATE:", row$start_date), paste(" END_DATE:", row$end_date))) + paste(" LABEL:",row$label), paste(" START_DATE:", start_date), paste(" END_DATE:", end_date))) }else lines <- append(lines, empty_phone) if(add_fitbit_section == TRUE && !is.na(row[fitbit_device_id_column])){ lines <- append(lines, c("FITBIT:", paste0(" DEVICE_IDS: [",row[fitbit_device_id_column],"]"), - paste(" LABEL:",row$label), paste(" START_DATE:", row$start_date), paste(" END_DATE:", row$end_date))) + paste(" LABEL:",row$label), paste(" START_DATE:", start_date), paste(" END_DATE:", end_date))) } else lines <- append(lines, empty_fitbit) From 4926497ae2b6a88a390d818f2ede79eda020cd29 Mon Sep 17 00:00:00 2001 From: JulioV Date: Wed, 6 Jan 2021 11:43:01 -0500 Subject: [PATCH 3/4] Fix bugs in Fitbit data parsing - Fix the script that was breaking with an empty file - Fix the script that was breaking when start/end dates were empty - Ambiguous and nonexistent DST times are handled now - Remove unnecessary else clause --- docs/change-log.md | 2 ++ src/data/fitbit_parse_calories.py | 8 ++++++-- src/data/fitbit_parse_heartrate.py | 14 ++++++++----- src/data/fitbit_parse_sleep.py | 32 ++++++++++++------------------ src/data/fitbit_parse_steps.py | 22 ++++++++++---------- 5 files changed, 40 insertions(+), 38 deletions(-) diff --git a/docs/change-log.md b/docs/change-log.md index 54619c71..c759b95f 100644 --- a/docs/change-log.md +++ b/docs/change-log.md @@ -5,6 +5,8 @@ - Update CI to create a release on a tagged push that passes the tests - Clarify in DB credential configuration that we only support MySQL - Add Windows 
installation instructions +- Fix bugs in the create_participants_file script +- Fix bugs in Fitbit data parsing. ## v0.3.1 - Update installation docs for RAPIDS' docker container - Fix example analysis use of accelerometer data in a plot diff --git a/src/data/fitbit_parse_calories.py b/src/data/fitbit_parse_calories.py index 059006bd..b0927a4d 100644 --- a/src/data/fitbit_parse_calories.py +++ b/src/data/fitbit_parse_calories.py @@ -41,10 +41,14 @@ elif table_format == "CSV": summary = pd.read_csv(snakemake.input[0], parse_dates=["local_date_time"], date_parser=lambda col: pd.to_datetime(col).tz_localize(None)) intraday = pd.read_csv(snakemake.input[1], parse_dates=["local_date_time"], date_parser=lambda col: pd.to_datetime(col).tz_localize(None)) +# if not pd.isnull(local_start_date) and not pd.isnull(local_end_date): + if summary.shape[0] > 0: - summary["timestamp"] = summary["local_date_time"].dt.tz_localize(timezone).astype(np.int64) // 10**6 + summary["timestamp"] = summary["local_date_time"].dt.tz_localize(timezone, ambiguous=False, nonexistent="NaT").dropna().astype(np.int64) // 10**6 + summary.dropna(subset=['timestamp'], inplace=True) if intraday.shape[0] > 0: - intraday["timestamp"] = intraday["local_date_time"].dt.tz_localize(timezone).astype(np.int64) // 10**6 + intraday["timestamp"] = intraday["local_date_time"].dt.tz_localize(timezone, ambiguous=False, nonexistent="NaT").dropna().astype(np.int64) // 10**6 + intraday.dropna(subset=['timestamp'], inplace=True) summary.to_csv(snakemake.output["summary_data"], index=False) intraday.to_csv(snakemake.output["intraday_data"], index=False) \ No newline at end of file diff --git a/src/data/fitbit_parse_heartrate.py b/src/data/fitbit_parse_heartrate.py index 33e9c484..4e6c0afd 100644 --- a/src/data/fitbit_parse_heartrate.py +++ b/src/data/fitbit_parse_heartrate.py @@ -97,7 +97,11 @@ def parseHeartrateIntradayData(records_intraday, dataset, device_id, curr_date, def parseHeartrateData(heartrate_data, 
fitbit_data_type): if heartrate_data.empty: - return pd.DataFrame(columns=HR_SUMMARY_COLUMNS), pd.DataFrame(columns=HR_INTRADAY_COLUMNS) + if fitbit_data_type == "summary": + return pd.DataFrame(columns=HR_SUMMARY_COLUMNS) + elif fitbit_data_type == "intraday": + return pd.DataFrame(columns=HR_INTRADAY_COLUMNS) + device_id = heartrate_data["device_id"].iloc[0] records_summary, records_intraday = [], [] @@ -121,8 +125,6 @@ def parseHeartrateData(heartrate_data, fitbit_data_type): parsed_data = pd.DataFrame(data=records_summary, columns=HR_SUMMARY_COLUMNS) elif fitbit_data_type == "intraday": parsed_data = pd.DataFrame(data=records_intraday, columns=HR_INTRADAY_COLUMNS) - else: - raise ValueError("fitbit_data_type can only be one of ['summary', 'intraday'].") return parsed_data @@ -145,9 +147,11 @@ else: raise ValueError("column_format can only be one of ['JSON', 'PLAIN_TEXT'].") # Only keep dates in the range of [local_start_date, local_end_date) -parsed_data = parsed_data.loc[(parsed_data["local_date_time"] >= local_start_date) & (parsed_data["local_date_time"] < local_end_date)] +if not pd.isnull(local_start_date) and not pd.isnull(local_end_date): + parsed_data = parsed_data.loc[(parsed_data["local_date_time"] >= local_start_date) & (parsed_data["local_date_time"] < local_end_date)] if parsed_data.shape[0] > 0: - parsed_data["timestamp"] = parsed_data["local_date_time"].dt.tz_localize(timezone).astype(np.int64) // 10**6 + parsed_data["timestamp"] = parsed_data["local_date_time"].dt.tz_localize(timezone, ambiguous=False, nonexistent="NaT").dropna().astype(np.int64) // 10**6 + parsed_data.dropna(subset=['timestamp'], inplace=True) parsed_data.to_csv(snakemake.output[0], index=False) diff --git a/src/data/fitbit_parse_sleep.py b/src/data/fitbit_parse_sleep.py index a5f49d81..1995fb01 100644 --- a/src/data/fitbit_parse_sleep.py +++ b/src/data/fitbit_parse_sleep.py @@ -188,7 +188,10 @@ def parseOneRecordForV12(record, device_id, d_is_main_sleep, records_summary, re 
def parseSleepData(sleep_data, fitbit_data_type): SLEEP_SUMMARY_COLUMNS = SLEEP_SUMMARY_COLUMNS_V1_2 if sleep_data.empty: - return pd.DataFrame(columns=SLEEP_SUMMARY_COLUMNS), pd.DataFrame(columns=SLEEP_INTRADAY_COLUMNS) + if fitbit_data_type == "summary": + return pd.DataFrame(columns=SLEEP_SUMMARY_COLUMNS) + elif fitbit_data_type == "intraday": + return pd.DataFrame(columns=SLEEP_INTRADAY_COLUMNS) device_id = sleep_data["device_id"].iloc[0] records_summary, records_intraday = [], [] # Parse JSON into individual records @@ -210,13 +213,9 @@ def parseSleepData(sleep_data, fitbit_data_type): parsed_data = pd.DataFrame(data=records_summary, columns=SLEEP_SUMMARY_COLUMNS) elif fitbit_data_type == "intraday": parsed_data = pd.DataFrame(data=records_intraday, columns=SLEEP_INTRADAY_COLUMNS) - else: - raise ValueError("fitbit_data_type can only be one of ['summary', 'intraday'].") return parsed_data - - timezone = snakemake.params["timezone"] column_format = snakemake.params["column_format"] fitbit_data_type = snakemake.params["fitbit_data_type"] @@ -235,31 +234,26 @@ elif column_format == "PLAIN_TEXT": parsed_data = pd.read_csv(snakemake.input["raw_data"], parse_dates=["local_start_date_time", "local_end_date_time"], date_parser=lambda col: pd.to_datetime(col).tz_localize(None)) elif fitbit_data_type == "intraday": parsed_data = pd.read_csv(snakemake.input["raw_data"], parse_dates=["local_date_time"], date_parser=lambda col: pd.to_datetime(col).tz_localize(None)) - else: - raise ValueError("fitbit_data_type can only be one of ['summary', 'intraday'].") else: raise ValueError("column_format can only be one of ['JSON', 'PLAIN_TEXT'].") if parsed_data.shape[0] > 0 and fitbit_data_type == "summary": - if sleep_episode_timestamp != "start" and sleep_episode_timestamp != "end": raise ValueError("SLEEP_EPISODE_TIMESTAMP can only be one of ['start', 'end'].") - # Column name to be considered as the event datetime datetime_column = "local_" + sleep_episode_timestamp + 
"_date_time" - # Only keep dates in the range of [local_start_date, local_end_date) - parsed_data = parsed_data.loc[(parsed_data[datetime_column] >= local_start_date) & (parsed_data[datetime_column] < local_end_date)] - # Convert datetime to timestamp - parsed_data["timestamp"] = parsed_data[datetime_column].dt.tz_localize(timezone).astype(np.int64) // 10**6 - # Drop useless columns: local_start_date_time and local_end_date_time + + if not pd.isnull(local_start_date) and not pd.isnull(local_end_date): + parsed_data = parsed_data.loc[(parsed_data[datetime_column] >= local_start_date) & (parsed_data[datetime_column] < local_end_date)] + parsed_data["timestamp"] = parsed_data[datetime_column].dt.tz_localize(timezone, ambiguous=False, nonexistent="NaT").dropna().astype(np.int64) // 10**6 + parsed_data.dropna(subset=['timestamp'], inplace=True) parsed_data.drop(["local_start_date_time", "local_end_date_time"], axis = 1, inplace=True) if parsed_data.shape[0] > 0 and fitbit_data_type == "intraday": - # Only keep dates in the range of [local_start_date, local_end_date) - parsed_data = parsed_data.loc[(parsed_data["local_date_time"] >= local_start_date) & (parsed_data["local_date_time"] < local_end_date)] - # Convert datetime to timestamp - parsed_data["timestamp"] = parsed_data["local_date_time"].dt.tz_localize(timezone).astype(np.int64) // 10**6 - # Unifying level + if not pd.isnull(local_start_date) and not pd.isnull(local_end_date): + parsed_data = parsed_data.loc[(parsed_data["local_date_time"] >= local_start_date) & (parsed_data["local_date_time"] < local_end_date)] + parsed_data["timestamp"] = parsed_data["local_date_time"].dt.tz_localize(timezone, ambiguous=False, nonexistent="NaT").dropna().astype(np.int64) // 10**6 + parsed_data.dropna(subset=['timestamp'], inplace=True) parsed_data["unified_level"] = np.where(parsed_data["level"].isin(["awake", "wake", "restless"]), 0, 1) parsed_data.to_csv(snakemake.output[0], index=False) diff --git 
a/src/data/fitbit_parse_steps.py b/src/data/fitbit_parse_steps.py index b6f32eb7..92143ec1 100644 --- a/src/data/fitbit_parse_steps.py +++ b/src/data/fitbit_parse_steps.py @@ -9,9 +9,10 @@ STEPS_COLUMNS = ("device_id", "steps", "local_date_time", "timestamp") def parseStepsData(steps_data, fitbit_data_type): if steps_data.empty: - return pd.DataFrame(), pd.DataFrame(columns=STEPS_INTRADAY_COLUMNS) + return pd.DataFrame(columns=STEPS_COLUMNS) + device_id = steps_data["device_id"].iloc[0] - records_summary, records_intraday = [], [] + records = [] # Parse JSON into individual records for record in steps_data.fitbit_data: @@ -26,7 +27,7 @@ def parseStepsData(steps_data, fitbit_data_type): curr_date, 0) - records_summary.append(row_summary) + records.append(row_summary) # Parse intraday data if fitbit_data_type == "intraday": @@ -40,14 +41,9 @@ def parseStepsData(steps_data, fitbit_data_type): d_datetime, 0) - records_intraday.append(row_intraday) + records.append(row_intraday) - if fitbit_data_type == "summary": - parsed_data = pd.DataFrame(data=records_summary, columns=STEPS_COLUMNS) - elif fitbit_data_type == "intraday": - parsed_data = pd.DataFrame(data=records_intraday, columns=STEPS_COLUMNS) - else: - raise ValueError("fitbit_data_type can only be one of ['summary', 'intraday'].") + parsed_data = pd.DataFrame(data=records, columns=STEPS_COLUMNS) return parsed_data @@ -71,9 +67,11 @@ else: raise ValueError("column_format can only be one of ['JSON', 'PLAIN_TEXT'].") # Only keep dates in the range of [local_start_date, local_end_date) -parsed_data = parsed_data.loc[(parsed_data["local_date_time"] >= local_start_date) & (parsed_data["local_date_time"] < local_end_date)] +if not pd.isnull(local_start_date) and not pd.isnull(local_end_date): + parsed_data = parsed_data.loc[(parsed_data["local_date_time"] >= local_start_date) & (parsed_data["local_date_time"] < local_end_date)] if parsed_data.shape[0] > 0: - parsed_data["timestamp"] = 
parsed_data["local_date_time"].dt.tz_localize(timezone).astype(np.int64) // 10**6 + parsed_data["timestamp"] = parsed_data["local_date_time"].dt.tz_localize(timezone, ambiguous=False, nonexistent="NaT").dropna().astype(np.int64) // 10**6 + parsed_data.dropna(subset=['timestamp'], inplace=True) parsed_data.to_csv(snakemake.output[0], index=False) From 3dd0e989a711f2bcfe59b343ce6d426c54ad845a Mon Sep 17 00:00:00 2001 From: JulioV Date: Wed, 6 Jan 2021 12:09:06 -0500 Subject: [PATCH 4/4] Update Doryab location docs --- docs/features/phone-locations.md | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/docs/features/phone-locations.md b/docs/features/phone-locations.md index 646dc9b0..33cd3555 100644 --- a/docs/features/phone-locations.md +++ b/docs/features/phone-locations.md @@ -124,8 +124,8 @@ Features description for `[PHONE_LOCATIONS][PROVIDERS][BARNETT]`: |timeattop1location |minutes |Time spent at the most significant location. |timeattop2location |minutes |Time spent at the 2nd most significant location. |timeattop3location |minutes |Time spent at the 3rd most significant location. -|movingtostaticratio | - | Ratio between the number of rows labeled Moving versus Static -|outlierstimepercent | - | Ratio between the number of rows that belong to non-significant clusters divided by the total number of rows in a time segment. +|movingtostaticratio | - | Ratio between stationary time and total location sensed time. A lat/long coordinate pair is labelled as stationary if its speed (distance/time) to the next coordinate pair is less than 1km/hr. A higher value represents a more stationary routine. These times are computed by multiplying the number of rows by `[SAMPLING_FREQUENCY]` +|outlierstimepercent | - | Ratio between the time spent in non-significant clusters divided by the time spent in all clusters (total location sensed time). A higher value represents more time spent in non-significant clusters. 
These times are computed by multiplying the number of rows by `[SAMPLING_FREQUENCY]` |maxlengthstayatclusters |minutes |Maximum time spent in a cluster (significant location). |minlengthstayatclusters |minutes |Minimum time spent in a cluster (significant location). |meanlengthstayatclusters |minutes |Average time spent in a cluster (significant location).