From 3a80f9377123072657dd1089afc91ff653908f33 Mon Sep 17 00:00:00 2001 From: JulioV Date: Wed, 6 Jan 2021 11:12:10 -0500 Subject: [PATCH 1/4] Fix segment error when device ids is empty --- src/data/compute_time_segments.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/data/compute_time_segments.py b/src/data/compute_time_segments.py index adf50958..6f48a5fc 100644 --- a/src/data/compute_time_segments.py +++ b/src/data/compute_time_segments.py @@ -204,7 +204,7 @@ def parse_time_segments(time_segments_file, segments_type, device_ids): participant_file = yaml.load(open(snakemake.input[1], 'r'), Loader=yaml.FullLoader) device_ids = [] for key in participant_file.keys(): - if "DEVICE_IDS" in participant_file[key]: + if "DEVICE_IDS" in participant_file[key] and isinstance(participant_file[key]["DEVICE_IDS"], list): device_ids = device_ids + participant_file[key]["DEVICE_IDS"] final_time_segments = parse_time_segments(snakemake.input[0], snakemake.params["time_segments_type"], device_ids) From 5203aa60d148b8eafb26580d84bbeffca3ea6cac Mon Sep 17 00:00:00 2001 From: JulioV Date: Wed, 6 Jan 2021 11:14:15 -0500 Subject: [PATCH 2/4] Fix bugs in create participants files script - The PHONE and FITBIT flags were mixed up - The start/end dates from the CSV file weren't being parsed correctly --- src/data/create_participants_files.R | 13 ++++++++----- 1 file changed, 8 insertions(+), 5 deletions(-) diff --git a/src/data/create_participants_files.R b/src/data/create_participants_files.R index 0aba554f..7042c971 100644 --- a/src/data/create_participants_files.R +++ b/src/data/create_participants_files.R @@ -11,8 +11,8 @@ group <- config$SOURCE$DATABASE_GROUP timezone <- config$SOURCE$TIMEZONE phone_device_id_column = config$PHONE_SECTION$DEVICE_ID_COLUMN fitbit_device_id_column = config$FITBIT_SECTION$DEVICE_ID_COLUMN -add_fitbit_section = config$PHONE_SECTION$ADD -add_phone_section = config$FITBIT_SECTION$ADD +add_phone_section = config$PHONE_SECTION$ADD 
+add_fitbit_section = config$FITBIT_SECTION$ADD phone_ignored = config$PHONE_SECTION$IGNORED_DEVICE_IDS fitbit_ignored = config$FITBIT_SECTION$IGNORED_DEVICE_IDS @@ -39,7 +39,8 @@ if(config$SOURCE$TYPE == "AWARE_DEVICE_TABLE"){ } else if(config$SOURCE$TYPE == "CSV_FILE"){ participants <- read_csv(config$SOURCE$CSV_FILE_PATH, col_types=cols_only(device_id="c",pid="c",label="c",platform="c", - start_date=col_date(format = "%Y-%m-%d"),end_date=col_date(format = "%Y-%m-%d"),fitbit_id="c")) + start_date=col_date(format = "%Y-%m-%d"),end_date=col_date(format = "%Y-%m-%d"),fitbit_id="c")) %>% + mutate(start_date = as.character(start_date), end_date = as.character(end_date)) # we read as date to validate format participants <- participants %>% mutate(!!phone_device_id_column := str_replace(!!rlang::sym(phone_device_id_column), ";",","), platform = str_replace(platform, ";",","), @@ -55,16 +56,18 @@ participants %>% empty_fitbit <- c("FITBIT:", " DEVICE_IDS:", " LABEL:", " START_DATE:", " END_DATE:") row <- tibble(...) 
lines <- c() + start_date = if_else(is.na(row$start_date), "", row$start_date) + end_date = if_else(is.na(row$end_date), "", row$end_date) if(add_phone_section == TRUE && !is.na(row[phone_device_id_column])){ lines <- append(lines, c("PHONE:", paste0(" DEVICE_IDS: [",row[phone_device_id_column],"]"), paste0(" PLATFORMS: [",row$platform,"]"), - paste(" LABEL:",row$label), paste(" START_DATE:", row$start_date), paste(" END_DATE:", row$end_date))) + paste(" LABEL:",row$label), paste(" START_DATE:", start_date), paste(" END_DATE:", end_date))) }else lines <- append(lines, empty_phone) if(add_fitbit_section == TRUE && !is.na(row[fitbit_device_id_column])){ lines <- append(lines, c("FITBIT:", paste0(" DEVICE_IDS: [",row[fitbit_device_id_column],"]"), - paste(" LABEL:",row$label), paste(" START_DATE:", row$start_date), paste(" END_DATE:", row$end_date))) + paste(" LABEL:",row$label), paste(" START_DATE:", start_date), paste(" END_DATE:", end_date))) } else lines <- append(lines, empty_fitbit) From 4926497ae2b6a88a390d818f2ede79eda020cd29 Mon Sep 17 00:00:00 2001 From: JulioV Date: Wed, 6 Jan 2021 11:43:01 -0500 Subject: [PATCH 3/4] Fix bugs in Fitbit data parsing - Fix the script that was breaking with an empty file - Fix the script that was breaking when start/end dates were empty - Ambiguous and nonexistent DST times are handled now - Remove unnecessary else clause --- docs/change-log.md | 2 ++ src/data/fitbit_parse_calories.py | 8 ++++++-- src/data/fitbit_parse_heartrate.py | 14 ++++++++----- src/data/fitbit_parse_sleep.py | 32 ++++++++++++------------------ src/data/fitbit_parse_steps.py | 22 ++++++++++---------- 5 files changed, 40 insertions(+), 38 deletions(-) diff --git a/docs/change-log.md b/docs/change-log.md index 54619c71..c759b95f 100644 --- a/docs/change-log.md +++ b/docs/change-log.md @@ -5,6 +5,8 @@ - Update CI to create a release on a tagged push that passes the tests - Clarify in DB credential configuration that we only support MySQL - Add Windows 
installation instructions +- Fix bugs in the create_participants_file script +- Fix bugs in Fitbit data parsing. ## v0.3.1 - Update installation docs for RAPIDS' docker container - Fix example analysis use of accelerometer data in a plot diff --git a/src/data/fitbit_parse_calories.py b/src/data/fitbit_parse_calories.py index 059006bd..b0927a4d 100644 --- a/src/data/fitbit_parse_calories.py +++ b/src/data/fitbit_parse_calories.py @@ -41,10 +41,14 @@ elif table_format == "CSV": summary = pd.read_csv(snakemake.input[0], parse_dates=["local_date_time"], date_parser=lambda col: pd.to_datetime(col).tz_localize(None)) intraday = pd.read_csv(snakemake.input[1], parse_dates=["local_date_time"], date_parser=lambda col: pd.to_datetime(col).tz_localize(None)) +# if not pd.isnull(local_start_date) and not pd.isnull(local_end_date): + if summary.shape[0] > 0: - summary["timestamp"] = summary["local_date_time"].dt.tz_localize(timezone).astype(np.int64) // 10**6 + summary["timestamp"] = summary["local_date_time"].dt.tz_localize(timezone, ambiguous=False, nonexistent="NaT").dropna().astype(np.int64) // 10**6 + summary.dropna(subset=['timestamp'], inplace=True) if intraday.shape[0] > 0: - intraday["timestamp"] = intraday["local_date_time"].dt.tz_localize(timezone).astype(np.int64) // 10**6 + intraday["timestamp"] = intraday["local_date_time"].dt.tz_localize(timezone, ambiguous=False, nonexistent="NaT").dropna().astype(np.int64) // 10**6 + intraday.dropna(subset=['timestamp'], inplace=True) summary.to_csv(snakemake.output["summary_data"], index=False) intraday.to_csv(snakemake.output["intraday_data"], index=False) \ No newline at end of file diff --git a/src/data/fitbit_parse_heartrate.py b/src/data/fitbit_parse_heartrate.py index 33e9c484..4e6c0afd 100644 --- a/src/data/fitbit_parse_heartrate.py +++ b/src/data/fitbit_parse_heartrate.py @@ -97,7 +97,11 @@ def parseHeartrateIntradayData(records_intraday, dataset, device_id, curr_date, def parseHeartrateData(heartrate_data, 
fitbit_data_type): if heartrate_data.empty: - return pd.DataFrame(columns=HR_SUMMARY_COLUMNS), pd.DataFrame(columns=HR_INTRADAY_COLUMNS) + if fitbit_data_type == "summary": + return pd.DataFrame(columns=HR_SUMMARY_COLUMNS) + elif fitbit_data_type == "intraday": + return pd.DataFrame(columns=HR_INTRADAY_COLUMNS) + device_id = heartrate_data["device_id"].iloc[0] records_summary, records_intraday = [], [] @@ -121,8 +125,6 @@ def parseHeartrateData(heartrate_data, fitbit_data_type): parsed_data = pd.DataFrame(data=records_summary, columns=HR_SUMMARY_COLUMNS) elif fitbit_data_type == "intraday": parsed_data = pd.DataFrame(data=records_intraday, columns=HR_INTRADAY_COLUMNS) - else: - raise ValueError("fitbit_data_type can only be one of ['summary', 'intraday'].") return parsed_data @@ -145,9 +147,11 @@ else: raise ValueError("column_format can only be one of ['JSON', 'PLAIN_TEXT'].") # Only keep dates in the range of [local_start_date, local_end_date) -parsed_data = parsed_data.loc[(parsed_data["local_date_time"] >= local_start_date) & (parsed_data["local_date_time"] < local_end_date)] +if not pd.isnull(local_start_date) and not pd.isnull(local_end_date): + parsed_data = parsed_data.loc[(parsed_data["local_date_time"] >= local_start_date) & (parsed_data["local_date_time"] < local_end_date)] if parsed_data.shape[0] > 0: - parsed_data["timestamp"] = parsed_data["local_date_time"].dt.tz_localize(timezone).astype(np.int64) // 10**6 + parsed_data["timestamp"] = parsed_data["local_date_time"].dt.tz_localize(timezone, ambiguous=False, nonexistent="NaT").dropna().astype(np.int64) // 10**6 + parsed_data.dropna(subset=['timestamp'], inplace=True) parsed_data.to_csv(snakemake.output[0], index=False) diff --git a/src/data/fitbit_parse_sleep.py b/src/data/fitbit_parse_sleep.py index a5f49d81..1995fb01 100644 --- a/src/data/fitbit_parse_sleep.py +++ b/src/data/fitbit_parse_sleep.py @@ -188,7 +188,10 @@ def parseOneRecordForV12(record, device_id, d_is_main_sleep, records_summary, re 
def parseSleepData(sleep_data, fitbit_data_type): SLEEP_SUMMARY_COLUMNS = SLEEP_SUMMARY_COLUMNS_V1_2 if sleep_data.empty: - return pd.DataFrame(columns=SLEEP_SUMMARY_COLUMNS), pd.DataFrame(columns=SLEEP_INTRADAY_COLUMNS) + if fitbit_data_type == "summary": + return pd.DataFrame(columns=SLEEP_SUMMARY_COLUMNS) + elif fitbit_data_type == "intraday": + return pd.DataFrame(columns=SLEEP_INTRADAY_COLUMNS) device_id = sleep_data["device_id"].iloc[0] records_summary, records_intraday = [], [] # Parse JSON into individual records @@ -210,13 +213,9 @@ def parseSleepData(sleep_data, fitbit_data_type): parsed_data = pd.DataFrame(data=records_summary, columns=SLEEP_SUMMARY_COLUMNS) elif fitbit_data_type == "intraday": parsed_data = pd.DataFrame(data=records_intraday, columns=SLEEP_INTRADAY_COLUMNS) - else: - raise ValueError("fitbit_data_type can only be one of ['summary', 'intraday'].") return parsed_data - - timezone = snakemake.params["timezone"] column_format = snakemake.params["column_format"] fitbit_data_type = snakemake.params["fitbit_data_type"] @@ -235,31 +234,26 @@ elif column_format == "PLAIN_TEXT": parsed_data = pd.read_csv(snakemake.input["raw_data"], parse_dates=["local_start_date_time", "local_end_date_time"], date_parser=lambda col: pd.to_datetime(col).tz_localize(None)) elif fitbit_data_type == "intraday": parsed_data = pd.read_csv(snakemake.input["raw_data"], parse_dates=["local_date_time"], date_parser=lambda col: pd.to_datetime(col).tz_localize(None)) - else: - raise ValueError("fitbit_data_type can only be one of ['summary', 'intraday'].") else: raise ValueError("column_format can only be one of ['JSON', 'PLAIN_TEXT'].") if parsed_data.shape[0] > 0 and fitbit_data_type == "summary": - if sleep_episode_timestamp != "start" and sleep_episode_timestamp != "end": raise ValueError("SLEEP_EPISODE_TIMESTAMP can only be one of ['start', 'end'].") - # Column name to be considered as the event datetime datetime_column = "local_" + sleep_episode_timestamp + 
"_date_time" - # Only keep dates in the range of [local_start_date, local_end_date) - parsed_data = parsed_data.loc[(parsed_data[datetime_column] >= local_start_date) & (parsed_data[datetime_column] < local_end_date)] - # Convert datetime to timestamp - parsed_data["timestamp"] = parsed_data[datetime_column].dt.tz_localize(timezone).astype(np.int64) // 10**6 - # Drop useless columns: local_start_date_time and local_end_date_time + + if not pd.isnull(local_start_date) and not pd.isnull(local_end_date): + parsed_data = parsed_data.loc[(parsed_data[datetime_column] >= local_start_date) & (parsed_data[datetime_column] < local_end_date)] + parsed_data["timestamp"] = parsed_data[datetime_column].dt.tz_localize(timezone, ambiguous=False, nonexistent="NaT").dropna().astype(np.int64) // 10**6 + parsed_data.dropna(subset=['timestamp'], inplace=True) parsed_data.drop(["local_start_date_time", "local_end_date_time"], axis = 1, inplace=True) if parsed_data.shape[0] > 0 and fitbit_data_type == "intraday": - # Only keep dates in the range of [local_start_date, local_end_date) - parsed_data = parsed_data.loc[(parsed_data["local_date_time"] >= local_start_date) & (parsed_data["local_date_time"] < local_end_date)] - # Convert datetime to timestamp - parsed_data["timestamp"] = parsed_data["local_date_time"].dt.tz_localize(timezone).astype(np.int64) // 10**6 - # Unifying level + if not pd.isnull(local_start_date) and not pd.isnull(local_end_date): + parsed_data = parsed_data.loc[(parsed_data["local_date_time"] >= local_start_date) & (parsed_data["local_date_time"] < local_end_date)] + parsed_data["timestamp"] = parsed_data["local_date_time"].dt.tz_localize(timezone, ambiguous=False, nonexistent="NaT").dropna().astype(np.int64) // 10**6 + parsed_data.dropna(subset=['timestamp'], inplace=True) parsed_data["unified_level"] = np.where(parsed_data["level"].isin(["awake", "wake", "restless"]), 0, 1) parsed_data.to_csv(snakemake.output[0], index=False) diff --git 
a/src/data/fitbit_parse_steps.py b/src/data/fitbit_parse_steps.py index b6f32eb7..92143ec1 100644 --- a/src/data/fitbit_parse_steps.py +++ b/src/data/fitbit_parse_steps.py @@ -9,9 +9,10 @@ STEPS_COLUMNS = ("device_id", "steps", "local_date_time", "timestamp") def parseStepsData(steps_data, fitbit_data_type): if steps_data.empty: - return pd.DataFrame(), pd.DataFrame(columns=STEPS_INTRADAY_COLUMNS) + return pd.DataFrame(columns=STEPS_COLUMNS) + device_id = steps_data["device_id"].iloc[0] - records_summary, records_intraday = [], [] + records = [] # Parse JSON into individual records for record in steps_data.fitbit_data: @@ -26,7 +27,7 @@ def parseStepsData(steps_data, fitbit_data_type): curr_date, 0) - records_summary.append(row_summary) + records.append(row_summary) # Parse intraday data if fitbit_data_type == "intraday": @@ -40,14 +41,9 @@ def parseStepsData(steps_data, fitbit_data_type): d_datetime, 0) - records_intraday.append(row_intraday) + records.append(row_intraday) - if fitbit_data_type == "summary": - parsed_data = pd.DataFrame(data=records_summary, columns=STEPS_COLUMNS) - elif fitbit_data_type == "intraday": - parsed_data = pd.DataFrame(data=records_intraday, columns=STEPS_COLUMNS) - else: - raise ValueError("fitbit_data_type can only be one of ['summary', 'intraday'].") + parsed_data = pd.DataFrame(data=records, columns=STEPS_COLUMNS) return parsed_data @@ -71,9 +67,11 @@ else: raise ValueError("column_format can only be one of ['JSON', 'PLAIN_TEXT'].") # Only keep dates in the range of [local_start_date, local_end_date) -parsed_data = parsed_data.loc[(parsed_data["local_date_time"] >= local_start_date) & (parsed_data["local_date_time"] < local_end_date)] +if not pd.isnull(local_start_date) and not pd.isnull(local_end_date): + parsed_data = parsed_data.loc[(parsed_data["local_date_time"] >= local_start_date) & (parsed_data["local_date_time"] < local_end_date)] if parsed_data.shape[0] > 0: - parsed_data["timestamp"] = 
parsed_data["local_date_time"].dt.tz_localize(timezone).astype(np.int64) // 10**6 + parsed_data["timestamp"] = parsed_data["local_date_time"].dt.tz_localize(timezone, ambiguous=False, nonexistent="NaT").dropna().astype(np.int64) // 10**6 + parsed_data.dropna(subset=['timestamp'], inplace=True) parsed_data.to_csv(snakemake.output[0], index=False) From 3dd0e989a711f2bcfe59b343ce6d426c54ad845a Mon Sep 17 00:00:00 2001 From: JulioV Date: Wed, 6 Jan 2021 12:09:06 -0500 Subject: [PATCH 4/4] Update Doryab location docs --- docs/features/phone-locations.md | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/docs/features/phone-locations.md b/docs/features/phone-locations.md index 646dc9b0..33cd3555 100644 --- a/docs/features/phone-locations.md +++ b/docs/features/phone-locations.md @@ -124,8 +124,8 @@ Features description for `[PHONE_LOCATIONS][PROVIDERS][BARNETT]`: |timeattop1location |minutes |Time spent at the most significant location. |timeattop2location |minutes |Time spent at the 2nd most significant location. |timeattop3location |minutes |Time spent at the 3rd most significant location. -|movingtostaticratio | - | Ratio between the number of rows labeled Moving versus Static -|outlierstimepercent | - | Ratio between the number of rows that belong to non-significant clusters divided by the total number of rows in a time segment. +|movingtostaticratio | - | Ratio between stationary time and total location sensed time. A lat/long coordinate pair is labelled as stationary if its speed (distance/time) to the next coordinate pair is less than 1km/hr. A higher value represents a more stationary routine. These times are computed by multiplying the number of rows by `[SAMPLING_FREQUENCY]` +|outlierstimepercent | - | Ratio between the time spent in non-significant clusters divided by the time spent in all clusters (total location sensed time). A higher value represents more time spent in non-significant clusters. 
These times are computed by multiplying the number of rows by `[SAMPLING_FREQUENCY]` |maxlengthstayatclusters |minutes |Maximum time spent in a cluster (significant location). |minlengthstayatclusters |minutes |Minimum time spent in a cluster (significant location). |meanlengthstayatclusters |minutes |Average time spent in a cluster (significant location).