Merge branch 'feature/fitbit-fix' into develop

pull/111/head
JulioV 2021-01-06 12:10:38 -05:00
commit 9fc48ee0dc
8 changed files with 51 additions and 46 deletions

View File

@ -5,6 +5,8 @@
- Update CI to create a release on a tagged push that passes the tests
- Clarify in DB credential configuration that we only support MySQL
- Add Windows installation instructions
- Fix bugs in the create_participants_file script
- Fix bugs in Fitbit data parsing
## v0.3.1
- Update installation docs for RAPIDS' docker container
- Fix example analysis use of accelerometer data in a plot

View File

@ -124,8 +124,8 @@ Features description for `[PHONE_LOCATIONS][PROVIDERS][BARNETT]`:
|timeattop1location |minutes |Time spent at the most significant location.
|timeattop2location |minutes |Time spent at the 2nd most significant location.
|timeattop3location |minutes |Time spent at the 3rd most significant location.
|movingtostaticratio | - | Ratio between the number of rows labeled Moving versus Static
|outlierstimepercent | - | Ratio between the number of rows that belong to non-significant clusters divided by the total number of rows in a time segment.
|movingtostaticratio | - | Ratio of stationary time to total location-sensed time. A lat/long coordinate pair is labelled as stationary if its speed (distance/time) to the next coordinate pair is less than 1 km/h. A higher value represents a more stationary routine. These times are computed by multiplying the number of rows by `[SAMPLING_FREQUENCY]`.
|outlierstimepercent | - | Ratio of the time spent in non-significant clusters to the time spent in all clusters (total location-sensed time). A higher value represents more time spent in non-significant clusters. These times are computed by multiplying the number of rows by `[SAMPLING_FREQUENCY]`.
|maxlengthstayatclusters |minutes |Maximum time spent in a cluster (significant location).
|minlengthstayatclusters |minutes |Minimum time spent in a cluster (significant location).
|meanlengthstayatclusters |minutes |Average time spent in a cluster (significant location).

View File

@ -204,7 +204,7 @@ def parse_time_segments(time_segments_file, segments_type, device_ids):
participant_file = yaml.load(open(snakemake.input[1], 'r'), Loader=yaml.FullLoader)
device_ids = []
for key in participant_file.keys():
if "DEVICE_IDS" in participant_file[key]:
if "DEVICE_IDS" in participant_file[key] and isinstance(participant_file[key]["DEVICE_IDS"], list):
device_ids = device_ids + participant_file[key]["DEVICE_IDS"]
final_time_segments = parse_time_segments(snakemake.input[0], snakemake.params["time_segments_type"], device_ids)

View File

@ -11,8 +11,8 @@ group <- config$SOURCE$DATABASE_GROUP
timezone <- config$SOURCE$TIMEZONE
phone_device_id_column = config$PHONE_SECTION$DEVICE_ID_COLUMN
fitbit_device_id_column = config$FITBIT_SECTION$DEVICE_ID_COLUMN
add_fitbit_section = config$PHONE_SECTION$ADD
add_phone_section = config$FITBIT_SECTION$ADD
add_phone_section = config$PHONE_SECTION$ADD
add_fitbit_section = config$FITBIT_SECTION$ADD
phone_ignored = config$PHONE_SECTION$IGNORED_DEVICE_IDS
fitbit_ignored = config$FITBIT_SECTION$IGNORED_DEVICE_IDS
@ -39,7 +39,8 @@ if(config$SOURCE$TYPE == "AWARE_DEVICE_TABLE"){
} else if(config$SOURCE$TYPE == "CSV_FILE"){
participants <- read_csv(config$SOURCE$CSV_FILE_PATH, col_types=cols_only(device_id="c",pid="c",label="c",platform="c",
start_date=col_date(format = "%Y-%m-%d"),end_date=col_date(format = "%Y-%m-%d"),fitbit_id="c"))
start_date=col_date(format = "%Y-%m-%d"),end_date=col_date(format = "%Y-%m-%d"),fitbit_id="c")) %>%
mutate(start_date = as.character(start_date), end_date = as.character(end_date)) # we read as date to validate format
participants <- participants %>%
mutate(!!phone_device_id_column := str_replace(!!rlang::sym(phone_device_id_column), ";",","),
platform = str_replace(platform, ";",","),
@ -55,16 +56,18 @@ participants %>%
empty_fitbit <- c("FITBIT:", " DEVICE_IDS:", " LABEL:", " START_DATE:", " END_DATE:")
row <- tibble(...)
lines <- c()
start_date = if_else(is.na(row$start_date), "", row$start_date)
end_date = if_else(is.na(row$end_date), "", row$end_date)
if(add_phone_section == TRUE && !is.na(row[phone_device_id_column])){
lines <- append(lines, c("PHONE:", paste0(" DEVICE_IDS: [",row[phone_device_id_column],"]"), paste0(" PLATFORMS: [",row$platform,"]"),
paste(" LABEL:",row$label), paste(" START_DATE:", row$start_date), paste(" END_DATE:", row$end_date)))
paste(" LABEL:",row$label), paste(" START_DATE:", start_date), paste(" END_DATE:", end_date)))
}else
lines <- append(lines, empty_phone)
if(add_fitbit_section == TRUE && !is.na(row[fitbit_device_id_column])){
lines <- append(lines, c("FITBIT:", paste0(" DEVICE_IDS: [",row[fitbit_device_id_column],"]"),
paste(" LABEL:",row$label), paste(" START_DATE:", row$start_date), paste(" END_DATE:", row$end_date)))
paste(" LABEL:",row$label), paste(" START_DATE:", start_date), paste(" END_DATE:", end_date)))
} else
lines <- append(lines, empty_fitbit)

View File

@ -41,10 +41,14 @@ elif table_format == "CSV":
summary = pd.read_csv(snakemake.input[0], parse_dates=["local_date_time"], date_parser=lambda col: pd.to_datetime(col).tz_localize(None))
intraday = pd.read_csv(snakemake.input[1], parse_dates=["local_date_time"], date_parser=lambda col: pd.to_datetime(col).tz_localize(None))
# if not pd.isnull(local_start_date) and not pd.isnull(local_end_date):
if summary.shape[0] > 0:
summary["timestamp"] = summary["local_date_time"].dt.tz_localize(timezone).astype(np.int64) // 10**6
summary["timestamp"] = summary["local_date_time"].dt.tz_localize(timezone, ambiguous=False, nonexistent="NaT").dropna().astype(np.int64) // 10**6
summary.dropna(subset=['timestamp'], inplace=True)
if intraday.shape[0] > 0:
intraday["timestamp"] = intraday["local_date_time"].dt.tz_localize(timezone).astype(np.int64) // 10**6
intraday["timestamp"] = intraday["local_date_time"].dt.tz_localize(timezone, ambiguous=False, nonexistent="NaT").dropna().astype(np.int64) // 10**6
intraday.dropna(subset=['timestamp'], inplace=True)
summary.to_csv(snakemake.output["summary_data"], index=False)
intraday.to_csv(snakemake.output["intraday_data"], index=False)

View File

@ -97,7 +97,11 @@ def parseHeartrateIntradayData(records_intraday, dataset, device_id, curr_date,
def parseHeartrateData(heartrate_data, fitbit_data_type):
if heartrate_data.empty:
return pd.DataFrame(columns=HR_SUMMARY_COLUMNS), pd.DataFrame(columns=HR_INTRADAY_COLUMNS)
if fitbit_data_type == "summary":
return pd.DataFrame(columns=HR_SUMMARY_COLUMNS)
elif fitbit_data_type == "intraday":
return pd.DataFrame(columns=HR_INTRADAY_COLUMNS)
device_id = heartrate_data["device_id"].iloc[0]
records_summary, records_intraday = [], []
@ -121,8 +125,6 @@ def parseHeartrateData(heartrate_data, fitbit_data_type):
parsed_data = pd.DataFrame(data=records_summary, columns=HR_SUMMARY_COLUMNS)
elif fitbit_data_type == "intraday":
parsed_data = pd.DataFrame(data=records_intraday, columns=HR_INTRADAY_COLUMNS)
else:
raise ValueError("fitbit_data_type can only be one of ['summary', 'intraday'].")
return parsed_data
@ -145,9 +147,11 @@ else:
raise ValueError("column_format can only be one of ['JSON', 'PLAIN_TEXT'].")
# Only keep dates in the range of [local_start_date, local_end_date)
parsed_data = parsed_data.loc[(parsed_data["local_date_time"] >= local_start_date) & (parsed_data["local_date_time"] < local_end_date)]
if not pd.isnull(local_start_date) and not pd.isnull(local_end_date):
parsed_data = parsed_data.loc[(parsed_data["local_date_time"] >= local_start_date) & (parsed_data["local_date_time"] < local_end_date)]
if parsed_data.shape[0] > 0:
parsed_data["timestamp"] = parsed_data["local_date_time"].dt.tz_localize(timezone).astype(np.int64) // 10**6
parsed_data["timestamp"] = parsed_data["local_date_time"].dt.tz_localize(timezone, ambiguous=False, nonexistent="NaT").dropna().astype(np.int64) // 10**6
parsed_data.dropna(subset=['timestamp'], inplace=True)
parsed_data.to_csv(snakemake.output[0], index=False)

View File

@ -188,7 +188,10 @@ def parseOneRecordForV12(record, device_id, d_is_main_sleep, records_summary, re
def parseSleepData(sleep_data, fitbit_data_type):
SLEEP_SUMMARY_COLUMNS = SLEEP_SUMMARY_COLUMNS_V1_2
if sleep_data.empty:
return pd.DataFrame(columns=SLEEP_SUMMARY_COLUMNS), pd.DataFrame(columns=SLEEP_INTRADAY_COLUMNS)
if fitbit_data_type == "summary":
return pd.DataFrame(columns=SLEEP_SUMMARY_COLUMNS)
elif fitbit_data_type == "intraday":
return pd.DataFrame(columns=SLEEP_INTRADAY_COLUMNS)
device_id = sleep_data["device_id"].iloc[0]
records_summary, records_intraday = [], []
# Parse JSON into individual records
@ -210,13 +213,9 @@ def parseSleepData(sleep_data, fitbit_data_type):
parsed_data = pd.DataFrame(data=records_summary, columns=SLEEP_SUMMARY_COLUMNS)
elif fitbit_data_type == "intraday":
parsed_data = pd.DataFrame(data=records_intraday, columns=SLEEP_INTRADAY_COLUMNS)
else:
raise ValueError("fitbit_data_type can only be one of ['summary', 'intraday'].")
return parsed_data
timezone = snakemake.params["timezone"]
column_format = snakemake.params["column_format"]
fitbit_data_type = snakemake.params["fitbit_data_type"]
@ -235,31 +234,26 @@ elif column_format == "PLAIN_TEXT":
parsed_data = pd.read_csv(snakemake.input["raw_data"], parse_dates=["local_start_date_time", "local_end_date_time"], date_parser=lambda col: pd.to_datetime(col).tz_localize(None))
elif fitbit_data_type == "intraday":
parsed_data = pd.read_csv(snakemake.input["raw_data"], parse_dates=["local_date_time"], date_parser=lambda col: pd.to_datetime(col).tz_localize(None))
else:
raise ValueError("fitbit_data_type can only be one of ['summary', 'intraday'].")
else:
raise ValueError("column_format can only be one of ['JSON', 'PLAIN_TEXT'].")
if parsed_data.shape[0] > 0 and fitbit_data_type == "summary":
if sleep_episode_timestamp != "start" and sleep_episode_timestamp != "end":
raise ValueError("SLEEP_EPISODE_TIMESTAMP can only be one of ['start', 'end'].")
# Column name to be considered as the event datetime
datetime_column = "local_" + sleep_episode_timestamp + "_date_time"
# Only keep dates in the range of [local_start_date, local_end_date)
parsed_data = parsed_data.loc[(parsed_data[datetime_column] >= local_start_date) & (parsed_data[datetime_column] < local_end_date)]
# Convert datetime to timestamp
parsed_data["timestamp"] = parsed_data[datetime_column].dt.tz_localize(timezone).astype(np.int64) // 10**6
# Drop useless columns: local_start_date_time and local_end_date_time
if not pd.isnull(local_start_date) and not pd.isnull(local_end_date):
parsed_data = parsed_data.loc[(parsed_data[datetime_column] >= local_start_date) & (parsed_data[datetime_column] < local_end_date)]
parsed_data["timestamp"] = parsed_data[datetime_column].dt.tz_localize(timezone, ambiguous=False, nonexistent="NaT").dropna().astype(np.int64) // 10**6
parsed_data.dropna(subset=['timestamp'], inplace=True)
parsed_data.drop(["local_start_date_time", "local_end_date_time"], axis = 1, inplace=True)
if parsed_data.shape[0] > 0 and fitbit_data_type == "intraday":
# Only keep dates in the range of [local_start_date, local_end_date)
parsed_data = parsed_data.loc[(parsed_data["local_date_time"] >= local_start_date) & (parsed_data["local_date_time"] < local_end_date)]
# Convert datetime to timestamp
parsed_data["timestamp"] = parsed_data["local_date_time"].dt.tz_localize(timezone).astype(np.int64) // 10**6
# Unifying level
if not pd.isnull(local_start_date) and not pd.isnull(local_end_date):
parsed_data = parsed_data.loc[(parsed_data["local_date_time"] >= local_start_date) & (parsed_data["local_date_time"] < local_end_date)]
parsed_data["timestamp"] = parsed_data["local_date_time"].dt.tz_localize(timezone, ambiguous=False, nonexistent="NaT").dropna().astype(np.int64) // 10**6
parsed_data.dropna(subset=['timestamp'], inplace=True)
parsed_data["unified_level"] = np.where(parsed_data["level"].isin(["awake", "wake", "restless"]), 0, 1)
parsed_data.to_csv(snakemake.output[0], index=False)

View File

@ -9,9 +9,10 @@ STEPS_COLUMNS = ("device_id", "steps", "local_date_time", "timestamp")
def parseStepsData(steps_data, fitbit_data_type):
if steps_data.empty:
return pd.DataFrame(), pd.DataFrame(columns=STEPS_INTRADAY_COLUMNS)
return pd.DataFrame(columns=STEPS_COLUMNS)
device_id = steps_data["device_id"].iloc[0]
records_summary, records_intraday = [], []
records = []
# Parse JSON into individual records
for record in steps_data.fitbit_data:
@ -26,7 +27,7 @@ def parseStepsData(steps_data, fitbit_data_type):
curr_date,
0)
records_summary.append(row_summary)
records.append(row_summary)
# Parse intraday data
if fitbit_data_type == "intraday":
@ -40,14 +41,9 @@ def parseStepsData(steps_data, fitbit_data_type):
d_datetime,
0)
records_intraday.append(row_intraday)
records.append(row_intraday)
if fitbit_data_type == "summary":
parsed_data = pd.DataFrame(data=records_summary, columns=STEPS_COLUMNS)
elif fitbit_data_type == "intraday":
parsed_data = pd.DataFrame(data=records_intraday, columns=STEPS_COLUMNS)
else:
raise ValueError("fitbit_data_type can only be one of ['summary', 'intraday'].")
parsed_data = pd.DataFrame(data=records, columns=STEPS_COLUMNS)
return parsed_data
@ -71,9 +67,11 @@ else:
raise ValueError("column_format can only be one of ['JSON', 'PLAIN_TEXT'].")
# Only keep dates in the range of [local_start_date, local_end_date)
parsed_data = parsed_data.loc[(parsed_data["local_date_time"] >= local_start_date) & (parsed_data["local_date_time"] < local_end_date)]
if not pd.isnull(local_start_date) and not pd.isnull(local_end_date):
parsed_data = parsed_data.loc[(parsed_data["local_date_time"] >= local_start_date) & (parsed_data["local_date_time"] < local_end_date)]
if parsed_data.shape[0] > 0:
parsed_data["timestamp"] = parsed_data["local_date_time"].dt.tz_localize(timezone).astype(np.int64) // 10**6
parsed_data["timestamp"] = parsed_data["local_date_time"].dt.tz_localize(timezone, ambiguous=False, nonexistent="NaT").dropna().astype(np.int64) // 10**6
parsed_data.dropna(subset=['timestamp'], inplace=True)
parsed_data.to_csv(snakemake.output[0], index=False)