Merge branch 'feature/fitbit-fix' into develop
commit
9fc48ee0dc
|
@ -5,6 +5,8 @@
|
|||
- Update CI to create a release on a tagged push that passes the tests
|
||||
- Clarify in DB credential configuration that we only support MySQL
|
||||
- Add Windows installation instructions
|
||||
- Fix bugs in the create_participants_file script
|
||||
- Fix bugs in Fitbit data parsing.
|
||||
## v0.3.1
|
||||
- Update installation docs for RAPIDS' docker container
|
||||
- Fix example analysis use of accelerometer data in a plot
|
||||
|
|
|
@ -124,8 +124,8 @@ Features description for `[PHONE_LOCATIONS][PROVIDERS][BARNETT]`:
|
|||
|timeattop1location |minutes |Time spent at the most significant location.
|
||||
|timeattop2location |minutes |Time spent at the 2nd most significant location.
|
||||
|timeattop3location |minutes |Time spent at the 3rd most significant location.
|
||||
|movingtostaticratio | - | Ratio between the number of rows labeled Moving versus Static
|
||||
|outlierstimepercent | - | Ratio between the number of rows that belong to non-significant clusters divided by the total number of rows in a time segment.
|
||||
|movingtostaticratio | - | Ratio between stationary time and total location sensed time. A lat/long coordinate pair is labelled as stationary if its speed (distance/time) to the next coordinate pair is less than 1 km/h. A higher value represents a more stationary routine. These times are computed by multiplying the number of rows by `[SAMPLING_FREQUENCY]`.
|
||||
|outlierstimepercent | - | Ratio of the time spent in non-significant clusters to the time spent in all clusters (total location sensed time). A higher value represents more time spent in non-significant clusters. These times are computed by multiplying the number of rows by `[SAMPLING_FREQUENCY]`.
|
||||
|maxlengthstayatclusters |minutes |Maximum time spent in a cluster (significant location).
|
||||
|minlengthstayatclusters |minutes |Minimum time spent in a cluster (significant location).
|
||||
|meanlengthstayatclusters |minutes |Average time spent in a cluster (significant location).
|
||||
|
|
|
@ -204,7 +204,7 @@ def parse_time_segments(time_segments_file, segments_type, device_ids):
|
|||
participant_file = yaml.load(open(snakemake.input[1], 'r'), Loader=yaml.FullLoader)
|
||||
device_ids = []
|
||||
for key in participant_file.keys():
|
||||
if "DEVICE_IDS" in participant_file[key]:
|
||||
if "DEVICE_IDS" in participant_file[key] and isinstance(participant_file[key]["DEVICE_IDS"], list):
|
||||
device_ids = device_ids + participant_file[key]["DEVICE_IDS"]
|
||||
|
||||
final_time_segments = parse_time_segments(snakemake.input[0], snakemake.params["time_segments_type"], device_ids)
|
||||
|
|
|
@ -11,8 +11,8 @@ group <- config$SOURCE$DATABASE_GROUP
|
|||
timezone <- config$SOURCE$TIMEZONE
|
||||
phone_device_id_column = config$PHONE_SECTION$DEVICE_ID_COLUMN
|
||||
fitbit_device_id_column = config$FITBIT_SECTION$DEVICE_ID_COLUMN
|
||||
add_fitbit_section = config$PHONE_SECTION$ADD
|
||||
add_phone_section = config$FITBIT_SECTION$ADD
|
||||
add_phone_section = config$PHONE_SECTION$ADD
|
||||
add_fitbit_section = config$FITBIT_SECTION$ADD
|
||||
phone_ignored = config$PHONE_SECTION$IGNORED_DEVICE_IDS
|
||||
fitbit_ignored = config$FITBIT_SECTION$IGNORED_DEVICE_IDS
|
||||
|
||||
|
@ -39,7 +39,8 @@ if(config$SOURCE$TYPE == "AWARE_DEVICE_TABLE"){
|
|||
|
||||
} else if(config$SOURCE$TYPE == "CSV_FILE"){
|
||||
participants <- read_csv(config$SOURCE$CSV_FILE_PATH, col_types=cols_only(device_id="c",pid="c",label="c",platform="c",
|
||||
start_date=col_date(format = "%Y-%m-%d"),end_date=col_date(format = "%Y-%m-%d"),fitbit_id="c"))
|
||||
start_date=col_date(format = "%Y-%m-%d"),end_date=col_date(format = "%Y-%m-%d"),fitbit_id="c")) %>%
|
||||
mutate(start_date = as.character(start_date), end_date = as.character(end_date)) # we read as date to validate format
|
||||
participants <- participants %>%
|
||||
mutate(!!phone_device_id_column := str_replace(!!rlang::sym(phone_device_id_column), ";",","),
|
||||
platform = str_replace(platform, ";",","),
|
||||
|
@ -55,16 +56,18 @@ participants %>%
|
|||
empty_fitbit <- c("FITBIT:", " DEVICE_IDS:", " LABEL:", " START_DATE:", " END_DATE:")
|
||||
row <- tibble(...)
|
||||
lines <- c()
|
||||
start_date = if_else(is.na(row$start_date), "", row$start_date)
|
||||
end_date = if_else(is.na(row$end_date), "", row$end_date)
|
||||
|
||||
if(add_phone_section == TRUE && !is.na(row[phone_device_id_column])){
|
||||
lines <- append(lines, c("PHONE:", paste0(" DEVICE_IDS: [",row[phone_device_id_column],"]"), paste0(" PLATFORMS: [",row$platform,"]"),
|
||||
paste(" LABEL:",row$label), paste(" START_DATE:", row$start_date), paste(" END_DATE:", row$end_date)))
|
||||
paste(" LABEL:",row$label), paste(" START_DATE:", start_date), paste(" END_DATE:", end_date)))
|
||||
}else
|
||||
lines <- append(lines, empty_phone)
|
||||
|
||||
if(add_fitbit_section == TRUE && !is.na(row[fitbit_device_id_column])){
|
||||
lines <- append(lines, c("FITBIT:", paste0(" DEVICE_IDS: [",row[fitbit_device_id_column],"]"),
|
||||
paste(" LABEL:",row$label), paste(" START_DATE:", row$start_date), paste(" END_DATE:", row$end_date)))
|
||||
paste(" LABEL:",row$label), paste(" START_DATE:", start_date), paste(" END_DATE:", end_date)))
|
||||
} else
|
||||
lines <- append(lines, empty_fitbit)
|
||||
|
||||
|
|
|
@ -41,10 +41,14 @@ elif table_format == "CSV":
|
|||
summary = pd.read_csv(snakemake.input[0], parse_dates=["local_date_time"], date_parser=lambda col: pd.to_datetime(col).tz_localize(None))
|
||||
intraday = pd.read_csv(snakemake.input[1], parse_dates=["local_date_time"], date_parser=lambda col: pd.to_datetime(col).tz_localize(None))
|
||||
|
||||
# if not pd.isnull(local_start_date) and not pd.isnull(local_end_date):
|
||||
|
||||
if summary.shape[0] > 0:
|
||||
summary["timestamp"] = summary["local_date_time"].dt.tz_localize(timezone).astype(np.int64) // 10**6
|
||||
summary["timestamp"] = summary["local_date_time"].dt.tz_localize(timezone, ambiguous=False, nonexistent="NaT").dropna().astype(np.int64) // 10**6
|
||||
summary.dropna(subset=['timestamp'], inplace=True)
|
||||
if intraday.shape[0] > 0:
|
||||
intraday["timestamp"] = intraday["local_date_time"].dt.tz_localize(timezone).astype(np.int64) // 10**6
|
||||
intraday["timestamp"] = intraday["local_date_time"].dt.tz_localize(timezone, ambiguous=False, nonexistent="NaT").dropna().astype(np.int64) // 10**6
|
||||
intraday.dropna(subset=['timestamp'], inplace=True)
|
||||
|
||||
summary.to_csv(snakemake.output["summary_data"], index=False)
|
||||
intraday.to_csv(snakemake.output["intraday_data"], index=False)
|
|
@ -97,7 +97,11 @@ def parseHeartrateIntradayData(records_intraday, dataset, device_id, curr_date,
|
|||
|
||||
def parseHeartrateData(heartrate_data, fitbit_data_type):
|
||||
if heartrate_data.empty:
|
||||
return pd.DataFrame(columns=HR_SUMMARY_COLUMNS), pd.DataFrame(columns=HR_INTRADAY_COLUMNS)
|
||||
if fitbit_data_type == "summary":
|
||||
return pd.DataFrame(columns=HR_SUMMARY_COLUMNS)
|
||||
elif fitbit_data_type == "intraday":
|
||||
return pd.DataFrame(columns=HR_INTRADAY_COLUMNS)
|
||||
|
||||
device_id = heartrate_data["device_id"].iloc[0]
|
||||
records_summary, records_intraday = [], []
|
||||
|
||||
|
@ -121,8 +125,6 @@ def parseHeartrateData(heartrate_data, fitbit_data_type):
|
|||
parsed_data = pd.DataFrame(data=records_summary, columns=HR_SUMMARY_COLUMNS)
|
||||
elif fitbit_data_type == "intraday":
|
||||
parsed_data = pd.DataFrame(data=records_intraday, columns=HR_INTRADAY_COLUMNS)
|
||||
else:
|
||||
raise ValueError("fitbit_data_type can only be one of ['summary', 'intraday'].")
|
||||
return parsed_data
|
||||
|
||||
|
||||
|
@ -145,9 +147,11 @@ else:
|
|||
raise ValueError("column_format can only be one of ['JSON', 'PLAIN_TEXT'].")
|
||||
|
||||
# Only keep dates in the range of [local_start_date, local_end_date)
|
||||
if not pd.isnull(local_start_date) and not pd.isnull(local_end_date):
|
||||
parsed_data = parsed_data.loc[(parsed_data["local_date_time"] >= local_start_date) & (parsed_data["local_date_time"] < local_end_date)]
|
||||
|
||||
if parsed_data.shape[0] > 0:
|
||||
parsed_data["timestamp"] = parsed_data["local_date_time"].dt.tz_localize(timezone).astype(np.int64) // 10**6
|
||||
parsed_data["timestamp"] = parsed_data["local_date_time"].dt.tz_localize(timezone, ambiguous=False, nonexistent="NaT").dropna().astype(np.int64) // 10**6
|
||||
parsed_data.dropna(subset=['timestamp'], inplace=True)
|
||||
|
||||
parsed_data.to_csv(snakemake.output[0], index=False)
|
||||
|
|
|
@ -188,7 +188,10 @@ def parseOneRecordForV12(record, device_id, d_is_main_sleep, records_summary, re
|
|||
def parseSleepData(sleep_data, fitbit_data_type):
|
||||
SLEEP_SUMMARY_COLUMNS = SLEEP_SUMMARY_COLUMNS_V1_2
|
||||
if sleep_data.empty:
|
||||
return pd.DataFrame(columns=SLEEP_SUMMARY_COLUMNS), pd.DataFrame(columns=SLEEP_INTRADAY_COLUMNS)
|
||||
if fitbit_data_type == "summary":
|
||||
return pd.DataFrame(columns=SLEEP_SUMMARY_COLUMNS)
|
||||
elif fitbit_data_type == "intraday":
|
||||
return pd.DataFrame(columns=SLEEP_INTRADAY_COLUMNS)
|
||||
device_id = sleep_data["device_id"].iloc[0]
|
||||
records_summary, records_intraday = [], []
|
||||
# Parse JSON into individual records
|
||||
|
@ -210,13 +213,9 @@ def parseSleepData(sleep_data, fitbit_data_type):
|
|||
parsed_data = pd.DataFrame(data=records_summary, columns=SLEEP_SUMMARY_COLUMNS)
|
||||
elif fitbit_data_type == "intraday":
|
||||
parsed_data = pd.DataFrame(data=records_intraday, columns=SLEEP_INTRADAY_COLUMNS)
|
||||
else:
|
||||
raise ValueError("fitbit_data_type can only be one of ['summary', 'intraday'].")
|
||||
|
||||
return parsed_data
|
||||
|
||||
|
||||
|
||||
timezone = snakemake.params["timezone"]
|
||||
column_format = snakemake.params["column_format"]
|
||||
fitbit_data_type = snakemake.params["fitbit_data_type"]
|
||||
|
@ -235,31 +234,26 @@ elif column_format == "PLAIN_TEXT":
|
|||
parsed_data = pd.read_csv(snakemake.input["raw_data"], parse_dates=["local_start_date_time", "local_end_date_time"], date_parser=lambda col: pd.to_datetime(col).tz_localize(None))
|
||||
elif fitbit_data_type == "intraday":
|
||||
parsed_data = pd.read_csv(snakemake.input["raw_data"], parse_dates=["local_date_time"], date_parser=lambda col: pd.to_datetime(col).tz_localize(None))
|
||||
else:
|
||||
raise ValueError("fitbit_data_type can only be one of ['summary', 'intraday'].")
|
||||
else:
|
||||
raise ValueError("column_format can only be one of ['JSON', 'PLAIN_TEXT'].")
|
||||
|
||||
if parsed_data.shape[0] > 0 and fitbit_data_type == "summary":
|
||||
|
||||
if sleep_episode_timestamp != "start" and sleep_episode_timestamp != "end":
|
||||
raise ValueError("SLEEP_EPISODE_TIMESTAMP can only be one of ['start', 'end'].")
|
||||
|
||||
# Column name to be considered as the event datetime
|
||||
datetime_column = "local_" + sleep_episode_timestamp + "_date_time"
|
||||
# Only keep dates in the range of [local_start_date, local_end_date)
|
||||
|
||||
if not pd.isnull(local_start_date) and not pd.isnull(local_end_date):
|
||||
parsed_data = parsed_data.loc[(parsed_data[datetime_column] >= local_start_date) & (parsed_data[datetime_column] < local_end_date)]
|
||||
# Convert datetime to timestamp
|
||||
parsed_data["timestamp"] = parsed_data[datetime_column].dt.tz_localize(timezone).astype(np.int64) // 10**6
|
||||
# Drop useless columns: local_start_date_time and local_end_date_time
|
||||
parsed_data["timestamp"] = parsed_data[datetime_column].dt.tz_localize(timezone, ambiguous=False, nonexistent="NaT").dropna().astype(np.int64) // 10**6
|
||||
parsed_data.dropna(subset=['timestamp'], inplace=True)
|
||||
parsed_data.drop(["local_start_date_time", "local_end_date_time"], axis = 1, inplace=True)
|
||||
|
||||
if parsed_data.shape[0] > 0 and fitbit_data_type == "intraday":
|
||||
# Only keep dates in the range of [local_start_date, local_end_date)
|
||||
if not pd.isnull(local_start_date) and not pd.isnull(local_end_date):
|
||||
parsed_data = parsed_data.loc[(parsed_data["local_date_time"] >= local_start_date) & (parsed_data["local_date_time"] < local_end_date)]
|
||||
# Convert datetime to timestamp
|
||||
parsed_data["timestamp"] = parsed_data["local_date_time"].dt.tz_localize(timezone).astype(np.int64) // 10**6
|
||||
# Unifying level
|
||||
parsed_data["timestamp"] = parsed_data["local_date_time"].dt.tz_localize(timezone, ambiguous=False, nonexistent="NaT").dropna().astype(np.int64) // 10**6
|
||||
parsed_data.dropna(subset=['timestamp'], inplace=True)
|
||||
parsed_data["unified_level"] = np.where(parsed_data["level"].isin(["awake", "wake", "restless"]), 0, 1)
|
||||
|
||||
parsed_data.to_csv(snakemake.output[0], index=False)
|
||||
|
|
|
@ -9,9 +9,10 @@ STEPS_COLUMNS = ("device_id", "steps", "local_date_time", "timestamp")
|
|||
|
||||
def parseStepsData(steps_data, fitbit_data_type):
|
||||
if steps_data.empty:
|
||||
return pd.DataFrame(), pd.DataFrame(columns=STEPS_INTRADAY_COLUMNS)
|
||||
return pd.DataFrame(columns=STEPS_COLUMNS)
|
||||
|
||||
device_id = steps_data["device_id"].iloc[0]
|
||||
records_summary, records_intraday = [], []
|
||||
records = []
|
||||
|
||||
# Parse JSON into individual records
|
||||
for record in steps_data.fitbit_data:
|
||||
|
@ -26,7 +27,7 @@ def parseStepsData(steps_data, fitbit_data_type):
|
|||
curr_date,
|
||||
0)
|
||||
|
||||
records_summary.append(row_summary)
|
||||
records.append(row_summary)
|
||||
|
||||
# Parse intraday data
|
||||
if fitbit_data_type == "intraday":
|
||||
|
@ -40,14 +41,9 @@ def parseStepsData(steps_data, fitbit_data_type):
|
|||
d_datetime,
|
||||
0)
|
||||
|
||||
records_intraday.append(row_intraday)
|
||||
records.append(row_intraday)
|
||||
|
||||
if fitbit_data_type == "summary":
|
||||
parsed_data = pd.DataFrame(data=records_summary, columns=STEPS_COLUMNS)
|
||||
elif fitbit_data_type == "intraday":
|
||||
parsed_data = pd.DataFrame(data=records_intraday, columns=STEPS_COLUMNS)
|
||||
else:
|
||||
raise ValueError("fitbit_data_type can only be one of ['summary', 'intraday'].")
|
||||
parsed_data = pd.DataFrame(data=records, columns=STEPS_COLUMNS)
|
||||
|
||||
return parsed_data
|
||||
|
||||
|
@ -71,9 +67,11 @@ else:
|
|||
raise ValueError("column_format can only be one of ['JSON', 'PLAIN_TEXT'].")
|
||||
|
||||
# Only keep dates in the range of [local_start_date, local_end_date)
|
||||
if not pd.isnull(local_start_date) and not pd.isnull(local_end_date):
|
||||
parsed_data = parsed_data.loc[(parsed_data["local_date_time"] >= local_start_date) & (parsed_data["local_date_time"] < local_end_date)]
|
||||
|
||||
if parsed_data.shape[0] > 0:
|
||||
parsed_data["timestamp"] = parsed_data["local_date_time"].dt.tz_localize(timezone).astype(np.int64) // 10**6
|
||||
parsed_data["timestamp"] = parsed_data["local_date_time"].dt.tz_localize(timezone, ambiguous=False, nonexistent="NaT").dropna().astype(np.int64) // 10**6
|
||||
parsed_data.dropna(subset=['timestamp'], inplace=True)
|
||||
|
||||
parsed_data.to_csv(snakemake.output[0], index=False)
|
||||
|
|
Loading…
Reference in New Issue