From 85f4ec2ab6ea9964dbbf007d9f8b3217e6569469 Mon Sep 17 00:00:00 2001 From: Meng Li <34143965+Meng6@users.noreply.github.com> Date: Tue, 3 Mar 2020 17:31:15 -0500 Subject: [PATCH] Delete screen event features and add countepisode and episodepersensedminutes --- config.yaml | 5 +-- rules/features.snakefile | 4 +- src/features/screen_metrics.py | 68 +++++++++------------------------- 3 files changed, 20 insertions(+), 57 deletions(-) diff --git a/config.yaml b/config.yaml index 9325934c..48af9b4b 100644 --- a/config.yaml +++ b/config.yaml @@ -87,9 +87,8 @@ BATTERY: SCREEN: DAY_SEGMENTS: *day_segments - METRICS_EVENTS: ["counton", "countunlock", "unlocksperminute"] - METRICS_DELTAS: ["sumduration", "maxduration", "minduration", "avgduration", "stdduration"] - EPISODES: ["unlock"] + METRICS_DELTAS: ["countepisode", "episodepersensedminutes", "sumduration", "maxduration", "minduration", "avgduration", "stdduration"] + EPISODE_TYPES: ["unlock"] LIGHT: DAY_SEGMENTS: *day_segments diff --git a/rules/features.snakefile b/rules/features.snakefile index 568ee7f0..04da5f5d 100644 --- a/rules/features.snakefile +++ b/rules/features.snakefile @@ -97,14 +97,12 @@ rule battery_metrics: rule screen_metrics: input: - screen_events = "data/raw/{pid}/screen_with_datetime.csv", screen_deltas = "data/processed/{pid}/screen_deltas.csv", phone_sensed_bins = "data/interim/{pid}/phone_sensed_bins.csv" params: day_segment = "{day_segment}", - metrics_events = config["SCREEN"]["METRICS_EVENTS"], metrics_deltas = config["SCREEN"]["METRICS_DELTAS"], - episodes = config["SCREEN"]["EPISODES"], + episode_types = config["SCREEN"]["EPISODE_TYPES"], bin_size = config["PHONE_VALID_SENSED_DAYS"]["BIN_SIZE"] output: "data/processed/{pid}/screen_{day_segment}.csv" diff --git a/src/features/screen_metrics.py b/src/features/screen_metrics.py index c2e5024a..b7a1824d 100644 --- a/src/features/screen_metrics.py +++ b/src/features/screen_metrics.py @@ -5,9 +5,16 @@ import itertools from datetime import datetime, timedelta, time from features_utils import splitOvernightEpisodes, splitMultiSegmentEpisodes -def getEpisodeDurationFeatures(screen_deltas, episode, metrics): +def getEpisodeDurationFeatures(screen_deltas, episode, metrics, phone_sensed_bins, bin_size): screen_deltas_episode = screen_deltas[screen_deltas["episode"] == episode] duration_helper = pd.DataFrame() + if "countepisode" in metrics: + duration_helper = pd.concat([duration_helper, screen_deltas_episode.groupby(["local_start_date"]).count()[["time_diff"]].rename(columns = {"time_diff": "screen_" + day_segment + "_countepisode" + episode})], axis = 1) + if "episodepersensedminutes" in metrics: + for date, row in screen_deltas_episode.groupby(["local_start_date"]).count()[["time_diff"]].iterrows(): + sensed_minutes = phone_sensed_bins.loc[date, :].sum() * bin_size + episode_per_sensedminutes = row["time_diff"] / (1 if sensed_minutes == 0 else sensed_minutes) + duration_helper.loc[date, "screen_" + day_segment + "_episodepersensedminutes" + episode] = episode_per_sensedminutes if "sumduration" in metrics: duration_helper = pd.concat([duration_helper, screen_deltas_episode.groupby(["local_start_date"]).sum()[["time_diff"]].rename(columns = {"time_diff": "screen_" + day_segment + "_sumduration" + episode})], axis = 1) if "maxduration" in metrics: @@ -22,72 +29,31 @@ def getEpisodeDurationFeatures(screen_deltas, episode, metrics): duration_helper = duration_helper.fillna(0) return duration_helper -def getEventFeatures(screen_data, metrics_events, phone_sensed_bins, bin_size): - if screen_data.empty: - return pd.DataFrame(columns=["screen_" + day_segment + "_" + x for x in metrics_events]) - # get count_helper - screen_status = screen_data.groupby(["local_date", "screen_status"]).count()[["timestamp"]].reset_index() - count_on = screen_status[screen_status["screen_status"] == 0].set_index("local_date")[["timestamp"]].rename(columns = {"timestamp": "count_on"}) - count_off = screen_status[screen_status["screen_status"] == 1].set_index("local_date")[["timestamp"]].rename(columns = {"timestamp": "count_off"}) - count_lock = screen_status[screen_status["screen_status"] == 2].set_index("local_date")[["timestamp"]].rename(columns = {"timestamp": "count_lock"}) - count_unlock = screen_status[screen_status["screen_status"] == 3].set_index("local_date")[["timestamp"]].rename(columns = {"timestamp": "count_unlock"}) - - count_helper = pd.concat([count_on, count_off, count_lock, count_unlock], axis = 1) - count_helper = count_helper.fillna(0).astype(np.int64) - # get unlocks per minute - for date, row in count_helper.iterrows(): - sensed_minutes = phone_sensed_bins.loc[date, :].sum() * bin_size - unlocks_per_minute = min(row["count_lock"], row["count_unlock"]) / (1 if sensed_minutes == 0 else sensed_minutes) - count_helper.loc[date, "unlocks_per_minute"] = unlocks_per_minute - - event_features = pd.DataFrame() - if "counton" in metrics_events: - event_features["screen_" + day_segment + "_counton"] = count_helper[["count_on", "count_off"]].min(axis=1) - if "countunlock" in metrics_events: - event_features["screen_" + day_segment + "_countunlock"] = count_helper[["count_lock", "count_unlock"]].min(axis=1) - if "unlocksperminute" in metrics_events: - event_features["screen_" + day_segment + "_unlocksperminute"] = count_helper["unlocks_per_minute"] - - return event_features - -screen_data = pd.read_csv(snakemake.input["screen_events"], parse_dates=["local_date_time", "local_date"]) screen_deltas = pd.read_csv(snakemake.input["screen_deltas"], parse_dates=["local_start_date_time", "local_end_date_time", "local_start_date", "local_end_date"]) phone_sensed_bins = pd.read_csv(snakemake.input["phone_sensed_bins"], parse_dates=["local_date"], index_col="local_date") phone_sensed_bins[phone_sensed_bins > 0] = 1 day_segment = snakemake.params["day_segment"] -metrics_events = snakemake.params["metrics_events"] metrics_deltas = snakemake.params["metrics_deltas"] -episodes = snakemake.params["episodes"] +episode_types = snakemake.params["episode_types"] bin_size = snakemake.params["bin_size"] -metrics_deltas_name = ["".join(metric) for metric in itertools.product(metrics_deltas, episodes)] - -if screen_data.empty: - screen_features = pd.DataFrame(columns=["local_date"]+["screen_" + day_segment + "_" + x for x in metrics_events + metrics_deltas_name]) -else: - # drop consecutive duplicates of screen_status keeping the last one - screen_data = screen_data.loc[(screen_data[["screen_status"]].shift(-1) != screen_data[["screen_status"]]).any(axis=1)].reset_index(drop=True) +metrics_deltas_name = ["".join(metric) for metric in itertools.product(metrics_deltas, episode_types)] +screen_features = pd.DataFrame(columns=["local_date"]+["screen_" + day_segment + "_" + x for x in metrics_deltas_name]) +if not screen_deltas.empty: # preprocess day_segment and episodes screen_deltas = splitOvernightEpisodes(screen_deltas, [], ["episode"]) - if day_segment != "daily": - screen_data = screen_data[screen_data["local_day_segment"] == day_segment] + if (not screen_deltas.empty) and (day_segment != "daily"): screen_deltas = splitMultiSegmentEpisodes(screen_deltas, day_segment, []) screen_deltas.set_index(["local_start_date"],inplace=True) - # extract features for events and episodes - event_features = getEventFeatures(screen_data, metrics_events, phone_sensed_bins, bin_size) + if not screen_deltas.empty: + screen_features = pd.DataFrame() + for episode in episode_types: + screen_features = pd.concat([screen_features, getEpisodeDurationFeatures(screen_deltas, episode, metrics_deltas, phone_sensed_bins, bin_size)], axis=1) - if screen_deltas.empty: - duration_features = pd.DataFrame(columns=["screen_" + day_segment + "_" + x for x in metrics_deltas_name]) - else: - duration_features = pd.DataFrame() - for episode in episodes: - duration_features = pd.concat([duration_features, getEpisodeDurationFeatures(screen_deltas, episode, metrics_deltas)], axis=1) - - screen_features = pd.concat([event_features, duration_features], axis = 1).fillna(0) screen_features = screen_features.rename_axis("local_date").reset_index() screen_features.to_csv(snakemake.output[0], index=False) \ No newline at end of file