From 8d87f6e4970fa4ceed7736fd66616831317ad454 Mon Sep 17 00:00:00 2001 From: JulioV Date: Tue, 1 Sep 2020 15:25:35 -0400 Subject: [PATCH] Migrate app foreground to new file structure --- Snakefile | 12 +-- config.yaml | 23 ++--- rules/features.smk | 30 ++++--- rules/preprocessing.smk | 2 +- .../applications_foreground_base.py | 74 ---------------- .../applications_foreground_entry.R | 13 +++ .../applications_foreground_entry.py | 18 ++++ .../applications_foreground/rapids/main.py | 88 +++++++++++++++++++ 8 files changed, 159 insertions(+), 101 deletions(-) delete mode 100644 src/features/applications_foreground/applications_foreground_base.py create mode 100644 src/features/applications_foreground/applications_foreground_entry.R create mode 100644 src/features/applications_foreground/applications_foreground_entry.py create mode 100644 src/features/applications_foreground/rapids/main.py diff --git a/Snakefile b/Snakefile index 38133b8f..4719df9c 100644 --- a/Snakefile +++ b/Snakefile @@ -95,11 +95,13 @@ if config["ACCELEROMETER"]["COMPUTE"]: files_to_compute.extend(expand("data/raw/{pid}/{sensor}_with_datetime.csv", pid=config["PIDS"], sensor=config["ACCELEROMETER"]["DB_TABLE"])) files_to_compute.extend(expand("data/processed/{pid}/accelerometer_{day_segment}.csv", pid = config["PIDS"], day_segment = config["ACCELEROMETER"]["DAY_SEGMENTS"])) -if config["APPLICATIONS_FOREGROUND"]["COMPUTE"]: - files_to_compute.extend(expand("data/raw/{pid}/{sensor}_raw.csv", pid=config["PIDS"], sensor=config["APPLICATIONS_FOREGROUND"]["DB_TABLE"])) - files_to_compute.extend(expand("data/raw/{pid}/{sensor}_with_datetime.csv", pid=config["PIDS"], sensor=config["APPLICATIONS_FOREGROUND"]["DB_TABLE"])) - files_to_compute.extend(expand("data/interim/{pid}/{sensor}_with_datetime_with_genre.csv", pid=config["PIDS"], sensor=config["APPLICATIONS_FOREGROUND"]["DB_TABLE"])) - files_to_compute.extend(expand("data/processed/{pid}/applications_foreground_{day_segment}.csv", pid = config["PIDS"], 
day_segment = config["APPLICATIONS_FOREGROUND"]["DAY_SEGMENTS"])) +for provider in config["APPLICATIONS_FOREGROUND"]["PROVIDERS"].keys(): + if config["APPLICATIONS_FOREGROUND"]["PROVIDERS"][provider]["COMPUTE"]: + files_to_compute.extend(expand("data/raw/{pid}/{sensor}_raw.csv", pid=config["PIDS"], sensor=config["APPLICATIONS_FOREGROUND"]["DB_TABLE"])) + files_to_compute.extend(expand("data/raw/{pid}/{sensor}_with_datetime.csv", pid=config["PIDS"], sensor=config["APPLICATIONS_FOREGROUND"]["DB_TABLE"])) + files_to_compute.extend(expand("data/raw/{pid}/{sensor}_with_datetime_with_genre.csv", pid=config["PIDS"], sensor=config["APPLICATIONS_FOREGROUND"]["DB_TABLE"])) + files_to_compute.extend(expand("data/interim/{pid}/{sensor_key}_features/{sensor_key}_{language}_{provider_key}.csv", pid=config["PIDS"], language=config["APPLICATIONS_FOREGROUND"]["PROVIDERS"][provider]["SRC_LANGUAGE"], provider_key=provider, sensor_key="APPLICATIONS_FOREGROUND".lower())) + files_to_compute.extend(expand("data/processed/features/{pid}/{sensor_key}.csv", pid=config["PIDS"], sensor_key="APPLICATIONS_FOREGROUND".lower())) for provider in config["WIFI"]["PROVIDERS"].keys(): if config["WIFI"]["PROVIDERS"][provider]["COMPUTE"]: diff --git a/config.yaml b/config.yaml index 603e7a36..0ef95d12 100644 --- a/config.yaml +++ b/config.yaml @@ -156,17 +156,20 @@ ACCELEROMETER: VALID_SENSED_MINUTES: False APPLICATIONS_FOREGROUND: - COMPUTE: False DB_TABLE: applications_foreground - DAY_SEGMENTS: *day_segments - SINGLE_CATEGORIES: ["all", "email"] - MULTIPLE_CATEGORIES: - social: ["socialnetworks", "socialmediatools"] - entertainment: ["entertainment", "gamingknowledge", "gamingcasual", "gamingadventure", "gamingstrategy", "gamingtoolscommunity", "gamingroleplaying", "gamingaction", "gaminglogic", "gamingsports", "gamingsimulation"] - SINGLE_APPS: ["top1global", "com.facebook.moments", "com.google.android.youtube", "com.twitter.android"] # There's no entropy for single apps - EXCLUDED_CATEGORIES: 
["system_apps"] - EXCLUDED_APPS: ["com.fitbit.FitbitMobile", "com.aware.plugin.upmc.cancer"] - FEATURES: ["count", "timeoffirstuse", "timeoflastuse", "frequencyentropy"] + PROVIDERS: + RAPIDS: + COMPUTE: TRUE + SINGLE_CATEGORIES: ["all", "email"] + MULTIPLE_CATEGORIES: + social: ["socialnetworks", "socialmediatools"] + entertainment: ["entertainment", "gamingknowledge", "gamingcasual", "gamingadventure", "gamingstrategy", "gamingtoolscommunity", "gamingroleplaying", "gamingaction", "gaminglogic", "gamingsports", "gamingsimulation"] + SINGLE_APPS: ["top1global", "com.facebook.moments", "com.google.android.youtube", "com.twitter.android"] # There's no entropy for single apps + EXCLUDED_CATEGORIES: [] + EXCLUDED_APPS: ["com.fitbit.FitbitMobile", "com.aware.plugin.upmc.cancer"] + FEATURES: ["count", "timeoffirstuse", "timeoflastuse", "frequencyentropy"] + SRC_FOLDER: "rapids" # inside src/features/applications_foreground + SRC_LANGUAGE: "python" HEARTRATE: COMPUTE: False diff --git a/rules/features.smk b/rules/features.smk index d01b3bf7..56c96972 100644 --- a/rules/features.smk +++ b/rules/features.smk @@ -224,21 +224,29 @@ rule accelerometer_features: script: "../src/features/accelerometer_features.py" -rule applications_foreground_features: +rule applications_foreground_r_features: input: - expand("data/interim/{{pid}}/{sensor}_with_datetime_with_genre.csv", sensor=config["APPLICATIONS_FOREGROUND"]["DB_TABLE"]) + sensor_data = expand("data/raw/{{pid}}/{sensor}_with_datetime_with_genre.csv", sensor=config["APPLICATIONS_FOREGROUND"]["DB_TABLE"]), + day_segments_labels = "data/interim/day_segments_labels.csv" params: - day_segment = "{day_segment}", - single_categories = config["APPLICATIONS_FOREGROUND"]["SINGLE_CATEGORIES"], - multiple_categories = config["APPLICATIONS_FOREGROUND"]["MULTIPLE_CATEGORIES"], - single_apps = config["APPLICATIONS_FOREGROUND"]["SINGLE_APPS"], - excluded_categories = config["APPLICATIONS_FOREGROUND"]["EXCLUDED_CATEGORIES"], - excluded_apps = 
config["APPLICATIONS_FOREGROUND"]["EXCLUDED_APPS"], - features = config["APPLICATIONS_FOREGROUND"]["FEATURES"], + provider = lambda wildcards: config["APPLICATIONS_FOREGROUND"]["PROVIDERS"][wildcards.provider_key], + provider_key = "{provider_key}" output: - "data/processed/{pid}/applications_foreground_{day_segment}.csv" + "data/interim/{pid}/applications_foreground_features/applications_foreground_r_{provider_key}.csv" script: - "../src/features/applications_foreground_features.py" + "../src/features/applications_foreground/applications_foreground_entry.R" + +rule applications_foreground_python_features: + input: + sensor_data = expand("data/raw/{{pid}}/{sensor}_with_datetime_with_genre.csv", sensor=config["APPLICATIONS_FOREGROUND"]["DB_TABLE"]), + day_segments_labels = "data/interim/day_segments_labels.csv" + params: + provider = lambda wildcards: config["APPLICATIONS_FOREGROUND"]["PROVIDERS"][wildcards.provider_key], + provider_key = "{provider_key}" + output: + "data/interim/{pid}/applications_foreground_features/applications_foreground_python_{provider_key}.csv" + script: + "../src/features/applications_foreground/applications_foreground_entry.py" rule wifi_r_features: input: diff --git a/rules/preprocessing.smk b/rules/preprocessing.smk index e85cbdb4..9c27382c 100644 --- a/rules/preprocessing.smk +++ b/rules/preprocessing.smk @@ -134,7 +134,7 @@ rule application_genres: update_catalogue_file = config["APPLICATION_GENRES"]["UPDATE_CATALOGUE_FILE"], scrape_missing_genres = config["APPLICATION_GENRES"]["SCRAPE_MISSING_GENRES"] output: - "data/interim/{pid}/{sensor}_with_datetime_with_genre.csv" + "data/raw/{pid}/{sensor}_with_datetime_with_genre.csv" script: "../src/data/application_genres.R" diff --git a/src/features/applications_foreground/applications_foreground_base.py b/src/features/applications_foreground/applications_foreground_base.py deleted file mode 100644 index 5fb5e7f2..00000000 --- 
a/src/features/applications_foreground/applications_foreground_base.py +++ /dev/null @@ -1,74 +0,0 @@ -import pandas as pd -import itertools -from scipy.stats import entropy - - -def compute_features(filtered_data, apps_type, requested_features, apps_features, day_segment): - # There is the rare occasion that filtered_data is empty (found in testing) - if "timeoffirstuse" in requested_features: - time_first_event = filtered_data.sort_values(by="timestamp", ascending=True).drop_duplicates(subset="local_date", keep="first").set_index("local_date") - if time_first_event.empty: - apps_features["apps_" + day_segment + "_timeoffirstuse" + apps_type] = 'NA' - else: - apps_features["apps_" + day_segment + "_timeoffirstuse" + apps_type] = time_first_event["local_hour"] * 60 + time_first_event["local_minute"] - if "timeoflastuse" in requested_features: - time_last_event = filtered_data.sort_values(by="timestamp", ascending=False).drop_duplicates(subset="local_date", keep="first").set_index("local_date") - if time_last_event.empty: - apps_features["apps_" + day_segment + "_timeoflastuse" + apps_type] = 'NA' - else: - apps_features["apps_" + day_segment + "_timeoflastuse" + apps_type] = time_last_event["local_hour"] * 60 + time_last_event["local_minute"] - if "frequencyentropy" in requested_features: - apps_with_count = filtered_data.groupby(["local_date","application_name"]).count().sort_values(by="timestamp", ascending=False).reset_index() - if (len(apps_with_count.index) < 2 ): - apps_features["apps_" + day_segment + "_frequencyentropy" + apps_type] = 'NA' - else: - apps_features["apps_" + day_segment + "_frequencyentropy" + apps_type] = apps_with_count.groupby("local_date")["timestamp"].agg(entropy) - if "count" in requested_features: - apps_features["apps_" + day_segment + "_count" + apps_type] = filtered_data.groupby(["local_date"]).count()["timestamp"] - apps_features.fillna(value={"apps_" + day_segment + "_count" + apps_type: 0}, inplace=True) - return apps_features - 
- -def base_applications_foreground_features(apps_data, day_segment, requested_features, params): - multiple_categories_with_genres = params["multiple_categories_with_genres"] - single_categories = params["single_categories"] - multiple_categories = params["multiple_categories"] - apps = params["apps"] - - # deep copy the apps_data for the top1global computation - apps_data_global = apps_data.copy() - - apps_features = pd.DataFrame(columns=["local_date"] + ["apps_" + day_segment + "_" + x for x in ["".join(feature) for feature in itertools.product(requested_features, single_categories + multiple_categories + apps)]]) - if not apps_data.empty: - if day_segment != "daily": - apps_data =apps_data[apps_data["local_day_segment"] == day_segment] - - if not apps_data.empty: - apps_features = pd.DataFrame() - # single category - single_categories.sort() - for sc in single_categories: - if sc == "all": - apps_features = compute_features(apps_data, "all", requested_features, apps_features, day_segment) - else: - filtered_data = apps_data[apps_data["genre"].isin([sc])] - apps_features = compute_features(filtered_data, sc, requested_features, apps_features, day_segment) - # multiple category - for mc in multiple_categories: - filtered_data = apps_data[apps_data["genre"].isin(multiple_categories_with_genres[mc])] - apps_features = compute_features(filtered_data, mc, requested_features, apps_features, day_segment) - # single apps - for app in apps: - col_name = app - if app == "top1global": - # get the most used app - apps_with_count = apps_data_global.groupby(["local_date","package_name"]).count().sort_values(by="timestamp", ascending=False).reset_index() - app = apps_with_count.iloc[0]["package_name"] - col_name = "top1global" - - filtered_data = apps_data[apps_data["package_name"].isin([app])] - apps_features = compute_features(filtered_data, col_name, requested_features, apps_features, day_segment) - - apps_features = apps_features.reset_index() - - return apps_features diff 
--git a/src/features/applications_foreground/applications_foreground_entry.R b/src/features/applications_foreground/applications_foreground_entry.R new file mode 100644 index 00000000..277ab623 --- /dev/null +++ b/src/features/applications_foreground/applications_foreground_entry.R @@ -0,0 +1,13 @@ +source("renv/activate.R") +source("src/features/utils/utils.R") +library("dplyr") +library("tidyr") + +sensor_data_file <- snakemake@input[["sensor_data"]] +day_segments_file <- snakemake@input[["day_segments_labels"]] +provider <- snakemake@params["provider"][["provider"]] +provider_key <- snakemake@params["provider_key"] + +sensor_features <- fetch_provider_features(provider, provider_key, "applications_foreground", sensor_data_file, day_segments_file) + +write.csv(sensor_features, snakemake@output[[1]], row.names = FALSE) diff --git a/src/features/applications_foreground/applications_foreground_entry.py b/src/features/applications_foreground/applications_foreground_entry.py new file mode 100644 index 00000000..49b9b141 --- /dev/null +++ b/src/features/applications_foreground/applications_foreground_entry.py @@ -0,0 +1,18 @@ +import pandas as pd +from importlib import import_module, util +from pathlib import Path + +# import fetch_provider_features from src/features/utils/utils.py +spec = util.spec_from_file_location("util", str(Path(snakemake.scriptdir).parent / "utils" / "utils.py")) +mod = util.module_from_spec(spec) +spec.loader.exec_module(mod) +fetch_provider_features = getattr(mod, "fetch_provider_features") + +sensor_data_file = snakemake.input["sensor_data"][0] +day_segments_file = snakemake.input["day_segments_labels"] +provider = snakemake.params["provider"] +provider_key = snakemake.params["provider_key"] + +sensor_features = fetch_provider_features(provider, provider_key, "applications_foreground", sensor_data_file, day_segments_file) + +sensor_features.to_csv(snakemake.output[0], index=False) \ No newline at end of file diff --git 
a/src/features/applications_foreground/rapids/main.py b/src/features/applications_foreground/rapids/main.py new file mode 100644 index 00000000..ab322139 --- /dev/null +++ b/src/features/applications_foreground/rapids/main.py @@ -0,0 +1,88 @@ +import pandas as pd +import numpy as np +import itertools +from scipy.stats import entropy + + +def compute_features(filtered_data, apps_type, requested_features, apps_features, day_segment): + # There is the rare occasion that filtered_data is empty (found in testing) + if "timeoffirstuse" in requested_features: + time_first_event = filtered_data.sort_values(by="timestamp", ascending=True).drop_duplicates(subset="local_segment", keep="first").set_index("local_segment") + if time_first_event.empty: + apps_features["apps_rapids" + "_timeoffirstuse" + apps_type] = np.nan + else: + apps_features["apps_rapids" + "_timeoffirstuse" + apps_type] = time_first_event["local_hour"] * 60 + time_first_event["local_minute"] + if "timeoflastuse" in requested_features: + time_last_event = filtered_data.sort_values(by="timestamp", ascending=False).drop_duplicates(subset="local_segment", keep="first").set_index("local_segment") + if time_last_event.empty: + apps_features["apps_rapids" + "_timeoflastuse" + apps_type] = np.nan + else: + apps_features["apps_rapids" + "_timeoflastuse" + apps_type] = time_last_event["local_hour"] * 60 + time_last_event["local_minute"] + if "frequencyentropy" in requested_features: + apps_with_count = filtered_data.groupby(["local_segment","application_name"]).count().sort_values(by="timestamp", ascending=False).reset_index() + if (len(apps_with_count.index) < 2 ): + apps_features["apps_rapids" + "_frequencyentropy" + apps_type] = np.nan + else: + apps_features["apps_rapids" + "_frequencyentropy" + apps_type] = apps_with_count.groupby("local_segment")["timestamp"].agg(entropy) + if "count" in requested_features: + apps_features["apps_rapids" + "_count" + apps_type] = 
filtered_data.groupby(["local_segment"]).count()["timestamp"] + apps_features.fillna(value={"apps_rapids" + "_count" + apps_type: 0}, inplace=True) + return apps_features + + +def rapids_features(apps_data, day_segment, provider, filter_data_by_segment, *args, **kwargs): + requested_features = provider["FEATURES"] + excluded_categories = provider["EXCLUDED_CATEGORIES"] + excluded_apps = provider["EXCLUDED_APPS"] + multiple_categories_with_genres = provider["MULTIPLE_CATEGORIES"] + single_categories = provider["SINGLE_CATEGORIES"] + multiple_categories = provider["MULTIPLE_CATEGORIES"] + single_apps = provider["SINGLE_APPS"] + + single_categories = list(set(single_categories) - set(excluded_categories)) + multiple_categories = list(multiple_categories_with_genres.keys() - set(excluded_categories)) + single_apps = list(set(single_apps) - set(excluded_apps)) + + # exclude categories in the excluded_categories list + if "system_apps" in excluded_categories: + apps_data = apps_data[apps_data["is_system_app"] == 0] + apps_data = apps_data[~apps_data["genre"].isin(excluded_categories)] + # exclude apps in the excluded_apps list + apps_data = apps_data[~apps_data["package_name"].isin(excluded_apps)] + + + + apps_features = pd.DataFrame(columns=["local_segment"] + ["apps_rapids" + "_" + x for x in ["".join(feature) for feature in itertools.product(requested_features, single_categories + multiple_categories + single_apps)]]) + if not apps_data.empty: + apps_data = filter_data_by_segment(apps_data, day_segment) + # deep copy the apps_data for the top1global computation + apps_data_global = apps_data.copy() + + if not apps_data.empty: + apps_features = pd.DataFrame() + # single category + single_categories.sort() + for sc in single_categories: + if sc == "all": + apps_features = compute_features(apps_data, "all", requested_features, apps_features, day_segment) + else: + filtered_data = apps_data[apps_data["genre"].isin([sc])] + apps_features = compute_features(filtered_data,
sc, requested_features, apps_features, day_segment) + # multiple category + for mc in multiple_categories: + filtered_data = apps_data[apps_data["genre"].isin(multiple_categories_with_genres[mc])] + apps_features = compute_features(filtered_data, mc, requested_features, apps_features, day_segment) + # single apps + for app in single_apps: + col_name = app + if app == "top1global": + # get the most used app + apps_with_count = apps_data_global.groupby(["local_segment","package_name"]).count().sort_values(by="timestamp", ascending=False).reset_index() + app = apps_with_count.iloc[0]["package_name"] + col_name = "top1global" + filtered_data = apps_data[apps_data["package_name"].isin([app])] + apps_features = compute_features(filtered_data, col_name, requested_features, apps_features, day_segment) + + apps_features = apps_features.reset_index() + + return apps_features