From 8d87f6e4970fa4ceed7736fd66616831317ad454 Mon Sep 17 00:00:00 2001 From: JulioV Date: Tue, 1 Sep 2020 15:25:35 -0400 Subject: [PATCH] Migrate app foreground to new file structure --- Snakefile | 12 +-- config.yaml | 23 ++--- rules/features.smk | 30 ++++--- rules/preprocessing.smk | 2 +- .../applications_foreground_base.py | 74 ---------------- .../applications_foreground_entry.R | 13 +++ .../applications_foreground_entry.py | 18 ++++ .../applications_foreground/rapids/main.py | 88 +++++++++++++++++++ 8 files changed, 159 insertions(+), 101 deletions(-) delete mode 100644 src/features/applications_foreground/applications_foreground_base.py create mode 100644 src/features/applications_foreground/applications_foreground_entry.R create mode 100644 src/features/applications_foreground/applications_foreground_entry.py create mode 100644 src/features/applications_foreground/rapids/main.py diff --git a/Snakefile b/Snakefile index 38133b8f..4719df9c 100644 --- a/Snakefile +++ b/Snakefile @@ -95,11 +95,13 @@ if config["ACCELEROMETER"]["COMPUTE"]: files_to_compute.extend(expand("data/raw/{pid}/{sensor}_with_datetime.csv", pid=config["PIDS"], sensor=config["ACCELEROMETER"]["DB_TABLE"])) files_to_compute.extend(expand("data/processed/{pid}/accelerometer_{day_segment}.csv", pid = config["PIDS"], day_segment = config["ACCELEROMETER"]["DAY_SEGMENTS"])) -if config["APPLICATIONS_FOREGROUND"]["COMPUTE"]: - files_to_compute.extend(expand("data/raw/{pid}/{sensor}_raw.csv", pid=config["PIDS"], sensor=config["APPLICATIONS_FOREGROUND"]["DB_TABLE"])) - files_to_compute.extend(expand("data/raw/{pid}/{sensor}_with_datetime.csv", pid=config["PIDS"], sensor=config["APPLICATIONS_FOREGROUND"]["DB_TABLE"])) - files_to_compute.extend(expand("data/interim/{pid}/{sensor}_with_datetime_with_genre.csv", pid=config["PIDS"], sensor=config["APPLICATIONS_FOREGROUND"]["DB_TABLE"])) - files_to_compute.extend(expand("data/processed/{pid}/applications_foreground_{day_segment}.csv", pid = config["PIDS"], 
day_segment = config["APPLICATIONS_FOREGROUND"]["DAY_SEGMENTS"])) +for provider in config["APPLICATIONS_FOREGROUND"]["PROVIDERS"].keys(): + if config["APPLICATIONS_FOREGROUND"]["PROVIDERS"][provider]["COMPUTE"]: + files_to_compute.extend(expand("data/raw/{pid}/{sensor}_raw.csv", pid=config["PIDS"], sensor=config["APPLICATIONS_FOREGROUND"]["DB_TABLE"])) + files_to_compute.extend(expand("data/raw/{pid}/{sensor}_with_datetime.csv", pid=config["PIDS"], sensor=config["APPLICATIONS_FOREGROUND"]["DB_TABLE"])) + files_to_compute.extend(expand("data/raw/{pid}/{sensor}_with_datetime_with_genre.csv", pid=config["PIDS"], sensor=config["APPLICATIONS_FOREGROUND"]["DB_TABLE"])) + files_to_compute.extend(expand("data/interim/{pid}/{sensor_key}_features/{sensor_key}_{language}_{provider_key}.csv", pid=config["PIDS"], language=config["APPLICATIONS_FOREGROUND"]["PROVIDERS"][provider]["SRC_LANGUAGE"], provider_key=provider, sensor_key="APPLICATIONS_FOREGROUND".lower())) + files_to_compute.extend(expand("data/processed/features/{pid}/{sensor_key}.csv", pid=config["PIDS"], sensor_key="APPLICATIONS_FOREGROUND".lower())) for provider in config["WIFI"]["PROVIDERS"].keys(): if config["WIFI"]["PROVIDERS"][provider]["COMPUTE"]: diff --git a/config.yaml b/config.yaml index 603e7a36..0ef95d12 100644 --- a/config.yaml +++ b/config.yaml @@ -156,17 +156,20 @@ ACCELEROMETER: VALID_SENSED_MINUTES: False APPLICATIONS_FOREGROUND: - COMPUTE: False DB_TABLE: applications_foreground - DAY_SEGMENTS: *day_segments - SINGLE_CATEGORIES: ["all", "email"] - MULTIPLE_CATEGORIES: - social: ["socialnetworks", "socialmediatools"] - entertainment: ["entertainment", "gamingknowledge", "gamingcasual", "gamingadventure", "gamingstrategy", "gamingtoolscommunity", "gamingroleplaying", "gamingaction", "gaminglogic", "gamingsports", "gamingsimulation"] - SINGLE_APPS: ["top1global", "com.facebook.moments", "com.google.android.youtube", "com.twitter.android"] # There's no entropy for single apps - EXCLUDED_CATEGORIES: 
["system_apps"] - EXCLUDED_APPS: ["com.fitbit.FitbitMobile", "com.aware.plugin.upmc.cancer"] - FEATURES: ["count", "timeoffirstuse", "timeoflastuse", "frequencyentropy"] + PROVIDERS: + RAPIDS: + COMPUTE: TRUE + SINGLE_CATEGORIES: ["all", "email"] + MULTIPLE_CATEGORIES: + social: ["socialnetworks", "socialmediatools"] + entertainment: ["entertainment", "gamingknowledge", "gamingcasual", "gamingadventure", "gamingstrategy", "gamingtoolscommunity", "gamingroleplaying", "gamingaction", "gaminglogic", "gamingsports", "gamingsimulation"] + SINGLE_APPS: ["top1global", "com.facebook.moments", "com.google.android.youtube", "com.twitter.android"] # There's no entropy for single apps + EXCLUDED_CATEGORIES: [] + EXCLUDED_APPS: ["com.fitbit.FitbitMobile", "com.aware.plugin.upmc.cancer"] + FEATURES: ["count", "timeoffirstuse", "timeoflastuse", "frequencyentropy"] + SRC_FOLDER: "rapids" # inside src/features/applications_foreground + SRC_LANGUAGE: "python" HEARTRATE: COMPUTE: False diff --git a/rules/features.smk b/rules/features.smk index d01b3bf7..56c96972 100644 --- a/rules/features.smk +++ b/rules/features.smk @@ -224,21 +224,29 @@ rule accelerometer_features: script: "../src/features/accelerometer_features.py" -rule applications_foreground_features: +rule applications_foreground_r_features: input: - expand("data/interim/{{pid}}/{sensor}_with_datetime_with_genre.csv", sensor=config["APPLICATIONS_FOREGROUND"]["DB_TABLE"]) + sensor_data = expand("data/raw/{{pid}}/{sensor}_with_datetime_with_genre.csv", sensor=config["APPLICATIONS_FOREGROUND"]["DB_TABLE"]), + day_segments_labels = "data/interim/day_segments_labels.csv" params: - day_segment = "{day_segment}", - single_categories = config["APPLICATIONS_FOREGROUND"]["SINGLE_CATEGORIES"], - multiple_categories = config["APPLICATIONS_FOREGROUND"]["MULTIPLE_CATEGORIES"], - single_apps = config["APPLICATIONS_FOREGROUND"]["SINGLE_APPS"], - excluded_categories = config["APPLICATIONS_FOREGROUND"]["EXCLUDED_CATEGORIES"], - excluded_apps = 
config["APPLICATIONS_FOREGROUND"]["EXCLUDED_APPS"], - features = config["APPLICATIONS_FOREGROUND"]["FEATURES"], + provider = lambda wildcards: config["APPLICATIONS_FOREGROUND"]["PROVIDERS"][wildcards.provider_key], + provider_key = "{provider_key}" output: - "data/processed/{pid}/applications_foreground_{day_segment}.csv" + "data/interim/{pid}/applications_foreground_features/applications_foreground_r_{provider_key}.csv" script: - "../src/features/applications_foreground_features.py" + "../src/features/applications_foreground/applications_foreground_entry.R" + +rule applications_foreground_python_features: + input: + sensor_data = expand("data/raw/{{pid}}/{sensor}_with_datetime_with_genre.csv", sensor=config["APPLICATIONS_FOREGROUND"]["DB_TABLE"]), + day_segments_labels = "data/interim/day_segments_labels.csv" + params: + provider = lambda wildcards: config["APPLICATIONS_FOREGROUND"]["PROVIDERS"][wildcards.provider_key], + provider_key = "{provider_key}" + output: + "data/interim/{pid}/applications_foreground_features/applications_foreground_python_{provider_key}.csv" + script: + "../src/features/applications_foreground/applications_foreground_entry.py" rule wifi_r_features: input: diff --git a/rules/preprocessing.smk b/rules/preprocessing.smk index e85cbdb4..9c27382c 100644 --- a/rules/preprocessing.smk +++ b/rules/preprocessing.smk @@ -134,7 +134,7 @@ rule application_genres: update_catalogue_file = config["APPLICATION_GENRES"]["UPDATE_CATALOGUE_FILE"], scrape_missing_genres = config["APPLICATION_GENRES"]["SCRAPE_MISSING_GENRES"] output: - "data/interim/{pid}/{sensor}_with_datetime_with_genre.csv" + "data/raw/{pid}/{sensor}_with_datetime_with_genre.csv" script: "../src/data/application_genres.R" diff --git a/src/features/applications_foreground/applications_foreground_base.py b/src/features/applications_foreground/applications_foreground_base.py deleted file mode 100644 index 5fb5e7f2..00000000 --- 
a/src/features/applications_foreground/applications_foreground_base.py +++ /dev/null @@ -1,74 +0,0 @@ -import pandas as pd -import itertools -from scipy.stats import entropy - - -def compute_features(filtered_data, apps_type, requested_features, apps_features, day_segment): - # There is the rare occasion that filtered_data is empty (found in testing) - if "timeoffirstuse" in requested_features: - time_first_event = filtered_data.sort_values(by="timestamp", ascending=True).drop_duplicates(subset="local_date", keep="first").set_index("local_date") - if time_first_event.empty: - apps_features["apps_" + day_segment + "_timeoffirstuse" + apps_type] = 'NA' - else: - apps_features["apps_" + day_segment + "_timeoffirstuse" + apps_type] = time_first_event["local_hour"] * 60 + time_first_event["local_minute"] - if "timeoflastuse" in requested_features: - time_last_event = filtered_data.sort_values(by="timestamp", ascending=False).drop_duplicates(subset="local_date", keep="first").set_index("local_date") - if time_last_event.empty: - apps_features["apps_" + day_segment + "_timeoflastuse" + apps_type] = 'NA' - else: - apps_features["apps_" + day_segment + "_timeoflastuse" + apps_type] = time_last_event["local_hour"] * 60 + time_last_event["local_minute"] - if "frequencyentropy" in requested_features: - apps_with_count = filtered_data.groupby(["local_date","application_name"]).count().sort_values(by="timestamp", ascending=False).reset_index() - if (len(apps_with_count.index) < 2 ): - apps_features["apps_" + day_segment + "_frequencyentropy" + apps_type] = 'NA' - else: - apps_features["apps_" + day_segment + "_frequencyentropy" + apps_type] = apps_with_count.groupby("local_date")["timestamp"].agg(entropy) - if "count" in requested_features: - apps_features["apps_" + day_segment + "_count" + apps_type] = filtered_data.groupby(["local_date"]).count()["timestamp"] - apps_features.fillna(value={"apps_" + day_segment + "_count" + apps_type: 0}, inplace=True) - return apps_features - 
- -def base_applications_foreground_features(apps_data, day_segment, requested_features, params): - multiple_categories_with_genres = params["multiple_categories_with_genres"] - single_categories = params["single_categories"] - multiple_categories = params["multiple_categories"] - apps = params["apps"] - - # deep copy the apps_data for the top1global computation - apps_data_global = apps_data.copy() - - apps_features = pd.DataFrame(columns=["local_date"] + ["apps_" + day_segment + "_" + x for x in ["".join(feature) for feature in itertools.product(requested_features, single_categories + multiple_categories + apps)]]) - if not apps_data.empty: - if day_segment != "daily": - apps_data =apps_data[apps_data["local_day_segment"] == day_segment] - - if not apps_data.empty: - apps_features = pd.DataFrame() - # single category - single_categories.sort() - for sc in single_categories: - if sc == "all": - apps_features = compute_features(apps_data, "all", requested_features, apps_features, day_segment) - else: - filtered_data = apps_data[apps_data["genre"].isin([sc])] - apps_features = compute_features(filtered_data, sc, requested_features, apps_features, day_segment) - # multiple category - for mc in multiple_categories: - filtered_data = apps_data[apps_data["genre"].isin(multiple_categories_with_genres[mc])] - apps_features = compute_features(filtered_data, mc, requested_features, apps_features, day_segment) - # single apps - for app in apps: - col_name = app - if app == "top1global": - # get the most used app - apps_with_count = apps_data_global.groupby(["local_date","package_name"]).count().sort_values(by="timestamp", ascending=False).reset_index() - app = apps_with_count.iloc[0]["package_name"] - col_name = "top1global" - - filtered_data = apps_data[apps_data["package_name"].isin([app])] - apps_features = compute_features(filtered_data, col_name, requested_features, apps_features, day_segment) - - apps_features = apps_features.reset_index() - - return apps_features diff 
--git a/src/features/applications_foreground/applications_foreground_entry.R b/src/features/applications_foreground/applications_foreground_entry.R new file mode 100644 index 00000000..277ab623 --- /dev/null +++ b/src/features/applications_foreground/applications_foreground_entry.R @@ -0,0 +1,13 @@ +source("renv/activate.R") +source("src/features/utils/utils.R") +library("dplyr") +library("tidyr") + +sensor_data_file <- snakemake@input[["sensor_data"]] +day_segments_file <- snakemake@input[["day_segments_labels"]] +provider <- snakemake@params["provider"][["provider"]] +provider_key <- snakemake@params["provider_key"] + +sensor_features <- fetch_provider_features(provider, provider_key, "applications_foreground", sensor_data_file, day_segments_file) + +write.csv(sensor_features, snakemake@output[[1]], row.names = FALSE) diff --git a/src/features/applications_foreground/applications_foreground_entry.py b/src/features/applications_foreground/applications_foreground_entry.py new file mode 100644 index 00000000..49b9b141 --- /dev/null +++ b/src/features/applications_foreground/applications_foreground_entry.py @@ -0,0 +1,18 @@ +import pandas as pd +from importlib import import_module, util +from pathlib import Path + +# import fetch_provider_features from src/features/utils/utils.py +spec = util.spec_from_file_location("util", str(Path(snakemake.scriptdir).parent / "utils" / "utils.py")) +mod = util.module_from_spec(spec) +spec.loader.exec_module(mod) +fetch_provider_features = getattr(mod, "fetch_provider_features") + +sensor_data_file = snakemake.input["sensor_data"][0] +day_segments_file = snakemake.input["day_segments_labels"] +provider = snakemake.params["provider"] +provider_key = snakemake.params["provider_key"] + +sensor_features = fetch_provider_features(provider, provider_key, "applications_foreground", sensor_data_file, day_segments_file) + +sensor_features.to_csv(snakemake.output[0], index=False) \ No newline at end of file diff --git 
a/src/features/applications_foreground/rapids/main.py b/src/features/applications_foreground/rapids/main.py new file mode 100644 index 00000000..ab322139 --- /dev/null +++ b/src/features/applications_foreground/rapids/main.py @@ -0,0 +1,88 @@ +import pandas as pd +import numpy as np +import itertools +from scipy.stats import entropy + + +def compute_features(filtered_data, apps_type, requested_features, apps_features, day_segment): + # There is the rare occasion that filtered_data is empty (found in testing) + if "timeoffirstuse" in requested_features: + time_first_event = filtered_data.sort_values(by="timestamp", ascending=True).drop_duplicates(subset="local_segment", keep="first").set_index("local_segment") + if time_first_event.empty: + apps_features["apps_rapids" + "_timeoffirstuse" + apps_type] = np.nan + else: + apps_features["apps_rapids" + "_timeoffirstuse" + apps_type] = time_first_event["local_hour"] * 60 + time_first_event["local_minute"] + if "timeoflastuse" in requested_features: + time_last_event = filtered_data.sort_values(by="timestamp", ascending=False).drop_duplicates(subset="local_segment", keep="first").set_index("local_segment") + if time_last_event.empty: + apps_features["apps_rapids" + "_timeoflastuse" + apps_type] = np.nan + else: + apps_features["apps_rapids" + "_timeoflastuse" + apps_type] = time_last_event["local_hour"] * 60 + time_last_event["local_minute"] + if "frequencyentropy" in requested_features: + apps_with_count = filtered_data.groupby(["local_segment","application_name"]).count().sort_values(by="timestamp", ascending=False).reset_index() + if (len(apps_with_count.index) < 2 ): + apps_features["apps_rapids" + "_frequencyentropy" + apps_type] = np.nan + else: + apps_features["apps_rapids" + "_frequencyentropy" + apps_type] = apps_with_count.groupby("local_segment")["timestamp"].agg(entropy) + if "count" in requested_features: + apps_features["apps_rapids" + "_count" + apps_type] = 
filtered_data.groupby(["local_segment"]).count()["timestamp"] + apps_features.fillna(value={"apps_rapids" + "_count" + apps_type: 0}, inplace=True) + return apps_features + + +def rapids_features(apps_data, day_segment, provider, filter_data_by_segment, *args, **kwargs): + requested_features = provider["FEATURES"] + excluded_categories = provider["EXCLUDED_CATEGORIES"] + excluded_apps = provider["EXCLUDED_APPS"] + multiple_categories_with_genres = provider["MULTIPLE_CATEGORIES"] + single_categories = provider["SINGLE_CATEGORIES"] + multiple_categories = provider["MULTIPLE_CATEGORIES"] + single_apps = provider["SINGLE_APPS"] + + single_categories = list(set(single_categories) - set(excluded_categories)) + multiple_categories = list(multiple_categories_with_genres.keys() - set(excluded_categories)) + single_apps = list(set(single_apps) - set(excluded_apps)) + + # exclude categories in the excluded_categories list + if "system_apps" in excluded_categories: + apps_data = apps_data[apps_data["is_system_app"] == 0] + apps_data = apps_data[~apps_data["genre"].isin(excluded_categories)] + # exclude apps in the excluded_apps list + apps_data = apps_data[~apps_data["package_name"].isin(excluded_apps)] + + + + apps_features = pd.DataFrame(columns=["local_segment"] + ["apps_rapids" + "_" + x for x in ["".join(feature) for feature in itertools.product(requested_features, single_categories + multiple_categories + single_apps)]]) + if not apps_data.empty: + apps_data = filter_data_by_segment(apps_data, day_segment) + # deep copy the apps_data for the top1global computation + apps_data_global = apps_data.copy() + + if not apps_data.empty: + apps_features = pd.DataFrame() + # single category + single_categories.sort() + for sc in single_categories: + if sc == "all": + apps_features = compute_features(apps_data, "all", requested_features, apps_features, day_segment) + else: + filtered_data = apps_data[apps_data["genre"].isin([sc])] + apps_features = compute_features(filtered_data,
sc, requested_features, apps_features, day_segment) + # multiple category + for mc in multiple_categories: + filtered_data = apps_data[apps_data["genre"].isin(multiple_categories_with_genres[mc])] + apps_features = compute_features(filtered_data, mc, requested_features, apps_features, day_segment) + # single apps + for app in single_apps: + col_name = app + if app == "top1global": + # get the most used app + apps_with_count = apps_data_global.groupby(["local_segment","package_name"]).count().sort_values(by="timestamp", ascending=False).reset_index() + app = apps_with_count.iloc[0]["package_name"] + col_name = "top1global" + filtered_data = apps_data[apps_data["package_name"].isin([app])] + apps_features = compute_features(filtered_data, col_name, requested_features, apps_features, day_segment) + + apps_features = apps_features.reset_index() + + return apps_features