Migrate app foreground to new file structure
parent 681a77f23c
commit 8d87f6e497
Snakefile (12 changed lines)
@@ -95,11 +95,13 @@ if config["ACCELEROMETER"]["COMPUTE"]:
     files_to_compute.extend(expand("data/raw/{pid}/{sensor}_with_datetime.csv", pid=config["PIDS"], sensor=config["ACCELEROMETER"]["DB_TABLE"]))
     files_to_compute.extend(expand("data/processed/{pid}/accelerometer_{day_segment}.csv", pid = config["PIDS"], day_segment = config["ACCELEROMETER"]["DAY_SEGMENTS"]))

-if config["APPLICATIONS_FOREGROUND"]["COMPUTE"]:
-    files_to_compute.extend(expand("data/raw/{pid}/{sensor}_raw.csv", pid=config["PIDS"], sensor=config["APPLICATIONS_FOREGROUND"]["DB_TABLE"]))
-    files_to_compute.extend(expand("data/raw/{pid}/{sensor}_with_datetime.csv", pid=config["PIDS"], sensor=config["APPLICATIONS_FOREGROUND"]["DB_TABLE"]))
-    files_to_compute.extend(expand("data/interim/{pid}/{sensor}_with_datetime_with_genre.csv", pid=config["PIDS"], sensor=config["APPLICATIONS_FOREGROUND"]["DB_TABLE"]))
-    files_to_compute.extend(expand("data/processed/{pid}/applications_foreground_{day_segment}.csv", pid = config["PIDS"], day_segment = config["APPLICATIONS_FOREGROUND"]["DAY_SEGMENTS"]))
+for provider in config["APPLICATIONS_FOREGROUND"]["PROVIDERS"].keys():
+    if config["APPLICATIONS_FOREGROUND"]["PROVIDERS"][provider]["COMPUTE"]:
+        files_to_compute.extend(expand("data/raw/{pid}/{sensor}_raw.csv", pid=config["PIDS"], sensor=config["APPLICATIONS_FOREGROUND"]["DB_TABLE"]))
+        files_to_compute.extend(expand("data/raw/{pid}/{sensor}_with_datetime.csv", pid=config["PIDS"], sensor=config["APPLICATIONS_FOREGROUND"]["DB_TABLE"]))
+        files_to_compute.extend(expand("data/raw/{pid}/{sensor}_with_datetime_with_genre.csv", pid=config["PIDS"], sensor=config["APPLICATIONS_FOREGROUND"]["DB_TABLE"]))
+        files_to_compute.extend(expand("data/interim/{pid}/{sensor_key}_features/{sensor_key}_{language}_{provider_key}.csv", pid=config["PIDS"], language=config["APPLICATIONS_FOREGROUND"]["PROVIDERS"][provider]["SRC_LANGUAGE"], provider_key=provider, sensor_key="APPLICATIONS_FOREGROUND".lower()))
+        files_to_compute.extend(expand("data/processed/features/{pid}/{sensor_key}.csv", pid=config["PIDS"], sensor_key="APPLICATIONS_FOREGROUND".lower()))

 for provider in config["WIFI"]["PROVIDERS"].keys():
     if config["WIFI"]["PROVIDERS"][provider]["COMPUTE"]:
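Note: the Snakefile change above replaces the single APPLICATIONS_FOREGROUND COMPUTE switch with a loop over provider entries, so targets are built per provider and per participant. A minimal sketch of what that expansion produces, with Snakemake's expand unrolled into plain Python; the config dict and the two participant IDs are made-up illustration, only the key names mirror the Snakefile:

    # Hypothetical config excerpt; keys mirror those used in the Snakefile above.
    config = {
        "PIDS": ["p01", "p02"],
        "APPLICATIONS_FOREGROUND": {
            "PROVIDERS": {"RAPIDS": {"COMPUTE": True, "SRC_LANGUAGE": "python"}},
        },
    }

    files_to_compute = []
    sensor_key = "APPLICATIONS_FOREGROUND".lower()  # -> "applications_foreground"
    for provider in config["APPLICATIONS_FOREGROUND"]["PROVIDERS"].keys():
        if config["APPLICATIONS_FOREGROUND"]["PROVIDERS"][provider]["COMPUTE"]:
            language = config["APPLICATIONS_FOREGROUND"]["PROVIDERS"][provider]["SRC_LANGUAGE"]
            for pid in config["PIDS"]:
                files_to_compute.append(
                    f"data/interim/{pid}/{sensor_key}_features/{sensor_key}_{language}_{provider}.csv")
    # files_to_compute now holds one per-provider feature file per participant, e.g.
    # data/interim/p01/applications_foreground_features/applications_foreground_python_RAPIDS.csv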
config.yaml (23 changed lines)
@@ -156,17 +156,20 @@ ACCELEROMETER:
     VALID_SENSED_MINUTES: False

 APPLICATIONS_FOREGROUND:
-    COMPUTE: False
     DB_TABLE: applications_foreground
-    DAY_SEGMENTS: *day_segments
-    SINGLE_CATEGORIES: ["all", "email"]
-    MULTIPLE_CATEGORIES:
-        social: ["socialnetworks", "socialmediatools"]
-        entertainment: ["entertainment", "gamingknowledge", "gamingcasual", "gamingadventure", "gamingstrategy", "gamingtoolscommunity", "gamingroleplaying", "gamingaction", "gaminglogic", "gamingsports", "gamingsimulation"]
-    SINGLE_APPS: ["top1global", "com.facebook.moments", "com.google.android.youtube", "com.twitter.android"] # There's no entropy for single apps
-    EXCLUDED_CATEGORIES: ["system_apps"]
-    EXCLUDED_APPS: ["com.fitbit.FitbitMobile", "com.aware.plugin.upmc.cancer"]
-    FEATURES: ["count", "timeoffirstuse", "timeoflastuse", "frequencyentropy"]
+    PROVIDERS:
+        RAPIDS:
+            COMPUTE: TRUE
+            SINGLE_CATEGORIES: ["all", "email"]
+            MULTIPLE_CATEGORIES:
+                social: ["socialnetworks", "socialmediatools"]
+                entertainment: ["entertainment", "gamingknowledge", "gamingcasual", "gamingadventure", "gamingstrategy", "gamingtoolscommunity", "gamingroleplaying", "gamingaction", "gaminglogic", "gamingsports", "gamingsimulation"]
+            SINGLE_APPS: ["top1global", "com.facebook.moments", "com.google.android.youtube", "com.twitter.android"] # There's no entropy for single apps
+            EXCLUDED_CATEGORIES: []
+            EXCLUDED_APPS: ["com.fitbit.FitbitMobile", "com.aware.plugin.upmc.cancer"]
+            FEATURES: ["count", "timeoffirstuse", "timeoflastuse", "frequencyentropy"]
+            SRC_FOLDER: "rapids" # inside src/features/applications_foreground
+            SRC_LANGUAGE: "python"

 HEARTRATE:
     COMPUTE: False
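Note: the config migration nests every feature-extraction setting under PROVIDERS, one block per provider, and the per-provider default for EXCLUDED_CATEGORIES changes from ["system_apps"] to []. Lookups change accordingly; a sketch of the before/after access paths, assuming the YAML above is loaded into a config dict as Snakemake does:

    # Old flat layout: settings hang directly off the sensor section.
    features = config["APPLICATIONS_FOREGROUND"]["FEATURES"]
    # New nested layout: the same settings live under a named provider block.
    features = config["APPLICATIONS_FOREGROUND"]["PROVIDERS"]["RAPIDS"]["FEATURES"]
    compute  = config["APPLICATIONS_FOREGROUND"]["PROVIDERS"]["RAPIDS"]["COMPUTE"]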
@@ -224,21 +224,29 @@ rule accelerometer_features:
     script:
         "../src/features/accelerometer_features.py"

-rule applications_foreground_features:
+rule applications_foreground_r_features:
     input:
-        expand("data/interim/{{pid}}/{sensor}_with_datetime_with_genre.csv", sensor=config["APPLICATIONS_FOREGROUND"]["DB_TABLE"])
+        sensor_data = expand("data/raw/{{pid}}/{sensor}_with_datetime_with_genre.csv", sensor=config["APPLICATIONS_FOREGROUND"]["DB_TABLE"]),
+        day_segments_labels = "data/interim/day_segments_labels.csv"
     params:
-        day_segment = "{day_segment}",
-        single_categories = config["APPLICATIONS_FOREGROUND"]["SINGLE_CATEGORIES"],
-        multiple_categories = config["APPLICATIONS_FOREGROUND"]["MULTIPLE_CATEGORIES"],
-        single_apps = config["APPLICATIONS_FOREGROUND"]["SINGLE_APPS"],
-        excluded_categories = config["APPLICATIONS_FOREGROUND"]["EXCLUDED_CATEGORIES"],
-        excluded_apps = config["APPLICATIONS_FOREGROUND"]["EXCLUDED_APPS"],
-        features = config["APPLICATIONS_FOREGROUND"]["FEATURES"],
+        provider = lambda wildcards: config["APPLICATIONS_FOREGROUND"]["PROVIDERS"][wildcards.provider_key],
+        provider_key = "{provider_key}"
     output:
-        "data/processed/{pid}/applications_foreground_{day_segment}.csv"
+        "data/interim/{pid}/applications_foreground_features/applications_foreground_r_{provider_key}.csv"
     script:
-        "../src/features/applications_foreground_features.py"
+        "../src/features/applications_foreground/applications_foreground_entry.R"

+rule applications_foreground_python_features:
+    input:
+        sensor_data = expand("data/raw/{{pid}}/{sensor}_with_datetime_with_genre.csv", sensor=config["APPLICATIONS_FOREGROUND"]["DB_TABLE"]),
+        day_segments_labels = "data/interim/day_segments_labels.csv"
+    params:
+        provider = lambda wildcards: config["APPLICATIONS_FOREGROUND"]["PROVIDERS"][wildcards.provider_key],
+        provider_key = "{provider_key}"
+    output:
+        "data/interim/{pid}/applications_foreground_features/applications_foreground_python_{provider_key}.csv"
+    script:
+        "../src/features/applications_foreground/applications_foreground_entry.py"
+
 rule wifi_r_features:
     input:
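Note: both new rules resolve their provider block lazily. Snakemake calls a params lambda with the rule's matched wildcards object, so the {provider_key} captured from the output path selects the matching PROVIDERS entry. A standalone sketch of that mechanism; SimpleNamespace stands in for Snakemake's Wildcards object, and the config dict is a made-up excerpt:

    from types import SimpleNamespace

    config = {"APPLICATIONS_FOREGROUND": {"PROVIDERS": {"RAPIDS": {"FEATURES": ["count"]}}}}

    # The lambda from the rule, written as a function for clarity.
    def resolve(wildcards):
        return config["APPLICATIONS_FOREGROUND"]["PROVIDERS"][wildcards.provider_key]

    wildcards = SimpleNamespace(pid="p01", provider_key="RAPIDS")
    provider = resolve(wildcards)  # -> {'FEATURES': ['count']}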
@@ -134,7 +134,7 @@ rule application_genres:
         update_catalogue_file = config["APPLICATION_GENRES"]["UPDATE_CATALOGUE_FILE"],
         scrape_missing_genres = config["APPLICATION_GENRES"]["SCRAPE_MISSING_GENRES"]
     output:
-        "data/interim/{pid}/{sensor}_with_datetime_with_genre.csv"
+        "data/raw/{pid}/{sensor}_with_datetime_with_genre.csv"
     script:
         "../src/data/application_genres.R"
src/features/applications_foreground_features.py (deleted)
@@ -1,74 +0,0 @@
-import pandas as pd
-import itertools
-from scipy.stats import entropy
-
-
-def compute_features(filtered_data, apps_type, requested_features, apps_features, day_segment):
-    # There is the rare occasion that filtered_data is empty (found in testing)
-    if "timeoffirstuse" in requested_features:
-        time_first_event = filtered_data.sort_values(by="timestamp", ascending=True).drop_duplicates(subset="local_date", keep="first").set_index("local_date")
-        if time_first_event.empty:
-            apps_features["apps_" + day_segment + "_timeoffirstuse" + apps_type] = 'NA'
-        else:
-            apps_features["apps_" + day_segment + "_timeoffirstuse" + apps_type] = time_first_event["local_hour"] * 60 + time_first_event["local_minute"]
-    if "timeoflastuse" in requested_features:
-        time_last_event = filtered_data.sort_values(by="timestamp", ascending=False).drop_duplicates(subset="local_date", keep="first").set_index("local_date")
-        if time_last_event.empty:
-            apps_features["apps_" + day_segment + "_timeoflastuse" + apps_type] = 'NA'
-        else:
-            apps_features["apps_" + day_segment + "_timeoflastuse" + apps_type] = time_last_event["local_hour"] * 60 + time_last_event["local_minute"]
-    if "frequencyentropy" in requested_features:
-        apps_with_count = filtered_data.groupby(["local_date", "application_name"]).count().sort_values(by="timestamp", ascending=False).reset_index()
-        if len(apps_with_count.index) < 2:
-            apps_features["apps_" + day_segment + "_frequencyentropy" + apps_type] = 'NA'
-        else:
-            apps_features["apps_" + day_segment + "_frequencyentropy" + apps_type] = apps_with_count.groupby("local_date")["timestamp"].agg(entropy)
-    if "count" in requested_features:
-        apps_features["apps_" + day_segment + "_count" + apps_type] = filtered_data.groupby(["local_date"]).count()["timestamp"]
-        apps_features.fillna(value={"apps_" + day_segment + "_count" + apps_type: 0}, inplace=True)
-    return apps_features
-
-
-def base_applications_foreground_features(apps_data, day_segment, requested_features, params):
-    multiple_categories_with_genres = params["multiple_categories_with_genres"]
-    single_categories = params["single_categories"]
-    multiple_categories = params["multiple_categories"]
-    apps = params["apps"]
-
-    # deep copy the apps_data for the top1global computation
-    apps_data_global = apps_data.copy()
-
-    apps_features = pd.DataFrame(columns=["local_date"] + ["apps_" + day_segment + "_" + x for x in ["".join(feature) for feature in itertools.product(requested_features, single_categories + multiple_categories + apps)]])
-    if not apps_data.empty:
-        if day_segment != "daily":
-            apps_data = apps_data[apps_data["local_day_segment"] == day_segment]
-
-        if not apps_data.empty:
-            apps_features = pd.DataFrame()
-            # single category
-            single_categories.sort()
-            for sc in single_categories:
-                if sc == "all":
-                    apps_features = compute_features(apps_data, "all", requested_features, apps_features, day_segment)
-                else:
-                    filtered_data = apps_data[apps_data["genre"].isin([sc])]
-                    apps_features = compute_features(filtered_data, sc, requested_features, apps_features, day_segment)
-            # multiple category
-            for mc in multiple_categories:
-                filtered_data = apps_data[apps_data["genre"].isin(multiple_categories_with_genres[mc])]
-                apps_features = compute_features(filtered_data, mc, requested_features, apps_features, day_segment)
-            # single apps
-            for app in apps:
-                col_name = app
-                if app == "top1global":
-                    # get the most used app
-                    apps_with_count = apps_data_global.groupby(["local_date", "package_name"]).count().sort_values(by="timestamp", ascending=False).reset_index()
-                    app = apps_with_count.iloc[0]["package_name"]
-                    col_name = "top1global"
-
-                filtered_data = apps_data[apps_data["package_name"].isin([app])]
-                apps_features = compute_features(filtered_data, col_name, requested_features, apps_features, day_segment)
-
-    apps_features = apps_features.reset_index()
-
-    return apps_features
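Note: the deleted module indexed everything by local_date and encoded missing values as the string 'NA'; its replacement below switches to local_segment and np.nan. The time-of-first-use idiom is shared by both versions; a worked example on toy data (pandas only, values invented):

    import pandas as pd

    filtered_data = pd.DataFrame({
        "timestamp":    [1000, 2000, 3000],
        "local_date":   ["2020-05-01", "2020-05-01", "2020-05-02"],
        "local_hour":   [8, 21, 9],
        "local_minute": [30, 5, 15],
    })
    # Keep the earliest event per day, then convert to minutes since midnight.
    first = (filtered_data.sort_values(by="timestamp")
             .drop_duplicates(subset="local_date", keep="first")
             .set_index("local_date"))
    minutes_into_day = first["local_hour"] * 60 + first["local_minute"]
    # 2020-05-01 -> 510, 2020-05-02 -> 555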
src/features/applications_foreground/applications_foreground_entry.R (new file)
@@ -0,0 +1,13 @@
+source("renv/activate.R")
+source("src/features/utils/utils.R")
+library("dplyr")
+library("tidyr")
+
+sensor_data_file <- snakemake@input[["sensor_data"]]
+day_segments_file <- snakemake@input[["day_segments_labels"]]
+provider <- snakemake@params["provider"][["provider"]]
+provider_key <- snakemake@params["provider_key"]
+
+sensor_features <- fetch_provider_features(provider, provider_key, "applications_foreground", sensor_data_file, day_segments_file)
+
+write.csv(sensor_features, snakemake@output[[1]], row.names = FALSE)
src/features/applications_foreground/applications_foreground_entry.py (new file)
@@ -0,0 +1,18 @@
+import pandas as pd
+from importlib import import_module, util
+from pathlib import Path
+
+# import fetch_provider_features from src/features/utils/utils.py
+spec = util.spec_from_file_location("util", str(Path(snakemake.scriptdir).parent / "utils" / "utils.py"))
+mod = util.module_from_spec(spec)
+spec.loader.exec_module(mod)
+fetch_provider_features = getattr(mod, "fetch_provider_features")
+
+sensor_data_file = snakemake.input["sensor_data"][0]
+day_segments_file = snakemake.input["day_segments_labels"]
+provider = snakemake.params["provider"]
+provider_key = snakemake.params["provider_key"]
+
+sensor_features = fetch_provider_features(provider, provider_key, "applications_foreground", sensor_data_file, day_segments_file)
+
+sensor_features.to_csv(snakemake.output[0], index=False)
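Note: the Python entry script cannot use a regular package import because utils.py sits outside the script's own directory, so it loads the module from an explicit file path via importlib. The same stdlib pattern in isolation; "helpers.py" and "greet" are placeholder names, not part of this commit:

    from importlib import util
    from pathlib import Path

    path = Path("/some/dir") / "helpers.py"           # any .py file on disk
    spec = util.spec_from_file_location("helpers", str(path))
    mod = util.module_from_spec(spec)
    spec.loader.exec_module(mod)                      # runs the module body
    greet = getattr(mod, "greet")                     # pull one symbol out of it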
@@ -0,0 +1,88 @@
+import pandas as pd
+import numpy as np
+import itertools
+from scipy.stats import entropy
+
+
+def compute_features(filtered_data, apps_type, requested_features, apps_features, day_segment):
+    # There is the rare occasion that filtered_data is empty (found in testing)
+    if "timeoffirstuse" in requested_features:
+        time_first_event = filtered_data.sort_values(by="timestamp", ascending=True).drop_duplicates(subset="local_segment", keep="first").set_index("local_segment")
+        if time_first_event.empty:
+            apps_features["apps_rapids" + "_timeoffirstuse" + apps_type] = np.nan
+        else:
+            apps_features["apps_rapids" + "_timeoffirstuse" + apps_type] = time_first_event["local_hour"] * 60 + time_first_event["local_minute"]
+    if "timeoflastuse" in requested_features:
+        time_last_event = filtered_data.sort_values(by="timestamp", ascending=False).drop_duplicates(subset="local_segment", keep="first").set_index("local_segment")
+        if time_last_event.empty:
+            apps_features["apps_rapids" + "_timeoflastuse" + apps_type] = np.nan
+        else:
+            apps_features["apps_rapids" + "_timeoflastuse" + apps_type] = time_last_event["local_hour"] * 60 + time_last_event["local_minute"]
+    if "frequencyentropy" in requested_features:
+        apps_with_count = filtered_data.groupby(["local_segment", "application_name"]).count().sort_values(by="timestamp", ascending=False).reset_index()
+        if len(apps_with_count.index) < 2:
+            apps_features["apps_rapids" + "_frequencyentropy" + apps_type] = np.nan
+        else:
+            apps_features["apps_rapids" + "_frequencyentropy" + apps_type] = apps_with_count.groupby("local_segment")["timestamp"].agg(entropy)
+    if "count" in requested_features:
+        apps_features["apps_rapids" + "_count" + apps_type] = filtered_data.groupby(["local_segment"]).count()["timestamp"]
+        apps_features.fillna(value={"apps_rapids" + "_count" + apps_type: 0}, inplace=True)
+    return apps_features
+
+
+def rapids_features(apps_data, day_segment, provider, filter_data_by_segment, *args, **kwargs):
+    requested_features = provider["FEATURES"]
+    excluded_categories = provider["EXCLUDED_CATEGORIES"]
+    excluded_apps = provider["EXCLUDED_APPS"]
+    multiple_categories_with_genres = provider["MULTIPLE_CATEGORIES"]
+    single_categories = provider["SINGLE_CATEGORIES"]
+    multiple_categories = provider["MULTIPLE_CATEGORIES"]
+    single_apps = provider["SINGLE_APPS"]
+
+    single_categories = list(set(single_categories) - set(excluded_categories))
+    multiple_categories = list(multiple_categories_with_genres.keys() - set(excluded_categories))
+    single_apps = list(set(single_apps) - set(excluded_apps))
+
+    # exclude categories in the excluded_categories list
+    if "system_apps" in excluded_categories:
+        apps_data = apps_data[apps_data["is_system_app"] == 0]
+    apps_data = apps_data[~apps_data["genre"].isin(excluded_categories)]
+    # exclude apps in the excluded_apps list
+    apps_data = apps_data[~apps_data["package_name"].isin(excluded_apps)]
+
+    apps_features = pd.DataFrame(columns=["local_segment"] + ["apps_rapids" + "_" + x for x in ["".join(feature) for feature in itertools.product(requested_features, single_categories + multiple_categories + single_apps)]])
+    if not apps_data.empty:
+        apps_data = filter_data_by_segment(apps_data, day_segment)
+        # deep copy the apps_data for the top1global computation
+        apps_data_global = apps_data.copy()
+
+        if not apps_data.empty:
+            apps_features = pd.DataFrame()
+            # single category
+            single_categories.sort()
+            for sc in single_categories:
+                if sc == "all":
+                    apps_features = compute_features(apps_data, "all", requested_features, apps_features, day_segment)
+                else:
+                    filtered_data = apps_data[apps_data["genre"].isin([sc])]
+                    apps_features = compute_features(filtered_data, sc, requested_features, apps_features, day_segment)
+            # multiple category
+            for mc in multiple_categories:
+                filtered_data = apps_data[apps_data["genre"].isin(multiple_categories_with_genres[mc])]
+                apps_features = compute_features(filtered_data, mc, requested_features, apps_features, day_segment)
+            # single apps
+            for app in single_apps:
+                col_name = app
+                if app == "top1global":
+                    # get the most used app
+                    apps_with_count = apps_data_global.groupby(["local_segment", "package_name"]).count().sort_values(by="timestamp", ascending=False).reset_index()
+                    app = apps_with_count.iloc[0]["package_name"]
+                    col_name = "top1global"
+                filtered_data = apps_data[apps_data["package_name"].isin([app])]
+                apps_features = compute_features(filtered_data, col_name, requested_features, apps_features, day_segment)
+
+            apps_features = apps_features.reset_index()
+
+    return apps_features
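Note: the frequencyentropy feature above is Shannon entropy over per-app event counts within one segment; scipy.stats.entropy normalizes the raw counts into a probability distribution internally. A worked example on toy data (values invented):

    import pandas as pd
    from scipy.stats import entropy

    events = pd.DataFrame({
        "timestamp": range(6),
        "local_segment": ["daily#2020-05-01"] * 6,
        "application_name": ["maps", "maps", "maps", "mail", "mail", "camera"],
    })
    # Count events per (segment, app), then take entropy of the counts per segment.
    counts = events.groupby(["local_segment", "application_name"]).count().reset_index()
    per_segment = counts.groupby("local_segment")["timestamp"].agg(entropy)
    # entropy([3, 2, 1]) = entropy([1/2, 1/3, 1/6]) ≈ 1.011 nats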