From 9048c06fc49ddf732437cb1565418be51a01ccd1 Mon Sep 17 00:00:00 2001
From: Meng Li <34143965+Meng6@users.noreply.github.com>
Date: Fri, 29 May 2020 17:04:24 -0400
Subject: [PATCH] Refactor application foreground features

---
Review notes (not part of the commit message):
 * base_applications_foreground_features: the header-only apps_features is
   now assigned before any filtering, so a non-empty input whose rows are all
   removed by the day_segment filter no longer ends in UnboundLocalError at
   the final return.
 * applications_foreground_features.py: the assertion message now reports the
   value that is actually checked, len(requested_features) * type_count + 1.
 * Docstrings were added to the two functions of the new module.
 * Line breaks and indentation were reconstructed from a whitespace-collapsed
   copy of this patch; the "index" blob id of the new file predates the
   amendments above.

 .../applications_foreground_base.py           | 77 ++++++++++++++++++
 .../applications_foreground_features.py       | 65 ++++-------------
 2 files changed, 89 insertions(+), 53 deletions(-)
 create mode 100644 src/features/applications_foreground/applications_foreground_base.py

diff --git a/src/features/applications_foreground/applications_foreground_base.py b/src/features/applications_foreground/applications_foreground_base.py
new file mode 100644
index 00000000..976d9938
--- /dev/null
+++ b/src/features/applications_foreground/applications_foreground_base.py
@@ -0,0 +1,77 @@
+import pandas as pd
+import itertools
+from scipy.stats import entropy
+
+
+def compute_features(filtered_data, apps_type, requested_features, apps_features, day_segment):
+    """Add the requested per-day features for one app type to apps_features.
+
+    Each feature is stored in the column
+    "apps_" + day_segment + "_" + feature + apps_type, one value per local_date.
+    Dates without a "count" value get 0 for that column.
+    """
+    if "timeoffirstuse" in requested_features:
+        time_first_event = filtered_data.sort_values(by="timestamp", ascending=True).drop_duplicates(subset="local_date", keep="first").set_index("local_date")
+        apps_features["apps_" + day_segment + "_timeoffirstuse" + apps_type] = time_first_event["local_hour"] * 60 + time_first_event["local_minute"]
+    if "timeoflastuse" in requested_features:
+        time_last_event = filtered_data.sort_values(by="timestamp", ascending=False).drop_duplicates(subset="local_date", keep="first").set_index("local_date")
+        apps_features["apps_" + day_segment + "_timeoflastuse" + apps_type] = time_last_event["local_hour"] * 60 + time_last_event["local_minute"]
+    if "frequencyentropy" in requested_features:
+        apps_with_count = filtered_data.groupby(["local_date","application_name"]).count().sort_values(by="timestamp", ascending=False).reset_index()
+        apps_features["apps_" + day_segment + "_frequencyentropy" + apps_type] = apps_with_count.groupby("local_date")["timestamp"].agg(entropy)
+    if "count" in requested_features:
+        apps_features["apps_" + day_segment + "_count" + apps_type] = filtered_data.groupby(["local_date"]).count()["timestamp"]
+        apps_features.fillna(value={"apps_" + day_segment + "_count" + apps_type: 0}, inplace=True)
+    return apps_features
+
+
+def base_applications_foreground_features(apps_data, day_segment, requested_features, params):
+    """Compute the requested features for every configured category and app.
+
+    params must provide "single_categories", "multiple_categories",
+    "multiple_categories_with_genres" and "apps". When there are no rows to
+    summarise, a frame with only the expected column headers is returned.
+    """
+    multiple_categories_with_genres = params["multiple_categories_with_genres"]
+    single_categories = params["single_categories"]
+    multiple_categories = params["multiple_categories"]
+    apps = params["apps"]
+
+    # deep copy the apps_data for the top1global computation
+    apps_data_global = apps_data.copy()
+
+    # Header-only default, assigned up front so a frame is still returned when
+    # the day_segment filter below removes every row.
+    apps_features = pd.DataFrame(columns=["local_date"] + ["apps_" + day_segment + "_" + x for x in ["".join(feature) for feature in itertools.product(requested_features, single_categories + multiple_categories + apps)]])
+    if not apps_data.empty:
+        if day_segment != "daily":
+            apps_data = apps_data[apps_data["local_day_segment"] == day_segment]
+
+        if not apps_data.empty:
+            apps_features = pd.DataFrame()
+            # single category
+            for sc in single_categories:
+                if sc == "all":
+                    apps_features = compute_features(apps_data, "all", requested_features, apps_features, day_segment)
+                else:
+                    filtered_data = apps_data[apps_data["genre"].isin([sc])]
+                    apps_features = compute_features(filtered_data, sc, requested_features, apps_features, day_segment)
+            # multiple category
+            for mc in multiple_categories:
+                filtered_data = apps_data[apps_data["genre"].isin(multiple_categories_with_genres[mc])]
+                apps_features = compute_features(filtered_data, mc, requested_features, apps_features, day_segment)
+            # single apps
+            for app in apps:
+                col_name = app
+                if app == "top1global":
+                    # get the most used app
+                    apps_with_count = apps_data_global.groupby(["local_date","package_name"]).count().sort_values(by="timestamp", ascending=False).reset_index()
+                    app = apps_with_count.iloc[0]["package_name"]
+                    col_name = "top1global"
+
+                filtered_data = apps_data[apps_data["package_name"].isin([app])]
+                apps_features = compute_features(filtered_data, col_name, requested_features, apps_features, day_segment)
+
+            apps_features = apps_features.reset_index()
+
+    return apps_features
diff --git a/src/features/applications_foreground_features.py b/src/features/applications_foreground_features.py
index 3bba2e70..475e5598 100644
--- a/src/features/applications_foreground_features.py
+++ b/src/features/applications_foreground_features.py
@@ -1,24 +1,5 @@
 import pandas as pd
-import numpy as np
-import itertools
-from scipy.stats import entropy
-
-
-def compute_features(filtered_data, apps_type, requested_features, apps_features):
-    if "timeoffirstuse" in requested_features:
-        time_first_event = filtered_data.sort_values(by="timestamp", ascending=True).drop_duplicates(subset="local_date", keep="first").set_index("local_date")
-        apps_features["apps_" + day_segment + "_timeoffirstuse" + apps_type] = time_first_event["local_hour"] * 60 + time_first_event["local_minute"]
-    if "timeoflastuse" in requested_features:
-        time_last_event = filtered_data.sort_values(by="timestamp", ascending=False).drop_duplicates(subset="local_date", keep="first").set_index("local_date")
-        apps_features["apps_" + day_segment + "_timeoflastuse" + apps_type] = time_last_event["local_hour"] * 60 + time_last_event["local_minute"]
-    if "frequencyentropy" in requested_features:
-        apps_with_count = filtered_data.groupby(["local_date","application_name"]).count().sort_values(by="timestamp", ascending=False).reset_index()
-        apps_features["apps_" + day_segment + "_frequencyentropy" + apps_type] = apps_with_count.groupby("local_date")["timestamp"].agg(entropy)
-    if "count" in requested_features:
-        apps_features["apps_" + day_segment + "_count" + apps_type] = filtered_data.groupby(["local_date"]).count()["timestamp"]
-        apps_features.fillna(value={"apps_" + day_segment + "_count" + apps_type: 0}, inplace=True)
-    return apps_features
-
+from applications_foreground.applications_foreground_base import base_applications_foreground_features
 
 apps_data = pd.read_csv(snakemake.input[0], parse_dates=["local_date_time", "local_date"], encoding="ISO-8859-1")
 day_segment = snakemake.params["day_segment"]
@@ -27,11 +8,19 @@ multiple_categories_with_genres = snakemake.params["multiple_categories"]
 single_apps = snakemake.params["single_apps"]
 excluded_categories = snakemake.params["excluded_categories"]
 excluded_apps = snakemake.params["excluded_apps"]
-features = snakemake.params["features"]
+requested_features = snakemake.params["features"]
+apps_features = pd.DataFrame(columns=["local_date"])
 
 single_categories = list(set(single_categories) - set(excluded_categories))
 multiple_categories = list(multiple_categories_with_genres.keys() - set(excluded_categories))
 apps = list(set(single_apps) - set(excluded_apps))
+type_count = len(single_categories) + len(multiple_categories) + len(apps)
+
+params = {}
+params["multiple_categories_with_genres"] = multiple_categories_with_genres
+params["single_categories"] = single_categories
+params["multiple_categories"] = multiple_categories
+params["apps"] = apps
 
 # exclude categories in the excluded_categories list
 if "system_apps" in excluded_categories:
@@ -40,38 +29,8 @@ apps_data = apps_data[~apps_data["genre"].isin(excluded_categories)]
 # exclude apps in the excluded_apps list
 apps_data = apps_data[~apps_data["application_name"].isin(excluded_apps)]
 
-# deep copy the apps_data for the top1global computation
-apps_data_global = apps_data.copy()
+apps_features = apps_features.merge(base_applications_foreground_features(apps_data, day_segment, requested_features, params), on="local_date", how="outer")
 
-apps_features = pd.DataFrame(columns=["local_date"] + ["apps_" + day_segment + "_" + x for x in ["".join(feature) for feature in itertools.product(features, single_categories + multiple_categories + apps)]])
-if not apps_data.empty:
-    apps_features = pd.DataFrame()
-    if day_segment != "daily":
-        apps_data =apps_data[apps_data["local_day_segment"] == day_segment]
-
-    # single category
-    for sc in single_categories:
-        if sc == "all":
-            apps_features = compute_features(apps_data, "all", features, apps_features)
-        else:
-            filtered_data = apps_data[apps_data["genre"].isin([sc])]
-            apps_features = compute_features(filtered_data, sc, features, apps_features)
-    # multiple category
-    for mc in multiple_categories:
-        filtered_data = apps_data[apps_data["genre"].isin(multiple_categories_with_genres[mc])]
-        apps_features = compute_features(filtered_data, mc, features, apps_features)
-    # single apps
-    for app in apps:
-        col_name = app
-        if app == "top1global":
-            # get the most used app
-            apps_with_count = apps_data_global.groupby(["local_date","package_name"]).count().sort_values(by="timestamp", ascending=False).reset_index()
-            app = apps_with_count.iloc[0]["package_name"]
-            col_name = "top1global"
-
-        filtered_data = apps_data[apps_data["package_name"].isin([app])]
-        apps_features = compute_features(filtered_data, col_name, features, apps_features)
-
-    apps_features = apps_features.reset_index()
+assert len(requested_features) * type_count + 1 == apps_features.shape[1], "The number of features in the output dataframe (=" + str(apps_features.shape[1]) + ") does not match the expected value (=" + str(len(requested_features) * type_count + 1) + "). Verify your application foreground feature extraction functions"
 
 apps_features.to_csv(snakemake.output[0], index=False)