2020-09-01 21:25:35 +02:00
|
|
|
import pandas as pd
|
|
|
|
import numpy as np
|
|
|
|
import itertools
|
|
|
|
from scipy.stats import entropy
|
|
|
|
|
|
|
|
|
2020-12-03 00:41:03 +01:00
|
|
|
def compute_features(filtered_data, apps_type, requested_features, apps_features, time_segment):
    """Add the requested app-usage features for one app grouping to ``apps_features``.

    Each requested feature becomes a column named ``<feature><apps_type>``
    (e.g. ``counteventall``), indexed by ``local_segment``. The mutated
    ``apps_features`` DataFrame is returned.
    """
    # There is the rare occasion that filtered_data is empty (found in testing)

    def _boundary_event(ascending):
        # Earliest (ascending=True) or latest event per segment.
        ordered = filtered_data.sort_values(by="timestamp", ascending=ascending)
        return ordered.drop_duplicates(subset="local_segment", keep="first").set_index("local_segment")

    # First/last use share the same logic except for the sort direction.
    for feature_name, ascending in (("timeoffirstuse", True), ("timeoflastuse", False)):
        if feature_name in requested_features:
            boundary = _boundary_event(ascending)
            if boundary.empty:
                apps_features[feature_name + apps_type] = np.nan
            else:
                # Minutes since local midnight of the boundary event.
                apps_features[feature_name + apps_type] = boundary["local_hour"] * 60 + boundary["local_minute"]

    if "frequencyentropy" in requested_features:
        counts_per_app = (filtered_data.groupby(["local_segment", "application_name"])
                          .count()
                          .sort_values(by="timestamp", ascending=False)
                          .reset_index())
        if len(counts_per_app.index) < 2:
            # Entropy is not meaningful with fewer than two (segment, app) rows.
            apps_features["frequencyentropy" + apps_type] = np.nan
        else:
            apps_features["frequencyentropy" + apps_type] = counts_per_app.groupby("local_segment")["timestamp"].agg(entropy)

    if "countevent" in requested_features:
        apps_features["countevent" + apps_type] = filtered_data.groupby(["local_segment"]).count()["timestamp"]

    if "countepisode" in requested_features:
        # Episode input carries start_timestamp instead of a single event timestamp.
        apps_features["countepisode" + apps_type] = filtered_data.groupby(["local_segment"]).count()["start_timestamp"]

    # The four duration features are the same groupby with different aggregations.
    for agg_name in ("min", "max", "mean", "sum"):
        feature_name = agg_name + "duration"
        if feature_name in requested_features:
            apps_features[feature_name + apps_type] = filtered_data.groupby(by=["local_segment"])["duration"].agg(agg_name)

    apps_features.index.names = ["local_segment"]
    return apps_features
|
2020-10-08 00:11:06 +02:00
|
|
|
|
2021-04-13 22:45:32 +02:00
|
|
|
def process_app_features(data, requested_features, time_segment, provider, filter_data_by_segment):
    """Compute app features for every configured category and single app.

    ``provider`` holds the category/app configuration (excluded, single,
    multiple and custom entries); ``filter_data_by_segment`` restricts
    ``data`` to the rows that belong to ``time_segment``. Returns one row
    per local_segment with one column per (feature, grouping) pair.
    """
    excluded_categories = provider["EXCLUDED_CATEGORIES"]
    excluded_apps = provider["EXCLUDED_APPS"]

    def _active_categories(config):
        # Keep only non-empty categories that are not excluded; tolerate
        # configs where the entry is not a dict (e.g. None).
        if not isinstance(config, dict):
            return {}
        return {name: content for name, content in config.items()
                if len(content) > 0 and name not in excluded_categories}

    multiple_categories = _active_categories(provider["MULTIPLE_CATEGORIES"])
    custom_categories = _active_categories(provider["CUSTOM_CATEGORIES"])

    single_categories = list(set(provider["SINGLE_CATEGORIES"]) - set(excluded_categories))
    single_apps = list(set(provider["SINGLE_APPS"]) - set(excluded_apps))

    # exclude categories in the excluded_categories list
    if "system_apps" in excluded_categories:
        data = data[data["is_system_app"] == 0]
    data = data[~data["genre"].isin(excluded_categories)]
    # exclude apps in the excluded_apps list
    data = data[~data["package_name"].isin(excluded_apps)]

    # Column scaffold so downstream merges work even when no data survives.
    groupings = single_categories + list(custom_categories.keys()) + list(multiple_categories.keys()) + single_apps
    features = pd.DataFrame(columns=["local_segment"] + ["".join(pair) for pair in itertools.product(requested_features, groupings)])

    if not data.empty:
        # deep copy the data for the top1global computation
        data_global = data.copy()
        data = filter_data_by_segment(data, time_segment)
        if not data.empty:
            features = pd.DataFrame()

            # single category
            single_categories.sort()
            for category in single_categories:
                if category == "all":
                    features = compute_features(data, "all", requested_features, features, time_segment)
                else:
                    subset = data[data["genre"].isin([category])]
                    features = compute_features(subset, category, requested_features, features, time_segment)

            # own (custom) categories: matched by package name
            for name, packages in custom_categories.items():
                subset = data[data["package_name"].isin(packages)]
                features = compute_features(subset, name, requested_features, features, time_segment)

            # multiple categories: matched by genre
            for name, genres in multiple_categories.items():
                subset = data[data["genre"].isin(genres)]
                features = compute_features(subset, name, requested_features, features, time_segment)

            # single apps
            for app in single_apps:
                col_name = app
                if app == "top1global":
                    # get the most used app over the whole (unsegmented) dataset
                    ranked = data_global.groupby(["package_name"]).count().sort_values(by="timestamp", ascending=False).reset_index()
                    app = ranked.iloc[0]["package_name"]
                    col_name = "top1global"
                subset = data[data["package_name"].isin([app])]
                features = compute_features(subset, col_name, requested_features, features, time_segment)

            features = features.reset_index()

    return features
|
|
|
|
|
|
|
|
def rapids_features(sensor_data_files, time_segment, provider, filter_data_by_segment, *args, **kwargs):
    """RAPIDS entry point: compute app event (and optionally episode) features.

    Reads the sensor CSVs named in ``sensor_data_files``, computes the
    event features and, when INCLUDE_EPISODE_FEATURES is enabled, the
    episode features, merging both on local_segment.
    """
    events_data = pd.read_csv(sensor_data_files["sensor_data"])
    events_features_requested = provider["FEATURES"]["APP_EVENTS"]
    include_episodes = provider["INCLUDE_EPISODE_FEATURES"]

    features = process_app_features(events_data, events_features_requested, time_segment, provider, filter_data_by_segment)

    if include_episodes:
        episodes_data = pd.read_csv(sensor_data_files["episode_data"])
        episodes_features_requested = provider["FEATURES"]["APP_EPISODES"]
        # Discard episodes outside the configured duration bounds.
        too_short = episodes_data["duration"] < provider["IGNORE_EPISODES_SHORTER_THAN"]
        too_long = episodes_data["duration"] > provider["IGNORE_EPISODES_LONGER_THAN"]
        episodes_data = episodes_data.drop(episodes_data[too_short | too_long].index)
        episodes_features = process_app_features(episodes_data, episodes_features_requested, time_segment, provider, filter_data_by_segment)
        features = pd.merge(episodes_features, features, how='outer', on='local_segment')

    # Count and duration features mean "nothing happened" when missing, so fill with 0.
    zero_prefixes = ("countevent", "countepisode", "minduration", "maxduration", "meanduration", "sumduration")
    features.fillna(value={name: 0 for name in features.columns if name.startswith(zero_prefixes)}, inplace=True)

    return features
|