2020-09-01 21:25:35 +02:00
|
|
|
import pandas as pd
|
|
|
|
import numpy as np
|
|
|
|
import itertools
|
|
|
|
from scipy.stats import entropy
|
|
|
|
|
|
|
|
|
2020-12-03 00:41:03 +01:00
|
|
|
def compute_features(filtered_data, apps_type, requested_features, apps_features, time_segment):
    """Add the requested app-usage features for one app grouping to ``apps_features``.

    Each requested feature becomes a column named ``<feature><apps_type>``
    (e.g. ``counteventall``), indexed by ``local_segment``. The mutated
    ``apps_features`` DataFrame is returned.
    """
    # There is the rare occasion that filtered_data is empty (found in testing)

    def _boundary_event(ascending):
        # Earliest (ascending=True) or latest event per segment.
        ordered = filtered_data.sort_values(by="timestamp", ascending=ascending)
        return ordered.drop_duplicates(subset="local_segment", keep="first").set_index("local_segment")

    # First/last use share the same logic except for the sort direction.
    for feature_name, ascending in (("timeoffirstuse", True), ("timeoflastuse", False)):
        if feature_name in requested_features:
            boundary = _boundary_event(ascending)
            if boundary.empty:
                apps_features[feature_name + apps_type] = np.nan
            else:
                # Minutes since local midnight of the boundary event.
                apps_features[feature_name + apps_type] = boundary["local_hour"] * 60 + boundary["local_minute"]

    if "frequencyentropy" in requested_features:
        counts_per_app = (filtered_data.groupby(["local_segment", "application_name"])
                          .count()
                          .sort_values(by="timestamp", ascending=False)
                          .reset_index())
        if len(counts_per_app.index) < 2:
            # Entropy is not meaningful with fewer than two (segment, app) rows.
            apps_features["frequencyentropy" + apps_type] = np.nan
        else:
            apps_features["frequencyentropy" + apps_type] = counts_per_app.groupby("local_segment")["timestamp"].agg(entropy)

    if "countevent" in requested_features:
        apps_features["countevent" + apps_type] = filtered_data.groupby(["local_segment"]).count()["timestamp"]

    if "countepisode" in requested_features:
        # Episode input carries start_timestamp instead of a single event timestamp.
        apps_features["countepisode" + apps_type] = filtered_data.groupby(["local_segment"]).count()["start_timestamp"]

    # The four duration features are the same groupby with different aggregations.
    for agg_name in ("min", "max", "mean", "sum"):
        feature_name = agg_name + "duration"
        if feature_name in requested_features:
            apps_features[feature_name + apps_type] = filtered_data.groupby(by=["local_segment"])["duration"].agg(agg_name)

    apps_features.index.names = ["local_segment"]
    return apps_features
|
2020-10-08 00:11:06 +02:00
|
|
|
|
2021-04-13 22:45:32 +02:00
|
|
|
def process_app_features(data, requested_features, time_segment, provider, filter_data_by_segment):
    """Compute app features for every configured category and single app.

    ``provider`` holds the category/app configuration (excluded, single,
    multiple and custom entries); ``filter_data_by_segment`` restricts
    ``data`` to the rows that belong to ``time_segment``. Returns one row
    per local_segment with one column per (feature, grouping) pair.
    """
    excluded_categories = provider["EXCLUDED_CATEGORIES"]
    excluded_apps = provider["EXCLUDED_APPS"]

    def _active_categories(config):
        # Keep only non-empty categories that are not excluded; tolerate
        # configs where the entry is not a dict (e.g. None).
        if not isinstance(config, dict):
            return {}
        return {name: content for name, content in config.items()
                if len(content) > 0 and name not in excluded_categories}

    multiple_categories = _active_categories(provider["MULTIPLE_CATEGORIES"])
    custom_categories = _active_categories(provider["CUSTOM_CATEGORIES"])

    single_categories = list(set(provider["SINGLE_CATEGORIES"]) - set(excluded_categories))
    single_apps = list(set(provider["SINGLE_APPS"]) - set(excluded_apps))

    # exclude categories in the excluded_categories list
    if "system_apps" in excluded_categories:
        data = data[data["is_system_app"] == 0]
    data = data[~data["genre"].isin(excluded_categories)]
    # exclude apps in the excluded_apps list
    data = data[~data["package_name"].isin(excluded_apps)]

    # Column scaffold so downstream merges work even when no data survives.
    groupings = single_categories + list(custom_categories.keys()) + list(multiple_categories.keys()) + single_apps
    features = pd.DataFrame(columns=["local_segment"] + ["".join(pair) for pair in itertools.product(requested_features, groupings)])

    if not data.empty:
        # deep copy the data for the top1global computation
        data_global = data.copy()
        data = filter_data_by_segment(data, time_segment)
        if not data.empty:
            features = pd.DataFrame()

            # single category
            single_categories.sort()
            for category in single_categories:
                if category == "all":
                    features = compute_features(data, "all", requested_features, features, time_segment)
                else:
                    subset = data[data["genre"].isin([category])]
                    features = compute_features(subset, category, requested_features, features, time_segment)

            # own (custom) categories: matched by package name
            for name, packages in custom_categories.items():
                subset = data[data["package_name"].isin(packages)]
                features = compute_features(subset, name, requested_features, features, time_segment)

            # multiple categories: matched by genre
            for name, genres in multiple_categories.items():
                subset = data[data["genre"].isin(genres)]
                features = compute_features(subset, name, requested_features, features, time_segment)

            # single apps
            for app in single_apps:
                col_name = app
                if app == "top1global":
                    # get the most used app over the whole (unsegmented) dataset
                    ranked = data_global.groupby(["package_name"]).count().sort_values(by="timestamp", ascending=False).reset_index()
                    app = ranked.iloc[0]["package_name"]
                    col_name = "top1global"
                subset = data[data["package_name"].isin([app])]
                features = compute_features(subset, col_name, requested_features, features, time_segment)

            features = features.reset_index()

    return features
|
|
|
|
|
|
|
|
def rapids_features(sensor_data_files, time_segment, provider, filter_data_by_segment, *args, **kwargs):
    """RAPIDS entry point: compute app event (and optionally episode) features.

    Reads the sensor CSVs named in ``sensor_data_files``, computes the
    event features and, when INCLUDE_EPISODE_FEATURES is enabled, the
    episode features, merging both on local_segment.
    """
    events_data = pd.read_csv(sensor_data_files["sensor_data"])
    events_features_requested = provider["FEATURES"]["APP_EVENTS"]
    include_episodes = provider["INCLUDE_EPISODE_FEATURES"]

    features = process_app_features(events_data, events_features_requested, time_segment, provider, filter_data_by_segment)

    if include_episodes:
        episodes_data = pd.read_csv(sensor_data_files["episode_data"])
        episodes_features_requested = provider["FEATURES"]["APP_EPISODES"]
        # Discard episodes outside the configured duration bounds.
        too_short = episodes_data["duration"] < provider["IGNORE_EPISODES_SHORTER_THAN"]
        too_long = episodes_data["duration"] > provider["IGNORE_EPISODES_LONGER_THAN"]
        episodes_data = episodes_data.drop(episodes_data[too_short | too_long].index)
        episodes_features = process_app_features(episodes_data, episodes_features_requested, time_segment, provider, filter_data_by_segment)
        features = pd.merge(episodes_features, features, how='outer', on='local_segment')

    # Count and duration features mean "nothing happened" when missing, so fill with 0.
    zero_prefixes = ("countevent", "countepisode", "minduration", "maxduration", "meanduration", "sumduration")
    features.fillna(value={name: 0 for name in features.columns if name.startswith(zero_prefixes)}, inplace=True)

    return features
|