# ---
# jupyter:
#   jupytext:
#     formats: ipynb,py:percent
#     text_representation:
#       extension: .py
#       format_name: percent
#       format_version: '1.3'
#       jupytext_version: 1.13.0
#   kernelspec:
#     display_name: straw2analysis
#     language: python
#     name: straw2analysis
# ---

# %%
# %matplotlib inline
import os
import re
import sys

import matplotlib.pyplot as plt
import pandas as pd
import seaborn as sns

# Put the parent of the current working directory on the import path,
# presumably so that `config` and `setup` (imported below) resolve.
nb_dir = os.path.split(os.getcwd())[0]
if nb_dir not in sys.path:
    sys.path.append(nb_dir)

# %%
from config.models import AppCategories, Participant
from setup import db_engine, session

# %%
# Read the result of the AppCategories query into a DataFrame.
query_app_categories = session.query(AppCategories)
with db_engine.connect() as connection:
    df_app_categories = pd.read_sql(query_app_categories.statement, connection)

# %%
df_app_categories.head()

# %%
df_app_categories["play_store_genre"].value_counts()

# %%
# Keep only the rows whose genre is the "not_found" placeholder.
df_category_not_found = df_app_categories[
    df_app_categories["play_store_genre"] == "not_found"
]

# %%
df_category_not_found["play_store_response"].value_counts()

# %%
df_category_not_found["package_name"].value_counts()

# %%
# Keywords searched for (case-insensitively) inside package names.
manufacturers = [
    "samsung",
    "oneplus",
    "huawei",
    "xiaomi",
    "lge",
    "motorola",
    "miui",
    "lenovo",
    "oppo",
    "mediatek",
]
custom_rom = ["coloros", "lineageos", "myos", "cyanogenmod", "foundation.e"]
other = ["android", "wssyncmldm"]

# The keywords are literal substrings, so escape regex metacharacters before
# joining them into one alternation; otherwise the "." in "foundation.e"
# would match any character.
os_manufacturer_pattern = "|".join(
    re.escape(keyword) for keyword in manufacturers + custom_rom + other
)
# na=False keeps the mask strictly boolean when a package name is missing,
# so it can be negated and used for indexing below.
rows_os_manufacturer = df_category_not_found["package_name"].str.contains(
    os_manufacturer_pattern, case=False, na=False
)

# %%
# Show, without truncation, the "not_found" rows that match no keyword.
with pd.option_context("display.max_rows", None, "display.max_columns", None):
    display(df_category_not_found.loc[~rows_os_manufacturer])

# %% [markdown]
# # Export categories

# %% [markdown]
# Rename all of "not_found" to "System" or "Other".
# %%
import re

# Work on a copy so the relabelling below leaves df_app_categories untouched.
df_app_categories_to_export = df_app_categories.copy()

# The keywords are literal substrings, so escape regex metacharacters before
# joining them into one alternation; otherwise the "." in "foundation.e"
# would match any character.
system_keyword_pattern = "|".join(
    re.escape(keyword) for keyword in manufacturers + custom_rom + other
)
# na=False keeps the mask strictly boolean when a package name is missing.
rows_matching_keyword = df_app_categories_to_export["package_name"].str.contains(
    system_keyword_pattern, case=False, na=False
)
# Only rows that are still "not_found" AND match a keyword become "System".
rows_os_manufacturer_full = rows_matching_keyword & (
    df_app_categories_to_export["play_store_genre"] == "not_found"
)
df_app_categories_to_export.loc[
    rows_os_manufacturer_full, "play_store_genre"
] = "System"

# %%
# Every remaining "not_found" row is relabelled "Other".
rows_not_found = df_app_categories_to_export["play_store_genre"] == "not_found"
df_app_categories_to_export.loc[rows_not_found, "play_store_genre"] = "Other"

# %%
df_app_categories_to_export["play_store_genre"].value_counts()

# %%
# Export only the package hash and the (renamed) genre column.
df_app_categories_to_export.rename(
    columns={"play_store_genre": "genre"}, inplace=True
)
df_app_categories_to_export.to_csv(
    "../data/app_categories.csv",
    columns=["package_hash", "genre"],
    index=False,
)

# %%