Compare commits

...

11 Commits

Author SHA1 Message Date
junos c66e046014 Use methods in helper.py. 2023-04-21 21:41:00 +02:00
junos 48118f125d Reformat ml_pipeline_regression.py 2023-04-21 21:34:54 +02:00
junos 583ee82e80 Add xgboost to dependencies and reformat helper.py. 2023-04-21 21:33:06 +02:00
junos 59552c18a9 Update python to 3.11. 2023-04-21 18:08:54 +02:00
junos a4ad4c3200 Check the relationship between is_system_app and System category. 2023-04-19 11:06:33 +02:00
junos 7e565c34db Only reclassify apps not found on Play Store.
Update coding files for app categories.
2023-04-19 11:01:00 +02:00
junos d6eea0fc00 Completely classify unknown applications. 2023-04-19 10:54:46 +02:00
junos 711b451eff Start to better classify system apps. 2023-04-19 09:48:57 +02:00
junos 0e66a5a963 Completely remove PACKAGE_NAMES_HASHED and instead provide a differently structured file. 2023-04-19 09:29:40 +02:00
junos c88cecc063 Categorize applications in config.yaml. 2023-04-18 20:39:58 +02:00
junos 66754a24aa Create and save catalogue. 2023-04-18 16:10:11 +02:00
8 changed files with 1551 additions and 288 deletions

View File

@ -3,6 +3,5 @@
<component name="VcsDirectoryMappings"> <component name="VcsDirectoryMappings">
<mapping directory="$PROJECT_DIR$" vcs="Git" /> <mapping directory="$PROJECT_DIR$" vcs="Git" />
<mapping directory="$PROJECT_DIR$/rapids" vcs="Git" /> <mapping directory="$PROJECT_DIR$/rapids" vcs="Git" />
<mapping directory="$PROJECT_DIR$/rapids/calculatingfeatures" vcs="Git" />
</component> </component>
</project> </project>

View File

@ -1,9 +1,8 @@
name: straw2analysis name: straw2analysis
channels: channels:
- defaults
- conda-forge - conda-forge
dependencies: dependencies:
- python=3.9 - python=3.11
- black - black
- isort - isort
- flake8 - flake8
@ -23,4 +22,5 @@ dependencies:
- scikit-learn - scikit-learn
- sqlalchemy - sqlalchemy
- statsmodels - statsmodels
- tabulate - tabulate
- xgboost

View File

@ -15,91 +15,34 @@
# %% jupyter={"source_hidden": true} # %% jupyter={"source_hidden": true}
# %matplotlib inline # %matplotlib inline
import datetime
import importlib
import os import os
import sys import sys
import numpy as np import numpy as np
import matplotlib.pyplot as plt
import pandas as pd import pandas as pd
import seaborn as sns
import yaml
from pyprojroot import here
from sklearn import linear_model, svm, kernel_ridge, gaussian_process
from sklearn.model_selection import LeaveOneGroupOut, cross_val_score, cross_validate
from sklearn.metrics import mean_squared_error, r2_score
from sklearn.impute import SimpleImputer
from sklearn.dummy import DummyRegressor
import xgboost as xg import xgboost as xg
from IPython.core.interactiveshell import InteractiveShell from machine_learning.helper import prepare_regression_model_input
InteractiveShell.ast_node_interactivity = "all" from sklearn import gaussian_process, kernel_ridge, linear_model, svm
from sklearn.dummy import DummyRegressor
from sklearn.impute import SimpleImputer
from sklearn.model_selection import LeaveOneGroupOut, cross_validate
# from IPython.core.interactiveshell import InteractiveShell
# InteractiveShell.ast_node_interactivity = "all"
nb_dir = os.path.split(os.getcwd())[0] nb_dir = os.path.split(os.getcwd())[0]
if nb_dir not in sys.path: if nb_dir not in sys.path:
sys.path.append(nb_dir) sys.path.append(nb_dir)
import machine_learning.features_sensor # %% jupyter={"source_hidden": true}
import machine_learning.labels model_input = pd.read_csv(
import machine_learning.model "../data/intradaily_30_min_all_targets/input_JCQ_job_demand_mean.csv"
)
# %% [markdown]
# # RAPIDS models
# %% [markdown]
# ## PANAS negative affect
# %% jupyter={"source_hidden": true} # %% jupyter={"source_hidden": true}
model_input = pd.read_csv("../data/intradaily_30_min_all_targets/input_JCQ_job_demand_mean.csv") cv_method = "half_logo" # logo, half_logo, 5kfold
# %% jupyter={"source_hidden": true}
index_columns = ["local_segment", "local_segment_label", "local_segment_start_datetime", "local_segment_end_datetime"]
#if "pid" in model_input.columns:
# index_columns.append("pid")
model_input.set_index(index_columns, inplace=True)
cv_method = 'half_logo' # logo, half_logo, 5kfold
if cv_method == 'logo':
data_x, data_y, data_groups = model_input.drop(["target", "pid"], axis=1), model_input["target"], model_input["pid"]
else:
model_input['pid_index'] = model_input.groupby('pid').cumcount()
model_input['pid_count'] = model_input.groupby('pid')['pid'].transform('count')
model_input["pid_index"] = (model_input['pid_index'] / model_input['pid_count'] + 1).round()
model_input["pid_half"] = model_input["pid"] + "_" + model_input["pid_index"].astype(int).astype(str)
data_x, data_y, data_groups = model_input.drop(["target", "pid", "pid_index", "pid_half"], axis=1), model_input["target"], model_input["pid_half"]
# %% jupyter={"source_hidden": true}
categorical_feature_colnames = ["gender", "startlanguage"]
additional_categorical_features = [col for col in data_x.columns if "mostcommonactivity" in col or "homelabel" in col]
categorical_feature_colnames += additional_categorical_features
# %% jupyter={"source_hidden": true}
categorical_features = data_x[categorical_feature_colnames].copy()
# %% jupyter={"source_hidden": true}
mode_categorical_features = categorical_features.mode().iloc[0]
# %% jupyter={"source_hidden": true}
# fillna with mode
categorical_features = categorical_features.fillna(mode_categorical_features)
# %% jupyter={"source_hidden": true}
# one-hot encoding
categorical_features = categorical_features.apply(lambda col: col.astype("category"))
if not categorical_features.empty:
categorical_features = pd.get_dummies(categorical_features)
# %% jupyter={"source_hidden": true}
numerical_features = data_x.drop(categorical_feature_colnames, axis=1)
# %% jupyter={"source_hidden": true}
train_x = pd.concat([numerical_features, categorical_features], axis=1)
# %% jupyter={"source_hidden": true}
train_x.dtypes
train_x, data_y, data_groups = prepare_regression_model_input(model_input, cv_method)
# %% jupyter={"source_hidden": true} # %% jupyter={"source_hidden": true}
logo = LeaveOneGroupOut() logo = LeaveOneGroupOut()
logo.get_n_splits( logo.get_n_splits(
@ -109,7 +52,7 @@ logo.get_n_splits(
) )
# Defaults to 5 k folds in cross_validate method # Defaults to 5 k folds in cross_validate method
if cv_method != 'logo' and cv_method != 'half_logo': if cv_method != "logo" and cv_method != "half_logo":
logo = None logo = None
# %% jupyter={"source_hidden": true} # %% jupyter={"source_hidden": true}
@ -120,7 +63,7 @@ sum(data_y.isna())
dummy_regr = DummyRegressor(strategy="mean") dummy_regr = DummyRegressor(strategy="mean")
# %% jupyter={"source_hidden": true} # %% jupyter={"source_hidden": true}
imputer = SimpleImputer(missing_values=np.nan, strategy='mean') imputer = SimpleImputer(missing_values=np.nan, strategy="mean")
# %% jupyter={"source_hidden": true} # %% jupyter={"source_hidden": true}
dummy_regressor = cross_validate( dummy_regressor = cross_validate(
@ -130,12 +73,26 @@ dummy_regressor = cross_validate(
groups=data_groups, groups=data_groups,
cv=logo, cv=logo,
n_jobs=-1, n_jobs=-1,
scoring=('r2', 'neg_mean_squared_error', 'neg_mean_absolute_error', 'neg_root_mean_squared_error') scoring=(
"r2",
"neg_mean_squared_error",
"neg_mean_absolute_error",
"neg_root_mean_squared_error",
),
) )
print("Negative Mean Squared Error", np.median(dummy_regressor['test_neg_mean_squared_error'])) print(
print("Negative Mean Absolute Error", np.median(dummy_regressor['test_neg_mean_absolute_error'])) "Negative Mean Squared Error",
print("Negative Root Mean Squared Error", np.median(dummy_regressor['test_neg_root_mean_squared_error'])) np.median(dummy_regressor["test_neg_mean_squared_error"]),
print("R2", np.median(dummy_regressor['test_r2'])) )
print(
"Negative Mean Absolute Error",
np.median(dummy_regressor["test_neg_mean_absolute_error"]),
)
print(
"Negative Root Mean Squared Error",
np.median(dummy_regressor["test_neg_root_mean_squared_error"]),
)
print("R2", np.median(dummy_regressor["test_r2"]))
# %% [markdown] # %% [markdown]
# ### Linear Regression # ### Linear Regression
@ -143,7 +100,7 @@ print("R2", np.median(dummy_regressor['test_r2']))
# %% jupyter={"source_hidden": true} # %% jupyter={"source_hidden": true}
lin_reg_rapids = linear_model.LinearRegression() lin_reg_rapids = linear_model.LinearRegression()
# %% jupyter={"source_hidden": true} # %% jupyter={"source_hidden": true}
imputer = SimpleImputer(missing_values=np.nan, strategy='mean') imputer = SimpleImputer(missing_values=np.nan, strategy="mean")
# %% jupyter={"source_hidden": true} # %% jupyter={"source_hidden": true}
lin_reg_scores = cross_validate( lin_reg_scores = cross_validate(
@ -153,19 +110,33 @@ lin_reg_scores = cross_validate(
groups=data_groups, groups=data_groups,
cv=logo, cv=logo,
n_jobs=-1, n_jobs=-1,
scoring=('r2', 'neg_mean_squared_error', 'neg_mean_absolute_error', 'neg_root_mean_squared_error') scoring=(
"r2",
"neg_mean_squared_error",
"neg_mean_absolute_error",
"neg_root_mean_squared_error",
),
) )
print("Negative Mean Squared Error", np.median(lin_reg_scores['test_neg_mean_squared_error'])) print(
print("Negative Mean Absolute Error", np.median(lin_reg_scores['test_neg_mean_absolute_error'])) "Negative Mean Squared Error",
print("Negative Root Mean Squared Error", np.median(lin_reg_scores['test_neg_root_mean_squared_error'])) np.median(lin_reg_scores["test_neg_mean_squared_error"]),
print("R2", np.median(lin_reg_scores['test_r2'])) )
print(
"Negative Mean Absolute Error",
np.median(lin_reg_scores["test_neg_mean_absolute_error"]),
)
print(
"Negative Root Mean Squared Error",
np.median(lin_reg_scores["test_neg_root_mean_squared_error"]),
)
print("R2", np.median(lin_reg_scores["test_r2"]))
# %% [markdown] # %% [markdown]
# ### XGBRegressor Linear Regression # ### XGBRegressor Linear Regression
# %% jupyter={"source_hidden": true} # %% jupyter={"source_hidden": true}
xgb_r = xg.XGBRegressor(objective ='reg:squarederror', n_estimators = 10) xgb_r = xg.XGBRegressor(objective="reg:squarederror", n_estimators=10)
# %% jupyter={"source_hidden": true} # %% jupyter={"source_hidden": true}
imputer = SimpleImputer(missing_values=np.nan, strategy='mean') imputer = SimpleImputer(missing_values=np.nan, strategy="mean")
# %% jupyter={"source_hidden": true} # %% jupyter={"source_hidden": true}
xgb_reg_scores = cross_validate( xgb_reg_scores = cross_validate(
@ -175,19 +146,33 @@ xgb_reg_scores = cross_validate(
groups=data_groups, groups=data_groups,
cv=logo, cv=logo,
n_jobs=-1, n_jobs=-1,
scoring=('r2', 'neg_mean_squared_error', 'neg_mean_absolute_error', 'neg_root_mean_squared_error') scoring=(
"r2",
"neg_mean_squared_error",
"neg_mean_absolute_error",
"neg_root_mean_squared_error",
),
) )
print("Negative Mean Squared Error", np.median(xgb_reg_scores['test_neg_mean_squared_error'])) print(
print("Negative Mean Absolute Error", np.median(xgb_reg_scores['test_neg_mean_absolute_error'])) "Negative Mean Squared Error",
print("Negative Root Mean Squared Error", np.median(xgb_reg_scores['test_neg_root_mean_squared_error'])) np.median(xgb_reg_scores["test_neg_mean_squared_error"]),
print("R2", np.median(xgb_reg_scores['test_r2'])) )
print(
"Negative Mean Absolute Error",
np.median(xgb_reg_scores["test_neg_mean_absolute_error"]),
)
print(
"Negative Root Mean Squared Error",
np.median(xgb_reg_scores["test_neg_root_mean_squared_error"]),
)
print("R2", np.median(xgb_reg_scores["test_r2"]))
# %% [markdown] # %% [markdown]
# ### XGBRegressor Pseudo Huber Error Regression # ### XGBRegressor Pseudo Huber Error Regression
# %% jupyter={"source_hidden": true} # %% jupyter={"source_hidden": true}
xgb_psuedo_huber_r = xg.XGBRegressor(objective ='reg:pseudohubererror', n_estimators = 10) xgb_psuedo_huber_r = xg.XGBRegressor(objective="reg:pseudohubererror", n_estimators=10)
# %% jupyter={"source_hidden": true} # %% jupyter={"source_hidden": true}
imputer = SimpleImputer(missing_values=np.nan, strategy='mean') imputer = SimpleImputer(missing_values=np.nan, strategy="mean")
# %% jupyter={"source_hidden": true} # %% jupyter={"source_hidden": true}
xgb_psuedo_huber_reg_scores = cross_validate( xgb_psuedo_huber_reg_scores = cross_validate(
@ -197,18 +182,32 @@ xgb_psuedo_huber_reg_scores = cross_validate(
groups=data_groups, groups=data_groups,
cv=logo, cv=logo,
n_jobs=-1, n_jobs=-1,
scoring=('r2', 'neg_mean_squared_error', 'neg_mean_absolute_error', 'neg_root_mean_squared_error') scoring=(
"r2",
"neg_mean_squared_error",
"neg_mean_absolute_error",
"neg_root_mean_squared_error",
),
) )
print("Negative Mean Squared Error", np.median(xgb_psuedo_huber_reg_scores['test_neg_mean_squared_error'])) print(
print("Negative Mean Absolute Error", np.median(xgb_psuedo_huber_reg_scores['test_neg_mean_absolute_error'])) "Negative Mean Squared Error",
print("Negative Root Mean Squared Error", np.median(xgb_psuedo_huber_reg_scores['test_neg_root_mean_squared_error'])) np.median(xgb_psuedo_huber_reg_scores["test_neg_mean_squared_error"]),
print("R2", np.median(xgb_psuedo_huber_reg_scores['test_r2'])) )
print(
"Negative Mean Absolute Error",
np.median(xgb_psuedo_huber_reg_scores["test_neg_mean_absolute_error"]),
)
print(
"Negative Root Mean Squared Error",
np.median(xgb_psuedo_huber_reg_scores["test_neg_root_mean_squared_error"]),
)
print("R2", np.median(xgb_psuedo_huber_reg_scores["test_r2"]))
# %% [markdown] # %% [markdown]
# ### Ridge regression # ### Ridge regression
# %% jupyter={"source_hidden": true} # %% jupyter={"source_hidden": true}
ridge_reg = linear_model.Ridge(alpha=.5) ridge_reg = linear_model.Ridge(alpha=0.5)
# %% tags=[] jupyter={"source_hidden": true} # %% tags=[] jupyter={"source_hidden": true}
ridge_reg_scores = cross_validate( ridge_reg_scores = cross_validate(
@ -218,12 +217,26 @@ ridge_reg_scores = cross_validate(
groups=data_groups, groups=data_groups,
cv=logo, cv=logo,
n_jobs=-1, n_jobs=-1,
scoring=('r2', 'neg_mean_squared_error', 'neg_mean_absolute_error', 'neg_root_mean_squared_error') scoring=(
"r2",
"neg_mean_squared_error",
"neg_mean_absolute_error",
"neg_root_mean_squared_error",
),
) )
print("Negative Mean Squared Error", np.median(ridge_reg_scores['test_neg_mean_squared_error'])) print(
print("Negative Mean Absolute Error", np.median(ridge_reg_scores['test_neg_mean_absolute_error'])) "Negative Mean Squared Error",
print("Negative Root Mean Squared Error", np.median(ridge_reg_scores['test_neg_root_mean_squared_error'])) np.median(ridge_reg_scores["test_neg_mean_squared_error"]),
print("R2", np.median(ridge_reg_scores['test_r2'])) )
print(
"Negative Mean Absolute Error",
np.median(ridge_reg_scores["test_neg_mean_absolute_error"]),
)
print(
"Negative Root Mean Squared Error",
np.median(ridge_reg_scores["test_neg_root_mean_squared_error"]),
)
print("R2", np.median(ridge_reg_scores["test_r2"]))
# %% [markdown] # %% [markdown]
# ### Lasso # ### Lasso
@ -239,12 +252,26 @@ lasso_reg_score = cross_validate(
groups=data_groups, groups=data_groups,
cv=logo, cv=logo,
n_jobs=-1, n_jobs=-1,
scoring=('r2', 'neg_mean_squared_error', 'neg_mean_absolute_error', 'neg_root_mean_squared_error') scoring=(
"r2",
"neg_mean_squared_error",
"neg_mean_absolute_error",
"neg_root_mean_squared_error",
),
) )
print("Negative Mean Squared Error", np.median(lasso_reg_score['test_neg_mean_squared_error'])) print(
print("Negative Mean Absolute Error", np.median(lasso_reg_score['test_neg_mean_absolute_error'])) "Negative Mean Squared Error",
print("Negative Root Mean Squared Error", np.median(lasso_reg_score['test_neg_root_mean_squared_error'])) np.median(lasso_reg_score["test_neg_mean_squared_error"]),
print("R2", np.median(lasso_reg_score['test_r2'])) )
print(
"Negative Mean Absolute Error",
np.median(lasso_reg_score["test_neg_mean_absolute_error"]),
)
print(
"Negative Root Mean Squared Error",
np.median(lasso_reg_score["test_neg_root_mean_squared_error"]),
)
print("R2", np.median(lasso_reg_score["test_r2"]))
# %% [markdown] # %% [markdown]
# ### Bayesian Ridge # ### Bayesian Ridge
@ -260,12 +287,26 @@ bayesian_ridge_reg_score = cross_validate(
groups=data_groups, groups=data_groups,
cv=logo, cv=logo,
n_jobs=-1, n_jobs=-1,
scoring=('r2', 'neg_mean_squared_error', 'neg_mean_absolute_error', 'neg_root_mean_squared_error') scoring=(
"r2",
"neg_mean_squared_error",
"neg_mean_absolute_error",
"neg_root_mean_squared_error",
),
) )
print("Negative Mean Squared Error", np.median(bayesian_ridge_reg_score['test_neg_mean_squared_error'])) print(
print("Negative Mean Absolute Error", np.median(bayesian_ridge_reg_score['test_neg_mean_absolute_error'])) "Negative Mean Squared Error",
print("Negative Root Mean Squared Error", np.median(bayesian_ridge_reg_score['test_neg_root_mean_squared_error'])) np.median(bayesian_ridge_reg_score["test_neg_mean_squared_error"]),
print("R2", np.median(bayesian_ridge_reg_score['test_r2'])) )
print(
"Negative Mean Absolute Error",
np.median(bayesian_ridge_reg_score["test_neg_mean_absolute_error"]),
)
print(
"Negative Root Mean Squared Error",
np.median(bayesian_ridge_reg_score["test_neg_root_mean_squared_error"]),
)
print("R2", np.median(bayesian_ridge_reg_score["test_r2"]))
# %% [markdown] # %% [markdown]
# ### RANSAC (outlier robust regression) # ### RANSAC (outlier robust regression)
@ -281,12 +322,26 @@ ransac_reg_scores = cross_validate(
groups=data_groups, groups=data_groups,
cv=logo, cv=logo,
n_jobs=-1, n_jobs=-1,
scoring=('r2', 'neg_mean_squared_error', 'neg_mean_absolute_error', 'neg_root_mean_squared_error') scoring=(
"r2",
"neg_mean_squared_error",
"neg_mean_absolute_error",
"neg_root_mean_squared_error",
),
) )
print("Negative Mean Squared Error", np.median(ransac_reg_scores['test_neg_mean_squared_error'])) print(
print("Negative Mean Absolute Error", np.median(ransac_reg_scores['test_neg_mean_absolute_error'])) "Negative Mean Squared Error",
print("Negative Root Mean Squared Error", np.median(ransac_reg_scores['test_neg_root_mean_squared_error'])) np.median(ransac_reg_scores["test_neg_mean_squared_error"]),
print("R2", np.median(ransac_reg_scores['test_r2'])) )
print(
"Negative Mean Absolute Error",
np.median(ransac_reg_scores["test_neg_mean_absolute_error"]),
)
print(
"Negative Root Mean Squared Error",
np.median(ransac_reg_scores["test_neg_root_mean_squared_error"]),
)
print("R2", np.median(ransac_reg_scores["test_r2"]))
# %% [markdown] # %% [markdown]
# ### Support vector regression # ### Support vector regression
@ -302,12 +357,25 @@ svr_scores = cross_validate(
groups=data_groups, groups=data_groups,
cv=logo, cv=logo,
n_jobs=-1, n_jobs=-1,
scoring=('r2', 'neg_mean_squared_error', 'neg_mean_absolute_error', 'neg_root_mean_squared_error') scoring=(
"r2",
"neg_mean_squared_error",
"neg_mean_absolute_error",
"neg_root_mean_squared_error",
),
) )
print("Negative Mean Squared Error", np.median(svr_scores['test_neg_mean_squared_error'])) print(
print("Negative Mean Absolute Error", np.median(svr_scores['test_neg_mean_absolute_error'])) "Negative Mean Squared Error", np.median(svr_scores["test_neg_mean_squared_error"])
print("Negative Root Mean Squared Error", np.median(svr_scores['test_neg_root_mean_squared_error'])) )
print("R2", np.median(svr_scores['test_r2'])) print(
"Negative Mean Absolute Error",
np.median(svr_scores["test_neg_mean_absolute_error"]),
)
print(
"Negative Root Mean Squared Error",
np.median(svr_scores["test_neg_root_mean_squared_error"]),
)
print("R2", np.median(svr_scores["test_r2"]))
# %% [markdown] # %% [markdown]
# ### Kernel Ridge regression # ### Kernel Ridge regression
@ -323,12 +391,26 @@ kridge_scores = cross_validate(
groups=data_groups, groups=data_groups,
cv=logo, cv=logo,
n_jobs=-1, n_jobs=-1,
scoring=('r2', 'neg_mean_squared_error', 'neg_mean_absolute_error', 'neg_root_mean_squared_error') scoring=(
"r2",
"neg_mean_squared_error",
"neg_mean_absolute_error",
"neg_root_mean_squared_error",
),
) )
print("Negative Mean Squared Error", np.median(kridge_scores['test_neg_mean_squared_error'])) print(
print("Negative Mean Absolute Error", np.median(kridge_scores['test_neg_mean_absolute_error'])) "Negative Mean Squared Error",
print("Negative Root Mean Squared Error", np.median(kridge_scores['test_neg_root_mean_squared_error'])) np.median(kridge_scores["test_neg_mean_squared_error"]),
print("R2", np.median(kridge_scores['test_r2'])) )
print(
"Negative Mean Absolute Error",
np.median(kridge_scores["test_neg_mean_absolute_error"]),
)
print(
"Negative Root Mean Squared Error",
np.median(kridge_scores["test_neg_root_mean_squared_error"]),
)
print("R2", np.median(kridge_scores["test_r2"]))
# %% [markdown] # %% [markdown]
# ### Gaussian Process Regression # ### Gaussian Process Regression
@ -345,11 +427,24 @@ gpr_scores = cross_validate(
groups=data_groups, groups=data_groups,
cv=logo, cv=logo,
n_jobs=-1, n_jobs=-1,
scoring=('r2', 'neg_mean_squared_error', 'neg_mean_absolute_error', 'neg_root_mean_squared_error') scoring=(
"r2",
"neg_mean_squared_error",
"neg_mean_absolute_error",
"neg_root_mean_squared_error",
),
) )
print("Negative Mean Squared Error", np.median(gpr_scores['test_neg_mean_squared_error'])) print(
print("Negative Mean Absolute Error", np.median(gpr_scores['test_neg_mean_absolute_error'])) "Negative Mean Squared Error", np.median(gpr_scores["test_neg_mean_squared_error"])
print("Negative Root Mean Squared Error", np.median(gpr_scores['test_neg_root_mean_squared_error'])) )
print("R2", np.median(gpr_scores['test_r2'])) print(
"Negative Mean Absolute Error",
np.median(gpr_scores["test_neg_mean_absolute_error"]),
)
print(
"Negative Root Mean Squared Error",
np.median(gpr_scores["test_neg_root_mean_squared_error"]),
)
print("R2", np.median(gpr_scores["test_r2"]))
# %% # %%

View File

@ -1,15 +1,18 @@
from pathlib import Path from pathlib import Path
from sklearn import linear_model, svm, kernel_ridge, gaussian_process, ensemble, naive_bayes, neighbors, tree
from sklearn.model_selection import LeaveOneGroupOut, cross_validate, cross_validate
from sklearn.metrics import mean_squared_error, r2_score
from sklearn.impute import SimpleImputer
from sklearn.dummy import DummyRegressor, DummyClassifier
from xgboost import XGBRegressor, XGBClassifier
import xgboost as xg
import pandas as pd
import numpy as np import numpy as np
import pandas as pd
from sklearn import (
ensemble,
gaussian_process,
kernel_ridge,
linear_model,
naive_bayes,
svm,
)
from sklearn.dummy import DummyClassifier, DummyRegressor
from sklearn.model_selection import LeaveOneGroupOut, cross_validate
from xgboost import XGBClassifier, XGBRegressor
def safe_outer_merge_on_index(left: pd.DataFrame, right: pd.DataFrame) -> pd.DataFrame: def safe_outer_merge_on_index(left: pd.DataFrame, right: pd.DataFrame) -> pd.DataFrame:
@ -65,28 +68,64 @@ def construct_full_path(folder: Path, filename_prefix: str, data_type: str) -> P
full_path = folder / export_filename full_path = folder / export_filename
return full_path return full_path
def insert_row(df, row): def insert_row(df, row):
return pd.concat([df, pd.DataFrame([row], columns=df.columns)], ignore_index=True) return pd.concat([df, pd.DataFrame([row], columns=df.columns)], ignore_index=True)
def prepare_regression_model_input(input_csv):
model_input = pd.read_csv(input_csv) def prepare_regression_model_input(model_input, cv_method="logo"):
index_columns = [
index_columns = ["local_segment", "local_segment_label", "local_segment_start_datetime", "local_segment_end_datetime"] "local_segment",
"local_segment_label",
"local_segment_start_datetime",
"local_segment_end_datetime",
]
model_input.set_index(index_columns, inplace=True) model_input.set_index(index_columns, inplace=True)
data_x, data_y, data_groups = model_input.drop(["target", "pid"], axis=1), model_input["target"], model_input["pid"] if cv_method == "logo":
data_x, data_y, data_groups = (
model_input.drop(["target", "pid"], axis=1),
model_input["target"],
model_input["pid"],
)
else:
model_input["pid_index"] = model_input.groupby("pid").cumcount()
model_input["pid_count"] = model_input.groupby("pid")["pid"].transform("count")
categorical_feature_colnames = ["gender", "startlanguage", "limesurvey_demand_control_ratio_quartile"] model_input["pid_index"] = (
additional_categorical_features = [col for col in data_x.columns if "mostcommonactivity" in col or "homelabel" in col] model_input["pid_index"] / model_input["pid_count"] + 1
).round()
model_input["pid_half"] = (
model_input["pid"] + "_" + model_input["pid_index"].astype(int).astype(str)
)
data_x, data_y, data_groups = (
model_input.drop(["target", "pid", "pid_index", "pid_half"], axis=1),
model_input["target"],
model_input["pid_half"],
)
categorical_feature_colnames = [
"gender",
"startlanguage",
"limesurvey_demand_control_ratio_quartile",
]
additional_categorical_features = [
col
for col in data_x.columns
if "mostcommonactivity" in col or "homelabel" in col
]
categorical_feature_colnames += additional_categorical_features categorical_feature_colnames += additional_categorical_features
#TODO: check whether limesurvey_demand_control_ratio_quartile NaNs could be replaced meaningfully
categorical_features = data_x[categorical_feature_colnames].copy() categorical_features = data_x[categorical_feature_colnames].copy()
mode_categorical_features = categorical_features.mode().iloc[0] mode_categorical_features = categorical_features.mode().iloc[0]
# fillna with mode # fillna with mode
categorical_features = categorical_features.fillna(mode_categorical_features) categorical_features = categorical_features.fillna(mode_categorical_features)
# one-hot encoding # one-hot encoding
categorical_features = categorical_features.apply(lambda col: col.astype("category")) categorical_features = categorical_features.apply(
lambda col: col.astype("category")
)
if not categorical_features.empty: if not categorical_features.empty:
categorical_features = pd.get_dummies(categorical_features) categorical_features = pd.get_dummies(categorical_features)
@ -108,7 +147,7 @@ def run_all_regression_models(input_csv):
data_y, data_y,
groups=data_groups, groups=data_groups,
) )
metrics = ['r2', 'neg_mean_absolute_error', 'neg_root_mean_squared_error'] metrics = ["r2", "neg_mean_absolute_error", "neg_root_mean_squared_error"]
test_metrics = ["test_" + metric for metric in metrics] test_metrics = ["test_" + metric for metric in metrics]
scores = pd.DataFrame(columns=["method", "max", "nanmedian"]) scores = pd.DataFrame(columns=["method", "max", "nanmedian"])
@ -121,13 +160,13 @@ def run_all_regression_models(input_csv):
groups=data_groups, groups=data_groups,
cv=logo, cv=logo,
n_jobs=-1, n_jobs=-1,
scoring=metrics scoring=metrics,
) )
print("Dummy model:") print("Dummy model:")
print("R^2: ", np.nanmedian(dummy_regr_scores['test_r2'])) print("R^2: ", np.nanmedian(dummy_regr_scores["test_r2"]))
scores_df = pd.DataFrame(dummy_regr_scores)[test_metrics] scores_df = pd.DataFrame(dummy_regr_scores)[test_metrics]
scores_df = scores_df.agg(['max', np.nanmedian]).transpose() scores_df = scores_df.agg(["max", np.nanmedian]).transpose()
scores_df["method"] = "dummy" scores_df["method"] = "dummy"
scores = pd.concat([scores, scores_df]) scores = pd.concat([scores, scores_df])
@ -139,17 +178,17 @@ def run_all_regression_models(input_csv):
groups=data_groups, groups=data_groups,
cv=logo, cv=logo,
n_jobs=-1, n_jobs=-1,
scoring=metrics scoring=metrics,
) )
print("Linear regression:") print("Linear regression:")
print("R^2: ", np.nanmedian(lin_reg_scores['test_r2'])) print("R^2: ", np.nanmedian(lin_reg_scores["test_r2"]))
scores_df = pd.DataFrame(lin_reg_scores)[test_metrics] scores_df = pd.DataFrame(lin_reg_scores)[test_metrics]
scores_df = scores_df.agg(['max', np.nanmedian]).transpose() scores_df = scores_df.agg(["max", np.nanmedian]).transpose()
scores_df["method"] = "linear_reg" scores_df["method"] = "linear_reg"
scores = pd.concat([scores, scores_df]) scores = pd.concat([scores, scores_df])
ridge_reg = linear_model.Ridge(alpha=.5) ridge_reg = linear_model.Ridge(alpha=0.5)
ridge_reg_scores = cross_validate( ridge_reg_scores = cross_validate(
ridge_reg, ridge_reg,
X=data_x, X=data_x,
@ -157,16 +196,15 @@ def run_all_regression_models(input_csv):
groups=data_groups, groups=data_groups,
cv=logo, cv=logo,
n_jobs=-1, n_jobs=-1,
scoring=metrics scoring=metrics,
) )
print("Ridge regression") print("Ridge regression")
scores_df = pd.DataFrame(ridge_reg_scores)[test_metrics] scores_df = pd.DataFrame(ridge_reg_scores)[test_metrics]
scores_df = scores_df.agg(['max', np.nanmedian]).transpose() scores_df = scores_df.agg(["max", np.nanmedian]).transpose()
scores_df["method"] = "ridge_reg" scores_df["method"] = "ridge_reg"
scores = pd.concat([scores, scores_df]) scores = pd.concat([scores, scores_df])
lasso_reg = linear_model.Lasso(alpha=0.1) lasso_reg = linear_model.Lasso(alpha=0.1)
lasso_reg_score = cross_validate( lasso_reg_score = cross_validate(
lasso_reg, lasso_reg,
@ -175,12 +213,12 @@ def run_all_regression_models(input_csv):
groups=data_groups, groups=data_groups,
cv=logo, cv=logo,
n_jobs=-1, n_jobs=-1,
scoring=metrics scoring=metrics,
) )
print("Lasso regression") print("Lasso regression")
scores_df = pd.DataFrame(lasso_reg_score)[test_metrics] scores_df = pd.DataFrame(lasso_reg_score)[test_metrics]
scores_df = scores_df.agg(['max', np.nanmedian]).transpose() scores_df = scores_df.agg(["max", np.nanmedian]).transpose()
scores_df["method"] = "lasso_reg" scores_df["method"] = "lasso_reg"
scores = pd.concat([scores, scores_df]) scores = pd.concat([scores, scores_df])
@ -192,12 +230,12 @@ def run_all_regression_models(input_csv):
groups=data_groups, groups=data_groups,
cv=logo, cv=logo,
n_jobs=-1, n_jobs=-1,
scoring=metrics scoring=metrics,
) )
print("Bayesian Ridge") print("Bayesian Ridge")
scores_df = pd.DataFrame(bayesian_ridge_reg_score)[test_metrics] scores_df = pd.DataFrame(bayesian_ridge_reg_score)[test_metrics]
scores_df = scores_df.agg(['max', np.nanmedian]).transpose() scores_df = scores_df.agg(["max", np.nanmedian]).transpose()
scores_df["method"] = "bayesian_ridge" scores_df["method"] = "bayesian_ridge"
scores = pd.concat([scores, scores_df]) scores = pd.concat([scores, scores_df])
@ -209,29 +247,23 @@ def run_all_regression_models(input_csv):
groups=data_groups, groups=data_groups,
cv=logo, cv=logo,
n_jobs=-1, n_jobs=-1,
scoring=metrics scoring=metrics,
) )
print("RANSAC (outlier robust regression)") print("RANSAC (outlier robust regression)")
scores_df = pd.DataFrame(ransac_reg_score)[test_metrics] scores_df = pd.DataFrame(ransac_reg_score)[test_metrics]
scores_df = scores_df.agg(['max', np.nanmedian]).transpose() scores_df = scores_df.agg(["max", np.nanmedian]).transpose()
scores_df["method"] = "RANSAC" scores_df["method"] = "RANSAC"
scores = pd.concat([scores, scores_df]) scores = pd.concat([scores, scores_df])
svr = svm.SVR() svr = svm.SVR()
svr_score = cross_validate( svr_score = cross_validate(
svr, svr, X=data_x, y=data_y, groups=data_groups, cv=logo, n_jobs=-1, scoring=metrics
X=data_x,
y=data_y,
groups=data_groups,
cv=logo,
n_jobs=-1,
scoring=metrics
) )
print("Support vector regression") print("Support vector regression")
scores_df = pd.DataFrame(svr_score)[test_metrics] scores_df = pd.DataFrame(svr_score)[test_metrics]
scores_df = scores_df.agg(['max', np.nanmedian]).transpose() scores_df = scores_df.agg(["max", np.nanmedian]).transpose()
scores_df["method"] = "SVR" scores_df["method"] = "SVR"
scores = pd.concat([scores, scores_df]) scores = pd.concat([scores, scores_df])
@ -243,80 +275,56 @@ def run_all_regression_models(input_csv):
groups=data_groups, groups=data_groups,
cv=logo, cv=logo,
n_jobs=-1, n_jobs=-1,
scoring=metrics scoring=metrics,
) )
print("Kernel Ridge regression") print("Kernel Ridge regression")
scores_df = pd.DataFrame(kridge_score)[test_metrics] scores_df = pd.DataFrame(kridge_score)[test_metrics]
scores_df = scores_df.agg(['max', np.nanmedian]).transpose() scores_df = scores_df.agg(["max", np.nanmedian]).transpose()
scores_df["method"] = "kernel_ridge" scores_df["method"] = "kernel_ridge"
scores = pd.concat([scores, scores_df]) scores = pd.concat([scores, scores_df])
gpr = gaussian_process.GaussianProcessRegressor() gpr = gaussian_process.GaussianProcessRegressor()
gpr_score = cross_validate( gpr_score = cross_validate(
gpr, gpr, X=data_x, y=data_y, groups=data_groups, cv=logo, n_jobs=-1, scoring=metrics
X=data_x,
y=data_y,
groups=data_groups,
cv=logo,
n_jobs=-1,
scoring=metrics
) )
print("Gaussian Process Regression") print("Gaussian Process Regression")
scores_df = pd.DataFrame(gpr_score)[test_metrics] scores_df = pd.DataFrame(gpr_score)[test_metrics]
scores_df = scores_df.agg(['max', np.nanmedian]).transpose() scores_df = scores_df.agg(["max", np.nanmedian]).transpose()
scores_df["method"] = "gaussian_proc" scores_df["method"] = "gaussian_proc"
scores = pd.concat([scores, scores_df]) scores = pd.concat([scores, scores_df])
rfr = ensemble.RandomForestRegressor(max_features=0.3, n_jobs=-1) rfr = ensemble.RandomForestRegressor(max_features=0.3, n_jobs=-1)
rfr_score = cross_validate( rfr_score = cross_validate(
rfr, rfr, X=data_x, y=data_y, groups=data_groups, cv=logo, n_jobs=-1, scoring=metrics
X=data_x,
y=data_y,
groups=data_groups,
cv=logo,
n_jobs=-1,
scoring=metrics
) )
print("Random Forest Regression") print("Random Forest Regression")
scores_df = pd.DataFrame(rfr_score)[test_metrics] scores_df = pd.DataFrame(rfr_score)[test_metrics]
scores_df = scores_df.agg(['max', np.nanmedian]).transpose() scores_df = scores_df.agg(["max", np.nanmedian]).transpose()
scores_df["method"] = "random_forest" scores_df["method"] = "random_forest"
scores = pd.concat([scores, scores_df]) scores = pd.concat([scores, scores_df])
xgb = XGBRegressor() xgb = XGBRegressor()
xgb_score = cross_validate( xgb_score = cross_validate(
xgb, xgb, X=data_x, y=data_y, groups=data_groups, cv=logo, n_jobs=-1, scoring=metrics
X=data_x,
y=data_y,
groups=data_groups,
cv=logo,
n_jobs=-1,
scoring=metrics
) )
print("XGBoost Regressor") print("XGBoost Regressor")
scores_df = pd.DataFrame(xgb_score)[test_metrics] scores_df = pd.DataFrame(xgb_score)[test_metrics]
scores_df = scores_df.agg(['max', np.nanmedian]).transpose() scores_df = scores_df.agg(["max", np.nanmedian]).transpose()
scores_df["method"] = "XGBoost" scores_df["method"] = "XGBoost"
scores = pd.concat([scores, scores_df]) scores = pd.concat([scores, scores_df])
ada = ensemble.AdaBoostRegressor() ada = ensemble.AdaBoostRegressor()
ada_score = cross_validate( ada_score = cross_validate(
ada, ada, X=data_x, y=data_y, groups=data_groups, cv=logo, n_jobs=-1, scoring=metrics
X=data_x,
y=data_y,
groups=data_groups,
cv=logo,
n_jobs=-1,
scoring=metrics
) )
print("ADA Boost Regressor") print("ADA Boost Regressor")
scores_df = pd.DataFrame(ada_score)[test_metrics] scores_df = pd.DataFrame(ada_score)[test_metrics]
scores_df = scores_df.agg(['max', np.nanmedian]).transpose() scores_df = scores_df.agg(["max", np.nanmedian]).transpose()
scores_df["method"] = "ADA_boost" scores_df["method"] = "ADA_boost"
scores = pd.concat([scores, scores_df]) scores = pd.concat([scores, scores_df])
@ -324,7 +332,7 @@ def run_all_regression_models(input_csv):
def run_all_classification_models(data_x, data_y, data_groups, cv_method): def run_all_classification_models(data_x, data_y, data_groups, cv_method):
metrics = ['accuracy', 'average_precision', 'recall', 'f1'] metrics = ["accuracy", "average_precision", "recall", "f1"]
test_metrics = ["test_" + metric for metric in metrics] test_metrics = ["test_" + metric for metric in metrics]
scores = pd.DataFrame(columns=["method", "max", "mean"]) scores = pd.DataFrame(columns=["method", "max", "mean"])
@ -332,127 +340,127 @@ def run_all_classification_models(data_x, data_y, data_groups, cv_method):
dummy_class = DummyClassifier(strategy="most_frequent") dummy_class = DummyClassifier(strategy="most_frequent")
dummy_score = cross_validate( dummy_score = cross_validate(
dummy_class, dummy_class,
X=data_x, X=data_x,
y=data_y, y=data_y,
groups=data_groups, groups=data_groups,
cv=cv_method, cv=cv_method,
n_jobs=-1, n_jobs=-1,
error_score='raise', error_score="raise",
scoring=metrics scoring=metrics,
) )
print("Dummy") print("Dummy")
scores_df = pd.DataFrame(dummy_score)[test_metrics] scores_df = pd.DataFrame(dummy_score)[test_metrics]
scores_df = scores_df.agg(['max', 'mean']).transpose() scores_df = scores_df.agg(["max", "mean"]).transpose()
scores_df["method"] = "Dummy" scores_df["method"] = "Dummy"
scores = pd.concat([scores, scores_df]) scores = pd.concat([scores, scores_df])
logistic_regression = linear_model.LogisticRegression() logistic_regression = linear_model.LogisticRegression()
log_reg_scores = cross_validate( log_reg_scores = cross_validate(
logistic_regression, logistic_regression,
X=data_x, X=data_x,
y=data_y, y=data_y,
groups=data_groups, groups=data_groups,
cv=cv_method, cv=cv_method,
n_jobs=-1, n_jobs=-1,
scoring=metrics scoring=metrics,
) )
print("Logistic regression") print("Logistic regression")
scores_df = pd.DataFrame(log_reg_scores)[test_metrics] scores_df = pd.DataFrame(log_reg_scores)[test_metrics]
scores_df = scores_df.agg(['max', 'mean']).transpose() scores_df = scores_df.agg(["max", "mean"]).transpose()
scores_df["method"] = "logistic_reg" scores_df["method"] = "logistic_reg"
scores = pd.concat([scores, scores_df]) scores = pd.concat([scores, scores_df])
svc = svm.SVC() svc = svm.SVC()
svc_scores = cross_validate( svc_scores = cross_validate(
svc, svc,
X=data_x, X=data_x,
y=data_y, y=data_y,
groups=data_groups, groups=data_groups,
cv=cv_method, cv=cv_method,
n_jobs=-1, n_jobs=-1,
scoring=metrics scoring=metrics,
) )
print("Support Vector Machine") print("Support Vector Machine")
scores_df = pd.DataFrame(svc_scores)[test_metrics] scores_df = pd.DataFrame(svc_scores)[test_metrics]
scores_df = scores_df.agg(['max', 'mean']).transpose() scores_df = scores_df.agg(["max", "mean"]).transpose()
scores_df["method"] = "svc" scores_df["method"] = "svc"
scores = pd.concat([scores, scores_df]) scores = pd.concat([scores, scores_df])
gaussian_nb = naive_bayes.GaussianNB() gaussian_nb = naive_bayes.GaussianNB()
gaussian_nb_scores = cross_validate( gaussian_nb_scores = cross_validate(
gaussian_nb, gaussian_nb,
X=data_x, X=data_x,
y=data_y, y=data_y,
groups=data_groups, groups=data_groups,
cv=cv_method, cv=cv_method,
n_jobs=-1, n_jobs=-1,
scoring=metrics scoring=metrics,
) )
print("Gaussian Naive Bayes") print("Gaussian Naive Bayes")
scores_df = pd.DataFrame(gaussian_nb_scores)[test_metrics] scores_df = pd.DataFrame(gaussian_nb_scores)[test_metrics]
scores_df = scores_df.agg(['max', 'mean']).transpose() scores_df = scores_df.agg(["max", "mean"]).transpose()
scores_df["method"] = "gaussian_naive_bayes" scores_df["method"] = "gaussian_naive_bayes"
scores = pd.concat([scores, scores_df]) scores = pd.concat([scores, scores_df])
sgdc = linear_model.SGDClassifier() sgdc = linear_model.SGDClassifier()
sgdc_scores = cross_validate( sgdc_scores = cross_validate(
sgdc, sgdc,
X=data_x, X=data_x,
y=data_y, y=data_y,
groups=data_groups, groups=data_groups,
cv=cv_method, cv=cv_method,
n_jobs=-1, n_jobs=-1,
scoring=metrics scoring=metrics,
) )
print("Stochastic Gradient Descent") print("Stochastic Gradient Descent")
scores_df = pd.DataFrame(sgdc_scores)[test_metrics] scores_df = pd.DataFrame(sgdc_scores)[test_metrics]
scores_df = scores_df.agg(['max', 'mean']).transpose() scores_df = scores_df.agg(["max", "mean"]).transpose()
scores_df["method"] = "stochastic_gradient_descent" scores_df["method"] = "stochastic_gradient_descent"
scores = pd.concat([scores, scores_df]) scores = pd.concat([scores, scores_df])
rfc = ensemble.RandomForestClassifier() rfc = ensemble.RandomForestClassifier()
rfc_scores = cross_validate( rfc_scores = cross_validate(
rfc, rfc,
X=data_x, X=data_x,
y=data_y, y=data_y,
groups=data_groups, groups=data_groups,
cv=cv_method, cv=cv_method,
n_jobs=-1, n_jobs=-1,
scoring=metrics scoring=metrics,
) )
print("Random Forest") print("Random Forest")
scores_df = pd.DataFrame(rfc_scores)[test_metrics] scores_df = pd.DataFrame(rfc_scores)[test_metrics]
scores_df = scores_df.agg(['max', 'mean']).transpose() scores_df = scores_df.agg(["max", "mean"]).transpose()
scores_df["method"] = "random_forest" scores_df["method"] = "random_forest"
scores = pd.concat([scores, scores_df]) scores = pd.concat([scores, scores_df])
xgb_classifier = XGBClassifier() xgb_classifier = XGBClassifier()
xgb_scores = cross_validate( xgb_scores = cross_validate(
xgb_classifier, xgb_classifier,
X=data_x, X=data_x,
y=data_y, y=data_y,
groups=data_groups, groups=data_groups,
cv=cv_method, cv=cv_method,
n_jobs=-1, n_jobs=-1,
scoring=metrics scoring=metrics,
) )
print("XGBoost") print("XGBoost")
scores_df = pd.DataFrame(xgb_scores)[test_metrics] scores_df = pd.DataFrame(xgb_scores)[test_metrics]
scores_df = scores_df.agg(['max', 'mean']).transpose() scores_df = scores_df.agg(["max", "mean"]).transpose()
scores_df["method"] = "xgboost" scores_df["method"] = "xgboost"
scores = pd.concat([scores, scores_df]) scores = pd.concat([scores, scores_df])

View File

@ -34,18 +34,114 @@ df_app_categories <- tbl(con, "app_categories") %>%
head(df_app_categories) head(df_app_categories)
table(df_app_categories$play_store_genre) table(df_app_categories$play_store_genre)
# Correct some mistakes df_app_categories %>%
df_app_categories %<>% mutate( filter(play_store_genre == "not_found") %>%
play_store_genre = { group_by(play_store_response) %>%
function(x) { count()
# All "not_found" have an HTTP status of 404.
df_app_categories %>%
filter(play_store_genre == "not_found") %>%
group_by(package_name) %>%
count() %>%
arrange(desc(n))
# All "not_found" apps are unique.
# Exclude phone manufacturers, custom ROM names and similar.
manufacturers <- c(
"samsung",
"oneplus",
"huawei",
"xiaomi",
"lge",
"motorola",
"miui",
"lenovo",
"oppo",
"mediatek"
)
custom_rom <- c("coloros", "lineageos", "myos", "cyanogenmod", "foundation.e")
other <- c("android", "wssyncmldm")
grep_pattern <- paste(c(manufacturers, custom_rom, other), collapse = "|")
rows_os_manufacturer <- grepl(grep_pattern, df_app_categories$package_name)
# Explore what remains after excluding above.
df_app_categories[!rows_os_manufacturer, ] %>%
filter(play_store_genre == "not_found")
# Also check the relationship between is_system_app and System category.
tbl(con, "applications") %>%
filter(is_system_app, play_store_genre != "System") %>%
count()
# They are perfectly correlated.
# Manually classify apps
df_app_categories[df_app_categories$play_store_genre == "not_found",] <-
df_app_categories %>%
filter(play_store_genre == "not_found") %>%
mutate(
play_store_genre =
case_when( case_when(
x == "Education,Education" ~ "Education", str_detect(str_to_lower(package_name), grep_pattern) ~ "System",
x == "EducationEducation" ~ "Education", str_detect(str_to_lower(package_name), "straw") ~ "STRAW",
x == "not_found" ~ "System", str_detect(str_to_lower(package_name), "chromium") ~ "Communication", # Same as chrome.
.default = x str_detect(str_to_lower(package_name), "skype") ~ "Communication", # Skype Lite not classified.
str_detect(str_to_lower(package_name), "imsservice") ~ "Communication", # IP Multimedia Subsystem
str_detect(str_to_lower(package_name), paste(c("covid", "empatica"), collapse = "|")) ~ "Medical",
str_detect(str_to_lower(package_name), paste(c("libri", "tachiyomi"), collapse = "|")) ~ "Books & Reference",
str_detect(str_to_lower(package_name), paste(c("bricks", "chess"), collapse = "|")) ~ "Casual",
str_detect(str_to_lower(package_name), "weather") ~ "Weather",
str_detect(str_to_lower(package_name), "excel") ~ "Productivity",
str_detect(str_to_lower(package_name), paste(c("qr", "barcode", "archimedes", "mixplorer", "winrar", "filemanager", "shot", "faceunlock", "signin", "milink"), collapse = "|")) ~ "Tools",
str_detect(str_to_lower(package_name), "stupeflix") ~ "Photography",
str_detect(str_to_lower(package_name), "anyme") ~ "Entertainment",
str_detect(str_to_lower(package_name), "vanced") ~ "Video Players & Editors",
str_detect(str_to_lower(package_name), paste(c("music", "radio", "dolby"), collapse = "|")) ~ "Music & Audio",
str_detect(str_to_lower(package_name), paste(c("tensorflow", "object_detection"), collapse = "|")) ~ "Education",
.default = play_store_genre
) )
} )
}(play_store_genre)
# Explore what remains after classifying above.
df_app_categories %>%
filter(play_store_genre == "not_found")
# After this, 13 applications remain, which I will classify as "Other".
# Correct some mistakes
# And classify 'not_found'
df_app_categories %<>%
mutate(
play_store_genre = {
function(x) {
case_when(
x == "Education,Education" ~ "Education",
x == "EducationEducation" ~ "Education",
x == "not_found" ~ "Other",
.default = x
)
}
}(play_store_genre)
) %>%
select(-package_name) %>%
rename(
genre = play_store_genre,
package_name = package_hash
)
table(df_app_categories$genre)
df_app_categories %>%
group_by(genre) %>%
count() %>%
arrange(desc(n)) %>%
write_csv("play_store_categories_count.csv")
write_csv(
x = select(df_app_categories, c(package_name, genre)),
file = "play_store_application_genre_catalogue.csv"
) )
dbDisconnect(con) dbDisconnect(con)

File diff suppressed because it is too large Load Diff

View File

@ -0,0 +1,45 @@
genre,n
System,261
Tools,96
Productivity,71
Health & Fitness,60
Finance,54
Communication,39
Music & Audio,39
Shopping,38
Lifestyle,33
Education,28
News & Magazines,24
Maps & Navigation,23
Entertainment,21
Business,18
Travel & Local,18
Books & Reference,16
Social,16
Weather,16
Food & Drink,14
Sports,14
Other,13
Photography,13
Puzzle,13
Video Players & Editors,12
Card,9
Casual,9
Personalization,8
Medical,7
Board,5
Strategy,4
House & Home,3
Trivia,3
Word,3
Adventure,2
Art & Design,2
Auto & Vehicles,2
Dating,2
Role Playing,2
STRAW,2
Simulation,2
"Board,Brain Games",1
"Entertainment,Music & Video",1
Parenting,1
Racing,1
1 genre n
2 System 261
3 Tools 96
4 Productivity 71
5 Health & Fitness 60
6 Finance 54
7 Communication 39
8 Music & Audio 39
9 Shopping 38
10 Lifestyle 33
11 Education 28
12 News & Magazines 24
13 Maps & Navigation 23
14 Entertainment 21
15 Business 18
16 Travel & Local 18
17 Books & Reference 16
18 Social 16
19 Weather 16
20 Food & Drink 14
21 Sports 14
22 Other 13
23 Photography 13
24 Puzzle 13
25 Video Players & Editors 12
26 Card 9
27 Casual 9
28 Personalization 8
29 Medical 7
30 Board 5
31 Strategy 4
32 House & Home 3
33 Trivia 3
34 Word 3
35 Adventure 2
36 Art & Design 2
37 Auto & Vehicles 2
38 Dating 2
39 Role Playing 2
40 STRAW 2
41 Simulation 2
42 Board,Brain Games 1
43 Entertainment,Music & Video 1
44 Parenting 1
45 Racing 1

2
rapids

@ -1 +1 @@
Subproject commit 03687a1ac204f0a4347eb758dada8005f68b0bb1 Subproject commit 63f5a526fce4d288499168e1701adadb8b885d82