Compare commits

..

No commits in common. "c66e046014943d2be1eade5019d4bd937cec4238" and "297eb459338ea5624ba3699e826ae3d9931a81a1" have entirely different histories.

8 changed files with 289 additions and 1552 deletions

View File

@ -3,5 +3,6 @@
<component name="VcsDirectoryMappings">
<mapping directory="$PROJECT_DIR$" vcs="Git" />
<mapping directory="$PROJECT_DIR$/rapids" vcs="Git" />
<mapping directory="$PROJECT_DIR$/rapids/calculatingfeatures" vcs="Git" />
</component>
</project>

View File

@ -1,8 +1,9 @@
name: straw2analysis
channels:
- defaults
- conda-forge
dependencies:
- python=3.11
- python=3.9
- black
- isort
- flake8
@ -23,4 +24,3 @@ dependencies:
- sqlalchemy
- statsmodels
- tabulate
- xgboost

View File

@ -15,34 +15,91 @@
# %% jupyter={"source_hidden": true}
# %matplotlib inline
import datetime
import importlib
import os
import sys
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd
import xgboost as xg
from machine_learning.helper import prepare_regression_model_input
from sklearn import gaussian_process, kernel_ridge, linear_model, svm
from sklearn.dummy import DummyRegressor
import seaborn as sns
import yaml
from pyprojroot import here
from sklearn import linear_model, svm, kernel_ridge, gaussian_process
from sklearn.model_selection import LeaveOneGroupOut, cross_val_score, cross_validate
from sklearn.metrics import mean_squared_error, r2_score
from sklearn.impute import SimpleImputer
from sklearn.model_selection import LeaveOneGroupOut, cross_validate
# from IPython.core.interactiveshell import InteractiveShell
# InteractiveShell.ast_node_interactivity = "all"
from sklearn.dummy import DummyRegressor
import xgboost as xg
from IPython.core.interactiveshell import InteractiveShell
InteractiveShell.ast_node_interactivity = "all"
nb_dir = os.path.split(os.getcwd())[0]
if nb_dir not in sys.path:
sys.path.append(nb_dir)
# %% jupyter={"source_hidden": true}
model_input = pd.read_csv(
"../data/intradaily_30_min_all_targets/input_JCQ_job_demand_mean.csv"
)
import machine_learning.features_sensor
import machine_learning.labels
import machine_learning.model
# %% [markdown]
# # RAPIDS models
# %% [markdown]
# ## PANAS negative affect
# %% jupyter={"source_hidden": true}
cv_method = "half_logo" # logo, half_logo, 5kfold
model_input = pd.read_csv("../data/intradaily_30_min_all_targets/input_JCQ_job_demand_mean.csv")
# %% jupyter={"source_hidden": true}
index_columns = ["local_segment", "local_segment_label", "local_segment_start_datetime", "local_segment_end_datetime"]
#if "pid" in model_input.columns:
# index_columns.append("pid")
model_input.set_index(index_columns, inplace=True)
cv_method = 'half_logo' # logo, half_logo, 5kfold
if cv_method == 'logo':
data_x, data_y, data_groups = model_input.drop(["target", "pid"], axis=1), model_input["target"], model_input["pid"]
else:
model_input['pid_index'] = model_input.groupby('pid').cumcount()
model_input['pid_count'] = model_input.groupby('pid')['pid'].transform('count')
model_input["pid_index"] = (model_input['pid_index'] / model_input['pid_count'] + 1).round()
model_input["pid_half"] = model_input["pid"] + "_" + model_input["pid_index"].astype(int).astype(str)
data_x, data_y, data_groups = model_input.drop(["target", "pid", "pid_index", "pid_half"], axis=1), model_input["target"], model_input["pid_half"]
# %% jupyter={"source_hidden": true}
categorical_feature_colnames = ["gender", "startlanguage"]
additional_categorical_features = [col for col in data_x.columns if "mostcommonactivity" in col or "homelabel" in col]
categorical_feature_colnames += additional_categorical_features
# %% jupyter={"source_hidden": true}
categorical_features = data_x[categorical_feature_colnames].copy()
# %% jupyter={"source_hidden": true}
mode_categorical_features = categorical_features.mode().iloc[0]
# %% jupyter={"source_hidden": true}
# fillna with mode
categorical_features = categorical_features.fillna(mode_categorical_features)
# %% jupyter={"source_hidden": true}
# one-hot encoding
categorical_features = categorical_features.apply(lambda col: col.astype("category"))
if not categorical_features.empty:
categorical_features = pd.get_dummies(categorical_features)
# %% jupyter={"source_hidden": true}
numerical_features = data_x.drop(categorical_feature_colnames, axis=1)
# %% jupyter={"source_hidden": true}
train_x = pd.concat([numerical_features, categorical_features], axis=1)
# %% jupyter={"source_hidden": true}
train_x.dtypes
train_x, data_y, data_groups = prepare_regression_model_input(model_input, cv_method)
# %% jupyter={"source_hidden": true}
logo = LeaveOneGroupOut()
logo.get_n_splits(
@ -52,7 +109,7 @@ logo.get_n_splits(
)
# For any other cv_method, pass cv=None so that cross_validate falls back to its default 5-fold split
if cv_method != "logo" and cv_method != "half_logo":
if cv_method != 'logo' and cv_method != 'half_logo':
logo = None
# %% jupyter={"source_hidden": true}
@ -63,7 +120,7 @@ sum(data_y.isna())
dummy_regr = DummyRegressor(strategy="mean")
# %% jupyter={"source_hidden": true}
imputer = SimpleImputer(missing_values=np.nan, strategy="mean")
imputer = SimpleImputer(missing_values=np.nan, strategy='mean')
# %% jupyter={"source_hidden": true}
dummy_regressor = cross_validate(
@ -73,26 +130,12 @@ dummy_regressor = cross_validate(
groups=data_groups,
cv=logo,
n_jobs=-1,
scoring=(
"r2",
"neg_mean_squared_error",
"neg_mean_absolute_error",
"neg_root_mean_squared_error",
),
scoring=('r2', 'neg_mean_squared_error', 'neg_mean_absolute_error', 'neg_root_mean_squared_error')
)
print(
"Negative Mean Squared Error",
np.median(dummy_regressor["test_neg_mean_squared_error"]),
)
print(
"Negative Mean Absolute Error",
np.median(dummy_regressor["test_neg_mean_absolute_error"]),
)
print(
"Negative Root Mean Squared Error",
np.median(dummy_regressor["test_neg_root_mean_squared_error"]),
)
print("R2", np.median(dummy_regressor["test_r2"]))
print("Negative Mean Squared Error", np.median(dummy_regressor['test_neg_mean_squared_error']))
print("Negative Mean Absolute Error", np.median(dummy_regressor['test_neg_mean_absolute_error']))
print("Negative Root Mean Squared Error", np.median(dummy_regressor['test_neg_root_mean_squared_error']))
print("R2", np.median(dummy_regressor['test_r2']))
# %% [markdown]
# ### Linear Regression
@ -100,7 +143,7 @@ print("R2", np.median(dummy_regressor["test_r2"]))
# %% jupyter={"source_hidden": true}
lin_reg_rapids = linear_model.LinearRegression()
# %% jupyter={"source_hidden": true}
imputer = SimpleImputer(missing_values=np.nan, strategy="mean")
imputer = SimpleImputer(missing_values=np.nan, strategy='mean')
# %% jupyter={"source_hidden": true}
lin_reg_scores = cross_validate(
@ -110,33 +153,19 @@ lin_reg_scores = cross_validate(
groups=data_groups,
cv=logo,
n_jobs=-1,
scoring=(
"r2",
"neg_mean_squared_error",
"neg_mean_absolute_error",
"neg_root_mean_squared_error",
),
scoring=('r2', 'neg_mean_squared_error', 'neg_mean_absolute_error', 'neg_root_mean_squared_error')
)
print(
"Negative Mean Squared Error",
np.median(lin_reg_scores["test_neg_mean_squared_error"]),
)
print(
"Negative Mean Absolute Error",
np.median(lin_reg_scores["test_neg_mean_absolute_error"]),
)
print(
"Negative Root Mean Squared Error",
np.median(lin_reg_scores["test_neg_root_mean_squared_error"]),
)
print("R2", np.median(lin_reg_scores["test_r2"]))
print("Negative Mean Squared Error", np.median(lin_reg_scores['test_neg_mean_squared_error']))
print("Negative Mean Absolute Error", np.median(lin_reg_scores['test_neg_mean_absolute_error']))
print("Negative Root Mean Squared Error", np.median(lin_reg_scores['test_neg_root_mean_squared_error']))
print("R2", np.median(lin_reg_scores['test_r2']))
# %% [markdown]
# ### XGBRegressor Linear Regression
# %% jupyter={"source_hidden": true}
xgb_r = xg.XGBRegressor(objective="reg:squarederror", n_estimators=10)
xgb_r = xg.XGBRegressor(objective ='reg:squarederror', n_estimators = 10)
# %% jupyter={"source_hidden": true}
imputer = SimpleImputer(missing_values=np.nan, strategy="mean")
imputer = SimpleImputer(missing_values=np.nan, strategy='mean')
# %% jupyter={"source_hidden": true}
xgb_reg_scores = cross_validate(
@ -146,33 +175,19 @@ xgb_reg_scores = cross_validate(
groups=data_groups,
cv=logo,
n_jobs=-1,
scoring=(
"r2",
"neg_mean_squared_error",
"neg_mean_absolute_error",
"neg_root_mean_squared_error",
),
scoring=('r2', 'neg_mean_squared_error', 'neg_mean_absolute_error', 'neg_root_mean_squared_error')
)
print(
"Negative Mean Squared Error",
np.median(xgb_reg_scores["test_neg_mean_squared_error"]),
)
print(
"Negative Mean Absolute Error",
np.median(xgb_reg_scores["test_neg_mean_absolute_error"]),
)
print(
"Negative Root Mean Squared Error",
np.median(xgb_reg_scores["test_neg_root_mean_squared_error"]),
)
print("R2", np.median(xgb_reg_scores["test_r2"]))
print("Negative Mean Squared Error", np.median(xgb_reg_scores['test_neg_mean_squared_error']))
print("Negative Mean Absolute Error", np.median(xgb_reg_scores['test_neg_mean_absolute_error']))
print("Negative Root Mean Squared Error", np.median(xgb_reg_scores['test_neg_root_mean_squared_error']))
print("R2", np.median(xgb_reg_scores['test_r2']))
# %% [markdown]
# ### XGBRegressor Pseudo Huber Error Regression
# %% jupyter={"source_hidden": true}
xgb_psuedo_huber_r = xg.XGBRegressor(objective="reg:pseudohubererror", n_estimators=10)
xgb_psuedo_huber_r = xg.XGBRegressor(objective ='reg:pseudohubererror', n_estimators = 10)
# %% jupyter={"source_hidden": true}
imputer = SimpleImputer(missing_values=np.nan, strategy="mean")
imputer = SimpleImputer(missing_values=np.nan, strategy='mean')
# %% jupyter={"source_hidden": true}
xgb_psuedo_huber_reg_scores = cross_validate(
@ -182,32 +197,18 @@ xgb_psuedo_huber_reg_scores = cross_validate(
groups=data_groups,
cv=logo,
n_jobs=-1,
scoring=(
"r2",
"neg_mean_squared_error",
"neg_mean_absolute_error",
"neg_root_mean_squared_error",
),
scoring=('r2', 'neg_mean_squared_error', 'neg_mean_absolute_error', 'neg_root_mean_squared_error')
)
print(
"Negative Mean Squared Error",
np.median(xgb_psuedo_huber_reg_scores["test_neg_mean_squared_error"]),
)
print(
"Negative Mean Absolute Error",
np.median(xgb_psuedo_huber_reg_scores["test_neg_mean_absolute_error"]),
)
print(
"Negative Root Mean Squared Error",
np.median(xgb_psuedo_huber_reg_scores["test_neg_root_mean_squared_error"]),
)
print("R2", np.median(xgb_psuedo_huber_reg_scores["test_r2"]))
print("Negative Mean Squared Error", np.median(xgb_psuedo_huber_reg_scores['test_neg_mean_squared_error']))
print("Negative Mean Absolute Error", np.median(xgb_psuedo_huber_reg_scores['test_neg_mean_absolute_error']))
print("Negative Root Mean Squared Error", np.median(xgb_psuedo_huber_reg_scores['test_neg_root_mean_squared_error']))
print("R2", np.median(xgb_psuedo_huber_reg_scores['test_r2']))
# %% [markdown]
# ### Ridge regression
# %% jupyter={"source_hidden": true}
ridge_reg = linear_model.Ridge(alpha=0.5)
ridge_reg = linear_model.Ridge(alpha=.5)
# %% tags=[] jupyter={"source_hidden": true}
ridge_reg_scores = cross_validate(
@ -217,26 +218,12 @@ ridge_reg_scores = cross_validate(
groups=data_groups,
cv=logo,
n_jobs=-1,
scoring=(
"r2",
"neg_mean_squared_error",
"neg_mean_absolute_error",
"neg_root_mean_squared_error",
),
scoring=('r2', 'neg_mean_squared_error', 'neg_mean_absolute_error', 'neg_root_mean_squared_error')
)
print(
"Negative Mean Squared Error",
np.median(ridge_reg_scores["test_neg_mean_squared_error"]),
)
print(
"Negative Mean Absolute Error",
np.median(ridge_reg_scores["test_neg_mean_absolute_error"]),
)
print(
"Negative Root Mean Squared Error",
np.median(ridge_reg_scores["test_neg_root_mean_squared_error"]),
)
print("R2", np.median(ridge_reg_scores["test_r2"]))
print("Negative Mean Squared Error", np.median(ridge_reg_scores['test_neg_mean_squared_error']))
print("Negative Mean Absolute Error", np.median(ridge_reg_scores['test_neg_mean_absolute_error']))
print("Negative Root Mean Squared Error", np.median(ridge_reg_scores['test_neg_root_mean_squared_error']))
print("R2", np.median(ridge_reg_scores['test_r2']))
# %% [markdown]
# ### Lasso
@ -252,26 +239,12 @@ lasso_reg_score = cross_validate(
groups=data_groups,
cv=logo,
n_jobs=-1,
scoring=(
"r2",
"neg_mean_squared_error",
"neg_mean_absolute_error",
"neg_root_mean_squared_error",
),
scoring=('r2', 'neg_mean_squared_error', 'neg_mean_absolute_error', 'neg_root_mean_squared_error')
)
print(
"Negative Mean Squared Error",
np.median(lasso_reg_score["test_neg_mean_squared_error"]),
)
print(
"Negative Mean Absolute Error",
np.median(lasso_reg_score["test_neg_mean_absolute_error"]),
)
print(
"Negative Root Mean Squared Error",
np.median(lasso_reg_score["test_neg_root_mean_squared_error"]),
)
print("R2", np.median(lasso_reg_score["test_r2"]))
print("Negative Mean Squared Error", np.median(lasso_reg_score['test_neg_mean_squared_error']))
print("Negative Mean Absolute Error", np.median(lasso_reg_score['test_neg_mean_absolute_error']))
print("Negative Root Mean Squared Error", np.median(lasso_reg_score['test_neg_root_mean_squared_error']))
print("R2", np.median(lasso_reg_score['test_r2']))
# %% [markdown]
# ### Bayesian Ridge
@ -287,26 +260,12 @@ bayesian_ridge_reg_score = cross_validate(
groups=data_groups,
cv=logo,
n_jobs=-1,
scoring=(
"r2",
"neg_mean_squared_error",
"neg_mean_absolute_error",
"neg_root_mean_squared_error",
),
scoring=('r2', 'neg_mean_squared_error', 'neg_mean_absolute_error', 'neg_root_mean_squared_error')
)
print(
"Negative Mean Squared Error",
np.median(bayesian_ridge_reg_score["test_neg_mean_squared_error"]),
)
print(
"Negative Mean Absolute Error",
np.median(bayesian_ridge_reg_score["test_neg_mean_absolute_error"]),
)
print(
"Negative Root Mean Squared Error",
np.median(bayesian_ridge_reg_score["test_neg_root_mean_squared_error"]),
)
print("R2", np.median(bayesian_ridge_reg_score["test_r2"]))
print("Negative Mean Squared Error", np.median(bayesian_ridge_reg_score['test_neg_mean_squared_error']))
print("Negative Mean Absolute Error", np.median(bayesian_ridge_reg_score['test_neg_mean_absolute_error']))
print("Negative Root Mean Squared Error", np.median(bayesian_ridge_reg_score['test_neg_root_mean_squared_error']))
print("R2", np.median(bayesian_ridge_reg_score['test_r2']))
# %% [markdown]
# ### RANSAC (outlier robust regression)
@ -322,26 +281,12 @@ ransac_reg_scores = cross_validate(
groups=data_groups,
cv=logo,
n_jobs=-1,
scoring=(
"r2",
"neg_mean_squared_error",
"neg_mean_absolute_error",
"neg_root_mean_squared_error",
),
scoring=('r2', 'neg_mean_squared_error', 'neg_mean_absolute_error', 'neg_root_mean_squared_error')
)
print(
"Negative Mean Squared Error",
np.median(ransac_reg_scores["test_neg_mean_squared_error"]),
)
print(
"Negative Mean Absolute Error",
np.median(ransac_reg_scores["test_neg_mean_absolute_error"]),
)
print(
"Negative Root Mean Squared Error",
np.median(ransac_reg_scores["test_neg_root_mean_squared_error"]),
)
print("R2", np.median(ransac_reg_scores["test_r2"]))
print("Negative Mean Squared Error", np.median(ransac_reg_scores['test_neg_mean_squared_error']))
print("Negative Mean Absolute Error", np.median(ransac_reg_scores['test_neg_mean_absolute_error']))
print("Negative Root Mean Squared Error", np.median(ransac_reg_scores['test_neg_root_mean_squared_error']))
print("R2", np.median(ransac_reg_scores['test_r2']))
# %% [markdown]
# ### Support vector regression
@ -357,25 +302,12 @@ svr_scores = cross_validate(
groups=data_groups,
cv=logo,
n_jobs=-1,
scoring=(
"r2",
"neg_mean_squared_error",
"neg_mean_absolute_error",
"neg_root_mean_squared_error",
),
scoring=('r2', 'neg_mean_squared_error', 'neg_mean_absolute_error', 'neg_root_mean_squared_error')
)
print(
"Negative Mean Squared Error", np.median(svr_scores["test_neg_mean_squared_error"])
)
print(
"Negative Mean Absolute Error",
np.median(svr_scores["test_neg_mean_absolute_error"]),
)
print(
"Negative Root Mean Squared Error",
np.median(svr_scores["test_neg_root_mean_squared_error"]),
)
print("R2", np.median(svr_scores["test_r2"]))
print("Negative Mean Squared Error", np.median(svr_scores['test_neg_mean_squared_error']))
print("Negative Mean Absolute Error", np.median(svr_scores['test_neg_mean_absolute_error']))
print("Negative Root Mean Squared Error", np.median(svr_scores['test_neg_root_mean_squared_error']))
print("R2", np.median(svr_scores['test_r2']))
# %% [markdown]
# ### Kernel Ridge regression
@ -391,26 +323,12 @@ kridge_scores = cross_validate(
groups=data_groups,
cv=logo,
n_jobs=-1,
scoring=(
"r2",
"neg_mean_squared_error",
"neg_mean_absolute_error",
"neg_root_mean_squared_error",
),
scoring=('r2', 'neg_mean_squared_error', 'neg_mean_absolute_error', 'neg_root_mean_squared_error')
)
print(
"Negative Mean Squared Error",
np.median(kridge_scores["test_neg_mean_squared_error"]),
)
print(
"Negative Mean Absolute Error",
np.median(kridge_scores["test_neg_mean_absolute_error"]),
)
print(
"Negative Root Mean Squared Error",
np.median(kridge_scores["test_neg_root_mean_squared_error"]),
)
print("R2", np.median(kridge_scores["test_r2"]))
print("Negative Mean Squared Error", np.median(kridge_scores['test_neg_mean_squared_error']))
print("Negative Mean Absolute Error", np.median(kridge_scores['test_neg_mean_absolute_error']))
print("Negative Root Mean Squared Error", np.median(kridge_scores['test_neg_root_mean_squared_error']))
print("R2", np.median(kridge_scores['test_r2']))
# %% [markdown]
# ### Gaussian Process Regression
@ -427,24 +345,11 @@ gpr_scores = cross_validate(
groups=data_groups,
cv=logo,
n_jobs=-1,
scoring=(
"r2",
"neg_mean_squared_error",
"neg_mean_absolute_error",
"neg_root_mean_squared_error",
),
scoring=('r2', 'neg_mean_squared_error', 'neg_mean_absolute_error', 'neg_root_mean_squared_error')
)
print(
"Negative Mean Squared Error", np.median(gpr_scores["test_neg_mean_squared_error"])
)
print(
"Negative Mean Absolute Error",
np.median(gpr_scores["test_neg_mean_absolute_error"]),
)
print(
"Negative Root Mean Squared Error",
np.median(gpr_scores["test_neg_root_mean_squared_error"]),
)
print("R2", np.median(gpr_scores["test_r2"]))
print("Negative Mean Squared Error", np.median(gpr_scores['test_neg_mean_squared_error']))
print("Negative Mean Absolute Error", np.median(gpr_scores['test_neg_mean_absolute_error']))
print("Negative Root Mean Squared Error", np.median(gpr_scores['test_neg_root_mean_squared_error']))
print("R2", np.median(gpr_scores['test_r2']))
# %%

View File

@ -1,18 +1,15 @@
from pathlib import Path
from sklearn import linear_model, svm, kernel_ridge, gaussian_process, ensemble, naive_bayes, neighbors, tree
from sklearn.model_selection import LeaveOneGroupOut, cross_validate, cross_validate
from sklearn.metrics import mean_squared_error, r2_score
from sklearn.impute import SimpleImputer
from sklearn.dummy import DummyRegressor, DummyClassifier
from xgboost import XGBRegressor, XGBClassifier
import xgboost as xg
import numpy as np
import pandas as pd
from sklearn import (
ensemble,
gaussian_process,
kernel_ridge,
linear_model,
naive_bayes,
svm,
)
from sklearn.dummy import DummyClassifier, DummyRegressor
from sklearn.model_selection import LeaveOneGroupOut, cross_validate
from xgboost import XGBClassifier, XGBRegressor
import numpy as np
def safe_outer_merge_on_index(left: pd.DataFrame, right: pd.DataFrame) -> pd.DataFrame:
@ -68,64 +65,28 @@ def construct_full_path(folder: Path, filename_prefix: str, data_type: str) -> P
full_path = folder / export_filename
return full_path
def insert_row(df, row):
    """Return ``df`` with ``row`` appended as one additional record.

    ``row`` is first wrapped in a single-row DataFrame that takes its column
    labels from ``df.columns``; the two frames are then concatenated and the
    result is given a fresh 0..n-1 index (``ignore_index=True``).
    """
    # Build the one-row frame separately so the concat call stays readable.
    extra_record = pd.DataFrame([row], columns=df.columns)
    return pd.concat([df, extra_record], ignore_index=True)
def prepare_regression_model_input(input_csv):
def prepare_regression_model_input(model_input, cv_method="logo"):
index_columns = [
"local_segment",
"local_segment_label",
"local_segment_start_datetime",
"local_segment_end_datetime",
]
model_input = pd.read_csv(input_csv)
index_columns = ["local_segment", "local_segment_label", "local_segment_start_datetime", "local_segment_end_datetime"]
model_input.set_index(index_columns, inplace=True)
if cv_method == "logo":
data_x, data_y, data_groups = (
model_input.drop(["target", "pid"], axis=1),
model_input["target"],
model_input["pid"],
)
else:
model_input["pid_index"] = model_input.groupby("pid").cumcount()
model_input["pid_count"] = model_input.groupby("pid")["pid"].transform("count")
data_x, data_y, data_groups = model_input.drop(["target", "pid"], axis=1), model_input["target"], model_input["pid"]
model_input["pid_index"] = (
model_input["pid_index"] / model_input["pid_count"] + 1
).round()
model_input["pid_half"] = (
model_input["pid"] + "_" + model_input["pid_index"].astype(int).astype(str)
)
data_x, data_y, data_groups = (
model_input.drop(["target", "pid", "pid_index", "pid_half"], axis=1),
model_input["target"],
model_input["pid_half"],
)
categorical_feature_colnames = [
"gender",
"startlanguage",
"limesurvey_demand_control_ratio_quartile",
]
additional_categorical_features = [
col
for col in data_x.columns
if "mostcommonactivity" in col or "homelabel" in col
]
categorical_feature_colnames = ["gender", "startlanguage", "limesurvey_demand_control_ratio_quartile"]
additional_categorical_features = [col for col in data_x.columns if "mostcommonactivity" in col or "homelabel" in col]
categorical_feature_colnames += additional_categorical_features
#TODO: check whether limesurvey_demand_control_ratio_quartile NaNs could be replaced meaningfully
categorical_features = data_x[categorical_feature_colnames].copy()
mode_categorical_features = categorical_features.mode().iloc[0]
# fillna with mode
categorical_features = categorical_features.fillna(mode_categorical_features)
# one-hot encoding
categorical_features = categorical_features.apply(
lambda col: col.astype("category")
)
categorical_features = categorical_features.apply(lambda col: col.astype("category"))
if not categorical_features.empty:
categorical_features = pd.get_dummies(categorical_features)
@ -147,7 +108,7 @@ def run_all_regression_models(input_csv):
data_y,
groups=data_groups,
)
metrics = ["r2", "neg_mean_absolute_error", "neg_root_mean_squared_error"]
metrics = ['r2', 'neg_mean_absolute_error', 'neg_root_mean_squared_error']
test_metrics = ["test_" + metric for metric in metrics]
scores = pd.DataFrame(columns=["method", "max", "nanmedian"])
@ -160,13 +121,13 @@ def run_all_regression_models(input_csv):
groups=data_groups,
cv=logo,
n_jobs=-1,
scoring=metrics,
scoring=metrics
)
print("Dummy model:")
print("R^2: ", np.nanmedian(dummy_regr_scores["test_r2"]))
print("R^2: ", np.nanmedian(dummy_regr_scores['test_r2']))
scores_df = pd.DataFrame(dummy_regr_scores)[test_metrics]
scores_df = scores_df.agg(["max", np.nanmedian]).transpose()
scores_df = scores_df.agg(['max', np.nanmedian]).transpose()
scores_df["method"] = "dummy"
scores = pd.concat([scores, scores_df])
@ -178,17 +139,17 @@ def run_all_regression_models(input_csv):
groups=data_groups,
cv=logo,
n_jobs=-1,
scoring=metrics,
scoring=metrics
)
print("Linear regression:")
print("R^2: ", np.nanmedian(lin_reg_scores["test_r2"]))
print("R^2: ", np.nanmedian(lin_reg_scores['test_r2']))
scores_df = pd.DataFrame(lin_reg_scores)[test_metrics]
scores_df = scores_df.agg(["max", np.nanmedian]).transpose()
scores_df = scores_df.agg(['max', np.nanmedian]).transpose()
scores_df["method"] = "linear_reg"
scores = pd.concat([scores, scores_df])
ridge_reg = linear_model.Ridge(alpha=0.5)
ridge_reg = linear_model.Ridge(alpha=.5)
ridge_reg_scores = cross_validate(
ridge_reg,
X=data_x,
@ -196,15 +157,16 @@ def run_all_regression_models(input_csv):
groups=data_groups,
cv=logo,
n_jobs=-1,
scoring=metrics,
scoring=metrics
)
print("Ridge regression")
scores_df = pd.DataFrame(ridge_reg_scores)[test_metrics]
scores_df = scores_df.agg(["max", np.nanmedian]).transpose()
scores_df = scores_df.agg(['max', np.nanmedian]).transpose()
scores_df["method"] = "ridge_reg"
scores = pd.concat([scores, scores_df])
lasso_reg = linear_model.Lasso(alpha=0.1)
lasso_reg_score = cross_validate(
lasso_reg,
@ -213,12 +175,12 @@ def run_all_regression_models(input_csv):
groups=data_groups,
cv=logo,
n_jobs=-1,
scoring=metrics,
scoring=metrics
)
print("Lasso regression")
scores_df = pd.DataFrame(lasso_reg_score)[test_metrics]
scores_df = scores_df.agg(["max", np.nanmedian]).transpose()
scores_df = scores_df.agg(['max', np.nanmedian]).transpose()
scores_df["method"] = "lasso_reg"
scores = pd.concat([scores, scores_df])
@ -230,12 +192,12 @@ def run_all_regression_models(input_csv):
groups=data_groups,
cv=logo,
n_jobs=-1,
scoring=metrics,
scoring=metrics
)
print("Bayesian Ridge")
scores_df = pd.DataFrame(bayesian_ridge_reg_score)[test_metrics]
scores_df = scores_df.agg(["max", np.nanmedian]).transpose()
scores_df = scores_df.agg(['max', np.nanmedian]).transpose()
scores_df["method"] = "bayesian_ridge"
scores = pd.concat([scores, scores_df])
@ -247,23 +209,29 @@ def run_all_regression_models(input_csv):
groups=data_groups,
cv=logo,
n_jobs=-1,
scoring=metrics,
scoring=metrics
)
print("RANSAC (outlier robust regression)")
scores_df = pd.DataFrame(ransac_reg_score)[test_metrics]
scores_df = scores_df.agg(["max", np.nanmedian]).transpose()
scores_df = scores_df.agg(['max', np.nanmedian]).transpose()
scores_df["method"] = "RANSAC"
scores = pd.concat([scores, scores_df])
svr = svm.SVR()
svr_score = cross_validate(
svr, X=data_x, y=data_y, groups=data_groups, cv=logo, n_jobs=-1, scoring=metrics
svr,
X=data_x,
y=data_y,
groups=data_groups,
cv=logo,
n_jobs=-1,
scoring=metrics
)
print("Support vector regression")
scores_df = pd.DataFrame(svr_score)[test_metrics]
scores_df = scores_df.agg(["max", np.nanmedian]).transpose()
scores_df = scores_df.agg(['max', np.nanmedian]).transpose()
scores_df["method"] = "SVR"
scores = pd.concat([scores, scores_df])
@ -275,56 +243,80 @@ def run_all_regression_models(input_csv):
groups=data_groups,
cv=logo,
n_jobs=-1,
scoring=metrics,
scoring=metrics
)
print("Kernel Ridge regression")
scores_df = pd.DataFrame(kridge_score)[test_metrics]
scores_df = scores_df.agg(["max", np.nanmedian]).transpose()
scores_df = scores_df.agg(['max', np.nanmedian]).transpose()
scores_df["method"] = "kernel_ridge"
scores = pd.concat([scores, scores_df])
gpr = gaussian_process.GaussianProcessRegressor()
gpr_score = cross_validate(
gpr, X=data_x, y=data_y, groups=data_groups, cv=logo, n_jobs=-1, scoring=metrics
gpr,
X=data_x,
y=data_y,
groups=data_groups,
cv=logo,
n_jobs=-1,
scoring=metrics
)
print("Gaussian Process Regression")
scores_df = pd.DataFrame(gpr_score)[test_metrics]
scores_df = scores_df.agg(["max", np.nanmedian]).transpose()
scores_df = scores_df.agg(['max', np.nanmedian]).transpose()
scores_df["method"] = "gaussian_proc"
scores = pd.concat([scores, scores_df])
rfr = ensemble.RandomForestRegressor(max_features=0.3, n_jobs=-1)
rfr_score = cross_validate(
rfr, X=data_x, y=data_y, groups=data_groups, cv=logo, n_jobs=-1, scoring=metrics
rfr,
X=data_x,
y=data_y,
groups=data_groups,
cv=logo,
n_jobs=-1,
scoring=metrics
)
print("Random Forest Regression")
scores_df = pd.DataFrame(rfr_score)[test_metrics]
scores_df = scores_df.agg(["max", np.nanmedian]).transpose()
scores_df = scores_df.agg(['max', np.nanmedian]).transpose()
scores_df["method"] = "random_forest"
scores = pd.concat([scores, scores_df])
xgb = XGBRegressor()
xgb_score = cross_validate(
xgb, X=data_x, y=data_y, groups=data_groups, cv=logo, n_jobs=-1, scoring=metrics
xgb,
X=data_x,
y=data_y,
groups=data_groups,
cv=logo,
n_jobs=-1,
scoring=metrics
)
print("XGBoost Regressor")
scores_df = pd.DataFrame(xgb_score)[test_metrics]
scores_df = scores_df.agg(["max", np.nanmedian]).transpose()
scores_df = scores_df.agg(['max', np.nanmedian]).transpose()
scores_df["method"] = "XGBoost"
scores = pd.concat([scores, scores_df])
ada = ensemble.AdaBoostRegressor()
ada_score = cross_validate(
ada, X=data_x, y=data_y, groups=data_groups, cv=logo, n_jobs=-1, scoring=metrics
ada,
X=data_x,
y=data_y,
groups=data_groups,
cv=logo,
n_jobs=-1,
scoring=metrics
)
print("ADA Boost Regressor")
scores_df = pd.DataFrame(ada_score)[test_metrics]
scores_df = scores_df.agg(["max", np.nanmedian]).transpose()
scores_df = scores_df.agg(['max', np.nanmedian]).transpose()
scores_df["method"] = "ADA_boost"
scores = pd.concat([scores, scores_df])
@ -332,7 +324,7 @@ def run_all_regression_models(input_csv):
def run_all_classification_models(data_x, data_y, data_groups, cv_method):
metrics = ["accuracy", "average_precision", "recall", "f1"]
metrics = ['accuracy', 'average_precision', 'recall', 'f1']
test_metrics = ["test_" + metric for metric in metrics]
scores = pd.DataFrame(columns=["method", "max", "mean"])
@ -340,127 +332,127 @@ def run_all_classification_models(data_x, data_y, data_groups, cv_method):
dummy_class = DummyClassifier(strategy="most_frequent")
dummy_score = cross_validate(
dummy_class,
X=data_x,
y=data_y,
groups=data_groups,
cv=cv_method,
n_jobs=-1,
error_score="raise",
scoring=metrics,
dummy_class,
X=data_x,
y=data_y,
groups=data_groups,
cv=cv_method,
n_jobs=-1,
error_score='raise',
scoring=metrics
)
print("Dummy")
scores_df = pd.DataFrame(dummy_score)[test_metrics]
scores_df = scores_df.agg(["max", "mean"]).transpose()
scores_df = scores_df.agg(['max', 'mean']).transpose()
scores_df["method"] = "Dummy"
scores = pd.concat([scores, scores_df])
logistic_regression = linear_model.LogisticRegression()
log_reg_scores = cross_validate(
logistic_regression,
X=data_x,
y=data_y,
groups=data_groups,
cv=cv_method,
n_jobs=-1,
scoring=metrics,
logistic_regression,
X=data_x,
y=data_y,
groups=data_groups,
cv=cv_method,
n_jobs=-1,
scoring=metrics
)
print("Logistic regression")
scores_df = pd.DataFrame(log_reg_scores)[test_metrics]
scores_df = scores_df.agg(["max", "mean"]).transpose()
scores_df = scores_df.agg(['max', 'mean']).transpose()
scores_df["method"] = "logistic_reg"
scores = pd.concat([scores, scores_df])
svc = svm.SVC()
svc_scores = cross_validate(
svc,
X=data_x,
y=data_y,
groups=data_groups,
cv=cv_method,
n_jobs=-1,
scoring=metrics,
svc,
X=data_x,
y=data_y,
groups=data_groups,
cv=cv_method,
n_jobs=-1,
scoring=metrics
)
print("Support Vector Machine")
scores_df = pd.DataFrame(svc_scores)[test_metrics]
scores_df = scores_df.agg(["max", "mean"]).transpose()
scores_df = scores_df.agg(['max', 'mean']).transpose()
scores_df["method"] = "svc"
scores = pd.concat([scores, scores_df])
gaussian_nb = naive_bayes.GaussianNB()
gaussian_nb_scores = cross_validate(
gaussian_nb,
X=data_x,
y=data_y,
groups=data_groups,
cv=cv_method,
n_jobs=-1,
scoring=metrics,
gaussian_nb,
X=data_x,
y=data_y,
groups=data_groups,
cv=cv_method,
n_jobs=-1,
scoring=metrics
)
print("Gaussian Naive Bayes")
scores_df = pd.DataFrame(gaussian_nb_scores)[test_metrics]
scores_df = scores_df.agg(["max", "mean"]).transpose()
scores_df = scores_df.agg(['max', 'mean']).transpose()
scores_df["method"] = "gaussian_naive_bayes"
scores = pd.concat([scores, scores_df])
sgdc = linear_model.SGDClassifier()
sgdc_scores = cross_validate(
sgdc,
X=data_x,
y=data_y,
groups=data_groups,
cv=cv_method,
n_jobs=-1,
scoring=metrics,
sgdc,
X=data_x,
y=data_y,
groups=data_groups,
cv=cv_method,
n_jobs=-1,
scoring=metrics
)
print("Stochastic Gradient Descent")
scores_df = pd.DataFrame(sgdc_scores)[test_metrics]
scores_df = scores_df.agg(["max", "mean"]).transpose()
scores_df = scores_df.agg(['max', 'mean']).transpose()
scores_df["method"] = "stochastic_gradient_descent"
scores = pd.concat([scores, scores_df])
rfc = ensemble.RandomForestClassifier()
rfc_scores = cross_validate(
rfc,
X=data_x,
y=data_y,
groups=data_groups,
cv=cv_method,
n_jobs=-1,
scoring=metrics,
rfc,
X=data_x,
y=data_y,
groups=data_groups,
cv=cv_method,
n_jobs=-1,
scoring=metrics
)
print("Random Forest")
scores_df = pd.DataFrame(rfc_scores)[test_metrics]
scores_df = scores_df.agg(["max", "mean"]).transpose()
scores_df = scores_df.agg(['max', 'mean']).transpose()
scores_df["method"] = "random_forest"
scores = pd.concat([scores, scores_df])
xgb_classifier = XGBClassifier()
xgb_scores = cross_validate(
xgb_classifier,
X=data_x,
y=data_y,
groups=data_groups,
cv=cv_method,
n_jobs=-1,
scoring=metrics,
xgb_classifier,
X=data_x,
y=data_y,
groups=data_groups,
cv=cv_method,
n_jobs=-1,
scoring=metrics
)
print("XGBoost")
scores_df = pd.DataFrame(xgb_scores)[test_metrics]
scores_df = scores_df.agg(["max", "mean"]).transpose()
scores_df = scores_df.agg(['max', 'mean']).transpose()
scores_df["method"] = "xgboost"
scores = pd.concat([scores, scores_df])

View File

@ -34,114 +34,18 @@ df_app_categories <- tbl(con, "app_categories") %>%
head(df_app_categories)
table(df_app_categories$play_store_genre)
df_app_categories %>%
filter(play_store_genre == "not_found") %>%
group_by(play_store_response) %>%
count()
# All "not_found" have an HTTP status of 404.
df_app_categories %>%
filter(play_store_genre == "not_found") %>%
group_by(package_name) %>%
count() %>%
arrange(desc(n))
# All "not_found" apps are unique.
# Exclude phone manufacturers, custom ROM names and similar.
manufacturers <- c(
"samsung",
"oneplus",
"huawei",
"xiaomi",
"lge",
"motorola",
"miui",
"lenovo",
"oppo",
"mediatek"
)
custom_rom <- c("coloros", "lineageos", "myos", "cyanogenmod", "foundation.e")
other <- c("android", "wssyncmldm")
grep_pattern <- paste(c(manufacturers, custom_rom, other), collapse = "|")
rows_os_manufacturer <- grepl(grep_pattern, df_app_categories$package_name)
# Explore what remains after excluding above.
df_app_categories[!rows_os_manufacturer, ] %>%
filter(play_store_genre == "not_found")
# Also check the relationship between is_system_app and System category.
tbl(con, "applications") %>%
filter(is_system_app, play_store_genre != "System") %>%
count()
# They are perfectly correlated.
# Manually classify apps
df_app_categories[df_app_categories$play_store_genre == "not_found",] <-
df_app_categories %>%
filter(play_store_genre == "not_found") %>%
mutate(
play_store_genre =
case_when(
str_detect(str_to_lower(package_name), grep_pattern) ~ "System",
str_detect(str_to_lower(package_name), "straw") ~ "STRAW",
str_detect(str_to_lower(package_name), "chromium") ~ "Communication", # Same as chrome.
str_detect(str_to_lower(package_name), "skype") ~ "Communication", # Skype Lite not classified.
str_detect(str_to_lower(package_name), "imsservice") ~ "Communication", # IP Multimedia Subsystem
str_detect(str_to_lower(package_name), paste(c("covid", "empatica"), collapse = "|")) ~ "Medical",
str_detect(str_to_lower(package_name), paste(c("libri", "tachiyomi"), collapse = "|")) ~ "Books & Reference",
str_detect(str_to_lower(package_name), paste(c("bricks", "chess"), collapse = "|")) ~ "Casual",
str_detect(str_to_lower(package_name), "weather") ~ "Weather",
str_detect(str_to_lower(package_name), "excel") ~ "Productivity",
str_detect(str_to_lower(package_name), paste(c("qr", "barcode", "archimedes", "mixplorer", "winrar", "filemanager", "shot", "faceunlock", "signin", "milink"), collapse = "|")) ~ "Tools",
str_detect(str_to_lower(package_name), "stupeflix") ~ "Photography",
str_detect(str_to_lower(package_name), "anyme") ~ "Entertainment",
str_detect(str_to_lower(package_name), "vanced") ~ "Video Players & Editors",
str_detect(str_to_lower(package_name), paste(c("music", "radio", "dolby"), collapse = "|")) ~ "Music & Audio",
str_detect(str_to_lower(package_name), paste(c("tensorflow", "object_detection"), collapse = "|")) ~ "Education",
.default = play_store_genre
)
)
# Explore what remains after classifying above.
df_app_categories %>%
filter(play_store_genre == "not_found")
# After this, 13 applications remain, which I will classify as "Other".
# Correct some mistakes
# And classify 'not_found'
df_app_categories %<>%
mutate(
play_store_genre = {
function(x) {
case_when(
x == "Education,Education" ~ "Education",
x == "EducationEducation" ~ "Education",
x == "not_found" ~ "Other",
.default = x
)
}
}(play_store_genre)
) %>%
select(-package_name) %>%
rename(
genre = play_store_genre,
package_name = package_hash
)
table(df_app_categories$genre)
df_app_categories %>%
group_by(genre) %>%
count() %>%
arrange(desc(n)) %>%
write_csv("play_store_categories_count.csv")
write_csv(
x = select(df_app_categories, c(package_name, genre)),
file = "play_store_application_genre_catalogue.csv"
df_app_categories %<>% mutate(
play_store_genre = {
function(x) {
case_when(
x == "Education,Education" ~ "Education",
x == "EducationEducation" ~ "Education",
x == "not_found" ~ "System",
.default = x
)
}
}(play_store_genre)
)
dbDisconnect(con)

File diff suppressed because it is too large Load Diff

View File

@ -1,45 +0,0 @@
genre,n
System,261
Tools,96
Productivity,71
Health & Fitness,60
Finance,54
Communication,39
Music & Audio,39
Shopping,38
Lifestyle,33
Education,28
News & Magazines,24
Maps & Navigation,23
Entertainment,21
Business,18
Travel & Local,18
Books & Reference,16
Social,16
Weather,16
Food & Drink,14
Sports,14
Other,13
Photography,13
Puzzle,13
Video Players & Editors,12
Card,9
Casual,9
Personalization,8
Medical,7
Board,5
Strategy,4
House & Home,3
Trivia,3
Word,3
Adventure,2
Art & Design,2
Auto & Vehicles,2
Dating,2
Role Playing,2
STRAW,2
Simulation,2
"Board,Brain Games",1
"Entertainment,Music & Video",1
Parenting,1
Racing,1
1 genre n
2 System 261
3 Tools 96
4 Productivity 71
5 Health & Fitness 60
6 Finance 54
7 Communication 39
8 Music & Audio 39
9 Shopping 38
10 Lifestyle 33
11 Education 28
12 News & Magazines 24
13 Maps & Navigation 23
14 Entertainment 21
15 Business 18
16 Travel & Local 18
17 Books & Reference 16
18 Social 16
19 Weather 16
20 Food & Drink 14
21 Sports 14
22 Other 13
23 Photography 13
24 Puzzle 13
25 Video Players & Editors 12
26 Card 9
27 Casual 9
28 Personalization 8
29 Medical 7
30 Board 5
31 Strategy 4
32 House & Home 3
33 Trivia 3
34 Word 3
35 Adventure 2
36 Art & Design 2
37 Auto & Vehicles 2
38 Dating 2
39 Role Playing 2
40 STRAW 2
41 Simulation 2
42 Board,Brain Games 1
43 Entertainment,Music & Video 1
44 Parenting 1
45 Racing 1

2
rapids

@ -1 +1 @@
Subproject commit 63f5a526fce4d288499168e1701adadb8b885d82
Subproject commit 03687a1ac204f0a4347eb758dada8005f68b0bb1