Cluster by demand_control_ratio_quartile.

ml_pipeline
junos 2023-01-04 18:16:14 +01:00
parent 2bb95657d8
commit 1516d1c000
3 changed files with 21 additions and 5 deletions

.gitignore vendored

@@ -12,3 +12,6 @@ __pycache__/
 /data/*input*.csv
 /data/daily*
 /data/intradaily*
+/data/30min*
+/presentation/*scores.csv
+/presentation/Results.ods


@@ -9,6 +9,7 @@ dependencies:
 - flake8
 - jupyterlab
 - jupytext
+- lightgbm
 - mypy
 - nodejs
 - pandas
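lightgbm is added to the environment dependencies here. For context, a minimal, hypothetical sketch of how its scikit-learn-compatible LGBMClassifier could be cross-validated alongside the pipeline's other classifiers (toy data and illustrative parameters, not part of this commit):

# Hypothetical sketch: LGBMClassifier follows the scikit-learn estimator API,
# so it can be evaluated like the other classifiers used in the pipeline.
from lightgbm import LGBMClassifier
from sklearn.datasets import make_classification
from sklearn.model_selection import cross_val_score

X, y = make_classification(n_samples=200, n_features=10, random_state=0)  # toy data
clf = LGBMClassifier(n_estimators=100, random_state=0)
print(cross_val_score(clf, X, y, cv=5).mean())  # mean cross-validated accuracy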


@@ -31,7 +31,6 @@ from sklearn.impute import SimpleImputer
 from sklearn.dummy import DummyClassifier
 from sklearn import linear_model, svm, naive_bayes, neighbors, tree, ensemble
-from lightgbm import LGBMClassifier
 import xgboost as xg
 from sklearn.cluster import KMeans
@@ -52,12 +51,12 @@ from machine_learning.classification_models import ClassificationModels
 # %% [markdown]
 # ## Set script's parameters
-n_clusters = 5 # Number of clusters (could be regarded as a hyperparameter)
+n_clusters = 4 # Number of clusters (could be regarded as a hyperparameter)
 cv_method_str = 'logo' # logo, halflogo, 5kfold # Cross-validation method (could be regarded as a hyperparameter)
 n_sl = 1 # Number of largest/smallest accuracies (of particular CV) outputs
 # %% jupyter={"source_hidden": true}
-model_input = pd.read_csv("../data/intradaily_30_min_all_targets/input_JCQ_job_demand_mean.csv")
+model_input = pd.read_csv("../data/30min_all_target_inputs/input_JCQ_job_demand_mean.csv")
 index_columns = ["local_segment", "local_segment_label", "local_segment_start_datetime", "local_segment_end_datetime"]
 clust_col = model_input.set_index(index_columns).var().idxmax() # age is a col with the highest variance
@@ -66,7 +65,7 @@ model_input.columns[list(model_input.columns).index('age'):-1]
 lime_cols = [col for col in model_input if col.startswith('limesurvey')]
 lime_cols
-lime_col = 'limesurvey_demand_control_ratio'
+lime_col = 'limesurvey_demand_control_ratio_quartile'
 clust_col = lime_col
 model_input[clust_col].describe()
@@ -75,9 +74,11 @@ model_input[clust_col].describe()
 # %% jupyter={"source_hidden": true}
 # Filter-out outlier rows by clust_col
-model_input = model_input[(np.abs(stats.zscore(model_input[clust_col])) < 3)]
+#model_input = model_input[(np.abs(stats.zscore(model_input[clust_col])) < 3)]
+#print(model_input)
 uniq = model_input[[clust_col, 'pid']].drop_duplicates().reset_index(drop=True)
+uniq = uniq.dropna()
 plt.bar(uniq['pid'], uniq[clust_col])
 # %% jupyter={"source_hidden": true}
@@ -182,3 +183,14 @@ for k in range(n_clusters):
 # %% jupyter={"source_hidden": true}
 # Get overall results
 cm.get_total_models_scores(n_clusters=n_clusters)
+# %%
+final_scores = pd.DataFrame()
+for model in cmodels:
+    final_scores = pd.concat([final_scores,pd.DataFrame.from_dict(cmodels[model])])
+# %%
+final_scores
+# %%
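The new cell stacks the per-model scores held in cmodels into a single DataFrame. A rough standalone sketch of the same pattern, assuming cmodels maps model names to dicts of per-cluster score lists (the real structure is produced by ClassificationModels and may differ):

import pandas as pd

# Assumed, simplified stand-in for cmodels: model name -> {metric: one score per cluster}.
cmodels = {
    "dummy_classifier": {"test_accuracy": [0.5, 0.5, 0.5, 0.5]},
    "logistic_regression": {"test_accuracy": [0.6, 0.6, 0.6, 0.6]},
}

# Same pattern as the added cell: concatenate each model's score table into one frame.
final_scores = pd.DataFrame()
for model in cmodels:
    final_scores = pd.concat([final_scores, pd.DataFrame.from_dict(cmodels[model])])

print(final_scores)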