Cluster by demand_control_ratio_quartile.

2023-01-04 18:16:14 +01:00 · 2023-01-04 18:16:14 +01:00 · 1516d1c000
parent 2bb95657d8
commit 1516d1c000
3 changed files with 21 additions and 5 deletions
--- a/.gitignore
+++ b/.gitignore
@@ -12,3 +12,6 @@ __pycache__/
 /data/*input*.csv
 /data/daily*
 /data/intradaily*
+/data/30min*
+/presentation/*scores.csv
+/presentation/Results.ods
--- a/config/environment.yml
+++ b/config/environment.yml
@@ -9,6 +9,7 @@ dependencies:
  - flake8
  - jupyterlab
  - jupytext
+  - lightgbm
  - mypy
  - nodejs
  - pandas
--- a/exploration/ml_pipeline_classification_with_clustering.py
+++ b/exploration/ml_pipeline_classification_with_clustering.py
@@ -31,7 +31,6 @@ from sklearn.impute import SimpleImputer

 from sklearn.dummy import DummyClassifier
 from sklearn import linear_model, svm, naive_bayes, neighbors, tree, ensemble
-from lightgbm import LGBMClassifier
 import xgboost as xg 

 from sklearn.cluster import KMeans
@@ -52,12 +51,12 @@ from machine_learning.classification_models import ClassificationModels

 # %% [markdown]
 # ## Set script's parameters
-n_clusters = 5 # Number of clusters (could be regarded as a hyperparameter)
+n_clusters = 4 # Number of clusters (could be regarded as a hyperparameter)
 cv_method_str = 'logo' # logo, halflogo, 5kfold # Cross-validation method (could be regarded as a hyperparameter)
 n_sl = 1 # Number of largest/smallest accuracies (of particular CV) outputs

 # %% jupyter={"source_hidden": true}
-model_input = pd.read_csv("../data/intradaily_30_min_all_targets/input_JCQ_job_demand_mean.csv")
+model_input = pd.read_csv("../data/30min_all_target_inputs/input_JCQ_job_demand_mean.csv")
 index_columns = ["local_segment", "local_segment_label", "local_segment_start_datetime", "local_segment_end_datetime"]

 clust_col = model_input.set_index(index_columns).var().idxmax() # age is a col with the highest variance
@@ -66,7 +65,7 @@ model_input.columns[list(model_input.columns).index('age'):-1]

 lime_cols = [col for col in model_input if col.startswith('limesurvey')]
 lime_cols
-lime_col = 'limesurvey_demand_control_ratio'
+lime_col = 'limesurvey_demand_control_ratio_quartile'
 clust_col = lime_col

 model_input[clust_col].describe()
@@ -75,9 +74,11 @@ model_input[clust_col].describe()
 # %% jupyter={"source_hidden": true}

 # Filter-out outlier rows by clust_col 
-model_input = model_input[(np.abs(stats.zscore(model_input[clust_col])) < 3)]
+#model_input = model_input[(np.abs(stats.zscore(model_input[clust_col])) < 3)]

+#print(model_input)
 uniq = model_input[[clust_col, 'pid']].drop_duplicates().reset_index(drop=True)
+uniq = uniq.dropna()
 plt.bar(uniq['pid'], uniq[clust_col])

 # %% jupyter={"source_hidden": true}
@@ -182,3 +183,14 @@ for k in range(n_clusters):
 # %% jupyter={"source_hidden": true}
 # Get overall results
 cm.get_total_models_scores(n_clusters=n_clusters)
+
+# %%
+final_scores = pd.DataFrame()
+for model in cmodels:
+    final_scores = pd.concat([final_scores,pd.DataFrame.from_dict(cmodels[model])])
+
+# %%
+final_scores
+
+# %%
+