From 1516d1c0008a51f8d82605bd0191eb424e13c970 Mon Sep 17 00:00:00 2001
From: junos <junos.lukan@ijs.si>
Date: Wed, 4 Jan 2023 18:16:14 +0100
Subject: [PATCH] Cluster by demand_control_ratio_quartile.

---
 .gitignore                                    |  3 +++
 config/environment.yml                        |  1 +
 ...pipeline_classification_with_clustering.py | 22 ++++++++++++++-----
 3 files changed, 21 insertions(+), 5 deletions(-)

diff --git a/.gitignore b/.gitignore
index d5b862d..31e264b 100644
--- a/.gitignore
+++ b/.gitignore
@@ -12,3 +12,6 @@ __pycache__/
 /data/*input*.csv
 /data/daily*
 /data/intradaily*
+/data/30min*
+/presentation/*scores.csv
+/presentation/Results.ods
diff --git a/config/environment.yml b/config/environment.yml
index 62cb210..192bc90 100644
--- a/config/environment.yml
+++ b/config/environment.yml
@@ -9,6 +9,7 @@ dependencies:
   - flake8
   - jupyterlab
   - jupytext
+  - lightgbm
   - mypy
   - nodejs
   - pandas
diff --git a/exploration/ml_pipeline_classification_with_clustering.py b/exploration/ml_pipeline_classification_with_clustering.py
index 0bf4417..0f730db 100644
--- a/exploration/ml_pipeline_classification_with_clustering.py
+++ b/exploration/ml_pipeline_classification_with_clustering.py
@@ -31,7 +31,6 @@ from sklearn.impute import SimpleImputer
 
 from sklearn.dummy import DummyClassifier
 from sklearn import linear_model, svm, naive_bayes, neighbors, tree, ensemble
-from lightgbm import LGBMClassifier
 import xgboost as xg 
 
 from sklearn.cluster import KMeans
@@ -52,12 +51,12 @@ from machine_learning.classification_models import ClassificationModels
 
 # %% [markdown]
 # ## Set script's parameters
-n_clusters = 5 # Number of clusters (could be regarded as a hyperparameter)
+n_clusters = 4 # Number of clusters (could be regarded as a hyperparameter)
 cv_method_str = 'logo' # logo, halflogo, 5kfold # Cross-validation method (could be regarded as a hyperparameter)
 n_sl = 1 # Number of largest/smallest accuracies (of particular CV) outputs
 
 # %% jupyter={"source_hidden": true}
-model_input = pd.read_csv("../data/intradaily_30_min_all_targets/input_JCQ_job_demand_mean.csv")
+model_input = pd.read_csv("../data/30min_all_target_inputs/input_JCQ_job_demand_mean.csv")
 index_columns = ["local_segment", "local_segment_label", "local_segment_start_datetime", "local_segment_end_datetime"]
 
 clust_col = model_input.set_index(index_columns).var().idxmax() # age is a col with the highest variance
@@ -66,7 +65,7 @@ model_input.columns[list(model_input.columns).index('age'):-1]
 
 lime_cols = [col for col in model_input if col.startswith('limesurvey')]
 lime_cols
-lime_col = 'limesurvey_demand_control_ratio'
+lime_col = 'limesurvey_demand_control_ratio_quartile'
 clust_col = lime_col
 
 model_input[clust_col].describe()
@@ -75,9 +74,11 @@ model_input[clust_col].describe()
 # %% jupyter={"source_hidden": true}
 
 # Filter-out outlier rows by clust_col 
-model_input = model_input[(np.abs(stats.zscore(model_input[clust_col])) < 3)]
+#model_input = model_input[(np.abs(stats.zscore(model_input[clust_col])) < 3)]
 
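+# NOTE(review): the z-score outlier filter above is disabled, presumably because clust_col now holds quartile labels; confirm.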
 uniq = model_input[[clust_col, 'pid']].drop_duplicates().reset_index(drop=True)
+uniq = uniq.dropna()
 plt.bar(uniq['pid'], uniq[clust_col])
 
 # %% jupyter={"source_hidden": true}
@@ -182,3 +183,14 @@ for k in range(n_clusters):
 # %% jupyter={"source_hidden": true}
 # Get overall results
 cm.get_total_models_scores(n_clusters=n_clusters)
+
+# %% Concatenate the per-model tables in cmodels into final_scores
+final_scores = pd.DataFrame()
+for model in cmodels:
+    final_scores = pd.concat([final_scores,pd.DataFrame.from_dict(cmodels[model])])
+
+# %%
+final_scores
+
+# %%
+