Merge branch 'imputation_and_cleaning' of https://repo.ijs.si/junoslukan/rapids into imputation_and_cleaning

commit 3cf7ca41aa

@@ -0,0 +1,56 @@
from pprint import pprint

import sklearn.metrics
import autosklearn.regression

import datetime
import importlib
import os
import sys

import numpy as np
import matplotlib.pyplot as plt
import pandas as pd
import seaborn as sns
import yaml

from sklearn import linear_model, svm, kernel_ridge, gaussian_process
from sklearn.model_selection import LeaveOneGroupOut, cross_val_score, train_test_split
from sklearn.metrics import mean_squared_error, r2_score
from sklearn.impute import SimpleImputer

model_input = pd.read_csv("data/processed/models/population_model/z_input.csv")  # Standardized data

# Drop all-empty feature columns and rows without a target value
model_input.dropna(axis=1, how="all", inplace=True)
model_input.dropna(axis=0, how="any", subset=["target"], inplace=True)

# Impute categorical features with the column mode and one-hot encode them
categorical_feature_colnames = ["gender", "startlanguage"]
categorical_features = model_input[categorical_feature_colnames].copy()
mode_categorical_features = categorical_features.mode().iloc[0]
categorical_features = categorical_features.fillna(mode_categorical_features)
categorical_features = categorical_features.apply(lambda col: col.astype("category"))
if not categorical_features.empty:
    categorical_features = pd.get_dummies(categorical_features)
numerical_features = model_input.drop(categorical_feature_colnames, axis=1)
model_in = pd.concat([numerical_features, categorical_features], axis=1)

index_columns = ["local_segment", "local_segment_label", "local_segment_start_datetime", "local_segment_end_datetime"]
model_in.set_index(index_columns, inplace=True)

X_train, X_test, y_train, y_test = train_test_split(model_in.drop(["target", "pid"], axis=1), model_in["target"], test_size=0.30)

# Fit auto-sklearn for up to 4 hours in total, 2 minutes per candidate model
automl = autosklearn.regression.AutoSklearnRegressor(
    time_left_for_this_task=14400,
    per_run_time_limit=120
)
automl.fit(X_train, y_train, dataset_name='straw')

print(automl.leaderboard())
pprint(automl.show_models(), indent=4)

train_predictions = automl.predict(X_train)
print("Train R2 score:", sklearn.metrics.r2_score(y_train, train_predictions))
test_predictions = automl.predict(X_test)
print("Test R2 score:", sklearn.metrics.r2_score(y_test, test_predictions))

sys.exit()
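LeaveOneGroupOut and cross_val_score are imported above but never used, and the random train/test split ignores participants. A minimal sketch of what a participant-aware evaluation could look like, assuming "pid" is kept as the grouping column and using a plain Lasso as a stand-in estimator; this is not part of the commit:

# Hypothetical sketch: leave-one-participant-out cross-validation, continuing from the script above.
logo = LeaveOneGroupOut()
X = model_in.drop(["target", "pid"], axis=1)
y = model_in["target"]
groups = model_in["pid"]
scores = cross_val_score(linear_model.Lasso(), X, y, groups=groups, cv=logo, scoring="r2")
print("Median leave-one-participant-out R2:", np.median(scores))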
@@ -86,8 +86,6 @@ dependencies:
   - readline=8.0
   - requests=2.25.0
   - retrying=1.3.3
-  - scikit-learn=0.23.2
-  - scipy=1.5.2
   - setuptools=51.0.0
   - six=1.15.0
   - smmap=3.0.4
@@ -107,34 +105,61 @@ dependencies:
   - zlib=1.2.11
   - pip:
     - amply==0.1.4
+    - auto-sklearn==0.14.7
     - bidict==0.22.0
     - biosppy==0.8.0
+    - build==0.8.0
     - cached-property==1.5.2
+    - cloudpickle==2.2.0
     - configargparse==0.15.1
+    - configspace==0.4.21
     - cr-features==0.2.1
     - cycler==0.11.0
+    - cython==0.29.32
+    - dask==2022.2.0
     - decorator==4.4.2
+    - distributed==2022.2.0
+    - distro==1.7.0
+    - emcee==3.1.2
     - fonttools==4.33.2
+    - fsspec==2022.8.2
     - h5py==3.6.0
+    - heapdict==1.0.1
     - hmmlearn==0.2.7
     - ipython-genutils==0.2.0
     - jupyter-core==4.6.3
     - kiwisolver==1.4.2
+    - liac-arff==2.5.0
+    - locket==1.0.0
     - matplotlib==3.5.1
+    - msgpack==1.0.4
     - nbformat==5.0.7
     - opencv-python==4.5.5.64
     - packaging==21.3
+    - partd==1.3.0
     - peakutils==1.3.3
+    - pep517==0.13.0
     - pillow==9.1.0
     - pulp==2.4
+    - pynisher==0.6.4
     - pyparsing==2.4.7
+    - pyrfr==0.8.3
     - pyrsistent==0.15.5
     - pywavelets==1.3.0
     - ratelimiter==1.2.0.post0
+    - scikit-learn==0.24.2
+    - scipy==1.7.3
     - seaborn==0.11.2
     - shortuuid==1.0.8
+    - smac==1.2
     - snakemake==5.30.2
+    - sortedcontainers==2.4.0
+    - tblib==1.7.0
+    - tomli==2.0.1
+    - toolz==0.12.0
     - toposort==1.5
+    - tornado==6.2
     - traitlets==4.3.3
     - typing-extensions==4.2.0
+    - zict==2.2.0
 prefix: /opt/conda/envs/rapids
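The new pip pins above pull in the auto-sklearn optimization stack (SMAC, pyrfr, dask/distributed). A hypothetical quick check that the environment resolves as pinned, not part of the commit:

# Hypothetical sanity check of the newly pinned auto-sklearn stack against environment.yml.
import autosklearn
import dask
import distributed
import smac

print(autosklearn.__version__)   # expected: 0.14.7
print(dask.__version__)          # expected: 2022.2.0
print(distributed.__version__)   # expected: 2022.2.0
print(smac.__version__)          # expected: 1.2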
Binary file not shown (image diff; previous size 248 KiB).
@@ -15,6 +15,7 @@ def deviceFeatures(devices, ownership, common_devices, features_to_compute, feat
     if "meanscans" in features_to_compute:
         features = features.join(device_value_counts.groupby("local_segment")["scans"].mean().to_frame("meanscans" + ownership), how="outer")
     if "stdscans" in features_to_compute:
+        # TODO: std scans
         features = features.join(device_value_counts.groupby("local_segment")["scans"].std().to_frame("stdscans" + ownership), how="outer")
     # Most frequent device within segments, across segments, and across dataset
     if "countscansmostfrequentdevicewithinsegments" in features_to_compute:
@@ -36,6 +36,7 @@ def variance_and_logvariance_features(location_data, location_features):
     location_data["latitude_for_wvar"] = (location_data["double_latitude"] - location_data["latitude_wavg"]) ** 2 * location_data["duration"] * 60
     location_data["longitude_for_wvar"] = (location_data["double_longitude"] - location_data["longitude_wavg"]) ** 2 * location_data["duration"] * 60
 
+    # TODO: location variance
     location_features["locationvariance"] = ((location_data_grouped["latitude_for_wvar"].sum() + location_data_grouped["longitude_for_wvar"].sum()) / (location_data_grouped["duration"].sum() * 60 - 1)).fillna(0)
     location_features["loglocationvariance"] = np.log10(location_features["locationvariance"]).replace(-np.inf, np.nan)
 

@@ -112,6 +113,8 @@ def location_entropy(location_data):
     entropy = -1 * location_data.groupby(["local_segment"])[["plogp"]].sum().rename(columns={"plogp": "locationentropy"})
 
     entropy["num_clusters"] = location_data.groupby(["local_segment"])["cluster_label"].nunique()
+
+    # TODO: normalizedlocationentropy
     entropy["normalizedlocationentropy"] = entropy["locationentropy"] / entropy["num_clusters"]
 
     return entropy
@@ -153,6 +156,7 @@ def doryab_features(sensor_data_files, time_segment, provider, filter_data_by_se
     # distance and speed features
     moving_data = location_data[location_data["is_stationary"] == 0].copy()
     location_features = location_features.merge(distance_and_speed_features(moving_data), how="outer", left_index=True, right_index=True)
+    # TODO: why does varspeed not get filled with 0?
     location_features[["totaldistance", "avgspeed", "varspeed"]] = location_features[["totaldistance", "avgspeed", "varspeed"]].fillna(0)
 
     # stationary features
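The locationvariance expression above is a duration-weighted variance of the coordinates. A small self-contained sketch of the same formula on made-up numbers, assuming latitude_wavg/longitude_wavg are the duration-weighted means (variable names here are hypothetical, for illustration only):

# Toy illustration of the duration-weighted variance behind "locationvariance".
import numpy as np

lat = np.array([46.05, 46.06, 46.05])        # hypothetical latitudes
lon = np.array([14.50, 14.51, 14.52])        # hypothetical longitudes
duration_min = np.array([10.0, 5.0, 15.0])   # minutes spent at each fix
w = duration_min * 60                        # weights in seconds, as in the feature code

lat_wavg = np.average(lat, weights=w)
lon_wavg = np.average(lon, weights=w)
locationvariance = (np.sum((lat - lat_wavg) ** 2 * w) + np.sum((lon - lon_wavg) ** 2 * w)) / (w.sum() - 1)
print(locationvariance)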
@@ -0,0 +1,28 @@
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt


path = "/rapids/data/processed/features/all_participants/all_sensor_features.csv"
df = pd.read_csv(path)

# Bluetooth
doryab_cols_bt = [col for col in df.columns if "bluetooth_doryab" in col]
df_bt = df[doryab_cols_bt]

print(len(doryab_cols_bt))
print(df_bt)

sns.heatmap(df_bt, xticklabels=1)
plt.savefig('bluetooth_doryab_values', bbox_inches='tight')

# Location
doryab_cols_loc = [col for col in df.columns if "locations_doryab" in col]
df_loc = df[doryab_cols_loc]

print(len(doryab_cols_loc))
print(df_loc)

plt.figure()  # start a fresh figure so the location heatmap is not drawn over the Bluetooth one
sns.heatmap(df_loc, xticklabels=1)
plt.savefig('locations_doryab_values', bbox_inches='tight')
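Since the branch is about imputation and cleaning, a missingness view of the same columns may be more informative than the raw values. A hedged variant, not in the commit, reusing df_bt from the script above:

# Hypothetical addition: plot which Bluetooth Doryab cells are missing rather than their raw values.
import matplotlib.pyplot as plt
import seaborn as sns

plt.figure()
sns.heatmap(df_bt.isna(), cbar=False)   # boolean mask of missing cells
plt.savefig('bluetooth_doryab_missingness', bbox_inches='tight')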
@@ -0,0 +1,70 @@
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.preprocessing import StandardScaler
import sys

sys.path.append('/rapids/')
from src.features import cr_features_helper_methods as crhm

pd.set_option("display.max_columns", None)
features_win = pd.read_csv("data/interim/p031/empatica_temperature_features/empatica_temperature_python_cr_windows.csv", usecols=[0, 1, 2, 3, 4, 5])

# First standardization method: standardize the windows, then extract second-order features
excluded_columns = ['local_segment', 'local_segment_label', 'local_segment_start_datetime', 'local_segment_end_datetime', "empatica_temperature_cr_level_1"]
z1_windows = features_win.copy()
z1_windows.loc[:, ~z1_windows.columns.isin(excluded_columns)] = StandardScaler().fit_transform(z1_windows.loc[:, ~z1_windows.columns.isin(excluded_columns)])
z1 = crhm.extract_second_order_features(z1_windows, ['mean', 'median', 'sd', 'nlargest', 'nsmallest', 'count_windows'], prefix="empatica_temperature_cr_")
z1 = z1.iloc[:, 4:]
# print(z1)

# Second standardization method: extract second-order features from the raw windows, then standardize
so_features_reg = crhm.extract_second_order_features(features_win, ['mean', 'median', 'sd', 'nlargest', 'nsmallest', 'count_windows'], prefix="empatica_temperature_cr_")
so_features_reg = so_features_reg.iloc[:, 4:]
z2 = pd.DataFrame(StandardScaler().fit_transform(so_features_reg), columns=so_features_reg.columns)
# print(z2)

# Standardization of the first method's values
z1_z = pd.DataFrame(StandardScaler().fit_transform(z1), columns=z1.columns)
# print(z1_z)

# For SD
fig, axs = plt.subplots(3, figsize=(8, 10))
axs[0].plot(z1['empatica_temperature_cr_squareSumOfComponent_X_SO_sd'])
axs[0].set_title("Z1 - standardized windows, then extraction of SO features")

axs[1].plot(z2['empatica_temperature_cr_squareSumOfComponent_X_SO_sd'])
axs[1].set_title("Z2 - SO features extracted from 'normal' values, then standardization")

axs[2].plot(z1_z['empatica_temperature_cr_squareSumOfComponent_X_SO_sd'])
axs[2].set_title("Standardized Z1")

fig.suptitle('Z-Score methods for temperature_squareSumOfComponent_SO_sd')
plt.savefig('z_score_comparison_temperature_squareSumOfComponent_X_SO_sd', bbox_inches='tight')

showcase = pd.DataFrame()
showcase['Z1__SD'] = z1['empatica_temperature_cr_squareSumOfComponent_X_SO_sd']
showcase['Z2__SD'] = z2['empatica_temperature_cr_squareSumOfComponent_X_SO_sd']
showcase['Z1__SD_STANDARDIZED'] = z1_z['empatica_temperature_cr_squareSumOfComponent_X_SO_sd']
print(showcase)

# For nlargest
fig, axs = plt.subplots(3, figsize=(8, 10))
axs[0].plot(z1['empatica_temperature_cr_squareSumOfComponent_X_SO_nlargest'])
axs[0].set_title("Z1 - standardized windows, then extraction of SO features")

axs[1].plot(z2['empatica_temperature_cr_squareSumOfComponent_X_SO_nlargest'])
axs[1].set_title("Z2")

axs[2].plot(z1_z['empatica_temperature_cr_squareSumOfComponent_X_SO_nlargest'])
axs[2].set_title("Standardized Z1")

fig.suptitle('Z-Score methods for temperature_squareSumOfComponent_SO_nlargest')
plt.savefig('z_score_comparison_temperature_squareSumOfComponent_X_SO_nlargest', bbox_inches='tight')

showcase2 = pd.DataFrame()
showcase2['Z1__nlargest'] = z1['empatica_temperature_cr_squareSumOfComponent_X_SO_nlargest']
showcase2['Z2__nlargest'] = z2['empatica_temperature_cr_squareSumOfComponent_X_SO_nlargest']
showcase2['Z1__nlargest_STANDARDIZED'] = z1_z['empatica_temperature_cr_squareSumOfComponent_X_SO_nlargest']
print(showcase2)
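The script compares standardizing per-window values before second-order aggregation (Z1) with aggregating first and standardizing afterwards (Z2). A tiny self-contained sketch on made-up numbers, not part of the commit, showing why the two orderings generally disagree for an SD-style aggregate:

# Toy illustration: scale-then-aggregate vs aggregate-then-scale give different numbers.
import numpy as np
from sklearn.preprocessing import StandardScaler

windows = np.array([[1.0], [2.0], [3.0], [10.0]])      # hypothetical per-window feature; first two rows = segment A, last two = segment B
z_windows = StandardScaler().fit_transform(windows)    # Z1-style: standardize windows first
print(np.std(z_windows[:2]), np.std(z_windows[2:]))    # per-segment SD of standardized windows

sd_per_segment = np.array([[np.std(windows[:2])], [np.std(windows[2:])]])
print(StandardScaler().fit_transform(sd_per_segment))  # Z2-style: aggregate first, then standardize across segments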