Writing testing scripts to determine the point of manual imputation.

notes
Primoz 2022-09-14 14:13:03 +00:00
parent dfbb758902
commit d5ab5a0394
6 changed files with 105 additions and 2 deletions


@@ -36,10 +36,10 @@ model_in = pd.concat([numerical_features, categorical_features], axis=1)
 index_columns = ["local_segment", "local_segment_label", "local_segment_start_datetime", "local_segment_end_datetime"]
 model_in.set_index(index_columns, inplace=True)
-X_train, X_test, y_train, y_test = train_test_split(model_in.drop(["target", "pid"], axis=1), model_in["target"], test_size=0.20)
+X_train, X_test, y_train, y_test = train_test_split(model_in.drop(["target", "pid"], axis=1), model_in["target"], test_size=0.30)
 automl = autosklearn.regression.AutoSklearnRegressor(
-    time_left_for_this_task=36000,
+    time_left_for_this_task=14400,
     per_run_time_limit=120
 )
 automl.fit(X_train, y_train, dataset_name='straw')

Binary file not shown (before: 248 KiB).


@@ -15,6 +15,7 @@ def deviceFeatures(devices, ownership, common_devices, features_to_compute, feat
     if "meanscans" in features_to_compute:
         features = features.join(device_value_counts.groupby("local_segment")["scans"].mean().to_frame("meanscans" + ownership), how="outer")
     if "stdscans" in features_to_compute:
+        # TODO: std scans
         features = features.join(device_value_counts.groupby("local_segment")["scans"].std().to_frame("stdscans" + ownership), how="outer")
     # Most frequent device within segments, across segments, and across dataset
     if "countscansmostfrequentdevicewithinsegments" in features_to_compute:


@@ -36,6 +36,7 @@ def variance_and_logvariance_features(location_data, location_features):
     location_data["latitude_for_wvar"] = (location_data["double_latitude"] - location_data["latitude_wavg"]) ** 2 * location_data["duration"] * 60
     location_data["longitude_for_wvar"] = (location_data["double_longitude"] - location_data["longitude_wavg"]) ** 2 * location_data["duration"] * 60
+    # TODO: location variance
     location_features["locationvariance"] = ((location_data_grouped["latitude_for_wvar"].sum() + location_data_grouped["longitude_for_wvar"].sum()) / (location_data_grouped["duration"].sum() * 60 - 1)).fillna(0)
     location_features["loglocationvariance"] = np.log10(location_features["locationvariance"]).replace(-np.inf, np.nan)
@@ -112,6 +113,8 @@ def location_entropy(location_data):
     entropy = -1 * location_data.groupby(["local_segment"])[["plogp"]].sum().rename(columns={"plogp": "locationentropy"})
     entropy["num_clusters"] = location_data.groupby(["local_segment"])["cluster_label"].nunique()
+    # TODO: normalizedlocationentropy
     entropy["normalizedlocationentropy"] = entropy["locationentropy"] / entropy["num_clusters"]
     return entropy
@@ -153,6 +156,7 @@ def doryab_features(sensor_data_files, time_segment, provider, filter_data_by_se
     # distance and speed features
     moving_data = location_data[location_data["is_stationary"] == 0].copy()
     location_features = location_features.merge(distance_and_speed_features(moving_data), how="outer", left_index=True, right_index=True)
+    # TODO: why is varspeed not already filled with 0?
     location_features[["totaldistance", "avgspeed", "varspeed"]] = location_features[["totaldistance", "avgspeed", "varspeed"]].fillna(0)
     # stationary features


@@ -0,0 +1,28 @@
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt

path = "/rapids/data/processed/features/all_participants/all_sensor_features.csv"
df = pd.read_csv(path)

# Bluetooth: heatmap of all Doryab bluetooth feature values
doryab_cols_bt = [col for col in df.columns if "bluetooth_doryab" in col]
df_bt = df[doryab_cols_bt]
print(len(doryab_cols_bt))
print(df_bt)

plt.figure()
sns.heatmap(df_bt, xticklabels=1)
plt.savefig('bluetooth_doryab_values', bbox_inches='tight')

# Location: heatmap of all Doryab location feature values
doryab_cols_loc = [col for col in df.columns if "locations_doryab" in col]
df_loc = df[doryab_cols_loc]
print(len(doryab_cols_loc))
print(df_loc)

plt.figure()  # start a new figure so the two heatmaps do not overlap
sns.heatmap(df_loc, xticklabels=1)
plt.savefig('locations_doryab_values', bbox_inches='tight')
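Since the commit is about finding where manual imputation is needed, a possible companion sketch (not part of the commit, output filename is made up) plots the missingness mask instead of the raw values; a heatmap over isna() shows exactly which rows and feature columns still contain NaNs.

import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt

path = "/rapids/data/processed/features/all_participants/all_sensor_features.csv"
df = pd.read_csv(path)
doryab_cols_loc = [col for col in df.columns if "locations_doryab" in col]

# Boolean NaN mask: colored cells mark the values that would need imputation.
plt.figure(figsize=(12, 6))
sns.heatmap(df[doryab_cols_loc].isna(), cbar=False, xticklabels=1)
plt.savefig("locations_doryab_missingness", bbox_inches="tight")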


@@ -0,0 +1,70 @@
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.preprocessing import StandardScaler
import sys
sys.path.append('/rapids/')
from src.features import cr_features_helper_methods as crhm

pd.set_option("display.max_columns", None)

features_win = pd.read_csv("data/interim/p031/empatica_temperature_features/empatica_temperature_python_cr_windows.csv", usecols=[0, 1, 2, 3, 4, 5])

# First standardization method: standardize the window-level values, then extract second-order (SO) features
excluded_columns = ['local_segment', 'local_segment_label', 'local_segment_start_datetime', 'local_segment_end_datetime', "empatica_temperature_cr_level_1"]
z1_windows = features_win.copy()
z1_windows.loc[:, ~z1_windows.columns.isin(excluded_columns)] = StandardScaler().fit_transform(z1_windows.loc[:, ~z1_windows.columns.isin(excluded_columns)])
z1 = crhm.extract_second_order_features(z1_windows, ['mean', 'median', 'sd', 'nlargest', 'nsmallest', 'count_windows'], prefix="empatica_temperature_cr_")
z1 = z1.iloc[:, 4:]
# print(z1)

# Second standardization method: extract second-order features from the raw windows, then standardize them
so_features_reg = crhm.extract_second_order_features(features_win, ['mean', 'median', 'sd', 'nlargest', 'nsmallest', 'count_windows'], prefix="empatica_temperature_cr_")
so_features_reg = so_features_reg.iloc[:, 4:]
z2 = pd.DataFrame(StandardScaler().fit_transform(so_features_reg), columns=so_features_reg.columns)
# print(z2)

# Standardization of the first standardization method's values
z1_z = pd.DataFrame(StandardScaler().fit_transform(z1), columns=z1.columns)
# print(z1_z)

# For sd
fig, axs = plt.subplots(3, figsize=(8, 10))
axs[0].plot(z1['empatica_temperature_cr_squareSumOfComponent_X_SO_sd'])
axs[0].set_title("Z1 - standardized windows, then extraction of SO features")
axs[1].plot(z2['empatica_temperature_cr_squareSumOfComponent_X_SO_sd'])
axs[1].set_title("Z2 - SO features extracted from the 'normal' values, then standardization")
axs[2].plot(z1_z['empatica_temperature_cr_squareSumOfComponent_X_SO_sd'])
axs[2].set_title("Standardized Z1")
fig.suptitle('Z-Score methods for temperature_squareSumOfComponent_SO_sd')
plt.savefig('z_score_comparison_temperature_squareSumOfComponent_X_SO_sd', bbox_inches='tight')

showcase = pd.DataFrame()
showcase['Z1__SD'] = z1['empatica_temperature_cr_squareSumOfComponent_X_SO_sd']
showcase['Z2__SD'] = z2['empatica_temperature_cr_squareSumOfComponent_X_SO_sd']
showcase['Z1__SD_STANDARDIZED'] = z1_z['empatica_temperature_cr_squareSumOfComponent_X_SO_sd']
print(showcase)

# For nlargest
fig, axs = plt.subplots(3, figsize=(8, 10))
axs[0].plot(z1['empatica_temperature_cr_squareSumOfComponent_X_SO_nlargest'])
axs[0].set_title("Z1 - standardized windows, then extraction of SO features")
axs[1].plot(z2['empatica_temperature_cr_squareSumOfComponent_X_SO_nlargest'])
axs[1].set_title("Z2")
axs[2].plot(z1_z['empatica_temperature_cr_squareSumOfComponent_X_SO_nlargest'])
axs[2].set_title("Standardized Z1")
fig.suptitle('Z-Score methods for temperature_squareSumOfComponent_SO_nlargest')
plt.savefig('z_score_comparison_temperature_squareSumOfComponent_X_SO_nlargest', bbox_inches='tight')

showcase2 = pd.DataFrame()
showcase2['Z1__nlargest'] = z1['empatica_temperature_cr_squareSumOfComponent_X_SO_nlargest']
showcase2['Z2__nlargest'] = z2['empatica_temperature_cr_squareSumOfComponent_X_SO_nlargest']
showcase2['Z1__nlargest_STANDARDIZED'] = z1_z['empatica_temperature_cr_squareSumOfComponent_X_SO_nlargest']
print(showcase2)
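To make the difference between the two orders concrete, here is a toy illustration with synthetic numbers and hypothetical column names (it does not use the cr_features_helper_methods pipeline): standardizing window-level values before aggregating (the Z1 order) does not, in general, give the same result as aggregating first and then standardizing the aggregates (the Z2 order), even for a simple mean.

import pandas as pd
from sklearn.preprocessing import StandardScaler

# Synthetic example: two segments with two windows each (made-up values).
windows = pd.DataFrame({"segment": ["a", "a", "b", "b"],
                        "feat": [1.0, 2.0, 10.0, 14.0]})

# Z1 order: z-score the window values, then take the per-segment mean.
scaled = StandardScaler().fit_transform(windows[["feat"]])[:, 0]
z1 = pd.Series(scaled, index=windows["segment"]).groupby(level=0).mean()

# Z2 order: take the per-segment mean first, then z-score the aggregates.
agg = windows.groupby("segment")["feat"].mean()
z2 = pd.Series(StandardScaler().fit_transform(agg.to_frame())[:, 0], index=agg.index)

print(pd.DataFrame({"Z1": z1, "Z2": z2}))  # the two columns differ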