stress_at_work_analysis/exploration/expl_light.py

# ---
# jupyter:
#   jupytext:
#     formats: ipynb,py:percent
#     text_representation:
#       extension: .py
#       format_name: percent
#       format_version: '1.3'
#       jupytext_version: 1.11.4
#   kernelspec:
#     display_name: straw2analysis
#     language: python
#     name: straw2analysis
# ---

# %%
# %matplotlib inline
import datetime
import os
import sys

import pandas as pd
import seaborn as sns
from pytz import timezone
from tabulate import tabulate

# Put the parent of the current working directory on the import path (once),
# so that imports following this point can resolve packages located there.
nb_dir = os.path.dirname(os.getcwd())
if nb_dir not in sys.path:
    sys.path.append(nb_dir)

import participants.query_db

# pytz time zone for Europe/Ljubljana; used in this notebook to express
# sensor timestamps as local datetimes.
TZ_LJ = timezone("Europe/Ljubljana")

# %%
from features.light import *

# %% [markdown]
# # Basic characteristics

# %%
# Load the light-sensor records of a single username ("nokia_0000003") and
# print them for a first look at the data.
df_light_nokia = get_light_data(["nokia_0000003"])
print(df_light_nokia)

# %%
# Load the light-sensor records for every username returned by the
# participants database query.
# NOTE(review): get_usernames() is called without arguments here, while the
# variable names suggest only inactive participants - confirm what the
# default selection of get_usernames() is.
participants_inactive_usernames = participants.query_db.get_usernames()
df_light_inactive = get_light_data(participants_inactive_usernames)

# %%
# How often each value of the reported sensor accuracy occurs.
df_light_inactive["accuracy"].value_counts()

# %% [markdown]
# From [SensorManager](https://developer.android.com/reference/android/hardware/SensorManager.html#SENSOR_STATUS_ACCURACY_HIGH):
#
# ```java
# public static final int SENSOR_STATUS_ACCURACY_HIGH
# ```
#
# This sensor is reporting data with maximum accuracy
#
# Constant Value: 3 (0x00000003)

# %%
# Summary statistics of the raw lux readings.
df_light_inactive["double_light_lux"].describe()

# %%
# Work on a shifted copy of the data: adding 1 to every reading lets readings
# of 0 still be placed on the logarithmic x-axis used for the histogram.
df_light_plot = df_light_inactive.assign(
    double_light_lux=df_light_inactive["double_light_lux"] + 1
)
sns.displot(
    df_light_plot,
    x="double_light_lux",
    height=8,
    binwidth=0.1,
    log_scale=(True, False),
)

# %% [markdown]
# The official SensorManager Light constants are:
# * Cloudy sky: 100.0
# * Full moon: 0.25
# * No moon: 0.001
# * Overcast: 10000.0
# * Shade: 20000.0
# * Sunlight: 110000.0
# * Sunlight maximum: 120000.0
# * Sunrise: 400.0
#

# %%
# Zoom in on the low end of the distribution: readings of at most 10,
# shown on a linear axis.
df_light_low = df_light_inactive.loc[df_light_inactive["double_light_lux"].le(10)]
sns.displot(df_light_low, x="double_light_lux", height=8, binwidth=0.5)

# %%
# Among the low readings, keep those below 0.5 and count how often each
# distinct value occurs.
df_light_very_low = df_light_low.loc[df_light_low["double_light_lux"].lt(0.5)]
df_light_very_low["double_light_lux"].value_counts()

# %%
# Convert the millisecond epoch timestamps to timezone-aware datetimes in
# TZ_LJ. The conversion is vectorised rather than calling
# datetime.fromtimestamp once per row: it is much faster on long recordings
# and turns missing timestamps into NaT instead of raising.
df_light_nokia["datetime_lj"] = pd.to_datetime(
    df_light_nokia["timestamp"], unit="ms", utc=True
).dt.tz_convert(TZ_LJ)
# Show the local times at which a reading of exactly 0 was recorded.
df_light_nokia.loc[df_light_nokia["double_light_lux"] == 0, ["datetime_lj"]]

# %% [markdown]
# Zeroes are present during the day. This happens when the sensor is physically blocked.

# %% [markdown]
# # Differences between participants

# %%
# Summary statistics of the lux readings for every participant/device pair.
# Aggregating the single selected column produces flat column names
# (participant_id, device_id, mean, median, std, min, max) directly.
df_light_participants = (
    df_light_inactive.groupby(["participant_id", "device_id"])["double_light_lux"]
    .agg(["mean", "median", "std", "min", "max"])
    .reset_index()
)

# %%
# Participant/device pairs whose smallest recorded reading is above zero.
df_light_participants.loc[df_light_participants["min"].gt(0)]

# %%
# Dimensions (rows, columns) of the data recorded by this one device.
df_light_inactive.loc[
    df_light_inactive["device_id"].eq("3188b03e-8b6f-45da-894e-769eed81bbda")
].shape

# %% [markdown]
# This was a Lenovo Vibe K6, but the small range of values is due to a reinstallation shortly after the first (unsuccessful) installation.

# %%
# Distribution of the per-device mean reading on a logarithmic x-axis.
sns.displot(
    df_light_participants,
    x="mean",
    binwidth=0.1,
    log_scale=(True, False),
)

# %%
# Distribution of the per-device maximum reading on a logarithmic x-axis.
sns.displot(
    df_light_participants,
    x="max",
    binwidth=0.1,
    log_scale=(True, False),
)

# %% [markdown]
# Variability in means is probably due to variability in maxima.

# %%
# Distribution of the per-device median reading, linear axes, bins of width 50.
histogram_median = sns.displot(
    df_light_participants,
    x="median",
    binwidth=50,
    log_scale=(False, False),
)

# %%
# Participant/device pairs whose median reading exceeds 10 000.
df_light_participants.loc[df_light_participants["median"].gt(1e4)]

# %% [markdown]
# This was a Cubot KingKong Mini 2 phone.

# %%
# Histogram of the per-device medians again, with the x-axis restricted to
# the range 0-600.
histogram_median = sns.displot(
    df_light_participants,
    x="median",
    binwidth=50,
    log_scale=(False, False),
)
histogram_median.set(xlim=(0, 600))

# %% [markdown]
# Other medians are much more similar.

# %%
# Standard deviation expressed as a fraction of the same device's maximum
# reading.
df_light_participants["std_rel"] = df_light_participants["std"].div(
    df_light_participants["max"]
)

# %%
# Distribution of the relative standard deviation across devices.
sns.displot(df_light_participants, x="std_rel", binwidth=0.005)

# %% [markdown]
# Relative variability is homogeneous.
#
# This means that light data needs to be standardized. Min/max standardization would probably fit best.