# stress_at_work_analysis/features/proximity.py
#
# 95 lines
# 3.1 KiB
# Python

from collections.abc import Collection
import pandas as pd
from config.models import Participant, Proximity
from setup import db_engine, session
# Names of the feature columns in the dataframe returned by count_proximity().
FEATURES_PROXIMITY = ["freq_prox_near", "prop_prox_near"]
def get_proximity_data(usernames: Collection) -> pd.DataFrame:
    """
    Load proximity sensor records for the given participants into a dataframe.

    Parameters
    ----------
    usernames: Collection
        Usernames of the participants whose records should be selected
        (used in the WHERE ... IN condition of the query).

    Returns
    -------
    df_proximity: pd.DataFrame
        The Proximity columns together with the participant's username.
    """
    # Build the query first: Proximity rows matched to their participant,
    # restricted to the requested usernames.
    proximity_with_username = session.query(Proximity, Participant.username)
    matched_to_participant = proximity_with_username.filter(
        Participant.id == Proximity.participant_id
    )
    selected_participants = matched_to_participant.filter(
        Participant.username.in_(usernames)
    )

    # The connection is only needed while pandas executes the statement.
    with db_engine.connect() as connection:
        return pd.read_sql(selected_participants.statement, connection)
def recode_proximity(df_proximity: pd.DataFrame) -> pd.DataFrame:
    """
    Recode the proximity reading from a double into a boolean "near" flag.

    Different proximity sensors report different values,
    but in our data only several distinct values have ever been found.
    These are therefore converted into "near" and "far" binary values.
    See expl_proximity.ipynb for additional info.

    Parameters
    ----------
    df_proximity: pd.DataFrame
        A dataframe of proximity data with a double_proximity column.

    Returns
    -------
    df_proximity: pd.DataFrame
        A copy of the dataframe with an additional column bool_prox_near,
        which is True where double_proximity equals 0 ("near")
        and False otherwise ("far").
    """
    # A reading of exactly 0 is treated as "near"; everything else is "far".
    is_near = df_proximity.double_proximity == 0
    return df_proximity.assign(bool_prox_near=is_near)
def count_proximity(
    df_proximity: pd.DataFrame, group_by: Collection = None
) -> pd.DataFrame:
    """
    Count "near" proximity values and their proportion of all values per group.

    The function counts how many times a "near" value occurs in proximity
    and calculates the proportion of this count to all proximity values
    (i.e. relative count).

    Parameters
    ----------
    df_proximity: pd.DataFrame
        A dataframe of proximity data. If it has no bool_prox_near column,
        it is first recoded with recode_proximity().
        The dataframe passed in is never modified.
    group_by: Collection
        A list of strings, specifying by which parameters to group.
        By default, the features are calculated per participant
        (["participant_id"]), but could be "date_lj" etc.

    Returns
    -------
    df_proximity_features: pd.DataFrame
        A dataframe indexed by the grouping keys with two columns:
        freq_prox_near, the count of "near" proximity values, and
        prop_prox_near, their relative count among all values in the group.
    """
    if group_by is None:
        group_by = ["participant_id"]
    if "bool_prox_near" not in df_proximity:
        df_proximity = recode_proximity(df_proximity)

    # assign() works on a copy, so the helper column never leaks into the
    # dataframe owned by the caller.
    df_proximity = df_proximity.assign(bool_prox_far=lambda x: ~x.bool_prox_near)

    # Select the two flag columns *before* summing: this avoids aggregating
    # unrelated columns, which is wasteful and fails for non-summable dtypes
    # (e.g. datetimes) in recent pandas versions.
    df_proximity_features = df_proximity.groupby(group_by)[
        ["bool_prox_near", "bool_prox_far"]
    ].sum()

    # near + far equals the number of proximity values in the group.
    df_proximity_features = df_proximity_features.assign(
        prop_prox_near=lambda x: x.bool_prox_near / (x.bool_prox_near + x.bool_prox_far)
    )
    df_proximity_features = df_proximity_features.rename(
        columns={"bool_prox_near": "freq_prox_near"}
    ).drop(columns="bool_prox_far")
    return df_proximity_features