"""
This script creates the multiple timezones file based on the timezone table collected by the AWARE app.

Input: timezone table collected by the AWARE app

---

Expected output:

| Column      | Description                                                                                                   |
|-------------|---------------------------------------------------------------------------------------------------------------|
| device_id   | A string that uniquely identifies a smartphone or wearable                                                    |
| tzcode      | A string with the appropriate code from the list of tz database time zones (https://en.wikipedia.org/wiki/List_of_tz_database_time_zones) that represents the time zone where the device sensed data |
| timestamp   | A UNIX timestamp indicating the first time this device_id sensed data in tzcode                               |

How to run it?

1. Put the timezone table (timezone.csv) collected by the AWARE app, together with the first table of the
   "List of tz database time zones" Wikipedia page saved as wiki_tz.csv, under the data/external folder
2. Run python tools/create_multi_timezones_file.py
"""

import re
|
|
import pandas as pd
|
|
|
|
# Load the timezone table collected by the AWARE app
|
|
data = pd.read_csv("data/external/timezone.csv")
|
|
# Load the first table of the List of tz database time zones page in wiki (Link: https://en.wikipedia.org/wiki/List_of_tz_database_time_zones)
|
|
wiki_tz = pd.read_csv("data/external/wiki_tz.csv")
|
|
|
|
data["tzcode"] = data["timezone"].str.extract("("+"|".join(wiki_tz["TZ database name"].tolist())+")", expand=False)
|
|
data = data[["device_id", "tzcode", "timestamp"]]
|
|
|
|
# Sort by device_id and timestamp
|
|
data.sort_values(by=["device_id", "timestamp"], inplace=True)
|
|
# Only keep the first & last row for consecutive rows with the same tzcode per device_id
|
|
data_first = data.loc[(data["device_id"].shift(1) != data["device_id"]) | (data["tzcode"].shift(1) != data["tzcode"])]
|
|
data_last = data.loc[(data["device_id"].shift(-1) != data["device_id"]) | (data["tzcode"].shift(-1) != data["tzcode"])]
|
|
data = pd.concat([data_first, data_last], axis=0)
|
|
# Drop duplicates and sort by device_id and timestamp
|
|
data = data.drop_duplicates()
|
|
data.sort_values(by=["device_id", "timestamp"], inplace=True)
|
|
|
|
data.to_csv("data/external/multiple_timezones.csv", index=False)
|