rapids/tools/create_multi_timezones_file.py

"""
This script can create the multiple timezones file based on timezone table collected by the AWARE app.

Input: timezone table collected by the AWARE app
---
Expected output:

| Column      | Description                                                                                                   |
|-------------|---------------------------------------------------------------------------------------------------------------|
| device_id   | A string that uniquely identifies a smartphone or wearable                                                    |
| tzcode      | A string with the appropriate code from this list that represents the time zone where the device sensed data  |
| timestamp   | A UNIX timestamp indicating when was the first time this device_id sensed data in tzcode                      |

How to run it?
1. Put the timezone table (timezone.csv) collected by the AWARE app under data/external folder
2. Run python tools/create_multi_timezones_file.py

"""

import re
import pandas as pd

# Load the timezone table collected by the AWARE app
data = pd.read_csv("data/external/timezone.csv")
# Load the first table of the List of tz database time zones page in wiki (Link: https://en.wikipedia.org/wiki/List_of_tz_database_time_zones)
wiki_tz = pd.read_csv("data/external/wiki_tz.csv")

data["tzcode"] = data["timezone"].str.extract("("+"|".join(wiki_tz["TZ database name"].tolist())+")", expand=False)
data = data[["device_id", "tzcode", "timestamp"]]

# Sort by device_id and timestamp
data.sort_values(by=["device_id", "timestamp"], inplace=True)
# Only keep the first & last row for consecutive rows with the same tzcode per device_id
data_first = data.loc[(data["device_id"].shift(1) != data["device_id"]) | (data["tzcode"].shift(1) != data["tzcode"])]
data_last = data.loc[(data["device_id"].shift(-1) != data["device_id"]) | (data["tzcode"].shift(-1) != data["tzcode"])]
data = pd.concat([data_first, data_last], axis=0)
# Drop duplicates and sort by device_id and timestamp
data = data.drop_duplicates()
data.sort_values(by=["device_id", "timestamp"], inplace=True)

data.to_csv("data/external/multiple_timezones.csv", index=False)
Add script & docs to create multi timezones file 2021-03-25 21:22:25 +01:00			`"""`
			`This script can create the multiple timezones file based on timezone table collected by the AWARE app.`

			`Input: timezone table collected by the AWARE app`
			`---`
			`Expected output:`

			`\| Column \| Description \|`
			`\|-------------\|---------------------------------------------------------------------------------------------------------------\|`
			`\| device_id \| A string that uniquely identifies a smartphone or wearable \|`
			`\| tzcode \| A string with the appropriate code from this list that represents the time zone where the device sensed data \|`
			`\| timestamp \| A UNIX timestamp indicating when was the first time this device_id sensed data in tzcode \|`

			`How to run it?`
			`1. Put the timezone table (timezone.csv) collected by the AWARE app under data/external folder`
			`2. Run python tools/create_multi_timezones_file.py`

			`"""`

			`import re`
			`import pandas as pd`

			`# Load the timezone table collected by the AWARE app`
			`data = pd.read_csv("data/external/timezone.csv")`
			`# Load the first table of the List of tz database time zones page in wiki (Link: https://en.wikipedia.org/wiki/List_of_tz_database_time_zones)`
			`wiki_tz = pd.read_csv("data/external/wiki_tz.csv")`

			`data["tzcode"] = data["timezone"].str.extract("("+"\|".join(wiki_tz["TZ database name"].tolist())+")", expand=False)`
			`data = data[["device_id", "tzcode", "timestamp"]]`

			`# Sort by device_id and timestamp`
			`data.sort_values(by=["device_id", "timestamp"], inplace=True)`
			`# Only keep the first & last row for consecutive rows with the same tzcode per device_id`
			`data_first = data.loc[(data["device_id"].shift(1) != data["device_id"]) \| (data["tzcode"].shift(1) != data["tzcode"])]`
			`data_last = data.loc[(data["device_id"].shift(-1) != data["device_id"]) \| (data["tzcode"].shift(-1) != data["tzcode"])]`
			`data = pd.concat([data_first, data_last], axis=0)`
			`# Drop duplicates and sort by device_id and timestamp`
			`data = data.drop_duplicates()`
			`data.sort_values(by=["device_id", "timestamp"], inplace=True)`

			`data.to_csv("data/external/multiple_timezones.csv", index=False)`