diff --git a/.ipynb_checkpoints/Untitled-checkpoint.ipynb b/.ipynb_checkpoints/Untitled-checkpoint.ipynb
new file mode 100644
index 0000000..2fd6442
--- /dev/null
+++ b/.ipynb_checkpoints/Untitled-checkpoint.ipynb
@@ -0,0 +1,6 @@
+{
+ "cells": [],
+ "metadata": {},
+ "nbformat": 4,
+ "nbformat_minor": 2
+}
diff --git a/.ipynb_checkpoints/Untitled1-checkpoint.ipynb b/.ipynb_checkpoints/Untitled1-checkpoint.ipynb
new file mode 100644
index 0000000..027901f
--- /dev/null
+++ b/.ipynb_checkpoints/Untitled1-checkpoint.ipynb
@@ -0,0 +1,393 @@
+{
+ "cells": [
+ {
+ "cell_type": "code",
+ "execution_count": 1,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "import os\n",
+ "import sys\n",
+ "import matplotlib.pyplot as plt\n",
+ "import seaborn as sns\n",
+ "\n",
+ "nb_dir = os.path.split(os.getcwd())[0]\n",
+ "if nb_dir not in sys.path:\n",
+ " sys.path.append(nb_dir)\n",
+ " \n",
+ "from features.communication import *\n",
+ "import participants.query_db\n",
+ "\n",
+ "participants_inactive_usernames = participants.query_db.get_usernames()\n",
+ "df_sms = get_sms_data(participants_inactive_usernames)\n",
+ "df_calls = get_call_data(participants_inactive_usernames)"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 7,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ " count_calls = count_comms(df_calls)\n",
+ " count_sms = count_comms(df_sms)\n",
+ "\n",
+ " count_joined = count_calls.merge(\n",
+ " count_sms, on=\"participant_id\", suffixes=(\"_calls\", \"_sms\")\n",
+ " )"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 9,
+ "metadata": {},
+ "outputs": [
+ {
+ "data": {
+ "text/html": [
+ "
\n",
+ "\n",
+ "
\n",
+ " \n",
+ " \n",
+ " | \n",
+ " participant_id | \n",
+ " no_incoming | \n",
+ " no_outgoing | \n",
+ " no_missed | \n",
+ " no_all_calls | \n",
+ " no_incoming_ratio | \n",
+ " no_outgoing_ratio | \n",
+ " duration_total_incoming | \n",
+ " duration_total_outgoing | \n",
+ " duration_max_incoming | \n",
+ " duration_max_outgoing | \n",
+ " no_received | \n",
+ " no_sent | \n",
+ " no_all_sms | \n",
+ " no_received_ratio | \n",
+ " no_sent_ratio | \n",
+ "
\n",
+ " \n",
+ " \n",
+ " \n",
+ " 0 | \n",
+ " 13 | \n",
+ " 3.0 | \n",
+ " 21.0 | \n",
+ " 2.0 | \n",
+ " 26.0 | \n",
+ " 0.115385 | \n",
+ " 0.807692 | \n",
+ " 342.0 | \n",
+ " 2836.0 | \n",
+ " 196.0 | \n",
+ " 355.0 | \n",
+ " 7.0 | \n",
+ " 7.0 | \n",
+ " 14.0 | \n",
+ " 0.500000 | \n",
+ " 0.500000 | \n",
+ "
\n",
+ " \n",
+ " 1 | \n",
+ " 14 | \n",
+ " 16.0 | \n",
+ " 22.0 | \n",
+ " 11.0 | \n",
+ " 49.0 | \n",
+ " 0.326531 | \n",
+ " 0.448980 | \n",
+ " 1873.0 | \n",
+ " 2789.0 | \n",
+ " 346.0 | \n",
+ " 694.0 | \n",
+ " 20.0 | \n",
+ " 14.0 | \n",
+ " 34.0 | \n",
+ " 0.588235 | \n",
+ " 0.411765 | \n",
+ "
\n",
+ " \n",
+ " 2 | \n",
+ " 15 | \n",
+ " 3.0 | \n",
+ " 2.0 | \n",
+ " NaN | \n",
+ " 5.0 | \n",
+ " 0.600000 | \n",
+ " 0.400000 | \n",
+ " 310.0 | \n",
+ " 19.0 | \n",
+ " 154.0 | \n",
+ " 19.0 | \n",
+ " 73.0 | \n",
+ " 73.0 | \n",
+ " 146.0 | \n",
+ " 0.500000 | \n",
+ " 0.500000 | \n",
+ "
\n",
+ " \n",
+ " 3 | \n",
+ " 16 | \n",
+ " 4.0 | \n",
+ " 6.0 | \n",
+ " 3.0 | \n",
+ " 13.0 | \n",
+ " 0.307692 | \n",
+ " 0.461538 | \n",
+ " 1963.0 | \n",
+ " 849.0 | \n",
+ " 1037.0 | \n",
+ " 638.0 | \n",
+ " 8.0 | \n",
+ " 2.0 | \n",
+ " 10.0 | \n",
+ " 0.800000 | \n",
+ " 0.200000 | \n",
+ "
\n",
+ " \n",
+ " 4 | \n",
+ " 17 | \n",
+ " 20.0 | \n",
+ " 60.0 | \n",
+ " 8.0 | \n",
+ " 88.0 | \n",
+ " 0.227273 | \n",
+ " 0.681818 | \n",
+ " 5789.0 | \n",
+ " 17046.0 | \n",
+ " 1966.0 | \n",
+ " 3830.0 | \n",
+ " 7.0 | \n",
+ " 1.0 | \n",
+ " 8.0 | \n",
+ " 0.875000 | \n",
+ " 0.125000 | \n",
+ "
\n",
+ " \n",
+ " ... | \n",
+ " ... | \n",
+ " ... | \n",
+ " ... | \n",
+ " ... | \n",
+ " ... | \n",
+ " ... | \n",
+ " ... | \n",
+ " ... | \n",
+ " ... | \n",
+ " ... | \n",
+ " ... | \n",
+ " ... | \n",
+ " ... | \n",
+ " ... | \n",
+ " ... | \n",
+ " ... | \n",
+ "
\n",
+ " \n",
+ " 56 | \n",
+ " 91 | \n",
+ " 15.0 | \n",
+ " 13.0 | \n",
+ " 3.0 | \n",
+ " 31.0 | \n",
+ " 0.483871 | \n",
+ " 0.419355 | \n",
+ " 3443.0 | \n",
+ " 3636.0 | \n",
+ " 644.0 | \n",
+ " 1315.0 | \n",
+ " 83.0 | \n",
+ " 44.0 | \n",
+ " 127.0 | \n",
+ " 0.653543 | \n",
+ " 0.346457 | \n",
+ "
\n",
+ " \n",
+ " 57 | \n",
+ " 92 | \n",
+ " 3.0 | \n",
+ " 4.0 | \n",
+ " 1.0 | \n",
+ " 8.0 | \n",
+ " 0.375000 | \n",
+ " 0.500000 | \n",
+ " 231.0 | \n",
+ " 648.0 | \n",
+ " 167.0 | \n",
+ " 433.0 | \n",
+ " 4.0 | \n",
+ " 6.0 | \n",
+ " 10.0 | \n",
+ " 0.400000 | \n",
+ " 0.600000 | \n",
+ "
\n",
+ " \n",
+ " 58 | \n",
+ " 93 | \n",
+ " 22.0 | \n",
+ " 20.0 | \n",
+ " 9.0 | \n",
+ " 51.0 | \n",
+ " 0.431373 | \n",
+ " 0.392157 | \n",
+ " 2534.0 | \n",
+ " 1444.0 | \n",
+ " 443.0 | \n",
+ " 672.0 | \n",
+ " 48.0 | \n",
+ " 19.0 | \n",
+ " 67.0 | \n",
+ " 0.716418 | \n",
+ " 0.283582 | \n",
+ "
\n",
+ " \n",
+ " 59 | \n",
+ " 106 | \n",
+ " 12.0 | \n",
+ " 30.0 | \n",
+ " 6.0 | \n",
+ " 48.0 | \n",
+ " 0.250000 | \n",
+ " 0.625000 | \n",
+ " 3049.0 | \n",
+ " 2637.0 | \n",
+ " 878.0 | \n",
+ " 380.0 | \n",
+ " 10.0 | \n",
+ " 10.0 | \n",
+ " 20.0 | \n",
+ " 0.500000 | \n",
+ " 0.500000 | \n",
+ "
\n",
+ " \n",
+ " 60 | \n",
+ " 107 | \n",
+ " 11.0 | \n",
+ " 42.0 | \n",
+ " 13.0 | \n",
+ " 66.0 | \n",
+ " 0.166667 | \n",
+ " 0.636364 | \n",
+ " 3804.0 | \n",
+ " 9977.0 | \n",
+ " 1519.0 | \n",
+ " 1943.0 | \n",
+ " 80.0 | \n",
+ " 176.0 | \n",
+ " 256.0 | \n",
+ " 0.312500 | \n",
+ " 0.687500 | \n",
+ "
\n",
+ " \n",
+ "
\n",
+ "
61 rows × 16 columns
\n",
+ "
"
+ ],
+ "text/plain": [
+ " participant_id no_incoming no_outgoing no_missed no_all_calls \\\n",
+ "0 13 3.0 21.0 2.0 26.0 \n",
+ "1 14 16.0 22.0 11.0 49.0 \n",
+ "2 15 3.0 2.0 NaN 5.0 \n",
+ "3 16 4.0 6.0 3.0 13.0 \n",
+ "4 17 20.0 60.0 8.0 88.0 \n",
+ ".. ... ... ... ... ... \n",
+ "56 91 15.0 13.0 3.0 31.0 \n",
+ "57 92 3.0 4.0 1.0 8.0 \n",
+ "58 93 22.0 20.0 9.0 51.0 \n",
+ "59 106 12.0 30.0 6.0 48.0 \n",
+ "60 107 11.0 42.0 13.0 66.0 \n",
+ "\n",
+ " no_incoming_ratio no_outgoing_ratio duration_total_incoming \\\n",
+ "0 0.115385 0.807692 342.0 \n",
+ "1 0.326531 0.448980 1873.0 \n",
+ "2 0.600000 0.400000 310.0 \n",
+ "3 0.307692 0.461538 1963.0 \n",
+ "4 0.227273 0.681818 5789.0 \n",
+ ".. ... ... ... \n",
+ "56 0.483871 0.419355 3443.0 \n",
+ "57 0.375000 0.500000 231.0 \n",
+ "58 0.431373 0.392157 2534.0 \n",
+ "59 0.250000 0.625000 3049.0 \n",
+ "60 0.166667 0.636364 3804.0 \n",
+ "\n",
+ " duration_total_outgoing duration_max_incoming duration_max_outgoing \\\n",
+ "0 2836.0 196.0 355.0 \n",
+ "1 2789.0 346.0 694.0 \n",
+ "2 19.0 154.0 19.0 \n",
+ "3 849.0 1037.0 638.0 \n",
+ "4 17046.0 1966.0 3830.0 \n",
+ ".. ... ... ... \n",
+ "56 3636.0 644.0 1315.0 \n",
+ "57 648.0 167.0 433.0 \n",
+ "58 1444.0 443.0 672.0 \n",
+ "59 2637.0 878.0 380.0 \n",
+ "60 9977.0 1519.0 1943.0 \n",
+ "\n",
+ " no_received no_sent no_all_sms no_received_ratio no_sent_ratio \n",
+ "0 7.0 7.0 14.0 0.500000 0.500000 \n",
+ "1 20.0 14.0 34.0 0.588235 0.411765 \n",
+ "2 73.0 73.0 146.0 0.500000 0.500000 \n",
+ "3 8.0 2.0 10.0 0.800000 0.200000 \n",
+ "4 7.0 1.0 8.0 0.875000 0.125000 \n",
+ ".. ... ... ... ... ... \n",
+ "56 83.0 44.0 127.0 0.653543 0.346457 \n",
+ "57 4.0 6.0 10.0 0.400000 0.600000 \n",
+ "58 48.0 19.0 67.0 0.716418 0.283582 \n",
+ "59 10.0 10.0 20.0 0.500000 0.500000 \n",
+ "60 80.0 176.0 256.0 0.312500 0.687500 \n",
+ "\n",
+ "[61 rows x 16 columns]"
+ ]
+ },
+ "execution_count": 9,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": [
+ "count_joined.reset_index()"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {},
+ "outputs": [],
+ "source": []
+ }
+ ],
+ "metadata": {
+ "kernelspec": {
+ "display_name": "straw2analysis",
+ "language": "python",
+ "name": "straw2analysis"
+ },
+ "language_info": {
+ "codemirror_mode": {
+ "name": "ipython",
+ "version": 3
+ },
+ "file_extension": ".py",
+ "mimetype": "text/x-python",
+ "name": "python",
+ "nbconvert_exporter": "python",
+ "pygments_lexer": "ipython3",
+ "version": "3.9.6"
+ }
+ },
+ "nbformat": 4,
+ "nbformat_minor": 2
+}
diff --git a/Untitled.ipynb b/Untitled.ipynb
new file mode 100644
index 0000000..40567fe
--- /dev/null
+++ b/Untitled.ipynb
@@ -0,0 +1,315 @@
+{
+ "cells": [
+ {
+ "cell_type": "code",
+ "execution_count": 2,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "import os\n",
+ "import sys\n",
+ "import matplotlib.pyplot as plt\n",
+ "import seaborn as sns\n",
+ "\n",
+ "nb_dir = os.path.split(os.getcwd())[0]\n",
+ "if nb_dir not in sys.path:\n",
+ " sys.path.append(nb_dir)\n",
+ " \n",
+ "from features.communication import *\n",
+ "import participants.query_db\n",
+ "\n",
+ "participants_inactive_usernames = participants.query_db.get_usernames()\n",
+ "df_sms = get_sms_data(participants_inactive_usernames)\n",
+ "df_calls = get_call_data(participants_inactive_usernames)"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "df_calls"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 8,
+ "metadata": {},
+ "outputs": [
+ {
+ "data": {
+ "text/html": [
+ "\n",
+ "\n",
+ "
\n",
+ " \n",
+ " \n",
+ " | \n",
+ " id | \n",
+ " _id | \n",
+ " timestamp | \n",
+ " device_id | \n",
+ " call_type | \n",
+ " call_duration | \n",
+ " trace | \n",
+ " participant_id | \n",
+ " username | \n",
+ " freq | \n",
+ " contact_id | \n",
+ "
\n",
+ " \n",
+ " \n",
+ " \n",
+ " 0 | \n",
+ " 118 | \n",
+ " 1 | \n",
+ " 1581936081010 | \n",
+ " 78082f9f-98c2-468d-b4a2-7c835bd812bd | \n",
+ " 2 | \n",
+ " 0 | \n",
+ " 87ae5eb2c5b7fe30bea2821e2ec052453d89ea6b | \n",
+ " 13 | \n",
+ " uploader_20449 | \n",
+ " 21 | \n",
+ " 0 | \n",
+ "
\n",
+ " \n",
+ " 1 | \n",
+ " 176 | \n",
+ " 3 | \n",
+ " 1582295247982 | \n",
+ " 78082f9f-98c2-468d-b4a2-7c835bd812bd | \n",
+ " 1 | \n",
+ " 196 | \n",
+ " 87ae5eb2c5b7fe30bea2821e2ec052453d89ea6b | \n",
+ " 13 | \n",
+ " uploader_20449 | \n",
+ " 21 | \n",
+ " 0 | \n",
+ "
\n",
+ " \n",
+ " 2 | \n",
+ " 209 | \n",
+ " 4 | \n",
+ " 1582305634014 | \n",
+ " 78082f9f-98c2-468d-b4a2-7c835bd812bd | \n",
+ " 2 | \n",
+ " 237 | \n",
+ " 87ae5eb2c5b7fe30bea2821e2ec052453d89ea6b | \n",
+ " 13 | \n",
+ " uploader_20449 | \n",
+ " 21 | \n",
+ " 0 | \n",
+ "
\n",
+ " \n",
+ " 3 | \n",
+ " 210 | \n",
+ " 5 | \n",
+ " 1582561530334 | \n",
+ " 78082f9f-98c2-468d-b4a2-7c835bd812bd | \n",
+ " 2 | \n",
+ " 126 | \n",
+ " 87ae5eb2c5b7fe30bea2821e2ec052453d89ea6b | \n",
+ " 13 | \n",
+ " uploader_20449 | \n",
+ " 21 | \n",
+ " 0 | \n",
+ "
\n",
+ " \n",
+ " 4 | \n",
+ " 253 | \n",
+ " 6 | \n",
+ " 1582627576077 | \n",
+ " 78082f9f-98c2-468d-b4a2-7c835bd812bd | \n",
+ " 2 | \n",
+ " 255 | \n",
+ " 87ae5eb2c5b7fe30bea2821e2ec052453d89ea6b | \n",
+ " 13 | \n",
+ " uploader_20449 | \n",
+ " 21 | \n",
+ " 0 | \n",
+ "
\n",
+ " \n",
+ " ... | \n",
+ " ... | \n",
+ " ... | \n",
+ " ... | \n",
+ " ... | \n",
+ " ... | \n",
+ " ... | \n",
+ " ... | \n",
+ " ... | \n",
+ " ... | \n",
+ " ... | \n",
+ " ... | \n",
+ "
\n",
+ " \n",
+ " 4645 | \n",
+ " 5874 | \n",
+ " 131 | \n",
+ " 1625602519843 | \n",
+ " 4b62a655-cbf0-4ac0-a448-06726f45b56a | \n",
+ " 2 | \n",
+ " 2270 | \n",
+ " f4ebca8dc7305fe424d6bf7fbcd2e5086f98b453 | \n",
+ " 90 | \n",
+ " uploader_53573 | \n",
+ " 6 | \n",
+ " 9 | \n",
+ "
\n",
+ " \n",
+ " 4646 | \n",
+ " 5882 | \n",
+ " 139 | \n",
+ " 1625753023456 | \n",
+ " 4b62a655-cbf0-4ac0-a448-06726f45b56a | \n",
+ " 3 | \n",
+ " 0 | \n",
+ " 2e5d63f6fddca2b66be810b5946c42eda24f2dbe | \n",
+ " 90 | \n",
+ " uploader_53573 | \n",
+ " 2 | \n",
+ " 13 | \n",
+ "
\n",
+ " \n",
+ " 4647 | \n",
+ " 5883 | \n",
+ " 140 | \n",
+ " 1625754998767 | \n",
+ " 4b62a655-cbf0-4ac0-a448-06726f45b56a | \n",
+ " 3 | \n",
+ " 0 | \n",
+ " 2e5d63f6fddca2b66be810b5946c42eda24f2dbe | \n",
+ " 90 | \n",
+ " uploader_53573 | \n",
+ " 2 | \n",
+ " 13 | \n",
+ "
\n",
+ " \n",
+ " 4648 | \n",
+ " 5884 | \n",
+ " 141 | \n",
+ " 1625823008392 | \n",
+ " 4b62a655-cbf0-4ac0-a448-06726f45b56a | \n",
+ " 2 | \n",
+ " 0 | \n",
+ " 7316d58b7bb7de097a2421c56010ac024a489451 | \n",
+ " 90 | \n",
+ " uploader_53573 | \n",
+ " 1 | \n",
+ " 21 | \n",
+ "
\n",
+ " \n",
+ " 4649 | \n",
+ " 5903 | \n",
+ " 158 | \n",
+ " 1626110930233 | \n",
+ " 4b62a655-cbf0-4ac0-a448-06726f45b56a | \n",
+ " 2 | \n",
+ " 53 | \n",
+ " 7db4e9acf7c73837ddecdae5da523a28c774ba94 | \n",
+ " 90 | \n",
+ " uploader_53573 | \n",
+ " 1 | \n",
+ " 24 | \n",
+ "
\n",
+ " \n",
+ "
\n",
+ "
4650 rows × 11 columns
\n",
+ "
"
+ ],
+ "text/plain": [
+ " id _id timestamp device_id \\\n",
+ "0 118 1 1581936081010 78082f9f-98c2-468d-b4a2-7c835bd812bd \n",
+ "1 176 3 1582295247982 78082f9f-98c2-468d-b4a2-7c835bd812bd \n",
+ "2 209 4 1582305634014 78082f9f-98c2-468d-b4a2-7c835bd812bd \n",
+ "3 210 5 1582561530334 78082f9f-98c2-468d-b4a2-7c835bd812bd \n",
+ "4 253 6 1582627576077 78082f9f-98c2-468d-b4a2-7c835bd812bd \n",
+ "... ... ... ... ... \n",
+ "4645 5874 131 1625602519843 4b62a655-cbf0-4ac0-a448-06726f45b56a \n",
+ "4646 5882 139 1625753023456 4b62a655-cbf0-4ac0-a448-06726f45b56a \n",
+ "4647 5883 140 1625754998767 4b62a655-cbf0-4ac0-a448-06726f45b56a \n",
+ "4648 5884 141 1625823008392 4b62a655-cbf0-4ac0-a448-06726f45b56a \n",
+ "4649 5903 158 1626110930233 4b62a655-cbf0-4ac0-a448-06726f45b56a \n",
+ "\n",
+ " call_type call_duration trace \\\n",
+ "0 2 0 87ae5eb2c5b7fe30bea2821e2ec052453d89ea6b \n",
+ "1 1 196 87ae5eb2c5b7fe30bea2821e2ec052453d89ea6b \n",
+ "2 2 237 87ae5eb2c5b7fe30bea2821e2ec052453d89ea6b \n",
+ "3 2 126 87ae5eb2c5b7fe30bea2821e2ec052453d89ea6b \n",
+ "4 2 255 87ae5eb2c5b7fe30bea2821e2ec052453d89ea6b \n",
+ "... ... ... ... \n",
+ "4645 2 2270 f4ebca8dc7305fe424d6bf7fbcd2e5086f98b453 \n",
+ "4646 3 0 2e5d63f6fddca2b66be810b5946c42eda24f2dbe \n",
+ "4647 3 0 2e5d63f6fddca2b66be810b5946c42eda24f2dbe \n",
+ "4648 2 0 7316d58b7bb7de097a2421c56010ac024a489451 \n",
+ "4649 2 53 7db4e9acf7c73837ddecdae5da523a28c774ba94 \n",
+ "\n",
+ " participant_id username freq contact_id \n",
+ "0 13 uploader_20449 21 0 \n",
+ "1 13 uploader_20449 21 0 \n",
+ "2 13 uploader_20449 21 0 \n",
+ "3 13 uploader_20449 21 0 \n",
+ "4 13 uploader_20449 21 0 \n",
+ "... ... ... ... ... \n",
+ "4645 90 uploader_53573 6 9 \n",
+ "4646 90 uploader_53573 2 13 \n",
+ "4647 90 uploader_53573 2 13 \n",
+ "4648 90 uploader_53573 1 21 \n",
+ "4649 90 uploader_53573 1 24 \n",
+ "\n",
+ "[4650 rows x 11 columns]"
+ ]
+ },
+ "execution_count": 8,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": [
+ "contact_features(enumerate_contacts(df_calls))"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {},
+ "outputs": [],
+ "source": []
+ }
+ ],
+ "metadata": {
+ "kernelspec": {
+ "display_name": "straw2analysis",
+ "language": "python",
+ "name": "straw2analysis"
+ },
+ "language_info": {
+ "codemirror_mode": {
+ "name": "ipython",
+ "version": 3
+ },
+ "file_extension": ".py",
+ "mimetype": "text/x-python",
+ "name": "python",
+ "nbconvert_exporter": "python",
+ "pygments_lexer": "ipython3",
+ "version": "3.9.6"
+ }
+ },
+ "nbformat": 4,
+ "nbformat_minor": 2
+}
diff --git a/Untitled1.ipynb b/Untitled1.ipynb
new file mode 100644
index 0000000..8ac4962
--- /dev/null
+++ b/Untitled1.ipynb
@@ -0,0 +1,788 @@
+{
+ "cells": [
+ {
+ "cell_type": "code",
+ "execution_count": 1,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "import os\n",
+ "import sys\n",
+ "import matplotlib.pyplot as plt\n",
+ "import seaborn as sns\n",
+ "\n",
+ "nb_dir = os.path.split(os.getcwd())[0]\n",
+ "if nb_dir not in sys.path:\n",
+ " sys.path.append(nb_dir)\n",
+ " \n",
+ "from features.communication import *\n",
+ "import participants.query_db\n",
+ "\n",
+ "participants_inactive_usernames = participants.query_db.get_usernames()\n",
+ "df_sms = get_sms_data(participants_inactive_usernames)\n",
+ "df_calls = get_call_data(participants_inactive_usernames)"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 2,
+ "metadata": {},
+ "outputs": [
+ {
+ "data": {
+ "text/html": [
+ "\n",
+ "\n",
+ "
\n",
+ " \n",
+ " \n",
+ " | \n",
+ " id | \n",
+ " _id | \n",
+ " timestamp | \n",
+ " device_id | \n",
+ " call_type | \n",
+ " call_duration | \n",
+ " trace | \n",
+ " participant_id | \n",
+ " username | \n",
+ " freq | \n",
+ " contact_id | \n",
+ " total_call_duration | \n",
+ " no_contacts | \n",
+ "
\n",
+ " \n",
+ " \n",
+ " \n",
+ " 0 | \n",
+ " 118 | \n",
+ " 1 | \n",
+ " 1581936081010 | \n",
+ " 78082f9f-98c2-468d-b4a2-7c835bd812bd | \n",
+ " 2 | \n",
+ " 0 | \n",
+ " 87ae5eb2c5b7fe30bea2821e2ec052453d89ea6b | \n",
+ " 13 | \n",
+ " uploader_20449 | \n",
+ " 21 | \n",
+ " 0 | \n",
+ " 2844 | \n",
+ " 5 | \n",
+ "
\n",
+ " \n",
+ " 1 | \n",
+ " 176 | \n",
+ " 3 | \n",
+ " 1582295247982 | \n",
+ " 78082f9f-98c2-468d-b4a2-7c835bd812bd | \n",
+ " 1 | \n",
+ " 196 | \n",
+ " 87ae5eb2c5b7fe30bea2821e2ec052453d89ea6b | \n",
+ " 13 | \n",
+ " uploader_20449 | \n",
+ " 21 | \n",
+ " 0 | \n",
+ " 2844 | \n",
+ " 5 | \n",
+ "
\n",
+ " \n",
+ " 2 | \n",
+ " 209 | \n",
+ " 4 | \n",
+ " 1582305634014 | \n",
+ " 78082f9f-98c2-468d-b4a2-7c835bd812bd | \n",
+ " 2 | \n",
+ " 237 | \n",
+ " 87ae5eb2c5b7fe30bea2821e2ec052453d89ea6b | \n",
+ " 13 | \n",
+ " uploader_20449 | \n",
+ " 21 | \n",
+ " 0 | \n",
+ " 2844 | \n",
+ " 5 | \n",
+ "
\n",
+ " \n",
+ " 3 | \n",
+ " 210 | \n",
+ " 5 | \n",
+ " 1582561530334 | \n",
+ " 78082f9f-98c2-468d-b4a2-7c835bd812bd | \n",
+ " 2 | \n",
+ " 126 | \n",
+ " 87ae5eb2c5b7fe30bea2821e2ec052453d89ea6b | \n",
+ " 13 | \n",
+ " uploader_20449 | \n",
+ " 21 | \n",
+ " 0 | \n",
+ " 2844 | \n",
+ " 5 | \n",
+ "
\n",
+ " \n",
+ " 4 | \n",
+ " 253 | \n",
+ " 6 | \n",
+ " 1582627576077 | \n",
+ " 78082f9f-98c2-468d-b4a2-7c835bd812bd | \n",
+ " 2 | \n",
+ " 255 | \n",
+ " 87ae5eb2c5b7fe30bea2821e2ec052453d89ea6b | \n",
+ " 13 | \n",
+ " uploader_20449 | \n",
+ " 21 | \n",
+ " 0 | \n",
+ " 2844 | \n",
+ " 5 | \n",
+ "
\n",
+ " \n",
+ " ... | \n",
+ " ... | \n",
+ " ... | \n",
+ " ... | \n",
+ " ... | \n",
+ " ... | \n",
+ " ... | \n",
+ " ... | \n",
+ " ... | \n",
+ " ... | \n",
+ " ... | \n",
+ " ... | \n",
+ " ... | \n",
+ " ... | \n",
+ "
\n",
+ " \n",
+ " 4352 | \n",
+ " 5560 | \n",
+ " 59 | \n",
+ " 1621275689589 | \n",
+ " 9f54e35c-d7cb-4f4c-8dc1-17dc86f2635e | \n",
+ " 2 | \n",
+ " 0 | \n",
+ " 5eb72fe829c2af4a654007220119bdcf47499555 | \n",
+ " 107 | \n",
+ " uploader_89606 | \n",
+ " 1 | \n",
+ " 17 | \n",
+ " 0 | \n",
+ " 22 | \n",
+ "
\n",
+ " \n",
+ " 4336 | \n",
+ " 5443 | \n",
+ " 29 | \n",
+ " 1620746142636 | \n",
+ " 9f54e35c-d7cb-4f4c-8dc1-17dc86f2635e | \n",
+ " 2 | \n",
+ " 189 | \n",
+ " 6dd761532337dfe596eb2e34f4c91216b38e28e2 | \n",
+ " 107 | \n",
+ " uploader_89606 | \n",
+ " 1 | \n",
+ " 18 | \n",
+ " 189 | \n",
+ " 22 | \n",
+ "
\n",
+ " \n",
+ " 4316 | \n",
+ " 5237 | \n",
+ " 10 | \n",
+ " 1620140109908 | \n",
+ " 9f54e35c-d7cb-4f4c-8dc1-17dc86f2635e | \n",
+ " 2 | \n",
+ " 85 | \n",
+ " 9c4eab1dfc0114aecd64a7f594977acc9ab7936c | \n",
+ " 107 | \n",
+ " uploader_89606 | \n",
+ " 1 | \n",
+ " 19 | \n",
+ " 85 | \n",
+ " 22 | \n",
+ "
\n",
+ " \n",
+ " 4347 | \n",
+ " 5524 | \n",
+ " 44 | \n",
+ " 1620971679122 | \n",
+ " 9f54e35c-d7cb-4f4c-8dc1-17dc86f2635e | \n",
+ " 1 | \n",
+ " 120 | \n",
+ " a9fa73b6137d09288429de20172095978730e4b8 | \n",
+ " 107 | \n",
+ " uploader_89606 | \n",
+ " 1 | \n",
+ " 20 | \n",
+ " 120 | \n",
+ " 22 | \n",
+ "
\n",
+ " \n",
+ " 4331 | \n",
+ " 5364 | \n",
+ " 26 | \n",
+ " 1620630328635 | \n",
+ " 9f54e35c-d7cb-4f4c-8dc1-17dc86f2635e | \n",
+ " 2 | \n",
+ " 184 | \n",
+ " cfe98eee4a27b377f4cde1ea5c39d24d0475b533 | \n",
+ " 107 | \n",
+ " uploader_89606 | \n",
+ " 1 | \n",
+ " 21 | \n",
+ " 184 | \n",
+ " 22 | \n",
+ "
\n",
+ " \n",
+ "
\n",
+ "
4650 rows × 13 columns
\n",
+ "
"
+ ],
+ "text/plain": [
+ " id _id timestamp device_id \\\n",
+ "0 118 1 1581936081010 78082f9f-98c2-468d-b4a2-7c835bd812bd \n",
+ "1 176 3 1582295247982 78082f9f-98c2-468d-b4a2-7c835bd812bd \n",
+ "2 209 4 1582305634014 78082f9f-98c2-468d-b4a2-7c835bd812bd \n",
+ "3 210 5 1582561530334 78082f9f-98c2-468d-b4a2-7c835bd812bd \n",
+ "4 253 6 1582627576077 78082f9f-98c2-468d-b4a2-7c835bd812bd \n",
+ "... ... ... ... ... \n",
+ "4352 5560 59 1621275689589 9f54e35c-d7cb-4f4c-8dc1-17dc86f2635e \n",
+ "4336 5443 29 1620746142636 9f54e35c-d7cb-4f4c-8dc1-17dc86f2635e \n",
+ "4316 5237 10 1620140109908 9f54e35c-d7cb-4f4c-8dc1-17dc86f2635e \n",
+ "4347 5524 44 1620971679122 9f54e35c-d7cb-4f4c-8dc1-17dc86f2635e \n",
+ "4331 5364 26 1620630328635 9f54e35c-d7cb-4f4c-8dc1-17dc86f2635e \n",
+ "\n",
+ " call_type call_duration trace \\\n",
+ "0 2 0 87ae5eb2c5b7fe30bea2821e2ec052453d89ea6b \n",
+ "1 1 196 87ae5eb2c5b7fe30bea2821e2ec052453d89ea6b \n",
+ "2 2 237 87ae5eb2c5b7fe30bea2821e2ec052453d89ea6b \n",
+ "3 2 126 87ae5eb2c5b7fe30bea2821e2ec052453d89ea6b \n",
+ "4 2 255 87ae5eb2c5b7fe30bea2821e2ec052453d89ea6b \n",
+ "... ... ... ... \n",
+ "4352 2 0 5eb72fe829c2af4a654007220119bdcf47499555 \n",
+ "4336 2 189 6dd761532337dfe596eb2e34f4c91216b38e28e2 \n",
+ "4316 2 85 9c4eab1dfc0114aecd64a7f594977acc9ab7936c \n",
+ "4347 1 120 a9fa73b6137d09288429de20172095978730e4b8 \n",
+ "4331 2 184 cfe98eee4a27b377f4cde1ea5c39d24d0475b533 \n",
+ "\n",
+ " participant_id username freq contact_id total_call_duration \\\n",
+ "0 13 uploader_20449 21 0 2844 \n",
+ "1 13 uploader_20449 21 0 2844 \n",
+ "2 13 uploader_20449 21 0 2844 \n",
+ "3 13 uploader_20449 21 0 2844 \n",
+ "4 13 uploader_20449 21 0 2844 \n",
+ "... ... ... ... ... ... \n",
+ "4352 107 uploader_89606 1 17 0 \n",
+ "4336 107 uploader_89606 1 18 189 \n",
+ "4316 107 uploader_89606 1 19 85 \n",
+ "4347 107 uploader_89606 1 20 120 \n",
+ "4331 107 uploader_89606 1 21 184 \n",
+ "\n",
+ " no_contacts \n",
+ "0 5 \n",
+ "1 5 \n",
+ "2 5 \n",
+ "3 5 \n",
+ "4 5 \n",
+ "... ... \n",
+ "4352 22 \n",
+ "4336 22 \n",
+ "4316 22 \n",
+ "4347 22 \n",
+ "4331 22 \n",
+ "\n",
+ "[4650 rows x 13 columns]"
+ ]
+ },
+ "execution_count": 2,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": [
+ "contact_features(enumerate_contacts(df_calls))"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 3,
+ "metadata": {},
+ "outputs": [
+ {
+ "data": {
+ "text/html": [
+ "\n",
+ "\n",
+ "
\n",
+ " \n",
+ " \n",
+ " | \n",
+ " id | \n",
+ " _id | \n",
+ " timestamp | \n",
+ " device_id | \n",
+ " message_type | \n",
+ " trace | \n",
+ " participant_id | \n",
+ " username | \n",
+ " freq | \n",
+ " contact_id | \n",
+ " no_contacts | \n",
+ "
\n",
+ " \n",
+ " \n",
+ " \n",
+ " 195 | \n",
+ " 379 | \n",
+ " 7 | \n",
+ " 1582964434597 | \n",
+ " 78082f9f-98c2-468d-b4a2-7c835bd812bd | \n",
+ " 2 | \n",
+ " 417b9c87f5b573530bcffba8577777b3a964d671 | \n",
+ " 13 | \n",
+ " uploader_20449 | \n",
+ " 5 | \n",
+ " 0 | \n",
+ " 6 | \n",
+ "
\n",
+ " \n",
+ " 196 | \n",
+ " 380 | \n",
+ " 8 | \n",
+ " 1582964434974 | \n",
+ " 78082f9f-98c2-468d-b4a2-7c835bd812bd | \n",
+ " 2 | \n",
+ " 417b9c87f5b573530bcffba8577777b3a964d671 | \n",
+ " 13 | \n",
+ " uploader_20449 | \n",
+ " 5 | \n",
+ " 0 | \n",
+ " 6 | \n",
+ "
\n",
+ " \n",
+ " 197 | \n",
+ " 382 | \n",
+ " 10 | \n",
+ " 1582965988609 | \n",
+ " 78082f9f-98c2-468d-b4a2-7c835bd812bd | \n",
+ " 2 | \n",
+ " 417b9c87f5b573530bcffba8577777b3a964d671 | \n",
+ " 13 | \n",
+ " uploader_20449 | \n",
+ " 5 | \n",
+ " 0 | \n",
+ " 6 | \n",
+ "
\n",
+ " \n",
+ " 198 | \n",
+ " 383 | \n",
+ " 11 | \n",
+ " 1582965988873 | \n",
+ " 78082f9f-98c2-468d-b4a2-7c835bd812bd | \n",
+ " 2 | \n",
+ " 417b9c87f5b573530bcffba8577777b3a964d671 | \n",
+ " 13 | \n",
+ " uploader_20449 | \n",
+ " 5 | \n",
+ " 0 | \n",
+ " 6 | \n",
+ "
\n",
+ " \n",
+ " 199 | \n",
+ " 396 | \n",
+ " 12 | \n",
+ " 1582965988873 | \n",
+ " 78082f9f-98c2-468d-b4a2-7c835bd812bd | \n",
+ " 2 | \n",
+ " 417b9c87f5b573530bcffba8577777b3a964d671 | \n",
+ " 13 | \n",
+ " uploader_20449 | \n",
+ " 5 | \n",
+ " 0 | \n",
+ " 6 | \n",
+ "
\n",
+ " \n",
+ " ... | \n",
+ " ... | \n",
+ " ... | \n",
+ " ... | \n",
+ " ... | \n",
+ " ... | \n",
+ " ... | \n",
+ " ... | \n",
+ " ... | \n",
+ " ... | \n",
+ " ... | \n",
+ " ... | \n",
+ "
\n",
+ " \n",
+ " 5193 | \n",
+ " 6137 | \n",
+ " 4 | \n",
+ " 1619789360665 | \n",
+ " 9f54e35c-d7cb-4f4c-8dc1-17dc86f2635e | \n",
+ " 2 | \n",
+ " 2340c1d2b9e5d550373423a599014468a4dc3678 | \n",
+ " 107 | \n",
+ " uploader_89606 | \n",
+ " 3 | \n",
+ " 12 | \n",
+ " 16 | \n",
+ "
\n",
+ " \n",
+ " 5194 | \n",
+ " 6135 | \n",
+ " 2 | \n",
+ " 1619787273829 | \n",
+ " 9f54e35c-d7cb-4f4c-8dc1-17dc86f2635e | \n",
+ " 1 | \n",
+ " 2340c1d2b9e5d550373423a599014468a4dc3678 | \n",
+ " 107 | \n",
+ " uploader_89606 | \n",
+ " 3 | \n",
+ " 12 | \n",
+ " 16 | \n",
+ "
\n",
+ " \n",
+ " 5417 | \n",
+ " 6690 | \n",
+ " 210 | \n",
+ " 1620980437198 | \n",
+ " 9f54e35c-d7cb-4f4c-8dc1-17dc86f2635e | \n",
+ " 1 | \n",
+ " 198d7e63a2b4d8a7ca9bb92f74e6974ca17edc56 | \n",
+ " 107 | \n",
+ " uploader_89606 | \n",
+ " 1 | \n",
+ " 13 | \n",
+ " 16 | \n",
+ "
\n",
+ " \n",
+ " 5447 | \n",
+ " 6770 | \n",
+ " 256 | \n",
+ " 1621407668019 | \n",
+ " 9f54e35c-d7cb-4f4c-8dc1-17dc86f2635e | \n",
+ " 1 | \n",
+ " d4a67b53e704247de47064850efd3647e8dcaffb | \n",
+ " 107 | \n",
+ " uploader_89606 | \n",
+ " 1 | \n",
+ " 14 | \n",
+ " 16 | \n",
+ "
\n",
+ " \n",
+ " 5440 | \n",
+ " 6742 | \n",
+ " 248 | \n",
+ " 1621253313544 | \n",
+ " 9f54e35c-d7cb-4f4c-8dc1-17dc86f2635e | \n",
+ " 1 | \n",
+ " ffbc6a5f0f601cf2d9cdad9d3a588633e1a1967c | \n",
+ " 107 | \n",
+ " uploader_89606 | \n",
+ " 1 | \n",
+ " 15 | \n",
+ " 16 | \n",
+ "
\n",
+ " \n",
+ "
\n",
+ "
5864 rows × 11 columns
\n",
+ "
"
+ ],
+ "text/plain": [
+ " id _id timestamp device_id \\\n",
+ "195 379 7 1582964434597 78082f9f-98c2-468d-b4a2-7c835bd812bd \n",
+ "196 380 8 1582964434974 78082f9f-98c2-468d-b4a2-7c835bd812bd \n",
+ "197 382 10 1582965988609 78082f9f-98c2-468d-b4a2-7c835bd812bd \n",
+ "198 383 11 1582965988873 78082f9f-98c2-468d-b4a2-7c835bd812bd \n",
+ "199 396 12 1582965988873 78082f9f-98c2-468d-b4a2-7c835bd812bd \n",
+ "... ... ... ... ... \n",
+ "5193 6137 4 1619789360665 9f54e35c-d7cb-4f4c-8dc1-17dc86f2635e \n",
+ "5194 6135 2 1619787273829 9f54e35c-d7cb-4f4c-8dc1-17dc86f2635e \n",
+ "5417 6690 210 1620980437198 9f54e35c-d7cb-4f4c-8dc1-17dc86f2635e \n",
+ "5447 6770 256 1621407668019 9f54e35c-d7cb-4f4c-8dc1-17dc86f2635e \n",
+ "5440 6742 248 1621253313544 9f54e35c-d7cb-4f4c-8dc1-17dc86f2635e \n",
+ "\n",
+ " message_type trace participant_id \\\n",
+ "195 2 417b9c87f5b573530bcffba8577777b3a964d671 13 \n",
+ "196 2 417b9c87f5b573530bcffba8577777b3a964d671 13 \n",
+ "197 2 417b9c87f5b573530bcffba8577777b3a964d671 13 \n",
+ "198 2 417b9c87f5b573530bcffba8577777b3a964d671 13 \n",
+ "199 2 417b9c87f5b573530bcffba8577777b3a964d671 13 \n",
+ "... ... ... ... \n",
+ "5193 2 2340c1d2b9e5d550373423a599014468a4dc3678 107 \n",
+ "5194 1 2340c1d2b9e5d550373423a599014468a4dc3678 107 \n",
+ "5417 1 198d7e63a2b4d8a7ca9bb92f74e6974ca17edc56 107 \n",
+ "5447 1 d4a67b53e704247de47064850efd3647e8dcaffb 107 \n",
+ "5440 1 ffbc6a5f0f601cf2d9cdad9d3a588633e1a1967c 107 \n",
+ "\n",
+ " username freq contact_id no_contacts \n",
+ "195 uploader_20449 5 0 6 \n",
+ "196 uploader_20449 5 0 6 \n",
+ "197 uploader_20449 5 0 6 \n",
+ "198 uploader_20449 5 0 6 \n",
+ "199 uploader_20449 5 0 6 \n",
+ "... ... ... ... ... \n",
+ "5193 uploader_89606 3 12 16 \n",
+ "5194 uploader_89606 3 12 16 \n",
+ "5417 uploader_89606 1 13 16 \n",
+ "5447 uploader_89606 1 14 16 \n",
+ "5440 uploader_89606 1 15 16 \n",
+ "\n",
+ "[5864 rows x 11 columns]"
+ ]
+ },
+ "execution_count": 3,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": [
+ "contact_features(enumerate_contacts(df_sms))"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 4,
+ "metadata": {
+ "scrolled": true
+ },
+ "outputs": [
+ {
+ "data": {
+ "text/html": [
+ "\n",
+ "\n",
+ "
\n",
+ " \n",
+ " \n",
+ " | \n",
+ " participant_id | \n",
+ " no_calls_no_sms_ratio | \n",
+ " no_incoming_calls_no_recieved_sms_ratio | \n",
+ " no_outgoing_calls_no_sent_sms_ratio | \n",
+ " no_calls_contacts_no_sms_contacts_ratio | \n",
+ "
\n",
+ " \n",
+ " \n",
+ " \n",
+ " 0 | \n",
+ " 13 | \n",
+ " 0.650000 | \n",
+ " 0.700000 | \n",
+ " 0.750000 | \n",
+ " 0.454545 | \n",
+ "
\n",
+ " \n",
+ " 1 | \n",
+ " 14 | \n",
+ " 0.590361 | \n",
+ " 0.555556 | \n",
+ " 0.611111 | \n",
+ " 0.714286 | \n",
+ "
\n",
+ " \n",
+ " 2 | \n",
+ " 15 | \n",
+ " 0.033113 | \n",
+ " 0.960526 | \n",
+ " 0.026667 | \n",
+ " 0.173913 | \n",
+ "
\n",
+ " \n",
+ " 3 | \n",
+ " 16 | \n",
+ " 0.565217 | \n",
+ " 0.666667 | \n",
+ " 0.750000 | \n",
+ " 0.666667 | \n",
+ "
\n",
+ " \n",
+ " 4 | \n",
+ " 17 | \n",
+ " 0.916667 | \n",
+ " 0.259259 | \n",
+ " 0.983607 | \n",
+ " 0.857143 | \n",
+ "
\n",
+ " \n",
+ " ... | \n",
+ " ... | \n",
+ " ... | \n",
+ " ... | \n",
+ " ... | \n",
+ " ... | \n",
+ "
\n",
+ " \n",
+ " 56 | \n",
+ " 91 | \n",
+ " 0.196203 | \n",
+ " 0.846939 | \n",
+ " 0.228070 | \n",
+ " 0.666667 | \n",
+ "
\n",
+ " \n",
+ " 57 | \n",
+ " 92 | \n",
+ " 0.444444 | \n",
+ " 0.571429 | \n",
+ " 0.400000 | \n",
+ " 0.600000 | \n",
+ "
\n",
+ " \n",
+ " 58 | \n",
+ " 93 | \n",
+ " 0.432203 | \n",
+ " 0.685714 | \n",
+ " 0.512821 | \n",
+ " 0.428571 | \n",
+ "
\n",
+ " \n",
+ " 59 | \n",
+ " 106 | \n",
+ " 0.705882 | \n",
+ " 0.454545 | \n",
+ " 0.750000 | \n",
+ " 0.769231 | \n",
+ "
\n",
+ " \n",
+ " 60 | \n",
+ " 107 | \n",
+ " 0.204969 | \n",
+ " 0.879121 | \n",
+ " 0.192661 | \n",
+ " 0.578947 | \n",
+ "
\n",
+ " \n",
+ "
\n",
+ "
61 rows × 5 columns
\n",
+ "
"
+ ],
+ "text/plain": [
+ " participant_id no_calls_no_sms_ratio \\\n",
+ "0 13 0.650000 \n",
+ "1 14 0.590361 \n",
+ "2 15 0.033113 \n",
+ "3 16 0.565217 \n",
+ "4 17 0.916667 \n",
+ ".. ... ... \n",
+ "56 91 0.196203 \n",
+ "57 92 0.444444 \n",
+ "58 93 0.432203 \n",
+ "59 106 0.705882 \n",
+ "60 107 0.204969 \n",
+ "\n",
+ " no_incoming_calls_no_recieved_sms_ratio \\\n",
+ "0 0.700000 \n",
+ "1 0.555556 \n",
+ "2 0.960526 \n",
+ "3 0.666667 \n",
+ "4 0.259259 \n",
+ ".. ... \n",
+ "56 0.846939 \n",
+ "57 0.571429 \n",
+ "58 0.685714 \n",
+ "59 0.454545 \n",
+ "60 0.879121 \n",
+ "\n",
+ " no_outgoing_calls_no_sent_sms_ratio \\\n",
+ "0 0.750000 \n",
+ "1 0.611111 \n",
+ "2 0.026667 \n",
+ "3 0.750000 \n",
+ "4 0.983607 \n",
+ ".. ... \n",
+ "56 0.228070 \n",
+ "57 0.400000 \n",
+ "58 0.512821 \n",
+ "59 0.750000 \n",
+ "60 0.192661 \n",
+ "\n",
+ " no_calls_contacts_no_sms_contacts_ratio \n",
+ "0 0.454545 \n",
+ "1 0.714286 \n",
+ "2 0.173913 \n",
+ "3 0.666667 \n",
+ "4 0.857143 \n",
+ ".. ... \n",
+ "56 0.666667 \n",
+ "57 0.600000 \n",
+ "58 0.428571 \n",
+ "59 0.769231 \n",
+ "60 0.578947 \n",
+ "\n",
+ "[61 rows x 5 columns]"
+ ]
+ },
+ "execution_count": 4,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": [
+ "calls_sms_features(df_calls, df_sms)"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {},
+ "outputs": [],
+ "source": []
+ }
+ ],
+ "metadata": {
+ "kernelspec": {
+ "display_name": "straw2analysis",
+ "language": "python",
+ "name": "straw2analysis"
+ },
+ "language_info": {
+ "codemirror_mode": {
+ "name": "ipython",
+ "version": 3
+ },
+ "file_extension": ".py",
+ "mimetype": "text/x-python",
+ "name": "python",
+ "nbconvert_exporter": "python",
+ "pygments_lexer": "ipython3",
+ "version": "3.9.6"
+ }
+ },
+ "nbformat": 4,
+ "nbformat_minor": 2
+}
diff --git a/exploration/expl_communication.py b/exploration/expl_communication.py
index e1a0c27..6a6ebc8 100644
--- a/exploration/expl_communication.py
+++ b/exploration/expl_communication.py
@@ -6,7 +6,7 @@
# extension: .py
# format_name: percent
# format_version: '1.3'
-# jupytext_version: 1.11.2
+# jupytext_version: 1.11.4
# kernelspec:
# display_name: straw2analysis
# language: python
@@ -39,6 +39,9 @@ print(df_calls)
# %%
count_comms(df_calls)
+# %%
+enumerate_contacts(df_calls)
+
# %%
df_sms = get_sms_data(["nokia_0000003"])
count_comms(df_sms)
@@ -53,6 +56,15 @@ import participants.query_db
participants_inactive_usernames = participants.query_db.get_usernames()
df_calls_inactive = get_call_data(participants_inactive_usernames)
+# %%
+participants_inactive_usernames
+
+# %%
+df_calls_inactive.head()
+
+# %%
+enumerate_contacts(df_calls_inactive).head()
+
# %%
df_calls_features = count_comms(df_calls_inactive)
df_calls_features.head()
@@ -70,6 +82,9 @@ calls_number = pd.wide_to_long(
suffix="\D+",
)
+# %%
+calls_number
+
# %%
sns.displot(calls_number, x="no", hue="call_type", binwidth=5, element="step", height=8)
diff --git a/features/communication.py b/features/communication.py
index d1dc4ee..a3c917c 100644
--- a/features/communication.py
+++ b/features/communication.py
@@ -86,7 +86,8 @@ def enumerate_contacts(comm_df: pd.DataFrame) -> pd.DataFrame:
# In other words, recode the contacts into integers from 0 to n_contacts,
# so that the first one is contacted the most often.
contact_ids = (
- contact_counts.groupby("participant_id") # Group again for enumeration.
+ # Group again for enumeration.
+ contact_counts.groupby("participant_id")
.cumcount() # Enumerate (count) rows *within* participants.
.to_frame("contact_id")
)
@@ -150,8 +151,10 @@ def count_comms(comm_df: pd.DataFrame) -> pd.DataFrame:
comm_features = comm_counts.join(comm_duration_total)
comm_features = comm_features.join(comm_duration_max)
try:
- comm_features.drop(columns="duration_total_" + call_types[3], inplace=True)
- comm_features.drop(columns="duration_max_" + call_types[3], inplace=True)
+ comm_features.drop(columns="duration_total_" +
+ call_types[3], inplace=True)
+ comm_features.drop(columns="duration_max_" +
+ call_types[3], inplace=True)
# The missed calls are always of 0 duration.
except KeyError:
pass
@@ -172,19 +175,145 @@ def count_comms(comm_df: pd.DataFrame) -> pd.DataFrame:
)
# Ratio of incoming and outgoing messages to all messages.
else:
- raise KeyError("The dataframe contains neither call_type or message_type")
+ raise KeyError(
+ "The dataframe contains neither call_type or message_type")
return comm_features
-def contact_features():
- # TODO Implement a method that takes a DF with enumerated contacts as argument and calculates:
- # * Duration of calls per caller (for most common callers)
- # * Determine work vs non-work contacts by work hours heuristics
- # * Number of people contacted
- # And similarly for SMS.
- pass
+def contact_features(df_enumerated: pd.DataFrame) -> pd.DataFrame:
+    """
+    Counts the number of people contacted (for each participant) and, if
+    df_enumerated is a dataframe containing calls data, the total duration
+    of calls between a participant and each of her contacts.
+
+    Parameters
+    ----------
+    df_enumerated: pd.DataFrame
+        A dataframe of calls or SMSes; return of function enumerate_contacts.
+
+    Returns
+    -------
+    df_enumerated: pd.DataFrame
+        The altered dataframe with the column no_contacts and, if df_enumerated
+        contains calls data, an additional column total_call_duration.
+    """
+
+    # Check whether df contains calls or SMS data since some
+    # features we want to calculate are type-specific.
+    if "call_duration" in df_enumerated:
+        # Add a column with the total duration of calls between two people
+        duration_count = (
+            df_enumerated.groupby(
+                ["participant_id", "contact_id"]
+            )
+            # For each participant and for each contact, sum durations of their calls
+            ["call_duration"].sum()
+            .reset_index()  # Turn the group keys (participant_id, contact_id) back into normal columns
+            .rename(columns={"call_duration": "total_call_duration"})
+        )
+        # The new dataframe now holds one row per (participant, contact) pair
+        # with the total duration of their calls. Merging it back repeats that
+        # total on every original row of the same pair.
+        df_enumerated = df_enumerated.merge(
+            duration_count,
+            on=["participant_id", "contact_id"]
+        )
+
+    contact_count = (
+        df_enumerated.groupby(["participant_id"])
+        .nunique()["contact_id"]  # For each participant, count the number of distinct contacts
+        .reset_index()  # Turn the group key (participant_id) back into a normal column
+        .rename(columns={"contact_id": "no_contacts"})
+    )
+
+    df_enumerated = (
+        # Merge df with the newly created df containing the number of contacts
+        df_enumerated.merge(contact_count, on="participant_id")
+        # Sort first by participant_id and then by contact_id.
+        # NOTE(review): presumably this matches the ordering of the input - confirm.
+        .sort_values(["participant_id", "contact_id"])
+    )
+
+    # TODO: Determine work vs non-work contacts by work hours heuristics
+
+    return df_enumerated
-def calls_sms_features():
- # TODO Relate the calls and sms data, such as comparing the number of (missed) calls and messages.
- pass
+def calls_sms_features(df_calls: pd.DataFrame, df_sms: pd.DataFrame) -> pd.DataFrame:
+    """
+    Calculates additional features relating calls and sms data.
+
+    Parameters
+    ----------
+    df_calls: pd.DataFrame
+        A dataframe of calls (return of get_call_data).
+    df_sms: pd.DataFrame
+        A dataframe of SMSes (return of get_sms_data).
+
+    Returns
+    -------
+    df_calls_sms: pd.DataFrame
+        The list of features relating calls and sms data for every participant.
+        Participants missing from either input are dropped, because all the
+        merges below are inner joins. The features are:
+        * no_calls_no_sms_ratio:
+            proportion of calls in total number of communications
+        * no_incoming_calls_no_recieved_sms_ratio:
+            proportion of incoming calls in total number of incoming/received communications
+        * no_outgoing_calls_no_sent_sms_ratio:
+            proportion of outgoing calls in total number of outgoing/sent communications
+        * no_calls_contacts_no_sms_contacts_ratio:
+            proportion of calls contacts in total number of communication contacts
+    """
+
+    count_calls = count_comms(df_calls)
+    count_sms = count_comms(df_sms)
+
+    count_joined = (
+        count_calls.merge(
+            count_sms, on="participant_id", suffixes=("_calls", "_sms")
+        )  # Merge calls and sms features
+        .reset_index()  # Make participant_id a regular column
+        .assign(
+            no_calls_no_sms_ratio=(
+                lambda x: x.no_all_calls / (x.no_all_calls + x.no_all_sms)
+            ),
+            # NOTE: "recieved" is misspelled, but the column name is part of
+            # the returned dataframe, so it is kept for compatibility.
+            # The numerator must be the number of incoming *calls*, in line
+            # with the other two ratios (calls / (calls + SMS)).
+            no_incoming_calls_no_recieved_sms_ratio=(
+                lambda x: x.no_incoming / (x.no_incoming + x.no_received)
+            ),
+            no_outgoing_calls_no_sent_sms_ratio=(
+                lambda x: x.no_outgoing / (x.no_outgoing + x.no_sent)
+            ),  # Calculate new features and create additional columns
+        )[
+            [
+                "participant_id",
+                "no_calls_no_sms_ratio",
+                "no_incoming_calls_no_recieved_sms_ratio",
+                "no_outgoing_calls_no_sent_sms_ratio",
+            ]
+        ]  # Keep only the relevant features
+    )
+
+    features_calls = contact_features(enumerate_contacts(df_calls))
+    features_sms = contact_features(enumerate_contacts(df_sms))
+
+    # contact_features() repeats no_contacts on every communication row, so
+    # reduce each dataframe to a single value per participant *before* joining.
+    # Merging the row-level dataframes directly would pair every call with
+    # every SMS of a participant, only for the duplicates to be dropped again.
+    no_contacts_calls = (
+        features_calls.groupby("participant_id")["no_contacts"]
+        .first()
+        .to_frame("no_contacts_calls")
+    )
+    no_contacts_sms = (
+        features_sms.groupby("participant_id")["no_contacts"]
+        .first()
+        .to_frame("no_contacts_sms")
+    )
+
+    features_joined = (
+        # Keep only participants that have both calls and SMS data
+        no_contacts_calls.join(no_contacts_sms, how="inner")
+        .reset_index()  # Make participant_id a regular column
+        .assign(
+            no_calls_contacts_no_sms_contacts_ratio=(
+                lambda x: x.no_contacts_calls
+                / (x.no_contacts_calls + x.no_contacts_sms)
+            )  # Calculate new features and create additional columns
+        )[
+            ["participant_id", "no_calls_contacts_no_sms_contacts_ratio"]
+        ]  # Keep only the relevant features
+    )
+
+    # Join the newly created dataframes
+    df_calls_sms = count_joined.merge(features_joined, on="participant_id")
+
+    return df_calls_sms