diff --git a/.ipynb_checkpoints/Untitled-checkpoint.ipynb b/.ipynb_checkpoints/Untitled-checkpoint.ipynb new file mode 100644 index 0000000..2fd6442 --- /dev/null +++ b/.ipynb_checkpoints/Untitled-checkpoint.ipynb @@ -0,0 +1,6 @@ +{ + "cells": [], + "metadata": {}, + "nbformat": 4, + "nbformat_minor": 2 +} diff --git a/.ipynb_checkpoints/Untitled1-checkpoint.ipynb b/.ipynb_checkpoints/Untitled1-checkpoint.ipynb new file mode 100644 index 0000000..027901f --- /dev/null +++ b/.ipynb_checkpoints/Untitled1-checkpoint.ipynb @@ -0,0 +1,393 @@ +{ + "cells": [ + { + "cell_type": "code", + "execution_count": 1, + "metadata": {}, + "outputs": [], + "source": [ + "import os\n", + "import sys\n", + "import matplotlib.pyplot as plt\n", + "import seaborn as sns\n", + "\n", + "nb_dir = os.path.split(os.getcwd())[0]\n", + "if nb_dir not in sys.path:\n", + " sys.path.append(nb_dir)\n", + " \n", + "from features.communication import *\n", + "import participants.query_db\n", + "\n", + "participants_inactive_usernames = participants.query_db.get_usernames()\n", + "df_sms = get_sms_data(participants_inactive_usernames)\n", + "df_calls = get_call_data(participants_inactive_usernames)" + ] + }, + { + "cell_type": "code", + "execution_count": 7, + "metadata": {}, + "outputs": [], + "source": [ + " count_calls = count_comms(df_calls)\n", + " count_sms = count_comms(df_sms)\n", + "\n", + " count_joined = count_calls.merge(\n", + " count_sms, on=\"participant_id\", suffixes=(\"_calls\", \"_sms\")\n", + " )" + ] + }, + { + "cell_type": "code", + "execution_count": 9, + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " 
\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
participant_idno_incomingno_outgoingno_missedno_all_callsno_incoming_rationo_outgoing_ratioduration_total_incomingduration_total_outgoingduration_max_incomingduration_max_outgoingno_receivedno_sentno_all_smsno_received_rationo_sent_ratio
0133.021.02.026.00.1153850.807692342.02836.0196.0355.07.07.014.00.5000000.500000
11416.022.011.049.00.3265310.4489801873.02789.0346.0694.020.014.034.00.5882350.411765
2153.02.0NaN5.00.6000000.400000310.019.0154.019.073.073.0146.00.5000000.500000
3164.06.03.013.00.3076920.4615381963.0849.01037.0638.08.02.010.00.8000000.200000
41720.060.08.088.00.2272730.6818185789.017046.01966.03830.07.01.08.00.8750000.125000
...................................................
569115.013.03.031.00.4838710.4193553443.03636.0644.01315.083.044.0127.00.6535430.346457
57923.04.01.08.00.3750000.500000231.0648.0167.0433.04.06.010.00.4000000.600000
589322.020.09.051.00.4313730.3921572534.01444.0443.0672.048.019.067.00.7164180.283582
5910612.030.06.048.00.2500000.6250003049.02637.0878.0380.010.010.020.00.5000000.500000
6010711.042.013.066.00.1666670.6363643804.09977.01519.01943.080.0176.0256.00.3125000.687500
\n", + "

61 rows × 16 columns

\n", + "
" + ], + "text/plain": [ + " participant_id no_incoming no_outgoing no_missed no_all_calls \\\n", + "0 13 3.0 21.0 2.0 26.0 \n", + "1 14 16.0 22.0 11.0 49.0 \n", + "2 15 3.0 2.0 NaN 5.0 \n", + "3 16 4.0 6.0 3.0 13.0 \n", + "4 17 20.0 60.0 8.0 88.0 \n", + ".. ... ... ... ... ... \n", + "56 91 15.0 13.0 3.0 31.0 \n", + "57 92 3.0 4.0 1.0 8.0 \n", + "58 93 22.0 20.0 9.0 51.0 \n", + "59 106 12.0 30.0 6.0 48.0 \n", + "60 107 11.0 42.0 13.0 66.0 \n", + "\n", + " no_incoming_ratio no_outgoing_ratio duration_total_incoming \\\n", + "0 0.115385 0.807692 342.0 \n", + "1 0.326531 0.448980 1873.0 \n", + "2 0.600000 0.400000 310.0 \n", + "3 0.307692 0.461538 1963.0 \n", + "4 0.227273 0.681818 5789.0 \n", + ".. ... ... ... \n", + "56 0.483871 0.419355 3443.0 \n", + "57 0.375000 0.500000 231.0 \n", + "58 0.431373 0.392157 2534.0 \n", + "59 0.250000 0.625000 3049.0 \n", + "60 0.166667 0.636364 3804.0 \n", + "\n", + " duration_total_outgoing duration_max_incoming duration_max_outgoing \\\n", + "0 2836.0 196.0 355.0 \n", + "1 2789.0 346.0 694.0 \n", + "2 19.0 154.0 19.0 \n", + "3 849.0 1037.0 638.0 \n", + "4 17046.0 1966.0 3830.0 \n", + ".. ... ... ... \n", + "56 3636.0 644.0 1315.0 \n", + "57 648.0 167.0 433.0 \n", + "58 1444.0 443.0 672.0 \n", + "59 2637.0 878.0 380.0 \n", + "60 9977.0 1519.0 1943.0 \n", + "\n", + " no_received no_sent no_all_sms no_received_ratio no_sent_ratio \n", + "0 7.0 7.0 14.0 0.500000 0.500000 \n", + "1 20.0 14.0 34.0 0.588235 0.411765 \n", + "2 73.0 73.0 146.0 0.500000 0.500000 \n", + "3 8.0 2.0 10.0 0.800000 0.200000 \n", + "4 7.0 1.0 8.0 0.875000 0.125000 \n", + ".. ... ... ... ... ... 
\n", + "56 83.0 44.0 127.0 0.653543 0.346457 \n", + "57 4.0 6.0 10.0 0.400000 0.600000 \n", + "58 48.0 19.0 67.0 0.716418 0.283582 \n", + "59 10.0 10.0 20.0 0.500000 0.500000 \n", + "60 80.0 176.0 256.0 0.312500 0.687500 \n", + "\n", + "[61 rows x 16 columns]" + ] + }, + "execution_count": 9, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "count_joined.reset_index()" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [] + } + ], + "metadata": { + "kernelspec": { + "display_name": "straw2analysis", + "language": "python", + "name": "straw2analysis" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.9.6" + } + }, + "nbformat": 4, + "nbformat_minor": 2 +} diff --git a/Untitled.ipynb b/Untitled.ipynb new file mode 100644 index 0000000..40567fe --- /dev/null +++ b/Untitled.ipynb @@ -0,0 +1,315 @@ +{ + "cells": [ + { + "cell_type": "code", + "execution_count": 2, + "metadata": {}, + "outputs": [], + "source": [ + "import os\n", + "import sys\n", + "import matplotlib.pyplot as plt\n", + "import seaborn as sns\n", + "\n", + "nb_dir = os.path.split(os.getcwd())[0]\n", + "if nb_dir not in sys.path:\n", + " sys.path.append(nb_dir)\n", + " \n", + "from features.communication import *\n", + "import participants.query_db\n", + "\n", + "participants_inactive_usernames = participants.query_db.get_usernames()\n", + "df_sms = get_sms_data(participants_inactive_usernames)\n", + "df_calls = get_call_data(participants_inactive_usernames)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "df_calls" + ] + }, + { + "cell_type": "code", + "execution_count": 8, + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
id_idtimestampdevice_idcall_typecall_durationtraceparticipant_idusernamefreqcontact_id
01181158193608101078082f9f-98c2-468d-b4a2-7c835bd812bd2087ae5eb2c5b7fe30bea2821e2ec052453d89ea6b13uploader_20449210
11763158229524798278082f9f-98c2-468d-b4a2-7c835bd812bd119687ae5eb2c5b7fe30bea2821e2ec052453d89ea6b13uploader_20449210
22094158230563401478082f9f-98c2-468d-b4a2-7c835bd812bd223787ae5eb2c5b7fe30bea2821e2ec052453d89ea6b13uploader_20449210
32105158256153033478082f9f-98c2-468d-b4a2-7c835bd812bd212687ae5eb2c5b7fe30bea2821e2ec052453d89ea6b13uploader_20449210
42536158262757607778082f9f-98c2-468d-b4a2-7c835bd812bd225587ae5eb2c5b7fe30bea2821e2ec052453d89ea6b13uploader_20449210
....................................
4645587413116256025198434b62a655-cbf0-4ac0-a448-06726f45b56a22270f4ebca8dc7305fe424d6bf7fbcd2e5086f98b45390uploader_5357369
4646588213916257530234564b62a655-cbf0-4ac0-a448-06726f45b56a302e5d63f6fddca2b66be810b5946c42eda24f2dbe90uploader_53573213
4647588314016257549987674b62a655-cbf0-4ac0-a448-06726f45b56a302e5d63f6fddca2b66be810b5946c42eda24f2dbe90uploader_53573213
4648588414116258230083924b62a655-cbf0-4ac0-a448-06726f45b56a207316d58b7bb7de097a2421c56010ac024a48945190uploader_53573121
4649590315816261109302334b62a655-cbf0-4ac0-a448-06726f45b56a2537db4e9acf7c73837ddecdae5da523a28c774ba9490uploader_53573124
\n", + "

4650 rows × 11 columns

\n", + "
" + ], + "text/plain": [ + " id _id timestamp device_id \\\n", + "0 118 1 1581936081010 78082f9f-98c2-468d-b4a2-7c835bd812bd \n", + "1 176 3 1582295247982 78082f9f-98c2-468d-b4a2-7c835bd812bd \n", + "2 209 4 1582305634014 78082f9f-98c2-468d-b4a2-7c835bd812bd \n", + "3 210 5 1582561530334 78082f9f-98c2-468d-b4a2-7c835bd812bd \n", + "4 253 6 1582627576077 78082f9f-98c2-468d-b4a2-7c835bd812bd \n", + "... ... ... ... ... \n", + "4645 5874 131 1625602519843 4b62a655-cbf0-4ac0-a448-06726f45b56a \n", + "4646 5882 139 1625753023456 4b62a655-cbf0-4ac0-a448-06726f45b56a \n", + "4647 5883 140 1625754998767 4b62a655-cbf0-4ac0-a448-06726f45b56a \n", + "4648 5884 141 1625823008392 4b62a655-cbf0-4ac0-a448-06726f45b56a \n", + "4649 5903 158 1626110930233 4b62a655-cbf0-4ac0-a448-06726f45b56a \n", + "\n", + " call_type call_duration trace \\\n", + "0 2 0 87ae5eb2c5b7fe30bea2821e2ec052453d89ea6b \n", + "1 1 196 87ae5eb2c5b7fe30bea2821e2ec052453d89ea6b \n", + "2 2 237 87ae5eb2c5b7fe30bea2821e2ec052453d89ea6b \n", + "3 2 126 87ae5eb2c5b7fe30bea2821e2ec052453d89ea6b \n", + "4 2 255 87ae5eb2c5b7fe30bea2821e2ec052453d89ea6b \n", + "... ... ... ... \n", + "4645 2 2270 f4ebca8dc7305fe424d6bf7fbcd2e5086f98b453 \n", + "4646 3 0 2e5d63f6fddca2b66be810b5946c42eda24f2dbe \n", + "4647 3 0 2e5d63f6fddca2b66be810b5946c42eda24f2dbe \n", + "4648 2 0 7316d58b7bb7de097a2421c56010ac024a489451 \n", + "4649 2 53 7db4e9acf7c73837ddecdae5da523a28c774ba94 \n", + "\n", + " participant_id username freq contact_id \n", + "0 13 uploader_20449 21 0 \n", + "1 13 uploader_20449 21 0 \n", + "2 13 uploader_20449 21 0 \n", + "3 13 uploader_20449 21 0 \n", + "4 13 uploader_20449 21 0 \n", + "... ... ... ... ... 
\n", + "4645 90 uploader_53573 6 9 \n", + "4646 90 uploader_53573 2 13 \n", + "4647 90 uploader_53573 2 13 \n", + "4648 90 uploader_53573 1 21 \n", + "4649 90 uploader_53573 1 24 \n", + "\n", + "[4650 rows x 11 columns]" + ] + }, + "execution_count": 8, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "contact_features(enumerate_contacts(df_calls))" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [] + } + ], + "metadata": { + "kernelspec": { + "display_name": "straw2analysis", + "language": "python", + "name": "straw2analysis" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.9.6" + } + }, + "nbformat": 4, + "nbformat_minor": 2 +} diff --git a/Untitled1.ipynb b/Untitled1.ipynb new file mode 100644 index 0000000..8ac4962 --- /dev/null +++ b/Untitled1.ipynb @@ -0,0 +1,788 @@ +{ + "cells": [ + { + "cell_type": "code", + "execution_count": 1, + "metadata": {}, + "outputs": [], + "source": [ + "import os\n", + "import sys\n", + "import matplotlib.pyplot as plt\n", + "import seaborn as sns\n", + "\n", + "nb_dir = os.path.split(os.getcwd())[0]\n", + "if nb_dir not in sys.path:\n", + " sys.path.append(nb_dir)\n", + " \n", + "from features.communication import *\n", + "import participants.query_db\n", + "\n", + "participants_inactive_usernames = participants.query_db.get_usernames()\n", + "df_sms = get_sms_data(participants_inactive_usernames)\n", + "df_calls = get_call_data(participants_inactive_usernames)" + ] + }, + { + "cell_type": "code", + "execution_count": 2, + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
id_idtimestampdevice_idcall_typecall_durationtraceparticipant_idusernamefreqcontact_idtotal_call_durationno_contacts
01181158193608101078082f9f-98c2-468d-b4a2-7c835bd812bd2087ae5eb2c5b7fe30bea2821e2ec052453d89ea6b13uploader_2044921028445
11763158229524798278082f9f-98c2-468d-b4a2-7c835bd812bd119687ae5eb2c5b7fe30bea2821e2ec052453d89ea6b13uploader_2044921028445
22094158230563401478082f9f-98c2-468d-b4a2-7c835bd812bd223787ae5eb2c5b7fe30bea2821e2ec052453d89ea6b13uploader_2044921028445
32105158256153033478082f9f-98c2-468d-b4a2-7c835bd812bd212687ae5eb2c5b7fe30bea2821e2ec052453d89ea6b13uploader_2044921028445
42536158262757607778082f9f-98c2-468d-b4a2-7c835bd812bd225587ae5eb2c5b7fe30bea2821e2ec052453d89ea6b13uploader_2044921028445
..........................................
435255605916212756895899f54e35c-d7cb-4f4c-8dc1-17dc86f2635e205eb72fe829c2af4a654007220119bdcf47499555107uploader_89606117022
433654432916207461426369f54e35c-d7cb-4f4c-8dc1-17dc86f2635e21896dd761532337dfe596eb2e34f4c91216b38e28e2107uploader_8960611818922
431652371016201401099089f54e35c-d7cb-4f4c-8dc1-17dc86f2635e2859c4eab1dfc0114aecd64a7f594977acc9ab7936c107uploader_896061198522
434755244416209716791229f54e35c-d7cb-4f4c-8dc1-17dc86f2635e1120a9fa73b6137d09288429de20172095978730e4b8107uploader_8960612012022
433153642616206303286359f54e35c-d7cb-4f4c-8dc1-17dc86f2635e2184cfe98eee4a27b377f4cde1ea5c39d24d0475b533107uploader_8960612118422
\n", + "

4650 rows × 13 columns

\n", + "
" + ], + "text/plain": [ + " id _id timestamp device_id \\\n", + "0 118 1 1581936081010 78082f9f-98c2-468d-b4a2-7c835bd812bd \n", + "1 176 3 1582295247982 78082f9f-98c2-468d-b4a2-7c835bd812bd \n", + "2 209 4 1582305634014 78082f9f-98c2-468d-b4a2-7c835bd812bd \n", + "3 210 5 1582561530334 78082f9f-98c2-468d-b4a2-7c835bd812bd \n", + "4 253 6 1582627576077 78082f9f-98c2-468d-b4a2-7c835bd812bd \n", + "... ... ... ... ... \n", + "4352 5560 59 1621275689589 9f54e35c-d7cb-4f4c-8dc1-17dc86f2635e \n", + "4336 5443 29 1620746142636 9f54e35c-d7cb-4f4c-8dc1-17dc86f2635e \n", + "4316 5237 10 1620140109908 9f54e35c-d7cb-4f4c-8dc1-17dc86f2635e \n", + "4347 5524 44 1620971679122 9f54e35c-d7cb-4f4c-8dc1-17dc86f2635e \n", + "4331 5364 26 1620630328635 9f54e35c-d7cb-4f4c-8dc1-17dc86f2635e \n", + "\n", + " call_type call_duration trace \\\n", + "0 2 0 87ae5eb2c5b7fe30bea2821e2ec052453d89ea6b \n", + "1 1 196 87ae5eb2c5b7fe30bea2821e2ec052453d89ea6b \n", + "2 2 237 87ae5eb2c5b7fe30bea2821e2ec052453d89ea6b \n", + "3 2 126 87ae5eb2c5b7fe30bea2821e2ec052453d89ea6b \n", + "4 2 255 87ae5eb2c5b7fe30bea2821e2ec052453d89ea6b \n", + "... ... ... ... \n", + "4352 2 0 5eb72fe829c2af4a654007220119bdcf47499555 \n", + "4336 2 189 6dd761532337dfe596eb2e34f4c91216b38e28e2 \n", + "4316 2 85 9c4eab1dfc0114aecd64a7f594977acc9ab7936c \n", + "4347 1 120 a9fa73b6137d09288429de20172095978730e4b8 \n", + "4331 2 184 cfe98eee4a27b377f4cde1ea5c39d24d0475b533 \n", + "\n", + " participant_id username freq contact_id total_call_duration \\\n", + "0 13 uploader_20449 21 0 2844 \n", + "1 13 uploader_20449 21 0 2844 \n", + "2 13 uploader_20449 21 0 2844 \n", + "3 13 uploader_20449 21 0 2844 \n", + "4 13 uploader_20449 21 0 2844 \n", + "... ... ... ... ... ... 
\n", + "4352 107 uploader_89606 1 17 0 \n", + "4336 107 uploader_89606 1 18 189 \n", + "4316 107 uploader_89606 1 19 85 \n", + "4347 107 uploader_89606 1 20 120 \n", + "4331 107 uploader_89606 1 21 184 \n", + "\n", + " no_contacts \n", + "0 5 \n", + "1 5 \n", + "2 5 \n", + "3 5 \n", + "4 5 \n", + "... ... \n", + "4352 22 \n", + "4336 22 \n", + "4316 22 \n", + "4347 22 \n", + "4331 22 \n", + "\n", + "[4650 rows x 13 columns]" + ] + }, + "execution_count": 2, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "contact_features(enumerate_contacts(df_calls))" + ] + }, + { + "cell_type": "code", + "execution_count": 3, + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
id_idtimestampdevice_idmessage_typetraceparticipant_idusernamefreqcontact_idno_contacts
1953797158296443459778082f9f-98c2-468d-b4a2-7c835bd812bd2417b9c87f5b573530bcffba8577777b3a964d67113uploader_20449506
1963808158296443497478082f9f-98c2-468d-b4a2-7c835bd812bd2417b9c87f5b573530bcffba8577777b3a964d67113uploader_20449506
19738210158296598860978082f9f-98c2-468d-b4a2-7c835bd812bd2417b9c87f5b573530bcffba8577777b3a964d67113uploader_20449506
19838311158296598887378082f9f-98c2-468d-b4a2-7c835bd812bd2417b9c87f5b573530bcffba8577777b3a964d67113uploader_20449506
19939612158296598887378082f9f-98c2-468d-b4a2-7c835bd812bd2417b9c87f5b573530bcffba8577777b3a964d67113uploader_20449506
....................................
51936137416197893606659f54e35c-d7cb-4f4c-8dc1-17dc86f2635e22340c1d2b9e5d550373423a599014468a4dc3678107uploader_8960631216
51946135216197872738299f54e35c-d7cb-4f4c-8dc1-17dc86f2635e12340c1d2b9e5d550373423a599014468a4dc3678107uploader_8960631216
5417669021016209804371989f54e35c-d7cb-4f4c-8dc1-17dc86f2635e1198d7e63a2b4d8a7ca9bb92f74e6974ca17edc56107uploader_8960611316
5447677025616214076680199f54e35c-d7cb-4f4c-8dc1-17dc86f2635e1d4a67b53e704247de47064850efd3647e8dcaffb107uploader_8960611416
5440674224816212533135449f54e35c-d7cb-4f4c-8dc1-17dc86f2635e1ffbc6a5f0f601cf2d9cdad9d3a588633e1a1967c107uploader_8960611516
\n", + "

5864 rows × 11 columns

\n", + "
" + ], + "text/plain": [ + " id _id timestamp device_id \\\n", + "195 379 7 1582964434597 78082f9f-98c2-468d-b4a2-7c835bd812bd \n", + "196 380 8 1582964434974 78082f9f-98c2-468d-b4a2-7c835bd812bd \n", + "197 382 10 1582965988609 78082f9f-98c2-468d-b4a2-7c835bd812bd \n", + "198 383 11 1582965988873 78082f9f-98c2-468d-b4a2-7c835bd812bd \n", + "199 396 12 1582965988873 78082f9f-98c2-468d-b4a2-7c835bd812bd \n", + "... ... ... ... ... \n", + "5193 6137 4 1619789360665 9f54e35c-d7cb-4f4c-8dc1-17dc86f2635e \n", + "5194 6135 2 1619787273829 9f54e35c-d7cb-4f4c-8dc1-17dc86f2635e \n", + "5417 6690 210 1620980437198 9f54e35c-d7cb-4f4c-8dc1-17dc86f2635e \n", + "5447 6770 256 1621407668019 9f54e35c-d7cb-4f4c-8dc1-17dc86f2635e \n", + "5440 6742 248 1621253313544 9f54e35c-d7cb-4f4c-8dc1-17dc86f2635e \n", + "\n", + " message_type trace participant_id \\\n", + "195 2 417b9c87f5b573530bcffba8577777b3a964d671 13 \n", + "196 2 417b9c87f5b573530bcffba8577777b3a964d671 13 \n", + "197 2 417b9c87f5b573530bcffba8577777b3a964d671 13 \n", + "198 2 417b9c87f5b573530bcffba8577777b3a964d671 13 \n", + "199 2 417b9c87f5b573530bcffba8577777b3a964d671 13 \n", + "... ... ... ... \n", + "5193 2 2340c1d2b9e5d550373423a599014468a4dc3678 107 \n", + "5194 1 2340c1d2b9e5d550373423a599014468a4dc3678 107 \n", + "5417 1 198d7e63a2b4d8a7ca9bb92f74e6974ca17edc56 107 \n", + "5447 1 d4a67b53e704247de47064850efd3647e8dcaffb 107 \n", + "5440 1 ffbc6a5f0f601cf2d9cdad9d3a588633e1a1967c 107 \n", + "\n", + " username freq contact_id no_contacts \n", + "195 uploader_20449 5 0 6 \n", + "196 uploader_20449 5 0 6 \n", + "197 uploader_20449 5 0 6 \n", + "198 uploader_20449 5 0 6 \n", + "199 uploader_20449 5 0 6 \n", + "... ... ... ... ... 
\n", + "5193 uploader_89606 3 12 16 \n", + "5194 uploader_89606 3 12 16 \n", + "5417 uploader_89606 1 13 16 \n", + "5447 uploader_89606 1 14 16 \n", + "5440 uploader_89606 1 15 16 \n", + "\n", + "[5864 rows x 11 columns]" + ] + }, + "execution_count": 3, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "contact_features(enumerate_contacts(df_sms))" + ] + }, + { + "cell_type": "code", + "execution_count": 4, + "metadata": { + "scrolled": true + }, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
participant_idno_calls_no_sms_rationo_incoming_calls_no_recieved_sms_rationo_outgoing_calls_no_sent_sms_rationo_calls_contacts_no_sms_contacts_ratio
0130.6500000.7000000.7500000.454545
1140.5903610.5555560.6111110.714286
2150.0331130.9605260.0266670.173913
3160.5652170.6666670.7500000.666667
4170.9166670.2592590.9836070.857143
..................
56910.1962030.8469390.2280700.666667
57920.4444440.5714290.4000000.600000
58930.4322030.6857140.5128210.428571
591060.7058820.4545450.7500000.769231
601070.2049690.8791210.1926610.578947
\n", + "

61 rows × 5 columns

\n", + "
" + ], + "text/plain": [ + " participant_id no_calls_no_sms_ratio \\\n", + "0 13 0.650000 \n", + "1 14 0.590361 \n", + "2 15 0.033113 \n", + "3 16 0.565217 \n", + "4 17 0.916667 \n", + ".. ... ... \n", + "56 91 0.196203 \n", + "57 92 0.444444 \n", + "58 93 0.432203 \n", + "59 106 0.705882 \n", + "60 107 0.204969 \n", + "\n", + " no_incoming_calls_no_recieved_sms_ratio \\\n", + "0 0.700000 \n", + "1 0.555556 \n", + "2 0.960526 \n", + "3 0.666667 \n", + "4 0.259259 \n", + ".. ... \n", + "56 0.846939 \n", + "57 0.571429 \n", + "58 0.685714 \n", + "59 0.454545 \n", + "60 0.879121 \n", + "\n", + " no_outgoing_calls_no_sent_sms_ratio \\\n", + "0 0.750000 \n", + "1 0.611111 \n", + "2 0.026667 \n", + "3 0.750000 \n", + "4 0.983607 \n", + ".. ... \n", + "56 0.228070 \n", + "57 0.400000 \n", + "58 0.512821 \n", + "59 0.750000 \n", + "60 0.192661 \n", + "\n", + " no_calls_contacts_no_sms_contacts_ratio \n", + "0 0.454545 \n", + "1 0.714286 \n", + "2 0.173913 \n", + "3 0.666667 \n", + "4 0.857143 \n", + ".. ... 
\n", + "56 0.666667 \n", + "57 0.600000 \n", + "58 0.428571 \n", + "59 0.769231 \n", + "60 0.578947 \n", + "\n", + "[61 rows x 5 columns]" + ] + }, + "execution_count": 4, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "calls_sms_features(df_calls, df_sms)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [] + } + ], + "metadata": { + "kernelspec": { + "display_name": "straw2analysis", + "language": "python", + "name": "straw2analysis" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.9.6" + } + }, + "nbformat": 4, + "nbformat_minor": 2 +} diff --git a/exploration/expl_communication.py b/exploration/expl_communication.py index e1a0c27..6a6ebc8 100644 --- a/exploration/expl_communication.py +++ b/exploration/expl_communication.py @@ -6,7 +6,7 @@ # extension: .py # format_name: percent # format_version: '1.3' -# jupytext_version: 1.11.2 +# jupytext_version: 1.11.4 # kernelspec: # display_name: straw2analysis # language: python @@ -39,6 +39,9 @@ print(df_calls) # %% count_comms(df_calls) +# %% +enumerate_contacts(df_calls) + # %% df_sms = get_sms_data(["nokia_0000003"]) count_comms(df_sms) @@ -53,6 +56,15 @@ import participants.query_db participants_inactive_usernames = participants.query_db.get_usernames() df_calls_inactive = get_call_data(participants_inactive_usernames) +# %% +participants_inactive_usernames + +# %% +df_calls_inactive.head() + +# %% +enumerate_contacts(df_calls_inactive).head() + # %% df_calls_features = count_comms(df_calls_inactive) df_calls_features.head() @@ -70,6 +82,9 @@ calls_number = pd.wide_to_long( suffix="\D+", ) +# %% +calls_number + # %% sns.displot(calls_number, x="no", hue="call_type", binwidth=5, element="step", height=8) diff --git 
a/features/communication.py b/features/communication.py index d1dc4ee..a3c917c 100644 --- a/features/communication.py +++ b/features/communication.py @@ -86,7 +86,8 @@ def enumerate_contacts(comm_df: pd.DataFrame) -> pd.DataFrame: # In other words, recode the contacts into integers from 0 to n_contacts, # so that the first one is contacted the most often. contact_ids = ( - contact_counts.groupby("participant_id") # Group again for enumeration. + # Group again for enumeration. + contact_counts.groupby("participant_id") .cumcount() # Enumerate (count) rows *within* participants. .to_frame("contact_id") ) @@ -150,8 +151,10 @@ def count_comms(comm_df: pd.DataFrame) -> pd.DataFrame: comm_features = comm_counts.join(comm_duration_total) comm_features = comm_features.join(comm_duration_max) try: - comm_features.drop(columns="duration_total_" + call_types[3], inplace=True) - comm_features.drop(columns="duration_max_" + call_types[3], inplace=True) + comm_features.drop(columns="duration_total_" + + call_types[3], inplace=True) + comm_features.drop(columns="duration_max_" + + call_types[3], inplace=True) # The missed calls are always of 0 duration. except KeyError: pass @@ -172,19 +175,145 @@ def count_comms(comm_df: pd.DataFrame) -> pd.DataFrame: ) # Ratio of incoming and outgoing messages to all messages. else: - raise KeyError("The dataframe contains neither call_type or message_type") + raise KeyError( + "The dataframe contains neither call_type or message_type") return comm_features -def contact_features(): - # TODO Implement a method that takes a DF with enumerated contacts as argument and calculates: - # * Duration of calls per caller (for most common callers) - # * Determine work vs non-work contacts by work hours heuristics - # * Number of people contacted - # And similarly for SMS. 
def contact_features(df_enumerated: pd.DataFrame) -> pd.DataFrame:
    """
    Add contact-related features to an enumerated communication dataframe.

    Counts the number of distinct people contacted by each participant and,
    if df_enumerated contains calls data (i.e. it has a call_duration
    column), the total duration of calls between a participant and each of
    their contacts.

    Parameters
    ----------
    df_enumerated: pd.DataFrame
        A dataframe of calls or SMSes; return of function enumerate_contacts.
        It must provide participant_id and contact_id.

    Returns
    -------
    df_enumerated: pd.DataFrame
        The rows of the input dataframe with the additional column
        no_contacts and, if df_enumerated contains calls data, the additional
        column total_call_duration. Rows are sorted by participant_id and
        contact_id.
    """
    # Some features are type-specific: only calls data has a duration.
    if "call_duration" in df_enumerated:
        # For each participant and each of their contacts, sum the durations
        # of all their calls.
        duration_count = (
            df_enumerated.groupby(["participant_id", "contact_id"])[
                "call_duration"
            ]
            .sum()
            .reset_index()  # Turn the group keys back into regular columns.
            .rename(columns={"call_duration": "total_call_duration"})
        )
        # Attach the per-pair total to every row of that pair.
        df_enumerated = df_enumerated.merge(
            duration_count, on=["participant_id", "contact_id"]
        )

    # For each participant, count the number of distinct contacts. Selecting
    # contact_id before nunique() avoids counting every other column as well.
    contact_count = (
        df_enumerated.groupby("participant_id")["contact_id"]
        .nunique()
        .reset_index()  # Turn participant_id back into a regular column.
        .rename(columns={"contact_id": "no_contacts"})
    )

    df_enumerated = (
        # Attach the number of contacts to every row of the participant.
        df_enumerated.merge(contact_count, on="participant_id")
        # Sort first by participant_id and then by contact_id.
        .sort_values(["participant_id", "contact_id"])
    )

    # TODO: Determine work vs non-work contacts by work hours heuristics.

    return df_enumerated


def _contacts_per_participant(df_features: pd.DataFrame) -> pd.DataFrame:
    """Reduce the output of contact_features to one row per participant."""
    if "participant_id" not in df_features.columns:
        # participant_id is an index level; make it a regular column.
        df_features = df_features.reset_index()
    # no_contacts is constant within a participant, so dropping duplicates
    # leaves exactly one row per participant (in order of first appearance).
    return df_features[["participant_id", "no_contacts"]].drop_duplicates()


def calls_sms_features(df_calls: pd.DataFrame, df_sms: pd.DataFrame) -> pd.DataFrame:
    """
    Calculates additional features relating calls and sms data.

    Parameters
    ----------
    df_calls: pd.DataFrame
        A dataframe of calls (return of get_call_data).
    df_sms: pd.DataFrame
        A dataframe of SMSes (return of get_sms_data).

    Returns
    -------
    df_calls_sms: pd.DataFrame
        The features relating calls and sms data, one row per participant
        that appears in both dataframes. Besides participant_id, these are:
        * no_calls_no_sms_ratio:
            proportion of calls in total number of communications
        * no_incoming_calls_no_recieved_sms_ratio:
            proportion of incoming calls in total number of
            incoming/received communications
        * no_outgoing_calls_no_sent_sms_ratio:
            proportion of outgoing calls in total number of
            outgoing/sent communications
        * no_calls_contacts_no_sms_contacts_ratio:
            proportion of calls contacts in total number of
            communication contacts
    """
    count_calls = count_comms(df_calls)
    count_sms = count_comms(df_sms)

    count_joined = (
        # Merge calls and sms features; only the shared no_all column
        # receives the suffixes.
        count_calls.merge(
            count_sms, on="participant_id", suffixes=("_calls", "_sms")
        )
        .reset_index()  # Make participant_id a regular column.
        .assign(
            no_calls_no_sms_ratio=(
                lambda x: x.no_all_calls / (x.no_all_calls + x.no_all_sms)
            ),
            # Incoming calls (not received messages) belong in the numerator,
            # in line with the other calls / (calls + sms) ratios.
            no_incoming_calls_no_recieved_sms_ratio=(
                lambda x: x.no_incoming / (x.no_incoming + x.no_received)
            ),
            no_outgoing_calls_no_sent_sms_ratio=(
                lambda x: x.no_outgoing / (x.no_outgoing + x.no_sent)
            ),
        )[
            [
                "participant_id",
                "no_calls_no_sms_ratio",
                "no_incoming_calls_no_recieved_sms_ratio",
                "no_outgoing_calls_no_sent_sms_ratio",
            ]
        ]  # Keep only the relevant features.
    )

    # Reduce both frames to one row per participant *before* merging.
    # Merging the row-level frames on participant_id alone would pair every
    # call of a participant with every one of their messages.
    contacts_calls = _contacts_per_participant(
        contact_features(enumerate_contacts(df_calls))
    )
    contacts_sms = _contacts_per_participant(
        contact_features(enumerate_contacts(df_sms))
    )

    features_joined = (
        contacts_calls.merge(
            contacts_sms, on="participant_id", suffixes=("_calls", "_sms")
        )
        .assign(
            no_calls_contacts_no_sms_contacts_ratio=(
                lambda x: x.no_contacts_calls
                / (x.no_contacts_calls + x.no_contacts_sms)
            )
        )[["participant_id", "no_calls_contacts_no_sms_contacts_ratio"]]
    )

    # Join the two sets of features.
    df_calls_sms = count_joined.merge(features_joined, on="participant_id")

    return df_calls_sms