additional communication features

communication
Ivan Kobe 2021-08-04 13:45:54 +02:00
parent 1bdb334c42
commit d98b673824
6 changed files with 1661 additions and 15 deletions

View File

@ -0,0 +1,6 @@
{
"cells": [],
"metadata": {},
"nbformat": 4,
"nbformat_minor": 2
}

View File

@ -0,0 +1,393 @@
{
"cells": [
{
"cell_type": "code",
"execution_count": 1,
"metadata": {},
"outputs": [],
"source": [
"import os\n",
"import sys\n",
"import matplotlib.pyplot as plt\n",
"import seaborn as sns\n",
"\n",
"nb_dir = os.path.split(os.getcwd())[0]\n",
"if nb_dir not in sys.path:\n",
" sys.path.append(nb_dir)\n",
" \n",
"from features.communication import *\n",
"import participants.query_db\n",
"\n",
"participants_inactive_usernames = participants.query_db.get_usernames()\n",
"df_sms = get_sms_data(participants_inactive_usernames)\n",
"df_calls = get_call_data(participants_inactive_usernames)"
]
},
{
"cell_type": "code",
"execution_count": 7,
"metadata": {},
"outputs": [],
"source": [
" count_calls = count_comms(df_calls)\n",
" count_sms = count_comms(df_sms)\n",
"\n",
" count_joined = count_calls.merge(\n",
" count_sms, on=\"participant_id\", suffixes=(\"_calls\", \"_sms\")\n",
" )"
]
},
{
"cell_type": "code",
"execution_count": 9,
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
"<div>\n",
"<style scoped>\n",
" .dataframe tbody tr th:only-of-type {\n",
" vertical-align: middle;\n",
" }\n",
"\n",
" .dataframe tbody tr th {\n",
" vertical-align: top;\n",
" }\n",
"\n",
" .dataframe thead th {\n",
" text-align: right;\n",
" }\n",
"</style>\n",
"<table border=\"1\" class=\"dataframe\">\n",
" <thead>\n",
" <tr style=\"text-align: right;\">\n",
" <th></th>\n",
" <th>participant_id</th>\n",
" <th>no_incoming</th>\n",
" <th>no_outgoing</th>\n",
" <th>no_missed</th>\n",
" <th>no_all_calls</th>\n",
" <th>no_incoming_ratio</th>\n",
" <th>no_outgoing_ratio</th>\n",
" <th>duration_total_incoming</th>\n",
" <th>duration_total_outgoing</th>\n",
" <th>duration_max_incoming</th>\n",
" <th>duration_max_outgoing</th>\n",
" <th>no_received</th>\n",
" <th>no_sent</th>\n",
" <th>no_all_sms</th>\n",
" <th>no_received_ratio</th>\n",
" <th>no_sent_ratio</th>\n",
" </tr>\n",
" </thead>\n",
" <tbody>\n",
" <tr>\n",
" <th>0</th>\n",
" <td>13</td>\n",
" <td>3.0</td>\n",
" <td>21.0</td>\n",
" <td>2.0</td>\n",
" <td>26.0</td>\n",
" <td>0.115385</td>\n",
" <td>0.807692</td>\n",
" <td>342.0</td>\n",
" <td>2836.0</td>\n",
" <td>196.0</td>\n",
" <td>355.0</td>\n",
" <td>7.0</td>\n",
" <td>7.0</td>\n",
" <td>14.0</td>\n",
" <td>0.500000</td>\n",
" <td>0.500000</td>\n",
" </tr>\n",
" <tr>\n",
" <th>1</th>\n",
" <td>14</td>\n",
" <td>16.0</td>\n",
" <td>22.0</td>\n",
" <td>11.0</td>\n",
" <td>49.0</td>\n",
" <td>0.326531</td>\n",
" <td>0.448980</td>\n",
" <td>1873.0</td>\n",
" <td>2789.0</td>\n",
" <td>346.0</td>\n",
" <td>694.0</td>\n",
" <td>20.0</td>\n",
" <td>14.0</td>\n",
" <td>34.0</td>\n",
" <td>0.588235</td>\n",
" <td>0.411765</td>\n",
" </tr>\n",
" <tr>\n",
" <th>2</th>\n",
" <td>15</td>\n",
" <td>3.0</td>\n",
" <td>2.0</td>\n",
" <td>NaN</td>\n",
" <td>5.0</td>\n",
" <td>0.600000</td>\n",
" <td>0.400000</td>\n",
" <td>310.0</td>\n",
" <td>19.0</td>\n",
" <td>154.0</td>\n",
" <td>19.0</td>\n",
" <td>73.0</td>\n",
" <td>73.0</td>\n",
" <td>146.0</td>\n",
" <td>0.500000</td>\n",
" <td>0.500000</td>\n",
" </tr>\n",
" <tr>\n",
" <th>3</th>\n",
" <td>16</td>\n",
" <td>4.0</td>\n",
" <td>6.0</td>\n",
" <td>3.0</td>\n",
" <td>13.0</td>\n",
" <td>0.307692</td>\n",
" <td>0.461538</td>\n",
" <td>1963.0</td>\n",
" <td>849.0</td>\n",
" <td>1037.0</td>\n",
" <td>638.0</td>\n",
" <td>8.0</td>\n",
" <td>2.0</td>\n",
" <td>10.0</td>\n",
" <td>0.800000</td>\n",
" <td>0.200000</td>\n",
" </tr>\n",
" <tr>\n",
" <th>4</th>\n",
" <td>17</td>\n",
" <td>20.0</td>\n",
" <td>60.0</td>\n",
" <td>8.0</td>\n",
" <td>88.0</td>\n",
" <td>0.227273</td>\n",
" <td>0.681818</td>\n",
" <td>5789.0</td>\n",
" <td>17046.0</td>\n",
" <td>1966.0</td>\n",
" <td>3830.0</td>\n",
" <td>7.0</td>\n",
" <td>1.0</td>\n",
" <td>8.0</td>\n",
" <td>0.875000</td>\n",
" <td>0.125000</td>\n",
" </tr>\n",
" <tr>\n",
" <th>...</th>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" </tr>\n",
" <tr>\n",
" <th>56</th>\n",
" <td>91</td>\n",
" <td>15.0</td>\n",
" <td>13.0</td>\n",
" <td>3.0</td>\n",
" <td>31.0</td>\n",
" <td>0.483871</td>\n",
" <td>0.419355</td>\n",
" <td>3443.0</td>\n",
" <td>3636.0</td>\n",
" <td>644.0</td>\n",
" <td>1315.0</td>\n",
" <td>83.0</td>\n",
" <td>44.0</td>\n",
" <td>127.0</td>\n",
" <td>0.653543</td>\n",
" <td>0.346457</td>\n",
" </tr>\n",
" <tr>\n",
" <th>57</th>\n",
" <td>92</td>\n",
" <td>3.0</td>\n",
" <td>4.0</td>\n",
" <td>1.0</td>\n",
" <td>8.0</td>\n",
" <td>0.375000</td>\n",
" <td>0.500000</td>\n",
" <td>231.0</td>\n",
" <td>648.0</td>\n",
" <td>167.0</td>\n",
" <td>433.0</td>\n",
" <td>4.0</td>\n",
" <td>6.0</td>\n",
" <td>10.0</td>\n",
" <td>0.400000</td>\n",
" <td>0.600000</td>\n",
" </tr>\n",
" <tr>\n",
" <th>58</th>\n",
" <td>93</td>\n",
" <td>22.0</td>\n",
" <td>20.0</td>\n",
" <td>9.0</td>\n",
" <td>51.0</td>\n",
" <td>0.431373</td>\n",
" <td>0.392157</td>\n",
" <td>2534.0</td>\n",
" <td>1444.0</td>\n",
" <td>443.0</td>\n",
" <td>672.0</td>\n",
" <td>48.0</td>\n",
" <td>19.0</td>\n",
" <td>67.0</td>\n",
" <td>0.716418</td>\n",
" <td>0.283582</td>\n",
" </tr>\n",
" <tr>\n",
" <th>59</th>\n",
" <td>106</td>\n",
" <td>12.0</td>\n",
" <td>30.0</td>\n",
" <td>6.0</td>\n",
" <td>48.0</td>\n",
" <td>0.250000</td>\n",
" <td>0.625000</td>\n",
" <td>3049.0</td>\n",
" <td>2637.0</td>\n",
" <td>878.0</td>\n",
" <td>380.0</td>\n",
" <td>10.0</td>\n",
" <td>10.0</td>\n",
" <td>20.0</td>\n",
" <td>0.500000</td>\n",
" <td>0.500000</td>\n",
" </tr>\n",
" <tr>\n",
" <th>60</th>\n",
" <td>107</td>\n",
" <td>11.0</td>\n",
" <td>42.0</td>\n",
" <td>13.0</td>\n",
" <td>66.0</td>\n",
" <td>0.166667</td>\n",
" <td>0.636364</td>\n",
" <td>3804.0</td>\n",
" <td>9977.0</td>\n",
" <td>1519.0</td>\n",
" <td>1943.0</td>\n",
" <td>80.0</td>\n",
" <td>176.0</td>\n",
" <td>256.0</td>\n",
" <td>0.312500</td>\n",
" <td>0.687500</td>\n",
" </tr>\n",
" </tbody>\n",
"</table>\n",
"<p>61 rows × 16 columns</p>\n",
"</div>"
],
"text/plain": [
" participant_id no_incoming no_outgoing no_missed no_all_calls \\\n",
"0 13 3.0 21.0 2.0 26.0 \n",
"1 14 16.0 22.0 11.0 49.0 \n",
"2 15 3.0 2.0 NaN 5.0 \n",
"3 16 4.0 6.0 3.0 13.0 \n",
"4 17 20.0 60.0 8.0 88.0 \n",
".. ... ... ... ... ... \n",
"56 91 15.0 13.0 3.0 31.0 \n",
"57 92 3.0 4.0 1.0 8.0 \n",
"58 93 22.0 20.0 9.0 51.0 \n",
"59 106 12.0 30.0 6.0 48.0 \n",
"60 107 11.0 42.0 13.0 66.0 \n",
"\n",
" no_incoming_ratio no_outgoing_ratio duration_total_incoming \\\n",
"0 0.115385 0.807692 342.0 \n",
"1 0.326531 0.448980 1873.0 \n",
"2 0.600000 0.400000 310.0 \n",
"3 0.307692 0.461538 1963.0 \n",
"4 0.227273 0.681818 5789.0 \n",
".. ... ... ... \n",
"56 0.483871 0.419355 3443.0 \n",
"57 0.375000 0.500000 231.0 \n",
"58 0.431373 0.392157 2534.0 \n",
"59 0.250000 0.625000 3049.0 \n",
"60 0.166667 0.636364 3804.0 \n",
"\n",
" duration_total_outgoing duration_max_incoming duration_max_outgoing \\\n",
"0 2836.0 196.0 355.0 \n",
"1 2789.0 346.0 694.0 \n",
"2 19.0 154.0 19.0 \n",
"3 849.0 1037.0 638.0 \n",
"4 17046.0 1966.0 3830.0 \n",
".. ... ... ... \n",
"56 3636.0 644.0 1315.0 \n",
"57 648.0 167.0 433.0 \n",
"58 1444.0 443.0 672.0 \n",
"59 2637.0 878.0 380.0 \n",
"60 9977.0 1519.0 1943.0 \n",
"\n",
" no_received no_sent no_all_sms no_received_ratio no_sent_ratio \n",
"0 7.0 7.0 14.0 0.500000 0.500000 \n",
"1 20.0 14.0 34.0 0.588235 0.411765 \n",
"2 73.0 73.0 146.0 0.500000 0.500000 \n",
"3 8.0 2.0 10.0 0.800000 0.200000 \n",
"4 7.0 1.0 8.0 0.875000 0.125000 \n",
".. ... ... ... ... ... \n",
"56 83.0 44.0 127.0 0.653543 0.346457 \n",
"57 4.0 6.0 10.0 0.400000 0.600000 \n",
"58 48.0 19.0 67.0 0.716418 0.283582 \n",
"59 10.0 10.0 20.0 0.500000 0.500000 \n",
"60 80.0 176.0 256.0 0.312500 0.687500 \n",
"\n",
"[61 rows x 16 columns]"
]
},
"execution_count": 9,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"count_joined.reset_index()"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": []
}
],
"metadata": {
"kernelspec": {
"display_name": "straw2analysis",
"language": "python",
"name": "straw2analysis"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.9.6"
}
},
"nbformat": 4,
"nbformat_minor": 2
}

315
Untitled.ipynb 100644
View File

@ -0,0 +1,315 @@
{
"cells": [
{
"cell_type": "code",
"execution_count": 2,
"metadata": {},
"outputs": [],
"source": [
"import os\n",
"import sys\n",
"import matplotlib.pyplot as plt\n",
"import seaborn as sns\n",
"\n",
"nb_dir = os.path.split(os.getcwd())[0]\n",
"if nb_dir not in sys.path:\n",
" sys.path.append(nb_dir)\n",
" \n",
"from features.communication import *\n",
"import participants.query_db\n",
"\n",
"participants_inactive_usernames = participants.query_db.get_usernames()\n",
"df_sms = get_sms_data(participants_inactive_usernames)\n",
"df_calls = get_call_data(participants_inactive_usernames)"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"df_calls"
]
},
{
"cell_type": "code",
"execution_count": 8,
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
"<div>\n",
"<style scoped>\n",
" .dataframe tbody tr th:only-of-type {\n",
" vertical-align: middle;\n",
" }\n",
"\n",
" .dataframe tbody tr th {\n",
" vertical-align: top;\n",
" }\n",
"\n",
" .dataframe thead th {\n",
" text-align: right;\n",
" }\n",
"</style>\n",
"<table border=\"1\" class=\"dataframe\">\n",
" <thead>\n",
" <tr style=\"text-align: right;\">\n",
" <th></th>\n",
" <th>id</th>\n",
" <th>_id</th>\n",
" <th>timestamp</th>\n",
" <th>device_id</th>\n",
" <th>call_type</th>\n",
" <th>call_duration</th>\n",
" <th>trace</th>\n",
" <th>participant_id</th>\n",
" <th>username</th>\n",
" <th>freq</th>\n",
" <th>contact_id</th>\n",
" </tr>\n",
" </thead>\n",
" <tbody>\n",
" <tr>\n",
" <th>0</th>\n",
" <td>118</td>\n",
" <td>1</td>\n",
" <td>1581936081010</td>\n",
" <td>78082f9f-98c2-468d-b4a2-7c835bd812bd</td>\n",
" <td>2</td>\n",
" <td>0</td>\n",
" <td>87ae5eb2c5b7fe30bea2821e2ec052453d89ea6b</td>\n",
" <td>13</td>\n",
" <td>uploader_20449</td>\n",
" <td>21</td>\n",
" <td>0</td>\n",
" </tr>\n",
" <tr>\n",
" <th>1</th>\n",
" <td>176</td>\n",
" <td>3</td>\n",
" <td>1582295247982</td>\n",
" <td>78082f9f-98c2-468d-b4a2-7c835bd812bd</td>\n",
" <td>1</td>\n",
" <td>196</td>\n",
" <td>87ae5eb2c5b7fe30bea2821e2ec052453d89ea6b</td>\n",
" <td>13</td>\n",
" <td>uploader_20449</td>\n",
" <td>21</td>\n",
" <td>0</td>\n",
" </tr>\n",
" <tr>\n",
" <th>2</th>\n",
" <td>209</td>\n",
" <td>4</td>\n",
" <td>1582305634014</td>\n",
" <td>78082f9f-98c2-468d-b4a2-7c835bd812bd</td>\n",
" <td>2</td>\n",
" <td>237</td>\n",
" <td>87ae5eb2c5b7fe30bea2821e2ec052453d89ea6b</td>\n",
" <td>13</td>\n",
" <td>uploader_20449</td>\n",
" <td>21</td>\n",
" <td>0</td>\n",
" </tr>\n",
" <tr>\n",
" <th>3</th>\n",
" <td>210</td>\n",
" <td>5</td>\n",
" <td>1582561530334</td>\n",
" <td>78082f9f-98c2-468d-b4a2-7c835bd812bd</td>\n",
" <td>2</td>\n",
" <td>126</td>\n",
" <td>87ae5eb2c5b7fe30bea2821e2ec052453d89ea6b</td>\n",
" <td>13</td>\n",
" <td>uploader_20449</td>\n",
" <td>21</td>\n",
" <td>0</td>\n",
" </tr>\n",
" <tr>\n",
" <th>4</th>\n",
" <td>253</td>\n",
" <td>6</td>\n",
" <td>1582627576077</td>\n",
" <td>78082f9f-98c2-468d-b4a2-7c835bd812bd</td>\n",
" <td>2</td>\n",
" <td>255</td>\n",
" <td>87ae5eb2c5b7fe30bea2821e2ec052453d89ea6b</td>\n",
" <td>13</td>\n",
" <td>uploader_20449</td>\n",
" <td>21</td>\n",
" <td>0</td>\n",
" </tr>\n",
" <tr>\n",
" <th>...</th>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" </tr>\n",
" <tr>\n",
" <th>4645</th>\n",
" <td>5874</td>\n",
" <td>131</td>\n",
" <td>1625602519843</td>\n",
" <td>4b62a655-cbf0-4ac0-a448-06726f45b56a</td>\n",
" <td>2</td>\n",
" <td>2270</td>\n",
" <td>f4ebca8dc7305fe424d6bf7fbcd2e5086f98b453</td>\n",
" <td>90</td>\n",
" <td>uploader_53573</td>\n",
" <td>6</td>\n",
" <td>9</td>\n",
" </tr>\n",
" <tr>\n",
" <th>4646</th>\n",
" <td>5882</td>\n",
" <td>139</td>\n",
" <td>1625753023456</td>\n",
" <td>4b62a655-cbf0-4ac0-a448-06726f45b56a</td>\n",
" <td>3</td>\n",
" <td>0</td>\n",
" <td>2e5d63f6fddca2b66be810b5946c42eda24f2dbe</td>\n",
" <td>90</td>\n",
" <td>uploader_53573</td>\n",
" <td>2</td>\n",
" <td>13</td>\n",
" </tr>\n",
" <tr>\n",
" <th>4647</th>\n",
" <td>5883</td>\n",
" <td>140</td>\n",
" <td>1625754998767</td>\n",
" <td>4b62a655-cbf0-4ac0-a448-06726f45b56a</td>\n",
" <td>3</td>\n",
" <td>0</td>\n",
" <td>2e5d63f6fddca2b66be810b5946c42eda24f2dbe</td>\n",
" <td>90</td>\n",
" <td>uploader_53573</td>\n",
" <td>2</td>\n",
" <td>13</td>\n",
" </tr>\n",
" <tr>\n",
" <th>4648</th>\n",
" <td>5884</td>\n",
" <td>141</td>\n",
" <td>1625823008392</td>\n",
" <td>4b62a655-cbf0-4ac0-a448-06726f45b56a</td>\n",
" <td>2</td>\n",
" <td>0</td>\n",
" <td>7316d58b7bb7de097a2421c56010ac024a489451</td>\n",
" <td>90</td>\n",
" <td>uploader_53573</td>\n",
" <td>1</td>\n",
" <td>21</td>\n",
" </tr>\n",
" <tr>\n",
" <th>4649</th>\n",
" <td>5903</td>\n",
" <td>158</td>\n",
" <td>1626110930233</td>\n",
" <td>4b62a655-cbf0-4ac0-a448-06726f45b56a</td>\n",
" <td>2</td>\n",
" <td>53</td>\n",
" <td>7db4e9acf7c73837ddecdae5da523a28c774ba94</td>\n",
" <td>90</td>\n",
" <td>uploader_53573</td>\n",
" <td>1</td>\n",
" <td>24</td>\n",
" </tr>\n",
" </tbody>\n",
"</table>\n",
"<p>4650 rows × 11 columns</p>\n",
"</div>"
],
"text/plain": [
" id _id timestamp device_id \\\n",
"0 118 1 1581936081010 78082f9f-98c2-468d-b4a2-7c835bd812bd \n",
"1 176 3 1582295247982 78082f9f-98c2-468d-b4a2-7c835bd812bd \n",
"2 209 4 1582305634014 78082f9f-98c2-468d-b4a2-7c835bd812bd \n",
"3 210 5 1582561530334 78082f9f-98c2-468d-b4a2-7c835bd812bd \n",
"4 253 6 1582627576077 78082f9f-98c2-468d-b4a2-7c835bd812bd \n",
"... ... ... ... ... \n",
"4645 5874 131 1625602519843 4b62a655-cbf0-4ac0-a448-06726f45b56a \n",
"4646 5882 139 1625753023456 4b62a655-cbf0-4ac0-a448-06726f45b56a \n",
"4647 5883 140 1625754998767 4b62a655-cbf0-4ac0-a448-06726f45b56a \n",
"4648 5884 141 1625823008392 4b62a655-cbf0-4ac0-a448-06726f45b56a \n",
"4649 5903 158 1626110930233 4b62a655-cbf0-4ac0-a448-06726f45b56a \n",
"\n",
" call_type call_duration trace \\\n",
"0 2 0 87ae5eb2c5b7fe30bea2821e2ec052453d89ea6b \n",
"1 1 196 87ae5eb2c5b7fe30bea2821e2ec052453d89ea6b \n",
"2 2 237 87ae5eb2c5b7fe30bea2821e2ec052453d89ea6b \n",
"3 2 126 87ae5eb2c5b7fe30bea2821e2ec052453d89ea6b \n",
"4 2 255 87ae5eb2c5b7fe30bea2821e2ec052453d89ea6b \n",
"... ... ... ... \n",
"4645 2 2270 f4ebca8dc7305fe424d6bf7fbcd2e5086f98b453 \n",
"4646 3 0 2e5d63f6fddca2b66be810b5946c42eda24f2dbe \n",
"4647 3 0 2e5d63f6fddca2b66be810b5946c42eda24f2dbe \n",
"4648 2 0 7316d58b7bb7de097a2421c56010ac024a489451 \n",
"4649 2 53 7db4e9acf7c73837ddecdae5da523a28c774ba94 \n",
"\n",
" participant_id username freq contact_id \n",
"0 13 uploader_20449 21 0 \n",
"1 13 uploader_20449 21 0 \n",
"2 13 uploader_20449 21 0 \n",
"3 13 uploader_20449 21 0 \n",
"4 13 uploader_20449 21 0 \n",
"... ... ... ... ... \n",
"4645 90 uploader_53573 6 9 \n",
"4646 90 uploader_53573 2 13 \n",
"4647 90 uploader_53573 2 13 \n",
"4648 90 uploader_53573 1 21 \n",
"4649 90 uploader_53573 1 24 \n",
"\n",
"[4650 rows x 11 columns]"
]
},
"execution_count": 8,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"contact_features(enumerate_contacts(df_calls))"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": []
}
],
"metadata": {
"kernelspec": {
"display_name": "straw2analysis",
"language": "python",
"name": "straw2analysis"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.9.6"
}
},
"nbformat": 4,
"nbformat_minor": 2
}

788
Untitled1.ipynb 100644
View File

@ -0,0 +1,788 @@
{
"cells": [
{
"cell_type": "code",
"execution_count": 1,
"metadata": {},
"outputs": [],
"source": [
"import os\n",
"import sys\n",
"import matplotlib.pyplot as plt\n",
"import seaborn as sns\n",
"\n",
"nb_dir = os.path.split(os.getcwd())[0]\n",
"if nb_dir not in sys.path:\n",
" sys.path.append(nb_dir)\n",
" \n",
"from features.communication import *\n",
"import participants.query_db\n",
"\n",
"participants_inactive_usernames = participants.query_db.get_usernames()\n",
"df_sms = get_sms_data(participants_inactive_usernames)\n",
"df_calls = get_call_data(participants_inactive_usernames)"
]
},
{
"cell_type": "code",
"execution_count": 2,
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
"<div>\n",
"<style scoped>\n",
" .dataframe tbody tr th:only-of-type {\n",
" vertical-align: middle;\n",
" }\n",
"\n",
" .dataframe tbody tr th {\n",
" vertical-align: top;\n",
" }\n",
"\n",
" .dataframe thead th {\n",
" text-align: right;\n",
" }\n",
"</style>\n",
"<table border=\"1\" class=\"dataframe\">\n",
" <thead>\n",
" <tr style=\"text-align: right;\">\n",
" <th></th>\n",
" <th>id</th>\n",
" <th>_id</th>\n",
" <th>timestamp</th>\n",
" <th>device_id</th>\n",
" <th>call_type</th>\n",
" <th>call_duration</th>\n",
" <th>trace</th>\n",
" <th>participant_id</th>\n",
" <th>username</th>\n",
" <th>freq</th>\n",
" <th>contact_id</th>\n",
" <th>total_call_duration</th>\n",
" <th>no_contacts</th>\n",
" </tr>\n",
" </thead>\n",
" <tbody>\n",
" <tr>\n",
" <th>0</th>\n",
" <td>118</td>\n",
" <td>1</td>\n",
" <td>1581936081010</td>\n",
" <td>78082f9f-98c2-468d-b4a2-7c835bd812bd</td>\n",
" <td>2</td>\n",
" <td>0</td>\n",
" <td>87ae5eb2c5b7fe30bea2821e2ec052453d89ea6b</td>\n",
" <td>13</td>\n",
" <td>uploader_20449</td>\n",
" <td>21</td>\n",
" <td>0</td>\n",
" <td>2844</td>\n",
" <td>5</td>\n",
" </tr>\n",
" <tr>\n",
" <th>1</th>\n",
" <td>176</td>\n",
" <td>3</td>\n",
" <td>1582295247982</td>\n",
" <td>78082f9f-98c2-468d-b4a2-7c835bd812bd</td>\n",
" <td>1</td>\n",
" <td>196</td>\n",
" <td>87ae5eb2c5b7fe30bea2821e2ec052453d89ea6b</td>\n",
" <td>13</td>\n",
" <td>uploader_20449</td>\n",
" <td>21</td>\n",
" <td>0</td>\n",
" <td>2844</td>\n",
" <td>5</td>\n",
" </tr>\n",
" <tr>\n",
" <th>2</th>\n",
" <td>209</td>\n",
" <td>4</td>\n",
" <td>1582305634014</td>\n",
" <td>78082f9f-98c2-468d-b4a2-7c835bd812bd</td>\n",
" <td>2</td>\n",
" <td>237</td>\n",
" <td>87ae5eb2c5b7fe30bea2821e2ec052453d89ea6b</td>\n",
" <td>13</td>\n",
" <td>uploader_20449</td>\n",
" <td>21</td>\n",
" <td>0</td>\n",
" <td>2844</td>\n",
" <td>5</td>\n",
" </tr>\n",
" <tr>\n",
" <th>3</th>\n",
" <td>210</td>\n",
" <td>5</td>\n",
" <td>1582561530334</td>\n",
" <td>78082f9f-98c2-468d-b4a2-7c835bd812bd</td>\n",
" <td>2</td>\n",
" <td>126</td>\n",
" <td>87ae5eb2c5b7fe30bea2821e2ec052453d89ea6b</td>\n",
" <td>13</td>\n",
" <td>uploader_20449</td>\n",
" <td>21</td>\n",
" <td>0</td>\n",
" <td>2844</td>\n",
" <td>5</td>\n",
" </tr>\n",
" <tr>\n",
" <th>4</th>\n",
" <td>253</td>\n",
" <td>6</td>\n",
" <td>1582627576077</td>\n",
" <td>78082f9f-98c2-468d-b4a2-7c835bd812bd</td>\n",
" <td>2</td>\n",
" <td>255</td>\n",
" <td>87ae5eb2c5b7fe30bea2821e2ec052453d89ea6b</td>\n",
" <td>13</td>\n",
" <td>uploader_20449</td>\n",
" <td>21</td>\n",
" <td>0</td>\n",
" <td>2844</td>\n",
" <td>5</td>\n",
" </tr>\n",
" <tr>\n",
" <th>...</th>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" </tr>\n",
" <tr>\n",
" <th>4352</th>\n",
" <td>5560</td>\n",
" <td>59</td>\n",
" <td>1621275689589</td>\n",
" <td>9f54e35c-d7cb-4f4c-8dc1-17dc86f2635e</td>\n",
" <td>2</td>\n",
" <td>0</td>\n",
" <td>5eb72fe829c2af4a654007220119bdcf47499555</td>\n",
" <td>107</td>\n",
" <td>uploader_89606</td>\n",
" <td>1</td>\n",
" <td>17</td>\n",
" <td>0</td>\n",
" <td>22</td>\n",
" </tr>\n",
" <tr>\n",
" <th>4336</th>\n",
" <td>5443</td>\n",
" <td>29</td>\n",
" <td>1620746142636</td>\n",
" <td>9f54e35c-d7cb-4f4c-8dc1-17dc86f2635e</td>\n",
" <td>2</td>\n",
" <td>189</td>\n",
" <td>6dd761532337dfe596eb2e34f4c91216b38e28e2</td>\n",
" <td>107</td>\n",
" <td>uploader_89606</td>\n",
" <td>1</td>\n",
" <td>18</td>\n",
" <td>189</td>\n",
" <td>22</td>\n",
" </tr>\n",
" <tr>\n",
" <th>4316</th>\n",
" <td>5237</td>\n",
" <td>10</td>\n",
" <td>1620140109908</td>\n",
" <td>9f54e35c-d7cb-4f4c-8dc1-17dc86f2635e</td>\n",
" <td>2</td>\n",
" <td>85</td>\n",
" <td>9c4eab1dfc0114aecd64a7f594977acc9ab7936c</td>\n",
" <td>107</td>\n",
" <td>uploader_89606</td>\n",
" <td>1</td>\n",
" <td>19</td>\n",
" <td>85</td>\n",
" <td>22</td>\n",
" </tr>\n",
" <tr>\n",
" <th>4347</th>\n",
" <td>5524</td>\n",
" <td>44</td>\n",
" <td>1620971679122</td>\n",
" <td>9f54e35c-d7cb-4f4c-8dc1-17dc86f2635e</td>\n",
" <td>1</td>\n",
" <td>120</td>\n",
" <td>a9fa73b6137d09288429de20172095978730e4b8</td>\n",
" <td>107</td>\n",
" <td>uploader_89606</td>\n",
" <td>1</td>\n",
" <td>20</td>\n",
" <td>120</td>\n",
" <td>22</td>\n",
" </tr>\n",
" <tr>\n",
" <th>4331</th>\n",
" <td>5364</td>\n",
" <td>26</td>\n",
" <td>1620630328635</td>\n",
" <td>9f54e35c-d7cb-4f4c-8dc1-17dc86f2635e</td>\n",
" <td>2</td>\n",
" <td>184</td>\n",
" <td>cfe98eee4a27b377f4cde1ea5c39d24d0475b533</td>\n",
" <td>107</td>\n",
" <td>uploader_89606</td>\n",
" <td>1</td>\n",
" <td>21</td>\n",
" <td>184</td>\n",
" <td>22</td>\n",
" </tr>\n",
" </tbody>\n",
"</table>\n",
"<p>4650 rows × 13 columns</p>\n",
"</div>"
],
"text/plain": [
" id _id timestamp device_id \\\n",
"0 118 1 1581936081010 78082f9f-98c2-468d-b4a2-7c835bd812bd \n",
"1 176 3 1582295247982 78082f9f-98c2-468d-b4a2-7c835bd812bd \n",
"2 209 4 1582305634014 78082f9f-98c2-468d-b4a2-7c835bd812bd \n",
"3 210 5 1582561530334 78082f9f-98c2-468d-b4a2-7c835bd812bd \n",
"4 253 6 1582627576077 78082f9f-98c2-468d-b4a2-7c835bd812bd \n",
"... ... ... ... ... \n",
"4352 5560 59 1621275689589 9f54e35c-d7cb-4f4c-8dc1-17dc86f2635e \n",
"4336 5443 29 1620746142636 9f54e35c-d7cb-4f4c-8dc1-17dc86f2635e \n",
"4316 5237 10 1620140109908 9f54e35c-d7cb-4f4c-8dc1-17dc86f2635e \n",
"4347 5524 44 1620971679122 9f54e35c-d7cb-4f4c-8dc1-17dc86f2635e \n",
"4331 5364 26 1620630328635 9f54e35c-d7cb-4f4c-8dc1-17dc86f2635e \n",
"\n",
" call_type call_duration trace \\\n",
"0 2 0 87ae5eb2c5b7fe30bea2821e2ec052453d89ea6b \n",
"1 1 196 87ae5eb2c5b7fe30bea2821e2ec052453d89ea6b \n",
"2 2 237 87ae5eb2c5b7fe30bea2821e2ec052453d89ea6b \n",
"3 2 126 87ae5eb2c5b7fe30bea2821e2ec052453d89ea6b \n",
"4 2 255 87ae5eb2c5b7fe30bea2821e2ec052453d89ea6b \n",
"... ... ... ... \n",
"4352 2 0 5eb72fe829c2af4a654007220119bdcf47499555 \n",
"4336 2 189 6dd761532337dfe596eb2e34f4c91216b38e28e2 \n",
"4316 2 85 9c4eab1dfc0114aecd64a7f594977acc9ab7936c \n",
"4347 1 120 a9fa73b6137d09288429de20172095978730e4b8 \n",
"4331 2 184 cfe98eee4a27b377f4cde1ea5c39d24d0475b533 \n",
"\n",
" participant_id username freq contact_id total_call_duration \\\n",
"0 13 uploader_20449 21 0 2844 \n",
"1 13 uploader_20449 21 0 2844 \n",
"2 13 uploader_20449 21 0 2844 \n",
"3 13 uploader_20449 21 0 2844 \n",
"4 13 uploader_20449 21 0 2844 \n",
"... ... ... ... ... ... \n",
"4352 107 uploader_89606 1 17 0 \n",
"4336 107 uploader_89606 1 18 189 \n",
"4316 107 uploader_89606 1 19 85 \n",
"4347 107 uploader_89606 1 20 120 \n",
"4331 107 uploader_89606 1 21 184 \n",
"\n",
" no_contacts \n",
"0 5 \n",
"1 5 \n",
"2 5 \n",
"3 5 \n",
"4 5 \n",
"... ... \n",
"4352 22 \n",
"4336 22 \n",
"4316 22 \n",
"4347 22 \n",
"4331 22 \n",
"\n",
"[4650 rows x 13 columns]"
]
},
"execution_count": 2,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"contact_features(enumerate_contacts(df_calls))"
]
},
{
"cell_type": "code",
"execution_count": 3,
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
"<div>\n",
"<style scoped>\n",
" .dataframe tbody tr th:only-of-type {\n",
" vertical-align: middle;\n",
" }\n",
"\n",
" .dataframe tbody tr th {\n",
" vertical-align: top;\n",
" }\n",
"\n",
" .dataframe thead th {\n",
" text-align: right;\n",
" }\n",
"</style>\n",
"<table border=\"1\" class=\"dataframe\">\n",
" <thead>\n",
" <tr style=\"text-align: right;\">\n",
" <th></th>\n",
" <th>id</th>\n",
" <th>_id</th>\n",
" <th>timestamp</th>\n",
" <th>device_id</th>\n",
" <th>message_type</th>\n",
" <th>trace</th>\n",
" <th>participant_id</th>\n",
" <th>username</th>\n",
" <th>freq</th>\n",
" <th>contact_id</th>\n",
" <th>no_contacts</th>\n",
" </tr>\n",
" </thead>\n",
" <tbody>\n",
" <tr>\n",
" <th>195</th>\n",
" <td>379</td>\n",
" <td>7</td>\n",
" <td>1582964434597</td>\n",
" <td>78082f9f-98c2-468d-b4a2-7c835bd812bd</td>\n",
" <td>2</td>\n",
" <td>417b9c87f5b573530bcffba8577777b3a964d671</td>\n",
" <td>13</td>\n",
" <td>uploader_20449</td>\n",
" <td>5</td>\n",
" <td>0</td>\n",
" <td>6</td>\n",
" </tr>\n",
" <tr>\n",
" <th>196</th>\n",
" <td>380</td>\n",
" <td>8</td>\n",
" <td>1582964434974</td>\n",
" <td>78082f9f-98c2-468d-b4a2-7c835bd812bd</td>\n",
" <td>2</td>\n",
" <td>417b9c87f5b573530bcffba8577777b3a964d671</td>\n",
" <td>13</td>\n",
" <td>uploader_20449</td>\n",
" <td>5</td>\n",
" <td>0</td>\n",
" <td>6</td>\n",
" </tr>\n",
" <tr>\n",
" <th>197</th>\n",
" <td>382</td>\n",
" <td>10</td>\n",
" <td>1582965988609</td>\n",
" <td>78082f9f-98c2-468d-b4a2-7c835bd812bd</td>\n",
" <td>2</td>\n",
" <td>417b9c87f5b573530bcffba8577777b3a964d671</td>\n",
" <td>13</td>\n",
" <td>uploader_20449</td>\n",
" <td>5</td>\n",
" <td>0</td>\n",
" <td>6</td>\n",
" </tr>\n",
" <tr>\n",
" <th>198</th>\n",
" <td>383</td>\n",
" <td>11</td>\n",
" <td>1582965988873</td>\n",
" <td>78082f9f-98c2-468d-b4a2-7c835bd812bd</td>\n",
" <td>2</td>\n",
" <td>417b9c87f5b573530bcffba8577777b3a964d671</td>\n",
" <td>13</td>\n",
" <td>uploader_20449</td>\n",
" <td>5</td>\n",
" <td>0</td>\n",
" <td>6</td>\n",
" </tr>\n",
" <tr>\n",
" <th>199</th>\n",
" <td>396</td>\n",
" <td>12</td>\n",
" <td>1582965988873</td>\n",
" <td>78082f9f-98c2-468d-b4a2-7c835bd812bd</td>\n",
" <td>2</td>\n",
" <td>417b9c87f5b573530bcffba8577777b3a964d671</td>\n",
" <td>13</td>\n",
" <td>uploader_20449</td>\n",
" <td>5</td>\n",
" <td>0</td>\n",
" <td>6</td>\n",
" </tr>\n",
" <tr>\n",
" <th>...</th>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" </tr>\n",
" <tr>\n",
" <th>5193</th>\n",
" <td>6137</td>\n",
" <td>4</td>\n",
" <td>1619789360665</td>\n",
" <td>9f54e35c-d7cb-4f4c-8dc1-17dc86f2635e</td>\n",
" <td>2</td>\n",
" <td>2340c1d2b9e5d550373423a599014468a4dc3678</td>\n",
" <td>107</td>\n",
" <td>uploader_89606</td>\n",
" <td>3</td>\n",
" <td>12</td>\n",
" <td>16</td>\n",
" </tr>\n",
" <tr>\n",
" <th>5194</th>\n",
" <td>6135</td>\n",
" <td>2</td>\n",
" <td>1619787273829</td>\n",
" <td>9f54e35c-d7cb-4f4c-8dc1-17dc86f2635e</td>\n",
" <td>1</td>\n",
" <td>2340c1d2b9e5d550373423a599014468a4dc3678</td>\n",
" <td>107</td>\n",
" <td>uploader_89606</td>\n",
" <td>3</td>\n",
" <td>12</td>\n",
" <td>16</td>\n",
" </tr>\n",
" <tr>\n",
" <th>5417</th>\n",
" <td>6690</td>\n",
" <td>210</td>\n",
" <td>1620980437198</td>\n",
" <td>9f54e35c-d7cb-4f4c-8dc1-17dc86f2635e</td>\n",
" <td>1</td>\n",
" <td>198d7e63a2b4d8a7ca9bb92f74e6974ca17edc56</td>\n",
" <td>107</td>\n",
" <td>uploader_89606</td>\n",
" <td>1</td>\n",
" <td>13</td>\n",
" <td>16</td>\n",
" </tr>\n",
" <tr>\n",
" <th>5447</th>\n",
" <td>6770</td>\n",
" <td>256</td>\n",
" <td>1621407668019</td>\n",
" <td>9f54e35c-d7cb-4f4c-8dc1-17dc86f2635e</td>\n",
" <td>1</td>\n",
" <td>d4a67b53e704247de47064850efd3647e8dcaffb</td>\n",
" <td>107</td>\n",
" <td>uploader_89606</td>\n",
" <td>1</td>\n",
" <td>14</td>\n",
" <td>16</td>\n",
" </tr>\n",
" <tr>\n",
" <th>5440</th>\n",
" <td>6742</td>\n",
" <td>248</td>\n",
" <td>1621253313544</td>\n",
" <td>9f54e35c-d7cb-4f4c-8dc1-17dc86f2635e</td>\n",
" <td>1</td>\n",
" <td>ffbc6a5f0f601cf2d9cdad9d3a588633e1a1967c</td>\n",
" <td>107</td>\n",
" <td>uploader_89606</td>\n",
" <td>1</td>\n",
" <td>15</td>\n",
" <td>16</td>\n",
" </tr>\n",
" </tbody>\n",
"</table>\n",
"<p>5864 rows × 11 columns</p>\n",
"</div>"
],
"text/plain": [
" id _id timestamp device_id \\\n",
"195 379 7 1582964434597 78082f9f-98c2-468d-b4a2-7c835bd812bd \n",
"196 380 8 1582964434974 78082f9f-98c2-468d-b4a2-7c835bd812bd \n",
"197 382 10 1582965988609 78082f9f-98c2-468d-b4a2-7c835bd812bd \n",
"198 383 11 1582965988873 78082f9f-98c2-468d-b4a2-7c835bd812bd \n",
"199 396 12 1582965988873 78082f9f-98c2-468d-b4a2-7c835bd812bd \n",
"... ... ... ... ... \n",
"5193 6137 4 1619789360665 9f54e35c-d7cb-4f4c-8dc1-17dc86f2635e \n",
"5194 6135 2 1619787273829 9f54e35c-d7cb-4f4c-8dc1-17dc86f2635e \n",
"5417 6690 210 1620980437198 9f54e35c-d7cb-4f4c-8dc1-17dc86f2635e \n",
"5447 6770 256 1621407668019 9f54e35c-d7cb-4f4c-8dc1-17dc86f2635e \n",
"5440 6742 248 1621253313544 9f54e35c-d7cb-4f4c-8dc1-17dc86f2635e \n",
"\n",
" message_type trace participant_id \\\n",
"195 2 417b9c87f5b573530bcffba8577777b3a964d671 13 \n",
"196 2 417b9c87f5b573530bcffba8577777b3a964d671 13 \n",
"197 2 417b9c87f5b573530bcffba8577777b3a964d671 13 \n",
"198 2 417b9c87f5b573530bcffba8577777b3a964d671 13 \n",
"199 2 417b9c87f5b573530bcffba8577777b3a964d671 13 \n",
"... ... ... ... \n",
"5193 2 2340c1d2b9e5d550373423a599014468a4dc3678 107 \n",
"5194 1 2340c1d2b9e5d550373423a599014468a4dc3678 107 \n",
"5417 1 198d7e63a2b4d8a7ca9bb92f74e6974ca17edc56 107 \n",
"5447 1 d4a67b53e704247de47064850efd3647e8dcaffb 107 \n",
"5440 1 ffbc6a5f0f601cf2d9cdad9d3a588633e1a1967c 107 \n",
"\n",
" username freq contact_id no_contacts \n",
"195 uploader_20449 5 0 6 \n",
"196 uploader_20449 5 0 6 \n",
"197 uploader_20449 5 0 6 \n",
"198 uploader_20449 5 0 6 \n",
"199 uploader_20449 5 0 6 \n",
"... ... ... ... ... \n",
"5193 uploader_89606 3 12 16 \n",
"5194 uploader_89606 3 12 16 \n",
"5417 uploader_89606 1 13 16 \n",
"5447 uploader_89606 1 14 16 \n",
"5440 uploader_89606 1 15 16 \n",
"\n",
"[5864 rows x 11 columns]"
]
},
"execution_count": 3,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"contact_features(enumerate_contacts(df_sms))"
]
},
{
"cell_type": "code",
"execution_count": 4,
"metadata": {
"scrolled": true
},
"outputs": [
{
"data": {
"text/html": [
"<div>\n",
"<style scoped>\n",
" .dataframe tbody tr th:only-of-type {\n",
" vertical-align: middle;\n",
" }\n",
"\n",
" .dataframe tbody tr th {\n",
" vertical-align: top;\n",
" }\n",
"\n",
" .dataframe thead th {\n",
" text-align: right;\n",
" }\n",
"</style>\n",
"<table border=\"1\" class=\"dataframe\">\n",
" <thead>\n",
" <tr style=\"text-align: right;\">\n",
" <th></th>\n",
" <th>participant_id</th>\n",
" <th>no_calls_no_sms_ratio</th>\n",
" <th>no_incoming_calls_no_recieved_sms_ratio</th>\n",
" <th>no_outgoing_calls_no_sent_sms_ratio</th>\n",
" <th>no_calls_contacts_no_sms_contacts_ratio</th>\n",
" </tr>\n",
" </thead>\n",
" <tbody>\n",
" <tr>\n",
" <th>0</th>\n",
" <td>13</td>\n",
" <td>0.650000</td>\n",
" <td>0.700000</td>\n",
" <td>0.750000</td>\n",
" <td>0.454545</td>\n",
" </tr>\n",
" <tr>\n",
" <th>1</th>\n",
" <td>14</td>\n",
" <td>0.590361</td>\n",
" <td>0.555556</td>\n",
" <td>0.611111</td>\n",
" <td>0.714286</td>\n",
" </tr>\n",
" <tr>\n",
" <th>2</th>\n",
" <td>15</td>\n",
" <td>0.033113</td>\n",
" <td>0.960526</td>\n",
" <td>0.026667</td>\n",
" <td>0.173913</td>\n",
" </tr>\n",
" <tr>\n",
" <th>3</th>\n",
" <td>16</td>\n",
" <td>0.565217</td>\n",
" <td>0.666667</td>\n",
" <td>0.750000</td>\n",
" <td>0.666667</td>\n",
" </tr>\n",
" <tr>\n",
" <th>4</th>\n",
" <td>17</td>\n",
" <td>0.916667</td>\n",
" <td>0.259259</td>\n",
" <td>0.983607</td>\n",
" <td>0.857143</td>\n",
" </tr>\n",
" <tr>\n",
" <th>...</th>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" </tr>\n",
" <tr>\n",
" <th>56</th>\n",
" <td>91</td>\n",
" <td>0.196203</td>\n",
" <td>0.846939</td>\n",
" <td>0.228070</td>\n",
" <td>0.666667</td>\n",
" </tr>\n",
" <tr>\n",
" <th>57</th>\n",
" <td>92</td>\n",
" <td>0.444444</td>\n",
" <td>0.571429</td>\n",
" <td>0.400000</td>\n",
" <td>0.600000</td>\n",
" </tr>\n",
" <tr>\n",
" <th>58</th>\n",
" <td>93</td>\n",
" <td>0.432203</td>\n",
" <td>0.685714</td>\n",
" <td>0.512821</td>\n",
" <td>0.428571</td>\n",
" </tr>\n",
" <tr>\n",
" <th>59</th>\n",
" <td>106</td>\n",
" <td>0.705882</td>\n",
" <td>0.454545</td>\n",
" <td>0.750000</td>\n",
" <td>0.769231</td>\n",
" </tr>\n",
" <tr>\n",
" <th>60</th>\n",
" <td>107</td>\n",
" <td>0.204969</td>\n",
" <td>0.879121</td>\n",
" <td>0.192661</td>\n",
" <td>0.578947</td>\n",
" </tr>\n",
" </tbody>\n",
"</table>\n",
"<p>61 rows × 5 columns</p>\n",
"</div>"
],
"text/plain": [
" participant_id no_calls_no_sms_ratio \\\n",
"0 13 0.650000 \n",
"1 14 0.590361 \n",
"2 15 0.033113 \n",
"3 16 0.565217 \n",
"4 17 0.916667 \n",
".. ... ... \n",
"56 91 0.196203 \n",
"57 92 0.444444 \n",
"58 93 0.432203 \n",
"59 106 0.705882 \n",
"60 107 0.204969 \n",
"\n",
" no_incoming_calls_no_recieved_sms_ratio \\\n",
"0 0.700000 \n",
"1 0.555556 \n",
"2 0.960526 \n",
"3 0.666667 \n",
"4 0.259259 \n",
".. ... \n",
"56 0.846939 \n",
"57 0.571429 \n",
"58 0.685714 \n",
"59 0.454545 \n",
"60 0.879121 \n",
"\n",
" no_outgoing_calls_no_sent_sms_ratio \\\n",
"0 0.750000 \n",
"1 0.611111 \n",
"2 0.026667 \n",
"3 0.750000 \n",
"4 0.983607 \n",
".. ... \n",
"56 0.228070 \n",
"57 0.400000 \n",
"58 0.512821 \n",
"59 0.750000 \n",
"60 0.192661 \n",
"\n",
" no_calls_contacts_no_sms_contacts_ratio \n",
"0 0.454545 \n",
"1 0.714286 \n",
"2 0.173913 \n",
"3 0.666667 \n",
"4 0.857143 \n",
".. ... \n",
"56 0.666667 \n",
"57 0.600000 \n",
"58 0.428571 \n",
"59 0.769231 \n",
"60 0.578947 \n",
"\n",
"[61 rows x 5 columns]"
]
},
"execution_count": 4,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"calls_sms_features(df_calls, df_sms)"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": []
}
],
"metadata": {
"kernelspec": {
"display_name": "straw2analysis",
"language": "python",
"name": "straw2analysis"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.9.6"
}
},
"nbformat": 4,
"nbformat_minor": 2
}

View File

@ -6,7 +6,7 @@
# extension: .py # extension: .py
# format_name: percent # format_name: percent
# format_version: '1.3' # format_version: '1.3'
# jupytext_version: 1.11.2 # jupytext_version: 1.11.4
# kernelspec: # kernelspec:
# display_name: straw2analysis # display_name: straw2analysis
# language: python # language: python
@ -39,6 +39,9 @@ print(df_calls)
# %% # %%
count_comms(df_calls) count_comms(df_calls)
# %%
enumerate_contacts(df_calls)
# %% # %%
df_sms = get_sms_data(["nokia_0000003"]) df_sms = get_sms_data(["nokia_0000003"])
count_comms(df_sms) count_comms(df_sms)
@ -53,6 +56,15 @@ import participants.query_db
participants_inactive_usernames = participants.query_db.get_usernames() participants_inactive_usernames = participants.query_db.get_usernames()
df_calls_inactive = get_call_data(participants_inactive_usernames) df_calls_inactive = get_call_data(participants_inactive_usernames)
# %%
participants_inactive_usernames
# %%
df_calls_inactive.head()
# %%
enumerate_contacts(df_calls_inactive).head()
# %% # %%
df_calls_features = count_comms(df_calls_inactive) df_calls_features = count_comms(df_calls_inactive)
df_calls_features.head() df_calls_features.head()
@ -70,6 +82,9 @@ calls_number = pd.wide_to_long(
suffix="\D+", suffix="\D+",
) )
# %%
calls_number
# %% # %%
sns.displot(calls_number, x="no", hue="call_type", binwidth=5, element="step", height=8) sns.displot(calls_number, x="no", hue="call_type", binwidth=5, element="step", height=8)

View File

@ -86,7 +86,8 @@ def enumerate_contacts(comm_df: pd.DataFrame) -> pd.DataFrame:
# In other words, recode the contacts into integers from 0 to n_contacts, # In other words, recode the contacts into integers from 0 to n_contacts,
# so that the first one is contacted the most often. # so that the first one is contacted the most often.
contact_ids = ( contact_ids = (
contact_counts.groupby("participant_id") # Group again for enumeration. # Group again for enumeration.
contact_counts.groupby("participant_id")
.cumcount() # Enumerate (count) rows *within* participants. .cumcount() # Enumerate (count) rows *within* participants.
.to_frame("contact_id") .to_frame("contact_id")
) )
@ -150,8 +151,10 @@ def count_comms(comm_df: pd.DataFrame) -> pd.DataFrame:
comm_features = comm_counts.join(comm_duration_total) comm_features = comm_counts.join(comm_duration_total)
comm_features = comm_features.join(comm_duration_max) comm_features = comm_features.join(comm_duration_max)
try: try:
comm_features.drop(columns="duration_total_" + call_types[3], inplace=True) comm_features.drop(columns="duration_total_" +
comm_features.drop(columns="duration_max_" + call_types[3], inplace=True) call_types[3], inplace=True)
comm_features.drop(columns="duration_max_" +
call_types[3], inplace=True)
# The missed calls are always of 0 duration. # The missed calls are always of 0 duration.
except KeyError: except KeyError:
pass pass
@ -172,19 +175,145 @@ def count_comms(comm_df: pd.DataFrame) -> pd.DataFrame:
) )
# Ratio of incoming and outgoing messages to all messages. # Ratio of incoming and outgoing messages to all messages.
else: else:
raise KeyError("The dataframe contains neither call_type or message_type") raise KeyError(
"The dataframe contains neither call_type or message_type")
return comm_features return comm_features
def contact_features(df_enumerated: pd.DataFrame) -> pd.DataFrame:
    """
    Add contact-related features to a dataframe of enumerated communications.

    For each participant, the number of distinct contacts is counted. If
    df_enumerated contains calls data (i.e. it has a call_duration column),
    the total duration of calls between a participant and each of her
    contacts is added as well.

    Parameters
    ----------
    df_enumerated: pd.DataFrame
        A dataframe of calls or SMSes; return of function enumerate_contacts.
        It must contain the columns participant_id and contact_id.

    Returns
    -------
    df_enumerated: pd.DataFrame
        A new dataframe (the input is not modified) with one row per original
        communication, sorted by participant_id and contact_id, with the
        additional column no_contacts and, if df_enumerated contains calls
        data, an additional column total_call_duration.
    """
    contact_keys = ["participant_id", "contact_id"]

    # Check whether the dataframe contains calls or SMS data, since some
    # of the features we want to calculate are type-specific.
    if "call_duration" in df_enumerated:
        duration_count = (
            # For each participant and each of her contacts,
            # sum the durations of their calls.
            df_enumerated.groupby(contact_keys)["call_duration"]
            .sum()
            # Turn the (participant_id, contact_id) index into normal columns.
            .reset_index()
            .rename(columns={"call_duration": "total_call_duration"})
        )
        # Broadcast the per-contact totals back onto the individual calls.
        df_enumerated = df_enumerated.merge(duration_count, on=contact_keys)

    contact_count = (
        # Select contact_id *before* aggregating, so that only this one column
        # is scanned for distinct values rather than every column of the frame.
        df_enumerated.groupby("participant_id")["contact_id"]
        .nunique()
        # Turn the participant_id index into a normal column.
        .reset_index()
        .rename(columns={"contact_id": "no_contacts"})
    )

    df_enumerated = (
        # Broadcast the per-participant contact counts onto every row.
        df_enumerated.merge(contact_count, on="participant_id")
        # Sort first by participant_id and then by contact_id, which is the
        # ordering the merged rows are returned in.
        .sort_values(contact_keys)
    )

    # TODO: Determine work vs non-work contacts by work hours heuristics.
    return df_enumerated
def _contacts_per_participant(comm_df: pd.DataFrame) -> pd.DataFrame:
    """Return one row per participant with her number of distinct contacts."""
    return (
        contact_features(enumerate_contacts(comm_df))
        # Make sure participant_id is a regular column before selecting it.
        .reset_index()[["participant_id", "no_contacts"]]
        # contact_features repeats no_contacts on every communication row;
        # a single row per participant is all that is needed here.
        .drop_duplicates()
    )


def calls_sms_features(df_calls: pd.DataFrame, df_sms: pd.DataFrame) -> pd.DataFrame:
    """
    Calculates additional features relating calls and sms data.

    Parameters
    ----------
    df_calls: pd.DataFrame
        A dataframe of calls (return of get_call_data).
    df_sms: pd.DataFrame
        A dataframe of SMSes (return of get_sms_data).

    Returns
    -------
    df_calls_sms: pd.DataFrame
        The list of features relating calls and sms data for every participant.
        Only participants present in both the calls and the SMS data are
        included, since the dataframes are combined with inner merges.
        These are:
        * no_calls_no_sms_ratio:
            proportion of calls in total number of communications
        * no_incoming_calls_no_recieved_sms_ratio:
            proportion of incoming calls in total number of
            incoming/received communications
        * no_outgoing_calls_no_sent_sms_ratio:
            proportion of outgoing calls in total number of
            outgoing/sent communications
        * no_calls_contacts_no_sms_contacts_ratio:
            proportion of calls contacts in total number of
            communication contacts
    """
    count_joined = (
        # Merge calls and sms count features.
        count_comms(df_calls)
        .merge(count_comms(df_sms), on="participant_id", suffixes=("_calls", "_sms"))
        # Make participant_id a regular column.
        .reset_index()
        # Calculate the new features as additional columns. In every ratio
        # the numerator is the *calls* count, as described in the docstring.
        .assign(
            no_calls_no_sms_ratio=(
                lambda x: x.no_all_calls / (x.no_all_calls + x.no_all_sms)
            ),
            no_incoming_calls_no_recieved_sms_ratio=(
                lambda x: x.no_incoming / (x.no_incoming + x.no_received)
            ),
            no_outgoing_calls_no_sent_sms_ratio=(
                lambda x: x.no_outgoing / (x.no_outgoing + x.no_sent)
            ),
        )[
            # Keep only the relevant features.
            [
                "participant_id",
                "no_calls_no_sms_ratio",
                "no_incoming_calls_no_recieved_sms_ratio",
                "no_outgoing_calls_no_sent_sms_ratio",
            ]
        ]
    )

    contacts_joined = (
        # Both sides hold exactly one row per participant, so this merge is
        # one-to-one. Merging the row-level outputs of contact_features instead
        # would create (calls x SMSes) rows for every participant.
        _contacts_per_participant(df_calls)
        .merge(
            _contacts_per_participant(df_sms),
            on="participant_id",
            suffixes=("_calls", "_sms"),
        )
        .assign(
            no_calls_contacts_no_sms_contacts_ratio=(
                lambda x: x.no_contacts_calls
                / (x.no_contacts_calls + x.no_contacts_sms)
            )
        )[["participant_id", "no_calls_contacts_no_sms_contacts_ratio"]]
    )

    # Join the newly created dataframes.
    df_calls_sms = count_joined.merge(contacts_joined, on="participant_id")
    return df_calls_sms