diff --git a/.idea/misc.xml b/.idea/misc.xml index a99db41..8962e54 100644 --- a/.idea/misc.xml +++ b/.idea/misc.xml @@ -2,6 +2,6 @@ - \ No newline at end of file diff --git a/config/environment.yml b/config/environment.yml index d0d9b66..5db94bf 100644 --- a/config/environment.yml +++ b/config/environment.yml @@ -8,7 +8,9 @@ dependencies: - isort - flake8 - jupyterlab + - jupytext - mypy + - nodejs - pandas - psycopg2 - python-dotenv diff --git a/exploration/communication.ipynb b/exploration/communication.ipynb index 7c4c6d9..0ce4700 100644 --- a/exploration/communication.ipynb +++ b/exploration/communication.ipynb @@ -37,7 +37,7 @@ "cell_type": "markdown", "metadata": {}, "source": [ - "# Example of feature calculation" + "# Example of communication data and feature calculation" ] }, { @@ -229,14 +229,7 @@ "cell_type": "markdown", "metadata": {}, "source": [ - "# Explore the whole dataset" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "## Call data" + "# Call data" ] }, { @@ -529,7 +522,7 @@ { "data": { "text/plain": [ - "" + "" ] }, "execution_count": 12, @@ -561,7 +554,7 @@ { "data": { "text/plain": [ - "" + "" ] }, "execution_count": 13, @@ -787,9 +780,157 @@ "source": [ "sns.boxplot(x=\"contact_id\", y=\"freq\", data=df_calls_frequent)" ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "# SMS data" + ] + }, + { + "cell_type": "code", + "execution_count": 17, + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
message_typeno_receivedno_sent
count49.00000043.000000
mean51.16326552.511628
std61.47911166.010956
min4.0000001.000000
25%10.00000010.500000
50%29.00000023.000000
75%61.00000069.500000
max283.000000277.000000
\n", + "
" + ], + "text/plain": [ + "message_type no_received no_sent\n", + "count 49.000000 43.000000\n", + "mean 51.163265 52.511628\n", + "std 61.479111 66.010956\n", + "min 4.000000 1.000000\n", + "25% 10.000000 10.500000\n", + "50% 29.000000 23.000000\n", + "75% 61.000000 69.500000\n", + "max 283.000000 277.000000" + ] + }, + "execution_count": 17, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "df_sms_inactive = get_sms_data(participants_inactive_usernames)\n", + "df_sms_features = count_comms(df_sms_inactive)\n", + "df_sms_features.describe()" + ] + }, + { + "cell_type": "code", + "execution_count": 18, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "" + ] + }, + "execution_count": 18, + "metadata": {}, + "output_type": "execute_result" + }, + { + "data": { + "image/png": "iVBORw0KGgoAAAANSUhEUgAAAo4AAAI4CAYAAADknWiIAAAAOXRFWHRTb2Z0d2FyZQBNYXRwbG90bGliIHZlcnNpb24zLjMuNCwgaHR0cHM6Ly9tYXRwbG90bGliLm9yZy8QVMy6AAAACXBIWXMAAAsTAAALEwEAmpwYAAAg9UlEQVR4nO3deZhld13n8c+XdKBJQsIiMGw9AUWGxRilgQCCbEIk0biwRcEgkKgjCC4MAXRodXwSBBVUBLoBE7bIrkA0i6wjJiEEQichLD4EIYKEDJImhiCdfOePe1vKSlfVr7q66nanX6/nqafqnnPuOb86fbr7/Zxz7z3V3QEAgKXcZNYDAABg7yAcAQAYIhwBABgiHAEAGCIcAQAYsm7WAxhx5JFH9hlnnDHrYQAAu1/NegCM2yvOOF555ZWzHgIAwD5vrwhHAABmTzgCADBEOAIAMEQ4AgAwRDgCADBEOAIAMEQ4AgAwRDgCADBEOAIAMEQ4AgAwRDgCADBEOAIAMEQ4AgAwRDgCADBEOAIAMEQ4AgAwRDgCADBEOAIAMEQ4AgAwRDgCADBk1cKxql5XVVdU1cVzpr2kqj5dVVur6l1VdcvV2j4AALvXap5xPCXJkfOmnZ3kPt19WJLPJnn+Km4fAIDdaNXCsbs/nOTr86ad1d3bpw/PTXLn1do+AAC717oZbvtpSd6y0MyqOiHJCUmyYcOGtRrTDRy26cxsu3b7gvMPXr8uWzc9Zg1HBAAwGzMJx6p6YZLtSd600DLdvTnJ5iTZuHFjr9HQbmDbtdtz2vFHLDj/2C3nruFoAABmZ83DsaqOS3J0kkd298yCEACA5VnTcKyqI5M8L8mPdvc1a7ltAABWZjU/jue0JOckuUdVXV5VT0/y50lukeTsqrqwql61WtsHAGD3WrUzjt197E4mv3a1tgcAwOpy5xgAAIYIRwAAhghHAACGCEcAAIYIRwAAhghHAACGCEcAAIYIRwAAhghHAACGCEcAAIYIRwAAhghHAACGCEcAAIYIRwAAhghHAACGCEcAAIYIRwAAhghHAACGCEcAAIYIRwAAhghHAACGCEcAAIYIRwAAhghHAACGCEcAAIYIRwAAhghHAACGCEcAAIYIRwAAhghHAACGCEcAAIYIRwAAhghHAACGCEcAAIYIRwAAhghHAACGCEcAAIYIRwAAhghHAACGCEcAAIYIRwAAhghHAACGCEcAAIYIRwAAhghHAACGCEcAAIYIRwAAhghHAACGCEcAAIYIRwAAhghHAACGCEcAAIYIRwAAhghHAACGCEcAAIYIRwAAhghHAACGCEcAAIYIRwAAhghHAACGCEcAAIYIRwAAhghHAACGCEcAAIYIRwAAhghHAACGCEcAAIYIRwAAhghHAACGCEcAAIYIRwAAhghHAACGCEcAAIYIRwAAhqxaOFbV66rqiqq6eM60W1fV2VX1uen3W63W9gEA2L1W84zjKUmOnDftxCTv6+67J3nf9DEAAHuBVQvH7v5wkq/Pm3xMklOnP5+a5KdWa/sAAOxea/0ax9t391eSZPr9dgstWFUnVNXHqupjX/va19ZsgAAA7Nwe++aY7t7c3Ru7e+Ntb3vbWQ8HAGCft9bh+NWqukOSTL9fscbbBwBgF611OL47yXHTn49L8jdrvH0AAHbRan4cz2lJzklyj6q6vKqenuTkJD9WVZ9L8mPTxwAA7AXWrdaKu/vYBWY9crW2CQDA6tlj3xwDAMCeRTgCADBEOAIAMEQ4AgAwRDgCADBEOAIAMEQ4AgAwRDgCADBEOAIAMEQ4AgAwRDgCADBEOAIAMEQ4AgAwRDgCADBEOAIAMEQ4AgAwRDgCADBEOAIAMEQ4AgAwRDgCADBEOAIAMEQ4AgAwRDgCADBEOAIAMEQ4AgAwRDgCADBEOAIAMEQ4AgAwRDgCADBEOAIAMEQ4AgAwRDgCADBEOAIAMEQ4AgAwRDgCADBEOAIAMEQ4AgAwRDgCADBEOAIAMEQ4AgAwRDgCADBEOAIAMEQ4AgAwRDgCADBEOAIAMEQ4AgAwRDgCADBEOAIAMEQ4AgAwRDgCADBEOAIAMEQ4AgAwRDgCADBEOAIAMEQ4AgAwRDgCADBEOAIAMEQ4AgAwRDgCADBEOAIAMEQ4AgAwRDgCADBEOAIAMEQ4AgAwRDgCADBEOAIAMEQ4AgAwRDgCADBEOAIAMEQ4AgAwRDgCADBEOAIAMEQ4AgAwRDgCADBkJuFYVb9eVZdU1cVVdVpVrZ/FOAAAGLfm4VhVd0rya0k2dvd9kuyX5ElrPQ4AAJZnVpeq1yW5eVWtS3JAki/PaBwAAAxat9Yb7O5/qaqXJvlikm8lOau7z5q/XFWdkOSEJNmwYcPqDejkDcm1Vy2ywJtXb9sAAHuRNQ/HqrpVkmOS3DXJN5K8raqe3N1vnLtcd29OsjlJNm7c2Ks2oGuvSo5778LzX71t1TYNALA3mcWl6kcluay7v9bd30nyziQPmsE4AABYhlmE4xeTHFFVB1RVJXlkkktnMA4AAJZhzcOxu89L8vYkH09y0XQMm9d6HAAALM+av8YxSbr7RUleNIttAwCwa9w5BgCAIcIRAIAhwhEAgCHCEQCAIcIRAIAhwhEAgCHCEQCAIcIRAIAhwhEAgCHCEQCAIcIRAIAhwhEAgCHCEQCAIcIRAIAhwhEAgCHCEQCAIcIRAIAhwhEAgCHCEQCAIcIRAIAhwhEAgCHCEQCAIcIRAIAhwhEAgCHCEQCAIcIRAIAhwhEAgCHCEQCAIcIRAIAhwhEAgCHCEQCAIcIRAIAhwhEAgCHCEQCAIcIRAIAhwhEAgCHCEQCAIcIRAIAhwhEAgCHCEQCAIcIRAIAhwhEAgCHCEQCAIcIRAIAhwhEAgCHCEQCAIcIRAIAhwhEAgCHCEQCAIcIRAIAhwhEAgCHCEQCAIcIRAIAhQ+FYVQ8emQYAwI3X6BnHPxucBgDAjdS6xWZW1QOTPCjJbavqN+bMOjjJfqs5MAAA9iyLhmOSmyY5aLrcLeZM35bkcas1KAAA9jyLhmN3fyjJh6rqlO7+5zUaEwAAe6ClzjjucLOq2pzk0LnP6e5HrMagAADY84yG49uSvCrJa5Jct3rDAQBgTzUajtu7+5WrOhIAAPZoox/H856q+p9VdYequvWOr1UdGQAAN1BVh1bVz81i26NnHI+bfn/unGmd5G67dzgAACzh0CQ/l+TNa73hoTOO3X3XnXyJRgBgrzI9W/fpqnpNVV1cVW+qqkdV1Ueq6nNVdf+qOrCqXldV51fVJ6rqmOlz711VH62qC6tqa1Xdfbrs6VX1yen6njhd9n9Pn39xVW2uqppOv9/0uedU1Uuq6uLp9P2mj8+fzv+lRX6Nk5M8ZDqOX6+q/1tVh8/5HT9SVYdV1aaqekNVvX/6ux0/Z5nnztnW747uv6EzjlX1Czub3t2vH90QAMAe4vuSPD7JCUnOz+Ts3Y8k+ckkL0jyqSTv7+6nVdUtk3y0qv4+yS8neXl3v6mqbprJzVAem+TL3X1UklTVIdNt/Hl3/9502huSHJ3kPUn+MskJ3f2PVXXynDE9PclV3X2/qrpZko9U1VndfdlOxn9ikt/q7qOn6/96kqcmeU5VfX+Sm3X31qr6mSSHJTkiyYFJPlFVpye5T5K7J7l/kkry7qp6aHd/eKkdN/oax/vN+XpIkk2Z7FwAgL3NZd19UXdfn+SSJO/r7k5yUSaXgR+d5MSqujDJB5OsT7IhyTlJXlBVz0vy37v7W9PnPKqqXlxVD+nuq6bbeHhVnVdVFyV5RJJ7TyP0Ft39j9Nl5l5qfnSSX5hu87wkt8kk7ka8LcnRVbV/kqclOWXOvL/p7m9195VJPpBJLD56+vWJJB9P8j9GtzV0xrG7nzX38bSm3zDyXACAPcy35/x8/ZzH12fSRtcl+dnu/sy8511aVeclOSrJmVX1jO5+f1XdN5MzjydV1VlJ/jDJXyTZ2N1fqqpNmcRnLTKmSvKs7j5zub9Md19TVWcnOSbJE5JsnDt7/uLTbZ3U3a9e7rZGzzjOd03GKxgAYG9yZpJnzXld4g9Nv98tyee7+0+TvDvJYVV1xyTXdPcbk7w0yQ9nEolJcmVVHZTpbZq7+9+SfLOqjpjOf9K8bf7K9Kxhqur7q+rABcb3zfzXW0Enk8/a/tMk53f31+dMP6aq1lfVbZI8LJNL82cmedp0bKmqO1XV7UZ2zOhrHN+T7xbrfknumeStI88FANjL/H6SlyXZOo3HL2TyGsUnJnlyVX0nyb8m+b1MXsb3kqq6Psl3kvxKd3+jqrZkchn7C5nE2g5PT7Klqv49k8vgOy5tvyaTy+Qfn27za0l+aoHxbU2yvao+meSU7v6T7r6gqrZl8hrKuT6a5PRMLrX/fnd/OcmXq+qeSc6ZtvHVSZ6c5Iqldszox/G8dM7P25P8c3dfPvhcAIA9Qnd/IZM3h+x4/NQF5t3gXc3dfVKSk+ZNPnP6NX/Z307y2zsZwiXdfViSVNWJST42Xf76TN6Y84KB3+E7SR45d9r0zOdNkpw1b/HPdvcJO1nHy5O8fKltzTf6cTwfSvLpTE6L3irJfyx3QwAA5Kjpx+hcnMkbjv/PSlc4/fSb85K8cBqgq2b0UvUTkrwkk1OqleTPquq53f32VRwbAMCNSne/JclbRpatqh/IDd+M/O3ufsC8db4+yQ0+IrG7N+3iMBc0eqn6hUnu191XJElV3TbJ3ycRjgAAq6C7L0py+KzHMdfou6pvsiMap/7fMp4LAMCNwOgZxzOq6swkp00fPzHJ3+7qRqcfgPmaTF6A2kme1t3n7Or6AABYfYuGY1V9X5Lbd/dzp7et+ZFMXuN4TpI3rWC7L09yRnc/bnrLngNWsC4AANbAUpebX5bJh0ymu9/Z3b/R3b+eydnGl+3KBqvq4CQPTfLa6Xr/o7u/sSvrAgBg7Sx1qfrQ7t46f2J3f6yqDt3Fbd4tkw+1/Muq+sEkFyR5dnf/+y6uDwBgyKEnnv7FJHfZjav80hdOPmrDblzfkKr6yST36u6Td8O6ru7ug0aWXSoc1y8y7+bjQ7rBNn84k/sxnldVL09yYpLfmbtQVZ2Q5IQk2bBhzf88lmfTIQvPW39IcuIX124sAMBi7pLk4btxfR9YzsLTu8LUSj9vsbvfncltD9fUUuF4flUd391b5k6sqqdncqZwV1ye5PLuPm/6+O2ZhON/0d2bk2xOko0bN86/Qfee5bj3Ljzv1KPXbhwAwB5nepX27zKJzAcm+euqOjrJzZK8q7tfNF3uF5L8ViZvHN7a3U+ZfgTiqzK5ZWCSPKe7P1JVT02yMZOPTPxkkrt19/VVdUCSz2RyhXdDklckuW2Sa5Ic392frqq7JnlzJh14xnJ+l6XC8TlJ3lVVP5/vhuLGJDdN8tPL2dAO3f2vVfWlqrpHd38mk1vmfGpX1gUAsJe4R5JfTPLXSR6X5P6ZvOH43VX10Ew+6vCFSR7c3VdW1a2nz3t5kj/p7n+oqg2Z3N7wnjtW2t1XTe9Z/aOZhOlPJDmzu79TVZuT/HJ3f66qHpDkL5I8YrrOV3b366vqV5fzSywajt391SQPqqqH57v3bjy9u9+/nI3sxLOSvGn6jurPZ7IjAQBurP65u8+tqpcmeXSST0ynH5Tk7kl+MMnbu/vKJOnur0/nPyrJvSZXuJMkB1fVLeat+y2ZfFTiB5I8KclfVNVBSR6U5G1znnuz6fcHJ/nZ6c9vSPLi0V9i6HMcu/sDWeY1/CXWd2EmZy4BAPYFO94EXElO6u5Xz51ZVb+WySXq+W6S5IHd/a15y899+O4kJ03PUt43yfuTHJjkG919+ALj2aWXAbr7CwDA2jkzydOmZwRTVXeqqtsleV+SJ1TVbabTd1yqPivJM3c8uaoOn7/C7r46yUczuQT93u6+rru3Jbmsqh4/fV5NP80mST6SyZnJJPn55Qx+9M4xAAA3Bl/KbryKOl3fsO4+q6rumeSc6VnDq5M8ubsvqao/SPKhqrouk0vZT03ya0leUVVbM+m2Dyf55Z2s+i1J3pbkYXOm/XySV1bVbyfZP8lfZfJGmmcneXNVPTvJO5YzfuEIAOwzZvGZi939hXz3vSLp7pdncnZw/nKnJjl13rQrM3n94vxlT0lyypzHb8/kMvjcZS5LcuROnntZJu/u3mH4syBdqgYAYIhwBABgiHAEAGCIcAQAYIhwBABgiHAEAGCIj+MBAPYdmw75YpK77MY1fimbrlr1j/iZfvD3Hbv7b1d7W4sRjgDAvuQuSR6+G9e3Oz9MfDGHZ3K75pmGo0vVAACrqKoOrKrTq+qTVXVxVT2xqu5bVR+qqguq6syqusN02Q9W1Yur6qNV9dmqekhV3TTJ7yV5YlVdWFU3+EDwteKMIwDA6joyyZe7+6gkqapDkvxdkmO6+2vTEPyDJE+bLr+uu+9fVY9N8qLuflRV/e8kG7v7mTvbwFoRjgAAq+uiJC+tqhcneW+Sf8vkFoRnT+9XvV+Sr8xZ/p3T7xckOXTthrk04QgAsIq6+7NVdd8kj01yUpKzk1zS3Q9c4Cnfnn6/LntYq3mNIwDAKqqqOya5prvfmOSlSR6Q5LZV9cDp/P2r6t5LrOabSW6xuiNd2h5VsQAAq+xL2b3vhP7SwDI/kOQlVXV9ku8k+ZUk25P86fT1juuSvCzJJYus4wNJTqyqC5Oc1N1vWcmgd5VwBAD2HWvwmYvzdfeZSc7cyayH7mTZh835+cpMX+PY3V9Pcr/VGeE4l6oBABgiHAEAGCIcAQAYIhwBABgiHAEAGCIcAQAYIhwBABgiHAEAGCIcAQAYIhwBABgiHAEAGCIcAQAYIhwBABgiHAEAGCIcAQAYIhwBABgiHAEAGCIcAQAYsm7WA7ixO+zaLdl24ukLzj8412Tr+mcsvIL1hyQnfnEVRgYAsDzCcZVty4E57fgjFpx/7JZzk+Peu/AKTj16FUYFALB8LlUDADBEOAIAMEQ4AgAwRDgCADBEOAIAMEQ4AgAwRDgCADBEOAIAMEQ4AgAwRDgCADBEOAIAMEQ4AgAwRDgCADBEOAIAMEQ4AgAwRDgCADBEOAIAMEQ4AgAwRDgCADBEOAIAMEQ4AgAwRDgCADBEOAIAMEQ4AgAwRDgCADBEOAIAMEQ4AgAwRDgCADBEOAIAMEQ4AgAwRDgCADBEOAIAMEQ4AgAwRDgCADBEOAIAMEQ4AgAwZGbhWFX7VdUnquq9sxoDAADjZnnG8dlJLp3h9gEAWIaZhGNV3TnJUUleM4vtAwCwfLM64/iyJP8ryfUz2j4AAMu0bq03WFVHJ7miuy+oqoctstwJSU5Ikg0bNqzN4HbBQbkmh7568fkrcdi1W7LtxNMXnH/w+nXZuukxK9oGAMCINQ/HJA9O8pNV9dgk65McXFVv7O4nz12ouzcn2ZwkGzdu7LUf5pgt+/9RcuRJCy9wxu8necQur39bDsxpxx+x4Pxjt5y7y+sGAFiONb9U3d3P7+47d/ehSZ6U5P3zoxEAgD2Pz3EEAGDILC5V/6fu/mCSD85yDAAAjHHGEQCAIcIRAIAhwhEAgCHCEQCAIcIRAIAhwhEAgCHCEQCAIcIRAIAhwhEAgCHCEQCAIcIRAIAhwhEAgCHCEQCAIcIRAIAhwhEAgCHCEQCAIcIRAIAhwhEAgCHCEQCAIcIRAIAhwhEAgCHCEQCAIcIRAIAhwhEAgCHCEQCAIcIRAIAhwhEAgCHCEQCAIcIRAIAhwhEAgCHCEQCAIcIRAIAhwhEAgCHCEQCAIcIRAIAhwhEAgCHrZj2AvcIpRy0876YHrN04VsFhm87Mtmu3Lzj/4PXrsnXTY9ZwRADAnko4jjjypFmPYNVsu3Z7Tjv+iAXnH7vl3DUcDQCwJ3OpGgCAIcIRAIAhwhEAgCHCEQCAIcIRAIAhwhEAgCHCEQCAIcIRAIAhwhEAgCHCEQCAIcIRAIAhwhEAgCHCEQCAIcIRAIAhwhEAgCHCEQCAIcIRAIAhwhEAgCHCEQCAIcIRAIAhwhEAgCHCEQCAIcIRAIAhwhEAgCHCEQCAIcIRAIAhwhEAgCHCEQCAIcIRAIAhwhEAgCHCEQCAIcIRAIAhwhEAgCHCEQCAIcIRAIAhwhEAgCHCEQCAIWsejlV1l6r6QFVdWlWXVNWz13oMAAAs37oZbHN7kt/s7o9X1S2SXFBVZ3f3p2YwFgAABq35Gcfu/kp3f3z68zeTXJrkTms9DgAAlmcWZxz/U1UdmuSHkpy3k3knJDkhSTZs2LC2A9vTnHLUIjN/Z1U3fVC+lUNPPH3B+Qfnmmw9+fELzj9s05nZdu321Rjad8ewfl22bnrMqm4DbsyW+nvq7xiww8zCsaoOSvKOJM/p7m3z53f35iSbk2Tjxo29xsPbsxx50sLz3nPNqm56y/4vXXT7x75n8edvu3Z7Tjv+iN08qnlj2HLuqq4fbuyW+nvq7xiww0zeVV1V+2cSjW/q7nfOYgwAACzPLN5VXUlem+TS7v7jtd4+AAC7ZhZnHB+c5ClJHlFVF06/HjuDcQAAsAxr/hrH7v6HJLXW2wUAYGXcOQYAgCHCEQCAIcIRAIAhwhEAgCHCEQCAIcIRAIAhwhEAgCHCEQCAIcIRAIAhwhEAgCHCEQCAIcIRAIAhwhEAgCHCEQCAIcIRAIAhwhEAgCHCEQCAIcIRAIAhwhEAgCHCEQCAIcIRAIAhwhEAgCHCEQCAIcIRAIAhwhEAgCHCEQCAIcIRAIAhwhEAgCHCEQCAIcIRAIAhwhEAgCHCEQCAIcIRAIAhwhEAgCHCEQCAIetmPYBZO+zaLdn26m0Lzj9o/xVu4KYHJKcctcgCv7Oi1R+Ub+XQE0/f9effbDccApsOWWTmm5f4/VfuoPzmivbBwfn3bF1//ILzD7v2NdmWA1aw/muydf0zFl5g/SHJiV9ceP7JG5Jrr9rl5x+26cxsu3b7wuNbvy5bNz1m17e/lKV+v7WwxO+w1J/xkvtoxlb8Z7zaVngMr/j53Oj/DrB29vlw3JYDc9pP7HoULOkRS4The65Z0eq3/MRtFl/gjOcnT931qBpy3HsXnvfqbcmRJ63q5res8PnHvieL/g7bXr0tpx1/xK6vf8u5i++jU49efAXXXrWi52+7dvui4z92y7kr2/5Slvr91sISv8NSf8ZL7qMZW/Gf8Wpb4TG84udzo/87wNpxqRoAgCHCEQCAIcIRAIAhwhEAgCHCEQCAIcIRAIAhwhEAgCHCEQCAIcIRAIAhwhEAgCHCEQCAIcIRAIAhwhEAgCHCEQCAIcIRAIAhwhEAgCHCEQCAIcIRAIAhwhEAgCHCEQCAIcIRAIAhwhEAgCHCEQCAIcIRAIAhwhEAgCHCEQCAIcIRAIAhwhEAgCHCEQCAIcIRAIAhwhEAgCHCEQCAIcIRAIAhwhEAgCHCEQCAIcIRAIAhwhEAgCEzCceqOrKqPlNV/1RVJ85iDAAALM+ah2NV7ZfkFUl+PMm9khxbVfda63EAALA8szjjeP8k/9Tdn+/u/0jyV0mOmcE4AABYhurutd1g1eOSHNndz5g+fkqSB3T3M+ctd0KSE6YP75HkM7t5KN+T5Mr1/+1u973eSz13yXXXXJX9Djhk1sPYa9l/K2cfroz9t3L7yj7cL9fnW//6+QtWYdXfk+TT3X3kKqybVbBuBtusnUy7Qb129+Ykm1dtEFUf6+6Nq7X+fUFVfWz7VVfYh7vI/ls5+3Bl7L+Vsw9XZvp/sWjci8ziVNvlSe4y5/Gdk3x5BuMAAGAZZhGO5ye5e1XdtapumuRJSd49g3EAALAMa36puru3V9Uzk5yZZL8kr+vuS9Z6HFnFy+D7EPtwZey/lbMPV8b+Wzn7cGXsv73Mmr85BgCAvZO3EwMAMEQ4AgAwZJ8MR7c8XL6q+kJVXVRVF1bVx6bTbl1VZ1fV56bfbzXrce5Jqup1VXVFVV08Z9qC+6yqnj89Jj9TVY+Zzaj3HAvsv01V9S/T4/DCqnrsnHn23zxVdZeq+kBVXVpVl1TVs6fTHYcDFtl/jsNBVbW+qj5aVZ+c7sPfnU53DO6l9rnXOE5vefjZJD+WyUcDnZ/k2O7+1EwHtoerqi8k2djdV86Z9odJvt7dJ08D/Fbd/bxZjXFPU1UPTXJ1ktd3932m03a6z6a33Twtkzsr3THJ3yf5/u6+bkbDn7kF9t+mJFd390vnLWv/7URV3SHJHbr741V1iyQXJPmpJE+N43BJi+y/J8RxOKSqKsmB3X11Ve2f5B+SPDvJz8QxuFfaF884uuXh7nNMklOnP5+ayT+oTHX3h5N8fd7khfbZMUn+qru/3d2XJfmnTI7VfdYC+28h9t9OdPdXuvvj05+/meTSJHeK43DIIvtvIfbfPD1x9fTh/tOvjmNwr7UvhuOdknxpzuPLs/g/BEx0krOq6oLp7SCT5Pbd/ZVk8g9sktvNbHR7j4X2meNy3DOrauv0UvaOy1v23xKq6tAkP5TkvDgOl23e/ksch8Oqar+qujDJFUnO7m7H4F5sXwzHoVsecgMP7u4fTvLjSX51ehmR3cdxOeaVSb43yeFJvpLkj6bT7b9FVNVBSd6R5DndvW2xRXcybZ/fjzvZf47DZeju67r78EzuFHf/qrrPIovbh3u4fTEc3fJwF3T3l6ffr0jyrkwuHXx1+hqgHa8FumJ2I9xrLLTPHJcDuvur0/+Erk+yJd+9hGX/LWD6urJ3JHlTd79zOtlxOGhn+89xuGu6+xtJPpjkyDgG91r7Yji65eEyVdWB0xeGp6oOTPLoJBdnst+Omy52XJK/mc0I9yoL7bN3J3lSVd2squ6a5O5JPjqD8e3RdvxHM/XTmRyHif23U9M3Jrw2yaXd/cdzZjkOByy0/xyH46rqtlV1y+nPN0/yqCSfjmNwr7XmtxyctT3olod7k9snedfk39CsS/Lm7j6jqs5P8taqenqSLyZ5/AzHuMepqtOSPCzJ91TV5UlelOTk7GSfdfclVfXWJJ9Ksj3Jr+7r7yJcYP89rKoOz+TS1ReS/FJi/y3iwUmekuSi6WvMkuQFcRyOWmj/Hes4HHaHJKdOP9HkJkne2t3vrapz4hjcK+1zH8cDAMCu2RcvVQMAsAuEIwAAQ4QjAABDhCMAAEOEIwAAQ4QjAABDhCMAAEOEIzAzVXVoVV1aVVuq6pKqOquqbl5Vh1fVuVW1tareVVW3mvVYARCOwOzdPckruvveSb6R5GeTvD7J87r7sCQXZXLXGABmTDgCs3ZZd184/fmCJN+b5Jbd/aHptFOTPHQWAwPgvxKOwKx9e87P1yW55YzGAcAShCOwp7kqyb9V1UOmj5+S5EOLLA/AGlk36wEA7MRxSV5VVQck+XySX5zxeABIUt096zEAALAXcKkaAIAhwhEAgCHCEQCAIcIRAIAhwhEAgCHCEQCAIcIRAIAh/x+oDhzGTBBYGAAAAABJRU5ErkJggg==\n", + "text/plain": [ + "
" + ] + }, + "metadata": { + "needs_background": "light" + }, + "output_type": "display_data" + } + ], + "source": [ + "sms_number = pd.wide_to_long(\n", + " df_sms_features.reset_index(), \n", + " i=\"participant_id\", \n", + " j=\"message_type\", \n", + " stubnames=\"no\", \n", + " sep=\"_\", \n", + " suffix=\"\\D+\"\n", + ")\n", + "sns.displot(sms_number, x=\"no\", hue=\"message_type\", binwidth=5, element=\"step\", height=8)" + ] } ], "metadata": { + "jupytext": { + "formats": "ipynb,py:percent" + }, "kernelspec": { "display_name": "straw2analysis", "language": "python", @@ -805,7 +946,7 @@ "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", - "version": "3.9.2" + "version": "3.9.4" } }, "nbformat": 4, diff --git a/exploration/communication.py b/exploration/communication.py new file mode 100644 index 0000000..d077ed1 --- /dev/null +++ b/exploration/communication.py @@ -0,0 +1,126 @@ +# --- +# jupyter: +# jupytext: +# formats: ipynb,py:percent +# text_representation: +# extension: .py +# format_name: percent +# format_version: '1.3' +# jupytext_version: 1.11.2 +# kernelspec: +# display_name: straw2analysis +# language: python +# name: straw2analysis +# --- + +# %% +import os +import sys + +import matplotlib.pyplot as plt + +# %% +import seaborn as sns + +nb_dir = os.path.split(os.getcwd())[0] +if nb_dir not in sys.path: + sys.path.append(nb_dir) + +# %% +from features.communication import * + +# %% [markdown] +# # Example of communication data and feature calculation + +# %% +df_calls = get_call_data(["nokia_0000003"]) +print(df_calls) + +# %% +count_comms(df_calls) + +# %% +df_sms = get_sms_data(["nokia_0000003"]) +count_comms(df_sms) + +# %% [markdown] +# # Call data + +# %% +import participants.query_db + +# %% +participants_inactive_usernames = participants.query_db.get_usernames() +df_calls_inactive = get_call_data(participants_inactive_usernames) + +# %% +df_calls_features = count_comms(df_calls_inactive) +df_calls_features.head() + +# %% +df_calls_features.describe() + +# %% +calls_number = pd.wide_to_long( + df_calls_features.reset_index(), + i="participant_id", + j="call_type", + stubnames="no", + sep="_", + suffix="\D+", +) + +# %% +sns.displot(calls_number, x="no", hue="call_type", binwidth=5, element="step", height=8) + +# %% +calls_duration = pd.wide_to_long( + df_calls_features.reset_index(), + i="participant_id", + j="call_type", + stubnames="duration", + sep="_", + suffix="\D+", +) +sns.displot( + calls_duration, + x="duration", + hue="call_type", + multiple="dodge", + height=8, + log_scale=(True, False), +) + +# %% [markdown] +# ## Most frequent contacts by participant + +# %% +df_calls_inactive = enumerate_contacts(df_calls_inactive) +df_calls_inactive.tail() + +# %% +df_calls_frequent = df_calls_inactive.query("contact_id < 5") + +# %% +sns.boxplot(x="contact_id", y="freq", data=df_calls_frequent) + +# %% [markdown] +# # SMS data + +# %% +df_sms_inactive = get_sms_data(participants_inactive_usernames) +df_sms_features = count_comms(df_sms_inactive) +df_sms_features.describe() + +# %% +sms_number = pd.wide_to_long( + df_sms_features.reset_index(), + i="participant_id", + j="message_type", + stubnames="no", + sep="_", + suffix="\D+", +) +sns.displot( + sms_number, x="no", hue="message_type", binwidth=5, element="step", height=8 +) diff --git a/exploration/screen.ipynb b/exploration/screen.ipynb index c903116..5ecb8b1 100644 --- a/exploration/screen.ipynb +++ b/exploration/screen.ipynb @@ -231,6 +231,9 @@ } ], "metadata": { + "jupytext": { + "formats": "ipynb,auto:percent" + }, "kernelspec": { "display_name": "straw2analysis", "language": "python", diff --git a/exploration/screen.py b/exploration/screen.py new file mode 100644 index 0000000..eb6937c --- /dev/null +++ b/exploration/screen.py @@ -0,0 +1,61 @@ +# --- +# jupyter: +# jupytext: +# formats: ipynb,py:percent +# text_representation: +# extension: .py +# format_name: percent +# format_version: '1.3' +# jupytext_version: 1.11.2 +# kernelspec: +# display_name: straw2analysis +# language: python +# name: straw2analysis +# --- + +# %% +import os +import sys + +from tabulate import tabulate + +nb_dir = os.path.split(os.getcwd())[0] +if nb_dir not in sys.path: + sys.path.append(nb_dir) + +import participants.query_db + +# %% +from features.screen import * + +# %% +df_screen_nokia = get_screen_data(["nokia_0000003"]) + +# %% +print(df_screen_nokia) + +# %% +participants_inactive_usernames = participants.query_db.get_usernames() +df_screen_inactive = get_screen_data(participants_inactive_usernames) + +# %% +df_screen_inactive["screen_status"] = ( + df_screen_inactive["screen_status"] + .astype("category") + .cat.rename_categories(screen_status) +) +screen_freq = df_screen_inactive.value_counts("screen_status") +tabulate(screen_freq.to_frame(), tablefmt="html") + +# %% +screen_status + +# %% [markdown] +# A typical sequence might be: off -> locked -> on -> unlocked (0 -> 2 -> 1 -> 3) + +# %% +status_diff = df_screen_nokia.sort_values("timestamp")["screen_status"].diff() +status_diff.value_counts().to_frame() + +# %% [markdown] +# But I have also seen off -> on -> unlocked (with 2 - locked missing) and off -> locked -> on -> off -> locked (*again*). diff --git a/features/screen.py b/features/screen.py index facdb0c..424c025 100644 --- a/features/screen.py +++ b/features/screen.py @@ -2,7 +2,7 @@ from typing import List import pandas as pd -from config.models import Screen, Participant +from config.models import Participant, Screen from setup import db_engine, session screen_status = {0: "off", 1: "on", 2: "locked", 3: "unlocked"}