Compare commits

...

14 Commits

Author SHA1 Message Date
junos 777e6f0a58 calls_sms_features() now returns all communication features. 2021-08-18 15:41:47 +02:00
junos 2d78aacd18 Compile a list of contact features and add a test. 2021-08-18 15:35:42 +02:00
junos c88336481e Add a test for SMS features. 2021-08-18 15:28:46 +02:00
junos 1bc996413e Clarify names for no_all calls/sms feature.
Add another test.
2021-08-18 15:23:30 +02:00
junos a2a44c202a Calculate common features outside if...else. 2021-08-18 10:54:54 +02:00
junos 4740e94d37 Fix a bug introduced in e7fe4e8398. 2021-08-18 10:51:48 +02:00
junos b1ad8d1309 List calls features. 2021-08-17 16:27:34 +02:00
junos bb75abcb9b Add tests for proximity. 2021-08-17 16:07:52 +02:00
junos e7fe4e8398 Simplify merge into join. 2021-08-17 13:53:19 +02:00
Junos Lukan cf28aa547a Merge branch 'communication' into 'master'
separated features

See merge request junoslukan/straw2analysis!2
2021-08-17 11:42:03 +00:00
junos d6f36ec8f8 [WIP] Finish the class by assigning columns and validating model. 2021-08-13 17:41:04 +02:00
junos b06ec6e1ae [WIP] Methods to get the labels and data plus aggregate them. 2021-08-12 19:07:14 +02:00
junos 622477f19f [WIP] Start merging steps into a class for a pipeline. 2021-08-12 17:38:08 +02:00
Ivan Kobe 74b4f9ddbe separated features 2021-08-10 12:59:47 +02:00
9 changed files with 365 additions and 106 deletions

View File

@@ -0,0 +1,63 @@
id,timestamp,device_id,_id,double_proximity,accuracy,label,dateTime
39017,1565802024310,f67354f7-d675-4b76-80c8-123cc4744a5b,2962,0,3,,2019-08-14T17:00:24Z
39018,1565802051075,f67354f7-d675-4b76-80c8-123cc4744a5b,2963,0,3,,2019-08-14T17:00:51Z
39019,1565802051354,f67354f7-d675-4b76-80c8-123cc4744a5b,2964,8,3,,2019-08-14T17:00:51Z
39089,1565010418305,fdb06d4a-ee6e-4336-9a96-fc8d2715f243,51,5,3,,2019-08-05T13:06:58Z
39090,1565010772188,fdb06d4a-ee6e-4336-9a96-fc8d2715f243,52,5,3,,2019-08-05T13:12:52Z
39091,1565012334450,fdb06d4a-ee6e-4336-9a96-fc8d2715f243,53,5,3,,2019-08-05T13:38:54Z
39092,1565013000660,fdb06d4a-ee6e-4336-9a96-fc8d2715f243,54,5,3,,2019-08-05T13:50:00Z
39093,1565022742894,fdb06d4a-ee6e-4336-9a96-fc8d2715f243,55,0,3,,2019-08-05T16:32:22Z
39094,1565089295906,fdb06d4a-ee6e-4336-9a96-fc8d2715f243,56,5,3,,2019-08-06T11:01:35Z
39095,1565096030817,fdb06d4a-ee6e-4336-9a96-fc8d2715f243,57,0,3,,2019-08-06T12:53:50Z
39096,1565096367694,fdb06d4a-ee6e-4336-9a96-fc8d2715f243,58,5,3,,2019-08-06T12:59:27Z
39097,1565096408570,fdb06d4a-ee6e-4336-9a96-fc8d2715f243,59,5,3,,2019-08-06T13:00:08Z
39098,1565116821528,fdb06d4a-ee6e-4336-9a96-fc8d2715f243,60,5,3,,2019-08-06T18:40:21Z
39099,1565131345333,fdb06d4a-ee6e-4336-9a96-fc8d2715f243,61,0,3,,2019-08-06T22:42:25Z
39100,1565131375072,fdb06d4a-ee6e-4336-9a96-fc8d2715f243,62,5,3,,2019-08-06T22:42:55Z
39101,1565131386353,fdb06d4a-ee6e-4336-9a96-fc8d2715f243,63,0,3,,2019-08-06T22:43:06Z
39102,1565131389213,fdb06d4a-ee6e-4336-9a96-fc8d2715f243,64,5,3,,2019-08-06T22:43:09Z
39103,1565131448891,fdb06d4a-ee6e-4336-9a96-fc8d2715f243,65,0,3,,2019-08-06T22:44:08Z
39104,1565131454131,fdb06d4a-ee6e-4336-9a96-fc8d2715f243,66,5,3,,2019-08-06T22:44:14Z
39105,1565176143083,fdb06d4a-ee6e-4336-9a96-fc8d2715f243,67,0,3,,2019-08-07T11:09:03Z
39106,1565179569310,fdb06d4a-ee6e-4336-9a96-fc8d2715f243,68,5,3,,2019-08-07T12:06:09Z
39107,1565180699173,fdb06d4a-ee6e-4336-9a96-fc8d2715f243,69,5,3,,2019-08-07T12:24:59Z
39108,1565182538578,fdb06d4a-ee6e-4336-9a96-fc8d2715f243,70,5,3,,2019-08-07T12:55:38Z
39109,1565192592776,fdb06d4a-ee6e-4336-9a96-fc8d2715f243,71,0,3,,2019-08-07T15:43:12Z
39110,1565216023797,fdb06d4a-ee6e-4336-9a96-fc8d2715f243,72,5,3,,2019-08-07T22:13:43Z
39111,1565248358647,fdb06d4a-ee6e-4336-9a96-fc8d2715f243,73,0,3,,2019-08-08T07:12:38Z
39112,1565275859157,fdb06d4a-ee6e-4336-9a96-fc8d2715f243,74,5,3,,2019-08-08T14:50:59Z
39113,1565304201431,fdb06d4a-ee6e-4336-9a96-fc8d2715f243,75,0,3,,2019-08-08T22:43:21Z
39114,1565304229591,fdb06d4a-ee6e-4336-9a96-fc8d2715f243,76,5,3,,2019-08-08T22:43:49Z
39115,1565304262050,fdb06d4a-ee6e-4336-9a96-fc8d2715f243,77,0,3,,2019-08-08T22:44:22Z
39116,1565613142970,fdb06d4a-ee6e-4336-9a96-fc8d2715f243,78,5,3,,2019-08-12T12:32:22Z
39117,1565618266531,fdb06d4a-ee6e-4336-9a96-fc8d2715f243,79,5,3,,2019-08-12T13:57:46Z
39118,1565618410488,fdb06d4a-ee6e-4336-9a96-fc8d2715f243,80,5,3,,2019-08-12T14:00:10Z
39119,1565618704942,fdb06d4a-ee6e-4336-9a96-fc8d2715f243,81,5,3,,2019-08-12T14:05:04Z
39120,1565619005315,fdb06d4a-ee6e-4336-9a96-fc8d2715f243,82,5,3,,2019-08-12T14:10:05Z
39121,1565619405904,fdb06d4a-ee6e-4336-9a96-fc8d2715f243,83,5,3,,2019-08-12T14:16:45Z
39122,1565619678037,fdb06d4a-ee6e-4336-9a96-fc8d2715f243,84,5,3,,2019-08-12T14:21:18Z
39123,1565621206713,fdb06d4a-ee6e-4336-9a96-fc8d2715f243,85,5,3,,2019-08-12T14:46:46Z
39124,1565626622125,fdb06d4a-ee6e-4336-9a96-fc8d2715f243,86,5,3,,2019-08-12T16:17:02Z
39125,1565684876738,fdb06d4a-ee6e-4336-9a96-fc8d2715f243,87,5,3,,2019-08-13T08:27:56Z
39126,1565684956618,fdb06d4a-ee6e-4336-9a96-fc8d2715f243,88,5,3,,2019-08-13T08:29:16Z
39127,1565684965647,fdb06d4a-ee6e-4336-9a96-fc8d2715f243,89,5,3,,2019-08-13T08:29:25Z
39128,1565685092246,fdb06d4a-ee6e-4336-9a96-fc8d2715f243,90,5,3,,2019-08-13T08:31:32Z
39129,1565685136337,fdb06d4a-ee6e-4336-9a96-fc8d2715f243,91,5,3,,2019-08-13T08:32:16Z
39130,1565685147453,fdb06d4a-ee6e-4336-9a96-fc8d2715f243,92,5,3,,2019-08-13T08:32:27Z
39131,1565685212523,fdb06d4a-ee6e-4336-9a96-fc8d2715f243,93,5,3,,2019-08-13T08:33:32Z
39132,1565703397796,fdb06d4a-ee6e-4336-9a96-fc8d2715f243,94,0,3,,2019-08-13T13:36:37Z
39133,1565776203019,fdb06d4a-ee6e-4336-9a96-fc8d2715f243,95,5,3,,2019-08-14T09:50:03Z
39134,1565776434168,fdb06d4a-ee6e-4336-9a96-fc8d2715f243,96,5,3,,2019-08-14T09:53:54Z
39135,1565776435231,fdb06d4a-ee6e-4336-9a96-fc8d2715f243,97,0,3,,2019-08-14T09:53:55Z
39136,1565776443368,fdb06d4a-ee6e-4336-9a96-fc8d2715f243,98,5,3,,2019-08-14T09:54:03Z
39137,1565779277109,fdb06d4a-ee6e-4336-9a96-fc8d2715f243,99,0,3,,2019-08-14T10:41:17Z
39138,1565780016327,fdb06d4a-ee6e-4336-9a96-fc8d2715f243,100,5,3,,2019-08-14T10:53:36Z
39139,1565780027437,fdb06d4a-ee6e-4336-9a96-fc8d2715f243,101,5,3,,2019-08-14T10:53:47Z
39140,1565783470934,fdb06d4a-ee6e-4336-9a96-fc8d2715f243,102,5,3,,2019-08-14T11:51:10Z
39141,1565783801540,fdb06d4a-ee6e-4336-9a96-fc8d2715f243,103,0,3,,2019-08-14T11:56:41Z
39142,1565783802120,fdb06d4a-ee6e-4336-9a96-fc8d2715f243,104,5,3,,2019-08-14T11:56:42Z
39143,1565783861495,fdb06d4a-ee6e-4336-9a96-fc8d2715f243,105,5,3,,2019-08-14T11:57:41Z
39144,1565785318762,fdb06d4a-ee6e-4336-9a96-fc8d2715f243,106,0,3,,2019-08-14T12:21:58Z
39145,1565785319346,fdb06d4a-ee6e-4336-9a96-fc8d2715f243,107,5,3,,2019-08-14T12:21:59Z
39146,1565960121019,fdb06d4a-ee6e-4336-9a96-fc8d2715f243,108,5,3,,2019-08-16T12:55:21Z
39147,1565960226792,fdb06d4a-ee6e-4336-9a96-fc8d2715f243,109,5,3,,2019-08-16T12:57:06Z
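The file added above is the example proximity data that the new test below reads as ../data/example_proximity.csv. Its timestamp column is Unix epoch time in milliseconds and dateTime is the same instant in UTC, which a quick round trip with pandas confirms (the path is an assumption based on the test's relative path):

import pandas as pd

df = pd.read_csv("data/example_proximity.csv")
# "timestamp" is epoch milliseconds; parsing it should reproduce "dateTime".
parsed = pd.to_datetime(df["timestamp"], unit="ms", utc=True)
print(parsed.dt.strftime("%Y-%m-%dT%H:%M:%SZ").equals(df["dateTime"]))  # expected: True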

View File

@@ -148,3 +148,28 @@ lin_reg_proximity.score(
df_full_data_daily_means[["freq_prox_near", "prop_prox_near"]],
df_full_data_daily_means["PA"],
)
# %% [markdown]
# # Merging these into a pipeline
# %%
from machine_learning import pipeline
# %%
ml_pipeline = pipeline.MachineLearningPipeline(
labels_questionnaire="PANAS", labels_scale="PA", data_types="proximity"
)
# labels_scale is a required argument of MachineLearningPipeline.__init__ (see below);
# "PA" matches the outcome already used in the lin_reg_proximity example above.
# %%
ml_pipeline.get_labels()
# %% tags=[]
ml_pipeline.get_sensor_data()
# %%
ml_pipeline.aggregate_daily()
# %%
ml_pipeline.df_full_data_daily_means
# %%

View File

@@ -8,6 +8,43 @@ from setup import db_engine, session
call_types = {1: "incoming", 2: "outgoing", 3: "missed"}
sms_types = {1: "received", 2: "sent"}
FEATURES_CALLS = (
["no_calls_all"]
+ ["no_" + call_type for call_type in call_types.values()]
+ ["duration_total_" + call_types.get(1), "duration_total_" + call_types.get(2)]
+ ["duration_max_" + call_types.get(1), "duration_max_" + call_types.get(2)]
+ ["no_" + call_types.get(1) + "_ratio", "no_" + call_types.get(2) + "_ratio"]
+ ["no_contacts_calls"]
)
# FEATURES_CALLS =
# ["no_calls_all",
# "no_incoming", "no_outgoing", "no_missed",
# "duration_total_incoming", "duration_total_outgoing",
# "duration_max_incoming", "duration_max_outgoing",
# "no_incoming_ratio", "no_outgoing_ratio",
# "no_contacts"]
FEATURES_SMS = (
["no_sms_all"]
+ ["no_" + sms_type for sms_type in sms_types.values()]
+ ["no_" + sms_types.get(1) + "_ratio", "no_" + sms_types.get(2) + "_ratio"]
+ ["no_contacts_sms"]
)
# FEATURES_SMS =
# ["no_sms_all",
# "no_received", "no_sent",
# "no_received_ratio", "no_sent_ratio",
# "no_contacts"]
FEATURES_CONTACT = [
"proportion_calls_all",
"proportion_calls_incoming",
"proportion_calls_outgoing",
"proportion_calls_contacts",
"proportion_calls_missed_sms_received",
]
def get_call_data(usernames: Collection) -> pd.DataFrame:
"""
@@ -114,10 +151,12 @@ def count_comms(comm_df: pd.DataFrame) -> pd.DataFrame:
These are:
* the number of calls by type (incoming, outgoing, missed) and in total,
* the ratio of incoming and outgoing calls to the total number of calls,
* the total and maximum duration of calls by type, and
* the number of messages by type (received, sent).
* the total and maximum duration of calls by type,
* the number of messages by type (received, sent), and
* the number of communication contacts by type.
"""
if "call_type" in comm_df:
data_type = "calls"
comm_counts = (
comm_df.value_counts(subset=["participant_id", "call_type"])
.unstack()
@@ -125,11 +164,11 @@
.add_prefix("no_")
)
# Count calls by type.
comm_counts["no_all"] = comm_counts.sum(axis=1)
comm_counts["no_calls_all"] = comm_counts.sum(axis=1)
# Add a total count of calls.
comm_counts = comm_counts.assign(
no_incoming_ratio=lambda x: x.no_incoming / x.no_all,
no_outgoing_ratio=lambda x: x.no_outgoing / x.no_all,
no_incoming_ratio=lambda x: x.no_incoming / x.no_calls_all,
no_outgoing_ratio=lambda x: x.no_outgoing / x.no_calls_all,
)
# Ratio of incoming and outgoing calls to all calls.
comm_duration_total = (
@@ -159,44 +198,56 @@ def count_comms(comm_df: pd.DataFrame) -> pd.DataFrame:
# If there were no missed calls, this exception is raised.
# But we are dropping the column anyway, so no need to deal with the exception.
elif "message_type" in comm_df:
data_type = "sms"
comm_counts = (
comm_df.value_counts(subset=["participant_id", "message_type"])
.unstack()
.rename(columns=sms_types)
.add_prefix("no_")
)
comm_counts["no_all"] = comm_counts.sum(axis=1)
comm_counts["no_sms_all"] = comm_counts.sum(axis=1)
# Add a total count of messages.
comm_features = comm_counts.assign(
no_received_ratio=lambda x: x.no_received / x.no_all,
no_sent_ratio=lambda x: x.no_sent / x.no_all,
no_received_ratio=lambda x: x.no_received / x.no_sms_all,
no_sent_ratio=lambda x: x.no_sent / x.no_sms_all,
)
# Ratio of incoming and outgoing messages to all messages.
else:
raise KeyError("The dataframe contains neither call_type nor message_type")
comm_contacts_counts = (
enumerate_contacts(comm_df)
.groupby(["participant_id"])
.nunique()["contact_id"]
.rename("no_contacts_" + data_type)
)
# Number of communication contacts
comm_features = comm_features.join(comm_contacts_counts)
return comm_features
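# A minimal sketch, on invented toy data, of the value_counts -> unstack ->
# add_prefix pattern that count_comms uses above:
import pandas as pd

toy = pd.DataFrame({"participant_id": [1, 1, 1, 2], "call_type": [1, 1, 2, 3]})
toy_counts = (
    toy.value_counts(subset=["participant_id", "call_type"])
    .unstack()  # one column per call type, indexed by participant_id
    .rename(columns={1: "incoming", 2: "outgoing", 3: "missed"})
    .add_prefix("no_")
)
toy_counts["no_calls_all"] = toy_counts.sum(axis=1)
# Participant 1 gets no_incoming=2.0, no_outgoing=1.0, no_calls_all=3.0.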
def contact_features(df_enumerated: pd.DataFrame) -> pd.DataFrame:
def contact_features(comm_df: pd.DataFrame) -> pd.DataFrame:
"""
Counts the number of people contacted (for each participant) and, if
df_enumerated is a dataframe containing calls data, the total duration
of calls between a participant and each of her contacts.
For each participant and for each of his contacts, this function
counts the number of communications (by type) between them. If the
argument passed is a dataframe with calls data, it additionally counts
the total duration of calls between every pair (participant, contact).
Parameters
----------
df_enumerated: pd.DataFrame
A dataframe of calls or SMSes; return of function enumerate_contacts.
comm_df: pd.DataFrame
A dataframe of calls or SMSes.
Returns
-------
comm_df: pd.DataFrame
The altered dataframe with the column no_contacts and, if df_enumerated
contains calls data, an additional column total_call_duration.
A new dataframe with a row for each pair (participant, contact).
"""
df_enumerated = enumerate_contacts(comm_df)
contacts_count = (
df_enumerated.groupby(["participant_id", "contact_id"]).size().reset_index()
)
# Check whether df contains calls or SMS data since some
# features we want to calculate are type-specyfic
# features we want to calculate are type-specific
if "call_duration" in df_enumerated:
# Add a column with the total duration of calls between two people
duration_count = (
@@ -207,33 +258,14 @@ def contact_features(df_enumerated: pd.DataFrame) -> pd.DataFrame:
.reset_index() # Make index (which is actually the participant id) a normal column
.rename(columns={"call_duration": "total_call_duration"})
)
# The new dataframe now contains columns containing information about
# participants, callers and the total duration of their calls. All that
# is now left to do is to merge the original df with the new one.
df_enumerated = df_enumerated.merge(
contacts_count = contacts_count.merge(
duration_count, on=["participant_id", "contact_id"]
)
contact_count = (
df_enumerated.groupby(["participant_id"])
.nunique()[
"contact_id"
] # For each participant, count the number of distinct contacts
.reset_index() # Make index (which is actually the participant id) a normal column
.rename(columns={"contact_id": "no_contacts"})
)
df_enumerated = (
# Merge df with the newly created df containing info about number of contacts
df_enumerated.merge(contact_count, on="participant_id")
# Sort first by participant_id and then by contact_id and
# thereby restore the initial ordering of input dataframes.
.sort_values(["participant_id", "contact_id"])
)
contacts_count.rename(columns={0: "no_calls"}, inplace=True)
else:
contacts_count.rename(columns={0: "no_sms"}, inplace=True)
# TODO: Determine work vs non-work contacts by work hours heuristics
return df_enumerated
return contacts_count
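# A minimal sketch, on invented toy data, of the per-(participant, contact)
# counting that the rewritten contact_features performs above:
import pandas as pd

toy = pd.DataFrame(
    {
        "participant_id": [1, 1, 1],
        "contact_id": [0, 0, 1],
        "call_duration": [60, 30, 10],
    }
)
toy_pairs = (
    toy.groupby(["participant_id", "contact_id"])
    .size()
    .reset_index()
    .rename(columns={0: "no_calls"})
)
toy_durations = (
    toy.groupby(["participant_id", "contact_id"])
    .sum()["call_duration"]
    .reset_index()
    .rename(columns={"call_duration": "total_call_duration"})
)
toy_pairs = toy_pairs.merge(toy_durations, on=["participant_id", "contact_id"])
# Pair (1, 0): no_calls=2, total_call_duration=90; pair (1, 1): 1 and 10.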
def calls_sms_features(df_calls: pd.DataFrame, df_sms: pd.DataFrame) -> pd.DataFrame:
@@ -245,14 +277,14 @@ def calls_sms_features(df_calls: pd.DataFrame, df_sms: pd.DataFrame) -> pd.DataFrame:
df_calls: pd.DataFrame
A dataframe of calls (return of get_call_data).
df_sms: pd.DataFrame
A dataframe of calls (return of get_sms_data).
A dataframe of SMSes (return of get_sms_data).
Returns
-------
df_calls_sms: pd.DataFrame
The list of features relating calls and sms data for every participant.
These are:
* proportion_calls:
* proportion_calls_all:
proportion of calls in total number of communications
* proportion_calls_incoming:
proportion of incoming calls in total number of incoming/received communications
@@ -263,62 +295,24 @@ def calls_sms_features(df_calls: pd.DataFrame, df_sms: pd.DataFrame) -> pd.DataFrame:
* proportion_calls_contacts:
proportion of calls contacts in total number of communication contacts
"""
count_calls = count_comms(df_calls)
count_sms = count_comms(df_sms)
count_joined = (
count_calls.merge(
count_sms, on="participant_id", suffixes=("_calls", "_sms")
) # Merge calls and sms features
.reset_index() # Make participant_id a regular column
.assign(
proportion_calls=(
lambda x: x.no_all_calls / (x.no_all_calls + x.no_all_sms)
),
proportion_calls_incoming=(
lambda x: x.no_incoming / (x.no_incoming + x.no_received)
),
proportion_calls_missed_sms_received=(
lambda x: x.no_missed / (x.no_missed + x.no_received)
),
proportion_calls_outgoing=(
lambda x: x.no_outgoing / (x.no_outgoing + x.no_sent)
)
# Calculate new features and create additional columns
)[
[
"participant_id",
"proportion_calls",
"proportion_calls_incoming",
"proportion_calls_outgoing",
"proportion_calls_missed_sms_received",
]
] # Filter out only the relevant features
count_joined = count_calls.join(count_sms).assign(
proportion_calls_all=(
lambda x: x.no_calls_all / (x.no_calls_all + x.no_sms_all)
),
proportion_calls_incoming=(
lambda x: x.no_incoming / (x.no_incoming + x.no_received)
),
proportion_calls_missed_sms_received=(
lambda x: x.no_missed / (x.no_missed + x.no_received)
),
proportion_calls_outgoing=(
lambda x: x.no_outgoing / (x.no_outgoing + x.no_sent)
),
proportion_calls_contacts=(
lambda x: x.no_contacts_calls / (x.no_contacts_calls + x.no_contacts_sms)
)
# Calculate new features and create additional columns
)
features_calls = contact_features(enumerate_contacts(df_calls))
features_sms = contact_features(enumerate_contacts(df_sms))
features_joined = (
features_calls.merge(
features_sms, on="participant_id", suffixes=("_calls", "_sms")
) # Merge calls and sms features
.reset_index() # Make participant_id a regular column
.assign(
proportion_calls_contacts=(
lambda x: x.no_contacts_calls
/ (x.no_contacts_calls + x.no_contacts_sms)
) # Calculate new features and create additional columns
)[
["participant_id", "proportion_calls_contacts"]
] # Filter out only the relevant features
# Since we are interested only in some features and ignored
# others, a lot of duplicate rows were created. Remove them.
.drop_duplicates()
)
# Join the newly created dataframes
df_calls_sms = count_joined.merge(features_joined, on="participant_id")
return df_calls_sms
return count_joined
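Note the structural change in calls_sms_features: count_comms now returns distinctly named columns (no_calls_all vs. no_sms_all, no_contacts_calls vs. no_contacts_sms) indexed by participant_id, so an index-aligned join replaces the earlier merge with suffixes. A minimal sketch of the difference, on invented one-row frames:

import pandas as pd

idx = pd.Index([1], name="participant_id")
calls = pd.DataFrame({"no_calls_all": [3]}, index=idx)
sms = pd.DataFrame({"no_sms_all": [5]}, index=idx)

joined = calls.join(sms)  # aligns rows on the shared participant_id index
joined["proportion_calls_all"] = joined.no_calls_all / (
    joined.no_calls_all + joined.no_sms_all
)
# proportion_calls_all == 3 / 8 == 0.375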

View File

@@ -5,6 +5,8 @@ import pandas as pd
from config.models import Participant, Proximity
from setup import db_engine, session
FEATURES_PROXIMITY = ["freq_prox_near", "prop_prox_near"]
def get_proximity_data(usernames: Collection) -> pd.DataFrame:
"""
@@ -56,7 +58,7 @@ def recode_proximity(df_proximity: pd.DataFrame) -> pd.DataFrame:
def count_proximity(
df_proximity: pd.DataFrame, group_by: Collection = ["participant_id"]
df_proximity: pd.DataFrame, group_by: Collection = None
) -> pd.DataFrame:
"""
The function counts how many times a "near" value occurs in proximity
@@ -75,6 +77,8 @@ def count_proximity(
df_proximity_features: pd.DataFrame
A dataframe with the count of "near" proximity values and their relative count.
"""
if group_by is None:
group_by = ["participant_id"]
if "bool_prox_near" not in df_proximity:
df_proximity = recode_proximity(df_proximity)
df_proximity["bool_prox_far"] = ~df_proximity["bool_prox_near"]

View File

@@ -38,8 +38,10 @@ def identify_screen_sequence(df_screen: pd.DataFrame) -> pd.DataFrame:
# - OFF -> ON -> unlocked (a true phone unlock)
# - OFF -> ON -> OFF/locked (no unlocking, i.e. a screen status check)
# Consider that screen data is sometimes unreliable as shown in expl_screen.ipynb:
# "I have also seen off -> on -> unlocked (with 2 - locked missing)
# and off -> locked -> on -> off -> locked (*again*)."
# "I have also seen
# off -> on -> unlocked (with 2 - locked missing)
# and
# off -> locked -> on -> off -> locked (*again*)."
# Either clean the data beforehand or deal with these inconsistencies in this function.
pass

View File

@@ -0,0 +1,7 @@
QUESTIONNAIRE_IDS = {"PANAS": {"PA": 8.0, "NA": 9.0}}
QUESTIONNAIRE_IDS_RENAME = {}
for questionnaire in QUESTIONNAIRE_IDS.items():
for k, v in questionnaire[1].items():
QUESTIONNAIRE_IDS_RENAME[v] = k
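The new module builds QUESTIONNAIRE_IDS_RENAME by inverting each questionnaire's scale-name-to-id mapping. An equivalent, arguably more idiomatic sketch as a dict comprehension:

QUESTIONNAIRE_IDS = {"PANAS": {"PA": 8.0, "NA": 9.0}}

QUESTIONNAIRE_IDS_RENAME = {
    questionnaire_id: scale_name
    for scales in QUESTIONNAIRE_IDS.values()
    for scale_name, questionnaire_id in scales.items()
}
# {8.0: 'PA', 9.0: 'NA'}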

View File

@@ -0,0 +1,125 @@
import datetime
import pandas as pd
from sklearn.model_selection import cross_val_score
import participants.query_db
from features import esm, helper, proximity
from machine_learning import QUESTIONNAIRE_IDS, QUESTIONNAIRE_IDS_RENAME
class MachineLearningPipeline:
def __init__(
self,
labels_questionnaire,
labels_scale,
data_types,
participants_usernames=None,
feature_names=None,
grouping_variable=None,
):
if participants_usernames is None:
participants_usernames = participants.query_db.get_usernames(
collection_start=datetime.date.fromisoformat("2020-08-01")
)
self.participants_usernames = participants_usernames
self.labels_questionnaire = labels_questionnaire
self.data_types = data_types
if feature_names is None:
feature_names = []
self.feature_names = feature_names
self.df_features = pd.DataFrame()
self.labels_scale = labels_scale
self.df_labels = pd.DataFrame()
self.grouping_variable = grouping_variable
self.df_groups = pd.DataFrame()
self.model = None
self.validation_method = None
self.df_esm = pd.DataFrame()
self.df_esm_preprocessed = pd.DataFrame()
self.df_esm_interest = pd.DataFrame()
self.df_esm_clean = pd.DataFrame()
self.df_proximity = pd.DataFrame()
self.df_full_data_daily_means = pd.DataFrame()
self.df_esm_daily_means = pd.DataFrame()
self.df_proximity_daily_counts = pd.DataFrame()
def get_labels(self):
self.df_esm = esm.get_esm_data(self.participants_usernames)
self.df_esm_preprocessed = esm.preprocess_esm(self.df_esm)
if self.labels_questionnaire == "PANAS":
self.df_esm_interest = self.df_esm_preprocessed[
(
self.df_esm_preprocessed["questionnaire_id"]
== QUESTIONNAIRE_IDS.get("PANAS").get("PA")
)
| (
self.df_esm_preprocessed["questionnaire_id"]
== QUESTIONNAIRE_IDS.get("PANAS").get("NA")
)
]
self.df_esm_clean = esm.clean_up_esm(self.df_esm_interest)
def get_sensor_data(self):
if "proximity" in self.data_types:
self.df_proximity = proximity.get_proximity_data(
self.participants_usernames
)
self.df_proximity = helper.get_date_from_timestamp(self.df_proximity)
self.df_proximity = proximity.recode_proximity(self.df_proximity)
def aggregate_daily(self):
self.df_esm_daily_means = (
self.df_esm_clean.groupby(["participant_id", "date_lj", "questionnaire_id"])
.esm_user_answer_numeric.agg("mean")
.reset_index()
.rename(columns={"esm_user_answer_numeric": "esm_numeric_mean"})
)
self.df_esm_daily_means = (
self.df_esm_daily_means.pivot(
index=["participant_id", "date_lj"],
columns="questionnaire_id",
values="esm_numeric_mean",
)
.reset_index(col_level=1)
.rename(columns=QUESTIONNAIRE_IDS_RENAME)
.set_index(["participant_id", "date_lj"])
)
self.df_full_data_daily_means = self.df_esm_daily_means.copy()
if "proximity" in self.data_types:
self.df_proximity_daily_counts = proximity.count_proximity(
self.df_proximity, ["participant_id", "date_lj"]
)
self.df_full_data_daily_means = self.df_full_data_daily_means.join(
self.df_proximity_daily_counts
)
def assign_columns(self):
self.df_features = self.df_full_data_daily_means[self.feature_names]
self.df_labels = self.df_full_data_daily_means[self.labels_scale]
if self.grouping_variable:
self.df_groups = self.df_full_data_daily_means[self.grouping_variable]
else:
self.df_groups = None
def validate_model(self):
if self.model is None:
raise AttributeError(
"Please, specify a machine learning model first, by setting the .model attribute."
)
if self.validation_method is None:
raise AttributeError(
"Please, specify a cross validation method first, by setting the .validation_method attribute."
)
return cross_val_score(
estimator=self.model,
X=self.df_features,
y=self.df_labels,
groups=self.df_groups,
cv=self.validation_method,
n_jobs=-1,
)
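validate_model() expects the caller to set .model and .validation_method first. A usage sketch continuing the notebook example above; the estimator and the plain 5-fold CV are illustrative choices, not part of these commits:

from sklearn.linear_model import LinearRegression

ml_pipeline.feature_names = ["freq_prox_near", "prop_prox_near"]  # FEATURES_PROXIMITY
ml_pipeline.assign_columns()

ml_pipeline.model = LinearRegression()
ml_pipeline.validation_method = 5  # cross_val_score also accepts splitter objects
scores = ml_pipeline.validate_model()  # one score per fold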

View File

@@ -5,7 +5,7 @@ import pandas as pd
from numpy.random import default_rng
from pandas.testing import assert_series_equal
from features.communication import count_comms, enumerate_contacts, get_call_data
from features.communication import *
rng = default_rng()
@@ -76,10 +76,18 @@ class CallsFeatures(unittest.TestCase):
def test_count_comms_calls(self):
self.features = count_comms(self.calls)
print(self.features)
self.assertIsInstance(self.features, pd.DataFrame)
self.assertCountEqual(self.features.columns.to_list(), FEATURES_CALLS)
def test_count_comms_sms(self):
self.features = count_comms(self.sms)
print(self.features)
self.assertIsInstance(self.features, pd.DataFrame)
self.assertCountEqual(self.features.columns.to_list(), FEATURES_SMS)
def test_calls_sms_features(self):
self.features_call_sms = calls_sms_features(self.calls, self.sms)
self.assertIsInstance(self.features_call_sms, pd.DataFrame)
self.assertCountEqual(
self.features_call_sms.columns.to_list(),
FEATURES_CALLS + FEATURES_SMS + FEATURES_CONTACT,
)

View File

@@ -0,0 +1,31 @@
import unittest
from features.proximity import *
class ProximityFeatures(unittest.TestCase):
df_proximity = pd.DataFrame()
df_proximity_recoded = pd.DataFrame()
df_proximity_features = pd.DataFrame()
@classmethod
def setUpClass(cls) -> None:
cls.df_proximity = pd.read_csv("../data/example_proximity.csv")
cls.df_proximity["participant_id"] = 99
def test_recode_proximity(self):
self.df_proximity_recoded = recode_proximity(self.df_proximity)
self.assertIn("bool_prox_near", self.df_proximity_recoded)
# Is the recoded column present?
self.assertIn(True, self.df_proximity_recoded.bool_prox_near)
# Are there "near" values in the data?
self.assertIn(False, self.df_proximity_recoded.bool_prox_near)
# Are there "far" values in the data?
def test_count_proximity(self):
self.df_proximity_recoded = recode_proximity(self.df_proximity)
self.df_proximity_features = count_proximity(self.df_proximity_recoded)
print(self.df_proximity_features.columns)
self.assertCountEqual(
self.df_proximity_features.columns.to_list(), FEATURES_PROXIMITY
)
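The proximity test loads the example CSV through the relative path ../data/example_proximity.csv, so the suite needs to run with the tests directory as the working directory. A discovery sketch (the test file naming pattern is an assumption):

import unittest

# Run from the tests directory so "../data/example_proximity.csv" resolves.
suite = unittest.defaultTestLoader.discover(start_dir=".", pattern="test_*.py")
unittest.TextTestRunner(verbosity=2).run(suite)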