-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathICJ_descriptives_Feb2020.Rmd
1695 lines (1429 loc) · 66.9 KB
/
ICJ_descriptives_Feb2020.Rmd
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
556
557
558
559
560
561
562
563
564
565
566
567
568
569
570
571
572
573
574
575
576
577
578
579
580
581
582
583
584
585
586
587
588
589
590
591
592
593
594
595
596
597
598
599
600
601
602
603
604
605
606
607
608
609
610
611
612
613
614
615
616
617
618
619
620
621
622
623
624
625
626
627
628
629
630
631
632
633
634
635
636
637
638
639
640
641
642
643
644
645
646
647
648
649
650
651
652
653
654
655
656
657
658
659
660
661
662
663
664
665
666
667
668
669
670
671
672
673
674
675
676
677
678
679
680
681
682
683
684
685
686
687
688
689
690
691
692
693
694
695
696
697
698
699
700
701
702
703
704
705
706
707
708
709
710
711
712
713
714
715
716
717
718
719
720
721
722
723
724
725
726
727
728
729
730
731
732
733
734
735
736
737
738
739
740
741
742
743
744
745
746
747
748
749
750
751
752
753
754
755
756
757
758
759
760
761
762
763
764
765
766
767
768
769
770
771
772
773
774
775
776
777
778
779
780
781
782
783
784
785
786
787
788
789
790
791
792
793
794
795
796
797
798
799
800
801
802
803
804
805
806
807
808
809
810
811
812
813
814
815
816
817
818
819
820
821
822
823
824
825
826
827
828
829
830
831
832
833
834
835
836
837
838
839
840
841
842
843
844
845
846
847
848
849
850
851
852
853
854
855
856
857
858
859
860
861
862
863
864
865
866
867
868
869
870
871
872
873
874
875
876
877
878
879
880
881
882
883
884
885
886
887
888
889
890
891
892
893
894
895
896
897
898
899
900
901
902
903
904
905
906
907
908
909
910
911
912
913
914
915
916
917
918
919
920
921
922
923
924
925
926
927
928
929
930
931
932
933
934
935
936
937
938
939
940
941
942
943
944
945
946
947
948
949
950
951
952
953
954
955
956
957
958
959
960
961
962
963
964
965
966
967
968
969
970
971
972
973
974
975
976
977
978
979
980
981
982
983
984
985
986
987
988
989
990
991
992
993
994
995
996
997
998
999
1000
---
title: "ICJ Descriptives February 2020"
author: "Jan Savinc"
date: '`r format(Sys.Date(), "%B %d, %Y")`'
output:
html_document:
toc: true
toc_float: true
fig_caption: yes
code_folding: hide
editor_options:
chunk_output_type: console
---
```{r setup, include=FALSE}
# knitr::opts_chunk$set(echo = TRUE) # before report is finalised
knitr::opts_chunk$set(echo = FALSE, warning = FALSE, message = FALSE)
```
```{r, warning=FALSE}
library(tidyverse)
library(lubridate)
library(knitr)
```
# Loading data
## Importing files
```{r}
## The data sits in the folder "./data_routine_Jan2020"
raw_data <- list()
raw_data$cases <- readxl::read_excel("./data_routine_Jan2020/ENU ICJ Case Details 21.01.20.xls", trim_ws = TRUE, guess_max = 10e5)
raw_data$hna <- readxl::read_excel("./data_routine_Jan2020/ENU ICJ HNA and Plan 20.01.20.xlsx", trim_ws = TRUE, guess_max = 10e5)
raw_data$onwardref <- readxl::read_excel("./data_routine_Nov2019/ICJ HNA and Plan (BOICJ035RS) 01.11.19.xls", sheet=2, trim_ws = TRUE, guess_max = 10e5)
raw_data$inwardref <- readxl::read_excel("./data_routine_Jan2020/ENU ICJ Referrals 20.01.20.xls", trim_ws = TRUE, guess_max = 10e5)
raw_data$reviews <- readxl::read_excel("./data_routine_Jan2020/ENU ICJ Reviews 23.01.20.xls", trim_ws = TRUE, guess_max = 10e5)
raw_data$additional_visits <- readxl::read_excel("./data_routine_Jan2020/ENU ICJ Additional Visits 21.01.20.xls", trim_ws = TRUE, guess_max = 10e5)
raw_data$outcomes <- readxl::read_excel("./data_routine_Jan2020/ENU ICJ Outcomes 23.01.20.xls", trim_ws = TRUE, guess_max = 10e5)
## load reference names for columns
columnNameReference <- with(data = read.csv("./columnNameReference.csv", stringsAsFactors = FALSE),
setNames(object=standard, nm=cf_post2014)) # creates a named list
## we also load reference files for the names of concerns
concernsReference <- read.table("./concernsReference.txt", sep = ",", stringsAsFactors = FALSE)[,1]
```
## Helper functions
To avoid copying the entire script of helper functions I ever developed, here's just the subset used:
```{r, include=FALSE}
cbbPalette <- c("#000000", "#E69F00", "#56B4E9", "#009E73", "#F0E442", "#0072B2", "#D55E00", "#CC79A7")
MacPaletteLight <-c("#8cc63f", "#aecfc5", "#bfbdaf", "#c0928e", "#e89842")
MacPaletteMid <- c("#00a246", "#5dada4", "#a19f91", "#a7776b", "#d37f42")
MacPaletteDark <- c("#005c46", "#387a7b", "#7b7a6d", "#874346", "#b16438")
MacPaletteGreen <- c("#8cc63f","#00a246","#005c46")
MacPaletteAqua <- c("#aecfc5", "#5dada4", "#387a7b")
MacPaletteStone <- c("#bfbdaf", "#a19f91", "#7b7a6d")
MacPaletteRust <- c("#c0928e", "#a7776b", "#874346")
MacPaletteOrange <- c("#e89842", "#d37f42", "#b16438")
# theme_set(theme_cowplot(font_size=12)
## import 'times' font, if that's what you're into
# windowsFonts(times = windowsFont('Computer Modern Roman'))
#
theme_plot <- theme_bw() +
#theme(panel.grid.major=element_line(size=.25)) +
theme(panel.grid.major=element_blank()) +
theme(panel.grid.minor=element_blank()) +
theme(panel.background=element_rect(fill = "white", size=0.2)) +
theme(plot.title=element_text(size=12, vjust=1.25, family="CM Sans", face="bold")) +
theme(axis.line = element_line(size=0.2)) +
theme(axis.text.x=element_text(size=10, family="CM Sans")) +
theme(axis.text.y=element_text(size=10, family="CM Sans")) +
theme(axis.title.x=element_text(size=10, vjust=0, family="CM Sans")) +
theme(axis.title.y=element_text(size=9, vjust=1.25, family="CM Sans")) +
theme(strip.background = element_rect(fill = "white", size = 0.2)) +
theme(strip.placement = "outside")
theme(strip.text.x = element_text(size = 9, family="CM Sans")) +
theme(strip.text.y = element_text(size = 9, family="CM Sans")) +
theme(legend.text = element_text(size = 9, family="CM Sans")) +
theme(legend.title = element_text(size = 9, family="CM Sans")) +
theme(legend.position="bottom") +
theme(plot.caption = element_text(size=9, family="CM Sans")) +
theme(plot.background = element_blank()) +
theme(legend.background = element_rect(fill = 'transparent'))
#theme(aspect.ratio = 1)
## group cancer types
group_cancer_types <- function(cancer_type) {
cancer_type_grouped <- case_when(
str_detect(tolower(cancer_type), "bowel|anal|rectum") ~ "bowel",
str_detect(tolower(cancer_type), "breast") ~ "breast",
str_detect(tolower(cancer_type), "prostat") ~ "prostate",
str_detect(tolower(cancer_type), "lung") ~ "lung",
TRUE ~ "other"
)
return(cancer_type_grouped)
}
group_visit_locations <- function(visit_location) {
visit_location_grouped <-
case_when(
str_detect(visit_location,pattern=regex("library", ignore_case = TRUE)) ~ "Library",
str_detect(visit_location,pattern=regex("home", ignore_case = TRUE)) ~ "Home visit",
str_detect(visit_location,pattern=regex("outreach", ignore_case = TRUE)) ~ "Outreach",
str_detect(visit_location,pattern=regex("workplace", ignore_case = TRUE)) ~ "Workplace",
str_detect(visit_location,pattern=regex("phone", ignore_case = TRUE)) ~ "Telephone",
str_detect(visit_location,pattern=regex("hospital|infirmary|beatson|victoria", ignore_case = TRUE)) ~ "Hospital",
str_detect(visit_location,pattern=regex("hospice", ignore_case = TRUE)) ~ "Hospice",
str_detect(visit_location,pattern=regex("other|leisure", ignore_case = TRUE)) ~ "Other",
str_detect(visit_location,pattern=regex("F Pearce", ignore_case = TRUE)) ~ "Pearce Institute/Macmillan Café",
is.na(visit_location) ~ "Undefined or missing",
TRUE ~ "Other"
)
return(visit_location_grouped)
}
group_referral_sources <- function(referral_source) {
referral_source_grouped <-
case_when(
str_detect(referral_source,pattern=regex("^A\\s|hospice", ignore_case = TRUE)) ~ "Hospice",
str_detect(referral_source,pattern=regex("^O\\s|outreach", ignore_case = TRUE)) ~ "Outreach/Opt out pilot", # outreach before hospital, otherwise the hospital outreach locations get categorised as hospital!
str_detect(referral_source,pattern=regex("^B\\s|hospital|gri\\s|qeuh", ignore_case = TRUE)) ~ "Hospital",
str_detect(referral_source,pattern=regex("^C\\s|library", ignore_case = TRUE)) ~ "Library",
str_detect(referral_source,pattern=regex("^D\\s|shire", ignore_case = TRUE)) ~ "Local authority (N/S Lanarskhire; W Dunbartonshire)",
str_detect(referral_source,pattern=regex("^G\\s|^I\\s", ignore_case = TRUE)) ~ "Macmillan/Glasgow Life/GCC HR/PCUK",
str_detect(referral_source,pattern=regex("^GP\\s|\\sgp", ignore_case = TRUE)) ~ "GP",
str_detect(referral_source,pattern=regex("^H\\s|social work", ignore_case = TRUE)) ~ "Social work",
str_detect(referral_source,pattern=regex("^K\\s", ignore_case = TRUE)) ~ "CNS/District nurse",
str_detect(referral_source,pattern=regex("^L\\s", ignore_case = TRUE)) ~ "Self/carer/friend/family member",
str_detect(referral_source,pattern=regex("^N\\s", ignore_case = TRUE)) ~ "Other 3rd sector",
str_detect(referral_source,pattern=regex("nhs letter", ignore_case = TRUE)) ~ "NHS letter",
is.na(referral_source) ~ "Undefined or missing",
TRUE ~ "Other"
)
return(referral_source_grouped)
}
## function that produces a string with the names of columns that contain different values
names_of_columns_where_values_differ <- function(data_tbl) {
return(paste(names(select_if(.tbl = data_tbl, .predicate = ~n_distinct(.)!=1)), collapse = ", "))
}
## helper function for combining N and proportion
num_and_prop <- function(n, denominator) {
n_prop <- if_else(
condition = n <= 10,
true = "N<=10",
false = paste0(n," (",scales::percent(n/denominator),")")
)
return(n_prop)
}
```
## Data checking & cleaning
All data was checked for duplicates briefly, and where entire rows in the data were duplicated they were removed.
```{r, include=FALSE}
## check data for duplicates
any(duplicated(raw_data$cases))
any(duplicated(raw_data$hna))
any(duplicated(raw_data$onwardref))
any(duplicated(raw_data$inwardref))
any(duplicated(raw_data$reviews))
any(duplicated(raw_data$additional_visits))
any(duplicated(raw_data$outcomes))
clean_data<-list()
clean_data$cases <- raw_data$cases %>%
mutate_if(is.character, ~ifelse(.=="not indicated", NA_character_, .)) %>%
distinct
all(names(clean_data$cases) %in% names(columnNameReference))
## rename the columns to reference columns
names(clean_data$cases) <-
ifelse(
names(clean_data$cases) %in% names(columnNameReference),
yes = columnNameReference[names(clean_data$cases)],
no = names(clean_data$cases)
)
clean_data$hna <- raw_data$hna %>%
mutate_if(is.character, ~ifelse(.=="not indicated", NA_character_, .)) %>%
distinct
## rename the columns to reference columns
names(clean_data$hna) <-
ifelse(
names(clean_data$hna) %in% names(columnNameReference),
yes = columnNameReference[names(clean_data$hna)],
no = names(clean_data$hna)
)
## group visit locations
clean_data$hna <-
clean_data$hna %>%
select(-matches("^SUMMARY_")) %>% # remove the textual descriptions of concern severity, we don't need them
select(-matches("involvement")) %>% # remove the tags for the various agencies involved, we don't use them for reporting
mutate(
visit_location_grouped = group_visit_locations(visit_location)
) %>%
mutate_at(vars(matches("^CONCERN")), as.numeric) %>%
distinct
clean_data$onwardref <- raw_data$onwardref %>%
mutate_if(is.character, ~ifelse(.=="not indicated", NA_character_, .)) %>%
distinct
all(names(clean_data$onwardref) %in% names(columnNameReference))
## rename the columns to reference columns
names(clean_data$onwardref) <-
ifelse(
names(clean_data$onwardref) %in% names(columnNameReference),
yes = columnNameReference[names(clean_data$onwardref)],
no = names(clean_data$onwardref)
)
clean_data$inwardref <- raw_data$inwardref %>%
mutate_if(is.character, ~ifelse(.=="not indicated", NA_character_, .)) %>%
distinct
all(names(clean_data$inwardref) %in% names(columnNameReference))
## rename the columns to reference columns
names(clean_data$inwardref) <-
ifelse(
names(clean_data$inwardref) %in% names(columnNameReference),
yes = columnNameReference[names(clean_data$inwardref)],
no = names(clean_data$inwardref)
)
clean_data$inwardref <-
clean_data$inwardref %>%
mutate(referral_source_grouped = group_referral_sources(referral_source)) %>%
distinct
clean_data$reviews <- raw_data$reviews %>%
mutate_if(is.character, ~ifelse(.=="not indicated", NA_character_, .)) %>%
distinct
all(names(clean_data$reviews) %in% names(columnNameReference))
## rename the columns to reference columns
names(clean_data$reviews) <-
ifelse(
names(clean_data$reviews) %in% names(columnNameReference),
yes = columnNameReference[names(clean_data$reviews)],
no = names(clean_data$reviews)
)
clean_data$reviews <-
clean_data$reviews %>%
mutate_at(vars(matches("^BEFORE|^AFTER")), as.numeric) %>% # convert the concern scores to numeric
select(-matches("^DIFF_CAT")) %>% # remove categorical differences, we can work those out by hand later
distinct
clean_data$additional_visits <-
raw_data$additional_visits %>%
mutate_if(is.character, ~ifelse(.=="not indicated", NA_character_, .)) %>%
distinct
names(clean_data$additional_visits) <-
ifelse(
names(clean_data$additional_visits) %in% names(columnNameReference),
yes = columnNameReference[names(clean_data$additional_visits)],
no = names(clean_data$additional_visits)
)
clean_data$outcomes <-
raw_data$outcomes %>%
mutate_if(is.character, ~ifelse(.=="not indicated", NA_character_, .)) %>%
distinct
names(clean_data$outcomes) <-
ifelse(
names(clean_data$outcomes) %in% names(columnNameReference),
yes = columnNameReference[names(clean_data$outcomes)],
no = names(clean_data$outcomes)
)
```
### Essentially blank entries
There are a number of entries that show up as essentially blank; for example, in an HNA entry, there are a number of data columns denoting concern severity, visit location, how long the HNA took, etc., in addition to some administrative entries that denote the date of an entry, its relative number for that individual, etc, which are automatically added to a record.
We can find blank entries by looking at blank substantial sections of the record as opposed to the purely administrative sections.
Note: this only applies to assessments - HNAs, reviews, and additional visits; other spreadsheets either had complete data or it would be inappropriate to remove individual records - e.g. for case details, an individual's id and age being recorded in the absence of anything else is still a valid record.
#### Removing essentially blank entries
```{r}
# for HNAs, the relevant columns are from visit location onwards:
substantive_columns <-
list(
additional_visits = clean_data$additional_visits %>% select(visit_location_ni_inc : reason_for_additional_visit_ni_inc) %>% names, # only 3 entries are substantial
hna = clean_data$hna %>% select(visit_location : ncol(.)) %>% names, # everything from visit location onwards
reviews = clean_data$reviews %>% select(matches("^BEFORE|^AFTER|^DIFF|overall_score|further_holistic")) %>% names # the entire concerns section, including overall concern and the further HNA needed column
)
clean_data[c("hna","reviews","additional_visits")] <-
map2(
.x = clean_data[c("hna","reviews","additional_visits")],
.y = substantive_columns[c("hna","reviews","additional_visits")],
.f = function(x,y) {x %>% filter(rowSums(!is.na(select(., one_of(y)))) != 0)}
)
```
### Partial duplicates
In wide-formatted data (one entry per id), entries with the same id and same date are partial duplicates that occur for various reasons, and can be merged - in some cases they are data entry errors where a second entry was added later, etc.
This doesn't apply to the Onward referrals, Additional visits, and Outcomes spreadsheets, because they were provided in long (or long-ish...) format - i.e. one entry per action, and multiple actions per individual on the same date aren't necessarily duplicates
Notes:
* some individuals have multiple records made on the same date - these should definitely be merged together!
* the rank_from_first_to_last_completed_hna variable denotes the number of hna records for the individual, in chronological order
- some people have entries with the same rank, which seems to be the case where there is a delay between when the entry was first made (assessment start date) and when the entry was closed (assesment end date). In those cases the end date is the same etween the duplicate rank entries, with the start date in the second entry also being the same as the end date. It makes sense to treat these as belonging to the same HNA (the records may differ in which concerns were reported, etc), and the time taken and location are also different between them.
```{r}
map_dfr(
clean_data,
~group_by(., id, assessment_start_date) %>%
filter(n()>1) %>%
summarise(num_partial_duplicate_entries=n()) %>%
ungroup %>%
count(num_partial_duplicate_entries),
.id = "source"
) %>%
filter(!source %in% c("onwardref","additional_visits","outcomes")) %>%
kable(caption="Number of cases with more than one entry on the same date, by spreadsheet, and how many partial duplicates on the same date there were.")
deduplicate_column <- function(column) {
if (n_distinct(column)==1) return(column[1]) # return first value if only 1 distinct value exists
column <- column[!is.na(column)] # remove NAs
if (n_distinct(column)==1) return(column[1]) # return unique value if it exists now that NAs removed
if (is.character(column)) column <- column[!tolower(column) %in% c("n","no","none")] # remove N, No, None
if (is.numeric(column)) return(max(abs(column))) # if numeric, return the largest absolute value
return(tail(column,n=1)) # finally, remove the last value; this ensures we get a single value and is a stand-in for properly sorting factor values; in most cases we should have found a unique value before, so this only really applies when the choice is between 2 actual values that differ in the records
# TODO: implement factor ordering of text responses to improve this
}
```
#### Manual review of partial duplicates
Note: these won't show in the report for confidentiality purposes!
It's clear there are partial duplicates in all of the spreadsheets. Taking only the first or last entry doesn't work either, because either can be mostly blank. Ideally, we would know a hierarchy of possible values, e.g. blank/missing, followed by a default value (No/None), followed by actual values (Yes/XXX). Then, when the choice is between a blank and a default value, we choose the default, and in a choice between a default and actual value, we choose the actual value.
For numeric columns, we take the higher absolute value if there are two competing values.
```{r, echo=FALSE}
# Note: the below are nested df for each combination of id & assessment start date where there were partially duplicate entries; the last row in each nested df is the deduplicated row, with the preceding two rows being the partial duplicates
# View(clean_data$cases %>% group_by(id, assessment_start_date) %>% filter(n()>1) %>% nest() %>% mutate(data=map(data, ~select_if(.tbl = .,.predicate=~n_distinct(.)!=1))) %>% mutate(data = map(data, (function(x) {bind_rows(x,summarise_all(x, ~deduplicate_column(.)))}))))
#
# View(clean_data$hna %>% group_by(id, assessment_start_date) %>% filter(n()>1) %>% nest() %>% mutate(data=map(data, ~select_if(.tbl = .,.predicate=~n_distinct(.)!=1))) %>% mutate(data = map(data, (function(x) {bind_rows(x,summarise_all(x, ~deduplicate_column(.)))}))))
#
# View(clean_data$inwardref %>% group_by(id, assessment_start_date) %>% filter(n()>1) %>% nest() %>% mutate(data=map(data, ~select_if(.tbl = .,.predicate=~n_distinct(.)!=1))) %>% mutate(data = map(data, (function(x) {bind_rows(x,summarise_all(x, ~deduplicate_column(.)))}))))
#
# View(clean_data$reviews %>% group_by(id, assessment_start_date) %>% filter(n()>1) %>% nest() %>% mutate(data=map(data, ~select_if(.tbl = .,.predicate=~n_distinct(.)!=1))) %>% mutate(data = map(data, (function(x) {bind_rows(x,summarise_all(x, ~deduplicate_column(.)))}))))
```
#### Merging partial duplicates
```{r}
partial_duplicates_deduplicated <-
map(
clean_data[c("cases","hna","inwardref","reviews")],
~group_by(., id, assessment_start_date) %>% filter(n()>1) %>% summarise_all(~deduplicate_column(.)) %>% ungroup
)
clean_data[c("cases","hna","inwardref","reviews")] <-
map2(
.x = clean_data[c("cases","hna","inwardref","reviews")],
.y = partial_duplicates_deduplicated[c("cases","hna","inwardref","reviews")],
.f = ~anti_join(x = .x, y = .y, by=c("id","assessment_start_date")) %>% bind_rows(., .y)
)
```
### Formatting Outcomes to long format
Outcomes are recorded as wide data (one row per individual) with up to 10 agencies (agency_1, agency_2, etc.) - for programmatic use of data, these need to be converted to long format.
This is complicated by the fact that the columns are only partly systematically organised!
The first step is to identify a primary key in the data on which we can later join it - we'll chop the table into id+other data, and 10 id+agency data chunks:
```{r}
clean_data$outcomes_agencies <-
clean_data$outcomes %>%
mutate(outcomes_key=1:n()) %>% # add a key to each row
rename_at(vars(matches("agency_\\d+$")), ~paste0(.,"_name")) %>% # rename agency_1 to agency_1_name for easier parsing later
(function (data_tbl) {
id_and_agency <- data_tbl %>% select(outcomes_key, matches("agency"))
id_and_other_data <- data_tbl %>% select(-matches("agency"))
id_and_agency_longish <-
map_dfr(
.x = 1:10,
.f = ~select(id_and_agency, outcomes_key, matches(paste0("agency_",.x,"(\\_|$)"))) %>%
rename_all(~str_replace(., pattern="\\_\\d+", replacement = "")) %>%
mutate_at(vars(matches("agency")), ~as.character(.))
) %>%
filter(!is.na(agency_name)) %>% # remove entries with blank agency name - we can't use them even if scores were recorded!
mutate(
agency_score = as.numeric(agency_score), # convert back to numeric
agency_score = if_else(agency_score > 10, 10, agency_score), # change scores above 10 to 10, keep otherwise
agency_what_happened = tolower(agency_what_happened),
agency_what_happened = if_else(str_detect(agency_what_happened, "chose not to attend"), "chose not to attend", agency_what_happened),
agency_was_service_helpful = tolower(agency_was_service_helpful),
agency_contact = tolower(agency_contact)
)
left_join(id_and_other_data, id_and_agency_longish, by="outcomes_key")
})
# pivot_longer(cols=matches("agency"), names_to = c("agency_num","agency_var"), names_pattern = "agency_(\\d)_(.*)", values_to="agency_val")
```
# ICJ descriptives of routine data
## Number of service users
Since the inception of the service (the earliest case in the data was `r clean_data$cases$assessment_start_date %>% min`), ICJ has served a total of N=`r nrow(clean_data$cases)` people, or N=`r n_distinct(clean_data$cases$id)` individuals (some of whom had multiple assessments).
A total of N=`r nrow(clean_data$hna)` HNAs were made. The table below shows a breakdown of number of assessments made by year & month:
### Referrals to ICJ by month & year
```{r}
table(lubridate::year(clean_data$inwardref$assessment_start_date),lubridate::month(clean_data$inwardref$assessment_start_date, label=TRUE)) %>%
kable(caption = "Breakdown of Referrals made by year & month")
```
### Case details recorded by month & year
```{r}
table(lubridate::year(clean_data$cases$assessment_start_date),lubridate::month(clean_data$cases$assessment_start_date, label=TRUE)) %>%
kable(caption = "Breakdown of Case details recorded by year & month")
```
### HNAs made by month & year
```{r}
table(lubridate::year(clean_data$hna$assessment_start_date),lubridate::month(clean_data$hna$assessment_start_date, label=TRUE)) %>%
kable(caption = "Breakdown of HNAs made by year & month")
```
### Reviews by month & year
```{r}
table(lubridate::year(clean_data$reviews$assessment_start_date),lubridate::month(clean_data$reviews$assessment_start_date, label=TRUE)) %>%
kable(caption = "Breakdown of Reviews made by year & month")
```
### Referrals, cases, HNAs
```{r}
map_dfr(
clean_data[c("inwardref","cases","hna")],
~count(., Year=year(assessment_start_date)),
.id = "source"
) %>%
mutate(
source=case_when(
source=="inwardref" ~ "Referrals to ICJ",
source=="cases" ~ "Case details recorded",
source=="hna" ~ "HNAs",
TRUE ~ source
)
) %>%
pivot_wider(names_from=Year, values_from = n) %>%
kable(caption = "Number of referrals to ICJ, cases recorded, and HNAs made, by year.")
```
## Time between referral and first HNA
```{r}
clean_data$inwardref %>%
group_by(id) %>%
top_n(assessment_start_date, n=-1) %>%
slice(1) %>%
ungroup %>%
left_join(
clean_data$cases %>%
group_by(id) %>%
top_n(assessment_start_date, n=-1) %>%
slice(1) %>%
ungroup,
by="id"
) %>%
mutate(time_between_referral_and_hna = as.numeric(as.duration(interval(assessment_start_date.x,assessment_start_date.y)), unit="days")) %>%
count(time_between_referral_and_hna) %>%
mutate(
time_between_referral_and_hna_grouped = case_when(
time_between_referral_and_hna < 0 ~ "HNA recorded before referral",
time_between_referral_and_hna == 0 ~ "Same day",
time_between_referral_and_hna < 30 ~ "Within a month",
time_between_referral_and_hna < 90 ~ "Within 3 months",
time_between_referral_and_hna >= 90 ~ "More than 3 months"
)
) %>%
filter(!is.na(time_between_referral_and_hna)) %>%
group_by(time_between_referral_and_hna_grouped) %>%
summarise(
n=sum(n)
) %>%
ungroup %>%
mutate(proportion=scales::percent(n/sum(n))) %>%
kable(caption = "Time elapsed between referral and first HNA. Note that because people might have had more than one referral, and more than one HNA, these may not reflect true values.")
```
The majority of HNAs were recorded on the same day as the referral - this is probably a consequence of the data collection process.
## Take up of referrals
Note: we compute this by identifying the unique number of individuals who had an HNA, and divide that by the number of individuals who were referred to ICJ.
### Take up by SIMD area
Note: Deprivation vigintiles aren't currently available, so this uses SIMD area instead, which is hopefully identical to quintiles!
```{r}
clean_data$inwardref %>%
group_by(id) %>%
top_n(assessment_start_date, n=-1) %>%
ungroup %>%
select(id, simd_area_ref=simd_area) %>%
left_join(
clean_data$hna %>%
group_by(id) %>%
top_n(assessment_start_date, n=-1) %>%
ungroup %>%
select(id, simd_area_hna=simd_area),
by = "id"
) %>%
mutate(
had_hna = !is.na(simd_area_hna), # this works b/c there were no NAs
simd_area_ref = parse_number(simd_area_ref)
) %>%
replace_na(list(simd_area_ref="Undefined or missing")) %>%
count(simd_area_ref, had_hna) %>%
group_by(simd_area_ref) %>%
mutate(
proportion=scales::percent(n/sum(n))
) %>%
filter(had_hna) %>%
select(-had_hna) %>%
kable(caption = "Proportion of individuals by SIMD area recorded at referral who had an HNA.")
```
### Take up by sex
```{r}
clean_data$inwardref %>%
group_by(id) %>%
top_n(assessment_start_date, n=-1) %>%
ungroup %>%
select(id, sex_ref=assessment_subject_gender) %>%
left_join(
clean_data$hna %>%
group_by(id) %>%
top_n(assessment_start_date, n=-1) %>%
ungroup %>%
select(id, sex_hna=assessment_subject_gender),
by = "id"
) %>%
mutate(
had_hna = !is.na(sex_hna), # this works b/c there were no NAs
sex_ref = if_else(sex_ref %in% c("M","F"), sex_ref, "Undefined or missing")
) %>%
count(sex_ref, had_hna) %>%
group_by(sex_ref) %>%
mutate(
proportion=scales::percent(n/sum(n))
) %>%
filter(had_hna) %>%
select(-had_hna) %>%
kable(caption = "Proportion of individuals who had an HNA by sex recorded at referral.")
```
## Service user demographics
### Sex
The breakdown of ICJ users by sex (since service inception) was:
```{r}
clean_data$cases %>%
rename(Sex=assessment_subject_gender) %>%
mutate(
Sex = case_when(
Sex=="U" ~ "Undefined or missing",
is.na(Sex) ~ "Undefined or missing",
TRUE ~ Sex
)
) %>%
count(Sex) %>%
rename(N=n) %>%
mutate(Proportion = scales::percent(N/sum(N))) %>%
kable(caption = "Breakdown of ICJ users by sex, over lifetime of ICJ.")
clean_data$cases %>%
rename(Sex=assessment_subject_gender) %>%
mutate(
Sex = case_when(
Sex=="U" ~ "Undefined or missing",
is.na(Sex) ~ "Undefined or missing",
TRUE ~ Sex
)
) %>%
count(Sex, Year=year(assessment_start_date)) %>%
rename(N=n) %>%
group_by(Year) %>%
mutate(n_prop = num_and_prop(N,sum(N))) %>%
select(-N) %>%
pivot_wider(names_from = Year, values_from = n_prop) %>%
kable(caption = "Breakdown of ICJ users by sex, by year.")
```
### Age
The breakdown of ICJ users by age recorded in Case details (since service inception) was:
```{r}
clean_data$cases %>%
rename(Age=ageband_at_assessment) %>%
count(Age) %>%
replace_na(list(Age="Undefined or missing")) %>%
rename(N=n) %>%
mutate(Proportion = scales::percent(N/sum(N))) %>%
mutate(
Proportion=if_else(N<=10, "N<=10", as.character(Proportion)),
N=if_else(N<=10, "N<=10", as.character(N))
) %>%
kable(caption = "Breakdown of ICJ users by age, over lifetime of ICJ.")
```
The breakdown of ICJ users by age, by year:
```{r}
clean_data$cases %>%
rename(Age=ageband_at_assessment) %>%
count(Age, Year = year(assessment_start_date)) %>%
replace_na(list(Age="Undefined or missing")) %>%
group_by(Year) %>%
mutate(n_prop = num_and_prop(n, sum(n))) %>%
select(-n) %>%
arrange(Age, Year) %>%
pivot_wider(names_from = Year, values_from = n_prop, values_fill = list(n_prop="N<=10")) %>%
kable(caption = "Breakdown of ICJ users by age, by year.")
```
### Deprivation
Note: as of 13 Feb 2020, deprivation data was not included in the ICJ data. In the meantime, we can use the SIMD area as a palceholder - this probably stands for SIMD quintiles!
The breakdown of ICJ users by deprivation (since service inception) was:
```{r}
clean_data$hna %>%
mutate(simd_area=parse_number(simd_area)) %>%
count(simd_area) %>%
replace_na(list(simd_area="Undefined or missing")) %>%
rename(N=n) %>%
mutate(Proportion = scales::percent(N/sum(N))) %>%
kable(caption = "Breakdown of ICJ users by SIMD Area (1=most deprived).")
# clean_data$hna %>%
# rename(Deprivation=Vigintiles) %>%
# mutate(Deprivation=parse_number(Deprivation)) %>%
# count(Deprivation) %>%
# mutate(Deprivation=ifelse(is.na(Deprivation),"Undefined or missing",as.character(Deprivation))) %>%
# rename(N=n) %>%
# mutate(Proportion = scales::percent(N/sum(N))) %>%
# kable(caption = "Breakdown of ICJ users by deprivation (SIMD2016 vigintile; 1=most deprived). Note: the total doesn't match up with the breakdowns above because deprivation vigintiles were recorded in HNAs.")
```
The same breakdown shown graphically:
```{r, fig.caption="Graphical Breakdown of ICJ users by deprivation (SIMD Area; 1=most deprived), over lifetime of ICJ."}
clean_data$hna %>%
mutate(simd_area=parse_number(simd_area)) %>%
count(simd_area) %>%
# replace_na(list(simd_area="Undefined or missing")) %>%
ggplot(., aes(x=factor(simd_area), y=n)) +
scale_x_discrete(breaks=c(1:5,as.numeric(NA)), labels=c(1:5,"N/A")) +
geom_col() +
theme_plot +
labs(x="Deprivation (SIMD Area; 1=most deprived)", y="Number of people")
# fig_caption = "Graphical Breakdown of ICJ users by deprivation (SIMD2016 vigintile; 1=most deprived)."
#
# clean_data$hna %>%
# rename(Deprivation=Vigintiles) %>%
# mutate(Deprivation=parse_number(Deprivation)) %>%
# count(Deprivation) %>%
# mutate(Deprivation = factor(Deprivation)) %>%
# ggplot(., aes(x=Deprivation, y=n)) +
# scale_x_discrete(breaks=c(1:20,as.numeric(NA)), labels=c(1:20,"N/A")) +
# geom_col() +
# theme_plot +
# labs(x="Deprivation (SIMD2016 vigintile; 1=most deprived)", y="Number of people")
```
#### Deprivation by year
Note: as of 13 Feb 2020, deprivation data was not included in the ICJ data. In the meantime, we can use the SIMD area as a palceholder - this probably stands for SIMD quintiles!
```{r}
clean_data$hna %>%
mutate(simd_area=parse_number(simd_area)) %>%
count(Year=year(assessment_start_date),simd_area) %>%
replace_na(list(simd_area="Undefined or missing")) %>%
rename(N=n) %>%
group_by(Year) %>%
mutate(n_prop = num_and_prop(N, sum(N))) %>%
select(-N) %>%
pivot_wider(names_from = Year, values_from=n_prop, values_fill = list(n_prop="N<=10")) %>%
kable(caption = "Breakdown of ICJ users by SIMD Area (1=most deprived).")
```
Shown graphically:
```{r, fig.caption="Graphical Breakdown of ICJ users by deprivation (SIMD Area; 1=most deprived), over lifetime of ICJ."}
clean_data$hna %>%
mutate(simd_area=parse_number(simd_area),Year=year(assessment_start_date)) %>%
ggplot(., aes(x=factor(simd_area))) +
facet_wrap(~Year, scales="free_y") +
scale_x_discrete(breaks=c(1:5,as.numeric(NA)), labels=c(1:5,"N/A")) +
geom_bar() +
theme_plot +
labs(
x="Deprivation (SIMD Area; 1=most deprived)", y="Number of people",
caption = "Panels show years separately. Note: y-scale is different between years!"
) +
NULL
```
### Cancer type
```{r}
clean_data$cases %>%
pivot_longer(cols=matches("diagnosis"), names_to = "source", values_to = "diagnosis", values_drop_na = TRUE) %>%
count(diagnosis, sort = TRUE) %>%
group_by(n<11) %>%
mutate(
diagnosis = if_else(condition = n<11,true = paste(diagnosis, collapse=", "),false = diagnosis),
n = if_else(condition = n<11,true = sum(n),false = n)
) %>%
ungroup %>%
distinct %>%
mutate(
denominator = nrow(clean_data$cases),
proportion = scales::percent(n/denominator)
) %>%
select(-denominator, -`n < 11`) %>%
kable(caption = "Cancer type & proportions. Cancers reported 10 or fewer times aggregated. Note that because an individual can have more than one diagnosis, the proportions don't add up to 100%.")
```
#### Big 4 cancer types over the years
```{r}
clean_data$cases %>%
mutate(case_index = 1:n()) %>% # for tracking nubmer of individuals per year
mutate(Year = year(assessment_start_date)) %>%
pivot_longer(cols=matches("diagnosis"), names_to = "source", values_to = "diagnosis", values_drop_na = TRUE) %>%
mutate(cancer_type = group_cancer_types(diagnosis)) %>%
distinct %>%
group_by(Year, cancer_type) %>%
summarise(n = n_distinct(case_index)) %>%
left_join(clean_data$cases %>% count(Year=year(assessment_start_date), name="denominator"), by="Year") %>%
mutate(n_prop = num_and_prop(n,denominator)) %>%
select(-n,-denominator) %>%
pivot_wider(names_from=Year, values_from=n_prop) %>%
kable(caption = "Cancer type & proportions within users each year. Note that because an individual can have more than one diagnosis, the proportions don't add up to 100%.")
```
#### Cancer types in 2019
```{r}
clean_data$cases %>%
filter(year(assessment_start_date)==2019) %>%
pivot_longer(cols=matches("diagnosis"), names_to = "source", values_to = "diagnosis", values_drop_na = TRUE) %>%
count(diagnosis, sort = TRUE) %>%
group_by(n<11) %>%
mutate(
diagnosis = if_else(condition = n<11,true = paste(diagnosis, collapse=", "),false = diagnosis),
n = if_else(condition = n<11,true = sum(n),false = n)
) %>%
ungroup %>%
distinct %>%
mutate(
denominator = nrow(clean_data$cases %>% filter(year(assessment_start_date)==2019)),
proportion = scales::percent(n/denominator)
) %>%
select(-denominator, -`n < 11`) %>%
kable(caption = "Cancer type & proportions for ICJ users in 2019. Cancers reported 10 or fewer times aggregated. Note that because an individual can have more than one diagnosis, the proportions don't add up to 100%.")
```
### Stage in cancer journey
The breakdown of ICJ users by Stage in cancer journey (since service inception) was:
```{r}
clean_data$hna %>%
rename(Stage=stage_in_journey) %>%
count(Stage) %>%
rename(N=n) %>%
replace_na(list(Stage="Missing or undefined")) %>% arrange(desc(N)) %>%
mutate(
Proportion = scales::percent(N/sum(N)),
Proportion = ifelse(N<=10, "N<=10", Proportion),
N = ifelse(N<=10, "N<=10", N)
) %>%
kable(caption = "Breakdown of ICJ users by Stage in cancer journey, over lifetime of ICJ.")
```
#### Cancer stages by year
The breakdown of stages by year:
```{r}
clean_data$hna %>%
rename(Stage=stage_in_journey) %>%
count(Stage, Year=year(assessment_start_date)) %>%
rename(N=n) %>%
replace_na(list(Stage="Missing or undefined")) %>%
arrange(Year) %>%
group_by(Year) %>%
mutate(
n_prop = num_and_prop(N, sum(N))
) %>%
ungroup %>%
select(-N) %>%
pivot_wider(names_from = Year, values_from=n_prop, values_fill = list(n_prop="N<=10")) %>%
kable(caption = "Breakdown of ICJ users by Stage in cancer journey, by year of HNA assessment.")
```
### HNA visit location
#### Grouped locations
There were many locations recorded, so they were grouped according to the following table:
```{r}
clean_data$hna %>% select(visit_location_grouped,visit_location) %>% distinct %>% arrange(visit_location_grouped,visit_location) %>%
kable(caption = "How HNA locations were grouped.")
```
#### HNA locations overall
```{r}
clean_data$hna %>%
rename(Location=visit_location_grouped) %>%
count(Location) %>%
rename(N=n) %>%
replace_na(list(Location="Missing or undefined")) %>%
arrange(desc(N)) %>%
mutate(Proportion = scales::percent(N/sum(N))) %>%
kable(caption = "Breakdown of HNAs by visit location, over ICJ lifetime.")
```
#### HNA locations, by year
The below table shows breakdown of locations by year:
```{r}
clean_data$hna %>%
rename(Location=visit_location_grouped) %>%
count(Year = year(assessment_start_date), Location) %>%
group_by(Year) %>%
mutate(n_prop = num_and_prop(n,sum(n))) %>%
select(-n) %>%
pivot_wider(names_from=Year,values_from=n_prop) %>%
kable(caption = "Breakdown of HNAs by visit location, by year.")
```
#### Proportion of home visits over the years
The location of visits changed over time. In particular, the percentage of home visits has changed as follows:
```{r}
clean_data$hna %>%
mutate(
Location=visit_location_grouped,
Year = year(assessment_start_date)
) %>%
group_by(Year) %>%
count(Location) %>%
rename(N=n) %>%
replace_na(list(Location="Missing or undefined")) %>%
arrange(Year,desc(N)) %>%
mutate(Proportion = scales::percent(N/sum(N))) %>%
filter(Location=="Home visit") %>%
kable(caption = "Proportion of HNAs in each year that were home visits.")
```
#### Proportion of HNAs in hospital
```{r}
clean_data$hna %>%
mutate(
Location=visit_location_grouped,
Year = year(assessment_start_date)
) %>%
group_by(Year) %>%
count(Location) %>%
rename(N=n) %>%
replace_na(list(Location="Missing or undefined")) %>%
arrange(Year,desc(N)) %>%
mutate(Proportion = scales::percent(N/sum(N))) %>%
filter(Location=="Hospital") %>%
kable(caption = "Proportion of HNAs in each year that were in hospital.")
```
### Multiple HNAs
How many individuals had more than one HNA? This is somewhat complicated by partial duplicates, where essentially the same HNA was done in several parts over multiple days, in which case we would need to compile the parts that belong to the same assessment.
Note: an individual may have gotten in touch with ICJ in quick succession, so some HNAs happen within a few days' time, whereas in other cases the same individual may only have gotten in touch after a year or so!
```{r}
clean_data$hna %>%
group_by(id) %>%
summarise(number_hnas = n()) %>%
count(number_hnas) %>%
group_by(number_hnas>=4) %>%
mutate(
number_hnas = if_else(number_hnas>=4, paste0(min(number_hnas),"-",max(number_hnas)),as.character(number_hnas)),
n = if_else(number_hnas>=4, sum(n), n)
) %>%
ungroup %>%
distinct %>%
select(number_hnas, n) %>%
mutate(proportion = scales::percent(n / n_distinct(clean_data$hna$id))) %>%
kable(caption = "Number and proportion of individuals who had one or more HNAs over the lifetime of ICJ.")
clean_data$hna %>%
group_by(id, Year = year(assessment_start_date)) %>%
summarise(number_hnas = n()) %>%
ungroup %>%
count(more_than_one=number_hnas>1,Year) %>% # this is still the number of individuals
left_join(clean_data$hna %>% group_by(Year=year(assessment_start_date)) %>% summarise(denominator=n_distinct(id)), by="Year") %>%
mutate(
more_than_one = if_else(more_than_one, "More than 1 HNA", "Just one HNA"),
n_prop = paste0(n, " (", scales::percent(n/denominator),")")
) %>%
select(-n, -denominator) %>%
pivot_wider(names_from = more_than_one, values_from = n_prop) %>%
kable(caption = "Number & proportion of individuals who had more than one HNA by year.")
```
### Employment status - raw
```{r}
clean_data$cases %>%
count(Employment=employment_status_inc_ni, name = "N") %>%
replace_na(list(Employment="Missing or undefined")) %>%
mutate(
Proportion = scales::percent(N/sum(N)),
Proportion = ifelse(N<=10, "N<=10", Proportion),
N = ifelse(N<=10, "N<=10", N)
) %>%
kable(caption = "Employment status of ICJ users using raw employment status data, over lifetime of service.")
```
### Employment status - simplified
We can count as unemployed persons who are:
* in voluntary work
* Not in educ/train/emp
* Unemployed / job seeker
* Not currently working rec support of ESS
Any kind of education (com) wil be treated as "in education".
Retirement is treated as-is.
```{r}
clean_data$cases %>%
mutate(
Employment = case_when(
str_detect(tolower(employment_status_inc_ni), "not in educ|unemploy|not currently working|voluntary") ~ "Unemployed",