# this script contains all scoring rules used to evaluate the models for a given data set
# getScores computes all scoring rules
# truth: the true values
# est: the estimates, of the same length as truth. By default calculated from estMat
# var: the estimate variances, of the same length as truth. By default calculated from estMat
# lower: the lower end of the credible interval. By default calculated from estMat
# upper: the upper end of the credible interval. By default calculated from estMat
# estMat: a matrix of joint estimate draws, with number of rows equal to the length of truth and number of
#         columns equal to the number of draws. If not included, a gaussian distribution is assumed.
# significance: the credible level of the interval. By default 80%
# distances: the distances to the nearest observation. If not NULL, scores are also broken up
#            as a function of nearest neighbor distance
# breaks: either the number of equally spaced break points used to bin the scores by distance
#         (giving breaks - 1 bins), or a vector of break points
# NOTE: discrete, count level credible intervals are estimated based on the input estMat along with coverage and CRPS
getScores = function(truth, est=NULL, var=NULL, lower=NULL, upper=NULL, estMat=NULL, significance=.8,
distances=NULL, breaks=30, doFuzzyReject=TRUE, getAverage=TRUE) {
# if distances is included, must also break down scoring rules by distance bins
if(!is.null(distances)) {
# construct the distance bins with which to group the data and compute scores within
if(length(breaks) == 1)
breaks = seq(0, max(distances), l=breaks)
binsI = cut(distances, breaks, labels=1:(length(breaks)-1), include.lowest=TRUE)
centers = breaks[1:(length(breaks)-1)] + diff(breaks)/2
uniqueBinsI = sort(unique(binsI))
# determine the number of observations per bin
nPerBin = as.numeric(table(binsI))
# helper function to compute the scoring rules for a given bin
getSubScores = function(uniqueBinI, truth, est, var, lower, upper, estMat, significance) {
thisDatI = binsI == uniqueBinI
newEstMat = NULL
if(!is.null(estMat))
newEstMat = matrix(estMat[thisDatI,], ncol=ncol(estMat))
getScores(truth[thisDatI], est[thisDatI], var[thisDatI], lower[thisDatI], upper[thisDatI],
newEstMat, significance, doFuzzyReject=doFuzzyReject)
}
# calculate scores for each bin individually
binnedScores = t(sapply(uniqueBinsI, getSubScores, truth=truth, est=est, var=var, lower=lower, upper=upper,
estMat=estMat, significance=significance))
# make sure each variable in binnedScores is a numeric, not a list...
temp = matrix(unlist(binnedScores), nrow=length(uniqueBinsI))
theseNames = colnames(binnedScores)
binnedScores = data.frame(temp)
names(binnedScores) = theseNames
binnedScores = as.data.frame(cbind(NNDist=centers[uniqueBinsI], nPerBin=nPerBin[uniqueBinsI], binnedScores))
}
# compute central estimates if estMat is not null
if(!is.null(estMat)) {
if(is.null(est))
est = rowMeans(estMat)
}
# first calculate bias, variance, and MSE
out = mse(truth, est, getAverage=getAverage)
thisMSE = out$MSE
thisBias = out$bias
thisVar = out$var
# calculate coverage and credible interval width with and without binomial variation
intScore = intervalScore(truth, est, var, lower, upper, estMat=estMat,
significance=significance, returnIntervalWidth=TRUE,
returnCoverage=TRUE,
doFuzzyReject=doFuzzyReject, getAverage=getAverage)
if(getAverage) {
thisIntScore = intScore[1]
thisCoverage = intScore[2]
thisWidth = intScore[3]
} else {
thisIntScore = intScore[,1]
thisCoverage = intScore[,2]
thisWidth = intScore[,3]
}
# calculate CRPS
thisCRPS = crps(truth, est, var, estMat=estMat, getAverage=getAverage)
# collect the results in a data frame
results = matrix(c(thisBias, thisVar, thisMSE, sqrt(thisMSE), thisCRPS, thisIntScore, thisCoverage,
thisWidth), ncol=8)
colnames(results) = c("Bias", "Var", "MSE", "RMSE", "CRPS", "IntervalScore", "Coverage", "Width")
results = as.data.frame(results)
# include both binned and pooled results in one final table
if(!is.null(distances)) {
results = list(pooledResults=results, binnedResults=binnedScores)
}
results
}
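# Example usage (a minimal sketch on simulated data; the if(FALSE) guard keeps it from running
# when this script is sourced). The gaussian draws in estMat and the uniform distances are
# purely illustrative assumptions, not part of the original analysis.
if(FALSE) {
  set.seed(1)
  n = 50
  truth = rnorm(n)
  # 1000 joint posterior draws per observation, centered on the truth
  estMat = matrix(rnorm(n * 1000, mean=truth, sd=1), nrow=n)
  # pooled scores, with central estimates taken from the row means of estMat
  getScores(truth, estMat=estMat, significance=.8)
  # pooled and binned scores, broken down by hypothetical nearest neighbor distances
  getScores(truth, estMat=estMat, distances=runif(n, 0, 10), breaks=5)
}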
# calculate bias, variance, and MSE
mse <- function(truth, est, weights=NULL, getAverage=TRUE){
if(!is.null(weights))
weights = weights / sum(weights, na.rm=TRUE)
res = est - truth
if(!is.null(weights)) {
thisVar = (res - sum(res*weights, na.rm=TRUE))^2
if(getAverage) {
MSE = sum(res^2 * weights, na.rm=TRUE)
bias=sum(res * weights, na.rm=TRUE)
thisVar = sum(thisVar * weights, na.rm=TRUE)
} else {
MSE = res^2
bias = res
}
out = list(MSE=MSE, bias=bias, var=thisVar)
}
else {
thisVar = (res - mean(res, na.rm=TRUE))^2
if(getAverage) {
MSE = mean(res^2, na.rm=TRUE)
bias=mean(res, na.rm=TRUE)
thisVar = mean(thisVar, na.rm=TRUE)
} else {
MSE = res^2
bias=res
}
out = list(MSE=MSE, bias=bias, var=thisVar)
}
out
}
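# Example (a minimal sketch, guarded so it does not run when this script is sourced):
if(FALSE) {
  truth = c(1, 2, 3, 4)
  est = c(1.1, 1.9, 3.4, 3.6)
  mse(truth, est)                         # averaged bias, variance, and MSE
  mse(truth, est, getAverage=FALSE)       # per-observation residuals and squared errors
  mse(truth, est, weights=c(1, 1, 2, 2))  # weighted averages (weights are normalized internally)
}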
# either include both lower and upper, or include one of:
# - the joint estimate draw matrix (estMat)
# - estimates and variances (assumes gaussian)
# truth: the true empirical proportions of mortality rates within the regions or enumeration areas of interest
# lower: the lower end of the credible interval
# upper: the upper end of the credible interval
# estMat: a matrix of joint draws of estimates, with number of rows equal to the length of truth and number of
#         columns equal to the number of draws. If not included, a gaussian distribution is assumed. Can contain
#         Gaussian or discrete values such as empirical proportions
# significance: the credible level of the interval. By default 80%
# doFuzzyReject: based on https://www.jstor.org/stable/pdf/20061193.pdf
# ns: a vector of maximum possible counts (denominators) for each observation. Used only for random/fuzzy reject.
#     Can be left out, in which case it will be inferred from the minimum draw difference in each row of estMat.
coverage = function(truth, est=NULL, var=NULL, lower=NULL, upper=NULL,
estMat=NULL, significance=.8, returnIntervalWidth=FALSE,
doFuzzyReject=TRUE, getAverage=TRUE, ns=NULL){
if(any(is.null(lower)) || any(is.null(upper))) {
# if the user did not supply their own credible intervals, we must get them ourselves given the other information
if(is.null(estMat) && (is.null(est) || is.null(var)))
stop("either include both lower and upper, est and var, or estMat")
if(!is.null(est) && !is.null(var) && is.null(estMat)) {
# in this case, we must calculate lower and upper assuming gaussianity
lower = qnorm((1 - significance) / 2, est, sqrt(var))
upper = qnorm(1 - (1 - significance) / 2, est, sqrt(var))
}
else {
# we don't have information about the predictive distribution, and don't assume normality.
# Instead, use the user supplied to probability matrix estMat
# take the quantiles of the probability draws
CIs = apply(estMat, 1, function(ps) {quantile(ps, probs=c((1 - significance) / 2, 1 - (1 - significance) / 2))})
lower = CIs[1,]
upper = CIs[2,]
}
}
if(any(lower > upper)) {
warning("lower > upper, reordering")
tmp = lower
wrongOrder = lower > upper
lower[wrongOrder] = upper[wrongOrder]
upper[wrongOrder] = tmp[wrongOrder]
}
res = lower <= truth & upper >= truth
if(returnIntervalWidth)
width = upper - lower
if(doFuzzyReject) {
# in this case, coverage is fuzzy: when the truth lies exactly at the edge of the credible interval,
# we determine the probability of rejection for each such observation and reduce its coverage
# indicator by that probability (rather than counting it as fully covered)
atLowerEdge = which(lower == truth)
atUpperEdge = which(upper == truth)
probRejectLower = sapply(atLowerEdge, function(i) {((1 - significance) / 2 - mean(estMat[i,] < lower[i])) / mean(estMat[i,] == lower[i])})
probRejectUpper = sapply(atUpperEdge, function(i) {((1 - significance) / 2 - mean(estMat[i,] > upper[i])) / mean(estMat[i,] == upper[i])})
rejectLower = probRejectLower
rejectUpper = probRejectUpper
# determine minimum differences between probabilities
if(is.null(ns))
deltas = apply(estMat, 1, function(x) {min(diff(sort(unique(x))))})
else
deltas = 1 / ns
if(length(atLowerEdge) != 0) {
res[atLowerEdge] = sapply(1:length(atLowerEdge), function(i) {min(res[atLowerEdge][i], (1-rejectLower[i]))})
# if partially rejected, reduce the CI width (width is only defined when returnIntervalWidth is TRUE)
if(returnIntervalWidth)
width[atLowerEdge] = width[atLowerEdge] - deltas[atLowerEdge] * as.numeric(rejectLower)
}
if(length(atUpperEdge) != 0) {
res[atUpperEdge] = sapply(1:length(atUpperEdge), function(i) {min(res[atUpperEdge][i], (1-rejectUpper[i]))})
# if partially rejected, reduce the CI width (width is only defined when returnIntervalWidth is TRUE)
if(returnIntervalWidth)
width[atUpperEdge] = width[atUpperEdge] - deltas[atUpperEdge] * as.numeric(rejectUpper)
}
# res[atLowerEdge] = res[atLowerEdge] & (!rejectLower)
# res[atUpperEdge] = res[atUpperEdge] & (!rejectUpper)
}
if(getAverage)
allResults = c(coverage=mean(res, na.rm=TRUE))
else
allResults = c(coverage=res)
if(returnIntervalWidth) {
if(getAverage)
allResults = c(allResults, width=mean(width, na.rm=TRUE))
else
allResults = cbind(allResults, width=width)
}
allResults
}
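# Example (a sketch only, guarded from running on source): empirical 80% coverage from gaussian
# summaries, and from a matrix of discrete draws where the fuzzy reject correction can apply.
# The binomial setup below (size 20, probability .3) is purely illustrative.
if(FALSE) {
  set.seed(2)
  n = 100
  truth = rnorm(n)
  est = truth + rnorm(n, sd=.5)
  coverage(truth, est=est, var=rep(.25, n), doFuzzyReject=FALSE)
  # discrete draws on the proportion scale; ns gives the denominators used for the fuzzy reject
  truthProp = rbinom(n, size=20, prob=.3) / 20
  estMat = matrix(rbinom(n * 500, size=20, prob=.3) / 20, nrow=n)
  coverage(truthProp, estMat=estMat, returnIntervalWidth=TRUE, ns=rep(20, n))
}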
# truth: a vector of observations on the desired scale
# est: a vector of predictions on the same scale as truth, of the same length as truth
# my.var: a vector of predictive variances on the same scale as truth, of the same length as truth
# estMat: if available, use these draws of the predictive distribution in the integration. Use this
#         argument when a gaussian approximation to the (possibly transformed) posterior is unreasonable
# getAverage: if FALSE, returns the score for each individual observation. Otherwise returns the average over all observations
crps <- function(truth, est=NULL, my.var=NULL, estMat=NULL, getAverage=TRUE){
if(!is.null(est) && !is.null(my.var) && is.null(estMat)) {
sig = sqrt(my.var)
x0 <- (truth - est) / sig
res <- sig * (1 / sqrt(pi) - 2 * dnorm(x0) - x0 * (2 * pnorm(x0) - 1))
## sign as in Held (2008)
res <- -res
}
else {
# Integrate numerically using estMat
if(is.null(estMat))
stop("must include either or both est and my.var, or estMat")
# the following legacy implementations are kept commented out; they were replaced by the more computationally efficient crpsRow below
# # compute the crps for this row of truth
# crpsRow = function(rowI) {
# thisTruth = truth[rowI]
#
# # either build the predictive cdf assuming normality on the logit scale or from the
# # empirical distribution given by estMat if the user supplies it
# if(is.null(estMat)) {
# thisEst = est[rowI]
# thisVar = my.var[rowI]
# thisCdf = function(ws) {pnorm(logit(ws), thisEst, sqrt(thisVar))}
# } else {
# thisCdf = ecdf(estMat[rowI,])
# }
#
# intFun = function(ws) {
# (thisCdf(ws) - (ws >= thisTruth))^2
# }
#
# if(is.null(estMat)) {
# # when integrating we will set bounds on the integral to be reasonable to avoid
# # faulty "divergent integral" error. The bounds will be 20 standard errors out
# # of the estimate, making sure to include the truth.
# lowerBound = max(0, min(thisTruth - .01, expit(thisEst - 20 * sqrt(thisVar))))
# upperBound = min(1, max(thisTruth + .01, expit(thisEst + 20 * sqrt(thisVar))))
# integrate(intFun, lowerBound, upperBound)$value
# }
# else {
# # since we are using the empirical distribution, there is a closed form for the integral
# ps = estMat[rowI,]
# allPoints = sort(c(ps, thisTruth))
# deltas = diff(allPoints)
# sum(deltas * intFun(allPoints[1:length(ps)]))
# }
# }
#
# crpsRow2 = function(rowI) {
# thisTruth = truth[rowI]
#
# # either build the predictive cdf assuming normality on the logit scale or from the
# # empirical distribution given by estMat if the user supplies it
# if(is.null(estMat)) {
# thisEst = est[rowI]
# thisVar = my.var[rowI]
# thisCdf = function(ws) {pnorm(logit(ws), thisEst, sqrt(thisVar))}
# } else {
# # thisCdf = ecdf(estMat[rowI,])
# sorted = sort(estMat[rowI,])
# thisCdf = approxfun(sorted, (1:length(sorted))/length(sorted),
# method = "constant", yleft = 0, yright = 1, f = 0, ties = "ordered")
# }
#
# intFun = function(ws) {
# (thisCdf(ws) - (ws >= thisTruth))^2
# }
#
# if(is.null(estMat)) {
# # when integrating we will set bounds on the integral to be reasonable to avoid
# # faulty "divergent integral" error. The bounds will be 20 standard errors out
# # of the estimate, making sure to include the truth.
# lowerBound = max(0, min(thisTruth - .01, expit(thisEst - 20 * sqrt(thisVar))))
# upperBound = min(1, max(thisTruth + .01, expit(thisEst + 20 * sqrt(thisVar))))
# integrate(intFun, lowerBound, upperBound)$value
# }
# else {
# # since we are using the empirical distribution, there is a closed form for the integral
# allPoints = sort(c(sorted, thisTruth))
# firstGreater = match(TRUE, sorted >= thisTruth)
# if(is.na(firstGreater))
# allPoints = c(sorted, thisTruth)
# else if(firstGreater == 1)
# allPoints = c(thisTruth, sorted)
# else
# allPoints = c(sorted[1:(firstGreater - 1)], thisTruth, sorted[firstGreater:length(sorted)])
#
# deltas = diff(allPoints)
# sum(deltas * intFun(allPoints[1:length(sorted)]))
# }
# }
crpsRow = function(rowI) {
thisTruth = truth[rowI]
# build the predictive cdf assuming from the empirical distribution given by
# estMat
# thisCdf = ecdf(estMat[rowI,])
sorted = estMat[rowI,] # already sorted
# since we are using the empirical distribution, there is a closed form for the integral
allPoints = sort(c(sorted, thisTruth))
deltas = diff(allPoints)
firstGreater = match(TRUE, sorted >= thisTruth)
vals = (1:length(sorted))/length(sorted)
if(is.na(firstGreater))
return(sum((vals)^2 * deltas, na.rm=TRUE))
else if(firstGreater == 1)
return(deltas[1] + sum((1-vals[1:(length(sorted)-1)])^2 * deltas[2:length(deltas)], na.rm=TRUE))
else {
left = sum(vals[1:(firstGreater-1)]^2 * deltas[1:(firstGreater-1)], na.rm=TRUE)
mid = sum((1 - vals[firstGreater-1])^2 * deltas[firstGreater], na.rm=TRUE)
right = ifelse(firstGreater == length(vals), 0, sum((1 - vals[firstGreater:(length(vals)-1)])^2 * deltas[(firstGreater+1):length(deltas)], na.rm=TRUE))
return(left+mid+right)
}
# intFun = function(ws) {
# (thisCdf(ws) - (ws >= thisTruth))^2
# }
}
if(!is.null(estMat))
estMat = t(apply(estMat, 1, sort))
res = sapply(1:length(truth), crpsRow)
}
if(getAverage)
mean(res, na.rm=TRUE)
else
res
}
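# Example (a minimal sketch, not run on source): closed form gaussian CRPS versus the
# empirical-cdf version based on posterior draws.
if(FALSE) {
  set.seed(3)
  n = 25
  truth = rnorm(n)
  est = truth + rnorm(n, sd=.3)
  crps(truth, est=est, my.var=rep(.09, n))       # gaussian closed form
  estMat = matrix(rnorm(n * 2000, mean=est, sd=.3), nrow=n)
  crps(truth, estMat=estMat)                     # empirical-cdf version from the draws
  crps(truth, estMat=estMat, getAverage=FALSE)   # per-observation scores
}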
# either include both lower and upper, or include one of:
# - the joint estimate draw matrix (estMat)
# - estimates and variances (assumes gaussian)
# truth: the true empirical proportions of mortality rates within the regions or enumeration areas of interest
# lower: the lower end of the credible interval
# upper: the upper end of the credible interval
# estMat: a matrix of joint draws of estimates, with number of rows equal to the length of truth and number of
#         columns equal to the number of draws. If not included, a gaussian distribution is assumed. Can contain
#         Gaussian or discrete values such as empirical proportions
# significance: the credible level of the interval. By default 80%
# doFuzzyReject: based on https://www.jstor.org/stable/pdf/20061193.pdf
# ns: a vector of maximum possible counts (denominators) for each observation. Used only for random/fuzzy reject.
#     Can be left out, in which case it will be inferred from the minimum draw difference in each row of estMat.
# getAverage: if FALSE, returns the score for each individual observation. Otherwise returns the average over all observations
# NOTE: this does not account for fuzzy CIs for discrete data. The interval score is defined on p. 13 of:
#       https://www.tandfonline.com/doi/pdf/10.1198/016214506000001437?casa_token=0vXXqMZ3M2IAAAAA:BYmw_z2zaASEcAvFrNDf6PQ157vq6FAQuDuI9depRZp44RJ_M8zbY47CN_KGXHMXP9CHJL02bTDT
intervalScore = function(truth, est=NULL, var=NULL, lower=NULL, upper=NULL,
estMat=NULL, significance=.8, returnIntervalWidth=FALSE,
returnCoverage=FALSE, doFuzzyReject=TRUE, getAverage=TRUE, ns=NULL){
if(any(is.null(lower)) || any(is.null(upper))) {
# if the user did not supply their own credible intervals, we must get them ourselves given the other information
if(is.null(estMat) && (is.null(est) || is.null(var)))
stop("either include both lower and upper, est and var, or estMat")
if(!is.null(est) && !is.null(var) && is.null(estMat)) {
# in this case, we must calculate lower and upper assuming gaussianity
lower = qnorm((1 - significance) / 2, est, sqrt(var))
upper = qnorm(1 - (1 - significance) / 2, est, sqrt(var))
}
else {
# we don't have information about the predictive distribution, and don't assume normality.
# Instead, use the user supplied to probability matrix estMat
# take the quantiles of the probability draws
CIs = apply(estMat, 1, function(ps) {quantile(ps, probs=c((1 - significance) / 2, 1 - (1 - significance) / 2))})
lower = CIs[1,]
upper = CIs[2,]
}
}
if(any(lower > upper)) {
warning("lower > upper, reordering")
tmp = lower
wrongOrder = lower > upper
lower[wrongOrder] = upper[wrongOrder]
upper[wrongOrder] = tmp[wrongOrder]
}
greaterThanLower = lower <= truth
lessThanUpper = upper >= truth
if(returnCoverage) {
cvg = greaterThanLower & lessThanUpper
}
if(returnIntervalWidth)
width = upper - lower
if(doFuzzyReject) {
# in this case, we fuzzy reject if the truth is at the edge of the credible interval. First
# determine which values are at the edge of the intervals, then determine the probability of rejection
# for each, then adjust the coverage indicators and interval endpoints accordingly
atLowerEdge = which(lower == truth)
atUpperEdge = which(upper == truth)
probRejectLower = sapply(atLowerEdge, function(i) {((1 - significance) / 2 - mean(estMat[i,] < lower[i])) / mean(estMat[i,] == lower[i])})
probRejectUpper = sapply(atUpperEdge, function(i) {((1 - significance) / 2 - mean(estMat[i,] > upper[i])) / mean(estMat[i,] == upper[i])})
rejectLower = probRejectLower
rejectUpper = probRejectUpper
# determine minimum differences between probabilities
if(is.null(ns))
deltas = apply(estMat, 1, function(x) {min(diff(sort(unique(x))))})
else
deltas = 1 / ns
# shrink the CI endpoints (and width) based on the fuzzy boundaries. rejectLower and rejectUpper
# only contain entries for the observations whose interval edge equals the truth
adjustLower = adjustUpper = rep(0, length(lower))
adjustLower[atLowerEdge] = deltas[atLowerEdge] * as.numeric(rejectLower)
adjustUpper[atUpperEdge] = deltas[atUpperEdge] * as.numeric(rejectUpper)
upper = upper - adjustUpper
lower = lower + adjustLower
if(returnIntervalWidth)
width = width - adjustLower - adjustUpper
# width = upper - lower (this should be the same as above)
if(returnCoverage) {
if(length(atLowerEdge) != 0) {
cvg[atLowerEdge] = sapply(1:length(atLowerEdge), function(i) {min(cvg[atLowerEdge][i], (1-rejectLower[i]))})
}
if(length(atUpperEdge) != 0) {
cvg[atUpperEdge] = sapply(1:length(atUpperEdge), function(i) {min(cvg[atUpperEdge][i], (1-rejectUpper[i]))})
}
}
}
# calculate interval score
alpha = 1 - significance
theseScores = upper - lower +
2/alpha * (lower - truth) * as.numeric(!greaterThanLower) +
2/alpha * (truth - upper) * as.numeric(!lessThanUpper)
if(getAverage)
allResults = c(intScore=mean(theseScores, na.rm=TRUE))
else
allResults = c(intScore=theseScores)
if(returnCoverage) {
if(getAverage)
allResults = c(allResults, coverage=mean(cvg, na.rm=TRUE))
else
allResults = cbind(allResults, coverage=cvg)
}
if(returnIntervalWidth) {
if(getAverage)
allResults = c(allResults, width=mean(width, na.rm=TRUE))
else
allResults = cbind(allResults, width=width)
}
allResults
}
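# Example (a sketch, guarded from running on source): 80% interval scores from gaussian
# summaries and from a matrix of draws, also returning coverage and interval width.
if(FALSE) {
  set.seed(4)
  n = 100
  truth = rnorm(n)
  est = truth + rnorm(n, sd=.5)
  intervalScore(truth, est=est, var=rep(.25, n), returnCoverage=TRUE,
                returnIntervalWidth=TRUE, doFuzzyReject=FALSE)
  estMat = matrix(rnorm(n * 1000, mean=est, sd=.5), nrow=n)
  intervalScore(truth, estMat=estMat, returnCoverage=TRUE, returnIntervalWidth=TRUE)
}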
# averages a list of binned score tables, each the binnedResults table returned by getScores when distances and breaks are set by the user
averageBinnedScores = function(tableList) {
if(length(tableList) == 1) {
# base case
return(as.data.frame(tableList[[1]]))
} else if(is.null(tableList[[2]])) {
# minor case (some of the tables might be NULL)
return(averageBinnedScores(tableList[-2]))
} else {
# recursive case
firstTable = tableList[[1]]
secondTable = tableList[[2]]
# make sure all tables are matrices
firstTable = as.matrix(firstTable)
secondTable = as.matrix(secondTable)
# make sure tables have matched bins
uniqueDists = sort(unique(c(firstTable[,1], secondTable[,1])))
firstMatch = match(firstTable[,1], uniqueDists)
secondMatch = match(secondTable[,1], uniqueDists)
newFirstTable = matrix(0, nrow=length(uniqueDists), ncol=ncol(firstTable))
newFirstTable[,1] = uniqueDists
newFirstTable[firstMatch,] = firstTable
colnames(newFirstTable) = colnames(firstTable)
newSecondTable = matrix(0, nrow=length(uniqueDists), ncol=ncol(secondTable))
newSecondTable[,1] = uniqueDists
newSecondTable[secondMatch,] = secondTable
colnames(newSecondTable) = colnames(secondTable)
firstTable = newFirstTable
secondTable = newSecondTable
# calculate weights for averaging
# ns1 = firstTable$nPerBin
# ns2 = secondTable$nPerBin
ns1 = firstTable[,2]
ns2 = secondTable[,2]
nsTotal = ns1 + ns2
ws1 = ns1 / nsTotal
ws2 = ns2 / nsTotal
# perform weighted averaging
newTable = firstTable
newTable[,2] = ns1 + ns2
newTable[,3:ncol(firstTable)] = sweep(firstTable[,3:ncol(firstTable)], 1, ws1, "*") + sweep(secondTable[,3:ncol(secondTable)], 1, ws2, "*")
# return results recursively
return(averageBinnedScores(c(list(newTable), tableList[-(1:2)])))
}
}
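# Example (a sketch, not run on source): combine the binned score tables from two hypothetical
# validation folds, weighting each distance bin by its number of observations. The simulated
# folds below are assumptions for illustration only.
if(FALSE) {
  set.seed(5)
  makeFoldTable = function(n) {
    truth = rnorm(n)
    estMat = matrix(rnorm(n * 500, mean=truth, sd=1), nrow=n)
    getScores(truth, estMat=estMat, distances=runif(n, 0, 10),
              breaks=seq(0, 10, by=2))$binnedResults
  }
  averageBinnedScores(list(makeFoldTable(40), makeFoldTable(60)))
}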
aggregateScoresByDistance = function(singleScores, breaks=30, observationType=c("All", "Urban", "Rural"), predictionType=c("All", "Urban", "Rural"),
dat=NULL, targetPop=c("women", "children"), distanceBreaksType=c("quantiles", "even"), nPerBin=NULL, maxDist=Inf) {
# singleScores is expected to contain per-observation scores along with columns dataI, NNDist, NNDistU,
# and NNDistu (nearest neighbor distances to all, urban, and rural observations, respectively)
targetPop = match.arg(targetPop)
observationType = match.arg(observationType)
predictionType = match.arg(predictionType)
distanceBreaksType = match.arg(distanceBreaksType)
# subset prediction points by urbanicity if necessary. First determine whether prediction points are urban, then filter
if(targetPop == "women") {
resultNameRoot="Ed"
if(is.null(dat)) {
out = load("../U5MR/kenyaDataEd.RData")
dat = ed
}
load("../U5MR/popGridAdjustedWomen.RData")
} else if(targetPop == "children") {
resultNameRoot="Mort"
if(is.null(dat)) {
out = load("../U5MR/kenyaData.RData")
dat = mort
}
load("../U5MR/popGridAdjusted.RData")
}
predictionUrban = dat$urban[singleScores$dataI]
if(predictionType == "Urban") {
singleScores = singleScores[predictionUrban,]
} else if(predictionType == "Rural") {
singleScores = singleScores[!predictionUrban,]
}
# now determine the type of observation (all, urban, or rural) to which the nearest neighbor distance is measured
distanceType = ""
if(observationType=="Urban") {
distanceType = "U"
} else if(observationType == "Rural"){
distanceType = "u"
}
distanceVar = paste0("NNDist", distanceType)
distances = singleScores[[distanceVar]]
# remove distances beyond the maximum of breaks
if(length(breaks) != 1) {
maxDist = min(maxDist, max(breaks))
}
badDistances = distances >= maxDist
singleScores = singleScores[!badDistances,]
distances = distances[!badDistances]
# sort table by distances
sortI = sort(distances, index.return=TRUE)$ix
singleScores = singleScores[sortI,]
distances = distances[sortI]
# calculate default breaks for the bin limits if necessary
if(length(breaks) == 1) {
nBreaks = breaks
if(distanceBreaksType == "even" && is.null(nPerBin)) {
breaks = seq(0, max(distances), l=nBreaks)
} else {
if(is.null(nPerBin))
nPerBin = ceiling(nrow(singleScores)/nBreaks)
# get endpoints of the bins, average their values when calculating breaks
startI = seq(nPerBin+1, nrow(singleScores), by=nPerBin)
endI = startI - 1
breaks = c(0, c(rowMeans(cbind(distances[startI], distances[endI]))), distances[length(distances)]+1e-6)
}
}
# construct the distance bins with which to group the data and compute scores within
binsI = cut(distances, breaks, labels=1:(length(breaks)-1), include.lowest=TRUE)
centers = breaks[1:(length(breaks)-1)] + diff(breaks)/2
uniqueBinsI = sort(unique(binsI))
# determine the number of observations per bin
nPerBin = as.numeric(table(binsI))
# helper function to compute the scoring rules for a given bin
getSubScores = function(uniqueBinI) {
thisDatI = binsI == uniqueBinI
# thisSingleScoresBinomial = data.frame(c(list(Region=thisRegion, dataI=which(thisSampleI), NNDist=nndistsAA, NNDistU=nndistsUA, NNDistu=nndistsuA), getScores(truth, est, vars, lower, upper, estMatBinomial, getAverage=FALSE), Time=time[3]))
colMeans(singleScores[thisDatI,-c(1, 2)])
}
# calculate scores for each bin individually
binnedScores = t(sapply(uniqueBinsI, getSubScores))
# make sure each variable in binnedScores is a numeric, not a list...
temp = matrix(unlist(binnedScores), nrow=length(uniqueBinsI))
theseNames = colnames(binnedScores)
binnedScores = data.frame(temp)
names(binnedScores) = theseNames
out = as.data.frame(cbind(nPerBin=nPerBin[uniqueBinsI], binnedScores))
out[[distanceVar]] = centers[uniqueBinsI]
out
}
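# Example call (a sketch only; `mySingleScores` is a hypothetical placeholder assumed to hold one
# row per held-out observation, with columns such as Region, dataI, NNDist, NNDistU, and NNDistu
# followed by per-observation scores from getScores(..., getAverage=FALSE). With dat left NULL the
# function loads the Kenya data from the relative paths above, so this only runs in that setup):
if(FALSE) {
  aggregateScoresByDistance(mySingleScores, breaks=30, observationType="Urban",
                            predictionType="All", targetPop="children",
                            nPerBin=50, maxDist=100)
}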
aggregateScoresByDistanceBasic = function(singleScores, breaks=30, distanceVar="NNDist",
distanceBreaksType=c("quantiles", "even"),
nPerBin=NULL, maxDist=Inf) {
distanceBreaksType = match.arg(distanceBreaksType)
# determine which distance column of singleScores to use
distanceVarI = grep(distanceVar, colnames(singleScores))
distances = singleScores[,distanceVarI]
# remove distances beyond the maximum of breaks
if(length(breaks) != 1) {
maxDist = min(maxDist, max(breaks))
}
badDistances = distances >= maxDist
singleScores = singleScores[!badDistances,]
distances = distances[!badDistances]
# sort table by distances
sortI = sort(distances, index.return=TRUE)$ix
singleScores = singleScores[sortI,]
distances = distances[sortI]
# calculate default breaks for the bin limits if necessary
if(length(breaks) == 1) {
nBreaks = breaks
if(distanceBreaksType == "even" && is.null(nPerBin)) {
breaks = seq(0, max(distances), l=nBreaks)
} else {
if(is.null(nPerBin))
nPerBin = ceiling(nrow(singleScores)/nBreaks)
# get endpoints of the bins, average their values when calculating breaks
startI = seq(nPerBin+1, nrow(singleScores), by=nPerBin)
endI = startI - 1
breaks = c(0, c(rowMeans(cbind(distances[startI], distances[endI]))), distances[length(distances)]+1e-6)
}
}
# construct the distance bins with which to group the data and compute scores within
binsI = cut(distances, breaks, labels=1:(length(breaks)-1), include.lowest=TRUE)
centers = breaks[1:(length(breaks)-1)] + diff(breaks)/2
uniqueBinsI = sort(unique(binsI))
# determine the number of observations per bin
nPerBin = as.numeric(table(binsI))
# helper function to compute the scoring rules for a given bin
getSubScores = function(uniqueBinI) {
thisDatI = binsI == uniqueBinI
colMeans(singleScores[thisDatI,-distanceVarI], na.rm=TRUE)
}
# calculate scores for each bin individually
binnedScores = t(sapply(uniqueBinsI, getSubScores))
# make sure each variable in binnedScores is a numeric, not a list...
temp = matrix(unlist(binnedScores), nrow=length(uniqueBinsI))
theseNames = colnames(binnedScores)
binnedScores = data.frame(temp)
names(binnedScores) = theseNames
out = as.data.frame(cbind(nPerBin=nPerBin[uniqueBinsI], binnedScores))
out[[distanceVar]] = centers[uniqueBinsI]
out
}
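# Example (a minimal sketch, not run on source): bin simulated per-observation scores by a
# nearest neighbor distance column, with roughly equal numbers of observations per bin.
# The simulated truth, draws, and distances are assumptions for illustration only.
if(FALSE) {
  set.seed(6)
  n = 200
  truth = rnorm(n)
  estMat = matrix(rnorm(n * 500, mean=truth, sd=1), nrow=n)
  singleScores = data.frame(NNDist=runif(n, 0, 10),
                            getScores(truth, estMat=estMat, getAverage=FALSE))
  aggregateScoresByDistanceBasic(singleScores, breaks=10, distanceVar="NNDist",
                                 distanceBreaksType="quantiles")
}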