-
Notifications
You must be signed in to change notification settings - Fork 32
/
Copy pathordinal.R
112 lines (87 loc) · 3.7 KB
/
ordinal.R
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
## Grant Gasser
## Created 2/26/2019
## Ordinal Regression on ADNI Q3 Data
#### DATA DESCRIPTION ####
# Column reference for the clinical training CSV. Kept as ordinary comments:
# R has no triple-quoted block strings, and the former ''' ... ''' wrapper
# was a parse error that prevented the script from being sourced.
#
#   directory.id            : Id assigned to imaging directories. Not from LONI data
#   Subject (PTID)          : Participant ID
#   RID                     : Participant roster ID
#   Image.Data.ID           : MRI ID
#   Modality                : Image type
#   Visit                   : 1=screening scan
#   Acq.Date                : MRI date
#   DX.bl                   : Diagnosis at baseline
#   EXAMDATE                : Examination Date
#   AGE                     : Age at baseline
#   PTGENDER                : Sex
#   PTEDUCAT                : Years of Education
#   PTETHCAT                : Ethnicity
#   PTRACCAT                : Race
#   APOE4                   : APOE4 genotype
#   MMSE                    : MMSE score
#   imputed_genotype        : Challenge specific designation, TRUE=has imputed genotypes
#   APOE Genotype           : APOE allele 1 and allele 2 genotypes
#   Dx Codes for Submission : The LMCI in the ADNI data is equivalent to MCI in test.
#                             This column just converts LMCI->MCI
# Load the clinical training data and take a first look at it.
list.files('AD_Challenge_Training_Data_Clinical_Updated_7.22.2014')
dat <- read.csv(
  'AD_Challenge_Training_Data_Clinical_Updated_7.22.2014/ADNI_Training_Q3_APOE_CollectionADNI1Complete 1Yr 1.5T_July22.2014.csv'
)
head(dat)
dim(dat)
# Number of observations; used later to size the train/test split.
num_rows <- nrow(dat)
# Inspect the type of every column.
str(dat)
# APOE4 and imputed_genotype are categorical, so store them as factors.
dat <- transform(
  dat,
  APOE4 = factor(APOE4),
  imputed_genotype = factor(imputed_genotype)
)
# Labels: AD (Alzheimer's disease), LMCI (Late Mild Cognitive Impairment), CN (Cognitively Normal)
Y <- dat$DX.bl
# Class counts. table() reports them whether read.csv() produced a character
# or a factor column (summary() on a character vector only shows length/mode).
table(Y, useNA = 'ifany')
# Encode Y as an ordered factor so the model sees the spectrum CN < LMCI < AD.
dx_levels <- c('CN', 'LMCI', 'AD')
# factor() silently turns any value outside `levels` into NA; surface that
# instead of letting it go unnoticed.
unexpected_labels <- setdiff(unique(as.character(Y)), c(dx_levels, NA))
if (length(unexpected_labels) > 0) {
  warning(
    'DX.bl contains labels outside CN/LMCI/AD; they will become NA: ',
    paste(unexpected_labels, collapse = ', '),
    call. = FALSE
  )
}
Y <- factor(Y, levels = dx_levels, ordered = TRUE)
# y_test is deliberately not converted here: it does not exist yet at this
# point (referencing it raised "object 'y_test' not found"). It is created
# later as a slice of Y and therefore already carries the ordered levels.
head(Y)
# Goal: fit an ordinal regression treating cognitive state as a spectrum CN < LMCI < AD.
# Train-test split: first 75% of the rows for training, last 25% for testing
# (471 / 157 rows according to the original author's note).
# NOTE(review): the split is positional, not random. If the CSV rows are ordered
# in any meaningful way the test set may be unrepresentative -- consider
# shuffling with a fixed seed.
# Integer division keeps both sizes whole. With num_rows / 4 a row count that
# is not a multiple of 4 gave fractional indices (silently truncated, dropping
# the last row) and a fractional num_test.
num_test <- num_rows %/% 4
num_train <- num_rows - num_test
# seq_len() yields empty index vectors for empty partitions, unlike 1:0.
train_rows <- seq_len(num_train)
test_rows <- num_train + seq_len(num_test)
# Predictors are taken by position, columns 10:18.
# NOTE(review): per the data description at the top of the file these should be
# AGE through APOE.Genotype -- verify against names(dat).
predictor_cols <- 10:18
train <- dat[train_rows, predictor_cols]
test <- dat[test_rows, predictor_cols]
y_train <- Y[train_rows]
y_test <- Y[test_rows]
head(train)
dim(train)
str(train)
## FIT ORDINAL REGRESSION
library(MASS)
library(car)
library(glm.predict)
# Model every predictor except PTETHCAT (ethnicity; judged by the original
# author to add little information) and APOE.Genotype (which the original
# author reports polr cannot handle).
# Hess = TRUE keeps the Hessian so that summary() can be called on the fit.
ordinal.fit <- polr(
  y_train ~ . - PTETHCAT - APOE.Genotype,
  data = train,
  Hess = TRUE
)
summary(ordinal.fit) # AIC = 583 (value recorded by the original author)
# Predict the class of each held-out row and store the result as an ordered
# factor with the same CN < LMCI < AD levels as the labels.
test <- data.frame(test)
predictions <- factor(
  predict(ordinal.fit, newdata = test, type = 'class'),
  levels = c('CN', 'LMCI', 'AD'),
  ordered = TRUE
)
# Calculate the accuracy of the predictions on the held-out test set.
stopifnot(length(predictions) == length(y_test))
# TRUE where the predicted class equals the true label, FALSE where it does
# not, NA where either side is missing.
accuracy_vector <- (predictions == y_test)
head(accuracy_vector)
# Count the hits directly. Indexing table(accuracy_vector)[2] assumed that both
# FALSE and TRUE occur; with all-correct or all-wrong predictions it returned
# NA (or the wrong cell).
num_correct <- sum(accuracy_vector, na.rm = TRUE)
# Divide by the actual number of test cases, so a missing comparison counts as
# a miss and the result cannot drift from a separately computed test size.
accuracy <- num_correct / length(y_test)
# Accuracy = 70% (value recorded by the original author)
#This metric is not optimal though, as it does not distinguish between false positives and false negatives.
#For example, it is much worse to diagnose someone as Cognitively Normal (CN) when in fact they have Alzheimer's (AD) than it is
#to diagnose someone as having Late Mild Cognitive Impairment (LMCI) when they are Cognitively Normal (CN), along with many other cases.
# Examine the misclassified test cases: what was predicted for them, and what
# their true labels were.
is_wrong <- !accuracy_vector
incorrect_predictions <- predictions[is_wrong]
labels_incorrectly_predicted <- y_test[is_wrong]
# Share of each class among the wrong predictions, then among the true labels
# of those same cases, each drawn as its own bar chart.
predicted_share <- prop.table(table(incorrect_predictions))
actual_share <- prop.table(table(labels_incorrectly_predicted))
barplot(predicted_share)
barplot(actual_share)
#Among the incorrect predictions, CN was predicted too often (~50% of the time), even though CN made up <10% of the
#true labels of those misclassified observations. Such a prediction is a False Negative, and it would be very bad to
#predict CN when a patient has LMCI or AD.
#Distribution of incorrect predictions: 48% CN, 40% LMCI, 12% AD
#Distribution of labels on incorrect predictions: 8% CN, 60% LMCI, 30% AD
#Main problem: False Negatives. Predicting CN and under-predicting LMCI and AD.