Part4.Rmd

---
title: "Feature importance using DREMI scores - Application to Benedicte's cohort"
author: "Dimitrios Kleftogiannis"
date: "2024-03-15"
output: html_document
---

```{r setup, include=FALSE}
knitr::opts_chunk$set(echo = TRUE)
```

### Utility
  
This code is part of the study titled "Automated cell type annotation and exploration of single-cell signalling dynamics using mass cytometry".

The purpose of this code is to use XGBoost feature importance to identify the most stable features, meaning that we estimate the frequency of selection and the gain metric returned by the algorithm. The data used here are described in "Early response evaluation by single cell signaling profiling in acute myeloid leukemia", doi: 10.1038/s41467-022-35624-4

### Contact
  
Comments and bug reports are welcome, please email: Dimitrios Kleftogiannis (dimitrios.kleftogiannis@uib.no)

We are also interested to know about how you have used our framework, including any improvements that you have implemented.

You are free to modify, extend or distribute our source codes, as long as our copyright notice remains unchanged and included in its entirety. 

### License

This code is licensed under the MIT License.

Copyright 2023, University of Bergen (UiB) and NeuroSysMed, Norway


## Load R libraries.
We also define functions that are required
```{r load packages, echo=TRUE, eval=TRUE, error=TRUE, warning=FALSE,cache=TRUE}
library(ggplot2)
library(readxl)
library(dplyr)
library(ggridges)
library(MASS)
library(matrixStats)
library(reshape2)
library(ggthemes)
library(RColorBrewer)
library(ggbeeswarm)
library(DMwR)
library(xgboost)
library(pROC)
```


## Setting workspace and loading in-house functions
```{r initialise workspace,echo=TRUE,eval=TRUE,error=TRUE, warning=FALSE,cache=TRUE}
# Set the working directory and fix the random seed.
# NOTE: this is a hard-coded, machine-specific absolute path. The relative paths
# used further down (metadata spreadsheets, the DREMI .RDa file and the output
# PDFs) are resolved against it, so adjust it before running on another machine.
setwd("/Users/kleftogi/Desktop/CyTOF_paper_corrections")
# seed the RNG so that the random resampling/splitting below can be repeated
set.seed(1234)
```


## Use XGBoost feature importance 
The input is the DREMI scores after SMOTE resampling to an approximately 1:1 ratio between positive and negative classes.

We will assess the most stable features 

```{r predicting survival using XGBoost and SMOTE on DREMI,echo=TRUE,eval=TRUE,error=TRUE, warning=FALSE,cache=TRUE}

# Clinical information lives in external files that we are not allowed to share;
# please contact the authors to obtain them.
filename <- "AML_Benedicte_cohort/barcode_metadata_leukemia_cohort.xlsx"
md <- read_excel(filename)

# keep the samples of the '0h' condition and derive their raw-data paths and barcodes
patient_samples <- md[md$condition == '0h', ]
file_names_patients <- paste0('AML_Benedicte_cohort/raw_data/', patient_samples$file_name)
file_barcode_patients <- patient_samples$barcode

# cohort-level information
metadata_filename <- "AML_Benedicte_cohort/leukemia_cohort_info.xlsx"
md_0h <- read_excel(metadata_filename)

# pre-computed DREMI scores (expected to provide `DREMI_feature_vector`, which is used below)
load('DREMI_feature_vector_v2.RDa')

# markers of interest - for this example we focus on signaling markers only
signal.col.names <- c("pAxl", "CyclinB1", "pNFkB", "pErk", "pSTAT1",
                      "pP38", "pSTAT3", "pCREB", "pHist3", "Casp3",
                      "pSTAT5", "p4EBP1", "pAkt", "pRB", "pS6")

# every ordered pair of two different signaling markers, named "<marker1>-<marker2>"
x <- signal.col.names
markerGrid <- expand.grid(x, x)
markerGrid <- markerGrid[markerGrid$Var1 != markerGrid$Var2, ]
testRelation <- paste0(markerGrid$Var1, '-', markerGrid$Var2)

# cell types that are modelled separately
cellTypes <- c('B', 'CD4_T', 'CD8_T', 'HSCs_MPPs', 'Monocytes', 'NK', 'pDCs')

####################################################################################################
# SMOTE + XGBoost (1:1 ratio): containers for the performance and the feature importance
totalPerformanceXGBoostDREMI_1_1 <- data.frame()
N <- 1000

# Build one empty importance table: one row per marker pair in `testRelation`,
# N zero-filled iteration columns (V1..VN) and a trailing `Feature` column
# holding the pair name. A zero means "feature not reported in that iteration".
newImportanceTable <- function() {
  importanceTable <- as.data.frame(matrix(0, nrow = length(testRelation), ncol = N))
  importanceTable$Feature <- testRelation
  importanceTable
}

# one Gain table and one Frequency table per cell type
featureImportanceSummaryGain_B <- newImportanceTable()
featureImportanceSummaryFrequency_B <- newImportanceTable()

featureImportanceSummaryGain_CD4T <- newImportanceTable()
featureImportanceSummaryFrequency_CD4T <- newImportanceTable()

featureImportanceSummaryGain_CD8T <- newImportanceTable()
featureImportanceSummaryFrequency_CD8T <- newImportanceTable()

featureImportanceSummaryGain_HSCsMPPs <- newImportanceTable()
featureImportanceSummaryFrequency_HSCsMPPs <- newImportanceTable()

featureImportanceSummaryGain_Monocytes <- newImportanceTable()
featureImportanceSummaryFrequency_Monocytes <- newImportanceTable()

featureImportanceSummaryGain_pDCs <- newImportanceTable()
featureImportanceSummaryFrequency_pDCs <- newImportanceTable()

featureImportanceSummaryGain_NK <- newImportanceTable()
featureImportanceSummaryFrequency_NK <- newImportanceTable()

# Repeat N times, for every cell type:
#   1. SMOTE-resample the DREMI feature vectors,
#   2. split them randomly into train+validation (80%) and test (20%),
#   3. tune max_depth on a 60/40 train/validation split (best validation F1),
#   4. refit on train+validation, store the Gain/Frequency feature importance of
#      the final model in column `iter` of the per-cell-type tables,
#   5. append the test-set performance to totalPerformanceXGBoostDREMI_1_1.
# NOTE(review): SMOTE is applied before the train/test split, so synthetic
# samples interpolated from held-out samples can end up in the training set;
# the performance estimates may therefore be optimistic.
start_time <- Sys.time()
for (iter in seq_len(N)) {

  for (currentCellType in cellTypes) {

    # fetch the data of the current cell type
    currentDataValues <- DREMI_feature_vector[DREMI_feature_vector$CellType == currentCellType, ]
    # a -1 in the first column means that DREMI was not computed for that
    # sample, so those samples are filtered out
    a <- which(currentDataValues[, 1] == -1)
    if (length(a) > 0) {
      currentDataValues <- currentDataValues[-a, ]
    }

    # keep the columns needed for SMOTE sampling
    # (presumably 1:210 are the DREMI pair features and 218 is SurvivalGroup - TODO confirm)
    currentDataValues_filt <- currentDataValues[, c(1:210, 218)]
    currentDataValues_filt$SurvivalGroup <- as.factor(currentDataValues_filt$SurvivalGroup)
    # perc.over = 1000 generates 10 synthetic cases per minority case and
    # perc.under = 120 draws 1.2 majority cases per synthetic case, i.e. roughly
    # 11:12 minority:majority (perc.under = 110 would give exactly balanced data)
    currentDataValues_new <- SMOTE(SurvivalGroup ~ ., currentDataValues_filt,
                                   perc.over = 1000, perc.under = 120)
    currentDataValues_new$SurvivalStatus <- ifelse(currentDataValues_new$SurvivalGroup == 'STS', 1, 0)

    # generate random disjoint sets: 80% for training/validation, 20% for testing
    cut_point <- round(nrow(currentDataValues_new) * 0.8)
    a <- sample(nrow(currentDataValues_new))
    asel <- a[1:cut_point]
    train_valid_Set <- currentDataValues_new[asel, ]
    asel <- a[(cut_point + 1):length(a)]
    testSet <- currentDataValues_new[asel, ]

    # if the test set contains samples from one class only, the AUC cannot be
    # computed, thus we skip this cell type for this iteration
    check1 <- which(testSet$SurvivalStatus == 0)
    check2 <- which(testSet$SurvivalStatus == 1)
    if (length(check1) == 0 || length(check2) == 0) {
      next
    }

    # split train_valid_Set into the actual training set (60%) and the validation set (40%)
    cut_point <- round(nrow(train_valid_Set) * 0.6)
    a <- sample(nrow(train_valid_Set))
    asel <- a[1:cut_point]
    trainSet <- train_valid_Set[asel, ]
    asel <- a[(cut_point + 1):length(a)]
    validationSet <- train_valid_Set[asel, ]
    # confusionMatrix() needs factors with both levels present; convert once
    validationSet$SurvivalStatus <- factor(validationSet$SurvivalStatus, levels = c(0, 1))

    # tune the tree depth on the validation set
    depthTunning <- data.frame()
    for (depthVal in c(2, 4, 6, 8, 10, 12, 14, 16, 18, 20, 30, 40, 50, 60)) {

      Model <- xgboost(data = as.matrix(trainSet[, testRelation]),
                       label = trainSet$SurvivalStatus,
                       max_depth = depthVal,
                       eta = 0.3,
                       nthread = 2,
                       nrounds = 4,
                       objective = "binary:logistic",
                       verbose = 0)

      pred <- predict(Model, as.matrix(validationSet[, testRelation]))
      prediction <- factor(as.numeric(pred > 0.5), levels = c(0, 1))

      # assess performance; caret is not attached by this document, so the
      # function is called through its namespace
      perf <- caret::confusionMatrix(prediction, validationSet$SurvivalStatus, positive = '1')
      perf <- as.data.frame(t(perf$byClass))
      perf$depthVal <- depthVal
      depthTunning <- rbind(depthTunning, perf)
    }

    # find the depthVal that maximises F1 in the validation set
    # (which.max() ignores NA; fall back to depth 4 when every F1 is NA)
    a <- which.max(depthTunning$F1)
    if (length(a) > 0) {
      bestdepthVal <- depthTunning[a, 'depthVal']
    } else {
      bestdepthVal <- 4
    }

    # final model after tuning, trained on train+validation and tested on the test set
    bstModel <- xgboost(data = as.matrix(train_valid_Set[, testRelation]),
                        label = train_valid_Set$SurvivalStatus,
                        max_depth = bestdepthVal,
                        eta = 0.3,
                        nthread = 2,
                        nrounds = 4,
                        objective = "binary:logistic",
                        verbose = 0)

    importance_matrix <- xgb.importance(model = bstModel)

    # store the importance of this iteration in column `iter` of the tables of
    # the current cell type; features that the model did not use keep their 0.
    # a bit ugly part of code, but it is easy to control and avoids nested lists
    if (currentCellType == 'B') {
      dr <- match(importance_matrix$Feature, featureImportanceSummaryGain_B$Feature)
      featureImportanceSummaryGain_B[dr, iter] <- importance_matrix$Gain
      featureImportanceSummaryFrequency_B[dr, iter] <- importance_matrix$Frequency
    } else if (currentCellType == 'CD4_T') {
      dr <- match(importance_matrix$Feature, featureImportanceSummaryGain_CD4T$Feature)
      featureImportanceSummaryGain_CD4T[dr, iter] <- importance_matrix$Gain
      featureImportanceSummaryFrequency_CD4T[dr, iter] <- importance_matrix$Frequency
    } else if (currentCellType == 'CD8_T') {
      dr <- match(importance_matrix$Feature, featureImportanceSummaryGain_CD8T$Feature)
      featureImportanceSummaryGain_CD8T[dr, iter] <- importance_matrix$Gain
      featureImportanceSummaryFrequency_CD8T[dr, iter] <- importance_matrix$Frequency
    } else if (currentCellType == 'HSCs_MPPs') {
      dr <- match(importance_matrix$Feature, featureImportanceSummaryGain_HSCsMPPs$Feature)
      featureImportanceSummaryGain_HSCsMPPs[dr, iter] <- importance_matrix$Gain
      featureImportanceSummaryFrequency_HSCsMPPs[dr, iter] <- importance_matrix$Frequency
    } else if (currentCellType == 'Monocytes') {
      dr <- match(importance_matrix$Feature, featureImportanceSummaryGain_Monocytes$Feature)
      featureImportanceSummaryGain_Monocytes[dr, iter] <- importance_matrix$Gain
      featureImportanceSummaryFrequency_Monocytes[dr, iter] <- importance_matrix$Frequency
    } else if (currentCellType == 'NK') {
      dr <- match(importance_matrix$Feature, featureImportanceSummaryGain_NK$Feature)
      featureImportanceSummaryGain_NK[dr, iter] <- importance_matrix$Gain
      featureImportanceSummaryFrequency_NK[dr, iter] <- importance_matrix$Frequency
    } else if (currentCellType == 'pDCs') {
      dr <- match(importance_matrix$Feature, featureImportanceSummaryGain_pDCs$Feature)
      featureImportanceSummaryGain_pDCs[dr, iter] <- importance_matrix$Gain
      featureImportanceSummaryFrequency_pDCs[dr, iter] <- importance_matrix$Frequency
    }

    # performance of the final model on the held-out test set
    pred <- predict(bstModel, as.matrix(testSet[, testRelation]))
    prediction <- factor(as.numeric(pred > 0.5), levels = c(0, 1))
    testSet$SurvivalStatus <- factor(testSet$SurvivalStatus, levels = c(0, 1))
    perf <- caret::confusionMatrix(prediction, testSet$SurvivalStatus, positive = '1')
    perf <- as.data.frame(t(perf$byClass))

    # ROC using the pROC package. The AUC is computed from the predicted
    # probabilities (not from the thresholded 0/1 calls, which would collapse the
    # ROC curve to a single point), with class 0 as controls, class 1 as cases
    # and a fixed direction, so that pROC cannot flip the comparison and inflate
    # the AUC of a poor model.
    tS <- as.numeric(testSet$SurvivalStatus) - 1
    a <- roc(response = tS, predictor = pred, levels = c(0, 1), direction = "<")

    # store the AUC as a plain number (the `auc` object carries the whole ROC
    # object as an attribute)
    perf$AUC <- as.numeric(a$auc)
    perf$depthVal <- bestdepthVal
    perf$CellType <- currentCellType
    perf$Iter <- iter
    totalPerformanceXGBoostDREMI_1_1 <- rbind(totalPerformanceXGBoostDREMI_1_1, perf)
  }

  # progress report and memory clean-up every 50 iterations
  if (iter %% 50 == 0) {
    cat(paste0('Processed iter: ', iter, ' /', N), '\n', sep = '')
    gc()
  }
}
end_time <- Sys.time()
end_time - start_time


# release the memory that is no longer needed after the iterations
gc()

#summarise selected features per cell type
# ggrepel provides geom_label_repel(), used to label the points of the scatter plots
library(ggrepel)
# colour palette (28 entries: hex codes and R colour names); the plots of each
# cell type take one colour from it by position
db13 <- c('#e6194b', '#3cb44b', '#ffe119', '#4363d8', '#f58231', '#911eb4', '#46f0f0', '#f032e6', 
          '#bcf60c', '#fabebe', '#008080', '#e6beff', '#9a6324', '#fffac8', '#800000', 'slategray3',
          'khaki3','bisque3','coral1','mediumaquamarine',
          '#aaffc3', '#808000', '#ffd8b1', '#000075', '#808080','gray88' ,'#ffffff', '#000000')


############################################################################################
############################################################################################

# Summarise, plot and save the XGBoost feature importance of one cell type.
#
# Arguments:
#   gain, freq : importance tables whose first `nIter` columns hold the
#                per-iteration Gain / Frequency values (0 = feature not used in
#                that iteration) and whose `Feature` column names the marker pair.
#   nIter      : number of iteration columns.
#   label      : cell-type label used in the plot titles.
#   fileStem   : prefix of the output files <fileStem>_Gain.pdf (bar plot) and
#                <fileStem>_Gain_Count.pdf (scatter plot).
#   fillColour : colour of the bars and points.
#   topN       : number of features shown in the bar plot.
# Returns (invisibly) a list with the summary table (`plotData`, sorted by
# decreasing average gain) and the two ggplot objects (`gainPlot`, `countPlot`).
summariseFeatureImportance <- function(gain, freq, nIter, label, fileStem,
                                       fillColour, topN = 10) {
  iterCols <- seq_len(nIter)
  gainValues <- gain[, iterCols, drop = FALSE]
  freqValues <- freq[, iterCols, drop = FALSE]

  # Counts = number of iterations in which the feature was selected, i.e. had a
  # non-zero frequency (vectorised instead of looping over every cell)
  plotData <- data.frame(Features = gain$Feature,
                         TotalGain = rowSums(gainValues),
                         AvgGain = rowMeans(gainValues),
                         AvgFreq = rowMeans(freqValues),
                         Counts = rowSums(freqValues != 0))
  plotData <- arrange(plotData, desc(AvgGain))

  gainLabel <- paste0('Avg. Gain (', format(nIter, scientific = FALSE), ' iter.)')
  sharedTheme <- theme(axis.text.y = element_text(size = 12),
                       axis.text.x = element_text(angle = 0, vjust = 0.5, hjust = 0.5, size = 12),
                       axis.title.x = element_text(size = 12, face = 'bold'),
                       strip.text = element_text(size = 12, face = 'bold', lineheight = 1),
                       legend.position = "none", aspect.ratio = 1)

  # horizontal bar plot of the topN features with the highest average gain
  gainPlot <- ggplot(head(plotData, topN), aes(x = reorder(Features, AvgGain), y = AvgGain)) +
    geom_bar(stat = 'identity', width = 0.58, alpha = 0.68, size = 0.2,
             color = 'gray8', fill = fillColour) +
    ylab(gainLabel) +
    ggtitle(paste0(label, ' - top', topN, ' features')) +
    theme_bw() +
    sharedTheme +
    theme(axis.title.y = element_blank()) +
    coord_flip()

  # average gain against the number of iterations in which a feature was selected
  countPlot <- ggplot(arrange(plotData, desc(AvgGain), Counts), aes(y = AvgGain, x = Counts)) +
    geom_point(size = 2, color = fillColour) +
    geom_label_repel(aes(label = Features),
                     box.padding = 0.1,
                     point.padding = 0.2,
                     size = 3,
                     segment.color = 'grey50') +
    ggtitle(label) +
    ylab(gainLabel) +
    xlab('#Times selected') +
    xlim(0, nIter) +
    theme_bw() +
    sharedTheme +
    theme(axis.title.y = element_text(size = 12))

  # write one plot to a PDF; the device is closed even if printing fails
  writePdf <- function(plotObject, path) {
    pdf(path)
    on.exit(dev.off(), add = TRUE)
    print(plotObject)
    invisible(path)
  }
  writePdf(gainPlot, paste0(fileStem, '_Gain.pdf'))
  writePdf(countPlot, paste0(fileStem, '_Gain_Count.pdf'))

  invisible(list(plotData = plotData, gainPlot = gainPlot, countPlot = countPlot))
}

############################################################################################
############################################################################################

#B
summariseFeatureImportance(featureImportanceSummaryGain_B,
                           featureImportanceSummaryFrequency_B,
                           N, 'B cells', 'B', db13[1])

#CD4_T
summariseFeatureImportance(featureImportanceSummaryGain_CD4T,
                           featureImportanceSummaryFrequency_CD4T,
                           N, 'CD4_T cells', 'CD4T', db13[2])

#CD8_T
summariseFeatureImportance(featureImportanceSummaryGain_CD8T,
                           featureImportanceSummaryFrequency_CD8T,
                           N, 'CD8_T cells', 'CD8T', db13[3])

#HSCs_MPPs
summariseFeatureImportance(featureImportanceSummaryGain_HSCsMPPs,
                           featureImportanceSummaryFrequency_HSCsMPPs,
                           N, 'HSCs & MPPs cells', 'HSCsMPP', db13[4])

#Monocytes
summariseFeatureImportance(featureImportanceSummaryGain_Monocytes,
                           featureImportanceSummaryFrequency_Monocytes,
                           N, 'Monocytes', 'Monocytes', db13[5])

#NK
summariseFeatureImportance(featureImportanceSummaryGain_NK,
                           featureImportanceSummaryFrequency_NK,
                           N, 'NK cells', 'NK', db13[6])
 
############################################################################################
############################################################################################  
 
 #pDC
n <- 7
dataGain <- featureImportanceSummaryGain_pDCs
dataFreq <- featureImportanceSummaryFrequency_pDCs

# per-feature summaries over the first N columns
# (presumably one column per XGBoost iteration - TODO confirm)
dataGain$TotalGain <- rowSums(dataGain[, 1:N])
dataGain$AverageGain <- rowMeans(dataGain[, 1:N])
dataFreq$AverageFreq <- rowMeans(dataFreq[, 1:N])
# turn every non-zero frequency into 1, column by column, so that the row sum
# below counts in how many of the N columns the feature was selected
dataFreq[1:N] <- lapply(dataFreq[1:N], function(v) as.numeric(v != 0))
dataFreq$TotalCount <- rowSums(dataFreq[, 1:N])

plotData <- data.frame(Features = dataGain$Feature,
                       TotalGain = dataGain$TotalGain,
                       AvgGain = dataGain$AverageGain,
                       AvgFreq = dataFreq$AverageFreq,
                       Counts = dataFreq$TotalCount)

plotData <- arrange(plotData, desc(AvgGain))
# plot the (up to) 10 features with the highest average gain;
# head() does not pad with NA rows when fewer than 10 features exist
tmp <- head(plotData, 10)
str <- 'pDC cells - top10 features'
o1 <- ggplot(tmp, aes(x = reorder(Features, AvgGain), y = AvgGain)) +
  geom_bar(stat = 'identity', width = 0.58, alpha = 0.68, size = 0.2, color = 'gray8', fill = db13[n]) +
  ylab('Avg. Gain (1000 iter.)') +
  ggtitle(str) +
  theme_bw() +
  theme(axis.text.y = element_text(size = 12),
        axis.text.x = element_text(angle = 0, vjust = 0.5, hjust = 0.5, size = 12),
        axis.title.x = element_text(size = 12, face = 'bold'),
        axis.title.y = element_blank(),
        strip.text = element_text(size = 12, face = 'bold', lineheight = 1),
        legend.position = "none", aspect.ratio = 1) +
  guides(fill = guide_legend(override.aes = list(size = 3), nrow = 1, title = "")) +
  coord_flip()

# scatter of average gain against the number of times each feature was selected
tmp <- arrange(plotData, desc(AvgGain), Counts)
str <- 'pDC cells'
o4 <- ggplot(tmp, aes(y = AvgGain, x = Counts)) +
  geom_point(size = 2, color = db13[n]) +
  geom_label_repel(aes(label = Features),
                   box.padding = 0.1,
                   point.padding = 0.2,
                   size = 3,
                   segment.color = 'grey50') +
  ggtitle(str) +
  ylab('Avg. Gain (1000 iter.)') +
  xlab('#Times selected') +
  xlim(0, 1000) +
  theme_bw() +
  theme(axis.text.y = element_text(size = 12),
        axis.text.x = element_text(angle = 0, vjust = 0.5, hjust = 0.5, size = 12),
        axis.title.x = element_text(size = 12, face = 'bold'),
        axis.title.y = element_text(size = 12),
        strip.text = element_text(size = 12, face = 'bold', lineheight = 1),
        legend.position = "none", aspect.ratio = 1)

# save both plots
myfile <- 'pDC_Gain.pdf'
pdf(myfile)
print(o1)
dev.off()

myfile <- 'pDC_Gain_Count.pdf'
pdf(myfile)
print(o4)
dev.off()

############################################################################################
############################################################################################  

# Finally, average every performance metric (sensitivity, specificity, F1, AUC)
# per cell type and plot the result.
meanPerformances <- data.frame()
for (currentCellType in cellTypes) {
  # mean of each metric over the rows of the current cell type, ignoring NAs
  tmp <- totalPerformanceXGBoostDREMI_1_1[totalPerformanceXGBoostDREMI_1_1$CellType == currentCellType, ] %>%
    group_by(CellType) %>%
    dplyr::summarize(
      meanSEN = mean(Sensitivity, na.rm = TRUE),
      meanSPE = mean(Specificity, na.rm = TRUE),
      meanF1 = mean(F1, na.rm = TRUE),
      meanAUC = mean(AUC, na.rm = TRUE)
    )
  tmp$CellType <- currentCellType
  meanPerformances <- rbind(meanPerformances, tmp)
}

# long format: one row per (cell type, metric); fix the order of both factors
tmp <- melt(meanPerformances, id.vars = c("CellType"))
colnames(tmp)[2] <- 'Metric'
tmp$CellType <- factor(tmp$CellType, levels = cellTypes)
tmp$Metric <- factor(tmp$Metric, levels = c('meanSEN', 'meanSPE', 'meanF1', 'meanAUC'))

# one horizontal bar chart per metric
performance_summary <- ggplot(tmp, aes(x = CellType, y = value, fill = CellType)) +
  geom_bar(stat = "identity", size = 0.3, width = 0.5, alpha = 0.88, color = 'black') +
  facet_wrap(~Metric, ncol = 2) +
  #geom_hline(yintercept = 0.9,color='red')+
  ylab('Mean performance per cell type') +
  theme_bw() +
  scale_fill_manual(values = db13, name = '') +
  theme(
    axis.text.y = element_text(size = 12),
    axis.text.x = element_text(angle = 0, vjust = 0.05, hjust = 0.95, size = 12),
    axis.title.x = element_text(size = 12),
    axis.title.y = element_blank(),
    strip.text = element_text(size = 10, face = 'bold', lineheight = 1),
    legend.position = "none",
    aspect.ratio = 1,
    legend.text = element_text(size = 12)
  ) +
  coord_flip()

myfile <- 'best_DREMI_avg_performance.pdf'
pdf(myfile)
print(performance_summary)
dev.off()
 
```


## Use the clinical data to find possible associations of selected DREMI features with clinical outcome
In this subsection we use the features from the previous analysis to investigate possible associations with survival.

```{r assocation with survival,echo=TRUE,eval=TRUE,error=TRUE, warning=FALSE,cache=TRUE}
library(survival)
library(survminer)

#clinical info
#load the clinical info --> not allowed to share this data; please contact the authors
metadata_filename <- "AML_Benedicte_cohort/leukemia_cohort_info.xlsx"
md_0h <- read_excel(metadata_filename)

# for each cell type below, the DREMI feature analysed is the one that achieves the
# highest gain and frequency of selection

# B cells -->
# Collect, for every patient found in the clinical metadata, the selected DREMI
# feature of the current cell type together with the survival annotations.
myCurrentCellType <- 'B'
mySelectedFeatures <- c('pP38-pSTAT3')
summary_patient_data <- data.frame()
for (idx in seq_len(nrow(patient_samples))) {

  # patients without an entry in the metadata are left out
  myPatientID <- patient_samples[['patient_id']][idx]
  a <- which(md_0h$Patient_nr_cytobank == myPatientID)
  if (length(a) == 0) {
    next
  }

  # Only the ELN risk is read from the metadata; survival time, karyotype and FLT3
  # come with the DREMI_feature_vector columns selected below.
  # The literal string 'NA' marks a missing risk; isTRUE() keeps the test from
  # failing when the cell is a real NA.
  ELN <- md_0h[['ELN 2017 risk']][a]
  if (isTRUE(ELN == 'NA')) {
    ELN <- NA
  }

  #fetch the patient cells
  promptKey <- paste0('P', idx)
  data <- DREMI_feature_vector[DREMI_feature_vector$PatientNr == promptKey &
                                 DREMI_feature_vector$CellType == myCurrentCellType, ]

  # skip patients that have no row for this cell type, or whose first column
  # holds -1, which means no DREMI score is available
  if (nrow(data) == 0 || isTRUE(data[[1]][1] == -1)) {
    next
  }

  data <- data[, c(mySelectedFeatures,
                   'PatientNr', 'PatientID', 'SurvivalTime', 'karyotype', 'FLT3',
                   'CellType', 'SurvivalStatus', 'SurvivalGroup')]
  data$ELN <- ELN
  summary_patient_data <- rbind(summary_patient_data, data)
}

# Plot 1: distribution of the selected DREMI feature per survival group (STS vs LTS)
plot_data <- melt(summary_patient_data[, c(mySelectedFeatures, 'SurvivalGroup')],
                  id.vars = 'SurvivalGroup')
plot_data$SurvivalGroup <- factor(plot_data$SurvivalGroup, levels = c('STS', 'LTS'))

# NOTE(review): color=SurvivalGroup is given to ggplot() itself rather than inside
# aes(), so it is not part of the aesthetic mapping - confirm whether that is intended.
DREMI_B_STS_LTS <- ggplot(plot_data,
                          aes(x = SurvivalGroup, y = value, fill = SurvivalGroup),
                          color = SurvivalGroup) +
  geom_beeswarm(size = 0.78, alpha = 0.8) +
  geom_boxplot(width = 0.38, size = 0.58, alpha = 0.8, fatten = TRUE,
               position = position_dodge(width = 0.78),
               outlier.colour = "black", outlier.shape = 21,
               outlier.fill = "gray", outlier.size = 0.4) +
  facet_wrap(~variable, scales = "free_x", ncol = 1) +
  theme_bw() +
  ggtitle('B cells') +
  ylab('DREMI scores') +
  scale_fill_manual(values = c('#e6194b', '#3cb44b'), name = '') +
  theme(axis.text.y = element_text(size = 12),
        axis.text.x = element_text(angle = 0, vjust = 1, hjust = 1, size = 12),
        axis.title.y = element_blank(),
        axis.title.x = element_text(size = 12),
        strip.text = element_text(size = 14, face = 'bold', lineheight = 1),
        legend.position = "none",
        legend.text = element_text(size = 12),
        strip.background = element_rect(colour = "black", fill = "white"),
        plot.title = element_text(size = 14, face = "bold", hjust = 0.5),
        aspect.ratio = 0.6,
        panel.spacing = unit(1.8, "lines")) +
  coord_flip()

# save the DREMI box plot
myfile <- 'DREMI_B_STS_LTS.pdf'
pdf(myfile)
print(DREMI_B_STS_LTS)
dev.off()


# Kaplan-Meier analysis: label each patient 'High' or 'Low' by comparing the DREMI
# score with the third default quantile (the median) of the collected values
test <- summary_patient_data
a1 <- quantile(test[['pP38-pSTAT3']])
test[['pP38-pSTAT3']] <- factor(ifelse(test[['pP38-pSTAT3']] > a1[3], 'High', 'Low'),
                                levels = c('Low', 'High'))

# copy of the first two columns, with the cell type prefixed to the feature name
data_B <- test[, c(1, 2)]
colnames(data_B)[1] <- paste0('B_', colnames(data_B)[1])

# the survival formula refers to the discretised feature (first column) as D1
colnames(test)[1] <- 'D1'
fit <- survfit(Surv(SurvivalTime, SurvivalStatus) ~ D1, data = test)

# NOTE(review): the x axis is labelled in months - confirm the unit of SurvivalTime
o <- ggsurvplot(fit, data = test,
                pval = TRUE,
                conf.int = FALSE,
                censor = FALSE,
                risk.table = TRUE,
                surv.median.line = "hv",
                xlab = 'Months from diagnosis', ylab = 'Survival probability',
                legend.title = "pP38-pSTAT3",
                palette = c("#1B9E77", "#E7298A"),
                tables.height = 0.2,
                tables.theme = theme_cleantable(),
                ggtheme = theme_bw(),
                font.legend = c(8, "bold", "darkblue"),
                font.x = c(12),
                font.y = c(12),
                font.tickslab = c(12))

# stack the survival curves on top of the risk table
o <- ggarrange(o$plot, o$table, heights = c(2, 0.7),
               ncol = 1, nrow = 2)

myfile <- 'KaplanMeier_B_D1.pdf'
pdf(myfile, onefile = TRUE, width = 6, height = 5)
print(o)
dev.off()

###################################################################################################

# CD4 T cells: collect, for every patient found in the clinical metadata, the selected
# DREMI feature of the current cell type together with the survival annotations.
myCurrentCellType <- 'CD4_T'
mySelectedFeatures <- c('pSTAT3-pCREB')
summary_patient_data <- data.frame()
for (idx in seq_len(nrow(patient_samples))) {

  # patients without an entry in the metadata are left out
  myPatientID <- patient_samples[['patient_id']][idx]
  a <- which(md_0h$Patient_nr_cytobank == myPatientID)
  if (length(a) == 0) {
    next
  }

  # Only the ELN risk is read from the metadata; survival time, karyotype and FLT3
  # come with the DREMI_feature_vector columns selected below.
  # The literal string 'NA' marks a missing risk; isTRUE() keeps the test from
  # failing when the cell is a real NA.
  ELN <- md_0h[['ELN 2017 risk']][a]
  if (isTRUE(ELN == 'NA')) {
    ELN <- NA
  }

  #fetch the patient cells
  promptKey <- paste0('P', idx)
  data <- DREMI_feature_vector[DREMI_feature_vector$PatientNr == promptKey &
                                 DREMI_feature_vector$CellType == myCurrentCellType, ]

  # skip patients that have no row for this cell type, or whose first column
  # holds -1, which means no DREMI score is available
  if (nrow(data) == 0 || isTRUE(data[[1]][1] == -1)) {
    next
  }

  data <- data[, c(mySelectedFeatures,
                   'PatientNr', 'PatientID', 'SurvivalTime', 'karyotype', 'FLT3',
                   'CellType', 'SurvivalStatus', 'SurvivalGroup')]
  data$ELN <- ELN
  summary_patient_data <- rbind(summary_patient_data, data)
}

# Plot 1: distribution of the selected DREMI feature per survival group (STS vs LTS)
plot_data <- melt(summary_patient_data[, c(mySelectedFeatures, 'SurvivalGroup')],
                  id.vars = 'SurvivalGroup')
plot_data$SurvivalGroup <- factor(plot_data$SurvivalGroup, levels = c('STS', 'LTS'))

# NOTE(review): color=SurvivalGroup is given to ggplot() itself rather than inside
# aes(), so it is not part of the aesthetic mapping - confirm whether that is intended.
DREMI_CD4T_STS_LTS <- ggplot(plot_data,
                             aes(x = SurvivalGroup, y = value, fill = SurvivalGroup),
                             color = SurvivalGroup) +
  geom_beeswarm(size = 0.78, alpha = 0.8) +
  geom_boxplot(width = 0.38, size = 0.58, alpha = 0.8, fatten = TRUE,
               position = position_dodge(width = 0.78),
               outlier.colour = "black", outlier.shape = 21,
               outlier.fill = "gray", outlier.size = 0.4) +
  facet_wrap(~variable, scales = "free_x", ncol = 1) +
  theme_bw() +
  ggtitle('CD4 T cells') +
  ylab('DREMI scores') +
  scale_fill_manual(values = c('#e6194b', '#3cb44b'), name = '') +
  theme(axis.text.y = element_text(size = 12),
        axis.text.x = element_text(angle = 0, vjust = 1, hjust = 1, size = 12),
        axis.title.y = element_blank(),
        axis.title.x = element_text(size = 12),
        strip.text = element_text(size = 14, face = 'bold', lineheight = 1),
        legend.position = "none",
        legend.text = element_text(size = 12),
        strip.background = element_rect(colour = "black", fill = "white"),
        plot.title = element_text(size = 14, face = "bold", hjust = 0.5),
        aspect.ratio = 0.6,
        panel.spacing = unit(1.8, "lines")) +
  coord_flip()

# save the DREMI box plot
myfile <- 'DREMI_CD4T_STS_LTS.pdf'
pdf(myfile)
print(DREMI_CD4T_STS_LTS)
dev.off()


# Kaplan-Meier analysis: label each patient 'High' or 'Low' by comparing the DREMI
# score with the third default quantile (the median) of the collected values
test <- summary_patient_data
a1 <- quantile(test[['pSTAT3-pCREB']])
test[['pSTAT3-pCREB']] <- factor(ifelse(test[['pSTAT3-pCREB']] > a1[3], 'High', 'Low'),
                                 levels = c('Low', 'High'))

# copy of the first two columns, with the cell type prefixed to the feature name
data_CD4T <- test[, c(1, 2)]
colnames(data_CD4T)[1] <- paste0('CD4T_', colnames(data_CD4T)[1])

# the survival formula refers to the discretised feature (first column) as D1
colnames(test)[1] <- 'D1'
fit <- survfit(Surv(SurvivalTime, SurvivalStatus) ~ D1, data = test)

# NOTE(review): the x axis is labelled in months - confirm the unit of SurvivalTime
o <- ggsurvplot(fit, data = test,
                pval = TRUE,
                conf.int = FALSE,
                censor = FALSE,
                risk.table = TRUE,
                surv.median.line = "hv",
                xlab = 'Months from diagnosis', ylab = 'Survival probability',
                legend.title = "pSTAT3-pCREB",
                palette = c("#1B9E77", "#E7298A"),
                tables.height = 0.2,
                tables.theme = theme_cleantable(),
                ggtheme = theme_bw(),
                font.legend = c(8, "bold", "darkblue"),
                font.x = c(12),
                font.y = c(12),
                font.tickslab = c(12))

# stack the survival curves on top of the risk table
o <- ggarrange(o$plot, o$table, heights = c(2, 0.7),
               ncol = 1, nrow = 2)

myfile <- 'KaplanMeier_CD4T_D1.pdf'
pdf(myfile, onefile = TRUE, width = 6, height = 5)
print(o)
dev.off()


###################################################################################################
#initialise the top selected features based on at least 0.75 selection frequency and gain above 0.15

# HSCs & MPPs: collect, for every patient found in the clinical metadata, the selected
# DREMI feature of the current cell type together with the survival annotations.
myCurrentCellType <- 'HSCs_MPPs'
mySelectedFeatures <- c('pSTAT3-pErk')
summary_patient_data <- data.frame()
for (idx in seq_len(nrow(patient_samples))) {

  # patients without an entry in the metadata are left out
  myPatientID <- patient_samples[['patient_id']][idx]
  a <- which(md_0h$Patient_nr_cytobank == myPatientID)
  if (length(a) == 0) {
    next
  }

  # Only the ELN risk is read from the metadata; survival time, karyotype and FLT3
  # come with the DREMI_feature_vector columns selected below.
  # The literal string 'NA' marks a missing risk; isTRUE() keeps the test from
  # failing when the cell is a real NA.
  ELN <- md_0h[['ELN 2017 risk']][a]
  if (isTRUE(ELN == 'NA')) {
    ELN <- NA
  }

  #fetch the patient cells
  promptKey <- paste0('P', idx)
  data <- DREMI_feature_vector[DREMI_feature_vector$PatientNr == promptKey &
                                 DREMI_feature_vector$CellType == myCurrentCellType, ]

  # skip patients that have no row for this cell type, or whose first column
  # holds -1, which means no DREMI score is available
  if (nrow(data) == 0 || isTRUE(data[[1]][1] == -1)) {
    next
  }

  data <- data[, c(mySelectedFeatures,
                   'PatientNr', 'PatientID', 'SurvivalTime', 'karyotype', 'FLT3',
                   'CellType', 'SurvivalStatus', 'SurvivalGroup')]
  data$ELN <- ELN
  summary_patient_data <- rbind(summary_patient_data, data)
}

# Plot 1: distribution of the selected DREMI feature per survival group (STS vs LTS)
plot_data <- melt(summary_patient_data[, c(mySelectedFeatures, 'SurvivalGroup')],
                  id.vars = 'SurvivalGroup')
plot_data$SurvivalGroup <- factor(plot_data$SurvivalGroup, levels = c('STS', 'LTS'))

# NOTE(review): color=SurvivalGroup is given to ggplot() itself rather than inside
# aes(), so it is not part of the aesthetic mapping - confirm whether that is intended.
DREMI_HSCsMPPs_STS_LTS <- ggplot(plot_data,
                                 aes(x = SurvivalGroup, y = value, fill = SurvivalGroup),
                                 color = SurvivalGroup) +
  geom_beeswarm(size = 0.78, alpha = 0.8) +
  geom_boxplot(width = 0.38, size = 0.58, alpha = 0.8, fatten = TRUE,
               position = position_dodge(width = 0.78),
               outlier.colour = "black", outlier.shape = 21,
               outlier.fill = "gray", outlier.size = 0.4) +
  facet_wrap(~variable, scales = "free_x", ncol = 1) +
  theme_bw() +
  ggtitle('HSCs & MPPs  cells') +
  ylab('DREMI scores') +
  scale_fill_manual(values = c('#e6194b', '#3cb44b'), name = '') +
  theme(axis.text.y = element_text(size = 12),
        axis.text.x = element_text(angle = 0, vjust = 1, hjust = 1, size = 12),
        axis.title.y = element_blank(),
        axis.title.x = element_text(size = 12),
        strip.text = element_text(size = 14, face = 'bold', lineheight = 1),
        legend.position = "none",
        legend.text = element_text(size = 12),
        strip.background = element_rect(colour = "black", fill = "white"),
        plot.title = element_text(size = 14, face = "bold", hjust = 0.5),
        aspect.ratio = 0.6,
        panel.spacing = unit(1.8, "lines")) +
  coord_flip()

# save the DREMI box plot
myfile <- 'DREMI_HSCsMPPs_STS_LTS.pdf'
pdf(myfile)
print(DREMI_HSCsMPPs_STS_LTS)
dev.off()


# Kaplan-Meier analysis: label each patient 'High' or 'Low' by comparing the DREMI
# score with the third default quantile (the median) of the collected values
test <- summary_patient_data
a1 <- quantile(test[['pSTAT3-pErk']])
test[['pSTAT3-pErk']] <- factor(ifelse(test[['pSTAT3-pErk']] > a1[3], 'High', 'Low'),
                                levels = c('Low', 'High'))

# copy of the first two columns, with the cell type prefixed to the feature name
data_HSCsMPPs <- test[, c(1, 2)]
colnames(data_HSCsMPPs)[1] <- paste0('HSCsMPPs_', colnames(data_HSCsMPPs)[1])

# the survival formula refers to the discretised feature (first column) as D1
colnames(test)[1] <- 'D1'
fit <- survfit(Surv(SurvivalTime, SurvivalStatus) ~ D1, data = test)

# NOTE(review): the x axis is labelled in months - confirm the unit of SurvivalTime
o <- ggsurvplot(fit, data = test,
                pval = TRUE,
                conf.int = FALSE,
                censor = FALSE,
                risk.table = TRUE,
                surv.median.line = "hv",
                xlab = 'Months from diagnosis', ylab = 'Survival probability',
                legend.title = "pSTAT3-pErk",
                palette = c("#1B9E77", "#E7298A"),
                tables.height = 0.2,
                tables.theme = theme_cleantable(),
                ggtheme = theme_bw(),
                font.legend = c(8, "bold", "darkblue"),
                font.x = c(12),
                font.y = c(12),
                font.tickslab = c(12))

# stack the survival curves on top of the risk table
o <- ggarrange(o$plot, o$table, heights = c(2, 0.7),
               ncol = 1, nrow = 2)

myfile <- 'KaplanMeier_HSCsMPPs_D1.pdf'
pdf(myfile, onefile = TRUE, width = 6, height = 5)
print(o)
dev.off()

###################################################################################################

# CD8 T cells: collect, for every patient found in the clinical metadata, the selected
# DREMI feature of the current cell type together with the survival annotations.
myCurrentCellType <- 'CD8_T'
mySelectedFeatures <- c('pCREB-pSTAT3')
summary_patient_data <- data.frame()
for (idx in seq_len(nrow(patient_samples))) {

  # patients without an entry in the metadata are left out
  myPatientID <- patient_samples[['patient_id']][idx]
  a <- which(md_0h$Patient_nr_cytobank == myPatientID)
  if (length(a) == 0) {
    next
  }

  # Only the ELN risk is read from the metadata; survival time, karyotype and FLT3
  # come with the DREMI_feature_vector columns selected below.
  # The literal string 'NA' marks a missing risk; isTRUE() keeps the test from
  # failing when the cell is a real NA.
  ELN <- md_0h[['ELN 2017 risk']][a]
  if (isTRUE(ELN == 'NA')) {
    ELN <- NA
  }

  #fetch the patient cells
  promptKey <- paste0('P', idx)
  data <- DREMI_feature_vector[DREMI_feature_vector$PatientNr == promptKey &
                                 DREMI_feature_vector$CellType == myCurrentCellType, ]

  # skip patients that have no row for this cell type, or whose first column
  # holds -1, which means no DREMI score is available
  if (nrow(data) == 0 || isTRUE(data[[1]][1] == -1)) {
    next
  }

  data <- data[, c(mySelectedFeatures,
                   'PatientNr', 'PatientID', 'SurvivalTime', 'karyotype', 'FLT3',
                   'CellType', 'SurvivalStatus', 'SurvivalGroup')]
  data$ELN <- ELN
  summary_patient_data <- rbind(summary_patient_data, data)
}

# Plot 1: distribution of the selected DREMI feature per survival group (STS vs LTS)
plot_data <- melt(summary_patient_data[, c(mySelectedFeatures, 'SurvivalGroup')],
                  id.vars = 'SurvivalGroup')
plot_data$SurvivalGroup <- factor(plot_data$SurvivalGroup, levels = c('STS', 'LTS'))

# NOTE(review): color=SurvivalGroup is given to ggplot() itself rather than inside
# aes(), so it is not part of the aesthetic mapping - confirm whether that is intended.
DREMI_CD8T_STS_LTS <- ggplot(plot_data,
                             aes(x = SurvivalGroup, y = value, fill = SurvivalGroup),
                             color = SurvivalGroup) +
  geom_beeswarm(size = 0.78, alpha = 0.8) +
  geom_boxplot(width = 0.38, size = 0.58, alpha = 0.8, fatten = TRUE,
               position = position_dodge(width = 0.78),
               outlier.colour = "black", outlier.shape = 21,
               outlier.fill = "gray", outlier.size = 0.4) +
  facet_wrap(~variable, scales = "free_x", ncol = 1) +
  theme_bw() +
  ggtitle('CD8 T cells') +
  ylab('DREMI scores') +
  scale_fill_manual(values = c('#e6194b', '#3cb44b'), name = '') +
  theme(axis.text.y = element_text(size = 12),
        axis.text.x = element_text(angle = 0, vjust = 1, hjust = 1, size = 12),
        axis.title.y = element_blank(),
        axis.title.x = element_text(size = 12),
        strip.text = element_text(size = 14, face = 'bold', lineheight = 1),
        legend.position = "none",
        legend.text = element_text(size = 12),
        strip.background = element_rect(colour = "black", fill = "white"),
        plot.title = element_text(size = 14, face = "bold", hjust = 0.5),
        aspect.ratio = 0.6,
        panel.spacing = unit(1.8, "lines")) +
  coord_flip()

# save the DREMI box plot
myfile <- 'DREMI_CD8T_STS_LTS.pdf'
pdf(myfile)
print(DREMI_CD8T_STS_LTS)
dev.off()


# Kaplan-Meier analysis: label each patient 'High' or 'Low' by comparing the DREMI
# score with the third default quantile (the median) of the collected values
test <- summary_patient_data
a1 <- quantile(test[['pCREB-pSTAT3']])
test[['pCREB-pSTAT3']] <- factor(ifelse(test[['pCREB-pSTAT3']] > a1[3], 'High', 'Low'),
                                 levels = c('Low', 'High'))

# copy of the first two columns, with the cell type prefixed to the feature name
data_CD8T <- test[, c(1, 2)]
colnames(data_CD8T)[1] <- paste0('CD8T_', colnames(data_CD8T)[1])

# the survival formula refers to the discretised feature (first column) as D1
colnames(test)[1] <- 'D1'
fit <- survfit(Surv(SurvivalTime, SurvivalStatus) ~ D1, data = test)

# NOTE(review): the x axis is labelled in months - confirm the unit of SurvivalTime
o <- ggsurvplot(fit, data = test,
                pval = TRUE,
                conf.int = FALSE,
                censor = FALSE,
                risk.table = TRUE,
                surv.median.line = "hv",
                xlab = 'Months from diagnosis', ylab = 'Survival probability',
                legend.title = "pCREB-pSTAT3",
                palette = c("#1B9E77", "#E7298A"),
                tables.height = 0.2,
                tables.theme = theme_cleantable(),
                ggtheme = theme_bw(),
                font.legend = c(8, "bold", "darkblue"),
                font.x = c(12),
                font.y = c(12),
                font.tickslab = c(12))

# stack the survival curves on top of the risk table
o <- ggarrange(o$plot, o$table, heights = c(2, 0.7),
               ncol = 1, nrow = 2)

myfile <- 'KaplanMeier_CD8T_D1.pdf'
pdf(myfile, onefile = TRUE, width = 6, height = 5)
print(o)
dev.off()

###################################################################################################

# Monocytes: collect, for every patient found in the clinical metadata, the selected
# DREMI feature of the current cell type together with the survival annotations.
myCurrentCellType <- 'Monocytes'
mySelectedFeatures <- c('pErk-pSTAT5')
summary_patient_data <- data.frame()
for (idx in seq_len(nrow(patient_samples))) {

  # patients without an entry in the metadata are left out
  myPatientID <- patient_samples[['patient_id']][idx]
  a <- which(md_0h$Patient_nr_cytobank == myPatientID)
  if (length(a) == 0) {
    next
  }

  # Only the ELN risk is read from the metadata; survival time, karyotype and FLT3
  # come with the DREMI_feature_vector columns selected below.
  # The literal string 'NA' marks a missing risk; isTRUE() keeps the test from
  # failing when the cell is a real NA.
  ELN <- md_0h[['ELN 2017 risk']][a]
  if (isTRUE(ELN == 'NA')) {
    ELN <- NA
  }

  #fetch the patient cells
  promptKey <- paste0('P', idx)
  data <- DREMI_feature_vector[DREMI_feature_vector$PatientNr == promptKey &
                                 DREMI_feature_vector$CellType == myCurrentCellType, ]

  # skip patients that have no row for this cell type, or whose first column
  # holds -1, which means no DREMI score is available
  if (nrow(data) == 0 || isTRUE(data[[1]][1] == -1)) {
    next
  }

  data <- data[, c(mySelectedFeatures,
                   'PatientNr', 'PatientID', 'SurvivalTime', 'karyotype', 'FLT3',
                   'CellType', 'SurvivalStatus', 'SurvivalGroup')]
  data$ELN <- ELN
  summary_patient_data <- rbind(summary_patient_data, data)
}

# Plot 1: distribution of the selected DREMI feature per survival group (STS vs LTS)
plot_data <- melt(summary_patient_data[, c(mySelectedFeatures, 'SurvivalGroup')],
                  id.vars = 'SurvivalGroup')
plot_data$SurvivalGroup <- factor(plot_data$SurvivalGroup, levels = c('STS', 'LTS'))

# NOTE(review): color=SurvivalGroup is given to ggplot() itself rather than inside
# aes(), so it is not part of the aesthetic mapping - confirm whether that is intended.
DREMI_Monocytes_STS_LTS <- ggplot(plot_data,
                                  aes(x = SurvivalGroup, y = value, fill = SurvivalGroup),
                                  color = SurvivalGroup) +
  geom_beeswarm(size = 0.78, alpha = 0.8) +
  geom_boxplot(width = 0.38, size = 0.58, alpha = 0.8, fatten = TRUE,
               position = position_dodge(width = 0.78),
               outlier.colour = "black", outlier.shape = 21,
               outlier.fill = "gray", outlier.size = 0.4) +
  facet_wrap(~variable, scales = "free_x", ncol = 1) +
  theme_bw() +
  ggtitle('Monocytes') +
  ylab('DREMI scores') +
  scale_fill_manual(values = c('#e6194b', '#3cb44b'), name = '') +
  theme(axis.text.y = element_text(size = 12),
        axis.text.x = element_text(angle = 0, vjust = 1, hjust = 1, size = 12),
        axis.title.y = element_blank(),
        axis.title.x = element_text(size = 12),
        strip.text = element_text(size = 14, face = 'bold', lineheight = 1),
        legend.position = "none",
        legend.text = element_text(size = 12),
        strip.background = element_rect(colour = "black", fill = "white"),
        plot.title = element_text(size = 14, face = "bold", hjust = 0.5),
        aspect.ratio = 0.6,
        panel.spacing = unit(1.8, "lines")) +
  coord_flip()

# save the DREMI box plot
myfile <- 'DREMI_Monocytes_STS_LTS.pdf'
pdf(myfile)
print(DREMI_Monocytes_STS_LTS)
dev.off()


# Kaplan-Meier analysis: label each patient 'High' or 'Low' by comparing the DREMI
# score with the third default quantile (the median) of the collected values
test <- summary_patient_data
a1 <- quantile(test[['pErk-pSTAT5']])
test[['pErk-pSTAT5']] <- factor(ifelse(test[['pErk-pSTAT5']] > a1[3], 'High', 'Low'),
                                levels = c('Low', 'High'))

# copy of the first two columns, with the cell type prefixed to the feature name
data_Monocytes <- test[, c(1, 2)]
colnames(data_Monocytes)[1] <- paste0('Monocytes_', colnames(data_Monocytes)[1])

# the survival formula refers to the discretised feature (first column) as D1
colnames(test)[1] <- 'D1'
fit <- survfit(Surv(SurvivalTime, SurvivalStatus) ~ D1, data = test)

# NOTE(review): the x axis is labelled in months - confirm the unit of SurvivalTime
o <- ggsurvplot(fit, data = test,
                pval = TRUE,
                conf.int = FALSE,
                censor = FALSE,
                risk.table = TRUE,
                surv.median.line = "hv",
                xlab = 'Months from diagnosis', ylab = 'Survival probability',
                legend.title = "pErk-pSTAT5",
                palette = c("#1B9E77", "#E7298A"),
                tables.height = 0.2,
                tables.theme = theme_cleantable(),
                ggtheme = theme_bw(),
                font.legend = c(8, "bold", "darkblue"),
                font.x = c(12),
                font.y = c(12),
                font.tickslab = c(12))

# stack the survival curves on top of the risk table
o <- ggarrange(o$plot, o$table, heights = c(2, 0.7),
               ncol = 1, nrow = 2)

myfile <- 'KaplanMeier_Monocytes_D1.pdf'
pdf(myfile, onefile = TRUE, width = 6, height = 5)
print(o)
dev.off()

###################################################################################################

# NK cells: collect, for every patient found in the clinical metadata, the selected
# DREMI feature of the current cell type together with the survival annotations.
myCurrentCellType <- 'NK'
mySelectedFeatures <- c('pS6-pSTAT5')
summary_patient_data <- data.frame()
for (idx in seq_len(nrow(patient_samples))) {

  # patients without an entry in the metadata are left out
  myPatientID <- patient_samples[['patient_id']][idx]
  a <- which(md_0h$Patient_nr_cytobank == myPatientID)
  if (length(a) == 0) {
    next
  }

  # Only the ELN risk is read from the metadata; survival time, karyotype and FLT3
  # come with the DREMI_feature_vector columns selected below.
  # The literal string 'NA' marks a missing risk; isTRUE() keeps the test from
  # failing when the cell is a real NA.
  ELN <- md_0h[['ELN 2017 risk']][a]
  if (isTRUE(ELN == 'NA')) {
    ELN <- NA
  }

  #fetch the patient cells
  promptKey <- paste0('P', idx)
  data <- DREMI_feature_vector[DREMI_feature_vector$PatientNr == promptKey &
                                 DREMI_feature_vector$CellType == myCurrentCellType, ]

  # skip patients that have no row for this cell type, or whose first column
  # holds -1, which means no DREMI score is available
  if (nrow(data) == 0 || isTRUE(data[[1]][1] == -1)) {
    next
  }

  data <- data[, c(mySelectedFeatures,
                   'PatientNr', 'PatientID', 'SurvivalTime', 'karyotype', 'FLT3',
                   'CellType', 'SurvivalStatus', 'SurvivalGroup')]
  data$ELN <- ELN
  summary_patient_data <- rbind(summary_patient_data, data)
}

# Plot 1: distribution of the selected DREMI feature per survival group (STS vs LTS)
plot_data <- melt(summary_patient_data[, c(mySelectedFeatures, 'SurvivalGroup')],
                  id.vars = 'SurvivalGroup')
plot_data$SurvivalGroup <- factor(plot_data$SurvivalGroup, levels = c('STS', 'LTS'))

# NOTE(review): color=SurvivalGroup is given to ggplot() itself rather than inside
# aes(), so it is not part of the aesthetic mapping - confirm whether that is intended.
DREMI_NK_STS_LTS <- ggplot(plot_data,
                           aes(x = SurvivalGroup, y = value, fill = SurvivalGroup),
                           color = SurvivalGroup) +
  geom_beeswarm(size = 0.78, alpha = 0.8) +
  geom_boxplot(width = 0.38, size = 0.58, alpha = 0.8, fatten = TRUE,
               position = position_dodge(width = 0.78),
               outlier.colour = "black", outlier.shape = 21,
               outlier.fill = "gray", outlier.size = 0.4) +
  facet_wrap(~variable, scales = "free_x", ncol = 1) +
  theme_bw() +
  ggtitle('NK cells') +
  ylab('DREMI scores') +
  scale_fill_manual(values = c('#e6194b', '#3cb44b'), name = '') +
  theme(axis.text.y = element_text(size = 12),
        axis.text.x = element_text(angle = 0, vjust = 1, hjust = 1, size = 12),
        axis.title.y = element_blank(),
        axis.title.x = element_text(size = 12),
        strip.text = element_text(size = 14, face = 'bold', lineheight = 1),
        legend.position = "none",
        legend.text = element_text(size = 12),
        strip.background = element_rect(colour = "black", fill = "white"),
        plot.title = element_text(size = 14, face = "bold", hjust = 0.5),
        aspect.ratio = 0.6,
        panel.spacing = unit(1.8, "lines")) +
  coord_flip()

# save the DREMI box plot
myfile <- 'DREMI_NK_STS_LTS.pdf'
pdf(myfile)
print(DREMI_NK_STS_LTS)
dev.off()


# Kaplan-Meier analysis for NK cells: the pS6-pSTAT5 DREMI score is
# discretised into 'High' (above the 50% quantile) and 'Low' (otherwise).
test <- summary_patient_data

a1 <- quantile(test$`pS6-pSTAT5`)
test$`pS6-pSTAT5` <- factor(
  ifelse(test$`pS6-pSTAT5` > a1[3], 'High', 'Low'),
  levels = c('Low', 'High')
)

# keep the first two columns (discretised score and PatientNr, given the
# column order built in the loop above); the score gets an 'NK_' prefix
data_NK <- test[, 1:2]
names(data_NK)[1] <- paste0('NK_', names(data_NK)[1])

# generic name for the grouping variable used in the survival formula
names(test)[1] <- 'D1'

# NOTE(review): the x-axis label says months; confirm the unit of
# SurvivalTime in DREMI_feature_vector.
fit <- survfit(Surv(SurvivalTime, SurvivalStatus) ~ D1, data = test)
o <- ggsurvplot(
  fit,
  data = test,
  pval = TRUE,
  conf.int = FALSE,
  censor = FALSE,
  xlab = 'Months from diagnosis',
  ylab = 'Survival probability',
  legend.title = "pS6-pSTAT5",
  palette = c("#1B9E77", "#E7298A"),
  surv.median.line = "hv",
  risk.table = TRUE,
  tables.height = 0.2,
  tables.theme = theme_cleantable(),
  ggtheme = theme_bw(),
  font.legend = c(8, "bold", "darkblue"),
  font.x = 12,
  font.y = 12,
  font.tickslab = 12
)

# stack the survival curves on top of the risk table
o <- ggarrange(o$plot, o$table, ncol = 1, nrow = 2, heights = c(2, 0.7))

myfile <- 'KaplanMeier_NK_D1.pdf'
pdf(myfile, onefile = TRUE, width = 6, height = 5)
print(o)
dev.off()

###################################################################################################

# Collect, for every patient found in the metadata, the selected DREMI score
# of the current cell type together with the survival/clinical columns.
myCurrentCellType <- 'pDCs'
mySelectedFeatures <- c('p4EBP1-Casp3')
summary_patient_data <- data.frame()
# seq_len() instead of 1:nrow() so an empty patient table gives zero iterations
for(idx in seq_len(nrow(patient_samples))){
  
  myPatientID <- patient_samples[idx,'patient_id']
  a <- which(md_0h$Patient_nr_cytobank==myPatientID$patient_id)
  if(length(a)>0){
    # clinical annotations from the metadata table; of these only ELN is
    # attached to the output, the other output columns come from
    # DREMI_feature_vector
    survivalTime <- md_0h[a,"5-year survival (days)"]
    chromAbber <- md_0h[a,'Karyotype']
    chromAbber <- chromAbber$Karyotype
    ELN <- md_0h[a,'ELN 2017 risk']
    ELN <- ELN$`ELN 2017 risk`
    # the string 'NA' is turned into a real NA; %in% (unlike ==) yields FALSE
    # rather than NA when the value is already missing, so if() cannot fail
    if(ELN %in% 'NA'){
      ELN <- NA
    }
    if(chromAbber %in% 'NA'){
      chromAbber <- NA
    }
    mutFLT3 <- md_0h[a,'FLT3-ITD']
    mutFLT3 <- mutFLT3$`FLT3-ITD`
    if(mutFLT3 %in% 'Present'){
      mutFLT3 <- 'Mutated'
    }else{
      mutFLT3 <- 'Wt'
    }
    # fetch the DREMI feature row(s) of this patient for the current cell
    # type; rows are keyed by 'P' followed by the loop index
    promptKey <- paste0('P',idx)
    data <- DREMI_feature_vector[DREMI_feature_vector$PatientNr==promptKey &
                                   DREMI_feature_vector$CellType==myCurrentCellType,]
    
    # skip when no row matched, or when the first value is -1
    # (-1 means no DREMI score available)
    if(nrow(data)==0 || data[1,1]==-1){
      a <- 'skip'
    }else{
      data <- data[,c(mySelectedFeatures,
                    'PatientNr','PatientID','SurvivalTime','karyotype','FLT3',
                    'CellType','SurvivalStatus','SurvivalGroup')]
      data$ELN <- ELN
      summary_patient_data <- rbind(summary_patient_data,data)
    }
  }
}

# Plot 1: selected DREMI score(s) of pDCs per survival group.
# The selected feature column(s) are reshaped to long format so that
# facet_wrap() can draw one panel per feature.
plot_data <- melt(summary_patient_data[,c(mySelectedFeatures,'SurvivalGroup')],
                  id.vars = 'SurvivalGroup')
# fix the level order of the groups: STS first, then LTS
plot_data$SurvivalGroup <- factor(plot_data$SurvivalGroup,levels = c('STS','LTS'))

# Only `fill` is mapped to the survival group. A `color` argument given to
# ggplot() outside aes() is not used by ggplot2, so it is not passed here.
DREMI_pDCs_STS_LTS <- ggplot(plot_data,aes(x=SurvivalGroup,y=value,fill=SurvivalGroup))+
  geom_beeswarm(size=0.78,alpha=0.8)+
  geom_boxplot(width=0.38 ,size=0.58,alpha=0.8,fatten=TRUE,
               position=position_dodge(width=0.78),
               outlier.colour = "black",
               outlier.shape = 21,
               outlier.fill = "gray",
               outlier.size = 0.4)+
  facet_wrap(~variable,scales = "free_x",ncol = 1)+
  theme_bw()+
  ggtitle('pDCs cells')+
  theme(axis.text.y = element_text( size = 12 ),
        axis.text.x = element_text(angle = 0, vjust = 1, hjust = 1, size = 12),
        axis.title.y = element_blank(),
        axis.title.x = element_text( size = 12 ),
        strip.text = element_text(size = 14,face='bold',lineheight=1),
        legend.position = "none",
        legend.text = element_text(size=12),
        strip.background = element_rect(colour = "black", fill = "white"),
        plot.title = element_text(size = 14, face = "bold",hjust = 0.5),aspect.ratio = 0.6)+
  ylab('DREMI scores')+
  scale_fill_manual(values=c('#e6194b', '#3cb44b'),name='')+
  theme(panel.spacing = unit(1.8, "lines"))+
  # flip so that the survival groups are on the vertical axis
  coord_flip()

# save the DREMI score plot
myfile <- 'DREMI_pDCs_STS_LTS.pdf'
pdf(myfile)
print(DREMI_pDCs_STS_LTS)
dev.off()


# Kaplan-Meier analysis for pDCs: the p4EBP1-Casp3 DREMI score is
# discretised into 'High' (above the 50% quantile) and 'Low' (otherwise).
test <- summary_patient_data

a1 <- quantile(test$`p4EBP1-Casp3`)
test$`p4EBP1-Casp3` <- factor(
  ifelse(test$`p4EBP1-Casp3` > a1[3], 'High', 'Low'),
  levels = c('Low', 'High')
)

# keep columns 1, 2, 4, 8 and 9 (discretised score, PatientNr, SurvivalTime,
# SurvivalStatus and SurvivalGroup, given the column order built in the loop
# above); the score gets a 'pDCs_' prefix
data_pDCs <- test[, c(1, 2, 4, 8, 9)]
names(data_pDCs)[1] <- paste0('pDCs_', names(data_pDCs)[1])

# generic name for the grouping variable used in the survival formula
names(test)[1] <- 'D1'

# NOTE(review): the x-axis label says months; confirm the unit of
# SurvivalTime in DREMI_feature_vector.
fit <- survfit(Surv(SurvivalTime, SurvivalStatus) ~ D1, data = test)
o <- ggsurvplot(
  fit,
  data = test,
  pval = TRUE,
  conf.int = FALSE,
  censor = FALSE,
  xlab = 'Months from diagnosis',
  ylab = 'Survival probability',
  legend.title = "p4EBP1-Casp3",
  palette = c("#1B9E77", "#E7298A"),
  surv.median.line = "hv",
  risk.table = TRUE,
  tables.height = 0.2,
  tables.theme = theme_cleantable(),
  ggtheme = theme_bw(),
  font.legend = c(8, "bold", "darkblue"),
  font.x = 12,
  font.y = 12,
  font.tickslab = 12
)

# stack the survival curves on top of the risk table
o <- ggarrange(o$plot, o$table, ncol = 1, nrow = 2, heights = c(2, 0.7))

myfile <- 'KaplanMeier_pDCs_D1.pdf'
pdf(myfile, onefile = TRUE, width = 6, height = 5)
print(o)
dev.off()

###################################################################################################
###################################################################################################
###################################################################################################

```

## Perform multivariate survival analysis using the best DREMI scores

```{r DREMI-based Cox modelling,echo=TRUE,eval=TRUE,error=TRUE, warning=FALSE,cache=TRUE}

# Combine the cell type specific data frames into one table by successive
# merges on PatientNr (same left-to-right order as listed here).
combinedData <- Reduce(
  function(left, right) merge(left, right, by = "PatientNr"),
  list(data_B, data_CD4T, data_CD8T, data_HSCsMPPs,
       data_Monocytes, data_NK, data_pDCs)
)

# columns 2 to 8 are renamed D1 ... D7 for use in the model formula
# NOTE(review): this relies on the seven score columns sitting in positions
# 2-8 after the merges - confirm against the construction of data_B etc.
colnames(combinedData)[2:8] <- paste0('D', 1:7)

# multivariate Cox proportional hazards model on the seven renamed columns
model_comb <- coxph(
  Surv(combinedData$SurvivalTime,combinedData$SurvivalStatus)~(D1+D2+D3+D4+D5+D6+D7),
  data = combinedData
)

# forest plot of the model, and the proportional-hazards check with its plot
plot_comb <- ggforest(model_comb, data = combinedData, noDigits = 2)
test.ph.comb <- cox.zph(model_comb)
s_comb <- ggcoxzph(test.ph.comb, font.main = 4, font.x = 4, font.y = 4)

myfile <- 'Cox_model_DREMI.pdf'
pdf(myfile, onefile = TRUE, width = 6, height = 5)
print(plot_comb)
dev.off()

myfile <- 'Residuals_DREMI.pdf'
pdf(myfile, onefile = TRUE)
print(s_comb)
dev.off()


```

## Exploring possible associations of DREMI scores with clinical and genetic info 
In this subsection we first stratify patients by ELN 2017 risk and by FLT3-ITD status alone, and then combine FLT3-ITD status with cell type-specific DREMI scores to improve patient stratification.

```{r synergy with ELN,echo=TRUE,eval=TRUE,error=TRUE, warning=FALSE,cache=TRUE}

# first work with ELN alone
test <- md_0h

# Remove patients whose ELN risk is recorded as the string 'NA'.
# The length guard matters: with no match, test[-a,] would index with an
# empty vector and return zero rows instead of leaving the table untouched.
a <- which(test$`ELN 2017 risk`=='NA')
if(length(a)>0){
  test <- test[-a,]
}
# collapse Intermediate and Adverse into a single 'nonFavorable' group
test$CombinedELN <- ifelse(test$`ELN 2017 risk`=='Intermediate' | test$`ELN 2017 risk`=='Adverse','nonFavorable','Favorable')

# Survival time in days: entries recorded as 'Alive' are set to 5*365,
# everything else is parsed as a number. %in% keeps a missing entry missing
# instead of raising an error.
survivalTime <- test$`5-year survival (days)`
survivalTime[survivalTime %in% 'Alive'] <- 5*365
survivalTime <- as.numeric(survivalTime)

test <- as.data.frame(test)

test$SurvivalTime <- survivalTime
# event indicator: 1 when the time is below 5*365 days, otherwise 0
test$SurvivalStatus <- ifelse(test$SurvivalTime<5*365,1,0)


fit<- survfit(Surv(SurvivalTime, SurvivalStatus) ~ CombinedELN, data = test)
# SurvivalTime is in days here, so the x-axis is labelled in days
o <-  ggsurvplot(fit, data = test,
                  pval = TRUE, 
                  xlab='Days from diagnosis',ylab='Survival probability',tables.height = 0.2,
                  font.legend = c(8, "bold", "darkblue"),censor=FALSE,
                  palette = c("#1B9E77","#E7298A"),
                  legend.labs = c("Favorable","Intermediate/Adverse"),
                  surv.median.line = "hv",
                  risk.table = TRUE,
                  tables.theme = theme_cleantable(),
                  ggtheme = theme_bw(),
                  conf.int = FALSE,
                  font.x = c(12),
                  font.y = c(12),
                  font.tickslab=c(12))

# stack the survival curves on top of the risk table
o <- ggarrange(o$plot,o$table,heights = c(2, 0.7),
                ncol = 1, nrow = 2)
myfile <- 'KaplanMeier_simple_ELN_comb.pdf'
pdf(myfile,onefile = TRUE,width = 6,height = 5)
print(o)
dev.off()


###################################################################################################
#check mutational status for FLT3

# Kaplan-Meier analysis using FLT3-ITD status alone
test <- md_0h

# 'Present' is relabelled 'Mutated'; every other value becomes 'Wt'
test$FLT3_ITD <- ifelse(test$`FLT3-ITD`=='Present','Mutated','Wt')

# Survival time in days: entries recorded as 'Alive' are set to 5*365,
# everything else is parsed as a number. %in% keeps a missing entry missing
# instead of raising an error.
survivalTime <- test$`5-year survival (days)`
survivalTime[survivalTime %in% 'Alive'] <- 5*365
survivalTime <- as.numeric(survivalTime)

# rows of patients P33 and P45, which cannot be used for survival analyses
a <- which(test$`Patient nr`=="P33" | test$`Patient nr`=="P45")

test <- as.data.frame(test)

test$SurvivalTime <- survivalTime
# event indicator: 1 when the time is below 5*365 days, otherwise 0
test$SurvivalStatus <- ifelse(test$SurvivalTime<5*365,1,0)
# Remove P33 and P45. The length guard matters: with no match, test[-a,]
# would index with an empty vector and return zero rows.
if(length(a)>0){
  test <- test[-a,]
}

fit<- survfit(Surv(SurvivalTime, SurvivalStatus) ~ FLT3_ITD, data = test)
# SurvivalTime is in days here, so the x-axis is labelled in days
o <-  ggsurvplot(fit, data = test,
                  pval = TRUE, 
                  xlab='Days from diagnosis',ylab='Survival probability',tables.height = 0.2,
                  font.legend = c(8, "bold", "darkblue"),censor=FALSE,
                  palette = c("#1B9E77","#E7298A"),
                  legend.title = "FLT3_ITD",
                  surv.median.line = "hv",
                  risk.table = TRUE,
                  tables.theme = theme_cleantable(),
                  ggtheme = theme_bw(),
                  conf.int = FALSE,
                  font.x = c(12),
                  font.y = c(12),
                  font.tickslab=c(12))

# stack the survival curves on top of the risk table
o <- ggarrange(o$plot,o$table,heights = c(2, 0.7),
                ncol = 1, nrow = 2)
myfile <- 'KaplanMeier_simple_FLT3_ITD.pdf'
pdf(myfile,onefile = TRUE,width = 6,height = 5)
print(o)
dev.off()


###### process again the info from DREMI features ###### 

# Collect, for every patient found in the metadata, the selected DREMI score
# of the current cell type together with the survival/clinical columns.
myCurrentCellType <- 'B'
mySelectedFeatures <- c('pP38-pSTAT3')
summary_patient_data <- data.frame()
# seq_len() instead of 1:nrow() so an empty patient table gives zero iterations
for(idx in seq_len(nrow(patient_samples))){
  
  myPatientID <- patient_samples[idx,'patient_id']
  a <- which(md_0h$Patient_nr_cytobank==myPatientID$patient_id)
  if(length(a)>0){
    # clinical annotations from the metadata table; of these only ELN is
    # attached to the output, the other output columns come from
    # DREMI_feature_vector
    survivalTime <- md_0h[a,"5-year survival (days)"]
    chromAbber <- md_0h[a,'Karyotype']
    chromAbber <- chromAbber$Karyotype
    ELN <- md_0h[a,'ELN 2017 risk']
    ELN <- ELN$`ELN 2017 risk`
    # the string 'NA' is turned into a real NA; %in% (unlike ==) yields FALSE
    # rather than NA when the value is already missing, so if() cannot fail
    if(ELN %in% 'NA'){
      ELN <- NA
    }
    if(chromAbber %in% 'NA'){
      chromAbber <- NA
    }
    mutFLT3 <- md_0h[a,'FLT3-ITD']
    mutFLT3 <- mutFLT3$`FLT3-ITD`
    if(mutFLT3 %in% 'Present'){
      mutFLT3 <- 'Mutated'
    }else{
      mutFLT3 <- 'Wt'
    }
    # fetch the DREMI feature row(s) of this patient for the current cell
    # type; rows are keyed by 'P' followed by the loop index
    promptKey <- paste0('P',idx)
    data <- DREMI_feature_vector[DREMI_feature_vector$PatientNr==promptKey &
                                   DREMI_feature_vector$CellType==myCurrentCellType,]
    # skip when no row matched, or when the first value is -1
    # (-1 means no DREMI score available)
    if(nrow(data)==0 || data[1,1]==-1){
      a <- 'skip'
    }else{
      data <- data[,c(mySelectedFeatures,
                    'PatientNr','PatientID','SurvivalTime','karyotype','FLT3',
                    'CellType','SurvivalStatus','SurvivalGroup')]
      data$ELN <- ELN
      summary_patient_data <- rbind(summary_patient_data,data)
    }
  }
}

# Combined Kaplan-Meier analysis for B cells: FLT3 status together with the
# pP38-pSTAT3 DREMI score discretised into 'High' (above the 50% quantile)
# and 'Low' (otherwise).
test <- summary_patient_data
a1 <- quantile(test$`pP38-pSTAT3`)
test$`pP38-pSTAT3` <- factor(
  ifelse(test$`pP38-pSTAT3` > a1[3], 'High', 'Low'),
  levels = c('Low', 'High')
)

# column 1 (the discretised score) and column 6 get formula-friendly names
names(test)[c(1, 6)] <- c('D1', 'FLT3_ITD')

# NOTE(review): the x-axis label says months; confirm the unit of
# SurvivalTime in DREMI_feature_vector.
fit <- survfit(Surv(SurvivalTime, SurvivalStatus) ~ FLT3_ITD + D1, data = test)
o1 <- ggsurvplot(
  fit,
  data = test,
  pval = FALSE,
  conf.int = FALSE,
  censor = FALSE,
  xlab = 'Months from diagnosis',
  ylab = 'Survival probability',
  palette = c("#D95F02", "#7570B3", "#66A61E", "#E6AB02"),
  surv.median.line = "hv",
  risk.table = TRUE,
  tables.height = 0.2,
  tables.theme = theme_cleantable(),
  ggtheme = theme_bw(),
  font.legend = c(8, "bold", "darkblue"),
  font.x = 12,
  font.y = 12,
  font.tickslab = 12
)

# stack the survival curves on top of the risk table
o1 <- ggarrange(o1$plot, o1$table, ncol = 1, nrow = 2, heights = c(2, 0.7))

myfile <- 'KaplanMeier_B_FLT3_D1.pdf'
pdf(myfile, onefile = TRUE, width = 6, height = 5)
print(o1)
dev.off()

###### process again the info from DREMI features ###### 

# Collect, for every patient found in the metadata, the selected DREMI score
# of the current cell type together with the survival/clinical columns.
myCurrentCellType <- 'CD4_T'
mySelectedFeatures <- c('pSTAT3-pCREB')
summary_patient_data <- data.frame()
# seq_len() instead of 1:nrow() so an empty patient table gives zero iterations
for(idx in seq_len(nrow(patient_samples))){
  
  myPatientID <- patient_samples[idx,'patient_id']
  a <- which(md_0h$Patient_nr_cytobank==myPatientID$patient_id)
  if(length(a)>0){
    # clinical annotations from the metadata table; of these only ELN is
    # attached to the output, the other output columns come from
    # DREMI_feature_vector
    survivalTime <- md_0h[a,"5-year survival (days)"]
    chromAbber <- md_0h[a,'Karyotype']
    chromAbber <- chromAbber$Karyotype
    ELN <- md_0h[a,'ELN 2017 risk']
    ELN <- ELN$`ELN 2017 risk`
    # the string 'NA' is turned into a real NA; %in% (unlike ==) yields FALSE
    # rather than NA when the value is already missing, so if() cannot fail
    if(ELN %in% 'NA'){
      ELN <- NA
    }
    if(chromAbber %in% 'NA'){
      chromAbber <- NA
    }
    mutFLT3 <- md_0h[a,'FLT3-ITD']
    mutFLT3 <- mutFLT3$`FLT3-ITD`
    if(mutFLT3 %in% 'Present'){
      mutFLT3 <- 'Mutated'
    }else{
      mutFLT3 <- 'Wt'
    }
    # fetch the DREMI feature row(s) of this patient for the current cell
    # type; rows are keyed by 'P' followed by the loop index
    promptKey <- paste0('P',idx)
    data <- DREMI_feature_vector[DREMI_feature_vector$PatientNr==promptKey &
                                   DREMI_feature_vector$CellType==myCurrentCellType,]
    # skip (and report it) when no row matched, or when the first value is -1
    # (-1 means no DREMI score available)
    if(nrow(data)==0 || data[1,1]==-1){
      a <- 'skip'
      cat(a)
      cat(' ')
    }else{
      data <- data[,c(mySelectedFeatures,
                    'PatientNr','PatientID','SurvivalTime','karyotype','FLT3',
                    'CellType','SurvivalStatus','SurvivalGroup')]
      data$ELN <- ELN
      summary_patient_data <- rbind(summary_patient_data,data)
    }
  }
}

# Combined Kaplan-Meier analysis for CD4 T cells: FLT3 status together with
# the pSTAT3-pCREB DREMI score discretised into 'High' (above the 50%
# quantile) and 'Low' (otherwise).
test <- summary_patient_data
a1 <- quantile(test$`pSTAT3-pCREB`)
test$`pSTAT3-pCREB` <- factor(
  ifelse(test$`pSTAT3-pCREB` > a1[3], 'High', 'Low'),
  levels = c('Low', 'High')
)

# column 1 (the discretised score) and column 6 get formula-friendly names
names(test)[c(1, 6)] <- c('D1', 'FLT3_ITD')

# NOTE(review): the x-axis label says months; confirm the unit of
# SurvivalTime in DREMI_feature_vector.
fit <- survfit(Surv(SurvivalTime, SurvivalStatus) ~ FLT3_ITD + D1, data = test)
o1 <- ggsurvplot(
  fit,
  data = test,
  pval = FALSE,
  conf.int = FALSE,
  censor = FALSE,
  xlab = 'Months from diagnosis',
  ylab = 'Survival probability',
  palette = c("#D95F02", "#7570B3", "#66A61E", "#E6AB02"),
  surv.median.line = "hv",
  risk.table = TRUE,
  tables.height = 0.2,
  tables.theme = theme_cleantable(),
  ggtheme = theme_bw(),
  font.legend = c(8, "bold", "darkblue"),
  font.x = 12,
  font.y = 12,
  font.tickslab = 12
)

# stack the survival curves on top of the risk table
o1 <- ggarrange(o1$plot, o1$table, ncol = 1, nrow = 2, heights = c(2, 0.7))

myfile <- 'KaplanMeier_CD4T_FLT3_ITD_D1.pdf'
pdf(myfile, onefile = TRUE, width = 6, height = 5)
print(o1)
dev.off()

##########################################################
# Collect, for every patient found in the metadata, the selected DREMI score
# of the current cell type together with the survival/clinical columns.
myCurrentCellType <- 'Monocytes'
mySelectedFeatures <- c('pErk-pSTAT5')
summary_patient_data <- data.frame()
# seq_len() instead of 1:nrow() so an empty patient table gives zero iterations
for(idx in seq_len(nrow(patient_samples))){
  
  myPatientID <- patient_samples[idx,'patient_id']
  a <- which(md_0h$Patient_nr_cytobank==myPatientID$patient_id)
  if(length(a)>0){
    # clinical annotations from the metadata table; of these only ELN is
    # attached to the output, the other output columns come from
    # DREMI_feature_vector
    survivalTime <- md_0h[a,"5-year survival (days)"]
    chromAbber <- md_0h[a,'Karyotype']
    chromAbber <- chromAbber$Karyotype
    ELN <- md_0h[a,'ELN 2017 risk']
    ELN <- ELN$`ELN 2017 risk`
    # the string 'NA' is turned into a real NA; %in% (unlike ==) yields FALSE
    # rather than NA when the value is already missing, so if() cannot fail
    if(ELN %in% 'NA'){
      ELN <- NA
    }
    if(chromAbber %in% 'NA'){
      chromAbber <- NA
    }
    mutFLT3 <- md_0h[a,'FLT3-ITD']
    mutFLT3 <- mutFLT3$`FLT3-ITD`
    if(mutFLT3 %in% 'Present'){
      mutFLT3 <- 'Mutated'
    }else{
      mutFLT3 <- 'Wt'
    }
    # fetch the DREMI feature row(s) of this patient for the current cell
    # type; rows are keyed by 'P' followed by the loop index
    promptKey <- paste0('P',idx)
    data <- DREMI_feature_vector[DREMI_feature_vector$PatientNr==promptKey &
                                   DREMI_feature_vector$CellType==myCurrentCellType,]
    # skip when no row matched, or when the first value is -1
    # (-1 means no DREMI score available)
    if(nrow(data)==0 || data[1,1]==-1){
      a <- 'skip'
    }else{
      data <- data[,c(mySelectedFeatures,
                    'PatientNr','PatientID','SurvivalTime','karyotype','FLT3',
                    'CellType','SurvivalStatus','SurvivalGroup')]
      data$ELN <- ELN
      summary_patient_data <- rbind(summary_patient_data,data)
    }
  }
}

# Combined Kaplan-Meier analysis for monocytes: FLT3 status together with the
# pErk-pSTAT5 DREMI score discretised into 'High' (above the 50% quantile)
# and 'Low' (otherwise).
test <- summary_patient_data
a1 <- quantile(test$`pErk-pSTAT5`)
test$`pErk-pSTAT5` <- factor(
  ifelse(test$`pErk-pSTAT5` > a1[3], 'High', 'Low'),
  levels = c('Low', 'High')
)

# column 1 (the discretised score) and column 6 get formula-friendly names
names(test)[c(1, 6)] <- c('D1', 'FLT3_ITD')

# NOTE(review): the x-axis label says months; confirm the unit of
# SurvivalTime in DREMI_feature_vector.
fit <- survfit(Surv(SurvivalTime, SurvivalStatus) ~ FLT3_ITD + D1, data = test)
o1 <- ggsurvplot(
  fit,
  data = test,
  pval = FALSE,
  conf.int = FALSE,
  censor = FALSE,
  xlab = 'Months from diagnosis',
  ylab = 'Survival probability',
  palette = c("#D95F02", "#7570B3", "#66A61E", "#E6AB02"),
  surv.median.line = "hv",
  risk.table = TRUE,
  tables.height = 0.2,
  tables.theme = theme_cleantable(),
  ggtheme = theme_bw(),
  font.legend = c(8, "bold", "darkblue"),
  font.x = 12,
  font.y = 12,
  font.tickslab = 12
)

# stack the survival curves on top of the risk table
o1 <- ggarrange(o1$plot, o1$table, ncol = 1, nrow = 2, heights = c(2, 0.7))

myfile <- 'KaplanMeier_Monocytes_FLT3_ITD_D1.pdf'
pdf(myfile, onefile = TRUE, width = 6, height = 5)
print(o1)
dev.off()

##########################################################
# Collect, for every patient found in the metadata, the selected DREMI score
# of the current cell type together with the survival/clinical columns.
myCurrentCellType <- 'HSCs_MPPs'
mySelectedFeatures <- c('pSTAT3-pErk')
summary_patient_data <- data.frame()
# seq_len() instead of 1:nrow() so an empty patient table gives zero iterations
for(idx in seq_len(nrow(patient_samples))){
  
  myPatientID <- patient_samples[idx,'patient_id']
  a <- which(md_0h$Patient_nr_cytobank==myPatientID$patient_id)
  if(length(a)>0){
    # clinical annotations from the metadata table; of these only ELN is
    # attached to the output, the other output columns come from
    # DREMI_feature_vector
    survivalTime <- md_0h[a,"5-year survival (days)"]
    chromAbber <- md_0h[a,'Karyotype']
    chromAbber <- chromAbber$Karyotype
    ELN <- md_0h[a,'ELN 2017 risk']
    ELN <- ELN$`ELN 2017 risk`
    # the string 'NA' is turned into a real NA; %in% (unlike ==) yields FALSE
    # rather than NA when the value is already missing, so if() cannot fail
    if(ELN %in% 'NA'){
      ELN <- NA
    }
    if(chromAbber %in% 'NA'){
      chromAbber <- NA
    }
    mutFLT3 <- md_0h[a,'FLT3-ITD']
    mutFLT3 <- mutFLT3$`FLT3-ITD`
    if(mutFLT3 %in% 'Present'){
      mutFLT3 <- 'Mutated'
    }else{
      mutFLT3 <- 'Wt'
    }
    # fetch the DREMI feature row(s) of this patient for the current cell
    # type; rows are keyed by 'P' followed by the loop index
    promptKey <- paste0('P',idx)
    data <- DREMI_feature_vector[DREMI_feature_vector$PatientNr==promptKey &
                                   DREMI_feature_vector$CellType==myCurrentCellType,]
    # skip when no row matched, or when the first value is -1
    # (-1 means no DREMI score available)
    if(nrow(data)==0 || data[1,1]==-1){
      a <- 'skip'
    }else{
      data <- data[,c(mySelectedFeatures,
                    'PatientNr','PatientID','SurvivalTime','karyotype','FLT3',
                    'CellType','SurvivalStatus','SurvivalGroup')]
      data$ELN <- ELN
      summary_patient_data <- rbind(summary_patient_data,data)
    }
  }
}

# Combined Kaplan-Meier analysis for HSCs/MPPs: FLT3 status together with the
# pSTAT3-pErk DREMI score discretised into 'High' (above the 50% quantile)
# and 'Low' (otherwise).
test <- summary_patient_data
a1 <- quantile(test$`pSTAT3-pErk`)
test$`pSTAT3-pErk` <- factor(
  ifelse(test$`pSTAT3-pErk` > a1[3], 'High', 'Low'),
  levels = c('Low', 'High')
)

# column 1 (the discretised score) and column 6 get formula-friendly names
names(test)[c(1, 6)] <- c('D1', 'FLT3_ITD')

# NOTE(review): the x-axis label says months; confirm the unit of
# SurvivalTime in DREMI_feature_vector.
fit <- survfit(Surv(SurvivalTime, SurvivalStatus) ~ FLT3_ITD + D1, data = test)
o1 <- ggsurvplot(
  fit,
  data = test,
  pval = FALSE,
  conf.int = FALSE,
  censor = FALSE,
  xlab = 'Months from diagnosis',
  ylab = 'Survival probability',
  palette = c("#D95F02", "#7570B3", "#66A61E", "#E6AB02"),
  surv.median.line = "hv",
  risk.table = TRUE,
  tables.height = 0.2,
  tables.theme = theme_cleantable(),
  ggtheme = theme_bw(),
  font.legend = c(8, "bold", "darkblue"),
  font.x = 12,
  font.y = 12,
  font.tickslab = 12
)

# stack the survival curves on top of the risk table
o1 <- ggarrange(o1$plot, o1$table, ncol = 1, nrow = 2, heights = c(2, 0.7))

myfile <- 'KaplanMeier_HSCMPPs_FLT3_ITD_D1.pdf'
pdf(myfile, onefile = TRUE, width = 6, height = 5)
print(o1)
dev.off()

##########################################################
# Collect, for every patient found in the metadata, the selected DREMI score
# of the current cell type together with the survival/clinical columns.
myCurrentCellType <- 'NK'
mySelectedFeatures <- c('pS6-pSTAT5')
summary_patient_data <- data.frame()
# seq_len() instead of 1:nrow() so an empty patient table gives zero iterations
for(idx in seq_len(nrow(patient_samples))){
  
  myPatientID <- patient_samples[idx,'patient_id']
  a <- which(md_0h$Patient_nr_cytobank==myPatientID$patient_id)
  if(length(a)>0){
    # clinical annotations from the metadata table; of these only ELN is
    # attached to the output, the other output columns come from
    # DREMI_feature_vector
    survivalTime <- md_0h[a,"5-year survival (days)"]
    chromAbber <- md_0h[a,'Karyotype']
    chromAbber <- chromAbber$Karyotype
    ELN <- md_0h[a,'ELN 2017 risk']
    ELN <- ELN$`ELN 2017 risk`
    # the string 'NA' is turned into a real NA; %in% (unlike ==) yields FALSE
    # rather than NA when the value is already missing, so if() cannot fail
    if(ELN %in% 'NA'){
      ELN <- NA
    }
    if(chromAbber %in% 'NA'){
      chromAbber <- NA
    }
    mutFLT3 <- md_0h[a,'FLT3-ITD']
    mutFLT3 <- mutFLT3$`FLT3-ITD`
    if(mutFLT3 %in% 'Present'){
      mutFLT3 <- 'Mutated'
    }else{
      mutFLT3 <- 'Wt'
    }
    # fetch the DREMI feature row(s) of this patient for the current cell
    # type; rows are keyed by 'P' followed by the loop index
    promptKey <- paste0('P',idx)
    data <- DREMI_feature_vector[DREMI_feature_vector$PatientNr==promptKey &
                                   DREMI_feature_vector$CellType==myCurrentCellType,]
    # skip when no row matched, or when the first value is -1
    # (-1 means no DREMI score available)
    if(nrow(data)==0 || data[1,1]==-1){
      a <- 'skip'
    }else{
      data <- data[,c(mySelectedFeatures,
                    'PatientNr','PatientID','SurvivalTime','karyotype','FLT3',
                    'CellType','SurvivalStatus','SurvivalGroup')]
      data$ELN <- ELN
      summary_patient_data <- rbind(summary_patient_data,data)
    }
  }
}

# Combined Kaplan-Meier analysis for NK cells: FLT3 status together with the
# pS6-pSTAT5 DREMI score discretised into 'High' (above the 50% quantile)
# and 'Low' (otherwise).
test <- summary_patient_data
a1 <- quantile(test$`pS6-pSTAT5`)
test$`pS6-pSTAT5` <- factor(
  ifelse(test$`pS6-pSTAT5` > a1[3], 'High', 'Low'),
  levels = c('Low', 'High')
)

# column 1 (the discretised score) and column 6 get formula-friendly names
names(test)[c(1, 6)] <- c('D1', 'FLT3_ITD')

# NOTE(review): the x-axis label says months; confirm the unit of
# SurvivalTime in DREMI_feature_vector.
fit <- survfit(Surv(SurvivalTime, SurvivalStatus) ~ FLT3_ITD + D1, data = test)
o1 <- ggsurvplot(
  fit,
  data = test,
  pval = FALSE,
  conf.int = FALSE,
  censor = FALSE,
  xlab = 'Months from diagnosis',
  ylab = 'Survival probability',
  palette = c("#D95F02", "#7570B3", "#66A61E", "#E6AB02"),
  surv.median.line = "hv",
  risk.table = TRUE,
  tables.height = 0.2,
  tables.theme = theme_cleantable(),
  ggtheme = theme_bw(),
  font.legend = c(8, "bold", "darkblue"),
  font.x = 12,
  font.y = 12,
  font.tickslab = 12
)

# stack the survival curves on top of the risk table
o1 <- ggarrange(o1$plot, o1$table, ncol = 1, nrow = 2, heights = c(2, 0.7))

myfile <- 'KaplanMeier_NK_FLT3_ITD_D1.pdf'
pdf(myfile, onefile = TRUE, width = 6, height = 5)
print(o1)
dev.off()

##########################################################
# Collect, for every patient found in the metadata, the selected DREMI score
# of the current cell type together with the survival/clinical columns.
myCurrentCellType <- 'pDCs'
mySelectedFeatures <- c('p4EBP1-Casp3')
summary_patient_data <- data.frame()
# seq_len() instead of 1:nrow() so an empty patient table gives zero iterations
for(idx in seq_len(nrow(patient_samples))){
  
  myPatientID <- patient_samples[idx,'patient_id']
  a <- which(md_0h$Patient_nr_cytobank==myPatientID$patient_id)
  if(length(a)>0){
    # clinical annotations from the metadata table; of these only ELN is
    # attached to the output, the other output columns come from
    # DREMI_feature_vector
    survivalTime <- md_0h[a,"5-year survival (days)"]
    chromAbber <- md_0h[a,'Karyotype']
    chromAbber <- chromAbber$Karyotype
    ELN <- md_0h[a,'ELN 2017 risk']
    ELN <- ELN$`ELN 2017 risk`
    # the string 'NA' is turned into a real NA; %in% (unlike ==) yields FALSE
    # rather than NA when the value is already missing, so if() cannot fail
    if(ELN %in% 'NA'){
      ELN <- NA
    }
    if(chromAbber %in% 'NA'){
      chromAbber <- NA
    }
    mutFLT3 <- md_0h[a,'FLT3-ITD']
    mutFLT3 <- mutFLT3$`FLT3-ITD`
    if(mutFLT3 %in% 'Present'){
      mutFLT3 <- 'Mutated'
    }else{
      mutFLT3 <- 'Wt'
    }
    # fetch the DREMI feature row(s) of this patient for the current cell
    # type; rows are keyed by 'P' followed by the loop index
    promptKey <- paste0('P',idx)
    data <- DREMI_feature_vector[DREMI_feature_vector$PatientNr==promptKey &
                                   DREMI_feature_vector$CellType==myCurrentCellType,]
    # skip (and report it) when no row matched, or when the first value is -1
    # (-1 means no DREMI score available)
    if(nrow(data)==0 || data[1,1]==-1){
      a <- 'skip'
      cat(a)
    }else{
      data <- data[,c(mySelectedFeatures,
                    'PatientNr','PatientID','SurvivalTime','karyotype','FLT3',
                    'CellType','SurvivalStatus','SurvivalGroup')]
      data$ELN <- ELN
      summary_patient_data <- rbind(summary_patient_data,data)
    }
  }
}

# Combined Kaplan-Meier analysis for pDCs: FLT3 status together with the
# p4EBP1-Casp3 DREMI score discretised into 'High' (above the 50% quantile)
# and 'Low' (otherwise).
test <- summary_patient_data
a1 <- quantile(test$`p4EBP1-Casp3`)
test$`p4EBP1-Casp3` <- factor(
  ifelse(test$`p4EBP1-Casp3` > a1[3], 'High', 'Low'),
  levels = c('Low', 'High')
)

# column 1 (the discretised score) and column 6 get formula-friendly names
names(test)[c(1, 6)] <- c('D1', 'FLT3_ITD')

# NOTE(review): the x-axis label says months; confirm the unit of
# SurvivalTime in DREMI_feature_vector.
fit <- survfit(Surv(SurvivalTime, SurvivalStatus) ~ FLT3_ITD + D1, data = test)
o1 <- ggsurvplot(
  fit,
  data = test,
  pval = FALSE,
  conf.int = FALSE,
  censor = FALSE,
  xlab = 'Months from diagnosis',
  ylab = 'Survival probability',
  palette = c("#D95F02", "#7570B3", "#66A61E", "#E6AB02"),
  surv.median.line = "hv",
  risk.table = TRUE,
  tables.height = 0.2,
  tables.theme = theme_cleantable(),
  ggtheme = theme_bw(),
  font.legend = c(8, "bold", "darkblue"),
  font.x = 12,
  font.y = 12,
  font.tickslab = 12
)

# stack the survival curves on top of the risk table
o1 <- ggarrange(o1$plot, o1$table, ncol = 1, nrow = 2, heights = c(2, 0.7))

myfile <- 'KaplanMeier_pDCs_FLT3_ITD_D1.pdf'
pdf(myfile, onefile = TRUE, width = 6, height = 5)
print(o1)
dev.off()


##########################################################
# Collect, for every patient found in the metadata, the selected DREMI score
# of the current cell type together with the survival/clinical columns.
myCurrentCellType <- 'CD8_T'
mySelectedFeatures <- c('pCREB-pSTAT3')
summary_patient_data <- data.frame()
# seq_len() instead of 1:nrow() so an empty patient table gives zero iterations
for(idx in seq_len(nrow(patient_samples))){
  
  myPatientID <- patient_samples[idx,'patient_id']
  a <- which(md_0h$Patient_nr_cytobank==myPatientID$patient_id)
  if(length(a)>0){
    # clinical annotations from the metadata table; of these only ELN is
    # attached to the output, the other output columns come from
    # DREMI_feature_vector
    survivalTime <- md_0h[a,"5-year survival (days)"]
    chromAbber <- md_0h[a,'Karyotype']
    chromAbber <- chromAbber$Karyotype
    ELN <- md_0h[a,'ELN 2017 risk']
    ELN <- ELN$`ELN 2017 risk`
    # the string 'NA' is turned into a real NA; %in% (unlike ==) yields FALSE
    # rather than NA when the value is already missing, so if() cannot fail
    if(ELN %in% 'NA'){
      ELN <- NA
    }
    if(chromAbber %in% 'NA'){
      chromAbber <- NA
    }
    mutFLT3 <- md_0h[a,'FLT3-ITD']
    mutFLT3 <- mutFLT3$`FLT3-ITD`
    if(mutFLT3 %in% 'Present'){
      mutFLT3 <- 'Mutated'
    }else{
      mutFLT3 <- 'Wt'
    }
    # fetch the DREMI feature row(s) of this patient for the current cell
    # type; rows are keyed by 'P' followed by the loop index
    promptKey <- paste0('P',idx)
    data <- DREMI_feature_vector[DREMI_feature_vector$PatientNr==promptKey &
                                   DREMI_feature_vector$CellType==myCurrentCellType,]
    # skip (and report it) when no row matched, or when the first value is -1
    # (-1 means no DREMI score available)
    if(nrow(data)==0 || data[1,1]==-1){
      a <- 'skip'
      cat(a)
    }else{
      data <- data[,c(mySelectedFeatures,
                    'PatientNr','PatientID','SurvivalTime','karyotype','FLT3',
                    'CellType','SurvivalStatus','SurvivalGroup')]
      data$ELN <- ELN
      summary_patient_data <- rbind(summary_patient_data,data)
    }
  }
}

# Combined Kaplan-Meier analysis for CD8 T cells: FLT3 status together with
# the pCREB-pSTAT3 DREMI score discretised into 'High' (above the 50%
# quantile) and 'Low' (otherwise).
test <- summary_patient_data
a1 <- quantile(test$`pCREB-pSTAT3`)
test$`pCREB-pSTAT3` <- factor(
  ifelse(test$`pCREB-pSTAT3` > a1[3], 'High', 'Low'),
  levels = c('Low', 'High')
)

# column 1 (the discretised score) and column 6 get formula-friendly names
names(test)[c(1, 6)] <- c('D1', 'FLT3_ITD')

# NOTE(review): the x-axis label says months; confirm the unit of
# SurvivalTime in DREMI_feature_vector.
fit <- survfit(Surv(SurvivalTime, SurvivalStatus) ~ FLT3_ITD + D1, data = test)
o1 <- ggsurvplot(
  fit,
  data = test,
  pval = FALSE,
  conf.int = FALSE,
  censor = FALSE,
  xlab = 'Months from diagnosis',
  ylab = 'Survival probability',
  palette = c("#D95F02", "#7570B3", "#66A61E", "#E6AB02"),
  surv.median.line = "hv",
  risk.table = TRUE,
  tables.height = 0.2,
  tables.theme = theme_cleantable(),
  ggtheme = theme_bw(),
  font.legend = c(8, "bold", "darkblue"),
  font.x = 12,
  font.y = 12,
  font.tickslab = 12
)

# stack the survival curves on top of the risk table
o1 <- ggarrange(o1$plot, o1$table, ncol = 1, nrow = 2, heights = c(2, 0.7))

myfile <- 'KaplanMeier_CD8T_FLT3_ITD_D1.pdf'
pdf(myfile, onefile = TRUE, width = 6, height = 5)
print(o1)
dev.off()


```