-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathread_cgcnt.R
106 lines (91 loc) · 4.53 KB
/
read_cgcnt.R
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
# the function reads in summarised counts of c/g from uniquely pseudoaligned reads
# the counts from .cgcount files are summarised in long form
## ZC test
# location_cgcnt="/Users/zacc/Documents/MtSinai/5mC_BrainBlood/Paper/re_analysis/tissuecell_analysis/test"
# cg_count<-read_cgcnt(location_cgcnt)
# Internal helper: list the "cgcount" files directly inside `location_cgcnt`
# and flag the ones whose file name carries a "_true_" or "_false_" tag.
# The tag is matched against the base name only: every listed file shares the
# same directory, so the directory part of the path cannot tell files apart.
# Returns a list with `paths` (full paths) and `is_tf` (logical, same length).
.cgcnt_list_files <- function(location_cgcnt) {
  paths <- list.files(location_cgcnt, pattern = "cgcount", full.names = TRUE)
  list(paths = paths, is_tf = grepl("_false_|_true_", basename(paths)))
}

# Internal helper: read one .cgcount file and return it in long form, one row
# per (counts value, assay column) pair, with the nine columns
# counts, sample, assay_long, u_psdcnt, assay, sample_name, psd_cell, ctga, cg.
.cgcnt_parse_file <- function(path) {
  # Whitespace-separated table with a header; only the first 100 data rows
  # are read (cap kept from the original implementation).
  df <- read.delim(path, sep = "", header = TRUE, nrows = 100,
                   na.strings = "", stringsAsFactors = FALSE)
  cols <- colnames(df)
  if (!("counts" %in% cols)) {
    stop(paste0("file ", basename(path), " has no 'counts' column"),
         call. = FALSE)
  }

  # Ignore the X0_C / X0_G columns and any column matching "counts." (i.e.
  # "counts" followed by at least one character, such as a repeated "counts"
  # header renamed to "counts.1"). A logical mask is used so that "no match"
  # keeps all columns instead of dropping them.
  ignored <- cols %in% c("X0_C", "X0_G") | grepl("counts.", cols)
  assay_cols <- cols[!ignored & grepl("chr", cols)]

  n_rows <- nrow(df)
  n_assays <- length(assay_cols)
  n_long <- n_rows * n_assays

  # Split each assay column name once on "_"; fields beyond the end of a
  # name come back as NA.
  fields <- strsplit(assay_cols, "_", fixed = TRUE)
  field <- function(k) vapply(fields, function(p) p[k], character(1))
  # Long form stacks the assay columns one after another, so per-assay values
  # are repeated for each data row.
  per_row <- function(x) rep(x, each = n_rows)

  sample <- basename(path)
  data.frame(
    # Kept as character for backward compatibility: the previous
    # implementation returned this column as character.
    counts = as.character(rep(df[["counts"]], times = n_assays)),
    sample = rep(sample, n_long),
    assay_long = per_row(assay_cols),
    u_psdcnt = as.numeric(unlist(df[assay_cols], use.names = FALSE)),
    # assay name: first three "_"-separated fields of the column name
    assay = per_row(paste(field(1L), field(2L), field(3L), sep = "_")),
    # sample name: file name without "_cgcount"
    sample_name = rep(gsub("_cgcount", "", sample), n_long),
    psd_cell = per_row(field(6L)),  # 6th field
    ctga = per_row(field(4L)),      # 4th field (CT/GA)
    cg = per_row(field(7L)),        # 7th field (C/G)
    stringsAsFactors = FALSE
  )
}

# Internal helper: parse every file in `file.names` and stack the results.
# Stops if `file.names` is empty or if any file contains no lines.
.cgcnt_read_files <- function(file.names) {
  if (length(file.names) == 0L) {
    stop("no matching cgcount files found", call. = FALSE)
  }
  tables <- lapply(seq_along(file.names), function(i) {
    # Reading a single line is enough to detect an empty file.
    if (length(readLines(file.names[i], n = 1L)) == 0) {
      stop(paste0("file ", i, " has no lines"), call. = FALSE)
    }
    .cgcnt_parse_file(file.names[i])
  })
  do.call(rbind, tables)
}

# Read the summarised c/g counts from the .cgcount files in `location_cgcnt`,
# skipping files whose name contains "_true_" or "_false_".
#
# Args:
#   location_cgcnt: path of the directory holding the "cgcount" files.
# Returns:
#   A long-form data.frame with columns counts, sample, assay_long, u_psdcnt
#   (numeric), assay, sample_name, psd_cell, ctga and cg.
# Errors:
#   Stops if no file matches or if a matching file has no lines.
read_cgcnt <- function(location_cgcnt) {
  found <- .cgcnt_list_files(location_cgcnt)
  .cgcnt_read_files(found$paths[!found$is_tf])
}

# Same as read_cgcnt(), but reads only the files whose name contains
# "_true_" or "_false_".
#
# Args:
#   location_cgcnt: path of the directory holding the "cgcount" files.
# Returns:
#   A long-form data.frame with columns counts, sample, assay_long, u_psdcnt
#   (numeric), assay, sample_name, psd_cell, ctga and cg.
# Errors:
#   Stops if no file matches or if a matching file has no lines.
read_tfcgcnt <- function(location_cgcnt) {
  found <- .cgcnt_list_files(location_cgcnt)
  .cgcnt_read_files(found$paths[found$is_tf])
}