-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathread_data.R
81 lines (63 loc) · 2.83 KB
/
read_data.R
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
setwd("~/MEGA/Master/PG/project/")
library("snpMatrix")
library("ggmap")
library("MASS")
library("magrittr")
library("dplyr")
library("tidyr")
library("ggplot2")
library("cowplot")
#library("cowplot")
library("ggthemes")
library("gplots")
theme_set(theme_minimal())
source("scripts/functions.R")
library("RColorBrewer")
myPalette <- brewer.pal(n = 5, name = "Dark2")
width <- 15
height <- 7.82
zebras <- read.table("zebra_package/ourPlainsQuaggaGrevys_popInfo.txt", header = T)
zebras_original <- zebras
zebras %>% group_by(country) %>% summarise(count = n())
zebras %>% group_by(subspecies) %>% summarise(count = n())
bim <- read.table("zebra_package/ourPlainsQuaggaGrevys_maxMissing0.2_maf.bim",
col.names = c("chr", "id", "morgan", "coord", "v1", "v2"))
longlat <- read.table("zebra_package/longlat.txt")
lon <- longlat$lon
lat <- longlat$lat
input_data <- "zebra_package/plainsQuaggaGrevys2017_maxMissing0.2_maf_sorted"
prefix <- "plainsQuaggaGrevys2017_maxMissing0.2_maf_sorted"
#input_data <- "../practica/mon27/pruneddata"
data <- read.plink(input_data)
geno <- as.integer([email protected]) %>% matrix(nrow = nrow([email protected])) %>% t
# Recode so that 0 1 2 indicates the # of the allele in .bim
geno[geno == 0] <- NA
geno <- geno - 1
#table(geno)
unfiltered_geno <- geno # save unfiltered geno
#geno <- temp
ROH <- select_SNPs(geno) # select meaningful SNPs only
bim_ROH <- bim[ROH$selector,] %>% arrange(chr, coord)
ROH <- cbind(zebras, ROH %>% t %>% as.data.frame) %>% dplyr::arrange(species, subspecies) # bind zebras info and geno databas
result <- select_SNPs(geno, bim) # select meaningful SNPs only
geno <- result$geno # return filtered database
selector <- result$selector # return logical vector showing which are the meaningful SNPs
bim <- bim[selector,] # subset the bim file with this logical vector
bim_sorted <- bim %>% arrange(chr, coord) # sort the bim file according to chromosome and coordinate
# that way SNPs follow the genome
selector2 <- bim_sorted$id2 # generate new selector
# showing how to select the SNPs
# in the original databse so that they follow the right order
full_dataset <- cbind(zebras, geno %>% t %>% as.data.frame) %>% dplyr::arrange(species, subspecies) # bind zebras info and geno databas
# Sort geno by species and subspecies by extracting the geno columns in the sorted full_dataset
# Full dataset is sorted by species, subspecies
geno <- dplyr::select(full_dataset, -(ID:species)) %>% as.matrix() %>% t
full_lonlat <- cbind(lon, lat, full_dataset[-63,])
clusters <- readRDS(".clusters.RDS")
full_lonlat$clusters <- as.factor(clusters[-63]) # exclude quagga
zebras <- full_dataset %>% dplyr::select(ID:species)
subspecies <- zebras[, 5]
species <- zebras[, 6]
homo <- !(geno == 1)
subPlains <- c("plains", levels(subspecies)) %>% as.list
geno_sorted <- full_dataset %>% .[,7:ncol(full_dataset)]