-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathremove_relateds.py
181 lines (164 loc) · 6.3 KB
/
remove_relateds.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
#!/usr/bin/env python
"""This program outputs a set of unrelateds, with parameters specified by the user"""
# Usage: pyhton3.6 remove_relateds.py ponderosa_pairs_file.txt king.seg fam_file.fam <5th, 4th, 3rd, 2nd>
# the last parameter is the maximum degree of relatedness to allow
# Author: CW
import sys
import os.path
try:
import pandas as pd
import numpy as np
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis
except:
print('Unable to find either pandas, numpy or sklearn. If using augrabies, rerun the command but replace "python" with "/software/anaconda3/4.5.12/lssc0-linux/bin/python3.6"')
print("Alternatively, run with a virtual environment and pip install pandas, numpy, and sklearn.")
sys.exit()
__author__ = "Cole Williams"
__email__ = "[email protected]"
try:
degree = sys.argv[-1]
fam_file = sys.argv[-2]
king_file = sys.argv[-3]
pairs_file = sys.argv[-4]
except:
print("Please format command as follows: python remove_relateds.py [pairs file] [king file] [fam file] [degree threshold]")
print("The pairs file is the .pairs file generated from PONDEROSA.")
print("The pairs file should be included to most accurately remove relateds.")
print("If you do not wish to include the pairs file, replace it with None and the script will use KING's degree inference.")
print("Alternatively, you can replace this argument with a float indicating the max pi hat value to be considered unrelated.")
print("The king file is the .seg file generated by KING.")
print("The fam file is the .fam file of individuals to consider. Assumes all individuals in the 2nd column (IID col) are genotyped.")
print("The degree threshold is the maximum degree of relatedness to include as unrelated, either 5th, 4th, 3rd, or 2nd")
print("For instance, using 3rd will include all pairs who are 3rd degree related or less")
sys.exit()
def find_unrelated(degree,king_file,fam_file,pairs_file):
ibd_data = {(i.split()[1],i.split()[3]):[float(i.split()[6]),float(i.split()[7])] for i in open(king_file).readlines()[1:]}
fam_list = [i.split()[1] for i in open(fam_file).readlines()]
sd = 2
degree_list = ["5th","4th","3rd","2nd","FS","PO"]
def lda_classif(degree,pairs_df):
def in_sd(ibd1_val,data):
return abs(ibd1_val-data[0]) < (sd*data[1])
degrees = degree_list[degree_list.index(degree):]
pairs_df = pairs_df[pairs_df["DEGREE"].isin(degrees)]
mean_stdev = {}
for i in degrees:
mean_stdev[i] = [pairs_df[pairs_df["DEGREE"]==i]["IBD1"].mean(),pairs_df[pairs_df["DEGREE"]==i]["IBD1"].std()]
pairs_df["KEEP"] = pairs_df.apply(lambda x: in_sd(x.IBD1,mean_stdev[x.DEGREE]),axis=1)
pairs_df = pairs_df[pairs_df["KEEP"]].copy()
train_val,train_lab = pairs_df[["IBD1","IBD2"]].values.tolist(),pairs_df["DEGREE"].values.tolist()
classif = LinearDiscriminantAnalysis().fit(train_val,train_lab)
return classif
def fifth_classif(pairs_df):
pairs_df = pairs_df[pairs_df["DEGREE"]=="4th"].copy()
mean,stdev = pairs_df["PIHAT"].mean(),pairs_df["PIHAT"].std()
return mean - (sd*stdev)
def king_classif(degree):
related = degree_list[degree_list.index(degree):][1:]
return [(i.split()[1],i.split()[3]) for i in open(king_file).readlines()[1:] if i.split()[-1] in related]
def get_coeff(iid1,iid2):
if (iid1,iid2) in ibd_data:
return ibd_data[(iid1,iid2)]
if (iid2,iid1) in ibd_data:
return ibd_data[(iid2,iid1)]
return [0,0]
def unrelated(iid1,iid2,degree,classif,mode):
ibd1,ibd2 = get_coeff(iid1,iid2)
pihat = (0.5*ibd1) + ibd2
if mode == "float":
return pihat <= classif
if mode == "king":
return not((iid1,iid2) in classif or (iid2,iid1) in classif)
if mode == "fifth":
return pihat < classif
if mode == "lda":
pred_deg = classif.predict([[ibd1,ibd2]])[0]
return pred_deg in degree_list[:degree_list.index(degree)+1]
def init_list(degree,classif,mode):
out_list = []
for iid1 in fam_list:
l = [0,iid1]
for iid2 in fam_list:
if iid1 == iid2:
continue
if unrelated(iid1,iid2,degree,classif,mode):
l[0] += 1
out_list.append(l)
out_list.sort(reverse=True)
return out_list
def pick_new(cur_list,iid_list,degree,classif,mode):
found = False
for iid in iid_list:
iid1 = iid[1]
add = True
for iid2 in cur_list:
if iid1 == iid2:
add = False
continue
if not unrelated(iid1,iid2,degree,classif,mode):
add = False
break
if add:
found = True
break
if found:
iid_list.remove(iid)
cur_list.append(iid1)
return cur_list
def make_list(degree,classif,mode,iid_list,unrelated_list):
stop = False
while not stop:
cur_len = len(unrelated_list)
unrelated_list = pick_new(unrelated_list,iid_list,degree,classif,mode)
new_len = len(unrelated_list)
if cur_len == new_len:
stop = True
return unrelated_list
def iterate_lists(degree,classif,mode):
unrelated_len = 0
iid_list = init_list(degree,classif,mode)
master_copy = iid_list[:]
starting_iid = []
while unrelated_len < master_copy[0][0]:
unrelated_list = make_list(degree,classif,mode,iid_list[:],[master_copy[0][1]])
unrelated_len = len(unrelated_list)
starting_iid.append([unrelated_len,master_copy[0][1],unrelated_list])
master_copy = master_copy[1:]
if master_copy == []:
break
break
starting_iid.sort(reverse=True)
return starting_iid[0][2]
if os.path.isfile(pairs_file):
pairs_df = pd.read_csv(pairs_file,delim_whitespace=True)
pairs_df = pairs_df.dropna(subset=["IBD1","IBD2","PIHAT"])
if degree == "5th":
mode = "fifth"
classif = fifth_classif(pairs_df)
else:
mode = "lda"
classif = lda_classif(degree,pairs_df)
else:
try:
mode = "float"
classif = float(pairs_file)
except:
mode = "king"
classif = king_classif(degree)
print("mode: %s" % mode)
print("degree: %s" % degree)
unrelated_list = iterate_lists(degree,classif,mode)
print("Total unrelateds: %s" % len(unrelated_list))
write_df = pd.DataFrame(unrelated_list,columns=["IID"])
write_df.to_csv(path_or_buf="unrelateds.csv")
max_ibd = []
for iid1 in unrelated_list:
unrelated_list = unrelated_list[1:]
for iid2 in unrelated_list:
ibd1,ibd2 = get_coeff(iid1,iid2)
pihat = (0.5*ibd1) + ibd2
max_ibd.append(pihat)
print("Max pi hat of unrelateds: %s" % max(max_ibd))
print('Writing to "unrelateds.csv" in the current directory. Here is a preview:')
print(write_df.head())
find_unrelated(degree,king_file,fam_file,pairs_file)