# makeHapFile.py
import os
import sys
import pandas as pd
import numpy as np
from collections import OrderedDict
from scipy import stats
import argparse
"""
A Python script to convert reference genome panel (haplotype) data to the HAPLOTYPE format required by
phaseExtender. The output is used as simulated data to test the phasing quality produced by phaseExtender.
"""
# print(os.getcwd())
# set the working directory
# os.chdir("/home/priyanka/Dropbox/SharedWithHarish/TestSwitchErrors/HapMap3_r2_b36_2009/Contd_PhaseQC_PhaseExtender/")
# print(os.getcwd())
def main():
parser = argparse.ArgumentParser()
print()
    print(
        "Purpose of this application: simulate ReadBackPhased haplotypes from the provided *.haps, *.legend and *.sample files."
    )
parser.add_argument(
"-haps",
help='the "*.haps" file that contains haplotype data in numeric format',
required=True,
)
parser.add_argument(
"-legend",
help='the "*.legend" file that contains genomic coordinates and genotypes as IUPAC bases.',
required=True,
)
parser.add_argument(
"-sample_file",
help='the "*.sample" file that contains sample names and have their position (rows) \
matched with column position in "*.haps" file.',
required=True,
)
parser.add_argument(
"-chr",
        help='The chromosome name that the "*.legend" and "*.haps" files belong to.',
required=True,
)
parser.add_argument(
"-samples",
help='The samples from "*.sample" file that are to be converted to RBphased data.',
required=True,
)
parser.add_argument(
"-output_truth", help="Write the truth HAPLOTYPE file.", required=True
)
parser.add_argument(
"-output_sim", help="Write the simulated HAPLOTYPE file.", required=True
)
# activate "args"
global args
args = parser.parse_args()
chr_ = args.chr
sample_f = args.sample_file
haps_f = args.haps
samples = args.samples
legend_f = args.legend
out_tr = args.output_truth
out_sim = args.output_sim
    ### pick samples (and their positions in the haplotype file) to convert to a haplotype file compatible with phaseExtender
    sample_of_interest = []
    if "," in args.samples:
        sample_of_interest = args.samples.split(",")
    else:
        sample_of_interest.append(args.samples)
# sample_of_interest = ['NA12891', 'NA12892']
"""
# Just some existing sample names
sample_of_interest = ['NA12891', 'NA12892', 'NA06989', 'NA10850', 'NA06984', 'NA07056',
'NA12045', 'NA11843', 'NA12890', 'NA12889', 'NA12347', 'NA12348',
'NA12273', 'NA12827', 'NA12342', 'NA12340', 'NA11894', 'NA12763',
'NA11881', 'NA12005']
"""
# from the "*.sample" file extract the hapltoype position for the sample of interest
# store the "sample name" and it's index position as a dictionary.
# with open('hapmap3_r2_b36_all.sample') as sample_file:
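    # NOTE: the index arithmetic below assumes the first line of the "*.sample"
    # file is a header (nth = 0), so the first sample row has nth = 1 and its two
    # haplotypes land in (0-based) columns 0 and 1 of the "*.haps" file once the
    # "- 1" shift is applied when the columns are read later.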
with open(sample_f) as sample_file:
sample_pos_dict = OrderedDict() # to store sample, position values
for nth, lines in enumerate(sample_file):
# print (nth, lines)
sample_id = lines.rstrip("\n").split(" ")[0]
if sample_id in sample_of_interest:
sample_pos_dict[sample_id] = [nth * 2 - 1, nth * 2]
# reduce the size of the list after the sample's index is found
sample_of_interest = [x for x in sample_of_interest if x != sample_id]
    # if any sample isn't found in the sample file, report and exit
    if len(sample_of_interest) != 0:
        print(
            'The following samples were not found in the provided "*.sample" file.'
        )
        print(sample_of_interest)
        print("Please remove those samples .... ")
        print("exiting .... ")
        print()
        sys.exit()
## load the "*.legend" file
# haps_legend = pd.read_csv('hapmap3_r2_b36_chr20.legend', sep = ' ')
haps_legend = pd.read_csv(legend_f, sep=" ")
# extract ref and alt allele from "*.legend" file.
ref_alt_data = haps_legend.drop(["rsID"], axis=1)
# ref_alt_data.insert(0, column='CHROM', value=20)
ref_alt_data.insert(0, column="CHROM", value=chr_)
ref_alt_data["a1"] = ref_alt_data["a0"] + "," + ref_alt_data["a1"]
ref_alt_data.rename(
index=str,
columns={"position": "POS", "a0": "REF", "a1": "all-alleles"},
inplace=True,
)
### Now, take the "*.legend" and "*.haps" file and prepare HAPLOTYPE file for each "sample of interest".
# haps_file = pd.read_csv('hapmap3_r2_b36_chr20.haps', sep = ' ', header=None)
haps_file = pd.read_csv(haps_f, sep=" ", header=None)
print("\nExtracting haplotype data for each sample of interest ..... ")
# Extract the haplotype from the "sample of interest"
# we will store the haplotype data as keys (sample id) and values (pandas df)
sample_hap_dict = OrderedDict()
nth = 1
for sample_name, sample_idx in sample_pos_dict.items():
print("Sample %s : " % nth, sample_name)
nth += 1
haplotype = pd.DataFrame() # empty dataframe
# extract the haplotype data
haplotype["hapA"] = haps_file[sample_idx[0] - 1]
haplotype["hapB"] = haps_file[sample_idx[1] - 1]
        # join the haplotype data with the legend file
        # the index doesn't matter here because we are doing a side-by-side join,
        # but the two files must have exactly the same length.
haplotype = haps_legend.join(haplotype)
# rename column "position" to "POS"
haplotype.rename(index=str, columns={"position": "POS"}, inplace=True)
# add chromosome name/number
# haplotype.insert(0, column='CHROM', value=20)
haplotype.insert(0, column="CHROM", value=chr_)
# drop the homozygous genotype position and then reset the index
haplotype = haplotype[haplotype["hapA"] != haplotype["hapB"]]
haplotype.reset_index(drop=True, inplace=True)
        ## convert the numeric haplotype into IUPAC bases
haplotype[sample_name + "_hapA"] = haplotype.apply(
lambda row: row["a0"] if row["hapA"] == 0 else row["a1"], axis=1
)
haplotype[sample_name + "_hapB"] = haplotype.apply(
lambda row: row["a0"] if row["hapB"] == 0 else row["a1"], axis=1
)
# add the converted haplotype to column "sample_name:PG_al"
haplotype[sample_name + ":PG_al"] = haplotype.apply(
lambda row: row[sample_name + "_hapA"] + "|" + row[sample_name + "_hapB"],
axis=1,
)
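        # e.g. hapA = "A" and hapB = "G" at a site yields "A|G" in the ":PG_al" column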
# drop the not required columns
haplotype.drop(
[
"rsID",
"a0",
"a1",
"hapA",
"hapB",
sample_name + "_hapA",
sample_name + "_hapB",
],
inplace=True,
axis=1,
)
##### ******************** #####
        ### Start the simulation to create RBphased haplotype blocks (number of RBphased blocks and size of each block)
        ## randomly create ReadBackPhased haplotype blocks
        # we will use the function "make_poissonish()" to get a list of RBphased block sizes
        hap_length = len(haplotype)  # total number of heterozygous variants in the haplotype
        avg_rbphased_size = np.random.choice([5, 6, 7])  # randomly chosen average RBphased block size
number_of_rbphased_blocks = int(round(hap_length / avg_rbphased_size))
## preparing random unique PI index to add to the dataframe
        # this part may be moved to a separate function in the future if desired.
# rbphased_blocks = make_poissonish(9366, 1500) #if a fixed "hap length" and "number of rbphased_blocks" is desired
rbphased_blocks = make_poissonish(hap_length, number_of_rbphased_blocks)
"""
        ## If you want to print statistics of the simulated RBphased block sizes and their distribution
print('\nLength of the haplotype')
print(len(haplotype))
print(hap_length, avg_rbphased_size, number_of_rbphased_blocks)
print('\nrbphased_blocks')
print(rbphased_blocks)
print('stats', np.mean(rbphased_blocks), np.median(rbphased_blocks),
stats.mode(rbphased_blocks), np.var(rbphased_blocks))
print(np.max(rbphased_blocks), np.min(rbphased_blocks))
"""
# now, split the whole phased data into RBphased chunks,
# create unique index for each RBphased chunk
unique_rbphased_index = [x + 1 for x in range(number_of_rbphased_blocks)]
        # repeat each unique index by its block size (from rbphased_blocks) ..
        # *Remember that the unique indices and the block sizes are index matched.
pi_index_values = [
[x] * y for x, y in zip(unique_rbphased_index, rbphased_blocks)
]
pi_index_values = [item for items in pi_index_values for item in items]
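        # e.g. block sizes [5, 7, 6] with unique indices [1, 2, 3] give
        # pi_index_values = [1, 1, 1, 1, 1, 2, 2, 2, 2, 2, 2, 2, 3, 3, 3, 3, 3, 3]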
# add the index values to dataframe as "sample name:PI"
# haplotype[sample_name + ':PI'] = pd.Series(pi_index_values).values
haplotype.insert(
loc=2, column=sample_name + ":PI", value=pd.Series(pi_index_values).values
)
## Store the "truth haplotype" set to file
truth_haplotype = haplotype[
["CHROM", "POS", sample_name + ":PI", sample_name + ":PG_al"]
]
## If you want to write the truth set data to a file before flipping the RBphase haplotype data
# haplotype.to_csv(sample_name + '_TruthSetHaplotype.txt', sep='\t', index = False,
# columns=['CHROM', 'POS', sample_name + ':PI', sample_name + ':PG_al'])
        ## Flip the allele order of RBphased blocks whose PI index is an even number
haplotype[sample_name + ":PG_al"] = haplotype.apply(
lambda row: "|".join(row[sample_name + ":PG_al"].split("|")[::-1])
if row[sample_name + ":PI"] % 2 == 0
else row[sample_name + ":PG_al"],
axis=1,
)
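        # e.g. a site phased "A|G" inside a block with an even PI becomes "G|A",
        # while sites in odd-PI blocks keep their original truth-set phase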
## Store the "simulated haplotype set" to a file.
simulated_haplotype = haplotype[
["CHROM", "POS", sample_name + ":PI", sample_name + ":PG_al"]
]
# * If you want to write the simulated RBphased haplotype data to a file
# haplotype.to_csv(sample_name + '_SimulatedRBphasedHaplotype.txt', sep='\t', index = False,
# columns=['CHROM', 'POS', sample_name + ':PI', sample_name + ':PG_al'])
## add the simulated data (pandas dataframe) to an ordered dictionary
# sample_hap_dict[sample_name] = haplotype
sample_hap_dict[sample_name] = [truth_haplotype, simulated_haplotype]
## Now, merge the dataframe (haplotype from several samples) into a single HAPLOTYPE file
truth_haplotype_file = pd.DataFrame()
simulated_haplotype_file = pd.DataFrame()
for ks, vs in sample_hap_dict.items():
if len(truth_haplotype_file) == 0:
truth_haplotype_file = vs[0]
simulated_haplotype_file = vs[1]
else:
truth_haplotype_file = truth_haplotype_file.merge(
vs[0], on=["CHROM", "POS"], how="outer"
).fillna(".")
simulated_haplotype_file = simulated_haplotype_file.merge(
vs[1], on=["CHROM", "POS"], how="outer"
).fillna(".")
    ### Finally, merge the haplotype of each sample with the "ref-alt" data to create the complete HAPLOTYPE file
final_haplotype_file = ref_alt_data.merge(
truth_haplotype_file, on=["CHROM", "POS"], how="inner"
)
# Write the "HAPLOTYPE DATA" to a file.
print('\nWriting truth set haplotype data to a file "%s" .... ' % (out_tr))
final_haplotype_file.to_csv(out_tr, sep="\t", index=False)
## write the final simulated haplotype for all the samples
final_simulated_haplotype_file = ref_alt_data.merge(
simulated_haplotype_file, on=["CHROM", "POS"], how="inner"
)
# Write the simulated "HAPLOTYPE DATA" to a file.
print('\nWriting simulated set haplotype data to a file "%s" .... ' % (out_sim))
final_simulated_haplotype_file.to_csv(out_sim, sep="\t", index=False)
print("\nProcess completed !!! :) :) ")
print()
# this function is designed to create right-skewed simulated data;
# each number represents the size of one RBphased haplotype block.
# The sum of the block sizes must equal the total number of heterozygous variants.
def make_poissonish(n, num_bins):
    if n > 30 * num_bins:
        print("requested n exceeds 30 per bin")
        sys.exit(-1)
    if n < 3 * num_bins:
        print("requested n cannot fill 3 per bin")
        sys.exit(-1)
# Disperse minimum quantity per bin in all bins, then determine remainder
lst = [3 for _ in range(num_bins)]
number_remaining = n - num_bins * 3
# Allocate counts to all bins using a truncated Poisson
for i in range(num_bins):
# dial the rate up or down depending on whether we're falling
# behind or getting ahead in allocating observations to bins
rate = number_remaining / float(num_bins - i) # avg per remaining bin
# keep generating until we meet the constraint requirement (acceptance/rejection)
while True:
x = np.random.poisson(rate)
            if x <= 27 and x <= number_remaining:  # 3 (base) + 27 keeps every bin at <= 30
break
# Found an acceptable count, put it in this bin and move on
lst[i] += x
number_remaining -= x
# If there are still observations remaining, disperse them
# randomly across bins that have remaining capacity
while number_remaining > 0:
i = np.random.randint(0, num_bins)
if lst[i] >= 30: # not this one, it's already full! #* changed
continue
lst[i] += 1
number_remaining -= 1
return lst
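

# A quick sanity check for make_poissonish (commented out; exact values vary by RNG seed):
# blocks = make_poissonish(100, 10)
# assert sum(blocks) == 100 and all(3 <= b <= 30 for b in blocks)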
if __name__ == "__main__":
main()