effective_sample_size_short_haploidX.py
#!/usr/bin/env python3
# Calculate effective sample sizes for pool-seq samples, following Ferretti et al. (2013)
import sys
import os
import glob
from math import factorial
import sympy
#chr_name = sys.argv[1]
#chr_name = input("Enter name of chromosome: ")
chr_name = "X"
# read-depth caps for pools of 40, 80, and 100 chromosomes
nr_max_40 = 200
nr_max_80 = 400
nr_max_100 = 500
# lookup tables: read depth -> effective sample size
ne_dict_40 = {}
ne_dict_80 = {}
ne_dict_100 = {}
## Effective sample size: the expected number of distinct chromosomes drawn.
# nr: number of reads, nc: number of chromosomes in the pool
# (for diploid autosomes nc is 2 x the number of individuals, e.g. 80;
#  for the X chromosome the ploidy depends on the pool's sex ratio,
#  e.g. 24 males/16 females)
def expectation_sympy(nr, nc):
    stirling = sympy.functions.combinatorial.numbers.stirling
    return float(sum(j * factorial(nc) * stirling(nr, j) / (factorial(nc - j) * nc**nr)
                     for j in range(1, nc + 1)))
# cap the read depth at nmax before computing the expectation
def effect_samp_size(nr, nc, nmax):
    if nr > nmax:
        return expectation_sympy(nmax, nc)
    else:
        return expectation_sympy(nr, nc)
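# --- Illustrative sanity check (added sketch, not part of the original script) ---
# The Stirling-number sum in expectation_sympy() is the expected number of
# distinct chromosomes seen after nr uniform draws with replacement from nc
# chromosomes; by linearity of expectation it should agree with the closed
# form nc * (1 - (1 - 1/nc)**nr).  Uncomment to spot-check a few values:
#assert abs(expectation_sympy(1, 40) - 1.0) < 1e-9
#assert abs(expectation_sympy(2, 40) - 40 * (1 - (39 / 40) ** 2)) < 1e-9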
# precompute lookup tables of effective sample size for each read depth
for nr40 in range(1, nr_max_40 + 1):
    ne_dict_40[nr40] = expectation_sympy(nr40, 40)
for nr80 in range(1, nr_max_80 + 1):
    ne_dict_80[nr80] = expectation_sympy(nr80, 80)
#for nr100 in range(1, nr_max_100 + 1):
#    ne_dict_100[nr100] = expectation_sympy(nr100, 100)
#this information isn't used in this script
#poly_sites = []
#poly_file = open("SNP_Sites", 'r')
#next(poly_file)
#for line in poly_file:
# line = line.rstrip('\n')
# poly_sites = poly_sites + [line]
#datfiles = glob.glob('home/mshpak/Lundflies/VCF/Pooled_Round1/CountFile_*_' + chr_name)
datfiles = glob.glob("CountFile_*_" + chr_name)
#datfiles = ['CountFile_SRR8439156_2L']
for fname in datfiles:
    # output file holds counts rescaled to the effective sample size
    dfile = open(fname, 'r')
    newfile = open("Effect_Size_" + fname, 'w')
    #newfile.write("A,C,G,T" + '\n')
    for line in dfile:
        line = line.rstrip('\n')
        tvec = list(map(int, line.split('\t')))
        rdepth = sum(tvec)
        # effective sample size
        if rdepth == 0:
            newfile.write('0' + '\t' + '0' + '\t' + '0' + '\t' + '0' + '\n')
        else:
            # cap the read depth so the lookup stays within the precomputed table
            neff = ne_dict_40[min(rdepth, nr_max_40)]
            tfreq = [num / rdepth for num in tvec]
            new_count = [freq * neff for freq in tfreq]
            newfile.write('\t'.join(str(x) for x in new_count) + '\n')
    dfile.close()
    newfile.close()
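# --- Illustrative input/output (assumed file format, not from the original script) ---
# Each line of a CountFile_* input is assumed to hold four tab-separated
# integer allele counts (A, C, G, T).  An input line such as
#     10    0    30    0
# has read depth 40, so the corresponding output line holds the same allele
# frequencies rescaled to sum to ne_dict_40[40] instead of 40.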