-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathcreate_sliding_window_graph.py
executable file
·86 lines (64 loc) · 3.05 KB
/
create_sliding_window_graph.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
# Sliding Window Script
# Author: Leah Roberts (Scott Beatson Lab)
# Date: 05-09-14
# This script is designed to iteratively count the number of genes
# within a predefined window size. The output is a csv file, which
# gives the start and end location of the window, as well as the
# number of specified genes within that region.
# This csv file can also be easily parsed into a single text file
# suitable as input into EasyFig. Input into Easyfig will result
# in a graph above the genome figure corresponding to the number
# of genes in each window across the genome.
# To parse the results to a text file for EasyFig, type:
### cat results.txt | cut -f3 -d, > results_easyfig.txt
import glob
from Bio import SeqIO
from Bio.SeqFeature import FeatureLocation
# The infile need to be in a directory named "Genomes"
# The infile needs to be in embl format
in_files = glob.glob('Genomes/*.embl')
# Defines the window size that the script will move through
# This can be changed based on preference, but will make
# the script run slower.
window_size = 100000
print_counter = 0
result = 0
for f in in_files:
cur_genome = SeqIO.parse(f, "embl") # Parses the input file
for record in cur_genome:
print "infile = %s" % (record.description) # Prints the name of the infile (can also be "record.id")
sequence_length = len(record)
start_pos = 1 # Defines the starting position
end_of_line = len(record) - window_size # Defines the end of the "sliding window"
# Define the end position:
# The end position is either:
# The start position + the window size; or
# The very end of the sequence (if the start position moves
# within the last sliding window region)
if start_pos >= end_of_line:
end_pos = len(record)
else:
end_pos = start_pos + window_size
print "starting analysis..."
while start_pos <= sequence_length:
for feat in record.features:
if feat.type == "operon": # Defines which gene type to count - this can be changed
if feat.location.end in range(start_pos, end_pos):
result += 1
newfile = open("results.txt", "a+") # Opens a new file to write results
newfile.write("%i, %i, %i\n" % (start_pos, end_pos, result)) # Writes out results
IS_count = 0 # Resets the counter for the new region
start_pos += 100 # Moves to the next region in predefined increments - these can be changed
end_pos += 100 # Moves the end position - this needs to be the same as start_pos
print_counter += 100 # This counts the number of positions that have been iterated through
# The 'if' statement below is used to check that the script is still working
# As described above, the print_counter variable tracks the number of positions
# that have been iterated through.
# Once the print counter reaches 500000 (or any predefined number), it prints
# a working statement.
# The counter then resets to zero, and the counting starts over again.
if print_counter == 500000:
print "500,000 positions have passed..."
print_counter = 0
newfile.close()
print "finished!"