Skip to content

Commit

Permalink
Merge pull request #1 from darcyabjones/master
Browse files Browse the repository at this point in the history
Merge changes from main repo
  • Loading branch information
darcyabjones authored Oct 31, 2019
2 parents fa119a3 + d661c0b commit bd33a69
Show file tree
Hide file tree
Showing 8 changed files with 302 additions and 36 deletions.
1 change: 1 addition & 0 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -226,6 +226,7 @@ You can also raise an issue on the github repository and I'll try to help.
| `--mmseqs_max_evalue` | 0.001 | The maximum e-value to use to consider `mmseqs` profile matches against the genomes significant. |
| `--min_intra_frequency` | 4 | The minimum number of copies a clustered repeat family must have within a genome for it to be considered "present". |
| `--min_inter_proportion` | 0.2 | The minimum proportion of genomes that the clustered repeat family must be present in (after `--min_intra_frequency`) to be considered a geniune family. |
| `--repeatmodeler_min_len` | 10000 | The minimum scaffold length to allow for predicting repeats in repeatmodeler. Scaffolds smaller than this will be removed to avoid sampling bias. |
| `--eahelitron_three_prime_fuzzy_level` | 3 | Passed on to the EAHelitron parameter `-r`. |
| `--eahelitron_upstream_length` | 3000 | Passed on to the EAHElitron parameter `-u`. |
| `--eahelitron_downstream_length` | 50 | Passed on to the EAHelitron parameter `d`. |
Expand Down
136 changes: 136 additions & 0 deletions bin/exclude_contained_subseqs.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,136 @@
#!/usr/bin/env python3

import sys
import argparse
import re

from collections import defaultdict

from Bio import SeqIO

from intervaltree import Interval, IntervalTree


def cli(prog, args):
parser = argparse.ArgumentParser(
prog=prog,
description=""" Extracts all sequences from a fasta msa file into a
stockholm alignment file.
"""
)

parser.add_argument(
"infile",
type=argparse.FileType('r'),
help="Input fasta file.",
)

parser.add_argument(
"-o", "--outfile",
default=sys.stdout,
type=argparse.FileType('w'),
help=("Output filtered fastas."),
)

parser.add_argument(
"-c", "--coverage",
default=0.95,
type=float,
help="The coverage of the smaller sequence required to exclude."
)

return parser.parse_args(args)


def parse_id_as_interval(id_string, regex):
""" The fasta ids contain the locus information. """

match = regex.match(id_string)
genome = match.group("genome")
seqid = match.group("seqid")
start_tmp = int(match.group("start"))
end_tmp = int(match.group("end"))

start = min([start_tmp, end_tmp])
end = max([start_tmp, end_tmp])
del start_tmp
del end_tmp

return (genome, seqid, start, end)


def coverage(left, right):
intersection_begin = max([left.begin, right.begin])
intersection_end = min([left.end, right.end])
size = intersection_end - intersection_begin
return size / min([left.length(), right.length()])


def main():
args = cli(sys.argv[0], sys.argv[1:])
regex = re.compile(r"^>?(?P<genome>[^:\s]+):(?P<seqid>[^:\s]+)"
r":(?P<start>\d+)-(?P<end>\d+)")

itree = defaultdict(IntervalTree)
seqs = {s.id: s for s in SeqIO.parse(args.infile, format="fasta")}

for id_ in seqs.keys():
genome, seqid, start, end = parse_id_as_interval(id_, regex)
interval = Interval(start, end, data=id_)

if interval in itree[(genome, seqid)]:
continue

envelops = itree[(genome, seqid)].envelop(interval)
overlaps = itree[(genome, seqid)].overlap(interval)

# If the interval completely overlaps one already in the tree
# replace it. This would be covered by overlaps, by should be faster.
if len(envelops) > 0:
itree[(genome, seqid)].remove_overlap(start, end)
itree[(genome, seqid)].add(interval)

# If the interval partially overlaps one, or is completely contained
# by an interval in the tree, we interrogate further.
elif len(overlaps) > 0:
to_remove = []
add_to_tree = True

for i in overlaps:
cov_match = coverage(i, interval) > args.coverage

# If the coverage of the shorter interval is above a threshold
# and the interval already in the tree is the shorter one,
# we flag it for replacement.
if cov_match and i.length() < interval.length():
to_remove.append(i)

# If the new interval was the shorter of the intervals.
elif cov_match:
add_to_tree = False
break

# We reached all the way through without discarding the new
# interval.
if add_to_tree:
for i in to_remove:
itree[(genome, seqid)].remove(i)

itree[(genome, seqid)].add(interval)

# If it doesn't overlap the sequence at all.
else:
itree[(genome, seqid)].add(interval)

for (genome, seqid), subitree in itree.items():
SeqIO.write(
(seqs[i.data] for i in subitree),
args.outfile,
format="fasta"
)

return


if __name__ == "__main__":
main()
7 changes: 7 additions & 0 deletions bin/repeatmodeler2gff.py
Original file line number Diff line number Diff line change
Expand Up @@ -73,8 +73,15 @@ def parse_block(handle, source, type_):
elif not line.startswith("#"):
ids.append(line.split(maxsplit=1)[0])

seen = set()

out = []
for id_ in ids:
if id_ in seen:
continue
else:
seen.add(id)

seqid, start, end, strand = id_to_loc(id_)

if len(species) == 0:
Expand Down
2 changes: 0 additions & 2 deletions bin/rmout2gff3.py
Original file line number Diff line number Diff line change
Expand Up @@ -233,12 +233,10 @@ def as_gff3(self, source="RepeatMasker"):
ontology_terms.extend(["SO:0000206", "SO:SINE_element"])

if "helitron" in self.kind.lower():
type_ = "helitron"
ontology_terms.extend(["SO:0000544", "SO:helitron"])

if ("maverick" in self.kind.lower()
or "polinton" in self.kind.lower()):
type_ = "polinton"
ontology_terms.extend(["SO:0001170", "SO:polinton"])

if "mite" in self.kind.lower():
Expand Down
38 changes: 25 additions & 13 deletions bin/stk2fasta.py
Original file line number Diff line number Diff line change
@@ -1,12 +1,12 @@
#!/usr/bin/env python3

import re
import sys
import argparse

from Bio import SeqIO
from Bio import AlignIO

from Bio.Alphabet import Gapped, SingleLetterAlphabet
from Bio.Seq import Seq
from Bio.SeqRecord import SeqRecord


def cli(prog, args):
Expand Down Expand Up @@ -34,21 +34,33 @@ def cli(prog, args):
return parser.parse_args(args)


def line_to_seq(line, regex):
sline = regex.split(line.strip(), maxsplit=1)
seq = SeqRecord(
id=sline[0],
name=sline[0],
description=sline[0],
seq=Seq(sline[1].replace("-", "").replace(".", "")),
)

return seq


def main():
args = cli(sys.argv[0], sys.argv[1:])
regex = re.compile(r"\s+")

seen = set()
seqs = []
alignments = AlignIO.parse(
args.infile,
format="stockholm",
alphabet=Gapped(SingleLetterAlphabet(), "-")
)
for line in args.infile:
if line.startswith("#") or line.startswith("/"):
continue

i = 1
for alignment in alignments:
for seq in alignment:
seq.seq = seq.seq.ungap()
seq = line_to_seq(line, regex)
if seq.id not in seen:
seqs.append(seq)
i += 1

seen.add(seq.id)

SeqIO.write(seqs, args.outfile, "fasta")
return
Expand Down
4 changes: 2 additions & 2 deletions conf/pawsey_zeus.config
Original file line number Diff line number Diff line change
Expand Up @@ -34,12 +34,12 @@ process {

withLabel: biggish_task {
cpus = 8
memory = 16.GB
memory = 31.GB
}

withLabel: big_task {
cpus = 16
memory = 32.GB
memory = 63.GB
}
}

Loading

0 comments on commit bd33a69

Please sign in to comment.