forked from sjackman/pgcpdna
-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathMakefile
169 lines (124 loc) · 4.69 KB
/
Makefile
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
# Annotate and visualize the white spruce plastid genome
# Written by Shaun Jackman @sjackman
name=pg29-plastid
ref=NC_021456
# Picea abies chloroplast complete genome
edirect_query='Picea abies[Organism] chloroplast[Title] complete genome[Title] RefSeq[Keyword]'
all: $(name).gff.gene $(name).gbk.png $(name)-manual.gbk.png \
$(name)-manual.tbl $(name)-manual.tbl.gene $(name)-manual.sqn
clean:
rm -f $(name).orig.gff $(name).gff $(name).orig.gbk $(name).gbk $(name).gbk.png \
$(name)-manual.tbl $(name)-manual.tbl.gene $(name)-manual.sqn
install-deps:
brew install edirect genometools maker ogdraw tbl2asn
pip install --upgrade biopython bcbio-gff
.PHONY: all clean install-deps
.DELETE_ON_ERROR:
.SECONDARY:
# Fetch data from NCBI
cds_aa.orig.fa cds_na.orig.fa: %.fa:
esearch -db nuccore -query $(edirect_query) \
|efetch -format fasta_$* >$@
cds_aa.fa cds_na.fa: %.fa: %.orig.fa
sed -E 's/^>(.*gene=([^]]*).*)$$/>\2|\1/' $< >$@
# asn,faa,ffn,fna,frn,gbk,gff,ptt,rnt,rpt,val
plastids/%:
mkdir -p plastids
curl -fsS http://ftp.cbi.pku.edu.cn/pub/database/Genome/Chloroplasts/$@ >$@
%.faa: plastids/%.gbk
bin/gbk-to-faa <$< >$@
%.frn: plastids/%.frn
sed 's/^>.*\[gene=/>/;s/\].*$$//' $< >$@
# Prodigal
# Annotate genes using Prodigal
%.prodigal.gff: %.fa
prodigal -c -m -g 11 -p meta -f gff -a $*.prodigal.faa -d $*.prodigal.ffn -s $*.prodigal.tsv -i $< -o $@
# Select Prodigal annotations not overlapping manual annotations using bedtools
%.prodigal.orf.gff: %.prodigal.gff %-manual.gff
bedtools intersect -v -header -a $< -b $*-manual.gff |gt gff3 -sort - >$@
# Extract DNA sequences of ORF features from a FASTA file
%.prodigal.orf.gff.fa: %.prodigal.orf.gff %.fa
gt extractfeat -type CDS -coords -matchdescstart -retainids -seqid -seqfile $*.fa $< >$@
# ARAGORN
# Annotate tRNA using ARAGORN
%.aragorn.tsv: %.fa
aragorn -gcbact -i -c -w -o $@ $<
# MAKER
# Annotate genes using MAKER
pg29-plastid.maker.output/stamp: %.maker.output/stamp: maker_opts.ctl %.fa $(ref).frn cds_aa.fa
maker -fix_nucleotides
touch $@
%.orig.gff: %.maker.output/stamp
gff3_merge -s -g -n -d $*.maker.output/$*_master_datastore_index.log >$@
%.gff: %.orig.gff
gsed '/rrn/s/mRNA/rRNA/; \
/trn/s/mRNA/tRNA/' $< \
|gt gff3 -addintrons - >$@
# Extract DNA sequences of GFF CDS features from a FASTA file
%.gff.CDS.fa: %.gff %.fa
gt extractfeat -type CDS -join -coords -matchdescstart -retainids -seqid -seqfile $*.fa $< >$@
# Extract aa sequences of GFF CDS features from a FASTA file
# Hack: The gene chlL uses an alternative GUG start codon.
%.gff.aa.fa: %.gff %.fa
gt extractfeat -type CDS -join -translate -coords -matchdescstart -retainids -seqid -seqfile $*.fa $< | \
sed 's/^VKIAVYGKGG/MKIAVYGKGG/' >$@
# Extract sequences of GFF intron features
%.gff.intron.fa: %.gff %.fa
gt extractfeat -type intron -coords -matchdescstart -retainids -seqid -seqfile $*.fa $< >$@
# Convert GFF to GenBank format
%.orig.gbk: %.gff %.fa
bin/gff_to_genbank.py $^
mv $*.gb $@
%.gbk: %-header.gbk %.orig.gbk
(cat $< && sed -En '/^FEATURES/,$${ \
s/Name=/gene=/; \
s/gene="([^|"]*)\|[^"]*"/gene="\1"/; \
p;}' $*.orig.gbk) >$@
# Organellar Genome Draw
%.gbf.png: %.gbf %.ircoord
drawgenemap --format png --infile $< --outfile $< \
--gc --ircoord `<$*.ircoord` \
--density 126
mogrify -units PixelsPerInch -density 300 $@
%.gbk.png: %.gbk %.ircoord
drawgenemap --format png --infile $< --outfile $< \
--gc --ircoord `<$*.ircoord` \
--density 126
mogrify -units PixelsPerInch -density 300 $@
# GenomeTools sketch
%.gff.png: %.gff
gt sketch $@ $<
# Report the annotated genes
%.gff.gene: %.gff
sort -k1,1 -k4,4n -k5,5n $< \
|gsed -nE '/\tgene\t/!d; \
s/^.*Name=([^|;]*).*$$/\1/; \
s/-gene//; \
p' >$@
# Generate a tbl file for tbl2asn and GenBank submission
%-gene-product.tsv: %.gff
(printf "gene\tproduct\n" \
&& sed -En 's/%2C/,/g;s~%2F~/~g; \
s/^.*gene=([^;]*);.*product=([^;]*).*$$/\1 \2/p' $< |sort -u) >$@
# Convert GFF to TBL
%.tbl: %.gff %-product.tsv %.gff.aa.fa
bin/gff3-to-tbl --centre=BCGSC --locustag=OU3CP $^ >$@
# Extract the names of genes from a TBL file
%.tbl.gene: %.tbl
awk '$$1 == "gene" {print $$2}' $< >$@
# Add structured comments to the FASTA file
%.fsa: %.fa
(echo '>1 [organism=Picea glauca] [location=chloroplast] [completeness=complete] [topology=circular] [gcode=11]'; \
tail -n +2 $<) >$@
# tbl2asn
# Convert TBL to GBF and SQN
%.gbf %.sqn: %.fsa %.sbt %.tbl %.cmt
tbl2asn -i $< -t $*.sbt -w $*.cmt -Z $*.discrep -Vbv
gsed -i 's/DEFINITION Picea glauca/& chloroplast complete genome/' $*.gbf
# Symlinks
pg29-plastid-manual.fa: pg29-plastid.fa
ln -s $< $@
pg29-plastid-manual.ircoord: pg29-plastid.ircoord
ln -s $< $@
pg29-plastid-manual-header.gbk: pg29-plastid-header.gbk
ln -s $< $@