# The tests work in JAR's setup...
# $^ = all prerequisites
# $< = first prerequisite
# $@ = file name of target
# $(*F) = the file-within-directory part of the stem
#         (stem = % in pattern rules; $(*D) is the directory part)
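# Illustrative example (not a real rule in this Makefile): given
#   t/tax/%/taxonomy.tsv: t/src/%.txt t/edits/edits.tsv
# a request for t/tax/aster/taxonomy.tsv has stem "aster", so
# $< = t/src/aster.txt, $^ = t/src/aster.txt t/edits/edits.tsv,
# $@ = t/tax/aster/taxonomy.tsv, and $(*F) = aster.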
# Modify as appropriate to your own hardware - I set it one or two Gbyte
# below physical memory size
JAVAFLAGS?=-Xmx14G
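# Example override for a smaller machine (value illustrative):
#   make ott JAVAFLAGS=-Xmx8G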
all: compile
EXCL=--exclude="*~" --exclude=".??*" --exclude="#*" --exclude=debug
# External URLs. These may change from time to time
SILVA_ARCHIVE=https://www.arb-silva.de/no_cache/download/archive
NCBI_ORIGIN_URL=ftp://ftp.ncbi.nlm.nih.gov/pub/taxonomy/taxdump.tar.gz
GBIF_ORIGIN_URL=http://rs.gbif.org/datasets/backbone/backbone-current.zip
IRMNG_ORIGIN_URL=http://www.cmar.csiro.au/datacentre/downloads/IRMNG_DWC.zip
AMENDMENTS_ORIGIN_URL=https://github.com/OpenTreeOfLife/amendments-1.git
# ----- Version selection -----
-include config.mk
fetch: $(FETCHES)
# ----- Taxonomy source locations -----
# Where to put tarballs
#TARDIR=/raid/www/roots/opentree/ott
TARDIR?=tarballs
# -----
# Smasher related configuration
CP=-classpath ".:lib/*"
JAVA=JYTHONPATH=util java $(JAVAFLAGS) $(CP)
SMASH=$(JAVA) org.opentreeoflife.smasher.Smasher
CLASS=org/opentreeoflife/smasher/Smasher.class
JAVASOURCES=$(shell find org/opentreeoflife -name "*.java")
# ----- Targets
# The open tree reference taxonomy
# for debugging
ott: r/ott-NEW/source/debug/transcript.out
refresh/ott: r/ott-NEW/source/.made
bin/christen ott-NEW
# The works
# Reinstate later: r/ott-NEW/source/debug/otu_differences.tsv
r/ott-NEW/source/.made: r/ott-NEW/source/debug/transcript.out \
r/ott-NEW/source/version.txt \
r/ott-NEW/source/README.html
touch $@
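# Note on the recipe below: piping make-ott.py through tee would hide its
# exit status, so the recipe touches /tmp/make-ott-completed only on
# success and then tests for that file to decide whether the rule failed.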
r/ott-NEW/source/debug/transcript.out: bin/jython $(CLASS) \
make-ott.py assemble_ott.py \
curation/adjustments.py \
curation/amendments.py \
util/proposition.py \
curation/separation/taxonomy.tsv \
$(RESOURCES) \
r/idlist-HEAD/source/.made \
r/ott-HEAD/resource/.made \
curation/lamiales/taxonomy.tsv \
curation/h2007/tree.tre \
curation/edits/ott_edits.tsv \
ids_that_are_otus.tsv ids_in_synthesis.tsv \
inclusions.csv \
r/ott-NEW/source
@date
@rm -f *py.class util/*py.class curation/*py.class
@mkdir -p r/ott-NEW/source
@echo SET THE VERSION.
bin/put ott-NEW draft $$((1 + `bin/get ott-NEW draft 0`))
@echo Writing transcript to r/ott-NEW/source/debug/transcript.out
rm -f /tmp/make-ott-completed
(bin/jython make-ott.py ott-NEW 2>&1 \
&& touch /tmp/make-ott-completed) \
| tee r/ott-NEW/source/transcript.out.new
[ -e /tmp/make-ott-completed ]
bin/put ott-NEW "generated_on" `date +"%Y%m%d"`
bin/put ott-NEW "date" `date +"%Y%m%d"`
(cd r/ott-NEW/source && \
[ -e taxonomy.tsv ] && \
mkdir -p debug && \
mv *.* debug/ && \
mv debug/taxonomy.tsv debug/synonyms.tsv debug/forwards.tsv ./ && \
mv debug/transcript.out.new debug/transcript.out)
r/ott-NEW/source/version.txt: r/ott-NEW/source
echo `bin/get ott-NEW version`draft`bin/get ott-NEW draft` \
>r/ott-NEW/source/version.txt
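# E.g. with version 3.1 and draft 4, version.txt will contain "3.1draft4".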
r/ott-NEW/source/README.html: r/ott-NEW/source/debug/transcript.out util/make_readme.py
python util/make_readme.py r/ott-NEW/source/ >$@
r/ott-NEW/source: r/ott-HEAD/resource/.made
bin/put ott-NEW minor $$((1 + `bin/get ott-HEAD minor`))
bin/put ott-NEW draft 0
bin/put ott-NEW ott_idspace ott
mkdir $@
r/ott-HEAD/resource/.made: r/ott-HEAD/source/.made
(cd r/ott-HEAD && rm -f resource && ln -s source resource)
# kludge - it's set as a dependency in properties.json, but doesn't live in r/
# (needed for idlist)
r/h2007-HEAD/resource/.made:
r/study713-HEAD/resource/.made:
# ----- Taxonomy sources
# N.b. unpack-archive will fetch an archive if there isn't one already
# there
# Pattern rules!
# From the gnu make manual, re pattern rules:
# "all of its prerequisites (after pattern substitution) must name
# files that exist or can be made"
# For our purposes this means most pattern rules should have zero
# prerequisites.
# My experience is that this is not true: the pattern rule won't
# apply even if the prerequisite *can* be made.
# "It is possible that more than one pattern rule will meet these
# criteria. In that case, make will choose the rule with the
# shortest stem (that is, the pattern that matches most
# specifically). If more than one pattern rule has the shortest
# stem, make will choose the first one found in the makefile."
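# Illustrative example of stem choice (the second rule is hypothetical):
# given the rules 'unpack/%:' and 'unpack/silva-%:', a request for
# unpack/silva-128 matches both, but make picks unpack/silva-%, whose
# stem ("128") is shorter than unpack/%'s ("silva-128").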
fetch-all: $(UNPACKS)
# unpack does a fetch, if necessary
unpack/%:
bin/unpack-archive $(*F)
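# Usage example (version name illustrative): `make unpack/silva-128`
# runs `bin/unpack-archive silva-128` ($(*F) = the stem, silva-128).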
# (why doesn't this just use bin/set-head?)
unpack-to-head/%: unpack/%
@h=r/`bin/get $(*F) series`-HEAD && \
if [ -L $$h ]; then \
true; else \
echo $$h := $(*F); ln -s $(*F) $$h; \
fi
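# Usage example (version name illustrative): `make unpack-to-head/ncbi-9`
# unpacks ncbi-9 and, if r/ncbi-HEAD is not yet a symlink, links
# r/ncbi-HEAD -> ncbi-9.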
# store does a pack, if necessary
store/%:
if [ -e r/$(*F)/source/.made ]; then bin/store-archive $(*F); fi
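# Usage example (version name illustrative): `make store/ott-3.1` runs
# `bin/store-archive ott-3.1`, but only if r/ott-3.1/source/.made exists.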
store-all: $(STORES)
# Not quite right... need to handle draft releases
ott-release: ott store-all
# --- Source: SILVA
# Significant tabs !!!
# Silva 115: 206M uncompresses to 817M
# issue #62 - verify (is it a tsv file or csv file?)
# Create the taxonomy import files from the no_sequences digest & accessions
r/silva-HEAD/resource/.made: import_scripts/silva/process_silva.py \
r/silva-HEAD/source/.made \
r/silva-HEAD/work/cluster_names.tsv
@mkdir -p r/silva-HEAD/resource
python import_scripts/silva/process_silva.py \
r/silva-HEAD/source/silva_no_sequences.fasta \
r/silva-HEAD/work/cluster_names.tsv \
r/silva-HEAD/source/origin_info.json \
r/silva-HEAD/resource
touch $@
# Get taxon names from Genbank accessions file.
# The accessions file has genbank id, ncbi id, strain, taxon name.
# -- it seems the accessions file now has taxon names, so we
# probably don't need to take NCBI taxonomy as an input.
r/silva-HEAD/work/cluster_names.tsv: r/ncbi-HEAD/resource/.made \
r/genbank-HEAD/resource/.made
@mkdir -p `dirname $@`
python import_scripts/silva/get_taxon_names.py \
r/ncbi-HEAD/resource/taxonomy.tsv \
r/genbank-HEAD/resource/accessions.tsv \
$@.new
mv -f $@.new $@
# Refresh from web.
# Digestify fasta file and create sources for archive
refresh/silva: r/silva-NEW/source/.made
bin/christen silva-NEW
r/silva-NEW/source/.made: r/silva-NEW/source/silva_no_sequences.fasta
python util/origin_info.py \
`cat r/silva-NEW/work/date` \
`cat r/silva-NEW/work/origin_url` \
>r/silva-NEW/source/origin_info.json
ls -l r/silva-NEW/source
touch $@
r/silva-NEW/source/silva_no_sequences.fasta: r/silva-NEW/work/download.tgz
gunzip -c r/silva-NEW/work/download.tgz | \
grep ">.*;" > r/silva-NEW/source/silva_no_sequences.fasta
# Get the fasta file (big; for release 128, it's 150M)
r/silva-NEW/work/download.tgz: r/silva-NEW
@mkdir -p r/silva-NEW/work
r=`bin/get silva-NEW version`; \
bin/put silva-NEW origin_url \
"$(SILVA_ARCHIVE)/release_$$r/Exports/SILVA_$$r_SSURef_Nr99_tax_silva.fasta.gz"
wget -q --output-file=r/silva-NEW/work/download.tgz.new \
`bin/get silva-NEW origin_url` && \
mv -f r/silva-NEW/work/download.tgz.new r/silva-NEW/work/download.tgz
# Figure out which release we want, before downloading, by scanning the index.html file
r/silva-NEW:
bin/new-version silva .tgz public
bin/put silva-NEW ott_idspace "silva"
wget -q -O - $(SILVA_ARCHIVE)/ | \
python import_scripts/silva/get_silva_release_info.py | \
(read r d; bin/put silva-NEW version $$r && \
bin/put silva-NEW date $$d)
touch $@
# --- Source: Digest of genbank sequence records
# -rw-r--r--+ 1 jar staff 17773988 Dec 16 19:38 feed/silva/work/accessions.tsv
# -rw-r--r--+ 1 jar staff 6728426 Dec 2 22:42 feed/silva/accessionid_to_taxonid.tsv
# -rw-r--r--+ 1 jar staff 6573 Jun 28 2016 feed/genbank/accessionFromGenbank.py
# -rw-r--r--+ 1 jar staff 1098 Oct 7 2015 feed/genbank/makeaccessionid2taxonid.py
#
# accessionFromGenbank.py reads genbank and writes Genbank.pickle
# makeaccessionid2taxonid.py reads Genbank.pickle writes accessionid_to_taxonid.tsv
#
# feed/silva/accessionid_to_taxonid.tsv:
# As of 2017-04-25, this file was in the repository. 6.7M
# It's an input to the SILVA build.
# It really ought to be versioned and archived.
# Originally made: 30 Nov 2013
#
# accessions.tsv is not in the repo; it's more recent and includes taxon name
# and a column for strain (though none of the rows have strain info).
# Created before 6 July 2016; exact date unknown.
# This takes a long time - reads every flat file in genbank
# Also - be sure to update RANGES in accessionFromGenbank.py, so you
# don't miss any genbank records! Manual process now, could be
# automated.
r/genbank-HEAD/resource/.made: r/genbank-HEAD/source/.made
(cd r/genbank-HEAD && rm -f resource && ln -s source resource)
r/genbank-NEW/source/.made: r/genbank-NEW/work/Genbank.pickle \
import_scripts/genbank/makeaccessionid2taxonid.py
@echo Making accessions.tsv
@mkdir -p r/genbank-NEW/source
python import_scripts/genbank/makeaccessionid2taxonid.py \
r/genbank-NEW/work/Genbank.pickle \
r/genbank-NEW/source/accessions.tsv
touch $@
# Alert: sometimes we might want silva-NEW instead of silva-HEAD
r/genbank-NEW/work/Genbank.pickle: import_scripts/genbank/accessionFromGenbank.py \
r/silva-HEAD/source/silva_no_sequences.fasta \
r/genbank-NEW
mkdir -p r/genbank-NEW/work
@echo "*** Reading all of Genbank - this can take a while!"
python import_scripts/genbank/accessionFromGenbank.py \
r/silva-HEAD/source/silva_no_sequences.fasta \
r/genbank-NEW/work/Genbank.pickle
d=`python util/modification_date.py r/genbank-NEW/work/Genbank.pickle`; \
bin/put genbank-NEW date $$d && \
bin/put genbank-NEW version $$d
r/genbank-NEW:
bin/new-version genbank .tgz pd
# --- Source: Index Fungorum in Open Tree form
refresh/fung: r/fung-NEW/source/.made
bin/christen fung-NEW
r/fung-HEAD/resource/.made: r/fung-HEAD/source/.made
(cd r/fung-HEAD; rm resource; ln -sf source resource)
r/fung-NEW/source/.made: r/fung-1/resource/.made r/fung-3/resource/.made r/fung-4/resource/.made
@mkdir r/fung-NEW/source
python import_scripts/fung/cobble_fung.py
rm -rf r/fung-NEW/source
mv hackedfung r/fung-NEW/source
cp -p r/fung-4/resource/synonyms.tsv r/fung-NEW/source/
touch $@
r/fung-NEW:
bin/new-version fung .tgz public
bin/put fung-NEW ott_idspace if
(cd r/fung-NEW; rm resource; ln -sf source resource)
# How fung-9 was created:
# import_scripts/fung/cobble_fung.py
# which reads fung-4, fung-2, fung-1
#
# The earlier versions were created using various versions of the
# feed/if.py (process_fungorum.py) script, operating on various files
# from Paul Kirk as input.
# --- Source: WoRMS in Open Tree form
# Make resource (i.e. taxonomy) from source (digest of WoRMS API traversal)
r/worms-HEAD/resource/.made: import_scripts/worms/process_worms.py r/worms-HEAD/source/.made
python import_scripts/worms/process_worms.py r/worms-HEAD/source/digest r/worms-HEAD/resource
touch $@
refresh/worms: r/worms-NEW/source/.made
bin/christen worms-NEW
r/worms-NEW/source/.made: import_scripts/worms/fetch_worms.py r/worms-NEW
mkdir -p r/worms-NEW/work r/worms-NEW/source/digest
time python import_scripts/worms/fetch_worms.py --queue r/worms-NEW/work/q.q \
--out r/worms-NEW/source/digest --chunks 5000 --chunksize 500
d=`gdate +"%Y%m%d"`; bin/put worms-NEW date $$d && bin/put worms-NEW version $$d
touch $@
r/worms-NEW:
bin/new-version worms .tgz public
bin/put worms-NEW ott_idspace worms
# --- Source: NCBI Taxonomy
# Build resource from source.
# Formerly (version 1.0), where we now have /dev/null, we had
# ../data/ncbi/ncbi.taxonomy.homonym.ids.MANUAL_KEEP
r/ncbi-HEAD/resource/.made: r/ncbi-HEAD/source/.made import_scripts/ncbi/process_ncbi_taxonomy.py
@rm -rf r/ncbi-HEAD/resource.new
@mkdir -p r/ncbi-HEAD/resource.new
python import_scripts/ncbi/process_ncbi_taxonomy.py F r/ncbi-HEAD/source \
/dev/null r/ncbi-HEAD/resource.new $(NCBI_ORIGIN_URL)
rm -rf `dirname $@`
mv -f r/ncbi-HEAD/resource.new `dirname $@`
touch $@
# Refresh source from web. Source depends on archive.
# Override default pattern rule (archive instead of source).
refresh/ncbi: r/ncbi-NEW/source/.made
bin/christen ncbi-NEW
r/ncbi-NEW/source/.made: r/ncbi-NEW/archive/.made
bin/unpack-archive ncbi-NEW
d=`python util/modification_date.py r/ncbi-NEW/source/names.dmp`; \
bin/put ncbi-NEW date $$d && \
bin/put ncbi-NEW version $$d
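# Note on the byte-count idiom below: `wc -c FILE` prints "SIZE FILE";
# piping through (read c d && echo $c) keeps just SIZE for bin/put.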
r/ncbi-NEW/archive/.made: r/ncbi-NEW
@mkdir -p r/ncbi-NEW/archive
bin/put ncbi-NEW origin_url $(NCBI_ORIGIN_URL)
wget -q --output-document=r/ncbi-NEW/archive/archive.tgz $(NCBI_ORIGIN_URL)
bin/put ncbi-NEW bytes `wc -c r/ncbi-NEW/archive/archive.tgz | (read c d && echo $$c)`
touch $@
r/ncbi-NEW:
bin/new-version ncbi .tgz pd
bin/put ncbi-NEW ott_idspace ncbi
# --- Source: GBIF
# Formerly, where it says /dev/null, we had ../data/gbif/ignore.txt
r/gbif-HEAD/resource/.made: r/gbif-HEAD/work/projection.tsv \
import_scripts/gbif/process_gbif_taxonomy.py
@mkdir -p `dirname $@`
@mkdir -p r/gbif-HEAD/resource.new
python import_scripts/gbif/process_gbif_taxonomy.py \
r/gbif-HEAD/work/projection.tsv \
r/gbif-HEAD/resource.new
rm -rf r/gbif-HEAD/resource
mv r/gbif-HEAD/resource.new r/gbif-HEAD/resource
touch $@
# The python script wants `pip install python-dwca-reader`
r/gbif-HEAD/work/projection.tsv: r/gbif-HEAD/archive/.made \
import_scripts/gbif/project.py
@mkdir -p `dirname $@`
python import_scripts/gbif/project.py r/gbif-HEAD/archive/archive.zip $@.new
mv $@.new $@
# Get a new GBIF from the web and store to r/gbif-HEAD/archive/archive.zip
# Was http://ecat-dev.gbif.org/repository/export/checklist1.zip
# Could be http://rs.gbif.org/datasets/backbone/backbone.zip
# 2016-05-17 purl.org is broken, cannot update this link
# GBIF_URL=http://purl.org/opentree/gbif-backbone-2013-07-02.zip
# should be very similar to IRMNG
refresh/gbif: r/gbif-NEW/source/.made
bin/christen gbif-NEW
r/gbif-NEW/source/.made: r/gbif-NEW/archive/.made
bin/unpack-archive gbif-NEW
d=`python util/modification_date.py r/gbif-NEW/source/eml.xml`; \
bin/put gbif-NEW date $$d && \
bin/put gbif-NEW version $$d
r/gbif-NEW/archive/.made: r/gbif-NEW
@mkdir -p r/gbif-NEW/archive
bin/put gbif-NEW origin_url $(GBIF_ORIGIN_URL)
wget -q --output-document=r/gbif-NEW/archive/archive.zip "$(GBIF_ORIGIN_URL)"
bin/put gbif-NEW bytes `wc -c r/gbif-NEW/archive/archive.zip | \
(read c d && echo $$c)`
touch $@
r/gbif-NEW:
bin/new-version gbif .zip public
bin/put gbif-NEW ott_idspace gbif
# --- Source: IRMNG
r/irmng-HEAD/resource/.made: r/irmng-HEAD/source/.made \
import_scripts/irmng/process_irmng.py
@mkdir -p `dirname $@`.new
python import_scripts/irmng/process_irmng.py \
r/irmng-HEAD/source/IRMNG_DWC_20*.csv \
r/irmng-HEAD/source/IRMNG_DWC_SP_PROFILE_20*.csv \
r/irmng-HEAD/resource.new/taxonomy.tsv \
r/irmng-HEAD/resource.new/synonyms.tsv
rm -rf r/irmng-HEAD/resource
mv r/irmng-HEAD/resource.new r/irmng-HEAD/resource
touch $@
# Build IRMNG from Tony's .csv files
# should be mostly the same as for GBIF
# Refresh makes archive instead of source
refresh/irmng: r/irmng-NEW/source/.made
bin/christen irmng-NEW
r/irmng-NEW/source/.made: r/irmng-NEW/archive/.made
bin/unpack-archive irmng-NEW
d=`python util/modification_date.py r/irmng-NEW/source/IRMNG_DWC.csv`; \
bin/put irmng-NEW date $$d && \
bin/put irmng-NEW version $$d
touch $@
r/irmng-NEW/archive/.made: r/irmng-NEW
@mkdir -p r/irmng-NEW/archive
bin/put irmng-NEW origin_url $(IRMNG_ORIGIN_URL)
wget -q --output-document=r/irmng-NEW/archive/archive.zip "$(IRMNG_ORIGIN_URL)"
bin/put irmng-NEW bytes `wc -c r/irmng-NEW/archive/archive.zip | (read c d && echo $$c)`
touch $@
r/irmng-NEW:
bin/new-version irmng .zip public
bin/put irmng-NEW ott_idspace irmng
# --- Source: EOL page ids
r/eol-HEAD/resource/.made: r/eol-HEAD/resource/eol-mappings.csv
touch $@
r/eol-HEAD/resource/eol-mappings.csv: r/eol-HEAD/source/.made \
import_scripts/eol/process_eol.py
mkdir -p r/eol-HEAD/resource
python import_scripts/eol/process_eol.py \
< r/eol-HEAD/source/digest.csv \
> $@.new
mv $@.new $@
# From Yan Wong:
# ((('ncbi', 1172), ('if', 596), ('worms', 123), ('irmng', 1347), ('gbif', 800)))
# We haven't figured out how to automate the download yet. The site uses
# some screwy javascript that seems to require manual operation.
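# (These are the provider ids the digest rule below greps for:
# 1172 = ncbi, 596 = if, 123 = worms, 1347 = irmng, 800 = gbif.)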
refresh/eol: r/eol-NEW/source/.made
bin/christen eol-NEW
r/eol-NEW/source/.made: r/eol-NEW/source/digest.csv
touch $@
r/eol-NEW/source/digest.csv: r/eol-NEW
mkdir -p r/eol-NEW/work
@([ -e r/eol-NEW/work/identifiers.csv.gz ] || \
(echo "** Please download the EOL identifiers file"; \
echo "** and place it at r/eol-NEW/work/identifiers.csv.gz."; \
echo "** Visit this page: http://opendata.eol.org/dataset/identifiers"; \
exit 1))
mkdir -p r/eol-NEW/source
gunzip -c r/eol-NEW/work/identifiers.csv.gz | grep ',596,\|,1172,\|,123,\|,1347,\|,800,' \
> r/eol-NEW/source/digest.csv.new
d=`python util/modification_date.py r/eol-NEW/work/identifiers.csv.gz`; \
bin/put eol-NEW date $$d && bin/put eol-NEW version $$d
mv r/eol-NEW/source/digest.csv.new r/eol-NEW/source/digest.csv
r/eol-NEW:
bin/new-version eol .tgz public
# --- Source: Open Tree curated amendments
# Note, no dependence on /source/.
# A dependence on /source/ would attempt archive retrieval, which would fail.
r/amendments-HEAD/resource/.made: r/amendments-HEAD/source/.made
(cd r/amendments-HEAD && rm -f resource && ln -s source resource)
# Recursive make is not generally recommended, but we don't see what
# else to do in this case. We don't want HEAD to depend (in the Makefile
# sense) on PREVIOUS, since otherwise a demand for HEAD would force
# a retrieval of PREVIOUS, which is not always what we want.
r/amendments-HEAD/.issue:
$(MAKE) r/amendments-PREVIOUS/.is-previous
bin/set-head amendments amendments-PREVIOUS easy
touch $@
# New version: fetch from github
refresh/amendments: r/amendments-NEW/source/.made
bin/christen amendments-NEW
r/amendments-NEW/source/.made: tmp/amendments/amendments-1 r/amendments-NEW
@mkdir -p r/amendments-NEW/source/amendments-1
(cd tmp/amendments/amendments-1 && \
git checkout master && \
git pull)
bin/put amendments-NEW version `(cd tmp/amendments/amendments-1; git log -n 1) | \
head -1 | sed -e 's/commit //'`
cp -pr tmp/amendments/amendments-1/amendments \
r/amendments-NEW/source/amendments-1/
touch $@
# Local clone
tmp/amendments/amendments-1:
@mkdir -p tmp/amendments
(cd tmp/amendments; git clone $(AMENDMENTS_ORIGIN_URL))
r/amendments-NEW:
bin/new-version amendments .tgz cc0
# --- Source: OTT id list compiled from all previous OTT versions
r/idlist-HEAD/resource/.made: r/idlist-HEAD/source/.made
(cd r/idlist-HEAD && rm -f resource && ln -s source resource)
# When we build OTT 3.1, we need idlist-3.0, but we have
# idlist-PREVIOUS which is idlist-2.10, which has ids through OTT 2.9.
# So, to make the id list needed to build OTT 3.1, we extend
# idlist-2.10 with new identifiers added in OTT 3.0 (ott-HEAD), obtaining
# idlist-3.0, which is then used as input in build OTT 3.1.
# Whew!
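# Concretely (version numbers as in the example above): idlist-2.10 plus
# the ids newly minted in ott-3.0 yields idlist-3.0, which then feeds the
# OTT 3.1 build.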
refresh/idlist: r/idlist-NEW/source/.made
bin/christen idlist-NEW
r/idlist-NEW/source/.made: r/idlist-PREVIOUS/source/.made \
r/ott-HEAD/source/.made \
r/idlist-NEW
@rm -rf r/idlist-NEW/source
@mkdir -p r/idlist-NEW/source
cp -pr r/idlist-PREVIOUS/source/regs r/idlist-NEW/source/
bin/put idlist-NEW version `bin/get ott-HEAD version`
python import_scripts/idlist/extend_idlist.py \
r/idlist-PREVIOUS/source \
r/ott-HEAD/source \
`bin/get ott-HEAD name` \
r/ott-HEAD/properties.json \
r/idlist-NEW/source
touch $@
r/idlist-NEW:
bin/new-version idlist .tgz cc0
# ----- TBD: wikidata.py (from which we get EOL ids)
# Get wikidata dump from https://dumps.wikimedia.org/wikidatawiki/entities/wikidata-DDDDDDDD-all.json.gz
# e.g. DDDDDDDD = 20170424
# The .bz2 would be better (only 66% the size).
r/wikidata-NEW/source/.made: import_scripts/wikidata/get_wikidata.py
@echo wikidata/EOL NYI
# Make a digest that we can archive.
# source qid, wikidata id, EOL id
# To make resource, look up OTT source ids ... or maybe just use the
# digest directly
# ----- Phylesystem OTU list
# Used only for reporting! We want to avoid having OTT depend on
# phylesystem.
# This rule typically won't run, since the target is checked in.
# To refresh, remove the target, then make it
ids_that_are_otus.tsv:
time python util/ids_that_are_otus.py $@.new
mv $@.new $@
wc $@
SSH_PATH_PREFIX?=files.opentreeoflife.org:files.opentreeoflife.org
# Synthetic tree OTT id list. (This rule assumes you're part of the
# Open Tree project and can use scp; it could be modified to use
# curl. This doesn't matter much, since the list is checked into the
# repo.)
# To refresh, remove the target, then make it
ids_in_synthesis.tsv:
rm -rf synth-nexson-tmp
mkdir -p synth-nexson-tmp
scp -p "$(SSH_PATH_PREFIX)/synthesis/current/output/phylo_snapshot/*@*.json" synth-nexson-tmp/
time bin/jython util/ids_in_synthesis.py --dir synth-nexson-tmp --outfile $@.new
mv $@.new $@
# ----- Products
# For publishing OTT drafts or releases.
# File names beginning with # are emacs lock links.
# Maybe this rule isn't needed now that we have pack/ and store/ targets?
tarball: r/ott-HEAD/archive/.made
# Then, something like
# scp -p -i ~/.ssh/opentree/opentree.pem tarballs/ott2.9draft3.tgz \
# [email protected]:files.opentreeoflife.org/ott/ott2.9/
# This file is big
r/ott-NEW/work/differences.tsv: r/ott-HEAD/resource/.made r/ott-NEW/source/debug/transcript.out
$(SMASH) --diff r/ott-HEAD/resource/ r/ott-NEW/source/ $@.new
mv $@.new $@
wc $@
# OTUs only
r/ott-NEW/source/debug/otu_differences.tsv: r/ott-NEW/work/differences.tsv
$(SMASH) --join ids_that_are_otus.tsv r/ott-NEW/work/differences.tsv >$@.new
mv $@.new $@
wc $@
tags: $(JAVASOURCES)
etags *.py util/*.py $(JAVASOURCES)
# ----- Libraries
lib/jython-standalone-2.7.0.jar:
wget -q -O "$@" --no-check-certificate \
"http://search.maven.org/remotecontent?filepath=org/python/jython-standalone/2.7.0/jython-standalone-2.7.0.jar"
@ls -l $@
lib/json-simple-1.1.1.jar:
wget -q --output-document=$@ --no-check-certificate \
"http://search.maven.org/remotecontent?filepath=com/googlecode/json-simple/json-simple/1.1.1/json-simple-1.1.1.jar"
@ls -l $@
lib/junit-4.12.jar:
wget -q --output-document=$@ --no-check-certificate \
"http://search.maven.org/remotecontent?filepath=junit/junit/4.12/junit-4.12.jar"
@ls -l $@
# ----- Taxon inclusion tests
inclusions.csv:
wget -q --output-document=$@ --no-check-certificate \
"https://raw.githubusercontent.com/OpenTreeOfLife/germinator/master/taxa/inclusions.csv"
# ----- Testing
# Trivial, not very useful any more
test-smasher: compile
$(JAVA) org.opentreeoflife.smasher.Test
# internal tests
test2: $(CLASS)
$(SMASH) --test
check:
bash run-tests.sh
# These are run by the OTT build script so this is usually redundant
inclusion-tests: inclusions.csv bin/jython
bin/jython util/check_inclusions.py inclusions.csv r/ott-NEW/source/
# -----------------------------------------------------------------------------
# Taxonomy-subset test system ('make test')
SELECTION?=Asterales
SELECTION_TAG?=aster
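# Usage example (any taxon with a modest subtree should work):
#   make test SELECTION=Campanulaceae SELECTION_TAG=camp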
# t/tax/prev/taxonomy.tsv: r/ott-HEAD/resource/taxonomy.tsv - correct but expensive
t/tax/prev_$(SELECTION_TAG)/taxonomy.tsv:
@mkdir -p `dirname $@`
$(SMASH) r/ott-HEAD/resource/ --select2 $(SELECTION) --out t/tax/prev_$(SELECTION_TAG)/
# dependency on r/ncbi-HEAD/resource/taxonomy.tsv - correct but expensive
t/tax/ncbi_$(SELECTION_TAG)/taxonomy.tsv:
@mkdir -p `dirname $@`
$(SMASH) r/ncbi-HEAD/resource/ --select2 $(SELECTION) --out t/tax/ncbi_$(SELECTION_TAG)/
# dependency on GBIF taxonomy.tsv - correct but expensive
t/tax/gbif_$(SELECTION_TAG)/taxonomy.tsv:
@mkdir -p `dirname $@`
$(SMASH) r/gbif-HEAD/resource/ --select2 $(SELECTION) --out t/tax/gbif_$(SELECTION_TAG)/
# Previously:
#t/tax/aster/taxonomy.tsv: $(CLASS) \
# t/tax/ncbi_aster/taxonomy.tsv \
# t/tax/gbif_aster/taxonomy.tsv \
# t/tax/prev_aster/taxonomy.tsv \
# t/edits/edits.tsv
# @mkdir -p `dirname $@`
# $(SMASH) t/tax/ncbi_aster/ t/tax/gbif_aster/ \
# --edits t/edits/ \
# --ids t/tax/prev_aster/ \
# --out t/tax/aster/
# New:
t/tax/$(SELECTION_TAG)/taxonomy.tsv: compile t/aster.py \
t/tax/ncbi_$(SELECTION_TAG)/taxonomy.tsv \
t/tax/gbif_$(SELECTION_TAG)/taxonomy.tsv \
t/tax/prev_$(SELECTION_TAG)/taxonomy.tsv \
t/edits/edits.tsv \
bin/jython
@mkdir -p `dirname $@`
bin/jython t/aster.py
t/tax/$(SELECTION_TAG)/README.html: t/tax/$(SELECTION_TAG)/about.json util/make_readme.py
python util/make_readme.py t/tax/$(SELECTION_TAG)/ >$@
test: t/tax/$(SELECTION_TAG)/taxonomy.tsv t/tax/$(SELECTION_TAG)/README.html
$(SELECTION_TAG)-tarball: t/tax/$(SELECTION_TAG)/taxonomy.tsv
(mkdir -p $(TARDIR) && \
tar czvf $(TARDIR)/$(SELECTION_TAG).tgz.tmp -C t/tax $(SELECTION_TAG) && \
mv $(TARDIR)/$(SELECTION_TAG).tgz.tmp $(TARDIR)/$(SELECTION_TAG).tgz )
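# Usage example (path illustrative):
#   make aster-tarball TARDIR=/tmp/tarballs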
# ----- Smasher
# Shorthand target
compile: $(CLASS)
# Compile the Java classes
$(CLASS): $(JAVASOURCES) \
lib/jython-standalone-2.7.0.jar \
lib/json-simple-1.1.1.jar \
lib/junit-4.12.jar
javac -g $(CP) $(JAVASOURCES)
# Script to start up jython (with OTT classes preloaded)
bin/jython:
mkdir -p bin
(echo "#!/bin/bash"; \
echo "export JYTHONPATH=.:$$PWD:$$PWD/util:$$PWD/lib/json-simple-1.1.1.jar"; \
echo '[ "x$$JAVAFLAGS" = x ] && JAVAFLAGS="$(JAVAFLAGS)"'; \
echo exec java '$$JAVAFLAGS' -jar $$PWD/lib/jython-standalone-2.7.0.jar '$$*') >$@
chmod +x $@
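# For reference, the generated bin/jython looks roughly like this (the
# absolute paths are baked in from $PWD at generation time):
#   #!/bin/bash
#   export JYTHONPATH=.:/path/to/repo:/path/to/repo/util:/path/to/repo/lib/json-simple-1.1.1.jar
#   [ "x$JAVAFLAGS" = x ] && JAVAFLAGS="-Xmx14G"
#   exec java $JAVAFLAGS -jar /path/to/repo/lib/jython-standalone-2.7.0.jar $*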
# Script to start up the background daemon
bin/smasher:
mkdir -p bin
(echo "#!/bin/bash"; \
echo "cd $$PWD/service"; \
echo ./service '$$*') >$@
chmod +x $@
# ----- Clean
# The 'clean' target deletes everything except files fetched from the Internet.
# To really delete everything, use the 'distclean' target.
clean:
rm -f `find . -name "*.class"`
rm -rf bin/jython
rm -rf r/*/resource r/*/source r/*/work
rm -rf config.mk
rm -rf r/*-NEW
rm -f r/*-HEAD r/*-PREVIOUS
rm -rf tmp *.tmp properties
rm -rf t/amendments t/tax/$(SELECTION_TAG)
distclean: clean
rm -f lib/*
rm -rf r tmp raw