Restructuring the repository to make it installable

yassineS · Dec 16, 2014 · 8d7d085 · 8d7d085
1 parent f5a457e
commit 8d7d085
Show file tree

Hide file tree

Showing 46 changed files with 2,281 additions and 0 deletions.
diff --git a/README.md b/README.md
@@ -0,0 +1,2 @@
+GenomeKey: for COSMOS 2.0
+==========
diff --git a/bin/genomekey b/bin/genomekey
@@ -0,0 +1,66 @@
+#!/usr/bin/env python
+
+import ipdb
+import argparse
+
+from cosmos import Execution, Cosmos, Recipe, Input, ExecutionStatus, signal_execution_status_change
+
+from genomekey.settings            import settings as genomekey_settings # will override cosmos.settings
+from genomekey.workflows.pipeline  import pipeline
+
+###############################
+# bam
+###############################
+
+def bam(workflow,input_bam,input_bam_list,test_bam,high_coverage,chromosome_only_split,**kwargs):
+    """
+    Input file is a bam with properly annotated readgroups.
+    """
+    # NOTE: the docstring above doubles as the `bam` sub-command description
+    # (main() hands bam.__doc__ to argparse), so keep it short and user-facing.
+    #
+    # workflow              -- only referenced by the disabled DAG code below
+    # input_bam             -- open file for a single BAM (only .name is used), or None
+    # input_bam_list        -- open file listing one BAM path per line, or None
+    # test_bam              -- only referenced by the disabled DAG code below
+    # high_coverage         -- when true, override the GATK realignment settings
+    # chromosome_only_split -- only referenced by the disabled DAG code below
+    # kwargs                -- any remaining parsed options; ignored here
+
+    # Drop blank lines: ''.split('\n') yields [''], which used to let an empty
+    # list file slip past the "at least 1 BAM" check below.
+    input_bams = []
+    if input_bam_list:
+        input_bams = [line.strip() for line in input_bam_list.read().splitlines() if line.strip()]
+    if input_bam:
+        input_bams.append(input_bam.name)
+
+    if not input_bams:
+        # Call syntax works on Python 2 and 3 (`raise E, msg` is Python 2 only).
+        # NOTE(review): WorkflowException is not imported in this script --
+        # confirm where cosmos defines it and import it, otherwise this line
+        # raises NameError instead.
+        raise WorkflowException('At least 1 BAM input required')
+
+    # if we have a high coverage genome, override GATK options for realignment
+    if high_coverage:
+        genomekey_settings['gatk_realigntarget'] = '--mismatchFraction 0.30 --maxIntervalSize 650'
+        genomekey_settings['gatk_indelrealign'] = '--maxReadsInMemory 300000 --maxReadsForRealignment 500000 --maxReadsForConsensuses 500 --maxConsensuses 100'
+
+    # NOTE(review): the DAG construction below is disabled, so input_bams is
+    # currently computed and validated but nothing is run with it.
+#    dag = DAG(ignore_stage_name_collisions=True)
+
+#    dag.sequence_(
+#        pipeline(input_bams, test_bam=test_bam, chromosome_only_split=chromosome_only_split),
+#        configure(genomekey_settings),
+#        add_run(workflow)
+#    )
+
+###############################
+# CLI Configuration
+###############################
+
+def main():
+    """
+    Build the command line parser and run the selected sub-command.
+    """
+    # NOTE(review): `session` and `cli` are not imported by this script (only
+    # Execution, Cosmos, Recipe, ... are pulled from cosmos above) -- confirm
+    # their COSMOS 2.0 equivalents, otherwise main() stops with a NameError.
+
+    ## Override default drmaa_specification
+    session.drmaa_spec = genomekey_settings['drmaa_spec']
+
+    parser = argparse.ArgumentParser(description='WGA')
+    subparsers = parser.add_subparsers(title="Commands", metavar="<command>")
+
+    # bam.__doc__ is shown as the description of the `bam` sub-command.
+    bam_sp = subparsers.add_parser('bam',help="Input is a bam or bam file list",description=bam.__doc__)
+    cli.add_workflow_args(bam_sp)
+    # argparse.FileType('r') opens the path for reading like the Python 2 only
+    # `file` builtin did, but is also available on Python 3 and turns an
+    # unreadable path into a regular argparse usage error.
+    bam_sp.add_argument('-i', '--input_bam',     type=argparse.FileType('r'),help='A path to a BAM file')
+    bam_sp.add_argument('-il','--input_bam_list',type=argparse.FileType('r'),help='A path to a file containing a list of paths to BAMs, separated by newlines')
+    bam_sp.add_argument('-t','--test_bam',action='store_true',help='Only do stages on chr1, skips VQSR, strictly for testing only')
+    bam_sp.add_argument('-hc','--high_coverage',action='store_true',help='Special GATK options to handle high-coverage genomes')
+    bam_sp.add_argument('-c','--chromosome_only_split',action='store_true',help='Split only on chromosomes not read groups')
+    # The handler travels with the parsed options under the key 'func'.
+    bam_sp.set_defaults(func=bam)
+
+    # kwargs still contains 'func' itself; bam() absorbs it through **kwargs.
+    wf,kwargs = cli.parse_args(parser)
+    kwargs['func'](wf,**kwargs)
+
+
+if __name__ == '__main__':
+    # Debugging aid: main() runs inside ipdb's launch_ipdb_on_exception()
+    # context manager.
+    debug_on_failure = ipdb.launch_ipdb_on_exception()
+    with debug_on_failure:
+        main()
diff --git a/bin/genomekey.py~ b/bin/genomekey.py~
@@ -0,0 +1,68 @@
+#!/usr/bin/env python
+
+import ipdb
+import argparse
+
+from cosmos                import session
+from cosmos.Workflow       import cli
+from cosmos.lib.ezflow.dag import DAG,configure,add_run
+
+from genomekey.settings            import settings as genomekey_settings # will override cosmos.settings
+from genomekey.workflows.pipeline  import pipeline
+
+###############################
+# bam
+###############################
+
+def bam(workflow,input_bam,input_bam_list,test_bam,high_coverage,chromosome_only_split,**kwargs):
+    """
+    Input file is a bam with properly annotated readgroups.
+    """
+    # NOTE: the docstring above doubles as the `bam` sub-command description
+    # (main() hands bam.__doc__ to argparse), so keep it short and user-facing.
+    #
+    # workflow              -- handed to add_run() as the last DAG step
+    # input_bam             -- open file for a single BAM (only .name is used), or None
+    # input_bam_list        -- open file listing one BAM path per line, or None
+    # test_bam              -- forwarded to pipeline()
+    # high_coverage         -- when true, override the GATK realignment settings
+    # chromosome_only_split -- forwarded to pipeline()
+    # kwargs                -- any remaining parsed options; ignored here
+
+    # Drop blank lines: ''.split('\n') yields [''], which used to let an empty
+    # list file slip past the "at least 1 BAM" check below.
+    input_bams = []
+    if input_bam_list:
+        input_bams = [line.strip() for line in input_bam_list.read().splitlines() if line.strip()]
+    if input_bam:
+        input_bams.append(input_bam.name)
+
+    if not input_bams:
+        # Call syntax works on Python 2 and 3 (`raise E, msg` is Python 2 only).
+        # NOTE(review): WorkflowException is not imported in this script --
+        # confirm where cosmos defines it and import it, otherwise this line
+        # raises NameError instead.
+        raise WorkflowException('At least 1 BAM input required')
+
+    # if we have a high coverage genome, override GATK options for realignment
+    if high_coverage:
+        genomekey_settings['gatk_realigntarget'] = '--mismatchFraction 0.30 --maxIntervalSize 650'
+        genomekey_settings['gatk_indelrealign'] = '--maxReadsInMemory 300000 --maxReadsForRealignment 500000 --maxReadsForConsensuses 500 --maxConsensuses 100'
+
+    dag = DAG(ignore_stage_name_collisions=True)
+
+    # The settings must be mutated before this point: configure() receives
+    # genomekey_settings after the high-coverage overrides have been applied.
+    dag.sequence_(
+        pipeline(input_bams, test_bam=test_bam, chromosome_only_split=chromosome_only_split),
+        configure(genomekey_settings),
+        add_run(workflow)
+    )
+
+###############################
+# CLI Configuration
+###############################
+
+def main():
+    """
+    Build the command line parser and run the selected sub-command.
+    """
+
+    ## Override default drmaa_specification
+    session.drmaa_spec = genomekey_settings['drmaa_spec']
+
+    parser = argparse.ArgumentParser(description='WGA')
+    subparsers = parser.add_subparsers(title="Commands", metavar="<command>")
+
+    # bam.__doc__ is shown as the description of the `bam` sub-command.
+    bam_sp = subparsers.add_parser('bam',help="Input is a bam or bam file list",description=bam.__doc__)
+    cli.add_workflow_args(bam_sp)
+    # argparse.FileType('r') opens the path for reading like the Python 2 only
+    # `file` builtin did, but is also available on Python 3 and turns an
+    # unreadable path into a regular argparse usage error.
+    bam_sp.add_argument('-i', '--input_bam',     type=argparse.FileType('r'),help='A path to a BAM file')
+    bam_sp.add_argument('-il','--input_bam_list',type=argparse.FileType('r'),help='A path to a file containing a list of paths to BAMs, separated by newlines')
+    bam_sp.add_argument('-t','--test_bam',action='store_true',help='Only do stages on chr1, skips VQSR, strictly for testing only')
+    bam_sp.add_argument('-hc','--high_coverage',action='store_true',help='Special GATK options to handle high-coverage genomes')
+    bam_sp.add_argument('-c','--chromosome_only_split',action='store_true',help='Split only on chromosomes not read groups')
+    # The handler travels with the parsed options under the key 'func'.
+    bam_sp.set_defaults(func=bam)
+
+    # kwargs still contains 'func' itself; bam() absorbs it through **kwargs.
+    wf,kwargs = cli.parse_args(parser)
+    kwargs['func'](wf,**kwargs)
+
+
+if __name__ == '__main__':
+    # Debugging aid: main() runs inside ipdb's launch_ipdb_on_exception()
+    # context manager.
+    debug_on_failure = ipdb.launch_ipdb_on_exception()
+    with debug_on_failure:
+        main()
diff --git a/genomekey/LICENSE b/genomekey/LICENSE
@@ -0,0 +1,22 @@
+The MIT License (MIT)
+
+Copyright (c) 2014 Harvard Medical School
+
+Permission is hereby granted, free of charge, to any person obtaining a copy
+of this software and associated documentation files (the "Software"), to deal
+in the Software without restriction, including without limitation the rights
+to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+copies of the Software, and to permit persons to whom the Software is
+furnished to do so, subject to the following conditions:
+
+The above copyright notice and this permission notice shall be included in all
+copies or substantial portions of the Software.
+
+THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+SOFTWARE.
+
diff --git a/genomekey/__init__.py b/genomekey/__init__.py
diff --git a/genomekey/settings.py b/genomekey/settings.py
@@ -0,0 +1,89 @@
+import os, sys
+
+# GenomeKey tool/reference locations and GATK option overrides.
+#
+# NOTE(review): this module used to wrap most of its set-up code in a string
+# literal, which left `opj`, `genomekey_path` and
+# `_get_drmaa_native_specification` undefined and made the `settings` dict
+# below fail with NameError on import. The code is re-enabled here; the two
+# values that came from the (still disabled) cosmos settings import are pinned
+# to match the Orchestra paths that were hard-coded as the fallback. Rewire
+# them once the COSMOS 2.0 configuration API is settled.
+#from cosmos.config import settings as cosmos_settings
+drm = 'LSF'         # was cosmos_settings['DRM']
+svr = 'orchestra'   # was cosmos_settings['server_name']
+
+
+def _get_drmaa_native_specification(jobAttempt):
+    """
+    Build the DRMAA native specification string for a job attempt.
+
+    Reads cpu_requirement, memory_requirement and time_requirement from
+    jobAttempt.task and the default queue from jobAttempt.task.workflow.
+    Raises Exception when the module-level `drm` is neither 'LSF' nor 'GE'.
+
+    NOTE(review): the attribute names follow the pre-2.0 cosmos task model --
+    confirm they still exist under COSMOS 2.0.
+    """
+    task = jobAttempt.task
+
+    cpu_req  = task.cpu_requirement
+    mem_req  = task.memory_requirement
+    time_req = task.time_requirement
+    queue    = task.workflow.default_queue
+
+    if drm == 'LSF':           # for Orchestra Runs
+        # The workflow default queue is always replaced on LSF.
+        if time_req <= 12*60: queue = 'rodeo_unlimited'
+        else:                 queue = 'long'
+
+        return '-R rusage[mem={0}] span[hosts=1] -n {1} -W 0:{2} -q {3}'.format(mem_req, cpu_req, time_req, queue)
+
+    elif drm == 'GE':
+        return '-l spock_mem={mem_req}M,num_proc={cpu_req}'.format(mem_req=mem_req, cpu_req=cpu_req)
+
+    else:
+        raise Exception('DRM not supported')
+
+
+# directory containing the script being run (sys.argv[0])
+genomekey_path = os.path.abspath(os.path.dirname(sys.argv[0]))
+
+if svr == 'orchestra':
+    ref_path   = '/groups/cbi/WGA/reference'
+    tools_path = '/groups/cbi/WGA/tools'
+
+elif svr == 'aws':
+    ref_path   = '/WGA/reference'
+    tools_path = '/WGA/tools'
+
+elif svr == 'gce':
+    ref_path   = '/pseq/WGA/ref'  # In shared disk
+    tools_path = '/tools/'        # In boot   disk
+else:
+    raise Exception('Unknown server_name {0} in Cosmos configuration: must be one of [orchestra, aws, gce]'.format(svr))
+
+
+opj = os.path.join
+
+settings = {
+    'date'                  : '$(date "+%T %D")',
+    'java'                  : opj(tools_path, 'java -d64 -XX:ParallelGCThreads=2 -XX:+UseParallelOldGC -XX:+AggressiveOpts'),
+    'scratch'               : '/tmp',
+
+    'bamUtil'               : opj(tools_path, 'bamUtil.v1.0.11'),
+    'bwa'                   : opj(tools_path, 'bwa.v0.7.7'),
+    'gatk'                  : opj(tools_path, 'gatk.v3.1.1.jar'),
+    'picard_dir'            : opj(tools_path, 'picard.v1.109'),
+    'samtools'              : opj(tools_path, 'samtools.v0.1.19'),
+
+    'annovarext'            : opj(tools_path, 'annovarext'),
+
+    'reference_fasta'       : opj(ref_path,   'human_g1k_v37.fasta'),
+    'dbsnp_vcf'             : opj(ref_path,   'dbsnp_137.b37.excluding_sites_after_129.vcf'),
+    'hapmap_vcf'            : opj(ref_path,   'hapmap_3.3.b37.vcf'),
+    'mills_vcf'             : opj(ref_path,   'Mills_and_1000G_gold_standard.indels.b37.vcf'),
+    '1ksnp_vcf'             : opj(ref_path,   '1000G_phase1.snps.high_confidence.b37.vcf'),
+    '1komni_vcf'            : opj(ref_path,   '1000G_omni2.5.b37.vcf'),
+    '1kindel_vcf'           : opj(ref_path,   '1000G_phase1.indels.b37.vcf'),
+    # keep in home directory for the moment, should ultimately be part of the AMI
+    #'empty_sam'             : opj(ref_path,   'empty.sam'),
+    'empty_sam'             : opj(genomekey_path, '../test/empty.sam'),
+
+    # Empty by default; the `bam` command fills these in for high-coverage runs.
+    'gatk_realigntarget'    : '',
+    'gatk_indelrealign'     : '',
+
+    'drmaa_spec'            : _get_drmaa_native_specification
+}
+
+# The cloud servers use /mnt as scratch space.
+if svr in ('aws', 'gce'):
+    settings['scratch'] = '/mnt'
+
diff --git a/genomekey/tools/__init__.py b/genomekey/tools/__init__.py