diff --git a/CHANGELOG.md b/CHANGELOG.md index 6478e3f..be4042d 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -6,6 +6,7 @@ - Singularity bind paths were not being set properly. (#119, @kelly-sovacool) - Update docker containers to set `$PYTHONPATH`. (#119, #125, @kelly-sovacool) - Otherwise, this environment variable can be carried over and cause package conflicts when singularity is not run with `-C`. + - Also use `python -E` to ensure the `$PYTHONPATH` is not carried over. (#129, @kelly-sovacool) - Fix `reconfig` to correctly replace variables in the config file. (#121, @kelly-sovacool) - Prevent using excessive memory when copying reference files. (#126, @kelly-sovacool) diff --git a/workflow/rules/findcircrna.smk b/workflow/rules/findcircrna.smk index 84712fe..e72dfb4 100644 --- a/workflow/rules/findcircrna.smk +++ b/workflow/rules/findcircrna.smk @@ -141,7 +141,7 @@ def get_per_sample_files_to_merge(wildcards): # 2. parse the back_spliced_junction BED from above along with known splicing annotations to CircExplorer2 'parse' to create # a. circularRNA_known.txt ... circRNAs around known gene exons # b. low_conf_circularRNA_known.txt .... circRNAs with low confidence -# 3. parse back_spliced_junction BED along with circularRNA_known.txt and low_conf_circularRNA_known.txt to custom python -E script +# 3. parse back_spliced_junction BED along with circularRNA_known.txt and low_conf_circularRNA_known.txt to custom python script # to create an aggregated list of BSJs with following columns: # | # | ColName | # |---|-------------| @@ -1283,7 +1283,7 @@ rule merge_per_sample: container: config['containers']['star_ucsc_cufflinks'] shell: """ -python3 {params.script} \\ +python3 -E {params.script} \\ --pyscript {params.pyscript} \\ --dcc {params.ndcc} \\ --mapsplice {params.nmapsplice} \\ diff --git a/workflow/rules/post_findcircrna_processing.smk b/workflow/rules/post_findcircrna_processing.smk index f44f6b6..18ef2b7 100644 --- a/workflow/rules/post_findcircrna_processing.smk +++ b/workflow/rules/post_findcircrna_processing.smk @@ -74,7 +74,7 @@ BSJbedbn=$(basename {output.BSJbed}) if [ "{params.peorse}" == "PE" ];then -python3 {params.scriptpe} \\ +python3 -E {params.scriptpe} \\ --inbam {input.chimericbam} \\ --sample_counts_table {input.countstable} \\ --plusbam {params.tmpdir}/{params.sample}.BSJ.plus.bam \\ @@ -91,7 +91,7 @@ python3 {params.scriptpe} \\ else -python3 {params.scriptse} \\ +python3 -E {params.scriptse} \\ --inbam {input.chimericbam} \\ --sample_counts_table {input.countstable} \\ --plusbam {params.tmpdir}/{params.sample}.BSJ.plus.bam \\ @@ -127,7 +127,7 @@ for i in $(echo {params.viruses}|tr ',' ' ');do bash {params.bam2bwscript} ${{outdir}}/{params.sample}.${{i}}.BSJ.bam {params.tmpdir} done -python3 {params.flankscript} --reffa {params.reffa} --inbsjbedgz {params.tmpdir}/${{BSJbedbn}} --outbsjbedgz {output.BSJbed} +python3 -E {params.flankscript} --reffa {params.reffa} --inbsjbedgz {params.tmpdir}/${{BSJbedbn}} --outbsjbedgz {output.BSJbed} rm -rf {params.tmpdir} """ @@ -283,12 +283,12 @@ rule create_circExplorer_merged_found_counts_table: """ set -exo pipefail mkdir -p {params.tmpdir} - python3 {params.pythonscript} \\ + python3 -E {params.pythonscript} \\ -b {input.bsj_found_counts} \\ -l {input.linear_spliced_counts} \\ -o {output.found_counts_table} - python3 {params.pythonscript2} \\ + python3 -E {params.pythonscript2} \\ --annotationcounts {input.annotation_counts} \\ --allfoundcounts {output.found_counts_table} \\ --countstable {output.count_counts_table} @@ -324,9 +324,9 @@ if RUN_MAPSPLICE: for bamfile in {input};do bamfile_bn=$(basename $bamfile) if [ "{params.peorse}" == "PE" ];then - echo "python3 {params.bash2nreads_pyscript} --inbam $bamfile --regions {params.regions} --pe > {params.tmpdir}/${{bamfile_bn}}.counts" + echo "python3 -E {params.bash2nreads_pyscript} --inbam $bamfile --regions {params.regions} --pe > {params.tmpdir}/${{bamfile_bn}}.counts" else - echo "python3 {params.bash2nreads_pyscript} --inbam $bamfile --regions {params.regions} > {params.tmpdir}/${{bamfile_bn}}.counts" + echo "python3 -E {params.bash2nreads_pyscript} --inbam $bamfile --regions {params.regions} > {params.tmpdir}/${{bamfile_bn}}.counts" fi done > {params.tmpdir}/do_bamstats parallel -j 2 < {params.tmpdir}/do_bamstats @@ -374,9 +374,9 @@ else: for bamfile in {input};do bamfile_bn=$(basename $bamfile) if [ "{params.peorse}" == "PE" ];then - echo "python3 {params.bash2nreads_pyscript} --inbam $bamfile --regions {params.regions} --pe > {params.tmpdir}/${{bamfile_bn}}.counts" + echo "python3 -E {params.bash2nreads_pyscript} --inbam $bamfile --regions {params.regions} --pe > {params.tmpdir}/${{bamfile_bn}}.counts" else - echo "python3 {params.bash2nreads_pyscript} --inbam $bamfile --regions {params.regions} > {params.tmpdir}/${{bamfile_bn}}.counts" + echo "python3 -E {params.bash2nreads_pyscript} --inbam $bamfile --regions {params.regions} > {params.tmpdir}/${{bamfile_bn}}.counts" fi done > {params.tmpdir}/do_bamstats parallel -j 2 < {params.tmpdir}/do_bamstats @@ -450,7 +450,7 @@ rule create_hq_bams: outdir=$(dirname {output.outbam}) if [ ! -d $outdir ];then mkdir -p $outdir;fi cd $outdir - python3 {params.script} \\ + python3 -E {params.script} \\ -i {input.inbam} \\ -t {input.countstable} \\ -o {output.outbam} \\ diff --git a/workflow/scripts/_create_circExplorer_linear_bam.v2.sh b/workflow/scripts/_create_circExplorer_linear_bam.v2.sh index cf44b03..f5f5dc4 100755 --- a/workflow/scripts/_create_circExplorer_linear_bam.v2.sh +++ b/workflow/scripts/_create_circExplorer_linear_bam.v2.sh @@ -42,7 +42,7 @@ start0=$(date +%s.%N) # _bedintersect_to_rid2jid.py # also requires these tools # bedtools - # python3 with pysam library + # python3 -E with pysam library # samtools # bedSort from ucsc tools # ucsc @@ -126,14 +126,14 @@ start=$(date +%s.%N) if [ "$peorse" == "PE" ];then -python3 ${SCRIPT_DIR}/filter_bam.py \ +python3 -E ${SCRIPT_DIR}/filter_bam.py \ --inbam $non_chimeric_star2p_bam \ --outbam $filtered_bam \ --pe else -python3 ${SCRIPT_DIR}/filter_bam.py \ +python3 -E ${SCRIPT_DIR}/filter_bam.py \ --inbam $non_chimeric_star2p_bam \ --outbam $filtered_bam \ @@ -183,7 +183,7 @@ bedSort ${tmpdir}/BSJ.ends.bed ${tmpdir}/BSJ.ends.bed printtime $SCRIPT_NAME $start0 $start "finding max readlength" start=$(date +%s.%N) -python3 ${SCRIPT_DIR}/bam_get_max_readlen.py -i $filtered_bam > ${tmpdir}/${sample_name}.maxrl +python3 -E ${SCRIPT_DIR}/bam_get_max_readlen.py -i $filtered_bam > ${tmpdir}/${sample_name}.maxrl maxrl=$(cat ${tmpdir}/${sample_name}.maxrl) two_maxrl=$((maxrl*2)) @@ -226,7 +226,7 @@ pigz -p4 -f ${rid2jidgzip%.*} && rm -f ${rid2jidgzip%.*}.tmp printtime $SCRIPT_NAME $start0 $start "filter linear and spliced readids to those near BSJs only; creating counts table" start=$(date +%s.%N) -python3 ${SCRIPT_DIR}/_filter_linear_spliced_readids_w_rid2jid.py \ +python3 -E ${SCRIPT_DIR}/_filter_linear_spliced_readids_w_rid2jid.py \ --linearin ${tmpdir}/${sample_name}.linear.readids.gz \ --splicedin ${tmpdir}/${sample_name}.spliced.readids.gz \ --rid2jid ${rid2jidgzip} \ @@ -254,10 +254,10 @@ splicedbam_bn=$(basename $splicedbam) linearbam_all_bn=$(basename $linearbam_all) splicedbam_all_bn=$(basename $splicedbam_all) -echo "python3 ${SCRIPT_DIR}/filter_bam_by_readids.py --inputBAM $filtered_bam --outputBAM ${tmpdir}/${linearbam_bn} --readids $linearrids" >> ${tmpdir}/para2 -echo "python3 ${SCRIPT_DIR}/filter_bam_by_readids.py --inputBAM $filtered_bam --outputBAM ${tmpdir}/${splicedbam_bn} --readids $splicedrids" >> ${tmpdir}/para2 -echo "python3 ${SCRIPT_DIR}/filter_bam_by_readids.py --inputBAM $filtered_bam --outputBAM ${tmpdir}/${linearbam_all_bn} --readids ${tmpdir}/${sample_name}.linear.readids.gz" >> ${tmpdir}/para2 -echo "python3 ${SCRIPT_DIR}/filter_bam_by_readids.py --inputBAM $filtered_bam --outputBAM ${tmpdir}/${splicedbam_all_bn} --readids ${tmpdir}/${sample_name}.spliced.readids.gz" >> ${tmpdir}/para2 +echo "python3 -E ${SCRIPT_DIR}/filter_bam_by_readids.py --inputBAM $filtered_bam --outputBAM ${tmpdir}/${linearbam_bn} --readids $linearrids" >> ${tmpdir}/para2 +echo "python3 -E ${SCRIPT_DIR}/filter_bam_by_readids.py --inputBAM $filtered_bam --outputBAM ${tmpdir}/${splicedbam_bn} --readids $splicedrids" >> ${tmpdir}/para2 +echo "python3 -E ${SCRIPT_DIR}/filter_bam_by_readids.py --inputBAM $filtered_bam --outputBAM ${tmpdir}/${linearbam_all_bn} --readids ${tmpdir}/${sample_name}.linear.readids.gz" >> ${tmpdir}/para2 +echo "python3 -E ${SCRIPT_DIR}/filter_bam_by_readids.py --inputBAM $filtered_bam --outputBAM ${tmpdir}/${splicedbam_all_bn} --readids ${tmpdir}/${sample_name}.spliced.readids.gz" >> ${tmpdir}/para2 parallel -j 4 < ${tmpdir}/para2 @@ -283,10 +283,10 @@ samtools sort -l 9 -T ${tmpdir}/sorttmp --write-index -@${threads} -O BAM -o ${s if [ -f ${tmpdir}/para3 ];then rm -f ${tmpdir}/para3;fi -echo "python3 ${SCRIPT_DIR}/bam_split_by_regions.py --inbam $linearbam --sample_name $sample_name --regions $regions --prefix linear_BSJ --outdir $outdir --host $host --additives $additives --viruses $viruses" >> ${tmpdir}/para3 -echo "python3 ${SCRIPT_DIR}/bam_split_by_regions.py --inbam $splicedbam --sample_name $sample_name --regions $regions --prefix spliced_BSJ --outdir $outdir --host $host --additives $additives --viruses $viruses" >> ${tmpdir}/para3 -echo "python3 ${SCRIPT_DIR}/bam_split_by_regions.py --inbam $linearbam_all --sample_name $sample_name --regions $regions --prefix linear --outdir $outdir --host $host --additives $additives --viruses $viruses" >> ${tmpdir}/para3 -echo "python3 ${SCRIPT_DIR}/bam_split_by_regions.py --inbam $splicedbam_all --sample_name $sample_name --regions $regions --prefix spliced --outdir $outdir --host $host --additives $additives --viruses $viruses" >> ${tmpdir}/para3 +echo "python3 -E ${SCRIPT_DIR}/bam_split_by_regions.py --inbam $linearbam --sample_name $sample_name --regions $regions --prefix linear_BSJ --outdir $outdir --host $host --additives $additives --viruses $viruses" >> ${tmpdir}/para3 +echo "python3 -E ${SCRIPT_DIR}/bam_split_by_regions.py --inbam $splicedbam --sample_name $sample_name --regions $regions --prefix spliced_BSJ --outdir $outdir --host $host --additives $additives --viruses $viruses" >> ${tmpdir}/para3 +echo "python3 -E ${SCRIPT_DIR}/bam_split_by_regions.py --inbam $linearbam_all --sample_name $sample_name --regions $regions --prefix linear --outdir $outdir --host $host --additives $additives --viruses $viruses" >> ${tmpdir}/para3 +echo "python3 -E ${SCRIPT_DIR}/bam_split_by_regions.py --inbam $splicedbam_all --sample_name $sample_name --regions $regions --prefix spliced --outdir $outdir --host $host --additives $additives --viruses $viruses" >> ${tmpdir}/para3 parallel -j 4 < ${tmpdir}/para3 diff --git a/workflow/scripts/_make_merge_per_sample_sh.py b/workflow/scripts/_make_merge_per_sample_sh.py index 1b1365f..5c248f5 100755 --- a/workflow/scripts/_make_merge_per_sample_sh.py +++ b/workflow/scripts/_make_merge_per_sample_sh.py @@ -1,78 +1,154 @@ import argparse from os.path import join -def _get_counts_file_path(sampledir,s,prog): - if prog=="circExplorer": - return join(sampledir,"circExplorer",s+".circExplorer.counts_table.tsv") - if prog=="circExplorerbwa": - return join(sampledir,"circExplorer_BWA",s+".circExplorer_bwa.annotation_counts.tsv") - elif prog=="ciri": - return join(sampledir,"ciri",s+".ciri.out.filtered") - elif prog=="dcc": - return join(sampledir,"DCC",s+".dcc.counts_table.tsv.filtered") - elif prog=="mapsplice": - return join(sampledir,"MapSplice",s+".mapsplice.counts_table.tsv.filtered") - elif prog=="nclscan": - return join(sampledir,"NCLscan",s+".nclscan.counts_table.tsv.filtered") - elif prog=="circrnafinder": - return join(sampledir,"circRNA_finder",s+".circRNA_finder.counts_table.tsv.filtered") - elif prog=="findcirc": - return join(sampledir,"find_circ",s+".find_circ.bed.filtered") +def _get_counts_file_path(sampledir, s, prog): + if prog == "circExplorer": + return join(sampledir, "circExplorer", s + ".circExplorer.counts_table.tsv") + if prog == "circExplorerbwa": + return join( + sampledir, "circExplorer_BWA", s + ".circExplorer_bwa.annotation_counts.tsv" + ) + elif prog == "ciri": + return join(sampledir, "ciri", s + ".ciri.out.filtered") + elif prog == "dcc": + return join(sampledir, "DCC", s + ".dcc.counts_table.tsv.filtered") + elif prog == "mapsplice": + return join(sampledir, "MapSplice", s + ".mapsplice.counts_table.tsv.filtered") + elif prog == "nclscan": + return join(sampledir, "NCLscan", s + ".nclscan.counts_table.tsv.filtered") + elif prog == "circrnafinder": + return join( + sampledir, "circRNA_finder", s + ".circRNA_finder.counts_table.tsv.filtered" + ) + elif prog == "findcirc": + return join(sampledir, "find_circ", s + ".find_circ.bed.filtered") -def main() : - parser = argparse.ArgumentParser(description='Merge per sample Counts from different circRNA detection tools') + +def main(): + parser = argparse.ArgumentParser( + description="Merge per sample Counts from different circRNA detection tools" + ) # INPUTS - parser.add_argument('--pyscript', dest='pyscript', type=str, required=True, - help='python script to be run') - parser.add_argument('--sampledir', dest='sampledir', type=str, required=True, - help='generally //') - parser.add_argument('--dcc', dest='dcc', type=int, required=False, default=0, - help='n_run_dcc') - parser.add_argument('--mapsplice', dest='mapsplice', type=int, required=False,default=0, - help='n_run_mapslice') - parser.add_argument('--findcirc', dest='findcirc', type=int, required=False,default=0, - help='n_run_findcirc') - parser.add_argument('--nclscan', dest='nclscan', type=int, required=False,default=0, - help='n_run_nclscan') - parser.add_argument('--circrnafinder', dest='circrnafinder', type=int, required=False, default=0, - help='n_run_circrnafinder') - parser.add_argument('--samplename', dest='samplename', type=str, required=True, - help='Sample Name') - parser.add_argument('--min_read_count_reqd', dest='minreads', type=int, required=False, default=2, - help='Read count threshold..circRNA with lower than this number of read support are excluded! (default=2)') - parser.add_argument('--hqcc', dest='hqcc', type=str, required=False, default="circExplorer,circExplorer_bwa", - help='Comma separated list of high confidence core callers (default="circExplorer,circExplorer_bwa")') - parser.add_argument('--hqccpn', dest='hqccpn', type=int, required=False, default=1, - help='Define n:high confidence core callers plus n callers are required to call this circRNA HQ (default 1)') - parser.add_argument("--reffa",dest="reffa",required=True,type=str, - help="reference fasta file path") - parser.add_argument('--pyscriptoutfile',required=True,help='merged table') + parser.add_argument( + "--pyscript", + dest="pyscript", + type=str, + required=True, + help="python script to be run", + ) + parser.add_argument( + "--sampledir", + dest="sampledir", + type=str, + required=True, + help="generally //", + ) + parser.add_argument( + "--dcc", dest="dcc", type=int, required=False, default=0, help="n_run_dcc" + ) + parser.add_argument( + "--mapsplice", + dest="mapsplice", + type=int, + required=False, + default=0, + help="n_run_mapslice", + ) + parser.add_argument( + "--findcirc", + dest="findcirc", + type=int, + required=False, + default=0, + help="n_run_findcirc", + ) + parser.add_argument( + "--nclscan", + dest="nclscan", + type=int, + required=False, + default=0, + help="n_run_nclscan", + ) + parser.add_argument( + "--circrnafinder", + dest="circrnafinder", + type=int, + required=False, + default=0, + help="n_run_circrnafinder", + ) + parser.add_argument( + "--samplename", dest="samplename", type=str, required=True, help="Sample Name" + ) + parser.add_argument( + "--min_read_count_reqd", + dest="minreads", + type=int, + required=False, + default=2, + help="Read count threshold..circRNA with lower than this number of read support are excluded! (default=2)", + ) + parser.add_argument( + "--hqcc", + dest="hqcc", + type=str, + required=False, + default="circExplorer,circExplorer_bwa", + help='Comma separated list of high confidence core callers (default="circExplorer,circExplorer_bwa")', + ) + parser.add_argument( + "--hqccpn", + dest="hqccpn", + type=int, + required=False, + default=1, + help="Define n:high confidence core callers plus n callers are required to call this circRNA HQ (default 1)", + ) + parser.add_argument( + "--reffa", + dest="reffa", + required=True, + type=str, + help="reference fasta file path", + ) + parser.add_argument("--pyscriptoutfile", required=True, help="merged table") # OUTPUTS - parser.add_argument('--outscript',required=True,help='output bash script') + parser.add_argument("--outscript", required=True, help="output bash script") args = parser.parse_args() - - sd=args.sampledir - sn=args.samplename - parameters="" - parameters+=" --circExplorer "+_get_counts_file_path(sd,sn,'circExplorer') - parameters+=" --ciri "+_get_counts_file_path(sd,sn,'ciri') - parameters+=" --circExplorerbwa "+_get_counts_file_path(sd,sn,'circExplorerbwa') - if args.dcc==1: parameters+=" --dcc "+_get_counts_file_path(sd,sn,'dcc') - if args.mapsplice==1: parameters+=" --mapsplice "+_get_counts_file_path(sd,sn,'mapsplice') - if args.nclscan==1: parameters+=" --nclscan "+_get_counts_file_path(sd,sn,'nclscan') - if args.circrnafinder==1: parameters+=" --circrnafinder "+_get_counts_file_path(sd,sn,'circrnafinder') - if args.findcirc==1: parameters+=" --findcirc "+_get_counts_file_path(sd,sn,'findcirc') - parameters+=" --reffa "+args.reffa - parameters+=" --min_read_count_reqd "+str(args.minreads) - parameters+=" --samplename "+sn - parameters+=" --hqcc "+args.hqcc - parameters+=" --hqccpn "+str(args.hqccpn) - parameters+=" -o "+args.pyscriptoutfile - - with open(args.outscript,'w') as outscript: - outscript.write("python3 "+args.pyscript+parameters) + + sd = args.sampledir + sn = args.samplename + parameters = "" + parameters += " --circExplorer " + _get_counts_file_path(sd, sn, "circExplorer") + parameters += " --ciri " + _get_counts_file_path(sd, sn, "ciri") + parameters += " --circExplorerbwa " + _get_counts_file_path( + sd, sn, "circExplorerbwa" + ) + if args.dcc == 1: + parameters += " --dcc " + _get_counts_file_path(sd, sn, "dcc") + if args.mapsplice == 1: + parameters += " --mapsplice " + _get_counts_file_path(sd, sn, "mapsplice") + if args.nclscan == 1: + parameters += " --nclscan " + _get_counts_file_path(sd, sn, "nclscan") + if args.circrnafinder == 1: + parameters += " --circrnafinder " + _get_counts_file_path( + sd, sn, "circrnafinder" + ) + if args.findcirc == 1: + parameters += " --findcirc " + _get_counts_file_path(sd, sn, "findcirc") + parameters += " --reffa " + args.reffa + parameters += " --min_read_count_reqd " + str(args.minreads) + parameters += " --samplename " + sn + parameters += " --hqcc " + args.hqcc + parameters += " --hqccpn " + str(args.hqccpn) + parameters += " -o " + args.pyscriptoutfile + + with open(args.outscript, "w") as outscript: + outscript.write("python3 -E " + args.pyscript + parameters) outscript.close() + if __name__ == "__main__": - main() \ No newline at end of file + main()