#!/usr/bin/env python
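"""scMaestro: set up and submit Snakemake workflows for processing 10x single-cell
sequencing data (subcommands: rna, multi, vdj, atac, multiome, fixedrna, plus the
rerun/dryrun/unlock helpers).

Example invocation (hypothetical paths, shown for illustration only):
    scMaestro rna -f /path/to/fastq_dir -g hg38 -r /path/to/reference
"""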
import argparse
import glob
import itertools
import logging as sflog
import os
import re
import subprocess
import sys

import pandas as pd

from lib.utils import prep_ref, get_smk_file, write_configfile, get_config_val
from lib.smk_arg import smk_rerun_dryrun_unlock
from lib.argparse_multi import multi_conditional_required_flag

sflog.basicConfig(format='%(asctime)s - %(name)s - %(levelname)s - %(message)s', level=sflog.DEBUG)
# ARC-v1 for analyzing the GEX portion of multiome data. NOTE: this mode cannot be auto-detected.
ASSAY_CHEMISTRY = ['auto', 'threeprime', 'fiveprime', 'SC3Pv2', 'SC3Pv3', 'SC3Pv3LT', 'SC3Pv3HT', 'SC5P-PE', 'SC5P-R2', 'SC3Pv1']
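# NOTE: 'ARC-v1' is not part of this list; it is only offered by the 'atac'
# subcommand's --chemistry option below.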
# get the path of the script
bin_path = os.path.dirname(os.path.realpath(__file__))
work_path = os.getcwd()
os.chdir(work_path)
#print(work_path)
class UniqueStore(argparse.Action):
    """argparse action that rejects an option supplied more than once."""
    def __call__(self, parser, namespace, values, option_string=None):
        if getattr(namespace, self.dest, self.default) is not self.default:
            parser.error(option_string + " appears several times.")
        setattr(namespace, self.dest, values)
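# main(): prepares the reference via prep_ref(), copies the bundled 'workflow'
# template into the working directory, points submit.sh at the Snakefile selected
# for the chosen pipeline, prints the generated config.py, and optionally submits
# the run with sbatch.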
def main(args):
    #prep_config(args)
    prep_ref(args)
    workflow_dir = os.path.join(os.getcwd(), 'workflow')
    # Check if the workflow directory exists
    if os.path.exists(workflow_dir):
        print(f"The 'workflow' directory already exists at {workflow_dir}.")
        print("Aborting. Please remove or rename the 'workflow' folder manually, or use the 'rerun' subcommand.")
        exit(1)
    else:
        print("No existing 'workflow' folder. Proceeding with copy...")
    # Copy the workflow template shipped with scMaestro into the working directory.
    subprocess.check_output('cat %s/workflow/config/program.py | grep -v "copydir = " | grep -v "active_scripts =" > program.py' % bin_path, shell=True)
    subprocess.check_output('cp -r %s/workflow/ .' % bin_path, shell=True)
    subprocess.check_output('cp %s/workflow/config/submit_ext.sh submit.sh' % bin_path, shell=True)
    args.fullanalysis = getattr(args, "fullanalysis", False)
    smk_file = get_smk_file(args.pipeline, args.fullanalysis)
    #print(smk_file)
    # Escape slashes so the path can be used inside the sed substitution below.
    smk_file_re = re.sub(r"/", r"\/", smk_file)
    #print(smk_file_re)
    # Point submit.sh at the Snakefile selected for this pipeline.
    os.system("""sed -i "s/snakemake --jobname/snakemake -s %s --jobname /" %s """ % (smk_file_re, "submit.sh"))
    print('CONFIG FILE:')
    with open('config.py') as f:
        print(f.read())
    submit = input('Submit now? (y/n): ')
    while submit.lower() not in ['y', 'n']:
        submit = input('Please answer yes (y) or no (n): ')
    if submit.lower() == 'y':
        subprocess.check_output('sbatch submit.sh', shell=True)
        print("Submitted snakemake run for %s in %s" % (",".join(args.fastqs), work_path))
if __name__ == '__main__':
    ## description - Text to display before the argument help (default: none)
    parent_parser = argparse.ArgumentParser(add_help=False)
    parent_parser.add_argument('-f', '--fastqs', metavar='FASTQPATH', nargs="*",
                               action=UniqueStore, required=True,
                               help='Path(s) to fastq files; multiple paths can be provided together, e.g. "-f path1 path2"')
    parent_parser.add_argument('-g', '--genome',
                               help='Genome build, e.g. "hg38", "mm10"', required=True)
    parent_parser_ref = argparse.ArgumentParser(add_help=False)
    parent_parser_ref.add_argument('-r', '--reference', metavar='REFERENCE_FOLDER',
                                   help='Reference folder for alignment. A VDJ reference is needed if subcommand "vdj" is used.',
                                   required=True)
    #parent_parser.add_argument("-n", "--dryrun", action="store_true", help="dry run")
    #parent_parser.add_argument("--rerun", action="store_true",
    #                           help="dry run then prompt for submitting jobs that need to be rerun")
    #parent_parser.add_argument("--unlock", action="store_true",
    #                           help="unlock working directory")
    parent_parser_fa = argparse.ArgumentParser(add_help=False)
    parent_parser_fa.add_argument("--fullanalysis", action="store_true",
                                  help="Run the full analysis pipeline")
    parent_parser_cellnum = argparse.ArgumentParser(add_help=False)
    group_cell_number = parent_parser_cellnum.add_mutually_exclusive_group()
    group_cell_number.add_argument("--force", type=int,
                                   help="Run Cell Ranger with --force-cells")
    group_cell_number.add_argument("--expect", type=int,
                                   help="Run Cell Ranger with --expect-cells")
    parent_parser_cellnum.add_argument("--exclude-introns", action="store_true",
                                       help="Exclude intronic reads in count. To maximize sensitivity for "
                                            "whole-transcriptome 3'/5' Single Cell Gene Expression and 3' "
                                            "Cell Multiplexing experiments, introns are included in the "
                                            "analysis by default for cellranger (v7.0.0 and later) count and multi.")
    parent_parser_vdjchain = argparse.ArgumentParser(add_help=False)
    parent_parser_vdjchain.add_argument("--chain", default="auto", choices=['auto', 'TR', 'IG'],
                                        help="Force the analysis to be carried out for a particular chain type.")
    parent_parser_lib = argparse.ArgumentParser(add_help=False)
    parent_parser_lib.add_argument("-l", "--library_config",
                                   required=True,
                                   help="CSV file with the library configuration.")
    parent_chemistry = argparse.ArgumentParser(add_help=False)
    parent_chemistry.add_argument("--chemistry",
                                  type=str, choices=ASSAY_CHEMISTRY,
                                  help="Assay chemistry. Use 'auto' to let Cell Ranger detect the chemistry automatically.")
    parser = argparse.ArgumentParser(description='scMaestro: Comprehensive workflow for processing single-cell sequencing data')
    subparsers = parser.add_subparsers(help='sub-command help', dest='command')
    ## rna pipeline
    parser_rna = subparsers.add_parser("rna",
                                       parents=[parent_parser, parent_parser_ref, parent_parser_fa, parent_parser_cellnum],
                                       help='Snakemake pipeline for 10x scRNA-seq data analysis')
    ## multi pipeline
    parser_multi = subparsers.add_parser("multi",
                                         parents=[parent_parser, parent_parser_ref, parent_parser_vdjchain,
                                                  parent_chemistry, parent_parser_fa, parent_parser_lib],
                                         help='Snakemake pipeline for 10x Cell Ranger multi data analysis')
    parser_multi.add_argument("--vdj_ref",
                              help="Reference folder for VDJ data")
    #parser_multi.add_argument("--ref_vdj", type=str,
    #                          help="Reference folder for VDJ data")
    parser_multi.add_argument("--cmo", action="store_true",
                              help="Use CMO information for the multi analysis")
    parser_multi.add_argument("--count", action="store_true",
                              help="Run cellranger count for projects with HTO libraries "
                                   "with more than 10 individuals mixed.")
    parser_vdj = subparsers.add_parser("vdj",
                                       parents=[parent_parser, parent_parser_ref, parent_parser_vdjchain],
                                       help='Snakemake pipeline for 10x VDJ data')
    parser_atac = subparsers.add_parser("atac",
                                        parents=[parent_parser, parent_parser_ref],
                                        help='Snakemake pipeline for 10x scATAC-seq data analysis')
    parser_atac.add_argument("--chemistry",
                             type=str, choices=["ARC-v1"],
                             help="Assay chemistry. Use 'ARC-v1' for ATAC libraries from 10x Multiome experiments.")
    parser_multiome = subparsers.add_parser("multiome",
                                            parents=[parent_parser, parent_parser_ref, parent_parser_cellnum, parent_parser_lib],
                                            help='Snakemake pipeline for 10x Multiome ATAC + GEX data')
    parser_fixedrna = subparsers.add_parser("fixedrna",
                                            parents=[parent_parser, parent_parser_lib],
                                            help='Snakemake pipeline for 10x Fixed RNA Profiling (FRP) data')
    parser_fixedrna.add_argument("--probe_set",
                                 required=True,
                                 help="Probe set for FRP data")
    parser_fixedrna.add_argument("--singleplex", action="store_true",
                                 help="Singleplex FRP")
    parser_fixedrna.add_argument("--multiplex", action="store", type=str,
                                 help="Multiplexing information for FRP")
    #parser_spatial = subparsers.add_parser("spatial",
    #                                       parents=[parent_parser],
    #                                       help='Snakemake pipeline for 10x spatial data')
    #parser_spatial.add_argument("--images", metavar="images",
    #                            action="store", type=str, required=True,
    #                            help="Metadata for image information. [Required if spatial pipeline is used]")
    #spatial_methods = ["cytassist_v1", "cytassist_v2", "non_cytassist"]
    #parser_spatial.add_argument("--spatial-method",
    #                            metavar=f"spatial_method {spatial_methods}",
    #                            type=str, choices=spatial_methods, required=True,
    #                            help="""Specify the method used for spatial transcriptomics analysis. \
    #                            Options: \
    #                            - 'cytassist_v1': Employ CytAssist method using probe set v1 \
    #                            - 'cytassist_v2': Employ CytAssist method using probe set v2 \
    #                            - 'non_cytassist': Employ non-CytAssist method capturing polyA mRNAs""")
    parser_rerun = subparsers.add_parser("rerun",
                                         help='Equivalent of snakemake --rerun')
    parser_dryrun = subparsers.add_parser("dryrun",
                                          help='Equivalent of snakemake --dryrun')
    parser_unlock = subparsers.add_parser("unlock",
                                          help='Equivalent of snakemake --unlock')
    args = parser.parse_args()
    dict_cmd = vars(args)
    if dict_cmd['command'] is None:
        parser.print_help()
        sys.exit(0)
    multi_conditional_required_flag(args, parser)
    sflog.info("Subcommand is: " + str(dict_cmd['command']))
    sflog.info(args)
    if dict_cmd['command'] in ('dryrun', 'rerun', 'unlock'):
        smk_rerun_dryrun_unlock(dict_cmd['command'], subparsers)
    else:
        work_path = os.getcwd()
        os.chdir(work_path)
        # Derive sample names from the fastq file names
        # (Illumina naming convention: <sample>_S<idx>_L00<lane>_<R|I><read>).
        fastq_files = itertools.chain.from_iterable(glob.glob(i + '/*') for i in args.fastqs)
        sample = list(set(os.path.basename(f).split('.')[0] for f in fastq_files))
        samps = []
        for item in sample:
            match = re.findall(r"(\S*)_S\d+_L0\d{2}_[RI]", item)
            if match:
                samps.append(match[0])
            else:
                samps.append(item)
        samples = sorted(set(s.replace('Sample_', '') for s in samps))
        write_configfile(args, work_path)
        args.pipeline = dict_cmd["command"]
        print(args)
        main(args)