#!/usr/bin/env python
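"""scMaestro: set up and submit Snakemake workflows for processing 10x single-cell
sequencing data (subcommands: rna, multi, vdj, atac, multiome, fixedrna, plus the
rerun/dryrun/unlock helpers).

Example invocation (hypothetical paths, shown for illustration only):
    scMaestro rna -f /path/to/fastq_dir -g hg38 -r /path/to/reference
"""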
import argparse
import glob
import itertools
import logging as sflog
import os
import re
import subprocess
import sys

import pandas as pd

from lib.utils import prep_ref, get_smk_file, write_configfile, get_config_val
from lib.smk_arg import smk_rerun_dryrun_unlock
from lib.argparse_multi import multi_conditional_required_flag

sflog.basicConfig(format='%(asctime)s - %(name)s - %(levelname)s - %(message)s', level=sflog.DEBUG)
# ARC-v1 for analyzing the GEX portion of multiome data. NOTE: this mode cannot be auto-detected.
ASSAY_CHEMISTRY = ['auto', 'threeprime', 'fiveprime', 'SC3Pv2', 'SC3Pv3', 'SC3Pv3LT', 'SC3Pv3HT', 'SC5P-PE', 'SC5P-R2', 'SC3Pv1']
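# NOTE: 'ARC-v1' is not part of this list; it is only offered by the 'atac'
# subcommand's --chemistry option below.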
# get the path of the script
bin_path = os.path.dirname(os.path.realpath(__file__))
work_path = os.getcwd()
os.chdir(work_path)
#print(work_path)
class UniqueStore(argparse.Action):
    """argparse action that rejects an option supplied more than once."""
    def __call__(self, parser, namespace, values, option_string=None):
        if getattr(namespace, self.dest, self.default) is not self.default:
            parser.error(option_string + " appears several times.")
        setattr(namespace, self.dest, values)
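# main(): prepares the reference via prep_ref(), copies the bundled 'workflow'
# template into the working directory, points submit.sh at the Snakefile selected
# for the chosen pipeline, prints the generated config.py, and optionally submits
# the run with sbatch.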
def main(args):
    #prep_config(args)
    prep_ref(args)
    workflow_dir = os.path.join(os.getcwd(), 'workflow')
    # Check if the workflow directory exists
    if os.path.exists(workflow_dir):
        print(f"The 'workflow' directory already exists at {workflow_dir}.")
        print("Aborting. Please remove or rename the 'workflow' folder manually, or use the 'rerun' subcommand.")
        exit(1)
    else:
        print("No existing 'workflow' folder. Proceeding with copy...")
    # Copy the workflow template shipped with scMaestro into the working directory.
    subprocess.check_output('cat %s/workflow/config/program.py | grep -v "copydir = " | grep -v "active_scripts =" > program.py' % bin_path, shell=True)
    subprocess.check_output('cp -r %s/workflow/ .' % bin_path, shell=True)
    subprocess.check_output('cp %s/workflow/config/submit_ext.sh submit.sh' % bin_path, shell=True)
    args.fullanalysis = getattr(args, "fullanalysis", False)
    smk_file = get_smk_file(args.pipeline, args.fullanalysis)
    #print(smk_file)
    # Escape slashes so the path can be used inside the sed substitution below.
    smk_file_re = re.sub(r"/", r"\/", smk_file)
    #print(smk_file_re)
    # Point submit.sh at the Snakefile selected for this pipeline.
    os.system("""sed -i "s/snakemake --jobname/snakemake -s %s --jobname /" %s """ % (smk_file_re, "submit.sh"))
    print('CONFIG FILE:')
    with open('config.py') as f:
        print(f.read())
    submit = input('Submit now? (y/n): ')
    while submit.lower() not in ['y', 'n']:
        submit = input('Please answer yes (y) or no (n): ')
    if submit.lower() == 'y':
        subprocess.check_output('sbatch submit.sh', shell=True)
        print("Submitted snakemake run for %s in %s" % (",".join(args.fastqs), work_path))
if __name__ == '__main__':
    ## description - Text to display before the argument help (default: none)
    parent_parser = argparse.ArgumentParser(add_help=False)
    parent_parser.add_argument('-f', '--fastqs', metavar='FASTQPATH', nargs="*",
                               action=UniqueStore, required=True,
                               help='Path(s) to fastq files; multiple paths can be provided together, e.g. "-f path1 path2"')
    parent_parser.add_argument('-g', '--genome',
                               help='Genome build, e.g. "hg38", "mm10"', required=True)
    parent_parser_ref = argparse.ArgumentParser(add_help=False)
    parent_parser_ref.add_argument('-r', '--reference', metavar='REFERENCE_FOLDER',
                                   help='Reference folder for alignment. A VDJ reference is needed if subcommand "vdj" is used.',
                                   required=True)
    #parent_parser.add_argument("-n", "--dryrun", action="store_true", help="dry run")
    #parent_parser.add_argument("--rerun", action="store_true",
    #                           help="dry run then prompt for submitting jobs that need to be rerun")
    #parent_parser.add_argument("--unlock", action="store_true",
    #                           help="unlock working directory")
    parent_parser_fa = argparse.ArgumentParser(add_help=False)
    parent_parser_fa.add_argument("--fullanalysis", action="store_true",
                                  help="Run the full analysis pipeline")
    parent_parser_cellnum = argparse.ArgumentParser(add_help=False)
    group_cell_number = parent_parser_cellnum.add_mutually_exclusive_group()
    group_cell_number.add_argument("--force", type=int,
                                   help="Run Cell Ranger with --force-cells")
    group_cell_number.add_argument("--expect", type=int,
                                   help="Run Cell Ranger with --expect-cells")
    parent_parser_cellnum.add_argument("--exclude-introns", action="store_true",
                                       help="Exclude intronic reads in count. To maximize sensitivity for "
                                            "whole-transcriptome 3'/5' Single Cell Gene Expression and 3' "
                                            "Cell Multiplexing experiments, introns are included in the "
                                            "analysis by default for cellranger (v7.0.0 and later) count and multi.")
    parent_parser_vdjchain = argparse.ArgumentParser(add_help=False)
    parent_parser_vdjchain.add_argument("--chain", default="auto", choices=['auto', 'TR', 'IG'],
                                        help="Force the analysis to be carried out for a particular chain type.")
    parent_parser_lib = argparse.ArgumentParser(add_help=False)
    parent_parser_lib.add_argument("-l", "--library_config",
                                   required=True,
                                   help="CSV file with the library configuration.")
    parent_chemistry = argparse.ArgumentParser(add_help=False)
    parent_chemistry.add_argument("--chemistry",
                                  type=str, choices=ASSAY_CHEMISTRY,
                                  help="Assay chemistry. Use 'auto' to let Cell Ranger detect the chemistry automatically.")
    parser = argparse.ArgumentParser(description='scMaestro: Comprehensive workflow for processing single-cell sequencing data')
    subparsers = parser.add_subparsers(help='sub-command help', dest='command')
    ## rna pipeline
    parser_rna = subparsers.add_parser("rna",
                                       parents=[parent_parser, parent_parser_ref, parent_parser_fa, parent_parser_cellnum],
                                       help='Snakemake pipeline for 10x scRNA-seq data analysis')
    ## multi pipeline
    parser_multi = subparsers.add_parser("multi",
                                         parents=[parent_parser, parent_parser_ref, parent_parser_vdjchain,
                                                  parent_chemistry, parent_parser_fa, parent_parser_lib],
                                         help='Snakemake pipeline for 10x Cell Ranger multi data analysis')
    parser_multi.add_argument("--vdj_ref",
                              help="Reference folder for VDJ data")
    #parser_multi.add_argument("--ref_vdj", type=str,
    #                          help="Reference folder for VDJ data")
    parser_multi.add_argument("--cmo", action="store_true",
                              help="Use CMO information for the multi analysis")
    parser_multi.add_argument("--count", action="store_true",
                              help="Run cellranger count for projects with HTO libraries "
                                   "with more than 10 individuals mixed.")
    parser_vdj = subparsers.add_parser("vdj",
                                       parents=[parent_parser, parent_parser_ref, parent_parser_vdjchain],
                                       help='Snakemake pipeline for 10x VDJ data')
    parser_atac = subparsers.add_parser("atac",
                                        parents=[parent_parser, parent_parser_ref],
                                        help='Snakemake pipeline for 10x scATAC-seq data analysis')
    parser_atac.add_argument("--chemistry",
                             type=str, choices=["ARC-v1"],
                             help="Assay chemistry. Use 'ARC-v1' for ATAC libraries from 10x Multiome experiments.")
    parser_multiome = subparsers.add_parser("multiome",
                                            parents=[parent_parser, parent_parser_ref, parent_parser_cellnum, parent_parser_lib],
                                            help='Snakemake pipeline for 10x Multiome ATAC + GEX data')
    parser_fixedrna = subparsers.add_parser("fixedrna",
                                            parents=[parent_parser, parent_parser_lib],
                                            help='Snakemake pipeline for 10x Fixed RNA Profiling (FRP) data')
    parser_fixedrna.add_argument("--probe_set",
                                 required=True,
                                 help="Probe set for FRP data")
    parser_fixedrna.add_argument("--singleplex", action="store_true",
                                 help="Singleplex FRP")
    parser_fixedrna.add_argument("--multiplex", action="store", type=str,
                                 help="Multiplexing information for FRP")
    #parser_spatial = subparsers.add_parser("spatial",
    #                                       parents=[parent_parser],
    #                                       help='Snakemake pipeline for 10x spatial data')
    #parser_spatial.add_argument("--images", metavar="images",
    #                            action="store", type=str, required=True,
    #                            help="Metadata for image information. [Required if spatial pipeline is used]")
    #spatial_methods = ["cytassist_v1", "cytassist_v2", "non_cytassist"]
    #parser_spatial.add_argument("--spatial-method",
    #                            metavar=f"spatial_method {spatial_methods}",
    #                            type=str, choices=spatial_methods, required=True,
    #                            help="""Specify the method used for spatial transcriptomics analysis. \
    #                            Options: \
    #                            - 'cytassist_v1': Employ CytAssist method using probe set v1 \
    #                            - 'cytassist_v2': Employ CytAssist method using probe set v2 \
    #                            - 'non_cytassist': Employ non-CytAssist method capturing polyA mRNAs""")
    parser_rerun = subparsers.add_parser("rerun",
                                         help='Equivalent of snakemake --rerun')
    parser_dryrun = subparsers.add_parser("dryrun",
                                          help='Equivalent of snakemake --dryrun')
    parser_unlock = subparsers.add_parser("unlock",
                                          help='Equivalent of snakemake --unlock')
    args = parser.parse_args()
    dict_cmd = vars(args)
    if dict_cmd['command'] is None:
        parser.print_help()
        sys.exit(0)
    multi_conditional_required_flag(args, parser)
    sflog.info("Subcommand is: " + str(dict_cmd['command']))
    sflog.info(args)
    if dict_cmd['command'] in ('dryrun', 'rerun', 'unlock'):
        smk_rerun_dryrun_unlock(dict_cmd['command'], subparsers)
    else:
        work_path = os.getcwd()
        os.chdir(work_path)
        # Derive sample names from the fastq file names
        # (Illumina naming convention: <sample>_S<idx>_L00<lane>_<R|I><read>).
        fastq_files = itertools.chain.from_iterable(glob.glob(i + '/*') for i in args.fastqs)
        sample = list(set(os.path.basename(f).split('.')[0] for f in fastq_files))
        samps = []
        for item in sample:
            match = re.findall(r"(\S*)_S\d+_L0\d{2}_[RI]", item)
            if match:
                samps.append(match[0])
            else:
                samps.append(item)
        samples = sorted(set(s.replace('Sample_', '') for s in samps))
        write_configfile(args, work_path)
        args.pipeline = dict_cmd["command"]
        print(args)
        main(args)