-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathrna_vdj.py
executable file
·272 lines (225 loc) · 8.47 KB
/
rna_vdj.py
1
import pandas as pdimport argparseimport gzipimport osfrom os import listdirimport pysamfrom os.path import isfile, joinfrom collections import defaultdictfrom concurrent.futures import ProcessPoolExecutorfrom celescope.tools.utils import genDict, format_number, log, read_barcode_fileTRACER_PATH = '/SGRNJ/Public/Software/tracer/tracer'CONF_PATH = '/SGRNJ01/RD_dir/pipeline_test/zhouyiqi/unittest/tcr_fl/20201103/tracer_SGR.conf'CONDA = 'vdjpuzzle1'CONDA_SUB = 'celescope_tracer'BRACER_PATH = 'bracer'BRACER_CONDA = 'bracer'BRACER_CONF = '/SGRNJ03/randd/zhouxin/software/bracer/bracer.conf'# 获取转录组barcodedef get_raw_barcode(tsvfile, cluster): assign_data = pd.read_csv(tsvfile, sep="\t", index_col=0) barcode_rna = [] cluster_list = [int(item) for item in cluster.split(',')] for c in cluster_list: barcode_rna = barcode_rna.extend(list(assign_data[assign_data["cluster"] == c].index)) return barcode_rna# 获取反转互补序列def dna_resevre_complement2(sequence): sequence = sequence[::-1] # trantab = str.maketrans(intab, outtab) # 制作翻译表 trantab = str.maketrans('ACGTacgtRYMKrymkVBHDvbhd', 'TGCAtgcaYRKMyrkmBVDHbvdh') string = sequence.translate(trantab) # str.translate(trantab) # 转换字符 return stringdef run_barcode(tsvfile, cluster): sequence_list = get_raw_barcode(tsvfile, cluster) new_sequence = [] for sequence in sequence_list: new_sequence.append(dna_resevre_complement2(sequence)) file = open("barcode.txt", 'a') for i in range(len(new_sequence)): s = str(new_sequence[i]).replace("'", '').replace("'", '') # 去除[],这两行按数据不同,可以选择 s = s + '\n' # 去除单引号,逗号,每行末尾追加换行符 file.write(s) file.close() return new_sequence# 解压fq.gz文件def un_gz(fqfile_name): # 获取文件的名称,去掉后缀名 f_name = fqfile_name.replace(".gz", "") # 开始解压 g_file = gzip.GzipFile(fqfile_name) # 读取解压后的文件,并写入去掉后缀名的同名文件(即得到解压后的文件) open(f_name, "wb+").write(g_file.read()) g_file.close()# 利用解压后的fq文件生成barcode_reads dict,并按照reads id数目进行排序def get_readid(fqfile_name, topn): un_gz(fqfile_name) with open("barcode.txt") as bc: barcodes = bc.readlines() barcode_read_id_dict = {} for barcode in barcodes: barcode = barcode.strip('\n') barcode_read_id_dict[barcode] = [] file = fqfile_name.strip('.gz') with open(file) as fq: reads = fq.readlines() l = len(reads) reads_id = reads[0:l:4] id_reads = {} for i in reads_id: id_reads[i[1:25]] = [] for i in reads_id: id_reads[i[1:25]].append(i) # 建立所有reads与id的字典 for barcode in barcodes: barcode = barcode.strip('\n') barcode_read_id_dict[barcode] = id_reads[barcode] # 生成BCR barcode与reads的字典 # 生成排序字典,按照UMI数目降序排序,若有topn参数,则选取前topn个细胞进行后续分析 barcode_umi_count = {} for barcode in barcodes: barcode = barcode.strip('\n') barcode_umi_count[barcode] = len(barcode_read_id_dict[barcode]) barcode_umi_count = sorted(barcode_umi_count.items(), key=lambda item: item[1], reverse=True) barcode_use = {} if topn and topn != 'None': topn = int(topn) barcode_umi_count = barcode_umi_count[:topn] for n in barcode_umi_count: barcode_use[n[0]] = barcode_read_id_dict[n[0]] else: for n in barcode_umi_count: barcode_use[n[0]] = barcode_read_id_dict[n[0]] del id_reads del barcode_umi_count # 生成read id文件用去拆分fq if not os.path.exists('idlist'): os.makedirs('idlist') namelist = [] for i in list(barcode_use.keys()): i = i.strip('\n') filename = 'idlist/' + i + '.txt' namelist.append(filename) file = open(filename, 'a') for read in barcode_use[i]: read = read.strip('@') file.write(read) file.close() file = open("name_list.txt", "a") for name in namelist: name = name + "\n" file.write(name) file.close()# 拆分fq文件def split_fq(fqfile_name, fastq_dir): if not os.path.exists(f'{fastq_dir}'): os.makedirs(f'{fastq_dir}') file = fqfile_name.strip(".gz") with open("name_list.txt") as nl: nl = list(nl) i = 1 for name in nl: name = name.strip("\n") cmd = (f'source activate zhouxinT;' f"/SGRNJ02/RandD4/zhouxin/software/seqtk/seqtk subseq " f"{file} " f'{name} ' f'> ' f'{fastq_dir}/{i}.fq') os.system(cmd) i += 1# 开始组装@logdef bracer_summarise(outdir): bracer_outdir = f'{outdir}/bracer' cmd = ( f'source activate {BRACER_CONDA}; ' f'{BRACER_PATH} summarise ' f'-c {BRACER_CONF} ' f'{bracer_outdir} ' ) bracer_summarise.logger.info(cmd) os.system(cmd) def bracer(fq, outdir, species): prefix = os.path.basename(fq).strip('.fq') cmd = ( f'source activate {BRACER_CONDA}; ' f'{BRACER_PATH} assemble ' f'--fragment_length 150 ' f'--fragment_sd 5 ' f'--single_end ' f'--species {species} ' f'-c {BRACER_CONF} ' f'{prefix} ' f'{outdir}/bracer ' f'{fq} ' ) os.system(cmd) def tracer_summarise(outdir): tracer_outdir = f'{outdir}/tracer' cmd = ( f'source activate {CONDA_SUB}; ' f'{TRACER_PATH} summarise ' f'-c {CONF_PATH} ' f'{tracer_outdir} ' ) tracer_summarise.logger.info(cmd) os.system(cmd)def tracer(fq, outdir, species): prefix = os.path.basename(fq).strip('.fq') cmd = ( f'source activate {CONDA}; ' f'{TRACER_PATH} assemble ' f'--fragment_length 150 ' f'--fragment_sd 5 ' f'--single_end ' f'--species {species} ' f'-c {CONF_PATH} ' f'{fq} ' f'{prefix} ' f'{outdir}/tracer ' ) os.system(cmd) @logdef run_tracer(sample, outdir, fastq_dir, species, thread): fqs = [join(fastq_dir, f) for f in listdir(fastq_dir) if isfile(join(fastq_dir, f))] outdirs = [outdir] * len(fqs) species = [species] * len(fqs) if not os.path.exists(f'{outdir}/tracer'): os.makedirs(f'{outdir}/tracer') all_res = [] with ProcessPoolExecutor(thread) as pool: for res in pool.map(tracer, fqs, outdirs, species): all_res.append(res) tracer_summarise(outdir) def run_bracer(sample, outdir, fastq_dir, species, thread): fqs = [join(fastq_dir, f) for f in listdir(fastq_dir) if isfile(join(fastq_dir, f))] outdirs = [outdir] * len(fqs) species = [species] * len(fqs) if not os.path.exists(f'{outdir}/bracer'): os.makedirs(f'{outdir}/bracer') all_res = [] with ProcessPoolExecutor(thread) as pool: for res in pool.map(bracer, fqs, outdirs, species): all_res.append(res) bracer_summarise(outdir)def main(): parser = argparse.ArgumentParser() parser.add_argument("--tsvfile", help="tSNE file after auto assign") parser.add_argument("--cluster", help="cluster of T or B cells") parser.add_argument("--fqfile_name", help="TCR or BCR fqfile") parser.add_argument("--outdir", help="assemble outdir") parser.add_argument("--sample", help="sample name") parser.add_argument("--fastq_dir", help="fastq file dir") parser.add_argument('--mod', help='select TCR or BCR', choices=["TCR", "BCR"], required=True) parser.add_argument('--species', help='species', choices=["Mmus", "Hsap"], default="Hsap") parser.add_argument('--topn', help='select top N cells for analysis') parser.add_argument('--thread', help='thread') args = parser.parse_args() run_barcode(args.tsvfile, args.cluster) get_readid(args.fqfile_name, args.topn) split_fq(args.fqfile_name, args.fastq_dir) if args.mod == 'TCR': run_tracer(args.sample, args.outdir, args.fastq_dir, args.species, int(args.thread)) elif args.mod == 'BCR': run_bracer(args.sample, args.outdir, args.fastq_dir, args.species, int(args.thread))if __name__ == "__main__": main()