diff --git a/.github/workflows/python-package.yml b/.github/workflows/python-package.yml index 76836c49..fe8df090 100644 --- a/.github/workflows/python-package.yml +++ b/.github/workflows/python-package.yml @@ -15,7 +15,7 @@ jobs: runs-on: ubuntu-latest strategy: matrix: - python-version: [3.7, 3.8, 3.9] + python-version: ["3.8", "3.9", "3.10"] steps: - uses: actions/checkout@v2 diff --git a/CHANGELOG.md b/CHANGELOG.md index 1b85b095..3addd465 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -2,6 +2,14 @@ ## Unreleased +New Features: +- Combine `--bam`, `--bam-list` and `--sample-bam` arguments #128 +- Combine `--ploidy` and `--sample-ploidy` arguments #128 +- Combine `--inbreeding` and `--sample-inbreeding` arguments #128 +- Combine `--mcmc-temperatures` and `--sample-mcmc-temperatures` arguments #128 + + + ## Beta v0.7.0 New Features: @@ -18,6 +26,8 @@ VCF Changes: - Added `NOA` filter to indicate loci where no alleles were observed (e.g., masked reference only) - Added `AF0` filter to indicate invalid prior allele frequencies in which all frequencies were zero + + ## Beta v0.6.0 New Features: diff --git a/cli-assemble-help.txt b/cli-assemble-help.txt index 37754ef8..5c9eeea9 100644 --- a/cli-assemble-help.txt +++ b/cli-assemble-help.txt @@ -1,11 +1,7 @@ usage: MCMC haplotype assembly [-h] [--region REGION] [--region-id REGION_ID] [--targets TARGETS] [--variants VARIANTS] - [--reference REFERENCE] [--bam [BAM ...]] - [--bam-list BAM_LIST] [--sample-bam SAMPLE_BAM] - [--ploidy PLOIDY] - [--sample-ploidy SAMPLE_PLOIDY] - [--inbreeding INBREEDING] - [--sample-inbreeding SAMPLE_INBREEDING] + [--reference REFERENCE] [--bam BAM [BAM ...]] + [--ploidy PLOIDY] [--inbreeding INBREEDING] [--sample-pool SAMPLE_POOL] [--base-error-rate BASE_ERROR_RATE] [--use-base-phred-scores] @@ -24,7 +20,6 @@ usage: MCMC haplotype assembly [-h] [--region REGION] [--region-id REGION_ID] [--mcmc-dosage-step-probability MCMC_DOSAGE_STEP_PROBABILITY] [--mcmc-partial-dosage-step-probability MCMC_PARTIAL_DOSAGE_STEP_PROBABILITY] [--mcmc-temperatures [MCMC_TEMPERATURES ...]] - [--sample-mcmc-temperatures SAMPLE_MCMC_TEMPERATURES] [--haplotype-posterior-threshold HAPLOTYPE_POSTERIOR_THRESHOLD] optional arguments: @@ -49,42 +44,34 @@ optional arguments: within this file. --reference REFERENCE Indexed fasta file containing the reference genome. - --bam [BAM ...] A list of 0 or more bam files. All samples found - within the listed bam files will be genotypes unless - the --sample-list parameter is used. - --bam-list BAM_LIST A file containing a list of bam file paths (one per - line). This can optionally be used in place of or - combined with the --bam parameter. - --sample-bam SAMPLE_BAM - A file containing a list of samples with bam file - paths. Each line of the file should be a sample - identifier followed by a tab and then a bam file path. - This can optionally be used in place the --bam and - --bam-list parameters. This is faster than using those - parameters when running many small jobs. An error will - be thrown if a sample is not found within its - specified bam file. - --ploidy PLOIDY Default ploidy for all samples (default = 2). This - value is used for all samples which are not specified - using the --sample-ploidy parameter - --sample-ploidy SAMPLE_PLOIDY - A file containing a list of samples with a ploidy - value used to indicate where their ploidy differs from - the default value. Each line should contain a sample - identifier followed by a tab and then an integer - ploidy value. + --bam BAM [BAM ...] Bam file(s) to use in analysis. This may be (1) a list + of one or more bam filepaths, (2) a plain-text file + containing a single bam filepath on each line, (3) a + plain-text file containing a sample identifier and its + corresponding bam filepath on each line separated by a + tab. If options (1) or (2) are used then all samples + within each bam will be used within the analysis. If + option (3) is used then only the specified sample will + be extracted from each bam file and An error will be + raised if a sample is not found within its specified + bam file. + --ploidy PLOIDY Specify sample ploidy (default = 2).This may be (1) a + single integer used to specify the ploidy of all + samples or (2) a file containing a list of all samples + and their ploidy. If option (2) is used then each line + of the plaintext file must contain a single sample + identifier and the ploidy of that sample separated by + a tab. --inbreeding INBREEDING - Default inbreeding coefficient for all samples - (default = 0.0). This value is used for all samples - which are not specified using the --sample-inbreeding - parameter. - --sample-inbreeding SAMPLE_INBREEDING - A file containing a list of samples with an inbreeding - coefficient used to indicate where their expected - inbreeding coefficient default value. Each line should - contain a sample identifier followed by a tab and then - a inbreeding coefficient value within the interval [0, - 1]. + Specify expected sample inbreeding coefficient + (default = 0.0).This may be (1) a single floating + point value in the interval [0, 1] used to specify the + inbreeding coefficient of all samples or (2) a file + containing a list of all samples and their inbreeding + coefficient. If option (2) is used then each line of + the plaintext file must contain a single sample + identifier and the inbreeding coefficient of that + sample separated by a tab. --sample-pool SAMPLE_POOL A name used to pool all sample reads into a single sample. WARNING: this is an experimental feature. @@ -163,20 +150,16 @@ optional arguments: sub-step during each step of the MCMC. (default = 0.5). --mcmc-temperatures [MCMC_TEMPERATURES ...] - A list of inverse-temperatures to use for parallel - tempered chains. These values must be between 0 and 1 - and will automatically be sorted in ascending order. - The cold chain value of 1.0 will be added - automatically if it is not specified. - --sample-mcmc-temperatures SAMPLE_MCMC_TEMPERATURES - A file containing a list of samples with mcmc - (inverse) temperatures. Each line of the file should - start with a sample identifier followed by tab - seperated numeric values between 0 and 1. The number - of temperatures specified may vary between samples. - Samples not listed in this file will use the default - values specified with the --mcmc-temperatures - argument. + Specify inverse-temperatures to use for parallel + tempered chains (default = 1.0 i.e., no tempering). + This may be either (1) a list of floating point values + or (2) a file containing a list of samples with mcmc + inverse-temperatures. If option (2) is used then the + file must contain a single sample per line followed by + a list of tab separated inverse temperatures. The + number of inverse-temperatures may differ between + samples and any samples not included in the list will + default to not using tempering. --haplotype-posterior-threshold HAPLOTYPE_POSTERIOR_THRESHOLD Posterior probability required for a haplotype to be included in the output VCF as an alternative allele. diff --git a/cli-call-exact-help.txt b/cli-call-exact-help.txt index 6b865d9e..10f3f1cb 100644 --- a/cli-call-exact-help.txt +++ b/cli-call-exact-help.txt @@ -2,11 +2,8 @@ usage: Exact haplotype calling [-h] [--haplotypes HAPLOTYPES] [--haplotype-frequencies HAPLOTYPE_FREQUENCIES] [--haplotype-frequencies-prior] [--skip-rare-haplotypes SKIP_RARE_HAPLOTYPES] - [--bam [BAM ...]] [--bam-list BAM_LIST] - [--sample-bam SAMPLE_BAM] [--ploidy PLOIDY] - [--sample-ploidy SAMPLE_PLOIDY] + [--bam BAM [BAM ...]] [--ploidy PLOIDY] [--inbreeding INBREEDING] - [--sample-inbreeding SAMPLE_INBREEDING] [--sample-pool SAMPLE_POOL] [--base-error-rate BASE_ERROR_RATE] [--use-base-phred-scores] @@ -37,42 +34,34 @@ optional arguments: if their frequency within that file is less than the specified value. This requires that the --haplotype- frequencies parameter is also specified. - --bam [BAM ...] A list of 0 or more bam files. All samples found - within the listed bam files will be genotypes unless - the --sample-list parameter is used. - --bam-list BAM_LIST A file containing a list of bam file paths (one per - line). This can optionally be used in place of or - combined with the --bam parameter. - --sample-bam SAMPLE_BAM - A file containing a list of samples with bam file - paths. Each line of the file should be a sample - identifier followed by a tab and then a bam file path. - This can optionally be used in place the --bam and - --bam-list parameters. This is faster than using those - parameters when running many small jobs. An error will - be thrown if a sample is not found within its - specified bam file. - --ploidy PLOIDY Default ploidy for all samples (default = 2). This - value is used for all samples which are not specified - using the --sample-ploidy parameter - --sample-ploidy SAMPLE_PLOIDY - A file containing a list of samples with a ploidy - value used to indicate where their ploidy differs from - the default value. Each line should contain a sample - identifier followed by a tab and then an integer - ploidy value. + --bam BAM [BAM ...] Bam file(s) to use in analysis. This may be (1) a list + of one or more bam filepaths, (2) a plain-text file + containing a single bam filepath on each line, (3) a + plain-text file containing a sample identifier and its + corresponding bam filepath on each line separated by a + tab. If options (1) or (2) are used then all samples + within each bam will be used within the analysis. If + option (3) is used then only the specified sample will + be extracted from each bam file and An error will be + raised if a sample is not found within its specified + bam file. + --ploidy PLOIDY Specify sample ploidy (default = 2).This may be (1) a + single integer used to specify the ploidy of all + samples or (2) a file containing a list of all samples + and their ploidy. If option (2) is used then each line + of the plaintext file must contain a single sample + identifier and the ploidy of that sample separated by + a tab. --inbreeding INBREEDING - Default inbreeding coefficient for all samples - (default = 0.0). This value is used for all samples - which are not specified using the --sample-inbreeding - parameter. - --sample-inbreeding SAMPLE_INBREEDING - A file containing a list of samples with an inbreeding - coefficient used to indicate where their expected - inbreeding coefficient default value. Each line should - contain a sample identifier followed by a tab and then - a inbreeding coefficient value within the interval [0, - 1]. + Specify expected sample inbreeding coefficient + (default = 0.0).This may be (1) a single floating + point value in the interval [0, 1] used to specify the + inbreeding coefficient of all samples or (2) a file + containing a list of all samples and their inbreeding + coefficient. If option (2) is used then each line of + the plaintext file must contain a single sample + identifier and the inbreeding coefficient of that + sample separated by a tab. --sample-pool SAMPLE_POOL A name used to pool all sample reads into a single sample. WARNING: this is an experimental feature. diff --git a/cli-call-help.txt b/cli-call-help.txt index 708c0b72..7f2d60cd 100644 --- a/cli-call-help.txt +++ b/cli-call-help.txt @@ -2,11 +2,8 @@ usage: MCMC haplotype calling [-h] [--haplotypes HAPLOTYPES] [--haplotype-frequencies HAPLOTYPE_FREQUENCIES] [--haplotype-frequencies-prior] [--skip-rare-haplotypes SKIP_RARE_HAPLOTYPES] - [--bam [BAM ...]] [--bam-list BAM_LIST] - [--sample-bam SAMPLE_BAM] [--ploidy PLOIDY] - [--sample-ploidy SAMPLE_PLOIDY] + [--bam BAM [BAM ...]] [--ploidy PLOIDY] [--inbreeding INBREEDING] - [--sample-inbreeding SAMPLE_INBREEDING] [--sample-pool SAMPLE_POOL] [--base-error-rate BASE_ERROR_RATE] [--use-base-phred-scores] @@ -41,42 +38,34 @@ optional arguments: if their frequency within that file is less than the specified value. This requires that the --haplotype- frequencies parameter is also specified. - --bam [BAM ...] A list of 0 or more bam files. All samples found - within the listed bam files will be genotypes unless - the --sample-list parameter is used. - --bam-list BAM_LIST A file containing a list of bam file paths (one per - line). This can optionally be used in place of or - combined with the --bam parameter. - --sample-bam SAMPLE_BAM - A file containing a list of samples with bam file - paths. Each line of the file should be a sample - identifier followed by a tab and then a bam file path. - This can optionally be used in place the --bam and - --bam-list parameters. This is faster than using those - parameters when running many small jobs. An error will - be thrown if a sample is not found within its - specified bam file. - --ploidy PLOIDY Default ploidy for all samples (default = 2). This - value is used for all samples which are not specified - using the --sample-ploidy parameter - --sample-ploidy SAMPLE_PLOIDY - A file containing a list of samples with a ploidy - value used to indicate where their ploidy differs from - the default value. Each line should contain a sample - identifier followed by a tab and then an integer - ploidy value. + --bam BAM [BAM ...] Bam file(s) to use in analysis. This may be (1) a list + of one or more bam filepaths, (2) a plain-text file + containing a single bam filepath on each line, (3) a + plain-text file containing a sample identifier and its + corresponding bam filepath on each line separated by a + tab. If options (1) or (2) are used then all samples + within each bam will be used within the analysis. If + option (3) is used then only the specified sample will + be extracted from each bam file and An error will be + raised if a sample is not found within its specified + bam file. + --ploidy PLOIDY Specify sample ploidy (default = 2).This may be (1) a + single integer used to specify the ploidy of all + samples or (2) a file containing a list of all samples + and their ploidy. If option (2) is used then each line + of the plaintext file must contain a single sample + identifier and the ploidy of that sample separated by + a tab. --inbreeding INBREEDING - Default inbreeding coefficient for all samples - (default = 0.0). This value is used for all samples - which are not specified using the --sample-inbreeding - parameter. - --sample-inbreeding SAMPLE_INBREEDING - A file containing a list of samples with an inbreeding - coefficient used to indicate where their expected - inbreeding coefficient default value. Each line should - contain a sample identifier followed by a tab and then - a inbreeding coefficient value within the interval [0, - 1]. + Specify expected sample inbreeding coefficient + (default = 0.0).This may be (1) a single floating + point value in the interval [0, 1] used to specify the + inbreeding coefficient of all samples or (2) a file + containing a list of all samples and their inbreeding + coefficient. If option (2) is used then each line of + the plaintext file must contain a single sample + identifier and the inbreeding coefficient of that + sample separated by a tab. --sample-pool SAMPLE_POOL A name used to pool all sample reads into a single sample. WARNING: this is an experimental feature. diff --git a/mchap/application/arguments.py b/mchap/application/arguments.py index aa1d549e..d51973c9 100644 --- a/mchap/application/arguments.py +++ b/mchap/application/arguments.py @@ -1,4 +1,5 @@ import copy +import pysam from dataclasses import dataclass from mchap.constant import PFEIFFER_ERROR @@ -131,108 +132,57 @@ def add_to(self, parser): "--bam", dict( type=str, - nargs="*", + nargs="+", default=[], help=( - "A list of 0 or more bam files. " - "All samples found within the listed bam files will be genotypes " - "unless the --sample-list parameter is used." + "Bam file(s) to use in analysis. " + "This may be (1) a list of one or more bam filepaths, " + "(2) a plain-text file containing a single bam filepath on each line, " + "(3) a plain-text file containing a sample identifier and its " + "corresponding bam filepath on each line separated by a tab. " + "If options (1) or (2) are used then all samples within each bam will be used within the analysis. " + "If option (3) is used then only the specified sample will be extracted from each bam file and " + "An error will be raised if a sample is not found within its specified bam file." ), ), ) -bam_list = Parameter( - "--bam-list", - dict( - type=str, - nargs=1, - default=[None], - help=( - "A file containing a list of bam file paths (one per line). " - "This can optionally be used in place of or combined with the --bam " - "parameter." - ), - ), -) - -sample_bam = Parameter( - "--sample-bam", - dict( - type=str, - nargs=1, - default=[None], - help=( - "A file containing a list of samples with bam file paths. " - "Each line of the file should be a sample identifier followed " - "by a tab and then a bam file path. " - "This can optionally be used in place the --bam and --bam-list " - "parameters. This is faster than using those parameters when running " - "many small jobs. " - "An error will be thrown if a sample is not found within its specified " - "bam file." - ), - ), -) ploidy = Parameter( "--ploidy", - dict( - type=int, - nargs=1, - default=[2], - help=( - "Default ploidy for all samples (default = 2). " - "This value is used for all samples which are not specified using " - "the --sample-ploidy parameter" - ), - ), -) - -sample_ploidy = Parameter( - "--sample-ploidy", dict( type=str, nargs=1, - default=[None], + default=["2"], help=( - "A file containing a list of samples with a ploidy value " - "used to indicate where their ploidy differs from the " - "default value. Each line should contain a sample identifier " - "followed by a tab and then an integer ploidy value." + "Specify sample ploidy (default = 2)." + "This may be (1) a single integer used to specify the ploidy of all samples or " + "(2) a file containing a list of all samples and their ploidy. " + "If option (2) is used then each line of the plaintext file must " + "contain a single sample identifier and the ploidy of that sample separated by a tab." ), ), ) + inbreeding = Parameter( "--inbreeding", - dict( - type=float, - nargs=1, - default=[0.0], - help=( - "Default inbreeding coefficient for all samples (default = 0.0). " - "This value is used for all samples which are not specified using " - "the --sample-inbreeding parameter." - ), - ), -) - -sample_inbreeding = Parameter( - "--sample-inbreeding", dict( type=str, nargs=1, - default=[None], - help=( - "A file containing a list of samples with an inbreeding coefficient " - "used to indicate where their expected inbreeding coefficient " - "default value. Each line should contain a sample identifier " - "followed by a tab and then a inbreeding coefficient value " - "within the interval [0, 1]." + default=["0.0"], + help=( + "Specify expected sample inbreeding coefficient (default = 0.0)." + "This may be (1) a single floating point value in the interval [0, 1] " + "used to specify the inbreeding coefficient of all samples or " + "(2) a file containing a list of all samples and their inbreeding coefficient. " + "If option (2) is used then each line of the plaintext file must " + "contain a single sample identifier and the inbreeding coefficient of that sample separated by a tab." ), ), ) + sample_pool = Parameter( "--sample-pool", dict( @@ -412,35 +362,20 @@ def add_to(self, parser): ), ) - mcmc_temperatures = Parameter( "--mcmc-temperatures", - dict( - type=float, - nargs="*", - default=[1.0], - help=( - "A list of inverse-temperatures to use for parallel tempered chains. " - "These values must be between 0 and 1 and will automatically be sorted in " - "ascending order. The cold chain value of 1.0 will be added automatically if " - "it is not specified." - ), - ), -) - -sample_mcmc_temperatures = Parameter( - "--sample-mcmc-temperatures", dict( type=str, - nargs=1, - default=[None], - help=( - "A file containing a list of samples with mcmc (inverse) temperatures. " - "Each line of the file should start with a sample identifier followed by " - "tab seperated numeric values between 0 and 1. " - "The number of temperatures specified may vary between samples. " - "Samples not listed in this file will use the default values specified " - "with the --mcmc-temperatures argument." + nargs="*", + default=["1.0"], + help=( + "Specify inverse-temperatures to use for parallel tempered chains (default = 1.0 i.e., no tempering). " + "This may be either (1) a list of floating point values or " + "(2) a file containing a list of samples with mcmc inverse-temperatures. " + "If option (2) is used then the file must contain a single sample per line " + "followed by a list of tab separated inverse temperatures. " + "The number of inverse-temperatures may differ between samples and any samples " + "not included in the list will default to not using tempering." ), ), ) @@ -589,12 +524,8 @@ def add_to(self, parser): DEFAULT_PARSER_ARGUMENTS = [ bam, - bam_list, - sample_bam, ploidy, - sample_ploidy, inbreeding, - sample_inbreeding, sample_pool, base_error_rate, ignore_base_phred_scores, @@ -642,21 +573,19 @@ def add_to(self, parser): mcmc_dosage_step_probability, mcmc_partial_dosage_step_probability, mcmc_temperatures, - sample_mcmc_temperatures, haplotype_posterior_threshold, ] ) -def parse_sample_bam_paths(arguments): +def parse_sample_bam_paths(bam_argument, sample_pool_argument, read_group_field): """Combine arguments relating to sample bam file specification. Parameters ---------- - arguments - Parsed arguments containing some combination of - arguments for "bam", "bam_list", "sample_bam", and - "sample_list". + argument : list[str] + list of bam filepaths or single plaintext filepath + read_group_field : str Returns ------- @@ -665,102 +594,98 @@ def parse_sample_bam_paths(arguments): sample_bam : dict Dict mapping samples to bam paths. """ - sample_bams = dict() - - # bam paths - bams = [] - if hasattr(arguments, "bam"): - bams = arguments.bam - if hasattr(arguments, "bam_list"): - path = arguments.bam_list[0] - if path: - with open(arguments.bam_list[0]) as f: - bams += [line.strip() for line in f.readlines()] - if len(bams) != len(set(bams)): - raise IOError("Duplicate input bams") - sample_bams.update(extract_sample_ids(bams, id=arguments.read_group_field[0])) - - # sample-bams map - if hasattr(arguments, "sample_bam"): - # only use values in sample_bam file - path = arguments.sample_bam[0] - if path and len(sample_bams) > 0: - raise IOError( - "The --sample-bam argument cannot be combined with --bam or --bam-list." - ) - elif path: - with open(path) as f: - for line in f.readlines(): - sample, bam = line.strip().split("\t") - sample_bams[sample] = bam - - samples = list(sample_bams.keys()) - if len(samples) != len(set(samples)): - raise IOError("Duplicate input samples") - - # samples as pools. TODO: multi-pools - pool = arguments.sample_pool[0] - if pool is None: + + # case of list of bam paths + textfile = False + if len(bam_argument) == 1: + try: + pysam.AlignmentFile(bam_argument[0]) + except ValueError: + # not a bam + textfile = True + else: + bams = bam_argument + else: + bams = bam_argument + if not textfile: + sample_bams = extract_sample_ids(bams, id=read_group_field) + samples = list(sample_bams) + + # case of plain-text filepath + if textfile: + with open(bam_argument[0]) as f: + lines = [line.strip().split("\t") for line in f.readlines()] + n_fields = len(lines[0]) + for line in lines: + if len(line) != n_fields: + raise ValueError("Inconsistent number of fields") + if n_fields == 1: + # list of bam paths + bams = [line[0] for line in lines] + sample_bams = extract_sample_ids(bams, id=read_group_field) + samples = list(sample_bams) + elif n_fields == 2: + # list of sample-bam pairs + samples = [line[0] for line in lines] + sample_bams = dict(lines) + else: + raise ValueError("Too many fields") + + # handle sample pooling + if sample_pool_argument is None: + # pools of 1 sample_bams = {k: [(k, v)] for k, v in sample_bams.items()} else: - samples = [pool] - sample_bams = {pool: [(k, v) for k, v in sample_bams.items()]} + # pool all samples + samples = [sample_pool_argument] + sample_bams = {sample_pool_argument: [(k, v) for k, v in sample_bams.items()]} + # TODO: multiple pools return samples, sample_bams -def parse_sample_value_map(arguments, samples, default, sample_map, type): +def parse_sample_value_map(argument, samples, type): """Combine arguments specified for a default value and sample-value map file. Parameters ---------- - arguments - Parsed arguments containing some the default value argument - and the optionally the sample-value map file argument. + argument : str + Argument to parse. samples : list List of sample names - default : str - Name of argument with default value. - sample_map : str - Path of file containing tab-seperated per sample values. type : type - Type of the specified values. + Type of the specified values (float or int). Returns ------- sample_values : dict Dict mapping samples to values. """ - sample_value = dict() - assert hasattr(arguments, default) - # sample value map - if hasattr(arguments, sample_map): - path = getattr(arguments, sample_map)[0] - if path: - with open(path) as f: - for line in f.readlines(): - sample, value = line.strip().split("\t") - sample_value[sample] = type(value) - # default value - default_value = getattr(arguments, default)[0] - for sample in samples: - if sample in sample_value: - pass - else: - sample_value[sample] = default_value - return sample_value + if (type is int) and argument.isdigit(): + value = int(argument) + return {s: value for s in samples} + if (type is float) and argument.replace(".", "", 1).isdigit(): + value = float(argument) + return {s: value for s in samples} + data = dict() + with open(argument) as f: + for line in f.readlines(): + sample, value = line.strip().split("\t") + data[sample] = type(value) + for s in samples: + if s not in data: + raise ValueError("Sample '{}' not found in file '{}'".format(s, argument)) + return data -def parse_sample_temperatures(arguments, samples): +def parse_sample_temperatures(mcmc_temperatures_argument, samples): """Parse inverse temperatures for MCMC simulation with parallel-tempering. Parameters ---------- - arguments - Parsed arguments containing the "mcmc_temperatures" - argument and optionally the "sample_mcmc_temperatures" - argument. + mcmc_temperatures_argument : str + Value(s) for mcmc_temperatures. samples : list List of samples. @@ -770,37 +695,37 @@ def parse_sample_temperatures(arguments, samples): Dict mapping each sample to a list of temperatures (floats). """ - assert hasattr(arguments, "mcmc_temperatures") - # per sample mcmc temperatures - sample_mcmc_temperatures = dict() - if hasattr(arguments, "sample_mcmc_temperatures"): - path = arguments.sample_mcmc_temperatures[0] - if path: - with open(path) as f: - for line in f.readlines(): - values = line.strip().split("\t") - sample = values[0] - temps = [float(v) for v in values[1:]] - temps.sort() - assert temps[0] > 0.0 - assert temps[-1] <= 1.0 - if temps[-1] != 1.0: - temps.append(1.0) - sample_mcmc_temperatures[sample] = temps - - # default mcmc temperatures - temps = arguments.mcmc_temperatures - temps.sort() - assert temps[0] > 0.0 - assert temps[-1] <= 1.0 - if temps[-1] != 1.0: - temps.append(1.0) - for sample in samples: - if sample in sample_mcmc_temperatures: - pass - else: - sample_mcmc_temperatures[sample] = temps - return sample_mcmc_temperatures + if len(mcmc_temperatures_argument) > 1: + # must be a list of temps + floats = True + elif mcmc_temperatures_argument[0].replace(".", "", 1).isdigit(): + # must be a single temp + floats = True + else: + floats = False + if floats: + temps = [float(s) for s in mcmc_temperatures_argument] + temps.sort() + assert temps[0] > 0.0 + assert temps[-1] <= 1.0 + if temps[-1] != 1.0: + temps.append(1.0) + return {s: temps for s in samples} + # case of a file, default to 1.0 + data = {s: [1.0] for s in samples} + with open(mcmc_temperatures_argument[0]) as f: + for line in f.readlines(): + values = line.strip().split("\t") + sample = values[0] + temps = [float(v) for v in values[1:]] + temps.sort() + assert temps[0] > 0.0 + assert temps[-1] <= 1.0 + if temps[-1] != 1.0: + temps.append(1.0) + data[sample] = temps + assert len(samples) == len(data) + return data def collect_default_program_arguments(arguments): @@ -811,19 +736,17 @@ def collect_default_program_arguments(arguments): "Cannot ignore base phred scores if --base-error-rate is 0" ) # merge sample specific data with defaults - samples, sample_bams = parse_sample_bam_paths(arguments) + samples, sample_bams = parse_sample_bam_paths( + arguments.bam, arguments.sample_pool[0], arguments.read_group_field[0] + ) sample_ploidy = parse_sample_value_map( - arguments, + arguments.ploidy[0], samples, - default="ploidy", - sample_map="sample_ploidy", type=int, ) sample_inbreeding = parse_sample_value_map( - arguments, + arguments.inbreeding[0], samples, - default="inbreeding", - sample_map="sample_inbreeding", type=float, ) return dict( @@ -882,7 +805,7 @@ def collect_assemble_mcmc_program_arguments(arguments): raise ValueError("Cannot combine --targets and --region arguments.") data = collect_default_mcmc_program_arguments(arguments) sample_mcmc_temperatures = parse_sample_temperatures( - arguments, samples=data["samples"] + arguments.mcmc_temperatures, samples=data["samples"] ) data.update( dict( diff --git a/mchap/tests/test_application_assemble.py b/mchap/tests/test_application_assemble.py index c999dabd..978c96d4 100644 --- a/mchap/tests/test_application_assemble.py +++ b/mchap/tests/test_application_assemble.py @@ -102,32 +102,30 @@ def test_Program__cli_lists(): with open(tmp_sample_ploidy, "w") as f: f.write("SAMPLE3\t2\n") f.write("SAMPLE1\t6\n") - # SAMPLE4 uses default + f.write("SAMPLE2\t4\n") tmp_sample_inbreeding = dirpath + "/sample-inbreeding.txt" with open(tmp_sample_inbreeding, "w") as f: f.write("SAMPLE3\t0.1\n") f.write("SAMPLE1\t0.2\n") - # SAMPLE4 uses default + f.write("SAMPLE2\t0.0\n") tmp_sample_mcmc_temperatures = dirpath + "/sample-mcmc-temperatures.txt" with open(tmp_sample_mcmc_temperatures, "w") as f: f.write("SAMPLE3\t0.8\t0.1\t1\t0.2\n") # out of order f.write("SAMPLE1\t0.2\n") # missing cold chain - # SAMPLE4 uses default + # SAMPLE2 uses default command = [ "mchap", "assemble", - "--bam-list", + "--bam", tmp_bam_list, - "--sample-ploidy", - tmp_sample_ploidy, "--ploidy", - "4", - "--sample-inbreeding", + tmp_sample_ploidy, + "--inbreeding", tmp_sample_inbreeding, - "--sample-mcmc-temperatures", + "--mcmc-temperatures", tmp_sample_mcmc_temperatures, "--targets", BED, @@ -474,7 +472,7 @@ def test_Program__run_stdout__region(region, region_id, cache_threshold): command = [ "mchap", "assemble", - "--sample-bam", + "--bam", tmp_sample_bams, "--ploidy", "4", diff --git a/mchap/tests/test_docs.py b/mchap/tests/test_docs.py index 0980a063..d1d83175 100644 --- a/mchap/tests/test_docs.py +++ b/mchap/tests/test_docs.py @@ -4,7 +4,7 @@ import subprocess -@pytest.mark.skipif(sys.version_info < (3, 9), reason="argparse formatting changed") +@pytest.mark.skipif(sys.version_info != (3, 9), reason="argparse formatting changed") @pytest.mark.parametrize( "subtool", [ diff --git a/setup.py b/setup.py index 14642f39..50cd6092 100644 --- a/setup.py +++ b/setup.py @@ -40,9 +40,9 @@ def read_file(file_name): "Intended Audience :: Science/Research", "Natural Language :: English", "Operating System :: POSIX :: Linux", - "Programming Language :: Python :: 3.7", "Programming Language :: Python :: 3.8", "Programming Language :: Python :: 3.9", + "Programming Language :: Python :: 3.10", "Topic :: Scientific/Engineering :: Bio-Informatics", ], )