From 3402b19bb2882dc19a02af731a696eb225af5f6d Mon Sep 17 00:00:00 2001 From: Tavi Nathanson Date: Tue, 25 Aug 2015 17:13:35 -0400 Subject: [PATCH 1/3] Add NetMHCIIpan to mhctools --- mhctools/__init__.py | 2 + mhctools/alleles.py | 129 +++++++++++++++++-------- mhctools/base_commandline_predictor.py | 11 ++- mhctools/base_predictor.py | 13 ++- mhctools/common.py | 4 +- mhctools/file_formats.py | 16 ++- mhctools/netmhcii_pan.py | 106 ++++++++++++++++++++ setup.py | 2 +- test/test_mhc_allele_names.py | 52 +++++++--- test/test_netmhcii_pan.py | 32 ++++++ 10 files changed, 300 insertions(+), 67 deletions(-) create mode 100644 mhctools/netmhcii_pan.py create mode 100644 test/test_netmhcii_pan.py diff --git a/mhctools/__init__.py b/mhctools/__init__.py index b3a3740..2eefd7c 100644 --- a/mhctools/__init__.py +++ b/mhctools/__init__.py @@ -11,6 +11,7 @@ from .netmhc import NetMHC from .netmhc_cons import NetMHCcons from .netmhc_pan import NetMHCpan +from .netmhcii_pan import NetMHCIIpan from .random_predictor import RandomBindingPredictor __all__ = [ @@ -25,5 +26,6 @@ "NetMHC", "NetMHCcons", "NetMHCpan", + "NetMHCIIpan", "RandomBindingPredictor", ] diff --git a/mhctools/alleles.py b/mhctools/alleles.py index 0f6178b..8715344 100644 --- a/mhctools/alleles.py +++ b/mhctools/alleles.py @@ -26,7 +26,7 @@ class AlleleParseError(Exception): dog="DLA", sheep=["OVA", "Ovar", "Ovca"], swine="SLA", - mouse=["H2"], + mouse=["H2", "H-2"], rainbow_trout="Onmy", rat=["Rano", "Rara", "RT1"], salmon="Sasa", @@ -114,6 +114,43 @@ def _parse_mouse_allele_name(name): return gene_name.upper(), allele.lower() def parse_allele_name(name): + """ + Handle different forms of both single and alpha-beta allele names. + Alpha-beta alleles may look like: + + DPA10105-DPB110001 + HLA-DPA1*01:05-DPB1*100:01 + hla-dpa1*0105-dpb1*10001 + dpa1*0105-dpb1*10001 + """ + species, name = split_species(name) + parts = name.split("-") + assert len(parts) <= 2, "Allele has too many parts: %s" % name + if len(parts) == 1: + return (parse_single_allele_name(name, species),) + else: + return (parse_single_allele_name(parts[0], species), + parse_single_allele_name(parts[1], species)) + +def split_species(name): + """ + Splits off the species component of the allele name from the rest of it. + + Given "HLA-A*02:01", returns ("HLA", "A*02:01"). + """ + species = None + for species_list in SPECIES_PREFIXES.values(): + if isinstance(species_list, str): + species_list = [species_list] + for curr_species in species_list: + prefix = name[:len(curr_species) + 1].upper() + if prefix == (curr_species.upper() + "-"): + species = curr_species + name = name[len(curr_species) + 1:] + return (species, name) + return (species, name) + +def parse_single_allele_name(name, species=None): """Takes an allele name and splits it into four parts: 1) species prefix 2) gene name @@ -141,20 +178,17 @@ def parse_allele_name(name): if len(name) == 0: raise ValueError("Can't normalize empty MHC allele name") - if name.upper().startswith("H2") or name.upper().startswith("H-2"): - gene, allele_code = _parse_mouse_allele_name(name) + species_from_name, name = split_species(name) + if species: + assert not species_from_name, ("If a species is passed in, we better not have another " + "species in the name itself.") + else: + species = species_from_name + + if species == "H-2" or species == "H2": + gene, allele_code = _parse_mouse_allele_name("H-2-" + name) # mice don't have allele families return AlleleName("H-2", gene, "", allele_code) - species = None - for species_list in SPECIES_PREFIXES.values(): - if isinstance(species_list, str): - species_list = [species_list] - for curr_species in species_list: - prefix = name[:len(curr_species) + 1].upper() - if prefix == (curr_species.upper() + "-"): - species = curr_species - name = name[len(curr_species) + 1:] - break if len(name) == 0: raise AlleleParseError("Incomplete MHC allele name: %s" % (original,)) @@ -165,10 +199,10 @@ def parse_allele_name(name): raise AlleleParseError("Can't parse allele name: %s" % original) species = "HLA" - if name[0] == "D": + if name[0].upper() == "D": # MHC class II genes like "DQA1" need to be parsed with both # letters and numbers - gene, name = _parse_alphanum(name) + gene, name = _parse_alphanum(name, 4) elif name[0].isalpha(): # if there are more separators to come, then assume the gene names # can have the form "DQA1" @@ -216,7 +250,19 @@ def parse_allele_name(name): _normalized_allele_cache = {} def normalize_allele_name(raw_allele): - """MHC alleles are named with a frustatingly loose system. It's not uncommon + """Cache repeated calls to non-compact normalized alleles. + + See normalize_allele_name_uncache for details. + Also see parse_allele_name for details regarding alpha-beta pairs. + """ + if raw_allele in _normalized_allele_cache: + return _normalized_allele_cache[raw_allele] + normalized = normalize_allele_name_uncached(raw_allele, is_compact=False) + _normalized_allele_cache[raw_allele] = normalized + return normalized + +def normalize_allele_name_uncached(raw_allele, is_compact=False): + """MHC alleles are named with a frustratingly loose system. It's not uncommon to see dozens of different forms for the same allele. For example, these all refer to the same MHC sequence: @@ -252,37 +298,38 @@ def normalize_allele_name(raw_allele): These should all be normalized to: HLA-A*02:01 """ - if raw_allele in _normalized_allele_cache: - return _normalized_allele_cache[raw_allele] - parsed_allele = parse_allele_name(raw_allele) - if len(parsed_allele.allele_family) > 0: - normalized = "%s-%s*%s:%s" % ( - parsed_allele.species, - parsed_allele.gene, - parsed_allele.allele_family, - parsed_allele.allele_code) - else: - # mice don't have allele families - # e.g. H-2-Kd - # species = H-2 - # gene = K - # allele = d - normalized = "%s-%s%s" % ( - parsed_allele.species, - parsed_allele.gene, - parsed_allele.allele_code) - _normalized_allele_cache[raw_allele] = normalized + parsed_alleles = parse_allele_name(raw_allele) + species = parsed_alleles[0].species + normalized = species if not is_compact else "" + for parsed_allele in parsed_alleles: + assert parsed_allele.species == species, \ + "Multiple species referenced in the same allele name: %s and %s" % ( + parsed_allele.species, species) + if len(parsed_allele.allele_family) > 0: + normalized += "%s%s%s%s%s%s" % ( + "-" if not is_compact else "", + parsed_allele.gene, + "*" if not is_compact else "", + parsed_allele.allele_family, + ":" if not is_compact else "", + parsed_allele.allele_code) + else: + # mice don't have allele families + # e.g. H-2-Kd + # species = H-2 + # gene = K + # allele = d + normalized += "%s%s%s" % ( + "-" if not is_compact else "", + parsed_allele.gene, + parsed_allele.allele_code) return normalized def compact_allele_name(raw_allele): """ Turn HLA-A*02:01 into A0201 or H-2-D-b into H-2Db """ - parsed_allele = parse_allele_name(raw_allele) - return "%s%s%s" % ( - parsed_allele.gene, - parsed_allele.allele_family, - parsed_allele.allele_code) + return normalize_allele_name_uncached(raw_allele, is_compact=True) def _parse_substring(allele, pred, max_len=None): """ diff --git a/mhctools/base_commandline_predictor.py b/mhctools/base_commandline_predictor.py index 8334cc4..4393beb 100644 --- a/mhctools/base_commandline_predictor.py +++ b/mhctools/base_commandline_predictor.py @@ -32,7 +32,8 @@ def __init__( command, alleles, epitope_lengths, - supported_allele_flag='-listMHC'): + supported_allele_flag='-listMHC', + normalize_allele_func=normalize_allele_name): self.name = name self.command = command self.supported_allele_flag = supported_allele_flag @@ -59,7 +60,8 @@ def __init__( self, alleles, epitope_lengths, - valid_alleles=valid_alleles) + valid_alleles=valid_alleles, + normalize_allele_func=normalize_allele_func) @staticmethod def _determine_valid_alleles(command, supported_allele_flag): @@ -80,8 +82,9 @@ def _determine_valid_alleles(command, supported_allele_flag): line = line.strip() if not line.startswith('#') and len(line) > 0: try: - allele = normalize_allele_name(line) - valid_alleles.add(allele) + # We don't need to normalize, as this is the actual allele name that + # the predictor is expecting. + valid_alleles.add(line) except AlleleParseError as error: logging.info("Skipping allele %s: %s" % ( line, error)) diff --git a/mhctools/base_predictor.py b/mhctools/base_predictor.py index 472a173..d56a5b1 100644 --- a/mhctools/base_predictor.py +++ b/mhctools/base_predictor.py @@ -28,7 +28,8 @@ def __init__( self, alleles, epitope_lengths, - valid_alleles=None): + valid_alleles=None, + normalize_allele_func=normalize_allele_name): """ Parameters ---------- @@ -44,12 +45,17 @@ def __init__( valid_alleles : list, optional If given, constrain HLA alleles to be contained within this set. + + normalize_allele_func : function, optional + If given, normalize alleles using this function when + comparing them with the predictor's valid set of + alleles. """ # I find myself often constructing a predictor with just one allele # so as a convenience, allow user to not wrap that allele as a list if isinstance(alleles, str): alleles = [alleles] - self.alleles = self._check_hla_alleles(alleles, valid_alleles) + self.alleles = self._check_hla_alleles(alleles, normalize_allele_func, valid_alleles) if isinstance(epitope_lengths, int): epitope_lengths = [epitope_lengths] @@ -79,6 +85,7 @@ def __str__(self): @staticmethod def _check_hla_alleles( alleles, + normalize_allele_func, valid_alleles=None): """ Given a list of HLA alleles and an optional list of valid @@ -87,7 +94,7 @@ def _check_hla_alleles( """ require_iterable_of(alleles, str, "HLA alleles") alleles = [ - normalize_allele_name(allele.strip().upper()) + normalize_allele_func(allele.strip().upper()) for allele in alleles ] if valid_alleles: diff --git a/mhctools/common.py b/mhctools/common.py index fac3730..ff3047d 100644 --- a/mhctools/common.py +++ b/mhctools/common.py @@ -21,7 +21,7 @@ except NameError: string_classes = (str,) -def seq_to_str(obj): +def seq_to_str(obj, sep=","): """ Given a sequence convert it to a comma separated string. If, however, the argument is a single object, return its string @@ -30,7 +30,7 @@ def seq_to_str(obj): if isinstance(obj, string_classes): return obj elif isinstance(obj, (list, tuple)): - return ",".join([str(x) for x in obj]) + return sep.join([str(x) for x in obj]) else: return str(obj) diff --git a/mhctools/file_formats.py b/mhctools/file_formats.py index 8af35dd..e7d7772 100644 --- a/mhctools/file_formats.py +++ b/mhctools/file_formats.py @@ -169,7 +169,9 @@ def parse_netmhcpan_stdout( prediction_method_name="netmhcpan", sequence_key_mapping=None): """ - Parse the output format for NetMHCpan and NetMHCcons, which looks like: + Parse the output format for NetMHCpan, NetMHCcons, and NetMHCIIpan*, which looks like: + + * netMHCIIpan has two extra fields # Affinity Threshold for Strong binding peptides 50.000', # Affinity Threshold for Weak binding peptides 500.000', @@ -196,11 +198,17 @@ def parse_netmhcpan_stdout( fasta_dictionary=fasta_dictionary, prediction_method_name=prediction_method_name) + # netMHCIIpan has some extra fields + is_mhciipan = prediction_method_name == "netmhciipan" + n_required_fields = 9 if is_mhciipan else 7 + for fields in split_stdout_lines(netmhc_output): - n_required_fields = 7 if len(fields) >= n_required_fields: - pos, allele, peptide, key, log_affinity, ic50, rank = \ - fields[:n_required_fields] + if is_mhciipan: + pos, allele, peptide, key, Pos, Core, log_affinity, ic50, rank = ( + fields[:n_required_fields]) + else: + pos, allele, peptide, key, log_affinity, ic50, rank = fields[:n_required_fields] try: pos = int(pos) allele = str(allele) diff --git a/mhctools/netmhcii_pan.py b/mhctools/netmhcii_pan.py new file mode 100644 index 0000000..ddc1b14 --- /dev/null +++ b/mhctools/netmhcii_pan.py @@ -0,0 +1,106 @@ +# Copyright (c) 2014. Mount Sinai School of Medicine +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +from __future__ import print_function, division, absolute_import +import tempfile +import logging + +from .alleles import parse_allele_name +from .base_commandline_predictor import BaseCommandlinePredictor +from .cleanup_context import CleanupFiles +from .common import check_sequence_dictionary, seq_to_str +from .file_formats import create_input_fasta_files, parse_netmhcpan_stdout +from .process_helpers import AsyncProcess + +class NetMHCIIpan(BaseCommandlinePredictor): + + def __init__( + self, + alleles, + netmhc_command="netMHCIIpan", + epitope_lengths=[15, 16, 17, 18, 19, 20]): + + def normalize_allele(allele_name): + """ + netMHCIIpan has some unique requirements for allele formats, + expecting the following forms: + - DRB1_0101 (for standard alleles) + - HLA-DQA10501-DQB10636 (for specifying alpha and beta alleles) + """ + parsed_alleles = parse_allele_name(allele_name) + if len(parsed_alleles) == 1: + return "%s_%s%s" % (parsed_alleles[0].gene, + parsed_alleles[0].allele_family, + parsed_alleles[0].allele_code) + else: + return "HLA-%s%s%s-%s%s%s" % ( + parsed_alleles[0].gene, + parsed_alleles[0].allele_family, + parsed_alleles[0].allele_code, + parsed_alleles[1].gene, + parsed_alleles[1].allele_family, + parsed_alleles[1].allele_code) + + BaseCommandlinePredictor.__init__( + self, + name="NetMHCIIpan", + command=netmhc_command, + alleles=alleles, + epitope_lengths=epitope_lengths, + supported_allele_flag="-list", + normalize_allele_func=normalize_allele) + + def predict(self, fasta_dictionary): + fasta_dictionary = check_sequence_dictionary(fasta_dictionary) + input_filenames, sequence_key_mapping = create_input_fasta_files( + fasta_dictionary) + # TODO: We are not currently using the file chunking + # functionality here. See NetMHCcons. + input_filename = input_filenames[0] + + alleles_str = \ + ",".join(allele.replace("*", "") for allele in self.alleles) + output_file = tempfile.NamedTemporaryFile( + "w", + prefix="netMHCIIpan_output", + delete=False) + args = [ + self.command, + "-length", seq_to_str(self.epitope_lengths, sep=" "), + "-f", input_filename, + "-a", alleles_str + ] + logging.info(" ".join(args)) + + with CleanupFiles( + filenames=[input_filename], + files=[output_file]): + process = AsyncProcess( + args=args, + redirect_stdout_file=output_file) + process.wait() + # need to flush written output and re-open for read + output_file.close() + with open(output_file.name, 'r') as f: + file_contents = f.read() + epitope_collection = parse_netmhcpan_stdout( + file_contents, + sequence_key_mapping=sequence_key_mapping, + fasta_dictionary=fasta_dictionary, + prediction_method_name="netmhciipan") + + if len(epitope_collection) == 0: + logging.warn(file_contents) + raise ValueError("No epitopes from netMHCIIpan") + return epitope_collection diff --git a/setup.py b/setup.py index f88fb10..0d0eb38 100644 --- a/setup.py +++ b/setup.py @@ -37,7 +37,7 @@ if __name__ == '__main__': setup( name='mhctools', - version="0.1.4", + version="0.1.5", description="Python interface to running command-line and web-based MHC binding predictors", author="Alex Rubinsteyn", author_email="alex {dot} rubinsteyn {at} mssm {dot} edu", diff --git a/test/test_mhc_allele_names.py b/test/test_mhc_allele_names.py index 5c16cf4..27f1408 100644 --- a/test/test_mhc_allele_names.py +++ b/test/test_mhc_allele_names.py @@ -17,8 +17,6 @@ # - RT1-9.5*f # - RT1-M3-1*av1 -# TODO: test human and mice class II alleles - hla_02_01_names = [ "HLA-A*02:01", "HLA-A*0201", @@ -31,9 +29,9 @@ "A0201", "A2", "A2:01", - "HLA-A2" + "HLA-A2", # lower case - "hla-a*0201" + "hla-a*0201", "a*0201", "a*02:01", "a0201" @@ -63,40 +61,70 @@ def test_macaque_alleles(): eq_(compact_allele_name(allele_name), "B0702") def test_dog_class2_allele(): - eq_(parse_allele_name("DLA-DQA1*00101"), + eq_(parse_allele_name("DLA-DQA1*00101")[0], AlleleName("DLA", "DQA1", "01", "01")) def test_sheep_class1_allele(): - eq_(parse_allele_name("Ovar-N*50001"), + eq_(parse_allele_name("Ovar-N*50001")[0], AlleleName("Ovar", "N", "500", "01")) def test_sheep_class2_allele(): - eq_(parse_allele_name("Ovar-DRB1*0804"), + eq_(parse_allele_name("Ovar-DRB1*0804")[0], AlleleName("Ovar", "DRB1", "08", "04")) def test_mouse_class1_alleles_H2_Kk(): # H2-Kk - eq_(parse_allele_name("H2-Kk"), + eq_(parse_allele_name("H2-Kk")[0], AlleleName("H-2", "K", "", "k")) eq_(normalize_allele_name("H2-Kk"), "H-2-Kk") eq_(compact_allele_name("H-2-Kk"), "Kk") # with a hyphen in "H-2" - eq_(parse_allele_name("H-2-Kk"), + eq_(parse_allele_name("H-2-Kk")[0], AlleleName("H-2", "K", "", "k")) eq_(normalize_allele_name("H-2-Kk"), "H-2-Kk") eq_(compact_allele_name("H-2-Kk"), "Kk") - def test_mouse_class1_alleles_H2_Db(): # H2-Db - eq_(parse_allele_name("H2-Db"), + eq_(parse_allele_name("H2-Db")[0], AlleleName("H-2", "D", "", "b")) eq_(normalize_allele_name("H2-Db"), "H-2-Db") eq_(compact_allele_name("H2-Db"), "Db") # with hyphen in "H-2" - eq_(parse_allele_name("H-2-Db"), + eq_(parse_allele_name("H-2-Db")[0], AlleleName("H-2", "D", "", "b")) eq_(normalize_allele_name("H-2-Db"), "H-2-Db") eq_(compact_allele_name("H-2-Db"), "Db") + +def test_human_class2(): + expected = "HLA-DRB1*01:02" + for name in ["DRB10102", + "DRB1*0102", + "HLA-DRB1*0102", + "HLA-DRB1*01:02"]: + result = normalize_allele_name(name) + eq_(result, expected) + +def test_human_class2_alpha_beta(): + expected = "HLA-DPA1*01:05-DPB1*100:01" + for name in ["DPA10105-DPB110001", + "HLA-DPA1*01:05-DPB1*100:01", + "hla-dpa1*0105-dpb1*10001", + "dpa1*0105-dpb1*10001"]: + result = normalize_allele_name(name) + eq_(result, expected) + +def test_mouse_class2_alleles(): + # H2-IAb + eq_(parse_allele_name("H2-IAb")[0], + AlleleName("H-2", "IA", "", "b")) + eq_(normalize_allele_name("H2-IAb"), "H-2-IAb") + eq_(compact_allele_name("H2-IAb"), "IAb") + + # with hyphen in "H-2" + eq_(parse_allele_name("H-2-IAb")[0], + AlleleName("H-2", "IA", "", "b")) + eq_(normalize_allele_name("H-2-IAb"), "H-2-IAb") + eq_(compact_allele_name("H-2-IAb"), "IAb") diff --git a/test/test_netmhcii_pan.py b/test/test_netmhcii_pan.py new file mode 100644 index 0000000..96728cb --- /dev/null +++ b/test/test_netmhcii_pan.py @@ -0,0 +1,32 @@ +from mhctools import NetMHCIIpan +from mhctools.alleles import normalize_allele_name + +def test_netmhcii_pan(): + alleles = [normalize_allele_name("HLA-DRB1*01:01")] + ii_pan_predictor = NetMHCIIpan( + alleles=alleles, + epitope_lengths=[15, 16]) + fasta_dictionary = { + "SMAD4-001": "PAPAPSWPLSSSVPSQKTYQGSYGFRLGFLHSGT", + "TP53-001": "SQAMDDLMLSPDDIEQWFTED" + } + epitope_collection = ii_pan_predictor.predict( + fasta_dictionary=fasta_dictionary) + + assert len(epitope_collection) == 27, \ + "Expected 27 epitopes from %s" % (epitope_collection,) + +def test_netmhcii_pan_alpha_beta(): + alleles = [normalize_allele_name("HLA-DPA1*01:05-DPB1*100:01")] + ii_pan_predictor = NetMHCIIpan( + alleles=alleles, + epitope_lengths=[15, 16]) + fasta_dictionary = { + "SMAD4-001": "PAPAPSWPLSSSVPSQKTYQGSYGFRLGFLHSGT", + "TP53-001": "SQAMDDLMLSPDDIEQWFTED" + } + epitope_collection = ii_pan_predictor.predict( + fasta_dictionary=fasta_dictionary) + + assert len(epitope_collection) == 27, \ + "Expected 27 epitopes from %s" % (epitope_collection,) From 2122dcb5adda1ec8a2f031b18e55c881d1c55cb7 Mon Sep 17 00:00:00 2001 From: Tavi Nathanson Date: Tue, 25 Aug 2015 17:24:51 -0400 Subject: [PATCH 2/3] Update README, fixes #27 --- README.md | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/README.md b/README.md index 398efdd..d0ffb2d 100644 --- a/README.md +++ b/README.md @@ -27,8 +27,10 @@ strongest_predicted_binder = epitope_collection[0] ``` ## API -The following models are available in `mhctools`: +The following models are available in `mhctools`: +* `NetMHC`: requires locally installed version of [NetMHC](http://www.cbs.dtu.dk/services/NetMHC/) * `NetMHCpan`: requires locally installed version of [NetMHCpan](http://www.cbs.dtu.dk/services/NetMHCpan/) +* `NetMHCIIpan`: requires locally installed version of [NetMHCIIpan](http://www.cbs.dtu.dk/services/NetMHCIIpan/) * `NetMHCcons`: requires locally installed version of [NetMHCcons](http://www.cbs.dtu.dk/services/NetMHCcons/) * `IedbMhcClass1`: Uses IEDB's REST API for class I binding predictions. * `IedbMhcClass2`: Uses IEDB's REST API for class II binding predictions. From 93e8ffc0b6d6590a994260760b85f38db4e6789b Mon Sep 17 00:00:00 2001 From: Tavi Nathanson Date: Tue, 25 Aug 2015 18:38:49 -0400 Subject: [PATCH 3/3] Address code review for iipan --- mhctools/alleles.py | 89 +++++++++++++++----------- mhctools/base_commandline_predictor.py | 13 ++-- mhctools/base_predictor.py | 13 +--- mhctools/file_formats.py | 11 ++-- mhctools/netmhcii_pan.py | 53 ++++++++------- test/test_mhc_allele_names.py | 31 +++++---- 6 files changed, 108 insertions(+), 102 deletions(-) diff --git a/mhctools/alleles.py b/mhctools/alleles.py index 8715344..73102b6 100644 --- a/mhctools/alleles.py +++ b/mhctools/alleles.py @@ -113,7 +113,7 @@ def _parse_mouse_allele_name(name): allele = name[0] return gene_name.upper(), allele.lower() -def parse_allele_name(name): +def parse_classi_or_classii_allele_name(name): """ Handle different forms of both single and alpha-beta allele names. Alpha-beta alleles may look like: @@ -122,17 +122,22 @@ def parse_allele_name(name): HLA-DPA1*01:05-DPB1*100:01 hla-dpa1*0105-dpb1*10001 dpa1*0105-dpb1*10001 + HLA-DPA1*01:05/DPB1*100:01 """ - species, name = split_species(name) + species, name = split_species_prefix(name) + + # Handle the case where alpha/beta pairs are separated with a /. + name = name.replace("/", "-") + parts = name.split("-") assert len(parts) <= 2, "Allele has too many parts: %s" % name if len(parts) == 1: - return (parse_single_allele_name(name, species),) + return (parse_allele_name(name, species),) else: - return (parse_single_allele_name(parts[0], species), - parse_single_allele_name(parts[1], species)) + return (parse_allele_name(parts[0], species), + parse_allele_name(parts[1], species)) -def split_species(name): +def split_species_prefix(name): """ Splits off the species component of the allele name from the rest of it. @@ -150,13 +155,16 @@ def split_species(name): return (species, name) return (species, name) -def parse_single_allele_name(name, species=None): +def parse_allele_name(name, species_prefix=None): """Takes an allele name and splits it into four parts: 1) species prefix 2) gene name 3) allele family 4) allele code + If species_prefix is provided, that is used instead of getting the species prefix from the name. + (And in that case, a species prefix in the name will result in an error being raised.) + For example, in all of the following inputs: "HLA-A*02:01" "A0201" @@ -178,10 +186,12 @@ def parse_single_allele_name(name, species=None): if len(name) == 0: raise ValueError("Can't normalize empty MHC allele name") - species_from_name, name = split_species(name) - if species: - assert not species_from_name, ("If a species is passed in, we better not have another " - "species in the name itself.") + species_from_name, name = split_species_prefix(name) + if species_prefix: + if species_from_name: + raise ValueError("If a species is passed in, we better not have another " + "species in the name itself.") + species = species_prefix else: species = species_from_name @@ -250,21 +260,12 @@ def parse_single_allele_name(name, species=None): _normalized_allele_cache = {} def normalize_allele_name(raw_allele): - """Cache repeated calls to non-compact normalized alleles. - - See normalize_allele_name_uncache for details. - Also see parse_allele_name for details regarding alpha-beta pairs. - """ - if raw_allele in _normalized_allele_cache: - return _normalized_allele_cache[raw_allele] - normalized = normalize_allele_name_uncached(raw_allele, is_compact=False) - _normalized_allele_cache[raw_allele] = normalized - return normalized - -def normalize_allele_name_uncached(raw_allele, is_compact=False): """MHC alleles are named with a frustratingly loose system. It's not uncommon to see dozens of different forms for the same allele. + Note: this function works with both class I and class II allele names (including + alpha/beta pairs). + For example, these all refer to the same MHC sequence: - HLA-A*02:01 - HLA-A02:01 @@ -298,38 +299,50 @@ def normalize_allele_name_uncached(raw_allele, is_compact=False): These should all be normalized to: HLA-A*02:01 """ - parsed_alleles = parse_allele_name(raw_allele) + if raw_allele in _normalized_allele_cache: + return _normalized_allele_cache[raw_allele] + parsed_alleles = parse_classi_or_classii_allele_name(raw_allele) species = parsed_alleles[0].species - normalized = species if not is_compact else "" + normalized_list = [species] for parsed_allele in parsed_alleles: - assert parsed_allele.species == species, \ - "Multiple species referenced in the same allele name: %s and %s" % ( - parsed_allele.species, species) if len(parsed_allele.allele_family) > 0: - normalized += "%s%s%s%s%s%s" % ( - "-" if not is_compact else "", + normalized_list.append("%s*%s:%s" % ( parsed_allele.gene, - "*" if not is_compact else "", parsed_allele.allele_family, - ":" if not is_compact else "", - parsed_allele.allele_code) + parsed_allele.allele_code)) else: # mice don't have allele families # e.g. H-2-Kd # species = H-2 # gene = K # allele = d - normalized += "%s%s%s" % ( - "-" if not is_compact else "", + normalized_list.append("%s%s" % ( parsed_allele.gene, - parsed_allele.allele_code) + parsed_allele.allele_code)) + normalized = "-".join(normalized_list) + _normalized_allele_cache[raw_allele] = normalized return normalized def compact_allele_name(raw_allele): """ - Turn HLA-A*02:01 into A0201 or H-2-D-b into H-2Db + Turn HLA-A*02:01 into A0201 or H-2-D-b into H-2Db or + HLA-DPA1*01:05-DPB1*100:01 into DPA10105-DPB110001 """ - return normalize_allele_name_uncached(raw_allele, is_compact=True) + parsed_alleles = parse_classi_or_classii_allele_name(raw_allele) + species = parsed_alleles[0].species + normalized_list = [] + for parsed_allele in parsed_alleles: + if len(parsed_allele.allele_family) > 0: + normalized_list.append("%s%s%s" % ( + parsed_allele.gene, + parsed_allele.allele_family, + parsed_allele.allele_code)) + else: + # mice don't have allele families + normalized_list.append("%s%s" % ( + parsed_allele.gene, + parsed_allele.allele_code)) + return "-".join(normalized_list) def _parse_substring(allele, pred, max_len=None): """ diff --git a/mhctools/base_commandline_predictor.py b/mhctools/base_commandline_predictor.py index 4393beb..2670e01 100644 --- a/mhctools/base_commandline_predictor.py +++ b/mhctools/base_commandline_predictor.py @@ -32,8 +32,7 @@ def __init__( command, alleles, epitope_lengths, - supported_allele_flag='-listMHC', - normalize_allele_func=normalize_allele_name): + supported_allele_flag='-listMHC'): self.name = name self.command = command self.supported_allele_flag = supported_allele_flag @@ -60,8 +59,7 @@ def __init__( self, alleles, epitope_lengths, - valid_alleles=valid_alleles, - normalize_allele_func=normalize_allele_func) + valid_alleles=valid_alleles) @staticmethod def _determine_valid_alleles(command, supported_allele_flag): @@ -82,9 +80,10 @@ def _determine_valid_alleles(command, supported_allele_flag): line = line.strip() if not line.startswith('#') and len(line) > 0: try: - # We don't need to normalize, as this is the actual allele name that - # the predictor is expecting. - valid_alleles.add(line) + # We need to normalize these alleles (the output of the predictor + # when it lists its supported alleles) so that they are comparable with + # our own alleles. + valid_alleles.add(normalize_allele_name(line)) except AlleleParseError as error: logging.info("Skipping allele %s: %s" % ( line, error)) diff --git a/mhctools/base_predictor.py b/mhctools/base_predictor.py index d56a5b1..472a173 100644 --- a/mhctools/base_predictor.py +++ b/mhctools/base_predictor.py @@ -28,8 +28,7 @@ def __init__( self, alleles, epitope_lengths, - valid_alleles=None, - normalize_allele_func=normalize_allele_name): + valid_alleles=None): """ Parameters ---------- @@ -45,17 +44,12 @@ def __init__( valid_alleles : list, optional If given, constrain HLA alleles to be contained within this set. - - normalize_allele_func : function, optional - If given, normalize alleles using this function when - comparing them with the predictor's valid set of - alleles. """ # I find myself often constructing a predictor with just one allele # so as a convenience, allow user to not wrap that allele as a list if isinstance(alleles, str): alleles = [alleles] - self.alleles = self._check_hla_alleles(alleles, normalize_allele_func, valid_alleles) + self.alleles = self._check_hla_alleles(alleles, valid_alleles) if isinstance(epitope_lengths, int): epitope_lengths = [epitope_lengths] @@ -85,7 +79,6 @@ def __str__(self): @staticmethod def _check_hla_alleles( alleles, - normalize_allele_func, valid_alleles=None): """ Given a list of HLA alleles and an optional list of valid @@ -94,7 +87,7 @@ def _check_hla_alleles( """ require_iterable_of(alleles, str, "HLA alleles") alleles = [ - normalize_allele_func(allele.strip().upper()) + normalize_allele_name(allele.strip().upper()) for allele in alleles ] if valid_alleles: diff --git a/mhctools/file_formats.py b/mhctools/file_formats.py index e7d7772..69e189f 100644 --- a/mhctools/file_formats.py +++ b/mhctools/file_formats.py @@ -167,9 +167,10 @@ def parse_netmhcpan_stdout( netmhc_output, fasta_dictionary, prediction_method_name="netmhcpan", - sequence_key_mapping=None): + sequence_key_mapping=None, + is_netmhcpanii=False): """ - Parse the output format for NetMHCpan, NetMHCcons, and NetMHCIIpan*, which looks like: + Parse the output format for NetMHCpan, NetMHCIIpan* and NetMHCcons, which looks like: * netMHCIIpan has two extra fields @@ -199,12 +200,10 @@ def parse_netmhcpan_stdout( prediction_method_name=prediction_method_name) # netMHCIIpan has some extra fields - is_mhciipan = prediction_method_name == "netmhciipan" - n_required_fields = 9 if is_mhciipan else 7 - + n_required_fields = 9 if is_netmhcpanii else 7 for fields in split_stdout_lines(netmhc_output): if len(fields) >= n_required_fields: - if is_mhciipan: + if is_netmhcpanii: pos, allele, peptide, key, Pos, Core, log_affinity, ic50, rank = ( fields[:n_required_fields]) else: diff --git a/mhctools/netmhcii_pan.py b/mhctools/netmhcii_pan.py index ddc1b14..dd6d96e 100644 --- a/mhctools/netmhcii_pan.py +++ b/mhctools/netmhcii_pan.py @@ -16,7 +16,7 @@ import tempfile import logging -from .alleles import parse_allele_name +from .alleles import parse_classi_or_classii_allele_name from .base_commandline_predictor import BaseCommandlinePredictor from .cleanup_context import CleanupFiles from .common import check_sequence_dictionary, seq_to_str @@ -30,36 +30,34 @@ def __init__( alleles, netmhc_command="netMHCIIpan", epitope_lengths=[15, 16, 17, 18, 19, 20]): - - def normalize_allele(allele_name): - """ - netMHCIIpan has some unique requirements for allele formats, - expecting the following forms: - - DRB1_0101 (for standard alleles) - - HLA-DQA10501-DQB10636 (for specifying alpha and beta alleles) - """ - parsed_alleles = parse_allele_name(allele_name) - if len(parsed_alleles) == 1: - return "%s_%s%s" % (parsed_alleles[0].gene, - parsed_alleles[0].allele_family, - parsed_alleles[0].allele_code) - else: - return "HLA-%s%s%s-%s%s%s" % ( - parsed_alleles[0].gene, - parsed_alleles[0].allele_family, - parsed_alleles[0].allele_code, - parsed_alleles[1].gene, - parsed_alleles[1].allele_family, - parsed_alleles[1].allele_code) - BaseCommandlinePredictor.__init__( self, name="NetMHCIIpan", command=netmhc_command, alleles=alleles, epitope_lengths=epitope_lengths, - supported_allele_flag="-list", - normalize_allele_func=normalize_allele) + supported_allele_flag="-list") + + def normalize_allele(self, allele_name): + """ + netMHCIIpan has some unique requirements for allele formats, + expecting the following forms: + - DRB1_0101 (for non-alpha/beta pairs) + - HLA-DQA10501-DQB10636 (for alpha and beta pairs) + """ + parsed_alleles = parse_classi_or_classii_allele_name(allele_name) + if len(parsed_alleles) == 1: + return "%s_%s%s" % (parsed_alleles[0].gene, + parsed_alleles[0].allele_family, + parsed_alleles[0].allele_code) + else: + return "HLA-%s%s%s-%s%s%s" % ( + parsed_alleles[0].gene, + parsed_alleles[0].allele_family, + parsed_alleles[0].allele_code, + parsed_alleles[1].gene, + parsed_alleles[1].allele_family, + parsed_alleles[1].allele_code) def predict(self, fasta_dictionary): fasta_dictionary = check_sequence_dictionary(fasta_dictionary) @@ -70,7 +68,7 @@ def predict(self, fasta_dictionary): input_filename = input_filenames[0] alleles_str = \ - ",".join(allele.replace("*", "") for allele in self.alleles) + ",".join(self.normalize_allele(allele) for allele in self.alleles) output_file = tempfile.NamedTemporaryFile( "w", prefix="netMHCIIpan_output", @@ -98,7 +96,8 @@ def predict(self, fasta_dictionary): file_contents, sequence_key_mapping=sequence_key_mapping, fasta_dictionary=fasta_dictionary, - prediction_method_name="netmhciipan") + prediction_method_name="netmhciipan", + is_netmhcpanii=True) if len(epitope_collection) == 0: logging.warn(file_contents) diff --git a/test/test_mhc_allele_names.py b/test/test_mhc_allele_names.py index 27f1408..88f6cc7 100644 --- a/test/test_mhc_allele_names.py +++ b/test/test_mhc_allele_names.py @@ -61,70 +61,73 @@ def test_macaque_alleles(): eq_(compact_allele_name(allele_name), "B0702") def test_dog_class2_allele(): - eq_(parse_allele_name("DLA-DQA1*00101")[0], + eq_(parse_allele_name("DLA-DQA1*00101"), AlleleName("DLA", "DQA1", "01", "01")) def test_sheep_class1_allele(): - eq_(parse_allele_name("Ovar-N*50001")[0], + eq_(parse_allele_name("Ovar-N*50001"), AlleleName("Ovar", "N", "500", "01")) def test_sheep_class2_allele(): - eq_(parse_allele_name("Ovar-DRB1*0804")[0], + eq_(parse_allele_name("Ovar-DRB1*0804"), AlleleName("Ovar", "DRB1", "08", "04")) def test_mouse_class1_alleles_H2_Kk(): # H2-Kk - eq_(parse_allele_name("H2-Kk")[0], + eq_(parse_allele_name("H2-Kk"), AlleleName("H-2", "K", "", "k")) eq_(normalize_allele_name("H2-Kk"), "H-2-Kk") eq_(compact_allele_name("H-2-Kk"), "Kk") # with a hyphen in "H-2" - eq_(parse_allele_name("H-2-Kk")[0], + eq_(parse_allele_name("H-2-Kk"), AlleleName("H-2", "K", "", "k")) eq_(normalize_allele_name("H-2-Kk"), "H-2-Kk") eq_(compact_allele_name("H-2-Kk"), "Kk") def test_mouse_class1_alleles_H2_Db(): # H2-Db - eq_(parse_allele_name("H2-Db")[0], + eq_(parse_allele_name("H2-Db"), AlleleName("H-2", "D", "", "b")) eq_(normalize_allele_name("H2-Db"), "H-2-Db") eq_(compact_allele_name("H2-Db"), "Db") # with hyphen in "H-2" - eq_(parse_allele_name("H-2-Db")[0], + eq_(parse_allele_name("H-2-Db"), AlleleName("H-2", "D", "", "b")) eq_(normalize_allele_name("H-2-Db"), "H-2-Db") eq_(compact_allele_name("H-2-Db"), "Db") def test_human_class2(): expected = "HLA-DRB1*01:02" + expected_compact = "DRB10102" for name in ["DRB10102", "DRB1*0102", "HLA-DRB1*0102", "HLA-DRB1*01:02"]: - result = normalize_allele_name(name) - eq_(result, expected) + eq_(normalize_allele_name(name), expected) + eq_(compact_allele_name(name), expected_compact) def test_human_class2_alpha_beta(): expected = "HLA-DPA1*01:05-DPB1*100:01" + expected_compact = "DPA10105-DPB110001" for name in ["DPA10105-DPB110001", "HLA-DPA1*01:05-DPB1*100:01", "hla-dpa1*0105-dpb1*10001", - "dpa1*0105-dpb1*10001"]: - result = normalize_allele_name(name) - eq_(result, expected) + "dpa1*0105-dpb1*10001", + "HLA-DPA1*01:05/DPB1*100:01"]: + eq_(normalize_allele_name(name), expected) + eq_(compact_allele_name(name), expected_compact) def test_mouse_class2_alleles(): # H2-IAb - eq_(parse_allele_name("H2-IAb")[0], + eq_(parse_allele_name("H2-IAb"), AlleleName("H-2", "IA", "", "b")) eq_(normalize_allele_name("H2-IAb"), "H-2-IAb") eq_(compact_allele_name("H2-IAb"), "IAb") # with hyphen in "H-2" - eq_(parse_allele_name("H-2-IAb")[0], + eq_(parse_allele_name("H-2-IAb"), AlleleName("H-2", "IA", "", "b")) eq_(normalize_allele_name("H-2-IAb"), "H-2-IAb") eq_(compact_allele_name("H-2-IAb"), "IAb")