diff --git a/README.md b/README.md index 398efdd..d0ffb2d 100644 --- a/README.md +++ b/README.md @@ -27,8 +27,10 @@ strongest_predicted_binder = epitope_collection[0] ``` ## API -The following models are available in `mhctools`: +The following models are available in `mhctools`: +* `NetMHC`: requires locally installed version of [NetMHC](http://www.cbs.dtu.dk/services/NetMHC/) * `NetMHCpan`: requires locally installed version of [NetMHCpan](http://www.cbs.dtu.dk/services/NetMHCpan/) +* `NetMHCIIpan`: requires locally installed version of [NetMHCIIpan](http://www.cbs.dtu.dk/services/NetMHCIIpan/) * `NetMHCcons`: requires locally installed version of [NetMHCcons](http://www.cbs.dtu.dk/services/NetMHCcons/) * `IedbMhcClass1`: Uses IEDB's REST API for class I binding predictions. * `IedbMhcClass2`: Uses IEDB's REST API for class II binding predictions. diff --git a/mhctools/__init__.py b/mhctools/__init__.py index b3a3740..2eefd7c 100644 --- a/mhctools/__init__.py +++ b/mhctools/__init__.py @@ -11,6 +11,7 @@ from .netmhc import NetMHC from .netmhc_cons import NetMHCcons from .netmhc_pan import NetMHCpan +from .netmhcii_pan import NetMHCIIpan from .random_predictor import RandomBindingPredictor __all__ = [ @@ -25,5 +26,6 @@ "NetMHC", "NetMHCcons", "NetMHCpan", + "NetMHCIIpan", "RandomBindingPredictor", ] diff --git a/mhctools/alleles.py b/mhctools/alleles.py index 0f6178b..73102b6 100644 --- a/mhctools/alleles.py +++ b/mhctools/alleles.py @@ -26,7 +26,7 @@ class AlleleParseError(Exception): dog="DLA", sheep=["OVA", "Ovar", "Ovca"], swine="SLA", - mouse=["H2"], + mouse=["H2", "H-2"], rainbow_trout="Onmy", rat=["Rano", "Rara", "RT1"], salmon="Sasa", @@ -113,13 +113,58 @@ def _parse_mouse_allele_name(name): allele = name[0] return gene_name.upper(), allele.lower() -def parse_allele_name(name): +def parse_classi_or_classii_allele_name(name): + """ + Handle different forms of both single and alpha-beta allele names. + Alpha-beta alleles may look like: + + DPA10105-DPB110001 + HLA-DPA1*01:05-DPB1*100:01 + hla-dpa1*0105-dpb1*10001 + dpa1*0105-dpb1*10001 + HLA-DPA1*01:05/DPB1*100:01 + """ + species, name = split_species_prefix(name) + + # Handle the case where alpha/beta pairs are separated with a /. + name = name.replace("/", "-") + + parts = name.split("-") + assert len(parts) <= 2, "Allele has too many parts: %s" % name + if len(parts) == 1: + return (parse_allele_name(name, species),) + else: + return (parse_allele_name(parts[0], species), + parse_allele_name(parts[1], species)) + +def split_species_prefix(name): + """ + Splits off the species component of the allele name from the rest of it. + + Given "HLA-A*02:01", returns ("HLA", "A*02:01"). + """ + species = None + for species_list in SPECIES_PREFIXES.values(): + if isinstance(species_list, str): + species_list = [species_list] + for curr_species in species_list: + prefix = name[:len(curr_species) + 1].upper() + if prefix == (curr_species.upper() + "-"): + species = curr_species + name = name[len(curr_species) + 1:] + return (species, name) + return (species, name) + +def parse_allele_name(name, species_prefix=None): """Takes an allele name and splits it into four parts: 1) species prefix 2) gene name 3) allele family 4) allele code + If species_prefix is provided, that is used instead of getting the species prefix from the name. + (And in that case, a species prefix in the name will result in an error being raised.) + For example, in all of the following inputs: "HLA-A*02:01" "A0201" @@ -141,20 +186,19 @@ def parse_allele_name(name): if len(name) == 0: raise ValueError("Can't normalize empty MHC allele name") - if name.upper().startswith("H2") or name.upper().startswith("H-2"): - gene, allele_code = _parse_mouse_allele_name(name) + species_from_name, name = split_species_prefix(name) + if species_prefix: + if species_from_name: + raise ValueError("If a species is passed in, we better not have another " + "species in the name itself.") + species = species_prefix + else: + species = species_from_name + + if species == "H-2" or species == "H2": + gene, allele_code = _parse_mouse_allele_name("H-2-" + name) # mice don't have allele families return AlleleName("H-2", gene, "", allele_code) - species = None - for species_list in SPECIES_PREFIXES.values(): - if isinstance(species_list, str): - species_list = [species_list] - for curr_species in species_list: - prefix = name[:len(curr_species) + 1].upper() - if prefix == (curr_species.upper() + "-"): - species = curr_species - name = name[len(curr_species) + 1:] - break if len(name) == 0: raise AlleleParseError("Incomplete MHC allele name: %s" % (original,)) @@ -165,10 +209,10 @@ def parse_allele_name(name): raise AlleleParseError("Can't parse allele name: %s" % original) species = "HLA" - if name[0] == "D": + if name[0].upper() == "D": # MHC class II genes like "DQA1" need to be parsed with both # letters and numbers - gene, name = _parse_alphanum(name) + gene, name = _parse_alphanum(name, 4) elif name[0].isalpha(): # if there are more separators to come, then assume the gene names # can have the form "DQA1" @@ -216,9 +260,12 @@ def parse_allele_name(name): _normalized_allele_cache = {} def normalize_allele_name(raw_allele): - """MHC alleles are named with a frustatingly loose system. It's not uncommon + """MHC alleles are named with a frustratingly loose system. It's not uncommon to see dozens of different forms for the same allele. + Note: this function works with both class I and class II allele names (including + alpha/beta pairs). + For example, these all refer to the same MHC sequence: - HLA-A*02:01 - HLA-A02:01 @@ -254,35 +301,48 @@ def normalize_allele_name(raw_allele): """ if raw_allele in _normalized_allele_cache: return _normalized_allele_cache[raw_allele] - parsed_allele = parse_allele_name(raw_allele) - if len(parsed_allele.allele_family) > 0: - normalized = "%s-%s*%s:%s" % ( - parsed_allele.species, - parsed_allele.gene, - parsed_allele.allele_family, - parsed_allele.allele_code) - else: - # mice don't have allele families - # e.g. H-2-Kd - # species = H-2 - # gene = K - # allele = d - normalized = "%s-%s%s" % ( - parsed_allele.species, - parsed_allele.gene, - parsed_allele.allele_code) + parsed_alleles = parse_classi_or_classii_allele_name(raw_allele) + species = parsed_alleles[0].species + normalized_list = [species] + for parsed_allele in parsed_alleles: + if len(parsed_allele.allele_family) > 0: + normalized_list.append("%s*%s:%s" % ( + parsed_allele.gene, + parsed_allele.allele_family, + parsed_allele.allele_code)) + else: + # mice don't have allele families + # e.g. H-2-Kd + # species = H-2 + # gene = K + # allele = d + normalized_list.append("%s%s" % ( + parsed_allele.gene, + parsed_allele.allele_code)) + normalized = "-".join(normalized_list) _normalized_allele_cache[raw_allele] = normalized return normalized def compact_allele_name(raw_allele): """ - Turn HLA-A*02:01 into A0201 or H-2-D-b into H-2Db + Turn HLA-A*02:01 into A0201 or H-2-D-b into H-2Db or + HLA-DPA1*01:05-DPB1*100:01 into DPA10105-DPB110001 """ - parsed_allele = parse_allele_name(raw_allele) - return "%s%s%s" % ( - parsed_allele.gene, - parsed_allele.allele_family, - parsed_allele.allele_code) + parsed_alleles = parse_classi_or_classii_allele_name(raw_allele) + species = parsed_alleles[0].species + normalized_list = [] + for parsed_allele in parsed_alleles: + if len(parsed_allele.allele_family) > 0: + normalized_list.append("%s%s%s" % ( + parsed_allele.gene, + parsed_allele.allele_family, + parsed_allele.allele_code)) + else: + # mice don't have allele families + normalized_list.append("%s%s" % ( + parsed_allele.gene, + parsed_allele.allele_code)) + return "-".join(normalized_list) def _parse_substring(allele, pred, max_len=None): """ diff --git a/mhctools/base_commandline_predictor.py b/mhctools/base_commandline_predictor.py index 8334cc4..2670e01 100644 --- a/mhctools/base_commandline_predictor.py +++ b/mhctools/base_commandline_predictor.py @@ -80,8 +80,10 @@ def _determine_valid_alleles(command, supported_allele_flag): line = line.strip() if not line.startswith('#') and len(line) > 0: try: - allele = normalize_allele_name(line) - valid_alleles.add(allele) + # We need to normalize these alleles (the output of the predictor + # when it lists its supported alleles) so that they are comparable with + # our own alleles. + valid_alleles.add(normalize_allele_name(line)) except AlleleParseError as error: logging.info("Skipping allele %s: %s" % ( line, error)) diff --git a/mhctools/common.py b/mhctools/common.py index fac3730..ff3047d 100644 --- a/mhctools/common.py +++ b/mhctools/common.py @@ -21,7 +21,7 @@ except NameError: string_classes = (str,) -def seq_to_str(obj): +def seq_to_str(obj, sep=","): """ Given a sequence convert it to a comma separated string. If, however, the argument is a single object, return its string @@ -30,7 +30,7 @@ def seq_to_str(obj): if isinstance(obj, string_classes): return obj elif isinstance(obj, (list, tuple)): - return ",".join([str(x) for x in obj]) + return sep.join([str(x) for x in obj]) else: return str(obj) diff --git a/mhctools/file_formats.py b/mhctools/file_formats.py index 8af35dd..69e189f 100644 --- a/mhctools/file_formats.py +++ b/mhctools/file_formats.py @@ -167,9 +167,12 @@ def parse_netmhcpan_stdout( netmhc_output, fasta_dictionary, prediction_method_name="netmhcpan", - sequence_key_mapping=None): + sequence_key_mapping=None, + is_netmhcpanii=False): """ - Parse the output format for NetMHCpan and NetMHCcons, which looks like: + Parse the output format for NetMHCpan, NetMHCIIpan* and NetMHCcons, which looks like: + + * netMHCIIpan has two extra fields # Affinity Threshold for Strong binding peptides 50.000', # Affinity Threshold for Weak binding peptides 500.000', @@ -196,11 +199,15 @@ def parse_netmhcpan_stdout( fasta_dictionary=fasta_dictionary, prediction_method_name=prediction_method_name) + # netMHCIIpan has some extra fields + n_required_fields = 9 if is_netmhcpanii else 7 for fields in split_stdout_lines(netmhc_output): - n_required_fields = 7 if len(fields) >= n_required_fields: - pos, allele, peptide, key, log_affinity, ic50, rank = \ - fields[:n_required_fields] + if is_netmhcpanii: + pos, allele, peptide, key, Pos, Core, log_affinity, ic50, rank = ( + fields[:n_required_fields]) + else: + pos, allele, peptide, key, log_affinity, ic50, rank = fields[:n_required_fields] try: pos = int(pos) allele = str(allele) diff --git a/mhctools/netmhcii_pan.py b/mhctools/netmhcii_pan.py new file mode 100644 index 0000000..dd6d96e --- /dev/null +++ b/mhctools/netmhcii_pan.py @@ -0,0 +1,105 @@ +# Copyright (c) 2014. Mount Sinai School of Medicine +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +from __future__ import print_function, division, absolute_import +import tempfile +import logging + +from .alleles import parse_classi_or_classii_allele_name +from .base_commandline_predictor import BaseCommandlinePredictor +from .cleanup_context import CleanupFiles +from .common import check_sequence_dictionary, seq_to_str +from .file_formats import create_input_fasta_files, parse_netmhcpan_stdout +from .process_helpers import AsyncProcess + +class NetMHCIIpan(BaseCommandlinePredictor): + + def __init__( + self, + alleles, + netmhc_command="netMHCIIpan", + epitope_lengths=[15, 16, 17, 18, 19, 20]): + BaseCommandlinePredictor.__init__( + self, + name="NetMHCIIpan", + command=netmhc_command, + alleles=alleles, + epitope_lengths=epitope_lengths, + supported_allele_flag="-list") + + def normalize_allele(self, allele_name): + """ + netMHCIIpan has some unique requirements for allele formats, + expecting the following forms: + - DRB1_0101 (for non-alpha/beta pairs) + - HLA-DQA10501-DQB10636 (for alpha and beta pairs) + """ + parsed_alleles = parse_classi_or_classii_allele_name(allele_name) + if len(parsed_alleles) == 1: + return "%s_%s%s" % (parsed_alleles[0].gene, + parsed_alleles[0].allele_family, + parsed_alleles[0].allele_code) + else: + return "HLA-%s%s%s-%s%s%s" % ( + parsed_alleles[0].gene, + parsed_alleles[0].allele_family, + parsed_alleles[0].allele_code, + parsed_alleles[1].gene, + parsed_alleles[1].allele_family, + parsed_alleles[1].allele_code) + + def predict(self, fasta_dictionary): + fasta_dictionary = check_sequence_dictionary(fasta_dictionary) + input_filenames, sequence_key_mapping = create_input_fasta_files( + fasta_dictionary) + # TODO: We are not currently using the file chunking + # functionality here. See NetMHCcons. + input_filename = input_filenames[0] + + alleles_str = \ + ",".join(self.normalize_allele(allele) for allele in self.alleles) + output_file = tempfile.NamedTemporaryFile( + "w", + prefix="netMHCIIpan_output", + delete=False) + args = [ + self.command, + "-length", seq_to_str(self.epitope_lengths, sep=" "), + "-f", input_filename, + "-a", alleles_str + ] + logging.info(" ".join(args)) + + with CleanupFiles( + filenames=[input_filename], + files=[output_file]): + process = AsyncProcess( + args=args, + redirect_stdout_file=output_file) + process.wait() + # need to flush written output and re-open for read + output_file.close() + with open(output_file.name, 'r') as f: + file_contents = f.read() + epitope_collection = parse_netmhcpan_stdout( + file_contents, + sequence_key_mapping=sequence_key_mapping, + fasta_dictionary=fasta_dictionary, + prediction_method_name="netmhciipan", + is_netmhcpanii=True) + + if len(epitope_collection) == 0: + logging.warn(file_contents) + raise ValueError("No epitopes from netMHCIIpan") + return epitope_collection diff --git a/setup.py b/setup.py index f88fb10..0d0eb38 100644 --- a/setup.py +++ b/setup.py @@ -37,7 +37,7 @@ if __name__ == '__main__': setup( name='mhctools', - version="0.1.4", + version="0.1.5", description="Python interface to running command-line and web-based MHC binding predictors", author="Alex Rubinsteyn", author_email="alex {dot} rubinsteyn {at} mssm {dot} edu", diff --git a/test/test_mhc_allele_names.py b/test/test_mhc_allele_names.py index 5c16cf4..88f6cc7 100644 --- a/test/test_mhc_allele_names.py +++ b/test/test_mhc_allele_names.py @@ -17,8 +17,6 @@ # - RT1-9.5*f # - RT1-M3-1*av1 -# TODO: test human and mice class II alleles - hla_02_01_names = [ "HLA-A*02:01", "HLA-A*0201", @@ -31,9 +29,9 @@ "A0201", "A2", "A2:01", - "HLA-A2" + "HLA-A2", # lower case - "hla-a*0201" + "hla-a*0201", "a*0201", "a*02:01", "a0201" @@ -87,7 +85,6 @@ def test_mouse_class1_alleles_H2_Kk(): eq_(normalize_allele_name("H-2-Kk"), "H-2-Kk") eq_(compact_allele_name("H-2-Kk"), "Kk") - def test_mouse_class1_alleles_H2_Db(): # H2-Db eq_(parse_allele_name("H2-Db"), @@ -100,3 +97,37 @@ def test_mouse_class1_alleles_H2_Db(): AlleleName("H-2", "D", "", "b")) eq_(normalize_allele_name("H-2-Db"), "H-2-Db") eq_(compact_allele_name("H-2-Db"), "Db") + +def test_human_class2(): + expected = "HLA-DRB1*01:02" + expected_compact = "DRB10102" + for name in ["DRB10102", + "DRB1*0102", + "HLA-DRB1*0102", + "HLA-DRB1*01:02"]: + eq_(normalize_allele_name(name), expected) + eq_(compact_allele_name(name), expected_compact) + +def test_human_class2_alpha_beta(): + expected = "HLA-DPA1*01:05-DPB1*100:01" + expected_compact = "DPA10105-DPB110001" + for name in ["DPA10105-DPB110001", + "HLA-DPA1*01:05-DPB1*100:01", + "hla-dpa1*0105-dpb1*10001", + "dpa1*0105-dpb1*10001", + "HLA-DPA1*01:05/DPB1*100:01"]: + eq_(normalize_allele_name(name), expected) + eq_(compact_allele_name(name), expected_compact) + +def test_mouse_class2_alleles(): + # H2-IAb + eq_(parse_allele_name("H2-IAb"), + AlleleName("H-2", "IA", "", "b")) + eq_(normalize_allele_name("H2-IAb"), "H-2-IAb") + eq_(compact_allele_name("H2-IAb"), "IAb") + + # with hyphen in "H-2" + eq_(parse_allele_name("H-2-IAb"), + AlleleName("H-2", "IA", "", "b")) + eq_(normalize_allele_name("H-2-IAb"), "H-2-IAb") + eq_(compact_allele_name("H-2-IAb"), "IAb") diff --git a/test/test_netmhcii_pan.py b/test/test_netmhcii_pan.py new file mode 100644 index 0000000..96728cb --- /dev/null +++ b/test/test_netmhcii_pan.py @@ -0,0 +1,32 @@ +from mhctools import NetMHCIIpan +from mhctools.alleles import normalize_allele_name + +def test_netmhcii_pan(): + alleles = [normalize_allele_name("HLA-DRB1*01:01")] + ii_pan_predictor = NetMHCIIpan( + alleles=alleles, + epitope_lengths=[15, 16]) + fasta_dictionary = { + "SMAD4-001": "PAPAPSWPLSSSVPSQKTYQGSYGFRLGFLHSGT", + "TP53-001": "SQAMDDLMLSPDDIEQWFTED" + } + epitope_collection = ii_pan_predictor.predict( + fasta_dictionary=fasta_dictionary) + + assert len(epitope_collection) == 27, \ + "Expected 27 epitopes from %s" % (epitope_collection,) + +def test_netmhcii_pan_alpha_beta(): + alleles = [normalize_allele_name("HLA-DPA1*01:05-DPB1*100:01")] + ii_pan_predictor = NetMHCIIpan( + alleles=alleles, + epitope_lengths=[15, 16]) + fasta_dictionary = { + "SMAD4-001": "PAPAPSWPLSSSVPSQKTYQGSYGFRLGFLHSGT", + "TP53-001": "SQAMDDLMLSPDDIEQWFTED" + } + epitope_collection = ii_pan_predictor.predict( + fasta_dictionary=fasta_dictionary) + + assert len(epitope_collection) == 27, \ + "Expected 27 epitopes from %s" % (epitope_collection,)