From 1794949068074cf3e1ccf6c6c46b7584007a1474 Mon Sep 17 00:00:00 2001
From: Srinivas Billa
Date: Mon, 29 Apr 2024 15:34:28 +0100
Subject: [PATCH 01/14] Update model.py

---
 model.py | 1 +
 1 file changed, 1 insertion(+)

diff --git a/model.py b/model.py
index 0660bc2..e7fe573 100644
--- a/model.py
+++ b/model.py
@@ -66,6 +66,7 @@ def from_name(cls, name: str):
     "stories15M": dict(n_layer=6, n_head=6, dim=288),
     "stories110M": dict(n_layer=12, n_head=12, dim=768),
     "Llama-3-8B": dict(block_size=8192, n_layer=32, n_head=32, n_local_heads=8, dim=4096, intermediate_size=14336, vocab_size=128256),
+    "Llama-3-70B": dict(block_size=8192, n_layer=32, n_head=32, n_local_heads=8, dim=4096, intermediate_size=14336, vocab_size=128256),
 }
 
 class KVCache(nn.Module):

From 0f397291cdf17623b20573584c57240bcba971d5 Mon Sep 17 00:00:00 2001
From: Srinivas Billa
Date: Mon, 29 Apr 2024 15:53:23 +0100
Subject: [PATCH 02/14] Update model.py

---
 model.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/model.py b/model.py
index e7fe573..98e2788 100644
--- a/model.py
+++ b/model.py
@@ -66,7 +66,7 @@ def from_name(cls, name: str):
     "stories15M": dict(n_layer=6, n_head=6, dim=288),
     "stories110M": dict(n_layer=12, n_head=12, dim=768),
     "Llama-3-8B": dict(block_size=8192, n_layer=32, n_head=32, n_local_heads=8, dim=4096, intermediate_size=14336, vocab_size=128256),
-    "Llama-3-70B": dict(block_size=8192, n_layer=32, n_head=32, n_local_heads=8, dim=4096, intermediate_size=14336, vocab_size=128256),
+    "llama-3-70b-hf-pt": dict(block_size=8192, n_layer=32, n_head=32, n_local_heads=8, dim=4096, intermediate_size=14336, vocab_size=128256),
 }
 
 class KVCache(nn.Module):

From f591d922d1e49617bd3420299680b42eea1d7dbc Mon Sep 17 00:00:00 2001
From: Srinivas Billa
Date: Mon, 29 Apr 2024 16:04:32 +0100
Subject: [PATCH 03/14] Update model.py

---
 model.py | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/model.py b/model.py
index 98e2788..c4eae86 100644
--- a/model.py
+++ b/model.py
@@ -52,7 +52,8 @@ def from_name(cls, name: str):
             config.sort(key=len, reverse=True)
             assert len(config[0]) != len(config[1]), name # make sure only one 'best' match
 
-        return cls(**transformer_configs[config[0]])
+        # return cls(**transformer_configs[config[0]])
+        return cls(**dict(block_size=8192, n_layer=32, n_head=32, n_local_heads=8, dim=4096, intermediate_size=14336, vocab_size=128256))
 
 
 transformer_configs = {

From f9c9bf9f2b360862d28df9f0dfb51951e1a46abb Mon Sep 17 00:00:00 2001
From: Srinivas Billa
Date: Mon, 29 Apr 2024 16:16:53 +0100
Subject: [PATCH 04/14] Update model.py

---
 model.py | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

diff --git a/model.py b/model.py
index c4eae86..863b235 100644
--- a/model.py
+++ b/model.py
@@ -51,9 +51,9 @@ def from_name(cls, name: str):
         if len(config) > 1:
             config.sort(key=len, reverse=True)
             assert len(config[0]) != len(config[1]), name # make sure only one 'best' match
-
+
         # return cls(**transformer_configs[config[0]])
-        return cls(**dict(block_size=8192, n_layer=32, n_head=32, n_local_heads=8, dim=4096, intermediate_size=14336, vocab_size=128256))
+        return cls(**dict(block_size=8192, n_layer=80, n_head=64, n_local_heads=8, dim=8192, intermediate_size=14336, vocab_size=128256))
 
 
 transformer_configs = {
@@ -67,7 +67,7 @@ def from_name(cls, name: str):
     "stories15M": dict(n_layer=6, n_head=6, dim=288),
     "stories110M": dict(n_layer=12, n_head=12, dim=768),
     "Llama-3-8B": dict(block_size=8192, n_layer=32, n_head=32, n_local_heads=8, dim=4096, intermediate_size=14336, vocab_size=128256),
-    "llama-3-70b-hf-pt": dict(block_size=8192, n_layer=32, n_head=32, n_local_heads=8, dim=4096, intermediate_size=14336, vocab_size=128256),
+    "llama-3-70b-hf-pt": dict(block_size=8192, n_layer=80, n_head=64, n_local_heads=8, dim=8192, intermediate_size=14336, vocab_size=128256),
 }
 
 class KVCache(nn.Module):

From 7e606ae0f001f34409ea56d971430542520ec5bd Mon Sep 17 00:00:00 2001
From: Srinivas Billa
Date: Mon, 29 Apr 2024 16:18:48 +0100
Subject: [PATCH 05/14] Update model.py

---
 model.py | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/model.py b/model.py
index 863b235..2984950 100644
--- a/model.py
+++ b/model.py
@@ -53,7 +53,7 @@ def from_name(cls, name: str):
             assert len(config[0]) != len(config[1]), name # make sure only one 'best' match
 
         # return cls(**transformer_configs[config[0]])
-        return cls(**dict(block_size=8192, n_layer=80, n_head=64, n_local_heads=8, dim=8192, intermediate_size=14336, vocab_size=128256))
+        return cls(**dict(block_size=8192, n_layer=80, n_head=64, n_local_heads=8, dim=8192, intermediate_size=28672, vocab_size=128256))
 
 
 transformer_configs = {
@@ -67,7 +67,7 @@ def from_name(cls, name: str):
     "stories15M": dict(n_layer=6, n_head=6, dim=288),
     "stories110M": dict(n_layer=12, n_head=12, dim=768),
     "Llama-3-8B": dict(block_size=8192, n_layer=32, n_head=32, n_local_heads=8, dim=4096, intermediate_size=14336, vocab_size=128256),
-    "llama-3-70b-hf-pt": dict(block_size=8192, n_layer=80, n_head=64, n_local_heads=8, dim=8192, intermediate_size=14336, vocab_size=128256),
+    "llama-3-70b-hf-pt": dict(block_size=8192, n_layer=80, n_head=64, n_local_heads=8, dim=8192, intermediate_size=28672, vocab_size=128256),
 }
 
 class KVCache(nn.Module):

From 1904f2caa2aace554991493443c181472ccff0e1 Mon Sep 17 00:00:00 2001
From: Srinivas Billa
Date: Mon, 29 Apr 2024 17:05:20 +0100
Subject: [PATCH 06/14] Update model.py

---
 model.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/model.py b/model.py
index 2984950..82ea55a 100644
--- a/model.py
+++ b/model.py
@@ -66,7 +66,7 @@ def from_name(cls, name: str):
     "Mistral-7B": dict(n_layer=32, n_head=32, n_local_heads=8, dim=4096, intermediate_size=14336, vocab_size=32000),
     "stories15M": dict(n_layer=6, n_head=6, dim=288),
     "stories110M": dict(n_layer=12, n_head=12, dim=768),
-    "Llama-3-8B": dict(block_size=8192, n_layer=32, n_head=32, n_local_heads=8, dim=4096, intermediate_size=14336, vocab_size=128256),
+    "llama-3-8b-hf-pt": dict(block_size=8192, n_layer=32, n_head=32, n_local_heads=8, dim=4096, intermediate_size=14336, vocab_size=128256),
     "llama-3-70b-hf-pt": dict(block_size=8192, n_layer=80, n_head=64, n_local_heads=8, dim=8192, intermediate_size=28672, vocab_size=128256),
 }
 
 class KVCache(nn.Module):

From b8e0a3b3c8a244568b4a4a21521122094f37f5b2 Mon Sep 17 00:00:00 2001
From: Srinivas Billa
Date: Mon, 29 Apr 2024 17:08:13 +0100
Subject: [PATCH 07/14] revert convert_hf_checkpoint.py

---
 scripts/convert_hf_checkpoint.py | 133 +++++++++++--------------------
 1 file changed, 47 insertions(+), 86 deletions(-)

diff --git a/scripts/convert_hf_checkpoint.py b/scripts/convert_hf_checkpoint.py
index 8a22106..b92114c 100644
--- a/scripts/convert_hf_checkpoint.py
+++ b/scripts/convert_hf_checkpoint.py
@@ -5,7 +5,6 @@
 # LICENSE file in the root directory of this source tree.
 import json
 import re
-import shutil
 import sys
 from pathlib import Path
 from typing import Optional
@@ -28,62 +27,33 @@ def convert_hf_checkpoint(
     if model_name is None:
         model_name = checkpoint_dir.name
 
-    # Llama 3 8B doesn't need conversion; instead, the original/consolidated.NN.pth files
-    # need to be copied into model.pth.
-    # Llama 3 70B can't be easily merged into one model.pth file, though, since names of the
-    # weights is state dict are the same in each consolidated.NN.pth file. Thus, it is not
-    # currently supported.
-    # Along this, we need to copy the original/tokenizer.model file to tokenizer.model.tiktoken
-    is_llama3 = "Llama-3" in model_name
-    if is_llama3:
-        # Check if we have multiple original/consolidated.NN.pth files and report error
-        # if we do for Llama 3.
-        original_dir = checkpoint_dir / "original"
-        pattern = re.compile(r"^consolidated\.\d{2}\.pth$")
-        bin_files = [bin for bin in original_dir.iterdir() if pattern.match(bin.name)]
-        if len(bin_files) > 1:
-            raise ValueError(
-                f"Multiple consolidated.NN.pth files found in {original_dir}. "
-                "Merging them into one model.pth file is not supported for Llama 3.")
-
-
     config = ModelArgs.from_name(model_name)
     print(f"Model config {config.__dict__}")
 
     # Load the json file containing weight mapping
-    if not is_llama3:
-        model_map_json = checkpoint_dir / "pytorch_model.bin.index.json"
-
-        assert model_map_json.is_file()
-
-        with open(model_map_json) as json_map:
-            bin_index = json.load(json_map)
-
-        weight_map = {
-            "model.embed_tokens.weight": "tok_embeddings.weight",
-            "model.layers.{}.self_attn.q_proj.weight": "layers.{}.attention.wq.weight",
-            "model.layers.{}.self_attn.k_proj.weight": "layers.{}.attention.wk.weight",
-            "model.layers.{}.self_attn.v_proj.weight": "layers.{}.attention.wv.weight",
-            "model.layers.{}.self_attn.o_proj.weight": "layers.{}.attention.wo.weight",
-            'model.layers.{}.self_attn.rotary_emb.inv_freq': None,
-            'model.layers.{}.mlp.gate_proj.weight': 'layers.{}.feed_forward.w1.weight',
-            "model.layers.{}.mlp.up_proj.weight": "layers.{}.feed_forward.w3.weight",
-            "model.layers.{}.mlp.down_proj.weight": "layers.{}.feed_forward.w2.weight",
-            "model.layers.{}.input_layernorm.weight": "layers.{}.attention_norm.weight",
-            "model.layers.{}.post_attention_layernorm.weight": "layers.{}.ffn_norm.weight",
-            "model.norm.weight": "norm.weight",
-            "lm_head.weight": "output.weight",
-        }
-        bin_files = {checkpoint_dir / bin for bin in bin_index["weight_map"].values()}
-    else:
-        # There is no separate pytorch_model.bin.index.json file for llama3.
-        # Instead, we will just use all original/consolidated.NN.pth files.
-        # so, we use model.safetensors.index.json
-        weight_map = None
-        original_dir = checkpoint_dir / "original"
-        pattern = re.compile(r"^consolidated\.\d{2}\.pth$")
-        bin_files = {bin for bin in original_dir.iterdir() if pattern.match(bin.name)}
-
+    model_map_json = checkpoint_dir / "pytorch_model.bin.index.json"
+
+    assert model_map_json.is_file()
+
+    with open(model_map_json) as json_map:
+        bin_index = json.load(json_map)
+
+    weight_map = {
+        "model.embed_tokens.weight": "tok_embeddings.weight",
+        "model.layers.{}.self_attn.q_proj.weight": "layers.{}.attention.wq.weight",
+        "model.layers.{}.self_attn.k_proj.weight": "layers.{}.attention.wk.weight",
+        "model.layers.{}.self_attn.v_proj.weight": "layers.{}.attention.wv.weight",
+        "model.layers.{}.self_attn.o_proj.weight": "layers.{}.attention.wo.weight",
+        'model.layers.{}.self_attn.rotary_emb.inv_freq': None,
+        'model.layers.{}.mlp.gate_proj.weight': 'layers.{}.feed_forward.w1.weight',
+        "model.layers.{}.mlp.up_proj.weight": "layers.{}.feed_forward.w3.weight",
+        "model.layers.{}.mlp.down_proj.weight": "layers.{}.feed_forward.w2.weight",
+        "model.layers.{}.input_layernorm.weight": "layers.{}.attention_norm.weight",
+        "model.layers.{}.post_attention_layernorm.weight": "layers.{}.ffn_norm.weight",
+        "model.norm.weight": "norm.weight",
+        "lm_head.weight": "output.weight",
+    }
+    bin_files = {checkpoint_dir / bin for bin in bin_index["weight_map"].values()}
 
     def permute(w, n_head):
         dim = config.dim
@@ -98,41 +68,32 @@ def permute(w, n_head):
         state_dict = torch.load(str(file), map_location="cpu", mmap=True, weights_only=True)
         merged_result.update(state_dict)
     final_result = {}
-    if weight_map is not None:
-        for key, value in merged_result.items():
-            if "layers" in key:
-                abstract_key = re.sub(r'(\d+)', '{}', key)
-                layer_num = re.search(r'\d+', key).group(0)
-                new_key = weight_map[abstract_key]
-                if new_key is None:
-                    continue
-                new_key = new_key.format(layer_num)
-            else:
-                new_key = weight_map[key]
-
-            final_result[new_key] = value
-
-        for key in tuple(final_result.keys()):
-            if "wq" in key:
-                q = final_result[key]
-                k = final_result[key.replace("wq", "wk")]
-                v = final_result[key.replace("wq", "wv")]
-                q = permute(q, config.n_head)
-                k = permute(k, config.n_local_heads)
-                final_result[key.replace("wq", "wqkv")] = torch.cat([q, k, v])
-                del final_result[key]
-                del final_result[key.replace("wq", "wk")]
-                del final_result[key.replace("wq", "wv")]
-    else:
-        final_result = merged_result
+    for key, value in merged_result.items():
+        if "layers" in key:
+            abstract_key = re.sub(r'(\d+)', '{}', key)
+            layer_num = re.search(r'\d+', key).group(0)
+            new_key = weight_map[abstract_key]
+            if new_key is None:
+                continue
+            new_key = new_key.format(layer_num)
+        else:
+            new_key = weight_map[key]
+
+        final_result[new_key] = value
+
+    for key in tuple(final_result.keys()):
+        if "wq" in key:
+            q = final_result[key]
+            k = final_result[key.replace("wq", "wk")]
+            v = final_result[key.replace("wq", "wv")]
+            q = permute(q, config.n_head)
+            k = permute(k, config.n_local_heads)
+            final_result[key.replace("wq", "wqkv")] = torch.cat([q, k, v])
+            del final_result[key]
+            del final_result[key.replace("wq", "wk")]
+            del final_result[key.replace("wq", "wv")]
     print(f"Saving checkpoint to {checkpoint_dir / 'model.pth'}")
     torch.save(final_result, checkpoint_dir / "model.pth")
-    if is_llama3:
-        original_dir = checkpoint_dir / "original"
-        tokenizer_model = original_dir / "tokenizer.model"
-        tokenizer_model_tiktoken = checkpoint_dir / "tokenizer.model"
-        print(f"Copying {tokenizer_model} to {tokenizer_model_tiktoken}")
-        shutil.copy(tokenizer_model, tokenizer_model_tiktoken)
 
 if __name__ == '__main__':
     import argparse

From f73f835ba8e739f813229a2565a1880bad0d403c Mon Sep 17 00:00:00 2001
From: Srinivas Billa
Date: Mon, 29 Apr 2024 17:09:43 +0100
Subject: [PATCH 08/14] Update model.py

---
 model.py | 3 +--
 1 file changed, 1 insertion(+), 2 deletions(-)

diff --git a/model.py b/model.py
index 82ea55a..1dd5794 100644
--- a/model.py
+++ b/model.py
@@ -52,8 +52,7 @@ def from_name(cls, name: str):
             config.sort(key=len, reverse=True)
             assert len(config[0]) != len(config[1]), name # make sure only one 'best' match
 
-        # return cls(**transformer_configs[config[0]])
-        return cls(**dict(block_size=8192, n_layer=80, n_head=64, n_local_heads=8, dim=8192, intermediate_size=28672, vocab_size=128256))
+        return cls(**transformer_configs[config[0]])
 
 
 transformer_configs = {

From be1af66f82d0b50dbabc3e0056993d4f4705108c Mon Sep 17 00:00:00 2001
From: Srinivas Billa
Date: Mon, 29 Apr 2024 17:13:59 +0100
Subject: [PATCH 09/14] Update model.py

---
 model.py | 4 +++-
 1 file changed, 3 insertions(+), 1 deletion(-)

diff --git a/model.py b/model.py
index 1dd5794..7498966 100644
--- a/model.py
+++ b/model.py
@@ -44,7 +44,7 @@ def from_name(cls, name: str):
         if name in transformer_configs:
             return cls(**transformer_configs[name])
         # fuzzy search
-        config = [config for config in transformer_configs if config in str(name).upper() or config in str(name)]
+        config = [config for config in transformer_configs if config.lower() in str(name).lower() or config.lower() in str(name).lower()]
 
         # We may have two or more configs matched (e.g. "7B" and "Mistral-7B"). Find the best config match,
         # take longer name (as it have more symbols matched)
@@ -67,6 +67,8 @@ def from_name(cls, name: str):
     "stories110M": dict(n_layer=12, n_head=12, dim=768),
     "llama-3-8b-hf-pt": dict(block_size=8192, n_layer=32, n_head=32, n_local_heads=8, dim=4096, intermediate_size=14336, vocab_size=128256),
     "llama-3-70b-hf-pt": dict(block_size=8192, n_layer=80, n_head=64, n_local_heads=8, dim=8192, intermediate_size=28672, vocab_size=128256),
+    "llama-3-8b-instruct-hf-pt": dict(block_size=8192, n_layer=32, n_head=32, n_local_heads=8, dim=4096, intermediate_size=14336, vocab_size=128256),
+    "llama-3-70b-instruct-hf-pt": dict(block_size=8192, n_layer=80, n_head=64, n_local_heads=8, dim=8192, intermediate_size=28672, vocab_size=128256),
 }
 
 class KVCache(nn.Module):

From 5b7e3c79452944dcfc77246d74fae6c93a914a93 Mon Sep 17 00:00:00 2001
From: Srinivas Billa
Date: Mon, 29 Apr 2024 17:17:57 +0100
Subject: [PATCH 10/14] copy tokeniser

---
 scripts/convert_hf_checkpoint.py | 7 +++++++
 1 file changed, 7 insertions(+)

diff --git a/scripts/convert_hf_checkpoint.py b/scripts/convert_hf_checkpoint.py
index b92114c..ab49d2e 100644
--- a/scripts/convert_hf_checkpoint.py
+++ b/scripts/convert_hf_checkpoint.py
@@ -5,6 +5,7 @@
 # LICENSE file in the root directory of this source tree.
 import json
 import re
+import shutil
 import sys
 from pathlib import Path
 from typing import Optional
@@ -94,6 +95,12 @@ def permute(w, n_head):
             del final_result[key.replace("wq", "wv")]
     print(f"Saving checkpoint to {checkpoint_dir / 'model.pth'}")
     torch.save(final_result, checkpoint_dir / "model.pth")
+    if 'llama-3' in model_name.lower():
+        original_dir = checkpoint_dir / "original"
+        tokenizer_model = original_dir / "tokenizer.model"
+        tokenizer_model_tiktoken = checkpoint_dir / "tokenizer.model"
+        print(f"Copying {tokenizer_model} to {tokenizer_model_tiktoken}")
+        shutil.copy(tokenizer_model, tokenizer_model_tiktoken)
 
 if __name__ == '__main__':
     import argparse

From d0484b5f741f9cba95e9ec0acd3ce58e654f6258 Mon Sep 17 00:00:00 2001
From: Srinivas Billa
Date: Mon, 29 Apr 2024 17:20:51 +0100
Subject: [PATCH 11/14] Update tokenizer.py

---
 tokenizer.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/tokenizer.py b/tokenizer.py
index c62a0c5..32f7387 100644
--- a/tokenizer.py
+++ b/tokenizer.py
@@ -105,7 +105,7 @@ def get_tokenizer(tokenizer_model_path, model_name):
     Returns:
     - TokenizerInterface: An instance of a tokenizer.
     """
-    if "Llama-3" in str(model_name):
+    if "llama-3" in str(model_name).lower():
         return TiktokenWrapper(tokenizer_model_path)
     else:
         return SentencePieceWrapper(tokenizer_model_path)

From 059b2bf2f7cc4ed88fbcd638cd23bac0985a0945 Mon Sep 17 00:00:00 2001
From: Srinivas Billa
Date: Mon, 29 Apr 2024 17:27:12 +0100
Subject: [PATCH 12/14] blobfile needed for tiktoken

---
 requirements.txt | 1 +
 1 file changed, 1 insertion(+)

diff --git a/requirements.txt b/requirements.txt
index 04f828c..cac69db 100644
--- a/requirements.txt
+++ b/requirements.txt
@@ -1,3 +1,4 @@
 torch
 sentencepiece
 tiktoken
+blobfile

From 30d69b3245a29823e7c4c5ae6a1f48fa38267afd Mon Sep 17 00:00:00 2001
From: Artem Bolgar
Date: Mon, 29 Apr 2024 14:02:52 -0700
Subject: [PATCH 13/14] llama3 8B support, tiktoken tokenizer (#158)

* WIP: llama3 support, tiktoken tokenizer

* Finalizing
---
 eval.py                          |   6 +-
 generate.py                      |   9 +--
 mixtral-moe/generate.py          |   2 +-
 model.py                         |   1 +
 quantize.py                      |   6 +-
 requirements.txt                 |   1 +
 scripts/convert_hf_checkpoint.py | 133 ++++++++++++++++++++-----------
 tokenizer.py                     | 111 ++++++++++++++++++++++++++
 8 files changed, 210 insertions(+), 59 deletions(-)
 create mode 100644 tokenizer.py

diff --git a/eval.py b/eval.py
index 7e8f841..d38abf8 100644
--- a/eval.py
+++ b/eval.py
@@ -18,7 +18,7 @@
 torch._inductor.config.triton.cudagraphs = True
 torch._dynamo.config.cache_size_limit = 100000
 
-from sentencepiece import SentencePieceProcessor
+from tokenizer import get_tokenizer
 
 from model import Transformer
 
@@ -217,7 +217,7 @@ def main(
     assert checkpoint_path.is_file(), checkpoint_path
 
     tokenizer_path = checkpoint_path.parent / "tokenizer.model"
-    assert tokenizer_path.is_file(), tokenizer_path
+    assert tokenizer_path.is_file(), str(tokenizer_path)
 
     device = 'cuda'
     precision = torch.bfloat16
@@ -231,7 +231,7 @@ def main(
 
     model.eval()
 
-    tokenizer = SentencePieceProcessor(model_file=str(tokenizer_path))
+    tokenizer = get_tokenizer(tokenizer_path, checkpoint_path)
 
     torch.manual_seed(1234)

diff --git a/generate.py b/generate.py
index 8446d11..24ba553 100644
--- a/generate.py
+++ b/generate.py
@@ -32,10 +32,8 @@ def device_sync(device):
 wd = Path(__file__).parent.parent.resolve()
 sys.path.append(str(wd))
 
-from sentencepiece import SentencePieceProcessor
-
 from model import Transformer
-
+from tokenizer import get_tokenizer
 
 def multinomial_sample_one_no_sync(probs_sort): # Does multinomial sampling without a cuda synchronization
     q = torch.empty_like(probs_sort).exponential_(1)
@@ -269,7 +267,7 @@ def main(
     assert checkpoint_path.is_file(), checkpoint_path
 
     tokenizer_path = checkpoint_path.parent / "tokenizer.model"
-    assert tokenizer_path.is_file(), tokenizer_path
+    assert tokenizer_path.is_file(), str(tokenizer_path)
 
     global print
     from tp import maybe_init_dist
@@ -297,7 +295,8 @@ def main(
     device_sync(device=device) # MKG
     print(f"Time to load model: {time.time() - t0:.02f} seconds")
 
-    tokenizer = SentencePieceProcessor(model_file=str(tokenizer_path))
+    tokenizer = get_tokenizer(tokenizer_path, checkpoint_path)
+
     encoded = encode_tokens(tokenizer, prompt, bos=True, device=device)
     prompt_length = encoded.size(0)

diff --git a/mixtral-moe/generate.py b/mixtral-moe/generate.py
index ffe7113..9aa076b 100644
--- a/mixtral-moe/generate.py
+++ b/mixtral-moe/generate.py
@@ -175,7 +175,7 @@ def main(
     assert checkpoint_path.is_file(), checkpoint_path
 
     tokenizer_path = checkpoint_path.parent / "tokenizer.model"
-    assert tokenizer_path.is_file(), tokenizer_path
+    assert tokenizer_path.is_file(), str(tokenizer_path)
 
     global print
     rank = maybe_init_dist()

diff --git a/model.py b/model.py
index fbb6040..0660bc2 100644
--- a/model.py
+++ b/model.py
@@ -65,6 +65,7 @@ def from_name(cls, name: str):
     "Mistral-7B": dict(n_layer=32, n_head=32, n_local_heads=8, dim=4096, intermediate_size=14336, vocab_size=32000),
     "stories15M": dict(n_layer=6, n_head=6, dim=288),
     "stories110M": dict(n_layer=12, n_head=12, dim=768),
+    "Llama-3-8B": dict(block_size=8192, n_layer=32, n_head=32, n_local_heads=8, dim=4096, intermediate_size=14336, vocab_size=128256),
 }
 
 class KVCache(nn.Module):

diff --git a/quantize.py b/quantize.py
index af17a69..4ebbe5f 100644
--- a/quantize.py
+++ b/quantize.py
@@ -9,7 +9,7 @@
 import torch
 import torch.nn as nn
 import torch.nn.functional as F
-from sentencepiece import SentencePieceProcessor
+from tokenizer import get_tokenizer
 
 try:
     from GPTQ import GenericGPTQRunner, InputRecorder
@@ -578,8 +578,8 @@ def quantize(
         quant_handler = WeightOnlyInt4GPTQQuantHandler(model, groupsize)
 
         tokenizer_path = checkpoint_path.parent / "tokenizer.model"
-        assert tokenizer_path.is_file(), tokenizer_path
-        tokenizer = SentencePieceProcessor(model_file=str(tokenizer_path))
+        assert tokenizer_path.is_file(), str(tokenizer_path)
+        tokenizer = get_tokenizer(tokenizer_path, checkpoint_path)
 
         quantized_state_dict = quant_handler.create_quantized_state_dict(
             tokenizer,

diff --git a/requirements.txt b/requirements.txt
index 762cb09..04f828c 100644
--- a/requirements.txt
+++ b/requirements.txt
@@ -1,2 +1,3 @@
 torch
 sentencepiece
+tiktoken

diff --git a/scripts/convert_hf_checkpoint.py b/scripts/convert_hf_checkpoint.py
index b92114c..8a22106 100644
--- a/scripts/convert_hf_checkpoint.py
+++ b/scripts/convert_hf_checkpoint.py
@@ -5,6 +5,7 @@
 # LICENSE file in the root directory of this source tree.
 import json
 import re
+import shutil
 import sys
 from pathlib import Path
 from typing import Optional
@@ -27,33 +28,62 @@ def convert_hf_checkpoint(
     if model_name is None:
         model_name = checkpoint_dir.name
 
+    # Llama 3 8B doesn't need conversion; instead, the original/consolidated.NN.pth files
+    # need to be copied into model.pth.
+    # Llama 3 70B can't be easily merged into one model.pth file, though, since names of the
+    # weights is state dict are the same in each consolidated.NN.pth file. Thus, it is not
+    # currently supported.
+    # Along this, we need to copy the original/tokenizer.model file to tokenizer.model.tiktoken
+    is_llama3 = "Llama-3" in model_name
+    if is_llama3:
+        # Check if we have multiple original/consolidated.NN.pth files and report error
+        # if we do for Llama 3.
+        original_dir = checkpoint_dir / "original"
+        pattern = re.compile(r"^consolidated\.\d{2}\.pth$")
+        bin_files = [bin for bin in original_dir.iterdir() if pattern.match(bin.name)]
+        if len(bin_files) > 1:
+            raise ValueError(
+                f"Multiple consolidated.NN.pth files found in {original_dir}. "
+                "Merging them into one model.pth file is not supported for Llama 3.")
+
+
     config = ModelArgs.from_name(model_name)
     print(f"Model config {config.__dict__}")
 
     # Load the json file containing weight mapping
-    model_map_json = checkpoint_dir / "pytorch_model.bin.index.json"
-
-    assert model_map_json.is_file()
-
-    with open(model_map_json) as json_map:
-        bin_index = json.load(json_map)
-
-    weight_map = {
-        "model.embed_tokens.weight": "tok_embeddings.weight",
-        "model.layers.{}.self_attn.q_proj.weight": "layers.{}.attention.wq.weight",
-        "model.layers.{}.self_attn.k_proj.weight": "layers.{}.attention.wk.weight",
-        "model.layers.{}.self_attn.v_proj.weight": "layers.{}.attention.wv.weight",
-        "model.layers.{}.self_attn.o_proj.weight": "layers.{}.attention.wo.weight",
-        'model.layers.{}.self_attn.rotary_emb.inv_freq': None,
-        'model.layers.{}.mlp.gate_proj.weight': 'layers.{}.feed_forward.w1.weight',
-        "model.layers.{}.mlp.up_proj.weight": "layers.{}.feed_forward.w3.weight",
-        "model.layers.{}.mlp.down_proj.weight": "layers.{}.feed_forward.w2.weight",
-        "model.layers.{}.input_layernorm.weight": "layers.{}.attention_norm.weight",
-        "model.layers.{}.post_attention_layernorm.weight": "layers.{}.ffn_norm.weight",
-        "model.norm.weight": "norm.weight",
-        "lm_head.weight": "output.weight",
-    }
-    bin_files = {checkpoint_dir / bin for bin in bin_index["weight_map"].values()}
+    if not is_llama3:
+        model_map_json = checkpoint_dir / "pytorch_model.bin.index.json"
+
+        assert model_map_json.is_file()
+
+        with open(model_map_json) as json_map:
+            bin_index = json.load(json_map)
+
+        weight_map = {
+            "model.embed_tokens.weight": "tok_embeddings.weight",
+            "model.layers.{}.self_attn.q_proj.weight": "layers.{}.attention.wq.weight",
+            "model.layers.{}.self_attn.k_proj.weight": "layers.{}.attention.wk.weight",
+            "model.layers.{}.self_attn.v_proj.weight": "layers.{}.attention.wv.weight",
+            "model.layers.{}.self_attn.o_proj.weight": "layers.{}.attention.wo.weight",
+            'model.layers.{}.self_attn.rotary_emb.inv_freq': None,
+            'model.layers.{}.mlp.gate_proj.weight': 'layers.{}.feed_forward.w1.weight',
+            "model.layers.{}.mlp.up_proj.weight": "layers.{}.feed_forward.w3.weight",
+            "model.layers.{}.mlp.down_proj.weight": "layers.{}.feed_forward.w2.weight",
+            "model.layers.{}.input_layernorm.weight": "layers.{}.attention_norm.weight",
+            "model.layers.{}.post_attention_layernorm.weight": "layers.{}.ffn_norm.weight",
+            "model.norm.weight": "norm.weight",
+            "lm_head.weight": "output.weight",
+        }
+        bin_files = {checkpoint_dir / bin for bin in bin_index["weight_map"].values()}
+    else:
+        # There is no separate pytorch_model.bin.index.json file for llama3.
+        # Instead, we will just use all original/consolidated.NN.pth files.
+        # so, we use model.safetensors.index.json
+        weight_map = None
+        original_dir = checkpoint_dir / "original"
+        pattern = re.compile(r"^consolidated\.\d{2}\.pth$")
+        bin_files = {bin for bin in original_dir.iterdir() if pattern.match(bin.name)}
+
 
     def permute(w, n_head):
         dim = config.dim
@@ -68,32 +98,41 @@ def permute(w, n_head):
         state_dict = torch.load(str(file), map_location="cpu", mmap=True, weights_only=True)
         merged_result.update(state_dict)
     final_result = {}
-    for key, value in merged_result.items():
-        if "layers" in key:
-            abstract_key = re.sub(r'(\d+)', '{}', key)
-            layer_num = re.search(r'\d+', key).group(0)
-            new_key = weight_map[abstract_key]
-            if new_key is None:
-                continue
-            new_key = new_key.format(layer_num)
-        else:
-            new_key = weight_map[key]
-
-        final_result[new_key] = value
-
-    for key in tuple(final_result.keys()):
-        if "wq" in key:
-            q = final_result[key]
-            k = final_result[key.replace("wq", "wk")]
-            v = final_result[key.replace("wq", "wv")]
-            q = permute(q, config.n_head)
-            k = permute(k, config.n_local_heads)
-            final_result[key.replace("wq", "wqkv")] = torch.cat([q, k, v])
-            del final_result[key]
-            del final_result[key.replace("wq", "wk")]
-            del final_result[key.replace("wq", "wv")]
+    if weight_map is not None:
+        for key, value in merged_result.items():
+            if "layers" in key:
+                abstract_key = re.sub(r'(\d+)', '{}', key)
+                layer_num = re.search(r'\d+', key).group(0)
+                new_key = weight_map[abstract_key]
+                if new_key is None:
+                    continue
+                new_key = new_key.format(layer_num)
+            else:
+                new_key = weight_map[key]
+
+            final_result[new_key] = value
+
+        for key in tuple(final_result.keys()):
+            if "wq" in key:
+                q = final_result[key]
+                k = final_result[key.replace("wq", "wk")]
+                v = final_result[key.replace("wq", "wv")]
+                q = permute(q, config.n_head)
+                k = permute(k, config.n_local_heads)
+                final_result[key.replace("wq", "wqkv")] = torch.cat([q, k, v])
+                del final_result[key]
+                del final_result[key.replace("wq", "wk")]
+                del final_result[key.replace("wq", "wv")]
+    else:
+        final_result = merged_result
     print(f"Saving checkpoint to {checkpoint_dir / 'model.pth'}")
     torch.save(final_result, checkpoint_dir / "model.pth")
+    if is_llama3:
+        original_dir = checkpoint_dir / "original"
+        tokenizer_model = original_dir / "tokenizer.model"
+        tokenizer_model_tiktoken = checkpoint_dir / "tokenizer.model"
+        print(f"Copying {tokenizer_model} to {tokenizer_model_tiktoken}")
+        shutil.copy(tokenizer_model, tokenizer_model_tiktoken)
 
 if __name__ == '__main__':
     import argparse

diff --git a/tokenizer.py b/tokenizer.py
new file mode 100644
index 0000000..c62a0c5
--- /dev/null
+++ b/tokenizer.py
@@ -0,0 +1,111 @@
+import os
+import sentencepiece as spm
+import tiktoken
+from tiktoken.load import load_tiktoken_bpe
+from pathlib import Path
+from typing import Dict
+
+class TokenizerInterface:
+    def __init__(self, model_path):
+        self.model_path = model_path
+
+    def encode(self, text):
+        raise NotImplementedError("This method should be overridden by subclasses.")
+
+    def decode(self, tokens):
+        raise NotImplementedError("This method should be overridden by subclasses.")
+
+    def bos_id(self):
+        raise NotImplementedError("This method should be overridden by subclasses.")
+
+    def eos_id(self):
+        raise NotImplementedError("This method should be overridden by subclasses.")
+
+class SentencePieceWrapper(TokenizerInterface):
+    def __init__(self, model_path):
+        super().__init__(model_path)
+        self.processor = spm.SentencePieceProcessor(str(model_path))
+
+    def encode(self, text):
+        return self.processor.EncodeAsIds(text)
+
+    def decode(self, tokens):
+        return self.processor.DecodeIds(tokens)
+
+    def bos_id(self):
+        return self.processor.bos_id()
+
+    def eos_id(self):
+        return self.processor.eos_id()
+
+class TiktokenWrapper(TokenizerInterface):
+    """
+    Tokenizing and encoding/decoding text using the Tiktoken tokenizer.
+    """
+
+    special_tokens: Dict[str, int]
+
+    num_reserved_special_tokens = 256
+
+    pat_str = r"(?i:'s|'t|'re|'ve|'m|'ll|'d)|[^\r\n\p{L}\p{N}]?\p{L}+|\p{N}{1,3}| ?[^\s\p{L}\p{N}]+[\r\n]*|\s*[\r\n]+|\s+(?!\S)|\s+"  # noqa: E501
+
+    def __init__(self, model_path):
+        super().__init__(model_path)
+        assert os.path.isfile(model_path), str(model_path)
+        mergeable_ranks = load_tiktoken_bpe(str(model_path))
+        num_base_tokens = len(mergeable_ranks)
+        special_tokens = [
+            "<|begin_of_text|>",
+            "<|end_of_text|>",
+            "<|reserved_special_token_0|>",
+            "<|reserved_special_token_1|>",
+            "<|reserved_special_token_2|>",
+            "<|reserved_special_token_3|>",
+            "<|start_header_id|>",
+            "<|end_header_id|>",
+            "<|reserved_special_token_4|>",
+            "<|eot_id|>",  # end of turn
+        ] + [
+            f"<|reserved_special_token_{i}|>"
+            for i in range(5, self.num_reserved_special_tokens - 5)
+        ]
+        self.special_tokens = {
+            token: num_base_tokens + i for i, token in enumerate(special_tokens)
+        }
+        self.model = tiktoken.Encoding(
+            name=Path(model_path).name,
+            pat_str=self.pat_str,
+            mergeable_ranks=mergeable_ranks,
+            special_tokens=self.special_tokens,
+        )
+        # BOS / EOS token IDs
+        self._bos_id: int = self.special_tokens["<|begin_of_text|>"]
+        self._eos_id: int = self.special_tokens["<|end_of_text|>"]
+
+    def encode(self, text):
+        return self.model.encode(text)
+
+    def decode(self, tokens):
+        return self.model.decode(tokens)
+
+    def bos_id(self):
+        return self._bos_id
+
+    def eos_id(self):
+        return self._eos_id
+
+def get_tokenizer(tokenizer_model_path, model_name):
+    """
+    Factory function to get the appropriate tokenizer based on the model name.
+
+    Args:
+    - tokenizer_model_path (str): The file path to the tokenizer model.
+    - model_name (str): The name of the model, used to determine the tokenizer type.
+
+    Returns:
+    - TokenizerInterface: An instance of a tokenizer.
+    """
+    if "Llama-3" in str(model_name):
+        return TiktokenWrapper(tokenizer_model_path)
+    else:
+        return SentencePieceWrapper(tokenizer_model_path)

From 80e36227f148a9f4c33305e28e5555985cec8ab1 Mon Sep 17 00:00:00 2001
From: Srinivas Billa
Date: Mon, 29 Apr 2024 22:24:42 +0100
Subject: [PATCH 14/14] fix

---
 scripts/convert_hf_checkpoint.py | 127 ++++++++++++-------------------
 1 file changed, 47 insertions(+), 80 deletions(-)

diff --git a/scripts/convert_hf_checkpoint.py b/scripts/convert_hf_checkpoint.py
index 8ac47dd..ab49d2e 100644
--- a/scripts/convert_hf_checkpoint.py
+++ b/scripts/convert_hf_checkpoint.py
@@ -28,62 +28,33 @@ def convert_hf_checkpoint(
     if model_name is None:
         model_name = checkpoint_dir.name
 
-    # Llama 3 8B doesn't need conversion; instead, the original/consolidated.NN.pth files
-    # need to be copied into model.pth.
-    # Llama 3 70B can't be easily merged into one model.pth file, though, since names of the
-    # weights is state dict are the same in each consolidated.NN.pth file. Thus, it is not
-    # currently supported.
-    # Along this, we need to copy the original/tokenizer.model file to tokenizer.model.tiktoken
-    is_llama3 = "Llama-3" in model_name
-    if is_llama3:
-        # Check if we have multiple original/consolidated.NN.pth files and report error
-        # if we do for Llama 3.
-        original_dir = checkpoint_dir / "original"
-        pattern = re.compile(r"^consolidated\.\d{2}\.pth$")
-        bin_files = [bin for bin in original_dir.iterdir() if pattern.match(bin.name)]
-        if len(bin_files) > 1:
-            raise ValueError(
-                f"Multiple consolidated.NN.pth files found in {original_dir}. "
-                "Merging them into one model.pth file is not supported for Llama 3.")
-
-
     config = ModelArgs.from_name(model_name)
     print(f"Model config {config.__dict__}")
 
     # Load the json file containing weight mapping
-    if not is_llama3:
-        model_map_json = checkpoint_dir / "pytorch_model.bin.index.json"
-
-        assert model_map_json.is_file()
-
-        with open(model_map_json) as json_map:
-            bin_index = json.load(json_map)
-
-        weight_map = {
-            "model.embed_tokens.weight": "tok_embeddings.weight",
-            "model.layers.{}.self_attn.q_proj.weight": "layers.{}.attention.wq.weight",
-            "model.layers.{}.self_attn.k_proj.weight": "layers.{}.attention.wk.weight",
-            "model.layers.{}.self_attn.v_proj.weight": "layers.{}.attention.wv.weight",
-            "model.layers.{}.self_attn.o_proj.weight": "layers.{}.attention.wo.weight",
-            'model.layers.{}.self_attn.rotary_emb.inv_freq': None,
-            'model.layers.{}.mlp.gate_proj.weight': 'layers.{}.feed_forward.w1.weight',
-            "model.layers.{}.mlp.up_proj.weight": "layers.{}.feed_forward.w3.weight",
-            "model.layers.{}.mlp.down_proj.weight": "layers.{}.feed_forward.w2.weight",
-            "model.layers.{}.input_layernorm.weight": "layers.{}.attention_norm.weight",
-            "model.layers.{}.post_attention_layernorm.weight": "layers.{}.ffn_norm.weight",
-            "model.norm.weight": "norm.weight",
-            "lm_head.weight": "output.weight",
-        }
-        bin_files = {checkpoint_dir / bin for bin in bin_index["weight_map"].values()}
-    else:
-        # There is no separate pytorch_model.bin.index.json file for llama3.
-        # Instead, we will just use all original/consolidated.NN.pth files.
-        # so, we use model.safetensors.index.json
-        weight_map = None
-        original_dir = checkpoint_dir / "original"
-        pattern = re.compile(r"^consolidated\.\d{2}\.pth$")
-        bin_files = {bin for bin in original_dir.iterdir() if pattern.match(bin.name)}
-
+    model_map_json = checkpoint_dir / "pytorch_model.bin.index.json"
+
+    assert model_map_json.is_file()
+
+    with open(model_map_json) as json_map:
+        bin_index = json.load(json_map)
+
+    weight_map = {
+        "model.embed_tokens.weight": "tok_embeddings.weight",
+        "model.layers.{}.self_attn.q_proj.weight": "layers.{}.attention.wq.weight",
+        "model.layers.{}.self_attn.k_proj.weight": "layers.{}.attention.wk.weight",
+        "model.layers.{}.self_attn.v_proj.weight": "layers.{}.attention.wv.weight",
+        "model.layers.{}.self_attn.o_proj.weight": "layers.{}.attention.wo.weight",
+        'model.layers.{}.self_attn.rotary_emb.inv_freq': None,
+        'model.layers.{}.mlp.gate_proj.weight': 'layers.{}.feed_forward.w1.weight',
+        "model.layers.{}.mlp.up_proj.weight": "layers.{}.feed_forward.w3.weight",
+        "model.layers.{}.mlp.down_proj.weight": "layers.{}.feed_forward.w2.weight",
+        "model.layers.{}.input_layernorm.weight": "layers.{}.attention_norm.weight",
+        "model.layers.{}.post_attention_layernorm.weight": "layers.{}.ffn_norm.weight",
+        "model.norm.weight": "norm.weight",
+        "lm_head.weight": "output.weight",
+    }
+    bin_files = {checkpoint_dir / bin for bin in bin_index["weight_map"].values()}
 
     def permute(w, n_head):
         dim = config.dim
@@ -98,36 +69,32 @@ def permute(w, n_head):
         state_dict = torch.load(str(file), map_location="cpu", mmap=True, weights_only=True)
         merged_result.update(state_dict)
     final_result = {}
-    if weight_map is not None:
-        for key, value in merged_result.items():
-            if "layers" in key:
-                abstract_key = re.sub(r'(\d+)', '{}', key)
-                layer_num = re.search(r'\d+', key).group(0)
-                new_key = weight_map[abstract_key]
-                if new_key is None:
-                    continue
-                new_key = new_key.format(layer_num)
-            else:
-                new_key = weight_map[key]
-
-            final_result[new_key] = value
-
-        for key in tuple(final_result.keys()):
-            if "wq" in key:
-                q = final_result[key]
-                k = final_result[key.replace("wq", "wk")]
-                v = final_result[key.replace("wq", "wv")]
-                q = permute(q, config.n_head)
-                k = permute(k, config.n_local_heads)
-                final_result[key.replace("wq", "wqkv")] = torch.cat([q, k, v])
-                del final_result[key]
-                del final_result[key.replace("wq", "wk")]
-                del final_result[key.replace("wq", "wv")]
-    else:
-        final_result = merged_result
+    for key, value in merged_result.items():
+        if "layers" in key:
+            abstract_key = re.sub(r'(\d+)', '{}', key)
+            layer_num = re.search(r'\d+', key).group(0)
+            new_key = weight_map[abstract_key]
+            if new_key is None:
+                continue
+            new_key = new_key.format(layer_num)
+        else:
+            new_key = weight_map[key]
+
+        final_result[new_key] = value
+
+    for key in tuple(final_result.keys()):
+        if "wq" in key:
+            q = final_result[key]
+            k = final_result[key.replace("wq", "wk")]
+            v = final_result[key.replace("wq", "wv")]
+            q = permute(q, config.n_head)
+            k = permute(k, config.n_local_heads)
+            final_result[key.replace("wq", "wqkv")] = torch.cat([q, k, v])
+            del final_result[key]
+            del final_result[key.replace("wq", "wk")]
+            del final_result[key.replace("wq", "wv")]
     print(f"Saving checkpoint to {checkpoint_dir / 'model.pth'}")
     torch.save(final_result, checkpoint_dir / "model.pth")
-
     if 'llama-3' in model_name.lower():
         original_dir = checkpoint_dir / "original"
         tokenizer_model = original_dir / "tokenizer.model"