From 1794949068074cf3e1ccf6c6c46b7584007a1474 Mon Sep 17 00:00:00 2001
From: Srinivas Billa
Date: Mon, 29 Apr 2024 15:34:28 +0100
Subject: [PATCH 01/14] Update model.py

---
 model.py | 1 +
 1 file changed, 1 insertion(+)

diff --git a/model.py b/model.py
index 0660bc2..e7fe573 100644
--- a/model.py
+++ b/model.py
@@ -66,6 +66,7 @@ def from_name(cls, name: str):
     "stories15M": dict(n_layer=6, n_head=6, dim=288),
     "stories110M": dict(n_layer=12, n_head=12, dim=768),
     "Llama-3-8B": dict(block_size=8192, n_layer=32, n_head=32, n_local_heads=8, dim=4096, intermediate_size=14336, vocab_size=128256),
+    "Llama-3-70B": dict(block_size=8192, n_layer=32, n_head=32, n_local_heads=8, dim=4096, intermediate_size=14336, vocab_size=128256),
 }
 
 class KVCache(nn.Module):

From 0f397291cdf17623b20573584c57240bcba971d5 Mon Sep 17 00:00:00 2001
From: Srinivas Billa
Date: Mon, 29 Apr 2024 15:53:23 +0100
Subject: [PATCH 02/14] Update model.py

---
 model.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/model.py b/model.py
index e7fe573..98e2788 100644
--- a/model.py
+++ b/model.py
@@ -66,7 +66,7 @@ def from_name(cls, name: str):
     "stories15M": dict(n_layer=6, n_head=6, dim=288),
     "stories110M": dict(n_layer=12, n_head=12, dim=768),
     "Llama-3-8B": dict(block_size=8192, n_layer=32, n_head=32, n_local_heads=8, dim=4096, intermediate_size=14336, vocab_size=128256),
-    "Llama-3-70B": dict(block_size=8192, n_layer=32, n_head=32, n_local_heads=8, dim=4096, intermediate_size=14336, vocab_size=128256),
+    "llama-3-70b-hf-pt": dict(block_size=8192, n_layer=32, n_head=32, n_local_heads=8, dim=4096, intermediate_size=14336, vocab_size=128256),
 }
 
 class KVCache(nn.Module):

From f591d922d1e49617bd3420299680b42eea1d7dbc Mon Sep 17 00:00:00 2001
From: Srinivas Billa
Date: Mon, 29 Apr 2024 16:04:32 +0100
Subject: [PATCH 03/14] Update model.py

---
 model.py | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/model.py b/model.py
index 98e2788..c4eae86 100644
--- a/model.py
+++ b/model.py
@@ -52,7 +52,8 @@ def from_name(cls, name: str):
             config.sort(key=len, reverse=True)
             assert len(config[0]) != len(config[1]), name # make sure only one 'best' match
 
-        return cls(**transformer_configs[config[0]])
+        # return cls(**transformer_configs[config[0]])
+        return cls(**dict(block_size=8192, n_layer=32, n_head=32, n_local_heads=8, dim=4096, intermediate_size=14336, vocab_size=128256))
 
 
 transformer_configs = {

From f9c9bf9f2b360862d28df9f0dfb51951e1a46abb Mon Sep 17 00:00:00 2001
From: Srinivas Billa
Date: Mon, 29 Apr 2024 16:16:53 +0100
Subject: [PATCH 04/14] Update model.py

---
 model.py | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

diff --git a/model.py b/model.py
index c4eae86..863b235 100644
--- a/model.py
+++ b/model.py
@@ -51,9 +51,9 @@ def from_name(cls, name: str):
         if len(config) > 1:
             config.sort(key=len, reverse=True)
             assert len(config[0]) != len(config[1]), name # make sure only one 'best' match
-
+
         # return cls(**transformer_configs[config[0]])
-        return cls(**dict(block_size=8192, n_layer=32, n_head=32, n_local_heads=8, dim=4096, intermediate_size=14336, vocab_size=128256))
+        return cls(**dict(block_size=8192, n_layer=80, n_head=64, n_local_heads=8, dim=8192, intermediate_size=14336, vocab_size=128256))
 
 
 transformer_configs = {
@@ -67,7 +67,7 @@ def from_name(cls, name: str):
     "stories15M": dict(n_layer=6, n_head=6, dim=288),
     "stories110M": dict(n_layer=12, n_head=12, dim=768),
     "Llama-3-8B": dict(block_size=8192, n_layer=32, n_head=32, n_local_heads=8, dim=4096, intermediate_size=14336, vocab_size=128256),
-    "llama-3-70b-hf-pt": dict(block_size=8192, n_layer=32, n_head=32, n_local_heads=8, dim=4096, intermediate_size=14336, vocab_size=128256),
+    "llama-3-70b-hf-pt": dict(block_size=8192, n_layer=80, n_head=64, n_local_heads=8, dim=8192, intermediate_size=14336, vocab_size=128256),
 }
 
 class KVCache(nn.Module):

From 7e606ae0f001f34409ea56d971430542520ec5bd Mon Sep 17 00:00:00 2001
From: Srinivas Billa
Date: Mon, 29 Apr 2024 16:18:48 +0100
Subject: [PATCH 05/14] Update model.py

---
 model.py | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/model.py b/model.py
index 863b235..2984950 100644
--- a/model.py
+++ b/model.py
@@ -53,7 +53,7 @@ def from_name(cls, name: str):
             assert len(config[0]) != len(config[1]), name # make sure only one 'best' match
 
         # return cls(**transformer_configs[config[0]])
-        return cls(**dict(block_size=8192, n_layer=80, n_head=64, n_local_heads=8, dim=8192, intermediate_size=14336, vocab_size=128256))
+        return cls(**dict(block_size=8192, n_layer=80, n_head=64, n_local_heads=8, dim=8192, intermediate_size=28672, vocab_size=128256))
 
 
 transformer_configs = {
@@ -67,7 +67,7 @@ def from_name(cls, name: str):
     "stories15M": dict(n_layer=6, n_head=6, dim=288),
     "stories110M": dict(n_layer=12, n_head=12, dim=768),
     "Llama-3-8B": dict(block_size=8192, n_layer=32, n_head=32, n_local_heads=8, dim=4096, intermediate_size=14336, vocab_size=128256),
-    "llama-3-70b-hf-pt": dict(block_size=8192, n_layer=80, n_head=64, n_local_heads=8, dim=8192, intermediate_size=14336, vocab_size=128256),
+    "llama-3-70b-hf-pt": dict(block_size=8192, n_layer=80, n_head=64, n_local_heads=8, dim=8192, intermediate_size=28672, vocab_size=128256),
 }
 
 class KVCache(nn.Module):

From 1904f2caa2aace554991493443c181472ccff0e1 Mon Sep 17 00:00:00 2001
From: Srinivas Billa
Date: Mon, 29 Apr 2024 17:05:20 +0100
Subject: [PATCH 06/14] Update model.py

---
 model.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/model.py b/model.py
index 2984950..82ea55a 100644
--- a/model.py
+++ b/model.py
@@ -66,7 +66,7 @@ def from_name(cls, name: str):
     "Mistral-7B": dict(n_layer=32, n_head=32, n_local_heads=8, dim=4096, intermediate_size=14336, vocab_size=32000),
     "stories15M": dict(n_layer=6, n_head=6, dim=288),
     "stories110M": dict(n_layer=12, n_head=12, dim=768),
-    "Llama-3-8B": dict(block_size=8192, n_layer=32, n_head=32, n_local_heads=8, dim=4096, intermediate_size=14336, vocab_size=128256),
+    "llama-3-8b-hf-pt": dict(block_size=8192, n_layer=32, n_head=32, n_local_heads=8, dim=4096, intermediate_size=14336, vocab_size=128256),
     "llama-3-70b-hf-pt": dict(block_size=8192, n_layer=80, n_head=64, n_local_heads=8, dim=8192, intermediate_size=28672, vocab_size=128256),
 }
 
 class KVCache(nn.Module):

From b8e0a3b3c8a244568b4a4a21521122094f37f5b2 Mon Sep 17 00:00:00 2001
From: Srinivas Billa
Date: Mon, 29 Apr 2024 17:08:13 +0100
Subject: [PATCH 07/14] revert convert_hf_checkpoint.py

---
 scripts/convert_hf_checkpoint.py | 133 +++++++++++--------------------
 1 file changed, 47 insertions(+), 86 deletions(-)

diff --git a/scripts/convert_hf_checkpoint.py b/scripts/convert_hf_checkpoint.py
index 8a22106..b92114c 100644
--- a/scripts/convert_hf_checkpoint.py
+++ b/scripts/convert_hf_checkpoint.py
@@ -5,7 +5,6 @@
 # LICENSE file in the root directory of this source tree.
 import json
 import re
-import shutil
 import sys
 from pathlib import Path
 from typing import Optional
@@ -28,62 +27,33 @@ def convert_hf_checkpoint(
     if model_name is None:
         model_name = checkpoint_dir.name
 
-    # Llama 3 8B doesn't need conversion; instead, the original/consolidated.NN.pth files
-    # need to be copied into model.pth.
-    # Llama 3 70B can't be easily merged into one model.pth file, though, since names of the
-    # weights is state dict are the same in each consolidated.NN.pth file. Thus, it is not
-    # currently supported.
-    # Along this, we need to copy the original/tokenizer.model file to tokenizer.model.tiktoken
-    is_llama3 = "Llama-3" in model_name
-    if is_llama3:
-        # Check if we have multiple original/consolidated.NN.pth files and report error
-        # if we do for Llama 3.
-        original_dir = checkpoint_dir / "original"
-        pattern = re.compile(r"^consolidated\.\d{2}\.pth$")
-        bin_files = [bin for bin in original_dir.iterdir() if pattern.match(bin.name)]
-        if len(bin_files) > 1:
-            raise ValueError(
-                f"Multiple consolidated.NN.pth files found in {original_dir}. "
-                "Merging them into one model.pth file is not supported for Llama 3.")
-
-
     config = ModelArgs.from_name(model_name)
     print(f"Model config {config.__dict__}")
 
     # Load the json file containing weight mapping
-    if not is_llama3:
-        model_map_json = checkpoint_dir / "pytorch_model.bin.index.json"
-
-        assert model_map_json.is_file()
-
-        with open(model_map_json) as json_map:
-            bin_index = json.load(json_map)
-
-        weight_map = {
-            "model.embed_tokens.weight": "tok_embeddings.weight",
-            "model.layers.{}.self_attn.q_proj.weight": "layers.{}.attention.wq.weight",
-            "model.layers.{}.self_attn.k_proj.weight": "layers.{}.attention.wk.weight",
-            "model.layers.{}.self_attn.v_proj.weight": "layers.{}.attention.wv.weight",
-            "model.layers.{}.self_attn.o_proj.weight": "layers.{}.attention.wo.weight",
-            'model.layers.{}.self_attn.rotary_emb.inv_freq': None,
-            'model.layers.{}.mlp.gate_proj.weight': 'layers.{}.feed_forward.w1.weight',
-            "model.layers.{}.mlp.up_proj.weight": "layers.{}.feed_forward.w3.weight",
-            "model.layers.{}.mlp.down_proj.weight": "layers.{}.feed_forward.w2.weight",
-            "model.layers.{}.input_layernorm.weight": "layers.{}.attention_norm.weight",
-            "model.layers.{}.post_attention_layernorm.weight": "layers.{}.ffn_norm.weight",
-            "model.norm.weight": "norm.weight",
-            "lm_head.weight": "output.weight",
-        }
-        bin_files = {checkpoint_dir / bin for bin in bin_index["weight_map"].values()}
-    else:
-        # There is no separate pytorch_model.bin.index.json file for llama3.
-        # Instead, we will just use all original/consolidated.NN.pth files.
-        # so, we use model.safetensors.index.json
-        weight_map = None
-        original_dir = checkpoint_dir / "original"
-        pattern = re.compile(r"^consolidated\.\d{2}\.pth$")
-        bin_files = {bin for bin in original_dir.iterdir() if pattern.match(bin.name)}
-
+    model_map_json = checkpoint_dir / "pytorch_model.bin.index.json"
+
+    assert model_map_json.is_file()
+
+    with open(model_map_json) as json_map:
+        bin_index = json.load(json_map)
+
+    weight_map = {
+        "model.embed_tokens.weight": "tok_embeddings.weight",
+        "model.layers.{}.self_attn.q_proj.weight": "layers.{}.attention.wq.weight",
+        "model.layers.{}.self_attn.k_proj.weight": "layers.{}.attention.wk.weight",
+        "model.layers.{}.self_attn.v_proj.weight": "layers.{}.attention.wv.weight",
+        "model.layers.{}.self_attn.o_proj.weight": "layers.{}.attention.wo.weight",
+        'model.layers.{}.self_attn.rotary_emb.inv_freq': None,
+        'model.layers.{}.mlp.gate_proj.weight': 'layers.{}.feed_forward.w1.weight',
+        "model.layers.{}.mlp.up_proj.weight": "layers.{}.feed_forward.w3.weight",
+        "model.layers.{}.mlp.down_proj.weight": "layers.{}.feed_forward.w2.weight",
+        "model.layers.{}.input_layernorm.weight": "layers.{}.attention_norm.weight",
+        "model.layers.{}.post_attention_layernorm.weight": "layers.{}.ffn_norm.weight",
+        "model.norm.weight": "norm.weight",
+        "lm_head.weight": "output.weight",
+    }
+    bin_files = {checkpoint_dir / bin for bin in bin_index["weight_map"].values()}
 
     def permute(w, n_head):
         dim = config.dim
@@ -98,41 +68,32 @@ def permute(w, n_head):
         state_dict = torch.load(str(file), map_location="cpu", mmap=True, weights_only=True)
         merged_result.update(state_dict)
     final_result = {}
-    if weight_map is not None:
-        for key, value in merged_result.items():
-            if "layers" in key:
-                abstract_key = re.sub(r'(\d+)', '{}', key)
-                layer_num = re.search(r'\d+', key).group(0)
-                new_key = weight_map[abstract_key]
-                if new_key is None:
-                    continue
-                new_key = new_key.format(layer_num)
-            else:
-                new_key = weight_map[key]
-
-            final_result[new_key] = value
-
-        for key in tuple(final_result.keys()):
-            if "wq" in key:
-                q = final_result[key]
-                k = final_result[key.replace("wq", "wk")]
-                v = final_result[key.replace("wq", "wv")]
-                q = permute(q, config.n_head)
-                k = permute(k, config.n_local_heads)
-                final_result[key.replace("wq", "wqkv")] = torch.cat([q, k, v])
-                del final_result[key]
-                del final_result[key.replace("wq", "wk")]
-                del final_result[key.replace("wq", "wv")]
-    else:
-        final_result = merged_result
+    for key, value in merged_result.items():
+        if "layers" in key:
+            abstract_key = re.sub(r'(\d+)', '{}', key)
+            layer_num = re.search(r'\d+', key).group(0)
+            new_key = weight_map[abstract_key]
+            if new_key is None:
+                continue
+            new_key = new_key.format(layer_num)
+        else:
+            new_key = weight_map[key]
+
+        final_result[new_key] = value
+
+    for key in tuple(final_result.keys()):
+        if "wq" in key:
+            q = final_result[key]
+            k = final_result[key.replace("wq", "wk")]
+            v = final_result[key.replace("wq", "wv")]
+            q = permute(q, config.n_head)
+            k = permute(k, config.n_local_heads)
+            final_result[key.replace("wq", "wqkv")] = torch.cat([q, k, v])
+            del final_result[key]
+            del final_result[key.replace("wq", "wk")]
+            del final_result[key.replace("wq", "wv")]
     print(f"Saving checkpoint to {checkpoint_dir / 'model.pth'}")
     torch.save(final_result, checkpoint_dir / "model.pth")
-    if is_llama3:
-        original_dir = checkpoint_dir / "original"
-        tokenizer_model = original_dir / "tokenizer.model"
-        tokenizer_model_tiktoken = checkpoint_dir / "tokenizer.model"
-        print(f"Copying {tokenizer_model} to {tokenizer_model_tiktoken}")
-        shutil.copy(tokenizer_model, tokenizer_model_tiktoken)
 
 if __name__ == '__main__':
     import argparse

From f73f835ba8e739f813229a2565a1880bad0d403c Mon Sep 17 00:00:00 2001
From: Srinivas Billa
Date: Mon, 29 Apr 2024 17:09:43 +0100
Subject: [PATCH 08/14] Update model.py

---
 model.py | 3 +--
 1 file changed, 1 insertion(+), 2 deletions(-)

diff --git a/model.py b/model.py
index 82ea55a..1dd5794 100644
--- a/model.py
+++ b/model.py
@@ -52,8 +52,7 @@ def from_name(cls, name: str):
             config.sort(key=len, reverse=True)
             assert len(config[0]) != len(config[1]), name # make sure only one 'best' match
 
-        # return cls(**transformer_configs[config[0]])
-        return cls(**dict(block_size=8192, n_layer=80, n_head=64, n_local_heads=8, dim=8192, intermediate_size=28672, vocab_size=128256))
+        return cls(**transformer_configs[config[0]])
 
 
 transformer_configs = {

From be1af66f82d0b50dbabc3e0056993d4f4705108c Mon Sep 17 00:00:00 2001
From: Srinivas Billa
Date: Mon, 29 Apr 2024 17:13:59 +0100
Subject: [PATCH 09/14] Update model.py

---
 model.py | 4 +++-
 1 file changed, 3 insertions(+), 1 deletion(-)

diff --git a/model.py b/model.py
index 1dd5794..7498966 100644
--- a/model.py
+++ b/model.py
@@ -44,7 +44,7 @@ def from_name(cls, name: str):
         if name in transformer_configs:
             return cls(**transformer_configs[name])
         # fuzzy search
-        config = [config for config in transformer_configs if config in str(name).upper() or config in str(name)]
+        config = [config for config in transformer_configs if config.lower() in str(name).lower() or config.lower() in str(name).lower()]
 
         # We may have two or more configs matched (e.g. "7B" and "Mistral-7B"). Find the best config match,
         # take longer name (as it have more symbols matched)
@@ -67,6 +67,8 @@ def from_name(cls, name: str):
     "stories110M": dict(n_layer=12, n_head=12, dim=768),
     "llama-3-8b-hf-pt": dict(block_size=8192, n_layer=32, n_head=32, n_local_heads=8, dim=4096, intermediate_size=14336, vocab_size=128256),
     "llama-3-70b-hf-pt": dict(block_size=8192, n_layer=80, n_head=64, n_local_heads=8, dim=8192, intermediate_size=28672, vocab_size=128256),
+    "llama-3-8b-instruct-hf-pt": dict(block_size=8192, n_layer=32, n_head=32, n_local_heads=8, dim=4096, intermediate_size=14336, vocab_size=128256),
+    "llama-3-70b-instruct-hf-pt": dict(block_size=8192, n_layer=80, n_head=64, n_local_heads=8, dim=8192, intermediate_size=28672, vocab_size=128256),
 }
 
 class KVCache(nn.Module):

From 5b7e3c79452944dcfc77246d74fae6c93a914a93 Mon Sep 17 00:00:00 2001
From: Srinivas Billa
Date: Mon, 29 Apr 2024 17:17:57 +0100
Subject: [PATCH 10/14] copy tokeniser

---
 scripts/convert_hf_checkpoint.py | 7 +++++++
 1 file changed, 7 insertions(+)

diff --git a/scripts/convert_hf_checkpoint.py b/scripts/convert_hf_checkpoint.py
index b92114c..ab49d2e 100644
--- a/scripts/convert_hf_checkpoint.py
+++ b/scripts/convert_hf_checkpoint.py
@@ -5,6 +5,7 @@
 # LICENSE file in the root directory of this source tree.
 import json
 import re
+import shutil
 import sys
 from pathlib import Path
 from typing import Optional
@@ -94,6 +95,12 @@ def permute(w, n_head):
             del final_result[key.replace("wq", "wv")]
     print(f"Saving checkpoint to {checkpoint_dir / 'model.pth'}")
     torch.save(final_result, checkpoint_dir / "model.pth")
+    if 'llama-3' in model_name.lower():
+        original_dir = checkpoint_dir / "original"
+        tokenizer_model = original_dir / "tokenizer.model"
+        tokenizer_model_tiktoken = checkpoint_dir / "tokenizer.model"
+        print(f"Copying {tokenizer_model} to {tokenizer_model_tiktoken}")
+        shutil.copy(tokenizer_model, tokenizer_model_tiktoken)
 
 if __name__ == '__main__':
     import argparse

From d0484b5f741f9cba95e9ec0acd3ce58e654f6258 Mon Sep 17 00:00:00 2001
From: Srinivas Billa
Date: Mon, 29 Apr 2024 17:20:51 +0100
Subject: [PATCH 11/14] Update tokenizer.py

---
 tokenizer.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/tokenizer.py b/tokenizer.py
index c62a0c5..32f7387 100644
--- a/tokenizer.py
+++ b/tokenizer.py
@@ -105,7 +105,7 @@ def get_tokenizer(tokenizer_model_path, model_name):
     Returns:
     - TokenizerInterface: An instance of a tokenizer.
     """
-    if "Llama-3" in str(model_name):
+    if "llama-3" in str(model_name).lower():
         return TiktokenWrapper(tokenizer_model_path)
     else:
         return SentencePieceWrapper(tokenizer_model_path)

From 059b2bf2f7cc4ed88fbcd638cd23bac0985a0945 Mon Sep 17 00:00:00 2001
From: Srinivas Billa
Date: Mon, 29 Apr 2024 17:27:12 +0100
Subject: [PATCH 12/14] blobfile needed for tiktoken

---
 requirements.txt | 1 +
 1 file changed, 1 insertion(+)

diff --git a/requirements.txt b/requirements.txt
index 04f828c..cac69db 100644
--- a/requirements.txt
+++ b/requirements.txt
@@ -1,3 +1,4 @@
 torch
 sentencepiece
 tiktoken
+blobfile

From 30d69b3245a29823e7c4c5ae6a1f48fa38267afd Mon Sep 17 00:00:00 2001
From: Artem Bolgar
Date: Mon, 29 Apr 2024 14:02:52 -0700
Subject: [PATCH 13/14] llama3 8B support, tiktoken tokenizer (#158)

* WIP: llama3 support, tiktoken tokenizer

* Finalizing
---
 eval.py                          |   6 +-
 generate.py                      |   9 +--
 mixtral-moe/generate.py          |   2 +-
 model.py                         |   1 +
 quantize.py                      |   6 +-
 requirements.txt                 |   1 +
 scripts/convert_hf_checkpoint.py | 133 ++++++++++++++++++++-----------
 tokenizer.py                     | 111 ++++++++++++++++++++++++++
 8 files changed, 210 insertions(+), 59 deletions(-)
 create mode 100644 tokenizer.py

diff --git a/eval.py b/eval.py
index 7e8f841..d38abf8 100644
--- a/eval.py
+++ b/eval.py
@@ -18,7 +18,7 @@
 torch._inductor.config.triton.cudagraphs = True
 torch._dynamo.config.cache_size_limit = 100000
 
-from sentencepiece import SentencePieceProcessor
+from tokenizer import get_tokenizer
 
 from model import Transformer
 
@@ -217,7 +217,7 @@ def main(
     assert checkpoint_path.is_file(), checkpoint_path
 
     tokenizer_path = checkpoint_path.parent / "tokenizer.model"
-    assert tokenizer_path.is_file(), tokenizer_path
+    assert tokenizer_path.is_file(), str(tokenizer_path)
 
     device = 'cuda'
     precision = torch.bfloat16
@@ -231,7 +231,7 @@ def main(
 
     model.eval()
 
-    tokenizer = SentencePieceProcessor(model_file=str(tokenizer_path))
+    tokenizer = get_tokenizer(tokenizer_path, checkpoint_path)
 
     torch.manual_seed(1234)

diff --git a/generate.py b/generate.py
index 8446d11..24ba553 100644
--- a/generate.py
+++ b/generate.py
@@ -32,10 +32,8 @@ def device_sync(device):
 wd = Path(__file__).parent.parent.resolve()
 sys.path.append(str(wd))
 
-from sentencepiece import SentencePieceProcessor
-
 from model import Transformer
-
+from tokenizer import get_tokenizer
 
 def multinomial_sample_one_no_sync(probs_sort): # Does multinomial sampling without a cuda synchronization
     q = torch.empty_like(probs_sort).exponential_(1)
@@ -269,7 +267,7 @@ def main(
     assert checkpoint_path.is_file(), checkpoint_path
 
     tokenizer_path = checkpoint_path.parent / "tokenizer.model"
-    assert tokenizer_path.is_file(), tokenizer_path
+    assert tokenizer_path.is_file(), str(tokenizer_path)
 
     global print
     from tp import maybe_init_dist
@@ -297,7 +295,8 @@ def main(
     device_sync(device=device) # MKG
     print(f"Time to load model: {time.time() - t0:.02f} seconds")
 
-    tokenizer = SentencePieceProcessor(model_file=str(tokenizer_path))
+    tokenizer = get_tokenizer(tokenizer_path, checkpoint_path)
+
     encoded = encode_tokens(tokenizer, prompt, bos=True, device=device)
     prompt_length = encoded.size(0)

diff --git a/mixtral-moe/generate.py b/mixtral-moe/generate.py
index ffe7113..9aa076b 100644
--- a/mixtral-moe/generate.py
+++ b/mixtral-moe/generate.py
@@ -175,7 +175,7 @@ def main(
     assert checkpoint_path.is_file(), checkpoint_path
 
     tokenizer_path = checkpoint_path.parent / "tokenizer.model"
-    assert tokenizer_path.is_file(), tokenizer_path
+    assert tokenizer_path.is_file(), str(tokenizer_path)
 
     global print
     rank = maybe_init_dist()

diff --git a/model.py b/model.py
index fbb6040..0660bc2 100644
--- a/model.py
+++ b/model.py
@@ -65,6 +65,7 @@ def from_name(cls, name: str):
     "Mistral-7B": dict(n_layer=32, n_head=32, n_local_heads=8, dim=4096, intermediate_size=14336, vocab_size=32000),
     "stories15M": dict(n_layer=6, n_head=6, dim=288),
     "stories110M": dict(n_layer=12, n_head=12, dim=768),
+    "Llama-3-8B": dict(block_size=8192, n_layer=32, n_head=32, n_local_heads=8, dim=4096, intermediate_size=14336, vocab_size=128256),
 }
 
 class KVCache(nn.Module):

diff --git a/quantize.py b/quantize.py
index af17a69..4ebbe5f 100644
--- a/quantize.py
+++ b/quantize.py
@@ -9,7 +9,7 @@
 import torch
 import torch.nn as nn
 import torch.nn.functional as F
-from sentencepiece import SentencePieceProcessor
+from tokenizer import get_tokenizer
 
 try:
     from GPTQ import GenericGPTQRunner, InputRecorder
@@ -578,8 +578,8 @@ def quantize(
         quant_handler = WeightOnlyInt4GPTQQuantHandler(model, groupsize)
 
         tokenizer_path = checkpoint_path.parent / "tokenizer.model"
-        assert tokenizer_path.is_file(), tokenizer_path
-        tokenizer = SentencePieceProcessor(model_file=str(tokenizer_path))
+        assert tokenizer_path.is_file(), str(tokenizer_path)
+        tokenizer = get_tokenizer(tokenizer_path, checkpoint_path)
 
         quantized_state_dict = quant_handler.create_quantized_state_dict(
             tokenizer,

diff --git a/requirements.txt b/requirements.txt
index 762cb09..04f828c 100644
--- a/requirements.txt
+++ b/requirements.txt
@@ -1,2 +1,3 @@
 torch
 sentencepiece
+tiktoken

diff --git a/scripts/convert_hf_checkpoint.py b/scripts/convert_hf_checkpoint.py
index b92114c..8a22106 100644
--- a/scripts/convert_hf_checkpoint.py
+++ b/scripts/convert_hf_checkpoint.py
@@ -5,6 +5,7 @@
 # LICENSE file in the root directory of this source tree.
 import json
 import re
+import shutil
 import sys
 from pathlib import Path
 from typing import Optional
@@ -27,33 +28,62 @@ def convert_hf_checkpoint(
     if model_name is None:
         model_name = checkpoint_dir.name
 
+    # Llama 3 8B doesn't need conversion; instead, the original/consolidated.NN.pth files
+    # need to be copied into model.pth.
+    # Llama 3 70B can't be easily merged into one model.pth file, though, since names of the
+    # weights is state dict are the same in each consolidated.NN.pth file. Thus, it is not
+    # currently supported.
+    # Along this, we need to copy the original/tokenizer.model file to tokenizer.model.tiktoken
+    is_llama3 = "Llama-3" in model_name
+    if is_llama3:
+        # Check if we have multiple original/consolidated.NN.pth files and report error
+        # if we do for Llama 3.
+        original_dir = checkpoint_dir / "original"
+        pattern = re.compile(r"^consolidated\.\d{2}\.pth$")
+        bin_files = [bin for bin in original_dir.iterdir() if pattern.match(bin.name)]
+        if len(bin_files) > 1:
+            raise ValueError(
+                f"Multiple consolidated.NN.pth files found in {original_dir}. "
+                "Merging them into one model.pth file is not supported for Llama 3.")
+
+
     config = ModelArgs.from_name(model_name)
     print(f"Model config {config.__dict__}")
 
     # Load the json file containing weight mapping
-    model_map_json = checkpoint_dir / "pytorch_model.bin.index.json"
-
-    assert model_map_json.is_file()
-
-    with open(model_map_json) as json_map:
-        bin_index = json.load(json_map)
-
-    weight_map = {
-        "model.embed_tokens.weight": "tok_embeddings.weight",
-        "model.layers.{}.self_attn.q_proj.weight": "layers.{}.attention.wq.weight",
-        "model.layers.{}.self_attn.k_proj.weight": "layers.{}.attention.wk.weight",
-        "model.layers.{}.self_attn.v_proj.weight": "layers.{}.attention.wv.weight",
-        "model.layers.{}.self_attn.o_proj.weight": "layers.{}.attention.wo.weight",
-        'model.layers.{}.self_attn.rotary_emb.inv_freq': None,
-        'model.layers.{}.mlp.gate_proj.weight': 'layers.{}.feed_forward.w1.weight',
-        "model.layers.{}.mlp.up_proj.weight": "layers.{}.feed_forward.w3.weight",
-        "model.layers.{}.mlp.down_proj.weight": "layers.{}.feed_forward.w2.weight",
-        "model.layers.{}.input_layernorm.weight": "layers.{}.attention_norm.weight",
-        "model.layers.{}.post_attention_layernorm.weight": "layers.{}.ffn_norm.weight",
-        "model.norm.weight": "norm.weight",
-        "lm_head.weight": "output.weight",
-    }
-    bin_files = {checkpoint_dir / bin for bin in bin_index["weight_map"].values()}
+    if not is_llama3:
+        model_map_json = checkpoint_dir / "pytorch_model.bin.index.json"
+
+        assert model_map_json.is_file()
+
+        with open(model_map_json) as json_map:
+            bin_index = json.load(json_map)
+
+        weight_map = {
+            "model.embed_tokens.weight": "tok_embeddings.weight",
+            "model.layers.{}.self_attn.q_proj.weight": "layers.{}.attention.wq.weight",
+            "model.layers.{}.self_attn.k_proj.weight": "layers.{}.attention.wk.weight",
+            "model.layers.{}.self_attn.v_proj.weight": "layers.{}.attention.wv.weight",
+            "model.layers.{}.self_attn.o_proj.weight": "layers.{}.attention.wo.weight",
+            'model.layers.{}.self_attn.rotary_emb.inv_freq': None,
+            'model.layers.{}.mlp.gate_proj.weight': 'layers.{}.feed_forward.w1.weight',
+            "model.layers.{}.mlp.up_proj.weight": "layers.{}.feed_forward.w3.weight",
+            "model.layers.{}.mlp.down_proj.weight": "layers.{}.feed_forward.w2.weight",
+            "model.layers.{}.input_layernorm.weight": "layers.{}.attention_norm.weight",
+            "model.layers.{}.post_attention_layernorm.weight": "layers.{}.ffn_norm.weight",
+            "model.norm.weight": "norm.weight",
+            "lm_head.weight": "output.weight",
+        }
+        bin_files = {checkpoint_dir / bin for bin in bin_index["weight_map"].values()}
+    else:
+        # There is no separate pytorch_model.bin.index.json file for llama3.
+        # Instead, we will just use all original/consolidated.NN.pth files.
+        # so, we use model.safetensors.index.json
+        weight_map = None
+        original_dir = checkpoint_dir / "original"
+        pattern = re.compile(r"^consolidated\.\d{2}\.pth$")
+        bin_files = {bin for bin in original_dir.iterdir() if pattern.match(bin.name)}
+
 
     def permute(w, n_head):
         dim = config.dim
@@ -68,32 +98,41 @@ def permute(w, n_head):
         state_dict = torch.load(str(file), map_location="cpu", mmap=True, weights_only=True)
         merged_result.update(state_dict)
     final_result = {}
-    for key, value in merged_result.items():
-        if "layers" in key:
-            abstract_key = re.sub(r'(\d+)', '{}', key)
-            layer_num = re.search(r'\d+', key).group(0)
-            new_key = weight_map[abstract_key]
-            if new_key is None:
-                continue
-            new_key = new_key.format(layer_num)
-        else:
-            new_key = weight_map[key]
-
-        final_result[new_key] = value
-
-    for key in tuple(final_result.keys()):
-        if "wq" in key:
-            q = final_result[key]
-            k = final_result[key.replace("wq", "wk")]
-            v = final_result[key.replace("wq", "wv")]
-            q = permute(q, config.n_head)
-            k = permute(k, config.n_local_heads)
-            final_result[key.replace("wq", "wqkv")] = torch.cat([q, k, v])
-            del final_result[key]
-            del final_result[key.replace("wq", "wk")]
-            del final_result[key.replace("wq", "wv")]
+    if weight_map is not None:
+        for key, value in merged_result.items():
+            if "layers" in key:
+                abstract_key = re.sub(r'(\d+)', '{}', key)
+                layer_num = re.search(r'\d+', key).group(0)
+                new_key = weight_map[abstract_key]
+                if new_key is None:
+                    continue
+                new_key = new_key.format(layer_num)
+            else:
+                new_key = weight_map[key]
+
+            final_result[new_key] = value
+
+        for key in tuple(final_result.keys()):
+            if "wq" in key:
+                q = final_result[key]
+                k = final_result[key.replace("wq", "wk")]
+                v = final_result[key.replace("wq", "wv")]
+                q = permute(q, config.n_head)
+                k = permute(k, config.n_local_heads)
+                final_result[key.replace("wq", "wqkv")] = torch.cat([q, k, v])
+                del final_result[key]
+                del final_result[key.replace("wq", "wk")]
+                del final_result[key.replace("wq", "wv")]
+    else:
+        final_result = merged_result
     print(f"Saving checkpoint to {checkpoint_dir / 'model.pth'}")
     torch.save(final_result, checkpoint_dir / "model.pth")
+    if is_llama3:
+        original_dir = checkpoint_dir / "original"
+        tokenizer_model = original_dir / "tokenizer.model"
+        tokenizer_model_tiktoken = checkpoint_dir / "tokenizer.model"
+        print(f"Copying {tokenizer_model} to {tokenizer_model_tiktoken}")
+        shutil.copy(tokenizer_model, tokenizer_model_tiktoken)
 
 if __name__ == '__main__':
     import argparse

diff --git a/tokenizer.py b/tokenizer.py
new file mode 100644
index 0000000..c62a0c5
--- /dev/null
+++ b/tokenizer.py
@@ -0,0 +1,111 @@
+import os
+import sentencepiece as spm
+import tiktoken
+from tiktoken.load import load_tiktoken_bpe
+from pathlib import Path
+from typing import Dict
+
+class TokenizerInterface:
+    def __init__(self, model_path):
+        self.model_path = model_path
+
+    def encode(self, text):
+        raise NotImplementedError("This method should be overridden by subclasses.")
+
+    def decode(self, tokens):
+        raise NotImplementedError("This method should be overridden by subclasses.")
+
+    def bos_id(self):
+        raise NotImplementedError("This method should be overridden by subclasses.")
+
+    def eos_id(self):
+        raise NotImplementedError("This method should be overridden by subclasses.")
+
+class SentencePieceWrapper(TokenizerInterface):
+    def __init__(self, model_path):
+        super().__init__(model_path)
+        self.processor = spm.SentencePieceProcessor(str(model_path))
+
+    def encode(self, text):
+        return self.processor.EncodeAsIds(text)
+
+    def decode(self, tokens):
+        return self.processor.DecodeIds(tokens)
+
+    def bos_id(self):
+        return self.processor.bos_id()
+
+    def eos_id(self):
+        return self.processor.eos_id()
+
+class TiktokenWrapper(TokenizerInterface):
+    """
+    Tokenizing and encoding/decoding text using the Tiktoken tokenizer.
+    """
+
+    special_tokens: Dict[str, int]
+
+    num_reserved_special_tokens = 256
+
+    pat_str = r"(?i:'s|'t|'re|'ve|'m|'ll|'d)|[^\r\n\p{L}\p{N}]?\p{L}+|\p{N}{1,3}| ?[^\s\p{L}\p{N}]+[\r\n]*|\s*[\r\n]+|\s+(?!\S)|\s+"  # noqa: E501
+
+    def __init__(self, model_path):
+        super().__init__(model_path)
+        assert os.path.isfile(model_path), str(model_path)
+        mergeable_ranks = load_tiktoken_bpe(str(model_path))
+        num_base_tokens = len(mergeable_ranks)
+        special_tokens = [
+            "<|begin_of_text|>",
+            "<|end_of_text|>",
+            "<|reserved_special_token_0|>",
+            "<|reserved_special_token_1|>",
+            "<|reserved_special_token_2|>",
+            "<|reserved_special_token_3|>",
+            "<|start_header_id|>",
+            "<|end_header_id|>",
+            "<|reserved_special_token_4|>",
+            "<|eot_id|>",  # end of turn
+        ] + [
+            f"<|reserved_special_token_{i}|>"
+            for i in range(5, self.num_reserved_special_tokens - 5)
+        ]
+        self.special_tokens = {
+            token: num_base_tokens + i for i, token in enumerate(special_tokens)
+        }
+        self.model = tiktoken.Encoding(
+            name=Path(model_path).name,
+            pat_str=self.pat_str,
+            mergeable_ranks=mergeable_ranks,
+            special_tokens=self.special_tokens,
+        )
+        # BOS / EOS token IDs
+        self._bos_id: int = self.special_tokens["<|begin_of_text|>"]
+        self._eos_id: int = self.special_tokens["<|end_of_text|>"]
+
+    def encode(self, text):
+        return self.model.encode(text)
+
+    def decode(self, tokens):
+        return self.model.decode(tokens)
+
+    def bos_id(self):
+        return self._bos_id
+
+    def eos_id(self):
+        return self._eos_id
+
+def get_tokenizer(tokenizer_model_path, model_name):
+    """
+    Factory function to get the appropriate tokenizer based on the model name.
+
+    Args:
+    - tokenizer_model_path (str): The file path to the tokenizer model.
+    - model_name (str): The name of the model, used to determine the tokenizer type.
+
+    Returns:
+    - TokenizerInterface: An instance of a tokenizer.
+    """
+    if "Llama-3" in str(model_name):
+        return TiktokenWrapper(tokenizer_model_path)
+    else:
+        return SentencePieceWrapper(tokenizer_model_path)

From 80e36227f148a9f4c33305e28e5555985cec8ab1 Mon Sep 17 00:00:00 2001
From: Srinivas Billa
Date: Mon, 29 Apr 2024 22:24:42 +0100
Subject: [PATCH 14/14] fix

---
 scripts/convert_hf_checkpoint.py | 127 ++++++++++++-------------------
 1 file changed, 47 insertions(+), 80 deletions(-)

diff --git a/scripts/convert_hf_checkpoint.py b/scripts/convert_hf_checkpoint.py
index 8ac47dd..ab49d2e 100644
--- a/scripts/convert_hf_checkpoint.py
+++ b/scripts/convert_hf_checkpoint.py
@@ -28,62 +28,33 @@ def convert_hf_checkpoint(
     if model_name is None:
         model_name = checkpoint_dir.name
 
-    # Llama 3 8B doesn't need conversion; instead, the original/consolidated.NN.pth files
-    # need to be copied into model.pth.
-    # Llama 3 70B can't be easily merged into one model.pth file, though, since names of the
-    # weights is state dict are the same in each consolidated.NN.pth file. Thus, it is not
-    # currently supported.
-    # Along this, we need to copy the original/tokenizer.model file to tokenizer.model.tiktoken
-    is_llama3 = "Llama-3" in model_name
-    if is_llama3:
-        # Check if we have multiple original/consolidated.NN.pth files and report error
-        # if we do for Llama 3.
-        original_dir = checkpoint_dir / "original"
-        pattern = re.compile(r"^consolidated\.\d{2}\.pth$")
-        bin_files = [bin for bin in original_dir.iterdir() if pattern.match(bin.name)]
-        if len(bin_files) > 1:
-            raise ValueError(
-                f"Multiple consolidated.NN.pth files found in {original_dir}. "
-                "Merging them into one model.pth file is not supported for Llama 3.")
-
-
     config = ModelArgs.from_name(model_name)
     print(f"Model config {config.__dict__}")
 
     # Load the json file containing weight mapping
-    if not is_llama3:
-        model_map_json = checkpoint_dir / "pytorch_model.bin.index.json"
-
-        assert model_map_json.is_file()
-
-        with open(model_map_json) as json_map:
-            bin_index = json.load(json_map)
-
-        weight_map = {
-            "model.embed_tokens.weight": "tok_embeddings.weight",
-            "model.layers.{}.self_attn.q_proj.weight": "layers.{}.attention.wq.weight",
-            "model.layers.{}.self_attn.k_proj.weight": "layers.{}.attention.wk.weight",
-            "model.layers.{}.self_attn.v_proj.weight": "layers.{}.attention.wv.weight",
-            "model.layers.{}.self_attn.o_proj.weight": "layers.{}.attention.wo.weight",
-            'model.layers.{}.self_attn.rotary_emb.inv_freq': None,
-            'model.layers.{}.mlp.gate_proj.weight': 'layers.{}.feed_forward.w1.weight',
-            "model.layers.{}.mlp.up_proj.weight": "layers.{}.feed_forward.w3.weight",
-            "model.layers.{}.mlp.down_proj.weight": "layers.{}.feed_forward.w2.weight",
-            "model.layers.{}.input_layernorm.weight": "layers.{}.attention_norm.weight",
-            "model.layers.{}.post_attention_layernorm.weight": "layers.{}.ffn_norm.weight",
-            "model.norm.weight": "norm.weight",
-            "lm_head.weight": "output.weight",
-        }
-        bin_files = {checkpoint_dir / bin for bin in bin_index["weight_map"].values()}
-    else:
-        # There is no separate pytorch_model.bin.index.json file for llama3.
-        # Instead, we will just use all original/consolidated.NN.pth files.
-        # so, we use model.safetensors.index.json
-        weight_map = None
-        original_dir = checkpoint_dir / "original"
-        pattern = re.compile(r"^consolidated\.\d{2}\.pth$")
-        bin_files = {bin for bin in original_dir.iterdir() if pattern.match(bin.name)}
-
+    model_map_json = checkpoint_dir / "pytorch_model.bin.index.json"
+
+    assert model_map_json.is_file()
+
+    with open(model_map_json) as json_map:
+        bin_index = json.load(json_map)
+
+    weight_map = {
+        "model.embed_tokens.weight": "tok_embeddings.weight",
+        "model.layers.{}.self_attn.q_proj.weight": "layers.{}.attention.wq.weight",
+        "model.layers.{}.self_attn.k_proj.weight": "layers.{}.attention.wk.weight",
+        "model.layers.{}.self_attn.v_proj.weight": "layers.{}.attention.wv.weight",
+        "model.layers.{}.self_attn.o_proj.weight": "layers.{}.attention.wo.weight",
+        'model.layers.{}.self_attn.rotary_emb.inv_freq': None,
+        'model.layers.{}.mlp.gate_proj.weight': 'layers.{}.feed_forward.w1.weight',
+        "model.layers.{}.mlp.up_proj.weight": "layers.{}.feed_forward.w3.weight",
+        "model.layers.{}.mlp.down_proj.weight": "layers.{}.feed_forward.w2.weight",
+        "model.layers.{}.input_layernorm.weight": "layers.{}.attention_norm.weight",
+        "model.layers.{}.post_attention_layernorm.weight": "layers.{}.ffn_norm.weight",
+        "model.norm.weight": "norm.weight",
+        "lm_head.weight": "output.weight",
+    }
+    bin_files = {checkpoint_dir / bin for bin in bin_index["weight_map"].values()}
 
     def permute(w, n_head):
         dim = config.dim
@@ -98,36 +69,32 @@ def permute(w, n_head):
         state_dict = torch.load(str(file), map_location="cpu", mmap=True, weights_only=True)
         merged_result.update(state_dict)
     final_result = {}
-    if weight_map is not None:
-        for key, value in merged_result.items():
-            if "layers" in key:
-                abstract_key = re.sub(r'(\d+)', '{}', key)
-                layer_num = re.search(r'\d+', key).group(0)
-                new_key = weight_map[abstract_key]
-                if new_key is None:
-                    continue
-                new_key = new_key.format(layer_num)
-            else:
-                new_key = weight_map[key]
-
-            final_result[new_key] = value
-
-        for key in tuple(final_result.keys()):
-            if "wq" in key:
-                q = final_result[key]
-                k = final_result[key.replace("wq", "wk")]
-                v = final_result[key.replace("wq", "wv")]
-                q = permute(q, config.n_head)
-                k = permute(k, config.n_local_heads)
-                final_result[key.replace("wq", "wqkv")] = torch.cat([q, k, v])
-                del final_result[key]
-                del final_result[key.replace("wq", "wk")]
-                del final_result[key.replace("wq", "wv")]
-    else:
-        final_result = merged_result
+    for key, value in merged_result.items():
+        if "layers" in key:
+            abstract_key = re.sub(r'(\d+)', '{}', key)
+            layer_num = re.search(r'\d+', key).group(0)
+            new_key = weight_map[abstract_key]
+            if new_key is None:
+                continue
+            new_key = new_key.format(layer_num)
+        else:
+            new_key = weight_map[key]
+
+        final_result[new_key] = value
+
+    for key in tuple(final_result.keys()):
+        if "wq" in key:
+            q = final_result[key]
+            k = final_result[key.replace("wq", "wk")]
+            v = final_result[key.replace("wq", "wv")]
+            q = permute(q, config.n_head)
+            k = permute(k, config.n_local_heads)
+            final_result[key.replace("wq", "wqkv")] = torch.cat([q, k, v])
+            del final_result[key]
+            del final_result[key.replace("wq", "wk")]
+            del final_result[key.replace("wq", "wv")]
     print(f"Saving checkpoint to {checkpoint_dir / 'model.pth'}")
     torch.save(final_result, checkpoint_dir / "model.pth")
-
     if 'llama-3' in model_name.lower():
         original_dir = checkpoint_dir / "original"
         tokenizer_model = original_dir / "tokenizer.model"