diff --git a/scripts/llama.cpp.patch b/scripts/llama.cpp.patch
index ca29828..fc292df 100644
--- a/scripts/llama.cpp.patch
+++ b/scripts/llama.cpp.patch
@@ -1,27 +1,27 @@
-diff --git a/src/llama.cpp/common/common.h b/src/llama.cpp/common/common.h
-index 1d2bd932..b5007c66 100644
---- a/src/llama.cpp/common/common.h
-+++ b/src/llama.cpp/common/common.h
-@@ -183,6 +183,7 @@ struct common_params_vocoder {
- };
- 
- struct common_params {
-+    bool vocab_only = false;
-     int32_t n_predict = -1; // new tokens to predict
-     int32_t n_ctx = 4096; // context size
-     int32_t n_batch = 2048; // logical batch size for prompt processing (must be >=32 to use BLAS)
 diff --git a/src/llama.cpp/common/common.cpp b/src/llama.cpp/common/common.cpp
-index 20be9291..1bedc55d 100644
+index 451826d5..a85ac028 100644
 --- a/src/llama.cpp/common/common.cpp
 +++ b/src/llama.cpp/common/common.cpp
-@@ -1017,6 +1017,7 @@ struct llama_model_params common_model_params_to_llama(common_params & params) {
+@@ -1043,6 +1043,7 @@ struct llama_model_params common_model_params_to_llama(common_params & params) {
      if (params.n_gpu_layers != -1) {
          mparams.n_gpu_layers = params.n_gpu_layers;
      }
 +    mparams.vocab_only = params.vocab_only;
-     mparams.rpc_servers = params.rpc_servers.c_str();
      mparams.main_gpu = params.main_gpu;
      mparams.split_mode = params.split_mode;
+     mparams.tensor_split = params.tensor_split;
+diff --git a/src/llama.cpp/common/common.h b/src/llama.cpp/common/common.h
+index 3bcc637c..19ae7dad 100644
+--- a/src/llama.cpp/common/common.h
++++ b/src/llama.cpp/common/common.h
+@@ -189,6 +189,7 @@ struct common_params_vocoder {
+ };
+ 
+ struct common_params {
++    bool vocab_only = false;
+     int32_t n_predict = -1; // new tokens to predict
+     int32_t n_ctx = 4096; // context size
+     int32_t n_batch = 2048; // logical batch size for prompt processing (must be >=32 to use BLAS)
 diff --git a/src/llama.cpp/ggml/src/ggml-cpu/CMakeLists.txt b/src/llama.cpp/ggml/src/ggml-cpu/CMakeLists.txt
 index 6b3641c4..6d6cb27f 100644
 --- a/src/llama.cpp/ggml/src/ggml-cpu/CMakeLists.txt
diff --git a/src/EmbeddingWorker.cpp b/src/EmbeddingWorker.cpp
index 0ad8d35..86da8d2 100644
--- a/src/EmbeddingWorker.cpp
+++ b/src/EmbeddingWorker.cpp
@@ -9,10 +9,11 @@ void EmbeddingWorker::Execute() {
   llama_kv_cache_clear(_sess->context());
   auto tokens = ::common_tokenize(_sess->context(), _text, true);
   // add SEP if not present
-  if (tokens.empty() || tokens.back() != llama_token_sep(_sess->model())) {
-    tokens.push_back(llama_token_sep(_sess->model()));
+  auto vocab = llama_model_get_vocab(_sess->model());
+  if (tokens.empty() || tokens.back() != llama_vocab_sep(vocab)) {
+    tokens.push_back(llama_vocab_sep(vocab));
   }
-  const int n_embd = llama_n_embd(_sess->model());
+  const int n_embd = llama_model_n_embd(_sess->model());
   do {
     auto ctx = _sess->context();
     int ret =
diff --git a/src/LlamaCompletionWorker.cpp b/src/LlamaCompletionWorker.cpp
index 2ff96d3..e21f310 100644
--- a/src/LlamaCompletionWorker.cpp
+++ b/src/LlamaCompletionWorker.cpp
@@ -59,7 +59,9 @@ void LlamaCompletionWorker::Execute() {
   size_t n_cur = 0;
   size_t n_input = 0;
   const auto model = _sess->model();
-  const bool add_bos = llama_add_bos_token(model);
+  auto vocab = llama_model_get_vocab(model);
+
+  const bool add_bos = llama_vocab_get_add_bos(vocab);
   auto ctx = _sess->context();
 
   auto sparams = llama_sampler_chain_default_params();
@@ -130,7 +132,7 @@ void LlamaCompletionWorker::Execute() {
       });
     }
     // is it an end of generation?
-    if (llama_token_is_eog(model, new_token_id)) {
+    if (llama_vocab_is_eog(vocab, new_token_id)) {
       break;
     }
     // check for stop words
diff --git a/src/LlamaContext.cpp b/src/LlamaContext.cpp
index e60480a..cd73027 100644
--- a/src/LlamaContext.cpp
+++ b/src/LlamaContext.cpp
@@ -162,8 +162,8 @@ bool validateModelChatTemplate(const struct llama_model * model) {
   int32_t res = llama_model_meta_val_str(model, template_key.c_str(), model_template.data(), model_template.size());
   if (res >= 0) {
     llama_chat_message chat[] = {{"user", "test"}};
-    std::string tmpl = std::string(model_template.data(), model_template.size());
-    int32_t chat_res = llama_chat_apply_template(model, tmpl.c_str(), chat, 1, true, nullptr, 0);
+    const char * tmpl = llama_model_chat_template(model);
+    int32_t chat_res = llama_chat_apply_template(tmpl, chat, 1, true, nullptr, 0);
     return chat_res > 0;
   }
   return res > 0;
diff --git a/src/llama.cpp b/src/llama.cpp
index c05e8c9..92bc493 160000
--- a/src/llama.cpp
+++ b/src/llama.cpp
@@ -1 +1 @@
-Subproject commit c05e8c9934f94fde49bc1bc9dc51eed282605150
+Subproject commit 92bc493917d43b83e592349e138b54c90b1c3ea7
diff --git a/test/__snapshots__/index.test.ts.snap b/test/__snapshots__/index.test.ts.snap
index e9329ff..902f77e 100644
--- a/test/__snapshots__/index.test.ts.snap
+++ b/test/__snapshots__/index.test.ts.snap
@@ -444,7 +444,7 @@ exports[`works fine with vocab_only: empty result 1`] = `
 exports[`works fine with vocab_only: model info 1`] = `
 {
-  "desc": "llama ?B all F32",
+  "desc": "",
   "isChatTemplateSupported": false,
   "metadata": {
     "general.architecture": "llama",