Skip to content

Commit

Permalink
Merge branch 'main' into jhen-dev
Browse files — browse the repository at this point in the history
  • Loading branch information
jhen0409 committed Jan 13, 2025
2 parents ba19a63 + 407b2db commit 82d3ce6
Show file tree
Hide file tree
Showing 6 changed files with 80 additions and 26 deletions.
34 changes: 17 additions & 17 deletions .github/workflows/build-release.yml
Original file line number Diff line number Diff line change
Expand Up @@ -3,27 +3,27 @@ on:
workflow_dispatch:
inputs:
upload-artifacts:
type: boolean
default: false
type: string
default: 'NO'
artifacts-retention-days:
type: number
default: 1
workflow_call:
inputs:
upload-artifacts:
type: boolean
default: false
type: string
default: 'YES'
artifacts-retention-days:
type: number
default: 3

jobs:
build-linux-x86_64:
runs-on: ubuntu-latest
runs-on: ubuntu-22.04
steps:
- uses: actions/checkout@v4
with:
submodules: 'true'
submodules: "true"
- name: node_modules cache
uses: actions/[email protected]
with:
Expand All @@ -46,19 +46,19 @@ jobs:
bash ./scripts/prepare-linux.sh
bash ./scripts/build-linux.sh
- name: Upload build artifacts
if: inputs.upload-artifacts == 'true'
if: github.event.inputs.upload-artifacts == 'YES' || inputs.upload-artifacts == 'YES'
uses: actions/upload-artifact@v4
with:
name: bin-linux-x86_64
path: bin
retention-days: ${{ inputs.artifacts-retention-days }}

build-linux-arm64:
runs-on: ubuntu-latest
runs-on: ubuntu-22.04
steps:
- uses: actions/checkout@v4
with:
submodules: 'true'
submodules: "true"
- name: node_modules cache
uses: actions/[email protected]
with:
Expand All @@ -72,7 +72,7 @@ jobs:
- uses: actions/[email protected]
with:
node-version: 20
cache: 'yarn'
cache: "yarn"
- name: Install dependencies
run: yarn install
- name: Setup QEMU
Expand All @@ -89,7 +89,7 @@ jobs:
arm64v8/ubuntu:latest \
bash -c "./scripts/prepare-linux.sh && ./scripts/build-linux.sh"
- name: Upload build artifacts
if: inputs.upload-artifacts == 'true'
if: github.event.inputs.upload-artifacts == 'YES' || inputs.upload-artifacts == 'YES'
uses: actions/upload-artifact@v4
with:
name: bin-linux-arm64
Expand All @@ -104,7 +104,7 @@ jobs:
steps:
- uses: actions/checkout@v4
with:
submodules: 'true'
submodules: "true"
- name: node_modules cache
uses: actions/[email protected]
with:
Expand All @@ -118,13 +118,13 @@ jobs:
- uses: actions/[email protected]
with:
node-version: 20
cache: 'yarn'
cache: "yarn"
- name: Install dependencies
run: yarn install
- name: Build (macOS)
run: bash ./scripts/build-macos.sh
- name: Upload build artifacts
if: inputs.upload-artifacts == 'true'
if: github.event.inputs.upload-artifacts == 'YES' || inputs.upload-artifacts == 'YES'
uses: actions/upload-artifact@v4
with:
name: bin-${{ matrix.os }}
Expand All @@ -136,7 +136,7 @@ jobs:
steps:
- uses: actions/checkout@v4
with:
submodules: 'true'
submodules: "true"
- name: node_modules cache
uses: actions/[email protected]
with:
Expand All @@ -150,15 +150,15 @@ jobs:
- uses: actions/[email protected]
with:
node-version: 20
cache: 'yarn'
cache: "yarn"
- name: Install dependencies
run: yarn install
- name: Prepare & build
run: |
powershell ./scripts/prepare-windows.ps1
powershell ./scripts/build-windows.ps1
- name: Upload build artifacts
if: inputs.upload-artifacts == 'true'
if: github.event.inputs.upload-artifacts == 'YES' || inputs.upload-artifacts == 'YES'
uses: actions/upload-artifact@v4
with:
name: bin-windows
Expand Down
4 changes: 0 additions & 4 deletions .github/workflows/release.yml
Original file line number Diff line number Diff line change
Expand Up @@ -8,16 +8,12 @@ on:
jobs:
build:
uses: ./.github/workflows/build-release.yml
with:
upload-artifacts: true
artifacts-retention-days: 3
publish:
permissions:
contents: write
pull-requests: read
needs: build
runs-on: ubuntu-latest
if: startsWith(github.ref, 'refs/tags/v')
steps:
- uses: actions/checkout@v4
with:
Expand Down
3 changes: 2 additions & 1 deletion .release-it.json
Original file line number Diff line number Diff line change
@@ -1,7 +1,8 @@
{
"$schema": "https://unpkg.com/release-it@17/schema/release-it.json",
"git": {
"commitMessage": "chore: release v${version}"
"commitMessage": "chore: release v${version}",
"requireCleanWorkingDir": false
},
"github": {
"release": true
Expand Down
22 changes: 21 additions & 1 deletion lib/binding.ts
Original file line number Diff line number Diff line change
Expand Up @@ -9,11 +9,31 @@ export type LlamaModelOptions = {
model: string
embedding?: boolean
embd_normalize?: number
pooling_type?: number
pooling_type?: 'none' | 'mean' | 'cls' | 'last' | 'rank'
n_ctx?: number
n_batch?: number
n_ubatch?: number
n_threads?: number
n_gpu_layers?: number
flash_attn?: boolean
cache_type_k?:
| 'f16'
| 'f32'
| 'q8_0'
| 'q4_0'
| 'q4_1'
| 'iq4_nl'
| 'q5_0'
| 'q5_1'
cache_type_v?:
| 'f16'
| 'f32'
| 'q8_0'
| 'q4_0'
| 'q4_1'
| 'iq4_nl'
| 'q5_0'
| 'q5_1'
use_mlock?: boolean
use_mmap?: boolean
vocab_only?: boolean
Expand Down
2 changes: 1 addition & 1 deletion package.json
Original file line number Diff line number Diff line change
@@ -1,7 +1,7 @@
{
"name": "@fugood/llama.node",
"access": "public",
"version": "0.3.4",
"version": "0.3.6",
"description": "Llama.cpp for Node.js",
"main": "lib/index.js",
"scripts": {
Expand Down
41 changes: 39 additions & 2 deletions src/LlamaContext.cpp
Original file line number Diff line number Diff line change
@@ -1,3 +1,4 @@
#include "ggml.h"
#include "LlamaContext.h"
#include "DetokenizeWorker.h"
#include "DisposeWorker.h"
Expand Down Expand Up @@ -60,6 +61,36 @@ void LlamaContext::Init(Napi::Env env, Napi::Object &exports) {
exports.Set("LlamaContext", func);
}

// KV-cache quantization types accepted for the cache_type_k / cache_type_v
// options. kv_cache_type_from_str() resolves an option string by comparing
// it against ggml_type_name() of each entry, so this list defines exactly
// which names are valid; order only affects lookup order.
// NOTE(review): GGML_TYPE_BF16 is accepted here, but 'bf16' is not listed
// in the LlamaModelOptions cache_type_k/v unions in lib/binding.ts —
// confirm whether the TS type should include it.
const std::vector<ggml_type> kv_cache_types = {
GGML_TYPE_F32,
GGML_TYPE_F16,
GGML_TYPE_BF16,
GGML_TYPE_Q8_0,
GGML_TYPE_Q4_0,
GGML_TYPE_Q4_1,
GGML_TYPE_IQ4_NL,
GGML_TYPE_Q5_0,
GGML_TYPE_Q5_1,
};

// Map a user-supplied KV-cache type name (e.g. "f16", "q8_0") to its
// ggml_type by matching against ggml_type_name() of each supported entry.
// Throws std::runtime_error when the name is not in kv_cache_types.
static ggml_type kv_cache_type_from_str(const std::string & s) {
  for (size_t i = 0; i < kv_cache_types.size(); ++i) {
    const ggml_type candidate = kv_cache_types[i];
    if (s == ggml_type_name(candidate)) {
      return candidate;
    }
  }
  throw std::runtime_error("Unsupported cache type: " + s);
}

// Translate a pooling-type option string ("none", "mean", "cls", "last",
// "rank") into the corresponding llama_pooling_type value. Any other
// string (including the empty default) yields
// LLAMA_POOLING_TYPE_UNSPECIFIED so llama.cpp picks the model's default.
static int32_t pooling_type_from_str(const std::string & s) {
  struct Entry { const char * name; int32_t value; };
  static const Entry entries[] = {
      {"none", LLAMA_POOLING_TYPE_NONE},
      {"mean", LLAMA_POOLING_TYPE_MEAN},
      {"cls",  LLAMA_POOLING_TYPE_CLS},
      {"last", LLAMA_POOLING_TYPE_LAST},
      {"rank", LLAMA_POOLING_TYPE_RANK},
  };
  for (const auto & entry : entries) {
    if (s == entry.name) {
      return entry.value;
    }
  }
  return LLAMA_POOLING_TYPE_UNSPECIFIED;
}

// construct({ model, embedding, n_ctx, n_batch, n_threads, n_gpu_layers,
// use_mlock, use_mmap }): LlamaContext throws error
LlamaContext::LlamaContext(const Napi::CallbackInfo &info)
Expand All @@ -83,18 +114,24 @@ LlamaContext::LlamaContext(const Napi::CallbackInfo &info)

params.n_ctx = get_option<int32_t>(options, "n_ctx", 512);
params.n_batch = get_option<int32_t>(options, "n_batch", 2048);
params.n_ubatch = get_option<int32_t>(options, "n_ubatch", 512);
params.embedding = get_option<bool>(options, "embedding", false);
if (params.embedding) {
// For non-causal models, batch size must be equal to ubatch size
params.n_ubatch = params.n_batch;
}
params.embd_normalize = get_option<int32_t>(options, "embd_normalize", 2);
int32_t pooling_type = get_option<int32_t>(options, "pooling_type", -1);
params.pooling_type = (enum llama_pooling_type) pooling_type;
params.pooling_type = (enum llama_pooling_type) pooling_type_from_str(
get_option<std::string>(options, "pooling_type", "").c_str()
);

params.cpuparams.n_threads =
get_option<int32_t>(options, "n_threads", cpu_get_num_math() / 2);
params.n_gpu_layers = get_option<int32_t>(options, "n_gpu_layers", -1);
params.flash_attn = get_option<bool>(options, "flash_attn", false);
params.cache_type_k = kv_cache_type_from_str(get_option<std::string>(options, "cache_type_k", "f16").c_str());
params.cache_type_v = kv_cache_type_from_str(get_option<std::string>(options, "cache_type_v", "f16").c_str());

params.use_mlock = get_option<bool>(options, "use_mlock", false);
params.use_mmap = get_option<bool>(options, "use_mmap", true);
params.numa =
Expand Down

0 comments on commit 82d3ce6

Please sign in to comment.