diff --git a/.github/workflows/build-release.yml b/.github/workflows/build-release.yml
index c3ea1f2..2aebe93 100644
--- a/.github/workflows/build-release.yml
+++ b/.github/workflows/build-release.yml
@@ -3,27 +3,27 @@ on:
   workflow_dispatch:
     inputs:
       upload-artifacts:
-        type: boolean
-        default: false
+        type: string
+        default: 'NO'
       artifacts-retention-days:
         type: number
         default: 1
   workflow_call:
     inputs:
       upload-artifacts:
-        type: boolean
-        default: false
+        type: string
+        default: 'YES'
       artifacts-retention-days:
         type: number
         default: 3
 
 jobs:
   build-linux-x86_64:
-    runs-on: ubuntu-latest
+    runs-on: ubuntu-22.04
     steps:
       - uses: actions/checkout@v4
         with:
-          submodules: 'true'
+          submodules: "true"
       - name: node_modules cache
         uses: actions/cache@v4.0.2
         with:
@@ -46,7 +46,7 @@ jobs:
           bash ./scripts/prepare-linux.sh
           bash ./scripts/build-linux.sh
       - name: Upload build artifacts
-        if: inputs.upload-artifacts == 'true'
+        if: github.event.inputs.upload-artifacts == 'YES' || inputs.upload-artifacts == 'YES'
         uses: actions/upload-artifact@v4
         with:
           name: bin-linux-x86_64
@@ -54,11 +54,11 @@ jobs:
           retention-days: ${{ inputs.artifacts-retention-days }}
 
   build-linux-arm64:
-    runs-on: ubuntu-latest
+    runs-on: ubuntu-22.04
     steps:
       - uses: actions/checkout@v4
         with:
-          submodules: 'true'
+          submodules: "true"
       - name: node_modules cache
         uses: actions/cache@v4.0.2
         with:
@@ -72,7 +72,7 @@ jobs:
       - uses: actions/setup-node@v4.0.2
         with:
           node-version: 20
-          cache: 'yarn'
+          cache: "yarn"
       - name: Install dependencies
         run: yarn install
       - name: Setup QEMU
@@ -89,7 +89,7 @@ jobs:
             arm64v8/ubuntu:latest \
             bash -c "./scripts/prepare-linux.sh && ./scripts/build-linux.sh"
       - name: Upload build artifacts
-        if: inputs.upload-artifacts == 'true'
+        if: github.event.inputs.upload-artifacts == 'YES' || inputs.upload-artifacts == 'YES'
         uses: actions/upload-artifact@v4
         with:
           name: bin-linux-arm64
@@ -104,7 +104,7 @@ jobs:
     steps:
       - uses: actions/checkout@v4
         with:
-          submodules: 'true'
+          submodules: "true"
       - name: node_modules cache
         uses: actions/cache@v4.0.2
         with:
@@ -118,13 +118,13 @@ jobs:
       - uses: actions/setup-node@v4.0.2
         with:
           node-version: 20
-          cache: 'yarn'
+          cache: "yarn"
       - name: Install dependencies
         run: yarn install
       - name: Build (macOS)
         run: bash ./scripts/build-macos.sh
       - name: Upload build artifacts
-        if: inputs.upload-artifacts == 'true'
+        if: github.event.inputs.upload-artifacts == 'YES' || inputs.upload-artifacts == 'YES'
         uses: actions/upload-artifact@v4
         with:
           name: bin-${{ matrix.os }}
@@ -136,7 +136,7 @@ jobs:
     steps:
       - uses: actions/checkout@v4
         with:
-          submodules: 'true'
+          submodules: "true"
       - name: node_modules cache
         uses: actions/cache@v4.0.2
         with:
@@ -150,7 +150,7 @@ jobs:
       - uses: actions/setup-node@v4.0.2
         with:
           node-version: 20
-          cache: 'yarn'
+          cache: "yarn"
       - name: Install dependencies
         run: yarn install
       - name: Prepare & build
@@ -158,7 +158,7 @@ jobs:
           powershell ./scripts/prepare-windows.ps1
           powershell ./scripts/build-windows.ps1
       - name: Upload build artifacts
-        if: inputs.upload-artifacts == 'true'
+        if: github.event.inputs.upload-artifacts == 'YES' || inputs.upload-artifacts == 'YES'
         uses: actions/upload-artifact@v4
         with:
           name: bin-windows
diff --git a/.github/workflows/release.yml b/.github/workflows/release.yml
index d5baf89..c451ded 100644
--- a/.github/workflows/release.yml
+++ b/.github/workflows/release.yml
@@ -8,16 +8,12 @@ on:
 jobs:
   build:
     uses: ./.github/workflows/build-release.yml
-    with:
-      upload-artifacts: true
-      artifacts-retention-days: 3
   publish:
     permissions:
       contents: write
       pull-requests: read
     needs: build
     runs-on: ubuntu-latest
-    if: startsWith(github.ref, 'refs/tags/v')
     steps:
       - uses: actions/checkout@v4
         with:
diff --git a/.release-it.json b/.release-it.json
index 5fdfc72..7597d7f 100644
--- a/.release-it.json
+++ b/.release-it.json
@@ -1,7 +1,8 @@
 {
   "$schema": "https://unpkg.com/release-it@17/schema/release-it.json",
   "git": {
-    "commitMessage": "chore: release v${version}"
+    "commitMessage": "chore: release v${version}",
+    "requireCleanWorkingDir": false
   },
   "github": {
     "release": true
diff --git a/lib/binding.ts b/lib/binding.ts
index 55f010c..43a89ff 100644
--- a/lib/binding.ts
+++ b/lib/binding.ts
@@ -9,11 +9,36 @@ export type LlamaModelOptions = {
   model: string
   embedding?: boolean
   embd_normalize?: number
-  pooling_type?: number
+  // Pooling strategy name; the native side maps it to a LLAMA_POOLING_TYPE_* value.
+  pooling_type?: 'none' | 'mean' | 'cls' | 'last' | 'rank'
   n_ctx?: number
   n_batch?: number
+  n_ubatch?: number
   n_threads?: number
   n_gpu_layers?: number
+  flash_attn?: boolean
+  // KV cache key type; names mirror the native kv_cache_types list.
+  cache_type_k?:
+    | 'f16'
+    | 'f32'
+    | 'bf16'
+    | 'q8_0'
+    | 'q4_0'
+    | 'q4_1'
+    | 'iq4_nl'
+    | 'q5_0'
+    | 'q5_1'
+  // KV cache value type; same accepted names as cache_type_k.
+  cache_type_v?:
+    | 'f16'
+    | 'f32'
+    | 'bf16'
+    | 'q8_0'
+    | 'q4_0'
+    | 'q4_1'
+    | 'iq4_nl'
+    | 'q5_0'
+    | 'q5_1'
   use_mlock?: boolean
   use_mmap?: boolean
   vocab_only?: boolean
diff --git a/package.json b/package.json
index f9a4086..a27173b 100644
--- a/package.json
+++ b/package.json
@@ -1,7 +1,7 @@
 {
   "name": "@fugood/llama.node",
   "access": "public",
-  "version": "0.3.4",
+  "version": "0.3.6",
   "description": "Llama.cpp for Node.js",
   "main": "lib/index.js",
   "scripts": {
diff --git a/src/LlamaContext.cpp b/src/LlamaContext.cpp
index 96b2e8d..eb9ee8d 100644
--- a/src/LlamaContext.cpp
+++ b/src/LlamaContext.cpp
@@ -1,3 +1,4 @@
+#include "ggml.h"
 #include "LlamaContext.h"
 #include "DetokenizeWorker.h"
 #include "DisposeWorker.h"
@@ -60,6 +61,43 @@ void LlamaContext::Init(Napi::Env env, Napi::Object &exports) {
   exports.Set("LlamaContext", func);
 }
 
+// ggml types that the cache_type_k / cache_type_v options may name.
+const std::vector<ggml_type> kv_cache_types = {
+  GGML_TYPE_F32,
+  GGML_TYPE_F16,
+  GGML_TYPE_BF16,
+  GGML_TYPE_Q8_0,
+  GGML_TYPE_Q4_0,
+  GGML_TYPE_Q4_1,
+  GGML_TYPE_IQ4_NL,
+  GGML_TYPE_Q5_0,
+  GGML_TYPE_Q5_1,
+};
+
+// Returns the entry of kv_cache_types whose ggml_type_name() equals s.
+// Throws std::runtime_error for any other name.
+// NOTE(review): confirm the LlamaContext constructor surfaces this as a JS
+// error rather than letting a C++ exception escape the addon - TODO verify.
+static ggml_type kv_cache_type_from_str(const std::string & s) {
+  for (const auto & type : kv_cache_types) {
+    if (ggml_type_name(type) == s) {
+      return type;
+    }
+  }
+  throw std::runtime_error("Unsupported cache type: " + s);
+}
+
+// Maps "none" / "mean" / "cls" / "last" / "rank" to LLAMA_POOLING_TYPE_*;
+// every other string, including "", yields LLAMA_POOLING_TYPE_UNSPECIFIED.
+static int32_t pooling_type_from_str(const std::string & s) {
+  if (s == "none") return LLAMA_POOLING_TYPE_NONE;
+  if (s == "mean") return LLAMA_POOLING_TYPE_MEAN;
+  if (s == "cls") return LLAMA_POOLING_TYPE_CLS;
+  if (s == "last") return LLAMA_POOLING_TYPE_LAST;
+  if (s == "rank") return LLAMA_POOLING_TYPE_RANK;
+  return LLAMA_POOLING_TYPE_UNSPECIFIED;
+}
+
 // construct({ model, embedding, n_ctx, n_batch, n_threads, n_gpu_layers,
 // use_mlock, use_mmap }): LlamaContext throws error
 LlamaContext::LlamaContext(const Napi::CallbackInfo &info)
@@ -83,18 +121,24 @@ LlamaContext::LlamaContext(const Napi::CallbackInfo &info)
 
   params.n_ctx = get_option(options, "n_ctx", 512);
   params.n_batch = get_option(options, "n_batch", 2048);
+  params.n_ubatch = get_option(options, "n_ubatch", 512);
   params.embedding = get_option(options, "embedding", false);
   if (params.embedding) {
     // For non-causal models, batch size must be equal to ubatch size
     params.n_ubatch = params.n_batch;
   }
   params.embd_normalize = get_option(options, "embd_normalize", 2);
-  int32_t pooling_type = get_option(options, "pooling_type", -1);
-  params.pooling_type = (enum llama_pooling_type) pooling_type;
+  params.pooling_type = (enum llama_pooling_type) pooling_type_from_str(
+    get_option(options, "pooling_type", "").c_str()
+  );
   params.cpuparams.n_threads =
       get_option(options, "n_threads", cpu_get_num_math() / 2);
   params.n_gpu_layers = get_option(options, "n_gpu_layers", -1);
+  params.flash_attn = get_option(options, "flash_attn", false);
+  params.cache_type_k = kv_cache_type_from_str(get_option(options, "cache_type_k", "f16").c_str());
+  params.cache_type_v = kv_cache_type_from_str(get_option(options, "cache_type_v", "f16").c_str());
+
   params.use_mlock = get_option(options, "use_mlock", false);
   params.use_mmap = get_option(options, "use_mmap", true);
   params.numa =