Commit

Merge branch 'ggerganov:master' into script
katsu560 authored Jan 7, 2024
2 parents e391d4b + c75ca5d commit d5104a0
Showing 38 changed files with 5,755 additions and 4,921 deletions.
2 changes: 1 addition & 1 deletion .devops/nix/package.nix
@@ -9,7 +9,7 @@
git,
python3,
mpi,
openblas, # TODO: Use the generic `blas` so users could switch betwen alternative implementations
openblas, # TODO: Use the generic `blas` so users could switch between alternative implementations
cudaPackages,
darwin,
rocmPackages,
42 changes: 38 additions & 4 deletions CMakeLists.txt
@@ -95,6 +95,7 @@ option(LLAMA_HIP_UMA "llama: use HIP unified memory arch
option(LLAMA_CLBLAST "llama: use CLBlast" OFF)
option(LLAMA_METAL "llama: use Metal" ${LLAMA_METAL_DEFAULT})
option(LLAMA_METAL_NDEBUG "llama: disable Metal debugging" OFF)
option(LLAMA_METAL_SHADER_DEBUG "llama: compile Metal with -fno-fast-math" OFF)
option(LLAMA_MPI "llama: use MPI" OFF)
option(LLAMA_QKK_64 "llama: use super-block size of 64 for k-quants" OFF)

@@ -154,9 +155,9 @@ if (APPLE AND LLAMA_ACCELERATE)
endif()

if (LLAMA_METAL)
find_library(FOUNDATION_LIBRARY Foundation REQUIRED)
find_library(METAL_FRAMEWORK Metal REQUIRED)
find_library(METALKIT_FRAMEWORK MetalKit REQUIRED)
find_library(FOUNDATION_LIBRARY Foundation REQUIRED)
find_library(METAL_FRAMEWORK Metal REQUIRED)
find_library(METALKIT_FRAMEWORK MetalKit REQUIRED)

message(STATUS "Metal framework found")
set(GGML_HEADERS_METAL ggml-metal.h)
@@ -173,6 +174,35 @@ if (LLAMA_METAL)
# copy ggml-metal.metal to bin directory
configure_file(ggml-metal.metal ${CMAKE_RUNTIME_OUTPUT_DIRECTORY}/ggml-metal.metal COPYONLY)

if (LLAMA_METAL_SHADER_DEBUG)
# custom command to do the following:
# xcrun -sdk macosx metal -fno-fast-math -c ggml-metal.metal -o ggml-metal.air
# xcrun -sdk macosx metallib ggml-metal.air -o default.metallib
#
# note: this is the only way I found to disable fast-math in Metal. it's ugly, but at least it works
# disabling fast math is needed in order to pass tests/test-backend-ops
# note: adding -fno-inline fixes the tests when using MTL_SHADER_VALIDATION=1
# note: unfortunately, we have to call it default.metallib instead of ggml.metallib
# ref: https://github.com/ggerganov/whisper.cpp/issues/1720
set(XC_FLAGS -fno-fast-math -fno-inline -g)
if (LLAMA_QKK_64)
set(XC_FLAGS ${XC_FLAGS} -DQK_K=64)
endif()

add_custom_command(
OUTPUT ${CMAKE_RUNTIME_OUTPUT_DIRECTORY}/default.metallib
COMMAND xcrun -sdk macosx metal ${XC_FLAGS} -c ${CMAKE_RUNTIME_OUTPUT_DIRECTORY}/ggml-metal.metal -o ${CMAKE_RUNTIME_OUTPUT_DIRECTORY}/ggml-metal.air
COMMAND xcrun -sdk macosx metallib ${CMAKE_RUNTIME_OUTPUT_DIRECTORY}/ggml-metal.air -o ${CMAKE_RUNTIME_OUTPUT_DIRECTORY}/default.metallib
DEPENDS ggml-metal.metal
COMMENT "Compiling Metal kernels"
)

add_custom_target(
ggml-metal ALL
DEPENDS ${CMAKE_RUNTIME_OUTPUT_DIRECTORY}/default.metallib
)
endif()

set(LLAMA_EXTRA_LIBS ${LLAMA_EXTRA_LIBS}
${FOUNDATION_LIBRARY}
${METAL_FRAMEWORK}
@@ -200,7 +230,11 @@ if (LLAMA_BLAS)
if (${LLAMA_BLAS_VENDOR} MATCHES "Generic")
pkg_check_modules(DepBLAS REQUIRED blas)
elseif (${LLAMA_BLAS_VENDOR} MATCHES "OpenBLAS")
pkg_check_modules(DepBLAS REQUIRED openblas)
# As of openblas v0.3.22, the 64-bit is named openblas64.pc
pkg_check_modules(DepBLAS openblas64)
if (NOT DepBLAS_FOUND)
pkg_check_modules(DepBLAS REQUIRED openblas)
endif()
elseif (${LLAMA_BLAS_VENDOR} MATCHES "FLAME")
pkg_check_modules(DepBLAS REQUIRED blis)
elseif (${LLAMA_BLAS_VENDOR} MATCHES "ATLAS")
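The new `LLAMA_METAL_SHADER_DEBUG` option above recompiles the Metal kernels with `-fno-fast-math` through `xcrun`, which is what lets `tests/test-backend-ops` pass on Metal. A minimal configure/build sketch on macOS, assuming a fresh `build` directory (the directory name and default generator are illustrative, not part of the commit):

```bash
# Sketch: enable Metal plus the slower, debug-friendly shader build.
# LLAMA_METAL and LLAMA_METAL_SHADER_DEBUG are the options defined in CMakeLists.txt;
# everything else is a conventional out-of-source CMake build.
mkdir -p build && cd build
cmake -DLLAMA_METAL=ON -DLLAMA_METAL_SHADER_DEBUG=ON ..
make -j
# default.metallib is emitted into the runtime output directory next to the binaries.
```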
12 changes: 4 additions & 8 deletions Package.swift
@@ -13,21 +13,17 @@ let package = Package(
products: [
.library(name: "llama", targets: ["llama"]),
],
dependencies: [
.package(url: "https://github.com/ggerganov/ggml.git", .branch("master"))
],
targets: [
.target(
name: "llama",
dependencies: ["ggml"],
path: ".",
exclude: [],
sources: [
"ggml.c",
"llama.cpp",
"ggml-alloc.c",
"ggml-backend.c",
"ggml-quants.c",
"ggml-metal.m",
],
resources: [
.process("ggml-metal.metal")
],
publicHeadersPath: "spm-headers",
cSettings: [
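With this change the package no longer depends on the external `ggml` repository; the ggml sources (`ggml.c`, `ggml-alloc.c`, `ggml-backend.c`, `ggml-quants.c`, `ggml-metal.m`) are compiled directly into the `llama` target. A quick sanity check that the package still resolves and builds, assuming a recent Swift toolchain on macOS (standard SwiftPM usage, not something this diff adds):

```bash
# Sketch: build the Swift package from the repository root.
cd llama.cpp
swift build -c release
```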
2 changes: 1 addition & 1 deletion awq-py/requirements.txt
@@ -1,2 +1,2 @@
torch>=2.0.0
torch>=2.1.1
transformers>=4.32.0
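The AWQ helper scripts now require torch 2.1.1 or newer. A hedged install sketch, assuming a throwaway virtual environment (the venv name is arbitrary):

```bash
# Sketch: install the awq-py requirements in an isolated environment.
python3 -m venv .venv && source .venv/bin/activate
pip install -r awq-py/requirements.txt
```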
14 changes: 10 additions & 4 deletions ci/run.sh
@@ -30,6 +30,12 @@ sd=`dirname $0`
cd $sd/../
SRC=`pwd`

CMAKE_EXTRA=""

if [ ! -z ${GG_BUILD_METAL} ]; then
CMAKE_EXTRA="${CMAKE_EXTRA} -DLLAMA_METAL_SHADER_DEBUG=ON"
fi

## helpers

# download a file if it does not exist or if it is outdated
@@ -81,8 +87,8 @@ function gg_run_ctest_debug {

set -e

(time cmake -DCMAKE_BUILD_TYPE=Debug .. ) 2>&1 | tee -a $OUT/${ci}-cmake.log
(time make -j ) 2>&1 | tee -a $OUT/${ci}-make.log
(time cmake -DCMAKE_BUILD_TYPE=Debug ${CMAKE_EXTRA} .. ) 2>&1 | tee -a $OUT/${ci}-cmake.log
(time make -j ) 2>&1 | tee -a $OUT/${ci}-make.log

(time ctest --output-on-failure -E test-opt ) 2>&1 | tee -a $OUT/${ci}-ctest.log

@@ -109,8 +115,8 @@ function gg_run_ctest_release {

set -e

(time cmake -DCMAKE_BUILD_TYPE=Release .. ) 2>&1 | tee -a $OUT/${ci}-cmake.log
(time make -j ) 2>&1 | tee -a $OUT/${ci}-make.log
(time cmake -DCMAKE_BUILD_TYPE=Release ${CMAKE_EXTRA} .. ) 2>&1 | tee -a $OUT/${ci}-cmake.log
(time make -j ) 2>&1 | tee -a $OUT/${ci}-make.log

if [ -z ${GG_BUILD_LOW_PERF} ]; then
(time ctest --output-on-failure ) 2>&1 | tee -a $OUT/${ci}-ctest.log
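With `GG_BUILD_METAL` set, the CI script now appends `-DLLAMA_METAL_SHADER_DEBUG=ON` to both the Debug and Release CMake invocations, so ctest exercises the non-fast-math Metal shaders. A hedged local invocation sketch; the two positional directories (results and mount) follow the usual `ci/run.sh` convention and are assumptions here, not part of this hunk:

```bash
# Sketch: run the local CI with Metal shader debugging enabled.
mkdir -p tmp/results tmp/mnt
GG_BUILD_METAL=1 bash ./ci/run.sh ./tmp/results ./tmp/mnt
```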
2 changes: 1 addition & 1 deletion common/train.cpp
@@ -1107,7 +1107,7 @@ void print_common_train_usage(int /*argc*/, char ** /*argv*/, const struct train
fprintf(stderr, " --sample-start STR Sets the starting point for samples after the specified pattern. If empty use every token position as sample start. (default '%s')\n", params->sample_start.c_str());
fprintf(stderr, " --include-sample-start Include the sample start in the samples. (default off)\n");
fprintf(stderr, " --escape process sample start escapes sequences (\\n, \\r, \\t, \\', \\\", \\\\)\n");
fprintf(stderr, " --overlapping-samples Samples my overlap, will include sample-start of second and following samples. When off, samples will end at begin of next sample. (default off)\n");
fprintf(stderr, " --overlapping-samples Samples may overlap, will include sample-start of second and following samples. When off, samples will end at begin of next sample. (default off)\n");
fprintf(stderr, " --fill-with-next-samples Samples shorter than context length will be followed by the next (shuffled) samples. (default off)\n");
fprintf(stderr, " --separate-with-eos When fill-with-next-samples, insert end-of-sequence token between samples.%s\n", params->separate_with_eos ? " (default)" : "");
fprintf(stderr, " --separate-with-bos When fill-with-next-samples, insert begin-of-sequence token between samples.%s\n", params->separate_with_bos ? " (default)" : "");
2 changes: 1 addition & 1 deletion convert-hf-to-gguf.py
@@ -59,7 +59,7 @@ def get_tensors(self) -> Iterator[tuple[str, Tensor]]:
from safetensors import safe_open
ctx = cast(ContextManager[Any], safe_open(self.dir_model / part_name, framework="pt", device="cpu"))
else:
ctx = contextlib.nullcontext(torch.load(str(self.dir_model / part_name), map_location="cpu", weights_only=True))
ctx = contextlib.nullcontext(torch.load(str(self.dir_model / part_name), map_location="cpu", mmap=True, weights_only=True))

with ctx as model_part:
for name in model_part.keys():
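Adding `mmap=True` lets `torch.load` memory-map the checkpoint shards instead of reading them fully into RAM, which lowers peak memory while converting large models. A hedged usage sketch; the model directory is a placeholder and the converter's other flags are unchanged by this hunk:

```bash
# Sketch: convert a Hugging Face checkpoint directory to GGUF.
python3 convert-hf-to-gguf.py ./models/my-hf-model
```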
61 changes: 61 additions & 0 deletions examples/base-translate.sh
@@ -0,0 +1,61 @@
#!/bin/bash
#
# Few-shot translation example.
# Requires a base model (i.e. no fine-tuned or instruct models).
#
# Usage:
#
# cd llama.cpp
# make -j
#
# ./examples/base-translate.sh <model-base> "<text>" [extra-main-args]
#

if [ $# -lt 2 ]; then
echo "Usage: ./base-translate.sh <model-base> \"<text>\" [extra-main-args]"
exit 1
fi

eargs=""
if [ $# -gt 2 ]; then
eargs="${@:3}"
fi

ftmp="__llama.cpp_example_tmp__.txt"
trap "rm -f $ftmp" EXIT

echo "Translate from English to French:
===
sea otter, peppermint, plush girafe:
sea otter => loutre de mer
peppermint => menthe poivrée
plush girafe => girafe peluche
===
violin
violin => violon
===
phone, computer, mouse, keyboard:
phone => téléphone
computer => ordinateur
mouse => souris
keyboard => clavier
===
" > $ftmp

echo "$2
" >> $ftmp

model=$1

# generate the most likely continuation until the string "===" is found
./main -m $model -f $ftmp -n 64 --temp 0 --repeat-penalty 1.0 --no-penalize-nl -r "===" $eargs
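The script writes a few-shot translation prompt to a temporary file, appends the text passed on the command line, and runs `main` with temperature 0 until the `===` separator is generated. A hedged usage sketch; the model path is a placeholder, not something the script provides:

```bash
# Sketch: few-shot English-to-French translation with a base (non-instruct) model.
cd llama.cpp
make -j
./examples/base-translate.sh ./models/base-model-q4_0.gguf "good morning"
```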
2 changes: 1 addition & 1 deletion examples/finetune/README.md
@@ -61,7 +61,7 @@ For example to apply 40% of the 'shakespeare' LORA adapter, 80% of the 'bible' L
--lora lora-open-llama-3b-v2-q8_0-yet-another-one-LATEST.bin
```

The scale numbers don't need to add up to one, and you can also use numbers greater than 1 to further increase the influence of an adapter. But making the values to big will sometimes result in worse output. Play around to find good values.
The scale numbers don't need to add up to one, and you can also use numbers greater than 1 to further increase the influence of an adapter. But making the values too big will sometimes result in worse output. Play around to find good values.

Gradient checkpointing reduces the memory requirements by ~50% but increases the runtime.
If you have enough RAM, you can make finetuning a bit faster by disabling checkpointing with `--no-checkpointing`.
6 changes: 0 additions & 6 deletions examples/finetune/finetune.cpp
@@ -3,15 +3,9 @@
#include "llama.h"
#include "common.h"
#include "train.h"
#include <unordered_map>
#include <vector>
#include <cassert>
#include <climits>
#include <cstring>
#include <cstdarg>
#include <ctime>
#include <random>
#include <stdexcept>
#include <algorithm>
#include <string>

2 changes: 1 addition & 1 deletion examples/llama.swiftui/llama.cpp.swift/LibLlama.swift
@@ -161,7 +161,7 @@ actor LlamaContext {
new_token_id = llama_sample_token_greedy(context, &candidates_p)
}

if new_token_id == llama_token_eos(context) || n_cur == n_len {
if new_token_id == llama_token_eos(model) || n_cur == n_len {
print("\n")
let new_token_str = String(cString: temporary_invalid_cchars + [0])
temporary_invalid_cchars.removeAll()
@@ -9,7 +9,6 @@
/* Begin PBXBuildFile section */
542376082B0D9BFB008E6A1C /* ggml-quants.c in Sources */ = {isa = PBXBuildFile; fileRef = 542376072B0D9BFB008E6A1C /* ggml-quants.c */; settings = {COMPILER_FLAGS = "-O3"; }; };
5423760B2B0D9C4B008E6A1C /* ggml-backend.c in Sources */ = {isa = PBXBuildFile; fileRef = 5423760A2B0D9C4B008E6A1C /* ggml-backend.c */; settings = {COMPILER_FLAGS = "-O3"; }; };
542378792ACE3F3500834A7B /* ggml-metal.metal in Resources */ = {isa = PBXBuildFile; fileRef = 549479C82AC9E10B00E0F78B /* ggml-metal.metal */; };
542EA09D2AC8723900A8AEE9 /* ggml.c in Sources */ = {isa = PBXBuildFile; fileRef = 542EA09B2AC8723900A8AEE9 /* ggml.c */; settings = {COMPILER_FLAGS = "-DGGML_USE_ACCELERATE -DGGML_USE_METAL -DGGML_USE_K_QUANTS -O3"; }; };
542EA0A02AC8725700A8AEE9 /* ggml-alloc.c in Sources */ = {isa = PBXBuildFile; fileRef = 542EA09F2AC8725700A8AEE9 /* ggml-alloc.c */; settings = {COMPILER_FLAGS = "-O3"; }; };
542EA0A32AC8729100A8AEE9 /* llama.cpp in Sources */ = {isa = PBXBuildFile; fileRef = 542EA0A12AC8729100A8AEE9 /* llama.cpp */; settings = {COMPILER_FLAGS = "-DGGML_USE_K_QUANTS -DGGML_USE_METAL -O3"; }; };
@@ -24,6 +23,8 @@
8A3F84242AC4C891005E2EE8 /* models in Resources */ = {isa = PBXBuildFile; fileRef = 8A3F84232AC4C891005E2EE8 /* models */; };
8A907F332AC7138A006146EA /* LibLlama.swift in Sources */ = {isa = PBXBuildFile; fileRef = 8A907F322AC7134E006146EA /* LibLlama.swift */; };
8A9F7C4D2AC332EE008AE1EA /* LlamaState.swift in Sources */ = {isa = PBXBuildFile; fileRef = 8A9F7C4C2AC332EE008AE1EA /* LlamaState.swift */; };
F1FE20DC2B465C4500B45541 /* ggml-metal.metal in Resources */ = {isa = PBXBuildFile; fileRef = 549479C82AC9E10B00E0F78B /* ggml-metal.metal */; };
F1FE20E22B465ECA00B45541 /* LoadCustomButton.swift in Sources */ = {isa = PBXBuildFile; fileRef = F1FE20E12B465EC900B45541 /* LoadCustomButton.swift */; };
/* End PBXBuildFile section */

/* Begin PBXFileReference section */
@@ -52,6 +53,7 @@
8A3F84232AC4C891005E2EE8 /* models */ = {isa = PBXFileReference; lastKnownFileType = folder; name = models; path = llama.swiftui/Resources/models; sourceTree = "<group>"; };
8A907F322AC7134E006146EA /* LibLlama.swift */ = {isa = PBXFileReference; lastKnownFileType = sourcecode.swift; path = LibLlama.swift; sourceTree = "<group>"; };
8A9F7C4C2AC332EE008AE1EA /* LlamaState.swift */ = {isa = PBXFileReference; lastKnownFileType = sourcecode.swift; path = LlamaState.swift; sourceTree = "<group>"; };
F1FE20E12B465EC900B45541 /* LoadCustomButton.swift */ = {isa = PBXFileReference; lastKnownFileType = sourcecode.swift; path = LoadCustomButton.swift; sourceTree = "<group>"; };
/* End PBXFileReference section */

/* Begin PBXFrameworksBuildPhase section */
@@ -166,6 +168,7 @@
children = (
7FA3D2B22B2EA2F600543F92 /* DownloadButton.swift */,
8A1C83782AC328BD0096AF73 /* ContentView.swift */,
F1FE20E12B465EC900B45541 /* LoadCustomButton.swift */,
);
path = UI;
sourceTree = "<group>";
@@ -241,7 +244,7 @@
isa = PBXResourcesBuildPhase;
buildActionMask = 2147483647;
files = (
542378792ACE3F3500834A7B /* ggml-metal.metal in Resources */,
F1FE20DC2B465C4500B45541 /* ggml-metal.metal in Resources */,
8A3F84242AC4C891005E2EE8 /* models in Resources */,
8A1C837E2AC328BE0096AF73 /* Preview Assets.xcassets in Resources */,
8A1C837B2AC328BE0096AF73 /* Assets.xcassets in Resources */,
Expand All @@ -257,6 +260,7 @@
files = (
542376082B0D9BFB008E6A1C /* ggml-quants.c in Sources */,
549479CD2AC9E42A00E0F78B /* ggml-metal.m in Sources */,
F1FE20E22B465ECA00B45541 /* LoadCustomButton.swift in Sources */,
542EA09D2AC8723900A8AEE9 /* ggml.c in Sources */,
8A907F332AC7138A006146EA /* LibLlama.swift in Sources */,
542EA0A32AC8729100A8AEE9 /* llama.cpp in Sources */,
2 changes: 2 additions & 0 deletions examples/llama.swiftui/llama.swiftui/UI/ContentView.swift
@@ -103,6 +103,8 @@ struct ContentView: View {
ContentView.cleanupModelCaches()
llamaState.cacheCleared = true
}

LoadCustomButton(llamaState: llamaState)
}
.padding(.top, 4)
.font(.system(size: 12))
44 changes: 44 additions & 0 deletions examples/llama.swiftui/llama.swiftui/UI/LoadCustomButton.swift
@@ -0,0 +1,44 @@
import SwiftUI
import UniformTypeIdentifiers

struct LoadCustomButton: View {
@ObservedObject private var llamaState: LlamaState
@State private var showFileImporter = false

init(llamaState: LlamaState) {
self.llamaState = llamaState
}

var body: some View {
VStack {
Button(action: {
showFileImporter = true
}) {
Text("Load Custom Model")
}
}
.fileImporter(
isPresented: $showFileImporter,
allowedContentTypes: [UTType(filenameExtension: "gguf", conformingTo: .data)!],
allowsMultipleSelection: false
) { result in
switch result {
case .success(let files):
files.forEach { file in
let gotAccess = file.startAccessingSecurityScopedResource()
if !gotAccess { return }

do {
try llamaState.loadModel(modelUrl: file.absoluteURL)
} catch let err {
print("Error: \(err.localizedDescription)")
}

file.stopAccessingSecurityScopedResource()
}
case .failure(let error):
print(error)
}
}
}
}
12 changes: 6 additions & 6 deletions examples/server/README.md
@@ -168,6 +168,12 @@ node index.js

`image_data`: An array of objects to hold base64-encoded image `data` and its `id`s to be reference in `prompt`. You can determine the place of the image in the prompt as in the following: `USER:[img-12]Describe the image in detail.\nASSISTANT:`. In this case, `[img-12]` will be replaced by the embeddings of the image with id `12` in the following `image_data` array: `{..., "image_data": [{"data": "<BASE64_STRING>", "id": 12}]}`. Use `image_data` only with multimodal models, e.g., LLaVA.

`slot_id`: Assign the completion task to an specific slot. If is -1 the task will be assigned to a Idle slot (default: -1)

`cache_prompt`: Save the prompt and generation for avoid reprocess entire prompt if a part of this isn't change (default: false)

`system_prompt`: Change the system prompt (initial prompt of all slots), this is useful for chat applications. [See more](#change-system-prompt-on-runtime)

*Result JSON:*

Note: When using streaming mode (`stream`) only `content` and `stop` will be returned until end of completion.
@@ -198,12 +204,6 @@

`truncated`: Boolean indicating if the context size was exceeded during generation, i.e. the number of tokens provided in the prompt (`tokens_evaluated`) plus tokens generated (`tokens predicted`) exceeded the context size (`n_ctx`)

`slot_id`: Assign the completion task to an specific slot. If is -1 the task will be assigned to a Idle slot (default: -1)

`cache_prompt`: Save the prompt and generation for avoid reprocess entire prompt if a part of this isn't change (default: false)

`system_prompt`: Change the system prompt (initial prompt of all slots), this is useful for chat applications. [See more](#change-system-prompt-on-runtime)

- **POST** `/tokenize`: Tokenize a given text.

*Options:*
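`slot_id`, `cache_prompt`, and `system_prompt` are completion request options, so they move from the *Result JSON* section up to the request *Options*. A hedged request sketch, assuming a server already running on the default `http://localhost:8080` and the `/completion` endpoint documented in this README:

```bash
# Sketch: pass slot_id and cache_prompt with a completion request.
curl -s http://localhost:8080/completion \
  -H "Content-Type: application/json" \
  -d '{
    "prompt": "Building a website can be done in 10 simple steps:",
    "n_predict": 32,
    "slot_id": -1,
    "cache_prompt": true
  }'
```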