From 7e9b49df958d4081c19ccf48df0e3e99dfb8f654 Mon Sep 17 00:00:00 2001 From: jhen Date: Thu, 22 Feb 2024 12:19:12 +0800 Subject: [PATCH 1/2] feat: sync llama.cpp --- cpp/common.cpp | 309 +++- cpp/common.h | 82 +- cpp/ggml-alloc.c | 1185 +++++++------ cpp/ggml-alloc.h | 104 +- cpp/ggml-backend-impl.h | 6 + cpp/ggml-backend.c | 661 +++++--- cpp/ggml-backend.h | 23 +- cpp/ggml-impl.h | 2 + cpp/ggml-metal-llama.metal | 1170 +++++++++++-- cpp/ggml-metal.h | 3 + cpp/ggml-metal.m | 335 ++-- cpp/ggml-quants.c | 2242 ++++++++++++++++++++++--- cpp/ggml-quants.h | 164 +- cpp/ggml.c | 2362 ++++++++++++++++++-------- cpp/ggml.h | 92 +- cpp/llama.cpp | 3251 ++++++++++++++++++++++++++++++------ cpp/llama.h | 81 +- cpp/sampling.cpp | 52 +- cpp/sampling.h | 23 +- cpp/unicode.h | 75 +- example/ios/Podfile.lock | 4 +- llama.cpp | 2 +- scripts/bootstrap.sh | 1 + scripts/common.cpp.patch | 10 +- scripts/ggml-metal.m.patch | 6 +- scripts/llama.cpp.patch | 8 +- 26 files changed, 9505 insertions(+), 2748 deletions(-) diff --git a/cpp/common.cpp b/cpp/common.cpp index 24766b3a..823104d8 100644 --- a/cpp/common.cpp +++ b/cpp/common.cpp @@ -48,6 +48,14 @@ char const *LLAMA_COMMIT = "unknown"; char const *LLAMA_COMPILER = "unknown"; char const *LLAMA_BUILD_TARGET = "unknown"; +#if (defined(LM_GGML_USE_CUBLAS) || defined(LM_GGML_USE_SYCL)) +#define LM_GGML_USE_CUBLAS_SYCL +#endif + +#if (defined(LM_GGML_USE_CUBLAS) || defined(LM_GGML_USE_SYCL)) || defined(LM_GGML_USE_VULKAN) +#define LM_GGML_USE_CUBLAS_SYCL_VULKAN +#endif + int32_t get_num_physical_cores() { #ifdef __linux__ // enumerate the set of thread siblings, num entries is num cores @@ -209,6 +217,23 @@ bool gpt_params_parse_ex(int argc, char ** argv, gpt_params & params) { params.prompt_cache_all = true; } else if (arg == "--prompt-cache-ro") { params.prompt_cache_ro = true; + } else if (arg == "-bf" || arg == "--binary-file") { + if (++i >= argc) { + invalid_param = true; + break; + } + std::ifstream file(argv[i], std::ios::binary); + if (!file) { + fprintf(stderr, "error: failed to open file '%s'\n", argv[i]); + invalid_param = true; + break; + } + // store the external file name in params + params.prompt_file = argv[i]; + std::ostringstream ss; + ss << file.rdbuf(); + params.prompt = ss.str(); + fprintf(stderr, "Read %zu bytes from binary file %s\n", params.prompt.size(), argv[i]); } else if (arg == "-f" || arg == "--file") { if (++i >= argc) { invalid_param = true; @@ -321,13 +346,14 @@ bool gpt_params_parse_ex(int argc, char ** argv, gpt_params & params) { invalid_param = true; break; } - sparams.samplers_sequence = parse_samplers_input(argv[i]); + const auto sampler_names = string_split(argv[i], ';'); + sparams.samplers_sequence = sampler_types_from_names(sampler_names, true); } else if (arg == "--sampling-seq") { if (++i >= argc) { invalid_param = true; break; } - sparams.samplers_sequence = argv[i]; + sparams.samplers_sequence = sampler_types_from_chars(argv[i]); } else if (arg == "--top-p") { if (++i >= argc) { invalid_param = true; @@ -384,6 +410,18 @@ bool gpt_params_parse_ex(int argc, char ** argv, gpt_params & params) { break; } sparams.penalty_present = std::stof(argv[i]); + } else if (arg == "--dynatemp-range") { + if (++i >= argc) { + invalid_param = true; + break; + } + sparams.dynatemp_range = std::stof(argv[i]); + } else if (arg == "--dynatemp-exp") { + if (++i >= argc) { + invalid_param = true; + break; + } + sparams.dynatemp_exponent = std::stof(argv[i]); } else if (arg == "--mirostat") { if (++i >= argc) { invalid_param = true; @@ -500,7 
+538,7 @@ bool gpt_params_parse_ex(int argc, char ** argv, gpt_params & params) { invalid_param = true; break; } - params.lora_adapter.push_back(std::make_tuple(argv[i], 1.0f)); + params.lora_adapter.emplace_back(argv[i], 1.0f); params.use_mmap = false; } else if (arg == "--lora-scaled") { if (++i >= argc) { @@ -512,7 +550,7 @@ bool gpt_params_parse_ex(int argc, char ** argv, gpt_params & params) { invalid_param = true; break; } - params.lora_adapter.push_back(std::make_tuple(lora_adapter, std::stof(argv[i]))); + params.lora_adapter.emplace_back(lora_adapter, std::stof(argv[i])); params.use_mmap = false; } else if (arg == "--lora-base") { if (++i >= argc) { @@ -568,29 +606,29 @@ bool gpt_params_parse_ex(int argc, char ** argv, gpt_params & params) { break; } params.n_gpu_layers = std::stoi(argv[i]); -#ifndef LLAMA_SUPPORTS_GPU_OFFLOAD - fprintf(stderr, "warning: not compiled with GPU offload support, --n-gpu-layers option will be ignored\n"); - fprintf(stderr, "warning: see main README.md for information on enabling GPU BLAS support\n"); -#endif + if (!llama_supports_gpu_offload()) { + fprintf(stderr, "warning: not compiled with GPU offload support, --n-gpu-layers option will be ignored\n"); + fprintf(stderr, "warning: see main README.md for information on enabling GPU BLAS support\n"); + } } else if (arg == "--gpu-layers-draft" || arg == "-ngld" || arg == "--n-gpu-layers-draft") { if (++i >= argc) { invalid_param = true; break; } params.n_gpu_layers_draft = std::stoi(argv[i]); -#ifndef LLAMA_SUPPORTS_GPU_OFFLOAD - fprintf(stderr, "warning: not compiled with GPU offload support, --n-gpu-layers-draft option will be ignored\n"); - fprintf(stderr, "warning: see main README.md for information on enabling GPU BLAS support\n"); -#endif + if (!llama_supports_gpu_offload()) { + fprintf(stderr, "warning: not compiled with GPU offload support, --n-gpu-layers-draft option will be ignored\n"); + fprintf(stderr, "warning: see main README.md for information on enabling GPU BLAS support\n"); + } } else if (arg == "--main-gpu" || arg == "-mg") { if (++i >= argc) { invalid_param = true; break; } params.main_gpu = std::stoi(argv[i]); -#ifndef LM_GGML_USE_CUBLAS - fprintf(stderr, "warning: llama.cpp was compiled without cuBLAS. Setting the main GPU has no effect.\n"); -#endif // LM_GGML_USE_CUBLAS +#ifndef LM_GGML_USE_CUBLAS_SYCL + fprintf(stderr, "warning: llama.cpp was compiled without cuBLAS/SYCL. Setting the main GPU has no effect.\n"); +#endif // LM_GGML_USE_CUBLAS_SYCL } else if (arg == "--split-mode" || arg == "-sm") { if (++i >= argc) { invalid_param = true; @@ -607,9 +645,10 @@ bool gpt_params_parse_ex(int argc, char ** argv, gpt_params & params) { invalid_param = true; break; } -#ifndef LM_GGML_USE_CUBLAS - fprintf(stderr, "warning: llama.cpp was compiled without cuBLAS. Setting the split mode has no effect.\n"); -#endif // LM_GGML_USE_CUBLAS +#ifndef LM_GGML_USE_CUBLAS_SYCL + fprintf(stderr, "warning: llama.cpp was compiled without cuBLAS/SYCL. 
Setting the split mode has no effect.\n"); +#endif // LM_GGML_USE_CUBLAS_SYCL + } else if (arg == "--tensor-split" || arg == "-ts") { if (++i >= argc) { invalid_param = true; @@ -621,24 +660,32 @@ bool gpt_params_parse_ex(int argc, char ** argv, gpt_params & params) { const std::regex regex{R"([,/]+)"}; std::sregex_token_iterator it{arg_next.begin(), arg_next.end(), regex, -1}; std::vector split_arg{it, {}}; - if (split_arg.size() >= LLAMA_MAX_DEVICES) { + if (split_arg.size() >= llama_max_devices()) { invalid_param = true; break; } - for (size_t i = 0; i < LLAMA_MAX_DEVICES; ++i) { + for (size_t i = 0; i < llama_max_devices(); ++i) { if (i < split_arg.size()) { params.tensor_split[i] = std::stof(split_arg[i]); } else { params.tensor_split[i] = 0.0f; } } -#ifndef LM_GGML_USE_CUBLAS - fprintf(stderr, "warning: llama.cpp was compiled without cuBLAS. Setting a tensor split has no effect.\n"); -#endif // LM_GGML_USE_CUBLAS +#ifndef LM_GGML_USE_CUBLAS_SYCL_VULKAN + fprintf(stderr, "warning: llama.cpp was compiled without cuBLAS/SYCL/Vulkan. Setting a tensor split has no effect.\n"); +#endif // LM_GGML_USE_CUBLAS_SYCL } else if (arg == "--no-mmap") { params.use_mmap = false; } else if (arg == "--numa") { - params.numa = true; + if (++i >= argc) { + invalid_param = true; + break; + } + std::string value(argv[i]); + /**/ if (value == "distribute" || value == "") { params.numa = LM_GGML_NUMA_STRATEGY_DISTRIBUTE; } + else if (value == "isolate") { params.numa = LM_GGML_NUMA_STRATEGY_ISOLATE; } + else if (value == "numactl") { params.numa = LM_GGML_NUMA_STRATEGY_NUMACTL; } + else { invalid_param = true; break; } } else if (arg == "--verbose-prompt") { params.verbose_prompt = true; } else if (arg == "--no-display-prompt") { @@ -648,7 +695,7 @@ bool gpt_params_parse_ex(int argc, char ** argv, gpt_params & params) { invalid_param = true; break; } - params.antiprompt.push_back(argv[i]); + params.antiprompt.emplace_back(argv[i]); } else if (arg == "-ld" || arg == "--logdir") { if (++i >= argc) { invalid_param = true; @@ -659,6 +706,12 @@ bool gpt_params_parse_ex(int argc, char ** argv, gpt_params & params) { if (params.logdir.back() != DIRECTORY_SEPARATOR) { params.logdir += DIRECTORY_SEPARATOR; } + } else if (arg == "--save-all-logits" || arg == "--kl-divergence-base") { + if (++i >= argc) { + invalid_param = true; + break; + } + params.logits_file = argv[i]; } else if (arg == "--perplexity" || arg == "--all-logits") { params.logits_all = true; } else if (arg == "--ppl-stride") { @@ -695,6 +748,16 @@ bool gpt_params_parse_ex(int argc, char ** argv, gpt_params & params) { break; } params.winogrande_tasks = std::stoi(argv[i]); + } else if (arg == "--multiple-choice") { + params.multiple_choice = true; + } else if (arg == "--multiple-choice-tasks") { + if (++i >= argc) { + invalid_param = true; + break; + } + params.multiple_choice_tasks = std::stoi(argv[i]); + } else if (arg == "--kl-divergence") { + params.kl_divergence = true; } else if (arg == "--ignore-eos") { params.ignore_eos = true; } else if (arg == "--no-penalize-nl") { @@ -848,7 +911,7 @@ bool gpt_params_parse_ex(int argc, char ** argv, gpt_params & params) { } if (!params.kv_overrides.empty()) { - params.kv_overrides.emplace_back(llama_model_kv_override()); + params.kv_overrides.emplace_back(); params.kv_overrides.back().key[0] = 0; } @@ -858,6 +921,14 @@ bool gpt_params_parse_ex(int argc, char ** argv, gpt_params & params) { void gpt_print_usage(int /*argc*/, char ** argv, const gpt_params & params) { const llama_sampling_params & sparams = 
params.sparams; + std::string sampler_type_chars; + std::string sampler_type_names; + for (const auto sampler_type : sparams.samplers_sequence) { + sampler_type_chars += static_cast(sampler_type); + sampler_type_names += sampler_type_to_name_string(sampler_type) + ";"; + } + sampler_type_names.pop_back(); + printf("\n"); printf("usage: %s [options]\n", argv[0]); printf("\n"); @@ -878,7 +949,7 @@ void gpt_print_usage(int /*argc*/, char ** argv, const gpt_params & params) { printf(" -tb N, --threads-batch N\n"); printf(" number of threads to use during batch and prompt processing (default: same as --threads)\n"); printf(" -td N, --threads-draft N"); - printf(" number of threads to use during generation (default: same as --threads)"); + printf(" number of threads to use during generation (default: same as --threads)\n"); printf(" -tbd N, --threads-batch-draft N\n"); printf(" number of threads to use during batch and prompt processing (default: same as --threads-draft)\n"); printf(" -p PROMPT, --prompt PROMPT\n"); @@ -894,11 +965,14 @@ void gpt_print_usage(int /*argc*/, char ** argv, const gpt_params & params) { printf(" --in-suffix STRING string to suffix after user inputs with (default: empty)\n"); printf(" -f FNAME, --file FNAME\n"); printf(" prompt file to start generation.\n"); + printf(" -bf FNAME, --binary-file FNAME\n"); + printf(" binary file containing multiple choice tasks.\n"); printf(" -n N, --n-predict N number of tokens to predict (default: %d, -1 = infinity, -2 = until context filled)\n", params.n_predict); printf(" -c N, --ctx-size N size of the prompt context (default: %d, 0 = loaded from model)\n", params.n_ctx); printf(" -b N, --batch-size N batch size for prompt processing (default: %d)\n", params.n_batch); - printf(" --samplers samplers that will be used for generation in the order, separated by \';\', for example: \"top_k;tfs;typical;top_p;min_p;temp\"\n"); - printf(" --sampling-seq simplified sequence for samplers that will be used (default: %s)\n", sparams.samplers_sequence.c_str()); + printf(" --samplers samplers that will be used for generation in the order, separated by \';\'\n"); + printf(" (default: %s)\n", sampler_type_names.c_str()); + printf(" --sampling-seq simplified sequence for samplers that will be used (default: %s)\n", sampler_type_chars.c_str()); printf(" --top-k N top-k sampling (default: %d, 0 = disabled)\n", sparams.top_k); printf(" --top-p N top-p sampling (default: %.1f, 1.0 = disabled)\n", (double)sparams.top_p); printf(" --min-p N min-p sampling (default: %.1f, 0.0 = disabled)\n", (double)sparams.min_p); @@ -908,6 +982,8 @@ void gpt_print_usage(int /*argc*/, char ** argv, const gpt_params & params) { printf(" --repeat-penalty N penalize repeat sequence of tokens (default: %.1f, 1.0 = disabled)\n", (double)sparams.penalty_repeat); printf(" --presence-penalty N repeat alpha presence penalty (default: %.1f, 0.0 = disabled)\n", (double)sparams.penalty_present); printf(" --frequency-penalty N repeat alpha frequency penalty (default: %.1f, 0.0 = disabled)\n", (double)sparams.penalty_freq); + printf(" --dynatemp-range N dynamic temperature range (default: %.1f, 0.0 = disabled)\n", (double)sparams.dynatemp_range); + printf(" --dynatemp-exp N dynamic temperature exponent (default: %.1f)\n", (double)sparams.dynatemp_exponent); printf(" --mirostat N use Mirostat sampling.\n"); printf(" Top K, Nucleus, Tail Free and Locally Typical samplers are ignored if used.\n"); printf(" (default: %d, 0 = disabled, 1 = Mirostat, 2 = Mirostat 2.0)\n", sparams.mirostat); 
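// ----------------------------------------------------------------------------
// Illustrative sketch (not part of the upstream patch): how the sampler helpers
// introduced in this sync (string_split, sampler_types_from_names,
// sampler_types_from_chars, sampler_type_to_name_string, declared in
// cpp/common.h above) might be exercised. "user_input" and the function name
// are hypothetical; the input string follows the format accepted by --samplers.

#include <cstdio>
#include <string>
#include <vector>

#include "common.h"   // the cpp/common.h updated by this patch

static void example_parse_samplers() {
    const std::string user_input = "top_k;tfs_z;typical_p;top_p;min_p;temperature";

    // long-form names, optionally allowing alternate spellings, as used by --samplers
    const std::vector<std::string> names = string_split(user_input, ';');
    const std::vector<llama_sampler_type> seq = sampler_types_from_names(names, /*allow_alt_names=*/true);

    // single-character form, as used by --sampling-seq (k/p/y/m/f/t)
    const std::vector<llama_sampler_type> seq_chars = sampler_types_from_chars("kfypmt");

    for (const auto st : seq) {
        printf("%s\n", sampler_type_to_name_string(st).c_str());
    }
}
// ----------------------------------------------------------------------------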
@@ -942,6 +1018,9 @@ void gpt_print_usage(int /*argc*/, char ** argv, const gpt_params & params) { printf(" --hellaswag-tasks N number of tasks to use when computing the HellaSwag score (default: %zu)\n", params.hellaswag_tasks); printf(" --winogrande compute Winogrande score over random tasks from datafile supplied with -f\n"); printf(" --winogrande-tasks N number of tasks to use when computing the Winogrande score (default: %zu)\n", params.winogrande_tasks); + printf(" --multiple-choice compute multiple choice score over random tasks from datafile supplied with -f\n"); + printf(" --multiple-choice-tasks N number of tasks to use when computing the multiple choice score (default: %zu)\n", params.winogrande_tasks); + printf(" --kl-divergence computes KL-divergence to logits provided via --kl-divergence-base\n"); printf(" --keep N number of tokens to keep from the initial prompt (default: %d, -1 = all)\n", params.n_keep); printf(" --draft N number of tokens to draft for speculative decoding (default: %d)\n", params.n_draft); printf(" --chunks N max number of chunks to process (default: %d, -1 = all)\n", params.n_chunks); @@ -952,30 +1031,33 @@ void gpt_print_usage(int /*argc*/, char ** argv, const gpt_params & params) { printf(" -cb, --cont-batching enable continuous batching (a.k.a dynamic batching) (default: disabled)\n"); printf(" --mmproj MMPROJ_FILE path to a multimodal projector file for LLaVA. see examples/llava/README.md\n"); printf(" --image IMAGE_FILE path to an image file. use with multimodal models\n"); - if (llama_mlock_supported()) { + if (llama_supports_mlock()) { printf(" --mlock force system to keep model in RAM rather than swapping or compressing\n"); } - if (llama_mmap_supported()) { + if (llama_supports_mmap()) { printf(" --no-mmap do not memory-map model (slower load but may reduce pageouts if not using mlock)\n"); } - printf(" --numa attempt optimizations that help on some NUMA systems\n"); + printf(" --numa TYPE attempt optimizations that help on some NUMA systems\n"); + printf(" - distribute: spread execution evenly over all nodes\n"); + printf(" - isolate: only spawn threads on CPUs on the node that execution started on\n"); + printf(" - numactl: use the CPU map provided by numactl\n"); printf(" if run without this previously, it is recommended to drop the system page cache before using this\n"); printf(" see https://github.com/ggerganov/llama.cpp/issues/1437\n"); -#ifdef LLAMA_SUPPORTS_GPU_OFFLOAD - printf(" -ngl N, --n-gpu-layers N\n"); - printf(" number of layers to store in VRAM\n"); - printf(" -ngld N, --n-gpu-layers-draft N\n"); - printf(" number of layers to store in VRAM for the draft model\n"); - printf(" -sm SPLIT_MODE, --split-mode SPLIT_MODE\n"); - printf(" how to split the model across multiple GPUs, one of:\n"); - printf(" - none: use one GPU only\n"); - printf(" - layer (default): split layers and KV across GPUs\n"); - printf(" - row: split rows across GPUs\n"); - printf(" -ts SPLIT, --tensor-split SPLIT\n"); - printf(" fraction of the model to offload to each GPU, comma-separated list of proportions, e.g. 
3,1\n"); - printf(" -mg i, --main-gpu i the GPU to use for the model (with split-mode = none),\n"); - printf(" or for intermediate results and KV (with split-mode = row) (default: %d)\n", params.main_gpu); -#endif + if (llama_supports_gpu_offload()) { + printf(" -ngl N, --n-gpu-layers N\n"); + printf(" number of layers to store in VRAM\n"); + printf(" -ngld N, --n-gpu-layers-draft N\n"); + printf(" number of layers to store in VRAM for the draft model\n"); + printf(" -sm SPLIT_MODE, --split-mode SPLIT_MODE\n"); + printf(" how to split the model across multiple GPUs, one of:\n"); + printf(" - none: use one GPU only\n"); + printf(" - layer (default): split layers and KV across GPUs\n"); + printf(" - row: split rows across GPUs\n"); + printf(" -ts SPLIT, --tensor-split SPLIT\n"); + printf(" fraction of the model to offload to each GPU, comma-separated list of proportions, e.g. 3,1\n"); + printf(" -mg i, --main-gpu i the GPU to use for the model (with split-mode = none),\n"); + printf(" or for intermediate results and KV (with split-mode = row) (default: %d)\n", params.main_gpu); + } printf(" --verbose-prompt print a verbose prompt before generation (default: %s)\n", params.verbose_prompt ? "true" : "false"); printf(" --no-display-prompt don't print prompt at generation (default: %s)\n", !params.display_prompt ? "true" : "false"); printf(" -gan N, --grp-attn-n N\n"); @@ -1042,45 +1124,101 @@ std::string gpt_random_prompt(std::mt19937 & rng) { } // -// String parsing +// String utils // -std::string parse_samplers_input(std::string input) { - std::string output = ""; +std::vector string_split(std::string input, char separator) { + std::vector parts; + size_t separator_pos = input.find(separator); + while (separator_pos != std::string::npos) { + std::string part = input.substr(0, separator_pos); + parts.emplace_back(part); + input = input.substr(separator_pos + 1); + separator_pos = input.find(separator); + } + parts.emplace_back(input); + return parts; +} + +std::vector sampler_types_from_names(const std::vector & names, bool allow_alt_names) { + std::unordered_map sampler_canonical_name_map { + {"top_k", llama_sampler_type::TOP_K}, + {"top_p", llama_sampler_type::TOP_P}, + {"typical_p", llama_sampler_type::TYPICAL_P}, + {"min_p", llama_sampler_type::MIN_P}, + {"tfs_z", llama_sampler_type::TFS_Z}, + {"temperature", llama_sampler_type::TEMPERATURE} + }; + // since samplers names are written multiple ways // make it ready for both system names and input names - std::unordered_map samplers_symbols { - {"top_k", 'k'}, - {"top-k", 'k'}, - {"top_p", 'p'}, - {"top-p", 'p'}, - {"nucleus", 'p'}, - {"typical_p", 'y'}, - {"typical-p", 'y'}, - {"typical", 'y'}, - {"min_p", 'm'}, - {"min-p", 'm'}, - {"tfs_z", 'f'}, - {"tfs-z", 'f'}, - {"tfs", 'f'}, - {"temp", 't'}, - {"temperature",'t'} + std::unordered_map sampler_alt_name_map { + {"top-k", llama_sampler_type::TOP_K}, + {"top-p", llama_sampler_type::TOP_P}, + {"nucleus", llama_sampler_type::TOP_P}, + {"typical-p", llama_sampler_type::TYPICAL_P}, + {"typical", llama_sampler_type::TYPICAL_P}, + {"min-p", llama_sampler_type::MIN_P}, + {"tfs-z", llama_sampler_type::TFS_Z}, + {"tfs", llama_sampler_type::TFS_Z}, + {"temp", llama_sampler_type::TEMPERATURE} + }; + + std::vector sampler_types; + sampler_types.reserve(names.size()); + for (const auto & name : names) + { + auto sampler_item = sampler_canonical_name_map.find(name); + if (sampler_item != sampler_canonical_name_map.end()) + { + sampler_types.push_back(sampler_item->second); + } + else + { + if 
(allow_alt_names) + { + sampler_item = sampler_alt_name_map.find(name); + if (sampler_item != sampler_alt_name_map.end()) + { + sampler_types.push_back(sampler_item->second); + } + } + } + } + return sampler_types; +} + +std::vector sampler_types_from_chars(const std::string & names_string) { + std::unordered_map sampler_name_map { + {'k', llama_sampler_type::TOP_K}, + {'p', llama_sampler_type::TOP_P}, + {'y', llama_sampler_type::TYPICAL_P}, + {'m', llama_sampler_type::MIN_P}, + {'f', llama_sampler_type::TFS_Z}, + {'t', llama_sampler_type::TEMPERATURE} }; - // expected format example: "temp;top_k;tfs_z;typical_p;top_p;min_p" - size_t separator = input.find(';'); - while (separator != input.npos) { - std::string name = input.substr(0,separator); - input = input.substr(separator+1); - separator = input.find(';'); - - if (samplers_symbols.find(name) != samplers_symbols.end()) { - output += samplers_symbols[name]; + + std::vector sampler_types; + sampler_types.reserve(names_string.size()); + for (const auto & c : names_string) { + const auto sampler_item = sampler_name_map.find(c); + if (sampler_item != sampler_name_map.end()) { + sampler_types.push_back(sampler_item->second); } } - if (samplers_symbols.find(input) != samplers_symbols.end()) { - output += samplers_symbols[input]; + return sampler_types; +} + +std::string sampler_type_to_name_string(llama_sampler_type sampler_type) { + switch (sampler_type) { + case llama_sampler_type::TOP_K: return "top_k"; + case llama_sampler_type::TFS_Z: return "tfs_z"; + case llama_sampler_type::TYPICAL_P: return "typical_p"; + case llama_sampler_type::TOP_P: return "top_p"; + case llama_sampler_type::MIN_P: return "min_p"; + case llama_sampler_type::TEMPERATURE: return "temperature"; + default : return ""; } - return output; } // @@ -1482,9 +1620,10 @@ void dump_non_result_info_yaml(FILE * stream, const gpt_params & params, const l fprintf(stream, "cpu_has_avx512: %s\n", lm_ggml_cpu_has_avx512() ? "true" : "false"); fprintf(stream, "cpu_has_avx512_vbmi: %s\n", lm_ggml_cpu_has_avx512_vbmi() ? "true" : "false"); fprintf(stream, "cpu_has_avx512_vnni: %s\n", lm_ggml_cpu_has_avx512_vnni() ? "true" : "false"); - fprintf(stream, "cpu_has_blas: %s\n", lm_ggml_cpu_has_blas() ? "true" : "false"); fprintf(stream, "cpu_has_cublas: %s\n", lm_ggml_cpu_has_cublas() ? "true" : "false"); + fprintf(stream, "cpu_has_vulkan: %s\n", lm_ggml_cpu_has_vulkan() ? "true" : "false"); fprintf(stream, "cpu_has_clblast: %s\n", lm_ggml_cpu_has_clblast() ? "true" : "false"); + fprintf(stream, "cpu_has_kompute: %s\n", lm_ggml_cpu_has_kompute() ? "true" : "false"); fprintf(stream, "cpu_has_fma: %s\n", lm_ggml_cpu_has_fma() ? "true" : "false"); fprintf(stream, "cpu_has_gpublas: %s\n", lm_ggml_cpu_has_gpublas() ? "true" : "false"); fprintf(stream, "cpu_has_neon: %s\n", lm_ggml_cpu_has_neon() ? "true" : "false"); @@ -1494,6 +1633,7 @@ void dump_non_result_info_yaml(FILE * stream, const gpt_params & params, const l fprintf(stream, "cpu_has_blas: %s\n", lm_ggml_cpu_has_blas() ? "true" : "false"); fprintf(stream, "cpu_has_sse3: %s\n", lm_ggml_cpu_has_sse3() ? "true" : "false"); fprintf(stream, "cpu_has_vsx: %s\n", lm_ggml_cpu_has_vsx() ? "true" : "false"); + fprintf(stream, "cpu_has_matmul_int8: %s\n", lm_ggml_cpu_has_matmul_int8() ? 
"true" : "false"); #ifdef NDEBUG fprintf(stream, "debug: false\n"); @@ -1570,6 +1710,7 @@ void dump_non_result_info_yaml(FILE * stream, const gpt_params & params, const l } fprintf(stream, "lora_base: %s\n", params.lora_base.c_str()); fprintf(stream, "main_gpu: %d # default: 0\n", params.main_gpu); + fprintf(stream, "min_keep: %d # default: 0 (disabled)\n", sparams.min_keep); fprintf(stream, "mirostat: %d # default: 0 (disabled)\n", sparams.mirostat); fprintf(stream, "mirostat_ent: %f # default: 5.0\n", sparams.mirostat_tau); fprintf(stream, "mirostat_lr: %f # default: 0.1\n", sparams.mirostat_eta); @@ -1583,7 +1724,6 @@ void dump_non_result_info_yaml(FILE * stream, const gpt_params & params, const l fprintf(stream, "no_mmap: %s # default: false\n", !params.use_mmap ? "true" : "false"); fprintf(stream, "no_mul_mat_q: %s # default: false\n", !params.mul_mat_q ? "true" : "false"); fprintf(stream, "no_penalize_nl: %s # default: false\n", !sparams.penalize_nl ? "true" : "false"); - fprintf(stream, "numa: %s # default: false\n", params.numa ? "true" : "false"); fprintf(stream, "ppl_output_type: %d # default: 0\n", params.ppl_output_type); fprintf(stream, "ppl_stride: %d # default: 0\n", params.ppl_stride); fprintf(stream, "presence_penalty: %f # default: 0.0\n", sparams.penalty_present); @@ -1608,16 +1748,16 @@ void dump_non_result_info_yaml(FILE * stream, const gpt_params & params, const l fprintf(stream, "rope_freq_base: %f # default: 10000.0\n", params.rope_freq_base); fprintf(stream, "rope_freq_scale: %f # default: 1.0\n", params.rope_freq_scale); - fprintf(stream, "seed: %d # default: -1 (random seed)\n", params.seed); + fprintf(stream, "seed: %u # default: -1 (random seed)\n", params.seed); fprintf(stream, "simple_io: %s # default: false\n", params.simple_io ? "true" : "false"); fprintf(stream, "cont_batching: %s # default: false\n", params.cont_batching ? 
"true" : "false"); fprintf(stream, "temp: %f # default: 0.8\n", sparams.temp); - const std::vector tensor_split_vector(params.tensor_split, params.tensor_split + LLAMA_MAX_DEVICES); + const std::vector tensor_split_vector(params.tensor_split, params.tensor_split + llama_max_devices()); dump_vector_float_yaml(stream, "tensor_split", tensor_split_vector); fprintf(stream, "tfs: %f # default: 1.0\n", sparams.tfs_z); - fprintf(stream, "threads: %d # default: %d\n", params.n_threads, std::thread::hardware_concurrency()); + fprintf(stream, "threads: %d # default: %u\n", params.n_threads, std::thread::hardware_concurrency()); fprintf(stream, "top_k: %d # default: 40\n", sparams.top_k); fprintf(stream, "top_p: %f # default: 0.95\n", sparams.top_p); fprintf(stream, "min_p: %f # default: 0.0\n", sparams.min_p); @@ -1668,7 +1808,8 @@ void dump_kv_cache_view_seqs(const llama_kv_cache_view & view, int row_size) { if (cs_curr[j] < 0) { continue; } if (seqs.find(cs_curr[j]) == seqs.end()) { if (seqs.size() + 1 >= sizeof(slot_chars)) { break; } - seqs[cs_curr[j]] = seqs.size(); + const size_t sz = seqs.size(); + seqs[cs_curr[j]] = sz; } } if (seqs.size() + 1 >= sizeof(slot_chars)) { break; } diff --git a/cpp/common.h b/cpp/common.h index 9b490089..a3ed490b 100644 --- a/cpp/common.h +++ b/cpp/common.h @@ -32,40 +32,40 @@ int32_t get_num_physical_cores(); struct gpt_params { - uint32_t seed = -1; // RNG seed - - int32_t n_threads = get_num_physical_cores(); - int32_t n_threads_draft = -1; - int32_t n_threads_batch = -1; // number of threads to use for batch processing (-1 = use n_threads) - int32_t n_threads_batch_draft = -1; - int32_t n_predict = -1; // new tokens to predict - int32_t n_ctx = 512; // context size - int32_t n_batch = 512; // batch size for prompt processing (must be >=32 to use BLAS) - int32_t n_keep = 0; // number of tokens to keep from initial prompt - int32_t n_draft = 8; // number of tokens to draft during speculative decoding - int32_t n_chunks = -1; // max number of chunks to process (-1 = unlimited) - int32_t n_parallel = 1; // number of parallel sequences to decode - int32_t n_sequences = 1; // number of sequences to decode - float p_accept = 0.5f; // speculative decoding accept probability - float p_split = 0.1f; // speculative decoding split probability - int32_t n_gpu_layers = -1; // number of layers to store in VRAM (-1 - use default) - int32_t n_gpu_layers_draft = -1; // number of layers to store in VRAM for the draft model (-1 - use default) - llama_split_mode split_mode = LLAMA_SPLIT_LAYER; // how to split the model across GPUs - int32_t main_gpu = 0; // the GPU that is used for scratch and small tensors - float tensor_split[LLAMA_MAX_DEVICES] = {0}; // how split tensors should be distributed across GPUs - int32_t n_beams = 0; // if non-zero then use beam search of given width. 
- int32_t grp_attn_n = 1; // group-attention factor - int32_t grp_attn_w = 512; // group-attention width - int32_t n_print = -1; // print token count every n tokens (-1 = disabled) - float rope_freq_base = 0.0f; // RoPE base frequency - float rope_freq_scale = 0.0f; // RoPE frequency scaling factor - float yarn_ext_factor = -1.0f; // YaRN extrapolation mix factor - float yarn_attn_factor = 1.0f; // YaRN magnitude scaling factor - float yarn_beta_fast = 32.0f; // YaRN low correction dim - float yarn_beta_slow = 1.0f; // YaRN high correction dim - int32_t yarn_orig_ctx = 0; // YaRN original context length - int8_t rope_scaling_type = LLAMA_ROPE_SCALING_UNSPECIFIED; // TODO: better to be int32_t for alignment - // pinging @cebtenzzre + uint32_t seed = -1; // RNG seed + + int32_t n_threads = get_num_physical_cores(); + int32_t n_threads_draft = -1; + int32_t n_threads_batch = -1; // number of threads to use for batch processing (-1 = use n_threads) + int32_t n_threads_batch_draft = -1; + int32_t n_predict = -1; // new tokens to predict + int32_t n_ctx = 512; // context size + int32_t n_batch = 512; // batch size for prompt processing (must be >=32 to use BLAS) + int32_t n_keep = 0; // number of tokens to keep from initial prompt + int32_t n_draft = 8; // number of tokens to draft during speculative decoding + int32_t n_chunks = -1; // max number of chunks to process (-1 = unlimited) + int32_t n_parallel = 1; // number of parallel sequences to decode + int32_t n_sequences = 1; // number of sequences to decode + float p_accept = 0.5f; // speculative decoding accept probability + float p_split = 0.1f; // speculative decoding split probability + int32_t n_gpu_layers = -1; // number of layers to store in VRAM (-1 - use default) + int32_t n_gpu_layers_draft = -1; // number of layers to store in VRAM for the draft model (-1 - use default) + llama_split_mode split_mode = LLAMA_SPLIT_LAYER; // how to split the model across GPUs + int32_t main_gpu = 0; // the GPU that is used for scratch and small tensors + float tensor_split[128] = {0}; // how split tensors should be distributed across GPUs + int32_t n_beams = 0; // if non-zero then use beam search of given width. 
+ int32_t grp_attn_n = 1; // group-attention factor + int32_t grp_attn_w = 512; // group-attention width + int32_t n_print = -1; // print token count every n tokens (-1 = disabled) + float rope_freq_base = 0.0f; // RoPE base frequency + float rope_freq_scale = 0.0f; // RoPE frequency scaling factor + float yarn_ext_factor = -1.0f; // YaRN extrapolation mix factor + float yarn_attn_factor = 1.0f; // YaRN magnitude scaling factor + float yarn_beta_fast = 32.0f; // YaRN low correction dim + float yarn_beta_slow = 1.0f; // YaRN high correction dim + int32_t yarn_orig_ctx = 0; // YaRN original context length + int32_t rope_scaling_type = LLAMA_ROPE_SCALING_UNSPECIFIED; + lm_ggml_numa_strategy numa = LM_GGML_NUMA_STRATEGY_DISABLED; // // sampling parameters struct llama_sampling_params sparams; @@ -80,6 +80,7 @@ struct gpt_params { std::string input_suffix = ""; // string to suffix user inputs with std::vector antiprompt; // string upon seeing which more user input is prompted std::string logdir = ""; // directory in which to save YAML log files + std::string logits_file = ""; // file for saving *all* logits std::vector kv_overrides; @@ -97,6 +98,11 @@ struct gpt_params { bool winogrande = false; // compute Winogrande score over random tasks from datafile supplied in prompt size_t winogrande_tasks= 0; // number of tasks to use when computing the Winogrande score. If 0, all tasks will be computed + bool multiple_choice = false; // compute TruthfulQA score over random tasks from datafile supplied in prompt + size_t multiple_choice_tasks = 0; // number of tasks to use when computing the TruthfulQA score. If 0, all tasks will be computed + + bool kl_divergence = false; // compute KL-divergence + bool mul_mat_q = true; // if true, use mul_mat_q kernels instead of cuBLAS bool random_prompt = false; // do not randomize prompt if none provided bool use_color = false; // use color to distinguish generations and inputs @@ -118,7 +124,6 @@ struct gpt_params { bool logits_all = false; // return logits for all tokens in the batch bool use_mmap = true; // use mmap for faster loads bool use_mlock = false; // use mlock to keep model in memory - bool numa = false; // attempt optimizations that help on some NUMA systems bool verbose_prompt = false; // print prompt tokens before generation bool display_prompt = true; // print prompt before generation bool infill = false; // use infill mode @@ -146,10 +151,13 @@ std::string gpt_random_prompt(std::mt19937 & rng); void process_escapes(std::string& input); // -// String parsing +// String utils // -std::string parse_samplers_input(std::string input); +std::vector sampler_types_from_names(const std::vector & names, bool allow_alt_names); +std::vector sampler_types_from_chars(const std::string & names_string); +std::vector string_split(std::string input, char separator); +std::string sampler_type_to_name_string(llama_sampler_type sampler_type); // // Model utils diff --git a/cpp/ggml-alloc.c b/cpp/ggml-alloc.c index 2df93eea..ebe675f0 100644 --- a/cpp/ggml-alloc.c +++ b/cpp/ggml-alloc.c @@ -17,6 +17,50 @@ //#define AT_PRINTF(...) fprintf(stderr, __VA_ARGS__) #define AT_PRINTF(...) 
+ +static bool lm_ggml_is_view(const struct lm_ggml_tensor * t) { + return t->view_src != NULL; +} + +static bool lm_ggml_are_same_layout(const struct lm_ggml_tensor * a, const struct lm_ggml_tensor * b) { + if (a->type != b->type) { + return false; + } + for (int i = 0; i < LM_GGML_MAX_DIMS; i++) { + if (a->ne[i] != b->ne[i]) { + return false; + } + if (a->nb[i] != b->nb[i]) { + return false; + } + } + return true; +} + +static bool lm_ggml_op_can_inplace(enum lm_ggml_op op) { + switch (op) { + case LM_GGML_OP_SCALE: + case LM_GGML_OP_DIAG_MASK_ZERO: + case LM_GGML_OP_DIAG_MASK_INF: + case LM_GGML_OP_ADD: + case LM_GGML_OP_ADD1: + case LM_GGML_OP_SUB: + case LM_GGML_OP_MUL: + case LM_GGML_OP_DIV: + case LM_GGML_OP_SQR: + case LM_GGML_OP_SQRT: + case LM_GGML_OP_LOG: + case LM_GGML_OP_UNARY: + case LM_GGML_OP_ROPE: + case LM_GGML_OP_RMS_NORM: + case LM_GGML_OP_SOFT_MAX: + return true; + + default: + return false; + } +} + // TODO: LM_GGML_PAD ? static size_t aligned_offset(const void * buffer, size_t offset, size_t alignment) { assert(alignment && !(alignment & (alignment - 1))); // power of 2 @@ -24,66 +68,102 @@ static size_t aligned_offset(const void * buffer, size_t offset, size_t alignmen return offset + align; } +// tallocr +struct lm_ggml_tallocr { + lm_ggml_backend_buffer_t buffer; + void * base; + size_t alignment; + size_t offset; +}; + +lm_ggml_tallocr_t lm_ggml_tallocr_new(lm_ggml_backend_buffer_t buffer) { + lm_ggml_tallocr_t talloc = malloc(sizeof(struct lm_ggml_tallocr)); + if (talloc == NULL) { + return NULL; + } + + void * base = lm_ggml_backend_buffer_get_base(buffer); + size_t align = lm_ggml_backend_buffer_get_alignment(buffer); + + assert(align && !(align & (align - 1))); // power of 2 + + *talloc = (struct lm_ggml_tallocr) { + /*.buffer = */ buffer, + /*.base = */ base, + /*.alignment = */ align, + /*.offset = */ aligned_offset(base, 0, align), + }; + return talloc; +} + +void lm_ggml_tallocr_free(lm_ggml_tallocr_t talloc) { + free(talloc); +} + +void lm_ggml_tallocr_alloc(lm_ggml_tallocr_t talloc, struct lm_ggml_tensor * tensor) { + size_t size = lm_ggml_backend_buffer_get_alloc_size(talloc->buffer, tensor); + size = LM_GGML_PAD(size, talloc->alignment); + + if (talloc->offset + size > lm_ggml_backend_buffer_get_size(talloc->buffer)) { + fprintf(stderr, "%s: not enough space in the buffer to allocate %s (needed %zu, available %zu)\n", + __func__, tensor->name, size, lm_ggml_backend_buffer_get_size(talloc->buffer) - talloc->offset); + LM_GGML_ASSERT(!"not enough space in the buffer"); + return; + } + + void * addr = (char *)lm_ggml_backend_buffer_get_base(talloc->buffer) + talloc->offset; + talloc->offset += size; + + assert(((uintptr_t)addr % talloc->alignment) == 0); + + lm_ggml_backend_tensor_alloc(talloc->buffer, tensor, addr); +} + +// dynamic tensor allocator + struct free_block { - void * addr; + size_t offset; size_t size; }; -struct lm_ggml_tallocr { - struct lm_ggml_backend_buffer * buffer; - bool buffer_owned; - void * base; +struct lm_ggml_dyn_tallocr { size_t alignment; - int n_free_blocks; struct free_block free_blocks[MAX_FREE_BLOCKS]; - size_t max_size; - bool measure; - #ifdef LM_GGML_ALLOCATOR_DEBUG - struct lm_ggml_tensor * allocated_tensors[1024]; + struct { + const struct lm_ggml_tensor * tensor; + size_t offset; + } allocated_tensors[1024]; #endif }; #ifdef LM_GGML_ALLOCATOR_DEBUG -static void add_allocated_tensor(lm_ggml_tallocr_t alloc, struct lm_ggml_tensor * tensor) { +static void add_allocated_tensor(struct lm_ggml_dyn_tallocr * alloc, size_t 
offset, const struct lm_ggml_tensor * tensor) { for (int i = 0; i < 1024; i++) { - if (alloc->allocated_tensors[i] == NULL) { - alloc->allocated_tensors[i] = tensor; + if (alloc->allocated_tensors[i].tensor == NULL) { + alloc->allocated_tensors[i].tensor = tensor; + alloc->allocated_tensors[i].offset = offset; return; } } LM_GGML_ASSERT(!"out of allocated_tensors"); } -static void remove_allocated_tensor(lm_ggml_tallocr_t alloc, struct lm_ggml_tensor * tensor) { +static void remove_allocated_tensor(struct lm_ggml_dyn_tallocr * alloc, size_t offset, const struct lm_ggml_tensor * tensor) { for (int i = 0; i < 1024; i++) { - if (alloc->allocated_tensors[i] == tensor || - (alloc->allocated_tensors[i] != NULL && alloc->allocated_tensors[i]->data == tensor->data)) { - alloc->allocated_tensors[i] = NULL; + if (alloc->allocated_tensors[i].offset == offset) { + alloc->allocated_tensors[i].tensor = NULL; return; } } - printf("tried to free tensor %s not found\n", tensor->name); + fprintf(stderr, "tried to free tensor %s not found\n", tensor->name); LM_GGML_ASSERT(!"tensor not found"); } #endif -// check if a tensor is allocated by this buffer -static bool lm_ggml_tallocr_is_own(lm_ggml_tallocr_t alloc, const struct lm_ggml_tensor * tensor) { - return tensor->buffer == alloc->buffer && (!tensor->view_src || tensor->view_src->buffer == alloc->buffer); -} - -static bool lm_ggml_is_view(struct lm_ggml_tensor * t) { - return t->view_src != NULL; -} - -void lm_ggml_tallocr_alloc(lm_ggml_tallocr_t alloc, struct lm_ggml_tensor * tensor) { - LM_GGML_ASSERT(!lm_ggml_is_view(tensor)); // views generally get data pointer from one of their sources - LM_GGML_ASSERT(tensor->data == NULL); // avoid allocating tensor which already has memory allocated - - size_t size = lm_ggml_backend_buffer_get_alloc_size(alloc->buffer, tensor); +static size_t lm_ggml_dyn_tallocr_alloc(struct lm_ggml_dyn_tallocr * alloc, size_t size, const struct lm_ggml_tensor * tensor) { size = aligned_offset(NULL, size, alloc->alignment); AT_PRINTF("%s: allocating %s (%zu bytes) - ", __func__, tensor->name, size); @@ -109,16 +189,17 @@ void lm_ggml_tallocr_alloc(lm_ggml_tallocr_t alloc, struct lm_ggml_tensor * tens if (block->size >= size) { best_fit_block = alloc->n_free_blocks - 1; } else { - fprintf(stderr, "%s: not enough space in the buffer (needed %zu, largest block available %zu)\n", + // this should never happen + fprintf(stderr, "%s: not enough space in the buffer to allocate %zu bytes, largest block available %zu bytes\n", __func__, size, max_avail); LM_GGML_ASSERT(!"not enough space in the buffer"); - return; + LM_GGML_UNREACHABLE(); } } struct free_block * block = &alloc->free_blocks[best_fit_block]; - void * addr = block->addr; - block->addr = (char*)block->addr + size; + size_t offset = block->offset; + block->offset = offset + size; block->size -= size; if (block->size == 0) { // remove block if empty @@ -128,59 +209,63 @@ void lm_ggml_tallocr_alloc(lm_ggml_tallocr_t alloc, struct lm_ggml_tensor * tens } } - AT_PRINTF("block %d, addr %p\n", best_fit_block, addr); - - tensor->data = addr; - tensor->buffer = alloc->buffer; - if (!alloc->measure) { - lm_ggml_backend_buffer_init_tensor(alloc->buffer, tensor); - } + AT_PRINTF("block %d, offset %zu\n", best_fit_block, offset); #ifdef LM_GGML_ALLOCATOR_DEBUG - add_allocated_tensor(alloc, tensor); - size_t cur_max = (char*)addr - (char*)alloc->base + size; + add_allocated_tensor(alloc, offset, tensor); + size_t cur_max = offset + size; if (cur_max > alloc->max_size) { - printf("max_size = 
%.2f MB: tensors: ", cur_max / 1024.0 / 1024.0); + // sort allocated_tensors by offset + for (int i = 0; i < 1024; i++) { + for (int j = i + 1; j < 1024; j++) { + if (alloc->allocated_tensors[i].offset > alloc->allocated_tensors[j].offset) { + const struct lm_ggml_tensor * tmp_tensor = alloc->allocated_tensors[i].tensor; + size_t tmp_offset = alloc->allocated_tensors[i].offset; + alloc->allocated_tensors[i].tensor = alloc->allocated_tensors[j].tensor; + alloc->allocated_tensors[i].offset = alloc->allocated_tensors[j].offset; + alloc->allocated_tensors[j].tensor = tmp_tensor; + alloc->allocated_tensors[j].offset = tmp_offset; + } + } + } + fprintf(stderr, "max_size = %.2f MB: tensors: ", cur_max / 1024.0 / 1024.0); for (int i = 0; i < 1024; i++) { - if (alloc->allocated_tensors[i]) { - printf("%s (%.2f MB) ", alloc->allocated_tensors[i]->name, lm_ggml_nbytes(alloc->allocated_tensors[i]) / 1024.0 / 1024.0); + if (alloc->allocated_tensors[i].tensor) { + fprintf(stderr, "%s [%zx-%zx] (%.2f MB) ", alloc->allocated_tensors[i].tensor->name, + alloc->allocated_tensors[i].offset, + alloc->allocated_tensors[i].offset + lm_ggml_nbytes(alloc->allocated_tensors[i].tensor), + lm_ggml_nbytes(alloc->allocated_tensors[i].tensor) / 1024.0 / 1024.0); } } - printf("\n"); + fprintf(stderr, "\n"); } #endif - alloc->max_size = MAX(alloc->max_size, (char*)addr - (char*)alloc->base + size); -} + alloc->max_size = MAX(alloc->max_size, offset + size); -// this is a very naive implementation, but for our case the number of free blocks should be very small -static void lm_ggml_tallocr_free_tensor(lm_ggml_tallocr_t alloc, struct lm_ggml_tensor * tensor) { - if (lm_ggml_tallocr_is_own(alloc, tensor) == false) { - // the tensor was not allocated in this buffer - // this can happen because the graph allocator will try to free weights and other tensors from different buffers - // the easiest way to deal with this is just to ignore it - // AT_PRINTF("ignoring %s (their buffer: %p, our buffer: %p)\n", tensor->name, (void *)tensor->buffer, (void *)alloc->buffer); - return; - } + return offset; - void * ptr = tensor->data; + LM_GGML_UNUSED(tensor); +} - size_t size = lm_ggml_backend_buffer_get_alloc_size(alloc->buffer, tensor); +// this is a very naive implementation, but for our case the number of free blocks should be very small +static void lm_ggml_dyn_tallocr_free_tensor(struct lm_ggml_dyn_tallocr * alloc, size_t offset, size_t size, const struct lm_ggml_tensor * tensor) { size = aligned_offset(NULL, size, alloc->alignment); - AT_PRINTF("%s: freeing %s at %p (%zu bytes) - n_free_blocks = %d\n", __func__, tensor->name, ptr, size, alloc->n_free_blocks); + + AT_PRINTF("%s: freeing %s at %zu (%zu bytes) - n_free_blocks = %d\n", __func__, tensor->name, offset, size, alloc->n_free_blocks); #ifdef LM_GGML_ALLOCATOR_DEBUG - remove_allocated_tensor(alloc, tensor); + remove_allocated_tensor(alloc, offset, tensor); #endif // see if we can merge with an existing block for (int i = 0; i < alloc->n_free_blocks; i++) { struct free_block * block = &alloc->free_blocks[i]; // check if ptr is at the end of the block - if ((char*)block->addr + block->size == ptr) { + if (block->offset + block->size == offset) { block->size += size; // check if we can merge with the next block - if (i < alloc->n_free_blocks - 1 && (char*)block->addr + block->size == alloc->free_blocks[i+1].addr) { + if (i < alloc->n_free_blocks - 1 && block->offset + block->size == alloc->free_blocks[i+1].offset) { block->size += alloc->free_blocks[i+1].size; 
alloc->n_free_blocks--; for (int j = i+1; j < alloc->n_free_blocks; j++) { @@ -190,11 +275,11 @@ static void lm_ggml_tallocr_free_tensor(lm_ggml_tallocr_t alloc, struct lm_ggml_ return; } // check if ptr is at the beginning of the block - if ((char*)ptr + size == block->addr) { - block->addr = ptr; + if (offset + size == block->offset) { + block->offset = offset; block->size += size; // check if we can merge with the previous block - if (i > 0 && (char*)alloc->free_blocks[i-1].addr + alloc->free_blocks[i-1].size == block->addr) { + if (i > 0 && alloc->free_blocks[i-1].offset + alloc->free_blocks[i-1].size == block->offset) { alloc->free_blocks[i-1].size += block->size; alloc->n_free_blocks--; for (int j = i; j < alloc->n_free_blocks; j++) { @@ -208,7 +293,7 @@ static void lm_ggml_tallocr_free_tensor(lm_ggml_tallocr_t alloc, struct lm_ggml_ LM_GGML_ASSERT(alloc->n_free_blocks < MAX_FREE_BLOCKS && "out of free blocks"); // insert the new block in the correct position to keep the array sorted by address (to make merging blocks faster) int insert_pos = 0; - while (insert_pos < alloc->n_free_blocks && alloc->free_blocks[insert_pos].addr < ptr) { + while (insert_pos < alloc->n_free_blocks && alloc->free_blocks[insert_pos].offset < offset) { insert_pos++; } // shift all blocks from insert_pos onward to make room for the new block @@ -216,606 +301,616 @@ static void lm_ggml_tallocr_free_tensor(lm_ggml_tallocr_t alloc, struct lm_ggml_ alloc->free_blocks[i] = alloc->free_blocks[i-1]; } // insert the new block - alloc->free_blocks[insert_pos].addr = ptr; + alloc->free_blocks[insert_pos].offset = offset; alloc->free_blocks[insert_pos].size = size; alloc->n_free_blocks++; + + LM_GGML_UNUSED(tensor); } -void lm_ggml_tallocr_reset(lm_ggml_tallocr_t alloc) { +static void lm_ggml_dyn_tallocr_reset(struct lm_ggml_dyn_tallocr * alloc) { alloc->n_free_blocks = 1; - size_t align_offset = aligned_offset(alloc->base, 0, alloc->alignment); - alloc->free_blocks[0].addr = (char *)alloc->base + align_offset; - - if (alloc->measure) { - alloc->free_blocks[0].size = SIZE_MAX/2; // restrict maximum size of a measure allocator to half size_t max to avoid overflows - } else { - alloc->free_blocks[0].size = lm_ggml_backend_buffer_get_size(alloc->buffer) - align_offset; - lm_ggml_backend_buffer_reset(alloc->buffer); - } + alloc->free_blocks[0].offset = 0; + alloc->free_blocks[0].size = SIZE_MAX/2; // restrict maximum size of a measure allocator to half size_t max to avoid overflows + alloc->max_size = 0; } -lm_ggml_tallocr_t lm_ggml_tallocr_new(void * data, size_t size, size_t alignment) { - struct lm_ggml_backend_buffer * buffer = lm_ggml_backend_cpu_buffer_from_ptr(data, size); - - lm_ggml_tallocr_t alloc = (lm_ggml_tallocr_t)malloc(sizeof(struct lm_ggml_tallocr)); +static struct lm_ggml_dyn_tallocr * lm_ggml_dyn_tallocr_new(size_t alignment) { + struct lm_ggml_dyn_tallocr * alloc = (struct lm_ggml_dyn_tallocr *)malloc(sizeof(struct lm_ggml_dyn_tallocr)); - *alloc = (struct lm_ggml_tallocr) { - /*.buffer = */ buffer, - /*.buffer_owned = */ true, - /*.base = */ lm_ggml_backend_buffer_get_base(buffer), + *alloc = (struct lm_ggml_dyn_tallocr) { /*.alignment = */ alignment, /*.n_free_blocks = */ 0, /*.free_blocks = */ {{0}}, /*.max_size = */ 0, - /*.measure = */ false, #ifdef LM_GGML_ALLOCATOR_DEBUG - /*.allocated_tensors = */ {0}, + /*.allocated_tensors = */ {{0}}, #endif }; - lm_ggml_tallocr_reset(alloc); + lm_ggml_dyn_tallocr_reset(alloc); return alloc; } -lm_ggml_tallocr_t lm_ggml_tallocr_new_measure(size_t alignment) { 
- lm_ggml_tallocr_t alloc = lm_ggml_tallocr_new((void *)0x1000, SIZE_MAX/2, alignment); - alloc->measure = true; - - return alloc; -} - -lm_ggml_tallocr_t lm_ggml_tallocr_new_measure_from_buft(struct lm_ggml_backend_buffer_type * buft) { - // create a backend buffer to get the correct tensor allocation sizes - lm_ggml_backend_buffer_t buffer = lm_ggml_backend_buft_alloc_buffer(buft, 1); - - // TODO: move alloc initialization to a common lm_ggml_tallocr_new_impl function - lm_ggml_tallocr_t alloc = lm_ggml_tallocr_new_from_buffer(buffer); - alloc->buffer_owned = true; - alloc->measure = true; - lm_ggml_tallocr_reset(alloc); - return alloc; +static void lm_ggml_dyn_tallocr_free(struct lm_ggml_dyn_tallocr * alloc) { + free(alloc); } -lm_ggml_tallocr_t lm_ggml_tallocr_new_measure_from_backend(struct lm_ggml_backend * backend) { - return lm_ggml_tallocr_new_measure_from_buft(lm_ggml_backend_get_default_buffer_type(backend)); +static size_t lm_ggml_dyn_tallocr_max_size(struct lm_ggml_dyn_tallocr * alloc) { + return alloc->max_size; } -lm_ggml_tallocr_t lm_ggml_tallocr_new_from_buft(struct lm_ggml_backend_buffer_type * buft, size_t size) { - // create a backend buffer to get the correct tensor allocation sizes - lm_ggml_backend_buffer_t buffer = lm_ggml_backend_buft_alloc_buffer(buft, size); - lm_ggml_tallocr_t alloc = lm_ggml_tallocr_new_from_buffer(buffer); - alloc->buffer_owned = true; - return alloc; -} -lm_ggml_tallocr_t lm_ggml_tallocr_new_from_backend(struct lm_ggml_backend * backend, size_t size) { - return lm_ggml_tallocr_new_from_buft(lm_ggml_backend_get_default_buffer_type(backend), size); -} +///////////////////////////////////// -lm_ggml_tallocr_t lm_ggml_tallocr_new_from_buffer(struct lm_ggml_backend_buffer * buffer) { - lm_ggml_tallocr_t alloc = (lm_ggml_tallocr_t)malloc(sizeof(struct lm_ggml_tallocr)); +// graph allocator - *alloc = (struct lm_ggml_tallocr) { - /*.buffer = */ buffer, - /*.buffer_owned = */ false, - /*.base = */ lm_ggml_backend_buffer_get_base(buffer), - /*.alignment = */ lm_ggml_backend_buffer_get_alignment(buffer), - /*.n_free_blocks = */ 0, - /*.free_blocks = */ {{0}}, - /*.max_size = */ 0, - /*.measure = */ false, -#ifdef LM_GGML_ALLOCATOR_DEBUG - /*.allocated_tensors = */ {0}, -#endif - }; +struct hash_node { + int n_children; + int n_views; + int buffer_id; + size_t offset; // offset within the buffer + bool allocated; +}; - lm_ggml_tallocr_reset(alloc); +// +struct tensor_alloc { + size_t offset; + size_t size_max; // 0 = pre-allocated, unused, or view +}; - return alloc; -} +struct node_alloc { + int buffer_id; + struct tensor_alloc dst; + struct tensor_alloc src[LM_GGML_MAX_SRC]; +}; -struct lm_ggml_backend_buffer * lm_ggml_tallocr_get_buffer(lm_ggml_tallocr_t alloc) { - return alloc->buffer; -} +struct lm_ggml_gallocr { + lm_ggml_backend_buffer_type_t * bufts; // [n_buffers] + lm_ggml_backend_buffer_t * buffers; // [n_buffers] + struct lm_ggml_dyn_tallocr ** buf_tallocs; // [n_buffers] + int n_buffers; -void lm_ggml_tallocr_free(lm_ggml_tallocr_t alloc) { - if (alloc == NULL) { - return; - } + struct lm_ggml_hash_set hash_set; + struct hash_node * hash_values; // [hash_set.size] - if (alloc->buffer_owned) { - lm_ggml_backend_buffer_free(alloc->buffer); - } - free(alloc); -} + struct node_alloc * node_allocs; // [n_nodes] + int n_nodes; -bool lm_ggml_tallocr_is_measure(lm_ggml_tallocr_t alloc) { - return alloc->measure; -} + struct tensor_alloc * leaf_allocs; // [n_leafs] + int n_leafs; +}; -size_t lm_ggml_tallocr_max_size(lm_ggml_tallocr_t alloc) { - 
return alloc->max_size; -} +lm_ggml_gallocr_t lm_ggml_gallocr_new_n(lm_ggml_backend_buffer_type_t * bufts, int n_bufs) { + lm_ggml_gallocr_t galloc = (lm_ggml_gallocr_t)calloc(sizeof(struct lm_ggml_gallocr), 1); + LM_GGML_ASSERT(galloc != NULL); -// graph allocator + galloc->bufts = calloc(sizeof(lm_ggml_backend_buffer_type_t) * n_bufs, 1); + LM_GGML_ASSERT(galloc->bufts != NULL); -struct hash_node { - int n_children; - int n_views; -}; + galloc->buffers = calloc(sizeof(lm_ggml_backend_buffer_t) * n_bufs, 1); + LM_GGML_ASSERT(galloc->buffers != NULL); -struct lm_ggml_gallocr { - lm_ggml_tallocr_t talloc; - struct lm_ggml_hash_set hash_set; - struct hash_node * hash_values; - size_t hash_values_size; - lm_ggml_tallocr_t * hash_allocs; - int * parse_seq; - int parse_seq_len; -}; + galloc->buf_tallocs = calloc(sizeof(struct lm_ggml_dyn_tallocr *) * n_bufs, 1); + LM_GGML_ASSERT(galloc->buf_tallocs != NULL); -lm_ggml_gallocr_t lm_ggml_gallocr_new(void) { - lm_ggml_gallocr_t galloc = (lm_ggml_gallocr_t)malloc(sizeof(struct lm_ggml_gallocr)); - - *galloc = (struct lm_ggml_gallocr) { - /*.talloc = */ NULL, - /*.hash_set = */ {0}, - /*.hash_values = */ NULL, - /*.hash_values_size = */ 0, - /*.hash_allocs = */ NULL, - /*.parse_seq = */ NULL, - /*.parse_seq_len = */ 0, - }; + for (int i = 0; i < n_bufs; i++) { + galloc->bufts[i] = bufts[i]; + galloc->buffers[i] = NULL; + size_t alignment = lm_ggml_backend_buft_get_alignment(bufts[i]); + galloc->buf_tallocs[i] = lm_ggml_dyn_tallocr_new(alignment); + } + galloc->n_buffers = n_bufs; return galloc; } +lm_ggml_gallocr_t lm_ggml_gallocr_new(lm_ggml_backend_buffer_type_t buft) { + return lm_ggml_gallocr_new_n(&buft, 1); +} + void lm_ggml_gallocr_free(lm_ggml_gallocr_t galloc) { if (galloc == NULL) { return; } - if (galloc->hash_set.keys != NULL) { - free(galloc->hash_set.keys); - } - if (galloc->hash_values != NULL) { - free(galloc->hash_values); - } - if (galloc->hash_allocs != NULL) { - free(galloc->hash_allocs); - } - if (galloc->parse_seq != NULL) { - free(galloc->parse_seq); + for (int i = 0; i < galloc->n_buffers; i++) { + if (galloc->buffers != NULL) { + lm_ggml_backend_buffer_free(galloc->buffers[i]); + } + if (galloc->buf_tallocs != NULL) { + lm_ggml_dyn_tallocr_free(galloc->buf_tallocs[i]); + } } + + free(galloc->hash_set.keys); + free(galloc->hash_values); + free(galloc->bufts); + free(galloc->buffers); + free(galloc->buf_tallocs); + free(galloc->node_allocs); + free(galloc->leaf_allocs); free(galloc); } -void lm_ggml_gallocr_set_parse_seq(lm_ggml_gallocr_t galloc, const int * list, int n) { - free(galloc->parse_seq); - galloc->parse_seq = malloc(sizeof(int) * n); - - for (int i = 0; i < n; i++) { - galloc->parse_seq[i] = list[i]; - } - galloc->parse_seq_len = n; -} +typedef struct lm_ggml_gallocr * lm_ggml_gallocr_t; -static struct hash_node * hash_get(lm_ggml_gallocr_t galloc, struct lm_ggml_tensor * t) { +static struct hash_node * lm_ggml_gallocr_hash_get(lm_ggml_gallocr_t galloc, struct lm_ggml_tensor * t) { size_t i = lm_ggml_hash_find_or_insert(galloc->hash_set, t); return &galloc->hash_values[i]; } -static bool lm_ggml_are_same_layout(const struct lm_ggml_tensor * a, const struct lm_ggml_tensor * b) { - if (a->type != b->type) { - return false; - } - for (int i = 0; i < LM_GGML_MAX_DIMS; i++) { - if (a->ne[i] != b->ne[i]) { - return false; - } - if (a->nb[i] != b->nb[i]) { - return false; - } - } - return true; +static bool lm_ggml_gallocr_is_own(lm_ggml_gallocr_t galloc, struct lm_ggml_tensor * t) { + return 
lm_ggml_gallocr_hash_get(galloc, t)->allocated; } -static bool lm_ggml_op_can_inplace(enum lm_ggml_op op) { - switch (op) { - case LM_GGML_OP_SCALE: - case LM_GGML_OP_DIAG_MASK_ZERO: - case LM_GGML_OP_DIAG_MASK_INF: - case LM_GGML_OP_ADD: - case LM_GGML_OP_ADD1: - case LM_GGML_OP_SUB: - case LM_GGML_OP_MUL: - case LM_GGML_OP_DIV: - case LM_GGML_OP_SQR: - case LM_GGML_OP_SQRT: - case LM_GGML_OP_LOG: - case LM_GGML_OP_UNARY: - case LM_GGML_OP_ROPE: - case LM_GGML_OP_RMS_NORM: - case LM_GGML_OP_SOFT_MAX: - return true; - - default: - return false; - } +static void lm_ggml_gallocr_set_node_offset(lm_ggml_gallocr_t galloc, struct lm_ggml_tensor * node, int buffer_id, size_t offset) { + struct hash_node * hn = lm_ggml_gallocr_hash_get(galloc, node); + hn->buffer_id = buffer_id; + hn->offset = offset; + hn->allocated = true; } -static lm_ggml_tallocr_t node_tallocr(lm_ggml_gallocr_t galloc, struct lm_ggml_tensor * node) { - if (galloc->talloc != NULL) { - return galloc->talloc; - } - - return galloc->hash_allocs[lm_ggml_hash_find_or_insert(galloc->hash_set, node)]; +static bool lm_ggml_gallocr_is_allocated(lm_ggml_gallocr_t galloc, struct lm_ggml_tensor * t) { + return t->data != NULL || lm_ggml_gallocr_hash_get(galloc, t)->allocated; } -static void init_view(lm_ggml_gallocr_t galloc, struct lm_ggml_tensor * view, bool update_backend) { - lm_ggml_tallocr_t alloc = node_tallocr(galloc, view); - - LM_GGML_ASSERT(view->view_src != NULL && view->view_src->data != NULL); - if (update_backend) { - view->backend = view->view_src->backend; - } - // views are initialized in the alloc buffer rather than the view_src buffer - view->buffer = alloc->buffer; - view->data = (char *)view->view_src->data + view->view_offs; +static void lm_ggml_gallocr_allocate_node(lm_ggml_gallocr_t galloc, struct lm_ggml_tensor * node, int buffer_id) { + struct hash_node * hn = lm_ggml_gallocr_hash_get(galloc, node); - assert(lm_ggml_tallocr_is_measure(alloc) || !view->buffer || view->buffer->buft == alloc->buffer->buft); + if (!lm_ggml_gallocr_is_allocated(galloc, node) && !lm_ggml_is_view(node)) { + hn->allocated = true; + assert(hn->offset == 0); - if (!alloc->measure) { - lm_ggml_backend_buffer_init_tensor(alloc->buffer, view); - } -} + // try to reuse a parent's buffer (inplace) + if (lm_ggml_op_can_inplace(node->op)) { + for (int i = 0; i < LM_GGML_MAX_SRC; i++) { + struct lm_ggml_tensor * parent = node->src[i]; + if (parent == NULL) { + continue; + } -static void allocate_node(lm_ggml_gallocr_t galloc, struct lm_ggml_tensor * node) { - lm_ggml_tallocr_t alloc = node_tallocr(galloc, node); + // if the node's data is external, then we cannot re-use it + if (!lm_ggml_gallocr_is_own(galloc, parent)) { + AT_PRINTF("not reusing parent %s for %s as %p is external\n", parent->name, node->name, parent->data); + continue; + } - if (node->data == NULL) { - if (lm_ggml_is_view(node)) { - init_view(galloc, node, true); - } else { - // see if we can reuse a parent's buffer (inplace) - if (lm_ggml_op_can_inplace(node->op)) { - for (int i = 0; i < LM_GGML_MAX_SRC; i++) { - struct lm_ggml_tensor * parent = node->src[i]; - if (parent == NULL) { - break; - } + // outputs cannot be reused + if (parent->flags & LM_GGML_TENSOR_FLAG_OUTPUT || (parent->view_src != NULL && parent->view_src->flags & LM_GGML_TENSOR_FLAG_OUTPUT)) { + AT_PRINTF("not reusing parent %s for %s as it is an output\n", parent->name, node->name); + continue; + } - // if the node's data is external, then we cannot re-use it - if (lm_ggml_tallocr_is_own(alloc, parent) == 
false) { - AT_PRINTF("not reusing parent %s for %s as %p is external\n", parent->name, node->name, parent->data); - continue; - } + if (!lm_ggml_are_same_layout(node, parent)) { + AT_PRINTF("not reusing parent %s for %s as layouts are different\n", parent->name, node->name); + continue; + } - struct hash_node * p_hn = hash_get(galloc, parent); - if (parent->data != NULL && p_hn->n_children == 1 && p_hn->n_views == 0 && lm_ggml_are_same_layout(node, parent)) { - if (lm_ggml_is_view(parent)) { - struct lm_ggml_tensor * view_src = parent->view_src; - struct hash_node * view_src_hn = hash_get(galloc, view_src); - if (view_src_hn->n_views == 1 && view_src_hn->n_children == 0 && view_src->data == parent->data) { - // TODO: the offset of the view parent must be kept to ensure that the op doesn't overwrite - // the parent's data that it will need later (same layout requirement). the problem is that then - // we cannot free the tensor because the original address of the allocation is lost. - // adding a view_src pointer to the tensor would solve this and simplify the code dealing with views - // for now, we only reuse the parent's data if the offset is zero (view_src->data == parent->data) - AT_PRINTF("reusing view parent %s (%s) for %s\n", parent->name, view_src->name, node->name); - node->view_src = view_src; - view_src_hn->n_views += 1; - init_view(galloc, node, false); - return; - } - } else { - AT_PRINTF("reusing parent %s for %s\n", parent->name, node->name); - node->view_src = parent; - p_hn->n_views += 1; - init_view(galloc, node, false); + struct hash_node * p_hn = lm_ggml_gallocr_hash_get(galloc, parent); + if (p_hn->n_children == 1 && p_hn->n_views == 0) { + if (lm_ggml_is_view(parent)) { + struct lm_ggml_tensor * view_src = parent->view_src; + struct hash_node * view_src_hn = lm_ggml_gallocr_hash_get(galloc, view_src); + if (view_src_hn->n_views == 1 && view_src_hn->n_children == 0 && view_src->data == parent->data) { + AT_PRINTF("reusing view parent %s (%s) for %s\n", parent->name, view_src->name, node->name); + assert(view_src_hn->offset == p_hn->offset); + hn->buffer_id = p_hn->buffer_id; + hn->offset = p_hn->offset; + p_hn->allocated = false; // avoid freeing the parent + view_src_hn->allocated = false; return; } + } else { + AT_PRINTF("reusing parent %s for %s\n", parent->name, node->name); + hn->buffer_id = p_hn->buffer_id; + hn->offset = p_hn->offset; + p_hn->allocated = false; // avoid freeing the parent + return; } } } - lm_ggml_tallocr_alloc(alloc, node); } + // allocate tensor from the buffer + struct lm_ggml_dyn_tallocr * alloc = galloc->buf_tallocs[buffer_id]; + lm_ggml_backend_buffer_type_t buft = galloc->bufts[buffer_id]; + size_t size = lm_ggml_backend_buft_get_alloc_size(buft, node); + size_t offset = lm_ggml_dyn_tallocr_alloc(alloc, size, node); + hn->buffer_id = buffer_id; + hn->offset = offset; + return; } } -static void free_node(lm_ggml_gallocr_t galloc, struct lm_ggml_tensor * node) { - lm_ggml_tallocr_t alloc = node_tallocr(galloc, node); +static void lm_ggml_gallocr_free_node(lm_ggml_gallocr_t galloc, struct lm_ggml_tensor * node, int buffer_id) { + // graph outputs are never freed + if (node->flags & LM_GGML_TENSOR_FLAG_OUTPUT) { + AT_PRINTF("not freeing output %s\n", node->name); + return; + } + + struct lm_ggml_dyn_tallocr * alloc = galloc->buf_tallocs[buffer_id]; + lm_ggml_backend_buffer_type_t buft = galloc->bufts[buffer_id]; + struct hash_node * hn = lm_ggml_gallocr_hash_get(galloc, node); + size_t offset = hn->offset; + size_t size = 
lm_ggml_backend_buft_get_alloc_size(buft, node); + lm_ggml_dyn_tallocr_free_tensor(alloc, offset, size, node); + hn->allocated = false; +} - lm_ggml_tallocr_free_tensor(alloc, node); +static int get_node_buffer_id(const int * node_buffer_ids, int i) { + return node_buffer_ids ? node_buffer_ids[i] : 0; } -static void lm_ggml_tallocr_alloc_graph_impl(lm_ggml_gallocr_t galloc, struct lm_ggml_cgraph * gf) { - const int * parse_seq = galloc->parse_seq; - int parse_seq_len = galloc->parse_seq_len; +static void lm_ggml_gallocr_alloc_graph_impl(lm_ggml_gallocr_t galloc, struct lm_ggml_cgraph * graph, const int * node_buffer_ids) { + // clear hash tables + memset(galloc->hash_set.keys, 0, galloc->hash_set.size * sizeof(struct lm_ggml_tensor *)); + memset(galloc->hash_values, 0, galloc->hash_set.size * sizeof(struct hash_node)); // count number of children and views - for (int i = 0; i < gf->n_nodes; i++) { - struct lm_ggml_tensor * node = gf->nodes[i]; + // allocate all graph inputs and leafs first to avoid overwriting them + for (int i = 0; i < graph->n_nodes; i++) { + struct lm_ggml_tensor * node = graph->nodes[i]; if (lm_ggml_is_view(node)) { struct lm_ggml_tensor * view_src = node->view_src; - hash_get(galloc, view_src)->n_views += 1; - if (node->buffer == NULL && node->data != NULL) { - // view of a pre-allocated tensor, didn't call init_view() yet - init_view(galloc, node, true); - } + lm_ggml_gallocr_hash_get(galloc, view_src)->n_views += 1; + } + + if (node->flags & LM_GGML_TENSOR_FLAG_INPUT) { + lm_ggml_gallocr_allocate_node(galloc, graph->nodes[i], get_node_buffer_id(node_buffer_ids, i)); } for (int j = 0; j < LM_GGML_MAX_SRC; j++) { - struct lm_ggml_tensor * parent = node->src[j]; - if (parent == NULL) { - break; + struct lm_ggml_tensor * src = node->src[j]; + if (src == NULL) { + continue; } - hash_get(galloc, parent)->n_children += 1; - if (lm_ggml_is_view(parent) && parent->buffer == NULL && parent->data != NULL) { - init_view(galloc, parent, true); + + lm_ggml_gallocr_hash_get(galloc, src)->n_children += 1; + + // allocate explicit inputs and leafs + if (src->flags & LM_GGML_TENSOR_FLAG_INPUT || src->op == LM_GGML_OP_NONE) { + lm_ggml_gallocr_allocate_node(galloc, src, get_node_buffer_id(node_buffer_ids, i)); } } - } + } + + // allocate the remaining leafs that are unused on the graph + // these are effectively static tensors that the application is not using in the graph, but may still want to allocate for other purposes + for (int i = 0; i < graph->n_leafs; i++) { + struct lm_ggml_tensor * leaf = graph->leafs[i]; + struct hash_node * hn = lm_ggml_gallocr_hash_get(galloc, leaf); + + if (hn->n_children == 0) { + assert(!hn->allocated); + // since buffer ids are only given for nodes, these leafs are always allocated in the first buffer + lm_ggml_gallocr_allocate_node(galloc, leaf, 0); + } + } // allocate tensors - // if we have parse_seq then we allocate nodes following the list, and we only free nodes at barriers - int last_barrier_pos = 0; - int n_nodes = parse_seq_len ? parse_seq_len : gf->n_nodes; - - for (int ind = 0; ind < n_nodes; ind++) { - // allocate a node if there is no parse_seq or this is not a barrier - if (parse_seq_len == 0 || parse_seq[ind] != -1) { - int i = parse_seq_len ? 
parse_seq[ind] : ind; - struct lm_ggml_tensor * node = gf->nodes[i]; - - // allocate parents (leafs) - for (int j = 0; j < LM_GGML_MAX_SRC; j++) { - struct lm_ggml_tensor * parent = node->src[j]; - if (parent == NULL) { - break; - } - allocate_node(galloc, parent); + for (int i = 0; i < graph->n_nodes; i++) { + struct lm_ggml_tensor * node = graph->nodes[i]; + int buffer_id = get_node_buffer_id(node_buffer_ids, i); + + // allocate parents (only leafs need to be allocated at this point) + for (int j = 0; j < LM_GGML_MAX_SRC; j++) { + struct lm_ggml_tensor * parent = node->src[j]; + if (parent == NULL) { + continue; } + lm_ggml_gallocr_allocate_node(galloc, parent, buffer_id); + } - // allocate node - allocate_node(galloc, node); + // allocate node + lm_ggml_gallocr_allocate_node(galloc, node, buffer_id); - AT_PRINTF("exec: %s (%s) <= ", lm_ggml_op_name(node->op), node->name); - for (int j = 0; j < LM_GGML_MAX_SRC; j++) { - struct lm_ggml_tensor * parent = node->src[j]; - if (parent == NULL) { - break; - } - AT_PRINTF("%s", parent->name); - if (j < LM_GGML_MAX_SRC - 1 && node->src[j + 1] != NULL) { - AT_PRINTF(", "); - } + AT_PRINTF("exec: %s (%s) <= ", lm_ggml_op_desc(node), node->name); + for (int j = 0; j < LM_GGML_MAX_SRC; j++) { + struct lm_ggml_tensor * parent = node->src[j]; + if (parent == NULL) { + continue; + } + AT_PRINTF("%s", parent->name); + if (j < LM_GGML_MAX_SRC - 1 && node->src[j + 1] != NULL) { + AT_PRINTF(", "); } - AT_PRINTF("\n"); } + AT_PRINTF("\n"); // update parents - // update immediately if there is no parse_seq - // update only at barriers if there is parse_seq - if ((parse_seq_len == 0) || parse_seq[ind] == -1) { - int update_start = parse_seq_len ? last_barrier_pos : ind; - int update_end = parse_seq_len ? ind : ind + 1; - for (int i = update_start; i < update_end; i++) { - int node_i = parse_seq_len ? 
parse_seq[i] : i; - struct lm_ggml_tensor * node = gf->nodes[node_i]; - - for (int j = 0; j < LM_GGML_MAX_SRC; j++) { - struct lm_ggml_tensor * parent = node->src[j]; - if (parent == NULL) { - break; - } - struct hash_node * p_hn = hash_get(galloc, parent); - p_hn->n_children -= 1; - - //AT_PRINTF("parent %s: %d children, %d views\n", parent->name, parent->n_children, parent->n_views); - - if (p_hn->n_children == 0 && p_hn->n_views == 0) { - if (lm_ggml_is_view(parent)) { - struct lm_ggml_tensor * view_src = parent->view_src; - struct hash_node * view_src_hn = hash_get(galloc, view_src); - view_src_hn->n_views -= 1; - AT_PRINTF("view_src %s: %d children, %d views\n", view_src->name, view_src_hn->n_children, view_src_hn->n_views); - if (view_src_hn->n_views == 0 && view_src_hn->n_children == 0) { - free_node(galloc, view_src); - } - } - else { - free_node(galloc, parent); - } + for (int j = 0; j < LM_GGML_MAX_SRC; j++) { + struct lm_ggml_tensor * parent = node->src[j]; + if (parent == NULL) { + continue; + } + struct hash_node * p_hn = lm_ggml_gallocr_hash_get(galloc, parent); + p_hn->n_children -= 1; + + AT_PRINTF("parent %s: %d children, %d views, allocated: %d\n", + parent->name, p_hn->n_children, p_hn->n_views, p_hn->allocated); + + if (p_hn->n_children == 0 && p_hn->n_views == 0) { + if (lm_ggml_is_view(parent)) { + struct lm_ggml_tensor * view_src = parent->view_src; + struct hash_node * view_src_hn = lm_ggml_gallocr_hash_get(galloc, view_src); + view_src_hn->n_views -= 1; + AT_PRINTF("view_src %s: %d children, %d views\n", + view_src->name, view_src_hn->n_children, view_src_hn->n_views); + if (view_src_hn->n_views == 0 && view_src_hn->n_children == 0 && view_src_hn->allocated) { + lm_ggml_gallocr_free_node(galloc, view_src, buffer_id); } } + else if (p_hn->allocated) { + lm_ggml_gallocr_free_node(galloc, parent, buffer_id); + } } AT_PRINTF("\n"); - if (parse_seq_len) { - last_barrier_pos = ind + 1; - } } } } -size_t lm_ggml_gallocr_alloc_graph(lm_ggml_gallocr_t galloc, lm_ggml_tallocr_t talloc, struct lm_ggml_cgraph * graph) { +bool lm_ggml_gallocr_reserve_n(lm_ggml_gallocr_t galloc, struct lm_ggml_cgraph * graph, const int * node_buffer_ids) { size_t hash_size = graph->visited_hash_table.size; - // check if the hash table is initialized and large enough + // initialize hash table if (galloc->hash_set.size < hash_size) { - if (galloc->hash_set.keys != NULL) { - free(galloc->hash_set.keys); - } - if (galloc->hash_values != NULL) { - free(galloc->hash_values); - } - galloc->hash_set.keys = malloc(sizeof(struct lm_ggml_tensor *) * hash_size); + free(galloc->hash_set.keys); + free(galloc->hash_values); galloc->hash_set.size = hash_size; - galloc->hash_values = malloc(sizeof(struct hash_node) * hash_size); + galloc->hash_set.keys = calloc(sizeof(struct lm_ggml_tensor *), hash_size); + galloc->hash_values = calloc(sizeof(struct hash_node), hash_size); + LM_GGML_ASSERT(galloc->hash_set.keys != NULL); + LM_GGML_ASSERT(galloc->hash_values != NULL); + } else { + // reset hash table + memset(galloc->hash_set.keys, 0, sizeof(struct lm_ggml_tensor *) * galloc->hash_set.size); + memset(galloc->hash_values, 0, sizeof(struct hash_node) * galloc->hash_set.size); } - // reset hash table - memset(galloc->hash_set.keys, 0, sizeof(struct lm_ggml_tensor *) * hash_size); - memset(galloc->hash_values, 0, sizeof(struct hash_node) * hash_size); - - galloc->talloc = talloc; - lm_ggml_tallocr_alloc_graph_impl(galloc, graph); - galloc->talloc = NULL; - - size_t max_size = lm_ggml_tallocr_max_size(talloc); - - 
return max_size; -} - -void lm_ggml_gallocr_alloc_graph_n(lm_ggml_gallocr_t galloc, struct lm_ggml_cgraph * graph, struct lm_ggml_hash_set hash_set, lm_ggml_tallocr_t * hash_node_talloc) { - const size_t hash_size = hash_set.size; - - LM_GGML_ASSERT(hash_size >= (size_t)(graph->n_nodes + graph->n_leafs)); + // reset allocators + for (int i = 0; i < galloc->n_buffers; i++) { + lm_ggml_dyn_tallocr_reset(galloc->buf_tallocs[i]); + } - galloc->talloc = NULL; + // allocate in hash table + lm_ggml_gallocr_alloc_graph_impl(galloc, graph, node_buffer_ids); - // alloc hash_values if needed - if (galloc->hash_values == NULL || galloc->hash_values_size < hash_size) { - free(galloc->hash_values); - galloc->hash_values = malloc(sizeof(struct hash_node) * hash_size); - galloc->hash_values_size = hash_size; + // set the node_allocs from the hash table + if (galloc->n_nodes < graph->n_nodes) { + free(galloc->node_allocs); + galloc->node_allocs = calloc(sizeof(struct node_alloc), graph->n_nodes); + LM_GGML_ASSERT(galloc->node_allocs != NULL); } - - // free hash_set.keys if needed - if (galloc->hash_set.keys != NULL) { - free(galloc->hash_set.keys); + galloc->n_nodes = graph->n_nodes; + for (int i = 0; i < graph->n_nodes; i++) { + struct lm_ggml_tensor * node = graph->nodes[i]; + struct node_alloc * node_alloc = &galloc->node_allocs[i]; + node_alloc->buffer_id = get_node_buffer_id(node_buffer_ids, i); + if (node->view_src || node->data) { + node_alloc->dst.offset = SIZE_MAX; + node_alloc->dst.size_max = 0; + } else { + struct hash_node * hn = lm_ggml_gallocr_hash_get(galloc, node); + node_alloc->dst.offset = hn->offset; + node_alloc->dst.size_max = lm_ggml_backend_buft_get_alloc_size(galloc->bufts[hn->buffer_id], node); + } + for (int j = 0; j < LM_GGML_MAX_SRC; j++) { + struct lm_ggml_tensor * src = node->src[j]; + if (!src || src->view_src || src->data) { + node_alloc->src[j].offset = SIZE_MAX; + node_alloc->src[j].size_max = 0; + } else { + struct hash_node * hn = lm_ggml_gallocr_hash_get(galloc, src); + node_alloc->src[j].offset = hn->offset; + node_alloc->src[j].size_max = lm_ggml_backend_buft_get_alloc_size(galloc->bufts[hn->buffer_id], src); + } + } + } + if (galloc->n_leafs < graph->n_leafs) { + free(galloc->leaf_allocs); + galloc->leaf_allocs = calloc(sizeof(struct tensor_alloc), graph->n_leafs); + LM_GGML_ASSERT(galloc->leaf_allocs != NULL); + } + galloc->n_leafs = graph->n_leafs; + for (int i = 0; i < graph->n_leafs; i++) { + struct lm_ggml_tensor * leaf = graph->leafs[i]; + struct hash_node * hn = lm_ggml_gallocr_hash_get(galloc, leaf); + galloc->leaf_allocs[i].offset = hn->offset; + galloc->leaf_allocs[i].size_max = lm_ggml_backend_buft_get_alloc_size(galloc->bufts[hn->buffer_id], leaf); } - galloc->hash_set = hash_set; - - // reset hash values - memset(galloc->hash_values, 0, sizeof(struct hash_node) * hash_size); - galloc->hash_allocs = hash_node_talloc; + // reallocate buffers if needed + for (int i = 0; i < galloc->n_buffers; i++) { + size_t cur_size = galloc->buffers[i] ? 
lm_ggml_backend_buffer_get_size(galloc->buffers[i]) : 0; + size_t new_size = lm_ggml_dyn_tallocr_max_size(galloc->buf_tallocs[i]); - lm_ggml_tallocr_alloc_graph_impl(galloc, graph); + if (new_size > cur_size) { +#ifndef NDEBUG + fprintf(stderr, "%s: reallocating %s buffer from size %.02f MiB to %.02f MiB\n", __func__, lm_ggml_backend_buft_name(galloc->bufts[i]), cur_size / 1024.0 / 1024.0, new_size / 1024.0 / 1024.0); +#endif + lm_ggml_backend_buffer_free(galloc->buffers[i]); + galloc->buffers[i] = lm_ggml_backend_buft_alloc_buffer(galloc->bufts[i], new_size); + if (galloc->buffers[i] == NULL) { + fprintf(stderr, "%s: failed to allocate %s buffer of size %zu\n", __func__, lm_ggml_backend_buft_name(galloc->bufts[i]), new_size); + return false; + } + } + } - // remove unowned resources - galloc->hash_set.keys = NULL; - galloc->hash_allocs = NULL; + return true; } -// legacy API wrapper - -struct lm_ggml_allocr { - lm_ggml_tallocr_t talloc; - lm_ggml_gallocr_t galloc; -}; - -static lm_ggml_allocr_t lm_ggml_allocr_new_impl(lm_ggml_tallocr_t talloc) { - lm_ggml_allocr_t alloc = (lm_ggml_allocr_t)malloc(sizeof(struct lm_ggml_allocr)); - *alloc = (struct lm_ggml_allocr) { - /*.talloc = */ talloc, - /*.galloc = */ lm_ggml_gallocr_new(), - }; - return alloc; +bool lm_ggml_gallocr_reserve(lm_ggml_gallocr_t galloc, struct lm_ggml_cgraph *graph) { + return lm_ggml_gallocr_reserve_n(galloc, graph, NULL); } -lm_ggml_allocr_t lm_ggml_allocr_new(void * data, size_t size, size_t alignment) { - return lm_ggml_allocr_new_impl(lm_ggml_tallocr_new(data, size, alignment)); -} +static void lm_ggml_gallocr_init_tensor(lm_ggml_gallocr_t galloc, struct lm_ggml_tensor * node, int buffer_id, struct tensor_alloc * tensor_alloc) { + assert(node->data || node->view_src || lm_ggml_backend_buffer_get_alloc_size(galloc->buffers[buffer_id], node) <= tensor_alloc->size_max); -lm_ggml_allocr_t lm_ggml_allocr_new_measure(size_t alignment) { - return lm_ggml_allocr_new_impl(lm_ggml_tallocr_new_measure(alignment)); + if (node->view_src != NULL) { + if (node->buffer == NULL) { + assert(tensor_alloc->offset == SIZE_MAX); + if (node->view_src->buffer == NULL) { + // this tensor was allocated without ggml-backend + return; + } + lm_ggml_backend_view_init(galloc->buffers[buffer_id], node); + } + } else { + if (node->data == NULL) { + assert(tensor_alloc->offset != SIZE_MAX); + assert(lm_ggml_backend_buffer_get_alloc_size(galloc->buffers[buffer_id], node) <= tensor_alloc->size_max); + void * base = lm_ggml_backend_buffer_get_base(galloc->buffers[buffer_id]); + void * addr = (char *)base + tensor_alloc->offset; + lm_ggml_backend_tensor_alloc(galloc->buffers[buffer_id], node, addr); + } else { + if (node->buffer == NULL) { + // this tensor was allocated without ggml-backend + return; + } + } + } } -lm_ggml_allocr_t lm_ggml_allocr_new_from_buffer(struct lm_ggml_backend_buffer * buffer) { - return lm_ggml_allocr_new_impl(lm_ggml_tallocr_new_from_buffer(buffer)); +static bool lm_ggml_gallocr_node_needs_realloc(lm_ggml_gallocr_t galloc, struct lm_ggml_tensor * node, struct node_alloc * nalloc, struct tensor_alloc * talloc) { + lm_ggml_backend_buffer_type_t buft = galloc->bufts[nalloc->buffer_id]; + size_t node_size = (node->data || node->view_src) ? 
0 : lm_ggml_backend_buft_get_alloc_size(buft, node); + return talloc->size_max >= node_size; } -lm_ggml_allocr_t lm_ggml_allocr_new_from_backend(struct lm_ggml_backend * backend, size_t size) { - return lm_ggml_allocr_new_impl(lm_ggml_tallocr_new_from_backend(backend, size)); -} +static bool lm_ggml_gallocr_needs_realloc(lm_ggml_gallocr_t galloc, struct lm_ggml_cgraph * graph) { + if (galloc->n_nodes != graph->n_nodes) { +#ifndef NDEBUG + fprintf(stderr, "%s: graph has different number of nodes\n", __func__); +#endif + return true; + } -lm_ggml_allocr_t lm_ggml_allocr_new_measure_from_backend(struct lm_ggml_backend * backend) { - return lm_ggml_allocr_new_impl(lm_ggml_tallocr_new_measure_from_backend(backend)); -} + if (galloc->n_leafs != graph->n_leafs) { +#ifndef NDEBUG + fprintf(stderr, "%s: graph has different number of leafs\n", __func__); +#endif + return true; + } -struct lm_ggml_backend_buffer * lm_ggml_allocr_get_buffer(lm_ggml_allocr_t alloc) { - return lm_ggml_tallocr_get_buffer(alloc->talloc); -} + for (int i = 0; i < graph->n_nodes; i++) { + struct lm_ggml_tensor * node = graph->nodes[i]; + struct node_alloc * node_alloc = &galloc->node_allocs[i]; -void lm_ggml_allocr_set_parse_seq(lm_ggml_allocr_t alloc, const int * list, int n) { - lm_ggml_gallocr_set_parse_seq(alloc->galloc, list, n); -} + if (!lm_ggml_gallocr_node_needs_realloc(galloc, node, node_alloc, &node_alloc->dst)) { +#ifndef NDEBUG + fprintf(stderr, "%s: node %s is not valid\n", __func__, node->name); +#endif + return true; + } -void lm_ggml_allocr_free(lm_ggml_allocr_t alloc) { - if (alloc == NULL) { - return; + for (int j = 0; j < LM_GGML_MAX_SRC; j++) { + struct lm_ggml_tensor * src = node->src[j]; + if (src == NULL) { + continue; + } + if (!lm_ggml_gallocr_node_needs_realloc(galloc, src, node_alloc, &node_alloc->src[j])) { +#ifndef NDEBUG + fprintf(stderr, "%s: src %d (%s) of node %s is not valid\n", __func__, j, src->name, node->name); +#endif + return true; + } + } } - lm_ggml_gallocr_free(alloc->galloc); - lm_ggml_tallocr_free(alloc->talloc); - free(alloc); + return false; } -bool lm_ggml_allocr_is_measure(lm_ggml_allocr_t alloc) { - return lm_ggml_tallocr_is_measure(alloc->talloc); -} - -void lm_ggml_allocr_reset(lm_ggml_allocr_t alloc) { - lm_ggml_tallocr_reset(alloc->talloc); -} +bool lm_ggml_gallocr_alloc_graph(lm_ggml_gallocr_t galloc, struct lm_ggml_cgraph * graph) { + if (lm_ggml_gallocr_needs_realloc(galloc, graph)) { + if (galloc->n_buffers == 1) { +#ifndef NDEBUG + fprintf(stderr, "%s: reallocating buffers automatically\n", __func__); +#endif + if (!lm_ggml_gallocr_reserve(galloc, graph)) { + return false; + } + } else { +#ifndef NDEBUG + fprintf(stderr, "%s: cannot reallocate multi buffer graph automatically, call reserve\n", __func__); +#endif + return false; + } + } -void lm_ggml_allocr_alloc(lm_ggml_allocr_t alloc, struct lm_ggml_tensor * tensor) { - lm_ggml_tallocr_alloc(alloc->talloc, tensor); -} + // reset buffers + for (int i = 0; i < galloc->n_buffers; i++) { + // zero size buffers are not allocated + if (galloc->buffers[i] != NULL) { + lm_ggml_backend_buffer_reset(galloc->buffers[i]); + } + } -size_t lm_ggml_allocr_max_size(lm_ggml_allocr_t alloc) { - return lm_ggml_tallocr_max_size(alloc->talloc); -} + // allocate the graph tensors from the previous assignments + // nodes + for (int i = 0; i < graph->n_nodes; i++) { + struct lm_ggml_tensor * node = graph->nodes[i]; + struct node_alloc * node_alloc = &galloc->node_allocs[i]; + for (int j = 0; j < LM_GGML_MAX_SRC; j++) { + struct 
lm_ggml_tensor * src = node->src[j]; + if (src == NULL) { + continue; + } + lm_ggml_gallocr_init_tensor(galloc, src, node_alloc->buffer_id, &node_alloc->src[j]); + } + lm_ggml_gallocr_init_tensor(galloc, node, node_alloc->buffer_id, &node_alloc->dst); + } + // leafs + for (int i = 0; i < graph->n_leafs; i++) { + struct lm_ggml_tensor * leaf = graph->leafs[i]; + struct tensor_alloc * leaf_alloc = &galloc->leaf_allocs[i]; + lm_ggml_gallocr_init_tensor(galloc, leaf, 0, leaf_alloc); + } -size_t lm_ggml_allocr_alloc_graph(lm_ggml_allocr_t alloc, struct lm_ggml_cgraph * graph) { - return lm_ggml_gallocr_alloc_graph(alloc->galloc, alloc->talloc, graph); + return true; } -// utils -lm_ggml_backend_buffer_t lm_ggml_backend_alloc_ctx_tensors_from_buft(struct lm_ggml_context * ctx, lm_ggml_backend_buffer_type_t buft) { - LM_GGML_ASSERT(lm_ggml_get_no_alloc(ctx) == true); +size_t lm_ggml_gallocr_get_buffer_size(lm_ggml_gallocr_t galloc, int buffer_id) { + LM_GGML_ASSERT(buffer_id >= 0 && buffer_id < galloc->n_buffers); - size_t alignment = lm_ggml_backend_buft_get_alignment(buft); - - size_t nbytes = 0; - for (struct lm_ggml_tensor * t = lm_ggml_get_first_tensor(ctx); t != NULL; t = lm_ggml_get_next_tensor(ctx, t)) { - if (t->data == NULL && t->view_src == NULL) { - nbytes += LM_GGML_PAD(lm_ggml_backend_buft_get_alloc_size(buft, t), alignment); - } + if (galloc->buffers[buffer_id] == NULL) { + return 0; } + return lm_ggml_backend_buffer_get_size(galloc->buffers[buffer_id]); +} - if (nbytes == 0) { - // all the tensors in the context are already allocated -#ifndef NDEBUG - fprintf(stderr, "%s: all tensors in the context are already allocated\n", __func__); -#endif - return NULL; - } +// utils - lm_ggml_backend_buffer_t buffer = lm_ggml_backend_buft_alloc_buffer(buft, nbytes); +static bool alloc_tensor_range(struct lm_ggml_context * ctx, + struct lm_ggml_tensor * first, struct lm_ggml_tensor * last, + lm_ggml_backend_buffer_type_t buft, size_t size, + lm_ggml_backend_buffer_t ** buffers, size_t * n_buffers) { + lm_ggml_backend_buffer_t buffer = lm_ggml_backend_buft_alloc_buffer(buft, size); if (buffer == NULL) { - // failed to allocate buffer #ifndef NDEBUG - fprintf(stderr, "%s: failed to allocate buffer\n", __func__); + fprintf(stderr, "%s: failed to allocate %s buffer of size %zu\n", __func__, lm_ggml_backend_buft_name(buft), size); #endif - return NULL; + for (size_t i = 0; i < *n_buffers; i++) { + lm_ggml_backend_buffer_free(*buffers[i]); + } + free(*buffers); + return false; } - lm_ggml_tallocr_t tallocr = lm_ggml_tallocr_new_from_buffer(buffer); + struct lm_ggml_tallocr * tallocr = lm_ggml_tallocr_new(buffer); - for (struct lm_ggml_tensor * t = lm_ggml_get_first_tensor(ctx); t != NULL; t = lm_ggml_get_next_tensor(ctx, t)) { + for (struct lm_ggml_tensor * t = first; t != last; t = lm_ggml_get_next_tensor(ctx, t)) { if (t->data == NULL) { if (t->view_src == NULL) { lm_ggml_tallocr_alloc(tallocr, t); - } else { + } else if (t->buffer == NULL) { lm_ggml_backend_view_init(buffer, t); } } else { - if (t->view_src != NULL) { + if (t->view_src != NULL && t->buffer == NULL) { // view of a pre-allocated tensor lm_ggml_backend_view_init(buffer, t); } @@ -824,6 +919,74 @@ lm_ggml_backend_buffer_t lm_ggml_backend_alloc_ctx_tensors_from_buft(struct lm_g lm_ggml_tallocr_free(tallocr); + *buffers = realloc(*buffers, sizeof(lm_ggml_backend_buffer_t) * (*n_buffers + 1)); + (*buffers)[(*n_buffers)++] = buffer; + + return true; +} + +lm_ggml_backend_buffer_t lm_ggml_backend_alloc_ctx_tensors_from_buft(struct 
lm_ggml_context * ctx, lm_ggml_backend_buffer_type_t buft) { + LM_GGML_ASSERT(lm_ggml_get_no_alloc(ctx) == true); + + size_t alignment = lm_ggml_backend_buft_get_alignment(buft); + size_t max_size = lm_ggml_backend_buft_get_max_size(buft); + + lm_ggml_backend_buffer_t * buffers = NULL; + size_t n_buffers = 0; + + size_t cur_buf_size = 0; + struct lm_ggml_tensor * first = lm_ggml_get_first_tensor(ctx); + for (struct lm_ggml_tensor * t = first; t != NULL; t = lm_ggml_get_next_tensor(ctx, t)) { + size_t this_size = 0; + if (t->data == NULL && t->view_src == NULL) { + this_size = LM_GGML_PAD(lm_ggml_backend_buft_get_alloc_size(buft, t), alignment); + } + + if (this_size > max_size) { + fprintf(stderr, "%s: tensor %s is too large to fit in a %s buffer (tensor size: %zu, max buffer size: %zu)\n", + __func__, t->name, + lm_ggml_backend_buft_name(buft), + this_size, max_size); + for (size_t i = 0; i < n_buffers; i++) { + lm_ggml_backend_buffer_free(buffers[i]); + } + free(buffers); + return NULL; + } + + if ((cur_buf_size + this_size) > max_size) { + // allocate tensors in the current buffer + if (!alloc_tensor_range(ctx, first, t, buft, cur_buf_size, &buffers, &n_buffers)) { + return NULL; + } + first = t; + cur_buf_size = this_size; + } else { + cur_buf_size += this_size; + } + } + + // allocate remaining tensors + if (cur_buf_size > 0) { + if (!alloc_tensor_range(ctx, first, NULL, buft, cur_buf_size, &buffers, &n_buffers)) { + return NULL; + } + } + + if (n_buffers == 0) { +#ifndef NDEBUG + fprintf(stderr, "%s: all tensors in the context are already allocated\n", __func__); +#endif + return NULL; + } + + lm_ggml_backend_buffer_t buffer; + if (n_buffers == 1) { + buffer = buffers[0]; + } else { + buffer = lm_ggml_backend_multi_buffer_alloc_buffer(buffers, n_buffers); + } + free(buffers); return buffer; } diff --git a/cpp/ggml-alloc.h b/cpp/ggml-alloc.h index 0d5416b9..b709174f 100644 --- a/cpp/ggml-alloc.h +++ b/cpp/ggml-alloc.h @@ -6,88 +6,62 @@ extern "C" { #endif -struct lm_ggml_backend; -struct lm_ggml_backend_buffer; -struct lm_ggml_backend_buffer_type; +typedef struct lm_ggml_backend_buffer_type * lm_ggml_backend_buffer_type_t; +typedef struct lm_ggml_backend_buffer * lm_ggml_backend_buffer_t; +typedef struct lm_ggml_backend * lm_ggml_backend_t; -// -// Legacy API -// - -typedef struct lm_ggml_allocr * lm_ggml_allocr_t; - -// initialize allocator for use with CPU backend only -LM_GGML_API lm_ggml_allocr_t lm_ggml_allocr_new(void * data, size_t size, size_t alignment); -LM_GGML_API lm_ggml_allocr_t lm_ggml_allocr_new_measure(size_t alignment); - -// initialize allocator for use with ggml-backend -LM_GGML_API lm_ggml_allocr_t lm_ggml_allocr_new_from_buffer(struct lm_ggml_backend_buffer * buffer); -LM_GGML_API lm_ggml_allocr_t lm_ggml_allocr_new_from_backend(struct lm_ggml_backend * backend, size_t size); // allocates an owned buffer -LM_GGML_API lm_ggml_allocr_t lm_ggml_allocr_new_measure_from_backend(struct lm_ggml_backend * backend); - -LM_GGML_API struct lm_ggml_backend_buffer * lm_ggml_allocr_get_buffer(lm_ggml_allocr_t alloc); - -// tell the allocator to parse nodes following the order described in the list -// you should call this if your graph are optimized to execute out-of-order -LM_GGML_API void lm_ggml_allocr_set_parse_seq(lm_ggml_allocr_t alloc, const int * list, int n); - -LM_GGML_API void lm_ggml_allocr_free (lm_ggml_allocr_t alloc); -LM_GGML_API bool lm_ggml_allocr_is_measure (lm_ggml_allocr_t alloc); -LM_GGML_API void lm_ggml_allocr_reset (lm_ggml_allocr_t alloc); 
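// Illustrative sketch (editorial example, not introduced by this patch): one way the
// context-tensor helper implemented above can be driven. It assumes the ggml.h,
// ggml-alloc.h and ggml-backend.h headers from this tree are included; the function
// name and tensor shapes are placeholders. The context is created with no_alloc = true
// so only tensor metadata lives in the lm_ggml context; the helper then packs the
// tensor data into one backend buffer, or into a multi-buffer when the buffer type
// reports a finite max size and a split is required.
static void example_alloc_ctx_tensors(void) {
    struct lm_ggml_init_params params = {
        /*.mem_size   =*/ 2 * lm_ggml_tensor_overhead(),
        /*.mem_buffer =*/ NULL,
        /*.no_alloc   =*/ true, // data is allocated by the backend buffer below
    };
    struct lm_ggml_context * ctx = lm_ggml_init(params);

    struct lm_ggml_tensor * a = lm_ggml_new_tensor_1d(ctx, LM_GGML_TYPE_F32, 1024);
    struct lm_ggml_tensor * b = lm_ggml_new_tensor_1d(ctx, LM_GGML_TYPE_F32, 1024);

    // packs a and b into a single CPU buffer (or a multi-buffer if a split was needed)
    lm_ggml_backend_buffer_t buf =
        lm_ggml_backend_alloc_ctx_tensors_from_buft(ctx, lm_ggml_backend_cpu_buffer_type());

    // ... fill / read a and b with lm_ggml_backend_tensor_set / lm_ggml_backend_tensor_get ...
    (void) a; (void) b;

    lm_ggml_backend_buffer_free(buf);
    lm_ggml_free(ctx);
}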
-LM_GGML_API void lm_ggml_allocr_alloc (lm_ggml_allocr_t alloc, struct lm_ggml_tensor * tensor); -LM_GGML_API size_t lm_ggml_allocr_max_size (lm_ggml_allocr_t alloc); - -LM_GGML_API size_t lm_ggml_allocr_alloc_graph(lm_ggml_allocr_t alloc, struct lm_ggml_cgraph * graph); +// Tensor allocator +typedef struct lm_ggml_tallocr * lm_ggml_tallocr_t; -// -// ggml-backend v2 API -// +LM_GGML_API lm_ggml_tallocr_t lm_ggml_tallocr_new(lm_ggml_backend_buffer_t buffer); +LM_GGML_API void lm_ggml_tallocr_free(lm_ggml_tallocr_t talloc); +LM_GGML_API void lm_ggml_tallocr_alloc(lm_ggml_tallocr_t talloc, struct lm_ggml_tensor * tensor); -// Separate tensor and graph allocator objects -// This is necessary for multi-backend allocation because the graph allocator needs to use multiple tensor allocators -// The original API is kept as a wrapper around the new API +// Graph allocator +/* + Example usage: + lm_ggml_gallocr_t galloc = lm_ggml_gallocr_new(lm_ggml_bacckend_cpu_buffer_type()); -// Tensor allocator -typedef struct lm_ggml_tallocr * lm_ggml_tallocr_t; + // optional: create a worst-case graph and reserve the buffers to avoid reallocations + lm_ggml_gallocr_reserve(galloc, build_graph(max_batch)); -LM_GGML_API lm_ggml_tallocr_t lm_ggml_tallocr_new(void * data, size_t size, size_t alignment); -LM_GGML_API lm_ggml_tallocr_t lm_ggml_tallocr_new_measure(size_t alignment); -LM_GGML_API lm_ggml_tallocr_t lm_ggml_tallocr_new_from_buft(struct lm_ggml_backend_buffer_type * buft, size_t size); -LM_GGML_API lm_ggml_tallocr_t lm_ggml_tallocr_new_from_backend(struct lm_ggml_backend * backend, size_t size); // allocates an owned buffer -LM_GGML_API lm_ggml_tallocr_t lm_ggml_tallocr_new_from_buffer(struct lm_ggml_backend_buffer * buffer); -LM_GGML_API lm_ggml_tallocr_t lm_ggml_tallocr_new_measure_from_buft(struct lm_ggml_backend_buffer_type * buft); -LM_GGML_API lm_ggml_tallocr_t lm_ggml_tallocr_new_measure_from_backend(struct lm_ggml_backend * backend); + // allocate the graph + struct lm_ggml_cgraph * graph = build_graph(batch); + lm_ggml_gallocr_alloc_graph(galloc, graph); -LM_GGML_API struct lm_ggml_backend_buffer * lm_ggml_tallocr_get_buffer(lm_ggml_tallocr_t talloc); + printf("compute buffer size: %zu bytes\n", lm_ggml_gallocr_get_buffer_size(galloc, 0)); -LM_GGML_API void lm_ggml_tallocr_free (lm_ggml_tallocr_t talloc); -LM_GGML_API bool lm_ggml_tallocr_is_measure (lm_ggml_tallocr_t talloc); -LM_GGML_API void lm_ggml_tallocr_reset (lm_ggml_tallocr_t talloc); -LM_GGML_API void lm_ggml_tallocr_alloc (lm_ggml_tallocr_t talloc, struct lm_ggml_tensor * tensor); -LM_GGML_API size_t lm_ggml_tallocr_max_size (lm_ggml_tallocr_t talloc); + // evaluate the graph + lm_ggml_backend_graph_compute(backend, graph); +*/ +// special tensor flags for use with the graph allocator: +// lm_ggml_set_input(): all input tensors are allocated at the beginning of the graph in non-overlapping addresses +// lm_ggml_set_output(): output tensors are never freed and never overwritten -// Graph allocator typedef struct lm_ggml_gallocr * lm_ggml_gallocr_t; -LM_GGML_API lm_ggml_gallocr_t lm_ggml_gallocr_new(void); -LM_GGML_API void lm_ggml_gallocr_free(lm_ggml_gallocr_t galloc); +LM_GGML_API lm_ggml_gallocr_t lm_ggml_gallocr_new(lm_ggml_backend_buffer_type_t buft); +LM_GGML_API lm_ggml_gallocr_t lm_ggml_gallocr_new_n(lm_ggml_backend_buffer_type_t * bufts, int n_bufs); +LM_GGML_API void lm_ggml_gallocr_free(lm_ggml_gallocr_t galloc); -LM_GGML_API void lm_ggml_gallocr_set_parse_seq(lm_ggml_gallocr_t galloc, const int * list, int n); -LM_GGML_API 
size_t lm_ggml_gallocr_alloc_graph(lm_ggml_gallocr_t galloc, lm_ggml_tallocr_t talloc, struct lm_ggml_cgraph * graph); +// pre-allocate buffers from a measure graph - does not allocate or modify the graph +// call with a worst-case graph to avoid buffer reallocations +// not strictly required for single buffer usage: lm_ggml_gallocr_alloc_graph will reallocate the buffers automatically if needed +// returns false if the buffer allocation failed +LM_GGML_API bool lm_ggml_gallocr_reserve(lm_ggml_gallocr_t galloc, struct lm_ggml_cgraph * graph); +LM_GGML_API bool lm_ggml_gallocr_reserve_n(lm_ggml_gallocr_t galloc, struct lm_ggml_cgraph * graph, const int * node_buffer_ids); -// Allocate tensors from the allocators given by the hash table -LM_GGML_API void lm_ggml_gallocr_alloc_graph_n( - lm_ggml_gallocr_t galloc, - struct lm_ggml_cgraph * graph, - struct lm_ggml_hash_set hash_set, - lm_ggml_tallocr_t * hash_node_talloc); +// automatic reallocation if the topology changes when using a single buffer +// returns false if using multiple buffers and a re-allocation is needed (call lm_ggml_gallocr_reserve_n first to set the node buffers) +LM_GGML_API bool lm_ggml_gallocr_alloc_graph(lm_ggml_gallocr_t galloc, struct lm_ggml_cgraph * graph); +LM_GGML_API size_t lm_ggml_gallocr_get_buffer_size(lm_ggml_gallocr_t galloc, int buffer_id); // Utils // Create a buffer and allocate all the tensors in a lm_ggml_context -LM_GGML_API struct lm_ggml_backend_buffer * lm_ggml_backend_alloc_ctx_tensors_from_buft(struct lm_ggml_context * ctx, struct lm_ggml_backend_buffer_type * buft); -LM_GGML_API struct lm_ggml_backend_buffer * lm_ggml_backend_alloc_ctx_tensors(struct lm_ggml_context * ctx, struct lm_ggml_backend * backend); +LM_GGML_API struct lm_ggml_backend_buffer * lm_ggml_backend_alloc_ctx_tensors_from_buft(struct lm_ggml_context * ctx, lm_ggml_backend_buffer_type_t buft); +LM_GGML_API struct lm_ggml_backend_buffer * lm_ggml_backend_alloc_ctx_tensors(struct lm_ggml_context * ctx, lm_ggml_backend_t backend); #ifdef __cplusplus } diff --git a/cpp/ggml-backend-impl.h b/cpp/ggml-backend-impl.h index 01c3c0cb..9a007a16 100644 --- a/cpp/ggml-backend-impl.h +++ b/cpp/ggml-backend-impl.h @@ -19,6 +19,7 @@ extern "C" { const char * (*LM_GGML_CALL get_name) (lm_ggml_backend_buffer_type_t buft); lm_ggml_backend_buffer_t (*LM_GGML_CALL alloc_buffer) (lm_ggml_backend_buffer_type_t buft, size_t size); size_t (*LM_GGML_CALL get_alignment) (lm_ggml_backend_buffer_type_t buft); // tensor alignment + size_t (*LM_GGML_CALL get_max_size) (lm_ggml_backend_buffer_type_t buft); // allocation max size size_t (*LM_GGML_CALL get_alloc_size) (lm_ggml_backend_buffer_type_t buft, const struct lm_ggml_tensor * tensor); // data size needed to allocate the tensor, including padding bool (*LM_GGML_CALL supports_backend)(lm_ggml_backend_buffer_type_t buft, lm_ggml_backend_t backend); // check if the buffer type is usable by the backend // check if tensor data is in host memory @@ -63,6 +64,11 @@ extern "C" { // do not use directly, use lm_ggml_backend_tensor_copy instead bool lm_ggml_backend_buffer_copy_tensor(const struct lm_ggml_tensor * src, struct lm_ggml_tensor * dst); + // buffer that contains a collection of buffers + LM_GGML_CALL lm_ggml_backend_buffer_t lm_ggml_backend_multi_buffer_alloc_buffer(lm_ggml_backend_buffer_t * buffers, size_t n_buffers); + LM_GGML_CALL bool lm_ggml_backend_buffer_is_multi_buffer(lm_ggml_backend_buffer_t buffer); + LM_GGML_CALL void lm_ggml_backend_multi_buffer_set_usage(lm_ggml_backend_buffer_t buffer, 
enum lm_ggml_backend_buffer_usage usage); + // // Backend // diff --git a/cpp/ggml-backend.c b/cpp/ggml-backend.c index e3257a36..86f3b268 100644 --- a/cpp/ggml-backend.c +++ b/cpp/ggml-backend.c @@ -27,10 +27,20 @@ size_t lm_ggml_backend_buft_get_alignment(lm_ggml_backend_buffer_type_t buft) { return buft->iface.get_alignment(buft); } +size_t lm_ggml_backend_buft_get_max_size(lm_ggml_backend_buffer_type_t buft) { + // get_max_size is optional, defaults to SIZE_MAX + if (buft->iface.get_max_size) { + return buft->iface.get_max_size(buft); + } + return SIZE_MAX; +} + LM_GGML_CALL size_t lm_ggml_backend_buft_get_alloc_size(lm_ggml_backend_buffer_type_t buft, struct lm_ggml_tensor * tensor) { // get_alloc_size is optional, defaults to lm_ggml_nbytes if (buft->iface.get_alloc_size) { - return buft->iface.get_alloc_size(buft, tensor); + size_t size = buft->iface.get_alloc_size(buft, tensor); + assert(size >= lm_ggml_nbytes(tensor)); + return size; } return lm_ggml_nbytes(tensor); } @@ -55,8 +65,6 @@ LM_GGML_CALL lm_ggml_backend_buffer_t lm_ggml_backend_buffer_init( size_t size) { lm_ggml_backend_buffer_t buffer = malloc(sizeof(struct lm_ggml_backend_buffer)); - LM_GGML_ASSERT(iface.get_base != NULL); - (*buffer) = (struct lm_ggml_backend_buffer) { /* .interface = */ iface, /* .buft = */ buft, @@ -106,6 +114,10 @@ size_t lm_ggml_backend_buffer_get_alignment (lm_ggml_backend_buffer_t buffer) { return lm_ggml_backend_buft_get_alignment(lm_ggml_backend_buffer_get_type(buffer)); } +size_t lm_ggml_backend_buffer_get_max_size(lm_ggml_backend_buffer_t buffer) { + return lm_ggml_backend_buft_get_max_size(lm_ggml_backend_buffer_get_type(buffer)); +} + size_t lm_ggml_backend_buffer_get_alloc_size(lm_ggml_backend_buffer_t buffer, struct lm_ggml_tensor * tensor) { return lm_ggml_backend_buft_get_alloc_size(lm_ggml_backend_buffer_get_type(buffer), tensor); } @@ -120,6 +132,11 @@ bool lm_ggml_backend_buffer_is_host(lm_ggml_backend_buffer_t buffer) { void lm_ggml_backend_buffer_set_usage(lm_ggml_backend_buffer_t buffer, enum lm_ggml_backend_buffer_usage usage) { buffer->usage = usage; + + // FIXME: add a generic callback to the buffer interface + if (lm_ggml_backend_buffer_is_multi_buffer(buffer)) { + lm_ggml_backend_multi_buffer_set_usage(buffer, usage); + } } lm_ggml_backend_buffer_type_t lm_ggml_backend_buffer_get_type(lm_ggml_backend_buffer_t buffer) { @@ -169,6 +186,10 @@ size_t lm_ggml_backend_get_alignment(lm_ggml_backend_t backend) { return lm_ggml_backend_buft_get_alignment(lm_ggml_backend_get_default_buffer_type(backend)); } +size_t lm_ggml_backend_get_max_size(lm_ggml_backend_t backend) { + return lm_ggml_backend_buft_get_max_size(lm_ggml_backend_get_default_buffer_type(backend)); +} + void lm_ggml_backend_tensor_set_async(lm_ggml_backend_t backend, struct lm_ggml_tensor * tensor, const void * data, size_t offset, size_t size) { LM_GGML_ASSERT(tensor->data != NULL && "tensor not allocated"); LM_GGML_ASSERT(offset + size <= lm_ggml_nbytes(tensor) && "tensor write out of bounds"); @@ -198,6 +219,10 @@ LM_GGML_CALL void lm_ggml_backend_tensor_set(struct lm_ggml_tensor * tensor, con LM_GGML_ASSERT(buf != NULL && "tensor buffer not set"); LM_GGML_ASSERT(offset + size <= lm_ggml_nbytes(tensor) && "tensor write out of bounds"); + if (!size) { + return; + } + tensor->buffer->iface.set_tensor(buf, tensor, data, offset, size); } @@ -208,6 +233,10 @@ LM_GGML_CALL void lm_ggml_backend_tensor_get(const struct lm_ggml_tensor * tenso LM_GGML_ASSERT(tensor->buffer != NULL && "tensor buffer not set"); 
LM_GGML_ASSERT(offset + size <= lm_ggml_nbytes(tensor) && "tensor read out of bounds"); + if (!size) { + return; + } + tensor->buffer->iface.get_tensor(buf, tensor, data, offset, size); } @@ -337,11 +366,26 @@ LM_GGML_CALL static void lm_ggml_backend_registry_init(void) { lm_ggml_backend_cuda_reg_devices(); #endif +#ifdef LM_GGML_USE_SYCL + extern void lm_ggml_backend_sycl_reg_devices(void); + lm_ggml_backend_sycl_reg_devices(); +#endif + #ifdef LM_GGML_USE_METAL extern LM_GGML_CALL lm_ggml_backend_t lm_ggml_backend_reg_metal_init(const char * params, void * user_data); extern LM_GGML_CALL lm_ggml_backend_buffer_type_t lm_ggml_backend_metal_buffer_type(void); lm_ggml_backend_register("Metal", lm_ggml_backend_reg_metal_init, lm_ggml_backend_metal_buffer_type(), NULL); #endif + +#ifdef LM_GGML_USE_VULKAN + extern LM_GGML_CALL int lm_ggml_backend_vk_reg_devices(void); + lm_ggml_backend_vk_reg_devices(); +#endif + +#ifdef LM_GGML_USE_KOMPUTE + extern LM_GGML_CALL void lm_ggml_backend_kompute_reg_devices(void); + lm_ggml_backend_kompute_reg_devices(); +#endif } LM_GGML_CALL void lm_ggml_backend_register(const char * name, lm_ggml_backend_init_fn init_fn, lm_ggml_backend_buffer_type_t default_buffer_type, void * user_data) { @@ -439,6 +483,8 @@ lm_ggml_backend_buffer_t lm_ggml_backend_reg_alloc_buffer(size_t i, size_t size) // backend CPU +static const size_t TENSOR_ALIGNMENT = 32; // required for mmap as gguf only guarantees 32-byte alignment + LM_GGML_CALL static const char * lm_ggml_backend_cpu_buffer_name(lm_ggml_backend_buffer_t buffer) { return "CPU"; @@ -446,7 +492,14 @@ LM_GGML_CALL static const char * lm_ggml_backend_cpu_buffer_name(lm_ggml_backend } LM_GGML_CALL static void * lm_ggml_backend_cpu_buffer_get_base(lm_ggml_backend_buffer_t buffer) { - return (void *)buffer->context; + uintptr_t data = (uintptr_t)buffer->context; + + // align the buffer + if (data % TENSOR_ALIGNMENT != 0) { + data = LM_GGML_PAD(data, TENSOR_ALIGNMENT); + } + + return (void *)data; } LM_GGML_CALL static void lm_ggml_backend_cpu_buffer_free_buffer(lm_ggml_backend_buffer_t buffer) { @@ -504,8 +557,6 @@ static struct lm_ggml_backend_buffer_i cpu_backend_buffer_i_from_ptr = { /* .reset = */ NULL, }; -static const size_t TENSOR_ALIGNMENT = 64; // should be enough for AVX 512 - LM_GGML_CALL static const char * lm_ggml_backend_cpu_buffer_type_get_name(lm_ggml_backend_buffer_type_t buft) { return "CPU"; @@ -514,9 +565,11 @@ LM_GGML_CALL static const char * lm_ggml_backend_cpu_buffer_type_get_name(lm_ggm LM_GGML_CALL static lm_ggml_backend_buffer_t lm_ggml_backend_cpu_buffer_type_alloc_buffer(lm_ggml_backend_buffer_type_t buft, size_t size) { size += TENSOR_ALIGNMENT; // malloc may return an address that is not aligned - void * data = malloc(size); // TODO: maybe use LM_GGML_ALIGNED_MALLOC? 
- - LM_GGML_ASSERT(data != NULL && "failed to allocate buffer"); + void * data = malloc(size); // TODO: use LM_GGML_ALIGNED_MALLOC (move to ggml-impl.h) + if (data == NULL) { + fprintf(stderr, "%s: failed to allocate buffer of size %zu\n", __func__, size); + return NULL; + } return lm_ggml_backend_buffer_init(buft, cpu_backend_buffer_i, data, size); } @@ -545,6 +598,7 @@ LM_GGML_CALL lm_ggml_backend_buffer_type_t lm_ggml_backend_cpu_buffer_type(void) /* .get_name = */ lm_ggml_backend_cpu_buffer_type_get_name, /* .alloc_buffer = */ lm_ggml_backend_cpu_buffer_type_alloc_buffer, /* .get_alignment = */ lm_ggml_backend_cpu_buffer_type_get_alignment, + /* .get_max_size = */ NULL, // defaults to SIZE_MAX /* .get_alloc_size = */ NULL, // defaults to lm_ggml_nbytes /* .supports_backend = */ lm_ggml_backend_cpu_buffer_type_supports_backend, /* .is_host = */ lm_ggml_backend_cpu_buffer_type_is_host, @@ -600,6 +654,7 @@ lm_ggml_backend_buffer_type_t lm_ggml_backend_cpu_hbm_buffer_type(void) { /* .get_name = */ lm_ggml_backend_cpu_hbm_buffer_type_get_name, /* .alloc_buffer = */ lm_ggml_backend_cpu_hbm_buffer_type_alloc_buffer, /* .get_alignment = */ lm_ggml_backend_cpu_buffer_type_get_alignment, + /* .get_max_size = */ NULL, // defaults to SIZE_MAX /* .get_alloc_size = */ NULL, // defaults to lm_ggml_nbytes /* .supports_backend = */ lm_ggml_backend_cpu_buffer_type_supports_backend, /* .is_host = */ lm_ggml_backend_cpu_buffer_type_is_host, @@ -615,6 +670,9 @@ struct lm_ggml_backend_cpu_context { int n_threads; void * work_data; size_t work_size; + + lm_ggml_abort_callback abort_callback; + void * abort_callback_data; }; LM_GGML_CALL static const char * lm_ggml_backend_cpu_name(lm_ggml_backend_t backend) { @@ -653,6 +711,9 @@ LM_GGML_CALL static lm_ggml_backend_graph_plan_t lm_ggml_backend_cpu_graph_plan_ cpu_plan->cplan.work_data = malloc(cpu_plan->cplan.work_size); } + cpu_plan->cplan.abort_callback = cpu_ctx->abort_callback; + cpu_plan->cplan.abort_callback_data = cpu_ctx->abort_callback_data; + return cpu_plan; } @@ -683,9 +744,11 @@ LM_GGML_CALL static bool lm_ggml_backend_cpu_graph_compute(lm_ggml_backend_t bac cpu_ctx->work_data = realloc(cpu_ctx->work_data, cplan.work_size); cpu_ctx->work_size = cplan.work_size; } - cplan.work_data = cpu_ctx->work_data; + cplan.abort_callback = cpu_ctx->abort_callback; + cplan.abort_callback_data = cpu_ctx->abort_callback_data; + lm_ggml_graph_compute(cgraph, &cplan); return true; } @@ -693,7 +756,7 @@ LM_GGML_CALL static bool lm_ggml_backend_cpu_graph_compute(lm_ggml_backend_t bac LM_GGML_CALL static bool lm_ggml_backend_cpu_supports_op(lm_ggml_backend_t backend, const struct lm_ggml_tensor * op) { switch (op->op) { case LM_GGML_OP_CPY: - return op->type != LM_GGML_TYPE_IQ2_XXS && op->type != LM_GGML_TYPE_IQ2_XS; // missing type_traits.from_float + return op->type != LM_GGML_TYPE_IQ2_XXS && op->type != LM_GGML_TYPE_IQ2_XS && op->type != LM_GGML_TYPE_IQ1_S; // missing type_traits.from_float case LM_GGML_OP_MUL_MAT: return op->src[1]->type == LM_GGML_TYPE_F32 || op->src[1]->type == lm_ggml_internal_get_type_traits(op->src[0]->type).vec_dot_type; default: @@ -720,12 +783,21 @@ static struct lm_ggml_backend_i cpu_backend_i = { lm_ggml_backend_t lm_ggml_backend_cpu_init(void) { struct lm_ggml_backend_cpu_context * ctx = malloc(sizeof(struct lm_ggml_backend_cpu_context)); + if (ctx == NULL) { + return NULL; + } - ctx->n_threads = LM_GGML_DEFAULT_N_THREADS; - ctx->work_data = NULL; - ctx->work_size = 0; + ctx->n_threads = LM_GGML_DEFAULT_N_THREADS; + ctx->work_data = 
NULL; + ctx->work_size = 0; + ctx->abort_callback = NULL; + ctx->abort_callback_data = NULL; lm_ggml_backend_t cpu_backend = malloc(sizeof(struct lm_ggml_backend)); + if (cpu_backend == NULL) { + free(ctx); + return NULL; + } *cpu_backend = (struct lm_ggml_backend) { /* .interface = */ cpu_backend_i, @@ -745,7 +817,16 @@ void lm_ggml_backend_cpu_set_n_threads(lm_ggml_backend_t backend_cpu, int n_thre ctx->n_threads = n_threads; } +void lm_ggml_backend_cpu_set_abort_callback(lm_ggml_backend_t backend_cpu, lm_ggml_abort_callback abort_callback, void * abort_callback_data) { + LM_GGML_ASSERT(lm_ggml_backend_is_cpu(backend_cpu)); + + struct lm_ggml_backend_cpu_context * ctx = (struct lm_ggml_backend_cpu_context *)backend_cpu->context; + ctx->abort_callback = abort_callback; + ctx->abort_callback_data = abort_callback_data; +} + LM_GGML_CALL lm_ggml_backend_buffer_t lm_ggml_backend_cpu_buffer_from_ptr(void * ptr, size_t size) { + LM_GGML_ASSERT((uintptr_t)ptr % TENSOR_ALIGNMENT == 0 && "buffer pointer must be aligned"); return lm_ggml_backend_buffer_init(lm_ggml_backend_cpu_buffer_type(), cpu_backend_buffer_i_from_ptr, ptr, size); } @@ -756,6 +837,94 @@ LM_GGML_CALL static lm_ggml_backend_t lm_ggml_backend_reg_cpu_init(const char * LM_GGML_UNUSED(user_data); } +// multi-buffer buffer + +struct lm_ggml_backend_multi_buffer_context { + lm_ggml_backend_buffer_t * buffers; + size_t n_buffers; +}; + +typedef struct lm_ggml_backend_multi_buffer_context * lm_ggml_backend_multi_buffer_context_t; + +LM_GGML_CALL static const char * lm_ggml_backend_multi_buffer_get_name(lm_ggml_backend_buffer_t buffer) { + lm_ggml_backend_multi_buffer_context_t ctx = (lm_ggml_backend_multi_buffer_context_t) buffer->context; + + return ctx->buffers[0]->iface.get_name(ctx->buffers[0]); +} + +LM_GGML_CALL static void lm_ggml_backend_multi_buffer_free_buffer(lm_ggml_backend_buffer_t buffer) { + lm_ggml_backend_multi_buffer_context_t ctx = (lm_ggml_backend_multi_buffer_context_t) buffer->context; + for (size_t i = 0; i < ctx->n_buffers; i++) { + lm_ggml_backend_buffer_free(ctx->buffers[i]); + } + + free(ctx->buffers); + free(ctx); +} + +LM_GGML_CALL static void lm_ggml_backend_multi_buffer_clear(lm_ggml_backend_buffer_t buffer, uint8_t value) { + lm_ggml_backend_multi_buffer_context_t ctx = (lm_ggml_backend_multi_buffer_context_t) buffer->context; + for (size_t i = 0; i < ctx->n_buffers; i++) { + lm_ggml_backend_buffer_clear(ctx->buffers[i], value); + } +} + +static struct lm_ggml_backend_buffer_i lm_ggml_backend_multi_buffer_context_interface(void) { + static struct lm_ggml_backend_buffer_i multi_backend_buffer_i = { + /* .get_name = */ lm_ggml_backend_multi_buffer_get_name, + /* .free_buffer = */ lm_ggml_backend_multi_buffer_free_buffer, + /* .get_base = */ NULL, + /* .init_tensor = */ NULL, + /* .set_tensor = */ NULL, + /* .get_tensor = */ NULL, + /* .cpy_tensor = */ NULL, + /* .clear = */ lm_ggml_backend_multi_buffer_clear, + /* .reset = */ NULL, + }; + + return multi_backend_buffer_i; +} + +LM_GGML_CALL lm_ggml_backend_buffer_t lm_ggml_backend_multi_buffer_alloc_buffer(lm_ggml_backend_buffer_t * buffers, size_t n_buffers) { + lm_ggml_backend_multi_buffer_context_t ctx = (lm_ggml_backend_multi_buffer_context_t) malloc(sizeof(struct lm_ggml_backend_multi_buffer_context)); + ctx->n_buffers = n_buffers; + ctx->buffers = (lm_ggml_backend_buffer_t *) malloc(n_buffers * sizeof(lm_ggml_backend_buffer_t)); + + LM_GGML_ASSERT(ctx->buffers != NULL); + + size_t total_size = 0; + for (size_t i = 0; i < n_buffers; i++) { + 
ctx->buffers[i] = buffers[i]; + total_size += lm_ggml_backend_buffer_get_size(buffers[i]); + } + + return lm_ggml_backend_buffer_init(buffers[0]->buft, lm_ggml_backend_multi_buffer_context_interface(), ctx, total_size); +} + +LM_GGML_CALL bool lm_ggml_backend_buffer_is_multi_buffer(lm_ggml_backend_buffer_t buffer) { + return buffer->iface.get_name == lm_ggml_backend_multi_buffer_get_name; +} + +LM_GGML_CALL void lm_ggml_backend_multi_buffer_set_usage(lm_ggml_backend_buffer_t buffer, enum lm_ggml_backend_buffer_usage usage) { + LM_GGML_ASSERT(lm_ggml_backend_buffer_is_multi_buffer(buffer)); + lm_ggml_backend_multi_buffer_context_t ctx = (lm_ggml_backend_multi_buffer_context_t) buffer->context; + for (size_t i = 0; i < ctx->n_buffers; i++) { + lm_ggml_backend_buffer_set_usage(ctx->buffers[i], usage); + } +} + +// creates a copy of the tensor with the same memory layout +static struct lm_ggml_tensor * lm_ggml_dup_tensor_layout(struct lm_ggml_context * ctx, const struct lm_ggml_tensor * tensor) { + struct lm_ggml_tensor * dup = lm_ggml_dup_tensor(ctx, tensor); + for (int i = 0; i < LM_GGML_MAX_DIMS; i++) { + dup->nb[i] = tensor->nb[i]; + } + return dup; +} + +static bool lm_ggml_is_view_op(enum lm_ggml_op op) { + return op == LM_GGML_OP_VIEW || op == LM_GGML_OP_RESHAPE || op == LM_GGML_OP_PERMUTE || op == LM_GGML_OP_TRANSPOSE; +} // scheduler @@ -764,7 +933,7 @@ LM_GGML_CALL static lm_ggml_backend_t lm_ggml_backend_reg_cpu_init(const char * #define LM_GGML_MAX_SPLIT_INPUTS 16 struct lm_ggml_backend_sched_split { - lm_ggml_tallocr_t tallocr; + int backend_id; int i_start; int i_end; struct lm_ggml_tensor * inputs[LM_GGML_MAX_SPLIT_INPUTS]; @@ -779,15 +948,17 @@ struct lm_ggml_backend_sched { int n_backends; lm_ggml_backend_t backends[LM_GGML_MAX_BACKENDS]; lm_ggml_backend_buffer_type_t bufts[LM_GGML_MAX_BACKENDS]; - lm_ggml_tallocr_t tallocs[LM_GGML_MAX_BACKENDS]; lm_ggml_gallocr_t galloc; // hash keys of the nodes in the graph struct lm_ggml_hash_set hash_set; - // hash values (arrays of [hash_set.size]) - lm_ggml_tallocr_t * node_talloc; // tallocr assigned to each node (indirectly this is the backend) - struct lm_ggml_tensor * (* node_copies)[LM_GGML_MAX_BACKENDS]; // copies of each node for each destination backend + // hash values + int * tensor_backend_id; + struct lm_ggml_tensor * (* tensor_copies)[LM_GGML_MAX_BACKENDS]; + + int * node_backend_ids; // [n_nodes] + int n_nodes; // copy of the graph with modified inputs struct lm_ggml_cgraph * graph; @@ -797,75 +968,45 @@ struct lm_ggml_backend_sched { struct lm_ggml_context * ctx; + lm_ggml_backend_sched_eval_callback callback_eval; + void * callback_eval_user_data; + // align context_buffer to LM_GGML_MEM_ALIGN #ifdef _MSC_VER __declspec(align(LM_GGML_MEM_ALIGN)) #else __attribute__((aligned(LM_GGML_MEM_ALIGN))) #endif - char context_buffer[LM_GGML_MAX_SPLITS*LM_GGML_MAX_SPLIT_INPUTS*sizeof(struct lm_ggml_tensor) + sizeof(struct lm_ggml_cgraph)]; - - lm_ggml_backend_sched_eval_callback callback_eval; - void * callback_eval_user_data; + char context_buffer[LM_GGML_MAX_SPLITS*LM_GGML_MAX_SPLIT_INPUTS*2*sizeof(struct lm_ggml_tensor) + sizeof(struct lm_ggml_cgraph)]; }; #define hash_id(node) lm_ggml_hash_find_or_insert(sched->hash_set, node) -#define node_allocr(node) sched->node_talloc[hash_id(node)] - -static bool lm_ggml_is_view_op(enum lm_ggml_op op) { - return op == LM_GGML_OP_VIEW || op == LM_GGML_OP_RESHAPE || op == LM_GGML_OP_PERMUTE || op == LM_GGML_OP_TRANSPOSE; -} +#define tensor_backend_id(node) 
sched->tensor_backend_id[hash_id(node)] +#define tensor_backend(node) (tensor_backend_id(node) == -1 ? NULL : sched->backends[tensor_backend_id(node)]) -// returns the priority of the backend, lower is better -static int sched_backend_prio(lm_ggml_backend_sched_t sched, lm_ggml_backend_t backend) { +// returns the priority of the backend, lower id is higher priority +static int lm_ggml_backend_sched_backend_id(lm_ggml_backend_sched_t sched, lm_ggml_backend_t backend) { for (int i = 0; i < sched->n_backends; i++) { if (sched->backends[i] == backend) { return i; } } - return INT_MAX; + return -1; } -static int sched_allocr_prio(lm_ggml_backend_sched_t sched, lm_ggml_tallocr_t allocr) { - for (int i = 0; i < sched->n_backends; i++) { - if (sched->tallocs[i] == allocr) { - return i; - } - } - return INT_MAX; -} - -static lm_ggml_tallocr_t sched_allocr_from_buffer(lm_ggml_backend_sched_t sched, lm_ggml_backend_buffer_t buffer) { +static int lm_ggml_backend_sched_backend_from_buffer(lm_ggml_backend_sched_t sched, lm_ggml_backend_buffer_t buffer) { if (buffer == NULL) { - return NULL; - } - - // check if this is already allocate in a allocr buffer (from user manual allocations) - for (int i = 0; i < sched->n_backends; i++) { - if (lm_ggml_tallocr_get_buffer(sched->tallocs[i]) == buffer) { - return sched->tallocs[i]; - } + return -1; } // find highest prio backend that supports the buffer type for (int i = 0; i < sched->n_backends; i++) { if (lm_ggml_backend_buft_supports_backend(buffer->buft, sched->backends[i])) { - return sched->tallocs[i]; + return i; } } LM_GGML_ASSERT(false && "tensor buffer type not supported by any backend"); -} - -static lm_ggml_backend_t get_allocr_backend(lm_ggml_backend_sched_t sched, lm_ggml_tallocr_t allocr) { - if (allocr == NULL) { - return NULL; - } - for (int i = 0; i < sched->n_backends; i++) { - if (sched->tallocs[i] == allocr) { - return sched->backends[i]; - } - } - LM_GGML_UNREACHABLE(); + return -1; // silence warning } #if 0 @@ -878,37 +1019,39 @@ static char causes[LM_GGML_DEFAULT_GRAPH_SIZE*16 + LM_GGML_MAX_SPLITS*LM_GGML_MA #endif // returns the backend that should be used for the node based on the current locations -static lm_ggml_tallocr_t sched_allocr_from_cur(lm_ggml_backend_sched_t sched, struct lm_ggml_tensor * node) { +static int lm_ggml_backend_sched_backend_id_from_cur(lm_ggml_backend_sched_t sched, struct lm_ggml_tensor * tensor) { + // TODO: use supports_op to check if the backend supports the op + // assign pre-allocated nodes to their backend // dst - lm_ggml_tallocr_t cur_allocr = sched_allocr_from_buffer(sched, node->buffer); - if (cur_allocr != NULL) { + int cur_backend = lm_ggml_backend_sched_backend_from_buffer(sched, tensor->buffer); + if (cur_backend != -1) { SET_CAUSE(node, "1.dst"); - return cur_allocr; + return cur_backend; } // view_src - if (node->view_src != NULL) { - cur_allocr = sched_allocr_from_buffer(sched, node->view_src->buffer); - if (cur_allocr != NULL) { + if (tensor->view_src != NULL) { + cur_backend = lm_ggml_backend_sched_backend_from_buffer(sched, tensor->view_src->buffer); + if (cur_backend != -1) { SET_CAUSE(node, "1.vsrc"); - return cur_allocr; + return cur_backend; } } // assign nodes that use weights to the backend of the weights for (int i = 0; i < LM_GGML_MAX_SRC; i++) { - const struct lm_ggml_tensor * src = node->src[i]; + const struct lm_ggml_tensor * src = tensor->src[i]; if (src == NULL) { - break; + continue; } if (src->buffer != NULL && src->buffer->usage == LM_GGML_BACKEND_BUFFER_USAGE_WEIGHTS) { - 
lm_ggml_tallocr_t src_allocr = sched_allocr_from_buffer(sched, src->buffer); + int src_backend = lm_ggml_backend_sched_backend_from_buffer(sched, src->buffer); // operations with weights are always run on the same backend as the weights SET_CAUSE(node, "1.wgt%d", i); - return src_allocr; + return src_backend; } } - return NULL; + return -1; } static char * fmt_size(size_t size) { @@ -921,11 +1064,11 @@ static char * fmt_size(size_t size) { return buffer; } -static void sched_print_assignments(lm_ggml_backend_sched_t sched, struct lm_ggml_cgraph * graph) { +static void lm_ggml_backend_sched_print_assignments(lm_ggml_backend_sched_t sched, struct lm_ggml_cgraph * graph) { int cur_split = 0; for (int i = 0; i < graph->n_nodes; i++) { if (cur_split < sched->n_splits && i == sched->splits[cur_split].i_start) { - lm_ggml_backend_t split_backend = get_allocr_backend(sched, sched->splits[cur_split].tallocr); + lm_ggml_backend_t split_backend = sched->backends[sched->splits[cur_split].backend_id]; fprintf(stderr, "\n## SPLIT #%d: %s # %d inputs: ", cur_split, lm_ggml_backend_name(split_backend), sched->splits[cur_split].n_inputs); for (int j = 0; j < sched->splits[cur_split].n_inputs; j++) { @@ -939,17 +1082,15 @@ static void sched_print_assignments(lm_ggml_backend_sched_t sched, struct lm_ggm if (lm_ggml_is_view_op(node->op)) { continue; } - lm_ggml_tallocr_t node_allocr = node_allocr(node); - lm_ggml_backend_t node_backend = node_allocr ? get_allocr_backend(sched, node_allocr) : NULL; // FIXME: + lm_ggml_backend_t tensor_backend = tensor_backend(node); fprintf(stderr, "node #%3d (%10.10s): %20.20s (%5.5s) [%5.5s %8.8s]:", i, lm_ggml_op_name(node->op), node->name, - fmt_size(lm_ggml_nbytes(node)), node_allocr ? lm_ggml_backend_name(node_backend) : "NULL", GET_CAUSE(node)); + fmt_size(lm_ggml_nbytes(node)), tensor_backend ? lm_ggml_backend_name(tensor_backend) : "NULL", GET_CAUSE(node)); for (int j = 0; j < LM_GGML_MAX_SRC; j++) { struct lm_ggml_tensor * src = node->src[j]; if (src == NULL) { - break; + continue; } - lm_ggml_tallocr_t src_allocr = node_allocr(src); - lm_ggml_backend_t src_backend = src_allocr ? get_allocr_backend(sched, src_allocr) : NULL; + lm_ggml_backend_t src_backend = tensor_backend(src); fprintf(stderr, " %20.20s (%5.5s) [%5.5s %8.8s]", src->name, fmt_size(lm_ggml_nbytes(src)), src_backend ? 
lm_ggml_backend_name(src_backend) : "NULL", GET_CAUSE(src)); } @@ -957,23 +1098,13 @@ static void sched_print_assignments(lm_ggml_backend_sched_t sched, struct lm_ggm } } -// creates a copy of the tensor with the same memory layout -static struct lm_ggml_tensor * lm_ggml_dup_tensor_layout(struct lm_ggml_context * ctx, const struct lm_ggml_tensor * tensor) { - struct lm_ggml_tensor * dup = lm_ggml_dup_tensor(ctx, tensor); - for (int i = 0; i < LM_GGML_MAX_DIMS; i++) { - dup->nb[i] = tensor->nb[i]; - } - return dup; -} - - //#define DEBUG_PASS1 //#define DEBUG_PASS2 //#define DEBUG_PASS3 //#define DEBUG_PASS4 // assigns backends to ops and splits the graph into subgraphs that can be computed on the same backend -static void sched_split_graph(lm_ggml_backend_sched_t sched, struct lm_ggml_cgraph * graph) { +static void lm_ggml_backend_sched_split_graph(lm_ggml_backend_sched_t sched, struct lm_ggml_cgraph * graph) { // reset splits sched->n_splits = 0; sched->is_reset = false; @@ -995,28 +1126,28 @@ static void sched_split_graph(lm_ggml_backend_sched_t sched, struct lm_ggml_cgra // pass 1: assign backends to ops with pre-allocated inputs for (int i = 0; i < graph->n_leafs; i++) { struct lm_ggml_tensor * leaf = graph->leafs[i]; - if (node_allocr(leaf) != NULL) { + if (tensor_backend_id(leaf) != -1) { // do not overwrite user assignments continue; } - node_allocr(leaf) = sched_allocr_from_cur(sched, leaf); + tensor_backend_id(leaf) = lm_ggml_backend_sched_backend_id_from_cur(sched, leaf); } for (int i = 0; i < graph->n_nodes; i++) { struct lm_ggml_tensor * node = graph->nodes[i]; - if (node_allocr(node) != NULL) { + if (tensor_backend_id(node) != -1) { // do not overwrite user assignments continue; } - node_allocr(node) = sched_allocr_from_cur(sched, node); + tensor_backend_id(node) = lm_ggml_backend_sched_backend_id_from_cur(sched, node); // src for (int j = 0; j < LM_GGML_MAX_SRC; j++) { struct lm_ggml_tensor * src = node->src[j]; if (src == NULL) { - break; + continue; } - if (node_allocr(src) == NULL) { - node_allocr(src) = sched_allocr_from_cur(sched, src); + if (tensor_backend_id(src) == -1) { + tensor_backend_id(src) = lm_ggml_backend_sched_backend_id_from_cur(sched, src); } } } @@ -1031,22 +1162,22 @@ static void sched_split_graph(lm_ggml_backend_sched_t sched, struct lm_ggml_cgra // pass 2.1 expand gpu up { - lm_ggml_tallocr_t cur_allocr = NULL; + int cur_backend_id = -1; for (int i = graph->n_nodes - 1; i >= 0; i--) { struct lm_ggml_tensor * node = graph->nodes[i]; if (lm_ggml_is_view_op(node->op)) { continue; } - lm_ggml_tallocr_t node_allocr = node_allocr(node); - if (node_allocr != NULL) { - if (sched_allocr_prio(sched, node_allocr) == sched->n_backends - 1) { + int tensor_backend_id = tensor_backend_id(node); + if (tensor_backend_id != -1) { + if (tensor_backend_id == sched->n_backends - 1) { // skip cpu (lowest prio backend) - cur_allocr = NULL; + cur_backend_id = -1; } else { - cur_allocr = node_allocr; + cur_backend_id = tensor_backend_id; } } else { - node_allocr(node) = cur_allocr; + tensor_backend_id(node) = cur_backend_id; SET_CAUSE(node, "2.1"); } } @@ -1054,22 +1185,22 @@ static void sched_split_graph(lm_ggml_backend_sched_t sched, struct lm_ggml_cgra // pass 2.2 expand gpu down { - lm_ggml_tallocr_t cur_allocr = NULL; + int cur_backend_id = -1; for (int i = 0; i < graph->n_nodes; i++) { struct lm_ggml_tensor * node = graph->nodes[i]; if (lm_ggml_is_view_op(node->op)) { continue; } - lm_ggml_tallocr_t node_allocr = node_allocr(node); - if (node_allocr != NULL) { - if 
(sched_allocr_prio(sched, node_allocr) == sched->n_backends - 1) { + int tensor_backend_id = tensor_backend_id(node); + if (tensor_backend_id != -1) { + if (tensor_backend_id == sched->n_backends - 1) { // skip cpu (lowest prio backend) - cur_allocr = NULL; + cur_backend_id = -1; } else { - cur_allocr = node_allocr; + cur_backend_id = tensor_backend_id; } } else { - node_allocr(node) = cur_allocr; + tensor_backend_id(node) = cur_backend_id; SET_CAUSE(node, "2.2"); } } @@ -1077,17 +1208,17 @@ static void sched_split_graph(lm_ggml_backend_sched_t sched, struct lm_ggml_cgra // pass 2.3 expand rest up { - lm_ggml_tallocr_t cur_allocr = NULL; + int cur_backend_id = -1; for (int i = graph->n_nodes - 1; i >= 0; i--) { struct lm_ggml_tensor * node = graph->nodes[i]; if (lm_ggml_is_view_op(node->op)) { continue; } - lm_ggml_tallocr_t node_allocr = node_allocr(node); - if (node_allocr != NULL) { - cur_allocr = node_allocr; + int tensor_backend_id = tensor_backend_id(node); + if (tensor_backend_id != -1) { + cur_backend_id = tensor_backend_id; } else { - node_allocr(node) = cur_allocr; + tensor_backend_id(node) = cur_backend_id; SET_CAUSE(node, "2.3"); } } @@ -1095,17 +1226,17 @@ static void sched_split_graph(lm_ggml_backend_sched_t sched, struct lm_ggml_cgra // pass 2.4 expand rest down { - lm_ggml_tallocr_t cur_allocr = NULL; + int cur_backend_id = -1; for (int i = 0; i < graph->n_nodes; i++) { struct lm_ggml_tensor * node = graph->nodes[i]; if (lm_ggml_is_view_op(node->op)) { continue; } - lm_ggml_tallocr_t node_allocr = node_allocr(node); - if (node_allocr != NULL) { - cur_allocr = node_allocr; + int tensor_backend_id = tensor_backend_id(node); + if (tensor_backend_id != -1) { + cur_backend_id = tensor_backend_id; } else { - node_allocr(node) = cur_allocr; + tensor_backend_id(node) = cur_backend_id; SET_CAUSE(node, "2.4"); } } @@ -1117,24 +1248,24 @@ static void sched_split_graph(lm_ggml_backend_sched_t sched, struct lm_ggml_cgra // pass 3: assign backends to remaining src from dst and view_src for (int i = 0; i < graph->n_nodes; i++) { struct lm_ggml_tensor * node = graph->nodes[i]; - lm_ggml_tallocr_t cur_allocr = node_allocr(node); - if (node->view_src != NULL && cur_allocr == NULL) { - cur_allocr = node_allocr(node) = node_allocr(node->view_src); + int cur_backend_id = tensor_backend_id(node); + if (node->view_src != NULL && cur_backend_id == -1) { + cur_backend_id = tensor_backend_id(node) = tensor_backend_id(node->view_src); SET_CAUSE(node, "3.vsrc"); } for (int j = 0; j < LM_GGML_MAX_SRC; j++) { struct lm_ggml_tensor * src = node->src[j]; if (src == NULL) { - break; + continue; } - lm_ggml_tallocr_t src_allocr = node_allocr(src); - if (src_allocr == NULL) { + int src_backend_id = tensor_backend_id(src); + if (src_backend_id == -1) { if (src->view_src != NULL) { // views are always on the same backend as the source - node_allocr(src) = node_allocr(src->view_src); + tensor_backend_id(src) = tensor_backend_id(src->view_src); SET_CAUSE(src, "3.vsrc"); } else { - node_allocr(src) = cur_allocr; + tensor_backend_id(src) = cur_backend_id; SET_CAUSE(src, "3.cur"); } } @@ -1151,15 +1282,14 @@ static void sched_split_graph(lm_ggml_backend_sched_t sched, struct lm_ggml_cgra for (int i = 0; i < graph->n_nodes; i++) { struct lm_ggml_tensor * node = graph->nodes[i]; if (!lm_ggml_is_view_op(node->op)) { - sched->splits[0].tallocr = node_allocr(node); + sched->splits[0].backend_id = tensor_backend_id(node); break; } } sched->splits[0].i_start = 0; sched->splits[0].n_inputs = 0; 
memset(sched->splits[0].inputs, 0, sizeof(sched->splits[0].inputs)); //HACK - lm_ggml_tallocr_t cur_allocr = sched->splits[0].tallocr; - size_t cur_backend_id = sched_allocr_prio(sched, cur_allocr); + int cur_backend_id = sched->splits[0].backend_id; for (int i = 0; i < graph->n_nodes; i++) { struct lm_ggml_tensor * node = graph->nodes[i]; @@ -1167,58 +1297,45 @@ static void sched_split_graph(lm_ggml_backend_sched_t sched, struct lm_ggml_cgra continue; } - lm_ggml_tallocr_t node_allocr = node_allocr(node); + int tensor_backend_id = tensor_backend_id(node); - LM_GGML_ASSERT(node_allocr != NULL); // all nodes should be assigned by now + LM_GGML_ASSERT(tensor_backend_id != -1); // all nodes should be assigned by now - if (node_allocr != cur_allocr) { + if (tensor_backend_id != cur_backend_id) { sched->splits[cur_split].i_end = i; cur_split++; LM_GGML_ASSERT(cur_split < LM_GGML_MAX_SPLITS); - sched->splits[cur_split].tallocr = node_allocr; + sched->splits[cur_split].backend_id = tensor_backend_id; sched->splits[cur_split].i_start = i; sched->splits[cur_split].n_inputs = 0; - cur_allocr = node_allocr; - cur_backend_id = sched_allocr_prio(sched, cur_allocr); + cur_backend_id = tensor_backend_id; } // find inputs that are not on the same backend for (int j = 0; j < LM_GGML_MAX_SRC; j++) { struct lm_ggml_tensor * src = node->src[j]; if (src == NULL) { - break; + continue; } - lm_ggml_tallocr_t src_allocr = node_allocr(src); - LM_GGML_ASSERT(src_allocr != NULL); // all inputs should be assigned by now - if (src_allocr != node_allocr) { - // check if the input is already in the split - bool found = false; - for (int k = 0; k < sched->splits[cur_split].n_inputs; k++) { - if (sched->splits[cur_split].inputs[k] == src) { - found = true; - break; - } - } - - if (!found) { - int n_inputs = sched->splits[cur_split].n_inputs++; - //printf("split %d input %d: %s (%s)\n", cur_split, n_inputs, src->name, lm_ggml_backend_name(get_allocr_backend(sched, src_allocr))); - LM_GGML_ASSERT(n_inputs < LM_GGML_MAX_SPLIT_INPUTS); - sched->splits[cur_split].inputs[n_inputs] = src; - } - + int src_backend_id = tensor_backend_id(src); + assert(src_backend_id != -1); // all inputs should be assigned by now + if (src_backend_id != tensor_backend_id) { // create a copy of the input in the split's backend size_t id = hash_id(src); - if (sched->node_copies[id][cur_backend_id] == NULL) { - lm_ggml_backend_t backend = get_allocr_backend(sched, cur_allocr); + if (sched->tensor_copies[id][cur_backend_id] == NULL) { + lm_ggml_backend_t backend = sched->backends[cur_backend_id]; struct lm_ggml_tensor * tensor_copy = lm_ggml_dup_tensor_layout(sched->ctx, src); lm_ggml_format_name(tensor_copy, "%s#%s", lm_ggml_backend_name(backend), src->name); - sched->node_copies[id][cur_backend_id] = tensor_copy; - node_allocr(tensor_copy) = cur_allocr; + sched->tensor_copies[id][cur_backend_id] = tensor_copy; + tensor_backend_id(tensor_copy) = cur_backend_id; SET_CAUSE(tensor_copy, "4.cpy"); + + int n_inputs = sched->splits[cur_split].n_inputs++; + LM_GGML_ASSERT(n_inputs < LM_GGML_MAX_SPLIT_INPUTS); + sched->splits[cur_split].inputs[n_inputs] = src; } - node->src[j] = sched->node_copies[id][cur_backend_id]; + node->src[j] = sched->tensor_copies[id][cur_backend_id]; } } } @@ -1233,30 +1350,30 @@ static void sched_split_graph(lm_ggml_backend_sched_t sched, struct lm_ggml_cgra // sanity check: all sources should have the same backend as the node for (int i = 0; i < graph->n_nodes; i++) { struct lm_ggml_tensor * node = graph->nodes[i]; - 
lm_ggml_tallocr_t node_allocr = node_allocr(node); - if (node_allocr == NULL) { + lm_ggml_backend_t tensor_backend = tensor_backend(node); + if (tensor_backend == NULL) { fprintf(stderr, "!!!!!!! %s has no backend\n", node->name); } - if (node->view_src != NULL && node_allocr != node_allocr(node->view_src)) { + if (node->view_src != NULL && tensor_backend != tensor_backend(node->view_src)) { fprintf(stderr, "!!!!!!! %s has backend %s, view_src %s has backend %s\n", - node->name, node_allocr ? lm_ggml_backend_name(get_allocr_backend(sched, node_allocr)) : "NULL", - node->view_src->name, node_allocr(node->view_src) ? lm_ggml_backend_name(get_allocr_backend(sched, node_allocr(node->view_src))) : "NULL"); + node->name, tensor_backend ? lm_ggml_backend_name(tensor_backend) : "NULL", + node->view_src->name, tensor_backend(node->view_src) ? lm_ggml_backend_name(tensor_backend(node->view_src)) : "NULL"); } for (int j = 0; j < LM_GGML_MAX_SRC; j++) { struct lm_ggml_tensor * src = node->src[j]; if (src == NULL) { - break; + continue; } - lm_ggml_tallocr_t src_allocr = node_allocr(src); - if (src_allocr != node_allocr /* && src_backend != NULL */) { // ignore nulls for now + lm_ggml_backend_t src_backend = tensor_backend(src); + if (src_backend != tensor_backend /* && src_backend != NULL */) { fprintf(stderr, "!!!! %s has backend %s, src %d (%s) has backend %s\n", - node->name, node_allocr ? lm_ggml_backend_name(get_allocr_backend(sched, node_allocr)) : "NULL", - j, src->name, src_allocr ? lm_ggml_backend_name(get_allocr_backend(sched, src_allocr)) : "NULL"); + node->name, tensor_backend ? lm_ggml_backend_name(tensor_backend) : "NULL", + j, src->name, src_backend ? lm_ggml_backend_name(src_backend) : "NULL"); } - if (src->view_src != NULL && src_allocr != node_allocr(src->view_src)) { + if (src->view_src != NULL && src_backend != tensor_backend(src->view_src)) { fprintf(stderr, "!!!!!!! [src] %s has backend %s, view_src %s has backend %s\n", - src->name, src_allocr ? lm_ggml_backend_name(get_allocr_backend(sched, src_allocr)) : "NULL", - src->view_src->name, node_allocr(src->view_src) ? lm_ggml_backend_name(get_allocr_backend(sched, node_allocr(src->view_src))) : "NULL"); + src->name, src_backend ? lm_ggml_backend_name(src_backend) : "NULL", + src->view_src->name, tensor_backend(src->view_src) ? 
lm_ggml_backend_name(tensor_backend(src->view_src)) : "NULL"); } } } @@ -1270,32 +1387,45 @@ static void sched_split_graph(lm_ggml_backend_sched_t sched, struct lm_ggml_cgra struct lm_ggml_backend_sched_split * split = &sched->splits[i]; split->graph = lm_ggml_graph_view(graph, split->i_start, split->i_end); - // add inputs to the graph copy so that they are allocated by ggml-alloc at the start of the split for (int j = 0; j < split->n_inputs; j++) { struct lm_ggml_tensor * input = split->inputs[j]; - struct lm_ggml_tensor * input_cpy = sched->node_copies[hash_id(input)][sched_allocr_prio(sched, split->tallocr)]; + struct lm_ggml_tensor * input_cpy = sched->tensor_copies[hash_id(input)][split->backend_id]; + // add a dependency to the input source so that it is not freed before the copy is done - LM_GGML_ASSERT(input_cpy->src[0] == NULL || input_cpy->src[0] == input); - input_cpy->src[0] = input; + struct lm_ggml_tensor * input_dep = lm_ggml_view_tensor(sched->ctx, input); + sched->node_backend_ids[graph_copy->n_nodes] = tensor_backend_id(input); + graph_copy->nodes[graph_copy->n_nodes++] = input_dep; + + // add a dependency to the input copy so that it is allocated at the start of the split + sched->node_backend_ids[graph_copy->n_nodes] = split->backend_id; graph_copy->nodes[graph_copy->n_nodes++] = input_cpy; } for (int j = split->i_start; j < split->i_end; j++) { + sched->node_backend_ids[graph_copy->n_nodes] = tensor_backend_id(graph->nodes[j]); graph_copy->nodes[graph_copy->n_nodes++] = graph->nodes[j]; } } sched->graph = graph_copy; } -static void sched_alloc_splits(lm_ggml_backend_sched_t sched) { - lm_ggml_gallocr_alloc_graph_n( - sched->galloc, - sched->graph, - sched->hash_set, - sched->node_talloc); +static bool lm_ggml_backend_sched_alloc_splits(lm_ggml_backend_sched_t sched) { + // lm_ggml_gallocr_reserve_n(sched->galloc, sched->graph, sched->node_backend_ids); + if (!lm_ggml_gallocr_alloc_graph(sched->galloc, sched->graph)) { +#ifndef NDEBUG + fprintf(stderr, "lm_ggml_backend_sched: failed to allocate graph, reserving\n"); +#endif + lm_ggml_gallocr_reserve_n(sched->galloc, sched->graph, sched->node_backend_ids); + if (!lm_ggml_gallocr_alloc_graph(sched->galloc, sched->graph)) { + fprintf(stderr, "lm_ggml_backend_sched: failed to allocate graph\n"); + return false; + } + } + + return true; } -static void sched_compute_splits(lm_ggml_backend_sched_t sched) { +static bool lm_ggml_backend_sched_compute_splits(lm_ggml_backend_sched_t sched) { uint64_t copy_us[LM_GGML_MAX_BACKENDS] = {0}; uint64_t compute_us[LM_GGML_MAX_BACKENDS] = {0}; @@ -1303,20 +1433,18 @@ static void sched_compute_splits(lm_ggml_backend_sched_t sched) { for (int i = 0; i < sched->n_splits; i++) { struct lm_ggml_backend_sched_split * split = &splits[i]; - lm_ggml_backend_t split_backend = get_allocr_backend(sched, split->tallocr); - int split_backend_id = sched_backend_prio(sched, split_backend); + int split_backend_id = split->backend_id; + lm_ggml_backend_t split_backend = sched->backends[split_backend_id]; // copy the input tensors to the split backend uint64_t copy_start_us = lm_ggml_time_us(); for (int j = 0; j < split->n_inputs; j++) { struct lm_ggml_tensor * input = split->inputs[j]; - struct lm_ggml_tensor * input_cpy = sched->node_copies[hash_id(input)][split_backend_id]; + struct lm_ggml_tensor * input_cpy = sched->tensor_copies[hash_id(input)][split_backend_id]; LM_GGML_ASSERT(input->buffer != NULL); LM_GGML_ASSERT(input_cpy->buffer != NULL); - // TODO: avoid this copy if it was already copied in a 
previous split, and the input didn't change - // this is important to avoid copying constants such as KQ_mask and inp_pos multiple times lm_ggml_backend_tensor_copy_async(split_backend, input, input_cpy); } //lm_ggml_backend_synchronize(split_backend); // necessary to measure copy time @@ -1332,8 +1460,10 @@ static void sched_compute_splits(lm_ggml_backend_sched_t sched) { uint64_t compute_start_us = lm_ggml_time_us(); if (!sched->callback_eval) { - lm_ggml_backend_graph_compute(split_backend, &split->graph); - //lm_ggml_backend_synchronize(split_backend); // necessary to measure compute time + if (!lm_ggml_backend_graph_compute(split_backend, &split->graph)) { + return false; + } + //lm_ggml_backend_synchronize(split_backend); // necessary to measure compute time } else { // similar to lm_ggml_backend_compare_graph_backend for (int j0 = 0; j0 < split->graph.n_nodes; j0++) { @@ -1352,7 +1482,9 @@ static void sched_compute_splits(lm_ggml_backend_sched_t sched) { struct lm_ggml_cgraph gv = lm_ggml_graph_view(&split->graph, j0, j1 + 1); - lm_ggml_backend_graph_compute(split_backend, &gv); + if (!lm_ggml_backend_graph_compute(split_backend, &gv)) { + return false; + } if (need && !sched->callback_eval(t, false, sched->callback_eval_user_data)) { break; @@ -1374,19 +1506,8 @@ static void sched_compute_splits(lm_ggml_backend_sched_t sched) { } } #endif -} -static void sched_reset(lm_ggml_backend_sched_t sched) { - for (int i = 0; i < sched->n_backends; i++) { - lm_ggml_tallocr_reset(sched->tallocs[i]); - } - // reset state for the next run - size_t hash_size = sched->hash_set.size; - memset(sched->hash_set.keys, 0, sizeof(sched->hash_set.keys[0]) * hash_size); - memset(sched->node_talloc, 0, sizeof(sched->node_talloc[0]) * hash_size); - memset(sched->node_copies, 0, sizeof(sched->node_copies[0]) * hash_size); - - sched->is_reset = true; + return true; } lm_ggml_backend_sched_t lm_ggml_backend_sched_new(lm_ggml_backend_t * backends, lm_ggml_backend_buffer_type_t * bufts, int n_backends, size_t graph_size) { @@ -1396,9 +1517,10 @@ lm_ggml_backend_sched_t lm_ggml_backend_sched_new(lm_ggml_backend_t * backends, struct lm_ggml_backend_sched * sched = calloc(sizeof(struct lm_ggml_backend_sched), 1); // initialize hash table - sched->hash_set = lm_ggml_hash_set_new(graph_size + LM_GGML_MAX_SPLITS*LM_GGML_MAX_SPLIT_INPUTS); - sched->node_talloc = calloc(sizeof(sched->node_talloc[0]) * sched->hash_set.size, 1); - sched->node_copies = calloc(sizeof(sched->node_copies[0]) * sched->hash_set.size, 1); + sched->hash_set = lm_ggml_hash_set_new(graph_size + LM_GGML_MAX_SPLITS*LM_GGML_MAX_SPLIT_INPUTS); + sched->tensor_backend_id = calloc(sizeof(sched->tensor_backend_id[0]), sched->hash_set.size); + sched->tensor_copies = calloc(sizeof(sched->tensor_copies[0]), sched->hash_set.size); + sched->node_backend_ids = calloc(sizeof(sched->node_backend_ids[0]), graph_size); sched->n_backends = n_backends; for (int i = 0; i < n_backends; i++) { @@ -1406,14 +1528,9 @@ lm_ggml_backend_sched_t lm_ggml_backend_sched_new(lm_ggml_backend_t * backends, sched->bufts[i] = bufts ? 
bufts[i] : lm_ggml_backend_get_default_buffer_type(backends[i]); } - sched->galloc = lm_ggml_gallocr_new(); + sched->galloc = lm_ggml_gallocr_new_n(sched->bufts, n_backends); - // init measure allocs for each backend - for (int i = 0; i < n_backends; i++) { - sched->tallocs[i] = lm_ggml_tallocr_new_measure_from_buft(sched->bufts[i]); - } - - sched_reset(sched); + lm_ggml_backend_sched_reset(sched); return sched; } @@ -1422,49 +1539,54 @@ void lm_ggml_backend_sched_free(lm_ggml_backend_sched_t sched) { if (sched == NULL) { return; } - for (int i = 0; i < sched->n_backends; i++) { - lm_ggml_tallocr_free(sched->tallocs[i]); - } lm_ggml_gallocr_free(sched->galloc); lm_ggml_free(sched->ctx); free(sched->hash_set.keys); - free(sched->node_talloc); - free(sched->node_copies); + free(sched->tensor_backend_id); + free(sched->tensor_copies); + free(sched->node_backend_ids); free(sched); } -void lm_ggml_backend_sched_init_measure(lm_ggml_backend_sched_t sched, struct lm_ggml_cgraph * measure_graph) { - LM_GGML_ASSERT(lm_ggml_tallocr_is_measure(sched->tallocs[0])); // can only be initialized once +void lm_ggml_backend_sched_reset(lm_ggml_backend_sched_t sched) { + // reset state for the next run + size_t hash_size = sched->hash_set.size; + memset(sched->hash_set.keys, 0, sizeof(sched->hash_set.keys[0]) * hash_size); // NOLINT + memset(sched->tensor_backend_id, -1, sizeof(sched->tensor_backend_id[0]) * hash_size); + memset(sched->tensor_copies, 0, sizeof(sched->tensor_copies[0]) * hash_size); + + sched->is_reset = true; +} - sched_split_graph(sched, measure_graph); - sched_alloc_splits(sched); +bool lm_ggml_backend_sched_reserve(lm_ggml_backend_sched_t sched, struct lm_ggml_cgraph * measure_graph) { + lm_ggml_backend_sched_split_graph(sched, measure_graph); - // allocate buffers and reset allocators - for (int i = 0; i < sched->n_backends; i++) { - size_t size = lm_ggml_tallocr_max_size(sched->tallocs[i]); - lm_ggml_tallocr_free(sched->tallocs[i]); - sched->tallocs[i] = lm_ggml_tallocr_new_from_buft(sched->bufts[i], size); + if (!lm_ggml_gallocr_reserve_n(sched->galloc, sched->graph, sched->node_backend_ids)) { + return false; } - sched_reset(sched); + lm_ggml_backend_sched_reset(sched); + return true; } -void lm_ggml_backend_sched_graph_compute(lm_ggml_backend_sched_t sched, struct lm_ggml_cgraph * graph) { +bool lm_ggml_backend_sched_graph_compute(lm_ggml_backend_sched_t sched, struct lm_ggml_cgraph * graph) { LM_GGML_ASSERT((int)sched->hash_set.size >= graph->n_nodes + LM_GGML_MAX_SPLITS*LM_GGML_MAX_SPLIT_INPUTS); if (!sched->is_reset) { - sched_reset(sched); + lm_ggml_backend_sched_reset(sched); } - sched_split_graph(sched, graph); - sched_alloc_splits(sched); - sched_compute_splits(sched); -} + lm_ggml_backend_sched_split_graph(sched, graph); + if (!lm_ggml_backend_sched_alloc_splits(sched)) { + return false; + } -void lm_ggml_backend_sched_reset(lm_ggml_backend_sched_t sched) { - sched_reset(sched); -} + if (!lm_ggml_backend_sched_compute_splits(sched)) { + return false; + } + return true; +} void lm_ggml_backend_sched_set_eval_callback(lm_ggml_backend_sched_t sched, lm_ggml_backend_sched_eval_callback callback, void * user_data) { sched->callback_eval = callback; @@ -1475,37 +1597,30 @@ int lm_ggml_backend_sched_get_n_splits(lm_ggml_backend_sched_t sched) { return sched->n_splits; } -lm_ggml_tallocr_t lm_ggml_backend_sched_get_tallocr(lm_ggml_backend_sched_t sched, lm_ggml_backend_t backend) { - int backend_index = sched_backend_prio(sched, backend); - LM_GGML_ASSERT(backend_index >= 0 && 
backend_index < sched->n_backends); - return sched->tallocs[backend_index]; -} - -lm_ggml_backend_buffer_t lm_ggml_backend_sched_get_buffer(lm_ggml_backend_sched_t sched, lm_ggml_backend_t backend) { - int backend_index = sched_backend_prio(sched, backend); +size_t lm_ggml_backend_sched_get_buffer_size(lm_ggml_backend_sched_t sched, lm_ggml_backend_t backend) { + int backend_index = lm_ggml_backend_sched_backend_id(sched, backend); LM_GGML_ASSERT(backend_index >= 0 && backend_index < sched->n_backends); - return lm_ggml_tallocr_get_buffer(sched->tallocs[backend_index]); + return lm_ggml_gallocr_get_buffer_size(sched->galloc, backend_index); } void lm_ggml_backend_sched_set_node_backend(lm_ggml_backend_sched_t sched, struct lm_ggml_tensor * node, lm_ggml_backend_t backend) { - int backend_index = sched_backend_prio(sched, backend); + int backend_index = lm_ggml_backend_sched_backend_id(sched, backend); LM_GGML_ASSERT(backend_index >= 0 && backend_index < sched->n_backends); - node_allocr(node) = sched->tallocs[backend_index]; + tensor_backend_id(node) = backend_index; } lm_ggml_backend_t lm_ggml_backend_sched_get_node_backend(lm_ggml_backend_sched_t sched, struct lm_ggml_tensor * node) { - lm_ggml_tallocr_t allocr = node_allocr(node); - if (allocr == NULL) { + int backend_index = tensor_backend_id(node); + if (backend_index == -1) { return NULL; } - return get_allocr_backend(sched, allocr); + return sched->backends[backend_index]; } // utils void lm_ggml_backend_view_init(lm_ggml_backend_buffer_t buffer, struct lm_ggml_tensor * tensor) { LM_GGML_ASSERT(tensor->buffer == NULL); - //LM_GGML_ASSERT(tensor->data == NULL); // views of pre-allocated tensors may have the data set in lm_ggml_new_tensor, but still need to be initialized by the backend LM_GGML_ASSERT(tensor->view_src != NULL); LM_GGML_ASSERT(tensor->view_src->buffer != NULL); LM_GGML_ASSERT(tensor->view_src->data != NULL); @@ -1529,7 +1644,7 @@ void lm_ggml_backend_tensor_alloc(lm_ggml_backend_buffer_t buffer, struct lm_ggm lm_ggml_backend_buffer_init_tensor(buffer, tensor); } -static struct lm_ggml_tensor * graph_dup_tensor(struct lm_ggml_hash_set hash_set, struct lm_ggml_tensor ** node_copies, +static struct lm_ggml_tensor * graph_copy_dup_tensor(struct lm_ggml_hash_set hash_set, struct lm_ggml_tensor ** node_copies, struct lm_ggml_context * ctx_allocated, struct lm_ggml_context * ctx_unallocated, struct lm_ggml_tensor * src) { LM_GGML_ASSERT(src != NULL); @@ -1542,7 +1657,7 @@ static struct lm_ggml_tensor * graph_dup_tensor(struct lm_ggml_hash_set hash_set struct lm_ggml_tensor * dst = lm_ggml_dup_tensor_layout(src->data && !src->view_src ? 
ctx_allocated : ctx_unallocated, src); if (src->view_src != NULL) { - dst->view_src = graph_dup_tensor(hash_set, node_copies, ctx_allocated, ctx_unallocated, src->view_src); + dst->view_src = graph_copy_dup_tensor(hash_set, node_copies, ctx_allocated, ctx_unallocated, src->view_src); dst->view_offs = src->view_offs; } dst->op = src->op; @@ -1553,16 +1668,16 @@ static struct lm_ggml_tensor * graph_dup_tensor(struct lm_ggml_hash_set hash_set for (int i = 0; i < LM_GGML_MAX_SRC; i++) { struct lm_ggml_tensor * s = src->src[i]; if (s == NULL) { - break; + continue; } - dst->src[i] = graph_dup_tensor(hash_set, node_copies, ctx_allocated, ctx_unallocated, s); + dst->src[i] = graph_copy_dup_tensor(hash_set, node_copies, ctx_allocated, ctx_unallocated, s); } node_copies[id] = dst; return dst; } -static void graph_init_tensor(struct lm_ggml_hash_set hash_set, struct lm_ggml_tensor ** node_copies, bool * node_init, struct lm_ggml_tensor * src) { +static void graph_copy_init_tensor(struct lm_ggml_hash_set hash_set, struct lm_ggml_tensor ** node_copies, bool * node_init, struct lm_ggml_tensor * src) { size_t id = lm_ggml_hash_find(hash_set, src); if (node_init[id]) { return; @@ -1571,7 +1686,7 @@ static void graph_init_tensor(struct lm_ggml_hash_set hash_set, struct lm_ggml_t struct lm_ggml_tensor * dst = node_copies[id]; if (dst->view_src != NULL) { - graph_init_tensor(hash_set, node_copies, node_init, src->view_src); + graph_copy_init_tensor(hash_set, node_copies, node_init, src->view_src); lm_ggml_backend_view_init(dst->view_src->buffer, dst); } else { @@ -1582,19 +1697,19 @@ static void graph_init_tensor(struct lm_ggml_hash_set hash_set, struct lm_ggml_t for (int i = 0; i < LM_GGML_MAX_SRC; i++) { struct lm_ggml_tensor * s = src->src[i]; if (s == NULL) { - break; + continue; } - graph_init_tensor(hash_set, node_copies, node_init, s); + graph_copy_init_tensor(hash_set, node_copies, node_init, s); } } struct lm_ggml_backend_graph_copy lm_ggml_backend_graph_copy(lm_ggml_backend_t backend, struct lm_ggml_cgraph * graph) { struct lm_ggml_hash_set hash_set = { /* .size = */ graph->visited_hash_table.size, - /* .keys = */ calloc(sizeof(hash_set.keys[0]) * graph->visited_hash_table.size, 1) + /* .keys = */ calloc(sizeof(hash_set.keys[0]), graph->visited_hash_table.size) // NOLINT }; - struct lm_ggml_tensor ** node_copies = calloc(sizeof(node_copies[0]) * hash_set.size, 1); - bool * node_init = calloc(sizeof(node_init[0]) * hash_set.size, 1); + struct lm_ggml_tensor ** node_copies = calloc(sizeof(node_copies[0]), hash_set.size); // NOLINT + bool * node_init = calloc(sizeof(node_init[0]), hash_set.size); struct lm_ggml_init_params params = { /* .mem_size = */ lm_ggml_tensor_overhead()*hash_set.size + lm_ggml_graph_overhead_custom(graph->size, false), @@ -1623,7 +1738,7 @@ struct lm_ggml_backend_graph_copy lm_ggml_backend_graph_copy(lm_ggml_backend_t b // dup nodes for (int i = 0; i < graph->n_nodes; i++) { struct lm_ggml_tensor * node = graph->nodes[i]; - graph_dup_tensor(hash_set, node_copies, ctx_allocated, ctx_unallocated, node); + graph_copy_dup_tensor(hash_set, node_copies, ctx_allocated, ctx_unallocated, node); } // allocate nodes @@ -1648,7 +1763,7 @@ struct lm_ggml_backend_graph_copy lm_ggml_backend_graph_copy(lm_ggml_backend_t b // copy data and init views for (int i = 0; i < graph->n_nodes; i++) { struct lm_ggml_tensor * node = graph->nodes[i]; - graph_init_tensor(hash_set, node_copies, node_init, node); + graph_copy_init_tensor(hash_set, node_copies, node_init, node); } // build graph copy diff 
--git a/cpp/ggml-backend.h b/cpp/ggml-backend.h index e020ccc1..22faeb7d 100644 --- a/cpp/ggml-backend.h +++ b/cpp/ggml-backend.h @@ -20,6 +20,7 @@ extern "C" { LM_GGML_API const char * lm_ggml_backend_buft_name (lm_ggml_backend_buffer_type_t buft); LM_GGML_API LM_GGML_CALL lm_ggml_backend_buffer_t lm_ggml_backend_buft_alloc_buffer (lm_ggml_backend_buffer_type_t buft, size_t size); LM_GGML_API size_t lm_ggml_backend_buft_get_alignment (lm_ggml_backend_buffer_type_t buft); + LM_GGML_API size_t lm_ggml_backend_buft_get_max_size (lm_ggml_backend_buffer_type_t buft); LM_GGML_API LM_GGML_CALL size_t lm_ggml_backend_buft_get_alloc_size (lm_ggml_backend_buffer_type_t buft, struct lm_ggml_tensor * tensor); LM_GGML_API bool lm_ggml_backend_buft_supports_backend(lm_ggml_backend_buffer_type_t buft, lm_ggml_backend_t backend); LM_GGML_API bool lm_ggml_backend_buft_is_host (lm_ggml_backend_buffer_type_t buft); @@ -36,6 +37,7 @@ extern "C" { LM_GGML_API size_t lm_ggml_backend_buffer_get_size (lm_ggml_backend_buffer_t buffer); LM_GGML_API LM_GGML_CALL void lm_ggml_backend_buffer_init_tensor (lm_ggml_backend_buffer_t buffer, struct lm_ggml_tensor * tensor); LM_GGML_API size_t lm_ggml_backend_buffer_get_alignment (lm_ggml_backend_buffer_t buffer); + LM_GGML_API size_t lm_ggml_backend_buffer_get_max_size (lm_ggml_backend_buffer_t buffer); LM_GGML_API size_t lm_ggml_backend_buffer_get_alloc_size(lm_ggml_backend_buffer_t buffer, struct lm_ggml_tensor * tensor); LM_GGML_API void lm_ggml_backend_buffer_clear (lm_ggml_backend_buffer_t buffer, uint8_t value); LM_GGML_API bool lm_ggml_backend_buffer_is_host (lm_ggml_backend_buffer_t buffer); @@ -54,6 +56,7 @@ extern "C" { LM_GGML_API lm_ggml_backend_buffer_type_t lm_ggml_backend_get_default_buffer_type(lm_ggml_backend_t backend); LM_GGML_API lm_ggml_backend_buffer_t lm_ggml_backend_alloc_buffer(lm_ggml_backend_t backend, size_t size); LM_GGML_API size_t lm_ggml_backend_get_alignment(lm_ggml_backend_t backend); + LM_GGML_API size_t lm_ggml_backend_get_max_size(lm_ggml_backend_t backend); LM_GGML_API void lm_ggml_backend_tensor_set_async(lm_ggml_backend_t backend, struct lm_ggml_tensor * tensor, const void * data, size_t offset, size_t size); LM_GGML_API void lm_ggml_backend_tensor_get_async(lm_ggml_backend_t backend, const struct lm_ggml_tensor * tensor, void * data, size_t offset, size_t size); @@ -80,8 +83,9 @@ extern "C" { LM_GGML_API lm_ggml_backend_t lm_ggml_backend_cpu_init(void); - LM_GGML_API LM_GGML_CALL bool lm_ggml_backend_is_cpu (lm_ggml_backend_t backend); - LM_GGML_API void lm_ggml_backend_cpu_set_n_threads(lm_ggml_backend_t backend_cpu, int n_threads); + LM_GGML_API LM_GGML_CALL bool lm_ggml_backend_is_cpu (lm_ggml_backend_t backend); + LM_GGML_API void lm_ggml_backend_cpu_set_n_threads (lm_ggml_backend_t backend_cpu, int n_threads); + LM_GGML_API void lm_ggml_backend_cpu_set_abort_callback(lm_ggml_backend_t backend_cpu, lm_ggml_abort_callback abort_callback, void * abort_callback_data); // Create a backend buffer from an existing pointer LM_GGML_API LM_GGML_CALL lm_ggml_backend_buffer_t lm_ggml_backend_cpu_buffer_from_ptr(void * ptr, size_t size); @@ -126,11 +130,7 @@ extern "C" { // in build_graph: build_graph(...) 
{ - // allocating tensors in a specific backend (optional, recommended: pre-allocate inputs in a different buffer) - alloc_cpu = lm_ggml_backend_sched_get_allocr(sched, backend_cpu); - lm_ggml_allocr_alloc(alloc_cpu, tensor); - - // manually assigning nodes to a backend (optional, shouldn't be needed in most cases) + // manually assign nodes to a backend (optional, should not be needed in most cases) struct lm_ggml_tensor * node = lm_ggml_mul_mat(ctx, ...); lm_ggml_backend_sched_set_node_backend(sched, node, backend_gpu); } @@ -160,20 +160,19 @@ extern "C" { LM_GGML_API lm_ggml_backend_sched_t lm_ggml_backend_sched_new(lm_ggml_backend_t * backends, lm_ggml_backend_buffer_type_t * bufts, int n_backends, size_t graph_size); LM_GGML_API void lm_ggml_backend_sched_free(lm_ggml_backend_sched_t sched); // Initialize backend buffers from a measure graph - LM_GGML_API void lm_ggml_backend_sched_init_measure(lm_ggml_backend_sched_t sched, struct lm_ggml_cgraph * measure_graph); + LM_GGML_API bool lm_ggml_backend_sched_reserve(lm_ggml_backend_sched_t sched, struct lm_ggml_cgraph * measure_graph); // Get the number of splits of the last graph LM_GGML_API int lm_ggml_backend_sched_get_n_splits(lm_ggml_backend_sched_t sched); - LM_GGML_API lm_ggml_tallocr_t lm_ggml_backend_sched_get_tallocr(lm_ggml_backend_sched_t sched, lm_ggml_backend_t backend); - LM_GGML_API lm_ggml_backend_buffer_t lm_ggml_backend_sched_get_buffer (lm_ggml_backend_sched_t sched, lm_ggml_backend_t backend); + LM_GGML_API size_t lm_ggml_backend_sched_get_buffer_size(lm_ggml_backend_sched_t sched, lm_ggml_backend_t backend); LM_GGML_API void lm_ggml_backend_sched_set_node_backend(lm_ggml_backend_sched_t sched, struct lm_ggml_tensor * node, lm_ggml_backend_t backend); LM_GGML_API lm_ggml_backend_t lm_ggml_backend_sched_get_node_backend(lm_ggml_backend_sched_t sched, struct lm_ggml_tensor * node); // Allocate and compute graph on the backend scheduler - LM_GGML_API void lm_ggml_backend_sched_graph_compute(lm_ggml_backend_sched_t sched, struct lm_ggml_cgraph * graph); + LM_GGML_API bool lm_ggml_backend_sched_graph_compute(lm_ggml_backend_sched_t sched, struct lm_ggml_cgraph * graph); - // Reset all assignments and allocators - must be called before using the sched allocators to allocate inputs + // Reset all assignments and allocators - must be called before changing the node backends LM_GGML_API void lm_ggml_backend_sched_reset(lm_ggml_backend_sched_t sched); // Set a callback to be called for each resulting node during graph compute diff --git a/cpp/ggml-impl.h b/cpp/ggml-impl.h index 38380f5c..42bd0c53 100644 --- a/cpp/ggml-impl.h +++ b/cpp/ggml-impl.h @@ -19,6 +19,7 @@ extern "C" { // fall back to the _Static_assert C11 keyword. 
// if C99 - static_assert is noop // ref: https://stackoverflow.com/a/53923785/4039976 +#ifndef __cplusplus #ifndef static_assert #if defined(__STDC_VERSION__) && (__STDC_VERSION__ >= 201100L) #define static_assert(cond, msg) _Static_assert(cond, msg) @@ -26,6 +27,7 @@ extern "C" { #define static_assert(cond, msg) struct global_scope_noop_trick #endif #endif +#endif // __FMA__ and __F16C__ are not defined in MSVC, however they are implied with AVX2/AVX512 #if defined(_MSC_VER) && (defined(__AVX2__) || defined(__AVX512F__)) diff --git a/cpp/ggml-metal-llama.metal b/cpp/ggml-metal-llama.metal index 029578dc..c223a981 100644 --- a/cpp/ggml-metal-llama.metal +++ b/cpp/ggml-metal-llama.metal @@ -351,12 +351,17 @@ kernel void kernel_sum_rows( kernel void kernel_soft_max( device const float * src0, device const float * src1, + device const float * src2, device float * dst, constant int64_t & ne00, constant int64_t & ne01, constant int64_t & ne02, constant float & scale, - threadgroup float * buf [[threadgroup(0)]], + constant float & max_bias, + constant float & m0, + constant float & m1, + constant uint32_t & n_head_log2, + threadgroup float * buf [[threadgroup(0)]], uint tgpig[[threadgroup_position_in_grid]], uint tpitg[[thread_position_in_threadgroup]], uint sgitg[[simdgroup_index_in_threadgroup]], @@ -368,13 +373,26 @@ kernel void kernel_soft_max( device const float * psrc0 = src0 + i03*ne02*ne01*ne00 + i02*ne01*ne00 + i01*ne00; device const float * pmask = src1 != src0 ? src1 + i01*ne00 : nullptr; + device const float * ppos = src2 != src0 ? src2 : nullptr; device float * pdst = dst + i03*ne02*ne01*ne00 + i02*ne01*ne00 + i01*ne00; + float slope = 0.0f; + + // ALiBi + if (max_bias > 0.0f) { + const int64_t h = i02; + + const float base = h < n_head_log2 ? m0 : m1; + const int exp = h < n_head_log2 ? h + 1 : 2*(h - n_head_log2) + 1; + + slope = pow(base, exp); + } + // parallel max float lmax = -INFINITY; for (int i00 = tpitg; i00 < ne00; i00 += ntg) { - lmax = MAX(lmax, psrc0[i00]*scale + (pmask ? pmask[i00] : 0.0f)); + lmax = MAX(lmax, psrc0[i00]*scale + (pmask ? pmask[i00] : 0.0f) + (ppos ? slope*ppos[i00] : 0.0f)); } // find the max value in the block @@ -399,7 +417,7 @@ kernel void kernel_soft_max( // parallel sum float lsum = 0.0f; for (int i00 = tpitg; i00 < ne00; i00 += ntg) { - const float exp_psrc0 = exp((psrc0[i00]*scale + (pmask ? pmask[i00] : 0.0f)) - max_val); + const float exp_psrc0 = exp((psrc0[i00]*scale + (pmask ? pmask[i00] : 0.0f) + (ppos ? slope*ppos[i00] : 0.0f)) - max_val); lsum += exp_psrc0; pdst[i00] = exp_psrc0; } @@ -437,12 +455,17 @@ kernel void kernel_soft_max( kernel void kernel_soft_max_4( device const float * src0, device const float * src1, + device const float * src2, device float * dst, constant int64_t & ne00, constant int64_t & ne01, constant int64_t & ne02, constant float & scale, - threadgroup float * buf [[threadgroup(0)]], + constant float & max_bias, + constant float & m0, + constant float & m1, + constant uint32_t & n_head_log2, + threadgroup float * buf [[threadgroup(0)]], uint tgpig[[threadgroup_position_in_grid]], uint tpitg[[thread_position_in_threadgroup]], uint sgitg[[simdgroup_index_in_threadgroup]], @@ -454,13 +477,25 @@ kernel void kernel_soft_max_4( device const float4 * psrc4 = (device const float4 *)(src0 + i03*ne02*ne01*ne00 + i02*ne01*ne00 + i01*ne00); device const float4 * pmask = src1 != src0 ? (device const float4 *)(src1 + i01*ne00) : nullptr; + device const float4 * ppos = src2 != src0 ? 
(device const float4 *)(src2) : nullptr; device float4 * pdst4 = (device float4 *)(dst + i03*ne02*ne01*ne00 + i02*ne01*ne00 + i01*ne00); + float slope = 0.0f; + + if (max_bias > 0.0f) { + const int64_t h = i02; + + const float base = h < n_head_log2 ? m0 : m1; + const int exp = h < n_head_log2 ? h + 1 : 2*(h - n_head_log2) + 1; + + slope = pow(base, exp); + } + // parallel max float4 lmax4 = -INFINITY; for (int i00 = tpitg; i00 < ne00/4; i00 += ntg) { - lmax4 = fmax(lmax4, psrc4[i00]*scale + (pmask ? pmask[i00] : 0.0f)); + lmax4 = fmax(lmax4, psrc4[i00]*scale + (pmask ? pmask[i00] : 0.0f) + (ppos ? slope*ppos[i00] : 0.0f)); } const float lmax = MAX(MAX(lmax4[0], lmax4[1]), MAX(lmax4[2], lmax4[3])); @@ -486,7 +521,7 @@ kernel void kernel_soft_max_4( // parallel sum float4 lsum4 = 0.0f; for (int i00 = tpitg; i00 < ne00/4; i00 += ntg) { - const float4 exp_psrc4 = exp((psrc4[i00]*scale + (pmask ? pmask[i00] : 0.0f)) - max_val); + const float4 exp_psrc4 = exp((psrc4[i00]*scale + (pmask ? pmask[i00] : 0.0f) + (ppos ? slope*ppos[i00] : 0.0f)) - max_val); lsum4 += exp_psrc4; pdst4[i00] = exp_psrc4; } @@ -1775,9 +1810,29 @@ kernel void kernel_rope( template [[host_name("kernel_rope_f32")]] kernel rope_t kernel_rope; template [[host_name("kernel_rope_f16")]] kernel rope_t kernel_rope; -kernel void kernel_im2col_f16( +typedef void (im2col_t)( + device const float * x, + device char * dst, + constant int32_t & ofs0, + constant int32_t & ofs1, + constant int32_t & IW, + constant int32_t & IH, + constant int32_t & CHW, + constant int32_t & s0, + constant int32_t & s1, + constant int32_t & p0, + constant int32_t & p1, + constant int32_t & d0, + constant int32_t & d1, + uint3 tgpig[[threadgroup_position_in_grid]], + uint3 tgpg[[threadgroups_per_grid]], + uint3 tpitg[[thread_position_in_threadgroup]], + uint3 ntg[[threads_per_threadgroup]]); + +template +kernel void kernel_im2col( device const float * x, - device half * dst, + device char * dst, constant int32_t & ofs0, constant int32_t & ofs1, constant int32_t & IW, @@ -1800,14 +1855,19 @@ kernel void kernel_im2col_f16( (tpitg[0] * tgpg[1] * tgpg[2] + tgpig[1] * tgpg[2] + tgpig[2]) * CHW + (tgpig[0] * (ntg[1] * ntg[2]) + tpitg[1] * ntg[2] + tpitg[2]); + device T * pdst = (device T *) (dst); + if (iih < 0 || iih >= IH || iiw < 0 || iiw >= IW) { - dst[offset_dst] = 0.0f; + pdst[offset_dst] = 0.0f; } else { const int32_t offset_src = tpitg[0] * ofs0 + tgpig[0] * ofs1; - dst[offset_dst] = x[offset_src + iih * IW + iiw]; + pdst[offset_dst] = x[offset_src + iih * IW + iiw]; } } +template [[host_name("kernel_im2col_f32")]] kernel im2col_t kernel_im2col; +template [[host_name("kernel_im2col_f16")]] kernel im2col_t kernel_im2col; + kernel void kernel_upscale_f32( device const char * src0, device char * dst, @@ -2459,6 +2519,25 @@ typedef struct { } block_iq2_xs; // 74 bytes / block for QK_K = 256, so 2.3125 bpw +typedef struct { + half d; + uint8_t qs[3*QK_K/8]; +} block_iq3_xxs; +// 98 bytes / block for QK_K = 256, so 3.0625 bpw + +typedef struct { + half d; + uint8_t qs[QK_K/8]; + uint8_t scales[QK_K/16]; +} block_iq1_s; + +// Non-linear quants +#define QK4_NL 32 +typedef struct { + half d; + uint8_t qs[QK4_NL/2]; +} block_iq4_nl; + //====================================== dot products ========================= void kernel_mul_mv_q2_K_f32_impl( @@ -3681,6 +3760,173 @@ constexpr constant static uint64_t iq2xs_grid[512] = { 0x2b2b2b2b082b2b08, 0x2b2b2b2b082b2b2b, 0x2b2b2b2b2b190819, 0x2b2b2b2b2b2b2b2b, }; +constexpr constant static uint32_t iq3xxs_grid[256] = { + 
0x04040404, 0x04040414, 0x04040424, 0x04040c0c, 0x04040c1c, 0x04040c3e, 0x04041404, 0x04041414, + 0x04041c0c, 0x04042414, 0x04043e1c, 0x04043e2c, 0x040c040c, 0x040c041c, 0x040c0c04, 0x040c0c14, + 0x040c140c, 0x040c142c, 0x040c1c04, 0x040c1c14, 0x040c240c, 0x040c2c24, 0x040c3e04, 0x04140404, + 0x04140414, 0x04140424, 0x04140c0c, 0x04141404, 0x04141414, 0x04141c0c, 0x04141c1c, 0x04141c3e, + 0x04142c0c, 0x04142c3e, 0x04143e2c, 0x041c040c, 0x041c043e, 0x041c0c04, 0x041c0c14, 0x041c142c, + 0x041c3e04, 0x04240c1c, 0x04241c3e, 0x04242424, 0x04242c3e, 0x04243e1c, 0x04243e2c, 0x042c040c, + 0x042c043e, 0x042c1c14, 0x042c2c14, 0x04341c2c, 0x04343424, 0x043e0c04, 0x043e0c24, 0x043e0c34, + 0x043e241c, 0x043e340c, 0x0c04040c, 0x0c04041c, 0x0c040c04, 0x0c040c14, 0x0c04140c, 0x0c04141c, + 0x0c041c04, 0x0c041c14, 0x0c041c24, 0x0c04243e, 0x0c042c04, 0x0c0c0404, 0x0c0c0414, 0x0c0c0c0c, + 0x0c0c1404, 0x0c0c1414, 0x0c14040c, 0x0c14041c, 0x0c140c04, 0x0c140c14, 0x0c14140c, 0x0c141c04, + 0x0c143e14, 0x0c1c0404, 0x0c1c0414, 0x0c1c1404, 0x0c1c1c0c, 0x0c1c2434, 0x0c1c3434, 0x0c24040c, + 0x0c24042c, 0x0c242c04, 0x0c2c1404, 0x0c2c1424, 0x0c2c2434, 0x0c2c3e0c, 0x0c34042c, 0x0c3e1414, + 0x0c3e2404, 0x14040404, 0x14040414, 0x14040c0c, 0x14040c1c, 0x14041404, 0x14041414, 0x14041434, + 0x14041c0c, 0x14042414, 0x140c040c, 0x140c041c, 0x140c042c, 0x140c0c04, 0x140c0c14, 0x140c140c, + 0x140c1c04, 0x140c341c, 0x140c343e, 0x140c3e04, 0x14140404, 0x14140414, 0x14140c0c, 0x14140c3e, + 0x14141404, 0x14141414, 0x14141c3e, 0x14142404, 0x14142c2c, 0x141c040c, 0x141c0c04, 0x141c0c24, + 0x141c3e04, 0x141c3e24, 0x14241c2c, 0x14242c1c, 0x142c041c, 0x142c143e, 0x142c240c, 0x142c3e24, + 0x143e040c, 0x143e041c, 0x143e0c34, 0x143e242c, 0x1c04040c, 0x1c040c04, 0x1c040c14, 0x1c04140c, + 0x1c04141c, 0x1c042c04, 0x1c04342c, 0x1c043e14, 0x1c0c0404, 0x1c0c0414, 0x1c0c1404, 0x1c0c1c0c, + 0x1c0c2424, 0x1c0c2434, 0x1c14040c, 0x1c14041c, 0x1c140c04, 0x1c14142c, 0x1c142c14, 0x1c143e14, + 0x1c1c0c0c, 0x1c1c1c1c, 0x1c241c04, 0x1c24243e, 0x1c243e14, 0x1c2c0404, 0x1c2c0434, 0x1c2c1414, + 0x1c2c2c2c, 0x1c340c24, 0x1c341c34, 0x1c34341c, 0x1c3e1c1c, 0x1c3e3404, 0x24040424, 0x24040c3e, + 0x24041c2c, 0x24041c3e, 0x24042c1c, 0x24042c3e, 0x240c3e24, 0x24141404, 0x24141c3e, 0x24142404, + 0x24143404, 0x24143434, 0x241c043e, 0x241c242c, 0x24240424, 0x24242c0c, 0x24243424, 0x242c142c, + 0x242c241c, 0x242c3e04, 0x243e042c, 0x243e0c04, 0x243e0c14, 0x243e1c04, 0x2c040c14, 0x2c04240c, + 0x2c043e04, 0x2c0c0404, 0x2c0c0434, 0x2c0c1434, 0x2c0c2c2c, 0x2c140c24, 0x2c141c14, 0x2c143e14, + 0x2c1c0414, 0x2c1c2c1c, 0x2c240c04, 0x2c24141c, 0x2c24143e, 0x2c243e14, 0x2c2c0414, 0x2c2c1c0c, + 0x2c342c04, 0x2c3e1424, 0x2c3e2414, 0x34041424, 0x34042424, 0x34042434, 0x34043424, 0x340c140c, + 0x340c340c, 0x34140c3e, 0x34143424, 0x341c1c04, 0x341c1c34, 0x34242424, 0x342c042c, 0x342c2c14, + 0x34341c1c, 0x343e041c, 0x343e140c, 0x3e04041c, 0x3e04042c, 0x3e04043e, 0x3e040c04, 0x3e041c14, + 0x3e042c14, 0x3e0c1434, 0x3e0c2404, 0x3e140c14, 0x3e14242c, 0x3e142c14, 0x3e1c0404, 0x3e1c0c2c, + 0x3e1c1c1c, 0x3e1c3404, 0x3e24140c, 0x3e24240c, 0x3e2c0404, 0x3e2c0414, 0x3e2c1424, 0x3e341c04, +}; + +#define NGRID_IQ1S 512 +constexpr constant static uint64_t iq1s_grid[NGRID_IQ1S] = { + 0xffffffffffff0101, 0xffffffffff01ff00, 0xffffffffff010100, 0xffffffff00000000, + 0xffffffff01ff00ff, 0xffffffff01ff0001, 0xffffffff0101ffff, 0xffffffff0101ff01, + 0xffffff00ff000000, 0xffffff000000ff00, 0xffffff00000000ff, 0xffffff0000000100, + 0xffffff0000010000, 0xffffff0001000000, 0xffffff01ffff00ff, 0xffffff01ff01ff00, 
+ 0xffffff01ff010100, 0xffffff0100000001, 0xffffff0101ffff00, 0xffffff0101ff0101, + 0xffffff0101010100, 0xffff00ffff00ff01, 0xffff00ffff0000ff, 0xffff00ff00ff0100, + 0xffff00ff0100ff00, 0xffff00ff010001ff, 0xffff0000ff0101ff, 0xffff000000ffff00, + 0xffff000000000000, 0xffff00000001ff01, 0xffff000001000101, 0xffff0000010100ff, + 0xffff0001ffff0100, 0xffff00010000ff00, 0xffff000100010101, 0xffff000101000000, + 0xffff01ffffff0000, 0xffff01ffff01ffff, 0xffff01ffff010100, 0xffff01ff00000000, + 0xffff01ff01ffffff, 0xffff01ff01ff0001, 0xffff01ff0101ffff, 0xffff01ff01010001, + 0xffff0100ffffff01, 0xffff01000000ffff, 0xffff010000000100, 0xffff010001ff01ff, + 0xffff010001000000, 0xffff0101ff000000, 0xffff0101000101ff, 0xffff010101ffff01, + 0xffff01010101ff00, 0xff00ffffff000000, 0xff00ffff00ffff00, 0xff00ffff00000001, + 0xff00ffff000001ff, 0xff00ffff01010000, 0xff00ff00ffff0000, 0xff00ff00ff00ff00, + 0xff00ff00ff0000ff, 0xff00ff00ff000100, 0xff00ff00ff010001, 0xff00ff0000ff0001, + 0xff00ff000000ffff, 0xff00ff0000000000, 0xff00ff000001ff00, 0xff00ff0000010100, + 0xff00ff0001ff0000, 0xff00ff000100ff00, 0xff00ff0001000100, 0xff00ff01ff000000, + 0xff00ff0100ff0000, 0xff00ff01000001ff, 0xff00ff0101010001, 0xff0000ff00000000, + 0xff0000ff0001ff00, 0xff0000ff00010100, 0xff000000ffff0101, 0xff000000ff000000, + 0xff000000ff01ff00, 0xff00000000ff0000, 0xff0000000000ff00, 0xff000000000000ff, + 0xff00000000000000, 0xff00000000000001, 0xff00000000000100, 0xff0000000001ffff, + 0xff00000000010000, 0xff00000001000000, 0xff00000001010100, 0xff000001ff00ff01, + 0xff000001ff0100ff, 0xff00000100000000, 0xff0000010001ff00, 0xff00000101ff0100, + 0xff0000010100ff00, 0xff0001ff00ff00ff, 0xff0001ff00000101, 0xff0001ff000100ff, + 0xff0001ff01000000, 0xff000100ff0001ff, 0xff0001000000ff01, 0xff00010000000000, + 0xff00010000010001, 0xff00010000010100, 0xff00010001ffff00, 0xff00010001ff0101, + 0xff00010001010000, 0xff000101ffffffff, 0xff000101ff000101, 0xff00010101ff00ff, + 0xff00010101000001, 0xff000101010100ff, 0xff01ffffff000101, 0xff01ffffff01ffff, + 0xff01ffffff01ff01, 0xff01ffffff0101ff, 0xff01ffff00000000, 0xff01ffff01ff0001, + 0xff01ffff0101ff01, 0xff01ff00ff000000, 0xff01ff0000ff0100, 0xff01ff000000ff01, + 0xff01ff0000010000, 0xff01ff00010000ff, 0xff01ff01ff01ff00, 0xff01ff0100000101, + 0xff0100ffffff0000, 0xff0100ffff010000, 0xff0100ff01ff00ff, 0xff0100ff01000100, + 0xff0100ff010100ff, 0xff010000ffffff01, 0xff01000000000000, 0xff0100000101ff00, + 0xff010001ffff00ff, 0xff010001ff000100, 0xff01000100ffff00, 0xff01000100010001, + 0xff01000101ff0001, 0xff010001010001ff, 0xff0101ffffffffff, 0xff0101ffff01ffff, + 0xff0101ffff010101, 0xff0101ff0000ff00, 0xff0101ff01010001, 0xff010100ff000000, + 0xff010100ff01ff01, 0xff01010000ff0001, 0xff01010000000100, 0xff01010001000000, + 0xff0101010100ffff, 0x00ffffff0000ff01, 0x00ffffff000000ff, 0x00ffffff00000100, + 0x00ffffff00010000, 0x00ffff00ffff0001, 0x00ffff00ff0000ff, 0x00ffff00ff000100, + 0x00ffff0000000000, 0x00ffff0001000100, 0x00ffff0001010001, 0x00ffff01ff00ff01, + 0x00ffff0100ff0100, 0x00ffff010000ff00, 0x00ffff01000100ff, 0x00ffff0101ff00ff, + 0x00ffff010101ff00, 0x00ff00ffffffffff, 0x00ff00ffffff01ff, 0x00ff00ffff000101, + 0x00ff00ff00000000, 0x00ff00ff000101ff, 0x00ff00ff01010101, 0x00ff0000ff000000, + 0x00ff0000ff01ffff, 0x00ff000000ff0000, 0x00ff00000000ff00, 0x00ff0000000000ff, + 0x00ff000000000000, 0x00ff000000000001, 0x00ff000000000100, 0x00ff000000010000, + 0x00ff000001ffff01, 0x00ff000001000000, 0x00ff0001ff000101, 0x00ff000100ffffff, + 0x00ff000100000000, 
0x00ff0001010001ff, 0x00ff01ffff000000, 0x00ff01ff0001ff00, + 0x00ff01ff01ff0100, 0x00ff0100ff01ff01, 0x00ff010000ff00ff, 0x00ff010000ff0101, + 0x00ff010000000000, 0x00ff010000010101, 0x00ff01000100ff00, 0x00ff010001010000, + 0x00ff0101ffffff00, 0x00ff01010000ff01, 0x00ff010100000100, 0x00ff010101ff0000, + 0x0000ffffffff0100, 0x0000ffffff00ff00, 0x0000ffffff0000ff, 0x0000ffffff010000, + 0x0000ffff00000000, 0x0000ffff00010101, 0x0000ffff01ffff01, 0x0000ffff01000100, + 0x0000ff00ff000000, 0x0000ff00ff01ff00, 0x0000ff00ff0101ff, 0x0000ff0000ff0000, + 0x0000ff000000ff00, 0x0000ff00000000ff, 0x0000ff0000000000, 0x0000ff0000000001, + 0x0000ff0000000100, 0x0000ff0000010000, 0x0000ff0001ffffff, 0x0000ff0001ff01ff, + 0x0000ff0001000000, 0x0000ff000101ffff, 0x0000ff01ffff0101, 0x0000ff01ff010000, + 0x0000ff0100000000, 0x0000ff0101000101, 0x000000ffffff0001, 0x000000ffff000000, + 0x000000ff00ff0000, 0x000000ff0000ff00, 0x000000ff000000ff, 0x000000ff00000000, + 0x000000ff00000001, 0x000000ff00000100, 0x000000ff00010000, 0x000000ff01000000, + 0x000000ff0101ff00, 0x00000000ffff0000, 0x00000000ff00ff00, 0x00000000ff0000ff, + 0x00000000ff000000, 0x00000000ff000001, 0x00000000ff000100, 0x00000000ff010000, + 0x0000000000ffff00, 0x0000000000ff00ff, 0x0000000000ff0000, 0x0000000000ff0001, + 0x0000000000ff0100, 0x000000000000ffff, 0x000000000000ff00, 0x000000000000ff01, + 0x00000000000000ff, 0x0000000000000001, 0x00000000000001ff, 0x0000000000000100, + 0x0000000000000101, 0x000000000001ff00, 0x00000000000100ff, 0x0000000000010000, + 0x0000000000010001, 0x0000000000010100, 0x0000000001ff0000, 0x000000000100ff00, + 0x00000000010000ff, 0x0000000001000000, 0x0000000001000001, 0x0000000001000100, + 0x0000000001010000, 0x00000001ffff01ff, 0x00000001ff000000, 0x0000000100ff0000, + 0x000000010000ff00, 0x00000001000000ff, 0x0000000100000000, 0x0000000100000001, + 0x0000000100000100, 0x0000000100010000, 0x0000000101000000, 0x000001ffff00ff00, + 0x000001ffff010001, 0x000001ffff0101ff, 0x000001ff00ffff01, 0x000001ff0000ffff, + 0x000001ff00000000, 0x000001ff010000ff, 0x000001ff01010100, 0x00000100ffff0100, + 0x00000100ff000000, 0x0000010000ff0000, 0x000001000000ff00, 0x00000100000000ff, + 0x0000010000000000, 0x0000010000000001, 0x0000010000000100, 0x0000010000010000, + 0x0000010001000000, 0x000001000101ff01, 0x00000101ffff0001, 0x00000101ff01ffff, + 0x0000010100000000, 0x0000010101010100, 0x0001ffffff000000, 0x0001ffff00ffffff, + 0x0001ffff00000100, 0x0001ffff0001ff00, 0x0001ffff01000000, 0x0001ff00ffffff00, + 0x0001ff00ffff01ff, 0x0001ff00ff010000, 0x0001ff0000000000, 0x0001ff0000010001, + 0x0001ff0001ff0000, 0x0001ff0001010100, 0x0001ff01ff0000ff, 0x0001ff01ff000001, + 0x0001ff0100ffffff, 0x0001ff010001ffff, 0x0001ff01000101ff, 0x0001ff010100ff01, + 0x000100ffff00ffff, 0x000100ffff00ff01, 0x000100ffff000100, 0x000100ff00000000, + 0x000100ff000101ff, 0x000100ff01ff0101, 0x000100ff0100ffff, 0x000100ff01010101, + 0x00010000ff000000, 0x00010000ff010100, 0x0001000000ff0000, 0x000100000000ff00, + 0x00010000000000ff, 0x0001000000000000, 0x0001000000000001, 0x0001000000000100, + 0x0001000000010000, 0x0001000001ffff01, 0x0001000001000000, 0x0001000100ff0101, + 0x0001000100000000, 0x00010001010100ff, 0x000101ffffff01ff, 0x000101ffffff0101, + 0x000101ff00010000, 0x000101ff01ff0000, 0x000101ff0100ff01, 0x00010100ffff0000, + 0x0001010000000000, 0x000101000001ffff, 0x0001010000010101, 0x00010100010001ff, + 0x00010101ff00ff00, 0x00010101ff010001, 0x0001010100ffffff, 0x0001010100ff01ff, + 0x00010101000101ff, 0x0001010101ff0000, 
0x000101010100ff01, 0x0001010101000101, + 0x01ffffffffff0101, 0x01ffffffff01ffff, 0x01ffffffff01ff01, 0x01ffffffff0101ff, + 0x01ffffffff010101, 0x01ffffff00000000, 0x01ffffff01ff01ff, 0x01ffffff01000101, + 0x01ffffff0101ff01, 0x01ffffff010100ff, 0x01ffff000000ff00, 0x01ffff0000000001, + 0x01ffff00000001ff, 0x01ffff0000010000, 0x01ffff0001ff0000, 0x01ffff01ffffffff, + 0x01ffff01ffff01ff, 0x01ffff01ff000000, 0x01ffff01ff01ffff, 0x01ffff01ff0101ff, + 0x01ffff010100ffff, 0x01ff00ffffff0000, 0x01ff00ffff010000, 0x01ff00ff00ffff01, + 0x01ff0000ff0000ff, 0x01ff000000000000, 0x01ff00000001ff01, 0x01ff000001ffffff, + 0x01ff000001010100, 0x01ff0001ffffff01, 0x01ff0001ff010001, 0x01ff000101ff0100, + 0x01ff000101000001, 0x01ff0001010100ff, 0x01ff01ffff00ffff, 0x01ff01ff00010001, + 0x01ff01ff01000000, 0x01ff01ff010101ff, 0x01ff0100ff000001, 0x01ff010000ffff00, + 0x01ff010000000100, 0x01ff010001ff01ff, 0x01ff01000101ffff, 0x01ff0101ffff00ff, + 0x01ff0101ffff0101, 0x01ff0101ff0101ff, 0x01ff010100010000, 0x0100ffff00ff00ff, + 0x0100ffff00ff0001, 0x0100ffff00000100, 0x0100ffff0100ff00, 0x0100ff00ffff0000, + 0x0100ff00ff00ffff, 0x0100ff00ff00ff01, 0x0100ff00ff000100, 0x0100ff00ff010000, + 0x0100ff0000000000, 0x0100ff00000100ff, 0x0100ff0001ff0101, 0x0100ff0001010101, + 0x0100ff0100ff00ff, 0x0100ff0100ff0001, 0x0100ff0100000100, 0x0100ff0100010001, + 0x0100ff0101000000, 0x010000ffff00ff00, 0x010000ff0000ffff, 0x010000ff00000000, + 0x010000ff010001ff, 0x010000ff01010001, 0x01000000ffffff00, 0x01000000ffff0101, + 0x01000000ff000000, 0x01000000ff0100ff, 0x01000000ff010101, 0x0100000000ff0000, + 0x010000000000ff00, 0x01000000000000ff, 0x0100000000000000, 0x0100000000000001, + 0x0100000000000100, 0x0100000000010000, 0x0100000001000000, 0x0100000100000000, + 0x01000001000101ff, 0x0100000101ffff01, 0x010001ffff000101, 0x010001ff00ff0100, + 0x010001ff0000ff00, 0x010001ff000100ff, 0x010001ff01ffffff, 0x01000100ffff0000, + 0x01000100ff0001ff, 0x0100010000000000, 0x010001000001ff00, 0x0100010001ff0000, + 0x01000100010000ff, 0x0100010001000101, 0x01000101ff00ff01, 0x0100010100ff0100, + 0x010001010000ffff, 0x0100010101010001, 0x0101ffffffff0101, 0x0101ffffff0001ff, + 0x0101ffffff01ffff, 0x0101ffffff010101, 0x0101ffff00000000, 0x0101ffff0101ffff, + 0x0101ffff010101ff, 0x0101ff00ff000000, 0x0101ff0000ff0100, 0x0101ff000000ff00, + 0x0101ff0000010000, 0x0101ff00010000ff, 0x0101ff0001000001, 0x0101ff01ff010101, + 0x0101ff0100000000, 0x0101ff010101ff00, 0x010100ffffff0000, 0x010100ffff010000, + 0x010100ff00ff01ff, 0x010100ff000000ff, 0x010100ff00000101, 0x010100ff01ffff00, + 0x01010000ffffff01, 0x01010000ff000100, 0x01010000ff01ff01, 0x0101000000000000, + 0x01010000000100ff, 0x010100000101ff01, 0x01010001ffff0000, 0x01010001ff00ffff, + 0x01010001ff010000, 0x0101000101ffffff, 0x0101000101ff01ff, 0x0101000101010101, + 0x010101ffff01ffff, 0x010101ff00000000, 0x010101ff0001ff01, 0x010101ff0101ffff, + 0x010101ff010101ff, 0x01010100ffffffff, 0x01010100ff000001, 0x010101000000ff00, + 0x0101010001010000, 0x0101010100ff0001, 0x010101010001ff01, 0x010101010101ffff, +}; + constexpr constant static uint8_t ksigns_iq2xs[128] = { 0, 129, 130, 3, 132, 5, 6, 135, 136, 9, 10, 139, 12, 141, 142, 15, 144, 17, 18, 147, 20, 149, 150, 23, 24, 153, 154, 27, 156, 29, 30, 159, @@ -3787,7 +4033,10 @@ void kernel_mul_mv_iq2_xxs_f32_impl( y4 += 32 * 32; } #else - // TODO + (void) x; + (void) y; + (void) yl; + (void) nb32; #endif for (int row = 0; row < N_DST; ++row) { @@ -3930,7 +4179,10 @@ void kernel_mul_mv_iq2_xs_f32_impl( y4 += 32 * 32; } #else - // 
TODO + (void) x; + (void) y; + (void) yl; + (void) nb32; #endif for (int row = 0; row < N_DST; ++row) { @@ -3970,157 +4222,541 @@ kernel void kernel_mul_mv_iq2_xs_f32( kernel_mul_mv_iq2_xs_f32_impl(src0, src1, dst, ne00, ne01, ne02, ne10, ne12, ne0, ne1, r2, r3, shared_values, tgpig, tiisg, sgitg); } -//============================= templates and their specializations ============================= - -// NOTE: this is not dequantizing - we are simply fitting the template -template -void dequantize_f32(device const float4x4 * src, short il, thread type4x4 & reg) { - float4x4 temp = *(((device float4x4 *)src)); - for (int i = 0; i < 16; i++){ - reg[i/4][i%4] = temp[i/4][i%4]; - } -} +void kernel_mul_mv_iq3_xxs_f32_impl( + device const void * src0, + device const float * src1, + device float * dst, + constant int64_t & ne00, + constant int64_t & ne01, + constant int64_t & ne02, + constant int64_t & ne10, + constant int64_t & ne12, + constant int64_t & ne0, + constant int64_t & ne1, + constant uint & r2, + constant uint & r3, + threadgroup int8_t * shared_values [[threadgroup(0)]], + uint3 tgpig[[threadgroup_position_in_grid]], + uint tiisg[[thread_index_in_simdgroup]], + uint sgitg[[simdgroup_index_in_threadgroup]]) { -template -void dequantize_f16(device const half4x4 * src, short il, thread type4x4 & reg) { - half4x4 temp = *(((device half4x4 *)src)); - for (int i = 0; i < 16; i++){ - reg[i/4][i%4] = temp[i/4][i%4]; - } -} + const int nb = ne00/QK_K; + const int r0 = tgpig.x; + const int r1 = tgpig.y; + const int im = tgpig.z; -template -void dequantize_q4_0(device const block_q4_0 *xb, short il, thread type4x4 & reg) { - device const uint16_t * qs = ((device const uint16_t *)xb + 1); - const float d1 = il ? (xb->d / 16.h) : xb->d; - const float d2 = d1 / 256.f; - const float md = -8.h * xb->d; - const ushort mask0 = il ? 0x00F0 : 0x000F; - const ushort mask1 = mask0 << 8; + const int first_row = (r0 * N_SIMDGROUP + sgitg) * N_DST; + const int ib_row = first_row * nb; - for (int i=0;i<8;i++) { - reg[i/2][2*(i%2)+0] = d1 * (qs[i] & mask0) + md; - reg[i/2][2*(i%2)+1] = d2 * (qs[i] & mask1) + md; - } -} + const uint i12 = im%ne12; + const uint i13 = im/ne12; -template -void dequantize_q4_1(device const block_q4_1 *xb, short il, thread type4x4 & reg) { - device const uint16_t * qs = ((device const uint16_t *)xb + 2); - const float d1 = il ? (xb->d / 16.h) : xb->d; - const float d2 = d1 / 256.f; - const float m = xb->m; - const ushort mask0 = il ? 0x00F0 : 0x000F; - const ushort mask1 = mask0 << 8; + const uint offset0 = (i12/r2)*(nb*ne01) + (i13/r3)*(nb*ne01*ne02); - for (int i=0;i<8;i++) { - reg[i/2][2*(i%2)+0] = ((qs[i] & mask0) * d1) + m; - reg[i/2][2*(i%2)+1] = ((qs[i] & mask1) * d2) + m; - } -} + device const block_iq3_xxs * x = (device const block_iq3_xxs *) src0 + ib_row + offset0; + device const float * y = (device const float *) src1 + r1*ne10 + im*ne00*ne1; -template -void dequantize_q5_0(device const block_q5_0 *xb, short il, thread type4x4 & reg) { - device const uint16_t * qs = ((device const uint16_t *)xb + 3); - const float d = xb->d; - const float md = -16.h * xb->d; - const ushort mask = il ? 0x00F0 : 0x000F; + float yl[32]; + float sumf[N_DST]={0.f}, all_sum; - const uint32_t qh = *((device const uint32_t *)xb->qh); + const int nb32 = nb * (QK_K / 32); - const int x_mv = il ? 
4 : 0; + threadgroup uint32_t * values = (threadgroup uint32_t *)shared_values; + threadgroup uint8_t * shared_signs = (threadgroup uint8_t *)(values + 256); + { + int nval = 4; + int pos = (32*sgitg + tiisg)*nval; + for (int i = 0; i < nval; ++i) values[pos + i] = iq3xxs_grid[pos + i]; + nval = 2; + pos = (32*sgitg + tiisg)*nval; + for (int i = 0; i < nval; ++i) shared_signs[pos+i] = ksigns_iq2xs[pos+i]; + threadgroup_barrier(mem_flags::mem_threadgroup); + } - const int gh_mv = il ? 12 : 0; - const int gh_bk = il ? 0 : 4; +#if QK_K == 256 + const int ix = tiisg; - for (int i = 0; i < 8; i++) { - // extract the 5-th bits for x0 and x1 - const uint8_t xh_0 = ((qh >> (gh_mv + 2*i )) << gh_bk) & 0x10; - const uint8_t xh_1 = ((qh >> (gh_mv + 2*i+1)) << gh_bk) & 0x10; + device const float * y4 = y + 32 * ix; - // combine the 4-bits from qs with the 5th bit - const int32_t x0 = ((((qs[i] ) & mask) >> x_mv) | xh_0); - const int32_t x1 = ((((qs[i] >> 8) & mask) >> x_mv) | xh_1); + for (int ib32 = ix; ib32 < nb32; ib32 += 32) { - reg[i/2][2*(i%2)+0] = d * x0 + md; - reg[i/2][2*(i%2)+1] = d * x1 + md; - } -} + for (int i = 0; i < 32; ++i) { + yl[i] = y4[i]; + } -template -void dequantize_q5_1(device const block_q5_1 *xb, short il, thread type4x4 & reg) { - device const uint16_t * qs = ((device const uint16_t *)xb + 4); - const float d = xb->d; - const float m = xb->m; - const ushort mask = il ? 0x00F0 : 0x000F; + const int ibl = ib32 / (QK_K / 32); + const int ib = ib32 % (QK_K / 32); - const uint32_t qh = *((device const uint32_t *)xb->qh); + device const block_iq3_xxs * xr = x + ibl; + device const uint8_t * q3 = xr->qs + 8 * ib; + device const uint16_t * gas = (device const uint16_t *)(xr->qs + QK_K/4) + 2 * ib; + device const half * dh = &xr->d; - const int x_mv = il ? 4 : 0; + for (int row = 0; row < N_DST; row++) { - const int gh_mv = il ? 12 : 0; - const int gh_bk = il ? 0 : 4; + const float db = dh[0]; + const uint32_t aux32 = gas[0] | (gas[1] << 16); + const float d = db * (0.5f + (aux32 >> 28)); - for (int i = 0; i < 8; i++) { - // extract the 5-th bits for x0 and x1 - const uint8_t xh_0 = ((qh >> (gh_mv + 2*i )) << gh_bk) & 0x10; - const uint8_t xh_1 = ((qh >> (gh_mv + 2*i+1)) << gh_bk) & 0x10; + float2 sum = {0}; + for (int l = 0; l < 4; ++l) { + const threadgroup uint8_t * grid1 = (const threadgroup uint8_t *)(values + q3[2*l+0]); + const threadgroup uint8_t * grid2 = (const threadgroup uint8_t *)(values + q3[2*l+1]); + const uint8_t signs = shared_signs[(aux32 >> 7*l) & 127]; + for (int j = 0; j < 4; ++j) { + sum[0] += yl[8*l + j + 0] * grid1[j] * (signs & kmask_iq2xs[j+0] ? -1.f : 1.f); + sum[1] += yl[8*l + j + 4] * grid2[j] * (signs & kmask_iq2xs[j+4] ? 
-1.f : 1.f); + } + } + sumf[row] += d * (sum[0] + sum[1]); - // combine the 4-bits from qs with the 5th bit - const int32_t x0 = ((((qs[i] ) & mask) >> x_mv) | xh_0); - const int32_t x1 = ((((qs[i] >> 8) & mask) >> x_mv) | xh_1); + dh += nb*sizeof(block_iq3_xxs)/2; + q3 += nb*sizeof(block_iq3_xxs); + gas += nb*sizeof(block_iq3_xxs)/2; + } - reg[i/2][2*(i%2)+0] = d * x0 + m; - reg[i/2][2*(i%2)+1] = d * x1 + m; + y4 += 32 * 32; } -} - -template -void dequantize_q8_0(device const block_q8_0 *xb, short il, thread type4x4 & reg) { - device const int8_t * qs = ((device const int8_t *)xb->qs); - const half d = xb->d; +#else + (void) x; + (void) y; + (void) yl; + (void) nb32; +#endif - for (int i = 0; i < 16; i++) { - reg[i/4][i%4] = (qs[i + 16*il] * d); + for (int row = 0; row < N_DST; ++row) { + all_sum = simd_sum(sumf[row]); + if (tiisg == 0) { + dst[r1*ne0 + im*ne0*ne1 + first_row + row] = all_sum * 0.5f; + } } } -template -void dequantize_q2_K(device const block_q2_K *xb, short il, thread type4x4 & reg) { - const float d = xb->d; - const float min = xb->dmin; - device const uint8_t * q = (device const uint8_t *)xb->qs; - float dl, ml; - uint8_t sc = xb->scales[il]; +[[host_name("kernel_mul_mv_iq3_xxs_f32")]] +kernel void kernel_mul_mv_iq3_xxs_f32( + device const void * src0, + device const float * src1, + device float * dst, + constant int64_t & ne00, + constant int64_t & ne01, + constant int64_t & ne02, + constant uint64_t & nb00, + constant uint64_t & nb01, + constant uint64_t & nb02, + constant int64_t & ne10, + constant int64_t & ne11, + constant int64_t & ne12, + constant uint64_t & nb10, + constant uint64_t & nb11, + constant uint64_t & nb12, + constant int64_t & ne0, + constant int64_t & ne1, + constant uint & r2, + constant uint & r3, + threadgroup int8_t * shared_values [[threadgroup(0)]], + uint3 tgpig[[threadgroup_position_in_grid]], + uint tiisg[[thread_index_in_simdgroup]], + uint sgitg[[simdgroup_index_in_threadgroup]]) { -#if QK_K == 256 - q = q + 32*(il/8) + 16*(il&1); - il = (il/2)%4; -#endif - half coef = il>1 ? (il>2 ? 1/64.h : 1/16.h) : (il>0 ? 1/4.h : 1.h); - uchar mask = il>1 ? (il>2 ? 192 : 48) : (il>0 ? 12 : 3); - dl = d * (sc & 0xF) * coef, ml = min * (sc >> 4); - for (int i = 0; i < 16; ++i) { - reg[i/4][i%4] = dl * (q[i] & mask) - ml; - } + kernel_mul_mv_iq3_xxs_f32_impl(src0, src1, dst, ne00, ne01, ne02, ne10, ne12, ne0, ne1, r2, r3, shared_values, tgpig, tiisg, sgitg); } -template -void dequantize_q3_K(device const block_q3_K *xb, short il, thread type4x4 & reg) { - const half d_all = xb->d; - device const uint8_t * q = (device const uint8_t *)xb->qs; - device const uint8_t * h = (device const uint8_t *)xb->hmask; - device const int8_t * scales = (device const int8_t *)xb->scales; - -#if QK_K == 256 - q = q + 32 * (il/8) + 16 * (il&1); - h = h + 16 * (il&1); - uint8_t m = 1 << (il/2); - uint16_t kmask1 = (il/4)>1 ? ((il/4)>2 ? 192 : 48) : \ - ((il/4)>0 ? 12 : 3); - uint16_t kmask2 = il/8 ? 0xF0 : 0x0F; - uint16_t scale_2 = scales[il%8], scale_1 = scales[8 + il%4]; - int16_t dl_int = (il/4)&1 ? 
(scale_2&kmask2) | ((scale_1&kmask1) << 2) +void kernel_mul_mv_iq1_s_f32_impl( + device const void * src0, + device const float * src1, + device float * dst, + constant int64_t & ne00, + constant int64_t & ne01, + constant int64_t & ne02, + constant int64_t & ne10, + constant int64_t & ne12, + constant int64_t & ne0, + constant int64_t & ne1, + constant uint & r2, + constant uint & r3, + uint3 tgpig[[threadgroup_position_in_grid]], + uint tiisg[[thread_index_in_simdgroup]], + uint sgitg[[simdgroup_index_in_threadgroup]]) { + + const int nb = ne00/QK_K; + const int r0 = tgpig.x; + const int r1 = tgpig.y; + const int im = tgpig.z; + + const int first_row = (r0 * N_SIMDGROUP + sgitg) * N_DST; + const int ib_row = first_row * nb; + + const uint i12 = im%ne12; + const uint i13 = im/ne12; + + const uint offset0 = (i12/r2)*(nb*ne01) + (i13/r3)*(nb*ne01*ne02); + device const block_iq1_s * x = (device const block_iq1_s *) src0 + ib_row + offset0; + device const float * y = (device const float *) src1 + r1*ne10 + im*ne00*ne1; + + float yl[16]; + float sumf[N_DST]={0.f}, all_sum; + + const int nb32 = nb * (QK_K / 32); + +#if QK_K == 256 + const int ix = tiisg/2; + const int il = tiisg%2; + + device const float * y4 = y + 32 * ix + 16 * il; + + for (int ib32 = ix; ib32 < nb32; ib32 += 16) { + + for (int i = 0; i < 16; ++i) { + yl[i] = y4[i]; + } + + const int ibl = ib32 / (QK_K / 32); + const int ib = ib32 % (QK_K / 32); + + device const block_iq1_s * xr = x + ibl; + device const uint8_t * qs = xr->qs + 4 * ib + 2 * il; + device const uint8_t * sc = xr->scales + 2 * ib + il; + device const half * dh = &xr->d; + + for (int row = 0; row < N_DST; row++) { + + constant int8_t * grid1 = (constant int8_t *)(iq1s_grid + (qs[0] | ((sc[0] & 0x08) << 5))); + constant int8_t * grid2 = (constant int8_t *)(iq1s_grid + (qs[1] | ((sc[0] & 0x80) << 1))); + + float2 sum = {0}; + for (int j = 0; j < 8; ++j) { + sum[0] += yl[j+ 0] * grid1[j]; + sum[1] += yl[j+ 8] * grid2[j]; + } + sumf[row] += (float)dh[0] * (sum[0] * (2*(sc[0] & 7) + 1) + sum[1] * (2*((sc[0] >> 4) & 7) + 1)); + + dh += nb*sizeof(block_iq1_s)/2; + qs += nb*sizeof(block_iq1_s); + sc += nb*sizeof(block_iq1_s); + } + + y4 += 16 * 32; + } +#else + (void) x; + (void) y; + (void) yl; + (void) nb32; +#endif + + for (int row = 0; row < N_DST; ++row) { + all_sum = simd_sum(sumf[row]); + if (tiisg == 0) { + dst[r1*ne0 + im*ne0*ne1 + first_row + row] = all_sum; + } + } +} + +constexpr constant static float kvalues_iq4nl_f[16] = { + -127.f, -104.f, -83.f, -65.f, -49.f, -35.f, -22.f, -10.f, 1.f, 13.f, 25.f, 38.f, 53.f, 69.f, 89.f, 113.f +}; + +void kernel_mul_mv_iq4_nl_f32_impl( + device const void * src0, + device const float * src1, + device float * dst, + constant int64_t & ne00, + constant int64_t & ne01, + constant int64_t & ne02, + constant int64_t & ne10, + constant int64_t & ne12, + constant int64_t & ne0, + constant int64_t & ne1, + constant uint & r2, + constant uint & r3, + threadgroup float * shared_values [[threadgroup(0)]], + uint3 tgpig[[threadgroup_position_in_grid]], + uint tiisg[[thread_index_in_simdgroup]], + uint sgitg[[simdgroup_index_in_threadgroup]]) { + + const int nb = ne00/QK4_NL; + const int r0 = tgpig.x; + const int r1 = tgpig.y; + const int im = tgpig.z; + const int first_row = (r0 * 2 + sgitg) * 2; + const int ib_row = first_row * nb; + + const uint i12 = im%ne12; + const uint i13 = im/ne12; + + const uint offset0 = (i12/r2)*(nb*ne01) + (i13/r3)*(nb*ne01*ne02); + device const block_iq4_nl * x = (device const block_iq4_nl *) src0 + 
ib_row + offset0; + device const float * y = (device const float *) src1 + r1*ne10 + im*ne00*ne1; + + const int ix = tiisg/2; // 0...15 + const int it = tiisg%2; // 0 or 1 + + shared_values[tiisg] = kvalues_iq4nl_f[tiisg%16]; + threadgroup_barrier(mem_flags::mem_threadgroup); + + float4 yl[4]; + float sumf[2]={0.f}, all_sum; + + device const float * yb = y + ix * QK4_NL + it * 8; + + uint32_t aux32[2]; + thread const uint8_t * q8 = (thread const uint8_t *)aux32; + + float4 qf1, qf2; + + for (int ib = ix; ib < nb; ib += 16) { + + device const float4 * y4 = (device const float4 *)yb; + yl[0] = y4[0]; yl[1] = y4[4]; yl[2] = y4[1]; yl[3] = y4[5]; + + for (int row = 0; row < 2; ++row) { + + device const block_iq4_nl & xb = x[row*nb + ib]; + device const uint16_t * q4 = (device const uint16_t *)(xb.qs + 8*it); + + float4 acc1 = {0.f}, acc2 = {0.f}; + + aux32[0] = q4[0] | (q4[1] << 16); + aux32[1] = (aux32[0] >> 4) & 0x0f0f0f0f; + aux32[0] &= 0x0f0f0f0f; + qf1 = {shared_values[q8[0]], shared_values[q8[1]], shared_values[q8[2]], shared_values[q8[3]]}; + qf2 = {shared_values[q8[4]], shared_values[q8[5]], shared_values[q8[6]], shared_values[q8[7]]}; + acc1 += yl[0] * qf1; + acc2 += yl[1] * qf2; + + aux32[0] = q4[2] | (q4[3] << 16); + aux32[1] = (aux32[0] >> 4) & 0x0f0f0f0f; + aux32[0] &= 0x0f0f0f0f; + qf1 = {shared_values[q8[0]], shared_values[q8[1]], shared_values[q8[2]], shared_values[q8[3]]}; + qf2 = {shared_values[q8[4]], shared_values[q8[5]], shared_values[q8[6]], shared_values[q8[7]]}; + acc1 += yl[2] * qf1; + acc2 += yl[3] * qf2; + + acc1 += acc2; + + sumf[row] += (float)xb.d * (acc1[0] + acc1[1] + acc1[2] + acc1[3]); + + } + + yb += 16 * QK4_NL; + } + + for (int row = 0; row < 2; ++row) { + all_sum = simd_sum(sumf[row]); + if (tiisg == 0) { + dst[r1*ne0 + im*ne0*ne1 + first_row + row] = all_sum; + } + } +} + +[[host_name("kernel_mul_mv_iq1_s_f32")]] +kernel void kernel_mul_mv_iq1_s_f32( + device const void * src0, + device const float * src1, + device float * dst, + constant int64_t & ne00, + constant int64_t & ne01, + constant int64_t & ne02, + constant uint64_t & nb00, + constant uint64_t & nb01, + constant uint64_t & nb02, + constant int64_t & ne10, + constant int64_t & ne11, + constant int64_t & ne12, + constant uint64_t & nb10, + constant uint64_t & nb11, + constant uint64_t & nb12, + constant int64_t & ne0, + constant int64_t & ne1, + constant uint & r2, + constant uint & r3, + uint3 tgpig[[threadgroup_position_in_grid]], + uint tiisg[[thread_index_in_simdgroup]], + uint sgitg[[simdgroup_index_in_threadgroup]]) { + + kernel_mul_mv_iq1_s_f32_impl(src0, src1, dst, ne00, ne01, ne02, ne10, ne12, ne0, ne1, r2, r3, tgpig, tiisg, sgitg); +} + +[[host_name("kernel_mul_mv_iq4_nl_f32")]] +kernel void kernel_mul_mv_iq4_nl_f32( + device const void * src0, + device const float * src1, + device float * dst, + constant int64_t & ne00, + constant int64_t & ne01, + constant int64_t & ne02, + constant uint64_t & nb00, + constant uint64_t & nb01, + constant uint64_t & nb02, + constant int64_t & ne10, + constant int64_t & ne11, + constant int64_t & ne12, + constant uint64_t & nb10, + constant uint64_t & nb11, + constant uint64_t & nb12, + constant int64_t & ne0, + constant int64_t & ne1, + constant uint & r2, + constant uint & r3, + threadgroup float * shared_values [[threadgroup(0)]], + uint3 tgpig[[threadgroup_position_in_grid]], + uint tiisg[[thread_index_in_simdgroup]], + uint sgitg[[simdgroup_index_in_threadgroup]]) { + + kernel_mul_mv_iq4_nl_f32_impl(src0, src1, dst, ne00, ne01, ne02, ne10, ne12, 
ne0, ne1, r2, r3, shared_values, tgpig, tiisg, sgitg); +} + +//============================= templates and their specializations ============================= + +// NOTE: this is not dequantizing - we are simply fitting the template +template +void dequantize_f32(device const float4x4 * src, short il, thread type4x4 & reg) { + float4x4 temp = *(((device float4x4 *)src)); + for (int i = 0; i < 16; i++){ + reg[i/4][i%4] = temp[i/4][i%4]; + } +} + +template +void dequantize_f16(device const half4x4 * src, short il, thread type4x4 & reg) { + half4x4 temp = *(((device half4x4 *)src)); + for (int i = 0; i < 16; i++){ + reg[i/4][i%4] = temp[i/4][i%4]; + } +} + +template +void dequantize_q4_0(device const block_q4_0 *xb, short il, thread type4x4 & reg) { + device const uint16_t * qs = ((device const uint16_t *)xb + 1); + const float d1 = il ? (xb->d / 16.h) : xb->d; + const float d2 = d1 / 256.f; + const float md = -8.h * xb->d; + const ushort mask0 = il ? 0x00F0 : 0x000F; + const ushort mask1 = mask0 << 8; + + for (int i=0;i<8;i++) { + reg[i/2][2*(i%2)+0] = d1 * (qs[i] & mask0) + md; + reg[i/2][2*(i%2)+1] = d2 * (qs[i] & mask1) + md; + } +} + +template +void dequantize_q4_1(device const block_q4_1 *xb, short il, thread type4x4 & reg) { + device const uint16_t * qs = ((device const uint16_t *)xb + 2); + const float d1 = il ? (xb->d / 16.h) : xb->d; + const float d2 = d1 / 256.f; + const float m = xb->m; + const ushort mask0 = il ? 0x00F0 : 0x000F; + const ushort mask1 = mask0 << 8; + + for (int i=0;i<8;i++) { + reg[i/2][2*(i%2)+0] = ((qs[i] & mask0) * d1) + m; + reg[i/2][2*(i%2)+1] = ((qs[i] & mask1) * d2) + m; + } +} + +template +void dequantize_q5_0(device const block_q5_0 *xb, short il, thread type4x4 & reg) { + device const uint16_t * qs = ((device const uint16_t *)xb + 3); + const float d = xb->d; + const float md = -16.h * xb->d; + const ushort mask = il ? 0x00F0 : 0x000F; + + const uint32_t qh = *((device const uint32_t *)xb->qh); + + const int x_mv = il ? 4 : 0; + + const int gh_mv = il ? 12 : 0; + const int gh_bk = il ? 0 : 4; + + for (int i = 0; i < 8; i++) { + // extract the 5-th bits for x0 and x1 + const uint8_t xh_0 = ((qh >> (gh_mv + 2*i )) << gh_bk) & 0x10; + const uint8_t xh_1 = ((qh >> (gh_mv + 2*i+1)) << gh_bk) & 0x10; + + // combine the 4-bits from qs with the 5th bit + const int32_t x0 = ((((qs[i] ) & mask) >> x_mv) | xh_0); + const int32_t x1 = ((((qs[i] >> 8) & mask) >> x_mv) | xh_1); + + reg[i/2][2*(i%2)+0] = d * x0 + md; + reg[i/2][2*(i%2)+1] = d * x1 + md; + } +} + +template +void dequantize_q5_1(device const block_q5_1 *xb, short il, thread type4x4 & reg) { + device const uint16_t * qs = ((device const uint16_t *)xb + 4); + const float d = xb->d; + const float m = xb->m; + const ushort mask = il ? 0x00F0 : 0x000F; + + const uint32_t qh = *((device const uint32_t *)xb->qh); + + const int x_mv = il ? 4 : 0; + + const int gh_mv = il ? 12 : 0; + const int gh_bk = il ? 
0 : 4; + + for (int i = 0; i < 8; i++) { + // extract the 5-th bits for x0 and x1 + const uint8_t xh_0 = ((qh >> (gh_mv + 2*i )) << gh_bk) & 0x10; + const uint8_t xh_1 = ((qh >> (gh_mv + 2*i+1)) << gh_bk) & 0x10; + + // combine the 4-bits from qs with the 5th bit + const int32_t x0 = ((((qs[i] ) & mask) >> x_mv) | xh_0); + const int32_t x1 = ((((qs[i] >> 8) & mask) >> x_mv) | xh_1); + + reg[i/2][2*(i%2)+0] = d * x0 + m; + reg[i/2][2*(i%2)+1] = d * x1 + m; + } +} + +template +void dequantize_q8_0(device const block_q8_0 *xb, short il, thread type4x4 & reg) { + device const int8_t * qs = ((device const int8_t *)xb->qs); + const half d = xb->d; + + for (int i = 0; i < 16; i++) { + reg[i/4][i%4] = (qs[i + 16*il] * d); + } +} + +template +void dequantize_q2_K(device const block_q2_K *xb, short il, thread type4x4 & reg) { + const float d = xb->d; + const float min = xb->dmin; + device const uint8_t * q = (device const uint8_t *)xb->qs; + float dl, ml; + uint8_t sc = xb->scales[il]; + +#if QK_K == 256 + q = q + 32*(il/8) + 16*(il&1); + il = (il/2)%4; +#endif + half coef = il>1 ? (il>2 ? 1/64.h : 1/16.h) : (il>0 ? 1/4.h : 1.h); + uchar mask = il>1 ? (il>2 ? 192 : 48) : (il>0 ? 12 : 3); + dl = d * (sc & 0xF) * coef, ml = min * (sc >> 4); + for (int i = 0; i < 16; ++i) { + reg[i/4][i%4] = dl * (q[i] & mask) - ml; + } +} + +template +void dequantize_q3_K(device const block_q3_K *xb, short il, thread type4x4 & reg) { + const half d_all = xb->d; + device const uint8_t * q = (device const uint8_t *)xb->qs; + device const uint8_t * h = (device const uint8_t *)xb->hmask; + device const int8_t * scales = (device const int8_t *)xb->scales; + +#if QK_K == 256 + q = q + 32 * (il/8) + 16 * (il&1); + h = h + 16 * (il&1); + uint8_t m = 1 << (il/2); + uint16_t kmask1 = (il/4)>1 ? ((il/4)>2 ? 192 : 48) : \ + ((il/4)>0 ? 12 : 3); + uint16_t kmask2 = il/8 ? 0xF0 : 0x0F; + uint16_t scale_2 = scales[il%8], scale_1 = scales[8 + il%4]; + int16_t dl_int = (il/4)&1 ? (scale_2&kmask2) | ((scale_1&kmask1) << 2) : (scale_2&kmask2) | ((scale_1&kmask1) << 4); float dl = il<8 ? d_all * (dl_int - 32.f) : d_all * (dl_int / 16.f - 32.f); const float ml = 4.f * dl; @@ -4165,6 +4801,8 @@ void dequantize_q4_K(device const block_q4_K *xb, short il, thread type4x4 & reg const float dl = d * sc[0]; const float ml = min * sc[1]; #else + (void) get_scale_min_k4_just2; + q = q + 16 * (il&1); device const uint8_t * s = xb->scales; device const half2 * dh = (device const half2 *)xb->d; @@ -4287,6 +4925,64 @@ void dequantize_iq2_xs(device const block_iq2_xs * xb, short il, thread type4x4 } } +template +void dequantize_iq3_xxs(device const block_iq3_xxs * xb, short il, thread type4x4 & reg) { + // il is 0...15 for QK_K = 256 => index of block of 32 is il/2 + const float d = xb->d; + const int ib32 = il/2; + il = il%2; + // il = 0 or 1. il = 0 processes the first 16 quants in a block of 32, il = 1 the second 16 + device const uint8_t * q3 = xb->qs + 8*ib32; + device const uint16_t * gas = (device const uint16_t *)(xb->qs + QK_K/4) + 2*ib32; + const uint32_t aux32 = gas[0] | (gas[1] << 16); + const float dl = d * (0.5f + (aux32 >> 28)) * 0.5f; + constant uint8_t * grid1 = (constant uint8_t *)(iq3xxs_grid + q3[4*il+0]); + constant uint8_t * grid2 = (constant uint8_t *)(iq3xxs_grid + q3[4*il+1]); + uint8_t signs = ksigns_iq2xs[(aux32 >> 14*il) & 127]; + for (int i = 0; i < 4; ++i) { + reg[0][i] = dl * grid1[i] * (signs & kmask_iq2xs[i+0] ? -1.f : 1.f); + reg[1][i] = dl * grid2[i] * (signs & kmask_iq2xs[i+4] ? 
-1.f : 1.f); + } + grid1 = (constant uint8_t *)(iq3xxs_grid + q3[4*il+2]); + grid2 = (constant uint8_t *)(iq3xxs_grid + q3[4*il+3]); + signs = ksigns_iq2xs[(aux32 >> (14*il+7)) & 127]; + for (int i = 0; i < 4; ++i) { + reg[2][i] = dl * grid1[i] * (signs & kmask_iq2xs[i+0] ? -1.f : 1.f); + reg[3][i] = dl * grid2[i] * (signs & kmask_iq2xs[i+4] ? -1.f : 1.f); + } +} + +template +void dequantize_iq1_s(device const block_iq1_s * xb, short il, thread type4x4 & reg) { + // il is 0...15 for QK_K = 256 => index of block of 32 is il/2 + const float d = xb->d; + device const uint8_t * qs = xb->qs + 2*il; + device const uint8_t * sc = xb->scales + il; + const float dl1 = d * (2*(sc[0] & 7) + 1); + const float dl2 = d * (2*((sc[0] >> 4) & 7) + 1); + constant int8_t * grid1 = (constant int8_t *)(iq1s_grid + (qs[0] | ((sc[0] & 0x08) << 5))); + constant int8_t * grid2 = (constant int8_t *)(iq1s_grid + (qs[1] | ((sc[0] & 0x80) << 1))); + for (int i = 0; i < 8; ++i) { + reg[i/4+0][i%4] = dl1 * grid1[i]; + reg[i/4+2][i%4] = dl2 * grid2[i]; + } +} + +template +void dequantize_iq4_nl(device const block_iq4_nl * xb, short il, thread type4x4 & reg) { + device const uint16_t * q4 = (device const uint16_t *)xb->qs; + const float d = xb->d; + uint32_t aux32; + thread const uint8_t * q8 = (thread const uint8_t *)&aux32; + for (int i = 0; i < 4; ++i) { + aux32 = ((q4[2*i] | (q4[2*i+1] << 16)) >> 4*il) & 0x0f0f0f0f; + reg[i][0] = d * kvalues_iq4nl_f[q8[0]]; + reg[i][1] = d * kvalues_iq4nl_f[q8[1]]; + reg[i][2] = d * kvalues_iq4nl_f[q8[2]]; + reg[i][3] = d * kvalues_iq4nl_f[q8[3]]; + } +} + template kernel void kernel_get_rows( device const void * src0, @@ -4828,6 +5524,9 @@ template [[host_name("kernel_get_rows_q5_K")]] kernel get_rows_t kernel_get_rows template [[host_name("kernel_get_rows_q6_K")]] kernel get_rows_t kernel_get_rows; template [[host_name("kernel_get_rows_iq2_xxs")]] kernel get_rows_t kernel_get_rows; template [[host_name("kernel_get_rows_iq2_xs")]] kernel get_rows_t kernel_get_rows; +template [[host_name("kernel_get_rows_iq3_xxs")]] kernel get_rows_t kernel_get_rows; +template [[host_name("kernel_get_rows_iq1_s")]] kernel get_rows_t kernel_get_rows; +template [[host_name("kernel_get_rows_iq4_nl")]] kernel get_rows_t kernel_get_rows; // // matrix-matrix multiplication @@ -4866,6 +5565,9 @@ template [[host_name("kernel_mul_mm_q5_K_f32")]] kernel mat_mm_t kernel_mul_mm; template [[host_name("kernel_mul_mm_iq2_xxs_f32")]] kernel mat_mm_t kernel_mul_mm; template [[host_name("kernel_mul_mm_iq2_xs_f32")]] kernel mat_mm_t kernel_mul_mm; +template [[host_name("kernel_mul_mm_iq3_xxs_f32")]] kernel mat_mm_t kernel_mul_mm; +template [[host_name("kernel_mul_mm_iq1_s_f32")]] kernel mat_mm_t kernel_mul_mm; +template [[host_name("kernel_mul_mm_iq4_nl_f32")]] kernel mat_mm_t kernel_mul_mm; // // indirect matrix-matrix multiplication @@ -4916,6 +5618,9 @@ template [[host_name("kernel_mul_mm_id_q5_K_f32")]] kernel mat_mm_id_t kernel_mu template [[host_name("kernel_mul_mm_id_q6_K_f32")]] kernel mat_mm_id_t kernel_mul_mm_id; template [[host_name("kernel_mul_mm_id_iq2_xxs_f32")]] kernel mat_mm_id_t kernel_mul_mm_id; template [[host_name("kernel_mul_mm_id_iq2_xs_f32")]] kernel mat_mm_id_t kernel_mul_mm_id; +template [[host_name("kernel_mul_mm_id_iq3_xxs_f32")]] kernel mat_mm_id_t kernel_mul_mm_id; +template [[host_name("kernel_mul_mm_id_iq1_s_f32")]] kernel mat_mm_id_t kernel_mul_mm_id; +template [[host_name("kernel_mul_mm_id_iq4_nl_f32")]] kernel mat_mm_id_t kernel_mul_mm_id; // // matrix-vector multiplication @@ -5818,3 
+6523,196 @@ kernel void kernel_mul_mv_id_iq2_xs_f32( tiisg, sgitg); } + +[[host_name("kernel_mul_mv_id_iq3_xxs_f32")]] +kernel void kernel_mul_mv_id_iq3_xxs_f32( + device const char * ids, + device const char * src1, + device float * dst, + constant uint64_t & nbi1, + constant int64_t & ne00, + constant int64_t & ne01, + constant int64_t & ne02, + constant uint64_t & nb00, + constant uint64_t & nb01, + constant uint64_t & nb02, + constant int64_t & ne10, + constant int64_t & ne11, + constant int64_t & ne12, + constant int64_t & ne13, + constant uint64_t & nb10, + constant uint64_t & nb11, + constant uint64_t & nb12, + constant int64_t & ne0, + constant int64_t & ne1, + constant uint64_t & nb1, + constant uint & r2, + constant uint & r3, + constant int & idx, + device const char * src00, + device const char * src01, + device const char * src02, + device const char * src03, + device const char * src04, + device const char * src05, + device const char * src06, + device const char * src07, + threadgroup int8_t * shared_values [[threadgroup(0)]], + uint3 tgpig[[threadgroup_position_in_grid]], + uint tiitg[[thread_index_in_threadgroup]], + uint tiisg[[thread_index_in_simdgroup]], + uint sgitg[[simdgroup_index_in_threadgroup]]) { + device const char * src0[8] = {src00, src01, src02, src03, src04, src05, src06, src07}; + + const int64_t bid = tgpig.z/(ne12*ne13); + + tgpig.z = tgpig.z%(ne12*ne13); + + const int32_t id = ((device int32_t *) (ids + bid*nbi1))[idx]; + + kernel_mul_mv_iq3_xxs_f32_impl( + src0[id], + (device const float *) (src1 + bid*nb11), + dst + bid*ne0, + ne00, + ne01, + ne02, + ne10, + ne12, + ne0, + ne1, + r2, + r3, + shared_values, + tgpig, + tiisg, + sgitg); +} + +[[host_name("kernel_mul_mv_id_iq1_s_f32")]] +kernel void kernel_mul_mv_id_iq1_s_f32( + device const char * ids, + device const char * src1, + device float * dst, + constant uint64_t & nbi1, + constant int64_t & ne00, + constant int64_t & ne01, + constant int64_t & ne02, + constant uint64_t & nb00, + constant uint64_t & nb01, + constant uint64_t & nb02, + constant int64_t & ne10, + constant int64_t & ne11, + constant int64_t & ne12, + constant int64_t & ne13, + constant uint64_t & nb10, + constant uint64_t & nb11, + constant uint64_t & nb12, + constant int64_t & ne0, + constant int64_t & ne1, + constant uint64_t & nb1, + constant uint & r2, + constant uint & r3, + constant int & idx, + device const char * src00, + device const char * src01, + device const char * src02, + device const char * src03, + device const char * src04, + device const char * src05, + device const char * src06, + device const char * src07, + uint3 tgpig[[threadgroup_position_in_grid]], + uint tiitg[[thread_index_in_threadgroup]], + uint tiisg[[thread_index_in_simdgroup]], + uint sgitg[[simdgroup_index_in_threadgroup]]) { + device const char * src0[8] = {src00, src01, src02, src03, src04, src05, src06, src07}; + + const int64_t bid = tgpig.z/(ne12*ne13); + + tgpig.z = tgpig.z%(ne12*ne13); + + const int32_t id = ((device int32_t *) (ids + bid*nbi1))[idx]; + + kernel_mul_mv_iq1_s_f32_impl( + src0[id], + (device const float *) (src1 + bid*nb11), + dst + bid*ne0, + ne00, + ne01, + ne02, + ne10, + ne12, + ne0, + ne1, + r2, + r3, + tgpig, + tiisg, + sgitg); +} + +[[host_name("kernel_mul_mv_id_iq4_nl_f32")]] +kernel void kernel_mul_mv_id_iq4_nl_f32( + device const char * ids, + device const char * src1, + device float * dst, + constant uint64_t & nbi1, + constant int64_t & ne00, + constant int64_t & ne01, + constant int64_t & ne02, + constant uint64_t & 
nb00, + constant uint64_t & nb01, + constant uint64_t & nb02, + constant int64_t & ne10, + constant int64_t & ne11, + constant int64_t & ne12, + constant int64_t & ne13, + constant uint64_t & nb10, + constant uint64_t & nb11, + constant uint64_t & nb12, + constant int64_t & ne0, + constant int64_t & ne1, + constant uint64_t & nb1, + constant uint & r2, + constant uint & r3, + constant int & idx, + device const char * src00, + device const char * src01, + device const char * src02, + device const char * src03, + device const char * src04, + device const char * src05, + device const char * src06, + device const char * src07, + threadgroup float * shared_values [[threadgroup(0)]], + uint3 tgpig[[threadgroup_position_in_grid]], + uint tiitg[[thread_index_in_threadgroup]], + uint tiisg[[thread_index_in_simdgroup]], + uint sgitg[[simdgroup_index_in_threadgroup]]) { + device const char * src0[8] = {src00, src01, src02, src03, src04, src05, src06, src07}; + + const int64_t bid = tgpig.z/(ne12*ne13); + + tgpig.z = tgpig.z%(ne12*ne13); + + const int32_t id = ((device int32_t *) (ids + bid*nbi1))[idx]; + + kernel_mul_mv_iq4_nl_f32_impl( + src0[id], + (device const float *) (src1 + bid*nb11), + dst + bid*ne0, + ne00, + ne01, + ne02, + ne10, + ne12, + ne0, + ne1, + r2, + r3, + shared_values, + tgpig, + tiisg, + sgitg); +} diff --git a/cpp/ggml-metal.h b/cpp/ggml-metal.h index 2717e951..914dd6cf 100644 --- a/cpp/ggml-metal.h +++ b/cpp/ggml-metal.h @@ -57,6 +57,9 @@ LM_GGML_API LM_GGML_CALL lm_ggml_backend_buffer_type_t lm_ggml_backend_metal_buf // ref: https://developer.apple.com/metal/Metal-Feature-Set-Tables.pdf LM_GGML_API bool lm_ggml_backend_metal_supports_family(lm_ggml_backend_t backend, int family); +// capture all command buffers committed the next time `lm_ggml_backend_graph_compute` is called +LM_GGML_API void lm_ggml_backend_metal_capture_next_compute(lm_ggml_backend_t backend); + #ifdef __cplusplus } #endif diff --git a/cpp/ggml-metal.m b/cpp/ggml-metal.m index c51bc3a4..6a4a70dd 100644 --- a/cpp/ggml-metal.m +++ b/cpp/ggml-metal.m @@ -24,19 +24,7 @@ #define UNUSED(x) (void)(x) -#define LM_GGML_METAL_MAX_KERNELS 256 - -struct lm_ggml_metal_buffer { - const char * name; - - void * data; - size_t size; - - id metal; -}; - struct lm_ggml_metal_kernel { - id function; id pipeline; }; @@ -72,6 +60,9 @@ LM_GGML_METAL_KERNEL_TYPE_GET_ROWS_Q6_K, LM_GGML_METAL_KERNEL_TYPE_GET_ROWS_IQ2_XXS, LM_GGML_METAL_KERNEL_TYPE_GET_ROWS_IQ2_XS, + LM_GGML_METAL_KERNEL_TYPE_GET_ROWS_IQ3_XXS, + LM_GGML_METAL_KERNEL_TYPE_GET_ROWS_IQ1_S, + LM_GGML_METAL_KERNEL_TYPE_GET_ROWS_IQ4_NL, LM_GGML_METAL_KERNEL_TYPE_GET_ROWS_I32, LM_GGML_METAL_KERNEL_TYPE_RMS_NORM, LM_GGML_METAL_KERNEL_TYPE_GROUP_NORM, @@ -93,6 +84,9 @@ LM_GGML_METAL_KERNEL_TYPE_MUL_MV_Q6_K_F32, LM_GGML_METAL_KERNEL_TYPE_MUL_MV_IQ2_XXS_F32, LM_GGML_METAL_KERNEL_TYPE_MUL_MV_IQ2_XS_F32, + LM_GGML_METAL_KERNEL_TYPE_MUL_MV_IQ3_XXS_F32, + LM_GGML_METAL_KERNEL_TYPE_MUL_MV_IQ1_S_F32, + LM_GGML_METAL_KERNEL_TYPE_MUL_MV_IQ4_NL_F32, LM_GGML_METAL_KERNEL_TYPE_MUL_MV_ID_F32_F32, //LM_GGML_METAL_KERNEL_TYPE_MUL_MV_ID_F16_F16, LM_GGML_METAL_KERNEL_TYPE_MUL_MV_ID_F16_F32, @@ -110,6 +104,9 @@ LM_GGML_METAL_KERNEL_TYPE_MUL_MV_ID_Q6_K_F32, LM_GGML_METAL_KERNEL_TYPE_MUL_MV_ID_IQ2_XXS_F32, LM_GGML_METAL_KERNEL_TYPE_MUL_MV_ID_IQ2_XS_F32, + LM_GGML_METAL_KERNEL_TYPE_MUL_MV_ID_IQ3_XXS_F32, + LM_GGML_METAL_KERNEL_TYPE_MUL_MV_ID_IQ1_S_F32, + LM_GGML_METAL_KERNEL_TYPE_MUL_MV_ID_IQ4_NL_F32, LM_GGML_METAL_KERNEL_TYPE_MUL_MM_F32_F32, LM_GGML_METAL_KERNEL_TYPE_MUL_MM_F16_F32, 
LM_GGML_METAL_KERNEL_TYPE_MUL_MM_Q4_0_F32, @@ -124,6 +121,9 @@ LM_GGML_METAL_KERNEL_TYPE_MUL_MM_Q6_K_F32, LM_GGML_METAL_KERNEL_TYPE_MUL_MM_IQ2_XXS_F32, LM_GGML_METAL_KERNEL_TYPE_MUL_MM_IQ2_XS_F32, + LM_GGML_METAL_KERNEL_TYPE_MUL_MM_IQ3_XXS_F32, + LM_GGML_METAL_KERNEL_TYPE_MUL_MM_IQ1_S_F32, + LM_GGML_METAL_KERNEL_TYPE_MUL_MM_IQ4_NL_F32, LM_GGML_METAL_KERNEL_TYPE_MUL_MM_ID_F32_F32, LM_GGML_METAL_KERNEL_TYPE_MUL_MM_ID_F16_F32, LM_GGML_METAL_KERNEL_TYPE_MUL_MM_ID_Q4_0_F32, @@ -138,10 +138,14 @@ LM_GGML_METAL_KERNEL_TYPE_MUL_MM_ID_Q6_K_F32, LM_GGML_METAL_KERNEL_TYPE_MUL_MM_ID_IQ2_XXS_F32, LM_GGML_METAL_KERNEL_TYPE_MUL_MM_ID_IQ2_XS_F32, + LM_GGML_METAL_KERNEL_TYPE_MUL_MM_ID_IQ3_XXS_F32, + LM_GGML_METAL_KERNEL_TYPE_MUL_MM_ID_IQ1_S_F32, + LM_GGML_METAL_KERNEL_TYPE_MUL_MM_ID_IQ4_NL_F32, LM_GGML_METAL_KERNEL_TYPE_ROPE_F32, LM_GGML_METAL_KERNEL_TYPE_ROPE_F16, LM_GGML_METAL_KERNEL_TYPE_ALIBI_F32, LM_GGML_METAL_KERNEL_TYPE_IM2COL_F16, + LM_GGML_METAL_KERNEL_TYPE_IM2COL_F32, LM_GGML_METAL_KERNEL_TYPE_UPSCALE_F32, LM_GGML_METAL_KERNEL_TYPE_PAD_F32, LM_GGML_METAL_KERNEL_TYPE_ARGSORT_F32_I32_ASC, @@ -168,23 +172,21 @@ id device; id queue; - id library; dispatch_queue_t d_queue; - int n_buffers; - struct lm_ggml_metal_buffer buffers[LM_GGML_METAL_MAX_BUFFERS]; - - struct lm_ggml_metal_kernel kernels[LM_GGML_METAL_MAX_KERNELS]; + struct lm_ggml_metal_kernel kernels[LM_GGML_METAL_KERNEL_TYPE_COUNT]; bool support_simdgroup_reduction; bool support_simdgroup_mm; + + bool should_capture_next_compute; }; // MSL code // TODO: move the contents here when ready // for now it is easier to work in a separate file -//static NSString * const msl_library_source = @"see metal.metal"; +// static NSString * const msl_library_source = @"see metal.metal"; // Here to assist with NSBundle Path Hack @interface LMGGMLMetalClass : NSObject @@ -242,26 +244,24 @@ static void lm_ggml_metal_log(enum lm_ggml_log_level level, const char * format, // Show all the Metal device instances in the system NSArray * devices = MTLCopyAllDevices(); for (id device in devices) { - NSString * s = [device name]; - LM_GGML_METAL_LOG_INFO("%s: found device: %s\n", __func__, [s UTF8String]); + LM_GGML_METAL_LOG_INFO("%s: found device: %s\n", __func__, [[device name] UTF8String]); } [devices release]; // since it was created by a *Copy* C method #endif // Pick and show default Metal device id device = MTLCreateSystemDefaultDevice(); - NSString * s = [device name]; - LM_GGML_METAL_LOG_INFO("%s: picking default device: %s\n", __func__, [s UTF8String]); + LM_GGML_METAL_LOG_INFO("%s: picking default device: %s\n", __func__, [[device name] UTF8String]); // Configure context struct lm_ggml_metal_context * ctx = malloc(sizeof(struct lm_ggml_metal_context)); ctx->device = device; ctx->n_cb = MIN(n_cb, LM_GGML_METAL_MAX_BUFFERS); ctx->queue = [ctx->device newCommandQueue]; - ctx->n_buffers = 0; - ctx->d_queue = dispatch_queue_create("ggml-metal", DISPATCH_QUEUE_CONCURRENT); + id metal_library; + // load library { NSBundle * bundle = nil; @@ -276,8 +276,20 @@ static void lm_ggml_metal_log(enum lm_ggml_log_level level, const char * format, // pre-compiled library found NSURL * libURL = [NSURL fileURLWithPath:libPath]; LM_GGML_METAL_LOG_INFO("%s: loading '%s'\n", __func__, [libPath UTF8String]); - ctx->library = [ctx->device newLibraryWithURL:libURL error:&error]; + metal_library = [ctx->device newLibraryWithURL:libURL error:&error]; + if (error) { + LM_GGML_METAL_LOG_ERROR("%s: error: %s\n", __func__, [[error description] UTF8String]); + return NULL; + } } else { 
+#if LM_GGML_METAL_EMBED_LIBRARY + LM_GGML_METAL_LOG_INFO("%s: using embedded metal library\n", __func__); + + extern const char lm_ggml_metallib_start[]; + extern const char lm_ggml_metallib_end[]; + + NSString * src = [[NSString alloc] initWithBytes:lm_ggml_metallib_start length:(lm_ggml_metallib_end-lm_ggml_metallib_start) encoding:NSUTF8StringEncoding]; +#else LM_GGML_METAL_LOG_INFO("%s: default.metallib not found, loading from source\n", __func__); NSString * sourcePath; @@ -300,6 +312,7 @@ static void lm_ggml_metal_log(enum lm_ggml_log_level level, const char * format, LM_GGML_METAL_LOG_ERROR("%s: error: %s\n", __func__, [[error description] UTF8String]); return NULL; } +#endif @autoreleasepool { // dictionary of preprocessor macros @@ -314,14 +327,13 @@ static void lm_ggml_metal_log(enum lm_ggml_log_level level, const char * format, //[options setFastMathEnabled:false]; - ctx->library = [ctx->device newLibraryWithSource:src options:options error:&error]; + metal_library = [ctx->device newLibraryWithSource:src options:options error:&error]; + if (error) { + LM_GGML_METAL_LOG_ERROR("%s: error: %s\n", __func__, [[error description] UTF8String]); + return NULL; + } } } - - if (error) { - LM_GGML_METAL_LOG_ERROR("%s: error: %s\n", __func__, [[error description] UTF8String]); - return NULL; - } } // print MTL GPU family: @@ -364,6 +376,8 @@ static void lm_ggml_metal_log(enum lm_ggml_log_level level, const char * format, LM_GGML_METAL_LOG_INFO("%s: simdgroup matrix mul. support = %s\n", __func__, ctx->support_simdgroup_mm ? "true" : "false"); LM_GGML_METAL_LOG_INFO("%s: hasUnifiedMemory = %s\n", __func__, ctx->device.hasUnifiedMemory ? "true" : "false"); + ctx->should_capture_next_compute = false; + #if TARGET_OS_OSX || (TARGET_OS_IOS && __clang_major__ >= 15) if (@available(macOS 10.12, iOS 16.0, *)) { LM_GGML_METAL_LOG_INFO("%s: recommendedMaxWorkingSetSize = %8.2f MB\n", __func__, ctx->device.recommendedMaxWorkingSetSize / 1e6); @@ -380,8 +394,7 @@ static void lm_ggml_metal_log(enum lm_ggml_log_level level, const char * format, { NSError * error = nil; - for (int i = 0; i < LM_GGML_METAL_MAX_KERNELS; ++i) { - ctx->kernels[i].function = nil; + for (int i = 0; i < LM_GGML_METAL_KERNEL_TYPE_COUNT; ++i) { ctx->kernels[i].pipeline = nil; } @@ -393,10 +406,12 @@ static void lm_ggml_metal_log(enum lm_ggml_log_level level, const char * format, #define LM_GGML_METAL_ADD_KERNEL(e, name, supported) \ if (supported) { \ struct lm_ggml_metal_kernel * kernel = &ctx->kernels[e]; \ - kernel->function = [ctx->library newFunctionWithName:@"kernel_"#name]; \ - kernel->pipeline = [ctx->device newComputePipelineStateWithFunction:kernel->function error:&error]; \ + id metal_function = [metal_library newFunctionWithName:@"kernel_"#name]; \ + kernel->pipeline = [ctx->device newComputePipelineStateWithFunction:metal_function error:&error]; \ + [metal_function release]; \ if (error) { \ LM_GGML_METAL_LOG_ERROR("%s: error: load pipeline error: %s\n", __func__, [[error description] UTF8String]); \ + [metal_library release]; \ return NULL; \ } \ } else { \ @@ -436,6 +451,9 @@ static void lm_ggml_metal_log(enum lm_ggml_log_level level, const char * format, LM_GGML_METAL_ADD_KERNEL(LM_GGML_METAL_KERNEL_TYPE_GET_ROWS_Q6_K, get_rows_q6_K, true); LM_GGML_METAL_ADD_KERNEL(LM_GGML_METAL_KERNEL_TYPE_GET_ROWS_IQ2_XXS, get_rows_iq2_xxs, true); LM_GGML_METAL_ADD_KERNEL(LM_GGML_METAL_KERNEL_TYPE_GET_ROWS_IQ2_XS, get_rows_iq2_xs, true); + LM_GGML_METAL_ADD_KERNEL(LM_GGML_METAL_KERNEL_TYPE_GET_ROWS_IQ3_XXS, get_rows_iq3_xxs, 
true); + LM_GGML_METAL_ADD_KERNEL(LM_GGML_METAL_KERNEL_TYPE_GET_ROWS_IQ1_S, get_rows_iq1_s, true); + LM_GGML_METAL_ADD_KERNEL(LM_GGML_METAL_KERNEL_TYPE_GET_ROWS_IQ4_NL, get_rows_iq4_nl, true); LM_GGML_METAL_ADD_KERNEL(LM_GGML_METAL_KERNEL_TYPE_GET_ROWS_I32, get_rows_i32, true); LM_GGML_METAL_ADD_KERNEL(LM_GGML_METAL_KERNEL_TYPE_RMS_NORM, rms_norm, ctx->support_simdgroup_reduction); LM_GGML_METAL_ADD_KERNEL(LM_GGML_METAL_KERNEL_TYPE_GROUP_NORM, group_norm, ctx->support_simdgroup_reduction); @@ -457,6 +475,9 @@ static void lm_ggml_metal_log(enum lm_ggml_log_level level, const char * format, LM_GGML_METAL_ADD_KERNEL(LM_GGML_METAL_KERNEL_TYPE_MUL_MV_Q6_K_F32, mul_mv_q6_K_f32, ctx->support_simdgroup_reduction); LM_GGML_METAL_ADD_KERNEL(LM_GGML_METAL_KERNEL_TYPE_MUL_MV_IQ2_XXS_F32, mul_mv_iq2_xxs_f32, ctx->support_simdgroup_reduction); LM_GGML_METAL_ADD_KERNEL(LM_GGML_METAL_KERNEL_TYPE_MUL_MV_IQ2_XS_F32, mul_mv_iq2_xs_f32, ctx->support_simdgroup_reduction); + LM_GGML_METAL_ADD_KERNEL(LM_GGML_METAL_KERNEL_TYPE_MUL_MV_IQ3_XXS_F32, mul_mv_iq3_xxs_f32, ctx->support_simdgroup_reduction); + LM_GGML_METAL_ADD_KERNEL(LM_GGML_METAL_KERNEL_TYPE_MUL_MV_IQ1_S_F32, mul_mv_iq1_s_f32, ctx->support_simdgroup_reduction); + LM_GGML_METAL_ADD_KERNEL(LM_GGML_METAL_KERNEL_TYPE_MUL_MV_IQ4_NL_F32, mul_mv_iq4_nl_f32, ctx->support_simdgroup_reduction); LM_GGML_METAL_ADD_KERNEL(LM_GGML_METAL_KERNEL_TYPE_MUL_MV_ID_F32_F32, mul_mv_id_f32_f32, ctx->support_simdgroup_reduction); //LM_GGML_METAL_ADD_KERNEL(LM_GGML_METAL_KERNEL_TYPE_MUL_MV_ID_F16_F16, mul_mv_id_f16_f16, ctx->support_simdgroup_reduction); LM_GGML_METAL_ADD_KERNEL(LM_GGML_METAL_KERNEL_TYPE_MUL_MV_ID_F16_F32, mul_mv_id_f16_f32, ctx->support_simdgroup_reduction); @@ -474,6 +495,9 @@ static void lm_ggml_metal_log(enum lm_ggml_log_level level, const char * format, LM_GGML_METAL_ADD_KERNEL(LM_GGML_METAL_KERNEL_TYPE_MUL_MV_ID_Q6_K_F32, mul_mv_id_q6_K_f32, ctx->support_simdgroup_reduction); LM_GGML_METAL_ADD_KERNEL(LM_GGML_METAL_KERNEL_TYPE_MUL_MV_ID_IQ2_XXS_F32, mul_mv_id_iq2_xxs_f32, ctx->support_simdgroup_reduction); LM_GGML_METAL_ADD_KERNEL(LM_GGML_METAL_KERNEL_TYPE_MUL_MV_ID_IQ2_XS_F32, mul_mv_id_iq2_xs_f32, ctx->support_simdgroup_reduction); + LM_GGML_METAL_ADD_KERNEL(LM_GGML_METAL_KERNEL_TYPE_MUL_MV_ID_IQ3_XXS_F32, mul_mv_id_iq3_xxs_f32, ctx->support_simdgroup_reduction); + LM_GGML_METAL_ADD_KERNEL(LM_GGML_METAL_KERNEL_TYPE_MUL_MV_ID_IQ1_S_F32, mul_mv_id_iq1_s_f32, ctx->support_simdgroup_reduction); + LM_GGML_METAL_ADD_KERNEL(LM_GGML_METAL_KERNEL_TYPE_MUL_MV_ID_IQ4_NL_F32, mul_mv_id_iq4_nl_f32, ctx->support_simdgroup_reduction); LM_GGML_METAL_ADD_KERNEL(LM_GGML_METAL_KERNEL_TYPE_MUL_MM_F32_F32, mul_mm_f32_f32, ctx->support_simdgroup_mm); LM_GGML_METAL_ADD_KERNEL(LM_GGML_METAL_KERNEL_TYPE_MUL_MM_F16_F32, mul_mm_f16_f32, ctx->support_simdgroup_mm); LM_GGML_METAL_ADD_KERNEL(LM_GGML_METAL_KERNEL_TYPE_MUL_MM_Q4_0_F32, mul_mm_q4_0_f32, ctx->support_simdgroup_mm); @@ -488,6 +512,9 @@ static void lm_ggml_metal_log(enum lm_ggml_log_level level, const char * format, LM_GGML_METAL_ADD_KERNEL(LM_GGML_METAL_KERNEL_TYPE_MUL_MM_Q6_K_F32, mul_mm_q6_K_f32, ctx->support_simdgroup_mm); LM_GGML_METAL_ADD_KERNEL(LM_GGML_METAL_KERNEL_TYPE_MUL_MM_IQ2_XXS_F32, mul_mm_iq2_xxs_f32, ctx->support_simdgroup_mm); LM_GGML_METAL_ADD_KERNEL(LM_GGML_METAL_KERNEL_TYPE_MUL_MM_IQ2_XS_F32, mul_mm_iq2_xs_f32, ctx->support_simdgroup_mm); + LM_GGML_METAL_ADD_KERNEL(LM_GGML_METAL_KERNEL_TYPE_MUL_MM_IQ3_XXS_F32, mul_mm_iq3_xxs_f32, ctx->support_simdgroup_mm); + 
LM_GGML_METAL_ADD_KERNEL(LM_GGML_METAL_KERNEL_TYPE_MUL_MM_IQ1_S_F32, mul_mm_iq1_s_f32, ctx->support_simdgroup_mm); + LM_GGML_METAL_ADD_KERNEL(LM_GGML_METAL_KERNEL_TYPE_MUL_MM_IQ4_NL_F32, mul_mm_iq4_nl_f32, ctx->support_simdgroup_mm); LM_GGML_METAL_ADD_KERNEL(LM_GGML_METAL_KERNEL_TYPE_MUL_MM_ID_F32_F32, mul_mm_id_f32_f32, ctx->support_simdgroup_mm); LM_GGML_METAL_ADD_KERNEL(LM_GGML_METAL_KERNEL_TYPE_MUL_MM_ID_F16_F32, mul_mm_id_f16_f32, ctx->support_simdgroup_mm); LM_GGML_METAL_ADD_KERNEL(LM_GGML_METAL_KERNEL_TYPE_MUL_MM_ID_Q4_0_F32, mul_mm_id_q4_0_f32, ctx->support_simdgroup_mm); @@ -502,10 +529,14 @@ static void lm_ggml_metal_log(enum lm_ggml_log_level level, const char * format, LM_GGML_METAL_ADD_KERNEL(LM_GGML_METAL_KERNEL_TYPE_MUL_MM_ID_Q6_K_F32, mul_mm_id_q6_K_f32, ctx->support_simdgroup_mm); LM_GGML_METAL_ADD_KERNEL(LM_GGML_METAL_KERNEL_TYPE_MUL_MM_ID_IQ2_XXS_F32, mul_mm_id_iq2_xxs_f32, ctx->support_simdgroup_mm); LM_GGML_METAL_ADD_KERNEL(LM_GGML_METAL_KERNEL_TYPE_MUL_MM_ID_IQ2_XS_F32, mul_mm_id_iq2_xs_f32, ctx->support_simdgroup_mm); + LM_GGML_METAL_ADD_KERNEL(LM_GGML_METAL_KERNEL_TYPE_MUL_MM_ID_IQ3_XXS_F32, mul_mm_id_iq3_xxs_f32, ctx->support_simdgroup_mm); + LM_GGML_METAL_ADD_KERNEL(LM_GGML_METAL_KERNEL_TYPE_MUL_MM_ID_IQ1_S_F32, mul_mm_id_iq1_s_f32, ctx->support_simdgroup_mm); + LM_GGML_METAL_ADD_KERNEL(LM_GGML_METAL_KERNEL_TYPE_MUL_MM_ID_IQ4_NL_F32, mul_mm_id_iq4_nl_f32, ctx->support_simdgroup_mm); LM_GGML_METAL_ADD_KERNEL(LM_GGML_METAL_KERNEL_TYPE_ROPE_F32, rope_f32, true); LM_GGML_METAL_ADD_KERNEL(LM_GGML_METAL_KERNEL_TYPE_ROPE_F16, rope_f16, true); LM_GGML_METAL_ADD_KERNEL(LM_GGML_METAL_KERNEL_TYPE_ALIBI_F32, alibi_f32, true); LM_GGML_METAL_ADD_KERNEL(LM_GGML_METAL_KERNEL_TYPE_IM2COL_F16, im2col_f16, true); + LM_GGML_METAL_ADD_KERNEL(LM_GGML_METAL_KERNEL_TYPE_IM2COL_F32, im2col_f32, true); LM_GGML_METAL_ADD_KERNEL(LM_GGML_METAL_KERNEL_TYPE_UPSCALE_F32, upscale_f32, true); LM_GGML_METAL_ADD_KERNEL(LM_GGML_METAL_KERNEL_TYPE_PAD_F32, pad_f32, true); LM_GGML_METAL_ADD_KERNEL(LM_GGML_METAL_KERNEL_TYPE_ARGSORT_F32_I32_ASC, argsort_f32_i32_asc, true); @@ -525,27 +556,17 @@ static void lm_ggml_metal_log(enum lm_ggml_log_level level, const char * format, LM_GGML_METAL_ADD_KERNEL(LM_GGML_METAL_KERNEL_TYPE_SUM_ROWS, sum_rows, true); } + [metal_library release]; return ctx; } static void lm_ggml_metal_free(struct lm_ggml_metal_context * ctx) { LM_GGML_METAL_LOG_INFO("%s: deallocating\n", __func__); - for (int i = 0; i < ctx->n_buffers; ++i) { - [ctx->buffers[i].metal release]; - } - - for (int i = 0; i < LM_GGML_METAL_MAX_KERNELS; ++i) { - if (ctx->kernels[i].pipeline) { - [ctx->kernels[i].pipeline release]; - } - - if (ctx->kernels[i].function) { - [ctx->kernels[i].function release]; - } + for (int i = 0; i < LM_GGML_METAL_KERNEL_TYPE_COUNT; ++i) { + [ctx->kernels[i].pipeline release]; } - [ctx->library release]; [ctx->queue release]; [ctx->device release]; @@ -577,51 +598,30 @@ static void lm_ggml_metal_free(struct lm_ggml_metal_context * ctx) { // the assumption is that there is 1-to-1 mapping between the host and device memory buffers, so we can find the // Metal buffer based on the host memory pointer // -static id lm_ggml_metal_get_buffer(struct lm_ggml_metal_context * ctx, struct lm_ggml_tensor * t, size_t * offs) { +static id lm_ggml_metal_get_buffer(struct lm_ggml_tensor * t, size_t * offs) { //LM_GGML_METAL_LOG_INFO("%s: data tensor '%16s', offs_data = %8ld, offs_eval = %8ld, offs_cach = %8ld\n", __func__, t->name, offs_data, offs_eval, offs_cach); const int64_t tsize = 
lm_ggml_nbytes(t); lm_ggml_backend_buffer_t buffer = t->view_src ? t->view_src->buffer : t->buffer; - // compatibility with ggml-backend - if (buffer && buffer->buft == lm_ggml_backend_metal_buffer_type()) { - struct lm_ggml_backend_metal_buffer_context * buf_ctx = (struct lm_ggml_backend_metal_buffer_context *) buffer->context; - - // find the view that contains the tensor fully - for (int i = 0; i < buf_ctx->n_buffers; ++i) { - const int64_t ioffs = (int64_t) t->data - (int64_t) buf_ctx->buffers[i].data; - - //LM_GGML_METAL_LOG_INFO("ioffs = %10ld, tsize = %10ld, sum = %10ld, buf_ctx->buffers[%d].size = %10ld\n", ioffs, tsize, ioffs + tsize, i, buf_ctx->buffers[i].size); - if (ioffs >= 0 && ioffs + tsize <= (int64_t) buf_ctx->buffers[i].size) { - *offs = (size_t) ioffs; - - //LM_GGML_METAL_LOG_INFO("%s: tensor '%16s', offs = %8ld\n", __func__, t->name, *offs); - - return buf_ctx->buffers[i].metal; - } - } - - LM_GGML_METAL_LOG_ERROR("%s: error: tensor '%s' buffer is nil\n", __func__, t->name); - - return nil; - } + struct lm_ggml_backend_metal_buffer_context * buf_ctx = (struct lm_ggml_backend_metal_buffer_context *) buffer->context; // find the view that contains the tensor fully - for (int i = 0; i < ctx->n_buffers; ++i) { - const int64_t ioffs = (int64_t) t->data - (int64_t) ctx->buffers[i].data; + for (int i = 0; i < buf_ctx->n_buffers; ++i) { + const int64_t ioffs = (int64_t) t->data - (int64_t) buf_ctx->buffers[i].data; - //LM_GGML_METAL_LOG_INFO("ioffs = %10ld, tsize = %10ld, sum = %10ld, ctx->buffers[%d].size = %10ld, name = %s\n", ioffs, tsize, ioffs + tsize, i, ctx->buffers[i].size, ctx->buffers[i].name); - if (ioffs >= 0 && ioffs + tsize <= (int64_t) ctx->buffers[i].size) { + //LM_GGML_METAL_LOG_INFO("ioffs = %10ld, tsize = %10ld, sum = %10ld, buf_ctx->buffers[%d].size = %10ld\n", ioffs, tsize, ioffs + tsize, i, buf_ctx->buffers[i].size); + if (ioffs >= 0 && ioffs + tsize <= (int64_t) buf_ctx->buffers[i].size) { *offs = (size_t) ioffs; - //LM_GGML_METAL_LOG_INFO("%s: '%s' tensor '%16s', offs = %8ld\n", __func__, ctx->buffers[i].name, t->name, *offs); + //LM_GGML_METAL_LOG_INFO("%s: tensor '%16s', offs = %8ld\n", __func__, t->name, *offs); - return ctx->buffers[i].metal; + return buf_ctx->buffers[i].metal; } } - LM_GGML_METAL_LOG_ERROR("%s: error: buffer is nil\n", __func__); + LM_GGML_METAL_LOG_ERROR("%s: error: tensor '%s' buffer is nil\n", __func__, t->name); return nil; } @@ -661,6 +661,10 @@ static bool lm_ggml_metal_supports_op(const struct lm_ggml_metal_context * ctx, case LM_GGML_OP_ALIBI: case LM_GGML_OP_ROPE: case LM_GGML_OP_IM2COL: + return true; + case LM_GGML_OP_POOL_1D: + case LM_GGML_OP_POOL_2D: + return false; case LM_GGML_OP_UPSCALE: case LM_GGML_OP_PAD: case LM_GGML_OP_ARGSORT: @@ -668,7 +672,8 @@ static bool lm_ggml_metal_supports_op(const struct lm_ggml_metal_context * ctx, return true; case LM_GGML_OP_MUL_MAT: case LM_GGML_OP_MUL_MAT_ID: - return ctx->support_simdgroup_reduction; + return ctx->support_simdgroup_reduction && + (op->src[0]->type != LM_GGML_TYPE_F32 || op->src[1]->type == LM_GGML_TYPE_F32); case LM_GGML_OP_CPY: case LM_GGML_OP_DUP: case LM_GGML_OP_CONT: @@ -711,6 +716,7 @@ static bool lm_ggml_metal_graph_compute( struct lm_ggml_metal_context * ctx, struct lm_ggml_cgraph * gf) { + @autoreleasepool { MTLComputePassDescriptor * edesc = MTLComputePassDescriptor.computePassDescriptor; edesc.dispatchType = MTLDispatchTypeSerial; @@ -721,6 +727,20 @@ static bool lm_ggml_metal_graph_compute( const int n_cb = ctx->n_cb; const int n_nodes_per_cb = 
(n_nodes + n_cb - 1) / n_cb; + const bool should_capture = ctx->should_capture_next_compute; + if (should_capture) { + ctx->should_capture_next_compute = false; + + MTLCaptureDescriptor * descriptor = [MTLCaptureDescriptor new]; + descriptor.captureObject = ctx->queue; + + NSError * error = nil; + if (![[MTLCaptureManager sharedCaptureManager] startCaptureWithDescriptor:descriptor error:&error]) { + LM_GGML_METAL_LOG_ERROR("%s: error: unable to start capture '%s'\n", __func__, [[error localizedDescription] UTF8String]); + LM_GGML_ASSERT(!"capture failed"); + } + } + id command_buffer_builder[n_cb]; for (int cb_idx = 0; cb_idx < n_cb; ++cb_idx) { id command_buffer = [ctx->queue commandBufferWithUnretainedReferences]; @@ -729,6 +749,7 @@ static bool lm_ggml_metal_graph_compute( // enqueue the command buffers in order to specify their execution order [command_buffer enqueue]; } + const id *command_buffers = command_buffer_builder; dispatch_apply(n_cb, ctx->d_queue, ^(size_t iter) { @@ -736,6 +757,7 @@ static bool lm_ggml_metal_graph_compute( size_t offs_src0 = 0; size_t offs_src1 = 0; + size_t offs_src2 = 0; size_t offs_dst = 0; id command_buffer = command_buffers[cb_idx]; @@ -754,6 +776,7 @@ static bool lm_ggml_metal_graph_compute( struct lm_ggml_tensor * src0 = gf->nodes[i]->src[0]; struct lm_ggml_tensor * src1 = gf->nodes[i]->src[1]; + struct lm_ggml_tensor * src2 = gf->nodes[i]->src[2]; struct lm_ggml_tensor * dst = gf->nodes[i]; switch (dst->op) { @@ -775,9 +798,9 @@ static bool lm_ggml_metal_graph_compute( LM_GGML_ASSERT(!"unsupported op"); } -#ifndef LM_GGML_METAL_NDEBUG - [encoder pushDebugGroup:[NSString stringWithCString:lm_ggml_op_desc(dst) encoding:NSUTF8StringEncoding]]; -#endif + if (should_capture) { + [encoder pushDebugGroup:[NSString stringWithCString:lm_ggml_op_desc(dst) encoding:NSUTF8StringEncoding]]; + } const int64_t ne00 = src0 ? src0->ne[0] : 0; const int64_t ne01 = src0 ? src0->ne[1] : 0; @@ -813,9 +836,10 @@ static bool lm_ggml_metal_graph_compute( const enum lm_ggml_type src1t = src1 ? src1->type : LM_GGML_TYPE_COUNT; const enum lm_ggml_type dstt = dst ? dst->type : LM_GGML_TYPE_COUNT; - id id_src0 = src0 ? lm_ggml_metal_get_buffer(ctx, src0, &offs_src0) : nil; - id id_src1 = src1 ? lm_ggml_metal_get_buffer(ctx, src1, &offs_src1) : nil; - id id_dst = dst ? lm_ggml_metal_get_buffer(ctx, dst, &offs_dst) : nil; + id id_src0 = src0 ? lm_ggml_metal_get_buffer(src0, &offs_src0) : nil; + id id_src1 = src1 ? lm_ggml_metal_get_buffer(src1, &offs_src1) : nil; + id id_src2 = src2 ? lm_ggml_metal_get_buffer(src2, &offs_src2) : nil; + id id_dst = dst ? 
lm_ggml_metal_get_buffer(dst, &offs_dst) : nil; //LM_GGML_METAL_LOG_INFO("%s: op - %s\n", __func__, lm_ggml_op_name(dst->op)); //if (src0) { @@ -1196,7 +1220,16 @@ static bool lm_ggml_metal_graph_compute( pipeline = ctx->kernels[LM_GGML_METAL_KERNEL_TYPE_SOFT_MAX].pipeline; } - const float scale = ((float *) dst->op_params)[0]; + const float scale = ((float *) dst->op_params)[0]; + const float max_bias = ((float *) dst->op_params)[1]; + + const int64_t nrows_x = lm_ggml_nrows(src0); + const int64_t nrows_y = src0->ne[1]; + const uint32_t n_head_kv = nrows_x/nrows_y; + const uint32_t n_head_log2 = 1u << (uint32_t) floorf(log2f((float) n_head_kv)); + + const float m0 = powf(2.0f, -(max_bias ) / n_head_log2); + const float m1 = powf(2.0f, -(max_bias / 2.0f) / n_head_log2); [encoder setComputePipelineState:pipeline]; [encoder setBuffer:id_src0 offset:offs_src0 atIndex:0]; @@ -1205,11 +1238,20 @@ static bool lm_ggml_metal_graph_compute( } else { [encoder setBuffer:id_src0 offset:offs_src0 atIndex:1]; } - [encoder setBuffer:id_dst offset:offs_dst atIndex:2]; - [encoder setBytes:&ne00 length:sizeof(ne00) atIndex:3]; - [encoder setBytes:&ne01 length:sizeof(ne01) atIndex:4]; - [encoder setBytes:&ne02 length:sizeof(ne02) atIndex:5]; - [encoder setBytes:&scale length:sizeof(scale) atIndex:6]; + if (id_src2) { + [encoder setBuffer:id_src2 offset:offs_src2 atIndex:2]; + } else { + [encoder setBuffer:id_src0 offset:offs_src0 atIndex:2]; + } + [encoder setBuffer:id_dst offset:offs_dst atIndex:3]; + [encoder setBytes:&ne00 length:sizeof(ne00) atIndex:4]; + [encoder setBytes:&ne01 length:sizeof(ne01) atIndex:5]; + [encoder setBytes:&ne02 length:sizeof(ne02) atIndex:6]; + [encoder setBytes:&scale length:sizeof(scale) atIndex:7]; + [encoder setBytes:&max_bias length:sizeof(max_bias) atIndex:8]; + [encoder setBytes:&m0 length:sizeof(m0) atIndex:9]; + [encoder setBytes:&m1 length:sizeof(m1) atIndex:10]; + [encoder setBytes:&n_head_log2 length:sizeof(n_head_log2) atIndex:11]; [encoder setThreadgroupMemoryLength:32*sizeof(float) atIndex:0]; [encoder dispatchThreadgroups:MTLSizeMake(ne01*ne02*ne03, 1, 1) threadsPerThreadgroup:MTLSizeMake(nth, 1, 1)]; @@ -1304,6 +1346,9 @@ static bool lm_ggml_metal_graph_compute( case LM_GGML_TYPE_Q6_K: pipeline = ctx->kernels[LM_GGML_METAL_KERNEL_TYPE_MUL_MM_Q6_K_F32 ].pipeline; break; case LM_GGML_TYPE_IQ2_XXS: pipeline = ctx->kernels[LM_GGML_METAL_KERNEL_TYPE_MUL_MM_IQ2_XXS_F32].pipeline; break; case LM_GGML_TYPE_IQ2_XS: pipeline = ctx->kernels[LM_GGML_METAL_KERNEL_TYPE_MUL_MM_IQ2_XS_F32 ].pipeline; break; + case LM_GGML_TYPE_IQ3_XXS: pipeline = ctx->kernels[LM_GGML_METAL_KERNEL_TYPE_MUL_MM_IQ3_XXS_F32].pipeline; break; + case LM_GGML_TYPE_IQ1_S: pipeline = ctx->kernels[LM_GGML_METAL_KERNEL_TYPE_MUL_MM_IQ1_S_F32 ].pipeline; break; + case LM_GGML_TYPE_IQ4_NL: pipeline = ctx->kernels[LM_GGML_METAL_KERNEL_TYPE_MUL_MM_IQ4_NL_F32 ].pipeline; break; default: LM_GGML_ASSERT(false && "MUL MAT-MAT not implemented"); } @@ -1432,6 +1477,24 @@ static bool lm_ggml_metal_graph_compute( nth1 = 16; pipeline = ctx->kernels[LM_GGML_METAL_KERNEL_TYPE_MUL_MV_IQ2_XS_F32].pipeline; } break; + case LM_GGML_TYPE_IQ3_XXS: + { + nth0 = 4; + nth1 = 16; + pipeline = ctx->kernels[LM_GGML_METAL_KERNEL_TYPE_MUL_MV_IQ3_XXS_F32].pipeline; + } break; + case LM_GGML_TYPE_IQ1_S: + { + nth0 = 4; + nth1 = 16; + pipeline = ctx->kernels[LM_GGML_METAL_KERNEL_TYPE_MUL_MV_IQ1_S_F32].pipeline; + } break; + case LM_GGML_TYPE_IQ4_NL: + { + nth0 = 4; + nth1 = 16; + pipeline = 
ctx->kernels[LM_GGML_METAL_KERNEL_TYPE_MUL_MV_IQ4_NL_F32].pipeline; + } break; default: { LM_GGML_METAL_LOG_ERROR("Asserting on type %d\n", (int)src0t); @@ -1466,7 +1529,7 @@ static bool lm_ggml_metal_graph_compute( if (src0t == LM_GGML_TYPE_Q4_0 || src0t == LM_GGML_TYPE_Q4_1 || src0t == LM_GGML_TYPE_Q5_0 || src0t == LM_GGML_TYPE_Q5_1 || src0t == LM_GGML_TYPE_Q8_0 || - src0t == LM_GGML_TYPE_Q2_K) { // || src0t == LM_GGML_TYPE_Q4_K) { + src0t == LM_GGML_TYPE_Q2_K || src0t == LM_GGML_TYPE_IQ1_S) { // || src0t == LM_GGML_TYPE_Q4_K) { [encoder dispatchThreadgroups:MTLSizeMake((ne01 + 7)/8, ne11, ne12*ne13) threadsPerThreadgroup:MTLSizeMake(nth0, nth1, 1)]; } else if (src0t == LM_GGML_TYPE_IQ2_XXS || src0t == LM_GGML_TYPE_IQ2_XS) { @@ -1474,6 +1537,16 @@ static bool lm_ggml_metal_graph_compute( [encoder setThreadgroupMemoryLength:mem_size atIndex:0]; [encoder dispatchThreadgroups:MTLSizeMake((ne01 + 7)/8, ne11, ne12*ne13) threadsPerThreadgroup:MTLSizeMake(nth0, nth1, 1)]; } + else if (src0t == LM_GGML_TYPE_IQ3_XXS) { + const int mem_size = 256*4+128; + [encoder setThreadgroupMemoryLength:mem_size atIndex:0]; + [encoder dispatchThreadgroups:MTLSizeMake((ne01 + 7)/8, ne11, ne12*ne13) threadsPerThreadgroup:MTLSizeMake(nth0, nth1, 1)]; + } + else if (src0t == LM_GGML_TYPE_IQ4_NL) { + const int mem_size = 32*sizeof(float); + [encoder setThreadgroupMemoryLength:mem_size atIndex:0]; + [encoder dispatchThreadgroups:MTLSizeMake((ne01 + 3)/4, ne11, ne12*ne13) threadsPerThreadgroup:MTLSizeMake(nth0, nth1, 1)]; + } else if (src0t == LM_GGML_TYPE_Q4_K) { [encoder dispatchThreadgroups:MTLSizeMake((ne01 + 3)/4, ne11, ne12*ne13) threadsPerThreadgroup:MTLSizeMake(nth0, nth1, 1)]; } @@ -1510,8 +1583,6 @@ static bool lm_ggml_metal_graph_compute( // max size of the src1ids array in the kernel stack LM_GGML_ASSERT(ne11 <= 512); - struct lm_ggml_tensor * src2 = gf->nodes[i]->src[2]; - const int64_t ne20 = src2 ? src2->ne[0] : 0; const int64_t ne21 = src2 ? src2->ne[1] : 0; const int64_t ne22 = src2 ? 
src2->ne[2] : 0; @@ -1568,6 +1639,9 @@ static bool lm_ggml_metal_graph_compute( case LM_GGML_TYPE_Q6_K: pipeline = ctx->kernels[LM_GGML_METAL_KERNEL_TYPE_MUL_MM_ID_Q6_K_F32 ].pipeline; break; case LM_GGML_TYPE_IQ2_XXS: pipeline = ctx->kernels[LM_GGML_METAL_KERNEL_TYPE_MUL_MM_ID_IQ2_XXS_F32].pipeline; break; case LM_GGML_TYPE_IQ2_XS: pipeline = ctx->kernels[LM_GGML_METAL_KERNEL_TYPE_MUL_MM_ID_IQ2_XS_F32 ].pipeline; break; + case LM_GGML_TYPE_IQ3_XXS: pipeline = ctx->kernels[LM_GGML_METAL_KERNEL_TYPE_MUL_MM_ID_IQ3_XXS_F32].pipeline; break; + case LM_GGML_TYPE_IQ1_S: pipeline = ctx->kernels[LM_GGML_METAL_KERNEL_TYPE_MUL_MM_ID_IQ1_S_F32 ].pipeline; break; + case LM_GGML_TYPE_IQ4_NL: pipeline = ctx->kernels[LM_GGML_METAL_KERNEL_TYPE_MUL_MM_ID_IQ4_NL_F32 ].pipeline; break; default: LM_GGML_ASSERT(false && "MUL_MAT_ID not implemented"); } @@ -1597,7 +1671,7 @@ static bool lm_ggml_metal_graph_compute( struct lm_ggml_tensor * src_cur = dst->src[2 + (j % n_as)]; size_t offs_src_cur = 0; - id id_src_cur = lm_ggml_metal_get_buffer(ctx, src_cur, &offs_src_cur); + id id_src_cur = lm_ggml_metal_get_buffer(src_cur, &offs_src_cur); [encoder setBuffer:id_src_cur offset:offs_src_cur atIndex:19 + j]; } @@ -1699,6 +1773,24 @@ static bool lm_ggml_metal_graph_compute( nth1 = 16; pipeline = ctx->kernels[LM_GGML_METAL_KERNEL_TYPE_MUL_MV_ID_IQ2_XS_F32].pipeline; } break; + case LM_GGML_TYPE_IQ3_XXS: + { + nth0 = 4; + nth1 = 16; + pipeline = ctx->kernels[LM_GGML_METAL_KERNEL_TYPE_MUL_MV_ID_IQ3_XXS_F32].pipeline; + } break; + case LM_GGML_TYPE_IQ1_S: + { + nth0 = 4; + nth1 = 16; + pipeline = ctx->kernels[LM_GGML_METAL_KERNEL_TYPE_MUL_MV_ID_IQ1_S_F32].pipeline; + } break; + case LM_GGML_TYPE_IQ4_NL: + { + nth0 = 4; + nth1 = 16; + pipeline = ctx->kernels[LM_GGML_METAL_KERNEL_TYPE_MUL_MV_ID_IQ4_NL_F32].pipeline; + } break; default: { LM_GGML_METAL_LOG_ERROR("Asserting on type %d\n", (int)src2t); @@ -1742,14 +1834,14 @@ static bool lm_ggml_metal_graph_compute( struct lm_ggml_tensor * src_cur = dst->src[2 + (j % n_as)]; size_t offs_src_cur = 0; - id id_src_cur = lm_ggml_metal_get_buffer(ctx, src_cur, &offs_src_cur); + id id_src_cur = lm_ggml_metal_get_buffer(src_cur, &offs_src_cur); [encoder setBuffer:id_src_cur offset:offs_src_cur atIndex:23 + j]; } if (src2t == LM_GGML_TYPE_Q4_0 || src2t == LM_GGML_TYPE_Q4_1 || src2t == LM_GGML_TYPE_Q5_0 || src2t == LM_GGML_TYPE_Q5_1 || src2t == LM_GGML_TYPE_Q8_0 || - src2t == LM_GGML_TYPE_Q2_K) { // || src2t == LM_GGML_TYPE_Q4_K) { + src2t == LM_GGML_TYPE_Q2_K || src2t == LM_GGML_TYPE_IQ1_S) { // || src2t == LM_GGML_TYPE_Q4_K) { [encoder dispatchThreadgroups:MTLSizeMake((ne21 + 7)/8, _ne1, ne01*ne12*ne13) threadsPerThreadgroup:MTLSizeMake(nth0, nth1, 1)]; } else if (src2t == LM_GGML_TYPE_IQ2_XXS || src2t == LM_GGML_TYPE_IQ2_XS) { @@ -1757,6 +1849,16 @@ static bool lm_ggml_metal_graph_compute( [encoder setThreadgroupMemoryLength:mem_size atIndex:0]; [encoder dispatchThreadgroups:MTLSizeMake((ne21 + 7)/8, _ne1, ne01*ne12*ne13) threadsPerThreadgroup:MTLSizeMake(nth0, nth1, 1)]; } + else if (src2t == LM_GGML_TYPE_IQ3_XXS) { + const int mem_size = 256*4+128; + [encoder setThreadgroupMemoryLength:mem_size atIndex:0]; + [encoder dispatchThreadgroups:MTLSizeMake((ne21 + 7)/8, _ne1, ne01*ne12*ne13) threadsPerThreadgroup:MTLSizeMake(nth0, nth1, 1)]; + } + else if (src2t == LM_GGML_TYPE_IQ4_NL) { + const int mem_size = 32*sizeof(float); + [encoder setThreadgroupMemoryLength:mem_size atIndex:0]; + [encoder dispatchThreadgroups:MTLSizeMake((ne21 + 3)/4, _ne1, ne01*ne12*ne13) 
threadsPerThreadgroup:MTLSizeMake(nth0, nth1, 1)]; + } else if (src2t == LM_GGML_TYPE_Q4_K) { [encoder dispatchThreadgroups:MTLSizeMake((ne21 + 3)/4, _ne1, ne01*ne12*ne13) threadsPerThreadgroup:MTLSizeMake(nth0, nth1, 1)]; } @@ -1797,6 +1899,9 @@ static bool lm_ggml_metal_graph_compute( case LM_GGML_TYPE_Q6_K: pipeline = ctx->kernels[LM_GGML_METAL_KERNEL_TYPE_GET_ROWS_Q6_K ].pipeline; break; case LM_GGML_TYPE_IQ2_XXS: pipeline = ctx->kernels[LM_GGML_METAL_KERNEL_TYPE_GET_ROWS_IQ2_XXS].pipeline; break; case LM_GGML_TYPE_IQ2_XS: pipeline = ctx->kernels[LM_GGML_METAL_KERNEL_TYPE_GET_ROWS_IQ2_XS ].pipeline; break; + case LM_GGML_TYPE_IQ3_XXS: pipeline = ctx->kernels[LM_GGML_METAL_KERNEL_TYPE_GET_ROWS_IQ3_XXS].pipeline; break; + case LM_GGML_TYPE_IQ1_S: pipeline = ctx->kernels[LM_GGML_METAL_KERNEL_TYPE_GET_ROWS_IQ1_S ].pipeline; break; + case LM_GGML_TYPE_IQ4_NL: pipeline = ctx->kernels[LM_GGML_METAL_KERNEL_TYPE_GET_ROWS_IQ4_NL ].pipeline; break; case LM_GGML_TYPE_I32: pipeline = ctx->kernels[LM_GGML_METAL_KERNEL_TYPE_GET_ROWS_I32 ].pipeline; break; default: LM_GGML_ASSERT(false && "not implemented"); } @@ -2005,7 +2110,7 @@ static bool lm_ggml_metal_graph_compute( { LM_GGML_ASSERT(src0->type == LM_GGML_TYPE_F16); LM_GGML_ASSERT(src1->type == LM_GGML_TYPE_F32); - LM_GGML_ASSERT( dst->type == LM_GGML_TYPE_F16); + LM_GGML_ASSERT( dst->type == LM_GGML_TYPE_F16 || dst->type == LM_GGML_TYPE_F32); const int32_t s0 = ((const int32_t *)(dst->op_params))[0]; const int32_t s1 = ((const int32_t *)(dst->op_params))[1]; @@ -2013,6 +2118,7 @@ static bool lm_ggml_metal_graph_compute( const int32_t p1 = ((const int32_t *)(dst->op_params))[3]; const int32_t d0 = ((const int32_t *)(dst->op_params))[4]; const int32_t d1 = ((const int32_t *)(dst->op_params))[5]; + const bool is_2D = ((const int32_t *)(dst->op_params))[6] == 1; const int32_t N = src1->ne[is_2D ? 
3 : 2]; @@ -2033,8 +2139,8 @@ static bool lm_ggml_metal_graph_compute( id pipeline = nil; - switch (src0->type) { - case LM_GGML_TYPE_F32: LM_GGML_ASSERT(false && "not implemented"); break; + switch (dst->type) { + case LM_GGML_TYPE_F32: pipeline = ctx->kernels[LM_GGML_METAL_KERNEL_TYPE_IM2COL_F32].pipeline; break; case LM_GGML_TYPE_F16: pipeline = ctx->kernels[LM_GGML_METAL_KERNEL_TYPE_IM2COL_F16].pipeline; break; default: LM_GGML_ASSERT(false); }; @@ -2227,9 +2333,9 @@ static bool lm_ggml_metal_graph_compute( } } -#ifndef LM_GGML_METAL_NDEBUG - [encoder popDebugGroup]; -#endif + if (should_capture) { + [encoder popDebugGroup]; + } } [encoder endEncoding]; @@ -2251,6 +2357,11 @@ static bool lm_ggml_metal_graph_compute( } } + if (should_capture) { + [[MTLCaptureManager sharedCaptureManager] stopCapture]; + } + + } return true; } @@ -2419,6 +2530,16 @@ LM_GGML_CALL static size_t lm_ggml_backend_metal_buffer_type_get_alignment(lm_gg UNUSED(buft); } +LM_GGML_CALL static size_t lm_ggml_backend_metal_buffer_type_get_max_size(lm_ggml_backend_buffer_type_t buft) { + id device = lm_ggml_backend_metal_get_device(); + size_t max_size = device.maxBufferLength; + lm_ggml_backend_metal_free_device(); + + return max_size; + + UNUSED(buft); +} + LM_GGML_CALL static bool lm_ggml_backend_metal_buffer_type_supports_backend(lm_ggml_backend_buffer_type_t buft, lm_ggml_backend_t backend) { return lm_ggml_backend_is_metal(backend) || lm_ggml_backend_is_cpu(backend); @@ -2437,6 +2558,7 @@ LM_GGML_CALL lm_ggml_backend_buffer_type_t lm_ggml_backend_metal_buffer_type(voi /* .get_name = */ lm_ggml_backend_metal_buffer_type_get_name, /* .alloc_buffer = */ lm_ggml_backend_metal_buffer_type_alloc_buffer, /* .get_alignment = */ lm_ggml_backend_metal_buffer_type_get_alignment, + /* .get_max_size = */ lm_ggml_backend_metal_buffer_type_get_max_size, /* .get_alloc_size = */ NULL, // defaults to lm_ggml_nbytes /* .supports_backend = */ lm_ggml_backend_metal_buffer_type_supports_backend, /* .is_host = */ lm_ggml_backend_metal_buffer_type_is_host, @@ -2611,6 +2733,13 @@ bool lm_ggml_backend_metal_supports_family(lm_ggml_backend_t backend, int family return [ctx->device supportsFamily:(MTLGPUFamilyApple1 + family - 1)]; } +void lm_ggml_backend_metal_capture_next_compute(lm_ggml_backend_t backend) { + LM_GGML_ASSERT(lm_ggml_backend_is_metal(backend)); + + struct lm_ggml_metal_context * ctx = (struct lm_ggml_metal_context *)backend->context; + ctx->should_capture_next_compute = true; +} + LM_GGML_CALL lm_ggml_backend_t lm_ggml_backend_reg_metal_init(const char * params, void * user_data); // silence warning LM_GGML_CALL lm_ggml_backend_t lm_ggml_backend_reg_metal_init(const char * params, void * user_data) { diff --git a/cpp/ggml-quants.c b/cpp/ggml-quants.c index 460c5a27..5dd07d63 100644 --- a/cpp/ggml-quants.c +++ b/cpp/ggml-quants.c @@ -49,6 +49,8 @@ #define MIN(a, b) ((a) < (b) ? (a) : (b)) #define MAX(a, b) ((a) > (b) ? 
(a) : (b)) +#define UNUSED LM_GGML_UNUSED + #define MM256_SET_M128I(a, b) _mm256_insertf128_si256(_mm256_castsi128_si256(b), (a), 1) #if defined(__AVX__) || defined(__AVX2__) || defined(__AVX512F__) || defined(__SSSE3__) @@ -268,6 +270,17 @@ static inline float hsum_float_4x4(const __m128 a, const __m128 b, const __m128 #endif // defined(__AVX__) || defined(__AVX2__) || defined(__AVX512F__) || defined(__SSSE3__) #if defined(__ARM_NEON) + +#ifdef _MSC_VER + +#define lm_ggml_vld1q_u32(w,x,y,z) { ((w) + ((uint64_t)(x) << 32)), ((y) + ((uint64_t)(z) << 32)) } + +#else + +#define lm_ggml_vld1q_u32(w,x,y,z) { (w), (x), (y), (z) } + +#endif + #if !defined(__aarch64__) // 64-bit compatibility @@ -1824,9 +1837,9 @@ static void quantize_row_q2_K_impl(const float * restrict x, block_q2_K * restri float sigma2 = sumx2/QK_K; for (int j = 0; j < QK_K/16; ++j) { const float * restrict qw = quant_weights + QK_K * i + 16*j; - for (int l = 0; l < 16; ++l) weight[l] = qw[l] * sqrtf(sigma2 + x[16*j + l]*x[16*j + l]); - for (int l = 0; l < 16; ++l) sw[j] += weight[l]; - scales[j] = make_qkx3_quants(16, 3, x + 16*j, weight, L + 16*j, &mins[j], Laux, -0.9f, 0.05f, 36, false); + for (int l = 0; l < QK_K/16; ++l) weight[l] = qw[l] * sqrtf(sigma2 + x[16*j + l]*x[16*j + l]); + for (int l = 0; l < QK_K/16; ++l) sw[j] += weight[l]; + scales[j] = make_qkx3_quants(QK_K/16, 3, x + 16*j, weight, L + 16*j, &mins[j], Laux, -0.9f, 0.05f, 36, false); } float dm = make_qp_quants(QK_K/16, 15, scales, Ls, sw); @@ -2381,19 +2394,20 @@ static void quantize_row_q4_K_impl(const float * restrict x, block_q4_K * restri uint8_t L[QK_K]; uint8_t Laux[32]; + uint8_t Ls[QK_K/32]; + uint8_t Lm[QK_K/32]; float weights[32]; - float mins[QK_K/32]; - float scales[QK_K/32]; + float sw[QK_K/32]; + float mins[QK_K/32]; + float scales[QK_K/32]; for (int i = 0; i < nb; i++) { float sum_x2 = 0; for (int l = 0; l < QK_K; ++l) sum_x2 += x[l] * x[l]; - float sigma2 = sum_x2/QK_K; + float sigma2 = 2*sum_x2/QK_K; float av_x = sqrtf(sigma2); - float max_scale = 0; // as we are deducting the min, scales are always positive - float max_min = 0; for (int j = 0; j < QK_K/32; ++j) { if (quant_weights) { const float * qw = quant_weights + QK_K*i + 32*j; @@ -2401,25 +2415,17 @@ static void quantize_row_q4_K_impl(const float * restrict x, block_q4_K * restri } else { for (int l = 0; l < 32; ++l) weights[l] = av_x + fabsf(x[32*j + l]); } + float sumw = 0; + for (int l = 0; l < 32; ++l) sumw += weights[l]; + sw[j] = sumw; scales[j] = make_qkx3_quants(32, 15, x + 32*j, weights, L + 32*j, &mins[j], Laux, -0.9f, 0.05f, 36, false); - //scales[j] = make_qkx2_quants(32, 15, x + 32*j, weights, L + 32*j, &mins[j], Laux, -1.f, 0.1f, 20, false); - float scale = scales[j]; - if (scale > max_scale) { - max_scale = scale; - } - float min = mins[j]; - if (min > max_min) { - max_min = min; - } } - float inv_scale = max_scale > 0 ? 63.f/max_scale : 0.f; - float inv_min = max_min > 0 ? 
63.f/max_min : 0.f; + float d_block = make_qp_quants(QK_K/32, 63, scales, Ls, sw); + float m_block = make_qp_quants(QK_K/32, 63, mins, Lm, sw); for (int j = 0; j < QK_K/32; ++j) { - uint8_t ls = nearest_int(inv_scale*scales[j]); - uint8_t lm = nearest_int(inv_min*mins[j]); - ls = MIN(63, ls); - lm = MIN(63, lm); + uint8_t ls = Ls[j]; + uint8_t lm = Lm[j]; if (j < 4) { y[i].scales[j] = ls; y[i].scales[j+4] = lm; @@ -2429,8 +2435,8 @@ static void quantize_row_q4_K_impl(const float * restrict x, block_q4_K * restri y[i].scales[j-0] |= ((lm >> 4) << 6); } } - y[i].d = LM_GGML_FP32_TO_FP16(max_scale/63.f); - y[i].dmin = LM_GGML_FP32_TO_FP16(max_min/63.f); + y[i].d = LM_GGML_FP32_TO_FP16(d_block); + y[i].dmin = LM_GGML_FP32_TO_FP16(m_block); uint8_t sc, m; for (int j = 0; j < QK_K/32; ++j) { @@ -2688,20 +2694,21 @@ static void quantize_row_q5_K_impl(const float * restrict x, block_q5_K * restri const int nb = n_per_row / QK_K; uint8_t L[QK_K]; - float mins[QK_K/32]; - float scales[QK_K/32]; - float weights[32]; uint8_t Laux[32]; + uint8_t Ls[QK_K/32]; + uint8_t Lm[QK_K/32]; + float mins[QK_K/32]; + float scales[QK_K/32]; + float sw[QK_K/32]; + float weights[32]; for (int i = 0; i < nb; i++) { float sum_x2 = 0; for (int l = 0; l < QK_K; ++l) sum_x2 += x[l] * x[l]; - float sigma2 = sum_x2/QK_K; + float sigma2 = 2*sum_x2/QK_K; float av_x = sqrtf(sigma2); - float max_scale = 0; // as we are deducting the min, scales are always positive - float max_min = 0; for (int j = 0; j < QK_K/32; ++j) { if (quant_weights) { const float * qw = quant_weights + QK_K*i + 32*j; @@ -2709,22 +2716,19 @@ static void quantize_row_q5_K_impl(const float * restrict x, block_q5_K * restri } else { for (int l = 0; l < 32; ++l) weights[l] = av_x + fabsf(x[32*j + l]); } + float sumw = 0; + for (int l = 0; l < 32; ++l) sumw += weights[l]; + sw[j] = sumw; + scales[j] = make_qkx3_quants(32, 31, x + 32*j, weights, L + 32*j, &mins[j], Laux, -0.9f, 0.05f, 36, false); - float scale = scales[j]; - if (scale > max_scale) { - max_scale = scale; - } - float min = mins[j]; - if (min > max_min) { - max_min = min; - } } - float inv_scale = max_scale > 0 ? 63.f/max_scale : 0.f; - float inv_min = max_min > 0 ? 
63.f/max_min : 0.f; + float d_block = make_qp_quants(QK_K/32, 63, scales, Ls, sw); + float m_block = make_qp_quants(QK_K/32, 63, mins, Lm, sw); + for (int j = 0; j < QK_K/32; ++j) { - uint8_t ls = nearest_int(inv_scale*scales[j]); - uint8_t lm = nearest_int(inv_min*mins[j]); + uint8_t ls = Ls[j]; + uint8_t lm = Lm[j]; ls = MIN(63, ls); lm = MIN(63, lm); if (j < 4) { @@ -2736,8 +2740,8 @@ static void quantize_row_q5_K_impl(const float * restrict x, block_q5_K * restri y[i].scales[j-0] |= ((lm >> 4) << 6); } } - y[i].d = LM_GGML_FP32_TO_FP16(max_scale/63.f); - y[i].dmin = LM_GGML_FP32_TO_FP16(max_min/63.f); + y[i].d = LM_GGML_FP32_TO_FP16(d_block); + y[i].dmin = LM_GGML_FP32_TO_FP16(m_block); uint8_t sc, m; for (int j = 0; j < QK_K/32; ++j) { @@ -3441,6 +3445,174 @@ static const uint64_t iq2xs_grid[512] = { 0x2b2b2b2b082b2b08, 0x2b2b2b2b082b2b2b, 0x2b2b2b2b2b190819, 0x2b2b2b2b2b2b2b2b, }; +static const uint32_t iq3xxs_grid[256] = { + 0x04040404, 0x04040414, 0x04040424, 0x04040c0c, 0x04040c1c, 0x04040c3e, 0x04041404, 0x04041414, + 0x04041c0c, 0x04042414, 0x04043e1c, 0x04043e2c, 0x040c040c, 0x040c041c, 0x040c0c04, 0x040c0c14, + 0x040c140c, 0x040c142c, 0x040c1c04, 0x040c1c14, 0x040c240c, 0x040c2c24, 0x040c3e04, 0x04140404, + 0x04140414, 0x04140424, 0x04140c0c, 0x04141404, 0x04141414, 0x04141c0c, 0x04141c1c, 0x04141c3e, + 0x04142c0c, 0x04142c3e, 0x04143e2c, 0x041c040c, 0x041c043e, 0x041c0c04, 0x041c0c14, 0x041c142c, + 0x041c3e04, 0x04240c1c, 0x04241c3e, 0x04242424, 0x04242c3e, 0x04243e1c, 0x04243e2c, 0x042c040c, + 0x042c043e, 0x042c1c14, 0x042c2c14, 0x04341c2c, 0x04343424, 0x043e0c04, 0x043e0c24, 0x043e0c34, + 0x043e241c, 0x043e340c, 0x0c04040c, 0x0c04041c, 0x0c040c04, 0x0c040c14, 0x0c04140c, 0x0c04141c, + 0x0c041c04, 0x0c041c14, 0x0c041c24, 0x0c04243e, 0x0c042c04, 0x0c0c0404, 0x0c0c0414, 0x0c0c0c0c, + 0x0c0c1404, 0x0c0c1414, 0x0c14040c, 0x0c14041c, 0x0c140c04, 0x0c140c14, 0x0c14140c, 0x0c141c04, + 0x0c143e14, 0x0c1c0404, 0x0c1c0414, 0x0c1c1404, 0x0c1c1c0c, 0x0c1c2434, 0x0c1c3434, 0x0c24040c, + 0x0c24042c, 0x0c242c04, 0x0c2c1404, 0x0c2c1424, 0x0c2c2434, 0x0c2c3e0c, 0x0c34042c, 0x0c3e1414, + 0x0c3e2404, 0x14040404, 0x14040414, 0x14040c0c, 0x14040c1c, 0x14041404, 0x14041414, 0x14041434, + 0x14041c0c, 0x14042414, 0x140c040c, 0x140c041c, 0x140c042c, 0x140c0c04, 0x140c0c14, 0x140c140c, + 0x140c1c04, 0x140c341c, 0x140c343e, 0x140c3e04, 0x14140404, 0x14140414, 0x14140c0c, 0x14140c3e, + 0x14141404, 0x14141414, 0x14141c3e, 0x14142404, 0x14142c2c, 0x141c040c, 0x141c0c04, 0x141c0c24, + 0x141c3e04, 0x141c3e24, 0x14241c2c, 0x14242c1c, 0x142c041c, 0x142c143e, 0x142c240c, 0x142c3e24, + 0x143e040c, 0x143e041c, 0x143e0c34, 0x143e242c, 0x1c04040c, 0x1c040c04, 0x1c040c14, 0x1c04140c, + 0x1c04141c, 0x1c042c04, 0x1c04342c, 0x1c043e14, 0x1c0c0404, 0x1c0c0414, 0x1c0c1404, 0x1c0c1c0c, + 0x1c0c2424, 0x1c0c2434, 0x1c14040c, 0x1c14041c, 0x1c140c04, 0x1c14142c, 0x1c142c14, 0x1c143e14, + 0x1c1c0c0c, 0x1c1c1c1c, 0x1c241c04, 0x1c24243e, 0x1c243e14, 0x1c2c0404, 0x1c2c0434, 0x1c2c1414, + 0x1c2c2c2c, 0x1c340c24, 0x1c341c34, 0x1c34341c, 0x1c3e1c1c, 0x1c3e3404, 0x24040424, 0x24040c3e, + 0x24041c2c, 0x24041c3e, 0x24042c1c, 0x24042c3e, 0x240c3e24, 0x24141404, 0x24141c3e, 0x24142404, + 0x24143404, 0x24143434, 0x241c043e, 0x241c242c, 0x24240424, 0x24242c0c, 0x24243424, 0x242c142c, + 0x242c241c, 0x242c3e04, 0x243e042c, 0x243e0c04, 0x243e0c14, 0x243e1c04, 0x2c040c14, 0x2c04240c, + 0x2c043e04, 0x2c0c0404, 0x2c0c0434, 0x2c0c1434, 0x2c0c2c2c, 0x2c140c24, 0x2c141c14, 0x2c143e14, + 0x2c1c0414, 0x2c1c2c1c, 0x2c240c04, 0x2c24141c, 0x2c24143e, 
0x2c243e14, 0x2c2c0414, 0x2c2c1c0c, + 0x2c342c04, 0x2c3e1424, 0x2c3e2414, 0x34041424, 0x34042424, 0x34042434, 0x34043424, 0x340c140c, + 0x340c340c, 0x34140c3e, 0x34143424, 0x341c1c04, 0x341c1c34, 0x34242424, 0x342c042c, 0x342c2c14, + 0x34341c1c, 0x343e041c, 0x343e140c, 0x3e04041c, 0x3e04042c, 0x3e04043e, 0x3e040c04, 0x3e041c14, + 0x3e042c14, 0x3e0c1434, 0x3e0c2404, 0x3e140c14, 0x3e14242c, 0x3e142c14, 0x3e1c0404, 0x3e1c0c2c, + 0x3e1c1c1c, 0x3e1c3404, 0x3e24140c, 0x3e24240c, 0x3e2c0404, 0x3e2c0414, 0x3e2c1424, 0x3e341c04, +}; + +#define NGRID_IQ2XXS 512 +static const uint64_t iq1s_grid[NGRID_IQ2XXS] = { + 0xffffffffffff0101, 0xffffffffff01ff00, 0xffffffffff010100, 0xffffffff00000000, + 0xffffffff01ff00ff, 0xffffffff01ff0001, 0xffffffff0101ffff, 0xffffffff0101ff01, + 0xffffff00ff000000, 0xffffff000000ff00, 0xffffff00000000ff, 0xffffff0000000100, + 0xffffff0000010000, 0xffffff0001000000, 0xffffff01ffff00ff, 0xffffff01ff01ff00, + 0xffffff01ff010100, 0xffffff0100000001, 0xffffff0101ffff00, 0xffffff0101ff0101, + 0xffffff0101010100, 0xffff00ffff00ff01, 0xffff00ffff0000ff, 0xffff00ff00ff0100, + 0xffff00ff0100ff00, 0xffff00ff010001ff, 0xffff0000ff0101ff, 0xffff000000ffff00, + 0xffff000000000000, 0xffff00000001ff01, 0xffff000001000101, 0xffff0000010100ff, + 0xffff0001ffff0100, 0xffff00010000ff00, 0xffff000100010101, 0xffff000101000000, + 0xffff01ffffff0000, 0xffff01ffff01ffff, 0xffff01ffff010100, 0xffff01ff00000000, + 0xffff01ff01ffffff, 0xffff01ff01ff0001, 0xffff01ff0101ffff, 0xffff01ff01010001, + 0xffff0100ffffff01, 0xffff01000000ffff, 0xffff010000000100, 0xffff010001ff01ff, + 0xffff010001000000, 0xffff0101ff000000, 0xffff0101000101ff, 0xffff010101ffff01, + 0xffff01010101ff00, 0xff00ffffff000000, 0xff00ffff00ffff00, 0xff00ffff00000001, + 0xff00ffff000001ff, 0xff00ffff01010000, 0xff00ff00ffff0000, 0xff00ff00ff00ff00, + 0xff00ff00ff0000ff, 0xff00ff00ff000100, 0xff00ff00ff010001, 0xff00ff0000ff0001, + 0xff00ff000000ffff, 0xff00ff0000000000, 0xff00ff000001ff00, 0xff00ff0000010100, + 0xff00ff0001ff0000, 0xff00ff000100ff00, 0xff00ff0001000100, 0xff00ff01ff000000, + 0xff00ff0100ff0000, 0xff00ff01000001ff, 0xff00ff0101010001, 0xff0000ff00000000, + 0xff0000ff0001ff00, 0xff0000ff00010100, 0xff000000ffff0101, 0xff000000ff000000, + 0xff000000ff01ff00, 0xff00000000ff0000, 0xff0000000000ff00, 0xff000000000000ff, + 0xff00000000000000, 0xff00000000000001, 0xff00000000000100, 0xff0000000001ffff, + 0xff00000000010000, 0xff00000001000000, 0xff00000001010100, 0xff000001ff00ff01, + 0xff000001ff0100ff, 0xff00000100000000, 0xff0000010001ff00, 0xff00000101ff0100, + 0xff0000010100ff00, 0xff0001ff00ff00ff, 0xff0001ff00000101, 0xff0001ff000100ff, + 0xff0001ff01000000, 0xff000100ff0001ff, 0xff0001000000ff01, 0xff00010000000000, + 0xff00010000010001, 0xff00010000010100, 0xff00010001ffff00, 0xff00010001ff0101, + 0xff00010001010000, 0xff000101ffffffff, 0xff000101ff000101, 0xff00010101ff00ff, + 0xff00010101000001, 0xff000101010100ff, 0xff01ffffff000101, 0xff01ffffff01ffff, + 0xff01ffffff01ff01, 0xff01ffffff0101ff, 0xff01ffff00000000, 0xff01ffff01ff0001, + 0xff01ffff0101ff01, 0xff01ff00ff000000, 0xff01ff0000ff0100, 0xff01ff000000ff01, + 0xff01ff0000010000, 0xff01ff00010000ff, 0xff01ff01ff01ff00, 0xff01ff0100000101, + 0xff0100ffffff0000, 0xff0100ffff010000, 0xff0100ff01ff00ff, 0xff0100ff01000100, + 0xff0100ff010100ff, 0xff010000ffffff01, 0xff01000000000000, 0xff0100000101ff00, + 0xff010001ffff00ff, 0xff010001ff000100, 0xff01000100ffff00, 0xff01000100010001, + 0xff01000101ff0001, 0xff010001010001ff, 0xff0101ffffffffff, 
0xff0101ffff01ffff, + 0xff0101ffff010101, 0xff0101ff0000ff00, 0xff0101ff01010001, 0xff010100ff000000, + 0xff010100ff01ff01, 0xff01010000ff0001, 0xff01010000000100, 0xff01010001000000, + 0xff0101010100ffff, 0x00ffffff0000ff01, 0x00ffffff000000ff, 0x00ffffff00000100, + 0x00ffffff00010000, 0x00ffff00ffff0001, 0x00ffff00ff0000ff, 0x00ffff00ff000100, + 0x00ffff0000000000, 0x00ffff0001000100, 0x00ffff0001010001, 0x00ffff01ff00ff01, + 0x00ffff0100ff0100, 0x00ffff010000ff00, 0x00ffff01000100ff, 0x00ffff0101ff00ff, + 0x00ffff010101ff00, 0x00ff00ffffffffff, 0x00ff00ffffff01ff, 0x00ff00ffff000101, + 0x00ff00ff00000000, 0x00ff00ff000101ff, 0x00ff00ff01010101, 0x00ff0000ff000000, + 0x00ff0000ff01ffff, 0x00ff000000ff0000, 0x00ff00000000ff00, 0x00ff0000000000ff, + 0x00ff000000000000, 0x00ff000000000001, 0x00ff000000000100, 0x00ff000000010000, + 0x00ff000001ffff01, 0x00ff000001000000, 0x00ff0001ff000101, 0x00ff000100ffffff, + 0x00ff000100000000, 0x00ff0001010001ff, 0x00ff01ffff000000, 0x00ff01ff0001ff00, + 0x00ff01ff01ff0100, 0x00ff0100ff01ff01, 0x00ff010000ff00ff, 0x00ff010000ff0101, + 0x00ff010000000000, 0x00ff010000010101, 0x00ff01000100ff00, 0x00ff010001010000, + 0x00ff0101ffffff00, 0x00ff01010000ff01, 0x00ff010100000100, 0x00ff010101ff0000, + 0x0000ffffffff0100, 0x0000ffffff00ff00, 0x0000ffffff0000ff, 0x0000ffffff010000, + 0x0000ffff00000000, 0x0000ffff00010101, 0x0000ffff01ffff01, 0x0000ffff01000100, + 0x0000ff00ff000000, 0x0000ff00ff01ff00, 0x0000ff00ff0101ff, 0x0000ff0000ff0000, + 0x0000ff000000ff00, 0x0000ff00000000ff, 0x0000ff0000000000, 0x0000ff0000000001, + 0x0000ff0000000100, 0x0000ff0000010000, 0x0000ff0001ffffff, 0x0000ff0001ff01ff, + 0x0000ff0001000000, 0x0000ff000101ffff, 0x0000ff01ffff0101, 0x0000ff01ff010000, + 0x0000ff0100000000, 0x0000ff0101000101, 0x000000ffffff0001, 0x000000ffff000000, + 0x000000ff00ff0000, 0x000000ff0000ff00, 0x000000ff000000ff, 0x000000ff00000000, + 0x000000ff00000001, 0x000000ff00000100, 0x000000ff00010000, 0x000000ff01000000, + 0x000000ff0101ff00, 0x00000000ffff0000, 0x00000000ff00ff00, 0x00000000ff0000ff, + 0x00000000ff000000, 0x00000000ff000001, 0x00000000ff000100, 0x00000000ff010000, + 0x0000000000ffff00, 0x0000000000ff00ff, 0x0000000000ff0000, 0x0000000000ff0001, + 0x0000000000ff0100, 0x000000000000ffff, 0x000000000000ff00, 0x000000000000ff01, + 0x00000000000000ff, 0x0000000000000001, 0x00000000000001ff, 0x0000000000000100, + 0x0000000000000101, 0x000000000001ff00, 0x00000000000100ff, 0x0000000000010000, + 0x0000000000010001, 0x0000000000010100, 0x0000000001ff0000, 0x000000000100ff00, + 0x00000000010000ff, 0x0000000001000000, 0x0000000001000001, 0x0000000001000100, + 0x0000000001010000, 0x00000001ffff01ff, 0x00000001ff000000, 0x0000000100ff0000, + 0x000000010000ff00, 0x00000001000000ff, 0x0000000100000000, 0x0000000100000001, + 0x0000000100000100, 0x0000000100010000, 0x0000000101000000, 0x000001ffff00ff00, + 0x000001ffff010001, 0x000001ffff0101ff, 0x000001ff00ffff01, 0x000001ff0000ffff, + 0x000001ff00000000, 0x000001ff010000ff, 0x000001ff01010100, 0x00000100ffff0100, + 0x00000100ff000000, 0x0000010000ff0000, 0x000001000000ff00, 0x00000100000000ff, + 0x0000010000000000, 0x0000010000000001, 0x0000010000000100, 0x0000010000010000, + 0x0000010001000000, 0x000001000101ff01, 0x00000101ffff0001, 0x00000101ff01ffff, + 0x0000010100000000, 0x0000010101010100, 0x0001ffffff000000, 0x0001ffff00ffffff, + 0x0001ffff00000100, 0x0001ffff0001ff00, 0x0001ffff01000000, 0x0001ff00ffffff00, + 0x0001ff00ffff01ff, 0x0001ff00ff010000, 0x0001ff0000000000, 0x0001ff0000010001, + 
0x0001ff0001ff0000, 0x0001ff0001010100, 0x0001ff01ff0000ff, 0x0001ff01ff000001, + 0x0001ff0100ffffff, 0x0001ff010001ffff, 0x0001ff01000101ff, 0x0001ff010100ff01, + 0x000100ffff00ffff, 0x000100ffff00ff01, 0x000100ffff000100, 0x000100ff00000000, + 0x000100ff000101ff, 0x000100ff01ff0101, 0x000100ff0100ffff, 0x000100ff01010101, + 0x00010000ff000000, 0x00010000ff010100, 0x0001000000ff0000, 0x000100000000ff00, + 0x00010000000000ff, 0x0001000000000000, 0x0001000000000001, 0x0001000000000100, + 0x0001000000010000, 0x0001000001ffff01, 0x0001000001000000, 0x0001000100ff0101, + 0x0001000100000000, 0x00010001010100ff, 0x000101ffffff01ff, 0x000101ffffff0101, + 0x000101ff00010000, 0x000101ff01ff0000, 0x000101ff0100ff01, 0x00010100ffff0000, + 0x0001010000000000, 0x000101000001ffff, 0x0001010000010101, 0x00010100010001ff, + 0x00010101ff00ff00, 0x00010101ff010001, 0x0001010100ffffff, 0x0001010100ff01ff, + 0x00010101000101ff, 0x0001010101ff0000, 0x000101010100ff01, 0x0001010101000101, + 0x01ffffffffff0101, 0x01ffffffff01ffff, 0x01ffffffff01ff01, 0x01ffffffff0101ff, + 0x01ffffffff010101, 0x01ffffff00000000, 0x01ffffff01ff01ff, 0x01ffffff01000101, + 0x01ffffff0101ff01, 0x01ffffff010100ff, 0x01ffff000000ff00, 0x01ffff0000000001, + 0x01ffff00000001ff, 0x01ffff0000010000, 0x01ffff0001ff0000, 0x01ffff01ffffffff, + 0x01ffff01ffff01ff, 0x01ffff01ff000000, 0x01ffff01ff01ffff, 0x01ffff01ff0101ff, + 0x01ffff010100ffff, 0x01ff00ffffff0000, 0x01ff00ffff010000, 0x01ff00ff00ffff01, + 0x01ff0000ff0000ff, 0x01ff000000000000, 0x01ff00000001ff01, 0x01ff000001ffffff, + 0x01ff000001010100, 0x01ff0001ffffff01, 0x01ff0001ff010001, 0x01ff000101ff0100, + 0x01ff000101000001, 0x01ff0001010100ff, 0x01ff01ffff00ffff, 0x01ff01ff00010001, + 0x01ff01ff01000000, 0x01ff01ff010101ff, 0x01ff0100ff000001, 0x01ff010000ffff00, + 0x01ff010000000100, 0x01ff010001ff01ff, 0x01ff01000101ffff, 0x01ff0101ffff00ff, + 0x01ff0101ffff0101, 0x01ff0101ff0101ff, 0x01ff010100010000, 0x0100ffff00ff00ff, + 0x0100ffff00ff0001, 0x0100ffff00000100, 0x0100ffff0100ff00, 0x0100ff00ffff0000, + 0x0100ff00ff00ffff, 0x0100ff00ff00ff01, 0x0100ff00ff000100, 0x0100ff00ff010000, + 0x0100ff0000000000, 0x0100ff00000100ff, 0x0100ff0001ff0101, 0x0100ff0001010101, + 0x0100ff0100ff00ff, 0x0100ff0100ff0001, 0x0100ff0100000100, 0x0100ff0100010001, + 0x0100ff0101000000, 0x010000ffff00ff00, 0x010000ff0000ffff, 0x010000ff00000000, + 0x010000ff010001ff, 0x010000ff01010001, 0x01000000ffffff00, 0x01000000ffff0101, + 0x01000000ff000000, 0x01000000ff0100ff, 0x01000000ff010101, 0x0100000000ff0000, + 0x010000000000ff00, 0x01000000000000ff, 0x0100000000000000, 0x0100000000000001, + 0x0100000000000100, 0x0100000000010000, 0x0100000001000000, 0x0100000100000000, + 0x01000001000101ff, 0x0100000101ffff01, 0x010001ffff000101, 0x010001ff00ff0100, + 0x010001ff0000ff00, 0x010001ff000100ff, 0x010001ff01ffffff, 0x01000100ffff0000, + 0x01000100ff0001ff, 0x0100010000000000, 0x010001000001ff00, 0x0100010001ff0000, + 0x01000100010000ff, 0x0100010001000101, 0x01000101ff00ff01, 0x0100010100ff0100, + 0x010001010000ffff, 0x0100010101010001, 0x0101ffffffff0101, 0x0101ffffff0001ff, + 0x0101ffffff01ffff, 0x0101ffffff010101, 0x0101ffff00000000, 0x0101ffff0101ffff, + 0x0101ffff010101ff, 0x0101ff00ff000000, 0x0101ff0000ff0100, 0x0101ff000000ff00, + 0x0101ff0000010000, 0x0101ff00010000ff, 0x0101ff0001000001, 0x0101ff01ff010101, + 0x0101ff0100000000, 0x0101ff010101ff00, 0x010100ffffff0000, 0x010100ffff010000, + 0x010100ff00ff01ff, 0x010100ff000000ff, 0x010100ff00000101, 0x010100ff01ffff00, + 0x01010000ffffff01, 
0x01010000ff000100, 0x01010000ff01ff01, 0x0101000000000000, + 0x01010000000100ff, 0x010100000101ff01, 0x01010001ffff0000, 0x01010001ff00ffff, + 0x01010001ff010000, 0x0101000101ffffff, 0x0101000101ff01ff, 0x0101000101010101, + 0x010101ffff01ffff, 0x010101ff00000000, 0x010101ff0001ff01, 0x010101ff0101ffff, + 0x010101ff010101ff, 0x01010100ffffffff, 0x01010100ff000001, 0x010101000000ff00, + 0x0101010001010000, 0x0101010100ff0001, 0x010101010001ff01, 0x010101010101ffff, + +}; + static const uint8_t ksigns_iq2xs[128] = { 0, 129, 130, 3, 132, 5, 6, 135, 136, 9, 10, 139, 12, 141, 142, 15, 144, 17, 18, 147, 20, 149, 150, 23, 24, 153, 154, 27, 156, 29, 30, 159, @@ -3507,6 +3679,101 @@ void dequantize_row_iq2_xs(const block_iq2_xs * restrict x, float * restrict y, } } +// ====================== 3.0625 bpw (de)-quantization + +void dequantize_row_iq3_xxs(const block_iq3_xxs * restrict x, float * restrict y, int k) { + assert(k % QK_K == 0); + const int nb = k / QK_K; + + uint32_t aux32; + + for (int i = 0; i < nb; i++) { + + const float d = LM_GGML_FP16_TO_FP32(x[i].d); + const uint8_t * qs = x[i].qs; + const uint8_t * scales_and_signs = qs + QK_K/4; + + for (int ib32 = 0; ib32 < QK_K/32; ++ib32) { + memcpy(&aux32, scales_and_signs + 4*ib32, sizeof(uint32_t)); + const float db = d * (0.5f + (aux32 >> 28)) * 0.5f; + for (int l = 0; l < 4; ++l) { + const uint8_t signs = ksigns_iq2xs[(aux32 >> 7*l) & 127]; + const uint8_t * grid1 = (const uint8_t *)(iq3xxs_grid + qs[2*l+0]); + const uint8_t * grid2 = (const uint8_t *)(iq3xxs_grid + qs[2*l+1]); + for (int j = 0; j < 4; ++j) { + y[j+0] = db * grid1[j] * (signs & kmask_iq2xs[j+0] ? -1.f : 1.f); + y[j+4] = db * grid2[j] * (signs & kmask_iq2xs[j+4] ? -1.f : 1.f); + } + y += 8; + } + qs += 8; + } + } +} + +// ====================== 1.5625 bpw (de)-quantization + +void dequantize_row_iq1_s(const block_iq1_s * restrict x, float * restrict y, int k) { + assert(k % QK_K == 0); + const int nb = k / QK_K; + + float db[4]; + uint16_t idx[4]; + //const int8_t * grid[4]; + + for (int i = 0; i < nb; i++) { + + const float d = LM_GGML_FP16_TO_FP32(x[i].d); + const uint8_t * sc = x[i].scales; + const uint8_t * qs = x[i].qs; + + for (int i8 = 0; i8 < QK_K/8; i8 += 4) { + idx[0] = qs[0] | ((sc[0] & 0x08) << 5); + idx[1] = qs[1] | ((sc[0] & 0x80) << 1); + idx[2] = qs[2] | ((sc[1] & 0x08) << 5); + idx[3] = qs[3] | ((sc[1] & 0x80) << 1); + //grid[0] = (const int8_t *)(iq1s_grid + (qs[0] | ((sc[0] & 0x08) << 5))); + //grid[1] = (const int8_t *)(iq1s_grid + (qs[1] | ((sc[0] & 0x80) << 1))); + //grid[2] = (const int8_t *)(iq1s_grid + (qs[2] | ((sc[1] & 0x08) << 5))); + //grid[3] = (const int8_t *)(iq1s_grid + (qs[3] | ((sc[1] & 0x80) << 1))); + db[0] = d * (2*(sc[0] & 7) + 1); + db[1] = d * (2*((sc[0] >> 4) & 7) + 1); + db[2] = d * (2*(sc[1] & 7) + 1); + db[3] = d * (2*((sc[1] >> 4) & 7) + 1); + for (int l = 0; l < 4; ++l) { + const int8_t * grid = (const int8_t *)(iq1s_grid + idx[l]); + for (int j = 0; j < 8; ++j) { + //y[j] = db[l] * grid[l][j]; + y[j] = db[l] * grid[j]; + } + y += 8; + } + qs += 4; + sc += 2; + } + } +} + +static const int8_t kvalues_iq4nl[16] = {-127, -104, -83, -65, -49, -35, -22, -10, 1, 13, 25, 38, 53, 69, 89, 113}; + +void dequantize_row_iq4_nl(const block_iq4_nl * restrict x, float * restrict y, int k) { + assert(k % QK4_NL == 0); + const int nb = k / QK4_NL; + + for (int i = 0; i < nb; i++) { + + const uint8_t * qs = x[i].qs; + + const float d = LM_GGML_FP16_TO_FP32(x[i].d); + for (int j = 0; j < QK4_NL/2; ++j) { + y[j+ 0] = d * kvalues_iq4nl[qs[j] & 
0xf]; + y[j+QK4_NL/2] = d * kvalues_iq4nl[qs[j] >> 4]; + } + y += QK4_NL; + qs += QK4_NL/2; + } +} + //===================================== Q8_K ============================================== void quantize_row_q8_K_reference(const float * restrict x, block_q8_K * restrict y, int k) { @@ -3608,15 +3875,92 @@ static inline __m128i get_scale_shuffle(int i) { } #endif -void lm_ggml_vec_dot_q4_0_q8_0(int n, float * restrict s, const void * restrict vx, const void * restrict vy) { +void lm_ggml_vec_dot_q4_0_q8_0(int n, float * restrict s, size_t bs, const void * restrict vx, size_t bx, const void * restrict vy, size_t by, int nrc) { const int qk = QK8_0; const int nb = n / qk; assert(n % qk == 0); +#if defined(__ARM_FEATURE_MATMUL_INT8) + assert((nrc == 2) || (nrc == 1)); +#else + assert(nrc == 1); +#endif + UNUSED(nrc); + UNUSED(bx); + UNUSED(by); + UNUSED(bs); const block_q4_0 * restrict x = vx; const block_q8_0 * restrict y = vy; +#if defined(__ARM_FEATURE_MATMUL_INT8) + if (nrc == 2) { + const block_q4_0 * restrict vx0 = vx; + const block_q4_0 * restrict vx1 = vx + bx; + + const block_q8_0 * restrict vy0 = vy; + const block_q8_0 * restrict vy1 = vy + by; + + float32x4_t sumv0 = vdupq_n_f32(0.0f); + + for (int i = 0; i < nb; i++) { + const block_q4_0 * restrict b_x0 = &vx0[i]; + const block_q4_0 * restrict b_x1 = &vx1[i]; + const block_q8_0 * restrict b_y0 = &vy0[i]; + const block_q8_0 * restrict b_y1 = &vy1[i]; + + const uint8x16_t m4b = vdupq_n_u8(0x0F); + const int8x16_t s8b = vdupq_n_s8(0x8); + + const uint8x16_t v0_0 = vld1q_u8(b_x0->qs); + const uint8x16_t v0_1 = vld1q_u8(b_x1->qs); + + // 4-bit -> 8-bit + const int8x16_t v0_0l = vreinterpretq_s8_u8(vandq_u8 (v0_0, m4b)); + const int8x16_t v0_0h = vreinterpretq_s8_u8(vshrq_n_u8(v0_0, 4)); + const int8x16_t v0_1l = vreinterpretq_s8_u8(vandq_u8 (v0_1, m4b)); + const int8x16_t v0_1h = vreinterpretq_s8_u8(vshrq_n_u8(v0_1, 4)); + + // sub 8 + const int8x16_t x0_l = vsubq_s8(v0_0l, s8b); + const int8x16_t x0_h = vsubq_s8(v0_0h, s8b); + const int8x16_t x1_l = vsubq_s8(v0_1l, s8b); + const int8x16_t x1_h = vsubq_s8(v0_1h, s8b); + + // load y + const int8x16_t y0_l = vld1q_s8(b_y0->qs); + const int8x16_t y0_h = vld1q_s8(b_y0->qs + 16); + const int8x16_t y1_l = vld1q_s8(b_y1->qs); + const int8x16_t y1_h = vld1q_s8(b_y1->qs + 16); + + float32x4_t scale = {LM_GGML_FP16_TO_FP32(b_x0->d)*LM_GGML_FP16_TO_FP32(b_y0->d), + LM_GGML_FP16_TO_FP32(b_x0->d)*LM_GGML_FP16_TO_FP32(b_y1->d), + LM_GGML_FP16_TO_FP32(b_x1->d)*LM_GGML_FP16_TO_FP32(b_y0->d), + LM_GGML_FP16_TO_FP32(b_x1->d)*LM_GGML_FP16_TO_FP32(b_y1->d)}; + + int8x16_t l0 = vreinterpretq_s8_s64(vzip1q_s64(vreinterpretq_s64_s8(x0_l), vreinterpretq_s64_s8(x1_l))); + int8x16_t l1 = vreinterpretq_s8_s64(vzip2q_s64(vreinterpretq_s64_s8(x0_l), vreinterpretq_s64_s8(x1_l))); + + int8x16_t l2 = vreinterpretq_s8_s64(vzip1q_s64(vreinterpretq_s64_s8(x0_h), vreinterpretq_s64_s8(x1_h))); + int8x16_t l3 = vreinterpretq_s8_s64(vzip2q_s64(vreinterpretq_s64_s8(x0_h), vreinterpretq_s64_s8(x1_h))); + + int8x16_t r0 = vreinterpretq_s8_s64(vzip1q_s64(vreinterpretq_s64_s8(y0_l), vreinterpretq_s64_s8(y1_l))); + int8x16_t r1 = vreinterpretq_s8_s64(vzip2q_s64(vreinterpretq_s64_s8(y0_l), vreinterpretq_s64_s8(y1_l))); + + int8x16_t r2 = vreinterpretq_s8_s64(vzip1q_s64(vreinterpretq_s64_s8(y0_h), vreinterpretq_s64_s8(y1_h))); + int8x16_t r3 = vreinterpretq_s8_s64(vzip2q_s64(vreinterpretq_s64_s8(y0_h), vreinterpretq_s64_s8(y1_h))); + + sumv0 = 
vmlaq_f32(sumv0,(vcvtq_f32_s32(vmmlaq_s32((vmmlaq_s32((vmmlaq_s32((vmmlaq_s32(vdupq_n_s32(0), l0, r0)), + l1, r1)), l2, r2)), l3, r3))), scale); + } + float32x4_t sumv1 = vextq_f32(sumv0, sumv0, 2); + float32x4_t sumv2 = vzip1q_f32(sumv0, sumv1); + + vst1_f32(s, vget_low_f32(sumv2)); + vst1_f32(s + bs, vget_high_f32(sumv2)); + return; + } +#endif #if defined(__ARM_NEON) float32x4_t sumv0 = vdupq_n_f32(0.0f); float32x4_t sumv1 = vdupq_n_f32(0.0f); @@ -3671,15 +4015,15 @@ void lm_ggml_vec_dot_q4_0_q8_0(int n, float * restrict s, const void * restrict /* Compute combined scale for the block */ const __m256 d = _mm256_set1_ps( LM_GGML_FP16_TO_FP32(x[i].d) * LM_GGML_FP16_TO_FP32(y[i].d) ); - __m256i bx = bytes_from_nibbles_32(x[i].qs); + __m256i qx = bytes_from_nibbles_32(x[i].qs); // Now we have a vector with bytes in [ 0 .. 15 ] interval. Offset them into [ -8 .. +7 ] interval. const __m256i off = _mm256_set1_epi8( 8 ); - bx = _mm256_sub_epi8( bx, off ); + qx = _mm256_sub_epi8( qx, off ); - __m256i by = _mm256_loadu_si256((const __m256i *)y[i].qs); + __m256i qy = _mm256_loadu_si256((const __m256i *)y[i].qs); - const __m256 q = mul_sum_i8_pairs_float(bx, by); + const __m256 q = mul_sum_i8_pairs_float(qx, qy); /* Multiply q with scale and accumulate */ acc = _mm256_fmadd_ps( d, q, acc ); @@ -3700,15 +4044,15 @@ void lm_ggml_vec_dot_q4_0_q8_0(int n, float * restrict s, const void * restrict const __m128i tmp = _mm_loadu_si128((const __m128i *)x[i].qs); - __m128i bx = _mm_and_si128(lowMask, tmp); - __m128i by = _mm_loadu_si128((const __m128i *)y[i].qs); - bx = _mm_sub_epi8(bx, off); - const __m128i i32_0 = mul_sum_i8_pairs(bx, by); + __m128i bx_0 = _mm_and_si128(lowMask, tmp); + __m128i by_0 = _mm_loadu_si128((const __m128i *)y[i].qs); + bx_0 = _mm_sub_epi8(bx_0, off); + const __m128i i32_0 = mul_sum_i8_pairs(bx_0, by_0); - bx = _mm_and_si128(lowMask, _mm_srli_epi64(tmp, 4)); - by = _mm_loadu_si128((const __m128i *)(y[i].qs + 16)); - bx = _mm_sub_epi8(bx, off); - const __m128i i32_1 = mul_sum_i8_pairs(bx, by); + bx_0 = _mm_and_si128(lowMask, _mm_srli_epi64(tmp, 4)); + by_0 = _mm_loadu_si128((const __m128i *)(y[i].qs + 16)); + bx_0 = _mm_sub_epi8(bx_0, off); + const __m128i i32_1 = mul_sum_i8_pairs(bx_0, by_0); // Convert int32_t to float __m256 p = _mm256_cvtepi32_ps(MM256_SET_M128I(i32_0, i32_1)); @@ -3898,15 +4242,93 @@ void lm_ggml_vec_dot_q4_0_q8_0(int n, float * restrict s, const void * restrict #endif } -void lm_ggml_vec_dot_q4_1_q8_1(const int n, float * restrict s, const void * restrict vx, const void * restrict vy) { +void lm_ggml_vec_dot_q4_1_q8_1(int n, float * restrict s, size_t bs, const void * restrict vx, size_t bx, const void * restrict vy, size_t by, int nrc) { const int qk = QK8_1; const int nb = n / qk; assert(n % qk == 0); +#if defined(__ARM_FEATURE_MATMUL_INT8) + assert((nrc == 2) || (nrc == 1)); +#else + assert(nrc == 1); +#endif + UNUSED(nrc); + UNUSED(bx); + UNUSED(by); + UNUSED(bs); const block_q4_1 * restrict x = vx; const block_q8_1 * restrict y = vy; +#if defined(__ARM_FEATURE_MATMUL_INT8) + if (nrc == 2) { + const block_q4_1 * restrict vx0 = vx; + const block_q4_1 * restrict vx1 = vx + bx; + const block_q8_1 * restrict vy0 = vy; + const block_q8_1 * restrict vy1 = vy + by; + + float32x4_t sumv0 = vdupq_n_f32(0.0f); + float32x4_t summs0 = vdupq_n_f32(0.0f); + + for (int i = 0; i < nb; i++) { + const block_q4_1 * restrict b_x0 = &vx0[i]; + const block_q4_1 * restrict b_x1 = &vx1[i]; + const block_q8_1 * restrict b_y0 = &vy0[i]; + const block_q8_1 * restrict b_y1 = 
&vy1[i]; + + float32x4_t summs_t = {LM_GGML_FP16_TO_FP32(b_x0->m) * b_y0->s, + LM_GGML_FP16_TO_FP32(b_x1->m) * b_y0->s, + LM_GGML_FP16_TO_FP32(b_x0->m) * b_y1->s, + LM_GGML_FP16_TO_FP32(b_x1->m) * b_y1->s}; + summs0 += summs_t; + + const uint8x16_t m4b = vdupq_n_u8(0x0F); + + const uint8x16_t v0_0 = vld1q_u8(b_x0->qs); + const uint8x16_t v0_1 = vld1q_u8(b_x1->qs); + + // 4-bit -> 8-bit + const int8x16_t x0_l = vreinterpretq_s8_u8(vandq_u8 (v0_0, m4b)); + const int8x16_t x0_h = vreinterpretq_s8_u8(vshrq_n_u8(v0_0, 4)); + const int8x16_t x1_l = vreinterpretq_s8_u8(vandq_u8 (v0_1, m4b)); + const int8x16_t x1_h = vreinterpretq_s8_u8(vshrq_n_u8(v0_1, 4)); + + // load y + const int8x16_t y0_l = vld1q_s8(b_y0->qs); + const int8x16_t y0_h = vld1q_s8(b_y0->qs + 16); + const int8x16_t y1_l = vld1q_s8(b_y1->qs); + const int8x16_t y1_h = vld1q_s8(b_y1->qs + 16); + + // mmla into int32x4_t + float32x4_t scale = {LM_GGML_FP16_TO_FP32(b_x0->d)*LM_GGML_FP16_TO_FP32(b_y0->d), + LM_GGML_FP16_TO_FP32(b_x0->d)*LM_GGML_FP16_TO_FP32(b_y1->d), + LM_GGML_FP16_TO_FP32(b_x1->d)*LM_GGML_FP16_TO_FP32(b_y0->d), + LM_GGML_FP16_TO_FP32(b_x1->d)*LM_GGML_FP16_TO_FP32(b_y1->d)}; + + int8x16_t l0 = vreinterpretq_s8_s64(vzip1q_s64(vreinterpretq_s64_s8(x0_l), vreinterpretq_s64_s8(x1_l))); + int8x16_t l1 = vreinterpretq_s8_s64(vzip2q_s64(vreinterpretq_s64_s8(x0_l), vreinterpretq_s64_s8(x1_l))); + + int8x16_t l2 = vreinterpretq_s8_s64(vzip1q_s64(vreinterpretq_s64_s8(x0_h), vreinterpretq_s64_s8(x1_h))); + int8x16_t l3 = vreinterpretq_s8_s64(vzip2q_s64(vreinterpretq_s64_s8(x0_h), vreinterpretq_s64_s8(x1_h))); + + int8x16_t r0 = vreinterpretq_s8_s64(vzip1q_s64(vreinterpretq_s64_s8(y0_l), vreinterpretq_s64_s8(y1_l))); + int8x16_t r1 = vreinterpretq_s8_s64(vzip2q_s64(vreinterpretq_s64_s8(y0_l), vreinterpretq_s64_s8(y1_l))); + + int8x16_t r2 = vreinterpretq_s8_s64(vzip1q_s64(vreinterpretq_s64_s8(y0_h), vreinterpretq_s64_s8(y1_h))); + int8x16_t r3 = vreinterpretq_s8_s64(vzip2q_s64(vreinterpretq_s64_s8(y0_h), vreinterpretq_s64_s8(y1_h))); + sumv0 = vmlaq_f32(sumv0,(vcvtq_f32_s32(vmmlaq_s32((vmmlaq_s32((vmmlaq_s32((vmmlaq_s32(vdupq_n_s32(0), l0, r0)), + l1, r1)), l2, r2)), l3, r3))), scale); + } + + float32x4_t sumv1 = vextq_f32(sumv0, sumv0, 2); + float32x4_t sumv2 = vzip1q_f32(sumv0, sumv1); + sumv2 = sumv2 + summs0; + + vst1_f32(s, vget_low_f32(sumv2)); + vst1_f32(s + bs, vget_high_f32(sumv2)); + return; + } +#endif // TODO: add WASM SIMD #if defined(__ARM_NEON) float32x4_t sumv0 = vdupq_n_f32(0.0f); @@ -3970,10 +4392,10 @@ void lm_ggml_vec_dot_q4_1_q8_1(const int n, float * restrict s, const void * res const __m256 d0d1 = _mm256_mul_ps( d0v, d1v ); // Load 16 bytes, and unpack 4 bit fields into bytes, making 32 bytes - const __m256i bx = bytes_from_nibbles_32(x[i].qs); - const __m256i by = _mm256_loadu_si256( (const __m256i *)y[i].qs ); + const __m256i qx = bytes_from_nibbles_32(x[i].qs); + const __m256i qy = _mm256_loadu_si256( (const __m256i *)y[i].qs ); - const __m256 xy = mul_sum_us8_pairs_float(bx, by); + const __m256 xy = mul_sum_us8_pairs_float(qx, qy); // Accumulate d0*d1*x*y #if defined(__AVX2__) @@ -4038,12 +4460,17 @@ void lm_ggml_vec_dot_q4_1_q8_1(const int n, float * restrict s, const void * res #endif } -void lm_ggml_vec_dot_q5_0_q8_0(const int n, float * restrict s, const void * restrict vx, const void * restrict vy) { +void lm_ggml_vec_dot_q5_0_q8_0(int n, float * restrict s, size_t bs, const void * restrict vx, size_t bx, const void * restrict vy, size_t by, int nrc) { const int qk = QK8_0; const int nb = n / qk; 
assert(n % qk == 0); assert(qk == QK5_0); + assert(nrc == 1); + UNUSED(nrc); + UNUSED(bx); + UNUSED(by); + UNUSED(bs); const block_q5_0 * restrict x = vx; const block_q8_0 * restrict y = vy; @@ -4187,14 +4614,14 @@ void lm_ggml_vec_dot_q5_0_q8_0(const int n, float * restrict s, const void * res /* Compute combined scale for the block */ const __m256 d = _mm256_set1_ps(LM_GGML_FP16_TO_FP32(x[i].d) * LM_GGML_FP16_TO_FP32(y[i].d)); - __m256i bx = bytes_from_nibbles_32(x[i].qs); + __m256i qx = bytes_from_nibbles_32(x[i].qs); __m256i bxhi = bytes_from_bits_32(x[i].qh); bxhi = _mm256_andnot_si256(bxhi, _mm256_set1_epi8((char)0xF0)); - bx = _mm256_or_si256(bx, bxhi); + qx = _mm256_or_si256(qx, bxhi); - __m256i by = _mm256_loadu_si256((const __m256i *)y[i].qs); + __m256i qy = _mm256_loadu_si256((const __m256i *)y[i].qs); - const __m256 q = mul_sum_i8_pairs_float(bx, by); + const __m256 q = mul_sum_i8_pairs_float(qx, qy); /* Multiply q with scale and accumulate */ acc = _mm256_fmadd_ps(d, q, acc); @@ -4211,21 +4638,21 @@ void lm_ggml_vec_dot_q5_0_q8_0(const int n, float * restrict s, const void * res /* Compute combined scale for the block */ const __m256 d = _mm256_set1_ps(LM_GGML_FP16_TO_FP32(x[i].d) * LM_GGML_FP16_TO_FP32(y[i].d)); - __m256i bx = bytes_from_nibbles_32(x[i].qs); + __m256i bx_0 = bytes_from_nibbles_32(x[i].qs); const __m256i bxhi = bytes_from_bits_32(x[i].qh); __m128i bxhil = _mm256_castsi256_si128(bxhi); __m128i bxhih = _mm256_extractf128_si256(bxhi, 1); bxhil = _mm_andnot_si128(bxhil, mask); bxhih = _mm_andnot_si128(bxhih, mask); - __m128i bxl = _mm256_castsi256_si128(bx); - __m128i bxh = _mm256_extractf128_si256(bx, 1); + __m128i bxl = _mm256_castsi256_si128(bx_0); + __m128i bxh = _mm256_extractf128_si256(bx_0, 1); bxl = _mm_or_si128(bxl, bxhil); bxh = _mm_or_si128(bxh, bxhih); - bx = MM256_SET_M128I(bxh, bxl); + bx_0 = MM256_SET_M128I(bxh, bxl); - const __m256i by = _mm256_loadu_si256((const __m256i *)y[i].qs); + const __m256i by_0 = _mm256_loadu_si256((const __m256i *)y[i].qs); - const __m256 q = mul_sum_i8_pairs_float(bx, by); + const __m256 q = mul_sum_i8_pairs_float(bx_0, by_0); /* Multiply q with scale and accumulate */ acc = _mm256_add_ps(_mm256_mul_ps(d, q), acc); @@ -4324,12 +4751,17 @@ void lm_ggml_vec_dot_q5_0_q8_0(const int n, float * restrict s, const void * res #endif } -void lm_ggml_vec_dot_q5_1_q8_1(const int n, float * restrict s, const void * restrict vx, const void * restrict vy) { +void lm_ggml_vec_dot_q5_1_q8_1(int n, float * restrict s, size_t bs, const void * restrict vx, size_t bx, const void * restrict vy, size_t by, int nrc) { const int qk = QK8_1; const int nb = n / qk; assert(n % qk == 0); assert(qk == QK5_1); + assert(nrc == 1); + UNUSED(nrc); + UNUSED(bx); + UNUSED(by); + UNUSED(bs); const block_q5_1 * restrict x = vx; const block_q8_1 * restrict y = vy; @@ -4486,15 +4918,15 @@ void lm_ggml_vec_dot_q5_1_q8_1(const int n, float * restrict s, const void * res summs += LM_GGML_FP16_TO_FP32(x[i].m) * y[i].s; - __m256i bx = bytes_from_nibbles_32(x[i].qs); + __m256i qx = bytes_from_nibbles_32(x[i].qs); __m256i bxhi = bytes_from_bits_32(x[i].qh); bxhi = _mm256_and_si256(bxhi, _mm256_set1_epi8(0x10)); - bx = _mm256_or_si256(bx, bxhi); + qx = _mm256_or_si256(qx, bxhi); const __m256 dy = _mm256_set1_ps(y[i].d); - const __m256i by = _mm256_loadu_si256((const __m256i *)y[i].qs); + const __m256i qy = _mm256_loadu_si256((const __m256i *)y[i].qs); - const __m256 q = mul_sum_us8_pairs_float(bx, by); + const __m256 q = mul_sum_us8_pairs_float(qx, qy); acc = 
_mm256_fmadd_ps(q, _mm256_mul_ps(dx, dy), acc); } @@ -4513,22 +4945,22 @@ void lm_ggml_vec_dot_q5_1_q8_1(const int n, float * restrict s, const void * res summs += LM_GGML_FP16_TO_FP32(x[i].m) * y[i].s; - __m256i bx = bytes_from_nibbles_32(x[i].qs); + __m256i bx_0 = bytes_from_nibbles_32(x[i].qs); const __m256i bxhi = bytes_from_bits_32(x[i].qh); __m128i bxhil = _mm256_castsi256_si128(bxhi); __m128i bxhih = _mm256_extractf128_si256(bxhi, 1); bxhil = _mm_and_si128(bxhil, mask); bxhih = _mm_and_si128(bxhih, mask); - __m128i bxl = _mm256_castsi256_si128(bx); - __m128i bxh = _mm256_extractf128_si256(bx, 1); + __m128i bxl = _mm256_castsi256_si128(bx_0); + __m128i bxh = _mm256_extractf128_si256(bx_0, 1); bxl = _mm_or_si128(bxl, bxhil); bxh = _mm_or_si128(bxh, bxhih); - bx = MM256_SET_M128I(bxh, bxl); + bx_0 = MM256_SET_M128I(bxh, bxl); const __m256 dy = _mm256_set1_ps(y[i].d); - const __m256i by = _mm256_loadu_si256((const __m256i *)y[i].qs); + const __m256i by_0 = _mm256_loadu_si256((const __m256i *)y[i].qs); - const __m256 q = mul_sum_us8_pairs_float(bx, by); + const __m256 q = mul_sum_us8_pairs_float(bx_0, by_0); acc = _mm256_add_ps(_mm256_mul_ps(q, _mm256_mul_ps(dx, dy)), acc); } @@ -4623,15 +5055,79 @@ void lm_ggml_vec_dot_q5_1_q8_1(const int n, float * restrict s, const void * res #endif } -void lm_ggml_vec_dot_q8_0_q8_0(const int n, float * restrict s, const void * restrict vx, const void * restrict vy) { +void lm_ggml_vec_dot_q8_0_q8_0(int n, float * restrict s, size_t bs, const void * restrict vx, size_t bx, const void * restrict vy, size_t by, int nrc) { const int qk = QK8_0; const int nb = n / qk; assert(n % qk == 0); +#if defined(__ARM_FEATURE_MATMUL_INT8) + assert((nrc == 2) || (nrc == 1)); +#else + assert(nrc == 1); +#endif + UNUSED(nrc); + UNUSED(bx); + UNUSED(by); + UNUSED(bs); const block_q8_0 * restrict x = vx; const block_q8_0 * restrict y = vy; +#if defined(__ARM_FEATURE_MATMUL_INT8) + if (nrc == 2) { + const block_q8_0 * restrict vx0 = vx; + const block_q8_0 * restrict vx1 = vx + bx; + const block_q8_0 * restrict vy0 = vy; + const block_q8_0 * restrict vy1 = vy + by; + + float32x4_t sumv0 = vdupq_n_f32(0.0f); + + for (int i = 0; i < nb; i++) { + const block_q8_0 * restrict b_x0 = &vx0[i]; + const block_q8_0 * restrict b_y0 = &vy0[i]; + + const block_q8_0 * restrict b_x1 = &vx1[i]; + const block_q8_0 * restrict b_y1 = &vy1[i]; + + const int8x16_t x0_l = vld1q_s8(b_x0->qs); + const int8x16_t x0_h = vld1q_s8(b_x0->qs + 16); + const int8x16_t x1_l = vld1q_s8(b_x1->qs); + const int8x16_t x1_h = vld1q_s8(b_x1->qs + 16); + + // load y + const int8x16_t y0_l = vld1q_s8(b_y0->qs); + const int8x16_t y0_h = vld1q_s8(b_y0->qs + 16); + const int8x16_t y1_l = vld1q_s8(b_y1->qs); + const int8x16_t y1_h = vld1q_s8(b_y1->qs + 16); + + float32x4_t scale = {LM_GGML_FP16_TO_FP32(b_x0->d)*LM_GGML_FP16_TO_FP32(b_y0->d), + LM_GGML_FP16_TO_FP32(b_x0->d)*LM_GGML_FP16_TO_FP32(b_y1->d), + LM_GGML_FP16_TO_FP32(b_x1->d)*LM_GGML_FP16_TO_FP32(b_y0->d), + LM_GGML_FP16_TO_FP32(b_x1->d)*LM_GGML_FP16_TO_FP32(b_y1->d)}; + + int8x16_t l0 = vreinterpretq_s8_s64(vzip1q_s64(vreinterpretq_s64_s8(x0_l), vreinterpretq_s64_s8(x1_l))); + int8x16_t l1 = vreinterpretq_s8_s64(vzip2q_s64(vreinterpretq_s64_s8(x0_l), vreinterpretq_s64_s8(x1_l))); + + int8x16_t l2 = vreinterpretq_s8_s64(vzip1q_s64(vreinterpretq_s64_s8(x0_h), vreinterpretq_s64_s8(x1_h))); + int8x16_t l3 = vreinterpretq_s8_s64(vzip2q_s64(vreinterpretq_s64_s8(x0_h), vreinterpretq_s64_s8(x1_h))); + + int8x16_t r0 = 
vreinterpretq_s8_s64(vzip1q_s64(vreinterpretq_s64_s8(y0_l), vreinterpretq_s64_s8(y1_l))); + int8x16_t r1 = vreinterpretq_s8_s64(vzip2q_s64(vreinterpretq_s64_s8(y0_l), vreinterpretq_s64_s8(y1_l))); + + int8x16_t r2 = vreinterpretq_s8_s64(vzip1q_s64(vreinterpretq_s64_s8(y0_h), vreinterpretq_s64_s8(y1_h))); + int8x16_t r3 = vreinterpretq_s8_s64(vzip2q_s64(vreinterpretq_s64_s8(y0_h), vreinterpretq_s64_s8(y1_h))); + + sumv0 = vmlaq_f32(sumv0,(vcvtq_f32_s32(vmmlaq_s32((vmmlaq_s32((vmmlaq_s32((vmmlaq_s32(vdupq_n_s32(0), l0, r0)), + l1, r1)), l2, r2)), l3, r3))), scale); + } + float32x4_t sumv1 = vextq_f32(sumv0, sumv0, 2); + float32x4_t sumv2 = vzip1q_f32(sumv0, sumv1); + + vst1_f32(s, vget_low_f32(sumv2)); + vst1_f32(s + bs, vget_high_f32(sumv2)); + return; + } +#endif #if defined(__ARM_NEON) float32x4_t sumv0 = vdupq_n_f32(0.0f); float32x4_t sumv1 = vdupq_n_f32(0.0f); @@ -4673,10 +5169,10 @@ void lm_ggml_vec_dot_q8_0_q8_0(const int n, float * restrict s, const void * res for (int i = 0; i < nb; ++i) { // Compute combined scale for the block const __m256 d = _mm256_set1_ps(LM_GGML_FP16_TO_FP32(x[i].d) * LM_GGML_FP16_TO_FP32(y[i].d)); - __m256i bx = _mm256_loadu_si256((const __m256i *)x[i].qs); - __m256i by = _mm256_loadu_si256((const __m256i *)y[i].qs); + __m256i qx = _mm256_loadu_si256((const __m256i *)x[i].qs); + __m256i qy = _mm256_loadu_si256((const __m256i *)y[i].qs); - const __m256 q = mul_sum_i8_pairs_float(bx, by); + const __m256 q = mul_sum_i8_pairs_float(qx, qy); // Multiply q with scale and accumulate #if defined(__AVX2__) @@ -4693,10 +5189,10 @@ void lm_ggml_vec_dot_q8_0_q8_0(const int n, float * restrict s, const void * res for (int i = 0; i < nb; i++) { // load elements - vint8m1_t bx = __riscv_vle8_v_i8m1(x[i].qs, vl); - vint8m1_t by = __riscv_vle8_v_i8m1(y[i].qs, vl); + vint8m1_t bx_0 = __riscv_vle8_v_i8m1(x[i].qs, vl); + vint8m1_t by_0 = __riscv_vle8_v_i8m1(y[i].qs, vl); - vint16m2_t vw_mul = __riscv_vwmul_vv_i16m2(bx, by, vl); + vint16m2_t vw_mul = __riscv_vwmul_vv_i16m2(bx_0, by_0, vl); vint32m1_t v_zero = __riscv_vmv_v_x_i32m1(0, vl); vint32m1_t v_sum = __riscv_vwredsum_vs_i16m2_i32m1(vw_mul, v_zero, vl); @@ -4726,7 +5222,12 @@ void lm_ggml_vec_dot_q8_0_q8_0(const int n, float * restrict s, const void * res } #if QK_K == 256 -void lm_ggml_vec_dot_q2_K_q8_K(const int n, float * restrict s, const void * restrict vx, const void * restrict vy) { +void lm_ggml_vec_dot_q2_K_q8_K(int n, float * restrict s, size_t bs, const void * restrict vx, size_t bx, const void * restrict vy, size_t by, int nrc) { + assert(nrc == 1); + UNUSED(nrc); + UNUSED(bx); + UNUSED(by); + UNUSED(bs); const block_q2_K * restrict x = vx; const block_q8_K * restrict y = vy; @@ -5102,7 +5603,12 @@ void lm_ggml_vec_dot_q2_K_q8_K(const int n, float * restrict s, const void * res #else -void lm_ggml_vec_dot_q2_K_q8_K(const int n, float * restrict s, const void * restrict vx, const void * restrict vy) { +void lm_ggml_vec_dot_q2_K_q8_K(int n, float * restrict s, size_t bs, const void * restrict vx, size_t bx, const void * restrict vy, size_t by, int nrc) { + assert(nrc == 1); + UNUSED(nrc); + UNUSED(bx); + UNUSED(by); + UNUSED(bs); const block_q2_K * restrict x = vx; const block_q8_K * restrict y = vy; @@ -5360,8 +5866,13 @@ void lm_ggml_vec_dot_q2_K_q8_K(const int n, float * restrict s, const void * res #endif #if QK_K == 256 -void lm_ggml_vec_dot_q3_K_q8_K(const int n, float * restrict s, const void * restrict vx, const void * restrict vy) { +void lm_ggml_vec_dot_q3_K_q8_K(int n, float * restrict s, size_t bs, 
const void * restrict vx, size_t bx, const void * restrict vy, size_t by, int nrc) { assert(n % QK_K == 0); + assert(nrc == 1); + UNUSED(nrc); + UNUSED(bx); + UNUSED(by); + UNUSED(bs); const uint32_t kmask1 = 0x03030303; const uint32_t kmask2 = 0x0f0f0f0f; @@ -5880,8 +6391,13 @@ void lm_ggml_vec_dot_q3_K_q8_K(const int n, float * restrict s, const void * res #else -void lm_ggml_vec_dot_q3_K_q8_K(const int n, float * restrict s, const void * restrict vx, const void * restrict vy) { +void lm_ggml_vec_dot_q3_K_q8_K(int n, float * restrict s, size_t bs, const void * restrict vx, size_t bx, const void * restrict vy, size_t by, int nrc) { assert(n % QK_K == 0); + assert(nrc == 1); + UNUSED(nrc); + UNUSED(bx); + UNUSED(by); + UNUSED(bs); const block_q3_K * restrict x = vx; const block_q8_K * restrict y = vy; @@ -6223,8 +6739,13 @@ void lm_ggml_vec_dot_q3_K_q8_K(const int n, float * restrict s, const void * res #endif #if QK_K == 256 -void lm_ggml_vec_dot_q4_K_q8_K(const int n, float * restrict s, const void * restrict vx, const void * restrict vy) { +void lm_ggml_vec_dot_q4_K_q8_K(int n, float * restrict s, size_t bs, const void * restrict vx, size_t bx, const void * restrict vy, size_t by, int nrc) { assert(n % QK_K == 0); + assert(nrc == 1); + UNUSED(nrc); + UNUSED(bx); + UNUSED(by); + UNUSED(bs); const block_q4_K * restrict x = vx; const block_q8_K * restrict y = vy; @@ -6579,8 +7100,13 @@ void lm_ggml_vec_dot_q4_K_q8_K(const int n, float * restrict s, const void * res #endif } #else -void lm_ggml_vec_dot_q4_K_q8_K(const int n, float * restrict s, const void * restrict vx, const void * restrict vy) { +void lm_ggml_vec_dot_q4_K_q8_K(int n, float * restrict s, size_t bs, const void * restrict vx, size_t bx, const void * restrict vy, size_t by, int nrc) { assert(n % QK_K == 0); + assert(nrc == 1); + UNUSED(nrc); + UNUSED(bx); + UNUSED(by); + UNUSED(bs); const block_q4_K * restrict x = vx; const block_q8_K * restrict y = vy; @@ -6822,8 +7348,13 @@ void lm_ggml_vec_dot_q4_K_q8_K(const int n, float * restrict s, const void * res #endif #if QK_K == 256 -void lm_ggml_vec_dot_q5_K_q8_K(const int n, float * restrict s, const void * restrict vx, const void * restrict vy) { +void lm_ggml_vec_dot_q5_K_q8_K(int n, float * restrict s, size_t bs, const void * restrict vx, size_t bx, const void * restrict vy, size_t by, int nrc) { assert(n % QK_K == 0); + assert(nrc == 1); + UNUSED(nrc); + UNUSED(bx); + UNUSED(by); + UNUSED(bs); const block_q5_K * restrict x = vx; const block_q8_K * restrict y = vy; @@ -7242,8 +7773,13 @@ void lm_ggml_vec_dot_q5_K_q8_K(const int n, float * restrict s, const void * res #else -void lm_ggml_vec_dot_q5_K_q8_K(const int n, float * restrict s, const void * restrict vx, const void * restrict vy) { +void lm_ggml_vec_dot_q5_K_q8_K(int n, float * restrict s, size_t bs, const void * restrict vx, size_t bx, const void * restrict vy, size_t by, int nrc) { assert(n % QK_K == 0); + assert(nrc == 1); + UNUSED(nrc); + UNUSED(bx); + UNUSED(by); + UNUSED(bs); const block_q5_K * restrict x = vx; const block_q8_K * restrict y = vy; @@ -7508,8 +8044,13 @@ void lm_ggml_vec_dot_q5_K_q8_K(const int n, float * restrict s, const void * res #if QK_K == 256 -void lm_ggml_vec_dot_q6_K_q8_K(const int n, float * restrict s, const void * restrict vx, const void * restrict vy) { +void lm_ggml_vec_dot_q6_K_q8_K(int n, float * restrict s, size_t bs, const void * restrict vx, size_t bx, const void * restrict vy, size_t by, int nrc) { assert(n % QK_K == 0); + assert(nrc == 1); + UNUSED(nrc); + UNUSED(bx); + 
UNUSED(by); + UNUSED(bs); const block_q6_K * restrict x = vx; const block_q8_K * restrict y = vy; @@ -7940,8 +8481,13 @@ void lm_ggml_vec_dot_q6_K_q8_K(const int n, float * restrict s, const void * res #else -void lm_ggml_vec_dot_q6_K_q8_K(const int n, float * restrict s, const void * restrict vx, const void * restrict vy) { +void lm_ggml_vec_dot_q6_K_q8_K(int n, float * restrict s, size_t bs, const void * restrict vx, size_t bx, const void * restrict vy, size_t by, int nrc) { assert(n % QK_K == 0); + assert(nrc == 1); + UNUSED(nrc); + UNUSED(bx); + UNUSED(by); + UNUSED(bs); const block_q6_K * restrict x = vx; const block_q8_K * restrict y = vy; @@ -8270,8 +8816,13 @@ static const int8_t keven_signs_q2xs[1024] = { 1, 1, -1, -1, -1, -1, -1, -1, -1, 1, -1, -1, -1, -1, -1, 1, 1, -1, -1, -1, -1, -1, -1, 1, -1, -1, -1, -1, -1, -1, -1, -1, }; -void lm_ggml_vec_dot_iq2_xxs_q8_K(const int n, float * restrict s, const void * restrict vx, const void * restrict vy) { +void lm_ggml_vec_dot_iq2_xxs_q8_K(int n, float * restrict s, size_t bs, const void * restrict vx, size_t bx, const void * restrict vy, size_t by, int nrc) { assert(n % QK_K == 0); + assert(nrc == 1); + UNUSED(nrc); + UNUSED(bx); + UNUSED(by); + UNUSED(bs); const block_iq2_xxs * restrict x = vx; const block_q8_K * restrict y = vy; @@ -8393,8 +8944,13 @@ void lm_ggml_vec_dot_iq2_xxs_q8_K(const int n, float * restrict s, const void * #endif } -void lm_ggml_vec_dot_iq2_xs_q8_K(const int n, float * restrict s, const void * restrict vx, const void * restrict vy) { +void lm_ggml_vec_dot_iq2_xs_q8_K(int n, float * restrict s, size_t bs, const void * restrict vx, size_t bx, const void * restrict vy, size_t by, int nrc) { assert(n % QK_K == 0); + assert(nrc == 1); + UNUSED(nrc); + UNUSED(bx); + UNUSED(by); + UNUSED(bs); const block_iq2_xs * restrict x = vx; const block_q8_K * restrict y = vy; @@ -8458,17 +9014,36 @@ void lm_ggml_vec_dot_iq2_xs_q8_K(const int n, float * restrict s, const void * r const __m128i m4 = _mm_set1_epi8(0xf); const __m128i m1 = _mm_set1_epi8(1); - const __m128i m511 = _mm_set1_epi16(511); - const __m128i m127 = _mm_set1_epi16(127); + const __m256i m511 = _mm256_set1_epi16(511); + const __m256i mone = _mm256_set1_epi8(1); - const uint64_t * signs64 = (const uint64_t *)keven_signs_q2xs; + static const uint8_t k_bit_helper[32] = { + 0x00, 0x80, 0x80, 0x00, 0x80, 0x00, 0x00, 0x80, 0x80, 0x00, 0x00, 0x80, 0x00, 0x80, 0x80, 0x00, + 0x00, 0x80, 0x80, 0x00, 0x80, 0x00, 0x00, 0x80, 0x80, 0x00, 0x00, 0x80, 0x00, 0x80, 0x80, 0x00, + }; + static const char block_sign_shuffle_mask_1[32] = { + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x02, 0x02, 0x02, 0x02, 0x02, 0x02, 0x02, 0x02, + 0x04, 0x04, 0x04, 0x04, 0x04, 0x04, 0x04, 0x04, 0x06, 0x06, 0x06, 0x06, 0x06, 0x06, 0x06, 0x06, + }; + static const char block_sign_shuffle_mask_2[32] = { + 0x08, 0x08, 0x08, 0x08, 0x08, 0x08, 0x08, 0x08, 0x0a, 0x0a, 0x0a, 0x0a, 0x0a, 0x0a, 0x0a, 0x0a, + 0x0c, 0x0c, 0x0c, 0x0c, 0x0c, 0x0c, 0x0c, 0x0c, 0x0e, 0x0e, 0x0e, 0x0e, 0x0e, 0x0e, 0x0e, 0x0e, + }; + static const uint8_t bit_selector_mask_bytes[32] = { + 0x01, 0x02, 0x04, 0x08, 0x10, 0x20, 0x40, 0x80, 0x01, 0x02, 0x04, 0x08, 0x10, 0x20, 0x40, 0x80, + 0x01, 0x02, 0x04, 0x08, 0x10, 0x20, 0x40, 0x80, 0x01, 0x02, 0x04, 0x08, 0x10, 0x20, 0x40, 0x80, + }; + + const __m256i bit_helper = _mm256_loadu_si256((const __m256i*)k_bit_helper); + const __m256i bit_selector_mask = _mm256_loadu_si256((const __m256i*)bit_selector_mask_bytes); + const __m256i block_sign_shuffle_1 = _mm256_loadu_si256((const 
__m256i*)block_sign_shuffle_mask_1); + const __m256i block_sign_shuffle_2 = _mm256_loadu_si256((const __m256i*)block_sign_shuffle_mask_2); uint64_t aux64; // somewhat hacky, but gives a significant boost in performance - __m128i aux_gindex, aux_sindex; + __m256i aux_gindex; const uint16_t * gindex = (const uint16_t *)&aux_gindex; - const uint16_t * sindex = (const uint16_t *)&aux_sindex; __m256 accumf = _mm256_setzero_ps(); for (int i = 0; i < nb; ++i) { @@ -8483,26 +9058,68 @@ void lm_ggml_vec_dot_iq2_xs_q8_K(const int n, float * restrict s, const void * r __m256i sumi1 = _mm256_setzero_si256(); __m256i sumi2 = _mm256_setzero_si256(); - for (int ib32 = 0; ib32 < QK_K/32; ib32 += 2) { + for (int ib32 = 0; ib32 < QK_K/32; ib32 += 4) { + + const __m256i q2_data = _mm256_loadu_si256((const __m256i*)q2); q2 += 16; + aux_gindex = _mm256_and_si256(q2_data, m511); + + const __m256i partial_sign_bits = _mm256_srli_epi16(q2_data, 9); + const __m256i partial_sign_bits_upper = _mm256_srli_epi16(q2_data, 13); + const __m256i partial_sign_bits_for_counting = _mm256_xor_si256(partial_sign_bits, partial_sign_bits_upper); + + const __m256i odd_bits = _mm256_shuffle_epi8(bit_helper, partial_sign_bits_for_counting); + const __m256i full_sign_bits = _mm256_or_si256(partial_sign_bits, odd_bits); + const __m256i q8_1 = _mm256_loadu_si256((const __m256i *)q8); q8 += 32; const __m256i q8_2 = _mm256_loadu_si256((const __m256i *)q8); q8 += 32; - const __m128i q2_data = _mm_loadu_si128((const __m128i*)q2); q2 += 8; - aux_gindex = _mm_and_si128(q2_data, m511); - aux_sindex = _mm_and_si128(_mm_srli_epi16(q2_data, 9), m127); - const __m256i q2_1 = _mm256_set_epi64x(iq2xs_grid[gindex[3]], iq2xs_grid[gindex[2]], iq2xs_grid[gindex[1]], iq2xs_grid[gindex[0]]); - const __m256i q2_2 = _mm256_set_epi64x(iq2xs_grid[gindex[7]], iq2xs_grid[gindex[6]], iq2xs_grid[gindex[5]], iq2xs_grid[gindex[4]]); - const __m256i s2_1 = _mm256_set_epi64x(signs64[sindex[3]], signs64[sindex[2]], signs64[sindex[1]], signs64[sindex[0]]); - const __m256i s2_2 = _mm256_set_epi64x(signs64[sindex[7]], signs64[sindex[6]], signs64[sindex[5]], signs64[sindex[4]]); - const __m256i q8s_1 = _mm256_sign_epi8(q8_1, s2_1); - const __m256i q8s_2 = _mm256_sign_epi8(q8_2, s2_2); + const __m256i q8_3 = _mm256_loadu_si256((const __m256i *)q8); q8 += 32; + const __m256i q8_4 = _mm256_loadu_si256((const __m256i *)q8); q8 += 32; + + const __m256i q2_1 = _mm256_set_epi64x(iq2xs_grid[gindex[ 3]], iq2xs_grid[gindex[ 2]], + iq2xs_grid[gindex[ 1]], iq2xs_grid[gindex[ 0]]); + const __m256i q2_2 = _mm256_set_epi64x(iq2xs_grid[gindex[ 7]], iq2xs_grid[gindex[ 6]], + iq2xs_grid[gindex[ 5]], iq2xs_grid[gindex[ 4]]); + const __m256i q2_3 = _mm256_set_epi64x(iq2xs_grid[gindex[11]], iq2xs_grid[gindex[10]], + iq2xs_grid[gindex[ 9]], iq2xs_grid[gindex[ 8]]); + const __m256i q2_4 = _mm256_set_epi64x(iq2xs_grid[gindex[15]], iq2xs_grid[gindex[14]], + iq2xs_grid[gindex[13]], iq2xs_grid[gindex[12]]); + + const __m128i full_signs_l = _mm256_castsi256_si128(full_sign_bits); + const __m128i full_signs_h = _mm256_extractf128_si256(full_sign_bits, 1); + const __m256i full_signs_1 = _mm256_set_m128i(full_signs_l, full_signs_l); + const __m256i full_signs_2 = _mm256_set_m128i(full_signs_h, full_signs_h); + + __m256i signs; + signs = _mm256_shuffle_epi8(full_signs_1, block_sign_shuffle_1); + signs = _mm256_cmpeq_epi8(_mm256_and_si256(signs, bit_selector_mask), bit_selector_mask); + const __m256i q8s_1 = _mm256_sign_epi8(q8_1, _mm256_or_si256(signs, mone)); + + signs = 
_mm256_shuffle_epi8(full_signs_1, block_sign_shuffle_2); + signs = _mm256_cmpeq_epi8(_mm256_and_si256(signs, bit_selector_mask), bit_selector_mask); + const __m256i q8s_2 = _mm256_sign_epi8(q8_2, _mm256_or_si256(signs, mone)); + + signs = _mm256_shuffle_epi8(full_signs_2, block_sign_shuffle_1); + signs = _mm256_cmpeq_epi8(_mm256_and_si256(signs, bit_selector_mask), bit_selector_mask); + const __m256i q8s_3 = _mm256_sign_epi8(q8_3, _mm256_or_si256(signs, mone)); + + signs = _mm256_shuffle_epi8(full_signs_2, block_sign_shuffle_2); + signs = _mm256_cmpeq_epi8(_mm256_and_si256(signs, bit_selector_mask), bit_selector_mask); + const __m256i q8s_4 = _mm256_sign_epi8(q8_4, _mm256_or_si256(signs, mone)); + const __m256i dot1 = _mm256_maddubs_epi16(q2_1, q8s_1); const __m256i dot2 = _mm256_maddubs_epi16(q2_2, q8s_2); + const __m256i dot3 = _mm256_maddubs_epi16(q2_3, q8s_3); + const __m256i dot4 = _mm256_maddubs_epi16(q2_4, q8s_4); const __m256i sc1 = _mm256_cvtepi8_epi16(_mm_shuffle_epi8(scales, get_scale_shuffle(ib32+0))); const __m256i sc2 = _mm256_cvtepi8_epi16(_mm_shuffle_epi8(scales, get_scale_shuffle(ib32+1))); + const __m256i sc3 = _mm256_cvtepi8_epi16(_mm_shuffle_epi8(scales, get_scale_shuffle(ib32+2))); + const __m256i sc4 = _mm256_cvtepi8_epi16(_mm_shuffle_epi8(scales, get_scale_shuffle(ib32+3))); sumi1 = _mm256_add_epi32(sumi1, _mm256_madd_epi16(dot1, sc1)); sumi2 = _mm256_add_epi32(sumi2, _mm256_madd_epi16(dot2, sc2)); + sumi1 = _mm256_add_epi32(sumi1, _mm256_madd_epi16(dot3, sc3)); + sumi2 = _mm256_add_epi32(sumi2, _mm256_madd_epi16(dot4, sc4)); } accumf = _mm256_fmadd_ps(_mm256_set1_ps(d), _mm256_cvtepi32_ps(_mm256_add_epi32(sumi1, sumi2)), accumf); @@ -8551,70 +9168,478 @@ void lm_ggml_vec_dot_iq2_xs_q8_K(const int n, float * restrict s, const void * r #endif } -// ================================ IQ2 quantization ============================================= +void lm_ggml_vec_dot_iq3_xxs_q8_K(int n, float * restrict s, size_t bs, const void * restrict vx, size_t bx, const void * restrict vy, size_t by, int nrc) { + assert(n % QK_K == 0); + assert(nrc == 1); + UNUSED(nrc); + UNUSED(bx); + UNUSED(by); + UNUSED(bs); -typedef struct { - uint64_t * grid; - int * map; - uint16_t * neighbours; -} iq2_entry_t; + const block_iq3_xxs * restrict x = vx; + const block_q8_K * restrict y = vy; -static iq2_entry_t iq2_data[2] = { - {NULL, NULL, NULL}, - {NULL, NULL, NULL}, -}; + const int nb = n / QK_K; -static inline int iq2_data_index(int grid_size) { - LM_GGML_ASSERT(grid_size == 256 || grid_size == 512); - return grid_size == 256 ? 0 : 1; -} +#if defined(__ARM_NEON) -static int iq2_compare_func(const void * left, const void * right) { - const int * l = (const int *)left; - const int * r = (const int *)right; - return l[0] < r[0] ? -1 : l[0] > r[0] ? 1 : l[1] < r[1] ? -1 : l[1] > r[1] ? 
1 : 0; -} + const uint64_t * signs64 = (const uint64_t *)keven_signs_q2xs; -void iq2xs_init_impl(int grid_size) { - const int gindex = iq2_data_index(grid_size); - if (iq2_data[gindex].grid) { - return; + uint32_t aux32[2]; + + lm_ggml_int8x16x4_t q3s; + lm_ggml_int8x16x4_t q8b; + + float sumf = 0; + for (int i = 0; i < nb; ++i) { + const float d = LM_GGML_FP16_TO_FP32(x[i].d) * y[i].d; + const uint8_t * restrict q3 = x[i].qs; + const uint8_t * restrict gas = x[i].qs + QK_K/4; + const int8_t * restrict q8 = y[i].qs; + float sumf1 = 0, sumf2 = 0; + for (int ib32 = 0; ib32 < QK_K/32; ib32 += 2) { + q8b = lm_ggml_vld1q_s8_x4(q8); q8 += 64; + memcpy(aux32, gas, 2*sizeof(uint32_t)); gas += 2*sizeof(uint32_t); + const uint32x4_t aux32x4_0 = lm_ggml_vld1q_u32(iq3xxs_grid[q3[ 0]], iq3xxs_grid[q3[ 1]], iq3xxs_grid[q3[ 2]], iq3xxs_grid[q3[ 3]]); + const uint32x4_t aux32x4_1 = lm_ggml_vld1q_u32(iq3xxs_grid[q3[ 4]], iq3xxs_grid[q3[ 5]], iq3xxs_grid[q3[ 6]], iq3xxs_grid[q3[ 7]]); + const uint32x4_t aux32x4_2 = lm_ggml_vld1q_u32(iq3xxs_grid[q3[ 8]], iq3xxs_grid[q3[ 9]], iq3xxs_grid[q3[10]], iq3xxs_grid[q3[11]]); + const uint32x4_t aux32x4_3 = lm_ggml_vld1q_u32(iq3xxs_grid[q3[12]], iq3xxs_grid[q3[13]], iq3xxs_grid[q3[14]], iq3xxs_grid[q3[15]]); + q3 += 16; + q3s.val[0] = vcombine_s8(vld1_s8((const void *)(signs64 + ((aux32[0] >> 0) & 127))), vld1_s8((const void *)(signs64 + ((aux32[0] >> 7) & 127)))); + q3s.val[1] = vcombine_s8(vld1_s8((const void *)(signs64 + ((aux32[0] >> 14) & 127))), vld1_s8((const void *)(signs64 + ((aux32[0] >> 21) & 127)))); + q3s.val[2] = vcombine_s8(vld1_s8((const void *)(signs64 + ((aux32[1] >> 0) & 127))), vld1_s8((const void *)(signs64 + ((aux32[1] >> 7) & 127)))); + q3s.val[3] = vcombine_s8(vld1_s8((const void *)(signs64 + ((aux32[1] >> 14) & 127))), vld1_s8((const void *)(signs64 + ((aux32[1] >> 21) & 127)))); + q3s.val[0] = vmulq_s8(q3s.val[0], vreinterpretq_s8_u32(aux32x4_0)); + q3s.val[1] = vmulq_s8(q3s.val[1], vreinterpretq_s8_u32(aux32x4_1)); + q3s.val[2] = vmulq_s8(q3s.val[2], vreinterpretq_s8_u32(aux32x4_2)); + q3s.val[3] = vmulq_s8(q3s.val[3], vreinterpretq_s8_u32(aux32x4_3)); + const int32x4_t p1 = lm_ggml_vdotq_s32(lm_ggml_vdotq_s32(vdupq_n_s32(0), q3s.val[0], q8b.val[0]), q3s.val[1], q8b.val[1]); + const int32x4_t p2 = lm_ggml_vdotq_s32(lm_ggml_vdotq_s32(vdupq_n_s32(0), q3s.val[2], q8b.val[2]), q3s.val[3], q8b.val[3]); + sumf1 += vaddvq_s32(p1) * (0.5f + (aux32[0] >> 28)); + sumf2 += vaddvq_s32(p2) * (0.5f + (aux32[1] >> 28)); + } + sumf += d*(sumf1 + sumf2); } - static const uint16_t kgrid_256[256] = { - 0, 2, 5, 8, 10, 17, 20, 32, 34, 40, 42, 65, 68, 80, 88, 97, - 100, 128, 130, 138, 162, 257, 260, 272, 277, 320, 388, 408, 512, 514, 546, 642, - 1025, 1028, 1040, 1057, 1060, 1088, 1090, 1096, 1120, 1153, 1156, 1168, 1188, 1280, 1282, 1288, - 1312, 1350, 1385, 1408, 1425, 1545, 1552, 1600, 1668, 1700, 2048, 2053, 2056, 2068, 2088, 2113, - 2116, 2128, 2130, 2184, 2308, 2368, 2562, 2580, 4097, 4100, 4112, 4129, 4160, 4192, 4228, 4240, - 4245, 4352, 4360, 4384, 4432, 4442, 4480, 4644, 4677, 5120, 5128, 5152, 5157, 5193, 5248, 5400, - 5474, 5632, 5654, 6145, 6148, 6160, 6208, 6273, 6400, 6405, 6560, 6737, 8192, 8194, 8202, 8260, - 8289, 8320, 8322, 8489, 8520, 8704, 8706, 9217, 9220, 9232, 9280, 9302, 9472, 9537, 9572, 9872, - 10248, 10272, 10388, 10820, 16385, 16388, 16400, 16408, 16417, 16420, 16448, 16456, 16470, 16480, 16513, 16516, - 16528, 16640, 16672, 16737, 16768, 16773, 16897, 16912, 16968, 16982, 17000, 17408, 17416, 17440, 17536, 17561, - 17682, 17700, 
17920, 18433, 18436, 18448, 18496, 18501, 18688, 18776, 18785, 18818, 19013, 19088, 20480, 20488, - 20497, 20505, 20512, 20608, 20616, 20740, 20802, 20900, 21137, 21648, 21650, 21770, 22017, 22100, 22528, 22545, - 22553, 22628, 22848, 23048, 24580, 24592, 24640, 24680, 24832, 24917, 25112, 25184, 25600, 25605, 25872, 25874, - 25988, 26690, 32768, 32770, 32778, 32833, 32898, 33028, 33048, 33088, 33297, 33793, 33796, 33808, 33813, 33856, - 33888, 34048, 34118, 34196, 34313, 34368, 34400, 34818, 35076, 35345, 36868, 36880, 36900, 36928, 37025, 37142, - 37248, 37445, 37888, 37922, 37956, 38225, 39041, 39200, 40962, 41040, 41093, 41225, 41472, 42008, 43088, 43268, - }; - static const uint16_t kgrid_512[512] = { - 0, 2, 5, 8, 10, 17, 20, 22, 25, 32, 34, 37, 40, 65, 68, 70, - 73, 80, 82, 85, 88, 97, 100, 128, 130, 133, 136, 145, 148, 153, 160, 257, - 260, 262, 265, 272, 274, 277, 280, 282, 289, 292, 320, 322, 325, 328, 337, 340, - 352, 360, 385, 388, 400, 512, 514, 517, 520, 529, 532, 544, 577, 580, 592, 597, - 640, 650, 1025, 1028, 1030, 1033, 1040, 1042, 1045, 1048, 1057, 1060, 1088, 1090, 1093, 1096, - 1105, 1108, 1110, 1120, 1153, 1156, 1168, 1280, 1282, 1285, 1288, 1297, 1300, 1312, 1345, 1348, - 1360, 1377, 1408, 1537, 1540, 1552, 1574, 1600, 1602, 1668, 2048, 2050, 2053, 2056, 2058, 2065, - 2068, 2080, 2085, 2113, 2116, 2128, 2136, 2176, 2208, 2218, 2305, 2308, 2320, 2368, 2433, 2441, - 2560, 2592, 2600, 2710, 2720, 4097, 4100, 4102, 4105, 4112, 4114, 4117, 4120, 4129, 4132, 4160, - 4162, 4165, 4168, 4177, 4180, 4192, 4202, 4225, 4228, 4240, 4352, 4354, 4357, 4360, 4369, 4372, - 4384, 4417, 4420, 4432, 4480, 4500, 4502, 4609, 4612, 4614, 4624, 4672, 4704, 5120, 5122, 5125, - 5128, 5137, 5140, 5152, 5185, 5188, 5193, 5200, 5220, 5248, 5377, 5380, 5392, 5440, 5632, 5652, - 5705, 6145, 6148, 6160, 6162, 6208, 6228, 6278, 6400, 6405, 6502, 6737, 6825, 8192, 8194, 8197, - 8200, 8202, 8209, 8212, 8224, 8257, 8260, 8272, 8320, 8352, 8449, 8452, 8464, 8512, 8520, 8549, - 8704, 8738, 8832, 8872, 9217, 9220, 9232, 9257, 9280, 9472, 9537, 9554, 9625, 9729, 9754, 9894, - 10240, 10248, 10250, 10272, 10325, 10376, 10402, 10600, 10640, 10760, 10784, 10882, 10888, 10890, 16385, 16388, + *s = 0.5f * sumf; + +#elif defined(__AVX2__) + + const uint64_t * signs64 = (const uint64_t *)keven_signs_q2xs; + + uint32_t aux32[2]; + + __m256 accumf = _mm256_setzero_ps(); + for (int i = 0; i < nb; ++i) { + const float d = LM_GGML_FP16_TO_FP32(x[i].d) * y[i].d; + const uint8_t * restrict q3 = x[i].qs; + const uint8_t * restrict gas = x[i].qs + QK_K/4; + const int8_t * restrict q8 = y[i].qs; + __m256i sumi1 = _mm256_setzero_si256(); + __m256i sumi2 = _mm256_setzero_si256(); + for (int ib32 = 0; ib32 < QK_K/32; ib32 += 2) { + const __m256i q8_1 = _mm256_loadu_si256((const __m256i *)q8); q8 += 32; + const __m256i q8_2 = _mm256_loadu_si256((const __m256i *)q8); q8 += 32; + const __m256i q2_1 = _mm256_set_epi32(iq3xxs_grid[q3[7]], iq3xxs_grid[q3[6]], iq3xxs_grid[q3[5]], iq3xxs_grid[q3[4]], + iq3xxs_grid[q3[3]], iq3xxs_grid[q3[2]], iq3xxs_grid[q3[1]], iq3xxs_grid[q3[0]]); + q3 += 8; + const __m256i q2_2 = _mm256_set_epi32(iq3xxs_grid[q3[7]], iq3xxs_grid[q3[6]], iq3xxs_grid[q3[5]], iq3xxs_grid[q3[4]], + iq3xxs_grid[q3[3]], iq3xxs_grid[q3[2]], iq3xxs_grid[q3[1]], iq3xxs_grid[q3[0]]); + q3 += 8; + memcpy(aux32, gas, 8); gas += 8; + const __m256i s2_1 = _mm256_set_epi64x(signs64[(aux32[0] >> 21) & 127], signs64[(aux32[0] >> 14) & 127], + signs64[(aux32[0] >> 7) & 127], signs64[(aux32[0] >> 0) & 127]); + const __m256i s2_2 = 
_mm256_set_epi64x(signs64[(aux32[1] >> 21) & 127], signs64[(aux32[1] >> 14) & 127], + signs64[(aux32[1] >> 7) & 127], signs64[(aux32[1] >> 0) & 127]); + const __m256i q8s_1 = _mm256_sign_epi8(q8_1, s2_1); + const __m256i q8s_2 = _mm256_sign_epi8(q8_2, s2_2); + const __m256i dot1 = _mm256_maddubs_epi16(q2_1, q8s_1); + const __m256i dot2 = _mm256_maddubs_epi16(q2_2, q8s_2); + const uint16_t ls1 = aux32[0] >> 28; + const uint16_t ls2 = aux32[1] >> 28; + const __m256i p1 = _mm256_madd_epi16(dot1, _mm256_set1_epi16(2*ls1+1)); + const __m256i p2 = _mm256_madd_epi16(dot2, _mm256_set1_epi16(2*ls2+1)); + sumi1 = _mm256_add_epi32(sumi1, p1); + sumi2 = _mm256_add_epi32(sumi2, p2); + } + + accumf = _mm256_fmadd_ps(_mm256_set1_ps(d), _mm256_cvtepi32_ps(_mm256_add_epi32(sumi1, sumi2)), accumf); + + } + + *s = 0.25f * hsum_float_8(accumf); + +#else + + uint32_t aux32; + + float sumf = 0.f; + for (int i = 0; i < nb; ++i) { + const float d = LM_GGML_FP16_TO_FP32(x[i].d) * y[i].d; + const uint8_t * restrict q3 = x[i].qs; + const uint8_t * restrict gas = x[i].qs + QK_K/4; + const int8_t * restrict q8 = y[i].qs; + int32_t bsum = 0; + for (int ib32 = 0; ib32 < QK_K/32; ++ib32) { + memcpy(&aux32, gas, sizeof(uint32_t)); gas += sizeof(uint32_t); + const uint32_t ls = 2*(aux32 >> 28) + 1; + int32_t sumi = 0; + for (int l = 0; l < 4; ++l) { + const uint8_t * grid1 = (const uint8_t *)(iq3xxs_grid + q3[2*l+0]); + const uint8_t * grid2 = (const uint8_t *)(iq3xxs_grid + q3[2*l+1]); + const uint8_t signs = ksigns_iq2xs[(aux32 >> 7*l) & 127]; + for (int j = 0; j < 4; ++j) { + sumi += grid1[j] * q8[j+0] * (signs & kmask_iq2xs[j+0] ? -1 : 1); + sumi += grid2[j] * q8[j+4] * (signs & kmask_iq2xs[j+4] ? -1 : 1); + } + q8 += 8; + } + q3 += 8; + bsum += sumi * ls; + } + sumf += d * bsum; + } + *s = 0.25f * sumf; +#endif +} + +#ifdef __AVX2__ +static inline __m256i mul_add_epi8(const __m256i x, const __m256i y) { + const __m256i ax = _mm256_sign_epi8(x, x); + const __m256i sy = _mm256_sign_epi8(y, x); + return _mm256_maddubs_epi16(ax, sy); +} +#endif + +void lm_ggml_vec_dot_iq1_s_q8_K (int n, float * LM_GGML_RESTRICT s, size_t bs, const void * LM_GGML_RESTRICT vx, size_t bx, const void * LM_GGML_RESTRICT vy, size_t by, int nrc) { + assert(n % QK_K == 0); + assert(nrc == 1); + UNUSED(nrc); + UNUSED(bx); + UNUSED(by); + UNUSED(bs); + + const block_iq1_s * restrict x = vx; + const block_q8_K * restrict y = vy; + + const int nb = n / QK_K; + +#if defined __ARM_NEON + + const uint8x16_t m8 = vdupq_n_u8(0x08); + const uint8x16_t m7 = vdupq_n_u8(0x07); + const uint8x16_t m1 = vdupq_n_u8(0x01); + const int32x4_t vzero = vdupq_n_s32(0); + + uint16_t gindex[8]; + uint16x8x2_t vindex; + int8x16x4_t q1b; + int8x16x4_t q8b; + uint16x8x4_t scales; + int32x4x2_t sumi; + int32x4x2_t dotq; + + float sumf = 0; + for (int i = 0; i < nb; ++i) { + + const int8_t * q8 = y[i].qs; + const uint8_t * qs = x[i].qs; + const uint8_t * sc = x[i].scales; + + sumi.val[0] = sumi.val[1] = vzero; + + for (int i128 = 0; i128 < QK_K/128; ++i128) { + const uint8x16_t ql = vld1q_u8(qs); qs += 16; + const uint8x8_t tm1 = vld1_u8 (sc); sc += 8; + const uint8x8_t tm2 = vshr_n_u8(tm1, 4); + const uint8x16_t qh = vcombine_u8(vzip1_u8(tm1, tm2), vzip2_u8(tm1, tm2)); + const uint8x16_t hbit = vandq_u8(qh, m8); + vindex.val[0] = vorrq_u16(vmovl_u8(vget_low_u8 (ql)), vshlq_n_u16(vmovl_u8(vget_low_u8 (hbit)), 5)); + vindex.val[1] = vorrq_u16(vmovl_u8(vget_high_u8(ql)), vshlq_n_u16(vmovl_u8(vget_high_u8(hbit)), 5)); + const uint8x16_t scales8 = vorrq_u8(vshlq_n_u8(vandq_u8(qh, 
m7), 1), m1); + scales.val[0] = vmovl_u8(vget_low_u8 (scales8)); + scales.val[1] = vmovl_u8(vget_high_u8 (scales8)); + + for (int l = 0; l < 2; ++l) { + vst1q_u16(gindex+0, vindex.val[l]); + q1b.val[0] = vcombine_s8(vld1_s8((const void *)(iq1s_grid+gindex[0])), vld1_s8((const void *)(iq1s_grid+gindex[1]))); + q1b.val[1] = vcombine_s8(vld1_s8((const void *)(iq1s_grid+gindex[2])), vld1_s8((const void *)(iq1s_grid+gindex[3]))); + q1b.val[2] = vcombine_s8(vld1_s8((const void *)(iq1s_grid+gindex[4])), vld1_s8((const void *)(iq1s_grid+gindex[5]))); + q1b.val[3] = vcombine_s8(vld1_s8((const void *)(iq1s_grid+gindex[6])), vld1_s8((const void *)(iq1s_grid+gindex[7]))); + q8b = lm_ggml_vld1q_s8_x4(q8); q8 += 64; + + dotq.val[0] = vpaddq_s32(lm_ggml_vdotq_s32(vzero, q1b.val[0], q8b.val[0]), lm_ggml_vdotq_s32(vzero, q1b.val[1], q8b.val[1])); + dotq.val[1] = vpaddq_s32(lm_ggml_vdotq_s32(vzero, q1b.val[2], q8b.val[2]), lm_ggml_vdotq_s32(vzero, q1b.val[3], q8b.val[3])); + + sumi.val[0] = vmlaq_s32(sumi.val[0], dotq.val[0], vreinterpretq_s32_u32(vmovl_u16(vget_low_u16 (scales.val[l])))); + sumi.val[1] = vmlaq_s32(sumi.val[1], dotq.val[1], vreinterpretq_s32_u32(vmovl_u16(vget_high_u16(scales.val[l])))); + } + } + + sumf += y[i].d * LM_GGML_FP16_TO_FP32(x[i].d) * vaddvq_s32(vaddq_s32(sumi.val[0], sumi.val[1])); + } + + *s = sumf; + +#elif defined __AVX2__ + + const __m128i m8 = _mm_set1_epi8(0x08); + const __m128i m7 = _mm_set1_epi8(0x07); + const __m128i m1 = _mm_set1_epi8(0x01); + const __m128i shuffle_h = _mm_set_epi8(15, 7, 14, 6, 13, 5, 12, 4, 11, 3, 10, 2, 9, 1, 8, 0); + const __m128i shuffle_s[4] = { + _mm_set_epi32(0x03030303, 0x02020202, 0x01010101, 0x00000000), + _mm_set_epi32(0x07070707, 0x06060606, 0x05050505, 0x04040404), + _mm_set_epi32(0x0b0b0b0b, 0x0a0a0a0a, 0x09090909, 0x08080808), + _mm_set_epi32(0x0f0f0f0f, 0x0e0e0e0e, 0x0d0d0d0d, 0x0c0c0c0c) + }; + + uint64_t aux64; + + __m256i v_gindex; + const uint16_t * gindex = (const uint16_t *)&v_gindex; + + __m256 accum = _mm256_setzero_ps(); + for (int i = 0; i < nb; ++i) { + + const int8_t * q8 = y[i].qs; + const uint8_t * qs = x[i].qs; + const uint8_t * sc = x[i].scales; + + __m256i sumi = _mm256_setzero_si256(); + for (int i128 = 0; i128 < QK_K/128; ++i128) { + const __m128i ql = _mm_loadu_si128((const __m128i*)qs); qs += 16; + memcpy(&aux64, sc, 8); sc += 8; + const __m128i qh = _mm_shuffle_epi8(_mm_set_epi64x(aux64 >> 4, aux64), shuffle_h); + const __m256i hbit = _mm256_cvtepu8_epi16(_mm_and_si128(qh, m8)); + v_gindex = _mm256_or_si256(_mm256_cvtepu8_epi16(ql), _mm256_slli_epi16(hbit, 5)); + const __m128i scales = _mm_or_si128(_mm_slli_epi16(_mm_and_si128(qh, m7), 1), m1); + + for (int i32 = 0; i32 < 4; ++i32) { + const __m256i q8b = _mm256_loadu_si256((const __m256i*)q8); q8 += 32; + const __m256i q1b = _mm256_set_epi64x(iq1s_grid[gindex[4*i32+3]], iq1s_grid[gindex[4*i32+2]], + iq1s_grid[gindex[4*i32+1]], iq1s_grid[gindex[4*i32+0]]); + const __m256i dot = mul_add_epi8(q1b, q8b); + const __m256i s16 = _mm256_cvtepi8_epi16(_mm_shuffle_epi8(scales, shuffle_s[i32])); + const __m256i p = _mm256_madd_epi16(s16, dot); + sumi = _mm256_add_epi32(sumi, p); + } + + } + + accum = _mm256_fmadd_ps(_mm256_set1_ps(y[i].d * LM_GGML_FP16_TO_FP32(x[i].d)), _mm256_cvtepi32_ps(sumi), accum); + + } + + *s = hsum_float_8(accum); + +#else + + int db[4]; + uint16_t idx[4]; + + float sumf = 0; + for (int i = 0; i < nb; ++i) { + + const int8_t * q8 = y[i].qs; + const uint8_t * qs = x[i].qs; + const uint8_t * sc = x[i].scales; + + int sumi = 0; + for (int i32 = 0; i32 
< QK_K/32; ++i32) { + idx[0] = qs[0] | ((sc[0] & 0x08) << 5); + idx[1] = qs[1] | ((sc[0] & 0x80) << 1); + idx[2] = qs[2] | ((sc[1] & 0x08) << 5); + idx[3] = qs[3] | ((sc[1] & 0x80) << 1); + db[0] = (2*(sc[0] & 7) + 1); + db[1] = (2*((sc[0] >> 4) & 7) + 1); + db[2] = (2*(sc[1] & 7) + 1); + db[3] = (2*((sc[1] >> 4) & 7) + 1); + for (int l = 0; l < 4; ++l) { + const int8_t * grid = (const int8_t *)(iq1s_grid + idx[l]); + int suml = 0; + for (int j = 0; j < 8; ++j) suml += q8[j] * grid[j]; + sumi += db[l] * suml; + q8 += 8; + } + qs += 4; + sc += 2; + } + + sumf += LM_GGML_FP16_TO_FP32(x[i].d) * y[i].d * sumi; + } + + *s = sumf; + +#endif +} + +void lm_ggml_vec_dot_iq4_nl_q8_0(int n, float * restrict s, size_t bs, const void * restrict vx, size_t bx, const void * restrict vy, size_t by, int nrc) { + assert(nrc == 1); + UNUSED(nrc); + UNUSED(bx); + UNUSED(by); + UNUSED(bs); + assert(n % QK4_NL == 0); + static_assert(QK4_NL == QK8_0, "QK4_NL and QK8_0 must be the same"); + + const block_iq4_nl * restrict x = vx; + const block_q8_0 * restrict y = vy; + + const int nb = n / QK4_NL; + +#if defined __ARM_NEON + const int8x16_t values = vld1q_s8(kvalues_iq4nl); + const uint8x16_t m4b = vdupq_n_u8(0x0f); + uint8x16x2_t q4bits; + int8x16x4_t q4b; + int8x16x4_t q8b; + int32x4_t prod_1, prod_2; + + float sumf = 0; + + for (int ib = 0; ib < nb; ib += 2) { + + q4bits.val[0] = vld1q_u8(x[ib+0].qs); + q4bits.val[1] = vld1q_u8(x[ib+1].qs); + q8b.val[0] = vld1q_s8(y[ib+0].qs); + q8b.val[1] = vld1q_s8(y[ib+0].qs + 16); + q8b.val[2] = vld1q_s8(y[ib+1].qs); + q8b.val[3] = vld1q_s8(y[ib+1].qs + 16); + + q4b.val[0] = vqtbl1q_s8(values, vandq_u8(q4bits.val[0], m4b)); + q4b.val[1] = vqtbl1q_s8(values, vshrq_n_u8(q4bits.val[0], 4)); + q4b.val[2] = vqtbl1q_s8(values, vandq_u8(q4bits.val[1], m4b)); + q4b.val[3] = vqtbl1q_s8(values, vshrq_n_u8(q4bits.val[1], 4)); + + prod_1 = lm_ggml_vdotq_s32(lm_ggml_vdotq_s32(vdupq_n_s32(0), q4b.val[0], q8b.val[0]), q4b.val[1], q8b.val[1]); + prod_2 = lm_ggml_vdotq_s32(lm_ggml_vdotq_s32(vdupq_n_s32(0), q4b.val[2], q8b.val[2]), q4b.val[3], q8b.val[3]); + + sumf += (float)x[ib+0].d * (float)y[ib+0].d * vaddvq_s32(prod_1) + (float)x[ib+1].d * (float)y[ib+1].d * vaddvq_s32(prod_2); + + } + + *s = sumf; + +#elif defined __AVX2__ + + const __m128i values128 = _mm_loadu_si128((const __m128i*)kvalues_iq4nl); + const __m128i m4b = _mm_set1_epi8(0x0f); + const __m256i mone = _mm256_set1_epi16(1); + + __m256 accum1 = _mm256_setzero_ps(); + __m256 accum2 = _mm256_setzero_ps(); + for (int ib = 0; ib < nb; ib += 2) { + const __m128i q4bits_1 = _mm_loadu_si128((const __m128i*)x[0].qs); + const __m128i q4bits_2 = _mm_loadu_si128((const __m128i*)x[1].qs); + const __m256i q8b_1 = _mm256_loadu_si256((const __m256i *)y[0].qs); + const __m256i q8b_2 = _mm256_loadu_si256((const __m256i *)y[1].qs); + const __m256i q4b_1 = _mm256_set_m128i(_mm_shuffle_epi8(values128, _mm_and_si128(_mm_srli_epi16(q4bits_1, 4), m4b)), + _mm_shuffle_epi8(values128, _mm_and_si128(q4bits_1, m4b))); + const __m256i q4b_2 = _mm256_set_m128i(_mm_shuffle_epi8(values128, _mm_and_si128(_mm_srli_epi16(q4bits_2, 4), m4b)), + _mm_shuffle_epi8(values128, _mm_and_si128(q4bits_2, m4b))); + const __m256i p16_1 = mul_add_epi8(q4b_1, q8b_1); + const __m256i p16_2 = mul_add_epi8(q4b_2, q8b_2); + const __m256i p_1 = _mm256_madd_epi16(p16_1, mone); + const __m256i p_2 = _mm256_madd_epi16(p16_2, mone); + accum1 = _mm256_fmadd_ps(_mm256_set1_ps(LM_GGML_FP16_TO_FP32(y[0].d)*LM_GGML_FP16_TO_FP32(x[0].d)), + _mm256_cvtepi32_ps(p_1), accum1); + accum2 = 
_mm256_fmadd_ps(_mm256_set1_ps(LM_GGML_FP16_TO_FP32(y[1].d)*LM_GGML_FP16_TO_FP32(x[1].d)), + _mm256_cvtepi32_ps(p_2), accum2); + + y += 2; + x += 2; + } + + *s = hsum_float_8(_mm256_add_ps(accum1, accum2)); + +#else + float sumf = 0; + for (int ib = 0; ib < nb; ++ib) { + const float d = LM_GGML_FP16_TO_FP32(y[ib].d)*LM_GGML_FP16_TO_FP32(x[ib].d); + int sumi1 = 0, sumi2 = 0; + for (int j = 0; j < QK4_NL/2; ++j) { + sumi1 += y[ib].qs[j+ 0] * kvalues_iq4nl[x[ib].qs[j] & 0xf]; + sumi2 += y[ib].qs[j+QK4_NL/2] * kvalues_iq4nl[x[ib].qs[j] >> 4]; + } + sumf += d * (sumi1 + sumi2); + } + *s = sumf; +#endif +} + +// ================================ IQ2 quantization ============================================= + +typedef struct { + uint64_t * grid; + int * map; + uint16_t * neighbours; +} iq2_entry_t; + +static iq2_entry_t iq2_data[3] = { + {NULL, NULL, NULL}, + {NULL, NULL, NULL}, + {NULL, NULL, NULL}, +}; + +static inline int iq2_data_index(enum lm_ggml_type type) { + LM_GGML_ASSERT(type == LM_GGML_TYPE_IQ2_XXS || type == LM_GGML_TYPE_IQ2_XS || type == LM_GGML_TYPE_IQ1_S); + return type == LM_GGML_TYPE_IQ2_XXS ? 0 : + type == LM_GGML_TYPE_IQ2_XS ? 1 : 2; +} + +static inline int iq2_grid_size(enum lm_ggml_type type) { + LM_GGML_ASSERT(type == LM_GGML_TYPE_IQ2_XXS || type == LM_GGML_TYPE_IQ2_XS || type == LM_GGML_TYPE_IQ1_S); + return type == LM_GGML_TYPE_IQ2_XXS ? 256 : + type == LM_GGML_TYPE_IQ2_XS ? 512 : 512; +} + +static int iq2_compare_func(const void * left, const void * right) { + const int * l = (const int *)left; + const int * r = (const int *)right; + return l[0] < r[0] ? -1 : l[0] > r[0] ? 1 : l[1] < r[1] ? -1 : l[1] > r[1] ? 1 : 0; +} + +void iq2xs_init_impl(enum lm_ggml_type type) { + const int gindex = iq2_data_index(type); + const int grid_size = iq2_grid_size(type); + if (iq2_data[gindex].grid) { + return; + } + static const uint16_t kgrid_2bit_256[256] = { + 0, 2, 5, 8, 10, 17, 20, 32, 34, 40, 42, 65, 68, 80, 88, 97, + 100, 128, 130, 138, 162, 257, 260, 272, 277, 320, 388, 408, 512, 514, 546, 642, + 1025, 1028, 1040, 1057, 1060, 1088, 1090, 1096, 1120, 1153, 1156, 1168, 1188, 1280, 1282, 1288, + 1312, 1350, 1385, 1408, 1425, 1545, 1552, 1600, 1668, 1700, 2048, 2053, 2056, 2068, 2088, 2113, + 2116, 2128, 2130, 2184, 2308, 2368, 2562, 2580, 4097, 4100, 4112, 4129, 4160, 4192, 4228, 4240, + 4245, 4352, 4360, 4384, 4432, 4442, 4480, 4644, 4677, 5120, 5128, 5152, 5157, 5193, 5248, 5400, + 5474, 5632, 5654, 6145, 6148, 6160, 6208, 6273, 6400, 6405, 6560, 6737, 8192, 8194, 8202, 8260, + 8289, 8320, 8322, 8489, 8520, 8704, 8706, 9217, 9220, 9232, 9280, 9302, 9472, 9537, 9572, 9872, + 10248, 10272, 10388, 10820, 16385, 16388, 16400, 16408, 16417, 16420, 16448, 16456, 16470, 16480, 16513, 16516, + 16528, 16640, 16672, 16737, 16768, 16773, 16897, 16912, 16968, 16982, 17000, 17408, 17416, 17440, 17536, 17561, + 17682, 17700, 17920, 18433, 18436, 18448, 18496, 18501, 18688, 18776, 18785, 18818, 19013, 19088, 20480, 20488, + 20497, 20505, 20512, 20608, 20616, 20740, 20802, 20900, 21137, 21648, 21650, 21770, 22017, 22100, 22528, 22545, + 22553, 22628, 22848, 23048, 24580, 24592, 24640, 24680, 24832, 24917, 25112, 25184, 25600, 25605, 25872, 25874, + 25988, 26690, 32768, 32770, 32778, 32833, 32898, 33028, 33048, 33088, 33297, 33793, 33796, 33808, 33813, 33856, + 33888, 34048, 34118, 34196, 34313, 34368, 34400, 34818, 35076, 35345, 36868, 36880, 36900, 36928, 37025, 37142, + 37248, 37445, 37888, 37922, 37956, 38225, 39041, 39200, 40962, 41040, 41093, 41225, 41472, 42008, 43088, 43268, + }; + static 
const uint16_t kgrid_2bit_512[512] = { + 0, 2, 5, 8, 10, 17, 20, 22, 25, 32, 34, 37, 40, 65, 68, 70, + 73, 80, 82, 85, 88, 97, 100, 128, 130, 133, 136, 145, 148, 153, 160, 257, + 260, 262, 265, 272, 274, 277, 280, 282, 289, 292, 320, 322, 325, 328, 337, 340, + 352, 360, 385, 388, 400, 512, 514, 517, 520, 529, 532, 544, 577, 580, 592, 597, + 640, 650, 1025, 1028, 1030, 1033, 1040, 1042, 1045, 1048, 1057, 1060, 1088, 1090, 1093, 1096, + 1105, 1108, 1110, 1120, 1153, 1156, 1168, 1280, 1282, 1285, 1288, 1297, 1300, 1312, 1345, 1348, + 1360, 1377, 1408, 1537, 1540, 1552, 1574, 1600, 1602, 1668, 2048, 2050, 2053, 2056, 2058, 2065, + 2068, 2080, 2085, 2113, 2116, 2128, 2136, 2176, 2208, 2218, 2305, 2308, 2320, 2368, 2433, 2441, + 2560, 2592, 2600, 2710, 2720, 4097, 4100, 4102, 4105, 4112, 4114, 4117, 4120, 4129, 4132, 4160, + 4162, 4165, 4168, 4177, 4180, 4192, 4202, 4225, 4228, 4240, 4352, 4354, 4357, 4360, 4369, 4372, + 4384, 4417, 4420, 4432, 4480, 4500, 4502, 4609, 4612, 4614, 4624, 4672, 4704, 5120, 5122, 5125, + 5128, 5137, 5140, 5152, 5185, 5188, 5193, 5200, 5220, 5248, 5377, 5380, 5392, 5440, 5632, 5652, + 5705, 6145, 6148, 6160, 6162, 6208, 6228, 6278, 6400, 6405, 6502, 6737, 6825, 8192, 8194, 8197, + 8200, 8202, 8209, 8212, 8224, 8257, 8260, 8272, 8320, 8352, 8449, 8452, 8464, 8512, 8520, 8549, + 8704, 8738, 8832, 8872, 9217, 9220, 9232, 9257, 9280, 9472, 9537, 9554, 9625, 9729, 9754, 9894, + 10240, 10248, 10250, 10272, 10325, 10376, 10402, 10600, 10640, 10760, 10784, 10882, 10888, 10890, 16385, 16388, 16390, 16393, 16400, 16402, 16405, 16408, 16417, 16420, 16448, 16450, 16453, 16456, 16458, 16465, 16468, 16480, 16485, 16513, 16516, 16528, 16640, 16642, 16645, 16648, 16657, 16660, 16672, 16705, 16708, 16720, 16768, 16773, 16802, 16897, 16900, 16912, 16914, 16937, 16960, 17408, 17410, 17413, 17416, 17425, 17428, 17433, 17440, 17473, @@ -8632,9 +9657,45 @@ void iq2xs_init_impl(int grid_size) { 40962, 40968, 40970, 40992, 41002, 41120, 41297, 41305, 41382, 41472, 41474, 41480, 41514, 41600, 41632, 42048, 42133, 42597, 42648, 43018, 43040, 43042, 43048, 43168, 43176, 43268, 43396, 43398, 43560, 43562, 43665, 43690, }; + static const uint16_t kgrid_1bit_512[512] = { + 10, 33, 41, 85, 132, 134, 160, 162, 277, 337, 340, 345, 357, 405, 516, 545, + 553, 598, 641, 650, 681, 1042, 1044, 1097, 1169, 1176, 1320, 1345, 1365, 1378, 1434, 1444, + 1545, 1617, 1642, 1685, 2053, 2080, 2089, 2133, 2176, 2182, 2208, 2214, 2306, 2384, 2393, 2440, + 2453, 2581, 2664, 2690, 2721, 4117, 4161, 4182, 4184, 4261, 4357, 4369, 4372, 4377, 4390, 4422, + 4432, 4437, 4449, 4457, 4485, 4497, 4505, 4629, 4677, 4696, 4774, 5205, 5217, 5225, 5386, 5397, + 5409, 5445, 5457, 5460, 5461, 5462, 5465, 5472, 5477, 5525, 5545, 5650, 5668, 5717, 5729, 5769, + 5777, 6212, 6234, 6244, 6293, 6424, 6482, 6485, 6502, 6505, 6529, 6538, 6565, 6656, 6682, 6788, + 6806, 6820, 8218, 8224, 8226, 8232, 8277, 8326, 8354, 8469, 8521, 8530, 8549, 8596, 8737, 8794, + 9221, 9253, 9348, 9369, 9380, 9474, 9557, 9633, 9732, 9753, 9793, 9830, 9862, 9880, 10240, 10272, + 10282, 10321, 10406, 10517, 10530, 10566, 10585, 10645, 10896, 16466, 16468, 16473, 16485, 16646, 16660, 16665, + 16725, 16793, 16806, 16914, 16969, 16977, 16996, 17028, 17057, 17408, 17416, 17434, 17493, 17512, 17578, 17685, + 17696, 17733, 17745, 17748, 17749, 17750, 17753, 17765, 17794, 17813, 17946, 17984, 18005, 18072, 18453, 18529, + 18569, 18722, 18756, 18762, 18773, 18794, 18833, 18853, 18945, 19026, 19033, 19077, 20489, 20497, 20500, 20517, + 20565, 20586, 20610, 20633, 20757, 
20769, 20776, 20805, 20817, 20820, 20821, 20822, 20825, 20837, 20864, 20872, + 20885, 20896, 21002, 21029, 21077, 21146, 21510, 21525, 21573, 21585, 21588, 21589, 21590, 21593, 21605, 21653, + 21665, 21765, 21777, 21780, 21781, 21782, 21785, 21797, 21825, 21828, 21829, 21830, 21833, 21840, 21841, 21842, + 21844, 21846, 21848, 21849, 21850, 21857, 21860, 21861, 21862, 21865, 21893, 21905, 21908, 21909, 21910, 21913, + 21925, 22024, 22037, 22085, 22097, 22100, 22101, 22102, 22105, 22117, 22165, 22545, 22566, 22568, 22594, 22608, + 22613, 22676, 22697, 22793, 22805, 22853, 22865, 22868, 22869, 22870, 22873, 22885, 22933, 22946, 23046, 23072, + 23125, 23209, 24597, 24640, 24665, 24673, 24725, 24833, 24840, 24869, 24917, 24934, 24965, 25001, 25108, 25110, + 25152, 25184, 25192, 25234, 25616, 25618, 25625, 25685, 25704, 25738, 25744, 25770, 25877, 25897, 25925, 25937, + 25940, 25941, 25942, 25945, 25957, 25986, 26005, 26186, 26197, 26276, 26632, 26634, 26725, 26757, 26770, 26885, + 26965, 26976, 26986, 27032, 27153, 27174, 27200, 27208, 27240, 27269, 27282, 27290, 32778, 32800, 32802, 32808, + 32810, 32853, 32904, 32922, 32930, 32932, 33105, 33110, 33112, 33125, 33157, 33280, 33288, 33301, 33312, 33320, + 33424, 33797, 33829, 33858, 34068, 34133, 34146, 34176, 34217, 34306, 34342, 34441, 34454, 34468, 34832, 34918, + 34965, 34984, 35094, 35137, 35161, 35208, 35232, 35332, 35338, 35368, 35429, 36932, 36934, 36953, 37009, 37125, + 37136, 37138, 37145, 37157, 37205, 37220, 37258, 37290, 37444, 37446, 37465, 37478, 37525, 37905, 37968, 37973, + 38040, 38054, 38145, 38154, 38165, 38180, 38186, 38213, 38225, 38228, 38229, 38230, 38233, 38245, 38293, 38485, + 38504, 38530, 38938, 38985, 38993, 39012, 39040, 39173, 39192, 39253, 39265, 39301, 39316, 39322, 39442, 39497, + 39504, 39590, 40970, 40984, 40992, 41002, 41045, 41120, 41128, 41237, 41289, 41297, 41317, 41364, 41366, 41514, + 41557, 41633, 41989, 42021, 42056, 42068, 42074, 42113, 42242, 42265, 42274, 42325, 42340, 42402, 42501, 42512, + 42533, 42624, 42632, 42666, 43040, 43093, 43106, 43168, 43176, 43264, 43286, 43345, 43429, 43590, 43618, 43680, + }; + const int kmap_size = 43692; - const int nwant = 2; - const uint16_t * kgrid = grid_size == 256 ? kgrid_256 : kgrid_512; + const int nwant = type == LM_GGML_TYPE_IQ1_S ? 3 : 2; + const uint16_t * kgrid = type == LM_GGML_TYPE_IQ2_XXS ? kgrid_2bit_256 : + type == LM_GGML_TYPE_IQ2_XS ? 
kgrid_2bit_512 : kgrid_1bit_512; uint64_t * kgrid_q2xs; int * kmap_q2xs; uint16_t * kneighbors_q2xs; @@ -8730,9 +9791,9 @@ void iq2xs_init_impl(int grid_size) { free(dist2); } -void iq2xs_free_impl(int grid_size) { - LM_GGML_ASSERT(grid_size == 256 || grid_size == 512 || grid_size == 1024); - const int gindex = iq2_data_index(grid_size); +void iq2xs_free_impl(enum lm_ggml_type type) { + LM_GGML_ASSERT(type == LM_GGML_TYPE_IQ2_XXS || type == LM_GGML_TYPE_IQ2_XS || type == LM_GGML_TYPE_IQ1_S); + const int gindex = iq2_data_index(type); if (iq2_data[gindex].grid) { free(iq2_data[gindex].grid); iq2_data[gindex].grid = NULL; free(iq2_data[gindex].map); iq2_data[gindex].map = NULL; @@ -8766,7 +9827,7 @@ static int iq2_find_best_neighbour(const uint16_t * restrict neighbours, const u static void quantize_row_iq2_xxs_impl(const float * restrict x, void * restrict vy, int n, const float * restrict quant_weights) { - const int gindex = iq2_data_index(256); + const int gindex = iq2_data_index(LM_GGML_TYPE_IQ2_XXS); const uint64_t * kgrid_q2xs = iq2_data[gindex].grid; const int * kmap_q2xs = iq2_data[gindex].map; @@ -8790,8 +9851,6 @@ static void quantize_row_iq2_xxs_impl(const float * restrict x, void * restrict int8_t L[32]; int8_t Laux[32]; float waux[32]; - bool is_on_grid[4]; - bool is_on_grid_aux[4]; uint8_t block_signs[4]; uint32_t q2[2*(QK_K/32)]; @@ -8841,10 +9900,11 @@ static void quantize_row_iq2_xxs_impl(const float * restrict x, void * restrict memset(L, 0, 32); continue; } + float scale = make_qp_quants(32, kMaxQ+1, xval, (uint8_t*)L, weight); + float eff_max = scale*kMaxQ; float best = 0; - float scale = max/(2*kMaxQ-1); - for (int is = -9; is <= 9; ++is) { - float id = (2*kMaxQ-1+is*0.1f)/max; + for (int is = -6; is <= 6; ++is) { + float id = (2*kMaxQ-1+is*0.1f)/eff_max; float this_scale = 1/id; for (int k = 0; k < 4; ++k) { for (int i = 0; i < 8; ++i) { @@ -8854,9 +9914,7 @@ static void quantize_row_iq2_xxs_impl(const float * restrict x, void * restrict uint16_t u = 0; for (int i = 0; i < 8; ++i) u |= (Laux[8*k+i] << 2*i); int grid_index = kmap_q2xs[u]; - is_on_grid_aux[k] = true; if (grid_index < 0) { - is_on_grid_aux[k] = false; const uint16_t * neighbours = kneighbors_q2xs - kmap_q2xs[u] - 1; grid_index = iq2_find_best_neighbour(neighbours, kgrid_q2xs, xval + 8*k, waux + 8*k, this_scale, Laux + 8*k); } @@ -8870,16 +9928,12 @@ static void quantize_row_iq2_xxs_impl(const float * restrict x, void * restrict } if (sumq2 > 0 && sumqx*sumqx > best*sumq2) { scale = sumqx/sumq2; best = scale*sumqx; - for (int i = 0; i < 32; ++i) L[i] = Laux[i]; - for (int k = 0; k < 4; ++k) is_on_grid[k] = is_on_grid_aux[k]; + memcpy(L, Laux, 32); } } - int n_not_ongrid = 0; - for (int k = 0; k < 4; ++k) if (!is_on_grid[k]) ++n_not_ongrid; - if (n_not_ongrid > 0 && scale > 0) { + if (scale > 0) { float id = 1/scale; for (int k = 0; k < 4; ++k) { - if (is_on_grid[k]) continue; uint16_t u = 0; for (int i = 0; i < 8; ++i) { int l = nearest_int(0.5f*(id*xval[8*k+i]-1)); @@ -8935,49 +9989,10 @@ static void quantize_row_iq2_xxs_impl(const float * restrict x, void * restrict float d = max_scale/31; y[ibl].d = LM_GGML_FP32_TO_FP16(d); float id = 1/d; - float sumqx = 0, sumq2 = 0; for (int ib = 0; ib < QK_K/32; ++ib) { int l = nearest_int(0.5f*(id*scales[ib]-1)); l = MAX(0, MIN(15, l)); q2[2*ib+1] |= ((uint32_t)l << 28); - const float * xb = xbl + 32*ib; - const float * qw = quant_weights + QK_K*ibl + 32*ib; - for (int i = 0; i < 32; ++i) weight[i] = qw[i] * sqrtf(sigma2 + xb[i]*xb[i]); - const uint8_t * aux8 = 
(const uint8_t *)(q2 + 2*ib); - const float db = d * (1 + 2*l); - uint32_t u = 0; - for (int k = 0; k < 4; ++k) { - const int8_t * signs = keven_signs_q2xs + 8*((q2[2*ib+1] >> 7*k) & 127); - const float * xk = xb + 8*k; - const float * wk = weight + 8*k; - const uint8_t * grid = (const uint8_t *)(kgrid_q2xs + aux8[k]); - float best_mse = 0; int best_index = aux8[k]; - for (int j = 0; j < 8; ++j) { - float diff = db * grid[j] * signs[j] - xk[j]; - best_mse += wk[j] * diff * diff; - } - for (int idx = 0; idx < 256; ++idx) { - grid = (const uint8_t *)(kgrid_q2xs + idx); - float mse = 0; - for (int j = 0; j < 8; ++j) { - float diff = db * grid[j] * signs[j] - xk[j]; - mse += wk[j] * diff * diff; - } - if (mse < best_mse) { - best_mse = mse; best_index = idx; - } - } - u |= (best_index << 8*k); - grid = (const uint8_t *)(kgrid_q2xs + best_index); - //grid = (const uint8_t *)(kgrid_q2xs + aux8[k]); - for (int j = 0; j < 8; ++j) { - float q = db * grid[j] * signs[j]; - sumqx += wk[j] * q * xk[j]; - sumq2 += wk[j] * q * q; - } - } - q2[2*ib] = u; - if (sumq2 > 0) y[ibl].d = LM_GGML_FP32_TO_FP16(d*sumqx/sumq2); } memcpy(y[ibl].qs, q2, QK_K/4); } @@ -8985,7 +10000,7 @@ static void quantize_row_iq2_xxs_impl(const float * restrict x, void * restrict static void quantize_row_iq2_xs_impl(const float * restrict x, void * restrict vy, int n, const float * restrict quant_weights) { - const int gindex = iq2_data_index(512); + const int gindex = iq2_data_index(LM_GGML_TYPE_IQ2_XS); const uint64_t * kgrid_q2xs = iq2_data[gindex].grid; const int * kmap_q2xs = iq2_data[gindex].map; @@ -9189,3 +10204,760 @@ size_t quantize_iq2_xs(const float * src, void * dst, int nrow, int n_per_row, i return nrow * nblock * sizeof(block_iq2_xs); } +// +// ============================================= 3-bit using D4 lattice +// + +typedef struct { + uint32_t * grid; + int * map; + uint16_t * neighbours; +} iq3_entry_t; + +static iq3_entry_t iq3_data[1] = { + {NULL, NULL, NULL}, +}; + +static inline int iq3_data_index(int grid_size) { + (void)grid_size; + LM_GGML_ASSERT(grid_size == 256); + return 0; +} + +static int iq3_compare_func(const void * left, const void * right) { + const int * l = (const int *)left; + const int * r = (const int *)right; + return l[0] < r[0] ? -1 : l[0] > r[0] ? 1 : l[1] < r[1] ? -1 : l[1] > r[1] ? 
1 : 0; +} + +void iq3xs_init_impl(int grid_size) { + const int gindex = iq3_data_index(grid_size); + if (iq3_data[gindex].grid) { + return; + } + static const uint16_t kgrid_256[256] = { + 0, 2, 4, 9, 11, 15, 16, 18, 25, 34, 59, 61, 65, 67, 72, 74, + 81, 85, 88, 90, 97, 108, 120, 128, 130, 132, 137, 144, 146, 153, 155, 159, + 169, 175, 189, 193, 199, 200, 202, 213, 248, 267, 287, 292, 303, 315, 317, 321, + 327, 346, 362, 413, 436, 456, 460, 462, 483, 497, 513, 515, 520, 522, 529, 531, + 536, 538, 540, 551, 552, 576, 578, 585, 592, 594, 641, 643, 648, 650, 657, 664, + 698, 704, 706, 720, 729, 742, 758, 769, 773, 808, 848, 852, 870, 889, 901, 978, + 992, 1024, 1026, 1033, 1035, 1040, 1042, 1046, 1049, 1058, 1089, 1091, 1093, 1096, 1098, 1105, + 1112, 1139, 1143, 1144, 1152, 1154, 1161, 1167, 1168, 1170, 1183, 1184, 1197, 1217, 1224, 1228, + 1272, 1276, 1309, 1323, 1347, 1367, 1377, 1404, 1473, 1475, 1486, 1509, 1537, 1544, 1546, 1553, + 1555, 1576, 1589, 1594, 1600, 1602, 1616, 1625, 1636, 1638, 1665, 1667, 1672, 1685, 1706, 1722, + 1737, 1755, 1816, 1831, 1850, 1856, 1862, 1874, 1901, 1932, 1950, 1971, 2011, 2032, 2052, 2063, + 2077, 2079, 2091, 2095, 2172, 2192, 2207, 2208, 2224, 2230, 2247, 2277, 2308, 2345, 2356, 2389, + 2403, 2424, 2501, 2504, 2506, 2520, 2570, 2593, 2616, 2624, 2630, 2646, 2669, 2700, 2714, 2746, + 2754, 2795, 2824, 2835, 2839, 2874, 2882, 2905, 2984, 3028, 3042, 3092, 3108, 3110, 3124, 3153, + 3185, 3215, 3252, 3288, 3294, 3364, 3397, 3434, 3483, 3523, 3537, 3587, 3589, 3591, 3592, 3610, + 3626, 3670, 3680, 3722, 3749, 3754, 3776, 3789, 3803, 3824, 3857, 3873, 3904, 3906, 3924, 3992, + }; + const int kmap_size = 4096; + const int nwant = 2; + const uint16_t * kgrid = kgrid_256; + uint32_t * kgrid_q3xs; + int * kmap_q3xs; + uint16_t * kneighbors_q3xs; + + printf("================================================================= %s(grid_size = %d)\n", __func__, grid_size); + uint32_t * the_grid = (uint32_t *)malloc(grid_size*sizeof(uint32_t)); + for (int k = 0; k < grid_size; ++k) { + int8_t * pos = (int8_t *)(the_grid + k); + for (int i = 0; i < 4; ++i) { + int l = (kgrid[k] >> 3*i) & 0x7; + pos[i] = 2*l + 1; + } + } + kgrid_q3xs = the_grid; + iq3_data[gindex].grid = the_grid; + kmap_q3xs = (int *)malloc(kmap_size*sizeof(int)); + iq3_data[gindex].map = kmap_q3xs; + for (int i = 0; i < kmap_size; ++i) kmap_q3xs[i] = -1; + uint32_t aux32; + uint8_t * aux8 = (uint8_t *)&aux32; + for (int i = 0; i < grid_size; ++i) { + aux32 = kgrid_q3xs[i]; + uint16_t index = 0; + for (int k=0; k<4; ++k) { + uint16_t q = (aux8[k] - 1)/2; + index |= (q << 3*k); + } + kmap_q3xs[index] = i; + } + int8_t pos[4]; + int * dist2 = (int *)malloc(2*grid_size*sizeof(int)); + int num_neighbors = 0, num_not_in_map = 0; + for (int i = 0; i < kmap_size; ++i) { + if (kmap_q3xs[i] >= 0) continue; + ++num_not_in_map; + for (int k = 0; k < 4; ++k) { + int l = (i >> 3*k) & 0x7; + pos[k] = 2*l + 1; + } + for (int j = 0; j < grid_size; ++j) { + const int8_t * pg = (const int8_t *)(kgrid_q3xs + j); + int d2 = 0; + for (int k = 0; k < 4; ++k) d2 += (pg[k] - pos[k])*(pg[k] - pos[k]); + dist2[2*j+0] = d2; + dist2[2*j+1] = j; + } + qsort(dist2, grid_size, 2*sizeof(int), iq3_compare_func); + int n = 0; int d2 = dist2[0]; + int nhave = 1; + for (int j = 0; j < grid_size; ++j) { + if (dist2[2*j] > d2) { + if (nhave == nwant) break; + d2 = dist2[2*j]; + ++nhave; + } + ++n; + } + num_neighbors += n; + } + printf("%s: %d neighbours in total\n", __func__, num_neighbors); + kneighbors_q3xs = (uint16_t 
*)malloc((num_neighbors + num_not_in_map)*sizeof(uint16_t)); + iq3_data[gindex].neighbours = kneighbors_q3xs; + int counter = 0; + for (int i = 0; i < kmap_size; ++i) { + if (kmap_q3xs[i] >= 0) continue; + for (int k = 0; k < 4; ++k) { + int l = (i >> 3*k) & 0x7; + pos[k] = 2*l + 1; + } + for (int j = 0; j < grid_size; ++j) { + const int8_t * pg = (const int8_t *)(kgrid_q3xs + j); + int d2 = 0; + for (int k = 0; k < 4; ++k) d2 += (pg[k] - pos[k])*(pg[k] - pos[k]); + dist2[2*j+0] = d2; + dist2[2*j+1] = j; + } + qsort(dist2, grid_size, 2*sizeof(int), iq3_compare_func); + kmap_q3xs[i] = -(counter + 1); + int d2 = dist2[0]; + uint16_t * start = &kneighbors_q3xs[counter++]; + int n = 0, nhave = 1; + for (int j = 0; j < grid_size; ++j) { + if (dist2[2*j] > d2) { + if (nhave == nwant) break; + d2 = dist2[2*j]; + ++nhave; + } + kneighbors_q3xs[counter++] = dist2[2*j+1]; + ++n; + } + *start = n; + } + free(dist2); +} + +void iq3xs_free_impl(int grid_size) { + LM_GGML_ASSERT(grid_size == 256); + const int gindex = iq3_data_index(grid_size); + if (iq3_data[gindex].grid) { + free(iq3_data[gindex].grid); iq3_data[gindex].grid = NULL; + free(iq3_data[gindex].map); iq3_data[gindex].map = NULL; + free(iq3_data[gindex].neighbours); iq3_data[gindex].neighbours = NULL; + } +} + +static int iq3_find_best_neighbour(const uint16_t * restrict neighbours, const uint32_t * restrict grid, + const float * restrict xval, const float * restrict weight, float scale, int8_t * restrict L) { + int num_neighbors = neighbours[0]; + LM_GGML_ASSERT(num_neighbors > 0); + float best_d2 = FLT_MAX; + int grid_index = -1; + for (int j = 1; j <= num_neighbors; ++j) { + const int8_t * pg = (const int8_t *)(grid + neighbours[j]); + float d2 = 0; + for (int i = 0; i < 4; ++i) { + float q = pg[i]; + float diff = scale*q - xval[i]; + d2 += weight[i]*diff*diff; + } + if (d2 < best_d2) { + best_d2 = d2; grid_index = neighbours[j]; + } + } + LM_GGML_ASSERT(grid_index >= 0); + const int8_t * pg = (const int8_t *)(grid + grid_index); + for (int i = 0; i < 4; ++i) L[i] = (pg[i] - 1)/2; + return grid_index; +} + +static void quantize_row_iq3_xxs_impl(const float * restrict x, void * restrict vy, int n, const float * restrict quant_weights) { + + const int gindex = iq3_data_index(256); + + const uint32_t * kgrid_q3xs = iq3_data[gindex].grid; + const int * kmap_q3xs = iq3_data[gindex].map; + const uint16_t * kneighbors_q3xs = iq3_data[gindex].neighbours; + + //LM_GGML_ASSERT(quant_weights && "missing quantization weights"); + LM_GGML_ASSERT(kgrid_q3xs && "forgot to call lm_ggml_quantize_init()?"); + LM_GGML_ASSERT(kmap_q3xs && "forgot to call lm_ggml_quantize_init()?"); + LM_GGML_ASSERT(kneighbors_q3xs && "forgot to call lm_ggml_quantize_init()?"); + LM_GGML_ASSERT(n%QK_K == 0); + + const int kMaxQ = 8; + + const int nbl = n/256; + + block_iq3_xxs * y = vy; + + float scales[QK_K/32]; + float weight[32]; + float xval[32]; + int8_t L[32]; + int8_t Laux[32]; + float waux[32]; + bool is_on_grid[8]; + bool is_on_grid_aux[8]; + uint8_t block_signs[8]; + uint8_t q3[3*(QK_K/8)]; + uint32_t * scales_and_signs = (uint32_t *)(q3 + QK_K/4); + + for (int ibl = 0; ibl < nbl; ++ibl) { + + y[ibl].d = LM_GGML_FP32_TO_FP16(0.f); + memset(q3, 0, 3*QK_K/8); + + float max_scale = 0; + + const float * xbl = x + QK_K*ibl; + float sumx2 = 0; + for (int i = 0; i < QK_K; ++i) sumx2 += xbl[i]*xbl[i]; + float sigma2 = sumx2/QK_K; + + for (int ib = 0; ib < QK_K/32; ++ib) { + const float * xb = xbl + 32*ib; + if (quant_weights) { + const float * qw = quant_weights + QK_K*ibl 
+ 32*ib; + for (int i = 0; i < 32; ++i) weight[i] = qw[i] * sqrtf(sigma2 + xb[i]*xb[i]); + } else { + for (int i = 0; i < 32; ++i) weight[i] = xb[i]*xb[i]; + } + for (int i = 0; i < 32; ++i) waux[i] = sqrtf(weight[i]); + for (int k = 0; k < 4; ++k) { + int nflip = 0; + uint8_t s = 0; + for (int i = 0; i < 8; ++i) { + if (xb[8*k + i] >= 0) xval[8*k + i] = xb[8*k + i]; + else { + xval[8*k + i] = -xb[8*k + i]; ++nflip; s |= (1 << i); + } + } + if (nflip%2) { + int imin = 0; float min = weight[8*k+imin]*xb[8*k+imin]*xb[8*k+imin]; + for (int i = 1; i < 8; ++i) { + float ax = weight[8*k+i]*xb[8*k+i]*xb[8*k+i]; + if (ax < min) { + min = ax; imin = i; + } + } + xval[8*k+imin] = -xval[8*k+imin]; + s ^= (1 << imin); + } + block_signs[k] = s & 127; + } + float max = xval[0]; + for (int i = 1; i < 32; ++i) max = MAX(max, xval[i]); + if (!max) { + scales[ib] = 0; + memset(L, 0, 32); + continue; + } + float best = 0; + float scale = max/(2*kMaxQ-1); + for (int is = -15; is <= 15; ++is) { + float id = (2*kMaxQ-1+is*0.2f)/max; + float this_scale = 1/id; + for (int k = 0; k < 8; ++k) { + for (int i = 0; i < 4; ++i) { + int l = nearest_int(0.5f*(id*xval[4*k+i]-1)); + Laux[4*k+i] = MAX(0, MIN(kMaxQ-1, l)); + } + uint16_t u = 0; + for (int i = 0; i < 4; ++i) u |= (Laux[4*k+i] << 3*i); + int grid_index = kmap_q3xs[u]; + is_on_grid_aux[k] = true; + if (grid_index < 0) { + is_on_grid_aux[k] = false; + const uint16_t * neighbours = kneighbors_q3xs - kmap_q3xs[u] - 1; + grid_index = iq3_find_best_neighbour(neighbours, kgrid_q3xs, xval + 4*k, waux + 4*k, this_scale, Laux + 4*k); + } + } + float sumqx = 0, sumq2 = 0; + for (int i = 0; i < 32; ++i) { + float w = weight[i]; + float q = 2*Laux[i] + 1; + sumqx += w*xval[i]*q; + sumq2 += w*q*q; + } + if (sumq2 > 0 && sumqx*sumqx > best*sumq2) { + scale = sumqx/sumq2; best = scale*sumqx; + for (int i = 0; i < 32; ++i) L[i] = Laux[i]; + for (int k = 0; k < 8; ++k) is_on_grid[k] = is_on_grid_aux[k]; + } + } + int n_not_ongrid = 0; + for (int k = 0; k < 8; ++k) if (!is_on_grid[k]) ++n_not_ongrid; + if (n_not_ongrid > 0 && scale > 0) { + float id = 1/scale; + for (int k = 0; k < 8; ++k) { + if (is_on_grid[k]) continue; + uint16_t u = 0; + for (int i = 0; i < 4; ++i) { + int l = nearest_int(0.5f*(id*xval[4*k+i]-1)); + l = MAX(0, MIN(kMaxQ-1, l)); + u |= (l << 3*i); + } + int grid_index = kmap_q3xs[u]; + if (grid_index < 0) { + const uint16_t * neighbours = kneighbors_q3xs - kmap_q3xs[u] - 1; + grid_index = iq3_find_best_neighbour(neighbours, kgrid_q3xs, xval + 4*k, waux + 4*k, scale, L + 4*k); + } + const int8_t * pg = (const int8_t *)(kgrid_q3xs + grid_index); + for (int i = 0; i < 4; ++i) L[4*k+i] = (pg[i] - 1)/2; + } + float sumqx = 0, sumq2 = 0; + for (int i = 0; i < 32; ++i) { + float w = weight[i]; + float q = 2*L[i] + 1; + sumqx += w*xval[i]*q; + sumq2 += w*q*q; + } + if (sumq2 > 0) scale = sumqx/sumq2; + } + if (scale < 0) { + // This should never happen, but just in case, flip scale so that it is positive (we use uint's to encode the scale) + // and correspondingly flip quant signs. 
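+            // (Negating the scale and complementing the 7 stored sign bits flips the sign of every
+            //  quant in the group, so the reconstructed values scale*q are unchanged; only the
+            //  encoding differs.)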
+ scale = -scale; + for (int k = 0; k < 4; ++k) block_signs[k] = (~block_signs[k]) & 127; + } + for (int k = 0; k < 8; ++k) { + uint16_t u = 0; + for (int i = 0; i < 4; ++i) u |= (L[4*k+i] << 3*i); + int grid_index = kmap_q3xs[u]; + if (grid_index < 0) { + printf("Oops: found point %u not on grid:", u); + for (int i = 0; i < 4; ++i) printf(" %d", L[4*k+i]); + printf("\n"); + LM_GGML_ASSERT(false); + } + q3[8*ib+k] = grid_index; + } + scales_and_signs[ib] = block_signs[0] | (block_signs[1] << 7) | (block_signs[2] << 14) | (block_signs[3] << 21); + LM_GGML_ASSERT(scale >= 0); + scales[ib] = scale; + max_scale = MAX(max_scale, scale); + } + + if (!max_scale) { + memset(y[ibl].qs, 0, 3*QK_K/8); + continue; + } + + float d = max_scale/31; + y[ibl].d = LM_GGML_FP32_TO_FP16(d); + float id = 1/d; + float sumqx = 0, sumq2 = 0; + for (int ib = 0; ib < QK_K/32; ++ib) { + int l = nearest_int(0.5f*(id*scales[ib]-1)); + l = MAX(0, MIN(15, l)); + scales_and_signs[ib] |= ((uint32_t)l << 28); + if (false) { + const float * xb = xbl + 32*ib; + if (quant_weights) { + const float * qw = quant_weights + QK_K*ibl + 32*ib; + for (int i = 0; i < 32; ++i) weight[i] = qw[i] * sqrtf(sigma2 + xb[i]*xb[i]); + } else { + for (int i = 0; i < 32; ++i) weight[i] = xb[i]*xb[i]; + } + const float db = 0.25f * d * (1 + 2*l); + for (int k = 0; k < 8; ++k) { + const int8_t * signs = keven_signs_q2xs + 8*((scales_and_signs[ib] >> 7*(k/2)) & 127) + 4*(k%2); + const float * xk = xb + 4*k; + const float * wk = weight + 4*k; + //const uint8_t * grid = (const uint8_t *)(kgrid_q3xs + q3[8*ib+k]); + const uint8_t * grid = (const uint8_t *)(iq3xxs_grid + q3[8*ib+k]); + float best_mse = 0; int best_index = q3[8*ib+k]; + for (int j = 0; j < 4; ++j) { + float diff = db * grid[j] * signs[j] - xk[j]; + best_mse += wk[j] * diff * diff; + } + for (int idx = 0; idx < 256; ++idx) { + //grid = (const uint8_t *)(kgrid_q3xs + idx); + grid = (const uint8_t *)(iq3xxs_grid + idx); + float mse = 0; + for (int j = 0; j < 4; ++j) { + float diff = db * grid[j] * signs[j] - xk[j]; + mse += wk[j] * diff * diff; + } + if (mse < best_mse) { + best_mse = mse; best_index = idx; + } + } + q3[8*ib+k] = best_index; + //grid = (const uint8_t *)(kgrid_q3xs + best_index); + grid = (const uint8_t *)(iq3xxs_grid + best_index); + for (int j = 0; j < 4; ++j) { + float q = db * grid[j] * signs[j]; + sumqx += wk[j] * q * xk[j]; + sumq2 += wk[j] * q * q; + } + } + if (sumq2 > 0) y[ibl].d = LM_GGML_FP32_TO_FP16(d*sumqx/sumq2); + } + } + memcpy(y[ibl].qs, q3, 3*QK_K/8); + } +} + +size_t quantize_iq3_xxs(const float * src, void * dst, int nrow, int n_per_row, int64_t * hist, const float * quant_weights) { + (void)hist; + LM_GGML_ASSERT(n_per_row%QK_K == 0); + int nblock = n_per_row/QK_K; + char * qrow = (char *)dst; + for (int row = 0; row < nrow; ++row) { + quantize_row_iq3_xxs_impl(src, qrow, n_per_row, quant_weights); + src += n_per_row; + qrow += nblock*sizeof(block_iq3_xxs); + } + return nrow * nblock * sizeof(block_iq3_xxs); +} + +void quantize_row_iq3_xxs(const float * restrict x, void * restrict vy, int k) { + assert(k % QK_K == 0); + block_iq3_xxs * restrict y = vy; + quantize_row_iq3_xxs_reference(x, y, k); +} + +void quantize_row_iq3_xxs_reference(const float * restrict x, block_iq3_xxs * restrict y, int k) { + assert(k % QK_K == 0); + quantize_row_iq3_xxs_impl(x, y, k, NULL); +} + +// =================================== 1.5 bpw =================================================== + +static int iq1_find_best_neighbour(const uint16_t * restrict neighbours, const 
uint64_t * restrict grid, + const float * restrict xval, const float * restrict weight, float * scale, int8_t * restrict L, int ngrid) { + int num_neighbors = neighbours[0]; + LM_GGML_ASSERT(num_neighbors > 0); + float best_score = 0; + int grid_index = -1; + for (int j = 1; j <= num_neighbors; ++j) { + const int8_t * pg = (const int8_t *)(grid + neighbours[j]); + float sumqx = 0, sumq2 = 0; + for (int i = 0; i < 8; ++i) { + float q = (pg[i] - 3)/2; + float w = weight[i]; + sumqx += w*q*xval[i]; + sumq2 += w*q*q; + } + if (sumqx > 0 && sumq2 > 0 && sumqx*sumqx > best_score*sumq2) { + *scale = sumqx/sumq2; best_score = *scale * sumqx; + grid_index = neighbours[j]; + } + } + if (grid_index < 0) { + for (int i = 0; i < ngrid; ++i) { + const int8_t * grid_i = (const int8_t *)(grid + i); + float sumqx = 0, sumq2 = 0; + for (int j = 0; j < 8; ++j) { + float w = weight[j]; + float q = (grid_i[j] - 3)/2; + sumqx += w*q*xval[j]; + sumq2 += w*q*q; + } + if (sumqx > 0 && sumq2 > 0 && sumqx*sumqx > best_score*sumq2) { + *scale = sumqx/sumq2; best_score = *scale*sumqx; + grid_index = i; + } + } + } + if (grid_index < 0) { + printf("Oops, did not find grid point\n"); + printf("Have %d neighbours\n", num_neighbors); + for (int j = 1; j <= num_neighbors; ++j) { + const int8_t * pg = (const int8_t *)(grid + neighbours[j]); + float sumqx = 0, sumq2 = 0; + for (int i = 0; i < 8; ++i) { + float q = (pg[i] - 3)/2; + float w = weight[i]; + sumqx += w*q*xval[i]; + sumq2 += w*q*q; + } + printf(" neighbour %d: sumqx = %g sumq2 = %g\n", j, (double)sumqx, (double)sumq2); + } + } + LM_GGML_ASSERT(grid_index >= 0); + //!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!! + *scale *= 1.05f; // This is a fudge factor. Don't ask me why it improves the result. + //!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!! + const int8_t * pg = (const int8_t *)(grid + grid_index); + for (int i = 0; i < 8; ++i) L[i] = (pg[i] - 1)/2; + return grid_index; +} + +static int iq1_sort_helper(const void * left, const void * right) { + const float * l = left; + const float * r = right; + return *l < *r ? -1 : *l > *r ? 
1 : 0; +} + +static void quantize_row_iq1_s_impl(const float * restrict x, void * restrict vy, int n, const float * restrict quant_weights) { + + const int gindex = iq2_data_index(LM_GGML_TYPE_IQ1_S); + + const uint64_t * kgrid_q2xs = iq2_data[gindex].grid; + const int * kmap_q2xs = iq2_data[gindex].map; + const uint16_t * kneighbors_q2xs = iq2_data[gindex].neighbours; + + LM_GGML_ASSERT(quant_weights && "missing quantization weights"); + LM_GGML_ASSERT(kgrid_q2xs && "forgot to call lm_ggml_quantize_init()?"); + LM_GGML_ASSERT(kmap_q2xs && "forgot to call lm_ggml_quantize_init()?"); + LM_GGML_ASSERT(kneighbors_q2xs && "forgot to call lm_ggml_quantize_init()?"); + LM_GGML_ASSERT(n%QK_K == 0); + + const int nbl = n/256; + + block_iq1_s * y = vy; + + float scales[QK_K/8]; + float weight[8]; + int8_t L[8]; + float sumx[9]; + float sumw[9]; + float pairs[16]; + int * idx = (int *)(pairs + 1); + uint8_t hbit[QK_K/8]; + + for (int ibl = 0; ibl < nbl; ++ibl) { + + y[ibl].d = LM_GGML_FP32_TO_FP16(0.f); + memset(y[ibl].qs, 0, QK_K/8); + memset(y[ibl].scales, 0, QK_K/16); + + float max_scale = 0; + + const float * xbl = x + QK_K*ibl; + float sumx2 = 0; + for (int i = 0; i < QK_K; ++i) sumx2 += xbl[i]*xbl[i]; + float sigma2 = sumx2/QK_K; + + for (int ib = 0; ib < QK_K/8; ++ib) { + const float * xb = xbl + 8*ib; + const float * qw = quant_weights + QK_K*ibl + 8*ib; + for (int i = 0; i < 8; ++i) weight[i] = qw[i] * sqrtf(sigma2 + xb[i]*xb[i]); + float max = fabsf(xb[0]); + for (int i = 1; i < 8; ++i) max = MAX(max, fabsf(xb[i])); + if (!max) { + scales[ib] = 0; + memset(L, 1, 8); + continue; + } + // Here we solve exactly the sum of squared difference (SSD) weighted minimization problem. + // With just 3 allowed quant values (-1, 0, 1), we can search exhaustively for the two + // boundaries that split the weights xb[i] into 3 groups. To do so, we sort the weights + // in ascending order, compute Si = sum[weight[j] xb[j], j = 0...i] and + // Wi = sum[weight[j], j = 0...i], and use these to quckly get get the optimum scale + // for each possible and score for each split. + for (int j = 0; j < 8; ++j) { + pairs[2*j] = xb[j]; + idx[2*j] = j; + } + qsort(pairs, 8, 2*sizeof(float), iq1_sort_helper); + { + sumx[0] = sumw[0] = 0; + for (int j = 0; j < 8; ++j) { + int i = idx[2*j]; + sumx[j+1] = sumx[j] + weight[i]*xb[i]; + sumw[j+1] = sumw[j] + weight[i]; + } + } + float best_score = 0, scale = max; + int besti1 = 0, besti2 = 0; + for (int i1 = 0; i1 <= 8; ++i1) { + for (int i2 = i1; i2 <= 8; ++i2) { + float sumqx = -(sumx[i1] - sumx[0]) + (sumx[8] - sumx[i2]); + float sumq2 = (sumw[i1] - sumw[0]) + (sumw[8] - sumw[i2]); + if (sumq2 > 0 && sumqx*sumqx > best_score*sumq2) { + scale = sumqx/sumq2; best_score = scale*sumqx; + besti1 = i1; besti2 = i2; + } + } + } + for (int j = 0; j < besti1; ++j) L[idx[2*j]] = 0; + for (int j = besti1; j < besti2; ++j) L[idx[2*j]] = 1; + for (int j = besti2; j < 8; ++j) L[idx[2*j]] = 2; + if (scale < 0) { + for (int j = 0; j < 8; ++j) L[j] = 2 - L[j]; + scale = -scale; + } + // Now we check if the solution found above corresponds to a grid point and, if not, use a neighbouring + // grid point that minimizes SSD. 
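+            // (The 8 ternary quants L[j] in {0,1,2} are packed into 2-bit fields of u, giving an
+            //  index into kmap_q2xs; a negative map entry means the point is not on the grid and a
+            //  neighbouring grid point is searched instead.)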
+ uint16_t u = 0; + for (int j = 0; j < 8; ++j) u |= (L[j] << 2*j); + int grid_index = kmap_q2xs[u]; + if (grid_index < 0) { + const uint16_t * neighbours = kneighbors_q2xs - kmap_q2xs[u] - 1; + grid_index = iq1_find_best_neighbour(neighbours, kgrid_q2xs, xb, weight, &scale, L, NGRID_IQ2XXS); + LM_GGML_ASSERT(grid_index >= 0); + } + y[ibl].qs[ib] = grid_index & 255; + hbit[ib] = grid_index >> 8; + LM_GGML_ASSERT(scale >= 0); + scales[ib] = scale; + max_scale = MAX(max_scale, scale); + } + + if (!max_scale) { + memset(y[ibl].qs, 0, QK_K/8); + continue; + } + + float d = max_scale/15; + y[ibl].d = LM_GGML_FP32_TO_FP16(d*1.085f); // 1.085f is another fudge factor. Don't ask me why it is needed. + float id = 1/d; + for (int ib = 0; ib < QK_K/8; ++ib) { + int l = nearest_int(0.5f*(id*scales[ib]-1)); + l = MAX(0, MIN(7, l)); + if (hbit[ib]) l |= 8; + y[ibl].scales[ib/2] |= (l << 4*(ib%2)); + } + } +} + +size_t quantize_iq1_s(const float * src, void * dst, int nrow, int n_per_row, int64_t * hist, const float * quant_weights) { + (void)hist; + LM_GGML_ASSERT(n_per_row%QK_K == 0); + int nblock = n_per_row/QK_K; + char * qrow = (char *)dst; + for (int row = 0; row < nrow; ++row) { + quantize_row_iq1_s_impl(src, qrow, n_per_row, quant_weights); + src += n_per_row; + qrow += nblock*sizeof(block_iq1_s); + } + return nrow * nblock * sizeof(block_iq1_s); +} + +// ============================ 4-bit non-linear quants + +static inline int best_index_int8(int n, const int8_t * val, float x) { + if (x <= val[0]) return 0; + if (x >= val[n-1]) return n-1; + int ml = 0, mu = n-1; + while (mu-ml > 1) { + int mav = (ml+mu)/2; + if (x < val[mav]) mu = mav; else ml = mav; + } + return x - val[mu-1] < val[mu] - x ? mu-1 : mu; +} + +static void quantize_row_iq4_nl_impl(const int block_size, const float * LM_GGML_RESTRICT x, + lm_ggml_fp16_t * dh, uint8_t * q4, + float * weight, uint8_t * L, + const int8_t * values, + const float * quant_weights) { + + const int ntry = 7; + + float sigma2 = 0; + for (int j = 0; j < QK4_NL; ++j) sigma2 += x[j]*x[j]; + sigma2 *= 2.f/QK4_NL; + + const int nb = QK4_NL/block_size; + + memset(q4, 0, QK4_NL/2); + for (int ib = 0; ib < nb; ++ib) { + dh[ib] = LM_GGML_FP32_TO_FP16(0.f); + const float * xb = x + ib*block_size; + if (quant_weights) { + const float * qw = quant_weights + ib*block_size; + for (int j = 0; j < block_size; ++j) weight[j] = qw[j] * sqrtf(sigma2 + xb[j]*xb[j]); + } else { + for (int j = 0; j < block_size; ++j) weight[j] = xb[j]*xb[j]; + } + float amax = 0, max = 0; + for (int j = 0; j < block_size; ++j) { + float ax = fabsf(xb[j]); + if (ax > amax) { + amax = ax; max = xb[j]; + } + } + if (!amax) { + continue; + } + float d = -max/values[0]; + float id = 1/d; + float sumqx = 0, sumq2 = 0; + for (int j = 0; j < block_size; ++j) { + float al = id*xb[j]; + int l = best_index_int8(16, values, al); + float q = values[l]; + float w = weight[j]; + sumqx += w*q*xb[j]; + sumq2 += w*q*q; + } + float best_id = id; + d = sumqx/sumq2; + float best = d*sumqx; + for (int itry = -ntry; itry <= ntry; ++itry) { + id = (itry + values[0])/max; + sumqx = sumq2 = 0; + for (int j = 0; j < block_size; ++j) { + float al = id*xb[j]; + int l = best_index_int8(16, values, al); + float q = values[l]; + float w = weight[j]; + sumqx += w*q*xb[j]; + sumq2 += w*q*q; + } + if (sumq2 > 0 && sumqx*sumqx > best*sumq2) { + d = sumqx/sumq2; best = d * sumqx; + best_id = id; + } + } + dh[ib] = LM_GGML_FP32_TO_FP16(d); + for (int j = 0; j < block_size; ++j) { + L[ib*block_size + j] = best_index_int8(16, 
values, best_id*xb[j]); + } + } + for (int i = 0; i < QK4_NL/32; ++i) { + for (int j = 0; j < 16; ++j) { + q4[16*i + j] = L[32*i + j] | (L[32*i + 16 + j] << 4); + } + } +} + +size_t quantize_iq4_nl(const float * src, void * dst, int nrow, int n_per_row, int64_t * hist, const float * quant_weights) { + (void)hist; + LM_GGML_ASSERT(n_per_row%QK4_NL == 0); + int nblock = n_per_row/QK4_NL; + char * qrow = (char *)dst; + uint8_t L[QK4_NL]; + float weight[32]; + for (int row = 0; row < nrow; ++row) { + block_iq4_nl * iq4 = (block_iq4_nl *)qrow; + for (int ibl = 0; ibl < nblock; ++ibl) { + const float * qw = quant_weights ? quant_weights + QK4_NL*ibl : NULL; + quantize_row_iq4_nl_impl(32, src + QK4_NL*ibl, &iq4[ibl].d, iq4[ibl].qs, weight, L, kvalues_iq4nl, qw); + } + src += n_per_row; + qrow += nblock*sizeof(block_iq4_nl); + } + return nrow * nblock * sizeof(block_iq4_nl); +} + +void quantize_row_iq4_nl(const float * restrict x, void * restrict vy, int k) { + assert(k % QK4_NL == 0); + block_iq4_nl * restrict y = vy; + quantize_row_iq4_nl_reference(x, y, k); +} + +void quantize_row_iq4_nl_reference(const float * restrict x, block_iq4_nl * restrict y, int k) { + assert(k % QK4_NL == 0); + quantize_iq4_nl(x, y, 1, k, NULL, NULL); +} + diff --git a/cpp/ggml-quants.h b/cpp/ggml-quants.h index 4586ff80..2886d95b 100644 --- a/cpp/ggml-quants.h +++ b/cpp/ggml-quants.h @@ -166,7 +166,7 @@ typedef struct { static_assert(sizeof(block_q8_K) == sizeof(float) + QK_K + QK_K/16*sizeof(int16_t), "wrong q8_K block size/padding"); // (Almost) "true" 2-bit quantization. -// Due to the need to use blocks as per ggml dsign, it ends up using +// Due to the need to use blocks as per ggml design, it ends up using // 2.0625 bpw because of the 16-bit scale for each block of 256. typedef struct { lm_ggml_fp16_t d; @@ -182,72 +182,113 @@ typedef struct { } block_iq2_xs; static_assert(sizeof(block_iq2_xs) == sizeof(lm_ggml_fp16_t) + QK_K/8*sizeof(uint16_t) + QK_K/32, "wrong iq2_xs block size/padding"); +// (Almost) "true" 3-bit quantization. +// Due to the need to use blocks as per ggml design, it ends up using +// 3.0625 bpw because of the 16-bit scale for each block of 256. 
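Before the new block layouts are declared just below, the bits-per-weight figures quoted in these comments can be checked directly from the struct sizes. The snippet is a standalone illustration, not part of the patch; QK_K_ and QK4_NL_ simply mirror the QK_K = 256 and QK4_NL = 32 constants used by the declarations.

#include <stdio.h>

// bpw = 8 * sizeof(block) / (weights per block)
int main(void) {
    const int QK_K_   = 256;  // super-block size used by the K- and IQ-quants
    const int QK4_NL_ = 32;   // block size of the non-linear 4-bit type

    const int iq3_xxs_bytes = 2 + 3*QK_K_/8;          // fp16 scale + 96 bytes of quants = 98
    const int iq1_s_bytes   = 2 + QK_K_/8 + QK_K_/16; // fp16 scale + qs + scales        = 50
    const int iq4_nl_bytes  = 2 + QK4_NL_/2;          // fp16 scale + 16 packed nibbles  = 18

    printf("iq3_xxs: %.4f bpw\n", 8.0*iq3_xxs_bytes/QK_K_);   // 3.0625
    printf("iq1_s  : %.4f bpw\n", 8.0*iq1_s_bytes/QK_K_);     // 1.5625
    printf("iq4_nl : %.4f bpw\n", 8.0*iq4_nl_bytes/QK4_NL_);  // 4.5000
    return 0;
}

Running it prints 3.0625 bpw for iq3_xxs, 1.5625 bpw for iq1_s and 4.5 bpw for iq4_nl, which matches the comment above and the static_asserts on the struct sizes that follow.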
+typedef struct { + lm_ggml_fp16_t d; + uint8_t qs[3*QK_K/8]; +} block_iq3_xxs; +static_assert(sizeof(block_iq3_xxs) == sizeof(lm_ggml_fp16_t) + 3*(QK_K/8), "wrong iq3_xxs block size/padding"); + +typedef struct { + lm_ggml_fp16_t d; + uint8_t qs[QK_K/8]; + uint8_t scales[QK_K/16]; +} block_iq1_s; +static_assert(sizeof(block_iq1_s) == sizeof(lm_ggml_fp16_t) + QK_K/8 + QK_K/16, "wrong iq1_s block size/padding"); + +// Non-linear quants +#define QK4_NL 32 +typedef struct { + lm_ggml_fp16_t d; + uint8_t qs[QK4_NL/2]; +} block_iq4_nl; +static_assert(sizeof(block_iq4_nl) == sizeof(lm_ggml_fp16_t) + QK4_NL/2, "wrong iq4_nl block size/padding"); + +#ifdef __cplusplus +extern "C" { +#endif + // Quantization -void quantize_row_q4_0_reference(const float * restrict x, block_q4_0 * restrict y, int k); -void quantize_row_q4_1_reference(const float * restrict x, block_q4_1 * restrict y, int k); -void quantize_row_q5_0_reference(const float * restrict x, block_q5_0 * restrict y, int k); -void quantize_row_q5_1_reference(const float * restrict x, block_q5_1 * restrict y, int k); -void quantize_row_q8_0_reference(const float * restrict x, block_q8_0 * restrict y, int k); -void quantize_row_q8_1_reference(const float * restrict x, block_q8_1 * restrict y, int k); - -void quantize_row_q2_K_reference(const float * restrict x, block_q2_K * restrict y, int k); -void quantize_row_q3_K_reference(const float * restrict x, block_q3_K * restrict y, int k); -void quantize_row_q4_K_reference(const float * restrict x, block_q4_K * restrict y, int k); -void quantize_row_q5_K_reference(const float * restrict x, block_q5_K * restrict y, int k); -void quantize_row_q6_K_reference(const float * restrict x, block_q6_K * restrict y, int k); -void quantize_row_q8_K_reference(const float * restrict x, block_q8_K * restrict y, int k); - -void quantize_row_q4_0(const float * restrict x, void * restrict y, int k); -void quantize_row_q4_1(const float * restrict x, void * restrict y, int k); -void quantize_row_q5_0(const float * restrict x, void * restrict y, int k); -void quantize_row_q5_1(const float * restrict x, void * restrict y, int k); -void quantize_row_q8_0(const float * restrict x, void * restrict y, int k); -void quantize_row_q8_1(const float * restrict x, void * restrict y, int k); - -void quantize_row_q2_K(const float * restrict x, void * restrict y, int k); -void quantize_row_q3_K(const float * restrict x, void * restrict y, int k); -void quantize_row_q4_K(const float * restrict x, void * restrict y, int k); -void quantize_row_q5_K(const float * restrict x, void * restrict y, int k); -void quantize_row_q6_K(const float * restrict x, void * restrict y, int k); -void quantize_row_q8_K(const float * restrict x, void * restrict y, int k); +void quantize_row_q4_0_reference(const float * LM_GGML_RESTRICT x, block_q4_0 * LM_GGML_RESTRICT y, int k); +void quantize_row_q4_1_reference(const float * LM_GGML_RESTRICT x, block_q4_1 * LM_GGML_RESTRICT y, int k); +void quantize_row_q5_0_reference(const float * LM_GGML_RESTRICT x, block_q5_0 * LM_GGML_RESTRICT y, int k); +void quantize_row_q5_1_reference(const float * LM_GGML_RESTRICT x, block_q5_1 * LM_GGML_RESTRICT y, int k); +void quantize_row_q8_0_reference(const float * LM_GGML_RESTRICT x, block_q8_0 * LM_GGML_RESTRICT y, int k); +void quantize_row_q8_1_reference(const float * LM_GGML_RESTRICT x, block_q8_1 * LM_GGML_RESTRICT y, int k); + +void quantize_row_q2_K_reference(const float * LM_GGML_RESTRICT x, block_q2_K * LM_GGML_RESTRICT y, int k); +void 
quantize_row_q3_K_reference(const float * LM_GGML_RESTRICT x, block_q3_K * LM_GGML_RESTRICT y, int k); +void quantize_row_q4_K_reference(const float * LM_GGML_RESTRICT x, block_q4_K * LM_GGML_RESTRICT y, int k); +void quantize_row_q5_K_reference(const float * LM_GGML_RESTRICT x, block_q5_K * LM_GGML_RESTRICT y, int k); +void quantize_row_q6_K_reference(const float * LM_GGML_RESTRICT x, block_q6_K * LM_GGML_RESTRICT y, int k); +void quantize_row_q8_K_reference(const float * LM_GGML_RESTRICT x, block_q8_K * LM_GGML_RESTRICT y, int k); +void quantize_row_iq3_xxs_reference(const float * LM_GGML_RESTRICT x, block_iq3_xxs * LM_GGML_RESTRICT y, int k); +void quantize_row_iq4_nl_reference (const float * LM_GGML_RESTRICT x, block_iq4_nl * LM_GGML_RESTRICT y, int k); + +void quantize_row_q4_0(const float * LM_GGML_RESTRICT x, void * LM_GGML_RESTRICT y, int k); +void quantize_row_q4_1(const float * LM_GGML_RESTRICT x, void * LM_GGML_RESTRICT y, int k); +void quantize_row_q5_0(const float * LM_GGML_RESTRICT x, void * LM_GGML_RESTRICT y, int k); +void quantize_row_q5_1(const float * LM_GGML_RESTRICT x, void * LM_GGML_RESTRICT y, int k); +void quantize_row_q8_0(const float * LM_GGML_RESTRICT x, void * LM_GGML_RESTRICT y, int k); +void quantize_row_q8_1(const float * LM_GGML_RESTRICT x, void * LM_GGML_RESTRICT y, int k); + +void quantize_row_q2_K(const float * LM_GGML_RESTRICT x, void * LM_GGML_RESTRICT y, int k); +void quantize_row_q3_K(const float * LM_GGML_RESTRICT x, void * LM_GGML_RESTRICT y, int k); +void quantize_row_q4_K(const float * LM_GGML_RESTRICT x, void * LM_GGML_RESTRICT y, int k); +void quantize_row_q5_K(const float * LM_GGML_RESTRICT x, void * LM_GGML_RESTRICT y, int k); +void quantize_row_q6_K(const float * LM_GGML_RESTRICT x, void * LM_GGML_RESTRICT y, int k); +void quantize_row_q8_K(const float * LM_GGML_RESTRICT x, void * LM_GGML_RESTRICT y, int k); +void quantize_row_iq3_xxs(const float * LM_GGML_RESTRICT x, void * LM_GGML_RESTRICT y, int k); +void quantize_row_iq4_nl (const float * LM_GGML_RESTRICT x, void * LM_GGML_RESTRICT y, int k); // Dequantization -void dequantize_row_q4_0(const block_q4_0 * restrict x, float * restrict y, int k); -void dequantize_row_q4_1(const block_q4_1 * restrict x, float * restrict y, int k); -void dequantize_row_q5_0(const block_q5_0 * restrict x, float * restrict y, int k); -void dequantize_row_q5_1(const block_q5_1 * restrict x, float * restrict y, int k); -void dequantize_row_q8_0(const block_q8_0 * restrict x, float * restrict y, int k); -//void dequantize_row_q8_1(const block_q8_1 * restrict x, float * restrict y, int k); - -void dequantize_row_q2_K(const block_q2_K * restrict x, float * restrict y, int k); -void dequantize_row_q3_K(const block_q3_K * restrict x, float * restrict y, int k); -void dequantize_row_q4_K(const block_q4_K * restrict x, float * restrict y, int k); -void dequantize_row_q5_K(const block_q5_K * restrict x, float * restrict y, int k); -void dequantize_row_q6_K(const block_q6_K * restrict x, float * restrict y, int k); -void dequantize_row_q8_K(const block_q8_K * restrict x, float * restrict y, int k); -void dequantize_row_iq2_xxs(const block_iq2_xxs * restrict x, float * restrict y, int k); -void dequantize_row_iq2_xs (const block_iq2_xs * restrict x, float * restrict y, int k); +void dequantize_row_q4_0(const block_q4_0 * LM_GGML_RESTRICT x, float * LM_GGML_RESTRICT y, int k); +void dequantize_row_q4_1(const block_q4_1 * LM_GGML_RESTRICT x, float * LM_GGML_RESTRICT y, int k); +void dequantize_row_q5_0(const block_q5_0 * 
LM_GGML_RESTRICT x, float * LM_GGML_RESTRICT y, int k); +void dequantize_row_q5_1(const block_q5_1 * LM_GGML_RESTRICT x, float * LM_GGML_RESTRICT y, int k); +void dequantize_row_q8_0(const block_q8_0 * LM_GGML_RESTRICT x, float * LM_GGML_RESTRICT y, int k); +//void dequantize_row_q8_1(const block_q8_1 * LM_GGML_RESTRICT x, float * LM_GGML_RESTRICT y, int k); + +void dequantize_row_q2_K(const block_q2_K * LM_GGML_RESTRICT x, float * LM_GGML_RESTRICT y, int k); +void dequantize_row_q3_K(const block_q3_K * LM_GGML_RESTRICT x, float * LM_GGML_RESTRICT y, int k); +void dequantize_row_q4_K(const block_q4_K * LM_GGML_RESTRICT x, float * LM_GGML_RESTRICT y, int k); +void dequantize_row_q5_K(const block_q5_K * LM_GGML_RESTRICT x, float * LM_GGML_RESTRICT y, int k); +void dequantize_row_q6_K(const block_q6_K * LM_GGML_RESTRICT x, float * LM_GGML_RESTRICT y, int k); +void dequantize_row_q8_K(const block_q8_K * LM_GGML_RESTRICT x, float * LM_GGML_RESTRICT y, int k); +void dequantize_row_iq2_xxs(const block_iq2_xxs * LM_GGML_RESTRICT x, float * LM_GGML_RESTRICT y, int k); +void dequantize_row_iq2_xs (const block_iq2_xs * LM_GGML_RESTRICT x, float * LM_GGML_RESTRICT y, int k); +void dequantize_row_iq3_xxs(const block_iq3_xxs * LM_GGML_RESTRICT x, float * LM_GGML_RESTRICT y, int k); +void dequantize_row_iq1_s (const block_iq1_s * LM_GGML_RESTRICT x, float * LM_GGML_RESTRICT y, int k); +void dequantize_row_iq4_nl (const block_iq4_nl * LM_GGML_RESTRICT x, float * LM_GGML_RESTRICT y, int k); // Dot product -void lm_ggml_vec_dot_q4_0_q8_0(int n, float * restrict s, const void * restrict vx, const void * restrict vy); -void lm_ggml_vec_dot_q4_1_q8_1(int n, float * restrict s, const void * restrict vx, const void * restrict vy); -void lm_ggml_vec_dot_q5_0_q8_0(int n, float * restrict s, const void * restrict vx, const void * restrict vy); -void lm_ggml_vec_dot_q5_1_q8_1(int n, float * restrict s, const void * restrict vx, const void * restrict vy); -void lm_ggml_vec_dot_q8_0_q8_0(int n, float * restrict s, const void * restrict vx, const void * restrict vy); - -void lm_ggml_vec_dot_q2_K_q8_K(int n, float * restrict s, const void * restrict vx, const void * restrict vy); -void lm_ggml_vec_dot_q3_K_q8_K(int n, float * restrict s, const void * restrict vx, const void * restrict vy); -void lm_ggml_vec_dot_q4_K_q8_K(int n, float * restrict s, const void * restrict vx, const void * restrict vy); -void lm_ggml_vec_dot_q5_K_q8_K(int n, float * restrict s, const void * restrict vx, const void * restrict vy); -void lm_ggml_vec_dot_q6_K_q8_K(int n, float * restrict s, const void * restrict vx, const void * restrict vy); -void lm_ggml_vec_dot_iq2_xxs_q8_K(int n, float * restrict s, const void * restrict vx, const void * restrict vy); -void lm_ggml_vec_dot_iq2_xs_q8_K (int n, float * restrict s, const void * restrict vx, const void * restrict vy); +void lm_ggml_vec_dot_q4_0_q8_0(int n, float * LM_GGML_RESTRICT s, size_t bs, const void * LM_GGML_RESTRICT vx, size_t bx, const void * LM_GGML_RESTRICT vy, size_t by, int nrc); +void lm_ggml_vec_dot_q4_1_q8_1(int n, float * LM_GGML_RESTRICT s, size_t bs, const void * LM_GGML_RESTRICT vx, size_t bx, const void * LM_GGML_RESTRICT vy, size_t by, int nrc); +void lm_ggml_vec_dot_q5_0_q8_0(int n, float * LM_GGML_RESTRICT s, size_t bs, const void * LM_GGML_RESTRICT vx, size_t bx, const void * LM_GGML_RESTRICT vy, size_t by, int nrc); +void lm_ggml_vec_dot_q5_1_q8_1(int n, float * LM_GGML_RESTRICT s, size_t bs, const void * LM_GGML_RESTRICT vx, size_t bx, const void * LM_GGML_RESTRICT 
vy, size_t by, int nrc); +void lm_ggml_vec_dot_q8_0_q8_0(int n, float * LM_GGML_RESTRICT s, size_t bs, const void * LM_GGML_RESTRICT vx, size_t bx, const void * LM_GGML_RESTRICT vy, size_t by, int nrc); + +void lm_ggml_vec_dot_q2_K_q8_K(int n, float * LM_GGML_RESTRICT s, size_t bs, const void * LM_GGML_RESTRICT vx, size_t bx, const void * LM_GGML_RESTRICT vy, size_t by, int nrc); +void lm_ggml_vec_dot_q3_K_q8_K(int n, float * LM_GGML_RESTRICT s, size_t bs, const void * LM_GGML_RESTRICT vx, size_t bx, const void * LM_GGML_RESTRICT vy, size_t by, int nrc); +void lm_ggml_vec_dot_q4_K_q8_K(int n, float * LM_GGML_RESTRICT s, size_t bs, const void * LM_GGML_RESTRICT vx, size_t bx, const void * LM_GGML_RESTRICT vy, size_t by, int nrc); +void lm_ggml_vec_dot_q5_K_q8_K(int n, float * LM_GGML_RESTRICT s, size_t bs, const void * LM_GGML_RESTRICT vx, size_t bx, const void * LM_GGML_RESTRICT vy, size_t by, int nrc); +void lm_ggml_vec_dot_q6_K_q8_K(int n, float * LM_GGML_RESTRICT s, size_t bs, const void * LM_GGML_RESTRICT vx, size_t bx, const void * LM_GGML_RESTRICT vy, size_t by, int nrc); +void lm_ggml_vec_dot_iq2_xxs_q8_K(int n, float * LM_GGML_RESTRICT s, size_t bs, const void * LM_GGML_RESTRICT vx, size_t bx, const void * LM_GGML_RESTRICT vy, size_t by, int nrc); +void lm_ggml_vec_dot_iq2_xs_q8_K (int n, float * LM_GGML_RESTRICT s, size_t bs, const void * LM_GGML_RESTRICT vx, size_t bx, const void * LM_GGML_RESTRICT vy, size_t by, int nrc); +void lm_ggml_vec_dot_iq3_xxs_q8_K(int n, float * LM_GGML_RESTRICT s, size_t bs, const void * LM_GGML_RESTRICT vx, size_t bx, const void * LM_GGML_RESTRICT vy, size_t by, int nrc); +void lm_ggml_vec_dot_iq1_s_q8_K (int n, float * LM_GGML_RESTRICT s, size_t bs, const void * LM_GGML_RESTRICT vx, size_t bx, const void * LM_GGML_RESTRICT vy, size_t by, int nrc); +void lm_ggml_vec_dot_iq4_nl_q8_0 (int n, float * LM_GGML_RESTRICT s, size_t bs, const void * LM_GGML_RESTRICT vx, size_t bx, const void * LM_GGML_RESTRICT vy, size_t by, int nrc); // // Quantization utilizing an importance matrix (a.k.a. 
"Activation aWare Quantization") // size_t quantize_iq2_xxs(const float * src, void * dst, int nrows, int n_per_row, int64_t * hist, const float * imatrix); size_t quantize_iq2_xs (const float * src, void * dst, int nrows, int n_per_row, int64_t * hist, const float * imatrix); +size_t quantize_iq3_xxs(const float * src, void * dst, int nrows, int n_per_row, int64_t * hist, const float * imatrix); +size_t quantize_iq1_s (const float * src, void * dst, int nrows, int n_per_row, int64_t * hist, const float * imatrix); +size_t quantize_iq4_nl (const float * src, void * dst, int nrows, int n_per_row, int64_t * hist, const float * imatrix); size_t quantize_q2_K (const float * src, void * dst, int nrows, int n_per_row, int64_t * hist, const float * imatrix); size_t quantize_q3_K (const float * src, void * dst, int nrows, int n_per_row, int64_t * hist, const float * imatrix); size_t quantize_q4_K (const float * src, void * dst, int nrows, int n_per_row, int64_t * hist, const float * imatrix); @@ -258,5 +299,12 @@ size_t quantize_q4_1 (const float * src, void * dst, int nrows, int n_per_row, size_t quantize_q5_0 (const float * src, void * dst, int nrows, int n_per_row, int64_t * hist, const float * imatrix); size_t quantize_q5_1 (const float * src, void * dst, int nrows, int n_per_row, int64_t * hist, const float * imatrix); -void iq2xs_init_impl(int grid_size); -void iq2xs_free_impl(int grid_size); +void iq2xs_init_impl(enum lm_ggml_type type); +void iq2xs_free_impl(enum lm_ggml_type type); +void iq3xs_init_impl(int grid_size); +void iq3xs_free_impl(int grid_size); + +#ifdef __cplusplus +} +#endif + diff --git a/cpp/ggml.c b/cpp/ggml.c index 48498aee..13003520 100644 --- a/cpp/ggml.c +++ b/cpp/ggml.c @@ -23,6 +23,9 @@ #include #include #include +#if defined(__gnu_linux__) +#include +#endif #ifdef LM_GGML_USE_METAL #include @@ -218,6 +221,7 @@ inline static void * lm_ggml_aligned_malloc(size_t size) { break; } LM_GGML_PRINT("%s: %s (attempted to allocate %6.2f MB)\n", __func__, error_desc, size/(1024.0*1024.0)); + LM_GGML_ASSERT(false); return NULL; } return aligned_memory; @@ -230,6 +234,38 @@ inline static void * lm_ggml_aligned_malloc(size_t size) { #endif #endif +inline static void * lm_ggml_malloc(size_t size) { + if (size == 0) { + LM_GGML_PRINT("WARNING: Behavior may be unexpected when allocating 0 bytes for lm_ggml_malloc!\n"); + return NULL; + } + void * result = malloc(size); + if (result == NULL) { + LM_GGML_PRINT("%s: failed to allocate %6.2f MB\n", __func__, size/(1024.0*1024.0)); + LM_GGML_ASSERT(false); + } + return result; +} + +// calloc +inline static void * lm_ggml_calloc(size_t num, size_t size) { + if (num == 0 || size == 0) { + LM_GGML_PRINT("WARNING: Behavior may be unexpected when allocating 0 bytes for lm_ggml_calloc!\n"); + return NULL; + } + void * result = calloc(num, size); + if (result == NULL) { + LM_GGML_PRINT("%s: failed to allocate %6.2f MB\n", __func__, size/(1024.0*1024.0)); + LM_GGML_ASSERT(false); + } + return result; +} + +#define LM_GGML_MALLOC(size) lm_ggml_malloc(size) +#define LM_GGML_CALLOC(num, size) lm_ggml_calloc(num, size) + +#define LM_GGML_FREE(ptr) free(ptr) + #define UNUSED LM_GGML_UNUSED #define SWAP(x, y, T) do { T SWAP = x; x = y; y = SWAP; } while (0) @@ -237,6 +273,8 @@ inline static void * lm_ggml_aligned_malloc(size_t size) { #include #if defined(LM_GGML_USE_CLBLAST) // allow usage of CLBlast alongside Accelerate functions #include "ggml-opencl.h" +#elif defined(LM_GGML_USE_VULKAN) +#include "ggml-vulkan.h" #endif #elif 
defined(LM_GGML_USE_OPENBLAS) #if defined(LM_GGML_BLAS_USE_MKL) @@ -248,6 +286,10 @@ inline static void * lm_ggml_aligned_malloc(size_t size) { #include "ggml-cuda.h" #elif defined(LM_GGML_USE_CLBLAST) #include "ggml-opencl.h" +#elif defined(LM_GGML_USE_VULKAN) +#include "ggml-vulkan.h" +#elif defined(LM_GGML_USE_SYCL) +#include "ggml-sycl.h" #endif // floating point type used to accumulate sums @@ -391,8 +433,8 @@ int64_t lm_ggml_cycles_per_ms(void) { static const size_t CACHE_LINE_SIZE_F32 = CACHE_LINE_SIZE/sizeof(float); -static void lm_ggml_vec_dot_f32(const int n, float * restrict s, const float * restrict x, const float * restrict y); -static void lm_ggml_vec_dot_f16(const int n, float * restrict s, lm_ggml_fp16_t * restrict x, lm_ggml_fp16_t * restrict y); +static void lm_ggml_vec_dot_f32(int n, float * restrict s, size_t bs, const float * restrict x, size_t bx, const float * restrict y, size_t by, int nrc); +static void lm_ggml_vec_dot_f16(int n, float * restrict s, size_t bs, lm_ggml_fp16_t * restrict x, size_t bx, lm_ggml_fp16_t * restrict y, size_t by, int nrc); static const lm_ggml_type_traits_t type_traits[LM_GGML_TYPE_COUNT] = { [LM_GGML_TYPE_I8] = { @@ -420,6 +462,7 @@ static const lm_ggml_type_traits_t type_traits[LM_GGML_TYPE_COUNT] = { .is_quantized = false, .vec_dot = (lm_ggml_vec_dot_t) lm_ggml_vec_dot_f32, .vec_dot_type = LM_GGML_TYPE_F32, + .nrows = 1, }, [LM_GGML_TYPE_F16] = { .type_name = "f16", @@ -431,6 +474,7 @@ static const lm_ggml_type_traits_t type_traits[LM_GGML_TYPE_COUNT] = { .from_float_reference = (lm_ggml_from_float_t) lm_ggml_fp32_to_fp16_row, .vec_dot = (lm_ggml_vec_dot_t) lm_ggml_vec_dot_f16, .vec_dot_type = LM_GGML_TYPE_F16, + .nrows = 1, }, [LM_GGML_TYPE_Q4_0] = { .type_name = "q4_0", @@ -442,6 +486,11 @@ static const lm_ggml_type_traits_t type_traits[LM_GGML_TYPE_COUNT] = { .from_float_reference = (lm_ggml_from_float_t) quantize_row_q4_0_reference, .vec_dot = lm_ggml_vec_dot_q4_0_q8_0, .vec_dot_type = LM_GGML_TYPE_Q8_0, +#if defined (__ARM_FEATURE_MATMUL_INT8) + .nrows = 2, +#else + .nrows = 1, +#endif }, [LM_GGML_TYPE_Q4_1] = { .type_name = "q4_1", @@ -453,6 +502,11 @@ static const lm_ggml_type_traits_t type_traits[LM_GGML_TYPE_COUNT] = { .from_float_reference = (lm_ggml_from_float_t) quantize_row_q4_1_reference, .vec_dot = lm_ggml_vec_dot_q4_1_q8_1, .vec_dot_type = LM_GGML_TYPE_Q8_1, +#if defined (__ARM_FEATURE_MATMUL_INT8) + .nrows = 2, +#else + .nrows = 1, +#endif }, [4] = { // LM_GGML_TYPE_Q4_2 .type_name = "DEPRECATED", @@ -464,6 +518,7 @@ static const lm_ggml_type_traits_t type_traits[LM_GGML_TYPE_COUNT] = { .from_float_reference = NULL, .vec_dot = NULL, .vec_dot_type = LM_GGML_TYPE_COUNT, + .nrows = 1, }, [5] = { // LM_GGML_TYPE_Q4_3 .type_name = "DEPRECATED", @@ -475,6 +530,7 @@ static const lm_ggml_type_traits_t type_traits[LM_GGML_TYPE_COUNT] = { .from_float_reference = NULL, .vec_dot = NULL, .vec_dot_type = LM_GGML_TYPE_COUNT, + .nrows = 1, }, [LM_GGML_TYPE_Q5_0] = { .type_name = "q5_0", @@ -486,6 +542,7 @@ static const lm_ggml_type_traits_t type_traits[LM_GGML_TYPE_COUNT] = { .from_float_reference = (lm_ggml_from_float_t) quantize_row_q5_0_reference, .vec_dot = lm_ggml_vec_dot_q5_0_q8_0, .vec_dot_type = LM_GGML_TYPE_Q8_0, + .nrows = 1, }, [LM_GGML_TYPE_Q5_1] = { .type_name = "q5_1", @@ -497,6 +554,7 @@ static const lm_ggml_type_traits_t type_traits[LM_GGML_TYPE_COUNT] = { .from_float_reference = (lm_ggml_from_float_t) quantize_row_q5_1_reference, .vec_dot = lm_ggml_vec_dot_q5_1_q8_1, .vec_dot_type = LM_GGML_TYPE_Q8_1, + .nrows = 1, }, 
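The dot-product kernels referenced by these type_traits entries now take three extra stride arguments plus a row count, and each entry records how many rows a single call produces via the new .nrows field (2 for q4_0, q4_1 and q8_0 when ARM's int8 matmul extension is available, 1 otherwise). A minimal scalar sketch of the new shape, modeled on the lm_ggml_vec_dot_f32 change further down and not taken from the patch itself (vec_dot_f32_ref is a made-up name), looks like this:

#include <assert.h>
#include <stddef.h>

// Reference sketch of the extended dot-product signature introduced in this sync.
// The strides bs/bx/by and the row count nrc only matter for kernels whose type_traits
// entry sets .nrows > 1; the plain single-row path asserts nrc == 1 and ignores them.
static void vec_dot_f32_ref(int n, float * s, size_t bs,
                            const float * x, size_t bx,
                            const float * y, size_t by, int nrc) {
    assert(nrc == 1);
    (void)bs; (void)bx; (void)by; (void)nrc;

    float sumf = 0.0f;
    for (int i = 0; i < n; ++i) {
        sumf += x[i]*y[i];
    }
    *s = sumf;
}

Kernels that advertise .nrows = 2 are called with nrc == 2 and use the stride arguments to locate the second result slot and the second input rows; single-row kernels, like the f32 and f16 paths in this patch, simply assert nrc == 1 as above.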
[LM_GGML_TYPE_Q8_0] = { .type_name = "q8_0", @@ -508,6 +566,11 @@ static const lm_ggml_type_traits_t type_traits[LM_GGML_TYPE_COUNT] = { .from_float_reference = (lm_ggml_from_float_t) quantize_row_q8_0_reference, .vec_dot = lm_ggml_vec_dot_q8_0_q8_0, .vec_dot_type = LM_GGML_TYPE_Q8_0, +#if defined (__ARM_FEATURE_MATMUL_INT8) + .nrows = 2, +#else + .nrows = 1, +#endif }, [LM_GGML_TYPE_Q8_1] = { .type_name = "q8_1", @@ -517,6 +580,7 @@ static const lm_ggml_type_traits_t type_traits[LM_GGML_TYPE_COUNT] = { .from_float = quantize_row_q8_1, .from_float_reference = (lm_ggml_from_float_t) quantize_row_q8_1_reference, .vec_dot_type = LM_GGML_TYPE_Q8_1, + .nrows = 1, }, [LM_GGML_TYPE_Q2_K] = { .type_name = "q2_K", @@ -528,6 +592,7 @@ static const lm_ggml_type_traits_t type_traits[LM_GGML_TYPE_COUNT] = { .from_float_reference = (lm_ggml_from_float_t) quantize_row_q2_K_reference, .vec_dot = lm_ggml_vec_dot_q2_K_q8_K, .vec_dot_type = LM_GGML_TYPE_Q8_K, + .nrows = 1, }, [LM_GGML_TYPE_Q3_K] = { .type_name = "q3_K", @@ -539,6 +604,7 @@ static const lm_ggml_type_traits_t type_traits[LM_GGML_TYPE_COUNT] = { .from_float_reference = (lm_ggml_from_float_t) quantize_row_q3_K_reference, .vec_dot = lm_ggml_vec_dot_q3_K_q8_K, .vec_dot_type = LM_GGML_TYPE_Q8_K, + .nrows = 1, }, [LM_GGML_TYPE_Q4_K] = { .type_name = "q4_K", @@ -550,6 +616,7 @@ static const lm_ggml_type_traits_t type_traits[LM_GGML_TYPE_COUNT] = { .from_float_reference = (lm_ggml_from_float_t) quantize_row_q4_K_reference, .vec_dot = lm_ggml_vec_dot_q4_K_q8_K, .vec_dot_type = LM_GGML_TYPE_Q8_K, + .nrows = 1, }, [LM_GGML_TYPE_Q5_K] = { .type_name = "q5_K", @@ -561,6 +628,7 @@ static const lm_ggml_type_traits_t type_traits[LM_GGML_TYPE_COUNT] = { .from_float_reference = (lm_ggml_from_float_t) quantize_row_q5_K_reference, .vec_dot = lm_ggml_vec_dot_q5_K_q8_K, .vec_dot_type = LM_GGML_TYPE_Q8_K, + .nrows = 1, }, [LM_GGML_TYPE_Q6_K] = { .type_name = "q6_K", @@ -572,6 +640,7 @@ static const lm_ggml_type_traits_t type_traits[LM_GGML_TYPE_COUNT] = { .from_float_reference = (lm_ggml_from_float_t) quantize_row_q6_K_reference, .vec_dot = lm_ggml_vec_dot_q6_K_q8_K, .vec_dot_type = LM_GGML_TYPE_Q8_K, + .nrows = 1, }, [LM_GGML_TYPE_IQ2_XXS] = { .type_name = "iq2_xxs", @@ -583,6 +652,7 @@ static const lm_ggml_type_traits_t type_traits[LM_GGML_TYPE_COUNT] = { .from_float_reference = NULL, .vec_dot = lm_ggml_vec_dot_iq2_xxs_q8_K, .vec_dot_type = LM_GGML_TYPE_Q8_K, + .nrows = 1, }, [LM_GGML_TYPE_IQ2_XS] = { .type_name = "iq2_xs", @@ -594,6 +664,43 @@ static const lm_ggml_type_traits_t type_traits[LM_GGML_TYPE_COUNT] = { .from_float_reference = NULL, .vec_dot = lm_ggml_vec_dot_iq2_xs_q8_K, .vec_dot_type = LM_GGML_TYPE_Q8_K, + .nrows = 1, + }, + [LM_GGML_TYPE_IQ3_XXS] = { + .type_name = "iq3_xxs", + .blck_size = QK_K, + .type_size = sizeof(block_iq3_xxs), + .is_quantized = true, + .to_float = (lm_ggml_to_float_t) dequantize_row_iq3_xxs, + .from_float = quantize_row_iq3_xxs, + .from_float_reference = (lm_ggml_from_float_t)quantize_row_iq3_xxs_reference, + .vec_dot = lm_ggml_vec_dot_iq3_xxs_q8_K, + .vec_dot_type = LM_GGML_TYPE_Q8_K, + .nrows = 1, + }, + [LM_GGML_TYPE_IQ1_S] = { + .type_name = "iq1_s", + .blck_size = QK_K, + .type_size = sizeof(block_iq1_s), + .is_quantized = true, + .to_float = (lm_ggml_to_float_t) dequantize_row_iq1_s, + .from_float = NULL, + .from_float_reference = NULL, + .vec_dot = lm_ggml_vec_dot_iq1_s_q8_K, + .vec_dot_type = LM_GGML_TYPE_Q8_K, + .nrows = 1, + }, + [LM_GGML_TYPE_IQ4_NL] = { + .type_name = "iq4_nl", + .blck_size = QK4_NL, + .type_size = 
sizeof(block_iq4_nl), + .is_quantized = true, + .to_float = (lm_ggml_to_float_t) dequantize_row_iq4_nl, + .from_float = quantize_row_iq4_nl, + .from_float_reference = (lm_ggml_from_float_t)quantize_row_iq4_nl_reference, + .vec_dot = lm_ggml_vec_dot_iq4_nl_q8_0, + .vec_dot_type = LM_GGML_TYPE_Q8_0, + .nrows = 1, }, [LM_GGML_TYPE_Q8_K] = { .type_name = "q8_K", @@ -790,7 +897,7 @@ do { \ const __m128 t0 = _mm_add_ps(_mm256_castps256_ps128(x[0]), \ _mm256_extractf128_ps(x[0], 1)); \ const __m128 t1 = _mm_hadd_ps(t0, t0); \ - res = _mm_cvtss_f32(_mm_hadd_ps(t1, t1)); \ + res = (lm_ggml_float) _mm_cvtss_f32(_mm_hadd_ps(t1, t1)); \ } while (0) // TODO: is this optimal ? @@ -1071,7 +1178,7 @@ inline static void __wasm_f16x4_store(lm_ggml_fp16_t * p, v128_t x) { x[i] = _mm_add_ps(x[i], x[offset+i]); \ } \ const __m128 t0 = _mm_hadd_ps(x[0], x[0]); \ - res = _mm_cvtss_f32(_mm_hadd_ps(t0, t0)); \ + res = (lm_ggml_float) _mm_cvtss_f32(_mm_hadd_ps(t0, t0)); \ } // TODO: is this optimal ? @@ -1164,7 +1271,13 @@ inline static void lm_ggml_vec_neg_f32 (const int n, float * y, const float * x) inline static void lm_ggml_vec_mul_f32 (const int n, float * z, const float * x, const float * y) { for (int i = 0; i < n; ++i) z[i] = x[i]*y[i]; } inline static void lm_ggml_vec_div_f32 (const int n, float * z, const float * x, const float * y) { for (int i = 0; i < n; ++i) z[i] = x[i]/y[i]; } -static void lm_ggml_vec_dot_f32(const int n, float * restrict s, const float * restrict x, const float * restrict y) { +static void lm_ggml_vec_dot_f32(int n, float * restrict s, size_t bs, const float * restrict x, size_t bx, const float * restrict y, size_t by, int nrc) { + assert(nrc == 1); + UNUSED(nrc); + UNUSED(bx); + UNUSED(by); + UNUSED(bs); + #ifdef LM_GGML_SIMD float sumf = 0.0f; const int np = (n & ~(LM_GGML_F32_STEP - 1)); @@ -1201,7 +1314,13 @@ static void lm_ggml_vec_dot_f32(const int n, float * restrict s, const float * r *s = sumf; } -static void lm_ggml_vec_dot_f16(const int n, float * restrict s, lm_ggml_fp16_t * restrict x, lm_ggml_fp16_t * restrict y) { +static void lm_ggml_vec_dot_f16(int n, float * restrict s, size_t bs, lm_ggml_fp16_t * restrict x, size_t bx, lm_ggml_fp16_t * restrict y, size_t by, int nrc) { + assert(nrc == 1); + UNUSED(nrc); + UNUSED(bx); + UNUSED(by); + UNUSED(bs); + lm_ggml_float sumf = 0.0; #if defined(LM_GGML_SIMD) @@ -1407,7 +1526,7 @@ inline static void lm_ggml_vec_scale_f32(const int n, float * y, const float v #endif } -inline static void lm_ggml_vec_norm_f32 (const int n, float * s, const float * x) { lm_ggml_vec_dot_f32(n, s, x, x); *s = sqrtf(*s); } +inline static void lm_ggml_vec_norm_f32 (const int n, float * s, const float * x) { lm_ggml_vec_dot_f32(n, s, 0, x, 0, x, 0, 1); *s = sqrtf(*s); } inline static void lm_ggml_vec_sqr_f32 (const int n, float * y, const float * x) { for (int i = 0; i < n; ++i) y[i] = x[i]*x[i]; } inline static void lm_ggml_vec_sqrt_f32 (const int n, float * y, const float * x) { for (int i = 0; i < n; ++i) y[i] = sqrtf(x[i]); } inline static void lm_ggml_vec_log_f32 (const int n, float * y, const float * x) { for (int i = 0; i < n; ++i) y[i] = logf(x[i]); } @@ -1418,6 +1537,9 @@ inline static void lm_ggml_vec_tanh_f32 (const int n, float * y, const float * x inline static void lm_ggml_vec_elu_f32 (const int n, float * y, const float * x) { for (int i = 0; i < n; ++i) y[i] = (x[i] > 0.f) ? x[i] : expf(x[i])-1; } inline static void lm_ggml_vec_relu_f32 (const int n, float * y, const float * x) { for (int i = 0; i < n; ++i) y[i] = (x[i] > 0.f) ? 
x[i] : 0.f; } inline static void lm_ggml_vec_leaky_relu_f32 (const int n, float * y, const float * x, const float ns) { for (int i = 0; i < n; ++i) y[i] = ((x[i] > 0.f) ? x[i] : 0.f) + ns * ((x[i] < 0.0f) ? x[i] : 0.f); } +// TODO: optimize performance +inline static void lm_ggml_vec_hardswish_f32 (const int n, float * y, const float * x) { for (int i = 0; i < n; ++i) y[i] = x[i] * fminf(1.0f, fmaxf(0.0f, (x[i] + 3.0f) / 6.0f)); } +inline static void lm_ggml_vec_hardsigmoid_f32 (const int n, float * y, const float * x) { for (int i = 0; i < n; ++i) y[i] = fminf(1.0f, fmaxf(0.0f, (x[i] + 3.0f) / 6.0f)); } static const float GELU_COEF_A = 0.044715f; static const float GELU_QUICK_COEF = -1.702f; @@ -1776,9 +1898,11 @@ static const char * LM_GGML_UNARY_OP_NAME[LM_GGML_UNARY_OP_COUNT] = { "GELU", "GELU_QUICK", "SILU", + "HARDSWISH", + "HARDSIGMOID", }; -static_assert(LM_GGML_UNARY_OP_COUNT == 10, "LM_GGML_UNARY_OP_COUNT != 10"); +static_assert(LM_GGML_UNARY_OP_COUNT == 12, "LM_GGML_UNARY_OP_COUNT != 12"); static_assert(sizeof(struct lm_ggml_object)%LM_GGML_MEM_ALIGN == 0, "lm_ggml_object size must be a multiple of LM_GGML_MEM_ALIGN"); @@ -1859,9 +1983,16 @@ struct lm_ggml_numa_node { }; struct lm_ggml_numa_nodes { + enum lm_ggml_numa_strategy numa_strategy; struct lm_ggml_numa_node nodes[LM_GGML_NUMA_MAX_NODES]; uint32_t n_nodes; uint32_t total_cpus; // hardware threads on system + uint32_t current_node; // node on which main process is execting +#if defined(__gnu_linux__) + cpu_set_t cpuset; // cpuset from numactl +#else + uint32_t cpuset; // no NUMA support outside of Linux at this time. Use a portable datatype +#endif }; // @@ -1895,18 +2026,40 @@ inline static void lm_ggml_critical_section_end(void) { atomic_fetch_sub(&g_state_barrier, 1); } -void lm_ggml_numa_init(void) { +#if defined(__gnu_linux__) +static cpu_set_t lm_ggml_get_numa_affinity(void) { + cpu_set_t cpuset; + pthread_t thread; + thread = pthread_self(); + CPU_ZERO(&cpuset); + pthread_getaffinity_np(thread, sizeof(cpu_set_t), &cpuset); + return cpuset; +} +#else +static uint32_t lm_ggml_get_numa_affinity(void) { + return 0; // no NUMA support +} +#endif + +void lm_ggml_numa_init(enum lm_ggml_numa_strategy numa_flag) { if (g_state.numa.n_nodes > 0) { fprintf(stderr, "lm_ggml_numa_init: NUMA already initialized\n"); return; } -#ifdef __linux__ +#if defined(__gnu_linux__) struct stat st; char path[256]; int rv; + // set numa scheme + g_state.numa.numa_strategy = numa_flag; + + LM_GGML_PRINT_DEBUG("numa strategy %u\n",g_state.numa.numa_strategy); + + g_state.numa.cpuset = lm_ggml_get_numa_affinity(); + // enumerate nodes while (g_state.numa.n_nodes < LM_GGML_NUMA_MAX_NODES) { rv = snprintf(path, sizeof(path), "/sys/devices/system/node/node%u", g_state.numa.n_nodes); @@ -1925,11 +2078,23 @@ void lm_ggml_numa_init(void) { LM_GGML_PRINT_DEBUG("found %u numa nodes, %u CPUs\n", g_state.numa.n_nodes, g_state.numa.total_cpus); - if (g_state.numa.n_nodes < 1 || g_state.numa.total_cpus < 1) { + // figure out which node we're on + uint current_cpu; + int getcpu_ret = 0; +#if __GLIBC__ > 2 || (__GLIBC__ == 2 && __GLIBC_MINOR__ > 28) + getcpu_ret = getcpu(¤t_cpu, &g_state.numa.current_node); +#else + // old glibc doesn't have a wrapper for this call. 
Fall back on direct syscall + getcpu_ret = syscall(SYS_getcpu,¤t_cpu,&g_state.numa.current_node); +#endif + + if (g_state.numa.n_nodes < 1 || g_state.numa.total_cpus < 1 || getcpu_ret != 0) { g_state.numa.n_nodes = 0; return; } + LM_GGML_PRINT_DEBUG("found our process on numa node %u, CPU %u\n", g_state.numa.current_node, current_cpu); + for (uint32_t n = 0; n < g_state.numa.n_nodes; ++n) { struct lm_ggml_numa_node * node = &g_state.numa.nodes[n]; LM_GGML_PRINT_DEBUG("CPUs on node %u:", n); @@ -1956,6 +2121,7 @@ void lm_ggml_numa_init(void) { } } #else + LM_GGML_UNUSED(numa_flag); // TODO #endif } @@ -2135,6 +2301,9 @@ enum lm_ggml_type lm_ggml_ftype_to_lm_ggml_type(enum lm_ggml_ftype ftype) { case LM_GGML_FTYPE_MOSTLY_Q6_K: wtype = LM_GGML_TYPE_Q6_K; break; case LM_GGML_FTYPE_MOSTLY_IQ2_XXS: wtype = LM_GGML_TYPE_IQ2_XXS; break; case LM_GGML_FTYPE_MOSTLY_IQ2_XS: wtype = LM_GGML_TYPE_IQ2_XS; break; + case LM_GGML_FTYPE_MOSTLY_IQ3_XXS: wtype = LM_GGML_TYPE_IQ3_XXS; break; + case LM_GGML_FTYPE_MOSTLY_IQ1_S: wtype = LM_GGML_TYPE_IQ1_S; break; + case LM_GGML_FTYPE_MOSTLY_IQ4_NL: wtype = LM_GGML_TYPE_IQ4_NL; break; case LM_GGML_FTYPE_UNKNOWN: wtype = LM_GGML_TYPE_COUNT; break; case LM_GGML_FTYPE_MOSTLY_Q4_1_SOME_F16: wtype = LM_GGML_TYPE_COUNT; break; } @@ -2288,6 +2457,10 @@ struct lm_ggml_context * lm_ggml_init(struct lm_ggml_init_params params) { lm_ggml_init_cublas(); #elif defined(LM_GGML_USE_CLBLAST) lm_ggml_cl_init(); +#elif defined(LM_GGML_USE_VULKAN) + lm_ggml_vk_init_cpu_assist(); +#elif defined(LM_GGML_USE_SYCL) + lm_ggml_init_sycl(); #endif lm_ggml_setup_op_has_task_pass(); @@ -2412,7 +2585,8 @@ size_t lm_ggml_get_max_tensor_size(const struct lm_ggml_context * ctx) { size_t max_size = 0; for (struct lm_ggml_tensor * tensor = lm_ggml_get_first_tensor(ctx); tensor != NULL; tensor = lm_ggml_get_next_tensor(ctx, tensor)) { - max_size = MAX(max_size, lm_ggml_nbytes(tensor)); + size_t bytes = lm_ggml_nbytes(tensor); + max_size = MAX(max_size, bytes); } return max_size; @@ -2548,7 +2722,7 @@ static struct lm_ggml_tensor * lm_ggml_new_tensor_impl( /*.nb =*/ { 0, 0, 0, 0 }, /*.op =*/ LM_GGML_OP_NONE, /*.op_params =*/ { 0 }, - /*.is_param =*/ false, + /*.flags =*/ 0, /*.grad =*/ NULL, /*.src =*/ { NULL }, /*.perf_runs =*/ 0, @@ -3083,7 +3257,7 @@ const char * lm_ggml_get_name(const struct lm_ggml_tensor * tensor) { } struct lm_ggml_tensor * lm_ggml_set_name(struct lm_ggml_tensor * tensor, const char * name) { - strncpy(tensor->name, name, sizeof(tensor->name)); + strncpy(tensor->name, name, sizeof(tensor->name) - 1); tensor->name[sizeof(tensor->name) - 1] = '\0'; return tensor; } @@ -3945,6 +4119,20 @@ struct lm_ggml_tensor * lm_ggml_silu_back( return result; } +// ggml hardswish +struct lm_ggml_tensor * lm_ggml_hardswish( + struct lm_ggml_context * ctx, + struct lm_ggml_tensor * a) { + return lm_ggml_unary(ctx, a, LM_GGML_UNARY_OP_HARDSWISH); +} + +// ggml hardsigmoid +struct lm_ggml_tensor * lm_ggml_hardsigmoid( + struct lm_ggml_context * ctx, + struct lm_ggml_tensor * a) { + return lm_ggml_unary(ctx, a, LM_GGML_UNARY_OP_HARDSIGMOID); +} + // lm_ggml_norm static struct lm_ggml_tensor * lm_ggml_norm_impl( @@ -4945,16 +5133,28 @@ static struct lm_ggml_tensor * lm_ggml_soft_max_impl( struct lm_ggml_context * ctx, struct lm_ggml_tensor * a, struct lm_ggml_tensor * mask, + struct lm_ggml_tensor * pos, float scale, + float max_bias, bool inplace) { LM_GGML_ASSERT(lm_ggml_is_contiguous(a)); + if (mask) { LM_GGML_ASSERT(lm_ggml_is_contiguous(mask)); - LM_GGML_ASSERT(mask->ne[2] == 1); - 
LM_GGML_ASSERT(mask->ne[3] == 1); + LM_GGML_ASSERT(lm_ggml_is_matrix(mask)); LM_GGML_ASSERT(lm_ggml_can_repeat_rows(mask, a)); } + if (pos) { + LM_GGML_ASSERT(lm_ggml_is_vector(pos)); + LM_GGML_ASSERT(pos->type == LM_GGML_TYPE_F32); + LM_GGML_ASSERT(pos->ne[0] == a->ne[0]); + } + + if (max_bias > 0.0f) { + LM_GGML_ASSERT(pos); + } + bool is_node = false; if (a->grad) { @@ -4963,13 +5163,14 @@ static struct lm_ggml_tensor * lm_ggml_soft_max_impl( struct lm_ggml_tensor * result = inplace ? lm_ggml_view_tensor(ctx, a) : lm_ggml_dup_tensor(ctx, a); - float params[] = { scale }; + float params[] = { scale, max_bias }; lm_ggml_set_op_params(result, params, sizeof(params)); result->op = LM_GGML_OP_SOFT_MAX; result->grad = is_node ? lm_ggml_dup_tensor(ctx, result) : NULL; result->src[0] = a; result->src[1] = mask; + result->src[2] = pos; return result; } @@ -4977,21 +5178,23 @@ static struct lm_ggml_tensor * lm_ggml_soft_max_impl( struct lm_ggml_tensor * lm_ggml_soft_max( struct lm_ggml_context * ctx, struct lm_ggml_tensor * a) { - return lm_ggml_soft_max_impl(ctx, a, NULL, 1.0f, false); + return lm_ggml_soft_max_impl(ctx, a, NULL, NULL, 1.0f, 0.0f, false); } struct lm_ggml_tensor * lm_ggml_soft_max_inplace( struct lm_ggml_context * ctx, struct lm_ggml_tensor * a) { - return lm_ggml_soft_max_impl(ctx, a, NULL, 1.0f, true); + return lm_ggml_soft_max_impl(ctx, a, NULL, NULL, 1.0f, 0.0f, true); } struct lm_ggml_tensor * lm_ggml_soft_max_ext( struct lm_ggml_context * ctx, struct lm_ggml_tensor * a, struct lm_ggml_tensor * mask, - float scale) { - return lm_ggml_soft_max_impl(ctx, a, mask, scale, false); + struct lm_ggml_tensor * pos, + float scale, + float max_bias) { + return lm_ggml_soft_max_impl(ctx, a, mask, pos, scale, max_bias, false); } // lm_ggml_soft_max_back @@ -5277,7 +5480,7 @@ LM_GGML_API struct lm_ggml_tensor * lm_ggml_conv_1d( int s0, int p0, int d0) { - struct lm_ggml_tensor * im2col = lm_ggml_im2col(ctx, a, b, s0, 0, p0, 0, d0, 0, false); // [N, OL, IC * K] + struct lm_ggml_tensor * im2col = lm_ggml_im2col(ctx, a, b, s0, 0, p0, 0, d0, 0, false, LM_GGML_TYPE_F16); // [N, OL, IC * K] struct lm_ggml_tensor * result = lm_ggml_mul_mat(ctx, @@ -5344,6 +5547,30 @@ LM_GGML_API struct lm_ggml_tensor * lm_ggml_conv_transpose_1d( return result; } +// lm_ggml_conv_depthwise +struct lm_ggml_tensor * lm_ggml_conv_depthwise_2d( + struct lm_ggml_context * ctx, + struct lm_ggml_tensor * a, + struct lm_ggml_tensor * b, + int s0, + int s1, + int p0, + int p1, + int d0, + int d1) { + + struct lm_ggml_tensor * new_a = lm_ggml_reshape_4d(ctx, a, a->ne[0], a->ne[1], 1, a->ne[2] * a->ne[3]); + struct lm_ggml_tensor * im2col = lm_ggml_im2col(ctx, new_a, + lm_ggml_reshape_4d(ctx, b, b->ne[0], b->ne[1], 1, b->ne[2] * b->ne[3]), + s0, s1, p0, p1, d0, d1, true, LM_GGML_TYPE_F16); // [N * IC, OH, OW, KH * KW] + struct lm_ggml_tensor * new_b = lm_ggml_reshape_4d(ctx, im2col, im2col->ne[0], im2col->ne[2] * im2col->ne[1], b->ne[2], b->ne[3]); // [N * IC, OH, OW, KH * KW] => [N, IC, OH * OW, KH * KW] + + new_a = lm_ggml_reshape_4d(ctx, new_a, (new_a->ne[0] * new_a->ne[1]), new_a->ne[2], new_a->ne[3], 1); // [OC,1, KH, KW] => [1, OC, 1, KH * KW] + struct lm_ggml_tensor * result = lm_ggml_mul_mat(ctx, new_a, new_b); + result = lm_ggml_reshape_4d(ctx, result, im2col->ne[1], im2col->ne[2], b->ne[2], b->ne[3]); // [N, OC, OH, OW] + + return result; +} // lm_ggml_conv_2d // im2col: [N, IC, IH, IW] => [N, OH, OW, IC*KH*KW] @@ -5360,7 +5587,8 @@ struct lm_ggml_tensor * lm_ggml_im2col( int p1, int d0, int d1, - bool is_2D) { + 
bool is_2D, + enum lm_ggml_type dst_type) { if(is_2D) { LM_GGML_ASSERT(a->ne[2] == b->ne[2]); @@ -5384,7 +5612,7 @@ struct lm_ggml_tensor * lm_ggml_im2col( is_2D ? b->ne[3] : 1, }; - struct lm_ggml_tensor * result = lm_ggml_new_tensor(ctx, LM_GGML_TYPE_F16, 4, ne); + struct lm_ggml_tensor * result = lm_ggml_new_tensor(ctx, dst_type, 4, ne); int32_t params[] = { s0, s1, p0, p1, d0, d1, (is_2D ? 1 : 0) }; lm_ggml_set_op_params(result, params, sizeof(params)); @@ -5409,14 +5637,16 @@ struct lm_ggml_tensor * lm_ggml_conv_2d( int p1, int d0, int d1) { - struct lm_ggml_tensor * im2col = lm_ggml_im2col(ctx, a, b, s0, s1, p0, p1, d0, d1, true); // [N, OH, OW, IC * KH * KW] + struct lm_ggml_tensor * im2col = lm_ggml_im2col(ctx, a, b, s0, s1, p0, p1, d0, d1, true, LM_GGML_TYPE_F16); // [N, OH, OW, IC * KH * KW] struct lm_ggml_tensor * result = lm_ggml_mul_mat(ctx, lm_ggml_reshape_2d(ctx, im2col, im2col->ne[0], im2col->ne[3] * im2col->ne[2] * im2col->ne[1]), // [N, OH, OW, IC * KH * KW] => [N*OH*OW, IC * KH * KW] lm_ggml_reshape_2d(ctx, a, (a->ne[0] * a->ne[1] * a->ne[2]), a->ne[3])); // [OC,IC, KH, KW] => [OC, IC * KH * KW] - result = lm_ggml_reshape_4d(ctx, result, im2col->ne[1], im2col->ne[2], a->ne[3], im2col->ne[3]); // [N, OC, OH, OW] + result = lm_ggml_reshape_4d(ctx, result, im2col->ne[1], im2col->ne[2], im2col->ne[3], a->ne[3]); // [OC, N, OH, OW] + result = lm_ggml_cont(ctx, lm_ggml_permute(ctx, result, 0, 1, 3, 2)); // [N, OC, OH, OW] + return result; } @@ -5535,12 +5765,13 @@ struct lm_ggml_tensor * lm_ggml_pool_2d( is_node = true; } + struct lm_ggml_tensor * result; const int64_t ne[3] = { lm_ggml_calc_pool_output_size(a->ne[0], k0, s0, p0), lm_ggml_calc_pool_output_size(a->ne[1], k1, s1, p1), a->ne[2], }; - struct lm_ggml_tensor * result = lm_ggml_new_tensor(ctx, LM_GGML_TYPE_F32, 3, ne); + result = lm_ggml_new_tensor(ctx, LM_GGML_TYPE_F32, 3, ne); int32_t params[] = { op, k0, k1, s0, s1, p0, p1 }; lm_ggml_set_op_params(result, params, sizeof(params)); @@ -5548,7 +5779,6 @@ struct lm_ggml_tensor * lm_ggml_pool_2d( result->op = LM_GGML_OP_POOL_2D; result->grad = is_node ? 
lm_ggml_dup_tensor(ctx, result) : NULL; result->src[0] = a; - return result; } @@ -6411,7 +6641,7 @@ struct lm_ggml_tensor * lm_ggml_cross_entropy_loss_back( void lm_ggml_set_param( struct lm_ggml_context * ctx, struct lm_ggml_tensor * tensor) { - tensor->is_param = true; + tensor->flags |= LM_GGML_TENSOR_FLAG_PARAM; LM_GGML_ASSERT(tensor->grad == NULL); tensor->grad = lm_ggml_dup_tensor(ctx, tensor); @@ -6422,8 +6652,10 @@ void lm_ggml_set_param( static void lm_ggml_compute_forward_dup_same_cont( const struct lm_ggml_compute_params * params, - const struct lm_ggml_tensor * src0, struct lm_ggml_tensor * dst) { + + const struct lm_ggml_tensor * src0 = dst->src[0]; + LM_GGML_ASSERT(lm_ggml_nelements(dst) == lm_ggml_nelements(src0)); LM_GGML_ASSERT(lm_ggml_is_contiguous(dst) && lm_ggml_is_contiguous(src0)); LM_GGML_ASSERT(src0->type == dst->type); @@ -6454,8 +6686,10 @@ static void lm_ggml_compute_forward_dup_same_cont( } static void lm_ggml_compute_forward_dup_f16( const struct lm_ggml_compute_params * params, - const struct lm_ggml_tensor * src0, struct lm_ggml_tensor * dst) { + + const struct lm_ggml_tensor * src0 = dst->src[0]; + LM_GGML_ASSERT(lm_ggml_nelements(dst) == lm_ggml_nelements(src0)); if (params->type == LM_GGML_TASK_INIT || params->type == LM_GGML_TASK_FINALIZE) { @@ -6468,7 +6702,7 @@ static void lm_ggml_compute_forward_dup_f16( const int nth = params->nth; // number of threads if (lm_ggml_is_contiguous(src0) && lm_ggml_is_contiguous(dst) && src0->type == dst->type) { - lm_ggml_compute_forward_dup_same_cont(params, src0, dst); + lm_ggml_compute_forward_dup_same_cont(params, dst); return; } @@ -6725,8 +6959,10 @@ static void lm_ggml_compute_forward_dup_f16( static void lm_ggml_compute_forward_dup_f32( const struct lm_ggml_compute_params * params, - const struct lm_ggml_tensor * src0, struct lm_ggml_tensor * dst) { + + const struct lm_ggml_tensor * src0 = dst->src[0]; + LM_GGML_ASSERT(lm_ggml_nelements(dst) == lm_ggml_nelements(src0)); if (params->type == LM_GGML_TASK_INIT || params->type == LM_GGML_TASK_FINALIZE) { @@ -6739,7 +6975,7 @@ static void lm_ggml_compute_forward_dup_f32( const int nth = params->nth; // number of threads if (lm_ggml_is_contiguous(src0) && lm_ggml_is_contiguous(dst) && src0->type == dst->type) { - lm_ggml_compute_forward_dup_same_cont(params, src0, dst); + lm_ggml_compute_forward_dup_same_cont(params, dst); return; } @@ -6975,8 +7211,10 @@ static void lm_ggml_compute_forward_dup_f32( // A simplified version of lm_ggml_compute_forward_dup that doesn't do float upcasting, and just plain old memcpy. 
static void lm_ggml_compute_forward_dup_bytes( const struct lm_ggml_compute_params * params, - const struct lm_ggml_tensor * src0, struct lm_ggml_tensor * dst) { + + const struct lm_ggml_tensor * src0 = dst->src[0]; + LM_GGML_ASSERT(lm_ggml_nelements(dst) == lm_ggml_nelements(src0)); LM_GGML_ASSERT(src0->type == dst->type); @@ -6985,7 +7223,7 @@ static void lm_ggml_compute_forward_dup_bytes( } if (lm_ggml_is_contiguous(src0) && lm_ggml_is_contiguous(dst)) { - lm_ggml_compute_forward_dup_same_cont(params, src0, dst); + lm_ggml_compute_forward_dup_same_cont(params, dst); return; } @@ -7124,21 +7362,23 @@ static void lm_ggml_compute_forward_dup_bytes( static void lm_ggml_compute_forward_dup( const struct lm_ggml_compute_params * params, - const struct lm_ggml_tensor * src0, struct lm_ggml_tensor * dst) { + + const struct lm_ggml_tensor * src0 = dst->src[0]; + if (src0->type == dst->type) { - lm_ggml_compute_forward_dup_bytes(params, src0, dst); + lm_ggml_compute_forward_dup_bytes(params, dst); return; } switch (src0->type) { case LM_GGML_TYPE_F16: { - lm_ggml_compute_forward_dup_f16(params, src0, dst); + lm_ggml_compute_forward_dup_f16(params, dst); } break; case LM_GGML_TYPE_F32: { - lm_ggml_compute_forward_dup_f32(params, src0, dst); + lm_ggml_compute_forward_dup_f32(params, dst); } break; default: { @@ -7151,9 +7391,11 @@ static void lm_ggml_compute_forward_dup( static void lm_ggml_compute_forward_add_f32( const struct lm_ggml_compute_params * params, - const struct lm_ggml_tensor * src0, - const struct lm_ggml_tensor * src1, struct lm_ggml_tensor * dst) { + + const struct lm_ggml_tensor * src0 = dst->src[0]; + const struct lm_ggml_tensor * src1 = dst->src[1]; + LM_GGML_ASSERT(lm_ggml_can_repeat(src1, src0) && lm_ggml_are_same_shape(src0, dst)); if (params->type == LM_GGML_TASK_INIT || params->type == LM_GGML_TASK_FINALIZE) { @@ -7163,6 +7405,17 @@ static void lm_ggml_compute_forward_add_f32( const int ith = params->ith; const int nth = params->nth; +#ifdef LM_GGML_USE_CLBLAST + if (src1->backend == LM_GGML_BACKEND_GPU) { + // TODO: OpenCL kernel support full broadcast + LM_GGML_ASSERT(lm_ggml_can_repeat_rows(src1, src0)); + if (ith == 0) { + lm_ggml_cl_add(src0, src1, dst); + } + return; + } +#endif + const int nr = lm_ggml_nrows(src0); LM_GGML_TENSOR_BINARY_OP_LOCALS @@ -7228,9 +7481,11 @@ static void lm_ggml_compute_forward_add_f32( static void lm_ggml_compute_forward_add_f16_f32( const struct lm_ggml_compute_params * params, - const struct lm_ggml_tensor * src0, - const struct lm_ggml_tensor * src1, struct lm_ggml_tensor * dst) { + + const struct lm_ggml_tensor * src0 = dst->src[0]; + const struct lm_ggml_tensor * src1 = dst->src[1]; + LM_GGML_ASSERT(lm_ggml_are_same_shape(src0, src1) && lm_ggml_are_same_shape(src0, dst)); if (params->type == LM_GGML_TASK_INIT || params->type == LM_GGML_TASK_FINALIZE) { @@ -7305,9 +7560,11 @@ static void lm_ggml_compute_forward_add_f16_f32( static void lm_ggml_compute_forward_add_f16_f16( const struct lm_ggml_compute_params * params, - const struct lm_ggml_tensor * src0, - const struct lm_ggml_tensor * src1, struct lm_ggml_tensor * dst) { + + const struct lm_ggml_tensor * src0 = dst->src[0]; + const struct lm_ggml_tensor * src1 = dst->src[1]; + LM_GGML_ASSERT(lm_ggml_are_same_shape(src0, src1) && lm_ggml_are_same_shape(src0, dst)); if (params->type == LM_GGML_TASK_INIT || params->type == LM_GGML_TASK_FINALIZE) { @@ -7359,9 +7616,11 @@ static void lm_ggml_compute_forward_add_f16_f16( static void lm_ggml_compute_forward_add_q_f32( const struct 
lm_ggml_compute_params * params, - const struct lm_ggml_tensor * src0, - const struct lm_ggml_tensor * src1, struct lm_ggml_tensor * dst) { + + const struct lm_ggml_tensor * src0 = dst->src[0]; + const struct lm_ggml_tensor * src1 = dst->src[1]; + LM_GGML_ASSERT(lm_ggml_are_same_shape(src0, src1) && lm_ggml_are_same_shape(src0, dst)); if (params->type == LM_GGML_TASK_INIT || params->type == LM_GGML_TASK_FINALIZE) { @@ -7437,21 +7696,28 @@ static void lm_ggml_compute_forward_add_q_f32( static void lm_ggml_compute_forward_add( const struct lm_ggml_compute_params * params, - const struct lm_ggml_tensor * src0, - const struct lm_ggml_tensor * src1, struct lm_ggml_tensor * dst) { + + const struct lm_ggml_tensor * src0 = dst->src[0]; + const struct lm_ggml_tensor * src1 = dst->src[1]; + switch (src0->type) { case LM_GGML_TYPE_F32: { - lm_ggml_compute_forward_add_f32(params, src0, src1, dst); + if (src1->type == LM_GGML_TYPE_F32) { + lm_ggml_compute_forward_add_f32(params, dst); + } + else { + LM_GGML_ASSERT(false); + } } break; case LM_GGML_TYPE_F16: { if (src1->type == LM_GGML_TYPE_F16) { - lm_ggml_compute_forward_add_f16_f16(params, src0, src1, dst); + lm_ggml_compute_forward_add_f16_f16(params, dst); } else if (src1->type == LM_GGML_TYPE_F32) { - lm_ggml_compute_forward_add_f16_f32(params, src0, src1, dst); + lm_ggml_compute_forward_add_f16_f32(params, dst); } else { LM_GGML_ASSERT(false); @@ -7469,8 +7735,11 @@ static void lm_ggml_compute_forward_add( case LM_GGML_TYPE_Q6_K: case LM_GGML_TYPE_IQ2_XXS: case LM_GGML_TYPE_IQ2_XS: + case LM_GGML_TYPE_IQ3_XXS: + case LM_GGML_TYPE_IQ1_S: + case LM_GGML_TYPE_IQ4_NL: { - lm_ggml_compute_forward_add_q_f32(params, src0, src1, dst); + lm_ggml_compute_forward_add_q_f32(params, dst); } break; default: { @@ -7483,9 +7752,11 @@ static void lm_ggml_compute_forward_add( static void lm_ggml_compute_forward_add1_f32( const struct lm_ggml_compute_params * params, - const struct lm_ggml_tensor * src0, - const struct lm_ggml_tensor * src1, struct lm_ggml_tensor * dst) { + + const struct lm_ggml_tensor * src0 = dst->src[0]; + const struct lm_ggml_tensor * src1 = dst->src[1]; + LM_GGML_ASSERT(lm_ggml_are_same_shape(src0, dst)); LM_GGML_ASSERT(lm_ggml_is_scalar(src1)); @@ -7535,9 +7806,11 @@ static void lm_ggml_compute_forward_add1_f32( static void lm_ggml_compute_forward_add1_f16_f32( const struct lm_ggml_compute_params * params, - const struct lm_ggml_tensor * src0, - const struct lm_ggml_tensor * src1, struct lm_ggml_tensor * dst) { + + const struct lm_ggml_tensor * src0 = dst->src[0]; + const struct lm_ggml_tensor * src1 = dst->src[1]; + LM_GGML_ASSERT(lm_ggml_are_same_shape(src0, dst)); LM_GGML_ASSERT(lm_ggml_is_scalar(src1)); @@ -7585,9 +7858,11 @@ static void lm_ggml_compute_forward_add1_f16_f32( static void lm_ggml_compute_forward_add1_f16_f16( const struct lm_ggml_compute_params * params, - const struct lm_ggml_tensor * src0, - const struct lm_ggml_tensor * src1, struct lm_ggml_tensor * dst) { + + const struct lm_ggml_tensor * src0 = dst->src[0]; + const struct lm_ggml_tensor * src1 = dst->src[1]; + LM_GGML_ASSERT(lm_ggml_are_same_shape(src0, dst)); LM_GGML_ASSERT(lm_ggml_is_scalar(src1)); @@ -7635,9 +7910,11 @@ static void lm_ggml_compute_forward_add1_f16_f16( static void lm_ggml_compute_forward_add1_q_f32( const struct lm_ggml_compute_params * params, - const struct lm_ggml_tensor * src0, - const struct lm_ggml_tensor * src1, struct lm_ggml_tensor * dst) { + + const struct lm_ggml_tensor * src0 = dst->src[0]; + const struct lm_ggml_tensor * src1 = 
dst->src[1]; + LM_GGML_ASSERT(lm_ggml_are_same_shape(src0, dst)); LM_GGML_ASSERT(lm_ggml_is_scalar(src1)); @@ -7702,21 +7979,23 @@ static void lm_ggml_compute_forward_add1_q_f32( static void lm_ggml_compute_forward_add1( const struct lm_ggml_compute_params * params, - const struct lm_ggml_tensor * src0, - const struct lm_ggml_tensor * src1, struct lm_ggml_tensor * dst) { + + const struct lm_ggml_tensor * src0 = dst->src[0]; + const struct lm_ggml_tensor * src1 = dst->src[1]; + switch (src0->type) { case LM_GGML_TYPE_F32: { - lm_ggml_compute_forward_add1_f32(params, src0, src1, dst); + lm_ggml_compute_forward_add1_f32(params, dst); } break; case LM_GGML_TYPE_F16: { if (src1->type == LM_GGML_TYPE_F16) { - lm_ggml_compute_forward_add1_f16_f16(params, src0, src1, dst); + lm_ggml_compute_forward_add1_f16_f16(params, dst); } else if (src1->type == LM_GGML_TYPE_F32) { - lm_ggml_compute_forward_add1_f16_f32(params, src0, src1, dst); + lm_ggml_compute_forward_add1_f16_f32(params, dst); } else { LM_GGML_ASSERT(false); @@ -7735,8 +8014,11 @@ static void lm_ggml_compute_forward_add1( case LM_GGML_TYPE_Q6_K: case LM_GGML_TYPE_IQ2_XXS: case LM_GGML_TYPE_IQ2_XS: + case LM_GGML_TYPE_IQ3_XXS: + case LM_GGML_TYPE_IQ1_S: + case LM_GGML_TYPE_IQ4_NL: { - lm_ggml_compute_forward_add1_q_f32(params, src0, src1, dst); + lm_ggml_compute_forward_add1_q_f32(params, dst); } break; default: { @@ -7749,9 +8031,11 @@ static void lm_ggml_compute_forward_add1( static void lm_ggml_compute_forward_acc_f32( const struct lm_ggml_compute_params * params, - const struct lm_ggml_tensor * src0, - const struct lm_ggml_tensor * src1, struct lm_ggml_tensor * dst) { + + const struct lm_ggml_tensor * src0 = dst->src[0]; + const struct lm_ggml_tensor * src1 = dst->src[1]; + LM_GGML_ASSERT(lm_ggml_are_same_shape(src0, dst)); LM_GGML_ASSERT(lm_ggml_is_contiguous(dst) && lm_ggml_is_contiguous(src0)); @@ -7764,6 +8048,9 @@ static void lm_ggml_compute_forward_acc_f32( bool inplace = (bool) ((int32_t *) dst->op_params)[4]; if (!inplace && (params->type == LM_GGML_TASK_INIT)) { + if (params->ith != 0) { + return; + } // memcpy needs to be synchronized across threads to avoid race conditions. 
// => do it in INIT phase memcpy( @@ -7828,14 +8115,14 @@ static void lm_ggml_compute_forward_acc_f32( static void lm_ggml_compute_forward_acc( const struct lm_ggml_compute_params * params, - const struct lm_ggml_tensor * src0, - const struct lm_ggml_tensor * src1, struct lm_ggml_tensor * dst) { + const struct lm_ggml_tensor * src0 = dst->src[0]; + switch (src0->type) { case LM_GGML_TYPE_F32: { - lm_ggml_compute_forward_acc_f32(params, src0, src1, dst); + lm_ggml_compute_forward_acc_f32(params, dst); } break; case LM_GGML_TYPE_F16: case LM_GGML_TYPE_Q4_0: @@ -7851,6 +8138,9 @@ static void lm_ggml_compute_forward_acc( case LM_GGML_TYPE_Q6_K: case LM_GGML_TYPE_IQ2_XXS: case LM_GGML_TYPE_IQ2_XS: + case LM_GGML_TYPE_IQ3_XXS: + case LM_GGML_TYPE_IQ1_S: + case LM_GGML_TYPE_IQ4_NL: default: { LM_GGML_ASSERT(false); @@ -7862,9 +8152,11 @@ static void lm_ggml_compute_forward_acc( static void lm_ggml_compute_forward_sub_f32( const struct lm_ggml_compute_params * params, - const struct lm_ggml_tensor * src0, - const struct lm_ggml_tensor * src1, struct lm_ggml_tensor * dst) { + + const struct lm_ggml_tensor * src0 = dst->src[0]; + const struct lm_ggml_tensor * src1 = dst->src[1]; + assert(params->ith == 0); assert(lm_ggml_are_same_shape(src0, src1) && lm_ggml_are_same_shape(src0, dst)); @@ -7922,13 +8214,14 @@ static void lm_ggml_compute_forward_sub_f32( static void lm_ggml_compute_forward_sub( const struct lm_ggml_compute_params * params, - const struct lm_ggml_tensor * src0, - const struct lm_ggml_tensor * src1, struct lm_ggml_tensor * dst) { + + const struct lm_ggml_tensor * src0 = dst->src[0]; + switch (src0->type) { case LM_GGML_TYPE_F32: { - lm_ggml_compute_forward_sub_f32(params, src0, src1, dst); + lm_ggml_compute_forward_sub_f32(params, dst); } break; default: { @@ -7941,9 +8234,11 @@ static void lm_ggml_compute_forward_sub( static void lm_ggml_compute_forward_mul_f32( const struct lm_ggml_compute_params * params, - const struct lm_ggml_tensor * src0, - const struct lm_ggml_tensor * src1, struct lm_ggml_tensor * dst) { + + const struct lm_ggml_tensor * src0 = dst->src[0]; + const struct lm_ggml_tensor * src1 = dst->src[1]; + LM_GGML_ASSERT(lm_ggml_can_repeat(src1, src0) && lm_ggml_are_same_shape(src0, dst)); if (params->type == LM_GGML_TASK_INIT || params->type == LM_GGML_TASK_FINALIZE) { @@ -7952,7 +8247,7 @@ static void lm_ggml_compute_forward_mul_f32( const int ith = params->ith; const int nth = params->nth; -#ifdef LM_GGML_USE_CLBLAST +#if defined(LM_GGML_USE_CLBLAST) if (src1->backend == LM_GGML_BACKEND_GPU) { // TODO: OpenCL kernel support full broadcast LM_GGML_ASSERT(lm_ggml_can_repeat_rows(src1, src0)); @@ -8024,15 +8319,17 @@ static void lm_ggml_compute_forward_mul_f32( static void lm_ggml_compute_forward_mul( const struct lm_ggml_compute_params * params, - const struct lm_ggml_tensor * src0, - const struct lm_ggml_tensor * src1, struct lm_ggml_tensor * dst) { + + const struct lm_ggml_tensor * src0 = dst->src[0]; + const struct lm_ggml_tensor * src1 = dst->src[1]; + LM_GGML_ASSERT(src1->type == LM_GGML_TYPE_F32 && "only f32 src1 supported for now"); switch (src0->type) { case LM_GGML_TYPE_F32: { - lm_ggml_compute_forward_mul_f32(params, src0, src1, dst); + lm_ggml_compute_forward_mul_f32(params, dst); } break; default: { @@ -8045,9 +8342,11 @@ static void lm_ggml_compute_forward_mul( static void lm_ggml_compute_forward_div_f32( const struct lm_ggml_compute_params * params, - const struct lm_ggml_tensor * src0, - const struct lm_ggml_tensor * src1, struct lm_ggml_tensor * dst) { + + 
const struct lm_ggml_tensor * src0 = dst->src[0]; + const struct lm_ggml_tensor * src1 = dst->src[1]; + LM_GGML_ASSERT(lm_ggml_can_repeat(src1, src0) && lm_ggml_are_same_shape(src0, dst)); if (params->type == LM_GGML_TASK_INIT || params->type == LM_GGML_TASK_FINALIZE) { @@ -8118,13 +8417,14 @@ static void lm_ggml_compute_forward_div_f32( static void lm_ggml_compute_forward_div( const struct lm_ggml_compute_params * params, - const struct lm_ggml_tensor * src0, - const struct lm_ggml_tensor * src1, struct lm_ggml_tensor * dst) { + + const struct lm_ggml_tensor * src0 = dst->src[0]; + switch (src0->type) { case LM_GGML_TYPE_F32: { - lm_ggml_compute_forward_div_f32(params, src0, src1, dst); + lm_ggml_compute_forward_div_f32(params, dst); } break; default: { @@ -8137,8 +8437,10 @@ static void lm_ggml_compute_forward_div( static void lm_ggml_compute_forward_sqr_f32( const struct lm_ggml_compute_params * params, - const struct lm_ggml_tensor * src0, struct lm_ggml_tensor * dst) { + + const struct lm_ggml_tensor * src0 = dst->src[0]; + assert(params->ith == 0); assert(lm_ggml_are_same_shape(src0, dst)); @@ -8161,12 +8463,14 @@ static void lm_ggml_compute_forward_sqr_f32( static void lm_ggml_compute_forward_sqr( const struct lm_ggml_compute_params * params, - const struct lm_ggml_tensor * src0, struct lm_ggml_tensor * dst) { + + const struct lm_ggml_tensor * src0 = dst->src[0]; + switch (src0->type) { case LM_GGML_TYPE_F32: { - lm_ggml_compute_forward_sqr_f32(params, src0, dst); + lm_ggml_compute_forward_sqr_f32(params, dst); } break; default: { @@ -8179,8 +8483,10 @@ static void lm_ggml_compute_forward_sqr( static void lm_ggml_compute_forward_sqrt_f32( const struct lm_ggml_compute_params * params, - const struct lm_ggml_tensor * src0, struct lm_ggml_tensor * dst) { + + const struct lm_ggml_tensor * src0 = dst->src[0]; + assert(params->ith == 0); assert(lm_ggml_are_same_shape(src0, dst)); @@ -8203,12 +8509,14 @@ static void lm_ggml_compute_forward_sqrt_f32( static void lm_ggml_compute_forward_sqrt( const struct lm_ggml_compute_params * params, - const struct lm_ggml_tensor * src0, struct lm_ggml_tensor * dst) { + + const struct lm_ggml_tensor * src0 = dst->src[0]; + switch (src0->type) { case LM_GGML_TYPE_F32: { - lm_ggml_compute_forward_sqrt_f32(params, src0, dst); + lm_ggml_compute_forward_sqrt_f32(params, dst); } break; default: { @@ -8221,8 +8529,10 @@ static void lm_ggml_compute_forward_sqrt( static void lm_ggml_compute_forward_log_f32( const struct lm_ggml_compute_params * params, - const struct lm_ggml_tensor * src0, struct lm_ggml_tensor * dst) { + + const struct lm_ggml_tensor * src0 = dst->src[0]; + LM_GGML_ASSERT(params->ith == 0); LM_GGML_ASSERT(lm_ggml_are_same_shape(src0, dst)); @@ -8245,12 +8555,14 @@ static void lm_ggml_compute_forward_log_f32( static void lm_ggml_compute_forward_log( const struct lm_ggml_compute_params * params, - const struct lm_ggml_tensor * src0, struct lm_ggml_tensor * dst) { + + const struct lm_ggml_tensor * src0 = dst->src[0]; + switch (src0->type) { case LM_GGML_TYPE_F32: { - lm_ggml_compute_forward_log_f32(params, src0, dst); + lm_ggml_compute_forward_log_f32(params, dst); } break; default: { @@ -8263,8 +8575,10 @@ static void lm_ggml_compute_forward_log( static void lm_ggml_compute_forward_sum_f32( const struct lm_ggml_compute_params * params, - const struct lm_ggml_tensor * src0, struct lm_ggml_tensor * dst) { + + const struct lm_ggml_tensor * src0 = dst->src[0]; + assert(params->ith == 0); assert(lm_ggml_is_scalar(dst)); @@ -8296,8 +8610,10 @@ static 
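The mul and div kernels above assert lm_ggml_can_repeat(src1, src0), i.e. src1 may be broadcast across src0. A rough sketch of that broadcast rule, assuming the usual ggml convention that each of the four dimensions of the smaller tensor must divide the corresponding dimension of the larger one:

    #include <stdbool.h>
    #include <stdint.h>

    /* sketch of the broadcast check assumed by the mul/div asserts above */
    static bool can_repeat(const int64_t small_ne[4], const int64_t big_ne[4]) {
        for (int d = 0; d < 4; d++) {
            if (small_ne[d] == 0 || big_ne[d] % small_ne[d] != 0) {
                return false;
            }
        }
        return true;
    }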
void lm_ggml_compute_forward_sum_f32( static void lm_ggml_compute_forward_sum_f16( const struct lm_ggml_compute_params * params, - const struct lm_ggml_tensor * src0, struct lm_ggml_tensor * dst) { + + const struct lm_ggml_tensor * src0 = dst->src[0]; + assert(params->ith == 0); assert(lm_ggml_is_scalar(dst)); @@ -8328,16 +8644,18 @@ static void lm_ggml_compute_forward_sum_f16( static void lm_ggml_compute_forward_sum( const struct lm_ggml_compute_params * params, - const struct lm_ggml_tensor * src0, struct lm_ggml_tensor * dst) { + + const struct lm_ggml_tensor * src0 = dst->src[0]; + switch (src0->type) { case LM_GGML_TYPE_F32: { - lm_ggml_compute_forward_sum_f32(params, src0, dst); + lm_ggml_compute_forward_sum_f32(params, dst); } break; case LM_GGML_TYPE_F16: { - lm_ggml_compute_forward_sum_f16(params, src0, dst); + lm_ggml_compute_forward_sum_f16(params, dst); } break; default: { @@ -8350,8 +8668,10 @@ static void lm_ggml_compute_forward_sum( static void lm_ggml_compute_forward_sum_rows_f32( const struct lm_ggml_compute_params * params, - const struct lm_ggml_tensor * src0, struct lm_ggml_tensor * dst) { + + const struct lm_ggml_tensor * src0 = dst->src[0]; + LM_GGML_ASSERT(params->ith == 0); if (params->type == LM_GGML_TASK_INIT || params->type == LM_GGML_TASK_FINALIZE) { @@ -8383,12 +8703,14 @@ static void lm_ggml_compute_forward_sum_rows_f32( static void lm_ggml_compute_forward_sum_rows( const struct lm_ggml_compute_params * params, - const struct lm_ggml_tensor * src0, struct lm_ggml_tensor * dst) { + + const struct lm_ggml_tensor * src0 = dst->src[0]; + switch (src0->type) { case LM_GGML_TYPE_F32: { - lm_ggml_compute_forward_sum_rows_f32(params, src0, dst); + lm_ggml_compute_forward_sum_rows_f32(params, dst); } break; default: { @@ -8401,8 +8723,10 @@ static void lm_ggml_compute_forward_sum_rows( static void lm_ggml_compute_forward_mean_f32( const struct lm_ggml_compute_params * params, - const struct lm_ggml_tensor * src0, struct lm_ggml_tensor * dst) { + + const struct lm_ggml_tensor * src0 = dst->src[0]; + assert(params->ith == 0); if (params->type == LM_GGML_TASK_INIT || params->type == LM_GGML_TASK_FINALIZE) { @@ -8438,12 +8762,14 @@ static void lm_ggml_compute_forward_mean_f32( static void lm_ggml_compute_forward_mean( const struct lm_ggml_compute_params * params, - const struct lm_ggml_tensor * src0, struct lm_ggml_tensor * dst) { + + const struct lm_ggml_tensor * src0 = dst->src[0]; + switch (src0->type) { case LM_GGML_TYPE_F32: { - lm_ggml_compute_forward_mean_f32(params, src0, dst); + lm_ggml_compute_forward_mean_f32(params, dst); } break; default: { @@ -8456,8 +8782,10 @@ static void lm_ggml_compute_forward_mean( static void lm_ggml_compute_forward_argmax_f32( const struct lm_ggml_compute_params * params, - const struct lm_ggml_tensor * src0, struct lm_ggml_tensor * dst) { + + const struct lm_ggml_tensor * src0 = dst->src[0]; + assert(params->ith == 0); if (params->type == LM_GGML_TASK_INIT || params->type == LM_GGML_TASK_FINALIZE) { @@ -8484,12 +8812,14 @@ static void lm_ggml_compute_forward_argmax_f32( static void lm_ggml_compute_forward_argmax( const struct lm_ggml_compute_params * params, - const struct lm_ggml_tensor * src0, struct lm_ggml_tensor * dst) { + + const struct lm_ggml_tensor * src0 = dst->src[0]; + switch (src0->type) { case LM_GGML_TYPE_F32: { - lm_ggml_compute_forward_argmax_f32(params, src0, dst); + lm_ggml_compute_forward_argmax_f32(params, dst); } break; default: { @@ -8502,8 +8832,10 @@ static void lm_ggml_compute_forward_argmax( static void 
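sum_rows and mean above reduce the innermost dimension row by row (mean just divides by the row length afterwards). A contiguous-layout sketch; the real kernels walk the nb[] byte strides instead of assuming row-major floats:

    /* sketch: a [nc x nr] input becomes a [1 x nr] output */
    static void sum_rows_f32_ref(const float * src, float * dst, int nr, int nc) {
        for (int r = 0; r < nr; r++) {
            float row_sum = 0.0f;
            for (int c = 0; c < nc; c++) {
                row_sum += src[r*nc + c];
            }
            dst[r] = row_sum;   /* mean: dst[r] = row_sum / nc */
        }
    }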
lm_ggml_compute_forward_repeat_f32( const struct lm_ggml_compute_params * params, - const struct lm_ggml_tensor * src0, struct lm_ggml_tensor * dst) { + + const struct lm_ggml_tensor * src0 = dst->src[0]; + LM_GGML_ASSERT(params->ith == 0); LM_GGML_ASSERT(lm_ggml_can_repeat(src0, dst)); @@ -8545,8 +8877,10 @@ static void lm_ggml_compute_forward_repeat_f32( static void lm_ggml_compute_forward_repeat_f16( const struct lm_ggml_compute_params * params, - const struct lm_ggml_tensor * src0, struct lm_ggml_tensor * dst) { + + const struct lm_ggml_tensor * src0 = dst->src[0]; + LM_GGML_ASSERT(params->ith == 0); LM_GGML_ASSERT(lm_ggml_can_repeat(src0, dst)); @@ -8591,18 +8925,20 @@ static void lm_ggml_compute_forward_repeat_f16( static void lm_ggml_compute_forward_repeat( const struct lm_ggml_compute_params * params, - const struct lm_ggml_tensor * src0, struct lm_ggml_tensor * dst) { + + const struct lm_ggml_tensor * src0 = dst->src[0]; + switch (src0->type) { case LM_GGML_TYPE_F16: case LM_GGML_TYPE_I16: { - lm_ggml_compute_forward_repeat_f16(params, src0, dst); + lm_ggml_compute_forward_repeat_f16(params, dst); } break; case LM_GGML_TYPE_F32: case LM_GGML_TYPE_I32: { - lm_ggml_compute_forward_repeat_f32(params, src0, dst); + lm_ggml_compute_forward_repeat_f32(params, dst); } break; default: { @@ -8615,8 +8951,10 @@ static void lm_ggml_compute_forward_repeat( static void lm_ggml_compute_forward_repeat_back_f32( const struct lm_ggml_compute_params * params, - const struct lm_ggml_tensor * src0, struct lm_ggml_tensor * dst) { + + const struct lm_ggml_tensor * src0 = dst->src[0]; + LM_GGML_ASSERT(params->ith == 0); LM_GGML_ASSERT(lm_ggml_can_repeat(dst, src0)); @@ -8672,12 +9010,14 @@ static void lm_ggml_compute_forward_repeat_back_f32( static void lm_ggml_compute_forward_repeat_back( const struct lm_ggml_compute_params * params, - const struct lm_ggml_tensor * src0, struct lm_ggml_tensor * dst) { + + const struct lm_ggml_tensor * src0 = dst->src[0]; + switch (src0->type) { case LM_GGML_TYPE_F32: { - lm_ggml_compute_forward_repeat_back_f32(params, src0, dst); + lm_ggml_compute_forward_repeat_back_f32(params, dst); } break; default: { @@ -8690,10 +9030,11 @@ static void lm_ggml_compute_forward_repeat_back( static void lm_ggml_compute_forward_concat_f32( const struct lm_ggml_compute_params * params, - const struct lm_ggml_tensor * src0, - const struct lm_ggml_tensor * src1, struct lm_ggml_tensor * dst) { + const struct lm_ggml_tensor * src0 = dst->src[0]; + const struct lm_ggml_tensor * src1 = dst->src[1]; + if (params->type == LM_GGML_TASK_INIT || params->type == LM_GGML_TASK_FINALIZE) { return; } @@ -8738,14 +9079,15 @@ static void lm_ggml_compute_forward_concat_f32( static void lm_ggml_compute_forward_concat( const struct lm_ggml_compute_params* params, - const struct lm_ggml_tensor* src0, - const struct lm_ggml_tensor* src1, struct lm_ggml_tensor* dst) { + + const struct lm_ggml_tensor * src0 = dst->src[0]; + switch (src0->type) { case LM_GGML_TYPE_F32: case LM_GGML_TYPE_I32: { - lm_ggml_compute_forward_concat_f32(params, src0, src1, dst); + lm_ggml_compute_forward_concat_f32(params, dst); } break; default: { @@ -8758,8 +9100,10 @@ static void lm_ggml_compute_forward_concat( static void lm_ggml_compute_forward_abs_f32( const struct lm_ggml_compute_params * params, - const struct lm_ggml_tensor * src0, struct lm_ggml_tensor * dst) { + + const struct lm_ggml_tensor * src0 = dst->src[0]; + assert(params->ith == 0); assert(lm_ggml_are_same_shape(src0, dst)); @@ -8782,12 +9126,14 @@ static void 
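repeat tiles src0 into a larger dst (lm_ggml_can_repeat(src0, dst)), and repeat_back is its adjoint, summing the tiles back into the smaller tensor. A one-dimensional sketch of the pair:

    /* sketch: forward tiling and its adjoint, 1-D case */
    static void repeat_1d(const float * src, int n_src, float * dst, int n_dst) {
        for (int i = 0; i < n_dst; i++) {
            dst[i] = src[i % n_src];          /* n_dst is a multiple of n_src */
        }
    }

    static void repeat_back_1d(const float * grad, int n_grad, float * acc, int n_acc) {
        for (int i = 0; i < n_acc; i++) {
            acc[i] = 0.0f;
        }
        for (int i = 0; i < n_grad; i++) {
            acc[i % n_acc] += grad[i];        /* sum every tile back */
        }
    }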
lm_ggml_compute_forward_abs_f32( static void lm_ggml_compute_forward_abs( const struct lm_ggml_compute_params * params, - const struct lm_ggml_tensor * src0, struct lm_ggml_tensor * dst) { - switch (src0->type) { - case LM_GGML_TYPE_F32: + + const struct lm_ggml_tensor * src0 = dst->src[0]; + + switch (src0->type) { + case LM_GGML_TYPE_F32: { - lm_ggml_compute_forward_abs_f32(params, src0, dst); + lm_ggml_compute_forward_abs_f32(params, dst); } break; default: { @@ -8800,8 +9146,10 @@ static void lm_ggml_compute_forward_abs( static void lm_ggml_compute_forward_sgn_f32( const struct lm_ggml_compute_params * params, - const struct lm_ggml_tensor * src0, struct lm_ggml_tensor * dst) { + + const struct lm_ggml_tensor * src0 = dst->src[0]; + assert(params->ith == 0); assert(lm_ggml_are_same_shape(src0, dst)); @@ -8824,12 +9172,14 @@ static void lm_ggml_compute_forward_sgn_f32( static void lm_ggml_compute_forward_sgn( const struct lm_ggml_compute_params * params, - const struct lm_ggml_tensor * src0, struct lm_ggml_tensor * dst) { + + const struct lm_ggml_tensor * src0 = dst->src[0]; + switch (src0->type) { case LM_GGML_TYPE_F32: { - lm_ggml_compute_forward_sgn_f32(params, src0, dst); + lm_ggml_compute_forward_sgn_f32(params, dst); } break; default: { @@ -8842,8 +9192,10 @@ static void lm_ggml_compute_forward_sgn( static void lm_ggml_compute_forward_neg_f32( const struct lm_ggml_compute_params * params, - const struct lm_ggml_tensor * src0, struct lm_ggml_tensor * dst) { + + const struct lm_ggml_tensor * src0 = dst->src[0]; + assert(params->ith == 0); assert(lm_ggml_are_same_shape(src0, dst)); @@ -8866,12 +9218,14 @@ static void lm_ggml_compute_forward_neg_f32( static void lm_ggml_compute_forward_neg( const struct lm_ggml_compute_params * params, - const struct lm_ggml_tensor * src0, struct lm_ggml_tensor * dst) { + + const struct lm_ggml_tensor * src0 = dst->src[0]; + switch (src0->type) { case LM_GGML_TYPE_F32: { - lm_ggml_compute_forward_neg_f32(params, src0, dst); + lm_ggml_compute_forward_neg_f32(params, dst); } break; default: { @@ -8884,8 +9238,10 @@ static void lm_ggml_compute_forward_neg( static void lm_ggml_compute_forward_step_f32( const struct lm_ggml_compute_params * params, - const struct lm_ggml_tensor * src0, struct lm_ggml_tensor * dst) { + + const struct lm_ggml_tensor * src0 = dst->src[0]; + assert(params->ith == 0); assert(lm_ggml_are_same_shape(src0, dst)); @@ -8908,12 +9264,14 @@ static void lm_ggml_compute_forward_step_f32( static void lm_ggml_compute_forward_step( const struct lm_ggml_compute_params * params, - const struct lm_ggml_tensor * src0, struct lm_ggml_tensor * dst) { + + const struct lm_ggml_tensor * src0 = dst->src[0]; + switch (src0->type) { case LM_GGML_TYPE_F32: { - lm_ggml_compute_forward_step_f32(params, src0, dst); + lm_ggml_compute_forward_step_f32(params, dst); } break; default: { @@ -8926,8 +9284,10 @@ static void lm_ggml_compute_forward_step( static void lm_ggml_compute_forward_tanh_f32( const struct lm_ggml_compute_params * params, - const struct lm_ggml_tensor * src0, struct lm_ggml_tensor * dst) { + + const struct lm_ggml_tensor * src0 = dst->src[0]; + assert(params->ith == 0); assert(lm_ggml_are_same_shape(src0, dst)); @@ -8950,12 +9310,14 @@ static void lm_ggml_compute_forward_tanh_f32( static void lm_ggml_compute_forward_tanh( const struct lm_ggml_compute_params * params, - const struct lm_ggml_tensor * src0, struct lm_ggml_tensor * dst) { + + const struct lm_ggml_tensor * src0 = dst->src[0]; + switch (src0->type) { case LM_GGML_TYPE_F32: { - 
lm_ggml_compute_forward_tanh_f32(params, src0, dst); + lm_ggml_compute_forward_tanh_f32(params, dst); } break; default: { @@ -8968,8 +9330,10 @@ static void lm_ggml_compute_forward_tanh( static void lm_ggml_compute_forward_elu_f32( const struct lm_ggml_compute_params * params, - const struct lm_ggml_tensor * src0, struct lm_ggml_tensor * dst) { + + const struct lm_ggml_tensor * src0 = dst->src[0]; + assert(params->ith == 0); assert(lm_ggml_are_same_shape(src0, dst)); @@ -8992,12 +9356,14 @@ static void lm_ggml_compute_forward_elu_f32( static void lm_ggml_compute_forward_elu( const struct lm_ggml_compute_params * params, - const struct lm_ggml_tensor * src0, struct lm_ggml_tensor * dst) { + + const struct lm_ggml_tensor * src0 = dst->src[0]; + switch (src0->type) { case LM_GGML_TYPE_F32: { - lm_ggml_compute_forward_elu_f32(params, src0, dst); + lm_ggml_compute_forward_elu_f32(params, dst); } break; default: { @@ -9010,8 +9376,10 @@ static void lm_ggml_compute_forward_elu( static void lm_ggml_compute_forward_relu_f32( const struct lm_ggml_compute_params * params, - const struct lm_ggml_tensor * src0, struct lm_ggml_tensor * dst) { + + const struct lm_ggml_tensor * src0 = dst->src[0]; + assert(params->ith == 0); assert(lm_ggml_are_same_shape(src0, dst)); @@ -9034,12 +9402,14 @@ static void lm_ggml_compute_forward_relu_f32( static void lm_ggml_compute_forward_relu( const struct lm_ggml_compute_params * params, - const struct lm_ggml_tensor * src0, struct lm_ggml_tensor * dst) { + + const struct lm_ggml_tensor * src0 = dst->src[0]; + switch (src0->type) { case LM_GGML_TYPE_F32: { - lm_ggml_compute_forward_relu_f32(params, src0, dst); + lm_ggml_compute_forward_relu_f32(params, dst); } break; default: { @@ -9052,8 +9422,10 @@ static void lm_ggml_compute_forward_relu( static void lm_ggml_compute_forward_gelu_f32( const struct lm_ggml_compute_params * params, - const struct lm_ggml_tensor * src0, struct lm_ggml_tensor * dst) { + + const struct lm_ggml_tensor * src0 = dst->src[0]; + LM_GGML_ASSERT(lm_ggml_is_contiguous_except_dim_1(src0)); LM_GGML_ASSERT(lm_ggml_is_contiguous_except_dim_1(dst)); LM_GGML_ASSERT(lm_ggml_are_same_shape(src0, dst)); @@ -9093,12 +9465,14 @@ static void lm_ggml_compute_forward_gelu_f32( static void lm_ggml_compute_forward_gelu( const struct lm_ggml_compute_params * params, - const struct lm_ggml_tensor * src0, struct lm_ggml_tensor * dst) { + + const struct lm_ggml_tensor * src0 = dst->src[0]; + switch (src0->type) { case LM_GGML_TYPE_F32: { - lm_ggml_compute_forward_gelu_f32(params, src0, dst); + lm_ggml_compute_forward_gelu_f32(params, dst); } break; default: { @@ -9111,8 +9485,10 @@ static void lm_ggml_compute_forward_gelu( static void lm_ggml_compute_forward_gelu_quick_f32( const struct lm_ggml_compute_params * params, - const struct lm_ggml_tensor * src0, struct lm_ggml_tensor * dst) { + + const struct lm_ggml_tensor * src0 = dst->src[0]; + LM_GGML_ASSERT(lm_ggml_is_contiguous_except_dim_1(src0)); LM_GGML_ASSERT(lm_ggml_is_contiguous_except_dim_1(dst)); LM_GGML_ASSERT(lm_ggml_are_same_shape(src0, dst)); @@ -9152,12 +9528,14 @@ static void lm_ggml_compute_forward_gelu_quick_f32( static void lm_ggml_compute_forward_gelu_quick( const struct lm_ggml_compute_params * params, - const struct lm_ggml_tensor * src0, struct lm_ggml_tensor * dst) { + + const struct lm_ggml_tensor * src0 = dst->src[0]; + switch (src0->type) { case LM_GGML_TYPE_F32: { - lm_ggml_compute_forward_gelu_quick_f32(params, src0, dst); + lm_ggml_compute_forward_gelu_quick_f32(params, dst); } break; 
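The activation kernels here (gelu and gelu_quick above, silu just below; their bodies are mostly elided by the hunk context) apply a scalar function row by row, with rows split across the nth worker threads. A sketch under those assumptions, using silu(x) = x * sigmoid(x):

    #include <math.h>

    static float silu_ref(float x) {
        return x / (1.0f + expf(-x));   /* x * sigmoid(x) */
    }

    /* thread ith of nth handles its own contiguous block of rows */
    static void silu_rows(const float * src, float * dst, int nr, int nc,
                          int ith, int nth) {
        const int dr  = (nr + nth - 1)/nth;
        const int ir0 = dr*ith;
        const int ir1 = ir0 + dr < nr ? ir0 + dr : nr;
        for (int r = ir0; r < ir1; r++) {
            for (int c = 0; c < nc; c++) {
                dst[r*nc + c] = silu_ref(src[r*nc + c]);
            }
        }
    }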
default: { @@ -9170,8 +9548,10 @@ static void lm_ggml_compute_forward_gelu_quick( static void lm_ggml_compute_forward_silu_f32( const struct lm_ggml_compute_params * params, - const struct lm_ggml_tensor * src0, struct lm_ggml_tensor * dst) { + + const struct lm_ggml_tensor * src0 = dst->src[0]; + LM_GGML_ASSERT(lm_ggml_is_contiguous_except_dim_1(src0)); LM_GGML_ASSERT(lm_ggml_is_contiguous_except_dim_1(dst)); LM_GGML_ASSERT(lm_ggml_are_same_shape(src0, dst)); @@ -9211,12 +9591,14 @@ static void lm_ggml_compute_forward_silu_f32( static void lm_ggml_compute_forward_silu( const struct lm_ggml_compute_params * params, - const struct lm_ggml_tensor * src0, struct lm_ggml_tensor * dst) { + + const struct lm_ggml_tensor * src0 = dst->src[0]; + switch (src0->type) { case LM_GGML_TYPE_F32: { - lm_ggml_compute_forward_silu_f32(params, src0, dst); + lm_ggml_compute_forward_silu_f32(params, dst); } break; default: { @@ -9228,8 +9610,10 @@ static void lm_ggml_compute_forward_silu( static void lm_ggml_compute_forward_leaky_relu_f32( const struct lm_ggml_compute_params * params, - const struct lm_ggml_tensor * src0, struct lm_ggml_tensor * dst) { + + const struct lm_ggml_tensor * src0 = dst->src[0]; + assert(params->ith == 0); assert(lm_ggml_are_same_shape(src0, dst)); @@ -9255,12 +9639,14 @@ static void lm_ggml_compute_forward_leaky_relu_f32( static void lm_ggml_compute_forward_leaky_relu( const struct lm_ggml_compute_params * params, - const struct lm_ggml_tensor * src0, struct lm_ggml_tensor * dst) { + + const struct lm_ggml_tensor * src0 = dst->src[0]; + switch (src0->type) { case LM_GGML_TYPE_F32: { - lm_ggml_compute_forward_leaky_relu_f32(params, src0, dst); + lm_ggml_compute_forward_leaky_relu_f32(params, dst); } break; default: { @@ -9273,9 +9659,11 @@ static void lm_ggml_compute_forward_leaky_relu( static void lm_ggml_compute_forward_silu_back_f32( const struct lm_ggml_compute_params * params, - const struct lm_ggml_tensor * src0, - const struct lm_ggml_tensor * grad, struct lm_ggml_tensor * dst) { + + const struct lm_ggml_tensor * src0 = dst->src[0]; + const struct lm_ggml_tensor * grad = dst->src[1]; + LM_GGML_ASSERT(lm_ggml_is_contiguous_except_dim_1(grad)); LM_GGML_ASSERT(lm_ggml_is_contiguous_except_dim_1(src0)); LM_GGML_ASSERT(lm_ggml_is_contiguous_except_dim_1(dst)); @@ -9318,13 +9706,102 @@ static void lm_ggml_compute_forward_silu_back_f32( static void lm_ggml_compute_forward_silu_back( const struct lm_ggml_compute_params * params, - const struct lm_ggml_tensor * src0, - const struct lm_ggml_tensor * grad, struct lm_ggml_tensor * dst) { + + const struct lm_ggml_tensor * src0 = dst->src[0]; + + switch (src0->type) { + case LM_GGML_TYPE_F32: + { + lm_ggml_compute_forward_silu_back_f32(params, dst); + } break; + default: + { + LM_GGML_ASSERT(false); + } break; + } +} + + +static void lm_ggml_compute_forward_hardswish_f32( + const struct lm_ggml_compute_params * params, + struct lm_ggml_tensor * dst) { + + const struct lm_ggml_tensor * src0 = dst->src[0]; + + assert(params->ith == 0); + assert(lm_ggml_are_same_shape(src0, dst)); + + if (params->type == LM_GGML_TASK_INIT || params->type == LM_GGML_TASK_FINALIZE) { + return; + } + + const int n = lm_ggml_nrows(src0); + const int nc = src0->ne[0]; + + assert(dst->nb[0] == sizeof(float)); + assert(src0->nb[0] == sizeof(float)); + + for (int i = 0; i < n; i++) { + lm_ggml_vec_hardswish_f32(nc, + (float *) ((char *) dst->data + i*( dst->nb[1])), + (float *) ((char *) src0->data + i*(src0->nb[1]))); + } +} +static void 
lm_ggml_compute_forward_hardswish( + const struct lm_ggml_compute_params * params, + struct lm_ggml_tensor * dst) { + + const struct lm_ggml_tensor * src0 = dst->src[0]; + + switch (src0->type) { + case LM_GGML_TYPE_F32: + { + lm_ggml_compute_forward_hardswish_f32(params, dst); + } break; + default: + { + LM_GGML_ASSERT(false); + } break; + } +} + +static void lm_ggml_compute_forward_hardsigmoid_f32( + const struct lm_ggml_compute_params * params, + struct lm_ggml_tensor * dst) { + + const struct lm_ggml_tensor * src0 = dst->src[0]; + + assert(params->ith == 0); + assert(lm_ggml_are_same_shape(src0, dst)); + + if (params->type == LM_GGML_TASK_INIT || params->type == LM_GGML_TASK_FINALIZE) { + return; + } + + const int n = lm_ggml_nrows(src0); + const int nc = src0->ne[0]; + + assert(dst->nb[0] == sizeof(float)); + assert(src0->nb[0] == sizeof(float)); + + for (int i = 0; i < n; i++) { + lm_ggml_vec_hardsigmoid_f32(nc, + (float *) ((char *) dst->data + i*( dst->nb[1])), + (float *) ((char *) src0->data + i*(src0->nb[1]))); + } +} + +static void lm_ggml_compute_forward_hardsigmoid( + const struct lm_ggml_compute_params * params, + struct lm_ggml_tensor * dst) { + + const struct lm_ggml_tensor * src0 = dst->src[0]; + switch (src0->type) { case LM_GGML_TYPE_F32: { - lm_ggml_compute_forward_silu_back_f32(params, src0, grad, dst); + lm_ggml_compute_forward_hardsigmoid_f32(params, dst); } break; default: { @@ -9333,12 +9810,15 @@ static void lm_ggml_compute_forward_silu_back( } } + // lm_ggml_compute_forward_norm static void lm_ggml_compute_forward_norm_f32( const struct lm_ggml_compute_params * params, - const struct lm_ggml_tensor * src0, struct lm_ggml_tensor * dst) { + + const struct lm_ggml_tensor * src0 = dst->src[0]; + LM_GGML_ASSERT(lm_ggml_are_same_shape(src0, dst)); if (params->type == LM_GGML_TASK_INIT || params->type == LM_GGML_TASK_FINALIZE) { @@ -9390,12 +9870,14 @@ static void lm_ggml_compute_forward_norm_f32( static void lm_ggml_compute_forward_norm( const struct lm_ggml_compute_params * params, - const struct lm_ggml_tensor * src0, struct lm_ggml_tensor * dst) { + + const struct lm_ggml_tensor * src0 = dst->src[0]; + switch (src0->type) { case LM_GGML_TYPE_F32: { - lm_ggml_compute_forward_norm_f32(params, src0, dst); + lm_ggml_compute_forward_norm_f32(params, dst); } break; default: { @@ -9408,8 +9890,10 @@ static void lm_ggml_compute_forward_norm( static void lm_ggml_compute_forward_rms_norm_f32( const struct lm_ggml_compute_params * params, - const struct lm_ggml_tensor * src0, struct lm_ggml_tensor * dst) { + + const struct lm_ggml_tensor * src0 = dst->src[0]; + LM_GGML_ASSERT(lm_ggml_are_same_shape(src0, dst)); if (params->type == LM_GGML_TASK_INIT || params->type == LM_GGML_TASK_FINALIZE) { @@ -9458,12 +9942,14 @@ static void lm_ggml_compute_forward_rms_norm_f32( static void lm_ggml_compute_forward_rms_norm( const struct lm_ggml_compute_params * params, - const struct lm_ggml_tensor * src0, struct lm_ggml_tensor * dst) { + + const struct lm_ggml_tensor * src0 = dst->src[0]; + switch (src0->type) { case LM_GGML_TYPE_F32: { - lm_ggml_compute_forward_rms_norm_f32(params, src0, dst); + lm_ggml_compute_forward_rms_norm_f32(params, dst); } break; default: { @@ -9474,9 +9960,11 @@ static void lm_ggml_compute_forward_rms_norm( static void lm_ggml_compute_forward_rms_norm_back_f32( const struct lm_ggml_compute_params * params, - const struct lm_ggml_tensor * src0, - const struct lm_ggml_tensor * src1, struct lm_ggml_tensor * dst) { + + const struct lm_ggml_tensor * src0 = 
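The new HARDSWISH and HARDSIGMOID ops above delegate the per-element math to lm_ggml_vec_hardswish_f32 / lm_ggml_vec_hardsigmoid_f32, which are defined elsewhere in this patch. Assuming the conventional definitions, the scalar functions are:

    #include <math.h>

    static float hardsigmoid_ref(float x) {
        return fminf(1.0f, fmaxf(0.0f, (x + 3.0f) / 6.0f));
    }

    static float hardswish_ref(float x) {
        return x * hardsigmoid_ref(x);   /* equivalently x * relu6(x + 3) / 6 */
    }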
dst->src[0]; + const struct lm_ggml_tensor * src1 = dst->src[1]; + LM_GGML_ASSERT(lm_ggml_are_same_shape(src0, dst) && lm_ggml_are_same_shape(src0, src1)); if (params->type == LM_GGML_TASK_INIT || params->type == LM_GGML_TASK_FINALIZE) { @@ -9631,13 +10119,14 @@ static void lm_ggml_compute_forward_rms_norm_back_f32( static void lm_ggml_compute_forward_rms_norm_back( const struct lm_ggml_compute_params * params, - const struct lm_ggml_tensor * src0, - const struct lm_ggml_tensor * src1, struct lm_ggml_tensor * dst) { + + const struct lm_ggml_tensor * src0 = dst->src[0]; + switch (src0->type) { case LM_GGML_TYPE_F32: { - lm_ggml_compute_forward_rms_norm_back_f32(params, src0, src1, dst); + lm_ggml_compute_forward_rms_norm_back_f32(params, dst); } break; default: { @@ -9650,8 +10139,10 @@ static void lm_ggml_compute_forward_rms_norm_back( static void lm_ggml_compute_forward_group_norm_f32( const struct lm_ggml_compute_params * params, - const struct lm_ggml_tensor * src0, struct lm_ggml_tensor * dst) { + + const struct lm_ggml_tensor * src0 = dst->src[0]; + LM_GGML_ASSERT(lm_ggml_are_same_shape(src0, dst)); if (params->type == LM_GGML_TASK_INIT || params->type == LM_GGML_TASK_FINALIZE) { @@ -9722,12 +10213,14 @@ static void lm_ggml_compute_forward_group_norm_f32( static void lm_ggml_compute_forward_group_norm( const struct lm_ggml_compute_params * params, - const struct lm_ggml_tensor * src0, struct lm_ggml_tensor * dst) { + + const struct lm_ggml_tensor * src0 = dst->src[0]; + switch (src0->type) { case LM_GGML_TYPE_F32: { - lm_ggml_compute_forward_group_norm_f32(params, src0, dst); + lm_ggml_compute_forward_group_norm_f32(params, dst); } break; default: { @@ -9773,9 +10266,11 @@ static bool lm_ggml_compute_forward_mul_mat_use_blas(struct lm_ggml_tensor * dst static void lm_ggml_compute_forward_mul_mat( const struct lm_ggml_compute_params * params, - const struct lm_ggml_tensor * src0, - const struct lm_ggml_tensor * src1, struct lm_ggml_tensor * dst) { + + const struct lm_ggml_tensor * src0 = dst->src[0]; + const struct lm_ggml_tensor * src1 = dst->src[1]; + int64_t t0 = lm_ggml_perf_time_us(); UNUSED(t0); @@ -9791,6 +10286,7 @@ static void lm_ggml_compute_forward_mul_mat( lm_ggml_vec_dot_t const vec_dot = type_traits[type].vec_dot; enum lm_ggml_type const vec_dot_type = type_traits[type].vec_dot_type; lm_ggml_from_float_t const from_float_to_vec_dot = type_traits[vec_dot_type].from_float; + int64_t const vec_dot_num_rows = type_traits[type].nrows; LM_GGML_ASSERT(ne0 == ne01); LM_GGML_ASSERT(ne1 == ne11); @@ -9825,11 +10321,30 @@ static void lm_ggml_compute_forward_mul_mat( #if defined(LM_GGML_USE_ACCELERATE) || defined(LM_GGML_USE_OPENBLAS) if (lm_ggml_compute_forward_mul_mat_use_blas(dst)) { - if (params->ith != 0) { - return; - } + const int64_t ne_plane = ne01*ne00; + const size_t desired_wsize = ne13*ne12*ne_plane*sizeof(float); + UNUSED(desired_wsize); if (params->type == LM_GGML_TASK_INIT) { + if (type != LM_GGML_TYPE_F32) { + assert(params->wsize >= desired_wsize); + // parallelize by src0 rows + for (int64_t i13 = 0; i13 < ne13; i13++) { + for (int64_t i12 = 0; i12 < ne12; i12++) { + // broadcast src0 into src1 across 2nd,3rd dimension + const int64_t i03 = i13/r3; + const int64_t i02 = i12/r2; + + const void * x = (char *) src0->data + i02*nb02 + i03*nb03; + float * const wdata = (float *) params->wdata + i13*ne12*ne_plane + i12*ne_plane; + lm_ggml_to_float_t const to_float = type_traits[type].to_float; + + for (int64_t i01 = ith; i01 < ne01; i01 += nth) { + to_float((const char 
*) x + i01*nb01, wdata + i01*ne00, ne00); + } + } + } + } return; } @@ -9837,9 +10352,14 @@ static void lm_ggml_compute_forward_mul_mat( return; } + // perform sgemm, parallelization controlled by blas lib + if (ith != 0) { + return; + } + + //const int64_t tgemm0 = lm_ggml_perf_time_us(); for (int64_t i13 = 0; i13 < ne13; i13++) { for (int64_t i12 = 0; i12 < ne12; i12++) { - // broadcast src0 into src1 across 2nd,3rd dimension const int64_t i03 = i13/r3; const int64_t i02 = i12/r2; @@ -9848,17 +10368,7 @@ static void lm_ggml_compute_forward_mul_mat( float * d = (float *) ((char *) dst->data + i12*nb2 + i13*nb3); if (type != LM_GGML_TYPE_F32) { - float * const wdata = params->wdata; - lm_ggml_to_float_t const to_float = type_traits[type].to_float; - - size_t id = 0; - for (int64_t i01 = 0; i01 < ne01; ++i01) { - to_float((const char *) x + i01*nb01, wdata + id, ne00); - id += ne00; - } - - assert(id*sizeof(float) <= params->wsize); - x = wdata; + x = (float *) params->wdata + i13*ne12*ne_plane + i12*ne_plane; } cblas_sgemm(CblasRowMajor, CblasNoTrans, CblasTrans, @@ -9868,6 +10378,7 @@ static void lm_ggml_compute_forward_mul_mat( 0.0f, d, ne01); } } + //printf("cblas_sgemm = %.3f ms, %lld flops\n", (lm_ggml_perf_time_us() - tgemm0)/1000.0, ne13*ne12*ne1*ne01*ne10*2); //printf("CBLAS = %f ms, %d x %d x %d x %d\n", (lm_ggml_perf_time_us() - t0)/1000.0, ne0, ne1, ne2, ne3); @@ -9876,6 +10387,9 @@ static void lm_ggml_compute_forward_mul_mat( #endif if (params->type == LM_GGML_TASK_INIT) { + if (ith != 0) { + return; + } if (src1->type != vec_dot_type) { char * wdata = params->wdata; const size_t row_size = lm_ggml_row_size(vec_dot_type, ne10); @@ -9940,12 +10454,23 @@ static void lm_ggml_compute_forward_mul_mat( const int64_t blck_0 = 16; const int64_t blck_1 = 16; + // dot kernels can handle 1 row and col at a time, but mmla kernels can process 2 rows and cols + int64_t nrc = vec_dot_num_rows; + // TODO: currently the mmla kernels support only even numbered rows/cols. + // this check can be removed once they are extended to support odd numbered rows/cols too + if ((nr0 % 2 != 0) || (ne11 % 2 != 0)) { + nrc = 1; + } + + const size_t src1_col_stride = src1_cont || src1->type != vec_dot_type ? row_size : nb11; + // attempt to reduce false-sharing (does not seem to make a difference) - float tmp[16]; + // 16 * 2, accounting for mmla kernels + float tmp[32]; for (int64_t iir1 = ir110; iir1 < ir111; iir1 += blck_1) { for (int64_t iir0 = ir010; iir0 < ir011; iir0 += blck_0) { - for (int64_t ir1 = iir1; ir1 < iir1 + blck_1 && ir1 < ir111; ++ir1) { + for (int64_t ir1 = iir1; ir1 < iir1 + blck_1 && ir1 < ir111; ir1 += nrc) { const int64_t i13 = (ir1/(ne12*ne1)); const int64_t i12 = (ir1 - i13*ne12*ne1)/ne1; const int64_t i11 = (ir1 - i13*ne12*ne1 - i12*ne1); @@ -9968,17 +10493,19 @@ static void lm_ggml_compute_forward_mul_mat( (src1_cont || src1->type != vec_dot_type ? (i11 + i12*ne11 + i13*ne12*ne11)*row_size : (i11*nb11 + i12*nb12 + i13*nb13)); - float * dst_col = (float *) ((char *) dst->data + (i1*nb1 + i2*nb2 + i3*nb3)); //for (int64_t ir0 = iir0; ir0 < iir0 + blck_0 && ir0 < ir011; ++ir0) { // vec_dot(ne00, &dst_col[ir0], src0_row + ir0*nb01, src1_col); //} - for (int64_t ir0 = iir0; ir0 < iir0 + blck_0 && ir0 < ir011; ++ir0) { - vec_dot(ne00, &tmp[ir0 - iir0], src0_row + ir0*nb01, src1_col); + for (int64_t ir0 = iir0; ir0 < iir0 + blck_0 && ir0 < ir011; ir0 += nrc) { + vec_dot(ne00, &tmp[ir0 - iir0], (nrc>1 ? 16 : 0), src0_row + ir0*nb01, (nrc>1 ? nb01 : 0), src1_col, (nrc>1 ? 
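In the BLAS branch above, the LM_GGML_TASK_INIT phase now dequantizes src0 into the float work buffer with all threads cooperating (each thread takes rows i01 = ith, ith+nth, ...), and only thread 0 later issues cblas_sgemm, leaving the GEMM parallelism to the BLAS library. A freestanding sketch of the per-plane dequantization split; to_float stands in for the type_traits[type].to_float conversion routine:

    #include <stddef.h>
    #include <stdint.h>

    typedef void (*to_float_t)(const void * x, float * y, int n);

    /* each thread converts every nth row of one (ne01 x ne00) plane to f32 */
    static void dequant_plane(to_float_t to_float,
                              const char * x_plane, size_t nb01,
                              int64_t ne00, int64_t ne01,
                              float * wplane, int ith, int nth) {
        for (int64_t i01 = ith; i01 < ne01; i01 += nth) {
            to_float(x_plane + i01*nb01, wplane + i01*ne00, (int) ne00);
        }
    }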
src1_col_stride : 0), nrc); + } + + for (int cn = 0; cn < nrc; ++cn) { + memcpy(&dst_col[iir0 + cn*nb1/nb0], tmp + (cn*16), (MIN(iir0 + blck_0, ir011) - iir0)*sizeof(float)); } - memcpy(&dst_col[iir0], tmp, (MIN(iir0 + blck_0, ir011) - iir0)*sizeof(float)); } } } @@ -9988,10 +10515,11 @@ static void lm_ggml_compute_forward_mul_mat( static void lm_ggml_compute_forward_mul_mat_id( const struct lm_ggml_compute_params * params, - const struct lm_ggml_tensor * ids, - const struct lm_ggml_tensor * src1, struct lm_ggml_tensor * dst) { + const struct lm_ggml_tensor * ids = dst->src[0]; + const struct lm_ggml_tensor * src1 = dst->src[1]; + const struct lm_ggml_tensor * src0 = dst->src[2]; // only for LM_GGML_TENSOR_BINARY_OP_LOCALS LM_GGML_TENSOR_BINARY_OP_LOCALS @@ -10040,6 +10568,9 @@ static void lm_ggml_compute_forward_mul_mat_id( #define MMID_MATRIX_ROW(row_id, i1) matrix_rows[(row_id)*ne11 + (i1)] if (params->type == LM_GGML_TASK_INIT) { + if (ith != 0) { + return; + } char * wdata = params->wdata; if (src1->type != vec_dot_type) { const size_t row_size = lm_ggml_row_size(vec_dot_type, ne10); @@ -10164,7 +10695,7 @@ static void lm_ggml_compute_forward_mul_mat_id( //} for (int64_t ir0 = iir0; ir0 < iir0 + blck_0 && ir0 < ir011; ++ir0) { - vec_dot(ne00, &tmp[ir0 - iir0], src0_row + ir0*nb01, src1_col); + vec_dot(ne00, &tmp[ir0 - iir0], 0, src0_row + ir0*nb01, 0, src1_col, 0, 1); } memcpy(&dst_col[iir0], tmp, (MIN(iir0 + blck_0, ir011) - iir0)*sizeof(float)); } @@ -10179,9 +10710,11 @@ static void lm_ggml_compute_forward_mul_mat_id( static void lm_ggml_compute_forward_out_prod_f32( const struct lm_ggml_compute_params * params, - const struct lm_ggml_tensor * src0, - const struct lm_ggml_tensor * src1, struct lm_ggml_tensor * dst) { + + const struct lm_ggml_tensor * src0 = dst->src[0]; + const struct lm_ggml_tensor * src1 = dst->src[1]; + // int64_t t0 = lm_ggml_perf_time_us(); // UNUSED(t0); @@ -10225,6 +10758,9 @@ static void lm_ggml_compute_forward_out_prod_f32( return; } #endif + if (ith != 0) { + return; + } lm_ggml_vec_set_f32(ne0*ne1*ne2*ne3, dst->data, 0); return; } @@ -10368,9 +10904,11 @@ static void lm_ggml_compute_forward_out_prod_f32( static void lm_ggml_compute_forward_out_prod_q_f32( const struct lm_ggml_compute_params * params, - const struct lm_ggml_tensor * src0, - const struct lm_ggml_tensor * src1, struct lm_ggml_tensor * dst) { + + const struct lm_ggml_tensor * src0 = dst->src[0]; + const struct lm_ggml_tensor * src1 = dst->src[1]; + // int64_t t0 = lm_ggml_perf_time_us(); // UNUSED(t0); @@ -10408,6 +10946,9 @@ static void lm_ggml_compute_forward_out_prod_q_f32( // TODO: #if defined(LM_GGML_USE_ACCELERATE) || defined(LM_GGML_USE_OPENBLAS) || defined(LM_GGML_USE_CLBLAST) if (params->type == LM_GGML_TASK_INIT) { + if (ith != 0) { + return; + } lm_ggml_vec_set_f32(ne0*ne1*ne2*ne3, dst->data, 0); return; } @@ -10478,9 +11019,10 @@ static void lm_ggml_compute_forward_out_prod_q_f32( static void lm_ggml_compute_forward_out_prod( const struct lm_ggml_compute_params * params, - const struct lm_ggml_tensor * src0, - const struct lm_ggml_tensor * src1, struct lm_ggml_tensor * dst) { + + const struct lm_ggml_tensor * src0 = dst->src[0]; + switch (src0->type) { case LM_GGML_TYPE_Q4_0: case LM_GGML_TYPE_Q4_1: @@ -10494,17 +11036,20 @@ static void lm_ggml_compute_forward_out_prod( case LM_GGML_TYPE_Q6_K: case LM_GGML_TYPE_IQ2_XXS: case LM_GGML_TYPE_IQ2_XS: + case LM_GGML_TYPE_IQ3_XXS: + case LM_GGML_TYPE_IQ1_S: + case LM_GGML_TYPE_IQ4_NL: { - lm_ggml_compute_forward_out_prod_q_f32(params, 
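The dot kernels now take the extended signature vec_dot(n, s, bs, x, bx, y, by, nrc): nrc dot products are computed per call (1 for the plain kernels, 2 for the ARM int8-mmla kernels), with result i written to s[i*bs] and the operands advanced by the strides bx/by, which is why tmp grew to 32 floats. A plain-C reference for the f32 case, written from how the call sites above use the arguments (result stride in elements, operand strides in bytes):

    #include <stddef.h>

    static void vec_dot_f32_ref(int n, float * s, size_t bs,
                                const float * x, size_t bx,
                                const float * y, size_t by, int nrc) {
        for (int i = 0; i < nrc; i++) {
            const float * xi = (const float *)((const char *) x + i*bx);
            const float * yi = (const float *)((const char *) y + i*by);
            float sum = 0.0f;
            for (int k = 0; k < n; k++) {
                sum += xi[k]*yi[k];
            }
            s[i*bs] = sum;
        }
    }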
src0, src1, dst); + lm_ggml_compute_forward_out_prod_q_f32(params, dst); } break; case LM_GGML_TYPE_F16: { LM_GGML_ASSERT(false); // todo - // lm_ggml_compute_forward_out_prod_f16_f32(params, src0, src1, dst); + // lm_ggml_compute_forward_out_prod_f16_f32(params, dst); } break; case LM_GGML_TYPE_F32: { - lm_ggml_compute_forward_out_prod_f32(params, src0, src1, dst); + lm_ggml_compute_forward_out_prod_f32(params, dst); } break; default: { @@ -10517,8 +11062,10 @@ static void lm_ggml_compute_forward_out_prod( static void lm_ggml_compute_forward_scale_f32( const struct lm_ggml_compute_params * params, - const struct lm_ggml_tensor * src0, struct lm_ggml_tensor * dst) { + + const struct lm_ggml_tensor * src0 = dst->src[0]; + LM_GGML_ASSERT(lm_ggml_is_contiguous(src0)); LM_GGML_ASSERT(lm_ggml_is_contiguous(dst)); LM_GGML_ASSERT(lm_ggml_are_same_shape(src0, dst)); @@ -10559,12 +11106,14 @@ static void lm_ggml_compute_forward_scale_f32( static void lm_ggml_compute_forward_scale( const struct lm_ggml_compute_params * params, - const struct lm_ggml_tensor * src0, struct lm_ggml_tensor * dst) { + + const struct lm_ggml_tensor * src0 = dst->src[0]; + switch (src0->type) { case LM_GGML_TYPE_F32: { - lm_ggml_compute_forward_scale_f32(params, src0, dst); + lm_ggml_compute_forward_scale_f32(params, dst); } break; default: { @@ -10577,9 +11126,11 @@ static void lm_ggml_compute_forward_scale( static void lm_ggml_compute_forward_set_f32( const struct lm_ggml_compute_params * params, - const struct lm_ggml_tensor * src0, - const struct lm_ggml_tensor * src1, struct lm_ggml_tensor * dst) { + + const struct lm_ggml_tensor * src0 = dst->src[0]; + const struct lm_ggml_tensor * src1 = dst->src[1]; + LM_GGML_ASSERT(lm_ggml_are_same_shape(src0, dst)); LM_GGML_ASSERT(lm_ggml_is_contiguous(dst) && lm_ggml_is_contiguous(src0)); @@ -10592,6 +11143,9 @@ static void lm_ggml_compute_forward_set_f32( bool inplace = (bool) ((int32_t *) dst->op_params)[4]; if (!inplace && (params->type == LM_GGML_TASK_INIT)) { + if (params->ith != 0) { + return; + } // memcpy needs to be synchronized across threads to avoid race conditions. 
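Several ops here (out_prod and set above, get_rows_back, diag_mask and conv_transpose_1d below) gain an `if (ith != 0) return;` guard in the INIT phase: one-time work such as zeroing or copying the destination is now done by thread 0 only, since INIT is entered by every worker thread. A sketch of the phase protocol being assumed:

    #include <stddef.h>
    #include <string.h>

    enum my_task_type { MY_TASK_INIT, MY_TASK_COMPUTE, MY_TASK_FINALIZE };

    static void my_forward_op(enum my_task_type type, int ith,
                              float * dst_data, size_t nbytes) {
        if (type == MY_TASK_INIT) {
            if (ith != 0) {
                return;                      /* only thread 0 does the one-time setup */
            }
            memset(dst_data, 0, nbytes);     /* e.g. the lm_ggml_vec_set_f32(..., 0) above */
            return;
        }
        if (type == MY_TASK_FINALIZE) {
            return;
        }
        /* MY_TASK_COMPUTE: all threads participate in the actual kernel */
    }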
// => do it in INIT phase memcpy( @@ -10647,14 +11201,14 @@ static void lm_ggml_compute_forward_set_f32( static void lm_ggml_compute_forward_set( const struct lm_ggml_compute_params * params, - const struct lm_ggml_tensor * src0, - const struct lm_ggml_tensor * src1, struct lm_ggml_tensor * dst) { + const struct lm_ggml_tensor * src0 = dst->src[0]; + switch (src0->type) { case LM_GGML_TYPE_F32: { - lm_ggml_compute_forward_set_f32(params, src0, src1, dst); + lm_ggml_compute_forward_set_f32(params, dst); } break; case LM_GGML_TYPE_F16: case LM_GGML_TYPE_Q4_0: @@ -10670,6 +11224,9 @@ static void lm_ggml_compute_forward_set( case LM_GGML_TYPE_Q6_K: case LM_GGML_TYPE_IQ2_XXS: case LM_GGML_TYPE_IQ2_XS: + case LM_GGML_TYPE_IQ3_XXS: + case LM_GGML_TYPE_IQ1_S: + case LM_GGML_TYPE_IQ4_NL: default: { LM_GGML_ASSERT(false); @@ -10681,29 +11238,25 @@ static void lm_ggml_compute_forward_set( static void lm_ggml_compute_forward_cpy( const struct lm_ggml_compute_params * params, - const struct lm_ggml_tensor * src0, struct lm_ggml_tensor * dst) { - lm_ggml_compute_forward_dup(params, src0, dst); + lm_ggml_compute_forward_dup(params, dst); } // lm_ggml_compute_forward_cont static void lm_ggml_compute_forward_cont( const struct lm_ggml_compute_params * params, - const struct lm_ggml_tensor * src0, struct lm_ggml_tensor * dst) { - lm_ggml_compute_forward_dup(params, src0, dst); + lm_ggml_compute_forward_dup(params, dst); } // lm_ggml_compute_forward_reshape static void lm_ggml_compute_forward_reshape( const struct lm_ggml_compute_params * params, - const struct lm_ggml_tensor * src0, struct lm_ggml_tensor * dst) { // NOP UNUSED(params); - UNUSED(src0); UNUSED(dst); } @@ -10711,39 +11264,41 @@ static void lm_ggml_compute_forward_reshape( static void lm_ggml_compute_forward_view( const struct lm_ggml_compute_params * params, - const struct lm_ggml_tensor * src0) { + const struct lm_ggml_tensor * dst) { // NOP UNUSED(params); - UNUSED(src0); + UNUSED(dst); } // lm_ggml_compute_forward_permute static void lm_ggml_compute_forward_permute( const struct lm_ggml_compute_params * params, - const struct lm_ggml_tensor * src0) { + const struct lm_ggml_tensor * dst) { // NOP UNUSED(params); - UNUSED(src0); + UNUSED(dst); } // lm_ggml_compute_forward_transpose static void lm_ggml_compute_forward_transpose( const struct lm_ggml_compute_params * params, - const struct lm_ggml_tensor * src0) { + const struct lm_ggml_tensor * dst) { // NOP UNUSED(params); - UNUSED(src0); + UNUSED(dst); } // lm_ggml_compute_forward_get_rows static void lm_ggml_compute_forward_get_rows_q( const struct lm_ggml_compute_params * params, - const struct lm_ggml_tensor * src0, - const struct lm_ggml_tensor * src1, struct lm_ggml_tensor * dst) { + + const struct lm_ggml_tensor * src0 = dst->src[0]; + const struct lm_ggml_tensor * src1 = dst->src[1]; + assert(params->ith == 0); if (params->type == LM_GGML_TASK_INIT || params->type == LM_GGML_TASK_FINALIZE) { @@ -10779,9 +11334,11 @@ static void lm_ggml_compute_forward_get_rows_q( static void lm_ggml_compute_forward_get_rows_f16( const struct lm_ggml_compute_params * params, - const struct lm_ggml_tensor * src0, - const struct lm_ggml_tensor * src1, struct lm_ggml_tensor * dst) { + + const struct lm_ggml_tensor * src0 = dst->src[0]; + const struct lm_ggml_tensor * src1 = dst->src[1]; + assert(params->ith == 0); if (params->type == LM_GGML_TASK_INIT || params->type == LM_GGML_TASK_FINALIZE) { @@ -10814,9 +11371,11 @@ static void lm_ggml_compute_forward_get_rows_f16( static void 
lm_ggml_compute_forward_get_rows_f32( const struct lm_ggml_compute_params * params, - const struct lm_ggml_tensor * src0, - const struct lm_ggml_tensor * src1, struct lm_ggml_tensor * dst) { + + const struct lm_ggml_tensor * src0 = dst->src[0]; + const struct lm_ggml_tensor * src1 = dst->src[1]; + assert(params->ith == 0); if (params->type == LM_GGML_TASK_INIT || params->type == LM_GGML_TASK_FINALIZE) { @@ -10849,9 +11408,10 @@ static void lm_ggml_compute_forward_get_rows_f32( static void lm_ggml_compute_forward_get_rows( const struct lm_ggml_compute_params * params, - const struct lm_ggml_tensor * src0, - const struct lm_ggml_tensor * src1, struct lm_ggml_tensor * dst) { + + const struct lm_ggml_tensor * src0 = dst->src[0]; + switch (src0->type) { case LM_GGML_TYPE_Q4_0: case LM_GGML_TYPE_Q4_1: @@ -10866,17 +11426,20 @@ static void lm_ggml_compute_forward_get_rows( case LM_GGML_TYPE_Q6_K: case LM_GGML_TYPE_IQ2_XXS: case LM_GGML_TYPE_IQ2_XS: + case LM_GGML_TYPE_IQ3_XXS: + case LM_GGML_TYPE_IQ1_S: + case LM_GGML_TYPE_IQ4_NL: { - lm_ggml_compute_forward_get_rows_q(params, src0, src1, dst); + lm_ggml_compute_forward_get_rows_q(params, dst); } break; case LM_GGML_TYPE_F16: { - lm_ggml_compute_forward_get_rows_f16(params, src0, src1, dst); + lm_ggml_compute_forward_get_rows_f16(params, dst); } break; case LM_GGML_TYPE_F32: case LM_GGML_TYPE_I32: { - lm_ggml_compute_forward_get_rows_f32(params, src0, src1, dst); + lm_ggml_compute_forward_get_rows_f32(params, dst); } break; default: { @@ -10907,15 +11470,20 @@ static void lm_ggml_compute_forward_get_rows( static void lm_ggml_compute_forward_get_rows_back_f32_f16( const struct lm_ggml_compute_params * params, - const struct lm_ggml_tensor * src0, - const struct lm_ggml_tensor * src1, struct lm_ggml_tensor * dst) { + + const struct lm_ggml_tensor * src0 = dst->src[0]; + const struct lm_ggml_tensor * src1 = dst->src[1]; + LM_GGML_ASSERT(params->ith == 0); LM_GGML_ASSERT(lm_ggml_is_contiguous(dst)); // lm_ggml_compute_forward_dup_same_cont(params, opt0, dst); if (params->type == LM_GGML_TASK_INIT) { + if (params->ith != 0) { + return; + } memset(dst->data, 0, lm_ggml_nbytes(dst)); } @@ -10941,15 +11509,20 @@ static void lm_ggml_compute_forward_get_rows_back_f32_f16( static void lm_ggml_compute_forward_get_rows_back_f32( const struct lm_ggml_compute_params * params, - const struct lm_ggml_tensor * src0, - const struct lm_ggml_tensor * src1, struct lm_ggml_tensor * dst) { + + const struct lm_ggml_tensor * src0 = dst->src[0]; + const struct lm_ggml_tensor * src1 = dst->src[1]; + LM_GGML_ASSERT(params->ith == 0); LM_GGML_ASSERT(lm_ggml_is_contiguous(dst)); // lm_ggml_compute_forward_dup_same_cont(params, opt0, dst); if (params->type == LM_GGML_TASK_INIT) { + if (params->ith != 0) { + return; + } memset(dst->data, 0, lm_ggml_nbytes(dst)); } @@ -10975,17 +11548,18 @@ static void lm_ggml_compute_forward_get_rows_back_f32( static void lm_ggml_compute_forward_get_rows_back( const struct lm_ggml_compute_params * params, - const struct lm_ggml_tensor * src0, - const struct lm_ggml_tensor * src1, struct lm_ggml_tensor * dst) { + + const struct lm_ggml_tensor * src0 = dst->src[0]; + switch (src0->type) { case LM_GGML_TYPE_F16: { - lm_ggml_compute_forward_get_rows_back_f32_f16(params, src0, src1, dst); + lm_ggml_compute_forward_get_rows_back_f32_f16(params, dst); } break; case LM_GGML_TYPE_F32: { - lm_ggml_compute_forward_get_rows_back_f32(params, src0, src1, dst); + lm_ggml_compute_forward_get_rows_back_f32(params, dst); } break; default: { @@ -11016,8 +11590,10 
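get_rows above gathers whole rows of src0 at the integer indices stored in src1, writing f32 rows to dst (quantized and f16 source rows are converted on the way out, which is why the new IQ types join the switch). A contiguous f32 sketch:

    #include <stdint.h>

    /* dst row r is a copy of src0 row rows[r]; nc is the row length */
    static void get_rows_f32_ref(const float * src0, const int32_t * rows,
                                 float * dst, int n_rows_out, int nc) {
        for (int r = 0; r < n_rows_out; r++) {
            const float * src_row = src0 + (int64_t) rows[r] * nc;
            for (int c = 0; c < nc; c++) {
                dst[r*nc + c] = src_row[c];
            }
        }
    }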
@@ static void lm_ggml_compute_forward_get_rows_back( static void lm_ggml_compute_forward_diag_f32( const struct lm_ggml_compute_params * params, - const struct lm_ggml_tensor * src0, struct lm_ggml_tensor * dst) { + + const struct lm_ggml_tensor * src0 = dst->src[0]; + LM_GGML_ASSERT(params->ith == 0); if (params->type == LM_GGML_TASK_INIT || params->type == LM_GGML_TASK_FINALIZE) { @@ -11056,12 +11632,14 @@ static void lm_ggml_compute_forward_diag_f32( static void lm_ggml_compute_forward_diag( const struct lm_ggml_compute_params * params, - const struct lm_ggml_tensor * src0, struct lm_ggml_tensor * dst) { + + const struct lm_ggml_tensor * src0 = dst->src[0]; + switch (src0->type) { case LM_GGML_TYPE_F32: { - lm_ggml_compute_forward_diag_f32(params, src0, dst); + lm_ggml_compute_forward_diag_f32(params, dst); } break; default: { @@ -11074,10 +11652,11 @@ static void lm_ggml_compute_forward_diag( static void lm_ggml_compute_forward_diag_mask_f32( const struct lm_ggml_compute_params * params, - const struct lm_ggml_tensor * src0, struct lm_ggml_tensor * dst, const float value) { + const struct lm_ggml_tensor * src0 = dst->src[0]; + const int ith = params->ith; const int nth = params->nth; @@ -11087,6 +11666,9 @@ static void lm_ggml_compute_forward_diag_mask_f32( LM_GGML_ASSERT(n_past >= 0); if (!inplace && (params->type == LM_GGML_TASK_INIT)) { + if (ith != 0) { + return; + } // memcpy needs to be synchronized across threads to avoid race conditions. // => do it in INIT phase LM_GGML_ASSERT(lm_ggml_nelements(dst) == lm_ggml_nelements(src0)); @@ -11124,12 +11706,14 @@ static void lm_ggml_compute_forward_diag_mask_f32( static void lm_ggml_compute_forward_diag_mask_inf( const struct lm_ggml_compute_params * params, - const struct lm_ggml_tensor * src0, struct lm_ggml_tensor * dst) { + + const struct lm_ggml_tensor * src0 = dst->src[0]; + switch (src0->type) { case LM_GGML_TYPE_F32: { - lm_ggml_compute_forward_diag_mask_f32(params, src0, dst, -INFINITY); + lm_ggml_compute_forward_diag_mask_f32(params, dst, -INFINITY); } break; default: { @@ -11140,12 +11724,14 @@ static void lm_ggml_compute_forward_diag_mask_inf( static void lm_ggml_compute_forward_diag_mask_zero( const struct lm_ggml_compute_params * params, - const struct lm_ggml_tensor * src0, struct lm_ggml_tensor * dst) { + + const struct lm_ggml_tensor * src0 = dst->src[0]; + switch (src0->type) { case LM_GGML_TYPE_F32: { - lm_ggml_compute_forward_diag_mask_f32(params, src0, dst, 0); + lm_ggml_compute_forward_diag_mask_f32(params, dst, 0); } break; default: { @@ -11158,9 +11744,12 @@ static void lm_ggml_compute_forward_diag_mask_zero( static void lm_ggml_compute_forward_soft_max_f32( const struct lm_ggml_compute_params * params, - const struct lm_ggml_tensor * src0, - const struct lm_ggml_tensor * src1, struct lm_ggml_tensor * dst) { + + const struct lm_ggml_tensor * src0 = dst->src[0]; + const struct lm_ggml_tensor * src1 = dst->src[1]; + const struct lm_ggml_tensor * src2 = dst->src[2]; + assert(lm_ggml_is_contiguous(dst)); assert(lm_ggml_are_same_shape(src0, dst)); @@ -11168,16 +11757,29 @@ static void lm_ggml_compute_forward_soft_max_f32( return; } - float scale = 1.0f; - memcpy(&scale, (float *) dst->op_params + 0, sizeof(float)); + float scale = 1.0f; + float max_bias = 0.0f; + + memcpy(&scale, (float *) dst->op_params + 0, sizeof(float)); + memcpy(&max_bias, (float *) dst->op_params + 1, sizeof(float)); // TODO: handle transposed/permuted matrices const int ith = params->ith; const int nth = params->nth; + 
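diag_mask_inf and diag_mask_zero above share one kernel that overwrites entries strictly above the diagonal shifted by n_past with a fixed value (-INFINITY or 0). A row-major sketch of that masking:

    /* write `value` to every entry with column index > n_past + row index */
    static void diag_mask_ref(float * x, int nr, int nc, int n_past, float value) {
        for (int r = 0; r < nr; r++) {
            for (int c = n_past + r + 1; c < nc; c++) {
                x[r*nc + c] = value;
            }
        }
    }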
LM_GGML_TENSOR_UNARY_OP_LOCALS + const int64_t ne11 = src1 ? src1->ne[1] : 1; + // TODO: is this supposed to be ceil instead of floor? + // https://huggingface.co/mosaicml/mpt-7b/blob/main/attention.py#L370 + const uint32_t n_head_kv = ne02; + const uint32_t n_head_log2 = 1u << (uint32_t) floor(log2(n_head_kv)); + + const float m0 = powf(2.0f, -(max_bias ) / n_head_log2); + const float m1 = powf(2.0f, -(max_bias / 2.0f) / n_head_log2); + const int nc = src0->ne[0]; const int nr = lm_ggml_nrows(src0); @@ -11190,6 +11792,9 @@ static void lm_ggml_compute_forward_soft_max_f32( float * wp = (float *) params->wdata + (nc + CACHE_LINE_SIZE_F32) * ith; + // when max_bias <= 0.0f, src2 is not used and we default it to src0 to avoid branching + float * pos = src2 ? (float *) src2->data : src0->data; + for (int i1 = ir0; i1 < ir1; i1++) { float * sp = (float *)((char *) src0->data + i1*src0->nb[1]); float * dp = (float *)((char *) dst->data + i1*dst->nb[1]); @@ -11203,6 +11808,16 @@ static void lm_ggml_compute_forward_soft_max_f32( lm_ggml_vec_acc_f32(nc, wp, mp); } + // ALiBi bias + if (max_bias > 0.0f) { + const uint32_t h = (i1/ne01)%ne02; // head + const float slope = h < n_head_log2 ? powf(m0, h + 1) : powf(m1, 2*(h - n_head_log2) + 1); + + for (int i = 0; i < nc; i++) { + wp[i] = wp[i] + slope*pos[i]; + } + } + #ifndef NDEBUG for (int i = 0; i < nc; ++i) { //printf("p[%d] = %f\n", i, p[i]); @@ -11245,13 +11860,14 @@ static void lm_ggml_compute_forward_soft_max_f32( static void lm_ggml_compute_forward_soft_max( const struct lm_ggml_compute_params * params, - const struct lm_ggml_tensor * src0, - const struct lm_ggml_tensor * src1, struct lm_ggml_tensor * dst) { + + const struct lm_ggml_tensor * src0 = dst->src[0]; + switch (src0->type) { case LM_GGML_TYPE_F32: { - lm_ggml_compute_forward_soft_max_f32(params, src0, src1, dst); + lm_ggml_compute_forward_soft_max_f32(params, dst); } break; default: { @@ -11264,9 +11880,11 @@ static void lm_ggml_compute_forward_soft_max( static void lm_ggml_compute_forward_soft_max_back_f32( const struct lm_ggml_compute_params * params, - const struct lm_ggml_tensor * src0, - const struct lm_ggml_tensor * src1, struct lm_ggml_tensor * dst) { + + const struct lm_ggml_tensor * src0 = dst->src[0]; + const struct lm_ggml_tensor * src1 = dst->src[1]; + LM_GGML_ASSERT(lm_ggml_is_contiguous(src0)); LM_GGML_ASSERT(lm_ggml_is_contiguous(src1)); LM_GGML_ASSERT(lm_ggml_is_contiguous(dst)); @@ -11325,7 +11943,7 @@ static void lm_ggml_compute_forward_soft_max_back_f32( // linear runtime, no additional memory float dot_y_dy = 0; - lm_ggml_vec_dot_f32 (nc, &dot_y_dy, y, dy); + lm_ggml_vec_dot_f32 (nc, &dot_y_dy, 0, y, 0, dy, 0, 1); lm_ggml_vec_cpy_f32 (nc, dx, dy); lm_ggml_vec_acc1_f32(nc, dx, -dot_y_dy); lm_ggml_vec_mul_f32 (nc, dx, dx, y); @@ -11341,13 +11959,14 @@ static void lm_ggml_compute_forward_soft_max_back_f32( static void lm_ggml_compute_forward_soft_max_back( const struct lm_ggml_compute_params * params, - const struct lm_ggml_tensor * src0, - const struct lm_ggml_tensor * src1, struct lm_ggml_tensor * dst) { + + const struct lm_ggml_tensor * src0 = dst->src[0]; + switch (src0->type) { case LM_GGML_TYPE_F32: { - lm_ggml_compute_forward_soft_max_back_f32(params, src0, src1, dst); + lm_ggml_compute_forward_soft_max_back_f32(params, dst); } break; default: { @@ -11360,8 +11979,10 @@ static void lm_ggml_compute_forward_soft_max_back( static void lm_ggml_compute_forward_alibi_f32( const struct lm_ggml_compute_params * params, - const struct lm_ggml_tensor * src0, struct 
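soft_max now reads both scale and max_bias from op_params and, when max_bias > 0, adds an ALiBi bias slope*pos[i] per row before the exponentiation, with the slope depending on the head index as in the hunk above. A sketch of that slope schedule (n_head_log2 is the largest power of two not exceeding the KV head count):

    #include <math.h>
    #include <stdint.h>

    static float alibi_slope(float max_bias, uint32_t n_head_log2, uint32_t h) {
        const float m0 = powf(2.0f, -(max_bias       ) / n_head_log2);
        const float m1 = powf(2.0f, -(max_bias / 2.0f) / n_head_log2);
        return h < n_head_log2 ? powf(m0, h + 1)
                               : powf(m1, 2*(h - n_head_log2) + 1);
    }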
lm_ggml_tensor * dst) { + + const struct lm_ggml_tensor * src0 = dst->src[0]; + assert(params->ith == 0); if (params->type == LM_GGML_TASK_INIT || params->type == LM_GGML_TASK_FINALIZE) { @@ -11395,22 +12016,20 @@ static void lm_ggml_compute_forward_alibi_f32( const float m0 = powf(2.0f, -(max_bias) / n_heads_log2_floor); const float m1 = powf(2.0f, -(max_bias / 2.0f) / n_heads_log2_floor); - for (int64_t i = 0; i < ne0; i++) { - for (int64_t j = 0; j < ne1; j++) { - for (int64_t k = 0; k < ne2_ne3; k++) { - float * const src = (float *)((char *) src0->data + i*nb0 + j*nb1 + k*nb2); - float * pdst = (float *)((char *) dst->data + i*nb0 + j*nb1 + k*nb2); - - // TODO: k*nb2 or k*nb3 - - float m_k; + for (int64_t k = 0; k < ne2_ne3; k++) { + // TODO: k*nb2 or k*nb3 + float m_k; - if (k < n_heads_log2_floor) { - m_k = powf(m0, k + 1); - } else { - m_k = powf(m1, 2 * (k - n_heads_log2_floor) + 1); - } + if (k < n_heads_log2_floor) { + m_k = powf(m0, k + 1); + } else { + m_k = powf(m1, 2 * (k - n_heads_log2_floor) + 1); + } + for (int64_t i = 0; i < ne0; i++) { + for (int64_t j = 0; j < ne1; j++) { + float * const src = (float *)((char *) src0->data + i*nb0 + j*nb1 + k*nb2); + float * pdst = (float *)((char *) dst->data + i*nb0 + j*nb1 + k*nb2); pdst[0] = i * m_k + src[0]; } } @@ -11419,8 +12038,10 @@ static void lm_ggml_compute_forward_alibi_f32( static void lm_ggml_compute_forward_alibi_f16( const struct lm_ggml_compute_params * params, - const struct lm_ggml_tensor * src0, struct lm_ggml_tensor * dst) { + + const struct lm_ggml_tensor * src0 = dst->src[0]; + assert(params->ith == 0); if (params->type == LM_GGML_TASK_INIT || params->type == LM_GGML_TASK_FINALIZE) { @@ -11455,21 +12076,20 @@ static void lm_ggml_compute_forward_alibi_f16( const float m0 = powf(2.0f, -(max_bias) / n_heads_log2_floor); const float m1 = powf(2.0f, -(max_bias / 2.0f) / n_heads_log2_floor); - for (int i = 0; i < ne0; i++) { - for (int j = 0; j < ne1; j++) { - for (int k = 0; k < ne2_ne3; k++) { - lm_ggml_fp16_t * const src = (lm_ggml_fp16_t *)((char *) src0->data + i*nb0 + j*nb1 + k*nb2); - float * pdst = (float *)((char *) dst->data + i*nb0 + j*nb1 + k*nb2); - - // TODO: k*nb2 or k*nb3 + for (int k = 0; k < ne2_ne3; k++) { + // TODO: k*nb2 or k*nb3 + float m_k; - float m_k; + if (k < n_heads_log2_floor) { + m_k = powf(m0, k + 1); + } else { + m_k = powf(m1, 2 * (k - n_heads_log2_floor) + 1); + } - if (k < n_heads_log2_floor) { - m_k = powf(m0, k + 1); - } else { - m_k = powf(m1, 2 * (k - n_heads_log2_floor) + 1); - } + for (int i = 0; i < ne0; i++) { + for (int j = 0; j < ne1; j++) { + lm_ggml_fp16_t * const src = (lm_ggml_fp16_t *)((char *) src0->data + i*nb0 + j*nb1 + k*nb2); + float * pdst = (float *)((char *) dst->data + i*nb0 + j*nb1 + k*nb2); // we return F32 pdst[0] = i * m_k + LM_GGML_FP16_TO_FP32(src[0]); @@ -11480,16 +12100,18 @@ static void lm_ggml_compute_forward_alibi_f16( static void lm_ggml_compute_forward_alibi( const struct lm_ggml_compute_params * params, - const struct lm_ggml_tensor * src0, struct lm_ggml_tensor * dst) { + + const struct lm_ggml_tensor * src0 = dst->src[0]; + switch (src0->type) { case LM_GGML_TYPE_F16: { - lm_ggml_compute_forward_alibi_f16(params, src0, dst); + lm_ggml_compute_forward_alibi_f16(params, dst); } break; case LM_GGML_TYPE_F32: { - lm_ggml_compute_forward_alibi_f32(params, src0, dst); + lm_ggml_compute_forward_alibi_f32(params, dst); } break; case LM_GGML_TYPE_Q4_0: case LM_GGML_TYPE_Q4_1: @@ -11504,6 +12126,9 @@ static void lm_ggml_compute_forward_alibi( case 
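The alibi_f32/alibi_f16 rewrites above only hoist the per-head slope m_k out of the two inner loops; the value each element receives is unchanged: the position index times the head's slope, added to the source. A per-head sketch of that bias:

    /* for one head with slope m_k: dst[j][i] = src[j][i] + i * m_k */
    static void alibi_head_ref(const float * src, float * dst,
                               int ne0, int ne1, float m_k) {
        for (int j = 0; j < ne1; j++) {
            for (int i = 0; i < ne0; i++) {
                dst[j*ne0 + i] = src[j*ne0 + i] + i * m_k;
            }
        }
    }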
LM_GGML_TYPE_Q6_K: case LM_GGML_TYPE_IQ2_XXS: case LM_GGML_TYPE_IQ2_XS: + case LM_GGML_TYPE_IQ3_XXS: + case LM_GGML_TYPE_IQ1_S: + case LM_GGML_TYPE_IQ4_NL: case LM_GGML_TYPE_Q8_K: case LM_GGML_TYPE_I8: case LM_GGML_TYPE_I16: @@ -11519,8 +12144,10 @@ static void lm_ggml_compute_forward_alibi( static void lm_ggml_compute_forward_clamp_f32( const struct lm_ggml_compute_params * params, - const struct lm_ggml_tensor * src0, struct lm_ggml_tensor * dst) { + + const struct lm_ggml_tensor * src0 = dst->src[0]; + assert(params->ith == 0); if (params->type == LM_GGML_TASK_INIT || params->type == LM_GGML_TASK_FINALIZE) { @@ -11559,12 +12186,14 @@ static void lm_ggml_compute_forward_clamp_f32( static void lm_ggml_compute_forward_clamp( const struct lm_ggml_compute_params * params, - const struct lm_ggml_tensor * src0, struct lm_ggml_tensor * dst) { + + const struct lm_ggml_tensor * src0 = dst->src[0]; + switch (src0->type) { case LM_GGML_TYPE_F32: { - lm_ggml_compute_forward_clamp_f32(params, src0, dst); + lm_ggml_compute_forward_clamp_f32(params, dst); } break; case LM_GGML_TYPE_F16: case LM_GGML_TYPE_Q4_0: @@ -11580,6 +12209,9 @@ static void lm_ggml_compute_forward_clamp( case LM_GGML_TYPE_Q6_K: case LM_GGML_TYPE_IQ2_XXS: case LM_GGML_TYPE_IQ2_XS: + case LM_GGML_TYPE_IQ3_XXS: + case LM_GGML_TYPE_IQ1_S: + case LM_GGML_TYPE_IQ4_NL: case LM_GGML_TYPE_Q8_K: case LM_GGML_TYPE_I8: case LM_GGML_TYPE_I16: @@ -11643,16 +12275,20 @@ LM_GGML_CALL void lm_ggml_rope_yarn_corr_dims( int n_dims, int n_orig_ctx, float freq_base, float beta_fast, float beta_slow, float dims[2] ) { // start and end correction dims - dims[0] = MAX(0, floorf(lm_ggml_rope_yarn_corr_dim(n_dims, n_orig_ctx, beta_fast, freq_base))); - dims[1] = MIN(n_dims - 1, ceilf(lm_ggml_rope_yarn_corr_dim(n_dims, n_orig_ctx, beta_slow, freq_base))); + float start = floorf(lm_ggml_rope_yarn_corr_dim(n_dims, n_orig_ctx, beta_fast, freq_base)); + float end = ceilf(lm_ggml_rope_yarn_corr_dim(n_dims, n_orig_ctx, beta_slow, freq_base)); + dims[0] = MAX(0, start); + dims[1] = MIN(n_dims - 1, end); } static void lm_ggml_compute_forward_rope_f32( const struct lm_ggml_compute_params * params, - const struct lm_ggml_tensor * src0, - const struct lm_ggml_tensor * src1, struct lm_ggml_tensor * dst, const bool forward) { + + const struct lm_ggml_tensor * src0 = dst->src[0]; + const struct lm_ggml_tensor * src1 = dst->src[1]; + if (params->type == LM_GGML_TASK_INIT || params->type == LM_GGML_TASK_FINALIZE) { return; } @@ -11825,10 +12461,12 @@ static void lm_ggml_compute_forward_rope_f32( static void lm_ggml_compute_forward_rope_f16( const struct lm_ggml_compute_params * params, - const struct lm_ggml_tensor * src0, - const struct lm_ggml_tensor * src1, struct lm_ggml_tensor * dst, const bool forward) { + + const struct lm_ggml_tensor * src0 = dst->src[0]; + const struct lm_ggml_tensor * src1 = dst->src[1]; + if (params->type == LM_GGML_TASK_INIT || params->type == LM_GGML_TASK_FINALIZE) { return; } @@ -11990,17 +12628,18 @@ static void lm_ggml_compute_forward_rope_f16( static void lm_ggml_compute_forward_rope( const struct lm_ggml_compute_params * params, - const struct lm_ggml_tensor * src0, - const struct lm_ggml_tensor * src1, struct lm_ggml_tensor * dst) { + + const struct lm_ggml_tensor * src0 = dst->src[0]; + switch (src0->type) { case LM_GGML_TYPE_F16: { - lm_ggml_compute_forward_rope_f16(params, src0, src1, dst, true); + lm_ggml_compute_forward_rope_f16(params, dst, true); } break; case LM_GGML_TYPE_F32: { - lm_ggml_compute_forward_rope_f32(params, src0, 
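rope_f32/rope_f16 above now take a `forward` flag (the rope_back dispatch below passes false), which amounts to rotating each (x0, x1) pair by the opposite angle. A sketch of that pairwise rotation under that assumption:

    #include <math.h>
    #include <stdbool.h>

    /* rotate one RoPE pair by theta; the inverse rotation just flips the sine */
    static void rope_pair(float * x0, float * x1, float theta, bool forward) {
        const float c = cosf(theta);
        const float s = forward ? sinf(theta) : -sinf(theta);
        const float v0 = *x0, v1 = *x1;
        *x0 = v0*c - v1*s;
        *x1 = v0*s + v1*c;
    }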
src1, dst, true); + lm_ggml_compute_forward_rope_f32(params, dst, true); } break; default: { @@ -12013,17 +12652,18 @@ static void lm_ggml_compute_forward_rope( static void lm_ggml_compute_forward_rope_back( const struct lm_ggml_compute_params * params, - const struct lm_ggml_tensor * src0, - const struct lm_ggml_tensor * src1, struct lm_ggml_tensor * dst) { + + const struct lm_ggml_tensor * src0 = dst->src[0]; + switch (src0->type) { case LM_GGML_TYPE_F16: { - lm_ggml_compute_forward_rope_f16(params, src0, src1, dst, false); + lm_ggml_compute_forward_rope_f16(params, dst, false); } break; case LM_GGML_TYPE_F32: { - lm_ggml_compute_forward_rope_f32(params, src0, src1, dst, false); + lm_ggml_compute_forward_rope_f32(params, dst, false); } break; default: { @@ -12036,9 +12676,11 @@ static void lm_ggml_compute_forward_rope_back( static void lm_ggml_compute_forward_conv_transpose_1d_f16_f32( const struct lm_ggml_compute_params * params, - const struct lm_ggml_tensor * src0, - const struct lm_ggml_tensor * src1, struct lm_ggml_tensor * dst) { + + const struct lm_ggml_tensor * src0 = dst->src[0]; + const struct lm_ggml_tensor * src1 = dst->src[1]; + LM_GGML_ASSERT(src0->type == LM_GGML_TYPE_F16); LM_GGML_ASSERT(src1->type == LM_GGML_TYPE_F32); LM_GGML_ASSERT( dst->type == LM_GGML_TYPE_F32); @@ -12057,6 +12699,9 @@ static void lm_ggml_compute_forward_conv_transpose_1d_f16_f32( LM_GGML_ASSERT(nb10 == sizeof(float)); if (params->type == LM_GGML_TASK_INIT) { + if (ith != 0) { + return; + } memset(params->wdata, 0, params->wsize); // permute kernel data (src0) from (K x Cout x Cin) to (Cin x K x Cout) @@ -12119,9 +12764,9 @@ static void lm_ggml_compute_forward_conv_transpose_1d_f16_f32( const int i1n = i10*ne11; for (int i00 = 0; i00 < ne00; i00++) { float v = 0; - lm_ggml_vec_dot_f16(ne02, &v, - (lm_ggml_fp16_t *) wdata_src + i1n, - (lm_ggml_fp16_t *) wdata_kernel + i00*ne02); + lm_ggml_vec_dot_f16(ne02, &v, 0, + (lm_ggml_fp16_t *) wdata_src + i1n, 0, + (lm_ggml_fp16_t *) wdata_kernel + i00*ne02, 0, 1); dst_data[i10*s0 + i00] += v; } } @@ -12130,9 +12775,11 @@ static void lm_ggml_compute_forward_conv_transpose_1d_f16_f32( static void lm_ggml_compute_forward_conv_transpose_1d_f32( const struct lm_ggml_compute_params * params, - const struct lm_ggml_tensor * src0, - const struct lm_ggml_tensor * src1, struct lm_ggml_tensor * dst) { + + const struct lm_ggml_tensor * src0 = dst->src[0]; + const struct lm_ggml_tensor * src1 = dst->src[1]; + LM_GGML_ASSERT(src0->type == LM_GGML_TYPE_F32); LM_GGML_ASSERT(src1->type == LM_GGML_TYPE_F32); LM_GGML_ASSERT( dst->type == LM_GGML_TYPE_F32); @@ -12151,6 +12798,9 @@ static void lm_ggml_compute_forward_conv_transpose_1d_f32( LM_GGML_ASSERT(nb10 == sizeof(float)); if (params->type == LM_GGML_TASK_INIT) { + if (ith != 0) { + return; + } memset(params->wdata, 0, params->wsize); // prepare kernel data (src0) from (K x Cout x Cin) to (Cin x K x Cout) @@ -12213,9 +12863,9 @@ static void lm_ggml_compute_forward_conv_transpose_1d_f32( const int i1n = i10*ne11; for (int i00 = 0; i00 < ne00; i00++) { float v = 0; - lm_ggml_vec_dot_f32(ne02, &v, - wdata_src + i1n, - wdata_kernel + i00*ne02); + lm_ggml_vec_dot_f32(ne02, &v, 0, + wdata_src + i1n, 0, + wdata_kernel + i00*ne02, 0, 1); dst_data[i10*s0 + i00] += v; } } @@ -12224,17 +12874,18 @@ static void lm_ggml_compute_forward_conv_transpose_1d_f32( static void lm_ggml_compute_forward_conv_transpose_1d( const struct lm_ggml_compute_params * params, - const struct lm_ggml_tensor * src0, - const struct lm_ggml_tensor * src1, struct 
lm_ggml_tensor * dst) { + + const struct lm_ggml_tensor * src0 = dst->src[0]; + switch (src0->type) { case LM_GGML_TYPE_F16: { - lm_ggml_compute_forward_conv_transpose_1d_f16_f32(params, src0, src1, dst); + lm_ggml_compute_forward_conv_transpose_1d_f16_f32(params, dst); } break; case LM_GGML_TYPE_F32: { - lm_ggml_compute_forward_conv_transpose_1d_f32(params, src0, src1, dst); + lm_ggml_compute_forward_conv_transpose_1d_f32(params, dst); } break; default: { @@ -12243,14 +12894,104 @@ static void lm_ggml_compute_forward_conv_transpose_1d( } } +// src0: kernel [OC, IC, KH, KW] +// src1: image [N, IC, IH, IW] +// dst: result [N, OH, OW, IC*KH*KW] +static void lm_ggml_compute_forward_im2col_f32( + const struct lm_ggml_compute_params * params, + struct lm_ggml_tensor * dst) { + + const struct lm_ggml_tensor * src0 = dst->src[0]; + const struct lm_ggml_tensor * src1 = dst->src[1]; + + LM_GGML_ASSERT(src0->type == LM_GGML_TYPE_F16); + LM_GGML_ASSERT(src1->type == LM_GGML_TYPE_F32); + LM_GGML_ASSERT( dst->type == LM_GGML_TYPE_F32); + + int64_t t0 = lm_ggml_perf_time_us(); + UNUSED(t0); + + LM_GGML_TENSOR_BINARY_OP_LOCALS; + + const int32_t s0 = ((const int32_t *)(dst->op_params))[0]; + const int32_t s1 = ((const int32_t *)(dst->op_params))[1]; + const int32_t p0 = ((const int32_t *)(dst->op_params))[2]; + const int32_t p1 = ((const int32_t *)(dst->op_params))[3]; + const int32_t d0 = ((const int32_t *)(dst->op_params))[4]; + const int32_t d1 = ((const int32_t *)(dst->op_params))[5]; + const bool is_2D = ((const int32_t *)(dst->op_params))[6] == 1; + + const int ith = params->ith; + const int nth = params->nth; + + const int64_t N = is_2D ? ne13 : ne12; + const int64_t IC = is_2D ? ne12 : ne11; + const int64_t IH = is_2D ? ne11 : 1; + const int64_t IW = ne10; + + const int64_t KH = is_2D ? ne01 : 1; + const int64_t KW = ne00; + + const int64_t OH = is_2D ? ne2 : 1; + const int64_t OW = ne1; + + int ofs0 = is_2D ? nb13 : nb12; + int ofs1 = is_2D ? 
nb12 : nb11; + + LM_GGML_ASSERT(nb00 == sizeof(lm_ggml_fp16_t)); + LM_GGML_ASSERT(nb10 == sizeof(float)); + + if (params->type == LM_GGML_TASK_INIT) { + return; + } + + if (params->type == LM_GGML_TASK_FINALIZE) { + return; + } + + // im2col: [N, IC, IH, IW] => [N, OH, OW, IC*KH*KW] + { + float * const wdata = (float *) dst->data; + + for (int64_t in = 0; in < N; in++) { + for (int64_t ioh = 0; ioh < OH; ioh++) { // 1 + for (int64_t iow = 0; iow < OW; iow++) { + for (int64_t iic = ith; iic < IC; iic += nth) { + + // micro kernel + float * dst_data = wdata + (in*OH*OW + ioh*OW + iow)*(IC*KH*KW); // [IC, KH, KW] + const float * const src_data = (float *)((char *) src1->data + in*ofs0 + iic*ofs1); // [IH, IW] + + for (int64_t ikh = 0; ikh < KH; ikh++) { // 1 + for (int64_t ikw = 0; ikw < KW; ikw++) { + const int64_t iiw = iow*s0 + ikw*d0 - p0; + const int64_t iih = ioh*s1 + ikh*d1 - p1; + + if (iih < 0 || iih >= IH || iiw < 0 || iiw >= IW) { + dst_data[iic*(KH*KW) + ikh*KW + ikw] = 0; + } else { + dst_data[iic*(KH*KW) + ikh*KW + ikw] = (src_data[iih*IW + iiw]); + } + } + } + } + } + } + } + } +} + + // src0: kernel [OC, IC, KH, KW] // src1: image [N, IC, IH, IW] // dst: result [N, OH, OW, IC*KH*KW] static void lm_ggml_compute_forward_im2col_f16( const struct lm_ggml_compute_params * params, - const struct lm_ggml_tensor * src0, - const struct lm_ggml_tensor * src1, struct lm_ggml_tensor * dst) { + + const struct lm_ggml_tensor * src0 = dst->src[0]; + const struct lm_ggml_tensor * src1 = dst->src[1]; + LM_GGML_ASSERT(src0->type == LM_GGML_TYPE_F16); LM_GGML_ASSERT(src1->type == LM_GGML_TYPE_F32); LM_GGML_ASSERT( dst->type == LM_GGML_TYPE_F16); @@ -12330,17 +13071,15 @@ static void lm_ggml_compute_forward_im2col_f16( static void lm_ggml_compute_forward_im2col( const struct lm_ggml_compute_params * params, - const struct lm_ggml_tensor * src0, - const struct lm_ggml_tensor * src1, struct lm_ggml_tensor * dst) { - switch (src0->type) { + switch (dst->type) { case LM_GGML_TYPE_F16: { - lm_ggml_compute_forward_im2col_f16(params, src0, src1, dst); + lm_ggml_compute_forward_im2col_f16(params, dst); } break; case LM_GGML_TYPE_F32: { - LM_GGML_ASSERT(false); + lm_ggml_compute_forward_im2col_f32(params, dst); } break; default: { @@ -12349,13 +13088,16 @@ static void lm_ggml_compute_forward_im2col( } } + // lm_ggml_compute_forward_conv_transpose_2d static void lm_ggml_compute_forward_conv_transpose_2d( const struct lm_ggml_compute_params * params, - const struct lm_ggml_tensor * src0, - const struct lm_ggml_tensor * src1, struct lm_ggml_tensor * dst) { + + const struct lm_ggml_tensor * src0 = dst->src[0]; + const struct lm_ggml_tensor * src1 = dst->src[1]; + LM_GGML_ASSERT(src0->type == LM_GGML_TYPE_F16); LM_GGML_ASSERT(src1->type == LM_GGML_TYPE_F32); LM_GGML_ASSERT( dst->type == LM_GGML_TYPE_F32); @@ -12374,6 +13116,9 @@ static void lm_ggml_compute_forward_conv_transpose_2d( LM_GGML_ASSERT(nb10 == sizeof(float)); if (params->type == LM_GGML_TASK_INIT) { + if (ith != 0) { + return; + } memset(params->wdata, 0, params->wsize); // permute kernel data (src0) from (Kw x Kh x Cout x Cin) to (Cin x Kw x Kh x Cout) @@ -12440,9 +13185,9 @@ static void lm_ggml_compute_forward_conv_transpose_2d( for (int i01 = 0; i01 < ne01; i01++) { for (int i00 = 0; i00 < ne00; i00++) { float v = 0; - lm_ggml_vec_dot_f16(ne03, &v, - wdata_src + i1n, - wdata_kernel + i01*ne00*ne03 + i00*ne03); + lm_ggml_vec_dot_f16(ne03, &v, 0, + wdata_src + i1n, 0, + wdata_kernel + i01*ne00*ne03 + i00*ne03, 0, 1); dst_data[(i11*stride + 
i01)*ne0 + i10*stride + i00] += v; } } @@ -12456,9 +13201,11 @@ static void lm_ggml_compute_forward_conv_transpose_2d( static void lm_ggml_compute_forward_pool_1d_sk_p0( const struct lm_ggml_compute_params * params, const enum lm_ggml_op_pool op, - const struct lm_ggml_tensor * src, const int k, struct lm_ggml_tensor * dst) { + + const struct lm_ggml_tensor * src = dst->src[0]; + assert(src->type == LM_GGML_TYPE_F32); assert(params->ith == 0); @@ -12507,7 +13254,6 @@ static void lm_ggml_compute_forward_pool_1d_sk_p0( static void lm_ggml_compute_forward_pool_1d( const struct lm_ggml_compute_params * params, - const struct lm_ggml_tensor * src0, struct lm_ggml_tensor * dst) { const int32_t * opts = (const int32_t *)dst->op_params; @@ -12518,17 +13264,19 @@ static void lm_ggml_compute_forward_pool_1d( LM_GGML_ASSERT(p0 == 0); // padding not supported LM_GGML_ASSERT(k0 == s0); // only s = k supported - lm_ggml_compute_forward_pool_1d_sk_p0(params, op, src0, k0, dst); + lm_ggml_compute_forward_pool_1d_sk_p0(params, op, k0, dst); } // lm_ggml_compute_forward_pool_2d static void lm_ggml_compute_forward_pool_2d( const struct lm_ggml_compute_params * params, - const struct lm_ggml_tensor * src, struct lm_ggml_tensor * dst) { - assert(src->type == LM_GGML_TYPE_F32); - assert(params->ith == 0); + + const struct lm_ggml_tensor * src = dst->src[0]; + + LM_GGML_ASSERT(src->type == LM_GGML_TYPE_F32); + LM_GGML_ASSERT(params->ith == 0); if (params->type == LM_GGML_TASK_INIT || params->type == LM_GGML_TASK_FINALIZE) { return; @@ -12599,9 +13347,10 @@ static void lm_ggml_compute_forward_pool_2d( static void lm_ggml_compute_forward_upscale_f32( const struct lm_ggml_compute_params * params, - const struct lm_ggml_tensor * src0, struct lm_ggml_tensor * dst) { + const struct lm_ggml_tensor * src0 = dst->src[0]; + if (params->type == LM_GGML_TASK_INIT || params->type == LM_GGML_TASK_FINALIZE) { return; } @@ -12638,12 +13387,14 @@ static void lm_ggml_compute_forward_upscale_f32( static void lm_ggml_compute_forward_upscale( const struct lm_ggml_compute_params * params, - const struct lm_ggml_tensor * src0, struct lm_ggml_tensor * dst) { + + const struct lm_ggml_tensor * src0 = dst->src[0]; + switch (src0->type) { case LM_GGML_TYPE_F32: { - lm_ggml_compute_forward_upscale_f32(params, src0, dst); + lm_ggml_compute_forward_upscale_f32(params, dst); } break; default: { @@ -12656,9 +13407,10 @@ static void lm_ggml_compute_forward_upscale( static void lm_ggml_compute_forward_pad_f32( const struct lm_ggml_compute_params * params, - const struct lm_ggml_tensor * src0, struct lm_ggml_tensor * dst) { + const struct lm_ggml_tensor * src0 = dst->src[0]; + if (params->type == LM_GGML_TASK_INIT || params->type == LM_GGML_TASK_FINALIZE) { return; } @@ -12696,12 +13448,14 @@ static void lm_ggml_compute_forward_pad_f32( static void lm_ggml_compute_forward_pad( const struct lm_ggml_compute_params * params, - const struct lm_ggml_tensor * src0, struct lm_ggml_tensor * dst) { + + const struct lm_ggml_tensor * src0 = dst->src[0]; + switch (src0->type) { case LM_GGML_TYPE_F32: { - lm_ggml_compute_forward_pad_f32(params, src0, dst); + lm_ggml_compute_forward_pad_f32(params, dst); } break; default: { @@ -12714,9 +13468,10 @@ static void lm_ggml_compute_forward_pad( static void lm_ggml_compute_forward_argsort_f32( const struct lm_ggml_compute_params * params, - const struct lm_ggml_tensor * src0, struct lm_ggml_tensor * dst) { + const struct lm_ggml_tensor * src0 = dst->src[0]; + if (params->type == LM_GGML_TASK_INIT || params->type == 
LM_GGML_TASK_FINALIZE) { return; } @@ -12756,13 +13511,14 @@ static void lm_ggml_compute_forward_argsort_f32( static void lm_ggml_compute_forward_argsort( const struct lm_ggml_compute_params * params, - const struct lm_ggml_tensor * src0, struct lm_ggml_tensor * dst) { + const struct lm_ggml_tensor * src0 = dst->src[0]; + switch (src0->type) { case LM_GGML_TYPE_F32: { - lm_ggml_compute_forward_argsort_f32(params, src0, dst); + lm_ggml_compute_forward_argsort_f32(params, dst); } break; default: { @@ -12775,11 +13531,13 @@ static void lm_ggml_compute_forward_argsort( static void lm_ggml_compute_forward_flash_attn_f32( const struct lm_ggml_compute_params * params, - const struct lm_ggml_tensor * q, - const struct lm_ggml_tensor * k, - const struct lm_ggml_tensor * v, const bool masked, struct lm_ggml_tensor * dst) { + + const struct lm_ggml_tensor * q = dst->src[0]; + const struct lm_ggml_tensor * k = dst->src[1]; + const struct lm_ggml_tensor * v = dst->src[2]; + int64_t t0 = lm_ggml_perf_time_us(); UNUSED(t0); @@ -12871,9 +13629,9 @@ static void lm_ggml_compute_forward_flash_attn_f32( const int i1 = ik1; lm_ggml_vec_dot_f32(neq0, - S + i1, - (float *) ((char *) k->data + (ik1*nbk1 + ik2*nbk2 + ik3*nbk3)), - (float *) ((char *) q->data + (iq1*nbq1 + iq2*nbq2 + iq3*nbq3))); + S + i1, 0, + (float *) ((char *) k->data + (ik1*nbk1 + ik2*nbk2 + ik3*nbk3)), 0, + (float *) ((char *) q->data + (iq1*nbq1 + iq2*nbq2 + iq3*nbq3)), 0, 1); } // scale @@ -12956,20 +13714,22 @@ static void lm_ggml_compute_forward_flash_attn_f32( const int iv3 = iq3; lm_ggml_vec_dot_f32(masked_begin, - (float *) ((char *) dst->data + (ic*nb0 + i1*nb1 + i2*nb2 + i3*nb3)), - (float *) ((char *) v->data + ( ic*nbv1 + iv2*nbv2 + iv3*nbv3)), - S); + (float *) ((char *) dst->data + (ic*nb0 + i1*nb1 + i2*nb2 + i3*nb3)), 0, + (float *) ((char *) v->data + ( ic*nbv1 + iv2*nbv2 + iv3*nbv3)), 0, + S, 0, 1); } } } static void lm_ggml_compute_forward_flash_attn_f16( const struct lm_ggml_compute_params * params, - const struct lm_ggml_tensor * q, - const struct lm_ggml_tensor * k, - const struct lm_ggml_tensor * v, const bool masked, struct lm_ggml_tensor * dst) { + + const struct lm_ggml_tensor * q = dst->src[0]; + const struct lm_ggml_tensor * k = dst->src[1]; + const struct lm_ggml_tensor * v = dst->src[2]; + int64_t t0 = lm_ggml_perf_time_us(); UNUSED(t0); @@ -13061,9 +13821,9 @@ static void lm_ggml_compute_forward_flash_attn_f16( const int i1 = ik1; lm_ggml_vec_dot_f16(neq0, - S + i1, - (lm_ggml_fp16_t *) ((char *) k->data + (ik1*nbk1 + ik2*nbk2 + ik3*nbk3)), - (lm_ggml_fp16_t *) ((char *) q->data + (iq1*nbq1 + iq2*nbq2 + iq3*nbq3))); + S + i1, 0, + (lm_ggml_fp16_t *) ((char *) k->data + (ik1*nbk1 + ik2*nbk2 + ik3*nbk3)), 0, + (lm_ggml_fp16_t *) ((char *) q->data + (iq1*nbq1 + iq2*nbq2 + iq3*nbq3)), 0, 1); } } else { for (int64_t ic = 0; ic < nek1; ic += LM_GGML_VEC_DOT_UNROLL) { @@ -13165,9 +13925,9 @@ static void lm_ggml_compute_forward_flash_attn_f16( const int iv3 = iq3; lm_ggml_vec_dot_f16(nev0, - (float *) ((char *) dst->data + (ic*nb0 + i1*nb1 + i2*nb2 + i3*nb3)), - (lm_ggml_fp16_t *) ((char *) v->data + ( ic*nbv1 + iv2*nbv2 + iv3*nbv3)), - S16); + (float *) ((char *) dst->data + (ic*nb0 + i1*nb1 + i2*nb2 + i3*nb3)), 0, + (lm_ggml_fp16_t *) ((char *) v->data + ( ic*nbv1 + iv2*nbv2 + iv3*nbv3)), 0, + S16, 0, 1); } } else { for (int64_t ic = 0; ic < nev1; ic += LM_GGML_VEC_DOT_UNROLL) { @@ -13191,19 +13951,19 @@ static void lm_ggml_compute_forward_flash_attn_f16( static void lm_ggml_compute_forward_flash_attn( const struct 
lm_ggml_compute_params * params, - const struct lm_ggml_tensor * q, - const struct lm_ggml_tensor * k, - const struct lm_ggml_tensor * v, const bool masked, struct lm_ggml_tensor * dst) { + + const struct lm_ggml_tensor * q = dst->src[0]; + switch (q->type) { case LM_GGML_TYPE_F16: { - lm_ggml_compute_forward_flash_attn_f16(params, q, k, v, masked, dst); + lm_ggml_compute_forward_flash_attn_f16(params, masked, dst); } break; case LM_GGML_TYPE_F32: { - lm_ggml_compute_forward_flash_attn_f32(params, q, k, v, masked, dst); + lm_ggml_compute_forward_flash_attn_f32(params, masked, dst); } break; default: { @@ -13216,12 +13976,14 @@ static void lm_ggml_compute_forward_flash_attn( static void lm_ggml_compute_forward_flash_ff_f16( const struct lm_ggml_compute_params * params, - const struct lm_ggml_tensor * a, // F16 - const struct lm_ggml_tensor * b0, // F16 fc_w - const struct lm_ggml_tensor * b1, // F32 fc_b - const struct lm_ggml_tensor * c0, // F16 proj_w - const struct lm_ggml_tensor * c1, // F32 proj_b struct lm_ggml_tensor * dst) { + + const struct lm_ggml_tensor * a = dst->src[0]; // F16 + const struct lm_ggml_tensor * b0 = dst->src[1]; // F16 fc_w + const struct lm_ggml_tensor * b1 = dst->src[2]; // F32 fc_b + const struct lm_ggml_tensor * c0 = dst->src[3]; // F16 proj_w + const struct lm_ggml_tensor * c1 = dst->src[4]; // F32 proj_b + int64_t t0 = lm_ggml_perf_time_us(); UNUSED(t0); @@ -13309,9 +14071,9 @@ static void lm_ggml_compute_forward_flash_ff_f16( const int i1 = ib01; lm_ggml_vec_dot_f16(nea0, - S + i1, - (lm_ggml_fp16_t *) ((char *) b0->data + (ib01*nbb01 + ib02*nbb02 + ib03*nbb03)), - (lm_ggml_fp16_t *) ((char *) a->data + ( ia1*nba1 + ia2*nba2 + ia3*nba3))); + S + i1, 0, + (lm_ggml_fp16_t *) ((char *) b0->data + (ib01*nbb01 + ib02*nbb02 + ib03*nbb03)), 0, + (lm_ggml_fp16_t *) ((char *) a->data + ( ia1*nba1 + ia2*nba2 + ia3*nba3)), 0, 1); } lm_ggml_vec_add_f32(neb01, S, S, (float *) b1->data); @@ -13334,9 +14096,9 @@ static void lm_ggml_compute_forward_flash_ff_f16( for (int64_t ic = 0; ic < nec01; ++ic) { lm_ggml_vec_dot_f16(neb01, - (float *) ((char *) dst->data + (ic*nb0 + i1*nb1 + i2*nb2 + i3*nb3)), - (lm_ggml_fp16_t *) ((char *) c0->data + ( ic*nbc01 + i2*nbc02 + i3*nbc03)), - S16); + (float *) ((char *) dst->data + (ic*nb0 + i1*nb1 + i2*nb2 + i3*nb3)), 0, + (lm_ggml_fp16_t *) ((char *) c0->data + ( ic*nbc01 + i2*nbc02 + i3*nbc03)), 0, + S16, 0, 1); } lm_ggml_vec_add_f32(nec01, @@ -13349,16 +14111,14 @@ static void lm_ggml_compute_forward_flash_ff_f16( static void lm_ggml_compute_forward_flash_ff( const struct lm_ggml_compute_params * params, - const struct lm_ggml_tensor * a, - const struct lm_ggml_tensor * b0, - const struct lm_ggml_tensor * b1, - const struct lm_ggml_tensor * c0, - const struct lm_ggml_tensor * c1, struct lm_ggml_tensor * dst) { + + const struct lm_ggml_tensor * b0 = dst->src[1]; + switch (b0->type) { case LM_GGML_TYPE_F16: { - lm_ggml_compute_forward_flash_ff_f16(params, a, b0, b1, c0, c1, dst); + lm_ggml_compute_forward_flash_ff_f16(params, dst); } break; case LM_GGML_TYPE_F32: { @@ -13375,12 +14135,14 @@ static void lm_ggml_compute_forward_flash_ff( static void lm_ggml_compute_forward_flash_attn_back_f32( const struct lm_ggml_compute_params * params, - const struct lm_ggml_tensor * q, - const struct lm_ggml_tensor * k, - const struct lm_ggml_tensor * v, - const struct lm_ggml_tensor * d, const bool masked, struct lm_ggml_tensor * dst) { + + const struct lm_ggml_tensor * q = dst->src[0]; + const struct lm_ggml_tensor * k = dst->src[1]; + const struct 
lm_ggml_tensor * v = dst->src[2]; + const struct lm_ggml_tensor * d = dst->src[3]; + int64_t t0 = lm_ggml_perf_time_us(); UNUSED(t0); @@ -13523,9 +14285,9 @@ static void lm_ggml_compute_forward_flash_attn_back_f32( const int i1 = ik1; lm_ggml_vec_dot_f32(neq0, - S + i1, - (float *) ((char *) k->data + (ik1*nbk1 + ik2*nbk2 + ik3*nbk3)), - (float *) ((char *) q->data + (iq1*nbq1 + iq2*nbq2 + iq3*nbq3))); + S + i1, 0, + (float *) ((char *) k->data + (ik1*nbk1 + ik2*nbk2 + ik3*nbk3)), 0, + (float *) ((char *) q->data + (iq1*nbq1 + iq2*nbq2 + iq3*nbq3)), 0, 1); } // scale @@ -13670,7 +14432,7 @@ static void lm_ggml_compute_forward_flash_attn_back_f32( // S = SM * (S - dot(SM, S)) float dot_SM_gradSM = 0; - lm_ggml_vec_dot_f32 (masked_begin, &dot_SM_gradSM, SM, S); + lm_ggml_vec_dot_f32 (masked_begin, &dot_SM_gradSM, 0, SM, 0, S, 0, 1); lm_ggml_vec_acc1_f32(M, S, -dot_SM_gradSM); lm_ggml_vec_mul_f32 (masked_begin, S, S, SM); @@ -13728,16 +14490,15 @@ static void lm_ggml_compute_forward_flash_attn_back_f32( static void lm_ggml_compute_forward_flash_attn_back( const struct lm_ggml_compute_params * params, - const struct lm_ggml_tensor * q, - const struct lm_ggml_tensor * k, - const struct lm_ggml_tensor * v, - const struct lm_ggml_tensor * d, const bool masked, struct lm_ggml_tensor * dst) { + + const struct lm_ggml_tensor * q = dst->src[0]; + switch (q->type) { case LM_GGML_TYPE_F32: { - lm_ggml_compute_forward_flash_attn_back_f32(params, q, k, v, d, masked, dst); + lm_ggml_compute_forward_flash_attn_back_f32(params, masked, dst); } break; default: { @@ -13750,8 +14511,10 @@ static void lm_ggml_compute_forward_flash_attn_back( static void lm_ggml_compute_forward_win_part_f32( const struct lm_ggml_compute_params * params, - const struct lm_ggml_tensor * src0, struct lm_ggml_tensor * dst) { + + const struct lm_ggml_tensor * src0 = dst->src[0]; + if (params->type == LM_GGML_TASK_INIT || params->type == LM_GGML_TASK_FINALIZE) { return; } @@ -13794,12 +14557,14 @@ static void lm_ggml_compute_forward_win_part_f32( static void lm_ggml_compute_forward_win_part( const struct lm_ggml_compute_params * params, - const struct lm_ggml_tensor * src0, struct lm_ggml_tensor * dst) { + + const struct lm_ggml_tensor * src0 = dst->src[0]; + switch (src0->type) { case LM_GGML_TYPE_F32: { - lm_ggml_compute_forward_win_part_f32(params, src0, dst); + lm_ggml_compute_forward_win_part_f32(params, dst); } break; default: { @@ -13812,8 +14577,10 @@ static void lm_ggml_compute_forward_win_part( static void lm_ggml_compute_forward_win_unpart_f32( const struct lm_ggml_compute_params * params, - const struct lm_ggml_tensor * src0, struct lm_ggml_tensor * dst) { + + const struct lm_ggml_tensor * src0 = dst->src[0]; + if (params->type == LM_GGML_TASK_INIT || params->type == LM_GGML_TASK_FINALIZE) { return; } @@ -13854,12 +14621,14 @@ static void lm_ggml_compute_forward_win_unpart_f32( static void lm_ggml_compute_forward_win_unpart( const struct lm_ggml_compute_params * params, - const struct lm_ggml_tensor * src0, struct lm_ggml_tensor * dst) { + + const struct lm_ggml_tensor * src0 = dst->src[0]; + switch (src0->type) { case LM_GGML_TYPE_F32: { - lm_ggml_compute_forward_win_unpart_f32(params, src0, dst); + lm_ggml_compute_forward_win_unpart_f32(params, dst); } break; default: { @@ -13872,50 +14641,58 @@ static void lm_ggml_compute_forward_win_unpart( static void lm_ggml_compute_forward_unary( const struct lm_ggml_compute_params * params, - const struct lm_ggml_tensor * src0, struct lm_ggml_tensor * dst) { + const enum 
lm_ggml_unary_op op = lm_ggml_get_unary_op(dst); switch (op) { case LM_GGML_UNARY_OP_ABS: { - lm_ggml_compute_forward_abs(params, src0, dst); + lm_ggml_compute_forward_abs(params, dst); } break; case LM_GGML_UNARY_OP_SGN: { - lm_ggml_compute_forward_sgn(params, src0, dst); + lm_ggml_compute_forward_sgn(params, dst); } break; case LM_GGML_UNARY_OP_NEG: { - lm_ggml_compute_forward_neg(params, src0, dst); + lm_ggml_compute_forward_neg(params, dst); } break; case LM_GGML_UNARY_OP_STEP: { - lm_ggml_compute_forward_step(params, src0, dst); + lm_ggml_compute_forward_step(params, dst); } break; case LM_GGML_UNARY_OP_TANH: { - lm_ggml_compute_forward_tanh(params, src0, dst); + lm_ggml_compute_forward_tanh(params, dst); } break; case LM_GGML_UNARY_OP_ELU: { - lm_ggml_compute_forward_elu(params, src0, dst); + lm_ggml_compute_forward_elu(params, dst); } break; case LM_GGML_UNARY_OP_RELU: { - lm_ggml_compute_forward_relu(params, src0, dst); + lm_ggml_compute_forward_relu(params, dst); } break; case LM_GGML_UNARY_OP_GELU: { - lm_ggml_compute_forward_gelu(params, src0, dst); + lm_ggml_compute_forward_gelu(params, dst); } break; case LM_GGML_UNARY_OP_GELU_QUICK: { - lm_ggml_compute_forward_gelu_quick(params, src0, dst); + lm_ggml_compute_forward_gelu_quick(params, dst); } break; case LM_GGML_UNARY_OP_SILU: { - lm_ggml_compute_forward_silu(params, src0, dst); + lm_ggml_compute_forward_silu(params, dst); + } break; + case LM_GGML_UNARY_OP_HARDSWISH: + { + lm_ggml_compute_forward_hardswish(params, dst); + } break; + case LM_GGML_UNARY_OP_HARDSIGMOID: + { + lm_ggml_compute_forward_hardsigmoid(params, dst); } break; default: { @@ -13928,8 +14705,10 @@ static void lm_ggml_compute_forward_unary( static void lm_ggml_compute_forward_get_rel_pos_f16( const struct lm_ggml_compute_params * params, - const struct lm_ggml_tensor * src0, struct lm_ggml_tensor * dst) { + + const struct lm_ggml_tensor * src0 = dst->src[0]; + if (params->type == LM_GGML_TASK_INIT || params->type == LM_GGML_TASK_FINALIZE) { return; } @@ -13955,12 +14734,14 @@ static void lm_ggml_compute_forward_get_rel_pos_f16( static void lm_ggml_compute_forward_get_rel_pos( const struct lm_ggml_compute_params * params, - const struct lm_ggml_tensor * src0, struct lm_ggml_tensor * dst) { + + const struct lm_ggml_tensor * src0 = dst->src[0]; + switch (src0->type) { case LM_GGML_TYPE_F16: { - lm_ggml_compute_forward_get_rel_pos_f16(params, src0, dst); + lm_ggml_compute_forward_get_rel_pos_f16(params, dst); } break; default: { @@ -13973,13 +14754,17 @@ static void lm_ggml_compute_forward_get_rel_pos( static void lm_ggml_compute_forward_add_rel_pos_f32( const struct lm_ggml_compute_params * params, - const struct lm_ggml_tensor * src0, - const struct lm_ggml_tensor * src1, - const struct lm_ggml_tensor * src2, struct lm_ggml_tensor * dst) { + const struct lm_ggml_tensor * src0 = dst->src[0]; + const struct lm_ggml_tensor * src1 = dst->src[1]; + const struct lm_ggml_tensor * src2 = dst->src[2]; + const bool inplace = (bool) ((int32_t *) dst->op_params)[0]; if (!inplace && params->type == LM_GGML_TASK_INIT) { + if (params->ith != 0) { + return; + } memcpy((char *) dst->data, (char *) src0->data, lm_ggml_nbytes(dst)); return; } @@ -14038,14 +14823,14 @@ static void lm_ggml_compute_forward_add_rel_pos_f32( static void lm_ggml_compute_forward_add_rel_pos( const struct lm_ggml_compute_params * params, - const struct lm_ggml_tensor * src0, - const struct lm_ggml_tensor * src1, - const struct lm_ggml_tensor * src2, struct lm_ggml_tensor * dst) { + + const struct 
lm_ggml_tensor * src0 = dst->src[0]; + switch (src0->type) { case LM_GGML_TYPE_F32: { - lm_ggml_compute_forward_add_rel_pos_f32(params, src0, src1, src2, dst); + lm_ggml_compute_forward_add_rel_pos_f32(params, dst); } break; default: { @@ -14058,9 +14843,11 @@ static void lm_ggml_compute_forward_add_rel_pos( static void lm_ggml_compute_forward_map_unary_f32( const struct lm_ggml_compute_params * params, - const struct lm_ggml_tensor * src0, struct lm_ggml_tensor * dst, const lm_ggml_unary_op_f32_t fun) { + + const struct lm_ggml_tensor * src0 = dst->src[0]; + LM_GGML_ASSERT(lm_ggml_are_same_shape(src0, dst)); if (params->type == LM_GGML_TASK_INIT || params->type == LM_GGML_TASK_FINALIZE) { @@ -14082,13 +14869,15 @@ static void lm_ggml_compute_forward_map_unary_f32( static void lm_ggml_compute_forward_map_unary( const struct lm_ggml_compute_params * params, - const struct lm_ggml_tensor * src0, struct lm_ggml_tensor * dst, const lm_ggml_unary_op_f32_t fun) { + + const struct lm_ggml_tensor * src0 = dst->src[0]; + switch (src0->type) { case LM_GGML_TYPE_F32: { - lm_ggml_compute_forward_map_unary_f32(params, src0, dst, fun); + lm_ggml_compute_forward_map_unary_f32(params, dst, fun); } break; default: { @@ -14101,10 +14890,12 @@ static void lm_ggml_compute_forward_map_unary( static void lm_ggml_compute_forward_map_binary_f32( const struct lm_ggml_compute_params * params, - const struct lm_ggml_tensor * src0, - const struct lm_ggml_tensor * src1, struct lm_ggml_tensor * dst, const lm_ggml_binary_op_f32_t fun) { + + const struct lm_ggml_tensor * src0 = dst->src[0]; + const struct lm_ggml_tensor * src1 = dst->src[1]; + assert(params->ith == 0); assert(lm_ggml_are_same_shape(src0, src1) && lm_ggml_are_same_shape(src0, dst)); @@ -14129,14 +14920,15 @@ static void lm_ggml_compute_forward_map_binary_f32( static void lm_ggml_compute_forward_map_binary( const struct lm_ggml_compute_params * params, - const struct lm_ggml_tensor * src0, - const struct lm_ggml_tensor * src1, struct lm_ggml_tensor * dst, const lm_ggml_binary_op_f32_t fun) { + + const struct lm_ggml_tensor * src0 = dst->src[0]; + switch (src0->type) { case LM_GGML_TYPE_F32: { - lm_ggml_compute_forward_map_binary_f32(params, src0, src1, dst, fun); + lm_ggml_compute_forward_map_binary_f32(params, dst, fun); } break; default: { @@ -14149,9 +14941,11 @@ static void lm_ggml_compute_forward_map_binary( static void lm_ggml_compute_forward_map_custom1_f32( const struct lm_ggml_compute_params * params, - const struct lm_ggml_tensor * a, struct lm_ggml_tensor * dst, const lm_ggml_custom1_op_f32_t fun) { + + const struct lm_ggml_tensor * a = dst->src[0]; + assert(params->ith == 0); if (params->type == LM_GGML_TASK_INIT || params->type == LM_GGML_TASK_FINALIZE) { @@ -14165,10 +14959,12 @@ static void lm_ggml_compute_forward_map_custom1_f32( static void lm_ggml_compute_forward_map_custom2_f32( const struct lm_ggml_compute_params * params, - const struct lm_ggml_tensor * a, - const struct lm_ggml_tensor * b, struct lm_ggml_tensor * dst, const lm_ggml_custom2_op_f32_t fun) { + + const struct lm_ggml_tensor * a = dst->src[0]; + const struct lm_ggml_tensor * b = dst->src[1]; + assert(params->ith == 0); if (params->type == LM_GGML_TASK_INIT || params->type == LM_GGML_TASK_FINALIZE) { @@ -14182,11 +14978,13 @@ static void lm_ggml_compute_forward_map_custom2_f32( static void lm_ggml_compute_forward_map_custom3_f32( const struct lm_ggml_compute_params * params, - const struct lm_ggml_tensor * a, - const struct lm_ggml_tensor * b, - const struct lm_ggml_tensor 
* c, struct lm_ggml_tensor * dst, const lm_ggml_custom3_op_f32_t fun) { + + const struct lm_ggml_tensor * a = dst->src[0]; + const struct lm_ggml_tensor * b = dst->src[1]; + const struct lm_ggml_tensor * c = dst->src[1]; + assert(params->ith == 0); if (params->type == LM_GGML_TASK_INIT || params->type == LM_GGML_TASK_FINALIZE) { @@ -14200,8 +14998,10 @@ static void lm_ggml_compute_forward_map_custom3_f32( static void lm_ggml_compute_forward_map_custom1( const struct lm_ggml_compute_params * params, - const struct lm_ggml_tensor * a, struct lm_ggml_tensor * dst) { + + const struct lm_ggml_tensor * a = dst->src[0]; + if (params->type == LM_GGML_TASK_INIT || params->type == LM_GGML_TASK_FINALIZE) { return; } @@ -14215,9 +15015,11 @@ static void lm_ggml_compute_forward_map_custom1( static void lm_ggml_compute_forward_map_custom2( const struct lm_ggml_compute_params * params, - const struct lm_ggml_tensor * a, - const struct lm_ggml_tensor * b, struct lm_ggml_tensor * dst) { + + const struct lm_ggml_tensor * a = dst->src[0]; + const struct lm_ggml_tensor * b = dst->src[1]; + if (params->type == LM_GGML_TASK_INIT || params->type == LM_GGML_TASK_FINALIZE) { return; } @@ -14231,10 +15033,12 @@ static void lm_ggml_compute_forward_map_custom2( static void lm_ggml_compute_forward_map_custom3( const struct lm_ggml_compute_params * params, - const struct lm_ggml_tensor * a, - const struct lm_ggml_tensor * b, - const struct lm_ggml_tensor * c, struct lm_ggml_tensor * dst) { + + const struct lm_ggml_tensor * a = dst->src[0]; + const struct lm_ggml_tensor * b = dst->src[1]; + const struct lm_ggml_tensor * c = dst->src[2]; + if (params->type == LM_GGML_TASK_INIT || params->type == LM_GGML_TASK_FINALIZE) { return; } @@ -14248,9 +15052,11 @@ static void lm_ggml_compute_forward_map_custom3( static void lm_ggml_compute_forward_cross_entropy_loss_f32( const struct lm_ggml_compute_params * params, - const struct lm_ggml_tensor * src0, - const struct lm_ggml_tensor * src1, struct lm_ggml_tensor * dst) { + + const struct lm_ggml_tensor * src0 = dst->src[0]; + const struct lm_ggml_tensor * src1 = dst->src[1]; + LM_GGML_ASSERT(lm_ggml_is_contiguous(src0)); LM_GGML_ASSERT(lm_ggml_is_contiguous(src1)); LM_GGML_ASSERT(lm_ggml_is_scalar(dst)); @@ -14354,13 +15160,14 @@ static void lm_ggml_compute_forward_cross_entropy_loss_f32( static void lm_ggml_compute_forward_cross_entropy_loss( const struct lm_ggml_compute_params * params, - const struct lm_ggml_tensor * src0, - const struct lm_ggml_tensor * src1, struct lm_ggml_tensor * dst) { + + const struct lm_ggml_tensor * src0 = dst->src[0]; + switch (src0->type) { case LM_GGML_TYPE_F32: { - lm_ggml_compute_forward_cross_entropy_loss_f32(params, src0, src1, dst); + lm_ggml_compute_forward_cross_entropy_loss_f32(params, dst); } break; default: { @@ -14373,10 +15180,12 @@ static void lm_ggml_compute_forward_cross_entropy_loss( static void lm_ggml_compute_forward_cross_entropy_loss_back_f32( const struct lm_ggml_compute_params * params, - const struct lm_ggml_tensor * src0, - const struct lm_ggml_tensor * src1, - const struct lm_ggml_tensor * opt0, struct lm_ggml_tensor * dst) { + + const struct lm_ggml_tensor * src0 = dst->src[0]; + const struct lm_ggml_tensor * src1 = dst->src[1]; + const struct lm_ggml_tensor * opt0 = dst->src[2]; + LM_GGML_ASSERT(lm_ggml_is_contiguous(dst)); LM_GGML_ASSERT(lm_ggml_is_contiguous(src0)); LM_GGML_ASSERT(lm_ggml_is_contiguous(src1)); @@ -14463,14 +15272,14 @@ static void lm_ggml_compute_forward_cross_entropy_loss_back_f32( static void 
lm_ggml_compute_forward_cross_entropy_loss_back( const struct lm_ggml_compute_params * params, - const struct lm_ggml_tensor * src0, - const struct lm_ggml_tensor * src1, - const struct lm_ggml_tensor * opt0, struct lm_ggml_tensor * dst) { + + const struct lm_ggml_tensor * src0 = dst->src[0]; + switch (src0->type) { case LM_GGML_TYPE_F32: { - lm_ggml_compute_forward_cross_entropy_loss_back_f32(params, src0, src1, opt0, dst); + lm_ggml_compute_forward_cross_entropy_loss_back_f32(params, dst); } break; default: { @@ -14495,317 +15304,335 @@ static void lm_ggml_compute_forward(struct lm_ggml_compute_params * params, stru } LM_GGML_ASSERT(tensor->src[0] == NULL || tensor->src[0]->backend == LM_GGML_BACKEND_CPU); LM_GGML_ASSERT(tensor->src[1] == NULL || tensor->src[1]->backend == LM_GGML_BACKEND_CPU); +#elif defined(LM_GGML_USE_VULKAN) + const bool skip_cpu = lm_ggml_vk_compute_forward_cpu_assist(params, tensor); +#ifdef LM_GGML_VULKAN_CHECK_RESULTS + if (skip_cpu) { + lm_ggml_vk_check_results_1_cpu_assist(params, tensor); + } +#endif + if (skip_cpu) { + return; + } + LM_GGML_ASSERT(tensor->src[0] == NULL || tensor->src[0]->backend == LM_GGML_BACKEND_CPU); + LM_GGML_ASSERT(tensor->src[1] == NULL || tensor->src[1]->backend == LM_GGML_BACKEND_CPU); #endif // LM_GGML_USE_CUBLAS +#ifdef LM_GGML_USE_SYCL + bool skip_cpu = lm_ggml_sycl_compute_forward(params, tensor); + if (skip_cpu) { + return; + } +#endif // LM_GGML_USE_SYCL switch (tensor->op) { case LM_GGML_OP_DUP: { - lm_ggml_compute_forward_dup(params, tensor->src[0], tensor); + lm_ggml_compute_forward_dup(params, tensor); } break; case LM_GGML_OP_ADD: { - lm_ggml_compute_forward_add(params, tensor->src[0], tensor->src[1], tensor); + lm_ggml_compute_forward_add(params, tensor); } break; case LM_GGML_OP_ADD1: { - lm_ggml_compute_forward_add1(params, tensor->src[0], tensor->src[1], tensor); + lm_ggml_compute_forward_add1(params, tensor); } break; case LM_GGML_OP_ACC: { - lm_ggml_compute_forward_acc(params, tensor->src[0], tensor->src[1], tensor); + lm_ggml_compute_forward_acc(params, tensor); } break; case LM_GGML_OP_SUB: { - lm_ggml_compute_forward_sub(params, tensor->src[0], tensor->src[1], tensor); + lm_ggml_compute_forward_sub(params, tensor); } break; case LM_GGML_OP_MUL: { - lm_ggml_compute_forward_mul(params, tensor->src[0], tensor->src[1], tensor); + lm_ggml_compute_forward_mul(params, tensor); } break; case LM_GGML_OP_DIV: { - lm_ggml_compute_forward_div(params, tensor->src[0], tensor->src[1], tensor); + lm_ggml_compute_forward_div(params, tensor); } break; case LM_GGML_OP_SQR: { - lm_ggml_compute_forward_sqr(params, tensor->src[0], tensor); + lm_ggml_compute_forward_sqr(params, tensor); } break; case LM_GGML_OP_SQRT: { - lm_ggml_compute_forward_sqrt(params, tensor->src[0], tensor); + lm_ggml_compute_forward_sqrt(params, tensor); } break; case LM_GGML_OP_LOG: { - lm_ggml_compute_forward_log(params, tensor->src[0], tensor); + lm_ggml_compute_forward_log(params, tensor); } break; case LM_GGML_OP_SUM: { - lm_ggml_compute_forward_sum(params, tensor->src[0], tensor); + lm_ggml_compute_forward_sum(params, tensor); } break; case LM_GGML_OP_SUM_ROWS: { - lm_ggml_compute_forward_sum_rows(params, tensor->src[0], tensor); + lm_ggml_compute_forward_sum_rows(params, tensor); } break; case LM_GGML_OP_MEAN: { - lm_ggml_compute_forward_mean(params, tensor->src[0], tensor); + lm_ggml_compute_forward_mean(params, tensor); } break; case LM_GGML_OP_ARGMAX: { - lm_ggml_compute_forward_argmax(params, tensor->src[0], tensor); + 
lm_ggml_compute_forward_argmax(params, tensor); } break; case LM_GGML_OP_REPEAT: { - lm_ggml_compute_forward_repeat(params, tensor->src[0], tensor); + lm_ggml_compute_forward_repeat(params, tensor); } break; case LM_GGML_OP_REPEAT_BACK: { - lm_ggml_compute_forward_repeat_back(params, tensor->src[0], tensor); + lm_ggml_compute_forward_repeat_back(params, tensor); } break; case LM_GGML_OP_CONCAT: { - lm_ggml_compute_forward_concat(params, tensor->src[0], tensor->src[1], tensor); + lm_ggml_compute_forward_concat(params, tensor); } break; case LM_GGML_OP_SILU_BACK: { - lm_ggml_compute_forward_silu_back(params, tensor->src[0], tensor->src[1], tensor); + lm_ggml_compute_forward_silu_back(params, tensor); } break; case LM_GGML_OP_NORM: { - lm_ggml_compute_forward_norm(params, tensor->src[0], tensor); + lm_ggml_compute_forward_norm(params, tensor); } break; case LM_GGML_OP_RMS_NORM: { - lm_ggml_compute_forward_rms_norm(params, tensor->src[0], tensor); + lm_ggml_compute_forward_rms_norm(params, tensor); } break; case LM_GGML_OP_RMS_NORM_BACK: { - lm_ggml_compute_forward_rms_norm_back(params, tensor->src[0], tensor->src[1], tensor); + lm_ggml_compute_forward_rms_norm_back(params, tensor); } break; case LM_GGML_OP_GROUP_NORM: { - lm_ggml_compute_forward_group_norm(params, tensor->src[0], tensor); + lm_ggml_compute_forward_group_norm(params, tensor); } break; case LM_GGML_OP_MUL_MAT: { - lm_ggml_compute_forward_mul_mat(params, tensor->src[0], tensor->src[1], tensor); + lm_ggml_compute_forward_mul_mat(params, tensor); } break; case LM_GGML_OP_MUL_MAT_ID: { - lm_ggml_compute_forward_mul_mat_id(params, tensor->src[0], tensor->src[1], tensor); + lm_ggml_compute_forward_mul_mat_id(params, tensor); } break; case LM_GGML_OP_OUT_PROD: { - lm_ggml_compute_forward_out_prod(params, tensor->src[0], tensor->src[1], tensor); + lm_ggml_compute_forward_out_prod(params, tensor); } break; case LM_GGML_OP_SCALE: { - lm_ggml_compute_forward_scale(params, tensor->src[0], tensor); + lm_ggml_compute_forward_scale(params, tensor); } break; case LM_GGML_OP_SET: { - lm_ggml_compute_forward_set(params, tensor->src[0], tensor->src[1], tensor); + lm_ggml_compute_forward_set(params, tensor); } break; case LM_GGML_OP_CPY: { - lm_ggml_compute_forward_cpy(params, tensor->src[0], tensor); + lm_ggml_compute_forward_cpy(params, tensor); } break; case LM_GGML_OP_CONT: { - lm_ggml_compute_forward_cont(params, tensor->src[0], tensor); + lm_ggml_compute_forward_cont(params, tensor); } break; case LM_GGML_OP_RESHAPE: { - lm_ggml_compute_forward_reshape(params, tensor->src[0], tensor); + lm_ggml_compute_forward_reshape(params, tensor); } break; case LM_GGML_OP_VIEW: { - lm_ggml_compute_forward_view(params, tensor->src[0]); + lm_ggml_compute_forward_view(params, tensor); } break; case LM_GGML_OP_PERMUTE: { - lm_ggml_compute_forward_permute(params, tensor->src[0]); + lm_ggml_compute_forward_permute(params, tensor); } break; case LM_GGML_OP_TRANSPOSE: { - lm_ggml_compute_forward_transpose(params, tensor->src[0]); + lm_ggml_compute_forward_transpose(params, tensor); } break; case LM_GGML_OP_GET_ROWS: { - lm_ggml_compute_forward_get_rows(params, tensor->src[0], tensor->src[1], tensor); + lm_ggml_compute_forward_get_rows(params, tensor); } break; case LM_GGML_OP_GET_ROWS_BACK: { - lm_ggml_compute_forward_get_rows_back(params, tensor->src[0], tensor->src[1], tensor); + lm_ggml_compute_forward_get_rows_back(params, tensor); } break; case LM_GGML_OP_DIAG: { - lm_ggml_compute_forward_diag(params, tensor->src[0], tensor); + 
lm_ggml_compute_forward_diag(params, tensor); } break; case LM_GGML_OP_DIAG_MASK_INF: { - lm_ggml_compute_forward_diag_mask_inf(params, tensor->src[0], tensor); + lm_ggml_compute_forward_diag_mask_inf(params, tensor); } break; case LM_GGML_OP_DIAG_MASK_ZERO: { - lm_ggml_compute_forward_diag_mask_zero(params, tensor->src[0], tensor); + lm_ggml_compute_forward_diag_mask_zero(params, tensor); } break; case LM_GGML_OP_SOFT_MAX: { - lm_ggml_compute_forward_soft_max(params, tensor->src[0], tensor->src[1], tensor); + lm_ggml_compute_forward_soft_max(params, tensor); } break; case LM_GGML_OP_SOFT_MAX_BACK: { - lm_ggml_compute_forward_soft_max_back(params, tensor->src[0], tensor->src[1], tensor); + lm_ggml_compute_forward_soft_max_back(params, tensor); } break; case LM_GGML_OP_ROPE: { - lm_ggml_compute_forward_rope(params, tensor->src[0], tensor->src[1], tensor); + lm_ggml_compute_forward_rope(params, tensor); } break; case LM_GGML_OP_ROPE_BACK: { - lm_ggml_compute_forward_rope_back(params, tensor->src[0], tensor->src[1], tensor); + lm_ggml_compute_forward_rope_back(params, tensor); } break; case LM_GGML_OP_ALIBI: { - lm_ggml_compute_forward_alibi(params, tensor->src[0], tensor); + lm_ggml_compute_forward_alibi(params, tensor); } break; case LM_GGML_OP_CLAMP: { - lm_ggml_compute_forward_clamp(params, tensor->src[0], tensor); + lm_ggml_compute_forward_clamp(params, tensor); } break; case LM_GGML_OP_CONV_TRANSPOSE_1D: { - lm_ggml_compute_forward_conv_transpose_1d(params, tensor->src[0], tensor->src[1], tensor); + lm_ggml_compute_forward_conv_transpose_1d(params, tensor); } break; case LM_GGML_OP_IM2COL: { - lm_ggml_compute_forward_im2col(params, tensor->src[0], tensor->src[1], tensor); + lm_ggml_compute_forward_im2col(params, tensor); } break; case LM_GGML_OP_CONV_TRANSPOSE_2D: { - lm_ggml_compute_forward_conv_transpose_2d(params, tensor->src[0], tensor->src[1], tensor); + lm_ggml_compute_forward_conv_transpose_2d(params, tensor); } break; case LM_GGML_OP_POOL_1D: { - lm_ggml_compute_forward_pool_1d(params, tensor->src[0], tensor); + lm_ggml_compute_forward_pool_1d(params, tensor); } break; case LM_GGML_OP_POOL_2D: { - lm_ggml_compute_forward_pool_2d(params, tensor->src[0], tensor); + lm_ggml_compute_forward_pool_2d(params, tensor); } break; case LM_GGML_OP_UPSCALE: { - lm_ggml_compute_forward_upscale(params, tensor->src[0], tensor); + lm_ggml_compute_forward_upscale(params, tensor); } break; case LM_GGML_OP_PAD: { - lm_ggml_compute_forward_pad(params, tensor->src[0], tensor); + lm_ggml_compute_forward_pad(params, tensor); } break; case LM_GGML_OP_ARGSORT: { - lm_ggml_compute_forward_argsort(params, tensor->src[0], tensor); + lm_ggml_compute_forward_argsort(params, tensor); } break; case LM_GGML_OP_LEAKY_RELU: { - lm_ggml_compute_forward_leaky_relu(params, tensor->src[0], tensor); + lm_ggml_compute_forward_leaky_relu(params, tensor); } break; case LM_GGML_OP_FLASH_ATTN: { const int32_t t = lm_ggml_get_op_params_i32(tensor, 0); LM_GGML_ASSERT(t == 0 || t == 1); const bool masked = t != 0; - lm_ggml_compute_forward_flash_attn(params, tensor->src[0], tensor->src[1], tensor->src[2], masked, tensor); + lm_ggml_compute_forward_flash_attn(params, masked, tensor); } break; case LM_GGML_OP_FLASH_FF: { - lm_ggml_compute_forward_flash_ff(params, tensor->src[0], tensor->src[1], tensor->src[2], tensor->src[3], tensor->src[4], tensor); + lm_ggml_compute_forward_flash_ff(params, tensor); } break; case LM_GGML_OP_FLASH_ATTN_BACK: { int32_t t = lm_ggml_get_op_params_i32(tensor, 0); LM_GGML_ASSERT(t == 0 || t == 1); 
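/*
 * A minimal sketch of the pattern behind this whole refactor (illustrative
 * only; the skeleton below is hypothetical, but it follows the convention
 * visible in the function bodies above): each per-op compute function now
 * reads its operands from dst->src[] instead of taking them as parameters,
 *
 *     static void lm_ggml_compute_forward_xxx(
 *             const struct lm_ggml_compute_params * params,
 *             struct lm_ggml_tensor * dst) {
 *
 *         const struct lm_ggml_tensor * src0 = dst->src[0];
 *         const struct lm_ggml_tensor * src1 = dst->src[1];
 *         ...
 *     }
 *
 * which is why every case in this dispatcher collapses to a single
 * (params, tensor) call.
 */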
bool masked = t != 0; - lm_ggml_compute_forward_flash_attn_back(params, tensor->src[0], tensor->src[1], tensor->src[2], tensor->src[3], masked, tensor); + lm_ggml_compute_forward_flash_attn_back(params, masked, tensor); } break; case LM_GGML_OP_WIN_PART: { - lm_ggml_compute_forward_win_part(params, tensor->src[0], tensor); + lm_ggml_compute_forward_win_part(params, tensor); } break; case LM_GGML_OP_WIN_UNPART: { - lm_ggml_compute_forward_win_unpart(params, tensor->src[0], tensor); + lm_ggml_compute_forward_win_unpart(params, tensor); } break; case LM_GGML_OP_UNARY: { - lm_ggml_compute_forward_unary(params, tensor->src[0], tensor); + lm_ggml_compute_forward_unary(params, tensor); } break; case LM_GGML_OP_GET_REL_POS: { - lm_ggml_compute_forward_get_rel_pos(params, tensor->src[0], tensor); + lm_ggml_compute_forward_get_rel_pos(params, tensor); } break; case LM_GGML_OP_ADD_REL_POS: { - lm_ggml_compute_forward_add_rel_pos(params, tensor->src[0], tensor->src[1], tensor->src[2], tensor); + lm_ggml_compute_forward_add_rel_pos(params, tensor); } break; case LM_GGML_OP_MAP_UNARY: { lm_ggml_unary_op_f32_t fun; memcpy(&fun, tensor->op_params, sizeof(fun)); - lm_ggml_compute_forward_map_unary(params, tensor->src[0], tensor, fun); + lm_ggml_compute_forward_map_unary(params, tensor, fun); } break; case LM_GGML_OP_MAP_BINARY: { lm_ggml_binary_op_f32_t fun; memcpy(&fun, tensor->op_params, sizeof(fun)); - lm_ggml_compute_forward_map_binary(params, tensor->src[0], tensor->src[1], tensor, fun); + lm_ggml_compute_forward_map_binary(params, tensor, fun); } break; case LM_GGML_OP_MAP_CUSTOM1_F32: { lm_ggml_custom1_op_f32_t fun; memcpy(&fun, tensor->op_params, sizeof(fun)); - lm_ggml_compute_forward_map_custom1_f32(params, tensor->src[0], tensor, fun); + lm_ggml_compute_forward_map_custom1_f32(params, tensor, fun); } break; case LM_GGML_OP_MAP_CUSTOM2_F32: { lm_ggml_custom2_op_f32_t fun; memcpy(&fun, tensor->op_params, sizeof(fun)); - lm_ggml_compute_forward_map_custom2_f32(params, tensor->src[0], tensor->src[1], tensor, fun); + lm_ggml_compute_forward_map_custom2_f32(params, tensor, fun); } break; case LM_GGML_OP_MAP_CUSTOM3_F32: { lm_ggml_custom3_op_f32_t fun; memcpy(&fun, tensor->op_params, sizeof(fun)); - lm_ggml_compute_forward_map_custom3_f32(params, tensor->src[0], tensor->src[1], tensor->src[2], tensor, fun); + lm_ggml_compute_forward_map_custom3_f32(params, tensor, fun); } break; case LM_GGML_OP_MAP_CUSTOM1: { - lm_ggml_compute_forward_map_custom1(params, tensor->src[0], tensor); + lm_ggml_compute_forward_map_custom1(params, tensor); } break; case LM_GGML_OP_MAP_CUSTOM2: { - lm_ggml_compute_forward_map_custom2(params, tensor->src[0], tensor->src[1], tensor); + lm_ggml_compute_forward_map_custom2(params, tensor); } break; case LM_GGML_OP_MAP_CUSTOM3: { - lm_ggml_compute_forward_map_custom3(params, tensor->src[0], tensor->src[1], tensor->src[2], tensor); + lm_ggml_compute_forward_map_custom3(params, tensor); } break; case LM_GGML_OP_CROSS_ENTROPY_LOSS: { - lm_ggml_compute_forward_cross_entropy_loss(params, tensor->src[0], tensor->src[1], tensor); + lm_ggml_compute_forward_cross_entropy_loss(params, tensor); } break; case LM_GGML_OP_CROSS_ENTROPY_LOSS_BACK: { - lm_ggml_compute_forward_cross_entropy_loss_back(params, tensor->src[0], tensor->src[1], tensor->src[2], tensor); + lm_ggml_compute_forward_cross_entropy_loss_back(params, tensor); } break; case LM_GGML_OP_NONE: @@ -14899,13 +15726,13 @@ struct lm_ggml_hash_set lm_ggml_hash_set_new(size_t size) { size = lm_ggml_hash_size(size); struct 
lm_ggml_hash_set result; result.size = size; - result.keys = malloc(sizeof(struct lm_ggml_tensor *) * size); + result.keys = LM_GGML_MALLOC(sizeof(struct lm_ggml_tensor *) * size); memset(result.keys, 0, sizeof(struct lm_ggml_tensor *) * size); return result; } static void lm_ggml_hash_set_free(struct lm_ggml_hash_set hash_set) { - free(hash_set.keys); + LM_GGML_FREE(hash_set.keys); } struct hash_map { @@ -14914,17 +15741,17 @@ struct hash_map { }; static struct hash_map * lm_ggml_new_hash_map(size_t size) { - struct hash_map * result = malloc(sizeof(struct hash_map)); + struct hash_map * result = LM_GGML_MALLOC(sizeof(struct hash_map)); result->set = lm_ggml_hash_set_new(size); - result->vals = malloc(sizeof(struct lm_ggml_tensor *) * result->set.size); + result->vals = LM_GGML_MALLOC(sizeof(struct lm_ggml_tensor *) * result->set.size); memset(result->vals, 0, sizeof(struct lm_ggml_tensor *) * result->set.size); return result; } static void lm_ggml_hash_map_free(struct hash_map * map) { lm_ggml_hash_set_free(map->set); - free(map->vals); - free(map); + LM_GGML_FREE(map->vals); + LM_GGML_FREE(map); } // gradient checkpointing @@ -14939,7 +15766,7 @@ static struct lm_ggml_tensor * lm_ggml_recompute_graph_node( return NULL; } - if (node->is_param) { + if (node->flags & LM_GGML_TENSOR_FLAG_PARAM) { return node; } @@ -14973,7 +15800,7 @@ static struct lm_ggml_tensor * lm_ggml_recompute_graph_node( clone->op = node->op; clone->grad = node->grad; - clone->is_param = node->is_param; + clone->flags = node->flags; clone->extra = node->extra; for (int k = 0; k < LM_GGML_MAX_DIMS; ++k) { clone->nb[k] = node->nb[k]; @@ -16005,7 +16832,7 @@ void lm_ggml_build_backward_expand(struct lm_ggml_context * ctx, struct lm_ggml_ for (int i = 0; i < gf->n_nodes; i++) { struct lm_ggml_tensor * node = gf->nodes[i]; - if (node->is_param) { + if (node->flags & LM_GGML_TENSOR_FLAG_PARAM) { LM_GGML_PRINT_DEBUG("%s: found root node %p\n", __func__, (void *) node); lm_ggml_build_forward_expand(gb, node->grad); } @@ -16209,27 +17036,47 @@ typedef pthread_t lm_ggml_thread_t; #endif // Android's libc implementation "bionic" does not support setting affinity -#if defined(__linux__) && !defined(__BIONIC__) -static void set_numa_thread_affinity(int thread_n, int n_threads) { +#if defined(__gnu_linux__) +static void set_numa_thread_affinity(int thread_n) { if (!lm_ggml_is_numa()) { return; } - // run thread on node_num thread_n / (threads per node) - const int node_num = thread_n / ((n_threads + g_state.numa.n_nodes - 1) / g_state.numa.n_nodes); - struct lm_ggml_numa_node * node = &g_state.numa.nodes[node_num]; + int node_num; + int rv; size_t setsize = CPU_ALLOC_SIZE(g_state.numa.total_cpus); + switch(g_state.numa.numa_strategy) { + case LM_GGML_NUMA_STRATEGY_DISTRIBUTE: + // run thread on node_num thread_n / (threads per node) + node_num = thread_n % g_state.numa.n_nodes; + break; + case LM_GGML_NUMA_STRATEGY_ISOLATE: + // run thread on current_node + node_num = g_state.numa.current_node; + break; + case LM_GGML_NUMA_STRATEGY_NUMACTL: + // use the cpuset that numactl gave us + rv = pthread_setaffinity_np(pthread_self(), setsize, &g_state.numa.cpuset); + if (rv) { + fprintf(stderr, "warning: pthread_setaffinity_np() failed: %s\n",strerror(rv)); + } + return; + default: + return; + } + + struct lm_ggml_numa_node * node = &g_state.numa.nodes[node_num]; + cpu_set_t * cpus = CPU_ALLOC(g_state.numa.total_cpus); CPU_ZERO_S(setsize, cpus); for (size_t i = 0; i < node->n_cpus; ++i) { CPU_SET_S(node->cpus[i], setsize, cpus); } - int 
rv = pthread_setaffinity_np(pthread_self(), setsize, cpus); + rv = pthread_setaffinity_np(pthread_self(), setsize, cpus); if (rv) { - fprintf(stderr, "warning: pthread_setaffinity_np() failed: %s\n", - strerror(rv)); + fprintf(stderr, "warning: pthread_setaffinity_np() failed: %s\n", strerror(rv)); } CPU_FREE(cpus); @@ -16250,8 +17097,7 @@ static void clear_numa_thread_affinity(void) { int rv = pthread_setaffinity_np(pthread_self(), setsize, cpus); if (rv) { - fprintf(stderr, "warning: pthread_setaffinity_np() failed: %s\n", - strerror(rv)); + fprintf(stderr, "warning: pthread_setaffinity_np() failed: %s\n", strerror(rv)); } CPU_FREE(cpus); @@ -16259,7 +17105,7 @@ static void clear_numa_thread_affinity(void) { #else // TODO: Windows etc. // (the linux implementation may also work on BSD, someone should test) -static void set_numa_thread_affinity(int thread_n, int n_threads) { UNUSED(thread_n); UNUSED(n_threads); } +static void set_numa_thread_affinity(int thread_n) { UNUSED(thread_n); } static void clear_numa_thread_affinity(void) {} #endif @@ -16273,10 +17119,11 @@ struct lm_ggml_compute_state_shared { const int n_threads; // synchronization primitives - atomic_int n_active; // num active threads - atomic_int node_n; // active graph node + atomic_int n_active; // num active threads + atomic_int node_n; // active graph node + atomic_int node_task; // active graph node task phase - bool (*abort_callback)(void * data); // abort lm_ggml_graph_compute when true + lm_ggml_abort_callback abort_callback; // abort lm_ggml_graph_compute when true void * abort_callback_data; }; @@ -16330,6 +17177,8 @@ static int lm_ggml_get_n_tasks(struct lm_ggml_tensor * node, int n_threads) { case LM_GGML_UNARY_OP_TANH: case LM_GGML_UNARY_OP_ELU: case LM_GGML_UNARY_OP_RELU: + case LM_GGML_UNARY_OP_HARDSWISH: // to opt for multiple threads + case LM_GGML_UNARY_OP_HARDSIGMOID: // to opt for multiple threads { n_tasks = 1; } break; @@ -16406,7 +17255,7 @@ static int lm_ggml_get_n_tasks(struct lm_ggml_tensor * node, int n_threads) { } break; case LM_GGML_OP_SOFT_MAX: { - n_tasks = MIN(MIN(4, n_threads), lm_ggml_nrows(node->src[0])); + n_tasks = MIN(n_threads, lm_ggml_nrows(node->src[0])); } break; case LM_GGML_OP_CONV_TRANSPOSE_1D: { @@ -16520,6 +17369,34 @@ static int lm_ggml_get_n_tasks(struct lm_ggml_tensor * node, int n_threads) { return n_tasks; } +static void lm_ggml_graph_compute_thread_sync_node(int * node_n, struct lm_ggml_compute_state * state, const bool do_yield) { + // wait for other threads to finish + const int last_node_n = * node_n; + + while (true) { + if (do_yield) { + sched_yield(); + } + + * node_n = atomic_load(&state->shared->node_n); + if (* node_n != last_node_n) break; + } +} + +static void lm_ggml_graph_compute_thread_sync_task(int * task_phase, struct lm_ggml_compute_state * state, const bool do_yield) { + // wait for other threads to finish + const int last_task_phase = * task_phase; + + while (true) { + if (do_yield) { + sched_yield(); + } + + * task_phase = atomic_load(&state->shared->node_task); + if (* task_phase != last_task_phase) break; + } +} + static thread_ret_t lm_ggml_graph_compute_thread(void * data) { struct lm_ggml_compute_state * state = (struct lm_ggml_compute_state *) data; @@ -16528,9 +17405,10 @@ static thread_ret_t lm_ggml_graph_compute_thread(void * data) { const int n_threads = state->shared->n_threads; - set_numa_thread_affinity(state->ith, n_threads); + set_numa_thread_affinity(state->ith); - int node_n = -1; + int node_n = -1; + int task_phase = 
LM_GGML_TASK_FINALIZE; while (true) { if (cplan->abort_callback && cplan->abort_callback(cplan->abort_callback_data)) { @@ -16562,7 +17440,6 @@ static thread_ret_t lm_ggml_graph_compute_thread(void * data) { // distribute new work or execute it direct if 1T while (++node_n < cgraph->n_nodes) { LM_GGML_PRINT_DEBUG_5("%s: %d/%d\n", __func__, node_n, cgraph->n_nodes); - struct lm_ggml_tensor * node = cgraph->nodes[node_n]; const int n_tasks = lm_ggml_get_n_tasks(node, n_threads); @@ -16571,13 +17448,13 @@ static thread_ret_t lm_ggml_graph_compute_thread(void * data) { params.nth = n_tasks; - /* INIT */ - if (LM_GGML_OP_HAS_INIT[node->op]) { - params.type = LM_GGML_TASK_INIT; - lm_ggml_compute_forward(¶ms, node); - } - if (n_tasks == 1) { + /* INIT */ + if (LM_GGML_OP_HAS_INIT[node->op]) { + params.type = LM_GGML_TASK_INIT; + lm_ggml_compute_forward(¶ms, node); + } + // TODO: maybe push node_n to the atomic but if other threads see n_tasks is 1, // they do something more efficient than spinning (?) params.type = LM_GGML_TASK_COMPUTE; @@ -16598,38 +17475,24 @@ static thread_ret_t lm_ggml_graph_compute_thread(void * data) { } } - atomic_store(&state->shared->n_active, n_threads); - atomic_store(&state->shared->node_n, node_n); + task_phase = LM_GGML_TASK_INIT; + atomic_store(&state->shared->n_active, n_threads); + atomic_store(&state->shared->node_n, node_n); + atomic_store(&state->shared->node_task, task_phase); } else { - // wait for other threads to finish - const int last = node_n; - - const bool do_yield = last < 0 || cgraph->nodes[last]->op == LM_GGML_OP_MUL_MAT; - - while (true) { - // TODO: this sched_yield can have significant impact on the performance - either positive or negative - // depending on the workload and the operating system. - // since it is not clear what is the best approach, it should potentially become user-configurable - // ref: https://github.com/ggerganov/ggml/issues/291 - // UPD: adding the do_yield flag seems to resolve the issue universally - if (do_yield) { - sched_yield(); - } - - node_n = atomic_load(&state->shared->node_n); - if (node_n != last) break; - }; + lm_ggml_graph_compute_thread_sync_node(&node_n, state, false); + lm_ggml_graph_compute_thread_sync_task(&task_phase, state, false); } // check if we should stop if (node_n >= cgraph->n_nodes) break; - /* COMPUTE */ + /* INIT & COMPUTE */ struct lm_ggml_tensor * node = cgraph->nodes[node_n]; const int n_tasks = lm_ggml_get_n_tasks(node, n_threads); struct lm_ggml_compute_params params = { - /*.type =*/ LM_GGML_TASK_COMPUTE, + /*.type =*/ LM_GGML_TASK_INIT, /*.ith =*/ state->ith, /*.nth =*/ n_tasks, /*.wsize =*/ cplan->work_size, @@ -16637,8 +17500,39 @@ static thread_ret_t lm_ggml_graph_compute_thread(void * data) { }; if (state->ith < n_tasks) { + if (LM_GGML_OP_HAS_INIT[node->op]) { + lm_ggml_compute_forward(¶ms, node); + } + } + + if (atomic_fetch_sub(&state->shared->n_active, 1) == 1) { + task_phase = LM_GGML_TASK_COMPUTE; + atomic_store(&state->shared->n_active, n_threads); + atomic_store(&state->shared->node_task, task_phase); + } + else { + // TODO: this sched_yield can have significant impact on the performance - either positive or negative + // depending on the workload and the operating system. 
+ // since it is not clear what is the best approach, it should potentially become user-configurable + // ref: https://github.com/ggerganov/ggml/issues/291 + // UPD: adding the do_yield flag seems to resolve the issue universally + const bool do_yield = node_n < 0 || cgraph->nodes[node_n]->op == LM_GGML_OP_MUL_MAT; + lm_ggml_graph_compute_thread_sync_task(&task_phase, state, do_yield); + } + + if (state->ith < n_tasks) { + params.type = LM_GGML_TASK_COMPUTE; lm_ggml_compute_forward(¶ms, node); } + + if (atomic_fetch_sub(&state->shared->n_active, 1) == 1) { + task_phase = LM_GGML_TASK_FINALIZE; + atomic_store(&state->shared->n_active, n_threads); + atomic_store(&state->shared->node_task, task_phase); + } + else { + lm_ggml_graph_compute_thread_sync_task(&task_phase, state, false); + } } return LM_GGML_EXIT_SUCCESS; @@ -16654,12 +17548,16 @@ struct lm_ggml_cplan lm_ggml_graph_plan(const struct lm_ggml_cgraph * cgraph, in struct lm_ggml_cplan cplan; memset(&cplan, 0, sizeof(struct lm_ggml_cplan)); + int max_tasks = 1; + // thread scheduling for the different operations + work buffer size estimation for (int i = 0; i < cgraph->n_nodes; i++) { struct lm_ggml_tensor * node = cgraph->nodes[i]; const int n_tasks = lm_ggml_get_n_tasks(node, n_threads); + max_tasks = MAX(max_tasks, n_tasks); + size_t cur = 0; switch (node->op) { @@ -16695,8 +17593,11 @@ struct lm_ggml_cplan lm_ggml_graph_plan(const struct lm_ggml_cgraph * cgraph, in #if defined(LM_GGML_USE_ACCELERATE) || defined(LM_GGML_USE_OPENBLAS) if (lm_ggml_compute_forward_mul_mat_use_blas(node)) { if (node->src[0]->type != LM_GGML_TYPE_F32) { - // here we need memory just for single 2D matrix from src0 - cur = lm_ggml_type_size(LM_GGML_TYPE_F32)*(node->src[0]->ne[0]*node->src[0]->ne[1]); + // here we need memory for fully dequantized matrix from src0 + // take into account that src0 can be broadcasted into src1[2,3] + cur = lm_ggml_type_size(LM_GGML_TYPE_F32) + * node->src[0]->ne[0]*node->src[0]->ne[1] + * node->src[1]->ne[2]*node->src[1]->ne[3]; } } else #endif @@ -16823,7 +17724,7 @@ struct lm_ggml_cplan lm_ggml_graph_plan(const struct lm_ggml_cgraph * cgraph, in work_size += CACHE_LINE_SIZE*(n_threads - 1); } - cplan.n_threads = n_threads; + cplan.n_threads = MIN(max_tasks, n_threads); cplan.work_size = work_size; cplan.work_data = NULL; @@ -16840,6 +17741,17 @@ int lm_ggml_graph_compute(struct lm_ggml_cgraph * cgraph, struct lm_ggml_cplan * } } +#ifdef LM_GGML_USE_VULKAN + for (int i = 0; i < cgraph->n_nodes; i++) { + lm_ggml_vk_preallocate_buffers_graph_cpu_assist(cgraph->nodes[i]); + } + lm_ggml_vk_preallocate_buffers_cpu_assist(); + + for (int i = 0; i < cgraph->n_nodes; i++) { + lm_ggml_vk_build_graph_cpu_assist(cgraph->nodes[i], i == cgraph->n_nodes - 1); + } +#endif + const int n_threads = cplan->n_threads; struct lm_ggml_compute_state_shared state_shared = { @@ -16850,6 +17762,7 @@ int lm_ggml_graph_compute(struct lm_ggml_cgraph * cgraph, struct lm_ggml_cplan * /*.n_threads =*/ n_threads, /*.n_active =*/ n_threads, /*.node_n =*/ -1, + /*.node_task =*/ LM_GGML_TASK_FINALIZE, /*.abort_callback =*/ NULL, /*.abort_callback_data =*/ NULL, }; @@ -16890,6 +17803,10 @@ int lm_ggml_graph_compute(struct lm_ggml_cgraph * cgraph, struct lm_ggml_cplan * } } +#ifdef LM_GGML_USE_VULKAN + lm_ggml_vk_graph_cleanup_cpu_assist(); +#endif + // performance stats (graph) { int64_t perf_cycles_cur = lm_ggml_perf_cycles() - perf_start_cycles; @@ -17294,7 +18211,7 @@ struct lm_ggml_cgraph * lm_ggml_graph_import(const char * fname, struct lm_ggml_ ptr += 
lm_ggml_nbytes(tensor); - fprintf(stderr, "%s: loaded leaf %d: '%16s', %9zu bytes\n", __func__, i, tensor->name, lm_ggml_nbytes(tensor)); + fprintf(stderr, "%s: loaded leaf %u: '%16s', %9zu bytes\n", __func__, i, tensor->name, lm_ggml_nbytes(tensor)); } } @@ -17397,7 +18314,7 @@ struct lm_ggml_cgraph * lm_ggml_graph_import(const char * fname, struct lm_ggml_ result->nodes[i] = tensor; - fprintf(stderr, "%s: loaded node %d: '%16s', %9zu bytes\n", __func__, i, tensor->name, lm_ggml_nbytes(tensor)); + fprintf(stderr, "%s: loaded node %u: '%16s', %9zu bytes\n", __func__, i, tensor->name, lm_ggml_nbytes(tensor)); } } } @@ -17419,7 +18336,7 @@ void lm_ggml_graph_print(const struct lm_ggml_cgraph * cgraph) { LM_GGML_PRINT(" - %3d: [ %5" PRId64 ", %5" PRId64 ", %5" PRId64 "] %16s %s (%3d) cpu = %7.3f / %7.3f ms, wall = %7.3f / %7.3f ms\n", i, node->ne[0], node->ne[1], node->ne[2], - lm_ggml_op_name(node->op), node->is_param ? "x" : node->grad ? "g" : " ", node->perf_runs, + lm_ggml_op_name(node->op), (node->flags & LM_GGML_TENSOR_FLAG_PARAM) ? "x" : node->grad ? "g" : " ", node->perf_runs, (double) node->perf_cycles / (double) lm_ggml_cycles_per_ms(), (double) node->perf_cycles / (double) lm_ggml_cycles_per_ms() / (double) node->perf_runs, (double) node->perf_time_us / 1000.0, @@ -17512,7 +18429,7 @@ void lm_ggml_graph_dump_dot(const struct lm_ggml_cgraph * gb, const struct lm_gg continue; } - if (node->is_param) { + if (node->flags & LM_GGML_TENSOR_FLAG_PARAM) { snprintf(color, sizeof(color), "yellow"); } else if (node->grad) { if (lm_ggml_graph_find(gf, node)) { @@ -17686,7 +18603,7 @@ static enum lm_ggml_opt_result lm_ggml_opt_adam( int np = 0; int64_t nx = 0; for (int i = 0; i < gf->n_nodes; ++i) { - if (gf->nodes[i]->is_param) { + if (gf->nodes[i]->flags & LM_GGML_TENSOR_FLAG_PARAM) { LM_GGML_PRINT_DEBUG("found param %d: grad->op = %d\n", np, gf->nodes[i]->grad->op); LM_GGML_ASSERT(np < LM_GGML_MAX_PARAMS); @@ -17939,7 +18856,7 @@ static enum lm_ggml_opt_result linesearch_backtracking( } // compute the initial gradient in the search direction - lm_ggml_vec_dot_f32(nx, &dginit, g, d); + lm_ggml_vec_dot_f32(nx, &dginit, 0, g, 0, d, 0, 1); // make sure that d points to a descent direction if (0 < dginit) { @@ -17989,7 +18906,7 @@ static enum lm_ggml_opt_result linesearch_backtracking( return count; } - lm_ggml_vec_dot_f32(nx, &dg, g, d); + lm_ggml_vec_dot_f32(nx, &dg, 0, g, 0, d, 0, 1); // check the Wolfe condition if (dg < params->lbfgs.wolfe * dginit) { @@ -18022,7 +18939,9 @@ static enum lm_ggml_opt_result linesearch_backtracking( (*step) *= width; } - LM_GGML_UNREACHABLE(); + LM_GGML_ASSERT(false && "line search failed"); + + return LM_GGML_LINESEARCH_FAIL; } static enum lm_ggml_opt_result lm_ggml_opt_lbfgs( @@ -18049,7 +18968,7 @@ static enum lm_ggml_opt_result lm_ggml_opt_lbfgs( int np = 0; int nx = 0; for (int i = 0; i < gf->n_nodes; ++i) { - if (gf->nodes[i]->is_param) { + if (gf->nodes[i]->flags & LM_GGML_TENSOR_FLAG_PARAM) { LM_GGML_PRINT_DEBUG("found param %d: grad->op = %d\n", np, gf->nodes[i]->grad->op); LM_GGML_ASSERT(np < LM_GGML_MAX_PARAMS); @@ -18250,8 +19169,8 @@ static enum lm_ggml_opt_result lm_ggml_opt_lbfgs( // ys = y^t \cdot s -> 1 / \rho. // yy = y^t \cdot y. 
// - lm_ggml_vec_dot_f32(nx, &ys, &lm_y[end[0]*nx], &lm_s[end[0]*nx]); - lm_ggml_vec_dot_f32(nx, &yy, &lm_y[end[0]*nx], &lm_y[end[0]*nx]); + lm_ggml_vec_dot_f32(nx, &ys, 0, &lm_y[end[0]*nx], 0, &lm_s[end[0]*nx], 0, 1); + lm_ggml_vec_dot_f32(nx, &yy, 0, &lm_y[end[0]*nx], 0, &lm_y[end[0]*nx], 0, 1); lm_ys[end[0]] = ys; @@ -18270,7 +19189,7 @@ static enum lm_ggml_opt_result lm_ggml_opt_lbfgs( for (int i = 0; i < bound; ++i) { j[0] = (j[0] + m - 1) % m; // \alpha_{j} = \rho_{j} s^{t}_{j} \cdot q_{k+1} - lm_ggml_vec_dot_f32(nx, &lm_alpha[j[0]], &lm_s[j[0]*nx], d); + lm_ggml_vec_dot_f32(nx, &lm_alpha[j[0]], 0, &lm_s[j[0]*nx], 0, d, 0, 1); lm_alpha[j[0]] /= lm_ys[j[0]]; // q_{i} = q_{i+1} - \alpha_{i} y_{i} lm_ggml_vec_mad_f32(nx, d, &lm_y[j[0]*nx], -lm_alpha[j[0]]); @@ -18280,7 +19199,7 @@ static enum lm_ggml_opt_result lm_ggml_opt_lbfgs( for (int i = 0; i < bound; ++i) { // \beta_{j} = \rho_{j} y^t_{j} \cdot \gamma_{i} - lm_ggml_vec_dot_f32(nx, &beta, &lm_y[j[0]*nx], d); + lm_ggml_vec_dot_f32(nx, &beta, 0, &lm_y[j[0]*nx], 0, d, 0, 1); beta /= lm_ys[j[0]]; // \gamma_{i+1} = \gamma_{i} + (\alpha_{j} - \beta_{j}) s_{j} lm_ggml_vec_mad_f32(nx, d, &lm_s[j[0]*nx], lm_alpha[j[0]] - beta); @@ -18290,7 +19209,9 @@ static enum lm_ggml_opt_result lm_ggml_opt_lbfgs( step[0] = 1.0; } - LM_GGML_UNREACHABLE(); + LM_GGML_ASSERT(false && "lbfgs failed"); + + return LM_GGML_OPT_DID_NOT_CONVERGE; } struct lm_ggml_opt_params lm_ggml_opt_default_params(enum lm_ggml_opt_type type) { @@ -18524,12 +19445,24 @@ enum lm_ggml_opt_result lm_ggml_opt_resume_g( //////////////////////////////////////////////////////////////////////////////// +void lm_ggml_set_input(struct lm_ggml_tensor * tensor) { + tensor->flags |= LM_GGML_TENSOR_FLAG_INPUT; +} + +void lm_ggml_set_output(struct lm_ggml_tensor * tensor) { + tensor->flags |= LM_GGML_TENSOR_FLAG_OUTPUT; +} + +//////////////////////////////////////////////////////////////////////////////// + void lm_ggml_quantize_init(enum lm_ggml_type type) { lm_ggml_critical_section_start(); switch (type) { - case LM_GGML_TYPE_IQ2_XXS: iq2xs_init_impl(256); break; - case LM_GGML_TYPE_IQ2_XS: iq2xs_init_impl(512); break; + case LM_GGML_TYPE_IQ2_XXS: + case LM_GGML_TYPE_IQ2_XS: + case LM_GGML_TYPE_IQ1_S: iq2xs_init_impl(type); break; + case LM_GGML_TYPE_IQ3_XXS: iq3xs_init_impl(256); break; default: // nothing break; } @@ -18540,8 +19473,10 @@ void lm_ggml_quantize_init(enum lm_ggml_type type) { void lm_ggml_quantize_free(void) { lm_ggml_critical_section_start(); - iq2xs_free_impl(256); - iq2xs_free_impl(512); + iq2xs_free_impl(LM_GGML_TYPE_IQ2_XXS); + iq2xs_free_impl(LM_GGML_TYPE_IQ2_XS); + iq2xs_free_impl(LM_GGML_TYPE_IQ1_S); + iq3xs_free_impl(256); lm_ggml_critical_section_end(); } @@ -18676,7 +19611,8 @@ size_t lm_ggml_quantize_q8_0(const float * src, void * dst, int n, int k, int64_ bool lm_ggml_quantize_requires_imatrix(enum lm_ggml_type type) { return type == LM_GGML_TYPE_IQ2_XXS || - type == LM_GGML_TYPE_IQ2_XS; + type == LM_GGML_TYPE_IQ2_XS || + type == LM_GGML_TYPE_IQ1_S; } size_t lm_ggml_quantize_chunk(enum lm_ggml_type type, const float * src, void * dst, int start, @@ -18792,6 +19728,33 @@ size_t lm_ggml_quantize_chunk(enum lm_ggml_type type, const float * src, void * result = quantize_iq2_xs(src + start, (char *)dst + start_row * row_size, nrows, n_per_row, hist, imatrix); LM_GGML_ASSERT(result == row_size * nrows); } break; + case LM_GGML_TYPE_IQ3_XXS: + { + LM_GGML_ASSERT(start % QK_K == 0); + LM_GGML_ASSERT(start % n_per_row == 0); + size_t start_row = start / n_per_row; + 
size_t row_size = lm_ggml_row_size(type, n_per_row); + result = quantize_iq3_xxs(src + start, (char *)dst + start_row * row_size, nrows, n_per_row, hist, imatrix); + LM_GGML_ASSERT(result == row_size * nrows); + } break; + case LM_GGML_TYPE_IQ1_S: + { + LM_GGML_ASSERT(start % QK_K == 0); + LM_GGML_ASSERT(start % n_per_row == 0); + size_t start_row = start / n_per_row; + size_t row_size = lm_ggml_row_size(type, n_per_row); + result = quantize_iq1_s(src + start, (char *)dst + start_row * row_size, nrows, n_per_row, hist, imatrix); + LM_GGML_ASSERT(result == row_size * nrows); + } break; + case LM_GGML_TYPE_IQ4_NL: + { + LM_GGML_ASSERT(start % QK4_NL == 0); + LM_GGML_ASSERT(start % n_per_row == 0); + size_t start_row = start / n_per_row; + size_t row_size = lm_ggml_row_size(type, n_per_row); + result = quantize_iq4_nl(src + start, (char *)dst + start_row * row_size, nrows, n_per_row, hist, imatrix); + LM_GGML_ASSERT(result == row_size * nrows); + } break; case LM_GGML_TYPE_F16: { size_t elemsize = sizeof(lm_ggml_fp16_t); @@ -18918,6 +19881,25 @@ struct lm_gguf_context { void * data; }; +static size_t lm_gguf_type_size(enum lm_gguf_type type) { + LM_GGML_ASSERT(0 <= type && type < LM_GGUF_TYPE_COUNT); + return LM_GGUF_TYPE_SIZE[type]; +} + +static void lm_gguf_tensor_info_sanitize(struct lm_gguf_tensor_info * info) { + LM_GGML_ASSERT(info->n_dims <= LM_GGML_MAX_DIMS); + LM_GGML_ASSERT(0 <= info->type && info->type < LM_GGML_TYPE_COUNT); + + for (uint32_t i = 0; i < info->n_dims; ++i) { + LM_GGML_ASSERT(info->ne[i] > 0); + } + + // prevent overflow for total number of elements + LM_GGML_ASSERT(INT64_MAX/info->ne[1] > info->ne[0]); + LM_GGML_ASSERT(INT64_MAX/info->ne[2] > info->ne[0]*info->ne[1]); + LM_GGML_ASSERT(INT64_MAX/info->ne[3] > info->ne[0]*info->ne[1]*info->ne[2]); +} + static bool lm_gguf_fread_el(FILE * file, void * dst, size_t size, size_t * offset) { const size_t n = fread(dst, 1, size, file); *offset += n; @@ -18930,8 +19912,17 @@ static bool lm_gguf_fread_str(FILE * file, struct lm_gguf_str * p, size_t * offs bool ok = true; - ok = ok && lm_gguf_fread_el(file, &p->n, sizeof(p->n), offset); p->data = calloc(p->n + 1, 1); - ok = ok && lm_gguf_fread_el(file, p->data, p->n, offset); + ok = ok && lm_gguf_fread_el(file, &p->n, sizeof(p->n), offset); + + // early exit if string length is invalid, prevents from integer overflow + if (p->n == SIZE_MAX) { + fprintf(stderr, "%s: invalid string length (%" PRIu64 ")\n", __func__, p->n); + return false; + } + + p->data = LM_GGML_CALLOC(p->n + 1, 1); + + ok = ok && lm_gguf_fread_el(file, p->data, p->n, offset); return ok; } @@ -19003,6 +19994,12 @@ struct lm_gguf_context * lm_gguf_init_from_file(const char * fname, struct lm_gg return NULL; } + // sanity-checks to prevent from integer/buffer overflows + + ok = ok && (ctx->header.n_tensors < (SIZE_MAX/2)/sizeof(struct lm_gguf_tensor_info)); + ok = ok && (ctx->header.n_tensors < (SIZE_MAX/2)/lm_ggml_tensor_overhead()); + ok = ok && (ctx->header.n_kv < (SIZE_MAX/2)/sizeof(struct lm_gguf_kv)); + if (!ok) { fprintf(stderr, "%s: failed to read header\n", __func__); fclose(file); @@ -19013,7 +20010,7 @@ struct lm_gguf_context * lm_gguf_init_from_file(const char * fname, struct lm_gg // read the kv pairs { - ctx->kv = malloc(ctx->header.n_kv * sizeof(struct lm_gguf_kv)); + ctx->kv = LM_GGML_MALLOC(ctx->header.n_kv * sizeof(struct lm_gguf_kv)); for (uint64_t i = 0; i < ctx->header.n_kv; ++i) { struct lm_gguf_kv * kv = &ctx->kv[i]; @@ -19041,7 +20038,7 @@ struct lm_gguf_context * 
lm_gguf_init_from_file(const char * fname, struct lm_gg case LM_GGUF_TYPE_ARRAY: { ok = ok && lm_gguf_fread_el(file, &kv->value.arr.type, sizeof(kv->value.arr.type), &offset); - ok = ok && lm_gguf_fread_el(file, &kv->value.arr.n, sizeof(kv->value.arr.n), &offset); + ok = ok && lm_gguf_fread_el(file, &kv->value.arr.n, sizeof(kv->value.arr.n), &offset); switch (kv->value.arr.type) { case LM_GGUF_TYPE_UINT8: @@ -19056,21 +20053,39 @@ struct lm_gguf_context * lm_gguf_init_from_file(const char * fname, struct lm_gg case LM_GGUF_TYPE_FLOAT64: case LM_GGUF_TYPE_BOOL: { - kv->value.arr.data = malloc(kv->value.arr.n * LM_GGUF_TYPE_SIZE[kv->value.arr.type]); - ok = ok && lm_gguf_fread_el(file, kv->value.arr.data, kv->value.arr.n * LM_GGUF_TYPE_SIZE[kv->value.arr.type], &offset); + // prevent from integer overflow in the malloc below + if (kv->value.arr.n >= SIZE_MAX/lm_gguf_type_size(kv->value.arr.type)) { + fprintf(stderr, "%s: array size is too large (%" PRIu64 ")\n", __func__, kv->value.arr.n); + fclose(file); + lm_gguf_free(ctx); + return NULL; + } + + kv->value.arr.data = LM_GGML_MALLOC(kv->value.arr.n * lm_gguf_type_size(kv->value.arr.type)); + + ok = ok && lm_gguf_fread_el(file, kv->value.arr.data, kv->value.arr.n * lm_gguf_type_size(kv->value.arr.type), &offset); } break; case LM_GGUF_TYPE_STRING: { - kv->value.arr.data = malloc(kv->value.arr.n * sizeof(struct lm_gguf_str)); + // prevent from integer overflow in the malloc below + if (kv->value.arr.n >= SIZE_MAX/sizeof(struct lm_gguf_str)) { + fprintf(stderr, "%s: array size is too large (%" PRIu64 ")\n", __func__, kv->value.arr.n); + fclose(file); + lm_gguf_free(ctx); + return NULL; + } + + kv->value.arr.data = LM_GGML_MALLOC(kv->value.arr.n * sizeof(struct lm_gguf_str)); + for (uint64_t j = 0; j < kv->value.arr.n; ++j) { ok = ok && lm_gguf_fread_str(file, &((struct lm_gguf_str *) kv->value.arr.data)[j], &offset); } } break; case LM_GGUF_TYPE_ARRAY: - case LM_GGUF_TYPE_COUNT: LM_GGML_ASSERT(false && "invalid type"); break; + default: LM_GGML_ASSERT(false && "invalid type"); break; } } break; - case LM_GGUF_TYPE_COUNT: LM_GGML_ASSERT(false && "invalid type"); + default: LM_GGML_ASSERT(false && "invalid type"); } if (!ok) { @@ -19088,7 +20103,7 @@ struct lm_gguf_context * lm_gguf_init_from_file(const char * fname, struct lm_gg // read the tensor infos { - ctx->infos = malloc(ctx->header.n_tensors * sizeof(struct lm_gguf_tensor_info)); + ctx->infos = LM_GGML_MALLOC(ctx->header.n_tensors * sizeof(struct lm_gguf_tensor_info)); for (uint64_t i = 0; i < ctx->header.n_tensors; ++i) { struct lm_gguf_tensor_info * info = &ctx->infos[i]; @@ -19099,12 +20114,18 @@ struct lm_gguf_context * lm_gguf_init_from_file(const char * fname, struct lm_gg ok = ok && lm_gguf_fread_str(file, &info->name, &offset); ok = ok && lm_gguf_fread_el (file, &info->n_dims, sizeof(info->n_dims), &offset); + + ok = ok && (info->n_dims <= LM_GGML_MAX_DIMS); + for (uint32_t j = 0; j < info->n_dims; ++j) { ok = ok && lm_gguf_fread_el(file, &info->ne[j], sizeof(info->ne[j]), &offset); } + ok = ok && lm_gguf_fread_el (file, &info->type, sizeof(info->type), &offset); ok = ok && lm_gguf_fread_el (file, &info->offset, sizeof(info->offset), &offset); + lm_gguf_tensor_info_sanitize(info); + if (!ok) { fprintf(stderr, "%s: failed to read tensor info\n", __func__); fclose(file); @@ -19258,12 +20279,12 @@ void lm_gguf_free(struct lm_gguf_context * ctx) { struct lm_gguf_kv * kv = &ctx->kv[i]; if (kv->key.data) { - free(kv->key.data); + LM_GGML_FREE(kv->key.data); } if (kv->type == 
LM_GGUF_TYPE_STRING) { if (kv->value.str.data) { - free(kv->value.str.data); + LM_GGML_FREE(kv->value.str.data); } } @@ -19273,16 +20294,16 @@ void lm_gguf_free(struct lm_gguf_context * ctx) { for (uint64_t j = 0; j < kv->value.arr.n; ++j) { struct lm_gguf_str * str = &((struct lm_gguf_str *) kv->value.arr.data)[j]; if (str->data) { - free(str->data); + LM_GGML_FREE(str->data); } } } - free(kv->value.arr.data); + LM_GGML_FREE(kv->value.arr.data); } } } - free(ctx->kv); + LM_GGML_FREE(ctx->kv); } if (ctx->infos) { @@ -19290,11 +20311,11 @@ void lm_gguf_free(struct lm_gguf_context * ctx) { struct lm_gguf_tensor_info * info = &ctx->infos[i]; if (info->name.data) { - free(info->name.data); + LM_GGML_FREE(info->name.data); } } - free(ctx->infos); + LM_GGML_FREE(ctx->infos); } LM_GGML_ALIGNED_FREE(ctx); @@ -19595,8 +20616,8 @@ void lm_gguf_set_arr_data(struct lm_gguf_context * ctx, const char * key, enum l ctx->kv[idx].type = LM_GGUF_TYPE_ARRAY; ctx->kv[idx].value.arr.type = type; ctx->kv[idx].value.arr.n = n; - ctx->kv[idx].value.arr.data = malloc(n*LM_GGUF_TYPE_SIZE[type]); - memcpy(ctx->kv[idx].value.arr.data, data, n*LM_GGUF_TYPE_SIZE[type]); + ctx->kv[idx].value.arr.data = LM_GGML_MALLOC(n*lm_gguf_type_size(type)); + memcpy(ctx->kv[idx].value.arr.data, data, n*lm_gguf_type_size(type)); } void lm_gguf_set_arr_str(struct lm_gguf_context * ctx, const char * key, const char ** data, int n) { @@ -19605,7 +20626,7 @@ void lm_gguf_set_arr_str(struct lm_gguf_context * ctx, const char * key, const c ctx->kv[idx].type = LM_GGUF_TYPE_ARRAY; ctx->kv[idx].value.arr.type = LM_GGUF_TYPE_STRING; ctx->kv[idx].value.arr.n = n; - ctx->kv[idx].value.arr.data = malloc(n*sizeof(struct lm_gguf_str)); + ctx->kv[idx].value.arr.data = LM_GGML_MALLOC(n*sizeof(struct lm_gguf_str)); for (int i = 0; i < n; i++) { struct lm_gguf_str * str = &((struct lm_gguf_str *)ctx->kv[idx].value.arr.data)[i]; str->n = strlen(data[i]); @@ -19632,19 +20653,19 @@ void lm_gguf_set_kv(struct lm_gguf_context * ctx, struct lm_gguf_context * src) case LM_GGUF_TYPE_ARRAY: { if (src->kv[i].value.arr.type == LM_GGUF_TYPE_STRING) { - const char ** data = malloc(src->kv[i].value.arr.n*sizeof(char *)); + const char ** data = LM_GGML_MALLOC(src->kv[i].value.arr.n*sizeof(char *)); for (uint32_t j = 0; j < src->kv[i].value.arr.n; j++) { data[j] = ((struct lm_gguf_str *)src->kv[i].value.arr.data)[j].data; } lm_gguf_set_arr_str(ctx, src->kv[i].key.data, data, src->kv[i].value.arr.n); - free((void *)data); + LM_GGML_FREE((void *)data); } else if (src->kv[i].value.arr.type == LM_GGUF_TYPE_ARRAY) { LM_GGML_ASSERT(false && "nested arrays not supported"); } else { lm_gguf_set_arr_data(ctx, src->kv[i].key.data, src->kv[i].value.arr.type, src->kv[i].value.arr.data, src->kv[i].value.arr.n); } } break; - case LM_GGUF_TYPE_COUNT: LM_GGML_ASSERT(false && "invalid type"); break; + default: LM_GGML_ASSERT(false && "invalid type"); break; } } } @@ -19720,7 +20741,7 @@ struct lm_gguf_buf { static struct lm_gguf_buf lm_gguf_buf_init(size_t size) { struct lm_gguf_buf buf = { - /*buf.data =*/ size == 0 ? NULL : malloc(size), + /*buf.data =*/ size == 0 ? 
NULL : LM_GGML_MALLOC(size), /*buf.size =*/ size, /*buf.offset =*/ 0, }; @@ -19730,7 +20751,7 @@ static struct lm_gguf_buf lm_gguf_buf_init(size_t size) { static void lm_gguf_buf_free(struct lm_gguf_buf buf) { if (buf.data) { - free(buf.data); + LM_GGML_FREE(buf.data); } } @@ -19811,7 +20832,7 @@ static void lm_gguf_write_to_buf(const struct lm_gguf_context * ctx, struct lm_g case LM_GGUF_TYPE_FLOAT64: case LM_GGUF_TYPE_BOOL: { - lm_gguf_bwrite_el(buf, kv->value.arr.data, kv->value.arr.n * LM_GGUF_TYPE_SIZE[kv->value.arr.type]); + lm_gguf_bwrite_el(buf, kv->value.arr.data, kv->value.arr.n * lm_gguf_type_size(kv->value.arr.type)); } break; case LM_GGUF_TYPE_STRING: { @@ -19820,10 +20841,10 @@ static void lm_gguf_write_to_buf(const struct lm_gguf_context * ctx, struct lm_g } } break; case LM_GGUF_TYPE_ARRAY: - case LM_GGUF_TYPE_COUNT: LM_GGML_ASSERT(false && "invalid type"); break; + default: LM_GGML_ASSERT(false && "invalid type"); break; } } break; - case LM_GGUF_TYPE_COUNT: LM_GGML_ASSERT(false && "invalid type"); + default: LM_GGML_ASSERT(false && "invalid type"); } } @@ -20024,7 +21045,7 @@ int lm_ggml_cpu_has_wasm_simd(void) { } int lm_ggml_cpu_has_blas(void) { -#if defined(LM_GGML_USE_ACCELERATE) || defined(LM_GGML_USE_OPENBLAS) || defined(LM_GGML_USE_CUBLAS) || defined(LM_GGML_USE_CLBLAST) +#if defined(LM_GGML_USE_ACCELERATE) || defined(LM_GGML_USE_OPENBLAS) || defined(LM_GGML_USE_CUBLAS) || defined(LM_GGML_USE_VULKAN) || defined(LM_GGML_USE_CLBLAST) || defined(LM_GGML_USE_SYCL) return 1; #else return 0; @@ -20047,8 +21068,33 @@ int lm_ggml_cpu_has_clblast(void) { #endif } +int lm_ggml_cpu_has_vulkan(void) { +#if defined(LM_GGML_USE_VULKAN) + return 1; +#else + return 0; +#endif +} + +int lm_ggml_cpu_has_kompute(void) { +#if defined(LM_GGML_USE_KOMPUTE) + return 1; +#else + return 0; +#endif +} + +int lm_ggml_cpu_has_sycl(void) { +#if defined(LM_GGML_USE_SYCL) + return 1; +#else + return 0; +#endif +} + int lm_ggml_cpu_has_gpublas(void) { - return lm_ggml_cpu_has_cublas() || lm_ggml_cpu_has_clblast(); + return lm_ggml_cpu_has_cublas() || lm_ggml_cpu_has_clblast() || lm_ggml_cpu_has_vulkan() || lm_ggml_cpu_has_kompute() || + lm_ggml_cpu_has_sycl(); } int lm_ggml_cpu_has_sse3(void) { @@ -20075,4 +21121,12 @@ int lm_ggml_cpu_has_vsx(void) { #endif } +int lm_ggml_cpu_has_matmul_int8(void) { +#if defined(__ARM_FEATURE_MATMUL_INT8) + return 1; +#else + return 0; +#endif +} + //////////////////////////////////////////////////////////////////////////////// diff --git a/cpp/ggml.h b/cpp/ggml.h index 08cfdbb1..cb359a93 100644 --- a/cpp/ggml.h +++ b/cpp/ggml.h @@ -353,6 +353,9 @@ extern "C" { LM_GGML_TYPE_Q8_K = 15, LM_GGML_TYPE_IQ2_XXS = 16, LM_GGML_TYPE_IQ2_XS = 17, + LM_GGML_TYPE_IQ3_XXS = 18, + LM_GGML_TYPE_IQ1_S = 19, + LM_GGML_TYPE_IQ4_NL = 20, LM_GGML_TYPE_I8, LM_GGML_TYPE_I16, LM_GGML_TYPE_I32, @@ -389,6 +392,9 @@ extern "C" { LM_GGML_FTYPE_MOSTLY_Q6_K = 14, // except 1d tensors LM_GGML_FTYPE_MOSTLY_IQ2_XXS = 15, // except 1d tensors LM_GGML_FTYPE_MOSTLY_IQ2_XS = 16, // except 1d tensors + LM_GGML_FTYPE_MOSTLY_IQ3_XXS = 17, // except 1d tensors + LM_GGML_FTYPE_MOSTLY_IQ1_S = 18, // except 1d tensors + LM_GGML_FTYPE_MOSTLY_IQ4_NL = 19, // except 1d tensors }; // available tensor operations: @@ -489,6 +495,8 @@ extern "C" { LM_GGML_UNARY_OP_GELU, LM_GGML_UNARY_OP_GELU_QUICK, LM_GGML_UNARY_OP_SILU, + LM_GGML_UNARY_OP_HARDSWISH, + LM_GGML_UNARY_OP_HARDSIGMOID, LM_GGML_UNARY_OP_COUNT, }; @@ -501,11 +509,17 @@ extern "C" { enum lm_ggml_log_level { LM_GGML_LOG_LEVEL_ERROR = 2, - 
LM_GGML_LOG_LEVEL_WARN = 3, - LM_GGML_LOG_LEVEL_INFO = 4, + LM_GGML_LOG_LEVEL_WARN = 3, + LM_GGML_LOG_LEVEL_INFO = 4, LM_GGML_LOG_LEVEL_DEBUG = 5 }; + enum lm_ggml_tensor_flag { + LM_GGML_TENSOR_FLAG_INPUT = 1, + LM_GGML_TENSOR_FLAG_OUTPUT = 2, + LM_GGML_TENSOR_FLAG_PARAM = 4, + }; + // ggml object struct lm_ggml_object { size_t offs; @@ -539,7 +553,7 @@ extern "C" { // op params - allocated as int32_t for alignment int32_t op_params[LM_GGML_MAX_OP_PARAMS / sizeof(int32_t)]; - bool is_param; + int32_t flags; struct lm_ggml_tensor * grad; struct lm_ggml_tensor * src[LM_GGML_MAX_SRC]; @@ -563,6 +577,11 @@ extern "C" { static const size_t LM_GGML_TENSOR_SIZE = sizeof(struct lm_ggml_tensor); + // Abort callback + // If not NULL, called before ggml computation + // If it returns true, the computation is aborted + typedef bool (*lm_ggml_abort_callback)(void * data); + // the compute plan that needs to be prepared for lm_ggml_graph_compute() // since https://github.com/ggerganov/ggml/issues/287 struct lm_ggml_cplan { @@ -572,8 +591,8 @@ extern "C" { int n_threads; // abort lm_ggml_graph_compute when true - bool (*abort_callback)(void * data); - void * abort_callback_data; + lm_ggml_abort_callback abort_callback; + void * abort_callback_data; }; enum lm_ggml_cgraph_eval_order { @@ -643,6 +662,16 @@ extern "C" { void * wdata; }; + // numa strategies + enum lm_ggml_numa_strategy { + LM_GGML_NUMA_STRATEGY_DISABLED = 0, + LM_GGML_NUMA_STRATEGY_DISTRIBUTE = 1, + LM_GGML_NUMA_STRATEGY_ISOLATE = 2, + LM_GGML_NUMA_STRATEGY_NUMACTL = 3, + LM_GGML_NUMA_STRATEGY_MIRROR = 4, + LM_GGML_NUMA_STRATEGY_COUNT + }; + // misc LM_GGML_API void lm_ggml_time_init(void); // call this once at the beginning of the program @@ -653,7 +682,7 @@ extern "C" { LM_GGML_API void lm_ggml_print_backtrace(void); - LM_GGML_API void lm_ggml_numa_init(void); // call once for better performance on NUMA systems + LM_GGML_API void lm_ggml_numa_init(enum lm_ggml_numa_strategy numa); // call once for better performance on NUMA systems LM_GGML_API bool lm_ggml_is_numa(void); // true if init detected that system has >1 NUMA node LM_GGML_API void lm_ggml_print_object (const struct lm_ggml_object * obj); @@ -1032,6 +1061,16 @@ extern "C" { struct lm_ggml_tensor * a, struct lm_ggml_tensor * b); + // hardswish(x) = x * relu6(x + 3) / 6 + LM_GGML_API struct lm_ggml_tensor * lm_ggml_hardswish( + struct lm_ggml_context * ctx, + struct lm_ggml_tensor * a); + + // hardsigmoid(x) = relu6(x + 3) / 6 + LM_GGML_API struct lm_ggml_tensor * lm_ggml_hardsigmoid( + struct lm_ggml_context * ctx, + struct lm_ggml_tensor * a); + // normalize along rows LM_GGML_API struct lm_ggml_tensor * lm_ggml_norm( struct lm_ggml_context * ctx, @@ -1348,13 +1387,17 @@ extern "C" { struct lm_ggml_context * ctx, struct lm_ggml_tensor * a); - // fused soft_max(a*scale + mask) + // fused soft_max(a*scale + mask + pos[i]*(ALiBi slope)) // mask is optional + // pos is required when max_bias > 0.0f + // max_bias = 0.0f for no ALiBi LM_GGML_API struct lm_ggml_tensor * lm_ggml_soft_max_ext( struct lm_ggml_context * ctx, struct lm_ggml_tensor * a, struct lm_ggml_tensor * mask, - float scale); + struct lm_ggml_tensor * pos, + float scale, + float max_bias); LM_GGML_API struct lm_ggml_tensor * lm_ggml_soft_max_back( struct lm_ggml_context * ctx, @@ -1456,12 +1499,13 @@ extern "C" { // alibi position embedding // in-place, returns view(a) - LM_GGML_API struct lm_ggml_tensor * lm_ggml_alibi( + LM_GGML_DEPRECATED(LM_GGML_API struct lm_ggml_tensor * lm_ggml_alibi( struct lm_ggml_context * 
ctx, struct lm_ggml_tensor * a, int n_past, int n_head, - float bias_max); + float bias_max), + "use lm_ggml_soft_max_ext instead (will be removed in Mar 2024)"); // clamp // in-place, returns view(a) @@ -1481,7 +1525,19 @@ extern "C" { int p1, int d0, int d1, - bool is_2D); + bool is_2D, + enum lm_ggml_type dst_type); + + LM_GGML_API struct lm_ggml_tensor * lm_ggml_conv_depthwise_2d( + struct lm_ggml_context * ctx, + struct lm_ggml_tensor * a, + struct lm_ggml_tensor * b, + int s0, + int s1, + int p0, + int p1, + int d0, + int d1); LM_GGML_API struct lm_ggml_tensor * lm_ggml_conv_1d( struct lm_ggml_context * ctx, @@ -2061,6 +2117,12 @@ extern "C" { lm_ggml_opt_callback callback, void * callback_data); + // + // tensor flags + // + LM_GGML_API void lm_ggml_set_input(struct lm_ggml_tensor * tensor); + LM_GGML_API void lm_ggml_set_output(struct lm_ggml_tensor * tensor); + // // quantization // @@ -2240,10 +2302,14 @@ extern "C" { LM_GGML_API int lm_ggml_cpu_has_blas (void); LM_GGML_API int lm_ggml_cpu_has_cublas (void); LM_GGML_API int lm_ggml_cpu_has_clblast (void); + LM_GGML_API int lm_ggml_cpu_has_vulkan (void); + LM_GGML_API int lm_ggml_cpu_has_kompute (void); LM_GGML_API int lm_ggml_cpu_has_gpublas (void); LM_GGML_API int lm_ggml_cpu_has_sse3 (void); LM_GGML_API int lm_ggml_cpu_has_ssse3 (void); + LM_GGML_API int lm_ggml_cpu_has_sycl (void); LM_GGML_API int lm_ggml_cpu_has_vsx (void); + LM_GGML_API int lm_ggml_cpu_has_matmul_int8(void); // // Internal types and functions exposed for tests and benchmarks @@ -2257,7 +2323,8 @@ extern "C" { #endif typedef void (*lm_ggml_to_float_t) (const void * LM_GGML_RESTRICT x, float * LM_GGML_RESTRICT y, int k); typedef void (*lm_ggml_from_float_t)(const float * LM_GGML_RESTRICT x, void * LM_GGML_RESTRICT y, int k); - typedef void (*lm_ggml_vec_dot_t) (const int n, float * LM_GGML_RESTRICT s, const void * LM_GGML_RESTRICT x, const void * LM_GGML_RESTRICT y); + typedef void (*lm_ggml_vec_dot_t) (int n, float * LM_GGML_RESTRICT s, size_t bs, const void * LM_GGML_RESTRICT x, size_t bx, + const void * LM_GGML_RESTRICT y, size_t by, int nrc); typedef struct { const char * type_name; @@ -2269,6 +2336,7 @@ extern "C" { lm_ggml_from_float_t from_float_reference; lm_ggml_vec_dot_t vec_dot; enum lm_ggml_type vec_dot_type; + int64_t nrows; // number of rows to process simultaneously; } lm_ggml_type_traits_t; LM_GGML_API lm_ggml_type_traits_t lm_ggml_internal_get_type_traits(enum lm_ggml_type type); diff --git a/cpp/llama.cpp b/cpp/llama.cpp index eaa969ce..665d10a1 100644 --- a/cpp/llama.cpp +++ b/cpp/llama.cpp @@ -11,6 +11,12 @@ # include "ggml-cuda.h" #elif defined(LM_GGML_USE_CLBLAST) # include "ggml-opencl.h" +#elif defined(LM_GGML_USE_VULKAN) +# include "ggml-vulkan.h" +#elif defined(LM_GGML_USE_SYCL) +# include "ggml-sycl.h" +#elif defined(LM_GGML_USE_KOMPUTE) +# include "ggml-kompute.h" #endif #ifdef LM_GGML_USE_METAL @@ -52,6 +58,7 @@ #include #include #include +#include #include #include #include @@ -200,30 +207,46 @@ enum llm_arch { LLM_ARCH_STARCODER, LLM_ARCH_PERSIMMON, LLM_ARCH_REFACT, + LLM_ARCH_BERT, + LLM_ARCH_NOMIC_BERT, LLM_ARCH_BLOOM, LLM_ARCH_STABLELM, LLM_ARCH_QWEN, + LLM_ARCH_QWEN2, LLM_ARCH_PHI2, LLM_ARCH_PLAMO, + LLM_ARCH_CODESHELL, + LLM_ARCH_ORION, + LLM_ARCH_INTERNLM2, + LLM_ARCH_MINICPM, + LLM_ARCH_GEMMA, LLM_ARCH_UNKNOWN, }; -static std::map LLM_ARCH_NAMES = { - { LLM_ARCH_LLAMA, "llama" }, - { LLM_ARCH_FALCON, "falcon" }, - { LLM_ARCH_GPT2, "gpt2" }, - { LLM_ARCH_GPTJ, "gptj" }, - { LLM_ARCH_GPTNEOX, "gptneox" }, - { LLM_ARCH_MPT, 
"mpt" }, - { LLM_ARCH_BAICHUAN, "baichuan" }, - { LLM_ARCH_STARCODER, "starcoder" }, - { LLM_ARCH_PERSIMMON, "persimmon" }, - { LLM_ARCH_REFACT, "refact" }, - { LLM_ARCH_BLOOM, "bloom" }, - { LLM_ARCH_STABLELM, "stablelm" }, - { LLM_ARCH_QWEN, "qwen" }, - { LLM_ARCH_PHI2, "phi2" }, - { LLM_ARCH_PLAMO, "plamo" }, +static std::map LLM_ARCH_NAMES = { + { LLM_ARCH_LLAMA, "llama" }, + { LLM_ARCH_FALCON, "falcon" }, + { LLM_ARCH_GPT2, "gpt2" }, + { LLM_ARCH_GPTJ, "gptj" }, + { LLM_ARCH_GPTNEOX, "gptneox" }, + { LLM_ARCH_MPT, "mpt" }, + { LLM_ARCH_BAICHUAN, "baichuan" }, + { LLM_ARCH_STARCODER, "starcoder" }, + { LLM_ARCH_PERSIMMON, "persimmon" }, + { LLM_ARCH_REFACT, "refact" }, + { LLM_ARCH_BERT, "bert" }, + { LLM_ARCH_NOMIC_BERT, "nomic-bert" }, + { LLM_ARCH_BLOOM, "bloom" }, + { LLM_ARCH_STABLELM, "stablelm" }, + { LLM_ARCH_QWEN, "qwen" }, + { LLM_ARCH_QWEN2, "qwen2" }, + { LLM_ARCH_PHI2, "phi2" }, + { LLM_ARCH_PLAMO, "plamo" }, + { LLM_ARCH_CODESHELL, "codeshell" }, + { LLM_ARCH_ORION, "orion" }, + { LLM_ARCH_INTERNLM2, "internlm2" }, + { LLM_ARCH_MINICPM, "minicpm" }, + { LLM_ARCH_GEMMA, "gemma" }, }; enum llm_kv { @@ -246,6 +269,7 @@ enum llm_kv { LLM_KV_TENSOR_DATA_LAYOUT, LLM_KV_EXPERT_COUNT, LLM_KV_EXPERT_USED_COUNT, + LLM_KV_POOLING_TYPE, LLM_KV_ATTENTION_HEAD_COUNT, LLM_KV_ATTENTION_HEAD_COUNT_KV, @@ -255,6 +279,7 @@ enum llm_kv { LLM_KV_ATTENTION_VALUE_LENGTH, LLM_KV_ATTENTION_LAYERNORM_EPS, LLM_KV_ATTENTION_LAYERNORM_RMS_EPS, + LLM_KV_ATTENTION_CAUSAL, LLM_KV_ROPE_DIMENSION_COUNT, LLM_KV_ROPE_FREQ_BASE, @@ -267,6 +292,7 @@ enum llm_kv { LLM_KV_TOKENIZER_MODEL, LLM_KV_TOKENIZER_LIST, LLM_KV_TOKENIZER_TOKEN_TYPE, + LLM_KV_TOKENIZER_TOKEN_TYPE_COUNT, LLM_KV_TOKENIZER_SCORES, LLM_KV_TOKENIZER_MERGES, LLM_KV_TOKENIZER_BOS_ID, @@ -276,11 +302,12 @@ enum llm_kv { LLM_KV_TOKENIZER_PAD_ID, LLM_KV_TOKENIZER_ADD_BOS, LLM_KV_TOKENIZER_ADD_EOS, + LLM_KV_TOKENIZER_ADD_PREFIX, LLM_KV_TOKENIZER_HF_JSON, LLM_KV_TOKENIZER_RWKV, }; -static std::map LLM_KV_NAMES = { +static std::map LLM_KV_NAMES = { { LLM_KV_GENERAL_ARCHITECTURE, "general.architecture" }, { LLM_KV_GENERAL_QUANTIZATION_VERSION, "general.quantization_version" }, { LLM_KV_GENERAL_ALIGNMENT, "general.alignment" }, @@ -300,6 +327,7 @@ static std::map LLM_KV_NAMES = { { LLM_KV_TENSOR_DATA_LAYOUT, "%s.tensor_data_layout" }, { LLM_KV_EXPERT_COUNT, "%s.expert_count" }, { LLM_KV_EXPERT_USED_COUNT, "%s.expert_used_count" }, + { LLM_KV_POOLING_TYPE , "%s.pooling_type" }, { LLM_KV_ATTENTION_HEAD_COUNT, "%s.attention.head_count" }, { LLM_KV_ATTENTION_HEAD_COUNT_KV, "%s.attention.head_count_kv" }, @@ -309,6 +337,7 @@ static std::map LLM_KV_NAMES = { { LLM_KV_ATTENTION_VALUE_LENGTH, "%s.attention.value_length" }, { LLM_KV_ATTENTION_LAYERNORM_EPS, "%s.attention.layer_norm_epsilon" }, { LLM_KV_ATTENTION_LAYERNORM_RMS_EPS, "%s.attention.layer_norm_rms_epsilon" }, + { LLM_KV_ATTENTION_CAUSAL, "%s.attention.causal" }, { LLM_KV_ROPE_DIMENSION_COUNT, "%s.rope.dimension_count" }, { LLM_KV_ROPE_FREQ_BASE, "%s.rope.freq_base" }, @@ -321,6 +350,7 @@ static std::map LLM_KV_NAMES = { { LLM_KV_TOKENIZER_MODEL, "tokenizer.ggml.model" }, { LLM_KV_TOKENIZER_LIST, "tokenizer.ggml.tokens" }, { LLM_KV_TOKENIZER_TOKEN_TYPE, "tokenizer.ggml.token_type" }, + { LLM_KV_TOKENIZER_TOKEN_TYPE_COUNT, "tokenizer.ggml.token_type_count" }, { LLM_KV_TOKENIZER_SCORES, "tokenizer.ggml.scores" }, { LLM_KV_TOKENIZER_MERGES, "tokenizer.ggml.merges" }, { LLM_KV_TOKENIZER_BOS_ID, "tokenizer.ggml.bos_token_id" }, @@ -330,6 +360,7 @@ static std::map LLM_KV_NAMES = { { LLM_KV_TOKENIZER_PAD_ID, 
"tokenizer.ggml.padding_token_id" }, { LLM_KV_TOKENIZER_ADD_BOS, "tokenizer.ggml.add_bos_token" }, { LLM_KV_TOKENIZER_ADD_EOS, "tokenizer.ggml.add_eos_token" }, + { LLM_KV_TOKENIZER_ADD_PREFIX, "tokenizer.ggml.add_space_prefix" }, { LLM_KV_TOKENIZER_HF_JSON, "tokenizer.huggingface.json" }, { LLM_KV_TOKENIZER_RWKV, "tokenizer.rwkv.world" }, }; @@ -340,13 +371,14 @@ struct LLM_KV { llm_arch arch; std::string operator()(llm_kv kv) const { - return ::format(LLM_KV_NAMES[kv].c_str(), LLM_ARCH_NAMES[arch].c_str()); + return ::format(LLM_KV_NAMES[kv], LLM_ARCH_NAMES[arch]); } }; enum llm_tensor { LLM_TENSOR_TOKEN_EMBD, LLM_TENSOR_TOKEN_EMBD_NORM, + LLM_TENSOR_TOKEN_TYPES, LLM_TENSOR_POS_EMBD, LLM_TENSOR_OUTPUT, LLM_TENSOR_OUTPUT_NORM, @@ -358,6 +390,7 @@ enum llm_tensor { LLM_TENSOR_ATTN_OUT, LLM_TENSOR_ATTN_NORM, LLM_TENSOR_ATTN_NORM_2, + LLM_TENSOR_ATTN_OUT_NORM, LLM_TENSOR_ATTN_ROT_EMBD, LLM_TENSOR_FFN_GATE_INP, LLM_TENSOR_FFN_NORM, @@ -370,6 +403,7 @@ enum llm_tensor { LLM_TENSOR_FFN_UP_EXP, LLM_TENSOR_ATTN_Q_NORM, LLM_TENSOR_ATTN_K_NORM, + LLM_TENSOR_LAYER_OUT_NORM, }; static std::map> LLM_TENSOR_NAMES = { @@ -528,6 +562,38 @@ static std::map> LLM_TENSOR_NAMES = { LLM_TENSOR_FFN_UP, "blk.%d.ffn_up" }, }, }, + { + LLM_ARCH_BERT, + { + { LLM_TENSOR_TOKEN_EMBD, "token_embd" }, + { LLM_TENSOR_TOKEN_EMBD_NORM, "token_embd_norm" }, + { LLM_TENSOR_TOKEN_TYPES, "token_types" }, + { LLM_TENSOR_POS_EMBD, "position_embd" }, + { LLM_TENSOR_ATTN_OUT_NORM, "blk.%d.attn_output_norm" }, + { LLM_TENSOR_ATTN_Q, "blk.%d.attn_q" }, + { LLM_TENSOR_ATTN_K, "blk.%d.attn_k" }, + { LLM_TENSOR_ATTN_V, "blk.%d.attn_v" }, + { LLM_TENSOR_ATTN_OUT, "blk.%d.attn_output" }, + { LLM_TENSOR_LAYER_OUT_NORM, "blk.%d.layer_output_norm" }, + { LLM_TENSOR_FFN_DOWN, "blk.%d.ffn_down" }, + { LLM_TENSOR_FFN_UP, "blk.%d.ffn_up" }, + }, + }, + { + LLM_ARCH_NOMIC_BERT, + { + { LLM_TENSOR_TOKEN_EMBD, "token_embd" }, + { LLM_TENSOR_TOKEN_EMBD_NORM, "token_embd_norm" }, + { LLM_TENSOR_TOKEN_TYPES, "token_types" }, + { LLM_TENSOR_ATTN_OUT_NORM, "blk.%d.attn_output_norm" }, + { LLM_TENSOR_ATTN_QKV, "blk.%d.attn_qkv" }, + { LLM_TENSOR_ATTN_OUT, "blk.%d.attn_output" }, + { LLM_TENSOR_LAYER_OUT_NORM, "blk.%d.layer_output_norm" }, + { LLM_TENSOR_FFN_GATE, "blk.%d.ffn_gate" }, + { LLM_TENSOR_FFN_DOWN, "blk.%d.ffn_down" }, + { LLM_TENSOR_FFN_UP, "blk.%d.ffn_up" }, + }, + }, { LLM_ARCH_BLOOM, { @@ -577,6 +643,23 @@ static std::map> LLM_TENSOR_NAMES = { LLM_TENSOR_FFN_UP, "blk.%d.ffn_up" }, }, }, + { + LLM_ARCH_QWEN2, + { + { LLM_TENSOR_TOKEN_EMBD, "token_embd" }, + { LLM_TENSOR_OUTPUT_NORM, "output_norm" }, + { LLM_TENSOR_OUTPUT, "output" }, + { LLM_TENSOR_ATTN_NORM, "blk.%d.attn_norm" }, + { LLM_TENSOR_ATTN_Q, "blk.%d.attn_q" }, + { LLM_TENSOR_ATTN_K, "blk.%d.attn_k" }, + { LLM_TENSOR_ATTN_V, "blk.%d.attn_v" }, + { LLM_TENSOR_ATTN_OUT, "blk.%d.attn_output" }, + { LLM_TENSOR_FFN_NORM, "blk.%d.ffn_norm" }, + { LLM_TENSOR_FFN_GATE, "blk.%d.ffn_gate" }, + { LLM_TENSOR_FFN_DOWN, "blk.%d.ffn_down" }, + { LLM_TENSOR_FFN_UP, "blk.%d.ffn_up" }, + }, + }, { LLM_ARCH_PHI2, { @@ -611,7 +694,101 @@ static std::map> LLM_TENSOR_NAMES = { LLM_TENSOR_FFN_UP, "blk.%d.ffn_up" }, }, }, - + { + LLM_ARCH_CODESHELL, + { + { LLM_TENSOR_TOKEN_EMBD, "token_embd" }, + { LLM_TENSOR_OUTPUT_NORM, "output_norm" }, + { LLM_TENSOR_OUTPUT, "output" }, + { LLM_TENSOR_ROPE_FREQS, "rope_freqs" }, + { LLM_TENSOR_ATTN_NORM, "blk.%d.attn_norm" }, + { LLM_TENSOR_ATTN_Q, "blk.%d.attn_q" }, + { LLM_TENSOR_ATTN_K, "blk.%d.attn_k" }, + { LLM_TENSOR_ATTN_V, "blk.%d.attn_v" }, + { 
LLM_TENSOR_ATTN_QKV, "blk.%d.attn_qkv" }, + { LLM_TENSOR_ATTN_OUT, "blk.%d.attn_output" }, + { LLM_TENSOR_ATTN_ROT_EMBD, "blk.%d.attn_rot_embd" }, + { LLM_TENSOR_FFN_NORM, "blk.%d.ffn_norm" }, + { LLM_TENSOR_FFN_GATE, "blk.%d.ffn_gate" }, + { LLM_TENSOR_FFN_DOWN, "blk.%d.ffn_down" }, + { LLM_TENSOR_FFN_UP, "blk.%d.ffn_up" }, + }, + }, + { + LLM_ARCH_ORION, + { + { LLM_TENSOR_TOKEN_EMBD, "token_embd" }, + { LLM_TENSOR_OUTPUT_NORM, "output_norm" }, + { LLM_TENSOR_OUTPUT, "output" }, + { LLM_TENSOR_ROPE_FREQS, "rope_freqs" }, + { LLM_TENSOR_ATTN_NORM, "blk.%d.attn_norm" }, + { LLM_TENSOR_ATTN_Q, "blk.%d.attn_q" }, + { LLM_TENSOR_ATTN_K, "blk.%d.attn_k" }, + { LLM_TENSOR_ATTN_V, "blk.%d.attn_v" }, + { LLM_TENSOR_ATTN_OUT, "blk.%d.attn_output" }, + { LLM_TENSOR_ATTN_ROT_EMBD, "blk.%d.attn_rot_embd" }, + { LLM_TENSOR_FFN_NORM, "blk.%d.ffn_norm" }, + { LLM_TENSOR_FFN_GATE, "blk.%d.ffn_gate" }, + { LLM_TENSOR_FFN_DOWN, "blk.%d.ffn_down" }, + { LLM_TENSOR_FFN_UP, "blk.%d.ffn_up" }, + }, + }, + { + LLM_ARCH_INTERNLM2, + { + { LLM_TENSOR_TOKEN_EMBD, "token_embd" }, + { LLM_TENSOR_OUTPUT_NORM, "output_norm" }, + { LLM_TENSOR_OUTPUT, "output" }, + { LLM_TENSOR_ATTN_NORM, "blk.%d.attn_norm" }, + { LLM_TENSOR_ATTN_Q, "blk.%d.attn_q" }, + { LLM_TENSOR_ATTN_K, "blk.%d.attn_k" }, + { LLM_TENSOR_ATTN_V, "blk.%d.attn_v" }, + { LLM_TENSOR_ATTN_OUT, "blk.%d.attn_output" }, + { LLM_TENSOR_FFN_NORM, "blk.%d.ffn_norm" }, + { LLM_TENSOR_FFN_GATE, "blk.%d.ffn_gate" }, + { LLM_TENSOR_FFN_DOWN, "blk.%d.ffn_down" }, + { LLM_TENSOR_FFN_UP, "blk.%d.ffn_up" }, + }, + }, + { + LLM_ARCH_MINICPM, + { + { LLM_TENSOR_TOKEN_EMBD, "token_embd" }, + { LLM_TENSOR_OUTPUT_NORM, "output_norm" }, + { LLM_TENSOR_OUTPUT, "output" }, + { LLM_TENSOR_ROPE_FREQS, "rope_freqs" }, + { LLM_TENSOR_ATTN_NORM, "blk.%d.attn_norm" }, + { LLM_TENSOR_ATTN_Q, "blk.%d.attn_q" }, + { LLM_TENSOR_ATTN_K, "blk.%d.attn_k" }, + { LLM_TENSOR_ATTN_V, "blk.%d.attn_v" }, + { LLM_TENSOR_ATTN_OUT, "blk.%d.attn_output" }, + { LLM_TENSOR_ATTN_ROT_EMBD, "blk.%d.attn_rot_embd" }, + { LLM_TENSOR_FFN_GATE_INP, "blk.%d.ffn_gate_inp" }, + { LLM_TENSOR_FFN_NORM, "blk.%d.ffn_norm" }, + { LLM_TENSOR_FFN_GATE, "blk.%d.ffn_gate" }, + { LLM_TENSOR_FFN_DOWN, "blk.%d.ffn_down" }, + { LLM_TENSOR_FFN_UP, "blk.%d.ffn_up" }, + { LLM_TENSOR_FFN_GATE_EXP, "blk.%d.ffn_gate.%d" }, + { LLM_TENSOR_FFN_DOWN_EXP, "blk.%d.ffn_down.%d" }, + { LLM_TENSOR_FFN_UP_EXP, "blk.%d.ffn_up.%d" }, + }, + }, + { + LLM_ARCH_GEMMA, + { + { LLM_TENSOR_TOKEN_EMBD, "token_embd" }, + { LLM_TENSOR_OUTPUT_NORM, "output_norm" }, + { LLM_TENSOR_ATTN_NORM, "blk.%d.attn_norm" }, + { LLM_TENSOR_ATTN_Q, "blk.%d.attn_q" }, + { LLM_TENSOR_ATTN_K, "blk.%d.attn_k" }, + { LLM_TENSOR_ATTN_V, "blk.%d.attn_v" }, + { LLM_TENSOR_ATTN_OUT, "blk.%d.attn_output" }, + { LLM_TENSOR_FFN_NORM, "blk.%d.ffn_norm" }, + { LLM_TENSOR_FFN_GATE, "blk.%d.ffn_gate" }, + { LLM_TENSOR_FFN_DOWN, "blk.%d.ffn_down" }, + { LLM_TENSOR_FFN_UP, "blk.%d.ffn_up" }, + }, + }, { LLM_ARCH_UNKNOWN, { @@ -645,22 +822,37 @@ struct LLM_TN { llm_arch arch; std::string operator()(llm_tensor tensor) const { + if (LLM_TENSOR_NAMES[arch].find(tensor) == LLM_TENSOR_NAMES[arch].end()) { + return "__missing__"; + } return LLM_TENSOR_NAMES[arch].at(tensor); } std::string operator()(llm_tensor tensor, const std::string & suffix) const { + if (LLM_TENSOR_NAMES[arch].find(tensor) == LLM_TENSOR_NAMES[arch].end()) { + return "__missing__"; + } return LLM_TENSOR_NAMES[arch].at(tensor) + "." 
+ suffix; } std::string operator()(llm_tensor tensor, int bid) const { + if (LLM_TENSOR_NAMES[arch].find(tensor) == LLM_TENSOR_NAMES[arch].end()) { + return "__missing__"; + } return ::format(LLM_TENSOR_NAMES[arch].at(tensor).c_str(), bid); } std::string operator()(llm_tensor tensor, const std::string & suffix, int bid) const { + if (LLM_TENSOR_NAMES[arch].find(tensor) == LLM_TENSOR_NAMES[arch].end()) { + return "__missing__"; + } return ::format(LLM_TENSOR_NAMES[arch].at(tensor).c_str(), bid) + "." + suffix; } std::string operator()(llm_tensor tensor, const std::string & suffix, int bid, int xid) const { + if (LLM_TENSOR_NAMES[arch].find(tensor) == LLM_TENSOR_NAMES[arch].end()) { + return "__missing__"; + } return ::format(LLM_TENSOR_NAMES[arch].at(tensor).c_str(), bid, xid) + "." + suffix; } }; @@ -669,13 +861,13 @@ struct LLM_TN { // gguf helpers // -static std::map LLAMA_ROPE_SCALING_TYPES = { +static std::map LLAMA_ROPE_SCALING_TYPES = { { LLAMA_ROPE_SCALING_NONE, "none" }, { LLAMA_ROPE_SCALING_LINEAR, "linear" }, { LLAMA_ROPE_SCALING_YARN, "yarn" }, }; -static int8_t llama_rope_scaling_type_from_string(const std::string & name) { +static int32_t llama_rope_scaling_type_from_string(const std::string & name) { for (const auto & kv : LLAMA_ROPE_SCALING_TYPES) { if (kv.second == name) { return kv.first; @@ -871,7 +1063,7 @@ struct llama_mmap { int fd = fileno(file->fp); int flags = MAP_SHARED; // prefetch/readahead impairs performance on NUMA systems - if (numa) { prefetch = 0; } + if (numa) { prefetch = 0; } #ifdef __linux__ // advise the kernel to read the file sequentially (increases readahead) if (posix_fadvise(fd, 0, 0, POSIX_FADV_SEQUENTIAL)) { @@ -1102,10 +1294,10 @@ struct llama_mlock { #ifdef __APPLE__ #define MLOCK_SUGGESTION \ "Try increasing the sysctl values 'vm.user_wire_limit' and 'vm.global_user_wire_limit' and/or " \ - "decreasing 'vm.global_no_user_wire_amount'. Also try increasing RLIMIT_MLOCK (ulimit -l).\n" + "decreasing 'vm.global_no_user_wire_amount'. 
Also try increasing RLIMIT_MEMLOCK (ulimit -l).\n" #else #define MLOCK_SUGGESTION \ - "Try increasing RLIMIT_MLOCK ('ulimit -l' as root).\n" + "Try increasing RLIMIT_MEMLOCK ('ulimit -l' as root).\n" #endif bool raw_lock(const void * addr, size_t size) const { @@ -1226,8 +1418,14 @@ static lm_ggml_backend_buffer_type_t llama_default_buffer_type_cpu(bool host_buf if (host_buffer) { buft = lm_ggml_backend_cuda_host_buffer_type(); } +#elif defined(LM_GGML_USE_SYCL) + buft = lm_ggml_backend_sycl_host_buffer_type(); #elif defined(LM_GGML_USE_CPU_HBM) buft = lm_ggml_backend_cpu_hbm_buffer_type(); +#elif defined(LM_GGML_USE_VULKAN) + if (host_buffer) { + buft = lm_ggml_backend_vk_host_buffer_type(); + } #endif if (buft == nullptr) { @@ -1245,8 +1443,17 @@ static lm_ggml_backend_buffer_type_t llama_default_buffer_type_offload(int gpu) buft = lm_ggml_backend_metal_buffer_type(); #elif defined(LM_GGML_USE_CUBLAS) buft = lm_ggml_backend_cuda_buffer_type(gpu); +#elif defined(LM_GGML_USE_VULKAN) + buft = lm_ggml_backend_vk_buffer_type(gpu); +#elif defined(LM_GGML_USE_SYCL) + buft = lm_ggml_backend_sycl_buffer_type(gpu); #elif defined(LM_GGML_USE_CLBLAST) buft = lm_ggml_backend_opencl_buffer_type(); +#elif defined(LM_GGML_USE_KOMPUTE) + buft = lm_ggml_backend_kompute_buffer_type(gpu); + if (buft == nullptr) { + LLAMA_LOG_WARN("%s: cannot use GPU %d, check `vulkaninfo --summary`\n", __func__, gpu); + } #endif if (buft == nullptr) { @@ -1274,6 +1481,33 @@ static lm_ggml_backend_buffer_type_t llama_default_buffer_type_split(int fallbac LM_GGML_UNUSED(tensor_split); } +static size_t llama_get_device_count() { +#if defined(LM_GGML_USE_CUBLAS) + return lm_ggml_backend_cuda_get_device_count(); +#elif defined(LM_GGML_USE_VULKAN) + return lm_ggml_backend_vk_get_device_count(); +#else + return 1; +#endif +} + +static size_t llama_get_device_memory(int device) { +#if defined(LM_GGML_USE_CUBLAS) + size_t total; + size_t free; + lm_ggml_backend_cuda_get_device_memory(device, &total, &free); + return free; +#elif defined(LM_GGML_USE_VULKAN) + size_t total; + size_t free; + lm_ggml_backend_vk_get_device_memory(device, &total, &free); + return free; +#else + return 1; + LM_GGML_UNUSED(device); +#endif +} + // // globals // @@ -1295,12 +1529,23 @@ static llama_state g_state; // available llama models enum e_model { MODEL_UNKNOWN, + MODEL_17M, + MODEL_22M, + MODEL_33M, + MODEL_109M, + MODEL_137M, + MODEL_335M, + MODEL_0_5B, MODEL_1B, + MODEL_2B, MODEL_3B, + MODEL_4B, MODEL_7B, MODEL_8B, MODEL_13B, + MODEL_14B, MODEL_15B, + MODEL_20B, MODEL_30B, MODEL_34B, MODEL_40B, @@ -1318,6 +1563,7 @@ static const size_t GiB = 1024*MiB; struct llama_hparams { bool vocab_only; + bool rope_finetuned; uint32_t n_vocab; uint32_t n_ctx_train; // context size the model was trained on uint32_t n_embd; @@ -1330,6 +1576,7 @@ struct llama_hparams { uint32_t n_ff; uint32_t n_expert = 0; uint32_t n_expert_used = 0; + uint32_t n_vocab_type = 0; // for BERT-style token types float f_norm_eps; float f_norm_rms_eps; @@ -1337,12 +1584,15 @@ struct llama_hparams { float rope_freq_base_train; float rope_freq_scale_train; uint32_t n_yarn_orig_ctx; - int8_t rope_scaling_type_train : 3; - bool rope_finetuned : 1; + int32_t rope_scaling_type_train; - float f_clamp_kqv; - float f_max_alibi_bias; + float f_clamp_kqv = 0.0f; + float f_max_alibi_bias = 0.0f; + bool causal_attn = true; + bool need_kq_pos = false; + + uint32_t pooling_type = LLAMA_POOLING_NONE; bool operator!=(const llama_hparams & other) const { if (this->vocab_only != other.vocab_only) return 
true; @@ -1404,6 +1654,7 @@ struct llama_cparams { bool mul_mat_q; bool offload_kqv; + bool do_pooling; lm_ggml_backend_sched_eval_callback cb_eval; void * cb_eval_user_data; @@ -1419,6 +1670,8 @@ struct llama_layer { struct lm_ggml_tensor * attn_q_norm_b; struct lm_ggml_tensor * attn_k_norm; struct lm_ggml_tensor * attn_k_norm_b; + struct lm_ggml_tensor * attn_out_norm; + struct lm_ggml_tensor * attn_out_norm_b; // attention struct lm_ggml_tensor * wq; @@ -1437,6 +1690,8 @@ struct llama_layer { // normalization struct lm_ggml_tensor * ffn_norm; struct lm_ggml_tensor * ffn_norm_b; + struct lm_ggml_tensor * layer_out_norm; + struct lm_ggml_tensor * layer_out_norm_b; // ff struct lm_ggml_tensor * ffn_gate; // w1 @@ -1542,6 +1797,8 @@ struct llama_vocab { id special_suffix_id = 32008; id special_eot_id = 32010; + bool add_space_prefix = true; + int find_bpe_rank(const std::string & token_left, const std::string & token_right) const { LM_GGML_ASSERT(token_left.find(' ') == std::string::npos); LM_GGML_ASSERT(token_left.find('\n') == std::string::npos); @@ -1568,6 +1825,7 @@ struct llama_model { llama_vocab vocab; struct lm_ggml_tensor * tok_embd; + struct lm_ggml_tensor * type_embd; struct lm_ggml_tensor * pos_embd; struct lm_ggml_tensor * tok_norm; struct lm_ggml_tensor * tok_norm_b; @@ -1637,6 +1895,13 @@ struct llama_context { for (lm_ggml_backend_t backend : backends) { lm_ggml_backend_free(backend); } + +#ifdef LM_GGML_USE_VULKAN + lm_ggml_vk_free_cpu_assist(); +#endif + + lm_ggml_backend_buffer_free(buf_input); + lm_ggml_free(ctx_input); } llama_cparams cparams; @@ -1680,11 +1945,18 @@ struct llama_context { // memory buffers used to evaluate the model std::vector buf_compute_meta; lm_ggml_backend_sched_t sched = nullptr; - // allocator for the input tensors - lm_ggml_tallocr * alloc = nullptr; - // temporary buffer for copying data to/from the backend - std::vector> buf_copy; + // input tensors + lm_ggml_backend_buffer_t buf_input = nullptr; + lm_ggml_context * ctx_input = nullptr; + struct lm_ggml_tensor * inp_tokens; // I32 [n_batch] + struct lm_ggml_tensor * inp_embd; // F32 [n_embd, n_batch] + struct lm_ggml_tensor * inp_pos; // I32 [n_batch] + struct lm_ggml_tensor * inp_KQ_mask; // F32 [n_ctx, n_batch] + struct lm_ggml_tensor * inp_KQ_pos; // F32 [n_ctx] + struct lm_ggml_tensor * inp_K_shift; // I32 [n_ctx] + struct lm_ggml_tensor * inp_mean; // F32 [n_batch, n_batch] + struct lm_ggml_tensor * inp_cls; // I32 [n_batch] #ifdef LM_GGML_USE_MPI lm_ggml_mpi_context * ctx_mpi = NULL; @@ -2268,20 +2540,23 @@ struct llama_model_loader { } switch (type_max) { - case LM_GGML_TYPE_F32: ftype = LLAMA_FTYPE_ALL_F32; break; - case LM_GGML_TYPE_F16: ftype = LLAMA_FTYPE_MOSTLY_F16; break; - case LM_GGML_TYPE_Q4_0: ftype = LLAMA_FTYPE_MOSTLY_Q4_0; break; - case LM_GGML_TYPE_Q4_1: ftype = LLAMA_FTYPE_MOSTLY_Q4_1; break; - case LM_GGML_TYPE_Q5_0: ftype = LLAMA_FTYPE_MOSTLY_Q5_0; break; - case LM_GGML_TYPE_Q5_1: ftype = LLAMA_FTYPE_MOSTLY_Q5_1; break; - case LM_GGML_TYPE_Q8_0: ftype = LLAMA_FTYPE_MOSTLY_Q8_0; break; - case LM_GGML_TYPE_Q2_K: ftype = LLAMA_FTYPE_MOSTLY_Q2_K; break; - case LM_GGML_TYPE_Q3_K: ftype = LLAMA_FTYPE_MOSTLY_Q3_K_M; break; - case LM_GGML_TYPE_Q4_K: ftype = LLAMA_FTYPE_MOSTLY_Q4_K_M; break; - case LM_GGML_TYPE_Q5_K: ftype = LLAMA_FTYPE_MOSTLY_Q5_K_M; break; - case LM_GGML_TYPE_Q6_K: ftype = LLAMA_FTYPE_MOSTLY_Q6_K; break; + case LM_GGML_TYPE_F32: ftype = LLAMA_FTYPE_ALL_F32; break; + case LM_GGML_TYPE_F16: ftype = LLAMA_FTYPE_MOSTLY_F16; break; + case LM_GGML_TYPE_Q4_0: ftype 
= LLAMA_FTYPE_MOSTLY_Q4_0; break; + case LM_GGML_TYPE_Q4_1: ftype = LLAMA_FTYPE_MOSTLY_Q4_1; break; + case LM_GGML_TYPE_Q5_0: ftype = LLAMA_FTYPE_MOSTLY_Q5_0; break; + case LM_GGML_TYPE_Q5_1: ftype = LLAMA_FTYPE_MOSTLY_Q5_1; break; + case LM_GGML_TYPE_Q8_0: ftype = LLAMA_FTYPE_MOSTLY_Q8_0; break; + case LM_GGML_TYPE_Q2_K: ftype = LLAMA_FTYPE_MOSTLY_Q2_K; break; + case LM_GGML_TYPE_Q3_K: ftype = LLAMA_FTYPE_MOSTLY_Q3_K_M; break; + case LM_GGML_TYPE_Q4_K: ftype = LLAMA_FTYPE_MOSTLY_Q4_K_M; break; + case LM_GGML_TYPE_Q5_K: ftype = LLAMA_FTYPE_MOSTLY_Q5_K_M; break; + case LM_GGML_TYPE_Q6_K: ftype = LLAMA_FTYPE_MOSTLY_Q6_K; break; case LM_GGML_TYPE_IQ2_XXS: ftype = LLAMA_FTYPE_MOSTLY_IQ2_XXS; break; case LM_GGML_TYPE_IQ2_XS: ftype = LLAMA_FTYPE_MOSTLY_IQ2_XS; break; + case LM_GGML_TYPE_IQ3_XXS: ftype = LLAMA_FTYPE_MOSTLY_IQ3_XXS; break; + case LM_GGML_TYPE_IQ1_S: ftype = LLAMA_FTYPE_MOSTLY_IQ1_S; break; + case LM_GGML_TYPE_IQ4_NL: ftype = LLAMA_FTYPE_MOSTLY_IQ4_NL; break; default: { LLAMA_LOG_WARN("%s: unknown type %s\n", __func__, lm_ggml_type_name(type_max)); @@ -2527,13 +2802,7 @@ struct llama_model_loader { std::vector> read_buf; - for (int i = 0; i < lm_gguf_get_n_tensors(ctx_gguf); i++) { - struct lm_ggml_tensor * cur = lm_ggml_get_tensor(ctx, lm_gguf_get_tensor_name(ctx_gguf, i)); - if (!cur) { - // some tensors may be allocated in a different context - continue; - } - + for (struct lm_ggml_tensor * cur = lm_ggml_get_first_tensor(ctx); cur != NULL; cur = lm_ggml_get_next_tensor(ctx, cur)) { if (progress_callback) { if (!progress_callback((float) size_done / size_data, progress_callback_user_data)) { return false; @@ -2592,7 +2861,7 @@ struct llama_model_loader { // load LLaMA models // -static std::string llama_model_arch_name(llm_arch arch) { +static const char * llama_model_arch_name(llm_arch arch) { auto it = LLM_ARCH_NAMES.find(arch); if (it == LLM_ARCH_NAMES.end()) { return "unknown"; @@ -2627,8 +2896,12 @@ static std::string llama_model_ftype_name(llama_ftype ftype) { case LLAMA_FTYPE_MOSTLY_Q5_K_S: return "Q5_K - Small"; case LLAMA_FTYPE_MOSTLY_Q5_K_M: return "Q5_K - Medium"; case LLAMA_FTYPE_MOSTLY_Q6_K: return "Q6_K"; - case LLAMA_FTYPE_MOSTLY_IQ2_XXS:return "IQ2_XSS - 2.0625 bpw"; + case LLAMA_FTYPE_MOSTLY_IQ2_XXS:return "IQ2_XXS - 2.0625 bpw"; case LLAMA_FTYPE_MOSTLY_IQ2_XS: return "IQ2_XS - 2.3125 bpw"; + case LLAMA_FTYPE_MOSTLY_Q3_K_XS:return "Q3_K - Extra small"; + case LLAMA_FTYPE_MOSTLY_IQ3_XXS:return "IQ3_XXS - 3.0625 bpw"; + case LLAMA_FTYPE_MOSTLY_IQ1_S :return "IQ1_S - 1.5625 bpw"; + case LLAMA_FTYPE_MOSTLY_IQ4_NL: return "IQ4_NL - 4.5 bpw"; default: return "unknown, may not work"; } @@ -2636,12 +2909,20 @@ static std::string llama_model_ftype_name(llama_ftype ftype) { static const char * llama_model_type_name(e_model type) { switch (type) { + case MODEL_22M: return "22M"; + case MODEL_33M: return "33M"; + case MODEL_109M: return "109M"; + case MODEL_137M: return "137M"; + case MODEL_0_5B: return "0.5B"; case MODEL_1B: return "1B"; + case MODEL_2B: return "2B"; case MODEL_3B: return "3B"; case MODEL_7B: return "7B"; case MODEL_8B: return "8B"; case MODEL_13B: return "13B"; + case MODEL_14B: return "14B"; case MODEL_15B: return "15B"; + case MODEL_20B: return "20B"; case MODEL_30B: return "30B"; case MODEL_34B: return "34B"; case MODEL_40B: return "40B"; @@ -2654,6 +2935,15 @@ static const char * llama_model_type_name(e_model type) { default: return "?B"; } } +static const char * llama_model_vocab_type_name(enum llama_vocab_type type){ + switch (type) { + case 
LLAMA_VOCAB_TYPE_SPM: return "SPM"; + case LLAMA_VOCAB_TYPE_BPE: return "BPE"; + case LLAMA_VOCAB_TYPE_WPM: return "WPM"; + default: return "unknown"; + } +} + static void llm_load_arch(llama_model_loader & ml, llama_model & model) { model.arch = ml.get_arch(); @@ -2766,6 +3056,15 @@ static void llm_load_hparams( default: model.type = e_model::MODEL_UNKNOWN; } } break; + case LLM_ARCH_MINICPM: + { + ml.get_key(LLM_KV_ATTENTION_LAYERNORM_RMS_EPS, hparams.f_norm_rms_eps); + + switch (hparams.n_layer) { + case 40: model.type = e_model::MODEL_2B; break; + default: model.type = e_model::MODEL_UNKNOWN; + } + } break; case LLM_ARCH_FALCON: { ml.get_key(LLM_KV_ATTENTION_LAYERNORM_EPS, hparams.f_norm_eps); @@ -2784,6 +3083,11 @@ static void llm_load_hparams( case 40: model.type = e_model::MODEL_13B; break; default: model.type = e_model::MODEL_UNKNOWN; } + + if (model.type == e_model::MODEL_13B) { + // TODO: become GGUF KV parameter + hparams.f_max_alibi_bias = 8.0f; + } } break; case LLM_ARCH_STARCODER: { @@ -2811,6 +3115,41 @@ static void llm_load_hparams( case 32: model.type = e_model::MODEL_1B; break; default: model.type = e_model::MODEL_UNKNOWN; } + + // TODO: become GGUF KV parameter + hparams.f_max_alibi_bias = 8.0f; + } break; + case LLM_ARCH_BERT: + { + ml.get_key(LLM_KV_ATTENTION_LAYERNORM_EPS, hparams.f_norm_eps); + ml.get_key(LLM_KV_ATTENTION_CAUSAL, hparams.causal_attn); + ml.get_key(LLM_KV_TOKENIZER_TOKEN_TYPE_COUNT, hparams.n_vocab_type); + ml.get_key(LLM_KV_POOLING_TYPE, hparams.pooling_type); + + switch (hparams.n_layer) { + case 3: + model.type = e_model::MODEL_17M; break; // bge-micro + case 6: + model.type = e_model::MODEL_22M; break; // MiniLM-L6 + case 12: + switch (hparams.n_embd) { + case 384: model.type = e_model::MODEL_33M; break; // MiniLM-L12, bge-small + case 768: model.type = e_model::MODEL_109M; break; // bge-base + } break; + case 24: + model.type = e_model::MODEL_335M; break; // bge-large + } + } break; + case LLM_ARCH_NOMIC_BERT: + { + ml.get_key(LLM_KV_ATTENTION_LAYERNORM_EPS, hparams.f_norm_eps); + ml.get_key(LLM_KV_ATTENTION_CAUSAL, hparams.causal_attn); + ml.get_key(LLM_KV_TOKENIZER_TOKEN_TYPE_COUNT, hparams.n_vocab_type); + ml.get_key(LLM_KV_POOLING_TYPE, hparams.pooling_type); + + if (hparams.n_layer == 12 && hparams.n_embd == 768) { + model.type = e_model::MODEL_137M; + } } break; case LLM_ARCH_BLOOM: { @@ -2824,11 +3163,12 @@ static void llm_load_hparams( case 4096: model.type = e_model::MODEL_7B; break; } break; } + + // TODO: become GGUF KV parameter + hparams.f_max_alibi_bias = 8.0f; } break; case LLM_ARCH_MPT: { - hparams.f_clamp_kqv = 0.0f; - ml.get_key(LLM_KV_ATTENTION_LAYERNORM_EPS, hparams.f_norm_eps); ml.get_key(LLM_KV_ATTENTION_CLAMP_KQV, hparams.f_clamp_kqv, false); ml.get_key(LLM_KV_ATTENTION_MAX_ALIBI_BIAS, hparams.f_max_alibi_bias); @@ -2844,6 +3184,7 @@ static void llm_load_hparams( ml.get_key(LLM_KV_ATTENTION_LAYERNORM_EPS, hparams.f_norm_eps); switch (hparams.n_layer) { + case 24: model.type = e_model::MODEL_1B; break; case 32: model.type = e_model::MODEL_3B; break; default: model.type = e_model::MODEL_UNKNOWN; } @@ -2858,6 +3199,17 @@ static void llm_load_hparams( default: model.type = e_model::MODEL_UNKNOWN; } } break; + case LLM_ARCH_QWEN2: + { + ml.get_key(LLM_KV_ATTENTION_LAYERNORM_RMS_EPS, hparams.f_norm_rms_eps); + switch (hparams.n_layer) { + case 24: model.type = hparams.n_embd == 1024 ? e_model::MODEL_0_5B : e_model::MODEL_1B; break; + case 32: model.type = e_model::MODEL_7B; break; + case 40: model.type = hparams.n_head == 20 ? 
e_model::MODEL_4B : e_model::MODEL_13B; break; + case 80: model.type = e_model::MODEL_70B; break; + default: model.type = e_model::MODEL_UNKNOWN; + } + } break; case LLM_ARCH_PHI2: { ml.get_key(LLM_KV_ATTENTION_LAYERNORM_EPS, hparams.f_norm_eps); @@ -2888,11 +3240,50 @@ static void llm_load_hparams( default: model.type = e_model::MODEL_UNKNOWN; } } break; + case LLM_ARCH_CODESHELL: + { + ml.get_key(LLM_KV_ATTENTION_LAYERNORM_EPS, hparams.f_norm_eps); + switch (hparams.n_layer) { + case 42: model.type = e_model::MODEL_SMALL; break; + default: model.type = e_model::MODEL_UNKNOWN; + } + } break; + case LLM_ARCH_ORION: + { + ml.get_key(LLM_KV_ATTENTION_LAYERNORM_EPS, hparams.f_norm_eps); + + switch (hparams.n_layer) { + case 40: model.type = e_model::MODEL_14B; break; + default: model.type = e_model::MODEL_UNKNOWN; + } + } break; + case LLM_ARCH_INTERNLM2: + { + ml.get_key(LLM_KV_ATTENTION_LAYERNORM_RMS_EPS, hparams.f_norm_rms_eps); + switch (hparams.n_layer) { + case 32: model.type = e_model::MODEL_7B; break; + case 48: model.type = e_model::MODEL_20B; break; + default: model.type = e_model::MODEL_UNKNOWN; + } + } break; + case LLM_ARCH_GEMMA: + { + ml.get_key(LLM_KV_ATTENTION_LAYERNORM_RMS_EPS, hparams.f_norm_rms_eps); + switch (hparams.n_layer) { + case 18: model.type = e_model::MODEL_2B; break; + case 28: model.type = e_model::MODEL_7B; break; + default: model.type = e_model::MODEL_UNKNOWN; + } + } break; default: (void)0; } model.ftype = ml.ftype; + + if (hparams.f_max_alibi_bias > 0.0f) { + hparams.need_kq_pos = true; + } } // TODO: This should probably be in llama.h @@ -2940,6 +3331,11 @@ static void llm_load_vocab( vocab.special_unk_id = 0; vocab.special_sep_id = -1; vocab.special_pad_id = -1; + + const int add_space_prefix_keyidx = lm_gguf_find_key(ctx, kv(LLM_KV_TOKENIZER_ADD_PREFIX).c_str()); + if (add_space_prefix_keyidx != -1) { + vocab.add_space_prefix = lm_gguf_get_val_bool(ctx, add_space_prefix_keyidx); + } // The default value of add_space_prefix is true. } else if (tokenizer_name == "gpt2") { vocab.type = LLAMA_VOCAB_TYPE_BPE; @@ -2974,6 +3370,16 @@ static void llm_load_vocab( vocab.special_unk_id = -1; vocab.special_sep_id = -1; vocab.special_pad_id = -1; + } else if (tokenizer_name == "bert") { + vocab.type = LLAMA_VOCAB_TYPE_WPM; + + // default special tokens + vocab.special_bos_id = 101; + vocab.special_eos_id = 102; + vocab.special_unk_id = 100; + vocab.special_sep_id = -1; + vocab.special_pad_id = -1; + vocab.add_space_prefix = false; } else { LLAMA_LOG_WARN("%s: unknown tokenizer: '%s'", __func__, tokenizer_name.c_str()); LLAMA_LOG_WARN("%s: using default tokenizer: 'llama'", __func__); @@ -3001,7 +3407,14 @@ static void llm_load_vocab( // determine the newline token: LLaMA "<0x0A>" == 10 == '\n', Falcon 193 == '\n' if (vocab.type == LLAMA_VOCAB_TYPE_SPM) { - vocab.linefeed_id = llama_byte_to_token(vocab, '\n'); + try { + vocab.linefeed_id = llama_byte_to_token(vocab, '\n'); + } catch (const std::exception & e) { + LLAMA_LOG_WARN("%s: SPM vocabulary, but newline token not found: %s! 
Using special_pad_id instead.", __func__, e.what()); + vocab.linefeed_id = vocab.special_pad_id; + } + } else if (vocab.type == LLAMA_VOCAB_TYPE_WPM) { + vocab.linefeed_id = vocab.special_pad_id; } else { const std::vector ids = llama_tokenize_internal(vocab, "\u010A", false); LM_GGML_ASSERT(!ids.empty() && "model vocab missing newline token"); @@ -3147,12 +3560,12 @@ static void llm_load_print_meta(llama_model_loader & ml, llama_model & model) { const auto & hparams = model.hparams; const auto & vocab = model.vocab; - const auto rope_scaling_type = LLAMA_ROPE_SCALING_TYPES.at(hparams.rope_scaling_type_train); + const char * rope_scaling_type = LLAMA_ROPE_SCALING_TYPES.at(hparams.rope_scaling_type_train); // hparams LLAMA_LOG_INFO("%s: format = %s\n", __func__, llama_file_version_name(ml.fver)); - LLAMA_LOG_INFO("%s: arch = %s\n", __func__, LLM_ARCH_NAMES.at(model.arch).c_str()); - LLAMA_LOG_INFO("%s: vocab type = %s\n", __func__, vocab.type == LLAMA_VOCAB_TYPE_SPM ? "SPM" : "BPE"); // TODO: fix + LLAMA_LOG_INFO("%s: arch = %s\n", __func__, LLM_ARCH_NAMES.at(model.arch)); + LLAMA_LOG_INFO("%s: vocab type = %s\n", __func__, llama_model_vocab_type_name(vocab.type)); LLAMA_LOG_INFO("%s: n_vocab = %u\n", __func__, hparams.n_vocab); LLAMA_LOG_INFO("%s: n_merges = %u\n", __func__, (int) vocab.bpe_ranks.size()); LLAMA_LOG_INFO("%s: n_ctx_train = %u\n", __func__, hparams.n_ctx_train); @@ -3173,7 +3586,7 @@ static void llm_load_print_meta(llama_model_loader & ml, llama_model & model) { LLAMA_LOG_INFO("%s: n_ff = %u\n", __func__, hparams.n_ff); LLAMA_LOG_INFO("%s: n_expert = %u\n", __func__, hparams.n_expert); LLAMA_LOG_INFO("%s: n_expert_used = %u\n", __func__, hparams.n_expert_used); - LLAMA_LOG_INFO("%s: rope scaling = %s\n", __func__, rope_scaling_type.c_str()); + LLAMA_LOG_INFO("%s: rope scaling = %s\n", __func__, rope_scaling_type); LLAMA_LOG_INFO("%s: freq_base_train = %.1f\n", __func__, hparams.rope_freq_base_train); LLAMA_LOG_INFO("%s: freq_scale_train = %g\n", __func__, hparams.rope_freq_scale_train); LLAMA_LOG_INFO("%s: n_yarn_orig_ctx = %u\n", __func__, hparams.n_yarn_orig_ctx); @@ -3239,22 +3652,18 @@ static bool llm_load_tensors( model.buft_layer[i] = llama_default_buffer_type_cpu(true); } -#ifdef LM_GGML_USE_CUBLAS if (split_mode == LLAMA_SPLIT_LAYER) { // calculate the split points - int device_count = lm_ggml_backend_cuda_get_device_count(); + int device_count = llama_get_device_count(); bool all_zero = tensor_split == nullptr || std::all_of(tensor_split, tensor_split + device_count, [](float x) { return x == 0.0f; }); - float splits[LM_GGML_CUDA_MAX_DEVICES]; + std::vector splits(device_count); if (all_zero) { // default split, by free memory for (int i = 0; i < device_count; ++i) { - size_t total; - size_t free; - lm_ggml_backend_cuda_get_device_memory(i, &total, &free); - splits[i] = free; + splits[i] = llama_get_device_memory(i); } } else { - std::copy(tensor_split, tensor_split + device_count, splits); + std::copy(tensor_split, tensor_split + device_count, splits.begin()); } // sum and normalize the splits to get the split points @@ -3270,19 +3679,17 @@ static bool llm_load_tensors( // assign the repeating layers to the devices according to the splits int act_gpu_layers = std::min(n_gpu_layers, (int)n_layer + 1); for (int64_t i = i_gpu_start; i < n_layer; ++i) { - int layer_gpu = std::upper_bound(splits, splits + device_count, float(i - i_gpu_start)/act_gpu_layers) - splits; + int layer_gpu = std::upper_bound(splits.begin(), splits.begin() + device_count, float(i - 
i_gpu_start)/act_gpu_layers) - splits.begin(); model.buft_layer[i] = llama_default_buffer_type_offload(layer_gpu); } // assign the output layer if (n_gpu_layers > n_layer) { - int layer_gpu = std::upper_bound(splits, splits + device_count, float(act_gpu_layers - 1)/act_gpu_layers) - splits; + int layer_gpu = std::upper_bound(splits.begin(), splits.begin() + device_count, float(act_gpu_layers - 1)/act_gpu_layers) - splits.begin(); model.buft_output = llama_default_buffer_type_offload(layer_gpu); } else { model.buft_output = llama_default_buffer_type_cpu(true); } - } else -#endif - { + } else { lm_ggml_backend_buffer_type_t split_buft; if (split_mode == LLAMA_SPLIT_ROW) { split_buft = llama_default_buffer_type_split(main_gpu, tensor_split); @@ -3320,7 +3727,7 @@ static bool llm_load_tensors( } // create one context per buffer type - size_t ctx_size = lm_ggml_tensor_overhead()*ml.n_tensors; + size_t ctx_size = lm_ggml_tensor_overhead()*(ml.n_tensors + 1); // +1 for models where tok_embd is duplicated as output std::map ctx_map; for (auto & it : buft_layer_count) { struct lm_ggml_init_params params = { @@ -3345,6 +3752,7 @@ static bool llm_load_tensors( const int64_t n_embd_v_gqa = hparams.n_embd_v_gqa(); const int64_t n_embd_gqa = n_embd_v_gqa; const int64_t n_vocab = hparams.n_vocab; + const int64_t n_vocab_type = hparams.n_vocab_type; const int64_t n_ff = hparams.n_ff; LM_GGML_ASSERT(n_embd_gqa == n_embd_k_gqa); @@ -3361,13 +3769,16 @@ static bool llm_load_tensors( switch (model.arch) { case LLM_ARCH_LLAMA: case LLM_ARCH_REFACT: + case LLM_ARCH_MINICPM: { model.tok_embd = ml.create_tensor(ctx_input, tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}); // output { model.output_norm = ml.create_tensor(ctx_output, tn(LLM_TENSOR_OUTPUT_NORM, "weight"), {n_embd}); - model.output = ml.create_tensor(ctx_output_split, tn(LLM_TENSOR_OUTPUT, "weight"), {n_embd, n_vocab}); + if (model.arch != LLM_ARCH_MINICPM){ + model.output = ml.create_tensor(ctx_output_split, tn(LLM_TENSOR_OUTPUT, "weight"), {n_embd, n_vocab}); + } } for (int i = 0; i < n_layer; ++i) { @@ -3454,6 +3865,7 @@ static bool llm_load_tensors( } else { model.output = ml.create_tensor(ctx_output_split, tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}); // needs to be on GPU ml.n_created--; // artificial tensor + ml.size_data += lm_ggml_nbytes(model.output); } } @@ -3556,39 +3968,91 @@ static bool llm_load_tensors( layer.attn_k_norm_b = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_ATTN_K_NORM, "bias", i), {64}); } } break; - case LLM_ARCH_BLOOM: + case LLM_ARCH_BERT: + case LLM_ARCH_NOMIC_BERT: { - model.tok_embd = ml.create_tensor(ctx_input, tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}); - model.tok_norm = ml.create_tensor(ctx_input, tn(LLM_TENSOR_TOKEN_EMBD_NORM, "weight"), {n_embd}); - model.tok_norm_b = ml.create_tensor(ctx_input, tn(LLM_TENSOR_TOKEN_EMBD_NORM, "bias"), {n_embd}); - - // output - { - model.output_norm = ml.create_tensor(ctx_output, tn(LLM_TENSOR_OUTPUT_NORM, "weight"), {n_embd}); - model.output_norm_b = ml.create_tensor(ctx_output, tn(LLM_TENSOR_OUTPUT_NORM, "bias"), {n_embd}); - model.output = ml.create_tensor(ctx_output_split, tn(LLM_TENSOR_OUTPUT, "weight"), {n_embd, n_vocab}); + model.tok_embd = ml.create_tensor(ctx_input, tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}); + model.type_embd = ml.create_tensor(ctx_input, tn(LLM_TENSOR_TOKEN_TYPES, "weight"), {n_embd, n_vocab_type}); + if (model.arch == LLM_ARCH_BERT) { + model.pos_embd = ml.create_tensor(ctx_input, tn(LLM_TENSOR_POS_EMBD, 
"weight"), {n_embd, hparams.n_ctx_train}); } + model.tok_norm = ml.create_tensor(ctx_output, tn(LLM_TENSOR_TOKEN_EMBD_NORM, "weight"), {n_embd}); + model.tok_norm_b = ml.create_tensor(ctx_output, tn(LLM_TENSOR_TOKEN_EMBD_NORM, "bias"), {n_embd}); + for (int i = 0; i < n_layer; ++i) { lm_ggml_context * ctx_layer = ctx_for_layer(i); lm_ggml_context * ctx_split = ctx_for_layer_split(i); auto & layer = model.layers[i]; - layer.attn_norm = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_ATTN_NORM, "weight", i), {n_embd}); - layer.attn_norm_b = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_ATTN_NORM, "bias", i), {n_embd}); + if (model.arch == LLM_ARCH_BERT) { + layer.wq = ml.create_tensor(ctx_split, tn(LLM_TENSOR_ATTN_Q, "weight", i), {n_embd, n_embd}); + layer.bq = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_ATTN_Q, "bias", i), {n_embd}); - layer.wqkv = ml.create_tensor(ctx_split, tn(LLM_TENSOR_ATTN_QKV, "weight", i), {n_embd, n_embd + 2*n_embd_gqa}); - layer.bqkv = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_ATTN_QKV, "bias", i), {n_embd + 2*n_embd_gqa}); + layer.wk = ml.create_tensor(ctx_split, tn(LLM_TENSOR_ATTN_K, "weight", i), {n_embd, n_embd_gqa}); + layer.bk = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_ATTN_K, "bias", i), {n_embd_gqa}); - layer.wo = ml.create_tensor(ctx_split, tn(LLM_TENSOR_ATTN_OUT, "weight", i), {n_embd, n_embd}); - layer.bo = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_ATTN_OUT, "bias", i), {n_embd}); + layer.wv = ml.create_tensor(ctx_split, tn(LLM_TENSOR_ATTN_V, "weight", i), {n_embd, n_embd_gqa}); + layer.bv = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_ATTN_V, "bias", i), {n_embd_gqa}); + } else { + layer.wqkv = ml.create_tensor(ctx_split, tn(LLM_TENSOR_ATTN_QKV, "weight", i), {n_embd, n_embd + 2*n_embd_gqa}); + } - layer.ffn_norm = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_FFN_NORM, "weight", i), {n_embd}); - layer.ffn_norm_b = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_FFN_NORM, "bias", i), {n_embd}); + layer.wo = ml.create_tensor(ctx_split, tn(LLM_TENSOR_ATTN_OUT, "weight", i), {n_embd, n_embd}); - layer.ffn_down = ml.create_tensor(ctx_split, tn(LLM_TENSOR_FFN_DOWN, "weight", i), {n_ff, n_embd}); - layer.ffn_down_b = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_FFN_DOWN, "bias", i), {n_embd}); + layer.attn_out_norm = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_ATTN_OUT_NORM, "weight", i), {n_embd}); + layer.attn_out_norm_b = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_ATTN_OUT_NORM, "bias", i), {n_embd}); + + layer.ffn_up = ml.create_tensor(ctx_split, tn(LLM_TENSOR_FFN_UP, "weight", i), {n_embd, n_ff}); + layer.ffn_down = ml.create_tensor(ctx_split, tn(LLM_TENSOR_FFN_DOWN, "weight", i), {n_ff, n_embd}); + + if (model.arch == LLM_ARCH_BERT) { + layer.bo = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_ATTN_OUT, "bias", i), {n_embd}); + layer.ffn_up_b = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_FFN_UP, "bias", i), {n_ff}); + + layer.ffn_down_b = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_FFN_DOWN, "bias", i), {n_embd}); + } else { + layer.ffn_gate = ml.create_tensor(ctx_split, tn(LLM_TENSOR_FFN_GATE, "weight", i), {n_embd, n_ff}); + } + + layer.layer_out_norm = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_LAYER_OUT_NORM, "weight", i), {n_embd}); + layer.layer_out_norm_b = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_LAYER_OUT_NORM, "bias", i), {n_embd}); + } + } break; + case LLM_ARCH_BLOOM: + { + model.tok_embd = ml.create_tensor(ctx_input, tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}); + model.tok_norm = ml.create_tensor(ctx_output, tn(LLM_TENSOR_TOKEN_EMBD_NORM, "weight"), {n_embd}); + 
model.tok_norm_b = ml.create_tensor(ctx_output, tn(LLM_TENSOR_TOKEN_EMBD_NORM, "bias"), {n_embd}); + + // output + { + model.output_norm = ml.create_tensor(ctx_output, tn(LLM_TENSOR_OUTPUT_NORM, "weight"), {n_embd}); + model.output_norm_b = ml.create_tensor(ctx_output, tn(LLM_TENSOR_OUTPUT_NORM, "bias"), {n_embd}); + model.output = ml.create_tensor(ctx_output_split, tn(LLM_TENSOR_OUTPUT, "weight"), {n_embd, n_vocab}); + } + + for (int i = 0; i < n_layer; ++i) { + lm_ggml_context * ctx_layer = ctx_for_layer(i); + lm_ggml_context * ctx_split = ctx_for_layer_split(i); + + auto & layer = model.layers[i]; + + layer.attn_norm = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_ATTN_NORM, "weight", i), {n_embd}); + layer.attn_norm_b = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_ATTN_NORM, "bias", i), {n_embd}); + + layer.wqkv = ml.create_tensor(ctx_split, tn(LLM_TENSOR_ATTN_QKV, "weight", i), {n_embd, n_embd + 2*n_embd_gqa}); + layer.bqkv = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_ATTN_QKV, "bias", i), {n_embd + 2*n_embd_gqa}); + + layer.wo = ml.create_tensor(ctx_split, tn(LLM_TENSOR_ATTN_OUT, "weight", i), {n_embd, n_embd}); + layer.bo = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_ATTN_OUT, "bias", i), {n_embd}); + + layer.ffn_norm = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_FFN_NORM, "weight", i), {n_embd}); + layer.ffn_norm_b = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_FFN_NORM, "bias", i), {n_embd}); + + layer.ffn_down = ml.create_tensor(ctx_split, tn(LLM_TENSOR_FFN_DOWN, "weight", i), {n_ff, n_embd}); + layer.ffn_down_b = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_FFN_DOWN, "bias", i), {n_embd}); layer.ffn_up = ml.create_tensor(ctx_split, tn(LLM_TENSOR_FFN_UP, "weight", i), {n_embd, n_ff}); layer.ffn_up_b = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_FFN_UP, "bias", i), {n_ff}); @@ -3648,6 +4112,11 @@ static bool llm_load_tensors( layer.wv = ml.create_tensor(ctx_split, tn(LLM_TENSOR_ATTN_V, "weight", i), {n_embd, n_embd_gqa}); layer.wo = ml.create_tensor(ctx_split, tn(LLM_TENSOR_ATTN_OUT, "weight", i), {n_embd, n_embd}); + // optional bias tensors, present in Stable LM 2 1.6B + layer.bq = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_ATTN_Q, "bias", i), {n_embd}, false); + layer.bk = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_ATTN_K, "bias", i), {n_embd_gqa}, false); + layer.bv = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_ATTN_V, "bias", i), {n_embd_gqa}, false); + layer.ffn_norm = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_FFN_NORM, "weight", i), {n_embd}); layer.ffn_norm_b = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_FFN_NORM, "bias", i), {n_embd}); @@ -3685,6 +4154,41 @@ static bool llm_load_tensors( layer.ffn_up = ml.create_tensor(ctx_split, tn(LLM_TENSOR_FFN_UP, "weight", i), {n_embd, n_ff/2}); } } break; + case LLM_ARCH_QWEN2: + { + model.tok_embd = ml.create_tensor(ctx_input, tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}); + + // output + { + model.output_norm = ml.create_tensor(ctx_output, tn(LLM_TENSOR_OUTPUT_NORM, "weight"), {n_embd}); + model.output = ml.create_tensor(ctx_output_split, tn(LLM_TENSOR_OUTPUT, "weight"), {n_embd, n_vocab}); + } + + for (int i = 0; i < n_layer; ++i) { + lm_ggml_context * ctx_layer = ctx_for_layer(i); + lm_ggml_context * ctx_split = ctx_for_layer_split(i); + + auto & layer = model.layers[i]; + + layer.attn_norm = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_ATTN_NORM, "weight", i), {n_embd}); + + layer.wq = ml.create_tensor(ctx_split, tn(LLM_TENSOR_ATTN_Q, "weight", i), {n_embd, n_embd}); + layer.wk = ml.create_tensor(ctx_split, tn(LLM_TENSOR_ATTN_K, "weight", i), 
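Editor's note (not part of the patch): the Stable LM 2 attention-bias tensors created above pass a trailing `false` flag, which in this loader appears to mark them as optional, so loading does not fail when the GGUF file omits them; the graph builders later guard the add with `if (model.layers[il].bq) { ... }`. A minimal standalone sketch of that optional-bias pattern, with plain vectors standing in for ggml tensors:

```cpp
// Compilable toy of the "weight is mandatory, bias may be missing" pattern.
// The vector math is a stand-in for lm_ggml_mul_mat / lm_ggml_add.
#include <cstdio>
#include <vector>

using vec = std::vector<float>;

static vec matvec(const std::vector<vec> & w, const vec & x) {
    vec y(w.size(), 0.0f);
    for (size_t i = 0; i < w.size(); ++i)
        for (size_t j = 0; j < x.size(); ++j)
            y[i] += w[i][j] * x[j];
    return y;
}

static vec project(const std::vector<vec> & w, const vec * bias, const vec & x) {
    vec y = matvec(w, x);
    if (bias) {                                   // optional tensor: add only if loaded
        for (size_t i = 0; i < y.size(); ++i) y[i] += (*bias)[i];
    }
    return y;
}

int main() {
    std::vector<vec> w = {{1, 0}, {0, 1}};
    vec x = {2, 3}, b = {10, 10};
    vec no_bias   = project(w, nullptr, x);       // model without attention biases
    vec with_bias = project(w, &b, x);            // model that ships Q/K/V biases
    std::printf("%.0f %.0f | %.0f %.0f\n", no_bias[0], no_bias[1], with_bias[0], with_bias[1]);
}
```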
{n_embd, n_embd_gqa}); + layer.wv = ml.create_tensor(ctx_split, tn(LLM_TENSOR_ATTN_V, "weight", i), {n_embd, n_embd_gqa}); + layer.wo = ml.create_tensor(ctx_split, tn(LLM_TENSOR_ATTN_OUT, "weight", i), {n_embd, n_embd}); + + // optional bias tensors + layer.bq = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_ATTN_Q, "bias", i), {n_embd}); + layer.bk = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_ATTN_K, "bias", i), {n_embd_gqa}); + layer.bv = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_ATTN_V, "bias", i), {n_embd_gqa}); + + layer.ffn_norm = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_FFN_NORM, "weight", i), {n_embd}); + + layer.ffn_gate = ml.create_tensor(ctx_split, tn(LLM_TENSOR_FFN_GATE, "weight", i), {n_embd, n_ff}); + layer.ffn_down = ml.create_tensor(ctx_split, tn(LLM_TENSOR_FFN_DOWN, "weight", i), { n_ff, n_embd}); + layer.ffn_up = ml.create_tensor(ctx_split, tn(LLM_TENSOR_FFN_UP, "weight", i), {n_embd, n_ff}); + } + } break; case LLM_ARCH_PHI2: { model.tok_embd = ml.create_tensor(ctx_input, tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}); @@ -3795,6 +4299,135 @@ static bool llm_load_tensors( layer.ffn_up_b = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_FFN_UP, "bias", i), {n_ff}); } } break; + case LLM_ARCH_CODESHELL: + { + model.tok_embd = ml.create_tensor(ctx_input, tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}); + + // output + { + model.output_norm = ml.create_tensor(ctx_output, tn(LLM_TENSOR_OUTPUT_NORM, "weight"), {n_embd}); + model.output_norm_b = ml.create_tensor(ctx_output, tn(LLM_TENSOR_OUTPUT_NORM, "bias"), {n_embd}); + model.output = ml.create_tensor(ctx_output_split, tn(LLM_TENSOR_OUTPUT, "weight"), {n_embd, n_vocab}); + } + + for (int i = 0; i < n_layer; ++i) { + lm_ggml_context * ctx_layer = ctx_for_layer(i); + lm_ggml_context * ctx_split = ctx_for_layer_split(i); + + auto & layer = model.layers[i]; + + layer.attn_norm = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_ATTN_NORM, "weight", i), {n_embd}); + layer.attn_norm_b = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_ATTN_NORM, "bias", i), {n_embd}); + + layer.wqkv = ml.create_tensor(ctx_split, tn(LLM_TENSOR_ATTN_QKV, "weight", i), {n_embd, n_embd + 2*n_embd_gqa}); + layer.bqkv = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_ATTN_QKV, "bias", i), {n_embd + 2*n_embd_gqa}); + + layer.wo = ml.create_tensor(ctx_split, tn(LLM_TENSOR_ATTN_OUT, "weight", i), {n_embd, n_embd}); + layer.bo = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_ATTN_OUT, "bias", i), {n_embd}); + + layer.ffn_norm = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_FFN_NORM, "weight", i), {n_embd}); + layer.ffn_norm_b = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_FFN_NORM, "bias", i), {n_embd}); + + layer.ffn_down = ml.create_tensor(ctx_split, tn(LLM_TENSOR_FFN_DOWN, "weight", i), {n_ff, n_embd}); + layer.ffn_down_b = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_FFN_DOWN, "bias", i), {n_embd}); + + layer.ffn_up = ml.create_tensor(ctx_split, tn(LLM_TENSOR_FFN_UP, "weight", i), {n_embd, n_ff}); + layer.ffn_up_b = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_FFN_UP, "bias", i), {n_ff}); + } + } break; + case LLM_ARCH_ORION: + { + model.tok_embd = ml.create_tensor(ctx_input, tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}); + { + model.output_norm = ml.create_tensor(ctx_output, tn(LLM_TENSOR_OUTPUT_NORM, "weight"), {n_embd}); + model.output_norm_b = ml.create_tensor(ctx_output, tn(LLM_TENSOR_OUTPUT_NORM, "bias"), {n_embd}); + model.output = ml.create_tensor(ctx_output_split, tn(LLM_TENSOR_OUTPUT, "weight"), {n_embd, n_vocab}); + } + for (int i = 0; i < n_layer; ++i) { + 
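Editor's note (not part of the patch): several architectures in this hunk (Phi-2, CodeShell, BLOOM) keep a single fused `attn_qkv` projection of width `n_embd + 2*n_embd_gqa`, and the graph builders further down slice it back into Q, K and V by row offset (0, `n_embd`, `n_embd + n_embd_gqa`, the same offsets used in the `lm_ggml_view_2d` calls). A self-contained toy version of that slicing with plain arrays in place of ggml views, assuming no grouped-query attention so `n_embd_gqa == n_embd`:

```cpp
#include <cstdio>
#include <vector>

int main() {
    const int n_embd = 4, n_embd_gqa = 4;
    // one token's worth of wqkv * x: a single vector of length n_embd + 2*n_embd_gqa
    std::vector<float> qkv(n_embd + 2 * n_embd_gqa);
    for (size_t i = 0; i < qkv.size(); ++i) qkv[i] = (float) i;

    const float * q = qkv.data();                       // rows [0, n_embd)
    const float * k = qkv.data() + n_embd;              // rows [n_embd, n_embd + n_embd_gqa)
    const float * v = qkv.data() + n_embd + n_embd_gqa; // remaining n_embd_gqa rows

    std::printf("q[0]=%.0f k[0]=%.0f v[0]=%.0f\n", q[0], k[0], v[0]);
}
```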
lm_ggml_context * ctx_layer = ctx_for_layer(i); + lm_ggml_context * ctx_split = ctx_for_layer_split(i); + + auto & layer = model.layers[i]; + + layer.attn_norm = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_ATTN_NORM, "weight", i), {n_embd}); + layer.attn_norm_b = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_ATTN_NORM, "bias", i), {n_embd}); + + layer.wq = ml.create_tensor(ctx_split, tn(LLM_TENSOR_ATTN_Q, "weight", i), {n_embd, n_embd}); + layer.wk = ml.create_tensor(ctx_split, tn(LLM_TENSOR_ATTN_K, "weight", i), {n_embd, n_embd_gqa}); + layer.wv = ml.create_tensor(ctx_split, tn(LLM_TENSOR_ATTN_V, "weight", i), {n_embd, n_embd_gqa}); + layer.wo = ml.create_tensor(ctx_split, tn(LLM_TENSOR_ATTN_OUT, "weight", i), {n_embd, n_embd}); + + layer.ffn_norm = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_FFN_NORM, "weight", i), {n_embd}); + layer.ffn_norm_b = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_FFN_NORM, "bias", i), {n_embd}); + + layer.ffn_gate = ml.create_tensor(ctx_split, tn(LLM_TENSOR_FFN_GATE, "weight", i), {n_embd, n_ff}); + layer.ffn_down = ml.create_tensor(ctx_split, tn(LLM_TENSOR_FFN_DOWN, "weight", i), { n_ff, n_embd}); + layer.ffn_up = ml.create_tensor(ctx_split, tn(LLM_TENSOR_FFN_UP, "weight", i), {n_embd, n_ff}); + } + } break; + case LLM_ARCH_INTERNLM2: + { + model.tok_embd = ml.create_tensor(ctx_input, tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}); + + // output + { + model.output_norm = ml.create_tensor(ctx_output, tn(LLM_TENSOR_OUTPUT_NORM, "weight"), {n_embd}); + model.output = ml.create_tensor(ctx_output_split, tn(LLM_TENSOR_OUTPUT, "weight"), {n_embd, n_vocab}); + } + + for (int i = 0; i < n_layer; ++i) { + lm_ggml_context * ctx_layer = ctx_for_layer(i); + lm_ggml_context * ctx_split = ctx_for_layer_split(i); + + auto & layer = model.layers[i]; + + layer.attn_norm = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_ATTN_NORM, "weight", i), {n_embd}); + // layer.wqkv = ml.create_tensor(ctx_split, tn(LLM_TENSOR_ATTN_QKV, "weight", i), {n_embd, n_embd + 2*n_embd_gqa}); + layer.wq = ml.create_tensor(ctx_split, tn(LLM_TENSOR_ATTN_Q, "weight", i), {n_embd, n_embd}); + layer.wk = ml.create_tensor(ctx_split, tn(LLM_TENSOR_ATTN_K, "weight", i), {n_embd, n_embd_gqa}); + layer.wv = ml.create_tensor(ctx_split, tn(LLM_TENSOR_ATTN_V, "weight", i), {n_embd, n_embd_gqa}); + + layer.wo = ml.create_tensor(ctx_split, tn(LLM_TENSOR_ATTN_OUT, "weight", i), {n_embd, n_embd}); + layer.ffn_norm = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_FFN_NORM, "weight", i), {n_embd}); + layer.ffn_gate = ml.create_tensor(ctx_split, tn(LLM_TENSOR_FFN_GATE, "weight", i), {n_embd, n_ff}); + layer.ffn_down = ml.create_tensor(ctx_split, tn(LLM_TENSOR_FFN_DOWN, "weight", i), { n_ff, n_embd}); + layer.ffn_up = ml.create_tensor(ctx_split, tn(LLM_TENSOR_FFN_UP, "weight", i), {n_embd, n_ff}); + } + } break; + case LLM_ARCH_GEMMA: + { + model.tok_embd = ml.create_tensor(ctx_input, tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}); + + // output + model.output_norm = ml.create_tensor(ctx_output, tn(LLM_TENSOR_OUTPUT_NORM, "weight"), {n_embd}); + model.output = ml.create_tensor(ctx_output, tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}); // same as tok_embd, duplicated to allow offloading + ml.n_created--; // artificial tensor + ml.size_data += lm_ggml_nbytes(model.output); + + const int64_t n_ff = hparams.n_ff; + const int64_t n_embd_head_k = hparams.n_embd_head_k; + const int64_t n_embd_k_gqa = hparams.n_embd_k_gqa(); + const int64_t n_embd_v_gqa = hparams.n_embd_v_gqa(); + + for (uint32_t i = 0; i < n_layer; ++i) 
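Editor's note (not part of the patch): the Gemma branch above loads no separate output matrix; `model.output` is created from `LLM_TENSOR_TOKEN_EMBD` ("same as tok_embd, duplicated to allow offloading") and the loader bookkeeping is compensated with `ml.n_created--` and `ml.size_data += lm_ggml_nbytes(model.output)`. In other words the LM head is tied to the token embeddings. A small standalone illustration of what tied output weights mean numerically:

```cpp
// Toy example: the same matrix that embeds token ids also produces the logits,
// logits[v] = dot(tok_embd[v], hidden). Plain nested vectors stand in for ggml tensors.
#include <cstdio>
#include <vector>

int main() {
    const int n_vocab = 4, n_embd = 3;
    std::vector<std::vector<float>> tok_embd = {        // one row per token id
        {0.1f, 0.2f, 0.3f}, {0.0f, 1.0f, 0.0f}, {0.5f, 0.5f, 0.5f}, {1.0f, 0.0f, 1.0f}};

    std::vector<float> hidden = {0.2f, 0.4f, 0.6f};     // final hidden state

    for (int v = 0; v < n_vocab; ++v) {
        float logit = 0.0f;
        for (int e = 0; e < n_embd; ++e) logit += tok_embd[v][e] * hidden[e];
        std::printf("token %d logit %.3f\n", v, logit);
    }
}
```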
{ + lm_ggml_context * ctx_layer = ctx_for_layer(i); + lm_ggml_context * ctx_split = ctx_for_layer_split(i); + + auto & layer = model.layers[i]; + + layer.attn_norm = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_ATTN_NORM, "weight", i), {n_embd}); + + layer.wq = ml.create_tensor(ctx_split, tn(LLM_TENSOR_ATTN_Q, "weight", i), {n_embd, n_embd_head_k * hparams.n_head}); + layer.wk = ml.create_tensor(ctx_split, tn(LLM_TENSOR_ATTN_K, "weight", i), {n_embd, n_embd_k_gqa}); + layer.wv = ml.create_tensor(ctx_split, tn(LLM_TENSOR_ATTN_V, "weight", i), {n_embd, n_embd_v_gqa}); + layer.wo = ml.create_tensor(ctx_split, tn(LLM_TENSOR_ATTN_OUT, "weight", i), {n_embd_head_k * hparams.n_head, n_embd}); + + layer.ffn_norm = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_FFN_NORM, "weight", i), {n_embd}); + layer.ffn_gate = ml.create_tensor(ctx_split, tn(LLM_TENSOR_FFN_GATE, "weight", i), {n_embd, n_ff}); + layer.ffn_up = ml.create_tensor(ctx_split, tn(LLM_TENSOR_FFN_UP, "weight", i), {n_embd, n_ff}); + layer.ffn_down = ml.create_tensor(ctx_split, tn(LLM_TENSOR_FFN_DOWN, "weight", i), { n_ff, n_embd}); + } + } break; default: throw std::runtime_error("unknown architecture"); } @@ -3847,8 +4480,7 @@ static bool llm_load_tensors( ctx_bufs.emplace_back(ctx, buf); } - // print memory requirements - { + if (llama_supports_gpu_offload()) { const int n_gpu = std::min(n_gpu_layers, int(hparams.n_layer)); LLAMA_LOG_INFO("%s: offloading %d repeating layers to GPU\n", __func__, n_gpu); @@ -3860,10 +4492,11 @@ static bool llm_load_tensors( const int max_offloadable_layers = hparams.n_layer + 1; LLAMA_LOG_INFO("%s: offloaded %d/%d layers to GPU\n", __func__, std::min(n_gpu_layers, max_offloadable_layers), max_backend_supported_layers); + } - for (lm_ggml_backend_buffer_t buf : model.bufs) { - LLAMA_LOG_INFO("%s: %10s buffer size = %8.2f MiB\n", __func__, lm_ggml_backend_buffer_name(buf), lm_ggml_backend_buffer_get_size(buf) / 1024.0 / 1024.0); - } + // print memory requirements + for (lm_ggml_backend_buffer_t buf : model.bufs) { + LLAMA_LOG_INFO("%s: %10s buffer size = %8.2f MiB\n", __func__, lm_ggml_backend_buffer_name(buf), lm_ggml_backend_buffer_get_size(buf) / 1024.0 / 1024.0); } // populate tensors_by_name @@ -3891,15 +4524,27 @@ static bool llm_load_tensors( } // Returns 0 on success, -1 on error, and -2 on cancellation via llama_progress_callback -static int llama_model_load(const std::string & fname, llama_model & model, const llama_model_params & params) { +static int llama_model_load(const std::string & fname, llama_model & model, llama_model_params & params) { try { llama_model_loader ml(fname, params.use_mmap, params.kv_overrides); model.hparams.vocab_only = params.vocab_only; - llm_load_arch (ml, model); - llm_load_hparams(ml, model); - llm_load_vocab (ml, model); + try { + llm_load_arch(ml, model); + } catch(const std::exception & e) { + throw std::runtime_error("error loading model architecture: " + std::string(e.what())); + } + try { + llm_load_hparams(ml, model); + } catch(const std::exception & e) { + throw std::runtime_error("error loading model hyperparameters: " + std::string(e.what())); + } + try { + llm_load_vocab(ml, model); + } catch(const std::exception & e) { + throw std::runtime_error("error loading model vocabulary: " + std::string(e.what())); + } llm_load_print_meta(ml, model); @@ -3912,6 +4557,22 @@ static int llama_model_load(const std::string & fname, llama_model & model, cons return 0; } +#ifdef LM_GGML_USE_KOMPUTE + if (params.n_gpu_layers > 0 && ( + !(model.arch == LLM_ARCH_LLAMA || model.arch 
== LLM_ARCH_FALCON) + || !( + model.ftype == LLAMA_FTYPE_ALL_F32 || + model.ftype == LLAMA_FTYPE_MOSTLY_F16 || + model.ftype == LLAMA_FTYPE_MOSTLY_Q4_0 || + model.ftype == LLAMA_FTYPE_MOSTLY_Q4_1 + ) + )) { + // TODO(cebtenzzre): propagate this error outside of llama_load_model_from_file + LLAMA_LOG_WARN("%s: disabling Kompute due to unsupported model arch or quantization\n", __func__); + params.n_gpu_layers = 0; + } +#endif + if (!llm_load_tensors( ml, model, params.n_gpu_layers, params.split_mode, params.main_gpu, params.tensor_split, params.use_mlock, params.progress_callback, params.progress_callback_user_data @@ -3960,22 +4621,24 @@ static struct lm_ggml_tensor * llm_build_inp_embd( const llama_hparams & hparams, const llama_batch & batch, struct lm_ggml_tensor * tok_embd, + struct lm_ggml_tensor * inp_tokens, + struct lm_ggml_tensor * inp_embd, const llm_build_cb & cb) { const int64_t n_embd = hparams.n_embd; struct lm_ggml_tensor * inpL; if (batch.token) { - struct lm_ggml_tensor * inp_tokens = lm_ggml_new_tensor_1d(ctx, LM_GGML_TYPE_I32, batch.n_tokens); + struct lm_ggml_tensor * inp_tokens_v = lm_ggml_view_1d(ctx, inp_tokens, batch.n_tokens, 0); cb(inp_tokens, "inp_tokens", -1); - inpL = lm_ggml_get_rows(ctx, tok_embd, inp_tokens); + inpL = lm_ggml_get_rows(ctx, tok_embd, inp_tokens_v); } else { #ifdef LM_GGML_USE_MPI LM_GGML_ASSERT(false && "not implemented"); #endif - inpL = lm_ggml_new_tensor_2d(ctx, LM_GGML_TYPE_F32, n_embd, batch.n_tokens); + inpL = lm_ggml_view_2d(ctx, inp_embd, n_embd, batch.n_tokens, inp_embd->nb[1], 0); } return inpL; @@ -3989,6 +4652,7 @@ static void llm_build_k_shift( const llama_cparams & cparams, const llama_kv_cache & kv, struct lm_ggml_cgraph * graph, + struct lm_ggml_tensor * K_shift, llm_rope_type type, int64_t n_ctx, float freq_base, @@ -4005,9 +4669,6 @@ static void llm_build_k_shift( const float beta_fast = cparams.yarn_beta_fast; const float beta_slow = cparams.yarn_beta_slow; - struct lm_ggml_tensor * K_shift = lm_ggml_new_tensor_1d(ctx, LM_GGML_TYPE_I32, n_ctx); - cb(K_shift, "K_shift", -1); - int rope_type = 0; switch (type) { @@ -4195,14 +4856,15 @@ static struct lm_ggml_tensor * llm_build_kqv( const llama_model & model, const llama_hparams & hparams, const llama_kv_cache & kv, + struct lm_ggml_cgraph * graph, struct lm_ggml_tensor * wo, struct lm_ggml_tensor * wo_b, struct lm_ggml_tensor * q_cur, struct lm_ggml_tensor * kq_mask, + struct lm_ggml_tensor * kq_pos, int64_t n_ctx, int32_t n_tokens, int32_t n_kv, - float max_alibi_bias, float kq_scale, const llm_build_cb & cb, int il) { @@ -4232,26 +4894,26 @@ static struct lm_ggml_tensor * llm_build_kqv( lm_ggml_mul_mat_set_prec(kq, LM_GGML_PREC_F32); } - if (max_alibi_bias > 0.0f) { - // temporary branch until we figure out how to handle lm_ggml_alibi through lm_ggml_add +#if defined(LM_GGML_USE_VULKAN) || defined(LM_GGML_USE_KOMPUTE) || defined(LM_GGML_USE_SYCL) +#pragma message("TODO: ALiBi support in lm_ggml_soft_max_ext is not implemented for Vulkan, Kompute, and SYCL") +#pragma message(" Falling back to lm_ggml_alibi(). 
Will become an error in Mar 2024") +#pragma message("ref: https://github.com/ggerganov/llama.cpp/pull/5488") + if (hparams.f_max_alibi_bias > 0.0f) { kq = lm_ggml_scale(ctx, kq, kq_scale); cb(kq, "kq_scaled", il); - if (max_alibi_bias > 0.0f) { - // TODO: n_head or n_head_kv - // TODO: K-shift is likely not working - // TODO: change to lm_ggml_add - kq = lm_ggml_alibi(ctx, kq, /*n_past*/ 0, n_head, max_alibi_bias); - cb(kq, "kq_scaled_alibi", il); - } + kq = lm_ggml_alibi(ctx, kq, /*n_past*/ 0, n_head, hparams.f_max_alibi_bias); + cb(kq, "kq_scaled_alibi", il); kq = lm_ggml_add(ctx, kq, kq_mask); cb(kq, "kq_masked", il); kq = lm_ggml_soft_max(ctx, kq); cb(kq, "kq_soft_max", il); - } else { - kq = lm_ggml_soft_max_ext(ctx, kq, kq_mask, kq_scale); + } else +#endif + { + kq = lm_ggml_soft_max_ext(ctx, kq, kq_mask, kq_pos, kq_scale, hparams.f_max_alibi_bias); cb(kq, "kq_soft_max_ext", il); } @@ -4273,6 +4935,8 @@ static struct lm_ggml_tensor * llm_build_kqv( struct lm_ggml_tensor * cur = lm_ggml_cont_2d(ctx, kqv_merged, n_embd_head_k*n_head, n_tokens); cb(cur, "kqv_merged_cont", il); + lm_ggml_build_forward_expand(graph, cur); + cur = lm_ggml_mul_mat(ctx, wo, cur); if (wo_b) { cb(cur, "kqv_wo", il); @@ -4285,8 +4949,46 @@ static struct lm_ggml_tensor * llm_build_kqv( return cur; } +static struct lm_ggml_tensor * llm_build_kv( + struct lm_ggml_context * ctx, + const llama_model & model, + const llama_hparams & hparams, + const llama_kv_cache & kv, + struct lm_ggml_cgraph * graph, + struct lm_ggml_tensor * wo, + struct lm_ggml_tensor * wo_b, + struct lm_ggml_tensor * k_cur, + struct lm_ggml_tensor * v_cur, + struct lm_ggml_tensor * q_cur, + struct lm_ggml_tensor * kq_mask, + struct lm_ggml_tensor * kq_pos, + int64_t n_ctx, + int32_t n_tokens, + int32_t kv_head, + int32_t n_kv, + float kq_scale, + const llm_build_cb & cb, + int il) { + + // these nodes are added to the graph together so that they are not reordered + // by doing so, the number of splits in the graph is reduced + lm_ggml_build_forward_expand(graph, q_cur); + lm_ggml_build_forward_expand(graph, k_cur); + lm_ggml_build_forward_expand(graph, v_cur); + + llm_build_kv_store(ctx, hparams, kv, graph, k_cur, v_cur, n_ctx, n_tokens, kv_head, cb, il); + + struct lm_ggml_tensor * cur; + cur = llm_build_kqv(ctx, model, hparams, kv, graph, wo, wo_b, + q_cur, kq_mask, kq_pos, n_ctx, n_tokens, n_kv, kq_scale, cb, il); + cb(cur, "kqv_out", il); + + return cur; +} + struct llm_build_context { const llama_model & model; + const llama_context & lctx; const llama_hparams & hparams; const llama_cparams & cparams; const llama_batch & batch; @@ -4319,6 +5021,7 @@ struct llm_build_context { const int32_t n_orig_ctx; const bool do_rope_shift; + const uint32_t pooling_type; const llm_build_cb & cb; @@ -4333,6 +5036,7 @@ struct llm_build_context { const llm_build_cb & cb, bool worst_case) : model (lctx.model), + lctx (lctx), hparams (model.hparams), cparams (lctx.cparams), batch (batch), @@ -4361,6 +5065,7 @@ struct llm_build_context { kv_head (worst_case ? n_ctx - n_tokens : kv_self.head), n_orig_ctx (cparams.n_yarn_orig_ctx), do_rope_shift (worst_case || kv_self.has_shift), + pooling_type (cparams.do_pooling ? 
hparams.pooling_type : (uint32_t)LLAMA_POOLING_NONE), cb (cb), buf_compute_meta (lctx.buf_compute_meta) { // all initializations should be done in init() @@ -4393,20 +5098,20 @@ struct llm_build_context { struct lm_ggml_tensor * cur; struct lm_ggml_tensor * inpL; - inpL = llm_build_inp_embd(ctx0, hparams, batch, model.tok_embd, cb); + inpL = llm_build_inp_embd(ctx0, hparams, batch, model.tok_embd, lctx.inp_tokens, lctx.inp_embd, cb); cb(inpL, "inp_embd", -1); // inp_pos - contains the positions - struct lm_ggml_tensor * inp_pos = lm_ggml_new_tensor_1d(ctx0, LM_GGML_TYPE_I32, n_tokens); + struct lm_ggml_tensor * inp_pos = lm_ggml_view_1d(ctx0, lctx.inp_pos, n_tokens, 0); cb(inp_pos, "inp_pos", -1); // KQ_mask (mask for 1 head, it will be broadcasted to all heads) - struct lm_ggml_tensor * KQ_mask = lm_ggml_new_tensor_3d(ctx0, LM_GGML_TYPE_F32, n_kv, n_tokens, 1); + struct lm_ggml_tensor * KQ_mask = lm_ggml_view_2d(ctx0, lctx.inp_KQ_mask, n_kv, n_tokens, n_kv*lm_ggml_type_size(lctx.inp_KQ_mask->type), 0); cb(KQ_mask, "KQ_mask", -1); // shift the entire K-cache if needed if (do_rope_shift) { - llm_build_k_shift(ctx0, hparams, cparams, kv_self, gf, LLM_ROPE, n_ctx, freq_base, freq_scale, cb); + llm_build_k_shift(ctx0, hparams, cparams, kv_self, gf, lctx.inp_K_shift, LLM_ROPE, n_ctx, freq_base, freq_scale, cb); } for (int il = 0; il < n_layer; ++il) { @@ -4442,14 +5147,8 @@ struct llm_build_context { cb(Vcur, "Vcur", il); } - // these nodes are added to the graph together so that they are not reordered - // by doing so, the number of splits in the graph is reduced - lm_ggml_build_forward_expand(gf, Qcur); - lm_ggml_build_forward_expand(gf, Kcur); - lm_ggml_build_forward_expand(gf, Vcur); - Qcur = lm_ggml_rope_custom( - ctx0, lm_ggml_reshape_3d(ctx0, Qcur, n_embd_head, n_head, n_tokens), inp_pos, + ctx0, lm_ggml_reshape_3d(ctx0, Qcur, n_embd_head, n_head, n_tokens), inp_pos, hparams.n_rot, 0, 0, n_orig_ctx, freq_base, freq_scale, ext_factor, attn_factor, beta_fast, beta_slow ); @@ -4462,11 +5161,9 @@ struct llm_build_context { ); cb(Kcur, "Kcur", il); - llm_build_kv_store(ctx0, hparams, kv_self, gf, Kcur, Vcur, n_ctx, n_tokens, kv_head, cb, il); - - cur = llm_build_kqv(ctx0, model, hparams, kv_self, + cur = llm_build_kv(ctx0, model, hparams, kv_self, gf, model.layers[il].wo, model.layers[il].bo, - Qcur, KQ_mask, n_ctx, n_tokens, n_kv, -1.0f, 1.0f/sqrtf(float(n_embd_head)), cb, il); + Kcur, Vcur, Qcur, KQ_mask, nullptr, n_ctx, n_tokens, kv_head, n_kv, 1.0f/sqrtf(float(n_embd_head)), cb, il); cb(cur, "kqv_out", il); } @@ -4585,20 +5282,24 @@ struct llm_build_context { struct lm_ggml_tensor * cur; struct lm_ggml_tensor * inpL; - inpL = llm_build_inp_embd(ctx0, hparams, batch, model.tok_embd, cb); + inpL = llm_build_inp_embd(ctx0, hparams, batch, model.tok_embd, lctx.inp_tokens, lctx.inp_embd, cb); cb(inpL, "inp_embd", -1); // inp_pos - contains the positions - struct lm_ggml_tensor * inp_pos = lm_ggml_new_tensor_1d(ctx0, LM_GGML_TYPE_I32, n_tokens); + struct lm_ggml_tensor * inp_pos = lm_ggml_view_1d(ctx0, lctx.inp_pos, n_tokens, 0); cb(inp_pos, "inp_pos", -1); // KQ_mask (mask for 1 head, it will be broadcasted to all heads) - struct lm_ggml_tensor * KQ_mask = lm_ggml_new_tensor_3d(ctx0, LM_GGML_TYPE_F32, n_kv, n_tokens, 1); + struct lm_ggml_tensor * KQ_mask = lm_ggml_view_2d(ctx0, lctx.inp_KQ_mask, n_kv, n_tokens, n_kv*lm_ggml_type_size(lctx.inp_KQ_mask->type), 0); cb(KQ_mask, "KQ_mask", -1); + // positions of the tokens in the KV cache + struct lm_ggml_tensor * KQ_pos = lm_ggml_view_1d(ctx0, 
lctx.inp_KQ_pos, n_kv, 0); + cb(KQ_pos, "KQ_pos", -1); + // shift the entire K-cache if needed if (do_rope_shift) { - llm_build_k_shift(ctx0, hparams, cparams, kv_self, gf, LLM_ROPE, n_ctx, freq_base, freq_scale, cb); + llm_build_k_shift(ctx0, hparams, cparams, kv_self, gf, lctx.inp_K_shift, LLM_ROPE, n_ctx, freq_base, freq_scale, cb); } for (int il = 0; il < n_layer; ++il) { @@ -4643,14 +5344,10 @@ struct llm_build_context { cb(Qcur, "Qcur", il); cb(Kcur, "Kcur", il); - llm_build_kv_store(ctx0, hparams, kv_self, gf, Kcur, Vcur, n_ctx, n_tokens, kv_head, cb, il); - // apply ALiBi for 13B model - const float max_alibi_bias = model.type == MODEL_13B ? 8.0f : -1.0f; - - cur = llm_build_kqv(ctx0, model, hparams, kv_self, + cur = llm_build_kv(ctx0, model, hparams, kv_self, gf, model.layers[il].wo, NULL, - Qcur, KQ_mask, n_ctx, n_tokens, n_kv, max_alibi_bias, 1.0f/sqrtf(float(n_embd_head)), cb, il); + Kcur, Vcur, Qcur, KQ_mask, KQ_pos, n_ctx, n_tokens, kv_head, n_kv, 1.0f/sqrtf(float(n_embd_head)), cb, il); cb(cur, "kqv_out", il); } @@ -4707,20 +5404,20 @@ struct llm_build_context { struct lm_ggml_tensor * cur; struct lm_ggml_tensor * inpL; - inpL = llm_build_inp_embd(ctx0, hparams, batch, model.tok_embd, cb); + inpL = llm_build_inp_embd(ctx0, hparams, batch, model.tok_embd, lctx.inp_tokens, lctx.inp_embd, cb); cb(inpL, "inp_embd", -1); // inp_pos - contains the positions - struct lm_ggml_tensor * inp_pos = lm_ggml_new_tensor_1d(ctx0, LM_GGML_TYPE_I32, n_tokens); + struct lm_ggml_tensor * inp_pos = lm_ggml_view_1d(ctx0, lctx.inp_pos, n_tokens, 0); cb(inp_pos, "inp_pos", -1); // KQ_mask (mask for 1 head, it will be broadcasted to all heads) - struct lm_ggml_tensor * KQ_mask = lm_ggml_new_tensor_3d(ctx0, LM_GGML_TYPE_F32, n_kv, n_tokens, 1); + struct lm_ggml_tensor * KQ_mask = lm_ggml_view_2d(ctx0, lctx.inp_KQ_mask, n_kv, n_tokens, n_kv*lm_ggml_type_size(lctx.inp_KQ_mask->type), 0); cb(KQ_mask, "KQ_mask", -1); // shift the entire K-cache if needed if (do_rope_shift) { - llm_build_k_shift(ctx0, hparams, cparams, kv_self, gf, LLM_ROPE_NEOX, n_ctx, freq_base, freq_scale, cb); + llm_build_k_shift(ctx0, hparams, cparams, kv_self, gf, lctx.inp_K_shift, LLM_ROPE_NEOX, n_ctx, freq_base, freq_scale, cb); } for (int il = 0; il < n_layer; ++il) { @@ -4772,11 +5469,9 @@ struct llm_build_context { ); cb(Kcur, "Kcur", il); - llm_build_kv_store(ctx0, hparams, kv_self, gf, Kcur, Vcur, n_ctx, n_tokens, kv_head, cb, il); - - cur = llm_build_kqv(ctx0, model, hparams, kv_self, + cur = llm_build_kv(ctx0, model, hparams, kv_self, gf, model.layers[il].wo, NULL, - Qcur, KQ_mask, n_ctx, n_tokens, n_kv, -1.0f, 1.0f/sqrtf(float(n_embd_head)), cb, il); + Kcur, Vcur, Qcur, KQ_mask, nullptr, n_ctx, n_tokens, kv_head, n_kv, 1.0f/sqrtf(float(n_embd_head)), cb, il); cb(cur, "kqv_out", il); } @@ -4831,15 +5526,15 @@ struct llm_build_context { struct lm_ggml_tensor * pos; struct lm_ggml_tensor * inpL; - inpL = llm_build_inp_embd(ctx0, hparams, batch, model.tok_embd, cb); + inpL = llm_build_inp_embd(ctx0, hparams, batch, model.tok_embd, lctx.inp_tokens, lctx.inp_embd, cb); cb(inpL, "inp_embd", -1); // inp_pos - contains the positions - struct lm_ggml_tensor * inp_pos = lm_ggml_new_tensor_1d(ctx0, LM_GGML_TYPE_I32, n_tokens); + struct lm_ggml_tensor * inp_pos = lm_ggml_view_1d(ctx0, lctx.inp_pos, n_tokens, 0); cb(inp_pos, "inp_pos", -1); // KQ_mask (mask for 1 head, it will be broadcasted to all heads) - struct lm_ggml_tensor * KQ_mask = lm_ggml_new_tensor_3d(ctx0, LM_GGML_TYPE_F32, n_kv, n_tokens, 1); + struct lm_ggml_tensor * 
KQ_mask = lm_ggml_view_2d(ctx0, lctx.inp_KQ_mask, n_kv, n_tokens, n_kv*lm_ggml_type_size(lctx.inp_KQ_mask->type), 0); cb(KQ_mask, "KQ_mask", -1); pos = lm_ggml_get_rows(ctx0, model.pos_embd, inp_pos); @@ -4873,11 +5568,9 @@ struct llm_build_context { Qcur = lm_ggml_reshape_3d(ctx0, Qcur, n_embd_head, n_head, n_tokens); - llm_build_kv_store(ctx0, hparams, kv_self, gf, Kcur, Vcur, n_ctx, n_tokens, kv_head, cb, il); - - cur = llm_build_kqv(ctx0, model, hparams, kv_self, + cur = llm_build_kv(ctx0, model, hparams, kv_self, gf, model.layers[il].wo, model.layers[il].bo, - Qcur, KQ_mask, n_ctx, n_tokens, n_kv, -1.0f, 1.0f/sqrtf(float(n_embd_head)), cb, il); + Kcur, Vcur, Qcur, KQ_mask, nullptr, n_ctx, n_tokens, kv_head, n_kv, 1.0f/sqrtf(float(n_embd_head)), cb, il); cb(cur, "kqv_out", il); } @@ -4930,19 +5623,19 @@ struct llm_build_context { struct lm_ggml_tensor * cur; struct lm_ggml_tensor * inpL; - inpL = llm_build_inp_embd(ctx0, hparams, batch, model.tok_embd, cb); + inpL = llm_build_inp_embd(ctx0, hparams, batch, model.tok_embd, lctx.inp_tokens, lctx.inp_embd, cb); cb(inpL, "inp_embd", -1); // inp_pos - contains the positions - struct lm_ggml_tensor * inp_pos = lm_ggml_new_tensor_1d(ctx0, LM_GGML_TYPE_I32, n_tokens); + struct lm_ggml_tensor * inp_pos = lm_ggml_view_1d(ctx0, lctx.inp_pos, n_tokens, 0); cb(inp_pos, "inp_pos", -1); // KQ_mask (mask for 1 head, it will be broadcasted to all heads) - struct lm_ggml_tensor * KQ_mask = lm_ggml_new_tensor_3d(ctx0, LM_GGML_TYPE_F32, n_kv, n_tokens, 1); + struct lm_ggml_tensor * KQ_mask = lm_ggml_view_2d(ctx0, lctx.inp_KQ_mask, n_kv, n_tokens, n_kv*lm_ggml_type_size(lctx.inp_KQ_mask->type), 0); cb(KQ_mask, "KQ_mask", -1); if (do_rope_shift) { - llm_build_k_shift(ctx0, hparams, cparams, kv_self, gf, LLM_ROPE_NEOX, n_ctx, freq_base, freq_scale, cb); + llm_build_k_shift(ctx0, hparams, cparams, kv_self, gf, lctx.inp_K_shift, LLM_ROPE_NEOX, n_ctx, freq_base, freq_scale, cb); } for (int il = 0; il < n_layer; ++il) { @@ -5080,12 +5773,9 @@ struct llm_build_context { ); cb(Vcur, "Vcur", il); - llm_build_kv_store(ctx0, hparams, kv_self, gf, Kcur, Vcur, n_ctx, n_tokens, kv_head, cb, il); - - // TODO: not tested, could be broken - cur = llm_build_kqv(ctx0, model, hparams, kv_self, + cur = llm_build_kv(ctx0, model, hparams, kv_self, gf, model.layers[il].wo, model.layers[il].bo, - Q, KQ_mask, n_ctx, n_tokens, n_kv, -1.0f, 1.0f/sqrtf(float(n_embd_head)), cb, il); + Kcur, Vcur, Q, KQ_mask, nullptr, n_ctx, n_tokens, kv_head, n_kv, 1.0f/sqrtf(float(n_embd_head)), cb, il); cb(cur, "kqv_out", il); } @@ -5140,13 +5830,17 @@ struct llm_build_context { struct lm_ggml_tensor * cur; struct lm_ggml_tensor * inpL; - inpL = llm_build_inp_embd(ctx0, hparams, batch, model.tok_embd, cb); + inpL = llm_build_inp_embd(ctx0, hparams, batch, model.tok_embd, lctx.inp_tokens, lctx.inp_embd, cb); cb(inpL, "inp_embd", -1); // KQ_mask (mask for 1 head, it will be broadcasted to all heads) - struct lm_ggml_tensor * KQ_mask = lm_ggml_new_tensor_3d(ctx0, LM_GGML_TYPE_F32, n_kv, n_tokens, 1); + struct lm_ggml_tensor * KQ_mask = lm_ggml_view_2d(ctx0, lctx.inp_KQ_mask, n_kv, n_tokens, n_kv*lm_ggml_type_size(lctx.inp_KQ_mask->type), 0); cb(KQ_mask, "KQ_mask", -1); + // positions of the tokens in the KV cache + struct lm_ggml_tensor * KQ_pos = lm_ggml_view_1d(ctx0, lctx.inp_KQ_pos, n_kv, 0); + cb(KQ_pos, "KQ_pos", -1); + for (int il = 0; il < n_layer; ++il) { struct lm_ggml_tensor * inpSA = inpL; @@ -5172,11 +5866,9 @@ struct llm_build_context { Qcur = lm_ggml_reshape_3d(ctx0, Qcur, n_embd_head, 
n_head, n_tokens); cb(Qcur, "Qcur", il); - llm_build_kv_store(ctx0, hparams, kv_self, gf, Kcur, Vcur, n_ctx, n_tokens, kv_head, cb, il); - - cur = llm_build_kqv(ctx0, model, hparams, kv_self, + cur = llm_build_kv(ctx0, model, hparams, kv_self, gf, model.layers[il].wo, NULL, - Qcur, KQ_mask, n_ctx, n_tokens, n_kv, 8.0f, 1.0f/sqrtf(float(n_embd_head)), cb, il); + Kcur, Vcur, Qcur, KQ_mask, KQ_pos, n_ctx, n_tokens, kv_head, n_kv, 1.0f/sqrtf(float(n_embd_head)), cb, il); cb(cur, "kqv_out", il); } @@ -5222,7 +5914,7 @@ struct llm_build_context { return gf; } - struct lm_ggml_cgraph * build_bloom() { + struct lm_ggml_cgraph * build_bert() { struct lm_ggml_cgraph * gf = lm_ggml_new_graph_custom(ctx0, LLAMA_MAX_NODES, false); const int64_t n_embd_head = hparams.n_embd_head_v; @@ -5232,20 +5924,169 @@ struct llm_build_context { struct lm_ggml_tensor * cur; struct lm_ggml_tensor * inpL; - inpL = llm_build_inp_embd(ctx0, hparams, batch, model.tok_embd, cb); - cb(inpL, "inp_embd", -1); + // get input vectors with right size + const size_t stride1 = n_tokens * lm_ggml_type_size(lctx.inp_tokens->type); + struct lm_ggml_tensor * inp_pos = lm_ggml_view_1d(ctx0, lctx.inp_pos, n_tokens, 0); + struct lm_ggml_tensor * inp_mean = lm_ggml_view_2d(ctx0, lctx.inp_mean, n_tokens, n_tokens, stride1, 0); + struct lm_ggml_tensor * inp_cls = lm_ggml_view_1d(ctx0, lctx.inp_cls, n_tokens, 0); - // KQ_mask (mask for 1 head, it will be broadcasted to all heads) - struct lm_ggml_tensor * KQ_mask = lm_ggml_new_tensor_3d(ctx0, LM_GGML_TYPE_F32, n_kv, n_tokens, 1); - cb(KQ_mask, "KQ_mask", -1); + // construct input embeddings (token, type, position) + inpL = llm_build_inp_embd(ctx0, hparams, batch, model.tok_embd, lctx.inp_tokens, lctx.inp_embd, cb); - inpL = llm_build_norm(ctx0, inpL, hparams, - model.tok_norm, - model.tok_norm_b, - LLM_NORM, cb, -1); + // token types are hardcoded to zero ("Sentence A") + struct lm_ggml_tensor * type_row0 = lm_ggml_view_1d(ctx0, model.type_embd, n_embd, 0); + inpL = lm_ggml_add(ctx0, inpL, type_row0); + if (model.arch == LLM_ARCH_BERT) { + inpL = lm_ggml_add(ctx0, lm_ggml_get_rows(ctx0, model.pos_embd, inp_pos), inpL); + } + cb(inpL, "inp_embd", -1); + + // embed layer norm + inpL = llm_build_norm(ctx0, inpL, hparams, model.tok_norm, model.tok_norm_b, LLM_NORM, cb, -1); cb(inpL, "inp_norm", -1); - for (int il = 0; il < n_layer; ++il) { + // KQ_mask (mask for 1 head, it will be broadcasted to all heads) + struct lm_ggml_tensor * KQ_mask = lm_ggml_view_2d(ctx0, lctx.inp_KQ_mask, n_kv, n_tokens, n_kv*lm_ggml_type_size(lctx.inp_KQ_mask->type), 0); + cb(KQ_mask, "KQ_mask", -1); // [n_kv, n_tokens] + + // iterate layers + for (int il = 0; il < n_layer; ++il) { + struct lm_ggml_tensor * cur = inpL; + + // self-attention + if (model.arch == LLM_ARCH_BERT) { + struct lm_ggml_tensor * Qcur = lm_ggml_add(ctx0, lm_ggml_mul_mat(ctx0, model.layers[il].wq, cur), model.layers[il].bq); + cb(Qcur, "Qcur", il); + + struct lm_ggml_tensor * Kcur = lm_ggml_add(ctx0, lm_ggml_mul_mat(ctx0, model.layers[il].wk, cur), model.layers[il].bk); + cb(Kcur, "Kcur", il); + + struct lm_ggml_tensor * Vcur = lm_ggml_add(ctx0, lm_ggml_mul_mat(ctx0, model.layers[il].wv, cur), model.layers[il].bv); + cb(Vcur, "Vcur", il); + + // seems like we just need to do this for Q? 
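Editor's note (not part of the patch): the input assembly that `build_bert()` performs above is the standard BERT recipe: token embedding plus token-type embedding (row 0, "Sentence A") plus, for classic BERT, a learned position embedding, followed by a layer norm. A compilable toy version with plain floats in place of the ggml ops; the real path also applies `tok_norm`/`tok_norm_b` as scale and bias, and the epsilon below is illustrative:

```cpp
#include <cmath>
#include <cstdio>
#include <vector>

int main() {
    const int n_embd = 4;
    const float eps = 1e-12f;                              // illustrative epsilon
    std::vector<float> tok = {0.2f, -0.1f, 0.4f, 0.3f};    // tok_embd[token_id]
    std::vector<float> typ = {0.0f, 0.1f, 0.0f, 0.1f};     // type_embd row 0
    std::vector<float> pos = {0.05f, 0.05f, -0.05f, 0.0f}; // pos_embd[position]

    std::vector<float> x(n_embd);
    for (int i = 0; i < n_embd; ++i) x[i] = tok[i] + typ[i] + pos[i];

    // embed layer norm (no scale/bias shown)
    float mean = 0.0f, var = 0.0f;
    for (float v : x) mean += v;
    mean /= n_embd;
    for (float v : x) var += (v - mean) * (v - mean);
    var /= n_embd;
    for (int i = 0; i < n_embd; ++i) x[i] = (x[i] - mean) / std::sqrt(var + eps);

    for (float v : x) std::printf("%.3f ", v);
    std::printf("\n");
}
```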
+ Qcur = lm_ggml_reshape_3d(ctx0, Qcur, n_embd_head, n_head, n_tokens); + + cur = llm_build_kv(ctx0, model, hparams, kv_self, gf, + model.layers[il].wo, model.layers[il].bo, + Kcur, Vcur, Qcur, KQ_mask, nullptr, n_ctx, n_tokens, kv_head, n_kv, 1.0f/sqrtf(float(n_embd_head)), cb, il); + cb(cur, "kqv_out", il); + } else { + // compute Q and K and RoPE them + cur = lm_ggml_mul_mat(ctx0, model.layers[il].wqkv, cur); + cb(cur, "wqkv", il); + + struct lm_ggml_tensor * Qcur = lm_ggml_cont(ctx0, lm_ggml_view_2d(ctx0, cur, n_embd, n_tokens, cur->nb[1], 0*sizeof(float)*(n_embd))); + struct lm_ggml_tensor * Kcur = lm_ggml_cont(ctx0, lm_ggml_view_2d(ctx0, cur, n_embd_gqa, n_tokens, cur->nb[1], 1*sizeof(float)*(n_embd))); + struct lm_ggml_tensor * Vcur = lm_ggml_cont(ctx0, lm_ggml_view_2d(ctx0, cur, n_embd_gqa, n_tokens, cur->nb[1], 1*sizeof(float)*(n_embd + n_embd_gqa))); + + cb(Qcur, "Qcur", il); + cb(Kcur, "Kcur", il); + cb(Vcur, "Vcur", il); + + Qcur = lm_ggml_rope_custom( + ctx0, lm_ggml_reshape_3d(ctx0, Qcur, n_embd_head, n_head, n_tokens), inp_pos, + hparams.n_rot, 2, 0, n_orig_ctx, freq_base, freq_scale, + ext_factor, attn_factor, beta_fast, beta_slow + ); + cb(Qcur, "Qcur", il); + + Kcur = lm_ggml_rope_custom( + ctx0, lm_ggml_reshape_3d(ctx0, Kcur, n_embd_head, n_head_kv, n_tokens), inp_pos, + hparams.n_rot, 2, 0, n_orig_ctx, freq_base, freq_scale, + ext_factor, attn_factor, beta_fast, beta_slow + ); + cb(Kcur, "Kcur", il); + + cur = llm_build_kv(ctx0, model, hparams, kv_self, gf, + model.layers[il].wo, model.layers[il].bo, + Kcur, Vcur, Qcur, KQ_mask, nullptr, n_ctx, n_tokens, kv_head, n_kv, 1.0f/sqrtf(float(n_embd_head)), cb, il); + cb(cur, "kqv_out", il); + } + + // re-add the layer input + cur = lm_ggml_add(ctx0, cur, inpL); + + // attention layer norm + cur = llm_build_norm(ctx0, cur, hparams, model.layers[il].attn_out_norm, model.layers[il].attn_out_norm_b, LLM_NORM, cb, il); + + struct lm_ggml_tensor * ffn_inp = cur; + cb(ffn_inp, "ffn_inp", il); + + // feed-forward network + if (model.arch == LLM_ARCH_BERT) { + cur = llm_build_ffn(ctx0, cur, + model.layers[il].ffn_up, model.layers[il].ffn_up_b, + NULL, NULL, + model.layers[il].ffn_down, model.layers[il].ffn_down_b, + NULL, + LLM_FFN_GELU, LLM_FFN_SEQ, cb, il); + } else { + cur = llm_build_ffn(ctx0, cur, + model.layers[il].ffn_up, NULL, + model.layers[il].ffn_gate, NULL, + model.layers[il].ffn_down, NULL, + NULL, + LLM_FFN_SILU, LLM_FFN_PAR, cb, il); + } + cb(cur, "ffn_out", il); + + // attentions bypass the intermediate layer + cur = lm_ggml_add(ctx0, cur, ffn_inp); + + // output layer norm + cur = llm_build_norm(ctx0, cur, hparams, model.layers[il].layer_out_norm, model.layers[il].layer_out_norm_b, LLM_NORM, cb, il); + + // input for next layer + inpL = cur; + } + + // final output + cur = inpL; + + // pooling layer + if (pooling_type == LLAMA_POOLING_MEAN) { + cur = lm_ggml_mul_mat(ctx0, lm_ggml_cont(ctx0, lm_ggml_transpose(ctx0, cur)), inp_mean); + } else if (pooling_type == LLAMA_POOLING_CLS) { + cur = lm_ggml_get_rows(ctx0, cur, inp_cls); + } else { + LM_GGML_ASSERT(pooling_type == LLAMA_POOLING_NONE && "Invalid pooling type"); + } + cb(cur, "result_embd", -1); + + lm_ggml_build_forward_expand(gf, cur); + + return gf; + } + + struct lm_ggml_cgraph * build_bloom() { + struct lm_ggml_cgraph * gf = lm_ggml_new_graph_custom(ctx0, LLAMA_MAX_NODES, false); + + const int64_t n_embd_head = hparams.n_embd_head_v; + const int64_t n_embd_gqa = hparams.n_embd_v_gqa(); + LM_GGML_ASSERT(n_embd_head == hparams.n_embd_head_k); + + struct 
lm_ggml_tensor * cur; + struct lm_ggml_tensor * inpL; + + inpL = llm_build_inp_embd(ctx0, hparams, batch, model.tok_embd, lctx.inp_tokens, lctx.inp_embd, cb); + cb(inpL, "inp_embd", -1); + + // KQ_mask (mask for 1 head, it will be broadcasted to all heads) + struct lm_ggml_tensor * KQ_mask = lm_ggml_view_2d(ctx0, lctx.inp_KQ_mask, n_kv, n_tokens, n_kv*lm_ggml_type_size(lctx.inp_KQ_mask->type), 0); + cb(KQ_mask, "KQ_mask", -1); + + // positions of the tokens in the KV cache + struct lm_ggml_tensor * KQ_pos = lm_ggml_view_1d(ctx0, lctx.inp_KQ_pos, n_kv, 0); + cb(KQ_pos, "KQ_pos", -1); + + inpL = llm_build_norm(ctx0, inpL, hparams, + model.tok_norm, + model.tok_norm_b, + LLM_NORM, cb, -1); + cb(inpL, "inp_norm", -1); + + for (int il = 0; il < n_layer; ++il) { cur = llm_build_norm(ctx0, inpL, hparams, model.layers[il].attn_norm, model.layers[il].attn_norm_b, @@ -5270,11 +6111,9 @@ struct llm_build_context { Qcur = lm_ggml_reshape_3d(ctx0, Qcur, n_embd_head, n_head, n_tokens); - llm_build_kv_store(ctx0, hparams, kv_self, gf, Kcur, Vcur, n_ctx, n_tokens, kv_head, cb, il); - - cur = llm_build_kqv(ctx0, model, hparams, kv_self, + cur = llm_build_kv(ctx0, model, hparams, kv_self, gf, model.layers[il].wo, model.layers[il].bo, - Qcur, KQ_mask, n_ctx, n_tokens, n_kv, 8.0f, 1.0f/sqrtf(float(n_embd_head)), cb, il); + Kcur, Vcur, Qcur, KQ_mask, KQ_pos, n_ctx, n_tokens, kv_head, n_kv, 1.0f/sqrtf(float(n_embd_head)), cb, il); cb(cur, "kqv_out", il); } @@ -5327,13 +6166,17 @@ struct llm_build_context { struct lm_ggml_tensor * cur; struct lm_ggml_tensor * inpL; - inpL = llm_build_inp_embd(ctx0, hparams, batch, model.tok_embd, cb); + inpL = llm_build_inp_embd(ctx0, hparams, batch, model.tok_embd, lctx.inp_tokens, lctx.inp_embd, cb); cb(inpL, "inp_embd", -1); // KQ_mask (mask for 1 head, it will be broadcasted to all heads) - struct lm_ggml_tensor * KQ_mask = lm_ggml_new_tensor_3d(ctx0, LM_GGML_TYPE_F32, n_kv, n_tokens, 1); + struct lm_ggml_tensor * KQ_mask = lm_ggml_view_2d(ctx0, lctx.inp_KQ_mask, n_kv, n_tokens, n_kv*lm_ggml_type_size(lctx.inp_KQ_mask->type), 0); cb(KQ_mask, "KQ_mask", -1); + // positions of the tokens in the KV cache + struct lm_ggml_tensor * KQ_pos = lm_ggml_view_1d(ctx0, lctx.inp_KQ_pos, n_kv, 0); + cb(KQ_pos, "KQ_pos", -1); + for (int il = 0; il < n_layer; ++il) { struct lm_ggml_tensor * attn_norm; @@ -5365,11 +6208,9 @@ struct llm_build_context { Qcur = lm_ggml_reshape_3d(ctx0, Qcur, n_embd_head, n_head, n_tokens); - llm_build_kv_store(ctx0, hparams, kv_self, gf, Kcur, Vcur, n_ctx, n_tokens, kv_head, cb, il); - - cur = llm_build_kqv(ctx0, model, hparams, kv_self, + cur = llm_build_kv(ctx0, model, hparams, kv_self, gf, model.layers[il].wo, NULL, - Qcur, KQ_mask, n_ctx, n_tokens, n_kv, hparams.f_max_alibi_bias, 1.0f/sqrtf(float(n_embd_head)), cb, il); + Kcur, Vcur, Qcur, KQ_mask, KQ_pos, n_ctx, n_tokens, kv_head, n_kv, 1.0f/sqrtf(float(n_embd_head)), cb, il); cb(cur, "kqv_out", il); } @@ -5425,20 +6266,20 @@ struct llm_build_context { struct lm_ggml_tensor * cur; struct lm_ggml_tensor * inpL; - inpL = llm_build_inp_embd(ctx0, hparams, batch, model.tok_embd, cb); + inpL = llm_build_inp_embd(ctx0, hparams, batch, model.tok_embd, lctx.inp_tokens, lctx.inp_embd, cb); cb(inpL, "inp_embd", -1); // inp_pos - contains the positions - struct lm_ggml_tensor * inp_pos = lm_ggml_new_tensor_1d(ctx0, LM_GGML_TYPE_I32, n_tokens); + struct lm_ggml_tensor * inp_pos = lm_ggml_view_1d(ctx0, lctx.inp_pos, n_tokens, 0); cb(inp_pos, "inp_pos", -1); // KQ_mask (mask for 1 head, it will be broadcasted to all 
heads) - struct lm_ggml_tensor * KQ_mask = lm_ggml_new_tensor_3d(ctx0, LM_GGML_TYPE_F32, n_kv, n_tokens, 1); + struct lm_ggml_tensor * KQ_mask = lm_ggml_view_2d(ctx0, lctx.inp_KQ_mask, n_kv, n_tokens, n_kv*lm_ggml_type_size(lctx.inp_KQ_mask->type), 0); cb(KQ_mask, "KQ_mask", -1); // shift the entire K-cache if needed if (do_rope_shift) { - llm_build_k_shift(ctx0, hparams, cparams, kv_self, gf, LLM_ROPE_NEOX, n_ctx, freq_base, freq_scale, cb); + llm_build_k_shift(ctx0, hparams, cparams, kv_self, gf, lctx.inp_K_shift, LLM_ROPE_NEOX, n_ctx, freq_base, freq_scale, cb); } for (int il = 0; il < n_layer; ++il) { @@ -5456,12 +6297,24 @@ struct llm_build_context { // compute Q and K and RoPE them struct lm_ggml_tensor * Qcur = lm_ggml_mul_mat(ctx0, model.layers[il].wq, cur); cb(Qcur, "Qcur", il); + if (model.layers[il].bq) { + Qcur = lm_ggml_add(ctx0, Qcur, model.layers[il].bq); + cb(Qcur, "Qcur", il); + } struct lm_ggml_tensor * Kcur = lm_ggml_mul_mat(ctx0, model.layers[il].wk, cur); cb(Kcur, "Kcur", il); + if (model.layers[il].bk) { + Kcur = lm_ggml_add(ctx0, Kcur, model.layers[il].bk); + cb(Kcur, "Kcur", il); + } struct lm_ggml_tensor * Vcur = lm_ggml_mul_mat(ctx0, model.layers[il].wv, cur); cb(Vcur, "Vcur", il); + if (model.layers[il].bv) { + Vcur = lm_ggml_add(ctx0, Vcur, model.layers[il].bv); + cb(Vcur, "Vcur", il); + } Qcur = lm_ggml_rope_custom( ctx0, lm_ggml_reshape_3d(ctx0, Qcur, n_embd_head, n_head, n_tokens), inp_pos, @@ -5477,11 +6330,9 @@ struct llm_build_context { ); cb(Kcur, "Kcur", il); - llm_build_kv_store(ctx0, hparams, kv_self, gf, Kcur, Vcur, n_ctx, n_tokens, kv_head, cb, il); - - cur = llm_build_kqv(ctx0, model, hparams, kv_self, + cur = llm_build_kv(ctx0, model, hparams, kv_self, gf, model.layers[il].wo, NULL, - Qcur, KQ_mask, n_ctx, n_tokens, n_kv, -1.0f, 1.0f/sqrtf(float(n_embd_head)), cb, il); + Kcur, Vcur, Qcur, KQ_mask, nullptr, n_ctx, n_tokens, kv_head, n_kv, 1.0f/sqrtf(float(n_embd_head)), cb, il); cb(cur, "kqv_out", il); } @@ -5538,20 +6389,20 @@ struct llm_build_context { struct lm_ggml_tensor * cur; struct lm_ggml_tensor * inpL; - inpL = llm_build_inp_embd(ctx0, hparams, batch, model.tok_embd, cb); + inpL = llm_build_inp_embd(ctx0, hparams, batch, model.tok_embd, lctx.inp_tokens, lctx.inp_embd, cb); cb(inpL, "inp_embd", -1); // inp_pos - contains the positions - struct lm_ggml_tensor * inp_pos = lm_ggml_new_tensor_1d(ctx0, LM_GGML_TYPE_I32, n_tokens); + struct lm_ggml_tensor * inp_pos = lm_ggml_view_1d(ctx0, lctx.inp_pos, n_tokens, 0); cb(inp_pos, "inp_pos", -1); // KQ_mask (mask for 1 head, it will be broadcasted to all heads) - struct lm_ggml_tensor * KQ_mask = lm_ggml_new_tensor_3d(ctx0, LM_GGML_TYPE_F32, n_kv, n_tokens, 1); + struct lm_ggml_tensor * KQ_mask = lm_ggml_view_2d(ctx0, lctx.inp_KQ_mask, n_kv, n_tokens, n_kv*lm_ggml_type_size(lctx.inp_KQ_mask->type), 0); cb(KQ_mask, "KQ_mask", -1); // shift the entire K-cache if needed if (do_rope_shift) { - llm_build_k_shift(ctx0, hparams, cparams, kv_self, gf, LLM_ROPE_NEOX, n_ctx, freq_base, freq_scale, cb); + llm_build_k_shift(ctx0, hparams, cparams, kv_self, gf, lctx.inp_K_shift, LLM_ROPE_NEOX, n_ctx, freq_base, freq_scale, cb); } for (int il = 0; il < n_layer; ++il) { @@ -5594,11 +6445,9 @@ struct llm_build_context { ); cb(Kcur, "Kcur", il); - llm_build_kv_store(ctx0, hparams, kv_self, gf, Kcur, Vcur, n_ctx, n_tokens, kv_head, cb, il); - - cur = llm_build_kqv(ctx0, model, hparams, kv_self, + cur = llm_build_kv(ctx0, model, hparams, kv_self, gf, model.layers[il].wo, NULL, - Qcur, KQ_mask, n_ctx, n_tokens, n_kv, 
-1.0f, 1.0f/sqrtf(float(n_embd_head)), cb, il); + Kcur, Vcur, Qcur, KQ_mask, nullptr, n_ctx, n_tokens, kv_head, n_kv, 1.0f/sqrtf(float(n_embd_head)), cb, il); cb(cur, "kqv_out", il); } @@ -5643,6 +6492,126 @@ struct llm_build_context { return gf; } + + struct lm_ggml_cgraph * build_qwen2() { + struct lm_ggml_cgraph * gf = lm_ggml_new_graph_custom(ctx0, LLAMA_MAX_NODES, false); + + const int64_t n_embd_head = hparams.n_embd_head_v; + LM_GGML_ASSERT(n_embd_head == hparams.n_embd_head_k); + LM_GGML_ASSERT(n_embd_head == hparams.n_rot); + + struct lm_ggml_tensor * cur; + struct lm_ggml_tensor * inpL; + + inpL = llm_build_inp_embd(ctx0, hparams, batch, model.tok_embd, lctx.inp_tokens, lctx.inp_embd, cb); + cb(inpL, "inp_embd", -1); + + // inp_pos - contains the positions + struct lm_ggml_tensor * inp_pos = lm_ggml_view_1d(ctx0, lctx.inp_pos, n_tokens, 0); + cb(inp_pos, "inp_pos", -1); + + // KQ_mask (mask for 1 head, it will be broadcasted to all heads) + struct lm_ggml_tensor * KQ_mask = lm_ggml_view_2d(ctx0, lctx.inp_KQ_mask, n_kv, n_tokens, n_kv*lm_ggml_type_size(lctx.inp_KQ_mask->type), 0); + cb(KQ_mask, "KQ_mask", -1); + + // shift the entire K-cache if needed + if (do_rope_shift) { + llm_build_k_shift(ctx0, hparams, cparams, kv_self, gf, lctx.inp_K_shift, LLM_ROPE_NEOX, n_ctx, freq_base, freq_scale, cb); + } + + for (int il = 0; il < n_layer; ++il) { + struct lm_ggml_tensor * inpSA = inpL; + + // norm + cur = llm_build_norm(ctx0, inpL, hparams, + model.layers[il].attn_norm, NULL, + LLM_NORM_RMS, cb, il); + cb(cur, "attn_norm", il); + + // self-attention + { + // compute Q and K and RoPE them + struct lm_ggml_tensor * Qcur = lm_ggml_mul_mat(ctx0, model.layers[il].wq, cur); + cb(Qcur, "Qcur", il); + Qcur = lm_ggml_add(ctx0, Qcur, model.layers[il].bq); + cb(Qcur, "Qcur", il); + + struct lm_ggml_tensor * Kcur = lm_ggml_mul_mat(ctx0, model.layers[il].wk, cur); + cb(Kcur, "Kcur", il); + Kcur = lm_ggml_add(ctx0, Kcur, model.layers[il].bk); + cb(Kcur, "Kcur", il); + + struct lm_ggml_tensor * Vcur = lm_ggml_mul_mat(ctx0, model.layers[il].wv, cur); + cb(Vcur, "Vcur", il); + Vcur = lm_ggml_add(ctx0, Vcur, model.layers[il].bv); + cb(Vcur, "Vcur", il); + + // these nodes are added to the graph together so that they are not reordered + // by doing so, the number of splits in the graph is reduced + lm_ggml_build_forward_expand(gf, Qcur); + lm_ggml_build_forward_expand(gf, Kcur); + lm_ggml_build_forward_expand(gf, Vcur); + + Qcur = lm_ggml_rope_custom( + ctx0, lm_ggml_reshape_3d(ctx0, Qcur, n_embd_head, n_head, n_tokens), inp_pos, + hparams.n_rot, 2, 0, n_orig_ctx, freq_base, freq_scale, + ext_factor, attn_factor, beta_fast, beta_slow + ); + cb(Qcur, "Qcur", il); + + Kcur = lm_ggml_rope_custom( + ctx0, lm_ggml_reshape_3d(ctx0, Kcur, n_embd_head, n_head_kv, n_tokens), inp_pos, + hparams.n_rot, 2, 0, n_orig_ctx, freq_base, freq_scale, + ext_factor, attn_factor, beta_fast, beta_slow + ); + cb(Kcur, "Kcur", il); + + cur = llm_build_kv(ctx0, model, hparams, kv_self, gf, + model.layers[il].wo, model.layers[il].bo, + Kcur, Vcur, Qcur, KQ_mask, nullptr, n_ctx, n_tokens, kv_head, n_kv, 1.0f/sqrtf(float(n_embd_head)), cb, il); + cb(cur, "kqv_out", il); + } + + struct lm_ggml_tensor * ffn_inp = lm_ggml_add(ctx0, cur, inpSA); + cb(ffn_inp, "ffn_inp", il); + + // feed-forward network + cur = llm_build_norm(ctx0, ffn_inp, hparams, + model.layers[il].ffn_norm, NULL, + LLM_NORM_RMS, cb, il); + cb(cur, "ffn_norm", il); + + cur = llm_build_ffn(ctx0, cur, + model.layers[il].ffn_up, NULL, + model.layers[il].ffn_gate, NULL, 
+ model.layers[il].ffn_down, NULL, + NULL, + LLM_FFN_SILU, LLM_FFN_PAR, cb, il); + cb(cur, "ffn_out", il); + + cur = lm_ggml_add(ctx0, cur, ffn_inp); + cb(cur, "l_out", il); + + // input for next layer + inpL = cur; + } + + cur = inpL; + + cur = llm_build_norm(ctx0, cur, hparams, + model.output_norm, NULL, + LLM_NORM_RMS, cb, -1); + cb(cur, "result_norm", -1); + + // lm_head + cur = lm_ggml_mul_mat(ctx0, model.output, cur); + cb(cur, "result_output", -1); + + lm_ggml_build_forward_expand(gf, cur); + + return gf; + } + struct lm_ggml_cgraph * build_phi2() { struct lm_ggml_cgraph * gf = lm_ggml_new_graph_custom(ctx0, LLAMA_MAX_NODES, false); @@ -5655,20 +6624,20 @@ struct llm_build_context { struct lm_ggml_tensor * ffn_output; struct lm_ggml_tensor * inpL; - inpL = llm_build_inp_embd(ctx0, hparams, batch, model.tok_embd, cb); + inpL = llm_build_inp_embd(ctx0, hparams, batch, model.tok_embd, lctx.inp_tokens, lctx.inp_embd, cb); cb(inpL, "inp_embd", -1); // inp_pos - contains the positions - struct lm_ggml_tensor * inp_pos = lm_ggml_new_tensor_1d(ctx0, LM_GGML_TYPE_I32, n_tokens); + struct lm_ggml_tensor * inp_pos = lm_ggml_view_1d(ctx0, lctx.inp_pos, n_tokens, 0); cb(inp_pos, "inp_pos", -1); // KQ_mask (mask for 1 head, it will be broadcasted to all heads) - struct lm_ggml_tensor * KQ_mask = lm_ggml_new_tensor_3d(ctx0, LM_GGML_TYPE_F32, n_kv, n_tokens, 1); + struct lm_ggml_tensor * KQ_mask = lm_ggml_view_2d(ctx0, lctx.inp_KQ_mask, n_kv, n_tokens, n_kv*lm_ggml_type_size(lctx.inp_KQ_mask->type), 0); cb(KQ_mask, "KQ_mask", -1); // shift the entire K-cache if needed if (do_rope_shift) { - llm_build_k_shift(ctx0, hparams, cparams, kv_self, gf, LLM_ROPE_NEOX, n_ctx, freq_base, freq_scale, cb); + llm_build_k_shift(ctx0, hparams, cparams, kv_self, gf, lctx.inp_K_shift, LLM_ROPE_NEOX, n_ctx, freq_base, freq_scale, cb); } for (int il = 0; il < n_layer; ++il) { @@ -5724,11 +6693,9 @@ struct llm_build_context { ); cb(Kcur, "Kcur", il); - llm_build_kv_store(ctx0, hparams, kv_self, gf, Kcur, Vcur, n_ctx, n_tokens, kv_head, cb, il); - - cur = llm_build_kqv(ctx0, model, hparams, kv_self, + cur = llm_build_kv(ctx0, model, hparams, kv_self, gf, model.layers[il].wo, model.layers[il].bo, - Qcur, KQ_mask, n_ctx, n_tokens, n_kv, -1.0f, 1.0f, cb, il); + Kcur, Vcur, Qcur, KQ_mask, nullptr, n_ctx, n_tokens, kv_head, n_kv, 1.0f, cb, il); cb(cur, "kqv_out", il); } @@ -5779,20 +6746,20 @@ struct llm_build_context { struct lm_ggml_tensor * cur; struct lm_ggml_tensor * inpL; - inpL = llm_build_inp_embd(ctx0, hparams, batch, model.tok_embd, cb); + inpL = llm_build_inp_embd(ctx0, hparams, batch, model.tok_embd, lctx.inp_tokens, lctx.inp_embd, cb); cb(inpL, "inp_embd", -1); // inp_pos - contains the positions - struct lm_ggml_tensor * inp_pos = lm_ggml_new_tensor_1d(ctx0, LM_GGML_TYPE_I32, n_tokens); + struct lm_ggml_tensor * inp_pos = lm_ggml_view_1d(ctx0, lctx.inp_pos, n_tokens, 0); cb(inp_pos, "inp_pos", -1); // KQ_mask (mask for 1 head, it will be broadcasted to all heads) - struct lm_ggml_tensor * KQ_mask = lm_ggml_new_tensor_3d(ctx0, LM_GGML_TYPE_F32, n_kv, n_tokens, 1); + struct lm_ggml_tensor * KQ_mask = lm_ggml_view_2d(ctx0, lctx.inp_KQ_mask, n_kv, n_tokens, n_kv*lm_ggml_type_size(lctx.inp_KQ_mask->type), 0); cb(KQ_mask, "KQ_mask", -1); // shift the entire K-cache if needed if (do_rope_shift) { - llm_build_k_shift(ctx0, hparams, cparams, kv_self, gf, LLM_ROPE, n_ctx, freq_base, freq_scale, cb); + llm_build_k_shift(ctx0, hparams, cparams, kv_self, gf, lctx.inp_K_shift, LLM_ROPE, n_ctx, freq_base, freq_scale, cb); } 
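Editor's note: the hunks above repeatedly swap per-graph lm_ggml_new_tensor_* allocations for lm_ggml_view_* over tensors that now live on the context (lctx.inp_pos, lctx.inp_KQ_mask, lctx.inp_K_shift) and are filled once per decode by llama_set_inputs further below. As a rough plain-array sketch of what the lm_ggml_view_2d over lctx.inp_KQ_mask addresses (toy dimensions, not the real ggml API): the row stride nb1 = n_kv*lm_ggml_type_size(F32) makes element (i, j) of the n_kv x n_tokens mask the flat entry j*n_kv + i.

    #include <cmath>
    #include <cstdio>
    #include <vector>

    int main() {
        // toy dimensions; in the patch these come from kv_self.n and batch.n_tokens
        const int n_kv     = 8;
        const int n_tokens = 3;

        // stand-in for the preallocated F32 buffer behind lctx.inp_KQ_mask
        std::vector<float> inp_KQ_mask((size_t) n_kv * n_tokens, 0.0f);

        // with row stride nb1 = n_kv * sizeof(float), element (i, j) is j*n_kv + i
        auto mask_at = [&](int i, int j) -> float & {
            return inp_KQ_mask[(size_t) j * n_kv + i];
        };

        // example: KV cell 5 must not be visible to batch token 1 -> additive -inf
        mask_at(5, 1) = -INFINITY;

        printf("mask(5, 1) = %f\n", mask_at(5, 1));
        return 0;
    }

The same 0 / -INFINITY additive-mask convention is what llama_set_inputs writes into this buffer later in the patch.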
for (int il = 0; il < n_layer; ++il) { @@ -5829,11 +6796,9 @@ struct llm_build_context { ext_factor, attn_factor, beta_fast, beta_slow); cb(Kcur, "Kcur", il); - llm_build_kv_store(ctx0, hparams, kv_self, gf, Kcur, Vcur, n_ctx, n_tokens, kv_head, cb, il); - - cur = llm_build_kqv(ctx0, model, hparams, kv_self, + cur = llm_build_kv(ctx0, model, hparams, kv_self, gf, model.layers[il].wo, NULL, - Qcur, KQ_mask, n_ctx, n_tokens, n_kv, -1.0f, 1.0f/sqrtf(float(n_embd_head)), cb, il); + Kcur, Vcur, Qcur, KQ_mask, nullptr, n_ctx, n_tokens, kv_head, n_kv, 1.0f/sqrtf(float(n_embd_head)), cb, il); cb(cur, "kqv_out", il); } struct lm_ggml_tensor * sa_out = cur; @@ -5888,15 +6853,15 @@ struct llm_build_context { struct lm_ggml_tensor * pos; struct lm_ggml_tensor * inpL; - inpL = llm_build_inp_embd(ctx0, hparams, batch, model.tok_embd, cb); + inpL = llm_build_inp_embd(ctx0, hparams, batch, model.tok_embd, lctx.inp_tokens, lctx.inp_embd, cb); cb(inpL, "inp_embd", -1); // inp_pos - contains the positions - struct lm_ggml_tensor * inp_pos = lm_ggml_new_tensor_1d(ctx0, LM_GGML_TYPE_I32, n_tokens); + struct lm_ggml_tensor * inp_pos = lm_ggml_view_1d(ctx0, lctx.inp_pos, n_tokens, 0); cb(inp_pos, "inp_pos", -1); // KQ_mask (mask for 1 head, it will be broadcasted to all heads) - struct lm_ggml_tensor * KQ_mask = lm_ggml_new_tensor_3d(ctx0, LM_GGML_TYPE_F32, n_kv, n_tokens, 1); + struct lm_ggml_tensor * KQ_mask = lm_ggml_view_2d(ctx0, lctx.inp_KQ_mask, n_kv, n_tokens, n_kv*lm_ggml_type_size(lctx.inp_KQ_mask->type), 0); cb(KQ_mask, "KQ_mask", -1); pos = lm_ggml_get_rows(ctx0, model.pos_embd, inp_pos); @@ -5930,11 +6895,9 @@ struct llm_build_context { Qcur = lm_ggml_reshape_3d(ctx0, Qcur, n_embd_head, n_head, n_tokens); - llm_build_kv_store(ctx0, hparams, kv_self, gf, Kcur, Vcur, n_ctx, n_tokens, kv_head, cb, il); - - cur = llm_build_kqv(ctx0, model, hparams, kv_self, + cur = llm_build_kv(ctx0, model, hparams, kv_self, gf, model.layers[il].wo, model.layers[il].bo, - Qcur, KQ_mask, n_ctx, n_tokens, n_kv, -1.0f, 1.0f/sqrtf(float(n_embd_head)), cb, il); + Kcur, Vcur, Qcur, KQ_mask, nullptr, n_ctx, n_tokens, kv_head, n_kv, 1.0f/sqrtf(float(n_embd_head)), cb, il); cb(cur, "kqv_out", il); } @@ -5976,138 +6939,629 @@ struct llm_build_context { return gf; } -}; - -static struct lm_ggml_cgraph * llama_build_graph( - llama_context & lctx, - const llama_batch & batch) { - const auto & model = lctx.model; - // check if we should build the worst-case graph (for memory measurement) - const bool worst_case = lm_ggml_tallocr_is_measure(lctx.alloc); + struct lm_ggml_cgraph * build_codeshell() { + struct lm_ggml_cgraph * gf = lm_ggml_new_graph_custom(ctx0, LLAMA_MAX_NODES, false); - // keep track of the input that has already been allocated - bool alloc_inp_tokens = false; - bool alloc_inp_embd = false; - bool alloc_inp_pos = false; - bool alloc_inp_KQ_mask = false; - bool alloc_inp_K_shift = false; + const int64_t n_embd_head = hparams.n_embd_head_v; + const int64_t n_embd_gqa = hparams.n_embd_v_gqa(); + LM_GGML_ASSERT(n_embd_head == hparams.n_embd_head_k); + LM_GGML_ASSERT(n_embd_head == hparams.n_rot); - // this callback allows us to apply custom logic to each tensor (e.g. ggml-alloc, offloading, etc.) 
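Editor's note: the per-tensor callback (llm_build_cb) survives this refactor but shrinks in scope: input allocation and data upload move out of it into llama_set_inputs, leaving it responsible for naming nodes and, in the new llama_build_graph below, for pinning the kqv_merged_cont node to the CPU backend when offload_kqv is disabled. A minimal standalone sketch of the naming part, with a toy tensor type standing in for lm_ggml_tensor:

    #include <cstdio>
    #include <functional>
    #include <string>

    // toy tensor type; only the name matters for this sketch
    struct toy_tensor {
        std::string name;
    };

    // same shape as llm_build_cb: tensor, base name, layer index (-1 for globals)
    using build_cb = std::function<void(toy_tensor * cur, const char * name, int il)>;

    int main() {
        build_cb cb = [](toy_tensor * cur, const char * name, int il) {
            if (il >= 0) {
                // per-layer tensors get a "-<layer>" suffix (lm_ggml_format_name)
                cur->name = std::string(name) + "-" + std::to_string(il);
            } else {
                // graph-wide tensors keep the bare name (lm_ggml_set_name)
                cur->name = name;
            }
        };

        toy_tensor q, out;
        cb(&q,   "Qcur",          3);   // -> "Qcur-3"
        cb(&out, "result_output", -1);  // -> "result_output"

        printf("%s / %s\n", q.name.c_str(), out.name.c_str());
        return 0;
    }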
- // TODO: improve handling of input and output tensors, then replace this with lm_ggml_set_name - llm_build_cb cb = [&](struct lm_ggml_tensor * cur, const char * name, int il) { - if (il >= 0) { - lm_ggml_format_name(cur, "%s-%d", name, il); - } else { - lm_ggml_set_name(cur, name); - } + struct lm_ggml_tensor * cur; + struct lm_ggml_tensor * inpL; - // - // allocate input tensors and set input data - // + inpL = llm_build_inp_embd(ctx0, hparams, batch, model.tok_embd, lctx.inp_tokens, lctx.inp_embd, cb); + cb(inpL, "inp_embd", -1); - if (!alloc_inp_tokens && strcmp(name, "inp_tokens") == 0) { - lm_ggml_tallocr_alloc(lctx.alloc, cur); + // inp_pos - contains the positions + struct lm_ggml_tensor * inp_pos = lm_ggml_view_1d(ctx0, lctx.inp_pos, n_tokens, 0); + cb(inp_pos, "inp_pos", -1); - if (!lm_ggml_tallocr_is_measure(lctx.alloc) && batch.token) { - const int64_t n_tokens = cur->ne[0]; + // KQ_mask (mask for 1 head, it will be broadcasted to all heads) + struct lm_ggml_tensor * KQ_mask = lm_ggml_view_2d(ctx0, lctx.inp_KQ_mask, n_kv, n_tokens, n_kv*lm_ggml_type_size(lctx.inp_KQ_mask->type), 0); + cb(KQ_mask, "KQ_mask", -1); - lm_ggml_backend_tensor_set(cur, batch.token, 0, n_tokens*lm_ggml_element_size(cur)); - } + // shift the entire K-cache if needed + if (do_rope_shift) { + llm_build_k_shift(ctx0, hparams, cparams, kv_self, gf, lctx.inp_K_shift, LLM_ROPE, n_ctx, freq_base, freq_scale, cb); + } + + for (int il = 0; il < n_layer; ++il) { + cur = llm_build_norm(ctx0, inpL, hparams, + model.layers[il].attn_norm, + model.layers[il].attn_norm_b, + LLM_NORM, cb, il); + cb(cur, "attn_norm", il); + + // self-attention + { + cur = lm_ggml_mul_mat(ctx0, model.layers[il].wqkv, cur); + cb(cur, "wqkv", il); + + cur = lm_ggml_add(ctx0, cur, model.layers[il].bqkv); + cb(cur, "bqkv", il); + + struct lm_ggml_tensor * tmpq = lm_ggml_cont(ctx0, lm_ggml_view_2d(ctx0, cur, n_embd, n_tokens, cur->nb[1], 0*sizeof(float)*(n_embd))); + struct lm_ggml_tensor * tmpk = lm_ggml_cont(ctx0, lm_ggml_view_2d(ctx0, cur, n_embd_gqa, n_tokens, cur->nb[1], 1*sizeof(float)*(n_embd))); + struct lm_ggml_tensor * Vcur = lm_ggml_cont(ctx0, lm_ggml_view_2d(ctx0, cur, n_embd_gqa, n_tokens, cur->nb[1], 1*sizeof(float)*(n_embd + n_embd_gqa))); + + cb(tmpq, "tmpq", il); + cb(tmpk, "tmpk", il); + cb(Vcur, "Vcur", il); + + struct lm_ggml_tensor * Qcur = lm_ggml_rope_custom( + ctx0, lm_ggml_reshape_3d(ctx0, tmpq, n_embd_head, n_head, n_tokens), inp_pos, + hparams.n_rot, 2, 0, n_orig_ctx, freq_base, freq_scale, + ext_factor, attn_factor, beta_fast, beta_slow + ); + cb(Qcur, "Qcur", il); + + struct lm_ggml_tensor * Kcur = lm_ggml_rope_custom( + ctx0, lm_ggml_reshape_3d(ctx0, tmpk, n_embd_head, n_head_kv, n_tokens), inp_pos, + hparams.n_rot, 2, 0, n_orig_ctx, freq_base, freq_scale, + ext_factor, attn_factor, beta_fast, beta_slow + ); + cb(Kcur, "Kcur", il); + + cur = llm_build_kv(ctx0, model, hparams, kv_self, gf, + model.layers[il].wo, model.layers[il].bo, + Kcur, Vcur, Qcur, KQ_mask, nullptr, n_ctx, n_tokens, kv_head, n_kv, 1.0f/sqrtf(float(n_embd_head)), cb, il); + cb(cur, "kqv_out", il); + } + + // add the input + struct lm_ggml_tensor * ffn_inp = lm_ggml_add(ctx0, cur, inpL); + cb(ffn_inp, "ffn_inp", il); + + // FF + { + cur = llm_build_norm(ctx0, ffn_inp, hparams, + model.layers[il].ffn_norm, + model.layers[il].ffn_norm_b, + LLM_NORM, cb, il); + cb(cur, "ffn_norm", il); + + cur = llm_build_ffn(ctx0, cur, + model.layers[il].ffn_up, model.layers[il].ffn_up_b, + NULL, NULL, + model.layers[il].ffn_down, model.layers[il].ffn_down_b, + 
NULL, + LLM_FFN_GELU, LLM_FFN_SEQ, cb, il); + cb(cur, "ffn_out", il); + } + + inpL = lm_ggml_add(ctx0, cur, ffn_inp); + cb(inpL, "l_out", il); + } + + cur = llm_build_norm(ctx0, inpL, hparams, + model.output_norm, + model.output_norm_b, + LLM_NORM, cb, -1); + cb(cur, "result_norm", -1); + + cur = lm_ggml_mul_mat(ctx0, model.output, cur); + cb(cur, "result_output", -1); + + lm_ggml_build_forward_expand(gf, cur); + + return gf; + } + + struct lm_ggml_cgraph * build_orion() { + struct lm_ggml_cgraph * gf = lm_ggml_new_graph_custom(ctx0, LLAMA_MAX_NODES, false); + + const int64_t n_embd_head = hparams.n_embd_head_v; + LM_GGML_ASSERT(n_embd_head == hparams.n_embd_head_k); + LM_GGML_ASSERT(n_embd_head == hparams.n_rot); + + struct lm_ggml_tensor * cur; + struct lm_ggml_tensor * inpL; + + inpL = llm_build_inp_embd(ctx0, hparams, batch, model.tok_embd, lctx.inp_tokens, lctx.inp_embd, cb); + cb(inpL, "inp_embd", -1); + + // inp_pos - contains the positions + struct lm_ggml_tensor * inp_pos = lm_ggml_view_1d(ctx0, lctx.inp_pos, n_tokens, 0); + cb(inp_pos, "inp_pos", -1); + + // KQ_mask (mask for 1 head, it will be broadcasted to all heads) + struct lm_ggml_tensor * KQ_mask = lm_ggml_view_2d(ctx0, lctx.inp_KQ_mask, n_kv, n_tokens, n_kv*lm_ggml_type_size(lctx.inp_KQ_mask->type), 0); + cb(KQ_mask, "KQ_mask", -1); + + // shift the entire K-cache if needed + if (do_rope_shift) { + llm_build_k_shift(ctx0, hparams, cparams, kv_self, gf, lctx.inp_K_shift, LLM_ROPE, n_ctx, freq_base, freq_scale, cb); + } + + for (int il = 0; il < n_layer; ++il) { + struct lm_ggml_tensor * inpSA = inpL; + + // norm + cur = llm_build_norm(ctx0, inpL, hparams, + model.layers[il].attn_norm, model.layers[il].attn_norm_b, + LLM_NORM, cb, il); + cb(cur, "attn_norm", il); + + // self-attention + { + // compute Q and K and RoPE them + struct lm_ggml_tensor * Qcur = lm_ggml_mul_mat(ctx0, model.layers[il].wq, cur); + cb(Qcur, "Qcur", il); + // if (model.layers[il].bq) { + // Qcur = lm_ggml_add(ctx0, Qcur, model.layers[il].bq); + // cb(Qcur, "Qcur", il); + // } + + struct lm_ggml_tensor * Kcur = lm_ggml_mul_mat(ctx0, model.layers[il].wk, cur); + cb(Kcur, "Kcur", il); + // if (model.layers[il].bk) { + // Kcur = lm_ggml_add(ctx0, Kcur, model.layers[il].bk); + // cb(Kcur, "Kcur", il); + // } + + struct lm_ggml_tensor * Vcur = lm_ggml_mul_mat(ctx0, model.layers[il].wv, cur); + cb(Vcur, "Vcur", il); + // if (model.layers[il].bv) { + // Vcur = lm_ggml_add(ctx0, Vcur, model.layers[il].bv); + // cb(Vcur, "Vcur", il); + // } + + Qcur = lm_ggml_rope_custom( + ctx0, lm_ggml_reshape_3d(ctx0, Qcur, n_embd_head, n_head, n_tokens), inp_pos, + hparams.n_rot, 2, 0, n_orig_ctx, freq_base, freq_scale, + ext_factor, attn_factor, beta_fast, beta_slow + ); + cb(Qcur, "Qcur", il); + + Kcur = lm_ggml_rope_custom( + ctx0, lm_ggml_reshape_3d(ctx0, Kcur, n_embd_head, n_head_kv, n_tokens), inp_pos, + hparams.n_rot, 2, 0, n_orig_ctx, freq_base, freq_scale, + ext_factor, attn_factor, beta_fast, beta_slow + ); + cb(Kcur, "Kcur", il); + + cur = llm_build_kv(ctx0, model, hparams, kv_self, gf, + model.layers[il].wo, NULL, + Kcur, Vcur, Qcur, KQ_mask, nullptr, n_ctx, n_tokens, kv_head, n_kv, 1.0f/sqrtf(float(n_embd_head)), cb, il); + cb(cur, "kqv_out", il); + } + + struct lm_ggml_tensor * ffn_inp = lm_ggml_add(ctx0, cur, inpSA); + cb(ffn_inp, "ffn_inp", il); + + // feed-forward network + cur = llm_build_norm(ctx0, ffn_inp, hparams, + model.layers[il].ffn_norm, model.layers[il].ffn_norm_b, + LLM_NORM, cb, il); + cb(cur, "ffn_norm", il); + + cur = llm_build_ffn(ctx0, cur, 
+ model.layers[il].ffn_up, NULL, + model.layers[il].ffn_gate, NULL, + model.layers[il].ffn_down, NULL, + NULL, + LLM_FFN_SILU, LLM_FFN_PAR, cb, il); + cb(cur, "ffn_out", il); + + cur = lm_ggml_add(ctx0, cur, ffn_inp); + cb(cur, "l_out", il); + + // input for next layer + inpL = cur; + } + + cur = inpL; + + cur = llm_build_norm(ctx0, cur, hparams, + model.output_norm, model.output_norm_b, + LLM_NORM, cb, -1); + cb(cur, "result_norm", -1); + + // lm_head + cur = lm_ggml_mul_mat(ctx0, model.output, cur); + cb(cur, "result_output", -1); + + lm_ggml_build_forward_expand(gf, cur); + + return gf; + } + + struct lm_ggml_cgraph * build_internlm2() { + struct lm_ggml_cgraph * gf = lm_ggml_new_graph_custom(ctx0, LLAMA_MAX_NODES, false); + + const int64_t n_embd_head = hparams.n_embd_head_v; + LM_GGML_ASSERT(n_embd_head == hparams.n_embd_head_k); + LM_GGML_ASSERT(n_embd_head == hparams.n_rot); + + struct lm_ggml_tensor * cur; + struct lm_ggml_tensor * inpL; + + inpL = llm_build_inp_embd(ctx0, hparams, batch, model.tok_embd, lctx.inp_tokens, lctx.inp_embd, cb); + cb(inpL, "inp_embd", -1); + + // inp_pos - contains the positions + struct lm_ggml_tensor * inp_pos = lm_ggml_view_1d(ctx0, lctx.inp_pos, n_tokens, 0); + cb(inp_pos, "inp_pos", -1); + + // KQ_mask (mask for 1 head, it will be broadcasted to all heads) + struct lm_ggml_tensor * KQ_mask = lm_ggml_view_2d(ctx0, lctx.inp_KQ_mask, n_kv, n_tokens, n_kv*lm_ggml_type_size(lctx.inp_KQ_mask->type), 0); + cb(KQ_mask, "KQ_mask", -1); + + // shift the entire K-cache if needed + if (do_rope_shift) { + llm_build_k_shift(ctx0, hparams, cparams, kv_self, gf, lctx.inp_K_shift, LLM_ROPE, n_ctx, freq_base, freq_scale, cb); + } + + for (int il = 0; il < n_layer; ++il) { + struct lm_ggml_tensor * inpSA = inpL; + + // norm + cur = llm_build_norm(ctx0, inpL, hparams, + model.layers[il].attn_norm, NULL, + LLM_NORM_RMS, cb, il); + cb(cur, "attn_norm", il); + + // self-attention + { + // compute Q and K and RoPE them + struct lm_ggml_tensor * Qcur = lm_ggml_mul_mat(ctx0, model.layers[il].wq, cur); + cb(Qcur, "Qcur", il); + if (model.layers[il].bq) { + Qcur = lm_ggml_add(ctx0, Qcur, model.layers[il].bq); + cb(Qcur, "Qcur", il); + } + + struct lm_ggml_tensor * Kcur = lm_ggml_mul_mat(ctx0, model.layers[il].wk, cur); + cb(Kcur, "Kcur", il); + if (model.layers[il].bk) { + Kcur = lm_ggml_add(ctx0, Kcur, model.layers[il].bk); + cb(Kcur, "Kcur", il); + } + + struct lm_ggml_tensor * Vcur = lm_ggml_mul_mat(ctx0, model.layers[il].wv, cur); + cb(Vcur, "Vcur", il); + if (model.layers[il].bv) { + Vcur = lm_ggml_add(ctx0, Vcur, model.layers[il].bv); + cb(Vcur, "Vcur", il); + } + + Qcur = lm_ggml_rope_custom( + ctx0, lm_ggml_reshape_3d(ctx0, Qcur, n_embd_head, n_head, n_tokens), inp_pos, + hparams.n_rot, 0, 0, n_orig_ctx, freq_base, freq_scale, + ext_factor, attn_factor, beta_fast, beta_slow + ); + cb(Qcur, "Qcur", il); + + Kcur = lm_ggml_rope_custom( + ctx0, lm_ggml_reshape_3d(ctx0, Kcur, n_embd_head, n_head_kv, n_tokens), inp_pos, + hparams.n_rot, 0, 0, n_orig_ctx, freq_base, freq_scale, + ext_factor, attn_factor, beta_fast, beta_slow + ); + cb(Kcur, "Kcur", il); + + cur = llm_build_kv(ctx0, model, hparams, kv_self, gf, + model.layers[il].wo, model.layers[il].bo, + Kcur, Vcur, Qcur, KQ_mask, nullptr, n_ctx, n_tokens, kv_head, n_kv, 1.0f/sqrtf(float(n_embd_head)), cb, il); + cb(cur, "kqv_out", il); + } + + struct lm_ggml_tensor * ffn_inp = lm_ggml_add(ctx0, cur, inpSA); + cb(ffn_inp, "ffn_inp", il); + + // feed-forward network + cur = llm_build_norm(ctx0, ffn_inp, hparams, + 
model.layers[il].ffn_norm, NULL, + LLM_NORM_RMS, cb, il); + cb(cur, "ffn_norm", il); + + cur = llm_build_ffn(ctx0, cur, + model.layers[il].ffn_up, NULL, + model.layers[il].ffn_gate, NULL, + model.layers[il].ffn_down, NULL, + NULL, + LLM_FFN_SILU, LLM_FFN_PAR, cb, il); + cb(cur, "ffn_out", il); + + cur = lm_ggml_add(ctx0, cur, ffn_inp); + cb(cur, "l_out", il); + + // input for next layer + inpL = cur; + } + + cur = inpL; + + cur = llm_build_norm(ctx0, cur, hparams, + model.output_norm, NULL, + LLM_NORM_RMS, cb, -1); + cb(cur, "result_norm", -1); + + // lm_head + cur = lm_ggml_mul_mat(ctx0, model.output, cur); + cb(cur, "result_output", -1); + + lm_ggml_build_forward_expand(gf, cur); + + return gf; + } + + // ref: https://arxiv.org/abs/2203.03466 + // https://github.com/ggerganov/llama.cpp/issues/5276#issuecomment-1925774738 + // based on the original build_llama() function + struct lm_ggml_cgraph * build_minicpm() { + struct lm_ggml_cgraph * gf = lm_ggml_new_graph_custom(ctx0, LLAMA_MAX_NODES, false); + + const int64_t n_embd_head = hparams.n_embd_head_v; + LM_GGML_ASSERT(n_embd_head == hparams.n_embd_head_k); + LM_GGML_ASSERT(n_embd_head == hparams.n_rot); + + const int64_t n_embd = hparams.n_embd; + //TODO: if the model varies, these parameters need to be read from the model + const int64_t n_embd_base = 256; + const float scale_embd = 12.0f; + const float scale_depth = 1.4f; + + struct lm_ggml_tensor * cur; + struct lm_ggml_tensor * inpL; + + inpL = llm_build_inp_embd(ctx0, hparams, batch, model.tok_embd, lctx.inp_tokens, lctx.inp_embd, cb); + cb(inpL, "inp_embd", -1); + + // scale the input embeddings + inpL = lm_ggml_scale(ctx0, inpL, scale_embd); + cb(inpL, "inp_scaled", -1); + + // inp_pos - contains the positions + struct lm_ggml_tensor * inp_pos = lm_ggml_view_1d(ctx0, lctx.inp_pos, n_tokens, 0); + cb(inp_pos, "inp_pos", -1); - alloc_inp_tokens = true; + // KQ_mask (mask for 1 head, it will be broadcasted to all heads) + struct lm_ggml_tensor * KQ_mask = lm_ggml_view_2d(ctx0, lctx.inp_KQ_mask, n_kv, n_tokens, n_kv*lm_ggml_type_size(lctx.inp_KQ_mask->type), 0); + cb(KQ_mask, "KQ_mask", -1); + + // shift the entire K-cache if needed + if (do_rope_shift) { + llm_build_k_shift(ctx0, hparams, cparams, kv_self, gf, lctx.inp_K_shift, LLM_ROPE, n_ctx, freq_base, freq_scale, cb); } - if (!alloc_inp_embd && strcmp(name, "inp_embd") == 0 && batch.embd) { - lm_ggml_tallocr_alloc(lctx.alloc, cur); + for (int il = 0; il < n_layer; ++il) { + struct lm_ggml_tensor * inpSA = inpL; + + // norm + cur = llm_build_norm(ctx0, inpL, hparams, + model.layers[il].attn_norm, NULL, + LLM_NORM_RMS, cb, il); + cb(cur, "attn_norm", il); + + // self-attention + { + // compute Q and K and RoPE them + struct lm_ggml_tensor * Qcur = lm_ggml_mul_mat(ctx0, model.layers[il].wq, cur); + cb(Qcur, "Qcur", il); + if (model.layers[il].bq) { + Qcur = lm_ggml_add(ctx0, Qcur, model.layers[il].bq); + cb(Qcur, "Qcur", il); + } + + struct lm_ggml_tensor * Kcur = lm_ggml_mul_mat(ctx0, model.layers[il].wk, cur); + cb(Kcur, "Kcur", il); + if (model.layers[il].bk) { + Kcur = lm_ggml_add(ctx0, Kcur, model.layers[il].bk); + cb(Kcur, "Kcur", il); + } + + struct lm_ggml_tensor * Vcur = lm_ggml_mul_mat(ctx0, model.layers[il].wv, cur); + cb(Vcur, "Vcur", il); + if (model.layers[il].bv) { + Vcur = lm_ggml_add(ctx0, Vcur, model.layers[il].bv); + cb(Vcur, "Vcur", il); + } + + Qcur = lm_ggml_rope_custom( + ctx0, lm_ggml_reshape_3d(ctx0, Qcur, n_embd_head, n_head, n_tokens), inp_pos, + hparams.n_rot, 0, 0, n_orig_ctx, freq_base, freq_scale, + 
ext_factor, attn_factor, beta_fast, beta_slow + ); + cb(Qcur, "Qcur", il); - if (!lm_ggml_tallocr_is_measure(lctx.alloc) && batch.embd) { - const int64_t n_embd = cur->ne[0]; - const int64_t n_tokens = cur->ne[1]; + Kcur = lm_ggml_rope_custom( + ctx0, lm_ggml_reshape_3d(ctx0, Kcur, n_embd_head, n_head_kv, n_tokens), inp_pos, + hparams.n_rot, 0, 0, n_orig_ctx, freq_base, freq_scale, + ext_factor, attn_factor, beta_fast, beta_slow + ); + cb(Kcur, "Kcur", il); - lm_ggml_backend_tensor_set(cur, batch.embd, 0, n_tokens*n_embd*lm_ggml_element_size(cur)); + cur = llm_build_kv(ctx0, model, hparams, kv_self, gf, + model.layers[il].wo, model.layers[il].bo, + Kcur, Vcur, Qcur, KQ_mask, nullptr, n_ctx, n_tokens, kv_head, n_kv, 1.0f/sqrtf(float(n_embd_head)), cb, il); + cb(cur, "kqv_out", il); } - alloc_inp_embd = true; - } + // scale_res - scale the hidden states for residual connection + const float scale_res = scale_depth/sqrtf(float(n_layer)); + cur = lm_ggml_scale(ctx0, cur, scale_res); + cb(cur, "hidden_scaled", -1); - if (!alloc_inp_pos && strcmp(name, "inp_pos") == 0) { - lm_ggml_tallocr_alloc(lctx.alloc, cur); + struct lm_ggml_tensor * ffn_inp = lm_ggml_add(ctx0, cur, inpSA); + cb(ffn_inp, "ffn_inp", il); - if (!lm_ggml_tallocr_is_measure(lctx.alloc) && batch.pos) { - const int64_t n_tokens = cur->ne[0]; + // feed-forward network + { + cur = llm_build_norm(ctx0, ffn_inp, hparams, + model.layers[il].ffn_norm, NULL, + LLM_NORM_RMS, cb, il); + cb(cur, "ffn_norm", il); - static_assert(std::is_same::value, "llama_pos must be int32_t"); - lm_ggml_backend_tensor_set(cur, batch.pos, 0, n_tokens*lm_ggml_element_size(cur)); + cur = llm_build_ffn(ctx0, cur, + model.layers[il].ffn_up, NULL, + model.layers[il].ffn_gate, NULL, + model.layers[il].ffn_down, NULL, + NULL, + LLM_FFN_SILU, LLM_FFN_PAR, cb, il); + cb(cur, "ffn_out", il); } - alloc_inp_pos = true; + // scale the hidden states for residual connection + cur = lm_ggml_scale(ctx0, cur, scale_res); + cb(cur, "hidden_scaled_ffn", -1); + + cur = lm_ggml_add(ctx0, cur, ffn_inp); + cb(cur, "l_out", il); + + // input for next layer + inpL = cur; } - if (!alloc_inp_KQ_mask && strcmp(name, "KQ_mask") == 0) { - lm_ggml_tallocr_alloc(lctx.alloc, cur); + cur = inpL; - if (!lm_ggml_tallocr_is_measure(lctx.alloc)) { - const int64_t n_kv = cur->ne[0]; - const int64_t n_tokens = cur->ne[1]; + cur = llm_build_norm(ctx0, cur, hparams, + model.output_norm, NULL, + LLM_NORM_RMS, cb, -1); + cb(cur, "result_norm", -1); - float * data; - if (lm_ggml_backend_buffer_is_host(cur->buffer)) { - data = (float *) cur->data; - } else { - lctx.buf_copy.resize(lm_ggml_nbytes(cur)); - data = (float *) lctx.buf_copy.data(); - } + // lm_head scaling + const float scale_lmhead = float(n_embd_base)/float(n_embd); + cur = lm_ggml_scale(ctx0, cur, scale_lmhead); + cb(cur, "lmhead_scaling", -1); - for (int h = 0; h < 1; ++h) { - for (int j = 0; j < n_tokens; ++j) { - const llama_pos pos = batch.pos[j]; - const llama_seq_id seq_id = batch.seq_id[j][0]; - - for (int i = 0; i < n_kv; ++i) { - float f; - if (!lctx.kv_self.cells[i].has_seq_id(seq_id) || lctx.kv_self.cells[i].pos > pos) { - f = -INFINITY; - } else { - f = 0; - } - data[h*(n_kv*n_tokens) + j*n_kv + i] = f; - } - } - } + // lm_head + cur = lm_ggml_mul_mat(ctx0, model.tok_embd, cur); + cb(cur, "result_output", -1); - if (data != cur->data) { - lm_ggml_backend_tensor_set(cur, data, 0, lm_ggml_nbytes(cur)); - } - } + lm_ggml_build_forward_expand(gf, cur); + + return gf; + } + + struct lm_ggml_cgraph * build_gemma() { + struct 
lm_ggml_cgraph * gf = lm_ggml_new_graph_custom(ctx0, LLAMA_MAX_NODES, false); - alloc_inp_KQ_mask = true; + const int64_t n_embd_head_k = hparams.n_embd_head_k; + + struct lm_ggml_tensor * cur; + struct lm_ggml_tensor * inpL; + + inpL = llm_build_inp_embd(ctx0, hparams, batch, model.tok_embd, lctx.inp_tokens, lctx.inp_embd, cb); + cb(inpL, "inp_embd", -1); + inpL = lm_ggml_scale(ctx0, inpL, sqrtf(n_embd)); + cb(inpL, "inp_scaled", -1); + + // inp_pos - contains the positions + struct lm_ggml_tensor * inp_pos = lm_ggml_view_1d(ctx0, lctx.inp_pos, n_tokens, 0); + cb(inp_pos, "inp_pos", -1); + + // KQ_mask (mask for 1 head, it will be broadcasted to all heads) + struct lm_ggml_tensor * KQ_mask = lm_ggml_view_2d(ctx0, lctx.inp_KQ_mask, n_kv, n_tokens, n_kv*lm_ggml_type_size(lctx.inp_KQ_mask->type), 0); + cb(KQ_mask, "KQ_mask", -1); + + // shift the entire K-cache if needed + if (do_rope_shift) { + llm_build_k_shift(ctx0, hparams, cparams, kv_self, gf, lctx.inp_K_shift, LLM_ROPE, n_ctx, freq_base, freq_scale, cb); } - if (!alloc_inp_K_shift && strcmp(name, "K_shift") == 0) { - lm_ggml_tallocr_alloc(lctx.alloc, cur); + for (int il = 0; il < n_layer; ++il) { - if (!lm_ggml_tallocr_is_measure(lctx.alloc)) { - const int64_t n_ctx = cur->ne[0]; + // norm + cur = llm_build_norm(ctx0, inpL, hparams, + model.layers[il].attn_norm, NULL, + LLM_NORM_RMS, cb, il); + cb(cur, "attn_norm", il); - int32_t * data; - if (lm_ggml_backend_buffer_is_host(cur->buffer)) { - data = (int32_t *) cur->data; - } else { - lctx.buf_copy.resize(lm_ggml_nbytes(cur)); - data = (int32_t *) lctx.buf_copy.data(); - } + // self-attention + { + // compute Q and K and RoPE them + struct lm_ggml_tensor * Qcur = lm_ggml_mul_mat(ctx0, model.layers[il].wq, cur); + cb(Qcur, "Qcur", il); - for (int i = 0; i < n_ctx; ++i) { - data[i] = lctx.kv_self.cells[i].delta; - } + struct lm_ggml_tensor * Kcur = lm_ggml_mul_mat(ctx0, model.layers[il].wk, cur); + cb(Kcur, "Kcur", il); - if (data != cur->data) { - lm_ggml_backend_tensor_set(cur, data, 0, lm_ggml_nbytes(cur)); - } + struct lm_ggml_tensor * Vcur = lm_ggml_mul_mat(ctx0, model.layers[il].wv, cur); + cb(Vcur, "Vcur", il); + + Qcur = lm_ggml_rope_custom( + ctx0, lm_ggml_reshape_3d(ctx0, Qcur, n_embd_head_k, n_head, n_tokens), inp_pos, + n_embd_head_k, 2, 0, n_orig_ctx, freq_base, freq_scale, + ext_factor, attn_factor, beta_fast, beta_slow); + cb(Qcur, "Qcur", il); + Qcur = lm_ggml_scale(ctx0, Qcur, 1.0f / sqrtf(float(n_embd_head_k))); + cb(Qcur, "Qcur_scaled", il); + + Kcur = lm_ggml_rope_custom( + ctx0, lm_ggml_reshape_3d(ctx0, Kcur, n_embd_head_k, n_head_kv, n_tokens), inp_pos, + n_embd_head_k, 2, 0, n_orig_ctx, freq_base, freq_scale, + ext_factor, attn_factor, beta_fast, beta_slow); + cb(Kcur, "Kcur", il); + + cur = llm_build_kv(ctx0, model, hparams, kv_self, gf, + model.layers[il].wo, NULL, + Kcur, Vcur, Qcur, KQ_mask, nullptr, n_ctx, n_tokens, kv_head, n_kv, 1.0f, cb, il); + cb(cur, "kqv_out", il); + } + struct lm_ggml_tensor * sa_out = lm_ggml_add(ctx0, cur, inpL); + cb(sa_out, "sa_out", il); + + cur = llm_build_norm(ctx0, sa_out, hparams, + model.layers[il].ffn_norm, NULL, + LLM_NORM_RMS, cb, il); + cb(cur, "ffn_norm", il); + + // feed-forward network + { + cur = llm_build_ffn(ctx0, cur, + model.layers[il].ffn_up, NULL, + model.layers[il].ffn_gate, NULL, + model.layers[il].ffn_down, NULL, + NULL, + LLM_FFN_GELU, LLM_FFN_PAR, cb, il); + cb(cur, "ffn_out", il); } - alloc_inp_K_shift = true; + cur = lm_ggml_add(ctx0, cur, sa_out); + cb(cur, "l_out", il); + + // input for next layer + inpL 
= cur; + } + + cur = inpL; + + cur = llm_build_norm(ctx0, cur, hparams, + model.output_norm, NULL, + LLM_NORM_RMS, cb, -1); + cb(cur, "result_norm", -1); + + // lm_head + cur = lm_ggml_mul_mat(ctx0, model.output, cur); + cb(cur, "result_output", -1); + + lm_ggml_build_forward_expand(gf, cur); + + return gf; + } +}; + +static struct lm_ggml_cgraph * llama_build_graph( + llama_context & lctx, + const llama_batch & batch, + bool worst_case) { + const auto & model = lctx.model; + + // this callback allows us to apply custom logic to each tensor (e.g. ggml-alloc, offloading, etc.) + llm_build_cb cb = [&](struct lm_ggml_tensor * cur, const char * name, int il) { + if (il >= 0) { + lm_ggml_format_name(cur, "%s-%d", name, il); + } else { + lm_ggml_set_name(cur, name); + } + + if (!lctx.cparams.offload_kqv) { + if (strcmp(name, "kqv_merged_cont") == 0) { + // all nodes between the KV store and the attention output are run on the CPU + lm_ggml_backend_sched_set_node_backend(lctx.sched, cur, lctx.backend_cpu); + } } }; @@ -6142,6 +7596,11 @@ static struct lm_ggml_cgraph * llama_build_graph( { result = llm.build_refact(); } break; + case LLM_ARCH_BERT: + case LLM_ARCH_NOMIC_BERT: + { + result = llm.build_bert(); + } break; case LLM_ARCH_BLOOM: { result = llm.build_bloom(); @@ -6158,6 +7617,10 @@ static struct lm_ggml_cgraph * llama_build_graph( { result = llm.build_qwen(); } break; + case LLM_ARCH_QWEN2: + { + result = llm.build_qwen2(); + } break; case LLM_ARCH_PHI2: { result = llm.build_phi2(); @@ -6170,13 +7633,156 @@ static struct lm_ggml_cgraph * llama_build_graph( { result = llm.build_gpt2(); } break; + case LLM_ARCH_CODESHELL: + { + result = llm.build_codeshell(); + } break; + case LLM_ARCH_ORION: + { + result = llm.build_orion(); + } break; + case LLM_ARCH_INTERNLM2: + { + result = llm.build_internlm2(); + } break; + case LLM_ARCH_MINICPM: + { + result = llm.build_minicpm(); + } break; + case LLM_ARCH_GEMMA: + { + result = llm.build_gemma(); + } break; default: LM_GGML_ASSERT(false); } - llm.free(); + llm.free(); + + return result; +} + +static void llama_set_inputs(llama_context & lctx, const llama_batch & batch) { + // + // set input data + // + + const auto & hparams = lctx.model.hparams; + const auto & cparams = lctx.cparams; + const auto & kv_self = lctx.kv_self; + + if (batch.token) { + const int64_t n_tokens = batch.n_tokens; + + lm_ggml_backend_tensor_set(lctx.inp_tokens, batch.token, 0, n_tokens*lm_ggml_element_size(lctx.inp_tokens)); + } + + if (batch.embd) { + const int64_t n_embd = hparams.n_embd; + const int64_t n_tokens = batch.n_tokens; + + lm_ggml_backend_tensor_set(lctx.inp_embd, batch.embd, 0, n_tokens*n_embd*lm_ggml_element_size(lctx.inp_embd)); + } + + if (batch.pos) { + const int64_t n_tokens = batch.n_tokens; + + lm_ggml_backend_tensor_set(lctx.inp_pos, batch.pos, 0, n_tokens*lm_ggml_element_size(lctx.inp_pos)); + } + + { + const int64_t n_kv = kv_self.n; + const int64_t n_tokens = batch.n_tokens; + + assert(lm_ggml_backend_buffer_is_host(lctx.inp_KQ_mask->buffer)); + + float * data = (float *) lctx.inp_KQ_mask->data; + + for (int h = 0; h < 1; ++h) { + for (int j = 0; j < n_tokens; ++j) { + const llama_pos pos = batch.pos[j]; + const llama_seq_id seq_id = batch.seq_id[j][0]; + + for (int i = 0; i < n_kv; ++i) { + float f; + if (!lctx.kv_self.cells[i].has_seq_id(seq_id) || + (hparams.causal_attn && lctx.kv_self.cells[i].pos > pos)) { + f = -INFINITY; + } else { + f = 0; + } + data[h*(n_kv*n_tokens) + j*n_kv + i] = f; + } + } + } + } + + if (hparams.need_kq_pos) { + const 
int64_t n_kv = kv_self.n; + + assert(lm_ggml_backend_buffer_is_host(lctx.inp_KQ_pos->buffer)); + + float * data = (float *) lctx.inp_KQ_pos->data; + + for (int i = 0; i < n_kv; ++i) { + data[i] = float(lctx.kv_self.cells[i].pos); + } + } + + if (kv_self.has_shift) { + const int64_t n_ctx = cparams.n_ctx; + + assert(lm_ggml_backend_buffer_is_host(lctx.inp_K_shift->buffer)); + + int32_t * data = (int32_t *) lctx.inp_K_shift->data; + + for (int i = 0; i < n_ctx; ++i) { + data[i] = lctx.kv_self.cells[i].delta; + } + } + + if (cparams.do_pooling && hparams.pooling_type == LLAMA_POOLING_MEAN) { + const int64_t n_tokens = batch.n_tokens; + + LM_GGML_ASSERT(lm_ggml_backend_buffer_is_host(lctx.inp_mean->buffer)); + float * data = (float *) lctx.inp_mean->data; + + memset(lctx.inp_mean->data, 0, n_tokens * n_tokens * lm_ggml_element_size(lctx.inp_mean)); + + std::vector sum(n_tokens, 0); + for (int i = 0; i < n_tokens; ++i) { + const llama_seq_id seq_id = batch.seq_id[i][0]; + sum[seq_id] += 1; + } + + std::vector div(n_tokens, 0.0f); + for (int i = 0; i < n_tokens; ++i) { + const uint64_t s = sum[i]; + if (s > 0) { + div[i] = 1.0f/float(s); + } + } + + for (int i = 0; i < n_tokens; ++i) { + const llama_seq_id seq_id = batch.seq_id[i][0]; + data[seq_id*n_tokens + i] = div[seq_id]; + } + } + + if (cparams.do_pooling && hparams.pooling_type == LLAMA_POOLING_CLS) { + const int64_t n_tokens = batch.n_tokens; - return result; + LM_GGML_ASSERT(lm_ggml_backend_buffer_is_host(lctx.inp_cls->buffer)); + uint32_t * data = (uint32_t *) lctx.inp_cls->data; + + for (int i = 0; i < n_tokens; ++i) { + const llama_seq_id seq_id = batch.seq_id[i][0]; + const llama_pos pos = batch.pos[i]; + if (pos == 0) { + data[seq_id] = i; + } + } + } } // decode a batch of tokens by evaluating the transformer @@ -6277,17 +7883,22 @@ static int llama_decode_internal( lm_ggml_backend_sched_reset(lctx.sched); lm_ggml_backend_sched_set_eval_callback(lctx.sched, lctx.cparams.cb_eval, lctx.cparams.cb_eval_user_data); - lm_ggml_cgraph * gf = llama_build_graph(lctx, batch); + lm_ggml_cgraph * gf = llama_build_graph(lctx, batch, false); // the output is always the last tensor in the graph struct lm_ggml_tensor * res = gf->nodes[gf->n_nodes - 1]; - LM_GGML_ASSERT(strcmp(res->name, "result_output") == 0); - - // the embeddings could be the second to last tensor, or the third to last tensor struct lm_ggml_tensor * embeddings = gf->nodes[gf->n_nodes - 2]; - if (strcmp(embeddings->name, "result_norm") != 0) { - embeddings = gf->nodes[gf->n_nodes - 3]; - LM_GGML_ASSERT(strcmp(embeddings->name, "result_norm") == 0); + if (strcmp(res->name, "result_output") == 0) { + // the embeddings could be the second to last tensor, or the third to last tensor + if (strcmp(embeddings->name, "result_norm") != 0) { + embeddings = gf->nodes[gf->n_nodes - 3]; + LM_GGML_ASSERT(strcmp(embeddings->name, "result_norm") == 0); + } + } else if (strcmp(res->name, "result_embd") == 0) { + embeddings = res; + res = nullptr; + } else { + LM_GGML_ASSERT(false); } // LLAMA_LOG_INFO("graph build time: %.3f ms (%d nodes, %d leafs)\n", (lm_ggml_time_us() - t_start_us)/1000.0, gf->n_nodes, gf->n_leafs); @@ -6297,15 +7908,12 @@ static int llama_decode_internal( // TODO: this is mostly important for Apple Silicon where CBLAS is still performing very well // we still need some threads to process all non-mul_mat ops, but not too much to avoid interfering // with the BLAS calls. 
need a better solution - if (n_tokens >= 32 && lm_ggml_cpu_has_blas() && !lm_ggml_cpu_has_gpublas()) { + // MoE Special Case: This logic applies when hparams.n_expert == 0, i.e. the model is NOT an MoE model. When an MoE is + // being processed then Accelerate/BLAS will not be involved, so capping would limit performance. + if (n_tokens >= 32 && hparams.n_expert == 0 && lm_ggml_cpu_has_blas() && !lm_ggml_cpu_has_gpublas()) { n_threads = std::min(4, n_threads); } - const bool fully_offloaded = model.n_gpu_layers >= (int) hparams.n_layer + 1; - if (lm_ggml_cpu_has_cublas() && fully_offloaded) { - n_threads = 1; - } - #ifdef LM_GGML_USE_MPI const int64_t n_layer = hparams.n_layer; lm_ggml_mpi_graph_compute_pre(lctx.ctx_mpi, gf, n_layer); @@ -6320,6 +7928,9 @@ static int llama_decode_internal( if (lctx.backend_cpu != nullptr) { lm_ggml_backend_cpu_set_n_threads(lctx.backend_cpu, n_threads); } + + llama_set_inputs(lctx, batch); + lm_ggml_backend_sched_graph_compute(lctx.sched, gf); // fprintf(stderr, "splits: %d\n", lm_ggml_backend_sched_get_n_splits(lctx.sched)); @@ -6359,7 +7970,7 @@ static int llama_decode_internal( // extract logits // TODO: do not compute and extract logits if only embeddings are needed // need to update the graphs to skip "result_output" - { + if (res) { auto & logits_out = lctx.logits; #ifndef NDEBUG @@ -6403,9 +8014,12 @@ static int llama_decode_internal( if (!lctx.embedding.empty()) { auto & embedding_out = lctx.embedding; - embedding_out.resize(n_embd); + const int64_t embd_pos = res ? n_embd * (n_tokens-1) : 0; + const int64_t embd_size = res ? n_embd : n_embd * n_tokens; + + embedding_out.resize(embd_size); lm_ggml_backend_t embeddings_backend = lm_ggml_backend_sched_get_node_backend(lctx.sched, embeddings); - lm_ggml_backend_tensor_get_async(embeddings_backend, embeddings, embedding_out.data(), (n_embd*(n_tokens - 1))*sizeof(float), n_embd*sizeof(float)); + lm_ggml_backend_tensor_get_async(embeddings_backend, embeddings, embedding_out.data(), embd_pos*sizeof(float), embd_size*sizeof(float)); lm_ggml_backend_synchronize(embeddings_backend); } @@ -6469,6 +8083,9 @@ static uint8_t llama_token_to_byte(const llama_vocab& vocab, llama_token id) { LM_GGML_ASSERT(false); return unicode_to_bytes_bpe(token_data.text); } + case LLAMA_VOCAB_TYPE_WPM: { + LM_GGML_ASSERT(false); + } default: LM_GGML_ASSERT(false); } @@ -6479,8 +8096,15 @@ static llama_token llama_byte_to_token(const llama_vocab & vocab, uint8_t ch) { switch (llama_vocab_get_type(vocab)) { case LLAMA_VOCAB_TYPE_SPM: { const char buf[7] = { '<', '0', 'x', hex[ch >> 4], hex[ch & 15], '>', 0 }; - return vocab.token_to_id.at(buf); + auto token = vocab.token_to_id.find(buf); + if (token != vocab.token_to_id.end()) { + return (*token).second; + } + // Try to fall back to just the byte as a string + const char buf2[2] = { (char)ch, 0 }; + return vocab.token_to_id.at(buf2); } + case LLAMA_VOCAB_TYPE_WPM: case LLAMA_VOCAB_TYPE_BPE: { return vocab.token_to_id.at(bytes_to_unicode_bpe(ch)); } @@ -6526,7 +8150,7 @@ struct llm_bigram_spm { }; struct llm_tokenizer_spm { - llm_tokenizer_spm(const llama_vocab & vocab): vocab(vocab) {} + llm_tokenizer_spm(const llama_vocab & vocab) : vocab(vocab) {} void tokenize(const std::string & text, std::vector & output) { // split string into utf8 chars @@ -6601,6 +8225,7 @@ struct llm_tokenizer_spm { if (p == rev_merge.end()) { // output any symbols that did not form tokens as bytes. 
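Editor's note: this byte-output path relies on llama_byte_to_token above, which now tolerates SPM vocabularies that lack the "<0xXX>" byte tokens by falling back to the single-byte string itself. A self-contained sketch of that lookup order, with a made-up two-entry vocabulary in place of vocab.token_to_id:

    #include <cstdio>
    #include <map>
    #include <string>

    int main() {
        // made-up vocabulary standing in for vocab.token_to_id
        std::map<std::string, int> token_to_id = {
            { "<0x41>", 314 },   // byte token for 'A'
            { "B",      512 },   // plain single-character token, no "<0x42>" entry
        };

        auto byte_to_token = [&](unsigned char ch) -> int {
            char buf[8];
            snprintf(buf, sizeof(buf), "<0x%02X>", (unsigned) ch);  // SPM-style byte token
            auto it = token_to_id.find(buf);
            if (it != token_to_id.end()) {
                return it->second;
            }
            // fallback introduced by this patch: try the raw byte as a 1-char token
            const char buf2[2] = { (char) ch, 0 };
            return token_to_id.at(buf2);
        };

        printf("'A' -> %d, 'B' -> %d\n", byte_to_token('A'), byte_to_token('B'));
        return 0;
    }

As in the real code, an unresolvable byte still throws (std::map::at), which surfaces as a tokenization failure rather than silently dropping input.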
+ output.reserve(output.size() + symbol.n); for (int j = 0; j < (int)symbol.n; ++j) { llama_vocab::id token_id = llama_byte_to_token(vocab, symbol.text[j]); output.push_back(token_id); @@ -6951,29 +8576,230 @@ struct llm_tokenizer_bpe { llm_bigram_bpe::queue work_queue; }; -typedef enum FRAGMENT_BUFFER_VARIANT_TYPE{ +struct llm_tokenizer_wpm { + llm_tokenizer_wpm(const llama_vocab & vocab): vocab(vocab) {} + + void tokenize(const std::string & text, std::vector & output) { + auto * token_map = &vocab.token_to_id; + + // normalize and split by whitespace + std::vector words = preprocess(text); + + // bos token prepended already + + // find the longest tokens that form the words + for (const std::string &word : words) { + // skip empty words + if (word.size() == 0) { + continue; + } + + // prepend phantom space + std::string word1 = "\xe2\x96\x81" + word; + int n = word1.size(); + + // we're at the start of a new word + int i = 0; + bool match_any = false; + + // move through character position in word + while (i < n) { + // loop through possible match length + bool match = false; + for (int j = n; j > i; j--) { + auto it = token_map->find(word1.substr(i, j - i)); + if (it != token_map->end()) { + output.push_back(it->second); + match = true; + match_any = true; + i = j; + break; + } + } + + // must be an unknown character + if (!match) { + i++; + } + } + + // we didn't find any matches for this word + if (!match_any) { + output.push_back(vocab.special_unk_id); + } + } + + // append eos token + output.push_back(vocab.special_eos_id); + } + + std::vector preprocess(const std::string & text) { + std::string ori_str = normalize(text); + uint64_t ori_size = ori_str.size(); + + // single punct / single symbol / single digit + // baseline: add whitespace on the left and right of punct and chinese characters + std::vector words; + std::string new_str = ""; + uint64_t i = 0; + while (i < ori_size) { + int utf_char_len = utf8_len(ori_str[i]); + if ((utf_char_len == 1) && ispunct(ori_str[i])) { + new_str += " "; + new_str += ori_str[i]; + new_str += " "; + i += 1; + } + else if ((utf_char_len == 3) && is_chinese_char(ori_str.substr(i, 3))) { + new_str += " "; + new_str += ori_str.substr(i, 3); + new_str += " "; + i += 3; + } + else { + new_str += ori_str[i]; + i += 1; + } + } + + // split by whitespace + uint64_t l = 0; + uint64_t r = 0; + while (r < new_str.size()) { + // if is whitespace + if (isspace(new_str[r])) { + if (r > l) words.push_back(new_str.substr(l, (r - l))); + l = r + 1; + r = l; + } + else { + r += 1; + } + } + if (r > l) { + words.push_back(new_str.substr(l, (r - l))); + } + return words; + } + + std::string normalize(const std::string & text) { + // TODO: handle chinese characters? 
https://github.com/huggingface/tokenizers/blob/ef5f50605ddf9f8caef1598c0e4853862b9707a7/tokenizers/src/normalizers/bert.rs#L98 + std::string text2 = strip_accents(text); + for (size_t i = 0; i < text2.size(); i += utf8_len(text2[i])) { + char c = text2[i]; + if (c >= 'A' && c <= 'Z') { + text2[i] = c - 'A' + 'a'; + } + } + return text2; + } + + bool is_chinese_char(const std::string & str) { + int len = str.length(); + unsigned int codepoint = 0; + int num_bytes = 0; + int i = 0; + unsigned char ch = static_cast(str[i]); + if (ch <= 0x7f) { + codepoint = ch; + num_bytes = 1; + } else if ((ch >> 5) == 0x06) { + codepoint = ch & 0x1f; + num_bytes = 2; + } else if ((ch >> 4) == 0x0e) { + codepoint = ch & 0x0f; + num_bytes = 3; + } else if ((ch >> 3) == 0x1e) { + codepoint = ch & 0x07; + num_bytes = 4; + } + for (int j = 1; j < num_bytes; ++j) { + if (i + j >= len) { + return false; // incomplete UTF-8 character + } + unsigned char next_ch = static_cast(str[i + j]); + if ((next_ch >> 6) != 0x02) { + return false; // invalid trailing byte + } + codepoint = (codepoint << 6) | (next_ch & 0x3f); + } + if ((codepoint >= 0x4E00 && codepoint <= 0x9FFF) || + (codepoint >= 0x3400 && codepoint <= 0x4DBF) || + (codepoint >= 0x20000 && codepoint <= 0x2A6DF) || + (codepoint >= 0x2A700 && codepoint <= 0x2B73F) || + (codepoint >= 0x2B740 && codepoint <= 0x2B81F) || + (codepoint >= 0x2B920 && codepoint <= 0x2CEAF) || // this should be 0x2B820 but in hf rust code it is 0x2B920 + (codepoint >= 0xF900 && codepoint <= 0xFAFF) || + (codepoint >= 0x2F800 && codepoint <= 0x2FA1F) || + (codepoint >= 0x3000 && codepoint <= 0x303F) || + (codepoint >= 0xFF00 && codepoint <= 0xFFEF)) { + return true; // NOLINT + } + return false; + } + + std::string strip_accents(const std::string & input_string) { + std::string resultString; + std::map accent_map = { + {"À", 'A'}, {"Á", 'A'}, {"Â", 'A'}, {"Ã", 'A'}, {"Ä", 'A'}, {"Å", 'A'}, + {"à", 'a'}, {"á", 'a'}, {"â", 'a'}, {"ã", 'a'}, {"ä", 'a'}, {"å", 'a'}, + {"È", 'E'}, {"É", 'E'}, {"Ê", 'E'}, {"Ë", 'E'}, {"è", 'e'}, {"é", 'e'}, + {"ê", 'e'}, {"ë", 'e'}, {"Ì", 'I'}, {"Í", 'I'}, {"Î", 'I'}, {"Ï", 'I'}, + {"ì", 'i'}, {"í", 'i'}, {"î", 'i'}, {"ï", 'i'}, {"Ò", 'O'}, {"Ó", 'O'}, + {"Ô", 'O'}, {"Õ", 'O'}, {"Ö", 'O'}, {"ò", 'o'}, {"ó", 'o'}, {"ô", 'o'}, + {"õ", 'o'}, {"ö", 'o'}, {"Ù", 'U'}, {"Ú", 'U'}, {"Û", 'U'}, {"Ü", 'U'}, + {"ù", 'u'}, {"ú", 'u'}, {"û", 'u'}, {"ü", 'u'}, {"Ý", 'Y'}, {"ý", 'y'}, + {"Ç", 'C'}, {"ç", 'c'}, {"Ñ", 'N'}, {"ñ", 'n'}, + }; + + for (size_t i = 0; i < input_string.length();) { + int len = utf8_len(input_string[i]); + std::string curChar = input_string.substr(i, len); + auto iter = accent_map.find(curChar); + if (iter != accent_map.end()) { + resultString += iter->second; + } else { + resultString += curChar; + } + i += len; + } + + return resultString; + } + + static size_t utf8_len(char src) { + const size_t lookup[] = {1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 2, 2, 3, 4}; + uint8_t highbits = static_cast(src) >> 4; + return lookup[highbits]; + } + + const llama_vocab & vocab; +}; + +typedef enum FRAGMENT_BUFFER_VARIANT_TYPE { FRAGMENT_BUFFER_VARIANT_TYPE_TOKEN, FRAGMENT_BUFFER_VARIANT_TYPE_RAW_TEXT } FRAGMENT_BUFFER_VARIANT_TYPE; -struct fragment_buffer_variant{ +struct fragment_buffer_variant { fragment_buffer_variant(llama_vocab::id _token) : type(FRAGMENT_BUFFER_VARIANT_TYPE_TOKEN), token(_token), raw_text(_dummy), offset(0), - length(0){} + length(0) {} + fragment_buffer_variant(const std::string & _raw_text, int64_t _offset, int64_t _length) : 
type(FRAGMENT_BUFFER_VARIANT_TYPE_RAW_TEXT), - token((llama_vocab::id)-1), + token((llama_vocab::id) - 1), raw_text(_raw_text), offset(_offset), length(_length){ - LM_GGML_ASSERT( _offset >= 0 ); - LM_GGML_ASSERT( _length >= 1 ); - LM_GGML_ASSERT( offset + length <= raw_text.length() ); + LM_GGML_ASSERT(_offset >= 0); + LM_GGML_ASSERT(_length >= 1); + LM_GGML_ASSERT(offset + length <= raw_text.length()); } const FRAGMENT_BUFFER_VARIANT_TYPE type; @@ -6986,8 +8812,7 @@ struct fragment_buffer_variant{ // #define PRETOKENIZERDEBUG -static void tokenizer_st_partition(const llama_vocab & vocab, std::forward_list & buffer) -{ +static void tokenizer_st_partition(const llama_vocab & vocab, std::forward_list & buffer) { // for each special token for (const auto & st: vocab.special_tokens_cache) { const auto & special_token = st.first; @@ -7098,17 +8923,15 @@ static std::vector llama_tokenize_internal(const llama_vocab & } std::forward_list fragment_buffer; - fragment_buffer.emplace_front( raw_text, 0, raw_text.length() ); + fragment_buffer.emplace_front(raw_text, 0, raw_text.length()); - if (special) tokenizer_st_partition( vocab, fragment_buffer ); + if (special) tokenizer_st_partition(vocab, fragment_buffer); switch (vocab.type) { case LLAMA_VOCAB_TYPE_SPM: { - for (const auto & fragment: fragment_buffer) - { - if (fragment.type == FRAGMENT_BUFFER_VARIANT_TYPE_RAW_TEXT) - { + for (const auto & fragment : fragment_buffer) { + if (fragment.type == FRAGMENT_BUFFER_VARIANT_TYPE_RAW_TEXT) { // without adding this leading whitespace, we do not get the same results as the original tokenizer // TODO: It's likely possible to get rid of this string copy entirely @@ -7117,7 +8940,9 @@ static std::vector llama_tokenize_internal(const llama_vocab & // auto raw_text = fragment.raw_text.substr(fragment.offset, fragment.length); if (&fragment == &fragment_buffer.front()) { - raw_text = " " + raw_text; // prefix with space if the first token is not special + if (vocab.add_space_prefix) { + raw_text = " " + raw_text; // prefix with space if the first token is not special + } } #ifdef PRETOKENIZERDEBUG @@ -7126,19 +8951,15 @@ static std::vector llama_tokenize_internal(const llama_vocab & llm_tokenizer_spm tokenizer(vocab); llama_escape_whitespace(raw_text); tokenizer.tokenize(raw_text, output); - } - else // if (fragment.type == FRAGMENT_BUFFER_VARIANT_TYPE_TOKEN) - { + } else { // if (fragment.type == FRAGMENT_BUFFER_VARIANT_TYPE_TOKEN) output.push_back(fragment.token); } } } break; case LLAMA_VOCAB_TYPE_BPE: { - for (const auto & fragment: fragment_buffer) - { - if (fragment.type == FRAGMENT_BUFFER_VARIANT_TYPE_RAW_TEXT) - { + for (const auto & fragment : fragment_buffer) { + if (fragment.type == FRAGMENT_BUFFER_VARIANT_TYPE_RAW_TEXT) { auto raw_text = fragment.raw_text.substr(fragment.offset, fragment.length); #ifdef PRETOKENIZERDEBUG @@ -7146,9 +8967,23 @@ static std::vector llama_tokenize_internal(const llama_vocab & #endif llm_tokenizer_bpe tokenizer(vocab); tokenizer.tokenize(raw_text, output); + } else { // if (fragment.type == FRAGMENT_BUFFER_VARIANT_TYPE_TOKEN) + output.push_back(fragment.token); } - else // if (fragment.type == FRAGMENT_BUFFER_VARIANT_TYPE_TOKEN) - { + } + } break; + case LLAMA_VOCAB_TYPE_WPM: + { + for (const auto & fragment : fragment_buffer) { + if (fragment.type == FRAGMENT_BUFFER_VARIANT_TYPE_RAW_TEXT) { + auto raw_text = fragment.raw_text.substr(fragment.offset, fragment.length); + +#ifdef PRETOKENIZERDEBUG + LLAMA_LOG_WARN("TT: (%ld %ld %ld) '%s'\n", raw_text.length(), 
fragment.offset, fragment.length, raw_text.c_str()); +#endif + llm_tokenizer_wpm tokenizer(vocab); + tokenizer.tokenize(raw_text, output); + } else { // if (fragment.type == FRAGMENT_BUFFER_VARIANT_TYPE_TOKEN) output.push_back(fragment.token); } } @@ -7596,8 +9431,17 @@ void llama_sample_softmax(struct llama_context * ctx, llama_token_data_array * c } void llama_sample_top_k(struct llama_context * ctx, llama_token_data_array * candidates, int32_t k, size_t min_keep) { + // TODO: move bucket sort to separate function so that top_p/tail_free/typical/softmax first is equally fast + // if (k >= (int32_t)candidates->size) { + // return; + // } + const int64_t t_start_sample_us = lm_ggml_time_us(); + if (k <= 0) { + k = candidates->size; + } + k = std::max(k, (int) min_keep); k = std::min(k, (int) candidates->size); @@ -7606,10 +9450,57 @@ void llama_sample_top_k(struct llama_context * ctx, llama_token_data_array * can auto comp = [](const llama_token_data & a, const llama_token_data & b) { return a.logit > b.logit; }; - if (k == (int) candidates->size) { - std::sort(candidates->data, candidates->data + candidates->size, comp); - } else { + if (k <= 128) { std::partial_sort(candidates->data, candidates->data + k, candidates->data + candidates->size, comp); + } else { + constexpr int nbuckets = 128; + constexpr float bucket_low = -10.0f; + constexpr float bucket_high = 10.0f; + constexpr float bucket_scale = nbuckets/(bucket_high - bucket_low); + constexpr float bucker_inter = -bucket_low * bucket_scale; + + std::vector bucket_idx(candidates->size); + std::vector histo(nbuckets, 0); + + for (int i = 0; i < (int)candidates->size; ++i) { + const float val = candidates->data[i].logit; + int ib = int(bucket_scale * val + bucker_inter); //nbuckets * (val - bucket_low) / (bucket_high - bucket_low); + ib = std::max(0, std::min(nbuckets-1, ib)); + bucket_idx[i] = ib; + ++histo[ib]; + } + int nhave = 0; + int ib = nbuckets - 1; + for ( ; ib >= 0; --ib) { + nhave += histo[ib]; + if (nhave >= k) break; + } + std::vector tmp_tokens(nhave); + auto ptr = tmp_tokens.data(); + std::vector bucket_ptrs; + bucket_ptrs.reserve(nbuckets - ib); + for (int j = nbuckets - 1; j >= ib; --j) { + bucket_ptrs.push_back(ptr); + ptr += histo[j]; + } + for (int i = 0; i < (int)candidates->size; ++i) { + int j = bucket_idx[i]; + if (j >= ib) { + *bucket_ptrs[nbuckets-1-j]++ = candidates->data[i]; + } + } + + ptr = tmp_tokens.data(); + int ndone = 0; + for (int j = nbuckets-1; j > ib; --j) { + std::sort(ptr, ptr + histo[j], comp); + ptr += histo[j]; + ndone += histo[j]; + } + std::partial_sort(ptr, ptr + k - ndone, ptr + histo[ib], comp); + + std::memcpy(candidates->data, tmp_tokens.data(), k*sizeof(llama_token_data)); + } candidates->sorted = true; } @@ -7657,21 +9548,56 @@ void llama_sample_min_p(struct llama_context * ctx, llama_token_data_array * can return; } - llama_sample_softmax(ctx, candidates); - const int64_t t_start_sample_us = lm_ggml_time_us(); - float scale = candidates->data[0].p; // scale by max prob - size_t i = 1; // first token always matches + bool min_p_applied = false; + + // if the candidates aren't sorted, try the unsorted implementation first + if (!candidates->sorted) { + std::vector filtered_tokens; + + float max_logit = -FLT_MAX; + for (size_t i = 0; i < candidates->size; ++i) { + max_logit = std::max(max_logit, candidates->data[i].logit); + } + const float min_logit = max_logit + logf(p); // min logit for p_i >= p * p_max + + for (size_t i = 0; i < candidates->size; ++i) { + if 
(candidates->data[i].logit >= min_logit) { + filtered_tokens.push_back(candidates->data[i]); + } + } - for (; i < candidates->size; ++i) { - if (candidates->data[i].p < p * scale && i >= min_keep) { - break; // prob too small + // if we have enough values the operation was a success + if (filtered_tokens.size() >= min_keep) { + memcpy(candidates->data, filtered_tokens.data(), filtered_tokens.size()*sizeof(llama_token_data)); + candidates->size = filtered_tokens.size(); + min_p_applied = true; } } - // Resize the output vector to keep only the matching tokens - candidates->size = i; + // if the candidates are sorted or the unsorted implementation failed, use this implementation + if (!min_p_applied) { + // Sort the logits in descending order + if (!candidates->sorted) { + std::sort(candidates->data, candidates->data + candidates->size, [](const llama_token_data & a, const llama_token_data & b) { + return a.logit > b.logit; + }); + candidates->sorted = true; + } + + const float min_logit = candidates->data[0].logit + logf(p); // min logit for p_i >= p * p_max + size_t i = 1; // first token always matches + + for (; i < candidates->size; ++i) { + if (candidates->data[i].logit < min_logit && i >= min_keep) { + break; // prob too small + } + } + + // Resize the output vector to keep only the matching tokens + candidates->size = i; + } if (ctx) { ctx->t_sample_us += lm_ggml_time_us() - t_start_sample_us; @@ -7801,6 +9727,73 @@ void llama_sample_typical(struct llama_context * ctx, llama_token_data_array * c } } +void llama_sample_entropy(struct llama_context * ctx, llama_token_data_array * candidates_p, float min_temp, float max_temp, float exponent_val) { + const int64_t t_start_sample_us = lm_ggml_time_us(); + + // no need to do anything if there is only one (or zero) candidates + if(candidates_p->size <= 1) { + return; + } + + // Calculate maximum possible entropy + float max_entropy = -logf(1.0f / candidates_p->size); + + llama_sample_softmax(nullptr, candidates_p); + + // Calculate entropy of the softmax probabilities + float entropy = 0.0f; + for (size_t i = 0; i < candidates_p->size; ++i) { + float prob = candidates_p->data[i].p; + if (prob > 0.0f) { // Ensure no log(0) + entropy -= prob * logf(prob); + } + } + + // Normalize the entropy (max_entropy cannot be 0 here because we checked candidates_p->size != 1 above) + float normalized_entropy = entropy / max_entropy; + + // Map the normalized entropy to the desired temperature range using the power function + float dyn_temp = min_temp + (max_temp - min_temp) * powf(normalized_entropy, exponent_val); + +#ifdef DEBUG + LLAMA_LOG_INFO("Your text maxtemp value is: %f\n", max_temp); + LLAMA_LOG_INFO("Entropy: %f\n", entropy); + LLAMA_LOG_INFO("Max Possible Entropy: %f\n", max_entropy); + LLAMA_LOG_INFO("Normalized Entropy: %f\n", normalized_entropy); + LLAMA_LOG_INFO("Exponent: %f\n", exponent_val); + LLAMA_LOG_INFO("Dynamic Temperature (dyn_temp): %f\n", dyn_temp); +#endif + + // Apply the dynamically calculated temperature scaling + for (size_t i = 0; i < candidates_p->size; ++i) { + candidates_p->data[i].logit /= dyn_temp; + } + + // Re-compute softmax probabilities after scaling logits with dynamic temperature + double max_l_double = candidates_p->data[0].logit; + double cum_sum_double = 0.0; + for (size_t i = 0; i < candidates_p->size; ++i) { + double p = exp(candidates_p->data[i].logit - max_l_double); + candidates_p->data[i].p = p; // Store the scaled probability + cum_sum_double += p; + } + for (size_t i = 0; i < candidates_p->size; 
++i) { + candidates_p->data[i].p /= cum_sum_double; // Re-normalize the probabilities + } + +#ifdef DEBUG + // Print the updated top 25 probabilities after temperature scaling + LLAMA_LOG_INFO("\nUpdated Top 25 Probabilities After Dynamic Temperature Scaling (in percentages):\n"); + for (size_t i = 0; i < 25 && i < candidates_p->size; ++i) { + LLAMA_LOG_INFO("Token %zu: %f%%\n", i + 1, candidates_p->data[i].p * 100.0f); + } +#endif + + if (ctx) { + ctx->t_sample_us += lm_ggml_time_us() - t_start_sample_us; + } +} + void llama_sample_temp(struct llama_context * ctx, llama_token_data_array * candidates_p, float temp) { const int64_t t_start_sample_us = lm_ggml_time_us(); @@ -8389,9 +10382,13 @@ struct quantize_state_internal { const llama_model_quantize_params * params; int n_attention_wv = 0; - int n_feed_forward_w2 = 0; + int n_ffn_down = 0; + int n_ffn_gate = 0; + int n_ffn_up = 0; int i_attention_wv = 0; - int i_feed_forward_w2 = 0; + int i_ffn_down = 0; + int i_ffn_gate = 0; + int i_ffn_up = 0; int n_k_quantized = 0; int n_fallback = 0; @@ -8475,29 +10472,55 @@ static lm_ggml_type get_k_quant_type(quantize_state_internal & qs, lm_ggml_type auto use_more_bits = [](int i_layer, int num_layers) -> bool { return i_layer < num_layers/8 || i_layer >= 7*num_layers/8 || (i_layer - num_layers/8)%3 == 2; }; + const int n_expert = std::max(1, (int)qs.model.hparams.n_expert); + auto layer_info = [n_expert] (int i_layer, int n_layer, const char * name) { + if (n_expert > 1) { + // Believe it or not, "experts" in the FFN of Mixtral-8x7B are not consecutive, but iccasionally randomly + // sprinkled in the model. Hence, simply dividing i_ffn_down by n_expert does not work + // for getting the current layer as I initially thought, and we need to resort to parsing the + // tensor name. + n_layer /= n_expert; + if (sscanf(name, "blk.%d.", &i_layer) != 1) { + throw std::runtime_error(format("Failed to determine layer for tensor %s", name)); + } + if (i_layer < 0 || i_layer >= n_layer) { + throw std::runtime_error(format("Bad layer %d for tensor %s. 
Must be in [0, %d)", i_layer, name, n_layer)); + } + } + return std::make_pair(i_layer, n_layer); + }; if (name == tn(LLM_TENSOR_OUTPUT, "weight")) { int nx = tensor->ne[0]; if (arch == LLM_ARCH_FALCON || nx % QK_K != 0) { new_type = LM_GGML_TYPE_Q8_0; } - else if (ftype == LLAMA_FTYPE_MOSTLY_IQ2_XXS || ftype == LLAMA_FTYPE_MOSTLY_IQ2_XS) { + else if (ftype == LLAMA_FTYPE_MOSTLY_IQ2_XXS || ftype == LLAMA_FTYPE_MOSTLY_IQ2_XS || ftype == LLAMA_FTYPE_MOSTLY_IQ1_S) { new_type = LM_GGML_TYPE_Q5_K; } else if (new_type != LM_GGML_TYPE_Q8_0) { new_type = LM_GGML_TYPE_Q6_K; } - } else if (ftype == LLAMA_FTYPE_MOSTLY_IQ2_XXS || ftype == LLAMA_FTYPE_MOSTLY_IQ2_XS) { + } else if (name == "token_embd.weight") { + if (ftype == LLAMA_FTYPE_MOSTLY_IQ2_XXS || ftype == LLAMA_FTYPE_MOSTLY_IQ2_XS || ftype == LLAMA_FTYPE_MOSTLY_IQ1_S) { + new_type = LM_GGML_TYPE_Q2_K; + } + else if (ftype == LLAMA_FTYPE_MOSTLY_IQ3_XXS) { + new_type = LM_GGML_TYPE_Q4_K; + } + } else if (ftype == LLAMA_FTYPE_MOSTLY_IQ2_XXS || ftype == LLAMA_FTYPE_MOSTLY_IQ2_XS || ftype == LLAMA_FTYPE_MOSTLY_IQ1_S) { if (name.find("attn_v.weight") != std::string::npos) { if (qs.model.hparams.n_gqa() >= 4 || qs.model.hparams.n_expert >= 4) new_type = LM_GGML_TYPE_Q4_K; else new_type = LM_GGML_TYPE_Q2_K; ++qs.i_attention_wv; } else if (name.find("ffn_down") != std::string::npos) { - if (qs.i_feed_forward_w2 < qs.n_feed_forward_w2/8) new_type = LM_GGML_TYPE_Q2_K; - ++qs.i_feed_forward_w2; + if (qs.i_ffn_down < qs.n_ffn_down/8) new_type = LM_GGML_TYPE_Q2_K; + ++qs.i_ffn_down; + } + else if (name.find("attn_output.weight") != std::string::npos) { + if (ftype == LLAMA_FTYPE_MOSTLY_IQ1_S) new_type = LM_GGML_TYPE_IQ2_XXS; } - else if (name == "token_embd.weight") new_type = LM_GGML_TYPE_Q2_K; } else if (name.find("attn_v.weight") != std::string::npos) { if (ftype == LLAMA_FTYPE_MOSTLY_Q2_K) { new_type = qs.model.hparams.n_gqa() >= 4 ? LM_GGML_TYPE_Q4_K : LM_GGML_TYPE_Q3_K; @@ -8505,10 +10528,16 @@ static lm_ggml_type get_k_quant_type(quantize_state_internal & qs, lm_ggml_type else if (ftype == LLAMA_FTYPE_MOSTLY_Q2_K_S && qs.model.hparams.n_gqa() >= 4) { new_type = LM_GGML_TYPE_Q4_K; } + else if (ftype == LLAMA_FTYPE_MOSTLY_IQ3_XXS) { + new_type = qs.model.hparams.n_gqa() >= 4 ? LM_GGML_TYPE_Q4_K : !qs.has_imatrix ? LM_GGML_TYPE_Q3_K : LM_GGML_TYPE_IQ3_XXS; + } else if (ftype == LLAMA_FTYPE_MOSTLY_Q3_K_M) { new_type = qs.i_attention_wv < 2 ? LM_GGML_TYPE_Q5_K : LM_GGML_TYPE_Q4_K; } else if (ftype == LLAMA_FTYPE_MOSTLY_Q3_K_L) new_type = LM_GGML_TYPE_Q5_K; + else if (ftype == LLAMA_FTYPE_MOSTLY_IQ4_NL && qs.model.hparams.n_gqa() >= 4) { + new_type = LM_GGML_TYPE_Q5_K; + } else if ((ftype == LLAMA_FTYPE_MOSTLY_Q4_K_M || ftype == LLAMA_FTYPE_MOSTLY_Q5_K_M) && use_more_bits(qs.i_attention_wv, qs.n_attention_wv)) new_type = LM_GGML_TYPE_Q6_K; else if (ftype == LLAMA_FTYPE_MOSTLY_Q4_K_S && qs.i_attention_wv < 4) new_type = LM_GGML_TYPE_Q5_K; @@ -8532,29 +10561,19 @@ static lm_ggml_type get_k_quant_type(quantize_state_internal & qs, lm_ggml_type // TODO: explore better strategies new_type = LM_GGML_TYPE_Q8_0; } - } else if (name.find("ffn_down") != std::string::npos) { - const int n_expert = std::max(1, (int)qs.model.hparams.n_expert); - int i_layer, n_layer; - if (n_expert == 1) { - i_layer = qs.i_feed_forward_w2; - n_layer = qs.n_feed_forward_w2; - } else { - // Believe it or not, "experts" in the FFN of Mixtral-8x7B are not consecutive, but iccasionally randomly - // sprinkled in the model. 
Hence, simply dividing i_feed_forward_w2 by n_expert does not work - // for getting the current layer as I initially thought, and we need to resort to parsing the - // tensor name. - n_layer = qs.n_feed_forward_w2 / n_expert; - if (sscanf(name.c_str(), "blk.%d.ffn_down", &i_layer) != 1) { - throw std::runtime_error(format("Failed to determine layer for tensor %s", name.c_str())); - } - if (i_layer < 0 || i_layer >= n_layer) { - throw std::runtime_error(format("Bad layer %d for tensor %s. Must be in [0, %d)", i_layer, name.c_str(), n_layer)); - } + else if (ftype == LLAMA_FTYPE_MOSTLY_Q3_K_XS) { + new_type = LM_GGML_TYPE_Q2_K; } + } else if (name.find("ffn_down") != std::string::npos) { + auto info = layer_info(qs.i_ffn_down, qs.n_ffn_down, name.c_str()); + int i_layer = info.first, n_layer = info.second; if (ftype == LLAMA_FTYPE_MOSTLY_Q2_K) new_type = LM_GGML_TYPE_Q3_K; - else if (ftype == LLAMA_FTYPE_MOSTLY_Q2_K_S) { + else if (ftype == LLAMA_FTYPE_MOSTLY_Q2_K_S || ftype == LLAMA_FTYPE_MOSTLY_Q3_K_XS) { if (i_layer < n_layer/8) new_type = LM_GGML_TYPE_Q4_K; } + else if (ftype == LLAMA_FTYPE_MOSTLY_IQ3_XXS && !qs.has_imatrix) { + new_type = i_layer < n_layer/8 ? LM_GGML_TYPE_Q4_K : LM_GGML_TYPE_Q3_K; + } else if (ftype == LLAMA_FTYPE_MOSTLY_Q3_K_M) { new_type = i_layer < n_layer/16 ? LM_GGML_TYPE_Q5_K : arch != LLM_ARCH_FALCON || use_more_bits(i_layer, n_layer) ? LM_GGML_TYPE_Q4_K @@ -8571,6 +10590,9 @@ static lm_ggml_type get_k_quant_type(quantize_state_internal & qs, lm_ggml_type if (use_more_bits(i_layer, n_layer)) new_type = LM_GGML_TYPE_Q6_K; } } + else if (ftype == LLAMA_FTYPE_MOSTLY_IQ4_NL && !qs.has_imatrix) { + if (i_layer < n_layer/8) new_type = LM_GGML_TYPE_Q5_K; + } else if (ftype == LLAMA_FTYPE_MOSTLY_Q5_K_M && use_more_bits(i_layer, n_layer)) new_type = LM_GGML_TYPE_Q6_K; else if (ftype == LLAMA_FTYPE_MOSTLY_Q4_K_S && arch != LLM_ARCH_FALCON && i_layer < n_layer/8) { new_type = LM_GGML_TYPE_Q5_K; @@ -8582,16 +10604,18 @@ static lm_ggml_type get_k_quant_type(quantize_state_internal & qs, lm_ggml_type // same quantization as before imatrix stuff, and b) Q4_1/Q5_1 do go crazy on ffn_down without an imatrix. new_type = ftype == LLAMA_FTYPE_MOSTLY_Q4_0 ? 
LM_GGML_TYPE_Q4_1 : LM_GGML_TYPE_Q5_1; } - ++qs.i_feed_forward_w2; + ++qs.i_ffn_down; } else if (name.find("attn_output.weight") != std::string::npos) { if (arch != LLM_ARCH_FALCON) { if (qs.model.hparams.n_expert == 8) { - if (ftype == LLAMA_FTYPE_MOSTLY_Q2_K || ftype == LLAMA_FTYPE_MOSTLY_Q3_K_S || ftype == LLAMA_FTYPE_MOSTLY_Q3_K_M || + if (ftype == LLAMA_FTYPE_MOSTLY_Q2_K || ftype == LLAMA_FTYPE_MOSTLY_Q3_K_XS || ftype == LLAMA_FTYPE_MOSTLY_IQ3_XXS || + ftype == LLAMA_FTYPE_MOSTLY_Q3_K_S || ftype == LLAMA_FTYPE_MOSTLY_Q3_K_M || ftype == LLAMA_FTYPE_MOSTLY_IQ4_NL || ftype == LLAMA_FTYPE_MOSTLY_Q4_K_S || ftype == LLAMA_FTYPE_MOSTLY_Q4_K_M) { new_type = LM_GGML_TYPE_Q5_K; } } else { if (ftype == LLAMA_FTYPE_MOSTLY_Q2_K ) new_type = LM_GGML_TYPE_Q3_K; + else if (ftype == LLAMA_FTYPE_MOSTLY_IQ3_XXS) new_type = LM_GGML_TYPE_Q3_K; else if (ftype == LLAMA_FTYPE_MOSTLY_Q3_K_M) new_type = LM_GGML_TYPE_Q4_K; else if (ftype == LLAMA_FTYPE_MOSTLY_Q3_K_L) new_type = LM_GGML_TYPE_Q5_K; } @@ -8604,6 +10628,25 @@ static lm_ggml_type get_k_quant_type(quantize_state_internal & qs, lm_ggml_type else if (ftype == LLAMA_FTYPE_MOSTLY_Q4_K_M) new_type = LM_GGML_TYPE_Q5_K; else if (ftype == LLAMA_FTYPE_MOSTLY_Q5_K_M) new_type = LM_GGML_TYPE_Q6_K; } + else if (name.find("ffn_gate") != std::string::npos) { + auto info = layer_info(qs.i_ffn_gate, qs.n_ffn_gate, name.c_str()); + int i_layer = info.first, n_layer = info.second; + if (ftype == LLAMA_FTYPE_MOSTLY_Q3_K_XS && !use_more_bits(i_layer, n_layer)) { + new_type = LM_GGML_TYPE_Q2_K; + } + ++qs.i_ffn_gate; + } + else if (name.find("ffn_up") != std::string::npos) { + auto info = layer_info(qs.i_ffn_up, qs.n_ffn_up, name.c_str()); + int i_layer = info.first, n_layer = info.second; + if (ftype == LLAMA_FTYPE_MOSTLY_Q3_K_XS && !use_more_bits(i_layer, n_layer)) { + new_type = LM_GGML_TYPE_Q2_K; + } + ++qs.i_ffn_up; + } + + // if (ftype == LLAMA_FTYPE_MOSTLY_Q2_K) new_type = LM_GGML_TYPE_Q3_K; + //} // IK: let's remove this, else Q2_K is almost the same as Q3_K_S //else if (name.find("ffn_gate") != std::string::npos || name.find("ffn_up") != std::string::npos) { // if (ftype == LLAMA_FTYPE_MOSTLY_Q2_K) new_type = LM_GGML_TYPE_Q3_K; @@ -8616,7 +10659,8 @@ static lm_ggml_type get_k_quant_type(quantize_state_internal & qs, lm_ggml_type bool convert_incompatible_tensor = false; if (new_type == LM_GGML_TYPE_Q2_K || new_type == LM_GGML_TYPE_Q3_K || new_type == LM_GGML_TYPE_Q4_K || new_type == LM_GGML_TYPE_Q5_K || new_type == LM_GGML_TYPE_Q6_K || - new_type == LM_GGML_TYPE_IQ2_XS || new_type == LM_GGML_TYPE_IQ2_XXS) { + new_type == LM_GGML_TYPE_IQ2_XS || new_type == LM_GGML_TYPE_IQ2_XXS || + new_type == LM_GGML_TYPE_IQ3_XXS || ftype == LLAMA_FTYPE_MOSTLY_IQ1_S) { int nx = tensor->ne[0]; int ny = tensor->ne[1]; if (nx % QK_K != 0) { @@ -8630,8 +10674,10 @@ static lm_ggml_type get_k_quant_type(quantize_state_internal & qs, lm_ggml_type switch (new_type) { case LM_GGML_TYPE_IQ2_XXS: case LM_GGML_TYPE_IQ2_XS: - case LM_GGML_TYPE_Q2_K: new_type = LM_GGML_TYPE_Q4_0; break; - case LM_GGML_TYPE_Q3_K: new_type = LM_GGML_TYPE_Q4_1; break; + case LM_GGML_TYPE_IQ3_XXS: + case LM_GGML_TYPE_IQ1_S: + case LM_GGML_TYPE_Q2_K: + case LM_GGML_TYPE_Q3_K: new_type = LM_GGML_TYPE_IQ4_NL; break; case LM_GGML_TYPE_Q4_K: new_type = LM_GGML_TYPE_Q5_0; break; case LM_GGML_TYPE_Q5_K: new_type = LM_GGML_TYPE_Q5_1; break; case LM_GGML_TYPE_Q6_K: new_type = LM_GGML_TYPE_Q8_0; break; @@ -8658,18 +10704,22 @@ static void llama_model_quantize_internal(const std::string & fname_inp, const s case 
LLAMA_FTYPE_ALL_F32: quantized_type = LM_GGML_TYPE_F32; break; // K-quants - case LLAMA_FTYPE_MOSTLY_Q2_K: quantized_type = LM_GGML_TYPE_Q2_K; break; - case LLAMA_FTYPE_MOSTLY_Q2_K_S: quantized_type = LM_GGML_TYPE_Q2_K; break; + case LLAMA_FTYPE_MOSTLY_Q2_K_S: + case LLAMA_FTYPE_MOSTLY_Q2_K: quantized_type = LM_GGML_TYPE_Q2_K; break; + case LLAMA_FTYPE_MOSTLY_Q3_K_XS: case LLAMA_FTYPE_MOSTLY_Q3_K_S: case LLAMA_FTYPE_MOSTLY_Q3_K_M: - case LLAMA_FTYPE_MOSTLY_Q3_K_L: quantized_type = LM_GGML_TYPE_Q3_K; break; + case LLAMA_FTYPE_MOSTLY_Q3_K_L: quantized_type = LM_GGML_TYPE_Q3_K; break; case LLAMA_FTYPE_MOSTLY_Q4_K_S: - case LLAMA_FTYPE_MOSTLY_Q4_K_M: quantized_type = LM_GGML_TYPE_Q4_K; break; + case LLAMA_FTYPE_MOSTLY_Q4_K_M: quantized_type = LM_GGML_TYPE_Q4_K; break; case LLAMA_FTYPE_MOSTLY_Q5_K_S: - case LLAMA_FTYPE_MOSTLY_Q5_K_M: quantized_type = LM_GGML_TYPE_Q5_K; break; - case LLAMA_FTYPE_MOSTLY_Q6_K: quantized_type = LM_GGML_TYPE_Q6_K; break; - case LLAMA_FTYPE_MOSTLY_IQ2_XXS:quantized_type = LM_GGML_TYPE_IQ2_XXS; break; - case LLAMA_FTYPE_MOSTLY_IQ2_XS :quantized_type = LM_GGML_TYPE_IQ2_XS; break; + case LLAMA_FTYPE_MOSTLY_Q5_K_M: quantized_type = LM_GGML_TYPE_Q5_K; break; + case LLAMA_FTYPE_MOSTLY_Q6_K: quantized_type = LM_GGML_TYPE_Q6_K; break; + case LLAMA_FTYPE_MOSTLY_IQ2_XXS: quantized_type = LM_GGML_TYPE_IQ2_XXS; break; + case LLAMA_FTYPE_MOSTLY_IQ2_XS: quantized_type = LM_GGML_TYPE_IQ2_XS; break; + case LLAMA_FTYPE_MOSTLY_IQ3_XXS: quantized_type = LM_GGML_TYPE_IQ3_XXS; break; + case LLAMA_FTYPE_MOSTLY_IQ1_S: quantized_type = LM_GGML_TYPE_IQ1_S; break; + case LLAMA_FTYPE_MOSTLY_IQ4_NL: quantized_type = LM_GGML_TYPE_IQ4_NL; break; default: throw std::runtime_error(format("invalid output file type %d\n", ftype)); } @@ -8727,12 +10777,18 @@ static void llama_model_quantize_internal(const std::string & fname_inp, const s ++qs.n_attention_wv; } else if (name.find("ffn_down") != std::string::npos) { - ++qs.n_feed_forward_w2; + ++qs.n_ffn_down; + } + else if (name.find("ffn_gate") != std::string::npos) { + ++qs.n_ffn_gate; + } + else if (name.find("ffn_up") != std::string::npos) { + ++qs.n_ffn_up; } } - if (qs.n_attention_wv != qs.n_feed_forward_w2 || (uint32_t)qs.n_attention_wv != model.hparams.n_layer) { - LLAMA_LOG_WARN("%s ============ Strange model: n_attention_wv = %d, n_feed_forward_w2 = %d, hparams.n_layer = %d\n", - __func__, qs.n_attention_wv, qs.n_feed_forward_w2, model.hparams.n_layer); + if (qs.n_attention_wv != qs.n_ffn_down || (uint32_t)qs.n_attention_wv != model.hparams.n_layer) { + LLAMA_LOG_WARN("%s ============ Strange model: n_attention_wv = %d, n_ffn_down = %d, hparams.n_layer = %d\n", + __func__, qs.n_attention_wv, qs.n_ffn_down, model.hparams.n_layer); } size_t total_size_org = 0; @@ -8793,7 +10849,11 @@ static void llama_model_quantize_internal(const std::string & fname_inp, const s quantize &= !params->only_copy; // do not quantize expert gating tensors - quantize &= name.find("ffn_gate_inp.weight") == std::string::npos; + quantize &= name != LLM_TN(model.arch)(LLM_TENSOR_FFN_GATE_INP, "weight"); + + // do not quantize positional embeddings and token types (BERT) + quantize &= name != LLM_TN(model.arch)(LLM_TENSOR_POS_EMBD, "weight"); + quantize &= name != LLM_TN(model.arch)(LLM_TENSOR_TOKEN_TYPES, "weight"); enum lm_ggml_type new_type; void * new_data; @@ -8833,6 +10893,7 @@ static void llama_model_quantize_internal(const std::string & fname_inp, const s } if ((new_type == LM_GGML_TYPE_IQ2_XXS || new_type == LM_GGML_TYPE_IQ2_XS || + new_type == 
LM_GGML_TYPE_IQ1_S || (new_type == LM_GGML_TYPE_Q2_K && params->ftype == LLAMA_FTYPE_MOSTLY_Q2_K_S && strcmp(tensor->name, "token_embd.weight") != 0)) && !imatrix) { LLAMA_LOG_ERROR("\n\n============================================================\n"); LLAMA_LOG_ERROR("Missing importance matrix for tensor %s in a very low-bit quantization\n", tensor->name); @@ -9067,7 +11128,7 @@ static int llama_apply_lora_from_file_internal( { LLAMA_LOG_ERROR("%s: invalid tensor data type '%d'\n", __func__, ftype); - return false; + return 1; } } @@ -9295,6 +11356,7 @@ struct llama_context_params llama_context_default_params() { /*.logits_all =*/ false, /*.embedding =*/ false, /*.offload_kqv =*/ true, + /*.do_pooling =*/ true, }; return result; @@ -9314,19 +11376,48 @@ struct llama_model_quantize_params llama_model_quantize_default_params() { return result; } -int32_t llama_max_devices(void) { - return LLAMA_MAX_DEVICES; +size_t llama_max_devices(void) { +#if defined(LM_GGML_USE_METAL) + return 1; +#elif defined(LM_GGML_USE_CUBLAS) + return LM_GGML_CUDA_MAX_DEVICES; +#elif defined(LM_GGML_USE_SYCL) + return LM_GGML_SYCL_MAX_DEVICES; +#elif defined(LM_GGML_USE_VULKAN) + return LM_GGML_VK_MAX_DEVICES; +#else + return 1; +#endif } -bool llama_mmap_supported(void) { +bool llama_supports_mmap(void) { return llama_mmap::SUPPORTED; } -bool llama_mlock_supported(void) { +bool llama_supports_mlock(void) { return llama_mlock::SUPPORTED; } -void llama_backend_init(bool numa) { +bool llama_supports_gpu_offload(void) { +#if defined(LM_GGML_USE_CUBLAS) || defined(LM_GGML_USE_CLBLAST) || defined(LM_GGML_USE_METAL) || defined(LM_GGML_USE_VULKAN) || \ + defined(LM_GGML_USE_SYCL) || defined(LM_GGML_USE_KOMPUTE) + // Defined when llama.cpp is compiled with support for offloading model layers to GPU. + return true; +#else + return false; +#endif +} + +// deprecated: +bool llama_mmap_supported(void) { + return llama_supports_mmap(); +} + +bool llama_mlock_supported(void) { + return llama_supports_mlock(); +} + +void llama_backend_init(void) { lm_ggml_time_init(); // needed to initialize f16 tables @@ -9336,15 +11427,17 @@ void llama_backend_init(bool numa) { lm_ggml_free(ctx); } - if (numa) { - lm_ggml_numa_init(); - } - #ifdef LM_GGML_USE_MPI lm_ggml_mpi_backend_init(); #endif } +void llama_numa_init(enum lm_ggml_numa_strategy numa) { + if (numa != LM_GGML_NUMA_STRATEGY_DISABLED) { + lm_ggml_numa_init(numa); + } +} + void llama_backend_free(void) { #ifdef LM_GGML_USE_MPI lm_ggml_mpi_backend_free(); @@ -9357,8 +11450,8 @@ int64_t llama_time_us(void) { } struct llama_model * llama_load_model_from_file( - const char * path_model, - struct llama_model_params params) { + const char * path_model, + struct llama_model_params params) { lm_ggml_time_init(); llama_model * model = new llama_model; @@ -9421,6 +11514,7 @@ struct llama_context * llama_new_context_with_model( cparams.yarn_beta_slow = params.yarn_beta_slow; cparams.mul_mat_q = params.mul_mat_q; cparams.offload_kqv = params.offload_kqv; + cparams.do_pooling = params.do_pooling; cparams.n_ctx = params.n_ctx == 0 ? hparams.n_ctx_train : params.n_ctx; cparams.rope_freq_base = params.rope_freq_base == 0.0f ? 
hparams.rope_freq_base_train : params.rope_freq_base; @@ -9499,6 +11593,38 @@ struct llama_context * llama_new_context_with_model( } } } +#elif defined(LM_GGML_USE_VULKAN) + if (model->n_gpu_layers > 0) { + for (int device = 0; device < lm_ggml_backend_vk_get_device_count(); ++device) { + lm_ggml_backend_t backend = lm_ggml_backend_vk_init(device); + if (backend == nullptr) { + LLAMA_LOG_ERROR("%s: failed to initialize Vulkan%d backend\n", __func__, device); + llama_free(ctx); + return nullptr; + } + ctx->backends.push_back(backend); + } + } +#elif defined(LM_GGML_USE_SYCL) + if (model->n_gpu_layers > 0) { + lm_ggml_backend_t backend = lm_ggml_backend_sycl_init(model->main_gpu); + if (backend == nullptr) { + LLAMA_LOG_ERROR("%s: failed to initialize SYCL%d backend\n", __func__, model->main_gpu); + llama_free(ctx); + return nullptr; + } + ctx->backends.push_back(backend); + } +#elif defined(LM_GGML_USE_KOMPUTE) + if (model->n_gpu_layers > 0) { + auto * backend = lm_ggml_backend_kompute_init(model->main_gpu); + if (backend == nullptr) { + LLAMA_LOG_ERROR("%s: failed to initialize Kompute backend\n", __func__); + llama_free(ctx); + return nullptr; + } + ctx->backends.push_back(backend); + } #endif ctx->backend_cpu = lm_ggml_backend_cpu_init(); if (ctx->backend_cpu == nullptr) { @@ -9536,10 +11662,45 @@ struct llama_context * llama_new_context_with_model( // resized during inference, reserve maximum ctx->logits.reserve(hparams.n_vocab*cparams.n_batch); - if (params.embedding){ + if (params.embedding) { ctx->embedding.resize(hparams.n_embd); } + // graph inputs + { + lm_ggml_init_params init_params = { + /* .mem_size */ lm_ggml_tensor_overhead()*8, + /* .mem_buffer */ nullptr, + /* .no_alloc */ true, + }; + ctx->ctx_input = lm_ggml_init(init_params); + + ctx->inp_tokens = lm_ggml_new_tensor_1d(ctx->ctx_input, LM_GGML_TYPE_I32, cparams.n_batch); + ctx->inp_embd = lm_ggml_new_tensor_2d(ctx->ctx_input, LM_GGML_TYPE_F32, hparams.n_embd, cparams.n_batch); + ctx->inp_pos = lm_ggml_new_tensor_1d(ctx->ctx_input, LM_GGML_TYPE_I32, cparams.n_batch); + ctx->inp_KQ_mask = lm_ggml_new_tensor_2d(ctx->ctx_input, LM_GGML_TYPE_F32, cparams.n_ctx, cparams.n_batch); + ctx->inp_KQ_pos = lm_ggml_new_tensor_1d(ctx->ctx_input, LM_GGML_TYPE_F32, cparams.n_ctx); + ctx->inp_K_shift = lm_ggml_new_tensor_1d(ctx->ctx_input, LM_GGML_TYPE_I32, cparams.n_ctx); + ctx->inp_mean = lm_ggml_new_tensor_2d(ctx->ctx_input, LM_GGML_TYPE_F32, cparams.n_batch, cparams.n_batch); + ctx->inp_cls = lm_ggml_new_tensor_1d(ctx->ctx_input, LM_GGML_TYPE_I32, cparams.n_batch); + + lm_ggml_set_name(ctx->inp_tokens, "inp_tokens"); + lm_ggml_set_name(ctx->inp_embd, "inp_embd"); + lm_ggml_set_name(ctx->inp_pos, "inp_pos"); + lm_ggml_set_name(ctx->inp_KQ_mask, "inp_KQ_mask"); + lm_ggml_set_name(ctx->inp_KQ_pos, "inp_KQ_pos"); + lm_ggml_set_name(ctx->inp_K_shift, "inp_K_shift"); + lm_ggml_set_name(ctx->inp_mean, "inp_mean"); + lm_ggml_set_name(ctx->inp_cls, "inp_cls"); + + ctx->buf_input = lm_ggml_backend_alloc_ctx_tensors_from_buft(ctx->ctx_input, llama_default_buffer_type_cpu(true)); + + LLAMA_LOG_INFO("%s: %10s input buffer size = %8.2f MiB\n", __func__, + lm_ggml_backend_buffer_name(ctx->buf_input), + lm_ggml_backend_buffer_get_size(ctx->buf_input) / 1024.0 / 1024.0); + } + + // scheduler and compute buffers { // buffer types used for the compute buffer of each backend std::vector backend_buft; @@ -9556,27 +11717,32 @@ struct llama_context * llama_new_context_with_model( ctx->buf_compute_meta.resize(lm_ggml_tensor_overhead()*LLAMA_MAX_NODES + 
lm_ggml_graph_overhead()); ctx->sched = lm_ggml_backend_sched_new(ctx->backends.data(), backend_buft.data(), ctx->backends.size(), LLAMA_MAX_NODES); - ctx->alloc = lm_ggml_backend_sched_get_tallocr(ctx->sched, ctx->backend_cpu); // build worst-case graph int n_tokens = (int)std::min(cparams.n_ctx, cparams.n_batch); int n_past = cparams.n_ctx - n_tokens; llama_token token = llama_token_bos(&ctx->model); // not actually used by llama_build_graph, but required to choose between token and embedding inputs graph - lm_ggml_cgraph * gf = llama_build_graph(*ctx, llama_batch_get_one(&token, n_tokens, n_past, 0)); + lm_ggml_cgraph * gf = llama_build_graph(*ctx, llama_batch_get_one(&token, n_tokens, n_past, 0), true); // initialize scheduler with the worst-case graph - lm_ggml_backend_sched_init_measure(ctx->sched, gf); - // note: the number of splits during measure is higher than during inference due to the kv shift - int n_splits = lm_ggml_backend_sched_get_n_splits(ctx->sched); - LLAMA_LOG_INFO("%s: graph splits (measure): %d\n", __func__, n_splits); - ctx->alloc = lm_ggml_backend_sched_get_tallocr(ctx->sched, ctx->backend_cpu); + if (!lm_ggml_backend_sched_reserve(ctx->sched, gf)) { + LLAMA_LOG_ERROR("%s: failed to allocate compute buffers\n", __func__); + llama_free(ctx); + return nullptr; + } - for (lm_ggml_backend_t backend : ctx->backends) { - lm_ggml_backend_buffer_t buf = lm_ggml_backend_sched_get_buffer(ctx->sched, backend); + for (size_t i = 0; i < ctx->backends.size(); i++) { + lm_ggml_backend_t backend = ctx->backends[i]; + lm_ggml_backend_buffer_type_t buft = backend_buft[i]; + size_t size = lm_ggml_backend_sched_get_buffer_size(ctx->sched, backend); LLAMA_LOG_INFO("%s: %10s compute buffer size = %8.2f MiB\n", __func__, - lm_ggml_backend_buffer_name(buf), - lm_ggml_backend_buffer_get_size(buf) / 1024.0 / 1024.0); + lm_ggml_backend_buft_name(buft), + size / 1024.0 / 1024.0); } + + // note: the number of splits during measure is higher than during inference due to the kv shift + int n_splits = lm_ggml_backend_sched_get_n_splits(ctx->sched); + LLAMA_LOG_INFO("%s: graph splits (measure): %d\n", __func__, n_splits); } } @@ -9674,7 +11840,7 @@ int32_t llama_model_meta_val_str_by_index(const struct llama_model * model, int3 int32_t llama_model_desc(const struct llama_model * model, char * buf, size_t buf_size) { return snprintf(buf, buf_size, "%s %s %s", - llama_model_arch_name(model->arch).c_str(), + llama_model_arch_name(model->arch), llama_model_type_name(model->type), llama_model_ftype_name(model->ftype).c_str()); } @@ -10017,18 +12183,19 @@ static void llama_copy_state_data_internal(struct llama_context * ctx, llama_dat data_ctx->write(&kv_used, sizeof(kv_used)); if (kv_buf_size) { - const size_t elt_size = lm_ggml_element_size(kv_self.k_l[0]); - std::vector tmp_buf; for (int il = 0; il < (int) n_layer; ++il) { - tmp_buf.resize(elt_size*n_embd_k_gqa*kv_head); + size_t k_size = lm_ggml_row_size(kv_self.k_l[il]->type, n_embd_k_gqa*kv_head); + tmp_buf.resize(k_size); lm_ggml_backend_tensor_get(kv_self.k_l[il], tmp_buf.data(), 0, tmp_buf.size()); data_ctx->write(tmp_buf.data(), tmp_buf.size()); // v is not contiguous, copy row by row - tmp_buf.resize(elt_size*kv_head); + size_t v_row_size = lm_ggml_row_size(kv_self.v_l[il]->type, kv_head); + size_t v_row_stride = lm_ggml_row_size(kv_self.v_l[il]->type, n_ctx); + tmp_buf.resize(v_row_size); for (int ir = 0; ir < (int) n_embd_v_gqa; ++ir) { - lm_ggml_backend_tensor_get(kv_self.v_l[il], tmp_buf.data(), ir*elt_size*n_ctx, tmp_buf.size()); + 
lm_ggml_backend_tensor_get(kv_self.v_l[il], tmp_buf.data(), ir*v_row_stride, tmp_buf.size()); data_ctx->write(tmp_buf.data(), tmp_buf.size()); } } @@ -10130,17 +12297,16 @@ size_t llama_set_state_data(struct llama_context * ctx, uint8_t * src) { if (kv_buf_size) { LM_GGML_ASSERT(kv_self.total_size() == kv_buf_size); - const size_t elt_size = lm_ggml_element_size(kv_self.k_l[0]); - for (int il = 0; il < (int) n_layer; ++il) { - size_t k_size = elt_size*n_embd_k_gqa*kv_head; + size_t k_size = lm_ggml_row_size(kv_self.k_l[il]->type, n_embd_k_gqa*kv_head); lm_ggml_backend_tensor_set(kv_self.k_l[il], inp, 0, k_size); inp += k_size; // v is not contiguous, copy row by row - size_t v_row_size = elt_size*kv_head; + size_t v_row_size = lm_ggml_row_size(kv_self.v_l[il]->type, kv_head); + size_t v_row_stride = lm_ggml_row_size(kv_self.v_l[il]->type, n_ctx); for (int ir = 0; ir < (int) n_embd_v_gqa; ++ir) { - lm_ggml_backend_tensor_set(kv_self.v_l[il], inp, ir*elt_size*n_ctx, v_row_size); + lm_ggml_backend_tensor_set(kv_self.v_l[il], inp, ir*v_row_stride, v_row_size); inp += v_row_size; } } @@ -10316,22 +12482,24 @@ struct llama_batch llama_batch_get_one( }; } -struct llama_batch llama_batch_init(int32_t n_tokens, int32_t embd, int32_t n_seq_max) { +struct llama_batch llama_batch_init(int32_t n_tokens_alloc, int32_t embd, int32_t n_seq_max) { llama_batch batch = { 0, nullptr, nullptr, nullptr, nullptr, nullptr, nullptr, 0, 0, 0, }; if (embd) { - batch.embd = (float *) malloc(sizeof(float) * n_tokens * embd); + batch.embd = (float *) malloc(sizeof(float) * n_tokens_alloc * embd); } else { - batch.token = (llama_token *) malloc(sizeof(llama_token) * n_tokens); + batch.token = (llama_token *) malloc(sizeof(llama_token) * n_tokens_alloc); } - batch.pos = (llama_pos *) malloc(sizeof(llama_pos) * n_tokens); - batch.n_seq_id = (int32_t *) malloc(sizeof(int32_t) * n_tokens); - batch.seq_id = (llama_seq_id **) malloc(sizeof(llama_seq_id *) * n_tokens); - for (int i = 0; i < n_tokens; ++i) { + batch.pos = (llama_pos *) malloc(sizeof(llama_pos) * n_tokens_alloc); + batch.n_seq_id = (int32_t *) malloc(sizeof(int32_t) * n_tokens_alloc); + batch.seq_id = (llama_seq_id **) malloc(sizeof(llama_seq_id *) * (n_tokens_alloc + 1)); + for (int i = 0; i < n_tokens_alloc; ++i) { batch.seq_id[i] = (llama_seq_id *) malloc(sizeof(llama_seq_id) * n_seq_max); } - batch.logits = (int8_t *) malloc(sizeof(int8_t) * n_tokens); + batch.seq_id[n_tokens_alloc] = nullptr; + + batch.logits = (int8_t *) malloc(sizeof(int8_t) * n_tokens_alloc); return batch; } @@ -10342,7 +12510,7 @@ void llama_batch_free(struct llama_batch batch) { if (batch.pos) free(batch.pos); if (batch.n_seq_id) free(batch.n_seq_id); if (batch.seq_id) { - for (int i = 0; i < batch.n_tokens; ++i) { + for (int i = 0; batch.seq_id[i] != nullptr; ++i) { free(batch.seq_id[i]); } free(batch.seq_id); @@ -10374,6 +12542,10 @@ float * llama_get_embeddings(struct llama_context * ctx) { return ctx->embedding.data(); } +float * llama_get_embeddings_ith(struct llama_context * ctx, int32_t i) { + return ctx->embedding.data() + i*ctx->model.hparams.n_embd; +} + const char * llama_token_get_text(const struct llama_model * model, llama_token token) { return model->vocab.id_to_token[token].text.c_str(); } @@ -10458,6 +12630,7 @@ static std::string llama_decode_text(const std::string & text) { int32_t llama_token_to_piece(const struct llama_model * model, llama_token token, char * buf, int32_t length) { if (0 <= token && token < llama_n_vocab(model)) { switch 
(llama_vocab_get_type(model->vocab)) { + case LLAMA_VOCAB_TYPE_WPM: case LLAMA_VOCAB_TYPE_SPM: { // NOTE: we accept all unsupported token types, // suppressing them like CONTROL tokens. @@ -10523,6 +12696,123 @@ int32_t llama_token_to_piece(const struct llama_model * model, llama_token token return 0; } +// trim whitespace from the beginning and end of a string +static std::string trim(const std::string & str) { + size_t start = 0; + size_t end = str.size(); + while (start < end && isspace(str[start])) { + start += 1; + } + while (end > start && isspace(str[end - 1])) { + end -= 1; + } + return str.substr(start, end - start); +} + +// Simple version of "llama_apply_chat_template" that only works with strings +// This function uses heuristic checks to determine commonly used template. It is not a jinja parser. +static int32_t llama_chat_apply_template_internal( + const std::string & tmpl, + const std::vector<const llama_chat_message *> & chat, + std::string & dest, bool add_ass) { + // Taken from the research: https://github.com/ggerganov/llama.cpp/issues/5527 + std::stringstream ss; + if (tmpl.find("<|im_start|>") != std::string::npos) { + // chatml template + for (auto message : chat) { + ss << "<|im_start|>" << message->role << "\n" << message->content << "<|im_end|>\n"; + } + if (add_ass) { + ss << "<|im_start|>assistant\n"; + } + } else if (tmpl.find("[INST]") != std::string::npos) { + // llama2 template and its variants + // [variant] support system message + bool support_system_message = tmpl.find("<<SYS>>") != std::string::npos; + // [variant] space before + after response + bool space_around_response = tmpl.find("' ' + eos_token") != std::string::npos; + // [variant] add BOS inside history + bool add_bos_inside_history = tmpl.find("bos_token + '[INST]") != std::string::npos; + // [variant] trim spaces from the input message + bool strip_message = tmpl.find("content.strip()") != std::string::npos; + // construct the prompt + bool is_inside_turn = true; // skip BOS at the beginning + ss << "[INST] "; + for (auto message : chat) { + std::string content = strip_message ? trim(message->content) : message->content; + std::string role(message->role); + if (!is_inside_turn) { + is_inside_turn = true; + ss << (add_bos_inside_history ? "<s>[INST] " : "[INST] "); + } + if (role == "system") { + if (support_system_message) { + ss << "<<SYS>>\n" << content << "\n<</SYS>>\n\n"; + } else { + // if the model does not support system message, we still include it in the first message, but without <<SYS>> + ss << content << "\n"; + } + } else if (role == "user") { + ss << content << " [/INST]"; + } else { + ss << (space_around_response ? " " : "") << content << (space_around_response ? " " : "") << "</s>"; + is_inside_turn = false; + } + } + // llama2 templates seem to not care about "add_generation_prompt" + } else if (tmpl.find("<|user|>") != std::string::npos) { + // zephyr template + for (auto message : chat) { + ss << "<|" << message->role << "|>" << "\n" << message->content << "<|endoftext|>\n"; + } + if (add_ass) { + ss << "<|assistant|>\n"; + } + } else { + // template not supported + return -1; + } + dest = ss.str(); + return dest.size(); +} + +LLAMA_API int32_t llama_chat_apply_template( + const struct llama_model * model, + const char * tmpl, + const struct llama_chat_message * chat, + size_t n_msg, + bool add_ass, + char * buf, + int32_t length) { + std::string curr_tmpl(tmpl == nullptr ?
"" : tmpl); + if (tmpl == nullptr) { + LM_GGML_ASSERT(model != nullptr); + // load template from model + std::vector model_template(2048, 0); // longest known template is about 1200 bytes + std::string template_key = "tokenizer.chat_template"; + int32_t res = llama_model_meta_val_str(model, template_key.c_str(), model_template.data(), model_template.size()); + if (res < 0) { + // worst case: there is no information about template, we will use chatml by default + curr_tmpl = "<|im_start|>"; // see llama_chat_apply_template_internal + } else { + curr_tmpl = std::string(model_template.data(), model_template.size()); + } + } + // format the chat to string + std::vector chat_vec; + chat_vec.resize(n_msg); + for (size_t i = 0; i < n_msg; i++) { + chat_vec[i] = &chat[i]; + } + std::string formatted_chat; + int32_t res = llama_chat_apply_template_internal(curr_tmpl, chat_vec, formatted_chat, add_ass); + if (res < 0) { + return res; + } + strncpy(buf, formatted_chat.c_str(), length); + return res; +} + struct llama_timings llama_get_timings(struct llama_context * ctx) { struct llama_timings result = { /*.t_start_ms =*/ 1e-3 * ctx->t_start_us, @@ -10581,6 +12871,7 @@ const char * llama_print_system_info(void) { s += "SSE3 = " + std::to_string(lm_ggml_cpu_has_sse3()) + " | "; s += "SSSE3 = " + std::to_string(lm_ggml_cpu_has_ssse3()) + " | "; s += "VSX = " + std::to_string(lm_ggml_cpu_has_vsx()) + " | "; + s += "MATMUL_INT8 = " + std::to_string(lm_ggml_cpu_has_matmul_int8()) + " | "; return s.c_str(); } diff --git a/cpp/llama.h b/cpp/llama.h index 47d09d89..67bde72e 100644 --- a/cpp/llama.h +++ b/cpp/llama.h @@ -3,12 +3,7 @@ #include "ggml.h" #include "ggml-backend.h" -#ifdef LM_GGML_USE_CUBLAS -#include "ggml-cuda.h" -#define LLAMA_MAX_DEVICES LM_GGML_CUDA_MAX_DEVICES -#else -#define LLAMA_MAX_DEVICES 1 -#endif // LM_GGML_USE_CUBLAS + #include #include #include @@ -46,11 +41,6 @@ #define LLAMA_SESSION_MAGIC LLAMA_FILE_MAGIC_GGSN #define LLAMA_SESSION_VERSION 4 -#if defined(LM_GGML_USE_CUBLAS) || defined(LM_GGML_USE_CLBLAST) || defined(LM_GGML_USE_METAL) -// Defined when llama.cpp is compiled with support for offloading model layers to GPU. 
-#define LLAMA_SUPPORTS_GPU_OFFLOAD -#endif - #ifdef __cplusplus extern "C" { #endif @@ -71,6 +61,7 @@ extern "C" { enum llama_vocab_type { LLAMA_VOCAB_TYPE_SPM = 0, // SentencePiece LLAMA_VOCAB_TYPE_BPE = 1, // Byte Pair Encoding + LLAMA_VOCAB_TYPE_WPM = 2, // WordPiece }; enum llama_token_type { @@ -107,6 +98,10 @@ extern "C" { LLAMA_FTYPE_MOSTLY_IQ2_XXS = 19, // except 1d tensors LLAMA_FTYPE_MOSTLY_IQ2_XS = 20, // except 1d tensors LLAMA_FTYPE_MOSTLY_Q2_K_S = 21, // except 1d tensors + LLAMA_FTYPE_MOSTLY_Q3_K_XS = 22, // except 1d tensors + LLAMA_FTYPE_MOSTLY_IQ3_XXS = 23, // except 1d tensors + LLAMA_FTYPE_MOSTLY_IQ1_S = 24, // except 1d tensors + LLAMA_FTYPE_MOSTLY_IQ4_NL = 25, // except 1d tensors LLAMA_FTYPE_GUESSED = 1024, // not specified in the model file }; @@ -119,6 +114,12 @@ extern "C" { LLAMA_ROPE_SCALING_MAX_VALUE = LLAMA_ROPE_SCALING_YARN, }; + enum llama_pooling_type { + LLAMA_POOLING_NONE = 0, + LLAMA_POOLING_MEAN = 1, + LLAMA_POOLING_CLS = 2, + }; + enum llama_split_mode { LLAMA_SPLIT_NONE = 0, // single GPU LLAMA_SPLIT_LAYER = 1, // split layers and KV across GPUs @@ -195,7 +196,7 @@ extern "C" { // LLAMA_SPLIT_LAYER: ignored int32_t main_gpu; - // proportion of the model (layers or rows) to offload to each GPU, size: LLAMA_MAX_DEVICES + // proportion of the model (layers or rows) to offload to each GPU, size: llama_max_devices() const float * tensor_split; // Called with a progress value between 0.0 and 1.0. Pass NULL to disable. @@ -221,7 +222,7 @@ extern "C" { uint32_t n_batch; // prompt processing maximum batch size uint32_t n_threads; // number of threads to use for generation uint32_t n_threads_batch; // number of threads to use for batch processing - int8_t rope_scaling_type; // RoPE scaling type, from `enum llama_rope_scaling_type` + int32_t rope_scaling_type; // RoPE scaling type, from `enum llama_rope_scaling_type` // ref: https://github.com/ggerganov/llama.cpp/pull/2054 float rope_freq_base; // RoPE base frequency, 0 = from model @@ -243,6 +244,7 @@ extern "C" { bool logits_all; // the llama_eval() call computes all logits, not just the last one (DEPRECATED - set llama_batch.logits instead) bool embedding; // embedding mode only bool offload_kqv; // whether to offload the KQV ops (including the KV cache) to GPU + bool do_pooling; // whether to pool (sum) embedding results by sequence id (ignored if no pooling layer) }; // model quantization parameters @@ -304,6 +306,12 @@ extern "C" { int32_t n_eval; }; + // used in chat template + typedef struct llama_chat_message { + const char * role; + const char * content; + } llama_chat_message; + // Helpers for getting default parameters LLAMA_API struct llama_model_params llama_model_default_params(void); LLAMA_API struct llama_context_params llama_context_default_params(void); @@ -312,7 +320,10 @@ extern "C" { // Initialize the llama + ggml backend // If numa is true, use NUMA optimizations // Call once at the start of the program - LLAMA_API void llama_backend_init(bool numa); + LLAMA_API void llama_backend_init(void); + + //optional: + LLAMA_API void llama_numa_init(enum lm_ggml_numa_strategy numa); // Call once at the end of the program - currently only used for MPI LLAMA_API void llama_backend_free(void); @@ -332,9 +343,14 @@ extern "C" { LLAMA_API int64_t llama_time_us(void); - LLAMA_API int32_t llama_max_devices(void); - LLAMA_API bool llama_mmap_supported (void); - LLAMA_API bool llama_mlock_supported(void); + LLAMA_API size_t llama_max_devices(void); + + LLAMA_API bool llama_supports_mmap (void); + LLAMA_API 
bool llama_supports_mlock (void); + LLAMA_API bool llama_supports_gpu_offload(void); + + LLAMA_API DEPRECATED(bool llama_mmap_supported (void), "use llama_supports_mmap() instead"); + LLAMA_API DEPRECATED(bool llama_mlock_supported(void), "use llama_supports_mlock() instead"); LLAMA_API const struct llama_model * llama_get_model(const struct llama_context * ctx); @@ -630,6 +646,10 @@ extern "C" { // shape: [n_embd] (1-dimensional) LLAMA_API float * llama_get_embeddings(struct llama_context * ctx); + // Get the embeddings for the ith sequence + // llama_get_embeddings(ctx) + i*n_embd + LLAMA_API float * llama_get_embeddings_ith(struct llama_context * ctx, int32_t i); + // // Vocab // @@ -686,6 +706,25 @@ extern "C" { char * buf, int32_t length); + /// Apply chat template. Inspired by hf apply_chat_template() on python. + /// Both "model" and "custom_template" are optional, but at least one is required. "custom_template" has higher precedence than "model" + /// NOTE: This function does not use a jinja parser. It only support a pre-defined list of template. See more: https://github.com/ggerganov/llama.cpp/wiki/Templates-supported-by-llama_chat_apply_template + /// @param tmpl A Jinja template to use for this chat. If this is nullptr, the model’s default chat template will be used instead. + /// @param chat Pointer to a list of multiple llama_chat_message + /// @param n_msg Number of llama_chat_message in this chat + /// @param add_ass Whether to end the prompt with the token(s) that indicate the start of an assistant message. + /// @param buf A buffer to hold the output formatted prompt. The recommended alloc size is 2 * (total number of characters of all messages) + /// @param length The size of the allocated buffer + /// @return The total number of bytes of the formatted prompt. If is it larger than the size of buffer, you may need to re-alloc it and then re-apply the template. + LLAMA_API int32_t llama_chat_apply_template( + const struct llama_model * model, + const char * tmpl, + const struct llama_chat_message * chat, + size_t n_msg, + bool add_ass, + char * buf, + int32_t length); + // // Grammar // @@ -774,6 +813,14 @@ extern "C" { float p, size_t min_keep); + /// @details Dynamic temperature implementation described in the paper https://arxiv.org/abs/2309.02772. 
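To make the dynamic-temperature reference above concrete, here is a small, self-contained sketch of the entropy-to-temperature mapping that llama_sample_entropy() implements earlier in this patch. The probability values and temperature range are arbitrary examples; only the arithmetic mirrors the patched code.

// Illustrative sketch of the dynamic-temperature mapping (not part of this patch).
#include <cmath>
#include <cstdio>
#include <vector>

int main() {
    const std::vector<float> probs = { 0.70f, 0.20f, 0.05f, 0.05f }; // toy softmax output
    const float min_temp = 0.5f, max_temp = 1.5f, exponent_val = 1.0f;

    float entropy = 0.0f;
    for (const float p : probs) {
        if (p > 0.0f) {
            entropy -= p * logf(p);
        }
    }
    const float max_entropy = -logf(1.0f / probs.size());  // entropy of a uniform distribution
    const float normalized  = entropy / max_entropy;        // in [0, 1]
    const float dyn_temp    = min_temp + (max_temp - min_temp) * powf(normalized, exponent_val);

    // more peaked distributions have lower normalized entropy, pulling dyn_temp
    // toward min_temp; near-uniform ones push it toward max_temp
    printf("entropy = %.3f, normalized = %.3f, dyn_temp = %.3f\n", entropy, normalized, dyn_temp);
    return 0;
}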
+ LLAMA_API void llama_sample_entropy( + struct llama_context * ctx, + llama_token_data_array * candidates_p, + float min_temp, + float max_temp, + float exponent_val); + LLAMA_API void llama_sample_temp( struct llama_context * ctx, llama_token_data_array * candidates, diff --git a/cpp/sampling.cpp b/cpp/sampling.cpp index dd1ffeb1..de4331a1 100644 --- a/cpp/sampling.cpp +++ b/cpp/sampling.cpp @@ -13,6 +13,7 @@ struct llama_sampling_context * llama_sampling_init(const struct llama_sampling_ // will be empty (default) if there are parse errors if (result->parsed_grammar.rules.empty()) { fprintf(stderr, "%s: failed to parse grammar\n", __func__); + delete result; return nullptr; } @@ -102,15 +103,10 @@ std::string llama_sampling_print(const llama_sampling_params & params) { std::string llama_sampling_order_print(const llama_sampling_params & params) { std::string result = "CFG -> Penalties "; if (params.mirostat == 0) { - for (auto s : params.samplers_sequence) { - switch (s) { - case 'k': result += "-> top_k "; break; - case 'f': result += "-> tfs_z "; break; - case 'y': result += "-> typical_p "; break; - case 'p': result += "-> top_p "; break; - case 'm': result += "-> min_p "; break; - case 't': result += "-> temp "; break; - default : break; + for (auto sampler_type : params.samplers_sequence) { + const auto sampler_type_name = sampler_type_to_name_string(sampler_type); + if (!sampler_type_name.empty()) { + result += "-> " + sampler_type_name + " "; } } } else { @@ -125,25 +121,33 @@ static void sampler_queue( struct llama_context * ctx_main, const llama_sampling_params & params, llama_token_data_array & cur_p, - size_t & min_keep) { - const int n_vocab = llama_n_vocab(llama_get_model(ctx_main)); - + size_t min_keep) { const float temp = params.temp; - const int32_t top_k = params.top_k <= 0 ? 
n_vocab : params.top_k; + const float dynatemp_range = params.dynatemp_range; + const float dynatemp_exponent = params.dynatemp_exponent; + const int32_t top_k = params.top_k; const float top_p = params.top_p; const float min_p = params.min_p; const float tfs_z = params.tfs_z; const float typical_p = params.typical_p; - const std::string & samplers_sequence = params.samplers_sequence; - - for (auto s : samplers_sequence) { - switch (s){ - case 'k': llama_sample_top_k (ctx_main, &cur_p, top_k, min_keep); break; - case 'f': llama_sample_tail_free(ctx_main, &cur_p, tfs_z, min_keep); break; - case 'y': llama_sample_typical (ctx_main, &cur_p, typical_p, min_keep); break; - case 'p': llama_sample_top_p (ctx_main, &cur_p, top_p, min_keep); break; - case 'm': llama_sample_min_p (ctx_main, &cur_p, min_p, min_keep); break; - case 't': llama_sample_temp (ctx_main, &cur_p, temp); break; + const std::vector & samplers_sequence = params.samplers_sequence; + + for (auto sampler_type : samplers_sequence) { + switch (sampler_type) { + case llama_sampler_type::TOP_K : llama_sample_top_k (ctx_main, &cur_p, top_k, min_keep); break; + case llama_sampler_type::TFS_Z : llama_sample_tail_free(ctx_main, &cur_p, tfs_z, min_keep); break; + case llama_sampler_type::TYPICAL_P: llama_sample_typical (ctx_main, &cur_p, typical_p, min_keep); break; + case llama_sampler_type::TOP_P : llama_sample_top_p (ctx_main, &cur_p, top_p, min_keep); break; + case llama_sampler_type::MIN_P : llama_sample_min_p (ctx_main, &cur_p, min_p, min_keep); break; + case llama_sampler_type::TEMPERATURE: + if (dynatemp_range > 0) { + float dynatemp_min = std::max(0.0f, temp - dynatemp_range); + float dynatemp_max = std::max(0.0f, temp + dynatemp_range); + llama_sample_entropy(ctx_main, &cur_p, dynatemp_min, dynatemp_max, dynatemp_exponent); + } else { + llama_sample_temp(ctx_main, &cur_p, temp); + } + break; default : break; } } @@ -245,7 +249,7 @@ static llama_token llama_sampling_sample_impl( id = llama_sample_token_mirostat_v2(ctx_main, &cur_p, mirostat_tau, mirostat_eta, &ctx_sampling->mirostat_mu); } else { // temperature sampling - size_t min_keep = std::max(1, params.n_probs); + size_t min_keep = std::max(1, params.min_keep); sampler_queue(ctx_main, params, cur_p, min_keep); diff --git a/cpp/sampling.h b/cpp/sampling.h index 2ee18037..95d87539 100644 --- a/cpp/sampling.h +++ b/cpp/sampling.h @@ -8,16 +8,29 @@ #include #include +// sampler types +enum class llama_sampler_type : char { + TOP_K = 'k', + TOP_P = 'p', + MIN_P = 'm', + TFS_Z = 'f', + TYPICAL_P = 'y', + TEMPERATURE = 't' +}; + // sampling parameters typedef struct llama_sampling_params { int32_t n_prev = 64; // number of previous tokens to remember int32_t n_probs = 0; // if greater than 0, output the probabilities of top n_probs tokens. 
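sampler_queue() above now walks a vector of llama_sampler_type values instead of the old "kfypmt" string. The fragment below is an illustrative sketch, not part of the patch, of how a consumer of this repo's sampling.h might configure a custom sampler order together with the new dynatemp fields declared just below; the numeric values are arbitrary and the include path assumes cpp/sampling.h is on the search path.

// Illustrative configuration sketch (not part of this patch).
#include "sampling.h"

static llama_sampling_params make_params() {
    llama_sampling_params sparams;

    // custom order: top-k filter, then min-p, then (dynamic) temperature
    sparams.samplers_sequence = {
        llama_sampler_type::TOP_K,
        llama_sampler_type::MIN_P,
        llama_sampler_type::TEMPERATURE,
    };

    sparams.top_k = 40;
    sparams.min_p = 0.1f;
    sparams.temp  = 0.8f;

    // a non-zero range makes sampler_queue() call llama_sample_entropy(),
    // which picks a temperature in [temp - range, temp + range] from the
    // entropy of the candidate distribution
    sparams.dynatemp_range    = 0.5f;
    sparams.dynatemp_exponent = 1.0f;

    return sparams;
}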
+ int32_t min_keep = 0; // 0 = disabled, otherwise samplers should return at least min_keep tokens int32_t top_k = 40; // <= 0 to use vocab size float top_p = 0.95f; // 1.0 = disabled float min_p = 0.05f; // 0.0 = disabled float tfs_z = 1.00f; // 1.0 = disabled float typical_p = 1.00f; // 1.0 = disabled float temp = 0.80f; // <= 0.0 to sample greedily, 0.0 to not output probabilities + float dynatemp_range = 0.00f; // 0.0 = disabled + float dynatemp_exponent = 1.00f; // controls how entropy maps to temperature in dynamic temperature sampler int32_t penalty_last_n = 64; // last n tokens to penalize (0 = disable penalty, -1 = context size) float penalty_repeat = 1.10f; // 1.0 = disabled float penalty_freq = 0.00f; // 0.0 = disabled @@ -26,7 +39,15 @@ typedef struct llama_sampling_params { float mirostat_tau = 5.00f; // target entropy float mirostat_eta = 0.10f; // learning rate bool penalize_nl = true; // consider newlines as a repeatable token - std::string samplers_sequence = "kfypmt"; // top_k, tail_free, typical_p, top_p, min_p, temp + + std::vector samplers_sequence = { + llama_sampler_type::TOP_K, + llama_sampler_type::TFS_Z, + llama_sampler_type::TYPICAL_P, + llama_sampler_type::TOP_P, + llama_sampler_type::MIN_P, + llama_sampler_type::TEMPERATURE + }; std::string grammar; // optional BNF-like grammar to constrain sampling diff --git a/cpp/unicode.h b/cpp/unicode.h index aeca879e..26326070 100644 --- a/cpp/unicode.h +++ b/cpp/unicode.h @@ -2,8 +2,9 @@ #include #include -#include +#include #include +#include static const std::vector> digit_ranges = { {0x30, 0x39}, {0xB2, 0xB3}, {0xB9, 0xB9}, {0x660, 0x669}, {0x6F0, 0x6F9}, {0x7C0, 0x7C9}, {0x966, 0x96F}, {0x9E6, 0x9EF}, {0xA66, 0xA6F}, {0xAE6, 0xAEF}, {0xB66, 0xB6F}, {0xBE6, 0xBEF}, {0xC66, 0xC6F}, @@ -263,26 +264,29 @@ static uint32_t codepoint_from_utf8(const std::string & utf8, size_t & offset) { offset += 1; return result; } - else if (!(utf8[offset + 0] & 0x40)) { + if (!(utf8[offset + 0] & 0x40)) { throw std::invalid_argument("invalid character"); } - else if (!(utf8[offset + 0] & 0x20)) { - if (offset + 1 >= utf8.size() || ! ((utf8[offset + 1] & 0xc0) == 0x80)) + if (!(utf8[offset + 0] & 0x20)) { + if (offset + 1 >= utf8.size() || ! ((utf8[offset + 1] & 0xc0) == 0x80)) { throw std::invalid_argument("invalid character"); + } auto result = ((utf8[offset + 0] & 0x1f) << 6) | (utf8[offset + 1] & 0x3f); offset += 2; return result; } - else if (!(utf8[offset + 0] & 0x10)) { - if (offset + 2 >= utf8.size() || ! ((utf8[offset + 1] & 0xc0) == 0x80) || ! ((utf8[offset + 2] & 0xc0) == 0x80)) + if (!(utf8[offset + 0] & 0x10)) { + if (offset + 2 >= utf8.size() || ! ((utf8[offset + 1] & 0xc0) == 0x80) || ! ((utf8[offset + 2] & 0xc0) == 0x80)) { throw std::invalid_argument("invalid character"); + } auto result = ((utf8[offset + 0] & 0x0f) << 12) | ((utf8[offset + 1] & 0x3f) << 6) | (utf8[offset + 2] & 0x3f); offset += 3; return result; } - else if (!(utf8[offset + 0] & 0x08)) { - if (offset + 3 >= utf8.size() || ! ((utf8[offset + 1] & 0xc0) == 0x80) || ! ((utf8[offset + 2] & 0xc0) == 0x80) || !((utf8[offset + 3] & 0xc0) == 0x80)) + if (!(utf8[offset + 0] & 0x08)) { + if (offset + 3 >= utf8.size() || ! ((utf8[offset + 1] & 0xc0) == 0x80) || ! 
((utf8[offset + 2] & 0xc0) == 0x80) || !((utf8[offset + 3] & 0xc0) == 0x80)) { throw std::invalid_argument("invalid character"); + } auto result = ((utf8[offset + 0] & 0x07) << 18) | ((utf8[offset + 1] & 0x3f) << 12) | ((utf8[offset + 2] & 0x3f) << 6) | (utf8[offset + 3] & 0x3f); offset += 4; return result; @@ -330,21 +334,22 @@ static uint32_t codepoint_from_utf16(const std::vector & utf16, size_t offset += 1; return result; } - else { - if (offset + 1 >= utf16.size() || !((utf16[1] & 0xdc00) == 0xdc00)) - throw std::invalid_argument("invalid character"); - auto result = 0x10000 + (((utf16[0] & 0x03ff) << 10) | (utf16[1] & 0x03ff)); - offset += 2; - return result; + + if (offset + 1 >= utf16.size() || !((utf16[1] & 0xdc00) == 0xdc00)) { + throw std::invalid_argument("invalid character"); } - throw std::invalid_argument("invalid string"); + + auto result = 0x10000 + (((utf16[0] & 0x03ff) << 10) | (utf16[1] & 0x03ff)); + offset += 2; + return result; } static std::vector codepoints_from_utf16(const std::vector & utf16) { std::vector result; size_t offset = 0; - while (offset < utf16.size()) + while (offset < utf16.size()) { result.push_back(codepoint_from_utf16(utf16, offset)); + } return result; } @@ -360,44 +365,52 @@ static std::vector codepoints_from_utf16(const std::vector & static std::unordered_map codepoint_type_map() { std::unordered_map codepoint_types; for (auto p : digit_ranges) { - for(auto i = p.first; i <= p.second; ++ i) + for (auto i = p.first; i <= p.second; ++ i) { codepoint_types[i] = CODEPOINT_TYPE_DIGIT; + } } - for(auto p : letter_ranges) { - for(auto i = p.first; i <= p.second; ++ i) + for (auto p : letter_ranges) { + for (auto i = p.first; i <= p.second; ++ i) { codepoint_types[i] = CODEPOINT_TYPE_LETTER; + } } - for(auto p : whitespace_ranges) { - for(auto i = p.first; i <= p.second; ++ i) + for (auto p : whitespace_ranges) { + for (auto i = p.first; i <= p.second; ++ i) { codepoint_types[i] = CODEPOINT_TYPE_WHITESPACE; + } } - for(auto p : accent_mark_ranges) { - for(auto i = p.first; i <= p.second; ++ i) + for (auto p : accent_mark_ranges) { + for (auto i = p.first; i <= p.second; ++ i) { codepoint_types[i] = CODEPOINT_TYPE_ACCENT_MARK; + } } - for(auto p : punctuation_ranges) { - for(auto i = p.first; i <= p.second; ++ i) + for (auto p : punctuation_ranges) { + for (auto i = p.first; i <= p.second; ++ i) { codepoint_types[i] = CODEPOINT_TYPE_PUNCTUATION; + } } - for (auto p : symbol_ranges) { - for (auto i = p.first; i <= p.second; ++i) + for (auto p : symbol_ranges) { + for (auto i = p.first; i <= p.second; ++i) { codepoint_types[i] = CODEPOINT_TYPE_SYMBOL; + } } - for(auto p : control_ranges) { - for(auto i = p.first; i <= p.second; ++ i) + for (auto p : control_ranges) { + for (auto i = p.first; i <= p.second; ++ i) { codepoint_types[i] = CODEPOINT_TYPE_CONTROL; + } } return codepoint_types; } static int codepoint_type(uint32_t cp) { static std::unordered_map codepoint_types = codepoint_type_map(); - return codepoint_types[cp]; + return codepoint_types.find(cp) == codepoint_types.end() ? 
CODEPOINT_TYPE_UNIDENTIFIED : codepoint_types.at(cp); } static int codepoint_type(const std::string & utf8) { - if (utf8.length() == 0) + if (utf8.length() == 0) { return CODEPOINT_TYPE_UNIDENTIFIED; + } size_t offset = 0; return codepoint_type(codepoint_from_utf8(utf8, offset)); } diff --git a/example/ios/Podfile.lock b/example/ios/Podfile.lock index bf0b355a..43a998f4 100644 --- a/example/ios/Podfile.lock +++ b/example/ios/Podfile.lock @@ -8,7 +8,7 @@ PODS: - hermes-engine/Pre-built (= 0.72.3) - hermes-engine/Pre-built (0.72.3) - libevent (2.1.12) - - llama-rn (0.3.0-rc.11): + - llama-rn (0.3.0-rc.14): - RCT-Folly - RCTRequired - RCTTypeSafety @@ -1261,7 +1261,7 @@ SPEC CHECKSUMS: glog: 04b94705f318337d7ead9e6d17c019bd9b1f6b1b hermes-engine: 10fbd3f62405c41ea07e71973ea61e1878d07322 libevent: 4049cae6c81cdb3654a443be001fb9bdceff7913 - llama-rn: 796d4db7819b9572a8364aac5fef94b56b835b34 + llama-rn: 26d2d3c08a3e788889a4833c4678ea9ccbbb1f33 RCT-Folly: 424b8c9a7a0b9ab2886ffe9c3b041ef628fd4fb1 RCTRequired: a2faf4bad4e438ca37b2040cb8f7799baa065c18 RCTTypeSafety: cb09f3e4747b6d18331a15eb05271de7441ca0b3 diff --git a/llama.cpp b/llama.cpp index 57e2a7a5..973053d8 160000 --- a/llama.cpp +++ b/llama.cpp @@ -1 +1 @@ -Subproject commit 57e2a7a52a819883f40dada8a2edc24ecf48186b +Subproject commit 973053d8b0d04809836b3339a50f68d9c842de90 diff --git a/scripts/bootstrap.sh b/scripts/bootstrap.sh index 15a8467e..4cdaa772 100755 --- a/scripts/bootstrap.sh +++ b/scripts/bootstrap.sh @@ -31,6 +31,7 @@ cp ./llama.cpp/common/sampling.cpp ./cpp/sampling.cpp files=( "./cpp/ggml.h" "./cpp/ggml.c" + "./cpp/common.h" "./cpp/common.cpp" "./cpp/ggml-metal.h" "./cpp/ggml-metal.m" diff --git a/scripts/common.cpp.patch b/scripts/common.cpp.patch index 168fb352..5adf74dc 100644 --- a/scripts/common.cpp.patch +++ b/scripts/common.cpp.patch @@ -1,5 +1,5 @@ ---- common.cpp.orig 2023-12-19 08:18:55 -+++ common.cpp 2023-12-19 08:18:26 +--- common.cpp.orig 2024-02-22 12:19:15 ++++ common.cpp 2024-02-22 12:19:17 @@ -41,6 +41,12 @@ #if defined(_MSC_VER) #pragma warning(disable: 4244 4267) // possible loss of data @@ -10,6 +10,6 @@ +char const *LLAMA_COMMIT = "unknown"; +char const *LLAMA_COMPILER = "unknown"; +char const *LLAMA_BUILD_TARGET = "unknown"; - - int32_t get_num_physical_cores() { - #ifdef __linux__ + + #if (defined(LM_GGML_USE_CUBLAS) || defined(LM_GGML_USE_SYCL)) + #define LM_GGML_USE_CUBLAS_SYCL diff --git a/scripts/ggml-metal.m.patch b/scripts/ggml-metal.m.patch index 92bcfaca..20fdba3b 100644 --- a/scripts/ggml-metal.m.patch +++ b/scripts/ggml-metal.m.patch @@ -1,6 +1,6 @@ ---- ggml-metal.m.orig 2024-01-19 10:06:53 -+++ ggml-metal.m 2024-01-19 10:06:54 -@@ -288,7 +288,7 @@ +--- ggml-metal.m.orig 2024-02-22 12:19:15 ++++ ggml-metal.m 2024-02-22 12:19:17 +@@ -300,7 +300,7 @@ if (ggmlMetalPathResources) { sourcePath = [ggmlMetalPathResources stringByAppendingPathComponent:@"ggml-metal.metal"]; } else { diff --git a/scripts/llama.cpp.patch b/scripts/llama.cpp.patch index 04f1e095..51f29d5a 100644 --- a/scripts/llama.cpp.patch +++ b/scripts/llama.cpp.patch @@ -1,6 +1,6 @@ ---- llama.cpp.orig 2024-01-15 12:27:16 -+++ llama.cpp 2024-01-15 12:26:12 -@@ -107,6 +107,17 @@ +--- llama.cpp.orig 2024-02-22 12:19:15 ++++ llama.cpp 2024-02-22 12:19:17 +@@ -114,6 +114,17 @@ #define LLAMA_LOG_WARN(...) llama_log_internal(LM_GGML_LOG_LEVEL_WARN , __VA_ARGS__) #define LLAMA_LOG_ERROR(...) 
llama_log_internal(LM_GGML_LOG_LEVEL_ERROR, __VA_ARGS__) @@ -18,7 +18,7 @@ // // helpers // -@@ -876,16 +887,16 @@ +@@ -1068,16 +1079,16 @@ if (prefetch > 0) { // advise the kernel to preload the mapped memory From 3c70c51eb740e85cd6a05d308b933ccf03f82c5e Mon Sep 17 00:00:00 2001 From: jhen Date: Thu, 22 Feb 2024 12:43:59 +0800 Subject: [PATCH 2/2] fix(ci): bump cocoapods version to 1.15.2 --- .github/workflows/ci.yml | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml index 7936c480..c165b281 100644 --- a/.github/workflows/ci.yml +++ b/.github/workflows/ci.yml @@ -54,6 +54,11 @@ jobs: restore-keys: | ${{ runner.os }}-pods- + - name: Upgrade CocoaPods to version 1.15.2 + run: | + gem uninstall cocoapods --ignore-dependencies + gem install cocoapods -v 1.15.2 + - name: Install cocoapods run: | yarn example pods
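Several public entry points change shape in this sync: llama_backend_init() no longer takes a NUMA flag, NUMA setup moves to llama_numa_init(), llama_mmap_supported()/llama_mlock_supported() are deprecated in favor of llama_supports_mmap()/llama_supports_mlock(), and LLAMA_MAX_DEVICES is replaced by llama_max_devices(). The sketch below is an illustrative migration for host code, not part of the patch; LM_GGML_NUMA_STRATEGY_DISTRIBUTE is an assumption about the lm_ggml_numa_strategy values in ggml.h, with LM_GGML_NUMA_STRATEGY_DISABLED being the only one this patch itself references.

// Illustrative migration sketch (not part of this patch).
#include "llama.h"

void init_backend(bool use_numa) {
    // before this sync: llama_backend_init(use_numa);
    llama_backend_init();
    if (use_numa) {
        // assumption: DISTRIBUTE is one of the lm_ggml_numa_strategy values in ggml.h
        llama_numa_init(LM_GGML_NUMA_STRATEGY_DISTRIBUTE);
    }

    // before this sync: llama_mmap_supported() / llama_mlock_supported()
    if (llama_supports_mmap() && llama_supports_mlock()) {
        // safe to request mmap + mlock in the model/context params
    }

    // LLAMA_MAX_DEVICES is gone; query the backend at runtime instead
    const size_t n_devices = llama_max_devices();
    (void) n_devices;
}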