diff --git a/cpp/common.cpp b/cpp/common.cpp index 987a93ff..e4342ce5 100644 --- a/cpp/common.cpp +++ b/cpp/common.cpp @@ -1,4 +1,6 @@ #include "common.h" +// Change JSON_ASSERT from assert() to LM_GGML_ASSERT: +#define JSON_ASSERT LM_GGML_ASSERT #include "json.hpp" #include "json-schema-to-grammar.h" #include "llama.h" @@ -77,12 +79,16 @@ char const *LLAMA_BUILD_TARGET = "unknown"; using json = nlohmann::ordered_json; -int32_t get_num_physical_cores() { +// +// CPU utils +// + +int32_t cpu_get_num_physical_cores() { #ifdef __linux__ // enumerate the set of thread siblings, num entries is num cores std::unordered_set siblings; for (uint32_t cpu=0; cpu < UINT32_MAX; ++cpu) { - std::ifstream thread_siblings("/sys/devices/system/cpu" + std::ifstream thread_siblings("/sys/devices/system/cpu/cpu" + std::to_string(cpu) + "/topology/thread_siblings"); if (!thread_siblings.is_open()) { break; // no more cpus @@ -146,9 +152,9 @@ static bool is_running_on_efficiency_core(void) { return core_type == intel_atom; } -static int count_math_cpus(int cpu_count) { +static int cpu_count_math_cpus(int n_cpu) { int result = 0; - for (int cpu = 0; cpu < cpu_count; ++cpu) { + for (int cpu = 0; cpu < n_cpu; ++cpu) { if (pin_cpu(cpu)) { return -1; } @@ -166,16 +172,16 @@ static int count_math_cpus(int cpu_count) { /** * Returns number of CPUs on system that are useful for math. 
*/ -int get_math_cpu_count() { +int32_t cpu_get_num_math() { #if defined(__x86_64__) && defined(__linux__) && !defined(__ANDROID__) - int cpu_count = sysconf(_SC_NPROCESSORS_ONLN); - if (cpu_count < 1) { - return get_num_physical_cores(); + int n_cpu = sysconf(_SC_NPROCESSORS_ONLN); + if (n_cpu < 1) { + return cpu_get_num_physical_cores(); } if (is_hybrid_cpu()) { cpu_set_t affinity; if (!pthread_getaffinity_np(pthread_self(), sizeof(affinity), &affinity)) { - int result = count_math_cpus(cpu_count); + int result = cpu_count_math_cpus(n_cpu); pthread_setaffinity_np(pthread_self(), sizeof(affinity), &affinity); if (result > 0) { return result; @@ -183,108 +189,103 @@ int get_math_cpu_count() { } } #endif - return get_num_physical_cores(); + return cpu_get_num_physical_cores(); } -void process_escapes(std::string & input) { - std::size_t input_len = input.length(); - std::size_t output_idx = 0; +// +// CLI argument parsing +// - for (std::size_t input_idx = 0; input_idx < input_len; ++input_idx) { - if (input[input_idx] == '\\' && input_idx + 1 < input_len) { - switch (input[++input_idx]) { - case 'n': input[output_idx++] = '\n'; break; - case 'r': input[output_idx++] = '\r'; break; - case 't': input[output_idx++] = '\t'; break; - case '\'': input[output_idx++] = '\''; break; - case '\"': input[output_idx++] = '\"'; break; - case '\\': input[output_idx++] = '\\'; break; - case 'x': - // Handle \x12, etc - if (input_idx + 2 < input_len) { - const char x[3] = { input[input_idx + 1], input[input_idx + 2], 0 }; - char *err_p = nullptr; - const long val = std::strtol(x, &err_p, 16); - if (err_p == x + 2) { - input_idx += 2; - input[output_idx++] = char(val); - break; - } - } - // fall through - default: input[output_idx++] = '\\'; - input[output_idx++] = input[input_idx]; break; +void gpt_params_handle_model_default(gpt_params & params) { + if (!params.hf_repo.empty()) { + // short-hand to avoid specifying --hf-file -> default it to --model + if (params.hf_file.empty()) { 
+ if (params.model.empty()) { + throw std::invalid_argument("error: --hf-repo requires either --hf-file or --model\n"); } - } else { - input[output_idx++] = input[input_idx]; + params.hf_file = params.model; + } else if (params.model.empty()) { + std::string cache_directory = fs_get_cache_directory(); + const bool success = fs_create_directory_with_parents(cache_directory); + if (!success) { + throw std::runtime_error("failed to create cache directory: " + cache_directory); + } + params.model = cache_directory + string_split(params.hf_file, '/').back(); + } + } else if (!params.model_url.empty()) { + if (params.model.empty()) { + auto f = string_split(params.model_url, '#').front(); + f = string_split(f, '?').front(); + f = string_split(f, '/').back(); + params.model = "models/" + f; } + } else if (params.model.empty()) { + params.model = DEFAULT_MODEL_PATH; } +} - input.resize(output_idx); +bool gpt_params_parse_ex(int argc, char ** argv, gpt_params & params) { + bool invalid_param = false; + std::string arg; + const std::string arg_prefix = "--"; + llama_sampling_params & sparams = params.sparams; + + for (int i = 1; i < argc; i++) { + arg = argv[i]; + if (arg.compare(0, arg_prefix.size(), arg_prefix) == 0) { + std::replace(arg.begin(), arg.end(), '_', '-'); + } + if (!gpt_params_find_arg(argc, argv, arg, params, i, invalid_param)) { + throw std::invalid_argument("error: unknown argument: " + arg); + } + if (invalid_param) { + throw std::invalid_argument("error: invalid parameter for argument: " + arg); + } + } + + if (params.prompt_cache_all && + (params.interactive || params.interactive_first || + params.instruct)) { + + throw std::invalid_argument("error: --prompt-cache-all not supported in interactive mode yet\n"); + } + + gpt_params_handle_model_default(params); + + if (params.escape) { + string_process_escapes(params.prompt); + string_process_escapes(params.input_prefix); + string_process_escapes(params.input_suffix); + 
string_process_escapes(sparams.cfg_negative_prompt); + for (auto & antiprompt : params.antiprompt) { + string_process_escapes(antiprompt); + } + } + + if (!params.kv_overrides.empty()) { + params.kv_overrides.emplace_back(); + params.kv_overrides.back().key[0] = 0; + } + + return true; } bool gpt_params_parse(int argc, char ** argv, gpt_params & params) { bool result = true; try { if (!gpt_params_parse_ex(argc, argv, params)) { - gpt_print_usage(argc, argv, gpt_params()); + gpt_params_print_usage(argc, argv, gpt_params()); exit(0); } } catch (const std::invalid_argument & ex) { fprintf(stderr, "%s\n", ex.what()); - gpt_print_usage(argc, argv, gpt_params()); + gpt_params_print_usage(argc, argv, gpt_params()); exit(1); } return result; } -bool parse_kv_override(const char * data, std::vector & overrides) { - const char * sep = strchr(data, '='); - if (sep == nullptr || sep - data >= 128) { - fprintf(stderr, "%s: malformed KV override '%s'\n", __func__, data); - return false; - } - llama_model_kv_override kvo; - std::strncpy(kvo.key, data, sep - data); - kvo.key[sep - data] = 0; - sep++; - if (strncmp(sep, "int:", 4) == 0) { - sep += 4; - kvo.tag = LLAMA_KV_OVERRIDE_TYPE_INT; - kvo.val_i64 = std::atol(sep); - } else if (strncmp(sep, "float:", 6) == 0) { - sep += 6; - kvo.tag = LLAMA_KV_OVERRIDE_TYPE_FLOAT; - kvo.val_f64 = std::atof(sep); - } else if (strncmp(sep, "bool:", 5) == 0) { - sep += 5; - kvo.tag = LLAMA_KV_OVERRIDE_TYPE_BOOL; - if (std::strcmp(sep, "true") == 0) { - kvo.val_bool = true; - } else if (std::strcmp(sep, "false") == 0) { - kvo.val_bool = false; - } else { - fprintf(stderr, "%s: invalid boolean value for KV override '%s'\n", __func__, data); - return false; - } - } else if (strncmp(sep, "str:", 4) == 0) { - sep += 4; - kvo.tag = LLAMA_KV_OVERRIDE_TYPE_STR; - if (strlen(sep) > 127) { - fprintf(stderr, "%s: malformed KV override '%s', value cannot exceed 127 chars\n", __func__, data); - return false; - } - strncpy(kvo.val_str, sep, 127); - 
kvo.val_str[127] = '\0'; - } else { - fprintf(stderr, "%s: invalid type for KV override '%s'\n", __func__, data); - return false; - } - overrides.emplace_back(std::move(kvo)); - return true; -} - bool gpt_params_find_arg(int argc, char ** argv, const std::string & arg, gpt_params & params, int & i, bool & invalid_param) { llama_sampling_params & sparams = params.sparams; @@ -550,7 +551,7 @@ bool gpt_params_find_arg(int argc, char ** argv, const std::string & arg, gpt_pa return true; } const auto sampler_names = string_split(argv[i], ';'); - sparams.samplers_sequence = sampler_types_from_names(sampler_names, true); + sparams.samplers_sequence = llama_sampling_types_from_names(sampler_names, true); return true; } if (arg == "--sampling-seq") { @@ -558,7 +559,7 @@ bool gpt_params_find_arg(int argc, char ** argv, const std::string & arg, gpt_pa invalid_param = true; return true; } - sparams.samplers_sequence = sampler_types_from_chars(argv[i]); + sparams.samplers_sequence = llama_sampling_types_from_chars(argv[i]); return true; } if (arg == "--top-p") { @@ -905,6 +906,14 @@ bool gpt_params_find_arg(int argc, char ** argv, const std::string & arg, gpt_pa params.interactive = true; return true; } + if (arg == "--interactive-specials") { + params.interactive_specials = true; + return true; + } + if (arg == "--special") { + params.special = true; + return true; + } if (arg == "--embedding") { params.embedding = true; return true; @@ -917,6 +926,10 @@ bool gpt_params_find_arg(int argc, char ** argv, const std::string & arg, gpt_pa params.instruct = true; return true; } + if (arg == "-cnv" || arg == "--conversation") { + params.conversation = true; + return true; + } if (arg == "-cml" || arg == "--chatml") { params.chatml = true; return true; @@ -1056,6 +1069,14 @@ bool gpt_params_find_arg(int argc, char ** argv, const std::string & arg, gpt_pa #endif // LM_GGML_USE_CUDA_SYCL_VULKAN return true; } + if (arg == "--rpc") { + if (++i >= argc) { + invalid_param = true; + return 
true; + } + params.rpc_servers = argv[i]; + return true; + } if (arg == "--no-mmap") { params.use_mmap = false; return true; @@ -1228,7 +1249,7 @@ bool gpt_params_find_arg(int argc, char ** argv, const std::string & arg, gpt_pa return true; } if (arg == "-h" || arg == "--help") { - gpt_print_usage(argc, argv, gpt_params()); + gpt_params_print_usage(argc, argv, gpt_params()); exit(0); } if (arg == "--version") { @@ -1299,7 +1320,7 @@ bool gpt_params_find_arg(int argc, char ** argv, const std::string & arg, gpt_pa invalid_param = true; return true; } - if (!parse_kv_override(argv[i], params.kv_overrides)) { + if (!string_parse_kv_override(argv[i], params.kv_overrides)) { fprintf(stderr, "error: Invalid type for KV override: %s\n", argv[i]); invalid_param = true; return true; @@ -1333,85 +1354,14 @@ bool gpt_params_find_arg(int argc, char ** argv, const std::string & arg, gpt_pa return false; } -void gpt_params_handle_model_default(gpt_params & params) { - if (!params.hf_repo.empty()) { - // short-hand to avoid specifying --hf-file -> default it to --model - if (params.hf_file.empty()) { - if (params.model.empty()) { - throw std::invalid_argument("error: --hf-repo requires either --hf-file or --model\n"); - } - params.hf_file = params.model; - } else if (params.model.empty()) { - params.model = "models/" + string_split(params.hf_file, '/').back(); - } - } else if (!params.model_url.empty()) { - if (params.model.empty()) { - auto f = string_split(params.model_url, '#').front(); - f = string_split(f, '?').front(); - f = string_split(f, '/').back(); - params.model = "models/" + f; - } - } else if (params.model.empty()) { - params.model = DEFAULT_MODEL_PATH; - } -} - -bool gpt_params_parse_ex(int argc, char ** argv, gpt_params & params) { - bool invalid_param = false; - std::string arg; - const std::string arg_prefix = "--"; - llama_sampling_params & sparams = params.sparams; - - for (int i = 1; i < argc; i++) { - arg = argv[i]; - if (arg.compare(0, arg_prefix.size(), 
arg_prefix) == 0) { - std::replace(arg.begin(), arg.end(), '_', '-'); - } - - if (!gpt_params_find_arg(argc, argv, arg, params, i, invalid_param)) { - throw std::invalid_argument("error: unknown argument: " + arg); - } - } - - if (invalid_param) { - throw std::invalid_argument("error: invalid parameter for argument: " + arg); - } - - if (params.prompt_cache_all && - (params.interactive || params.interactive_first || - params.instruct)) { - - throw std::invalid_argument("error: --prompt-cache-all not supported in interactive mode yet\n"); - } - - gpt_params_handle_model_default(params); - - if (params.escape) { - process_escapes(params.prompt); - process_escapes(params.input_prefix); - process_escapes(params.input_suffix); - process_escapes(sparams.cfg_negative_prompt); - for (auto & antiprompt : params.antiprompt) { - process_escapes(antiprompt); - } - } - - if (!params.kv_overrides.empty()) { - params.kv_overrides.emplace_back(); - params.kv_overrides.back().key[0] = 0; - } - - return true; -} - -void gpt_print_usage(int /*argc*/, char ** argv, const gpt_params & params) { +void gpt_params_print_usage(int /*argc*/, char ** argv, const gpt_params & params) { const llama_sampling_params & sparams = params.sparams; std::string sampler_type_chars; std::string sampler_type_names; for (const auto sampler_type : sparams.samplers_sequence) { sampler_type_chars += static_cast(sampler_type); - sampler_type_names += sampler_type_to_name_string(sampler_type) + ";"; + sampler_type_names += llama_sampling_type_to_str(sampler_type) + ";"; } sampler_type_names.pop_back(); @@ -1422,7 +1372,10 @@ void gpt_print_usage(int /*argc*/, char ** argv, const gpt_params & params) { printf(" -h, --help show this help message and exit\n"); printf(" --version show version and build info\n"); printf(" -i, --interactive run in interactive mode\n"); + printf(" --special special tokens output enabled\n"); + printf(" --interactive-specials allow special tokens in user text, in interactive mode\n"); 
printf(" --interactive-first run in interactive mode and wait for input right away\n"); + printf(" -cnv, --conversation run in conversation mode (does not print special tokens and suffix/prefix)\n"); printf(" -ins, --instruct run in instruction mode (use with Alpaca models)\n"); printf(" -cml, --chatml run in chatml mode (use with ChatML-compatible models)\n"); printf(" --multiline-input allows you to write or paste multiple lines without ending each in '\\'\n"); @@ -1553,6 +1506,7 @@ void gpt_print_usage(int /*argc*/, char ** argv, const gpt_params & params) { printf(" -mg i, --main-gpu i the GPU to use for the model (with split-mode = none),\n"); printf(" or for intermediate results and KV (with split-mode = row) (default: %d)\n", params.main_gpu); } + printf(" --rpc SERVERS comma separated list of RPC servers\n"); printf(" --verbose-prompt print a verbose prompt before generation (default: %s)\n", params.verbose_prompt ? "true" : "false"); printf(" --no-display-prompt don't print prompt at generation (default: %s)\n", !params.display_prompt ? 
"true" : "false"); printf(" -gan N, --grp-attn-n N\n"); @@ -1605,7 +1559,7 @@ void gpt_print_usage(int /*argc*/, char ** argv, const gpt_params & params) { #endif // LOG_DISABLE_LOGS } -std::string get_system_info(const gpt_params & params) { +std::string gpt_params_get_system_info(const gpt_params & params) { std::ostringstream os; os << "system_info: n_threads = " << params.n_threads; @@ -1617,7 +1571,52 @@ std::string get_system_info(const gpt_params & params) { return os.str(); } -std::string gpt_random_prompt(std::mt19937 & rng) { +// +// String utils +// + +std::vector string_split(std::string input, char separator) { + std::vector parts; + size_t separator_pos = input.find(separator); + while (separator_pos != std::string::npos) { + std::string part = input.substr(0, separator_pos); + parts.emplace_back(part); + input = input.substr(separator_pos + 1); + separator_pos = input.find(separator); + } + parts.emplace_back(input); + return parts; +} + +std::string string_strip(const std::string & str) { + size_t start = 0; + size_t end = str.size(); + while (start < end && std::isspace(str[start])) { + start++; + } + while (end > start && std::isspace(str[end - 1])) { + end--; + } + return str.substr(start, end - start); +} + +std::string string_get_sortable_timestamp() { + using clock = std::chrono::system_clock; + + const clock::time_point current_time = clock::now(); + const time_t as_time_t = clock::to_time_t(current_time); + char timestamp_no_ns[100]; + std::strftime(timestamp_no_ns, 100, "%Y_%m_%d-%H_%M_%S", std::localtime(&as_time_t)); + + const int64_t ns = std::chrono::duration_cast( + current_time.time_since_epoch() % 1000000000).count(); + char timestamp_ns[11]; + snprintf(timestamp_ns, 11, "%09" PRId64, ns); + + return std::string(timestamp_no_ns) + "." 
+ std::string(timestamp_ns); +} + +std::string string_random_prompt(std::mt19937 & rng) { const int r = rng() % 10; switch (r) { case 0: return "So"; @@ -1635,9 +1634,96 @@ std::string gpt_random_prompt(std::mt19937 & rng) { LM_GGML_UNREACHABLE(); } -// Validate if a filename is safe to use -// To validate a full path, split the path by the OS-specific path separator, and validate each part with this function -bool validate_file_name(const std::string & filename) { +void string_process_escapes(std::string & input) { + std::size_t input_len = input.length(); + std::size_t output_idx = 0; + + for (std::size_t input_idx = 0; input_idx < input_len; ++input_idx) { + if (input[input_idx] == '\\' && input_idx + 1 < input_len) { + switch (input[++input_idx]) { + case 'n': input[output_idx++] = '\n'; break; + case 'r': input[output_idx++] = '\r'; break; + case 't': input[output_idx++] = '\t'; break; + case '\'': input[output_idx++] = '\''; break; + case '\"': input[output_idx++] = '\"'; break; + case '\\': input[output_idx++] = '\\'; break; + case 'x': + // Handle \x12, etc + if (input_idx + 2 < input_len) { + const char x[3] = { input[input_idx + 1], input[input_idx + 2], 0 }; + char *err_p = nullptr; + const long val = std::strtol(x, &err_p, 16); + if (err_p == x + 2) { + input_idx += 2; + input[output_idx++] = char(val); + break; + } + } + // fall through + default: input[output_idx++] = '\\'; + input[output_idx++] = input[input_idx]; break; + } + } else { + input[output_idx++] = input[input_idx]; + } + } + + input.resize(output_idx); +} + +bool string_parse_kv_override(const char * data, std::vector & overrides) { + const char * sep = strchr(data, '='); + if (sep == nullptr || sep - data >= 128) { + fprintf(stderr, "%s: malformed KV override '%s'\n", __func__, data); + return false; + } + llama_model_kv_override kvo; + std::strncpy(kvo.key, data, sep - data); + kvo.key[sep - data] = 0; + sep++; + if (strncmp(sep, "int:", 4) == 0) { + sep += 4; + kvo.tag = 
LLAMA_KV_OVERRIDE_TYPE_INT; + kvo.val_i64 = std::atol(sep); + } else if (strncmp(sep, "float:", 6) == 0) { + sep += 6; + kvo.tag = LLAMA_KV_OVERRIDE_TYPE_FLOAT; + kvo.val_f64 = std::atof(sep); + } else if (strncmp(sep, "bool:", 5) == 0) { + sep += 5; + kvo.tag = LLAMA_KV_OVERRIDE_TYPE_BOOL; + if (std::strcmp(sep, "true") == 0) { + kvo.val_bool = true; + } else if (std::strcmp(sep, "false") == 0) { + kvo.val_bool = false; + } else { + fprintf(stderr, "%s: invalid boolean value for KV override '%s'\n", __func__, data); + return false; + } + } else if (strncmp(sep, "str:", 4) == 0) { + sep += 4; + kvo.tag = LLAMA_KV_OVERRIDE_TYPE_STR; + if (strlen(sep) > 127) { + fprintf(stderr, "%s: malformed KV override '%s', value cannot exceed 127 chars\n", __func__, data); + return false; + } + strncpy(kvo.val_str, sep, 127); + kvo.val_str[127] = '\0'; + } else { + fprintf(stderr, "%s: invalid type for KV override '%s'\n", __func__, data); + return false; + } + overrides.emplace_back(std::move(kvo)); + return true; +} + +// +// Filesystem utils +// + +// Validate if a filename is safe to use +// To validate a full path, split the path by the OS-specific path separator, and validate each part with this function +bool fs_validate_filename(const std::string & filename) { if (!filename.length()) { // Empty filename invalid return false; @@ -1706,173 +1792,252 @@ bool validate_file_name(const std::string & filename) { return true; } -// -// String utils -// +// returns true if successful, false otherwise +bool fs_create_directory_with_parents(const std::string & path) { +#ifdef _WIN32 + std::wstring_convert> converter; + std::wstring wpath = converter.from_bytes(path); -std::vector string_split(std::string input, char separator) { - std::vector parts; - size_t separator_pos = input.find(separator); - while (separator_pos != std::string::npos) { - std::string part = input.substr(0, separator_pos); - parts.emplace_back(part); - input = input.substr(separator_pos + 1); - separator_pos = 
input.find(separator); + // if the path already exists, check whether it's a directory + const DWORD attributes = GetFileAttributesW(wpath.c_str()); + if ((attributes != INVALID_FILE_ATTRIBUTES) && (attributes & FILE_ATTRIBUTE_DIRECTORY)) { + return true; } - parts.emplace_back(input); - return parts; -} -std::string string_strip(const std::string & str) { - size_t start = 0; - size_t end = str.size(); - while (start < end && std::isspace(str[start])) { - start++; - } - while (end > start && std::isspace(str[end - 1])) { - end--; - } - return str.substr(start, end - start); -} + size_t pos_slash = 0; -std::vector sampler_types_from_names(const std::vector & names, bool allow_alt_names) { - std::unordered_map sampler_canonical_name_map { - {"top_k", llama_sampler_type::TOP_K}, - {"top_p", llama_sampler_type::TOP_P}, - {"typical_p", llama_sampler_type::TYPICAL_P}, - {"min_p", llama_sampler_type::MIN_P}, - {"tfs_z", llama_sampler_type::TFS_Z}, - {"temperature", llama_sampler_type::TEMPERATURE} - }; + // process path from front to back, procedurally creating directories + while ((pos_slash = path.find('\\', pos_slash)) != std::string::npos) { + const std::wstring subpath = wpath.substr(0, pos_slash); + const wchar_t * test = subpath.c_str(); - // since samplers names are written multiple ways - // make it ready for both system names and input names - std::unordered_map sampler_alt_name_map { - {"top-k", llama_sampler_type::TOP_K}, - {"top-p", llama_sampler_type::TOP_P}, - {"nucleus", llama_sampler_type::TOP_P}, - {"typical-p", llama_sampler_type::TYPICAL_P}, - {"typical", llama_sampler_type::TYPICAL_P}, - {"min-p", llama_sampler_type::MIN_P}, - {"tfs-z", llama_sampler_type::TFS_Z}, - {"tfs", llama_sampler_type::TFS_Z}, - {"temp", llama_sampler_type::TEMPERATURE} - }; + const bool success = CreateDirectoryW(test, NULL); + if (!success) { + const DWORD error = GetLastError(); - std::vector sampler_types; - sampler_types.reserve(names.size()); - for (const auto & name : 
names) - { - auto sampler_item = sampler_canonical_name_map.find(name); - if (sampler_item != sampler_canonical_name_map.end()) - { - sampler_types.push_back(sampler_item->second); - } - else - { - if (allow_alt_names) - { - sampler_item = sampler_alt_name_map.find(name); - if (sampler_item != sampler_alt_name_map.end()) - { - sampler_types.push_back(sampler_item->second); + // if the path already exists, ensure that it's a directory + if (error == ERROR_ALREADY_EXISTS) { + const DWORD attributes = GetFileAttributesW(subpath.c_str()); + if (attributes == INVALID_FILE_ATTRIBUTES || !(attributes & FILE_ATTRIBUTE_DIRECTORY)) { + return false; } + } else { + return false; } } + + pos_slash += 1; } - return sampler_types; -} -std::vector sampler_types_from_chars(const std::string & names_string) { - std::unordered_map sampler_name_map { - {'k', llama_sampler_type::TOP_K}, - {'p', llama_sampler_type::TOP_P}, - {'y', llama_sampler_type::TYPICAL_P}, - {'m', llama_sampler_type::MIN_P}, - {'f', llama_sampler_type::TFS_Z}, - {'t', llama_sampler_type::TEMPERATURE} - }; + return true; +#else + // if the path already exists, check whether it's a directory + struct stat info; + if (stat(path.c_str(), &info) == 0) { + return S_ISDIR(info.st_mode); + } + + size_t pos_slash = 1; // skip leading slashes for directory creation + + // process path from front to back, procedurally creating directories + while ((pos_slash = path.find('/', pos_slash)) != std::string::npos) { + const std::string subpath = path.substr(0, pos_slash); + struct stat info; - std::vector sampler_types; - sampler_types.reserve(names_string.size()); - for (const auto & c : names_string) { - const auto sampler_item = sampler_name_map.find(c); - if (sampler_item != sampler_name_map.end()) { - sampler_types.push_back(sampler_item->second); + // if the path already exists, ensure that it's a directory + if (stat(subpath.c_str(), &info) == 0) { + if (!S_ISDIR(info.st_mode)) { + return false; + } + } else { + // create 
parent directories + const int ret = mkdir(subpath.c_str(), 0755); + if (ret != 0) { + return false; + } } + + pos_slash += 1; } - return sampler_types; + + return true; +#endif // _WIN32 } -std::string sampler_type_to_name_string(llama_sampler_type sampler_type) { - switch (sampler_type) { - case llama_sampler_type::TOP_K: return "top_k"; - case llama_sampler_type::TFS_Z: return "tfs_z"; - case llama_sampler_type::TYPICAL_P: return "typical_p"; - case llama_sampler_type::TOP_P: return "top_p"; - case llama_sampler_type::MIN_P: return "min_p"; - case llama_sampler_type::TEMPERATURE: return "temperature"; - default : return ""; +std::string fs_get_cache_directory() { + std::string cache_directory = ""; + auto ensure_trailing_slash = [](std::string p) { + // Make sure to add trailing slash + if (p.back() != DIRECTORY_SEPARATOR) { + p += DIRECTORY_SEPARATOR; + } + return p; + }; + if (getenv("LLAMA_CACHE")) { + cache_directory = std::getenv("LLAMA_CACHE"); + } else { +#ifdef __linux__ + if (std::getenv("XDG_CACHE_HOME")) { + cache_directory = std::getenv("XDG_CACHE_HOME"); + } else { + cache_directory = std::getenv("HOME") + std::string("/.cache/"); + } +#elif defined(__APPLE__) + cache_directory = std::getenv("HOME") + std::string("/Library/Caches/"); +#elif defined(_WIN32) + cache_directory = std::getenv("LOCALAPPDATA"); +#endif // __linux__ + cache_directory = ensure_trailing_slash(cache_directory); + cache_directory += "llama.cpp"; } + return ensure_trailing_slash(cache_directory); } + // // Model utils // -struct llama_model_params llama_model_params_from_gpt_params(const gpt_params & params) { - auto mparams = llama_model_default_params(); +std::tuple llama_init_from_gpt_params(gpt_params & params) { + auto mparams = llama_model_params_from_gpt_params(params); - if (params.n_gpu_layers != -1) { - mparams.n_gpu_layers = params.n_gpu_layers; - } - mparams.main_gpu = params.main_gpu; - mparams.split_mode = params.split_mode; - mparams.tensor_split = 
params.tensor_split; - mparams.use_mmap = params.use_mmap; - mparams.use_mlock = params.use_mlock; - mparams.check_tensors = params.check_tensors; - if (params.kv_overrides.empty()) { - mparams.kv_overrides = NULL; + llama_model * model = nullptr; + + if (!params.hf_repo.empty() && !params.hf_file.empty()) { + model = llama_load_model_from_hf(params.hf_repo.c_str(), params.hf_file.c_str(), params.model.c_str(), mparams); + } else if (!params.model_url.empty()) { + model = llama_load_model_from_url(params.model_url.c_str(), params.model.c_str(), mparams); } else { - LM_GGML_ASSERT(params.kv_overrides.back().key[0] == 0 && "KV overrides not terminated with empty key"); - mparams.kv_overrides = params.kv_overrides.data(); + model = llama_load_model_from_file(params.model.c_str(), mparams); } - return mparams; -} - -static lm_ggml_type kv_cache_type_from_str(const std::string & s) { - if (s == "f32") { - return LM_GGML_TYPE_F32; - } - if (s == "f16") { - return LM_GGML_TYPE_F16; - } - if (s == "q8_0") { - return LM_GGML_TYPE_Q8_0; - } - if (s == "q4_0") { - return LM_GGML_TYPE_Q4_0; - } - if (s == "q4_1") { - return LM_GGML_TYPE_Q4_1; - } - if (s == "iq4_nl") { - return LM_GGML_TYPE_IQ4_NL; - } - if (s == "q5_0") { - return LM_GGML_TYPE_Q5_0; + if (model == NULL) { + fprintf(stderr, "%s: error: failed to load model '%s'\n", __func__, params.model.c_str()); + return std::make_tuple(nullptr, nullptr); } - if (s == "q5_1") { - return LM_GGML_TYPE_Q5_1; + + auto cparams = llama_context_params_from_gpt_params(params); + + llama_context * lctx = llama_new_context_with_model(model, cparams); + if (lctx == NULL) { + fprintf(stderr, "%s: error: failed to create context with model '%s'\n", __func__, params.model.c_str()); + llama_free_model(model); + return std::make_tuple(nullptr, nullptr); } - throw std::runtime_error("Invalid cache type: " + s); -} + if (!params.control_vectors.empty()) { + if (params.control_vector_layer_start <= 0) params.control_vector_layer_start = 1; + 
if (params.control_vector_layer_end <= 0) params.control_vector_layer_end = llama_n_layer(model); -struct llama_context_params llama_context_params_from_gpt_params(const gpt_params & params) { - auto cparams = llama_context_default_params(); + const auto cvec = llama_control_vector_load(params.control_vectors); + if (cvec.n_embd == -1) { + llama_free(lctx); + llama_free_model(model); + return std::make_tuple(nullptr, nullptr); + } + + int err = llama_control_vector_apply(lctx, + cvec.data.data(), + cvec.data.size(), + cvec.n_embd, + params.control_vector_layer_start, + params.control_vector_layer_end); + if (err) { + llama_free(lctx); + llama_free_model(model); + return std::make_tuple(nullptr, nullptr); + } + } + + for (unsigned int i = 0; i < params.lora_adapter.size(); ++i) { + const std::string & lora_adapter = std::get<0>(params.lora_adapter[i]); + float lora_scale = std::get<1>(params.lora_adapter[i]); + int err = llama_model_apply_lora_from_file(model, + lora_adapter.c_str(), + lora_scale, + ((i > 0) || params.lora_base.empty()) + ? 
NULL + : params.lora_base.c_str(), + params.n_threads); + if (err != 0) { + fprintf(stderr, "%s: error: failed to apply lora adapter\n", __func__); + llama_free(lctx); + llama_free_model(model); + return std::make_tuple(nullptr, nullptr); + } + } + + if (params.ignore_eos) { + params.sparams.logit_bias[llama_token_eos(model)] = -INFINITY; + } + + if (params.warmup) { + LOG("warming up the model with an empty run\n"); + + std::vector tmp = { llama_token_bos(model), llama_token_eos(model), }; + llama_decode(lctx, llama_batch_get_one(tmp.data(), std::min(tmp.size(), (size_t) params.n_batch), 0, 0)); + llama_kv_cache_clear(lctx); + llama_synchronize(lctx); + llama_reset_timings(lctx); + } + + return std::make_tuple(model, lctx); +} + +struct llama_model_params llama_model_params_from_gpt_params(const gpt_params & params) { + auto mparams = llama_model_default_params(); + + if (params.n_gpu_layers != -1) { + mparams.n_gpu_layers = params.n_gpu_layers; + } + mparams.rpc_servers = params.rpc_servers.c_str(); + mparams.main_gpu = params.main_gpu; + mparams.split_mode = params.split_mode; + mparams.tensor_split = params.tensor_split; + mparams.use_mmap = params.use_mmap; + mparams.use_mlock = params.use_mlock; + mparams.check_tensors = params.check_tensors; + if (params.kv_overrides.empty()) { + mparams.kv_overrides = NULL; + } else { + LM_GGML_ASSERT(params.kv_overrides.back().key[0] == 0 && "KV overrides not terminated with empty key"); + mparams.kv_overrides = params.kv_overrides.data(); + } + + return mparams; +} + +static lm_ggml_type kv_cache_type_from_str(const std::string & s) { + if (s == "f32") { + return LM_GGML_TYPE_F32; + } + if (s == "f16") { + return LM_GGML_TYPE_F16; + } + if (s == "q8_0") { + return LM_GGML_TYPE_Q8_0; + } + if (s == "q4_0") { + return LM_GGML_TYPE_Q4_0; + } + if (s == "q4_1") { + return LM_GGML_TYPE_Q4_1; + } + if (s == "iq4_nl") { + return LM_GGML_TYPE_IQ4_NL; + } + if (s == "q5_0") { + return LM_GGML_TYPE_Q5_0; + } + if (s == "q5_1") { + 
return LM_GGML_TYPE_Q5_1; + } + + throw std::runtime_error("Invalid cache type: " + s); +} + +struct llama_context_params llama_context_params_from_gpt_params(const gpt_params & params) { + auto cparams = llama_context_default_params(); cparams.n_ctx = params.n_ctx; cparams.n_seq_max = params.n_parallel; @@ -1904,27 +2069,6 @@ struct llama_context_params llama_context_params_from_gpt_params(const gpt_param return cparams; } -void llama_batch_clear(struct llama_batch & batch) { - batch.n_tokens = 0; -} - -void llama_batch_add( - struct llama_batch & batch, - llama_token id, - llama_pos pos, - const std::vector & seq_ids, - bool logits) { - batch.token [batch.n_tokens] = id; - batch.pos [batch.n_tokens] = pos; - batch.n_seq_id[batch.n_tokens] = seq_ids.size(); - for (size_t i = 0; i < seq_ids.size(); ++i) { - batch.seq_id[batch.n_tokens][i] = seq_ids[i]; - } - batch.logits [batch.n_tokens] = logits; - - batch.n_tokens++; -} - #ifdef LLAMA_USE_CURL static bool starts_with(const std::string & str, const std::string & prefix) { @@ -1970,18 +2114,18 @@ static bool llama_download_file(const std::string & url, const std::string & pat try { metadata_in >> metadata; fprintf(stderr, "%s: previous metadata file found %s: %s\n", __func__, metadata_path.c_str(), metadata.dump().c_str()); - if (metadata.contains("url") && metadata["url"].is_string()) { - auto previous_url = metadata["url"].get(); + if (metadata.contains("url") && metadata.at("url").is_string()) { + auto previous_url = metadata.at("url").get(); if (previous_url != url) { fprintf(stderr, "%s: Model URL mismatch: %s != %s\n", __func__, url.c_str(), previous_url.c_str()); return false; } } - if (metadata.contains("etag") && metadata["etag"].is_string()) { - etag = metadata["etag"]; + if (metadata.contains("etag") && metadata.at("etag").is_string()) { + etag = metadata.at("etag"); } - if (metadata.contains("lastModified") && metadata["lastModified"].is_string()) { - last_modified = metadata["lastModified"]; + if 
(metadata.contains("lastModified") && metadata.at("lastModified").is_string()) { + last_modified = metadata.at("lastModified"); } } catch (const nlohmann::json::exception & e) { fprintf(stderr, "%s: error reading metadata file %s: %s\n", __func__, metadata_path.c_str(), e.what()); @@ -2255,90 +2399,29 @@ struct llama_model * llama_load_model_from_hf( #endif // LLAMA_USE_CURL -std::tuple llama_init_from_gpt_params(gpt_params & params) { - auto mparams = llama_model_params_from_gpt_params(params); - - llama_model * model = nullptr; - - if (!params.hf_repo.empty() && !params.hf_file.empty()) { - model = llama_load_model_from_hf(params.hf_repo.c_str(), params.hf_file.c_str(), params.model.c_str(), mparams); - } else if (!params.model_url.empty()) { - model = llama_load_model_from_url(params.model_url.c_str(), params.model.c_str(), mparams); - } else { - model = llama_load_model_from_file(params.model.c_str(), mparams); - } - - if (model == NULL) { - fprintf(stderr, "%s: error: failed to load model '%s'\n", __func__, params.model.c_str()); - return std::make_tuple(nullptr, nullptr); - } - - auto cparams = llama_context_params_from_gpt_params(params); - - llama_context * lctx = llama_new_context_with_model(model, cparams); - if (lctx == NULL) { - fprintf(stderr, "%s: error: failed to create context with model '%s'\n", __func__, params.model.c_str()); - llama_free_model(model); - return std::make_tuple(nullptr, nullptr); - } - - if (!params.control_vectors.empty()) { - if (params.control_vector_layer_start <= 0) params.control_vector_layer_start = 1; - if (params.control_vector_layer_end <= 0) params.control_vector_layer_end = llama_n_layer(model); - - const auto cvec = llama_control_vector_load(params.control_vectors); - if (cvec.n_embd == -1) { - llama_free(lctx); - llama_free_model(model); - return std::make_tuple(nullptr, nullptr); - } - - int err = llama_control_vector_apply(lctx, - cvec.data.data(), - cvec.data.size(), - cvec.n_embd, - 
params.control_vector_layer_start, - params.control_vector_layer_end); - if (err) { - llama_free(lctx); - llama_free_model(model); - return std::make_tuple(nullptr, nullptr); - } - } - - for (unsigned int i = 0; i < params.lora_adapter.size(); ++i) { - const std::string & lora_adapter = std::get<0>(params.lora_adapter[i]); - float lora_scale = std::get<1>(params.lora_adapter[i]); - int err = llama_model_apply_lora_from_file(model, - lora_adapter.c_str(), - lora_scale, - ((i > 0) || params.lora_base.empty()) - ? NULL - : params.lora_base.c_str(), - params.n_threads); - if (err != 0) { - fprintf(stderr, "%s: error: failed to apply lora adapter\n", __func__); - llama_free(lctx); - llama_free_model(model); - return std::make_tuple(nullptr, nullptr); - } - } - - if (params.ignore_eos) { - params.sparams.logit_bias[llama_token_eos(model)] = -INFINITY; - } +// +// Batch utils +// - if (params.warmup) { - LOG("warming up the model with an empty run\n"); +void llama_batch_clear(struct llama_batch & batch) { + batch.n_tokens = 0; +} - std::vector tmp = { llama_token_bos(model), llama_token_eos(model), }; - llama_decode(lctx, llama_batch_get_one(tmp.data(), std::min(tmp.size(), (size_t) params.n_batch), 0, 0)); - llama_kv_cache_clear(lctx); - llama_synchronize(lctx); - llama_reset_timings(lctx); +void llama_batch_add( + struct llama_batch & batch, + llama_token id, + llama_pos pos, + const std::vector & seq_ids, + bool logits) { + batch.token [batch.n_tokens] = id; + batch.pos [batch.n_tokens] = pos; + batch.n_seq_id[batch.n_tokens] = seq_ids.size(); + for (size_t i = 0; i < seq_ids.size(); ++i) { + batch.seq_id[batch.n_tokens][i] = seq_ids[i]; } + batch.logits [batch.n_tokens] = logits; - return std::make_tuple(model, lctx); + batch.n_tokens++; } // @@ -2391,355 +2474,46 @@ std::string llama_detokenize_spm(llama_context * ctx, const std::vector & tokens) { - std::string piece; - std::string result; - - for (size_t i = 0; i < tokens.size(); ++i) { - piece = 
llama_token_to_piece(ctx, tokens[i]); - - result += piece; - } - - // NOTE: the original tokenizer decodes bytes after collecting the pieces. - return result; -} - -bool llama_should_add_bos_token(const llama_model * model) { - const int add_bos = llama_add_bos_token(model); - - return add_bos != -1 ? bool(add_bos) : (llama_vocab_type(model) == LLAMA_VOCAB_TYPE_SPM); -} - -// -// YAML utils -// - -// returns true if successful, false otherwise -bool create_directory_with_parents(const std::string & path) { -#ifdef _WIN32 - std::wstring_convert> converter; - std::wstring wpath = converter.from_bytes(path); - - // if the path already exists, check whether it's a directory - const DWORD attributes = GetFileAttributesW(wpath.c_str()); - if ((attributes != INVALID_FILE_ATTRIBUTES) && (attributes & FILE_ATTRIBUTE_DIRECTORY)) { - return true; - } - - size_t pos_slash = 0; - - // process path from front to back, procedurally creating directories - while ((pos_slash = path.find('\\', pos_slash)) != std::string::npos) { - const std::wstring subpath = wpath.substr(0, pos_slash); - const wchar_t * test = subpath.c_str(); - - const bool success = CreateDirectoryW(test, NULL); - if (!success) { - const DWORD error = GetLastError(); - - // if the path already exists, ensure that it's a directory - if (error == ERROR_ALREADY_EXISTS) { - const DWORD attributes = GetFileAttributesW(subpath.c_str()); - if (attributes == INVALID_FILE_ATTRIBUTES || !(attributes & FILE_ATTRIBUTE_DIRECTORY)) { - return false; - } - } else { - return false; - } - } - - pos_slash += 1; - } - - return true; -#else - // if the path already exists, check whether it's a directory - struct stat info; - if (stat(path.c_str(), &info) == 0) { - return S_ISDIR(info.st_mode); - } - - size_t pos_slash = 1; // skip leading slashes for directory creation - - // process path from front to back, procedurally creating directories - while ((pos_slash = path.find('/', pos_slash)) != std::string::npos) { - const std::string 
subpath = path.substr(0, pos_slash); - struct stat info; - - // if the path already exists, ensure that it's a directory - if (stat(subpath.c_str(), &info) == 0) { - if (!S_ISDIR(info.st_mode)) { - return false; - } - } else { - // create parent directories - const int ret = mkdir(subpath.c_str(), 0755); - if (ret != 0) { - return false; - } - } - - pos_slash += 1; - } - - return true; -#endif // _WIN32 -} - -void dump_vector_float_yaml(FILE * stream, const char * prop_name, const std::vector & data) { - if (data.empty()) { - fprintf(stream, "%s:\n", prop_name); - return; - } - - fprintf(stream, "%s: [", prop_name); - for (size_t i = 0; i < data.size() - 1; ++i) { - fprintf(stream, "%e, ", data[i]); - } - fprintf(stream, "%e]\n", data.back()); -} - -void dump_vector_int_yaml(FILE * stream, const char * prop_name, const std::vector & data) { - if (data.empty()) { - fprintf(stream, "%s:\n", prop_name); - return; - } - - fprintf(stream, "%s: [", prop_name); - for (size_t i = 0; i < data.size() - 1; ++i) { - fprintf(stream, "%d, ", data[i]); - } - fprintf(stream, "%d]\n", data.back()); -} - -void dump_string_yaml_multiline(FILE * stream, const char * prop_name, const char * data) { - std::string data_str(data == NULL ? 
"" : data); - - if (data_str.empty()) { - fprintf(stream, "%s:\n", prop_name); - return; - } - - size_t pos_start = 0; - size_t pos_found = 0; - - if (!data_str.empty() && (std::isspace(data_str[0]) || std::isspace(data_str.back()))) { - data_str = std::regex_replace(data_str, std::regex("\n"), "\\n"); - data_str = std::regex_replace(data_str, std::regex("\""), "\\\""); - data_str = std::regex_replace(data_str, std::regex(R"(\\[^n"])"), R"(\$&)"); - data_str = "\"" + data_str + "\""; - fprintf(stream, "%s: %s\n", prop_name, data_str.c_str()); - return; - } - - if (data_str.find('\n') == std::string::npos) { - fprintf(stream, "%s: %s\n", prop_name, data_str.c_str()); - return; - } - - fprintf(stream, "%s: |\n", prop_name); - while ((pos_found = data_str.find('\n', pos_start)) != std::string::npos) { - fprintf(stream, " %s\n", data_str.substr(pos_start, pos_found-pos_start).c_str()); - pos_start = pos_found + 1; - } -} - -std::string get_sortable_timestamp() { - using clock = std::chrono::system_clock; - - const clock::time_point current_time = clock::now(); - const time_t as_time_t = clock::to_time_t(current_time); - char timestamp_no_ns[100]; - std::strftime(timestamp_no_ns, 100, "%Y_%m_%d-%H_%M_%S", std::localtime(&as_time_t)); - - const int64_t ns = std::chrono::duration_cast( - current_time.time_since_epoch() % 1000000000).count(); - char timestamp_ns[11]; - snprintf(timestamp_ns, 11, "%09" PRId64, ns); - - return std::string(timestamp_no_ns) + "." + std::string(timestamp_ns); -} - -void dump_non_result_info_yaml(FILE * stream, const gpt_params & params, const llama_context * lctx, - const std::string & timestamp, const std::vector & prompt_tokens, const char * model_desc) { - const llama_sampling_params & sparams = params.sparams; - - fprintf(stream, "build_commit: %s\n", LLAMA_COMMIT); - fprintf(stream, "build_number: %d\n", LLAMA_BUILD_NUMBER); - fprintf(stream, "cpu_has_arm_fma: %s\n", lm_ggml_cpu_has_arm_fma() ? 
"true" : "false"); - fprintf(stream, "cpu_has_avx: %s\n", lm_ggml_cpu_has_avx() ? "true" : "false"); - fprintf(stream, "cpu_has_avx_vnni: %s\n", lm_ggml_cpu_has_avx_vnni() ? "true" : "false"); - fprintf(stream, "cpu_has_avx2: %s\n", lm_ggml_cpu_has_avx2() ? "true" : "false"); - fprintf(stream, "cpu_has_avx512: %s\n", lm_ggml_cpu_has_avx512() ? "true" : "false"); - fprintf(stream, "cpu_has_avx512_vbmi: %s\n", lm_ggml_cpu_has_avx512_vbmi() ? "true" : "false"); - fprintf(stream, "cpu_has_avx512_vnni: %s\n", lm_ggml_cpu_has_avx512_vnni() ? "true" : "false"); - fprintf(stream, "cpu_has_cuda: %s\n", lm_ggml_cpu_has_cuda() ? "true" : "false"); - fprintf(stream, "cpu_has_vulkan: %s\n", lm_ggml_cpu_has_vulkan() ? "true" : "false"); - fprintf(stream, "cpu_has_clblast: %s\n", lm_ggml_cpu_has_clblast() ? "true" : "false"); - fprintf(stream, "cpu_has_kompute: %s\n", lm_ggml_cpu_has_kompute() ? "true" : "false"); - fprintf(stream, "cpu_has_fma: %s\n", lm_ggml_cpu_has_fma() ? "true" : "false"); - fprintf(stream, "cpu_has_gpublas: %s\n", lm_ggml_cpu_has_gpublas() ? "true" : "false"); - fprintf(stream, "cpu_has_neon: %s\n", lm_ggml_cpu_has_neon() ? "true" : "false"); - fprintf(stream, "cpu_has_f16c: %s\n", lm_ggml_cpu_has_f16c() ? "true" : "false"); - fprintf(stream, "cpu_has_fp16_va: %s\n", lm_ggml_cpu_has_fp16_va() ? "true" : "false"); - fprintf(stream, "cpu_has_wasm_simd: %s\n", lm_ggml_cpu_has_wasm_simd() ? "true" : "false"); - fprintf(stream, "cpu_has_blas: %s\n", lm_ggml_cpu_has_blas() ? "true" : "false"); - fprintf(stream, "cpu_has_sse3: %s\n", lm_ggml_cpu_has_sse3() ? "true" : "false"); - fprintf(stream, "cpu_has_vsx: %s\n", lm_ggml_cpu_has_vsx() ? "true" : "false"); - fprintf(stream, "cpu_has_matmul_int8: %s\n", lm_ggml_cpu_has_matmul_int8() ? 
"true" : "false"); - -#ifdef NDEBUG - fprintf(stream, "debug: false\n"); -#else - fprintf(stream, "debug: true\n"); -#endif // NDEBUG - - fprintf(stream, "model_desc: %s\n", model_desc); - fprintf(stream, "n_vocab: %d # output size of the final layer, 32001 for some models\n", llama_n_vocab(llama_get_model(lctx))); - -#ifdef __OPTIMIZE__ - fprintf(stream, "optimize: true\n"); -#else - fprintf(stream, "optimize: false\n"); -#endif // __OPTIMIZE__ - - fprintf(stream, "time: %s\n", timestamp.c_str()); - - fprintf(stream, "\n"); - fprintf(stream, "###############\n"); - fprintf(stream, "# User Inputs #\n"); - fprintf(stream, "###############\n"); - fprintf(stream, "\n"); - - fprintf(stream, "alias: %s # default: unknown\n", params.model_alias.c_str()); - fprintf(stream, "batch_size: %d # default: 512\n", params.n_batch); - dump_string_yaml_multiline(stream, "cfg_negative_prompt", sparams.cfg_negative_prompt.c_str()); - fprintf(stream, "cfg_scale: %f # default: 1.0\n", sparams.cfg_scale); - fprintf(stream, "chunks: %d # default: -1 (unlimited)\n", params.n_chunks); - fprintf(stream, "color: %s # default: false\n", params.use_color ? "true" : "false"); - fprintf(stream, "ctx_size: %d # default: 512\n", params.n_ctx); - fprintf(stream, "escape: %s # default: false\n", params.escape ? "true" : "false"); - fprintf(stream, "file: # never logged, see prompt instead. Can still be specified for input.\n"); - fprintf(stream, "frequency_penalty: %f # default: 0.0 \n", sparams.penalty_freq); - dump_string_yaml_multiline(stream, "grammar", sparams.grammar.c_str()); - fprintf(stream, "grammar-file: # never logged, see grammar instead. Can still be specified for input.\n"); - fprintf(stream, "hellaswag: %s # default: false\n", params.hellaswag ? 
"true" : "false"); - fprintf(stream, "hellaswag_tasks: %zu # default: 400\n", params.hellaswag_tasks); - - const auto logit_bias_eos = sparams.logit_bias.find(llama_token_eos(llama_get_model(lctx))); - const bool ignore_eos = logit_bias_eos != sparams.logit_bias.end() && logit_bias_eos->second == -INFINITY; - fprintf(stream, "ignore_eos: %s # default: false\n", ignore_eos ? "true" : "false"); - - dump_string_yaml_multiline(stream, "in_prefix", params.input_prefix.c_str()); - fprintf(stream, "in_prefix_bos: %s # default: false\n", params.input_prefix_bos ? "true" : "false"); - dump_string_yaml_multiline(stream, "in_suffix", params.input_prefix.c_str()); - fprintf(stream, "instruct: %s # default: false\n", params.instruct ? "true" : "false"); - fprintf(stream, "interactive: %s # default: false\n", params.interactive ? "true" : "false"); - fprintf(stream, "interactive_first: %s # default: false\n", params.interactive_first ? "true" : "false"); - fprintf(stream, "keep: %d # default: 0\n", params.n_keep); - fprintf(stream, "logdir: %s # default: unset (no logging)\n", params.logdir.c_str()); - - fprintf(stream, "logit_bias:\n"); - for (std::pair lb : sparams.logit_bias) { - if (ignore_eos && lb.first == logit_bias_eos->first) { - continue; - } - fprintf(stream, " %d: %f", lb.first, lb.second); - } - - fprintf(stream, "lora:\n"); - for (std::tuple la : params.lora_adapter) { - if (std::get<1>(la) != 1.0f) { - continue; - } - fprintf(stream, " - %s\n", std::get<0>(la).c_str()); - } - fprintf(stream, "lora_scaled:\n"); - for (std::tuple la : params.lora_adapter) { - if (std::get<1>(la) == 1.0f) { - continue; - } - fprintf(stream, " - %s: %f\n", std::get<0>(la).c_str(), std::get<1>(la)); - } - fprintf(stream, "lora_base: %s\n", params.lora_base.c_str()); - fprintf(stream, "main_gpu: %d # default: 0\n", params.main_gpu); - fprintf(stream, "min_keep: %d # default: 0 (disabled)\n", sparams.min_keep); - fprintf(stream, "mirostat: %d # default: 0 (disabled)\n", 
sparams.mirostat); - fprintf(stream, "mirostat_ent: %f # default: 5.0\n", sparams.mirostat_tau); - fprintf(stream, "mirostat_lr: %f # default: 0.1\n", sparams.mirostat_eta); - fprintf(stream, "mlock: %s # default: false\n", params.use_mlock ? "true" : "false"); - fprintf(stream, "model: %s # default: %s\n", params.model.c_str(), DEFAULT_MODEL_PATH); - fprintf(stream, "model_draft: %s # default:\n", params.model_draft.c_str()); - fprintf(stream, "multiline_input: %s # default: false\n", params.multiline_input ? "true" : "false"); - fprintf(stream, "n_gpu_layers: %d # default: -1\n", params.n_gpu_layers); - fprintf(stream, "n_predict: %d # default: -1 (unlimited)\n", params.n_predict); - fprintf(stream, "n_probs: %d # only used by server binary, default: 0\n", sparams.n_probs); - fprintf(stream, "no_mmap: %s # default: false\n", !params.use_mmap ? "true" : "false"); - fprintf(stream, "penalize_nl: %s # default: false\n", sparams.penalize_nl ? "true" : "false"); - fprintf(stream, "ppl_output_type: %d # default: 0\n", params.ppl_output_type); - fprintf(stream, "ppl_stride: %d # default: 0\n", params.ppl_stride); - fprintf(stream, "presence_penalty: %f # default: 0.0\n", sparams.penalty_present); - dump_string_yaml_multiline(stream, "prompt", params.prompt.c_str()); - fprintf(stream, "prompt_cache: %s\n", params.path_prompt_cache.c_str()); - fprintf(stream, "prompt_cache_all: %s # default: false\n", params.prompt_cache_all ? "true" : "false"); - fprintf(stream, "prompt_cache_ro: %s # default: false\n", params.prompt_cache_ro ? "true" : "false"); - dump_vector_int_yaml(stream, "prompt_tokens", prompt_tokens); - fprintf(stream, "random_prompt: %s # default: false\n", params.random_prompt ? 
"true" : "false"); - fprintf(stream, "repeat_penalty: %f # default: 1.1\n", sparams.penalty_repeat); - - fprintf(stream, "reverse_prompt:\n"); - for (std::string ap : params.antiprompt) { - size_t pos = 0; - while ((pos = ap.find('\n', pos)) != std::string::npos) { - ap.replace(pos, 1, "\\n"); - pos += 1; + + for (size_t i = 0; i < tokens.size(); ++i) { + piece = llama_token_to_piece(ctx, tokens[i]); + + // remove the leading space of the first non-BOS token + if (((tokens[0] == bos_id && i == 1) || (tokens[0] != bos_id && i == 0)) && piece[0] == ' ') { + piece = piece.substr(1); } - fprintf(stream, " - %s\n", ap.c_str()); + result += piece; } - fprintf(stream, "rope_freq_base: %f # default: 10000.0\n", params.rope_freq_base); - fprintf(stream, "rope_freq_scale: %f # default: 1.0\n", params.rope_freq_scale); - fprintf(stream, "seed: %u # default: -1 (random seed)\n", params.seed); - fprintf(stream, "simple_io: %s # default: false\n", params.simple_io ? "true" : "false"); - fprintf(stream, "cont_batching: %s # default: false\n", params.cont_batching ? "true" : "false"); - fprintf(stream, "flash_attn: %s # default: false\n", params.flash_attn ? 
"true" : "false"); - fprintf(stream, "temp: %f # default: 0.8\n", sparams.temp); + return result; +} - const std::vector tensor_split_vector(params.tensor_split, params.tensor_split + llama_max_devices()); - dump_vector_float_yaml(stream, "tensor_split", tensor_split_vector); +std::string llama_detokenize_bpe(llama_context * ctx, const std::vector & tokens) { + std::string piece; + std::string result; - fprintf(stream, "tfs: %f # default: 1.0\n", sparams.tfs_z); - fprintf(stream, "threads: %d # default: %u\n", params.n_threads, std::thread::hardware_concurrency()); - fprintf(stream, "top_k: %d # default: 40\n", sparams.top_k); - fprintf(stream, "top_p: %f # default: 0.95\n", sparams.top_p); - fprintf(stream, "min_p: %f # default: 0.0\n", sparams.min_p); - fprintf(stream, "typical_p: %f # default: 1.0\n", sparams.typical_p); - fprintf(stream, "verbose_prompt: %s # default: false\n", params.verbose_prompt ? "true" : "false"); - fprintf(stream, "display_prompt: %s # default: true\n", params.display_prompt ? "true" : "false"); + for (size_t i = 0; i < tokens.size(); ++i) { + piece = llama_token_to_piece(ctx, tokens[i]); + + result += piece; + } + + // NOTE: the original tokenizer decodes bytes after collecting the pieces. + return result; +} + +bool llama_should_add_bos_token(const llama_model * model) { + const int add_bos = llama_add_bos_token(model); + + return add_bos != -1 ? bool(add_bos) : (llama_vocab_type(model) == LLAMA_VOCAB_TYPE_SPM); } // // KV cache utils // -void dump_kv_cache_view(const llama_kv_cache_view & view, int row_size) { +void llama_kv_cache_dump_view(const llama_kv_cache_view & view, int row_size) { static const char slot_chars[] = ".123456789ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz+"; printf("=== Dumping KV cache. 
total cells %d, max sequences per cell %d, populated cells %d, total tokens in cache %d, largest empty slot=%d @ %d", @@ -2762,7 +2536,7 @@ void dump_kv_cache_view(const llama_kv_cache_view & view, int row_size) { printf("\n=== Done dumping\n"); } -void dump_kv_cache_view_seqs(const llama_kv_cache_view & view, int row_size) { +void llama_kv_cache_dump_view_seqs(const llama_kv_cache_view & view, int row_size) { static const char slot_chars[] = "0123456789ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz"; printf("=== Dumping KV cache. total cells %d, max sequences per cell %d, populated cells %d, total tokens in cache %d, largest empty slot=%d @ %d\n", @@ -2810,6 +2584,10 @@ void dump_kv_cache_view_seqs(const llama_kv_cache_view & view, int row_size) { printf("\n=== Done dumping\n"); } +// +// Embedding utils +// + void llama_embd_normalize(const float * inp, float * out, int n) { double sum = 0.0; for (int i = 0; i < n; i++) { @@ -2994,3 +2772,226 @@ llama_control_vector_data llama_control_vector_load(const std::vector & data) { + if (data.empty()) { + fprintf(stream, "%s:\n", prop_name); + return; + } + + fprintf(stream, "%s: [", prop_name); + for (size_t i = 0; i < data.size() - 1; ++i) { + fprintf(stream, "%e, ", data[i]); + } + fprintf(stream, "%e]\n", data.back()); +} + +void yaml_dump_vector_int(FILE * stream, const char * prop_name, const std::vector & data) { + if (data.empty()) { + fprintf(stream, "%s:\n", prop_name); + return; + } + + fprintf(stream, "%s: [", prop_name); + for (size_t i = 0; i < data.size() - 1; ++i) { + fprintf(stream, "%d, ", data[i]); + } + fprintf(stream, "%d]\n", data.back()); +} + +void yaml_dump_string_multiline(FILE * stream, const char * prop_name, const char * data) { + std::string data_str(data == NULL ? 
"" : data); + + if (data_str.empty()) { + fprintf(stream, "%s:\n", prop_name); + return; + } + + size_t pos_start = 0; + size_t pos_found = 0; + + if (std::isspace(data_str[0]) || std::isspace(data_str.back())) { + data_str = std::regex_replace(data_str, std::regex("\n"), "\\n"); + data_str = std::regex_replace(data_str, std::regex("\""), "\\\""); + data_str = std::regex_replace(data_str, std::regex(R"(\\[^n"])"), R"(\$&)"); + data_str = "\"" + data_str + "\""; + fprintf(stream, "%s: %s\n", prop_name, data_str.c_str()); + return; + } + + if (data_str.find('\n') == std::string::npos) { + fprintf(stream, "%s: %s\n", prop_name, data_str.c_str()); + return; + } + + fprintf(stream, "%s: |\n", prop_name); + while ((pos_found = data_str.find('\n', pos_start)) != std::string::npos) { + fprintf(stream, " %s\n", data_str.substr(pos_start, pos_found-pos_start).c_str()); + pos_start = pos_found + 1; + } +} + +void yaml_dump_non_result_info(FILE * stream, const gpt_params & params, const llama_context * lctx, + const std::string & timestamp, const std::vector & prompt_tokens, const char * model_desc) { + const llama_sampling_params & sparams = params.sparams; + + fprintf(stream, "build_commit: %s\n", LLAMA_COMMIT); + fprintf(stream, "build_number: %d\n", LLAMA_BUILD_NUMBER); + fprintf(stream, "cpu_has_arm_fma: %s\n", lm_ggml_cpu_has_arm_fma() ? "true" : "false"); + fprintf(stream, "cpu_has_avx: %s\n", lm_ggml_cpu_has_avx() ? "true" : "false"); + fprintf(stream, "cpu_has_avx_vnni: %s\n", lm_ggml_cpu_has_avx_vnni() ? "true" : "false"); + fprintf(stream, "cpu_has_avx2: %s\n", lm_ggml_cpu_has_avx2() ? "true" : "false"); + fprintf(stream, "cpu_has_avx512: %s\n", lm_ggml_cpu_has_avx512() ? "true" : "false"); + fprintf(stream, "cpu_has_avx512_vbmi: %s\n", lm_ggml_cpu_has_avx512_vbmi() ? "true" : "false"); + fprintf(stream, "cpu_has_avx512_vnni: %s\n", lm_ggml_cpu_has_avx512_vnni() ? "true" : "false"); + fprintf(stream, "cpu_has_cuda: %s\n", lm_ggml_cpu_has_cuda() ? 
"true" : "false"); + fprintf(stream, "cpu_has_vulkan: %s\n", lm_ggml_cpu_has_vulkan() ? "true" : "false"); + fprintf(stream, "cpu_has_clblast: %s\n", lm_ggml_cpu_has_clblast() ? "true" : "false"); + fprintf(stream, "cpu_has_kompute: %s\n", lm_ggml_cpu_has_kompute() ? "true" : "false"); + fprintf(stream, "cpu_has_fma: %s\n", lm_ggml_cpu_has_fma() ? "true" : "false"); + fprintf(stream, "cpu_has_gpublas: %s\n", lm_ggml_cpu_has_gpublas() ? "true" : "false"); + fprintf(stream, "cpu_has_neon: %s\n", lm_ggml_cpu_has_neon() ? "true" : "false"); + fprintf(stream, "cpu_has_sve: %s\n", lm_ggml_cpu_has_sve() ? "true" : "false"); + fprintf(stream, "cpu_has_f16c: %s\n", lm_ggml_cpu_has_f16c() ? "true" : "false"); + fprintf(stream, "cpu_has_fp16_va: %s\n", lm_ggml_cpu_has_fp16_va() ? "true" : "false"); + fprintf(stream, "cpu_has_wasm_simd: %s\n", lm_ggml_cpu_has_wasm_simd() ? "true" : "false"); + fprintf(stream, "cpu_has_blas: %s\n", lm_ggml_cpu_has_blas() ? "true" : "false"); + fprintf(stream, "cpu_has_sse3: %s\n", lm_ggml_cpu_has_sse3() ? "true" : "false"); + fprintf(stream, "cpu_has_vsx: %s\n", lm_ggml_cpu_has_vsx() ? "true" : "false"); + fprintf(stream, "cpu_has_matmul_int8: %s\n", lm_ggml_cpu_has_matmul_int8() ? 
"true" : "false"); + +#ifdef NDEBUG + fprintf(stream, "debug: false\n"); +#else + fprintf(stream, "debug: true\n"); +#endif // NDEBUG + + fprintf(stream, "model_desc: %s\n", model_desc); + fprintf(stream, "n_vocab: %d # output size of the final layer, 32001 for some models\n", llama_n_vocab(llama_get_model(lctx))); + +#ifdef __OPTIMIZE__ + fprintf(stream, "optimize: true\n"); +#else + fprintf(stream, "optimize: false\n"); +#endif // __OPTIMIZE__ + + fprintf(stream, "time: %s\n", timestamp.c_str()); + + fprintf(stream, "\n"); + fprintf(stream, "###############\n"); + fprintf(stream, "# User Inputs #\n"); + fprintf(stream, "###############\n"); + fprintf(stream, "\n"); + + fprintf(stream, "alias: %s # default: unknown\n", params.model_alias.c_str()); + fprintf(stream, "batch_size: %d # default: 512\n", params.n_batch); + yaml_dump_string_multiline(stream, "cfg_negative_prompt", sparams.cfg_negative_prompt.c_str()); + fprintf(stream, "cfg_scale: %f # default: 1.0\n", sparams.cfg_scale); + fprintf(stream, "chunks: %d # default: -1 (unlimited)\n", params.n_chunks); + fprintf(stream, "color: %s # default: false\n", params.use_color ? "true" : "false"); + fprintf(stream, "ctx_size: %d # default: 512\n", params.n_ctx); + fprintf(stream, "escape: %s # default: false\n", params.escape ? "true" : "false"); + fprintf(stream, "file: # never logged, see prompt instead. Can still be specified for input.\n"); + fprintf(stream, "frequency_penalty: %f # default: 0.0 \n", sparams.penalty_freq); + yaml_dump_string_multiline(stream, "grammar", sparams.grammar.c_str()); + fprintf(stream, "grammar-file: # never logged, see grammar instead. Can still be specified for input.\n"); + fprintf(stream, "hellaswag: %s # default: false\n", params.hellaswag ? 
"true" : "false"); + fprintf(stream, "hellaswag_tasks: %zu # default: 400\n", params.hellaswag_tasks); + + const auto logit_bias_eos = sparams.logit_bias.find(llama_token_eos(llama_get_model(lctx))); + const bool ignore_eos = logit_bias_eos != sparams.logit_bias.end() && logit_bias_eos->second == -INFINITY; + fprintf(stream, "ignore_eos: %s # default: false\n", ignore_eos ? "true" : "false"); + + yaml_dump_string_multiline(stream, "in_prefix", params.input_prefix.c_str()); + fprintf(stream, "in_prefix_bos: %s # default: false\n", params.input_prefix_bos ? "true" : "false"); + yaml_dump_string_multiline(stream, "in_suffix", params.input_prefix.c_str()); + fprintf(stream, "instruct: %s # default: false\n", params.instruct ? "true" : "false"); + fprintf(stream, "interactive: %s # default: false\n", params.interactive ? "true" : "false"); + fprintf(stream, "interactive_specials: %s # default: false\n", params.interactive_specials ? "true" : "false"); + fprintf(stream, "interactive_first: %s # default: false\n", params.interactive_first ? 
"true" : "false"); + fprintf(stream, "keep: %d # default: 0\n", params.n_keep); + fprintf(stream, "logdir: %s # default: unset (no logging)\n", params.logdir.c_str()); + + fprintf(stream, "logit_bias:\n"); + for (std::pair lb : sparams.logit_bias) { + if (ignore_eos && lb.first == logit_bias_eos->first) { + continue; + } + fprintf(stream, " %d: %f", lb.first, lb.second); + } + + fprintf(stream, "lora:\n"); + for (std::tuple la : params.lora_adapter) { + if (std::get<1>(la) != 1.0f) { + continue; + } + fprintf(stream, " - %s\n", std::get<0>(la).c_str()); + } + fprintf(stream, "lora_scaled:\n"); + for (std::tuple la : params.lora_adapter) { + if (std::get<1>(la) == 1.0f) { + continue; + } + fprintf(stream, " - %s: %f\n", std::get<0>(la).c_str(), std::get<1>(la)); + } + fprintf(stream, "lora_base: %s\n", params.lora_base.c_str()); + fprintf(stream, "main_gpu: %d # default: 0\n", params.main_gpu); + fprintf(stream, "min_keep: %d # default: 0 (disabled)\n", sparams.min_keep); + fprintf(stream, "mirostat: %d # default: 0 (disabled)\n", sparams.mirostat); + fprintf(stream, "mirostat_ent: %f # default: 5.0\n", sparams.mirostat_tau); + fprintf(stream, "mirostat_lr: %f # default: 0.1\n", sparams.mirostat_eta); + fprintf(stream, "mlock: %s # default: false\n", params.use_mlock ? "true" : "false"); + fprintf(stream, "model: %s # default: %s\n", params.model.c_str(), DEFAULT_MODEL_PATH); + fprintf(stream, "model_draft: %s # default:\n", params.model_draft.c_str()); + fprintf(stream, "multiline_input: %s # default: false\n", params.multiline_input ? "true" : "false"); + fprintf(stream, "n_gpu_layers: %d # default: -1\n", params.n_gpu_layers); + fprintf(stream, "n_predict: %d # default: -1 (unlimited)\n", params.n_predict); + fprintf(stream, "n_probs: %d # only used by server binary, default: 0\n", sparams.n_probs); + fprintf(stream, "no_mmap: %s # default: false\n", !params.use_mmap ? 
"true" : "false"); + fprintf(stream, "penalize_nl: %s # default: false\n", sparams.penalize_nl ? "true" : "false"); + fprintf(stream, "ppl_output_type: %d # default: 0\n", params.ppl_output_type); + fprintf(stream, "ppl_stride: %d # default: 0\n", params.ppl_stride); + fprintf(stream, "presence_penalty: %f # default: 0.0\n", sparams.penalty_present); + yaml_dump_string_multiline(stream, "prompt", params.prompt.c_str()); + fprintf(stream, "prompt_cache: %s\n", params.path_prompt_cache.c_str()); + fprintf(stream, "prompt_cache_all: %s # default: false\n", params.prompt_cache_all ? "true" : "false"); + fprintf(stream, "prompt_cache_ro: %s # default: false\n", params.prompt_cache_ro ? "true" : "false"); + yaml_dump_vector_int(stream, "prompt_tokens", prompt_tokens); + fprintf(stream, "random_prompt: %s # default: false\n", params.random_prompt ? "true" : "false"); + fprintf(stream, "repeat_penalty: %f # default: 1.1\n", sparams.penalty_repeat); + + fprintf(stream, "reverse_prompt:\n"); + for (std::string ap : params.antiprompt) { + size_t pos = 0; + while ((pos = ap.find('\n', pos)) != std::string::npos) { + ap.replace(pos, 1, "\\n"); + pos += 1; + } + + fprintf(stream, " - %s\n", ap.c_str()); + } + + fprintf(stream, "rope_freq_base: %f # default: 10000.0\n", params.rope_freq_base); + fprintf(stream, "rope_freq_scale: %f # default: 1.0\n", params.rope_freq_scale); + fprintf(stream, "seed: %u # default: -1 (random seed)\n", params.seed); + fprintf(stream, "simple_io: %s # default: false\n", params.simple_io ? "true" : "false"); + fprintf(stream, "cont_batching: %s # default: false\n", params.cont_batching ? "true" : "false"); + fprintf(stream, "flash_attn: %s # default: false\n", params.flash_attn ? 
"true" : "false"); + fprintf(stream, "temp: %f # default: 0.8\n", sparams.temp); + + const std::vector tensor_split_vector(params.tensor_split, params.tensor_split + llama_max_devices()); + yaml_dump_vector_float(stream, "tensor_split", tensor_split_vector); + + fprintf(stream, "tfs: %f # default: 1.0\n", sparams.tfs_z); + fprintf(stream, "threads: %d # default: %u\n", params.n_threads, std::thread::hardware_concurrency()); + fprintf(stream, "top_k: %d # default: 40\n", sparams.top_k); + fprintf(stream, "top_p: %f # default: 0.95\n", sparams.top_p); + fprintf(stream, "min_p: %f # default: 0.0\n", sparams.min_p); + fprintf(stream, "typical_p: %f # default: 1.0\n", sparams.typical_p); + fprintf(stream, "verbose_prompt: %s # default: false\n", params.verbose_prompt ? "true" : "false"); + fprintf(stream, "display_prompt: %s # default: true\n", params.display_prompt ? "true" : "false"); +} diff --git a/cpp/common.h b/cpp/common.h index e326cb18..19585bfa 100644 --- a/cpp/common.h +++ b/cpp/common.h @@ -27,7 +27,7 @@ #define die_fmt(fmt, ...) 
do { fprintf(stderr, "error: " fmt "\n", __VA_ARGS__); exit(1); } while (0) #define print_build_info() do { \ - fprintf(stderr, "%s: build = %d (%s)\n", __func__, LLAMA_BUILD_NUMBER, LLAMA_COMMIT); \ + fprintf(stderr, "%s: build = %d (%s)\n", __func__, LLAMA_BUILD_NUMBER, LLAMA_COMMIT); \ fprintf(stderr, "%s: built with %s for %s\n", __func__, LLAMA_COMPILER, LLAMA_BUILD_TARGET); \ } while(0) @@ -35,15 +35,12 @@ // build info extern int LLAMA_BUILD_NUMBER; -extern char const *LLAMA_COMMIT; -extern char const *LLAMA_COMPILER; -extern char const *LLAMA_BUILD_TARGET; +extern char const * LLAMA_COMMIT; +extern char const * LLAMA_COMPILER; +extern char const * LLAMA_BUILD_TARGET; struct llama_control_vector_load_info; -int get_math_cpu_count(); -int32_t get_num_physical_cores(); - #define print_build_info() do { \ fprintf(stderr, "%s: build = %d (%s)\n", __func__, LLAMA_BUILD_NUMBER, LLAMA_COMMIT); \ fprintf(stderr, "%s: built with %s for %s\n", __func__, LLAMA_COMPILER, LLAMA_BUILD_TARGET); \ @@ -55,6 +52,13 @@ extern char const *LLAMA_COMMIT; extern char const *LLAMA_COMPILER; extern char const *LLAMA_BUILD_TARGET; +// +// CPU utils +// + +int32_t cpu_get_num_physical_cores(); +int32_t cpu_get_num_math(); + // // CLI argument parsing // @@ -62,7 +66,7 @@ extern char const *LLAMA_BUILD_TARGET; struct gpt_params { uint32_t seed = LLAMA_DEFAULT_SEED; // RNG seed - int32_t n_threads = get_math_cpu_count(); + int32_t n_threads = cpu_get_num_math(); int32_t n_threads_draft = -1; int32_t n_threads_batch = -1; // number of threads to use for batch processing (-1 = use n_threads) int32_t n_threads_batch_draft = -1; @@ -93,6 +97,7 @@ struct gpt_params { float yarn_beta_slow = 1.0f; // YaRN high correction dim int32_t yarn_orig_ctx = 0; // YaRN original context length float defrag_thold = -1.0f; // KV cache defragmentation threshold + std::string rpc_servers = ""; // comma separated list of RPC servers lm_ggml_backend_sched_eval_callback cb_eval = nullptr; void * 
cb_eval_user_data = nullptr; @@ -151,6 +156,9 @@ struct gpt_params { bool random_prompt = false; // do not randomize prompt if none provided bool use_color = false; // use color to distinguish generations and inputs bool interactive = false; // interactive mode + bool interactive_specials = false; // whether to allow special tokens from user, during interactive mode + bool special = false; // enable special token output + bool conversation = false; // conversation mode (does not print special tokens and suffix/prefix) bool chatml = false; // chatml mode (used for models trained on chatml syntax) bool prompt_cache_all = false; // save user input and generations to prompt cache bool prompt_cache_ro = false; // open the prompt cache read-only and do not update it @@ -187,33 +195,34 @@ struct gpt_params { void gpt_params_handle_model_default(gpt_params & params); -bool parse_kv_override(const char * data, std::vector & overrides); - -bool gpt_params_parse_ex(int argc, char ** argv, gpt_params & params); +bool gpt_params_parse_ex (int argc, char ** argv, gpt_params & params); +bool gpt_params_parse (int argc, char ** argv, gpt_params & params); +bool gpt_params_find_arg (int argc, char ** argv, const std::string & arg, gpt_params & params, int & i, bool & invalid_param); +void gpt_params_print_usage(int argc, char ** argv, const gpt_params & params); -bool gpt_params_parse(int argc, char ** argv, gpt_params & params); +std::string gpt_params_get_system_info(const gpt_params & params); -void gpt_print_usage(int argc, char ** argv, const gpt_params & params); - -bool gpt_params_find_arg(int argc, char ** argv, const std::string & arg, gpt_params & params, int & i, bool & invalid_param); - -std::string get_system_info(const gpt_params & params); +// +// String utils +// -std::string gpt_random_prompt(std::mt19937 & rng); +std::vector string_split(std::string input, char separator); -void process_escapes(std::string& input); +std::string string_strip(const std::string & 
str); +std::string string_get_sortable_timestamp(); +std::string string_random_prompt(std::mt19937 & rng); -bool validate_file_name(const std::string & filename); +bool string_parse_kv_override(const char * data, std::vector & overrides); +void string_process_escapes(std::string & input); // -// String utils +// Filesystem utils // -std::vector sampler_types_from_names(const std::vector & names, bool allow_alt_names); -std::vector sampler_types_from_chars(const std::string & names_string); -std::vector string_split(std::string input, char separator); -std::string string_strip(const std::string & str); -std::string sampler_type_to_name_string(llama_sampler_type sampler_type); +bool fs_validate_filename(const std::string & filename); +bool fs_create_directory_with_parents(const std::string & path); + +std::string fs_get_cache_directory(); // // Model utils @@ -284,29 +293,15 @@ std::string llama_detokenize_bpe( // defaults to true when model type is SPM, otherwise false. bool llama_should_add_bos_token(const llama_model * model); -// -// YAML utils -// - -bool create_directory_with_parents(const std::string & path); -void dump_vector_float_yaml(FILE * stream, const char * prop_name, const std::vector & data); -void dump_vector_int_yaml(FILE * stream, const char * prop_name, const std::vector & data); -void dump_string_yaml_multiline(FILE * stream, const char * prop_name, const char * data); -std::string get_sortable_timestamp(); - -void dump_non_result_info_yaml( - FILE * stream, const gpt_params & params, const llama_context * lctx, - const std::string & timestamp, const std::vector & prompt_tokens, const char * model_desc); - // // KV cache utils // // Dump the KV cache view with the number of sequences per cell. -void dump_kv_cache_view(const llama_kv_cache_view & view, int row_size = 80); +void llama_kv_cache_dump_view(const llama_kv_cache_view & view, int row_size = 80); // Dump the KV cache view showing individual sequences in each cell (long output). 
-void dump_kv_cache_view_seqs(const llama_kv_cache_view & view, int row_size = 40); +void llama_kv_cache_dump_view_seqs(const llama_kv_cache_view & view, int row_size = 40); // // Embedding utils @@ -340,6 +335,20 @@ llama_control_vector_data llama_control_vector_load(const std::vector & data); +void yaml_dump_vector_int (FILE * stream, const char * prop_name, const std::vector & data); +void yaml_dump_string_multiline(FILE * stream, const char * prop_name, const char * data); + +void yaml_dump_non_result_info( + FILE * stream, const gpt_params & params, const llama_context * lctx, + const std::string & timestamp, const std::vector & prompt_tokens, const char * model_desc); + diff --git a/cpp/ggml-backend.c b/cpp/ggml-backend.c index 744fd39b..1e0de870 100644 --- a/cpp/ggml-backend.c +++ b/cpp/ggml-backend.c @@ -1182,9 +1182,9 @@ static int lm_ggml_backend_sched_backend_id_from_cur(lm_ggml_backend_sched_t sch static char * fmt_size(size_t size) { static char buffer[128]; if (size >= 1024*1024) { - sprintf(buffer, "%zuM", size/1024/1024); + snprintf(buffer, sizeof(buffer), "%zuM", size/1024/1024); } else { - sprintf(buffer, "%zuK", size/1024); + snprintf(buffer, sizeof(buffer), "%zuK", size/1024); } return buffer; } @@ -1895,7 +1895,6 @@ void lm_ggml_backend_view_init(lm_ggml_backend_buffer_t buffer, struct lm_ggml_t tensor->buffer = buffer; tensor->data = (char *)tensor->view_src->data + tensor->view_offs; - tensor->backend = tensor->view_src->backend; lm_ggml_backend_buffer_init_tensor(buffer, tensor); } diff --git a/cpp/ggml-common.h b/cpp/ggml-common.h index de9fdf56..d70faf82 100644 --- a/cpp/ggml-common.h +++ b/cpp/ggml-common.h @@ -65,13 +65,8 @@ typedef sycl::half2 lm_ggml_half2; // QK = number of values after dequantization // QK_K = super-block size -#ifdef LM_GGML_QKK_64 -#define QK_K 64 -#define K_SCALE_SIZE 4 -#else #define QK_K 256 #define K_SCALE_SIZE 12 -#endif // LM_GGML_QKK_64 #if defined(LM_GGML_COMMON_DECL_CUDA) || 
defined(LM_GGML_COMMON_DECL_HIP) || defined(LM_GGML_COMMON_DECL_SYCL) // QR = QK / number of values before dequantization @@ -131,13 +126,8 @@ typedef sycl::half2 lm_ggml_half2; #define QI4_NL (QK4_NL / (4*QR4_NL)) #define QR4_NL 2 -#if QK_K == 64 -#define QI4_XS QI4_NL -#define QR4_XS QR4_NL -#else #define QI4_XS (QK_K / (4*QR4_XS)) #define QR4_XS 8 -#endif #endif // LM_GGML_COMMON_DECL_CUDA || LM_GGML_COMMON_DECL_HIP @@ -228,15 +218,6 @@ static_assert(sizeof(block_q2_K) == 2*sizeof(lm_ggml_half) + QK_K/16 + QK_K/4, " // weight is represented as x = a * q // 16 blocks of 16 elements each // Effectively 3.4375 bits per weight -#ifdef LM_GGML_QKK_64 -typedef struct { - uint8_t hmask[QK_K/8]; // quants - high bit - uint8_t qs[QK_K/4]; // quants - low 2 bits - uint8_t scales[2]; - lm_ggml_half d; // super-block scale -} block_q3_K; -static_assert(sizeof(block_q3_K) == sizeof(lm_ggml_half) + QK_K / 4 + QK_K / 8 + 2, "wrong q3_K block size/padding"); -#else typedef struct { uint8_t hmask[QK_K/8]; // quants - high bit uint8_t qs[QK_K/4]; // quants - low 2 bits @@ -244,20 +225,11 @@ typedef struct { lm_ggml_half d; // super-block scale } block_q3_K; static_assert(sizeof(block_q3_K) == sizeof(lm_ggml_half) + QK_K / 4 + QK_K / 8 + 12, "wrong q3_K block size/padding"); -#endif // 4-bit quantization // 8 blocks of 32 elements each // weight is represented as x = a * q + b // Effectively 4.5 bits per weight -#ifdef LM_GGML_QKK_64 -typedef struct { - lm_ggml_half d[2]; // super-block scales/mins - uint8_t scales[2]; // 4-bit block scales/mins - uint8_t qs[QK_K/2]; // 4--bit quants -} block_q4_K; -static_assert(sizeof(block_q4_K) == 2*sizeof(lm_ggml_half) + QK_K/2 + 2, "wrong q4_K block size/padding"); -#else typedef struct { union { struct { @@ -270,21 +242,11 @@ typedef struct { uint8_t qs[QK_K/2]; // 4--bit quants } block_q4_K; static_assert(sizeof(block_q4_K) == 2*sizeof(lm_ggml_half) + K_SCALE_SIZE + QK_K/2, "wrong q4_K block size/padding"); -#endif // 5-bit quantization // 
8 blocks of 32 elements each // weight is represented as x = a * q + b // Effectively 5.5 bits per weight -#ifdef LM_GGML_QKK_64 -typedef struct { - lm_ggml_half d; // super-block scale - int8_t scales[QK_K/16]; // 8-bit block scales - uint8_t qh[QK_K/8]; // quants, high bit - uint8_t qs[QK_K/2]; // quants, low 4 bits -} block_q5_K; -static_assert(sizeof(block_q5_K) == sizeof(lm_ggml_half) + QK_K/2 + QK_K/8 + QK_K/16, "wrong q5_K block size/padding"); -#else typedef struct { union { struct { @@ -298,7 +260,6 @@ typedef struct { uint8_t qs[QK_K/2]; // quants, low 4 bits } block_q5_K; static_assert(sizeof(block_q5_K) == 2*sizeof(lm_ggml_half) + K_SCALE_SIZE + QK_K/2 + QK_K/8, "wrong q5_K block size/padding"); -#endif // 6-bit quantization // weight is represented as x = a * q @@ -356,11 +317,7 @@ typedef struct { static_assert(sizeof(block_iq3_xxs) == sizeof(lm_ggml_half) + 3*(QK_K/8), "wrong iq3_xxs block size/padding"); // 3.4375 bpw -#if QK_K == 64 -#define IQ3S_N_SCALE 2 -#else #define IQ3S_N_SCALE QK_K/64 -#endif typedef struct { lm_ggml_half d; uint8_t qs[QK_K/4]; @@ -381,16 +338,9 @@ static_assert(sizeof(block_iq1_s) == sizeof(lm_ggml_half) + QK_K/8 + QK_K/16, "w typedef struct { uint8_t qs[QK_K/8]; // grid index, low 8 bits uint8_t qh[QK_K/16]; // grid index, high 3 bits + grid shift bit (for two groups of 8) -#if QK_K == 64 - lm_ggml_half d; -#endif uint8_t scales[QK_K/32]; // 3-bit block scales (4-bit if QK_K == 64) } block_iq1_m; -#if QK_K == 64 -static_assert(sizeof(block_iq1_m) == QK_K/8 + QK_K/16 + QK_K/32 + sizeof(lm_ggml_half), "wrong iq1_m block size/padding"); -#else static_assert(sizeof(block_iq1_m) == QK_K/8 + QK_K/16 + QK_K/32, "wrong iq1_m block size/padding"); -#endif // Used by IQ1_M quants typedef union { @@ -406,9 +356,6 @@ typedef struct { } block_iq4_nl; static_assert(sizeof(block_iq4_nl) == sizeof(lm_ggml_half) + QK4_NL/2, "wrong iq4_nl block size/padding"); -#if QK_K == 64 -#define block_iq4_xs block_iq4_nl -#else typedef struct { 
lm_ggml_half d; uint16_t scales_h; @@ -416,7 +363,6 @@ typedef struct { uint8_t qs[QK_K/2]; } block_iq4_xs; static_assert(sizeof(block_iq4_xs) == sizeof(lm_ggml_half) + sizeof(uint16_t) + QK_K/64 + QK_K/2, "wrong iq4_xs block size/padding"); -#endif #endif // LM_GGML_COMMON_DECL #endif // LM_GGML_COMMON_DECL diff --git a/cpp/ggml-impl.h b/cpp/ggml-impl.h index 53315a68..0b0b0aa6 100644 --- a/cpp/ggml-impl.h +++ b/cpp/ggml-impl.h @@ -17,6 +17,95 @@ #define MIN(a, b) ((a) < (b) ? (a) : (b)) #define MAX(a, b) ((a) > (b) ? (a) : (b)) +#if defined(_WIN32) + +#define m512bh(p) p +#define m512i(p) p + +#else + +#define m512bh(p) (__m512bh)(p) +#define m512i(p) (__m512i)(p) + +#endif + +/** + * Converts brain16 to float32. + * + * The bfloat16 floating point format has the following structure: + * + * ┌sign + * │ + * │ ┌exponent + * │ │ + * │ │ ┌mantissa + * │ │ │ + * │┌──┴───┐┌─┴───┐ + * 0b0000000000000000 brain16 + * + * Since bf16 has the same number of exponent bits as a 32bit float, + * encoding and decoding numbers becomes relatively straightforward. + * + * ┌sign + * │ + * │ ┌exponent + * │ │ + * │ │ ┌mantissa + * │ │ │ + * │┌──┴───┐┌─┴───────────────────┐ + * 0b00000000000000000000000000000000 IEEE binary32 + * + * For comparison, the standard fp16 format has fewer exponent bits. + * + * ┌sign + * │ + * │ ┌exponent + * │ │ + * │ │ ┌mantissa + * │ │ │ + * │┌─┴─┐┌─┴──────┐ + * 0b0000000000000000 IEEE binary16 + * + * @see IEEE 754-2008 + */ +static inline float lm_ggml_compute_bf16_to_fp32(lm_ggml_bf16_t h) { + union { + float f; + uint32_t i; + } u; + u.i = (uint32_t)h.bits << 16; + return u.f; +} + +/** + * Converts float32 to brain16. + * + * This function is binary identical to AMD Zen4 VCVTNEPS2BF16. + * Subnormals shall be flushed to zero, and NANs will be quiet. + * This code should vectorize nicely if using modern compilers. 
+ */ +static inline lm_ggml_bf16_t lm_ggml_compute_fp32_to_bf16(float s) { + lm_ggml_bf16_t h; + union { + float f; + uint32_t i; + } u; + u.f = s; + if ((u.i & 0x7fffffff) > 0x7f800000) { /* nan */ + h.bits = (u.i >> 16) | 64; /* force to quiet */ + return h; + } + if (!(u.i & 0x7f800000)) { /* subnormal */ + h.bits = (u.i & 0x80000000) >> 16; /* flush to zero */ + return h; + } + h.bits = (u.i + (0x7fff + ((u.i >> 16) & 1))) >> 16; + return h; +} + +#define LM_GGML_FP32_TO_BF16(x) lm_ggml_compute_fp32_to_bf16(x) +#define LM_GGML_BF16_TO_FP32(x) lm_ggml_compute_bf16_to_fp32(x) + #ifdef __cplusplus extern "C" { #endif @@ -43,9 +132,20 @@ extern "C" { #ifndef __F16C__ #define __F16C__ #endif +#endif + +// __SSE3__ and __SSSE3__ are not defined in MSVC, but SSE3/SSSE3 are present when AVX/AVX2/AVX512 are available +#if defined(_MSC_VER) && (defined(__AVX__) || defined(__AVX2__) || defined(__AVX512F__)) #ifndef __SSE3__ #define __SSE3__ #endif +#ifndef __SSSE3__ +#define __SSSE3__ +#endif +#endif + +#if defined(__ARM_FEATURE_SVE) +#include #endif // 16-bit float @@ -359,6 +459,34 @@ static inline lm_ggml_fp16_t lm_ggml_compute_fp32_to_fp16(float f) { #include #endif +#if defined(__loongarch64) +#if defined(__loongarch_asx) +#include +#endif +#if defined(__loongarch_sx) +#include +#endif +#endif + +#if defined(__loongarch_asx) + +typedef union { + int32_t i; + float f; +} ft_union; + +/* float type data load instructions */ +static __m128 __lsx_vreplfr2vr_s(float val) { + ft_union fi_tmpval = {.f = val}; + return (__m128)__lsx_vreplgr2vr_w(fi_tmpval.i); +} + +static __m256 __lasx_xvreplfr2vr_s(float val) { + ft_union fi_tmpval = {.f = val}; + return (__m256)__lasx_xvreplgr2vr_w(fi_tmpval.i); +} +#endif + #ifdef __F16C__ #ifdef _MSC_VER diff --git a/cpp/ggml-metal.m b/cpp/ggml-metal.m index 502c61b2..b6423cb4 100644 --- a/cpp/ggml-metal.m +++ b/cpp/ggml-metal.m @@ -35,11 +35,16 @@ LM_GGML_METAL_KERNEL_TYPE_MUL_ROW, LM_GGML_METAL_KERNEL_TYPE_DIV, 
LM_GGML_METAL_KERNEL_TYPE_DIV_ROW, + LM_GGML_METAL_KERNEL_TYPE_REPEAT_F32, + LM_GGML_METAL_KERNEL_TYPE_REPEAT_F16, + LM_GGML_METAL_KERNEL_TYPE_REPEAT_I32, + LM_GGML_METAL_KERNEL_TYPE_REPEAT_I16, LM_GGML_METAL_KERNEL_TYPE_SCALE, LM_GGML_METAL_KERNEL_TYPE_SCALE_4, LM_GGML_METAL_KERNEL_TYPE_CLAMP, LM_GGML_METAL_KERNEL_TYPE_TANH, LM_GGML_METAL_KERNEL_TYPE_RELU, + LM_GGML_METAL_KERNEL_TYPE_SIGMOID, LM_GGML_METAL_KERNEL_TYPE_GELU, LM_GGML_METAL_KERNEL_TYPE_GELU_4, LM_GGML_METAL_KERNEL_TYPE_GELU_QUICK, @@ -169,7 +174,6 @@ LM_GGML_METAL_KERNEL_TYPE_MUL_MM_ID_IQ4_XS_F32, LM_GGML_METAL_KERNEL_TYPE_ROPE_F32, LM_GGML_METAL_KERNEL_TYPE_ROPE_F16, - LM_GGML_METAL_KERNEL_TYPE_ALIBI_F32, LM_GGML_METAL_KERNEL_TYPE_IM2COL_F16, LM_GGML_METAL_KERNEL_TYPE_IM2COL_F32, LM_GGML_METAL_KERNEL_TYPE_UPSCALE_F32, @@ -184,9 +188,9 @@ LM_GGML_METAL_KERNEL_TYPE_FLASH_ATTN_EXT_F16_H96, LM_GGML_METAL_KERNEL_TYPE_FLASH_ATTN_EXT_F16_H112, LM_GGML_METAL_KERNEL_TYPE_FLASH_ATTN_EXT_F16_H128, - LM_GGML_METAL_KERNEL_TYPE_FLASH_ATTN_EXT_F16_H256, + //LM_GGML_METAL_KERNEL_TYPE_FLASH_ATTN_EXT_F16_H256, // https://github.com/ggerganov/llama.cpp/issues/7261 LM_GGML_METAL_KERNEL_TYPE_FLASH_ATTN_EXT_VEC_F16_H128, - LM_GGML_METAL_KERNEL_TYPE_FLASH_ATTN_EXT_VEC_F16_H256, + //LM_GGML_METAL_KERNEL_TYPE_FLASH_ATTN_EXT_VEC_F16_H256, // https://github.com/ggerganov/llama.cpp/issues/7261 LM_GGML_METAL_KERNEL_TYPE_CPY_F32_F16, LM_GGML_METAL_KERNEL_TYPE_CPY_F32_F32, LM_GGML_METAL_KERNEL_TYPE_CPY_F32_Q8_0, @@ -265,11 +269,20 @@ static void lm_ggml_metal_log(enum lm_ggml_log_level level, const char * format, static void * lm_ggml_metal_host_malloc(size_t n) { void * data = NULL; + +#if TARGET_OS_OSX + kern_return_t err = vm_allocate((vm_map_t) mach_task_self(), (void *) &data, n, VM_FLAGS_ANYWHERE); + if (err != KERN_SUCCESS) { + LM_GGML_METAL_LOG_ERROR("%s: error: vm_allocate failed\n", __func__); + return NULL; + } +#else const int result = posix_memalign((void **) &data, sysconf(_SC_PAGESIZE), n); if (result != 0) { 
LM_GGML_METAL_LOG_ERROR("%s: error: posix_memalign failed\n", __func__); return NULL; } +#endif return data; } @@ -372,10 +385,6 @@ static void lm_ggml_metal_log(enum lm_ggml_log_level level, const char * format, // dictionary of preprocessor macros NSMutableDictionary * prep = [NSMutableDictionary dictionary]; -#ifdef LM_GGML_QKK_64 - prep[@"LM_GGML_QKK_64"] = @(1); -#endif - MTLCompileOptions* options = [MTLCompileOptions new]; options.preprocessorMacros = prep; @@ -480,11 +489,16 @@ static void lm_ggml_metal_log(enum lm_ggml_log_level level, const char * format, LM_GGML_METAL_ADD_KERNEL(LM_GGML_METAL_KERNEL_TYPE_MUL_ROW, mul_row, true); LM_GGML_METAL_ADD_KERNEL(LM_GGML_METAL_KERNEL_TYPE_DIV, div, true); LM_GGML_METAL_ADD_KERNEL(LM_GGML_METAL_KERNEL_TYPE_DIV_ROW, div_row, true); + LM_GGML_METAL_ADD_KERNEL(LM_GGML_METAL_KERNEL_TYPE_REPEAT_F32, repeat_f32, true); + LM_GGML_METAL_ADD_KERNEL(LM_GGML_METAL_KERNEL_TYPE_REPEAT_F16, repeat_f16, true); + LM_GGML_METAL_ADD_KERNEL(LM_GGML_METAL_KERNEL_TYPE_REPEAT_I32, repeat_i32, true); + LM_GGML_METAL_ADD_KERNEL(LM_GGML_METAL_KERNEL_TYPE_REPEAT_I16, repeat_i16, true); LM_GGML_METAL_ADD_KERNEL(LM_GGML_METAL_KERNEL_TYPE_SCALE, scale, true); LM_GGML_METAL_ADD_KERNEL(LM_GGML_METAL_KERNEL_TYPE_SCALE_4, scale_4, true); LM_GGML_METAL_ADD_KERNEL(LM_GGML_METAL_KERNEL_TYPE_CLAMP, clamp, true); LM_GGML_METAL_ADD_KERNEL(LM_GGML_METAL_KERNEL_TYPE_TANH, tanh, true); LM_GGML_METAL_ADD_KERNEL(LM_GGML_METAL_KERNEL_TYPE_RELU, relu, true); + LM_GGML_METAL_ADD_KERNEL(LM_GGML_METAL_KERNEL_TYPE_SIGMOID, sigmoid, true); LM_GGML_METAL_ADD_KERNEL(LM_GGML_METAL_KERNEL_TYPE_GELU, gelu, true); LM_GGML_METAL_ADD_KERNEL(LM_GGML_METAL_KERNEL_TYPE_GELU_4, gelu_4, true); LM_GGML_METAL_ADD_KERNEL(LM_GGML_METAL_KERNEL_TYPE_GELU_QUICK, gelu_quick, true); @@ -614,7 +628,6 @@ static void lm_ggml_metal_log(enum lm_ggml_log_level level, const char * format, LM_GGML_METAL_ADD_KERNEL(LM_GGML_METAL_KERNEL_TYPE_MUL_MM_ID_IQ4_XS_F32, mul_mm_id_iq4_xs_f32, 
ctx->support_simdgroup_mm); LM_GGML_METAL_ADD_KERNEL(LM_GGML_METAL_KERNEL_TYPE_ROPE_F32, rope_f32, true); LM_GGML_METAL_ADD_KERNEL(LM_GGML_METAL_KERNEL_TYPE_ROPE_F16, rope_f16, true); - LM_GGML_METAL_ADD_KERNEL(LM_GGML_METAL_KERNEL_TYPE_ALIBI_F32, alibi_f32, true); LM_GGML_METAL_ADD_KERNEL(LM_GGML_METAL_KERNEL_TYPE_IM2COL_F16, im2col_f16, true); LM_GGML_METAL_ADD_KERNEL(LM_GGML_METAL_KERNEL_TYPE_IM2COL_F32, im2col_f32, true); LM_GGML_METAL_ADD_KERNEL(LM_GGML_METAL_KERNEL_TYPE_UPSCALE_F32, upscale_f32, true); @@ -624,14 +637,14 @@ static void lm_ggml_metal_log(enum lm_ggml_log_level level, const char * format, LM_GGML_METAL_ADD_KERNEL(LM_GGML_METAL_KERNEL_TYPE_ARGSORT_F32_I32_ASC, argsort_f32_i32_asc, true); LM_GGML_METAL_ADD_KERNEL(LM_GGML_METAL_KERNEL_TYPE_ARGSORT_F32_I32_DESC, argsort_f32_i32_desc, true); LM_GGML_METAL_ADD_KERNEL(LM_GGML_METAL_KERNEL_TYPE_LEAKY_RELU_F32, leaky_relu_f32, true); - LM_GGML_METAL_ADD_KERNEL(LM_GGML_METAL_KERNEL_TYPE_FLASH_ATTN_EXT_F16_H64, flash_attn_ext_f16_h64, true); - LM_GGML_METAL_ADD_KERNEL(LM_GGML_METAL_KERNEL_TYPE_FLASH_ATTN_EXT_F16_H80, flash_attn_ext_f16_h80, true); - LM_GGML_METAL_ADD_KERNEL(LM_GGML_METAL_KERNEL_TYPE_FLASH_ATTN_EXT_F16_H96, flash_attn_ext_f16_h96, true); - LM_GGML_METAL_ADD_KERNEL(LM_GGML_METAL_KERNEL_TYPE_FLASH_ATTN_EXT_F16_H112, flash_attn_ext_f16_h112, true); - LM_GGML_METAL_ADD_KERNEL(LM_GGML_METAL_KERNEL_TYPE_FLASH_ATTN_EXT_F16_H128, flash_attn_ext_f16_h128, true); - LM_GGML_METAL_ADD_KERNEL(LM_GGML_METAL_KERNEL_TYPE_FLASH_ATTN_EXT_F16_H256, flash_attn_ext_f16_h256, true); - LM_GGML_METAL_ADD_KERNEL(LM_GGML_METAL_KERNEL_TYPE_FLASH_ATTN_EXT_VEC_F16_H128, flash_attn_ext_vec_f16_h128, true); - LM_GGML_METAL_ADD_KERNEL(LM_GGML_METAL_KERNEL_TYPE_FLASH_ATTN_EXT_VEC_F16_H256, flash_attn_ext_vec_f16_h256, true); + LM_GGML_METAL_ADD_KERNEL(LM_GGML_METAL_KERNEL_TYPE_FLASH_ATTN_EXT_F16_H64, flash_attn_ext_f16_h64, ctx->support_simdgroup_mm); + 
LM_GGML_METAL_ADD_KERNEL(LM_GGML_METAL_KERNEL_TYPE_FLASH_ATTN_EXT_F16_H80, flash_attn_ext_f16_h80, ctx->support_simdgroup_mm); + LM_GGML_METAL_ADD_KERNEL(LM_GGML_METAL_KERNEL_TYPE_FLASH_ATTN_EXT_F16_H96, flash_attn_ext_f16_h96, ctx->support_simdgroup_mm); + LM_GGML_METAL_ADD_KERNEL(LM_GGML_METAL_KERNEL_TYPE_FLASH_ATTN_EXT_F16_H112, flash_attn_ext_f16_h112, ctx->support_simdgroup_mm); + LM_GGML_METAL_ADD_KERNEL(LM_GGML_METAL_KERNEL_TYPE_FLASH_ATTN_EXT_F16_H128, flash_attn_ext_f16_h128, ctx->support_simdgroup_mm); + //LM_GGML_METAL_ADD_KERNEL(LM_GGML_METAL_KERNEL_TYPE_FLASH_ATTN_EXT_F16_H256, flash_attn_ext_f16_h256, ctx->support_simdgroup_mm); + LM_GGML_METAL_ADD_KERNEL(LM_GGML_METAL_KERNEL_TYPE_FLASH_ATTN_EXT_VEC_F16_H128, flash_attn_ext_vec_f16_h128, ctx->support_simdgroup_reduction); + //LM_GGML_METAL_ADD_KERNEL(LM_GGML_METAL_KERNEL_TYPE_FLASH_ATTN_EXT_VEC_F16_H256, flash_attn_ext_vec_f16_h256, ctx->support_simdgroup_reduction); LM_GGML_METAL_ADD_KERNEL(LM_GGML_METAL_KERNEL_TYPE_CPY_F32_F16, cpy_f32_f16, true); LM_GGML_METAL_ADD_KERNEL(LM_GGML_METAL_KERNEL_TYPE_CPY_F32_F32, cpy_f32_f32, true); LM_GGML_METAL_ADD_KERNEL(LM_GGML_METAL_KERNEL_TYPE_CPY_F32_Q8_0, cpy_f32_q8_0, true); @@ -723,6 +736,7 @@ static bool lm_ggml_metal_supports_op(const struct lm_ggml_metal_context * ctx, switch (lm_ggml_get_unary_op(op)) { case LM_GGML_UNARY_OP_TANH: case LM_GGML_UNARY_OP_RELU: + case LM_GGML_UNARY_OP_SIGMOID: case LM_GGML_UNARY_OP_GELU: case LM_GGML_UNARY_OP_GELU_QUICK: case LM_GGML_UNARY_OP_SILU: @@ -740,6 +754,7 @@ static bool lm_ggml_metal_supports_op(const struct lm_ggml_metal_context * ctx, case LM_GGML_OP_ACC: case LM_GGML_OP_MUL: case LM_GGML_OP_DIV: + case LM_GGML_OP_REPEAT: case LM_GGML_OP_SCALE: case LM_GGML_OP_CLAMP: case LM_GGML_OP_SQR: @@ -750,7 +765,6 @@ static bool lm_ggml_metal_supports_op(const struct lm_ggml_metal_context * ctx, case LM_GGML_OP_GROUP_NORM: return ctx->support_simdgroup_reduction; case LM_GGML_OP_NORM: - case LM_GGML_OP_ALIBI: case 
LM_GGML_OP_ROPE: case LM_GGML_OP_IM2COL: return true; @@ -763,8 +777,12 @@ static bool lm_ggml_metal_supports_op(const struct lm_ggml_metal_context * ctx, case LM_GGML_OP_TIMESTEP_EMBEDDING: case LM_GGML_OP_ARGSORT: case LM_GGML_OP_LEAKY_RELU: - case LM_GGML_OP_FLASH_ATTN_EXT: return true; + case LM_GGML_OP_FLASH_ATTN_EXT: + if (op->src[0]->ne[0] == 256) { + return false; + } + return ctx->support_simdgroup_mm; // TODO: over-restricted for vec-kernels case LM_GGML_OP_MUL_MAT: case LM_GGML_OP_MUL_MAT_ID: return ctx->support_simdgroup_reduction && @@ -803,7 +821,7 @@ static bool lm_ggml_metal_supports_op(const struct lm_ggml_metal_context * ctx, case LM_GGML_OP_DIAG_MASK_INF: case LM_GGML_OP_GET_ROWS: { - return op->ne[3] == 1; + return op->src[0]->type != LM_GGML_TYPE_BF16 && op->ne[3] == 1; } default: return false; @@ -917,22 +935,32 @@ static enum lm_ggml_status lm_ggml_metal_graph_compute( const int64_t ne10 = src1 ? src1->ne[0] : 0; const int64_t ne11 = src1 ? src1->ne[1] : 0; const int64_t ne12 = src1 ? src1->ne[2] : 0; - const int64_t ne13 = src1 ? src1->ne[3] : 0; UNUSED(ne13); + const int64_t ne13 = src1 ? src1->ne[3] : 0; const uint64_t nb10 = src1 ? src1->nb[0] : 0; const uint64_t nb11 = src1 ? src1->nb[1] : 0; const uint64_t nb12 = src1 ? src1->nb[2] : 0; - const uint64_t nb13 = src1 ? src1->nb[3] : 0; UNUSED(nb13); + const uint64_t nb13 = src1 ? src1->nb[3] : 0; + + const int64_t ne20 = src2 ? src2->ne[0] : 0; + const int64_t ne21 = src2 ? src2->ne[1] : 0; + const int64_t ne22 = src2 ? src2->ne[2] : 0; LM_GGML_UNUSED(ne22); + const int64_t ne23 = src2 ? src2->ne[3] : 0; LM_GGML_UNUSED(ne23); - const int64_t ne0 = dst ? dst->ne[0] : 0; - const int64_t ne1 = dst ? dst->ne[1] : 0; - const int64_t ne2 = dst ? dst->ne[2] : 0; - const int64_t ne3 = dst ? dst->ne[3] : 0; + const uint64_t nb20 = src2 ? src2->nb[0] : 0; LM_GGML_UNUSED(nb20); + const uint64_t nb21 = src2 ? src2->nb[1] : 0; + const uint64_t nb22 = src2 ? 
src2->nb[2] : 0; + const uint64_t nb23 = src2 ? src2->nb[3] : 0; - const uint64_t nb0 = dst ? dst->nb[0] : 0; - const uint64_t nb1 = dst ? dst->nb[1] : 0; - const uint64_t nb2 = dst ? dst->nb[2] : 0; - const uint64_t nb3 = dst ? dst->nb[3] : 0; + const int64_t ne0 = dst ? dst->ne[0] : 0; + const int64_t ne1 = dst ? dst->ne[1] : 0; + const int64_t ne2 = dst ? dst->ne[2] : 0; + const int64_t ne3 = dst ? dst->ne[3] : 0; + + const uint64_t nb0 = dst ? dst->nb[0] : 0; + const uint64_t nb1 = dst ? dst->nb[1] : 0; + const uint64_t nb2 = dst ? dst->nb[2] : 0; + const uint64_t nb3 = dst ? dst->nb[3] : 0; const enum lm_ggml_type src0t = src0 ? src0->type : LM_GGML_TYPE_COUNT; const enum lm_ggml_type src1t = src1 ? src1->type : LM_GGML_TYPE_COUNT; @@ -960,10 +988,10 @@ static enum lm_ggml_status lm_ggml_metal_graph_compute( switch (dst->op) { case LM_GGML_OP_CONCAT: { - const int64_t nb = ne00; - id pipeline = ctx->kernels[LM_GGML_METAL_KERNEL_TYPE_CONCAT].pipeline; + const int32_t dim = ((int32_t *) dst->op_params)[0]; + [encoder setComputePipelineState:pipeline]; [encoder setBuffer:id_src0 offset:offs_src0 atIndex:0]; [encoder setBuffer:id_src1 offset:offs_src1 atIndex:1]; @@ -992,7 +1020,7 @@ static enum lm_ggml_status lm_ggml_metal_graph_compute( [encoder setBytes:&nb1 length:sizeof(nb1) atIndex:24]; [encoder setBytes:&nb2 length:sizeof(nb2) atIndex:25]; [encoder setBytes:&nb3 length:sizeof(nb3) atIndex:26]; - [encoder setBytes:&nb length:sizeof(nb) atIndex:27]; + [encoder setBytes:&dim length:sizeof(dim) atIndex:27]; const int nth = MIN(1024, ne0); @@ -1002,11 +1030,14 @@ static enum lm_ggml_status lm_ggml_metal_graph_compute( case LM_GGML_OP_MUL: case LM_GGML_OP_DIV: { + LM_GGML_ASSERT(src0t == LM_GGML_TYPE_F32); + LM_GGML_ASSERT(src1t == LM_GGML_TYPE_F32); + const size_t offs = 0; bool bcast_row = false; - int64_t nb = ne00; + int64_t nb = ne00; // used by the "row" kernels id pipeline = nil; @@ -1075,6 +1106,42 @@ static enum lm_ggml_status 
lm_ggml_metal_graph_compute( [encoder dispatchThreadgroups:MTLSizeMake(ne01, ne02, ne03) threadsPerThreadgroup:MTLSizeMake(nth, 1, 1)]; } } break; + case LM_GGML_OP_REPEAT: + { + id pipeline; + + switch (src0t) { + case LM_GGML_TYPE_F32: pipeline = ctx->kernels[LM_GGML_METAL_KERNEL_TYPE_REPEAT_F32].pipeline; break; + case LM_GGML_TYPE_F16: pipeline = ctx->kernels[LM_GGML_METAL_KERNEL_TYPE_REPEAT_F16].pipeline; break; + case LM_GGML_TYPE_I32: pipeline = ctx->kernels[LM_GGML_METAL_KERNEL_TYPE_REPEAT_I32].pipeline; break; + case LM_GGML_TYPE_I16: pipeline = ctx->kernels[LM_GGML_METAL_KERNEL_TYPE_REPEAT_I16].pipeline; break; + default: LM_GGML_ASSERT(false); + } + + [encoder setComputePipelineState:pipeline]; + [encoder setBuffer:id_src0 offset:offs_src0 atIndex:0]; + [encoder setBuffer:id_dst offset:offs_dst atIndex:1]; + [encoder setBytes:&ne00 length:sizeof(ne00) atIndex:2]; + [encoder setBytes:&ne01 length:sizeof(ne01) atIndex:3]; + [encoder setBytes:&ne02 length:sizeof(ne02) atIndex:4]; + [encoder setBytes:&ne03 length:sizeof(ne03) atIndex:5]; + [encoder setBytes:&nb00 length:sizeof(nb00) atIndex:6]; + [encoder setBytes:&nb01 length:sizeof(nb01) atIndex:7]; + [encoder setBytes:&nb02 length:sizeof(nb02) atIndex:8]; + [encoder setBytes:&nb03 length:sizeof(nb03) atIndex:9]; + [encoder setBytes:&ne0 length:sizeof(ne0) atIndex:10]; + [encoder setBytes:&ne1 length:sizeof(ne1) atIndex:11]; + [encoder setBytes:&ne2 length:sizeof(ne2) atIndex:12]; + [encoder setBytes:&ne3 length:sizeof(ne3) atIndex:13]; + [encoder setBytes:&nb0 length:sizeof(nb0) atIndex:14]; + [encoder setBytes:&nb1 length:sizeof(nb1) atIndex:15]; + [encoder setBytes:&nb2 length:sizeof(nb2) atIndex:16]; + [encoder setBytes:&nb3 length:sizeof(nb3) atIndex:17]; + + const int nth = MIN((int) pipeline.maxTotalThreadsPerThreadgroup, ne0); + + [encoder dispatchThreadgroups:MTLSizeMake(ne1, ne2, ne3) threadsPerThreadgroup:MTLSizeMake(nth, 1, 1)]; + } break; case LM_GGML_OP_ACC: { LM_GGML_ASSERT(src0t == 
LM_GGML_TYPE_F32); @@ -1185,24 +1252,24 @@ static enum lm_ggml_status lm_ggml_metal_graph_compute( [encoder dispatchThreadgroups:MTLSizeMake(n, 1, 1) threadsPerThreadgroup:MTLSizeMake(1, 1, 1)]; } break; case LM_GGML_OP_CLAMP: - { - id pipeline = ctx->kernels[LM_GGML_METAL_KERNEL_TYPE_CLAMP].pipeline; + { + id pipeline = ctx->kernels[LM_GGML_METAL_KERNEL_TYPE_CLAMP].pipeline; - float min; - float max; - memcpy(&min, ((int32_t *) dst->op_params) + 0, sizeof(float)); - memcpy(&max, ((int32_t *) dst->op_params) + 1, sizeof(float)); + float min; + float max; + memcpy(&min, ((int32_t *) dst->op_params) + 0, sizeof(float)); + memcpy(&max, ((int32_t *) dst->op_params) + 1, sizeof(float)); - [encoder setComputePipelineState:pipeline]; - [encoder setBuffer:id_src0 offset:offs_src0 atIndex:0]; - [encoder setBuffer:id_dst offset:offs_dst atIndex:1]; - [encoder setBytes:&min length:sizeof(min) atIndex:2]; - [encoder setBytes:&max length:sizeof(max) atIndex:3]; + [encoder setComputePipelineState:pipeline]; + [encoder setBuffer:id_src0 offset:offs_src0 atIndex:0]; + [encoder setBuffer:id_dst offset:offs_dst atIndex:1]; + [encoder setBytes:&min length:sizeof(min) atIndex:2]; + [encoder setBytes:&max length:sizeof(max) atIndex:3]; - const int64_t n = lm_ggml_nelements(dst); + const int64_t n = lm_ggml_nelements(dst); - [encoder dispatchThreadgroups:MTLSizeMake(n, 1, 1) threadsPerThreadgroup:MTLSizeMake(1, 1, 1)]; - } break; + [encoder dispatchThreadgroups:MTLSizeMake(n, 1, 1) threadsPerThreadgroup:MTLSizeMake(1, 1, 1)]; + } break; case LM_GGML_OP_UNARY: switch (lm_ggml_get_unary_op(gf->nodes[i])) { // we are not taking into account the strides, so for now require contiguous tensors @@ -1230,6 +1297,18 @@ static enum lm_ggml_status lm_ggml_metal_graph_compute( const int64_t n = lm_ggml_nelements(dst); + [encoder dispatchThreadgroups:MTLSizeMake(n, 1, 1) threadsPerThreadgroup:MTLSizeMake(1, 1, 1)]; + } break; + case LM_GGML_UNARY_OP_SIGMOID: + { + id pipeline = 
ctx->kernels[LM_GGML_METAL_KERNEL_TYPE_SIGMOID].pipeline; + + [encoder setComputePipelineState:pipeline]; + [encoder setBuffer:id_src0 offset:offs_src0 atIndex:0]; + [encoder setBuffer:id_dst offset:offs_dst atIndex:1]; + + const int64_t n = lm_ggml_nelements(dst); + [encoder dispatchThreadgroups:MTLSizeMake(n, 1, 1) threadsPerThreadgroup:MTLSizeMake(1, 1, 1)]; } break; case LM_GGML_UNARY_OP_GELU: @@ -1348,16 +1427,15 @@ static enum lm_ggml_status lm_ggml_metal_graph_compute( case LM_GGML_OP_SOFT_MAX: { LM_GGML_ASSERT(!src1 || src1->type == LM_GGML_TYPE_F16 || src1->type == LM_GGML_TYPE_F32); - LM_GGML_ASSERT(!src2 || src2->type == LM_GGML_TYPE_F16 || src2->type == LM_GGML_TYPE_F32); int nth = 32; // SIMD width id pipeline = nil; - const bool use_f16 = (src1 && src1->type == LM_GGML_TYPE_F16) || (src2 && src2->type == LM_GGML_TYPE_F16); + const bool use_f16 = (src1 && src1->type == LM_GGML_TYPE_F16); if (ne00%4 == 0) { - while (nth < ne00/4 && nth < 256) { + while (nth < ne00/4 && nth*ne01*ne02*ne03 < 256) { nth *= 2; } if (use_f16) { @@ -1366,7 +1444,7 @@ static enum lm_ggml_status lm_ggml_metal_graph_compute( pipeline = ctx->kernels[LM_GGML_METAL_KERNEL_TYPE_SOFT_MAX_F32_4].pipeline; } } else { - while (nth < ne00 && nth < 1024) { + while (nth < ne00 && nth*ne01*ne02*ne03 < 256) { nth *= 2; } if (use_f16) { @@ -1385,8 +1463,8 @@ static enum lm_ggml_status lm_ggml_metal_graph_compute( const int64_t nrows_x = lm_ggml_nrows(src0); const int64_t nrows_y = src0->ne[1]; - const uint32_t n_head_kv = nrows_x/nrows_y; - const uint32_t n_head_log2 = 1u << (uint32_t) floorf(log2f((float) n_head_kv)); + const uint32_t n_head = nrows_x/nrows_y; + const uint32_t n_head_log2 = 1u << (uint32_t) floorf(log2f((float) n_head)); const float m0 = powf(2.0f, -(max_bias ) / n_head_log2); const float m1 = powf(2.0f, -(max_bias / 2.0f) / n_head_log2); @@ -1398,20 +1476,15 @@ static enum lm_ggml_status lm_ggml_metal_graph_compute( } else { [encoder setBuffer:id_src0 offset:offs_src0 
atIndex:1]; } - if (id_src2) { - [encoder setBuffer:id_src2 offset:offs_src2 atIndex:2]; - } else { - [encoder setBuffer:id_src0 offset:offs_src0 atIndex:2]; - } - [encoder setBuffer:id_dst offset:offs_dst atIndex:3]; - [encoder setBytes:&ne00 length:sizeof(ne00) atIndex:4]; - [encoder setBytes:&ne01 length:sizeof(ne01) atIndex:5]; - [encoder setBytes:&ne02 length:sizeof(ne02) atIndex:6]; - [encoder setBytes:&scale length:sizeof(scale) atIndex:7]; - [encoder setBytes:&max_bias length:sizeof(max_bias) atIndex:8]; - [encoder setBytes:&m0 length:sizeof(m0) atIndex:9]; - [encoder setBytes:&m1 length:sizeof(m1) atIndex:10]; - [encoder setBytes:&n_head_log2 length:sizeof(n_head_log2) atIndex:11]; + [encoder setBuffer:id_dst offset:offs_dst atIndex:2]; + [encoder setBytes:&ne00 length:sizeof(ne00) atIndex:3]; + [encoder setBytes:&ne01 length:sizeof(ne01) atIndex:4]; + [encoder setBytes:&ne02 length:sizeof(ne02) atIndex:5]; + [encoder setBytes:&scale length:sizeof(scale) atIndex:6]; + [encoder setBytes:&max_bias length:sizeof(max_bias) atIndex:7]; + [encoder setBytes:&m0 length:sizeof(m0) atIndex:8]; + [encoder setBytes:&m1 length:sizeof(m1) atIndex:9]; + [encoder setBytes:&n_head_log2 length:sizeof(n_head_log2) atIndex:10]; [encoder setThreadgroupMemoryLength:32*sizeof(float) atIndex:0]; [encoder dispatchThreadgroups:MTLSizeMake(ne01*ne02*ne03, 1, 1) threadsPerThreadgroup:MTLSizeMake(nth, 1, 1)]; @@ -1747,11 +1820,7 @@ static enum lm_ggml_status lm_ggml_metal_graph_compute( [encoder dispatchThreadgroups:MTLSizeMake((ne01 + 3)/4, ne11, ne12*ne13) threadsPerThreadgroup:MTLSizeMake(nth0, nth1, 1)]; } else if (src0t == LM_GGML_TYPE_Q3_K) { -#ifdef LM_GGML_QKK_64 - [encoder dispatchThreadgroups:MTLSizeMake((ne01 + 1)/2, ne11, ne12*ne13) threadsPerThreadgroup:MTLSizeMake(nth0, nth1, 1)]; -#else [encoder dispatchThreadgroups:MTLSizeMake((ne01 + 3)/4, ne11, ne12*ne13) threadsPerThreadgroup:MTLSizeMake(nth0, nth1, 1)]; -#endif } else if (src0t == LM_GGML_TYPE_Q5_K) { [encoder 
dispatchThreadgroups:MTLSizeMake((ne01 + 3)/4, ne11, ne12*ne13) threadsPerThreadgroup:MTLSizeMake(nth0, nth1, 1)]; @@ -1769,16 +1838,6 @@ static enum lm_ggml_status lm_ggml_metal_graph_compute( const int n_as = src0->ne[2]; // src2 = ids - const int64_t ne20 = src2->ne[0]; - const int64_t ne21 = src2->ne[1]; - const int64_t ne22 = src2->ne[2]; LM_GGML_UNUSED(ne22); - const int64_t ne23 = src2->ne[3]; LM_GGML_UNUSED(ne23); - - const uint64_t nb20 = src2->nb[0]; LM_GGML_UNUSED(nb20); - const uint64_t nb21 = src2->nb[1]; - const uint64_t nb22 = src2->nb[2]; LM_GGML_UNUSED(nb22); - const uint64_t nb23 = src2->nb[3]; LM_GGML_UNUSED(nb23); - const enum lm_ggml_type src2t = src2->type; LM_GGML_UNUSED(src2t); LM_GGML_ASSERT(src2t == LM_GGML_TYPE_I32); @@ -2002,12 +2061,7 @@ static enum lm_ggml_status lm_ggml_metal_graph_compute( { nth0 = 4; nth1 = 16; - #if QK_K == 64 - pipeline = ctx->kernels[LM_GGML_METAL_KERNEL_TYPE_MUL_MV_ID_IQ4_NL_F32].pipeline; - #else pipeline = ctx->kernels[LM_GGML_METAL_KERNEL_TYPE_MUL_MV_ID_IQ4_XS_F32].pipeline; - #endif - } break; default: { @@ -2072,11 +2126,7 @@ static enum lm_ggml_status lm_ggml_metal_graph_compute( [encoder dispatchThreadgroups:MTLSizeMake((ne01 + 3)/4, _ne1, tgz) threadsPerThreadgroup:MTLSizeMake(nth0, nth1, 1)]; } else if (src0t == LM_GGML_TYPE_Q3_K) { -#ifdef LM_GGML_QKK_64 - [encoder dispatchThreadgroups:MTLSizeMake((ne01 + 1)/2, _ne1, tgz) threadsPerThreadgroup:MTLSizeMake(nth0, nth1, 1)]; -#else [encoder dispatchThreadgroups:MTLSizeMake((ne01 + 3)/4, _ne1, tgz) threadsPerThreadgroup:MTLSizeMake(nth0, nth1, 1)]; -#endif } else if (src0t == LM_GGML_TYPE_Q5_K) { [encoder dispatchThreadgroups:MTLSizeMake((ne01 + 3)/4, _ne1, tgz) threadsPerThreadgroup:MTLSizeMake(nth0, nth1, 1)]; @@ -2216,49 +2266,6 @@ static enum lm_ggml_status lm_ggml_metal_graph_compute( [encoder dispatchThreadgroups:MTLSizeMake(nrows, 1, 1) threadsPerThreadgroup:MTLSizeMake(nth, 1, 1)]; } break; - case LM_GGML_OP_ALIBI: - { - LM_GGML_ASSERT((src0t == 
LM_GGML_TYPE_F32)); - - const int nth = MIN(1024, ne00); - - //const int n_past = ((int32_t *) dst->op_params)[0]; - const int n_head = ((int32_t *) dst->op_params)[1]; - - float max_bias; - memcpy(&max_bias, (int32_t *) dst->op_params + 2, sizeof(float)); - - const int n_heads_log2_floor = 1 << (int) floor(log2(n_head)); - const float m0 = powf(2.0f, -(max_bias) / n_heads_log2_floor); - const float m1 = powf(2.0f, -(max_bias / 2.0f) / n_heads_log2_floor); - - id pipeline = ctx->kernels[LM_GGML_METAL_KERNEL_TYPE_ALIBI_F32].pipeline; - - [encoder setComputePipelineState:pipeline]; - [encoder setBuffer:id_src0 offset:offs_src0 atIndex:0]; - [encoder setBuffer:id_dst offset:offs_dst atIndex:1]; - [encoder setBytes:&ne00 length:sizeof( int64_t) atIndex:2]; - [encoder setBytes:&ne01 length:sizeof( int64_t) atIndex:3]; - [encoder setBytes:&ne02 length:sizeof( int64_t) atIndex:4]; - [encoder setBytes:&ne03 length:sizeof( int64_t) atIndex:5]; - [encoder setBytes:&nb00 length:sizeof(uint64_t) atIndex:6]; - [encoder setBytes:&nb01 length:sizeof(uint64_t) atIndex:7]; - [encoder setBytes:&nb02 length:sizeof(uint64_t) atIndex:8]; - [encoder setBytes:&nb03 length:sizeof(uint64_t) atIndex:9]; - [encoder setBytes:&ne0 length:sizeof( int64_t) atIndex:10]; - [encoder setBytes:&ne1 length:sizeof( int64_t) atIndex:11]; - [encoder setBytes:&ne2 length:sizeof( int64_t) atIndex:12]; - [encoder setBytes:&ne3 length:sizeof( int64_t) atIndex:13]; - [encoder setBytes:&nb0 length:sizeof(uint64_t) atIndex:14]; - [encoder setBytes:&nb1 length:sizeof(uint64_t) atIndex:15]; - [encoder setBytes:&nb2 length:sizeof(uint64_t) atIndex:16]; - [encoder setBytes:&nb3 length:sizeof(uint64_t) atIndex:17]; - [encoder setBytes:&m0 length:sizeof( float) atIndex:18]; - [encoder setBytes:&m1 length:sizeof( float) atIndex:19]; - [encoder setBytes:&n_heads_log2_floor length:sizeof(int) atIndex:20]; - - [encoder dispatchThreadgroups:MTLSizeMake(ne01, ne02, ne03) threadsPerThreadgroup:MTLSizeMake(nth, 1, 1)]; - } 
break; case LM_GGML_OP_ROPE: { LM_GGML_ASSERT(ne10 == ne02); @@ -2271,7 +2278,13 @@ static enum lm_ggml_status lm_ggml_metal_graph_compute( // skip 3, n_ctx, used in GLM RoPE, unimplemented in metal const int n_orig_ctx = ((int32_t *) dst->op_params)[4]; - float freq_base, freq_scale, ext_factor, attn_factor, beta_fast, beta_slow; + float freq_base; + float freq_scale; + float ext_factor; + float attn_factor; + float beta_fast; + float beta_slow; + memcpy(&freq_base, (int32_t *) dst->op_params + 5, sizeof(float)); memcpy(&freq_scale, (int32_t *) dst->op_params + 6, sizeof(float)); memcpy(&ext_factor, (int32_t *) dst->op_params + 7, sizeof(float)); @@ -2279,6 +2292,15 @@ static enum lm_ggml_status lm_ggml_metal_graph_compute( memcpy(&beta_fast, (int32_t *) dst->op_params + 9, sizeof(float)); memcpy(&beta_slow, (int32_t *) dst->op_params + 10, sizeof(float)); + const bool is_neox = mode & 2; + const bool is_glm = mode & 4; + + LM_GGML_ASSERT(!is_glm && "GLM RoPE not implemented in Metal"); + + if (!is_neox) { + LM_GGML_ASSERT(id_src2 == nil && "TODO: freq_factors not implemented for !is_neox"); + } + id pipeline = nil; switch (src0->type) { @@ -2290,33 +2312,38 @@ static enum lm_ggml_status lm_ggml_metal_graph_compute( [encoder setComputePipelineState:pipeline]; [encoder setBuffer:id_src0 offset:offs_src0 atIndex:0]; [encoder setBuffer:id_src1 offset:offs_src1 atIndex:1]; - [encoder setBuffer:id_dst offset:offs_dst atIndex:2]; - [encoder setBytes:&ne00 length:sizeof( int64_t) atIndex:3]; - [encoder setBytes:&ne01 length:sizeof( int64_t) atIndex:4]; - [encoder setBytes:&ne02 length:sizeof( int64_t) atIndex:5]; - [encoder setBytes:&ne03 length:sizeof( int64_t) atIndex:6]; - [encoder setBytes:&nb00 length:sizeof(uint64_t) atIndex:7]; - [encoder setBytes:&nb01 length:sizeof(uint64_t) atIndex:8]; - [encoder setBytes:&nb02 length:sizeof(uint64_t) atIndex:9]; - [encoder setBytes:&nb03 length:sizeof(uint64_t) atIndex:10]; - [encoder setBytes:&ne0 length:sizeof( int64_t) 
atIndex:11]; - [encoder setBytes:&ne1 length:sizeof( int64_t) atIndex:12]; - [encoder setBytes:&ne2 length:sizeof( int64_t) atIndex:13]; - [encoder setBytes:&ne3 length:sizeof( int64_t) atIndex:14]; - [encoder setBytes:&nb0 length:sizeof(uint64_t) atIndex:15]; - [encoder setBytes:&nb1 length:sizeof(uint64_t) atIndex:16]; - [encoder setBytes:&nb2 length:sizeof(uint64_t) atIndex:17]; - [encoder setBytes:&nb3 length:sizeof(uint64_t) atIndex:18]; - [encoder setBytes:&n_past length:sizeof( int) atIndex:19]; - [encoder setBytes:&n_dims length:sizeof( int) atIndex:20]; - [encoder setBytes:&mode length:sizeof( int) atIndex:21]; - [encoder setBytes:&n_orig_ctx length:sizeof( int) atIndex:22]; - [encoder setBytes:&freq_base length:sizeof( float) atIndex:23]; - [encoder setBytes:&freq_scale length:sizeof( float) atIndex:24]; - [encoder setBytes:&ext_factor length:sizeof( float) atIndex:25]; - [encoder setBytes:&attn_factor length:sizeof( float) atIndex:26]; - [encoder setBytes:&beta_fast length:sizeof( float) atIndex:27]; - [encoder setBytes:&beta_slow length:sizeof( float) atIndex:28]; + if (id_src2 != nil) { + [encoder setBuffer:id_src2 offset:offs_src2 atIndex:2]; + } else { + [encoder setBuffer:id_src0 offset:offs_src0 atIndex:2]; + } + [encoder setBuffer:id_dst offset:offs_dst atIndex:3]; + [encoder setBytes:&ne00 length:sizeof( int64_t) atIndex:4]; + [encoder setBytes:&ne01 length:sizeof( int64_t) atIndex:5]; + [encoder setBytes:&ne02 length:sizeof( int64_t) atIndex:6]; + [encoder setBytes:&ne03 length:sizeof( int64_t) atIndex:7]; + [encoder setBytes:&nb00 length:sizeof(uint64_t) atIndex:8]; + [encoder setBytes:&nb01 length:sizeof(uint64_t) atIndex:9]; + [encoder setBytes:&nb02 length:sizeof(uint64_t) atIndex:10]; + [encoder setBytes:&nb03 length:sizeof(uint64_t) atIndex:11]; + [encoder setBytes:&ne0 length:sizeof( int64_t) atIndex:12]; + [encoder setBytes:&ne1 length:sizeof( int64_t) atIndex:13]; + [encoder setBytes:&ne2 length:sizeof( int64_t) atIndex:14]; + [encoder 
setBytes:&ne3 length:sizeof( int64_t) atIndex:15]; + [encoder setBytes:&nb0 length:sizeof(uint64_t) atIndex:16]; + [encoder setBytes:&nb1 length:sizeof(uint64_t) atIndex:17]; + [encoder setBytes:&nb2 length:sizeof(uint64_t) atIndex:18]; + [encoder setBytes:&nb3 length:sizeof(uint64_t) atIndex:19]; + [encoder setBytes:&n_past length:sizeof( int) atIndex:20]; + [encoder setBytes:&n_dims length:sizeof( int) atIndex:21]; + [encoder setBytes:&mode length:sizeof( int) atIndex:22]; + [encoder setBytes:&n_orig_ctx length:sizeof( int) atIndex:23]; + [encoder setBytes:&freq_base length:sizeof( float) atIndex:24]; + [encoder setBytes:&freq_scale length:sizeof( float) atIndex:25]; + [encoder setBytes:&ext_factor length:sizeof( float) atIndex:26]; + [encoder setBytes:&attn_factor length:sizeof( float) atIndex:27]; + [encoder setBytes:&beta_fast length:sizeof( float) atIndex:28]; + [encoder setBytes:&beta_slow length:sizeof( float) atIndex:29]; [encoder dispatchThreadgroups:MTLSizeMake(ne01, ne02, ne03) threadsPerThreadgroup:MTLSizeMake(nth, 1, 1)]; } break; @@ -2380,7 +2407,10 @@ static enum lm_ggml_status lm_ggml_metal_graph_compute( { LM_GGML_ASSERT(src0->type == LM_GGML_TYPE_F32); - const int sf = dst->op_params[0]; + const float sf0 = (float)ne0/src0->ne[0]; + const float sf1 = (float)ne1/src0->ne[1]; + const float sf2 = (float)ne2/src0->ne[2]; + const float sf3 = (float)ne3/src0->ne[3]; const id pipeline = ctx->kernels[LM_GGML_METAL_KERNEL_TYPE_UPSCALE_F32].pipeline; @@ -2403,7 +2433,10 @@ static enum lm_ggml_status lm_ggml_metal_graph_compute( [encoder setBytes:&nb1 length:sizeof(nb1) atIndex:15]; [encoder setBytes:&nb2 length:sizeof(nb2) atIndex:16]; [encoder setBytes:&nb3 length:sizeof(nb3) atIndex:17]; - [encoder setBytes:&sf length:sizeof(sf) atIndex:18]; + [encoder setBytes:&sf0 length:sizeof(sf0) atIndex:18]; + [encoder setBytes:&sf1 length:sizeof(sf1) atIndex:19]; + [encoder setBytes:&sf2 length:sizeof(sf2) atIndex:20]; + [encoder setBytes:&sf3 length:sizeof(sf3) 
atIndex:21]; const int nth = MIN((int) pipeline.maxTotalThreadsPerThreadgroup, ne0); @@ -2539,13 +2572,14 @@ static enum lm_ggml_status lm_ggml_metal_graph_compute( } break; case LM_GGML_OP_FLASH_ATTN_EXT: { - LM_GGML_ASSERT(ne00 % 4 == 0); + LM_GGML_ASSERT(ne00 % 4 == 0); + LM_GGML_ASSERT(ne11 % 32 == 0); + LM_GGML_ASSERT(src0->type == LM_GGML_TYPE_F32); - struct lm_ggml_tensor * src3 = gf->nodes[i]->src[3]; + LM_GGML_ASSERT(lm_ggml_are_same_shape (src1, src2)); - LM_GGML_ASSERT(lm_ggml_are_same_shape(src1, src2)); - LM_GGML_ASSERT(src3); + struct lm_ggml_tensor * src3 = gf->nodes[i]->src[3]; size_t offs_src3 = 0; @@ -2556,7 +2590,7 @@ static enum lm_ggml_status lm_ggml_metal_graph_compute( "the Flash-Attention Metal kernel requires the mask to be padded to 8 and at least n_queries big"); const int64_t ne30 = src3 ? src3->ne[0] : 0; LM_GGML_UNUSED(ne30); - const int64_t ne31 = src3 ? src3->ne[1] : 0; + //const int64_t ne31 = src3 ? src3->ne[1] : 0; const int64_t ne32 = src3 ? src3->ne[2] : 0; LM_GGML_UNUSED(ne32); const int64_t ne33 = src3 ? src3->ne[3] : 0; LM_GGML_UNUSED(ne33); @@ -2568,7 +2602,16 @@ static enum lm_ggml_status lm_ggml_metal_graph_compute( const enum lm_ggml_type src2t = src2 ? 
src2->type : LM_GGML_TYPE_COUNT; LM_GGML_UNUSED(src2t); float scale; - memcpy(&scale, dst->op_params, sizeof(float)); + float max_bias; + + memcpy(&scale, ((int32_t *) dst->op_params) + 0, sizeof(scale)); + memcpy(&max_bias, ((int32_t *) dst->op_params) + 1, sizeof(max_bias)); + + const uint32_t n_head = src0->ne[2]; + const uint32_t n_head_log2 = 1u << (uint32_t) floorf(log2f((float) n_head)); + + const float m0 = powf(2.0f, -(max_bias ) / n_head_log2); + const float m1 = powf(2.0f, -(max_bias / 2.0f) / n_head_log2); id pipeline = nil; @@ -2581,7 +2624,7 @@ static enum lm_ggml_status lm_ggml_metal_graph_compute( case 96: pipeline = ctx->kernels[LM_GGML_METAL_KERNEL_TYPE_FLASH_ATTN_EXT_F16_H96 ].pipeline; break; case 112: pipeline = ctx->kernels[LM_GGML_METAL_KERNEL_TYPE_FLASH_ATTN_EXT_F16_H112].pipeline; break; case 128: pipeline = ctx->kernels[LM_GGML_METAL_KERNEL_TYPE_FLASH_ATTN_EXT_F16_H128].pipeline; break; - case 256: pipeline = ctx->kernels[LM_GGML_METAL_KERNEL_TYPE_FLASH_ATTN_EXT_F16_H256].pipeline; break; + //case 256: pipeline = ctx->kernels[LM_GGML_METAL_KERNEL_TYPE_FLASH_ATTN_EXT_F16_H256].pipeline; break; default: { LM_GGML_METAL_LOG_ERROR("unsupported size: %lld\n", ne00); @@ -2594,7 +2637,7 @@ static enum lm_ggml_status lm_ggml_metal_graph_compute( switch (ne00) { case 128: pipeline = ctx->kernels[LM_GGML_METAL_KERNEL_TYPE_FLASH_ATTN_EXT_VEC_F16_H128].pipeline; break; - case 256: pipeline = ctx->kernels[LM_GGML_METAL_KERNEL_TYPE_FLASH_ATTN_EXT_VEC_F16_H256].pipeline; break; + //case 256: pipeline = ctx->kernels[LM_GGML_METAL_KERNEL_TYPE_FLASH_ATTN_EXT_VEC_F16_H256].pipeline; break; default: { LM_GGML_METAL_LOG_ERROR("unsupported size: %lld\n", ne00); @@ -2605,34 +2648,38 @@ static enum lm_ggml_status lm_ggml_metal_graph_compute( } [encoder setComputePipelineState:pipeline]; - [encoder setBuffer:id_src0 offset:offs_src0 atIndex:0]; - [encoder setBuffer:id_src1 offset:offs_src1 atIndex:1]; - [encoder setBuffer:id_src2 offset:offs_src2 atIndex:2]; - 
[encoder setBuffer:id_src3 offset:offs_src3 atIndex:3]; - [encoder setBuffer:id_dst offset:offs_dst atIndex:4]; - [encoder setBytes:&ne00 length:sizeof( int64_t) atIndex:5]; - [encoder setBytes:&ne01 length:sizeof( int64_t) atIndex:6]; - [encoder setBytes:&ne02 length:sizeof( int64_t) atIndex:7]; - [encoder setBytes:&ne03 length:sizeof( int64_t) atIndex:8]; - [encoder setBytes:&nb00 length:sizeof(uint64_t) atIndex:9]; - [encoder setBytes:&nb01 length:sizeof(uint64_t) atIndex:10]; - [encoder setBytes:&nb02 length:sizeof(uint64_t) atIndex:11]; - [encoder setBytes:&nb03 length:sizeof(uint64_t) atIndex:12]; - [encoder setBytes:&ne10 length:sizeof( int64_t) atIndex:13]; - [encoder setBytes:&ne11 length:sizeof( int64_t) atIndex:14]; - [encoder setBytes:&ne12 length:sizeof( int64_t) atIndex:15]; - [encoder setBytes:&ne13 length:sizeof( int64_t) atIndex:16]; - [encoder setBytes:&nb10 length:sizeof(uint64_t) atIndex:17]; - [encoder setBytes:&nb11 length:sizeof(uint64_t) atIndex:18]; - [encoder setBytes:&nb12 length:sizeof(uint64_t) atIndex:19]; - [encoder setBytes:&nb13 length:sizeof(uint64_t) atIndex:20]; - [encoder setBytes:&ne31 length:sizeof( int64_t) atIndex:21]; - [encoder setBytes:&nb31 length:sizeof(uint64_t) atIndex:22]; - [encoder setBytes:&ne0 length:sizeof( int64_t) atIndex:23]; - [encoder setBytes:&ne1 length:sizeof( int64_t) atIndex:24]; - [encoder setBytes:&ne2 length:sizeof( int64_t) atIndex:25]; - [encoder setBytes:&ne3 length:sizeof( int64_t) atIndex:26]; - [encoder setBytes:&scale length:sizeof( float) atIndex:27]; + [encoder setBuffer:id_src0 offset:offs_src0 atIndex:0]; + [encoder setBuffer:id_src1 offset:offs_src1 atIndex:1]; + [encoder setBuffer:id_src2 offset:offs_src2 atIndex:2]; + if (id_src3) { + [encoder setBuffer:id_src3 offset:offs_src3 atIndex:3]; + } else { + [encoder setBuffer:id_src0 offset:offs_src0 atIndex:3]; + } + [encoder setBuffer:id_dst offset:offs_dst atIndex:4]; + [encoder setBytes:&ne01 length:sizeof( int64_t) atIndex:5]; + 
[encoder setBytes:&ne02 length:sizeof( int64_t) atIndex:6]; + [encoder setBytes:&ne03 length:sizeof( int64_t) atIndex:7]; + [encoder setBytes:&nb01 length:sizeof(uint64_t) atIndex:8]; + [encoder setBytes:&nb02 length:sizeof(uint64_t) atIndex:9]; + [encoder setBytes:&nb03 length:sizeof(uint64_t) atIndex:10]; + [encoder setBytes:&ne11 length:sizeof( int64_t) atIndex:11]; + [encoder setBytes:&ne12 length:sizeof( int64_t) atIndex:12]; + [encoder setBytes:&ne13 length:sizeof( int64_t) atIndex:13]; + [encoder setBytes:&nb11 length:sizeof(uint64_t) atIndex:14]; + [encoder setBytes:&nb12 length:sizeof(uint64_t) atIndex:15]; + [encoder setBytes:&nb13 length:sizeof(uint64_t) atIndex:16]; + [encoder setBytes:&nb21 length:sizeof(uint64_t) atIndex:17]; + [encoder setBytes:&nb22 length:sizeof(uint64_t) atIndex:18]; + [encoder setBytes:&nb23 length:sizeof(uint64_t) atIndex:19]; + [encoder setBytes:&nb31 length:sizeof(uint64_t) atIndex:20]; + [encoder setBytes:&ne1 length:sizeof( int64_t) atIndex:21]; + [encoder setBytes:&ne2 length:sizeof( int64_t) atIndex:22]; + [encoder setBytes:&scale length:sizeof( float) atIndex:23]; + [encoder setBytes:&max_bias length:sizeof( float) atIndex:24]; + [encoder setBytes:&m0 length:sizeof(m0) atIndex:25]; + [encoder setBytes:&m1 length:sizeof(m1) atIndex:26]; + [encoder setBytes:&n_head_log2 length:sizeof(n_head_log2) atIndex:27]; if (!use_vec_kernel) { // half8x8 kernel @@ -2840,7 +2887,11 @@ LM_GGML_CALL static void lm_ggml_backend_metal_buffer_free_buffer(lm_ggml_backen lm_ggml_backend_metal_free_device(); if (ctx->owned) { +#if TARGET_OS_OSX + vm_deallocate((vm_map_t)mach_task_self(), (vm_address_t)ctx->all_data, ctx->all_size); +#else free(ctx->all_data); +#endif } free(ctx); @@ -2944,14 +2995,16 @@ LM_GGML_CALL static lm_ggml_backend_buffer_t lm_ggml_backend_metal_buffer_type_a ctx->owned = true; ctx->n_buffers = 1; - ctx->buffers[0].data = ctx->all_data; - ctx->buffers[0].size = size; - ctx->buffers[0].metal = [device 
newBufferWithBytesNoCopy:ctx->all_data - length:size_aligned - options:MTLResourceStorageModeShared - deallocator:nil]; + if (ctx->all_data != NULL) { + ctx->buffers[0].data = ctx->all_data; + ctx->buffers[0].size = size; + ctx->buffers[0].metal = [device newBufferWithBytesNoCopy:ctx->all_data + length:size_aligned + options:MTLResourceStorageModeShared + deallocator:nil]; + } - if (ctx->buffers[0].metal == nil) { + if (ctx->all_data == NULL || ctx->buffers[0].metal == nil) { LM_GGML_METAL_LOG_ERROR("%s: error: failed to allocate buffer, size = %8.2f MiB\n", __func__, size_aligned / 1024.0 / 1024.0); free(ctx); lm_ggml_backend_metal_free_device(); diff --git a/cpp/ggml-quants.c b/cpp/ggml-quants.c index 69f30bdd..9b411ddf 100644 --- a/cpp/ggml-quants.c +++ b/cpp/ggml-quants.c @@ -14,6 +14,18 @@ #include // for qsort #include // for LM_GGML_ASSERT +#define GROUP_MAX_EPS 1e-15f +#define GROUP_MAX_EPS_IQ3_XXS 1e-8f +#define GROUP_MAX_EPS_IQ2_S 1e-8f +#define GROUP_MAX_EPS_IQ1_M 1e-7f +#define GROUP_MAX_EPS_IQ1_S 1e-12f + +#if defined(_MSC_VER) +// disable "possible loss of data" to avoid warnings for hundreds of casts +// we should just be careful :) +#pragma warning(disable: 4244 4267) +#endif + #define UNUSED LM_GGML_UNUSED // some compilers don't provide _mm256_set_m128i, e.g. 
gcc 7 @@ -235,7 +247,7 @@ static inline float hsum_float_4x4(const __m128 a, const __m128 b, const __m128 #endif // __AVX__ || __AVX2__ || __AVX512F__ #endif // defined(__AVX__) || defined(__AVX2__) || defined(__AVX512F__) || defined(__SSSE3__) -#if defined(__ARM_NEON) || defined(__wasm_simd128__) +#if defined(__ARM_NEON) || defined(__wasm_simd128__) || defined(__POWER9_VECTOR__) #define B1(c,s,n) 0x ## n ## c , 0x ## n ## s #define B2(c,s,n) B1(c,s,n ## c), B1(c,s,n ## s) #define B3(c,s,n) B2(c,s,n ## c), B2(c,s,n ## s) @@ -250,6 +262,403 @@ static const uint64_t table_b2b_0[1 << 8] = { B8(00, 10) }; // ( b) << 4 static const uint64_t table_b2b_1[1 << 8] = { B8(10, 00) }; // (!b) << 4 #endif +#if defined(__loongarch_asx) + +#ifdef __clang__ +#define VREGS_PREFIX "$vr" +#define XREGS_PREFIX "$xr" +#else // GCC +#define VREGS_PREFIX "$f" +#define XREGS_PREFIX "$f" +#endif +#define __ALL_REGS "0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22,23,24,25,26,27,28,29,30,31" +// Convert __m128i to __m256i +static inline __m256i ____m256i(__m128i in) { + __m256i out = __lasx_xvldi(0); + __asm__ volatile ( + ".irp i," __ALL_REGS "\n\t" + " .ifc %[out], " XREGS_PREFIX"\\i \n\t" + " .irp j," __ALL_REGS "\n\t" + " .ifc %[in], " VREGS_PREFIX "\\j \n\t" + " xvpermi.q $xr\\i, $xr\\j, 0x20 \n\t" + " .endif \n\t" + " .endr \n\t" + " .endif \n\t" + ".endr \n\t" + : [out] "+f" (out) : [in] "f" (in) + ); + return out; +} +// Convert two __m128i to __m256i +static inline __m256i lasx_set_q(__m128i inhi, __m128i inlo) { + __m256i out; + __asm__ volatile ( + ".irp i," __ALL_REGS "\n\t" + " .ifc %[hi], " VREGS_PREFIX "\\i \n\t" + " .irp j," __ALL_REGS "\n\t" + " .ifc %[lo], " VREGS_PREFIX "\\j \n\t" + " xvpermi.q $xr\\i, $xr\\j, 0x20 \n\t" + " .endif \n\t" + " .endr \n\t" + " .endif \n\t" + ".endr \n\t" + ".ifnc %[out], %[hi] \n\t" + ".irp i," __ALL_REGS "\n\t" + " .ifc %[out], " XREGS_PREFIX "\\i \n\t" + " .irp j," __ALL_REGS "\n\t" + " .ifc %[hi], " VREGS_PREFIX "\\j \n\t" + " 
xvori.b $xr\\i, $xr\\j, 0 \n\t" + " .endif \n\t" + " .endr \n\t" + " .endif \n\t" + ".endr \n\t" + ".endif \n\t" + : [out] "=f" (out), [hi] "+f" (inhi) + : [lo] "f" (inlo) + ); + return out; +} +// Convert __m256i low part to __m128i +static inline __m128i lasx_extracti128_lo(__m256i in) { + __m128i out; + __asm__ volatile ( + ".ifnc %[out], %[in] \n\t" + ".irp i," __ALL_REGS "\n\t" + " .ifc %[out], " VREGS_PREFIX "\\i \n\t" + " .irp j," __ALL_REGS "\n\t" + " .ifc %[in], " XREGS_PREFIX "\\j \n\t" + " vori.b $vr\\i, $vr\\j, 0 \n\t" + " .endif \n\t" + " .endr \n\t" + " .endif \n\t" + ".endr \n\t" + ".endif \n\t" + : [out] "=f" (out) : [in] "f" (in) + ); + return out; +} +// Convert __m256i high part to __m128i +static inline __m128i lasx_extracti128_hi(__m256i in) { + __m128i out; + __asm__ volatile ( + ".irp i," __ALL_REGS "\n\t" + " .ifc %[out], " VREGS_PREFIX "\\i \n\t" + " .irp j," __ALL_REGS "\n\t" + " .ifc %[in], " XREGS_PREFIX "\\j \n\t" + " xvpermi.q $xr\\i, $xr\\j, 0x11 \n\t" + " .endif \n\t" + " .endr \n\t" + " .endif \n\t" + ".endr \n\t" + : [out] "=f" (out) : [in] "f" (in) + ); + return out; +} + +static __m256i lasx_set_w(int e7, int e6, int e5, int e4, int e3, int e2, int e1, int e0) { + v8i32 __ret = {e0, e1, e2, e3, e4, e5, e6, e7}; + return (__m256i)__ret; +} + +static __m128i lsx_set_w(int32_t a, int32_t b, int32_t c, int32_t d) { + v4i32 __ret = {d, c, b, a}; + return (__m128i)__ret; +} + +static __m256i lasx_set_d(int64_t a, int64_t b, int64_t c, int64_t d) { + v4i64 __ret = {d, c, b, a}; + return (__m256i)__ret; +} + +static __m256i lasx_insertf128( __m128i x, __m128i y) { + return lasx_set_q(x, y); +} + +static __m128i lsx_shuffle_b(__m128i a, __m128i b) { + __m128i mask_f, zero, tmp0, tmp2, mask; + int f = 0x8f; + mask_f = __lsx_vreplgr2vr_b(f); + zero = __lsx_vldi(0); + tmp0 = __lsx_vand_v(b, mask_f); // get mask with low 4 bit and sign bits + tmp0 = __lsx_vori_b(tmp0, 0x10); // make each mask or with 0x10 prepare for positive + mask = 
__lsx_vsle_b(zero, tmp0); // if mask >= 0, set mask + tmp2 = __lsx_vand_v(tmp0, mask); // maskout the in2 < ones + return __lsx_vshuf_b(a, zero, tmp2); +} + +static __m256i lasx_shuffle_b(__m256i a, __m256i b) { + __m256i mask_f, zero, tmp0, tmp2, mask; + int f = 0x8f; + mask_f = __lasx_xvreplgr2vr_b(f); + zero = __lasx_xvldi(0); + tmp0 = __lasx_xvand_v(b, mask_f); // get mask with low 4 bit and sign bits + tmp0 = __lasx_xvori_b(tmp0, 0x10); // make each mask or with 0x10 prepare for positive + mask = __lasx_xvsle_b(zero, tmp0); // if mask >= 0, set mask + tmp2 = __lasx_xvand_v(tmp0, mask); // maskout the in2 < ones + return __lasx_xvshuf_b(a, zero, tmp2); +} + +static __m256i lasx_extu8_16(__m128i a) { + __m128i zero = __lsx_vldi(0); + __m128i vlo = __lsx_vilvl_b(zero, a); + __m128i vhi = __lsx_vilvh_b(zero, a); + return lasx_set_q(vhi, vlo); +} + +static __m256i lasx_ext8_16(__m128i a) { + __m128i sign = __lsx_vslti_b(a, 0); + __m128i vlo = __lsx_vilvl_b(sign, a); + __m128i vhi = __lsx_vilvh_b(sign, a); + return lasx_set_q(vhi, vlo); +} + +static __m256i lasx_ext16_32(__m128i a) { + __m256i tmp1; + tmp1 = __lasx_xvinsgr2vr_w(tmp1, __lsx_vpickve2gr_h(a, 0), 0); + tmp1 = __lasx_xvinsgr2vr_w(tmp1, __lsx_vpickve2gr_h(a, 1), 1); + tmp1 = __lasx_xvinsgr2vr_w(tmp1, __lsx_vpickve2gr_h(a, 2), 2); + tmp1 = __lasx_xvinsgr2vr_w(tmp1, __lsx_vpickve2gr_h(a, 3), 3); + tmp1 = __lasx_xvinsgr2vr_w(tmp1, __lsx_vpickve2gr_h(a, 4), 4); + tmp1 = __lasx_xvinsgr2vr_w(tmp1, __lsx_vpickve2gr_h(a, 5), 5); + tmp1 = __lasx_xvinsgr2vr_w(tmp1, __lsx_vpickve2gr_h(a, 6), 6); + tmp1 = __lasx_xvinsgr2vr_w(tmp1, __lsx_vpickve2gr_h(a, 7), 7); + return tmp1; +} + +static __m128i lasx_extracti128( __m256i a, int pos) { + __m128i ret; + if( pos == 0) + { + ret = lasx_extracti128_lo(a); + } else { + ret = lasx_extracti128_hi(a); + } + return ret; +} + +static __m128 lasx_extractf128( __m256 a, int pos) { + __m128 ret; + if( pos == 0) + { + ret = (__m128)lasx_extracti128_lo((__m256i)a); + } else { + ret 
= (__m128)lasx_extracti128_hi((__m256i)a); + } + return ret; +} + +static __m128i lsx_hadd_h(__m128i a, __m128i b) { + __m128i tmp1 = __lsx_vpickev_h(b, a); + __m128i tmp2 = __lsx_vpickod_h(b, a); + return __lsx_vadd_h(tmp1, tmp2); +} + +static __m128i lsx_hadd_w(__m128i a, __m128i b) { + __m128i tmp1 = __lsx_vpickev_w(b, a); + __m128i tmp2 = __lsx_vpickod_w(b, a); + return __lsx_vadd_w(tmp1, tmp2); +} + +static __m128 lsx_hadd_s(__m128 a, __m128 b) { + __m128 tmp1 = (__m128)__lsx_vpickev_w((__m128i)b, (__m128i)a); + __m128 tmp2 = (__m128)__lsx_vpickod_w((__m128i)b, (__m128i)a); + + return __lsx_vfadd_s(tmp1, tmp2); +} + +static __m256i lasx_maddubs_h(__m256i a, __m256i b) { + __m256i tmp1, tmp2; + tmp1 = __lasx_xvmulwev_h_b(a, b); + tmp2 = __lasx_xvmulwod_h_b(a, b); + return __lasx_xvsadd_h(tmp1, tmp2); +} + +static __m256i lasx_madd_h(__m256i a, __m256i b) { + __m256i tmp1, tmp2; + tmp1 = __lasx_xvmulwev_w_h(a, b); + tmp2 = __lasx_xvmulwod_w_h(a, b); + return __lasx_xvadd_w(tmp1, tmp2); +} + +static __m256i lasx_packs_w(__m256i a, __m256i b) { + __m256i tmp, tmp1; + tmp = __lasx_xvsat_w(a, 15); + tmp1 = __lasx_xvsat_w(b, 15); + return __lasx_xvpickev_h(tmp1, tmp); +} + +static __m256i lasx_packs_h(__m256i a, __m256i b) { + __m256i tmp, tmp1; + tmp = __lasx_xvsat_h(a, 7); + tmp1 = __lasx_xvsat_h(b, 7); + return __lasx_xvpickev_b(tmp1, tmp); +} + +static __m128i lsx_packs_w(__m128i a, __m128i b) { + __m128i tmp, tmp1; + tmp = __lsx_vsat_w(a, 15); + tmp1 = __lsx_vsat_w(b, 15); + return __lsx_vpickev_h(tmp1, tmp); +} + +static __m128i lsx_packs_h(__m128i a, __m128i b) { + __m128i tmp, tmp1; + tmp = __lsx_vsat_h(a, 7); + tmp1 = __lsx_vsat_h(b, 7); + return __lsx_vpickev_b(tmp1, tmp); +} + +static __m128i lsx_packus_h(__m128i a, __m128i b) { + __m128i tmp, tmp1; + tmp = __lsx_vsat_hu(a, 7); + tmp1 = __lsx_vsat_hu(b, 7); + return __lsx_vpickev_b(tmp1, tmp); +} + + +static __m128i lsx_maddubs_h(__m128i a, __m128i b) { + __m128i tmp1, tmp2; + tmp1 = __lsx_vmulwev_h_b(a, 
b); + tmp2 = __lsx_vmulwod_h_b(a, b); + return __lsx_vsadd_h(tmp1, tmp2); +} + +static __m128i lsx_madd_h(__m128i a, __m128i b) { + __m128i tmp1, tmp2; + tmp1 = __lsx_vmulwev_w_h(a, b); + tmp2 = __lsx_vmulwod_w_h(a, b); + return __lsx_vadd_w(tmp1, tmp2); +} + +// multiply int8_t, add results pairwise twice +static inline __m128i mul_sum_i8_pairs(const __m128i x, const __m128i y) { + // Get absolute values of x vectors + const __m128i ax = __lsx_vsigncov_b(x, x); + // Sign the values of the y vectors + const __m128i sy = __lsx_vsigncov_b(x, y); + // Perform multiplication and create 16-bit values + const __m128i dot = lsx_maddubs_h(ax, sy); + const __m128i ones = __lsx_vreplgr2vr_h(1); + return lsx_madd_h(ones, dot); +} + +// horizontally add 8 floats +static inline float hsum_float_8(const __m256 x) { + __m128 res = lasx_extractf128(x, 1); + ft_union tmp; + res = __lsx_vfadd_s(res, lasx_extractf128(x, 0)); + res = __lsx_vfadd_s(res, (__m128)__lsx_vpickod_d((__m128i)res, (__m128i)res)); + res = __lsx_vfadd_s(res, (__m128)__lsx_vinsgr2vr_w(__lsx_vldi(0), __lsx_vpickve2gr_w(res, 1), 0)); + tmp.i = __lsx_vpickve2gr_w(res, 0); + return tmp.f; +} + +// horizontally add 8 int32_t +static inline int hsum_i32_8(const __m256i a) { + + __m256i tmp1 = __lasx_xvpermi_q(a, a, 0x11); + __m256i tmp2 = __lasx_xvpermi_q(a, a, 0x00); + + __m128i tmp1_128 = lasx_extracti128_lo(tmp1); + __m128i tmp2_128 = lasx_extracti128_lo(tmp2); + + __m128i sum128 = __lsx_vadd_w(tmp1_128, tmp2_128); + + __m128i ev = __lsx_vpickev_w(sum128, sum128); + __m128i od = __lsx_vpickod_w(sum128, sum128); + __m128i sum64 = __lsx_vadd_w(ev, od); + + int sum64_1, sum64_2; + sum64_1 = __lsx_vpickve2gr_w(sum64, 0); + sum64_2 = __lsx_vpickve2gr_w(sum64, 1); + + return sum64_1 + sum64_2; +} + +// horizontally add 4 int32_t +static inline int hsum_i32_4(const __m128i a) { + __m128i ev = __lsx_vpickev_w(a, a); + __m128i od = __lsx_vpickod_w(a, a); + __m128i sum64 = __lsx_vadd_w(ev, od); + + int sum64_1, sum64_2; + 
sum64_1 = __lsx_vpickve2gr_w(sum64, 0); + sum64_2 = __lsx_vpickve2gr_w(sum64, 1); + + return sum64_1 + sum64_2; +} + +// spread 32 bits to 32 bytes { 0x00, 0xFF } +static inline __m256i bytes_from_bits_32(const uint8_t * x) { + + uint32_t x32; + memcpy(&x32, x, sizeof(uint32_t)); + const __m256i shuf_mask = lasx_set_d( + 0x0303030303030303, 0x0202020202020202, + 0x0101010101010101, 0x0000000000000000); + + __m256i bytes = lasx_shuffle_b(__lasx_xvreplgr2vr_w(x32), shuf_mask); + const __m256i bit_mask = __lasx_xvreplgr2vr_d(0x7fbfdfeff7fbfdfe); + bytes = __lasx_xvor_v(bytes, bit_mask); + return __lasx_xvseq_b(bytes, __lasx_xvreplgr2vr_d(-1)); +} + +// Unpack 32 4-bit fields into 32 bytes +// The output vector contains 32 bytes, each one in [ 0 .. 15 ] interval +static inline __m256i bytes_from_nibbles_32(const uint8_t * rsi) { + const __m128i lo = __lsx_vld((const __m128i *)rsi, 0); + __m128i hi = __lsx_vsrli_h(lo, 4); + return __lasx_xvandi_b(lasx_insertf128(hi, lo), 0xf); +} + +// add int16_t pairwise and return as float vector +static inline __m256 sum_i16_pairs_float(const __m256i x) { + __m256i v = __lasx_xvpackod_h(x, x); + __m256i summed_pairs = __lasx_xvaddwev_w_h(x, v); + return __lasx_xvffint_s_w(summed_pairs); +} + +static inline __m256 mul_sum_us8_pairs_float(const __m256i ax, const __m256i sy) { + // Perform multiplication and create 16-bit values + const __m256i dot = lasx_maddubs_h(ax, sy); + return sum_i16_pairs_float(dot); +} + +// multiply int8_t, add results pairwise twice and return as float vector +static inline __m256 mul_sum_i8_pairs_float(const __m256i x, const __m256i y) { + + // Get absolute values of x vectors + const __m256i ax = __lasx_xvsigncov_b(x, x); + // Sign the values of the y vectors + const __m256i sy = __lasx_xvsigncov_b(x, y); + + return mul_sum_us8_pairs_float(ax, sy); +} + +static inline __m128i packNibbles( __m256i bytes ) { + // Move bits within 16-bit lanes from 0000_abcd_0000_efgh into 0000_0000_abcd_efgh + const __m256i 
lowByte = __lasx_xvreplgr2vr_h(0xFF); + __m256i high = __lasx_xvandn_v(lowByte, bytes); + __m256i low = __lasx_xvand_v(lowByte, bytes); + high = __lasx_xvsrli_h(high, 4); + bytes = __lasx_xvor_v(low, high); + // Compress uint16_t lanes into bytes + __m128i *r0 = (__m128i *)&bytes; + __m256i tmp_h128 = __lasx_xvpermi_q(bytes, bytes, 0x11); + __m128i *r1 = (__m128i *)&tmp_h128; + + __m128i zero = __lsx_vldi(0); + __m128i tmp, tmp2, tmp3; + + tmp = __lsx_vmax_h(zero, *r0); + tmp2 = __lsx_vsat_hu(tmp, 7); + + tmp = __lsx_vmax_h(zero, *r1); + tmp3 = __lsx_vsat_hu(tmp, 7); + return __lsx_vpickev_b(tmp3, tmp2); +} +#endif //__loongarch_asx + // reference implementation for deterministic creation of model files void quantize_row_q4_0_reference(const float * restrict x, block_q4_0 * restrict y, int64_t k) { static const int qk = QK4_0; @@ -637,6 +1046,102 @@ void quantize_row_q8_0(const float * restrict x, void * restrict vy, int64_t k) // store result __riscv_vse8_v_i8m1(y[i].qs , vs, vl); } + +#elif defined(__POWER9_VECTOR__) + for (int i = 0; i < nb; i++) { + vector float srcv [8]; + vector float asrcv[8]; + vector float amaxv[8]; + vector signed int vi[8]; + + for (int j = 0; j < 8; j++) srcv[j] = vec_xl(0, x + i*32 + 4*j); + for (int j = 0; j < 8; j++) asrcv[j] = vec_abs(srcv[j]); + + for (int j = 0; j < 4; j++) amaxv[2*j] = vec_max(asrcv[2*j], asrcv[2*j+1]); + for (int j = 0; j < 2; j++) amaxv[4*j] = vec_max(amaxv[4*j], amaxv[4*j+2]); + for (int j = 0; j < 1; j++) amaxv[8*j] = vec_max(amaxv[8*j], amaxv[8*j+4]); + + const float amax = MAX(MAX(vec_extract(amaxv[0], 0), + vec_extract(amaxv[0], 1)), + MAX(vec_extract(amaxv[0], 2), + vec_extract(amaxv[0], 3))); + + const float d = amax / ((1 << 7) - 1); + const float id = d ? 
1.0f/d : 0.0f; + const vector float vid = vec_splats(id); + + y[i].d = LM_GGML_FP32_TO_FP16(d); + + for (int j = 0; j < 8; j++) { + const vector float v = vec_round(vec_mul(srcv[j], vid)); + vi[j] = vec_cts(v, 0); + } + vec_xst(vec_pack(vec_pack(vi[0], vi[1]), vec_pack(vi[2], vi[3])), 0, &y[i].qs[0]); + vec_xst(vec_pack(vec_pack(vi[4], vi[5]), vec_pack(vi[6], vi[7])), 16, &y[i].qs[0]); + +#elif defined(__loongarch_asx) + for (int i = 0; i < nb; i++) { + ft_union fi; + __m256 v0 = (__m256)__lasx_xvld( x , 0); + __m256 v1 = (__m256)__lasx_xvld( x , 32); + __m256 v2 = (__m256)__lasx_xvld( x , 64); + __m256 v3 = (__m256)__lasx_xvld( x , 96); + x += 32; + + // Compute max(abs(e)) for the block + const __m256 sign_bit = __lasx_xvreplfr2vr_s( -0.0f ); + __m256 max_abs = (__m256)__lasx_xvandn_v( (__m256i)sign_bit, (__m256i)v0 ); + max_abs = __lasx_xvfmax_s( max_abs, (__m256)__lasx_xvandn_v( (__m256i)sign_bit, (__m256i)v1 ) ); + max_abs = __lasx_xvfmax_s( max_abs, (__m256)__lasx_xvandn_v( (__m256i)sign_bit, (__m256i)v2 ) ); + max_abs = __lasx_xvfmax_s( max_abs, (__m256)__lasx_xvandn_v( (__m256i)sign_bit, (__m256i)v3 ) ); + + __m128 max4 = __lsx_vfmax_s( lasx_extractf128( max_abs, 1 ), lasx_extractf128( max_abs , 0) ); + max4 = __lsx_vfmax_s( max4, (__m128)__lsx_vpickod_d((__m128i) max4, (__m128i)max4 ) ); + __m128 tmp = max4; + max4 = __lsx_vfmax_s( max4, (__m128)__lsx_vinsgr2vr_w(tmp, __lsx_vpickve2gr_w( max4, 1 ), 0 )); + fi.i = __lsx_vpickve2gr_w( (__m128i)max4, 0 ); + const float max_scalar = fi.f; + + // Quantize these floats + const float d = max_scalar / 127.f; + y[i].d = LM_GGML_FP32_TO_FP16(d); + const float id = ( max_scalar != 0.0f ) ? 
127.f / max_scalar : 0.0f; + const __m256 mul = (__m256)__lasx_xvreplfr2vr_s( id ); + + // Apply the multiplier + v0 = __lasx_xvfmul_s( v0, mul ); + v1 = __lasx_xvfmul_s( v1, mul ); + v2 = __lasx_xvfmul_s( v2, mul ); + v3 = __lasx_xvfmul_s( v3, mul ); + + // Round to nearest integer + __m256i i0 = __lasx_xvftintrne_w_s( v0 ); + __m256i i1 = __lasx_xvftintrne_w_s( v1 ); + __m256i i2 = __lasx_xvftintrne_w_s( v2 ); + __m256i i3 = __lasx_xvftintrne_w_s( v3 ); + + __m128i ni0 = lasx_extracti128( i0, 0 ); + __m128i ni1 = lasx_extracti128( i0, 1); + __m128i ni2 = lasx_extracti128( i1, 0); + __m128i ni3 = lasx_extracti128( i1, 1); + __m128i ni4 = lasx_extracti128( i2, 0); + __m128i ni5 = lasx_extracti128( i2, 1); + __m128i ni6 = lasx_extracti128( i3, 0); + __m128i ni7 = lasx_extracti128( i3, 1); + + // Convert int32 to int16 + ni0 = lsx_packs_w( ni0, ni1 ); + ni2 = lsx_packs_w( ni2, ni3 ); + ni4 = lsx_packs_w( ni4, ni5 ); + ni6 = lsx_packs_w( ni6, ni7 ); + // Convert int16 to int8 + ni0 = lsx_packs_h( ni0, ni2 ); + ni4 = lsx_packs_h( ni4, ni6 ); + + __lsx_vst(ni0, (__m128i *)(y[i].qs + 0), 0); + __lsx_vst(ni4, (__m128i *)(y[i].qs + 16), 0); + + } #else LM_GGML_UNUSED(nb); // scalar @@ -784,12 +1289,12 @@ void quantize_row_q8_1(const float * restrict x, void * restrict vy, int64_t k) __m128 max4 = _mm_max_ps( _mm256_extractf128_ps( maxAbs, 1 ), _mm256_castps256_ps128( maxAbs ) ); max4 = _mm_max_ps( max4, _mm_movehl_ps( max4, max4 ) ); max4 = _mm_max_ss( max4, _mm_movehdup_ps( max4 ) ); - const float maxScalar = _mm_cvtss_f32( max4 ); + const float max_scalar = _mm_cvtss_f32( max4 ); // Quantize these floats - const float d = maxScalar / 127.f; + const float d = max_scalar / 127.f; y[i].d = LM_GGML_FP32_TO_FP16(d); - const float id = ( maxScalar != 0.0f ) ? 127.f / maxScalar : 0.0f; + const float id = ( max_scalar != 0.0f ) ? 
127.f / max_scalar : 0.0f; const __m256 mul = _mm256_set1_ps( id ); // Apply the multiplier @@ -892,6 +1397,114 @@ void quantize_row_q8_1(const float * restrict x, void * restrict vy, int64_t k) int sum = __riscv_vmv_x_s_i16m1_i16(vwrs); y[i].s = LM_GGML_FP32_TO_FP16(sum*d); } + +#elif defined(__POWER9_VECTOR__) + for (int i = 0; i < nb; i++) { + vector float srcv [8]; + vector float asrcv[8]; + vector float amaxv[8]; + vector signed int vi[8]; + + for (int j = 0; j < 8; j++) srcv[j] = vec_xl(0, x + i*32 + 4*j); + for (int j = 0; j < 8; j++) asrcv[j] = vec_abs(srcv[j]); + + for (int j = 0; j < 4; j++) amaxv[2*j] = vec_max(asrcv[2*j], asrcv[2*j+1]); + for (int j = 0; j < 2; j++) amaxv[4*j] = vec_max(amaxv[4*j], amaxv[4*j+2]); + for (int j = 0; j < 1; j++) amaxv[8*j] = vec_max(amaxv[8*j], amaxv[8*j+4]); + + const float amax = MAX(MAX(vec_extract(amaxv[0], 0), + vec_extract(amaxv[0], 1)), + MAX(vec_extract(amaxv[0], 2), + vec_extract(amaxv[0], 3))); + + const float d = amax / ((1 << 7) - 1); + const float id = d ? 
1.0f/d : 0.0f; + const vector float vid = vec_splats(id); + + y[i].d = LM_GGML_FP32_TO_FP16(d); + + vector int accv = vec_splats(0); + + for (int j = 0; j < 8; j++) { + const vector float v = vec_round(vec_mul(srcv[j], vid)); + vi[j] = vec_cts(v, 0); + + accv = vec_add(accv, vi[j]); + } + vec_xst(vec_pack(vec_pack(vi[0], vi[1]), vec_pack(vi[2], vi[3])), 0, &y[i].qs[0]); + vec_xst(vec_pack(vec_pack(vi[4], vi[5]), vec_pack(vi[6], vi[7])), 16, &y[i].qs[0]); + + accv = vec_add(accv, vec_sld(accv, accv, 4)); + accv = vec_add(accv, vec_sld(accv, accv, 8)); + y[i].s = LM_GGML_FP32_TO_FP16(d * vec_extract(accv, 0)); + +#elif defined(__loongarch_asx) + for (int i = 0; i < nb; i++) { + ft_union ft; + __m256 v0 = (__m256)__lasx_xvld( x , 0 ); + __m256 v1 = (__m256)__lasx_xvld( x , 32 ); + __m256 v2 = (__m256)__lasx_xvld( x , 64 ); + __m256 v3 = (__m256)__lasx_xvld( x , 96 ); + x += 32; + + // Compute max(abs(e)) for the block + const __m256 sign_bit = __lasx_xvreplfr2vr_s( -0.0f ); + __m256 max_abs = (__m256)__lasx_xvandn_v( (__m256i)sign_bit, (__m256i)v0 ); + max_abs = __lasx_xvfmax_s( max_abs, (__m256)__lasx_xvandn_v( (__m256i)sign_bit, (__m256i)v1 ) ); + max_abs = __lasx_xvfmax_s( max_abs, (__m256)__lasx_xvandn_v( (__m256i)sign_bit, (__m256i)v2 ) ); + max_abs = __lasx_xvfmax_s( max_abs, (__m256)__lasx_xvandn_v( (__m256i)sign_bit, (__m256i)v3 ) ); + + __m128 max4 = __lsx_vfmax_s( lasx_extractf128( max_abs, 1 ), lasx_extractf128( max_abs, 0) ); + max4 = __lsx_vfmax_s( max4, (__m128)__lsx_vpickod_d((__m128i) max4, (__m128i)max4 ) ); + __m128 tmp = max4; + max4 = __lsx_vfmax_s( max4, (__m128)__lsx_vextrins_w((__m128i)tmp, (__m128i)max4, 0x10 )); + ft.i = __lsx_vpickve2gr_w( (__m128i)max4, 0 ); + const float max_scalar = ft.f; + + // Quantize these floats + const float d = max_scalar / 127.f; + y[i].d = LM_GGML_FP32_TO_FP16(d); + const float id = ( max_scalar != 0.0f ) ? 
127.f / max_scalar : 0.0f; + const __m256 mul = __lasx_xvreplfr2vr_s( id ); + + // Apply the multiplier + v0 = __lasx_xvfmul_s( v0, mul ); + v1 = __lasx_xvfmul_s( v1, mul ); + v2 = __lasx_xvfmul_s( v2, mul ); + v3 = __lasx_xvfmul_s( v3, mul ); + + // Round to nearest integer + __m256i i0 = __lasx_xvftintrne_w_s( v0 ); + __m256i i1 = __lasx_xvftintrne_w_s( v1 ); + __m256i i2 = __lasx_xvftintrne_w_s( v2 ); + __m256i i3 = __lasx_xvftintrne_w_s( v3 ); + + __m128i ni0 = lasx_extracti128(i0, 0); + __m128i ni1 = lasx_extracti128( i0, 1); + __m128i ni2 = lasx_extracti128( i1, 0); + __m128i ni3 = lasx_extracti128( i1, 1); + __m128i ni4 = lasx_extracti128( i2, 0 ); + __m128i ni5 = lasx_extracti128( i2, 1); + __m128i ni6 = lasx_extracti128( i3, 0); + __m128i ni7 = lasx_extracti128( i3, 1); + + // Compute the sum of the quants and set y[i].s + const __m128i s0 = __lsx_vadd_w(__lsx_vadd_w(ni0, ni1), __lsx_vadd_w(ni2, ni3)); + const __m128i s1 = __lsx_vadd_w(__lsx_vadd_w(ni4, ni5), __lsx_vadd_w(ni6, ni7)); + y[i].s = LM_GGML_FP32_TO_FP16(d * hsum_i32_4(__lsx_vadd_w(s0, s1))); + + // Convert int32 to int16 + ni0 = lsx_packs_w( ni0, ni1 ); + ni2 = lsx_packs_w( ni2, ni3 ); + ni4 = lsx_packs_w( ni4, ni5 ); + ni6 = lsx_packs_w( ni6, ni7 ); + // Convert int16 to int8 + ni0 = lsx_packs_h( ni0, ni2 ); + ni4 = lsx_packs_h( ni4, ni6 ); + + __lsx_vst(ni0, (__m128i *)(y[i].qs + 0), 0); + __lsx_vst(ni4, (__m128i *)(y[i].qs + 16), 0); + } #else LM_GGML_UNUSED(nb); // scalar @@ -1031,7 +1644,7 @@ static float make_qx_quants(int n, int nmax, const float * restrict x, int8_t * float ax = fabsf(x[i]); if (ax > amax) { amax = ax; max = x[i]; } } - if (amax < 1e-30f) { // all zero + if (amax < GROUP_MAX_EPS) { // all zero for (int i = 0; i < n; ++i) { L[i] = 0; } @@ -1065,7 +1678,7 @@ static float make_qx_quants(int n, int nmax, const float * restrict x, int8_t * sumlx += w*x[i]*l; suml2 += w*l*l; } - float scale = sumlx/suml2; + float scale = suml2 ? 
sumlx/suml2 : 0.0f; if (return_early) return suml2 > 0 ? 0.5f*(scale + 1/iscale) : 1/iscale; float best = scale * sumlx; for (int is = -9; is <= 9; ++is) { @@ -1099,7 +1712,7 @@ static float make_q3_quants(int n, int nmax, const float * restrict x, int8_t * float ax = fabsf(x[i]); if (ax > amax) { amax = ax; max = x[i]; } } - if (!amax) { // all zero + if (amax < GROUP_MAX_EPS) { // all zero for (int i = 0; i < n; ++i) { L[i] = 0; } return 0.f; } @@ -1275,7 +1888,6 @@ static float make_qkx2_quants(int n, int nmax, const float * restrict x, const f return scale; } -#if QK_K == 256 static inline void get_scale_min_k4(int j, const uint8_t * restrict q, uint8_t * restrict d, uint8_t * restrict m) { if (j < 4) { *d = q[j] & 63; *m = q[j + 4] & 63; @@ -1284,7 +1896,6 @@ static inline void get_scale_min_k4(int j, const uint8_t * restrict q, uint8_t * *m = (q[j+4] >> 4) | ((q[j-0] >> 6) << 4); } } -#endif //========================- 2-bit (de)-quantization @@ -1348,20 +1959,13 @@ void quantize_row_q2_K_reference(const float * restrict x, block_q2_K * restrict } } -#if QK_K == 256 for (int j = 0; j < QK_K; j += 128) { for (int l = 0; l < 32; ++l) { y[i].qs[j/4 + l] = L[j + l] | (L[j + l + 32] << 2) | (L[j + l + 64] << 4) | (L[j + l + 96] << 6); } } -#else - for (int l = 0; l < 16; ++l) { - y[i].qs[l] = L[l] | (L[l + 16] << 2) | (L[l + 32] << 4) | (L[l + 48] << 6); - } -#endif x += QK_K; - } } @@ -1376,7 +1980,6 @@ void dequantize_row_q2_K(const block_q2_K * restrict x, float * restrict y, int6 const uint8_t * q = x[i].qs; -#if QK_K == 256 int is = 0; float dl, ml; for (int n = 0; n < QK_K; n += 128) { @@ -1395,19 +1998,6 @@ void dequantize_row_q2_K(const block_q2_K * restrict x, float * restrict y, int6 } q += 32; } -#else - float dl1 = d * (x[i].scales[0] & 0xF), ml1 = min * (x[i].scales[0] >> 4); - float dl2 = d * (x[i].scales[1] & 0xF), ml2 = min * (x[i].scales[1] >> 4); - float dl3 = d * (x[i].scales[2] & 0xF), ml3 = min * (x[i].scales[2] >> 4); - float dl4 = d * 
(x[i].scales[3] & 0xF), ml4 = min * (x[i].scales[3] >> 4); - for (int l = 0; l < 16; ++l) { - y[l+ 0] = dl1 * ((int8_t)((q[l] >> 0) & 3)) - ml1; - y[l+16] = dl2 * ((int8_t)((q[l] >> 2) & 3)) - ml2; - y[l+32] = dl3 * ((int8_t)((q[l] >> 4) & 3)) - ml3; - y[l+48] = dl4 * ((int8_t)((q[l] >> 6) & 3)) - ml4; - } - y += QK_K; -#endif } } @@ -1568,7 +2158,7 @@ static float make_qp_quants(int n, int nmax, const float * restrict x, uint8_t * break; } } - return sumlx / suml2; + return sumlx/suml2; } static void quantize_row_q2_K_impl(const float * restrict x, block_q2_K * restrict y, int k, const float * restrict quant_weights) { @@ -1598,36 +2188,9 @@ static void quantize_row_q2_K_impl(const float * restrict x, block_q2_K * restri } float dm, mm; -#if QK_K == 64 - float max_scale = 0, max_min = 0; - for (int j = 0; j < QK_K/16; ++j) { - max_scale = MAX(max_scale, scales[j]); - max_min = MAX(max_min, mins[j]); - } - dm = max_scale/15; - mm = max_min/15; - if (max_scale) { - float id = 1/dm; - for (int j = 0; j < QK_K/16; ++j) { - int l = nearest_int(id*scales[j]); - Ls[j] = MAX(0, MIN(15, l)); - } - } else { - memset(Ls, 0, QK_K/16); - } - if (max_min) { - float id = 1/mm; - for (int j = 0; j < QK_K/16; ++j) { - int l = nearest_int(id*mins[j]); - Lm[j] = MAX(0, MIN(15, l)); - } - } else { - memset(Lm, 0, QK_K/16); - } -#else dm = make_qp_quants(QK_K/16, 15, scales, Ls, sw); mm = make_qp_quants(QK_K/16, 15, mins, Lm, sw); -#endif + y[i].d = LM_GGML_FP32_TO_FP16(dm); y[i].dmin = LM_GGML_FP32_TO_FP16(mm); dm = LM_GGML_FP16_TO_FP32(y[i].d); @@ -1650,20 +2213,13 @@ static void quantize_row_q2_K_impl(const float * restrict x, block_q2_K * restri } } -#if QK_K == 256 for (int j = 0; j < QK_K; j += 128) { for (int l = 0; l < 32; ++l) { y[i].qs[j/4 + l] = L[j + l] | (L[j + l + 32] << 2) | (L[j + l + 64] << 4) | (L[j + l + 96] << 6); } } -#else - for (int l = 0; l < 16; ++l) { - y[i].qs[l] = L[l] | (L[l + 16] << 2) | (L[l + 32] << 4) | (L[l + 48] << 6); - } -#endif x += QK_K; - } } @@ 
-1704,7 +2260,6 @@ void quantize_row_q3_K_reference(const float * restrict x, block_q3_K * restrict } } -#if QK_K == 256 memset(y[i].scales, 0, 12); if (max_scale) { float iscale = -32.f/max_scale; @@ -1738,36 +2293,6 @@ void quantize_row_q3_K_reference(const float * restrict x, block_q3_K * restrict L[16*j + ii] = l + 4; } } -#else - if (max_scale) { - float iscale = -8.f/max_scale; - for (int j = 0; j < QK_K/16; j+=2) { - int l1 = nearest_int(iscale*scales[j]); - l1 = 8 + MAX(-8, MIN(7, l1)); - int l2 = nearest_int(iscale*scales[j+1]); - l2 = 8 + MAX(-8, MIN(7, l2)); - y[i].scales[j/2] = l1 | (l2 << 4); - } - y[i].d = LM_GGML_FP32_TO_FP16(1/iscale); - } else { - for (int j = 0; j < QK_K/16; j+=2) { - y[i].scales[j/2] = 0; - } - y[i].d = LM_GGML_FP32_TO_FP16(0.f); - } - for (int j = 0; j < QK_K/16; ++j) { - int s = j%2 == 0 ? y[i].scales[j/2] & 0xF : y[i].scales[j/2] >> 4; - float d = LM_GGML_FP16_TO_FP32(y[i].d) * (s - 8); - if (!d) { - continue; - } - for (int ii = 0; ii < 16; ++ii) { - int l = nearest_int(x[16*j + ii]/d); - l = MAX(-4, MIN(3, l)); - L[16*j + ii] = l + 4; - } - } -#endif memset(y[i].hmask, 0, QK_K/8); // We put the high-bit for the 1st 8 quants into bit 0, the next 8 into bit 1, etc. 
@@ -1782,23 +2307,16 @@ void quantize_row_q3_K_reference(const float * restrict x, block_q3_K * restrict m = 0; hm <<= 1; } } -#if QK_K == 256 for (int j = 0; j < QK_K; j += 128) { for (int l = 0; l < 32; ++l) { y[i].qs[j/4 + l] = L[j + l] | (L[j + l + 32] << 2) | (L[j + l + 64] << 4) | (L[j + l + 96] << 6); } } -#else - for (int l = 0; l < 16; ++l) { - y[i].qs[l] = L[l] | (L[l + 16] << 2) | (L[l + 32] << 4) | (L[l + 48] << 6); - } -#endif x += QK_K; } } -#if QK_K == 256 void dequantize_row_q3_K(const block_q3_K * restrict x, float * restrict y, int64_t k) { assert(k % QK_K == 0); const int nb = k / QK_K; @@ -1848,49 +2366,12 @@ void dequantize_row_q3_K(const block_q3_K * restrict x, float * restrict y, int6 } } -#else -void dequantize_row_q3_K(const block_q3_K * restrict x, float * restrict y, int64_t k) { - assert(k % QK_K == 0); - assert(QK_K == 64); - const int nb = k / QK_K; - - for (int i = 0; i < nb; i++) { - - const float d_all = LM_GGML_FP16_TO_FP32(x[i].d); - - const uint8_t * restrict q = x[i].qs; - const uint8_t * restrict hm = x[i].hmask; - - const float d1 = d_all * ((x[i].scales[0] & 0xF) - 8); - const float d2 = d_all * ((x[i].scales[0] >> 4) - 8); - const float d3 = d_all * ((x[i].scales[1] & 0xF) - 8); - const float d4 = d_all * ((x[i].scales[1] >> 4) - 8); - - for (int l=0; l<8; ++l) { - uint8_t h = hm[l]; - y[l+ 0] = d1 * ((int8_t)((q[l+0] >> 0) & 3) - ((h & 0x01) ? 0 : 4)); - y[l+ 8] = d1 * ((int8_t)((q[l+8] >> 0) & 3) - ((h & 0x02) ? 0 : 4)); - y[l+16] = d2 * ((int8_t)((q[l+0] >> 2) & 3) - ((h & 0x04) ? 0 : 4)); - y[l+24] = d2 * ((int8_t)((q[l+8] >> 2) & 3) - ((h & 0x08) ? 0 : 4)); - y[l+32] = d3 * ((int8_t)((q[l+0] >> 4) & 3) - ((h & 0x10) ? 0 : 4)); - y[l+40] = d3 * ((int8_t)((q[l+8] >> 4) & 3) - ((h & 0x20) ? 0 : 4)); - y[l+48] = d4 * ((int8_t)((q[l+0] >> 6) & 3) - ((h & 0x40) ? 0 : 4)); - y[l+56] = d4 * ((int8_t)((q[l+8] >> 6) & 3) - ((h & 0x80) ? 
0 : 4)); - } - y += QK_K; - } -} -#endif void quantize_row_q3_K(const float * restrict x, void * restrict vy, int64_t k) { quantize_row_q3_K_reference(x, vy, k); } static void quantize_row_q3_K_impl(const float * restrict x, block_q3_K * restrict y, int64_t n_per_row, const float * restrict quant_weights) { -#if QK_K != 256 - (void)quant_weights; - quantize_row_q3_K_reference(x, y, n_per_row); -#else assert(n_per_row % QK_K == 0); const int nb = n_per_row / QK_K; @@ -1908,7 +2389,7 @@ static void quantize_row_q3_K_impl(const float * restrict x, block_q3_K * restri for (int j = 0; j < QK_K/16; ++j) { if (quant_weights) { - const float * qw = quant_weights ? quant_weights + QK_K * i + 16*j : NULL; + const float * qw = quant_weights + QK_K * i + 16*j; for (int l = 0; l < 16; ++l) weight[l] = qw[l] * sqrtf(sigma2 + x[16*j+l]*x[16*j+l]); } else { for (int l = 0; l < 16; ++l) weight[l] = x[16*j+l]*x[16*j+l]; @@ -1972,7 +2453,6 @@ static void quantize_row_q3_K_impl(const float * restrict x, block_q3_K * restri x += QK_K; } -#endif } size_t quantize_q3_K(const float * restrict src, void * restrict dst, int64_t nrow, int64_t n_per_row, const float * quant_weights) { @@ -2004,7 +2484,6 @@ void quantize_row_q4_K_reference(const float * restrict x, block_q4_K * restrict float scales[QK_K/32]; for (int i = 0; i < nb; i++) { - float max_scale = 0; // as we are deducting the min, scales are always positive float max_min = 0; for (int j = 0; j < QK_K/32; ++j) { @@ -2024,7 +2503,6 @@ void quantize_row_q4_K_reference(const float * restrict x, block_q4_K * restrict } } -#if QK_K == 256 float inv_scale = max_scale > 0 ? 63.f/max_scale : 0.f; float inv_min = max_min > 0 ? 63.f/max_min : 0.f; for (int j = 0; j < QK_K/32; ++j) { @@ -2056,39 +2534,7 @@ void quantize_row_q4_K_reference(const float * restrict x, block_q4_K * restrict L[32*j + ii] = l; } } -#else - const float s_factor = 15.f; - float inv_scale = max_scale > 0 ? s_factor/max_scale : 0.f; - float inv_min = max_min > 0 ? 
s_factor/max_min : 0.f; - int d1 = nearest_int(inv_scale*scales[0]); - int m1 = nearest_int(inv_min*mins[0]); - int d2 = nearest_int(inv_scale*scales[1]); - int m2 = nearest_int(inv_min*mins[1]); - y[i].scales[0] = d1 | (m1 << 4); - y[i].scales[1] = d2 | (m2 << 4); - y[i].d[0] = LM_GGML_FP32_TO_FP16(max_scale/s_factor); - y[i].d[1] = LM_GGML_FP32_TO_FP16(max_min/s_factor); - float sumlx = 0; - int suml2 = 0; - for (int j = 0; j < QK_K/32; ++j) { - const uint8_t sd = y[i].scales[j] & 0xF; - const uint8_t sm = y[i].scales[j] >> 4; - const float d = LM_GGML_FP16_TO_FP32(y[i].d[0]) * sd; - if (!d) continue; - const float m = LM_GGML_FP16_TO_FP32(y[i].d[1]) * sm; - for (int ii = 0; ii < 32; ++ii) { - int l = nearest_int((x[32*j + ii] + m)/d); - l = MAX(0, MIN(15, l)); - L[32*j + ii] = l; - sumlx += (x[32*j + ii] + m)*l*sd; - suml2 += l*l*sd*sd; - } - } - if (suml2) { - y[i].d[0] = LM_GGML_FP32_TO_FP16(sumlx/suml2); - } -#endif uint8_t * q = y[i].qs; for (int j = 0; j < QK_K; j += 64) { for (int l = 0; l < 32; ++l) q[l] = L[j + l] | (L[j + l + 32] << 4); @@ -2096,7 +2542,6 @@ void quantize_row_q4_K_reference(const float * restrict x, block_q4_K * restrict } x += QK_K; - } } @@ -2105,11 +2550,8 @@ void dequantize_row_q4_K(const block_q4_K * restrict x, float * restrict y, int6 const int nb = k / QK_K; for (int i = 0; i < nb; i++) { - const uint8_t * q = x[i].qs; -#if QK_K == 256 - const float d = LM_GGML_FP16_TO_FP32(x[i].d); const float min = LM_GGML_FP16_TO_FP32(x[i].dmin); @@ -2124,18 +2566,6 @@ void dequantize_row_q4_K(const block_q4_K * restrict x, float * restrict y, int6 for (int l = 0; l < 32; ++l) *y++ = d2 * (q[l] >> 4) - m2; q += 32; is += 2; } -#else - const float dall = LM_GGML_FP16_TO_FP32(x[i].d[0]); - const float mall = LM_GGML_FP16_TO_FP32(x[i].d[1]); - const float d1 = dall * (x[i].scales[0] & 0xF), m1 = mall * (x[i].scales[0] >> 4); - const float d2 = dall * (x[i].scales[1] & 0xF), m2 = mall * (x[i].scales[1] >> 4); - for (int l = 0; l < 32; ++l) { - 
y[l+ 0] = d1 * (q[l] & 0xF) - m1; - y[l+32] = d2 * (q[l] >> 4) - m2; - } - y += QK_K; -#endif - } } @@ -2146,10 +2576,6 @@ void quantize_row_q4_K(const float * restrict x, void * restrict vy, int64_t k) } static void quantize_row_q4_K_impl(const float * restrict x, block_q4_K * restrict y, int64_t n_per_row, const float * quant_weights) { -#if QK_K != 256 - (void)quant_weights; - quantize_row_q4_K_reference(x, y, n_per_row); -#else assert(n_per_row % QK_K == 0); const int64_t nb = n_per_row / QK_K; @@ -2220,7 +2646,6 @@ static void quantize_row_q4_K_impl(const float * restrict x, block_q4_K * restri x += QK_K; } -#endif } size_t quantize_q4_K(const float * restrict src, void * restrict dst, int64_t nrow, int64_t n_per_row, const float * quant_weights) { @@ -2245,21 +2670,13 @@ void quantize_row_q5_K_reference(const float * restrict x, block_q5_K * restrict assert(k % QK_K == 0); const int64_t nb = k / QK_K; -#if QK_K == 256 uint8_t L[QK_K]; float mins[QK_K/32]; float scales[QK_K/32]; float weights[32]; uint8_t Laux[32]; -#else - int8_t L[QK_K]; - float scales[QK_K/16]; -#endif for (int i = 0; i < nb; i++) { - -#if QK_K == 256 - float max_scale = 0; // as we are deducting the min, scales are always positive float max_min = 0; for (int j = 0; j < QK_K/32; ++j) { @@ -2331,55 +2748,8 @@ void quantize_row_q5_K_reference(const float * restrict x, block_q5_K * restrict m1 <<= 2; m2 <<= 2; ql += 32; } -#else - float max_scale = 0, amax = 0; - for (int j = 0; j < QK_K/16; ++j) { - scales[j] = make_qx_quants(16, 16, x + 16*j, L + 16*j, 1, NULL); - float abs_scale = fabsf(scales[j]); - if (abs_scale > amax) { - amax = abs_scale; - max_scale = scales[j]; - } - } - - float iscale = -128.f/max_scale; - for (int j = 0; j < QK_K/16; ++j) { - int l = nearest_int(iscale*scales[j]); - y[i].scales[j] = MAX(-128, MIN(127, l)); - } - y[i].d = LM_GGML_FP32_TO_FP16(1/iscale); - - for (int j = 0; j < QK_K/16; ++j) { - const float d = LM_GGML_FP16_TO_FP32(y[i].d) * y[i].scales[j]; - if (!d) 
continue; - for (int ii = 0; ii < 16; ++ii) { - int l = nearest_int(x[16*j + ii]/d); - l = MAX(-16, MIN(15, l)); - L[16*j + ii] = l + 16; - } - } - - uint8_t * restrict qh = y[i].qh; - uint8_t * restrict ql = y[i].qs; - memset(qh, 0, QK_K/8); - - for (int j = 0; j < 32; ++j) { - int jm = j%8; - int is = j/8; - int l1 = L[j]; - if (l1 > 15) { - l1 -= 16; qh[jm] |= (1 << is); - } - int l2 = L[j + 32]; - if (l2 > 15) { - l2 -= 16; qh[jm] |= (1 << (4 + is)); - } - ql[j] = l1 | (l2 << 4); - } -#endif x += QK_K; - } } @@ -2388,12 +2758,9 @@ void dequantize_row_q5_K(const block_q5_K * restrict x, float * restrict y, int6 const int64_t nb = k / QK_K; for (int i = 0; i < nb; i++) { - const uint8_t * ql = x[i].qs; const uint8_t * qh = x[i].qh; -#if QK_K == 256 - const float d = LM_GGML_FP16_TO_FP32(x[i].d); const float min = LM_GGML_FP16_TO_FP32(x[i].dmin); @@ -2410,21 +2777,6 @@ void dequantize_row_q5_K(const block_q5_K * restrict x, float * restrict y, int6 ql += 32; is += 2; u1 <<= 2; u2 <<= 2; } -#else - float d = LM_GGML_FP16_TO_FP32(x[i].d); - const int8_t * restrict s = x[i].scales; - for (int l = 0; l < 8; ++l) { - y[l+ 0] = d * s[0] * ((ql[l+ 0] & 0xF) - (qh[l] & 0x01 ? 0 : 16)); - y[l+ 8] = d * s[0] * ((ql[l+ 8] & 0xF) - (qh[l] & 0x02 ? 0 : 16)); - y[l+16] = d * s[1] * ((ql[l+16] & 0xF) - (qh[l] & 0x04 ? 0 : 16)); - y[l+24] = d * s[1] * ((ql[l+24] & 0xF) - (qh[l] & 0x08 ? 0 : 16)); - y[l+32] = d * s[2] * ((ql[l+ 0] >> 4) - (qh[l] & 0x10 ? 0 : 16)); - y[l+40] = d * s[2] * ((ql[l+ 8] >> 4) - (qh[l] & 0x20 ? 0 : 16)); - y[l+48] = d * s[3] * ((ql[l+16] >> 4) - (qh[l] & 0x40 ? 0 : 16)); - y[l+56] = d * s[3] * ((ql[l+24] >> 4) - (qh[l] & 0x80 ? 
0 : 16)); - } - y += QK_K; -#endif } } @@ -2435,10 +2787,6 @@ void quantize_row_q5_K(const float * restrict x, void * restrict vy, int64_t k) } static void quantize_row_q5_K_impl(const float * restrict x, block_q5_K * restrict y, int64_t n_per_row, const float * quant_weights) { -#if QK_K != 256 - (void)quant_weights; - quantize_row_q5_K_reference(x, y, n_per_row); -#else assert(n_per_row % QK_K == 0); const int64_t nb = n_per_row / QK_K; @@ -2529,7 +2877,6 @@ static void quantize_row_q5_K_impl(const float * restrict x, block_q5_K * restri x += QK_K; } -#endif } size_t quantize_q5_K(const float * restrict src, void * restrict dst, int64_t nrow, int64_t n_per_row, const float * quant_weights) { @@ -2575,7 +2922,7 @@ void quantize_row_q6_K_reference(const float * restrict x, block_q6_K * restrict } - if (!max_abs_scale) { + if (max_abs_scale < GROUP_MAX_EPS) { memset(&y[i], 0, sizeof(block_q6_K)); y[i].d = LM_GGML_FP32_TO_FP16(0.f); x += QK_K; @@ -2602,7 +2949,6 @@ void quantize_row_q6_K_reference(const float * restrict x, block_q6_K * restrict uint8_t * restrict ql = y[i].ql; uint8_t * restrict qh = y[i].qh; -#if QK_K == 256 for (int j = 0; j < QK_K; j += 128) { for (int l = 0; l < 32; ++l) { const uint8_t q1 = L[j + l + 0] & 0xF; @@ -2616,19 +2962,8 @@ void quantize_row_q6_K_reference(const float * restrict x, block_q6_K * restrict ql += 64; qh += 32; } -#else - for (int l = 0; l < 32; ++l) { - const uint8_t q1 = L[l + 0] & 0xF; - const uint8_t q2 = L[l + 32] & 0xF; - ql[l] = q1 | (q2 << 4); - } - for (int l = 0; l < 16; ++l) { - qh[l] = (L[l] >> 4) | ((L[l + 16] >> 4) << 2) | ((L[l + 32] >> 4) << 4) | ((L[l + 48] >> 4) << 6); - } -#endif x += QK_K; - } } @@ -2637,14 +2972,12 @@ void dequantize_row_q6_K(const block_q6_K * restrict x, float * restrict y, int6 const int64_t nb = k / QK_K; for (int i = 0; i < nb; i++) { - const float d = LM_GGML_FP16_TO_FP32(x[i].d); const uint8_t * restrict ql = x[i].ql; const uint8_t * restrict qh = x[i].qh; const int8_t * restrict 
sc = x[i].scales; -#if QK_K == 256 for (int n = 0; n < QK_K; n += 128) { for (int l = 0; l < 32; ++l) { int is = l/16; @@ -2662,20 +2995,6 @@ void dequantize_row_q6_K(const block_q6_K * restrict x, float * restrict y, int6 qh += 32; sc += 8; } -#else - for (int l = 0; l < 16; ++l) { - const int8_t q1 = (int8_t)((ql[l+ 0] & 0xF) | (((qh[l] >> 0) & 3) << 4)) - 32; - const int8_t q2 = (int8_t)((ql[l+16] & 0xF) | (((qh[l] >> 2) & 3) << 4)) - 32; - const int8_t q3 = (int8_t)((ql[l+ 0] >> 4) | (((qh[l] >> 4) & 3) << 4)) - 32; - const int8_t q4 = (int8_t)((ql[l+16] >> 4) | (((qh[l] >> 6) & 3) << 4)) - 32; - y[l+ 0] = d * sc[0] * q1; - y[l+16] = d * sc[1] * q2; - y[l+32] = d * sc[2] * q3; - y[l+48] = d * sc[3] * q4; - } - y += 64; -#endif - } } @@ -2686,10 +3005,6 @@ void quantize_row_q6_K(const float * restrict x, void * restrict vy, int64_t k) } static void quantize_row_q6_K_impl(const float * restrict x, block_q6_K * restrict y, int64_t n_per_row, const float * quant_weights) { -#if QK_K != 256 - (void)quant_weights; - quantize_row_q6_K_reference(x, y, n_per_row); -#else assert(n_per_row % QK_K == 0); const int64_t nb = n_per_row / QK_K; @@ -2727,7 +3042,7 @@ static void quantize_row_q6_K_impl(const float * restrict x, block_q6_K * restri } - if (!max_abs_scale) { + if (max_abs_scale < GROUP_MAX_EPS) { memset(&y[i], 0, sizeof(block_q6_K)); y[i].d = LM_GGML_FP32_TO_FP16(0.f); x += QK_K; @@ -2771,7 +3086,6 @@ static void quantize_row_q6_K_impl(const float * restrict x, block_q6_K * restri x += QK_K; } -#endif } size_t quantize_q6_K(const float * restrict src, void * restrict dst, int64_t nrow, int64_t n_per_row, const float * quant_weights) { @@ -3188,30 +3502,21 @@ void dequantize_row_iq1_m(const block_iq1_m * restrict x, float * restrict y, in float delta[4]; uint16_t idx[4]; -#if QK_K != 64 iq1m_scale_t scale; -#endif for (int i = 0; i < nb; i++) { const uint16_t * sc = (const uint16_t *)x[i].scales; -#if QK_K == 64 - const float d = LM_GGML_FP16_TO_FP32(x[i].d); 
-#else scale.u16 = (sc[0] >> 12) | ((sc[1] >> 8) & 0x00f0) | ((sc[2] >> 4) & 0x0f00) | (sc[3] & 0xf000); const float d = LM_GGML_FP16_TO_FP32(scale.f16); -#endif + const uint8_t * qs = x[i].qs; const uint8_t * qh = x[i].qh; for (int ib = 0; ib < QK_K/32; ++ib) { -#if QK_K == 64 - const float dl1 = d * (2*((sc[ib/2] >> (8*(ib%2)+0)) & 0xf) + 1); - const float dl2 = d * (2*((sc[ib/2] >> (8*(ib%2)+4)) & 0xf) + 1); -#else const float dl1 = d * (2*((sc[ib/2] >> (6*(ib%2)+0)) & 0x7) + 1); const float dl2 = d * (2*((sc[ib/2] >> (6*(ib%2)+3)) & 0x7) + 1); -#endif + idx[0] = qs[0] | ((qh[0] << 8) & 0x700); idx[1] = qs[1] | ((qh[0] << 4) & 0x700); idx[2] = qs[2] | ((qh[1] << 8) & 0x700); @@ -3262,9 +3567,6 @@ void dequantize_row_iq4_nl(const block_iq4_nl * restrict x, float * restrict y, void dequantize_row_iq4_xs(const block_iq4_xs * restrict x, float * restrict y, int64_t k) { assert(k % QK_K == 0); -#if QK_K == 64 - dequantize_row_iq4_nl((const block_iq4_nl *)x, y, k); -#else const int64_t nb = k / QK_K; for (int i = 0; i < nb; i++) { @@ -3284,7 +3586,6 @@ void dequantize_row_iq4_xs(const block_iq4_xs * restrict x, float * restrict y, qs += 16; } } -#endif } //===================================== Q8_K ============================================== @@ -3386,6 +3687,43 @@ static inline __m128i get_scale_shuffle(int i) { }; return _mm_loadu_si128((const __m128i*)k_shuffle + i); } +#elif defined(__loongarch_asx) +// shuffles to pick the required scales in dot products +static inline __m256i get_scale_shuffle_q3k(int i) { + static const uint8_t k_shuffle[128] = { + 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 2, 3, 2, 3, 2, 3, 2, 3, 2, 3, 2, 3, 2, 3, 2, 3, + 4, 5, 4, 5, 4, 5, 4, 5, 4, 5, 4, 5, 4, 5, 4, 5, 6, 7, 6, 7, 6, 7, 6, 7, 6, 7, 6, 7, 6, 7, 6, 7, + 8, 9, 8, 9, 8, 9, 8, 9, 8, 9, 8, 9, 8, 9, 8, 9, 10,11,10,11,10,11,10,11,10,11,10,11,10,11,10,11, + 12,13,12,13,12,13,12,13,12,13,12,13,12,13,12,13, 14,15,14,15,14,15,14,15,14,15,14,15,14,15,14,15, + }; + return 
__lasx_xvld((const __m256i*)k_shuffle + i, 0); +} +static inline __m256i get_scale_shuffle_k4(int i) { + static const uint8_t k_shuffle[256] = { + 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, + 2, 3, 2, 3, 2, 3, 2, 3, 2, 3, 2, 3, 2, 3, 2, 3, 2, 3, 2, 3, 2, 3, 2, 3, 2, 3, 2, 3, 2, 3, 2, 3, + 4, 5, 4, 5, 4, 5, 4, 5, 4, 5, 4, 5, 4, 5, 4, 5, 4, 5, 4, 5, 4, 5, 4, 5, 4, 5, 4, 5, 4, 5, 4, 5, + 6, 7, 6, 7, 6, 7, 6, 7, 6, 7, 6, 7, 6, 7, 6, 7, 6, 7, 6, 7, 6, 7, 6, 7, 6, 7, 6, 7, 6, 7, 6, 7, + 8, 9, 8, 9, 8, 9, 8, 9, 8, 9, 8, 9, 8, 9, 8, 9, 8, 9, 8, 9, 8, 9, 8, 9, 8, 9, 8, 9, 8, 9, 8, 9, + 10,11,10,11,10,11,10,11,10,11,10,11,10,11,10,11,10,11,10,11,10,11,10,11,10,11,10,11,10,11,10,11, + 12,13,12,13,12,13,12,13,12,13,12,13,12,13,12,13,12,13,12,13,12,13,12,13,12,13,12,13,12,13,12,13, + 14,15,14,15,14,15,14,15,14,15,14,15,14,15,14,15,14,15,14,15,14,15,14,15,14,15,14,15,14,15,14,15 + }; + return __lasx_xvld((const __m256i*)k_shuffle + i, 0); +} +static inline __m128i get_scale_shuffle(int i) { + static const uint8_t k_shuffle[128] = { + 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, + 2, 2, 2, 2, 2, 2, 2, 2, 3, 3, 3, 3, 3, 3, 3, 3, + 4, 4, 4, 4, 4, 4, 4, 4, 5, 5, 5, 5, 5, 5, 5, 5, + 6, 6, 6, 6, 6, 6, 6, 6, 7, 7, 7, 7, 7, 7, 7, 7, + 8, 8, 8, 8, 8, 8, 8, 8, 9, 9, 9, 9, 9, 9, 9, 9, + 10,10,10,10,10,10,10,10, 11,11,11,11,11,11,11,11, + 12,12,12,12,12,12,12,12, 13,13,13,13,13,13,13,13, + 14,14,14,14,14,14,14,14, 15,15,15,15,15,15,15,15 + }; + return __lsx_vld((const __m128i*)k_shuffle + i, 0); +} #endif void lm_ggml_vec_dot_q4_0_q8_0(int n, float * restrict s, size_t bs, const void * restrict vx, size_t bx, const void * restrict vy, size_t by, int nrc) { @@ -3409,10 +3747,9 @@ void lm_ggml_vec_dot_q4_0_q8_0(int n, float * restrict s, size_t bs, const void #if defined(__ARM_FEATURE_MATMUL_INT8) if (nrc == 2) { const block_q4_0 * restrict vx0 = vx; - const block_q4_0 * restrict vx1 = vx + bx; - + const block_q4_0 * restrict vx1 = 
(const block_q4_0 *) ((const uint8_t*)vx + bx); const block_q8_0 * restrict vy0 = vy; - const block_q8_0 * restrict vy1 = vy + by; + const block_q8_0 * restrict vy1 = (const block_q8_0 *) ((const uint8_t*)vy + by); float32x4_t sumv0 = vdupq_n_f32(0.0f); @@ -3446,10 +3783,12 @@ void lm_ggml_vec_dot_q4_0_q8_0(int n, float * restrict s, size_t bs, const void const int8x16_t y1_l = vld1q_s8(b_y1->qs); const int8x16_t y1_h = vld1q_s8(b_y1->qs + 16); - float32x4_t scale = {LM_GGML_FP16_TO_FP32(b_x0->d)*LM_GGML_FP16_TO_FP32(b_y0->d), - LM_GGML_FP16_TO_FP32(b_x0->d)*LM_GGML_FP16_TO_FP32(b_y1->d), - LM_GGML_FP16_TO_FP32(b_x1->d)*LM_GGML_FP16_TO_FP32(b_y0->d), - LM_GGML_FP16_TO_FP32(b_x1->d)*LM_GGML_FP16_TO_FP32(b_y1->d)}; + float32_t _scale[4] = { LM_GGML_FP16_TO_FP32(b_x0->d)*LM_GGML_FP16_TO_FP32(b_y0->d), + LM_GGML_FP16_TO_FP32(b_x0->d)*LM_GGML_FP16_TO_FP32(b_y1->d), + LM_GGML_FP16_TO_FP32(b_x1->d)*LM_GGML_FP16_TO_FP32(b_y0->d), + LM_GGML_FP16_TO_FP32(b_x1->d)*LM_GGML_FP16_TO_FP32(b_y1->d)}; + + float32x4_t scale = vld1q_f32(_scale); int8x16_t l0 = vreinterpretq_s8_s64(vzip1q_s64(vreinterpretq_s64_s8(x0_l), vreinterpretq_s64_s8(x1_l))); int8x16_t l1 = vreinterpretq_s8_s64(vzip2q_s64(vreinterpretq_s64_s8(x0_l), vreinterpretq_s64_s8(x1_l))); @@ -3474,7 +3813,44 @@ void lm_ggml_vec_dot_q4_0_q8_0(int n, float * restrict s, size_t bs, const void return; } #endif -#if defined(__ARM_NEON) +#if defined(__ARM_FEATURE_SVE) + const svbool_t ptrueh = svptrue_pat_b8(SV_VL16); + const svbool_t ptruel = svnot_b_z(svptrue_b8(), ptrueh); + + svfloat32_t sumv0 = svdup_n_f32(0.0f); + svfloat32_t sumv1 = svdup_n_f32(0.0f); + + assert(nb % 2 == 0); // TODO: handle odd nb + + for (int i = 0; i < nb; i += 2) { + const block_q4_0 * restrict x0 = &x[i + 0]; + const block_q4_0 * restrict x1 = &x[i + 1]; + const block_q8_0 * restrict y0 = &y[i + 0]; + const block_q8_0 * restrict y1 = &y[i + 1]; + + // load x + const svuint8_t qx0r = svld1rq_u8(svptrue_b8(), x0->qs); + const svuint8_t qx1r = 
svld1rq_u8(svptrue_b8(), x1->qs); + + // 4-bit -> 8-bit + const svint8_t qx0 = svreinterpret_s8_u8(svlsr_n_u8_m(ptruel, svand_n_u8_m(ptrueh, qx0r, 0x0F), 0x04)); + const svint8_t qx1 = svreinterpret_s8_u8(svlsr_n_u8_m(ptruel, svand_n_u8_m(ptrueh, qx1r, 0x0F), 0x04)); + + // sub 8 + const svint8_t qx0s = svsub_n_s8_x(svptrue_b8(), qx0, 8); + const svint8_t qx1s = svsub_n_s8_x(svptrue_b8(), qx1, 8); + + // load y + const svint8_t qy0 = svld1_s8(svptrue_b8(), y0->qs); + const svint8_t qy1 = svld1_s8(svptrue_b8(), y1->qs); + + // dot product + sumv0 = svmla_n_f32_x(svptrue_b32(), sumv0, svcvt_f32_s32_x(svptrue_b32(), svdot_s32(svdup_n_s32(0), qx0s, qy0)), LM_GGML_FP16_TO_FP32(x0->d)*LM_GGML_FP16_TO_FP32(y0->d)); + sumv1 = svmla_n_f32_x(svptrue_b32(), sumv1, svcvt_f32_s32_x(svptrue_b32(), svdot_s32(svdup_n_s32(0), qx1s, qy1)), LM_GGML_FP16_TO_FP32(x1->d)*LM_GGML_FP16_TO_FP32(y1->d)); + } + + *s = svaddv_f32(svptrue_b32(), svadd_f32_x(svptrue_b32(), sumv0, sumv1)); +#elif defined(__ARM_NEON) float32x4_t sumv0 = vdupq_n_f32(0.0f); float32x4_t sumv1 = vdupq_n_f32(0.0f); @@ -3734,6 +4110,190 @@ void lm_ggml_vec_dot_q4_0_q8_0(int n, float * restrict s, size_t bs, const void } *s = sumf; + +#elif defined(__POWER9_VECTOR__) + const vector signed char lowMask = vec_splats((signed char)0xF); + const vector unsigned char v4 = vec_splats((unsigned char)0x4); + const vector signed char v8 = vec_splats((signed char)0x8); + + vector float vsumf0 = vec_splats(0.0f); + +#pragma GCC unroll 4 + for (int i = 0; i < nb; i++) { + __builtin_prefetch(x[i].qs, 0, 1); + __builtin_prefetch(y[i].qs, 0, 1); + + vector float vxd = vec_splats(LM_GGML_FP16_TO_FP32(x[i].d)); + vector float vyd = vec_splats(LM_GGML_FP16_TO_FP32(y[i].d)); + vector float vd = vec_mul(vxd, vyd); + + vector signed char qxs = (vector signed char)vec_xl( 0, x[i].qs); + vector signed char q8y0 = vec_xl( 0, y[i].qs); + vector signed char q8y1 = vec_xl(16, y[i].qs); + + vector signed char q4x0 = vec_and(qxs, lowMask); + vector 
signed char q4x1 = vec_sr(qxs, v4); + + q4x0 = vec_sub(q4x0, v8); + q4x1 = vec_sub(q4x1, v8); + + vector signed short qv0 = vec_add(vec_mule(q4x0, q8y0), vec_mulo(q4x0, q8y0)); + vector signed short qv1 = vec_add(vec_mule(q4x1, q8y1), vec_mulo(q4x1, q8y1)); + + qv0 = vec_add(qv0, qv1); + + vector signed int vsumi0 = vec_add(vec_unpackh(qv0), vec_unpackl(qv0)); + + vsumf0 = vec_madd(vec_ctf(vsumi0, 0), vd, vsumf0); + } + + vsumf0 = vec_add(vsumf0, vec_sld(vsumf0, vsumf0, 4)); + vsumf0 = vec_add(vsumf0, vec_sld(vsumf0, vsumf0, 8)); + + *s = vec_extract(vsumf0, 0); + +#elif defined(__loongarch_asx) + // Initialize accumulator with zeros + __m256 acc = (__m256)__lasx_xvldi(0); + + // Main loop + for (int i = 0; i < nb; ++i) { + /* Compute combined scale for the block */ + const __m256 d = __lasx_xvreplfr2vr_s( LM_GGML_FP16_TO_FP32(x[i].d) * LM_GGML_FP16_TO_FP32(y[i].d) ); + + __m256i qx = bytes_from_nibbles_32(x[i].qs); + + // Now we have a vector with bytes in [ 0 .. 15 ] interval. Offset them into [ -8 .. +7 ] interval. 
+ const __m256i off = __lasx_xvreplgr2vr_b( 8 ); + qx = __lasx_xvsub_b( qx, off ); + + __m256i qy = __lasx_xvld((const __m256i *)y[i].qs, 0); + + const __m256 q = mul_sum_i8_pairs_float(qx, qy); + + /* Multiply q with scale and accumulate */ + acc = __lasx_xvfmadd_s( d, q, acc ); + } + + *s = hsum_float_8(acc); +#elif defined(__loongarch_sx) + // set constants + const __m128i low_mask = __lsx_vreplgr2vr_b(0xF); + const __m128i off = __lsx_vreplgr2vr_b(8); + + // Initialize accumulator with zeros + __m128 acc_0 = __lsx_vldi(0); + __m128 acc_1 = __lsx_vldi(0); + __m128 acc_2 = __lsx_vldi(0); + __m128 acc_3 = __lsx_vldi(0); + + // First round without accumulation + { + _mm_prefetch(&x[0] + sizeof(block_q4_0), _MM_HINT_T0); + _mm_prefetch(&y[0] + sizeof(block_q8_0), _MM_HINT_T0); + + // Compute combined scale for the block 0 and 1 + const __m128 d_0_1 = __lsx_vreplgr2vr_w( LM_GGML_FP16_TO_FP32(x[0].d) * LM_GGML_FP16_TO_FP32(y[0].d) ); + + const __m128i tmp_0_1 = __lsx_vld((const __m128i *)x[0].qs, 0); + + __m128i bx_0 = __lsx_vand_v(low_mask, tmp_0_1); + __m128i by_0 = __lsx_vld((const __m128i *)y[0].qs, 0); + bx_0 = __lsx_vsub_b(bx_0, off); + const __m128i i32_0 = mul_sum_i8_pairs(bx_0, by_0); + + __m128i bx_1 = __lsx_vand_v(low_mask, __lsx_vsrli_d(tmp_0_1, 4)); + __m128i by_1 = __lsx_vld((const __m128i *)(y[0].qs + 16), 0); + bx_1 = __lsx_vsub_b(bx_1, off); + const __m128i i32_1 = mul_sum_i8_pairs(bx_1, by_1); + + // Compute combined scale for the block 2 and 3 + const __m128 d_2_3 = __lsx_vreplgr2vr_w( LM_GGML_FP16_TO_FP32(x[1].d) * LM_GGML_FP16_TO_FP32(y[1].d) ); + + const __m128i tmp_2_3 = __lsx_vld((const __m128i *)x[1].qs, 0); + + __m128i bx_2 = __lsx_vand_v(low_mask, tmp_2_3); + __m128i by_2 = __lsx_vld((const __m128i *)y[1].qs, 0); + bx_2 = __lsx_vsub_b(bx_2, off); + const __m128i i32_2 = mul_sum_i8_pairs(bx_2, by_2); + + __m128i bx_3 = __lsx_vand_v(low_mask, __lsx_vsrli_d(tmp_2_3, 4)); + __m128i by_3 = __lsx_vld((const __m128i *)(y[1].qs + 16), 0); + bx_3 = 
__lsx_vsub_b(bx_3, off); + const __m128i i32_3 = mul_sum_i8_pairs(bx_3, by_3); + + // Convert int32_t to float + __m128 p0 = __lsx_vffint_s_w(i32_0); + __m128 p1 = __lsx_vffint_s_w(i32_1); + __m128 p2 = __lsx_vffint_s_w(i32_2); + __m128 p3 = __lsx_vffint_s_w(i32_3); + + // Apply the scale + acc_0 = __lsx_vfmul_s( d_0_1, p0 ); + acc_1 = __lsx_vfmul_s( d_0_1, p1 ); + acc_2 = __lsx_vfmul_s( d_2_3, p2 ); + acc_3 = __lsx_vfmul_s( d_2_3, p3 ); + } + + assert(nb % 2 == 0); // TODO: handle odd nb + + // Main loop + for (int i = 2; i < nb; i+=2) { + + // Compute combined scale for the block 0 and 1 + const __m128 d_0_1 = __lsx_vreplgr2vr_w( LM_GGML_FP16_TO_FP32(x[i].d) * LM_GGML_FP16_TO_FP32(y[i].d) ); + + const __m128i tmp_0_1 = __lsx_vld((const __m128i *)x[i].qs, 0); + + __m128i bx_0 = __lsx_vand_v(low_mask, tmp_0_1); + __m128i by_0 = __lsx_vld((const __m128i *)y[i].qs, 0); + bx_0 = __lsx_vsub_b(bx_0, off); + const __m128i i32_0 = mul_sum_i8_pairs(bx_0, by_0); + + __m128i bx_1 = __lsx_vand_v(low_mask, __lsx_vsrli_d(tmp_0_1, 4)); + __m128i by_1 = __lsx_vld((const __m128i *)(y[i].qs + 16), 0); + bx_1 = __lsx_vsub_b(bx_1, off); + const __m128i i32_1 = mul_sum_i8_pairs(bx_1, by_1); + + //_mm_prefetch(&x[i] + 2 * sizeof(block_q4_0), _MM_HINT_T0); + //_mm_prefetch(&y[i] + 2 * sizeof(block_q8_0), _MM_HINT_T0); + + // Compute combined scale for the block 2 and 3 + const __m128 d_2_3 = __lsx_vreplgr2vr_w( LM_GGML_FP16_TO_FP32(x[i + 1].d) * LM_GGML_FP16_TO_FP32(y[i + 1].d) ); + + const __m128i tmp_2_3 = __lsx_vld((const __m128i *)x[i + 1].qs, 0); + + __m128i bx_2 = __lsx_vand_v(low_mask, tmp_2_3); + __m128i by_2 = __lsx_vld((const __m128i *)y[i + 1].qs, 0); + bx_2 = __lsx_vsub_b(bx_2, off); + const __m128i i32_2 = mul_sum_i8_pairs(bx_2, by_2); + + __m128i bx_3 = __lsx_vand_v(low_mask, __lsx_vsrli_d(tmp_2_3, 4)); + __m128i by_3 = __lsx_vld((const __m128i *)(y[i + 1].qs + 16), 0); + bx_3 = __lsx_vsub_b(bx_3, off); + const __m128i i32_3 = mul_sum_i8_pairs(bx_3, by_3); + + // Convert 
int32_t to float + __m128 p0 = __lsx_vffint_s_w(i32_0); + __m128 p1 = __lsx_vffint_s_w(i32_1); + __m128 p2 = __lsx_vffint_s_w(i32_2); + __m128 p3 = __lsx_vffint_s_w(i32_3); + + // Apply the scale + __m128 p0_d = __lsx_vfmul_s( d_0_1, p0 ); + __m128 p1_d = __lsx_vfmul_s( d_0_1, p1 ); + __m128 p2_d = __lsx_vfmul_s( d_2_3, p2 ); + __m128 p3_d = __lsx_vfmul_s( d_2_3, p3 ); + + // Acummulate + acc_0 = __lsx_vfadd_s(p0_d, acc_0); + acc_1 = __lsx_vfadd_s(p1_d, acc_1); + acc_2 = __lsx_vfadd_s(p2_d, acc_2); + acc_3 = __lsx_vfadd_s(p3_d, acc_3); + } + + *s = hsum_float_4x4(acc_0, acc_1, acc_2, acc_3); + #else // scalar float sumf = 0.0; @@ -3776,9 +4336,9 @@ void lm_ggml_vec_dot_q4_1_q8_1(int n, float * restrict s, size_t bs, const void #if defined(__ARM_FEATURE_MATMUL_INT8) if (nrc == 2) { const block_q4_1 * restrict vx0 = vx; - const block_q4_1 * restrict vx1 = vx + bx; + const block_q4_1 * restrict vx1 = (const block_q4_1 *) ((const uint8_t*)vx + bx); const block_q8_1 * restrict vy0 = vy; - const block_q8_1 * restrict vy1 = vy + by; + const block_q8_1 * restrict vy1 = (const block_q8_1 *) ((const uint8_t*)vy + by); float32x4_t sumv0 = vdupq_n_f32(0.0f); float32x4_t summs0 = vdupq_n_f32(0.0f); @@ -3789,11 +4349,11 @@ void lm_ggml_vec_dot_q4_1_q8_1(int n, float * restrict s, size_t bs, const void const block_q8_1 * restrict b_y0 = &vy0[i]; const block_q8_1 * restrict b_y1 = &vy1[i]; - float32x4_t summs_t = {LM_GGML_FP16_TO_FP32(b_x0->m) * LM_GGML_FP16_TO_FP32(b_y0->s), - LM_GGML_FP16_TO_FP32(b_x1->m) * LM_GGML_FP16_TO_FP32(b_y0->s), - LM_GGML_FP16_TO_FP32(b_x0->m) * LM_GGML_FP16_TO_FP32(b_y1->s), - LM_GGML_FP16_TO_FP32(b_x1->m) * LM_GGML_FP16_TO_FP32(b_y1->s)}; - summs0 += summs_t; + float32_t summs_t[4] = {LM_GGML_FP16_TO_FP32(b_x0->m) * LM_GGML_FP16_TO_FP32(b_y0->s), + LM_GGML_FP16_TO_FP32(b_x1->m) * LM_GGML_FP16_TO_FP32(b_y0->s), + LM_GGML_FP16_TO_FP32(b_x0->m) * LM_GGML_FP16_TO_FP32(b_y1->s), + LM_GGML_FP16_TO_FP32(b_x1->m) * LM_GGML_FP16_TO_FP32(b_y1->s)}; + summs0 = 
vaddq_f32(summs0, vld1q_f32(summs_t)); const uint8x16_t m4b = vdupq_n_u8(0x0F); @@ -3813,10 +4373,11 @@ void lm_ggml_vec_dot_q4_1_q8_1(int n, float * restrict s, size_t bs, const void const int8x16_t y1_h = vld1q_s8(b_y1->qs + 16); // mmla into int32x4_t - float32x4_t scale = {LM_GGML_FP16_TO_FP32(b_x0->d)*b_y0->d, - LM_GGML_FP16_TO_FP32(b_x0->d)*b_y1->d, - LM_GGML_FP16_TO_FP32(b_x1->d)*b_y0->d, - LM_GGML_FP16_TO_FP32(b_x1->d)*b_y1->d}; + float32_t _scale[4] = {LM_GGML_FP16_TO_FP32(b_x0->d)*b_y0->d, + LM_GGML_FP16_TO_FP32(b_x0->d)*b_y1->d, + LM_GGML_FP16_TO_FP32(b_x1->d)*b_y0->d, + LM_GGML_FP16_TO_FP32(b_x1->d)*b_y1->d}; + float32x4_t scale = vld1q_f32(_scale); int8x16_t l0 = vreinterpretq_s8_s64(vzip1q_s64(vreinterpretq_s64_s8(x0_l), vreinterpretq_s64_s8(x1_l))); int8x16_t l1 = vreinterpretq_s8_s64(vzip2q_s64(vreinterpretq_s64_s8(x0_l), vreinterpretq_s64_s8(x1_l))); @@ -3835,7 +4396,7 @@ void lm_ggml_vec_dot_q4_1_q8_1(int n, float * restrict s, size_t bs, const void float32x4_t sumv1 = vextq_f32(sumv0, sumv0, 2); float32x4_t sumv2 = vzip1q_f32(sumv0, sumv1); - sumv2 = sumv2 + summs0; + sumv2 = vaddq_f32(sumv2, summs0); vst1_f32(s, vget_low_f32(sumv2)); vst1_f32(s + bs, vget_high_f32(sumv2)); @@ -3952,6 +4513,79 @@ void lm_ggml_vec_dot_q4_1_q8_1(int n, float * restrict s, size_t bs, const void } *s = sumf; + +#elif defined(__POWER9_VECTOR__) + const vector signed char lowMask = vec_splats((signed char)0xF); + const vector unsigned char v4 = vec_splats((unsigned char)0x4); + + vector float vsumf0 = vec_splats(0.0f); + +#pragma GCC unroll 4 + for (int i = 0; i < nb; i++) { + __builtin_prefetch(x[i].qs, 0, 1); + __builtin_prefetch(y[i].qs, 0, 1); + + vector float vxd = vec_splats(LM_GGML_FP16_TO_FP32(x[i].d)); + vector float vyd = vec_splats(LM_GGML_FP16_TO_FP32(y[i].d)); + vector float vd = vec_mul(vxd, vyd); + + vector float vxmin = vec_splats(LM_GGML_FP16_TO_FP32(x[i].m)); + vector float vys = {LM_GGML_FP16_TO_FP32(y[i].s), 0.0f, 0.0f, 0.0f}; + vsumf0 = 
vec_madd(vxmin, vys, vsumf0); + + vector signed char qxs = (vector signed char)vec_xl( 0, x[i].qs); + vector signed char q8y0 = vec_xl( 0, y[i].qs); + vector signed char q8y1 = vec_xl(16, y[i].qs); + + vector signed char q4x0 = vec_and(qxs, lowMask); + vector signed char q4x1 = vec_sr(qxs, v4); + + vector signed short qv0 = vec_add(vec_mule(q4x0, q8y0), vec_mulo(q4x0, q8y0)); + vector signed short qv1 = vec_add(vec_mule(q4x1, q8y1), vec_mulo(q4x1, q8y1)); + + qv0 = vec_add(qv0, qv1); + + vector signed int vsumi0 = vec_add(vec_unpackh(qv0), vec_unpackl(qv0)); + + vsumf0 = vec_madd(vec_ctf(vsumi0, 0), vd, vsumf0); + } + + vsumf0 = vec_add(vsumf0, vec_sld(vsumf0, vsumf0, 4)); + vsumf0 = vec_add(vsumf0, vec_sld(vsumf0, vsumf0, 8)); + + *s = vec_extract(vsumf0, 0); + +#elif defined(__loongarch_asx) + // Initialize accumulator with zeros + __m256 acc = (__m256)__lasx_xvldi(0); + + float summs = 0; + + // Main loop + for (int i = 0; i < nb; ++i) { + const float d0 = LM_GGML_FP16_TO_FP32(x[i].d); + const float d1 = LM_GGML_FP16_TO_FP32(y[i].d); + + summs += LM_GGML_FP16_TO_FP32(x[i].m) * LM_GGML_FP16_TO_FP32(y[i].s); + + const __m256 d0v = __lasx_xvreplfr2vr_s( d0 ); + const __m256 d1v = __lasx_xvreplfr2vr_s( d1 ); + + // Compute combined scales + const __m256 d0d1 = __lasx_xvfmul_s( d0v, d1v ); + + // Load 16 bytes, and unpack 4 bit fields into bytes, making 32 bytes + const __m256i qx = bytes_from_nibbles_32(x[i].qs); + const __m256i qy = __lasx_xvld( (const __m256i *)y[i].qs, 0); + + const __m256 xy = mul_sum_us8_pairs_float(qx, qy); + + // Accumulate d0*d1*x*y + acc = __lasx_xvfmadd_s( d0d1, xy, acc ); + } + + *s = hsum_float_8(acc) + summs; + #else // scalar float sumf = 0.0; @@ -4237,12 +4871,81 @@ void lm_ggml_vec_dot_q5_0_q8_0(int n, float * restrict s, size_t bs, const void } *s = sumf; -#else - // scalar - float sumf = 0.0; - for (int i = 0; i < nb; i++) { - uint32_t qh; +#elif defined(__POWER9_VECTOR__) + const vector signed char lowMask = vec_splats((signed 
char)0xF); + const vector unsigned char v4 = vec_splats((unsigned char)4); + + vector float vsumf0 = vec_splats(0.0f); + +#pragma GCC unroll 4 + for (int i = 0; i < nb; ++i) { + __builtin_prefetch(x[i].qs, 0, 1); + __builtin_prefetch(y[i].qs, 0, 1); + + vector float vxd = vec_splats(LM_GGML_FP16_TO_FP32(x[i].d)); + vector float vyd = vec_splats(LM_GGML_FP16_TO_FP32(y[i].d)); + vector float vd = vec_mul(vxd, vyd); + + vector signed long long aux64x2_0 = {(uint64_t)(table_b2b_1[x[i].qh[0]]), (uint64_t)(table_b2b_1[x[i].qh[1]])}; + vector signed long long aux64x2_1 = {(uint64_t)(table_b2b_1[x[i].qh[2]]), (uint64_t)(table_b2b_1[x[i].qh[3]])}; + + vector signed char qh0 = (vector signed char)aux64x2_0; + vector signed char qh1 = (vector signed char)aux64x2_1; + + vector signed char qxs = (vector signed char)vec_xl( 0, x[i].qs); + + vector signed char q5x0 = vec_sub(vec_and (qxs, lowMask), qh0); + vector signed char q5x1 = vec_sub(vec_sr(qxs, v4), qh1); + + vector signed char q8y0 = vec_xl( 0, y[i].qs); + vector signed char q8y1 = vec_xl( 16, y[i].qs); + + vector signed short qv0 = vec_add(vec_mule(q5x0, q8y0), vec_mulo(q5x0, q8y0)); + vector signed short qv1 = vec_add(vec_mule(q5x1, q8y1), vec_mulo(q5x1, q8y1)); + + qv0 = vec_add(qv0, qv1); + + vector signed int vsumi0 = vec_add(vec_unpackh(qv0), vec_unpackl(qv0)); + + vsumf0 = vec_madd(vec_ctf(vsumi0, 0), vd, vsumf0); + } + + vsumf0 = vec_add(vsumf0, vec_sld(vsumf0, vsumf0, 4)); + vsumf0 = vec_add(vsumf0, vec_sld(vsumf0, vsumf0, 8)); + + *s = vec_extract(vsumf0, 0); + +#elif defined(__loongarch_asx) + // Initialize accumulator with zeros + __m256 acc = (__m256)__lasx_xvldi(0); + + // Main loop + for (int i = 0; i < nb; i++) { + /* Compute combined scale for the block */ + const __m256 d = __lasx_xvreplfr2vr_s(LM_GGML_FP16_TO_FP32(x[i].d) * LM_GGML_FP16_TO_FP32(y[i].d)); //FIXME + + __m256i qx = bytes_from_nibbles_32(x[i].qs); + __m256i bxhi = bytes_from_bits_32(x[i].qh); + bxhi = __lasx_xvandn_v(bxhi, 
__lasx_xvreplgr2vr_b((char)0xF0)); + qx = __lasx_xvor_v(qx, bxhi); + + __m256i qy = __lasx_xvld((const __m256i *)y[i].qs, 0); + + const __m256 q = mul_sum_i8_pairs_float(qx, qy); + + /* Multiply q with scale and accumulate */ + acc = __lasx_xvfmadd_s(d, q, acc); + } + + *s = hsum_float_8(acc); + +#else + // scalar + float sumf = 0.0; + + for (int i = 0; i < nb; i++) { + uint32_t qh; memcpy(&qh, x[i].qh, sizeof(qh)); int sumi = 0; @@ -4541,6 +5244,82 @@ void lm_ggml_vec_dot_q5_1_q8_1(int n, float * restrict s, size_t bs, const void } *s = sumf; + +#elif defined(__POWER9_VECTOR__) + const vector signed char lowMask = vec_splats((signed char)0xF); + const vector unsigned char v4 = vec_splats((unsigned char)0x4); + + vector float vsumf0 = vec_splats(0.0f); + +#pragma GCC unroll 4 + for (int i = 0; i < nb; ++i) { + __builtin_prefetch(x[i].qs, 0, 1); + __builtin_prefetch(y[i].qs, 0, 1); + + vector float vxd = vec_splats(LM_GGML_FP16_TO_FP32(x[i].d)); + vector float vyd = vec_splats(LM_GGML_FP16_TO_FP32(y[i].d)); + vector float vd = vec_mul(vxd, vyd); + + vector float vxmin = vec_splats(LM_GGML_FP16_TO_FP32(x[i].m)); + vector float vys = {LM_GGML_FP16_TO_FP32(y[i].s), 0.f, 0.f, 0.f}; + vsumf0 = vec_madd(vxmin, vys, vsumf0); + + vector unsigned long long aux64x2_0 = {(uint64_t)(table_b2b_0[x[i].qh[0]]), (uint64_t)(table_b2b_0[x[i].qh[1]])}; + vector unsigned long long aux64x2_1 = {(uint64_t)(table_b2b_0[x[i].qh[2]]), (uint64_t)(table_b2b_0[x[i].qh[3]])}; + + vector signed char qh0 = (vector signed char)aux64x2_0; + vector signed char qh1 = (vector signed char)aux64x2_1; + + vector signed char qxs = (vector signed char)vec_xl( 0, x[i].qs); + + vector signed char q5x0 = vec_or(vec_and(qxs, lowMask), qh0); + vector signed char q5x1 = vec_or(vec_sr(qxs, v4), qh1); + + vector signed char q8y0 = vec_xl( 0, y[i].qs); + vector signed char q8y1 = vec_xl( 16, y[i].qs); + + vector signed short qv0 = vec_add(vec_mule(q5x0, q8y0), vec_mulo(q5x0, q8y0)); + vector signed short qv1 = 
vec_add(vec_mule(q5x1, q8y1), vec_mulo(q5x1, q8y1)); + + qv0 = vec_add(qv0, qv1); + + vector signed int vsumi0 = vec_add(vec_unpackh(qv0), vec_unpackl(qv0)); + + vsumf0 = vec_madd(vec_ctf(vsumi0, 0), vd, vsumf0); + } + + vsumf0 = vec_add(vsumf0, vec_sld(vsumf0, vsumf0, 4)); + vsumf0 = vec_add(vsumf0, vec_sld(vsumf0, vsumf0, 8)); + + *s = vec_extract(vsumf0, 0); + +#elif defined(__loongarch_asx) + // Initialize accumulator with zeros + __m256 acc = (__m256)__lasx_xvldi(0); + + float summs = 0.0f; + + // Main loop + for (int i = 0; i < nb; i++) { + const __m256 dx = __lasx_xvreplfr2vr_s(LM_GGML_FP16_TO_FP32(x[i].d)); + + summs += LM_GGML_FP16_TO_FP32(x[i].m) * LM_GGML_FP16_TO_FP32(y[i].s); + + __m256i qx = bytes_from_nibbles_32(x[i].qs); + __m256i bxhi = bytes_from_bits_32(x[i].qh); + bxhi = __lasx_xvand_v(bxhi, __lasx_xvreplgr2vr_b(0x10)); + qx = __lasx_xvor_v(qx, bxhi); + + const __m256 dy = __lasx_xvreplfr2vr_s(LM_GGML_FP16_TO_FP32(y[i].d)); + const __m256i qy = __lasx_xvld((const __m256i *)y[i].qs, 0); + + const __m256 q = mul_sum_us8_pairs_float(qx, qy); + + acc = __lasx_xvfmadd_s(q, __lasx_xvfmul_s(dx, dy), acc); + } + + *s = hsum_float_8(acc) + summs; + #else // scalar float sumf = 0.0; @@ -4589,9 +5368,9 @@ void lm_ggml_vec_dot_q8_0_q8_0(int n, float * restrict s, size_t bs, const void #if defined(__ARM_FEATURE_MATMUL_INT8) if (nrc == 2) { const block_q8_0 * restrict vx0 = vx; - const block_q8_0 * restrict vx1 = vx + bx; + const block_q8_0 * restrict vx1 = (const block_q8_0 *) ((const uint8_t*)vx + bx); const block_q8_0 * restrict vy0 = vy; - const block_q8_0 * restrict vy1 = vy + by; + const block_q8_0 * restrict vy1 = (const block_q8_0 *) ((const uint8_t*)vy + by); float32x4_t sumv0 = vdupq_n_f32(0.0f); @@ -4613,10 +5392,11 @@ void lm_ggml_vec_dot_q8_0_q8_0(int n, float * restrict s, size_t bs, const void const int8x16_t y1_l = vld1q_s8(b_y1->qs); const int8x16_t y1_h = vld1q_s8(b_y1->qs + 16); - float32x4_t scale = 
{LM_GGML_FP16_TO_FP32(b_x0->d)*LM_GGML_FP16_TO_FP32(b_y0->d), - LM_GGML_FP16_TO_FP32(b_x0->d)*LM_GGML_FP16_TO_FP32(b_y1->d), - LM_GGML_FP16_TO_FP32(b_x1->d)*LM_GGML_FP16_TO_FP32(b_y0->d), - LM_GGML_FP16_TO_FP32(b_x1->d)*LM_GGML_FP16_TO_FP32(b_y1->d)}; + float32_t _scale[4] = {LM_GGML_FP16_TO_FP32(b_x0->d)*LM_GGML_FP16_TO_FP32(b_y0->d), + LM_GGML_FP16_TO_FP32(b_x0->d)*LM_GGML_FP16_TO_FP32(b_y1->d), + LM_GGML_FP16_TO_FP32(b_x1->d)*LM_GGML_FP16_TO_FP32(b_y0->d), + LM_GGML_FP16_TO_FP32(b_x1->d)*LM_GGML_FP16_TO_FP32(b_y1->d)}; + float32x4_t scale = vld1q_f32(_scale); int8x16_t l0 = vreinterpretq_s8_s64(vzip1q_s64(vreinterpretq_s64_s8(x0_l), vreinterpretq_s64_s8(x1_l))); int8x16_t l1 = vreinterpretq_s8_s64(vzip2q_s64(vreinterpretq_s64_s8(x0_l), vreinterpretq_s64_s8(x1_l))); @@ -4641,7 +5421,32 @@ void lm_ggml_vec_dot_q8_0_q8_0(int n, float * restrict s, size_t bs, const void return; } #endif -#if defined(__ARM_NEON) +#if defined(__ARM_FEATURE_SVE) + svfloat32_t sumv0 = svdup_n_f32(0.0f); + svfloat32_t sumv1 = svdup_n_f32(0.0f); + + assert(nb % 2 == 0); // TODO: handle odd nb + + for (int i = 0; i < nb; i += 2) { + const block_q8_0 * restrict x0 = &x[i + 0]; + const block_q8_0 * restrict x1 = &x[i + 1]; + const block_q8_0 * restrict y0 = &y[i + 0]; + const block_q8_0 * restrict y1 = &y[i + 1]; + + // load x + const svint8_t qx0 = svld1_s8(svptrue_b8(), x0->qs); + const svint8_t qx1 = svld1_s8(svptrue_b8(), x1->qs); + + // load y + const svint8_t qy0 = svld1_s8(svptrue_b8(), y0->qs); + const svint8_t qy1 = svld1_s8(svptrue_b8(), y1->qs); + + sumv0 = svmla_n_f32_x(svptrue_b32(), sumv0, svcvt_f32_s32_x(svptrue_b32(), svdot_s32(svdup_n_s32(0), qx0, qy0)), LM_GGML_FP16_TO_FP32(x0->d)*LM_GGML_FP16_TO_FP32(y0->d)); + sumv1 = svmla_n_f32_x(svptrue_b32(), sumv1, svcvt_f32_s32_x(svptrue_b32(), svdot_s32(svdup_n_s32(0), qx1, qy1)), LM_GGML_FP16_TO_FP32(x1->d)*LM_GGML_FP16_TO_FP32(y1->d)); + } + + *s = svaddv_f32(svptrue_b32(), svadd_f32_x(svptrue_b32(), sumv0, sumv1)); +#elif 
defined(__ARM_NEON) float32x4_t sumv0 = vdupq_n_f32(0.0f); float32x4_t sumv1 = vdupq_n_f32(0.0f); @@ -4716,6 +5521,66 @@ void lm_ggml_vec_dot_q8_0_q8_0(int n, float * restrict s, size_t bs, const void } *s = sumf; + +#elif defined(__POWER9_VECTOR__) + vector float vsumf0 = vec_splats(0.0f); + +#pragma GCC unroll 4 + for (int i = 0; i < nb; i++) { + __builtin_prefetch(x[i].qs, 0, 1); + __builtin_prefetch(y[i].qs, 0, 1); + + vector float vxd = vec_splats(LM_GGML_FP16_TO_FP32(x[i].d)); + vector float vyd = vec_splats(LM_GGML_FP16_TO_FP32(y[i].d)); + vector float vd = vec_mul(vxd, vyd); + + vector signed char q8x0 = vec_xl( 0, x[i].qs); + vector signed char q8x1 = vec_xl(16, x[i].qs); + vector signed char q8y0 = vec_xl( 0, y[i].qs); + vector signed char q8y1 = vec_xl(16, y[i].qs); + + vector signed short qv0 = vec_mule(q8x0, q8y0); + vector signed short qv1 = vec_mulo(q8x0, q8y0); + vector signed short qv2 = vec_mule(q8x1, q8y1); + vector signed short qv3 = vec_mulo(q8x1, q8y1); + + vector signed int vsumi0 = vec_add(vec_unpackh(qv0), vec_unpackh(qv1)); + vector signed int vsumi1 = vec_add(vec_unpackl(qv0), vec_unpackl(qv1)); + vector signed int vsumi2 = vec_add(vec_unpackh(qv2), vec_unpackh(qv3)); + vector signed int vsumi3 = vec_add(vec_unpackl(qv2), vec_unpackl(qv3)); + + vsumi0 = vec_add(vsumi0, vsumi2); + vsumi1 = vec_add(vsumi1, vsumi3); + + vsumi0 = vec_add(vsumi0, vsumi1); + + vsumf0 = vec_madd(vec_ctf(vsumi0, 0), vd, vsumf0); + } + + vsumf0 = vec_add(vsumf0, vec_sld(vsumf0, vsumf0, 4)); + vsumf0 = vec_add(vsumf0, vec_sld(vsumf0, vsumf0, 8)); + + *s = vec_extract(vsumf0, 0); + +#elif defined(__loongarch_asx) + // Initialize accumulator with zeros + __m256 acc = (__m256)__lasx_xvldi(0); + + // Main loop + for (int i = 0; i < nb; ++i) { + // Compute combined scale for the block + const __m256 d = __lasx_xvreplfr2vr_s(LM_GGML_FP16_TO_FP32(x[i].d) * LM_GGML_FP16_TO_FP32(y[i].d)); + __m256i qx = __lasx_xvld((const __m256i *)x[i].qs, 0); + __m256i qy = 
__lasx_xvld((const __m256i *)y[i].qs, 0); + + const __m256 q = mul_sum_i8_pairs_float(qx, qy); + + // Multiply q with scale and accumulate + acc = __lasx_xvfmadd_s( d, q, acc ); + } + + *s = hsum_float_8(acc); + #else // scalar float sumf = 0.0; @@ -4734,7 +5599,6 @@ void lm_ggml_vec_dot_q8_0_q8_0(int n, float * restrict s, size_t bs, const void #endif } -#if QK_K == 256 void lm_ggml_vec_dot_q2_K_q8_K(int n, float * restrict s, size_t bs, const void * restrict vx, size_t bx, const void * restrict vy, size_t by, int nrc) { assert(nrc == 1); UNUSED(nrc); @@ -5071,6 +5935,210 @@ void lm_ggml_vec_dot_q2_K_q8_K(int n, float * restrict s, size_t bs, const void *s = sumf; +#elif defined(__POWER9_VECTOR__) + const vector signed char lowMask = vec_splats((signed char)0x3); + const vector signed char lowScaleMask = vec_splats((signed char)0xF); + const vector unsigned char v2 = vec_splats((unsigned char)0x2); + const vector unsigned char v6 = vec_splats((unsigned char)0x6); + const vector unsigned char v4 = vec_splats((unsigned char)0x4); + + vector float vsumf0 = vec_splats(0.0f); + vector float vsumf1 = vec_splats(0.0f); + vector float vsumf2 = vec_splats(0.0f); + vector float vsumf3 = vec_splats(0.0f); + + for (int i = 0; i < nb; ++i) { + vector float vxd = vec_splats(LM_GGML_FP16_TO_FP32(x[i].d)); + vector float vyd = vec_splats(y[i].d); + vector float vd = vec_mul(vxd, vyd); + + vector float vxmin = vec_splats(LM_GGML_FP16_TO_FP32(x[i].dmin)); + vector float vdmin = vec_mul(vxmin, vyd); + + vector signed short q8ysums0 = vec_xl( 0, y[i].bsums); + vector signed short q8ysums1 = vec_xl(16, y[i].bsums); + + vector signed char q2xmins = (vector signed char)vec_xl( 0, x[i].scales); + vector signed char vscales = vec_and(q2xmins, lowScaleMask); + + q2xmins = vec_sr(q2xmins, v4); + vector signed short q2xmins0 = vec_unpackh(q2xmins); + vector signed short q2xmins1 = vec_unpackl(q2xmins); + + vector signed int prod0 = vec_mule(q2xmins0, q8ysums0); + vector signed int prod1 = 
vec_mulo(q2xmins0, q8ysums0); + vector signed int prod2 = vec_mule(q2xmins1, q8ysums1); + vector signed int prod3 = vec_mulo(q2xmins1, q8ysums1); + + vsumf0 = vec_nmsub(vec_ctf(prod0, 0), vdmin, vsumf0); + vsumf1 = vec_nmsub(vec_ctf(prod1, 0), vdmin, vsumf1); + vsumf2 = vec_nmsub(vec_ctf(prod2, 0), vdmin, vsumf2); + vsumf3 = vec_nmsub(vec_ctf(prod3, 0), vdmin, vsumf3); + + vector signed int vsumi0 = vec_splats((int32_t)0); + vector signed int vsumi1 = vec_splats((int32_t)0); + vector signed int vsumi2 = vec_splats((int32_t)0); + vector signed int vsumi3 = vec_splats((int32_t)0); + vector signed int vsumi4 = vec_splats((int32_t)0); + vector signed int vsumi5 = vec_splats((int32_t)0); + vector signed int vsumi6 = vec_splats((int32_t)0); + vector signed int vsumi7 = vec_splats((int32_t)0); + + + for (int j = 0; j < QK_K/128; ++j) { + __builtin_prefetch(q2, 0, 1); + __builtin_prefetch(q8, 0, 1); + + vector signed char qxs0 = (vector signed char)vec_xl( 0, q2); + vector signed char qxs1 = (vector signed char)vec_xl(16, q2); + q2 += 32; + + vector signed char q2x00 = vec_and(qxs0, lowMask); + vector signed char q2x01 = vec_and(vec_sr(qxs0, v2), lowMask); + vector signed char q2x02 = vec_and(vec_sr(qxs0, v4), lowMask); + vector signed char q2x03 = vec_and(vec_sr(qxs0, v6), lowMask); + vector signed char q2x10 = vec_and(qxs1, lowMask); + vector signed char q2x11 = vec_and(vec_sr(qxs1, v2), lowMask); + vector signed char q2x12 = vec_and(vec_sr(qxs1, v4), lowMask); + vector signed char q2x13 = vec_and(vec_sr(qxs1, v6), lowMask); + + vector signed char q8y00 = vec_xl( 0, q8); + vector signed char q8y10 = vec_xl( 16, q8); + vector signed char q8y01 = vec_xl( 32, q8); + vector signed char q8y11 = vec_xl( 48, q8); + vector signed char q8y02 = vec_xl( 64, q8); + vector signed char q8y12 = vec_xl( 80, q8); + vector signed char q8y03 = vec_xl( 96, q8); + vector signed char q8y13 = vec_xl(112, q8); + q8 += 128; + + vector signed short qv0 = vec_add(vec_mule(q2x00, q8y00), 
vec_mulo(q2x00, q8y00)); + vector signed short qv1 = vec_add(vec_mule(q2x01, q8y01), vec_mulo(q2x01, q8y01)); + vector signed short qv2 = vec_add(vec_mule(q2x02, q8y02), vec_mulo(q2x02, q8y02)); + vector signed short qv3 = vec_add(vec_mule(q2x03, q8y03), vec_mulo(q2x03, q8y03)); + vector signed short qv4 = vec_add(vec_mule(q2x10, q8y10), vec_mulo(q2x10, q8y10)); + vector signed short qv5 = vec_add(vec_mule(q2x11, q8y11), vec_mulo(q2x11, q8y11)); + vector signed short qv6 = vec_add(vec_mule(q2x12, q8y12), vec_mulo(q2x12, q8y12)); + vector signed short qv7 = vec_add(vec_mule(q2x13, q8y13), vec_mulo(q2x13, q8y13)); + + vector signed short vscales_h = vec_unpackh(vscales); + vector signed short vs0 = vec_splat(vscales_h, 0); + vector signed short vs1 = vec_splat(vscales_h, 1); + vector signed short vs2 = vec_splat(vscales_h, 2); + vector signed short vs3 = vec_splat(vscales_h, 3); + vector signed short vs4 = vec_splat(vscales_h, 4); + vector signed short vs5 = vec_splat(vscales_h, 5); + vector signed short vs6 = vec_splat(vscales_h, 6); + vector signed short vs7 = vec_splat(vscales_h, 7); + vscales = vec_sld(vscales, vscales, 8); + + qv0 = vec_mul(qv0, vs0); + qv1 = vec_mul(qv1, vs2); + qv2 = vec_mul(qv2, vs4); + qv3 = vec_mul(qv3, vs6); + + qv0 = vec_madd(qv4, vs1, qv0); + qv1 = vec_madd(qv5, vs3, qv1); + qv2 = vec_madd(qv6, vs5, qv2); + qv3 = vec_madd(qv7, vs7, qv3); + + vsumi0 = vec_add(vec_unpackh(qv0), vsumi0); + vsumi1 = vec_add(vec_unpackh(qv1), vsumi1); + vsumi2 = vec_add(vec_unpackh(qv2), vsumi2); + vsumi3 = vec_add(vec_unpackh(qv3), vsumi3); + + vsumi4 = vec_add(vec_unpackl(qv0), vsumi4); + vsumi5 = vec_add(vec_unpackl(qv1), vsumi5); + vsumi6 = vec_add(vec_unpackl(qv2), vsumi6); + vsumi7 = vec_add(vec_unpackl(qv3), vsumi7); + } + + vsumi0 = vec_add(vsumi0, vsumi4); + vsumi1 = vec_add(vsumi1, vsumi5); + vsumi2 = vec_add(vsumi2, vsumi6); + vsumi3 = vec_add(vsumi3, vsumi7); + + vsumf0 = vec_madd(vec_ctf(vsumi0, 0), vd, vsumf0); + vsumf1 = 
vec_madd(vec_ctf(vsumi1, 0), vd, vsumf1); + vsumf2 = vec_madd(vec_ctf(vsumi2, 0), vd, vsumf2); + vsumf3 = vec_madd(vec_ctf(vsumi3, 0), vd, vsumf3); + } + + vsumf0 = vec_add(vsumf0, vsumf2); + vsumf1 = vec_add(vsumf1, vsumf3); + + vsumf0 = vec_add(vsumf0, vsumf1); + + vsumf0 = vec_add(vsumf0, vec_sld(vsumf0, vsumf0, 4)); + vsumf0 = vec_add(vsumf0, vec_sld(vsumf0, vsumf0, 8)); + + *s = vec_extract(vsumf0, 0); + +#elif defined __loongarch_asx + + const __m256i m3 = __lasx_xvreplgr2vr_b(3); + const __m128i m4 = __lsx_vreplgr2vr_b(0xF); + + __m256 acc = (__m256)__lasx_xvldi(0); + + for (int i = 0; i < nb; ++i) { + + const float d = y[i].d * LM_GGML_FP16_TO_FP32(x[i].d); + const float dmin = -y[i].d * LM_GGML_FP16_TO_FP32(x[i].dmin); + + const uint8_t * restrict q2 = x[i].qs; + const int8_t * restrict q8 = y[i].qs; + const __m128i mins_and_scales = __lsx_vld((const __m128i*)x[i].scales, 0); + const __m128i scales8 = __lsx_vand_v(mins_and_scales, m4); + const __m128i mins8 = __lsx_vand_v(__lsx_vsrli_h(mins_and_scales, 4), m4); + const __m256i mins = lasx_ext8_16(mins8); + const __m256i prod = lasx_madd_h(mins, __lasx_xvld((const __m256i*)y[i].bsums, 0)); + + acc = __lasx_xvfmadd_s(__lasx_xvreplfr2vr_s(dmin), __lasx_xvffint_s_w(prod), acc); + + const __m256i all_scales = lasx_ext8_16(scales8); + const __m128i l_scales = lasx_extracti128(all_scales, 0); + const __m128i h_scales = lasx_extracti128(all_scales, 1); + const __m256i scales[2] = {lasx_insertf128(l_scales, l_scales), lasx_insertf128(h_scales, h_scales)}; + + __m256i sumi = __lasx_xvldi(0); + + for (int j = 0; j < QK_K/128; ++j) { + + const __m256i q2bits = __lasx_xvld((const __m256i*)q2, 0); q2 += 32; + + const __m256i q8_0 = __lasx_xvld((const __m256i*)q8, 0); q8 += 32; + const __m256i q8_1 = __lasx_xvld((const __m256i*)q8, 0); q8 += 32; + const __m256i q8_2 = __lasx_xvld((const __m256i*)q8, 0); q8 += 32; + const __m256i q8_3 = __lasx_xvld((const __m256i*)q8, 0); q8 += 32; + + const __m256i q2_0 = 
__lasx_xvand_v(q2bits, m3); + const __m256i q2_1 = __lasx_xvand_v(__lasx_xvsrli_h(q2bits, 2), m3); + const __m256i q2_2 = __lasx_xvand_v(__lasx_xvsrli_h(q2bits, 4), m3); + const __m256i q2_3 = __lasx_xvand_v(__lasx_xvsrli_h(q2bits, 6), m3); + + __m256i p0 = lasx_maddubs_h(q2_0, q8_0); + __m256i p1 = lasx_maddubs_h(q2_1, q8_1); + __m256i p2 = lasx_maddubs_h(q2_2, q8_2); + __m256i p3 = lasx_maddubs_h(q2_3, q8_3); + + p0 = lasx_madd_h(lasx_shuffle_b(scales[j], get_scale_shuffle_q3k(0)), p0); + p1 = lasx_madd_h(lasx_shuffle_b(scales[j], get_scale_shuffle_q3k(1)), p1); + p2 = lasx_madd_h(lasx_shuffle_b(scales[j], get_scale_shuffle_q3k(2)), p2); + p3 = lasx_madd_h(lasx_shuffle_b(scales[j], get_scale_shuffle_q3k(3)), p3); + + p0 = __lasx_xvadd_w(p0, p1); + p2 = __lasx_xvadd_w(p2, p3); + + sumi = __lasx_xvadd_w(sumi, __lasx_xvadd_w(p0, p2)); + } + + acc = __lasx_xvfmadd_s(__lasx_xvreplfr2vr_s(d), __lasx_xvffint_s_w(sumi), acc); + + } + + *s = hsum_float_8(acc); + #else float sumf = 0; @@ -5114,63 +6182,112 @@ void lm_ggml_vec_dot_q2_K_q8_K(int n, float * restrict s, size_t bs, const void #endif } -#else - -void lm_ggml_vec_dot_q2_K_q8_K(int n, float * restrict s, size_t bs, const void * restrict vx, size_t bx, const void * restrict vy, size_t by, int nrc) { +void lm_ggml_vec_dot_q3_K_q8_K(int n, float * restrict s, size_t bs, const void * restrict vx, size_t bx, const void * restrict vy, size_t by, int nrc) { + assert(n % QK_K == 0); assert(nrc == 1); UNUSED(nrc); UNUSED(bx); UNUSED(by); UNUSED(bs); - const block_q2_K * restrict x = vx; + const uint32_t kmask1 = 0x03030303; + const uint32_t kmask2 = 0x0f0f0f0f; + + const block_q3_K * restrict x = vx; const block_q8_K * restrict y = vy; const int nb = n / QK_K; #ifdef __ARM_NEON - const uint8x16_t m3 = vdupq_n_u8(0x3); - const int32x4_t vzero = vdupq_n_s32(0); + uint32_t aux[3]; + uint32_t utmp[4]; - lm_ggml_int8x16x4_t q2bytes; + const uint8x16_t m3b = vdupq_n_u8(0x3); + const int32x4_t vzero = vdupq_n_s32(0); - uint32_t 
aux32[2]; - const uint8_t * scales = (const uint8_t *)aux32; + const uint8x16_t m0 = vdupq_n_u8(1); + const uint8x16_t m1 = vshlq_n_u8(m0, 1); + const uint8x16_t m2 = vshlq_n_u8(m0, 2); + const uint8x16_t m3 = vshlq_n_u8(m0, 3); + const int8_t m32 = 32; + + lm_ggml_int8x16x4_t q3bytes; float sum = 0; for (int i = 0; i < nb; ++i) { - const float d = y[i].d * LM_GGML_FP16_TO_FP32(x[i].d); - const float dmin = -y[i].d * LM_GGML_FP16_TO_FP32(x[i].dmin); + const float d = y[i].d * LM_GGML_FP16_TO_FP32(x[i].d); - const uint8_t * restrict q2 = x[i].qs; + const uint8_t * restrict q3 = x[i].qs; + const uint8_t * restrict qh = x[i].hmask; const int8_t * restrict q8 = y[i].qs; - const uint32_t * restrict sc = (const uint32_t *)x[i].scales; - aux32[0] = sc[0] & 0x0f0f0f0f; - aux32[1] = (sc[0] >> 4) & 0x0f0f0f0f; + lm_ggml_uint8x16x2_t qhbits = lm_ggml_vld1q_u8_x2(qh); + + lm_ggml_uint8x16x4_t q3h; + + int32_t isum = 0; + + // Set up scales + memcpy(aux, x[i].scales, 12); + utmp[3] = ((aux[1] >> 4) & kmask2) | (((aux[2] >> 6) & kmask1) << 4); + utmp[2] = ((aux[0] >> 4) & kmask2) | (((aux[2] >> 4) & kmask1) << 4); + utmp[1] = (aux[1] & kmask2) | (((aux[2] >> 2) & kmask1) << 4); + utmp[0] = (aux[0] & kmask2) | (((aux[2] >> 0) & kmask1) << 4); + + int8_t * scale = (int8_t *)utmp; + for (int j = 0; j < 16; ++j) scale[j] -= m32; + + for (int j = 0; j < QK_K/128; ++j) { + + const lm_ggml_uint8x16x2_t q3bits = lm_ggml_vld1q_u8_x2(q3); q3 += 32; + const lm_ggml_int8x16x4_t q8bytes_1 = lm_ggml_vld1q_s8_x4(q8); q8 += 64; + const lm_ggml_int8x16x4_t q8bytes_2 = lm_ggml_vld1q_s8_x4(q8); q8 += 64; + + q3h.val[0] = vshlq_n_u8(vbicq_u8(m0, qhbits.val[0]), 2); + q3h.val[1] = vshlq_n_u8(vbicq_u8(m0, qhbits.val[1]), 2); + q3h.val[2] = vshlq_n_u8(vbicq_u8(m1, qhbits.val[0]), 1); + q3h.val[3] = vshlq_n_u8(vbicq_u8(m1, qhbits.val[1]), 1); + + q3bytes.val[0] = vsubq_s8(vreinterpretq_s8_u8(vandq_u8(q3bits.val[0], m3b)), vreinterpretq_s8_u8(q3h.val[0])); + q3bytes.val[1] = 
vsubq_s8(vreinterpretq_s8_u8(vandq_u8(q3bits.val[1], m3b)), vreinterpretq_s8_u8(q3h.val[1])); + q3bytes.val[2] = vsubq_s8(vreinterpretq_s8_u8(vandq_u8(vshrq_n_u8(q3bits.val[0], 2), m3b)), vreinterpretq_s8_u8(q3h.val[2])); + q3bytes.val[3] = vsubq_s8(vreinterpretq_s8_u8(vandq_u8(vshrq_n_u8(q3bits.val[1], 2), m3b)), vreinterpretq_s8_u8(q3h.val[3])); + + isum += vaddvq_s32(lm_ggml_vdotq_s32(vzero, q3bytes.val[0], q8bytes_1.val[0])) * scale[0]; + isum += vaddvq_s32(lm_ggml_vdotq_s32(vzero, q3bytes.val[1], q8bytes_1.val[1])) * scale[1]; + isum += vaddvq_s32(lm_ggml_vdotq_s32(vzero, q3bytes.val[2], q8bytes_1.val[2])) * scale[2]; + isum += vaddvq_s32(lm_ggml_vdotq_s32(vzero, q3bytes.val[3], q8bytes_1.val[3])) * scale[3]; + + scale += 4; - sum += dmin * (scales[4] * y[i].bsums[0] + scales[5] * y[i].bsums[1] + scales[6] * y[i].bsums[2] + scales[7] * y[i].bsums[3]); + q3h.val[0] = vbicq_u8(m2, qhbits.val[0]); + q3h.val[1] = vbicq_u8(m2, qhbits.val[1]); + q3h.val[2] = vshrq_n_u8(vbicq_u8(m3, qhbits.val[0]), 1); + q3h.val[3] = vshrq_n_u8(vbicq_u8(m3, qhbits.val[1]), 1); - int isum1 = 0, isum2 = 0; + q3bytes.val[0] = vsubq_s8(vreinterpretq_s8_u8(vandq_u8(vshrq_n_u8(q3bits.val[0], 4), m3b)), vreinterpretq_s8_u8(q3h.val[0])); + q3bytes.val[1] = vsubq_s8(vreinterpretq_s8_u8(vandq_u8(vshrq_n_u8(q3bits.val[1], 4), m3b)), vreinterpretq_s8_u8(q3h.val[1])); + q3bytes.val[2] = vsubq_s8(vreinterpretq_s8_u8(vandq_u8(vshrq_n_u8(q3bits.val[0], 6), m3b)), vreinterpretq_s8_u8(q3h.val[2])); + q3bytes.val[3] = vsubq_s8(vreinterpretq_s8_u8(vandq_u8(vshrq_n_u8(q3bits.val[1], 6), m3b)), vreinterpretq_s8_u8(q3h.val[3])); - const uint8x16_t q2bits = vld1q_u8(q2); + isum += vaddvq_s32(lm_ggml_vdotq_s32(vzero, q3bytes.val[0], q8bytes_2.val[0])) * scale[0]; + isum += vaddvq_s32(lm_ggml_vdotq_s32(vzero, q3bytes.val[1], q8bytes_2.val[1])) * scale[1]; + isum += vaddvq_s32(lm_ggml_vdotq_s32(vzero, q3bytes.val[2], q8bytes_2.val[2])) * scale[2]; + isum += vaddvq_s32(lm_ggml_vdotq_s32(vzero, q3bytes.val[3], 
q8bytes_2.val[3])) * scale[3]; - const lm_ggml_int8x16x4_t q8bytes = lm_ggml_vld1q_s8_x4(q8); + scale += 4; - q2bytes.val[0] = vreinterpretq_s8_u8(vandq_u8(q2bits, m3)); - q2bytes.val[1] = vreinterpretq_s8_u8(vandq_u8(vshrq_n_u8(q2bits, 2), m3)); - q2bytes.val[2] = vreinterpretq_s8_u8(vandq_u8(vshrq_n_u8(q2bits, 4), m3)); - q2bytes.val[3] = vreinterpretq_s8_u8(vandq_u8(vshrq_n_u8(q2bits, 6), m3)); + if (j == 0) { + qhbits.val[0] = vshrq_n_u8(qhbits.val[0], 4); + qhbits.val[1] = vshrq_n_u8(qhbits.val[1], 4); + } - isum1 += vaddvq_s32(lm_ggml_vdotq_s32(vzero, q2bytes.val[0], q8bytes.val[0])) * scales[0]; - isum2 += vaddvq_s32(lm_ggml_vdotq_s32(vzero, q2bytes.val[1], q8bytes.val[1])) * scales[1]; - isum1 += vaddvq_s32(lm_ggml_vdotq_s32(vzero, q2bytes.val[2], q8bytes.val[2])) * scales[2]; - isum2 += vaddvq_s32(lm_ggml_vdotq_s32(vzero, q2bytes.val[3], q8bytes.val[3])) * scales[3]; + } + sum += d * isum; - sum += d * (isum1 + isum2); } *s = sum; @@ -5178,326 +6295,12 @@ void lm_ggml_vec_dot_q2_K_q8_K(int n, float * restrict s, size_t bs, const void #elif defined __AVX2__ const __m256i m3 = _mm256_set1_epi8(3); + const __m256i mone = _mm256_set1_epi8(1); + const __m128i m32 = _mm_set1_epi8(32); __m256 acc = _mm256_setzero_ps(); - uint32_t ud, um; - const uint8_t * restrict db = (const uint8_t *)&ud; - const uint8_t * restrict mb = (const uint8_t *)&um; - - float summs = 0; - - // TODO: optimize this - - for (int i = 0; i < nb; ++i) { - - const float d = y[i].d * LM_GGML_FP16_TO_FP32(x[i].d); - const float dmin = -y[i].d * LM_GGML_FP16_TO_FP32(x[i].dmin); - - const uint8_t * restrict q2 = x[i].qs; - const int8_t * restrict q8 = y[i].qs; - - const uint32_t * restrict sc = (const uint32_t *)x[i].scales; - ud = (sc[0] >> 0) & 0x0f0f0f0f; - um = (sc[0] >> 4) & 0x0f0f0f0f; - - int32_t smin = mb[0] * y[i].bsums[0] + mb[1] * y[i].bsums[1] + mb[2] * y[i].bsums[2] + mb[3] * y[i].bsums[3]; - summs += dmin * smin; - - const __m128i q2bits = _mm_loadu_si128((const __m128i*)q2); - 
const __m256i q2_0 = _mm256_and_si256(MM256_SET_M128I(_mm_srli_epi16(q2bits, 2), q2bits), m3); - const __m256i q2_1 = _mm256_and_si256(MM256_SET_M128I(_mm_srli_epi16(q2bits, 6), _mm_srli_epi16(q2bits, 4)), m3); - - const __m256i q8_0 = _mm256_loadu_si256((const __m256i*)(q8+ 0)); - const __m256i q8_1 = _mm256_loadu_si256((const __m256i*)(q8+32)); - - const __m256i p0 = _mm256_maddubs_epi16(q2_0, q8_0); - const __m256i p1 = _mm256_maddubs_epi16(q2_1, q8_1); - - const __m256i p_0 = _mm256_cvtepi16_epi32(_mm256_extracti128_si256(p0, 0)); - const __m256i p_1 = _mm256_cvtepi16_epi32(_mm256_extracti128_si256(p0, 1)); - const __m256i p_2 = _mm256_cvtepi16_epi32(_mm256_extracti128_si256(p1, 0)); - const __m256i p_3 = _mm256_cvtepi16_epi32(_mm256_extracti128_si256(p1, 1)); - - acc = _mm256_fmadd_ps(_mm256_set1_ps(d * db[0]), _mm256_cvtepi32_ps(p_0), acc); - acc = _mm256_fmadd_ps(_mm256_set1_ps(d * db[1]), _mm256_cvtepi32_ps(p_1), acc); - acc = _mm256_fmadd_ps(_mm256_set1_ps(d * db[2]), _mm256_cvtepi32_ps(p_2), acc); - acc = _mm256_fmadd_ps(_mm256_set1_ps(d * db[3]), _mm256_cvtepi32_ps(p_3), acc); - } - - *s = hsum_float_8(acc) + summs; - -#elif defined __AVX__ - - const __m128i m3 = _mm_set1_epi8(3); - - __m256 acc = _mm256_setzero_ps(); - - uint32_t ud, um; - const uint8_t * restrict db = (const uint8_t *)&ud; - const uint8_t * restrict mb = (const uint8_t *)&um; - - float summs = 0; - - // TODO: optimize this - - for (int i = 0; i < nb; ++i) { - - const float d = y[i].d * LM_GGML_FP16_TO_FP32(x[i].d); - const float dmin = -y[i].d * LM_GGML_FP16_TO_FP32(x[i].dmin); - - const uint8_t * restrict q2 = x[i].qs; - const int8_t * restrict q8 = y[i].qs; - - const uint32_t * restrict sc = (const uint32_t *)x[i].scales; - ud = (sc[0] >> 0) & 0x0f0f0f0f; - um = (sc[0] >> 4) & 0x0f0f0f0f; - - int32_t smin = mb[0] * y[i].bsums[0] + mb[1] * y[i].bsums[1] + mb[2] * y[i].bsums[2] + mb[3] * y[i].bsums[3]; - summs += dmin * smin; - - const __m128i q2bits = _mm_loadu_si128((const 
__m128i*)q2); - const __m128i q2_0 = _mm_and_si128(q2bits, m3); - const __m128i q2_1 = _mm_and_si128(_mm_srli_epi16(q2bits, 2), m3); - const __m128i q2_2 = _mm_and_si128(_mm_srli_epi16(q2bits, 4), m3); - const __m128i q2_3 = _mm_and_si128(_mm_srli_epi16(q2bits, 6), m3); - - const __m256i q8_0 = _mm256_loadu_si256((const __m256i*)(q8+ 0)); - const __m256i q8_1 = _mm256_loadu_si256((const __m256i*)(q8+32)); - - const __m128i p0 = _mm_maddubs_epi16(q2_0, _mm256_extractf128_si256(q8_0, 0)); - const __m128i p1 = _mm_maddubs_epi16(q2_1, _mm256_extractf128_si256(q8_0, 1)); - const __m128i p2 = _mm_maddubs_epi16(q2_2, _mm256_extractf128_si256(q8_1, 0)); - const __m128i p3 = _mm_maddubs_epi16(q2_3, _mm256_extractf128_si256(q8_1, 1)); - - const __m256i p_0 = MM256_SET_M128I(_mm_cvtepi16_epi32(_mm_unpackhi_epi64(p0, p0)), _mm_cvtepi16_epi32(p0)); - const __m256i p_1 = MM256_SET_M128I(_mm_cvtepi16_epi32(_mm_unpackhi_epi64(p1, p1)), _mm_cvtepi16_epi32(p1)); - const __m256i p_2 = MM256_SET_M128I(_mm_cvtepi16_epi32(_mm_unpackhi_epi64(p2, p2)), _mm_cvtepi16_epi32(p2)); - const __m256i p_3 = MM256_SET_M128I(_mm_cvtepi16_epi32(_mm_unpackhi_epi64(p3, p3)), _mm_cvtepi16_epi32(p3)); - - acc = _mm256_add_ps(_mm256_mul_ps(_mm256_set1_ps(d * db[0]), _mm256_cvtepi32_ps(p_0)), acc); - acc = _mm256_add_ps(_mm256_mul_ps(_mm256_set1_ps(d * db[1]), _mm256_cvtepi32_ps(p_1)), acc); - acc = _mm256_add_ps(_mm256_mul_ps(_mm256_set1_ps(d * db[2]), _mm256_cvtepi32_ps(p_2)), acc); - acc = _mm256_add_ps(_mm256_mul_ps(_mm256_set1_ps(d * db[3]), _mm256_cvtepi32_ps(p_3)), acc); - } - - *s = hsum_float_8(acc) + summs; - -#elif defined __riscv_v_intrinsic - - uint32_t aux32[2]; - const uint8_t * scales = (const uint8_t *)aux32; - - float sumf = 0; - - for (int i = 0; i < nb; ++i) { - - const float d = y[i].d * LM_GGML_FP16_TO_FP32(x[i].d); - const float dmin = -y[i].d * LM_GGML_FP16_TO_FP32(x[i].dmin); - - const uint8_t * restrict q2 = x[i].qs; - const int8_t * restrict q8 = y[i].qs; - const uint32_t * 
restrict sc = (const uint32_t *)x[i].scales; - - aux32[0] = sc[0] & 0x0f0f0f0f; - aux32[1] = (sc[0] >> 4) & 0x0f0f0f0f; - - sumf += dmin * (scales[4] * y[i].bsums[0] + scales[5] * y[i].bsums[1] + scales[6] * y[i].bsums[2] + scales[7] * y[i].bsums[3]); - - int isum1 = 0; - int isum2 = 0; - - size_t vl = 16; - - vint16m1_t vzero = __riscv_vmv_v_x_i16m1(0, 1); - - // load Q2 - vuint8mf2_t q2_x = __riscv_vle8_v_u8mf2(q2, vl); - - vint8mf2_t q2_0 = __riscv_vreinterpret_v_u8mf2_i8mf2(__riscv_vand_vx_u8mf2(q2_x, 0x03, vl)); - vint8mf2_t q2_1 = __riscv_vreinterpret_v_u8mf2_i8mf2(__riscv_vand_vx_u8mf2(__riscv_vsrl_vx_u8mf2(q2_x, 0x2, vl), 0x03 , vl)); - vint8mf2_t q2_2 = __riscv_vreinterpret_v_u8mf2_i8mf2(__riscv_vand_vx_u8mf2(__riscv_vsrl_vx_u8mf2(q2_x, 0x4, vl), 0x03 , vl)); - vint8mf2_t q2_3 = __riscv_vreinterpret_v_u8mf2_i8mf2(__riscv_vand_vx_u8mf2(__riscv_vsrl_vx_u8mf2(q2_x, 0x6, vl), 0x03 , vl)); - - // load Q8, and take product with Q2 - vint16m1_t p0 = __riscv_vwmul_vv_i16m1(q2_0, __riscv_vle8_v_i8mf2(q8, vl), vl); - vint16m1_t p1 = __riscv_vwmul_vv_i16m1(q2_1, __riscv_vle8_v_i8mf2(q8+16, vl), vl); - vint16m1_t p2 = __riscv_vwmul_vv_i16m1(q2_2, __riscv_vle8_v_i8mf2(q8+32, vl), vl); - vint16m1_t p3 = __riscv_vwmul_vv_i16m1(q2_3, __riscv_vle8_v_i8mf2(q8+48, vl), vl); - - vint16m1_t vs_0 = __riscv_vredsum_vs_i16m1_i16m1(p0, vzero, vl); - vint16m1_t vs_1 = __riscv_vredsum_vs_i16m1_i16m1(p1, vzero, vl); - vint16m1_t vs_2 = __riscv_vredsum_vs_i16m1_i16m1(p2, vzero, vl); - vint16m1_t vs_3 = __riscv_vredsum_vs_i16m1_i16m1(p3, vzero, vl); - - isum1 += __riscv_vmv_x_s_i16m1_i16(vs_0) * scales[0]; - isum2 += __riscv_vmv_x_s_i16m1_i16(vs_1) * scales[1]; - isum1 += __riscv_vmv_x_s_i16m1_i16(vs_2) * scales[2]; - isum2 += __riscv_vmv_x_s_i16m1_i16(vs_3) * scales[3]; - - sumf += d * (isum1 + isum2); - - } - - *s = sumf; - -#else - - float sumf = 0; - - int isum[QK_K/16]; - - for (int i = 0; i < nb; ++i) { - - const uint8_t * q2 = x[i].qs; - const int8_t * q8 = y[i].qs; - const 
uint8_t * sc = x[i].scales; - - int summs = 0; - for (int j = 0; j < QK_K/16; ++j) { - summs += y[i].bsums[j] * (sc[j] >> 4); - } - - const float dall = y[i].d * LM_GGML_FP16_TO_FP32(x[i].d); - const float dmin = y[i].d * LM_GGML_FP16_TO_FP32(x[i].dmin); - - memset(isum, 0, (QK_K/16)*sizeof(int)); - for (int l = 0; l < 16; ++l) { - isum[0] += q8[l+ 0] * ((q2[l] >> 0) & 3); - isum[1] += q8[l+16] * ((q2[l] >> 2) & 3); - isum[2] += q8[l+32] * ((q2[l] >> 4) & 3); - isum[3] += q8[l+48] * ((q2[l] >> 6) & 3); - } - for (int l = 0; l < QK_K/16; ++l) { - isum[l] *= (sc[l] & 0xF); - } - sumf += dall * (isum[0] + isum[1] + isum[2] + isum[3]) - dmin * summs; - } - *s = sumf; -#endif -} -#endif - -#if QK_K == 256 -void lm_ggml_vec_dot_q3_K_q8_K(int n, float * restrict s, size_t bs, const void * restrict vx, size_t bx, const void * restrict vy, size_t by, int nrc) { - assert(n % QK_K == 0); - assert(nrc == 1); - UNUSED(nrc); - UNUSED(bx); - UNUSED(by); - UNUSED(bs); - - const uint32_t kmask1 = 0x03030303; - const uint32_t kmask2 = 0x0f0f0f0f; - - const block_q3_K * restrict x = vx; - const block_q8_K * restrict y = vy; - - const int nb = n / QK_K; - -#ifdef __ARM_NEON - - uint32_t aux[3]; - uint32_t utmp[4]; - - const uint8x16_t m3b = vdupq_n_u8(0x3); - const int32x4_t vzero = vdupq_n_s32(0); - - const uint8x16_t m0 = vdupq_n_u8(1); - const uint8x16_t m1 = vshlq_n_u8(m0, 1); - const uint8x16_t m2 = vshlq_n_u8(m0, 2); - const uint8x16_t m3 = vshlq_n_u8(m0, 3); - const int8_t m32 = 32; - - lm_ggml_int8x16x4_t q3bytes; - - float sum = 0; - - for (int i = 0; i < nb; ++i) { - - const float d = y[i].d * LM_GGML_FP16_TO_FP32(x[i].d); - - const uint8_t * restrict q3 = x[i].qs; - const uint8_t * restrict qh = x[i].hmask; - const int8_t * restrict q8 = y[i].qs; - - lm_ggml_uint8x16x2_t qhbits = lm_ggml_vld1q_u8_x2(qh); - - lm_ggml_uint8x16x4_t q3h; - - int32_t isum = 0; - - // Set up scales - memcpy(aux, x[i].scales, 12); - utmp[3] = ((aux[1] >> 4) & kmask2) | (((aux[2] >> 6) & kmask1) << 
4); - utmp[2] = ((aux[0] >> 4) & kmask2) | (((aux[2] >> 4) & kmask1) << 4); - utmp[1] = (aux[1] & kmask2) | (((aux[2] >> 2) & kmask1) << 4); - utmp[0] = (aux[0] & kmask2) | (((aux[2] >> 0) & kmask1) << 4); - - int8_t * scale = (int8_t *)utmp; - for (int j = 0; j < 16; ++j) scale[j] -= m32; - - for (int j = 0; j < QK_K/128; ++j) { - - const lm_ggml_uint8x16x2_t q3bits = lm_ggml_vld1q_u8_x2(q3); q3 += 32; - const lm_ggml_int8x16x4_t q8bytes_1 = lm_ggml_vld1q_s8_x4(q8); q8 += 64; - const lm_ggml_int8x16x4_t q8bytes_2 = lm_ggml_vld1q_s8_x4(q8); q8 += 64; - - q3h.val[0] = vshlq_n_u8(vbicq_u8(m0, qhbits.val[0]), 2); - q3h.val[1] = vshlq_n_u8(vbicq_u8(m0, qhbits.val[1]), 2); - q3h.val[2] = vshlq_n_u8(vbicq_u8(m1, qhbits.val[0]), 1); - q3h.val[3] = vshlq_n_u8(vbicq_u8(m1, qhbits.val[1]), 1); - - q3bytes.val[0] = vsubq_s8(vreinterpretq_s8_u8(vandq_u8(q3bits.val[0], m3b)), vreinterpretq_s8_u8(q3h.val[0])); - q3bytes.val[1] = vsubq_s8(vreinterpretq_s8_u8(vandq_u8(q3bits.val[1], m3b)), vreinterpretq_s8_u8(q3h.val[1])); - q3bytes.val[2] = vsubq_s8(vreinterpretq_s8_u8(vandq_u8(vshrq_n_u8(q3bits.val[0], 2), m3b)), vreinterpretq_s8_u8(q3h.val[2])); - q3bytes.val[3] = vsubq_s8(vreinterpretq_s8_u8(vandq_u8(vshrq_n_u8(q3bits.val[1], 2), m3b)), vreinterpretq_s8_u8(q3h.val[3])); - - isum += vaddvq_s32(lm_ggml_vdotq_s32(vzero, q3bytes.val[0], q8bytes_1.val[0])) * scale[0]; - isum += vaddvq_s32(lm_ggml_vdotq_s32(vzero, q3bytes.val[1], q8bytes_1.val[1])) * scale[1]; - isum += vaddvq_s32(lm_ggml_vdotq_s32(vzero, q3bytes.val[2], q8bytes_1.val[2])) * scale[2]; - isum += vaddvq_s32(lm_ggml_vdotq_s32(vzero, q3bytes.val[3], q8bytes_1.val[3])) * scale[3]; - - scale += 4; - - q3h.val[0] = vbicq_u8(m2, qhbits.val[0]); - q3h.val[1] = vbicq_u8(m2, qhbits.val[1]); - q3h.val[2] = vshrq_n_u8(vbicq_u8(m3, qhbits.val[0]), 1); - q3h.val[3] = vshrq_n_u8(vbicq_u8(m3, qhbits.val[1]), 1); - - q3bytes.val[0] = vsubq_s8(vreinterpretq_s8_u8(vandq_u8(vshrq_n_u8(q3bits.val[0], 4), m3b)), 
vreinterpretq_s8_u8(q3h.val[0])); - q3bytes.val[1] = vsubq_s8(vreinterpretq_s8_u8(vandq_u8(vshrq_n_u8(q3bits.val[1], 4), m3b)), vreinterpretq_s8_u8(q3h.val[1])); - q3bytes.val[2] = vsubq_s8(vreinterpretq_s8_u8(vandq_u8(vshrq_n_u8(q3bits.val[0], 6), m3b)), vreinterpretq_s8_u8(q3h.val[2])); - q3bytes.val[3] = vsubq_s8(vreinterpretq_s8_u8(vandq_u8(vshrq_n_u8(q3bits.val[1], 6), m3b)), vreinterpretq_s8_u8(q3h.val[3])); - - isum += vaddvq_s32(lm_ggml_vdotq_s32(vzero, q3bytes.val[0], q8bytes_2.val[0])) * scale[0]; - isum += vaddvq_s32(lm_ggml_vdotq_s32(vzero, q3bytes.val[1], q8bytes_2.val[1])) * scale[1]; - isum += vaddvq_s32(lm_ggml_vdotq_s32(vzero, q3bytes.val[2], q8bytes_2.val[2])) * scale[2]; - isum += vaddvq_s32(lm_ggml_vdotq_s32(vzero, q3bytes.val[3], q8bytes_2.val[3])) * scale[3]; - - scale += 4; - - if (j == 0) { - qhbits.val[0] = vshrq_n_u8(qhbits.val[0], 4); - qhbits.val[1] = vshrq_n_u8(qhbits.val[1], 4); - } - - } - sum += d * isum; - - } - - *s = sum; - -#elif defined __AVX2__ - - const __m256i m3 = _mm256_set1_epi8(3); - const __m256i mone = _mm256_set1_epi8(1); - const __m128i m32 = _mm_set1_epi8(32); - - __m256 acc = _mm256_setzero_ps(); - - uint32_t aux[3]; + uint32_t aux[3]; for (int i = 0; i < nb; ++i) { @@ -5835,410 +6638,318 @@ void lm_ggml_vec_dot_q3_K_q8_K(int n, float * restrict s, size_t bs, const void *s = sumf; -#else - // scalar version - // This function is written like this so the compiler can manage to vectorize most of it - // Using -Ofast, GCC and clang manage to produce code that is within a factor of 2 or so from the - // manually vectorized version above. Every other version I tried would run at least 4 times slower. - // The ideal situation would be if we could just write the code once, and the compiler would - // automatically produce the best possible set of machine instructions, instead of us having to manually - // write vectorized versions for AVX, ARM_NEON, etc. 
- - int8_t aux8[QK_K]; - int16_t aux16[8]; - float sums [8]; - int32_t aux32[8]; - memset(sums, 0, 8*sizeof(float)); +#elif defined(__POWER9_VECTOR__) + const vector signed char lowMask = vec_splats((signed char)0x3); + const vector signed char v1 = vec_splats((signed char)0x1); + const vector unsigned char v2 = vec_splats((unsigned char)0x2); + const vector unsigned char v3 = vec_splats((unsigned char)0x3); + const vector unsigned char v4 = vec_splats((unsigned char)0x4); + const vector unsigned char v6 = vec_splats((unsigned char)0x6); + const vector signed char off = vec_splats((signed char)0x20); - uint32_t auxs[4]; - const int8_t * scales = (const int8_t*)auxs; + vector float vsumf0 = vec_splats(0.0f); + vector float vsumf1 = vec_splats(0.0f); + vector float vsumf2 = vec_splats(0.0f); + vector float vsumf3 = vec_splats(0.0f); - float sumf = 0; for (int i = 0; i < nb; ++i) { + vector float vxd = vec_splats(LM_GGML_FP16_TO_FP32(x[i].d)); + vector float vyd = vec_splats(y[i].d); + vector float vd = vec_mul(vxd, vyd); + + uint32_t aux[3]; + uint32_t utmp[4]; + + memcpy(aux, x[i].scales, 12); + utmp[3] = ((aux[1] >> 4) & kmask2) | (((aux[2] >> 6) & kmask1) << 4); + utmp[2] = ((aux[0] >> 4) & kmask2) | (((aux[2] >> 4) & kmask1) << 4); + utmp[1] = (aux[1] & kmask2) | (((aux[2] >> 2) & kmask1) << 4); + utmp[0] = (aux[0] & kmask2) | (((aux[2] >> 0) & kmask1) << 4); + + vector signed char vscales = (vector signed char)vec_xl( 0, utmp); + vector signed char qxhs0 = (vector signed char)vec_xl( 0, x[i].hmask); + vector signed char qxhs1 = (vector signed char)vec_xl(16, x[i].hmask); + + vscales = vec_sub(vscales, off); + + vector signed int vsumi0 = vec_splats((int32_t)0); + vector signed int vsumi1 = vec_splats((int32_t)0); + vector signed int vsumi2 = vec_splats((int32_t)0); + vector signed int vsumi3 = vec_splats((int32_t)0); + vector signed int vsumi4 = vec_splats((int32_t)0); + vector signed int vsumi5 = vec_splats((int32_t)0); + vector signed int vsumi6 = 
vec_splats((int32_t)0); + vector signed int vsumi7 = vec_splats((int32_t)0); + + const uint8_t * restrict q3 = x[i].qs; - const uint8_t * restrict hm = x[i].hmask; - const int8_t * restrict q8 = y[i].qs; - memset(aux32, 0, 8*sizeof(int32_t)); - int8_t * restrict a = aux8; - uint8_t m = 1; - for (int j = 0; j < QK_K; j += 128) { - for (int l = 0; l < 32; ++l) a[l] = q3[l] & 3; - for (int l = 0; l < 32; ++l) a[l] -= (hm[l] & m ? 0 : 4); - a += 32; m <<= 1; - for (int l = 0; l < 32; ++l) a[l] = (q3[l] >> 2) & 3; - for (int l = 0; l < 32; ++l) a[l] -= (hm[l] & m ? 0 : 4); - a += 32; m <<= 1; - for (int l = 0; l < 32; ++l) a[l] = (q3[l] >> 4) & 3; - for (int l = 0; l < 32; ++l) a[l] -= (hm[l] & m ? 0 : 4); - a += 32; m <<= 1; - for (int l = 0; l < 32; ++l) a[l] = (q3[l] >> 6) & 3; - for (int l = 0; l < 32; ++l) a[l] -= (hm[l] & m ? 0 : 4); - a += 32; m <<= 1; - q3 += 32; - } - a = aux8; - - memcpy(auxs, x[i].scales, 12); - uint32_t tmp = auxs[2]; - auxs[2] = ((auxs[0] >> 4) & kmask2) | (((tmp >> 4) & kmask1) << 4); - auxs[3] = ((auxs[1] >> 4) & kmask2) | (((tmp >> 6) & kmask1) << 4); - auxs[0] = (auxs[0] & kmask2) | (((tmp >> 0) & kmask1) << 4); - auxs[1] = (auxs[1] & kmask2) | (((tmp >> 2) & kmask1) << 4); - for (int j = 0; j < QK_K/16; ++j) { - for (int l = 0; l < 8; ++l) aux16[l] = q8[l] * a[l]; - for (int l = 0; l < 8; ++l) aux32[l] += (scales[j] - 32) * aux16[l]; - q8 += 8; a += 8; - for (int l = 0; l < 8; ++l) aux16[l] = q8[l] * a[l]; - for (int l = 0; l < 8; ++l) aux32[l] += (scales[j] - 32) * aux16[l]; - q8 += 8; a += 8; - } - const float d = LM_GGML_FP16_TO_FP32(x[i].d) * y[i].d; - for (int l = 0; l < 8; ++l) sums[l] += d * aux32[l]; - } - for (int l = 0; l < 8; ++l) sumf += sums[l]; - *s = sumf; - -#endif - -} - -#else - -void lm_ggml_vec_dot_q3_K_q8_K(int n, float * restrict s, size_t bs, const void * restrict vx, size_t bx, const void * restrict vy, size_t by, int nrc) { - assert(n % QK_K == 0); - assert(nrc == 1); - UNUSED(nrc); - UNUSED(bx); - UNUSED(by); 
- UNUSED(bs); - - const block_q3_K * restrict x = vx; - const block_q8_K * restrict y = vy; - - const int nb = n / QK_K; - -#ifdef __ARM_NEON - const int32x4_t vzero = vdupq_n_s32(0); + const int8_t * restrict q8 = y[i].qs; - const uint8x16_t m3b = vdupq_n_u8(0x3); - const uint8x16_t mh = vdupq_n_u8(4); + for (int j = 0; j < QK_K/128; ++j) { + __builtin_prefetch(q3, 0, 1); + __builtin_prefetch(q8, 0, 1); - lm_ggml_int8x16x4_t q3bytes; + vector signed char qxs0 = (vector signed char)vec_xl( 0, q3); + vector signed char qxs1 = (vector signed char)vec_xl(16, q3); + q3 += 32; - uint16_t aux16[2]; - int8_t * scales = (int8_t *)aux16; + //the low 2 bits + vector signed char qxs00 = vec_and(qxs0, lowMask); + vector signed char qxs01 = vec_and(vec_sr(qxs0, v2), lowMask); + vector signed char qxs02 = vec_and(vec_sr(qxs0, v4), lowMask); + vector signed char qxs03 = vec_and(vec_sr(qxs0, v6), lowMask); + vector signed char qxs10 = vec_and(qxs1, lowMask); + vector signed char qxs11 = vec_and(vec_sr(qxs1, v2), lowMask); + vector signed char qxs12 = vec_and(vec_sr(qxs1, v4), lowMask); + vector signed char qxs13 = vec_and(vec_sr(qxs1, v6), lowMask); + + //the 3rd bit + vector signed char qxh00 = vec_sl(vec_andc(v1, qxhs0), v2); + vector signed char qxh01 = vec_sl(vec_andc(v1, vec_sr(qxhs0, (vector unsigned char)v1)), v2); + vector signed char qxh02 = vec_sl(vec_andc(v1, vec_sr(qxhs0, v2)), v2); + vector signed char qxh03 = vec_sl(vec_andc(v1, vec_sr(qxhs0, v3)), v2); + vector signed char qxh10 = vec_sl(vec_andc(v1, qxhs1), v2); + vector signed char qxh11 = vec_sl(vec_andc(v1, vec_sr(qxhs1, (vector unsigned char)v1)), v2); + vector signed char qxh12 = vec_sl(vec_andc(v1, vec_sr(qxhs1, v2)), v2); + vector signed char qxh13 = vec_sl(vec_andc(v1, vec_sr(qxhs1, v3)), v2); + qxhs0 = vec_sr(qxhs0, v4); + qxhs1 = vec_sr(qxhs1, v4); + + vector signed char q3x00 = vec_sub(qxs00, qxh00); + vector signed char q3x01 = vec_sub(qxs01, qxh01); + vector signed char q3x02 = vec_sub(qxs02, qxh02); + 
vector signed char q3x03 = vec_sub(qxs03, qxh03); + vector signed char q3x10 = vec_sub(qxs10, qxh10); + vector signed char q3x11 = vec_sub(qxs11, qxh11); + vector signed char q3x12 = vec_sub(qxs12, qxh12); + vector signed char q3x13 = vec_sub(qxs13, qxh13); + + vector signed char q8y00 = vec_xl( 0, q8); + vector signed char q8y10 = vec_xl( 16, q8); + vector signed char q8y01 = vec_xl( 32, q8); + vector signed char q8y11 = vec_xl( 48, q8); + vector signed char q8y02 = vec_xl( 64, q8); + vector signed char q8y12 = vec_xl( 80, q8); + vector signed char q8y03 = vec_xl( 96, q8); + vector signed char q8y13 = vec_xl(112, q8); + q8 += 128; + + vector signed short vscales_h = vec_unpackh(vscales); + vector signed short vs0 = vec_splat(vscales_h, 0); + vector signed short vs1 = vec_splat(vscales_h, 1); + vector signed short vs2 = vec_splat(vscales_h, 2); + vector signed short vs3 = vec_splat(vscales_h, 3); + vector signed short vs4 = vec_splat(vscales_h, 4); + vector signed short vs5 = vec_splat(vscales_h, 5); + vector signed short vs6 = vec_splat(vscales_h, 6); + vector signed short vs7 = vec_splat(vscales_h, 7); + vscales = vec_sld(vscales, vscales, 8); + + vector signed short qv00 = vec_add(vec_mule(q3x00, q8y00), vec_mulo(q3x00, q8y00)); + vector signed short qv01 = vec_add(vec_mule(q3x01, q8y01), vec_mulo(q3x01, q8y01)); + vector signed short qv02 = vec_add(vec_mule(q3x02, q8y02), vec_mulo(q3x02, q8y02)); + vector signed short qv03 = vec_add(vec_mule(q3x03, q8y03), vec_mulo(q3x03, q8y03)); + vector signed short qv10 = vec_add(vec_mule(q3x10, q8y10), vec_mulo(q3x10, q8y10)); + vector signed short qv11 = vec_add(vec_mule(q3x11, q8y11), vec_mulo(q3x11, q8y11)); + vector signed short qv12 = vec_add(vec_mule(q3x12, q8y12), vec_mulo(q3x12, q8y12)); + vector signed short qv13 = vec_add(vec_mule(q3x13, q8y13), vec_mulo(q3x13, q8y13)); + + vector signed int vsum0 = vec_add(vec_mule(qv00, vs0), vec_mulo(qv00, vs0)); + vector signed int vsum1 = vec_add(vec_mule(qv01, vs2), 
vec_mulo(qv01, vs2)); + vector signed int vsum2 = vec_add(vec_mule(qv02, vs4), vec_mulo(qv02, vs4)); + vector signed int vsum3 = vec_add(vec_mule(qv03, vs6), vec_mulo(qv03, vs6)); + vector signed int vsum4 = vec_add(vec_mule(qv10, vs1), vec_mulo(qv10, vs1)); + vector signed int vsum5 = vec_add(vec_mule(qv11, vs3), vec_mulo(qv11, vs3)); + vector signed int vsum6 = vec_add(vec_mule(qv12, vs5), vec_mulo(qv12, vs5)); + vector signed int vsum7 = vec_add(vec_mule(qv13, vs7), vec_mulo(qv13, vs7)); + + vsumi0 = vec_add(vsum0, vsumi0); + vsumi1 = vec_add(vsum1, vsumi1); + vsumi2 = vec_add(vsum2, vsumi2); + vsumi3 = vec_add(vsum3, vsumi3); + vsumi4 = vec_add(vsum4, vsumi4); + vsumi5 = vec_add(vsum5, vsumi5); + vsumi6 = vec_add(vsum6, vsumi6); + vsumi7 = vec_add(vsum7, vsumi7); + } + + vsumi0 = vec_add(vsumi0, vsumi4); + vsumi1 = vec_add(vsumi1, vsumi5); + vsumi2 = vec_add(vsumi2, vsumi6); + vsumi3 = vec_add(vsumi3, vsumi7); + + vsumf0 = vec_madd(vec_ctf(vsumi0, 0), vd, vsumf0); + vsumf1 = vec_madd(vec_ctf(vsumi1, 0), vd, vsumf1); + vsumf2 = vec_madd(vec_ctf(vsumi2, 0), vd, vsumf2); + vsumf3 = vec_madd(vec_ctf(vsumi3, 0), vd, vsumf3); + } + + vsumf0 = vec_add(vsumf0, vsumf2); + vsumf1 = vec_add(vsumf1, vsumf3); + + vsumf0 = vec_add(vsumf0, vsumf1); + + vsumf0 = vec_add(vsumf0, vec_sld(vsumf0, vsumf0, 4)); + vsumf0 = vec_add(vsumf0, vec_sld(vsumf0, vsumf0, 8)); + + *s = vec_extract(vsumf0, 0); + +#elif defined __loongarch_asx + + const __m256i m3 = __lasx_xvreplgr2vr_b(3); + const __m256i mone = __lasx_xvreplgr2vr_b(1); + const __m128i m32 = __lsx_vreplgr2vr_b(32); + + __m256 acc = (__m256)__lasx_xvldi(0); - float sum = 0; + uint32_t aux[3]; for (int i = 0; i < nb; ++i) { - lm_ggml_uint8x16x4_t q3h; - - const uint8x8_t hbits = vld1_u8(x[i].hmask); - const uint8x16_t q3bits = vld1q_u8(x[i].qs); - const lm_ggml_int8x16x4_t q8bytes = lm_ggml_vld1q_s8_x4(y[i].qs); - - const uint16_t a = *(const uint16_t *)x[i].scales; - aux16[0] = a & 0x0f0f; - aux16[1] = (a >> 4) & 0x0f0f; - - 
for (int j = 0; j < 4; ++j) scales[j] -= 8; - - int32_t isum = -4*(scales[0] * y[i].bsums[0] + scales[2] * y[i].bsums[1] + scales[1] * y[i].bsums[2] + scales[3] * y[i].bsums[3]); - const float d = y[i].d * LM_GGML_FP16_TO_FP32(x[i].d); + // Set up scales + memcpy(aux, x[i].scales, 12); + __m128i scales128 = lsx_set_w( + ((aux[1] >> 4) & kmask2) | (((aux[2] >> 6) & kmask1) << 4), + ((aux[0] >> 4) & kmask2) | (((aux[2] >> 4) & kmask1) << 4), + (aux[1] & kmask2) | (((aux[2] >> 2) & kmask1) << 4), + (aux[0] & kmask2) | (((aux[2] >> 0) & kmask1) << 4)); + scales128 = __lsx_vsub_b(scales128, m32); + const __m256i all_scales = lasx_ext8_16(scales128); + const __m128i l_scales = lasx_extracti128(all_scales, 0); + const __m128i h_scales = lasx_extracti128(all_scales, 1); + const __m256i scales[2] = {lasx_insertf128(l_scales, l_scales), lasx_insertf128(h_scales, h_scales)}; - const uint8x16_t htmp = vcombine_u8(hbits, vshr_n_u8(hbits, 1)); - q3h.val[0] = vandq_u8(mh, vshlq_n_u8(htmp, 2)); - q3h.val[1] = vandq_u8(mh, htmp); - q3h.val[2] = vandq_u8(mh, vshrq_n_u8(htmp, 2)); - q3h.val[3] = vandq_u8(mh, vshrq_n_u8(htmp, 4)); - - q3bytes.val[0] = vreinterpretq_s8_u8(vorrq_u8(vandq_u8(q3bits, m3b), q3h.val[0])); - q3bytes.val[1] = vreinterpretq_s8_u8(vorrq_u8(vandq_u8(vshrq_n_u8(q3bits, 2), m3b), q3h.val[1])); - q3bytes.val[2] = vreinterpretq_s8_u8(vorrq_u8(vandq_u8(vshrq_n_u8(q3bits, 4), m3b), q3h.val[2])); - q3bytes.val[3] = vreinterpretq_s8_u8(vorrq_u8(vshrq_n_u8(q3bits, 6), q3h.val[3])); - - isum += vaddvq_s32(lm_ggml_vdotq_s32(vzero, q3bytes.val[0], q8bytes.val[0])) * scales[0]; - isum += vaddvq_s32(lm_ggml_vdotq_s32(vzero, q3bytes.val[1], q8bytes.val[1])) * scales[2]; - isum += vaddvq_s32(lm_ggml_vdotq_s32(vzero, q3bytes.val[2], q8bytes.val[2])) * scales[1]; - isum += vaddvq_s32(lm_ggml_vdotq_s32(vzero, q3bytes.val[3], q8bytes.val[3])) * scales[3]; - - sum += d * isum; - - } - - *s = sum; - -#elif defined __AVX2__ - - const __m256i m3 = _mm256_set1_epi8(3); - const __m256i 
m1 = _mm256_set1_epi8(1); - - __m256 acc = _mm256_setzero_ps(); - - uint64_t aux64; - - uint16_t aux16[2]; - const int8_t * aux8 = (const int8_t *)aux16; + // high bit + const __m256i hbits = __lasx_xvld((const __m256i*)x[i].hmask, 0); - for (int i = 0; i < nb; ++i) { + // integer accumulator + __m256i sumi = __lasx_xvldi(0); - const float d = y[i].d * LM_GGML_FP16_TO_FP32(x[i].d); + int bit = 0; + int is = 0; const uint8_t * restrict q3 = x[i].qs; const int8_t * restrict q8 = y[i].qs; - const uint16_t a = *(const uint16_t *)x[i].scales; - aux16[0] = a & 0x0f0f; - aux16[1] = (a >> 4) & 0x0f0f; - - const __m256i scale_0 = MM256_SET_M128I(_mm_set1_epi16(aux8[2] - 8), _mm_set1_epi16(aux8[0] - 8)); - const __m256i scale_1 = MM256_SET_M128I(_mm_set1_epi16(aux8[3] - 8), _mm_set1_epi16(aux8[1] - 8)); - - memcpy(&aux64, x[i].hmask, 8); - - const __m128i haux = _mm_set_epi64x(aux64 >> 1, aux64 >> 0); - __m256i q3h_0 = MM256_SET_M128I(_mm_srli_epi16(haux, 2), haux); - __m256i q3h_1 = _mm256_srli_epi16(q3h_0, 4); - q3h_0 = _mm256_slli_epi16(_mm256_andnot_si256(q3h_0, m1), 2); - q3h_1 = _mm256_slli_epi16(_mm256_andnot_si256(q3h_1, m1), 2); - - // load low 2 bits - const __m128i q3bits = _mm_loadu_si128((const __m128i*)q3); - - // prepare low and high bits - const __m256i q3aux = MM256_SET_M128I(_mm_srli_epi16(q3bits, 2), q3bits); - const __m256i q3l_0 = _mm256_and_si256(q3aux, m3); - const __m256i q3l_1 = _mm256_and_si256(_mm256_srli_epi16(q3aux, 4), m3); - - // load Q8 quants - const __m256i q8_0 = _mm256_loadu_si256((const __m256i*)(q8+ 0)); - const __m256i q8_1 = _mm256_loadu_si256((const __m256i*)(q8+32)); - - // Dot product: we multiply the 2 low bits and 1 high bit part separately, so we can use _mm256_maddubs_epi16, - // and then subtract. 
The high bit part has the 2 already subtracted (and so, it is zero if the high bit was not set, - // and 2 if the high bit was set) - const __m256i q8s_0 = _mm256_maddubs_epi16(q3h_0, q8_0); - const __m256i q8s_1 = _mm256_maddubs_epi16(q3h_1, q8_1); - - __m256i p16_0 = _mm256_maddubs_epi16(q3l_0, q8_0); - __m256i p16_1 = _mm256_maddubs_epi16(q3l_1, q8_1); - - p16_0 = _mm256_sub_epi16(p16_0, q8s_0); - p16_1 = _mm256_sub_epi16(p16_1, q8s_1); - - // multiply with scales - p16_0 = _mm256_madd_epi16(scale_0, p16_0); - p16_1 = _mm256_madd_epi16(scale_1, p16_1); - - p16_0 = _mm256_add_epi32(p16_0, p16_1); - - // multiply with block scale and accumulate - acc = _mm256_fmadd_ps(_mm256_broadcast_ss(&d), _mm256_cvtepi32_ps(p16_0), acc); - - } - - *s = hsum_float_8(acc); + for (int j = 0; j < QK_K/128; ++j) { + // load low 2 bits + const __m256i q3bits = __lasx_xvld((const __m256i*)q3, 0); q3 += 32; -#elif defined __AVX__ + // prepare low and high bits + const __m256i q3l_0 = __lasx_xvand_v(q3bits, m3); + const __m256i q3h_0 = __lasx_xvslli_h(__lasx_xvsrli_h(__lasx_xvandn_v(hbits, __lasx_xvslli_h(mone, bit)), bit), 2); + ++bit; - const __m128i m3 = _mm_set1_epi8(3); - const __m128i m1 = _mm_set1_epi8(1); + const __m256i q3l_1 = __lasx_xvand_v(__lasx_xvsrli_h(q3bits, 2), m3); + const __m256i q3h_1 = __lasx_xvslli_h(__lasx_xvsrli_h(__lasx_xvandn_v(hbits, __lasx_xvslli_h(mone, bit)), bit), 2); + ++bit; - __m256 acc = _mm256_setzero_ps(); + const __m256i q3l_2 = __lasx_xvand_v(__lasx_xvsrli_h(q3bits, 4), m3); + const __m256i q3h_2 = __lasx_xvslli_h(__lasx_xvsrli_h(__lasx_xvandn_v(hbits, __lasx_xvslli_h(mone, bit)), bit), 2); + ++bit; - uint64_t aux64; + const __m256i q3l_3 = __lasx_xvand_v(__lasx_xvsrli_h(q3bits, 6), m3); + const __m256i q3h_3 = __lasx_xvslli_h(__lasx_xvsrli_h(__lasx_xvandn_v(hbits, __lasx_xvslli_h(mone, bit)), bit), 2); + ++bit; - uint16_t aux16[2]; - const int8_t * aux8 = (const int8_t *)aux16; + // load Q8 quants + const __m256i q8_0 = __lasx_xvld((const 
__m256i*)q8, 0); q8 += 32; + const __m256i q8_1 = __lasx_xvld((const __m256i*)q8, 0); q8 += 32; + const __m256i q8_2 = __lasx_xvld((const __m256i*)q8, 0); q8 += 32; + const __m256i q8_3 = __lasx_xvld((const __m256i*)q8, 0); q8 += 32; - for (int i = 0; i < nb; ++i) { + // Dot product: we multiply the 2 low bits and 1 high bit part separately, so we can use lasx_maddubs_h, + // and then subtract. The high bit part has the 2 already subtracted (and so, it is zero if the high bit was not set, + // and 2 if the high bit was set) + __m256i q8s_0 = lasx_maddubs_h(q3h_0, q8_0); + __m256i q8s_1 = lasx_maddubs_h(q3h_1, q8_1); + __m256i q8s_2 = lasx_maddubs_h(q3h_2, q8_2); + __m256i q8s_3 = lasx_maddubs_h(q3h_3, q8_3); - const float d = y[i].d * LM_GGML_FP16_TO_FP32(x[i].d); + __m256i p16_0 = lasx_maddubs_h(q3l_0, q8_0); + __m256i p16_1 = lasx_maddubs_h(q3l_1, q8_1); + __m256i p16_2 = lasx_maddubs_h(q3l_2, q8_2); + __m256i p16_3 = lasx_maddubs_h(q3l_3, q8_3); - const uint8_t * restrict q3 = x[i].qs; - const int8_t * restrict q8 = y[i].qs; + p16_0 = __lasx_xvsub_h(p16_0, q8s_0); + p16_1 = __lasx_xvsub_h(p16_1, q8s_1); + p16_2 = __lasx_xvsub_h(p16_2, q8s_2); + p16_3 = __lasx_xvsub_h(p16_3, q8s_3); - const uint16_t a = *(const uint16_t *)x[i].scales; - aux16[0] = a & 0x0f0f; - aux16[1] = (a >> 4) & 0x0f0f; - - const __m128i scale_0 = _mm_set1_epi16(aux8[0] - 8); - const __m128i scale_1 = _mm_set1_epi16(aux8[2] - 8); - const __m128i scale_2 = _mm_set1_epi16(aux8[1] - 8); - const __m128i scale_3 = _mm_set1_epi16(aux8[3] - 8); - - memcpy(&aux64, x[i].hmask, 8); - - __m128i q3h_0 = _mm_set_epi64x(aux64 >> 1, aux64 >> 0); - __m128i q3h_1 = _mm_srli_epi16(q3h_0, 2); - __m128i q3h_2 = _mm_srli_epi16(q3h_0, 4); - __m128i q3h_3 = _mm_srli_epi16(q3h_0, 6); - q3h_0 = _mm_slli_epi16(_mm_andnot_si128(q3h_0, m1), 2); - q3h_1 = _mm_slli_epi16(_mm_andnot_si128(q3h_1, m1), 2); - q3h_2 = _mm_slli_epi16(_mm_andnot_si128(q3h_2, m1), 2); - q3h_3 = _mm_slli_epi16(_mm_andnot_si128(q3h_3, m1), 2); - - 
// load low 2 bits - const __m128i q3bits = _mm_loadu_si128((const __m128i*)q3); - - // prepare low and high bits - const __m128i q3l_0 = _mm_and_si128(q3bits, m3); - const __m128i q3l_1 = _mm_and_si128(_mm_srli_epi16(q3bits, 2), m3); - const __m128i q3l_2 = _mm_and_si128(_mm_srli_epi16(q3bits, 4), m3); - const __m128i q3l_3 = _mm_and_si128(_mm_srli_epi16(q3bits, 6), m3); - - // load Q8 quants - const __m256i q8_0 = _mm256_loadu_si256((const __m256i*)(q8+ 0)); - const __m256i q8_1 = _mm256_loadu_si256((const __m256i*)(q8+32)); - - // Dot product: we multiply the 2 low bits and 1 high bit part separately, so we can use _mm_maddubs_epi16, - // and then subtract. The high bit part has the 2 already subtracted (and so, it is zero if the high bit was not set, - // and 2 if the high bit was set) - const __m128i q8s_0 = _mm_maddubs_epi16(q3h_0, _mm256_extractf128_si256(q8_0, 0)); - const __m128i q8s_1 = _mm_maddubs_epi16(q3h_1, _mm256_extractf128_si256(q8_0, 1)); - const __m128i q8s_2 = _mm_maddubs_epi16(q3h_2, _mm256_extractf128_si256(q8_1, 0)); - const __m128i q8s_3 = _mm_maddubs_epi16(q3h_3, _mm256_extractf128_si256(q8_1, 1)); - - __m128i p16_0 = _mm_maddubs_epi16(q3l_0, _mm256_extractf128_si256(q8_0, 0)); - __m128i p16_1 = _mm_maddubs_epi16(q3l_1, _mm256_extractf128_si256(q8_0, 1)); - __m128i p16_2 = _mm_maddubs_epi16(q3l_2, _mm256_extractf128_si256(q8_1, 0)); - __m128i p16_3 = _mm_maddubs_epi16(q3l_3, _mm256_extractf128_si256(q8_1, 1)); - - p16_0 = _mm_sub_epi16(p16_0, q8s_0); - p16_1 = _mm_sub_epi16(p16_1, q8s_1); - p16_2 = _mm_sub_epi16(p16_2, q8s_2); - p16_3 = _mm_sub_epi16(p16_3, q8s_3); - - // multiply with scales - p16_0 = _mm_madd_epi16(scale_0, p16_0); - p16_1 = _mm_madd_epi16(scale_1, p16_1); - p16_2 = _mm_madd_epi16(scale_2, p16_2); - p16_3 = _mm_madd_epi16(scale_3, p16_3); - - p16_0 = _mm_add_epi32(p16_0, p16_2); - p16_1 = _mm_add_epi32(p16_1, p16_3); - __m256i p16 = MM256_SET_M128I(p16_1, p16_0); + // multiply with scales + p16_0 = 
lasx_madd_h(lasx_shuffle_b(scales[j], get_scale_shuffle_q3k(is + 0)), p16_0); + p16_1 = lasx_madd_h(lasx_shuffle_b(scales[j], get_scale_shuffle_q3k(is + 1)), p16_1); + p16_2 = lasx_madd_h(lasx_shuffle_b(scales[j], get_scale_shuffle_q3k(is + 2)), p16_2); + p16_3 = lasx_madd_h(lasx_shuffle_b(scales[j], get_scale_shuffle_q3k(is + 3)), p16_3); + // accumulate + p16_0 = __lasx_xvadd_w(p16_0, p16_1); + p16_2 = __lasx_xvadd_w(p16_2, p16_3); + sumi = __lasx_xvadd_w(sumi, __lasx_xvadd_w(p16_0, p16_2)); + } // multiply with block scale and accumulate - acc = _mm256_add_ps(_mm256_mul_ps(_mm256_broadcast_ss(&d), _mm256_cvtepi32_ps(p16)), acc); - + acc = __lasx_xvfmadd_s(__lasx_xvreplfr2vr_s(d), __lasx_xvffint_s_w(sumi), acc);//FIXME } *s = hsum_float_8(acc); -#elif defined __riscv_v_intrinsic - - uint16_t aux16[2]; - int8_t * scales = (int8_t *)aux16; - - float sumf = 0; - - for (int i = 0; i < nb; ++i) { - - const uint8_t * restrict q3 = x[i].qs; - const int8_t * restrict q8 = y[i].qs; - - const uint16_t a = *(const uint16_t *)x[i].scales; - aux16[0] = a & 0x0f0f; - aux16[1] = (a >> 4) & 0x0f0f; - - for (int j = 0; j < 4; ++j) scales[j] -= 8; - - int32_t isum = -4*(scales[0] * y[i].bsums[0] + scales[2] * y[i].bsums[1] + scales[1] * y[i].bsums[2] + scales[3] * y[i].bsums[3]); - - const float d = y[i].d * LM_GGML_FP16_TO_FP32(x[i].d); - - vint32m1_t vzero = __riscv_vmv_v_x_i32m1(0, 1); - - // load qh - vuint8mf4_t qh_x1 = __riscv_vle8_v_u8mf4(x[i].hmask, 8); - vuint8mf2_t qh_x2 = __riscv_vlmul_ext_v_u8mf4_u8mf2(__riscv_vsrl_vx_u8mf4(qh_x1, 1, 8)); - - size_t vl = 16; - - // extend and combine both qh_x1 and qh_x2 - vuint8mf2_t qh_x = __riscv_vslideup_vx_u8mf2(__riscv_vlmul_ext_v_u8mf4_u8mf2(qh_x1), qh_x2, vl/2, vl); - - vuint8mf2_t qh_0 = __riscv_vand_vx_u8mf2(__riscv_vsll_vx_u8mf2(qh_x, 0x2, vl), 0x4, vl); - vuint8mf2_t qh_1 = __riscv_vand_vx_u8mf2(qh_x, 0x4, vl); - vuint8mf2_t qh_2 = __riscv_vand_vx_u8mf2(__riscv_vsrl_vx_u8mf2(qh_x, 0x2, vl), 0x4, vl); - vuint8mf2_t qh_3 = 
__riscv_vand_vx_u8mf2(__riscv_vsrl_vx_u8mf2(qh_x, 0x4, vl), 0x4, vl); - - // load Q3 - vuint8mf2_t q3_x = __riscv_vle8_v_u8mf2(q3, vl); - - vuint8mf2_t q3h_0 = __riscv_vor_vv_u8mf2(__riscv_vand_vx_u8mf2(q3_x, 0x3, vl), qh_0, vl); - vuint8mf2_t q3h_1 = __riscv_vor_vv_u8mf2(__riscv_vand_vx_u8mf2(__riscv_vsrl_vx_u8mf2(q3_x, 2, vl), 0x3, vl), qh_1, vl); - vuint8mf2_t q3h_2 = __riscv_vor_vv_u8mf2(__riscv_vand_vx_u8mf2(__riscv_vsrl_vx_u8mf2(q3_x, 4, vl), 0x3, vl), qh_2, vl); - vuint8mf2_t q3h_3 = __riscv_vor_vv_u8mf2(__riscv_vsrl_vx_u8mf2(q3_x, 0x6, vl), qh_3, vl); - - vint8mf2_t q3_0 = __riscv_vreinterpret_v_u8mf2_i8mf2(q3h_0); - vint8mf2_t q3_1 = __riscv_vreinterpret_v_u8mf2_i8mf2(q3h_1); - vint8mf2_t q3_2 = __riscv_vreinterpret_v_u8mf2_i8mf2(q3h_2); - vint8mf2_t q3_3 = __riscv_vreinterpret_v_u8mf2_i8mf2(q3h_3); - - // load Q8 and take product with Q3 - vint16m1_t p0 = __riscv_vwmul_vv_i16m1(q3_0, __riscv_vle8_v_i8mf2(q8, vl), vl); - vint16m1_t p1 = __riscv_vwmul_vv_i16m1(q3_1, __riscv_vle8_v_i8mf2(q8+16, vl), vl); - vint16m1_t p2 = __riscv_vwmul_vv_i16m1(q3_2, __riscv_vle8_v_i8mf2(q8+32, vl), vl); - vint16m1_t p3 = __riscv_vwmul_vv_i16m1(q3_3, __riscv_vle8_v_i8mf2(q8+48, vl), vl); - - vint32m1_t vs_0 = __riscv_vwredsum_vs_i16m1_i32m1(p0, vzero, vl); - vint32m1_t vs_1 = __riscv_vwredsum_vs_i16m1_i32m1(p1, vzero, vl); - vint32m1_t vs_2 = __riscv_vwredsum_vs_i16m1_i32m1(p2, vzero, vl); - vint32m1_t vs_3 = __riscv_vwredsum_vs_i16m1_i32m1(p3, vzero, vl); - - isum += __riscv_vmv_x_s_i32m1_i32(vs_0) * scales[0]; - isum += __riscv_vmv_x_s_i32m1_i32(vs_1) * scales[2]; - isum += __riscv_vmv_x_s_i32m1_i32(vs_2) * scales[1]; - isum += __riscv_vmv_x_s_i32m1_i32(vs_3) * scales[3]; - - sumf += d * isum; - - } - - *s = sumf; - #else + // scalar version + // This function is written like this so the compiler can manage to vectorize most of it + // Using -Ofast, GCC and clang manage to produce code that is within a factor of 2 or so from the + // manually vectorized version above. 
Every other version I tried would run at least 4 times slower. + // The ideal situation would be if we could just write the code once, and the compiler would + // automatically produce the best possible set of machine instructions, instead of us having to manually + // write vectorized versions for AVX, ARM_NEON, etc. int8_t aux8[QK_K]; int16_t aux16[8]; float sums [8]; int32_t aux32[8]; - int32_t scales[4]; memset(sums, 0, 8*sizeof(float)); + uint32_t auxs[4]; + const int8_t * scales = (const int8_t*)auxs; + float sumf = 0; for (int i = 0; i < nb; ++i) { const uint8_t * restrict q3 = x[i].qs; const uint8_t * restrict hm = x[i].hmask; const int8_t * restrict q8 = y[i].qs; + memset(aux32, 0, 8*sizeof(int32_t)); int8_t * restrict a = aux8; - for (int l = 0; l < 8; ++l) { - a[l+ 0] = (int8_t)((q3[l+0] >> 0) & 3) - (hm[l] & 0x01 ? 0 : 4); - a[l+ 8] = (int8_t)((q3[l+8] >> 0) & 3) - (hm[l] & 0x02 ? 0 : 4); - a[l+16] = (int8_t)((q3[l+0] >> 2) & 3) - (hm[l] & 0x04 ? 0 : 4); - a[l+24] = (int8_t)((q3[l+8] >> 2) & 3) - (hm[l] & 0x08 ? 0 : 4); - a[l+32] = (int8_t)((q3[l+0] >> 4) & 3) - (hm[l] & 0x10 ? 0 : 4); - a[l+40] = (int8_t)((q3[l+8] >> 4) & 3) - (hm[l] & 0x20 ? 0 : 4); - a[l+48] = (int8_t)((q3[l+0] >> 6) & 3) - (hm[l] & 0x40 ? 0 : 4); - a[l+56] = (int8_t)((q3[l+8] >> 6) & 3) - (hm[l] & 0x80 ? 0 : 4); - } - - scales[0] = (x[i].scales[0] & 0xF) - 8; - scales[1] = (x[i].scales[0] >> 4) - 8; - scales[2] = (x[i].scales[1] & 0xF) - 8; - scales[3] = (x[i].scales[1] >> 4) - 8; + uint8_t m = 1; + for (int j = 0; j < QK_K; j += 128) { + for (int l = 0; l < 32; ++l) a[l] = q3[l] & 3; + for (int l = 0; l < 32; ++l) a[l] -= (hm[l] & m ? 0 : 4); + a += 32; m <<= 1; + for (int l = 0; l < 32; ++l) a[l] = (q3[l] >> 2) & 3; + for (int l = 0; l < 32; ++l) a[l] -= (hm[l] & m ? 0 : 4); + a += 32; m <<= 1; + for (int l = 0; l < 32; ++l) a[l] = (q3[l] >> 4) & 3; + for (int l = 0; l < 32; ++l) a[l] -= (hm[l] & m ? 
0 : 4); + a += 32; m <<= 1; + for (int l = 0; l < 32; ++l) a[l] = (q3[l] >> 6) & 3; + for (int l = 0; l < 32; ++l) a[l] -= (hm[l] & m ? 0 : 4); + a += 32; m <<= 1; + q3 += 32; + } + a = aux8; - memset(aux32, 0, 8*sizeof(int32_t)); + memcpy(auxs, x[i].scales, 12); + uint32_t tmp = auxs[2]; + auxs[2] = ((auxs[0] >> 4) & kmask2) | (((tmp >> 4) & kmask1) << 4); + auxs[3] = ((auxs[1] >> 4) & kmask2) | (((tmp >> 6) & kmask1) << 4); + auxs[0] = (auxs[0] & kmask2) | (((tmp >> 0) & kmask1) << 4); + auxs[1] = (auxs[1] & kmask2) | (((tmp >> 2) & kmask1) << 4); for (int j = 0; j < QK_K/16; ++j) { for (int l = 0; l < 8; ++l) aux16[l] = q8[l] * a[l]; + for (int l = 0; l < 8; ++l) aux32[l] += (scales[j] - 32) * aux16[l]; q8 += 8; a += 8; - for (int l = 0; l < 8; ++l) aux16[l] += q8[l] * a[l]; + for (int l = 0; l < 8; ++l) aux16[l] = q8[l] * a[l]; + for (int l = 0; l < 8; ++l) aux32[l] += (scales[j] - 32) * aux16[l]; q8 += 8; a += 8; - for (int l = 0; l < 8; ++l) aux32[l] += scales[j] * aux16[l]; } const float d = LM_GGML_FP16_TO_FP32(x[i].d) * y[i].d; for (int l = 0; l < 8; ++l) sums[l] += d * aux32[l]; @@ -6249,9 +6960,7 @@ void lm_ggml_vec_dot_q3_K_q8_K(int n, float * restrict s, size_t bs, const void #endif } -#endif -#if QK_K == 256 void lm_ggml_vec_dot_q4_K_q8_K(int n, float * restrict s, size_t bs, const void * restrict vx, size_t bx, const void * restrict vy, size_t by, int nrc) { assert(n % QK_K == 0); assert(nrc == 1); @@ -6481,386 +7190,335 @@ void lm_ggml_vec_dot_q4_K_q8_K(int n, float * restrict s, size_t bs, const void *s = hsum_float_8(acc) + _mm_cvtss_f32(acc_m); -#elif defined __riscv_v_intrinsic - - const uint8_t * scales = (const uint8_t*)&utmp[0]; - const uint8_t * mins = (const uint8_t*)&utmp[2]; - - float sumf = 0; - - for (int i = 0; i < nb; ++i) { - - size_t vl = 8; - - const float d = y[i].d * LM_GGML_FP16_TO_FP32(x[i].d); - const float dmin = y[i].d * LM_GGML_FP16_TO_FP32(x[i].dmin); - - vint16mf2_t q8sums_0 = __riscv_vlse16_v_i16mf2(y[i].bsums, 4, vl); - 
vint16mf2_t q8sums_1 = __riscv_vlse16_v_i16mf2(y[i].bsums+1, 4, vl); - vint16mf2_t q8sums = __riscv_vadd_vv_i16mf2(q8sums_0, q8sums_1, vl); - - memcpy(utmp, x[i].scales, 12); - utmp[3] = ((utmp[2] >> 4) & kmask2) | (((utmp[1] >> 6) & kmask3) << 4); - const uint32_t uaux = utmp[1] & kmask1; - utmp[1] = (utmp[2] & kmask2) | (((utmp[0] >> 6) & kmask3) << 4); - utmp[2] = uaux; - utmp[0] &= kmask1; - - vuint8mf4_t mins8 = __riscv_vle8_v_u8mf4(mins, vl); - vint16mf2_t v_mins = __riscv_vreinterpret_v_u16mf2_i16mf2(__riscv_vzext_vf2_u16mf2(mins8, vl)); - vint32m1_t prod = __riscv_vwmul_vv_i32m1(q8sums, v_mins, vl); - - vint32m1_t sumi = __riscv_vredsum_vs_i32m1_i32m1(prod, __riscv_vmv_v_x_i32m1(0, 1), vl); - sumf -= dmin * __riscv_vmv_x_s_i32m1_i32(sumi); - - const uint8_t * restrict q4 = x[i].qs; - const int8_t * restrict q8 = y[i].qs; - - vl = 32; - - int32_t sum_1 = 0; - int32_t sum_2 = 0; - - vint16m1_t vzero = __riscv_vmv_v_x_i16m1(0, 1); - - for (int j = 0; j < QK_K/64; ++j) { - // load Q4 - vuint8m1_t q4_x = __riscv_vle8_v_u8m1(q4, vl); - - // load Q8 and multiply it with lower Q4 nibble - vint8m1_t q8_0 = __riscv_vle8_v_i8m1(q8, vl); - vint8m1_t q4_0 = __riscv_vreinterpret_v_u8m1_i8m1(__riscv_vand_vx_u8m1(q4_x, 0x0F, vl)); - vint16m2_t qv_0 = __riscv_vwmul_vv_i16m2(q4_0, q8_0, vl); - vint16m1_t vs_0 = __riscv_vredsum_vs_i16m2_i16m1(qv_0, vzero, vl); - - sum_1 += __riscv_vmv_x_s_i16m1_i16(vs_0) * scales[2*j+0]; - - // load Q8 and multiply it with upper Q4 nibble - vint8m1_t q8_1 = __riscv_vle8_v_i8m1(q8+32, vl); - vint8m1_t q4_1 = __riscv_vreinterpret_v_u8m1_i8m1(__riscv_vsrl_vx_u8m1(q4_x, 0x04, vl)); - vint16m2_t qv_1 = __riscv_vwmul_vv_i16m2(q4_1, q8_1, vl); - vint16m1_t vs_1 = __riscv_vredsum_vs_i16m2_i16m1(qv_1, vzero, vl); - - sum_2 += __riscv_vmv_x_s_i16m1_i16(vs_1) * scales[2*j+1]; - - q4 += 32; q8 += 64; - - } - - sumf += d*(sum_1 + sum_2); - - } - - *s = sumf; - -#else - - - const uint8_t * scales = (const uint8_t*)&utmp[0]; - const uint8_t * mins = (const 
uint8_t*)&utmp[2]; - - int8_t aux8[QK_K]; - int16_t aux16[8]; - float sums [8]; - int32_t aux32[8]; - memset(sums, 0, 8*sizeof(float)); - - float sumf = 0; - for (int i = 0; i < nb; ++i) { - const uint8_t * restrict q4 = x[i].qs; - const int8_t * restrict q8 = y[i].qs; - memset(aux32, 0, 8*sizeof(int32_t)); - int8_t * restrict a = aux8; - for (int j = 0; j < QK_K/64; ++j) { - for (int l = 0; l < 32; ++l) a[l] = (int8_t)(q4[l] & 0xF); - a += 32; - for (int l = 0; l < 32; ++l) a[l] = (int8_t)(q4[l] >> 4); - a += 32; q4 += 32; - } - memcpy(utmp, x[i].scales, 12); - utmp[3] = ((utmp[2] >> 4) & kmask2) | (((utmp[1] >> 6) & kmask3) << 4); - const uint32_t uaux = utmp[1] & kmask1; - utmp[1] = (utmp[2] & kmask2) | (((utmp[0] >> 6) & kmask3) << 4); - utmp[2] = uaux; - utmp[0] &= kmask1; - - int sumi = 0; - for (int j = 0; j < QK_K/16; ++j) sumi += y[i].bsums[j] * mins[j/2]; - a = aux8; - int is = 0; - for (int j = 0; j < QK_K/32; ++j) { - int32_t scale = scales[is++]; - for (int l = 0; l < 8; ++l) aux16[l] = q8[l] * a[l]; - for (int l = 0; l < 8; ++l) aux32[l] += scale * aux16[l]; - q8 += 8; a += 8; - for (int l = 0; l < 8; ++l) aux16[l] = q8[l] * a[l]; - for (int l = 0; l < 8; ++l) aux32[l] += scale * aux16[l]; - q8 += 8; a += 8; - for (int l = 0; l < 8; ++l) aux16[l] = q8[l] * a[l]; - for (int l = 0; l < 8; ++l) aux32[l] += scale * aux16[l]; - q8 += 8; a += 8; - for (int l = 0; l < 8; ++l) aux16[l] = q8[l] * a[l]; - for (int l = 0; l < 8; ++l) aux32[l] += scale * aux16[l]; - q8 += 8; a += 8; - } - const float d = LM_GGML_FP16_TO_FP32(x[i].d) * y[i].d; - for (int l = 0; l < 8; ++l) sums[l] += d * aux32[l]; - const float dmin = LM_GGML_FP16_TO_FP32(x[i].dmin) * y[i].d; - sumf -= dmin * sumi; - } - for (int l = 0; l < 8; ++l) sumf += sums[l]; - *s = sumf; -#endif -} -#else -void lm_ggml_vec_dot_q4_K_q8_K(int n, float * restrict s, size_t bs, const void * restrict vx, size_t bx, const void * restrict vy, size_t by, int nrc) { - assert(n % QK_K == 0); - assert(nrc == 1); - 
UNUSED(nrc); - UNUSED(bx); - UNUSED(by); - UNUSED(bs); - - const block_q4_K * restrict x = vx; - const block_q8_K * restrict y = vy; - - const int nb = n / QK_K; - -#ifdef __ARM_NEON - const uint8x16_t m4b = vdupq_n_u8(0xf); - - const int32x4_t mzero = vdupq_n_s32(0); - - float sumf = 0; - - lm_ggml_int8x16x2_t q4bytes; - lm_ggml_int8x16x4_t q8bytes; - - float sum_mins = 0.f; - - uint16_t aux16[2]; - const uint8_t * restrict scales = (const uint8_t *)aux16; - - for (int i = 0; i < nb; ++i) { - - const uint8_t * restrict q4 = x[i].qs; - const int8_t * restrict q8 = y[i].qs; - - const uint16_t * restrict a = (const uint16_t *)x[i].scales; - aux16[0] = a[0] & 0x0f0f; - aux16[1] = (a[0] >> 4) & 0x0f0f; - - const int32_t summi = scales[2] * (y[i].bsums[0] + y[i].bsums[1]) + scales[3] * (y[i].bsums[2] + y[i].bsums[3]); - sum_mins += y[i].d * LM_GGML_FP16_TO_FP32(x[i].d[1]) * summi; - - const float d = y[i].d * LM_GGML_FP16_TO_FP32(x[i].d[0]); - - const lm_ggml_uint8x16x2_t q4bits = lm_ggml_vld1q_u8_x2(q4); - - q8bytes = lm_ggml_vld1q_s8_x4(q8); - q4bytes.val[0] = vreinterpretq_s8_u8(vandq_u8 (q4bits.val[0], m4b)); - q4bytes.val[1] = vreinterpretq_s8_u8(vandq_u8 (q4bits.val[1], m4b)); - - const int32x4_t p1 = lm_ggml_vdotq_s32(lm_ggml_vdotq_s32(mzero, q4bytes.val[0], q8bytes.val[0]), q4bytes.val[1], q8bytes.val[1]); - const int32_t sumi1 = vaddvq_s32(p1) * scales[0]; - - q4bytes.val[0] = vreinterpretq_s8_u8(vshrq_n_u8(q4bits.val[0], 4)); - q4bytes.val[1] = vreinterpretq_s8_u8(vshrq_n_u8(q4bits.val[1], 4)); - - const int32x4_t p2 = lm_ggml_vdotq_s32(lm_ggml_vdotq_s32(mzero, q4bytes.val[0], q8bytes.val[2]), q4bytes.val[1], q8bytes.val[3]); - const int32_t sumi2 = vaddvq_s32(p2) * scales[1]; - - sumf += d * (sumi1 + sumi2); - } - - *s = sumf - sum_mins; +#elif defined __riscv_v_intrinsic -#elif defined __AVX2__ + const uint8_t * scales = (const uint8_t*)&utmp[0]; + const uint8_t * mins = (const uint8_t*)&utmp[2]; - const __m256i m4 = _mm256_set1_epi8(0xF); + float sumf = 0; 
- __m256 acc = _mm256_setzero_ps(); + for (int i = 0; i < nb; ++i) { - float summs = 0; + size_t vl = 8; - uint16_t aux16[2]; - const uint8_t * scales = (const uint8_t *)aux16; + const float d = y[i].d * LM_GGML_FP16_TO_FP32(x[i].d); + const float dmin = y[i].d * LM_GGML_FP16_TO_FP32(x[i].dmin); - for (int i = 0; i < nb; ++i) { + vint16mf2_t q8sums_0 = __riscv_vlse16_v_i16mf2(y[i].bsums, 4, vl); + vint16mf2_t q8sums_1 = __riscv_vlse16_v_i16mf2(y[i].bsums+1, 4, vl); + vint16mf2_t q8sums = __riscv_vadd_vv_i16mf2(q8sums_0, q8sums_1, vl); - const float d = LM_GGML_FP16_TO_FP32(x[i].d[0]) * y[i].d; - const float m = LM_GGML_FP16_TO_FP32(x[i].d[1]) * y[i].d; - const __m256 vd = _mm256_set1_ps(d); + memcpy(utmp, x[i].scales, 12); + utmp[3] = ((utmp[2] >> 4) & kmask2) | (((utmp[1] >> 6) & kmask3) << 4); + const uint32_t uaux = utmp[1] & kmask1; + utmp[1] = (utmp[2] & kmask2) | (((utmp[0] >> 6) & kmask3) << 4); + utmp[2] = uaux; + utmp[0] &= kmask1; - const uint16_t * a = (const uint16_t *)x[i].scales; - aux16[0] = a[0] & 0x0f0f; - aux16[1] = (a[0] >> 4) & 0x0f0f; + vuint8mf4_t mins8 = __riscv_vle8_v_u8mf4(mins, vl); + vint16mf2_t v_mins = __riscv_vreinterpret_v_u16mf2_i16mf2(__riscv_vzext_vf2_u16mf2(mins8, vl)); + vint32m1_t prod = __riscv_vwmul_vv_i32m1(q8sums, v_mins, vl); - summs += m * (scales[2] * (y[i].bsums[0] + y[i].bsums[1]) + scales[3] * (y[i].bsums[2] + y[i].bsums[3])); + vint32m1_t sumi = __riscv_vredsum_vs_i32m1_i32m1(prod, __riscv_vmv_v_x_i32m1(0, 1), vl); + sumf -= dmin * __riscv_vmv_x_s_i32m1_i32(sumi); const uint8_t * restrict q4 = x[i].qs; const int8_t * restrict q8 = y[i].qs; - const __m256i q4bits = _mm256_loadu_si256((const __m256i*)q4); - const __m256i q4l = _mm256_and_si256(q4bits, m4); - const __m256i q4h = _mm256_and_si256(_mm256_srli_epi16(q4bits, 4), m4); + vl = 32; - const __m256i q8l = _mm256_loadu_si256((const __m256i*)(q8+ 0)); - const __m256i q8h = _mm256_loadu_si256((const __m256i*)(q8+32)); + int32_t sum_1 = 0; + int32_t sum_2 = 0; - const 
__m256i p16l = _mm256_maddubs_epi16(q4l, q8l); - const __m256i p16h = _mm256_maddubs_epi16(q4h, q8h); + vint16m1_t vzero = __riscv_vmv_v_x_i16m1(0, 1); - const __m256i p32l = _mm256_madd_epi16(_mm256_set1_epi16(scales[0]), p16l); - acc = _mm256_fmadd_ps(vd, _mm256_cvtepi32_ps(p32l), acc); + for (int j = 0; j < QK_K/64; ++j) { + // load Q4 + vuint8m1_t q4_x = __riscv_vle8_v_u8m1(q4, vl); - const __m256i p32h = _mm256_madd_epi16(_mm256_set1_epi16(scales[1]), p16h); - acc = _mm256_fmadd_ps(vd, _mm256_cvtepi32_ps(p32h), acc); + // load Q8 and multiply it with lower Q4 nibble + vint8m1_t q8_0 = __riscv_vle8_v_i8m1(q8, vl); + vint8m1_t q4_0 = __riscv_vreinterpret_v_u8m1_i8m1(__riscv_vand_vx_u8m1(q4_x, 0x0F, vl)); + vint16m2_t qv_0 = __riscv_vwmul_vv_i16m2(q4_0, q8_0, vl); + vint16m1_t vs_0 = __riscv_vredsum_vs_i16m2_i16m1(qv_0, vzero, vl); - } + sum_1 += __riscv_vmv_x_s_i16m1_i16(vs_0) * scales[2*j+0]; - *s = hsum_float_8(acc) - summs; + // load Q8 and multiply it with upper Q4 nibble + vint8m1_t q8_1 = __riscv_vle8_v_i8m1(q8+32, vl); + vint8m1_t q4_1 = __riscv_vreinterpret_v_u8m1_i8m1(__riscv_vsrl_vx_u8m1(q4_x, 0x04, vl)); + vint16m2_t qv_1 = __riscv_vwmul_vv_i16m2(q4_1, q8_1, vl); + vint16m1_t vs_1 = __riscv_vredsum_vs_i16m2_i16m1(qv_1, vzero, vl); -#elif defined __AVX__ + sum_2 += __riscv_vmv_x_s_i16m1_i16(vs_1) * scales[2*j+1]; - const __m128i m4 = _mm_set1_epi8(0xF); + q4 += 32; q8 += 64; - __m256 acc = _mm256_setzero_ps(); + } - float summs = 0; + sumf += d*(sum_1 + sum_2); - uint16_t aux16[2]; - const uint8_t * scales = (const uint8_t *)aux16; + } - for (int i = 0; i < nb; ++i) { + *s = sumf; - const float d = LM_GGML_FP16_TO_FP32(x[i].d[0]) * y[i].d; - const float m = LM_GGML_FP16_TO_FP32(x[i].d[1]) * y[i].d; - const __m256 vd = _mm256_set1_ps(d); +#elif defined(__POWER9_VECTOR__) + const vector signed char lowMask = vec_splats((signed char)0xF); + const vector unsigned char v4 = vec_splats((unsigned char)0x4); - const uint16_t * a = (const uint16_t 
*)x[i].scales; - aux16[0] = a[0] & 0x0f0f; - aux16[1] = (a[0] >> 4) & 0x0f0f; + vector float vsumf0 = vec_splats(0.0f); + vector float vsumf1 = vec_splats(0.0f); + vector float vsumf2 = vec_splats(0.0f); + vector float vsumf3 = vec_splats(0.0f); - summs += m * (scales[2] * (y[i].bsums[0] + y[i].bsums[1]) + scales[3] * (y[i].bsums[2] + y[i].bsums[3])); + for (int i = 0; i < nb; ++i) { + vector float vxd = vec_splats(LM_GGML_FP16_TO_FP32(x[i].d)); + vector float vyd = vec_splats(y[i].d); + vector float vd = vec_mul(vxd, vyd); - const uint8_t * restrict q4 = x[i].qs; - const int8_t * restrict q8 = y[i].qs; + vector float vxmin = vec_splats(LM_GGML_FP16_TO_FP32(x[i].dmin)); + vector float vdmin = vec_mul(vxmin, vyd); - const __m256i q4bits = _mm256_loadu_si256((const __m256i*)q4); - const __m128i q4bits_0 = _mm256_extractf128_si256(q4bits, 0); - const __m128i q4bits_1 = _mm256_extractf128_si256(q4bits, 1); - const __m128i q4_0 = _mm_and_si128(q4bits_0, m4); - const __m128i q4_1 = _mm_and_si128(q4bits_1, m4); - const __m128i q4_2 = _mm_and_si128(_mm_srli_epi16(q4bits_0, 4), m4); - const __m128i q4_3 = _mm_and_si128(_mm_srli_epi16(q4bits_1, 4), m4); + vector signed short q8ysums0 = vec_xl( 0, y[i].bsums); + vector signed short q8ysums1 = vec_xl(16, y[i].bsums); - const __m256i q8_0 = _mm256_loadu_si256((const __m256i*)(q8+ 0)); - const __m256i q8_1 = _mm256_loadu_si256((const __m256i*)(q8+32)); + memcpy(utmp, x[i].scales, 12); - const __m128i p16_0 = _mm_maddubs_epi16(q4_0, _mm256_extractf128_si256(q8_0, 0)); - const __m128i p16_1 = _mm_maddubs_epi16(q4_1, _mm256_extractf128_si256(q8_0, 1)); - const __m128i p16_2 = _mm_maddubs_epi16(q4_2, _mm256_extractf128_si256(q8_1, 0)); - const __m128i p16_3 = _mm_maddubs_epi16(q4_3, _mm256_extractf128_si256(q8_1, 1)); + utmp[3] = ((utmp[2] >> 4) & kmask2) | (((utmp[1] >> 6) & kmask3) << 4); + const uint32_t uaux = utmp[1] & kmask1; + utmp[1] = (utmp[2] & kmask2) | (((utmp[0] >> 6) & kmask3) << 4); + utmp[2] = uaux; + utmp[0] &= 
kmask1; - const __m128i p32_0 = _mm_madd_epi16(_mm_set1_epi16(scales[0]), p16_0); - const __m128i p32_1 = _mm_madd_epi16(_mm_set1_epi16(scales[0]), p16_1); - acc = _mm256_add_ps(_mm256_mul_ps(vd, _mm256_cvtepi32_ps(MM256_SET_M128I(p32_1, p32_0))), acc); + vector signed char utmps = (vector signed char)vec_xl( 0, utmp); + vector signed short vscales = vec_unpackh(utmps); + vector signed short q4xmins = vec_unpackl(utmps); + vector signed short q4xmins0 = vec_mergeh(q4xmins, q4xmins); + vector signed short q4xmins1 = vec_mergel(q4xmins, q4xmins); + + vector signed int prod0 = vec_mule(q4xmins0, q8ysums0); + vector signed int prod1 = vec_mule(q4xmins1, q8ysums1); + vector signed int prod2 = vec_mulo(q4xmins0, q8ysums0); + vector signed int prod3 = vec_mulo(q4xmins1, q8ysums1); + + vsumf0 = vec_nmsub(vec_ctf(prod0, 0), vdmin, vsumf0); + vsumf1 = vec_nmsub(vec_ctf(prod1, 0), vdmin, vsumf1); + vsumf2 = vec_nmsub(vec_ctf(prod2, 0), vdmin, vsumf2); + vsumf3 = vec_nmsub(vec_ctf(prod3, 0), vdmin, vsumf3); + + vector signed int vsumi0 = vec_splats((int32_t)0); + vector signed int vsumi1 = vec_splats((int32_t)0); + vector signed int vsumi2 = vec_splats((int32_t)0); + vector signed int vsumi3 = vec_splats((int32_t)0); + vector signed int vsumi4 = vec_splats((int32_t)0); + vector signed int vsumi5 = vec_splats((int32_t)0); + vector signed int vsumi6 = vec_splats((int32_t)0); + vector signed int vsumi7 = vec_splats((int32_t)0); - const __m128i p32_2 = _mm_madd_epi16(_mm_set1_epi16(scales[1]), p16_2); - const __m128i p32_3 = _mm_madd_epi16(_mm_set1_epi16(scales[1]), p16_3); - acc = _mm256_add_ps(_mm256_mul_ps(vd, _mm256_cvtepi32_ps(MM256_SET_M128I(p32_3, p32_2))), acc); + const uint8_t * restrict q4 = x[i].qs; + const int8_t * restrict q8 = y[i].qs; - } + for (int j = 0; j < QK_K/64; j+=2) { + __builtin_prefetch(q4, 0, 1); + __builtin_prefetch(q8, 0, 1); - *s = hsum_float_8(acc) - summs; + vector signed char qxs0 = (vector signed char)vec_xl( 0, q4); + vector signed char qxs1 = 
(vector signed char)vec_xl(16, q4); + vector signed char qxs2 = (vector signed char)vec_xl(32, q4); + vector signed char qxs3 = (vector signed char)vec_xl(48, q4); + q4 += 64; -#elif defined __riscv_v_intrinsic + vector signed char q4x00 = vec_and(qxs0, lowMask); + vector signed char q4x01 = vec_sr(qxs0, v4); + vector signed char q4x10 = vec_and(qxs1, lowMask); + vector signed char q4x11 = vec_sr(qxs1, v4); + vector signed char q4x20 = vec_and(qxs2, lowMask); + vector signed char q4x21 = vec_sr(qxs2, v4); + vector signed char q4x30 = vec_and(qxs3, lowMask); + vector signed char q4x31 = vec_sr(qxs3, v4); + + vector signed char q8y00 = vec_xl( 0, q8); + vector signed char q8y10 = vec_xl( 16, q8); + vector signed char q8y01 = vec_xl( 32, q8); + vector signed char q8y11 = vec_xl( 48, q8); + vector signed char q8y20 = vec_xl( 64, q8); + vector signed char q8y30 = vec_xl( 80, q8); + vector signed char q8y21 = vec_xl( 96, q8); + vector signed char q8y31 = vec_xl(112, q8); + q8 += 128; + + vector signed short qv00 = vec_add(vec_mule(q4x00, q8y00), vec_mulo(q4x00, q8y00)); + vector signed short qv01 = vec_add(vec_mule(q4x01, q8y01), vec_mulo(q4x01, q8y01)); + vector signed short qv10 = vec_add(vec_mule(q4x10, q8y10), vec_mulo(q4x10, q8y10)); + vector signed short qv11 = vec_add(vec_mule(q4x11, q8y11), vec_mulo(q4x11, q8y11)); + vector signed short qv20 = vec_add(vec_mule(q4x20, q8y20), vec_mulo(q4x20, q8y20)); + vector signed short qv21 = vec_add(vec_mule(q4x21, q8y21), vec_mulo(q4x21, q8y21)); + vector signed short qv30 = vec_add(vec_mule(q4x30, q8y30), vec_mulo(q4x30, q8y30)); + vector signed short qv31 = vec_add(vec_mule(q4x31, q8y31), vec_mulo(q4x31, q8y31)); + + vector signed short vs0 = vec_splat(vscales, 0); + vector signed short vs1 = vec_splat(vscales, 1); + vector signed short vs2 = vec_splat(vscales, 2); + vector signed short vs3 = vec_splat(vscales, 3); + vscales = vec_sld(vscales, vscales, 8); + + qv00 = vec_add(qv00, qv10); + qv10 = vec_add(qv01, qv11); + qv20 
= vec_add(qv20, qv30); + qv30 = vec_add(qv21, qv31); + + vsumi0 = vec_add(vec_mule(qv00, vs0), vsumi0); + vsumi1 = vec_add(vec_mulo(qv00, vs0), vsumi1); + vsumi2 = vec_add(vec_mule(qv10, vs1), vsumi2); + vsumi3 = vec_add(vec_mulo(qv10, vs1), vsumi3); + vsumi4 = vec_add(vec_mule(qv20, vs2), vsumi4); + vsumi5 = vec_add(vec_mulo(qv20, vs2), vsumi5); + vsumi6 = vec_add(vec_mule(qv30, vs3), vsumi6); + vsumi7 = vec_add(vec_mulo(qv30, vs3), vsumi7); + } + + vsumi0 = vec_add(vsumi0, vsumi4); + vsumi1 = vec_add(vsumi1, vsumi5); + vsumi2 = vec_add(vsumi2, vsumi6); + vsumi3 = vec_add(vsumi3, vsumi7); + + vsumf0 = vec_madd(vec_ctf(vsumi0, 0), vd, vsumf0); + vsumf1 = vec_madd(vec_ctf(vsumi1, 0), vd, vsumf1); + vsumf2 = vec_madd(vec_ctf(vsumi2, 0), vd, vsumf2); + vsumf3 = vec_madd(vec_ctf(vsumi3, 0), vd, vsumf3); + } + + vsumf0 = vec_add(vsumf0, vsumf2); + vsumf1 = vec_add(vsumf1, vsumf3); + + vsumf0 = vec_add(vsumf0, vsumf1); + + vsumf0 = vec_add(vsumf0, vec_sld(vsumf0, vsumf0, 4)); + vsumf0 = vec_add(vsumf0, vec_sld(vsumf0, vsumf0, 8)); + + *s = vec_extract(vsumf0, 0); + +#elif defined __loongarch_asx + + const __m256i m4 = __lasx_xvreplgr2vr_b(0xF); + + __m256 acc = (__m256)__lasx_xvldi(0); + __m128 acc_m = (__m128)__lsx_vldi(0); - uint16_t s16[2]; - const uint8_t * restrict scales = (const uint8_t *)s16; + for (int i = 0; i < nb; ++i) { - float sumf = 0; + const float d = y[i].d * LM_GGML_FP16_TO_FP32(x[i].d); + const float dmin = -y[i].d * LM_GGML_FP16_TO_FP32(x[i].dmin); - for (int i = 0; i < nb; ++i) { + memcpy(utmp, x[i].scales, 12); const uint8_t * restrict q4 = x[i].qs; - const int8_t * restrict q8 = y[i].qs; + const int8_t * restrict q8 = y[i].qs; - const uint16_t * restrict b = (const uint16_t *)x[i].scales; - s16[0] = b[0] & 0x0f0f; - s16[1] = (b[0] >> 4) & 0x0f0f; + const __m256i mins_and_scales = lasx_extu8_16(lsx_set_w(utmp[3], utmp[2], utmp[1], utmp[0])); - sumf -= y[i].d * LM_GGML_FP16_TO_FP32(x[i].d[1]) * (scales[2] * (y[i].bsums[0] + y[i].bsums[1]) + 
scales[3] * (y[i].bsums[2] + y[i].bsums[3])); - const float d = y[i].d * LM_GGML_FP16_TO_FP32(x[i].d[0]); + const __m256i q8sums = __lasx_xvld((const __m256i*)y[i].bsums, 0); + const __m128i q8s = lsx_hadd_h(lasx_extracti128(q8sums, 0), lasx_extracti128(q8sums, 1)); + const __m128i prod = lsx_madd_h(lasx_extracti128(mins_and_scales, 1), q8s); + acc_m = __lsx_vfmadd_s(__lsx_vreplfr2vr_s(dmin), __lsx_vffint_s_w(prod), acc_m); - size_t vl = 32; + const __m128i sc128 = lasx_extracti128(mins_and_scales, 0); + const __m256i scales = lasx_insertf128(sc128, sc128); - vint16m1_t vzero = __riscv_vmv_v_x_i16m1(0, 1); + __m256i sumi = __lasx_xvldi(0); + + for (int j = 0; j < QK_K/64; ++j) { - // load Q4 - vuint8m1_t q4_x = __riscv_vle8_v_u8m1(q4, vl); + const __m256i scale_l = lasx_shuffle_b(scales, get_scale_shuffle_k4(2*j+0)); + const __m256i scale_h = lasx_shuffle_b(scales, get_scale_shuffle_k4(2*j+1)); - // load Q8 and multiply it with lower Q4 nibble - vint8m1_t q4_a = __riscv_vreinterpret_v_u8m1_i8m1(__riscv_vand_vx_u8m1(q4_x, 0x0F, vl)); - vint16m2_t va_0 = __riscv_vwmul_vv_i16m2(q4_a, __riscv_vle8_v_i8m1(q8, vl), vl); - vint16m1_t aux1 = __riscv_vredsum_vs_i16m2_i16m1(va_0, vzero, vl); + const __m256i q4bits = __lasx_xvld((const __m256i*)q4, 0); q4 += 32; + const __m256i q4l = __lasx_xvand_v(q4bits, m4); + const __m256i q4h = __lasx_xvand_v(__lasx_xvsrli_h(q4bits, 4), m4); - sumf += d*scales[0]*__riscv_vmv_x_s_i16m1_i16(aux1); + const __m256i q8l = __lasx_xvld((const __m256i*)q8, 0); q8 += 32; + __m256i p16l = lasx_maddubs_h(q4l, q8l); + p16l = lasx_madd_h(scale_l, p16l); - // load Q8 and multiply it with upper Q4 nibble - vint8m1_t q4_s = __riscv_vreinterpret_v_u8m1_i8m1(__riscv_vsrl_vx_u8m1(q4_x, 0x04, vl)); - vint16m2_t va_1 = __riscv_vwmul_vv_i16m2(q4_s, __riscv_vle8_v_i8m1(q8+32, vl), vl); - vint16m1_t aux2 = __riscv_vredsum_vs_i16m2_i16m1(va_1, vzero, vl); + const __m256i q8h = __lasx_xvld((const __m256i*)q8, 0); q8 += 32; + __m256i p16h = lasx_maddubs_h(q4h, 
q8h); + p16h = lasx_madd_h(scale_h, p16h); + const __m256i sumj = __lasx_xvadd_w(p16l, p16h); - sumf += d*scales[1]*__riscv_vmv_x_s_i16m1_i16(aux2); + sumi = __lasx_xvadd_w(sumi, sumj); + } + __m256 vd = __lasx_xvreplfr2vr_s(d); + acc = __lasx_xvfmadd_s(vd, __lasx_xvffint_s_w(sumi), acc); } - *s = sumf; + acc_m = __lsx_vfadd_s(acc_m, (__m128)__lsx_vpermi_w((__m128i)acc_m, (__m128i)acc_m, 0xee)); + __m128i tmp1 = __lsx_vinsgr2vr_w(__lsx_vldi(0), __lsx_vpickve2gr_w((__m128i)acc_m, 1), 0); + acc_m = __lsx_vfadd_s(acc_m, (__m128)tmp1); + + ft_union fi; + fi.i = __lsx_vpickve2gr_w(acc_m, 0); + *s = hsum_float_8(acc) + fi.f ; #else - uint8_t aux8[QK_K]; - int16_t aux16[16]; + const uint8_t * scales = (const uint8_t*)&utmp[0]; + const uint8_t * mins = (const uint8_t*)&utmp[2]; + + int8_t aux8[QK_K]; + int16_t aux16[8]; float sums [8]; + int32_t aux32[8]; memset(sums, 0, 8*sizeof(float)); - uint16_t s16[2]; - const uint8_t * restrict scales = (const uint8_t *)s16; - float sumf = 0; for (int i = 0; i < nb; ++i) { const uint8_t * restrict q4 = x[i].qs; const int8_t * restrict q8 = y[i].qs; - uint8_t * restrict a = aux8; - for (int l = 0; l < 32; ++l) a[l+ 0] = q4[l] & 0xF; - for (int l = 0; l < 32; ++l) a[l+32] = q4[l] >> 4; - - const uint16_t * restrict b = (const uint16_t *)x[i].scales; - s16[0] = b[0] & 0x0f0f; - s16[1] = (b[0] >> 4) & 0x0f0f; - - sumf -= y[i].d * LM_GGML_FP16_TO_FP32(x[i].d[1]) * (scales[2] * (y[i].bsums[0] + y[i].bsums[1]) + scales[3] * (y[i].bsums[2] + y[i].bsums[3])); - - const float d = y[i].d * LM_GGML_FP16_TO_FP32(x[i].d[0]); + memset(aux32, 0, 8*sizeof(int32_t)); + int8_t * restrict a = aux8; + for (int j = 0; j < QK_K/64; ++j) { + for (int l = 0; l < 32; ++l) a[l] = (int8_t)(q4[l] & 0xF); + a += 32; + for (int l = 0; l < 32; ++l) a[l] = (int8_t)(q4[l] >> 4); + a += 32; q4 += 32; + } + memcpy(utmp, x[i].scales, 12); + utmp[3] = ((utmp[2] >> 4) & kmask2) | (((utmp[1] >> 6) & kmask3) << 4); + const uint32_t uaux = utmp[1] & kmask1; + utmp[1] = 
(utmp[2] & kmask2) | (((utmp[0] >> 6) & kmask3) << 4); + utmp[2] = uaux; + utmp[0] &= kmask1; + int sumi = 0; + for (int j = 0; j < QK_K/16; ++j) sumi += y[i].bsums[j] * mins[j/2]; + a = aux8; + int is = 0; for (int j = 0; j < QK_K/32; ++j) { - for (int l = 0; l < 16; ++l) aux16[l] = q8[l] * a[l]; - q8 += 16; a += 16; - for (int l = 0; l < 16; ++l) aux16[l] += q8[l] * a[l]; - q8 += 16; a += 16; - const float dl = d * scales[j]; - for (int l = 0; l < 8; ++l) sums[l] += dl * (aux16[l] + aux16[l+8]); + int32_t scale = scales[is++]; + for (int l = 0; l < 8; ++l) aux16[l] = q8[l] * a[l]; + for (int l = 0; l < 8; ++l) aux32[l] += scale * aux16[l]; + q8 += 8; a += 8; + for (int l = 0; l < 8; ++l) aux16[l] = q8[l] * a[l]; + for (int l = 0; l < 8; ++l) aux32[l] += scale * aux16[l]; + q8 += 8; a += 8; + for (int l = 0; l < 8; ++l) aux16[l] = q8[l] * a[l]; + for (int l = 0; l < 8; ++l) aux32[l] += scale * aux16[l]; + q8 += 8; a += 8; + for (int l = 0; l < 8; ++l) aux16[l] = q8[l] * a[l]; + for (int l = 0; l < 8; ++l) aux32[l] += scale * aux16[l]; + q8 += 8; a += 8; } + const float d = LM_GGML_FP16_TO_FP32(x[i].d) * y[i].d; + for (int l = 0; l < 8; ++l) sums[l] += d * aux32[l]; + const float dmin = LM_GGML_FP16_TO_FP32(x[i].dmin) * y[i].d; + sumf -= dmin * sumi; } for (int l = 0; l < 8; ++l) sumf += sums[l]; *s = sumf; #endif } -#endif -#if QK_K == 256 void lm_ggml_vec_dot_q5_K_q8_K(int n, float * restrict s, size_t bs, const void * restrict vx, size_t bx, const void * restrict vy, size_t by, int nrc) { assert(n % QK_K == 0); assert(nrc == 1); @@ -6958,12 +7616,10 @@ void lm_ggml_vec_dot_q5_K_q8_K(int n, float * restrict s, size_t bs, const void float summs = 0.f; - for (int i = 0; i < nb; ++i) { - + for (int i = 0; i < nb; ++i) { const uint8_t * restrict q5 = x[i].qs; const int8_t * restrict q8 = y[i].qs; -#if QK_K == 256 const float d = y[i].d * LM_GGML_FP16_TO_FP32(x[i].d); const float dmin = -y[i].d * LM_GGML_FP16_TO_FP32(x[i].dmin); @@ -6973,10 +7629,6 @@ void 
lm_ggml_vec_dot_q5_K_q8_K(int n, float * restrict s, size_t bs, const void utmp[1] = (utmp[2] & kmask2) | (((utmp[0] >> 6) & kmask3) << 4); utmp[2] = uaux; utmp[0] &= kmask1; -#else - // TODO - const float d = 0, dmin = 0; -#endif const __m256i mins_and_scales = _mm256_cvtepu8_epi16(_mm_set_epi32(utmp[3], utmp[2], utmp[1], utmp[0])); @@ -7220,308 +7872,212 @@ void lm_ggml_vec_dot_q5_K_q8_K(int n, float * restrict s, size_t bs, const void *s = sumf+sums; -#else - - const uint8_t * scales = (const uint8_t*)&utmp[0]; - const uint8_t * mins = (const uint8_t*)&utmp[2]; - - int8_t aux8[QK_K]; - int16_t aux16[8]; - float sums [8]; - int32_t aux32[8]; - memset(sums, 0, 8*sizeof(float)); - - float sumf = 0; - for (int i = 0; i < nb; ++i) { - const uint8_t * restrict q4 = x[i].qs; - const uint8_t * restrict hm = x[i].qh; - const int8_t * restrict q8 = y[i].qs; - memset(aux32, 0, 8*sizeof(int32_t)); - int8_t * restrict a = aux8; - uint8_t m = 1; - for (int j = 0; j < QK_K/64; ++j) { - for (int l = 0; l < 32; ++l) a[l] = (int8_t)(q4[l] & 0xF); - for (int l = 0; l < 32; ++l) a[l] += (hm[l] & m ? 16 : 0); - a += 32; m <<= 1; - for (int l = 0; l < 32; ++l) a[l] = (int8_t)(q4[l] >> 4); - for (int l = 0; l < 32; ++l) a[l] += (hm[l] & m ? 
16 : 0); - a += 32; m <<= 1; - q4 += 32; - } - memcpy(utmp, x[i].scales, 12); - utmp[3] = ((utmp[2] >> 4) & kmask2) | (((utmp[1] >> 6) & kmask3) << 4); - const uint32_t uaux = utmp[1] & kmask1; - utmp[1] = (utmp[2] & kmask2) | (((utmp[0] >> 6) & kmask3) << 4); - utmp[2] = uaux; - utmp[0] &= kmask1; - - int sumi = 0; - for (int j = 0; j < QK_K/16; ++j) sumi += y[i].bsums[j] * mins[j/2]; - a = aux8; - int is = 0; - for (int j = 0; j < QK_K/32; ++j) { - int32_t scale = scales[is++]; - for (int l = 0; l < 8; ++l) aux16[l] = q8[l] * a[l]; - for (int l = 0; l < 8; ++l) aux32[l] += scale * aux16[l]; - q8 += 8; a += 8; - for (int l = 0; l < 8; ++l) aux16[l] = q8[l] * a[l]; - for (int l = 0; l < 8; ++l) aux32[l] += scale * aux16[l]; - q8 += 8; a += 8; - for (int l = 0; l < 8; ++l) aux16[l] = q8[l] * a[l]; - for (int l = 0; l < 8; ++l) aux32[l] += scale * aux16[l]; - q8 += 8; a += 8; - for (int l = 0; l < 8; ++l) aux16[l] = q8[l] * a[l]; - for (int l = 0; l < 8; ++l) aux32[l] += scale * aux16[l]; - q8 += 8; a += 8; - } - const float d = LM_GGML_FP16_TO_FP32(x[i].d) * y[i].d; - for (int l = 0; l < 8; ++l) sums[l] += d * aux32[l]; - const float dmin = LM_GGML_FP16_TO_FP32(x[i].dmin) * y[i].d; - sumf -= dmin * sumi; - } - for (int l = 0; l < 8; ++l) sumf += sums[l]; - *s = sumf; -#endif -} - -#else - -void lm_ggml_vec_dot_q5_K_q8_K(int n, float * restrict s, size_t bs, const void * restrict vx, size_t bx, const void * restrict vy, size_t by, int nrc) { - assert(n % QK_K == 0); - assert(nrc == 1); - UNUSED(nrc); - UNUSED(bx); - UNUSED(by); - UNUSED(bs); - - const block_q5_K * restrict x = vx; - const block_q8_K * restrict y = vy; - - const int nb = n / QK_K; - -#ifdef __ARM_NEON - const uint8x16_t m4b = vdupq_n_u8(0xf); - const uint8x16_t mh = vdupq_n_u8(16); - const int32x4_t mzero = vdupq_n_s32(0); - - lm_ggml_int8x16x4_t q5bytes; - lm_ggml_uint8x16x4_t q5h; - - float sumf = 0; - - for (int i = 0; i < nb; ++i) { - - const float d = y[i].d * LM_GGML_FP16_TO_FP32(x[i].d); - 
const int8_t * sc = x[i].scales; - - const uint8_t * restrict q5 = x[i].qs; - const uint8_t * restrict qh = x[i].qh; - const int8_t * restrict q8 = y[i].qs; - - const uint8x8_t qhbits = vld1_u8(qh); - - const lm_ggml_uint8x16x2_t q5bits = lm_ggml_vld1q_u8_x2(q5); - const lm_ggml_int8x16x4_t q8bytes = lm_ggml_vld1q_s8_x4(q8); - - const uint8x16_t htmp = vcombine_u8(qhbits, vshr_n_u8(qhbits, 1)); - q5h.val[0] = vbicq_u8(mh, vshlq_n_u8(htmp, 4)); - q5h.val[1] = vbicq_u8(mh, vshlq_n_u8(htmp, 2)); - q5h.val[2] = vbicq_u8(mh, htmp); - q5h.val[3] = vbicq_u8(mh, vshrq_n_u8(htmp, 2)); - - q5bytes.val[0] = vsubq_s8(vreinterpretq_s8_u8(vandq_u8(q5bits.val[0], m4b)), vreinterpretq_s8_u8(q5h.val[0])); - q5bytes.val[1] = vsubq_s8(vreinterpretq_s8_u8(vandq_u8(q5bits.val[1], m4b)), vreinterpretq_s8_u8(q5h.val[1])); - q5bytes.val[2] = vsubq_s8(vreinterpretq_s8_u8(vshrq_n_u8(q5bits.val[0], 4)), vreinterpretq_s8_u8(q5h.val[2])); - q5bytes.val[3] = vsubq_s8(vreinterpretq_s8_u8(vshrq_n_u8(q5bits.val[1], 4)), vreinterpretq_s8_u8(q5h.val[3])); - - int32_t sumi1 = sc[0] * vaddvq_s32(lm_ggml_vdotq_s32(mzero, q5bytes.val[0], q8bytes.val[0])); - int32_t sumi2 = sc[1] * vaddvq_s32(lm_ggml_vdotq_s32(mzero, q5bytes.val[1], q8bytes.val[1])); - int32_t sumi3 = sc[2] * vaddvq_s32(lm_ggml_vdotq_s32(mzero, q5bytes.val[2], q8bytes.val[2])); - int32_t sumi4 = sc[3] * vaddvq_s32(lm_ggml_vdotq_s32(mzero, q5bytes.val[3], q8bytes.val[3])); - - sumf += d * (sumi1 + sumi2 + sumi3 + sumi4); - } - - *s = sumf; - -#elif defined __AVX2__ - - const __m256i m4 = _mm256_set1_epi8(0xF); - const __m256i mone = _mm256_set1_epi8(1); - - __m256 acc = _mm256_setzero_ps(); - - for (int i = 0; i < nb; ++i) { - - const uint8_t * restrict q5 = x[i].qs; - const int8_t * restrict q8 = y[i].qs; - - const float d = y[i].d * LM_GGML_FP16_TO_FP32(x[i].d); - - const __m256i q5bits = _mm256_loadu_si256((const __m256i*)q5); - - const __m256i scale_l = MM256_SET_M128I(_mm_set1_epi16(x[i].scales[1]), _mm_set1_epi16(x[i].scales[0])); - 
const __m256i scale_h = MM256_SET_M128I(_mm_set1_epi16(x[i].scales[3]), _mm_set1_epi16(x[i].scales[2])); - - int64_t aux64; - memcpy(&aux64, x[i].qh, 8); - const __m128i haux128 = _mm_set_epi64x(aux64 >> 1, aux64); - const __m256i haux256 = MM256_SET_M128I(_mm_srli_epi16(haux128, 2), haux128); +#elif defined(__POWER9_VECTOR__) + const vector signed char lowMask = vec_splats((signed char)0xF); + const vector unsigned char v1 = vec_splats((unsigned char)0x1); + const vector unsigned char v2 = vec_splats((unsigned char)0x2); + const vector unsigned char v3 = vec_splats((unsigned char)0x3); + const vector unsigned char v4 = vec_splats((unsigned char)0x4); - const __m256i q5h_0 = _mm256_slli_epi16(_mm256_andnot_si256(haux256, mone), 4); - const __m256i q5h_1 = _mm256_slli_epi16(_mm256_andnot_si256(_mm256_srli_epi16(haux256, 4), mone), 4); + vector float vsumf0 = vec_splats(0.0f); + vector float vsumf1 = vec_splats(0.0f); + vector float vsumf2 = vec_splats(0.0f); + vector float vsumf3 = vec_splats(0.0f); - const __m256i q5l_0 = _mm256_and_si256(q5bits, m4); - const __m256i q5l_1 = _mm256_and_si256(_mm256_srli_epi16(q5bits, 4), m4); + for (int i = 0; i < nb; ++i) { + vector float vxd = vec_splats(LM_GGML_FP16_TO_FP32(x[i].d)); + vector float vyd = vec_splats(y[i].d); + vector float vd = vec_mul(vxd, vyd); - const __m256i q8_0 = _mm256_loadu_si256((const __m256i*)(q8+ 0)); - const __m256i q8_1 = _mm256_loadu_si256((const __m256i*)(q8+32)); + vector float vxmin = vec_splats(LM_GGML_FP16_TO_FP32(x[i].dmin)); + vector float vdmin = vec_mul(vxmin, vyd); - const __m256i p16_0 = _mm256_madd_epi16(scale_l, _mm256_maddubs_epi16(q5l_0, q8_0)); - const __m256i p16_1 = _mm256_madd_epi16(scale_h, _mm256_maddubs_epi16(q5l_1, q8_1)); - const __m256i s16_0 = _mm256_madd_epi16(scale_l, _mm256_maddubs_epi16(q5h_0, q8_0)); - const __m256i s16_1 = _mm256_madd_epi16(scale_h, _mm256_maddubs_epi16(q5h_1, q8_1)); + memcpy(utmp, x[i].scales, 12); - const __m256i dot = 
_mm256_sub_epi32(_mm256_add_epi32(p16_0, p16_1), _mm256_add_epi32(s16_0, s16_1)); + utmp[3] = ((utmp[2] >> 4) & kmask2) | (((utmp[1] >> 6) & kmask3) << 4); + const uint32_t uaux = utmp[1] & kmask1; + utmp[1] = (utmp[2] & kmask2) | (((utmp[0] >> 6) & kmask3) << 4); + utmp[2] = uaux; + utmp[0] &= kmask1; - acc = _mm256_fmadd_ps(_mm256_set1_ps(d), _mm256_cvtepi32_ps(dot), acc); + vector signed short q8ysums0 = vec_xl( 0, y[i].bsums); + vector signed short q8ysums1 = vec_xl(16, y[i].bsums); - } + vector signed char utmps = (vector signed char)vec_xl( 0, utmp); + vector signed short vscales = vec_unpackh(utmps); - *s = hsum_float_8(acc); + vector signed short q5xmins = vec_unpackl(utmps); + vector signed short q5xmins0 = vec_mergeh(q5xmins, q5xmins); + vector signed short q5xmins1 = vec_mergel(q5xmins, q5xmins); -#elif defined __AVX__ + vector signed int prod0 = vec_mule(q5xmins0, q8ysums0); + vector signed int prod1 = vec_mule(q5xmins1, q8ysums1); + vector signed int prod2 = vec_mulo(q5xmins0, q8ysums0); + vector signed int prod3 = vec_mulo(q5xmins1, q8ysums1); - const __m128i m4 = _mm_set1_epi8(0xF); - const __m128i mone = _mm_set1_epi8(1); + vsumf0 = vec_nmsub(vec_ctf(prod0, 0), vdmin, vsumf0); + vsumf1 = vec_nmsub(vec_ctf(prod1, 0), vdmin, vsumf1); + vsumf2 = vec_nmsub(vec_ctf(prod2, 0), vdmin, vsumf2); + vsumf3 = vec_nmsub(vec_ctf(prod3, 0), vdmin, vsumf3); - __m256 acc = _mm256_setzero_ps(); + vector signed char qxhs0 = (vector signed char)vec_xl( 0, x[i].qh); + vector signed char qxhs1 = (vector signed char)vec_xl(16, x[i].qh); - for (int i = 0; i < nb; ++i) { + vector signed int vsumi0 = vec_splats((int32_t)0); + vector signed int vsumi1 = vec_splats((int32_t)0); + vector signed int vsumi2 = vec_splats((int32_t)0); + vector signed int vsumi3 = vec_splats((int32_t)0); const uint8_t * restrict q5 = x[i].qs; const int8_t * restrict q8 = y[i].qs; - const float d = y[i].d * LM_GGML_FP16_TO_FP32(x[i].d); + for (int j = 0; j < QK_K/64; ++j) { + __builtin_prefetch(q5, 
0, 1); + __builtin_prefetch(q8, 0, 1); - const __m256i q5bits = _mm256_loadu_si256((const __m256i*)q5); + vector signed char qxs0 = (vector signed char)vec_xl( 0, q5); + vector signed char qxs1 = (vector signed char)vec_xl(16, q5); + q5 += 32; - const __m128i scale_0 = _mm_set1_epi16(x[i].scales[0]); - const __m128i scale_1 = _mm_set1_epi16(x[i].scales[1]); - const __m128i scale_2 = _mm_set1_epi16(x[i].scales[2]); - const __m128i scale_3 = _mm_set1_epi16(x[i].scales[3]); + vector signed char qxs00 = vec_and(qxs0, lowMask); + vector signed char qxs01 = vec_sr(qxs0, v4); + vector signed char qxs10 = vec_and(qxs1, lowMask); + vector signed char qxs11 = vec_sr(qxs1, v4); - int64_t aux64; - memcpy(&aux64, x[i].qh, 8); - const __m128i haux128_0 = _mm_set_epi64x(aux64 >> 1, aux64); - const __m128i haux128_1 = _mm_srli_epi16(haux128_0, 2); + vector signed char q5h00 = vec_sl(vec_and((vector signed char)v1, qxhs0), v4); + vector signed char q5h01 = vec_sl(vec_and((vector signed char)v2, qxhs0), v3); + vector signed char q5h10 = vec_sl(vec_and((vector signed char)v1, qxhs1), v4); + vector signed char q5h11 = vec_sl(vec_and((vector signed char)v2, qxhs1), v3); + qxhs0 = vec_sr(qxhs0, v2); + qxhs1 = vec_sr(qxhs1, v2); - const __m128i q5h_0 = _mm_slli_epi16(_mm_andnot_si128(haux128_0, mone), 4); - const __m128i q5h_1 = _mm_slli_epi16(_mm_andnot_si128(haux128_1, mone), 4); - const __m128i q5h_2 = _mm_slli_epi16(_mm_andnot_si128(_mm_srli_epi16(haux128_0, 4), mone), 4); - const __m128i q5h_3 = _mm_slli_epi16(_mm_andnot_si128(_mm_srli_epi16(haux128_1, 4), mone), 4); + vector signed char q5x00 = vec_or(q5h00, qxs00); + vector signed char q5x01 = vec_or(q5h01, qxs01); + vector signed char q5x10 = vec_or(q5h10, qxs10); + vector signed char q5x11 = vec_or(q5h11, qxs11); - const __m128i q5l_0 = _mm_and_si128(_mm256_extractf128_si256(q5bits, 0), m4); - const __m128i q5l_1 = _mm_and_si128(_mm256_extractf128_si256(q5bits, 1), m4); - const __m128i q5l_2 = 
_mm_and_si128(_mm_srli_epi16(_mm256_extractf128_si256(q5bits, 0), 4), m4); - const __m128i q5l_3 = _mm_and_si128(_mm_srli_epi16(_mm256_extractf128_si256(q5bits, 1), 4), m4); + vector signed char q8y00 = vec_xl( 0, q8); + vector signed char q8y10 = vec_xl(16, q8); + vector signed char q8y01 = vec_xl(32, q8); + vector signed char q8y11 = vec_xl(48, q8); + q8 += 64; - const __m256i q8_0 = _mm256_loadu_si256((const __m256i*)(q8+ 0)); - const __m256i q8_1 = _mm256_loadu_si256((const __m256i*)(q8+32)); + vector signed short qv00 = vec_add(vec_mule(q5x00, q8y00), vec_mulo(q5x00, q8y00)); + vector signed short qv01 = vec_add(vec_mule(q5x01, q8y01), vec_mulo(q5x01, q8y01)); + vector signed short qv10 = vec_add(vec_mule(q5x10, q8y10), vec_mulo(q5x10, q8y10)); + vector signed short qv11 = vec_add(vec_mule(q5x11, q8y11), vec_mulo(q5x11, q8y11)); - const __m128i p16_0 = _mm_madd_epi16(scale_0, _mm_maddubs_epi16(q5l_0, _mm256_extractf128_si256(q8_0, 0))); - const __m128i p16_1 = _mm_madd_epi16(scale_1, _mm_maddubs_epi16(q5l_1, _mm256_extractf128_si256(q8_0, 1))); - const __m128i p16_2 = _mm_madd_epi16(scale_2, _mm_maddubs_epi16(q5l_2, _mm256_extractf128_si256(q8_1, 0))); - const __m128i p16_3 = _mm_madd_epi16(scale_3, _mm_maddubs_epi16(q5l_3, _mm256_extractf128_si256(q8_1, 1))); - const __m128i s16_0 = _mm_madd_epi16(scale_0, _mm_maddubs_epi16(q5h_0, _mm256_extractf128_si256(q8_0, 0))); - const __m128i s16_1 = _mm_madd_epi16(scale_1, _mm_maddubs_epi16(q5h_1, _mm256_extractf128_si256(q8_0, 1))); - const __m128i s16_2 = _mm_madd_epi16(scale_2, _mm_maddubs_epi16(q5h_2, _mm256_extractf128_si256(q8_1, 0))); - const __m128i s16_3 = _mm_madd_epi16(scale_3, _mm_maddubs_epi16(q5h_3, _mm256_extractf128_si256(q8_1, 1))); + vector signed short vs0 = vec_splat(vscales, 0); + vector signed short vs1 = vec_splat(vscales, 1); + vscales = vec_sld(vscales, vscales, 12); - const __m128i dot_0 = _mm_sub_epi32(_mm_add_epi32(p16_0, p16_2), _mm_add_epi32(s16_0, s16_2)); - const __m128i dot_1 = 
_mm_sub_epi32(_mm_add_epi32(p16_1, p16_3), _mm_add_epi32(s16_1, s16_3)); + qv00 = vec_add(qv00, qv10); + qv01 = vec_add(qv01, qv11); - acc = _mm256_add_ps(_mm256_mul_ps(_mm256_set1_ps(d), _mm256_cvtepi32_ps(MM256_SET_M128I(dot_1, dot_0))), acc); + vsumi0 = vec_add(vec_mule(qv00, vs0), vsumi0); + vsumi1 = vec_add(vec_mulo(qv00, vs0), vsumi1); + vsumi2 = vec_add(vec_mule(qv01, vs1), vsumi2); + vsumi3 = vec_add(vec_mulo(qv01, vs1), vsumi3); + } + vsumf0 = vec_madd(vec_ctf(vsumi0, 0), vd, vsumf0); + vsumf1 = vec_madd(vec_ctf(vsumi1, 0), vd, vsumf1); + vsumf2 = vec_madd(vec_ctf(vsumi2, 0), vd, vsumf2); + vsumf3 = vec_madd(vec_ctf(vsumi3, 0), vd, vsumf3); } - *s = hsum_float_8(acc); + vsumf0 = vec_add(vsumf0, vsumf2); + vsumf1 = vec_add(vsumf1, vsumf3); -#elif defined __riscv_v_intrinsic + vsumf0 = vec_add(vsumf0, vsumf1); - float sumf = 0; + vsumf0 = vec_add(vsumf0, vec_sld(vsumf0, vsumf0, 4)); + vsumf0 = vec_add(vsumf0, vec_sld(vsumf0, vsumf0, 8)); - for (int i = 0; i < nb; ++i) { + *s = vec_extract(vsumf0, 0); - const float d = y[i].d * LM_GGML_FP16_TO_FP32(x[i].d); - const int8_t * sc = x[i].scales; +#elif defined __loongarch_asx + + const __m256i m4 = __lasx_xvreplgr2vr_b(0xF); + const __m128i mzero = __lsx_vldi(0); + const __m256i mone = __lasx_xvreplgr2vr_b(1); + + __m256 acc = (__m256)__lasx_xvldi(0); + + float summs = 0.f; + + for (int i = 0; i < nb; ++i) { const uint8_t * restrict q5 = x[i].qs; - const uint8_t * restrict qh = x[i].qh; const int8_t * restrict q8 = y[i].qs; - vint32m1_t vzero = __riscv_vmv_v_x_i32m1(0, 1); + const float d = y[i].d * LM_GGML_FP16_TO_FP32(x[i].d); + const float dmin = -y[i].d * LM_GGML_FP16_TO_FP32(x[i].dmin); - // load qh - vuint8mf4_t qh_x1 = __riscv_vle8_v_u8mf4(qh, 8); - vuint8mf2_t qh_x2 = __riscv_vlmul_ext_v_u8mf4_u8mf2(__riscv_vsrl_vx_u8mf4(qh_x1, 1, 8)); + memcpy(utmp, x[i].scales, 12); - size_t vl = 16; + const __m256i mins_and_scales = lasx_extu8_16(lsx_set_w(utmp[3], utmp[2], utmp[1], utmp[0])); + + const __m256i q8sums 
= __lasx_xvld((const __m256i*)y[i].bsums, 0); + const __m128i q8s = lsx_hadd_h(lasx_extracti128(q8sums, 0), lasx_extracti128(q8sums, 1)); + const __m128i prod = lsx_madd_h(lasx_extracti128(mins_and_scales, 1), q8s); + const __m128i hsum = lsx_hadd_w(lsx_hadd_w(prod, mzero), mzero); + summs += dmin * __lsx_vpickve2gr_w(hsum, 0); //TODO check + + const __m128i sc128 = lasx_extracti128(mins_and_scales, 0); + const __m256i scales = lasx_insertf128(sc128, sc128); + + const __m256i hbits = __lasx_xvld((const __m256i*)x[i].qh, 0); + __m256i hmask = mone; + + __m256i sumi = __lasx_xvldi(0); - // combine both qh_1 and qh_2 - vuint8mf2_t qh_x = __riscv_vslideup_vx_u8mf2(__riscv_vlmul_ext_v_u8mf4_u8mf2(qh_x1), qh_x2, vl/2, vl); + int bit = 0; - vuint8mf2_t qh_h0 = __riscv_vand_vx_u8mf2(__riscv_vnot_v_u8mf2(__riscv_vsll_vx_u8mf2(qh_x, 0x4, vl), vl), 16, vl); - vuint8mf2_t qh_h1 = __riscv_vand_vx_u8mf2(__riscv_vnot_v_u8mf2(__riscv_vsll_vx_u8mf2(qh_x, 0x2, vl), vl), 16, vl); - vuint8mf2_t qh_h2 = __riscv_vand_vx_u8mf2(__riscv_vnot_v_u8mf2(qh_x, vl), 16, vl); - vuint8mf2_t qh_h3 = __riscv_vand_vx_u8mf2(__riscv_vnot_v_u8mf2(__riscv_vsrl_vx_u8mf2(qh_x, 0x4, vl), vl), 16, vl); + for (int j = 0; j < QK_K/64; ++j) { - vint8mf2_t qh_0 = __riscv_vreinterpret_v_u8mf2_i8mf2(qh_h0); - vint8mf2_t qh_1 = __riscv_vreinterpret_v_u8mf2_i8mf2(qh_h1); - vint8mf2_t qh_2 = __riscv_vreinterpret_v_u8mf2_i8mf2(qh_h2); - vint8mf2_t qh_3 = __riscv_vreinterpret_v_u8mf2_i8mf2(qh_h3); + const __m256i scale_0 = lasx_shuffle_b(scales, get_scale_shuffle_k4(2*j+0)); + const __m256i scale_1 = lasx_shuffle_b(scales, get_scale_shuffle_k4(2*j+1)); - // load q5 - vuint8mf2_t q5_x1 = __riscv_vle8_v_u8mf2(q5, vl); - vuint8mf2_t q5_x2 = __riscv_vle8_v_u8mf2(q5+16, vl); + const __m256i q5bits = __lasx_xvld((const __m256i*)q5, 0); q5 += 32; - vint8mf2_t q5s_0 = __riscv_vreinterpret_v_u8mf2_i8mf2(__riscv_vand_vx_u8mf2(q5_x1, 0xF, vl)); - vint8mf2_t q5s_1 = __riscv_vreinterpret_v_u8mf2_i8mf2(__riscv_vand_vx_u8mf2(q5_x2, 
0xF, vl)); - vint8mf2_t q5s_2 = __riscv_vreinterpret_v_u8mf2_i8mf2(__riscv_vsrl_vx_u8mf2(q5_x1, 0x4, vl)); - vint8mf2_t q5s_3 = __riscv_vreinterpret_v_u8mf2_i8mf2(__riscv_vsrl_vx_u8mf2(q5_x2, 0x4, vl)); + const __m256i q5l_0 = __lasx_xvand_v(q5bits, m4); + const __m256i q5h_0 = __lasx_xvslli_h(__lasx_xvsrli_h(__lasx_xvand_v(hbits, hmask), bit++), 4); + const __m256i q5_0 = __lasx_xvadd_b(q5l_0, q5h_0); + hmask = __lasx_xvslli_h(hmask, 1); - vint8mf2_t q5_0 = __riscv_vsub_vv_i8mf2(q5s_0, qh_0, vl); - vint8mf2_t q5_1 = __riscv_vsub_vv_i8mf2(q5s_1, qh_1, vl); - vint8mf2_t q5_2 = __riscv_vsub_vv_i8mf2(q5s_2, qh_2, vl); - vint8mf2_t q5_3 = __riscv_vsub_vv_i8mf2(q5s_3, qh_3, vl); + const __m256i q5l_1 = __lasx_xvand_v(__lasx_xvsrli_h(q5bits, 4), m4); + const __m256i q5h_1 = __lasx_xvslli_h(__lasx_xvsrli_h(__lasx_xvand_v(hbits, hmask), bit++), 4); + const __m256i q5_1 = __lasx_xvadd_b(q5l_1, q5h_1); + hmask = __lasx_xvslli_h(hmask, 1); - // load Q8 and multiply it with Q5 - vint16m1_t p0 = __riscv_vwmul_vv_i16m1(q5_0, __riscv_vle8_v_i8mf2(q8, vl), vl); - vint16m1_t p1 = __riscv_vwmul_vv_i16m1(q5_1, __riscv_vle8_v_i8mf2(q8+16, vl), vl); - vint16m1_t p2 = __riscv_vwmul_vv_i16m1(q5_2, __riscv_vle8_v_i8mf2(q8+32, vl), vl); - vint16m1_t p3 = __riscv_vwmul_vv_i16m1(q5_3, __riscv_vle8_v_i8mf2(q8+48, vl), vl); + const __m256i q8_0 = __lasx_xvld((const __m256i*)q8, 0); q8 += 32; + const __m256i q8_1 = __lasx_xvld((const __m256i*)q8, 0); q8 += 32; - vint32m1_t vs_0 = __riscv_vwredsum_vs_i16m1_i32m1(p0, vzero, vl); - vint32m1_t vs_1 = __riscv_vwredsum_vs_i16m1_i32m1(p1, vzero, vl); - vint32m1_t vs_2 = __riscv_vwredsum_vs_i16m1_i32m1(p2, vzero, vl); - vint32m1_t vs_3 = __riscv_vwredsum_vs_i16m1_i32m1(p3, vzero, vl); + __m256i p16_0 = lasx_maddubs_h(q5_0, q8_0); + __m256i p16_1 = lasx_maddubs_h(q5_1, q8_1); - int32_t sumi1 = sc[0] * __riscv_vmv_x_s_i32m1_i32(vs_0); - int32_t sumi2 = sc[1] * __riscv_vmv_x_s_i32m1_i32(vs_1); - int32_t sumi3 = sc[2] * __riscv_vmv_x_s_i32m1_i32(vs_2); - 
int32_t sumi4 = sc[3] * __riscv_vmv_x_s_i32m1_i32(vs_3); + p16_0 = lasx_madd_h(scale_0, p16_0); + p16_1 = lasx_madd_h(scale_1, p16_1); - sumf += d * (sumi1 + sumi2 + sumi3 + sumi4); + sumi = __lasx_xvadd_w(sumi, __lasx_xvadd_w(p16_0, p16_1)); + } + __m256 vd = __lasx_xvreplfr2vr_s(d); + acc = __lasx_xvfmadd_s(vd, __lasx_xvffint_s_w(sumi), acc); } - *s = sumf; + *s = hsum_float_8(acc) + summs; #else - int8_t aux8[QK_K]; - int16_t aux16[16]; + const uint8_t * scales = (const uint8_t*)&utmp[0]; + const uint8_t * mins = (const uint8_t*)&utmp[2]; + + int8_t aux8[QK_K]; + int16_t aux16[8]; float sums [8]; + int32_t aux32[8]; memset(sums, 0, 8*sizeof(float)); float sumf = 0; @@ -7529,34 +8085,54 @@ void lm_ggml_vec_dot_q5_K_q8_K(int n, float * restrict s, size_t bs, const void const uint8_t * restrict q4 = x[i].qs; const uint8_t * restrict hm = x[i].qh; const int8_t * restrict q8 = y[i].qs; + memset(aux32, 0, 8*sizeof(int32_t)); int8_t * restrict a = aux8; - for (int l = 0; l < 32; ++l) { - a[l+ 0] = q4[l] & 0xF; - a[l+32] = q4[l] >> 4; - } - for (int is = 0; is < 8; ++is) { - uint8_t m = 1 << is; - for (int l = 0; l < 8; ++l) a[8*is + l] -= (hm[l] & m ? 0 : 16); + uint8_t m = 1; + for (int j = 0; j < QK_K/64; ++j) { + for (int l = 0; l < 32; ++l) a[l] = (int8_t)(q4[l] & 0xF); + for (int l = 0; l < 32; ++l) a[l] += (hm[l] & m ? 16 : 0); + a += 32; m <<= 1; + for (int l = 0; l < 32; ++l) a[l] = (int8_t)(q4[l] >> 4); + for (int l = 0; l < 32; ++l) a[l] += (hm[l] & m ? 
16 : 0); + a += 32; m <<= 1; + q4 += 32; } + memcpy(utmp, x[i].scales, 12); + utmp[3] = ((utmp[2] >> 4) & kmask2) | (((utmp[1] >> 6) & kmask3) << 4); + const uint32_t uaux = utmp[1] & kmask1; + utmp[1] = (utmp[2] & kmask2) | (((utmp[0] >> 6) & kmask3) << 4); + utmp[2] = uaux; + utmp[0] &= kmask1; - const float d = y[i].d * LM_GGML_FP16_TO_FP32(x[i].d); - const int8_t * restrict sc = x[i].scales; - - for (int j = 0; j < QK_K/16; ++j) { - const float dl = d * sc[j]; - for (int l = 0; l < 16; ++l) aux16[l] = q8[l] * a[l]; - for (int l = 0; l < 8; ++l) sums[l] += dl * (aux16[l] + aux16[8+l]); - q8 += 16; a += 16; + int sumi = 0; + for (int j = 0; j < QK_K/16; ++j) sumi += y[i].bsums[j] * mins[j/2]; + a = aux8; + int is = 0; + for (int j = 0; j < QK_K/32; ++j) { + int32_t scale = scales[is++]; + for (int l = 0; l < 8; ++l) aux16[l] = q8[l] * a[l]; + for (int l = 0; l < 8; ++l) aux32[l] += scale * aux16[l]; + q8 += 8; a += 8; + for (int l = 0; l < 8; ++l) aux16[l] = q8[l] * a[l]; + for (int l = 0; l < 8; ++l) aux32[l] += scale * aux16[l]; + q8 += 8; a += 8; + for (int l = 0; l < 8; ++l) aux16[l] = q8[l] * a[l]; + for (int l = 0; l < 8; ++l) aux32[l] += scale * aux16[l]; + q8 += 8; a += 8; + for (int l = 0; l < 8; ++l) aux16[l] = q8[l] * a[l]; + for (int l = 0; l < 8; ++l) aux32[l] += scale * aux16[l]; + q8 += 8; a += 8; } + const float d = LM_GGML_FP16_TO_FP32(x[i].d) * y[i].d; + for (int l = 0; l < 8; ++l) sums[l] += d * aux32[l]; + const float dmin = LM_GGML_FP16_TO_FP32(x[i].dmin) * y[i].d; + sumf -= dmin * sumi; } for (int l = 0; l < 8; ++l) sumf += sums[l]; *s = sumf; #endif } -#endif - -#if QK_K == 256 void lm_ggml_vec_dot_q6_K_q8_K(int n, float * restrict s, size_t bs, const void * restrict vx, size_t bx, const void * restrict vy, size_t by, int nrc) { assert(n % QK_K == 0); assert(nrc == 1); @@ -7893,365 +8469,282 @@ void lm_ggml_vec_dot_q6_K_q8_K(int n, float * restrict s, size_t bs, const void vuint8m1_t q6_0 = __riscv_vle8_v_u8m1(q6, vl); vuint8m1_t q6_1 = 
__riscv_vle8_v_u8m1(q6+32, vl); - vuint8m1_t q6a_0 = __riscv_vand_vx_u8m1(q6_0, 0x0F, vl); - vuint8m1_t q6a_1 = __riscv_vand_vx_u8m1(q6_1, 0x0F, vl); - vuint8m1_t q6s_0 = __riscv_vsrl_vx_u8m1(q6_0, 0x04, vl); - vuint8m1_t q6s_1 = __riscv_vsrl_vx_u8m1(q6_1, 0x04, vl); - - vuint8m1_t qh_0 = __riscv_vand_vx_u8m1(qh_x, 0x03, vl); - vuint8m1_t qh_1 = __riscv_vand_vx_u8m1(__riscv_vsrl_vx_u8m1(qh_x, 0x2, vl), 0x03 , vl); - vuint8m1_t qh_2 = __riscv_vand_vx_u8m1(__riscv_vsrl_vx_u8m1(qh_x, 0x4, vl), 0x03 , vl); - vuint8m1_t qh_3 = __riscv_vand_vx_u8m1(__riscv_vsrl_vx_u8m1(qh_x, 0x6, vl), 0x03 , vl); - - vuint8m1_t qhi_0 = __riscv_vor_vv_u8m1(q6a_0, __riscv_vsll_vx_u8m1(qh_0, 0x04, vl), vl); - vuint8m1_t qhi_1 = __riscv_vor_vv_u8m1(q6a_1, __riscv_vsll_vx_u8m1(qh_1, 0x04, vl), vl); - vuint8m1_t qhi_2 = __riscv_vor_vv_u8m1(q6s_0, __riscv_vsll_vx_u8m1(qh_2, 0x04, vl), vl); - vuint8m1_t qhi_3 = __riscv_vor_vv_u8m1(q6s_1, __riscv_vsll_vx_u8m1(qh_3, 0x04, vl), vl); - - vint8m1_t a_0 = __riscv_vsub_vx_i8m1(__riscv_vreinterpret_v_u8m1_i8m1(qhi_0), 32, vl); - vint8m1_t a_1 = __riscv_vsub_vx_i8m1(__riscv_vreinterpret_v_u8m1_i8m1(qhi_1), 32, vl); - vint8m1_t a_2 = __riscv_vsub_vx_i8m1(__riscv_vreinterpret_v_u8m1_i8m1(qhi_2), 32, vl); - vint8m1_t a_3 = __riscv_vsub_vx_i8m1(__riscv_vreinterpret_v_u8m1_i8m1(qhi_3), 32, vl); - - // load Q8 and take product - vint16m2_t va_q_0 = __riscv_vwmul_vv_i16m2(a_0, __riscv_vle8_v_i8m1(q8, vl), vl); - vint16m2_t va_q_1 = __riscv_vwmul_vv_i16m2(a_1, __riscv_vle8_v_i8m1(q8+32, vl), vl); - vint16m2_t va_q_2 = __riscv_vwmul_vv_i16m2(a_2, __riscv_vle8_v_i8m1(q8+64, vl), vl); - vint16m2_t va_q_3 = __riscv_vwmul_vv_i16m2(a_3, __riscv_vle8_v_i8m1(q8+96, vl), vl); - - vl = 16; - - vint32m2_t vaux_0 = __riscv_vwmul_vx_i32m2(__riscv_vget_v_i16m2_i16m1(va_q_0, 0), scale[is+0], vl); - vint32m2_t vaux_1 = __riscv_vwmul_vx_i32m2(__riscv_vget_v_i16m2_i16m1(va_q_0, 1), scale[is+1], vl); - vint32m2_t vaux_2 = __riscv_vwmul_vx_i32m2(__riscv_vget_v_i16m2_i16m1(va_q_1, 
0), scale[is+2], vl); - vint32m2_t vaux_3 = __riscv_vwmul_vx_i32m2(__riscv_vget_v_i16m2_i16m1(va_q_1, 1), scale[is+3], vl); - vint32m2_t vaux_4 = __riscv_vwmul_vx_i32m2(__riscv_vget_v_i16m2_i16m1(va_q_2, 0), scale[is+4], vl); - vint32m2_t vaux_5 = __riscv_vwmul_vx_i32m2(__riscv_vget_v_i16m2_i16m1(va_q_2, 1), scale[is+5], vl); - vint32m2_t vaux_6 = __riscv_vwmul_vx_i32m2(__riscv_vget_v_i16m2_i16m1(va_q_3, 0), scale[is+6], vl); - vint32m2_t vaux_7 = __riscv_vwmul_vx_i32m2(__riscv_vget_v_i16m2_i16m1(va_q_3, 1), scale[is+7], vl); - - vint32m1_t isum0 = __riscv_vredsum_vs_i32m2_i32m1(__riscv_vadd_vv_i32m2(vaux_0, vaux_1, vl), vzero, vl); - vint32m1_t isum1 = __riscv_vredsum_vs_i32m2_i32m1(__riscv_vadd_vv_i32m2(vaux_2, vaux_3, vl), isum0, vl); - vint32m1_t isum2 = __riscv_vredsum_vs_i32m2_i32m1(__riscv_vadd_vv_i32m2(vaux_4, vaux_5, vl), isum1, vl); - vint32m1_t isum3 = __riscv_vredsum_vs_i32m2_i32m1(__riscv_vadd_vv_i32m2(vaux_6, vaux_7, vl), isum2, vl); - - sum_t += __riscv_vmv_x_s_i32m1_i32(isum3); - - q6 += 64; qh += 32; q8 += 128; is=8; - - } - - sumf += d * sum_t; - - } - - *s = sumf; - -#else - - int8_t aux8[QK_K]; - int16_t aux16[8]; - float sums [8]; - int32_t aux32[8]; - memset(sums, 0, 8*sizeof(float)); - - float sumf = 0; - for (int i = 0; i < nb; ++i) { - const uint8_t * restrict q4 = x[i].ql; - const uint8_t * restrict qh = x[i].qh; - const int8_t * restrict q8 = y[i].qs; - memset(aux32, 0, 8*sizeof(int32_t)); - int8_t * restrict a = aux8; - for (int j = 0; j < QK_K; j += 128) { - for (int l = 0; l < 32; ++l) { - a[l + 0] = (int8_t)((q4[l + 0] & 0xF) | (((qh[l] >> 0) & 3) << 4)) - 32; - a[l + 32] = (int8_t)((q4[l + 32] & 0xF) | (((qh[l] >> 2) & 3) << 4)) - 32; - a[l + 64] = (int8_t)((q4[l + 0] >> 4) | (((qh[l] >> 4) & 3) << 4)) - 32; - a[l + 96] = (int8_t)((q4[l + 32] >> 4) | (((qh[l] >> 6) & 3) << 4)) - 32; - } - a += 128; - q4 += 64; - qh += 32; - } - a = aux8; - int is = 0; - for (int j = 0; j < QK_K/16; ++j) { - int scale = x[i].scales[is++]; - for (int l 
= 0; l < 8; ++l) aux16[l] = q8[l] * a[l]; - for (int l = 0; l < 8; ++l) aux32[l] += scale * aux16[l]; - q8 += 8; a += 8; - for (int l = 0; l < 8; ++l) aux16[l] = q8[l] * a[l]; - for (int l = 0; l < 8; ++l) aux32[l] += scale * aux16[l]; - q8 += 8; a += 8; - } - const float d = LM_GGML_FP16_TO_FP32(x[i].d) * y[i].d; - for (int l = 0; l < 8; ++l) sums[l] += d * aux32[l]; - } - for (int l = 0; l < 8; ++l) sumf += sums[l]; - *s = sumf; -#endif -} - -#else - -void lm_ggml_vec_dot_q6_K_q8_K(int n, float * restrict s, size_t bs, const void * restrict vx, size_t bx, const void * restrict vy, size_t by, int nrc) { - assert(n % QK_K == 0); - assert(nrc == 1); - UNUSED(nrc); - UNUSED(bx); - UNUSED(by); - UNUSED(bs); - - const block_q6_K * restrict x = vx; - const block_q8_K * restrict y = vy; - - const int nb = n / QK_K; - -#ifdef __ARM_NEON - float sum = 0; - - const uint8x16_t m4b = vdupq_n_u8(0xF); - const int8x16_t m32s = vdupq_n_s8(32); - const int32x4_t vzero = vdupq_n_s32(0); - - const uint8x16_t mone = vdupq_n_u8(3); - - lm_ggml_int8x16x4_t q6bytes; - lm_ggml_uint8x16x4_t q6h; - - for (int i = 0; i < nb; ++i) { - - const float d_all = LM_GGML_FP16_TO_FP32(x[i].d); - - const uint8_t * restrict q6 = x[i].ql; - const uint8_t * restrict qh = x[i].qh; - const int8_t * restrict q8 = y[i].qs; - - const int8_t * restrict scale = x[i].scales; - - int32_t isum = 0; - - uint8x16_t qhbits = vld1q_u8(qh); - lm_ggml_uint8x16x2_t q6bits = lm_ggml_vld1q_u8_x2(q6); - lm_ggml_int8x16x4_t q8bytes = lm_ggml_vld1q_s8_x4(q8); - - q6h.val[0] = vshlq_n_u8(vandq_u8(mone, qhbits), 4); - uint8x16_t shifted = vshrq_n_u8(qhbits, 2); - q6h.val[1] = vshlq_n_u8(vandq_u8(mone, shifted), 4); - shifted = vshrq_n_u8(qhbits, 4); - q6h.val[2] = vshlq_n_u8(vandq_u8(mone, shifted), 4); - shifted = vshrq_n_u8(qhbits, 6); - q6h.val[3] = vshlq_n_u8(vandq_u8(mone, shifted), 4); - - q6bytes.val[0] = vsubq_s8(vreinterpretq_s8_u8(vorrq_u8(vandq_u8(q6bits.val[0], m4b), q6h.val[0])), m32s); - q6bytes.val[1] = 
vsubq_s8(vreinterpretq_s8_u8(vorrq_u8(vandq_u8(q6bits.val[1], m4b), q6h.val[1])), m32s); - q6bytes.val[2] = vsubq_s8(vreinterpretq_s8_u8(vorrq_u8(vshrq_n_u8(q6bits.val[0], 4), q6h.val[2])), m32s); - q6bytes.val[3] = vsubq_s8(vreinterpretq_s8_u8(vorrq_u8(vshrq_n_u8(q6bits.val[1], 4), q6h.val[3])), m32s); - - isum += vaddvq_s32(lm_ggml_vdotq_s32(vzero, q6bytes.val[0], q8bytes.val[0])) * scale[0] + - vaddvq_s32(lm_ggml_vdotq_s32(vzero, q6bytes.val[1], q8bytes.val[1])) * scale[1] + - vaddvq_s32(lm_ggml_vdotq_s32(vzero, q6bytes.val[2], q8bytes.val[2])) * scale[2] + - vaddvq_s32(lm_ggml_vdotq_s32(vzero, q6bytes.val[3], q8bytes.val[3])) * scale[3]; - - sum += isum * d_all * y[i].d; - - } - *s = sum; - -#elif defined __AVX2__ - - const __m256i m4 = _mm256_set1_epi8(0xF); - const __m256i m2 = _mm256_set1_epi8(3); - const __m256i m32s = _mm256_set1_epi8(32); - - __m256 acc = _mm256_setzero_ps(); - - for (int i = 0; i < nb; ++i) { - - const float d = y[i].d * LM_GGML_FP16_TO_FP32(x[i].d); - - const uint8_t * restrict q4 = x[i].ql; - const uint8_t * restrict qh = x[i].qh; - const int8_t * restrict q8 = y[i].qs; - - const __m64 scales_1 = _mm_set1_pi8(x[i].scales[0]); - const __m64 scales_2 = _mm_set1_pi8(x[i].scales[1]); - const __m64 scales_3 = _mm_set1_pi8(x[i].scales[2]); - const __m64 scales_4 = _mm_set1_pi8(x[i].scales[3]); + vuint8m1_t q6a_0 = __riscv_vand_vx_u8m1(q6_0, 0x0F, vl); + vuint8m1_t q6a_1 = __riscv_vand_vx_u8m1(q6_1, 0x0F, vl); + vuint8m1_t q6s_0 = __riscv_vsrl_vx_u8m1(q6_0, 0x04, vl); + vuint8m1_t q6s_1 = __riscv_vsrl_vx_u8m1(q6_1, 0x04, vl); - __m256i sumi = _mm256_setzero_si256(); + vuint8m1_t qh_0 = __riscv_vand_vx_u8m1(qh_x, 0x03, vl); + vuint8m1_t qh_1 = __riscv_vand_vx_u8m1(__riscv_vsrl_vx_u8m1(qh_x, 0x2, vl), 0x03 , vl); + vuint8m1_t qh_2 = __riscv_vand_vx_u8m1(__riscv_vsrl_vx_u8m1(qh_x, 0x4, vl), 0x03 , vl); + vuint8m1_t qh_3 = __riscv_vand_vx_u8m1(__riscv_vsrl_vx_u8m1(qh_x, 0x6, vl), 0x03 , vl); - const __m128i scale_0 = _mm_set_epi64(scales_2, 
scales_1); - const __m128i scale_1 = _mm_set_epi64(scales_4, scales_3); + vuint8m1_t qhi_0 = __riscv_vor_vv_u8m1(q6a_0, __riscv_vsll_vx_u8m1(qh_0, 0x04, vl), vl); + vuint8m1_t qhi_1 = __riscv_vor_vv_u8m1(q6a_1, __riscv_vsll_vx_u8m1(qh_1, 0x04, vl), vl); + vuint8m1_t qhi_2 = __riscv_vor_vv_u8m1(q6s_0, __riscv_vsll_vx_u8m1(qh_2, 0x04, vl), vl); + vuint8m1_t qhi_3 = __riscv_vor_vv_u8m1(q6s_1, __riscv_vsll_vx_u8m1(qh_3, 0x04, vl), vl); - const __m256i q4bits1 = _mm256_loadu_si256((const __m256i*)q4); - const __m128i q4bitsH = _mm_loadu_si128((const __m128i*)qh); + vint8m1_t a_0 = __riscv_vsub_vx_i8m1(__riscv_vreinterpret_v_u8m1_i8m1(qhi_0), 32, vl); + vint8m1_t a_1 = __riscv_vsub_vx_i8m1(__riscv_vreinterpret_v_u8m1_i8m1(qhi_1), 32, vl); + vint8m1_t a_2 = __riscv_vsub_vx_i8m1(__riscv_vreinterpret_v_u8m1_i8m1(qhi_2), 32, vl); + vint8m1_t a_3 = __riscv_vsub_vx_i8m1(__riscv_vreinterpret_v_u8m1_i8m1(qhi_3), 32, vl); - const __m256i q4h_0 = _mm256_slli_epi16(_mm256_and_si256(MM256_SET_M128I(_mm_srli_epi16(q4bitsH, 2), q4bitsH), m2), 4); - const __m256i q4h_1 = _mm256_slli_epi16(_mm256_and_si256(MM256_SET_M128I(_mm_srli_epi16(q4bitsH, 6), _mm_srli_epi16(q4bitsH, 4)), m2), 4); + // load Q8 and take product + vint16m2_t va_q_0 = __riscv_vwmul_vv_i16m2(a_0, __riscv_vle8_v_i8m1(q8, vl), vl); + vint16m2_t va_q_1 = __riscv_vwmul_vv_i16m2(a_1, __riscv_vle8_v_i8m1(q8+32, vl), vl); + vint16m2_t va_q_2 = __riscv_vwmul_vv_i16m2(a_2, __riscv_vle8_v_i8m1(q8+64, vl), vl); + vint16m2_t va_q_3 = __riscv_vwmul_vv_i16m2(a_3, __riscv_vle8_v_i8m1(q8+96, vl), vl); - const __m256i q4_0 = _mm256_or_si256(_mm256_and_si256(q4bits1, m4), q4h_0); - const __m256i q4_1 = _mm256_or_si256(_mm256_and_si256(_mm256_srli_epi16(q4bits1, 4), m4), q4h_1); + vl = 16; - const __m256i q8_0 = _mm256_loadu_si256((const __m256i*)(q8+ 0)); - const __m256i q8_1 = _mm256_loadu_si256((const __m256i*)(q8+32)); + vint32m2_t vaux_0 = __riscv_vwmul_vx_i32m2(__riscv_vget_v_i16m2_i16m1(va_q_0, 0), scale[is+0], vl); + vint32m2_t 
vaux_1 = __riscv_vwmul_vx_i32m2(__riscv_vget_v_i16m2_i16m1(va_q_0, 1), scale[is+1], vl); + vint32m2_t vaux_2 = __riscv_vwmul_vx_i32m2(__riscv_vget_v_i16m2_i16m1(va_q_1, 0), scale[is+2], vl); + vint32m2_t vaux_3 = __riscv_vwmul_vx_i32m2(__riscv_vget_v_i16m2_i16m1(va_q_1, 1), scale[is+3], vl); + vint32m2_t vaux_4 = __riscv_vwmul_vx_i32m2(__riscv_vget_v_i16m2_i16m1(va_q_2, 0), scale[is+4], vl); + vint32m2_t vaux_5 = __riscv_vwmul_vx_i32m2(__riscv_vget_v_i16m2_i16m1(va_q_2, 1), scale[is+5], vl); + vint32m2_t vaux_6 = __riscv_vwmul_vx_i32m2(__riscv_vget_v_i16m2_i16m1(va_q_3, 0), scale[is+6], vl); + vint32m2_t vaux_7 = __riscv_vwmul_vx_i32m2(__riscv_vget_v_i16m2_i16m1(va_q_3, 1), scale[is+7], vl); - __m256i q8s_0 = _mm256_maddubs_epi16(m32s, q8_0); - __m256i q8s_1 = _mm256_maddubs_epi16(m32s, q8_1); + vint32m1_t isum0 = __riscv_vredsum_vs_i32m2_i32m1(__riscv_vadd_vv_i32m2(vaux_0, vaux_1, vl), vzero, vl); + vint32m1_t isum1 = __riscv_vredsum_vs_i32m2_i32m1(__riscv_vadd_vv_i32m2(vaux_2, vaux_3, vl), isum0, vl); + vint32m1_t isum2 = __riscv_vredsum_vs_i32m2_i32m1(__riscv_vadd_vv_i32m2(vaux_4, vaux_5, vl), isum1, vl); + vint32m1_t isum3 = __riscv_vredsum_vs_i32m2_i32m1(__riscv_vadd_vv_i32m2(vaux_6, vaux_7, vl), isum2, vl); - __m256i p16_0 = _mm256_maddubs_epi16(q4_0, q8_0); - __m256i p16_1 = _mm256_maddubs_epi16(q4_1, q8_1); + sum_t += __riscv_vmv_x_s_i32m1_i32(isum3); - p16_0 = _mm256_sub_epi16(p16_0, q8s_0); - p16_1 = _mm256_sub_epi16(p16_1, q8s_1); + q6 += 64; qh += 32; q8 += 128; is=8; - p16_0 = _mm256_madd_epi16(_mm256_cvtepi8_epi16(scale_0), p16_0); - p16_1 = _mm256_madd_epi16(_mm256_cvtepi8_epi16(scale_1), p16_1); + } - sumi = _mm256_add_epi32(sumi, _mm256_add_epi32(p16_0, p16_1)); + sumf += d * sum_t; - acc = _mm256_fmadd_ps(_mm256_broadcast_ss(&d), _mm256_cvtepi32_ps(sumi), acc); } - *s = hsum_float_8(acc); - -#elif defined __AVX__ + *s = sumf; - const __m128i m4 = _mm_set1_epi8(0xF); - const __m128i m2 = _mm_set1_epi8(3); - const __m128i m32s = _mm_set1_epi8(32); 
+#elif defined(__POWER9_VECTOR__) + const vector signed char lowMask = vec_splats((signed char)0xF); + const vector unsigned char v2 = vec_splats((unsigned char)0x2); + const vector unsigned char v3 = vec_splats((unsigned char)0x3); + const vector unsigned char v4 = vec_splats((unsigned char)0x4); + const vector unsigned char v6 = vec_splats((unsigned char)0x6); + const vector signed char off = vec_splats((signed char)0x20); - __m256 acc = _mm256_setzero_ps(); + vector float vsumf0 = vec_splats(0.0f); + vector float vsumf1 = vec_splats(0.0f); + vector float vsumf2 = vec_splats(0.0f); + vector float vsumf3 = vec_splats(0.0f); for (int i = 0; i < nb; ++i) { + vector float vxd = vec_splats(LM_GGML_FP16_TO_FP32(x[i].d)); + vector float vyd = vec_splats(y[i].d); + vector float vd = vec_mul(vxd, vyd); + + vector signed int vsumi0 = vec_splats((int32_t)0); + vector signed int vsumi1 = vec_splats((int32_t)0); + vector signed int vsumi2 = vec_splats((int32_t)0); + vector signed int vsumi3 = vec_splats((int32_t)0); + vector signed int vsumi4 = vec_splats((int32_t)0); + vector signed int vsumi5 = vec_splats((int32_t)0); + vector signed int vsumi6 = vec_splats((int32_t)0); + vector signed int vsumi7 = vec_splats((int32_t)0); - const float d = y[i].d * LM_GGML_FP16_TO_FP32(x[i].d); - - const uint8_t * restrict q4 = x[i].ql; + const uint8_t * restrict q6 = x[i].ql; const uint8_t * restrict qh = x[i].qh; + const int8_t * restrict qs = x[i].scales; const int8_t * restrict q8 = y[i].qs; - const __m64 scales_1 = _mm_set1_pi8(x[i].scales[0]); - const __m64 scales_2 = _mm_set1_pi8(x[i].scales[1]); - const __m64 scales_3 = _mm_set1_pi8(x[i].scales[2]); - const __m64 scales_4 = _mm_set1_pi8(x[i].scales[3]); - - __m128i sumi_0 = _mm_setzero_si128(); - __m128i sumi_1 = _mm_setzero_si128(); - - const __m128i scale_0 = _mm_set_epi64(scales_2, scales_1); - const __m128i scale_1 = _mm_set_epi64(scales_4, scales_3); + for (int j = 0; j < QK_K/128; ++j) { + __builtin_prefetch(q6, 0, 0); + 
__builtin_prefetch(qh, 0, 0); + __builtin_prefetch(q8, 0, 0); + + vector signed char qxs0 = (vector signed char)vec_xl( 0, q6); + vector signed char qxs1 = (vector signed char)vec_xl(16, q6); + vector signed char qxs2 = (vector signed char)vec_xl(32, q6); + vector signed char qxs3 = (vector signed char)vec_xl(48, q6); + q6 += 64; + + vector signed char qxs00 = vec_and(qxs0, lowMask); + vector signed char qxs01 = vec_sr(qxs0, v4); + vector signed char qxs10 = vec_and(qxs1, lowMask); + vector signed char qxs11 = vec_sr(qxs1, v4); + vector signed char qxs20 = vec_and(qxs2, lowMask); + vector signed char qxs21 = vec_sr(qxs2, v4); + vector signed char qxs30 = vec_and(qxs3, lowMask); + vector signed char qxs31 = vec_sr(qxs3, v4); + + vector signed char qxhs0 = (vector signed char)vec_xl( 0, qh); + vector signed char qxhs1 = (vector signed char)vec_xl(16, qh); + qh += 32; - const __m256i q4bits1 = _mm256_loadu_si256((const __m256i*)q4); - const __m128i q4bitsH = _mm_loadu_si128((const __m128i*)qh); + vector signed char qxh00 = vec_sl(vec_and((vector signed char)v3, qxhs0), v4); + vector signed char qxh01 = vec_sl(vec_and((vector signed char)v3, vec_sr(qxhs0, v4)), v4); + vector signed char qxh10 = vec_sl(vec_and((vector signed char)v3, qxhs1), v4); + vector signed char qxh11 = vec_sl(vec_and((vector signed char)v3, vec_sr(qxhs1, v4)), v4); + vector signed char qxh20 = vec_sl(vec_and((vector signed char)v3, vec_sr(qxhs0, v2)), v4); + vector signed char qxh21 = vec_sl(vec_and((vector signed char)v3, vec_sr(qxhs0, v6)), v4); + vector signed char qxh30 = vec_sl(vec_and((vector signed char)v3, vec_sr(qxhs1, v2)), v4); + vector signed char qxh31 = vec_sl(vec_and((vector signed char)v3, vec_sr(qxhs1, v6)), v4); + + vector signed char q6x00 = vec_sub(vec_or(qxh00, qxs00), off); + vector signed char q6x01 = vec_sub(vec_or(qxh01, qxs01), off); + vector signed char q6x10 = vec_sub(vec_or(qxh10, qxs10), off); + vector signed char q6x11 = vec_sub(vec_or(qxh11, qxs11), off); + vector 
signed char q6x20 = vec_sub(vec_or(qxh20, qxs20), off); + vector signed char q6x21 = vec_sub(vec_or(qxh21, qxs21), off); + vector signed char q6x30 = vec_sub(vec_or(qxh30, qxs30), off); + vector signed char q6x31 = vec_sub(vec_or(qxh31, qxs31), off); + + vector signed char q8y00 = vec_xl( 0, q8); + vector signed char q8y10 = vec_xl( 16, q8); + vector signed char q8y20 = vec_xl( 32, q8); + vector signed char q8y30 = vec_xl( 48, q8); + vector signed char q8y01 = vec_xl( 64, q8); + vector signed char q8y11 = vec_xl( 80, q8); + vector signed char q8y21 = vec_xl( 96, q8); + vector signed char q8y31 = vec_xl(112, q8); + q8 += 128; + + vector signed short qv00 = vec_add(vec_mule(q6x00, q8y00), vec_mulo(q6x00, q8y00)); + vector signed short qv10 = vec_add(vec_mule(q6x10, q8y10), vec_mulo(q6x10, q8y10)); + vector signed short qv20 = vec_add(vec_mule(q6x20, q8y20), vec_mulo(q6x20, q8y20)); + vector signed short qv30 = vec_add(vec_mule(q6x30, q8y30), vec_mulo(q6x30, q8y30)); + vector signed short qv01 = vec_add(vec_mule(q6x01, q8y01), vec_mulo(q6x01, q8y01)); + vector signed short qv11 = vec_add(vec_mule(q6x11, q8y11), vec_mulo(q6x11, q8y11)); + vector signed short qv21 = vec_add(vec_mule(q6x21, q8y21), vec_mulo(q6x21, q8y21)); + vector signed short qv31 = vec_add(vec_mule(q6x31, q8y31), vec_mulo(q6x31, q8y31)); + + vector signed short vscales = vec_unpackh(vec_xl_len(qs, 8)); + qs += 8; - const __m128i q4h_0 = _mm_slli_epi16(_mm_and_si128(q4bitsH, m2), 4); - const __m128i q4h_1 = _mm_slli_epi16(_mm_and_si128(_mm_srli_epi16(q4bitsH, 2), m2), 4); - const __m128i q4h_2 = _mm_slli_epi16(_mm_and_si128(_mm_srli_epi16(q4bitsH, 4), m2), 4); - const __m128i q4h_3 = _mm_slli_epi16(_mm_and_si128(_mm_srli_epi16(q4bitsH, 6), m2), 4); + vector signed short vs0 = vec_splat(vscales, 0); + vector signed short vs1 = vec_splat(vscales, 1); + vector signed short vs2 = vec_splat(vscales, 2); + vector signed short vs3 = vec_splat(vscales, 3); + vector signed short vs4 = vec_splat(vscales, 4); + 
vector signed short vs5 = vec_splat(vscales, 5); + vector signed short vs6 = vec_splat(vscales, 6); + vector signed short vs7 = vec_splat(vscales, 7); - const __m128i q4_0 = _mm_or_si128(_mm_and_si128(_mm256_extractf128_si256(q4bits1, 0), m4), q4h_0); - const __m128i q4_1 = _mm_or_si128(_mm_and_si128(_mm256_extractf128_si256(q4bits1, 1), m4), q4h_1); - const __m128i q4_2 = _mm_or_si128(_mm_and_si128(_mm_srli_epi16(_mm256_extractf128_si256(q4bits1, 0), 4), m4), q4h_2); - const __m128i q4_3 = _mm_or_si128(_mm_and_si128(_mm_srli_epi16(_mm256_extractf128_si256(q4bits1, 1), 4), m4), q4h_3); + vsumi0 = vec_add(vec_mule(qv00, vs0), vsumi0); + vsumi1 = vec_add(vec_mulo(qv00, vs0), vsumi1); + vsumi2 = vec_add(vec_mule(qv01, vs4), vsumi2); + vsumi3 = vec_add(vec_mulo(qv01, vs4), vsumi3); + vsumi4 = vec_add(vec_mule(qv10, vs1), vsumi4); + vsumi5 = vec_add(vec_mulo(qv10, vs1), vsumi5); + vsumi6 = vec_add(vec_mule(qv11, vs5), vsumi6); + vsumi7 = vec_add(vec_mulo(qv11, vs5), vsumi7); - const __m256i q8_0 = _mm256_loadu_si256((const __m256i*)(q8+ 0)); - const __m256i q8_1 = _mm256_loadu_si256((const __m256i*)(q8+32)); + vsumi0 = vec_add(vec_mule(qv20, vs2), vsumi0); + vsumi1 = vec_add(vec_mulo(qv20, vs2), vsumi1); + vsumi2 = vec_add(vec_mule(qv21, vs6), vsumi2); + vsumi3 = vec_add(vec_mulo(qv21, vs6), vsumi3); + vsumi4 = vec_add(vec_mule(qv30, vs3), vsumi4); + vsumi5 = vec_add(vec_mulo(qv30, vs3), vsumi5); + vsumi6 = vec_add(vec_mule(qv31, vs7), vsumi6); + vsumi7 = vec_add(vec_mulo(qv31, vs7), vsumi7); + } - __m128i q8s_0 = _mm_maddubs_epi16(m32s, _mm256_extractf128_si256(q8_0, 0)); - __m128i q8s_1 = _mm_maddubs_epi16(m32s, _mm256_extractf128_si256(q8_0, 1)); - __m128i q8s_2 = _mm_maddubs_epi16(m32s, _mm256_extractf128_si256(q8_1, 0)); - __m128i q8s_3 = _mm_maddubs_epi16(m32s, _mm256_extractf128_si256(q8_1, 1)); + vsumi0 = vec_add(vsumi0, vsumi4); + vsumi1 = vec_add(vsumi1, vsumi5); + vsumi2 = vec_add(vsumi2, vsumi6); + vsumi3 = vec_add(vsumi3, vsumi7); - __m128i p16_0 = 
_mm_maddubs_epi16(q4_0, _mm256_extractf128_si256(q8_0, 0)); - __m128i p16_1 = _mm_maddubs_epi16(q4_1, _mm256_extractf128_si256(q8_0, 1)); - __m128i p16_2 = _mm_maddubs_epi16(q4_2, _mm256_extractf128_si256(q8_1, 0)); - __m128i p16_3 = _mm_maddubs_epi16(q4_3, _mm256_extractf128_si256(q8_1, 1)); + vsumf0 = vec_madd(vec_ctf(vsumi0, 0), vd, vsumf0); + vsumf1 = vec_madd(vec_ctf(vsumi1, 0), vd, vsumf1); + vsumf2 = vec_madd(vec_ctf(vsumi2, 0), vd, vsumf2); + vsumf3 = vec_madd(vec_ctf(vsumi3, 0), vd, vsumf3); + } - p16_0 = _mm_sub_epi16(p16_0, q8s_0); - p16_1 = _mm_sub_epi16(p16_1, q8s_1); - p16_2 = _mm_sub_epi16(p16_2, q8s_2); - p16_3 = _mm_sub_epi16(p16_3, q8s_3); + vsumf0 = vec_add(vsumf0, vsumf2); + vsumf1 = vec_add(vsumf1, vsumf3); - p16_0 = _mm_madd_epi16(_mm_cvtepi8_epi16(scale_0), p16_0); - p16_1 = _mm_madd_epi16(_mm_cvtepi8_epi16(_mm_unpackhi_epi64(scale_0, scale_0)), p16_1); - p16_2 = _mm_madd_epi16(_mm_cvtepi8_epi16(scale_1), p16_2); - p16_3 = _mm_madd_epi16(_mm_cvtepi8_epi16(_mm_unpackhi_epi64(scale_1, scale_1)), p16_3); + vsumf0 = vec_add(vsumf0, vsumf1); - sumi_0 = _mm_add_epi32(sumi_0, _mm_add_epi32(p16_0, p16_2)); - sumi_1 = _mm_add_epi32(sumi_1, _mm_add_epi32(p16_1, p16_3)); + vsumf0 = vec_add(vsumf0, vec_sld(vsumf0, vsumf0, 4)); + vsumf0 = vec_add(vsumf0, vec_sld(vsumf0, vsumf0, 8)); - acc = _mm256_add_ps(_mm256_mul_ps(_mm256_broadcast_ss(&d), _mm256_cvtepi32_ps(MM256_SET_M128I(sumi_1, sumi_0))), acc); - } + *s = vec_extract(vsumf0, 0); - *s = hsum_float_8(acc); +#elif defined __loongarch_asx -#elif defined __riscv_v_intrinsic + const __m256i m4 = __lasx_xvreplgr2vr_b(0xF); + const __m256i m2 = __lasx_xvreplgr2vr_b(3); + const __m256i m32s = __lasx_xvreplgr2vr_b(32); - float sumf = 0; + __m256 acc = (__m256)__lasx_xvldi(0); for (int i = 0; i < nb; ++i) { - const float d_all = LM_GGML_FP16_TO_FP32(x[i].d); + const float d = y[i].d * LM_GGML_FP16_TO_FP32(x[i].d); - const uint8_t * restrict q6 = x[i].ql; + const uint8_t * restrict q4 = x[i].ql; const uint8_t 
* restrict qh = x[i].qh; const int8_t * restrict q8 = y[i].qs; - const int8_t * restrict scale = x[i].scales; + const __m128i scales = __lsx_vld((const __m128i*)x[i].scales, 0); - int32_t isum = 0; + __m256i sumi = __lasx_xvldi(0); - size_t vl = 16; + int is = 0; - vint32m1_t vzero = __riscv_vmv_v_x_i32m1(0, 1); + for (int j = 0; j < QK_K/128; ++j) { - // load Q6 - vuint8mf2_t q6_0 = __riscv_vle8_v_u8mf2(q6, vl); - vuint8mf2_t q6_1 = __riscv_vle8_v_u8mf2(q6+16, vl); + const __m128i scale_0 = lsx_shuffle_b(scales, get_scale_shuffle(is + 0)); + const __m128i scale_1 = lsx_shuffle_b(scales, get_scale_shuffle(is + 1)); + const __m128i scale_2 = lsx_shuffle_b(scales, get_scale_shuffle(is + 2)); + const __m128i scale_3 = lsx_shuffle_b(scales, get_scale_shuffle(is + 3)); + is += 4; - // load qh - vuint8mf2_t qh_x = __riscv_vle8_v_u8mf2(qh, vl); + const __m256i q4bits1 = __lasx_xvld((const __m256i*)q4, 0); q4 += 32; + const __m256i q4bits2 = __lasx_xvld((const __m256i*)q4, 0); q4 += 32; + const __m256i q4bitsH = __lasx_xvld((const __m256i*)qh, 0); qh += 32; - vuint8mf2_t qh0 = __riscv_vsll_vx_u8mf2(__riscv_vand_vx_u8mf2(qh_x, 0x3, vl), 0x4, vl); - qh_x = __riscv_vsrl_vx_u8mf2(qh_x, 0x2, vl); - vuint8mf2_t qh1 = __riscv_vsll_vx_u8mf2(__riscv_vand_vx_u8mf2(qh_x, 0x3, vl), 0x4, vl); - qh_x = __riscv_vsrl_vx_u8mf2(qh_x, 0x2, vl); - vuint8mf2_t qh2 = __riscv_vsll_vx_u8mf2(__riscv_vand_vx_u8mf2(qh_x, 0x3, vl), 0x4, vl); - qh_x = __riscv_vsrl_vx_u8mf2(qh_x, 0x2, vl); - vuint8mf2_t qh3 = __riscv_vsll_vx_u8mf2(__riscv_vand_vx_u8mf2(qh_x, 0x3, vl), 0x4, vl); + const __m256i q4h_0 = __lasx_xvslli_h(__lasx_xvand_v(q4bitsH, m2), 4); + const __m256i q4h_1 = __lasx_xvslli_h(__lasx_xvand_v(__lasx_xvsrli_h(q4bitsH, 2), m2), 4); + const __m256i q4h_2 = __lasx_xvslli_h(__lasx_xvand_v(__lasx_xvsrli_h(q4bitsH, 4), m2), 4); + const __m256i q4h_3 = __lasx_xvslli_h(__lasx_xvand_v(__lasx_xvsrli_h(q4bitsH, 6), m2), 4); - vuint8mf2_t q6h_0 = __riscv_vor_vv_u8mf2(__riscv_vand_vx_u8mf2(q6_0, 0xF, vl), 
qh0, vl); - vuint8mf2_t q6h_1 = __riscv_vor_vv_u8mf2(__riscv_vand_vx_u8mf2(q6_1, 0xF, vl), qh1, vl); - vuint8mf2_t q6h_2 = __riscv_vor_vv_u8mf2(__riscv_vsrl_vx_u8mf2(q6_0, 0x4, vl), qh2, vl); - vuint8mf2_t q6h_3 = __riscv_vor_vv_u8mf2(__riscv_vsrl_vx_u8mf2(q6_1, 0x4, vl), qh3, vl); + const __m256i q4_0 = __lasx_xvor_v(__lasx_xvand_v(q4bits1, m4), q4h_0); + const __m256i q4_1 = __lasx_xvor_v(__lasx_xvand_v(q4bits2, m4), q4h_1); + const __m256i q4_2 = __lasx_xvor_v(__lasx_xvand_v(__lasx_xvsrli_h(q4bits1, 4), m4), q4h_2); + const __m256i q4_3 = __lasx_xvor_v(__lasx_xvand_v(__lasx_xvsrli_h(q4bits2, 4), m4), q4h_3); - vint8mf2_t q6v_0 = __riscv_vsub_vx_i8mf2(__riscv_vreinterpret_v_u8mf2_i8mf2(q6h_0), 32, vl); - vint8mf2_t q6v_1 = __riscv_vsub_vx_i8mf2(__riscv_vreinterpret_v_u8mf2_i8mf2(q6h_1), 32, vl); - vint8mf2_t q6v_2 = __riscv_vsub_vx_i8mf2(__riscv_vreinterpret_v_u8mf2_i8mf2(q6h_2), 32, vl); - vint8mf2_t q6v_3 = __riscv_vsub_vx_i8mf2(__riscv_vreinterpret_v_u8mf2_i8mf2(q6h_3), 32, vl); + const __m256i q8_0 = __lasx_xvld((const __m256i*)q8, 0); q8 += 32; + const __m256i q8_1 = __lasx_xvld((const __m256i*)q8, 0); q8 += 32; + const __m256i q8_2 = __lasx_xvld((const __m256i*)q8, 0); q8 += 32; + const __m256i q8_3 = __lasx_xvld((const __m256i*)q8, 0); q8 += 32; - // load Q8 and take product - vint16m1_t p0 = __riscv_vwmul_vv_i16m1(q6v_0, __riscv_vle8_v_i8mf2(q8, vl), vl); - vint16m1_t p1 = __riscv_vwmul_vv_i16m1(q6v_1, __riscv_vle8_v_i8mf2(q8+16, vl), vl); - vint16m1_t p2 = __riscv_vwmul_vv_i16m1(q6v_2, __riscv_vle8_v_i8mf2(q8+32, vl), vl); - vint16m1_t p3 = __riscv_vwmul_vv_i16m1(q6v_3, __riscv_vle8_v_i8mf2(q8+48, vl), vl); + __m256i q8s_0 = lasx_maddubs_h(m32s, q8_0); + __m256i q8s_1 = lasx_maddubs_h(m32s, q8_1); + __m256i q8s_2 = lasx_maddubs_h(m32s, q8_2); + __m256i q8s_3 = lasx_maddubs_h(m32s, q8_3); - vint32m1_t vs_0 = __riscv_vwredsum_vs_i16m1_i32m1(p0, vzero, vl); - vint32m1_t vs_1 = __riscv_vwredsum_vs_i16m1_i32m1(p1, vzero, vl); - vint32m1_t vs_2 = 
__riscv_vwredsum_vs_i16m1_i32m1(p2, vzero, vl); - vint32m1_t vs_3 = __riscv_vwredsum_vs_i16m1_i32m1(p3, vzero, vl); + __m256i p16_0 = lasx_maddubs_h(q4_0, q8_0); + __m256i p16_1 = lasx_maddubs_h(q4_1, q8_1); + __m256i p16_2 = lasx_maddubs_h(q4_2, q8_2); + __m256i p16_3 = lasx_maddubs_h(q4_3, q8_3); - isum += __riscv_vmv_x_s_i32m1_i32(vs_0) * scale[0]; - isum += __riscv_vmv_x_s_i32m1_i32(vs_1) * scale[1]; - isum += __riscv_vmv_x_s_i32m1_i32(vs_2) * scale[2]; - isum += __riscv_vmv_x_s_i32m1_i32(vs_3) * scale[3]; + p16_0 = __lasx_xvsub_h(p16_0, q8s_0); + p16_1 = __lasx_xvsub_h(p16_1, q8s_1); + p16_2 = __lasx_xvsub_h(p16_2, q8s_2); + p16_3 = __lasx_xvsub_h(p16_3, q8s_3); - sumf += isum * d_all * y[i].d; + p16_0 = lasx_madd_h(lasx_ext8_16(scale_0), p16_0); + p16_1 = lasx_madd_h(lasx_ext8_16(scale_1), p16_1); + p16_2 = lasx_madd_h(lasx_ext8_16(scale_2), p16_2); + p16_3 = lasx_madd_h(lasx_ext8_16(scale_3), p16_3); + sumi = __lasx_xvadd_w(sumi, __lasx_xvadd_w(p16_0, p16_1)); + sumi = __lasx_xvadd_w(sumi, __lasx_xvadd_w(p16_2, p16_3)); + } + + acc = __lasx_xvfmadd_s((__m256)__lasx_xvreplfr2vr_s(d), __lasx_xvffint_s_w(sumi), acc); } - *s = sumf; + *s = hsum_float_8(acc); #else @@ -8268,12 +8761,18 @@ void lm_ggml_vec_dot_q6_K_q8_K(int n, float * restrict s, size_t bs, const void const int8_t * restrict q8 = y[i].qs; memset(aux32, 0, 8*sizeof(int32_t)); int8_t * restrict a = aux8; - for (int l = 0; l < 16; ++l) { - a[l+ 0] = (int8_t)((q4[l+ 0] & 0xF) | (((qh[l] >> 0) & 3) << 4)) - 32; - a[l+16] = (int8_t)((q4[l+16] & 0xF) | (((qh[l] >> 2) & 3) << 4)) - 32; - a[l+32] = (int8_t)((q4[l+ 0] >> 4) | (((qh[l] >> 4) & 3) << 4)) - 32; - a[l+48] = (int8_t)((q4[l+16] >> 4) | (((qh[l] >> 6) & 3) << 4)) - 32; + for (int j = 0; j < QK_K; j += 128) { + for (int l = 0; l < 32; ++l) { + a[l + 0] = (int8_t)((q4[l + 0] & 0xF) | (((qh[l] >> 0) & 3) << 4)) - 32; + a[l + 32] = (int8_t)((q4[l + 32] & 0xF) | (((qh[l] >> 2) & 3) << 4)) - 32; + a[l + 64] = (int8_t)((q4[l + 0] >> 4) | (((qh[l] >> 4) & 
3) << 4)) - 32; + a[l + 96] = (int8_t)((q4[l + 32] >> 4) | (((qh[l] >> 6) & 3) << 4)) - 32; + } + a += 128; + q4 += 64; + qh += 32; } + a = aux8; int is = 0; for (int j = 0; j < QK_K/16; ++j) { int scale = x[i].scales[is++]; @@ -8292,9 +8791,7 @@ void lm_ggml_vec_dot_q6_K_q8_K(int n, float * restrict s, size_t bs, const void #endif } -#endif - -#if defined (__AVX2__) || defined (__ARM_NEON) +#if defined (__AVX2__) || defined (__ARM_NEON) || defined (__POWER9_VECTOR__) || defined(__loongarch_asx) static const int8_t keven_signs_q2xs[1024] = { 1, 1, 1, 1, 1, 1, 1, 1, -1, 1, 1, 1, 1, 1, 1, -1, 1, -1, 1, 1, 1, 1, 1, -1, -1, -1, 1, 1, 1, 1, 1, 1, 1, 1, -1, 1, 1, 1, 1, -1, -1, 1, -1, 1, 1, 1, 1, 1, 1, -1, -1, 1, 1, 1, 1, 1, -1, -1, -1, 1, 1, 1, 1, -1, @@ -8427,6 +8924,146 @@ void lm_ggml_vec_dot_iq2_xxs_q8_K(int n, float * restrict s, size_t bs, const vo *s = 0.125f * hsum_float_8(accumf); +#elif defined(__POWER9_VECTOR__) + vector float vsumf0 = vec_splats(0.0f); + vector float vsumf1 = vec_splats(0.0f); + vector float vsumf2 = vec_splats(0.0f); + vector float vsumf3 = vec_splats(0.0f); + + const uint64_t * signs64 = (const uint64_t *)keven_signs_q2xs; + + for (int i = 0; i < nb; ++i) { + vector float vxd = vec_splats(LM_GGML_FP16_TO_FP32(x[i].d)); + vector float vyd = vec_splats(y[i].d); + vector float vd = vec_mul(vxd, vyd); + + vector signed int vsumi0 = vec_splats((int32_t)0); + vector signed int vsumi1 = vec_splats((int32_t)0); + vector signed int vsumi2 = vec_splats((int32_t)0); + vector signed int vsumi3 = vec_splats((int32_t)0); + vector signed int vsumi4 = vec_splats((int32_t)0); + vector signed int vsumi5 = vec_splats((int32_t)0); + vector signed int vsumi6 = vec_splats((int32_t)0); + vector signed int vsumi7 = vec_splats((int32_t)0); + + const uint16_t * restrict q2 = x[i].qs; + const int8_t * restrict q8 = y[i].qs; + + for (int j = 0; j < QK_K/32; j += 2) { + __builtin_prefetch(q2, 0, 1); + __builtin_prefetch(q8, 0, 1); + + uint32_t aux32[4]; + const uint8_t 
* aux8 = (const uint8_t *)aux32; + + memcpy(aux32, q2, 4*sizeof(uint32_t)); + q2 += 8; + + vector signed long long aux64x2_0 = {*(const int64_t *)(iq2xxs_grid + aux8[ 0]), *(const int64_t *)(iq2xxs_grid + aux8[ 1])}; + vector signed long long aux64x2_1 = {*(const int64_t *)(iq2xxs_grid + aux8[ 2]), *(const int64_t *)(iq2xxs_grid + aux8[ 3])}; + vector signed long long aux64x2_2 = {*(const int64_t *)(iq2xxs_grid + aux8[ 8]), *(const int64_t *)(iq2xxs_grid + aux8[ 9])}; + vector signed long long aux64x2_3 = {*(const int64_t *)(iq2xxs_grid + aux8[10]), *(const int64_t *)(iq2xxs_grid + aux8[11])}; + + vector signed long long vsigns0 = {*(const int64_t *)(signs64 + ((aux32[1] >> 0) & 127)), *(const int64_t *)(signs64 + ((aux32[1] >> 7) & 127))}; + vector signed long long vsigns1 = {*(const int64_t *)(signs64 + ((aux32[1] >> 14) & 127)), *(const int64_t *)(signs64 + ((aux32[1] >> 21) & 127))}; + vector signed long long vsigns2 = {*(const int64_t *)(signs64 + ((aux32[3] >> 0) & 127)), *(const int64_t *)(signs64 + ((aux32[3] >> 7) & 127))}; + vector signed long long vsigns3 = {*(const int64_t *)(signs64 + ((aux32[3] >> 14) & 127)), *(const int64_t *)(signs64 + ((aux32[3] >> 21) & 127))}; + + vector signed char q2x0 = (vector signed char)vec_mul((vector signed char)vsigns0, (vector signed char)aux64x2_0); + vector signed char q2x1 = (vector signed char)vec_mul((vector signed char)vsigns1, (vector signed char)aux64x2_1); + vector signed char q2x2 = (vector signed char)vec_mul((vector signed char)vsigns2, (vector signed char)aux64x2_2); + vector signed char q2x3 = (vector signed char)vec_mul((vector signed char)vsigns3, (vector signed char)aux64x2_3); + + vector signed char q8y0 = vec_xl( 0, q8); + vector signed char q8y1 = vec_xl(16, q8); + vector signed char q8y2 = vec_xl(32, q8); + vector signed char q8y3 = vec_xl(48, q8); + q8 += 64; + + vector signed short qv0 = vec_add(vec_mule(q2x0, q8y0), vec_mulo(q2x0, q8y0)); + vector signed short qv1 = vec_add(vec_mule(q2x1, q8y1), 
vec_mulo(q2x1, q8y1)); + vector signed short qv2 = vec_add(vec_mule(q2x2, q8y2), vec_mulo(q2x2, q8y2)); + vector signed short qv3 = vec_add(vec_mule(q2x3, q8y3), vec_mulo(q2x3, q8y3)); + + const uint16_t ls0 = aux32[1] >> 28; + const uint16_t ls1 = aux32[3] >> 28; + + vector signed short vscales01 = vec_splats((int16_t)(2*ls0+1)); + vector signed short vscales23 = vec_splats((int16_t)(2*ls1+1)); + + vsumi0 = vec_add(vec_mule(qv0, vscales01), vsumi0); + vsumi1 = vec_add(vec_mule(qv1, vscales01), vsumi1); + vsumi2 = vec_add(vec_mule(qv2, vscales23), vsumi2); + vsumi3 = vec_add(vec_mule(qv3, vscales23), vsumi3); + vsumi4 = vec_add(vec_mulo(qv0, vscales01), vsumi4); + vsumi5 = vec_add(vec_mulo(qv1, vscales01), vsumi5); + vsumi6 = vec_add(vec_mulo(qv2, vscales23), vsumi6); + vsumi7 = vec_add(vec_mulo(qv3, vscales23), vsumi7); + } + + vsumi0 = vec_add(vsumi0, vsumi4); + vsumi1 = vec_add(vsumi1, vsumi5); + vsumi2 = vec_add(vsumi2, vsumi6); + vsumi3 = vec_add(vsumi3, vsumi7); + + vsumf0 = vec_madd(vec_ctf(vsumi0, 0), vd, vsumf0); + vsumf1 = vec_madd(vec_ctf(vsumi1, 0), vd, vsumf1); + vsumf2 = vec_madd(vec_ctf(vsumi2, 0), vd, vsumf2); + vsumf3 = vec_madd(vec_ctf(vsumi3, 0), vd, vsumf3); + } + + vsumf0 = vec_add(vsumf0, vsumf2); + vsumf1 = vec_add(vsumf1, vsumf3); + + vsumf0 = vec_add(vsumf0, vsumf1); + + vsumf0 = vec_add(vsumf0, vec_sld(vsumf0, vsumf0, 4)); + vsumf0 = vec_add(vsumf0, vec_sld(vsumf0, vsumf0, 8)); + + *s = 0.125f * vec_extract(vsumf0, 0); + +#elif defined(__loongarch_asx) + + const uint64_t * signs64 = (const uint64_t *)keven_signs_q2xs; + + uint32_t aux32[4]; + const uint8_t * aux8 = (const uint8_t *)aux32; + + __m256 accumf = (__m256)__lasx_xvldi(0); + for (int i = 0; i < nb; ++i) { + const float d = LM_GGML_FP16_TO_FP32(x[i].d) * y[i].d; + const uint16_t * restrict q2 = x[i].qs; + const int8_t * restrict q8 = y[i].qs; + __m256i sumi1 = __lasx_xvldi(0); + __m256i sumi2 = __lasx_xvldi(0); + for (int ib32 = 0; ib32 < QK_K/32; ib32 += 2) { + const __m256i q8_1 
= __lasx_xvld((const __m256i *)q8, 0); q8 += 32; + const __m256i q8_2 = __lasx_xvld((const __m256i *)q8, 0); q8 += 32; + memcpy(aux32, q2, 4*sizeof(uint32_t)); q2 += 8; + + const __m256i q2_1 = lasx_set_d(iq2xxs_grid[aux8[ 3]], iq2xxs_grid[aux8[ 2]], iq2xxs_grid[aux8[1]], iq2xxs_grid[aux8[0]]); + const __m256i q2_2 = lasx_set_d(iq2xxs_grid[aux8[11]], iq2xxs_grid[aux8[10]], iq2xxs_grid[aux8[9]], iq2xxs_grid[aux8[8]]); + const __m256i s2_1 = lasx_set_d(signs64[(aux32[1] >> 21) & 127], signs64[(aux32[1] >> 14) & 127], + signs64[(aux32[1] >> 7) & 127], signs64[(aux32[1] >> 0) & 127]); + const __m256i s2_2 = lasx_set_d(signs64[(aux32[3] >> 21) & 127], signs64[(aux32[3] >> 14) & 127], + signs64[(aux32[3] >> 7) & 127], signs64[(aux32[3] >> 0) & 127]); + const __m256i q8s_1 = __lasx_xvsigncov_b(s2_1, q8_1); + const __m256i q8s_2 = __lasx_xvsigncov_b(s2_2, q8_2); + const __m256i dot1 = lasx_maddubs_h(q2_1, q8s_1); + const __m256i dot2 = lasx_maddubs_h(q2_2, q8s_2); + const uint16_t ls1 = aux32[1] >> 28; + const uint16_t ls2 = aux32[3] >> 28; + const __m256i p1 = lasx_madd_h(dot1, __lasx_xvreplgr2vr_h(2*ls1+1)); + const __m256i p2 = lasx_madd_h(dot2, __lasx_xvreplgr2vr_h(2*ls2+1)); + sumi1 = __lasx_xvadd_w(sumi1, p1); + sumi2 = __lasx_xvadd_w(sumi2, p2); + } + + accumf = __lasx_xvfmadd_s(__lasx_xvreplfr2vr_s(d), __lasx_xvffint_s_w(__lasx_xvadd_w(sumi1, sumi2)), accumf); + } + + *s = 0.125f * hsum_float_8(accumf); + #else uint32_t aux32[2]; @@ -8545,64 +9182,6 @@ void lm_ggml_vec_dot_iq2_xs_q8_K(int n, float * restrict s, size_t bs, const voi const __m256i block_sign_shuffle_1 = _mm256_loadu_si256((const __m256i*)block_sign_shuffle_mask_1); const __m256i block_sign_shuffle_2 = _mm256_loadu_si256((const __m256i*)block_sign_shuffle_mask_2); -#if QK_K == 64 - static const uint8_t k_bit_helper[16] = { - 0x00, 0x80, 0x80, 0x00, 0x80, 0x00, 0x00, 0x80, 0x80, 0x00, 0x00, 0x80, 0x00, 0x80, 0x80, 0x00, - }; - const __m128i bit_helper = _mm_loadu_si128((const __m128i*)k_bit_helper); - 
const __m128i m511 = _mm_set1_epi16(511); - typedef union { - __m128i vec_index; - uint16_t index[8]; - } index_t; - - index_t idx; - __m256 accumf = _mm256_setzero_ps(); - for (int i = 0; i < nb; ++i) { - const float d = LM_GGML_FP16_TO_FP32(x[i].d) * y[i].d; - const __m128i q2_data = _mm_loadu_si128((const __m128i*)x[i].qs); - idx.vec_index = _mm_and_si128(q2_data, m511); - - const __m128i partial_sign_bits = _mm_srli_epi16(q2_data, 9); - const __m128i partial_sign_bits_upper = _mm_srli_epi16(q2_data, 13); - const __m128i partial_sign_bits_for_counting = _mm_xor_si128(partial_sign_bits, partial_sign_bits_upper); - - const __m128i odd_bits = _mm_shuffle_epi8(bit_helper, partial_sign_bits_for_counting); - const __m128i full_sign_bits = _mm_or_si128(partial_sign_bits, odd_bits); - const __m256i full_signs = MM256_SET_M128I(full_sign_bits, full_sign_bits); - - const __m256i q8_1 = _mm256_loadu_si256((const __m256i *)y[i].qs); - const __m256i q8_2 = _mm256_loadu_si256((const __m256i *)(y[i].qs+32)); - - const __m256i q2_1 = _mm256_set_epi64x(iq2xs_grid[idx.index[3]], iq2xs_grid[idx.index[2]], - iq2xs_grid[idx.index[1]], iq2xs_grid[idx.index[0]]); - const __m256i q2_2 = _mm256_set_epi64x(iq2xs_grid[idx.index[7]], iq2xs_grid[idx.index[6]], - iq2xs_grid[idx.index[5]], iq2xs_grid[idx.index[4]]); - - __m256i signs; - signs = _mm256_shuffle_epi8(full_signs, block_sign_shuffle_1); - signs = _mm256_cmpeq_epi8(_mm256_and_si256(signs, bit_selector_mask), bit_selector_mask); - const __m256i q8s_1 = _mm256_sign_epi8(q8_1, _mm256_or_si256(signs, mone)); - - signs = _mm256_shuffle_epi8(full_signs, block_sign_shuffle_2); - signs = _mm256_cmpeq_epi8(_mm256_and_si256(signs, bit_selector_mask), bit_selector_mask); - const __m256i q8s_2 = _mm256_sign_epi8(q8_2, _mm256_or_si256(signs, mone)); - - const __m256i dot1 = _mm256_maddubs_epi16(q2_1, q8s_1); - const __m256i dot2 = _mm256_maddubs_epi16(q2_2, q8s_2); - - const __m256i sc1 = MM256_SET_M128I(_mm_set1_epi16(2*(x[i].scales[0] >> 
4)+1), _mm_set1_epi16(2*(x[i].scales[0] & 0xf)+1)); - const __m256i sc2 = MM256_SET_M128I(_mm_set1_epi16(2*(x[i].scales[1] >> 4)+1), _mm_set1_epi16(2*(x[i].scales[1] & 0xf)+1)); - - const __m256i sum = _mm256_add_epi32(_mm256_madd_epi16(sc1, dot1), _mm256_madd_epi16(sc2, dot2)); - - accumf = _mm256_fmadd_ps(_mm256_set1_ps(d), _mm256_cvtepi32_ps(sum), accumf); - - } - - *s = 0.125f * hsum_float_8(accumf); -#else - static const uint8_t k_bit_helper[32] = { 0x00, 0x80, 0x80, 0x00, 0x80, 0x00, 0x00, 0x80, 0x80, 0x00, 0x00, 0x80, 0x00, 0x80, 0x80, 0x00, 0x00, 0x80, 0x80, 0x00, 0x80, 0x00, 0x00, 0x80, 0x80, 0x00, 0x00, 0x80, 0x00, 0x80, 0x80, 0x00, @@ -8684,24 +9263,237 @@ void lm_ggml_vec_dot_iq2_xs_q8_K(int n, float * restrict s, size_t bs, const voi const __m256i dot3 = _mm256_maddubs_epi16(q2_3, q8s_3); const __m256i dot4 = _mm256_maddubs_epi16(q2_4, q8s_4); - const __m256i sc1 = _mm256_cvtepi8_epi16(_mm_shuffle_epi8(scales, get_scale_shuffle(ib32+0))); - const __m256i sc2 = _mm256_cvtepi8_epi16(_mm_shuffle_epi8(scales, get_scale_shuffle(ib32+1))); - const __m256i sc3 = _mm256_cvtepi8_epi16(_mm_shuffle_epi8(scales, get_scale_shuffle(ib32+2))); - const __m256i sc4 = _mm256_cvtepi8_epi16(_mm_shuffle_epi8(scales, get_scale_shuffle(ib32+3))); + const __m256i sc1 = _mm256_cvtepi8_epi16(_mm_shuffle_epi8(scales, get_scale_shuffle(ib32+0))); + const __m256i sc2 = _mm256_cvtepi8_epi16(_mm_shuffle_epi8(scales, get_scale_shuffle(ib32+1))); + const __m256i sc3 = _mm256_cvtepi8_epi16(_mm_shuffle_epi8(scales, get_scale_shuffle(ib32+2))); + const __m256i sc4 = _mm256_cvtepi8_epi16(_mm_shuffle_epi8(scales, get_scale_shuffle(ib32+3))); + + sumi1 = _mm256_add_epi32(sumi1, _mm256_madd_epi16(dot1, sc1)); + sumi2 = _mm256_add_epi32(sumi2, _mm256_madd_epi16(dot2, sc2)); + sumi1 = _mm256_add_epi32(sumi1, _mm256_madd_epi16(dot3, sc3)); + sumi2 = _mm256_add_epi32(sumi2, _mm256_madd_epi16(dot4, sc4)); + } + + accumf = _mm256_fmadd_ps(_mm256_set1_ps(d), 
_mm256_cvtepi32_ps(_mm256_add_epi32(sumi1, sumi2)), accumf); + + } + + *s = 0.125f * hsum_float_8(accumf); +#elif defined(__loongarch_asx) + + const __m256i mone = __lasx_xvreplgr2vr_b(1); + static const char block_sign_shuffle_mask_1[32] = { + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x02, 0x02, 0x02, 0x02, 0x02, 0x02, 0x02, 0x02, + 0x04, 0x04, 0x04, 0x04, 0x04, 0x04, 0x04, 0x04, 0x06, 0x06, 0x06, 0x06, 0x06, 0x06, 0x06, 0x06, + }; + static const char block_sign_shuffle_mask_2[32] = { + 0x08, 0x08, 0x08, 0x08, 0x08, 0x08, 0x08, 0x08, 0x0a, 0x0a, 0x0a, 0x0a, 0x0a, 0x0a, 0x0a, 0x0a, + 0x0c, 0x0c, 0x0c, 0x0c, 0x0c, 0x0c, 0x0c, 0x0c, 0x0e, 0x0e, 0x0e, 0x0e, 0x0e, 0x0e, 0x0e, 0x0e, + }; + static const uint8_t bit_selector_mask_bytes[32] = { + 0x01, 0x02, 0x04, 0x08, 0x10, 0x20, 0x40, 0x80, 0x01, 0x02, 0x04, 0x08, 0x10, 0x20, 0x40, 0x80, + 0x01, 0x02, 0x04, 0x08, 0x10, 0x20, 0x40, 0x80, 0x01, 0x02, 0x04, 0x08, 0x10, 0x20, 0x40, 0x80, + }; + + const __m256i bit_selector_mask = __lasx_xvld((const __m256i*)bit_selector_mask_bytes, 0); + const __m256i block_sign_shuffle_1 = __lasx_xvld((const __m256i*)block_sign_shuffle_mask_1, 0); + const __m256i block_sign_shuffle_2 = __lasx_xvld((const __m256i*)block_sign_shuffle_mask_2, 0); + + static const uint8_t k_bit_helper[32] = { + 0x00, 0x80, 0x80, 0x00, 0x80, 0x00, 0x00, 0x80, 0x80, 0x00, 0x00, 0x80, 0x00, 0x80, 0x80, 0x00, + 0x00, 0x80, 0x80, 0x00, 0x80, 0x00, 0x00, 0x80, 0x80, 0x00, 0x00, 0x80, 0x00, 0x80, 0x80, 0x00, + }; + const __m256i bit_helper = __lasx_xvld((const __m256i*)k_bit_helper, 0); + const __m256i m511 = __lasx_xvreplgr2vr_h(511); + const __m128i m4 = __lsx_vreplgr2vr_b(0xf); + const __m128i m1 = __lsx_vreplgr2vr_b(1); + + uint64_t aux64; + + // somewhat hacky, but gives a significant boost in performance + __m256i aux_gindex; + const uint16_t * gindex = (const uint16_t *)&aux_gindex; + + __m256 accumf = (__m256)__lasx_xvldi(0); + for (int i = 0; i < nb; ++i) { + const float d = 
LM_GGML_FP16_TO_FP32(x[i].d) * y[i].d; + const uint16_t * restrict q2 = x[i].qs; + const int8_t * restrict q8 = y[i].qs; + + memcpy(&aux64, x[i].scales, 8); + __m128i stmp = __lsx_vreplgr2vr_d(aux64); + stmp = __lsx_vilvl_b( __lsx_vand_v(__lsx_vsrli_h(stmp, 4), m4), __lsx_vand_v(stmp, m4)); + const __m128i scales = __lsx_vadd_b(__lsx_vslli_h(stmp, 1), m1); + + __m256i sumi1 = __lasx_xvldi(0); + __m256i sumi2 = __lasx_xvldi(0); + for (int ib32 = 0; ib32 < QK_K/32; ib32 += 4) { + + const __m256i q2_data = __lasx_xvld((const __m256i*)q2, 0); q2 += 16; + aux_gindex = __lasx_xvand_v(q2_data, m511); + + const __m256i partial_sign_bits = __lasx_xvsrli_h(q2_data, 9); + const __m256i partial_sign_bits_upper = __lasx_xvsrli_h(q2_data, 13); + const __m256i partial_sign_bits_for_counting = __lasx_xvxor_v(partial_sign_bits, partial_sign_bits_upper); + + const __m256i odd_bits = lasx_shuffle_b(bit_helper, partial_sign_bits_for_counting); + const __m256i full_sign_bits = __lasx_xvor_v(partial_sign_bits, odd_bits); + + const __m256i q8_1 = __lasx_xvld((const __m256i *)q8, 0); q8 += 32; + const __m256i q8_2 = __lasx_xvld((const __m256i *)q8, 0); q8 += 32; + const __m256i q8_3 = __lasx_xvld((const __m256i *)q8, 0); q8 += 32; + const __m256i q8_4 = __lasx_xvld((const __m256i *)q8, 0); q8 += 32; + + const __m256i q2_1 = lasx_set_d(iq2xs_grid[gindex[ 3]], iq2xs_grid[gindex[ 2]], + iq2xs_grid[gindex[ 1]], iq2xs_grid[gindex[ 0]]); + const __m256i q2_2 = lasx_set_d(iq2xs_grid[gindex[ 7]], iq2xs_grid[gindex[ 6]], + iq2xs_grid[gindex[ 5]], iq2xs_grid[gindex[ 4]]); + const __m256i q2_3 = lasx_set_d(iq2xs_grid[gindex[11]], iq2xs_grid[gindex[10]], + iq2xs_grid[gindex[ 9]], iq2xs_grid[gindex[ 8]]); + const __m256i q2_4 = lasx_set_d(iq2xs_grid[gindex[15]], iq2xs_grid[gindex[14]], + iq2xs_grid[gindex[13]], iq2xs_grid[gindex[12]]); + + const __m128i full_signs_l = lasx_extracti128(full_sign_bits, 0); + const __m128i full_signs_h = lasx_extracti128(full_sign_bits, 1); + const __m256i full_signs_1 = 
lasx_insertf128(full_signs_l, full_signs_l); + const __m256i full_signs_2 = lasx_insertf128(full_signs_h, full_signs_h); + + __m256i signs; + signs = lasx_shuffle_b(full_signs_1, block_sign_shuffle_1); + signs = __lasx_xvseq_b(__lasx_xvand_v(signs, bit_selector_mask), bit_selector_mask); + const __m256i q8s_1 = __lasx_xvsigncov_b(__lasx_xvor_v(signs, mone), q8_1); + + signs = lasx_shuffle_b(full_signs_1, block_sign_shuffle_2); + signs = __lasx_xvseq_b(__lasx_xvand_v(signs, bit_selector_mask), bit_selector_mask); + const __m256i q8s_2 = __lasx_xvsigncov_b(__lasx_xvor_v(signs, mone), q8_2); + + signs = lasx_shuffle_b(full_signs_2, block_sign_shuffle_1); + signs = __lasx_xvseq_b(__lasx_xvand_v(signs, bit_selector_mask), bit_selector_mask); + const __m256i q8s_3 = __lasx_xvsigncov_b(__lasx_xvor_v(signs, mone), q8_3); + + signs = lasx_shuffle_b(full_signs_2, block_sign_shuffle_2); + signs = __lasx_xvseq_b(__lasx_xvand_v(signs, bit_selector_mask), bit_selector_mask); + const __m256i q8s_4 = __lasx_xvsigncov_b(__lasx_xvor_v(signs, mone), q8_4); + + const __m256i dot1 = lasx_maddubs_h(q2_1, q8s_1); + const __m256i dot2 = lasx_maddubs_h(q2_2, q8s_2); + const __m256i dot3 = lasx_maddubs_h(q2_3, q8s_3); + const __m256i dot4 = lasx_maddubs_h(q2_4, q8s_4); + + const __m256i sc1 = lasx_ext8_16(lsx_shuffle_b(scales, get_scale_shuffle(ib32+0))); + const __m256i sc2 = lasx_ext8_16(lsx_shuffle_b(scales, get_scale_shuffle(ib32+1))); + const __m256i sc3 = lasx_ext8_16(lsx_shuffle_b(scales, get_scale_shuffle(ib32+2))); + const __m256i sc4 = lasx_ext8_16(lsx_shuffle_b(scales, get_scale_shuffle(ib32+3))); + + sumi1 = __lasx_xvadd_w(sumi1, lasx_madd_h(dot1, sc1)); + sumi2 = __lasx_xvadd_w(sumi2, lasx_madd_h(dot2, sc2)); + sumi1 = __lasx_xvadd_w(sumi1, lasx_madd_h(dot3, sc3)); + sumi2 = __lasx_xvadd_w(sumi2, lasx_madd_h(dot4, sc4)); + } + + accumf = __lasx_xvfmadd_s(__lasx_xvreplfr2vr_s(d), __lasx_xvffint_s_w(__lasx_xvadd_w(sumi1, sumi2)), accumf); + + } + + *s = 0.125f * 
hsum_float_8(accumf); +#elif defined(__POWER9_VECTOR__) + vector float vsumf0 = vec_splats(0.0f); + vector float vsumf1 = vec_splats(0.0f); + vector float vsumf2 = vec_splats(0.0f); + vector float vsumf3 = vec_splats(0.0f); + + const uint64_t * signs64 = (const uint64_t *)keven_signs_q2xs; + + for (int i = 0; i < nb; ++i) { + vector float vxd = vec_splats(LM_GGML_FP16_TO_FP32(x[i].d)); + vector float vyd = vec_splats(y[i].d); + vector float vd = vec_mul(vxd, vyd); + + vector signed int vsumi0 = vec_splats((int32_t)0); + vector signed int vsumi1 = vec_splats((int32_t)0); + vector signed int vsumi2 = vec_splats((int32_t)0); + vector signed int vsumi3 = vec_splats((int32_t)0); + vector signed int vsumi4 = vec_splats((int32_t)0); + vector signed int vsumi5 = vec_splats((int32_t)0); + vector signed int vsumi6 = vec_splats((int32_t)0); + vector signed int vsumi7 = vec_splats((int32_t)0); + + const uint16_t * restrict q2 = x[i].qs; + const uint8_t * restrict sc = x[i].scales; + const int8_t * restrict q8 = y[i].qs; + + for (int j = 0; j < QK_K/64; ++j) { + __builtin_prefetch(q2, 0, 1); + __builtin_prefetch(q8, 0, 1); + + vector signed long long aux64x2_0 = {*(const int64_t *)(iq2xs_grid + (q2[0] & 511)), *(const int64_t *)(iq2xs_grid + (q2[1] & 511))}; + vector signed long long aux64x2_1 = {*(const int64_t *)(iq2xs_grid + (q2[2] & 511)), *(const int64_t *)(iq2xs_grid + (q2[3] & 511))}; + vector signed long long aux64x2_2 = {*(const int64_t *)(iq2xs_grid + (q2[4] & 511)), *(const int64_t *)(iq2xs_grid + (q2[5] & 511))}; + vector signed long long aux64x2_3 = {*(const int64_t *)(iq2xs_grid + (q2[6] & 511)), *(const int64_t *)(iq2xs_grid + (q2[7] & 511))}; + + vector signed long long vsigns0 = {*(const int64_t *)(signs64 + ((q2[0] >> 9))), *(const int64_t *)(signs64 + ((q2[1] >> 9)))}; + vector signed long long vsigns1 = {*(const int64_t *)(signs64 + ((q2[2] >> 9))), *(const int64_t *)(signs64 + ((q2[3] >> 9)))}; + vector signed long long vsigns2 = {*(const int64_t *)(signs64 
+ ((q2[4] >> 9))), *(const int64_t *)(signs64 + ((q2[5] >> 9)))}; + vector signed long long vsigns3 = {*(const int64_t *)(signs64 + ((q2[6] >> 9))), *(const int64_t *)(signs64 + ((q2[7] >> 9)))}; + q2 += 8; + + vector signed char q2x0 = (vector signed char)vec_mul((vector signed char)vsigns0, (vector signed char)aux64x2_0); + vector signed char q2x1 = (vector signed char)vec_mul((vector signed char)vsigns1, (vector signed char)aux64x2_1); + vector signed char q2x2 = (vector signed char)vec_mul((vector signed char)vsigns2, (vector signed char)aux64x2_2); + vector signed char q2x3 = (vector signed char)vec_mul((vector signed char)vsigns3, (vector signed char)aux64x2_3); + + vector signed char q8y0 = vec_xl( 0, q8); + vector signed char q8y1 = vec_xl(16, q8); + vector signed char q8y2 = vec_xl(32, q8); + vector signed char q8y3 = vec_xl(48, q8); + q8 += 64; + + vector signed short qv0 = vec_add(vec_mule(q2x0, q8y0), vec_mulo(q2x0, q8y0)); + vector signed short qv1 = vec_add(vec_mule(q2x1, q8y1), vec_mulo(q2x1, q8y1)); + vector signed short qv2 = vec_add(vec_mule(q2x2, q8y2), vec_mulo(q2x2, q8y2)); + vector signed short qv3 = vec_add(vec_mule(q2x3, q8y3), vec_mulo(q2x3, q8y3)); + + const uint16_t ls0 = (uint16_t)(sc[0] & 0xf); + const uint16_t ls1 = (uint16_t)(sc[0] >> 4); + const uint16_t ls2 = (uint16_t)(sc[1] & 0xf); + const uint16_t ls3 = (uint16_t)(sc[1] >> 4); + sc += 2; - sumi1 = _mm256_add_epi32(sumi1, _mm256_madd_epi16(dot1, sc1)); - sumi2 = _mm256_add_epi32(sumi2, _mm256_madd_epi16(dot2, sc2)); - sumi1 = _mm256_add_epi32(sumi1, _mm256_madd_epi16(dot3, sc3)); - sumi2 = _mm256_add_epi32(sumi2, _mm256_madd_epi16(dot4, sc4)); + vector signed short vscales0 = vec_splats((int16_t)(2*ls0+1)); + vector signed short vscales1 = vec_splats((int16_t)(2*ls1+1)); + vector signed short vscales2 = vec_splats((int16_t)(2*ls2+1)); + vector signed short vscales3 = vec_splats((int16_t)(2*ls3+1)); + + vsumi0 = vec_add(vec_mule(qv0, vscales0), vsumi0); + vsumi1 = 
vec_add(vec_mule(qv1, vscales1), vsumi1); + vsumi2 = vec_add(vec_mule(qv2, vscales2), vsumi2); + vsumi3 = vec_add(vec_mule(qv3, vscales3), vsumi3); + vsumi4 = vec_add(vec_mulo(qv0, vscales0), vsumi4); + vsumi5 = vec_add(vec_mulo(qv1, vscales1), vsumi5); + vsumi6 = vec_add(vec_mulo(qv2, vscales2), vsumi6); + vsumi7 = vec_add(vec_mulo(qv3, vscales3), vsumi7); } - accumf = _mm256_fmadd_ps(_mm256_set1_ps(d), _mm256_cvtepi32_ps(_mm256_add_epi32(sumi1, sumi2)), accumf); + vsumi0 = vec_add(vsumi0, vsumi4); + vsumi1 = vec_add(vsumi1, vsumi5); + vsumi2 = vec_add(vsumi2, vsumi6); + vsumi3 = vec_add(vsumi3, vsumi7); + vsumf0 = vec_madd(vec_ctf(vsumi0, 0), vd, vsumf0); + vsumf1 = vec_madd(vec_ctf(vsumi1, 0), vd, vsumf1); + vsumf2 = vec_madd(vec_ctf(vsumi2, 0), vd, vsumf2); + vsumf3 = vec_madd(vec_ctf(vsumi3, 0), vd, vsumf3); } - *s = 0.125f * hsum_float_8(accumf); -#endif + vsumf0 = vec_add(vsumf0, vsumf2); + vsumf1 = vec_add(vsumf1, vsumf3); + vsumf0 = vec_add(vsumf0, vsumf1); + + vsumf0 = vec_add(vsumf0, vec_sld(vsumf0, vsumf0, 4)); + vsumf0 = vec_add(vsumf0, vec_sld(vsumf0, vsumf0, 8)); + + *s = 0.125f * vec_extract(vsumf0, 0); #else float sumf = 0.f; @@ -8902,6 +9694,199 @@ void lm_ggml_vec_dot_iq2_s_q8_K(int n, float * restrict s, size_t bs, const void *s = 0.125f * hsum_float_8(accumf); +#elif defined(__POWER9_VECTOR__) + static const uint8_t k_mask1[32] = {0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, + 0x02, 0x02, 0x02, 0x02, 0x02, 0x02, 0x02, 0x02, 0x03, 0x03, 0x03, 0x03, 0x03, 0x03, 0x03, 0x03 + }; + + static const uint8_t k_mask2[16] = {0x01, 0x02, 0x04, 0x08, 0x10, 0x20, 0x40, 0x80, 0x01, 0x02, 0x04, 0x08, 0x10, 0x20, 0x40, 0x80,}; + + vector float vsumf0 = vec_splats(0.0f); + vector float vsumf1 = vec_splats(0.0f); + vector float vsumf2 = vec_splats(0.0f); + vector float vsumf3 = vec_splats(0.0f); + + const vector unsigned char mask0 = vec_xl( 0, k_mask1); + const vector unsigned char mask1 = vec_xl(16, k_mask1); + 
const vector signed char mask2 = (vector signed char)vec_xl( 0, k_mask2); + + for (int i = 0; i < nb; ++i) { + vector float vxd = vec_splats(LM_GGML_FP16_TO_FP32(x[i].d)); + vector float vyd = vec_splats(y[i].d); + vector float vd = vec_mul(vxd, vyd); + + vector signed int vsumi0 = vec_splats((int32_t)0); + vector signed int vsumi1 = vec_splats((int32_t)0); + vector signed int vsumi2 = vec_splats((int32_t)0); + vector signed int vsumi3 = vec_splats((int32_t)0); + vector signed int vsumi4 = vec_splats((int32_t)0); + vector signed int vsumi5 = vec_splats((int32_t)0); + vector signed int vsumi6 = vec_splats((int32_t)0); + vector signed int vsumi7 = vec_splats((int32_t)0); + + const uint8_t * restrict q2 = x[i].qs; + const uint8_t * restrict qh = x[i].qh; + const uint16_t * restrict signs = (const uint16_t *)(x[i].qs + QK_K/8); + const uint8_t * restrict sc = x[i].scales; + const int8_t * restrict q8 = y[i].qs; + + for (int j = 0; j < QK_K/32; j += 2) { + __builtin_prefetch(q2, 0, 1); + __builtin_prefetch(q8, 0, 1); + + vector signed long long aux64x2_0 = {*(const int64_t *)(iq2s_grid + (q2[0] | ((qh[0] << 8) & 0x300))), *(const int64_t *)(iq2s_grid + (q2[1] | ((qh[0] << 6) & 0x300)))}; + vector signed long long aux64x2_1 = {*(const int64_t *)(iq2s_grid + (q2[2] | ((qh[0] << 4) & 0x300))), *(const int64_t *)(iq2s_grid + (q2[3] | ((qh[0] << 2) & 0x300)))}; + vector signed long long aux64x2_2 = {*(const int64_t *)(iq2s_grid + (q2[4] | ((qh[1] << 8) & 0x300))), *(const int64_t *)(iq2s_grid + (q2[5] | ((qh[1] << 6) & 0x300)))}; + vector signed long long aux64x2_3 = {*(const int64_t *)(iq2s_grid + (q2[6] | ((qh[1] << 4) & 0x300))), *(const int64_t *)(iq2s_grid + (q2[7] | ((qh[1] << 2) & 0x300)))}; + q2 += 8; + qh += 2; + + vector signed char vsigns01 = (vector signed char)vec_splats(*(const uint32_t *)&signs[0]); + vector signed char vsigns23 = (vector signed char)vec_splats(*(const uint32_t *)&signs[2]); + signs += 4; + + vector signed char vsigns0 = vec_perm(vsigns01, 
vsigns01, mask0); + vector signed char vsigns1 = vec_perm(vsigns01, vsigns01, mask1); + vector signed char vsigns2 = vec_perm(vsigns23, vsigns23, mask0); + vector signed char vsigns3 = vec_perm(vsigns23, vsigns23, mask1); + + vsigns0 = (vector signed char)vec_cmpeq(vec_and(vsigns0, mask2), mask2); + vsigns1 = (vector signed char)vec_cmpeq(vec_and(vsigns1, mask2), mask2); + vsigns2 = (vector signed char)vec_cmpeq(vec_and(vsigns2, mask2), mask2); + vsigns3 = (vector signed char)vec_cmpeq(vec_and(vsigns3, mask2), mask2); + + vector signed char q2x0 = vec_sub(vec_xor(vsigns0, (vector signed char)aux64x2_0), vsigns0); + vector signed char q2x1 = vec_sub(vec_xor(vsigns1, (vector signed char)aux64x2_1), vsigns1); + vector signed char q2x2 = vec_sub(vec_xor(vsigns2, (vector signed char)aux64x2_2), vsigns2); + vector signed char q2x3 = vec_sub(vec_xor(vsigns3, (vector signed char)aux64x2_3), vsigns3); + + vector signed char q8y0 = vec_xl( 0, q8); + vector signed char q8y1 = vec_xl(16, q8); + vector signed char q8y2 = vec_xl(32, q8); + vector signed char q8y3 = vec_xl(48, q8); + q8 += 64; + + vector signed short qv0 = vec_add(vec_mule(q2x0, q8y0), vec_mulo(q2x0, q8y0)); + vector signed short qv1 = vec_add(vec_mule(q2x1, q8y1), vec_mulo(q2x1, q8y1)); + vector signed short qv2 = vec_add(vec_mule(q2x2, q8y2), vec_mulo(q2x2, q8y2)); + vector signed short qv3 = vec_add(vec_mule(q2x3, q8y3), vec_mulo(q2x3, q8y3)); + + const uint16_t ls0 = (uint16_t)(sc[0] & 0xf); + const uint16_t ls1 = (uint16_t)(sc[0] >> 4); + const uint16_t ls2 = (uint16_t)(sc[1] & 0xf); + const uint16_t ls3 = (uint16_t)(sc[1] >> 4); + sc += 2; + + vector signed short vscales0 = vec_splats((int16_t)(2*ls0+1)); + vector signed short vscales1 = vec_splats((int16_t)(2*ls1+1)); + vector signed short vscales2 = vec_splats((int16_t)(2*ls2+1)); + vector signed short vscales3 = vec_splats((int16_t)(2*ls3+1)); + + vsumi0 = vec_add(vec_mule(qv0, vscales0), vsumi0); + vsumi1 = vec_add(vec_mule(qv1, vscales1), vsumi1); + 
vsumi2 = vec_add(vec_mule(qv2, vscales2), vsumi2); + vsumi3 = vec_add(vec_mule(qv3, vscales3), vsumi3); + vsumi4 = vec_add(vec_mulo(qv0, vscales0), vsumi4); + vsumi5 = vec_add(vec_mulo(qv1, vscales1), vsumi5); + vsumi6 = vec_add(vec_mulo(qv2, vscales2), vsumi6); + vsumi7 = vec_add(vec_mulo(qv3, vscales3), vsumi7); + } + + vsumi0 = vec_add(vsumi0, vsumi4); + vsumi1 = vec_add(vsumi1, vsumi5); + vsumi2 = vec_add(vsumi2, vsumi6); + vsumi3 = vec_add(vsumi3, vsumi7); + + vsumf0 = vec_madd(vec_ctf(vsumi0, 0), vd, vsumf0); + vsumf1 = vec_madd(vec_ctf(vsumi1, 0), vd, vsumf1); + vsumf2 = vec_madd(vec_ctf(vsumi2, 0), vd, vsumf2); + vsumf3 = vec_madd(vec_ctf(vsumi3, 0), vd, vsumf3); + } + + vsumf0 = vec_add(vsumf0, vsumf2); + vsumf1 = vec_add(vsumf1, vsumf3); + + vsumf0 = vec_add(vsumf0, vsumf1); + + vsumf0 = vec_add(vsumf0, vec_sld(vsumf0, vsumf0, 4)); + vsumf0 = vec_add(vsumf0, vec_sld(vsumf0, vsumf0, 8)); + + *s = 0.125f * vec_extract(vsumf0, 0); + +#elif defined(__loongarch_asx) + + static const uint8_t k_mask1[32] = {0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, + 0x02, 0x02, 0x02, 0x02, 0x02, 0x02, 0x02, 0x02, 0x03, 0x03, 0x03, 0x03, 0x03, 0x03, 0x03, 0x03 + }; + + static const uint8_t k_mask2[32] = {0x01, 0x02, 0x04, 0x08, 0x10, 0x20, 0x40, 0x80, 0x01, 0x02, 0x04, 0x08, 0x10, 0x20, 0x40, 0x80, + 0x01, 0x02, 0x04, 0x08, 0x10, 0x20, 0x40, 0x80, 0x01, 0x02, 0x04, 0x08, 0x10, 0x20, 0x40, 0x80, + }; + + + const __m128i m4 = __lsx_vreplgr2vr_b(0xf); + const __m128i m1 = __lsx_vreplgr2vr_b(1); + + const __m256i mask1 = __lasx_xvld((const __m256i*)k_mask1, 0); + const __m256i mask2 = __lasx_xvld((const __m256i*)k_mask2, 0); + uint64_t aux64; + + __m256 accumf = (__m256)__lasx_xvldi(0); + for (int i = 0; i < nb; ++i) { + const float d = LM_GGML_FP16_TO_FP32(x[i].d) * y[i].d; + const uint8_t * restrict qs = x[i].qs; + const uint8_t * restrict qh = x[i].qh; + const uint16_t * restrict signs = (const uint16_t *)(x[i].qs + QK_K/8); + 
const int8_t * restrict q8 = y[i].qs; + + __m128i tmp1; + memcpy(&aux64, x[i].scales, 8); + tmp1 = __lsx_vinsgr2vr_d(tmp1, aux64, 0); + tmp1 = __lsx_vinsgr2vr_d(tmp1, aux64 >> 4, 1); + const __m128i scales8 = __lsx_vadd_b(__lsx_vslli_h(__lsx_vand_v(tmp1, m4), 1), m1); + const __m256i scales16 = lasx_ext8_16(scales8); // 0 2 4 6 8 10 12 14 1 3 5 7 9 11 13 15 + + __m256i sumi1 = __lasx_xvldi(0); + __m256i sumi2 = __lasx_xvldi(0); + for (int ib32 = 0; ib32 < QK_K/32; ib32 += 2) { + const __m256i q8_1 = __lasx_xvld((const __m256i *)q8, 0); q8 += 32; + const __m256i q8_2 = __lasx_xvld((const __m256i *)q8, 0); q8 += 32; + const __m256i q2_1 = lasx_set_d(iq2s_grid[qs[3] | ((qh[ib32+0] << 2) & 0x300)], + iq2s_grid[qs[2] | ((qh[ib32+0] << 4) & 0x300)], + iq2s_grid[qs[1] | ((qh[ib32+0] << 6) & 0x300)], + iq2s_grid[qs[0] | ((qh[ib32+0] << 8) & 0x300)]); + const __m256i q2_2 = lasx_set_d(iq2s_grid[qs[7] | ((qh[ib32+1] << 2) & 0x300)], + iq2s_grid[qs[6] | ((qh[ib32+1] << 4) & 0x300)], + iq2s_grid[qs[5] | ((qh[ib32+1] << 6) & 0x300)], + iq2s_grid[qs[4] | ((qh[ib32+1] << 8) & 0x300)]); + qs += 8; + + __m256i aux256 = __lasx_xvreplgr2vr_w(signs[0] | ((uint32_t) signs[1] << 16)); + aux256 = __lasx_xvand_v(lasx_shuffle_b(aux256,mask1), mask2); + const __m256i s2_1 = __lasx_xvseq_b(aux256, mask2); + const __m256i q8s_1 = __lasx_xvsub_b(__lasx_xvxor_v(s2_1, q8_1), s2_1); + + aux256 = __lasx_xvreplgr2vr_w(signs[2] | ((uint32_t) signs[3] << 16)); + aux256 = __lasx_xvand_v(lasx_shuffle_b(aux256,mask1), mask2); + const __m256i s2_2 = __lasx_xvseq_b(aux256, mask2); + const __m256i q8s_2 = __lasx_xvsub_b(__lasx_xvxor_v(s2_2, q8_2), s2_2); + + signs += 4; + + const __m256i dot1 = lasx_maddubs_h(q2_1, q8s_1); // blocks 2*ib32+0, 2*ib32+1 + const __m256i dot2 = lasx_maddubs_h(q2_2, q8s_2); // blocks 2*ib32+2, 2*ib32+3 + + const __m256i p1 = lasx_madd_h(dot1, lasx_shuffle_b(scales16, get_scale_shuffle_k4(ib32+0))); + const __m256i p2 = lasx_madd_h(dot2, lasx_shuffle_b(scales16, 
get_scale_shuffle_k4(ib32+1))); + sumi1 = __lasx_xvadd_w(sumi1, p1); + sumi2 = __lasx_xvadd_w(sumi2, p2); + } + + accumf = __lasx_xvfmadd_s(__lasx_xvreplfr2vr_s(d), __lasx_xvffint_s_w(__lasx_xvadd_w(sumi1, sumi2)), accumf); + } + + *s = 0.125f * hsum_float_8(accumf); + #else float sumf = 0; @@ -9046,6 +10031,149 @@ void lm_ggml_vec_dot_iq3_xxs_q8_K(int n, float * restrict s, size_t bs, const vo *s = 0.25f * hsum_float_8(accumf); +#elif defined(__POWER9_VECTOR__) + const uint64_t * signs64 = (const uint64_t *)keven_signs_q2xs; + + vector float vsumf0 = vec_splats(0.0f); + vector float vsumf1 = vec_splats(0.0f); + vector float vsumf2 = vec_splats(0.0f); + vector float vsumf3 = vec_splats(0.0f); + + for (int i = 0; i < nb; ++i) { + vector float vxd = vec_splats(LM_GGML_FP16_TO_FP32(x[i].d)); + vector float vyd = vec_splats(y[i].d); + vector float vd = vec_mul(vxd, vyd); + + vector signed int vsumi0 = vec_splats((int32_t)0); + vector signed int vsumi1 = vec_splats((int32_t)0); + vector signed int vsumi2 = vec_splats((int32_t)0); + vector signed int vsumi3 = vec_splats((int32_t)0); + vector signed int vsumi4 = vec_splats((int32_t)0); + vector signed int vsumi5 = vec_splats((int32_t)0); + vector signed int vsumi6 = vec_splats((int32_t)0); + vector signed int vsumi7 = vec_splats((int32_t)0); + + const uint8_t * restrict q3 = x[i].qs; + const uint32_t * restrict signs = (const uint32_t *)(x[i].qs + QK_K/4); + const int8_t * restrict q8 = y[i].qs; + +#pragma GCC unroll 1 + for (int j = 0; j < QK_K/32; j += 2) { + __builtin_prefetch(q3, 0, 1); + __builtin_prefetch(q8, 0, 1); + + vector unsigned int aux32x4_0 = {iq3xxs_grid[q3[ 0]], iq3xxs_grid[q3[ 1]], iq3xxs_grid[q3[ 2]], iq3xxs_grid[q3[ 3]]}; + vector unsigned int aux32x4_1 = {iq3xxs_grid[q3[ 4]], iq3xxs_grid[q3[ 5]], iq3xxs_grid[q3[ 6]], iq3xxs_grid[q3[ 7]]}; + vector unsigned int aux32x4_2 = {iq3xxs_grid[q3[ 8]], iq3xxs_grid[q3[ 9]], iq3xxs_grid[q3[10]], iq3xxs_grid[q3[11]]}; + vector unsigned int aux32x4_3 = 
{iq3xxs_grid[q3[12]], iq3xxs_grid[q3[13]], iq3xxs_grid[q3[14]], iq3xxs_grid[q3[15]]}; + q3 += 16; + + vector unsigned long long aux64x2_0 = {(uint64_t)(signs64[(signs[0] >> 0) & 127]), (uint64_t)(signs64[(signs[0] >> 7) & 127])}; + vector unsigned long long aux64x2_1 = {(uint64_t)(signs64[(signs[0] >> 14) & 127]), (uint64_t)(signs64[(signs[0] >> 21) & 127])}; + vector unsigned long long aux64x2_2 = {(uint64_t)(signs64[(signs[1] >> 0) & 127]), (uint64_t)(signs64[(signs[1] >> 7) & 127])}; + vector unsigned long long aux64x2_3 = {(uint64_t)(signs64[(signs[1] >> 14) & 127]), (uint64_t)(signs64[(signs[1] >> 21) & 127])}; + + vector signed char q3x0 = vec_mul((vector signed char)aux64x2_0, (vector signed char)aux32x4_0); + vector signed char q3x1 = vec_mul((vector signed char)aux64x2_1, (vector signed char)aux32x4_1); + vector signed char q3x2 = vec_mul((vector signed char)aux64x2_2, (vector signed char)aux32x4_2); + vector signed char q3x3 = vec_mul((vector signed char)aux64x2_3, (vector signed char)aux32x4_3); + + vector signed char q8y0 = vec_xl( 0, q8); + vector signed char q8y1 = vec_xl(16, q8); + vector signed char q8y2 = vec_xl(32, q8); + vector signed char q8y3 = vec_xl(48, q8); + q8 += 64; + + vector signed short qv0 = vec_add(vec_mule(q3x0, q8y0), vec_mulo(q3x0, q8y0)); + vector signed short qv1 = vec_add(vec_mule(q3x1, q8y1), vec_mulo(q3x1, q8y1)); + vector signed short qv2 = vec_add(vec_mule(q3x2, q8y2), vec_mulo(q3x2, q8y2)); + vector signed short qv3 = vec_add(vec_mule(q3x3, q8y3), vec_mulo(q3x3, q8y3)); + + const uint16_t ls0 = (uint16_t)(signs[0] >> 28); + const uint16_t ls1 = (uint16_t)(signs[1] >> 28); + signs += 2; + + vector signed short vscales01 = (vector signed short)vec_splats((uint16_t)(2*ls0+1)); + vector signed short vscales23 = (vector signed short)vec_splats((uint16_t)(2*ls1+1)); + + vsumi0 = vec_add(vec_mule(qv0, vscales01), vsumi0); + vsumi1 = vec_add(vec_mule(qv1, vscales01), vsumi1); + vsumi2 = vec_add(vec_mule(qv2, vscales23), vsumi2); + 
vsumi3 = vec_add(vec_mule(qv3, vscales23), vsumi3); + vsumi4 = vec_add(vec_mulo(qv0, vscales01), vsumi4); + vsumi5 = vec_add(vec_mulo(qv1, vscales01), vsumi5); + vsumi6 = vec_add(vec_mulo(qv2, vscales23), vsumi6); + vsumi7 = vec_add(vec_mulo(qv3, vscales23), vsumi7); + } + + vsumi0 = vec_add(vsumi0, vsumi4); + vsumi1 = vec_add(vsumi1, vsumi5); + vsumi2 = vec_add(vsumi2, vsumi6); + vsumi3 = vec_add(vsumi3, vsumi7); + + vsumf0 = vec_madd(vec_ctf(vsumi0, 0), vd, vsumf0); + vsumf1 = vec_madd(vec_ctf(vsumi1, 0), vd, vsumf1); + vsumf2 = vec_madd(vec_ctf(vsumi2, 0), vd, vsumf2); + vsumf3 = vec_madd(vec_ctf(vsumi3, 0), vd, vsumf3); + } + + vsumf0 = vec_add(vsumf0, vsumf2); + vsumf1 = vec_add(vsumf1, vsumf3); + + vsumf0 = vec_add(vsumf0, vsumf1); + + vsumf0 = vec_add(vsumf0, vec_sld(vsumf0, vsumf0, 4)); + vsumf0 = vec_add(vsumf0, vec_sld(vsumf0, vsumf0, 8)); + + *s = 0.25f * vec_extract(vsumf0, 0); + +#elif defined(__loongarch_asx) + + const uint64_t * signs64 = (const uint64_t *)keven_signs_q2xs; + + uint32_t aux32[2]; + + __m256 accumf = (__m256)__lasx_xvldi(0); + for (int i = 0; i < nb; ++i) { + const float d = LM_GGML_FP16_TO_FP32(x[i].d) * y[i].d; + const uint8_t * restrict q3 = x[i].qs; + const uint8_t * restrict gas = x[i].qs + QK_K/4; + const int8_t * restrict q8 = y[i].qs; + __m256i sumi1 = __lasx_xvldi(0); + __m256i sumi2 = __lasx_xvldi(0); + for (int ib32 = 0; ib32 < QK_K/32; ib32 += 2) { + const __m256i q8_1 = __lasx_xvld((const __m256i *)q8, 0); q8 += 32; + const __m256i q8_2 = __lasx_xvld((const __m256i *)q8, 0); q8 += 32; + const __m256i q2_1 = lasx_set_w(iq3xxs_grid[q3[7]], iq3xxs_grid[q3[6]], iq3xxs_grid[q3[5]], iq3xxs_grid[q3[4]], + iq3xxs_grid[q3[3]], iq3xxs_grid[q3[2]], iq3xxs_grid[q3[1]], iq3xxs_grid[q3[0]]); + q3 += 8; + const __m256i q2_2 = lasx_set_w(iq3xxs_grid[q3[7]], iq3xxs_grid[q3[6]], iq3xxs_grid[q3[5]], iq3xxs_grid[q3[4]], + iq3xxs_grid[q3[3]], iq3xxs_grid[q3[2]], iq3xxs_grid[q3[1]], iq3xxs_grid[q3[0]]); + q3 += 8; + memcpy(aux32, gas, 8); gas 
+= 8; + + const __m256i s2_1 = lasx_set_d(signs64[(aux32[0] >> 21) & 127], signs64[(aux32[0] >> 14) & 127], + signs64[(aux32[0] >> 7) & 127], signs64[(aux32[0] >> 0) & 127]); + const __m256i s2_2 = lasx_set_d(signs64[(aux32[1] >> 21) & 127], signs64[(aux32[1] >> 14) & 127], + signs64[(aux32[1] >> 7) & 127], signs64[(aux32[1] >> 0) & 127]); + const __m256i q8s_1 = __lasx_xvsigncov_b(s2_1, q8_1); + const __m256i q8s_2 = __lasx_xvsigncov_b(s2_2, q8_2); + const __m256i dot1 = lasx_maddubs_h(q2_1, q8s_1); + const __m256i dot2 = lasx_maddubs_h(q2_2, q8s_2); + const uint16_t ls1 = aux32[0] >> 28; + const uint16_t ls2 = aux32[1] >> 28; + + const __m256i p1 = lasx_madd_h(dot1, __lasx_xvreplgr2vr_h(2*ls1+1)); + const __m256i p2 = lasx_madd_h(dot2, __lasx_xvreplgr2vr_h(2*ls2+1)); + sumi1 = __lasx_xvadd_w(sumi1, p1); + sumi2 = __lasx_xvadd_w(sumi2, p2); + } + + accumf = __lasx_xvfmadd_s(__lasx_xvreplfr2vr_s(d), __lasx_xvffint_s_w(__lasx_xvadd_w(sumi1, sumi2)), accumf); + } + + *s = 0.25f * hsum_float_8(accumf); + #else uint32_t aux32; @@ -9120,10 +10248,8 @@ void lm_ggml_vec_dot_iq3_s_q8_K (int n, float * restrict s, size_t bs, const voi lm_ggml_int8x16x4_t q8b; vec_index_t idx; -#if QK_K == 256 uint32_t scales32[2]; const uint8_t * scales8 = (const uint8_t *)scales32; -#endif float sumf = 0; for (int i = 0; i < nb; ++i) { @@ -9133,11 +10259,9 @@ void lm_ggml_vec_dot_iq3_s_q8_K (int n, float * restrict s, size_t bs, const voi const uint16_t * restrict signs = (const uint16_t *)x[i].signs; const int8_t * restrict q8 = y[i].qs; -#if QK_K == 256 memcpy(scales32, x[i].scales, 4); scales32[1] = (((scales32[0] >> 4) & 0x0f0f0f0f) << 1) | 0x01010101; scales32[0] = ((scales32[0] & 0x0f0f0f0f) << 1) | 0x01010101; -#endif int sumi1 = 0, sumi2 = 0; for (int ib32 = 0; ib32 < QK_K/32; ib32 += 2) { @@ -9156,41 +10280,239 @@ void lm_ggml_vec_dot_iq3_s_q8_K (int n, float * restrict s, size_t bs, const voi iq3s_grid[idx.index[6]], iq3s_grid[idx.index[7]]); - vs.val[0] = 
vreinterpretq_u8_u32(vdupq_n_u32(signs[0] | ((uint32_t) signs[1] << 16))); - vs.val[1] = vandq_u8(lm_ggml_vqtbl1q_u8(vs.val[0], mask1.val[1]), mask2); - vs.val[0] = vandq_u8(lm_ggml_vqtbl1q_u8(vs.val[0], mask1.val[0]), mask2); - vs.val[0] = vorrq_u8(vceqq_u8(vs.val[0], mask2), m1); - vs.val[1] = vorrq_u8(vceqq_u8(vs.val[1], mask2), m1); + vs.val[0] = vreinterpretq_u8_u32(vdupq_n_u32(signs[0] | ((uint32_t) signs[1] << 16))); + vs.val[1] = vandq_u8(lm_ggml_vqtbl1q_u8(vs.val[0], mask1.val[1]), mask2); + vs.val[0] = vandq_u8(lm_ggml_vqtbl1q_u8(vs.val[0], mask1.val[0]), mask2); + vs.val[0] = vorrq_u8(vceqq_u8(vs.val[0], mask2), m1); + vs.val[1] = vorrq_u8(vceqq_u8(vs.val[1], mask2), m1); + + q3s.val[0] = vmulq_s8(vreinterpretq_s8_u8(vs.val[0]), vreinterpretq_s8_u32(aux32x4_0)); + q3s.val[1] = vmulq_s8(vreinterpretq_s8_u8(vs.val[1]), vreinterpretq_s8_u32(aux32x4_1)); + + vs.val[0] = vreinterpretq_u8_u32(vdupq_n_u32(signs[2] | ((uint32_t) signs[3] << 16))); + vs.val[1] = vandq_u8(lm_ggml_vqtbl1q_u8(vs.val[0], mask1.val[1]), mask2); + vs.val[0] = vandq_u8(lm_ggml_vqtbl1q_u8(vs.val[0], mask1.val[0]), mask2); + vs.val[0] = vorrq_u8(vceqq_u8(vs.val[0], mask2), m1); + vs.val[1] = vorrq_u8(vceqq_u8(vs.val[1], mask2), m1); + + signs += 4; + + q3s.val[2] = vmulq_s8(vreinterpretq_s8_u8(vs.val[0]), vreinterpretq_s8_u32(aux32x4_2)); + q3s.val[3] = vmulq_s8(vreinterpretq_s8_u8(vs.val[1]), vreinterpretq_s8_u32(aux32x4_3)); + + const int32x4_t p1 = lm_ggml_vdotq_s32(lm_ggml_vdotq_s32(vdupq_n_s32(0), q3s.val[0], q8b.val[0]), q3s.val[1], q8b.val[1]); + const int32x4_t p2 = lm_ggml_vdotq_s32(lm_ggml_vdotq_s32(vdupq_n_s32(0), q3s.val[2], q8b.val[2]), q3s.val[3], q8b.val[3]); + + sumi1 += vaddvq_s32(p1) * scales8[ib32/2+0]; + sumi2 += vaddvq_s32(p2) * scales8[ib32/2+4]; + } + sumf += d*(sumi1 + sumi2); + } + *s = sumf; + +#elif defined(__AVX2__) + + static const uint8_t k_mask1[32] = {0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, + 0x02, 
0x02, 0x02, 0x02, 0x02, 0x02, 0x02, 0x02, 0x03, 0x03, 0x03, 0x03, 0x03, 0x03, 0x03, 0x03 + }; + + static const uint8_t k_mask2[32] = {0x01, 0x02, 0x04, 0x08, 0x10, 0x20, 0x40, 0x80, 0x01, 0x02, 0x04, 0x08, 0x10, 0x20, 0x40, 0x80, + 0x01, 0x02, 0x04, 0x08, 0x10, 0x20, 0x40, 0x80, 0x01, 0x02, 0x04, 0x08, 0x10, 0x20, 0x40, 0x80, + }; + + const __m256i mask1 = _mm256_loadu_si256((const __m256i*)k_mask1); + const __m256i mask2 = _mm256_loadu_si256((const __m256i*)k_mask2); + + const __m256i idx_shift = _mm256_set_epi32(1, 2, 3, 4, 5, 6, 7, 8); + const __m256i idx_mask = _mm256_set1_epi32(256); + + typedef union { + __m256i vec[2]; + uint32_t index[16]; + } index_t; + + index_t idx; + + __m256 accumf = _mm256_setzero_ps(); + for (int i = 0; i < nb; ++i) { + const float d = LM_GGML_FP16_TO_FP32(x[i].d) * y[i].d; + const uint8_t * restrict qs = x[i].qs; + const uint8_t * restrict qh = x[i].qh; + const uint16_t * restrict signs = (const uint16_t *)x[i].signs; + const int8_t * restrict q8 = y[i].qs; + __m256i sumi1 = _mm256_setzero_si256(); + __m256i sumi2 = _mm256_setzero_si256(); + for (int ib32 = 0; ib32 < QK_K/32; ib32 += 2) { + const __m256i q8_1 = _mm256_loadu_si256((const __m256i *)q8); q8 += 32; + const __m256i q8_2 = _mm256_loadu_si256((const __m256i *)q8); q8 += 32; + const __m256i idx_l = _mm256_cvtepu8_epi16(_mm_loadu_si128((const __m128i *)qs)); qs += 16; + idx.vec[0] = _mm256_set1_epi32(qh[ib32+0]); + idx.vec[1] = _mm256_set1_epi32(qh[ib32+1]); + idx.vec[0] = _mm256_and_si256(_mm256_sllv_epi32(idx.vec[0], idx_shift), idx_mask); + idx.vec[1] = _mm256_and_si256(_mm256_sllv_epi32(idx.vec[1], idx_shift), idx_mask); + idx.vec[0] = _mm256_or_si256(idx.vec[0], _mm256_cvtepi16_epi32(_mm256_castsi256_si128(idx_l))); + idx.vec[1] = _mm256_or_si256(idx.vec[1], _mm256_cvtepi16_epi32(_mm256_extractf128_si256(idx_l, 1))); + + // At least on my CPU (Ryzen 7950X), using _mm256_i32gather_epi32 is slower than _mm256_set_epi32. Strange. 
+ //const __m256i q2_1 = _mm256_i32gather_epi32((const int *)iq3s_grid, idx.vec[0], 4); + //const __m256i q2_2 = _mm256_i32gather_epi32((const int *)iq3s_grid, idx.vec[1], 4); + const __m256i q2_1 = _mm256_set_epi32( + iq3s_grid[idx.index[7]], iq3s_grid[idx.index[6]], iq3s_grid[idx.index[5]], iq3s_grid[idx.index[4]], + iq3s_grid[idx.index[3]], iq3s_grid[idx.index[2]], iq3s_grid[idx.index[1]], iq3s_grid[idx.index[0]] + ); + const __m256i q2_2 = _mm256_set_epi32( + iq3s_grid[idx.index[15]], iq3s_grid[idx.index[14]], iq3s_grid[idx.index[13]], iq3s_grid[idx.index[12]], + iq3s_grid[idx.index[11]], iq3s_grid[idx.index[10]], iq3s_grid[idx.index[ 9]], iq3s_grid[idx.index[ 8]] + ); + + __m256i aux256 = _mm256_set1_epi32(signs[0] | (signs[1] << 16)); + aux256 = _mm256_and_si256(_mm256_shuffle_epi8(aux256,mask1), mask2); + const __m256i s2_1 = _mm256_cmpeq_epi8(aux256, mask2); + const __m256i q8s_1 = _mm256_sub_epi8(_mm256_xor_si256(s2_1, q8_1), s2_1); + + aux256 = _mm256_set1_epi32(signs[2] | (signs[3] << 16)); + aux256 = _mm256_and_si256(_mm256_shuffle_epi8(aux256,mask1), mask2); + const __m256i s2_2 = _mm256_cmpeq_epi8(aux256, mask2); + const __m256i q8s_2 = _mm256_sub_epi8(_mm256_xor_si256(s2_2, q8_2), s2_2); + + signs += 4; + + const __m256i dot1 = _mm256_maddubs_epi16(q2_1, q8s_1); + const __m256i dot2 = _mm256_maddubs_epi16(q2_2, q8s_2); + const uint16_t ls1 = x[i].scales[ib32/2] & 0xf; + const uint16_t ls2 = x[i].scales[ib32/2] >> 4; + const __m256i p1 = _mm256_madd_epi16(dot1, _mm256_set1_epi16(2*ls1+1)); + const __m256i p2 = _mm256_madd_epi16(dot2, _mm256_set1_epi16(2*ls2+1)); + sumi1 = _mm256_add_epi32(sumi1, p1); + sumi2 = _mm256_add_epi32(sumi2, p2); + } + + accumf = _mm256_fmadd_ps(_mm256_set1_ps(d), _mm256_cvtepi32_ps(_mm256_add_epi32(sumi1, sumi2)), accumf); + + } + + *s = hsum_float_8(accumf); + +#elif defined(__POWER9_VECTOR__) + static const uint8_t k_mask1[32] = {0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 
0x01, + 0x02, 0x02, 0x02, 0x02, 0x02, 0x02, 0x02, 0x02, 0x03, 0x03, 0x03, 0x03, 0x03, 0x03, 0x03, 0x03 + }; + + static const uint8_t k_mask2[16] = {0x01, 0x02, 0x04, 0x08, 0x10, 0x20, 0x40, 0x80, 0x01, 0x02, 0x04, 0x08, 0x10, 0x20, 0x40, 0x80,}; + + vector float vsumf0 = vec_splats(0.0f); + vector float vsumf1 = vec_splats(0.0f); + vector float vsumf2 = vec_splats(0.0f); + vector float vsumf3 = vec_splats(0.0f); - q3s.val[0] = vmulq_s8(vreinterpretq_s8_u8(vs.val[0]), vreinterpretq_s8_u32(aux32x4_0)); - q3s.val[1] = vmulq_s8(vreinterpretq_s8_u8(vs.val[1]), vreinterpretq_s8_u32(aux32x4_1)); + const vector unsigned char mask0 = vec_xl( 0, k_mask1); + const vector unsigned char mask1 = vec_xl(16, k_mask1); + const vector signed char mask2 = (vector signed char)vec_xl( 0, k_mask2); - vs.val[0] = vreinterpretq_u8_u32(vdupq_n_u32(signs[2] | ((uint32_t) signs[3] << 16))); - vs.val[1] = vandq_u8(lm_ggml_vqtbl1q_u8(vs.val[0], mask1.val[1]), mask2); - vs.val[0] = vandq_u8(lm_ggml_vqtbl1q_u8(vs.val[0], mask1.val[0]), mask2); - vs.val[0] = vorrq_u8(vceqq_u8(vs.val[0], mask2), m1); - vs.val[1] = vorrq_u8(vceqq_u8(vs.val[1], mask2), m1); + for (int i = 0; i < nb; ++i) { + vector float vxd = vec_splats(LM_GGML_FP16_TO_FP32(x[i].d)); + vector float vyd = vec_splats(y[i].d); + vector float vd = vec_mul(vxd, vyd); + + const uint8_t * restrict q3 = x[i].qs; + const uint8_t * restrict qh = x[i].qh; + const uint16_t * restrict signs = (const uint16_t *)(x[i].signs); + const uint8_t * restrict sc = x[i].scales; + const int8_t * restrict q8 = y[i].qs; + + vector signed int vsumi0 = vec_splats((int32_t)0); + vector signed int vsumi1 = vec_splats((int32_t)0); + vector signed int vsumi2 = vec_splats((int32_t)0); + vector signed int vsumi3 = vec_splats((int32_t)0); + vector signed int vsumi4 = vec_splats((int32_t)0); + vector signed int vsumi5 = vec_splats((int32_t)0); + vector signed int vsumi6 = vec_splats((int32_t)0); + vector signed int vsumi7 = vec_splats((int32_t)0); + + for (int j = 0; 
j < QK_K/32; j += 2) { + __builtin_prefetch(q3, 0, 1); + __builtin_prefetch(q8, 0, 1); + + vector unsigned int aux32x4_0 = {iq3s_grid[q3[ 0] | ((qh[0] << 8) & 256)], iq3s_grid[q3[ 1] | ((qh[0] << 7) & 256)], + iq3s_grid[q3[ 2] | ((qh[0] << 6) & 256)], iq3s_grid[q3[ 3] | ((qh[0] << 5) & 256)]}; + vector unsigned int aux32x4_1 = {iq3s_grid[q3[ 4] | ((qh[0] << 4) & 256)], iq3s_grid[q3[ 5] | ((qh[0] << 3) & 256)], + iq3s_grid[q3[ 6] | ((qh[0] << 2) & 256)], iq3s_grid[q3[ 7] | ((qh[0] << 1) & 256)]}; + vector unsigned int aux32x4_2 = {iq3s_grid[q3[ 8] | ((qh[1] << 8) & 256)], iq3s_grid[q3[ 9] | ((qh[1] << 7) & 256)], + iq3s_grid[q3[10] | ((qh[1] << 6) & 256)], iq3s_grid[q3[11] | ((qh[1] << 5) & 256)]}; + vector unsigned int aux32x4_3 = {iq3s_grid[q3[12] | ((qh[1] << 4) & 256)], iq3s_grid[q3[13] | ((qh[1] << 3) & 256)], + iq3s_grid[q3[14] | ((qh[1] << 2) & 256)], iq3s_grid[q3[15] | ((qh[1] << 1) & 256)]}; + q3 += 16; + qh += 2; + vector signed char vsigns01 = (vector signed char)vec_splats(*(const uint32_t *)&signs[0]); + vector signed char vsigns02 = (vector signed char)vec_splats(*(const uint32_t *)&signs[2]); signs += 4; - q3s.val[2] = vmulq_s8(vreinterpretq_s8_u8(vs.val[0]), vreinterpretq_s8_u32(aux32x4_2)); - q3s.val[3] = vmulq_s8(vreinterpretq_s8_u8(vs.val[1]), vreinterpretq_s8_u32(aux32x4_3)); + vector signed char vsigns0 = vec_perm(vsigns01, vsigns01, mask0); + vector signed char vsigns1 = vec_perm(vsigns01, vsigns01, mask1); + vector signed char vsigns2 = vec_perm(vsigns02, vsigns02, mask0); + vector signed char vsigns3 = vec_perm(vsigns02, vsigns02, mask1); - const int32x4_t p1 = lm_ggml_vdotq_s32(lm_ggml_vdotq_s32(vdupq_n_s32(0), q3s.val[0], q8b.val[0]), q3s.val[1], q8b.val[1]); - const int32x4_t p2 = lm_ggml_vdotq_s32(lm_ggml_vdotq_s32(vdupq_n_s32(0), q3s.val[2], q8b.val[2]), q3s.val[3], q8b.val[3]); -#if QK_K == 256 - sumi1 += vaddvq_s32(p1) * scales8[ib32/2+0]; - sumi2 += vaddvq_s32(p2) * scales8[ib32/2+4]; -#else - sumi1 += vaddvq_s32(p1) * (1 + 
2*(x[i].scales[ib32/2] & 0xf)); - sumi2 += vaddvq_s32(p2) * (1 + 2*(x[i].scales[ib32/2] >> 4)); -#endif + vsigns0 = (vector signed char)vec_cmpeq(vec_and(vsigns0, mask2), mask2); + vsigns1 = (vector signed char)vec_cmpeq(vec_and(vsigns1, mask2), mask2); + vsigns2 = (vector signed char)vec_cmpeq(vec_and(vsigns2, mask2), mask2); + vsigns3 = (vector signed char)vec_cmpeq(vec_and(vsigns3, mask2), mask2); + + vector signed char q3x0 = vec_sub(vec_xor(vsigns0, (vector signed char)aux32x4_0), vsigns0); + vector signed char q3x1 = vec_sub(vec_xor(vsigns1, (vector signed char)aux32x4_1), vsigns1); + vector signed char q3x2 = vec_sub(vec_xor(vsigns2, (vector signed char)aux32x4_2), vsigns2); + vector signed char q3x3 = vec_sub(vec_xor(vsigns3, (vector signed char)aux32x4_3), vsigns3); + + vector signed char q8y0 = vec_xl( 0, q8); + vector signed char q8y1 = vec_xl(16, q8); + vector signed char q8y2 = vec_xl(32, q8); + vector signed char q8y3 = vec_xl(48, q8); + q8 += 64; + + vector signed short qv0 = vec_add(vec_mule(q3x0, q8y0), vec_mulo(q3x0, q8y0)); + vector signed short qv1 = vec_add(vec_mule(q3x1, q8y1), vec_mulo(q3x1, q8y1)); + vector signed short qv2 = vec_add(vec_mule(q3x2, q8y2), vec_mulo(q3x2, q8y2)); + vector signed short qv3 = vec_add(vec_mule(q3x3, q8y3), vec_mulo(q3x3, q8y3)); + + const uint16_t ls0 = (uint16_t)(sc[0] & 0xf); + const uint16_t ls1 = (uint16_t)(sc[0] >> 4); + sc ++; + + vector signed short vscales01 = (vector signed short)vec_splats((uint16_t)(2*ls0+1)); + vector signed short vscales23 = (vector signed short)vec_splats((uint16_t)(2*ls1+1)); + + vsumi0 = vec_add(vec_mule(qv0, vscales01), vsumi0); + vsumi1 = vec_add(vec_mule(qv1, vscales01), vsumi1); + vsumi2 = vec_add(vec_mule(qv2, vscales23), vsumi2); + vsumi3 = vec_add(vec_mule(qv3, vscales23), vsumi3); + vsumi4 = vec_add(vec_mulo(qv0, vscales01), vsumi4); + vsumi5 = vec_add(vec_mulo(qv1, vscales01), vsumi5); + vsumi6 = vec_add(vec_mulo(qv2, vscales23), vsumi6); + vsumi7 = vec_add(vec_mulo(qv3, 
vscales23), vsumi7); } - sumf += d*(sumi1 + sumi2); + + vsumi0 = vec_add(vsumi0, vsumi4); + vsumi1 = vec_add(vsumi1, vsumi5); + vsumi2 = vec_add(vsumi2, vsumi6); + vsumi3 = vec_add(vsumi3, vsumi7); + + vsumf0 = vec_madd(vec_ctf(vsumi0, 0), vd, vsumf0); + vsumf1 = vec_madd(vec_ctf(vsumi1, 0), vd, vsumf1); + vsumf2 = vec_madd(vec_ctf(vsumi2, 0), vd, vsumf2); + vsumf3 = vec_madd(vec_ctf(vsumi3, 0), vd, vsumf3); } - *s = sumf; -#elif defined(__AVX2__) + vsumf0 = vec_add(vsumf0, vsumf2); + vsumf1 = vec_add(vsumf1, vsumf3); + + vsumf0 = vec_add(vsumf0, vsumf1); + + vsumf0 = vec_add(vsumf0, vec_sld(vsumf0, vsumf0, 4)); + vsumf0 = vec_add(vsumf0, vec_sld(vsumf0, vsumf0, 8)); + + *s = vec_extract(vsumf0, 0); + +#elif defined(__loongarch_asx) static const uint8_t k_mask1[32] = {0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x02, 0x02, 0x02, 0x02, 0x02, 0x02, 0x02, 0x02, 0x03, 0x03, 0x03, 0x03, 0x03, 0x03, 0x03, 0x03 @@ -9200,11 +10522,11 @@ void lm_ggml_vec_dot_iq3_s_q8_K (int n, float * restrict s, size_t bs, const voi 0x01, 0x02, 0x04, 0x08, 0x10, 0x20, 0x40, 0x80, 0x01, 0x02, 0x04, 0x08, 0x10, 0x20, 0x40, 0x80, }; - const __m256i mask1 = _mm256_loadu_si256((const __m256i*)k_mask1); - const __m256i mask2 = _mm256_loadu_si256((const __m256i*)k_mask2); + const __m256i mask1 = __lasx_xvld((const __m256i*)k_mask1, 0); + const __m256i mask2 = __lasx_xvld((const __m256i*)k_mask2, 0); - const __m256i idx_shift = _mm256_set_epi32(1, 2, 3, 4, 5, 6, 7, 8); - const __m256i idx_mask = _mm256_set1_epi32(256); + __m256i idx_shift = lasx_set_w(1, 2, 3, 4, 5, 6, 7, 8); + const __m256i idx_mask = __lasx_xvreplgr2vr_w(256); typedef union { __m256i vec[2]; @@ -9213,62 +10535,61 @@ void lm_ggml_vec_dot_iq3_s_q8_K (int n, float * restrict s, size_t bs, const voi index_t idx; - __m256 accumf = _mm256_setzero_ps(); + __m256 accumf = (__m256)__lasx_xvldi(0); for (int i = 0; i < nb; ++i) { const float d = LM_GGML_FP16_TO_FP32(x[i].d) * y[i].d; 
const uint8_t * restrict qs = x[i].qs; const uint8_t * restrict qh = x[i].qh; const uint16_t * restrict signs = (const uint16_t *)x[i].signs; const int8_t * restrict q8 = y[i].qs; - __m256i sumi1 = _mm256_setzero_si256(); - __m256i sumi2 = _mm256_setzero_si256(); + __m256i sumi1 = __lasx_xvldi(0); + __m256i sumi2 = __lasx_xvldi(0); for (int ib32 = 0; ib32 < QK_K/32; ib32 += 2) { - const __m256i q8_1 = _mm256_loadu_si256((const __m256i *)q8); q8 += 32; - const __m256i q8_2 = _mm256_loadu_si256((const __m256i *)q8); q8 += 32; - const __m256i idx_l = _mm256_cvtepu8_epi16(_mm_loadu_si128((const __m128i *)qs)); qs += 16; - idx.vec[0] = _mm256_set1_epi32(qh[ib32+0]); - idx.vec[1] = _mm256_set1_epi32(qh[ib32+1]); - idx.vec[0] = _mm256_and_si256(_mm256_sllv_epi32(idx.vec[0], idx_shift), idx_mask); - idx.vec[1] = _mm256_and_si256(_mm256_sllv_epi32(idx.vec[1], idx_shift), idx_mask); - idx.vec[0] = _mm256_or_si256(idx.vec[0], _mm256_cvtepi16_epi32(_mm256_castsi256_si128(idx_l))); - idx.vec[1] = _mm256_or_si256(idx.vec[1], _mm256_cvtepi16_epi32(_mm256_extractf128_si256(idx_l, 1))); + const __m256i q8_1 = __lasx_xvld((const __m256i *)q8, 0); q8 += 32; + const __m256i q8_2 = __lasx_xvld((const __m256i *)q8, 0); q8 += 32; + const __m256i idx_l = lasx_extu8_16(__lsx_vld(qs, 0)); qs += 16; + idx.vec[0] = __lasx_xvreplgr2vr_w(qh[ib32+0]); + idx.vec[1] = __lasx_xvreplgr2vr_w(qh[ib32+1]); + idx.vec[0] = __lasx_xvand_v(__lasx_xvsll_w(idx.vec[0], idx_shift), idx_mask); + idx.vec[1] = __lasx_xvand_v(__lasx_xvsll_w(idx.vec[1], idx_shift), idx_mask); + idx.vec[0] = __lasx_xvor_v(idx.vec[0], lasx_ext16_32(lasx_extracti128(idx_l, 0))); + idx.vec[1] = __lasx_xvor_v(idx.vec[1], lasx_ext16_32(lasx_extracti128(idx_l, 1))); // At leat on my CPU (Ryzen 7950X), using _mm256_i32gather_epi32 is slower than _mm256_set_epi32. Strange. 
//const __m256i q2_1 = _mm256_i32gather_epi32((const int *)iq3s_grid, idx.vec[0], 4); //const __m256i q2_2 = _mm256_i32gather_epi32((const int *)iq3s_grid, idx.vec[1], 4); - const __m256i q2_1 = _mm256_set_epi32( + const __m256i q2_1 = lasx_set_w( iq3s_grid[idx.index[7]], iq3s_grid[idx.index[6]], iq3s_grid[idx.index[5]], iq3s_grid[idx.index[4]], iq3s_grid[idx.index[3]], iq3s_grid[idx.index[2]], iq3s_grid[idx.index[1]], iq3s_grid[idx.index[0]] ); - const __m256i q2_2 = _mm256_set_epi32( + const __m256i q2_2 = lasx_set_w( iq3s_grid[idx.index[15]], iq3s_grid[idx.index[14]], iq3s_grid[idx.index[13]], iq3s_grid[idx.index[12]], iq3s_grid[idx.index[11]], iq3s_grid[idx.index[10]], iq3s_grid[idx.index[ 9]], iq3s_grid[idx.index[ 8]] ); - __m256i aux256 = _mm256_set1_epi32(signs[0] | (signs[1] << 16)); - aux256 = _mm256_and_si256(_mm256_shuffle_epi8(aux256,mask1), mask2); - const __m256i s2_1 = _mm256_cmpeq_epi8(aux256, mask2); - const __m256i q8s_1 = _mm256_sub_epi8(_mm256_xor_si256(s2_1, q8_1), s2_1); + __m256i aux256 = __lasx_xvreplgr2vr_w(signs[0] | (signs[1] << 16)); + aux256 = __lasx_xvand_v(lasx_shuffle_b(aux256,mask1), mask2); + const __m256i s2_1 = __lasx_xvseq_b(aux256, mask2); + const __m256i q8s_1 = __lasx_xvsub_b(__lasx_xvxor_v(s2_1, q8_1), s2_1); - aux256 = _mm256_set1_epi32(signs[2] | (signs[3] << 16)); - aux256 = _mm256_and_si256(_mm256_shuffle_epi8(aux256,mask1), mask2); - const __m256i s2_2 = _mm256_cmpeq_epi8(aux256, mask2); - const __m256i q8s_2 = _mm256_sub_epi8(_mm256_xor_si256(s2_2, q8_2), s2_2); + aux256 = __lasx_xvreplgr2vr_w(signs[2] | (signs[3] << 16)); + aux256 = __lasx_xvand_v(lasx_shuffle_b(aux256,mask1), mask2); + const __m256i s2_2 = __lasx_xvseq_b(aux256, mask2); + const __m256i q8s_2 = __lasx_xvsub_b(__lasx_xvxor_v(s2_2, q8_2), s2_2); signs += 4; - const __m256i dot1 = _mm256_maddubs_epi16(q2_1, q8s_1); - const __m256i dot2 = _mm256_maddubs_epi16(q2_2, q8s_2); + const __m256i dot1 = lasx_maddubs_h(q2_1, q8s_1); + const __m256i dot2 = 
lasx_maddubs_h(q2_2, q8s_2); const uint16_t ls1 = x[i].scales[ib32/2] & 0xf; const uint16_t ls2 = x[i].scales[ib32/2] >> 4; - const __m256i p1 = _mm256_madd_epi16(dot1, _mm256_set1_epi16(2*ls1+1)); - const __m256i p2 = _mm256_madd_epi16(dot2, _mm256_set1_epi16(2*ls2+1)); - sumi1 = _mm256_add_epi32(sumi1, p1); - sumi2 = _mm256_add_epi32(sumi2, p2); + const __m256i p1 = lasx_madd_h(dot1, __lasx_xvreplgr2vr_h(2*ls1+1)); + const __m256i p2 = lasx_madd_h(dot2, __lasx_xvreplgr2vr_h(2*ls2+1)); + sumi1 = __lasx_xvadd_w(sumi1, p1); + sumi2 = __lasx_xvadd_w(sumi2, p2); } - accumf = _mm256_fmadd_ps(_mm256_set1_ps(d), _mm256_cvtepi32_ps(_mm256_add_epi32(sumi1, sumi2)), accumf); - + accumf = __lasx_xvfmadd_s(__lasx_xvreplfr2vr_s(d), __lasx_xvffint_s_w(__lasx_xvadd_w(sumi1, sumi2)), accumf); } *s = hsum_float_8(accumf); @@ -9320,12 +10641,22 @@ void lm_ggml_vec_dot_iq3_s_q8_K (int n, float * restrict s, size_t bs, const voi } -#ifdef __AVX2__ +#if defined(__AVX2__) static inline __m256i mul_add_epi8(const __m256i x, const __m256i y) { const __m256i ax = _mm256_sign_epi8(x, x); const __m256i sy = _mm256_sign_epi8(y, x); return _mm256_maddubs_epi16(ax, sy); } +#elif defined(__loongarch_asx) +static inline __m256i mul_add_epi8(const __m256i x, const __m256i y) { + const __m256i ax = __lasx_xvsigncov_b(x, x); + const __m256i sy = __lasx_xvsigncov_b(x, y); + __m256i tmp1, tmp2, tmp3; + tmp1 = __lasx_xvmulwev_h_bu_b(ax, sy); + tmp2 = __lasx_xvmulwod_h_bu_b(ax, sy); + tmp3 = __lasx_xvadd_h(tmp1, tmp2); + return __lasx_xvsat_h(tmp3, 15); +} #endif void lm_ggml_vec_dot_iq1_s_q8_K (int n, float * restrict s, size_t bs, const void * restrict vx, size_t bx, const void * restrict vy, size_t by, int nrc) { @@ -9427,6 +10758,169 @@ void lm_ggml_vec_dot_iq1_s_q8_K (int n, float * restrict s, size_t bs, const vo *s = hsum_float_8(accum) + IQ1S_DELTA * accum1; +#elif defined(__POWER9_VECTOR__) + const vector unsigned char v0 = vec_splats((unsigned char)0x0); + const vector unsigned short vsign = 
vec_splats((unsigned short)0x8000); + + vector float vsumf0 = vec_splats(0.0f); + vector float vsumf1 = vec_splats(0.0f); + vector float vsumf2 = vec_splats(0.0f); + vector float vsumf3 = vec_splats(0.0f); + + for (int i = 0; i < nb; ++i) { + vector float vxd = vec_splats(LM_GGML_FP16_TO_FP32(x[i].d)); + vector float vyd = vec_splats(y[i].d); + vector float vd = vec_mul(vxd, vyd); + + vector signed int vsumi0 = vec_splats((int32_t)0); + vector signed int vsumi1 = vec_splats((int32_t)0); + vector signed int vsumi2 = vec_splats((int32_t)0); + vector signed int vsumi3 = vec_splats((int32_t)0); + vector signed int vsumi4 = vec_splats((int32_t)0); + vector signed int vsumi5 = vec_splats((int32_t)0); + vector signed int vsumi6 = vec_splats((int32_t)0); + vector signed int vsumi7 = vec_splats((int32_t)0); + vector signed int vsumi8 = vec_splats((int32_t)0); + + const uint8_t * restrict q1 = x[i].qs; + const uint16_t * restrict qh = x[i].qh; + const int8_t * restrict q8 = y[i].qs; + const int16_t * restrict qs = y[i].bsums; + + for (int j = 0; j < QK_K/32; j += 2) { + __builtin_prefetch(q1, 0, 1); + __builtin_prefetch(qh, 0, 1); + __builtin_prefetch(q8, 0, 1); + + vector signed long long aux64x2_0 = {*(const int64_t *)(iq1s_grid + (q1[0] | ((qh[0] << 8) & 0x700))), *(const int64_t *)(iq1s_grid + (q1[1] | ((qh[0] << 5) & 0x700)))}; + vector signed long long aux64x2_1 = {*(const int64_t *)(iq1s_grid + (q1[2] | ((qh[0] << 2) & 0x700))), *(const int64_t *)(iq1s_grid + (q1[3] | ((qh[0] >> 1) & 0x700)))}; + vector signed long long aux64x2_2 = {*(const int64_t *)(iq1s_grid + (q1[4] | ((qh[1] << 8) & 0x700))), *(const int64_t *)(iq1s_grid + (q1[5] | ((qh[1] << 5) & 0x700)))}; + vector signed long long aux64x2_3 = {*(const int64_t *)(iq1s_grid + (q1[6] | ((qh[1] << 2) & 0x700))), *(const int64_t *)(iq1s_grid + (q1[7] | ((qh[1] >> 1) & 0x700)))}; + q1 += 8; + + vector signed char q1x0 = (vector signed char)aux64x2_0; + vector signed char q1x1 = (vector signed char)aux64x2_1; + 
vector signed char q1x2 = (vector signed char)aux64x2_2; + vector signed char q1x3 = (vector signed char)aux64x2_3; + + vector signed char q8y0 = vec_xl( 0, q8); + vector signed char q8y1 = vec_xl(16, q8); + vector signed char q8y2 = vec_xl(32, q8); + vector signed char q8y3 = vec_xl(48, q8); + q8 += 64; + + vector signed short qv0 = vec_add(vec_mule(q1x0, q8y0), vec_mulo(q1x0, q8y0)); + vector signed short qv1 = vec_add(vec_mule(q1x1, q8y1), vec_mulo(q1x1, q8y1)); + vector signed short qv2 = vec_add(vec_mule(q1x2, q8y2), vec_mulo(q1x2, q8y2)); + vector signed short qv3 = vec_add(vec_mule(q1x3, q8y3), vec_mulo(q1x3, q8y3)); + + const uint16_t ls0 = (uint16_t)((qh[0] >> 12) & 7); + const uint16_t ls1 = (uint16_t)((qh[1] >> 12) & 7); + + vector signed short vscales01 = (vector signed short)vec_splats((uint16_t)(2*ls0+1)); + vector signed short vscales23 = (vector signed short)vec_splats((uint16_t)(2*ls1+1)); + vector signed short vscales = vec_sld(vscales23, vscales01, 8); + + vsumi0 = vec_add(vec_mule(qv0, vscales01), vsumi0); + vsumi1 = vec_add(vec_mule(qv1, vscales01), vsumi1); + vsumi2 = vec_add(vec_mule(qv2, vscales23), vsumi2); + vsumi3 = vec_add(vec_mule(qv3, vscales23), vsumi3); + vsumi4 = vec_add(vec_mulo(qv0, vscales01), vsumi4); + vsumi5 = vec_add(vec_mulo(qv1, vscales01), vsumi5); + vsumi6 = vec_add(vec_mulo(qv2, vscales23), vsumi6); + vsumi7 = vec_add(vec_mulo(qv3, vscales23), vsumi7); + + vector signed short q8ysums = vec_xl_len(qs, 8); + qs += 4; + q8ysums = vec_mergeh(q8ysums, (vector signed short)v0); + + vector signed short qxh = (vector signed short)vec_sld(vec_splats(qh[1]), vec_splats(qh[0]), 8); + qh += 2; + vector __bool short vsel = vec_cmpge(qxh, (vector signed short)v0); + + vector signed short q8ysum = vec_sel((vector signed short)vec_xor((vector unsigned short)q8ysums, vsign), q8ysums, vsel); + + vsumi8 = vec_add(vec_mule(q8ysum, vscales), vsumi8); + } + + vsumi0 = vec_add(vsumi0, vsumi4); + vsumi1 = vec_add(vsumi1, vsumi5); + vsumi2 = 
vec_add(vsumi2, vsumi6); + vsumi3 = vec_add(vsumi3, vsumi7); + + vsumf0 = vec_madd(vec_ctf(vsumi0, 0), vd, vsumf0); + vsumf1 = vec_madd(vec_ctf(vsumi1, 0), vd, vsumf1); + vsumf2 = vec_madd(vec_ctf(vsumi2, 0), vd, vsumf2); + vsumf3 = vec_madd(vec_ctf(vsumi3, 0), vd, vsumf3); + + vsumf0 = vec_madd(vec_ctf(vsumi8, 0), vec_mul(vd, vec_splats(IQ1S_DELTA)), vsumf0); + } + + vsumf0 = vec_add(vsumf0, vsumf2); + vsumf1 = vec_add(vsumf1, vsumf3); + + vsumf0 = vec_add(vsumf0, vsumf1); + + vsumf0 = vec_add(vsumf0, vec_sld(vsumf0, vsumf0, 4)); + vsumf0 = vec_add(vsumf0, vec_sld(vsumf0, vsumf0, 8)); + + *s = vec_extract(vsumf0, 0); + +#elif defined(__loongarch_asx) + + __m256 accum = (__m256)__lasx_xvldi(0); + float accum1 = 0; + for (int i = 0; i < nb; ++i) { + + const int8_t * q8 = y[i].qs; + const uint8_t * qs = x[i].qs; + const uint16_t * qh = x[i].qh; + + __m256i sumi = __lasx_xvldi(0); + int sumi1 = 0; + for (int ib = 0; ib < QK_K/32; ib += 2) { + __m256i q1b_1 = __lasx_xvinsgr2vr_d(q1b_1, iq1s_grid[qs[0] | ((qh[ib+0] << 8) & 0x700)], 0); + q1b_1 = __lasx_xvinsgr2vr_d(q1b_1, iq1s_grid[qs[1] | ((qh[ib+0] << 5) & 0x700)], 1); + q1b_1 = __lasx_xvinsgr2vr_d(q1b_1, iq1s_grid[qs[2] | ((qh[ib+0] << 2) & 0x700)], 2); + q1b_1 = __lasx_xvinsgr2vr_d(q1b_1, iq1s_grid[qs[3] | ((qh[ib+0] >> 1) & 0x700)], 3); + + __m256i q1b_2 = __lasx_xvinsgr2vr_d(q1b_2, iq1s_grid[qs[4] | ((qh[ib+1] << 8) & 0x700)], 0); + q1b_2 = __lasx_xvinsgr2vr_d(q1b_2, iq1s_grid[qs[5] | ((qh[ib+1] << 5) & 0x700)], 1); + q1b_2 = __lasx_xvinsgr2vr_d(q1b_2, iq1s_grid[qs[6] | ((qh[ib+1] << 2) & 0x700)], 2); + q1b_2 = __lasx_xvinsgr2vr_d(q1b_2, iq1s_grid[qs[7] | ((qh[ib+1] >> 1) & 0x700)], 3); + + qs += 8; + const __m256i q8b_1 = __lasx_xvld((const __m256i*)q8, 0); q8 += 32; + const __m256i q8b_2 = __lasx_xvld((const __m256i*)q8, 0); q8 += 32; + + const __m256i dot1 = mul_add_epi8(q1b_1, q8b_1); + const __m256i dot2 = mul_add_epi8(q1b_2, q8b_2); + const int16_t ls1 = 2*((qh[ib+0] >> 12) & 7) + 1; + const int16_t ls2 = 
2*((qh[ib+1] >> 12) & 7) + 1; + + __m256i tmp1, tmp5, tmp6; + tmp1 = __lasx_xvreplgr2vr_h(ls1); + tmp5 = __lasx_xvmulwev_w_h(dot1, tmp1); + tmp6 = __lasx_xvmulwod_w_h(dot1, tmp1); + const __m256i p1 = __lasx_xvadd_w(tmp5, tmp6); + + tmp1 = __lasx_xvreplgr2vr_h(ls2); + tmp5 = __lasx_xvmulwev_w_h(dot2, tmp1); + tmp6 = __lasx_xvmulwod_w_h(dot2, tmp1); + const __m256i p2 = __lasx_xvadd_w(tmp5, tmp6); + + sumi = __lasx_xvadd_w(sumi, __lasx_xvadd_w(p1, p2)); + sumi1 += (y[i].bsums[2*ib+0] + y[i].bsums[2*ib+1]) * (qh[ib+0] & 0x8000 ? -1 : 1) * ls1 + + (y[i].bsums[2*ib+2] + y[i].bsums[2*ib+3]) * (qh[ib+1] & 0x8000 ? -1 : 1) * ls2; + } + + const float d = y[i].d * LM_GGML_FP16_TO_FP32(x[i].d); + accum = __lasx_xvfmadd_s(__lasx_xvreplfr2vr_s(d), __lasx_xvffint_s_w(sumi), accum); + accum1 += d * sumi1; + } + + *s = hsum_float_8(accum) + IQ1S_DELTA * accum1; + #else float sumf = 0; @@ -9474,17 +10968,10 @@ void lm_ggml_vec_dot_iq1_m_q8_K (int n, float * restrict s, size_t bs, const vo const int nb = n / QK_K; -#if QK_K != 64 iq1m_scale_t scale; -#endif #if defined __ARM_NEON - -#if QK_K == 64 - const int32x4_t mask = vdupq_n_s32(0xf); -#else const int32x4_t mask = vdupq_n_s32(0x7); -#endif const int32x4_t mone = vdupq_n_s32(1); const int32x4_t mzero = vdupq_n_s32(0); @@ -9508,9 +10995,7 @@ void lm_ggml_vec_dot_iq1_m_q8_K (int n, float * restrict s, size_t bs, const vo const uint8_t * qh = x[i].qh; const uint16_t * sc = (const uint16_t *)x[i].scales; -#if QK_K != 64 scale.u16 = (sc[0] >> 12) | ((sc[1] >> 8) & 0x00f0) | ((sc[2] >> 4) & 0x0f00) | (sc[3] & 0xf000); -#endif int32x4_t sumi1 = mzero; int32x4_t sumi2 = mzero; @@ -9539,11 +11024,8 @@ void lm_ggml_vec_dot_iq1_m_q8_K (int n, float * restrict s, size_t bs, const vo const int32x4_t p4 = vpaddq_s32(lm_ggml_vdotq_s32(mzero, deltas.val[aux8[2]], q8b.val[2]), lm_ggml_vdotq_s32(mzero, deltas.val[aux8[3]], q8b.val[3])); const int32x4_t p34 = vpaddq_s32(p3, p4); -#if QK_K == 64 - int32x4_t scales_4 = lm_ggml_vld1q_u32(sc[0] >> 0, 
sc[0] >> 4, sc[0] >> 8, sc[0] >> 12); -#else int32x4_t scales_4 = lm_ggml_vld1q_u32(sc[ib/2] >> 0, sc[ib/2] >> 3, sc[ib/2] >> 6, sc[ib/2] >> 9); -#endif + scales_4 = vaddq_s32(vshlq_n_s32(vandq_s32(scales_4, mask), 1), mone); sumi1 = vmlaq_s32(sumi1, scales_4, p12); @@ -9553,22 +11035,14 @@ void lm_ggml_vec_dot_iq1_m_q8_K (int n, float * restrict s, size_t bs, const vo } -#if QK_K == 64 - sumf += y[i].d * LM_GGML_FP16_TO_FP32(x[i].d) * (vaddvq_s32(sumi1) + IQ1M_DELTA * vaddvq_s32(sumi2)); -#else sumf += y[i].d * LM_GGML_FP16_TO_FP32(scale.f16) * (vaddvq_s32(sumi1) + IQ1M_DELTA * vaddvq_s32(sumi2)); -#endif } *s = sumf; #elif defined __AVX2__ -#if QK_K == 64 - const __m256i mask = _mm256_set1_epi16(0xf); -#else const __m256i mask = _mm256_set1_epi16(0x7); -#endif const __m256i mone = _mm256_set1_epi16(1); __m256 accum1 = _mm256_setzero_ps(); @@ -9580,9 +11054,7 @@ void lm_ggml_vec_dot_iq1_m_q8_K (int n, float * restrict s, size_t bs, const vo const uint8_t * qh = x[i].qh; const uint16_t * sc = (const uint16_t *)x[i].scales; -#if QK_K != 64 scale.u16 = (sc[0] >> 12) | ((sc[1] >> 8) & 0x00f0) | ((sc[2] >> 4) & 0x0f00) | (sc[3] & 0xf000); -#endif __m256i sumi1 = _mm256_setzero_si256(); __m256i sumi2 = _mm256_setzero_si256(); @@ -9612,13 +11084,10 @@ void lm_ggml_vec_dot_iq1_m_q8_K (int n, float * restrict s, size_t bs, const vo const __m256i dot3 = mul_add_epi8(delta1, q8b_1); const __m256i dot4 = mul_add_epi8(delta2, q8b_2); -#if QK_K == 64 - __m256i scale1 = MM256_SET_M128I(_mm_set1_epi16(sc[0] >> 4), _mm_set1_epi16(sc[0] >> 0)); - __m256i scale2 = MM256_SET_M128I(_mm_set1_epi16(sc[0] >> 12), _mm_set1_epi16(sc[0] >> 8)); -#else + __m256i scale1 = MM256_SET_M128I(_mm_set1_epi16(sc[ib/2] >> 3), _mm_set1_epi16(sc[ib/2] >> 0)); __m256i scale2 = MM256_SET_M128I(_mm_set1_epi16(sc[ib/2] >> 9), _mm_set1_epi16(sc[ib/2] >> 6)); -#endif + scale1 = _mm256_add_epi16(_mm256_slli_epi16(_mm256_and_si256(scale1, mask), 1), mone); scale2 = 
_mm256_add_epi16(_mm256_slli_epi16(_mm256_and_si256(scale2, mask), 1), mone); const __m256i p1 = _mm256_madd_epi16(dot1, scale1); @@ -9632,14 +11101,10 @@ void lm_ggml_vec_dot_iq1_m_q8_K (int n, float * restrict s, size_t bs, const vo qs += 8; qh += 4; } -#if QK_K == 64 - const __m256 d = _mm256_set1_ps(y[i].d * LM_GGML_FP16_TO_FP32(x[i].d)); -#else const __m256 d = _mm256_set1_ps(y[i].d * LM_GGML_FP16_TO_FP32(scale.f16)); -#endif + accum1 = _mm256_fmadd_ps(d, _mm256_cvtepi32_ps(sumi1), accum1); accum2 = _mm256_fmadd_ps(d, _mm256_cvtepi32_ps(sumi2), accum2); - } *s = hsum_float_8(accum1) + IQ1M_DELTA * hsum_float_8(accum2); @@ -9656,9 +11121,7 @@ void lm_ggml_vec_dot_iq1_m_q8_K (int n, float * restrict s, size_t bs, const vo const uint8_t * qh = x[i].qh; const uint16_t * sc = (const uint16_t *)x[i].scales; -#if QK_K != 64 scale.u16 = (sc[0] >> 12) | ((sc[1] >> 8) & 0x00f0) | ((sc[2] >> 4) & 0x0f00) | (sc[3] & 0xf000); -#endif int sumi1 = 0, sumi2 = 0; for (int ib = 0; ib < QK_K/32; ++ib) { @@ -9678,24 +11141,17 @@ void lm_ggml_vec_dot_iq1_m_q8_K (int n, float * restrict s, size_t bs, const vo sum1[l/2] += lsum1; sum2[l/2] += lsum2*delta[l]; } -#if QK_K == 64 - const int ls1 = 2*((sc[0] >> (8*(ib%2)+0)) & 0xf) + 1; - const int ls2 = 2*((sc[0] >> (8*(ib%2)+4)) & 0xf) + 1; -#else + const int ls1 = 2*((sc[ib/2] >> (6*(ib%2)+0)) & 0x7) + 1; const int ls2 = 2*((sc[ib/2] >> (6*(ib%2)+3)) & 0x7) + 1; -#endif + sumi1 += sum1[0] * ls1 + sum1[1] * ls2; sumi2 += sum2[0] * ls1 + sum2[1] * ls2; qs += 4; qh += 2; } -#if QK_K == 64 - sumf += LM_GGML_FP16_TO_FP32(x[i].d) * y[i].d * (sumi1 + IQ1M_DELTA * sumi2); -#else sumf += LM_GGML_FP16_TO_FP32(scale.f16) * y[i].d * (sumi1 + IQ1M_DELTA * sumi2); -#endif } *s = sumf; @@ -9783,6 +11239,84 @@ void lm_ggml_vec_dot_iq4_nl_q8_0(int n, float * restrict s, size_t bs, const voi *s = hsum_float_8(_mm256_add_ps(accum1, accum2)); +#elif defined(__POWER9_VECTOR__) + const vector signed char lowMask = vec_splats((signed char)0xF); + const 
vector unsigned char v4 = vec_splats((unsigned char)0x4); + + vector float vsumf0 = vec_splats(0.0f); + vector float vsumf1 = vec_splats(0.0f); + + const vector signed char values = vec_xl( 0, kvalues_iq4nl); + +#pragma GCC unroll 4 + for (int ib = 0; ib < nb; ++ib) { + __builtin_prefetch(x[ib].qs, 0, 1); + __builtin_prefetch(y[ib].qs, 0, 1); + + + vector float vxd = vec_splats(LM_GGML_FP16_TO_FP32(x[ib].d)); + vector float vyd = vec_splats(LM_GGML_FP16_TO_FP32(y[ib].d)); + vector float vd = vec_mul(vxd, vyd); + + vector signed char qxs = (vector signed char)vec_xl( 0, x[ib].qs); + vector signed char q4x0 = vec_and(qxs, lowMask); + vector signed char q4x1 = vec_sr(qxs, v4); + + q4x0 = vec_perm(values, values, (vector unsigned char)q4x0); + q4x1 = vec_perm(values, values, (vector unsigned char)q4x1); + + vector signed char q8y0 = vec_xl( 0, y[ib].qs); + vector signed char q8y1 = vec_xl(16, y[ib].qs); + + vector signed short qv0 = vec_add(vec_mule(q4x0, q8y0), vec_mulo(q4x0, q8y0)); + vector signed short qv1 = vec_add(vec_mule(q4x1, q8y1), vec_mulo(q4x1, q8y1)); + + vector signed int vsumi0 = vec_add(vec_unpackh(qv0), vec_unpackl(qv0)); + vector signed int vsumi1 = vec_add(vec_unpackh(qv1), vec_unpackl(qv1)); + + vsumf0 = vec_madd(vec_ctf(vsumi0, 0), vd, vsumf0); + vsumf1 = vec_madd(vec_ctf(vsumi1, 0), vd, vsumf1); + } + + vsumf0 = vec_add(vsumf0, vsumf1); + + vsumf0 = vec_add(vsumf0, vec_sld(vsumf0, vsumf0, 4)); + vsumf0 = vec_add(vsumf0, vec_sld(vsumf0, vsumf0, 8)); + + *s = vec_extract(vsumf0, 0); + +#elif defined (__loongarch_asx) + + const __m128i values128 = __lsx_vld((const __m128i*)kvalues_iq4nl, 0); + const __m128i m4b = __lsx_vreplgr2vr_b(0x0f); + const __m256i mone = __lasx_xvreplgr2vr_h(1); + + __m256 accum1 = (__m256)__lasx_xvldi(0); + __m256 accum2 = (__m256)__lasx_xvldi(0); + for (int ib = 0; ib < nb; ib += 2) { + const __m128i q4bits_1 = __lsx_vld((const __m128i*)x[0].qs, 0); + const __m128i q4bits_2 = __lsx_vld((const __m128i*)x[1].qs, 0); + const 
__m256i q8b_1 = __lasx_xvld((const __m256i *)y[0].qs, 0); + const __m256i q8b_2 = __lasx_xvld((const __m256i *)y[1].qs, 0); + const __m256i q4b_1 = lasx_insertf128(lsx_shuffle_b(values128, __lsx_vand_v(__lsx_vsrli_h(q4bits_1, 4), m4b)), + lsx_shuffle_b(values128, __lsx_vand_v(q4bits_1, m4b))); + const __m256i q4b_2 = lasx_insertf128(lsx_shuffle_b(values128, __lsx_vand_v(__lsx_vsrli_h(q4bits_2, 4), m4b)), + lsx_shuffle_b(values128, __lsx_vand_v(q4bits_2, m4b))); + const __m256i p16_1 = mul_add_epi8(q4b_1, q8b_1); + const __m256i p16_2 = mul_add_epi8(q4b_2, q8b_2); + const __m256i p_1 = lasx_madd_h(p16_1, mone); + const __m256i p_2 = lasx_madd_h(p16_2, mone); + accum1 = __lasx_xvfmadd_s(__lasx_xvreplfr2vr_s(LM_GGML_FP16_TO_FP32(y[0].d)*LM_GGML_FP16_TO_FP32(x[0].d)), + __lasx_xvffint_s_w(p_1), accum1); + accum2 = __lasx_xvfmadd_s(__lasx_xvreplfr2vr_s(LM_GGML_FP16_TO_FP32(y[1].d)*LM_GGML_FP16_TO_FP32(x[1].d)), + __lasx_xvffint_s_w(p_2), accum2); + + y += 2; + x += 2; + } + + *s = hsum_float_8(__lasx_xvfadd_s(accum1, accum2)); + #else float sumf = 0; for (int ib = 0; ib < nb; ++ib) { @@ -9805,9 +11339,6 @@ void lm_ggml_vec_dot_iq4_xs_q8_K(int n, float * restrict s, size_t bs, const voi UNUSED(by); UNUSED(bs); assert(n % QK_K == 0); -#if QK_K == 64 - lm_ggml_vec_dot_iq4_nl_q8_0(n, s, bs, vx, bx, vy, by, nrc); -#else const block_iq4_xs * restrict x = vx; const block_q8_K * restrict y = vy; @@ -9894,6 +11425,179 @@ void lm_ggml_vec_dot_iq4_xs_q8_K(int n, float * restrict s, size_t bs, const voi *s = hsum_float_8(accum); +#elif defined(__POWER9_VECTOR__) + const vector signed char lowMask = vec_splats((signed char)0xF); + const vector unsigned char v4 = vec_splats((unsigned char)0x4); + + vector float vsumf0 = vec_splats(0.0f); + vector float vsumf1 = vec_splats(0.0f); + vector float vsumf2 = vec_splats(0.0f); + vector float vsumf3 = vec_splats(0.0f); + + const vector signed char values = vec_xl( 0, kvalues_iq4nl); + + for (int ibl = 0; ibl < nb; ++ibl) { + + vector float 
vxd = vec_splats(LM_GGML_FP16_TO_FP32(x[ibl].d)); + vector float vyd = vec_splats(y[ibl].d); + vector float vd = vec_mul(vxd, vyd); + + vector signed int vsumi0 = vec_splats((int32_t)0); + vector signed int vsumi1 = vec_splats((int32_t)0); + vector signed int vsumi2 = vec_splats((int32_t)0); + vector signed int vsumi3 = vec_splats((int32_t)0); + vector signed int vsumi4 = vec_splats((int32_t)0); + vector signed int vsumi5 = vec_splats((int32_t)0); + vector signed int vsumi6 = vec_splats((int32_t)0); + vector signed int vsumi7 = vec_splats((int32_t)0); + + uint16_t h = x[ibl].scales_h; + + const uint8_t * restrict q4 = x[ibl].qs; + const uint8_t * restrict sc = x[ibl].scales_l; + const int8_t * restrict q8 = y[ibl].qs; + + for (int ib = 0; ib < QK_K/64; ib ++ ) { + __builtin_prefetch(q4, 0, 1); + __builtin_prefetch(q8, 0, 1); + + vector signed char qxs0 = (vector signed char)vec_xl( 0, q4); + vector signed char qxs1 = (vector signed char)vec_xl(16, q4); + q4 += 32; + + vector signed char q4x00 = (vector signed char)vec_and(qxs0, lowMask); + vector signed char q4x01 = (vector signed char)vec_sr(qxs0, v4); + vector signed char q4x10 = (vector signed char)vec_and(qxs1, lowMask); + vector signed char q4x11 = (vector signed char)vec_sr(qxs1, v4); + + q4x00 = vec_perm(values, values, (vector unsigned char)q4x00); + q4x01 = vec_perm(values, values, (vector unsigned char)q4x01); + q4x10 = vec_perm(values, values, (vector unsigned char)q4x10); + q4x11 = vec_perm(values, values, (vector unsigned char)q4x11); + + vector signed char q8y0 = vec_xl( 0, q8); + vector signed char q8y1 = vec_xl(16, q8); + vector signed char q8y2 = vec_xl(32, q8); + vector signed char q8y3 = vec_xl(48, q8); + q8 += 64; + + vector signed short qv0 = vec_add(vec_mule(q4x00, q8y0), vec_mulo(q4x00, q8y0)); + vector signed short qv1 = vec_add(vec_mule(q4x01, q8y1), vec_mulo(q4x01, q8y1)); + vector signed short qv2 = vec_add(vec_mule(q4x10, q8y2), vec_mulo(q4x10, q8y2)); + vector signed short qv3 = 
vec_add(vec_mule(q4x11, q8y3), vec_mulo(q4x11, q8y3)); + + const uint16_t ls0 = (uint16_t)(((sc[0] & 0xf) | ((h << 4) & 0x30)) - 32); + const uint16_t ls1 = (uint16_t)(((sc[0] >> 4) | ((h << 2) & 0x30)) - 32); + h >>= 4; + sc ++; + + vector signed short vscales01 = vec_splats((int16_t)ls0); + vector signed short vscales23 = vec_splats((int16_t)ls1); + + vsumi0 = vec_add(vec_mule(qv0, vscales01), vsumi0); + vsumi1 = vec_add(vec_mule(qv1, vscales01), vsumi1); + vsumi2 = vec_add(vec_mule(qv2, vscales23), vsumi2); + vsumi3 = vec_add(vec_mule(qv3, vscales23), vsumi3); + vsumi4 = vec_add(vec_mulo(qv0, vscales01), vsumi4); + vsumi5 = vec_add(vec_mulo(qv1, vscales01), vsumi5); + vsumi6 = vec_add(vec_mulo(qv2, vscales23), vsumi6); + vsumi7 = vec_add(vec_mulo(qv3, vscales23), vsumi7); + } + + vsumi0 = vec_add(vsumi0, vsumi4); + vsumi1 = vec_add(vsumi1, vsumi5); + vsumi2 = vec_add(vsumi2, vsumi6); + vsumi3 = vec_add(vsumi3, vsumi7); + + vsumf0 = vec_madd(vec_ctf(vsumi0, 0), vd, vsumf0); + vsumf1 = vec_madd(vec_ctf(vsumi1, 0), vd, vsumf1); + vsumf2 = vec_madd(vec_ctf(vsumi2, 0), vd, vsumf2); + vsumf3 = vec_madd(vec_ctf(vsumi3, 0), vd, vsumf3); + } + + vsumf0 = vec_add(vsumf0, vsumf2); + vsumf1 = vec_add(vsumf1, vsumf3); + + vsumf0 = vec_add(vsumf0, vsumf1); + + vsumf0 = vec_add(vsumf0, vec_sld(vsumf0, vsumf0, 4)); + vsumf0 = vec_add(vsumf0, vec_sld(vsumf0, vsumf0, 8)); + + *s = vec_extract(vsumf0, 0); + +#elif defined(__loongarch_asx) + + const __m128i values128 = __lsx_vld((const __m128i*)kvalues_iq4nl, 0); + const __m128i m4b = __lsx_vreplgr2vr_b(0x0f); + + __m256 accum = (__m256)__lasx_xvldi(0); + __m256i tmp1; + __m128i tmp0, tmp2, tmp3, tmp4, mask_8f, mask; + + mask_8f = __lsx_vreplgr2vr_b(0x8f); + for (int ibl = 0; ibl < nb; ++ibl) { + const uint8_t * qs = x[ibl].qs; + const int8_t * q8 = y[ibl].qs; + uint16_t sh = x[ibl].scales_h; + __m256i sumi1 = __lasx_xvldi(0); + __m256i sumi2 = __lasx_xvldi(0); + __m128i zero = __lsx_vldi(0); + for (int ib = 0; ib < QK_K/32; ib += 
2) { + const __m128i q4bits_1 = __lsx_vld((const __m128i*)qs, 0); qs += 16; + const __m128i q4bits_2 = __lsx_vld((const __m128i*)qs, 0); qs += 16; + const __m256i q8b_1 = __lasx_xvld((const __m256i *)q8, 0); q8 += 32; + const __m256i q8b_2 = __lasx_xvld((const __m256i *)q8, 0); q8 += 32; + tmp2 = __lsx_vand_v(__lsx_vand_v(__lsx_vsrli_h(q4bits_1, 4), m4b), mask_8f); + tmp0 = __lsx_vori_b(tmp2, 0x10); + mask = __lsx_vsle_b(zero, tmp2); + tmp3 = __lsx_vand_v(tmp0, mask); + tmp3 = __lsx_vshuf_b(values128, zero, tmp3); + + tmp2 = __lsx_vand_v(__lsx_vand_v(q4bits_1, m4b), mask_8f); + tmp0 = __lsx_vori_b(tmp2, 0x10); + mask = __lsx_vsle_b(zero, tmp2); + tmp4 = __lsx_vand_v(tmp0, mask); + tmp4 = __lsx_vshuf_b(values128, zero, tmp4); + + const __m256i q4b_1 = lasx_insertf128(tmp3, tmp4); + + tmp2 = __lsx_vand_v(__lsx_vand_v(__lsx_vsrli_h(q4bits_2, 4), m4b), mask_8f); + tmp0 = __lsx_vori_b(tmp2, 0x10); + mask = __lsx_vsle_b(zero, tmp2); + tmp3 = __lsx_vand_v(tmp0, mask); + tmp3 = __lsx_vshuf_b(values128, zero, tmp3); + + tmp2 = __lsx_vand_v(__lsx_vand_v(q4bits_2, m4b), mask_8f); + tmp0 = __lsx_vori_b(tmp2, 0x10); + mask = __lsx_vsle_b(zero, tmp2); + tmp4 = __lsx_vand_v(tmp0, mask); + tmp4 = __lsx_vshuf_b(values128, zero, tmp4); + + const __m256i q4b_2 = lasx_insertf128(tmp3, tmp4); + + const __m256i p16_1 = mul_add_epi8(q4b_1, q8b_1); + const __m256i p16_2 = mul_add_epi8(q4b_2, q8b_2); + const int16_t ls1 = ((x[ibl].scales_l[ib/2] & 0xf) | ((sh << 4) & 0x30)) - 32; + const int16_t ls2 = ((x[ibl].scales_l[ib/2] >> 4) | ((sh << 2) & 0x30)) - 32; + sh >>= 4; + __m256i tmp5, tmp6; + tmp1 = __lasx_xvreplgr2vr_h(ls1); + tmp5 = __lasx_xvmulwev_w_h(p16_1, tmp1); + tmp6 = __lasx_xvmulwod_w_h(p16_1, tmp1); + const __m256i p_1 = __lasx_xvadd_w(tmp5, tmp6); + tmp1 = __lasx_xvreplgr2vr_h(ls2); + tmp5 = __lasx_xvmulwev_w_h(p16_2, tmp1); + tmp6 = __lasx_xvmulwod_w_h(p16_2, tmp1); + const __m256i p_2 = __lasx_xvadd_w(tmp5, tmp6); + sumi1 = __lasx_xvadd_w(p_1, sumi1); + sumi2 = 
__lasx_xvadd_w(p_2, sumi2); + } + accum = __lasx_xvfmadd_s(__lasx_xvreplfr2vr_s(LM_GGML_FP16_TO_FP32(x[ibl].d)*y[ibl].d), + __lasx_xvffint_s_w(__lasx_xvadd_w(sumi1, sumi2)), accum); + } + + *s = hsum_float_8(accum); + #else float sumf = 0; for (int ibl = 0; ibl < nb; ++ibl) { @@ -9927,7 +11631,6 @@ void lm_ggml_vec_dot_iq4_xs_q8_K(int n, float * restrict s, size_t bs, const voi } *s = sumf; #endif -#endif } // ================================ IQ2 quantization ============================================= @@ -10425,7 +12128,7 @@ static void quantize_row_iq2_xxs_impl(const float * restrict x, void * restrict } float max = xval[0]; for (int i = 1; i < 32; ++i) max = MAX(max, xval[i]); - if (!max) { + if (max < GROUP_MAX_EPS) { scales[ib] = 0; memset(L, 0, 32); continue; @@ -10503,7 +12206,7 @@ static void quantize_row_iq2_xxs_impl(const float * restrict x, void * restrict printf("\n"); LM_GGML_ASSERT(false); } - q2[2*ib+0] |= (grid_index << 8*k); + q2[2*ib+0] |= ((uint32_t) grid_index << 8*k); q2[2*ib+1] |= (block_signs[k] << 7*k); } LM_GGML_ASSERT(scale >= 0); @@ -10601,7 +12304,7 @@ static void quantize_row_iq2_xs_impl(const float * restrict x, void * restrict v } float max = xval[0]; for (int i = 1; i < 16; ++i) max = MAX(max, xval[i]); - if (!max) { + if (max < GROUP_MAX_EPS) { scales[ib] = 0; memset(L, 0, 16); continue; @@ -11042,7 +12745,7 @@ static void quantize_row_iq3_xxs_impl(int grid_size, const float * restrict x, v } float max = xval[0]; for (int i = 1; i < 32; ++i) max = MAX(max, xval[i]); - if (!max) { + if (max < GROUP_MAX_EPS_IQ3_XXS) { scales[ib] = 0; memset(L, 0, 32); continue; @@ -11582,7 +13285,7 @@ static void quantize_row_iq1_s_impl(const float * restrict x, void * restrict vy for (int i = 0; i < block_size; ++i) weight[i] = qw[i] * sqrtf(sigma2 + xb[i]*xb[i]); float max = fabsf(xb[0]); for (int i = 1; i < block_size; ++i) max = MAX(max, fabsf(xb[i])); - if (!max) { + if (max < GROUP_MAX_EPS_IQ1_S) { scales[ib] = 0; memset(L, 1, block_size); 
continue; @@ -11745,10 +13448,6 @@ static void quantize_row_iq1_m_impl(const float * restrict x, void * restrict vy const float * xx; for (int ibl = 0; ibl < nbl; ++ibl) { - -#if QK_K == 64 - y[ibl].d = LM_GGML_FP32_TO_FP16(0.f); -#endif memset(y[ibl].qs, 0, QK_K/8); memset(y[ibl].qh, 0, QK_K/16); memset(y[ibl].scales, 0, QK_K/32); @@ -11770,7 +13469,7 @@ static void quantize_row_iq1_m_impl(const float * restrict x, void * restrict vy } float max = fabsf(xb[0]); for (int i = 1; i < block_size; ++i) max = MAX(max, fabsf(xb[i])); - if (!max) { + if (max < GROUP_MAX_EPS_IQ1_M) { scales[ib] = 0; memset(L, 1, block_size); continue; @@ -11923,22 +13622,13 @@ static void quantize_row_iq1_m_impl(const float * restrict x, void * restrict vy } uint16_t * sc = (uint16_t *)y[ibl].scales; -#if QK_K == 64 - float d = max_scale/31; -#else float d = max_scale/15; -#endif float id = 1/d; float sumqx_f = 0, sumq2_f = 0; for (int ib = 0; ib < QK_K/block_size; ++ib) { int l = nearest_int(0.5f*(id*scales[ib+0]-1)); -#if QK_K == 64 - l = MAX(0, MIN(15, l)); - sc[ib/4] |= (l << 4*(ib%4)); -#else l = MAX(0, MIN(7, l)); sc[ib/4] |= (l << 3*(ib%4)); -#endif y[ibl].qh[ib] |= masks[shifts[ib]]; const float * xb = xbl + block_size*ib; if (quant_weights) { @@ -11961,14 +13651,10 @@ static void quantize_row_iq1_m_impl(const float * restrict x, void * restrict vy } if (sumq2_f > 0) d = sumqx_f/sumq2_f; s.f16 = LM_GGML_FP32_TO_FP16(d*1.1125f); // 1.1125f is another fudge factor. Don't ask me why it is needed. 
-#if QK_K == 64 - y[ibl].d = s.f16; -#else sc[0] |= ((s.u16 & 0x000f) << 12); sc[1] |= ((s.u16 & 0x00f0) << 8); sc[2] |= ((s.u16 & 0x0f00) << 4); sc[3] |= ((s.u16 & 0xf000) << 0); -#endif } } @@ -12034,7 +13720,7 @@ static void quantize_row_iq4_nl_impl(const int super_block_size, const int block amax = ax; max = xb[j]; } } - if (!amax) { + if (amax < GROUP_MAX_EPS) { scales[ib] = 0; continue; } @@ -12157,9 +13843,6 @@ void quantize_row_iq4_nl_reference(const float * restrict x, block_iq4_nl * rest } size_t quantize_iq4_xs(const float * restrict src, void * restrict dst, int64_t nrow, int64_t n_per_row, const float * quant_weights) { -#if QK_K == 64 - return quantize_iq4_nl(src, dst, nrow, n_per_row, quant_weights); -#else LM_GGML_ASSERT(n_per_row%QK_K == 0); int64_t nblock = n_per_row/QK_K; char * qrow = (char *)dst; @@ -12177,7 +13860,6 @@ size_t quantize_iq4_xs(const float * restrict src, void * restrict dst, int64_t qrow += nblock*sizeof(block_iq4_xs); } return nrow * nblock * sizeof(block_iq4_xs); -#endif } void quantize_row_iq4_xs(const float * restrict x, void * restrict vy, int64_t k) { @@ -12255,7 +13937,7 @@ static void quantize_row_iq2_s_impl(const float * restrict x, void * restrict vy } float max = xval[0]; for (int i = 1; i < 16; ++i) max = MAX(max, xval[i]); - if (!max) { + if (max < GROUP_MAX_EPS_IQ2_S) { scales[ib] = 0; continue; } @@ -12450,6 +14132,24 @@ bool lm_ggml_validate_row_data(enum lm_ggml_type type, const void * data, size_t const size_t nb = nbytes/lm_ggml_type_size(type); switch (type) { + case LM_GGML_TYPE_BF16: + { + int nans = 0; + int infs = 0; + const unsigned short * f = (const unsigned short *) data; + for (size_t i = 0; i < nb; ++i) { + nans += (f[i] & 0x7fff) > 0x7f80; + infs += (f[i] & 0x7fff) == 0x7f80; + } + if (nans) { + fprintf(stderr, "%s: found %d NaNs in row of %zu BF16 values\n", __func__, nans, nb); + return false; + } + if (infs) { + fprintf(stderr, "%s: found %d infinities in row of %zu BF16 values\n", __func__, 
infs, nb); + return false; + } + } break; case LM_GGML_TYPE_F16: { const lm_ggml_fp16_t * f = (const lm_ggml_fp16_t *) data; @@ -12571,19 +14271,11 @@ bool lm_ggml_validate_row_data(enum lm_ggml_type type, const void * data, size_t } break; case LM_GGML_TYPE_Q4_K: { - #ifdef LM_GGML_QKK_64 - VALIDATE_ROW_DATA_DM_F16_IMPL(block_q4_K, data, nb, d[0], d[1]); - #else VALIDATE_ROW_DATA_DM_F16_IMPL(block_q4_K, data, nb, d, dmin); - #endif } break; case LM_GGML_TYPE_Q5_K: { - #ifdef LM_GGML_QKK_64 - VALIDATE_ROW_DATA_D_F16_IMPL(block_q5_K, data, nb); - #else VALIDATE_ROW_DATA_DM_F16_IMPL(block_q5_K, data, nb, d, dmin); - #endif } break; case LM_GGML_TYPE_Q6_K: { @@ -12606,18 +14298,12 @@ bool lm_ggml_validate_row_data(enum lm_ggml_type type, const void * data, size_t { const block_iq1_m * q = (const block_iq1_m *) data; for (size_t i = 0; i < nb; ++i) { - #if QK_K == 64 - if (!validate_fp16(q[i].d, i)) { - return false; - } - #else iq1m_scale_t scale; const uint16_t * sc = (const uint16_t *)q[i].scales; scale.u16 = (sc[0] >> 12) | ((sc[1] >> 8) & 0x00f0) | ((sc[2] >> 4) & 0x0f00) | (sc[3] & 0xf000); if (!validate_fp16(scale.f16, i)) { return false; } - #endif } } break; case LM_GGML_TYPE_IQ2_XXS: @@ -12642,12 +14328,9 @@ bool lm_ggml_validate_row_data(enum lm_ggml_type type, const void * data, size_t VALIDATE_ROW_DATA_D_F16_IMPL(block_iq3_s, data, nb); } break; case LM_GGML_TYPE_IQ4_XS: - #if QK_K != 64 { VALIDATE_ROW_DATA_D_F16_IMPL(block_iq4_xs, data, nb); } break; - #endif - // with QK_K == 64, iq4_xs is iq4_nl case LM_GGML_TYPE_IQ4_NL: { VALIDATE_ROW_DATA_D_F16_IMPL(block_iq4_nl, data, nb); diff --git a/cpp/ggml.c b/cpp/ggml.c index 9e656d9a..d9eb5ba0 100644 --- a/cpp/ggml.c +++ b/cpp/ggml.c @@ -4,7 +4,6 @@ #include "ggml-impl.h" #include "ggml-quants.h" #include "ggml.h" -#include "sgemm.h" #if defined(_MSC_VER) || defined(__MINGW32__) #include // using malloc.h with MSC/MINGW @@ -37,6 +36,10 @@ #undef LM_GGML_USE_LLAMAFILE #endif +#ifdef LM_GGML_USE_LLAMAFILE 
+#include "sgemm.h" +#endif + #if defined(_MSC_VER) // disable "possible loss of data" to avoid hundreds of casts // we should just be careful :) @@ -109,6 +112,8 @@ typedef void * thread_ret_t; #endif +typedef pthread_t lm_ggml_thread_t; + #ifdef LM_GGML_USE_CPU_HBM #include #endif @@ -160,9 +165,6 @@ void lm_ggml_print_backtrace(void) { #define LM_GGML_DEBUG 0 #define LM_GGML_GELU_FP16 #define LM_GGML_GELU_QUICK_FP16 -#define LM_GGML_SILU_FP16 -// #define LM_GGML_CROSS_ENTROPY_EXP_FP16 -// #define LM_GGML_FLASH_ATTN_EXP_FP16 #define LM_GGML_SOFT_MAX_UNROLL 4 #define LM_GGML_VEC_DOT_UNROLL 2 @@ -313,16 +315,10 @@ static lm_ggml_fp16_t lm_ggml_table_gelu_f16[1 << 16]; // precomputed quick gelu table for f16 (128 KB) static lm_ggml_fp16_t lm_ggml_table_gelu_quick_f16[1 << 16]; -// precomputed silu table for f16 (128 KB) -static lm_ggml_fp16_t lm_ggml_table_silu_f16[1 << 16]; - -// precomputed exp table for f16 (128 KB) -static lm_ggml_fp16_t lm_ggml_table_exp_f16[1 << 16]; - // precomputed f32 table for f16 (256 KB) (ggml-impl.h) float lm_ggml_table_f32_f16[1 << 16]; -const char * lm_ggml_status_to_string(enum lm_ggml_status status) { +LM_GGML_CALL const char * lm_ggml_status_to_string(enum lm_ggml_status status) { switch (status) { case LM_GGML_STATUS_ALLOC_FAILED: return "GGML status: error (failed to allocate memory)"; case LM_GGML_STATUS_FAILED: return "GGML status: error (operation failed)"; @@ -333,16 +329,26 @@ const char * lm_ggml_status_to_string(enum lm_ggml_status status) { return "GGML status: unknown"; } -// note: do not use these inside ggml.c -// these are meant to be used via the ggml.h API float lm_ggml_fp16_to_fp32(lm_ggml_fp16_t x) { +#define lm_ggml_fp16_to_fp32 do_not_use__lm_ggml_fp16_to_fp32__in_ggml return LM_GGML_FP16_TO_FP32(x); } lm_ggml_fp16_t lm_ggml_fp32_to_fp16(float x) { +#define lm_ggml_fp32_to_fp16 do_not_use__lm_ggml_fp32_to_fp16__in_ggml return LM_GGML_FP32_TO_FP16(x); } +float lm_ggml_bf16_to_fp32(lm_ggml_bf16_t x) { +#define 
lm_ggml_bf16_to_fp32 do_not_use__lm_ggml_bf16_to_fp32__in_ggml + return LM_GGML_BF16_TO_FP32(x); // it just left shifts +} + +lm_ggml_bf16_t lm_ggml_fp32_to_bf16(float x) { +#define lm_ggml_fp32_to_bf16 do_not_use__lm_ggml_fp32_to_bf16__in_ggml + return LM_GGML_FP32_TO_BF16(x); +} + void lm_ggml_fp16_to_fp32_row(const lm_ggml_fp16_t * x, float * y, int64_t n) { for (int64_t i = 0; i < n; i++) { y[i] = LM_GGML_FP16_TO_FP32(x[i]); @@ -368,6 +374,49 @@ void lm_ggml_fp32_to_fp16_row(const float * x, lm_ggml_fp16_t * y, int64_t n) { } } +void lm_ggml_bf16_to_fp32_row(const lm_ggml_bf16_t * x, float * y, int64_t n) { + int64_t i = 0; +#if defined(__AVX512F__) + for (; i + 16 <= n; i += 16) { + _mm512_storeu_ps(y + i, + _mm512_castsi512_ps( + _mm512_slli_epi32( + _mm512_cvtepu16_epi32( + _mm256_loadu_si256( + (const __m256i *)(x + i))), + 16))); + } +#elif defined(__AVX2__) + for (; i + 8 <= n; i += 8) { + _mm256_storeu_ps(y + i, + _mm256_castsi256_ps( + _mm256_slli_epi32( + _mm256_cvtepu16_epi32( + _mm_loadu_si128( + (const __m128i *)(x + i))), + 16))); + } +#endif + for (; i < n; i++) { + y[i] = LM_GGML_BF16_TO_FP32(x[i]); + } +} + +void lm_ggml_fp32_to_bf16_row(const float * x, lm_ggml_bf16_t * y, int64_t n) { + int i = 0; +#if defined(__AVX512BF16__) + for (; i + 32 <= n; i += 32) { + _mm512_storeu_si512( + (__m512i *)(y + i), + m512i(_mm512_cvtne2ps_pbh(_mm512_loadu_ps(x + i + 16), + _mm512_loadu_ps(x + i)))); + } +#endif + for (; i < n; i++) { + y[i] = LM_GGML_FP32_TO_BF16(x[i]); + } +} + bool lm_ggml_guid_matches(lm_ggml_guid_t guid_a, lm_ggml_guid_t guid_b) { return memcmp(guid_a, guid_b, sizeof(lm_ggml_guid)) == 0; } @@ -503,6 +552,7 @@ static const size_t CACHE_LINE_SIZE_F32 = CACHE_LINE_SIZE/sizeof(float); static void lm_ggml_vec_dot_f32(int n, float * restrict s, size_t bs, const float * restrict x, size_t bx, const float * restrict y, size_t by, int nrc); static void lm_ggml_vec_dot_f16(int n, float * restrict s, size_t bs, lm_ggml_fp16_t * restrict x, size_t 
bx, lm_ggml_fp16_t * restrict y, size_t by, int nrc); +static void lm_ggml_vec_dot_bf16(int n, float * restrict s, size_t bs, lm_ggml_bf16_t * restrict x, size_t bx, lm_ggml_bf16_t * restrict y, size_t by, int nrc); static const lm_ggml_type_traits_t type_traits[LM_GGML_TYPE_COUNT] = { [LM_GGML_TYPE_I8] = { @@ -821,22 +871,14 @@ static const lm_ggml_type_traits_t type_traits[LM_GGML_TYPE_COUNT] = { }, [LM_GGML_TYPE_IQ4_XS] = { .type_name = "iq4_xs", -#if QK_K == 64 - .blck_size = QK4_NL, -#else .blck_size = QK_K, -#endif .type_size = sizeof(block_iq4_xs), .is_quantized = true, .to_float = (lm_ggml_to_float_t) dequantize_row_iq4_xs, .from_float = quantize_row_iq4_xs, .from_float_reference = (lm_ggml_from_float_t)quantize_row_iq4_xs_reference, .vec_dot = lm_ggml_vec_dot_iq4_xs_q8_K, -#if QK_K == 64 - .vec_dot_type = LM_GGML_TYPE_Q8_0, -#else .vec_dot_type = LM_GGML_TYPE_Q8_K, -#endif .nrows = 1, }, [LM_GGML_TYPE_Q8_K] = { @@ -845,6 +887,18 @@ static const lm_ggml_type_traits_t type_traits[LM_GGML_TYPE_COUNT] = { .type_size = sizeof(block_q8_K), .is_quantized = true, .from_float = quantize_row_q8_K, + }, + [LM_GGML_TYPE_BF16] = { + .type_name = "bf16", + .blck_size = 1, + .type_size = sizeof(lm_ggml_bf16_t), + .is_quantized = false, + .to_float = (lm_ggml_to_float_t) lm_ggml_bf16_to_fp32_row, + .from_float = (lm_ggml_from_float_t) lm_ggml_fp32_to_bf16_row, + .from_float_reference = (lm_ggml_from_float_t) lm_ggml_fp32_to_bf16_row, + .vec_dot = (lm_ggml_vec_dot_t) lm_ggml_vec_dot_bf16, + .vec_dot_type = LM_GGML_TYPE_BF16, + .nrows = 1, } }; @@ -1237,6 +1291,8 @@ static inline void __avx_f32cx8_store(lm_ggml_fp16_t *x, __m256 y) { #define LM_GGML_F16_VEC_ZERO LM_GGML_F32x4_ZERO #define LM_GGML_F16_VEC_SET1 LM_GGML_F32x4_SET1 #define LM_GGML_F16_VEC_FMA LM_GGML_F32x4_FMA +#define LM_GGML_F16_VEC_ADD LM_GGML_F32x4_ADD +#define LM_GGML_F16_VEC_MUL LM_GGML_F32x4_MUL #define LM_GGML_F16_VEC_REDUCE LM_GGML_F32x4_REDUCE // Use vec_xl, not vec_ld, in case the load address is not 
aligned. #define LM_GGML_F16_VEC_LOAD(p, i) (i & 0x1) ? \ @@ -1459,6 +1515,195 @@ static inline void __sse_f16x4_store(lm_ggml_fp16_t *x, __m128 y) { #define LM_GGML_F16_VEC_MUL LM_GGML_F32Cx4_MUL #define LM_GGML_F16_VEC_REDUCE LM_GGML_F32Cx4_REDUCE +#elif defined(__loongarch_asx) + +#define LM_GGML_SIMD + +// F32 LASX +#define LM_GGML_F32_STEP 32 +#define LM_GGML_F32_EPR 8 + +#define LM_GGML_F32x8 __m256 +#define LM_GGML_F32x8_ZERO (__m256)__lasx_xvldi(0) +#define LM_GGML_F32x8_SET1(x) (__m256)__lasx_xvreplfr2vr_s((x)) +#define LM_GGML_F32x8_LOAD(x) (__m256)__lasx_xvld((x), 0) +#define LM_GGML_F32x8_STORE(x,y) __lasx_xvst((y), (x), 0) +#define LM_GGML_F32x8_FMA(a, b, c) __lasx_xvfmadd_s(b, c, a) +#define LM_GGML_F32x8_ADD __lasx_xvfadd_s +#define LM_GGML_F32x8_MUL __lasx_xvfmul_s +#define LM_GGML_F32x8_REDUCE(res, x) \ +do { \ + int offset = LM_GGML_F32_ARR >> 1; \ + for (int i = 0; i < offset; ++i) { \ + x[i] = __lasx_xvfadd_s(x[i], x[offset+i]); \ + } \ + offset >>= 1; \ + for (int i = 0; i < offset; ++i) { \ + x[i] = __lasx_xvfadd_s(x[i], x[offset+i]); \ + } \ + offset >>= 1; \ + for (int i = 0; i < offset; ++i) { \ + x[i] = __lasx_xvfadd_s(x[i], x[offset+i]); \ + } \ + float *tmp_p = (float *)&x[0]; \ + res = tmp_p[0] + tmp_p[1] + tmp_p[2] + tmp_p[3] + tmp_p[4] + tmp_p[5] + tmp_p[6] + tmp_p[7]; \ +} while (0) +// TODO: is this optimal ? 
+ +#define LM_GGML_F32_VEC LM_GGML_F32x8 +#define LM_GGML_F32_VEC_ZERO LM_GGML_F32x8_ZERO +#define LM_GGML_F32_VEC_SET1 LM_GGML_F32x8_SET1 +#define LM_GGML_F32_VEC_LOAD LM_GGML_F32x8_LOAD +#define LM_GGML_F32_VEC_STORE LM_GGML_F32x8_STORE +#define LM_GGML_F32_VEC_FMA LM_GGML_F32x8_FMA +#define LM_GGML_F32_VEC_ADD LM_GGML_F32x8_ADD +#define LM_GGML_F32_VEC_MUL LM_GGML_F32x8_MUL +#define LM_GGML_F32_VEC_REDUCE LM_GGML_F32x8_REDUCE + +// F16 LASX + +#define LM_GGML_F16_STEP 32 +#define LM_GGML_F16_EPR 8 + +// F16 arithmetic is not supported by AVX, so we use F32 instead + +#define LM_GGML_F32Cx8 __m256 +#define LM_GGML_F32Cx8_ZERO (__m256)__lasx_xvldi(0) +#define LM_GGML_F32Cx8_SET1(x) (__m256)__lasx_xvreplgr2vr_w((x)) + +static inline __m256 __lasx_f32cx8_load(lm_ggml_fp16_t *x) { + float tmp[8]; + + for (int i = 0; i < 8; i++) { + tmp[i] = LM_GGML_FP16_TO_FP32(x[i]); + } + + return (__m256)__lasx_xvld(tmp, 0); +} +static inline void __lasx_f32cx8_store(lm_ggml_fp16_t *x, __m256 y) { + float arr[8]; + + __lasx_xvst(y, arr, 0); + + for (int i = 0; i < 8; i++) + x[i] = LM_GGML_FP32_TO_FP16(arr[i]); +} +#define LM_GGML_F32Cx8_LOAD(x) __lasx_f32cx8_load(x) +#define LM_GGML_F32Cx8_STORE(x, y) __lasx_f32cx8_store(x, y) + +#define LM_GGML_F32Cx8_FMA LM_GGML_F32x8_FMA +#define LM_GGML_F32Cx8_ADD __lasx_xvfadd_s +#define LM_GGML_F32Cx8_MUL __lasx_xvfmul_s +#define LM_GGML_F32Cx8_REDUCE LM_GGML_F32x8_REDUCE + +#define LM_GGML_F16_VEC LM_GGML_F32Cx8 +#define LM_GGML_F16_VEC_ZERO LM_GGML_F32Cx8_ZERO +#define LM_GGML_F16_VEC_SET1 LM_GGML_F32Cx8_SET1 +#define LM_GGML_F16_VEC_LOAD(p, i) LM_GGML_F32Cx8_LOAD(p) +#define LM_GGML_F16_VEC_STORE(p, r, i) LM_GGML_F32Cx8_STORE(p, r[i]) +#define LM_GGML_F16_VEC_FMA LM_GGML_F32Cx8_FMA +#define LM_GGML_F16_VEC_ADD LM_GGML_F32Cx8_ADD +#define LM_GGML_F16_VEC_MUL LM_GGML_F32Cx8_MUL +#define LM_GGML_F16_VEC_REDUCE LM_GGML_F32Cx8_REDUCE + +#elif defined(__loongarch_sx) + +#define LM_GGML_SIMD + +// F32 LSX + +#define LM_GGML_F32_STEP 32 +#define 
LM_GGML_F32_EPR 4 + +#define LM_GGML_F32x4 __m128 +#define LM_GGML_F32x4_ZERO __lsx_vldi(0) +#define LM_GGML_F32x4_SET1(x) __lsx_vinsgr2vr_w(__lsx_vldi(0),(x), 0) +#define LM_GGML_F32x4_LOAD(x) __lsx_vld((x), 0) +#define LM_GGML_F32x4_STORE((x),(y)) __lsx_vst((y), (x), 0) +#define LM_GGML_F32x4_FMA(a, b, c) __lsx_vfmadd_s(b, c, a) +#define LM_GGML_F32x4_ADD __lsx_vfadd_s +#define LM_GGML_F32x4_MUL __lsx_vfmul_s +#define LM_GGML_F32x4_REDUCE(res, x) \ +{ \ + int offset = LM_GGML_F32_ARR >> 1; \ + for (int i = 0; i < offset; ++i) { \ + x[i] = __lsx_vfadd_s(x[i], x[offset+i]); \ + } \ + offset >>= 1; \ + for (int i = 0; i < offset; ++i) { \ + x[i] = __lsx_vfadd_s(x[i], x[offset+i]); \ + } \ + offset >>= 1; \ + for (int i = 0; i < offset; ++i) { \ + x[i] = __lsx_vfadd_s(x[i], x[offset+i]); \ + } \ + __m128i tmp = __lsx_vsrli_d((__m128i)x[0], 32); \ + tmp = (__m128i)__lsx_vfadd_s((__m128)tmp, x[0]); \ + tmp = __lsx_vpickev_w(__lsx_vldi(0), tmp); \ + const __m128 t0 = __lsx_vshuf4i_w(tmp, 0x88); \ + tmp = __lsx_vsrli_d((__m128i)t0, 32); \ + tmp = (__m128i)__lsx_vfadd_s((__m128)tmp, t0); \ + tmp = __lsx_vpickev_w(__lsx_vldi(0), tmp); \ + res = (lm_ggml_float) __lsx_vpickve2gr_w(__lsx_vshuf4i_w(tmp, 0x88), 0); \ +} + +#define LM_GGML_F32_VEC LM_GGML_F32x4 +#define LM_GGML_F32_VEC_ZERO LM_GGML_F32x4_ZERO +#define LM_GGML_F32_VEC_SET1 LM_GGML_F32x4_SET1 +#define LM_GGML_F32_VEC_LOAD LM_GGML_F32x4_LOAD +#define LM_GGML_F32_VEC_STORE LM_GGML_F32x4_STORE +#define LM_GGML_F32_VEC_FMA LM_GGML_F32x4_FMA +#define LM_GGML_F32_VEC_ADD LM_GGML_F32x4_ADD +#define LM_GGML_F32_VEC_MUL LM_GGML_F32x4_MUL +#define LM_GGML_F32_VEC_REDUCE LM_GGML_F32x4_REDUCE + +// F16 LSX + +#define LM_GGML_F16_STEP 32 +#define LM_GGML_F16_EPR 4 + +static inline __m128 __lsx_f16x4_load(lm_ggml_fp16_t *x) { + float tmp[4]; + + tmp[0] = LM_GGML_FP16_TO_FP32(x[0]); + tmp[1] = LM_GGML_FP16_TO_FP32(x[1]); + tmp[2] = LM_GGML_FP16_TO_FP32(x[2]); + tmp[3] = LM_GGML_FP16_TO_FP32(x[3]); + + return __lsx_vld(tmp, 0); 
+} + +static inline void __lsx_f16x4_store(lm_ggml_fp16_t *x, __m128 y) { + float arr[4]; + + __lsx_vst(y, arr, 0); + + x[0] = LM_GGML_FP32_TO_FP16(arr[0]); + x[1] = LM_GGML_FP32_TO_FP16(arr[1]); + x[2] = LM_GGML_FP32_TO_FP16(arr[2]); + x[3] = LM_GGML_FP32_TO_FP16(arr[3]); +} + +#define LM_GGML_F32Cx4 __m128 +#define LM_GGML_F32Cx4_ZERO __lsx_vldi(0) +#define LM_GGML_F32Cx4_SET1(x) __lsx_vinsgr2vr_w(__lsx_vldi(0),(x), 0) +#define LM_GGML_F32Cx4_LOAD(x) __lsx_f16x4_load(x) +#define LM_GGML_F32Cx4_STORE(x, y) __lsx_f16x4_store(x, y) +#define LM_GGML_F32Cx4_FMA LM_GGML_F32x4_FMA +#define LM_GGML_F32Cx4_ADD __lsx_vfadd_s +#define LM_GGML_F32Cx4_MUL __lsx_vfmul_s +#define LM_GGML_F32Cx4_REDUCE LM_GGML_F32x4_REDUCE + +#define LM_GGML_F16_VEC LM_GGML_F32Cx4 +#define LM_GGML_F16_VEC_ZERO LM_GGML_F32Cx4_ZERO +#define LM_GGML_F16_VEC_SET1 LM_GGML_F32Cx4_SET1 +#define LM_GGML_F16_VEC_LOAD(p, i) LM_GGML_F32Cx4_LOAD(p) +#define LM_GGML_F16_VEC_STORE(p, r, i) LM_GGML_F32Cx4_STORE(p, r[i]) +#define LM_GGML_F16_VEC_FMA LM_GGML_F32Cx4_FMA +#define LM_GGML_F16_VEC_ADD LM_GGML_F32Cx4_ADD +#define LM_GGML_F16_VEC_MUL LM_GGML_F32Cx4_MUL +#define LM_GGML_F16_VEC_REDUCE LM_GGML_F32Cx4_REDUCE + #endif // LM_GGML_F32_ARR / LM_GGML_F16_ARR @@ -1468,6 +1713,59 @@ static inline void __sse_f16x4_store(lm_ggml_fp16_t *x, __m128 y) { #define LM_GGML_F16_ARR (LM_GGML_F16_STEP/LM_GGML_F16_EPR) #endif +// +// ggml context +// + +struct lm_ggml_context { + size_t mem_size; + void* mem_buffer; + bool mem_buffer_owned; + bool no_alloc; + bool no_alloc_save; // this is used to save the no_alloc state when using scratch buffers + + int n_objects; + + struct lm_ggml_object* objects_begin; + struct lm_ggml_object* objects_end; + + struct lm_ggml_scratch scratch; + struct lm_ggml_scratch scratch_save; +}; + +struct lm_ggml_context_container { + bool used; + + struct lm_ggml_context context; +}; + +struct lm_ggml_compute_state_shared { + const struct lm_ggml_cgraph* cgraph; + const struct lm_ggml_cplan* 
cplan; + + int64_t perf_node_start_cycles; + int64_t perf_node_start_time_us; + + const int n_threads; + + // synchronization primitives + atomic_int n_active; // num active threads + atomic_int node_n; // active graph node + atomic_int node_task; // active graph node task phase + + lm_ggml_abort_callback abort_callback; // abort lm_ggml_graph_compute when true + void* abort_callback_data; + + atomic_int current_chunk; // currently processing chunk during Mat_Mul, shared between all the threads. +}; + +struct lm_ggml_compute_state { + lm_ggml_thread_t thrd; + int ith; + struct lm_ggml_compute_state_shared* shared; + enum lm_ggml_status ec; +}; + // // fundamental operations // @@ -1480,6 +1778,8 @@ inline static void lm_ggml_vec_set_i32(const int n, int32_t * x, const int32_t v inline static void lm_ggml_vec_set_f16(const int n, lm_ggml_fp16_t * x, const int32_t v) { for (int i = 0; i < n; ++i) x[i] = v; } +inline static void lm_ggml_vec_set_bf16(const int n, lm_ggml_bf16_t * x, const lm_ggml_bf16_t v) { for (int i = 0; i < n; ++i) x[i] = v; } + inline static void lm_ggml_vec_add_f32 (const int n, float * z, const float * x, const float * y) { for (int i = 0; i < n; ++i) z[i] = x[i] + y[i]; } inline static void lm_ggml_vec_add1_f32(const int n, float * z, const float * x, const float v) { for (int i = 0; i < n; ++i) z[i] = x[i] + v; } inline static void lm_ggml_vec_acc_f32 (const int n, float * y, const float * x) { for (int i = 0; i < n; ++i) y[i] += x[i]; } @@ -1498,7 +1798,7 @@ static void lm_ggml_vec_dot_f32(int n, float * restrict s, size_t bs, const floa UNUSED(by); UNUSED(bs); -#ifdef LM_GGML_SIMD +#if defined(LM_GGML_SIMD) float sumf = 0.0f; const int np = (n & ~(LM_GGML_F32_STEP - 1)); @@ -1534,6 +1834,70 @@ static void lm_ggml_vec_dot_f32(int n, float * restrict s, size_t bs, const floa *s = sumf; } +static void lm_ggml_vec_dot_bf16(int n, float * restrict s, size_t bs, lm_ggml_bf16_t * restrict x, size_t bx, lm_ggml_bf16_t * restrict y, size_t by, int 
nrc) { + assert(nrc == 1); + UNUSED(nrc); + UNUSED(bx); + UNUSED(by); + UNUSED(bs); + int i = 0; + lm_ggml_float sumf = 0; + +#if defined(__AVX512BF16__) + __m512 c1 = _mm512_setzero_ps(); + __m512 c2 = _mm512_setzero_ps(); + for (; i + 64 <= n; i += 64) { + c1 = _mm512_dpbf16_ps(c1, m512bh(_mm512_loadu_si512((x + i))), + m512bh(_mm512_loadu_si512((y + i)))); + c2 = _mm512_dpbf16_ps(c2, m512bh(_mm512_loadu_si512((x + i + 32))), + m512bh(_mm512_loadu_si512((y + i + 32)))); + } + sumf += (lm_ggml_float)_mm512_reduce_add_ps(c1); + sumf += (lm_ggml_float)_mm512_reduce_add_ps(c2); + +#elif defined(__AVX512F__) +#define LOAD(p) _mm512_castsi512_ps(_mm512_slli_epi32(_mm512_cvtepu16_epi32(_mm256_loadu_si256((const __m256i *)(p))), 16)) + __m512 c1 = _mm512_setzero_ps(); + __m512 c2 = _mm512_setzero_ps(); + for (; i + 32 <= n; i += 32) { + c1 = _mm512_add_ps(_mm512_mul_ps(LOAD(x + i), LOAD(y + i)), c1); + c2 = _mm512_add_ps(_mm512_mul_ps(LOAD(x + i + 16), LOAD(y + i + 16)), c2); + } + sumf += (lm_ggml_float)_mm512_reduce_add_ps(c1); + sumf += (lm_ggml_float)_mm512_reduce_add_ps(c2); + +#undef LOAD +#elif defined(__AVX2__) +#define LOAD(p) _mm256_castsi256_ps(_mm256_slli_epi32(_mm256_cvtepu16_epi32(_mm_loadu_si128((const __m128i *)(p))), 16)) + __m256 c1 = _mm256_setzero_ps(); + __m256 c2 = _mm256_setzero_ps(); + __m256 c3 = _mm256_setzero_ps(); + __m256 c4 = _mm256_setzero_ps(); + for (; i + 32 <= n; i += 32) { + c1 = _mm256_add_ps(_mm256_mul_ps(LOAD(x + i), LOAD(y + i)), c1); + c2 = _mm256_add_ps(_mm256_mul_ps(LOAD(x + i + 8), LOAD(y + i + 8)), c2); + c3 = _mm256_add_ps(_mm256_mul_ps(LOAD(x + i + 16), LOAD(y + i + 16)), c3); + c4 = _mm256_add_ps(_mm256_mul_ps(LOAD(x + i + 24), LOAD(y + i + 24)), c4); + } + __m128 g; + c1 = _mm256_add_ps(_mm256_add_ps(c1, c3), + _mm256_add_ps(c2, c4)); + g = _mm_add_ps(_mm256_extractf128_ps(c1, 1), + _mm256_castps256_ps128(c1)); + g = _mm_add_ps(g, _mm_movehl_ps(g, g)); + g = _mm_add_ss(g, _mm_movehdup_ps(g)); + sumf += 
(lm_ggml_float)_mm_cvtss_f32(g); + +#undef LOAD +#endif + + for (; i < n; ++i) { + sumf += (lm_ggml_float)(LM_GGML_BF16_TO_FP32(x[i]) * + LM_GGML_BF16_TO_FP32(y[i])); + } + *s = sumf; +} + static void lm_ggml_vec_dot_f16(int n, float * restrict s, size_t bs, lm_ggml_fp16_t * restrict x, size_t bx, lm_ggml_fp16_t * restrict y, size_t by, int nrc) { assert(nrc == 1); UNUSED(nrc); @@ -1817,6 +2181,7 @@ inline static void lm_ggml_vec_tanh_f32 (const int n, float * y, const float * x inline static void lm_ggml_vec_elu_f32 (const int n, float * y, const float * x) { for (int i = 0; i < n; ++i) y[i] = (x[i] > 0.f) ? x[i] : expf(x[i])-1; } inline static void lm_ggml_vec_relu_f32 (const int n, float * y, const float * x) { for (int i = 0; i < n; ++i) y[i] = (x[i] > 0.f) ? x[i] : 0.f; } inline static void lm_ggml_vec_leaky_relu_f32 (const int n, float * y, const float * x, const float ns) { for (int i = 0; i < n; ++i) y[i] = ((x[i] > 0.f) ? x[i] : 0.f) + ns * ((x[i] < 0.0f) ? x[i] : 0.f); } +inline static void lm_ggml_vec_sigmoid_f32 (const int n, float * y, const float * x) { for (int i = 0; i < n; ++i) y[i] = 1.f / (1.f + expf(-x[i])); } // TODO: optimize performance inline static void lm_ggml_vec_hardswish_f32 (const int n, float * y, const float * x) { for (int i = 0; i < n; ++i) y[i] = x[i] * fminf(1.0f, fmaxf(0.0f, (x[i] + 3.0f) / 6.0f)); } inline static void lm_ggml_vec_hardsigmoid_f32 (const int n, float * y, const float * x) { for (int i = 0; i < n; ++i) y[i] = fminf(1.0f, fmaxf(0.0f, (x[i] + 3.0f) / 6.0f)); } @@ -1892,52 +2257,291 @@ inline static float lm_ggml_silu_f32(float x) { return x/(1.0f + expf(-x)); } -//inline static void lm_ggml_vec_silu_f16(const int n, lm_ggml_fp16_t * y, const lm_ggml_fp16_t * x) { -// const uint16_t * i16 = (const uint16_t *) x; -// for (int i = 0; i < n; ++i) { -// y[i] = lm_ggml_table_silu_f16[i16[i]]; -// } -//} +#if defined(__ARM_NEON) && defined(__aarch64__) + +// adapted from arm limited optimized routine +// the maximum error 
is 1.45358 plus 0.5 ulps +// numbers above 88.38 will flush to infinity +// numbers beneath -103.97 will flush to zero +inline static float32x4_t lm_ggml_v_expf(float32x4_t x) { + const float32x4_t r = vdupq_n_f32(0x1.8p23f); + const float32x4_t z = vfmaq_f32(r, x, vdupq_n_f32(0x1.715476p+0f)); + const float32x4_t n = vsubq_f32(z, r); + const float32x4_t b = vfmsq_f32(vfmsq_f32(x, n, vdupq_n_f32(0x1.62e4p-1f)), n, + vdupq_n_f32(0x1.7f7d1cp-20f)); + const uint32x4_t e = vshlq_n_u32(vreinterpretq_u32_f32(z), 23); + const float32x4_t k = vreinterpretq_f32_u32(vaddq_u32(e, vreinterpretq_u32_f32(vdupq_n_f32(1)))); + const uint32x4_t c = vcagtq_f32(n, vdupq_n_f32(126)); + const float32x4_t u = vmulq_f32(b, b); + const float32x4_t j = vfmaq_f32( + vmulq_f32(vdupq_n_f32(0x1.ffffecp-1f), b), + vfmaq_f32(vfmaq_f32(vdupq_n_f32(0x1.fffdb6p-2f), vdupq_n_f32(0x1.555e66p-3f), b), + vfmaq_f32(vdupq_n_f32(0x1.573e2ep-5f), vdupq_n_f32(0x1.0e4020p-7f), b), u), u); + if (!vpaddd_u64(vreinterpretq_u64_u32(c))) + return vfmaq_f32(k, j, k); + const uint32x4_t d = vandq_u32(vclezq_f32(n), vdupq_n_u32(0x82000000)); + const float32x4_t s1 = vreinterpretq_f32_u32(vaddq_u32(d, vdupq_n_u32(0x7f000000))); + const float32x4_t s2 = vreinterpretq_f32_u32(vsubq_u32(e, d)); + return vbslq_f32(vcagtq_f32(n, vdupq_n_f32(192)), vmulq_f32(s1, s1), + vbslq_f32(c, vmulq_f32(vfmaq_f32(s2, s2, j), s1), vfmaq_f32(k, k, j))); +} + +// computes silu x/(1+exp(-x)) in single precision vector +inline static float32x4_t lm_ggml_v_silu(float32x4_t x) { + const float32x4_t one = vdupq_n_f32(1.0f); + const float32x4_t zero = vdupq_n_f32(0.0f); + const float32x4_t neg_x = vsubq_f32(zero, x); + const float32x4_t exp_neg_x = lm_ggml_v_expf(neg_x); + const float32x4_t one_plus_exp_neg_x = vaddq_f32(one, exp_neg_x); + return vdivq_f32(x, one_plus_exp_neg_x); +} + +#elif defined(__AVX512F__) && defined(__AVX512DQ__) + +// adapted from arm limited optimized routine +// the maximum error is 1.45358 plus 0.5 ulps +// numbers 
above 88.38 will flush to infinity +// numbers beneath -103.97 will flush to zero +inline static __m512 lm_ggml_v_expf(__m512 x) { + const __m512 r = _mm512_set1_ps(0x1.8p23f); + const __m512 z = _mm512_fmadd_ps(x, _mm512_set1_ps(0x1.715476p+0f), r); + const __m512 n = _mm512_sub_ps(z, r); + const __m512 b = _mm512_fnmadd_ps(n, _mm512_set1_ps(0x1.7f7d1cp-20f), + _mm512_fnmadd_ps(n, _mm512_set1_ps(0x1.62e4p-1f), x)); + const __m512i e = _mm512_slli_epi32(_mm512_castps_si512(z), 23); + const __m512 k = _mm512_castsi512_ps(_mm512_add_epi32(e, _mm512_castps_si512(_mm512_set1_ps(1)))); + const __mmask16 c = _mm512_cmp_ps_mask(_mm512_abs_ps(n), _mm512_set1_ps(126), _CMP_GT_OQ); + const __m512 u = _mm512_mul_ps(b, b); + const __m512 j = _mm512_fmadd_ps(_mm512_fmadd_ps(_mm512_fmadd_ps(_mm512_set1_ps(0x1.0e4020p-7f), b, + _mm512_set1_ps(0x1.573e2ep-5f)), u, + _mm512_fmadd_ps(_mm512_set1_ps(0x1.555e66p-3f), b, + _mm512_set1_ps(0x1.fffdb6p-2f))), + u, _mm512_mul_ps(_mm512_set1_ps(0x1.ffffecp-1f), b)); + if (_mm512_kortestz(c, c)) + return _mm512_fmadd_ps(j, k, k); + const __m512i g = _mm512_and_si512( + _mm512_movm_epi32(_mm512_cmp_ps_mask(n, _mm512_setzero_ps(), _CMP_LE_OQ)), + _mm512_set1_epi32(0x82000000u)); + const __m512 s1 = + _mm512_castsi512_ps(_mm512_add_epi32(g, _mm512_set1_epi32(0x7f000000u))); + const __m512 s2 = _mm512_castsi512_ps(_mm512_sub_epi32(e, g)); + const __mmask16 d = + _mm512_cmp_ps_mask(_mm512_abs_ps(n), _mm512_set1_ps(192), _CMP_GT_OQ); + return _mm512_mask_blend_ps( + d, _mm512_mask_blend_ps( + c, _mm512_fmadd_ps(k, j, k), + _mm512_mul_ps(_mm512_fmadd_ps(s2, j, s2), s1)), + _mm512_mul_ps(s1, s1)); +} + +// computes silu x/(1+exp(-x)) in single precision vector +inline static __m512 lm_ggml_v_silu(__m512 x) { + const __m512 one = _mm512_set1_ps(1); + const __m512 zero = _mm512_setzero_ps(); + const __m512 neg_x = _mm512_sub_ps(zero, x); + const __m512 exp_neg_x = lm_ggml_v_expf(neg_x); + const __m512 one_plus_exp_neg_x = _mm512_add_ps(one, 
exp_neg_x); + return _mm512_div_ps(x, one_plus_exp_neg_x); +} + +#elif defined(__AVX2__) && defined(__FMA__) + +// adapted from arm limited optimized routine +// the maximum error is 1.45358 plus 0.5 ulps +// numbers above 88.38 will flush to infinity +// numbers beneath -103.97 will flush to zero +inline static __m256 lm_ggml_v_expf(__m256 x) { + const __m256 r = _mm256_set1_ps(0x1.8p23f); + const __m256 z = _mm256_fmadd_ps(x, _mm256_set1_ps(0x1.715476p+0f), r); + const __m256 n = _mm256_sub_ps(z, r); + const __m256 b = _mm256_fnmadd_ps(n, _mm256_set1_ps(0x1.7f7d1cp-20f), + _mm256_fnmadd_ps(n, _mm256_set1_ps(0x1.62e4p-1f), x)); + const __m256i e = _mm256_slli_epi32(_mm256_castps_si256(z), 23); + const __m256 k = _mm256_castsi256_ps( + _mm256_add_epi32(e, _mm256_castps_si256(_mm256_set1_ps(1)))); + const __m256i c = _mm256_castps_si256( + _mm256_cmp_ps(_mm256_andnot_ps(_mm256_set1_ps(-0.f), n), + _mm256_set1_ps(126), _CMP_GT_OQ)); + const __m256 u = _mm256_mul_ps(b, b); + const __m256 j = _mm256_fmadd_ps(_mm256_fmadd_ps(_mm256_fmadd_ps(_mm256_set1_ps(0x1.0e4020p-7f), b, + _mm256_set1_ps(0x1.573e2ep-5f)), u, + _mm256_fmadd_ps(_mm256_set1_ps(0x1.555e66p-3f), b, + _mm256_set1_ps(0x1.fffdb6p-2f))), + u, _mm256_mul_ps(_mm256_set1_ps(0x1.ffffecp-1f), b)); + if (!_mm256_movemask_ps(_mm256_castsi256_ps(c))) + return _mm256_fmadd_ps(j, k, k); + const __m256i g = _mm256_and_si256( + _mm256_castps_si256(_mm256_cmp_ps(n, _mm256_setzero_ps(), _CMP_LE_OQ)), + _mm256_set1_epi32(0x82000000u)); + const __m256 s1 = + _mm256_castsi256_ps(_mm256_add_epi32(g, _mm256_set1_epi32(0x7f000000u))); + const __m256 s2 = _mm256_castsi256_ps(_mm256_sub_epi32(e, g)); + const __m256i d = _mm256_castps_si256( + _mm256_cmp_ps(_mm256_andnot_ps(_mm256_set1_ps(-0.f), n), + _mm256_set1_ps(192), _CMP_GT_OQ)); + return _mm256_or_ps( + _mm256_and_ps(_mm256_castsi256_ps(d), _mm256_mul_ps(s1, s1)), + _mm256_andnot_ps( + _mm256_castsi256_ps(d), + _mm256_or_ps( + _mm256_and_ps(_mm256_castsi256_ps(c), + 
_mm256_mul_ps(_mm256_fmadd_ps(s2, j, s2), s1)), + _mm256_andnot_ps(_mm256_castsi256_ps(c), _mm256_fmadd_ps(k, j, k))))); +} + +// computes silu x/(1+exp(-x)) in single precision vector +inline static __m256 lm_ggml_v_silu(__m256 x) { + const __m256 one = _mm256_set1_ps(1); + const __m256 zero = _mm256_setzero_ps(); + const __m256 neg_x = _mm256_sub_ps(zero, x); + const __m256 exp_neg_x = lm_ggml_v_expf(neg_x); + const __m256 one_plus_exp_neg_x = _mm256_add_ps(one, exp_neg_x); + return _mm256_div_ps(x, one_plus_exp_neg_x); +} + +#elif defined(__SSE2__) // __AVX2__ / __ARM_NEON -#ifdef LM_GGML_SILU_FP16 -inline static void lm_ggml_vec_silu_f32(const int n, float * y, const float * x) { - uint16_t t; - for (int i = 0; i < n; ++i) { - lm_ggml_fp16_t fp16 = LM_GGML_FP32_TO_FP16(x[i]); - memcpy(&t, &fp16, sizeof(uint16_t)); - y[i] = LM_GGML_FP16_TO_FP32(lm_ggml_table_silu_f16[t]); - } -} +#if defined(__FMA__) +#define MADD128(x, y, z) _mm_fmadd_ps(x, y, z) +#define NMADD128(x, y, z) _mm_fnmadd_ps(x, y, z) #else -inline static void lm_ggml_vec_silu_f32(const int n, float * y, const float * x) { - for (int i = 0; i < n; ++i) { +#define MADD128(x, y, z) _mm_add_ps(_mm_mul_ps(x, y), z) +#define NMADD128(x, y, z) _mm_sub_ps(z, _mm_mul_ps(x, y)) +#endif + +// adapted from arm limited optimized routine +// the maximum error is 1.45358 plus 0.5 ulps +// numbers above 88.38 will flush to infinity +// numbers beneath -103.97 will flush to zero +inline static __m128 lm_ggml_v_expf(__m128 x) { + const __m128 r = _mm_set1_ps(0x1.8p23f); + const __m128 z = MADD128(x, _mm_set1_ps(0x1.715476p+0f), r); + const __m128 n = _mm_sub_ps(z, r); + const __m128 b = + NMADD128(n, _mm_set1_ps(0x1.7f7d1cp-20f), NMADD128(n, _mm_set1_ps(0x1.62e4p-1f), x)); + const __m128i e = _mm_slli_epi32(_mm_castps_si128(z), 23); + const __m128 k = _mm_castsi128_ps(_mm_add_epi32(e, _mm_castps_si128(_mm_set1_ps(1)))); + const __m128i c = + _mm_castps_si128(_mm_cmpgt_ps(_mm_andnot_ps(_mm_set1_ps(-0.f), n), 
_mm_set1_ps(126))); + const __m128 u = _mm_mul_ps(b, b); + const __m128 j = + MADD128(MADD128(MADD128(_mm_set1_ps(0x1.0e4020p-7f), b, _mm_set1_ps(0x1.573e2ep-5f)), u, + MADD128(_mm_set1_ps(0x1.555e66p-3f), b, _mm_set1_ps(0x1.fffdb6p-2f))), + u, _mm_mul_ps(_mm_set1_ps(0x1.ffffecp-1f), b)); + if (!_mm_movemask_epi8(c)) + return MADD128(j, k, k); + const __m128i g = _mm_and_si128(_mm_castps_si128(_mm_cmple_ps(n, _mm_setzero_ps())), + _mm_set1_epi32(0x82000000u)); + const __m128 s1 = _mm_castsi128_ps(_mm_add_epi32(g, _mm_set1_epi32(0x7f000000u))); + const __m128 s2 = _mm_castsi128_ps(_mm_sub_epi32(e, g)); + const __m128i d = + _mm_castps_si128(_mm_cmpgt_ps(_mm_andnot_ps(_mm_set1_ps(-0.f), n), _mm_set1_ps(192))); + return _mm_or_ps( + _mm_and_ps(_mm_castsi128_ps(d), _mm_mul_ps(s1, s1)), + _mm_andnot_ps(_mm_castsi128_ps(d), + _mm_or_ps(_mm_and_ps(_mm_castsi128_ps(c), _mm_mul_ps(MADD128(s2, j, s2), s1)), + _mm_andnot_ps(_mm_castsi128_ps(c), MADD128(k, j, k))))); +} + +// computes silu x/(1+exp(-x)) in single precision vector +inline static __m128 lm_ggml_v_silu(__m128 x) { + const __m128 one = _mm_set1_ps(1); + const __m128 zero = _mm_setzero_ps(); + const __m128 neg_x = _mm_sub_ps(zero, x); + const __m128 exp_neg_x = lm_ggml_v_expf(neg_x); + const __m128 one_plus_exp_neg_x = _mm_add_ps(one, exp_neg_x); + return _mm_div_ps(x, one_plus_exp_neg_x); +} + +#endif // __ARM_NEON / __AVX2__ / __SSE2__ + +static void lm_ggml_vec_silu_f32(const int n, float * y, const float * x) { + int i = 0; +#if defined(__AVX512F__) && defined(__AVX512DQ__) + for (; i + 15 < n; i += 16) { + _mm512_storeu_ps(y + i, lm_ggml_v_silu(_mm512_loadu_ps(x + i))); + } +#elif defined(__AVX2__) && defined(__FMA__) + for (; i + 7 < n; i += 8) { + _mm256_storeu_ps(y + i, lm_ggml_v_silu(_mm256_loadu_ps(x + i))); + } +#elif defined(__SSE2__) + for (; i + 3 < n; i += 4) { + _mm_storeu_ps(y + i, lm_ggml_v_silu(_mm_loadu_ps(x + i))); + } +#elif defined(__ARM_NEON) && defined(__aarch64__) + for (; i + 3 < n; i += 
4) { + vst1q_f32(y + i, lm_ggml_v_silu(vld1q_f32(x + i))); + } +#endif + for (; i < n; ++i) { y[i] = lm_ggml_silu_f32(x[i]); } } -#endif -inline static float lm_ggml_silu_backward_f32(float x, float dy) { - const float s = 1.0f/(1.0f + expf(-x)); - return dy*s*(1.0f + x*(1.0f - s)); +static lm_ggml_float lm_ggml_vec_soft_max_f32(const int n, float * y, const float * x, float max) { + int i = 0; + lm_ggml_float sum = 0; +#if defined(__AVX512F__) && defined(__AVX512DQ__) + for (; i + 15 < n; i += 16) { + __m512 val = lm_ggml_v_expf(_mm512_sub_ps(_mm512_loadu_ps(x + i), + _mm512_set1_ps(max))); + _mm512_storeu_ps(y + i, val); + sum += (lm_ggml_float)_mm512_reduce_add_ps(val); + } +#elif defined(__AVX2__) && defined(__FMA__) + for (; i + 7 < n; i += 8) { + __m256 val = lm_ggml_v_expf(_mm256_sub_ps(_mm256_loadu_ps(x + i), + _mm256_set1_ps(max))); + _mm256_storeu_ps(y + i, val); + __m128 val2 = _mm_add_ps(_mm256_extractf128_ps(val, 1), + _mm256_castps256_ps128(val)); + val2 = _mm_add_ps(val2, _mm_movehl_ps(val2, val2)); + val2 = _mm_add_ss(val2, _mm_movehdup_ps(val2)); + sum += (lm_ggml_float)_mm_cvtss_f32(val2); + } +#elif defined(__SSE2__) + for (; i + 3 < n; i += 4) { + __m128 val = lm_ggml_v_expf(_mm_sub_ps(_mm_loadu_ps(x + i), + _mm_set1_ps(max))); + _mm_storeu_ps(y + i, val); +#if defined(__AVX__) || defined(__AVX2__) || defined(__AVX512F__) + val = _mm_add_ps(val, _mm_movehl_ps(val, val)); + val = _mm_add_ss(val, _mm_movehdup_ps(val)); +#else + __m128 tmp = _mm_shuffle_ps(val, val, _MM_SHUFFLE(2, 3, 0, 1)); + val = _mm_add_ps(val, tmp); + tmp = _mm_movehl_ps(tmp, val); + val = _mm_add_ss(val, tmp); +#endif + sum += (lm_ggml_float)_mm_cvtss_f32(val); + } +#elif defined(__ARM_NEON) && defined(__aarch64__) + for (; i + 3 < n; i += 4) { + float32x4_t val = lm_ggml_v_expf(vsubq_f32(vld1q_f32(x + i), + vdupq_n_f32(max))); + vst1q_f32(y + i, val); + sum += (lm_ggml_float)vaddvq_f32(val); + } +#endif + for (; i < n; ++i) { + float val = expf(x[i] - max); + sum += 
(lm_ggml_float)val; + y[i] = val; + } + return sum; } -#ifdef LM_GGML_SILU_FP16 -inline static void lm_ggml_vec_silu_backward_f32(const int n, float * dx, const float * x, const float * dy) { - for (int i = 0; i < n; ++i) { - // we did not use x[i] to compute forward silu but its f16 equivalent - // take derivative at f16 of x[i]: - lm_ggml_fp16_t fp16 = LM_GGML_FP32_TO_FP16(x[i]); - float usedx = LM_GGML_FP16_TO_FP32(fp16); - dx[i] = lm_ggml_silu_backward_f32(usedx, dy[i]); - } +inline static float lm_ggml_silu_backward_f32(float x, float dy) { + const float s = 1.0f/(1.0f + expf(-x)); + return dy*s*(1.0f + x*(1.0f - s)); } -#else + inline static void lm_ggml_vec_silu_backward_f32(const int n, float * dx, const float * x, const float * dy) { for (int i = 0; i < n; ++i) { dx[i] = lm_ggml_silu_backward_f32(x[i], dy[i]); } } -#endif inline static void lm_ggml_vec_sum_f32(const int n, float * s, const float * x) { #ifndef LM_GGML_USE_ACCELERATE @@ -1967,6 +2571,14 @@ inline static void lm_ggml_vec_sum_f16_ggf(const int n, float * s, const lm_ggml *s = sum; } +inline static void lm_ggml_vec_sum_bf16_ggf(const int n, float * s, const lm_ggml_bf16_t * x) { + float sum = 0.0f; + for (int i = 0; i < n; ++i) { + sum += LM_GGML_BF16_TO_FP32(x[i]); + } + *s = sum; +} + inline static void lm_ggml_vec_max_f32(const int n, float * s, const float * x) { #ifndef LM_GGML_USE_ACCELERATE float max = -INFINITY; @@ -2045,7 +2657,6 @@ static const char * LM_GGML_OP_NAME[LM_GGML_OP_COUNT] = { "SOFT_MAX_BACK", "ROPE", "ROPE_BACK", - "ALIBI", "CLAMP", "CONV_TRANSPOSE_1D", "IM2COL", @@ -2059,9 +2670,7 @@ static const char * LM_GGML_OP_NAME[LM_GGML_OP_COUNT] = { "ARGSORT", "LEAKY_RELU", - "FLASH_ATTN", "FLASH_ATTN_EXT", - "FLASH_FF", "FLASH_ATTN_BACK", "SSM_CONV", "SSM_SCAN", @@ -2087,7 +2696,7 @@ static const char * LM_GGML_OP_NAME[LM_GGML_OP_COUNT] = { "CROSS_ENTROPY_LOSS_BACK", }; -static_assert(LM_GGML_OP_COUNT == 77, "LM_GGML_OP_COUNT != 77"); +static_assert(LM_GGML_OP_COUNT == 74, 
"LM_GGML_OP_COUNT != 74"); static const char * LM_GGML_OP_SYMBOL[LM_GGML_OP_COUNT] = { "none", @@ -2136,7 +2745,6 @@ static const char * LM_GGML_OP_SYMBOL[LM_GGML_OP_COUNT] = { "soft_max_back(x)", "rope(x)", "rope_back(x)", - "alibi(x)", "clamp(x)", "conv_transpose_1d(x)", "im2col(x)", @@ -2150,9 +2758,7 @@ static const char * LM_GGML_OP_SYMBOL[LM_GGML_OP_COUNT] = { "argsort(x)", "leaky_relu(x)", - "flash_attn(x)", "flash_attn_ext(x)", - "flash_ff(x)", "flash_attn_back(x)", "ssm_conv(x)", "ssm_scan(x)", @@ -2178,7 +2784,7 @@ static const char * LM_GGML_OP_SYMBOL[LM_GGML_OP_COUNT] = { "cross_entropy_loss_back(x,y)", }; -static_assert(LM_GGML_OP_COUNT == 77, "LM_GGML_OP_COUNT != 77"); +static_assert(LM_GGML_OP_COUNT == 74, "LM_GGML_OP_COUNT != 74"); static_assert(LM_GGML_OP_POOL_COUNT == 2, "LM_GGML_OP_POOL_COUNT != 2"); @@ -2191,6 +2797,7 @@ static const char * LM_GGML_UNARY_OP_NAME[LM_GGML_UNARY_OP_COUNT] = { "TANH", "ELU", "RELU", + "SIGMOID", "GELU", "GELU_QUICK", "SILU", @@ -2198,7 +2805,7 @@ static const char * LM_GGML_UNARY_OP_NAME[LM_GGML_UNARY_OP_COUNT] = { "HARDSIGMOID", }; -static_assert(LM_GGML_UNARY_OP_COUNT == 12, "LM_GGML_UNARY_OP_COUNT != 12"); +static_assert(LM_GGML_UNARY_OP_COUNT == 13, "LM_GGML_UNARY_OP_COUNT != 13"); static_assert(sizeof(struct lm_ggml_object)%LM_GGML_MEM_ALIGN == 0, "lm_ggml_object size must be a multiple of LM_GGML_MEM_ALIGN"); @@ -2240,32 +2847,6 @@ static void lm_ggml_setup_op_has_task_pass(void) { } } -// -// ggml context -// - -struct lm_ggml_context { - size_t mem_size; - void * mem_buffer; - bool mem_buffer_owned; - bool no_alloc; - bool no_alloc_save; // this is used to save the no_alloc state when using scratch buffers - - int n_objects; - - struct lm_ggml_object * objects_begin; - struct lm_ggml_object * objects_end; - - struct lm_ggml_scratch scratch; - struct lm_ggml_scratch scratch_save; -}; - -struct lm_ggml_context_container { - bool used; - - struct lm_ggml_context context; -}; - // // NUMA support // @@ -2377,7 
+2958,7 @@ void lm_ggml_numa_init(enum lm_ggml_numa_strategy numa_flag) { // figure out which node we're on uint current_cpu; int getcpu_ret = 0; -#if __GLIBC__ > 2 || (__GLIBC__ == 2 && __GLIBC_MINOR__ > 28) +#if __GLIBC__ > 2 || (__GLIBC__ == 2 && __GLIBC_MINOR__ > 28) || defined(__COSMOPOLITAN__) getcpu_ret = getcpu(&current_cpu, &g_state.numa.current_node); #else // old glibc doesn't have a wrapper for this call. Fall back on direct syscall @@ -2588,6 +3169,7 @@ enum lm_ggml_type lm_ggml_ftype_to_lm_ggml_type(enum lm_ggml_ftype ftype) { switch (ftype) { case LM_GGML_FTYPE_ALL_F32: wtype = LM_GGML_TYPE_F32; break; case LM_GGML_FTYPE_MOSTLY_F16: wtype = LM_GGML_TYPE_F16; break; + case LM_GGML_FTYPE_MOSTLY_BF16: wtype = LM_GGML_TYPE_BF16; break; case LM_GGML_FTYPE_MOSTLY_Q4_0: wtype = LM_GGML_TYPE_Q4_0; break; case LM_GGML_FTYPE_MOSTLY_Q4_1: wtype = LM_GGML_TYPE_Q4_1; break; case LM_GGML_FTYPE_MOSTLY_Q5_0: wtype = LM_GGML_TYPE_Q5_0; break; @@ -2678,6 +3260,16 @@ bool lm_ggml_are_same_shape(const struct lm_ggml_tensor * t0, const struct lm_gg (t0->ne[3] == t1->ne[3] ); } +bool lm_ggml_are_same_stride(const struct lm_ggml_tensor * t0, const struct lm_ggml_tensor * t1) { + static_assert(LM_GGML_MAX_DIMS == 4, "LM_GGML_MAX_DIMS is not 4 - update this function"); + + return + (t0->nb[0] == t1->nb[0] ) && + (t0->nb[1] == t1->nb[1] ) && + (t0->nb[2] == t1->nb[2] ) && + (t0->nb[3] == t1->nb[3] ); +} + // check if t1 can be represented as a repeatition of t0 static inline bool lm_ggml_can_repeat(const struct lm_ggml_tensor * t0, const struct lm_ggml_tensor * t1) { static_assert(LM_GGML_MAX_DIMS == 4, "LM_GGML_MAX_DIMS is not 4 - update this function"); @@ -2729,15 +3321,14 @@ struct lm_ggml_context * lm_ggml_init(struct lm_ggml_init_params params) { { const uint64_t t_start = lm_ggml_time_us(); UNUSED(t_start); - lm_ggml_fp16_t ii; for (int i = 0; i < (1 << 16); ++i) { - uint16_t ui = i; - memcpy(&ii, &ui, sizeof(ii)); - const float f = lm_ggml_table_f32_f16[i] = 
LM_GGML_COMPUTE_FP16_TO_FP32(ii); + union { + uint16_t u16; + lm_ggml_fp16_t fp16; + } u = {i}; + float f = lm_ggml_table_f32_f16[i] = LM_GGML_COMPUTE_FP16_TO_FP32(u.fp16); lm_ggml_table_gelu_f16[i] = LM_GGML_FP32_TO_FP16(lm_ggml_gelu_f32(f)); lm_ggml_table_gelu_quick_f16[i] = LM_GGML_FP32_TO_FP16(lm_ggml_gelu_quick_f32(f)); - lm_ggml_table_silu_f16[i] = LM_GGML_FP32_TO_FP16(lm_ggml_silu_f32(f)); - lm_ggml_table_exp_f16[i] = LM_GGML_FP32_TO_FP16(expf(f)); } const uint64_t t_end = lm_ggml_time_us(); UNUSED(t_end); @@ -3021,6 +3612,12 @@ static struct lm_ggml_tensor * lm_ggml_new_tensor_impl( struct lm_ggml_tensor * const result = (struct lm_ggml_tensor *)((char *)ctx->mem_buffer + obj_new->offs); +#ifdef __clang__ + // temporary until lm_ggml_tensor::backend is removed + #pragma clang diagnostic push + #pragma clang diagnostic ignored "-Wdeprecated-declarations" +#endif + *result = (struct lm_ggml_tensor) { /*.type =*/ type, /*.backend =*/ LM_GGML_BACKEND_TYPE_CPU, @@ -3043,6 +3640,10 @@ static struct lm_ggml_tensor * lm_ggml_new_tensor_impl( /*.padding =*/ { 0 }, }; +#ifdef __clang__ + #pragma clang diagnostic pop +#endif + // TODO: this should not be needed as long as we don't rely on aligned SIMD loads //lm_ggml_assert_aligned(result->data); @@ -3201,6 +3802,13 @@ struct lm_ggml_tensor * lm_ggml_set_i32 (struct lm_ggml_tensor * tensor, int32_t lm_ggml_vec_set_f16(nc, (lm_ggml_fp16_t *)(data + i*n1), LM_GGML_FP32_TO_FP16(value)); } } break; + case LM_GGML_TYPE_BF16: + { + assert(tensor->nb[0] == sizeof(lm_ggml_fp16_t)); + for (int i = 0; i < n; i++) { + lm_ggml_vec_set_bf16(nc, (lm_ggml_bf16_t *)(data + i*n1), LM_GGML_FP32_TO_BF16(value)); + } + } break; case LM_GGML_TYPE_F32: { assert(tensor->nb[0] == sizeof(float)); @@ -3253,6 +3861,13 @@ struct lm_ggml_tensor * lm_ggml_set_f32(struct lm_ggml_tensor * tensor, float va lm_ggml_vec_set_f16(nc, (lm_ggml_fp16_t *)(data + i*n1), LM_GGML_FP32_TO_FP16(value)); } } break; + case LM_GGML_TYPE_BF16: + { + 
assert(tensor->nb[0] == sizeof(lm_ggml_bf16_t)); + for (int i = 0; i < n; i++) { + lm_ggml_vec_set_bf16(nc, (lm_ggml_bf16_t *)(data + i*n1), LM_GGML_FP32_TO_BF16(value)); + } + } break; case LM_GGML_TYPE_F32: { assert(tensor->nb[0] == sizeof(float)); @@ -3320,6 +3935,11 @@ int32_t lm_ggml_get_i32_1d(const struct lm_ggml_tensor * tensor, int i) { LM_GGML_ASSERT(tensor->nb[0] == sizeof(lm_ggml_fp16_t)); return LM_GGML_FP16_TO_FP32(((lm_ggml_fp16_t *)(tensor->data))[i]); } + case LM_GGML_TYPE_BF16: + { + LM_GGML_ASSERT(tensor->nb[0] == sizeof(lm_ggml_bf16_t)); + return LM_GGML_BF16_TO_FP32(((lm_ggml_bf16_t *)(tensor->data))[i]); + } case LM_GGML_TYPE_F32: { LM_GGML_ASSERT(tensor->nb[0] == sizeof(float)); @@ -3362,6 +3982,11 @@ void lm_ggml_set_i32_1d(const struct lm_ggml_tensor * tensor, int i, int32_t val LM_GGML_ASSERT(tensor->nb[0] == sizeof(lm_ggml_fp16_t)); ((lm_ggml_fp16_t *)(tensor->data))[i] = LM_GGML_FP32_TO_FP16(value); } break; + case LM_GGML_TYPE_BF16: + { + LM_GGML_ASSERT(tensor->nb[0] == sizeof(lm_ggml_bf16_t)); + ((lm_ggml_bf16_t *)(tensor->data))[i] = LM_GGML_FP32_TO_BF16(value); + } break; case LM_GGML_TYPE_F32: { LM_GGML_ASSERT(tensor->nb[0] == sizeof(float)); @@ -3385,6 +4010,8 @@ int32_t lm_ggml_get_i32_nd(const struct lm_ggml_tensor * tensor, int i0, int i1, return ((int32_t *) data)[0]; case LM_GGML_TYPE_F16: return LM_GGML_FP16_TO_FP32(((lm_ggml_fp16_t *) data)[0]); + case LM_GGML_TYPE_BF16: + return LM_GGML_BF16_TO_FP32(((lm_ggml_bf16_t *) data)[0]); case LM_GGML_TYPE_F32: return ((float *) data)[0]; default: @@ -3413,6 +4040,10 @@ void lm_ggml_set_i32_nd(const struct lm_ggml_tensor * tensor, int i0, int i1, in { ((lm_ggml_fp16_t *)(data))[0] = LM_GGML_FP32_TO_FP16(value); } break; + case LM_GGML_TYPE_BF16: + { + ((lm_ggml_bf16_t *)(data))[0] = LM_GGML_FP32_TO_BF16(value); + } break; case LM_GGML_TYPE_F32: { ((float *)(data))[0] = value; @@ -3451,6 +4082,11 @@ float lm_ggml_get_f32_1d(const struct lm_ggml_tensor * tensor, int i) { 
LM_GGML_ASSERT(tensor->nb[0] == sizeof(lm_ggml_fp16_t)); return LM_GGML_FP16_TO_FP32(((lm_ggml_fp16_t *)(tensor->data))[i]); } + case LM_GGML_TYPE_BF16: + { + LM_GGML_ASSERT(tensor->nb[0] == sizeof(lm_ggml_bf16_t)); + return LM_GGML_BF16_TO_FP32(((lm_ggml_bf16_t *)(tensor->data))[i]); + } case LM_GGML_TYPE_F32: { LM_GGML_ASSERT(tensor->nb[0] == sizeof(float)); @@ -3493,6 +4129,11 @@ void lm_ggml_set_f32_1d(const struct lm_ggml_tensor * tensor, int i, float value LM_GGML_ASSERT(tensor->nb[0] == sizeof(lm_ggml_fp16_t)); ((lm_ggml_fp16_t *)(tensor->data))[i] = LM_GGML_FP32_TO_FP16(value); } break; + case LM_GGML_TYPE_BF16: + { + LM_GGML_ASSERT(tensor->nb[0] == sizeof(lm_ggml_bf16_t)); + ((lm_ggml_bf16_t *)(tensor->data))[i] = LM_GGML_FP32_TO_BF16(value); + } break; case LM_GGML_TYPE_F32: { LM_GGML_ASSERT(tensor->nb[0] == sizeof(float)); @@ -3516,6 +4157,8 @@ float lm_ggml_get_f32_nd(const struct lm_ggml_tensor * tensor, int i0, int i1, i return ((int32_t *) data)[0]; case LM_GGML_TYPE_F16: return LM_GGML_FP16_TO_FP32(((lm_ggml_fp16_t *) data)[0]); + case LM_GGML_TYPE_BF16: + return LM_GGML_BF16_TO_FP32(((lm_ggml_bf16_t *) data)[0]); case LM_GGML_TYPE_F32: return ((float *) data)[0]; default: @@ -3544,6 +4187,10 @@ void lm_ggml_set_f32_nd(const struct lm_ggml_tensor * tensor, int i0, int i1, in { ((lm_ggml_fp16_t *)(data))[0] = LM_GGML_FP32_TO_FP16(value); } break; + case LM_GGML_TYPE_BF16: + { + ((lm_ggml_bf16_t *)(data))[0] = LM_GGML_FP32_TO_BF16(value); + } break; case LM_GGML_TYPE_F32: { ((float *)(data))[0] = value; @@ -3738,7 +4385,11 @@ static struct lm_ggml_tensor * lm_ggml_add_cast_impl( // TODO: support less-strict constraint // LM_GGML_ASSERT(lm_ggml_can_repeat(b, a)); LM_GGML_ASSERT(lm_ggml_can_repeat_rows(b, a)); - LM_GGML_ASSERT(lm_ggml_is_quantized(a->type) || a->type == LM_GGML_TYPE_F16); // currently only supported for quantized input and f16 + + // currently only supported for quantized input and f16 + LM_GGML_ASSERT(lm_ggml_is_quantized(a->type) || + 
a->type == LM_GGML_TYPE_F16 || + a->type == LM_GGML_TYPE_BF16); bool is_node = false; @@ -4231,10 +4882,21 @@ struct lm_ggml_tensor * lm_ggml_repeat_back( // lm_ggml_concat struct lm_ggml_tensor * lm_ggml_concat( - struct lm_ggml_context* ctx, - struct lm_ggml_tensor* a, - struct lm_ggml_tensor* b) { - LM_GGML_ASSERT(a->ne[0] == b->ne[0] && a->ne[1] == b->ne[1] && a->ne[3] == b->ne[3]); + struct lm_ggml_context * ctx, + struct lm_ggml_tensor * a, + struct lm_ggml_tensor * b, + int dim) { + LM_GGML_ASSERT(dim >= 0 && dim < LM_GGML_MAX_DIMS); + + int64_t ne[LM_GGML_MAX_DIMS]; + for (int d = 0; d < LM_GGML_MAX_DIMS; ++d) { + if (d == dim) { + ne[d] = a->ne[d] + b->ne[d]; + continue; + } + LM_GGML_ASSERT(a->ne[d] == b->ne[d]); + ne[d] = a->ne[d]; + } bool is_node = false; @@ -4242,7 +4904,9 @@ struct lm_ggml_tensor * lm_ggml_concat( is_node = true; } - struct lm_ggml_tensor * result = lm_ggml_new_tensor_4d(ctx, a->type, a->ne[0], a->ne[1], a->ne[2] + b->ne[2], a->ne[3]); + struct lm_ggml_tensor * result = lm_ggml_new_tensor(ctx, a->type, LM_GGML_MAX_DIMS, ne); + + lm_ggml_set_op_params_i32(result, 0, dim); result->op = LM_GGML_OP_CONCAT; result->grad = is_node ? lm_ggml_dup_tensor(ctx, result) : NULL; @@ -4362,6 +5026,7 @@ struct lm_ggml_tensor * lm_ggml_leaky_relu( } struct lm_ggml_tensor * result = inplace ? 
lm_ggml_view_tensor(ctx, a) : lm_ggml_dup_tensor(ctx, a); + lm_ggml_set_op_params(result, &negative_slope, sizeof(negative_slope)); result->op = LM_GGML_OP_LEAKY_RELU; @@ -4371,6 +5036,20 @@ struct lm_ggml_tensor * lm_ggml_leaky_relu( return result; } +// lm_ggml_sigmoid + +struct lm_ggml_tensor * lm_ggml_sigmoid( + struct lm_ggml_context * ctx, + struct lm_ggml_tensor * a) { + return lm_ggml_unary(ctx, a, LM_GGML_UNARY_OP_SIGMOID); +} + +struct lm_ggml_tensor * lm_ggml_sigmoid_inplace( + struct lm_ggml_context * ctx, + struct lm_ggml_tensor * a) { + return lm_ggml_unary_inplace(ctx, a, LM_GGML_UNARY_OP_SIGMOID); +} + // lm_ggml_gelu struct lm_ggml_tensor * lm_ggml_gelu( @@ -5454,7 +6133,6 @@ static struct lm_ggml_tensor * lm_ggml_soft_max_impl( struct lm_ggml_context * ctx, struct lm_ggml_tensor * a, struct lm_ggml_tensor * mask, - struct lm_ggml_tensor * pos, float scale, float max_bias, bool inplace) { @@ -5468,18 +6146,8 @@ static struct lm_ggml_tensor * lm_ggml_soft_max_impl( LM_GGML_ASSERT(mask->ne[1] >= a->ne[1]); } - if (pos) { - LM_GGML_ASSERT(lm_ggml_is_vector(pos)); - LM_GGML_ASSERT(pos->type == LM_GGML_TYPE_F16 || pos->type == LM_GGML_TYPE_F32); - LM_GGML_ASSERT(pos->ne[0] == a->ne[0]); - } - - if (pos && mask) { - LM_GGML_ASSERT(pos->type == mask->type); - } - if (max_bias > 0.0f) { - LM_GGML_ASSERT(pos); + LM_GGML_ASSERT(mask); } bool is_node = false; @@ -5497,7 +6165,6 @@ static struct lm_ggml_tensor * lm_ggml_soft_max_impl( result->grad = is_node ? 
lm_ggml_dup_tensor(ctx, result) : NULL; result->src[0] = a; result->src[1] = mask; - result->src[2] = pos; return result; } @@ -5505,23 +6172,22 @@ static struct lm_ggml_tensor * lm_ggml_soft_max_impl( struct lm_ggml_tensor * lm_ggml_soft_max( struct lm_ggml_context * ctx, struct lm_ggml_tensor * a) { - return lm_ggml_soft_max_impl(ctx, a, NULL, NULL, 1.0f, 0.0f, false); + return lm_ggml_soft_max_impl(ctx, a, NULL, 1.0f, 0.0f, false); } struct lm_ggml_tensor * lm_ggml_soft_max_inplace( struct lm_ggml_context * ctx, struct lm_ggml_tensor * a) { - return lm_ggml_soft_max_impl(ctx, a, NULL, NULL, 1.0f, 0.0f, true); + return lm_ggml_soft_max_impl(ctx, a, NULL, 1.0f, 0.0f, true); } struct lm_ggml_tensor * lm_ggml_soft_max_ext( struct lm_ggml_context * ctx, struct lm_ggml_tensor * a, struct lm_ggml_tensor * mask, - struct lm_ggml_tensor * pos, float scale, float max_bias) { - return lm_ggml_soft_max_impl(ctx, a, mask, pos, scale, max_bias, false); + return lm_ggml_soft_max_impl(ctx, a, mask, scale, max_bias, false); } // lm_ggml_soft_max_back @@ -5567,6 +6233,7 @@ static struct lm_ggml_tensor * lm_ggml_rope_impl( struct lm_ggml_context * ctx, struct lm_ggml_tensor * a, struct lm_ggml_tensor * b, + struct lm_ggml_tensor * c, int n_dims, int mode, int n_ctx, @@ -5580,10 +6247,17 @@ static struct lm_ggml_tensor * lm_ggml_rope_impl( float xpos_base, bool xpos_down, bool inplace) { + LM_GGML_ASSERT((mode & 1) == 0 && "mode & 1 == 1 is no longer supported"); + LM_GGML_ASSERT(lm_ggml_is_vector(b)); LM_GGML_ASSERT(b->type == LM_GGML_TYPE_I32); LM_GGML_ASSERT(a->ne[2] == b->ne[0]); + if (c) { + LM_GGML_ASSERT(c->type == LM_GGML_TYPE_F32); + LM_GGML_ASSERT(c->ne[0] >= n_dims / 2); + } + bool is_node = false; if (a->grad) { @@ -5607,6 +6281,7 @@ static struct lm_ggml_tensor * lm_ggml_rope_impl( result->grad = is_node ? 
lm_ggml_dup_tensor(ctx, result) : NULL; result->src[0] = a; result->src[1] = b; + result->src[2] = c; return result; } @@ -5619,7 +6294,7 @@ struct lm_ggml_tensor * lm_ggml_rope( int mode, int n_ctx) { return lm_ggml_rope_impl( - ctx, a, b, n_dims, mode, n_ctx, 0, 10000.0f, 1.0f, 0.0f, 1.0f, 0.0f, 0.0f, 0.0f, false, false + ctx, a, b, NULL, n_dims, mode, n_ctx, 0, 10000.0f, 1.0f, 0.0f, 1.0f, 0.0f, 0.0f, 0.0f, false, false ); } @@ -5631,14 +6306,15 @@ struct lm_ggml_tensor * lm_ggml_rope_inplace( int mode, int n_ctx) { return lm_ggml_rope_impl( - ctx, a, b, n_dims, mode, n_ctx, 0, 10000.0f, 1.0f, 0.0f, 1.0f, 0.0f, 0.0f, 0.0f, false, true + ctx, a, b, NULL, n_dims, mode, n_ctx, 0, 10000.0f, 1.0f, 0.0f, 1.0f, 0.0f, 0.0f, 0.0f, false, true ); } -struct lm_ggml_tensor * lm_ggml_rope_custom( +struct lm_ggml_tensor * lm_ggml_rope_ext( struct lm_ggml_context * ctx, struct lm_ggml_tensor * a, struct lm_ggml_tensor * b, + struct lm_ggml_tensor * c, int n_dims, int mode, int n_ctx, @@ -5650,15 +6326,16 @@ struct lm_ggml_tensor * lm_ggml_rope_custom( float beta_fast, float beta_slow) { return lm_ggml_rope_impl( - ctx, a, b, n_dims, mode, n_ctx, n_orig_ctx, freq_base, freq_scale, + ctx, a, b, c, n_dims, mode, n_ctx, n_orig_ctx, freq_base, freq_scale, ext_factor, attn_factor, beta_fast, beta_slow, 0.0f, false, false ); } -struct lm_ggml_tensor * lm_ggml_rope_custom_inplace( +struct lm_ggml_tensor * lm_ggml_rope_ext_inplace( struct lm_ggml_context * ctx, struct lm_ggml_tensor * a, struct lm_ggml_tensor * b, + struct lm_ggml_tensor * c, int n_dims, int mode, int n_ctx, @@ -5670,19 +6347,49 @@ struct lm_ggml_tensor * lm_ggml_rope_custom_inplace( float beta_fast, float beta_slow) { return lm_ggml_rope_impl( - ctx, a, b, n_dims, mode, n_ctx, n_orig_ctx, freq_base, freq_scale, + ctx, a, b, c, n_dims, mode, n_ctx, n_orig_ctx, freq_base, freq_scale, ext_factor, attn_factor, beta_fast, beta_slow, 0.0f, false, true ); } -struct lm_ggml_tensor * lm_ggml_rope_xpos_inplace( +struct 
lm_ggml_tensor * lm_ggml_rope_custom( + struct lm_ggml_context * ctx, + struct lm_ggml_tensor * a, + struct lm_ggml_tensor * b, + int n_dims, + int mode, + int n_ctx, + int n_orig_ctx, + float freq_base, + float freq_scale, + float ext_factor, + float attn_factor, + float beta_fast, + float beta_slow) { + return lm_ggml_rope_impl( + ctx, a, b, NULL, n_dims, mode, n_ctx, n_orig_ctx, freq_base, freq_scale, + ext_factor, attn_factor, beta_fast, beta_slow, 0.0f, false, false + ); +} + +struct lm_ggml_tensor * lm_ggml_rope_custom_inplace( struct lm_ggml_context * ctx, struct lm_ggml_tensor * a, struct lm_ggml_tensor * b, int n_dims, - float base, - bool down) { - return lm_ggml_rope_impl(ctx, a, b, n_dims, 0, 0, 0, 10000.0f, 1.0f, 0.0f, 1.0f, 0.0f, 0.0f, base, down, true); + int mode, + int n_ctx, + int n_orig_ctx, + float freq_base, + float freq_scale, + float ext_factor, + float attn_factor, + float beta_fast, + float beta_slow) { + return lm_ggml_rope_impl( + ctx, a, b, NULL, n_dims, mode, n_ctx, n_orig_ctx, freq_base, freq_scale, + ext_factor, attn_factor, beta_fast, beta_slow, 0.0f, false, true + ); } // lm_ggml_rope_back @@ -5691,6 +6398,7 @@ struct lm_ggml_tensor * lm_ggml_rope_back( struct lm_ggml_context * ctx, struct lm_ggml_tensor * a, struct lm_ggml_tensor * b, + struct lm_ggml_tensor * c, int n_dims, int mode, int n_ctx, @@ -5706,6 +6414,7 @@ struct lm_ggml_tensor * lm_ggml_rope_back( LM_GGML_ASSERT(lm_ggml_is_vector(b)); LM_GGML_ASSERT(b->type == LM_GGML_TYPE_I32); LM_GGML_ASSERT(a->ne[2] == b->ne[0]); + LM_GGML_ASSERT(c == NULL && "freq factors not implemented yet"); LM_GGML_ASSERT((mode & 4) == 0 && "lm_ggml_rope_back() for ChatGLM not implemented yet"); @@ -5736,37 +6445,6 @@ struct lm_ggml_tensor * lm_ggml_rope_back( return result; } -// lm_ggml_alibi - -struct lm_ggml_tensor * lm_ggml_alibi( - struct lm_ggml_context * ctx, - struct lm_ggml_tensor * a, - int n_past, - int n_head, - float bias_max) { - LM_GGML_ASSERT(n_past >= 0); - bool is_node = 
false; - - if (a->grad) { - LM_GGML_ASSERT(false); // TODO: implement backward - is_node = true; - } - - // TODO: when implement backward, fix this: - //struct lm_ggml_tensor * result = inplace ? lm_ggml_view_tensor(ctx, a) : lm_ggml_dup_tensor(ctx, a); - struct lm_ggml_tensor * result = lm_ggml_view_tensor(ctx, a); - - int32_t op_params[3] = { n_past, n_head }; - memcpy(op_params + 2, &bias_max, sizeof(float)); - lm_ggml_set_op_params(result, op_params, sizeof(op_params)); - - result->op = LM_GGML_OP_ALIBI; - result->grad = is_node ? lm_ggml_dup_tensor(ctx, result) : NULL; - result->src[0] = a; - - return result; -} - // lm_ggml_clamp struct lm_ggml_tensor * lm_ggml_clamp( @@ -6116,7 +6794,10 @@ struct lm_ggml_tensor * lm_ggml_pool_2d( static struct lm_ggml_tensor * lm_ggml_upscale_impl( struct lm_ggml_context * ctx, struct lm_ggml_tensor * a, - int scale_factor) { + int ne0, + int ne1, + int ne2, + int ne3) { bool is_node = false; if (a->grad) { @@ -6124,19 +6805,45 @@ static struct lm_ggml_tensor * lm_ggml_upscale_impl( is_node = true; } + LM_GGML_ASSERT(a->ne[0] <= ne0); + LM_GGML_ASSERT(a->ne[1] <= ne1); + LM_GGML_ASSERT(a->ne[2] <= ne2); + LM_GGML_ASSERT(a->ne[3] <= ne3); + struct lm_ggml_tensor * result = lm_ggml_new_tensor_4d(ctx, a->type, - a->ne[0] * scale_factor, - a->ne[1] * scale_factor, - a->ne[2], a->ne[3]); + ne0, + ne1, + ne2, + ne3 + ); result->op = LM_GGML_OP_UPSCALE; - result->op_params[0] = scale_factor; + result->grad = is_node ? 
lm_ggml_dup_tensor(ctx, result) : NULL; result->src[0] = a; return result; } +struct lm_ggml_tensor * lm_ggml_upscale( + struct lm_ggml_context * ctx, + struct lm_ggml_tensor * a, + int scale_factor) { + return lm_ggml_upscale_impl(ctx, a, a->ne[0] * scale_factor, a->ne[1] * scale_factor, a->ne[2], a->ne[3]); +} + +struct lm_ggml_tensor * lm_ggml_upscale_ext( + struct lm_ggml_context * ctx, + struct lm_ggml_tensor * a, + int ne0, + int ne1, + int ne2, + int ne3) { + return lm_ggml_upscale_impl(ctx, a, ne0, ne1, ne2, ne3); +} + +// lm_ggml_pad + struct lm_ggml_tensor * lm_ggml_pad( struct lm_ggml_context * ctx, struct lm_ggml_tensor * a, @@ -6161,12 +6868,7 @@ struct lm_ggml_tensor * lm_ggml_pad( return result; } -struct lm_ggml_tensor * lm_ggml_upscale( - struct lm_ggml_context * ctx, - struct lm_ggml_tensor * a, - int scale_factor) { - return lm_ggml_upscale_impl(ctx, a, scale_factor); -} +// lm_ggml_arange struct lm_ggml_tensor * lm_ggml_arange( struct lm_ggml_context * ctx, @@ -6188,6 +6890,8 @@ struct lm_ggml_tensor * lm_ggml_arange( return result; } +// lm_ggml_timestep_embedding + struct lm_ggml_tensor * lm_ggml_timestep_embedding( struct lm_ggml_context * ctx, struct lm_ggml_tensor * timesteps, @@ -6254,38 +6958,6 @@ struct lm_ggml_tensor * lm_ggml_top_k( return result; } -// lm_ggml_flash_attn - -struct lm_ggml_tensor * lm_ggml_flash_attn( - struct lm_ggml_context * ctx, - struct lm_ggml_tensor * q, - struct lm_ggml_tensor * k, - struct lm_ggml_tensor * v, - bool masked) { - LM_GGML_ASSERT(lm_ggml_can_mul_mat(k, q)); - // TODO: check if vT can be multiplied by (k*qT) - - bool is_node = false; - - if (q->grad || k->grad || v->grad) { - is_node = true; - } - - //struct lm_ggml_tensor * result = lm_ggml_dup_tensor(ctx, q); - struct lm_ggml_tensor * result = lm_ggml_new_tensor(ctx, LM_GGML_TYPE_F32, LM_GGML_MAX_DIMS, q->ne); - - int32_t t = masked ? 
1 : 0; - lm_ggml_set_op_params(result, &t, sizeof(t)); - - result->op = LM_GGML_OP_FLASH_ATTN; - result->grad = is_node ? lm_ggml_dup_tensor(ctx, result) : NULL; - result->src[0] = q; - result->src[1] = k; - result->src[2] = v; - - return result; -} - // lm_ggml_flash_attn_ext struct lm_ggml_tensor * lm_ggml_flash_attn_ext( @@ -6294,9 +6966,11 @@ struct lm_ggml_tensor * lm_ggml_flash_attn_ext( struct lm_ggml_tensor * k, struct lm_ggml_tensor * v, struct lm_ggml_tensor * mask, - float scale) { + float scale, + float max_bias) { LM_GGML_ASSERT(lm_ggml_can_mul_mat(k, q)); // TODO: check if vT can be multiplied by (k*qT) + if (mask) { LM_GGML_ASSERT(lm_ggml_is_contiguous(mask)); LM_GGML_ASSERT(mask->ne[2] == 1); @@ -6306,6 +6980,10 @@ struct lm_ggml_tensor * lm_ggml_flash_attn_ext( //LM_GGML_ASSERT(lm_ggml_can_repeat_rows(mask, qk)); } + if (max_bias > 0.0f) { + LM_GGML_ASSERT(mask); + } + bool is_node = false; if (q->grad || k->grad || v->grad) { @@ -6316,7 +6994,7 @@ struct lm_ggml_tensor * lm_ggml_flash_attn_ext( int64_t ne[4] = { q->ne[0], q->ne[2], q->ne[1], q->ne[3] }; struct lm_ggml_tensor * result = lm_ggml_new_tensor(ctx, LM_GGML_TYPE_F32, 4, ne); - float params[] = { scale }; + float params[] = { scale, max_bias }; lm_ggml_set_op_params(result, params, sizeof(params)); result->op = LM_GGML_OP_FLASH_ATTN_EXT; @@ -6336,39 +7014,7 @@ void lm_ggml_flash_attn_ext_set_prec( const int32_t prec_i32 = (int32_t) prec; - lm_ggml_set_op_params_i32(a, 1, prec_i32); // scale is on first pos -} - -// lm_ggml_flash_ff - -struct lm_ggml_tensor * lm_ggml_flash_ff( - struct lm_ggml_context * ctx, - struct lm_ggml_tensor * a, - struct lm_ggml_tensor * b0, - struct lm_ggml_tensor * b1, - struct lm_ggml_tensor * c0, - struct lm_ggml_tensor * c1) { - LM_GGML_ASSERT(lm_ggml_can_mul_mat(b0, a)); - // TODO: more checks - - bool is_node = false; - - if (a->grad || b0->grad || b1->grad || c0->grad || c1->grad) { - is_node = true; - } - - //struct lm_ggml_tensor * result = 
lm_ggml_dup_tensor(ctx, a); - struct lm_ggml_tensor * result = lm_ggml_new_tensor(ctx, LM_GGML_TYPE_F32, LM_GGML_MAX_DIMS, a->ne); - - result->op = LM_GGML_OP_FLASH_FF; - result->grad = is_node ? lm_ggml_dup_tensor(ctx, result) : NULL; - result->src[0] = a; - result->src[1] = b0; - result->src[2] = b1; - result->src[3] = c0; - result->src[4] = c1; - - return result; + lm_ggml_set_op_params_i32(a, 2, prec_i32); // scale is on first pos, max_bias on second } // lm_ggml_flash_attn_back @@ -6380,6 +7026,8 @@ struct lm_ggml_tensor * lm_ggml_flash_attn_back( struct lm_ggml_tensor * v, struct lm_ggml_tensor * d, bool masked) { + LM_GGML_ASSERT(false && "TODO: adapt to lm_ggml_flash_attn_ext() changes"); + LM_GGML_ASSERT(lm_ggml_can_mul_mat(k, q)); // TODO: check if vT can be multiplied by (k*qT) @@ -7215,8 +7863,8 @@ static void lm_ggml_compute_forward_dup_same_cont( ((char *) src0->data + ie0*nb00), (ie1 - ie0) * lm_ggml_type_size(src0->type)); } - } + static void lm_ggml_compute_forward_dup_f16( const struct lm_ggml_compute_params * params, struct lm_ggml_tensor * dst) { @@ -7490,7 +8138,7 @@ static void lm_ggml_compute_forward_dup_f16( } } -static void lm_ggml_compute_forward_dup_f32( +static void lm_ggml_compute_forward_dup_bf16( const struct lm_ggml_compute_params * params, struct lm_ggml_tensor * dst) { @@ -7538,10 +8186,11 @@ static void lm_ggml_compute_forward_dup_f32( return; } + // TODO: add more special-case implementations for tensor shapes/strides that can benefit from memcpy + if (lm_ggml_is_contiguous(dst)) { - // TODO: simplify - if (nb00 == sizeof(float)) { - if (dst->type == LM_GGML_TYPE_F32) { + if (nb00 == sizeof(lm_ggml_bf16_t)) { + if (dst->type == LM_GGML_TYPE_BF16) { size_t id = 0; const size_t rs = ne00 * nb00; char * dst_ptr = (char *) dst->data; @@ -7557,20 +8206,60 @@ static void lm_ggml_compute_forward_dup_f32( id += rs * (ne01 - ir1); } } - } else if (type_traits[dst->type].from_float) { - lm_ggml_from_float_t const quantize_row_q = 
type_traits[dst->type].from_float; - + } else if (dst->type == LM_GGML_TYPE_F16) { size_t id = 0; - size_t rs = nb0 * (ne00 / lm_ggml_blck_size(dst->type)); - char * dst_ptr = (char *) dst->data; + lm_ggml_fp16_t * dst_ptr = (lm_ggml_fp16_t *) dst->data; for (int i03 = 0; i03 < ne03; i03++) { for (int i02 = 0; i02 < ne02; i02++) { - id += rs * ir0; + id += ne00 * ir0; for (int i01 = ir0; i01 < ir1; i01++) { - const float * src0_ptr = (float *) ((char *) src0->data + i01*nb01 + i02*nb02 + i03*nb03); - quantize_row_q(src0_ptr, dst_ptr + id, ne00); - id += rs; + const lm_ggml_bf16_t * src0_ptr = (lm_ggml_bf16_t *) ((char *) src0->data + i01*nb01 + i02*nb02 + i03*nb03); + for (int i00 = 0; i00 < ne00; i00++) { + dst_ptr[id] = LM_GGML_FP32_TO_FP16(LM_GGML_BF16_TO_FP32(src0_ptr[i00])); + id++; + } + } + id += ne00 * (ne01 - ir1); + } + } + } else if (dst->type == LM_GGML_TYPE_F32) { + size_t id = 0; + float * dst_ptr = (float *) dst->data; + + for (int i03 = 0; i03 < ne03; i03++) { + for (int i02 = 0; i02 < ne02; i02++) { + id += ne00 * ir0; + for (int i01 = ir0; i01 < ir1; i01++) { + const lm_ggml_bf16_t * src0_ptr = (lm_ggml_bf16_t *) ((char *) src0->data + i01*nb01 + i02*nb02 + i03*nb03); + for (int i00 = 0; i00 < ne00; i00++) { + dst_ptr[id] = LM_GGML_BF16_TO_FP32(src0_ptr[i00]); + id++; + } + } + id += ne00 * (ne01 - ir1); + } + } + } else if (type_traits[dst->type].from_float) { + lm_ggml_from_float_t const quantize_row_q = type_traits[dst->type].from_float; + float * src0_f32 = (float *) params->wdata + (ne00 + CACHE_LINE_SIZE_F32) * ith; + + size_t id = 0; + size_t rs = nb0 * (ne00 / lm_ggml_blck_size(dst->type)); + char * dst_ptr = (char *) dst->data; + + for (int i03 = 0; i03 < ne03; i03++) { + for (int i02 = 0; i02 < ne02; i02++) { + id += rs * ir0; + for (int i01 = ir0; i01 < ir1; i01++) { + const lm_ggml_bf16_t * src0_ptr = (lm_ggml_bf16_t *) ((char *) src0->data + i01*nb01 + i02*nb02 + i03*nb03); + + for (int i00 = 0; i00 < ne00; i00++) { + src0_f32[i00] = 
LM_GGML_BF16_TO_FP32(src0_ptr[i00]); + } + + quantize_row_q(src0_f32, dst_ptr + id, ne00); + id += rs; } id += rs * (ne01 - ir1); } @@ -7590,7 +8279,25 @@ static void lm_ggml_compute_forward_dup_f32( id += ne00 * ir0; for (int i01 = ir0; i01 < ir1; i01++) { for (int i00 = 0; i00 < ne00; i00++) { - const float * src0_ptr = (float *) ((char *) src0->data + i00*nb00 + i01*nb01 + i02*nb02 + i03*nb03); + const lm_ggml_bf16_t * src0_ptr = (lm_ggml_bf16_t *) ((char *) src0->data + i00*nb00 + i01*nb01 + i02*nb02 + i03*nb03); + + dst_ptr[id] = LM_GGML_BF16_TO_FP32(*src0_ptr); + id++; + } + } + id += ne00 * (ne01 - ir1); + } + } + } else if (dst->type == LM_GGML_TYPE_BF16) { + size_t id = 0; + lm_ggml_bf16_t * dst_ptr = (lm_ggml_bf16_t *) dst->data; + + for (int i03 = 0; i03 < ne03; i03++) { + for (int i02 = 0; i02 < ne02; i02++) { + id += ne00 * ir0; + for (int i01 = ir0; i01 < ir1; i01++) { + for (int i00 = 0; i00 < ne00; i00++) { + const lm_ggml_bf16_t * src0_ptr = (lm_ggml_bf16_t *) ((char *) src0->data + i00*nb00 + i01*nb01 + i02*nb02 + i03*nb03); dst_ptr[id] = *src0_ptr; id++; @@ -7608,9 +8315,9 @@ static void lm_ggml_compute_forward_dup_f32( id += ne00 * ir0; for (int i01 = ir0; i01 < ir1; i01++) { for (int i00 = 0; i00 < ne00; i00++) { - const float * src0_ptr = (float *) ((char *) src0->data + i00*nb00 + i01*nb01 + i02*nb02 + i03*nb03); + const lm_ggml_bf16_t * src0_ptr = (lm_ggml_bf16_t *) ((char *) src0->data + i00*nb00 + i01*nb01 + i02*nb02 + i03*nb03); - dst_ptr[id] = LM_GGML_FP32_TO_FP16(*src0_ptr); + dst_ptr[id] = LM_GGML_FP32_TO_FP16(LM_GGML_BF16_TO_FP32(*src0_ptr)); id++; } } @@ -7621,18 +8328,16 @@ static void lm_ggml_compute_forward_dup_f32( LM_GGML_ASSERT(false); // TODO: implement } } - return; } // dst counters - int64_t i10 = 0; int64_t i11 = 0; int64_t i12 = 0; int64_t i13 = 0; - if (dst->type == LM_GGML_TYPE_F32) { + if (dst->type == LM_GGML_TYPE_BF16) { for (int64_t i03 = 0; i03 < ne03; i03++) { for (int64_t i02 = 0; i02 < ne02; i02++) { i10 += ne00 
* ir0; @@ -7653,7 +8358,59 @@ static void lm_ggml_compute_forward_dup_f32( const char * src0_ptr = ((char *) src0->data + i00*nb00 + i01*nb01 + i02*nb02 + i03*nb03); char * dst_ptr = ((char *) dst->data + i10*nb0 + i11*nb1 + i12*nb2 + i13*nb3); - memcpy(dst_ptr, src0_ptr, sizeof(float)); + memcpy(dst_ptr, src0_ptr, sizeof(lm_ggml_bf16_t)); + + if (++i10 == ne00) { + i10 = 0; + if (++i11 == ne01) { + i11 = 0; + if (++i12 == ne02) { + i12 = 0; + if (++i13 == ne03) { + i13 = 0; + } + } + } + } + } + } + i10 += ne00 * (ne01 - ir1); + while (i10 >= ne0) { + i10 -= ne0; + if (++i11 == ne1) { + i11 = 0; + if (++i12 == ne2) { + i12 = 0; + if (++i13 == ne3) { + i13 = 0; + } + } + } + } + } + } + } else if (dst->type == LM_GGML_TYPE_F16) { + for (int64_t i03 = 0; i03 < ne03; i03++) { + for (int64_t i02 = 0; i02 < ne02; i02++) { + i10 += ne00 * ir0; + while (i10 >= ne0) { + i10 -= ne0; + if (++i11 == ne1) { + i11 = 0; + if (++i12 == ne2) { + i12 = 0; + if (++i13 == ne3) { + i13 = 0; + } + } + } + } + for (int64_t i01 = ir0; i01 < ir1; i01++) { + for (int64_t i00 = 0; i00 < ne00; i00++) { + const char * src0_ptr = ((char *) src0->data + i00*nb00 + i01*nb01 + i02*nb02 + i03*nb03); + char * dst_ptr = ((char *) dst->data + i10*nb0 + i11*nb1 + i12*nb2 + i13*nb3); + + *(lm_ggml_fp16_t *) dst_ptr = LM_GGML_FP32_TO_FP16(LM_GGML_BF16_TO_FP32(*(const lm_ggml_bf16_t *) src0_ptr)); if (++i10 == ne0) { i10 = 0; @@ -7684,7 +8441,7 @@ static void lm_ggml_compute_forward_dup_f32( } } } - } else if (dst->type == LM_GGML_TYPE_F16) { + } else if (dst->type == LM_GGML_TYPE_F32) { for (int64_t i03 = 0; i03 < ne03; i03++) { for (int64_t i02 = 0; i02 < ne02; i02++) { i10 += ne00 * ir0; @@ -7705,7 +8462,7 @@ static void lm_ggml_compute_forward_dup_f32( const char * src0_ptr = ((char *) src0->data + i00*nb00 + i01*nb01 + i02*nb02 + i03*nb03); char * dst_ptr = ((char *) dst->data + i10*nb0 + i11*nb1 + i12*nb2 + i13*nb3); - *(lm_ggml_fp16_t *) dst_ptr = LM_GGML_FP32_TO_FP16(*(const float *) src0_ptr); 
+ *(float *) dst_ptr = LM_GGML_BF16_TO_FP32(*(const lm_ggml_bf16_t *) src0_ptr); if (++i10 == ne0) { i10 = 0; @@ -7741,31 +8498,27 @@ static void lm_ggml_compute_forward_dup_f32( } } -// A simplified version of lm_ggml_compute_forward_dup that doesn't do float upcasting, and just plain old memcpy. -static void lm_ggml_compute_forward_dup_bytes( +static void lm_ggml_compute_forward_dup_f32( const struct lm_ggml_compute_params * params, struct lm_ggml_tensor * dst) { const struct lm_ggml_tensor * src0 = dst->src[0]; LM_GGML_ASSERT(lm_ggml_nelements(dst) == lm_ggml_nelements(src0)); - LM_GGML_ASSERT(src0->type == dst->type); if (params->type == LM_GGML_TASK_TYPE_INIT || params->type == LM_GGML_TASK_TYPE_FINALIZE) { return; } - if (lm_ggml_is_contiguous(src0) && lm_ggml_is_contiguous(dst)) { - lm_ggml_compute_forward_dup_same_cont(params, dst); - return; - } - - LM_GGML_TENSOR_UNARY_OP_LOCALS; + LM_GGML_TENSOR_UNARY_OP_LOCALS - const size_t type_size = lm_ggml_type_size(src0->type); const int ith = params->ith; // thread index const int nth = params->nth; // number of threads + if (lm_ggml_is_contiguous(src0) && lm_ggml_is_contiguous(dst) && src0->type == dst->type) { + lm_ggml_compute_forward_dup_same_cont(params, dst); + return; + } // parallelize by rows const int nr = ne01; @@ -7777,9 +8530,9 @@ static void lm_ggml_compute_forward_dup_bytes( if (src0->type == dst->type && ne00 == ne0 && - nb00 == type_size && nb0 == type_size) { + nb00 == lm_ggml_type_size(src0->type) && nb0 == lm_ggml_type_size(dst->type)) { // copy by rows - const size_t rs = ne00 * type_size; + const size_t rs = ne00*nb00; for (int64_t i03 = 0; i03 < ne03; i03++) { for (int64_t i02 = 0; i02 < ne02; i02++) { for (int64_t i01 = ir0; i01 < ir1; i01++) { @@ -7794,39 +8547,104 @@ static void lm_ggml_compute_forward_dup_bytes( } if (lm_ggml_is_contiguous(dst)) { - size_t id = 0; - char * dst_ptr = (char *) dst->data; - const size_t rs = ne00 * type_size; + // TODO: simplify + if (nb00 == 
sizeof(float)) { + if (dst->type == LM_GGML_TYPE_F32) { + size_t id = 0; + const size_t rs = ne00 * nb00; + char * dst_ptr = (char *) dst->data; - if (nb00 == type_size) { - // src0 is contigous on first dimension, copy by rows - for (int64_t i03 = 0; i03 < ne03; i03++) { - for (int64_t i02 = 0; i02 < ne02; i02++) { - id += rs * ir0; - for (int64_t i01 = ir0; i01 < ir1; i01++) { - const char * src0_ptr = (char *) src0->data + i01*nb01 + i02*nb02 + i03*nb03; - memcpy(dst_ptr + id, src0_ptr, rs); - id += rs; + for (int i03 = 0; i03 < ne03; i03++) { + for (int i02 = 0; i02 < ne02; i02++) { + id += rs * ir0; + for (int i01 = ir0; i01 < ir1; i01++) { + const char * src0_ptr = (char *) src0->data + i01*nb01 + i02*nb02 + i03*nb03; + memcpy(dst_ptr + id, src0_ptr, rs); + id += rs; + } + id += rs * (ne01 - ir1); + } + } + } else if (type_traits[dst->type].from_float) { + lm_ggml_from_float_t const quantize_row_q = type_traits[dst->type].from_float; + + size_t id = 0; + size_t rs = nb0 * (ne00 / lm_ggml_blck_size(dst->type)); + char * dst_ptr = (char *) dst->data; + + for (int i03 = 0; i03 < ne03; i03++) { + for (int i02 = 0; i02 < ne02; i02++) { + id += rs * ir0; + for (int i01 = ir0; i01 < ir1; i01++) { + const float * src0_ptr = (float *) ((char *) src0->data + i01*nb01 + i02*nb02 + i03*nb03); + quantize_row_q(src0_ptr, dst_ptr + id, ne00); + id += rs; + } + id += rs * (ne01 - ir1); } - id += rs * (ne01 - ir1); } + } else { + LM_GGML_ASSERT(false); // TODO: implement } } else { //printf("%s: this is not optimal - fix me\n", __func__); - for (int64_t i03 = 0; i03 < ne03; i03++) { - for (int64_t i02 = 0; i02 < ne02; i02++) { - id += rs * ir0; - for (int64_t i01 = ir0; i01 < ir1; i01++) { - for (int64_t i00 = 0; i00 < ne00; i00++) { - const char * src0_ptr = (char *) src0->data + i00*nb00 + i01*nb01 + i02*nb02 + i03*nb03; - memcpy(dst_ptr + id, src0_ptr, type_size); + if (dst->type == LM_GGML_TYPE_F32) { + size_t id = 0; + float * dst_ptr = (float *) dst->data; - id += 
type_size; + for (int i03 = 0; i03 < ne03; i03++) { + for (int i02 = 0; i02 < ne02; i02++) { + id += ne00 * ir0; + for (int i01 = ir0; i01 < ir1; i01++) { + for (int i00 = 0; i00 < ne00; i00++) { + const float * src0_ptr = (float *) ((char *) src0->data + i00*nb00 + i01*nb01 + i02*nb02 + i03*nb03); + + dst_ptr[id] = *src0_ptr; + id++; + } } + id += ne00 * (ne01 - ir1); + } + } + } else if (dst->type == LM_GGML_TYPE_F16) { + size_t id = 0; + lm_ggml_fp16_t * dst_ptr = (lm_ggml_fp16_t *) dst->data; + + for (int i03 = 0; i03 < ne03; i03++) { + for (int i02 = 0; i02 < ne02; i02++) { + id += ne00 * ir0; + for (int i01 = ir0; i01 < ir1; i01++) { + for (int i00 = 0; i00 < ne00; i00++) { + const float * src0_ptr = (float *) ((char *) src0->data + i00*nb00 + i01*nb01 + i02*nb02 + i03*nb03); + + dst_ptr[id] = LM_GGML_FP32_TO_FP16(*src0_ptr); + id++; + } + } + id += ne00 * (ne01 - ir1); } - id += rs * (ne01 - ir1); } + } else if (dst->type == LM_GGML_TYPE_BF16) { + size_t id = 0; + lm_ggml_bf16_t * dst_ptr = (lm_ggml_bf16_t *) dst->data; + + for (int i03 = 0; i03 < ne03; i03++) { + for (int i02 = 0; i02 < ne02; i02++) { + id += ne00 * ir0; + for (int i01 = ir0; i01 < ir1; i01++) { + for (int i00 = 0; i00 < ne00; i00++) { + const float * src0_ptr = (float *) ((char *) src0->data + i00*nb00 + i01*nb01 + i02*nb02 + i03*nb03); + + dst_ptr[id] = LM_GGML_FP32_TO_BF16(*src0_ptr); + id++; + } + } + id += ne00 * (ne01 - ir1); + } + } + } else { + LM_GGML_ASSERT(false); // TODO: implement } } @@ -7840,21 +8658,281 @@ static void lm_ggml_compute_forward_dup_bytes( int64_t i12 = 0; int64_t i13 = 0; - for (int64_t i03 = 0; i03 < ne03; i03++) { - for (int64_t i02 = 0; i02 < ne02; i02++) { - i10 += ne00 * ir0; - while (i10 >= ne0) { - i10 -= ne0; - if (++i11 == ne1) { - i11 = 0; - if (++i12 == ne2) { - i12 = 0; - if (++i13 == ne3) { - i13 = 0; - } - } - } - } + if (dst->type == LM_GGML_TYPE_F32) { + for (int64_t i03 = 0; i03 < ne03; i03++) { + for (int64_t i02 = 0; i02 < ne02; i02++) { + 
i10 += ne00 * ir0; + while (i10 >= ne0) { + i10 -= ne0; + if (++i11 == ne1) { + i11 = 0; + if (++i12 == ne2) { + i12 = 0; + if (++i13 == ne3) { + i13 = 0; + } + } + } + } + for (int64_t i01 = ir0; i01 < ir1; i01++) { + for (int64_t i00 = 0; i00 < ne00; i00++) { + const char * src0_ptr = ((char *) src0->data + i00*nb00 + i01*nb01 + i02*nb02 + i03*nb03); + char * dst_ptr = ((char *) dst->data + i10*nb0 + i11*nb1 + i12*nb2 + i13*nb3); + + memcpy(dst_ptr, src0_ptr, sizeof(float)); + + if (++i10 == ne0) { + i10 = 0; + if (++i11 == ne1) { + i11 = 0; + if (++i12 == ne2) { + i12 = 0; + if (++i13 == ne3) { + i13 = 0; + } + } + } + } + } + } + i10 += ne00 * (ne01 - ir1); + while (i10 >= ne0) { + i10 -= ne0; + if (++i11 == ne1) { + i11 = 0; + if (++i12 == ne2) { + i12 = 0; + if (++i13 == ne3) { + i13 = 0; + } + } + } + } + } + } + } else if (dst->type == LM_GGML_TYPE_F16) { + for (int64_t i03 = 0; i03 < ne03; i03++) { + for (int64_t i02 = 0; i02 < ne02; i02++) { + i10 += ne00 * ir0; + while (i10 >= ne0) { + i10 -= ne0; + if (++i11 == ne1) { + i11 = 0; + if (++i12 == ne2) { + i12 = 0; + if (++i13 == ne3) { + i13 = 0; + } + } + } + } + for (int64_t i01 = ir0; i01 < ir1; i01++) { + for (int64_t i00 = 0; i00 < ne00; i00++) { + const char * src0_ptr = ((char *) src0->data + i00*nb00 + i01*nb01 + i02*nb02 + i03*nb03); + char * dst_ptr = ((char *) dst->data + i10*nb0 + i11*nb1 + i12*nb2 + i13*nb3); + + *(lm_ggml_fp16_t *) dst_ptr = LM_GGML_FP32_TO_FP16(*(const float *) src0_ptr); + + if (++i10 == ne0) { + i10 = 0; + if (++i11 == ne1) { + i11 = 0; + if (++i12 == ne2) { + i12 = 0; + if (++i13 == ne3) { + i13 = 0; + } + } + } + } + } + } + i10 += ne00 * (ne01 - ir1); + while (i10 >= ne0) { + i10 -= ne0; + if (++i11 == ne1) { + i11 = 0; + if (++i12 == ne2) { + i12 = 0; + if (++i13 == ne3) { + i13 = 0; + } + } + } + } + } + } + } else if (dst->type == LM_GGML_TYPE_BF16) { + for (int64_t i03 = 0; i03 < ne03; i03++) { + for (int64_t i02 = 0; i02 < ne02; i02++) { + i10 += ne00 * ir0; + 
while (i10 >= ne0) { + i10 -= ne0; + if (++i11 == ne1) { + i11 = 0; + if (++i12 == ne2) { + i12 = 0; + if (++i13 == ne3) { + i13 = 0; + } + } + } + } + for (int64_t i01 = ir0; i01 < ir1; i01++) { + for (int64_t i00 = 0; i00 < ne00; i00++) { + const char * src0_ptr = ((char *) src0->data + i00*nb00 + i01*nb01 + i02*nb02 + i03*nb03); + char * dst_ptr = ((char *) dst->data + i10*nb0 + i11*nb1 + i12*nb2 + i13*nb3); + + *(lm_ggml_bf16_t *) dst_ptr = LM_GGML_FP32_TO_BF16(*(const float *) src0_ptr); + + if (++i10 == ne0) { + i10 = 0; + if (++i11 == ne1) { + i11 = 0; + if (++i12 == ne2) { + i12 = 0; + if (++i13 == ne3) { + i13 = 0; + } + } + } + } + } + } + i10 += ne00 * (ne01 - ir1); + while (i10 >= ne0) { + i10 -= ne0; + if (++i11 == ne1) { + i11 = 0; + if (++i12 == ne2) { + i12 = 0; + if (++i13 == ne3) { + i13 = 0; + } + } + } + } + } + } + } else { + LM_GGML_ASSERT(false); // TODO: implement + } +} + +// A simplified version of lm_ggml_compute_forward_dup that doesn't do float upcasting, and just plain old memcpy. 
+static void lm_ggml_compute_forward_dup_bytes( + const struct lm_ggml_compute_params * params, + struct lm_ggml_tensor * dst) { + + const struct lm_ggml_tensor * src0 = dst->src[0]; + + LM_GGML_ASSERT(lm_ggml_nelements(dst) == lm_ggml_nelements(src0)); + LM_GGML_ASSERT(src0->type == dst->type); + + if (params->type == LM_GGML_TASK_TYPE_INIT || params->type == LM_GGML_TASK_TYPE_FINALIZE) { + return; + } + + if (lm_ggml_is_contiguous(src0) && lm_ggml_is_contiguous(dst)) { + lm_ggml_compute_forward_dup_same_cont(params, dst); + return; + } + + LM_GGML_TENSOR_UNARY_OP_LOCALS; + + const size_t type_size = lm_ggml_type_size(src0->type); + const int ith = params->ith; // thread index + const int nth = params->nth; // number of threads + + + // parallelize by rows + const int nr = ne01; + // number of rows per thread + const int dr = (nr + nth - 1) / nth; + // row range for this thread + const int ir0 = dr * ith; + const int ir1 = MIN(ir0 + dr, nr); + + if (src0->type == dst->type && + ne00 == ne0 && + nb00 == type_size && nb0 == type_size) { + // copy by rows + const size_t rs = ne00 * type_size; + for (int64_t i03 = 0; i03 < ne03; i03++) { + for (int64_t i02 = 0; i02 < ne02; i02++) { + for (int64_t i01 = ir0; i01 < ir1; i01++) { + memcpy( + ((char *) dst->data + i01*nb1 + i02*nb2 + i03*nb3), + ((char *) src0->data + i01*nb01 + i02*nb02 + i03*nb03), + rs); + } + } + } + return; + } + + if (lm_ggml_is_contiguous(dst)) { + size_t id = 0; + char * dst_ptr = (char *) dst->data; + const size_t rs = ne00 * type_size; + + if (nb00 == type_size) { + // src0 is contigous on first dimension, copy by rows + for (int64_t i03 = 0; i03 < ne03; i03++) { + for (int64_t i02 = 0; i02 < ne02; i02++) { + id += rs * ir0; + for (int64_t i01 = ir0; i01 < ir1; i01++) { + const char * src0_ptr = (char *) src0->data + i01*nb01 + i02*nb02 + i03*nb03; + memcpy(dst_ptr + id, src0_ptr, rs); + id += rs; + } + id += rs * (ne01 - ir1); + } + } + } else { + //printf("%s: this is not optimal - fix me\n", 
__func__); + + for (int64_t i03 = 0; i03 < ne03; i03++) { + for (int64_t i02 = 0; i02 < ne02; i02++) { + id += rs * ir0; + for (int64_t i01 = ir0; i01 < ir1; i01++) { + for (int64_t i00 = 0; i00 < ne00; i00++) { + const char * src0_ptr = (char *) src0->data + i00*nb00 + i01*nb01 + i02*nb02 + i03*nb03; + memcpy(dst_ptr + id, src0_ptr, type_size); + + id += type_size; + } + } + id += rs * (ne01 - ir1); + } + } + } + + return; + } + + // dst counters + + int64_t i10 = 0; + int64_t i11 = 0; + int64_t i12 = 0; + int64_t i13 = 0; + + for (int64_t i03 = 0; i03 < ne03; i03++) { + for (int64_t i02 = 0; i02 < ne02; i02++) { + i10 += ne00 * ir0; + while (i10 >= ne0) { + i10 -= ne0; + if (++i11 == ne1) { + i11 = 0; + if (++i12 == ne2) { + i12 = 0; + if (++i13 == ne3) { + i13 = 0; + } + } + } + } for (int64_t i01 = ir0; i01 < ir1; i01++) { for (int64_t i00 = 0; i00 < ne00; i00++) { const char * src0_ptr = ((char *) src0->data + i00*nb00 + i01*nb01 + i02*nb02 + i03*nb03); @@ -7909,6 +8987,10 @@ static void lm_ggml_compute_forward_dup( { lm_ggml_compute_forward_dup_f16(params, dst); } break; + case LM_GGML_TYPE_BF16: + { + lm_ggml_compute_forward_dup_bf16(params, dst); + } break; case LM_GGML_TYPE_F32: { lm_ggml_compute_forward_dup_f32(params, dst); @@ -8091,6 +9173,85 @@ static void lm_ggml_compute_forward_add_f16_f32( } } +static void lm_ggml_compute_forward_add_bf16_f32( + const struct lm_ggml_compute_params * params, + struct lm_ggml_tensor * dst) { + + const struct lm_ggml_tensor * src0 = dst->src[0]; + const struct lm_ggml_tensor * src1 = dst->src[1]; + + LM_GGML_ASSERT(lm_ggml_are_same_shape(src0, src1) && lm_ggml_are_same_shape(src0, dst)); + + if (params->type == LM_GGML_TASK_TYPE_INIT || params->type == LM_GGML_TASK_TYPE_FINALIZE) { + return; + } + + const int ith = params->ith; + const int nth = params->nth; + + const int nr = lm_ggml_nrows(src0); + + LM_GGML_TENSOR_BINARY_OP_LOCALS + + LM_GGML_ASSERT(src0->type == LM_GGML_TYPE_BF16); + LM_GGML_ASSERT(src1->type == 
LM_GGML_TYPE_F32); + + if (dst->type == LM_GGML_TYPE_F32) { + LM_GGML_ASSERT( nb0 == sizeof(float)); + } + else { + LM_GGML_ASSERT(dst->type == LM_GGML_TYPE_BF16); + LM_GGML_ASSERT( nb0 == sizeof(lm_ggml_bf16_t)); + } + + LM_GGML_ASSERT(nb00 == sizeof(lm_ggml_bf16_t)); + + // rows per thread + const int dr = (nr + nth - 1)/nth; + + // row range for this thread + const int ir0 = dr*ith; + const int ir1 = MIN(ir0 + dr, nr); + + if (nb10 == sizeof(float)) { + if (dst->type == LM_GGML_TYPE_BF16) { + for (int ir = ir0; ir < ir1; ++ir) { + // src0, src1 and dst are same shape => same indices + const int i3 = ir/(ne2*ne1); + const int i2 = (ir - i3*ne2*ne1)/ne1; + const int i1 = (ir - i3*ne2*ne1 - i2*ne1); + + lm_ggml_bf16_t * dst_ptr = (lm_ggml_bf16_t *) ((char *) dst->data + i3*nb3 + i2*nb2 + i1*nb1); + lm_ggml_bf16_t * src0_ptr = (lm_ggml_bf16_t *) ((char *) src0->data + i3*nb03 + i2*nb02 + i1*nb01); + float * src1_ptr = (float *) ((char *) src1->data + i3*nb13 + i2*nb12 + i1*nb11); + + for (int i = 0; i < ne0; i++) { + dst_ptr[i] = LM_GGML_FP32_TO_BF16(LM_GGML_BF16_TO_FP32(src0_ptr[i]) + src1_ptr[i]); + } + } + } else { + for (int ir = ir0; ir < ir1; ++ir) { + // src0, src1 and dst are same shape => same indices + const int i3 = ir/(ne2*ne1); + const int i2 = (ir - i3*ne2*ne1)/ne1; + const int i1 = (ir - i3*ne2*ne1 - i2*ne1); + + float * dst_ptr = (float *) ((char *) dst->data + i3*nb3 + i2*nb2 + i1*nb1); + lm_ggml_bf16_t * src0_ptr = (lm_ggml_bf16_t *) ((char *) src0->data + i3*nb03 + i2*nb02 + i1*nb01); + float * src1_ptr = (float *) ((char *) src1->data + i3*nb13 + i2*nb12 + i1*nb11); + + for (int i = 0; i < ne0; i++) { + dst_ptr[i] = LM_GGML_BF16_TO_FP32(src0_ptr[i]) + src1_ptr[i]; + } + } + } + } + else { + // src1 is not contiguous + LM_GGML_ASSERT(false); + } +} + static void lm_ggml_compute_forward_add_f16_f16( const struct lm_ggml_compute_params * params, struct lm_ggml_tensor * dst) { @@ -8111,12 +9272,68 @@ static void lm_ggml_compute_forward_add_f16_f16( 
LM_GGML_TENSOR_BINARY_OP_LOCALS - LM_GGML_ASSERT(src0->type == LM_GGML_TYPE_F16); - LM_GGML_ASSERT(src1->type == LM_GGML_TYPE_F16); - LM_GGML_ASSERT(dst->type == LM_GGML_TYPE_F16); + LM_GGML_ASSERT(src0->type == LM_GGML_TYPE_F16); + LM_GGML_ASSERT(src1->type == LM_GGML_TYPE_F16); + LM_GGML_ASSERT(dst->type == LM_GGML_TYPE_F16); + + LM_GGML_ASSERT( nb0 == sizeof(lm_ggml_fp16_t)); + LM_GGML_ASSERT(nb00 == sizeof(lm_ggml_fp16_t)); + + // rows per thread + const int dr = (nr + nth - 1)/nth; + + // row range for this thread + const int ir0 = dr*ith; + const int ir1 = MIN(ir0 + dr, nr); + + if (nb10 == sizeof(lm_ggml_fp16_t)) { + for (int ir = ir0; ir < ir1; ++ir) { + // src0, src1 and dst are same shape => same indices + const int i3 = ir/(ne2*ne1); + const int i2 = (ir - i3*ne2*ne1)/ne1; + const int i1 = (ir - i3*ne2*ne1 - i2*ne1); + + lm_ggml_fp16_t * dst_ptr = (lm_ggml_fp16_t *) ((char *) dst->data + i3*nb3 + i2*nb2 + i1*nb1); + lm_ggml_fp16_t * src0_ptr = (lm_ggml_fp16_t *) ((char *) src0->data + i3*nb03 + i2*nb02 + i1*nb01); + lm_ggml_fp16_t * src1_ptr = (lm_ggml_fp16_t *) ((char *) src1->data + i3*nb13 + i2*nb12 + i1*nb11); + + for (int i = 0; i < ne0; i++) { + dst_ptr[i] = LM_GGML_FP32_TO_FP16(LM_GGML_FP16_TO_FP32(src0_ptr[i]) + LM_GGML_FP16_TO_FP32(src1_ptr[i])); + } + } + } + else { + // src1 is not contiguous + LM_GGML_ASSERT(false); + } +} + +static void lm_ggml_compute_forward_add_bf16_bf16( + const struct lm_ggml_compute_params * params, + struct lm_ggml_tensor * dst) { + + const struct lm_ggml_tensor * src0 = dst->src[0]; + const struct lm_ggml_tensor * src1 = dst->src[1]; + + LM_GGML_ASSERT(lm_ggml_are_same_shape(src0, src1) && lm_ggml_are_same_shape(src0, dst)); + + if (params->type == LM_GGML_TASK_TYPE_INIT || params->type == LM_GGML_TASK_TYPE_FINALIZE) { + return; + } + + const int ith = params->ith; + const int nth = params->nth; + + const int nr = lm_ggml_nrows(src0); + + LM_GGML_TENSOR_BINARY_OP_LOCALS + + LM_GGML_ASSERT(src0->type == 
LM_GGML_TYPE_BF16); + LM_GGML_ASSERT(src1->type == LM_GGML_TYPE_BF16); + LM_GGML_ASSERT(dst->type == LM_GGML_TYPE_BF16); - LM_GGML_ASSERT( nb0 == sizeof(lm_ggml_fp16_t)); - LM_GGML_ASSERT(nb00 == sizeof(lm_ggml_fp16_t)); + LM_GGML_ASSERT( nb0 == sizeof(lm_ggml_bf16_t)); + LM_GGML_ASSERT(nb00 == sizeof(lm_ggml_bf16_t)); // rows per thread const int dr = (nr + nth - 1)/nth; @@ -8125,19 +9342,19 @@ static void lm_ggml_compute_forward_add_f16_f16( const int ir0 = dr*ith; const int ir1 = MIN(ir0 + dr, nr); - if (nb10 == sizeof(lm_ggml_fp16_t)) { + if (nb10 == sizeof(lm_ggml_bf16_t)) { for (int ir = ir0; ir < ir1; ++ir) { // src0, src1 and dst are same shape => same indices const int i3 = ir/(ne2*ne1); const int i2 = (ir - i3*ne2*ne1)/ne1; const int i1 = (ir - i3*ne2*ne1 - i2*ne1); - lm_ggml_fp16_t * dst_ptr = (lm_ggml_fp16_t *) ((char *) dst->data + i3*nb3 + i2*nb2 + i1*nb1); - lm_ggml_fp16_t * src0_ptr = (lm_ggml_fp16_t *) ((char *) src0->data + i3*nb03 + i2*nb02 + i1*nb01); - lm_ggml_fp16_t * src1_ptr = (lm_ggml_fp16_t *) ((char *) src1->data + i3*nb13 + i2*nb12 + i1*nb11); + lm_ggml_bf16_t * dst_ptr = (lm_ggml_bf16_t *) ((char *) dst->data + i3*nb3 + i2*nb2 + i1*nb1); + lm_ggml_bf16_t * src0_ptr = (lm_ggml_bf16_t *) ((char *) src0->data + i3*nb03 + i2*nb02 + i1*nb01); + lm_ggml_bf16_t * src1_ptr = (lm_ggml_bf16_t *) ((char *) src1->data + i3*nb13 + i2*nb12 + i1*nb11); for (int i = 0; i < ne0; i++) { - dst_ptr[i] = LM_GGML_FP32_TO_FP16(LM_GGML_FP16_TO_FP32(src0_ptr[i]) + LM_GGML_FP16_TO_FP32(src1_ptr[i])); + dst_ptr[i] = LM_GGML_FP32_TO_BF16(LM_GGML_BF16_TO_FP32(src0_ptr[i]) + LM_GGML_BF16_TO_FP32(src1_ptr[i])); } } } @@ -8256,6 +9473,18 @@ static void lm_ggml_compute_forward_add( LM_GGML_ASSERT(false); } } break; + case LM_GGML_TYPE_BF16: + { + if (src1->type == LM_GGML_TYPE_BF16) { + lm_ggml_compute_forward_add_bf16_bf16(params, dst); + } + else if (src1->type == LM_GGML_TYPE_F32) { + lm_ggml_compute_forward_add_bf16_f32(params, dst); + } + else { + 
LM_GGML_ASSERT(false); + } + } break; case LM_GGML_TYPE_Q4_0: case LM_GGML_TYPE_Q4_1: case LM_GGML_TYPE_Q5_0: @@ -8514,6 +9743,110 @@ static void lm_ggml_compute_forward_add1_q_f32( } } +static void lm_ggml_compute_forward_add1_bf16_f32( + const struct lm_ggml_compute_params * params, + struct lm_ggml_tensor * dst) { + + const struct lm_ggml_tensor * src0 = dst->src[0]; + const struct lm_ggml_tensor * src1 = dst->src[1]; + + LM_GGML_ASSERT(lm_ggml_are_same_shape(src0, dst)); + LM_GGML_ASSERT(lm_ggml_is_scalar(src1)); + + if (params->type == LM_GGML_TASK_TYPE_INIT || params->type == LM_GGML_TASK_TYPE_FINALIZE) { + return; + } + + // scalar to add + const float v = *(float *) src1->data; + + const int ith = params->ith; + const int nth = params->nth; + + const int nr = lm_ggml_nrows(src0); + + LM_GGML_TENSOR_UNARY_OP_LOCALS + + LM_GGML_ASSERT(src0->type == LM_GGML_TYPE_BF16); + LM_GGML_ASSERT(src1->type == LM_GGML_TYPE_F32); + LM_GGML_ASSERT(dst->type == LM_GGML_TYPE_BF16); + + LM_GGML_ASSERT( nb0 == sizeof(lm_ggml_bf16_t)); + LM_GGML_ASSERT(nb00 == sizeof(lm_ggml_bf16_t)); + + // rows per thread + const int dr = (nr + nth - 1)/nth; + + // row range for this thread + const int ir0 = dr*ith; + const int ir1 = MIN(ir0 + dr, nr); + + for (int ir = ir0; ir < ir1; ++ir) { + // src0 and dst are same shape => same indices + const int i3 = ir/(ne2*ne1); + const int i2 = (ir - i3*ne2*ne1)/ne1; + const int i1 = (ir - i3*ne2*ne1 - i2*ne1); + + lm_ggml_bf16_t * dst_ptr = (lm_ggml_bf16_t *) ((char *) dst->data + i3*nb3 + i2*nb2 + i1*nb1 ); + lm_ggml_bf16_t * src0_ptr = (lm_ggml_bf16_t *) ((char *) src0->data + i3*nb03 + i2*nb02 + i1*nb01); + for (int i = 0; i < ne0; i++) { + dst_ptr[i] = LM_GGML_FP32_TO_BF16(LM_GGML_BF16_TO_FP32(src0_ptr[i]) + v); + } + } +} + +static void lm_ggml_compute_forward_add1_bf16_bf16( + const struct lm_ggml_compute_params * params, + struct lm_ggml_tensor * dst) { + + const struct lm_ggml_tensor * src0 = dst->src[0]; + const struct lm_ggml_tensor * 
src1 = dst->src[1]; + + LM_GGML_ASSERT(lm_ggml_are_same_shape(src0, dst)); + LM_GGML_ASSERT(lm_ggml_is_scalar(src1)); + + if (params->type == LM_GGML_TASK_TYPE_INIT || params->type == LM_GGML_TASK_TYPE_FINALIZE) { + return; + } + + // scalar to add + const float v = LM_GGML_BF16_TO_FP32(*(lm_ggml_bf16_t *) src1->data); + + const int ith = params->ith; + const int nth = params->nth; + + const int nr = lm_ggml_nrows(src0); + + LM_GGML_TENSOR_UNARY_OP_LOCALS + + LM_GGML_ASSERT(src0->type == LM_GGML_TYPE_BF16); + LM_GGML_ASSERT(src1->type == LM_GGML_TYPE_BF16); + LM_GGML_ASSERT(dst->type == LM_GGML_TYPE_BF16); + + LM_GGML_ASSERT( nb0 == sizeof(lm_ggml_bf16_t)); + LM_GGML_ASSERT(nb00 == sizeof(lm_ggml_bf16_t)); + + // rows per thread + const int dr = (nr + nth - 1)/nth; + + // row range for this thread + const int ir0 = dr*ith; + const int ir1 = MIN(ir0 + dr, nr); + + for (int ir = ir0; ir < ir1; ++ir) { + // src0 and dst are same shape => same indices + const int i3 = ir/(ne2*ne1); + const int i2 = (ir - i3*ne2*ne1)/ne1; + const int i1 = (ir - i3*ne2*ne1 - i2*ne1); + + lm_ggml_bf16_t * dst_ptr = (lm_ggml_bf16_t *) ((char *) dst->data + i3*nb3 + i2*nb2 + i1*nb1 ); + lm_ggml_bf16_t * src0_ptr = (lm_ggml_bf16_t *) ((char *) src0->data + i3*nb03 + i2*nb02 + i1*nb01); + for (int i = 0; i < ne0; i++) { + dst_ptr[i] = LM_GGML_FP32_TO_BF16(LM_GGML_BF16_TO_FP32(src0_ptr[i]) + v); + } + } +} + static void lm_ggml_compute_forward_add1( const struct lm_ggml_compute_params * params, struct lm_ggml_tensor * dst) { @@ -8538,6 +9871,18 @@ static void lm_ggml_compute_forward_add1( LM_GGML_ASSERT(false); } } break; + case LM_GGML_TYPE_BF16: + { + if (src1->type == LM_GGML_TYPE_BF16) { + lm_ggml_compute_forward_add1_bf16_bf16(params, dst); + } + else if (src1->type == LM_GGML_TYPE_F32) { + lm_ggml_compute_forward_add1_bf16_f32(params, dst); + } + else { + LM_GGML_ASSERT(false); + } + } break; case LM_GGML_TYPE_Q4_0: case LM_GGML_TYPE_Q4_1: case LM_GGML_TYPE_Q5_0: @@ -8666,6 +10011,7 @@ 
static void lm_ggml_compute_forward_acc( lm_ggml_compute_forward_acc_f32(params, dst); } break; case LM_GGML_TYPE_F16: + case LM_GGML_TYPE_BF16: case LM_GGML_TYPE_Q4_0: case LM_GGML_TYPE_Q4_1: case LM_GGML_TYPE_Q5_0: @@ -9187,6 +10533,40 @@ static void lm_ggml_compute_forward_sum_f16( ((lm_ggml_fp16_t *) dst->data)[0] = LM_GGML_FP32_TO_FP16(sum); } +static void lm_ggml_compute_forward_sum_bf16( + const struct lm_ggml_compute_params * params, + struct lm_ggml_tensor * dst) { + + const struct lm_ggml_tensor * src0 = dst->src[0]; + + assert(params->ith == 0); + assert(lm_ggml_is_scalar(dst)); + + if (params->type == LM_GGML_TASK_TYPE_INIT || params->type == LM_GGML_TASK_TYPE_FINALIZE) { + return; + } + + assert(src0->nb[0] == sizeof(lm_ggml_bf16_t)); + + LM_GGML_TENSOR_LOCALS(int64_t, ne0, src0, ne) + LM_GGML_TENSOR_LOCALS(size_t, nb0, src0, nb) + + float sum = 0; + float row_sum = 0; + + for (int64_t i03 = 0; i03 < ne03; i03++) { + for (int64_t i02 = 0; i02 < ne02; i02++) { + for (int64_t i01 = 0; i01 < ne01; i01++) { + lm_ggml_vec_sum_bf16_ggf(ne00, + &row_sum, + (lm_ggml_bf16_t *) ((char *) src0->data + i01 * nb01 + i02 * nb02 + i03 * nb03)); + sum += row_sum; + } + } + } + ((lm_ggml_bf16_t *) dst->data)[0] = LM_GGML_FP32_TO_BF16(sum); +} + static void lm_ggml_compute_forward_sum( const struct lm_ggml_compute_params * params, struct lm_ggml_tensor * dst) { @@ -9202,6 +10582,10 @@ static void lm_ggml_compute_forward_sum( { lm_ggml_compute_forward_sum_f16(params, dst); } break; + case LM_GGML_TYPE_BF16: + { + lm_ggml_compute_forward_sum_bf16(params, dst); + } break; default: { LM_GGML_ASSERT(false); @@ -9476,6 +10860,7 @@ static void lm_ggml_compute_forward_repeat( switch (src0->type) { case LM_GGML_TYPE_F16: + case LM_GGML_TYPE_BF16: case LM_GGML_TYPE_I16: { lm_ggml_compute_forward_repeat_f16(params, dst); @@ -9596,26 +10981,29 @@ static void lm_ggml_compute_forward_concat_f32( LM_GGML_ASSERT(nb00 == sizeof(float)); LM_GGML_ASSERT(nb10 == sizeof(float)); + const 
int32_t dim = lm_ggml_get_op_params_i32(dst, 0); + + LM_GGML_ASSERT(dim >= 0 && dim < 4); + + int64_t o[4] = {0, 0, 0, 0}; + o[dim] = src0->ne[dim]; + + const float * x; + + // TODO: smarter multi-theading for (int i3 = 0; i3 < ne3; i3++) { for (int i2 = ith; i2 < ne2; i2 += nth) { - if (i2 < ne02) { // src0 - for (int i1 = 0; i1 < ne1; i1++) { - for (int i0 = 0; i0 < ne0; i0++) { - const float * x = (float *)((char *) src0->data + i0 * nb00 + i1 * nb01 + i2 * nb02 + i3 * nb03); - - float * y = (float *)((char *)dst->data + i0 * nb0 + i1 * nb1 + i2 * nb2 + i3 * nb3); - *y = *x; - } - } - } // src1 - else { - for (int i1 = 0; i1 < ne1; i1++) { - for (int i0 = 0; i0 < ne0; i0++) { - const float * x = (float *)((char *) src1->data + i0 * nb10 + i1 * nb11 + (i2 - ne02) * nb12 + i3 * nb13); - - float * y = (float *)((char *)dst->data + i0 * nb0 + i1 * nb1 + i2 * nb2 + i3 * nb3); - *y = *x; + for (int i1 = 0; i1 < ne1; i1++) { + for (int i0 = 0; i0 < ne0; i0++) { + if (i0 < ne00 && i1 < ne01 && i2 < ne02 && i3 < ne03) { + x = (const float *) ((const char *)src0->data + (i0 )*nb00 + (i1 )*nb01 + (i2 )*nb02 + (i3 )*nb03); + } else { + x = (const float *) ((const char *)src1->data + (i0 - o[0])*nb10 + (i1 - o[1])*nb11 + (i2 - o[2])*nb12 + (i3 - o[3])*nb13); } + + float * y = (float *)((char *)dst->data + i0*nb0 + i1*nb1 + i2*nb2 + i3*nb3); + + *y = *x; } } } @@ -9623,7 +11011,7 @@ static void lm_ggml_compute_forward_concat_f32( } static void lm_ggml_compute_forward_concat( - const struct lm_ggml_compute_params* params, + const struct lm_ggml_compute_params * params, struct lm_ggml_tensor* dst) { const struct lm_ggml_tensor * src0 = dst->src[0]; @@ -9963,6 +11351,52 @@ static void lm_ggml_compute_forward_relu( } } +// lm_ggml_compute_forward_sigmoid + +static void lm_ggml_compute_forward_sigmoid_f32( + const struct lm_ggml_compute_params * params, + struct lm_ggml_tensor * dst) { + + const struct lm_ggml_tensor * src0 = dst->src[0]; + + assert(params->ith == 0); + 
assert(lm_ggml_are_same_shape(src0, dst)); + + if (params->type == LM_GGML_TASK_TYPE_INIT || params->type == LM_GGML_TASK_TYPE_FINALIZE) { + return; + } + + const int n = lm_ggml_nrows(src0); + const int nc = src0->ne[0]; + + assert(dst->nb[0] == sizeof(float)); + assert(src0->nb[0] == sizeof(float)); + + for (int i = 0; i < n; i++) { + lm_ggml_vec_sigmoid_f32(nc, + (float *) ((char *) dst->data + i*( dst->nb[1])), + (float *) ((char *) src0->data + i*(src0->nb[1]))); + } +} + +static void lm_ggml_compute_forward_sigmoid( + const struct lm_ggml_compute_params * params, + struct lm_ggml_tensor * dst) { + + const struct lm_ggml_tensor * src0 = dst->src[0]; + + switch (src0->type) { + case LM_GGML_TYPE_F32: + { + lm_ggml_compute_forward_sigmoid_f32(params, dst); + } break; + default: + { + LM_GGML_ASSERT(false); + } break; + } +} + // lm_ggml_compute_forward_gelu static void lm_ggml_compute_forward_gelu_f32( @@ -10805,17 +12239,109 @@ static bool lm_ggml_compute_forward_mul_mat_use_blas(struct lm_ggml_tensor * dst src1->type == LM_GGML_TYPE_F32 && (ne0 >= 32 && ne1 >= 32 && ne10 >= 32)) { - /*printf("BLAS: %d %d %d %d %d\n", ne0, ne1, ne10, ne00, ne01);*/ - return true; - } + /*printf("BLAS: %d %d %d %d %d\n", ne0, ne1, ne10, ne00, ne01);*/ + return true; + } + + return false; +} +#endif + +static void lm_ggml_compute_forward_mul_mat_one_chunk( + const struct lm_ggml_compute_params * params, + struct lm_ggml_tensor * dst, + const int64_t num_rows_per_vec_dot, + const int64_t ir0_start, + const int64_t ir0_end, + const int64_t ir1_start, + const int64_t ir1_end) { + + const struct lm_ggml_tensor * src0 = dst->src[0]; + const struct lm_ggml_tensor * src1 = dst->src[1]; + + LM_GGML_TENSOR_BINARY_OP_LOCALS + + const enum lm_ggml_type type = src0->type; + + const bool src1_cont = lm_ggml_is_contiguous(src1); + + lm_ggml_vec_dot_t const vec_dot = type_traits[type].vec_dot; + enum lm_ggml_type const vec_dot_type = type_traits[type].vec_dot_type; + + // broadcast factors + 
const int64_t r2 = ne12 / ne02; + const int64_t r3 = ne13 / ne03; + + //printf("ir0_start = %6lld, ir0_end = %6lld, ir1_start = %6lld, ir1_end = %6lld\n", ir0_start, ir0_end, ir1_start, ir1_end); + + // threads with no work simply yield (not sure if it helps) + if (ir0_start >= ir0_end || ir1_start >= ir1_end) { + return; + } + + const void * wdata = (src1->type == vec_dot_type) ? src1->data : params->wdata; + const size_t row_size = lm_ggml_row_size(vec_dot_type, ne10); + + assert(ne12 % ne02 == 0); + assert(ne13 % ne03 == 0); + + // block-tiling attempt + const int64_t blck_0 = 16; + const int64_t blck_1 = 16; + + const size_t src1_col_stride = src1_cont || src1->type != vec_dot_type ? row_size : nb11; + + // attempt to reduce false-sharing (does not seem to make a difference) + // 16 * 2, accounting for mmla kernels + float tmp[32]; + + for (int64_t iir1 = ir1_start; iir1 < ir1_end; iir1 += blck_1) { + for (int64_t iir0 = ir0_start; iir0 < ir0_end; iir0 += blck_0) { + for (int64_t ir1 = iir1; ir1 < iir1 + blck_1 && ir1 < ir1_end; ir1 += num_rows_per_vec_dot) { + const int64_t i13 = (ir1 / (ne12 * ne1)); + const int64_t i12 = (ir1 - i13 * ne12 * ne1) / ne1; + const int64_t i11 = (ir1 - i13 * ne12 * ne1 - i12 * ne1); + + // broadcast src0 into src1 + const int64_t i03 = i13 / r3; + const int64_t i02 = i12 / r2; + + const int64_t i1 = i11; + const int64_t i2 = i12; + const int64_t i3 = i13; + + const char * src0_row = (const char*)src0->data + (0 + i02 * nb02 + i03 * nb03); + + // desc: when src1 is not a contiguous memory block we have to calculate the offset using the strides + // if it is, then we have either copied the data to params->wdata and made it contiguous or we are using + // the original src1 data pointer, so we should index using the indices directly + // TODO: this is a bit of a hack, we should probably have a better way to handle this + const char * src1_col = (const char*)wdata + + (src1_cont || src1->type != vec_dot_type + ? 
(i11 + i12 * ne11 + i13 * ne12 * ne11) * row_size + : (i11 * nb11 + i12 * nb12 + i13 * nb13)); + float * dst_col = (float*)((char*)dst->data + (i1 * nb1 + i2 * nb2 + i3 * nb3)); + + //for (int64_t ir0 = iir0; ir0 < iir0 + blck_0 && ir0 < ir0_end; ++ir0) { + // vec_dot(ne00, &dst_col[ir0], src0_row + ir0*nb01, src1_col); + //} + + for (int64_t ir0 = iir0; ir0 < iir0 + blck_0 && ir0 < ir0_end; ir0 += num_rows_per_vec_dot) { + vec_dot(ne00, &tmp[ir0 - iir0], (num_rows_per_vec_dot > 1 ? 16 : 0), src0_row + ir0 * nb01, (num_rows_per_vec_dot > 1 ? nb01 : 0), src1_col, (num_rows_per_vec_dot > 1 ? src1_col_stride : 0), num_rows_per_vec_dot); + } - return false; + for (int cn = 0; cn < num_rows_per_vec_dot; ++cn) { + memcpy(&dst_col[iir0 + cn * nb1 / nb0], tmp + (cn * 16), (MIN(iir0 + blck_0, ir0_end) - iir0) * sizeof(float)); + } + } + } + } } -#endif static void lm_ggml_compute_forward_mul_mat( const struct lm_ggml_compute_params * params, - struct lm_ggml_tensor * dst) { + struct lm_ggml_tensor * dst, + struct lm_ggml_compute_state * state) { const struct lm_ggml_tensor * src0 = dst->src[0]; const struct lm_ggml_tensor * src1 = dst->src[1]; @@ -10830,9 +12356,6 @@ static void lm_ggml_compute_forward_mul_mat( const enum lm_ggml_type type = src0->type; - const bool src1_cont = lm_ggml_is_contiguous(src1); - - lm_ggml_vec_dot_t const vec_dot = type_traits[type].vec_dot; enum lm_ggml_type const vec_dot_type = type_traits[type].vec_dot_type; lm_ggml_from_float_t const from_float_to_vec_dot = type_traits[vec_dot_type].from_float; int64_t const vec_dot_num_rows = type_traits[type].nrows; @@ -10853,8 +12376,10 @@ static void lm_ggml_compute_forward_mul_mat( LM_GGML_ASSERT(nb2 <= nb3); // broadcast factors - const int64_t r2 = ne12/ne02; - const int64_t r3 = ne13/ne03; + const int64_t r2 = ne12 / ne02; + const int64_t r3 = ne13 / ne03; + UNUSED(r2); + UNUSED(r3); // nb01 >= nb00 - src0 is not transposed // compute by src0 rows @@ -10936,6 +12461,8 @@ static void 
lm_ggml_compute_forward_mul_mat( #endif #if LM_GGML_USE_LLAMAFILE + const bool src1_cont = lm_ggml_is_contiguous(src1); + if (src1_cont) { for (int64_t i13 = 0; i13 < ne13; i13++) for (int64_t i12 = 0; i12 < ne12; i12++) @@ -10961,6 +12488,8 @@ UseGgmlGemm1:; if (ith != 0) { return; } + // Every thread starts at ith, so the first unprocessed chunk is nth. This save a bit of coordination right at the start. + atomic_store(&state->shared->current_chunk, nth); if (src1->type != vec_dot_type) { char * wdata = params->wdata; const size_t row_size = lm_ggml_row_size(vec_dot_type, ne10); @@ -10985,11 +12514,11 @@ UseGgmlGemm1:; return; } - const void * wdata = (src1->type == vec_dot_type) ? src1->data : params->wdata; - const size_t row_size = lm_ggml_row_size(vec_dot_type, ne10); - #if LM_GGML_USE_LLAMAFILE if (src1->type != vec_dot_type) { + const void* wdata = (src1->type == vec_dot_type) ? src1->data : params->wdata; + const size_t row_size = lm_ggml_row_size(vec_dot_type, ne10); + for (int64_t i13 = 0; i13 < ne13; i13++) for (int64_t i12 = 0; i12 < ne12; i12++) if (!llamafile_sgemm(ne01, ne11, ne00/lm_ggml_blck_size(src0->type), @@ -11010,98 +12539,87 @@ UseGgmlGemm1:; UseGgmlGemm2:; #endif - const int64_t nr0 = ne01; // src0 rows - const int64_t nr1 = ne1*ne12*ne13; // src1 rows - - //printf("nr0 = %lld, nr1 = %lld\n", nr0, nr1); - - // distribute the thread work across the inner or outer loop based on which one is larger - - const int64_t nth0 = nr0 > nr1 ? nth : 1; // parallelize by src0 rows - const int64_t nth1 = nr0 > nr1 ? 
1 : nth; // parallelize by src1 rows - - const int64_t ith0 = ith % nth0; - const int64_t ith1 = ith / nth0; - - const int64_t dr0 = (nr0 + nth0 - 1)/nth0; - const int64_t dr1 = (nr1 + nth1 - 1)/nth1; - - const int64_t ir010 = dr0*ith0; - const int64_t ir011 = MIN(ir010 + dr0, nr0); - - const int64_t ir110 = dr1*ith1; - const int64_t ir111 = MIN(ir110 + dr1, nr1); - - //printf("ir010 = %6lld, ir011 = %6lld, ir110 = %6lld, ir111 = %6lld\n", ir010, ir011, ir110, ir111); - - // threads with no work simply yield (not sure if it helps) - if (ir010 >= ir011 || ir110 >= ir111) { - sched_yield(); - return; - } +#ifdef LM_GGML_PERF + int chunks_executed = 0; + UNUSED(chunks_executed); +#endif - assert(ne12 % ne02 == 0); - assert(ne13 % ne03 == 0); + // This is the size of the first dimension of the result, so we can iterate that way. (see the ASSERT above, these are the same numbers) + const int64_t nr0 = ne0; - // block-tiling attempt - const int64_t blck_0 = 16; - const int64_t blck_1 = 16; + // This is the size of the rest of the dimensions of the result + const int64_t nr1 = ne1 * ne2 * ne3; // dot kernels can handle 1 row and col at a time, but mmla kernels can process 2 rows and cols - int64_t nrc = vec_dot_num_rows; + int64_t num_rows_per_vec_dot = vec_dot_num_rows; // TODO: currently the mmla kernels support only even numbered rows/cols. // this check can be removed once they are extended to support odd numbered rows/cols too if ((nr0 % 2 != 0) || (ne11 % 2 != 0)) { - nrc = 1; + num_rows_per_vec_dot = 1; } - const size_t src1_col_stride = src1_cont || src1->type != vec_dot_type ? row_size : nb11; + // Now select a reasonable chunk size. 
+ int chunk_size = 16; - // attempt to reduce false-sharing (does not seem to make a difference) - // 16 * 2, accounting for mmla kernels - float tmp[32]; + // We need to step up the size if it's small + if (nr0 == 1 || nr1 == 1) { + chunk_size = 64; + } - for (int64_t iir1 = ir110; iir1 < ir111; iir1 += blck_1) { - for (int64_t iir0 = ir010; iir0 < ir011; iir0 += blck_0) { - for (int64_t ir1 = iir1; ir1 < iir1 + blck_1 && ir1 < ir111; ir1 += nrc) { - const int64_t i13 = (ir1/(ne12*ne1)); - const int64_t i12 = (ir1 - i13*ne12*ne1)/ne1; - const int64_t i11 = (ir1 - i13*ne12*ne1 - i12*ne1); + // distribute the work across the inner or outer loop based on which one is larger + // The number of chunks in the 0/1 dim. + // CEIL(nr0/chunk_size) + int64_t nchunk0 = (nr0 + chunk_size - 1) / chunk_size; + int64_t nchunk1 = (nr1 + chunk_size - 1) / chunk_size; - // broadcast src0 into src1 - const int64_t i03 = i13/r3; - const int64_t i02 = i12/r2; + // If the chunking is poor for the number of threads on this setup, scrap the whole plan. Re-chunk it by thread. + // Also, chunking by thread was measured to have perform better on NUMA systems. See https://github.com/ggerganov/llama.cpp/pull/6915 + // In theory, chunking should be just as useful on NUMA and non NUMA systems, but testing disagreed with that. + if (nchunk0 * nchunk1 < nth * 4 || lm_ggml_is_numa()) { + // distribute the thread work across the inner or outer loop based on which one is larger + nchunk0 = nr0 > nr1 ? nth : 1; // parallelize by src0 rows + nchunk1 = nr0 > nr1 ? 1 : nth; // parallelize by src1 rows + } - const int64_t i1 = i11; - const int64_t i2 = i12; - const int64_t i3 = i13; + // The number of elements in each chunk + const int64_t dr0 = (nr0 + nchunk0 - 1) / nchunk0; + const int64_t dr1 = (nr1 + nchunk1 - 1) / nchunk1; - const char * src0_row = (const char *) src0->data + (0 + i02*nb02 + i03*nb03); + //if (ith == 0) + // printf("MUL_MAT = [%d, %d, %d, %d] x [%d, %d, %d, %d] = %d x %d = %d. 
Fp Ops/Ch %d\n", ne00, ne01, ne02, ne03, ne10, ne11, ne12, ne13, nchunk0, nchunk1, nchunk0 * nchunk1, ne00 * nr0 * nr1 / nchunk0 / nchunk1); - // desc: when src1 is not a contiguous memory block we have to calculate the offset using the strides - // if it is, then we have either copied the data to params->wdata and made it contiguous or we are using - // the original src1 data pointer, so we should index using the indices directly - // TODO: this is a bit of a hack, we should probably have a better way to handle this - const char * src1_col = (const char *) wdata + - (src1_cont || src1->type != vec_dot_type - ? (i11 + i12*ne11 + i13*ne12*ne11)*row_size - : (i11*nb11 + i12*nb12 + i13*nb13)); - float * dst_col = (float *) ((char *) dst->data + (i1*nb1 + i2*nb2 + i3*nb3)); + // The first chunk comes from our thread_id, the rest will get auto-assigned. + int current_chunk = ith; - //for (int64_t ir0 = iir0; ir0 < iir0 + blck_0 && ir0 < ir011; ++ir0) { - // vec_dot(ne00, &dst_col[ir0], src0_row + ir0*nb01, src1_col); - //} + while (current_chunk < nchunk0 * nchunk1) { + const int64_t ith0 = current_chunk % nchunk0; + const int64_t ith1 = current_chunk / nchunk0; - for (int64_t ir0 = iir0; ir0 < iir0 + blck_0 && ir0 < ir011; ir0 += nrc) { - vec_dot(ne00, &tmp[ir0 - iir0], (nrc>1 ? 16 : 0), src0_row + ir0*nb01, (nrc>1 ? nb01 : 0), src1_col, (nrc>1 ? 
src1_col_stride : 0), nrc); - } + const int64_t ir0_start = dr0 * ith0; + const int64_t ir0_end = MIN(ir0_start + dr0, nr0); - for (int cn = 0; cn < nrc; ++cn) { - memcpy(&dst_col[iir0 + cn*nb1/nb0], tmp + (cn*16), (MIN(iir0 + blck_0, ir011) - iir0)*sizeof(float)); - } - } + const int64_t ir1_start = dr1 * ith1; + const int64_t ir1_end = MIN(ir1_start + dr1, nr1); + + lm_ggml_compute_forward_mul_mat_one_chunk(params, dst, num_rows_per_vec_dot, ir0_start, ir0_end, ir1_start, ir1_end); + +#ifdef LM_GGML_PERF + chunks_executed++; +#endif + + if (nth >= nchunk0 * nchunk1) { + break; } + + current_chunk = atomic_fetch_add(&state->shared->current_chunk, 1); } + +#ifdef LM_GGML_PERF + // These numbers are useful when trying to measure how well the threading scheduling works. + //int64_t workSize = (ne01 * ne11 * ne12 * ne13 * ne00) / nchunk0 / nchunk1; + //float time = (lm_ggml_perf_time_us() - t0); + //printf("MUL_MAT = %f ms, [%d, %d, %d, %d] x [%d, %d, %d, %d] = %I64u, %f ops/usec in %d chunks.\n", time / 1000.0, ne00, ne01, ne02, ne03, ne10, ne11, ne12, ne13, workSize, (float)workSize/time, chunks_executed); +#endif } // lm_ggml_compute_forward_mul_mat_id @@ -11793,6 +13311,7 @@ static void lm_ggml_compute_forward_set( lm_ggml_compute_forward_set_f32(params, dst); } break; case LM_GGML_TYPE_F16: + case LM_GGML_TYPE_BF16: case LM_GGML_TYPE_Q4_0: case LM_GGML_TYPE_Q4_1: case LM_GGML_TYPE_Q5_0: @@ -11967,6 +13486,49 @@ static void lm_ggml_compute_forward_get_rows_f16( } } +static void lm_ggml_compute_forward_get_rows_bf16( + const struct lm_ggml_compute_params * params, + struct lm_ggml_tensor * dst) { + + const struct lm_ggml_tensor * src0 = dst->src[0]; + const struct lm_ggml_tensor * src1 = dst->src[1]; + + if (params->type == LM_GGML_TASK_TYPE_INIT || params->type == LM_GGML_TASK_TYPE_FINALIZE) { + return; + } + + LM_GGML_TENSOR_BINARY_OP_LOCALS + + const int64_t nc = ne00; + const int64_t nr = lm_ggml_nelements(src1); + + assert(ne0 == nc); + assert(ne02 == ne11); + 
assert(nb00 == sizeof(lm_ggml_bf16_t)); + assert(lm_ggml_nrows(dst) == nr); + + const int ith = params->ith; + const int nth = params->nth; + + // rows per thread + const int dr = (nr + nth - 1)/nth; + + // row range for this thread + const int ir0 = dr*ith; + const int ir1 = MIN(ir0 + dr, nr); + + for (int64_t i = ir0; i < ir1; ++i) { + const int64_t i12 = i/(ne11*ne10); + const int64_t i11 = (i - i12*ne11*ne10)/ne10; + const int64_t i10 = (i - i12*ne11*ne10 - i11*ne10); + const int64_t i01 = *(int32_t *) ((char *) src1->data + i10*nb10 + i11*nb11 + i12*nb12); + + lm_ggml_bf16_to_fp32_row( + (const void *) ((char *) src0->data + i01*nb01 + i11*nb02 + i12*nb03), + (float *) ((char *) dst->data + i10*nb1 + i11*nb2 + i12*nb3), nc); + } +} + static void lm_ggml_compute_forward_get_rows_f32( const struct lm_ggml_compute_params * params, struct lm_ggml_tensor * dst) { @@ -12044,6 +13606,10 @@ static void lm_ggml_compute_forward_get_rows( { lm_ggml_compute_forward_get_rows_f16(params, dst); } break; + case LM_GGML_TYPE_BF16: + { + lm_ggml_compute_forward_get_rows_bf16(params, dst); + } break; case LM_GGML_TYPE_F32: case LM_GGML_TYPE_I32: { @@ -12356,7 +13922,6 @@ static void lm_ggml_compute_forward_soft_max_f32( const struct lm_ggml_tensor * src0 = dst->src[0]; const struct lm_ggml_tensor * src1 = dst->src[1]; - const struct lm_ggml_tensor * src2 = dst->src[2]; assert(lm_ggml_is_contiguous(dst)); assert(lm_ggml_are_same_shape(src0, dst)); @@ -12382,8 +13947,8 @@ static void lm_ggml_compute_forward_soft_max_f32( // TODO: is this supposed to be ceil instead of floor? 
// https://huggingface.co/mosaicml/mpt-7b/blob/main/attention.py#L370 - const uint32_t n_head_kv = ne02; - const uint32_t n_head_log2 = 1u << (uint32_t) floor(log2(n_head_kv)); + const uint32_t n_head = ne02; + const uint32_t n_head_log2 = 1u << (uint32_t) floor(log2(n_head)); const float m0 = powf(2.0f, -(max_bias ) / n_head_log2); const float m1 = powf(2.0f, -(max_bias / 2.0f) / n_head_log2); @@ -12400,13 +13965,13 @@ static void lm_ggml_compute_forward_soft_max_f32( float * wp = (float *) params->wdata + (nc + CACHE_LINE_SIZE_F32) * ith; - // when max_bias <= 0.0f, src2 is not used and we default it to src0 to avoid branching - lm_ggml_fp16_t * pos_f16 = src2 ? (lm_ggml_fp16_t *) src2->data : src0->data; - float * pos_f32 = src2 ? (float *) src2->data : src0->data; - - const bool use_f16 = (src1 && src1->type == LM_GGML_TYPE_F16) || (src2 && src2->type == LM_GGML_TYPE_F16); + const bool use_f16 = (src1 && src1->type == LM_GGML_TYPE_F16); for (int i1 = ir0; i1 < ir1; i1++) { + // ALiBi + const uint32_t h = (i1/ne01)%ne02; // head + const float slope = (max_bias > 0.0f) ? h < n_head_log2 ? powf(m0, h + 1) : powf(m1, 2*(h - n_head_log2) + 1) : 1.0f; + float * sp = (float *)((char *) src0->data + i1*src0->nb[1]); float * dp = (float *)((char *) dst->data + i1*dst->nb[1]); @@ -12419,27 +13984,11 @@ static void lm_ggml_compute_forward_soft_max_f32( if (mp_f32) { if (use_f16) { for (int i = 0; i < nc; ++i) { - wp[i] += LM_GGML_FP16_TO_FP32(mp_f16[i]); - } - } else { - for (int i = 0; i < nc; ++i) { - wp[i] += mp_f32[i]; - } - } - } - - // ALiBi bias - if (max_bias > 0.0f) { - const uint32_t h = (i1/ne01)%ne02; // head - const float slope = h < n_head_log2 ? 
powf(m0, h + 1) : powf(m1, 2*(h - n_head_log2) + 1); - - if (use_f16) { - for (int i = 0; i < nc; ++i) { - wp[i] += slope*LM_GGML_FP16_TO_FP32(pos_f16[i]); + wp[i] += slope*LM_GGML_FP16_TO_FP32(mp_f16[i]); } } else { for (int i = 0; i < nc; ++i) { - wp[i] += slope*pos_f32[i]; + wp[i] += slope*mp_f32[i]; } } } @@ -12454,22 +14003,7 @@ static void lm_ggml_compute_forward_soft_max_f32( float max = -INFINITY; lm_ggml_vec_max_f32(nc, &max, wp); - lm_ggml_float sum = 0.0; - - uint16_t scvt; - for (int i = 0; i < nc; i++) { - if (wp[i] == -INFINITY) { - dp[i] = 0.0f; - } else { - // const float val = (wp[i] == -INFINITY) ? 0.0 : exp(wp[i] - max); - lm_ggml_fp16_t s = LM_GGML_FP32_TO_FP16(wp[i] - max); - memcpy(&scvt, &s, sizeof(scvt)); - const float val = LM_GGML_FP16_TO_FP32(lm_ggml_table_exp_f16[scvt]); - sum += (lm_ggml_float)val; - dp[i] = val; - } - } - + lm_ggml_float sum = lm_ggml_vec_soft_max_f32(nc, dp, wp, max); assert(sum > 0.0); sum = 1.0/sum; @@ -12523,249 +14057,78 @@ static void lm_ggml_compute_forward_soft_max_back_f32( // TODO: handle transposed/permuted matrices - const int ith = params->ith; - const int nth = params->nth; - - const int nc = src0->ne[0]; - const int nr = lm_ggml_nrows(src0); - - // rows per thread - const int dr = (nr + nth - 1)/nth; - - // row range for this thread - const int ir0 = dr*ith; - const int ir1 = MIN(ir0 + dr, nr); - - for (int i1 = ir0; i1 < ir1; i1++) { - float *dy = (float *)((char *) src0->data + i1*src0->nb[1]); - float *y = (float *)((char *) src1->data + i1*src1->nb[1]); - float *dx = (float *)((char *) dst->data + i1*dst->nb[1]); - -#ifndef NDEBUG - for (int i = 0; i < nc; ++i) { - //printf("p[%d] = %f\n", i, p[i]); - assert(!isnan(dy[i])); - assert(!isnan(y[i])); - } -#endif - // Jii = yi - yi*yi - // Jij = -yi*yj - // J = diag(y)-y.T*y - // dx = J * dy - // dxk = sum_i(Jki * dyi) - // dxk = sum_i(-yk*yi * dyi) - (-yk*yk)*dyk + (yk - yk*yk)*dyk - // dxk = sum_i(-yk*yi * dyi) + yk*yk*dyk + yk*dyk - yk*yk*dyk - // dxk 
= sum_i(-yk*yi * dyi) + yk*dyk - // dxk = -yk * sum_i(yi * dyi) + yk*dyk - // dxk = -yk * dot(y, dy) + yk*dyk - // dxk = yk * (- dot(y, dy) + dyk) - // dxk = yk * (dyk - dot(y, dy)) - // - // post-order: - // dot_y_dy := dot(y, dy) - // dx := dy - // dx := dx - dot_y_dy - // dx := dx * y - - // linear runtime, no additional memory - float dot_y_dy = 0; - lm_ggml_vec_dot_f32 (nc, &dot_y_dy, 0, y, 0, dy, 0, 1); - lm_ggml_vec_cpy_f32 (nc, dx, dy); - lm_ggml_vec_acc1_f32(nc, dx, -dot_y_dy); - lm_ggml_vec_mul_f32 (nc, dx, dx, y); - -#ifndef NDEBUG - for (int i = 0; i < nc; ++i) { - assert(!isnan(dx[i])); - assert(!isinf(dx[i])); - } -#endif - } -} - -static void lm_ggml_compute_forward_soft_max_back( - const struct lm_ggml_compute_params * params, - struct lm_ggml_tensor * dst) { - - const struct lm_ggml_tensor * src0 = dst->src[0]; - - switch (src0->type) { - case LM_GGML_TYPE_F32: - { - lm_ggml_compute_forward_soft_max_back_f32(params, dst); - } break; - default: - { - LM_GGML_ASSERT(false); - } break; - } -} - -// lm_ggml_compute_forward_alibi - -static void lm_ggml_compute_forward_alibi_f32( - const struct lm_ggml_compute_params * params, - struct lm_ggml_tensor * dst) { - - const struct lm_ggml_tensor * src0 = dst->src[0]; - - assert(params->ith == 0); - - if (params->type == LM_GGML_TASK_TYPE_INIT || params->type == LM_GGML_TASK_TYPE_FINALIZE) { - return; - } - - //const int n_past = ((int32_t *) dst->op_params)[0]; - const int n_head = ((int32_t *) dst->op_params)[1]; - float max_bias; - memcpy(&max_bias, (int32_t *) dst->op_params + 2, sizeof(float)); - - const int64_t ne0 = src0->ne[0]; // all_seq_len = n_past + ne1 - const int64_t ne1 = src0->ne[1]; // seq_len_without_past - const int64_t ne2 = src0->ne[2]; // n_head -> this is k - //const int64_t ne3 = src0->ne[3]; // 1 -> bsz - - const int64_t n = lm_ggml_nrows(src0); - const int64_t ne2_ne3 = n/ne1; // ne2*ne3 - - const size_t nb0 = src0->nb[0]; - const size_t nb1 = src0->nb[1]; - const size_t nb2 = 
src0->nb[2]; - //const int nb3 = src0->nb[3]; - - LM_GGML_ASSERT(nb0 == sizeof(float)); - LM_GGML_ASSERT(n_head == ne2); - - // add alibi to src0 (KQ_scaled) - const int n_heads_log2_floor = 1 << (int) floor(log2(n_head)); - - const float m0 = powf(2.0f, -(max_bias) / n_heads_log2_floor); - const float m1 = powf(2.0f, -(max_bias / 2.0f) / n_heads_log2_floor); - - for (int64_t k = 0; k < ne2_ne3; k++) { - // TODO: k*nb2 or k*nb3 - float m_k; - - if (k < n_heads_log2_floor) { - m_k = powf(m0, k + 1); - } else { - m_k = powf(m1, 2 * (k - n_heads_log2_floor) + 1); - } - - for (int64_t i = 0; i < ne0; i++) { - for (int64_t j = 0; j < ne1; j++) { - float * const src = (float *)((char *) src0->data + i*nb0 + j*nb1 + k*nb2); - float * pdst = (float *)((char *) dst->data + i*nb0 + j*nb1 + k*nb2); - pdst[0] = i * m_k + src[0]; - } - } - } -} - -static void lm_ggml_compute_forward_alibi_f16( - const struct lm_ggml_compute_params * params, - struct lm_ggml_tensor * dst) { - - const struct lm_ggml_tensor * src0 = dst->src[0]; - - assert(params->ith == 0); - - if (params->type == LM_GGML_TASK_TYPE_INIT || params->type == LM_GGML_TASK_TYPE_FINALIZE) { - return; - } - - //const int n_past = ((int32_t *) dst->op_params)[0]; - const int n_head = ((int32_t *) dst->op_params)[1]; - float max_bias; - memcpy(&max_bias, (int32_t *) dst->op_params + 2, sizeof(float)); - - const int ne0 = src0->ne[0]; // all_seq_len = n_past + ne1 - const int ne1 = src0->ne[1]; // seq_len_without_past - const int ne2 = src0->ne[2]; // n_head -> this is k - //const int ne3 = src0->ne[3]; // 1 -> bsz - - const int n = lm_ggml_nrows(src0); - const int ne2_ne3 = n/ne1; // ne2*ne3 - - const int nb0 = src0->nb[0]; - const int nb1 = src0->nb[1]; - const int nb2 = src0->nb[2]; - //const int nb3 = src0->nb[3]; + const int ith = params->ith; + const int nth = params->nth; - LM_GGML_ASSERT(nb0 == sizeof(lm_ggml_fp16_t)); - //LM_GGML_ASSERT(ne1 + n_past == ne0); (void) n_past; - LM_GGML_ASSERT(n_head == ne2); + const 
int nc = src0->ne[0]; + const int nr = lm_ggml_nrows(src0); - // add alibi to src0 (KQ_scaled) - const int n_heads_log2_floor = 1 << (int) floor(log2(n_head)); + // rows per thread + const int dr = (nr + nth - 1)/nth; - const float m0 = powf(2.0f, -(max_bias) / n_heads_log2_floor); - const float m1 = powf(2.0f, -(max_bias / 2.0f) / n_heads_log2_floor); + // row range for this thread + const int ir0 = dr*ith; + const int ir1 = MIN(ir0 + dr, nr); - for (int k = 0; k < ne2_ne3; k++) { - // TODO: k*nb2 or k*nb3 - float m_k; + for (int i1 = ir0; i1 < ir1; i1++) { + float *dy = (float *)((char *) src0->data + i1*src0->nb[1]); + float *y = (float *)((char *) src1->data + i1*src1->nb[1]); + float *dx = (float *)((char *) dst->data + i1*dst->nb[1]); - if (k < n_heads_log2_floor) { - m_k = powf(m0, k + 1); - } else { - m_k = powf(m1, 2 * (k - n_heads_log2_floor) + 1); +#ifndef NDEBUG + for (int i = 0; i < nc; ++i) { + //printf("p[%d] = %f\n", i, p[i]); + assert(!isnan(dy[i])); + assert(!isnan(y[i])); } +#endif + // Jii = yi - yi*yi + // Jij = -yi*yj + // J = diag(y)-y.T*y + // dx = J * dy + // dxk = sum_i(Jki * dyi) + // dxk = sum_i(-yk*yi * dyi) - (-yk*yk)*dyk + (yk - yk*yk)*dyk + // dxk = sum_i(-yk*yi * dyi) + yk*yk*dyk + yk*dyk - yk*yk*dyk + // dxk = sum_i(-yk*yi * dyi) + yk*dyk + // dxk = -yk * sum_i(yi * dyi) + yk*dyk + // dxk = -yk * dot(y, dy) + yk*dyk + // dxk = yk * (- dot(y, dy) + dyk) + // dxk = yk * (dyk - dot(y, dy)) + // + // post-order: + // dot_y_dy := dot(y, dy) + // dx := dy + // dx := dx - dot_y_dy + // dx := dx * y - for (int i = 0; i < ne0; i++) { - for (int j = 0; j < ne1; j++) { - lm_ggml_fp16_t * const src = (lm_ggml_fp16_t *)((char *) src0->data + i*nb0 + j*nb1 + k*nb2); - float * pdst = (float *)((char *) dst->data + i*nb0 + j*nb1 + k*nb2); + // linear runtime, no additional memory + float dot_y_dy = 0; + lm_ggml_vec_dot_f32 (nc, &dot_y_dy, 0, y, 0, dy, 0, 1); + lm_ggml_vec_cpy_f32 (nc, dx, dy); + lm_ggml_vec_acc1_f32(nc, dx, -dot_y_dy); + 
lm_ggml_vec_mul_f32 (nc, dx, dx, y); - // we return F32 - pdst[0] = i * m_k + LM_GGML_FP16_TO_FP32(src[0]); - } +#ifndef NDEBUG + for (int i = 0; i < nc; ++i) { + assert(!isnan(dx[i])); + assert(!isinf(dx[i])); } +#endif } } -static void lm_ggml_compute_forward_alibi( +static void lm_ggml_compute_forward_soft_max_back( const struct lm_ggml_compute_params * params, struct lm_ggml_tensor * dst) { const struct lm_ggml_tensor * src0 = dst->src[0]; switch (src0->type) { - case LM_GGML_TYPE_F16: - { - lm_ggml_compute_forward_alibi_f16(params, dst); - } break; case LM_GGML_TYPE_F32: { - lm_ggml_compute_forward_alibi_f32(params, dst); + lm_ggml_compute_forward_soft_max_back_f32(params, dst); } break; - case LM_GGML_TYPE_Q4_0: - case LM_GGML_TYPE_Q4_1: - case LM_GGML_TYPE_Q5_0: - case LM_GGML_TYPE_Q5_1: - case LM_GGML_TYPE_Q8_0: - case LM_GGML_TYPE_Q8_1: - case LM_GGML_TYPE_Q2_K: - case LM_GGML_TYPE_Q3_K: - case LM_GGML_TYPE_Q4_K: - case LM_GGML_TYPE_Q5_K: - case LM_GGML_TYPE_Q6_K: - case LM_GGML_TYPE_IQ2_XXS: - case LM_GGML_TYPE_IQ2_XS: - case LM_GGML_TYPE_IQ3_XXS: - case LM_GGML_TYPE_IQ1_S: - case LM_GGML_TYPE_IQ1_M: - case LM_GGML_TYPE_IQ4_NL: - case LM_GGML_TYPE_IQ4_XS: - case LM_GGML_TYPE_IQ3_S: - case LM_GGML_TYPE_IQ2_S: - case LM_GGML_TYPE_Q8_K: - case LM_GGML_TYPE_I8: - case LM_GGML_TYPE_I16: - case LM_GGML_TYPE_I32: - case LM_GGML_TYPE_I64: - case LM_GGML_TYPE_F64: - case LM_GGML_TYPE_COUNT: + default: { LM_GGML_ASSERT(false); } break; @@ -12828,6 +14191,7 @@ static void lm_ggml_compute_forward_clamp( lm_ggml_compute_forward_clamp_f32(params, dst); } break; case LM_GGML_TYPE_F16: + case LM_GGML_TYPE_BF16: case LM_GGML_TYPE_Q4_0: case LM_GGML_TYPE_Q4_1: case LM_GGML_TYPE_Q5_0: @@ -12926,6 +14290,7 @@ static void lm_ggml_compute_forward_rope_f32( const struct lm_ggml_tensor * src0 = dst->src[0]; const struct lm_ggml_tensor * src1 = dst->src[1]; + const struct lm_ggml_tensor * src2 = dst->src[2]; if (params->type == LM_GGML_TASK_TYPE_INIT || params->type == 
LM_GGML_TASK_TYPE_FINALIZE) { return; @@ -12985,6 +14350,17 @@ static void lm_ggml_compute_forward_rope_f32( const bool is_neox = mode & 2; const bool is_glm = mode & 4; + const float * freq_factors = NULL; + if (is_neox) { + if (src2 != NULL) { + LM_GGML_ASSERT(src2->type == LM_GGML_TYPE_F32); + LM_GGML_ASSERT(src2->ne[0] >= n_dims / 2); + freq_factors = (const float *) src2->data; + } + } else { + LM_GGML_ASSERT(src2 == NULL && "TODO: freq_factors not implemented for !is_neox"); + } + // backward process uses inverse rotation by cos and sin. // cos and sin build a rotation matrix, where the inverse is the transpose. // this essentially just switches the sign of sin. @@ -13019,686 +14395,258 @@ static void lm_ggml_compute_forward_rope_f32( theta_base *= theta_scale; block_theta *= theta_scale; - const float * const src = (float *)((char *) src0->data + i3*nb03 + i2*nb02 + i1*nb01 + i0*nb00); - float * dst_data = (float *)((char *) dst->data + i3*nb3 + i2*nb2 + i1*nb1 + i0*nb0); - - const float x0 = src[0]; - const float x1 = src[n_dims/2]; - const float x2 = src[n_dims]; - const float x3 = src[n_dims/2*3]; - - dst_data[0] = x0*cos_theta - x1*sin_theta; - dst_data[n_dims/2] = x0*sin_theta + x1*cos_theta; - dst_data[n_dims] = x2*cos_block_theta - x3*sin_block_theta; - dst_data[n_dims/2*3] = x2*sin_block_theta + x3*cos_block_theta; - } - } else if (!is_neox) { - for (int64_t i0 = 0; i0 < ne0; i0 += 2) { - const float cos_theta = cache[i0 + 0]; - const float sin_theta = cache[i0 + 1]; - - // zeta scaling for xPos only: - float zeta = xpos_base != 0.0f ? 
powf((i0 + 0.4f * ne0) / (1.4f * ne0), p / xpos_base) : 1.0f; - if (xpos_down) zeta = 1.0f / zeta; - - const float * const src = (float *)((char *) src0->data + i3*nb03 + i2*nb02 + i1*nb01 + i0*nb00); - float * dst_data = (float *)((char *) dst->data + i3*nb3 + i2*nb2 + i1*nb1 + i0*nb0); - - const float x0 = src[0]; - const float x1 = src[1]; - - dst_data[0] = x0*cos_theta*zeta - x1*sin_theta*zeta; - dst_data[1] = x0*sin_theta*zeta + x1*cos_theta*zeta; - } - } else { - // TODO: this might be wrong for ne0 != n_dims - need double check - // it seems we have to rope just the first n_dims elements and do nothing with the rest - // ref: https://github.com/ml-explore/mlx/blob/dc2edc762c797e3b8de50b1dad4dc0a131691033/benchmarks/python/llama_jax_bench.py#L11-L26 - theta_base *= freq_scale; - for (int64_t ic = 0; ic < ne0; ic += 2) { - if (ic < n_dims) { - const int64_t ib = 0; - - // simplified from `(ib * n_dims + ic) * inv_ndims` - float cur_rot = inv_ndims * ic - ib; - - float cos_theta, sin_theta; - rope_yarn( - theta_base, freq_scale, corr_dims, cur_rot, ext_factor, attn_factor, - &cos_theta, &sin_theta - ); - sin_theta *= sin_sign; - - theta_base *= theta_scale; - - const int64_t i0 = ib*n_dims + ic/2; - - const float * const src = (float *)((char *) src0->data + i3*nb03 + i2*nb02 + i1*nb01 + i0*nb00); - float * dst_data = (float *)((char *) dst->data + i3*nb3 + i2*nb2 + i1*nb1 + i0*nb0); - - const float x0 = src[0]; - const float x1 = src[n_dims/2]; - - dst_data[0] = x0*cos_theta - x1*sin_theta; - dst_data[n_dims/2] = x0*sin_theta + x1*cos_theta; - } else { - const int64_t i0 = ic; - - const float * const src = (float *)((char *) src0->data + i3*nb03 + i2*nb02 + i1*nb01 + i0*nb00); - float * dst_data = (float *)((char *) dst->data + i3*nb3 + i2*nb2 + i1*nb1 + i0*nb0); - - dst_data[0] = src[0]; - dst_data[1] = src[1]; - } - } - } - } - } - } -} - -static void lm_ggml_compute_forward_rope_f16( - const struct lm_ggml_compute_params * params, - struct lm_ggml_tensor * 
dst, - const bool forward) { - - const struct lm_ggml_tensor * src0 = dst->src[0]; - const struct lm_ggml_tensor * src1 = dst->src[1]; - - if (params->type == LM_GGML_TASK_TYPE_INIT || params->type == LM_GGML_TASK_TYPE_FINALIZE) { - return; - } - - float freq_base, freq_scale, ext_factor, attn_factor, beta_fast, beta_slow; - - //const int n_past = ((int32_t *) dst->op_params)[0]; - const int n_dims = ((int32_t *) dst->op_params)[1]; - const int mode = ((int32_t *) dst->op_params)[2]; - const int n_ctx = ((int32_t *) dst->op_params)[3]; - const int n_orig_ctx = ((int32_t *) dst->op_params)[4]; - memcpy(&freq_base, (int32_t *) dst->op_params + 5, sizeof(float)); - memcpy(&freq_scale, (int32_t *) dst->op_params + 6, sizeof(float)); - memcpy(&ext_factor, (int32_t *) dst->op_params + 7, sizeof(float)); - memcpy(&attn_factor, (int32_t *) dst->op_params + 8, sizeof(float)); - memcpy(&beta_fast, (int32_t *) dst->op_params + 9, sizeof(float)); - memcpy(&beta_slow, (int32_t *) dst->op_params + 10, sizeof(float)); - - LM_GGML_TENSOR_UNARY_OP_LOCALS - - //printf("ne0: %d, ne1: %d, ne2: %d, ne3: %d\n", ne0, ne1, ne2, ne3); - //printf("n_past = %d, ne2 = %d\n", n_past, ne2); - - LM_GGML_ASSERT(nb0 == sizeof(lm_ggml_fp16_t)); - - const int ith = params->ith; - const int nth = params->nth; - - const int nr = lm_ggml_nrows(dst); - - LM_GGML_ASSERT(n_dims <= ne0); - LM_GGML_ASSERT(n_dims % 2 == 0); - - // rows per thread - const int dr = (nr + nth - 1)/nth; - - // row range for this thread - const int ir0 = dr*ith; - const int ir1 = MIN(ir0 + dr, nr); - - // row index used to determine which thread to use - int ir = 0; - - const float theta_scale = powf(freq_base, -2.0f/n_dims); - const float inv_ndims = -1.f/n_dims; - float corr_dims[2]; - lm_ggml_rope_yarn_corr_dims(n_dims, n_orig_ctx, freq_base, beta_fast, beta_slow, corr_dims); - - const bool is_neox = mode & 2; - const bool is_glm = mode & 4; - - // backward process uses inverse rotation by cos and sin. 
- // cos and sin build a rotation matrix, where the inverse is the transpose. - // this essentially just switches the sign of sin. - const float sin_sign = forward ? 1.0f : -1.0f; - - const int32_t * pos = (const int32_t *) src1->data; - - for (int64_t i3 = 0; i3 < ne3; i3++) { - for (int64_t i2 = 0; i2 < ne2; i2++) { - const int64_t p = pos[i2]; - - float * cache = (float *) params->wdata + (ne0 + CACHE_LINE_SIZE_F32)*ith; - if (!is_glm && !is_neox) { // TODO: cache sin/cos for glm, neox - lm_ggml_rope_cache_init(p, freq_scale, corr_dims, ne0, ext_factor, attn_factor, cache, sin_sign, theta_scale); - } - - for (int64_t i1 = 0; i1 < ne1; i1++) { - if (ir++ < ir0) continue; - if (ir > ir1) break; - - float theta_base = (float)p; - - if (is_glm) { - theta_base = MIN(p, n_ctx - 2); - float block_theta = MAX(p - (n_ctx - 2), 0); - for (int64_t i0 = 0; i0 < ne0 / 4; i0++) { - const float cos_theta = cosf(theta_base); - const float sin_theta = sinf(theta_base) * sin_sign; - const float cos_block_theta = cosf(block_theta); - const float sin_block_theta = sinf(block_theta) * sin_sign; - - theta_base *= theta_scale; - block_theta *= theta_scale; - - const lm_ggml_fp16_t * const src = (lm_ggml_fp16_t *)((char *) src0->data + i3*nb03 + i2*nb02 + i1*nb01 + i0*nb00); - lm_ggml_fp16_t * dst_data = (lm_ggml_fp16_t *)((char *) dst->data + i3*nb3 + i2*nb2 + i1*nb1 + i0*nb0); + const float * const src = (float *)((char *) src0->data + i3*nb03 + i2*nb02 + i1*nb01 + i0*nb00); + float * dst_data = (float *)((char *) dst->data + i3*nb3 + i2*nb2 + i1*nb1 + i0*nb0); - const float x0 = LM_GGML_FP16_TO_FP32(src[0]); - const float x1 = LM_GGML_FP16_TO_FP32(src[n_dims/2]); - const float x2 = LM_GGML_FP16_TO_FP32(src[n_dims]); - const float x3 = LM_GGML_FP16_TO_FP32(src[n_dims/2*3]); + const float x0 = src[0]; + const float x1 = src[n_dims/2]; + const float x2 = src[n_dims]; + const float x3 = src[n_dims/2*3]; - dst_data[0] = LM_GGML_FP32_TO_FP16(x0*cos_theta - x1*sin_theta); - 
dst_data[n_dims/2] = LM_GGML_FP32_TO_FP16(x0*sin_theta + x1*cos_theta); - dst_data[n_dims] = LM_GGML_FP32_TO_FP16(x2*cos_block_theta - x3*sin_block_theta); - dst_data[n_dims/2*3] = LM_GGML_FP32_TO_FP16(x2*sin_block_theta + x3*cos_block_theta); + dst_data[0] = x0*cos_theta - x1*sin_theta; + dst_data[n_dims/2] = x0*sin_theta + x1*cos_theta; + dst_data[n_dims] = x2*cos_block_theta - x3*sin_block_theta; + dst_data[n_dims/2*3] = x2*sin_block_theta + x3*cos_block_theta; } } else if (!is_neox) { for (int64_t i0 = 0; i0 < ne0; i0 += 2) { const float cos_theta = cache[i0 + 0]; const float sin_theta = cache[i0 + 1]; - const lm_ggml_fp16_t * const src = (lm_ggml_fp16_t *)((char *) src0->data + i3*nb03 + i2*nb02 + i1*nb01 + i0*nb00); - lm_ggml_fp16_t * dst_data = (lm_ggml_fp16_t *)((char *) dst->data + i3*nb3 + i2*nb2 + i1*nb1 + i0*nb0); - - const float x0 = LM_GGML_FP16_TO_FP32(src[0]); - const float x1 = LM_GGML_FP16_TO_FP32(src[1]); - - dst_data[0] = LM_GGML_FP32_TO_FP16(x0*cos_theta - x1*sin_theta); - dst_data[1] = LM_GGML_FP32_TO_FP16(x0*sin_theta + x1*cos_theta); - } - } else { - // TODO: this might be wrong for ne0 != n_dims - need double check - // it seems we have to rope just the first n_dims elements and do nothing with the rest - // ref: https://github.com/ml-explore/mlx/blob/dc2edc762c797e3b8de50b1dad4dc0a131691033/benchmarks/python/llama_jax_bench.py#L11-L26 - theta_base *= freq_scale; - for (int64_t ic = 0; ic < ne0; ic += 2) { - if (ic < n_dims) { - const int64_t ib = 0; - - // simplified from `(ib * n_dims + ic) * inv_ndims` - float cur_rot = inv_ndims * ic - ib; - - float cos_theta, sin_theta; - rope_yarn( - theta_base, freq_scale, corr_dims, cur_rot, ext_factor, attn_factor, - &cos_theta, &sin_theta - ); - sin_theta *= sin_sign; - - theta_base *= theta_scale; - - const int64_t i0 = ib*n_dims + ic/2; - - const lm_ggml_fp16_t * const src = (lm_ggml_fp16_t *)((char *) src0->data + i3*nb03 + i2*nb02 + i1*nb01 + i0*nb00); - lm_ggml_fp16_t * dst_data = 
(lm_ggml_fp16_t *)((char *) dst->data + i3*nb3 + i2*nb2 + i1*nb1 + i0*nb0); - - const float x0 = LM_GGML_FP16_TO_FP32(src[0]); - const float x1 = LM_GGML_FP16_TO_FP32(src[n_dims/2]); - - dst_data[0] = LM_GGML_FP32_TO_FP16(x0*cos_theta - x1*sin_theta); - dst_data[n_dims/2] = LM_GGML_FP32_TO_FP16(x0*sin_theta + x1*cos_theta); - } else { - const int64_t i0 = ic; - - const lm_ggml_fp16_t * const src = (lm_ggml_fp16_t *)((char *) src0->data + i3*nb03 + i2*nb02 + i1*nb01 + i0*nb00); - lm_ggml_fp16_t * dst_data = (lm_ggml_fp16_t *)((char *) dst->data + i3*nb3 + i2*nb2 + i1*nb1 + i0*nb0); - - dst_data[0] = src[0]; - dst_data[1] = src[1]; - } - } - } - } - } - } -} - -static void lm_ggml_compute_forward_rope( - const struct lm_ggml_compute_params * params, - struct lm_ggml_tensor * dst) { - - const struct lm_ggml_tensor * src0 = dst->src[0]; - - switch (src0->type) { - case LM_GGML_TYPE_F16: - { - lm_ggml_compute_forward_rope_f16(params, dst, true); - } break; - case LM_GGML_TYPE_F32: - { - lm_ggml_compute_forward_rope_f32(params, dst, true); - } break; - default: - { - LM_GGML_ASSERT(false); - } break; - } -} - -// lm_ggml_compute_forward_rope_back - -static void lm_ggml_compute_forward_rope_back( - const struct lm_ggml_compute_params * params, - struct lm_ggml_tensor * dst) { - - const struct lm_ggml_tensor * src0 = dst->src[0]; - - switch (src0->type) { - case LM_GGML_TYPE_F16: - { - lm_ggml_compute_forward_rope_f16(params, dst, false); - } break; - case LM_GGML_TYPE_F32: - { - lm_ggml_compute_forward_rope_f32(params, dst, false); - } break; - default: - { - LM_GGML_ASSERT(false); - } break; - } -} - -// lm_ggml_compute_forward_conv_transpose_1d - -static void lm_ggml_compute_forward_conv_transpose_1d_f16_f32( - const struct lm_ggml_compute_params * params, - struct lm_ggml_tensor * dst) { - - const struct lm_ggml_tensor * src0 = dst->src[0]; - const struct lm_ggml_tensor * src1 = dst->src[1]; - - LM_GGML_ASSERT(src0->type == LM_GGML_TYPE_F16); - 
LM_GGML_ASSERT(src1->type == LM_GGML_TYPE_F32); - LM_GGML_ASSERT( dst->type == LM_GGML_TYPE_F32); - - int64_t t0 = lm_ggml_perf_time_us(); - UNUSED(t0); - - LM_GGML_TENSOR_BINARY_OP_LOCALS - - const int ith = params->ith; - const int nth = params->nth; - - const int nk = ne00*ne01*ne02; - - LM_GGML_ASSERT(nb00 == sizeof(lm_ggml_fp16_t)); - LM_GGML_ASSERT(nb10 == sizeof(float)); - - if (params->type == LM_GGML_TASK_TYPE_INIT) { - if (ith != 0) { - return; - } - memset(params->wdata, 0, params->wsize); - - // permute kernel data (src0) from (K x Cout x Cin) to (Cin x K x Cout) - { - lm_ggml_fp16_t * const wdata = (lm_ggml_fp16_t *) params->wdata + 0; - - for (int64_t i02 = 0; i02 < ne02; i02++) { - for (int64_t i01 = 0; i01 < ne01; i01++) { - const lm_ggml_fp16_t * const src = (lm_ggml_fp16_t *)((char *) src0->data + i02*nb02 + i01*nb01); - lm_ggml_fp16_t * dst_data = wdata + i01*ne00*ne02; - for (int64_t i00 = 0; i00 < ne00; i00++) { - dst_data[i00*ne02 + i02] = src[i00]; - } - } - } - } - - // permute source data (src1) from (L x Cin) to (Cin x L) - { - lm_ggml_fp16_t * const wdata = (lm_ggml_fp16_t *) params->wdata + nk; - lm_ggml_fp16_t * dst_data = wdata; - - for (int64_t i11 = 0; i11 < ne11; i11++) { - const float * const src = (float *)((char *) src1->data + i11*nb11); - for (int64_t i10 = 0; i10 < ne10; i10++) { - dst_data[i10*ne11 + i11] = LM_GGML_FP32_TO_FP16(src[i10]); - } - } - } - - // need to zero dst since we are accumulating into it - memset(dst->data, 0, lm_ggml_nbytes(dst)); - - return; - } - - if (params->type == LM_GGML_TASK_TYPE_FINALIZE) { - return; - } - - const int32_t s0 = ((const int32_t*)(dst->op_params))[0]; - - // total rows in dst - const int nr = ne1; - - // rows per thread - const int dr = (nr + nth - 1)/nth; - - // row range for this thread - const int ir0 = dr*ith; - const int ir1 = MIN(ir0 + dr, nr); - - lm_ggml_fp16_t * const wdata = (lm_ggml_fp16_t *) params->wdata + 0; - lm_ggml_fp16_t * const wdata_src = wdata + nk; - - for (int 
i1 = ir0; i1 < ir1; i1++) { - float * dst_data = (float *)((char *) dst->data + i1*nb1); - lm_ggml_fp16_t * wdata_kernel = wdata + i1*ne02*ne00; - for (int i10 = 0; i10 < ne10; i10++) { - const int i1n = i10*ne11; - for (int i00 = 0; i00 < ne00; i00++) { - float v = 0; - lm_ggml_vec_dot_f16(ne02, &v, 0, - (lm_ggml_fp16_t *) wdata_src + i1n, 0, - (lm_ggml_fp16_t *) wdata_kernel + i00*ne02, 0, 1); - dst_data[i10*s0 + i00] += v; - } - } - } -} - -static void lm_ggml_compute_forward_conv_transpose_1d_f32( - const struct lm_ggml_compute_params * params, - struct lm_ggml_tensor * dst) { - - const struct lm_ggml_tensor * src0 = dst->src[0]; - const struct lm_ggml_tensor * src1 = dst->src[1]; - - LM_GGML_ASSERT(src0->type == LM_GGML_TYPE_F32); - LM_GGML_ASSERT(src1->type == LM_GGML_TYPE_F32); - LM_GGML_ASSERT( dst->type == LM_GGML_TYPE_F32); - - int64_t t0 = lm_ggml_perf_time_us(); - UNUSED(t0); - - LM_GGML_TENSOR_BINARY_OP_LOCALS - - const int ith = params->ith; - const int nth = params->nth; - - const int nk = ne00*ne01*ne02; - - LM_GGML_ASSERT(nb00 == sizeof(float)); - LM_GGML_ASSERT(nb10 == sizeof(float)); - - if (params->type == LM_GGML_TASK_TYPE_INIT) { - if (ith != 0) { - return; - } - memset(params->wdata, 0, params->wsize); - - // prepare kernel data (src0) from (K x Cout x Cin) to (Cin x K x Cout) - { - float * const wdata = (float *) params->wdata + 0; + // zeta scaling for xPos only: + float zeta = xpos_base != 0.0f ? 
powf((i0 + 0.4f * ne0) / (1.4f * ne0), p / xpos_base) : 1.0f; + if (xpos_down) zeta = 1.0f / zeta; - for (int64_t i02 = 0; i02 < ne02; i02++) { - for (int64_t i01 = 0; i01 < ne01; i01++) { - const float * const src = (float *)((char *) src0->data + i02*nb02 + i01*nb01); - float * dst_data = wdata + i01*ne00*ne02; - for (int64_t i00 = 0; i00 < ne00; i00++) { - dst_data[i00*ne02 + i02] = src[i00]; - } - } - } - } + const float * const src = (float *)((char *) src0->data + i3*nb03 + i2*nb02 + i1*nb01 + i0*nb00); + float * dst_data = (float *)((char *) dst->data + i3*nb3 + i2*nb2 + i1*nb1 + i0*nb0); - // prepare source data (src1) - { - float * const wdata = (float *) params->wdata + nk; - float * dst_data = wdata; + const float x0 = src[0]; + const float x1 = src[1]; - for (int64_t i11 = 0; i11 < ne11; i11++) { - const float * const src = (float *)((char *) src1->data + i11*nb11); - for (int64_t i10 = 0; i10 < ne10; i10++) { - dst_data[i10*ne11 + i11] = src[i10]; - } - } - } + dst_data[0] = x0*cos_theta*zeta - x1*sin_theta*zeta; + dst_data[1] = x0*sin_theta*zeta + x1*cos_theta*zeta; + } + } else { + // TODO: this might be wrong for ne0 != n_dims - need double check + // it seems we have to rope just the first n_dims elements and do nothing with the rest + // ref: https://github.com/ml-explore/mlx/blob/dc2edc762c797e3b8de50b1dad4dc0a131691033/benchmarks/python/llama_jax_bench.py#L11-L26 + theta_base *= freq_scale; + for (int64_t ic = 0; ic < ne0; ic += 2) { + if (ic < n_dims) { + const int64_t ib = 0; - // need to zero dst since we are accumulating into it - memset(dst->data, 0, lm_ggml_nbytes(dst)); + // simplified from `(ib * n_dims + ic) * inv_ndims` + float cur_rot = inv_ndims * ic - ib; + float freq_factor = freq_factors ? 
freq_factors[ic/2] : 1.0f; - return; - } + float cos_theta, sin_theta; + rope_yarn( + theta_base/freq_factor, freq_scale, corr_dims, cur_rot, ext_factor, attn_factor, + &cos_theta, &sin_theta + ); + sin_theta *= sin_sign; - if (params->type == LM_GGML_TASK_TYPE_FINALIZE) { - return; - } + theta_base *= theta_scale; - const int32_t s0 = ((const int32_t*)(dst->op_params))[0]; + const int64_t i0 = ib*n_dims + ic/2; - // total rows in dst - const int nr = ne1; + const float * const src = (float *)((char *) src0->data + i3*nb03 + i2*nb02 + i1*nb01 + i0*nb00); + float * dst_data = (float *)((char *) dst->data + i3*nb3 + i2*nb2 + i1*nb1 + i0*nb0); - // rows per thread - const int dr = (nr + nth - 1)/nth; + const float x0 = src[0]; + const float x1 = src[n_dims/2]; - // row range for this thread - const int ir0 = dr*ith; - const int ir1 = MIN(ir0 + dr, nr); + dst_data[0] = x0*cos_theta - x1*sin_theta; + dst_data[n_dims/2] = x0*sin_theta + x1*cos_theta; + } else { + const int64_t i0 = ic; - float * const wdata = (float *) params->wdata + 0; - float * const wdata_src = wdata + nk; + const float * const src = (float *)((char *) src0->data + i3*nb03 + i2*nb02 + i1*nb01 + i0*nb00); + float * dst_data = (float *)((char *) dst->data + i3*nb3 + i2*nb2 + i1*nb1 + i0*nb0); - for (int i1 = ir0; i1 < ir1; i1++) { - float * dst_data = (float *)((char *) dst->data + i1*nb1); - float * wdata_kernel = wdata + i1*ne02*ne00; - for (int i10 = 0; i10 < ne10; i10++) { - const int i1n = i10*ne11; - for (int i00 = 0; i00 < ne00; i00++) { - float v = 0; - lm_ggml_vec_dot_f32(ne02, &v, 0, - wdata_src + i1n, 0, - wdata_kernel + i00*ne02, 0, 1); - dst_data[i10*s0 + i00] += v; + dst_data[0] = src[0]; + dst_data[1] = src[1]; + } + } + } } } } } -static void lm_ggml_compute_forward_conv_transpose_1d( +// TODO: deduplicate f16/f32 code +static void lm_ggml_compute_forward_rope_f16( const struct lm_ggml_compute_params * params, - struct lm_ggml_tensor * dst) { + struct lm_ggml_tensor * dst, + const bool 
forward) { const struct lm_ggml_tensor * src0 = dst->src[0]; + const struct lm_ggml_tensor * src1 = dst->src[1]; + const struct lm_ggml_tensor * src2 = dst->src[2]; - switch (src0->type) { - case LM_GGML_TYPE_F16: - { - lm_ggml_compute_forward_conv_transpose_1d_f16_f32(params, dst); - } break; - case LM_GGML_TYPE_F32: - { - lm_ggml_compute_forward_conv_transpose_1d_f32(params, dst); - } break; - default: - { - LM_GGML_ASSERT(false); - } break; + if (params->type == LM_GGML_TASK_TYPE_INIT || params->type == LM_GGML_TASK_TYPE_FINALIZE) { + return; } -} - -// src0: kernel [OC, IC, KH, KW] -// src1: image [N, IC, IH, IW] -// dst: result [N, OH, OW, IC*KH*KW] -static void lm_ggml_compute_forward_im2col_f32( - const struct lm_ggml_compute_params * params, - struct lm_ggml_tensor * dst) { - const struct lm_ggml_tensor * src0 = dst->src[0]; - const struct lm_ggml_tensor * src1 = dst->src[1]; + float freq_base, freq_scale, ext_factor, attn_factor, beta_fast, beta_slow; - LM_GGML_ASSERT(src0->type == LM_GGML_TYPE_F16); - LM_GGML_ASSERT(src1->type == LM_GGML_TYPE_F32); - LM_GGML_ASSERT( dst->type == LM_GGML_TYPE_F32); + //const int n_past = ((int32_t *) dst->op_params)[0]; + const int n_dims = ((int32_t *) dst->op_params)[1]; + const int mode = ((int32_t *) dst->op_params)[2]; + const int n_ctx = ((int32_t *) dst->op_params)[3]; + const int n_orig_ctx = ((int32_t *) dst->op_params)[4]; + memcpy(&freq_base, (int32_t *) dst->op_params + 5, sizeof(float)); + memcpy(&freq_scale, (int32_t *) dst->op_params + 6, sizeof(float)); + memcpy(&ext_factor, (int32_t *) dst->op_params + 7, sizeof(float)); + memcpy(&attn_factor, (int32_t *) dst->op_params + 8, sizeof(float)); + memcpy(&beta_fast, (int32_t *) dst->op_params + 9, sizeof(float)); + memcpy(&beta_slow, (int32_t *) dst->op_params + 10, sizeof(float)); - int64_t t0 = lm_ggml_perf_time_us(); - UNUSED(t0); + LM_GGML_TENSOR_UNARY_OP_LOCALS - LM_GGML_TENSOR_BINARY_OP_LOCALS; + //printf("ne0: %d, ne1: %d, ne2: %d, ne3: %d\n", ne0, ne1, 
ne2, ne3); + //printf("n_past = %d, ne2 = %d\n", n_past, ne2); - const int32_t s0 = ((const int32_t *)(dst->op_params))[0]; - const int32_t s1 = ((const int32_t *)(dst->op_params))[1]; - const int32_t p0 = ((const int32_t *)(dst->op_params))[2]; - const int32_t p1 = ((const int32_t *)(dst->op_params))[3]; - const int32_t d0 = ((const int32_t *)(dst->op_params))[4]; - const int32_t d1 = ((const int32_t *)(dst->op_params))[5]; - const bool is_2D = ((const int32_t *)(dst->op_params))[6] == 1; + LM_GGML_ASSERT(nb0 == sizeof(lm_ggml_fp16_t)); const int ith = params->ith; const int nth = params->nth; - const int64_t N = is_2D ? ne13 : ne12; - const int64_t IC = is_2D ? ne12 : ne11; - const int64_t IH = is_2D ? ne11 : 1; - const int64_t IW = ne10; + const int nr = lm_ggml_nrows(dst); - const int64_t KH = is_2D ? ne01 : 1; - const int64_t KW = ne00; + LM_GGML_ASSERT(n_dims <= ne0); + LM_GGML_ASSERT(n_dims % 2 == 0); - const int64_t OH = is_2D ? ne2 : 1; - const int64_t OW = ne1; + // rows per thread + const int dr = (nr + nth - 1)/nth; - int ofs0 = is_2D ? nb13 : nb12; - int ofs1 = is_2D ? 
nb12 : nb11; + // row range for this thread + const int ir0 = dr*ith; + const int ir1 = MIN(ir0 + dr, nr); - LM_GGML_ASSERT(nb00 == sizeof(lm_ggml_fp16_t)); - LM_GGML_ASSERT(nb10 == sizeof(float)); + // row index used to determine which thread to use + int ir = 0; - if (params->type == LM_GGML_TASK_TYPE_INIT) { - return; - } + const float theta_scale = powf(freq_base, -2.0f/n_dims); + const float inv_ndims = -1.f/n_dims; + float corr_dims[2]; + lm_ggml_rope_yarn_corr_dims(n_dims, n_orig_ctx, freq_base, beta_fast, beta_slow, corr_dims); - if (params->type == LM_GGML_TASK_TYPE_FINALIZE) { - return; - } + const bool is_neox = mode & 2; + const bool is_glm = mode & 4; - // im2col: [N, IC, IH, IW] => [N, OH, OW, IC*KH*KW] - { - float * const wdata = (float *) dst->data; + const float * freq_factors = NULL; + if (is_neox) { + if (src2 != NULL) { + LM_GGML_ASSERT(src2->type == LM_GGML_TYPE_F32); + LM_GGML_ASSERT(src2->ne[0] >= n_dims / 2); + freq_factors = (const float *) src2->data; + } + } else { + LM_GGML_ASSERT(src2 == NULL && "TODO: freq_factors not implemented for !is_neox"); + } - for (int64_t in = 0; in < N; in++) { - for (int64_t ioh = 0; ioh < OH; ioh++) { // 1 - for (int64_t iow = 0; iow < OW; iow++) { - for (int64_t iic = ith; iic < IC; iic += nth) { + // backward process uses inverse rotation by cos and sin. + // cos and sin build a rotation matrix, where the inverse is the transpose. + // this essentially just switches the sign of sin. + const float sin_sign = forward ? 
1.0f : -1.0f; - // micro kernel - float * dst_data = wdata + (in*OH*OW + ioh*OW + iow)*(IC*KH*KW); // [IC, KH, KW] - const float * const src_data = (float *)((char *) src1->data + in*ofs0 + iic*ofs1); // [IH, IW] + const int32_t * pos = (const int32_t *) src1->data; - for (int64_t ikh = 0; ikh < KH; ikh++) { // 1 - for (int64_t ikw = 0; ikw < KW; ikw++) { - const int64_t iiw = iow*s0 + ikw*d0 - p0; - const int64_t iih = ioh*s1 + ikh*d1 - p1; + for (int64_t i3 = 0; i3 < ne3; i3++) { + for (int64_t i2 = 0; i2 < ne2; i2++) { + const int64_t p = pos[i2]; - if (iih < 0 || iih >= IH || iiw < 0 || iiw >= IW) { - dst_data[iic*(KH*KW) + ikh*KW + ikw] = 0; - } else { - dst_data[iic*(KH*KW) + ikh*KW + ikw] = (src_data[iih*IW + iiw]); - } - } - } - } - } + float * cache = (float *) params->wdata + (ne0 + CACHE_LINE_SIZE_F32)*ith; + if (!is_glm && !is_neox) { // TODO: cache sin/cos for glm, neox + lm_ggml_rope_cache_init(p, freq_scale, corr_dims, ne0, ext_factor, attn_factor, cache, sin_sign, theta_scale); } - } - } -} - - -// src0: kernel [OC, IC, KH, KW] -// src1: image [N, IC, IH, IW] -// dst: result [N, OH, OW, IC*KH*KW] -static void lm_ggml_compute_forward_im2col_f16( - const struct lm_ggml_compute_params * params, - struct lm_ggml_tensor * dst) { - const struct lm_ggml_tensor * src0 = dst->src[0]; - const struct lm_ggml_tensor * src1 = dst->src[1]; + for (int64_t i1 = 0; i1 < ne1; i1++) { + if (ir++ < ir0) continue; + if (ir > ir1) break; - LM_GGML_ASSERT(src0->type == LM_GGML_TYPE_F16); - LM_GGML_ASSERT(src1->type == LM_GGML_TYPE_F32); - LM_GGML_ASSERT( dst->type == LM_GGML_TYPE_F16); + float theta_base = (float)p; - int64_t t0 = lm_ggml_perf_time_us(); - UNUSED(t0); + if (is_glm) { + theta_base = MIN(p, n_ctx - 2); + float block_theta = MAX(p - (n_ctx - 2), 0); + for (int64_t i0 = 0; i0 < ne0 / 4; i0++) { + const float cos_theta = cosf(theta_base); + const float sin_theta = sinf(theta_base) * sin_sign; + const float cos_block_theta = cosf(block_theta); + const float 
sin_block_theta = sinf(block_theta) * sin_sign; - LM_GGML_TENSOR_BINARY_OP_LOCALS; + theta_base *= theta_scale; + block_theta *= theta_scale; - const int32_t s0 = ((const int32_t *)(dst->op_params))[0]; - const int32_t s1 = ((const int32_t *)(dst->op_params))[1]; - const int32_t p0 = ((const int32_t *)(dst->op_params))[2]; - const int32_t p1 = ((const int32_t *)(dst->op_params))[3]; - const int32_t d0 = ((const int32_t *)(dst->op_params))[4]; - const int32_t d1 = ((const int32_t *)(dst->op_params))[5]; - const bool is_2D = ((const int32_t *)(dst->op_params))[6] == 1; + const lm_ggml_fp16_t * const src = (lm_ggml_fp16_t *)((char *) src0->data + i3*nb03 + i2*nb02 + i1*nb01 + i0*nb00); + lm_ggml_fp16_t * dst_data = (lm_ggml_fp16_t *)((char *) dst->data + i3*nb3 + i2*nb2 + i1*nb1 + i0*nb0); - const int ith = params->ith; - const int nth = params->nth; + const float x0 = LM_GGML_FP16_TO_FP32(src[0]); + const float x1 = LM_GGML_FP16_TO_FP32(src[n_dims/2]); + const float x2 = LM_GGML_FP16_TO_FP32(src[n_dims]); + const float x3 = LM_GGML_FP16_TO_FP32(src[n_dims/2*3]); - const int64_t N = is_2D ? ne13 : ne12; - const int64_t IC = is_2D ? ne12 : ne11; - const int64_t IH = is_2D ? ne11 : 1; - const int64_t IW = ne10; + dst_data[0] = LM_GGML_FP32_TO_FP16(x0*cos_theta - x1*sin_theta); + dst_data[n_dims/2] = LM_GGML_FP32_TO_FP16(x0*sin_theta + x1*cos_theta); + dst_data[n_dims] = LM_GGML_FP32_TO_FP16(x2*cos_block_theta - x3*sin_block_theta); + dst_data[n_dims/2*3] = LM_GGML_FP32_TO_FP16(x2*sin_block_theta + x3*cos_block_theta); + } + } else if (!is_neox) { + for (int64_t i0 = 0; i0 < ne0; i0 += 2) { + const float cos_theta = cache[i0 + 0]; + const float sin_theta = cache[i0 + 1]; - const int64_t KH = is_2D ? 
ne01 : 1; - const int64_t KW = ne00; + const lm_ggml_fp16_t * const src = (lm_ggml_fp16_t *)((char *) src0->data + i3*nb03 + i2*nb02 + i1*nb01 + i0*nb00); + lm_ggml_fp16_t * dst_data = (lm_ggml_fp16_t *)((char *) dst->data + i3*nb3 + i2*nb2 + i1*nb1 + i0*nb0); - const int64_t OH = is_2D ? ne2 : 1; - const int64_t OW = ne1; + const float x0 = LM_GGML_FP16_TO_FP32(src[0]); + const float x1 = LM_GGML_FP16_TO_FP32(src[1]); - int ofs0 = is_2D ? nb13 : nb12; - int ofs1 = is_2D ? nb12 : nb11; + dst_data[0] = LM_GGML_FP32_TO_FP16(x0*cos_theta - x1*sin_theta); + dst_data[1] = LM_GGML_FP32_TO_FP16(x0*sin_theta + x1*cos_theta); + } + } else { + // TODO: this might be wrong for ne0 != n_dims - need double check + // it seems we have to rope just the first n_dims elements and do nothing with the rest + // ref: https://github.com/ml-explore/mlx/blob/dc2edc762c797e3b8de50b1dad4dc0a131691033/benchmarks/python/llama_jax_bench.py#L11-L26 + theta_base *= freq_scale; + for (int64_t ic = 0; ic < ne0; ic += 2) { + if (ic < n_dims) { + const int64_t ib = 0; - LM_GGML_ASSERT(nb00 == sizeof(lm_ggml_fp16_t)); - LM_GGML_ASSERT(nb10 == sizeof(float)); + // simplified from `(ib * n_dims + ic) * inv_ndims` + float cur_rot = inv_ndims * ic - ib; + float freq_factor = freq_factors ? 
freq_factors[ic/2] : 1.0f; - if (params->type == LM_GGML_TASK_TYPE_INIT) { - return; - } + float cos_theta, sin_theta; + rope_yarn( + theta_base/freq_factor, freq_scale, corr_dims, cur_rot, ext_factor, attn_factor, + &cos_theta, &sin_theta + ); + sin_theta *= sin_sign; - if (params->type == LM_GGML_TASK_TYPE_FINALIZE) { - return; - } + theta_base *= theta_scale; - // im2col: [N, IC, IH, IW] => [N, OH, OW, IC*KH*KW] - { - lm_ggml_fp16_t * const wdata = (lm_ggml_fp16_t *) dst->data; + const int64_t i0 = ib*n_dims + ic/2; - for (int64_t in = 0; in < N; in++) { - for (int64_t ioh = 0; ioh < OH; ioh++) { // 1 - for (int64_t iow = 0; iow < OW; iow++) { - for (int64_t iic = ith; iic < IC; iic += nth) { + const lm_ggml_fp16_t * const src = (lm_ggml_fp16_t *)((char *) src0->data + i3*nb03 + i2*nb02 + i1*nb01 + i0*nb00); + lm_ggml_fp16_t * dst_data = (lm_ggml_fp16_t *)((char *) dst->data + i3*nb3 + i2*nb2 + i1*nb1 + i0*nb0); - // micro kernel - lm_ggml_fp16_t * dst_data = wdata + (in*OH*OW + ioh*OW + iow)*(IC*KH*KW); // [IC, KH, KW] - const float * const src_data = (float *)((char *) src1->data + in*ofs0 + iic*ofs1); // [IH, IW] + const float x0 = LM_GGML_FP16_TO_FP32(src[0]); + const float x1 = LM_GGML_FP16_TO_FP32(src[n_dims/2]); - for (int64_t ikh = 0; ikh < KH; ikh++) { // 1 - for (int64_t ikw = 0; ikw < KW; ikw++) { - const int64_t iiw = iow*s0 + ikw*d0 - p0; - const int64_t iih = ioh*s1 + ikh*d1 - p1; + dst_data[0] = LM_GGML_FP32_TO_FP16(x0*cos_theta - x1*sin_theta); + dst_data[n_dims/2] = LM_GGML_FP32_TO_FP16(x0*sin_theta + x1*cos_theta); + } else { + const int64_t i0 = ic; - if (iih < 0 || iih >= IH || iiw < 0 || iiw >= IW) { - dst_data[iic*(KH*KW) + ikh*KW + ikw] = 0; - } else { - dst_data[iic*(KH*KW) + ikh*KW + ikw] = LM_GGML_FP32_TO_FP16(src_data[iih*IW + iiw]); - } - } + const lm_ggml_fp16_t * const src = (lm_ggml_fp16_t *)((char *) src0->data + i3*nb03 + i2*nb02 + i1*nb01 + i0*nb00); + lm_ggml_fp16_t * dst_data = (lm_ggml_fp16_t *)((char *) dst->data + i3*nb3 + 
i2*nb2 + i1*nb1 + i0*nb0); + + dst_data[0] = src[0]; + dst_data[1] = src[1]; } } } @@ -13707,17 +14655,20 @@ static void lm_ggml_compute_forward_im2col_f16( } } -static void lm_ggml_compute_forward_im2col( +static void lm_ggml_compute_forward_rope( const struct lm_ggml_compute_params * params, - struct lm_ggml_tensor * dst) { - switch (dst->type) { + struct lm_ggml_tensor * dst) { + + const struct lm_ggml_tensor * src0 = dst->src[0]; + + switch (src0->type) { case LM_GGML_TYPE_F16: { - lm_ggml_compute_forward_im2col_f16(params, dst); + lm_ggml_compute_forward_rope_f16(params, dst, true); } break; case LM_GGML_TYPE_F32: { - lm_ggml_compute_forward_im2col_f32(params, dst); + lm_ggml_compute_forward_rope_f32(params, dst, true); } break; default: { @@ -13726,10 +14677,33 @@ static void lm_ggml_compute_forward_im2col( } } +// lm_ggml_compute_forward_rope_back -// lm_ggml_compute_forward_conv_transpose_2d +static void lm_ggml_compute_forward_rope_back( + const struct lm_ggml_compute_params * params, + struct lm_ggml_tensor * dst) { -static void lm_ggml_compute_forward_conv_transpose_2d( + const struct lm_ggml_tensor * src0 = dst->src[0]; + + switch (src0->type) { + case LM_GGML_TYPE_F16: + { + lm_ggml_compute_forward_rope_f16(params, dst, false); + } break; + case LM_GGML_TYPE_F32: + { + lm_ggml_compute_forward_rope_f32(params, dst, false); + } break; + default: + { + LM_GGML_ASSERT(false); + } break; + } +} + +// lm_ggml_compute_forward_conv_transpose_1d + +static void lm_ggml_compute_forward_conv_transpose_1d_f16_f32( const struct lm_ggml_compute_params * params, struct lm_ggml_tensor * dst) { @@ -13748,7 +14722,7 @@ static void lm_ggml_compute_forward_conv_transpose_2d( const int ith = params->ith; const int nth = params->nth; - const int nk = ne00*ne01*ne02*ne03; + const int nk = ne00*ne01*ne02; LM_GGML_ASSERT(nb00 == sizeof(lm_ggml_fp16_t)); LM_GGML_ASSERT(nb10 == sizeof(float)); @@ -13759,37 +14733,35 @@ static void lm_ggml_compute_forward_conv_transpose_2d( } 
memset(params->wdata, 0, params->wsize); - // permute kernel data (src0) from (Kw x Kh x Cout x Cin) to (Cin x Kw x Kh x Cout) + // permute kernel data (src0) from (K x Cout x Cin) to (Cin x K x Cout) { lm_ggml_fp16_t * const wdata = (lm_ggml_fp16_t *) params->wdata + 0; - for (int64_t i03 = 0; i03 < ne03; i03++) { - for (int64_t i02 = 0; i02 < ne02; i02++) { - const lm_ggml_fp16_t * const src = (lm_ggml_fp16_t *)((char *) src0->data + i03*nb03 + i02*nb02); - lm_ggml_fp16_t * dst_data = wdata + i02*ne01*ne00*ne03; - for (int64_t i01 = 0; i01 < ne01; i01++) { - for (int64_t i00 = 0; i00 < ne00; i00++) { - dst_data[i01*ne00*ne03 + i00*ne03 + i03] = src[i01 * ne00 + i00]; - } + for (int64_t i02 = 0; i02 < ne02; i02++) { + for (int64_t i01 = 0; i01 < ne01; i01++) { + const lm_ggml_fp16_t * const src = (lm_ggml_fp16_t *)((char *) src0->data + i02*nb02 + i01*nb01); + lm_ggml_fp16_t * dst_data = wdata + i01*ne00*ne02; + for (int64_t i00 = 0; i00 < ne00; i00++) { + dst_data[i00*ne02 + i02] = src[i00]; } } } } - // permute source data (src1) from (Sw x Sh x Cin) to (Cin x Sw x Sh) + // permute source data (src1) from (L x Cin) to (Cin x L) { lm_ggml_fp16_t * const wdata = (lm_ggml_fp16_t *) params->wdata + nk; - for (int i12 = 0; i12 < ne12; i12++) { - for (int i11 = 0; i11 < ne11; i11++) { - const float * const src = (float *)((char *) src1->data + i12*nb12 + i11*nb11); - lm_ggml_fp16_t * dst_data = wdata + i11*ne10*ne12; - for (int i10 = 0; i10 < ne10; i10++) { - dst_data[i10*ne12 + i12] = LM_GGML_FP32_TO_FP16(src[i10]); - } + lm_ggml_fp16_t * dst_data = wdata; + + for (int64_t i11 = 0; i11 < ne11; i11++) { + const float * const src = (float *)((char *) src1->data + i11*nb11); + for (int64_t i10 = 0; i10 < ne10; i10++) { + dst_data[i10*ne11 + i11] = LM_GGML_FP32_TO_FP16(src[i10]); } } } + // need to zero dst since we are accumulating into it memset(dst->data, 0, lm_ggml_nbytes(dst)); return; @@ -13799,284 +14771,326 @@ static void 
lm_ggml_compute_forward_conv_transpose_2d( return; } - const int32_t stride = lm_ggml_get_op_params_i32(dst, 0); + const int32_t s0 = ((const int32_t*)(dst->op_params))[0]; - // total patches in dst - const int np = ne2; + // total rows in dst + const int nr = ne1; - // patches per thread - const int dp = (np + nth - 1)/nth; + // rows per thread + const int dr = (nr + nth - 1)/nth; - // patch range for this thread - const int ip0 = dp*ith; - const int ip1 = MIN(ip0 + dp, np); + // row range for this thread + const int ir0 = dr*ith; + const int ir1 = MIN(ir0 + dr, nr); - lm_ggml_fp16_t * const wdata = (lm_ggml_fp16_t *) params->wdata + 0; + lm_ggml_fp16_t * const wdata = (lm_ggml_fp16_t *) params->wdata + 0; lm_ggml_fp16_t * const wdata_src = wdata + nk; - for (int i2 = ip0; i2 < ip1; i2++) { // Cout - float * dst_data = (float *)((char *) dst->data + i2*nb2); - lm_ggml_fp16_t * wdata_kernel = wdata + i2*ne01*ne00*ne03; - for (int i11 = 0; i11 < ne11; i11++) { - for (int i10 = 0; i10 < ne10; i10++) { - const int i1n = i11*ne10*ne12 + i10*ne12; - for (int i01 = 0; i01 < ne01; i01++) { - for (int i00 = 0; i00 < ne00; i00++) { - float v = 0; - lm_ggml_vec_dot_f16(ne03, &v, 0, - wdata_src + i1n, 0, - wdata_kernel + i01*ne00*ne03 + i00*ne03, 0, 1); - dst_data[(i11*stride + i01)*ne0 + i10*stride + i00] += v; + for (int i1 = ir0; i1 < ir1; i1++) { + float * dst_data = (float *)((char *) dst->data + i1*nb1); + lm_ggml_fp16_t * wdata_kernel = wdata + i1*ne02*ne00; + for (int i10 = 0; i10 < ne10; i10++) { + const int i1n = i10*ne11; + for (int i00 = 0; i00 < ne00; i00++) { + float v = 0; + lm_ggml_vec_dot_f16(ne02, &v, 0, + (lm_ggml_fp16_t *) wdata_src + i1n, 0, + (lm_ggml_fp16_t *) wdata_kernel + i00*ne02, 0, 1); + dst_data[i10*s0 + i00] += v; + } + } + } +} + +static void lm_ggml_compute_forward_conv_transpose_1d_f32( + const struct lm_ggml_compute_params * params, + struct lm_ggml_tensor * dst) { + + const struct lm_ggml_tensor * src0 = dst->src[0]; + const struct 
lm_ggml_tensor * src1 = dst->src[1]; + + LM_GGML_ASSERT(src0->type == LM_GGML_TYPE_F32); + LM_GGML_ASSERT(src1->type == LM_GGML_TYPE_F32); + LM_GGML_ASSERT( dst->type == LM_GGML_TYPE_F32); + + int64_t t0 = lm_ggml_perf_time_us(); + UNUSED(t0); + + LM_GGML_TENSOR_BINARY_OP_LOCALS + + const int ith = params->ith; + const int nth = params->nth; + + const int nk = ne00*ne01*ne02; + + LM_GGML_ASSERT(nb00 == sizeof(float)); + LM_GGML_ASSERT(nb10 == sizeof(float)); + + if (params->type == LM_GGML_TASK_TYPE_INIT) { + if (ith != 0) { + return; + } + memset(params->wdata, 0, params->wsize); + + // prepare kernel data (src0) from (K x Cout x Cin) to (Cin x K x Cout) + { + float * const wdata = (float *) params->wdata + 0; + + for (int64_t i02 = 0; i02 < ne02; i02++) { + for (int64_t i01 = 0; i01 < ne01; i01++) { + const float * const src = (float *)((char *) src0->data + i02*nb02 + i01*nb01); + float * dst_data = wdata + i01*ne00*ne02; + for (int64_t i00 = 0; i00 < ne00; i00++) { + dst_data[i00*ne02 + i02] = src[i00]; } } } } - } -} -// lm_ggml_compute_forward_pool_1d_sk_p0 + // prepare source data (src1) + { + float * const wdata = (float *) params->wdata + nk; + float * dst_data = wdata; -static void lm_ggml_compute_forward_pool_1d_sk_p0( - const struct lm_ggml_compute_params * params, - const enum lm_ggml_op_pool op, - const int k, - struct lm_ggml_tensor * dst) { + for (int64_t i11 = 0; i11 < ne11; i11++) { + const float * const src = (float *)((char *) src1->data + i11*nb11); + for (int64_t i10 = 0; i10 < ne10; i10++) { + dst_data[i10*ne11 + i11] = src[i10]; + } + } + } - const struct lm_ggml_tensor * src = dst->src[0]; + // need to zero dst since we are accumulating into it + memset(dst->data, 0, lm_ggml_nbytes(dst)); - assert(src->type == LM_GGML_TYPE_F32); - assert(params->ith == 0); + return; + } - if (params->type == LM_GGML_TASK_TYPE_INIT || params->type == LM_GGML_TASK_TYPE_FINALIZE) { + if (params->type == LM_GGML_TASK_TYPE_FINALIZE) { return; } - const char * 
cdata = (const char *)src->data; - const char * const data_end = cdata + lm_ggml_nbytes(src); - float * drow = (float *)dst->data; + const int32_t s0 = ((const int32_t*)(dst->op_params))[0]; - const int64_t rs = dst->ne[0]; + // total rows in dst + const int nr = ne1; - while (cdata < data_end) { - const float * const srow = (const float *)cdata; + // rows per thread + const int dr = (nr + nth - 1)/nth; - int j = 0; + // row range for this thread + const int ir0 = dr*ith; + const int ir1 = MIN(ir0 + dr, nr); - for (int64_t i = 0; i < rs; ++i) { - switch (op) { - case LM_GGML_OP_POOL_AVG: drow[i] = 0; break; - case LM_GGML_OP_POOL_MAX: drow[i] = -FLT_MAX; break; - case LM_GGML_OP_POOL_COUNT: LM_GGML_ASSERT(false); break; - } - for (int ki = 0; ki < k; ++ki) { - switch (op) { - case LM_GGML_OP_POOL_AVG: drow[i] += srow[j]; break; - case LM_GGML_OP_POOL_MAX: if (srow[j] > drow[i]) drow[i] = srow[j]; break; - case LM_GGML_OP_POOL_COUNT: LM_GGML_ASSERT(false); break; - } - ++j; - } - switch (op) { - case LM_GGML_OP_POOL_AVG: drow[i] /= k; break; - case LM_GGML_OP_POOL_MAX: break; - case LM_GGML_OP_POOL_COUNT: LM_GGML_ASSERT(false); break; + float * const wdata = (float *) params->wdata + 0; + float * const wdata_src = wdata + nk; + + for (int i1 = ir0; i1 < ir1; i1++) { + float * dst_data = (float *)((char *) dst->data + i1*nb1); + float * wdata_kernel = wdata + i1*ne02*ne00; + for (int i10 = 0; i10 < ne10; i10++) { + const int i1n = i10*ne11; + for (int i00 = 0; i00 < ne00; i00++) { + float v = 0; + lm_ggml_vec_dot_f32(ne02, &v, 0, + wdata_src + i1n, 0, + wdata_kernel + i00*ne02, 0, 1); + dst_data[i10*s0 + i00] += v; } } - - cdata += src->nb[1]; - drow += rs; } } -// lm_ggml_compute_forward_pool_1d - -static void lm_ggml_compute_forward_pool_1d( +static void lm_ggml_compute_forward_conv_transpose_1d( const struct lm_ggml_compute_params * params, struct lm_ggml_tensor * dst) { - const int32_t * opts = (const int32_t *)dst->op_params; - enum lm_ggml_op_pool op = opts[0]; 
- const int k0 = opts[1]; - const int s0 = opts[2]; - const int p0 = opts[3]; - LM_GGML_ASSERT(p0 == 0); // padding not supported - LM_GGML_ASSERT(k0 == s0); // only s = k supported + const struct lm_ggml_tensor * src0 = dst->src[0]; - lm_ggml_compute_forward_pool_1d_sk_p0(params, op, k0, dst); + switch (src0->type) { + case LM_GGML_TYPE_F16: + { + lm_ggml_compute_forward_conv_transpose_1d_f16_f32(params, dst); + } break; + case LM_GGML_TYPE_F32: + { + lm_ggml_compute_forward_conv_transpose_1d_f32(params, dst); + } break; + default: + { + LM_GGML_ASSERT(false); + } break; + } } -// lm_ggml_compute_forward_pool_2d - -static void lm_ggml_compute_forward_pool_2d( +// src0: kernel [OC, IC, KH, KW] +// src1: image [N, IC, IH, IW] +// dst: result [N, OH, OW, IC*KH*KW] +static void lm_ggml_compute_forward_im2col_f32( const struct lm_ggml_compute_params * params, - struct lm_ggml_tensor * dst) { - - const struct lm_ggml_tensor * src = dst->src[0]; - - LM_GGML_ASSERT(src->type == LM_GGML_TYPE_F32); - LM_GGML_ASSERT(params->ith == 0); - - if (params->type == LM_GGML_TASK_TYPE_INIT || params->type == LM_GGML_TASK_TYPE_FINALIZE) { - return; - } + struct lm_ggml_tensor * dst) { - const int32_t * opts = (const int32_t *)dst->op_params; - enum lm_ggml_op_pool op = opts[0]; - const int k0 = opts[1]; - const int k1 = opts[2]; - const int s0 = opts[3]; - const int s1 = opts[4]; - const int p0 = opts[5]; - const int p1 = opts[6]; - const char * cdata = (const char*)src->data; - const char * const data_end = cdata + lm_ggml_nbytes(src); + const struct lm_ggml_tensor * src0 = dst->src[0]; + const struct lm_ggml_tensor * src1 = dst->src[1]; - const int64_t px = dst->ne[0]; - const int64_t py = dst->ne[1]; - const int64_t pa = px * py; + LM_GGML_ASSERT(src0->type == LM_GGML_TYPE_F16); + LM_GGML_ASSERT(src1->type == LM_GGML_TYPE_F32); + LM_GGML_ASSERT( dst->type == LM_GGML_TYPE_F32); - float * dplane = (float *)dst->data; + int64_t t0 = lm_ggml_perf_time_us(); + UNUSED(t0); - const int ka 
= k0 * k1; - const int offset0 = -p0; - const int offset1 = -p1; + LM_GGML_TENSOR_BINARY_OP_LOCALS; - while (cdata < data_end) { - for (int oy = 0; oy < py; ++oy) { - float * const drow = dplane + oy * px; - for (int ox = 0; ox < px; ++ox) { - float * const out = drow + ox; - switch (op) { - case LM_GGML_OP_POOL_AVG: *out = 0; break; - case LM_GGML_OP_POOL_MAX: *out = -FLT_MAX; break; - case LM_GGML_OP_POOL_COUNT: LM_GGML_ASSERT(false); break; - } + const int32_t s0 = ((const int32_t *)(dst->op_params))[0]; + const int32_t s1 = ((const int32_t *)(dst->op_params))[1]; + const int32_t p0 = ((const int32_t *)(dst->op_params))[2]; + const int32_t p1 = ((const int32_t *)(dst->op_params))[3]; + const int32_t d0 = ((const int32_t *)(dst->op_params))[4]; + const int32_t d1 = ((const int32_t *)(dst->op_params))[5]; + const bool is_2D = ((const int32_t *)(dst->op_params))[6] == 1; - const int ix = offset0 + ox * s0; - const int iy = offset1 + oy * s1; + const int ith = params->ith; + const int nth = params->nth; - for (int ky = 0; ky < k1; ++ky) { - if (iy + ky < 0 || iy + ky >= src->ne[1]) continue; - const float * const srow = (const float *)(cdata + src->nb[1] * (iy + ky)); - for (int kx = 0; kx < k0; ++kx) { - int j = ix + kx; - if (j < 0 || j >= src->ne[0]) continue; - switch (op) { - case LM_GGML_OP_POOL_AVG: *out += srow[j]; break; - case LM_GGML_OP_POOL_MAX: if (srow[j] > *out) *out = srow[j]; break; - case LM_GGML_OP_POOL_COUNT: LM_GGML_ASSERT(false); break; - } - } - } - switch (op) { - case LM_GGML_OP_POOL_AVG: *out /= ka; break; - case LM_GGML_OP_POOL_MAX: break; - case LM_GGML_OP_POOL_COUNT: LM_GGML_ASSERT(false); break; - } - } - } + const int64_t N = is_2D ? ne13 : ne12; + const int64_t IC = is_2D ? ne12 : ne11; + const int64_t IH = is_2D ? ne11 : 1; + const int64_t IW = ne10; - cdata += src->nb[2]; - dplane += pa; - } -} + const int64_t KH = is_2D ? ne01 : 1; + const int64_t KW = ne00; -// lm_ggml_compute_forward_upscale + const int64_t OH = is_2D ? 
ne2 : 1; + const int64_t OW = ne1; -static void lm_ggml_compute_forward_upscale_f32( - const struct lm_ggml_compute_params * params, - struct lm_ggml_tensor * dst) { + int ofs0 = is_2D ? nb13 : nb12; + int ofs1 = is_2D ? nb12 : nb11; - const struct lm_ggml_tensor * src0 = dst->src[0]; + LM_GGML_ASSERT(nb00 == sizeof(lm_ggml_fp16_t)); + LM_GGML_ASSERT(nb10 == sizeof(float)); - if (params->type == LM_GGML_TASK_TYPE_INIT || params->type == LM_GGML_TASK_TYPE_FINALIZE) { + if (params->type == LM_GGML_TASK_TYPE_INIT) { return; } - LM_GGML_ASSERT(src0->nb[0] == sizeof(float)); - - const int ith = params->ith; - const int nth = params->nth; - - LM_GGML_TENSOR_UNARY_OP_LOCALS + if (params->type == LM_GGML_TASK_TYPE_FINALIZE) { + return; + } - const int scale_factor = dst->op_params[0]; + // im2col: [N, IC, IH, IW] => [N, OH, OW, IC*KH*KW] + { + float * const wdata = (float *) dst->data; - // TODO: optimize + for (int64_t in = 0; in < N; in++) { + for (int64_t ioh = 0; ioh < OH; ioh++) { // 1 + for (int64_t iow = 0; iow < OW; iow++) { + for (int64_t iic = ith; iic < IC; iic += nth) { - for (int64_t i3 = 0; i3 < ne3; i3++) { - const int64_t i03 = i3; - for (int64_t i2 = ith; i2 < ne2; i2 += nth) { - const int64_t i02 = i2; - for (int64_t i1 = 0; i1 < ne1; i1++) { - const int64_t i01 = i1 / scale_factor; - for (int64_t i0 = 0; i0 < ne0; i0++) { - const int64_t i00 = i0 / scale_factor; + // micro kernel + float * dst_data = wdata + (in*OH*OW + ioh*OW + iow)*(IC*KH*KW); // [IC, KH, KW] + const float * const src_data = (float *)((char *) src1->data + in*ofs0 + iic*ofs1); // [IH, IW] - const float * x = (float *)((char *) src0->data + i00*nb00 + i01*nb01 + i02*nb02 + i03*nb03); - float * y = (float *)((char *) dst->data + i0*nb0 + i1*nb1 + i2*nb2 + i3*nb3); + for (int64_t ikh = 0; ikh < KH; ikh++) { // 1 + for (int64_t ikw = 0; ikw < KW; ikw++) { + const int64_t iiw = iow*s0 + ikw*d0 - p0; + const int64_t iih = ioh*s1 + ikh*d1 - p1; - *y = *x; + if (iih < 0 || iih >= IH || iiw < 0 
|| iiw >= IW) { + dst_data[iic*(KH*KW) + ikh*KW + ikw] = 0; + } else { + dst_data[iic*(KH*KW) + ikh*KW + ikw] = (src_data[iih*IW + iiw]); + } + } + } + } } } } } } -static void lm_ggml_compute_forward_upscale( - const struct lm_ggml_compute_params * params, - struct lm_ggml_tensor * dst) { - - const struct lm_ggml_tensor * src0 = dst->src[0]; - switch (src0->type) { - case LM_GGML_TYPE_F32: - { - lm_ggml_compute_forward_upscale_f32(params, dst); - } break; - default: - { - LM_GGML_ASSERT(false); - } break; - } -} +// src0: kernel [OC, IC, KH, KW] +// src1: image [N, IC, IH, IW] +// dst: result [N, OH, OW, IC*KH*KW] +static void lm_ggml_compute_forward_im2col_f16( + const struct lm_ggml_compute_params * params, + struct lm_ggml_tensor * dst) { -// lm_ggml_compute_forward_pad + const struct lm_ggml_tensor * src0 = dst->src[0]; + const struct lm_ggml_tensor * src1 = dst->src[1]; -static void lm_ggml_compute_forward_pad_f32( - const struct lm_ggml_compute_params * params, - struct lm_ggml_tensor * dst) { + LM_GGML_ASSERT(src0->type == LM_GGML_TYPE_F16); + LM_GGML_ASSERT(src1->type == LM_GGML_TYPE_F32); + LM_GGML_ASSERT( dst->type == LM_GGML_TYPE_F16); - const struct lm_ggml_tensor * src0 = dst->src[0]; + int64_t t0 = lm_ggml_perf_time_us(); + UNUSED(t0); - if (params->type == LM_GGML_TASK_TYPE_INIT || params->type == LM_GGML_TASK_TYPE_FINALIZE) { - return; - } + LM_GGML_TENSOR_BINARY_OP_LOCALS; - LM_GGML_ASSERT(src0->nb[0] == sizeof(float)); - LM_GGML_ASSERT( dst->nb[0] == sizeof(float)); + const int32_t s0 = ((const int32_t *)(dst->op_params))[0]; + const int32_t s1 = ((const int32_t *)(dst->op_params))[1]; + const int32_t p0 = ((const int32_t *)(dst->op_params))[2]; + const int32_t p1 = ((const int32_t *)(dst->op_params))[3]; + const int32_t d0 = ((const int32_t *)(dst->op_params))[4]; + const int32_t d1 = ((const int32_t *)(dst->op_params))[5]; + const bool is_2D = ((const int32_t *)(dst->op_params))[6] == 1; const int ith = params->ith; const int nth = params->nth; 
- LM_GGML_TENSOR_UNARY_OP_LOCALS + const int64_t N = is_2D ? ne13 : ne12; + const int64_t IC = is_2D ? ne12 : ne11; + const int64_t IH = is_2D ? ne11 : 1; + const int64_t IW = ne10; - float * dst_ptr = (float *) dst->data; + const int64_t KH = is_2D ? ne01 : 1; + const int64_t KW = ne00; - // TODO: optimize + const int64_t OH = is_2D ? ne2 : 1; + const int64_t OW = ne1; - for (int64_t i2 = 0; i2 < ne2; ++i2) { - for (int64_t i1 = ith; i1 < ne1; i1 += nth) { - for (int64_t i0 = 0; i0 < ne0; ++i0) { - for (int64_t i3 = 0; i3 < ne3; ++i3) { - const int64_t dst_idx = i3*(ne0*ne1*ne2) + i2*(ne0*ne1) + i1*ne0 + i0; + int ofs0 = is_2D ? nb13 : nb12; + int ofs1 = is_2D ? nb12 : nb11; - const float * src_ptr = (const float *)((char *) src0->data + i3*nb03 + i2*nb02 + i1*nb01 + i0*nb00); + LM_GGML_ASSERT(nb00 == sizeof(lm_ggml_fp16_t)); + LM_GGML_ASSERT(nb10 == sizeof(float)); - if (i0 < ne00 && i1 < ne01 && i2 < ne02 && i3 < ne03) { - dst_ptr[dst_idx] = *src_ptr; - } else { - dst_ptr[dst_idx] = 0; + if (params->type == LM_GGML_TASK_TYPE_INIT) { + return; + } + + if (params->type == LM_GGML_TASK_TYPE_FINALIZE) { + return; + } + + // im2col: [N, IC, IH, IW] => [N, OH, OW, IC*KH*KW] + { + lm_ggml_fp16_t * const wdata = (lm_ggml_fp16_t *) dst->data; + + for (int64_t in = 0; in < N; in++) { + for (int64_t ioh = 0; ioh < OH; ioh++) { // 1 + for (int64_t iow = 0; iow < OW; iow++) { + for (int64_t iic = ith; iic < IC; iic += nth) { + + // micro kernel + lm_ggml_fp16_t * dst_data = wdata + (in*OH*OW + ioh*OW + iow)*(IC*KH*KW); // [IC, KH, KW] + const float * const src_data = (float *)((char *) src1->data + in*ofs0 + iic*ofs1); // [IH, IW] + + for (int64_t ikh = 0; ikh < KH; ikh++) { // 1 + for (int64_t ikw = 0; ikw < KW; ikw++) { + const int64_t iiw = iow*s0 + ikw*d0 - p0; + const int64_t iih = ioh*s1 + ikh*d1 - p1; + + if (iih < 0 || iih >= IH || iiw < 0 || iiw >= IW) { + dst_data[iic*(KH*KW) + ikh*KW + ikw] = 0; + } else { + dst_data[iic*(KH*KW) + ikh*KW + ikw] = 
LM_GGML_FP32_TO_FP16(src_data[iih*IW + iiw]); + } + } + } } } } @@ -14084,16 +15098,17 @@ static void lm_ggml_compute_forward_pad_f32( } } -static void lm_ggml_compute_forward_pad( - const struct lm_ggml_compute_params * params, - struct lm_ggml_tensor * dst) { - - const struct lm_ggml_tensor * src0 = dst->src[0]; - - switch (src0->type) { +static void lm_ggml_compute_forward_im2col( + const struct lm_ggml_compute_params * params, + struct lm_ggml_tensor * dst) { + switch (dst->type) { + case LM_GGML_TYPE_F16: + { + lm_ggml_compute_forward_im2col_f16(params, dst); + } break; case LM_GGML_TYPE_F32: { - lm_ggml_compute_forward_pad_f32(params, dst); + lm_ggml_compute_forward_im2col_f32(params, dst); } break; default: { @@ -14103,605 +15118,540 @@ static void lm_ggml_compute_forward_pad( } -// lm_ggml_compute_forward_arange +// lm_ggml_compute_forward_conv_transpose_2d -static void lm_ggml_compute_forward_arange_f32( - const struct lm_ggml_compute_params * params, - struct lm_ggml_tensor * dst) { +static void lm_ggml_compute_forward_conv_transpose_2d( + const struct lm_ggml_compute_params * params, + struct lm_ggml_tensor * dst) { - if (params->type == LM_GGML_TASK_TYPE_INIT || params->type == LM_GGML_TASK_TYPE_FINALIZE) { - return; - } + const struct lm_ggml_tensor * src0 = dst->src[0]; + const struct lm_ggml_tensor * src1 = dst->src[1]; - LM_GGML_ASSERT(dst->nb[0] == sizeof(float)); + LM_GGML_ASSERT(src0->type == LM_GGML_TYPE_F16); + LM_GGML_ASSERT(src1->type == LM_GGML_TYPE_F32); + LM_GGML_ASSERT( dst->type == LM_GGML_TYPE_F32); + + int64_t t0 = lm_ggml_perf_time_us(); + UNUSED(t0); + + LM_GGML_TENSOR_BINARY_OP_LOCALS const int ith = params->ith; const int nth = params->nth; - const float start = lm_ggml_get_op_params_f32(dst, 0); - const float stop = lm_ggml_get_op_params_f32(dst, 1); - const float step = lm_ggml_get_op_params_f32(dst, 2); + const int nk = ne00*ne01*ne02*ne03; - const int64_t steps = (int64_t) ceilf((stop - start) / step); + LM_GGML_ASSERT(nb00 == 
sizeof(lm_ggml_fp16_t)); + LM_GGML_ASSERT(nb10 == sizeof(float)); - LM_GGML_ASSERT(lm_ggml_nelements(dst) == steps); + if (params->type == LM_GGML_TASK_TYPE_INIT) { + if (ith != 0) { + return; + } + memset(params->wdata, 0, params->wsize); - for (int64_t i = ith; i < steps; i+= nth) { - float value = start + step * i; - ((float *)dst->data)[i] = value; - } -} + // permute kernel data (src0) from (Kw x Kh x Cout x Cin) to (Cin x Kw x Kh x Cout) + { + lm_ggml_fp16_t * const wdata = (lm_ggml_fp16_t *) params->wdata + 0; -static void lm_ggml_compute_forward_arange( - const struct lm_ggml_compute_params * params, - struct lm_ggml_tensor * dst) { - switch (dst->type) { - case LM_GGML_TYPE_F32: - { - lm_ggml_compute_forward_arange_f32(params, dst); - } break; - default: - { - LM_GGML_ASSERT(false); - } break; - } -} + for (int64_t i03 = 0; i03 < ne03; i03++) { + for (int64_t i02 = 0; i02 < ne02; i02++) { + const lm_ggml_fp16_t * const src = (lm_ggml_fp16_t *)((char *) src0->data + i03*nb03 + i02*nb02); + lm_ggml_fp16_t * dst_data = wdata + i02*ne01*ne00*ne03; + for (int64_t i01 = 0; i01 < ne01; i01++) { + for (int64_t i00 = 0; i00 < ne00; i00++) { + dst_data[i01*ne00*ne03 + i00*ne03 + i03] = src[i01 * ne00 + i00]; + } + } + } + } + } -static void lm_ggml_compute_forward_timestep_embedding_f32( - const struct lm_ggml_compute_params * params, - struct lm_ggml_tensor * dst) { + // permute source data (src1) from (Sw x Sh x Cin) to (Cin x Sw x Sh) + { + lm_ggml_fp16_t * const wdata = (lm_ggml_fp16_t *) params->wdata + nk; + for (int i12 = 0; i12 < ne12; i12++) { + for (int i11 = 0; i11 < ne11; i11++) { + const float * const src = (float *)((char *) src1->data + i12*nb12 + i11*nb11); + lm_ggml_fp16_t * dst_data = wdata + i11*ne10*ne12; + for (int i10 = 0; i10 < ne10; i10++) { + dst_data[i10*ne12 + i12] = LM_GGML_FP32_TO_FP16(src[i10]); + } + } + } + } + + memset(dst->data, 0, lm_ggml_nbytes(dst)); - if (params->type == LM_GGML_TASK_TYPE_INIT || params->type == 
LM_GGML_TASK_TYPE_FINALIZE) { return; } - const struct lm_ggml_tensor * src0 = dst->src[0]; + if (params->type == LM_GGML_TASK_TYPE_FINALIZE) { + return; + } - LM_GGML_ASSERT(src0->nb[0] == sizeof(float)); + const int32_t stride = lm_ggml_get_op_params_i32(dst, 0); - const int ith = params->ith; - const int nth = params->nth; + // total patches in dst + const int np = ne2; - LM_GGML_TENSOR_UNARY_OP_LOCALS + // patches per thread + const int dp = (np + nth - 1)/nth; - const int dim = lm_ggml_get_op_params_i32(dst, 0); - const int max_period = lm_ggml_get_op_params_i32(dst, 1); + // patch range for this thread + const int ip0 = dp*ith; + const int ip1 = MIN(ip0 + dp, np); - int half = dim / 2; + lm_ggml_fp16_t * const wdata = (lm_ggml_fp16_t *) params->wdata + 0; + lm_ggml_fp16_t * const wdata_src = wdata + nk; - for (int64_t i = 0; i < ne00; i++) { - float * embed_data = (float *)((char *) dst->data + i*nb1); - for (int64_t j = ith; j < half; j += nth) { - float timestep = ((float *)src0->data)[i]; - float freq = (float)expf(-logf(max_period) * j / half); - float arg = timestep * freq; - embed_data[j] = cosf(arg); - embed_data[j + half] = sinf(arg); - } - if (dim % 2 != 0 && ith == 0) { - embed_data[dim] = 0.f; + for (int i2 = ip0; i2 < ip1; i2++) { // Cout + float * dst_data = (float *)((char *) dst->data + i2*nb2); + lm_ggml_fp16_t * wdata_kernel = wdata + i2*ne01*ne00*ne03; + for (int i11 = 0; i11 < ne11; i11++) { + for (int i10 = 0; i10 < ne10; i10++) { + const int i1n = i11*ne10*ne12 + i10*ne12; + for (int i01 = 0; i01 < ne01; i01++) { + for (int i00 = 0; i00 < ne00; i00++) { + float v = 0; + lm_ggml_vec_dot_f16(ne03, &v, 0, + wdata_src + i1n, 0, + wdata_kernel + i01*ne00*ne03 + i00*ne03, 0, 1); + dst_data[(i11*stride + i01)*ne0 + i10*stride + i00] += v; + } + } + } } } } -static void lm_ggml_compute_forward_timestep_embedding( - const struct lm_ggml_compute_params * params, - struct lm_ggml_tensor * dst) { - - const struct lm_ggml_tensor * src0 = dst->src[0]; 
- - switch (src0->type) { - case LM_GGML_TYPE_F32: - { - lm_ggml_compute_forward_timestep_embedding_f32(params, dst); - } break; - default: - { - LM_GGML_ASSERT(false); - } break; - } -} +// lm_ggml_compute_forward_pool_1d_sk_p0 -// lm_ggml_compute_forward_argsort +static void lm_ggml_compute_forward_pool_1d_sk_p0( + const struct lm_ggml_compute_params * params, + const enum lm_ggml_op_pool op, + const int k, + struct lm_ggml_tensor * dst) { -static void lm_ggml_compute_forward_argsort_f32( - const struct lm_ggml_compute_params * params, - struct lm_ggml_tensor * dst) { + const struct lm_ggml_tensor * src = dst->src[0]; - const struct lm_ggml_tensor * src0 = dst->src[0]; + assert(src->type == LM_GGML_TYPE_F32); + assert(params->ith == 0); if (params->type == LM_GGML_TASK_TYPE_INIT || params->type == LM_GGML_TASK_TYPE_FINALIZE) { return; } - LM_GGML_TENSOR_UNARY_OP_LOCALS - - LM_GGML_ASSERT(nb0 == sizeof(float)); - - const int ith = params->ith; - const int nth = params->nth; - - const int64_t nr = lm_ggml_nrows(src0); + const char * cdata = (const char *)src->data; + const char * const data_end = cdata + lm_ggml_nbytes(src); + float * drow = (float *)dst->data; - enum lm_ggml_sort_order order = (enum lm_ggml_sort_order) lm_ggml_get_op_params_i32(dst, 0); + const int64_t rs = dst->ne[0]; - for (int64_t i = ith; i < nr; i += nth) { - int32_t * dst_data = (int32_t *)((char *) dst->data + i*nb1); - const float * src_data = (float *)((char *) src0->data + i*nb01); + while (cdata < data_end) { + const float * const srow = (const float *)cdata; - for (int64_t j = 0; j < ne0; j++) { - dst_data[j] = j; - } + int j = 0; - // C doesn't have a functional sort, so we do a bubble sort instead - for (int64_t j = 0; j < ne0; j++) { - for (int64_t k = j + 1; k < ne0; k++) { - if ((order == LM_GGML_SORT_ORDER_ASC && src_data[dst_data[j]] > src_data[dst_data[k]]) || - (order == LM_GGML_SORT_ORDER_DESC && src_data[dst_data[j]] < src_data[dst_data[k]])) { - int32_t tmp = dst_data[j]; - 
dst_data[j] = dst_data[k]; - dst_data[k] = tmp; + for (int64_t i = 0; i < rs; ++i) { + switch (op) { + case LM_GGML_OP_POOL_AVG: drow[i] = 0; break; + case LM_GGML_OP_POOL_MAX: drow[i] = -FLT_MAX; break; + case LM_GGML_OP_POOL_COUNT: LM_GGML_ASSERT(false); break; + } + for (int ki = 0; ki < k; ++ki) { + switch (op) { + case LM_GGML_OP_POOL_AVG: drow[i] += srow[j]; break; + case LM_GGML_OP_POOL_MAX: if (srow[j] > drow[i]) drow[i] = srow[j]; break; + case LM_GGML_OP_POOL_COUNT: LM_GGML_ASSERT(false); break; } + ++j; + } + switch (op) { + case LM_GGML_OP_POOL_AVG: drow[i] /= k; break; + case LM_GGML_OP_POOL_MAX: break; + case LM_GGML_OP_POOL_COUNT: LM_GGML_ASSERT(false); break; } } + + cdata += src->nb[1]; + drow += rs; } } -static void lm_ggml_compute_forward_argsort( - const struct lm_ggml_compute_params * params, - struct lm_ggml_tensor * dst) { +// lm_ggml_compute_forward_pool_1d - const struct lm_ggml_tensor * src0 = dst->src[0]; +static void lm_ggml_compute_forward_pool_1d( + const struct lm_ggml_compute_params * params, + struct lm_ggml_tensor * dst) { - switch (src0->type) { - case LM_GGML_TYPE_F32: - { - lm_ggml_compute_forward_argsort_f32(params, dst); - } break; - default: - { - LM_GGML_ASSERT(false); - } break; - } + const int32_t * opts = (const int32_t *)dst->op_params; + enum lm_ggml_op_pool op = opts[0]; + const int k0 = opts[1]; + const int s0 = opts[2]; + const int p0 = opts[3]; + LM_GGML_ASSERT(p0 == 0); // padding not supported + LM_GGML_ASSERT(k0 == s0); // only s = k supported + + lm_ggml_compute_forward_pool_1d_sk_p0(params, op, k0, dst); } -// lm_ggml_compute_forward_flash_attn +// lm_ggml_compute_forward_pool_2d -static void lm_ggml_compute_forward_flash_attn_f32( +static void lm_ggml_compute_forward_pool_2d( const struct lm_ggml_compute_params * params, - const bool masked, struct lm_ggml_tensor * dst) { - const struct lm_ggml_tensor * q = dst->src[0]; - const struct lm_ggml_tensor * k = dst->src[1]; - const struct lm_ggml_tensor * v = 
dst->src[2]; - - int64_t t0 = lm_ggml_perf_time_us(); - UNUSED(t0); + const struct lm_ggml_tensor * src = dst->src[0]; - LM_GGML_TENSOR_LOCALS(int64_t, neq, q, ne) - LM_GGML_TENSOR_LOCALS(size_t, nbq, q, nb) - LM_GGML_TENSOR_LOCALS(int64_t, nek, k, ne) - LM_GGML_TENSOR_LOCALS(size_t, nbk, k, nb) - LM_GGML_TENSOR_LOCALS(int64_t, nev, v, ne) - LM_GGML_TENSOR_LOCALS(size_t, nbv, v, nb) - LM_GGML_TENSOR_LOCALS(int64_t, ne, dst, ne) - LM_GGML_TENSOR_LOCALS(size_t, nb, dst, nb) + LM_GGML_ASSERT(src->type == LM_GGML_TYPE_F32); + LM_GGML_ASSERT(params->ith == 0); - const int ith = params->ith; - const int nth = params->nth; + if (params->type == LM_GGML_TASK_TYPE_INIT || params->type == LM_GGML_TASK_TYPE_FINALIZE) { + return; + } - const int64_t D = neq0; - const int64_t N = neq1; - const int64_t P = nek1 - N; - const int64_t M = P + N; + const int32_t * opts = (const int32_t *)dst->op_params; + enum lm_ggml_op_pool op = opts[0]; + const int k0 = opts[1]; + const int k1 = opts[2]; + const int s0 = opts[3]; + const int s1 = opts[4]; + const int p0 = opts[5]; + const int p1 = opts[6]; + const char * cdata = (const char*)src->data; + const char * const data_end = cdata + lm_ggml_nbytes(src); - const int Mup = lm_ggml_up(M, LM_GGML_SOFT_MAX_UNROLL); + const int64_t px = dst->ne[0]; + const int64_t py = dst->ne[1]; + const int64_t pa = px * py; - LM_GGML_ASSERT(ne0 == D); - LM_GGML_ASSERT(ne1 == N); - LM_GGML_ASSERT(P >= 0); + float * dplane = (float *)dst->data; - LM_GGML_ASSERT(nbq0 == sizeof(float)); - LM_GGML_ASSERT(nbk0 == sizeof(float)); - LM_GGML_ASSERT(nbv0 == sizeof(float)); + const int ka = k0 * k1; + const int offset0 = -p0; + const int offset1 = -p1; - LM_GGML_ASSERT(neq0 == D); - LM_GGML_ASSERT(nek0 == D); - LM_GGML_ASSERT(nev1 == D); + while (cdata < data_end) { + for (int oy = 0; oy < py; ++oy) { + float * const drow = dplane + oy * px; + for (int ox = 0; ox < px; ++ox) { + float * const out = drow + ox; + switch (op) { + case LM_GGML_OP_POOL_AVG: *out = 0; 
break; + case LM_GGML_OP_POOL_MAX: *out = -FLT_MAX; break; + case LM_GGML_OP_POOL_COUNT: LM_GGML_ASSERT(false); break; + } - LM_GGML_ASSERT(neq1 == N); - LM_GGML_ASSERT(nek1 == N + P); - LM_GGML_ASSERT(nev1 == D); + const int ix = offset0 + ox * s0; + const int iy = offset1 + oy * s1; - // dst cannot be transposed or permuted - LM_GGML_ASSERT(nb0 == sizeof(float)); - LM_GGML_ASSERT(nb0 <= nb1); - LM_GGML_ASSERT(nb1 <= nb2); - LM_GGML_ASSERT(nb2 <= nb3); + for (int ky = 0; ky < k1; ++ky) { + if (iy + ky < 0 || iy + ky >= src->ne[1]) continue; + const float * const srow = (const float *)(cdata + src->nb[1] * (iy + ky)); + for (int kx = 0; kx < k0; ++kx) { + int j = ix + kx; + if (j < 0 || j >= src->ne[0]) continue; + switch (op) { + case LM_GGML_OP_POOL_AVG: *out += srow[j]; break; + case LM_GGML_OP_POOL_MAX: if (srow[j] > *out) *out = srow[j]; break; + case LM_GGML_OP_POOL_COUNT: LM_GGML_ASSERT(false); break; + } + } + } + switch (op) { + case LM_GGML_OP_POOL_AVG: *out /= ka; break; + case LM_GGML_OP_POOL_MAX: break; + case LM_GGML_OP_POOL_COUNT: LM_GGML_ASSERT(false); break; + } + } + } - if (params->type == LM_GGML_TASK_TYPE_INIT) { - return; + cdata += src->nb[2]; + dplane += pa; } +} - if (params->type == LM_GGML_TASK_TYPE_FINALIZE) { +// lm_ggml_compute_forward_upscale + +static void lm_ggml_compute_forward_upscale_f32( + const struct lm_ggml_compute_params * params, + struct lm_ggml_tensor * dst) { + + const struct lm_ggml_tensor * src0 = dst->src[0]; + + if (params->type == LM_GGML_TASK_TYPE_INIT || params->type == LM_GGML_TASK_TYPE_FINALIZE) { return; } - // parallelize by q rows using lm_ggml_vec_dot_f32 + LM_GGML_ASSERT(src0->type == LM_GGML_TYPE_F32); - // total rows in q - const int nr = neq1*neq2*neq3; + const int ith = params->ith; + const int nth = params->nth; - // rows per thread - const int dr = (nr + nth - 1)/nth; + LM_GGML_TENSOR_UNARY_OP_LOCALS - // row range for this thread - const int ir0 = dr*ith; - const int ir1 = MIN(ir0 + dr, nr); + const 
float sf0 = (float)ne0/src0->ne[0]; + const float sf1 = (float)ne1/src0->ne[1]; + const float sf2 = (float)ne2/src0->ne[2]; + const float sf3 = (float)ne3/src0->ne[3]; - const float scale = 1.0f/sqrtf(D); + // TODO: optimize - //printf("P=%d N=%d D=%d ir0=%d ir1=%d scale = %f\n", P, N, D, ir0, ir1, scale); + for (int64_t i3 = 0; i3 < ne3; i3++) { + const int64_t i03 = i3 / sf3; + for (int64_t i2 = ith; i2 < ne2; i2 += nth) { + const int64_t i02 = i2 / sf2; + for (int64_t i1 = 0; i1 < ne1; i1++) { + const int64_t i01 = i1 / sf1; + for (int64_t i0 = 0; i0 < ne0; i0++) { + const int64_t i00 = i0 / sf0; - for (int ir = ir0; ir < ir1; ++ir) { - // q indices - const int iq3 = ir/(neq2*neq1); - const int iq2 = (ir - iq3*neq2*neq1)/neq1; - const int iq1 = (ir - iq3*neq2*neq1 - iq2*neq1); + const float * x = (float *)((char *) src0->data + i00*nb00 + i01*nb01 + i02*nb02 + i03*nb03); + float * y = (float *)((char *) dst->data + i0*nb0 + i1*nb1 + i2*nb2 + i3*nb3); + + *y = *x; + } + } + } + } +} + +static void lm_ggml_compute_forward_upscale( + const struct lm_ggml_compute_params * params, + struct lm_ggml_tensor * dst) { - float * S = (float *) params->wdata + ith*(Mup + CACHE_LINE_SIZE_F32); + const struct lm_ggml_tensor * src0 = dst->src[0]; - for (int i = M; i < Mup; ++i) { - S[i] = -INFINITY; - } + switch (src0->type) { + case LM_GGML_TYPE_F32: + { + lm_ggml_compute_forward_upscale_f32(params, dst); + } break; + default: + { + LM_GGML_ASSERT(false); + } break; + } +} - const int64_t masked_begin = masked ? 
(P + iq1 + 1) : M; - for (int64_t ic = 0; ic < masked_begin; ++ic) { - // k indices - const int ik3 = iq3; - const int ik2 = iq2 % nek2; - const int ik1 = ic; - // S indices - const int i1 = ik1; +// lm_ggml_compute_forward_pad - lm_ggml_vec_dot_f32(neq0, - S + i1, 0, - (float *) ((char *) k->data + (ik1*nbk1 + ik2*nbk2 + ik3*nbk3)), 0, - (float *) ((char *) q->data + (iq1*nbq1 + iq2*nbq2 + iq3*nbq3)), 0, 1); - } +static void lm_ggml_compute_forward_pad_f32( + const struct lm_ggml_compute_params * params, + struct lm_ggml_tensor * dst) { - // scale - lm_ggml_vec_scale_f32(masked_begin, S, scale); + const struct lm_ggml_tensor * src0 = dst->src[0]; - for (int64_t i = masked_begin; i < M; i++) { - S[i] = -INFINITY; - } + if (params->type == LM_GGML_TASK_TYPE_INIT || params->type == LM_GGML_TASK_TYPE_FINALIZE) { + return; + } - // softmax - // exclude known -INF S[..] values from max and loop - // dont forget to set their SW values to zero - { - float max = -INFINITY; - lm_ggml_vec_max_f32(masked_begin, &max, S); + LM_GGML_ASSERT(src0->nb[0] == sizeof(float)); + LM_GGML_ASSERT( dst->nb[0] == sizeof(float)); - lm_ggml_float sum = 0.0; - { -#ifdef LM_GGML_SOFT_MAX_ACCELERATE - max = -max; - vDSP_vsadd(S, 1, &max, S, 1, Mup); - vvexpf(S, S, &Mup); - lm_ggml_vec_sum_f32(Mup, &sum, S); -#else - uint16_t scvt[LM_GGML_SOFT_MAX_UNROLL]; UNUSED(scvt); - lm_ggml_float sump[LM_GGML_SOFT_MAX_UNROLL] = { 0.0 }; + const int ith = params->ith; + const int nth = params->nth; - for (int i = 0; i < Mup; i += LM_GGML_SOFT_MAX_UNROLL) { - if (i >= masked_begin) { - break; - } - float * SS = S + i; + LM_GGML_TENSOR_UNARY_OP_LOCALS - for (int j = 0; j < LM_GGML_SOFT_MAX_UNROLL; ++j) { - if (i + j >= masked_begin) { - break; - } else if (SS[j] == -INFINITY) { - SS[j] = 0.0f; - } else { -#ifndef LM_GGML_FLASH_ATTN_EXP_FP16 - const float val = expf(SS[j] - max); -#else - lm_ggml_fp16_t s = LM_GGML_FP32_TO_FP16(SS[j] - max); - memcpy(&scvt[j], &s, sizeof(uint16_t)); - const float val = 
LM_GGML_FP16_TO_FP32(lm_ggml_table_exp_f16[scvt[j]]); -#endif - sump[j] += (lm_ggml_float)val; - SS[j] = val; - } - } - } + float * dst_ptr = (float *) dst->data; - for (int i = 0; i < LM_GGML_SOFT_MAX_UNROLL; i++) { - sum += sump[i]; - } -#endif - } + // TODO: optimize - assert(sum > 0.0); + for (int64_t i2 = 0; i2 < ne2; ++i2) { + for (int64_t i1 = ith; i1 < ne1; i1 += nth) { + for (int64_t i0 = 0; i0 < ne0; ++i0) { + for (int64_t i3 = 0; i3 < ne3; ++i3) { + const int64_t dst_idx = i3*(ne0*ne1*ne2) + i2*(ne0*ne1) + i1*ne0 + i0; - sum = 1.0/sum; - lm_ggml_vec_scale_f32(masked_begin, S, sum); + const float * src_ptr = (const float *)((char *) src0->data + i3*nb03 + i2*nb02 + i1*nb01 + i0*nb00); -#ifndef NDEBUG - for (int i = 0; i < masked_begin; ++i) { - assert(!isnan(S[i])); - assert(!isinf(S[i])); + if (i0 < ne00 && i1 < ne01 && i2 < ne02 && i3 < ne03) { + dst_ptr[dst_idx] = *src_ptr; + } else { + dst_ptr[dst_idx] = 0; + } + } } -#endif } + } +} - for (int64_t ic = 0; ic < nev1; ++ic) { - // dst indices - const int i1 = iq1; - const int i2 = iq2; - const int i3 = iq3; +static void lm_ggml_compute_forward_pad( + const struct lm_ggml_compute_params * params, + struct lm_ggml_tensor * dst) { - // v indices - const int iv2 = iq2 % nev2; - const int iv3 = iq3; + const struct lm_ggml_tensor * src0 = dst->src[0]; - lm_ggml_vec_dot_f32(masked_begin, - (float *) ((char *) dst->data + (ic*nb0 + i1*nb1 + i2*nb2 + i3*nb3)), 0, - (float *) ((char *) v->data + ( ic*nbv1 + iv2*nbv2 + iv3*nbv3)), 0, - S, 0, 1); - } + switch (src0->type) { + case LM_GGML_TYPE_F32: + { + lm_ggml_compute_forward_pad_f32(params, dst); + } break; + default: + { + LM_GGML_ASSERT(false); + } break; } } -static void lm_ggml_compute_forward_flash_attn_f16( - const struct lm_ggml_compute_params * params, - const bool masked, - struct lm_ggml_tensor * dst) { - const struct lm_ggml_tensor * q = dst->src[0]; - const struct lm_ggml_tensor * k = dst->src[1]; - const struct lm_ggml_tensor * v = dst->src[2]; +// 
lm_ggml_compute_forward_arange - int64_t t0 = lm_ggml_perf_time_us(); - UNUSED(t0); +static void lm_ggml_compute_forward_arange_f32( + const struct lm_ggml_compute_params * params, + struct lm_ggml_tensor * dst) { - LM_GGML_TENSOR_LOCALS(int64_t, neq, q, ne) - LM_GGML_TENSOR_LOCALS(size_t, nbq, q, nb) - LM_GGML_TENSOR_LOCALS(int64_t, nek, k, ne) - LM_GGML_TENSOR_LOCALS(size_t, nbk, k, nb) - LM_GGML_TENSOR_LOCALS(int64_t, nev, v, ne) - LM_GGML_TENSOR_LOCALS(size_t, nbv, v, nb) - LM_GGML_TENSOR_LOCALS(int64_t, ne, dst, ne) - LM_GGML_TENSOR_LOCALS(size_t, nb, dst, nb) + if (params->type == LM_GGML_TASK_TYPE_INIT || params->type == LM_GGML_TASK_TYPE_FINALIZE) { + return; + } + + LM_GGML_ASSERT(dst->nb[0] == sizeof(float)); const int ith = params->ith; const int nth = params->nth; - const int64_t D = neq0; - const int64_t N = neq1; - const int64_t P = nek1 - N; - const int64_t M = P + N; - - const int Mup = lm_ggml_up(M, LM_GGML_SOFT_MAX_UNROLL); - - LM_GGML_ASSERT(ne0 == D); - LM_GGML_ASSERT(ne1 == N); - LM_GGML_ASSERT(P >= 0); - - LM_GGML_ASSERT(nbq0 == sizeof(lm_ggml_fp16_t)); - LM_GGML_ASSERT(nbk0 == sizeof(lm_ggml_fp16_t)); - LM_GGML_ASSERT(nbv0 == sizeof(lm_ggml_fp16_t)); - - LM_GGML_ASSERT(neq0 == D); - LM_GGML_ASSERT(nek0 == D); - LM_GGML_ASSERT(nev1 == D); + const float start = lm_ggml_get_op_params_f32(dst, 0); + const float stop = lm_ggml_get_op_params_f32(dst, 1); + const float step = lm_ggml_get_op_params_f32(dst, 2); - LM_GGML_ASSERT(neq1 == N); - LM_GGML_ASSERT(nek1 == N + P); - LM_GGML_ASSERT(nev1 == D); + const int64_t steps = (int64_t) ceilf((stop - start) / step); - // dst cannot be transposed or permuted - LM_GGML_ASSERT(nb0 == sizeof(float)); - LM_GGML_ASSERT(nb0 <= nb1); - LM_GGML_ASSERT(nb1 <= nb2); - LM_GGML_ASSERT(nb2 <= nb3); + LM_GGML_ASSERT(lm_ggml_nelements(dst) == steps); - if (params->type == LM_GGML_TASK_TYPE_INIT) { - return; + for (int64_t i = ith; i < steps; i+= nth) { + float value = start + step * i; + ((float *)dst->data)[i] = 
value; } +} - if (params->type == LM_GGML_TASK_TYPE_FINALIZE) { - return; +static void lm_ggml_compute_forward_arange( + const struct lm_ggml_compute_params * params, + struct lm_ggml_tensor * dst) { + switch (dst->type) { + case LM_GGML_TYPE_F32: + { + lm_ggml_compute_forward_arange_f32(params, dst); + } break; + default: + { + LM_GGML_ASSERT(false); + } break; } +} - // parallelize by q rows using lm_ggml_vec_dot_f32 +static void lm_ggml_compute_forward_timestep_embedding_f32( + const struct lm_ggml_compute_params * params, + struct lm_ggml_tensor * dst) { - // total rows in q - const int nr = neq1*neq2*neq3; + if (params->type == LM_GGML_TASK_TYPE_INIT || params->type == LM_GGML_TASK_TYPE_FINALIZE) { + return; + } - // rows per thread - const int dr = (nr + nth - 1)/nth; + const struct lm_ggml_tensor * src0 = dst->src[0]; - // row range for this thread - const int ir0 = dr*ith; - const int ir1 = MIN(ir0 + dr, nr); + LM_GGML_ASSERT(src0->nb[0] == sizeof(float)); - const float scale = 1.0f/sqrtf(D); + const int ith = params->ith; + const int nth = params->nth; - //printf("P=%d N=%d D=%d ir0=%d ir1=%d scale = %f\n", P, N, D, ir0, ir1, scale); + LM_GGML_TENSOR_UNARY_OP_LOCALS - for (int ir = ir0; ir < ir1; ++ir) { - // q indices - const int iq3 = ir/(neq2*neq1); - const int iq2 = (ir - iq3*neq2*neq1)/neq1; - const int iq1 = (ir - iq3*neq2*neq1 - iq2*neq1); + const int dim = lm_ggml_get_op_params_i32(dst, 0); + const int max_period = lm_ggml_get_op_params_i32(dst, 1); - float * S = (float *) params->wdata + ith*(2*Mup + CACHE_LINE_SIZE_F32); + int half = dim / 2; - for (int i = M; i < Mup; ++i) { - S[i] = -INFINITY; + for (int64_t i = 0; i < ne00; i++) { + float * embed_data = (float *)((char *) dst->data + i*nb1); + for (int64_t j = ith; j < half; j += nth) { + float timestep = ((float *)src0->data)[i]; + float freq = (float)expf(-logf(max_period) * j / half); + float arg = timestep * freq; + embed_data[j] = cosf(arg); + embed_data[j + half] = sinf(arg); } - - if 
(LM_GGML_VEC_DOT_UNROLL > 2 || nek1 % LM_GGML_VEC_DOT_UNROLL != 0) { - for (int64_t ic = 0; ic < nek1; ++ic) { - // k indices - const int ik3 = iq3; - const int ik2 = iq2 % nek2; - const int ik1 = ic; - - // S indices - const int i1 = ik1; - - lm_ggml_vec_dot_f16(neq0, - S + i1, 0, - (lm_ggml_fp16_t *) ((char *) k->data + (ik1*nbk1 + ik2*nbk2 + ik3*nbk3)), 0, - (lm_ggml_fp16_t *) ((char *) q->data + (iq1*nbq1 + iq2*nbq2 + iq3*nbq3)), 0, 1); - } - } else { - for (int64_t ic = 0; ic < nek1; ic += LM_GGML_VEC_DOT_UNROLL) { - // k indices - const int ik3 = iq3; - const int ik2 = iq2 % nek2; - const int ik1 = ic; - - // S indices - const int i1 = ik1; - - lm_ggml_vec_dot_f16_unroll(neq0, nbk1, - S + i1, - ((char *) k->data + (ik1*nbk1 + ik2*nbk2 + ik3*nbk3)), - (lm_ggml_fp16_t *) ((char *) q->data + (iq1*nbq1 + iq2*nbq2 + iq3*nbq3))); - } + if (dim % 2 != 0 && ith == 0) { + embed_data[dim] = 0.f; } + } +} - // scale - lm_ggml_vec_scale_f32(nek1, S, scale); - - if (masked) { - for (int64_t i = P; i < M; i++) { - if (i > P + iq1) { - S[i] = -INFINITY; - } - } - } +static void lm_ggml_compute_forward_timestep_embedding( + const struct lm_ggml_compute_params * params, + struct lm_ggml_tensor * dst) { - // softmax - // todo: exclude known -INF S[..] values from max and loop, assuming their results to be zero. 
- // dont forget to set their S values to zero - { - float max = -INFINITY; - lm_ggml_vec_max_f32(M, &max, S); + const struct lm_ggml_tensor * src0 = dst->src[0]; - lm_ggml_float sum = 0.0; + switch (src0->type) { + case LM_GGML_TYPE_F32: { -#ifdef LM_GGML_SOFT_MAX_ACCELERATE - max = -max; - vDSP_vsadd(S, 1, &max, S, 1, Mup); - vvexpf(S, S, &Mup); - lm_ggml_vec_sum_f32(Mup, &sum, S); -#else - uint16_t scvt[LM_GGML_SOFT_MAX_UNROLL]; - lm_ggml_float sump[LM_GGML_SOFT_MAX_UNROLL] = { 0.0 }; + lm_ggml_compute_forward_timestep_embedding_f32(params, dst); + } break; + default: + { + LM_GGML_ASSERT(false); + } break; + } +} - for (int i = 0; i < Mup; i += LM_GGML_SOFT_MAX_UNROLL) { - float * SS = S + i; +// lm_ggml_compute_forward_argsort - for (int j = 0; j < LM_GGML_SOFT_MAX_UNROLL; ++j) { - if (SS[j] == -INFINITY) { - SS[j] = 0.0f; - } else { - lm_ggml_fp16_t s = LM_GGML_FP32_TO_FP16(SS[j] - max); - memcpy(&scvt[j], &s, sizeof(uint16_t)); - const float val = LM_GGML_FP16_TO_FP32(lm_ggml_table_exp_f16[scvt[j]]); - sump[j] += (lm_ggml_float)val; - SS[j] = val; - } - } - } +static void lm_ggml_compute_forward_argsort_f32( + const struct lm_ggml_compute_params * params, + struct lm_ggml_tensor * dst) { - for (int i = 0; i < LM_GGML_SOFT_MAX_UNROLL; i++) { - sum += sump[i]; - } -#endif - } + const struct lm_ggml_tensor * src0 = dst->src[0]; - assert(sum > 0.0); + if (params->type == LM_GGML_TASK_TYPE_INIT || params->type == LM_GGML_TASK_TYPE_FINALIZE) { + return; + } - sum = 1.0/sum; - lm_ggml_vec_scale_f32(M, S, sum); + LM_GGML_TENSOR_UNARY_OP_LOCALS -#ifndef NDEBUG - for (int i = 0; i < M; ++i) { - assert(!isnan(S[i])); - assert(!isinf(S[i])); - } -#endif - } + LM_GGML_ASSERT(nb0 == sizeof(float)); - lm_ggml_fp16_t * S16 = (lm_ggml_fp16_t *) ((float *) params->wdata + ith*(2*Mup + CACHE_LINE_SIZE_F32) + Mup); + const int ith = params->ith; + const int nth = params->nth; - for (int64_t i = 0; i < M; i++) { - S16[i] = LM_GGML_FP32_TO_FP16(S[i]); - } + const int64_t nr = 
lm_ggml_nrows(src0); - // todo: exclude known zero S[..] values from dot (reducing nev0 and increasing begin of v and S16). - if (LM_GGML_VEC_DOT_UNROLL == 1 || (nev1 % LM_GGML_VEC_DOT_UNROLL != 0)) { - for (int64_t ic = 0; ic < nev1; ++ic) { - // dst indices - const int i1 = iq1; - const int i2 = iq2; - const int i3 = iq3; - - // v indices - const int iv2 = iq2 % nev2; - const int iv3 = iq3; - - lm_ggml_vec_dot_f16(nev0, - (float *) ((char *) dst->data + (ic*nb0 + i1*nb1 + i2*nb2 + i3*nb3)), 0, - (lm_ggml_fp16_t *) ((char *) v->data + ( ic*nbv1 + iv2*nbv2 + iv3*nbv3)), 0, - S16, 0, 1); - } - } else { - for (int64_t ic = 0; ic < nev1; ic += LM_GGML_VEC_DOT_UNROLL) { - // dst indices - const int i1 = iq1; - const int i2 = iq2; - const int i3 = iq3; - - // v indices - const int iv2 = iq2 % nev2; - const int iv3 = iq3; - - lm_ggml_vec_dot_f16_unroll(nev0, nbv1, - (float *) ((char *) dst->data + (ic*nb0 + i1*nb1 + i2*nb2 + i3*nb3)), - ((char *) v->data + ( ic*nbv1 + iv2*nbv2 + iv3*nbv3)), - S16); + enum lm_ggml_sort_order order = (enum lm_ggml_sort_order) lm_ggml_get_op_params_i32(dst, 0); + + for (int64_t i = ith; i < nr; i += nth) { + int32_t * dst_data = (int32_t *)((char *) dst->data + i*nb1); + const float * src_data = (float *)((char *) src0->data + i*nb01); + + for (int64_t j = 0; j < ne0; j++) { + dst_data[j] = j; + } + + // C doesn't have a functional sort, so we do a bubble sort instead + for (int64_t j = 0; j < ne0; j++) { + for (int64_t k = j + 1; k < ne0; k++) { + if ((order == LM_GGML_SORT_ORDER_ASC && src_data[dst_data[j]] > src_data[dst_data[k]]) || + (order == LM_GGML_SORT_ORDER_DESC && src_data[dst_data[j]] < src_data[dst_data[k]])) { + int32_t tmp = dst_data[j]; + dst_data[j] = dst_data[k]; + dst_data[k] = tmp; + } } } } } -static void lm_ggml_compute_forward_flash_attn( - const struct lm_ggml_compute_params * params, - const bool masked, - struct lm_ggml_tensor * dst) { +static void lm_ggml_compute_forward_argsort( + const struct 
lm_ggml_compute_params * params, + struct lm_ggml_tensor * dst) { - const struct lm_ggml_tensor * q = dst->src[0]; + const struct lm_ggml_tensor * src0 = dst->src[0]; - switch (q->type) { - case LM_GGML_TYPE_F16: - { - lm_ggml_compute_forward_flash_attn_f16(params, masked, dst); - } break; + switch (src0->type) { case LM_GGML_TYPE_F32: { - lm_ggml_compute_forward_flash_attn_f32(params, masked, dst); + lm_ggml_compute_forward_argsort_f32(params, dst); } break; default: { @@ -14740,9 +15690,10 @@ static void lm_ggml_compute_forward_flash_attn_ext_f16( LM_GGML_ASSERT(ne0 == D); LM_GGML_ASSERT(ne2 == N); - LM_GGML_ASSERT(nbq0 == sizeof(float)); - LM_GGML_ASSERT(nbk0 == sizeof(lm_ggml_fp16_t)); - LM_GGML_ASSERT(nbv0 == sizeof(lm_ggml_fp16_t)); + // input tensor rows must be contiguous + LM_GGML_ASSERT(nbq0 == lm_ggml_type_size(q->type)); + LM_GGML_ASSERT(nbk0 == lm_ggml_type_size(k->type)); + LM_GGML_ASSERT(nbv0 == lm_ggml_type_size(v->type)); LM_GGML_ASSERT(neq0 == D); LM_GGML_ASSERT(nek0 == D); @@ -14784,8 +15735,22 @@ static void lm_ggml_compute_forward_flash_attn_ext_f16( const int ir0 = dr*ith; const int ir1 = MIN(ir0 + dr, nr); - float scale = 1.0f; - memcpy(&scale, (float *) dst->op_params + 0, sizeof(float)); + float scale = 1.0f; + float max_bias = 0.0f; + + memcpy(&scale, (float *) dst->op_params + 0, sizeof(float)); + memcpy(&max_bias, (float *) dst->op_params + 1, sizeof(float)); + + const uint32_t n_head = neq2; + const uint32_t n_head_log2 = 1u << (uint32_t) floor(log2(n_head)); + + const float m0 = powf(2.0f, -(max_bias ) / n_head_log2); + const float m1 = powf(2.0f, -(max_bias / 2.0f) / n_head_log2); + + enum lm_ggml_type const k_vec_dot_type = type_traits[k->type].vec_dot_type; + lm_ggml_from_float_t const q_to_vec_dot = type_traits[k_vec_dot_type].from_float; + lm_ggml_vec_dot_t const kq_vec_dot = type_traits[k->type].vec_dot; + lm_ggml_to_float_t const v_to_float = type_traits[v->type].to_float; // loop over n_batch and n_head for (int ir = ir0; ir < 
ir1; ++ir) { @@ -14794,14 +15759,22 @@ static void lm_ggml_compute_forward_flash_attn_ext_f16( const int iq2 = (ir - iq3*neq2*neq1)/neq1; const int iq1 = (ir - iq3*neq2*neq1 - iq2*neq1); - float S = 0.0f; - float M = -INFINITY; + const uint32_t h = iq2; // head index + const float slope = (max_bias > 0.0f) ? h < n_head_log2 ? powf(m0, h + 1) : powf(m1, 2*(h - n_head_log2) + 1) : 1.0f; + + float S = 0.0f; // sum + float M = -INFINITY; // maximum KQ value - float * V32 = (float *) params->wdata + ith*(2*D + CACHE_LINE_SIZE_F32); - lm_ggml_fp16_t * Q16 = (lm_ggml_fp16_t *) (V32); // reuse memory - lm_ggml_fp16_t * V16 = (lm_ggml_fp16_t *) (V32 + D); + float * VKQ32 = (float *) params->wdata + ith*(3*D + CACHE_LINE_SIZE_F32); // FP32 VKQ accumulator + float * V32 = (VKQ32 + 1*D); // (temporary) FP32 V buffer + lm_ggml_fp16_t * VKQ16 = (lm_ggml_fp16_t *) (VKQ32 + 1*D); // (temporary) FP16 VKQ accumulator + lm_ggml_fp16_t * Q_q = (lm_ggml_fp16_t *) (VKQ32 + 2*D); // (temporary) buffer for Q converted to quantized/FP16 - memset(V16, 0, D*sizeof(lm_ggml_fp16_t)); + if (v->type == LM_GGML_TYPE_F16) { + memset(VKQ16, 0, D*sizeof(lm_ggml_fp16_t)); + } else { + memset(VKQ32, 0, D*sizeof(float)); + } const lm_ggml_fp16_t * mp = mask ? (lm_ggml_fp16_t *)((char *) mask->data + iq1*mask->nb[1]) : NULL; @@ -14813,61 +15786,79 @@ static void lm_ggml_compute_forward_flash_attn_ext_f16( const int iv3 = iq3 / rv3; const int iv2 = iq2 / rv2; + const float * pq = (const float *) ((char *) q->data + (iq1*nbq1 + iq2*nbq2 + iq3*nbq3)); + q_to_vec_dot(pq, Q_q, D); + // online softmax / attention // loop over n_kv and n_head_kv // ref: https://arxiv.org/pdf/2112.05682.pdf for (int64_t ic = 0; ic < nek1; ++ic) { - const float mv = mp ? LM_GGML_FP16_TO_FP32(mp[ic]) : 0.0f; + const float mv = mp ? 
slope*LM_GGML_FP16_TO_FP32(mp[ic]) : 0.0f; if (mv == -INFINITY) { continue; } - float s; + float s; // KQ value - // convert Q to F16 in V32 - { - const float * pq = (const float *) ((char *) q->data + (iq1*nbq1 + iq2*nbq2 + iq3*nbq3)); + const char * k_data = (const char *) k->data + ( ic*nbk1 + ik2*nbk2 + ik3*nbk3); + kq_vec_dot(D, &s, 0, k_data, 0, Q_q, 0, 1); - for (int64_t d = 0; d < D; ++d) { - Q16[d] = LM_GGML_FP32_TO_FP16(pq[d]); - } - } + s = s*scale + mv; // scale KQ value and apply mask - lm_ggml_vec_dot_f16(D, - &s, 0, - (lm_ggml_fp16_t *) ((char *) k->data + ( ic*nbk1 + ik2*nbk2 + ik3*nbk3)), 0, - Q16, 0, 1); + const float Mold = M; - s = s*scale + mv; + float ms = 1.0f; // upon new higher max val, scale VKQ and KQ sum with this value + float vs = 1.0f; // post-softmax KQ value, expf(s - M) - const float Mold = M; + const char * v_data = ((const char *) v->data + (ic*nbv1 + iv2*nbv2 + iv3*nbv3)); - float ms = 1.0f; - float vs = 1.0f; + if (v->type== LM_GGML_TYPE_F16) { + if (s > M) { + // s is new maximum, ms < 1.0f, vs == expf(s - s) == 1.0f + M = s; + ms = expf(Mold - M); - if (s > M) { - M = s; - ms = expf(Mold - M); + // V = V*expf(Mold - M) + lm_ggml_vec_scale_f16(D, VKQ16, ms); + } else { + // no new maximum, ms == 1.0f, vs != 1.0f + vs = expf(s - M); + } - // V = V*expf(Mold - M) - lm_ggml_vec_scale_f16(D, V16, ms); + // V += v*expf(s - M) + lm_ggml_vec_mad_f16(D, VKQ16, (const lm_ggml_fp16_t *) v_data, vs); } else { - vs = expf(s - M); - } + if (s > M) { + // s is new maximum, ms < 1.0f, vs == expf(s - s) == 1.0f + M = s; + ms = expf(Mold - M); + + // V = V*expf(Mold - M) + lm_ggml_vec_scale_f32(D, VKQ32, ms); + } else { + // no new maximum, ms == 1.0f, vs != 1.0f + vs = expf(s - M); + } - const lm_ggml_fp16_t * v16 = (const lm_ggml_fp16_t *) ((char *) v->data + (ic*nbv1 + iv2*nbv2 + iv3*nbv3)); + v_to_float(v_data, V32, D); - // V += v*expf(s - M) - lm_ggml_vec_mad_f16(D, V16, v16, vs); + // V += v*expf(s - M) + lm_ggml_vec_mad_f32(D, VKQ32, 
V32, vs); + } - S = S*ms + vs; + S = S*ms + vs; // scale and increment sum with partial sum } - // V /= S - for (int64_t d = 0; d < D; ++d) { - V32[d] = LM_GGML_FP16_TO_FP32(V16[d])/S; + if (v->type == LM_GGML_TYPE_F16) { + for (int64_t d = 0; d < D; ++d) { + VKQ32[d] = LM_GGML_FP16_TO_FP32(VKQ16[d]); + } } + // V /= S + const float S_inv = 1.0f/S; + lm_ggml_vec_scale_f32(D, VKQ32, S_inv); + // dst indices const int i1 = iq1; const int i2 = iq2; @@ -14877,7 +15868,7 @@ static void lm_ggml_compute_forward_flash_attn_ext_f16( //memcpy((char *) dst->data + (i1*nb1 + i2*nb2 + i3*nb3), V, nev0*sizeof(float)); // permute(0, 2, 1, 3) - memcpy((char *) dst->data + (i3*ne2*ne1 + i2 + i1*ne1)*nb1, V32, nb1); + memcpy((char *) dst->data + (i3*ne2*ne1 + i2 + i1*ne1)*nb1, VKQ32, nb1); } } @@ -14888,7 +15879,7 @@ static void lm_ggml_compute_forward_flash_attn_ext( const struct lm_ggml_tensor * v, const struct lm_ggml_tensor * mask, struct lm_ggml_tensor * dst) { - switch (dst->op_params[1]) { + switch (dst->op_params[2]) { case LM_GGML_PREC_DEFAULT: case LM_GGML_PREC_F32: { @@ -14902,165 +15893,6 @@ static void lm_ggml_compute_forward_flash_attn_ext( } } -// lm_ggml_compute_forward_flash_ff - -static void lm_ggml_compute_forward_flash_ff_f16( - const struct lm_ggml_compute_params * params, - struct lm_ggml_tensor * dst) { - - const struct lm_ggml_tensor * a = dst->src[0]; // F16 - const struct lm_ggml_tensor * b0 = dst->src[1]; // F16 fc_w - const struct lm_ggml_tensor * b1 = dst->src[2]; // F32 fc_b - const struct lm_ggml_tensor * c0 = dst->src[3]; // F16 proj_w - const struct lm_ggml_tensor * c1 = dst->src[4]; // F32 proj_b - - int64_t t0 = lm_ggml_perf_time_us(); - UNUSED(t0); - - LM_GGML_TENSOR_LOCALS(int64_t, nea, a, ne) - LM_GGML_TENSOR_LOCALS(size_t, nba, a, nb) - LM_GGML_TENSOR_LOCALS(int64_t, neb0, b0, ne) - LM_GGML_TENSOR_LOCALS(size_t, nbb0, b0, nb) - LM_GGML_TENSOR_LOCALS(int64_t, neb1, b1, ne) - LM_GGML_TENSOR_LOCALS(size_t, nbb1, b1, nb) - 
LM_GGML_TENSOR_LOCALS(int64_t, nec0, c0, ne) - LM_GGML_TENSOR_LOCALS(size_t, nbc0, c0, nb) - LM_GGML_TENSOR_LOCALS(int64_t, nec1, c1, ne) - LM_GGML_TENSOR_LOCALS(size_t, nbc1, c1, nb) - LM_GGML_TENSOR_LOCALS(int64_t, ne, dst, ne) - LM_GGML_TENSOR_LOCALS(size_t, nb, dst, nb) - - const int ith = params->ith; - const int nth = params->nth; - - const int64_t D = nea0; - //const int64_t N = nea1; - const int64_t M = neb01; - - LM_GGML_ASSERT(ne0 == nea0); - LM_GGML_ASSERT(ne1 == nea1); - LM_GGML_ASSERT(ne2 == nea2); - - LM_GGML_ASSERT(nba0 == sizeof(lm_ggml_fp16_t)); - LM_GGML_ASSERT(nbb00 == sizeof(lm_ggml_fp16_t)); - LM_GGML_ASSERT(nbb10 == sizeof(float)); - LM_GGML_ASSERT(nbc00 == sizeof(lm_ggml_fp16_t)); - LM_GGML_ASSERT(nbc10 == sizeof(float)); - - LM_GGML_ASSERT(neb00 == D); - LM_GGML_ASSERT(neb01 == M); - LM_GGML_ASSERT(neb10 == M); - LM_GGML_ASSERT(neb11 == 1); - - LM_GGML_ASSERT(nec00 == M); - LM_GGML_ASSERT(nec01 == D); - LM_GGML_ASSERT(nec10 == D); - LM_GGML_ASSERT(nec11 == 1); - - // dst cannot be transposed or permuted - LM_GGML_ASSERT(nb0 == sizeof(float)); - LM_GGML_ASSERT(nb0 <= nb1); - LM_GGML_ASSERT(nb1 <= nb2); - LM_GGML_ASSERT(nb2 <= nb3); - - if (params->type == LM_GGML_TASK_TYPE_INIT) { - return; - } - - if (params->type == LM_GGML_TASK_TYPE_FINALIZE) { - return; - } - - // parallelize by a rows using lm_ggml_vec_dot_f32 - - // total rows in a - const int nr = nea1*nea2*nea3; - - // rows per thread - const int dr = (nr + nth - 1)/nth; - - // row range for this thread - const int ir0 = dr*ith; - const int ir1 = MIN(ir0 + dr, nr); - - for (int ir = ir0; ir < ir1; ++ir) { - // a indices - const int ia3 = ir/(nea2*nea1); - const int ia2 = (ir - ia3*nea2*nea1)/nea1; - const int ia1 = (ir - ia3*nea2*nea1 - ia2*nea1); - - float * S = (float *) params->wdata + ith*(2*M + CACHE_LINE_SIZE_F32); - - for (int64_t ic = 0; ic < neb01; ++ic) { - // b0 indices - const int ib03 = ia3; - const int ib02 = ia2; - const int ib01 = ic; - - // S indices - const int i1 = 
ib01; - - lm_ggml_vec_dot_f16(nea0, - S + i1, 0, - (lm_ggml_fp16_t *) ((char *) b0->data + (ib01*nbb01 + ib02*nbb02 + ib03*nbb03)), 0, - (lm_ggml_fp16_t *) ((char *) a->data + ( ia1*nba1 + ia2*nba2 + ia3*nba3)), 0, 1); - } - - lm_ggml_vec_add_f32(neb01, S, S, (float *) b1->data); - //lm_ggml_vec_gelu_f32(neb01, S, S); - - lm_ggml_fp16_t * S16 = (lm_ggml_fp16_t *) ((float *) params->wdata + ith*(2*M + CACHE_LINE_SIZE_F32) + M); - - for (int64_t i = 0; i < M; i++) { - S16[i] = LM_GGML_FP32_TO_FP16(S[i]); - } - - lm_ggml_vec_gelu_f16(neb01, S16, S16); - - { - // dst indices - const int i1 = ia1; - const int i2 = ia2; - const int i3 = ia3; - - for (int64_t ic = 0; ic < nec01; ++ic) { - - lm_ggml_vec_dot_f16(neb01, - (float *) ((char *) dst->data + (ic*nb0 + i1*nb1 + i2*nb2 + i3*nb3)), 0, - (lm_ggml_fp16_t *) ((char *) c0->data + ( ic*nbc01 + i2*nbc02 + i3*nbc03)), 0, - S16, 0, 1); - } - - lm_ggml_vec_add_f32(nec01, - (float *) ((char *) dst->data + (i1*nb1 + i2*nb2 + i3*nb3)), - (float *) ((char *) dst->data + (i1*nb1 + i2*nb2 + i3*nb3)), - (float *) c1->data); - } - } -} - -static void lm_ggml_compute_forward_flash_ff( - const struct lm_ggml_compute_params * params, - struct lm_ggml_tensor * dst) { - - const struct lm_ggml_tensor * b0 = dst->src[1]; - - switch (b0->type) { - case LM_GGML_TYPE_F16: - { - lm_ggml_compute_forward_flash_ff_f16(params, dst); - } break; - case LM_GGML_TYPE_F32: - { - LM_GGML_ASSERT(false); // TODO - } break; - default: - { - LM_GGML_ASSERT(false); - } break; - } -} - // lm_ggml_compute_forward_flash_attn_back static void lm_ggml_compute_forward_flash_attn_back_f32( @@ -15242,38 +16074,7 @@ static void lm_ggml_compute_forward_flash_attn_back_f32( vvexpf(SM, SM, &Mup); lm_ggml_vec_sum_f32(Mup, &sum, SM); #else - uint16_t scvt[LM_GGML_SOFT_MAX_UNROLL]; UNUSED(scvt); - lm_ggml_float sump[LM_GGML_SOFT_MAX_UNROLL] = { 0.0 }; - - for (int i = 0; i < Mup; i += LM_GGML_SOFT_MAX_UNROLL) { - if (i >= masked_begin) { - break; - } - float * SR = S + i; 
- float * SW = SM + i; - - for (int j = 0; j < LM_GGML_SOFT_MAX_UNROLL; ++j) { - if (i + j >= masked_begin) { - break; - } else if (SR[j] == -INFINITY) { - SW[j] = 0.0f; - } else { -#ifndef LM_GGML_FLASH_ATTN_EXP_FP16 - const float val = expf(SR[j] - max); -#else - lm_ggml_fp16_t s = LM_GGML_FP32_TO_FP16(SR[j] - max); - memcpy(&scvt[j], &s, sizeof(uint16_t)); - const float val = LM_GGML_FP16_TO_FP32(lm_ggml_table_exp_f16[scvt[j]]); -#endif - sump[j] += (lm_ggml_float)val; - SW[j] = val; - } - } - } - - for (int i = 0; i < LM_GGML_SOFT_MAX_UNROLL; i++) { - sum += sump[i]; - } + sum = lm_ggml_vec_soft_max_f32(Mup, SM, S, max); #endif } @@ -15855,6 +16656,10 @@ static void lm_ggml_compute_forward_unary( { lm_ggml_compute_forward_relu(params, dst); } break; + case LM_GGML_UNARY_OP_SIGMOID: + { + lm_ggml_compute_forward_sigmoid(params, dst); + } break; case LM_GGML_UNARY_OP_GELU: { lm_ggml_compute_forward_gelu(params, dst); @@ -15921,6 +16726,7 @@ static void lm_ggml_compute_forward_get_rel_pos( switch (src0->type) { case LM_GGML_TYPE_F16: + case LM_GGML_TYPE_BF16: { lm_ggml_compute_forward_get_rel_pos_f16(params, dst); } break; @@ -16294,35 +17100,15 @@ static void lm_ggml_compute_forward_cross_entropy_loss_f32( assert(!isnan(s1[i])); } #endif - // soft_max - lm_ggml_float sum = 0.0; - { - float max = -INFINITY; - lm_ggml_vec_max_f32(nc, &max, s0); - uint16_t scvt; UNUSED(scvt); - for (int i = 0; i < nc; i++) { - if (s0[i] == -INFINITY) { - st[i] = 0.0f; - } else { -#ifndef LM_GGML_CROSS_ENTROPY_EXP_FP16 - const float s = s0[i] - max; - const float val = expf(s); -#else - lm_ggml_fp16_t s = LM_GGML_FP32_TO_FP16(s0[i] - max); - memcpy(&scvt, &s, sizeof(scvt)); - const float val = LM_GGML_FP16_TO_FP32(lm_ggml_table_exp_f16[scvt]); -#endif - sum += (lm_ggml_float)val; - st[i] = val; - } - } + // soft_max + float max = -INFINITY; + lm_ggml_vec_max_f32(nc, &max, s0); + lm_ggml_float sum = lm_ggml_vec_soft_max_f32(nc, st, s0, max); + assert(sum > 0.0); + sum = (1.0 - eps) / 
sum; - assert(sum > 0.0); - // sum = 1.0/sum; - } // avoid log(0) by rescaling from [0..1] to [eps..1] - sum = (1.0 - eps) / sum; lm_ggml_vec_scale_f32(nc, st, sum); lm_ggml_vec_add1_f32(nc, st, st, eps); lm_ggml_vec_log_f32(nc, st, st); @@ -16412,32 +17198,11 @@ static void lm_ggml_compute_forward_cross_entropy_loss_back_f32( #endif // soft_max - lm_ggml_float sum = 0.0; - { - float max = -INFINITY; - lm_ggml_vec_max_f32(nc, &max, s0); - - uint16_t scvt; UNUSED(scvt); - for (int i = 0; i < nc; i++) { - if (s0[i] == -INFINITY) { - ds0[i] = 0.0f; - } else { -#ifndef LM_GGML_CROSS_ENTROPY_EXP_FP16 - const float s = s0[i] - max; - const float val = expf(s); -#else - lm_ggml_fp16_t s = LM_GGML_FP32_TO_FP16(s0[i] - max); - memcpy(&scvt, &s, sizeof(scvt)); - const float val = LM_GGML_FP16_TO_FP32(lm_ggml_table_exp_f16[scvt]); -#endif - sum += (lm_ggml_float)val; - ds0[i] = val; - } - } - - assert(sum > 0.0); - sum = (1.0 - eps)/sum; - } + float max = -INFINITY; + lm_ggml_vec_max_f32(nc, &max, s0); + lm_ggml_float sum = lm_ggml_vec_soft_max_f32(nc, ds0, s0, max); + assert(sum > 0.0); + sum = (1.0 - eps) / sum; // grad(src0) = (softmax(src0) - src1) * grad(cross_entropy_loss(src0, src1)) / nr lm_ggml_vec_scale_f32(nc, ds0, sum); @@ -16474,7 +17239,7 @@ static void lm_ggml_compute_forward_cross_entropy_loss_back( ///////////////////////////////// -static void lm_ggml_compute_forward(struct lm_ggml_compute_params * params, struct lm_ggml_tensor * tensor) { +static void lm_ggml_compute_forward(struct lm_ggml_compute_params * params, struct lm_ggml_tensor * tensor, struct lm_ggml_compute_state * state) { LM_GGML_ASSERT(params); if (tensor->op == LM_GGML_OP_NONE || lm_ggml_is_empty(tensor)) { @@ -16572,7 +17337,7 @@ static void lm_ggml_compute_forward(struct lm_ggml_compute_params * params, stru } break; case LM_GGML_OP_MUL_MAT: { - lm_ggml_compute_forward_mul_mat(params, tensor); + lm_ggml_compute_forward_mul_mat(params, tensor, state); } break; case LM_GGML_OP_MUL_MAT_ID: { 
@@ -16650,10 +17415,6 @@ static void lm_ggml_compute_forward(struct lm_ggml_compute_params * params, stru { lm_ggml_compute_forward_rope_back(params, tensor); } break; - case LM_GGML_OP_ALIBI: - { - lm_ggml_compute_forward_alibi(params, tensor); - } break; case LM_GGML_OP_CLAMP: { lm_ggml_compute_forward_clamp(params, tensor); @@ -16702,21 +17463,10 @@ static void lm_ggml_compute_forward(struct lm_ggml_compute_params * params, stru { lm_ggml_compute_forward_leaky_relu(params, tensor); } break; - case LM_GGML_OP_FLASH_ATTN: - { - const int32_t t = lm_ggml_get_op_params_i32(tensor, 0); - LM_GGML_ASSERT(t == 0 || t == 1); - const bool masked = t != 0; - lm_ggml_compute_forward_flash_attn(params, masked, tensor); - } break; case LM_GGML_OP_FLASH_ATTN_EXT: { lm_ggml_compute_forward_flash_attn_ext(params, tensor->src[0], tensor->src[1], tensor->src[2], tensor->src[3], tensor); } break; - case LM_GGML_OP_FLASH_FF: - { - lm_ggml_compute_forward_flash_ff(params, tensor); - } break; case LM_GGML_OP_FLASH_ATTN_BACK: { int32_t t = lm_ggml_get_op_params_i32(tensor, 0); @@ -17086,6 +17836,7 @@ static struct lm_ggml_tensor * lm_ggml_sub_or_set(struct lm_ggml_context * ctx, static void lm_ggml_compute_backward(struct lm_ggml_context * ctx, struct lm_ggml_tensor * tensor, struct lm_ggml_hash_set zero_table) { struct lm_ggml_tensor * src0 = tensor->src[0]; struct lm_ggml_tensor * src1 = tensor->src[1]; + struct lm_ggml_tensor * src2 = tensor->src[2]; switch (tensor->op) { case LM_GGML_OP_DUP: @@ -17617,6 +18368,7 @@ static void lm_ggml_compute_backward(struct lm_ggml_context * ctx, struct lm_ggm lm_ggml_rope_back(ctx, tensor->grad, src1, + src2, n_dims, mode, n_ctx, @@ -17656,6 +18408,7 @@ static void lm_ggml_compute_backward(struct lm_ggml_context * ctx, struct lm_ggm lm_ggml_rope_impl(ctx, tensor->grad, src1, + src2, n_dims, mode, n_ctx, @@ -17672,10 +18425,6 @@ static void lm_ggml_compute_backward(struct lm_ggml_context * ctx, struct lm_ggm zero_table); } } break; - case 
LM_GGML_OP_ALIBI: - { - LM_GGML_ASSERT(false); // TODO: not implemented - } break; case LM_GGML_OP_CLAMP: { LM_GGML_ASSERT(false); // TODO: not implemented @@ -17724,7 +18473,6 @@ static void lm_ggml_compute_backward(struct lm_ggml_context * ctx, struct lm_ggm { LM_GGML_ASSERT(false); // TODO: not implemented } break; - case LM_GGML_OP_FLASH_ATTN: case LM_GGML_OP_FLASH_ATTN_EXT: { struct lm_ggml_tensor * flash_grad = NULL; @@ -17741,7 +18489,6 @@ static void lm_ggml_compute_backward(struct lm_ggml_context * ctx, struct lm_ggm masked); } - struct lm_ggml_tensor * src2 = tensor->src[2]; const int64_t elem_q = lm_ggml_nelements(src0); const int64_t elem_k = lm_ggml_nelements(src1); const int64_t elem_v = lm_ggml_nelements(src2); @@ -17779,10 +18526,6 @@ static void lm_ggml_compute_backward(struct lm_ggml_context * ctx, struct lm_ggm zero_table); } } break; - case LM_GGML_OP_FLASH_FF: - { - LM_GGML_ASSERT(false); // not supported - } break; case LM_GGML_OP_FLASH_ATTN_BACK: { LM_GGML_ASSERT(false); // not supported @@ -17846,6 +18589,10 @@ static void lm_ggml_compute_backward(struct lm_ggml_context * ctx, struct lm_ggm zero_table); } } break; + case LM_GGML_UNARY_OP_SIGMOID: + { + LM_GGML_ASSERT(false); // TODO: not implemented + } break; case LM_GGML_UNARY_OP_GELU: { LM_GGML_ASSERT(false); // TODO: not implemented @@ -18192,8 +18939,6 @@ typedef int lm_ggml_lock_t; #define LM_GGML_LOCK_INITIALIZER 0 -typedef pthread_t lm_ggml_thread_t; - #define lm_ggml_thread_create pthread_create #define lm_ggml_thread_join pthread_join @@ -18219,8 +18964,6 @@ typedef int lm_ggml_lock_t; #define LM_GGML_LOCK_INITIALIZER 0 -typedef pthread_t lm_ggml_thread_t; - #define lm_ggml_thread_create pthread_create #define lm_ggml_thread_join pthread_join @@ -18300,31 +19043,6 @@ static void set_numa_thread_affinity(int thread_n) { UNUSED(thread_n); } static void clear_numa_thread_affinity(void) {} #endif -struct lm_ggml_compute_state_shared { - const struct lm_ggml_cgraph * cgraph; - const 
struct lm_ggml_cplan * cplan; - - int64_t perf_node_start_cycles; - int64_t perf_node_start_time_us; - - const int n_threads; - - // synchronization primitives - atomic_int n_active; // num active threads - atomic_int node_n; // active graph node - atomic_int node_task; // active graph node task phase - - lm_ggml_abort_callback abort_callback; // abort lm_ggml_graph_compute when true - void * abort_callback_data; -}; - -struct lm_ggml_compute_state { - lm_ggml_thread_t thrd; - int ith; - struct lm_ggml_compute_state_shared * shared; - enum lm_ggml_status ec; -}; - static void lm_ggml_graph_compute_perf_stats_node(struct lm_ggml_tensor * node, const struct lm_ggml_compute_state_shared * st) { int64_t cycles_cur = lm_ggml_perf_cycles() - st->perf_node_start_cycles; int64_t time_us_cur = lm_ggml_perf_time_us() - st->perf_node_start_time_us; @@ -18375,6 +19093,7 @@ static int lm_ggml_get_n_tasks(struct lm_ggml_tensor * node, int n_threads, int case LM_GGML_UNARY_OP_TANH: case LM_GGML_UNARY_OP_ELU: case LM_GGML_UNARY_OP_RELU: + case LM_GGML_UNARY_OP_SIGMOID: case LM_GGML_UNARY_OP_HARDSWISH: // to opt for multiple threads case LM_GGML_UNARY_OP_HARDSIGMOID: // to opt for multiple threads { @@ -18448,10 +19167,6 @@ static int lm_ggml_get_n_tasks(struct lm_ggml_tensor * node, int n_threads, int { n_tasks = n_threads; } break; - case LM_GGML_OP_ALIBI: - { - n_tasks = 1; //TODO - } break; case LM_GGML_OP_CLAMP: { n_tasks = 1; //TODO @@ -18497,15 +19212,10 @@ static int lm_ggml_get_n_tasks(struct lm_ggml_tensor * node, int n_threads, int { n_tasks = n_threads; } break; - case LM_GGML_OP_FLASH_ATTN: case LM_GGML_OP_FLASH_ATTN_EXT: { n_tasks = n_threads; } break; - case LM_GGML_OP_FLASH_FF: - { - n_tasks = n_threads; - } break; case LM_GGML_OP_FLASH_ATTN_BACK: { n_tasks = n_threads; @@ -18600,6 +19310,10 @@ static void lm_ggml_graph_compute_thread_sync_node(int * node_n, struct lm_ggml_ * node_n = atomic_load(&state->shared->node_n); if (* node_n != last_node_n) break; +#if 
defined(__SSE3__) + // Tell the processor we're spinning. It's a processor hint for spinlocks. + _mm_pause(); +#endif } } @@ -18614,6 +19328,10 @@ static void lm_ggml_graph_compute_thread_sync_task(int * task_phase, struct lm_g * task_phase = atomic_load(&state->shared->node_task); if (* task_phase != last_task_phase) break; +#if defined(__SSE3__) + // Tell the processor we're spinning. It's a processor hint for spinlocks. + _mm_pause(); +#endif } } @@ -18653,7 +19371,7 @@ static thread_ret_t lm_ggml_graph_compute_thread(void * data) { struct lm_ggml_tensor * node = cgraph->nodes[node_n]; if (LM_GGML_OP_HAS_FINALIZE[node->op]) { params.nth = lm_ggml_get_n_tasks(node, n_threads, state->shared->n_threads); - lm_ggml_compute_forward(¶ms, node); + lm_ggml_compute_forward(¶ms, node, state); } lm_ggml_graph_compute_perf_stats_node(node, state->shared); } @@ -18673,17 +19391,17 @@ static thread_ret_t lm_ggml_graph_compute_thread(void * data) { /* INIT */ if (LM_GGML_OP_HAS_INIT[node->op]) { params.type = LM_GGML_TASK_TYPE_INIT; - lm_ggml_compute_forward(¶ms, node); + lm_ggml_compute_forward(¶ms, node, state); } // TODO: maybe push node_n to the atomic but if other threads see n_tasks is 1, // they do something more efficient than spinning (?) 
params.type = LM_GGML_TASK_TYPE_COMPUTE; - lm_ggml_compute_forward(¶ms, node); + lm_ggml_compute_forward(¶ms, node, state); if (LM_GGML_OP_HAS_FINALIZE[node->op]) { params.type = LM_GGML_TASK_TYPE_FINALIZE; - lm_ggml_compute_forward(¶ms, node); + lm_ggml_compute_forward(¶ms, node, state); } lm_ggml_graph_compute_perf_stats_node(node, state->shared); @@ -18722,7 +19440,7 @@ static thread_ret_t lm_ggml_graph_compute_thread(void * data) { if (state->ith < n_tasks) { if (LM_GGML_OP_HAS_INIT[node->op]) { - lm_ggml_compute_forward(¶ms, node); + lm_ggml_compute_forward(¶ms, node, state); } } @@ -18743,7 +19461,7 @@ static thread_ret_t lm_ggml_graph_compute_thread(void * data) { if (state->ith < n_tasks) { params.type = LM_GGML_TASK_TYPE_COMPUTE; - lm_ggml_compute_forward(¶ms, node); + lm_ggml_compute_forward(¶ms, node, state); } if (atomic_fetch_sub(&state->shared->n_active, 1) == 1) { @@ -18785,7 +19503,10 @@ struct lm_ggml_cplan lm_ggml_graph_plan(const struct lm_ggml_cgraph * cgraph, in case LM_GGML_OP_CPY: case LM_GGML_OP_DUP: { - if (lm_ggml_is_quantized(node->type)) { + if (lm_ggml_is_quantized(node->type) || + // F16 -> BF16 and BF16 -> F16 copies go through intermediate F32 + (node->src[0]->type == LM_GGML_TYPE_F16 && node->src[1] && node->src[1]->type == LM_GGML_TYPE_BF16) || + (node->src[0]->type == LM_GGML_TYPE_BF16 && node->src[1] && node->src[1]->type == LM_GGML_TYPE_F16)) { cur = lm_ggml_type_size(LM_GGML_TYPE_F32) * node->ne[0] * n_tasks; } } break; @@ -18864,7 +19585,8 @@ struct lm_ggml_cplan lm_ggml_graph_plan(const struct lm_ggml_cgraph * cgraph, in const int64_t ne10 = node->src[1]->ne[0]; // L const int64_t ne11 = node->src[1]->ne[1]; // Cin - if (node->src[0]->type == LM_GGML_TYPE_F16 && + if ((node->src[0]->type == LM_GGML_TYPE_F16 || + node->src[0]->type == LM_GGML_TYPE_BF16) && node->src[1]->type == LM_GGML_TYPE_F32) { cur += sizeof(lm_ggml_fp16_t)*ne00*ne01*ne02; cur += sizeof(lm_ggml_fp16_t)*ne10*ne11; @@ -18890,33 +19612,11 @@ struct 
lm_ggml_cplan lm_ggml_graph_plan(const struct lm_ggml_cgraph * cgraph, in cur += sizeof(lm_ggml_fp16_t)*ne00*ne01*ne02*ne03; cur += sizeof(lm_ggml_fp16_t)*ne10*ne11*ne12; } break; - case LM_GGML_OP_FLASH_ATTN: - { - const int64_t ne11 = lm_ggml_up(node->src[1]->ne[1], LM_GGML_SOFT_MAX_UNROLL); - - if (node->src[1]->type == LM_GGML_TYPE_F32) { - cur = sizeof(float)*ne11*n_tasks; // TODO: this can become (n_tasks-1) - cur += sizeof(float)*ne11*n_tasks; // this is overestimated by x2 - } else if (node->src[1]->type == LM_GGML_TYPE_F16) { - cur = sizeof(float)*ne11*n_tasks; // TODO: this can become (n_tasks-1) - cur += sizeof(float)*ne11*n_tasks; // this is overestimated by x2 - } - } break; case LM_GGML_OP_FLASH_ATTN_EXT: { const int64_t ne00 = node->src[0]->ne[0]; // D - cur = 2*sizeof(float)*ne00*n_tasks; // 2x head size - } break; - case LM_GGML_OP_FLASH_FF: - { - if (node->src[1]->type == LM_GGML_TYPE_F32) { - cur = sizeof(float)*node->src[1]->ne[1]*n_tasks; // TODO: this can become (n_tasks-1) - cur += sizeof(float)*node->src[1]->ne[1]*n_tasks; // this is overestimated by x2 - } else if (node->src[1]->type == LM_GGML_TYPE_F16) { - cur = sizeof(float)*node->src[1]->ne[1]*n_tasks; // TODO: this can become (n_tasks-1) - cur += sizeof(float)*node->src[1]->ne[1]*n_tasks; // this is overestimated by x2 - } + cur = 3*sizeof(float)*ne00*n_tasks; // 3x head size/thread } break; case LM_GGML_OP_FLASH_ATTN_BACK: { @@ -18929,6 +19629,9 @@ struct lm_ggml_cplan lm_ggml_graph_plan(const struct lm_ggml_cgraph * cgraph, in } else if (node->src[1]->type == LM_GGML_TYPE_F16) { cur = sizeof(float)*mxDn*n_tasks; // TODO: this can become (n_tasks-1) cur += sizeof(float)*mxDn*n_tasks; // this is overestimated by x2 + } else if (node->src[1]->type == LM_GGML_TYPE_BF16) { + cur = sizeof(float)*mxDn*n_tasks; // TODO: this can become (n_tasks-1) + cur += sizeof(float)*mxDn*n_tasks; // this is overestimated by x2 } } break; @@ -18981,6 +19684,7 @@ enum lm_ggml_status 
lm_ggml_graph_compute(struct lm_ggml_cgraph * cgraph, struct /*.node_task =*/ LM_GGML_TASK_TYPE_FINALIZE, /*.abort_callback =*/ NULL, /*.abort_callback_data =*/ NULL, + /*.current_chunk; =*/ 0, }; struct lm_ggml_compute_state * workers = alloca(sizeof(struct lm_ggml_compute_state)*n_threads); @@ -19705,7 +20409,9 @@ void lm_ggml_graph_dump_dot(const struct lm_ggml_cgraph * gb, const struct lm_gg if (node->type == LM_GGML_TYPE_I8 || node->type == LM_GGML_TYPE_I16 || node->type == LM_GGML_TYPE_I32) { fprintf(fp, "%d", lm_ggml_get_i32_1d(node, j)); } - else if (node->type == LM_GGML_TYPE_F32 || node->type == LM_GGML_TYPE_F16) { + else if (node->type == LM_GGML_TYPE_F32 || + node->type == LM_GGML_TYPE_F16 || + node->type == LM_GGML_TYPE_BF16) { fprintf(fp, "%.1e", (double)lm_ggml_get_f32_1d(node, j)); } else { @@ -20752,17 +21458,19 @@ size_t lm_ggml_quantize_chunk( case LM_GGML_TYPE_IQ1_S: result = quantize_iq1_s (src + start, (char *) dst + start_row * row_size, nrows, n_per_row, imatrix); break; case LM_GGML_TYPE_IQ1_M: result = quantize_iq1_m (src + start, (char *) dst + start_row * row_size, nrows, n_per_row, imatrix); break; case LM_GGML_TYPE_IQ4_NL: result = quantize_iq4_nl (src + start, (char *) dst + start_row * row_size, nrows, n_per_row, imatrix); break; -#if QK_K == 64 - case LM_GGML_TYPE_IQ4_XS: result = quantize_iq4_nl (src + start, (char *) dst + start_row * row_size, nrows, n_per_row, imatrix); break; -#else case LM_GGML_TYPE_IQ4_XS: result = quantize_iq4_xs (src + start, (char *) dst + start_row * row_size, nrows, n_per_row, imatrix); break; -#endif case LM_GGML_TYPE_F16: { size_t elemsize = sizeof(lm_ggml_fp16_t); lm_ggml_fp32_to_fp16_row(src + start, (lm_ggml_fp16_t *)dst + start, n); result = n * elemsize; } break; + case LM_GGML_TYPE_BF16: + { + size_t elemsize = sizeof(lm_ggml_bf16_t); + lm_ggml_fp32_to_bf16_row(src + start, (lm_ggml_bf16_t *)dst + start, n); + result = n * elemsize; + } break; case LM_GGML_TYPE_F32: { size_t elemsize = 
sizeof(float); @@ -21139,7 +21847,7 @@ struct lm_gguf_context * lm_gguf_init_from_file(const char * fname, struct lm_gg } // read the tensor infos - { + if (ctx->header.n_tensors > 0) { ctx->infos = LM_GGML_CALLOC(ctx->header.n_tensors, sizeof(struct lm_gguf_tensor_info)); for (uint64_t i = 0; i < ctx->header.n_tensors; ++i) { @@ -22027,6 +22735,14 @@ int lm_ggml_cpu_has_avx512_vnni(void) { #endif } +int lm_ggml_cpu_has_avx512_bf16(void) { +#if defined(__AVX512BF16__) + return 1; +#else + return 0; +#endif +} + int lm_ggml_cpu_has_fma(void) { #if defined(__FMA__) return 1; @@ -22043,6 +22759,16 @@ int lm_ggml_cpu_has_neon(void) { #endif } +int lm_ggml_cpu_has_sve(void) { +#if defined(__ARM_FEATURE_SVE) + // TODO: Currently, SVE 256 bit is only supported. + LM_GGML_ASSERT(svcntb() == QK8_0); + return 1; +#else + return 0; +#endif +} + int lm_ggml_cpu_has_arm_fma(void) { #if defined(__ARM_FEATURE_FMA) return 1; diff --git a/cpp/ggml.h b/cpp/ggml.h index 6a348d23..1c5d9f31 100644 --- a/cpp/ggml.h +++ b/cpp/ggml.h @@ -326,14 +326,20 @@ extern "C" { // get lm_ggml_status name string LM_GGML_API LM_GGML_CALL const char * lm_ggml_status_to_string(enum lm_ggml_status status); + // ieee 754-2008 half-precision float16 + // todo: make this not an integral type typedef uint16_t lm_ggml_fp16_t; - - // convert FP16 <-> FP32 - LM_GGML_API float lm_ggml_fp16_to_fp32(lm_ggml_fp16_t x); - LM_GGML_API lm_ggml_fp16_t lm_ggml_fp32_to_fp16(float x); - - LM_GGML_API void lm_ggml_fp16_to_fp32_row(const lm_ggml_fp16_t * x, float * y, int64_t n); - LM_GGML_API void lm_ggml_fp32_to_fp16_row(const float * x, lm_ggml_fp16_t * y, int64_t n); + LM_GGML_API float lm_ggml_fp16_to_fp32(lm_ggml_fp16_t); + LM_GGML_API lm_ggml_fp16_t lm_ggml_fp32_to_fp16(float); + LM_GGML_API void lm_ggml_fp16_to_fp32_row(const lm_ggml_fp16_t *, float *, int64_t); + LM_GGML_API void lm_ggml_fp32_to_fp16_row(const float *, lm_ggml_fp16_t *, int64_t); + + // google brain half-precision bfloat16 + typedef struct { 
uint16_t bits; } lm_ggml_bf16_t; + LM_GGML_API lm_ggml_bf16_t lm_ggml_fp32_to_bf16(float); + LM_GGML_API float lm_ggml_bf16_to_fp32(lm_ggml_bf16_t); // consider just doing << 16 + LM_GGML_API void lm_ggml_bf16_to_fp32_row(const lm_ggml_bf16_t *, float *, int64_t); + LM_GGML_API void lm_ggml_fp32_to_bf16_row(const float *, lm_ggml_bf16_t *, int64_t); struct lm_ggml_object; struct lm_ggml_context; @@ -370,6 +376,7 @@ extern "C" { LM_GGML_TYPE_I64 = 27, LM_GGML_TYPE_F64 = 28, LM_GGML_TYPE_IQ1_M = 29, + LM_GGML_TYPE_BF16 = 30, LM_GGML_TYPE_COUNT, }; @@ -410,6 +417,7 @@ extern "C" { LM_GGML_FTYPE_MOSTLY_IQ2_S = 21, // except 1d tensors LM_GGML_FTYPE_MOSTLY_IQ4_XS = 22, // except 1d tensors LM_GGML_FTYPE_MOSTLY_IQ1_M = 23, // except 1d tensors + LM_GGML_FTYPE_MOSTLY_BF16 = 24, // except 1d tensors }; // available tensor operations: @@ -460,7 +468,6 @@ extern "C" { LM_GGML_OP_SOFT_MAX_BACK, LM_GGML_OP_ROPE, LM_GGML_OP_ROPE_BACK, - LM_GGML_OP_ALIBI, LM_GGML_OP_CLAMP, LM_GGML_OP_CONV_TRANSPOSE_1D, LM_GGML_OP_IM2COL, @@ -474,9 +481,7 @@ extern "C" { LM_GGML_OP_ARGSORT, LM_GGML_OP_LEAKY_RELU, - LM_GGML_OP_FLASH_ATTN, LM_GGML_OP_FLASH_ATTN_EXT, - LM_GGML_OP_FLASH_FF, LM_GGML_OP_FLASH_ATTN_BACK, LM_GGML_OP_SSM_CONV, LM_GGML_OP_SSM_SCAN, @@ -512,6 +517,7 @@ extern "C" { LM_GGML_UNARY_OP_TANH, LM_GGML_UNARY_OP_ELU, LM_GGML_UNARY_OP_RELU, + LM_GGML_UNARY_OP_SIGMOID, LM_GGML_UNARY_OP_GELU, LM_GGML_UNARY_OP_GELU_QUICK, LM_GGML_UNARY_OP_SILU, @@ -557,7 +563,8 @@ extern "C" { // n-dimensional tensor struct lm_ggml_tensor { enum lm_ggml_type type; - enum lm_ggml_backend_type backend; + + LM_GGML_DEPRECATED(enum lm_ggml_backend_type backend, "use the buffer type to find the storage location of the tensor"); struct lm_ggml_backend_buffer * buffer; @@ -758,7 +765,8 @@ extern "C" { LM_GGML_API bool lm_ggml_is_3d (const struct lm_ggml_tensor * tensor); LM_GGML_API int lm_ggml_n_dims (const struct lm_ggml_tensor * tensor); // returns 1 for scalars - LM_GGML_API bool 
lm_ggml_are_same_shape(const struct lm_ggml_tensor * t0, const struct lm_ggml_tensor * t1); + LM_GGML_API bool lm_ggml_are_same_shape (const struct lm_ggml_tensor * t0, const struct lm_ggml_tensor * t1); + LM_GGML_API bool lm_ggml_are_same_stride(const struct lm_ggml_tensor * t0, const struct lm_ggml_tensor * t1); // use this to compute the memory overhead of a tensor LM_GGML_API size_t lm_ggml_tensor_overhead(void); @@ -999,12 +1007,13 @@ extern "C" { struct lm_ggml_tensor * a, struct lm_ggml_tensor * b); - // concat a and b on dim 2 + // concat a and b along dim // used in stable-diffusion LM_GGML_API struct lm_ggml_tensor * lm_ggml_concat( struct lm_ggml_context * ctx, struct lm_ggml_tensor * a, - struct lm_ggml_tensor * b); + struct lm_ggml_tensor * b, + int dim); LM_GGML_API struct lm_ggml_tensor * lm_ggml_abs( struct lm_ggml_context * ctx, @@ -1066,6 +1075,14 @@ extern "C" { struct lm_ggml_context * ctx, struct lm_ggml_tensor * a); + LM_GGML_API struct lm_ggml_tensor * lm_ggml_sigmoid( + struct lm_ggml_context * ctx, + struct lm_ggml_tensor * a); + + LM_GGML_API struct lm_ggml_tensor * lm_ggml_sigmoid_inplace( + struct lm_ggml_context * ctx, + struct lm_ggml_tensor * a); + LM_GGML_API struct lm_ggml_tensor * lm_ggml_gelu( struct lm_ggml_context * ctx, struct lm_ggml_tensor * a); @@ -1420,15 +1437,13 @@ extern "C" { struct lm_ggml_context * ctx, struct lm_ggml_tensor * a); - // fused soft_max(a*scale + mask + pos[i]*(ALiBi slope)) + // fused soft_max(a*scale + mask*(ALiBi slope)) // mask is optional - // pos is required when max_bias > 0.0f // max_bias = 0.0f for no ALiBi LM_GGML_API struct lm_ggml_tensor * lm_ggml_soft_max_ext( struct lm_ggml_context * ctx, struct lm_ggml_tensor * a, struct lm_ggml_tensor * mask, - struct lm_ggml_tensor * pos, float scale, float max_bias); @@ -1444,11 +1459,12 @@ extern "C" { struct lm_ggml_tensor * b); // rotary position embedding - // if mode & 1 == 1, skip n_past elements (DEPRECATED) + // if mode & 1 == 1, skip n_past 
elements (NOT SUPPORTED) // if mode & 2 == 1, GPT-NeoX style // if mode & 4 == 1, ChatGLM style // // b is an int32 vector with size a->ne[2], it contains the positions + // c is freq factors (e.g. phi3-128k), (optional) LM_GGML_API struct lm_ggml_tensor * lm_ggml_rope( struct lm_ggml_context * ctx, struct lm_ggml_tensor * a, @@ -1467,10 +1483,11 @@ extern "C" { int n_ctx); // custom RoPE - LM_GGML_API struct lm_ggml_tensor * lm_ggml_rope_custom( + LM_GGML_API struct lm_ggml_tensor * lm_ggml_rope_ext( struct lm_ggml_context * ctx, struct lm_ggml_tensor * a, struct lm_ggml_tensor * b, + struct lm_ggml_tensor * c, int n_dims, int mode, int n_ctx, @@ -1483,10 +1500,11 @@ extern "C" { float beta_slow); // in-place, returns view(a) - LM_GGML_API struct lm_ggml_tensor * lm_ggml_rope_custom_inplace( + LM_GGML_API struct lm_ggml_tensor * lm_ggml_rope_ext_inplace( struct lm_ggml_context * ctx, struct lm_ggml_tensor * a, struct lm_ggml_tensor * b, + struct lm_ggml_tensor * c, int n_dims, int mode, int n_ctx, @@ -1498,18 +1516,41 @@ extern "C" { float beta_fast, float beta_slow); - // compute correction dims for YaRN RoPE scaling - LM_GGML_CALL void lm_ggml_rope_yarn_corr_dims( - int n_dims, int n_orig_ctx, float freq_base, float beta_fast, float beta_slow, float dims[2]); + LM_GGML_DEPRECATED(LM_GGML_API struct lm_ggml_tensor * lm_ggml_rope_custom( + struct lm_ggml_context * ctx, + struct lm_ggml_tensor * a, + struct lm_ggml_tensor * b, + int n_dims, + int mode, + int n_ctx, + int n_orig_ctx, + float freq_base, + float freq_scale, + float ext_factor, + float attn_factor, + float beta_fast, + float beta_slow), + "use lm_ggml_rope_ext instead"); - // xPos RoPE, in-place, returns view(a) - LM_GGML_API struct lm_ggml_tensor * lm_ggml_rope_xpos_inplace( + LM_GGML_DEPRECATED(LM_GGML_API struct lm_ggml_tensor * lm_ggml_rope_custom_inplace( struct lm_ggml_context * ctx, struct lm_ggml_tensor * a, struct lm_ggml_tensor * b, int n_dims, - float base, - bool down); + int mode, + int 
n_ctx, + int n_orig_ctx, + float freq_base, + float freq_scale, + float ext_factor, + float attn_factor, + float beta_fast, + float beta_slow), + "use lm_ggml_rope_ext_inplace instead"); + + // compute correction dims for YaRN RoPE scaling + LM_GGML_CALL void lm_ggml_rope_yarn_corr_dims( + int n_dims, int n_orig_ctx, float freq_base, float beta_fast, float beta_slow, float dims[2]); // rotary position embedding backward, i.e compute dx from dy // a - dy @@ -1517,6 +1558,7 @@ extern "C" { struct lm_ggml_context * ctx, struct lm_ggml_tensor * a, struct lm_ggml_tensor * b, + struct lm_ggml_tensor * c, int n_dims, int mode, int n_ctx, @@ -1530,16 +1572,6 @@ extern "C" { float xpos_base, bool xpos_down); - // alibi position embedding - // in-place, returns view(a) - LM_GGML_DEPRECATED(LM_GGML_API struct lm_ggml_tensor * lm_ggml_alibi( - struct lm_ggml_context * ctx, - struct lm_ggml_tensor * a, - int n_past, - int n_head, - float bias_max), - "use lm_ggml_soft_max_ext instead (will be removed in Mar 2024)"); - // clamp // in-place, returns view(a) LM_GGML_API struct lm_ggml_tensor * lm_ggml_clamp( @@ -1669,12 +1701,24 @@ extern "C" { float p1); // nearest interpolate + // multiplies ne0 and ne1 by scale factor // used in stable-diffusion LM_GGML_API struct lm_ggml_tensor * lm_ggml_upscale( struct lm_ggml_context * ctx, struct lm_ggml_tensor * a, int scale_factor); + // nearest interpolate + // nearest interpolate to specified dimensions + // used in tortoise.cpp + LM_GGML_API struct lm_ggml_tensor * lm_ggml_upscale_ext( + struct lm_ggml_context * ctx, + struct lm_ggml_tensor * a, + int ne0, + int ne1, + int ne2, + int ne3); + // pad each dimension with zeros: [x, ..., x] -> [x, ..., x, 0, ..., 0] LM_GGML_API struct lm_ggml_tensor * lm_ggml_pad( struct lm_ggml_context * ctx, @@ -1716,13 +1760,6 @@ extern "C" { struct lm_ggml_tensor * a, int k); - LM_GGML_API struct lm_ggml_tensor * lm_ggml_flash_attn( - struct lm_ggml_context * ctx, - struct lm_ggml_tensor * q, - struct 
lm_ggml_tensor * k, - struct lm_ggml_tensor * v, - bool masked); - #define LM_GGML_KQ_MASK_PAD 32 // q: [n_embd, n_batch, n_head, 1] @@ -1736,12 +1773,14 @@ extern "C" { struct lm_ggml_tensor * k, struct lm_ggml_tensor * v, struct lm_ggml_tensor * mask, - float scale); + float scale, + float max_bias); LM_GGML_API void lm_ggml_flash_attn_ext_set_prec( struct lm_ggml_tensor * a, enum lm_ggml_prec prec); + // TODO: needs to be adapted to lm_ggml_flash_attn_ext LM_GGML_API struct lm_ggml_tensor * lm_ggml_flash_attn_back( struct lm_ggml_context * ctx, struct lm_ggml_tensor * q, @@ -1750,14 +1789,6 @@ extern "C" { struct lm_ggml_tensor * d, bool masked); - LM_GGML_API struct lm_ggml_tensor * lm_ggml_flash_ff( - struct lm_ggml_context * ctx, - struct lm_ggml_tensor * a, - struct lm_ggml_tensor * b0, - struct lm_ggml_tensor * b1, - struct lm_ggml_tensor * c0, - struct lm_ggml_tensor * c1); - LM_GGML_API struct lm_ggml_tensor * lm_ggml_ssm_conv( struct lm_ggml_context * ctx, struct lm_ggml_tensor * s, @@ -2371,8 +2402,10 @@ extern "C" { LM_GGML_API int lm_ggml_cpu_has_avx512 (void); LM_GGML_API int lm_ggml_cpu_has_avx512_vbmi(void); LM_GGML_API int lm_ggml_cpu_has_avx512_vnni(void); + LM_GGML_API int lm_ggml_cpu_has_avx512_bf16(void); LM_GGML_API int lm_ggml_cpu_has_fma (void); LM_GGML_API int lm_ggml_cpu_has_neon (void); + LM_GGML_API int lm_ggml_cpu_has_sve (void); LM_GGML_API int lm_ggml_cpu_has_arm_fma (void); LM_GGML_API int lm_ggml_cpu_has_metal (void); LM_GGML_API int lm_ggml_cpu_has_f16c (void); diff --git a/cpp/grammar-parser.cpp b/cpp/grammar-parser.cpp index 2a130156..b5bc7d49 100644 --- a/cpp/grammar-parser.cpp +++ b/cpp/grammar-parser.cpp @@ -26,7 +26,7 @@ namespace grammar_parser { static uint32_t get_symbol_id(parse_state & state, const char * src, size_t len) { uint32_t next_id = static_cast(state.symbol_ids.size()); - auto result = state.symbol_ids.insert(std::make_pair(std::string(src, len), next_id)); + auto result = 
state.symbol_ids.emplace(std::string(src, len), next_id); return result.first->second; } @@ -142,6 +142,9 @@ namespace grammar_parser { pos++; last_sym_start = out_elements.size(); while (*pos != '"') { + if (!*pos) { + throw std::runtime_error("unexpected end of input"); + } auto char_pair = parse_char(pos); pos = char_pair.second; out_elements.push_back({LLAMA_GRETYPE_CHAR, char_pair.first}); @@ -156,6 +159,9 @@ namespace grammar_parser { } last_sym_start = out_elements.size(); while (*pos != ']') { + if (!*pos) { + throw std::runtime_error("unexpected end of input"); + } auto char_pair = parse_char(pos); pos = char_pair.second; enum llama_gretype type = last_sym_start < out_elements.size() @@ -164,6 +170,9 @@ namespace grammar_parser { out_elements.push_back({type, char_pair.first}); if (pos[0] == '-' && pos[1] != ']') { + if (!pos[1]) { + throw std::runtime_error("unexpected end of input"); + } auto endchar_pair = parse_char(pos + 1); pos = endchar_pair.second; out_elements.push_back({LLAMA_GRETYPE_CHAR_RNG_UPPER, endchar_pair.first}); diff --git a/cpp/json-schema-to-grammar.cpp b/cpp/json-schema-to-grammar.cpp index 0f8f1b1d..9a71f5d8 100644 --- a/cpp/json-schema-to-grammar.cpp +++ b/cpp/json-schema-to-grammar.cpp @@ -272,7 +272,7 @@ class SchemaConverter { if (literal.empty()) { return false; } - ret.push_back(std::make_pair(literal, true)); + ret.emplace_back(literal, true); literal.clear(); return true; }; @@ -298,7 +298,7 @@ class SchemaConverter { while (i < length) { char c = sub_pattern[i]; if (c == '.') { - seq.push_back(std::make_pair(get_dot(), false)); + seq.emplace_back(get_dot(), false); i++; } else if (c == '(') { i++; @@ -307,7 +307,7 @@ class SchemaConverter { _warnings.push_back("Unsupported pattern syntax"); } } - seq.push_back(std::make_pair("(" + to_rule(transform()) + ")", false)); + seq.emplace_back("(" + to_rule(transform()) + ")", false); } else if (c == ')') { i++; if (start > 0 && sub_pattern[start - 1] != '(') { @@ -331,9 +331,9 @@ 
class SchemaConverter { } square_brackets += ']'; i++; - seq.push_back(std::make_pair(square_brackets, false)); + seq.emplace_back(square_brackets, false); } else if (c == '|') { - seq.push_back(std::make_pair("|", false)); + seq.emplace_back("|", false); i++; } else if (c == '*' || c == '+' || c == '?') { seq.back() = std::make_pair(to_rule(seq.back()) + c, false); @@ -417,7 +417,7 @@ class SchemaConverter { } } if (!literal.empty()) { - seq.push_back(std::make_pair(literal, true)); + seq.emplace_back(literal, true); } } } diff --git a/cpp/json-schema-to-grammar.h b/cpp/json-schema-to-grammar.h index e1abed30..0f8ffab1 100644 --- a/cpp/json-schema-to-grammar.h +++ b/cpp/json-schema-to-grammar.h @@ -1,4 +1,8 @@ #pragma once + +#include "ggml.h" +// Change JSON_ASSERT from assert() to LM_GGML_ASSERT: +#define JSON_ASSERT LM_GGML_ASSERT #include "json.hpp" std::string json_schema_to_grammar(const nlohmann::ordered_json& schema); diff --git a/cpp/llama.cpp b/cpp/llama.cpp index b5668e2f..5084bee1 100644 --- a/cpp/llama.cpp +++ b/cpp/llama.cpp @@ -7,6 +7,10 @@ #include "ggml-alloc.h" #include "ggml-backend.h" +#ifdef LM_GGML_USE_RPC +# include "ggml-rpc.h" +#endif + #ifdef LM_GGML_USE_CUDA # include "ggml-cuda.h" #elif defined(LM_GGML_USE_CLBLAST) @@ -22,16 +26,9 @@ #ifdef LM_GGML_USE_METAL # include "ggml-metal.h" #endif -#ifdef LM_GGML_USE_MPI -# include "ggml-mpi.h" -#endif -#ifndef QK_K -# ifdef LM_GGML_QKK_64 -# define QK_K 64 -# else -# define QK_K 256 -# endif -#endif + +// TODO: replace with ggml API call +#define QK_K 256 #ifdef __has_include #if __has_include() @@ -106,7 +103,7 @@ #endif #define LLAMA_MAX_NODES 8192 -#define LLAMA_MAX_EXPERTS 60 +#define LLAMA_MAX_EXPERTS 160 // // logging @@ -212,10 +209,10 @@ enum llm_arch { LLM_ARCH_GPTNEOX, LLM_ARCH_MPT, LLM_ARCH_STARCODER, - LLM_ARCH_PERSIMMON, LLM_ARCH_REFACT, LLM_ARCH_BERT, LLM_ARCH_NOMIC_BERT, + LLM_ARCH_JINA_BERT_V2, LLM_ARCH_BLOOM, LLM_ARCH_STABLELM, LLM_ARCH_QWEN, @@ -235,43 +232,47 @@ enum 
llm_arch { LLM_ARCH_COMMAND_R, LLM_ARCH_DBRX, LLM_ARCH_OLMO, + LLM_ARCH_ARCTIC, + LLM_ARCH_DEEPSEEK2, LLM_ARCH_UNKNOWN, }; static const std::map LLM_ARCH_NAMES = { - { LLM_ARCH_LLAMA, "llama" }, - { LLM_ARCH_FALCON, "falcon" }, - { LLM_ARCH_GROK, "grok" }, - { LLM_ARCH_GPT2, "gpt2" }, - { LLM_ARCH_GPTJ, "gptj" }, - { LLM_ARCH_GPTNEOX, "gptneox" }, - { LLM_ARCH_MPT, "mpt" }, - { LLM_ARCH_BAICHUAN, "baichuan" }, - { LLM_ARCH_STARCODER, "starcoder" }, - { LLM_ARCH_PERSIMMON, "persimmon" }, - { LLM_ARCH_REFACT, "refact" }, - { LLM_ARCH_BERT, "bert" }, - { LLM_ARCH_NOMIC_BERT, "nomic-bert" }, - { LLM_ARCH_BLOOM, "bloom" }, - { LLM_ARCH_STABLELM, "stablelm" }, - { LLM_ARCH_QWEN, "qwen" }, - { LLM_ARCH_QWEN2, "qwen2" }, - { LLM_ARCH_QWEN2MOE, "qwen2moe" }, - { LLM_ARCH_PHI2, "phi2" }, - { LLM_ARCH_PHI3, "phi3" }, - { LLM_ARCH_PLAMO, "plamo" }, - { LLM_ARCH_CODESHELL, "codeshell" }, - { LLM_ARCH_ORION, "orion" }, - { LLM_ARCH_INTERNLM2, "internlm2" }, - { LLM_ARCH_MINICPM, "minicpm" }, - { LLM_ARCH_GEMMA, "gemma" }, - { LLM_ARCH_STARCODER2, "starcoder2" }, - { LLM_ARCH_MAMBA, "mamba" }, - { LLM_ARCH_XVERSE, "xverse" }, - { LLM_ARCH_COMMAND_R, "command-r" }, - { LLM_ARCH_DBRX, "dbrx" }, - { LLM_ARCH_OLMO, "olmo" }, - { LLM_ARCH_UNKNOWN, "(unknown)" }, + { LLM_ARCH_LLAMA, "llama" }, + { LLM_ARCH_FALCON, "falcon" }, + { LLM_ARCH_GROK, "grok" }, + { LLM_ARCH_GPT2, "gpt2" }, + { LLM_ARCH_GPTJ, "gptj" }, + { LLM_ARCH_GPTNEOX, "gptneox" }, + { LLM_ARCH_MPT, "mpt" }, + { LLM_ARCH_BAICHUAN, "baichuan" }, + { LLM_ARCH_STARCODER, "starcoder" }, + { LLM_ARCH_REFACT, "refact" }, + { LLM_ARCH_BERT, "bert" }, + { LLM_ARCH_NOMIC_BERT, "nomic-bert" }, + { LLM_ARCH_JINA_BERT_V2, "jina-bert-v2" }, + { LLM_ARCH_BLOOM, "bloom" }, + { LLM_ARCH_STABLELM, "stablelm" }, + { LLM_ARCH_QWEN, "qwen" }, + { LLM_ARCH_QWEN2, "qwen2" }, + { LLM_ARCH_QWEN2MOE, "qwen2moe" }, + { LLM_ARCH_PHI2, "phi2" }, + { LLM_ARCH_PHI3, "phi3" }, + { LLM_ARCH_PLAMO, "plamo" }, + { LLM_ARCH_CODESHELL, "codeshell" }, + { 
LLM_ARCH_ORION, "orion" }, + { LLM_ARCH_INTERNLM2, "internlm2" }, + { LLM_ARCH_MINICPM, "minicpm" }, + { LLM_ARCH_GEMMA, "gemma" }, + { LLM_ARCH_STARCODER2, "starcoder2" }, + { LLM_ARCH_MAMBA, "mamba" }, + { LLM_ARCH_XVERSE, "xverse" }, + { LLM_ARCH_COMMAND_R, "command-r" }, + { LLM_ARCH_DBRX, "dbrx" }, + { LLM_ARCH_OLMO, "olmo" }, + { LLM_ARCH_ARCTIC, "arctic" }, + { LLM_ARCH_DEEPSEEK2, "deepseek2" }, + { LLM_ARCH_UNKNOWN, "(unknown)" }, }; enum llm_kv { @@ -291,11 +292,15 @@ enum llm_kv { LLM_KV_CONTEXT_LENGTH, LLM_KV_EMBEDDING_LENGTH, LLM_KV_BLOCK_COUNT, + LLM_KV_LEADING_DENSE_BLOCK_COUNT, LLM_KV_FEED_FORWARD_LENGTH, + LLM_KV_EXPERT_FEED_FORWARD_LENGTH, LLM_KV_USE_PARALLEL_RESIDUAL, LLM_KV_TENSOR_DATA_LAYOUT, LLM_KV_EXPERT_COUNT, LLM_KV_EXPERT_USED_COUNT, + LLM_KV_EXPERT_SHARED_COUNT, + LLM_KV_EXPERT_WEIGHTS_SCALE, LLM_KV_POOLING_TYPE, LLM_KV_LOGIT_SCALE, @@ -308,14 +313,18 @@ enum llm_kv { LLM_KV_ATTENTION_LAYERNORM_EPS, LLM_KV_ATTENTION_LAYERNORM_RMS_EPS, LLM_KV_ATTENTION_CAUSAL, + LLM_KV_ATTENTION_Q_LORA_RANK, + LLM_KV_ATTENTION_KV_LORA_RANK, LLM_KV_ROPE_DIMENSION_COUNT, LLM_KV_ROPE_FREQ_BASE, LLM_KV_ROPE_SCALE_LINEAR, LLM_KV_ROPE_SCALING_TYPE, LLM_KV_ROPE_SCALING_FACTOR, + LLM_KV_ROPE_SCALING_ATTN_FACTOR, LLM_KV_ROPE_SCALING_ORIG_CTX_LEN, LLM_KV_ROPE_SCALING_FINETUNED, + LLM_KV_ROPE_SCALING_YARN_LOG_MUL, LLM_KV_SPLIT_NO, LLM_KV_SPLIT_COUNT, @@ -364,17 +373,21 @@ static const std::map LLM_KV_NAMES = { { LLM_KV_GENERAL_SOURCE_URL, "general.source.url" }, { LLM_KV_GENERAL_SOURCE_HF_REPO, "general.source.huggingface.repository" }, - { LLM_KV_VOCAB_SIZE, "%s.vocab_size" }, - { LLM_KV_CONTEXT_LENGTH, "%s.context_length" }, - { LLM_KV_EMBEDDING_LENGTH, "%s.embedding_length" }, - { LLM_KV_BLOCK_COUNT, "%s.block_count" }, - { LLM_KV_FEED_FORWARD_LENGTH, "%s.feed_forward_length" }, - { LLM_KV_USE_PARALLEL_RESIDUAL, "%s.use_parallel_residual" }, - { LLM_KV_TENSOR_DATA_LAYOUT, "%s.tensor_data_layout" }, - { LLM_KV_EXPERT_COUNT, "%s.expert_count" }, - { 
LLM_KV_EXPERT_USED_COUNT, "%s.expert_used_count" }, - { LLM_KV_POOLING_TYPE , "%s.pooling_type" }, - { LLM_KV_LOGIT_SCALE, "%s.logit_scale" }, + { LLM_KV_VOCAB_SIZE, "%s.vocab_size" }, + { LLM_KV_CONTEXT_LENGTH, "%s.context_length" }, + { LLM_KV_EMBEDDING_LENGTH, "%s.embedding_length" }, + { LLM_KV_BLOCK_COUNT, "%s.block_count" }, + { LLM_KV_LEADING_DENSE_BLOCK_COUNT, "%s.leading_dense_block_count" }, + { LLM_KV_FEED_FORWARD_LENGTH, "%s.feed_forward_length" }, + { LLM_KV_EXPERT_FEED_FORWARD_LENGTH, "%s.expert_feed_forward_length" }, + { LLM_KV_USE_PARALLEL_RESIDUAL, "%s.use_parallel_residual" }, + { LLM_KV_TENSOR_DATA_LAYOUT, "%s.tensor_data_layout" }, + { LLM_KV_EXPERT_COUNT, "%s.expert_count" }, + { LLM_KV_EXPERT_USED_COUNT, "%s.expert_used_count" }, + { LLM_KV_EXPERT_SHARED_COUNT, "%s.expert_shared_count" }, + { LLM_KV_EXPERT_WEIGHTS_SCALE, "%s.expert_weights_scale" }, + { LLM_KV_POOLING_TYPE , "%s.pooling_type" }, + { LLM_KV_LOGIT_SCALE, "%s.logit_scale" }, { LLM_KV_ATTENTION_HEAD_COUNT, "%s.attention.head_count" }, { LLM_KV_ATTENTION_HEAD_COUNT_KV, "%s.attention.head_count_kv" }, @@ -385,14 +398,18 @@ static const std::map LLM_KV_NAMES = { { LLM_KV_ATTENTION_LAYERNORM_EPS, "%s.attention.layer_norm_epsilon" }, { LLM_KV_ATTENTION_LAYERNORM_RMS_EPS, "%s.attention.layer_norm_rms_epsilon" }, { LLM_KV_ATTENTION_CAUSAL, "%s.attention.causal" }, + { LLM_KV_ATTENTION_Q_LORA_RANK, "%s.attention.q_lora_rank" }, + { LLM_KV_ATTENTION_KV_LORA_RANK, "%s.attention.kv_lora_rank" }, { LLM_KV_ROPE_DIMENSION_COUNT, "%s.rope.dimension_count" }, { LLM_KV_ROPE_FREQ_BASE, "%s.rope.freq_base" }, { LLM_KV_ROPE_SCALE_LINEAR, "%s.rope.scale_linear" }, { LLM_KV_ROPE_SCALING_TYPE, "%s.rope.scaling.type" }, { LLM_KV_ROPE_SCALING_FACTOR, "%s.rope.scaling.factor" }, + { LLM_KV_ROPE_SCALING_ATTN_FACTOR, "%s.rope.scaling.attn_factor" }, { LLM_KV_ROPE_SCALING_ORIG_CTX_LEN, "%s.rope.scaling.original_context_length" }, { LLM_KV_ROPE_SCALING_FINETUNED, "%s.rope.scaling.finetuned" }, + { 
LLM_KV_ROPE_SCALING_YARN_LOG_MUL, "%s.rope.scaling.yarn_log_multiplier" }, { LLM_KV_SPLIT_NO, "split.no" }, { LLM_KV_SPLIT_COUNT, "split.count" }, @@ -446,6 +463,8 @@ enum llm_tensor { LLM_TENSOR_OUTPUT, LLM_TENSOR_OUTPUT_NORM, LLM_TENSOR_ROPE_FREQS, + LLM_TENSOR_ROPE_FACTORS_LONG, + LLM_TENSOR_ROPE_FACTORS_SHORT, LLM_TENSOR_ATTN_Q, LLM_TENSOR_ATTN_K, LLM_TENSOR_ATTN_V, @@ -465,6 +484,7 @@ enum llm_tensor { LLM_TENSOR_FFN_DOWN_EXP, // split experts for backward compatibility LLM_TENSOR_FFN_GATE_EXP, LLM_TENSOR_FFN_UP_EXP, + LLM_TENSOR_FFN_NORM_EXPS, LLM_TENSOR_FFN_DOWN_EXPS, // merged experts LLM_TENSOR_FFN_GATE_EXPS, LLM_TENSOR_FFN_UP_EXPS, @@ -481,6 +501,12 @@ enum llm_tensor { LLM_TENSOR_SSM_A, LLM_TENSOR_SSM_D, LLM_TENSOR_SSM_OUT, + LLM_TENSOR_ATTN_Q_A, + LLM_TENSOR_ATTN_Q_B, + LLM_TENSOR_ATTN_KV_A_MQA, + LLM_TENSOR_ATTN_KV_B, + LLM_TENSOR_ATTN_Q_A_NORM, + LLM_TENSOR_ATTN_KV_A_NORM, }; static const std::map> LLM_TENSOR_NAMES = { @@ -603,23 +629,6 @@ static const std::map> LLM_TENSOR_NA { LLM_TENSOR_FFN_UP, "blk.%d.ffn_up" }, }, }, - { - LLM_ARCH_PERSIMMON, - { - { LLM_TENSOR_TOKEN_EMBD, "token_embd"}, - { LLM_TENSOR_OUTPUT_NORM, "output_norm"}, - { LLM_TENSOR_OUTPUT, "output"}, - { LLM_TENSOR_ATTN_NORM, "blk.%d.attn_norm"}, - { LLM_TENSOR_ATTN_QKV, "blk.%d.attn_qkv"}, - { LLM_TENSOR_ATTN_OUT, "blk.%d.attn_output"}, - { LLM_TENSOR_ATTN_Q_NORM, "blk.%d.attn_q_norm"}, - { LLM_TENSOR_ATTN_K_NORM, "blk.%d.attn_k_norm"}, - { LLM_TENSOR_FFN_NORM, "blk.%d.ffn_norm"}, - { LLM_TENSOR_FFN_DOWN, "blk.%d.ffn_down"}, - { LLM_TENSOR_FFN_UP, "blk.%d.ffn_up"}, - { LLM_TENSOR_ATTN_ROT_EMBD, "blk.%d.attn_rot_embd"}, - }, - }, { LLM_ARCH_MPT, { @@ -702,6 +711,25 @@ static const std::map> LLM_TENSOR_NA { LLM_TENSOR_FFN_UP, "blk.%d.ffn_up" }, }, }, + { + LLM_ARCH_JINA_BERT_V2, + { + { LLM_TENSOR_TOKEN_EMBD, "token_embd" }, + { LLM_TENSOR_TOKEN_EMBD_NORM, "token_embd_norm" }, + { LLM_TENSOR_TOKEN_TYPES, "token_types" }, + { LLM_TENSOR_ATTN_OUT_NORM, "blk.%d.attn_output_norm" }, + { 
LLM_TENSOR_ATTN_Q, "blk.%d.attn_q" }, + { LLM_TENSOR_ATTN_Q_NORM, "blk.%d.attn_q_norm" }, + { LLM_TENSOR_ATTN_K, "blk.%d.attn_k" }, + { LLM_TENSOR_ATTN_K_NORM, "blk.%d.attn_k_norm" }, + { LLM_TENSOR_ATTN_V, "blk.%d.attn_v" }, + { LLM_TENSOR_ATTN_OUT, "blk.%d.attn_output" }, + { LLM_TENSOR_LAYER_OUT_NORM, "blk.%d.layer_output_norm" }, + { LLM_TENSOR_FFN_DOWN, "blk.%d.ffn_down" }, + { LLM_TENSOR_FFN_GATE, "blk.%d.ffn_gate" }, + { LLM_TENSOR_FFN_UP, "blk.%d.ffn_up" }, + }, + }, { LLM_ARCH_BLOOM, { @@ -811,18 +839,20 @@ static const std::map> LLM_TENSOR_NA { LLM_ARCH_PHI3, { - { LLM_TENSOR_TOKEN_EMBD, "token_embd" }, - { LLM_TENSOR_OUTPUT_NORM, "output_norm" }, - { LLM_TENSOR_OUTPUT, "output" }, - { LLM_TENSOR_ATTN_NORM, "blk.%d.attn_norm" }, - { LLM_TENSOR_ATTN_QKV, "blk.%d.attn_qkv" }, - { LLM_TENSOR_ATTN_Q, "blk.%d.attn_q" }, - { LLM_TENSOR_ATTN_K, "blk.%d.attn_k" }, - { LLM_TENSOR_ATTN_V, "blk.%d.attn_v" }, - { LLM_TENSOR_ATTN_OUT, "blk.%d.attn_output" }, - { LLM_TENSOR_FFN_NORM, "blk.%d.ffn_norm" }, - { LLM_TENSOR_FFN_DOWN, "blk.%d.ffn_down" }, - { LLM_TENSOR_FFN_UP, "blk.%d.ffn_up" }, + { LLM_TENSOR_TOKEN_EMBD, "token_embd" }, + { LLM_TENSOR_OUTPUT_NORM, "output_norm" }, + { LLM_TENSOR_OUTPUT, "output" }, + { LLM_TENSOR_ROPE_FACTORS_LONG, "rope_factors_long" }, + { LLM_TENSOR_ROPE_FACTORS_SHORT, "rope_factors_short" }, + { LLM_TENSOR_ATTN_NORM, "blk.%d.attn_norm" }, + { LLM_TENSOR_ATTN_QKV, "blk.%d.attn_qkv" }, + { LLM_TENSOR_ATTN_Q, "blk.%d.attn_q" }, + { LLM_TENSOR_ATTN_K, "blk.%d.attn_k" }, + { LLM_TENSOR_ATTN_V, "blk.%d.attn_v" }, + { LLM_TENSOR_ATTN_OUT, "blk.%d.attn_output" }, + { LLM_TENSOR_FFN_NORM, "blk.%d.ffn_norm" }, + { LLM_TENSOR_FFN_DOWN, "blk.%d.ffn_down" }, + { LLM_TENSOR_FFN_UP, "blk.%d.ffn_up" }, }, }, { @@ -1038,6 +1068,57 @@ static const std::map> LLM_TENSOR_NA { LLM_TENSOR_FFN_UP, "blk.%d.ffn_up" }, }, }, + { + LLM_ARCH_ARCTIC, + { + { LLM_TENSOR_TOKEN_EMBD, "token_embd" }, + { LLM_TENSOR_OUTPUT_NORM, "output_norm" }, + { LLM_TENSOR_OUTPUT, 
"output" }, + { LLM_TENSOR_ATTN_NORM, "blk.%d.attn_norm" }, + { LLM_TENSOR_ATTN_Q, "blk.%d.attn_q" }, + { LLM_TENSOR_ATTN_K, "blk.%d.attn_k" }, + { LLM_TENSOR_ATTN_V, "blk.%d.attn_v" }, + { LLM_TENSOR_ATTN_OUT, "blk.%d.attn_output" }, + { LLM_TENSOR_FFN_GATE_INP, "blk.%d.ffn_gate_inp" }, + { LLM_TENSOR_FFN_NORM, "blk.%d.ffn_norm" }, + { LLM_TENSOR_FFN_GATE, "blk.%d.ffn_gate" }, + { LLM_TENSOR_FFN_DOWN, "blk.%d.ffn_down" }, + { LLM_TENSOR_FFN_UP, "blk.%d.ffn_up" }, + { LLM_TENSOR_FFN_NORM_EXPS, "blk.%d.ffn_norm_exps" }, + { LLM_TENSOR_FFN_GATE_EXPS, "blk.%d.ffn_gate_exps" }, + { LLM_TENSOR_FFN_DOWN_EXPS, "blk.%d.ffn_down_exps" }, + { LLM_TENSOR_FFN_UP_EXPS, "blk.%d.ffn_up_exps" }, + }, + }, + { + LLM_ARCH_DEEPSEEK2, + { + { LLM_TENSOR_TOKEN_EMBD, "token_embd" }, + { LLM_TENSOR_OUTPUT_NORM, "output_norm" }, + { LLM_TENSOR_OUTPUT, "output" }, + { LLM_TENSOR_ATTN_NORM, "blk.%d.attn_norm" }, + { LLM_TENSOR_ATTN_Q_A_NORM, "blk.%d.attn_q_a_norm" }, + { LLM_TENSOR_ATTN_KV_A_NORM, "blk.%d.attn_kv_a_norm" }, + { LLM_TENSOR_ATTN_Q, "blk.%d.attn_q" }, + { LLM_TENSOR_ATTN_Q_A, "blk.%d.attn_q_a" }, + { LLM_TENSOR_ATTN_Q_B, "blk.%d.attn_q_b" }, + { LLM_TENSOR_ATTN_KV_A_MQA, "blk.%d.attn_kv_a_mqa" }, + { LLM_TENSOR_ATTN_KV_B, "blk.%d.attn_kv_b" }, + { LLM_TENSOR_ATTN_OUT, "blk.%d.attn_output" }, + { LLM_TENSOR_FFN_NORM, "blk.%d.ffn_norm" }, + { LLM_TENSOR_FFN_GATE, "blk.%d.ffn_gate" }, + { LLM_TENSOR_FFN_UP, "blk.%d.ffn_up" }, + { LLM_TENSOR_FFN_DOWN, "blk.%d.ffn_down" }, + { LLM_TENSOR_FFN_GATE_INP, "blk.%d.ffn_gate_inp" }, + { LLM_TENSOR_FFN_GATE_EXPS, "blk.%d.ffn_gate_exps" }, + { LLM_TENSOR_FFN_DOWN_EXPS, "blk.%d.ffn_down_exps" }, + { LLM_TENSOR_FFN_UP_EXPS, "blk.%d.ffn_up_exps" }, + { LLM_TENSOR_FFN_GATE_INP_SHEXP, "blk.%d.ffn_gate_inp_shexp" }, + { LLM_TENSOR_FFN_GATE_SHEXP, "blk.%d.ffn_gate_shexp" }, + { LLM_TENSOR_FFN_DOWN_SHEXP, "blk.%d.ffn_down_shexp" }, + { LLM_TENSOR_FFN_UP_SHEXP, "blk.%d.ffn_up_shexp" }, + }, + }, { LLM_ARCH_UNKNOWN, { @@ -1675,91 +1756,6 @@ static 
lm_ggml_backend_buffer_type_t llama_default_buffer_type_cpu(bool host_buf LM_GGML_UNUSED(host_buffer); } -static lm_ggml_backend_buffer_type_t llama_default_buffer_type_offload(int gpu) { - lm_ggml_backend_buffer_type_t buft = nullptr; - -#ifdef LM_GGML_USE_METAL - buft = lm_ggml_backend_metal_buffer_type(); -#elif defined(LM_GGML_USE_CUDA) - buft = lm_ggml_backend_cuda_buffer_type(gpu); -#elif defined(LM_GGML_USE_VULKAN) - buft = lm_ggml_backend_vk_buffer_type(gpu); -#elif defined(LM_GGML_USE_SYCL) - buft = lm_ggml_backend_sycl_buffer_type(gpu); -#elif defined(LM_GGML_USE_CLBLAST) - buft = lm_ggml_backend_opencl_buffer_type(); -#elif defined(LM_GGML_USE_KOMPUTE) - buft = lm_ggml_backend_kompute_buffer_type(gpu); - if (buft == nullptr) { - LLAMA_LOG_WARN("%s: cannot use GPU %d, check `vulkaninfo --summary`\n", __func__, gpu); - } -#endif - - if (buft == nullptr) { - buft = llama_default_buffer_type_cpu(true); - } - return buft; - - LM_GGML_UNUSED(gpu); -} - -static lm_ggml_backend_buffer_type_t llama_default_buffer_type_split(int fallback_gpu, const float * tensor_split) { - lm_ggml_backend_buffer_type_t buft = nullptr; - -#ifdef LM_GGML_USE_CUDA - if (lm_ggml_backend_cuda_get_device_count() > 1) { - buft = lm_ggml_backend_cuda_split_buffer_type(tensor_split); - } -#endif - -#ifdef LM_GGML_USE_SYCL - if (lm_ggml_backend_sycl_get_device_count() > 1) { - buft = lm_ggml_backend_sycl_split_buffer_type(tensor_split); - } -#endif - - if (buft == nullptr) { - buft = llama_default_buffer_type_offload(fallback_gpu); - } - return buft; - - LM_GGML_UNUSED(tensor_split); -} - -static size_t llama_get_device_count() { -#if defined(LM_GGML_USE_CUDA) - return lm_ggml_backend_cuda_get_device_count(); -#elif defined(LM_GGML_USE_SYCL) - return lm_ggml_backend_sycl_get_device_count(); -#elif defined(LM_GGML_USE_VULKAN) - return lm_ggml_backend_vk_get_device_count(); -#else - return 1; -#endif -} - -static size_t llama_get_device_memory(int device) { -#if defined(LM_GGML_USE_CUDA) - 
size_t total; - size_t free; - lm_ggml_backend_cuda_get_device_memory(device, &free, &total); - return free; -#elif defined(LM_GGML_USE_SYCL) - size_t total; - size_t free; - lm_ggml_backend_sycl_get_device_memory(device, &free, &total); - return free; -#elif defined(LM_GGML_USE_VULKAN) - size_t total; - size_t free; - lm_ggml_backend_vk_get_device_memory(device, &free, &total); - return free; -#else - return 1; - LM_GGML_UNUSED(device); -#endif -} - // // globals // @@ -1768,6 +1764,8 @@ struct llama_state { llama_state() { #ifdef LM_GGML_USE_METAL lm_ggml_backend_metal_log_set_callback(log_callback, log_callback_user_data); +#elif defined(LM_GGML_USE_CUDA) + lm_ggml_backend_cuda_log_set_callback(log_callback, log_callback_user_data); #endif } @@ -1781,23 +1779,31 @@ static llama_state g_state; // available llama models enum e_model { MODEL_UNKNOWN, + MODEL_14M, MODEL_17M, MODEL_22M, MODEL_33M, + MODEL_70M, MODEL_109M, MODEL_137M, + MODEL_160M, MODEL_335M, + MODEL_410M, MODEL_0_5B, MODEL_1B, + MODEL_1_4B, MODEL_2B, + MODEL_2_8B, MODEL_3B, MODEL_4B, + MODEL_6_9B, MODEL_7B, MODEL_8B, MODEL_12B, MODEL_13B, MODEL_14B, MODEL_15B, + MODEL_16B, MODEL_20B, MODEL_30B, MODEL_34B, @@ -1805,6 +1811,7 @@ enum e_model { MODEL_40B, MODEL_65B, MODEL_70B, + MODEL_236B, MODEL_314B, MODEL_SMALL, MODEL_MEDIUM, @@ -1814,6 +1821,7 @@ enum e_model { MODEL_8x7B, MODEL_8x22B, MODEL_16x12B, + MODEL_10B_128x3_66B, }; static const size_t kiB = 1024; @@ -1823,6 +1831,7 @@ static const size_t GiB = 1024*MiB; struct llama_hparams { bool vocab_only; bool rope_finetuned; + bool use_par_res; uint32_t n_vocab; uint32_t n_ctx_train; // context size the model was trained on @@ -1838,12 +1847,21 @@ struct llama_hparams { uint32_t n_expert_used = 0; uint32_t n_vocab_type = 0; // for BERT-style token types + uint32_t n_layer_dense_lead = 0; + uint32_t n_lora_q = 0; + uint32_t n_lora_kv = 0; + uint32_t n_ff_exp = 0; + uint32_t n_expert_shared = 0; + float expert_weights_scale = 0.0; + float f_norm_eps; 
float f_norm_rms_eps; + float rope_attn_factor = 1.0f; float rope_freq_base_train; float rope_freq_scale_train; uint32_t n_yarn_orig_ctx; + float rope_yarn_log_mul; // for State Space Models uint32_t ssm_d_conv = 0; @@ -1856,7 +1874,7 @@ struct llama_hparams { float f_logit_scale = 0.0f; bool causal_attn = true; - bool use_alibi = false; // currently, we need KQ_pos data for ALiBi-based models + bool use_alibi = false; enum llama_pooling_type pooling_type = LLAMA_POOLING_TYPE_NONE; enum llama_rope_type rope_type = LLAMA_ROPE_TYPE_NONE; @@ -1877,6 +1895,12 @@ struct llama_hparams { if (this->n_expert != other.n_expert) return true; if (this->n_expert_used != other.n_expert_used) return true; + if (this->n_layer_dense_lead != other.n_layer_dense_lead) return true; + if (this->n_lora_q != other.n_lora_q) return true; + if (this->n_lora_kv != other.n_lora_kv) return true; + if (this->n_ff_exp != other.n_ff_exp) return true; + if (this->n_expert_shared != other.n_expert_shared) return true; + if (this->rope_finetuned != other.rope_finetuned) return true; if (this->n_yarn_orig_ctx != other.n_yarn_orig_ctx) return true; @@ -1889,8 +1913,11 @@ struct llama_hparams { if (!is_float_close(this->f_norm_eps, other.f_norm_eps, EPSILON)) return true; if (!is_float_close(this->f_norm_rms_eps, other.f_norm_rms_eps, EPSILON)) return true; + if (!is_float_close(this->rope_attn_factor, other.rope_attn_factor, EPSILON)) return true; if (!is_float_close(this->rope_freq_base_train, other.rope_freq_base_train, EPSILON)) return true; if (!is_float_close(this->rope_freq_scale_train, other.rope_freq_scale_train, EPSILON)) return true; + if (!is_float_close(this->expert_weights_scale, other.expert_weights_scale, EPSILON)) return true; + if (!is_float_close(this->rope_yarn_log_mul, other.rope_yarn_log_mul, EPSILON)) return true; return false; } @@ -1966,6 +1993,8 @@ struct llama_layer { struct lm_ggml_tensor * attn_k_norm_b; struct lm_ggml_tensor * attn_out_norm; struct lm_ggml_tensor * 
attn_out_norm_b; + struct lm_ggml_tensor * attn_q_a_norm; + struct lm_ggml_tensor * attn_kv_a_norm; // attention struct lm_ggml_tensor * wq; @@ -1973,6 +2002,10 @@ struct llama_layer { struct lm_ggml_tensor * wv; struct lm_ggml_tensor * wo; struct lm_ggml_tensor * wqkv; + struct lm_ggml_tensor * wq_a; + struct lm_ggml_tensor * wq_b; + struct lm_ggml_tensor * wkv_a_mqa; + struct lm_ggml_tensor * wkv_b; // attention bias struct lm_ggml_tensor * bq; @@ -1986,6 +2019,7 @@ struct llama_layer { struct lm_ggml_tensor * ffn_norm_b; struct lm_ggml_tensor * layer_out_norm; struct lm_ggml_tensor * layer_out_norm_b; + struct lm_ggml_tensor * ffn_norm_exps; // ff struct lm_ggml_tensor * ffn_gate; // w1 @@ -2005,8 +2039,9 @@ struct llama_layer { struct lm_ggml_tensor * ffn_up_shexp; // ff bias - struct lm_ggml_tensor * ffn_down_b; // b2 - struct lm_ggml_tensor * ffn_up_b; // b3 + struct lm_ggml_tensor * ffn_gate_b = nullptr; + struct lm_ggml_tensor * ffn_down_b = nullptr; // b2 + struct lm_ggml_tensor * ffn_up_b = nullptr; // b3 struct lm_ggml_tensor * ffn_act; // mamba proj @@ -2023,6 +2058,10 @@ struct llama_layer { // mamba bias struct lm_ggml_tensor * ssm_conv1d_b; struct lm_ggml_tensor * ssm_dt_b; + + // long rope factors + struct lm_ggml_tensor * rope_long = nullptr; + struct lm_ggml_tensor * rope_short = nullptr; }; struct llama_kv_cell { @@ -2134,7 +2173,7 @@ struct llama_vocab { std::unordered_map token_to_id; std::vector id_to_token; - std::unordered_map special_tokens_cache; + std::vector special_tokens_cache; std::map, int> bpe_ranks; @@ -2200,6 +2239,8 @@ struct llama_model { int main_gpu; int n_gpu_layers; + std::vector rpc_servers; + // gguf metadata std::unordered_map lm_gguf_kv; @@ -2328,7 +2369,6 @@ struct llama_context { struct lm_ggml_tensor * inp_pos; // I32 [n_batch] struct lm_ggml_tensor * inp_out_ids; // I32 [n_outputs] struct lm_ggml_tensor * inp_KQ_mask; // F32 [kv_size, n_batch] - struct lm_ggml_tensor * inp_KQ_pos; // F32 [n_kv] struct lm_ggml_tensor 
* inp_K_shift; // I32 [kv_size] struct lm_ggml_tensor * inp_mean; // F32 [n_batch, n_batch] struct lm_ggml_tensor * inp_cls; // I32 [n_batch] @@ -2338,11 +2378,105 @@ struct llama_context { // control vectors struct llama_control_vector cvec; +}; + +static lm_ggml_backend_buffer_type_t llama_default_buffer_type_offload(const llama_model & model, int gpu) { + lm_ggml_backend_buffer_type_t buft = nullptr; + +#ifdef LM_GGML_USE_RPC + std::string endpoint = model.rpc_servers[gpu]; + buft = lm_ggml_backend_rpc_buffer_type(endpoint.c_str()); +#elif defined(LM_GGML_USE_METAL) + buft = lm_ggml_backend_metal_buffer_type(); +#elif defined(LM_GGML_USE_CUDA) + buft = lm_ggml_backend_cuda_buffer_type(gpu); +#elif defined(LM_GGML_USE_VULKAN) + buft = lm_ggml_backend_vk_buffer_type(gpu); +#elif defined(LM_GGML_USE_SYCL) + buft = lm_ggml_backend_sycl_buffer_type(gpu); +#elif defined(LM_GGML_USE_CLBLAST) + buft = lm_ggml_backend_opencl_buffer_type(); +#elif defined(LM_GGML_USE_KOMPUTE) + buft = lm_ggml_backend_kompute_buffer_type(gpu); + if (buft == nullptr) { + LLAMA_LOG_WARN("%s: cannot use GPU %d, check `vulkaninfo --summary`\n", __func__, gpu); + } +#endif + + if (buft == nullptr) { + buft = llama_default_buffer_type_cpu(true); + } + return buft; + LM_GGML_UNUSED(model); + LM_GGML_UNUSED(gpu); +} + +static lm_ggml_backend_buffer_type_t llama_default_buffer_type_split(const llama_model & model, int fallback_gpu, const float * tensor_split) { + lm_ggml_backend_buffer_type_t buft = nullptr; -#ifdef LM_GGML_USE_MPI - lm_ggml_mpi_context * ctx_mpi = NULL; +#ifdef LM_GGML_USE_CUDA + if (lm_ggml_backend_cuda_get_device_count() > 1) { + buft = lm_ggml_backend_cuda_split_buffer_type(tensor_split); + } #endif -}; + +#ifdef LM_GGML_USE_SYCL + if (lm_ggml_backend_sycl_get_device_count() > 1) { + buft = lm_ggml_backend_sycl_split_buffer_type(tensor_split); + } +#endif + + if (buft == nullptr) { + buft = llama_default_buffer_type_offload(model, fallback_gpu); + } + return buft; + + 
LM_GGML_UNUSED(tensor_split); +} + +static size_t llama_get_device_count(const llama_model & model) { +#if defined(LM_GGML_USE_RPC) + return model.rpc_servers.size(); +#elif defined(LM_GGML_USE_CUDA) + return lm_ggml_backend_cuda_get_device_count(); +#elif defined(LM_GGML_USE_SYCL) + return lm_ggml_backend_sycl_get_device_count(); +#elif defined(LM_GGML_USE_VULKAN) + return lm_ggml_backend_vk_get_device_count(); +#else + return 1; +#endif + LM_GGML_UNUSED(model); +} + +static size_t llama_get_device_memory(const llama_model & model, int device) { +#if defined(LM_GGML_USE_RPC) + size_t total; + size_t free; + std::string endpoint = model.rpc_servers[device]; + lm_ggml_backend_rpc_get_device_memory(endpoint.c_str(), &free, &total); + return free; +#elif defined(LM_GGML_USE_CUDA) + size_t total; + size_t free; + lm_ggml_backend_cuda_get_device_memory(device, &free, &total); + return free; +#elif defined(LM_GGML_USE_SYCL) + size_t total; + size_t free; + lm_ggml_backend_sycl_get_device_memory(device, &free, &total); + return free; +#elif defined(LM_GGML_USE_VULKAN) + size_t total; + size_t free; + lm_ggml_backend_vk_get_device_memory(device, &free, &total); + return free; +#else + return 1; +#endif + LM_GGML_UNUSED(model); + LM_GGML_UNUSED(device); +} // // kv cache helpers @@ -2463,7 +2597,6 @@ static bool llama_kv_cache_init( static bool llama_kv_cache_find_slot( struct llama_kv_cache & cache, const struct llama_batch & batch) { - const uint32_t n_ctx = cache.size; const uint32_t n_tokens = batch.n_tokens; if (cache.recurrent) { @@ -2514,16 +2647,16 @@ static bool llama_kv_cache_find_slot( } // otherwise, one cell per token. 
- if (n_tokens > n_ctx) { - LLAMA_LOG_ERROR("%s: n_tokens=%d > n_ctx=%d\n", __func__, n_tokens, n_ctx); + if (n_tokens > cache.size) { + LLAMA_LOG_ERROR("%s: n_tokens=%d > cache.size=%d\n", __func__, n_tokens, cache.size); return false; } uint32_t n_tested = 0; while (true) { - if (cache.head + n_tokens > n_ctx) { - n_tested += n_ctx - cache.head; + if (cache.head + n_tokens > cache.size) { + n_tested += cache.size - cache.head; cache.head = 0; continue; } @@ -2542,7 +2675,7 @@ static bool llama_kv_cache_find_slot( break; } - if (n_tested >= n_ctx) { + if (n_tested >= cache.size) { //LLAMA_LOG_ERROR("%s: failed to find a slot for %d tokens\n", __func__, n_tokens); return false; } @@ -2796,6 +2929,11 @@ static void llama_kv_cache_defrag(struct llama_kv_cache & cache) { cache.do_defrag = true; } +static uint32_t llama_kv_cache_get_padding(const struct llama_cparams & cparams) { + // the FA kernels require padding to avoid extra runtime boundary checks + return cparams.flash_attn ? 256u : 32u; +} + // // model loading and saving // @@ -3186,6 +3324,7 @@ struct llama_model_loader { switch (type_max) { case LM_GGML_TYPE_F32: ftype = LLAMA_FTYPE_ALL_F32; break; case LM_GGML_TYPE_F16: ftype = LLAMA_FTYPE_MOSTLY_F16; break; + case LM_GGML_TYPE_BF16: ftype = LLAMA_FTYPE_MOSTLY_BF16; break; case LM_GGML_TYPE_Q4_0: ftype = LLAMA_FTYPE_MOSTLY_Q4_0; break; case LM_GGML_TYPE_Q4_1: ftype = LLAMA_FTYPE_MOSTLY_Q4_1; break; case LM_GGML_TYPE_Q5_0: ftype = LLAMA_FTYPE_MOSTLY_Q5_0; break; @@ -3296,6 +3435,39 @@ struct llama_model_loader { return get_arr_n(llm_kv(kid), result, required); } + template + bool get_arr(const std::string & key, std::vector & result, const bool required = true) { + const int kid = lm_gguf_find_key(meta, key.c_str()); + + if (kid < 0) { + if (required) { + throw std::runtime_error(format("key not found in model: %s", key.c_str())); + } + return false; + } + + struct GGUFMeta::ArrayInfo arr_info = + GGUFMeta::GKV::get_kv(meta, kid); + + if (arr_info.gt != 
LM_GGUF_TYPE_FLOAT32 && arr_info.gt != LM_GGUF_TYPE_INT32) { + throw std::runtime_error(format("%s is not a float32 or int32 array", key.c_str())); + } + + // LM_GGML_ASSERT(lm_gguf_type_size(arr_info.gt) == sizeof(T)); + LM_GGML_ASSERT((arr_info.gt != LM_GGUF_TYPE_FLOAT32 || std::is_same::value)); + LM_GGML_ASSERT((arr_info.gt != LM_GGUF_TYPE_INT32 || std::is_same::value)); + + result.resize(arr_info.length); + result.assign((const T*)arr_info.data, (const T *)arr_info.data + arr_info.length); + + return true; + } + + template + bool get_arr(const enum llm_kv kid, T& result, const bool required = true) { + return get_arr(llm_kv(kid), result, required); + } + template bool get_key(const std::string & key, T & result, const bool required = true) { auto it = kv_overrides.find(key); @@ -3370,11 +3542,15 @@ struct llama_model_loader { return get_tensor_meta(get_tensor_name(i)); } - struct lm_ggml_tensor * create_tensor_for(struct lm_ggml_context * ctx, const struct lm_ggml_tensor * cur) { + struct lm_ggml_tensor * create_tensor_for(struct lm_ggml_context * ctx, const struct lm_ggml_tensor * cur, bool duplicated) { struct lm_ggml_tensor * tensor = lm_ggml_dup_tensor(ctx, cur); lm_ggml_set_name(tensor, lm_ggml_get_name(cur)); - n_created++; + if (duplicated) { + size_data += lm_ggml_nbytes(cur); + } else { + n_created++; + } return tensor; } @@ -3409,14 +3585,17 @@ struct llama_model_loader { return cur; } - struct lm_ggml_tensor * create_tensor(struct lm_ggml_context * ctx, const std::string & name, const std::vector & ne, bool required = true) { - const struct lm_ggml_tensor * cur = check_tensor_dims(name, ne, required); + static const int TENSOR_NOT_REQUIRED = 1; + static const int TENSOR_DUPLICATED = 2; + + struct lm_ggml_tensor * create_tensor(struct lm_ggml_context * ctx, const std::string & name, const std::vector & ne, int flags = 0) { + const struct lm_ggml_tensor * cur = check_tensor_dims(name, ne, !(flags & TENSOR_NOT_REQUIRED)); if (cur == NULL) { return 
NULL; } - return create_tensor_for(ctx, cur); + return create_tensor_for(ctx, cur, flags & TENSOR_DUPLICATED); } struct lm_ggml_tensor * create_tensor_as_view(struct lm_ggml_context * ctx, struct lm_ggml_tensor * base, const std::string & name, const std::vector & ne, size_t offset, bool required = true) { @@ -3677,6 +3856,7 @@ static std::string llama_model_ftype_name(llama_ftype ftype) { switch (ftype) { case LLAMA_FTYPE_ALL_F32: return "all F32"; case LLAMA_FTYPE_MOSTLY_F16: return "F16"; + case LLAMA_FTYPE_MOSTLY_BF16: return "BF16"; case LLAMA_FTYPE_MOSTLY_Q4_0: return "Q4_0"; case LLAMA_FTYPE_MOSTLY_Q4_1: return "Q4_1"; case LLAMA_FTYPE_MOSTLY_Q4_1_SOME_F16: @@ -3715,37 +3895,50 @@ static std::string llama_model_ftype_name(llama_ftype ftype) { static const char * llama_model_type_name(e_model type) { switch (type) { - case MODEL_22M: return "22M"; - case MODEL_33M: return "33M"; - case MODEL_109M: return "109M"; - case MODEL_137M: return "137M"; - case MODEL_0_5B: return "0.5B"; - case MODEL_1B: return "1B"; - case MODEL_2B: return "2B"; - case MODEL_3B: return "3B"; - case MODEL_7B: return "7B"; - case MODEL_8B: return "8B"; - case MODEL_12B: return "12B"; - case MODEL_13B: return "13B"; - case MODEL_14B: return "14B"; - case MODEL_15B: return "15B"; - case MODEL_20B: return "20B"; - case MODEL_30B: return "30B"; - case MODEL_34B: return "34B"; - case MODEL_35B: return "35B"; - case MODEL_40B: return "40B"; - case MODEL_65B: return "65B"; - case MODEL_70B: return "70B"; - case MODEL_314B: return "314B"; - case MODEL_SMALL: return "0.1B"; - case MODEL_MEDIUM: return "0.4B"; - case MODEL_LARGE: return "0.8B"; - case MODEL_XL: return "1.5B"; - case MODEL_A2_7B: return "A2.7B"; - case MODEL_8x7B: return "8x7B"; - case MODEL_8x22B: return "8x22B"; - case MODEL_16x12B: return "16x12B"; - default: return "?B"; + case MODEL_14M: return "14M"; + case MODEL_17M: return "17M"; + case MODEL_22M: return "22M"; + case MODEL_33M: return "33M"; + case MODEL_70M: return 
"70M"; + case MODEL_109M: return "109M"; + case MODEL_137M: return "137M"; + case MODEL_160M: return "160M"; + case MODEL_335M: return "335M"; + case MODEL_410M: return "410M"; + case MODEL_0_5B: return "0.5B"; + case MODEL_1B: return "1B"; + case MODEL_1_4B: return "1.4B"; + case MODEL_2B: return "2B"; + case MODEL_2_8B: return "2.8B"; + case MODEL_3B: return "3B"; + case MODEL_4B: return "4B"; + case MODEL_6_9B: return "6.9B"; + case MODEL_7B: return "7B"; + case MODEL_8B: return "8B"; + case MODEL_12B: return "12B"; + case MODEL_13B: return "13B"; + case MODEL_14B: return "14B"; + case MODEL_15B: return "15B"; + case MODEL_16B: return "16B"; + case MODEL_20B: return "20B"; + case MODEL_30B: return "30B"; + case MODEL_34B: return "34B"; + case MODEL_35B: return "35B"; + case MODEL_40B: return "40B"; + case MODEL_65B: return "65B"; + case MODEL_70B: return "70B"; + case MODEL_236B: return "236B"; + case MODEL_314B: return "314B"; + case MODEL_SMALL: return "0.1B"; + case MODEL_MEDIUM: return "0.4B"; + case MODEL_LARGE: return "0.8B"; + case MODEL_XL: return "1.5B"; + case MODEL_A2_7B: return "A2.7B"; + case MODEL_8x7B: return "8x7B"; + case MODEL_8x22B: return "8x22B"; + case MODEL_16x12B: return "16x12B"; + case MODEL_10B_128x3_66B: return "10B+128x3.66B"; + default: return "?B"; } } @@ -3788,6 +3981,12 @@ static void llm_load_hparams( // get hparams kv ml.get_key(LLM_KV_VOCAB_SIZE, hparams.n_vocab, false) || ml.get_arr_n(LLM_KV_TOKENIZER_LIST, hparams.n_vocab); + + // everything past this point is not vocab-related + if (hparams.vocab_only) { + return; + } + ml.get_key(LLM_KV_CONTEXT_LENGTH, hparams.n_ctx_train); ml.get_key(LLM_KV_EMBEDDING_LENGTH, hparams.n_embd); ml.get_key(LLM_KV_FEED_FORWARD_LENGTH, hparams.n_ff); @@ -3832,6 +4031,8 @@ static void llm_load_hparams( } hparams.rope_freq_scale_train = ropescale == 0.0f ? 
1.0f : 1.0f/ropescale; + ml.get_key(LLM_KV_ROPE_SCALING_ATTN_FACTOR, hparams.rope_attn_factor, false); + // sanity check for n_rot (optional) { hparams.n_rot = (hparams.n_head == 0) ? 0 : hparams.n_embd / hparams.n_head; @@ -3869,7 +4070,9 @@ static void llm_load_hparams( switch (hparams.n_layer) { case 22: model.type = e_model::MODEL_1B; break; case 26: model.type = e_model::MODEL_3B; break; - case 32: model.type = hparams.n_head == hparams.n_head_kv ? e_model::MODEL_7B : e_model::MODEL_8B; break; // LLaMa 8B v3 uses GQA + // granite uses a vocab with len 49152 + case 32: model.type = hparams.n_vocab == 49152 ? e_model::MODEL_3B : (hparams.n_vocab < 40000 ? e_model::MODEL_7B : e_model::MODEL_8B); break; + case 36: model.type = e_model::MODEL_8B; break; // granite case 40: model.type = e_model::MODEL_13B; break; case 48: model.type = e_model::MODEL_34B; break; case 60: model.type = e_model::MODEL_30B; break; @@ -3931,14 +4134,6 @@ static void llm_load_hparams( default: model.type = e_model::MODEL_UNKNOWN; } } break; - case LLM_ARCH_PERSIMMON: - { - ml.get_key(LLM_KV_ATTENTION_LAYERNORM_EPS, hparams.f_norm_eps); - switch (hparams.n_layer) { - case 36: model.type = e_model::MODEL_8B; break; - default: model.type = e_model::MODEL_UNKNOWN; - } - } break; case LLM_ARCH_REFACT: { ml.get_key(LLM_KV_ATTENTION_LAYERNORM_RMS_EPS, hparams.f_norm_rms_eps); @@ -3971,6 +4166,19 @@ static void llm_load_hparams( model.type = e_model::MODEL_335M; break; // bge-large } } break; + case LLM_ARCH_JINA_BERT_V2: + { + ml.get_key(LLM_KV_ATTENTION_LAYERNORM_EPS, hparams.f_norm_eps); + ml.get_key(LLM_KV_ATTENTION_CAUSAL, hparams.causal_attn); + ml.get_key(LLM_KV_TOKENIZER_TOKEN_TYPE_COUNT, hparams.n_vocab_type); + ml.get_key(LLM_KV_POOLING_TYPE, hparams.pooling_type); + hparams.f_max_alibi_bias = 8.0f; + + switch (hparams.n_layer) { + case 4: model.type = e_model::MODEL_33M; break; // jina-embeddings-small + case 12: model.type = e_model::MODEL_137M; break; // jina-embeddings-base + } + } 
break; case LLM_ARCH_NOMIC_BERT: { ml.get_key(LLM_KV_ATTENTION_LAYERNORM_EPS, hparams.f_norm_eps); @@ -4067,6 +4275,7 @@ static void llm_load_hparams( switch (hparams.n_layer) { case 24: model.type = e_model::MODEL_1B; break; case 32: model.type = e_model::MODEL_3B; break; + case 40: model.type = e_model::MODEL_14B; break; default: model.type = e_model::MODEL_UNKNOWN; } } break; @@ -4133,6 +4342,8 @@ static void llm_load_hparams( case 30: model.type = e_model::MODEL_3B; break; case 32: model.type = e_model::MODEL_7B; break; case 40: model.type = e_model::MODEL_15B; break; + case 52: model.type = e_model::MODEL_20B; break; // granite + case 88: model.type = e_model::MODEL_34B; break; // granite default: model.type = e_model::MODEL_UNKNOWN; } } break; @@ -4207,6 +4418,85 @@ static void llm_load_hparams( default: model.type = e_model::MODEL_UNKNOWN; } } break; + case LLM_ARCH_GPTNEOX: + { + ml.get_key(LLM_KV_ATTENTION_LAYERNORM_EPS, hparams.f_norm_eps); + ml.get_key(LLM_KV_USE_PARALLEL_RESIDUAL, hparams.use_par_res); + switch (hparams.n_layer) { + case 6: + switch (hparams.n_ff) { + case 512: model.type = e_model::MODEL_14M; break; + case 2048: model.type = e_model::MODEL_70M; break; + default: model.type = e_model::MODEL_UNKNOWN; + } break; + case 12: + switch (hparams.n_ff) { + case 3072: model.type = e_model::MODEL_160M; break; + default: model.type = e_model::MODEL_UNKNOWN; + } break; + case 16: + switch (hparams.n_ff) { + case 8192: model.type = e_model::MODEL_1B; break; + default: model.type = e_model::MODEL_UNKNOWN; + } break; + case 24: + switch (hparams.n_ff) { + case 4096: model.type = e_model::MODEL_410M; break; + case 8192: model.type = e_model::MODEL_1_4B; break; + default: model.type = e_model::MODEL_UNKNOWN; + } break; + case 32: + switch (hparams.n_ff) { + case 10240: model.type = e_model::MODEL_2_8B; break; + case 16384: model.type = e_model::MODEL_6_9B; break; + default: model.type = e_model::MODEL_UNKNOWN; + } break; + case 36: + switch 
(hparams.n_ff) { + case 20480: model.type = e_model::MODEL_12B; break; + default: model.type = e_model::MODEL_UNKNOWN; + } break; + case 44: + switch (hparams.n_ff) { + case 24576: model.type = e_model::MODEL_20B; break; + default: model.type = e_model::MODEL_UNKNOWN; + } break; + default: model.type = e_model::MODEL_UNKNOWN; + } + } break; + case LLM_ARCH_ARCTIC: + { + ml.get_key(LLM_KV_ATTENTION_LAYERNORM_RMS_EPS, hparams.f_norm_rms_eps); + + if (hparams.n_expert == 128) { + switch (hparams.n_layer) { + case 35: model.type = e_model::MODEL_10B_128x3_66B; break; + default: model.type = e_model::MODEL_UNKNOWN; + } + } else { + model.type = e_model::MODEL_UNKNOWN; + } + } break; + case LLM_ARCH_DEEPSEEK2: + { + bool is_lite = (hparams.n_layer == 27); + ml.get_key(LLM_KV_ATTENTION_LAYERNORM_RMS_EPS, hparams.f_norm_rms_eps); + ml.get_key(LLM_KV_LEADING_DENSE_BLOCK_COUNT, hparams.n_layer_dense_lead); + if (!is_lite) { + ml.get_key(LLM_KV_ATTENTION_Q_LORA_RANK, hparams.n_lora_q); + } + ml.get_key(LLM_KV_ATTENTION_KV_LORA_RANK, hparams.n_lora_kv); + ml.get_key(LLM_KV_EXPERT_FEED_FORWARD_LENGTH, hparams.n_ff_exp); + ml.get_key(LLM_KV_EXPERT_SHARED_COUNT, hparams.n_expert_shared); + ml.get_key(LLM_KV_EXPERT_WEIGHTS_SCALE, hparams.expert_weights_scale); + ml.get_key(LLM_KV_ROPE_SCALING_YARN_LOG_MUL, hparams.rope_yarn_log_mul); + + switch (hparams.n_layer) { + case 27: model.type = e_model::MODEL_16B; break; + case 60: model.type = e_model::MODEL_236B; break; + default: model.type = e_model::MODEL_UNKNOWN; + } + } break; default: (void)0; } @@ -4316,6 +4606,11 @@ static void llm_load_vocab( } else { if (tokenizer_model == "gpt2") { vocab.type = LLAMA_VOCAB_TYPE_BPE; + + const int add_space_prefix_keyidx = lm_gguf_find_key(ctx, kv(LLM_KV_TOKENIZER_ADD_PREFIX).c_str()); + if (add_space_prefix_keyidx != -1) { + vocab.add_space_prefix = lm_gguf_get_val_bool(ctx, add_space_prefix_keyidx); + } } else { LLAMA_LOG_WARN("%s: unknown tokenizer: '%s'", __func__, 
tokenizer_model.c_str()); LLAMA_LOG_WARN("%s: using default tokenizer: 'llama'", __func__); @@ -4392,8 +4687,33 @@ static void llm_load_vocab( tokenizer_pre == "starcoder") { vocab.type_pre = LLAMA_VOCAB_PRE_TYPE_STARCODER; } else if ( - tokenizer_pre == "gpt-2") { + tokenizer_pre == "gpt-2" || + tokenizer_pre == "jina-es" || + tokenizer_pre == "jina-de" || + tokenizer_pre == "jina-v2-es" || + tokenizer_pre == "jina-v2-de") { vocab.type_pre = LLAMA_VOCAB_PRE_TYPE_GPT2; + } else if ( + tokenizer_pre == "refact") { + vocab.type_pre = LLAMA_VOCAB_PRE_TYPE_REFACT; + } else if ( + tokenizer_pre == "command-r") { + vocab.type_pre = LLAMA_VOCAB_PRE_TYPE_COMMAND_R; + } else if ( + tokenizer_pre == "qwen2") { + vocab.type_pre = LLAMA_VOCAB_PRE_TYPE_QWEN2; + } else if ( + tokenizer_pre == "stablelm2") { + vocab.type_pre = LLAMA_VOCAB_PRE_TYPE_STABLELM2; + } else if ( + tokenizer_pre == "olmo") { + vocab.type_pre = LLAMA_VOCAB_PRE_TYPE_OLMO; + } else if ( + tokenizer_pre == "dbrx") { + vocab.type_pre = LLAMA_VOCAB_PRE_TYPE_DBRX; + } else if ( + tokenizer_pre == "smaug-bpe") { + vocab.type_pre = LLAMA_VOCAB_PRE_TYPE_SMAUG; } else { throw std::runtime_error(format("unknown pre-tokenizer type: '%s'", tokenizer_pre.c_str())); } @@ -4509,7 +4829,8 @@ static void llm_load_vocab( (t.first == "<|eot_id|>" || t.first == "<|im_end|>" || t.first == "<|end|>" || - t.first == "" + t.first == "" || + t.first == "<|endoftext|>" ) ) { vocab.special_eot_id = t.second; @@ -4521,97 +4842,19 @@ static void llm_load_vocab( // build special tokens cache { - // TODO: It is unclear (to me) at this point, whether special tokes are guaranteed to be of a deterministic type, - // and will always be correctly labeled in 'added_tokens.json' etc. - // The assumption is, since special tokens aren't meant to be exposed to end user, they are designed - // to be unmatchable by the tokenizer, therefore tokens from the vocab, which are unmatchable by the tokenizer - // are special tokens. 
- // From testing, this appears to correlate 1:1 with special tokens. - // - - // Counting special tokens and verifying in only one direction - // is sufficient to detect difference in those two sets. - // - uint32_t special_tokens_count_by_type = 0; - uint32_t special_tokens_count_from_verification = 0; - - bool special_tokens_definition_mismatch = false; - - for (const auto & t : vocab.token_to_id) { - const auto & token = t.first; - const auto & id = t.second; - - // Count all non-normal tokens in the vocab while iterating + for (llama_vocab::id id = 0; id < (llama_vocab::id)n_vocab; ++id) { if (vocab.id_to_token[id].type != LLAMA_TOKEN_TYPE_NORMAL) { - special_tokens_count_by_type++; + vocab.special_tokens_cache.push_back(id); } + } - // Skip single character tokens - if (token.length() > 1) { - bool is_tokenizable = false; - - // Split token string representation in two, in all possible ways - // and check if both halves can be matched to a valid token - for (unsigned i = 1; i < token.length();) { - const auto left = token.substr(0, i); - const auto right = token.substr(i); - - // check if we didnt partition in the middle of a utf sequence - auto utf = utf8_len(left.at(left.length() - 1)); - - if (utf == 1) { - if (vocab.token_to_id.find(left) != vocab.token_to_id.end() && - vocab.token_to_id.find(right) != vocab.token_to_id.end() ) { - is_tokenizable = true; - break; - } - i++; - } else { - // skip over the rest of multibyte utf sequence - i += utf - 1; - } - } - - if (!is_tokenizable) { - // Some tokens are multibyte, but they are utf sequences with equivalent text length of 1 - // it's faster to re-filter them here, since there are way less candidates now - - // Calculate a total "utf" length of a token string representation - size_t utf8_str_len = 0; - for (unsigned i = 0; i < token.length();) { - utf8_str_len++; - i += utf8_len(token.at(i)); - } - - // And skip the ones which are one character - if (utf8_str_len > 1) { - // At this point what we have left 
are special tokens only - vocab.special_tokens_cache[token] = id; - - // Count manually found special tokens - special_tokens_count_from_verification++; - - // If this manually found special token is not marked as such, flag a mismatch - if (vocab.id_to_token[id].type == LLAMA_TOKEN_TYPE_NORMAL) { - special_tokens_definition_mismatch = true; - } - } - } + std::sort( vocab.special_tokens_cache.begin(), vocab.special_tokens_cache.end(), + [&] (const llama_vocab::id a, const llama_vocab::id b) { + return vocab.id_to_token[a].text.size() > vocab.id_to_token[b].text.size(); } - } + ); - if (special_tokens_definition_mismatch || special_tokens_count_from_verification != special_tokens_count_by_type) { - LLAMA_LOG_WARN("%s: mismatch in special tokens definition ( %u/%zu vs %u/%zu ).\n", - __func__, - special_tokens_count_from_verification, vocab.id_to_token.size(), - special_tokens_count_by_type, vocab.id_to_token.size() - ); - } else { - LLAMA_LOG_INFO("%s: special tokens definition check successful ( %u/%zu ).\n", - __func__, - special_tokens_count_from_verification, vocab.id_to_token.size() - ); - } + LLAMA_LOG_INFO("%s: special tokens cache size = %u.\n", __func__, (uint32_t)vocab.special_tokens_cache.size()); } } @@ -4692,6 +4935,16 @@ static void llm_load_print_meta(llama_model_loader & ml, llama_model & model) { if (vocab.special_suffix_id != -1) { LLAMA_LOG_INFO( "%s: SUF token = %d '%s'\n", __func__, vocab.special_suffix_id, vocab.id_to_token[vocab.special_suffix_id].text.c_str() ); } if (vocab.special_middle_id != -1) { LLAMA_LOG_INFO( "%s: MID token = %d '%s'\n", __func__, vocab.special_middle_id, vocab.id_to_token[vocab.special_middle_id].text.c_str() ); } if (vocab.special_eot_id != -1) { LLAMA_LOG_INFO( "%s: EOT token = %d '%s'\n", __func__, vocab.special_eot_id, vocab.id_to_token[vocab.special_eot_id].text.c_str() ); } + + if (model.arch == LLM_ARCH_DEEPSEEK2) { + LLAMA_LOG_INFO("%s: n_layer_dense_lead = %d\n", __func__, hparams.n_layer_dense_lead); + 
LLAMA_LOG_INFO("%s: n_lora_q = %d\n", __func__, hparams.n_lora_q); + LLAMA_LOG_INFO("%s: n_lora_kv = %d\n", __func__, hparams.n_lora_kv); + LLAMA_LOG_INFO("%s: n_ff_exp = %d\n", __func__, hparams.n_ff_exp); + LLAMA_LOG_INFO("%s: n_expert_shared = %d\n", __func__, hparams.n_expert_shared); + LLAMA_LOG_INFO("%s: expert_weights_scale = %.1f\n", __func__, hparams.expert_weights_scale); + LLAMA_LOG_INFO("%s: rope_yarn_log_mul = %.4f\n", __func__, hparams.rope_yarn_log_mul); + } } // Returns false if cancelled by progress_callback @@ -4737,13 +4990,13 @@ static bool llm_load_tensors( if (split_mode == LLAMA_SPLIT_MODE_LAYER) { // calculate the split points - int device_count = llama_get_device_count(); + int device_count = llama_get_device_count(model); bool all_zero = tensor_split == nullptr || std::all_of(tensor_split, tensor_split + device_count, [](float x) { return x == 0.0f; }); std::vector splits(device_count); if (all_zero) { // default split, by free memory for (int i = 0; i < device_count; ++i) { - splits[i] = llama_get_device_memory(i); + splits[i] = llama_get_device_memory(model, i); } } else { std::copy(tensor_split, tensor_split + device_count, splits.begin()); @@ -4763,35 +5016,35 @@ static bool llm_load_tensors( int act_gpu_layers = std::min(n_gpu_layers, (int)n_layer + 1); for (int64_t i = i_gpu_start; i < n_layer; ++i) { int layer_gpu = std::upper_bound(splits.begin(), splits.begin() + device_count, float(i - i_gpu_start)/act_gpu_layers) - splits.begin(); - model.buft_layer[i] = llama_default_buffer_type_offload(layer_gpu); + model.buft_layer[i] = llama_default_buffer_type_offload(model, layer_gpu); } // assign the output layer if (n_gpu_layers > n_layer) { int layer_gpu = std::upper_bound(splits.begin(), splits.begin() + device_count, float(act_gpu_layers - 1)/act_gpu_layers) - splits.begin(); - model.buft_output = llama_default_buffer_type_offload(layer_gpu); + model.buft_output = llama_default_buffer_type_offload(model, layer_gpu); } else { 
model.buft_output = llama_default_buffer_type_cpu(true); } } else { lm_ggml_backend_buffer_type_t split_buft; if (split_mode == LLAMA_SPLIT_MODE_ROW) { - split_buft = llama_default_buffer_type_split(main_gpu, tensor_split); + split_buft = llama_default_buffer_type_split(model, main_gpu, tensor_split); } else { // LLAMA_SPLIT_MODE_NONE or LLAMA_SPLIT_MODE_LAYER in backends where it is not supported - split_buft = llama_default_buffer_type_offload(main_gpu); + split_buft = llama_default_buffer_type_offload(model, main_gpu); } // assign the repeating layers for (int64_t i = i_gpu_start; i < n_layer; ++i) { model.buft_layer[i] = { split_buft, - llama_default_buffer_type_offload(main_gpu) + llama_default_buffer_type_offload(model, main_gpu) }; } // assign the output layer if (n_gpu_layers > n_layer) { model.buft_output = { split_buft, - llama_default_buffer_type_offload(main_gpu) + llama_default_buffer_type_offload(model, main_gpu) }; } else { model.buft_output = llama_default_buffer_type_cpu(true); @@ -4835,6 +5088,7 @@ static bool llm_load_tensors( // create tensors for the weights { const int64_t n_embd = hparams.n_embd; + const int64_t n_embd_head = n_embd / hparams.n_head; const int64_t n_embd_k_gqa = hparams.n_embd_k_gqa(); const int64_t n_embd_v_gqa = hparams.n_embd_v_gqa(); const int64_t n_embd_gqa = n_embd_v_gqa; @@ -4847,8 +5101,6 @@ static bool llm_load_tensors( throw std::runtime_error("model has expert layers but no expert layers are used"); } - LM_GGML_ASSERT(n_embd_gqa == n_embd_k_gqa); - lm_ggml_context * ctx_input = ctx_map.at(model.buft_input.buft); lm_ggml_context * ctx_output = ctx_map.at(model.buft_output.buft); lm_ggml_context * ctx_output_split = ctx_map.at(model.buft_output.buft_matrix); @@ -4869,12 +5121,10 @@ static bool llm_load_tensors( { model.output_norm = ml.create_tensor(ctx_output, tn(LLM_TENSOR_OUTPUT_NORM, "weight"), {n_embd}); if (model.arch != LLM_ARCH_MINICPM){ - model.output = ml.create_tensor(ctx_output_split, 
tn(LLM_TENSOR_OUTPUT, "weight"), {n_embd, n_vocab}, false); + model.output = ml.create_tensor(ctx_output_split, tn(LLM_TENSOR_OUTPUT, "weight"), {n_embd, n_vocab}, llama_model_loader::TENSOR_NOT_REQUIRED); // if output is NULL, init from the input tok embed if (model.output == NULL) { - model.output = ml.create_tensor(ctx_output, tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}); - ml.n_created--; // artificial tensor - ml.size_data += lm_ggml_nbytes(model.output); + model.output = ml.create_tensor(ctx_output, tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, llama_model_loader::TENSOR_DUPLICATED); } } } @@ -4893,10 +5143,10 @@ static bool llm_load_tensors( layer.wo = ml.create_tensor(ctx_split, tn(LLM_TENSOR_ATTN_OUT, "weight", i), {n_embd, n_embd}); // optional bias tensors - layer.bq = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_ATTN_Q, "bias", i), {n_embd}, false); - layer.bk = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_ATTN_K, "bias", i), {n_embd_gqa}, false); - layer.bv = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_ATTN_V, "bias", i), {n_embd_gqa}, false); - layer.bo = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_ATTN_OUT, "bias", i), {n_embd}, false); + layer.bq = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_ATTN_Q, "bias", i), {n_embd}, llama_model_loader::TENSOR_NOT_REQUIRED); + layer.bk = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_ATTN_K, "bias", i), {n_embd_gqa}, llama_model_loader::TENSOR_NOT_REQUIRED); + layer.bv = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_ATTN_V, "bias", i), {n_embd_gqa}, llama_model_loader::TENSOR_NOT_REQUIRED); + layer.bo = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_ATTN_OUT, "bias", i), {n_embd}, llama_model_loader::TENSOR_NOT_REQUIRED); layer.ffn_norm = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_FFN_NORM, "weight", i), {n_embd}); @@ -4904,10 +5154,15 @@ static bool llm_load_tensors( layer.ffn_gate = ml.create_tensor(ctx_split, tn(LLM_TENSOR_FFN_GATE, "weight", i), {n_embd, n_ff}); layer.ffn_down = ml.create_tensor(ctx_split, 
tn(LLM_TENSOR_FFN_DOWN, "weight", i), { n_ff, n_embd}); layer.ffn_up = ml.create_tensor(ctx_split, tn(LLM_TENSOR_FFN_UP, "weight", i), {n_embd, n_ff}); + + // optional MLP bias + layer.ffn_gate_b = ml.create_tensor(ctx_split, tn(LLM_TENSOR_FFN_GATE, "bias", i), {n_ff}, llama_model_loader::TENSOR_NOT_REQUIRED); + layer.ffn_down_b = ml.create_tensor(ctx_split, tn(LLM_TENSOR_FFN_DOWN, "bias", i), {n_embd}, llama_model_loader::TENSOR_NOT_REQUIRED); + layer.ffn_up_b = ml.create_tensor(ctx_split, tn(LLM_TENSOR_FFN_UP, "bias", i), {n_ff}, llama_model_loader::TENSOR_NOT_REQUIRED); } else { layer.ffn_gate_inp = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_FFN_GATE_INP, "weight", i), {n_embd, n_expert}); - layer.ffn_gate_exps = ml.create_tensor(ctx_split, tn(LLM_TENSOR_FFN_GATE_EXPS, "weight", i), {n_embd, n_ff, n_expert}, false); + layer.ffn_gate_exps = ml.create_tensor(ctx_split, tn(LLM_TENSOR_FFN_GATE_EXPS, "weight", i), {n_embd, n_ff, n_expert}, llama_model_loader::TENSOR_NOT_REQUIRED); if (layer.ffn_gate_exps) { layer.ffn_down_exps = ml.create_tensor(ctx_split, tn(LLM_TENSOR_FFN_DOWN_EXPS, "weight", i), { n_ff, n_embd, n_expert}); layer.ffn_up_exps = ml.create_tensor(ctx_split, tn(LLM_TENSOR_FFN_UP_EXPS, "weight", i), {n_embd, n_ff, n_expert}); @@ -4949,12 +5204,10 @@ static bool llm_load_tensors( // output { model.output_norm = ml.create_tensor(ctx_output, tn(LLM_TENSOR_OUTPUT_NORM, "weight"), {n_embd}); - model.output = ml.create_tensor(ctx_output_split, tn(LLM_TENSOR_OUTPUT, "weight"), {n_embd, n_vocab}, false); + model.output = ml.create_tensor(ctx_output_split, tn(LLM_TENSOR_OUTPUT, "weight"), {n_embd, n_vocab}, llama_model_loader::TENSOR_NOT_REQUIRED); // if output is NULL, init from the input tok embed if (model.output == NULL) { - model.output = ml.create_tensor(ctx_output, tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}); - ml.n_created--; // artificial tensor - ml.size_data += lm_ggml_nbytes(model.output); + model.output = ml.create_tensor(ctx_output, 
tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, llama_model_loader::TENSOR_DUPLICATED); } } @@ -4977,7 +5230,7 @@ static bool llm_load_tensors( layer.ffn_gate_inp = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_FFN_GATE_INP, "weight", i), {n_embd, n_expert}); - layer.ffn_gate_exps = ml.create_tensor(ctx_split, tn(LLM_TENSOR_FFN_GATE_EXPS, "weight", i), {n_embd, n_ff, n_expert}, false); + layer.ffn_gate_exps = ml.create_tensor(ctx_split, tn(LLM_TENSOR_FFN_GATE_EXPS, "weight", i), {n_embd, n_ff, n_expert}, llama_model_loader::TENSOR_NOT_REQUIRED); if (layer.ffn_gate_exps) { layer.ffn_down_exps = ml.create_tensor(ctx_split, tn(LLM_TENSOR_FFN_DOWN_EXPS, "weight", i), { n_ff, n_embd, n_expert}); layer.ffn_up_exps = ml.create_tensor(ctx_split, tn(LLM_TENSOR_FFN_UP_EXPS, "weight", i), {n_embd, n_ff, n_expert}); @@ -5079,11 +5332,9 @@ static bool llm_load_tensors( model.output_norm = ml.create_tensor(ctx_output, tn(LLM_TENSOR_OUTPUT_NORM, "weight"), {n_embd}); model.output_norm_b = ml.create_tensor(ctx_output, tn(LLM_TENSOR_OUTPUT_NORM, "bias"), {n_embd}); - model.output = ml.create_tensor(ctx_output_split, tn(LLM_TENSOR_OUTPUT, "weight"), {n_embd, n_vocab}, false); + model.output = ml.create_tensor(ctx_output_split, tn(LLM_TENSOR_OUTPUT, "weight"), {n_embd, n_vocab}, llama_model_loader::TENSOR_NOT_REQUIRED); if (!model.output) { - model.output = ml.create_tensor(ctx_output_split, tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}); // needs to be on GPU - ml.n_created--; // artificial tensor - ml.size_data += lm_ggml_nbytes(model.output); + model.output = ml.create_tensor(ctx_output_split, tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, llama_model_loader::TENSOR_DUPLICATED); // needs to be on GPU } } @@ -5096,8 +5347,8 @@ static bool llm_load_tensors( layer.attn_norm = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_ATTN_NORM, "weight", i), {n_embd}); layer.attn_norm_b = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_ATTN_NORM, "bias", i), {n_embd}); - 
layer.attn_norm_2 = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_ATTN_NORM_2, "weight", i), {n_embd}, false); - layer.attn_norm_2_b = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_ATTN_NORM_2, "bias", i), {n_embd}, false); + layer.attn_norm_2 = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_ATTN_NORM_2, "weight", i), {n_embd}, llama_model_loader::TENSOR_NOT_REQUIRED); + layer.attn_norm_2_b = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_ATTN_NORM_2, "bias", i), {n_embd}, llama_model_loader::TENSOR_NOT_REQUIRED); layer.wqkv = ml.create_tensor(ctx_split, tn(LLM_TENSOR_ATTN_QKV, "weight", i), {n_embd, n_embd + 2*n_embd_gqa}); layer.wo = ml.create_tensor(ctx_split, tn(LLM_TENSOR_ATTN_OUT, "weight", i), {n_embd, n_embd}); @@ -5115,7 +5366,12 @@ static bool llm_load_tensors( { model.output_norm = ml.create_tensor(ctx_output, tn(LLM_TENSOR_OUTPUT_NORM, "weight"), {n_embd}); model.output_norm_b = ml.create_tensor(ctx_output, tn(LLM_TENSOR_OUTPUT_NORM, "bias"), {n_embd}); - model.output = ml.create_tensor(ctx_output_split, tn(LLM_TENSOR_OUTPUT, "weight"), {n_embd, n_vocab}); + model.output = ml.create_tensor(ctx_output_split, tn(LLM_TENSOR_OUTPUT, "weight"), {n_embd, n_vocab}, llama_model_loader::TENSOR_NOT_REQUIRED); + if (!model.output) { + // needs to be on GPU + model.output = ml.create_tensor(ctx_output_split, tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, llama_model_loader::TENSOR_DUPLICATED); + } + } for (int i = 0; i < n_layer; ++i) { @@ -5143,47 +5399,6 @@ static bool llm_load_tensors( layer.ffn_up_b = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_FFN_UP, "bias", i), {n_ff}); } } break; - case LLM_ARCH_PERSIMMON: - { - model.tok_embd = ml.create_tensor(ctx_input, tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}); - - { - model.output_norm = ml.create_tensor(ctx_output, tn(LLM_TENSOR_OUTPUT_NORM, "weight"), {n_embd}); - model.output_norm_b = ml.create_tensor(ctx_output, tn(LLM_TENSOR_OUTPUT_NORM, "bias"), {n_embd}); - model.output = ml.create_tensor(ctx_output_split, 
tn(LLM_TENSOR_OUTPUT, "weight"), {n_embd, n_vocab}); - } - - for (int i = 0; i < n_layer; ++i) { - lm_ggml_context * ctx_layer = ctx_for_layer(i); - lm_ggml_context * ctx_split = ctx_for_layer_split(i); - - auto & layer = model.layers[i]; - - layer.attn_norm = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_ATTN_NORM, "weight", i), {n_embd}); - layer.attn_norm_b = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_ATTN_NORM, "bias", i), {n_embd}); - - layer.wqkv = ml.create_tensor(ctx_split, tn(LLM_TENSOR_ATTN_QKV, "weight", i), {n_embd, n_embd + 2*n_embd_gqa}); - layer.bqkv = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_ATTN_QKV, "bias", i), {n_embd + 2*n_embd_gqa}); - - layer.wo = ml.create_tensor(ctx_split, tn(LLM_TENSOR_ATTN_OUT, "weight", i), {n_embd, n_embd}); - layer.bo = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_ATTN_OUT, "bias", i), {n_embd}); - - layer.ffn_down = ml.create_tensor(ctx_split, tn(LLM_TENSOR_FFN_DOWN, "weight", i), {n_ff, n_embd}); - layer.ffn_down_b = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_FFN_DOWN, "bias", i), {n_embd}); - - layer.ffn_up = ml.create_tensor(ctx_split, tn(LLM_TENSOR_FFN_UP, "weight", i), {n_embd, n_ff}); - layer.ffn_up_b = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_FFN_UP, "bias", i), {n_ff}); - - layer.ffn_norm = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_FFN_NORM, "weight", i), {n_embd}); - layer.ffn_norm_b = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_FFN_NORM, "bias", i), {n_embd}); - - layer.attn_q_norm = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_ATTN_Q_NORM, "weight", i), {64}); - layer.attn_q_norm_b = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_ATTN_Q_NORM, "bias", i), {64}); - - layer.attn_k_norm = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_ATTN_K_NORM, "weight", i), {64}); - layer.attn_k_norm_b = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_ATTN_K_NORM, "bias", i), {64}); - } - } break; case LLM_ARCH_BERT: case LLM_ARCH_NOMIC_BERT: { @@ -5236,6 +5451,50 @@ static bool llm_load_tensors( layer.layer_out_norm_b = ml.create_tensor(ctx_layer, 
tn(LLM_TENSOR_LAYER_OUT_NORM, "bias", i), {n_embd}); } } break; + case LLM_ARCH_JINA_BERT_V2: + { + model.tok_embd = ml.create_tensor(ctx_input, tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}); // word_embeddings + model.type_embd = ml.create_tensor(ctx_input, tn(LLM_TENSOR_TOKEN_TYPES, "weight"), {n_embd, n_vocab_type}); //token_type_embeddings + model.tok_norm = ml.create_tensor(ctx_output, tn(LLM_TENSOR_TOKEN_EMBD_NORM, "weight"), {n_embd}); // LayerNorm + model.tok_norm_b = ml.create_tensor(ctx_output, tn(LLM_TENSOR_TOKEN_EMBD_NORM, "bias"), {n_embd}); //LayerNorm bias + + for (int i = 0; i < n_layer; ++i) { + lm_ggml_context * ctx_layer = ctx_for_layer(i); + lm_ggml_context * ctx_split = ctx_for_layer_split(i); + + auto & layer = model.layers[i]; // JinaBertLayer + + layer.wq = ml.create_tensor(ctx_split, tn(LLM_TENSOR_ATTN_Q, "weight", i), {n_embd, n_embd}); + layer.bq = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_ATTN_Q, "bias", i), {n_embd}); + + layer.attn_q_norm = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_ATTN_Q_NORM, "weight", i), {n_embd}, llama_model_loader::TENSOR_NOT_REQUIRED); + layer.attn_q_norm_b = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_ATTN_Q_NORM, "bias", i), {n_embd}, llama_model_loader::TENSOR_NOT_REQUIRED); + + layer.wk = ml.create_tensor(ctx_split, tn(LLM_TENSOR_ATTN_K, "weight", i), {n_embd, n_embd_gqa}); + layer.bk = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_ATTN_K, "bias", i), {n_embd_gqa}); + + layer.attn_k_norm = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_ATTN_K_NORM, "weight", i), {n_embd}, llama_model_loader::TENSOR_NOT_REQUIRED); + layer.attn_k_norm_b = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_ATTN_K_NORM, "bias", i), {n_embd}, llama_model_loader::TENSOR_NOT_REQUIRED); + + layer.wv = ml.create_tensor(ctx_split, tn(LLM_TENSOR_ATTN_V, "weight", i), {n_embd, n_embd_gqa}); + layer.bv = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_ATTN_V, "bias", i), {n_embd_gqa}); + + layer.wo = ml.create_tensor(ctx_split, tn(LLM_TENSOR_ATTN_OUT, 
"weight", i), {n_embd, n_embd}); //output_dens + layer.bo = ml.create_tensor(ctx_split, tn(LLM_TENSOR_ATTN_OUT, "bias", i), {n_embd}); //output_dens + + layer.attn_out_norm = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_ATTN_OUT_NORM, "weight", i), {n_embd}); //output_norm + layer.attn_out_norm_b = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_ATTN_OUT_NORM, "bias", i), {n_embd}); + + layer.ffn_up = ml.create_tensor(ctx_split, tn(LLM_TENSOR_FFN_UP, "weight", i), {n_embd, n_ff}); + layer.ffn_gate = ml.create_tensor(ctx_split, tn(LLM_TENSOR_FFN_GATE, "weight", i), {n_embd, n_ff}); + + layer.ffn_down = ml.create_tensor(ctx_split, tn(LLM_TENSOR_FFN_DOWN, "weight", i), {n_ff, n_embd}); + layer.ffn_down_b = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_FFN_DOWN, "bias", i), {n_embd}); + + layer.layer_out_norm = ml.create_tensor(ctx_split, tn(LLM_TENSOR_LAYER_OUT_NORM, "weight", i), {n_embd}); + layer.layer_out_norm_b = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_LAYER_OUT_NORM, "bias", i), {n_embd}); + } + } break; case LLM_ARCH_BLOOM: { model.tok_embd = ml.create_tensor(ctx_input, tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}); @@ -5277,18 +5536,16 @@ static bool llm_load_tensors( case LLM_ARCH_MPT: { model.tok_embd = ml.create_tensor(ctx_input, tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}); - model.pos_embd = ml.create_tensor(ctx_input, tn(LLM_TENSOR_POS_EMBD, "weight"), {n_embd, hparams.n_ctx_train}, false); + model.pos_embd = ml.create_tensor(ctx_input, tn(LLM_TENSOR_POS_EMBD, "weight"), {n_embd, hparams.n_ctx_train}, llama_model_loader::TENSOR_NOT_REQUIRED); // output { model.output_norm = ml.create_tensor(ctx_output, tn(LLM_TENSOR_OUTPUT_NORM, "weight"), {n_embd}); - model.output_norm_b = ml.create_tensor(ctx_output, tn(LLM_TENSOR_OUTPUT_NORM, "bias"), {n_embd}, false); + model.output_norm_b = ml.create_tensor(ctx_output, tn(LLM_TENSOR_OUTPUT_NORM, "bias"), {n_embd}, llama_model_loader::TENSOR_NOT_REQUIRED); - model.output = ml.create_tensor(ctx_output_split, 
tn(LLM_TENSOR_OUTPUT, "weight"), {n_embd, n_vocab}, false); + model.output = ml.create_tensor(ctx_output_split, tn(LLM_TENSOR_OUTPUT, "weight"), {n_embd, n_vocab}, llama_model_loader::TENSOR_NOT_REQUIRED); if (!model.output) { - model.output = ml.create_tensor(ctx_output_split, tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}); // needs to be on GPU - ml.n_created--; // artificial tensor - ml.size_data += lm_ggml_nbytes(model.output); + model.output = ml.create_tensor(ctx_output_split, tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, llama_model_loader::TENSOR_DUPLICATED); // needs to be on GPU } } @@ -5299,31 +5556,31 @@ static bool llm_load_tensors( auto & layer = model.layers[i]; layer.attn_norm = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_ATTN_NORM, "weight", i), {n_embd}); - layer.attn_norm_b = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_ATTN_NORM, "bias", i), {n_embd}, false); + layer.attn_norm_b = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_ATTN_NORM, "bias", i), {n_embd}, llama_model_loader::TENSOR_NOT_REQUIRED); layer.wqkv = ml.create_tensor(ctx_split, tn(LLM_TENSOR_ATTN_QKV, "weight", i), {n_embd, n_embd + 2*n_embd_gqa}); - layer.bqkv = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_ATTN_QKV, "bias", i), {n_embd + 2*n_embd_gqa}, false); + layer.bqkv = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_ATTN_QKV, "bias", i), {n_embd + 2*n_embd_gqa}, llama_model_loader::TENSOR_NOT_REQUIRED); layer.wo = ml.create_tensor(ctx_split, tn(LLM_TENSOR_ATTN_OUT, "weight", i), {n_embd, n_embd}); - layer.bo = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_ATTN_OUT, "bias", i), {n_embd}, false); + layer.bo = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_ATTN_OUT, "bias", i), {n_embd}, llama_model_loader::TENSOR_NOT_REQUIRED); layer.ffn_norm = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_FFN_NORM, "weight", i), {n_embd}); - layer.ffn_norm_b = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_FFN_NORM, "bias", i), {n_embd}, false); + layer.ffn_norm_b = ml.create_tensor(ctx_layer, 
tn(LLM_TENSOR_FFN_NORM, "bias", i), {n_embd}, llama_model_loader::TENSOR_NOT_REQUIRED); layer.ffn_down = ml.create_tensor(ctx_split, tn(LLM_TENSOR_FFN_DOWN, "weight", i), {n_ff, n_embd}); - layer.ffn_down_b = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_FFN_DOWN, "bias", i), {n_embd}, false); + layer.ffn_down_b = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_FFN_DOWN, "bias", i), {n_embd}, llama_model_loader::TENSOR_NOT_REQUIRED); layer.ffn_up = ml.create_tensor(ctx_split, tn(LLM_TENSOR_FFN_UP, "weight", i), {n_embd, n_ff}); - layer.ffn_up_b = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_FFN_UP, "bias", i), {n_ff}, false); + layer.ffn_up_b = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_FFN_UP, "bias", i), {n_ff}, llama_model_loader::TENSOR_NOT_REQUIRED); - layer.attn_q_norm = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_ATTN_Q_NORM, "weight", i), {n_embd}, false); - layer.attn_q_norm_b = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_ATTN_Q_NORM, "bias", i), {n_embd}, false); + layer.attn_q_norm = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_ATTN_Q_NORM, "weight", i), {n_embd}, llama_model_loader::TENSOR_NOT_REQUIRED); + layer.attn_q_norm_b = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_ATTN_Q_NORM, "bias", i), {n_embd}, llama_model_loader::TENSOR_NOT_REQUIRED); - layer.attn_k_norm = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_ATTN_K_NORM, "weight", i), {n_embd}, false); - layer.attn_k_norm_b = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_ATTN_K_NORM, "bias", i), {n_embd}, false); + layer.attn_k_norm = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_ATTN_K_NORM, "weight", i), {n_embd}, llama_model_loader::TENSOR_NOT_REQUIRED); + layer.attn_k_norm_b = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_ATTN_K_NORM, "bias", i), {n_embd}, llama_model_loader::TENSOR_NOT_REQUIRED); // AWQ ScaleActivation layer - layer.ffn_act = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_FFN_ACT, "scales", i), {n_ff}, false); + layer.ffn_act = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_FFN_ACT, "scales", i), {n_ff}, 
llama_model_loader::TENSOR_NOT_REQUIRED); } } break; case LLM_ARCH_STABLELM: @@ -5352,17 +5609,17 @@ static bool llm_load_tensors( layer.wo = ml.create_tensor(ctx_split, tn(LLM_TENSOR_ATTN_OUT, "weight", i), {n_embd, n_embd}); // optional bias tensors, present in Stable LM 2 1.6B - layer.bq = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_ATTN_Q, "bias", i), {n_embd}, false); - layer.bk = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_ATTN_K, "bias", i), {n_embd_gqa}, false); - layer.bv = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_ATTN_V, "bias", i), {n_embd_gqa}, false); + layer.bq = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_ATTN_Q, "bias", i), {n_embd}, llama_model_loader::TENSOR_NOT_REQUIRED); + layer.bk = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_ATTN_K, "bias", i), {n_embd_gqa}, llama_model_loader::TENSOR_NOT_REQUIRED); + layer.bv = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_ATTN_V, "bias", i), {n_embd_gqa}, llama_model_loader::TENSOR_NOT_REQUIRED); // optional q and k layernorms, present in StableLM 2 12B - layer.attn_q_norm = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_ATTN_Q_NORM, "weight", i), {hparams.n_embd_head_k, hparams.n_head}, false); - layer.attn_k_norm = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_ATTN_K_NORM, "weight", i), {hparams.n_embd_head_k, hparams.n_head_kv}, false); + layer.attn_q_norm = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_ATTN_Q_NORM, "weight", i), {hparams.n_embd_head_k, hparams.n_head}, llama_model_loader::TENSOR_NOT_REQUIRED); + layer.attn_k_norm = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_ATTN_K_NORM, "weight", i), {hparams.n_embd_head_k, hparams.n_head_kv}, llama_model_loader::TENSOR_NOT_REQUIRED); // optional FFN norm, not present in StableLM 2 12B which uses parallel residual - layer.ffn_norm = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_FFN_NORM, "weight", i), {n_embd}, false); - layer.ffn_norm_b = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_FFN_NORM, "bias", i), {n_embd}, false); + layer.ffn_norm = ml.create_tensor(ctx_layer, 
tn(LLM_TENSOR_FFN_NORM, "weight", i), {n_embd}, llama_model_loader::TENSOR_NOT_REQUIRED); + layer.ffn_norm_b = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_FFN_NORM, "bias", i), {n_embd}, llama_model_loader::TENSOR_NOT_REQUIRED); layer.ffn_gate = ml.create_tensor(ctx_split, tn(LLM_TENSOR_FFN_GATE, "weight", i), {n_embd, n_ff}); layer.ffn_down = ml.create_tensor(ctx_split, tn(LLM_TENSOR_FFN_DOWN, "weight", i), { n_ff, n_embd}); @@ -5405,12 +5662,10 @@ static bool llm_load_tensors( // output { model.output_norm = ml.create_tensor(ctx_output, tn(LLM_TENSOR_OUTPUT_NORM, "weight"), {n_embd}); - model.output = ml.create_tensor(ctx_output_split, tn(LLM_TENSOR_OUTPUT, "weight"), {n_embd, n_vocab}, false); + model.output = ml.create_tensor(ctx_output_split, tn(LLM_TENSOR_OUTPUT, "weight"), {n_embd, n_vocab}, llama_model_loader::TENSOR_NOT_REQUIRED); // if output is NULL, init from the input tok embed if (model.output == NULL) { - model.output = ml.create_tensor(ctx_output, tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}); - ml.n_created--; // artificial tensor - ml.size_data += lm_ggml_nbytes(model.output); + model.output = ml.create_tensor(ctx_output, tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, llama_model_loader::TENSOR_DUPLICATED); } } @@ -5508,8 +5763,8 @@ static bool llm_load_tensors( layer.attn_norm = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_ATTN_NORM, "weight", i), {n_embd}); layer.attn_norm_b = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_ATTN_NORM, "bias", i), {n_embd}); - layer.wqkv = ml.create_tensor(ctx_split, tn(LLM_TENSOR_ATTN_QKV, "weight", i), {n_embd, n_embd + 2*n_embd_gqa}, false); - layer.bqkv = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_ATTN_QKV, "bias", i), {n_embd + 2*n_embd_gqa}, false); + layer.wqkv = ml.create_tensor(ctx_split, tn(LLM_TENSOR_ATTN_QKV, "weight", i), {n_embd, n_embd + 2*n_embd_gqa}, llama_model_loader::TENSOR_NOT_REQUIRED); + layer.bqkv = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_ATTN_QKV, "bias", i), {n_embd + 
2*n_embd_gqa}, llama_model_loader::TENSOR_NOT_REQUIRED); if (layer.wqkv == nullptr) { layer.wq = ml.create_tensor(ctx_split, tn(LLM_TENSOR_ATTN_Q, "weight", i), {n_embd, n_embd}); @@ -5546,17 +5801,20 @@ static bool llm_load_tensors( lm_ggml_context* ctx_layer = ctx_for_layer(i); lm_ggml_context* ctx_split = ctx_for_layer_split(i); - auto& layer = model.layers[i]; + auto & layer = model.layers[i]; layer.attn_norm = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_ATTN_NORM, "weight", i), { n_embd }); - layer.wqkv = ml.create_tensor(ctx_split, tn(LLM_TENSOR_ATTN_QKV, "weight", i), { n_embd, n_embd + 2 * n_embd_gqa }, false); - layer.wo = ml.create_tensor(ctx_split, tn(LLM_TENSOR_ATTN_OUT, "weight", i), { n_embd, n_embd }); + layer.wqkv = ml.create_tensor(ctx_split, tn(LLM_TENSOR_ATTN_QKV, "weight", i), { n_embd, n_embd + 2 * n_embd_gqa }, llama_model_loader::TENSOR_NOT_REQUIRED); + layer.wo = ml.create_tensor(ctx_split, tn(LLM_TENSOR_ATTN_OUT, "weight", i), { n_embd, n_embd }); layer.ffn_norm = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_FFN_NORM, "weight", i), { n_embd }); layer.ffn_down = ml.create_tensor(ctx_split, tn(LLM_TENSOR_FFN_DOWN, "weight", i), { n_ff, n_embd }); layer.ffn_up = ml.create_tensor(ctx_split, tn(LLM_TENSOR_FFN_UP, "weight", i), { n_embd, 2 * n_ff }); + + layer.rope_long = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_ROPE_FACTORS_LONG, "weight"), { n_embd_head/2 }, llama_model_loader::TENSOR_NOT_REQUIRED | (i != 0 ? llama_model_loader::TENSOR_DUPLICATED : 0)); + layer.rope_short = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_ROPE_FACTORS_SHORT, "weight"), { n_embd_head/2 }, llama_model_loader::TENSOR_NOT_REQUIRED | (i != 0 ? 
llama_model_loader::TENSOR_DUPLICATED : 0)); } } break; case LLM_ARCH_PLAMO: @@ -5725,9 +5983,7 @@ static bool llm_load_tensors( // output model.output_norm = ml.create_tensor(ctx_output, tn(LLM_TENSOR_OUTPUT_NORM, "weight"), {n_embd}); - model.output = ml.create_tensor(ctx_output, tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}); // same as tok_embd, duplicated to allow offloading - ml.n_created--; // artificial tensor - ml.size_data += lm_ggml_nbytes(model.output); + model.output = ml.create_tensor(ctx_output, tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, llama_model_loader::TENSOR_DUPLICATED); // same as tok_embd, duplicated to allow offloading const int64_t n_ff = hparams.n_ff; const int64_t n_embd_head_k = hparams.n_embd_head_k; @@ -5762,12 +6018,10 @@ static bool llm_load_tensors( model.output_norm = ml.create_tensor(ctx_output, tn(LLM_TENSOR_OUTPUT_NORM, "weight"), {n_embd}); model.output_norm_b = ml.create_tensor(ctx_output, tn(LLM_TENSOR_OUTPUT_NORM, "bias"), {n_embd}); - model.output = ml.create_tensor(ctx_output_split, tn(LLM_TENSOR_OUTPUT, "weight"), {n_embd, n_vocab}, false); + model.output = ml.create_tensor(ctx_output_split, tn(LLM_TENSOR_OUTPUT, "weight"), {n_embd, n_vocab}, llama_model_loader::TENSOR_NOT_REQUIRED); // if output is NULL, init from the input tok embed if (model.output == NULL) { - model.output = ml.create_tensor(ctx_output, tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}); - ml.n_created--; // artificial tensor - ml.size_data += lm_ggml_nbytes(model.output); + model.output = ml.create_tensor(ctx_output, tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, llama_model_loader::TENSOR_DUPLICATED); } } @@ -5818,12 +6072,10 @@ static bool llm_load_tensors( { model.output_norm = ml.create_tensor(ctx_output, tn(LLM_TENSOR_OUTPUT_NORM, "weight"), {n_embd}); - model.output = ml.create_tensor(ctx_output_split, tn(LLM_TENSOR_OUTPUT, "weight"), {n_embd, n_vocab}, false); + model.output = 
ml.create_tensor(ctx_output_split, tn(LLM_TENSOR_OUTPUT, "weight"), {n_embd, n_vocab}, llama_model_loader::TENSOR_NOT_REQUIRED); // if output is NULL, init from the input tok embed, duplicated to allow offloading if (model.output == NULL) { - model.output = ml.create_tensor(ctx_output_split, tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}); - ml.n_created--; // artificial tensor - ml.size_data += lm_ggml_nbytes(model.output); + model.output = ml.create_tensor(ctx_output_split, tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, llama_model_loader::TENSOR_DUPLICATED); } } @@ -5884,9 +6136,7 @@ static bool llm_load_tensors( { model.output_norm = ml.create_tensor(ctx_output, tn(LLM_TENSOR_OUTPUT_NORM, "weight"), {n_embd}); // init output from the input tok embed - model.output = ml.create_tensor(ctx_output, tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}); - ml.n_created--; // artificial tensor - ml.size_data += lm_ggml_nbytes(model.output); + model.output = ml.create_tensor(ctx_output, tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, llama_model_loader::TENSOR_DUPLICATED); } for (int i = 0; i < n_layer; ++i) { @@ -5918,12 +6168,10 @@ static bool llm_load_tensors( // output { - model.output = ml.create_tensor(ctx_output_split, tn(LLM_TENSOR_OUTPUT, "weight"), {n_embd, n_vocab}, false); + model.output = ml.create_tensor(ctx_output_split, tn(LLM_TENSOR_OUTPUT, "weight"), {n_embd, n_vocab}, llama_model_loader::TENSOR_NOT_REQUIRED); // if output is NULL, init from the input tok embed if (model.output == NULL) { - model.output = ml.create_tensor(ctx_output, tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}); - ml.n_created--; // artificial tensor - ml.size_data += lm_ggml_nbytes(model.output); + model.output = ml.create_tensor(ctx_output, tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, llama_model_loader::TENSOR_DUPLICATED); } } @@ -5943,71 +6191,210 @@ static bool llm_load_tensors( layer.ffn_up = ml.create_tensor(ctx_split, 
tn(LLM_TENSOR_FFN_UP, "weight", i), {n_embd, n_ff}); } } break; - default: - throw std::runtime_error("unknown architecture"); - } - } + case LLM_ARCH_GPTNEOX: + { + model.tok_embd = ml.create_tensor(ctx_input, tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}); + // output + { + model.output_norm = ml.create_tensor(ctx_output, tn(LLM_TENSOR_OUTPUT_NORM, "weight"), {n_embd}); + model.output_norm_b = ml.create_tensor(ctx_output, tn(LLM_TENSOR_OUTPUT_NORM, "bias"), {n_embd}); + model.output = ml.create_tensor(ctx_output_split, tn(LLM_TENSOR_OUTPUT, "weight"), {n_embd, n_vocab}); + } - ml.done_getting_tensors(); + for (int i = 0; i < n_layer; ++i) { + lm_ggml_context * ctx_layer = ctx_for_layer(i); + lm_ggml_context * ctx_split = ctx_for_layer_split(i); - ml.init_mappings(true, use_mlock ? &model.mlock_mmaps : nullptr); - model.mappings.reserve(ml.mappings.size()); + auto & layer = model.layers[i]; - // create the backend buffers - std::vector> ctx_bufs; - ctx_bufs.reserve(ctx_map.size()); + layer.attn_norm = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_ATTN_NORM, "weight", i), {n_embd}); + layer.attn_norm_b = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_ATTN_NORM, "bias", i), {n_embd}); - // Ensure we have enough capacity for the maximum backend buffer we will potentially create - size_t n_max_backend_buffer = ctx_map.size() * ml.files.size(); - model.bufs.reserve(n_max_backend_buffer); + layer.wqkv = ml.create_tensor(ctx_split, tn(LLM_TENSOR_ATTN_QKV, "weight", i), {n_embd, n_embd + 2*n_embd_gqa}); + layer.bqkv = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_ATTN_QKV, "bias", i), {n_embd + 2*n_embd_gqa}); - for (auto & it : ctx_map) { - lm_ggml_backend_buffer_type_t buft = it.first; - lm_ggml_context * ctx = it.second; + layer.wo = ml.create_tensor(ctx_split, tn(LLM_TENSOR_ATTN_OUT, "weight", i), {n_embd, n_embd}); + layer.bo = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_ATTN_OUT, "bias", i), {n_embd}); - llama_buf_map bufs; - bufs.reserve(n_max_backend_buffer); + 
layer.ffn_norm = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_FFN_NORM, "weight", i), {n_embd}); + layer.ffn_norm_b = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_FFN_NORM, "bias", i), {n_embd}); - // only the mmap region containing the tensors in the model is mapped to the backend buffer - // this is important for metal with apple silicon: if the entire model could be mapped to a metal buffer, then we could just use metal for all layers - // this allows using partial offloading when the model size exceeds the metal buffer size, but not the RAM size - if (ml.use_mmap && use_mmap_buffer && buft == llama_default_buffer_type_cpu(true)) { - for (uint32_t idx = 0; idx < ml.files.size(); idx++) { - void * addr = nullptr; - size_t first, last; - ml.get_mapping_range(&first, &last, &addr, idx, ctx); - if (first >= last) { - continue; - } - lm_ggml_backend_buffer_t buf = lm_ggml_backend_cpu_buffer_from_ptr((char *) addr + first, last - first); - if (buf == nullptr) { - throw std::runtime_error("unable to allocate backend CPU buffer"); - } - model.bufs.push_back(buf); - bufs.emplace(idx, buf); -#ifdef LM_GGML_USE_CUDA - if (n_layer >= n_gpu_layers) { - lm_ggml_backend_cuda_register_host_buffer( - lm_ggml_backend_buffer_get_base(buf), - lm_ggml_backend_buffer_get_size(buf)); - } -#endif - } - } -#ifdef LM_GGML_USE_METAL - else if (ml.use_mmap && use_mmap_buffer && buft == lm_ggml_backend_metal_buffer_type()) { - for (uint32_t idx = 0; idx < ml.files.size(); idx++) { - const size_t max_size = lm_ggml_get_max_tensor_size(ctx); - void * addr = nullptr; - size_t first, last; - ml.get_mapping_range(&first, &last, &addr, idx, ctx); - if (first >= last) { - continue; - } - lm_ggml_backend_buffer_t buf = lm_ggml_backend_metal_buffer_from_ptr((char *) addr + first, last - first, max_size); - if (buf == nullptr) { - throw std::runtime_error("unable to allocate backend metal buffer"); - } + layer.ffn_down = ml.create_tensor(ctx_split, tn(LLM_TENSOR_FFN_DOWN, "weight", i), {n_ff, n_embd}); + 
layer.ffn_down_b = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_FFN_DOWN, "bias", i), {n_embd}); + + layer.ffn_up = ml.create_tensor(ctx_split, tn(LLM_TENSOR_FFN_UP, "weight", i), {n_embd, n_ff}); + layer.ffn_up_b = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_FFN_UP, "bias", i), {n_ff}); + } + } break; + case LLM_ARCH_ARCTIC: + { + model.tok_embd = ml.create_tensor(ctx_input, tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}); + + // output + { + model.output_norm = ml.create_tensor(ctx_output, tn(LLM_TENSOR_OUTPUT_NORM, "weight"), {n_embd}); + model.output = ml.create_tensor(ctx_output_split, tn(LLM_TENSOR_OUTPUT, "weight"), {n_embd, n_vocab}, llama_model_loader::TENSOR_NOT_REQUIRED); + // if output is NULL, init from the input tok embed + if (model.output == NULL) { + model.output = ml.create_tensor(ctx_output, tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, llama_model_loader::TENSOR_DUPLICATED); + } + } + + for (int i = 0; i < n_layer; ++i) { + lm_ggml_context * ctx_layer = ctx_for_layer(i); + lm_ggml_context * ctx_split = ctx_for_layer_split(i); + + auto & layer = model.layers[i]; + + layer.attn_norm = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_ATTN_NORM, "weight", i), {n_embd}); + + layer.wq = ml.create_tensor(ctx_split, tn(LLM_TENSOR_ATTN_Q, "weight", i), {n_embd, n_embd}); + layer.wk = ml.create_tensor(ctx_split, tn(LLM_TENSOR_ATTN_K, "weight", i), {n_embd, n_embd_gqa}); + layer.wv = ml.create_tensor(ctx_split, tn(LLM_TENSOR_ATTN_V, "weight", i), {n_embd, n_embd_gqa}); + layer.wo = ml.create_tensor(ctx_split, tn(LLM_TENSOR_ATTN_OUT, "weight", i), {n_embd, n_embd}); + + layer.ffn_norm = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_FFN_NORM, "weight", i), {n_embd}); + + layer.ffn_gate = ml.create_tensor(ctx_split, tn(LLM_TENSOR_FFN_GATE, "weight", i), {n_embd, n_embd}); + layer.ffn_down = ml.create_tensor(ctx_split, tn(LLM_TENSOR_FFN_DOWN, "weight", i), {n_embd, n_embd}); + layer.ffn_up = ml.create_tensor(ctx_split, tn(LLM_TENSOR_FFN_UP, "weight", i), 
{n_embd, n_embd}); + + layer.ffn_gate_inp = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_FFN_GATE_INP, "weight", i), {n_embd, n_expert}); + layer.ffn_norm_exps = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_FFN_NORM_EXPS, "weight", i), {n_embd}); + layer.ffn_gate_exps = ml.create_tensor(ctx_split, tn(LLM_TENSOR_FFN_GATE_EXPS, "weight", i), {n_embd, n_ff, n_expert}, false); + layer.ffn_down_exps = ml.create_tensor(ctx_split, tn(LLM_TENSOR_FFN_DOWN_EXPS, "weight", i), { n_ff, n_embd, n_expert}); + layer.ffn_up_exps = ml.create_tensor(ctx_split, tn(LLM_TENSOR_FFN_UP_EXPS, "weight", i), {n_embd, n_ff, n_expert}); + } + } break; + case LLM_ARCH_DEEPSEEK2: + { + bool is_lite = (hparams.n_layer == 27); + + const uint32_t n_embd_head_qk_rope = hparams.n_rot; + const uint32_t n_embd_head_qk_nope = hparams.n_embd_head_k - hparams.n_rot; + const uint32_t q_lora_rank = hparams.n_lora_q; + const uint32_t kv_lora_rank = hparams.n_lora_kv; + const uint32_t n_ff_exp = hparams.n_ff_exp; + + model.tok_embd = ml.create_tensor(ctx_input, tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}); + + // output + { + model.output_norm = ml.create_tensor(ctx_output, tn(LLM_TENSOR_OUTPUT_NORM, "weight"), {n_embd}); + model.output = ml.create_tensor(ctx_output_split, tn(LLM_TENSOR_OUTPUT, "weight"), {n_embd, n_vocab}); + } + + for (int i = 0; i < n_layer; ++i) { + lm_ggml_context * ctx_layer = ctx_for_layer(i); + lm_ggml_context * ctx_split = ctx_for_layer_split(i); + + auto & layer = model.layers[i]; + + layer.attn_norm = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_ATTN_NORM, "weight", i), {n_embd}); + if (!is_lite) { + layer.attn_q_a_norm = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_ATTN_Q_A_NORM, "weight", i), {q_lora_rank}); + } + layer.attn_kv_a_norm = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_ATTN_KV_A_NORM, "weight", i), {kv_lora_rank}); + + if (!is_lite) { + layer.wq_a = ml.create_tensor(ctx_split, tn(LLM_TENSOR_ATTN_Q_A, "weight", i), {n_embd, q_lora_rank}); + layer.wq_b = 
ml.create_tensor(ctx_split, tn(LLM_TENSOR_ATTN_Q_B, "weight", i), {q_lora_rank, hparams.n_head * hparams.n_embd_head_k}); + } else { + layer.wq = ml.create_tensor(ctx_split, tn(LLM_TENSOR_ATTN_Q, "weight", i), {n_embd, n_embd_k_gqa}); + } + layer.wkv_a_mqa = ml.create_tensor(ctx_split, tn(LLM_TENSOR_ATTN_KV_A_MQA, "weight", i), {n_embd, kv_lora_rank + n_embd_head_qk_rope}); + layer.wkv_b = ml.create_tensor(ctx_split, tn(LLM_TENSOR_ATTN_KV_B, "weight", i), {kv_lora_rank, hparams.n_head * (n_embd_head_qk_nope + hparams.n_embd_head_v)}); + layer.wo = ml.create_tensor(ctx_split, tn(LLM_TENSOR_ATTN_OUT, "weight", i), {hparams.n_head * hparams.n_embd_head_v, n_embd}); + + layer.ffn_norm = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_FFN_NORM, "weight", i), {n_embd}); + + if ((uint32_t) i < hparams.n_layer_dense_lead) { + layer.ffn_gate = ml.create_tensor(ctx_split, tn(LLM_TENSOR_FFN_GATE, "weight", i), {n_embd, n_ff}); + layer.ffn_down = ml.create_tensor(ctx_split, tn(LLM_TENSOR_FFN_DOWN, "weight", i), { n_ff, n_embd}); + layer.ffn_up = ml.create_tensor(ctx_split, tn(LLM_TENSOR_FFN_UP, "weight", i), {n_embd, n_ff}); + } else { + layer.ffn_gate_inp = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_FFN_GATE_INP, "weight", i), {n_embd, n_expert}); + + LM_GGML_ASSERT(hparams.n_expert > 0); + LM_GGML_ASSERT(hparams.n_expert_used > 0); + + // MoE branch + layer.ffn_gate_exps = ml.create_tensor(ctx_split, tn(LLM_TENSOR_FFN_GATE_EXPS, "weight", i), { n_embd, n_ff_exp, n_expert}); + layer.ffn_down_exps = ml.create_tensor(ctx_split, tn(LLM_TENSOR_FFN_DOWN_EXPS, "weight", i), {n_ff_exp, n_embd, n_expert}); + layer.ffn_up_exps = ml.create_tensor(ctx_split, tn(LLM_TENSOR_FFN_UP_EXPS, "weight", i), { n_embd, n_ff_exp, n_expert}); + + // Shared expert branch + layer.ffn_gate_shexp = ml.create_tensor(ctx_split, tn(LLM_TENSOR_FFN_GATE_SHEXP, "weight", i), {n_embd, n_ff_exp * hparams.n_expert_shared}); + layer.ffn_down_shexp = ml.create_tensor(ctx_split, tn(LLM_TENSOR_FFN_DOWN_SHEXP, "weight", i), 
{ n_ff_exp * hparams.n_expert_shared, n_embd}); + layer.ffn_up_shexp = ml.create_tensor(ctx_split, tn(LLM_TENSOR_FFN_UP_SHEXP, "weight", i), {n_embd, n_ff_exp * hparams.n_expert_shared}); + } + } + } break; + default: + throw std::runtime_error("unknown architecture"); + } + } + + ml.done_getting_tensors(); + + ml.init_mappings(true, use_mlock ? &model.mlock_mmaps : nullptr); + model.mappings.reserve(ml.mappings.size()); + + // create the backend buffers + std::vector> ctx_bufs; + ctx_bufs.reserve(ctx_map.size()); + + // Ensure we have enough capacity for the maximum backend buffer we will potentially create + size_t n_max_backend_buffer = ctx_map.size() * ml.files.size(); + model.bufs.reserve(n_max_backend_buffer); + + for (auto & it : ctx_map) { + lm_ggml_backend_buffer_type_t buft = it.first; + lm_ggml_context * ctx = it.second; + + llama_buf_map bufs; + bufs.reserve(n_max_backend_buffer); + + // only the mmap region containing the tensors in the model is mapped to the backend buffer + // this is important for metal with apple silicon: if the entire model could be mapped to a metal buffer, then we could just use metal for all layers + // this allows using partial offloading when the model size exceeds the metal buffer size, but not the RAM size + if (ml.use_mmap && use_mmap_buffer && buft == llama_default_buffer_type_cpu(true)) { + for (uint32_t idx = 0; idx < ml.files.size(); idx++) { + void * addr = nullptr; + size_t first, last; + ml.get_mapping_range(&first, &last, &addr, idx, ctx); + if (first >= last) { + continue; + } + lm_ggml_backend_buffer_t buf = lm_ggml_backend_cpu_buffer_from_ptr((char *) addr + first, last - first); + if (buf == nullptr) { + throw std::runtime_error("unable to allocate backend CPU buffer"); + } + model.bufs.push_back(buf); + bufs.emplace(idx, buf); +#ifdef LM_GGML_USE_CUDA + if (n_layer >= n_gpu_layers) { + lm_ggml_backend_cuda_register_host_buffer( + lm_ggml_backend_buffer_get_base(buf), + lm_ggml_backend_buffer_get_size(buf)); + 
} +#endif + } + } +#ifdef LM_GGML_USE_METAL + else if (ml.use_mmap && use_mmap_buffer && buft == lm_ggml_backend_metal_buffer_type()) { + for (uint32_t idx = 0; idx < ml.files.size(); idx++) { + const size_t max_size = lm_ggml_get_max_tensor_size(ctx); + void * addr = nullptr; + size_t first, last; + ml.get_mapping_range(&first, &last, &addr, idx, ctx); + if (first >= last) { + continue; + } + lm_ggml_backend_buffer_t buf = lm_ggml_backend_metal_buffer_from_ptr((char *) addr + first, last - first, max_size); + if (buf == nullptr) { + throw std::runtime_error("unable to allocate backend metal buffer"); + } model.bufs.push_back(buf); bufs.emplace(idx, buf); } @@ -6131,6 +6518,7 @@ static int llama_model_load(const std::string & fname, llama_model & model, llam || !( model.ftype == LLAMA_FTYPE_ALL_F32 || model.ftype == LLAMA_FTYPE_MOSTLY_F16 || + model.ftype == LLAMA_FTYPE_MOSTLY_BF16 || model.ftype == LLAMA_FTYPE_MOSTLY_Q4_0 || model.ftype == LLAMA_FTYPE_MOSTLY_Q4_1 ) @@ -6206,10 +6594,7 @@ static struct lm_ggml_tensor * llm_build_inp_embd( inpL = lm_ggml_get_rows(ctx, tok_embd, lctx.inp_tokens); } else { -#ifdef LM_GGML_USE_MPI - LM_GGML_ASSERT(false && "not implemented"); -#endif - lctx.inp_embd = lm_ggml_new_tensor_2d(ctx, LM_GGML_TYPE_F32, n_embd, batch.n_tokens); + lctx.inp_embd = lm_ggml_new_tensor_2d(ctx, LM_GGML_TYPE_F32, n_embd, batch.n_tokens); inpL = lctx.inp_embd; lm_ggml_set_input(lctx.inp_embd); } @@ -6311,7 +6696,7 @@ static struct lm_ggml_tensor * llm_build_ffn( llm_ffn_gate_type type_gate, const llm_build_cb & cb, int il) { - struct lm_ggml_tensor * tmp = lm_ggml_mul_mat(ctx, up, cur); + struct lm_ggml_tensor * tmp = up ? 
lm_ggml_mul_mat(ctx, up, cur) : cur; cb(tmp, "ffn_up", il); if (up_b) { @@ -6399,6 +6784,8 @@ static struct lm_ggml_tensor * llm_build_moe_ffn( int64_t n_expert_used, llm_ffn_op_type type_op, bool norm_w, + bool scale_w, + float w_scale, const llm_build_cb & cb, int il) { int64_t n_embd = cur->ne[0]; @@ -6430,6 +6817,10 @@ static struct lm_ggml_tensor * llm_build_moe_ffn( weights = lm_ggml_reshape_3d(ctx, weights, 1, n_expert_used, n_tokens); } + if (scale_w) { + weights = lm_ggml_scale(ctx, weights, w_scale); + cb(weights, "ffn_moe_weights_scaled", il); + } cur = lm_ggml_reshape_3d(ctx, cur, n_embd, 1, n_tokens); lm_ggml_tensor * up = lm_ggml_mul_mat_id(ctx, up_exps, cur, selected_experts); // [n_ff, n_expert_used, n_tokens] @@ -6493,7 +6884,6 @@ static struct lm_ggml_tensor * llm_build_kqv( struct lm_ggml_tensor * wo_b, struct lm_ggml_tensor * q_cur, struct lm_ggml_tensor * kq_mask, - struct lm_ggml_tensor * kq_pos, int32_t n_tokens, int32_t n_kv, float kq_scale, @@ -6505,6 +6895,7 @@ static struct lm_ggml_tensor * llm_build_kqv( const int64_t n_embd_head_k = hparams.n_embd_head_k; const int64_t n_embd_k_gqa = hparams.n_embd_k_gqa(); const int64_t n_embd_head_v = hparams.n_embd_head_v; + const int64_t n_embd_v_gqa = hparams.n_embd_v_gqa(); struct lm_ggml_tensor * q = lm_ggml_permute(ctx, q_cur, 0, 2, 1, 3); cb(q, "q", il); @@ -6523,31 +6914,27 @@ static struct lm_ggml_tensor * llm_build_kqv( LM_GGML_UNUSED(model); LM_GGML_UNUSED(n_ctx); - // note: if this assert triggers, then some check has failed earlier - // the idea is to detect during context creation that ALiBi would be used and disable Flash Attention - LM_GGML_ASSERT(kq_pos == nullptr && "ALiBi is not yet supported with Flash Attention"); - // split cached v into n_head heads (not transposed) struct lm_ggml_tensor * v = lm_ggml_view_3d(ctx, kv.v_l[il], n_embd_head_v, n_kv, n_head_kv, - lm_ggml_row_size(kv.v_l[il]->type, n_embd_k_gqa), - lm_ggml_row_size(kv.v_l[il]->type, n_embd_head_k), + 
lm_ggml_row_size(kv.v_l[il]->type, n_embd_v_gqa), + lm_ggml_row_size(kv.v_l[il]->type, n_embd_head_v), 0); cb(v, "v", il); - cur = lm_ggml_flash_attn_ext(ctx, q, k, v, kq_mask, kq_scale); + cur = lm_ggml_flash_attn_ext(ctx, q, k, v, kq_mask, kq_scale, hparams.f_max_alibi_bias); - if (model.arch == LLM_ARCH_PHI2 || model.arch == LLM_ARCH_PHI3) { + if (model.arch == LLM_ARCH_PHI2 || model.arch == LLM_ARCH_PHI3 || model.arch == LLM_ARCH_GPTNEOX) { lm_ggml_flash_attn_ext_set_prec(cur, LM_GGML_PREC_F32); } - cur = lm_ggml_reshape_2d(ctx, cur, n_embd_head_k*n_head, n_tokens); + cur = lm_ggml_reshape_2d(ctx, cur, n_embd_head_v*n_head, n_tokens); } else { struct lm_ggml_tensor * kq = lm_ggml_mul_mat(ctx, k, q); cb(kq, "kq", il); - if (model.arch == LLM_ARCH_PHI2 || model.arch == LLM_ARCH_PHI3) { + if (model.arch == LLM_ARCH_PHI2 || model.arch == LLM_ARCH_PHI3 || model.arch == LLM_ARCH_GPTNEOX) { // for this arch, we need to perform the KQ multiplication with F32 precision, otherwise we get NaNs // ref: https://github.com/ggerganov/llama.cpp/pull/4490#issuecomment-1859055847 lm_ggml_mul_mat_set_prec(kq, LM_GGML_PREC_F32); @@ -6567,28 +6954,8 @@ static struct lm_ggml_tensor * llm_build_kqv( kq = lm_ggml_scale(ctx, kq, 30); } -#if defined(LM_GGML_USE_KOMPUTE) -#pragma message("TODO: ALiBi support in lm_ggml_soft_max_ext is not implemented for Kompute") -#pragma message(" Falling back to lm_ggml_alibi(). 
Will become an error in Mar 2024") -#pragma message("ref: https://github.com/ggerganov/llama.cpp/pull/5488") - if (hparams.use_alibi) { - kq = lm_ggml_scale(ctx, kq, kq_scale); - cb(kq, "kq_scaled", il); - - kq = lm_ggml_alibi(ctx, kq, /*n_past*/ 0, n_head, hparams.f_max_alibi_bias); - cb(kq, "kq_scaled_alibi", il); - - kq = lm_ggml_add(ctx, kq, kq_mask); - cb(kq, "kq_masked", il); - - kq = lm_ggml_soft_max(ctx, kq); - cb(kq, "kq_soft_max", il); - } else -#endif - { - kq = lm_ggml_soft_max_ext(ctx, kq, kq_mask, kq_pos, kq_scale, hparams.f_max_alibi_bias); - cb(kq, "kq_soft_max_ext", il); - } + kq = lm_ggml_soft_max_ext(ctx, kq, kq_mask, kq_scale, hparams.f_max_alibi_bias); + cb(kq, "kq_soft_max_ext", il); LM_GGML_ASSERT(kv.size == n_ctx); @@ -6607,7 +6974,7 @@ static struct lm_ggml_tensor * llm_build_kqv( struct lm_ggml_tensor * kqv_merged = lm_ggml_permute(ctx, kqv, 0, 2, 1, 3); cb(kqv_merged, "kqv_merged", il); - cur = lm_ggml_cont_2d(ctx, kqv_merged, n_embd_head_k*n_head, n_tokens); + cur = lm_ggml_cont_2d(ctx, kqv_merged, n_embd_head_v*n_head, n_tokens); cb(cur, "kqv_merged_cont", il); } @@ -6638,7 +7005,6 @@ static struct lm_ggml_tensor * llm_build_kv( struct lm_ggml_tensor * v_cur, struct lm_ggml_tensor * q_cur, struct lm_ggml_tensor * kq_mask, - struct lm_ggml_tensor * kq_pos, int32_t n_tokens, int32_t kv_head, int32_t n_kv, @@ -6657,7 +7023,7 @@ static struct lm_ggml_tensor * llm_build_kv( struct lm_ggml_tensor * cur; cur = llm_build_kqv(ctx, model, hparams, cparams, kv, graph, wo, wo_b, - q_cur, kq_mask, kq_pos, n_tokens, n_kv, kq_scale, cb, il); + q_cur, kq_mask, n_tokens, n_kv, kq_scale, cb, il); cb(cur, "kqv_out", il); return cur; @@ -6764,18 +7130,17 @@ struct llm_build_context { ctx0 = lm_ggml_init(params); - lctx.inp_tokens = nullptr; - lctx.inp_embd = nullptr; - lctx.inp_pos = nullptr; + lctx.inp_tokens = nullptr; + lctx.inp_embd = nullptr; + lctx.inp_pos = nullptr; lctx.inp_out_ids = nullptr; lctx.inp_KQ_mask = nullptr; - lctx.inp_KQ_pos = nullptr; 
lctx.inp_K_shift = nullptr; - lctx.inp_mean = nullptr; - lctx.inp_cls = nullptr; - lctx.inp_s_copy = nullptr; - lctx.inp_s_mask = nullptr; - lctx.inp_s_seq = nullptr; + lctx.inp_mean = nullptr; + lctx.inp_cls = nullptr; + lctx.inp_s_copy = nullptr; + lctx.inp_s_mask = nullptr; + lctx.inp_s_seq = nullptr; } void free() { @@ -6794,17 +7159,20 @@ struct llm_build_context { cb(lctx.inp_K_shift, "K_shift", -1); lm_ggml_set_input(lctx.inp_K_shift); + for (int il = 0; il < n_layer; ++il) { + struct lm_ggml_tensor * rope_factors = build_rope_factors(il); struct lm_ggml_tensor * tmp = // we rotate only the first n_rot dimensions - lm_ggml_rope_custom_inplace(ctx0, + lm_ggml_rope_ext_inplace(ctx0, lm_ggml_view_3d(ctx0, kv_self.k_l[il], n_embd_head_k, n_head_kv, n_ctx, lm_ggml_row_size(kv_self.k_l[il]->type, n_embd_head_k), lm_ggml_row_size(kv_self.k_l[il]->type, n_embd_k_gqa), 0), - lctx.inp_K_shift, n_rot, rope_type, 0, n_orig_ctx, freq_base, freq_scale, + lctx.inp_K_shift, rope_factors, n_rot, rope_type, 0, n_orig_ctx, freq_base, freq_scale, ext_factor, attn_factor, beta_fast, beta_slow); + cb(tmp, "K_shifted", il); lm_ggml_build_forward_expand(gf, tmp); } @@ -6907,6 +7275,17 @@ struct llm_build_context { return lctx.inp_pos; } + struct lm_ggml_tensor * build_rope_factors(int il) { + // choose long/short freq factors based on the context size + const auto n_ctx_pre_seq = cparams.n_ctx / cparams.n_seq_max; + + if (n_ctx_pre_seq > hparams.n_yarn_orig_ctx) { + return model.layers[il].rope_long; + } + + return model.layers[il].rope_short; + } + struct lm_ggml_tensor * build_inp_out_ids() { lctx.inp_out_ids = lm_ggml_new_tensor_1d(ctx0, LM_GGML_TYPE_I32, n_outputs); cb(lctx.inp_out_ids, "inp_out_ids", -1); @@ -6925,19 +7304,6 @@ struct llm_build_context { return flash_attn ? 
lm_ggml_cast(ctx0, lctx.inp_KQ_mask, LM_GGML_TYPE_F16) : lctx.inp_KQ_mask; } - struct lm_ggml_tensor * build_inp_KQ_pos(bool causal = true) { - if (causal) { - lctx.inp_KQ_pos = lm_ggml_new_tensor_1d(ctx0, LM_GGML_TYPE_F32, n_kv); - } else { - // TODO: this will be needed for ALiBi-based BERT models - // https://github.com/ggerganov/llama.cpp/pull/6826 - lctx.inp_KQ_pos = lm_ggml_new_tensor_1d(ctx0, LM_GGML_TYPE_F32, n_tokens); - } - cb(lctx.inp_KQ_pos, "KQ_pos", -1); - lm_ggml_set_input(lctx.inp_KQ_pos); - return flash_attn ? lm_ggml_cast(ctx0, lctx.inp_KQ_pos, LM_GGML_TYPE_F16) : lctx.inp_KQ_pos; - } - struct lm_ggml_tensor * build_inp_mean() { lctx.inp_mean = lm_ggml_new_tensor_2d(ctx0, LM_GGML_TYPE_F32, n_tokens, n_tokens); cb(lctx.inp_mean, "inp_mean", -1); @@ -7027,15 +7393,15 @@ struct llm_build_context { cb(Vcur, "Vcur", il); } - Qcur = lm_ggml_rope_custom( - ctx0, lm_ggml_reshape_3d(ctx0, Qcur, n_embd_head, n_head, n_tokens), inp_pos, + Qcur = lm_ggml_rope_ext( + ctx0, lm_ggml_reshape_3d(ctx0, Qcur, n_embd_head, n_head, n_tokens), inp_pos, nullptr, n_rot, rope_type, 0, n_orig_ctx, freq_base, freq_scale, ext_factor, attn_factor, beta_fast, beta_slow ); cb(Qcur, "Qcur", il); - Kcur = lm_ggml_rope_custom( - ctx0, lm_ggml_reshape_3d(ctx0, Kcur, n_embd_head, n_head_kv, n_tokens), inp_pos, + Kcur = lm_ggml_rope_ext( + ctx0, lm_ggml_reshape_3d(ctx0, Kcur, n_embd_head, n_head_kv, n_tokens), inp_pos, nullptr, n_rot, rope_type, 0, n_orig_ctx, freq_base, freq_scale, ext_factor, attn_factor, beta_fast, beta_slow ); @@ -7043,7 +7409,7 @@ struct llm_build_context { cur = llm_build_kv(ctx0, model, hparams, cparams, kv_self, gf, model.layers[il].wo, model.layers[il].bo, - Kcur, Vcur, Qcur, KQ_mask, nullptr, n_tokens, kv_head, n_kv, 1.0f/sqrtf(float(n_embd_head)), cb, il); + Kcur, Vcur, Qcur, KQ_mask, n_tokens, kv_head, n_kv, 1.0f/sqrtf(float(n_embd_head)), cb, il); } if (il == n_layer - 1) { @@ -7065,9 +7431,9 @@ struct llm_build_context { cb(cur, "ffn_norm", il); cur = 
llm_build_ffn(ctx0, cur, - model.layers[il].ffn_up, NULL, - model.layers[il].ffn_gate, NULL, - model.layers[il].ffn_down, NULL, + model.layers[il].ffn_up, model.layers[il].ffn_up_b, + model.layers[il].ffn_gate, model.layers[il].ffn_gate_b, + model.layers[il].ffn_down, model.layers[il].ffn_down_b, NULL, LLM_FFN_SILU, LLM_FFN_PAR, cb, il); cb(cur, "ffn_out", il); @@ -7085,6 +7451,7 @@ struct llm_build_context { model.layers[il].ffn_down_exps, n_expert, n_expert_used, LLM_FFN_SILU, true, + false, 0.0, cb, il); cb(cur, "ffn_moe_out", il); } @@ -7136,9 +7503,6 @@ struct llm_build_context { // KQ_mask (mask for 1 head, it will be broadcasted to all heads) struct lm_ggml_tensor * KQ_mask = build_inp_KQ_mask(); - // positions of the tokens in the KV cache - struct lm_ggml_tensor * KQ_pos = build_inp_KQ_pos(); - for (int il = 0; il < n_layer; ++il) { struct lm_ggml_tensor * inpSA = inpL; @@ -7160,13 +7524,13 @@ struct llm_build_context { switch (model.type) { case MODEL_7B: - Qcur = lm_ggml_rope_custom( - ctx0, lm_ggml_reshape_3d(ctx0, Qcur, n_embd_head, n_head, n_tokens), inp_pos, + Qcur = lm_ggml_rope_ext( + ctx0, lm_ggml_reshape_3d(ctx0, Qcur, n_embd_head, n_head, n_tokens), inp_pos, nullptr, n_rot, rope_type, 0, n_orig_ctx, freq_base, freq_scale, ext_factor, attn_factor, beta_fast, beta_slow ); - Kcur = lm_ggml_rope_custom( - ctx0, lm_ggml_reshape_3d(ctx0, Kcur, n_embd_head, n_head_kv, n_tokens), inp_pos, + Kcur = lm_ggml_rope_ext( + ctx0, lm_ggml_reshape_3d(ctx0, Kcur, n_embd_head, n_head_kv, n_tokens), inp_pos, nullptr, n_rot, rope_type, 0, n_orig_ctx, freq_base, freq_scale, ext_factor, attn_factor, beta_fast, beta_slow ); @@ -7183,7 +7547,7 @@ struct llm_build_context { cur = llm_build_kv(ctx0, model, hparams, cparams, kv_self, gf, model.layers[il].wo, NULL, - Kcur, Vcur, Qcur, KQ_mask, KQ_pos, n_tokens, kv_head, n_kv, 1.0f/sqrtf(float(n_embd_head)), cb, il); + Kcur, Vcur, Qcur, KQ_mask, n_tokens, kv_head, n_kv, 1.0f/sqrtf(float(n_embd_head)), cb, il); } if (il == 
n_layer - 1) { @@ -7253,9 +7617,6 @@ struct llm_build_context { // KQ_mask (mask for 1 head, it will be broadcasted to all heads) struct lm_ggml_tensor * KQ_mask = build_inp_KQ_mask(); - // positions of the tokens in the KV cache - struct lm_ggml_tensor * KQ_pos = build_inp_KQ_pos(); - for (int il = 0; il < n_layer; ++il) { struct lm_ggml_tensor * inpSA = inpL; @@ -7275,22 +7636,22 @@ struct llm_build_context { struct lm_ggml_tensor * Vcur = lm_ggml_mul_mat(ctx0, model.layers[il].wv, cur); cb(Vcur, "Vcur", il); - Qcur = lm_ggml_rope_custom( - ctx0, lm_ggml_reshape_3d(ctx0, Qcur, n_embd_head, n_head, n_tokens), inp_pos, + Qcur = lm_ggml_rope_ext( + ctx0, lm_ggml_reshape_3d(ctx0, Qcur, n_embd_head, n_head, n_tokens), inp_pos, nullptr, n_rot, rope_type, 0, n_orig_ctx, freq_base, freq_scale, ext_factor, attn_factor, beta_fast, beta_slow ); cb(Qcur, "Qcur", il); - Kcur = lm_ggml_rope_custom( - ctx0, lm_ggml_reshape_3d(ctx0, Kcur, n_embd_head, n_head_kv, n_tokens), inp_pos, + Kcur = lm_ggml_rope_ext( + ctx0, lm_ggml_reshape_3d(ctx0, Kcur, n_embd_head, n_head_kv, n_tokens), inp_pos, nullptr, n_rot, rope_type, 0, n_orig_ctx, freq_base, freq_scale, ext_factor, attn_factor, beta_fast, beta_slow ); cb(Kcur, "Kcur", il); cur = llm_build_kv(ctx0, model, hparams, cparams, kv_self, gf, model.layers[il].wo, NULL, - Kcur, Vcur, Qcur, KQ_mask, KQ_pos, n_tokens, kv_head, n_kv, 1.0f/sqrtf(float(n_embd_head)), cb, il); + Kcur, Vcur, Qcur, KQ_mask, n_tokens, kv_head, n_kv, 1.0f/sqrtf(float(n_embd_head)), cb, il); } if (il == n_layer - 1) { @@ -7396,21 +7757,21 @@ struct llm_build_context { Kcur = lm_ggml_reshape_3d(ctx0, Kcur, n_embd_head, n_head_kv, n_tokens); // using mode = 2 for neox mode - Qcur = lm_ggml_rope_custom( - ctx0, Qcur, inp_pos, n_rot, rope_type, 0, n_orig_ctx, + Qcur = lm_ggml_rope_ext( + ctx0, Qcur, inp_pos, nullptr, n_rot, rope_type, 0, n_orig_ctx, freq_base, freq_scale, ext_factor, attn_factor, beta_fast, beta_slow ); cb(Qcur, "Qcur", il); - Kcur = 
lm_ggml_rope_custom( - ctx0, Kcur, inp_pos, n_rot, rope_type, 0, n_orig_ctx, + Kcur = lm_ggml_rope_ext( + ctx0, Kcur, inp_pos, nullptr, n_rot, rope_type, 0, n_orig_ctx, freq_base, freq_scale, ext_factor, attn_factor, beta_fast, beta_slow ); cb(Kcur, "Kcur", il); cur = llm_build_kv(ctx0, model, hparams, cparams, kv_self, gf, model.layers[il].wo, NULL, - Kcur, Vcur, Qcur, KQ_mask, nullptr, n_tokens, kv_head, n_kv, 1.0f/sqrtf(float(n_embd_head)), cb, il); + Kcur, Vcur, Qcur, KQ_mask, n_tokens, kv_head, n_kv, 1.0f/sqrtf(float(n_embd_head)), cb, il); } if (il == n_layer - 1) { @@ -7519,15 +7880,15 @@ struct llm_build_context { cb(Vcur, "Vcur", il); } - Qcur = lm_ggml_rope_custom( - ctx0, lm_ggml_reshape_3d(ctx0, Qcur, n_embd_head, n_head, n_tokens), inp_pos, + Qcur = lm_ggml_rope_ext( + ctx0, lm_ggml_reshape_3d(ctx0, Qcur, n_embd_head, n_head, n_tokens), inp_pos, nullptr, n_rot, rope_type, 0, n_orig_ctx, freq_base, freq_scale, ext_factor, attn_factor, beta_fast, beta_slow ); cb(Qcur, "Qcur", il); - Kcur = lm_ggml_rope_custom( - ctx0, lm_ggml_reshape_3d(ctx0, Kcur, n_embd_head, n_head_kv, n_tokens), inp_pos, + Kcur = lm_ggml_rope_ext( + ctx0, lm_ggml_reshape_3d(ctx0, Kcur, n_embd_head, n_head_kv, n_tokens), inp_pos, nullptr, n_rot, rope_type, 0, n_orig_ctx, freq_base, freq_scale, ext_factor, attn_factor, beta_fast, beta_slow ); @@ -7535,7 +7896,7 @@ struct llm_build_context { cur = llm_build_kv(ctx0, model, hparams, cparams, kv_self, gf, model.layers[il].wo, model.layers[il].bo, - Kcur, Vcur, Qcur, KQ_mask, nullptr, n_tokens, kv_head, n_kv, 1.0f, cb, il); + Kcur, Vcur, Qcur, KQ_mask, n_tokens, kv_head, n_kv, 1.0f, cb, il); } if (il == n_layer - 1) { @@ -7572,6 +7933,7 @@ struct llm_build_context { model.layers[il].ffn_down_exps, n_expert, n_expert_used, LLM_FFN_GELU, true, + false, 0.0, cb, il); cb(cur, "ffn_moe_out", il); @@ -7671,15 +8033,15 @@ struct llm_build_context { cb(Kcur, "Kcur", il); cb(Vcur, "Vcur", il); - Qcur = lm_ggml_rope_custom( - ctx0, 
lm_ggml_reshape_3d(ctx0, Qcur, n_embd_head, n_head, n_tokens), inp_pos, + Qcur = lm_ggml_rope_ext( + ctx0, lm_ggml_reshape_3d(ctx0, Qcur, n_embd_head, n_head, n_tokens), inp_pos, nullptr, n_rot, rope_type, 0, n_orig_ctx, freq_base, freq_scale, ext_factor, attn_factor, beta_fast, beta_slow ); cb(Qcur, "Qcur", il); - Kcur = lm_ggml_rope_custom( - ctx0, lm_ggml_reshape_3d(ctx0, Kcur, n_embd_head, n_head_kv, n_tokens), inp_pos, + Kcur = lm_ggml_rope_ext( + ctx0, lm_ggml_reshape_3d(ctx0, Kcur, n_embd_head, n_head_kv, n_tokens), inp_pos, nullptr, n_rot, rope_type, 0, n_orig_ctx, freq_base, freq_scale, ext_factor, attn_factor, beta_fast, beta_slow ); @@ -7687,7 +8049,7 @@ struct llm_build_context { cur = llm_build_kv(ctx0, model, hparams, cparams, kv_self, gf, model.layers[il].wo, NULL, - Kcur, Vcur, Qcur, KQ_mask, nullptr, n_tokens, kv_head, n_kv, 1.0f/sqrtf(float(n_embd_head)), cb, il); + Kcur, Vcur, Qcur, KQ_mask, n_tokens, kv_head, n_kv, 1.0f/sqrtf(float(n_embd_head)), cb, il); } if (il == n_layer - 1) { @@ -7715,6 +8077,7 @@ struct llm_build_context { model.layers[il].ffn_down_exps, n_expert, n_expert_used, LLM_FFN_SILU, true, + false, 0.0, cb, il); cb(cur, "ffn_moe_out", il); @@ -7799,7 +8162,7 @@ struct llm_build_context { cur = llm_build_kv(ctx0, model, hparams, cparams, kv_self, gf, model.layers[il].wo, model.layers[il].bo, - Kcur, Vcur, Qcur, KQ_mask, nullptr, n_tokens, kv_head, n_kv, 1.0f/sqrtf(float(n_embd_head)), cb, il); + Kcur, Vcur, Qcur, KQ_mask, n_tokens, kv_head, n_kv, 1.0f/sqrtf(float(n_embd_head)), cb, il); } if (il == n_layer - 1) { @@ -7848,205 +8211,91 @@ struct llm_build_context { return gf; } - struct lm_ggml_cgraph * build_persimmon() { + struct lm_ggml_cgraph * build_refact() { struct lm_ggml_cgraph * gf = lm_ggml_new_graph_custom(ctx0, LLAMA_MAX_NODES, false); const int64_t n_embd_head = hparams.n_embd_head_v; - LM_GGML_ASSERT(n_embd_head == hparams.n_embd_head_k); - LM_GGML_ASSERT(n_embd_head/2 == hparams.n_rot); + LM_GGML_ASSERT(n_embd_head 
== hparams.n_embd_head_k); struct lm_ggml_tensor * cur; struct lm_ggml_tensor * inpL; inpL = llm_build_inp_embd(ctx0, lctx, hparams, batch, model.tok_embd, cb); - // inp_pos - contains the positions - struct lm_ggml_tensor * inp_pos = build_inp_pos(); - // KQ_mask (mask for 1 head, it will be broadcasted to all heads) struct lm_ggml_tensor * KQ_mask = build_inp_KQ_mask(); for (int il = 0; il < n_layer; ++il) { - struct lm_ggml_tensor * residual = inpL; + struct lm_ggml_tensor * inpSA = inpL; cur = llm_build_norm(ctx0, inpL, hparams, - model.layers[il].attn_norm, - model.layers[il].attn_norm_b, - LLM_NORM, cb, il); + model.layers[il].attn_norm, NULL, + LLM_NORM_RMS, cb, il); cb(cur, "attn_norm", il); - // self attention + // self-attention { - cur = lm_ggml_mul_mat(ctx0, model.layers[il].wqkv, cur); - cb(cur, "wqkv", il); - - cur = lm_ggml_add(ctx0, cur, model.layers[il].bqkv); - cb(cur, "bqkv", il); - - // split qkv - LM_GGML_ASSERT(n_head_kv == n_head); - - struct lm_ggml_tensor * tmpqkv = lm_ggml_reshape_4d(ctx0, cur, n_embd_head, 3, n_head, n_tokens); - cb(tmpqkv, "tmpqkv", il); - - struct lm_ggml_tensor * tmpqkv_perm = lm_ggml_cont(ctx0, lm_ggml_permute(ctx0, tmpqkv, 0, 3, 1, 2)); - cb(tmpqkv_perm, "tmpqkv", il); - - struct lm_ggml_tensor * tmpq = lm_ggml_view_3d( - ctx0, tmpqkv_perm, n_embd_head, n_head, n_tokens, - lm_ggml_element_size(tmpqkv_perm) * n_embd_head, - lm_ggml_element_size(tmpqkv_perm) * n_embd_head * n_head, - 0 - ); - cb(tmpq, "tmpq", il); - - struct lm_ggml_tensor * tmpk = lm_ggml_view_3d( - ctx0, tmpqkv_perm, n_embd_head, n_head, n_tokens, - lm_ggml_element_size(tmpqkv_perm) * n_embd_head, - lm_ggml_element_size(tmpqkv_perm) * n_embd_head * n_head, - lm_ggml_element_size(tmpqkv_perm) * n_embd_head * n_head * n_tokens - ); - cb(tmpk, "tmpk", il); - - // Q/K Layernorm - tmpq = llm_build_norm(ctx0, tmpq, hparams, - model.layers[il].attn_q_norm, - model.layers[il].attn_q_norm_b, - LLM_NORM, cb, il); - cb(tmpq, "tmpq", il); - - tmpk = 
llm_build_norm(ctx0, tmpk, hparams, - model.layers[il].attn_k_norm, - model.layers[il].attn_k_norm_b, - LLM_NORM, cb, il); - cb(tmpk, "tmpk", il); - - // RoPE the first n_rot of q/k, pass the other half, and concat. - struct lm_ggml_tensor * qrot = lm_ggml_view_3d( - ctx0, tmpq, n_rot, n_head, n_tokens, - lm_ggml_element_size(tmpq) * n_embd_head, - lm_ggml_element_size(tmpq) * n_embd_head * n_head, - 0 - ); - cb(qrot, "qrot", il); - - struct lm_ggml_tensor * krot = lm_ggml_view_3d( - ctx0, tmpk, n_rot, n_head, n_tokens, - lm_ggml_element_size(tmpk) * n_embd_head, - lm_ggml_element_size(tmpk) * n_embd_head * n_head, - 0 - ); - cb(krot, "krot", il); - - // get the second half of tmpq, e.g tmpq[n_rot:, :, :] - struct lm_ggml_tensor * qpass = lm_ggml_view_3d( - ctx0, tmpq, n_rot, n_head, n_tokens, - lm_ggml_element_size(tmpq) * n_embd_head, - lm_ggml_element_size(tmpq) * n_embd_head * n_head, - lm_ggml_element_size(tmpq) * n_rot - ); - cb(qpass, "qpass", il); - - struct lm_ggml_tensor * kpass = lm_ggml_view_3d( - ctx0, tmpk, n_rot, n_head, n_tokens, - lm_ggml_element_size(tmpk) * n_embd_head, - lm_ggml_element_size(tmpk) * n_embd_head * n_head, - lm_ggml_element_size(tmpk) * n_rot - ); - cb(kpass, "kpass", il); - - struct lm_ggml_tensor * qrotated = lm_ggml_rope_custom( - ctx0, qrot, inp_pos, n_rot, rope_type, 0, n_orig_ctx, - freq_base, freq_scale, ext_factor, attn_factor, beta_fast, beta_slow - ); - cb(qrotated, "qrotated", il); - - struct lm_ggml_tensor * krotated = lm_ggml_rope_custom( - ctx0, krot, inp_pos, n_rot, rope_type, 0, n_orig_ctx, - freq_base, freq_scale, ext_factor, attn_factor, beta_fast, beta_slow - ); - cb(krotated, "krotated", il); - - // ggml currently only supports concatenation on dim=2 - // so we need to permute qrot, qpass, concat, then permute back. 
- qrotated = lm_ggml_cont(ctx0, lm_ggml_permute(ctx0, qrotated, 2, 1, 0, 3)); - cb(qrotated, "qrotated", il); - - krotated = lm_ggml_cont(ctx0, lm_ggml_permute(ctx0, krotated, 2, 1, 0, 3)); - cb(krotated, "krotated", il); - - qpass = lm_ggml_cont(ctx0, lm_ggml_permute(ctx0, qpass, 2, 1, 0, 3)); - cb(qpass, "qpass", il); - - kpass = lm_ggml_cont(ctx0, lm_ggml_permute(ctx0, kpass, 2, 1, 0, 3)); - cb(kpass, "kpass", il); - - struct lm_ggml_tensor * Qcur = lm_ggml_concat(ctx0, qrotated, qpass); + struct lm_ggml_tensor * Qcur = lm_ggml_mul_mat(ctx0, model.layers[il].wq, cur); cb(Qcur, "Qcur", il); - struct lm_ggml_tensor * Kcur = lm_ggml_concat(ctx0, krotated, kpass); + struct lm_ggml_tensor * Kcur = lm_ggml_mul_mat(ctx0, model.layers[il].wk, cur); cb(Kcur, "Kcur", il); - struct lm_ggml_tensor * Q = lm_ggml_cont(ctx0, lm_ggml_permute(ctx0, Qcur, 2, 1, 0, 3)); - cb(Q, "Q", il); + struct lm_ggml_tensor * Vcur = lm_ggml_mul_mat(ctx0, model.layers[il].wv, cur); + cb(Vcur, "Vcur", il); - Kcur = lm_ggml_cont(ctx0, lm_ggml_permute(ctx0, Kcur, 2, 1, 0, 3)); + Kcur = lm_ggml_reshape_3d(ctx0, Kcur, n_embd_head, n_head_kv, n_tokens); cb(Kcur, "Kcur", il); - struct lm_ggml_tensor * Vcur = lm_ggml_view_3d( - ctx0, tmpqkv_perm, n_embd_head, n_head, n_tokens, - lm_ggml_element_size(tmpqkv_perm) * n_embd_head, - lm_ggml_element_size(tmpqkv_perm) * n_embd_head * n_head, - lm_ggml_element_size(tmpqkv_perm) * n_embd_head * n_head * n_tokens * 2 - ); - cb(Vcur, "Vcur", il); + Qcur = lm_ggml_reshape_3d(ctx0, Qcur, n_embd_head, n_head, n_tokens); + cb(Qcur, "Qcur", il); cur = llm_build_kv(ctx0, model, hparams, cparams, kv_self, gf, - model.layers[il].wo, model.layers[il].bo, - Kcur, Vcur, Q, KQ_mask, nullptr, n_tokens, kv_head, n_kv, 1.0f/sqrtf(float(n_embd_head)), cb, il); + model.layers[il].wo, NULL, + Kcur, Vcur, Qcur, KQ_mask, n_tokens, kv_head, n_kv, 1.0f/sqrtf(float(n_embd_head)), cb, il); } if (il == n_layer - 1) { // skip computing output for unused tokens struct lm_ggml_tensor * 
inp_out_ids = build_inp_out_ids(); - cur = lm_ggml_get_rows(ctx0, cur, inp_out_ids); - residual = lm_ggml_get_rows(ctx0, residual, inp_out_ids); + cur = lm_ggml_get_rows(ctx0, cur, inp_out_ids); + inpSA = lm_ggml_get_rows(ctx0, inpSA, inp_out_ids); } - struct lm_ggml_tensor * ffn_inp = lm_ggml_add(ctx0, residual, cur); + struct lm_ggml_tensor * ffn_inp = lm_ggml_add(ctx0, cur, inpSA); cb(ffn_inp, "ffn_inp", il); // feed-forward network { cur = llm_build_norm(ctx0, ffn_inp, hparams, - model.layers[il].ffn_norm, - model.layers[il].ffn_norm_b, - LLM_NORM, cb, il); + model.layers[il].ffn_norm, NULL, + LLM_NORM_RMS, cb, il); cb(cur, "ffn_norm", il); cur = llm_build_ffn(ctx0, cur, - model.layers[il].ffn_up, model.layers[il].ffn_up_b, - NULL, NULL, - model.layers[il].ffn_down, model.layers[il].ffn_down_b, + model.layers[il].ffn_up, NULL, + model.layers[il].ffn_gate, NULL, + model.layers[il].ffn_down, NULL, NULL, - LLM_FFN_RELU_SQR, LLM_FFN_SEQ, cb, il); + LLM_FFN_SILU, LLM_FFN_PAR, cb, il); cb(cur, "ffn_out", il); } cur = lm_ggml_add(ctx0, cur, ffn_inp); cb(cur, "l_out", il); + // input for next layer inpL = cur; } cur = inpL; cur = llm_build_norm(ctx0, cur, hparams, - model.output_norm, - model.output_norm_b, - LLM_NORM, cb, -1); + model.output_norm, NULL, + LLM_NORM_RMS, cb, -1); cb(cur, "result_norm", -1); + // lm_head cur = lm_ggml_mul_mat(ctx0, model.output, cur); cb(cur, "result_output", -1); @@ -8055,114 +8304,21 @@ struct llm_build_context { return gf; } - struct lm_ggml_cgraph * build_refact() { + struct lm_ggml_cgraph * build_bert() { struct lm_ggml_cgraph * gf = lm_ggml_new_graph_custom(ctx0, LLAMA_MAX_NODES, false); const int64_t n_embd_head = hparams.n_embd_head_v; + const int64_t n_embd_gqa = hparams.n_embd_v_gqa(); + LM_GGML_ASSERT(n_embd_head == hparams.n_embd_head_k); struct lm_ggml_tensor * cur; struct lm_ggml_tensor * inpL; + struct lm_ggml_tensor * inp_pos = nullptr; - inpL = llm_build_inp_embd(ctx0, lctx, hparams, batch, model.tok_embd, cb); - - // 
KQ_mask (mask for 1 head, it will be broadcasted to all heads) - struct lm_ggml_tensor * KQ_mask = build_inp_KQ_mask(); - - // positions of the tokens in the KV cache - struct lm_ggml_tensor * KQ_pos = build_inp_KQ_pos(); - - for (int il = 0; il < n_layer; ++il) { - struct lm_ggml_tensor * inpSA = inpL; - - cur = llm_build_norm(ctx0, inpL, hparams, - model.layers[il].attn_norm, NULL, - LLM_NORM_RMS, cb, il); - cb(cur, "attn_norm", il); - - // self-attention - { - struct lm_ggml_tensor * Qcur = lm_ggml_mul_mat(ctx0, model.layers[il].wq, cur); - cb(Qcur, "Qcur", il); - - struct lm_ggml_tensor * Kcur = lm_ggml_mul_mat(ctx0, model.layers[il].wk, cur); - cb(Kcur, "Kcur", il); - - struct lm_ggml_tensor * Vcur = lm_ggml_mul_mat(ctx0, model.layers[il].wv, cur); - cb(Vcur, "Vcur", il); - - Kcur = lm_ggml_reshape_3d(ctx0, Kcur, n_embd_head, n_head_kv, n_tokens); - cb(Kcur, "Kcur", il); - - Qcur = lm_ggml_reshape_3d(ctx0, Qcur, n_embd_head, n_head, n_tokens); - cb(Qcur, "Qcur", il); - - cur = llm_build_kv(ctx0, model, hparams, cparams, kv_self, gf, - model.layers[il].wo, NULL, - Kcur, Vcur, Qcur, KQ_mask, KQ_pos, n_tokens, kv_head, n_kv, 1.0f/sqrtf(float(n_embd_head)), cb, il); - } - - if (il == n_layer - 1) { - // skip computing output for unused tokens - struct lm_ggml_tensor * inp_out_ids = build_inp_out_ids(); - cur = lm_ggml_get_rows(ctx0, cur, inp_out_ids); - inpSA = lm_ggml_get_rows(ctx0, inpSA, inp_out_ids); - } - - struct lm_ggml_tensor * ffn_inp = lm_ggml_add(ctx0, cur, inpSA); - cb(ffn_inp, "ffn_inp", il); - - // feed-forward network - { - cur = llm_build_norm(ctx0, ffn_inp, hparams, - model.layers[il].ffn_norm, NULL, - LLM_NORM_RMS, cb, il); - cb(cur, "ffn_norm", il); - - cur = llm_build_ffn(ctx0, cur, - model.layers[il].ffn_up, NULL, - model.layers[il].ffn_gate, NULL, - model.layers[il].ffn_down, NULL, - NULL, - LLM_FFN_SILU, LLM_FFN_PAR, cb, il); - cb(cur, "ffn_out", il); - } - - cur = lm_ggml_add(ctx0, cur, ffn_inp); - cb(cur, "l_out", il); - - // input for 
next layer - inpL = cur; + if (model.arch != LLM_ARCH_JINA_BERT_V2) { + inp_pos = build_inp_pos(); } - - cur = inpL; - - cur = llm_build_norm(ctx0, cur, hparams, - model.output_norm, NULL, - LLM_NORM_RMS, cb, -1); - cb(cur, "result_norm", -1); - - // lm_head - cur = lm_ggml_mul_mat(ctx0, model.output, cur); - cb(cur, "result_output", -1); - - lm_ggml_build_forward_expand(gf, cur); - - return gf; - } - - struct lm_ggml_cgraph * build_bert() { - struct lm_ggml_cgraph * gf = lm_ggml_new_graph_custom(ctx0, LLAMA_MAX_NODES, false); - - const int64_t n_embd_head = hparams.n_embd_head_v; - const int64_t n_embd_gqa = hparams.n_embd_v_gqa(); - - LM_GGML_ASSERT(n_embd_head == hparams.n_embd_head_k); - - struct lm_ggml_tensor * cur; - struct lm_ggml_tensor * inpL; - - struct lm_ggml_tensor * inp_pos = build_inp_pos(); struct lm_ggml_tensor * inp_mean = build_inp_mean(); struct lm_ggml_tensor * inp_cls = build_inp_cls(); @@ -8193,13 +8349,26 @@ struct llm_build_context { struct lm_ggml_tensor * Vcur; // self-attention - if (model.arch == LLM_ARCH_BERT) { + if (model.arch == LLM_ARCH_BERT || model.arch == LLM_ARCH_JINA_BERT_V2) { Qcur = lm_ggml_add(ctx0, lm_ggml_mul_mat(ctx0, model.layers[il].wq, cur), model.layers[il].bq); cb(Qcur, "Qcur", il); + if (model.layers[il].attn_q_norm) { + Qcur = llm_build_norm(ctx0, Qcur, hparams, + model.layers[il].attn_q_norm, + model.layers[il].attn_q_norm_b, + LLM_NORM, cb, il); + } + Kcur = lm_ggml_add(ctx0, lm_ggml_mul_mat(ctx0, model.layers[il].wk, cur), model.layers[il].bk); cb(Kcur, "Kcur", il); + if (model.layers[il].attn_k_norm) { + Kcur = llm_build_norm(ctx0, Kcur, hparams, + model.layers[il].attn_k_norm, + model.layers[il].attn_k_norm_b, + LLM_NORM, cb, il); + } Vcur = lm_ggml_add(ctx0, lm_ggml_mul_mat(ctx0, model.layers[il].wv, cur), model.layers[il].bv); cb(Vcur, "Vcur", il); @@ -8218,15 +8387,15 @@ struct llm_build_context { cb(Kcur, "Kcur", il); cb(Vcur, "Vcur", il); - Qcur = lm_ggml_rope_custom( - ctx0, lm_ggml_reshape_3d(ctx0, 
Qcur, n_embd_head, n_head, n_tokens), inp_pos, + Qcur = lm_ggml_rope_ext( + ctx0, lm_ggml_reshape_3d(ctx0, Qcur, n_embd_head, n_head, n_tokens), inp_pos, nullptr, n_rot, rope_type, 0, n_orig_ctx, freq_base, freq_scale, ext_factor, attn_factor, beta_fast, beta_slow ); cb(Qcur, "Qcur", il); - Kcur = lm_ggml_rope_custom( - ctx0, lm_ggml_reshape_3d(ctx0, Kcur, n_embd_head, n_head_kv, n_tokens), inp_pos, + Kcur = lm_ggml_rope_ext( + ctx0, lm_ggml_reshape_3d(ctx0, Kcur, n_embd_head, n_head_kv, n_tokens), inp_pos, nullptr, n_rot, rope_type, 0, n_orig_ctx, freq_base, freq_scale, ext_factor, attn_factor, beta_fast, beta_slow ); @@ -8239,7 +8408,7 @@ struct llm_build_context { struct lm_ggml_tensor * kq = lm_ggml_mul_mat(ctx0, k, q); cb(kq, "kq", il); - kq = lm_ggml_soft_max_ext(ctx0, kq, KQ_mask, nullptr, 1.0f/sqrtf(float(n_embd_head)), hparams.f_max_alibi_bias); + kq = lm_ggml_soft_max_ext(ctx0, kq, KQ_mask, 1.0f/sqrtf(float(n_embd_head)), hparams.f_max_alibi_bias); cb(kq, "kq_soft_max_ext", il); struct lm_ggml_tensor * v = lm_ggml_cont(ctx0, lm_ggml_transpose(ctx0, lm_ggml_reshape_2d(ctx0, Vcur, n_embd_gqa, n_tokens))); @@ -8290,6 +8459,13 @@ struct llm_build_context { model.layers[il].ffn_down, model.layers[il].ffn_down_b, NULL, LLM_FFN_GELU, LLM_FFN_SEQ, cb, il); + } else if (model.arch == LLM_ARCH_JINA_BERT_V2) { + cur = llm_build_ffn(ctx0, cur, + model.layers[il].ffn_up, NULL, + model.layers[il].ffn_gate, NULL, + model.layers[il].ffn_down, model.layers[il].ffn_down_b, + NULL, + LLM_FFN_GELU, LLM_FFN_PAR, cb, il); } else { cur = llm_build_ffn(ctx0, cur, model.layers[il].ffn_up, NULL, @@ -8356,9 +8532,6 @@ struct llm_build_context { // KQ_mask (mask for 1 head, it will be broadcasted to all heads) struct lm_ggml_tensor * KQ_mask = build_inp_KQ_mask(); - // positions of the tokens in the KV cache - struct lm_ggml_tensor * KQ_pos = build_inp_KQ_pos(); - inpL = llm_build_norm(ctx0, inpL, hparams, model.tok_norm, model.tok_norm_b, @@ -8392,7 +8565,7 @@ struct 
llm_build_context { cur = llm_build_kv(ctx0, model, hparams, cparams, kv_self, gf, model.layers[il].wo, model.layers[il].bo, - Kcur, Vcur, Qcur, KQ_mask, KQ_pos, n_tokens, kv_head, n_kv, 1.0f/sqrtf(float(n_embd_head)), cb, il); + Kcur, Vcur, Qcur, KQ_mask, n_tokens, kv_head, n_kv, 1.0f/sqrtf(float(n_embd_head)), cb, il); } if (il == n_layer - 1) { @@ -8457,9 +8630,6 @@ struct llm_build_context { // KQ_mask (mask for 1 head, it will be broadcasted to all heads) struct lm_ggml_tensor * KQ_mask = build_inp_KQ_mask(); - // positions of the tokens in the KV cache - struct lm_ggml_tensor * KQ_pos = build_inp_KQ_pos(); - if (model.pos_embd) { // inp_pos - contains the positions struct lm_ggml_tensor * inp_pos = build_inp_pos(); @@ -8523,13 +8693,13 @@ struct llm_build_context { cur = llm_build_kv(ctx0, model, hparams, cparams, kv_self, gf, model.layers[il].wo, model.layers[il].bo, - Kcur, Vcur, Qcur, KQ_mask, nullptr, n_tokens, kv_head, n_kv, 1.0f/sqrtf(float(n_embd_head)), cb, il); + Kcur, Vcur, Qcur, KQ_mask, n_tokens, kv_head, n_kv, 1.0f/sqrtf(float(n_embd_head)), cb, il); } else { Qcur = lm_ggml_reshape_3d(ctx0, Qcur, n_embd_head, n_head, n_tokens); cur = llm_build_kv(ctx0, model, hparams, cparams, kv_self, gf, model.layers[il].wo, model.layers[il].bo, - Kcur, Vcur, Qcur, KQ_mask, KQ_pos, n_tokens, kv_head, n_kv, 1.0f/sqrtf(float(n_embd_head)), cb, il); + Kcur, Vcur, Qcur, KQ_mask, n_tokens, kv_head, n_kv, 1.0f/sqrtf(float(n_embd_head)), cb, il); } } @@ -8657,15 +8827,15 @@ struct llm_build_context { } - Qcur = lm_ggml_rope_custom( - ctx0, Qcur, inp_pos, + Qcur = lm_ggml_rope_ext( + ctx0, Qcur, inp_pos, nullptr, n_rot, rope_type, 0, n_orig_ctx, freq_base, freq_scale, ext_factor, attn_factor, beta_fast, beta_slow ); cb(Qcur, "Qcur", il); - Kcur = lm_ggml_rope_custom( - ctx0, Kcur, inp_pos, + Kcur = lm_ggml_rope_ext( + ctx0, Kcur, inp_pos, nullptr, n_rot, rope_type, 0, n_orig_ctx, freq_base, freq_scale, ext_factor, attn_factor, beta_fast, beta_slow ); @@ -8673,7 +8843,7 
@@ struct llm_build_context { cur = llm_build_kv(ctx0, model, hparams, cparams, kv_self, gf, model.layers[il].wo, NULL, - Kcur, Vcur, Qcur, KQ_mask, nullptr, n_tokens, kv_head, n_kv, 1.0f/sqrtf(float(n_embd_head)), cb, il); + Kcur, Vcur, Qcur, KQ_mask, n_tokens, kv_head, n_kv, 1.0f/sqrtf(float(n_embd_head)), cb, il); } if (il == n_layer - 1) { @@ -8777,21 +8947,21 @@ struct llm_build_context { Kcur = lm_ggml_reshape_3d(ctx0, Kcur, n_embd_head, n_head_kv, n_tokens); // using mode = 2 for neox mode - Qcur = lm_ggml_rope_custom( - ctx0, Qcur, inp_pos, n_rot, rope_type, 0, n_orig_ctx, + Qcur = lm_ggml_rope_ext( + ctx0, Qcur, inp_pos, nullptr, n_rot, rope_type, 0, n_orig_ctx, freq_base, freq_scale, ext_factor, attn_factor, beta_fast, beta_slow ); cb(Qcur, "Qcur", il); - Kcur = lm_ggml_rope_custom( - ctx0, Kcur, inp_pos, n_rot, rope_type, 0, n_orig_ctx, + Kcur = lm_ggml_rope_ext( + ctx0, Kcur, inp_pos, nullptr, n_rot, rope_type, 0, n_orig_ctx, freq_base, freq_scale, ext_factor, attn_factor, beta_fast, beta_slow ); cb(Kcur, "Kcur", il); cur = llm_build_kv(ctx0, model, hparams, cparams, kv_self, gf, model.layers[il].wo, NULL, - Kcur, Vcur, Qcur, KQ_mask, nullptr, n_tokens, kv_head, n_kv, 1.0f/sqrtf(float(n_embd_head)), cb, il); + Kcur, Vcur, Qcur, KQ_mask, n_tokens, kv_head, n_kv, 1.0f/sqrtf(float(n_embd_head)), cb, il); } if (il == n_layer - 1) { @@ -8888,15 +9058,15 @@ struct llm_build_context { Vcur = lm_ggml_add(ctx0, Vcur, model.layers[il].bv); cb(Vcur, "Vcur", il); - Qcur = lm_ggml_rope_custom( - ctx0, lm_ggml_reshape_3d(ctx0, Qcur, n_embd_head, n_head, n_tokens), inp_pos, + Qcur = lm_ggml_rope_ext( + ctx0, lm_ggml_reshape_3d(ctx0, Qcur, n_embd_head, n_head, n_tokens), inp_pos, nullptr, n_rot, rope_type, 0, n_orig_ctx, freq_base, freq_scale, ext_factor, attn_factor, beta_fast, beta_slow ); cb(Qcur, "Qcur", il); - Kcur = lm_ggml_rope_custom( - ctx0, lm_ggml_reshape_3d(ctx0, Kcur, n_embd_head, n_head_kv, n_tokens), inp_pos, + Kcur = lm_ggml_rope_ext( + ctx0, 
lm_ggml_reshape_3d(ctx0, Kcur, n_embd_head, n_head_kv, n_tokens), inp_pos, nullptr, n_rot, rope_type, 0, n_orig_ctx, freq_base, freq_scale, ext_factor, attn_factor, beta_fast, beta_slow ); @@ -8904,7 +9074,7 @@ struct llm_build_context { cur = llm_build_kv(ctx0, model, hparams, cparams, kv_self, gf, model.layers[il].wo, model.layers[il].bo, - Kcur, Vcur, Qcur, KQ_mask, nullptr, n_tokens, kv_head, n_kv, 1.0f/sqrtf(float(n_embd_head)), cb, il); + Kcur, Vcur, Qcur, KQ_mask, n_tokens, kv_head, n_kv, 1.0f/sqrtf(float(n_embd_head)), cb, il); } if (il == n_layer - 1) { @@ -9002,15 +9172,15 @@ struct llm_build_context { Vcur = lm_ggml_add(ctx0, Vcur, model.layers[il].bv); cb(Vcur, "Vcur", il); - Qcur = lm_ggml_rope_custom( - ctx0, lm_ggml_reshape_3d(ctx0, Qcur, n_embd_head, n_head, n_tokens), inp_pos, + Qcur = lm_ggml_rope_ext( + ctx0, lm_ggml_reshape_3d(ctx0, Qcur, n_embd_head, n_head, n_tokens), inp_pos, nullptr, n_rot, rope_type, 0, n_orig_ctx, freq_base, freq_scale, ext_factor, attn_factor, beta_fast, beta_slow ); cb(Qcur, "Qcur", il); - Kcur = lm_ggml_rope_custom( - ctx0, lm_ggml_reshape_3d(ctx0, Kcur, n_embd_head, n_head_kv, n_tokens), inp_pos, + Kcur = lm_ggml_rope_ext( + ctx0, lm_ggml_reshape_3d(ctx0, Kcur, n_embd_head, n_head_kv, n_tokens), inp_pos, nullptr, n_rot, rope_type, 0, n_orig_ctx, freq_base, freq_scale, ext_factor, attn_factor, beta_fast, beta_slow ); @@ -9018,7 +9188,7 @@ struct llm_build_context { cur = llm_build_kv(ctx0, model, hparams, cparams, kv_self, gf, model.layers[il].wo, model.layers[il].bo, - Kcur, Vcur, Qcur, KQ_mask, nullptr, n_tokens, kv_head, n_kv, 1.0f/sqrtf(float(n_embd_head)), cb, il); + Kcur, Vcur, Qcur, KQ_mask, n_tokens, kv_head, n_kv, 1.0f/sqrtf(float(n_embd_head)), cb, il); } if (il == n_layer - 1) { @@ -9046,6 +9216,7 @@ struct llm_build_context { model.layers[il].ffn_down_exps, n_expert, n_expert_used, LLM_FFN_SILU, false, + false, 0.0, cb, il); cb(cur, "ffn_moe_out", il); @@ -9154,8 +9325,8 @@ struct llm_build_context { Qcur = 
lm_ggml_reshape_3d(ctx0, Qcur, n_embd_head, n_head, n_tokens); Kcur = lm_ggml_reshape_3d(ctx0, Kcur, n_embd_head, n_head_kv, n_tokens); - Qcur = lm_ggml_rope_custom( - ctx0, Qcur, inp_pos, n_rot, rope_type, 0, n_orig_ctx, + Qcur = lm_ggml_rope_ext( + ctx0, Qcur, inp_pos, nullptr, n_rot, rope_type, 0, n_orig_ctx, freq_base, freq_scale, ext_factor, attn_factor, beta_fast, beta_slow ); cb(Qcur, "Qcur", il); @@ -9165,15 +9336,15 @@ struct llm_build_context { Qcur = lm_ggml_scale(ctx0, Qcur, 1.0f/sqrtf(float(n_embd_head))); cb(Qcur, "Qcur", il); - Kcur = lm_ggml_rope_custom( - ctx0, Kcur, inp_pos, n_rot, rope_type, 0, n_orig_ctx, + Kcur = lm_ggml_rope_ext( + ctx0, Kcur, inp_pos, nullptr, n_rot, rope_type, 0, n_orig_ctx, freq_base, freq_scale, ext_factor, attn_factor, beta_fast, beta_slow ); cb(Kcur, "Kcur", il); cur = llm_build_kv(ctx0, model, hparams, cparams, kv_self, gf, model.layers[il].wo, model.layers[il].bo, - Kcur, Vcur, Qcur, KQ_mask, nullptr, n_tokens, kv_head, n_kv, 1.0f, cb, il); + Kcur, Vcur, Qcur, KQ_mask, n_tokens, kv_head, n_kv, 1.0f, cb, il); } if (il == n_layer - 1) { @@ -9242,6 +9413,9 @@ struct llm_build_context { // self-attention { + // rope freq factors for 128k context + struct lm_ggml_tensor * rope_factors = build_rope_factors(il); + struct lm_ggml_tensor* attn_norm_output = llm_build_norm(ctx0, inpL, hparams, model.layers[il].attn_norm, NULL, @@ -9273,8 +9447,8 @@ struct llm_build_context { Qcur = lm_ggml_reshape_3d(ctx0, Qcur, n_embd_head, n_head, n_tokens); Kcur = lm_ggml_reshape_3d(ctx0, Kcur, n_embd_head, n_head_kv, n_tokens); - Qcur = lm_ggml_rope_custom( - ctx0, Qcur, inp_pos, n_rot, rope_type, 0, n_orig_ctx, + Qcur = lm_ggml_rope_ext( + ctx0, Qcur, inp_pos, rope_factors, n_rot, rope_type, 0, n_orig_ctx, freq_base, freq_scale, ext_factor, attn_factor, beta_fast, beta_slow ); cb(Qcur, "Qcur", il); @@ -9282,15 +9456,15 @@ struct llm_build_context { Qcur = lm_ggml_scale(ctx0, Qcur, 1.0f / sqrtf(float(n_embd_head))); cb(Qcur, "Qcur", il); - 
Kcur = lm_ggml_rope_custom( - ctx0, Kcur, inp_pos, n_rot, rope_type, 0, n_orig_ctx, + Kcur = lm_ggml_rope_ext( + ctx0, Kcur, inp_pos, rope_factors, n_rot, rope_type, 0, n_orig_ctx, freq_base, freq_scale, ext_factor, attn_factor, beta_fast, beta_slow ); cb(Kcur, "Kcur", il); cur = llm_build_kv(ctx0, model, hparams, cparams, kv_self, gf, model.layers[il].wo, model.layers[il].bo, - Kcur, Vcur, Qcur, KQ_mask, nullptr, n_tokens, kv_head, n_kv, 1.0f, cb, il); + Kcur, Vcur, Qcur, KQ_mask, n_tokens, kv_head, n_kv, 1.0f, cb, il); } if (il == n_layer - 1) { @@ -9389,21 +9563,21 @@ struct llm_build_context { struct lm_ggml_tensor * Vcur = lm_ggml_mul_mat(ctx0, model.layers[il].wv, cur); cb(Vcur, "Vcur", il); - Qcur = lm_ggml_rope_custom( - ctx0, lm_ggml_reshape_3d(ctx0, Qcur, n_rot, n_head, n_tokens), inp_pos, + Qcur = lm_ggml_rope_ext( + ctx0, lm_ggml_reshape_3d(ctx0, Qcur, n_rot, n_head, n_tokens), inp_pos, nullptr, n_embd_head, rope_type, 0, n_orig_ctx, freq_base, freq_scale, ext_factor, attn_factor, beta_fast, beta_slow); cb(Qcur, "Qcur", il); - Kcur = lm_ggml_rope_custom( - ctx0, lm_ggml_reshape_3d(ctx0, Kcur, n_rot, n_head_kv, n_tokens), inp_pos, + Kcur = lm_ggml_rope_ext( + ctx0, lm_ggml_reshape_3d(ctx0, Kcur, n_rot, n_head_kv, n_tokens), inp_pos, nullptr, n_embd_head, rope_type, 0, n_orig_ctx, freq_base, freq_scale, ext_factor, attn_factor, beta_fast, beta_slow); cb(Kcur, "Kcur", il); cur = llm_build_kv(ctx0, model, hparams, cparams, kv_self, gf, model.layers[il].wo, NULL, - Kcur, Vcur, Qcur, KQ_mask, nullptr, n_tokens, kv_head, n_kv, 1.0f/sqrtf(float(n_embd_head)), cb, il); + Kcur, Vcur, Qcur, KQ_mask, n_tokens, kv_head, n_kv, 1.0f/sqrtf(float(n_embd_head)), cb, il); } struct lm_ggml_tensor * sa_out = cur; @@ -9506,7 +9680,7 @@ struct llm_build_context { cur = llm_build_kv(ctx0, model, hparams, cparams, kv_self, gf, model.layers[il].wo, model.layers[il].bo, - Kcur, Vcur, Qcur, KQ_mask, nullptr, n_tokens, kv_head, n_kv, 1.0f/sqrtf(float(n_embd_head)), cb, il); + Kcur, 
Vcur, Qcur, KQ_mask, n_tokens, kv_head, n_kv, 1.0f/sqrtf(float(n_embd_head)), cb, il); } if (il == n_layer - 1) { @@ -9597,15 +9771,15 @@ struct llm_build_context { cb(tmpk, "tmpk", il); cb(Vcur, "Vcur", il); - struct lm_ggml_tensor * Qcur = lm_ggml_rope_custom( - ctx0, lm_ggml_reshape_3d(ctx0, tmpq, n_embd_head, n_head, n_tokens), inp_pos, + struct lm_ggml_tensor * Qcur = lm_ggml_rope_ext( + ctx0, lm_ggml_reshape_3d(ctx0, tmpq, n_embd_head, n_head, n_tokens), inp_pos, nullptr, n_rot, rope_type, 0, n_orig_ctx, freq_base, freq_scale, ext_factor, attn_factor, beta_fast, beta_slow ); cb(Qcur, "Qcur", il); - struct lm_ggml_tensor * Kcur = lm_ggml_rope_custom( - ctx0, lm_ggml_reshape_3d(ctx0, tmpk, n_embd_head, n_head_kv, n_tokens), inp_pos, + struct lm_ggml_tensor * Kcur = lm_ggml_rope_ext( + ctx0, lm_ggml_reshape_3d(ctx0, tmpk, n_embd_head, n_head_kv, n_tokens), inp_pos, nullptr, n_rot, rope_type, 0, n_orig_ctx, freq_base, freq_scale, ext_factor, attn_factor, beta_fast, beta_slow ); @@ -9613,7 +9787,7 @@ struct llm_build_context { cur = llm_build_kv(ctx0, model, hparams, cparams, kv_self, gf, model.layers[il].wo, model.layers[il].bo, - Kcur, Vcur, Qcur, KQ_mask, nullptr, n_tokens, kv_head, n_kv, 1.0f/sqrtf(float(n_embd_head)), cb, il); + Kcur, Vcur, Qcur, KQ_mask, n_tokens, kv_head, n_kv, 1.0f/sqrtf(float(n_embd_head)), cb, il); } if (il == n_layer - 1) { @@ -9713,15 +9887,15 @@ struct llm_build_context { // cb(Vcur, "Vcur", il); // } - Qcur = lm_ggml_rope_custom( - ctx0, lm_ggml_reshape_3d(ctx0, Qcur, n_embd_head, n_head, n_tokens), inp_pos, + Qcur = lm_ggml_rope_ext( + ctx0, lm_ggml_reshape_3d(ctx0, Qcur, n_embd_head, n_head, n_tokens), inp_pos, nullptr, n_rot, rope_type, 0, n_orig_ctx, freq_base, freq_scale, ext_factor, attn_factor, beta_fast, beta_slow ); cb(Qcur, "Qcur", il); - Kcur = lm_ggml_rope_custom( - ctx0, lm_ggml_reshape_3d(ctx0, Kcur, n_embd_head, n_head_kv, n_tokens), inp_pos, + Kcur = lm_ggml_rope_ext( + ctx0, lm_ggml_reshape_3d(ctx0, Kcur, 
n_embd_head, n_head_kv, n_tokens), inp_pos, nullptr, n_rot, rope_type, 0, n_orig_ctx, freq_base, freq_scale, ext_factor, attn_factor, beta_fast, beta_slow ); @@ -9729,7 +9903,7 @@ struct llm_build_context { cur = llm_build_kv(ctx0, model, hparams, cparams, kv_self, gf, model.layers[il].wo, NULL, - Kcur, Vcur, Qcur, KQ_mask, nullptr, n_tokens, kv_head, n_kv, 1.0f/sqrtf(float(n_embd_head)), cb, il); + Kcur, Vcur, Qcur, KQ_mask, n_tokens, kv_head, n_kv, 1.0f/sqrtf(float(n_embd_head)), cb, il); } if (il == n_layer - 1) { @@ -9830,15 +10004,15 @@ struct llm_build_context { cb(Vcur, "Vcur", il); } - Qcur = lm_ggml_rope_custom( - ctx0, lm_ggml_reshape_3d(ctx0, Qcur, n_embd_head, n_head, n_tokens), inp_pos, + Qcur = lm_ggml_rope_ext( + ctx0, lm_ggml_reshape_3d(ctx0, Qcur, n_embd_head, n_head, n_tokens), inp_pos, nullptr, n_rot, rope_type, 0, n_orig_ctx, freq_base, freq_scale, ext_factor, attn_factor, beta_fast, beta_slow ); cb(Qcur, "Qcur", il); - Kcur = lm_ggml_rope_custom( - ctx0, lm_ggml_reshape_3d(ctx0, Kcur, n_embd_head, n_head_kv, n_tokens), inp_pos, + Kcur = lm_ggml_rope_ext( + ctx0, lm_ggml_reshape_3d(ctx0, Kcur, n_embd_head, n_head_kv, n_tokens), inp_pos, nullptr, n_rot, rope_type, 0, n_orig_ctx, freq_base, freq_scale, ext_factor, attn_factor, beta_fast, beta_slow ); @@ -9846,7 +10020,7 @@ struct llm_build_context { cur = llm_build_kv(ctx0, model, hparams, cparams, kv_self, gf, model.layers[il].wo, model.layers[il].bo, - Kcur, Vcur, Qcur, KQ_mask, nullptr, n_tokens, kv_head, n_kv, 1.0f/sqrtf(float(n_embd_head)), cb, il); + Kcur, Vcur, Qcur, KQ_mask, n_tokens, kv_head, n_kv, 1.0f/sqrtf(float(n_embd_head)), cb, il); } if (il == n_layer - 1) { @@ -9960,15 +10134,15 @@ struct llm_build_context { cb(Vcur, "Vcur", il); } - Qcur = lm_ggml_rope_custom( - ctx0, lm_ggml_reshape_3d(ctx0, Qcur, n_embd_head, n_head, n_tokens), inp_pos, + Qcur = lm_ggml_rope_ext( + ctx0, lm_ggml_reshape_3d(ctx0, Qcur, n_embd_head, n_head, n_tokens), inp_pos, nullptr, n_rot, rope_type, 0, 
n_orig_ctx, freq_base, freq_scale, ext_factor, attn_factor, beta_fast, beta_slow ); cb(Qcur, "Qcur", il); - Kcur = lm_ggml_rope_custom( - ctx0, lm_ggml_reshape_3d(ctx0, Kcur, n_embd_head, n_head_kv, n_tokens), inp_pos, + Kcur = lm_ggml_rope_ext( + ctx0, lm_ggml_reshape_3d(ctx0, Kcur, n_embd_head, n_head_kv, n_tokens), inp_pos, nullptr, n_rot, rope_type, 0, n_orig_ctx, freq_base, freq_scale, ext_factor, attn_factor, beta_fast, beta_slow ); @@ -9976,7 +10150,7 @@ struct llm_build_context { cur = llm_build_kv(ctx0, model, hparams, cparams, kv_self, gf, model.layers[il].wo, model.layers[il].bo, - Kcur, Vcur, Qcur, KQ_mask, nullptr, n_tokens, kv_head, n_kv, 1.0f/sqrtf(float(n_embd_head)), cb, il); + Kcur, Vcur, Qcur, KQ_mask, n_tokens, kv_head, n_kv, 1.0f/sqrtf(float(n_embd_head)), cb, il); } if (il == n_layer - 1) { @@ -10080,8 +10254,8 @@ struct llm_build_context { struct lm_ggml_tensor * Vcur = lm_ggml_mul_mat(ctx0, model.layers[il].wv, cur); cb(Vcur, "Vcur", il); - Qcur = lm_ggml_rope_custom( - ctx0, lm_ggml_reshape_3d(ctx0, Qcur, n_embd_head_k, n_head, n_tokens), inp_pos, + Qcur = lm_ggml_rope_ext( + ctx0, lm_ggml_reshape_3d(ctx0, Qcur, n_embd_head_k, n_head, n_tokens), inp_pos, nullptr, n_embd_head_k, rope_type, 0, n_orig_ctx, freq_base, freq_scale, ext_factor, attn_factor, beta_fast, beta_slow); cb(Qcur, "Qcur", il); @@ -10089,15 +10263,15 @@ struct llm_build_context { Qcur = lm_ggml_scale(ctx0, Qcur, 1.0f / sqrtf(float(n_embd_head_k))); cb(Qcur, "Qcur_scaled", il); - Kcur = lm_ggml_rope_custom( - ctx0, lm_ggml_reshape_3d(ctx0, Kcur, n_embd_head_k, n_head_kv, n_tokens), inp_pos, + Kcur = lm_ggml_rope_ext( + ctx0, lm_ggml_reshape_3d(ctx0, Kcur, n_embd_head_k, n_head_kv, n_tokens), inp_pos, nullptr, n_embd_head_k, rope_type, 0, n_orig_ctx, freq_base, freq_scale, ext_factor, attn_factor, beta_fast, beta_slow); cb(Kcur, "Kcur", il); cur = llm_build_kv(ctx0, model, hparams, cparams, kv_self, gf, model.layers[il].wo, NULL, - Kcur, Vcur, Qcur, KQ_mask, nullptr, 
n_tokens, kv_head, n_kv, 1.0f, cb, il); + Kcur, Vcur, Qcur, KQ_mask, n_tokens, kv_head, n_kv, 1.0f, cb, il); } if (il == n_layer - 1) { @@ -10200,15 +10374,15 @@ struct llm_build_context { cb(Vcur, "Vcur", il); } - Qcur = lm_ggml_rope_custom( - ctx0, lm_ggml_reshape_3d(ctx0, Qcur, n_embd_head, n_head, n_tokens), inp_pos, + Qcur = lm_ggml_rope_ext( + ctx0, lm_ggml_reshape_3d(ctx0, Qcur, n_embd_head, n_head, n_tokens), inp_pos, nullptr, n_rot, rope_type, 0, n_orig_ctx, freq_base, freq_scale, ext_factor, attn_factor, beta_fast, beta_slow ); cb(Qcur, "Qcur", il); - Kcur = lm_ggml_rope_custom( - ctx0, lm_ggml_reshape_3d(ctx0, Kcur, n_embd_head, n_head_kv, n_tokens), inp_pos, + Kcur = lm_ggml_rope_ext( + ctx0, lm_ggml_reshape_3d(ctx0, Kcur, n_embd_head, n_head_kv, n_tokens), inp_pos, nullptr, n_rot, rope_type, 0, n_orig_ctx, freq_base, freq_scale, ext_factor, attn_factor, beta_fast, beta_slow ); @@ -10216,7 +10390,7 @@ struct llm_build_context { cur = llm_build_kv(ctx0, model, hparams, cparams, kv_self, gf, model.layers[il].wo, model.layers[il].bo, - Kcur, Vcur, Qcur, KQ_mask, nullptr, n_tokens, kv_head, n_kv, 1.0f/sqrtf(float(n_embd_head)), cb, il); + Kcur, Vcur, Qcur, KQ_mask, n_tokens, kv_head, n_kv, 1.0f/sqrtf(float(n_embd_head)), cb, il); } if (il == n_layer - 1) { @@ -10490,15 +10664,15 @@ struct llm_build_context { cb(Kcur, "Kcur", il); } - Qcur = lm_ggml_rope_custom( - ctx0, lm_ggml_reshape_3d(ctx0, Qcur, n_embd_head, n_head, n_tokens), inp_pos, + Qcur = lm_ggml_rope_ext( + ctx0, lm_ggml_reshape_3d(ctx0, Qcur, n_embd_head, n_head, n_tokens), inp_pos, nullptr, n_rot, rope_type, 0, n_orig_ctx, freq_base, freq_scale, ext_factor, attn_factor, beta_fast, beta_slow ); cb(Qcur, "Qcur", il); - Kcur = lm_ggml_rope_custom( - ctx0, lm_ggml_reshape_3d(ctx0, Kcur, n_embd_head, n_head_kv, n_tokens), inp_pos, + Kcur = lm_ggml_rope_ext( + ctx0, lm_ggml_reshape_3d(ctx0, Kcur, n_embd_head, n_head_kv, n_tokens), inp_pos, nullptr, n_rot, rope_type, 0, n_orig_ctx, freq_base, 
freq_scale, ext_factor, attn_factor, beta_fast, beta_slow ); @@ -10506,7 +10680,7 @@ struct llm_build_context { cur = llm_build_kv(ctx0, model, hparams, cparams, kv_self, gf, model.layers[il].wo, model.layers[il].bo, - Kcur, Vcur, Qcur, KQ_mask, nullptr, n_tokens, kv_head, n_kv, 1.0f/sqrtf(float(n_embd_head)), cb, il); + Kcur, Vcur, Qcur, KQ_mask, n_tokens, kv_head, n_kv, 1.0f/sqrtf(float(n_embd_head)), cb, il); } if (il == n_layer - 1) { @@ -10593,51 +10767,501 @@ struct llm_build_context { // norm cur = llm_build_norm(ctx0, inpL, hparams, - NULL, NULL, - LLM_NORM, cb, il); + NULL, NULL, + LLM_NORM, cb, il); + cb(cur, "attn_norm", il); + + // self-attention + { + // compute Q and K and RoPE them + struct lm_ggml_tensor * Qcur = lm_ggml_mul_mat(ctx0, model.layers[il].wq, cur); + cb(Qcur, "Qcur", il); + if (hparams.f_clamp_kqv > 0.0f) { + Qcur = lm_ggml_clamp(ctx0, Qcur, -hparams.f_clamp_kqv, hparams.f_clamp_kqv); + cb(Qcur, "Qcur", il); + } + + struct lm_ggml_tensor * Kcur = lm_ggml_mul_mat(ctx0, model.layers[il].wk, cur); + cb(Kcur, "Kcur", il); + if (hparams.f_clamp_kqv > 0.0f) { + Kcur = lm_ggml_clamp(ctx0, Kcur, -hparams.f_clamp_kqv, hparams.f_clamp_kqv); + cb(Kcur, "Kcur", il); + } + + struct lm_ggml_tensor * Vcur = lm_ggml_mul_mat(ctx0, model.layers[il].wv, cur); + cb(Vcur, "Vcur", il); + if (hparams.f_clamp_kqv > 0.0f) { + Vcur = lm_ggml_clamp(ctx0, Vcur, -hparams.f_clamp_kqv, hparams.f_clamp_kqv); + cb(Vcur, "Vcur", il); + } + + Qcur = lm_ggml_rope_ext( + ctx0, lm_ggml_reshape_3d(ctx0, Qcur, n_embd_head, n_head, n_tokens), inp_pos, nullptr, + n_rot, rope_type, 0, n_orig_ctx, freq_base, freq_scale, + ext_factor, attn_factor, beta_fast, beta_slow + ); + cb(Qcur, "Qcur", il); + + Kcur = lm_ggml_rope_ext( + ctx0, lm_ggml_reshape_3d(ctx0, Kcur, n_embd_head, n_head_kv, n_tokens), inp_pos, nullptr, + n_rot, rope_type, 0, n_orig_ctx, freq_base, freq_scale, + ext_factor, attn_factor, beta_fast, beta_slow + ); + cb(Kcur, "Kcur", il); + + cur = llm_build_kv(ctx0, 
model, hparams, cparams, kv_self, gf, + model.layers[il].wo, nullptr, + Kcur, Vcur, Qcur, KQ_mask, n_tokens, kv_head, n_kv, 1.0f/sqrtf(float(n_embd_head)), cb, il); + } + + if (il == n_layer - 1) { + // skip computing output for unused tokens + struct lm_ggml_tensor * inp_out_ids = build_inp_out_ids(); + n_tokens = n_outputs; + cur = lm_ggml_get_rows(ctx0, cur, inp_out_ids); + inpSA = lm_ggml_get_rows(ctx0, inpSA, inp_out_ids); + } + + struct lm_ggml_tensor * ffn_inp = lm_ggml_add(ctx0, cur, inpSA); + cb(ffn_inp, "ffn_inp", il); + + // feed-forward network + cur = llm_build_norm(ctx0, ffn_inp, hparams, + NULL, NULL, + LLM_NORM, cb, il); + cb(cur, "ffn_norm", il); + + cur = llm_build_ffn(ctx0, cur, + model.layers[il].ffn_up, NULL, + model.layers[il].ffn_gate, NULL, + model.layers[il].ffn_down, NULL, + NULL, + LLM_FFN_SILU, LLM_FFN_PAR, cb, il); + cb(cur, "ffn_out", il); + + cur = lm_ggml_add(ctx0, cur, ffn_inp); + cb(cur, "ffn_out", il); + + lm_ggml_tensor * layer_dir = lctx.cvec.tensor_for(il); + if (layer_dir != nullptr) { + cur = lm_ggml_add(ctx0, cur, layer_dir); + } + cb(cur, "l_out", il); + + // input for next layer + inpL = cur; + } + + cur = inpL; + + cur = llm_build_norm(ctx0, cur, hparams, + NULL, NULL, + LLM_NORM, cb, -1); + cb(cur, "result_norm", -1); + + // lm_head + cur = lm_ggml_mul_mat(ctx0, model.output, cur); + cb(cur, "result_output", -1); + + lm_ggml_build_forward_expand(gf, cur); + + return gf; + } + + struct lm_ggml_cgraph * build_gptneox() { + struct lm_ggml_cgraph * gf = lm_ggml_new_graph_custom(ctx0, LLAMA_MAX_NODES, false); + + const int64_t n_embd_head = hparams.n_embd_head_v; + const int64_t n_embd_gqa = hparams.n_embd_v_gqa(); + LM_GGML_ASSERT(n_embd_head == hparams.n_embd_head_k); + + struct lm_ggml_tensor * cur; + struct lm_ggml_tensor * inpL; + + inpL = llm_build_inp_embd(ctx0, lctx, hparams, batch, model.tok_embd, cb); + + // inp_pos - contains the positions + struct lm_ggml_tensor * inp_pos = build_inp_pos(); + + // KQ_mask (mask 
for 1 head, it will be broadcasted to all heads) + struct lm_ggml_tensor * KQ_mask = build_inp_KQ_mask(); + + for (int il = 0; il < n_layer; ++il) { + cur = llm_build_norm(ctx0, inpL, hparams, + model.layers[il].attn_norm, + model.layers[il].attn_norm_b, + LLM_NORM, cb, il); + cb(cur, "attn_norm", il); + + // self-attention + { + cur = lm_ggml_mul_mat(ctx0, model.layers[il].wqkv, cur); + cb(cur, "wqkv", il); + + cur = lm_ggml_add(ctx0, cur, model.layers[il].bqkv); + cb(cur, "bqkv", il); + + struct lm_ggml_tensor * Qcur = lm_ggml_cont(ctx0, lm_ggml_view_2d(ctx0, cur, n_embd, n_tokens, cur->nb[1], 0*sizeof(float)*(n_embd))); + struct lm_ggml_tensor * Kcur = lm_ggml_cont(ctx0, lm_ggml_view_2d(ctx0, cur, n_embd_gqa, n_tokens, cur->nb[1], 1*sizeof(float)*(n_embd))); + struct lm_ggml_tensor * Vcur = lm_ggml_cont(ctx0, lm_ggml_view_2d(ctx0, cur, n_embd_gqa, n_tokens, cur->nb[1], 1*sizeof(float)*(n_embd + n_embd_gqa))); + + cb(Qcur, "Qcur", il); + cb(Kcur, "Kcur", il); + cb(Vcur, "Vcur", il); + + Qcur = lm_ggml_rope_ext( + ctx0, lm_ggml_reshape_3d(ctx0, Qcur, n_embd_head, n_head, n_tokens), inp_pos, nullptr, + n_rot, rope_type, 0, n_orig_ctx, freq_base, freq_scale, + ext_factor, attn_factor, beta_fast, beta_slow + ); + cb(Qcur, "Qcur", il); + + Kcur = lm_ggml_rope_ext( + ctx0, lm_ggml_reshape_3d(ctx0, Kcur, n_embd_head, n_head_kv, n_tokens), inp_pos, nullptr, + n_rot, rope_type, 0, n_orig_ctx, freq_base, freq_scale, + ext_factor, attn_factor, beta_fast, beta_slow + ); + cb(Kcur, "Kcur", il); + + cur = llm_build_kv(ctx0, model, hparams, cparams, kv_self, gf, + model.layers[il].wo, model.layers[il].bo, + Kcur, Vcur, Qcur, KQ_mask, n_tokens, kv_head, n_kv, 1.0f/sqrtf(float(n_embd_head)), cb, il); + } + + if (il == n_layer - 1) { + // skip computing output for unused tokens + struct lm_ggml_tensor * inp_out_ids = build_inp_out_ids(); + cur = lm_ggml_get_rows(ctx0, cur, inp_out_ids); + inpL = lm_ggml_get_rows(ctx0, inpL, inp_out_ids); + } + + // ffn + if (hparams.use_par_res) { 
+ // attention and ffn are computed in parallel + // x = x + attn(ln1(x)) + ffn(ln2(x)) + + struct lm_ggml_tensor * attn_out = cur; + + cur = llm_build_norm(ctx0, inpL, hparams, + model.layers[il].ffn_norm, + model.layers[il].ffn_norm_b, + LLM_NORM, cb, il); + cb(cur, "ffn_norm", il); + + cur = llm_build_ffn(ctx0, cur, + model.layers[il].ffn_up, model.layers[il].ffn_up_b, + NULL, NULL, + model.layers[il].ffn_down, model.layers[il].ffn_down_b, + NULL, + LLM_FFN_GELU, LLM_FFN_SEQ, cb, il); + cb(cur, "ffn_out", il); + + cur = lm_ggml_add(ctx0, cur, inpL); + cb(cur, "ffn_out", il); + + inpL = lm_ggml_add(ctx0, cur, attn_out); + cb(inpL, "l_out", il); + } else { + // attention and ffn are computed sequentially + // x = x + attn(ln1(x)) + // x = x + ffn(ln2(x)) + + struct lm_ggml_tensor * ffn_inp = lm_ggml_add(ctx0, cur, inpL); + cb(ffn_inp, "ffn_inp", il); + + cur = llm_build_norm(ctx0, ffn_inp, hparams, + model.layers[il].ffn_norm, + model.layers[il].ffn_norm_b, + LLM_NORM, cb, il); + cb(cur, "ffn_norm", il); + + cur = llm_build_ffn(ctx0, cur, + model.layers[il].ffn_up, model.layers[il].ffn_up_b, + NULL, NULL, + model.layers[il].ffn_down, model.layers[il].ffn_down_b, + NULL, + LLM_FFN_GELU, LLM_FFN_SEQ, cb, il); + cb(cur, "ffn_out", il); + + inpL = lm_ggml_add(ctx0, cur, ffn_inp); + cb(inpL, "l_out", il); + } + } + + cur = llm_build_norm(ctx0, inpL, hparams, + model.output_norm, + model.output_norm_b, + LLM_NORM, cb, -1); + cb(cur, "result_norm", -1); + + cur = lm_ggml_mul_mat(ctx0, model.output, cur); + cb(cur, "result_output", -1); + + lm_ggml_build_forward_expand(gf, cur); + + return gf; + } + + struct lm_ggml_cgraph * build_arctic() { + struct lm_ggml_cgraph * gf = lm_ggml_new_graph_custom(ctx0, LLAMA_MAX_NODES, false); + + // mutable variable, needed during the last layer of the computation to skip unused tokens + int32_t n_tokens = this->n_tokens; + + const int64_t n_embd_head = hparams.n_embd_head_v; + LM_GGML_ASSERT(n_embd_head == hparams.n_embd_head_k); + 
LM_GGML_ASSERT(n_embd_head == hparams.n_rot); + + struct lm_ggml_tensor * cur; + struct lm_ggml_tensor * inpL; + + inpL = llm_build_inp_embd(ctx0, lctx, hparams, batch, model.tok_embd, cb); + + // inp_pos - contains the positions + struct lm_ggml_tensor * inp_pos = build_inp_pos(); + + // KQ_mask (mask for 1 head, it will be broadcasted to all heads) + struct lm_ggml_tensor * KQ_mask = build_inp_KQ_mask(); + + for (int il = 0; il < n_layer; ++il) { + struct lm_ggml_tensor * inpSA = inpL; + + // norm + cur = llm_build_norm(ctx0, inpL, hparams, + model.layers[il].attn_norm, NULL, + LLM_NORM_RMS, cb, il); + cb(cur, "attn_norm", il); + + // self-attention + { + // compute Q and K and RoPE them + struct lm_ggml_tensor * Qcur = lm_ggml_mul_mat(ctx0, model.layers[il].wq, cur); + cb(Qcur, "Qcur", il); + + struct lm_ggml_tensor * Kcur = lm_ggml_mul_mat(ctx0, model.layers[il].wk, cur); + cb(Kcur, "Kcur", il); + + struct lm_ggml_tensor * Vcur = lm_ggml_mul_mat(ctx0, model.layers[il].wv, cur); + cb(Vcur, "Vcur", il); + + Qcur = lm_ggml_rope_ext( + ctx0, lm_ggml_reshape_3d(ctx0, Qcur, n_embd_head, n_head, n_tokens), inp_pos, nullptr, + n_rot, rope_type, 0, n_orig_ctx, freq_base, freq_scale, + ext_factor, attn_factor, beta_fast, beta_slow + ); + cb(Qcur, "Qcur", il); + + Kcur = lm_ggml_rope_ext( + ctx0, lm_ggml_reshape_3d(ctx0, Kcur, n_embd_head, n_head_kv, n_tokens), inp_pos, nullptr, + n_rot, rope_type, 0, n_orig_ctx, freq_base, freq_scale, + ext_factor, attn_factor, beta_fast, beta_slow + ); + cb(Kcur, "Kcur", il); + + cur = llm_build_kv(ctx0, model, hparams, cparams, kv_self, gf, + model.layers[il].wo, NULL, + Kcur, Vcur, Qcur, KQ_mask, n_tokens, kv_head, n_kv, 1.0f/sqrtf(float(n_embd_head)), cb, il); + } + + if (il == n_layer - 1) { + // skip computing output for unused tokens + struct lm_ggml_tensor * inp_out_ids = build_inp_out_ids(); + n_tokens = n_outputs; + cur = lm_ggml_get_rows(ctx0, cur, inp_out_ids); + inpSA = lm_ggml_get_rows(ctx0, inpSA, inp_out_ids); + } + + 
struct lm_ggml_tensor * ffn_inp = lm_ggml_add(ctx0, cur, inpSA); + cb(ffn_inp, "ffn_inp", il); + + // feed-forward network + cur = llm_build_norm(ctx0, ffn_inp, hparams, + model.layers[il].ffn_norm, NULL, + LLM_NORM_RMS, cb, il); + cb(cur, "ffn_norm", il); + + cur = llm_build_ffn(ctx0, cur, + model.layers[il].ffn_up, NULL, + model.layers[il].ffn_gate, NULL, + model.layers[il].ffn_down, NULL, + NULL, + LLM_FFN_SILU, LLM_FFN_PAR, cb, il); + cb(cur, "ffn_out", il); + + struct lm_ggml_tensor * ffn_out = lm_ggml_add(ctx0, cur, ffn_inp); + cb(ffn_out, "ffn_out", il); + + // MoE + cur = llm_build_norm(ctx0, inpSA, hparams, + model.layers[il].ffn_norm_exps, NULL, + LLM_NORM_RMS, cb, il); + cb(cur, "ffn_norm_exps", il); + + cur = llm_build_moe_ffn(ctx0, cur, + model.layers[il].ffn_gate_inp, + model.layers[il].ffn_up_exps, + model.layers[il].ffn_gate_exps, + model.layers[il].ffn_down_exps, + n_expert, n_expert_used, + LLM_FFN_SILU, true, + false, 0.0, + cb, il); + cb(cur, "ffn_moe_out", il); + + cur = lm_ggml_add(ctx0, cur, ffn_out); + cb(cur, "ffn_out", il); + + lm_ggml_tensor * layer_dir = lctx.cvec.tensor_for(il); + if (layer_dir != nullptr) { + cur = lm_ggml_add(ctx0, cur, layer_dir); + } + cb(cur, "l_out", il); + + // input for next layer + inpL = cur; + } + + cur = inpL; + + cur = llm_build_norm(ctx0, cur, hparams, + model.output_norm, NULL, + LLM_NORM_RMS, cb, -1); + cb(cur, "result_norm", -1); + + // lm_head + cur = lm_ggml_mul_mat(ctx0, model.output, cur); + cb(cur, "result_output", -1); + + lm_ggml_build_forward_expand(gf, cur); + + return gf; + } + + struct lm_ggml_cgraph * build_deepseek2() { + struct lm_ggml_cgraph * gf = lm_ggml_new_graph_custom(ctx0, LLAMA_MAX_NODES, false); + + // mutable variable, needed during the last layer of the computation to skip unused tokens + int32_t n_tokens = this->n_tokens; + + bool is_lite = (hparams.n_layer == 27); + + // We have to pre-scale kq_scale and attn_factor to make the YaRN RoPE work correctly. 
+ // See https://github.com/ggerganov/llama.cpp/discussions/7416 for detailed explanation. + const float mscale = attn_factor * (1.0f + hparams.rope_yarn_log_mul * logf(1.0f / freq_scale)); + const float kq_scale = 1.0f*mscale*mscale/sqrtf(float(hparams.n_embd_head_k)); + const float attn_factor_scaled = 1.0f / (1.0f + 0.1f * logf(1.0f / freq_scale)); + + const uint32_t n_embd_head_qk_rope = hparams.n_rot; + const uint32_t n_embd_head_qk_nope = hparams.n_embd_head_k - hparams.n_rot; + const uint32_t kv_lora_rank = hparams.n_lora_kv; + + struct lm_ggml_tensor * cur; + struct lm_ggml_tensor * inpL; + + // {n_embd, n_tokens} + inpL = llm_build_inp_embd(ctx0, lctx, hparams, batch, model.tok_embd, cb); + + // inp_pos - contains the positions + struct lm_ggml_tensor * inp_pos = build_inp_pos(); + + // KQ_mask (mask for 1 head, it will be broadcasted to all heads) + struct lm_ggml_tensor * KQ_mask = build_inp_KQ_mask(); + + for (int il = 0; il < n_layer; ++il) { + struct lm_ggml_tensor * inpSA = inpL; + + // norm + cur = llm_build_norm(ctx0, inpL, hparams, + model.layers[il].attn_norm, NULL, + LLM_NORM_RMS, cb, il); cb(cur, "attn_norm", il); - // self-attention + // self_attention { - // compute Q and K and RoPE them - struct lm_ggml_tensor * Qcur = lm_ggml_mul_mat(ctx0, model.layers[il].wq, cur); - cb(Qcur, "Qcur", il); - if (hparams.f_clamp_kqv > 0.0f) { - Qcur = lm_ggml_clamp(ctx0, Qcur, -hparams.f_clamp_kqv, hparams.f_clamp_kqv); - cb(Qcur, "Qcur", il); + struct lm_ggml_tensor * q = NULL; + if (!is_lite) { + // {n_embd, q_lora_rank} * {n_embd, n_tokens} -> {q_lora_rank, n_tokens} + q = lm_ggml_mul_mat(ctx0, model.layers[il].wq_a, cur); + cb(q, "q", il); + + q = llm_build_norm(ctx0, q, hparams, + model.layers[il].attn_q_a_norm, NULL, + LLM_NORM_RMS, cb, il); + cb(q, "q", il); + + // {q_lora_rank, n_head * hparams.n_embd_head_k} * {q_lora_rank, n_tokens} -> {n_head * hparams.n_embd_head_k, n_tokens} + q = lm_ggml_mul_mat(ctx0, model.layers[il].wq_b, q); + cb(q, "q", 
il); + } else { + q = lm_ggml_mul_mat(ctx0, model.layers[il].wq, cur); + cb(q, "q", il); } - struct lm_ggml_tensor * Kcur = lm_ggml_mul_mat(ctx0, model.layers[il].wk, cur); - cb(Kcur, "Kcur", il); - if (hparams.f_clamp_kqv > 0.0f) { - Kcur = lm_ggml_clamp(ctx0, Kcur, -hparams.f_clamp_kqv, hparams.f_clamp_kqv); - cb(Kcur, "Kcur", il); - } + // split into {n_head * n_embd_head_qk_nope, n_tokens} + struct lm_ggml_tensor * q_nope = lm_ggml_view_3d(ctx0, q, n_embd_head_qk_nope, n_head, n_tokens, lm_ggml_element_size(q) * hparams.n_embd_head_k, lm_ggml_element_size(q) * hparams.n_embd_head_k * n_head, 0); + cb(q_nope, "q_nope", il); + // and {n_head * n_embd_head_qk_rope, n_tokens} + struct lm_ggml_tensor * q_pe = lm_ggml_view_3d(ctx0, q, n_embd_head_qk_rope, n_head, n_tokens, lm_ggml_element_size(q) * hparams.n_embd_head_k, lm_ggml_element_size(q) * hparams.n_embd_head_k * n_head, lm_ggml_element_size(q) * n_embd_head_qk_nope); + cb(q_pe, "q_pe", il); + + // {n_embd, kv_lora_rank + n_embd_head_qk_rope} * {n_embd, n_tokens} -> {kv_lora_rank + n_embd_head_qk_rope, n_tokens} + struct lm_ggml_tensor * compressed_kv_pe = lm_ggml_mul_mat(ctx0, model.layers[il].wkv_a_mqa, cur); + cb(compressed_kv_pe, "compressed_kv_pe", il); + + // split into {kv_lora_rank, n_tokens} + struct lm_ggml_tensor * compressed_kv = lm_ggml_view_2d(ctx0, compressed_kv_pe, kv_lora_rank, n_tokens, compressed_kv_pe->nb[1], 0); + cb(compressed_kv, "compressed_kv", il); + // and {n_embd_head_qk_rope, n_tokens} + struct lm_ggml_tensor * k_pe = lm_ggml_view_2d(ctx0, compressed_kv_pe, n_embd_head_qk_rope, n_tokens, compressed_kv_pe->nb[1], lm_ggml_element_size(compressed_kv_pe)*kv_lora_rank); + cb(k_pe, "k_pe", il); + + compressed_kv = llm_build_norm(ctx0, compressed_kv, hparams, + model.layers[il].attn_kv_a_norm, NULL, + LLM_NORM_RMS, cb, il); + cb(compressed_kv, "compressed_kv", il); - struct lm_ggml_tensor * Vcur = lm_ggml_mul_mat(ctx0, model.layers[il].wv, cur); - cb(Vcur, "Vcur", il); - if 
(hparams.f_clamp_kqv > 0.0f) { - Vcur = lm_ggml_clamp(ctx0, Vcur, -hparams.f_clamp_kqv, hparams.f_clamp_kqv); - cb(Vcur, "Vcur", il); - } + // {kv_lora_rank, n_head * (n_embd_head_qk_nope + n_embd_head_v)} * {kv_lora_rank, n_tokens} -> {n_head * (n_embd_head_qk_nope + n_embd_head_v), n_tokens} + struct lm_ggml_tensor * kv = lm_ggml_mul_mat(ctx0, model.layers[il].wkv_b, compressed_kv); + cb(kv, "kv", il); + + // split into {n_head * n_embd_head_qk_nope, n_tokens} + struct lm_ggml_tensor * k_nope = lm_ggml_view_3d(ctx0, kv, n_embd_head_qk_nope, n_head, n_tokens, lm_ggml_element_size(kv) * (n_embd_head_qk_nope + hparams.n_embd_head_v), lm_ggml_element_size(kv) * n_head * (n_embd_head_qk_nope + hparams.n_embd_head_v), 0); + cb(k_nope, "k_nope", il); - Qcur = lm_ggml_rope_custom( - ctx0, lm_ggml_reshape_3d(ctx0, Qcur, n_embd_head, n_head, n_tokens), inp_pos, + // and {n_head * n_embd_head_v, n_tokens} + struct lm_ggml_tensor * v_states = lm_ggml_view_3d(ctx0, kv, hparams.n_embd_head_v, n_head, n_tokens, lm_ggml_element_size(kv) * (n_embd_head_qk_nope + hparams.n_embd_head_v), lm_ggml_element_size(kv) * n_head * (n_embd_head_qk_nope + hparams.n_embd_head_v), lm_ggml_element_size(kv) * n_embd_head_qk_nope); + cb(v_states, "v_states", il); + + v_states = lm_ggml_cont(ctx0, v_states); + cb(v_states, "v_states", il); + + v_states = lm_ggml_view_2d(ctx0, v_states, hparams.n_embd_head_v * n_head, n_tokens, lm_ggml_element_size(kv) * hparams.n_embd_head_v * n_head, 0); + cb(v_states, "v_states", il); + + q_pe = lm_ggml_rope_ext( + ctx0, q_pe, inp_pos, nullptr, n_rot, rope_type, 0, n_orig_ctx, freq_base, freq_scale, - ext_factor, attn_factor, beta_fast, beta_slow + ext_factor, attn_factor_scaled, beta_fast, beta_slow ); - cb(Qcur, "Qcur", il); + cb(q_pe, "q_pe", il); - Kcur = lm_ggml_rope_custom( - ctx0, lm_ggml_reshape_3d(ctx0, Kcur, n_embd_head, n_head_kv, n_tokens), inp_pos, + // shared RoPE key + k_pe = lm_ggml_rope_ext( + ctx0, lm_ggml_view_3d(ctx0, k_pe, 
n_embd_head_qk_rope, 1, n_tokens, k_pe->nb[0], k_pe->nb[1], 0), inp_pos, nullptr, n_rot, rope_type, 0, n_orig_ctx, freq_base, freq_scale, - ext_factor, attn_factor, beta_fast, beta_slow + ext_factor, attn_factor_scaled, beta_fast, beta_slow ); - cb(Kcur, "Kcur", il); + cb(k_pe, "k_pe", il); + + struct lm_ggml_tensor * q_states = lm_ggml_concat(ctx0, q_nope, q_pe, 0); + cb(q_states, "q_states", il); + + struct lm_ggml_tensor * k_states = lm_ggml_concat(ctx0, k_nope, lm_ggml_repeat(ctx0, k_pe, q_pe), 0); + cb(k_states, "k_states", il); cur = llm_build_kv(ctx0, model, hparams, cparams, kv_self, gf, - model.layers[il].wo, nullptr, - Kcur, Vcur, Qcur, KQ_mask, nullptr, n_tokens, kv_head, n_kv, 1.0f/sqrtf(float(n_embd_head)), cb, il); + model.layers[il].wo, NULL, + k_states, v_states, q_states, KQ_mask, n_tokens, kv_head, n_kv, kq_scale, cb, il); } if (il == n_layer - 1) { @@ -10651,27 +11275,54 @@ struct llm_build_context { struct lm_ggml_tensor * ffn_inp = lm_ggml_add(ctx0, cur, inpSA); cb(ffn_inp, "ffn_inp", il); - // feed-forward network - cur = llm_build_norm(ctx0, ffn_inp, hparams, - NULL, NULL, - LLM_NORM, cb, il); - cb(cur, "ffn_norm", il); + if ((uint32_t) il < hparams.n_layer_dense_lead) { + cur = llm_build_norm(ctx0, ffn_inp, hparams, + model.layers[il].ffn_norm, NULL, + LLM_NORM_RMS, cb, il); + cb(cur, "ffn_norm", il); - cur = llm_build_ffn(ctx0, cur, - model.layers[il].ffn_up, NULL, - model.layers[il].ffn_gate, NULL, - model.layers[il].ffn_down, NULL, - NULL, - LLM_FFN_SILU, LLM_FFN_PAR, cb, il); - cb(cur, "ffn_out", il); + cur = llm_build_ffn(ctx0, cur, + model.layers[il].ffn_up, NULL, + model.layers[il].ffn_gate, NULL, + model.layers[il].ffn_down, NULL, + NULL, + LLM_FFN_SILU, LLM_FFN_PAR, cb, il); + cb(cur, "ffn_out", il); + } else { + // MoE branch + cur = llm_build_norm(ctx0, ffn_inp, hparams, + model.layers[il].ffn_norm, NULL, + LLM_NORM_RMS, cb, il); + cb(cur, "ffn_norm", il); - cur = lm_ggml_add(ctx0, cur, ffn_inp); - cb(cur, "ffn_out", il); + 
lm_ggml_tensor * moe_out = + llm_build_moe_ffn(ctx0, cur, + model.layers[il].ffn_gate_inp, + model.layers[il].ffn_up_exps, + model.layers[il].ffn_gate_exps, + model.layers[il].ffn_down_exps, + n_expert, n_expert_used, + LLM_FFN_SILU, false, + true, hparams.expert_weights_scale, + cb, il); + cb(moe_out, "ffn_moe_out", il); + + // FFN shared expert + { + lm_ggml_tensor * ffn_shexp = llm_build_ffn(ctx0, cur, + model.layers[il].ffn_up_shexp, NULL, + model.layers[il].ffn_gate_shexp, NULL, + model.layers[il].ffn_down_shexp, NULL, + NULL, + LLM_FFN_SILU, LLM_FFN_PAR, cb, il); + cb(ffn_shexp, "ffn_shexp", il); - lm_ggml_tensor * layer_dir = lctx.cvec.tensor_for(il); - if (layer_dir != nullptr) { - cur = lm_ggml_add(ctx0, cur, layer_dir); + cur = lm_ggml_add(ctx0, moe_out, ffn_shexp); + cb(cur, "ffn_out", il); + } } + + cur = lm_ggml_add(ctx0, cur, ffn_inp); cb(cur, "l_out", il); // input for next layer @@ -10681,8 +11332,8 @@ struct llm_build_context { cur = inpL; cur = llm_build_norm(ctx0, cur, hparams, - NULL, NULL, - LLM_NORM, cb, -1); + model.output_norm, NULL, + LLM_NORM_RMS, cb, -1); cb(cur, "result_norm", -1); // lm_head @@ -10693,6 +11344,7 @@ struct llm_build_context { return gf; } + }; static struct lm_ggml_cgraph * llama_build_graph_defrag(llama_context & lctx, const std::vector & ids) { @@ -10809,15 +11461,12 @@ static struct lm_ggml_cgraph * llama_build_graph( { result = llm.build_starcoder(); } break; - case LLM_ARCH_PERSIMMON: - { - result = llm.build_persimmon(); - } break; case LLM_ARCH_REFACT: { result = llm.build_refact(); } break; case LLM_ARCH_BERT: + case LLM_ARCH_JINA_BERT_V2: case LLM_ARCH_NOMIC_BERT: { result = llm.build_bert(); @@ -10906,6 +11555,18 @@ static struct lm_ggml_cgraph * llama_build_graph( { result = llm.build_olmo(); } break; + case LLM_ARCH_GPTNEOX: + { + result = llm.build_gptneox(); + } break; + case LLM_ARCH_ARCTIC: + { + result = llm.build_arctic(); + } break; + case LLM_ARCH_DEEPSEEK2: + { + result = llm.build_deepseek2(); + } 
break; default: LM_GGML_ASSERT(false); } @@ -11025,11 +11686,21 @@ static void llama_set_inputs(llama_context & lctx, const llama_batch & batch) { if (!lctx.kv_self.cells[i].has_seq_id(seq_id) || lctx.kv_self.cells[i].pos > pos) { f = -INFINITY; } else { - f = 0.0f; + if (hparams.use_alibi) { + f = -fabs(lctx.kv_self.cells[i].pos - pos); + } else { + f = 0.0f; + } } data[h*(n_kv*n_tokens) + j*n_kv + i] = f; } } + + for (int i = n_tokens; i < LM_GGML_PAD(n_tokens, LM_GGML_KQ_MASK_PAD); ++i) { + for (int j = 0; j < n_kv; ++j) { + data[h*(n_kv*n_tokens) + i*n_kv + j] = -INFINITY; + } + } } } else { // when using kv cache, the mask needs to match the kv cache size @@ -11048,7 +11719,11 @@ static void llama_set_inputs(llama_context & lctx, const llama_batch & batch) { float f = -INFINITY; for (int s = 0; s < batch.n_seq_id[i]; ++s) { if (batch.seq_id[i][s] == seq_id) { - f = 0.0f; + if (hparams.use_alibi) { + f = -fabs(batch.pos[i] - batch.pos[j]); + } else { + f = 0.0f; + } break; } } @@ -11064,21 +11739,6 @@ static void llama_set_inputs(llama_context & lctx, const llama_batch & batch) { } } - // ALiBi requires the KQ_pos tensor to provide the sequence position of each token in the batch - // this allows to process multiple sequences in parallel with ALiBi-based models - if (hparams.use_alibi) { - const int64_t n_kv = kv_self.n; - - LM_GGML_ASSERT(lctx.inp_KQ_pos); - LM_GGML_ASSERT(lm_ggml_backend_buffer_is_host(lctx.inp_KQ_pos->buffer)); - - float * data = (float *) lctx.inp_KQ_pos->data; - - for (int i = 0; i < n_kv; ++i) { - data[i] = float(lctx.kv_self.cells[i].pos); - } - } - if (cparams.pooling_type == LLAMA_POOLING_TYPE_MEAN) { const int64_t n_tokens = batch.n_tokens; @@ -11252,11 +11912,6 @@ static void llama_graph_compute( llama_context & lctx, lm_ggml_cgraph * gf, int n_threads) { -#ifdef LM_GGML_USE_MPI - const int64_t n_layer = lctx.model.hparams.n_layer; - lm_ggml_mpi_graph_compute_pre(lctx.ctx_mpi, gf, n_layer); -#endif - #ifdef LM_GGML_USE_METAL if 
(lm_ggml_backend_is_metal(lctx.backend_metal)) { lm_ggml_backend_metal_set_n_cb(lctx.backend_metal, n_threads); @@ -11271,10 +11926,6 @@ static void llama_graph_compute( lm_ggml_backend_sched_graph_compute_async(lctx.sched, gf); // fprintf(stderr, "splits: %d\n", lm_ggml_backend_sched_get_n_splits(lctx.sched)); - -#ifdef LM_GGML_USE_MPI - lm_ggml_mpi_graph_compute_post(lctx.ctx_mpi, gf, n_layer); -#endif } // decode a batch of tokens by evaluating the transformer @@ -11312,12 +11963,6 @@ static int llama_decode_internal( } lctx.n_queued_tokens += n_tokens_all; -#ifdef LM_GGML_USE_MPI - // TODO: needs fix after #3228 - LM_GGML_ASSERT(false && "not implemented"); - //lm_ggml_mpi_eval_init(lctx.ctx_mpi, &n_tokens, &n_past, &n_threads); -#endif - auto & kv_self = lctx.kv_self; const int64_t n_embd = hparams.n_embd; @@ -11448,7 +12093,8 @@ static int llama_decode_internal( // a heuristic, to avoid attending the full cache if it is not yet utilized // after enough generations, the benefit from this heuristic disappears // if we start defragmenting the cache, the benefit from this will be more important - kv_self.n = std::min(kv_self.size, std::max(256u, LM_GGML_PAD(llama_kv_cache_cell_max(kv_self), 256))); + const uint32_t pad = llama_kv_cache_get_padding(cparams); + kv_self.n = std::min(kv_self.size, std::max(pad, LM_GGML_PAD(llama_kv_cache_cell_max(kv_self), pad))); //kv_self.n = llama_kv_cache_cell_max(kv_self); } } @@ -11963,7 +12609,7 @@ static bool llama_is_user_defined_token(const llama_vocab& vocab, llama_token id static uint8_t llama_token_to_byte(const llama_vocab& vocab, llama_token id) { LM_GGML_ASSERT(llama_vocab_get_type(vocab) != LLAMA_VOCAB_TYPE_NONE); LM_GGML_ASSERT(llama_is_byte_token(vocab, id)); - const auto& token_data = vocab.id_to_token.at(id); + const auto & token_data = vocab.id_to_token.at(id); switch (llama_vocab_get_type(vocab)) { case LLAMA_VOCAB_TYPE_SPM: { auto buf = token_data.text.substr(3, 2); @@ -12193,12 +12839,14 @@ struct 
llm_tokenizer_bpe { void tokenize(const std::string & text, std::vector & output) { int final_prev_index = -1; + bool ignore_merges = false; std::vector word_collection; switch (vocab.type) { case LLAMA_VOCAB_TYPE_BPE: switch (vocab.type_pre) { case LLAMA_VOCAB_PRE_TYPE_LLAMA3: + ignore_merges = true; word_collection = unicode_regex_split(text, { // original regex from tokenizer.json //"(?i:'s|'t|'re|'ve|'m|'ll|'d)|[^\\r\\n\\p{L}\\p{N}]?\\p{L}+|\\p{N}{1,3}| ?[^\\s\\p{L}\\p{N}]+[\\r\\n]*|\\s*[\\r\\n]+|\\s+(?!\\S)|\\s+", @@ -12207,6 +12855,13 @@ struct llm_tokenizer_bpe { "(?:'[sS]|'[tT]|'[rR][eE]|'[vV][eE]|'[mM]|'[lL][lL]|'[dD])|[^\\r\\n\\p{L}\\p{N}]?\\p{L}+|\\p{N}{1,3}| ?[^\\s\\p{L}\\p{N}]+[\\r\\n]*|\\s*[\\r\\n]+|\\s+(?!\\S)|\\s+", }); break; + case LLAMA_VOCAB_PRE_TYPE_DBRX: + case LLAMA_VOCAB_PRE_TYPE_SMAUG: + word_collection = unicode_regex_split(text, { + // same as llama3 + "(?:'[sS]|'[tT]|'[rR][eE]|'[vV][eE]|'[mM]|'[lL][lL]|'[dD])|[^\\r\\n\\p{L}\\p{N}]?\\p{L}+|\\p{N}{1,3}| ?[^\\s\\p{L}\\p{N}]+[\\r\\n]*|\\s*[\\r\\n]+|\\s+(?!\\S)|\\s+", + }); + break; case LLAMA_VOCAB_PRE_TYPE_DEEPSEEK_LLM: word_collection = unicode_regex_split(text, { "[\r\n]", @@ -12223,14 +12878,13 @@ struct llm_tokenizer_bpe { "\\s?\\p{L}+", "\\s?\\p{P}+", "[一-龥ࠀ-一가-퟿]+", - "\\p{N}+", + "\\p{N}", }); break; case LLAMA_VOCAB_PRE_TYPE_FALCON: word_collection = unicode_regex_split(text, { "[\\p{P}\\$\\+<=>\\^~\\|]+", "'s|'t|'re|'ve|'m|'ll|'d| ?\\p{L}+| ?\\p{N}+| ?[^\\s\\p{L}\\p{N}]+|\\s+(?!\\S)", - "\\p{N}+", "[0-9][0-9][0-9]", }); break; @@ -12246,11 +12900,27 @@ struct llm_tokenizer_bpe { }); break; case LLAMA_VOCAB_PRE_TYPE_STARCODER: + case LLAMA_VOCAB_PRE_TYPE_REFACT: + case LLAMA_VOCAB_PRE_TYPE_COMMAND_R: + word_collection = unicode_regex_split(text, { + "\\p{N}", + "'s|'t|'re|'ve|'m|'ll|'d| ?\\p{L}+| ?\\p{N}+| ?[^\\s\\p{L}\\p{N}]+|\\s+(?!\\S)", + }); + break; case LLAMA_VOCAB_PRE_TYPE_GPT2: + case LLAMA_VOCAB_PRE_TYPE_OLMO: word_collection = unicode_regex_split(text, { 
"'s|'t|'re|'ve|'m|'ll|'d| ?\\p{L}+| ?\\p{N}+| ?[^\\s\\p{L}\\p{N}]+|\\s+(?!\\S)", }); break; + case LLAMA_VOCAB_PRE_TYPE_STABLELM2: + case LLAMA_VOCAB_PRE_TYPE_QWEN2: + word_collection = unicode_regex_split(text, { + // original regex from tokenizer.json + // "(?i:'s|'t|'re|'ve|'m|'ll|'d)|[^\\r\\n\\p{L}\\p{N}]?\\p{L}+|\\p{N}| ?[^\\s\\p{L}\\p{N}]+[\\r\\n]*|\\s*[\\r\\n]+|\\s+(?!\\S)|\\s+" + "(?:'[sS]|'[tT]|'[rR][eE]|'[vV][eE]|'[mM]|'[lL][lL]|'[dD])|[^\\r\\n\\p{L}\\p{N}]?\\p{L}+|\\p{N}| ?[^\\s\\p{L}\\p{N}]+[\\r\\n]*|\\s*[\\r\\n]+|\\s+(?!\\S)|\\s+", + }); + break; default: // default regex for BPE tokenization pre-processing word_collection = unicode_regex_split(text, { @@ -12276,6 +12946,11 @@ struct llm_tokenizer_bpe { int index = 0; size_t offset = 0; + if (ignore_merges && vocab.token_to_id.find(word) != vocab.token_to_id.end()) { + symbols.emplace_back(llm_symbol{-1, -1, word.c_str(), word.size()}); + offset = word.size(); + } + while (offset < word.size()) { llm_symbol sym; size_t char_len = std::min(word.size() - offset, (size_t) ::utf8_len(word[offset])); @@ -12404,7 +13079,7 @@ struct llm_tokenizer_wpm { llm_tokenizer_wpm(const llama_vocab & vocab): vocab(vocab) {} void tokenize(const std::string & text, std::vector & output) { - auto * token_map = &vocab.token_to_id; + const auto & token_map = vocab.token_to_id; // normalize and split by whitespace std::vector words = preprocess(text); @@ -12419,108 +13094,89 @@ struct llm_tokenizer_wpm { } // prepend phantom space - std::string word1 = "\xe2\x96\x81" + word; - int n = word1.size(); + const std::string word1 = "\xe2\x96\x81" + word; + const int n = word1.size(); - // we're at the start of a new word - int i = 0; - bool match_any = false; + const size_t current_tokens = output.size(); + // we're at the start of a new word // move through character position in word - while (i < n) { + for (int i = 0; i < n; ++i) { // loop through possible match length bool match = false; for (int j = n; j > i; j--) { - auto it = 
token_map->find(word1.substr(i, j - i)); - if (it != token_map->end()) { + auto it = token_map.find(word1.substr(i, j - i)); + if (it != token_map.end()) { output.push_back(it->second); match = true; - match_any = true; - i = j; + i = j - 1; break; } } - // must be an unknown character - if (!match) { - i++; + if (!match) { // discard all + output.resize(current_tokens); + break; // and discard next tokens } } // we didn't find any matches for this word - if (!match_any) { + if (current_tokens == output.size()) { output.push_back(vocab.special_unk_id); } } } std::vector preprocess(const std::string & text) { - std::vector cpts_nfd = unicode_cpts_normalize_nfd(unicode_cpts_from_utf8(text)); - - // strip accents, strip control, uniformize whitespace, - // to lowercase, pad chinese characters, pad punctuation - std::string new_str = ""; - for (uint32_t code : cpts_nfd) { - int type = unicode_cpt_type(code); - if (type == CODEPOINT_TYPE_ACCENT_MARK || type == CODEPOINT_TYPE_CONTROL) { + const std::vector cpts_nfd = unicode_cpts_normalize_nfd(unicode_cpts_from_utf8(text)); + std::vector words(1, ""); + + for (const char32_t cpt : cpts_nfd) { + const auto flags = unicode_cpt_flags(cpt); + + if (flags.is_whitespace) { + if (words.back().size()) { // finish previous word if any + words.emplace_back(); + } continue; } - code = unicode_tolower(code); - if (type == CODEPOINT_TYPE_WHITESPACE) { - code = ' '; - } - std::string s = unicode_cpt_to_utf8(code); - if (type == CODEPOINT_TYPE_PUNCTUATION || is_ascii_punct(code) || is_chinese_char(code)) { - new_str += " "; - new_str += s; - new_str += " "; - } else { - new_str += s; + + assert (!flags.is_separator); + if (cpt == 0 || cpt == 0xFFFD || flags.is_control) { + continue; } - } - // split by whitespace - uint64_t l = 0; - uint64_t r = 0; - std::vector words; - while (r < new_str.size()) { - // if is whitespace - if (isspace(new_str[r], std::locale::classic())) { - if (r > l) words.push_back(new_str.substr(l, (r - l))); - l = 
r + 1; - r = l; + const std::string s = unicode_cpt_to_utf8(unicode_tolower(cpt)); + if (flags.is_punctuation || ( cpt < 0x7F && flags.is_symbol ) || is_chinese_char(cpt)) { + if (words.back().size()) { // finish previous word if any + words.emplace_back(); + } + words.back() = s; // single char word + words.emplace_back(); // start a new word } else { - r += 1; + words.back() += s; // append char to word } } - if (r > l) { - words.push_back(new_str.substr(l, (r - l))); - } - return words; - } - bool is_ascii_punct(uint32_t code) { - if (code > 0xFF) { - return false; + if (!words.back().size()) { + words.pop_back(); } - auto c = char(static_cast(code)); - return ispunct(c, std::locale::classic()); + + return words; } - bool is_chinese_char(uint32_t cpt) { - if ((cpt >= 0x4E00 && cpt <= 0x9FFF) || - (cpt >= 0x3400 && cpt <= 0x4DBF) || + static bool is_chinese_char(uint32_t cpt) { + return + (cpt >= 0x04E00 && cpt <= 0x09FFF) || + (cpt >= 0x03400 && cpt <= 0x04DBF) || (cpt >= 0x20000 && cpt <= 0x2A6DF) || (cpt >= 0x2A700 && cpt <= 0x2B73F) || (cpt >= 0x2B740 && cpt <= 0x2B81F) || (cpt >= 0x2B920 && cpt <= 0x2CEAF) || // this should be 0x2B820 but in hf rust code it is 0x2B920 - (cpt >= 0xF900 && cpt <= 0xFAFF) || - (cpt >= 0x2F800 && cpt <= 0x2FA1F) || - (cpt >= 0x3000 && cpt <= 0x303F) || - (cpt >= 0xFF00 && cpt <= 0xFFEF)) { - return true; // NOLINT - } - return false; + (cpt >= 0x0F900 && cpt <= 0x0FAFF) || + (cpt >= 0x2F800 && cpt <= 0x2FA1F); + //(cpt >= 0x3000 && cpt <= 0x303F) || + //(cpt >= 0xFF00 && cpt <= 0xFFEF); } const llama_vocab & vocab; @@ -12564,9 +13220,8 @@ struct fragment_buffer_variant { static void tokenizer_st_partition(const llama_vocab & vocab, std::forward_list & buffer) { // for each special token - for (const auto & st: vocab.special_tokens_cache) { - const auto & special_token = st.first; - const auto & special_id = st.second; + for (const llama_vocab::id special_id : vocab.special_tokens_cache) { + const auto & special_token = 
vocab.id_to_token[special_id].text; // for each text fragment std::forward_list::iterator it = buffer.begin(); @@ -12575,7 +13230,7 @@ static void tokenizer_st_partition(const llama_vocab & vocab, std::forward_list< // if a fragment is text ( not yet processed ) if (fragment.type == FRAGMENT_BUFFER_VARIANT_TYPE_RAW_TEXT) { - auto * raw_text = &(fragment.raw_text); + auto & raw_text = fragment.raw_text; auto raw_text_base_offset = fragment.offset; auto raw_text_base_length = fragment.length; @@ -12585,7 +13240,7 @@ static void tokenizer_st_partition(const llama_vocab & vocab, std::forward_list< // find the first occurrence of a given special token in this fragment // passing offset argument only limit the "search area" but match coordinates // are still relative to the source full raw_text - auto match = raw_text->find(special_token, raw_text_base_offset); + auto match = raw_text.find(special_token, raw_text_base_offset); // no occurrences found, stop processing this fragment for a given special token if (match == std::string::npos) break; @@ -12604,7 +13259,7 @@ static void tokenizer_st_partition(const llama_vocab & vocab, std::forward_list< // left const int64_t left_reminder_offset = raw_text_base_offset + 0; const int64_t left_reminder_length = match - raw_text_base_offset; - buffer.emplace_after(it, (*raw_text), left_reminder_offset, left_reminder_length); + buffer.emplace_after(it, raw_text, left_reminder_offset, left_reminder_length); #ifdef PRETOKENIZERDEBUG LLAMA_LOG_WARN("FL: (%ld %ld) '%s'\n", left_reminder_offset, left_reminder_length, raw_text->substr(left_reminder_offset, left_reminder_length).c_str()); @@ -12620,7 +13275,7 @@ static void tokenizer_st_partition(const llama_vocab & vocab, std::forward_list< if (match + special_token.length() < raw_text_base_offset + raw_text_base_length) { const int64_t right_reminder_offset = match + special_token.length(); const int64_t right_reminder_length = raw_text_base_length - ((match - raw_text_base_offset) + 
special_token.length()); - buffer.emplace_after(it, (*raw_text), right_reminder_offset, right_reminder_length); + buffer.emplace_after(it, raw_text, right_reminder_offset, right_reminder_length); #ifdef PRETOKENIZERDEBUG LLAMA_LOG_WARN("FR: (%ld %ld) '%s'\n", right_reminder_offset, right_reminder_length, raw_text->substr(right_reminder_offset, right_reminder_length).c_str()); @@ -12673,9 +13328,14 @@ static std::vector llama_tokenize_internal(const llama_vocab & // tokenizer.encode('', add_special_tokens=True) returns [1] // tokenizer.encode('', add_special_tokens=False) returns [] + static const bool rtrim = true; //TODO: as param + bool is_prev_special = false; + bool special_token_rtrim = false; + if (add_special && vocab.special_add_bos != 0) { LM_GGML_ASSERT(vocab.special_bos_id != -1); output.push_back(vocab.special_bos_id); + is_prev_special = true; } for (const auto & fragment : fragment_buffer) { @@ -12687,9 +13347,21 @@ static std::vector llama_tokenize_internal(const llama_vocab & // and passing 'add space prefix' as bool argument // auto raw_text = fragment.raw_text.substr(fragment.offset, fragment.length); - if (&fragment == &fragment_buffer.front()) { - if (vocab.add_space_prefix) { - raw_text = " " + raw_text; // prefix with space if the first token is not special + + if (special_token_rtrim) { + size_t num_whitespaces = 0; + while (isspace(raw_text[num_whitespaces])) { + num_whitespaces++; + } + if (num_whitespaces == raw_text.size()) { + continue; // skip if all whitespaces + } + raw_text = raw_text.substr(num_whitespaces); + } + + if (vocab.add_space_prefix) { + if (!output.size() || is_prev_special) { // prefix with space if first token + raw_text = " " + raw_text; } } @@ -12701,9 +13373,22 @@ static std::vector llama_tokenize_internal(const llama_vocab & tokenizer.tokenize(raw_text, output); } else { // if (fragment.type == FRAGMENT_BUFFER_VARIANT_TYPE_TOKEN) output.push_back(fragment.token); + is_prev_special = true; + // phi-3 special tokens 
without rtrim, works fine for llama-spm too + special_token_rtrim = rtrim + && fragment.token != vocab.special_bos_id + && fragment.token != vocab.special_unk_id + && fragment.token != vocab.special_eos_id; } } + if (add_special && vocab.special_add_bos != 0 && output.size() >= 2 && output[1] == vocab.special_bos_id) { + LLAMA_LOG_WARN( + "%s: Added a BOS token to the prompt as specified by the model but the prompt " + "also starts with a BOS token. So now the final prompt starts with 2 BOS tokens. " + "Are you sure this is what you want?\n", __FUNCTION__); + } + if (add_special && vocab.special_add_eos == 1) { LM_GGML_ASSERT(vocab.special_eos_id != -1); output.push_back(vocab.special_eos_id); @@ -12730,7 +13415,17 @@ static std::vector llama_tokenize_internal(const llama_vocab & } } - LM_GGML_ASSERT(vocab.special_add_eos != 1); + if (add_special && vocab.special_add_bos != 0 && output.size() >= 2 && output[1] == vocab.special_bos_id) { + LLAMA_LOG_WARN( + "%s: Added a BOS token to the prompt as specified by the model but the prompt " + "also starts with a BOS token. So now the final prompt starts with 2 BOS tokens. " + "Are you sure this is what you want?\n", __FUNCTION__); + } + + if (add_special && vocab.special_add_eos == 1) { + LM_GGML_ASSERT(vocab.special_add_eos != -1); + output.push_back(vocab.special_eos_id); + } } break; case LLAMA_VOCAB_TYPE_WPM: { @@ -13084,6 +13779,58 @@ static std::vector llama_grammar_reject_candidates( return rejects; } +static bool llama_grammar_detect_left_recursion( + const std::vector> & rules, + size_t rule_index, + std::vector * rules_visited, + std::vector * rules_in_progress, + std::vector * rules_may_be_empty) { + if ((*rules_in_progress)[rule_index]) { + return true; + } + + (*rules_in_progress)[rule_index] = true; + + const std::vector & rule = rules[rule_index]; + + // First check if the rule might produce the empty string. This could be done combined with the second + // step but it's more readable as two steps. 
+ bool at_rule_start = true; + for (size_t i = 0; i < rule.size(); i++) { + if (llama_grammar_is_end_of_sequence(&rule[i])) { + if (at_rule_start) { + (*rules_may_be_empty)[rule_index] = true; + break; + } + at_rule_start = true; + } else { + at_rule_start = false; + } + } + + // Second, recurse into leftmost nonterminals (or next-leftmost as long as the previous nonterminal may + // be empty) + bool recurse_into_nonterminal = true; + for (size_t i = 0; i < rule.size(); i++) { + if (rule[i].type == LLAMA_GRETYPE_RULE_REF && recurse_into_nonterminal) { + if (llama_grammar_detect_left_recursion(rules, (size_t)rule[i].value, rules_visited, rules_in_progress, rules_may_be_empty)) { + return true; + } + if (!((*rules_may_be_empty)[(size_t)rule[i].value])) { + recurse_into_nonterminal = false; + } + } else if (llama_grammar_is_end_of_sequence(&rule[i])) { + recurse_into_nonterminal = true; + } else { + recurse_into_nonterminal = false; + } + } + + (*rules_in_progress)[rule_index] = false; + (*rules_visited)[rule_index] = true; + return false; +} + // // grammar - external // @@ -13103,6 +13850,19 @@ struct llama_grammar * llama_grammar_init( vec_rules[i].push_back({LLAMA_GRETYPE_END, 0}); } + // Check for left recursion + std::vector rules_visited(n_rules); + std::vector rules_in_progress(n_rules); + std::vector rules_may_be_empty(n_rules); + for (size_t i = 0; i < n_rules; i++) { + if (rules_visited[i]) { + continue; + } + if (llama_grammar_detect_left_recursion(vec_rules, i, &rules_visited, &rules_in_progress, &rules_may_be_empty)) { + throw std::runtime_error(format("unsupported grammar, left recursion detected for nonterminal at index %zu", i)); + } + } + // loop over alternates of start rule to build initial stacks std::vector> stacks; pos = vec_rules[start_rule_index].data(); @@ -13125,6 +13885,9 @@ struct llama_grammar * llama_grammar_init( } } while (true); + // Important: vec_rules has to be moved here, not copied, because stacks contains + // pointers to 
elements of vec_rules. If vec_rules were copied into llama_grammar + // then the pointers would be invalidated when the local vec_rules goes out of scope. return new llama_grammar{ std::move(vec_rules), std::move(stacks), {} }; } @@ -13719,9 +14482,7 @@ llama_token llama_sample_token_mirostat(struct llama_context * ctx, llama_token_ // Sample the next word X using top-k sampling llama_sample_top_k(nullptr, candidates, int(k), 1); - if (ctx) { - ctx->t_sample_us += lm_ggml_time_us() - t_start_sample_us; - } + ctx->t_sample_us += lm_ggml_time_us() - t_start_sample_us; llama_token X = llama_sample_token(ctx, candidates); t_start_sample_us = lm_ggml_time_us(); @@ -13735,9 +14496,7 @@ llama_token llama_sample_token_mirostat(struct llama_context * ctx, llama_token_ // Update mu using the learning rate and error *mu = *mu - eta * e; - if (ctx) { - ctx->t_sample_us += lm_ggml_time_us() - t_start_sample_us; - } + ctx->t_sample_us += lm_ggml_time_us() - t_start_sample_us; return X; } @@ -14153,13 +14912,16 @@ static void llama_tensor_dequantize_internal( if (qtype.to_float == NULL) { throw std::runtime_error(format("type %s unsupported for integer quantization: no dequantization available", lm_ggml_type_name(tensor->type))); } - } else if (tensor->type != LM_GGML_TYPE_F16) { + } else if (tensor->type != LM_GGML_TYPE_F16 && + tensor->type != LM_GGML_TYPE_BF16) { throw std::runtime_error(format("cannot dequantize/convert tensor type %s", lm_ggml_type_name(tensor->type))); } if (nthread < 2) { if (tensor->type == LM_GGML_TYPE_F16) { lm_ggml_fp16_to_fp32_row((lm_ggml_fp16_t *)tensor->data, f32_output, nelements); + } else if (tensor->type == LM_GGML_TYPE_BF16) { + lm_ggml_bf16_to_fp32_row((lm_ggml_bf16_t *)tensor->data, f32_output, nelements); } else if (lm_ggml_is_quantized(tensor->type)) { qtype.to_float(tensor->data, f32_output, nelements); } else { @@ -14168,7 +14930,14 @@ static void llama_tensor_dequantize_internal( return; } - size_t block_size = tensor->type == 
LM_GGML_TYPE_F16 ? 1 : (size_t)lm_ggml_blck_size(tensor->type); + size_t block_size; + if (tensor->type == LM_GGML_TYPE_F16 || + tensor->type == LM_GGML_TYPE_BF16) { + block_size = 1; + } else { + block_size = (size_t)lm_ggml_blck_size(tensor->type); + } + size_t block_size_bytes = lm_ggml_type_size(tensor->type); LM_GGML_ASSERT(nelements % block_size == 0); @@ -14187,6 +14956,8 @@ static void llama_tensor_dequantize_internal( auto compute = [qtype] (lm_ggml_type typ, uint8_t * inbuf, float * outbuf, int nels) { if (typ == LM_GGML_TYPE_F16) { lm_ggml_fp16_to_fp32_row((lm_ggml_fp16_t *)inbuf, outbuf, nels); + } else if (typ == LM_GGML_TYPE_BF16) { + lm_ggml_bf16_to_fp32_row((lm_ggml_bf16_t *)inbuf, outbuf, nels); } else { qtype.to_float(inbuf, outbuf, nels); } @@ -14310,8 +15081,6 @@ static lm_ggml_type llama_tensor_get_type(quantize_state_internal & qs, lm_ggml_ else if ((ftype == LLAMA_FTYPE_MOSTLY_Q4_K_M || ftype == LLAMA_FTYPE_MOSTLY_Q5_K_M) && use_more_bits(qs.i_attention_wv, qs.n_attention_wv)) new_type = LM_GGML_TYPE_Q6_K; else if (ftype == LLAMA_FTYPE_MOSTLY_Q4_K_S && qs.i_attention_wv < 4) new_type = LM_GGML_TYPE_Q5_K; - else if (QK_K == 64 && (ftype == LLAMA_FTYPE_MOSTLY_Q4_K_S || ftype == LLAMA_FTYPE_MOSTLY_Q3_K_S) && - (qs.i_attention_wv < qs.n_attention_wv/8 || qs.i_attention_wv >= 7*qs.n_attention_wv/8)) new_type = LM_GGML_TYPE_Q6_K; if (qs.model.type == MODEL_70B) { // In the 70B model we have 8 heads sharing the same attn_v weights. As a result, the attn_v.weight tensor is // 8x smaller compared to attn_q.weight. 
Hence, we can get a nice boost in quantization accuracy with @@ -14547,6 +15316,7 @@ static void llama_model_quantize_internal(const std::string & fname_inp, const s case LLAMA_FTYPE_MOSTLY_Q5_1: default_type = LM_GGML_TYPE_Q5_1; break; case LLAMA_FTYPE_MOSTLY_Q8_0: default_type = LM_GGML_TYPE_Q8_0; break; case LLAMA_FTYPE_MOSTLY_F16: default_type = LM_GGML_TYPE_F16; break; + case LLAMA_FTYPE_MOSTLY_BF16: default_type = LM_GGML_TYPE_BF16; break; case LLAMA_FTYPE_ALL_F32: default_type = LM_GGML_TYPE_F32; break; // K-quants @@ -15211,6 +15981,7 @@ struct llama_model_params llama_model_default_params() { /*.split_mode =*/ LLAMA_SPLIT_MODE_LAYER, /*.main_gpu =*/ 0, /*.tensor_split =*/ nullptr, + /*.rpc_servers =*/ nullptr, /*.progress_callback =*/ nullptr, /*.progress_callback_user_data =*/ nullptr, /*.kv_overrides =*/ nullptr, @@ -15281,7 +16052,9 @@ struct llama_model_quantize_params llama_model_quantize_default_params() { } size_t llama_max_devices(void) { -#if defined(LM_GGML_USE_METAL) +#if defined(LM_GGML_USE_RPC) + return LM_GGML_RPC_MAX_SERVERS; +#elif defined(LM_GGML_USE_METAL) return 1; #elif defined(LM_GGML_USE_CUDA) return LM_GGML_CUDA_MAX_DEVICES; @@ -15304,7 +16077,7 @@ bool llama_supports_mlock(void) { bool llama_supports_gpu_offload(void) { #if defined(LM_GGML_USE_CUDA) || defined(LM_GGML_USE_CLBLAST) || defined(LM_GGML_USE_METAL) || defined(LM_GGML_USE_VULKAN) || \ - defined(LM_GGML_USE_SYCL) || defined(LM_GGML_USE_KOMPUTE) + defined(LM_GGML_USE_SYCL) || defined(LM_GGML_USE_KOMPUTE) || defined(LM_GGML_USE_RPC) // Defined when llama.cpp is compiled with support for offloading model layers to GPU. 
return true; #else @@ -15321,10 +16094,6 @@ void llama_backend_init(void) { struct lm_ggml_context * ctx = lm_ggml_init(params); lm_ggml_free(ctx); } - -#ifdef LM_GGML_USE_MPI - lm_ggml_mpi_backend_init(); -#endif } void llama_numa_init(enum lm_ggml_numa_strategy numa) { @@ -15334,9 +16103,6 @@ void llama_numa_init(enum lm_ggml_numa_strategy numa) { } void llama_backend_free(void) { -#ifdef LM_GGML_USE_MPI - lm_ggml_mpi_backend_free(); -#endif lm_ggml_quantize_free(); } @@ -15367,7 +16133,17 @@ struct llama_model * llama_load_model_from_file( return true; }; } - + if (params.rpc_servers != nullptr) { + // split the servers set them into model->rpc_servers + std::string servers(params.rpc_servers); + size_t pos = 0; + while ((pos = servers.find(",")) != std::string::npos) { + std::string server = servers.substr(0, pos); + model->rpc_servers.push_back(server); + servers.erase(0, pos + 1); + } + model->rpc_servers.push_back(servers); + } int status = llama_model_load(path_model, *model, params); LM_GGML_ASSERT(status <= 0); if (status < 0) { @@ -15406,6 +16182,11 @@ struct llama_context * llama_new_context_with_model( return nullptr; } + if (params.flash_attn && model->arch == LLM_ARCH_GROK) { + LLAMA_LOG_WARN("%s: flash_attn is not compatible with Grok - forcing off\n", __func__); + params.flash_attn = false; + } + llama_context * ctx = new llama_context(*model); const auto & hparams = model->hparams; @@ -15429,7 +16210,7 @@ struct llama_context * llama_new_context_with_model( cparams.rope_freq_scale = params.rope_freq_scale == 0.0f ? hparams.rope_freq_scale_train : params.rope_freq_scale; // this is necessary due to kv_self.n being padded later during inference - cparams.n_ctx = LM_GGML_PAD(cparams.n_ctx, 256); + cparams.n_ctx = LM_GGML_PAD(cparams.n_ctx, llama_kv_cache_get_padding(cparams)); // with causal attention, the batch size is limited by the context size cparams.n_batch = hparams.causal_attn ? 
std::min(cparams.n_ctx, params.n_batch) : params.n_batch; @@ -15464,6 +16245,7 @@ struct llama_context * llama_new_context_with_model( cparams.yarn_ext_factor = rope_scaling_type == LLAMA_ROPE_SCALING_TYPE_YARN ? 1.0f : 0.0f; } + cparams.yarn_attn_factor *= hparams.rope_attn_factor; cparams.causal_attn = hparams.causal_attn; if (cparams.pooling_type == LLAMA_POOLING_TYPE_UNSPECIFIED) { @@ -15474,23 +16256,6 @@ struct llama_context * llama_new_context_with_model( } } - if (cparams.flash_attn && hparams.use_alibi) { - LLAMA_LOG_WARN("%s: flash_attn is not yet compatible with ALiBi - forcing off\n", __func__); - cparams.flash_attn = false; - } - - if (cparams.flash_attn && model->arch == LLM_ARCH_GROK) { - LLAMA_LOG_WARN("%s: flash_attn is not compatible with Grok - forcing off\n", __func__); - cparams.flash_attn = false; - } - -#ifdef LM_GGML_USE_HIPBLAS - if (cparams.flash_attn) { - LLAMA_LOG_WARN("%s: flash_attn is not yet compatible with HIPBLAS builds - forcing off\n", __func__); - cparams.flash_attn = false; - } -#endif - if (params.seed == LLAMA_DEFAULT_SEED) { params.seed = time(NULL); } @@ -15526,7 +16291,17 @@ struct llama_context * llama_new_context_with_model( if (!hparams.vocab_only) { // initialize backends -#ifdef LM_GGML_USE_METAL +#if defined(LM_GGML_USE_RPC) + for (auto & server : model->rpc_servers) { + lm_ggml_backend_t backend = lm_ggml_backend_rpc_init(server.c_str()); + if (backend == nullptr) { + LLAMA_LOG_ERROR("%s: failed to connect RPC backend to %s\n", __func__, server.c_str()); + llama_free(ctx); + return nullptr; + } + ctx->backends.push_back(backend); + } +#elif defined(LM_GGML_USE_METAL) if (model->n_gpu_layers > 0) { ctx->backend_metal = lm_ggml_backend_metal_init(); if (ctx->backend_metal == nullptr) { @@ -15682,7 +16457,11 @@ struct llama_context * llama_new_context_with_model( ctx->buf_compute_meta.resize(lm_ggml_tensor_overhead()*LLAMA_MAX_NODES + lm_ggml_graph_overhead_custom(LLAMA_MAX_NODES, false)); // enabling pipeline 
parallelism in the scheduler increases memory usage, so it is only done when necessary - bool pipeline_parallel = llama_get_device_count() > 1 && model->n_gpu_layers > (int)model->hparams.n_layer && model->split_mode == LLAMA_SPLIT_MODE_LAYER; + bool pipeline_parallel = + llama_get_device_count(*model) > 1 && + model->n_gpu_layers > (int)model->hparams.n_layer && + model->split_mode == LLAMA_SPLIT_MODE_LAYER && + params.offload_kqv; #ifndef LM_GGML_USE_CUDA // pipeline parallelism requires support for async compute and events // currently this is only implemented in the CUDA backend @@ -15725,20 +16504,6 @@ struct llama_context * llama_new_context_with_model( } } -#ifdef LM_GGML_USE_MPI - ctx->ctx_mpi = lm_ggml_mpi_init(); - - if (lm_ggml_mpi_rank(ctx->ctx_mpi) > 0) { - // Enter a blocking eval loop with dummy input, letting rank=0 drive the process - // TODO: needs fix after #3228 - LM_GGML_ASSERT(false && "not implemented"); - //const std::vector tmp(ctx->model.hparams.n_ctx, llama_token_bos(ctx)); - //while (!llama_eval(ctx, tmp.data(), tmp.size(), 0, 0)) {}; - llama_backend_free(); - exit(1); - } -#endif - return ctx; } @@ -15775,11 +16540,11 @@ enum llama_rope_type llama_rope_type(const struct llama_model * model) { // these models do not use RoPE case LLM_ARCH_GPT2: case LLM_ARCH_GPTJ: - case LLM_ARCH_GPTNEOX: case LLM_ARCH_MPT: case LLM_ARCH_REFACT: case LLM_ARCH_BLOOM: case LLM_ARCH_MAMBA: + case LLM_ARCH_JINA_BERT_V2: return LLAMA_ROPE_TYPE_NONE; // use what we call a normal RoPE, operating on pairs of consecutive head values @@ -15794,13 +16559,14 @@ enum llama_rope_type llama_rope_type(const struct llama_model * model) { case LLM_ARCH_XVERSE: case LLM_ARCH_COMMAND_R: case LLM_ARCH_OLMO: + case LLM_ARCH_ARCTIC: + case LLM_ARCH_DEEPSEEK2: return LLAMA_ROPE_TYPE_NORM; // the pairs of head values are offset by n_rot/2 case LLM_ARCH_FALCON: case LLM_ARCH_GROK: case LLM_ARCH_DBRX: - case LLM_ARCH_PERSIMMON: case LLM_ARCH_BERT: case LLM_ARCH_NOMIC_BERT: case 
LLM_ARCH_STABLELM: @@ -15811,6 +16577,7 @@ enum llama_rope_type llama_rope_type(const struct llama_model * model) { case LLM_ARCH_PHI3: case LLM_ARCH_GEMMA: case LLM_ARCH_STARCODER2: + case LLM_ARCH_GPTNEOX: return LLAMA_ROPE_TYPE_NEOX; // all model arches should be listed explicitly here @@ -15970,6 +16737,7 @@ static bool llama_control_vector_init(struct llama_control_vector & cvec, const } // make tensors + cvec.tensors.reserve(model.hparams.n_layer); cvec.tensors.push_back(nullptr); // there's never a tensor for layer 0 for (size_t il = 1; il < model.hparams.n_layer; il++) { struct lm_ggml_context * ctx = ctx_map.at(model.buft_layer[il].buft); @@ -15978,6 +16746,8 @@ static bool llama_control_vector_init(struct llama_control_vector & cvec, const } // allocate tensors / buffers and zero + cvec.ctxs.reserve(ctx_map.size()); + cvec.bufs.reserve(ctx_map.size()); for (auto it : ctx_map) { lm_ggml_backend_buffer_type_t buft = it.first; lm_ggml_context * ctx = it.second; @@ -16801,13 +17571,13 @@ static size_t llama_state_seq_get_data_internal(struct llama_context * ctx, llam } else { if (cell_range_begin != kv_self.size) { - cell_ranges.push_back({ cell_range_begin, i }); + cell_ranges.emplace_back(cell_range_begin, i); cell_range_begin = kv_self.size; } } } if (cell_range_begin != kv_self.size) { - cell_ranges.push_back({ cell_range_begin, kv_self.size }); + cell_ranges.emplace_back(cell_range_begin, kv_self.size); } // DEBUG CHECK: Sum of cell counts in ranges should equal the total cell count @@ -17186,6 +17956,14 @@ void llama_set_n_threads(struct llama_context * ctx, uint32_t n_threads, uint32_ ctx->cparams.n_threads_batch = n_threads_batch; } +uint32_t llama_n_threads(struct llama_context * ctx) { + return ctx->cparams.n_threads; +} + +uint32_t llama_n_threads_batch(struct llama_context * ctx) { + return ctx->cparams.n_threads_batch; +} + void llama_set_abort_callback(struct llama_context * ctx, bool (*abort_callback)(void * data), void * abort_callback_data) { 
ctx->abort_callback = abort_callback; ctx->abort_callback_data = abort_callback_data; @@ -17409,6 +18187,10 @@ bool llama_token_is_eog(const struct llama_model * model, llama_token token) { ); } +bool llama_token_is_control(const struct llama_model * model, llama_token token) { + return llama_is_control_token(model->vocab, token); +} + llama_token llama_token_bos(const struct llama_model * model) { return model->vocab.special_bos_id; } @@ -17477,9 +18259,19 @@ int32_t llama_tokenize( static std::string llama_decode_text(const std::string & text) { std::string decoded_text; - auto unicode_sequences = unicode_cpts_from_utf8(text); - for (auto & unicode_sequence : unicode_sequences) { - decoded_text += unicode_utf8_to_byte(unicode_cpt_to_utf8(unicode_sequence)); + + const auto cpts = unicode_cpts_from_utf8(text); + for (const auto cpt : cpts) { + const auto utf8 = unicode_cpt_to_utf8(cpt); + try { + decoded_text += unicode_utf8_to_byte(utf8); + } catch (const std::out_of_range & e) { + decoded_text += "[UNK_BYTE_0x"; + for (const auto c : utf8) { + decoded_text += format("%02x", (uint8_t) c); + } + decoded_text += text + "]"; + } } return decoded_text; @@ -17619,6 +18411,15 @@ static int32_t llama_chat_apply_template_internal( } } // llama2 templates seem to not care about "add_generation_prompt" + } else if (tmpl == "phi3" || (tmpl.find("<|assistant|>") != std::string::npos && tmpl.find("<|end|>") != std::string::npos)) { + // Phi 3 + for (auto message : chat) { + std::string role(message->role); + ss << "<|" << role << "|>\n" << message->content << "<|end|>\n"; + } + if (add_ass) { + ss << "<|assistant|>\n"; + } } else if (tmpl == "zephyr" || tmpl.find("<|user|>") != std::string::npos) { // zephyr template for (auto message : chat) { @@ -17751,15 +18552,6 @@ static int32_t llama_chat_apply_template_internal( if (add_ass) { ss << "<|start_header_id|>assistant<|end_header_id|>\n\n"; } - } else if (tmpl == "phi3" || (tmpl.find("<|assistant|>") != std::string::npos && 
tmpl.find("<|end|>") != std::string::npos )) { - // Phi 3 - for (auto message : chat) { - std::string role(message->role); - ss << "<|" << role << "|>\n" << trim(message->content) << "<|end|>\n"; - } - if (add_ass) { - ss << "<|assistant|>\n"; - } } else { // template not supported return -1; @@ -17843,7 +18635,7 @@ struct llama_timings llama_get_timings(struct llama_context * ctx) { /*.t_eval_ms =*/ 1e-3 * ctx->t_eval_us, /*.n_sample =*/ std::max(1, ctx->n_sample), - /*.n_p_eval =*/ std::max(1, ctx->n_p_eval), + /*.n_p_eval =*/ std::max(0, ctx->n_p_eval), /*.n_eval =*/ std::max(1, ctx->n_eval), }; @@ -17881,8 +18673,10 @@ const char * llama_print_system_info(void) { s += "AVX512 = " + std::to_string(lm_ggml_cpu_has_avx512()) + " | "; s += "AVX512_VBMI = " + std::to_string(lm_ggml_cpu_has_avx512_vbmi()) + " | "; s += "AVX512_VNNI = " + std::to_string(lm_ggml_cpu_has_avx512_vnni()) + " | "; + s += "AVX512_BF16 = " + std::to_string(lm_ggml_cpu_has_avx512_bf16()) + " | "; s += "FMA = " + std::to_string(lm_ggml_cpu_has_fma()) + " | "; s += "NEON = " + std::to_string(lm_ggml_cpu_has_neon()) + " | "; + s += "SVE = " + std::to_string(lm_ggml_cpu_has_sve()) + " | "; s += "ARM_FMA = " + std::to_string(lm_ggml_cpu_has_arm_fma()) + " | "; s += "F16C = " + std::to_string(lm_ggml_cpu_has_f16c()) + " | "; s += "FP16_VA = " + std::to_string(lm_ggml_cpu_has_fp16_va()) + " | "; @@ -17941,6 +18735,8 @@ void llama_log_set(lm_ggml_log_callback log_callback, void * user_data) { g_state.log_callback_user_data = user_data; #ifdef LM_GGML_USE_METAL lm_ggml_backend_metal_log_set_callback(g_state.log_callback, g_state.log_callback_user_data); +#elif defined(LM_GGML_USE_CUDA) + lm_ggml_backend_cuda_log_set_callback(g_state.log_callback, g_state.log_callback_user_data); #endif } diff --git a/cpp/llama.h b/cpp/llama.h index faab40ca..565cce03 100644 --- a/cpp/llama.h +++ b/cpp/llama.h @@ -79,6 +79,13 @@ extern "C" { LLAMA_VOCAB_PRE_TYPE_MPT = 5, LLAMA_VOCAB_PRE_TYPE_STARCODER = 6, 
LLAMA_VOCAB_PRE_TYPE_GPT2 = 7, + LLAMA_VOCAB_PRE_TYPE_REFACT = 8, + LLAMA_VOCAB_PRE_TYPE_COMMAND_R = 9, + LLAMA_VOCAB_PRE_TYPE_STABLELM2 = 10, + LLAMA_VOCAB_PRE_TYPE_QWEN2 = 11, + LLAMA_VOCAB_PRE_TYPE_OLMO = 12, + LLAMA_VOCAB_PRE_TYPE_DBRX = 13, + LLAMA_VOCAB_PRE_TYPE_SMAUG = 14, }; // note: these values should be synchronized with lm_ggml_rope @@ -134,6 +141,7 @@ extern "C" { LLAMA_FTYPE_MOSTLY_IQ2_M = 29, // except 1d tensors LLAMA_FTYPE_MOSTLY_IQ4_XS = 30, // except 1d tensors LLAMA_FTYPE_MOSTLY_IQ1_M = 31, // except 1d tensors + LLAMA_FTYPE_MOSTLY_BF16 = 32, // except 1d tensors LLAMA_FTYPE_GUESSED = 1024, // not specified in the model file }; @@ -236,6 +244,9 @@ extern "C" { // proportion of the model (layers or rows) to offload to each GPU, size: llama_max_devices() const float * tensor_split; + // comma separated list of RPC servers to use for offloading + const char * rpc_servers; + // Called with a progress value between 0.0 and 1.0. Pass NULL to disable. // If the provided progress_callback returns true, model loading continues. // If it returns false, model loading is immediately aborted. @@ -254,6 +265,8 @@ extern "C" { bool check_tensors; // validate model tensor data }; + // NOTE: changing the default values of parameters marked as [EXPERIMENTAL] may cause crashes or incorrect results in certain configurations + // https://github.com/ggerganov/llama.cpp/pull/7544 struct llama_context_params { uint32_t seed; // RNG seed, -1 for random uint32_t n_ctx; // text context, 0 = from model @@ -280,14 +293,14 @@ extern "C" { lm_ggml_backend_sched_eval_callback cb_eval; void * cb_eval_user_data; - enum lm_ggml_type type_k; // data type for K cache - enum lm_ggml_type type_v; // data type for V cache + enum lm_ggml_type type_k; // data type for K cache [EXPERIMENTAL] + enum lm_ggml_type type_v; // data type for V cache [EXPERIMENTAL] // Keep the booleans together to avoid misalignment during copy-by-value. 
bool logits_all; // the llama_decode() call computes all logits, not just the last one (DEPRECATED - set llama_batch.logits instead) bool embeddings; // if true, extract embeddings (together with logits) bool offload_kqv; // whether to offload the KQV ops (including the KV cache) to GPU - bool flash_attn; // whether to use flash attention + bool flash_attn; // whether to use flash attention [EXPERIMENTAL] // Abort callback // if it returns true, execution of llama_decode() will be aborted @@ -749,6 +762,12 @@ extern "C" { // n_threads_batch is the number of threads used for prompt and batch processing (multiple tokens) LLAMA_API void llama_set_n_threads(struct llama_context * ctx, uint32_t n_threads, uint32_t n_threads_batch); + // Get the number of threads used for generation of a single token. + LLAMA_API uint32_t llama_n_threads(struct llama_context * ctx); + + // Get the number of threads used for prompt and batch processing (multiple token). + LLAMA_API uint32_t llama_n_threads_batch(struct llama_context * ctx); + // Set whether to use causal attention or not // If set to true, the model will only attend to the past tokens LLAMA_API void llama_set_causal_attn(struct llama_context * ctx, bool causal_attn); @@ -807,6 +826,9 @@ extern "C" { // Check if the token is supposed to end generation (end-of-generation, eg. EOS, EOT, etc.) 
LLAMA_API bool llama_token_is_eog(const struct llama_model * model, llama_token token); + // Identify if Token Id is a control token or a render-able token + LLAMA_API bool llama_token_is_control(const struct llama_model * model, llama_token token); + // Special tokens LLAMA_API llama_token llama_token_bos(const struct llama_model * model); // beginning-of-sentence LLAMA_API llama_token llama_token_eos(const struct llama_model * model); // end-of-sentence diff --git a/cpp/log.h b/cpp/log.h index d5d4517c..2cd0b543 100644 --- a/cpp/log.h +++ b/cpp/log.h @@ -211,7 +211,7 @@ inline std::string log_filename_generator_impl(LogTriState multilog, const std:: #define LOG_FLF_VAL , __FILE__, __LINE__, __FUNCTION__ #else #define LOG_FLF_FMT "[%24s:%5ld][%24s] " - #define LOG_FLF_VAL , __FILE__, __LINE__, __FUNCTION__ + #define LOG_FLF_VAL , __FILE__, (long)__LINE__, __FUNCTION__ #endif #else #define LOG_FLF_FMT "%s" @@ -224,7 +224,7 @@ inline std::string log_filename_generator_impl(LogTriState multilog, const std:: #define LOG_TEE_FLF_VAL , __FILE__, __LINE__, __FUNCTION__ #else #define LOG_TEE_FLF_FMT "[%24s:%5ld][%24s] " - #define LOG_TEE_FLF_VAL , __FILE__, __LINE__, __FUNCTION__ + #define LOG_TEE_FLF_VAL , __FILE__, (long)__LINE__, __FUNCTION__ #endif #else #define LOG_TEE_FLF_FMT "%s" @@ -294,7 +294,7 @@ inline std::string log_filename_generator_impl(LogTriState multilog, const std:: // Main LOG macro. // behaves like printf, and supports arguments the exact same way. // -#ifndef _MSC_VER +#if !defined(_MSC_VER) || defined(__clang__) #define LOG(...) LOG_IMPL(__VA_ARGS__, "") #else #define LOG(str, ...) LOG_IMPL("%s" str, "", ##__VA_ARGS__, "") @@ -308,14 +308,14 @@ inline std::string log_filename_generator_impl(LogTriState multilog, const std:: // Secondary target can be changed just like LOG_TARGET // by defining LOG_TEE_TARGET // -#ifndef _MSC_VER +#if !defined(_MSC_VER) || defined(__clang__) #define LOG_TEE(...) 
LOG_TEE_IMPL(__VA_ARGS__, "") #else #define LOG_TEE(str, ...) LOG_TEE_IMPL("%s" str, "", ##__VA_ARGS__, "") #endif // LOG macro variants with auto endline. -#ifndef _MSC_VER +#if !defined(_MSC_VER) || defined(__clang__) #define LOGLN(...) LOG_IMPL(__VA_ARGS__, "\n") #define LOG_TEELN(...) LOG_TEE_IMPL(__VA_ARGS__, "\n") #else diff --git a/cpp/sampling.cpp b/cpp/sampling.cpp index a9b6b323..d591e52f 100644 --- a/cpp/sampling.cpp +++ b/cpp/sampling.cpp @@ -35,6 +35,8 @@ struct llama_sampling_context * llama_sampling_init(const struct llama_sampling_ result->prev.resize(params.n_prev); + result->n_valid = 0; + llama_sampling_set_rng_seed(result, params.seed); return result; @@ -64,6 +66,7 @@ void llama_sampling_reset(llama_sampling_context * ctx) { std::fill(ctx->prev.begin(), ctx->prev.end(), 0); ctx->cur.clear(); + ctx->n_valid = 0; } void llama_sampling_set_rng_seed(struct llama_sampling_context * ctx, uint32_t seed) { @@ -122,7 +125,7 @@ std::string llama_sampling_order_print(const llama_sampling_params & params) { std::string result = "CFG -> Penalties "; if (params.mirostat == 0) { for (auto sampler_type : params.samplers_sequence) { - const auto sampler_type_name = sampler_type_to_name_string(sampler_type); + const auto sampler_type_name = llama_sampling_type_to_str(sampler_type); if (!sampler_type_name.empty()) { result += "-> " + sampler_type_name + " "; } @@ -134,6 +137,87 @@ std::string llama_sampling_order_print(const llama_sampling_params & params) { return result; } +std::string llama_sampling_type_to_str(llama_sampler_type sampler_type) { + switch (sampler_type) { + case llama_sampler_type::TOP_K: return "top_k"; + case llama_sampler_type::TFS_Z: return "tfs_z"; + case llama_sampler_type::TYPICAL_P: return "typical_p"; + case llama_sampler_type::TOP_P: return "top_p"; + case llama_sampler_type::MIN_P: return "min_p"; + case llama_sampler_type::TEMPERATURE: return "temperature"; + default : return ""; + } +} + +std::vector 
llama_sampling_types_from_names(const std::vector & names, bool allow_alt_names) { + std::unordered_map sampler_canonical_name_map { + {"top_k", llama_sampler_type::TOP_K}, + {"top_p", llama_sampler_type::TOP_P}, + {"typical_p", llama_sampler_type::TYPICAL_P}, + {"min_p", llama_sampler_type::MIN_P}, + {"tfs_z", llama_sampler_type::TFS_Z}, + {"temperature", llama_sampler_type::TEMPERATURE} + }; + + // since samplers names are written multiple ways + // make it ready for both system names and input names + std::unordered_map sampler_alt_name_map { + {"top-k", llama_sampler_type::TOP_K}, + {"top-p", llama_sampler_type::TOP_P}, + {"nucleus", llama_sampler_type::TOP_P}, + {"typical-p", llama_sampler_type::TYPICAL_P}, + {"typical", llama_sampler_type::TYPICAL_P}, + {"min-p", llama_sampler_type::MIN_P}, + {"tfs-z", llama_sampler_type::TFS_Z}, + {"tfs", llama_sampler_type::TFS_Z}, + {"temp", llama_sampler_type::TEMPERATURE} + }; + + std::vector sampler_types; + sampler_types.reserve(names.size()); + for (const auto & name : names) + { + auto sampler_item = sampler_canonical_name_map.find(name); + if (sampler_item != sampler_canonical_name_map.end()) + { + sampler_types.push_back(sampler_item->second); + } + else + { + if (allow_alt_names) + { + sampler_item = sampler_alt_name_map.find(name); + if (sampler_item != sampler_alt_name_map.end()) + { + sampler_types.push_back(sampler_item->second); + } + } + } + } + return sampler_types; +} + +std::vector llama_sampling_types_from_chars(const std::string & names_string) { + std::unordered_map sampler_name_map { + {'k', llama_sampler_type::TOP_K}, + {'p', llama_sampler_type::TOP_P}, + {'y', llama_sampler_type::TYPICAL_P}, + {'m', llama_sampler_type::MIN_P}, + {'f', llama_sampler_type::TFS_Z}, + {'t', llama_sampler_type::TEMPERATURE} + }; + + std::vector sampler_types; + sampler_types.reserve(names_string.size()); + for (const auto & c : names_string) { + const auto sampler_item = sampler_name_map.find(c); + if (sampler_item != 
sampler_name_map.end()) { + sampler_types.push_back(sampler_item->second); + } + } + return sampler_types; +} + // no reasons to expose this function in header static void sampler_queue( struct llama_context * ctx_main, @@ -176,7 +260,7 @@ static llama_token llama_sampling_sample_impl( struct llama_context * ctx_main, struct llama_context * ctx_cfg, const int idx, - bool is_resampling) { // Add a parameter to indicate if we are resampling + bool is_resampling) { const llama_sampling_params & params = ctx_sampling->params; const float temp = params.temp; @@ -185,8 +269,8 @@ static llama_token llama_sampling_sample_impl( const float mirostat_eta = params.mirostat_eta; std::vector original_logits; - auto cur_p = llama_sampling_prepare(ctx_sampling, ctx_main, ctx_cfg, idx, !is_resampling, &original_logits); - if (!is_resampling) { + auto cur_p = llama_sampling_prepare(ctx_sampling, ctx_main, ctx_cfg, idx, /* apply_grammar= */ is_resampling, &original_logits); + if (ctx_sampling->grammar != NULL && !is_resampling) { LM_GGML_ASSERT(!original_logits.empty()); } llama_token id = 0; @@ -249,10 +333,12 @@ static llama_token llama_sampling_sample_impl( // Restore logits from the copy std::copy(original_logits.begin(), original_logits.end(), logits); - return llama_sampling_sample_impl(ctx_sampling, ctx_main, ctx_cfg, idx, true); // Pass true for is_resampling + return llama_sampling_sample_impl(ctx_sampling, ctx_main, ctx_cfg, idx, /* is_resampling= */ true); } } + ctx_sampling->n_valid = temp == 0.0f ? 0 : cur_p.size; + return id; } @@ -280,7 +366,8 @@ static llama_token_data_array llama_sampling_prepare_impl( // Get a pointer to the logits float * logits = llama_get_logits_ith(ctx_main, idx); - if (apply_grammar && original_logits != NULL) { + if (ctx_sampling->grammar != NULL && !apply_grammar) { + LM_GGML_ASSERT(original_logits != NULL); // Only make a copy of the original logits if we are not applying grammar checks, not sure if I actually have to do this. 
*original_logits = {logits, logits + llama_n_vocab(llama_get_model(ctx_main))}; } @@ -337,7 +424,7 @@ llama_token llama_sampling_sample( struct llama_context * ctx_cfg, const int idx) { // Call the implementation function with is_resampling set to false by default - return llama_sampling_sample_impl(ctx_sampling, ctx_main, ctx_cfg, idx, false); + return llama_sampling_sample_impl(ctx_sampling, ctx_main, ctx_cfg, idx, /* is_resampling= */ false); } llama_token_data_array llama_sampling_prepare( diff --git a/cpp/sampling.h b/cpp/sampling.h index cf7081e3..eeaa53b8 100644 --- a/cpp/sampling.h +++ b/cpp/sampling.h @@ -81,6 +81,7 @@ struct llama_sampling_context { // TODO: replace with ring-buffer std::vector prev; std::vector cur; + size_t n_valid; // Number of correct top tokens with correct probabilities. std::mt19937 rng; }; @@ -115,6 +116,11 @@ std::string llama_sampling_print(const llama_sampling_params & params); // Print sampling order into a string std::string llama_sampling_order_print(const llama_sampling_params & params); +std::string llama_sampling_type_to_str(llama_sampler_type sampler_type); + +std::vector llama_sampling_types_from_names(const std::vector & names, bool allow_alt_names); +std::vector llama_sampling_types_from_chars(const std::string & names_string); + // this is a common sampling function used across the examples for convenience // it can serve as a starting point for implementing your own sampling function // Note: When using multiple sequences, it is the caller's responsibility to call diff --git a/cpp/sgemm.cpp b/cpp/sgemm.cpp index 1378e941..16945992 100644 --- a/cpp/sgemm.cpp +++ b/cpp/sgemm.cpp @@ -1,6 +1,3 @@ -// -*- mode:c++;indent-tabs-mode:nil;c-basic-offset:4;coding:utf-8 -*- -// vi: set et ft=c++ ts=4 sts=4 sw=4 fenc=utf-8 :vi -// // Copyright 2024 Mozilla Foundation // // Permission is hereby granted, free of charge, to any person obtaining @@ -585,15 +582,15 @@ class tinyBLAS_Q0_ARM { }; #endif // __ARM_FEATURE_DOTPROD -#if 
defined(__AVX2__) || defined(__AVX512F__) +#if defined(__AVX2__) || defined(__AVX512F__) || defined(__AVX__) template -class tinyBLAS_Q0_AVX2 { +class tinyBLAS_Q0_AVX { public: - tinyBLAS_Q0_AVX2(int64_t k, - const TA *A, int64_t lda, - const TB *B, int64_t ldb, - TC *C, int64_t ldc, - int ith, int nth) + tinyBLAS_Q0_AVX(int64_t k, + const TA *A, int64_t lda, + const TB *B, int64_t ldb, + TC *C, int64_t ldc, + int ith, int nth) : A(A), B(B), C(C), k(k), lda(lda), ldb(ldb), ldc(ldc), ith(ith), nth(nth) { } @@ -728,14 +725,34 @@ class tinyBLAS_Q0_AVX2 { __m256 Cv[RN][RM] = {}; for (int64_t l = 0; l < k; ++l) for (int64_t j = 0; j < RN; ++j) - for (int64_t i = 0; i < RM; ++i) + for (int64_t i = 0; i < RM; ++i) { +#if defined(__AVX2__) + __m256 udTmp = updot(_mm256_sign_epi8(load(A + lda * (ii + i) + l), + load(A + lda * (ii + i) + l)), + _mm256_sign_epi8(load(B + ldb * (jj + j) + l), + load(A + lda * (ii + i) + l))); +#else + __m128i ali0 = load0(A + lda * (ii + i) + l); + __m128i ali1 = load1(A + lda * (ii + i) + l); + __m128i blj0 = load0(B + ldb * (jj + j) + l); + __m128i blj1 = load1(B + ldb * (jj + j) + l); + + __m128i sepAA0 = _mm_sign_epi8(ali0, ali0); + __m128i sepAA1 = _mm_sign_epi8(ali1, ali1); + __m128i sepBA0 = _mm_sign_epi8(blj0, ali0); + __m128i sepBA1 = _mm_sign_epi8(blj1, ali1); + + // updot + const __m128i oneFill = _mm_set1_epi16(1); + __m128i mad0 = _mm_maddubs_epi16(sepAA0, sepBA0); + __m128i mad1 = _mm_maddubs_epi16(sepAA1, sepBA1); + __m256 udTmp = _mm256_cvtepi32_ps(MM256_SET_M128I(_mm_madd_epi16(oneFill, mad1), _mm_madd_epi16(oneFill, mad0))); +#endif Cv[j][i] = madd(_mm256_set1_ps(unhalf(A[lda * (ii + i) + l].d) * unhalf(B[ldb * (jj + j) + l].d)), - updot(_mm256_sign_epi8(load(A + lda * (ii + i) + l), - load(A + lda * (ii + i) + l)), - _mm256_sign_epi8(load(B + ldb * (jj + j) + l), - load(A + lda * (ii + i) + l))), - Cv[j][i]); + udTmp, + Cv[j][i]); + } for (int64_t j = 0; j < RN; ++j) for (int64_t i = 0; i < RM; ++i) C[ldc * (jj + j) + (ii + 
i)] = hsum(Cv[j][i]); @@ -746,10 +763,28 @@ class tinyBLAS_Q0_AVX2 { return _mm256_loadu_si256((const __m256i *)b->qs); } + inline __m128i load0(const block_q8_0 *b) { + return _mm_loadu_si128((const __m128i *)b->qs); + } + + inline __m128i load1(const block_q8_0 *b) { + return _mm_loadu_si128(((const __m128i *)b->qs) + 1); + } + inline __m256i load(const block_q4_0 *b) { return _mm256_sub_epi8(denibble(b->qs), _mm256_set1_epi8(8)); } + inline __m128i load0(const block_q4_0 *b) { + const __m128i x = _mm_loadu_si128((const __m128i *)(b->qs)); + return _mm_sub_epi8(_mm_and_si128(_mm_set1_epi8(15), x), _mm_set1_epi8(8)); + } + + inline __m128i load1(const block_q4_0 *b) { + const __m128i x = _mm_loadu_si128((const __m128i *)(b->qs)); + return _mm_sub_epi8(_mm_and_si128(_mm_set1_epi8(15), _mm_srli_epi16(x, 4)), _mm_set1_epi8(8)); + } + inline __m256 updot(__m256i u, __m256i s) { __m256i res; #if defined(__AVXVNNI__) || (defined(__AVX512VNNI__) && defined(__AVX512VL__)) @@ -777,7 +812,7 @@ class tinyBLAS_Q0_AVX2 { const int ith; const int nth; }; -#endif // __AVX2__ +#endif // __AVX__ } // namespace @@ -928,8 +963,8 @@ bool llamafile_sgemm(int64_t m, int64_t n, int64_t k, const void *A, int64_t lda case LM_GGML_TYPE_Q8_0: { if (Btype != LM_GGML_TYPE_Q8_0) return false; -#if defined(__AVX2__) || defined(__AVX512F__) - tinyBLAS_Q0_AVX2 tb{ +#if defined(__AVX2__) || defined(__AVX512F__) || defined(__AVX__) + tinyBLAS_Q0_AVX tb{ k, (const block_q8_0 *)A, lda, (const block_q8_0 *)B, ldb, (float *)C, ldc, @@ -952,8 +987,8 @@ bool llamafile_sgemm(int64_t m, int64_t n, int64_t k, const void *A, int64_t lda case LM_GGML_TYPE_Q4_0: { if (Btype != LM_GGML_TYPE_Q8_0) return false; -#if defined(__AVX2__) || defined(__AVX512F__) - tinyBLAS_Q0_AVX2 tb{ +#if defined(__AVX2__) || defined(__AVX512F__) || defined(__AVX__) + tinyBLAS_Q0_AVX tb{ k, (const block_q4_0 *)A, lda, (const block_q8_0 *)B, ldb, (float *)C, ldc, diff --git a/cpp/unicode-data.cpp b/cpp/unicode-data.cpp index 
e6bafb3a..d7c1c898 100644 --- a/cpp/unicode-data.cpp +++ b/cpp/unicode-data.cpp @@ -1,1651 +1,6983 @@ -#include "unicode-data.h" +// generated with scripts/gen-unicode-data.py + +#include "unicode-data.h" #include -#include -#include #include +#include +#include -const std::vector> unicode_ranges_digit = { -{0x00000030, 0x00000039}, {0x000000B2, 0x000000B3}, {0x000000B9, 0x000000B9}, {0x00000660, 0x00000669}, -{0x000006F0, 0x000006F9}, {0x000007C0, 0x000007C9}, {0x00000966, 0x0000096F}, {0x000009E6, 0x000009EF}, -{0x00000A66, 0x00000A6F}, {0x00000AE6, 0x00000AEF}, {0x00000B66, 0x00000B6F}, {0x00000BE6, 0x00000BEF}, -{0x00000C66, 0x00000C6F}, {0x00000CE6, 0x00000CEF}, {0x00000D66, 0x00000D6F}, {0x00000DE6, 0x00000DEF}, -{0x00000E50, 0x00000E59}, {0x00000ED0, 0x00000ED9}, {0x00000F20, 0x00000F29}, {0x00001040, 0x00001049}, -{0x00001090, 0x00001099}, {0x00001369, 0x00001371}, {0x000017E0, 0x000017E9}, {0x00001810, 0x00001819}, -{0x00001946, 0x0000194F}, {0x000019D0, 0x000019DA}, {0x00001A80, 0x00001A89}, {0x00001A90, 0x00001A99}, -{0x00001B50, 0x00001B59}, {0x00001BB0, 0x00001BB9}, {0x00001C40, 0x00001C49}, {0x00001C50, 0x00001C59}, -{0x00002070, 0x00002070}, {0x00002074, 0x00002079}, {0x00002080, 0x00002089}, {0x00002460, 0x00002468}, -{0x00002474, 0x0000247C}, {0x00002488, 0x00002490}, {0x000024EA, 0x000024EA}, {0x000024F5, 0x000024FD}, -{0x000024FF, 0x000024FF}, {0x00002776, 0x0000277E}, {0x00002780, 0x00002788}, {0x0000278A, 0x00002792}, -{0x0000A620, 0x0000A629}, {0x0000A8D0, 0x0000A8D9}, {0x0000A900, 0x0000A909}, {0x0000A9D0, 0x0000A9D9}, -{0x0000A9F0, 0x0000A9F9}, {0x0000AA50, 0x0000AA59}, {0x0000ABF0, 0x0000ABF9}, {0x0000FF10, 0x0000FF19}, -{0x000104A0, 0x000104A9}, {0x00010A40, 0x00010A43}, {0x00010D30, 0x00010D39}, {0x00010E60, 0x00010E68}, -{0x00011052, 0x0001105A}, {0x00011066, 0x0001106F}, {0x000110F0, 0x000110F9}, {0x00011136, 0x0001113F}, -{0x000111D0, 0x000111D9}, {0x000112F0, 0x000112F9}, {0x00011450, 0x00011459}, {0x000114D0, 0x000114D9}, 
-{0x00011650, 0x00011659}, {0x000116C0, 0x000116C9}, {0x00011730, 0x00011739}, {0x000118E0, 0x000118E9}, -{0x00011950, 0x00011959}, {0x00011C50, 0x00011C59}, {0x00011D50, 0x00011D59}, {0x00011DA0, 0x00011DA9}, -{0x00016A60, 0x00016A69}, {0x00016B50, 0x00016B59}, {0x0001D7CE, 0x0001D7FF}, {0x0001E140, 0x0001E149}, -{0x0001E2F0, 0x0001E2F9}, {0x0001E950, 0x0001E959}, {0x0001F100, 0x0001F10A}, {0x0001FBF0, 0x0001FBF9}, -}; - -const std::vector> unicode_ranges_letter = { -{0x00000041, 0x0000005A}, {0x00000061, 0x0000007A}, {0x000000AA, 0x000000AA}, {0x000000B5, 0x000000B5}, -{0x000000BA, 0x000000BA}, {0x000000C0, 0x000000D6}, {0x000000D8, 0x000000F6}, {0x000000F8, 0x000002C1}, -{0x000002C6, 0x000002D1}, {0x000002E0, 0x000002E4}, {0x000002EC, 0x000002EC}, {0x000002EE, 0x000002EE}, -{0x00000370, 0x00000374}, {0x00000376, 0x00000377}, {0x0000037A, 0x0000037D}, {0x0000037F, 0x0000037F}, -{0x00000386, 0x00000386}, {0x00000388, 0x0000038A}, {0x0000038C, 0x0000038C}, {0x0000038E, 0x000003A1}, -{0x000003A3, 0x000003F5}, {0x000003F7, 0x00000481}, {0x0000048A, 0x0000052F}, {0x00000531, 0x00000556}, -{0x00000559, 0x00000559}, {0x00000560, 0x00000588}, {0x000005D0, 0x000005EA}, {0x000005EF, 0x000005F2}, -{0x00000620, 0x0000064A}, {0x0000066E, 0x0000066F}, {0x00000671, 0x000006D3}, {0x000006D5, 0x000006D5}, -{0x000006E5, 0x000006E6}, {0x000006EE, 0x000006EF}, {0x000006FA, 0x000006FC}, {0x000006FF, 0x000006FF}, -{0x00000710, 0x00000710}, {0x00000712, 0x0000072F}, {0x0000074D, 0x000007A5}, {0x000007B1, 0x000007B1}, -{0x000007CA, 0x000007EA}, {0x000007F4, 0x000007F5}, {0x000007FA, 0x000007FA}, {0x00000800, 0x00000815}, -{0x0000081A, 0x0000081A}, {0x00000824, 0x00000824}, {0x00000828, 0x00000828}, {0x00000840, 0x00000858}, -{0x00000860, 0x0000086A}, {0x000008A0, 0x000008B4}, {0x000008B6, 0x000008C7}, {0x00000904, 0x00000939}, -{0x0000093D, 0x0000093D}, {0x00000950, 0x00000950}, {0x00000958, 0x00000961}, {0x00000971, 0x00000980}, -{0x00000985, 0x0000098C}, {0x0000098F, 0x00000990}, 
{0x00000993, 0x000009A8}, {0x000009AA, 0x000009B0}, -{0x000009B2, 0x000009B2}, {0x000009B6, 0x000009B9}, {0x000009BD, 0x000009BD}, {0x000009CE, 0x000009CE}, -{0x000009DC, 0x000009DD}, {0x000009DF, 0x000009E1}, {0x000009F0, 0x000009F1}, {0x000009FC, 0x000009FC}, -{0x00000A05, 0x00000A0A}, {0x00000A0F, 0x00000A10}, {0x00000A13, 0x00000A28}, {0x00000A2A, 0x00000A30}, -{0x00000A32, 0x00000A33}, {0x00000A35, 0x00000A36}, {0x00000A38, 0x00000A39}, {0x00000A59, 0x00000A5C}, -{0x00000A5E, 0x00000A5E}, {0x00000A72, 0x00000A74}, {0x00000A85, 0x00000A8D}, {0x00000A8F, 0x00000A91}, -{0x00000A93, 0x00000AA8}, {0x00000AAA, 0x00000AB0}, {0x00000AB2, 0x00000AB3}, {0x00000AB5, 0x00000AB9}, -{0x00000ABD, 0x00000ABD}, {0x00000AD0, 0x00000AD0}, {0x00000AE0, 0x00000AE1}, {0x00000AF9, 0x00000AF9}, -{0x00000B05, 0x00000B0C}, {0x00000B0F, 0x00000B10}, {0x00000B13, 0x00000B28}, {0x00000B2A, 0x00000B30}, -{0x00000B32, 0x00000B33}, {0x00000B35, 0x00000B39}, {0x00000B3D, 0x00000B3D}, {0x00000B5C, 0x00000B5D}, -{0x00000B5F, 0x00000B61}, {0x00000B71, 0x00000B71}, {0x00000B83, 0x00000B83}, {0x00000B85, 0x00000B8A}, -{0x00000B8E, 0x00000B90}, {0x00000B92, 0x00000B95}, {0x00000B99, 0x00000B9A}, {0x00000B9C, 0x00000B9C}, -{0x00000B9E, 0x00000B9F}, {0x00000BA3, 0x00000BA4}, {0x00000BA8, 0x00000BAA}, {0x00000BAE, 0x00000BB9}, -{0x00000BD0, 0x00000BD0}, {0x00000C05, 0x00000C0C}, {0x00000C0E, 0x00000C10}, {0x00000C12, 0x00000C28}, -{0x00000C2A, 0x00000C39}, {0x00000C3D, 0x00000C3D}, {0x00000C58, 0x00000C5A}, {0x00000C60, 0x00000C61}, -{0x00000C80, 0x00000C80}, {0x00000C85, 0x00000C8C}, {0x00000C8E, 0x00000C90}, {0x00000C92, 0x00000CA8}, -{0x00000CAA, 0x00000CB3}, {0x00000CB5, 0x00000CB9}, {0x00000CBD, 0x00000CBD}, {0x00000CDE, 0x00000CDE}, -{0x00000CE0, 0x00000CE1}, {0x00000CF1, 0x00000CF2}, {0x00000D04, 0x00000D0C}, {0x00000D0E, 0x00000D10}, -{0x00000D12, 0x00000D3A}, {0x00000D3D, 0x00000D3D}, {0x00000D4E, 0x00000D4E}, {0x00000D54, 0x00000D56}, -{0x00000D5F, 0x00000D61}, {0x00000D7A, 0x00000D7F}, 
{0x00000D85, 0x00000D96}, {0x00000D9A, 0x00000DB1}, -{0x00000DB3, 0x00000DBB}, {0x00000DBD, 0x00000DBD}, {0x00000DC0, 0x00000DC6}, {0x00000E01, 0x00000E30}, -{0x00000E32, 0x00000E33}, {0x00000E40, 0x00000E46}, {0x00000E81, 0x00000E82}, {0x00000E84, 0x00000E84}, -{0x00000E86, 0x00000E8A}, {0x00000E8C, 0x00000EA3}, {0x00000EA5, 0x00000EA5}, {0x00000EA7, 0x00000EB0}, -{0x00000EB2, 0x00000EB3}, {0x00000EBD, 0x00000EBD}, {0x00000EC0, 0x00000EC4}, {0x00000EC6, 0x00000EC6}, -{0x00000EDC, 0x00000EDF}, {0x00000F00, 0x00000F00}, {0x00000F40, 0x00000F47}, {0x00000F49, 0x00000F6C}, -{0x00000F88, 0x00000F8C}, {0x00001000, 0x0000102A}, {0x0000103F, 0x0000103F}, {0x00001050, 0x00001055}, -{0x0000105A, 0x0000105D}, {0x00001061, 0x00001061}, {0x00001065, 0x00001066}, {0x0000106E, 0x00001070}, -{0x00001075, 0x00001081}, {0x0000108E, 0x0000108E}, {0x000010A0, 0x000010C5}, {0x000010C7, 0x000010C7}, -{0x000010CD, 0x000010CD}, {0x000010D0, 0x000010FA}, {0x000010FC, 0x00001248}, {0x0000124A, 0x0000124D}, -{0x00001250, 0x00001256}, {0x00001258, 0x00001258}, {0x0000125A, 0x0000125D}, {0x00001260, 0x00001288}, -{0x0000128A, 0x0000128D}, {0x00001290, 0x000012B0}, {0x000012B2, 0x000012B5}, {0x000012B8, 0x000012BE}, -{0x000012C0, 0x000012C0}, {0x000012C2, 0x000012C5}, {0x000012C8, 0x000012D6}, {0x000012D8, 0x00001310}, -{0x00001312, 0x00001315}, {0x00001318, 0x0000135A}, {0x00001380, 0x0000138F}, {0x000013A0, 0x000013F5}, -{0x000013F8, 0x000013FD}, {0x00001401, 0x0000166C}, {0x0000166F, 0x0000167F}, {0x00001681, 0x0000169A}, -{0x000016A0, 0x000016EA}, {0x000016F1, 0x000016F8}, {0x00001700, 0x0000170C}, {0x0000170E, 0x00001711}, -{0x00001720, 0x00001731}, {0x00001740, 0x00001751}, {0x00001760, 0x0000176C}, {0x0000176E, 0x00001770}, -{0x00001780, 0x000017B3}, {0x000017D7, 0x000017D7}, {0x000017DC, 0x000017DC}, {0x00001820, 0x00001878}, -{0x00001880, 0x00001884}, {0x00001887, 0x000018A8}, {0x000018AA, 0x000018AA}, {0x000018B0, 0x000018F5}, -{0x00001900, 0x0000191E}, {0x00001950, 0x0000196D}, 
{0x00001970, 0x00001974}, {0x00001980, 0x000019AB}, -{0x000019B0, 0x000019C9}, {0x00001A00, 0x00001A16}, {0x00001A20, 0x00001A54}, {0x00001AA7, 0x00001AA7}, -{0x00001B05, 0x00001B33}, {0x00001B45, 0x00001B4B}, {0x00001B83, 0x00001BA0}, {0x00001BAE, 0x00001BAF}, -{0x00001BBA, 0x00001BE5}, {0x00001C00, 0x00001C23}, {0x00001C4D, 0x00001C4F}, {0x00001C5A, 0x00001C7D}, -{0x00001C80, 0x00001C88}, {0x00001C90, 0x00001CBA}, {0x00001CBD, 0x00001CBF}, {0x00001CE9, 0x00001CEC}, -{0x00001CEE, 0x00001CF3}, {0x00001CF5, 0x00001CF6}, {0x00001CFA, 0x00001CFA}, {0x00001D00, 0x00001DBF}, -{0x00001E00, 0x00001F15}, {0x00001F18, 0x00001F1D}, {0x00001F20, 0x00001F45}, {0x00001F48, 0x00001F4D}, -{0x00001F50, 0x00001F57}, {0x00001F59, 0x00001F59}, {0x00001F5B, 0x00001F5B}, {0x00001F5D, 0x00001F5D}, -{0x00001F5F, 0x00001F7D}, {0x00001F80, 0x00001FB4}, {0x00001FB6, 0x00001FBC}, {0x00001FBE, 0x00001FBE}, -{0x00001FC2, 0x00001FC4}, {0x00001FC6, 0x00001FCC}, {0x00001FD0, 0x00001FD3}, {0x00001FD6, 0x00001FDB}, -{0x00001FE0, 0x00001FEC}, {0x00001FF2, 0x00001FF4}, {0x00001FF6, 0x00001FFC}, {0x00002071, 0x00002071}, -{0x0000207F, 0x0000207F}, {0x00002090, 0x0000209C}, {0x00002102, 0x00002102}, {0x00002107, 0x00002107}, -{0x0000210A, 0x00002113}, {0x00002115, 0x00002115}, {0x00002119, 0x0000211D}, {0x00002124, 0x00002124}, -{0x00002126, 0x00002126}, {0x00002128, 0x00002128}, {0x0000212A, 0x0000212D}, {0x0000212F, 0x00002139}, -{0x0000213C, 0x0000213F}, {0x00002145, 0x00002149}, {0x0000214E, 0x0000214E}, {0x00002183, 0x00002184}, -{0x00002C00, 0x00002C2E}, {0x00002C30, 0x00002C5E}, {0x00002C60, 0x00002CE4}, {0x00002CEB, 0x00002CEE}, -{0x00002CF2, 0x00002CF3}, {0x00002D00, 0x00002D25}, {0x00002D27, 0x00002D27}, {0x00002D2D, 0x00002D2D}, -{0x00002D30, 0x00002D67}, {0x00002D6F, 0x00002D6F}, {0x00002D80, 0x00002D96}, {0x00002DA0, 0x00002DA6}, -{0x00002DA8, 0x00002DAE}, {0x00002DB0, 0x00002DB6}, {0x00002DB8, 0x00002DBE}, {0x00002DC0, 0x00002DC6}, -{0x00002DC8, 0x00002DCE}, {0x00002DD0, 0x00002DD6}, 
{0x00002DD8, 0x00002DDE}, {0x00002E2F, 0x00002E2F}, -{0x00003005, 0x00003006}, {0x00003031, 0x00003035}, {0x0000303B, 0x0000303C}, {0x00003041, 0x00003096}, -{0x0000309D, 0x0000309F}, {0x000030A1, 0x000030FA}, {0x000030FC, 0x000030FF}, {0x00003105, 0x0000312F}, -{0x00003131, 0x0000318E}, {0x000031A0, 0x000031BF}, {0x000031F0, 0x000031FF}, {0x00003400, 0x00004DBF}, -{0x00004E00, 0x00009FFC}, {0x0000A000, 0x0000A48C}, {0x0000A4D0, 0x0000A4FD}, {0x0000A500, 0x0000A60C}, -{0x0000A610, 0x0000A61F}, {0x0000A62A, 0x0000A62B}, {0x0000A640, 0x0000A66E}, {0x0000A67F, 0x0000A69D}, -{0x0000A6A0, 0x0000A6E5}, {0x0000A717, 0x0000A71F}, {0x0000A722, 0x0000A788}, {0x0000A78B, 0x0000A7BF}, -{0x0000A7C2, 0x0000A7CA}, {0x0000A7F5, 0x0000A801}, {0x0000A803, 0x0000A805}, {0x0000A807, 0x0000A80A}, -{0x0000A80C, 0x0000A822}, {0x0000A840, 0x0000A873}, {0x0000A882, 0x0000A8B3}, {0x0000A8F2, 0x0000A8F7}, -{0x0000A8FB, 0x0000A8FB}, {0x0000A8FD, 0x0000A8FE}, {0x0000A90A, 0x0000A925}, {0x0000A930, 0x0000A946}, -{0x0000A960, 0x0000A97C}, {0x0000A984, 0x0000A9B2}, {0x0000A9CF, 0x0000A9CF}, {0x0000A9E0, 0x0000A9E4}, -{0x0000A9E6, 0x0000A9EF}, {0x0000A9FA, 0x0000A9FE}, {0x0000AA00, 0x0000AA28}, {0x0000AA40, 0x0000AA42}, -{0x0000AA44, 0x0000AA4B}, {0x0000AA60, 0x0000AA76}, {0x0000AA7A, 0x0000AA7A}, {0x0000AA7E, 0x0000AAAF}, -{0x0000AAB1, 0x0000AAB1}, {0x0000AAB5, 0x0000AAB6}, {0x0000AAB9, 0x0000AABD}, {0x0000AAC0, 0x0000AAC0}, -{0x0000AAC2, 0x0000AAC2}, {0x0000AADB, 0x0000AADD}, {0x0000AAE0, 0x0000AAEA}, {0x0000AAF2, 0x0000AAF4}, -{0x0000AB01, 0x0000AB06}, {0x0000AB09, 0x0000AB0E}, {0x0000AB11, 0x0000AB16}, {0x0000AB20, 0x0000AB26}, -{0x0000AB28, 0x0000AB2E}, {0x0000AB30, 0x0000AB5A}, {0x0000AB5C, 0x0000AB69}, {0x0000AB70, 0x0000ABE2}, -{0x0000AC00, 0x0000D7A3}, {0x0000D7B0, 0x0000D7C6}, {0x0000D7CB, 0x0000D7FB}, {0x0000F900, 0x0000FA6D}, -{0x0000FA70, 0x0000FAD9}, {0x0000FB00, 0x0000FB06}, {0x0000FB13, 0x0000FB17}, {0x0000FB1D, 0x0000FB1D}, -{0x0000FB1F, 0x0000FB28}, {0x0000FB2A, 0x0000FB36}, 
{0x0000FB38, 0x0000FB3C}, {0x0000FB3E, 0x0000FB3E}, -{0x0000FB40, 0x0000FB41}, {0x0000FB43, 0x0000FB44}, {0x0000FB46, 0x0000FBB1}, {0x0000FBD3, 0x0000FD3D}, -{0x0000FD50, 0x0000FD8F}, {0x0000FD92, 0x0000FDC7}, {0x0000FDF0, 0x0000FDFB}, {0x0000FE70, 0x0000FE74}, -{0x0000FE76, 0x0000FEFC}, {0x0000FF21, 0x0000FF3A}, {0x0000FF41, 0x0000FF5A}, {0x0000FF66, 0x0000FFBE}, -{0x0000FFC2, 0x0000FFC7}, {0x0000FFCA, 0x0000FFCF}, {0x0000FFD2, 0x0000FFD7}, {0x0000FFDA, 0x0000FFDC}, -{0x00010000, 0x0001000B}, {0x0001000D, 0x00010026}, {0x00010028, 0x0001003A}, {0x0001003C, 0x0001003D}, -{0x0001003F, 0x0001004D}, {0x00010050, 0x0001005D}, {0x00010080, 0x000100FA}, {0x00010280, 0x0001029C}, -{0x000102A0, 0x000102D0}, {0x00010300, 0x0001031F}, {0x0001032D, 0x00010340}, {0x00010342, 0x00010349}, -{0x00010350, 0x00010375}, {0x00010380, 0x0001039D}, {0x000103A0, 0x000103C3}, {0x000103C8, 0x000103CF}, -{0x00010400, 0x0001049D}, {0x000104B0, 0x000104D3}, {0x000104D8, 0x000104FB}, {0x00010500, 0x00010527}, -{0x00010530, 0x00010563}, {0x00010600, 0x00010736}, {0x00010740, 0x00010755}, {0x00010760, 0x00010767}, -{0x00010800, 0x00010805}, {0x00010808, 0x00010808}, {0x0001080A, 0x00010835}, {0x00010837, 0x00010838}, -{0x0001083C, 0x0001083C}, {0x0001083F, 0x00010855}, {0x00010860, 0x00010876}, {0x00010880, 0x0001089E}, -{0x000108E0, 0x000108F2}, {0x000108F4, 0x000108F5}, {0x00010900, 0x00010915}, {0x00010920, 0x00010939}, -{0x00010980, 0x000109B7}, {0x000109BE, 0x000109BF}, {0x00010A00, 0x00010A00}, {0x00010A10, 0x00010A13}, -{0x00010A15, 0x00010A17}, {0x00010A19, 0x00010A35}, {0x00010A60, 0x00010A7C}, {0x00010A80, 0x00010A9C}, -{0x00010AC0, 0x00010AC7}, {0x00010AC9, 0x00010AE4}, {0x00010B00, 0x00010B35}, {0x00010B40, 0x00010B55}, -{0x00010B60, 0x00010B72}, {0x00010B80, 0x00010B91}, {0x00010C00, 0x00010C48}, {0x00010C80, 0x00010CB2}, -{0x00010CC0, 0x00010CF2}, {0x00010D00, 0x00010D23}, {0x00010E80, 0x00010EA9}, {0x00010EB0, 0x00010EB1}, -{0x00010F00, 0x00010F1C}, {0x00010F27, 0x00010F27}, 
{0x00010F30, 0x00010F45}, {0x00010FB0, 0x00010FC4}, -{0x00010FE0, 0x00010FF6}, {0x00011003, 0x00011037}, {0x00011083, 0x000110AF}, {0x000110D0, 0x000110E8}, -{0x00011103, 0x00011126}, {0x00011144, 0x00011144}, {0x00011147, 0x00011147}, {0x00011150, 0x00011172}, -{0x00011176, 0x00011176}, {0x00011183, 0x000111B2}, {0x000111C1, 0x000111C4}, {0x000111DA, 0x000111DA}, -{0x000111DC, 0x000111DC}, {0x00011200, 0x00011211}, {0x00011213, 0x0001122B}, {0x00011280, 0x00011286}, -{0x00011288, 0x00011288}, {0x0001128A, 0x0001128D}, {0x0001128F, 0x0001129D}, {0x0001129F, 0x000112A8}, -{0x000112B0, 0x000112DE}, {0x00011305, 0x0001130C}, {0x0001130F, 0x00011310}, {0x00011313, 0x00011328}, -{0x0001132A, 0x00011330}, {0x00011332, 0x00011333}, {0x00011335, 0x00011339}, {0x0001133D, 0x0001133D}, -{0x00011350, 0x00011350}, {0x0001135D, 0x00011361}, {0x00011400, 0x00011434}, {0x00011447, 0x0001144A}, -{0x0001145F, 0x00011461}, {0x00011480, 0x000114AF}, {0x000114C4, 0x000114C5}, {0x000114C7, 0x000114C7}, -{0x00011580, 0x000115AE}, {0x000115D8, 0x000115DB}, {0x00011600, 0x0001162F}, {0x00011644, 0x00011644}, -{0x00011680, 0x000116AA}, {0x000116B8, 0x000116B8}, {0x00011700, 0x0001171A}, {0x00011800, 0x0001182B}, -{0x000118A0, 0x000118DF}, {0x000118FF, 0x00011906}, {0x00011909, 0x00011909}, {0x0001190C, 0x00011913}, -{0x00011915, 0x00011916}, {0x00011918, 0x0001192F}, {0x0001193F, 0x0001193F}, {0x00011941, 0x00011941}, -{0x000119A0, 0x000119A7}, {0x000119AA, 0x000119D0}, {0x000119E1, 0x000119E1}, {0x000119E3, 0x000119E3}, -{0x00011A00, 0x00011A00}, {0x00011A0B, 0x00011A32}, {0x00011A3A, 0x00011A3A}, {0x00011A50, 0x00011A50}, -{0x00011A5C, 0x00011A89}, {0x00011A9D, 0x00011A9D}, {0x00011AC0, 0x00011AF8}, {0x00011C00, 0x00011C08}, -{0x00011C0A, 0x00011C2E}, {0x00011C40, 0x00011C40}, {0x00011C72, 0x00011C8F}, {0x00011D00, 0x00011D06}, -{0x00011D08, 0x00011D09}, {0x00011D0B, 0x00011D30}, {0x00011D46, 0x00011D46}, {0x00011D60, 0x00011D65}, -{0x00011D67, 0x00011D68}, {0x00011D6A, 0x00011D89}, 
{0x00011D98, 0x00011D98}, {0x00011EE0, 0x00011EF2}, -{0x00011FB0, 0x00011FB0}, {0x00012000, 0x00012399}, {0x00012480, 0x00012543}, {0x00013000, 0x0001342E}, -{0x00014400, 0x00014646}, {0x00016800, 0x00016A38}, {0x00016A40, 0x00016A5E}, {0x00016AD0, 0x00016AED}, -{0x00016B00, 0x00016B2F}, {0x00016B40, 0x00016B43}, {0x00016B63, 0x00016B77}, {0x00016B7D, 0x00016B8F}, -{0x00016E40, 0x00016E7F}, {0x00016F00, 0x00016F4A}, {0x00016F50, 0x00016F50}, {0x00016F93, 0x00016F9F}, -{0x00016FE0, 0x00016FE1}, {0x00016FE3, 0x00016FE3}, {0x00017000, 0x000187F7}, {0x00018800, 0x00018CD5}, -{0x00018D00, 0x00018D08}, {0x0001B000, 0x0001B11E}, {0x0001B150, 0x0001B152}, {0x0001B164, 0x0001B167}, -{0x0001B170, 0x0001B2FB}, {0x0001BC00, 0x0001BC6A}, {0x0001BC70, 0x0001BC7C}, {0x0001BC80, 0x0001BC88}, -{0x0001BC90, 0x0001BC99}, {0x0001D400, 0x0001D454}, {0x0001D456, 0x0001D49C}, {0x0001D49E, 0x0001D49F}, -{0x0001D4A2, 0x0001D4A2}, {0x0001D4A5, 0x0001D4A6}, {0x0001D4A9, 0x0001D4AC}, {0x0001D4AE, 0x0001D4B9}, -{0x0001D4BB, 0x0001D4BB}, {0x0001D4BD, 0x0001D4C3}, {0x0001D4C5, 0x0001D505}, {0x0001D507, 0x0001D50A}, -{0x0001D50D, 0x0001D514}, {0x0001D516, 0x0001D51C}, {0x0001D51E, 0x0001D539}, {0x0001D53B, 0x0001D53E}, -{0x0001D540, 0x0001D544}, {0x0001D546, 0x0001D546}, {0x0001D54A, 0x0001D550}, {0x0001D552, 0x0001D6A5}, -{0x0001D6A8, 0x0001D6C0}, {0x0001D6C2, 0x0001D6DA}, {0x0001D6DC, 0x0001D6FA}, {0x0001D6FC, 0x0001D714}, -{0x0001D716, 0x0001D734}, {0x0001D736, 0x0001D74E}, {0x0001D750, 0x0001D76E}, {0x0001D770, 0x0001D788}, -{0x0001D78A, 0x0001D7A8}, {0x0001D7AA, 0x0001D7C2}, {0x0001D7C4, 0x0001D7CB}, {0x0001E100, 0x0001E12C}, -{0x0001E137, 0x0001E13D}, {0x0001E14E, 0x0001E14E}, {0x0001E2C0, 0x0001E2EB}, {0x0001E800, 0x0001E8C4}, -{0x0001E900, 0x0001E943}, {0x0001E94B, 0x0001E94B}, {0x0001EE00, 0x0001EE03}, {0x0001EE05, 0x0001EE1F}, -{0x0001EE21, 0x0001EE22}, {0x0001EE24, 0x0001EE24}, {0x0001EE27, 0x0001EE27}, {0x0001EE29, 0x0001EE32}, -{0x0001EE34, 0x0001EE37}, {0x0001EE39, 0x0001EE39}, 
{0x0001EE3B, 0x0001EE3B}, {0x0001EE42, 0x0001EE42}, -{0x0001EE47, 0x0001EE47}, {0x0001EE49, 0x0001EE49}, {0x0001EE4B, 0x0001EE4B}, {0x0001EE4D, 0x0001EE4F}, -{0x0001EE51, 0x0001EE52}, {0x0001EE54, 0x0001EE54}, {0x0001EE57, 0x0001EE57}, {0x0001EE59, 0x0001EE59}, -{0x0001EE5B, 0x0001EE5B}, {0x0001EE5D, 0x0001EE5D}, {0x0001EE5F, 0x0001EE5F}, {0x0001EE61, 0x0001EE62}, -{0x0001EE64, 0x0001EE64}, {0x0001EE67, 0x0001EE6A}, {0x0001EE6C, 0x0001EE72}, {0x0001EE74, 0x0001EE77}, -{0x0001EE79, 0x0001EE7C}, {0x0001EE7E, 0x0001EE7E}, {0x0001EE80, 0x0001EE89}, {0x0001EE8B, 0x0001EE9B}, -{0x0001EEA1, 0x0001EEA3}, {0x0001EEA5, 0x0001EEA9}, {0x0001EEAB, 0x0001EEBB}, {0x00020000, 0x0002A6DD}, -{0x0002A700, 0x0002B734}, {0x0002B740, 0x0002B81D}, {0x0002B820, 0x0002CEA1}, {0x0002CEB0, 0x0002EBE0}, -{0x0002F800, 0x0002FA1D}, {0x00030000, 0x0003134A}, -}; - -const std::vector> unicode_ranges_whitespace = { -{0x00000009, 0x0000000D}, {0x0000001C, 0x00000020}, {0x00000085, 0x00000085}, {0x000000A0, 0x000000A0}, -{0x00001680, 0x00001680}, {0x00002000, 0x0000200A}, {0x00002028, 0x00002029}, {0x0000202F, 0x0000202F}, -{0x0000205F, 0x0000205F}, {0x00003000, 0x00003000}, -}; - -const std::vector> unicode_ranges_accent_mark = { -{0x00000300, 0x0000036F}, {0x00000483, 0x00000489}, {0x00000591, 0x000005BD}, {0x000005BF, 0x000005BF}, -{0x000005C1, 0x000005C2}, {0x000005C4, 0x000005C5}, {0x000005C7, 0x000005C7}, {0x00000610, 0x0000061A}, -{0x0000064B, 0x0000065F}, {0x00000670, 0x00000670}, {0x000006D6, 0x000006DC}, {0x000006DF, 0x000006E4}, -{0x000006E7, 0x000006E8}, {0x000006EA, 0x000006ED}, {0x00000711, 0x00000711}, {0x00000730, 0x0000074A}, -{0x000007A6, 0x000007B0}, {0x000007EB, 0x000007F3}, {0x000007FD, 0x000007FD}, {0x00000816, 0x00000819}, -{0x0000081B, 0x00000823}, {0x00000825, 0x00000827}, {0x00000829, 0x0000082D}, {0x00000859, 0x0000085B}, -{0x000008D3, 0x000008E1}, {0x000008E3, 0x00000903}, {0x0000093A, 0x0000093C}, {0x0000093E, 0x0000094F}, -{0x00000951, 0x00000957}, {0x00000962, 
0x00000963}, {0x00000981, 0x00000983}, {0x000009BC, 0x000009BC}, -{0x000009BE, 0x000009C4}, {0x000009C7, 0x000009C8}, {0x000009CB, 0x000009CD}, {0x000009D7, 0x000009D7}, -{0x000009E2, 0x000009E3}, {0x000009FE, 0x000009FE}, {0x00000A01, 0x00000A03}, {0x00000A3C, 0x00000A3C}, -{0x00000A3E, 0x00000A42}, {0x00000A47, 0x00000A48}, {0x00000A4B, 0x00000A4D}, {0x00000A51, 0x00000A51}, -{0x00000A70, 0x00000A71}, {0x00000A75, 0x00000A75}, {0x00000A81, 0x00000A83}, {0x00000ABC, 0x00000ABC}, -{0x00000ABE, 0x00000AC5}, {0x00000AC7, 0x00000AC9}, {0x00000ACB, 0x00000ACD}, {0x00000AE2, 0x00000AE3}, -{0x00000AFA, 0x00000AFF}, {0x00000B01, 0x00000B03}, {0x00000B3C, 0x00000B3C}, {0x00000B3E, 0x00000B44}, -{0x00000B47, 0x00000B48}, {0x00000B4B, 0x00000B4D}, {0x00000B55, 0x00000B57}, {0x00000B62, 0x00000B63}, -{0x00000B82, 0x00000B82}, {0x00000BBE, 0x00000BC2}, {0x00000BC6, 0x00000BC8}, {0x00000BCA, 0x00000BCD}, -{0x00000BD7, 0x00000BD7}, {0x00000C00, 0x00000C04}, {0x00000C3E, 0x00000C44}, {0x00000C46, 0x00000C48}, -{0x00000C4A, 0x00000C4D}, {0x00000C55, 0x00000C56}, {0x00000C62, 0x00000C63}, {0x00000C81, 0x00000C83}, -{0x00000CBC, 0x00000CBC}, {0x00000CBE, 0x00000CC4}, {0x00000CC6, 0x00000CC8}, {0x00000CCA, 0x00000CCD}, -{0x00000CD5, 0x00000CD6}, {0x00000CE2, 0x00000CE3}, {0x00000D00, 0x00000D03}, {0x00000D3B, 0x00000D3C}, -{0x00000D3E, 0x00000D44}, {0x00000D46, 0x00000D48}, {0x00000D4A, 0x00000D4D}, {0x00000D57, 0x00000D57}, -{0x00000D62, 0x00000D63}, {0x00000D81, 0x00000D83}, {0x00000DCA, 0x00000DCA}, {0x00000DCF, 0x00000DD4}, -{0x00000DD6, 0x00000DD6}, {0x00000DD8, 0x00000DDF}, {0x00000DF2, 0x00000DF3}, {0x00000E31, 0x00000E31}, -{0x00000E34, 0x00000E3A}, {0x00000E47, 0x00000E4E}, {0x00000EB1, 0x00000EB1}, {0x00000EB4, 0x00000EBC}, -{0x00000EC8, 0x00000ECD}, {0x00000F18, 0x00000F19}, {0x00000F35, 0x00000F35}, {0x00000F37, 0x00000F37}, -{0x00000F39, 0x00000F39}, {0x00000F3E, 0x00000F3F}, {0x00000F71, 0x00000F84}, {0x00000F86, 0x00000F87}, -{0x00000F8D, 0x00000F97}, {0x00000F99, 
0x00000FBC}, {0x00000FC6, 0x00000FC6}, {0x0000102B, 0x0000103E}, -{0x00001056, 0x00001059}, {0x0000105E, 0x00001060}, {0x00001062, 0x00001064}, {0x00001067, 0x0000106D}, -{0x00001071, 0x00001074}, {0x00001082, 0x0000108D}, {0x0000108F, 0x0000108F}, {0x0000109A, 0x0000109D}, -{0x0000135D, 0x0000135F}, {0x00001712, 0x00001714}, {0x00001732, 0x00001734}, {0x00001752, 0x00001753}, -{0x00001772, 0x00001773}, {0x000017B4, 0x000017D3}, {0x000017DD, 0x000017DD}, {0x0000180B, 0x0000180D}, -{0x00001885, 0x00001886}, {0x000018A9, 0x000018A9}, {0x00001920, 0x0000192B}, {0x00001930, 0x0000193B}, -{0x00001A17, 0x00001A1B}, {0x00001A55, 0x00001A5E}, {0x00001A60, 0x00001A7C}, {0x00001A7F, 0x00001A7F}, -{0x00001AB0, 0x00001AC0}, {0x00001B00, 0x00001B04}, {0x00001B34, 0x00001B44}, {0x00001B6B, 0x00001B73}, -{0x00001B80, 0x00001B82}, {0x00001BA1, 0x00001BAD}, {0x00001BE6, 0x00001BF3}, {0x00001C24, 0x00001C37}, -{0x00001CD0, 0x00001CD2}, {0x00001CD4, 0x00001CE8}, {0x00001CED, 0x00001CED}, {0x00001CF4, 0x00001CF4}, -{0x00001CF7, 0x00001CF9}, {0x00001DC0, 0x00001DF9}, {0x00001DFB, 0x00001DFF}, {0x000020D0, 0x000020F0}, -{0x00002CEF, 0x00002CF1}, {0x00002D7F, 0x00002D7F}, {0x00002DE0, 0x00002DFF}, {0x0000302A, 0x0000302F}, -{0x00003099, 0x0000309A}, {0x0000A66F, 0x0000A672}, {0x0000A674, 0x0000A67D}, {0x0000A69E, 0x0000A69F}, -{0x0000A6F0, 0x0000A6F1}, {0x0000A802, 0x0000A802}, {0x0000A806, 0x0000A806}, {0x0000A80B, 0x0000A80B}, -{0x0000A823, 0x0000A827}, {0x0000A82C, 0x0000A82C}, {0x0000A880, 0x0000A881}, {0x0000A8B4, 0x0000A8C5}, -{0x0000A8E0, 0x0000A8F1}, {0x0000A8FF, 0x0000A8FF}, {0x0000A926, 0x0000A92D}, {0x0000A947, 0x0000A953}, -{0x0000A980, 0x0000A983}, {0x0000A9B3, 0x0000A9C0}, {0x0000A9E5, 0x0000A9E5}, {0x0000AA29, 0x0000AA36}, -{0x0000AA43, 0x0000AA43}, {0x0000AA4C, 0x0000AA4D}, {0x0000AA7B, 0x0000AA7D}, {0x0000AAB0, 0x0000AAB0}, -{0x0000AAB2, 0x0000AAB4}, {0x0000AAB7, 0x0000AAB8}, {0x0000AABE, 0x0000AABF}, {0x0000AAC1, 0x0000AAC1}, -{0x0000AAEB, 0x0000AAEF}, {0x0000AAF5, 
0x0000AAF6}, {0x0000ABE3, 0x0000ABEA}, {0x0000ABEC, 0x0000ABED}, -{0x0000FB1E, 0x0000FB1E}, {0x0000FE00, 0x0000FE0F}, {0x0000FE20, 0x0000FE2F}, {0x000101FD, 0x000101FD}, -{0x000102E0, 0x000102E0}, {0x00010376, 0x0001037A}, {0x00010A01, 0x00010A03}, {0x00010A05, 0x00010A06}, -{0x00010A0C, 0x00010A0F}, {0x00010A38, 0x00010A3A}, {0x00010A3F, 0x00010A3F}, {0x00010AE5, 0x00010AE6}, -{0x00010D24, 0x00010D27}, {0x00010EAB, 0x00010EAC}, {0x00010F46, 0x00010F50}, {0x00011000, 0x00011002}, -{0x00011038, 0x00011046}, {0x0001107F, 0x00011082}, {0x000110B0, 0x000110BA}, {0x00011100, 0x00011102}, -{0x00011127, 0x00011134}, {0x00011145, 0x00011146}, {0x00011173, 0x00011173}, {0x00011180, 0x00011182}, -{0x000111B3, 0x000111C0}, {0x000111C9, 0x000111CC}, {0x000111CE, 0x000111CF}, {0x0001122C, 0x00011237}, -{0x0001123E, 0x0001123E}, {0x000112DF, 0x000112EA}, {0x00011300, 0x00011303}, {0x0001133B, 0x0001133C}, -{0x0001133E, 0x00011344}, {0x00011347, 0x00011348}, {0x0001134B, 0x0001134D}, {0x00011357, 0x00011357}, -{0x00011362, 0x00011363}, {0x00011366, 0x0001136C}, {0x00011370, 0x00011374}, {0x00011435, 0x00011446}, -{0x0001145E, 0x0001145E}, {0x000114B0, 0x000114C3}, {0x000115AF, 0x000115B5}, {0x000115B8, 0x000115C0}, -{0x000115DC, 0x000115DD}, {0x00011630, 0x00011640}, {0x000116AB, 0x000116B7}, {0x0001171D, 0x0001172B}, -{0x0001182C, 0x0001183A}, {0x00011930, 0x00011935}, {0x00011937, 0x00011938}, {0x0001193B, 0x0001193E}, -{0x00011940, 0x00011940}, {0x00011942, 0x00011943}, {0x000119D1, 0x000119D7}, {0x000119DA, 0x000119E0}, -{0x000119E4, 0x000119E4}, {0x00011A01, 0x00011A0A}, {0x00011A33, 0x00011A39}, {0x00011A3B, 0x00011A3E}, -{0x00011A47, 0x00011A47}, {0x00011A51, 0x00011A5B}, {0x00011A8A, 0x00011A99}, {0x00011C2F, 0x00011C36}, -{0x00011C38, 0x00011C3F}, {0x00011C92, 0x00011CA7}, {0x00011CA9, 0x00011CB6}, {0x00011D31, 0x00011D36}, -{0x00011D3A, 0x00011D3A}, {0x00011D3C, 0x00011D3D}, {0x00011D3F, 0x00011D45}, {0x00011D47, 0x00011D47}, -{0x00011D8A, 0x00011D8E}, {0x00011D90, 
0x00011D91}, {0x00011D93, 0x00011D97}, {0x00011EF3, 0x00011EF6}, -{0x00016AF0, 0x00016AF4}, {0x00016B30, 0x00016B36}, {0x00016F4F, 0x00016F4F}, {0x00016F51, 0x00016F87}, -{0x00016F8F, 0x00016F92}, {0x00016FE4, 0x00016FE4}, {0x00016FF0, 0x00016FF1}, {0x0001BC9D, 0x0001BC9E}, -{0x0001D165, 0x0001D169}, {0x0001D16D, 0x0001D172}, {0x0001D17B, 0x0001D182}, {0x0001D185, 0x0001D18B}, -{0x0001D1AA, 0x0001D1AD}, {0x0001D242, 0x0001D244}, {0x0001DA00, 0x0001DA36}, {0x0001DA3B, 0x0001DA6C}, -{0x0001DA75, 0x0001DA75}, {0x0001DA84, 0x0001DA84}, {0x0001DA9B, 0x0001DA9F}, {0x0001DAA1, 0x0001DAAF}, -{0x0001E000, 0x0001E006}, {0x0001E008, 0x0001E018}, {0x0001E01B, 0x0001E021}, {0x0001E023, 0x0001E024}, -{0x0001E026, 0x0001E02A}, {0x0001E130, 0x0001E136}, {0x0001E2EC, 0x0001E2EF}, {0x0001E8D0, 0x0001E8D6}, -{0x0001E944, 0x0001E94A}, {0x000E0100, 0x000E01EF}, +const std::vector> unicode_ranges_flags = { // start, flags // last=next_start-1 +{0x000000, 0x0080}, +{0x000020, 0x0008}, +{0x000021, 0x0020}, +{0x000024, 0x0040}, +{0x000025, 0x0020}, +{0x00002B, 0x0040}, +{0x00002C, 0x0020}, +{0x000030, 0x0002}, +{0x00003A, 0x0020}, +{0x00003C, 0x0040}, +{0x00003F, 0x0020}, +{0x000041, 0x0004}, +{0x00005B, 0x0020}, +{0x00005E, 0x0040}, +{0x00005F, 0x0020}, +{0x000060, 0x0040}, +{0x000061, 0x0004}, +{0x00007B, 0x0020}, +{0x00007C, 0x0040}, +{0x00007D, 0x0020}, +{0x00007E, 0x0040}, +{0x00007F, 0x0080}, +{0x0000A0, 0x0008}, +{0x0000A1, 0x0020}, +{0x0000A2, 0x0040}, +{0x0000A7, 0x0020}, +{0x0000A8, 0x0040}, +{0x0000AA, 0x0004}, +{0x0000AB, 0x0020}, +{0x0000AC, 0x0040}, +{0x0000AD, 0x0080}, +{0x0000AE, 0x0040}, +{0x0000B2, 0x0002}, +{0x0000B4, 0x0040}, +{0x0000B5, 0x0004}, +{0x0000B6, 0x0020}, +{0x0000B8, 0x0040}, +{0x0000B9, 0x0002}, +{0x0000BA, 0x0004}, +{0x0000BB, 0x0020}, +{0x0000BC, 0x0002}, +{0x0000BF, 0x0020}, +{0x0000C0, 0x0004}, +{0x0000D7, 0x0040}, +{0x0000D8, 0x0004}, +{0x0000F7, 0x0040}, +{0x0000F8, 0x0004}, +{0x0002C2, 0x0040}, +{0x0002C6, 0x0004}, +{0x0002D2, 0x0040}, +{0x0002E0, 
0x0004}, +{0x0002E5, 0x0040}, +{0x0002EC, 0x0004}, +{0x0002ED, 0x0040}, +{0x0002EE, 0x0004}, +{0x0002EF, 0x0040}, +{0x000300, 0x0010}, +{0x000370, 0x0004}, +{0x000375, 0x0040}, +{0x000376, 0x0004}, +{0x000378, 0x0080}, +{0x00037A, 0x0004}, +{0x00037E, 0x0020}, +{0x00037F, 0x0004}, +{0x000380, 0x0080}, +{0x000384, 0x0040}, +{0x000386, 0x0004}, +{0x000387, 0x0020}, +{0x000388, 0x0004}, +{0x00038B, 0x0080}, +{0x00038C, 0x0004}, +{0x00038D, 0x0080}, +{0x00038E, 0x0004}, +{0x0003A2, 0x0080}, +{0x0003A3, 0x0004}, +{0x0003F6, 0x0040}, +{0x0003F7, 0x0004}, +{0x000482, 0x0040}, +{0x000483, 0x0010}, +{0x00048A, 0x0004}, +{0x000530, 0x0080}, +{0x000531, 0x0004}, +{0x000557, 0x0080}, +{0x000559, 0x0004}, +{0x00055A, 0x0020}, +{0x000560, 0x0004}, +{0x000589, 0x0020}, +{0x00058B, 0x0080}, +{0x00058D, 0x0040}, +{0x000590, 0x0080}, +{0x000591, 0x0010}, +{0x0005BE, 0x0020}, +{0x0005BF, 0x0010}, +{0x0005C0, 0x0020}, +{0x0005C1, 0x0010}, +{0x0005C3, 0x0020}, +{0x0005C4, 0x0010}, +{0x0005C6, 0x0020}, +{0x0005C7, 0x0010}, +{0x0005C8, 0x0080}, +{0x0005D0, 0x0004}, +{0x0005EB, 0x0080}, +{0x0005EF, 0x0004}, +{0x0005F3, 0x0020}, +{0x0005F5, 0x0080}, +{0x000606, 0x0040}, +{0x000609, 0x0020}, +{0x00060B, 0x0040}, +{0x00060C, 0x0020}, +{0x00060E, 0x0040}, +{0x000610, 0x0010}, +{0x00061B, 0x0020}, +{0x00061C, 0x0080}, +{0x00061D, 0x0020}, +{0x000620, 0x0004}, +{0x00064B, 0x0010}, +{0x000660, 0x0002}, +{0x00066A, 0x0020}, +{0x00066E, 0x0004}, +{0x000670, 0x0010}, +{0x000671, 0x0004}, +{0x0006D4, 0x0020}, +{0x0006D5, 0x0004}, +{0x0006D6, 0x0010}, +{0x0006DD, 0x0080}, +{0x0006DE, 0x0040}, +{0x0006DF, 0x0010}, +{0x0006E5, 0x0004}, +{0x0006E7, 0x0010}, +{0x0006E9, 0x0040}, +{0x0006EA, 0x0010}, +{0x0006EE, 0x0004}, +{0x0006F0, 0x0002}, +{0x0006FA, 0x0004}, +{0x0006FD, 0x0040}, +{0x0006FF, 0x0004}, +{0x000700, 0x0020}, +{0x00070E, 0x0080}, +{0x000710, 0x0004}, +{0x000711, 0x0010}, +{0x000712, 0x0004}, +{0x000730, 0x0010}, +{0x00074B, 0x0080}, +{0x00074D, 0x0004}, +{0x0007A6, 0x0010}, +{0x0007B1, 
0x0004}, +{0x0007B2, 0x0080}, +{0x0007C0, 0x0002}, +{0x0007CA, 0x0004}, +{0x0007EB, 0x0010}, +{0x0007F4, 0x0004}, +{0x0007F6, 0x0040}, +{0x0007F7, 0x0020}, +{0x0007FA, 0x0004}, +{0x0007FB, 0x0080}, +{0x0007FD, 0x0010}, +{0x0007FE, 0x0040}, +{0x000800, 0x0004}, +{0x000816, 0x0010}, +{0x00081A, 0x0004}, +{0x00081B, 0x0010}, +{0x000824, 0x0004}, +{0x000825, 0x0010}, +{0x000828, 0x0004}, +{0x000829, 0x0010}, +{0x00082E, 0x0080}, +{0x000830, 0x0020}, +{0x00083F, 0x0080}, +{0x000840, 0x0004}, +{0x000859, 0x0010}, +{0x00085C, 0x0080}, +{0x00085E, 0x0020}, +{0x00085F, 0x0080}, +{0x000860, 0x0004}, +{0x00086B, 0x0080}, +{0x000870, 0x0004}, +{0x000888, 0x0040}, +{0x000889, 0x0004}, +{0x00088F, 0x0080}, +{0x000898, 0x0010}, +{0x0008A0, 0x0004}, +{0x0008CA, 0x0010}, +{0x0008E2, 0x0080}, +{0x0008E3, 0x0010}, +{0x000904, 0x0004}, +{0x00093A, 0x0010}, +{0x00093D, 0x0004}, +{0x00093E, 0x0010}, +{0x000950, 0x0004}, +{0x000951, 0x0010}, +{0x000958, 0x0004}, +{0x000962, 0x0010}, +{0x000964, 0x0020}, +{0x000966, 0x0002}, +{0x000970, 0x0020}, +{0x000971, 0x0004}, +{0x000981, 0x0010}, +{0x000984, 0x0080}, +{0x000985, 0x0004}, +{0x00098D, 0x0080}, +{0x00098F, 0x0004}, +{0x000991, 0x0080}, +{0x000993, 0x0004}, +{0x0009A9, 0x0080}, +{0x0009AA, 0x0004}, +{0x0009B1, 0x0080}, +{0x0009B2, 0x0004}, +{0x0009B3, 0x0080}, +{0x0009B6, 0x0004}, +{0x0009BA, 0x0080}, +{0x0009BC, 0x0010}, +{0x0009BD, 0x0004}, +{0x0009BE, 0x0010}, +{0x0009C5, 0x0080}, +{0x0009C7, 0x0010}, +{0x0009C9, 0x0080}, +{0x0009CB, 0x0010}, +{0x0009CE, 0x0004}, +{0x0009CF, 0x0080}, +{0x0009D7, 0x0010}, +{0x0009D8, 0x0080}, +{0x0009DC, 0x0004}, +{0x0009DE, 0x0080}, +{0x0009DF, 0x0004}, +{0x0009E2, 0x0010}, +{0x0009E4, 0x0080}, +{0x0009E6, 0x0002}, +{0x0009F0, 0x0004}, +{0x0009F2, 0x0040}, +{0x0009F4, 0x0002}, +{0x0009FA, 0x0040}, +{0x0009FC, 0x0004}, +{0x0009FD, 0x0020}, +{0x0009FE, 0x0010}, +{0x0009FF, 0x0080}, +{0x000A01, 0x0010}, +{0x000A04, 0x0080}, +{0x000A05, 0x0004}, +{0x000A0B, 0x0080}, +{0x000A0F, 0x0004}, +{0x000A11, 
0x0080}, +{0x000A13, 0x0004}, +{0x000A29, 0x0080}, +{0x000A2A, 0x0004}, +{0x000A31, 0x0080}, +{0x000A32, 0x0004}, +{0x000A34, 0x0080}, +{0x000A35, 0x0004}, +{0x000A37, 0x0080}, +{0x000A38, 0x0004}, +{0x000A3A, 0x0080}, +{0x000A3C, 0x0010}, +{0x000A3D, 0x0080}, +{0x000A3E, 0x0010}, +{0x000A43, 0x0080}, +{0x000A47, 0x0010}, +{0x000A49, 0x0080}, +{0x000A4B, 0x0010}, +{0x000A4E, 0x0080}, +{0x000A51, 0x0010}, +{0x000A52, 0x0080}, +{0x000A59, 0x0004}, +{0x000A5D, 0x0080}, +{0x000A5E, 0x0004}, +{0x000A5F, 0x0080}, +{0x000A66, 0x0002}, +{0x000A70, 0x0010}, +{0x000A72, 0x0004}, +{0x000A75, 0x0010}, +{0x000A76, 0x0020}, +{0x000A77, 0x0080}, +{0x000A81, 0x0010}, +{0x000A84, 0x0080}, +{0x000A85, 0x0004}, +{0x000A8E, 0x0080}, +{0x000A8F, 0x0004}, +{0x000A92, 0x0080}, +{0x000A93, 0x0004}, +{0x000AA9, 0x0080}, +{0x000AAA, 0x0004}, +{0x000AB1, 0x0080}, +{0x000AB2, 0x0004}, +{0x000AB4, 0x0080}, +{0x000AB5, 0x0004}, +{0x000ABA, 0x0080}, +{0x000ABC, 0x0010}, +{0x000ABD, 0x0004}, +{0x000ABE, 0x0010}, +{0x000AC6, 0x0080}, +{0x000AC7, 0x0010}, +{0x000ACA, 0x0080}, +{0x000ACB, 0x0010}, +{0x000ACE, 0x0080}, +{0x000AD0, 0x0004}, +{0x000AD1, 0x0080}, +{0x000AE0, 0x0004}, +{0x000AE2, 0x0010}, +{0x000AE4, 0x0080}, +{0x000AE6, 0x0002}, +{0x000AF0, 0x0020}, +{0x000AF1, 0x0040}, +{0x000AF2, 0x0080}, +{0x000AF9, 0x0004}, +{0x000AFA, 0x0010}, +{0x000B00, 0x0080}, +{0x000B01, 0x0010}, +{0x000B04, 0x0080}, +{0x000B05, 0x0004}, +{0x000B0D, 0x0080}, +{0x000B0F, 0x0004}, +{0x000B11, 0x0080}, +{0x000B13, 0x0004}, +{0x000B29, 0x0080}, +{0x000B2A, 0x0004}, +{0x000B31, 0x0080}, +{0x000B32, 0x0004}, +{0x000B34, 0x0080}, +{0x000B35, 0x0004}, +{0x000B3A, 0x0080}, +{0x000B3C, 0x0010}, +{0x000B3D, 0x0004}, +{0x000B3E, 0x0010}, +{0x000B45, 0x0080}, +{0x000B47, 0x0010}, +{0x000B49, 0x0080}, +{0x000B4B, 0x0010}, +{0x000B4E, 0x0080}, +{0x000B55, 0x0010}, +{0x000B58, 0x0080}, +{0x000B5C, 0x0004}, +{0x000B5E, 0x0080}, +{0x000B5F, 0x0004}, +{0x000B62, 0x0010}, +{0x000B64, 0x0080}, +{0x000B66, 0x0002}, +{0x000B70, 
0x0040}, +{0x000B71, 0x0004}, +{0x000B72, 0x0002}, +{0x000B78, 0x0080}, +{0x000B82, 0x0010}, +{0x000B83, 0x0004}, +{0x000B84, 0x0080}, +{0x000B85, 0x0004}, +{0x000B8B, 0x0080}, +{0x000B8E, 0x0004}, +{0x000B91, 0x0080}, +{0x000B92, 0x0004}, +{0x000B96, 0x0080}, +{0x000B99, 0x0004}, +{0x000B9B, 0x0080}, +{0x000B9C, 0x0004}, +{0x000B9D, 0x0080}, +{0x000B9E, 0x0004}, +{0x000BA0, 0x0080}, +{0x000BA3, 0x0004}, +{0x000BA5, 0x0080}, +{0x000BA8, 0x0004}, +{0x000BAB, 0x0080}, +{0x000BAE, 0x0004}, +{0x000BBA, 0x0080}, +{0x000BBE, 0x0010}, +{0x000BC3, 0x0080}, +{0x000BC6, 0x0010}, +{0x000BC9, 0x0080}, +{0x000BCA, 0x0010}, +{0x000BCE, 0x0080}, +{0x000BD0, 0x0004}, +{0x000BD1, 0x0080}, +{0x000BD7, 0x0010}, +{0x000BD8, 0x0080}, +{0x000BE6, 0x0002}, +{0x000BF3, 0x0040}, +{0x000BFB, 0x0080}, +{0x000C00, 0x0010}, +{0x000C05, 0x0004}, +{0x000C0D, 0x0080}, +{0x000C0E, 0x0004}, +{0x000C11, 0x0080}, +{0x000C12, 0x0004}, +{0x000C29, 0x0080}, +{0x000C2A, 0x0004}, +{0x000C3A, 0x0080}, +{0x000C3C, 0x0010}, +{0x000C3D, 0x0004}, +{0x000C3E, 0x0010}, +{0x000C45, 0x0080}, +{0x000C46, 0x0010}, +{0x000C49, 0x0080}, +{0x000C4A, 0x0010}, +{0x000C4E, 0x0080}, +{0x000C55, 0x0010}, +{0x000C57, 0x0080}, +{0x000C58, 0x0004}, +{0x000C5B, 0x0080}, +{0x000C5D, 0x0004}, +{0x000C5E, 0x0080}, +{0x000C60, 0x0004}, +{0x000C62, 0x0010}, +{0x000C64, 0x0080}, +{0x000C66, 0x0002}, +{0x000C70, 0x0080}, +{0x000C77, 0x0020}, +{0x000C78, 0x0002}, +{0x000C7F, 0x0040}, +{0x000C80, 0x0004}, +{0x000C81, 0x0010}, +{0x000C84, 0x0020}, +{0x000C85, 0x0004}, +{0x000C8D, 0x0080}, +{0x000C8E, 0x0004}, +{0x000C91, 0x0080}, +{0x000C92, 0x0004}, +{0x000CA9, 0x0080}, +{0x000CAA, 0x0004}, +{0x000CB4, 0x0080}, +{0x000CB5, 0x0004}, +{0x000CBA, 0x0080}, +{0x000CBC, 0x0010}, +{0x000CBD, 0x0004}, +{0x000CBE, 0x0010}, +{0x000CC5, 0x0080}, +{0x000CC6, 0x0010}, +{0x000CC9, 0x0080}, +{0x000CCA, 0x0010}, +{0x000CCE, 0x0080}, +{0x000CD5, 0x0010}, +{0x000CD7, 0x0080}, +{0x000CDD, 0x0004}, +{0x000CDF, 0x0080}, +{0x000CE0, 0x0004}, +{0x000CE2, 
0x0010}, +{0x000CE4, 0x0080}, +{0x000CE6, 0x0002}, +{0x000CF0, 0x0080}, +{0x000CF1, 0x0004}, +{0x000CF3, 0x0010}, +{0x000CF4, 0x0080}, +{0x000D00, 0x0010}, +{0x000D04, 0x0004}, +{0x000D0D, 0x0080}, +{0x000D0E, 0x0004}, +{0x000D11, 0x0080}, +{0x000D12, 0x0004}, +{0x000D3B, 0x0010}, +{0x000D3D, 0x0004}, +{0x000D3E, 0x0010}, +{0x000D45, 0x0080}, +{0x000D46, 0x0010}, +{0x000D49, 0x0080}, +{0x000D4A, 0x0010}, +{0x000D4E, 0x0004}, +{0x000D4F, 0x0040}, +{0x000D50, 0x0080}, +{0x000D54, 0x0004}, +{0x000D57, 0x0010}, +{0x000D58, 0x0002}, +{0x000D5F, 0x0004}, +{0x000D62, 0x0010}, +{0x000D64, 0x0080}, +{0x000D66, 0x0002}, +{0x000D79, 0x0040}, +{0x000D7A, 0x0004}, +{0x000D80, 0x0080}, +{0x000D81, 0x0010}, +{0x000D84, 0x0080}, +{0x000D85, 0x0004}, +{0x000D97, 0x0080}, +{0x000D9A, 0x0004}, +{0x000DB2, 0x0080}, +{0x000DB3, 0x0004}, +{0x000DBC, 0x0080}, +{0x000DBD, 0x0004}, +{0x000DBE, 0x0080}, +{0x000DC0, 0x0004}, +{0x000DC7, 0x0080}, +{0x000DCA, 0x0010}, +{0x000DCB, 0x0080}, +{0x000DCF, 0x0010}, +{0x000DD5, 0x0080}, +{0x000DD6, 0x0010}, +{0x000DD7, 0x0080}, +{0x000DD8, 0x0010}, +{0x000DE0, 0x0080}, +{0x000DE6, 0x0002}, +{0x000DF0, 0x0080}, +{0x000DF2, 0x0010}, +{0x000DF4, 0x0020}, +{0x000DF5, 0x0080}, +{0x000E01, 0x0004}, +{0x000E31, 0x0010}, +{0x000E32, 0x0004}, +{0x000E34, 0x0010}, +{0x000E3B, 0x0080}, +{0x000E3F, 0x0040}, +{0x000E40, 0x0004}, +{0x000E47, 0x0010}, +{0x000E4F, 0x0020}, +{0x000E50, 0x0002}, +{0x000E5A, 0x0020}, +{0x000E5C, 0x0080}, +{0x000E81, 0x0004}, +{0x000E83, 0x0080}, +{0x000E84, 0x0004}, +{0x000E85, 0x0080}, +{0x000E86, 0x0004}, +{0x000E8B, 0x0080}, +{0x000E8C, 0x0004}, +{0x000EA4, 0x0080}, +{0x000EA5, 0x0004}, +{0x000EA6, 0x0080}, +{0x000EA7, 0x0004}, +{0x000EB1, 0x0010}, +{0x000EB2, 0x0004}, +{0x000EB4, 0x0010}, +{0x000EBD, 0x0004}, +{0x000EBE, 0x0080}, +{0x000EC0, 0x0004}, +{0x000EC5, 0x0080}, +{0x000EC6, 0x0004}, +{0x000EC7, 0x0080}, +{0x000EC8, 0x0010}, +{0x000ECF, 0x0080}, +{0x000ED0, 0x0002}, +{0x000EDA, 0x0080}, +{0x000EDC, 0x0004}, +{0x000EE0, 
0x0080}, +{0x000F00, 0x0004}, +{0x000F01, 0x0040}, +{0x000F04, 0x0020}, +{0x000F13, 0x0040}, +{0x000F14, 0x0020}, +{0x000F15, 0x0040}, +{0x000F18, 0x0010}, +{0x000F1A, 0x0040}, +{0x000F20, 0x0002}, +{0x000F34, 0x0040}, +{0x000F35, 0x0010}, +{0x000F36, 0x0040}, +{0x000F37, 0x0010}, +{0x000F38, 0x0040}, +{0x000F39, 0x0010}, +{0x000F3A, 0x0020}, +{0x000F3E, 0x0010}, +{0x000F40, 0x0004}, +{0x000F48, 0x0080}, +{0x000F49, 0x0004}, +{0x000F6D, 0x0080}, +{0x000F71, 0x0010}, +{0x000F85, 0x0020}, +{0x000F86, 0x0010}, +{0x000F88, 0x0004}, +{0x000F8D, 0x0010}, +{0x000F98, 0x0080}, +{0x000F99, 0x0010}, +{0x000FBD, 0x0080}, +{0x000FBE, 0x0040}, +{0x000FC6, 0x0010}, +{0x000FC7, 0x0040}, +{0x000FCD, 0x0080}, +{0x000FCE, 0x0040}, +{0x000FD0, 0x0020}, +{0x000FD5, 0x0040}, +{0x000FD9, 0x0020}, +{0x000FDB, 0x0080}, +{0x001000, 0x0004}, +{0x00102B, 0x0010}, +{0x00103F, 0x0004}, +{0x001040, 0x0002}, +{0x00104A, 0x0020}, +{0x001050, 0x0004}, +{0x001056, 0x0010}, +{0x00105A, 0x0004}, +{0x00105E, 0x0010}, +{0x001061, 0x0004}, +{0x001062, 0x0010}, +{0x001065, 0x0004}, +{0x001067, 0x0010}, +{0x00106E, 0x0004}, +{0x001071, 0x0010}, +{0x001075, 0x0004}, +{0x001082, 0x0010}, +{0x00108E, 0x0004}, +{0x00108F, 0x0010}, +{0x001090, 0x0002}, +{0x00109A, 0x0010}, +{0x00109E, 0x0040}, +{0x0010A0, 0x0004}, +{0x0010C6, 0x0080}, +{0x0010C7, 0x0004}, +{0x0010C8, 0x0080}, +{0x0010CD, 0x0004}, +{0x0010CE, 0x0080}, +{0x0010D0, 0x0004}, +{0x0010FB, 0x0020}, +{0x0010FC, 0x0004}, +{0x001249, 0x0080}, +{0x00124A, 0x0004}, +{0x00124E, 0x0080}, +{0x001250, 0x0004}, +{0x001257, 0x0080}, +{0x001258, 0x0004}, +{0x001259, 0x0080}, +{0x00125A, 0x0004}, +{0x00125E, 0x0080}, +{0x001260, 0x0004}, +{0x001289, 0x0080}, +{0x00128A, 0x0004}, +{0x00128E, 0x0080}, +{0x001290, 0x0004}, +{0x0012B1, 0x0080}, +{0x0012B2, 0x0004}, +{0x0012B6, 0x0080}, +{0x0012B8, 0x0004}, +{0x0012BF, 0x0080}, +{0x0012C0, 0x0004}, +{0x0012C1, 0x0080}, +{0x0012C2, 0x0004}, +{0x0012C6, 0x0080}, +{0x0012C8, 0x0004}, +{0x0012D7, 0x0080}, +{0x0012D8, 
0x0004}, +{0x001311, 0x0080}, +{0x001312, 0x0004}, +{0x001316, 0x0080}, +{0x001318, 0x0004}, +{0x00135B, 0x0080}, +{0x00135D, 0x0010}, +{0x001360, 0x0020}, +{0x001369, 0x0002}, +{0x00137D, 0x0080}, +{0x001380, 0x0004}, +{0x001390, 0x0040}, +{0x00139A, 0x0080}, +{0x0013A0, 0x0004}, +{0x0013F6, 0x0080}, +{0x0013F8, 0x0004}, +{0x0013FE, 0x0080}, +{0x001400, 0x0020}, +{0x001401, 0x0004}, +{0x00166D, 0x0040}, +{0x00166E, 0x0020}, +{0x00166F, 0x0004}, +{0x001680, 0x0008}, +{0x001681, 0x0004}, +{0x00169B, 0x0020}, +{0x00169D, 0x0080}, +{0x0016A0, 0x0004}, +{0x0016EB, 0x0020}, +{0x0016EE, 0x0002}, +{0x0016F1, 0x0004}, +{0x0016F9, 0x0080}, +{0x001700, 0x0004}, +{0x001712, 0x0010}, +{0x001716, 0x0080}, +{0x00171F, 0x0004}, +{0x001732, 0x0010}, +{0x001735, 0x0020}, +{0x001737, 0x0080}, +{0x001740, 0x0004}, +{0x001752, 0x0010}, +{0x001754, 0x0080}, +{0x001760, 0x0004}, +{0x00176D, 0x0080}, +{0x00176E, 0x0004}, +{0x001771, 0x0080}, +{0x001772, 0x0010}, +{0x001774, 0x0080}, +{0x001780, 0x0004}, +{0x0017B4, 0x0010}, +{0x0017D4, 0x0020}, +{0x0017D7, 0x0004}, +{0x0017D8, 0x0020}, +{0x0017DB, 0x0040}, +{0x0017DC, 0x0004}, +{0x0017DD, 0x0010}, +{0x0017DE, 0x0080}, +{0x0017E0, 0x0002}, +{0x0017EA, 0x0080}, +{0x0017F0, 0x0002}, +{0x0017FA, 0x0080}, +{0x001800, 0x0020}, +{0x00180B, 0x0010}, +{0x00180E, 0x0080}, +{0x00180F, 0x0010}, +{0x001810, 0x0002}, +{0x00181A, 0x0080}, +{0x001820, 0x0004}, +{0x001879, 0x0080}, +{0x001880, 0x0004}, +{0x001885, 0x0010}, +{0x001887, 0x0004}, +{0x0018A9, 0x0010}, +{0x0018AA, 0x0004}, +{0x0018AB, 0x0080}, +{0x0018B0, 0x0004}, +{0x0018F6, 0x0080}, +{0x001900, 0x0004}, +{0x00191F, 0x0080}, +{0x001920, 0x0010}, +{0x00192C, 0x0080}, +{0x001930, 0x0010}, +{0x00193C, 0x0080}, +{0x001940, 0x0040}, +{0x001941, 0x0080}, +{0x001944, 0x0020}, +{0x001946, 0x0002}, +{0x001950, 0x0004}, +{0x00196E, 0x0080}, +{0x001970, 0x0004}, +{0x001975, 0x0080}, +{0x001980, 0x0004}, +{0x0019AC, 0x0080}, +{0x0019B0, 0x0004}, +{0x0019CA, 0x0080}, +{0x0019D0, 0x0002}, +{0x0019DB, 
0x0080}, +{0x0019DE, 0x0040}, +{0x001A00, 0x0004}, +{0x001A17, 0x0010}, +{0x001A1C, 0x0080}, +{0x001A1E, 0x0020}, +{0x001A20, 0x0004}, +{0x001A55, 0x0010}, +{0x001A5F, 0x0080}, +{0x001A60, 0x0010}, +{0x001A7D, 0x0080}, +{0x001A7F, 0x0010}, +{0x001A80, 0x0002}, +{0x001A8A, 0x0080}, +{0x001A90, 0x0002}, +{0x001A9A, 0x0080}, +{0x001AA0, 0x0020}, +{0x001AA7, 0x0004}, +{0x001AA8, 0x0020}, +{0x001AAE, 0x0080}, +{0x001AB0, 0x0010}, +{0x001ACF, 0x0080}, +{0x001B00, 0x0010}, +{0x001B05, 0x0004}, +{0x001B34, 0x0010}, +{0x001B45, 0x0004}, +{0x001B4D, 0x0080}, +{0x001B50, 0x0002}, +{0x001B5A, 0x0020}, +{0x001B61, 0x0040}, +{0x001B6B, 0x0010}, +{0x001B74, 0x0040}, +{0x001B7D, 0x0020}, +{0x001B7F, 0x0080}, +{0x001B80, 0x0010}, +{0x001B83, 0x0004}, +{0x001BA1, 0x0010}, +{0x001BAE, 0x0004}, +{0x001BB0, 0x0002}, +{0x001BBA, 0x0004}, +{0x001BE6, 0x0010}, +{0x001BF4, 0x0080}, +{0x001BFC, 0x0020}, +{0x001C00, 0x0004}, +{0x001C24, 0x0010}, +{0x001C38, 0x0080}, +{0x001C3B, 0x0020}, +{0x001C40, 0x0002}, +{0x001C4A, 0x0080}, +{0x001C4D, 0x0004}, +{0x001C50, 0x0002}, +{0x001C5A, 0x0004}, +{0x001C7E, 0x0020}, +{0x001C80, 0x0004}, +{0x001C89, 0x0080}, +{0x001C90, 0x0004}, +{0x001CBB, 0x0080}, +{0x001CBD, 0x0004}, +{0x001CC0, 0x0020}, +{0x001CC8, 0x0080}, +{0x001CD0, 0x0010}, +{0x001CD3, 0x0020}, +{0x001CD4, 0x0010}, +{0x001CE9, 0x0004}, +{0x001CED, 0x0010}, +{0x001CEE, 0x0004}, +{0x001CF4, 0x0010}, +{0x001CF5, 0x0004}, +{0x001CF7, 0x0010}, +{0x001CFA, 0x0004}, +{0x001CFB, 0x0080}, +{0x001D00, 0x0004}, +{0x001DC0, 0x0010}, +{0x001E00, 0x0004}, +{0x001F16, 0x0080}, +{0x001F18, 0x0004}, +{0x001F1E, 0x0080}, +{0x001F20, 0x0004}, +{0x001F46, 0x0080}, +{0x001F48, 0x0004}, +{0x001F4E, 0x0080}, +{0x001F50, 0x0004}, +{0x001F58, 0x0080}, +{0x001F59, 0x0004}, +{0x001F5A, 0x0080}, +{0x001F5B, 0x0004}, +{0x001F5C, 0x0080}, +{0x001F5D, 0x0004}, +{0x001F5E, 0x0080}, +{0x001F5F, 0x0004}, +{0x001F7E, 0x0080}, +{0x001F80, 0x0004}, +{0x001FB5, 0x0080}, +{0x001FB6, 0x0004}, +{0x001FBD, 0x0040}, +{0x001FBE, 
0x0004}, +{0x001FBF, 0x0040}, +{0x001FC2, 0x0004}, +{0x001FC5, 0x0080}, +{0x001FC6, 0x0004}, +{0x001FCD, 0x0040}, +{0x001FD0, 0x0004}, +{0x001FD4, 0x0080}, +{0x001FD6, 0x0004}, +{0x001FDC, 0x0080}, +{0x001FDD, 0x0040}, +{0x001FE0, 0x0004}, +{0x001FED, 0x0040}, +{0x001FF0, 0x0080}, +{0x001FF2, 0x0004}, +{0x001FF5, 0x0080}, +{0x001FF6, 0x0004}, +{0x001FFD, 0x0040}, +{0x001FFF, 0x0080}, +{0x002000, 0x0008}, +{0x00200B, 0x0080}, +{0x002010, 0x0020}, +{0x002028, 0x0008}, +{0x00202A, 0x0080}, +{0x00202F, 0x0008}, +{0x002030, 0x0020}, +{0x002044, 0x0040}, +{0x002045, 0x0020}, +{0x002052, 0x0040}, +{0x002053, 0x0020}, +{0x00205F, 0x0008}, +{0x002060, 0x0080}, +{0x002070, 0x0002}, +{0x002071, 0x0004}, +{0x002072, 0x0080}, +{0x002074, 0x0002}, +{0x00207A, 0x0040}, +{0x00207D, 0x0020}, +{0x00207F, 0x0004}, +{0x002080, 0x0002}, +{0x00208A, 0x0040}, +{0x00208D, 0x0020}, +{0x00208F, 0x0080}, +{0x002090, 0x0004}, +{0x00209D, 0x0080}, +{0x0020A0, 0x0040}, +{0x0020C1, 0x0080}, +{0x0020D0, 0x0010}, +{0x0020F1, 0x0080}, +{0x002100, 0x0040}, +{0x002102, 0x0004}, +{0x002103, 0x0040}, +{0x002107, 0x0004}, +{0x002108, 0x0040}, +{0x00210A, 0x0004}, +{0x002114, 0x0040}, +{0x002115, 0x0004}, +{0x002116, 0x0040}, +{0x002119, 0x0004}, +{0x00211E, 0x0040}, +{0x002124, 0x0004}, +{0x002125, 0x0040}, +{0x002126, 0x0004}, +{0x002127, 0x0040}, +{0x002128, 0x0004}, +{0x002129, 0x0040}, +{0x00212A, 0x0004}, +{0x00212E, 0x0040}, +{0x00212F, 0x0004}, +{0x00213A, 0x0040}, +{0x00213C, 0x0004}, +{0x002140, 0x0040}, +{0x002145, 0x0004}, +{0x00214A, 0x0040}, +{0x00214E, 0x0004}, +{0x00214F, 0x0040}, +{0x002150, 0x0002}, +{0x002183, 0x0004}, +{0x002185, 0x0002}, +{0x00218A, 0x0040}, +{0x00218C, 0x0080}, +{0x002190, 0x0040}, +{0x002308, 0x0020}, +{0x00230C, 0x0040}, +{0x002329, 0x0020}, +{0x00232B, 0x0040}, +{0x002427, 0x0080}, +{0x002440, 0x0040}, +{0x00244B, 0x0080}, +{0x002460, 0x0002}, +{0x00249C, 0x0040}, +{0x0024EA, 0x0002}, +{0x002500, 0x0040}, +{0x002768, 0x0020}, +{0x002776, 0x0002}, +{0x002794, 
0x0040}, +{0x0027C5, 0x0020}, +{0x0027C7, 0x0040}, +{0x0027E6, 0x0020}, +{0x0027F0, 0x0040}, +{0x002983, 0x0020}, +{0x002999, 0x0040}, +{0x0029D8, 0x0020}, +{0x0029DC, 0x0040}, +{0x0029FC, 0x0020}, +{0x0029FE, 0x0040}, +{0x002B74, 0x0080}, +{0x002B76, 0x0040}, +{0x002B96, 0x0080}, +{0x002B97, 0x0040}, +{0x002C00, 0x0004}, +{0x002CE5, 0x0040}, +{0x002CEB, 0x0004}, +{0x002CEF, 0x0010}, +{0x002CF2, 0x0004}, +{0x002CF4, 0x0080}, +{0x002CF9, 0x0020}, +{0x002CFD, 0x0002}, +{0x002CFE, 0x0020}, +{0x002D00, 0x0004}, +{0x002D26, 0x0080}, +{0x002D27, 0x0004}, +{0x002D28, 0x0080}, +{0x002D2D, 0x0004}, +{0x002D2E, 0x0080}, +{0x002D30, 0x0004}, +{0x002D68, 0x0080}, +{0x002D6F, 0x0004}, +{0x002D70, 0x0020}, +{0x002D71, 0x0080}, +{0x002D7F, 0x0010}, +{0x002D80, 0x0004}, +{0x002D97, 0x0080}, +{0x002DA0, 0x0004}, +{0x002DA7, 0x0080}, +{0x002DA8, 0x0004}, +{0x002DAF, 0x0080}, +{0x002DB0, 0x0004}, +{0x002DB7, 0x0080}, +{0x002DB8, 0x0004}, +{0x002DBF, 0x0080}, +{0x002DC0, 0x0004}, +{0x002DC7, 0x0080}, +{0x002DC8, 0x0004}, +{0x002DCF, 0x0080}, +{0x002DD0, 0x0004}, +{0x002DD7, 0x0080}, +{0x002DD8, 0x0004}, +{0x002DDF, 0x0080}, +{0x002DE0, 0x0010}, +{0x002E00, 0x0020}, +{0x002E2F, 0x0004}, +{0x002E30, 0x0020}, +{0x002E50, 0x0040}, +{0x002E52, 0x0020}, +{0x002E5E, 0x0080}, +{0x002E80, 0x0040}, +{0x002E9A, 0x0080}, +{0x002E9B, 0x0040}, +{0x002EF4, 0x0080}, +{0x002F00, 0x0040}, +{0x002FD6, 0x0080}, +{0x002FF0, 0x0040}, +{0x003000, 0x0008}, +{0x003001, 0x0020}, +{0x003004, 0x0040}, +{0x003005, 0x0004}, +{0x003007, 0x0002}, +{0x003008, 0x0020}, +{0x003012, 0x0040}, +{0x003014, 0x0020}, +{0x003020, 0x0040}, +{0x003021, 0x0002}, +{0x00302A, 0x0010}, +{0x003030, 0x0020}, +{0x003031, 0x0004}, +{0x003036, 0x0040}, +{0x003038, 0x0002}, +{0x00303B, 0x0004}, +{0x00303D, 0x0020}, +{0x00303E, 0x0040}, +{0x003040, 0x0080}, +{0x003041, 0x0004}, +{0x003097, 0x0080}, +{0x003099, 0x0010}, +{0x00309B, 0x0040}, +{0x00309D, 0x0004}, +{0x0030A0, 0x0020}, +{0x0030A1, 0x0004}, +{0x0030FB, 0x0020}, +{0x0030FC, 
0x0004}, +{0x003100, 0x0080}, +{0x003105, 0x0004}, +{0x003130, 0x0080}, +{0x003131, 0x0004}, +{0x00318F, 0x0080}, +{0x003190, 0x0040}, +{0x003192, 0x0002}, +{0x003196, 0x0040}, +{0x0031A0, 0x0004}, +{0x0031C0, 0x0040}, +{0x0031E4, 0x0080}, +{0x0031EF, 0x0040}, +{0x0031F0, 0x0004}, +{0x003200, 0x0040}, +{0x00321F, 0x0080}, +{0x003220, 0x0002}, +{0x00322A, 0x0040}, +{0x003248, 0x0002}, +{0x003250, 0x0040}, +{0x003251, 0x0002}, +{0x003260, 0x0040}, +{0x003280, 0x0002}, +{0x00328A, 0x0040}, +{0x0032B1, 0x0002}, +{0x0032C0, 0x0040}, +{0x003400, 0x0004}, +{0x004DC0, 0x0040}, +{0x004E00, 0x0004}, +{0x00A48D, 0x0080}, +{0x00A490, 0x0040}, +{0x00A4C7, 0x0080}, +{0x00A4D0, 0x0004}, +{0x00A4FE, 0x0020}, +{0x00A500, 0x0004}, +{0x00A60D, 0x0020}, +{0x00A610, 0x0004}, +{0x00A620, 0x0002}, +{0x00A62A, 0x0004}, +{0x00A62C, 0x0080}, +{0x00A640, 0x0004}, +{0x00A66F, 0x0010}, +{0x00A673, 0x0020}, +{0x00A674, 0x0010}, +{0x00A67E, 0x0020}, +{0x00A67F, 0x0004}, +{0x00A69E, 0x0010}, +{0x00A6A0, 0x0004}, +{0x00A6E6, 0x0002}, +{0x00A6F0, 0x0010}, +{0x00A6F2, 0x0020}, +{0x00A6F8, 0x0080}, +{0x00A700, 0x0040}, +{0x00A717, 0x0004}, +{0x00A720, 0x0040}, +{0x00A722, 0x0004}, +{0x00A789, 0x0040}, +{0x00A78B, 0x0004}, +{0x00A7CB, 0x0080}, +{0x00A7D0, 0x0004}, +{0x00A7D2, 0x0080}, +{0x00A7D3, 0x0004}, +{0x00A7D4, 0x0080}, +{0x00A7D5, 0x0004}, +{0x00A7DA, 0x0080}, +{0x00A7F2, 0x0004}, +{0x00A802, 0x0010}, +{0x00A803, 0x0004}, +{0x00A806, 0x0010}, +{0x00A807, 0x0004}, +{0x00A80B, 0x0010}, +{0x00A80C, 0x0004}, +{0x00A823, 0x0010}, +{0x00A828, 0x0040}, +{0x00A82C, 0x0010}, +{0x00A82D, 0x0080}, +{0x00A830, 0x0002}, +{0x00A836, 0x0040}, +{0x00A83A, 0x0080}, +{0x00A840, 0x0004}, +{0x00A874, 0x0020}, +{0x00A878, 0x0080}, +{0x00A880, 0x0010}, +{0x00A882, 0x0004}, +{0x00A8B4, 0x0010}, +{0x00A8C6, 0x0080}, +{0x00A8CE, 0x0020}, +{0x00A8D0, 0x0002}, +{0x00A8DA, 0x0080}, +{0x00A8E0, 0x0010}, +{0x00A8F2, 0x0004}, +{0x00A8F8, 0x0020}, +{0x00A8FB, 0x0004}, +{0x00A8FC, 0x0020}, +{0x00A8FD, 0x0004}, +{0x00A8FF, 
0x0010}, +{0x00A900, 0x0002}, +{0x00A90A, 0x0004}, +{0x00A926, 0x0010}, +{0x00A92E, 0x0020}, +{0x00A930, 0x0004}, +{0x00A947, 0x0010}, +{0x00A954, 0x0080}, +{0x00A95F, 0x0020}, +{0x00A960, 0x0004}, +{0x00A97D, 0x0080}, +{0x00A980, 0x0010}, +{0x00A984, 0x0004}, +{0x00A9B3, 0x0010}, +{0x00A9C1, 0x0020}, +{0x00A9CE, 0x0080}, +{0x00A9CF, 0x0004}, +{0x00A9D0, 0x0002}, +{0x00A9DA, 0x0080}, +{0x00A9DE, 0x0020}, +{0x00A9E0, 0x0004}, +{0x00A9E5, 0x0010}, +{0x00A9E6, 0x0004}, +{0x00A9F0, 0x0002}, +{0x00A9FA, 0x0004}, +{0x00A9FF, 0x0080}, +{0x00AA00, 0x0004}, +{0x00AA29, 0x0010}, +{0x00AA37, 0x0080}, +{0x00AA40, 0x0004}, +{0x00AA43, 0x0010}, +{0x00AA44, 0x0004}, +{0x00AA4C, 0x0010}, +{0x00AA4E, 0x0080}, +{0x00AA50, 0x0002}, +{0x00AA5A, 0x0080}, +{0x00AA5C, 0x0020}, +{0x00AA60, 0x0004}, +{0x00AA77, 0x0040}, +{0x00AA7A, 0x0004}, +{0x00AA7B, 0x0010}, +{0x00AA7E, 0x0004}, +{0x00AAB0, 0x0010}, +{0x00AAB1, 0x0004}, +{0x00AAB2, 0x0010}, +{0x00AAB5, 0x0004}, +{0x00AAB7, 0x0010}, +{0x00AAB9, 0x0004}, +{0x00AABE, 0x0010}, +{0x00AAC0, 0x0004}, +{0x00AAC1, 0x0010}, +{0x00AAC2, 0x0004}, +{0x00AAC3, 0x0080}, +{0x00AADB, 0x0004}, +{0x00AADE, 0x0020}, +{0x00AAE0, 0x0004}, +{0x00AAEB, 0x0010}, +{0x00AAF0, 0x0020}, +{0x00AAF2, 0x0004}, +{0x00AAF5, 0x0010}, +{0x00AAF7, 0x0080}, +{0x00AB01, 0x0004}, +{0x00AB07, 0x0080}, +{0x00AB09, 0x0004}, +{0x00AB0F, 0x0080}, +{0x00AB11, 0x0004}, +{0x00AB17, 0x0080}, +{0x00AB20, 0x0004}, +{0x00AB27, 0x0080}, +{0x00AB28, 0x0004}, +{0x00AB2F, 0x0080}, +{0x00AB30, 0x0004}, +{0x00AB5B, 0x0040}, +{0x00AB5C, 0x0004}, +{0x00AB6A, 0x0040}, +{0x00AB6C, 0x0080}, +{0x00AB70, 0x0004}, +{0x00ABE3, 0x0010}, +{0x00ABEB, 0x0020}, +{0x00ABEC, 0x0010}, +{0x00ABEE, 0x0080}, +{0x00ABF0, 0x0002}, +{0x00ABFA, 0x0080}, +{0x00AC00, 0x0004}, +{0x00D7A4, 0x0080}, +{0x00D7B0, 0x0004}, +{0x00D7C7, 0x0080}, +{0x00D7CB, 0x0004}, +{0x00D7FC, 0x0080}, +{0x00F900, 0x0004}, +{0x00FA6E, 0x0080}, +{0x00FA70, 0x0004}, +{0x00FADA, 0x0080}, +{0x00FB00, 0x0004}, +{0x00FB07, 0x0080}, +{0x00FB13, 
0x0004}, +{0x00FB18, 0x0080}, +{0x00FB1D, 0x0004}, +{0x00FB1E, 0x0010}, +{0x00FB1F, 0x0004}, +{0x00FB29, 0x0040}, +{0x00FB2A, 0x0004}, +{0x00FB37, 0x0080}, +{0x00FB38, 0x0004}, +{0x00FB3D, 0x0080}, +{0x00FB3E, 0x0004}, +{0x00FB3F, 0x0080}, +{0x00FB40, 0x0004}, +{0x00FB42, 0x0080}, +{0x00FB43, 0x0004}, +{0x00FB45, 0x0080}, +{0x00FB46, 0x0004}, +{0x00FBB2, 0x0040}, +{0x00FBC3, 0x0080}, +{0x00FBD3, 0x0004}, +{0x00FD3E, 0x0020}, +{0x00FD40, 0x0040}, +{0x00FD50, 0x0004}, +{0x00FD90, 0x0080}, +{0x00FD92, 0x0004}, +{0x00FDC8, 0x0080}, +{0x00FDCF, 0x0040}, +{0x00FDD0, 0x0080}, +{0x00FDF0, 0x0004}, +{0x00FDFC, 0x0040}, +{0x00FE00, 0x0010}, +{0x00FE10, 0x0020}, +{0x00FE1A, 0x0080}, +{0x00FE20, 0x0010}, +{0x00FE30, 0x0020}, +{0x00FE53, 0x0080}, +{0x00FE54, 0x0020}, +{0x00FE62, 0x0040}, +{0x00FE63, 0x0020}, +{0x00FE64, 0x0040}, +{0x00FE67, 0x0080}, +{0x00FE68, 0x0020}, +{0x00FE69, 0x0040}, +{0x00FE6A, 0x0020}, +{0x00FE6C, 0x0080}, +{0x00FE70, 0x0004}, +{0x00FE75, 0x0080}, +{0x00FE76, 0x0004}, +{0x00FEFD, 0x0080}, +{0x00FF01, 0x0020}, +{0x00FF04, 0x0040}, +{0x00FF05, 0x0020}, +{0x00FF0B, 0x0040}, +{0x00FF0C, 0x0020}, +{0x00FF10, 0x0002}, +{0x00FF1A, 0x0020}, +{0x00FF1C, 0x0040}, +{0x00FF1F, 0x0020}, +{0x00FF21, 0x0004}, +{0x00FF3B, 0x0020}, +{0x00FF3E, 0x0040}, +{0x00FF3F, 0x0020}, +{0x00FF40, 0x0040}, +{0x00FF41, 0x0004}, +{0x00FF5B, 0x0020}, +{0x00FF5C, 0x0040}, +{0x00FF5D, 0x0020}, +{0x00FF5E, 0x0040}, +{0x00FF5F, 0x0020}, +{0x00FF66, 0x0004}, +{0x00FFBF, 0x0080}, +{0x00FFC2, 0x0004}, +{0x00FFC8, 0x0080}, +{0x00FFCA, 0x0004}, +{0x00FFD0, 0x0080}, +{0x00FFD2, 0x0004}, +{0x00FFD8, 0x0080}, +{0x00FFDA, 0x0004}, +{0x00FFDD, 0x0080}, +{0x00FFE0, 0x0040}, +{0x00FFE7, 0x0080}, +{0x00FFE8, 0x0040}, +{0x00FFEF, 0x0080}, +{0x00FFFC, 0x0040}, +{0x00FFFE, 0x0080}, +{0x010000, 0x0004}, +{0x01000C, 0x0080}, +{0x01000D, 0x0004}, +{0x010027, 0x0080}, +{0x010028, 0x0004}, +{0x01003B, 0x0080}, +{0x01003C, 0x0004}, +{0x01003E, 0x0080}, +{0x01003F, 0x0004}, +{0x01004E, 0x0080}, +{0x010050, 
0x0004}, +{0x01005E, 0x0080}, +{0x010080, 0x0004}, +{0x0100FB, 0x0080}, +{0x010100, 0x0020}, +{0x010103, 0x0080}, +{0x010107, 0x0002}, +{0x010134, 0x0080}, +{0x010137, 0x0040}, +{0x010140, 0x0002}, +{0x010179, 0x0040}, +{0x01018A, 0x0002}, +{0x01018C, 0x0040}, +{0x01018F, 0x0080}, +{0x010190, 0x0040}, +{0x01019D, 0x0080}, +{0x0101A0, 0x0040}, +{0x0101A1, 0x0080}, +{0x0101D0, 0x0040}, +{0x0101FD, 0x0010}, +{0x0101FE, 0x0080}, +{0x010280, 0x0004}, +{0x01029D, 0x0080}, +{0x0102A0, 0x0004}, +{0x0102D1, 0x0080}, +{0x0102E0, 0x0010}, +{0x0102E1, 0x0002}, +{0x0102FC, 0x0080}, +{0x010300, 0x0004}, +{0x010320, 0x0002}, +{0x010324, 0x0080}, +{0x01032D, 0x0004}, +{0x010341, 0x0002}, +{0x010342, 0x0004}, +{0x01034A, 0x0002}, +{0x01034B, 0x0080}, +{0x010350, 0x0004}, +{0x010376, 0x0010}, +{0x01037B, 0x0080}, +{0x010380, 0x0004}, +{0x01039E, 0x0080}, +{0x01039F, 0x0020}, +{0x0103A0, 0x0004}, +{0x0103C4, 0x0080}, +{0x0103C8, 0x0004}, +{0x0103D0, 0x0020}, +{0x0103D1, 0x0002}, +{0x0103D6, 0x0080}, +{0x010400, 0x0004}, +{0x01049E, 0x0080}, +{0x0104A0, 0x0002}, +{0x0104AA, 0x0080}, +{0x0104B0, 0x0004}, +{0x0104D4, 0x0080}, +{0x0104D8, 0x0004}, +{0x0104FC, 0x0080}, +{0x010500, 0x0004}, +{0x010528, 0x0080}, +{0x010530, 0x0004}, +{0x010564, 0x0080}, +{0x01056F, 0x0020}, +{0x010570, 0x0004}, +{0x01057B, 0x0080}, +{0x01057C, 0x0004}, +{0x01058B, 0x0080}, +{0x01058C, 0x0004}, +{0x010593, 0x0080}, +{0x010594, 0x0004}, +{0x010596, 0x0080}, +{0x010597, 0x0004}, +{0x0105A2, 0x0080}, +{0x0105A3, 0x0004}, +{0x0105B2, 0x0080}, +{0x0105B3, 0x0004}, +{0x0105BA, 0x0080}, +{0x0105BB, 0x0004}, +{0x0105BD, 0x0080}, +{0x010600, 0x0004}, +{0x010737, 0x0080}, +{0x010740, 0x0004}, +{0x010756, 0x0080}, +{0x010760, 0x0004}, +{0x010768, 0x0080}, +{0x010780, 0x0004}, +{0x010786, 0x0080}, +{0x010787, 0x0004}, +{0x0107B1, 0x0080}, +{0x0107B2, 0x0004}, +{0x0107BB, 0x0080}, +{0x010800, 0x0004}, +{0x010806, 0x0080}, +{0x010808, 0x0004}, +{0x010809, 0x0080}, +{0x01080A, 0x0004}, +{0x010836, 0x0080}, +{0x010837, 
0x0004}, +{0x010839, 0x0080}, +{0x01083C, 0x0004}, +{0x01083D, 0x0080}, +{0x01083F, 0x0004}, +{0x010856, 0x0080}, +{0x010857, 0x0020}, +{0x010858, 0x0002}, +{0x010860, 0x0004}, +{0x010877, 0x0040}, +{0x010879, 0x0002}, +{0x010880, 0x0004}, +{0x01089F, 0x0080}, +{0x0108A7, 0x0002}, +{0x0108B0, 0x0080}, +{0x0108E0, 0x0004}, +{0x0108F3, 0x0080}, +{0x0108F4, 0x0004}, +{0x0108F6, 0x0080}, +{0x0108FB, 0x0002}, +{0x010900, 0x0004}, +{0x010916, 0x0002}, +{0x01091C, 0x0080}, +{0x01091F, 0x0020}, +{0x010920, 0x0004}, +{0x01093A, 0x0080}, +{0x01093F, 0x0020}, +{0x010940, 0x0080}, +{0x010980, 0x0004}, +{0x0109B8, 0x0080}, +{0x0109BC, 0x0002}, +{0x0109BE, 0x0004}, +{0x0109C0, 0x0002}, +{0x0109D0, 0x0080}, +{0x0109D2, 0x0002}, +{0x010A00, 0x0004}, +{0x010A01, 0x0010}, +{0x010A04, 0x0080}, +{0x010A05, 0x0010}, +{0x010A07, 0x0080}, +{0x010A0C, 0x0010}, +{0x010A10, 0x0004}, +{0x010A14, 0x0080}, +{0x010A15, 0x0004}, +{0x010A18, 0x0080}, +{0x010A19, 0x0004}, +{0x010A36, 0x0080}, +{0x010A38, 0x0010}, +{0x010A3B, 0x0080}, +{0x010A3F, 0x0010}, +{0x010A40, 0x0002}, +{0x010A49, 0x0080}, +{0x010A50, 0x0020}, +{0x010A59, 0x0080}, +{0x010A60, 0x0004}, +{0x010A7D, 0x0002}, +{0x010A7F, 0x0020}, +{0x010A80, 0x0004}, +{0x010A9D, 0x0002}, +{0x010AA0, 0x0080}, +{0x010AC0, 0x0004}, +{0x010AC8, 0x0040}, +{0x010AC9, 0x0004}, +{0x010AE5, 0x0010}, +{0x010AE7, 0x0080}, +{0x010AEB, 0x0002}, +{0x010AF0, 0x0020}, +{0x010AF7, 0x0080}, +{0x010B00, 0x0004}, +{0x010B36, 0x0080}, +{0x010B39, 0x0020}, +{0x010B40, 0x0004}, +{0x010B56, 0x0080}, +{0x010B58, 0x0002}, +{0x010B60, 0x0004}, +{0x010B73, 0x0080}, +{0x010B78, 0x0002}, +{0x010B80, 0x0004}, +{0x010B92, 0x0080}, +{0x010B99, 0x0020}, +{0x010B9D, 0x0080}, +{0x010BA9, 0x0002}, +{0x010BB0, 0x0080}, +{0x010C00, 0x0004}, +{0x010C49, 0x0080}, +{0x010C80, 0x0004}, +{0x010CB3, 0x0080}, +{0x010CC0, 0x0004}, +{0x010CF3, 0x0080}, +{0x010CFA, 0x0002}, +{0x010D00, 0x0004}, +{0x010D24, 0x0010}, +{0x010D28, 0x0080}, +{0x010D30, 0x0002}, +{0x010D3A, 0x0080}, +{0x010E60, 
0x0002}, +{0x010E7F, 0x0080}, +{0x010E80, 0x0004}, +{0x010EAA, 0x0080}, +{0x010EAB, 0x0010}, +{0x010EAD, 0x0020}, +{0x010EAE, 0x0080}, +{0x010EB0, 0x0004}, +{0x010EB2, 0x0080}, +{0x010EFD, 0x0010}, +{0x010F00, 0x0004}, +{0x010F1D, 0x0002}, +{0x010F27, 0x0004}, +{0x010F28, 0x0080}, +{0x010F30, 0x0004}, +{0x010F46, 0x0010}, +{0x010F51, 0x0002}, +{0x010F55, 0x0020}, +{0x010F5A, 0x0080}, +{0x010F70, 0x0004}, +{0x010F82, 0x0010}, +{0x010F86, 0x0020}, +{0x010F8A, 0x0080}, +{0x010FB0, 0x0004}, +{0x010FC5, 0x0002}, +{0x010FCC, 0x0080}, +{0x010FE0, 0x0004}, +{0x010FF7, 0x0080}, +{0x011000, 0x0010}, +{0x011003, 0x0004}, +{0x011038, 0x0010}, +{0x011047, 0x0020}, +{0x01104E, 0x0080}, +{0x011052, 0x0002}, +{0x011070, 0x0010}, +{0x011071, 0x0004}, +{0x011073, 0x0010}, +{0x011075, 0x0004}, +{0x011076, 0x0080}, +{0x01107F, 0x0010}, +{0x011083, 0x0004}, +{0x0110B0, 0x0010}, +{0x0110BB, 0x0020}, +{0x0110BD, 0x0080}, +{0x0110BE, 0x0020}, +{0x0110C2, 0x0010}, +{0x0110C3, 0x0080}, +{0x0110D0, 0x0004}, +{0x0110E9, 0x0080}, +{0x0110F0, 0x0002}, +{0x0110FA, 0x0080}, +{0x011100, 0x0010}, +{0x011103, 0x0004}, +{0x011127, 0x0010}, +{0x011135, 0x0080}, +{0x011136, 0x0002}, +{0x011140, 0x0020}, +{0x011144, 0x0004}, +{0x011145, 0x0010}, +{0x011147, 0x0004}, +{0x011148, 0x0080}, +{0x011150, 0x0004}, +{0x011173, 0x0010}, +{0x011174, 0x0020}, +{0x011176, 0x0004}, +{0x011177, 0x0080}, +{0x011180, 0x0010}, +{0x011183, 0x0004}, +{0x0111B3, 0x0010}, +{0x0111C1, 0x0004}, +{0x0111C5, 0x0020}, +{0x0111C9, 0x0010}, +{0x0111CD, 0x0020}, +{0x0111CE, 0x0010}, +{0x0111D0, 0x0002}, +{0x0111DA, 0x0004}, +{0x0111DB, 0x0020}, +{0x0111DC, 0x0004}, +{0x0111DD, 0x0020}, +{0x0111E0, 0x0080}, +{0x0111E1, 0x0002}, +{0x0111F5, 0x0080}, +{0x011200, 0x0004}, +{0x011212, 0x0080}, +{0x011213, 0x0004}, +{0x01122C, 0x0010}, +{0x011238, 0x0020}, +{0x01123E, 0x0010}, +{0x01123F, 0x0004}, +{0x011241, 0x0010}, +{0x011242, 0x0080}, +{0x011280, 0x0004}, +{0x011287, 0x0080}, +{0x011288, 0x0004}, +{0x011289, 0x0080}, +{0x01128A, 
0x0004}, +{0x01128E, 0x0080}, +{0x01128F, 0x0004}, +{0x01129E, 0x0080}, +{0x01129F, 0x0004}, +{0x0112A9, 0x0020}, +{0x0112AA, 0x0080}, +{0x0112B0, 0x0004}, +{0x0112DF, 0x0010}, +{0x0112EB, 0x0080}, +{0x0112F0, 0x0002}, +{0x0112FA, 0x0080}, +{0x011300, 0x0010}, +{0x011304, 0x0080}, +{0x011305, 0x0004}, +{0x01130D, 0x0080}, +{0x01130F, 0x0004}, +{0x011311, 0x0080}, +{0x011313, 0x0004}, +{0x011329, 0x0080}, +{0x01132A, 0x0004}, +{0x011331, 0x0080}, +{0x011332, 0x0004}, +{0x011334, 0x0080}, +{0x011335, 0x0004}, +{0x01133A, 0x0080}, +{0x01133B, 0x0010}, +{0x01133D, 0x0004}, +{0x01133E, 0x0010}, +{0x011345, 0x0080}, +{0x011347, 0x0010}, +{0x011349, 0x0080}, +{0x01134B, 0x0010}, +{0x01134E, 0x0080}, +{0x011350, 0x0004}, +{0x011351, 0x0080}, +{0x011357, 0x0010}, +{0x011358, 0x0080}, +{0x01135D, 0x0004}, +{0x011362, 0x0010}, +{0x011364, 0x0080}, +{0x011366, 0x0010}, +{0x01136D, 0x0080}, +{0x011370, 0x0010}, +{0x011375, 0x0080}, +{0x011400, 0x0004}, +{0x011435, 0x0010}, +{0x011447, 0x0004}, +{0x01144B, 0x0020}, +{0x011450, 0x0002}, +{0x01145A, 0x0020}, +{0x01145C, 0x0080}, +{0x01145D, 0x0020}, +{0x01145E, 0x0010}, +{0x01145F, 0x0004}, +{0x011462, 0x0080}, +{0x011480, 0x0004}, +{0x0114B0, 0x0010}, +{0x0114C4, 0x0004}, +{0x0114C6, 0x0020}, +{0x0114C7, 0x0004}, +{0x0114C8, 0x0080}, +{0x0114D0, 0x0002}, +{0x0114DA, 0x0080}, +{0x011580, 0x0004}, +{0x0115AF, 0x0010}, +{0x0115B6, 0x0080}, +{0x0115B8, 0x0010}, +{0x0115C1, 0x0020}, +{0x0115D8, 0x0004}, +{0x0115DC, 0x0010}, +{0x0115DE, 0x0080}, +{0x011600, 0x0004}, +{0x011630, 0x0010}, +{0x011641, 0x0020}, +{0x011644, 0x0004}, +{0x011645, 0x0080}, +{0x011650, 0x0002}, +{0x01165A, 0x0080}, +{0x011660, 0x0020}, +{0x01166D, 0x0080}, +{0x011680, 0x0004}, +{0x0116AB, 0x0010}, +{0x0116B8, 0x0004}, +{0x0116B9, 0x0020}, +{0x0116BA, 0x0080}, +{0x0116C0, 0x0002}, +{0x0116CA, 0x0080}, +{0x011700, 0x0004}, +{0x01171B, 0x0080}, +{0x01171D, 0x0010}, +{0x01172C, 0x0080}, +{0x011730, 0x0002}, +{0x01173C, 0x0020}, +{0x01173F, 0x0040}, +{0x011740, 
0x0004}, +{0x011747, 0x0080}, +{0x011800, 0x0004}, +{0x01182C, 0x0010}, +{0x01183B, 0x0020}, +{0x01183C, 0x0080}, +{0x0118A0, 0x0004}, +{0x0118E0, 0x0002}, +{0x0118F3, 0x0080}, +{0x0118FF, 0x0004}, +{0x011907, 0x0080}, +{0x011909, 0x0004}, +{0x01190A, 0x0080}, +{0x01190C, 0x0004}, +{0x011914, 0x0080}, +{0x011915, 0x0004}, +{0x011917, 0x0080}, +{0x011918, 0x0004}, +{0x011930, 0x0010}, +{0x011936, 0x0080}, +{0x011937, 0x0010}, +{0x011939, 0x0080}, +{0x01193B, 0x0010}, +{0x01193F, 0x0004}, +{0x011940, 0x0010}, +{0x011941, 0x0004}, +{0x011942, 0x0010}, +{0x011944, 0x0020}, +{0x011947, 0x0080}, +{0x011950, 0x0002}, +{0x01195A, 0x0080}, +{0x0119A0, 0x0004}, +{0x0119A8, 0x0080}, +{0x0119AA, 0x0004}, +{0x0119D1, 0x0010}, +{0x0119D8, 0x0080}, +{0x0119DA, 0x0010}, +{0x0119E1, 0x0004}, +{0x0119E2, 0x0020}, +{0x0119E3, 0x0004}, +{0x0119E4, 0x0010}, +{0x0119E5, 0x0080}, +{0x011A00, 0x0004}, +{0x011A01, 0x0010}, +{0x011A0B, 0x0004}, +{0x011A33, 0x0010}, +{0x011A3A, 0x0004}, +{0x011A3B, 0x0010}, +{0x011A3F, 0x0020}, +{0x011A47, 0x0010}, +{0x011A48, 0x0080}, +{0x011A50, 0x0004}, +{0x011A51, 0x0010}, +{0x011A5C, 0x0004}, +{0x011A8A, 0x0010}, +{0x011A9A, 0x0020}, +{0x011A9D, 0x0004}, +{0x011A9E, 0x0020}, +{0x011AA3, 0x0080}, +{0x011AB0, 0x0004}, +{0x011AF9, 0x0080}, +{0x011B00, 0x0020}, +{0x011B0A, 0x0080}, +{0x011C00, 0x0004}, +{0x011C09, 0x0080}, +{0x011C0A, 0x0004}, +{0x011C2F, 0x0010}, +{0x011C37, 0x0080}, +{0x011C38, 0x0010}, +{0x011C40, 0x0004}, +{0x011C41, 0x0020}, +{0x011C46, 0x0080}, +{0x011C50, 0x0002}, +{0x011C6D, 0x0080}, +{0x011C70, 0x0020}, +{0x011C72, 0x0004}, +{0x011C90, 0x0080}, +{0x011C92, 0x0010}, +{0x011CA8, 0x0080}, +{0x011CA9, 0x0010}, +{0x011CB7, 0x0080}, +{0x011D00, 0x0004}, +{0x011D07, 0x0080}, +{0x011D08, 0x0004}, +{0x011D0A, 0x0080}, +{0x011D0B, 0x0004}, +{0x011D31, 0x0010}, +{0x011D37, 0x0080}, +{0x011D3A, 0x0010}, +{0x011D3B, 0x0080}, +{0x011D3C, 0x0010}, +{0x011D3E, 0x0080}, +{0x011D3F, 0x0010}, +{0x011D46, 0x0004}, +{0x011D47, 0x0010}, +{0x011D48, 
0x0080}, +{0x011D50, 0x0002}, +{0x011D5A, 0x0080}, +{0x011D60, 0x0004}, +{0x011D66, 0x0080}, +{0x011D67, 0x0004}, +{0x011D69, 0x0080}, +{0x011D6A, 0x0004}, +{0x011D8A, 0x0010}, +{0x011D8F, 0x0080}, +{0x011D90, 0x0010}, +{0x011D92, 0x0080}, +{0x011D93, 0x0010}, +{0x011D98, 0x0004}, +{0x011D99, 0x0080}, +{0x011DA0, 0x0002}, +{0x011DAA, 0x0080}, +{0x011EE0, 0x0004}, +{0x011EF3, 0x0010}, +{0x011EF7, 0x0020}, +{0x011EF9, 0x0080}, +{0x011F00, 0x0010}, +{0x011F02, 0x0004}, +{0x011F03, 0x0010}, +{0x011F04, 0x0004}, +{0x011F11, 0x0080}, +{0x011F12, 0x0004}, +{0x011F34, 0x0010}, +{0x011F3B, 0x0080}, +{0x011F3E, 0x0010}, +{0x011F43, 0x0020}, +{0x011F50, 0x0002}, +{0x011F5A, 0x0080}, +{0x011FB0, 0x0004}, +{0x011FB1, 0x0080}, +{0x011FC0, 0x0002}, +{0x011FD5, 0x0040}, +{0x011FF2, 0x0080}, +{0x011FFF, 0x0020}, +{0x012000, 0x0004}, +{0x01239A, 0x0080}, +{0x012400, 0x0002}, +{0x01246F, 0x0080}, +{0x012470, 0x0020}, +{0x012475, 0x0080}, +{0x012480, 0x0004}, +{0x012544, 0x0080}, +{0x012F90, 0x0004}, +{0x012FF1, 0x0020}, +{0x012FF3, 0x0080}, +{0x013000, 0x0004}, +{0x013430, 0x0080}, +{0x013440, 0x0010}, +{0x013441, 0x0004}, +{0x013447, 0x0010}, +{0x013456, 0x0080}, +{0x014400, 0x0004}, +{0x014647, 0x0080}, +{0x016800, 0x0004}, +{0x016A39, 0x0080}, +{0x016A40, 0x0004}, +{0x016A5F, 0x0080}, +{0x016A60, 0x0002}, +{0x016A6A, 0x0080}, +{0x016A6E, 0x0020}, +{0x016A70, 0x0004}, +{0x016ABF, 0x0080}, +{0x016AC0, 0x0002}, +{0x016ACA, 0x0080}, +{0x016AD0, 0x0004}, +{0x016AEE, 0x0080}, +{0x016AF0, 0x0010}, +{0x016AF5, 0x0020}, +{0x016AF6, 0x0080}, +{0x016B00, 0x0004}, +{0x016B30, 0x0010}, +{0x016B37, 0x0020}, +{0x016B3C, 0x0040}, +{0x016B40, 0x0004}, +{0x016B44, 0x0020}, +{0x016B45, 0x0040}, +{0x016B46, 0x0080}, +{0x016B50, 0x0002}, +{0x016B5A, 0x0080}, +{0x016B5B, 0x0002}, +{0x016B62, 0x0080}, +{0x016B63, 0x0004}, +{0x016B78, 0x0080}, +{0x016B7D, 0x0004}, +{0x016B90, 0x0080}, +{0x016E40, 0x0004}, +{0x016E80, 0x0002}, +{0x016E97, 0x0020}, +{0x016E9B, 0x0080}, +{0x016F00, 0x0004}, +{0x016F4B, 
0x0080}, +{0x016F4F, 0x0010}, +{0x016F50, 0x0004}, +{0x016F51, 0x0010}, +{0x016F88, 0x0080}, +{0x016F8F, 0x0010}, +{0x016F93, 0x0004}, +{0x016FA0, 0x0080}, +{0x016FE0, 0x0004}, +{0x016FE2, 0x0020}, +{0x016FE3, 0x0004}, +{0x016FE4, 0x0010}, +{0x016FE5, 0x0080}, +{0x016FF0, 0x0010}, +{0x016FF2, 0x0080}, +{0x017000, 0x0004}, +{0x0187F8, 0x0080}, +{0x018800, 0x0004}, +{0x018CD6, 0x0080}, +{0x018D00, 0x0004}, +{0x018D09, 0x0080}, +{0x01AFF0, 0x0004}, +{0x01AFF4, 0x0080}, +{0x01AFF5, 0x0004}, +{0x01AFFC, 0x0080}, +{0x01AFFD, 0x0004}, +{0x01AFFF, 0x0080}, +{0x01B000, 0x0004}, +{0x01B123, 0x0080}, +{0x01B132, 0x0004}, +{0x01B133, 0x0080}, +{0x01B150, 0x0004}, +{0x01B153, 0x0080}, +{0x01B155, 0x0004}, +{0x01B156, 0x0080}, +{0x01B164, 0x0004}, +{0x01B168, 0x0080}, +{0x01B170, 0x0004}, +{0x01B2FC, 0x0080}, +{0x01BC00, 0x0004}, +{0x01BC6B, 0x0080}, +{0x01BC70, 0x0004}, +{0x01BC7D, 0x0080}, +{0x01BC80, 0x0004}, +{0x01BC89, 0x0080}, +{0x01BC90, 0x0004}, +{0x01BC9A, 0x0080}, +{0x01BC9C, 0x0040}, +{0x01BC9D, 0x0010}, +{0x01BC9F, 0x0020}, +{0x01BCA0, 0x0080}, +{0x01CF00, 0x0010}, +{0x01CF2E, 0x0080}, +{0x01CF30, 0x0010}, +{0x01CF47, 0x0080}, +{0x01CF50, 0x0040}, +{0x01CFC4, 0x0080}, +{0x01D000, 0x0040}, +{0x01D0F6, 0x0080}, +{0x01D100, 0x0040}, +{0x01D127, 0x0080}, +{0x01D129, 0x0040}, +{0x01D165, 0x0010}, +{0x01D16A, 0x0040}, +{0x01D16D, 0x0010}, +{0x01D173, 0x0080}, +{0x01D17B, 0x0010}, +{0x01D183, 0x0040}, +{0x01D185, 0x0010}, +{0x01D18C, 0x0040}, +{0x01D1AA, 0x0010}, +{0x01D1AE, 0x0040}, +{0x01D1EB, 0x0080}, +{0x01D200, 0x0040}, +{0x01D242, 0x0010}, +{0x01D245, 0x0040}, +{0x01D246, 0x0080}, +{0x01D2C0, 0x0002}, +{0x01D2D4, 0x0080}, +{0x01D2E0, 0x0002}, +{0x01D2F4, 0x0080}, +{0x01D300, 0x0040}, +{0x01D357, 0x0080}, +{0x01D360, 0x0002}, +{0x01D379, 0x0080}, +{0x01D400, 0x0004}, +{0x01D455, 0x0080}, +{0x01D456, 0x0004}, +{0x01D49D, 0x0080}, +{0x01D49E, 0x0004}, +{0x01D4A0, 0x0080}, +{0x01D4A2, 0x0004}, +{0x01D4A3, 0x0080}, +{0x01D4A5, 0x0004}, +{0x01D4A7, 0x0080}, +{0x01D4A9, 
0x0004}, +{0x01D4AD, 0x0080}, +{0x01D4AE, 0x0004}, +{0x01D4BA, 0x0080}, +{0x01D4BB, 0x0004}, +{0x01D4BC, 0x0080}, +{0x01D4BD, 0x0004}, +{0x01D4C4, 0x0080}, +{0x01D4C5, 0x0004}, +{0x01D506, 0x0080}, +{0x01D507, 0x0004}, +{0x01D50B, 0x0080}, +{0x01D50D, 0x0004}, +{0x01D515, 0x0080}, +{0x01D516, 0x0004}, +{0x01D51D, 0x0080}, +{0x01D51E, 0x0004}, +{0x01D53A, 0x0080}, +{0x01D53B, 0x0004}, +{0x01D53F, 0x0080}, +{0x01D540, 0x0004}, +{0x01D545, 0x0080}, +{0x01D546, 0x0004}, +{0x01D547, 0x0080}, +{0x01D54A, 0x0004}, +{0x01D551, 0x0080}, +{0x01D552, 0x0004}, +{0x01D6A6, 0x0080}, +{0x01D6A8, 0x0004}, +{0x01D6C1, 0x0040}, +{0x01D6C2, 0x0004}, +{0x01D6DB, 0x0040}, +{0x01D6DC, 0x0004}, +{0x01D6FB, 0x0040}, +{0x01D6FC, 0x0004}, +{0x01D715, 0x0040}, +{0x01D716, 0x0004}, +{0x01D735, 0x0040}, +{0x01D736, 0x0004}, +{0x01D74F, 0x0040}, +{0x01D750, 0x0004}, +{0x01D76F, 0x0040}, +{0x01D770, 0x0004}, +{0x01D789, 0x0040}, +{0x01D78A, 0x0004}, +{0x01D7A9, 0x0040}, +{0x01D7AA, 0x0004}, +{0x01D7C3, 0x0040}, +{0x01D7C4, 0x0004}, +{0x01D7CC, 0x0080}, +{0x01D7CE, 0x0002}, +{0x01D800, 0x0040}, +{0x01DA00, 0x0010}, +{0x01DA37, 0x0040}, +{0x01DA3B, 0x0010}, +{0x01DA6D, 0x0040}, +{0x01DA75, 0x0010}, +{0x01DA76, 0x0040}, +{0x01DA84, 0x0010}, +{0x01DA85, 0x0040}, +{0x01DA87, 0x0020}, +{0x01DA8C, 0x0080}, +{0x01DA9B, 0x0010}, +{0x01DAA0, 0x0080}, +{0x01DAA1, 0x0010}, +{0x01DAB0, 0x0080}, +{0x01DF00, 0x0004}, +{0x01DF1F, 0x0080}, +{0x01DF25, 0x0004}, +{0x01DF2B, 0x0080}, +{0x01E000, 0x0010}, +{0x01E007, 0x0080}, +{0x01E008, 0x0010}, +{0x01E019, 0x0080}, +{0x01E01B, 0x0010}, +{0x01E022, 0x0080}, +{0x01E023, 0x0010}, +{0x01E025, 0x0080}, +{0x01E026, 0x0010}, +{0x01E02B, 0x0080}, +{0x01E030, 0x0004}, +{0x01E06E, 0x0080}, +{0x01E08F, 0x0010}, +{0x01E090, 0x0080}, +{0x01E100, 0x0004}, +{0x01E12D, 0x0080}, +{0x01E130, 0x0010}, +{0x01E137, 0x0004}, +{0x01E13E, 0x0080}, +{0x01E140, 0x0002}, +{0x01E14A, 0x0080}, +{0x01E14E, 0x0004}, +{0x01E14F, 0x0040}, +{0x01E150, 0x0080}, +{0x01E290, 0x0004}, +{0x01E2AE, 
0x0010}, +{0x01E2AF, 0x0080}, +{0x01E2C0, 0x0004}, +{0x01E2EC, 0x0010}, +{0x01E2F0, 0x0002}, +{0x01E2FA, 0x0080}, +{0x01E2FF, 0x0040}, +{0x01E300, 0x0080}, +{0x01E4D0, 0x0004}, +{0x01E4EC, 0x0010}, +{0x01E4F0, 0x0002}, +{0x01E4FA, 0x0080}, +{0x01E7E0, 0x0004}, +{0x01E7E7, 0x0080}, +{0x01E7E8, 0x0004}, +{0x01E7EC, 0x0080}, +{0x01E7ED, 0x0004}, +{0x01E7EF, 0x0080}, +{0x01E7F0, 0x0004}, +{0x01E7FF, 0x0080}, +{0x01E800, 0x0004}, +{0x01E8C5, 0x0080}, +{0x01E8C7, 0x0002}, +{0x01E8D0, 0x0010}, +{0x01E8D7, 0x0080}, +{0x01E900, 0x0004}, +{0x01E944, 0x0010}, +{0x01E94B, 0x0004}, +{0x01E94C, 0x0080}, +{0x01E950, 0x0002}, +{0x01E95A, 0x0080}, +{0x01E95E, 0x0020}, +{0x01E960, 0x0080}, +{0x01EC71, 0x0002}, +{0x01ECAC, 0x0040}, +{0x01ECAD, 0x0002}, +{0x01ECB0, 0x0040}, +{0x01ECB1, 0x0002}, +{0x01ECB5, 0x0080}, +{0x01ED01, 0x0002}, +{0x01ED2E, 0x0040}, +{0x01ED2F, 0x0002}, +{0x01ED3E, 0x0080}, +{0x01EE00, 0x0004}, +{0x01EE04, 0x0080}, +{0x01EE05, 0x0004}, +{0x01EE20, 0x0080}, +{0x01EE21, 0x0004}, +{0x01EE23, 0x0080}, +{0x01EE24, 0x0004}, +{0x01EE25, 0x0080}, +{0x01EE27, 0x0004}, +{0x01EE28, 0x0080}, +{0x01EE29, 0x0004}, +{0x01EE33, 0x0080}, +{0x01EE34, 0x0004}, +{0x01EE38, 0x0080}, +{0x01EE39, 0x0004}, +{0x01EE3A, 0x0080}, +{0x01EE3B, 0x0004}, +{0x01EE3C, 0x0080}, +{0x01EE42, 0x0004}, +{0x01EE43, 0x0080}, +{0x01EE47, 0x0004}, +{0x01EE48, 0x0080}, +{0x01EE49, 0x0004}, +{0x01EE4A, 0x0080}, +{0x01EE4B, 0x0004}, +{0x01EE4C, 0x0080}, +{0x01EE4D, 0x0004}, +{0x01EE50, 0x0080}, +{0x01EE51, 0x0004}, +{0x01EE53, 0x0080}, +{0x01EE54, 0x0004}, +{0x01EE55, 0x0080}, +{0x01EE57, 0x0004}, +{0x01EE58, 0x0080}, +{0x01EE59, 0x0004}, +{0x01EE5A, 0x0080}, +{0x01EE5B, 0x0004}, +{0x01EE5C, 0x0080}, +{0x01EE5D, 0x0004}, +{0x01EE5E, 0x0080}, +{0x01EE5F, 0x0004}, +{0x01EE60, 0x0080}, +{0x01EE61, 0x0004}, +{0x01EE63, 0x0080}, +{0x01EE64, 0x0004}, +{0x01EE65, 0x0080}, +{0x01EE67, 0x0004}, +{0x01EE6B, 0x0080}, +{0x01EE6C, 0x0004}, +{0x01EE73, 0x0080}, +{0x01EE74, 0x0004}, +{0x01EE78, 0x0080}, +{0x01EE79, 
0x0004}, +{0x01EE7D, 0x0080}, +{0x01EE7E, 0x0004}, +{0x01EE7F, 0x0080}, +{0x01EE80, 0x0004}, +{0x01EE8A, 0x0080}, +{0x01EE8B, 0x0004}, +{0x01EE9C, 0x0080}, +{0x01EEA1, 0x0004}, +{0x01EEA4, 0x0080}, +{0x01EEA5, 0x0004}, +{0x01EEAA, 0x0080}, +{0x01EEAB, 0x0004}, +{0x01EEBC, 0x0080}, +{0x01EEF0, 0x0040}, +{0x01EEF2, 0x0080}, +{0x01F000, 0x0040}, +{0x01F02C, 0x0080}, +{0x01F030, 0x0040}, +{0x01F094, 0x0080}, +{0x01F0A0, 0x0040}, +{0x01F0AF, 0x0080}, +{0x01F0B1, 0x0040}, +{0x01F0C0, 0x0080}, +{0x01F0C1, 0x0040}, +{0x01F0D0, 0x0080}, +{0x01F0D1, 0x0040}, +{0x01F0F6, 0x0080}, +{0x01F100, 0x0002}, +{0x01F10D, 0x0040}, +{0x01F1AE, 0x0080}, +{0x01F1E6, 0x0040}, +{0x01F203, 0x0080}, +{0x01F210, 0x0040}, +{0x01F23C, 0x0080}, +{0x01F240, 0x0040}, +{0x01F249, 0x0080}, +{0x01F250, 0x0040}, +{0x01F252, 0x0080}, +{0x01F260, 0x0040}, +{0x01F266, 0x0080}, +{0x01F300, 0x0040}, +{0x01F6D8, 0x0080}, +{0x01F6DC, 0x0040}, +{0x01F6ED, 0x0080}, +{0x01F6F0, 0x0040}, +{0x01F6FD, 0x0080}, +{0x01F700, 0x0040}, +{0x01F777, 0x0080}, +{0x01F77B, 0x0040}, +{0x01F7DA, 0x0080}, +{0x01F7E0, 0x0040}, +{0x01F7EC, 0x0080}, +{0x01F7F0, 0x0040}, +{0x01F7F1, 0x0080}, +{0x01F800, 0x0040}, +{0x01F80C, 0x0080}, +{0x01F810, 0x0040}, +{0x01F848, 0x0080}, +{0x01F850, 0x0040}, +{0x01F85A, 0x0080}, +{0x01F860, 0x0040}, +{0x01F888, 0x0080}, +{0x01F890, 0x0040}, +{0x01F8AE, 0x0080}, +{0x01F8B0, 0x0040}, +{0x01F8B2, 0x0080}, +{0x01F900, 0x0040}, +{0x01FA54, 0x0080}, +{0x01FA60, 0x0040}, +{0x01FA6E, 0x0080}, +{0x01FA70, 0x0040}, +{0x01FA7D, 0x0080}, +{0x01FA80, 0x0040}, +{0x01FA89, 0x0080}, +{0x01FA90, 0x0040}, +{0x01FABE, 0x0080}, +{0x01FABF, 0x0040}, +{0x01FAC6, 0x0080}, +{0x01FACE, 0x0040}, +{0x01FADC, 0x0080}, +{0x01FAE0, 0x0040}, +{0x01FAE9, 0x0080}, +{0x01FAF0, 0x0040}, +{0x01FAF9, 0x0080}, +{0x01FB00, 0x0040}, +{0x01FB93, 0x0080}, +{0x01FB94, 0x0040}, +{0x01FBCB, 0x0080}, +{0x01FBF0, 0x0002}, +{0x01FBFA, 0x0080}, +{0x020000, 0x0004}, +{0x02A6E0, 0x0080}, +{0x02A700, 0x0004}, +{0x02B73A, 0x0080}, +{0x02B740, 
0x0004}, +{0x02B81E, 0x0080}, +{0x02B820, 0x0004}, +{0x02CEA2, 0x0080}, +{0x02CEB0, 0x0004}, +{0x02EBE1, 0x0080}, +{0x02EBF0, 0x0004}, +{0x02EE5E, 0x0080}, +{0x02F800, 0x0004}, +{0x02FA1E, 0x0080}, +{0x030000, 0x0004}, +{0x03134B, 0x0080}, +{0x031350, 0x0004}, +{0x0323B0, 0x0080}, +{0x0E0100, 0x0010}, +{0x0E01F0, 0x0080}, +{0x110000, 0x0000}, }; -const std::vector> unicode_ranges_punctuation = { -{0x00000021, 0x00000023}, {0x00000025, 0x0000002A}, {0x0000002C, 0x0000002F}, {0x0000003A, 0x0000003B}, -{0x0000003F, 0x00000040}, {0x0000005B, 0x0000005D}, {0x0000005F, 0x0000005F}, {0x0000007B, 0x0000007B}, -{0x0000007D, 0x0000007D}, {0x000000A1, 0x000000A1}, {0x000000A7, 0x000000A7}, {0x000000AB, 0x000000AB}, -{0x000000B6, 0x000000B7}, {0x000000BB, 0x000000BB}, {0x000000BF, 0x000000BF}, {0x0000037E, 0x0000037E}, -{0x00000387, 0x00000387}, {0x0000055A, 0x0000055F}, {0x00000589, 0x0000058A}, {0x000005BE, 0x000005BE}, -{0x000005C0, 0x000005C0}, {0x000005C3, 0x000005C3}, {0x000005C6, 0x000005C6}, {0x000005F3, 0x000005F4}, -{0x00000609, 0x0000060A}, {0x0000060C, 0x0000060D}, {0x0000061B, 0x0000061B}, {0x0000061E, 0x0000061F}, -{0x0000066A, 0x0000066D}, {0x000006D4, 0x000006D4}, {0x00000700, 0x0000070D}, {0x000007F7, 0x000007F9}, -{0x00000830, 0x0000083E}, {0x0000085E, 0x0000085E}, {0x00000964, 0x00000965}, {0x00000970, 0x00000970}, -{0x000009FD, 0x000009FD}, {0x00000A76, 0x00000A76}, {0x00000AF0, 0x00000AF0}, {0x00000C77, 0x00000C77}, -{0x00000C84, 0x00000C84}, {0x00000DF4, 0x00000DF4}, {0x00000E4F, 0x00000E4F}, {0x00000E5A, 0x00000E5B}, -{0x00000F04, 0x00000F12}, {0x00000F14, 0x00000F14}, {0x00000F3A, 0x00000F3D}, {0x00000F85, 0x00000F85}, -{0x00000FD0, 0x00000FD4}, {0x00000FD9, 0x00000FDA}, {0x0000104A, 0x0000104F}, {0x000010FB, 0x000010FB}, -{0x00001360, 0x00001368}, {0x00001400, 0x00001400}, {0x0000166E, 0x0000166E}, {0x0000169B, 0x0000169C}, -{0x000016EB, 0x000016ED}, {0x00001735, 0x00001736}, {0x000017D4, 0x000017D6}, {0x000017D8, 0x000017DA}, -{0x00001800, 
0x0000180A}, {0x00001944, 0x00001945}, {0x00001A1E, 0x00001A1F}, {0x00001AA0, 0x00001AA6}, -{0x00001AA8, 0x00001AAD}, {0x00001B5A, 0x00001B60}, {0x00001BFC, 0x00001BFF}, {0x00001C3B, 0x00001C3F}, -{0x00001C7E, 0x00001C7F}, {0x00001CC0, 0x00001CC7}, {0x00001CD3, 0x00001CD3}, {0x00002010, 0x00002027}, -{0x00002030, 0x00002043}, {0x00002045, 0x00002051}, {0x00002053, 0x0000205E}, {0x0000207D, 0x0000207E}, -{0x0000208D, 0x0000208E}, {0x00002308, 0x0000230B}, {0x00002329, 0x0000232A}, {0x00002768, 0x00002775}, -{0x000027C5, 0x000027C6}, {0x000027E6, 0x000027EF}, {0x00002983, 0x00002998}, {0x000029D8, 0x000029DB}, -{0x000029FC, 0x000029FD}, {0x00002CF9, 0x00002CFC}, {0x00002CFE, 0x00002CFF}, {0x00002D70, 0x00002D70}, -{0x00002E00, 0x00002E2E}, {0x00002E30, 0x00002E4F}, {0x00002E52, 0x00002E52}, {0x00003001, 0x00003003}, -{0x00003008, 0x00003011}, {0x00003014, 0x0000301F}, {0x00003030, 0x00003030}, {0x0000303D, 0x0000303D}, -{0x000030A0, 0x000030A0}, {0x000030FB, 0x000030FB}, {0x0000A4FE, 0x0000A4FF}, {0x0000A60D, 0x0000A60F}, -{0x0000A673, 0x0000A673}, {0x0000A67E, 0x0000A67E}, {0x0000A6F2, 0x0000A6F7}, {0x0000A874, 0x0000A877}, -{0x0000A8CE, 0x0000A8CF}, {0x0000A8F8, 0x0000A8FA}, {0x0000A8FC, 0x0000A8FC}, {0x0000A92E, 0x0000A92F}, -{0x0000A95F, 0x0000A95F}, {0x0000A9C1, 0x0000A9CD}, {0x0000A9DE, 0x0000A9DF}, {0x0000AA5C, 0x0000AA5F}, -{0x0000AADE, 0x0000AADF}, {0x0000AAF0, 0x0000AAF1}, {0x0000ABEB, 0x0000ABEB}, {0x0000FD3E, 0x0000FD3F}, -{0x0000FE10, 0x0000FE19}, {0x0000FE30, 0x0000FE52}, {0x0000FE54, 0x0000FE61}, {0x0000FE63, 0x0000FE63}, -{0x0000FE68, 0x0000FE68}, {0x0000FE6A, 0x0000FE6B}, {0x0000FF01, 0x0000FF03}, {0x0000FF05, 0x0000FF0A}, -{0x0000FF0C, 0x0000FF0F}, {0x0000FF1A, 0x0000FF1B}, {0x0000FF1F, 0x0000FF20}, {0x0000FF3B, 0x0000FF3D}, -{0x0000FF3F, 0x0000FF3F}, {0x0000FF5B, 0x0000FF5B}, {0x0000FF5D, 0x0000FF5D}, {0x0000FF5F, 0x0000FF65}, -{0x00010100, 0x00010102}, {0x0001039F, 0x0001039F}, {0x000103D0, 0x000103D0}, {0x0001056F, 0x0001056F}, -{0x00010857, 
0x00010857}, {0x0001091F, 0x0001091F}, {0x0001093F, 0x0001093F}, {0x00010A50, 0x00010A58}, -{0x00010A7F, 0x00010A7F}, {0x00010AF0, 0x00010AF6}, {0x00010B39, 0x00010B3F}, {0x00010B99, 0x00010B9C}, -{0x00010EAD, 0x00010EAD}, {0x00010F55, 0x00010F59}, {0x00011047, 0x0001104D}, {0x000110BB, 0x000110BC}, -{0x000110BE, 0x000110C1}, {0x00011140, 0x00011143}, {0x00011174, 0x00011175}, {0x000111C5, 0x000111C8}, -{0x000111CD, 0x000111CD}, {0x000111DB, 0x000111DB}, {0x000111DD, 0x000111DF}, {0x00011238, 0x0001123D}, -{0x000112A9, 0x000112A9}, {0x0001144B, 0x0001144F}, {0x0001145A, 0x0001145B}, {0x0001145D, 0x0001145D}, -{0x000114C6, 0x000114C6}, {0x000115C1, 0x000115D7}, {0x00011641, 0x00011643}, {0x00011660, 0x0001166C}, -{0x0001173C, 0x0001173E}, {0x0001183B, 0x0001183B}, {0x00011944, 0x00011946}, {0x000119E2, 0x000119E2}, -{0x00011A3F, 0x00011A46}, {0x00011A9A, 0x00011A9C}, {0x00011A9E, 0x00011AA2}, {0x00011C41, 0x00011C45}, -{0x00011C70, 0x00011C71}, {0x00011EF7, 0x00011EF8}, {0x00011FFF, 0x00011FFF}, {0x00012470, 0x00012474}, -{0x00016A6E, 0x00016A6F}, {0x00016AF5, 0x00016AF5}, {0x00016B37, 0x00016B3B}, {0x00016B44, 0x00016B44}, -{0x00016E97, 0x00016E9A}, {0x00016FE2, 0x00016FE2}, {0x0001BC9F, 0x0001BC9F}, {0x0001DA87, 0x0001DA8B}, -{0x0001E95E, 0x0001E95F}, +const std::unordered_set unicode_set_whitespace = { +0x000009, 0x00000A, 0x00000B, 0x00000C, 0x00000D, 0x000020, 0x000085, 0x0000A0, 0x001680, 0x002000, 0x002001, 0x002002, 0x002003, 0x002004, 0x002005, 0x002006, 0x002007, 0x002008, 0x002009, 0x00200A, 0x002028, 0x002029, 0x00202F, 0x00205F, 0x003000 }; -const std::vector> unicode_ranges_symbol = { -{0x00000024, 0x00000024}, {0x0000002B, 0x0000002B}, {0x0000003C, 0x0000003E}, {0x0000005E, 0x0000005E}, -{0x00000060, 0x00000060}, {0x0000007C, 0x0000007C}, {0x0000007E, 0x0000007E}, {0x000000A2, 0x000000A6}, -{0x000000A8, 0x000000A9}, {0x000000AC, 0x000000AC}, {0x000000AE, 0x000000B1}, {0x000000B4, 0x000000B4}, -{0x000000B8, 0x000000B8}, {0x000000D7, 0x000000D7}, 
{0x000000F7, 0x000000F7}, {0x000002C2, 0x000002C5}, -{0x000002D2, 0x000002DF}, {0x000002E5, 0x000002EB}, {0x000002ED, 0x000002ED}, {0x000002EF, 0x000002FF}, -{0x00000375, 0x00000375}, {0x00000384, 0x00000385}, {0x000003F6, 0x000003F6}, {0x00000482, 0x00000482}, -{0x0000058D, 0x0000058F}, {0x00000606, 0x00000608}, {0x0000060B, 0x0000060B}, {0x0000060E, 0x0000060F}, -{0x000006DE, 0x000006DE}, {0x000006E9, 0x000006E9}, {0x000006FD, 0x000006FE}, {0x000007F6, 0x000007F6}, -{0x000007FE, 0x000007FF}, {0x000009F2, 0x000009F3}, {0x000009FA, 0x000009FB}, {0x00000AF1, 0x00000AF1}, -{0x00000B70, 0x00000B70}, {0x00000BF3, 0x00000BFA}, {0x00000C7F, 0x00000C7F}, {0x00000D4F, 0x00000D4F}, -{0x00000D79, 0x00000D79}, {0x00000E3F, 0x00000E3F}, {0x00000F01, 0x00000F03}, {0x00000F13, 0x00000F13}, -{0x00000F15, 0x00000F17}, {0x00000F1A, 0x00000F1F}, {0x00000F34, 0x00000F34}, {0x00000F36, 0x00000F36}, -{0x00000F38, 0x00000F38}, {0x00000FBE, 0x00000FC5}, {0x00000FC7, 0x00000FCC}, {0x00000FCE, 0x00000FCF}, -{0x00000FD5, 0x00000FD8}, {0x0000109E, 0x0000109F}, {0x00001390, 0x00001399}, {0x0000166D, 0x0000166D}, -{0x000017DB, 0x000017DB}, {0x00001940, 0x00001940}, {0x000019DE, 0x000019FF}, {0x00001B61, 0x00001B6A}, -{0x00001B74, 0x00001B7C}, {0x00001FBD, 0x00001FBD}, {0x00001FBF, 0x00001FC1}, {0x00001FCD, 0x00001FCF}, -{0x00001FDD, 0x00001FDF}, {0x00001FED, 0x00001FEF}, {0x00001FFD, 0x00001FFE}, {0x00002044, 0x00002044}, -{0x00002052, 0x00002052}, {0x0000207A, 0x0000207C}, {0x0000208A, 0x0000208C}, {0x000020A0, 0x000020BF}, -{0x00002100, 0x00002101}, {0x00002103, 0x00002106}, {0x00002108, 0x00002109}, {0x00002114, 0x00002114}, -{0x00002116, 0x00002118}, {0x0000211E, 0x00002123}, {0x00002125, 0x00002125}, {0x00002127, 0x00002127}, -{0x00002129, 0x00002129}, {0x0000212E, 0x0000212E}, {0x0000213A, 0x0000213B}, {0x00002140, 0x00002144}, -{0x0000214A, 0x0000214D}, {0x0000214F, 0x0000214F}, {0x0000218A, 0x0000218B}, {0x00002190, 0x00002307}, -{0x0000230C, 0x00002328}, {0x0000232B, 0x00002426}, 
{0x00002440, 0x0000244A}, {0x0000249C, 0x000024E9}, -{0x00002500, 0x00002767}, {0x00002794, 0x000027C4}, {0x000027C7, 0x000027E5}, {0x000027F0, 0x00002982}, -{0x00002999, 0x000029D7}, {0x000029DC, 0x000029FB}, {0x000029FE, 0x00002B73}, {0x00002B76, 0x00002B95}, -{0x00002B97, 0x00002BFF}, {0x00002CE5, 0x00002CEA}, {0x00002E50, 0x00002E51}, {0x00002E80, 0x00002E99}, -{0x00002E9B, 0x00002EF3}, {0x00002F00, 0x00002FD5}, {0x00002FF0, 0x00002FFB}, {0x00003004, 0x00003004}, -{0x00003012, 0x00003013}, {0x00003020, 0x00003020}, {0x00003036, 0x00003037}, {0x0000303E, 0x0000303F}, -{0x0000309B, 0x0000309C}, {0x00003190, 0x00003191}, {0x00003196, 0x0000319F}, {0x000031C0, 0x000031E3}, -{0x00003200, 0x0000321E}, {0x0000322A, 0x00003247}, {0x00003250, 0x00003250}, {0x00003260, 0x0000327F}, -{0x0000328A, 0x000032B0}, {0x000032C0, 0x000033FF}, {0x00004DC0, 0x00004DFF}, {0x0000A490, 0x0000A4C6}, -{0x0000A700, 0x0000A716}, {0x0000A720, 0x0000A721}, {0x0000A789, 0x0000A78A}, {0x0000A828, 0x0000A82B}, -{0x0000A836, 0x0000A839}, {0x0000AA77, 0x0000AA79}, {0x0000AB5B, 0x0000AB5B}, {0x0000AB6A, 0x0000AB6B}, -{0x0000FB29, 0x0000FB29}, {0x0000FBB2, 0x0000FBC1}, {0x0000FDFC, 0x0000FDFD}, {0x0000FE62, 0x0000FE62}, -{0x0000FE64, 0x0000FE66}, {0x0000FE69, 0x0000FE69}, {0x0000FF04, 0x0000FF04}, {0x0000FF0B, 0x0000FF0B}, -{0x0000FF1C, 0x0000FF1E}, {0x0000FF3E, 0x0000FF3E}, {0x0000FF40, 0x0000FF40}, {0x0000FF5C, 0x0000FF5C}, -{0x0000FF5E, 0x0000FF5E}, {0x0000FFE0, 0x0000FFE6}, {0x0000FFE8, 0x0000FFEE}, {0x0000FFFC, 0x0000FFFD}, -{0x00010137, 0x0001013F}, {0x00010179, 0x00010189}, {0x0001018C, 0x0001018E}, {0x00010190, 0x0001019C}, -{0x000101A0, 0x000101A0}, {0x000101D0, 0x000101FC}, {0x00010877, 0x00010878}, {0x00010AC8, 0x00010AC8}, -{0x0001173F, 0x0001173F}, {0x00011FD5, 0x00011FF1}, {0x00016B3C, 0x00016B3F}, {0x00016B45, 0x00016B45}, -{0x0001BC9C, 0x0001BC9C}, {0x0001D000, 0x0001D0F5}, {0x0001D100, 0x0001D126}, {0x0001D129, 0x0001D164}, -{0x0001D16A, 0x0001D16C}, {0x0001D183, 0x0001D184}, 
{0x0001D18C, 0x0001D1A9}, {0x0001D1AE, 0x0001D1E8}, -{0x0001D200, 0x0001D241}, {0x0001D245, 0x0001D245}, {0x0001D300, 0x0001D356}, {0x0001D6C1, 0x0001D6C1}, -{0x0001D6DB, 0x0001D6DB}, {0x0001D6FB, 0x0001D6FB}, {0x0001D715, 0x0001D715}, {0x0001D735, 0x0001D735}, -{0x0001D74F, 0x0001D74F}, {0x0001D76F, 0x0001D76F}, {0x0001D789, 0x0001D789}, {0x0001D7A9, 0x0001D7A9}, -{0x0001D7C3, 0x0001D7C3}, {0x0001D800, 0x0001D9FF}, {0x0001DA37, 0x0001DA3A}, {0x0001DA6D, 0x0001DA74}, -{0x0001DA76, 0x0001DA83}, {0x0001DA85, 0x0001DA86}, {0x0001E14F, 0x0001E14F}, {0x0001E2FF, 0x0001E2FF}, -{0x0001ECAC, 0x0001ECAC}, {0x0001ECB0, 0x0001ECB0}, {0x0001ED2E, 0x0001ED2E}, {0x0001EEF0, 0x0001EEF1}, -{0x0001F000, 0x0001F02B}, {0x0001F030, 0x0001F093}, {0x0001F0A0, 0x0001F0AE}, {0x0001F0B1, 0x0001F0BF}, -{0x0001F0C1, 0x0001F0CF}, {0x0001F0D1, 0x0001F0F5}, {0x0001F10D, 0x0001F1AD}, {0x0001F1E6, 0x0001F202}, -{0x0001F210, 0x0001F23B}, {0x0001F240, 0x0001F248}, {0x0001F250, 0x0001F251}, {0x0001F260, 0x0001F265}, -{0x0001F300, 0x0001F6D7}, {0x0001F6E0, 0x0001F6EC}, {0x0001F6F0, 0x0001F6FC}, {0x0001F700, 0x0001F773}, -{0x0001F780, 0x0001F7D8}, {0x0001F7E0, 0x0001F7EB}, {0x0001F800, 0x0001F80B}, {0x0001F810, 0x0001F847}, -{0x0001F850, 0x0001F859}, {0x0001F860, 0x0001F887}, {0x0001F890, 0x0001F8AD}, {0x0001F8B0, 0x0001F8B1}, -{0x0001F900, 0x0001F978}, {0x0001F97A, 0x0001F9CB}, {0x0001F9CD, 0x0001FA53}, {0x0001FA60, 0x0001FA6D}, -{0x0001FA70, 0x0001FA74}, {0x0001FA78, 0x0001FA7A}, {0x0001FA80, 0x0001FA86}, {0x0001FA90, 0x0001FAA8}, -{0x0001FAB0, 0x0001FAB6}, {0x0001FAC0, 0x0001FAC2}, {0x0001FAD0, 0x0001FAD6}, {0x0001FB00, 0x0001FB92}, -{0x0001FB94, 0x0001FBCA}, +const std::unordered_map unicode_map_lowercase = { +{0x000041, 0x000061}, +{0x000042, 0x000062}, +{0x000043, 0x000063}, +{0x000044, 0x000064}, +{0x000045, 0x000065}, +{0x000046, 0x000066}, +{0x000047, 0x000067}, +{0x000048, 0x000068}, +{0x000049, 0x000069}, +{0x00004A, 0x00006A}, +{0x00004B, 0x00006B}, +{0x00004C, 0x00006C}, +{0x00004D, 
0x00006D}, +{0x00004E, 0x00006E}, +{0x00004F, 0x00006F}, +{0x000050, 0x000070}, +{0x000051, 0x000071}, +{0x000052, 0x000072}, +{0x000053, 0x000073}, +{0x000054, 0x000074}, +{0x000055, 0x000075}, +{0x000056, 0x000076}, +{0x000057, 0x000077}, +{0x000058, 0x000078}, +{0x000059, 0x000079}, +{0x00005A, 0x00007A}, +{0x0000C0, 0x0000E0}, +{0x0000C1, 0x0000E1}, +{0x0000C2, 0x0000E2}, +{0x0000C3, 0x0000E3}, +{0x0000C4, 0x0000E4}, +{0x0000C5, 0x0000E5}, +{0x0000C6, 0x0000E6}, +{0x0000C7, 0x0000E7}, +{0x0000C8, 0x0000E8}, +{0x0000C9, 0x0000E9}, +{0x0000CA, 0x0000EA}, +{0x0000CB, 0x0000EB}, +{0x0000CC, 0x0000EC}, +{0x0000CD, 0x0000ED}, +{0x0000CE, 0x0000EE}, +{0x0000CF, 0x0000EF}, +{0x0000D0, 0x0000F0}, +{0x0000D1, 0x0000F1}, +{0x0000D2, 0x0000F2}, +{0x0000D3, 0x0000F3}, +{0x0000D4, 0x0000F4}, +{0x0000D5, 0x0000F5}, +{0x0000D6, 0x0000F6}, +{0x0000D8, 0x0000F8}, +{0x0000D9, 0x0000F9}, +{0x0000DA, 0x0000FA}, +{0x0000DB, 0x0000FB}, +{0x0000DC, 0x0000FC}, +{0x0000DD, 0x0000FD}, +{0x0000DE, 0x0000FE}, +{0x000100, 0x000101}, +{0x000102, 0x000103}, +{0x000104, 0x000105}, +{0x000106, 0x000107}, +{0x000108, 0x000109}, +{0x00010A, 0x00010B}, +{0x00010C, 0x00010D}, +{0x00010E, 0x00010F}, +{0x000110, 0x000111}, +{0x000112, 0x000113}, +{0x000114, 0x000115}, +{0x000116, 0x000117}, +{0x000118, 0x000119}, +{0x00011A, 0x00011B}, +{0x00011C, 0x00011D}, +{0x00011E, 0x00011F}, +{0x000120, 0x000121}, +{0x000122, 0x000123}, +{0x000124, 0x000125}, +{0x000126, 0x000127}, +{0x000128, 0x000129}, +{0x00012A, 0x00012B}, +{0x00012C, 0x00012D}, +{0x00012E, 0x00012F}, +{0x000130, 0x000069}, +{0x000132, 0x000133}, +{0x000134, 0x000135}, +{0x000136, 0x000137}, +{0x000139, 0x00013A}, +{0x00013B, 0x00013C}, +{0x00013D, 0x00013E}, +{0x00013F, 0x000140}, +{0x000141, 0x000142}, +{0x000143, 0x000144}, +{0x000145, 0x000146}, +{0x000147, 0x000148}, +{0x00014A, 0x00014B}, +{0x00014C, 0x00014D}, +{0x00014E, 0x00014F}, +{0x000150, 0x000151}, +{0x000152, 0x000153}, +{0x000154, 0x000155}, +{0x000156, 0x000157}, 
+{0x000158, 0x000159}, +{0x00015A, 0x00015B}, +{0x00015C, 0x00015D}, +{0x00015E, 0x00015F}, +{0x000160, 0x000161}, +{0x000162, 0x000163}, +{0x000164, 0x000165}, +{0x000166, 0x000167}, +{0x000168, 0x000169}, +{0x00016A, 0x00016B}, +{0x00016C, 0x00016D}, +{0x00016E, 0x00016F}, +{0x000170, 0x000171}, +{0x000172, 0x000173}, +{0x000174, 0x000175}, +{0x000176, 0x000177}, +{0x000178, 0x0000FF}, +{0x000179, 0x00017A}, +{0x00017B, 0x00017C}, +{0x00017D, 0x00017E}, +{0x000181, 0x000253}, +{0x000182, 0x000183}, +{0x000184, 0x000185}, +{0x000186, 0x000254}, +{0x000187, 0x000188}, +{0x000189, 0x000256}, +{0x00018A, 0x000257}, +{0x00018B, 0x00018C}, +{0x00018E, 0x0001DD}, +{0x00018F, 0x000259}, +{0x000190, 0x00025B}, +{0x000191, 0x000192}, +{0x000193, 0x000260}, +{0x000194, 0x000263}, +{0x000196, 0x000269}, +{0x000197, 0x000268}, +{0x000198, 0x000199}, +{0x00019C, 0x00026F}, +{0x00019D, 0x000272}, +{0x00019F, 0x000275}, +{0x0001A0, 0x0001A1}, +{0x0001A2, 0x0001A3}, +{0x0001A4, 0x0001A5}, +{0x0001A6, 0x000280}, +{0x0001A7, 0x0001A8}, +{0x0001A9, 0x000283}, +{0x0001AC, 0x0001AD}, +{0x0001AE, 0x000288}, +{0x0001AF, 0x0001B0}, +{0x0001B1, 0x00028A}, +{0x0001B2, 0x00028B}, +{0x0001B3, 0x0001B4}, +{0x0001B5, 0x0001B6}, +{0x0001B7, 0x000292}, +{0x0001B8, 0x0001B9}, +{0x0001BC, 0x0001BD}, +{0x0001C4, 0x0001C6}, +{0x0001C5, 0x0001C6}, +{0x0001C7, 0x0001C9}, +{0x0001C8, 0x0001C9}, +{0x0001CA, 0x0001CC}, +{0x0001CB, 0x0001CC}, +{0x0001CD, 0x0001CE}, +{0x0001CF, 0x0001D0}, +{0x0001D1, 0x0001D2}, +{0x0001D3, 0x0001D4}, +{0x0001D5, 0x0001D6}, +{0x0001D7, 0x0001D8}, +{0x0001D9, 0x0001DA}, +{0x0001DB, 0x0001DC}, +{0x0001DE, 0x0001DF}, +{0x0001E0, 0x0001E1}, +{0x0001E2, 0x0001E3}, +{0x0001E4, 0x0001E5}, +{0x0001E6, 0x0001E7}, +{0x0001E8, 0x0001E9}, +{0x0001EA, 0x0001EB}, +{0x0001EC, 0x0001ED}, +{0x0001EE, 0x0001EF}, +{0x0001F1, 0x0001F3}, +{0x0001F2, 0x0001F3}, +{0x0001F4, 0x0001F5}, +{0x0001F6, 0x000195}, +{0x0001F7, 0x0001BF}, +{0x0001F8, 0x0001F9}, +{0x0001FA, 0x0001FB}, +{0x0001FC, 
0x0001FD}, +{0x0001FE, 0x0001FF}, +{0x000200, 0x000201}, +{0x000202, 0x000203}, +{0x000204, 0x000205}, +{0x000206, 0x000207}, +{0x000208, 0x000209}, +{0x00020A, 0x00020B}, +{0x00020C, 0x00020D}, +{0x00020E, 0x00020F}, +{0x000210, 0x000211}, +{0x000212, 0x000213}, +{0x000214, 0x000215}, +{0x000216, 0x000217}, +{0x000218, 0x000219}, +{0x00021A, 0x00021B}, +{0x00021C, 0x00021D}, +{0x00021E, 0x00021F}, +{0x000220, 0x00019E}, +{0x000222, 0x000223}, +{0x000224, 0x000225}, +{0x000226, 0x000227}, +{0x000228, 0x000229}, +{0x00022A, 0x00022B}, +{0x00022C, 0x00022D}, +{0x00022E, 0x00022F}, +{0x000230, 0x000231}, +{0x000232, 0x000233}, +{0x00023A, 0x002C65}, +{0x00023B, 0x00023C}, +{0x00023D, 0x00019A}, +{0x00023E, 0x002C66}, +{0x000241, 0x000242}, +{0x000243, 0x000180}, +{0x000244, 0x000289}, +{0x000245, 0x00028C}, +{0x000246, 0x000247}, +{0x000248, 0x000249}, +{0x00024A, 0x00024B}, +{0x00024C, 0x00024D}, +{0x00024E, 0x00024F}, +{0x000370, 0x000371}, +{0x000372, 0x000373}, +{0x000376, 0x000377}, +{0x00037F, 0x0003F3}, +{0x000386, 0x0003AC}, +{0x000388, 0x0003AD}, +{0x000389, 0x0003AE}, +{0x00038A, 0x0003AF}, +{0x00038C, 0x0003CC}, +{0x00038E, 0x0003CD}, +{0x00038F, 0x0003CE}, +{0x000391, 0x0003B1}, +{0x000392, 0x0003B2}, +{0x000393, 0x0003B3}, +{0x000394, 0x0003B4}, +{0x000395, 0x0003B5}, +{0x000396, 0x0003B6}, +{0x000397, 0x0003B7}, +{0x000398, 0x0003B8}, +{0x000399, 0x0003B9}, +{0x00039A, 0x0003BA}, +{0x00039B, 0x0003BB}, +{0x00039C, 0x0003BC}, +{0x00039D, 0x0003BD}, +{0x00039E, 0x0003BE}, +{0x00039F, 0x0003BF}, +{0x0003A0, 0x0003C0}, +{0x0003A1, 0x0003C1}, +{0x0003A3, 0x0003C3}, +{0x0003A4, 0x0003C4}, +{0x0003A5, 0x0003C5}, +{0x0003A6, 0x0003C6}, +{0x0003A7, 0x0003C7}, +{0x0003A8, 0x0003C8}, +{0x0003A9, 0x0003C9}, +{0x0003AA, 0x0003CA}, +{0x0003AB, 0x0003CB}, +{0x0003CF, 0x0003D7}, +{0x0003D8, 0x0003D9}, +{0x0003DA, 0x0003DB}, +{0x0003DC, 0x0003DD}, +{0x0003DE, 0x0003DF}, +{0x0003E0, 0x0003E1}, +{0x0003E2, 0x0003E3}, +{0x0003E4, 0x0003E5}, +{0x0003E6, 0x0003E7}, 
+{0x0003E8, 0x0003E9}, +{0x0003EA, 0x0003EB}, +{0x0003EC, 0x0003ED}, +{0x0003EE, 0x0003EF}, +{0x0003F4, 0x0003B8}, +{0x0003F7, 0x0003F8}, +{0x0003F9, 0x0003F2}, +{0x0003FA, 0x0003FB}, +{0x0003FD, 0x00037B}, +{0x0003FE, 0x00037C}, +{0x0003FF, 0x00037D}, +{0x000400, 0x000450}, +{0x000401, 0x000451}, +{0x000402, 0x000452}, +{0x000403, 0x000453}, +{0x000404, 0x000454}, +{0x000405, 0x000455}, +{0x000406, 0x000456}, +{0x000407, 0x000457}, +{0x000408, 0x000458}, +{0x000409, 0x000459}, +{0x00040A, 0x00045A}, +{0x00040B, 0x00045B}, +{0x00040C, 0x00045C}, +{0x00040D, 0x00045D}, +{0x00040E, 0x00045E}, +{0x00040F, 0x00045F}, +{0x000410, 0x000430}, +{0x000411, 0x000431}, +{0x000412, 0x000432}, +{0x000413, 0x000433}, +{0x000414, 0x000434}, +{0x000415, 0x000435}, +{0x000416, 0x000436}, +{0x000417, 0x000437}, +{0x000418, 0x000438}, +{0x000419, 0x000439}, +{0x00041A, 0x00043A}, +{0x00041B, 0x00043B}, +{0x00041C, 0x00043C}, +{0x00041D, 0x00043D}, +{0x00041E, 0x00043E}, +{0x00041F, 0x00043F}, +{0x000420, 0x000440}, +{0x000421, 0x000441}, +{0x000422, 0x000442}, +{0x000423, 0x000443}, +{0x000424, 0x000444}, +{0x000425, 0x000445}, +{0x000426, 0x000446}, +{0x000427, 0x000447}, +{0x000428, 0x000448}, +{0x000429, 0x000449}, +{0x00042A, 0x00044A}, +{0x00042B, 0x00044B}, +{0x00042C, 0x00044C}, +{0x00042D, 0x00044D}, +{0x00042E, 0x00044E}, +{0x00042F, 0x00044F}, +{0x000460, 0x000461}, +{0x000462, 0x000463}, +{0x000464, 0x000465}, +{0x000466, 0x000467}, +{0x000468, 0x000469}, +{0x00046A, 0x00046B}, +{0x00046C, 0x00046D}, +{0x00046E, 0x00046F}, +{0x000470, 0x000471}, +{0x000472, 0x000473}, +{0x000474, 0x000475}, +{0x000476, 0x000477}, +{0x000478, 0x000479}, +{0x00047A, 0x00047B}, +{0x00047C, 0x00047D}, +{0x00047E, 0x00047F}, +{0x000480, 0x000481}, +{0x00048A, 0x00048B}, +{0x00048C, 0x00048D}, +{0x00048E, 0x00048F}, +{0x000490, 0x000491}, +{0x000492, 0x000493}, +{0x000494, 0x000495}, +{0x000496, 0x000497}, +{0x000498, 0x000499}, +{0x00049A, 0x00049B}, +{0x00049C, 0x00049D}, +{0x00049E, 
0x00049F}, +{0x0004A0, 0x0004A1}, +{0x0004A2, 0x0004A3}, +{0x0004A4, 0x0004A5}, +{0x0004A6, 0x0004A7}, +{0x0004A8, 0x0004A9}, +{0x0004AA, 0x0004AB}, +{0x0004AC, 0x0004AD}, +{0x0004AE, 0x0004AF}, +{0x0004B0, 0x0004B1}, +{0x0004B2, 0x0004B3}, +{0x0004B4, 0x0004B5}, +{0x0004B6, 0x0004B7}, +{0x0004B8, 0x0004B9}, +{0x0004BA, 0x0004BB}, +{0x0004BC, 0x0004BD}, +{0x0004BE, 0x0004BF}, +{0x0004C0, 0x0004CF}, +{0x0004C1, 0x0004C2}, +{0x0004C3, 0x0004C4}, +{0x0004C5, 0x0004C6}, +{0x0004C7, 0x0004C8}, +{0x0004C9, 0x0004CA}, +{0x0004CB, 0x0004CC}, +{0x0004CD, 0x0004CE}, +{0x0004D0, 0x0004D1}, +{0x0004D2, 0x0004D3}, +{0x0004D4, 0x0004D5}, +{0x0004D6, 0x0004D7}, +{0x0004D8, 0x0004D9}, +{0x0004DA, 0x0004DB}, +{0x0004DC, 0x0004DD}, +{0x0004DE, 0x0004DF}, +{0x0004E0, 0x0004E1}, +{0x0004E2, 0x0004E3}, +{0x0004E4, 0x0004E5}, +{0x0004E6, 0x0004E7}, +{0x0004E8, 0x0004E9}, +{0x0004EA, 0x0004EB}, +{0x0004EC, 0x0004ED}, +{0x0004EE, 0x0004EF}, +{0x0004F0, 0x0004F1}, +{0x0004F2, 0x0004F3}, +{0x0004F4, 0x0004F5}, +{0x0004F6, 0x0004F7}, +{0x0004F8, 0x0004F9}, +{0x0004FA, 0x0004FB}, +{0x0004FC, 0x0004FD}, +{0x0004FE, 0x0004FF}, +{0x000500, 0x000501}, +{0x000502, 0x000503}, +{0x000504, 0x000505}, +{0x000506, 0x000507}, +{0x000508, 0x000509}, +{0x00050A, 0x00050B}, +{0x00050C, 0x00050D}, +{0x00050E, 0x00050F}, +{0x000510, 0x000511}, +{0x000512, 0x000513}, +{0x000514, 0x000515}, +{0x000516, 0x000517}, +{0x000518, 0x000519}, +{0x00051A, 0x00051B}, +{0x00051C, 0x00051D}, +{0x00051E, 0x00051F}, +{0x000520, 0x000521}, +{0x000522, 0x000523}, +{0x000524, 0x000525}, +{0x000526, 0x000527}, +{0x000528, 0x000529}, +{0x00052A, 0x00052B}, +{0x00052C, 0x00052D}, +{0x00052E, 0x00052F}, +{0x000531, 0x000561}, +{0x000532, 0x000562}, +{0x000533, 0x000563}, +{0x000534, 0x000564}, +{0x000535, 0x000565}, +{0x000536, 0x000566}, +{0x000537, 0x000567}, +{0x000538, 0x000568}, +{0x000539, 0x000569}, +{0x00053A, 0x00056A}, +{0x00053B, 0x00056B}, +{0x00053C, 0x00056C}, +{0x00053D, 0x00056D}, +{0x00053E, 0x00056E}, 
+{0x00053F, 0x00056F}, +{0x000540, 0x000570}, +{0x000541, 0x000571}, +{0x000542, 0x000572}, +{0x000543, 0x000573}, +{0x000544, 0x000574}, +{0x000545, 0x000575}, +{0x000546, 0x000576}, +{0x000547, 0x000577}, +{0x000548, 0x000578}, +{0x000549, 0x000579}, +{0x00054A, 0x00057A}, +{0x00054B, 0x00057B}, +{0x00054C, 0x00057C}, +{0x00054D, 0x00057D}, +{0x00054E, 0x00057E}, +{0x00054F, 0x00057F}, +{0x000550, 0x000580}, +{0x000551, 0x000581}, +{0x000552, 0x000582}, +{0x000553, 0x000583}, +{0x000554, 0x000584}, +{0x000555, 0x000585}, +{0x000556, 0x000586}, +{0x0010A0, 0x002D00}, +{0x0010A1, 0x002D01}, +{0x0010A2, 0x002D02}, +{0x0010A3, 0x002D03}, +{0x0010A4, 0x002D04}, +{0x0010A5, 0x002D05}, +{0x0010A6, 0x002D06}, +{0x0010A7, 0x002D07}, +{0x0010A8, 0x002D08}, +{0x0010A9, 0x002D09}, +{0x0010AA, 0x002D0A}, +{0x0010AB, 0x002D0B}, +{0x0010AC, 0x002D0C}, +{0x0010AD, 0x002D0D}, +{0x0010AE, 0x002D0E}, +{0x0010AF, 0x002D0F}, +{0x0010B0, 0x002D10}, +{0x0010B1, 0x002D11}, +{0x0010B2, 0x002D12}, +{0x0010B3, 0x002D13}, +{0x0010B4, 0x002D14}, +{0x0010B5, 0x002D15}, +{0x0010B6, 0x002D16}, +{0x0010B7, 0x002D17}, +{0x0010B8, 0x002D18}, +{0x0010B9, 0x002D19}, +{0x0010BA, 0x002D1A}, +{0x0010BB, 0x002D1B}, +{0x0010BC, 0x002D1C}, +{0x0010BD, 0x002D1D}, +{0x0010BE, 0x002D1E}, +{0x0010BF, 0x002D1F}, +{0x0010C0, 0x002D20}, +{0x0010C1, 0x002D21}, +{0x0010C2, 0x002D22}, +{0x0010C3, 0x002D23}, +{0x0010C4, 0x002D24}, +{0x0010C5, 0x002D25}, +{0x0010C7, 0x002D27}, +{0x0010CD, 0x002D2D}, +{0x0013A0, 0x00AB70}, +{0x0013A1, 0x00AB71}, +{0x0013A2, 0x00AB72}, +{0x0013A3, 0x00AB73}, +{0x0013A4, 0x00AB74}, +{0x0013A5, 0x00AB75}, +{0x0013A6, 0x00AB76}, +{0x0013A7, 0x00AB77}, +{0x0013A8, 0x00AB78}, +{0x0013A9, 0x00AB79}, +{0x0013AA, 0x00AB7A}, +{0x0013AB, 0x00AB7B}, +{0x0013AC, 0x00AB7C}, +{0x0013AD, 0x00AB7D}, +{0x0013AE, 0x00AB7E}, +{0x0013AF, 0x00AB7F}, +{0x0013B0, 0x00AB80}, +{0x0013B1, 0x00AB81}, +{0x0013B2, 0x00AB82}, +{0x0013B3, 0x00AB83}, +{0x0013B4, 0x00AB84}, +{0x0013B5, 0x00AB85}, +{0x0013B6, 
0x00AB86}, +{0x0013B7, 0x00AB87}, +{0x0013B8, 0x00AB88}, +{0x0013B9, 0x00AB89}, +{0x0013BA, 0x00AB8A}, +{0x0013BB, 0x00AB8B}, +{0x0013BC, 0x00AB8C}, +{0x0013BD, 0x00AB8D}, +{0x0013BE, 0x00AB8E}, +{0x0013BF, 0x00AB8F}, +{0x0013C0, 0x00AB90}, +{0x0013C1, 0x00AB91}, +{0x0013C2, 0x00AB92}, +{0x0013C3, 0x00AB93}, +{0x0013C4, 0x00AB94}, +{0x0013C5, 0x00AB95}, +{0x0013C6, 0x00AB96}, +{0x0013C7, 0x00AB97}, +{0x0013C8, 0x00AB98}, +{0x0013C9, 0x00AB99}, +{0x0013CA, 0x00AB9A}, +{0x0013CB, 0x00AB9B}, +{0x0013CC, 0x00AB9C}, +{0x0013CD, 0x00AB9D}, +{0x0013CE, 0x00AB9E}, +{0x0013CF, 0x00AB9F}, +{0x0013D0, 0x00ABA0}, +{0x0013D1, 0x00ABA1}, +{0x0013D2, 0x00ABA2}, +{0x0013D3, 0x00ABA3}, +{0x0013D4, 0x00ABA4}, +{0x0013D5, 0x00ABA5}, +{0x0013D6, 0x00ABA6}, +{0x0013D7, 0x00ABA7}, +{0x0013D8, 0x00ABA8}, +{0x0013D9, 0x00ABA9}, +{0x0013DA, 0x00ABAA}, +{0x0013DB, 0x00ABAB}, +{0x0013DC, 0x00ABAC}, +{0x0013DD, 0x00ABAD}, +{0x0013DE, 0x00ABAE}, +{0x0013DF, 0x00ABAF}, +{0x0013E0, 0x00ABB0}, +{0x0013E1, 0x00ABB1}, +{0x0013E2, 0x00ABB2}, +{0x0013E3, 0x00ABB3}, +{0x0013E4, 0x00ABB4}, +{0x0013E5, 0x00ABB5}, +{0x0013E6, 0x00ABB6}, +{0x0013E7, 0x00ABB7}, +{0x0013E8, 0x00ABB8}, +{0x0013E9, 0x00ABB9}, +{0x0013EA, 0x00ABBA}, +{0x0013EB, 0x00ABBB}, +{0x0013EC, 0x00ABBC}, +{0x0013ED, 0x00ABBD}, +{0x0013EE, 0x00ABBE}, +{0x0013EF, 0x00ABBF}, +{0x0013F0, 0x0013F8}, +{0x0013F1, 0x0013F9}, +{0x0013F2, 0x0013FA}, +{0x0013F3, 0x0013FB}, +{0x0013F4, 0x0013FC}, +{0x0013F5, 0x0013FD}, +{0x001C90, 0x0010D0}, +{0x001C91, 0x0010D1}, +{0x001C92, 0x0010D2}, +{0x001C93, 0x0010D3}, +{0x001C94, 0x0010D4}, +{0x001C95, 0x0010D5}, +{0x001C96, 0x0010D6}, +{0x001C97, 0x0010D7}, +{0x001C98, 0x0010D8}, +{0x001C99, 0x0010D9}, +{0x001C9A, 0x0010DA}, +{0x001C9B, 0x0010DB}, +{0x001C9C, 0x0010DC}, +{0x001C9D, 0x0010DD}, +{0x001C9E, 0x0010DE}, +{0x001C9F, 0x0010DF}, +{0x001CA0, 0x0010E0}, +{0x001CA1, 0x0010E1}, +{0x001CA2, 0x0010E2}, +{0x001CA3, 0x0010E3}, +{0x001CA4, 0x0010E4}, +{0x001CA5, 0x0010E5}, +{0x001CA6, 0x0010E6}, 
+{0x001CA7, 0x0010E7}, +{0x001CA8, 0x0010E8}, +{0x001CA9, 0x0010E9}, +{0x001CAA, 0x0010EA}, +{0x001CAB, 0x0010EB}, +{0x001CAC, 0x0010EC}, +{0x001CAD, 0x0010ED}, +{0x001CAE, 0x0010EE}, +{0x001CAF, 0x0010EF}, +{0x001CB0, 0x0010F0}, +{0x001CB1, 0x0010F1}, +{0x001CB2, 0x0010F2}, +{0x001CB3, 0x0010F3}, +{0x001CB4, 0x0010F4}, +{0x001CB5, 0x0010F5}, +{0x001CB6, 0x0010F6}, +{0x001CB7, 0x0010F7}, +{0x001CB8, 0x0010F8}, +{0x001CB9, 0x0010F9}, +{0x001CBA, 0x0010FA}, +{0x001CBD, 0x0010FD}, +{0x001CBE, 0x0010FE}, +{0x001CBF, 0x0010FF}, +{0x001E00, 0x001E01}, +{0x001E02, 0x001E03}, +{0x001E04, 0x001E05}, +{0x001E06, 0x001E07}, +{0x001E08, 0x001E09}, +{0x001E0A, 0x001E0B}, +{0x001E0C, 0x001E0D}, +{0x001E0E, 0x001E0F}, +{0x001E10, 0x001E11}, +{0x001E12, 0x001E13}, +{0x001E14, 0x001E15}, +{0x001E16, 0x001E17}, +{0x001E18, 0x001E19}, +{0x001E1A, 0x001E1B}, +{0x001E1C, 0x001E1D}, +{0x001E1E, 0x001E1F}, +{0x001E20, 0x001E21}, +{0x001E22, 0x001E23}, +{0x001E24, 0x001E25}, +{0x001E26, 0x001E27}, +{0x001E28, 0x001E29}, +{0x001E2A, 0x001E2B}, +{0x001E2C, 0x001E2D}, +{0x001E2E, 0x001E2F}, +{0x001E30, 0x001E31}, +{0x001E32, 0x001E33}, +{0x001E34, 0x001E35}, +{0x001E36, 0x001E37}, +{0x001E38, 0x001E39}, +{0x001E3A, 0x001E3B}, +{0x001E3C, 0x001E3D}, +{0x001E3E, 0x001E3F}, +{0x001E40, 0x001E41}, +{0x001E42, 0x001E43}, +{0x001E44, 0x001E45}, +{0x001E46, 0x001E47}, +{0x001E48, 0x001E49}, +{0x001E4A, 0x001E4B}, +{0x001E4C, 0x001E4D}, +{0x001E4E, 0x001E4F}, +{0x001E50, 0x001E51}, +{0x001E52, 0x001E53}, +{0x001E54, 0x001E55}, +{0x001E56, 0x001E57}, +{0x001E58, 0x001E59}, +{0x001E5A, 0x001E5B}, +{0x001E5C, 0x001E5D}, +{0x001E5E, 0x001E5F}, +{0x001E60, 0x001E61}, +{0x001E62, 0x001E63}, +{0x001E64, 0x001E65}, +{0x001E66, 0x001E67}, +{0x001E68, 0x001E69}, +{0x001E6A, 0x001E6B}, +{0x001E6C, 0x001E6D}, +{0x001E6E, 0x001E6F}, +{0x001E70, 0x001E71}, +{0x001E72, 0x001E73}, +{0x001E74, 0x001E75}, +{0x001E76, 0x001E77}, +{0x001E78, 0x001E79}, +{0x001E7A, 0x001E7B}, +{0x001E7C, 0x001E7D}, +{0x001E7E, 
0x001E7F}, +{0x001E80, 0x001E81}, +{0x001E82, 0x001E83}, +{0x001E84, 0x001E85}, +{0x001E86, 0x001E87}, +{0x001E88, 0x001E89}, +{0x001E8A, 0x001E8B}, +{0x001E8C, 0x001E8D}, +{0x001E8E, 0x001E8F}, +{0x001E90, 0x001E91}, +{0x001E92, 0x001E93}, +{0x001E94, 0x001E95}, +{0x001E9E, 0x0000DF}, +{0x001EA0, 0x001EA1}, +{0x001EA2, 0x001EA3}, +{0x001EA4, 0x001EA5}, +{0x001EA6, 0x001EA7}, +{0x001EA8, 0x001EA9}, +{0x001EAA, 0x001EAB}, +{0x001EAC, 0x001EAD}, +{0x001EAE, 0x001EAF}, +{0x001EB0, 0x001EB1}, +{0x001EB2, 0x001EB3}, +{0x001EB4, 0x001EB5}, +{0x001EB6, 0x001EB7}, +{0x001EB8, 0x001EB9}, +{0x001EBA, 0x001EBB}, +{0x001EBC, 0x001EBD}, +{0x001EBE, 0x001EBF}, +{0x001EC0, 0x001EC1}, +{0x001EC2, 0x001EC3}, +{0x001EC4, 0x001EC5}, +{0x001EC6, 0x001EC7}, +{0x001EC8, 0x001EC9}, +{0x001ECA, 0x001ECB}, +{0x001ECC, 0x001ECD}, +{0x001ECE, 0x001ECF}, +{0x001ED0, 0x001ED1}, +{0x001ED2, 0x001ED3}, +{0x001ED4, 0x001ED5}, +{0x001ED6, 0x001ED7}, +{0x001ED8, 0x001ED9}, +{0x001EDA, 0x001EDB}, +{0x001EDC, 0x001EDD}, +{0x001EDE, 0x001EDF}, +{0x001EE0, 0x001EE1}, +{0x001EE2, 0x001EE3}, +{0x001EE4, 0x001EE5}, +{0x001EE6, 0x001EE7}, +{0x001EE8, 0x001EE9}, +{0x001EEA, 0x001EEB}, +{0x001EEC, 0x001EED}, +{0x001EEE, 0x001EEF}, +{0x001EF0, 0x001EF1}, +{0x001EF2, 0x001EF3}, +{0x001EF4, 0x001EF5}, +{0x001EF6, 0x001EF7}, +{0x001EF8, 0x001EF9}, +{0x001EFA, 0x001EFB}, +{0x001EFC, 0x001EFD}, +{0x001EFE, 0x001EFF}, +{0x001F08, 0x001F00}, +{0x001F09, 0x001F01}, +{0x001F0A, 0x001F02}, +{0x001F0B, 0x001F03}, +{0x001F0C, 0x001F04}, +{0x001F0D, 0x001F05}, +{0x001F0E, 0x001F06}, +{0x001F0F, 0x001F07}, +{0x001F18, 0x001F10}, +{0x001F19, 0x001F11}, +{0x001F1A, 0x001F12}, +{0x001F1B, 0x001F13}, +{0x001F1C, 0x001F14}, +{0x001F1D, 0x001F15}, +{0x001F28, 0x001F20}, +{0x001F29, 0x001F21}, +{0x001F2A, 0x001F22}, +{0x001F2B, 0x001F23}, +{0x001F2C, 0x001F24}, +{0x001F2D, 0x001F25}, +{0x001F2E, 0x001F26}, +{0x001F2F, 0x001F27}, +{0x001F38, 0x001F30}, +{0x001F39, 0x001F31}, +{0x001F3A, 0x001F32}, +{0x001F3B, 0x001F33}, 
+{0x001F3C, 0x001F34}, +{0x001F3D, 0x001F35}, +{0x001F3E, 0x001F36}, +{0x001F3F, 0x001F37}, +{0x001F48, 0x001F40}, +{0x001F49, 0x001F41}, +{0x001F4A, 0x001F42}, +{0x001F4B, 0x001F43}, +{0x001F4C, 0x001F44}, +{0x001F4D, 0x001F45}, +{0x001F59, 0x001F51}, +{0x001F5B, 0x001F53}, +{0x001F5D, 0x001F55}, +{0x001F5F, 0x001F57}, +{0x001F68, 0x001F60}, +{0x001F69, 0x001F61}, +{0x001F6A, 0x001F62}, +{0x001F6B, 0x001F63}, +{0x001F6C, 0x001F64}, +{0x001F6D, 0x001F65}, +{0x001F6E, 0x001F66}, +{0x001F6F, 0x001F67}, +{0x001F88, 0x001F80}, +{0x001F89, 0x001F81}, +{0x001F8A, 0x001F82}, +{0x001F8B, 0x001F83}, +{0x001F8C, 0x001F84}, +{0x001F8D, 0x001F85}, +{0x001F8E, 0x001F86}, +{0x001F8F, 0x001F87}, +{0x001F98, 0x001F90}, +{0x001F99, 0x001F91}, +{0x001F9A, 0x001F92}, +{0x001F9B, 0x001F93}, +{0x001F9C, 0x001F94}, +{0x001F9D, 0x001F95}, +{0x001F9E, 0x001F96}, +{0x001F9F, 0x001F97}, +{0x001FA8, 0x001FA0}, +{0x001FA9, 0x001FA1}, +{0x001FAA, 0x001FA2}, +{0x001FAB, 0x001FA3}, +{0x001FAC, 0x001FA4}, +{0x001FAD, 0x001FA5}, +{0x001FAE, 0x001FA6}, +{0x001FAF, 0x001FA7}, +{0x001FB8, 0x001FB0}, +{0x001FB9, 0x001FB1}, +{0x001FBA, 0x001F70}, +{0x001FBB, 0x001F71}, +{0x001FBC, 0x001FB3}, +{0x001FC8, 0x001F72}, +{0x001FC9, 0x001F73}, +{0x001FCA, 0x001F74}, +{0x001FCB, 0x001F75}, +{0x001FCC, 0x001FC3}, +{0x001FD8, 0x001FD0}, +{0x001FD9, 0x001FD1}, +{0x001FDA, 0x001F76}, +{0x001FDB, 0x001F77}, +{0x001FE8, 0x001FE0}, +{0x001FE9, 0x001FE1}, +{0x001FEA, 0x001F7A}, +{0x001FEB, 0x001F7B}, +{0x001FEC, 0x001FE5}, +{0x001FF8, 0x001F78}, +{0x001FF9, 0x001F79}, +{0x001FFA, 0x001F7C}, +{0x001FFB, 0x001F7D}, +{0x001FFC, 0x001FF3}, +{0x002126, 0x0003C9}, +{0x00212A, 0x00006B}, +{0x00212B, 0x0000E5}, +{0x002132, 0x00214E}, +{0x002160, 0x002170}, +{0x002161, 0x002171}, +{0x002162, 0x002172}, +{0x002163, 0x002173}, +{0x002164, 0x002174}, +{0x002165, 0x002175}, +{0x002166, 0x002176}, +{0x002167, 0x002177}, +{0x002168, 0x002178}, +{0x002169, 0x002179}, +{0x00216A, 0x00217A}, +{0x00216B, 0x00217B}, +{0x00216C, 
0x00217C}, +{0x00216D, 0x00217D}, +{0x00216E, 0x00217E}, +{0x00216F, 0x00217F}, +{0x002183, 0x002184}, +{0x0024B6, 0x0024D0}, +{0x0024B7, 0x0024D1}, +{0x0024B8, 0x0024D2}, +{0x0024B9, 0x0024D3}, +{0x0024BA, 0x0024D4}, +{0x0024BB, 0x0024D5}, +{0x0024BC, 0x0024D6}, +{0x0024BD, 0x0024D7}, +{0x0024BE, 0x0024D8}, +{0x0024BF, 0x0024D9}, +{0x0024C0, 0x0024DA}, +{0x0024C1, 0x0024DB}, +{0x0024C2, 0x0024DC}, +{0x0024C3, 0x0024DD}, +{0x0024C4, 0x0024DE}, +{0x0024C5, 0x0024DF}, +{0x0024C6, 0x0024E0}, +{0x0024C7, 0x0024E1}, +{0x0024C8, 0x0024E2}, +{0x0024C9, 0x0024E3}, +{0x0024CA, 0x0024E4}, +{0x0024CB, 0x0024E5}, +{0x0024CC, 0x0024E6}, +{0x0024CD, 0x0024E7}, +{0x0024CE, 0x0024E8}, +{0x0024CF, 0x0024E9}, +{0x002C00, 0x002C30}, +{0x002C01, 0x002C31}, +{0x002C02, 0x002C32}, +{0x002C03, 0x002C33}, +{0x002C04, 0x002C34}, +{0x002C05, 0x002C35}, +{0x002C06, 0x002C36}, +{0x002C07, 0x002C37}, +{0x002C08, 0x002C38}, +{0x002C09, 0x002C39}, +{0x002C0A, 0x002C3A}, +{0x002C0B, 0x002C3B}, +{0x002C0C, 0x002C3C}, +{0x002C0D, 0x002C3D}, +{0x002C0E, 0x002C3E}, +{0x002C0F, 0x002C3F}, +{0x002C10, 0x002C40}, +{0x002C11, 0x002C41}, +{0x002C12, 0x002C42}, +{0x002C13, 0x002C43}, +{0x002C14, 0x002C44}, +{0x002C15, 0x002C45}, +{0x002C16, 0x002C46}, +{0x002C17, 0x002C47}, +{0x002C18, 0x002C48}, +{0x002C19, 0x002C49}, +{0x002C1A, 0x002C4A}, +{0x002C1B, 0x002C4B}, +{0x002C1C, 0x002C4C}, +{0x002C1D, 0x002C4D}, +{0x002C1E, 0x002C4E}, +{0x002C1F, 0x002C4F}, +{0x002C20, 0x002C50}, +{0x002C21, 0x002C51}, +{0x002C22, 0x002C52}, +{0x002C23, 0x002C53}, +{0x002C24, 0x002C54}, +{0x002C25, 0x002C55}, +{0x002C26, 0x002C56}, +{0x002C27, 0x002C57}, +{0x002C28, 0x002C58}, +{0x002C29, 0x002C59}, +{0x002C2A, 0x002C5A}, +{0x002C2B, 0x002C5B}, +{0x002C2C, 0x002C5C}, +{0x002C2D, 0x002C5D}, +{0x002C2E, 0x002C5E}, +{0x002C60, 0x002C61}, +{0x002C62, 0x00026B}, +{0x002C63, 0x001D7D}, +{0x002C64, 0x00027D}, +{0x002C67, 0x002C68}, +{0x002C69, 0x002C6A}, +{0x002C6B, 0x002C6C}, +{0x002C6D, 0x000251}, +{0x002C6E, 0x000271}, 
+{0x002C6F, 0x000250}, +{0x002C70, 0x000252}, +{0x002C72, 0x002C73}, +{0x002C75, 0x002C76}, +{0x002C7E, 0x00023F}, +{0x002C7F, 0x000240}, +{0x002C80, 0x002C81}, +{0x002C82, 0x002C83}, +{0x002C84, 0x002C85}, +{0x002C86, 0x002C87}, +{0x002C88, 0x002C89}, +{0x002C8A, 0x002C8B}, +{0x002C8C, 0x002C8D}, +{0x002C8E, 0x002C8F}, +{0x002C90, 0x002C91}, +{0x002C92, 0x002C93}, +{0x002C94, 0x002C95}, +{0x002C96, 0x002C97}, +{0x002C98, 0x002C99}, +{0x002C9A, 0x002C9B}, +{0x002C9C, 0x002C9D}, +{0x002C9E, 0x002C9F}, +{0x002CA0, 0x002CA1}, +{0x002CA2, 0x002CA3}, +{0x002CA4, 0x002CA5}, +{0x002CA6, 0x002CA7}, +{0x002CA8, 0x002CA9}, +{0x002CAA, 0x002CAB}, +{0x002CAC, 0x002CAD}, +{0x002CAE, 0x002CAF}, +{0x002CB0, 0x002CB1}, +{0x002CB2, 0x002CB3}, +{0x002CB4, 0x002CB5}, +{0x002CB6, 0x002CB7}, +{0x002CB8, 0x002CB9}, +{0x002CBA, 0x002CBB}, +{0x002CBC, 0x002CBD}, +{0x002CBE, 0x002CBF}, +{0x002CC0, 0x002CC1}, +{0x002CC2, 0x002CC3}, +{0x002CC4, 0x002CC5}, +{0x002CC6, 0x002CC7}, +{0x002CC8, 0x002CC9}, +{0x002CCA, 0x002CCB}, +{0x002CCC, 0x002CCD}, +{0x002CCE, 0x002CCF}, +{0x002CD0, 0x002CD1}, +{0x002CD2, 0x002CD3}, +{0x002CD4, 0x002CD5}, +{0x002CD6, 0x002CD7}, +{0x002CD8, 0x002CD9}, +{0x002CDA, 0x002CDB}, +{0x002CDC, 0x002CDD}, +{0x002CDE, 0x002CDF}, +{0x002CE0, 0x002CE1}, +{0x002CE2, 0x002CE3}, +{0x002CEB, 0x002CEC}, +{0x002CED, 0x002CEE}, +{0x002CF2, 0x002CF3}, +{0x00A640, 0x00A641}, +{0x00A642, 0x00A643}, +{0x00A644, 0x00A645}, +{0x00A646, 0x00A647}, +{0x00A648, 0x00A649}, +{0x00A64A, 0x00A64B}, +{0x00A64C, 0x00A64D}, +{0x00A64E, 0x00A64F}, +{0x00A650, 0x00A651}, +{0x00A652, 0x00A653}, +{0x00A654, 0x00A655}, +{0x00A656, 0x00A657}, +{0x00A658, 0x00A659}, +{0x00A65A, 0x00A65B}, +{0x00A65C, 0x00A65D}, +{0x00A65E, 0x00A65F}, +{0x00A660, 0x00A661}, +{0x00A662, 0x00A663}, +{0x00A664, 0x00A665}, +{0x00A666, 0x00A667}, +{0x00A668, 0x00A669}, +{0x00A66A, 0x00A66B}, +{0x00A66C, 0x00A66D}, +{0x00A680, 0x00A681}, +{0x00A682, 0x00A683}, +{0x00A684, 0x00A685}, +{0x00A686, 0x00A687}, +{0x00A688, 
0x00A689}, +{0x00A68A, 0x00A68B}, +{0x00A68C, 0x00A68D}, +{0x00A68E, 0x00A68F}, +{0x00A690, 0x00A691}, +{0x00A692, 0x00A693}, +{0x00A694, 0x00A695}, +{0x00A696, 0x00A697}, +{0x00A698, 0x00A699}, +{0x00A69A, 0x00A69B}, +{0x00A722, 0x00A723}, +{0x00A724, 0x00A725}, +{0x00A726, 0x00A727}, +{0x00A728, 0x00A729}, +{0x00A72A, 0x00A72B}, +{0x00A72C, 0x00A72D}, +{0x00A72E, 0x00A72F}, +{0x00A732, 0x00A733}, +{0x00A734, 0x00A735}, +{0x00A736, 0x00A737}, +{0x00A738, 0x00A739}, +{0x00A73A, 0x00A73B}, +{0x00A73C, 0x00A73D}, +{0x00A73E, 0x00A73F}, +{0x00A740, 0x00A741}, +{0x00A742, 0x00A743}, +{0x00A744, 0x00A745}, +{0x00A746, 0x00A747}, +{0x00A748, 0x00A749}, +{0x00A74A, 0x00A74B}, +{0x00A74C, 0x00A74D}, +{0x00A74E, 0x00A74F}, +{0x00A750, 0x00A751}, +{0x00A752, 0x00A753}, +{0x00A754, 0x00A755}, +{0x00A756, 0x00A757}, +{0x00A758, 0x00A759}, +{0x00A75A, 0x00A75B}, +{0x00A75C, 0x00A75D}, +{0x00A75E, 0x00A75F}, +{0x00A760, 0x00A761}, +{0x00A762, 0x00A763}, +{0x00A764, 0x00A765}, +{0x00A766, 0x00A767}, +{0x00A768, 0x00A769}, +{0x00A76A, 0x00A76B}, +{0x00A76C, 0x00A76D}, +{0x00A76E, 0x00A76F}, +{0x00A779, 0x00A77A}, +{0x00A77B, 0x00A77C}, +{0x00A77D, 0x001D79}, +{0x00A77E, 0x00A77F}, +{0x00A780, 0x00A781}, +{0x00A782, 0x00A783}, +{0x00A784, 0x00A785}, +{0x00A786, 0x00A787}, +{0x00A78B, 0x00A78C}, +{0x00A78D, 0x000265}, +{0x00A790, 0x00A791}, +{0x00A792, 0x00A793}, +{0x00A796, 0x00A797}, +{0x00A798, 0x00A799}, +{0x00A79A, 0x00A79B}, +{0x00A79C, 0x00A79D}, +{0x00A79E, 0x00A79F}, +{0x00A7A0, 0x00A7A1}, +{0x00A7A2, 0x00A7A3}, +{0x00A7A4, 0x00A7A5}, +{0x00A7A6, 0x00A7A7}, +{0x00A7A8, 0x00A7A9}, +{0x00A7AA, 0x000266}, +{0x00A7AB, 0x00025C}, +{0x00A7AC, 0x000261}, +{0x00A7AD, 0x00026C}, +{0x00A7AE, 0x00026A}, +{0x00A7B0, 0x00029E}, +{0x00A7B1, 0x000287}, +{0x00A7B2, 0x00029D}, +{0x00A7B3, 0x00AB53}, +{0x00A7B4, 0x00A7B5}, +{0x00A7B6, 0x00A7B7}, +{0x00A7B8, 0x00A7B9}, +{0x00A7BA, 0x00A7BB}, +{0x00A7BC, 0x00A7BD}, +{0x00A7BE, 0x00A7BF}, +{0x00A7C2, 0x00A7C3}, +{0x00A7C4, 0x00A794}, 
+{0x00A7C5, 0x000282}, +{0x00A7C6, 0x001D8E}, +{0x00A7C7, 0x00A7C8}, +{0x00A7C9, 0x00A7CA}, +{0x00A7F5, 0x00A7F6}, +{0x00FF21, 0x00FF41}, +{0x00FF22, 0x00FF42}, +{0x00FF23, 0x00FF43}, +{0x00FF24, 0x00FF44}, +{0x00FF25, 0x00FF45}, +{0x00FF26, 0x00FF46}, +{0x00FF27, 0x00FF47}, +{0x00FF28, 0x00FF48}, +{0x00FF29, 0x00FF49}, +{0x00FF2A, 0x00FF4A}, +{0x00FF2B, 0x00FF4B}, +{0x00FF2C, 0x00FF4C}, +{0x00FF2D, 0x00FF4D}, +{0x00FF2E, 0x00FF4E}, +{0x00FF2F, 0x00FF4F}, +{0x00FF30, 0x00FF50}, +{0x00FF31, 0x00FF51}, +{0x00FF32, 0x00FF52}, +{0x00FF33, 0x00FF53}, +{0x00FF34, 0x00FF54}, +{0x00FF35, 0x00FF55}, +{0x00FF36, 0x00FF56}, +{0x00FF37, 0x00FF57}, +{0x00FF38, 0x00FF58}, +{0x00FF39, 0x00FF59}, +{0x00FF3A, 0x00FF5A}, +{0x010400, 0x010428}, +{0x010401, 0x010429}, +{0x010402, 0x01042A}, +{0x010403, 0x01042B}, +{0x010404, 0x01042C}, +{0x010405, 0x01042D}, +{0x010406, 0x01042E}, +{0x010407, 0x01042F}, +{0x010408, 0x010430}, +{0x010409, 0x010431}, +{0x01040A, 0x010432}, +{0x01040B, 0x010433}, +{0x01040C, 0x010434}, +{0x01040D, 0x010435}, +{0x01040E, 0x010436}, +{0x01040F, 0x010437}, +{0x010410, 0x010438}, +{0x010411, 0x010439}, +{0x010412, 0x01043A}, +{0x010413, 0x01043B}, +{0x010414, 0x01043C}, +{0x010415, 0x01043D}, +{0x010416, 0x01043E}, +{0x010417, 0x01043F}, +{0x010418, 0x010440}, +{0x010419, 0x010441}, +{0x01041A, 0x010442}, +{0x01041B, 0x010443}, +{0x01041C, 0x010444}, +{0x01041D, 0x010445}, +{0x01041E, 0x010446}, +{0x01041F, 0x010447}, +{0x010420, 0x010448}, +{0x010421, 0x010449}, +{0x010422, 0x01044A}, +{0x010423, 0x01044B}, +{0x010424, 0x01044C}, +{0x010425, 0x01044D}, +{0x010426, 0x01044E}, +{0x010427, 0x01044F}, +{0x0104B0, 0x0104D8}, +{0x0104B1, 0x0104D9}, +{0x0104B2, 0x0104DA}, +{0x0104B3, 0x0104DB}, +{0x0104B4, 0x0104DC}, +{0x0104B5, 0x0104DD}, +{0x0104B6, 0x0104DE}, +{0x0104B7, 0x0104DF}, +{0x0104B8, 0x0104E0}, +{0x0104B9, 0x0104E1}, +{0x0104BA, 0x0104E2}, +{0x0104BB, 0x0104E3}, +{0x0104BC, 0x0104E4}, +{0x0104BD, 0x0104E5}, +{0x0104BE, 0x0104E6}, +{0x0104BF, 
0x0104E7}, +{0x0104C0, 0x0104E8}, +{0x0104C1, 0x0104E9}, +{0x0104C2, 0x0104EA}, +{0x0104C3, 0x0104EB}, +{0x0104C4, 0x0104EC}, +{0x0104C5, 0x0104ED}, +{0x0104C6, 0x0104EE}, +{0x0104C7, 0x0104EF}, +{0x0104C8, 0x0104F0}, +{0x0104C9, 0x0104F1}, +{0x0104CA, 0x0104F2}, +{0x0104CB, 0x0104F3}, +{0x0104CC, 0x0104F4}, +{0x0104CD, 0x0104F5}, +{0x0104CE, 0x0104F6}, +{0x0104CF, 0x0104F7}, +{0x0104D0, 0x0104F8}, +{0x0104D1, 0x0104F9}, +{0x0104D2, 0x0104FA}, +{0x0104D3, 0x0104FB}, +{0x010C80, 0x010CC0}, +{0x010C81, 0x010CC1}, +{0x010C82, 0x010CC2}, +{0x010C83, 0x010CC3}, +{0x010C84, 0x010CC4}, +{0x010C85, 0x010CC5}, +{0x010C86, 0x010CC6}, +{0x010C87, 0x010CC7}, +{0x010C88, 0x010CC8}, +{0x010C89, 0x010CC9}, +{0x010C8A, 0x010CCA}, +{0x010C8B, 0x010CCB}, +{0x010C8C, 0x010CCC}, +{0x010C8D, 0x010CCD}, +{0x010C8E, 0x010CCE}, +{0x010C8F, 0x010CCF}, +{0x010C90, 0x010CD0}, +{0x010C91, 0x010CD1}, +{0x010C92, 0x010CD2}, +{0x010C93, 0x010CD3}, +{0x010C94, 0x010CD4}, +{0x010C95, 0x010CD5}, +{0x010C96, 0x010CD6}, +{0x010C97, 0x010CD7}, +{0x010C98, 0x010CD8}, +{0x010C99, 0x010CD9}, +{0x010C9A, 0x010CDA}, +{0x010C9B, 0x010CDB}, +{0x010C9C, 0x010CDC}, +{0x010C9D, 0x010CDD}, +{0x010C9E, 0x010CDE}, +{0x010C9F, 0x010CDF}, +{0x010CA0, 0x010CE0}, +{0x010CA1, 0x010CE1}, +{0x010CA2, 0x010CE2}, +{0x010CA3, 0x010CE3}, +{0x010CA4, 0x010CE4}, +{0x010CA5, 0x010CE5}, +{0x010CA6, 0x010CE6}, +{0x010CA7, 0x010CE7}, +{0x010CA8, 0x010CE8}, +{0x010CA9, 0x010CE9}, +{0x010CAA, 0x010CEA}, +{0x010CAB, 0x010CEB}, +{0x010CAC, 0x010CEC}, +{0x010CAD, 0x010CED}, +{0x010CAE, 0x010CEE}, +{0x010CAF, 0x010CEF}, +{0x010CB0, 0x010CF0}, +{0x010CB1, 0x010CF1}, +{0x010CB2, 0x010CF2}, +{0x0118A0, 0x0118C0}, +{0x0118A1, 0x0118C1}, +{0x0118A2, 0x0118C2}, +{0x0118A3, 0x0118C3}, +{0x0118A4, 0x0118C4}, +{0x0118A5, 0x0118C5}, +{0x0118A6, 0x0118C6}, +{0x0118A7, 0x0118C7}, +{0x0118A8, 0x0118C8}, +{0x0118A9, 0x0118C9}, +{0x0118AA, 0x0118CA}, +{0x0118AB, 0x0118CB}, +{0x0118AC, 0x0118CC}, +{0x0118AD, 0x0118CD}, +{0x0118AE, 0x0118CE}, 
+{0x0118AF, 0x0118CF}, +{0x0118B0, 0x0118D0}, +{0x0118B1, 0x0118D1}, +{0x0118B2, 0x0118D2}, +{0x0118B3, 0x0118D3}, +{0x0118B4, 0x0118D4}, +{0x0118B5, 0x0118D5}, +{0x0118B6, 0x0118D6}, +{0x0118B7, 0x0118D7}, +{0x0118B8, 0x0118D8}, +{0x0118B9, 0x0118D9}, +{0x0118BA, 0x0118DA}, +{0x0118BB, 0x0118DB}, +{0x0118BC, 0x0118DC}, +{0x0118BD, 0x0118DD}, +{0x0118BE, 0x0118DE}, +{0x0118BF, 0x0118DF}, +{0x016E40, 0x016E60}, +{0x016E41, 0x016E61}, +{0x016E42, 0x016E62}, +{0x016E43, 0x016E63}, +{0x016E44, 0x016E64}, +{0x016E45, 0x016E65}, +{0x016E46, 0x016E66}, +{0x016E47, 0x016E67}, +{0x016E48, 0x016E68}, +{0x016E49, 0x016E69}, +{0x016E4A, 0x016E6A}, +{0x016E4B, 0x016E6B}, +{0x016E4C, 0x016E6C}, +{0x016E4D, 0x016E6D}, +{0x016E4E, 0x016E6E}, +{0x016E4F, 0x016E6F}, +{0x016E50, 0x016E70}, +{0x016E51, 0x016E71}, +{0x016E52, 0x016E72}, +{0x016E53, 0x016E73}, +{0x016E54, 0x016E74}, +{0x016E55, 0x016E75}, +{0x016E56, 0x016E76}, +{0x016E57, 0x016E77}, +{0x016E58, 0x016E78}, +{0x016E59, 0x016E79}, +{0x016E5A, 0x016E7A}, +{0x016E5B, 0x016E7B}, +{0x016E5C, 0x016E7C}, +{0x016E5D, 0x016E7D}, +{0x016E5E, 0x016E7E}, +{0x016E5F, 0x016E7F}, +{0x01E900, 0x01E922}, +{0x01E901, 0x01E923}, +{0x01E902, 0x01E924}, +{0x01E903, 0x01E925}, +{0x01E904, 0x01E926}, +{0x01E905, 0x01E927}, +{0x01E906, 0x01E928}, +{0x01E907, 0x01E929}, +{0x01E908, 0x01E92A}, +{0x01E909, 0x01E92B}, +{0x01E90A, 0x01E92C}, +{0x01E90B, 0x01E92D}, +{0x01E90C, 0x01E92E}, +{0x01E90D, 0x01E92F}, +{0x01E90E, 0x01E930}, +{0x01E90F, 0x01E931}, +{0x01E910, 0x01E932}, +{0x01E911, 0x01E933}, +{0x01E912, 0x01E934}, +{0x01E913, 0x01E935}, +{0x01E914, 0x01E936}, +{0x01E915, 0x01E937}, +{0x01E916, 0x01E938}, +{0x01E917, 0x01E939}, +{0x01E918, 0x01E93A}, +{0x01E919, 0x01E93B}, +{0x01E91A, 0x01E93C}, +{0x01E91B, 0x01E93D}, +{0x01E91C, 0x01E93E}, +{0x01E91D, 0x01E93F}, +{0x01E91E, 0x01E940}, +{0x01E91F, 0x01E941}, +{0x01E920, 0x01E942}, +{0x01E921, 0x01E943}, }; -const std::vector> unicode_ranges_control = { -{0x00000000, 0x00000008}, {0x0000000E, 
0x0000001B}, {0x0000007F, 0x00000084}, {0x00000086, 0x0000009F}, -{0x000000AD, 0x000000AD}, {0x00000378, 0x00000379}, {0x00000380, 0x00000383}, {0x0000038B, 0x0000038B}, -{0x0000038D, 0x0000038D}, {0x000003A2, 0x000003A2}, {0x00000530, 0x00000530}, {0x00000557, 0x00000558}, -{0x0000058B, 0x0000058C}, {0x00000590, 0x00000590}, {0x000005C8, 0x000005CF}, {0x000005EB, 0x000005EE}, -{0x000005F5, 0x00000605}, {0x0000061C, 0x0000061D}, {0x000006DD, 0x000006DD}, {0x0000070E, 0x0000070F}, -{0x0000074B, 0x0000074C}, {0x000007B2, 0x000007BF}, {0x000007FB, 0x000007FC}, {0x0000082E, 0x0000082F}, -{0x0000083F, 0x0000083F}, {0x0000085C, 0x0000085D}, {0x0000085F, 0x0000085F}, {0x0000086B, 0x0000089F}, -{0x000008B5, 0x000008B5}, {0x000008C8, 0x000008D2}, {0x000008E2, 0x000008E2}, {0x00000984, 0x00000984}, -{0x0000098D, 0x0000098E}, {0x00000991, 0x00000992}, {0x000009A9, 0x000009A9}, {0x000009B1, 0x000009B1}, -{0x000009B3, 0x000009B5}, {0x000009BA, 0x000009BB}, {0x000009C5, 0x000009C6}, {0x000009C9, 0x000009CA}, -{0x000009CF, 0x000009D6}, {0x000009D8, 0x000009DB}, {0x000009DE, 0x000009DE}, {0x000009E4, 0x000009E5}, -{0x000009FF, 0x00000A00}, {0x00000A04, 0x00000A04}, {0x00000A0B, 0x00000A0E}, {0x00000A11, 0x00000A12}, -{0x00000A29, 0x00000A29}, {0x00000A31, 0x00000A31}, {0x00000A34, 0x00000A34}, {0x00000A37, 0x00000A37}, -{0x00000A3A, 0x00000A3B}, {0x00000A3D, 0x00000A3D}, {0x00000A43, 0x00000A46}, {0x00000A49, 0x00000A4A}, -{0x00000A4E, 0x00000A50}, {0x00000A52, 0x00000A58}, {0x00000A5D, 0x00000A5D}, {0x00000A5F, 0x00000A65}, -{0x00000A77, 0x00000A80}, {0x00000A84, 0x00000A84}, {0x00000A8E, 0x00000A8E}, {0x00000A92, 0x00000A92}, -{0x00000AA9, 0x00000AA9}, {0x00000AB1, 0x00000AB1}, {0x00000AB4, 0x00000AB4}, {0x00000ABA, 0x00000ABB}, -{0x00000AC6, 0x00000AC6}, {0x00000ACA, 0x00000ACA}, {0x00000ACE, 0x00000ACF}, {0x00000AD1, 0x00000ADF}, -{0x00000AE4, 0x00000AE5}, {0x00000AF2, 0x00000AF8}, {0x00000B00, 0x00000B00}, {0x00000B04, 0x00000B04}, -{0x00000B0D, 0x00000B0E}, {0x00000B11, 
0x00000B12}, {0x00000B29, 0x00000B29}, {0x00000B31, 0x00000B31}, -{0x00000B34, 0x00000B34}, {0x00000B3A, 0x00000B3B}, {0x00000B45, 0x00000B46}, {0x00000B49, 0x00000B4A}, -{0x00000B4E, 0x00000B54}, {0x00000B58, 0x00000B5B}, {0x00000B5E, 0x00000B5E}, {0x00000B64, 0x00000B65}, -{0x00000B78, 0x00000B81}, {0x00000B84, 0x00000B84}, {0x00000B8B, 0x00000B8D}, {0x00000B91, 0x00000B91}, -{0x00000B96, 0x00000B98}, {0x00000B9B, 0x00000B9B}, {0x00000B9D, 0x00000B9D}, {0x00000BA0, 0x00000BA2}, -{0x00000BA5, 0x00000BA7}, {0x00000BAB, 0x00000BAD}, {0x00000BBA, 0x00000BBD}, {0x00000BC3, 0x00000BC5}, -{0x00000BC9, 0x00000BC9}, {0x00000BCE, 0x00000BCF}, {0x00000BD1, 0x00000BD6}, {0x00000BD8, 0x00000BE5}, -{0x00000BFB, 0x00000BFF}, {0x00000C0D, 0x00000C0D}, {0x00000C11, 0x00000C11}, {0x00000C29, 0x00000C29}, -{0x00000C3A, 0x00000C3C}, {0x00000C45, 0x00000C45}, {0x00000C49, 0x00000C49}, {0x00000C4E, 0x00000C54}, -{0x00000C57, 0x00000C57}, {0x00000C5B, 0x00000C5F}, {0x00000C64, 0x00000C65}, {0x00000C70, 0x00000C76}, -{0x00000C8D, 0x00000C8D}, {0x00000C91, 0x00000C91}, {0x00000CA9, 0x00000CA9}, {0x00000CB4, 0x00000CB4}, -{0x00000CBA, 0x00000CBB}, {0x00000CC5, 0x00000CC5}, {0x00000CC9, 0x00000CC9}, {0x00000CCE, 0x00000CD4}, -{0x00000CD7, 0x00000CDD}, {0x00000CDF, 0x00000CDF}, {0x00000CE4, 0x00000CE5}, {0x00000CF0, 0x00000CF0}, -{0x00000CF3, 0x00000CFF}, {0x00000D0D, 0x00000D0D}, {0x00000D11, 0x00000D11}, {0x00000D45, 0x00000D45}, -{0x00000D49, 0x00000D49}, {0x00000D50, 0x00000D53}, {0x00000D64, 0x00000D65}, {0x00000D80, 0x00000D80}, -{0x00000D84, 0x00000D84}, {0x00000D97, 0x00000D99}, {0x00000DB2, 0x00000DB2}, {0x00000DBC, 0x00000DBC}, -{0x00000DBE, 0x00000DBF}, {0x00000DC7, 0x00000DC9}, {0x00000DCB, 0x00000DCE}, {0x00000DD5, 0x00000DD5}, -{0x00000DD7, 0x00000DD7}, {0x00000DE0, 0x00000DE5}, {0x00000DF0, 0x00000DF1}, {0x00000DF5, 0x00000E00}, -{0x00000E3B, 0x00000E3E}, {0x00000E5C, 0x00000E80}, {0x00000E83, 0x00000E83}, {0x00000E85, 0x00000E85}, -{0x00000E8B, 0x00000E8B}, {0x00000EA4, 
0x00000EA4}, {0x00000EA6, 0x00000EA6}, {0x00000EBE, 0x00000EBF}, -{0x00000EC5, 0x00000EC5}, {0x00000EC7, 0x00000EC7}, {0x00000ECE, 0x00000ECF}, {0x00000EDA, 0x00000EDB}, -{0x00000EE0, 0x00000EFF}, {0x00000F48, 0x00000F48}, {0x00000F6D, 0x00000F70}, {0x00000F98, 0x00000F98}, -{0x00000FBD, 0x00000FBD}, {0x00000FCD, 0x00000FCD}, {0x00000FDB, 0x00000FFF}, {0x000010C6, 0x000010C6}, -{0x000010C8, 0x000010CC}, {0x000010CE, 0x000010CF}, {0x00001249, 0x00001249}, {0x0000124E, 0x0000124F}, -{0x00001257, 0x00001257}, {0x00001259, 0x00001259}, {0x0000125E, 0x0000125F}, {0x00001289, 0x00001289}, -{0x0000128E, 0x0000128F}, {0x000012B1, 0x000012B1}, {0x000012B6, 0x000012B7}, {0x000012BF, 0x000012BF}, -{0x000012C1, 0x000012C1}, {0x000012C6, 0x000012C7}, {0x000012D7, 0x000012D7}, {0x00001311, 0x00001311}, -{0x00001316, 0x00001317}, {0x0000135B, 0x0000135C}, {0x0000137D, 0x0000137F}, {0x0000139A, 0x0000139F}, -{0x000013F6, 0x000013F7}, {0x000013FE, 0x000013FF}, {0x0000169D, 0x0000169F}, {0x000016F9, 0x000016FF}, -{0x0000170D, 0x0000170D}, {0x00001715, 0x0000171F}, {0x00001737, 0x0000173F}, {0x00001754, 0x0000175F}, -{0x0000176D, 0x0000176D}, {0x00001771, 0x00001771}, {0x00001774, 0x0000177F}, {0x000017DE, 0x000017DF}, -{0x000017EA, 0x000017EF}, {0x000017FA, 0x000017FF}, {0x0000180E, 0x0000180F}, {0x0000181A, 0x0000181F}, -{0x00001879, 0x0000187F}, {0x000018AB, 0x000018AF}, {0x000018F6, 0x000018FF}, {0x0000191F, 0x0000191F}, -{0x0000192C, 0x0000192F}, {0x0000193C, 0x0000193F}, {0x00001941, 0x00001943}, {0x0000196E, 0x0000196F}, -{0x00001975, 0x0000197F}, {0x000019AC, 0x000019AF}, {0x000019CA, 0x000019CF}, {0x000019DB, 0x000019DD}, -{0x00001A1C, 0x00001A1D}, {0x00001A5F, 0x00001A5F}, {0x00001A7D, 0x00001A7E}, {0x00001A8A, 0x00001A8F}, -{0x00001A9A, 0x00001A9F}, {0x00001AAE, 0x00001AAF}, {0x00001AC1, 0x00001AFF}, {0x00001B4C, 0x00001B4F}, -{0x00001B7D, 0x00001B7F}, {0x00001BF4, 0x00001BFB}, {0x00001C38, 0x00001C3A}, {0x00001C4A, 0x00001C4C}, -{0x00001C89, 0x00001C8F}, {0x00001CBB, 
0x00001CBC}, {0x00001CC8, 0x00001CCF}, {0x00001CFB, 0x00001CFF}, -{0x00001DFA, 0x00001DFA}, {0x00001F16, 0x00001F17}, {0x00001F1E, 0x00001F1F}, {0x00001F46, 0x00001F47}, -{0x00001F4E, 0x00001F4F}, {0x00001F58, 0x00001F58}, {0x00001F5A, 0x00001F5A}, {0x00001F5C, 0x00001F5C}, -{0x00001F5E, 0x00001F5E}, {0x00001F7E, 0x00001F7F}, {0x00001FB5, 0x00001FB5}, {0x00001FC5, 0x00001FC5}, -{0x00001FD4, 0x00001FD5}, {0x00001FDC, 0x00001FDC}, {0x00001FF0, 0x00001FF1}, {0x00001FF5, 0x00001FF5}, -{0x00001FFF, 0x00001FFF}, {0x0000200B, 0x0000200F}, {0x0000202A, 0x0000202E}, {0x00002060, 0x0000206F}, -{0x00002072, 0x00002073}, {0x0000208F, 0x0000208F}, {0x0000209D, 0x0000209F}, {0x000020C0, 0x000020CF}, -{0x000020F1, 0x000020FF}, {0x0000218C, 0x0000218F}, {0x00002427, 0x0000243F}, {0x0000244B, 0x0000245F}, -{0x00002B74, 0x00002B75}, {0x00002B96, 0x00002B96}, {0x00002C2F, 0x00002C2F}, {0x00002C5F, 0x00002C5F}, -{0x00002CF4, 0x00002CF8}, {0x00002D26, 0x00002D26}, {0x00002D28, 0x00002D2C}, {0x00002D2E, 0x00002D2F}, -{0x00002D68, 0x00002D6E}, {0x00002D71, 0x00002D7E}, {0x00002D97, 0x00002D9F}, {0x00002DA7, 0x00002DA7}, -{0x00002DAF, 0x00002DAF}, {0x00002DB7, 0x00002DB7}, {0x00002DBF, 0x00002DBF}, {0x00002DC7, 0x00002DC7}, -{0x00002DCF, 0x00002DCF}, {0x00002DD7, 0x00002DD7}, {0x00002DDF, 0x00002DDF}, {0x00002E53, 0x00002E7F}, -{0x00002E9A, 0x00002E9A}, {0x00002EF4, 0x00002EFF}, {0x00002FD6, 0x00002FEF}, {0x00002FFC, 0x00002FFF}, -{0x00003040, 0x00003040}, {0x00003097, 0x00003098}, {0x00003100, 0x00003104}, {0x00003130, 0x00003130}, -{0x0000318F, 0x0000318F}, {0x000031E4, 0x000031EF}, {0x0000321F, 0x0000321F}, {0x00009FFD, 0x00009FFF}, -{0x0000A48D, 0x0000A48F}, {0x0000A4C7, 0x0000A4CF}, {0x0000A62C, 0x0000A63F}, {0x0000A6F8, 0x0000A6FF}, -{0x0000A7C0, 0x0000A7C1}, {0x0000A7CB, 0x0000A7F4}, {0x0000A82D, 0x0000A82F}, {0x0000A83A, 0x0000A83F}, -{0x0000A878, 0x0000A87F}, {0x0000A8C6, 0x0000A8CD}, {0x0000A8DA, 0x0000A8DF}, {0x0000A954, 0x0000A95E}, -{0x0000A97D, 0x0000A97F}, {0x0000A9CE, 
0x0000A9CE}, {0x0000A9DA, 0x0000A9DD}, {0x0000A9FF, 0x0000A9FF}, -{0x0000AA37, 0x0000AA3F}, {0x0000AA4E, 0x0000AA4F}, {0x0000AA5A, 0x0000AA5B}, {0x0000AAC3, 0x0000AADA}, -{0x0000AAF7, 0x0000AB00}, {0x0000AB07, 0x0000AB08}, {0x0000AB0F, 0x0000AB10}, {0x0000AB17, 0x0000AB1F}, -{0x0000AB27, 0x0000AB27}, {0x0000AB2F, 0x0000AB2F}, {0x0000AB6C, 0x0000AB6F}, {0x0000ABEE, 0x0000ABEF}, -{0x0000ABFA, 0x0000ABFF}, {0x0000D7A4, 0x0000D7AF}, {0x0000D7C7, 0x0000D7CA}, {0x0000D7FC, 0x0000F8FF}, -{0x0000FA6E, 0x0000FA6F}, {0x0000FADA, 0x0000FAFF}, {0x0000FB07, 0x0000FB12}, {0x0000FB18, 0x0000FB1C}, -{0x0000FB37, 0x0000FB37}, {0x0000FB3D, 0x0000FB3D}, {0x0000FB3F, 0x0000FB3F}, {0x0000FB42, 0x0000FB42}, -{0x0000FB45, 0x0000FB45}, {0x0000FBC2, 0x0000FBD2}, {0x0000FD40, 0x0000FD4F}, {0x0000FD90, 0x0000FD91}, -{0x0000FDC8, 0x0000FDEF}, {0x0000FDFE, 0x0000FDFF}, {0x0000FE1A, 0x0000FE1F}, {0x0000FE53, 0x0000FE53}, -{0x0000FE67, 0x0000FE67}, {0x0000FE6C, 0x0000FE6F}, {0x0000FE75, 0x0000FE75}, {0x0000FEFD, 0x0000FF00}, -{0x0000FFBF, 0x0000FFC1}, {0x0000FFC8, 0x0000FFC9}, {0x0000FFD0, 0x0000FFD1}, {0x0000FFD8, 0x0000FFD9}, -{0x0000FFDD, 0x0000FFDF}, {0x0000FFE7, 0x0000FFE7}, {0x0000FFEF, 0x0000FFFB}, {0x0000FFFE, 0x0000FFFF}, -{0x0001000C, 0x0001000C}, {0x00010027, 0x00010027}, {0x0001003B, 0x0001003B}, {0x0001003E, 0x0001003E}, -{0x0001004E, 0x0001004F}, {0x0001005E, 0x0001007F}, {0x000100FB, 0x000100FF}, {0x00010103, 0x00010106}, -{0x00010134, 0x00010136}, {0x0001018F, 0x0001018F}, {0x0001019D, 0x0001019F}, {0x000101A1, 0x000101CF}, -{0x000101FE, 0x0001027F}, {0x0001029D, 0x0001029F}, {0x000102D1, 0x000102DF}, {0x000102FC, 0x000102FF}, -{0x00010324, 0x0001032C}, {0x0001034B, 0x0001034F}, {0x0001037B, 0x0001037F}, {0x0001039E, 0x0001039E}, -{0x000103C4, 0x000103C7}, {0x000103D6, 0x000103FF}, {0x0001049E, 0x0001049F}, {0x000104AA, 0x000104AF}, -{0x000104D4, 0x000104D7}, {0x000104FC, 0x000104FF}, {0x00010528, 0x0001052F}, {0x00010564, 0x0001056E}, -{0x00010570, 0x000105FF}, {0x00010737, 
0x0001073F}, {0x00010756, 0x0001075F}, {0x00010768, 0x000107FF}, -{0x00010806, 0x00010807}, {0x00010809, 0x00010809}, {0x00010836, 0x00010836}, {0x00010839, 0x0001083B}, -{0x0001083D, 0x0001083E}, {0x00010856, 0x00010856}, {0x0001089F, 0x000108A6}, {0x000108B0, 0x000108DF}, -{0x000108F3, 0x000108F3}, {0x000108F6, 0x000108FA}, {0x0001091C, 0x0001091E}, {0x0001093A, 0x0001093E}, -{0x00010940, 0x0001097F}, {0x000109B8, 0x000109BB}, {0x000109D0, 0x000109D1}, {0x00010A04, 0x00010A04}, -{0x00010A07, 0x00010A0B}, {0x00010A14, 0x00010A14}, {0x00010A18, 0x00010A18}, {0x00010A36, 0x00010A37}, -{0x00010A3B, 0x00010A3E}, {0x00010A49, 0x00010A4F}, {0x00010A59, 0x00010A5F}, {0x00010AA0, 0x00010ABF}, -{0x00010AE7, 0x00010AEA}, {0x00010AF7, 0x00010AFF}, {0x00010B36, 0x00010B38}, {0x00010B56, 0x00010B57}, -{0x00010B73, 0x00010B77}, {0x00010B92, 0x00010B98}, {0x00010B9D, 0x00010BA8}, {0x00010BB0, 0x00010BFF}, -{0x00010C49, 0x00010C7F}, {0x00010CB3, 0x00010CBF}, {0x00010CF3, 0x00010CF9}, {0x00010D28, 0x00010D2F}, -{0x00010D3A, 0x00010E5F}, {0x00010E7F, 0x00010E7F}, {0x00010EAA, 0x00010EAA}, {0x00010EAE, 0x00010EAF}, -{0x00010EB2, 0x00010EFF}, {0x00010F28, 0x00010F2F}, {0x00010F5A, 0x00010FAF}, {0x00010FCC, 0x00010FDF}, -{0x00010FF7, 0x00010FFF}, {0x0001104E, 0x00011051}, {0x00011070, 0x0001107E}, {0x000110BD, 0x000110BD}, -{0x000110C2, 0x000110CF}, {0x000110E9, 0x000110EF}, {0x000110FA, 0x000110FF}, {0x00011135, 0x00011135}, -{0x00011148, 0x0001114F}, {0x00011177, 0x0001117F}, {0x000111E0, 0x000111E0}, {0x000111F5, 0x000111FF}, -{0x00011212, 0x00011212}, {0x0001123F, 0x0001127F}, {0x00011287, 0x00011287}, {0x00011289, 0x00011289}, -{0x0001128E, 0x0001128E}, {0x0001129E, 0x0001129E}, {0x000112AA, 0x000112AF}, {0x000112EB, 0x000112EF}, -{0x000112FA, 0x000112FF}, {0x00011304, 0x00011304}, {0x0001130D, 0x0001130E}, {0x00011311, 0x00011312}, -{0x00011329, 0x00011329}, {0x00011331, 0x00011331}, {0x00011334, 0x00011334}, {0x0001133A, 0x0001133A}, -{0x00011345, 0x00011346}, {0x00011349, 
0x0001134A}, {0x0001134E, 0x0001134F}, {0x00011351, 0x00011356}, -{0x00011358, 0x0001135C}, {0x00011364, 0x00011365}, {0x0001136D, 0x0001136F}, {0x00011375, 0x000113FF}, -{0x0001145C, 0x0001145C}, {0x00011462, 0x0001147F}, {0x000114C8, 0x000114CF}, {0x000114DA, 0x0001157F}, -{0x000115B6, 0x000115B7}, {0x000115DE, 0x000115FF}, {0x00011645, 0x0001164F}, {0x0001165A, 0x0001165F}, -{0x0001166D, 0x0001167F}, {0x000116B9, 0x000116BF}, {0x000116CA, 0x000116FF}, {0x0001171B, 0x0001171C}, -{0x0001172C, 0x0001172F}, {0x00011740, 0x000117FF}, {0x0001183C, 0x0001189F}, {0x000118F3, 0x000118FE}, -{0x00011907, 0x00011908}, {0x0001190A, 0x0001190B}, {0x00011914, 0x00011914}, {0x00011917, 0x00011917}, -{0x00011936, 0x00011936}, {0x00011939, 0x0001193A}, {0x00011947, 0x0001194F}, {0x0001195A, 0x0001199F}, -{0x000119A8, 0x000119A9}, {0x000119D8, 0x000119D9}, {0x000119E5, 0x000119FF}, {0x00011A48, 0x00011A4F}, -{0x00011AA3, 0x00011ABF}, {0x00011AF9, 0x00011BFF}, {0x00011C09, 0x00011C09}, {0x00011C37, 0x00011C37}, -{0x00011C46, 0x00011C4F}, {0x00011C6D, 0x00011C6F}, {0x00011C90, 0x00011C91}, {0x00011CA8, 0x00011CA8}, -{0x00011CB7, 0x00011CFF}, {0x00011D07, 0x00011D07}, {0x00011D0A, 0x00011D0A}, {0x00011D37, 0x00011D39}, -{0x00011D3B, 0x00011D3B}, {0x00011D3E, 0x00011D3E}, {0x00011D48, 0x00011D4F}, {0x00011D5A, 0x00011D5F}, -{0x00011D66, 0x00011D66}, {0x00011D69, 0x00011D69}, {0x00011D8F, 0x00011D8F}, {0x00011D92, 0x00011D92}, -{0x00011D99, 0x00011D9F}, {0x00011DAA, 0x00011EDF}, {0x00011EF9, 0x00011FAF}, {0x00011FB1, 0x00011FBF}, -{0x00011FF2, 0x00011FFE}, {0x0001239A, 0x000123FF}, {0x0001246F, 0x0001246F}, {0x00012475, 0x0001247F}, -{0x00012544, 0x00012FFF}, {0x0001342F, 0x000143FF}, {0x00014647, 0x000167FF}, {0x00016A39, 0x00016A3F}, -{0x00016A5F, 0x00016A5F}, {0x00016A6A, 0x00016A6D}, {0x00016A70, 0x00016ACF}, {0x00016AEE, 0x00016AEF}, -{0x00016AF6, 0x00016AFF}, {0x00016B46, 0x00016B4F}, {0x00016B5A, 0x00016B5A}, {0x00016B62, 0x00016B62}, -{0x00016B78, 0x00016B7C}, {0x00016B90, 
0x00016E3F}, {0x00016E9B, 0x00016EFF}, {0x00016F4B, 0x00016F4E}, -{0x00016F88, 0x00016F8E}, {0x00016FA0, 0x00016FDF}, {0x00016FE5, 0x00016FEF}, {0x00016FF2, 0x00016FFF}, -{0x000187F8, 0x000187FF}, {0x00018CD6, 0x00018CFF}, {0x00018D09, 0x0001AFFF}, {0x0001B11F, 0x0001B14F}, -{0x0001B153, 0x0001B163}, {0x0001B168, 0x0001B16F}, {0x0001B2FC, 0x0001BBFF}, {0x0001BC6B, 0x0001BC6F}, -{0x0001BC7D, 0x0001BC7F}, {0x0001BC89, 0x0001BC8F}, {0x0001BC9A, 0x0001BC9B}, {0x0001BCA0, 0x0001CFFF}, -{0x0001D0F6, 0x0001D0FF}, {0x0001D127, 0x0001D128}, {0x0001D173, 0x0001D17A}, {0x0001D1E9, 0x0001D1FF}, -{0x0001D246, 0x0001D2DF}, {0x0001D2F4, 0x0001D2FF}, {0x0001D357, 0x0001D35F}, {0x0001D379, 0x0001D3FF}, -{0x0001D455, 0x0001D455}, {0x0001D49D, 0x0001D49D}, {0x0001D4A0, 0x0001D4A1}, {0x0001D4A3, 0x0001D4A4}, -{0x0001D4A7, 0x0001D4A8}, {0x0001D4AD, 0x0001D4AD}, {0x0001D4BA, 0x0001D4BA}, {0x0001D4BC, 0x0001D4BC}, -{0x0001D4C4, 0x0001D4C4}, {0x0001D506, 0x0001D506}, {0x0001D50B, 0x0001D50C}, {0x0001D515, 0x0001D515}, -{0x0001D51D, 0x0001D51D}, {0x0001D53A, 0x0001D53A}, {0x0001D53F, 0x0001D53F}, {0x0001D545, 0x0001D545}, -{0x0001D547, 0x0001D549}, {0x0001D551, 0x0001D551}, {0x0001D6A6, 0x0001D6A7}, {0x0001D7CC, 0x0001D7CD}, -{0x0001DA8C, 0x0001DA9A}, {0x0001DAA0, 0x0001DAA0}, {0x0001DAB0, 0x0001DFFF}, {0x0001E007, 0x0001E007}, -{0x0001E019, 0x0001E01A}, {0x0001E022, 0x0001E022}, {0x0001E025, 0x0001E025}, {0x0001E02B, 0x0001E0FF}, -{0x0001E12D, 0x0001E12F}, {0x0001E13E, 0x0001E13F}, {0x0001E14A, 0x0001E14D}, {0x0001E150, 0x0001E2BF}, -{0x0001E2FA, 0x0001E2FE}, {0x0001E300, 0x0001E7FF}, {0x0001E8C5, 0x0001E8C6}, {0x0001E8D7, 0x0001E8FF}, -{0x0001E94C, 0x0001E94F}, {0x0001E95A, 0x0001E95D}, {0x0001E960, 0x0001EC70}, {0x0001ECB5, 0x0001ED00}, -{0x0001ED3E, 0x0001EDFF}, {0x0001EE04, 0x0001EE04}, {0x0001EE20, 0x0001EE20}, {0x0001EE23, 0x0001EE23}, -{0x0001EE25, 0x0001EE26}, {0x0001EE28, 0x0001EE28}, {0x0001EE33, 0x0001EE33}, {0x0001EE38, 0x0001EE38}, -{0x0001EE3A, 0x0001EE3A}, {0x0001EE3C, 
0x0001EE41}, {0x0001EE43, 0x0001EE46}, {0x0001EE48, 0x0001EE48}, -{0x0001EE4A, 0x0001EE4A}, {0x0001EE4C, 0x0001EE4C}, {0x0001EE50, 0x0001EE50}, {0x0001EE53, 0x0001EE53}, -{0x0001EE55, 0x0001EE56}, {0x0001EE58, 0x0001EE58}, {0x0001EE5A, 0x0001EE5A}, {0x0001EE5C, 0x0001EE5C}, -{0x0001EE5E, 0x0001EE5E}, {0x0001EE60, 0x0001EE60}, {0x0001EE63, 0x0001EE63}, {0x0001EE65, 0x0001EE66}, -{0x0001EE6B, 0x0001EE6B}, {0x0001EE73, 0x0001EE73}, {0x0001EE78, 0x0001EE78}, {0x0001EE7D, 0x0001EE7D}, -{0x0001EE7F, 0x0001EE7F}, {0x0001EE8A, 0x0001EE8A}, {0x0001EE9C, 0x0001EEA0}, {0x0001EEA4, 0x0001EEA4}, -{0x0001EEAA, 0x0001EEAA}, {0x0001EEBC, 0x0001EEEF}, {0x0001EEF2, 0x0001EFFF}, {0x0001F02C, 0x0001F02F}, -{0x0001F094, 0x0001F09F}, {0x0001F0AF, 0x0001F0B0}, {0x0001F0C0, 0x0001F0C0}, {0x0001F0D0, 0x0001F0D0}, -{0x0001F0F6, 0x0001F0FF}, {0x0001F1AE, 0x0001F1E5}, {0x0001F203, 0x0001F20F}, {0x0001F23C, 0x0001F23F}, -{0x0001F249, 0x0001F24F}, {0x0001F252, 0x0001F25F}, {0x0001F266, 0x0001F2FF}, {0x0001F6D8, 0x0001F6DF}, -{0x0001F6ED, 0x0001F6EF}, {0x0001F6FD, 0x0001F6FF}, {0x0001F774, 0x0001F77F}, {0x0001F7D9, 0x0001F7DF}, -{0x0001F7EC, 0x0001F7FF}, {0x0001F80C, 0x0001F80F}, {0x0001F848, 0x0001F84F}, {0x0001F85A, 0x0001F85F}, -{0x0001F888, 0x0001F88F}, {0x0001F8AE, 0x0001F8AF}, {0x0001F8B2, 0x0001F8FF}, {0x0001F979, 0x0001F979}, -{0x0001F9CC, 0x0001F9CC}, {0x0001FA54, 0x0001FA5F}, {0x0001FA6E, 0x0001FA6F}, {0x0001FA75, 0x0001FA77}, -{0x0001FA7B, 0x0001FA7F}, {0x0001FA87, 0x0001FA8F}, {0x0001FAA9, 0x0001FAAF}, {0x0001FAB7, 0x0001FABF}, -{0x0001FAC3, 0x0001FACF}, {0x0001FAD7, 0x0001FAFF}, {0x0001FB93, 0x0001FB93}, {0x0001FBCB, 0x0001FBEF}, -{0x0001FBFA, 0x0001FFFF}, {0x0002A6DE, 0x0002A6FF}, {0x0002B735, 0x0002B73F}, {0x0002B81E, 0x0002B81F}, -{0x0002CEA2, 0x0002CEAF}, {0x0002EBE1, 0x0002F7FF}, {0x0002FA1E, 0x0002FFFF}, {0x0003134B, 0x000E00FF}, -{0x000E01F0, 0x0010FFFF}, +const std::unordered_map unicode_map_uppercase = { +{0x000061, 0x000041}, +{0x000062, 0x000042}, +{0x000063, 0x000043}, 
+{0x000064, 0x000044}, +{0x000065, 0x000045}, +{0x000066, 0x000046}, +{0x000067, 0x000047}, +{0x000068, 0x000048}, +{0x000069, 0x000049}, +{0x00006A, 0x00004A}, +{0x00006B, 0x00004B}, +{0x00006C, 0x00004C}, +{0x00006D, 0x00004D}, +{0x00006E, 0x00004E}, +{0x00006F, 0x00004F}, +{0x000070, 0x000050}, +{0x000071, 0x000051}, +{0x000072, 0x000052}, +{0x000073, 0x000053}, +{0x000074, 0x000054}, +{0x000075, 0x000055}, +{0x000076, 0x000056}, +{0x000077, 0x000057}, +{0x000078, 0x000058}, +{0x000079, 0x000059}, +{0x00007A, 0x00005A}, +{0x0000B5, 0x00039C}, +{0x0000DF, 0x000053}, +{0x0000E0, 0x0000C0}, +{0x0000E1, 0x0000C1}, +{0x0000E2, 0x0000C2}, +{0x0000E3, 0x0000C3}, +{0x0000E4, 0x0000C4}, +{0x0000E5, 0x0000C5}, +{0x0000E6, 0x0000C6}, +{0x0000E7, 0x0000C7}, +{0x0000E8, 0x0000C8}, +{0x0000E9, 0x0000C9}, +{0x0000EA, 0x0000CA}, +{0x0000EB, 0x0000CB}, +{0x0000EC, 0x0000CC}, +{0x0000ED, 0x0000CD}, +{0x0000EE, 0x0000CE}, +{0x0000EF, 0x0000CF}, +{0x0000F0, 0x0000D0}, +{0x0000F1, 0x0000D1}, +{0x0000F2, 0x0000D2}, +{0x0000F3, 0x0000D3}, +{0x0000F4, 0x0000D4}, +{0x0000F5, 0x0000D5}, +{0x0000F6, 0x0000D6}, +{0x0000F8, 0x0000D8}, +{0x0000F9, 0x0000D9}, +{0x0000FA, 0x0000DA}, +{0x0000FB, 0x0000DB}, +{0x0000FC, 0x0000DC}, +{0x0000FD, 0x0000DD}, +{0x0000FE, 0x0000DE}, +{0x0000FF, 0x000178}, +{0x000101, 0x000100}, +{0x000103, 0x000102}, +{0x000105, 0x000104}, +{0x000107, 0x000106}, +{0x000109, 0x000108}, +{0x00010B, 0x00010A}, +{0x00010D, 0x00010C}, +{0x00010F, 0x00010E}, +{0x000111, 0x000110}, +{0x000113, 0x000112}, +{0x000115, 0x000114}, +{0x000117, 0x000116}, +{0x000119, 0x000118}, +{0x00011B, 0x00011A}, +{0x00011D, 0x00011C}, +{0x00011F, 0x00011E}, +{0x000121, 0x000120}, +{0x000123, 0x000122}, +{0x000125, 0x000124}, +{0x000127, 0x000126}, +{0x000129, 0x000128}, +{0x00012B, 0x00012A}, +{0x00012D, 0x00012C}, +{0x00012F, 0x00012E}, +{0x000131, 0x000049}, +{0x000133, 0x000132}, +{0x000135, 0x000134}, +{0x000137, 0x000136}, +{0x00013A, 0x000139}, +{0x00013C, 0x00013B}, +{0x00013E, 
0x00013D}, +{0x000140, 0x00013F}, +{0x000142, 0x000141}, +{0x000144, 0x000143}, +{0x000146, 0x000145}, +{0x000148, 0x000147}, +{0x000149, 0x0002BC}, +{0x00014B, 0x00014A}, +{0x00014D, 0x00014C}, +{0x00014F, 0x00014E}, +{0x000151, 0x000150}, +{0x000153, 0x000152}, +{0x000155, 0x000154}, +{0x000157, 0x000156}, +{0x000159, 0x000158}, +{0x00015B, 0x00015A}, +{0x00015D, 0x00015C}, +{0x00015F, 0x00015E}, +{0x000161, 0x000160}, +{0x000163, 0x000162}, +{0x000165, 0x000164}, +{0x000167, 0x000166}, +{0x000169, 0x000168}, +{0x00016B, 0x00016A}, +{0x00016D, 0x00016C}, +{0x00016F, 0x00016E}, +{0x000171, 0x000170}, +{0x000173, 0x000172}, +{0x000175, 0x000174}, +{0x000177, 0x000176}, +{0x00017A, 0x000179}, +{0x00017C, 0x00017B}, +{0x00017E, 0x00017D}, +{0x00017F, 0x000053}, +{0x000180, 0x000243}, +{0x000183, 0x000182}, +{0x000185, 0x000184}, +{0x000188, 0x000187}, +{0x00018C, 0x00018B}, +{0x000192, 0x000191}, +{0x000195, 0x0001F6}, +{0x000199, 0x000198}, +{0x00019A, 0x00023D}, +{0x00019E, 0x000220}, +{0x0001A1, 0x0001A0}, +{0x0001A3, 0x0001A2}, +{0x0001A5, 0x0001A4}, +{0x0001A8, 0x0001A7}, +{0x0001AD, 0x0001AC}, +{0x0001B0, 0x0001AF}, +{0x0001B4, 0x0001B3}, +{0x0001B6, 0x0001B5}, +{0x0001B9, 0x0001B8}, +{0x0001BD, 0x0001BC}, +{0x0001BF, 0x0001F7}, +{0x0001C5, 0x0001C4}, +{0x0001C6, 0x0001C4}, +{0x0001C8, 0x0001C7}, +{0x0001C9, 0x0001C7}, +{0x0001CB, 0x0001CA}, +{0x0001CC, 0x0001CA}, +{0x0001CE, 0x0001CD}, +{0x0001D0, 0x0001CF}, +{0x0001D2, 0x0001D1}, +{0x0001D4, 0x0001D3}, +{0x0001D6, 0x0001D5}, +{0x0001D8, 0x0001D7}, +{0x0001DA, 0x0001D9}, +{0x0001DC, 0x0001DB}, +{0x0001DD, 0x00018E}, +{0x0001DF, 0x0001DE}, +{0x0001E1, 0x0001E0}, +{0x0001E3, 0x0001E2}, +{0x0001E5, 0x0001E4}, +{0x0001E7, 0x0001E6}, +{0x0001E9, 0x0001E8}, +{0x0001EB, 0x0001EA}, +{0x0001ED, 0x0001EC}, +{0x0001EF, 0x0001EE}, +{0x0001F0, 0x00004A}, +{0x0001F2, 0x0001F1}, +{0x0001F3, 0x0001F1}, +{0x0001F5, 0x0001F4}, +{0x0001F9, 0x0001F8}, +{0x0001FB, 0x0001FA}, +{0x0001FD, 0x0001FC}, +{0x0001FF, 0x0001FE}, 
+{0x000201, 0x000200}, +{0x000203, 0x000202}, +{0x000205, 0x000204}, +{0x000207, 0x000206}, +{0x000209, 0x000208}, +{0x00020B, 0x00020A}, +{0x00020D, 0x00020C}, +{0x00020F, 0x00020E}, +{0x000211, 0x000210}, +{0x000213, 0x000212}, +{0x000215, 0x000214}, +{0x000217, 0x000216}, +{0x000219, 0x000218}, +{0x00021B, 0x00021A}, +{0x00021D, 0x00021C}, +{0x00021F, 0x00021E}, +{0x000223, 0x000222}, +{0x000225, 0x000224}, +{0x000227, 0x000226}, +{0x000229, 0x000228}, +{0x00022B, 0x00022A}, +{0x00022D, 0x00022C}, +{0x00022F, 0x00022E}, +{0x000231, 0x000230}, +{0x000233, 0x000232}, +{0x00023C, 0x00023B}, +{0x00023F, 0x002C7E}, +{0x000240, 0x002C7F}, +{0x000242, 0x000241}, +{0x000247, 0x000246}, +{0x000249, 0x000248}, +{0x00024B, 0x00024A}, +{0x00024D, 0x00024C}, +{0x00024F, 0x00024E}, +{0x000250, 0x002C6F}, +{0x000251, 0x002C6D}, +{0x000252, 0x002C70}, +{0x000253, 0x000181}, +{0x000254, 0x000186}, +{0x000256, 0x000189}, +{0x000257, 0x00018A}, +{0x000259, 0x00018F}, +{0x00025B, 0x000190}, +{0x00025C, 0x00A7AB}, +{0x000260, 0x000193}, +{0x000261, 0x00A7AC}, +{0x000263, 0x000194}, +{0x000265, 0x00A78D}, +{0x000266, 0x00A7AA}, +{0x000268, 0x000197}, +{0x000269, 0x000196}, +{0x00026A, 0x00A7AE}, +{0x00026B, 0x002C62}, +{0x00026C, 0x00A7AD}, +{0x00026F, 0x00019C}, +{0x000271, 0x002C6E}, +{0x000272, 0x00019D}, +{0x000275, 0x00019F}, +{0x00027D, 0x002C64}, +{0x000280, 0x0001A6}, +{0x000282, 0x00A7C5}, +{0x000283, 0x0001A9}, +{0x000287, 0x00A7B1}, +{0x000288, 0x0001AE}, +{0x000289, 0x000244}, +{0x00028A, 0x0001B1}, +{0x00028B, 0x0001B2}, +{0x00028C, 0x000245}, +{0x000292, 0x0001B7}, +{0x00029D, 0x00A7B2}, +{0x00029E, 0x00A7B0}, +{0x000345, 0x000399}, +{0x000371, 0x000370}, +{0x000373, 0x000372}, +{0x000377, 0x000376}, +{0x00037B, 0x0003FD}, +{0x00037C, 0x0003FE}, +{0x00037D, 0x0003FF}, +{0x000390, 0x000399}, +{0x0003AC, 0x000386}, +{0x0003AD, 0x000388}, +{0x0003AE, 0x000389}, +{0x0003AF, 0x00038A}, +{0x0003B0, 0x0003A5}, +{0x0003B1, 0x000391}, +{0x0003B2, 0x000392}, +{0x0003B3, 
0x000393}, +{0x0003B4, 0x000394}, +{0x0003B5, 0x000395}, +{0x0003B6, 0x000396}, +{0x0003B7, 0x000397}, +{0x0003B8, 0x000398}, +{0x0003B9, 0x000399}, +{0x0003BA, 0x00039A}, +{0x0003BB, 0x00039B}, +{0x0003BC, 0x00039C}, +{0x0003BD, 0x00039D}, +{0x0003BE, 0x00039E}, +{0x0003BF, 0x00039F}, +{0x0003C0, 0x0003A0}, +{0x0003C1, 0x0003A1}, +{0x0003C2, 0x0003A3}, +{0x0003C3, 0x0003A3}, +{0x0003C4, 0x0003A4}, +{0x0003C5, 0x0003A5}, +{0x0003C6, 0x0003A6}, +{0x0003C7, 0x0003A7}, +{0x0003C8, 0x0003A8}, +{0x0003C9, 0x0003A9}, +{0x0003CA, 0x0003AA}, +{0x0003CB, 0x0003AB}, +{0x0003CC, 0x00038C}, +{0x0003CD, 0x00038E}, +{0x0003CE, 0x00038F}, +{0x0003D0, 0x000392}, +{0x0003D1, 0x000398}, +{0x0003D5, 0x0003A6}, +{0x0003D6, 0x0003A0}, +{0x0003D7, 0x0003CF}, +{0x0003D9, 0x0003D8}, +{0x0003DB, 0x0003DA}, +{0x0003DD, 0x0003DC}, +{0x0003DF, 0x0003DE}, +{0x0003E1, 0x0003E0}, +{0x0003E3, 0x0003E2}, +{0x0003E5, 0x0003E4}, +{0x0003E7, 0x0003E6}, +{0x0003E9, 0x0003E8}, +{0x0003EB, 0x0003EA}, +{0x0003ED, 0x0003EC}, +{0x0003EF, 0x0003EE}, +{0x0003F0, 0x00039A}, +{0x0003F1, 0x0003A1}, +{0x0003F2, 0x0003F9}, +{0x0003F3, 0x00037F}, +{0x0003F5, 0x000395}, +{0x0003F8, 0x0003F7}, +{0x0003FB, 0x0003FA}, +{0x000430, 0x000410}, +{0x000431, 0x000411}, +{0x000432, 0x000412}, +{0x000433, 0x000413}, +{0x000434, 0x000414}, +{0x000435, 0x000415}, +{0x000436, 0x000416}, +{0x000437, 0x000417}, +{0x000438, 0x000418}, +{0x000439, 0x000419}, +{0x00043A, 0x00041A}, +{0x00043B, 0x00041B}, +{0x00043C, 0x00041C}, +{0x00043D, 0x00041D}, +{0x00043E, 0x00041E}, +{0x00043F, 0x00041F}, +{0x000440, 0x000420}, +{0x000441, 0x000421}, +{0x000442, 0x000422}, +{0x000443, 0x000423}, +{0x000444, 0x000424}, +{0x000445, 0x000425}, +{0x000446, 0x000426}, +{0x000447, 0x000427}, +{0x000448, 0x000428}, +{0x000449, 0x000429}, +{0x00044A, 0x00042A}, +{0x00044B, 0x00042B}, +{0x00044C, 0x00042C}, +{0x00044D, 0x00042D}, +{0x00044E, 0x00042E}, +{0x00044F, 0x00042F}, +{0x000450, 0x000400}, +{0x000451, 0x000401}, +{0x000452, 0x000402}, 
+{0x000453, 0x000403}, +{0x000454, 0x000404}, +{0x000455, 0x000405}, +{0x000456, 0x000406}, +{0x000457, 0x000407}, +{0x000458, 0x000408}, +{0x000459, 0x000409}, +{0x00045A, 0x00040A}, +{0x00045B, 0x00040B}, +{0x00045C, 0x00040C}, +{0x00045D, 0x00040D}, +{0x00045E, 0x00040E}, +{0x00045F, 0x00040F}, +{0x000461, 0x000460}, +{0x000463, 0x000462}, +{0x000465, 0x000464}, +{0x000467, 0x000466}, +{0x000469, 0x000468}, +{0x00046B, 0x00046A}, +{0x00046D, 0x00046C}, +{0x00046F, 0x00046E}, +{0x000471, 0x000470}, +{0x000473, 0x000472}, +{0x000475, 0x000474}, +{0x000477, 0x000476}, +{0x000479, 0x000478}, +{0x00047B, 0x00047A}, +{0x00047D, 0x00047C}, +{0x00047F, 0x00047E}, +{0x000481, 0x000480}, +{0x00048B, 0x00048A}, +{0x00048D, 0x00048C}, +{0x00048F, 0x00048E}, +{0x000491, 0x000490}, +{0x000493, 0x000492}, +{0x000495, 0x000494}, +{0x000497, 0x000496}, +{0x000499, 0x000498}, +{0x00049B, 0x00049A}, +{0x00049D, 0x00049C}, +{0x00049F, 0x00049E}, +{0x0004A1, 0x0004A0}, +{0x0004A3, 0x0004A2}, +{0x0004A5, 0x0004A4}, +{0x0004A7, 0x0004A6}, +{0x0004A9, 0x0004A8}, +{0x0004AB, 0x0004AA}, +{0x0004AD, 0x0004AC}, +{0x0004AF, 0x0004AE}, +{0x0004B1, 0x0004B0}, +{0x0004B3, 0x0004B2}, +{0x0004B5, 0x0004B4}, +{0x0004B7, 0x0004B6}, +{0x0004B9, 0x0004B8}, +{0x0004BB, 0x0004BA}, +{0x0004BD, 0x0004BC}, +{0x0004BF, 0x0004BE}, +{0x0004C2, 0x0004C1}, +{0x0004C4, 0x0004C3}, +{0x0004C6, 0x0004C5}, +{0x0004C8, 0x0004C7}, +{0x0004CA, 0x0004C9}, +{0x0004CC, 0x0004CB}, +{0x0004CE, 0x0004CD}, +{0x0004CF, 0x0004C0}, +{0x0004D1, 0x0004D0}, +{0x0004D3, 0x0004D2}, +{0x0004D5, 0x0004D4}, +{0x0004D7, 0x0004D6}, +{0x0004D9, 0x0004D8}, +{0x0004DB, 0x0004DA}, +{0x0004DD, 0x0004DC}, +{0x0004DF, 0x0004DE}, +{0x0004E1, 0x0004E0}, +{0x0004E3, 0x0004E2}, +{0x0004E5, 0x0004E4}, +{0x0004E7, 0x0004E6}, +{0x0004E9, 0x0004E8}, +{0x0004EB, 0x0004EA}, +{0x0004ED, 0x0004EC}, +{0x0004EF, 0x0004EE}, +{0x0004F1, 0x0004F0}, +{0x0004F3, 0x0004F2}, +{0x0004F5, 0x0004F4}, +{0x0004F7, 0x0004F6}, +{0x0004F9, 0x0004F8}, +{0x0004FB, 
0x0004FA}, +{0x0004FD, 0x0004FC}, +{0x0004FF, 0x0004FE}, +{0x000501, 0x000500}, +{0x000503, 0x000502}, +{0x000505, 0x000504}, +{0x000507, 0x000506}, +{0x000509, 0x000508}, +{0x00050B, 0x00050A}, +{0x00050D, 0x00050C}, +{0x00050F, 0x00050E}, +{0x000511, 0x000510}, +{0x000513, 0x000512}, +{0x000515, 0x000514}, +{0x000517, 0x000516}, +{0x000519, 0x000518}, +{0x00051B, 0x00051A}, +{0x00051D, 0x00051C}, +{0x00051F, 0x00051E}, +{0x000521, 0x000520}, +{0x000523, 0x000522}, +{0x000525, 0x000524}, +{0x000527, 0x000526}, +{0x000529, 0x000528}, +{0x00052B, 0x00052A}, +{0x00052D, 0x00052C}, +{0x00052F, 0x00052E}, +{0x000561, 0x000531}, +{0x000562, 0x000532}, +{0x000563, 0x000533}, +{0x000564, 0x000534}, +{0x000565, 0x000535}, +{0x000566, 0x000536}, +{0x000567, 0x000537}, +{0x000568, 0x000538}, +{0x000569, 0x000539}, +{0x00056A, 0x00053A}, +{0x00056B, 0x00053B}, +{0x00056C, 0x00053C}, +{0x00056D, 0x00053D}, +{0x00056E, 0x00053E}, +{0x00056F, 0x00053F}, +{0x000570, 0x000540}, +{0x000571, 0x000541}, +{0x000572, 0x000542}, +{0x000573, 0x000543}, +{0x000574, 0x000544}, +{0x000575, 0x000545}, +{0x000576, 0x000546}, +{0x000577, 0x000547}, +{0x000578, 0x000548}, +{0x000579, 0x000549}, +{0x00057A, 0x00054A}, +{0x00057B, 0x00054B}, +{0x00057C, 0x00054C}, +{0x00057D, 0x00054D}, +{0x00057E, 0x00054E}, +{0x00057F, 0x00054F}, +{0x000580, 0x000550}, +{0x000581, 0x000551}, +{0x000582, 0x000552}, +{0x000583, 0x000553}, +{0x000584, 0x000554}, +{0x000585, 0x000555}, +{0x000586, 0x000556}, +{0x000587, 0x000535}, +{0x0010D0, 0x001C90}, +{0x0010D1, 0x001C91}, +{0x0010D2, 0x001C92}, +{0x0010D3, 0x001C93}, +{0x0010D4, 0x001C94}, +{0x0010D5, 0x001C95}, +{0x0010D6, 0x001C96}, +{0x0010D7, 0x001C97}, +{0x0010D8, 0x001C98}, +{0x0010D9, 0x001C99}, +{0x0010DA, 0x001C9A}, +{0x0010DB, 0x001C9B}, +{0x0010DC, 0x001C9C}, +{0x0010DD, 0x001C9D}, +{0x0010DE, 0x001C9E}, +{0x0010DF, 0x001C9F}, +{0x0010E0, 0x001CA0}, +{0x0010E1, 0x001CA1}, +{0x0010E2, 0x001CA2}, +{0x0010E3, 0x001CA3}, +{0x0010E4, 0x001CA4}, 
+{0x0010E5, 0x001CA5}, +{0x0010E6, 0x001CA6}, +{0x0010E7, 0x001CA7}, +{0x0010E8, 0x001CA8}, +{0x0010E9, 0x001CA9}, +{0x0010EA, 0x001CAA}, +{0x0010EB, 0x001CAB}, +{0x0010EC, 0x001CAC}, +{0x0010ED, 0x001CAD}, +{0x0010EE, 0x001CAE}, +{0x0010EF, 0x001CAF}, +{0x0010F0, 0x001CB0}, +{0x0010F1, 0x001CB1}, +{0x0010F2, 0x001CB2}, +{0x0010F3, 0x001CB3}, +{0x0010F4, 0x001CB4}, +{0x0010F5, 0x001CB5}, +{0x0010F6, 0x001CB6}, +{0x0010F7, 0x001CB7}, +{0x0010F8, 0x001CB8}, +{0x0010F9, 0x001CB9}, +{0x0010FA, 0x001CBA}, +{0x0010FD, 0x001CBD}, +{0x0010FE, 0x001CBE}, +{0x0010FF, 0x001CBF}, +{0x0013F8, 0x0013F0}, +{0x0013F9, 0x0013F1}, +{0x0013FA, 0x0013F2}, +{0x0013FB, 0x0013F3}, +{0x0013FC, 0x0013F4}, +{0x0013FD, 0x0013F5}, +{0x001C80, 0x000412}, +{0x001C81, 0x000414}, +{0x001C82, 0x00041E}, +{0x001C83, 0x000421}, +{0x001C84, 0x000422}, +{0x001C85, 0x000422}, +{0x001C86, 0x00042A}, +{0x001C87, 0x000462}, +{0x001C88, 0x00A64A}, +{0x001D79, 0x00A77D}, +{0x001D7D, 0x002C63}, +{0x001D8E, 0x00A7C6}, +{0x001E01, 0x001E00}, +{0x001E03, 0x001E02}, +{0x001E05, 0x001E04}, +{0x001E07, 0x001E06}, +{0x001E09, 0x001E08}, +{0x001E0B, 0x001E0A}, +{0x001E0D, 0x001E0C}, +{0x001E0F, 0x001E0E}, +{0x001E11, 0x001E10}, +{0x001E13, 0x001E12}, +{0x001E15, 0x001E14}, +{0x001E17, 0x001E16}, +{0x001E19, 0x001E18}, +{0x001E1B, 0x001E1A}, +{0x001E1D, 0x001E1C}, +{0x001E1F, 0x001E1E}, +{0x001E21, 0x001E20}, +{0x001E23, 0x001E22}, +{0x001E25, 0x001E24}, +{0x001E27, 0x001E26}, +{0x001E29, 0x001E28}, +{0x001E2B, 0x001E2A}, +{0x001E2D, 0x001E2C}, +{0x001E2F, 0x001E2E}, +{0x001E31, 0x001E30}, +{0x001E33, 0x001E32}, +{0x001E35, 0x001E34}, +{0x001E37, 0x001E36}, +{0x001E39, 0x001E38}, +{0x001E3B, 0x001E3A}, +{0x001E3D, 0x001E3C}, +{0x001E3F, 0x001E3E}, +{0x001E41, 0x001E40}, +{0x001E43, 0x001E42}, +{0x001E45, 0x001E44}, +{0x001E47, 0x001E46}, +{0x001E49, 0x001E48}, +{0x001E4B, 0x001E4A}, +{0x001E4D, 0x001E4C}, +{0x001E4F, 0x001E4E}, +{0x001E51, 0x001E50}, +{0x001E53, 0x001E52}, +{0x001E55, 0x001E54}, +{0x001E57, 
0x001E56}, +{0x001E59, 0x001E58}, +{0x001E5B, 0x001E5A}, +{0x001E5D, 0x001E5C}, +{0x001E5F, 0x001E5E}, +{0x001E61, 0x001E60}, +{0x001E63, 0x001E62}, +{0x001E65, 0x001E64}, +{0x001E67, 0x001E66}, +{0x001E69, 0x001E68}, +{0x001E6B, 0x001E6A}, +{0x001E6D, 0x001E6C}, +{0x001E6F, 0x001E6E}, +{0x001E71, 0x001E70}, +{0x001E73, 0x001E72}, +{0x001E75, 0x001E74}, +{0x001E77, 0x001E76}, +{0x001E79, 0x001E78}, +{0x001E7B, 0x001E7A}, +{0x001E7D, 0x001E7C}, +{0x001E7F, 0x001E7E}, +{0x001E81, 0x001E80}, +{0x001E83, 0x001E82}, +{0x001E85, 0x001E84}, +{0x001E87, 0x001E86}, +{0x001E89, 0x001E88}, +{0x001E8B, 0x001E8A}, +{0x001E8D, 0x001E8C}, +{0x001E8F, 0x001E8E}, +{0x001E91, 0x001E90}, +{0x001E93, 0x001E92}, +{0x001E95, 0x001E94}, +{0x001E96, 0x000048}, +{0x001E97, 0x000054}, +{0x001E98, 0x000057}, +{0x001E99, 0x000059}, +{0x001E9A, 0x000041}, +{0x001E9B, 0x001E60}, +{0x001EA1, 0x001EA0}, +{0x001EA3, 0x001EA2}, +{0x001EA5, 0x001EA4}, +{0x001EA7, 0x001EA6}, +{0x001EA9, 0x001EA8}, +{0x001EAB, 0x001EAA}, +{0x001EAD, 0x001EAC}, +{0x001EAF, 0x001EAE}, +{0x001EB1, 0x001EB0}, +{0x001EB3, 0x001EB2}, +{0x001EB5, 0x001EB4}, +{0x001EB7, 0x001EB6}, +{0x001EB9, 0x001EB8}, +{0x001EBB, 0x001EBA}, +{0x001EBD, 0x001EBC}, +{0x001EBF, 0x001EBE}, +{0x001EC1, 0x001EC0}, +{0x001EC3, 0x001EC2}, +{0x001EC5, 0x001EC4}, +{0x001EC7, 0x001EC6}, +{0x001EC9, 0x001EC8}, +{0x001ECB, 0x001ECA}, +{0x001ECD, 0x001ECC}, +{0x001ECF, 0x001ECE}, +{0x001ED1, 0x001ED0}, +{0x001ED3, 0x001ED2}, +{0x001ED5, 0x001ED4}, +{0x001ED7, 0x001ED6}, +{0x001ED9, 0x001ED8}, +{0x001EDB, 0x001EDA}, +{0x001EDD, 0x001EDC}, +{0x001EDF, 0x001EDE}, +{0x001EE1, 0x001EE0}, +{0x001EE3, 0x001EE2}, +{0x001EE5, 0x001EE4}, +{0x001EE7, 0x001EE6}, +{0x001EE9, 0x001EE8}, +{0x001EEB, 0x001EEA}, +{0x001EED, 0x001EEC}, +{0x001EEF, 0x001EEE}, +{0x001EF1, 0x001EF0}, +{0x001EF3, 0x001EF2}, +{0x001EF5, 0x001EF4}, +{0x001EF7, 0x001EF6}, +{0x001EF9, 0x001EF8}, +{0x001EFB, 0x001EFA}, +{0x001EFD, 0x001EFC}, +{0x001EFF, 0x001EFE}, +{0x001F00, 0x001F08}, 
+{0x001F01, 0x001F09}, +{0x001F02, 0x001F0A}, +{0x001F03, 0x001F0B}, +{0x001F04, 0x001F0C}, +{0x001F05, 0x001F0D}, +{0x001F06, 0x001F0E}, +{0x001F07, 0x001F0F}, +{0x001F10, 0x001F18}, +{0x001F11, 0x001F19}, +{0x001F12, 0x001F1A}, +{0x001F13, 0x001F1B}, +{0x001F14, 0x001F1C}, +{0x001F15, 0x001F1D}, +{0x001F20, 0x001F28}, +{0x001F21, 0x001F29}, +{0x001F22, 0x001F2A}, +{0x001F23, 0x001F2B}, +{0x001F24, 0x001F2C}, +{0x001F25, 0x001F2D}, +{0x001F26, 0x001F2E}, +{0x001F27, 0x001F2F}, +{0x001F30, 0x001F38}, +{0x001F31, 0x001F39}, +{0x001F32, 0x001F3A}, +{0x001F33, 0x001F3B}, +{0x001F34, 0x001F3C}, +{0x001F35, 0x001F3D}, +{0x001F36, 0x001F3E}, +{0x001F37, 0x001F3F}, +{0x001F40, 0x001F48}, +{0x001F41, 0x001F49}, +{0x001F42, 0x001F4A}, +{0x001F43, 0x001F4B}, +{0x001F44, 0x001F4C}, +{0x001F45, 0x001F4D}, +{0x001F50, 0x0003A5}, +{0x001F51, 0x001F59}, +{0x001F52, 0x0003A5}, +{0x001F53, 0x001F5B}, +{0x001F54, 0x0003A5}, +{0x001F55, 0x001F5D}, +{0x001F56, 0x0003A5}, +{0x001F57, 0x001F5F}, +{0x001F60, 0x001F68}, +{0x001F61, 0x001F69}, +{0x001F62, 0x001F6A}, +{0x001F63, 0x001F6B}, +{0x001F64, 0x001F6C}, +{0x001F65, 0x001F6D}, +{0x001F66, 0x001F6E}, +{0x001F67, 0x001F6F}, +{0x001F70, 0x001FBA}, +{0x001F71, 0x001FBB}, +{0x001F72, 0x001FC8}, +{0x001F73, 0x001FC9}, +{0x001F74, 0x001FCA}, +{0x001F75, 0x001FCB}, +{0x001F76, 0x001FDA}, +{0x001F77, 0x001FDB}, +{0x001F78, 0x001FF8}, +{0x001F79, 0x001FF9}, +{0x001F7A, 0x001FEA}, +{0x001F7B, 0x001FEB}, +{0x001F7C, 0x001FFA}, +{0x001F7D, 0x001FFB}, +{0x001F80, 0x001F08}, +{0x001F81, 0x001F09}, +{0x001F82, 0x001F0A}, +{0x001F83, 0x001F0B}, +{0x001F84, 0x001F0C}, +{0x001F85, 0x001F0D}, +{0x001F86, 0x001F0E}, +{0x001F87, 0x001F0F}, +{0x001F88, 0x001F08}, +{0x001F89, 0x001F09}, +{0x001F8A, 0x001F0A}, +{0x001F8B, 0x001F0B}, +{0x001F8C, 0x001F0C}, +{0x001F8D, 0x001F0D}, +{0x001F8E, 0x001F0E}, +{0x001F8F, 0x001F0F}, +{0x001F90, 0x001F28}, +{0x001F91, 0x001F29}, +{0x001F92, 0x001F2A}, +{0x001F93, 0x001F2B}, +{0x001F94, 0x001F2C}, +{0x001F95, 
0x001F2D}, +{0x001F96, 0x001F2E}, +{0x001F97, 0x001F2F}, +{0x001F98, 0x001F28}, +{0x001F99, 0x001F29}, +{0x001F9A, 0x001F2A}, +{0x001F9B, 0x001F2B}, +{0x001F9C, 0x001F2C}, +{0x001F9D, 0x001F2D}, +{0x001F9E, 0x001F2E}, +{0x001F9F, 0x001F2F}, +{0x001FA0, 0x001F68}, +{0x001FA1, 0x001F69}, +{0x001FA2, 0x001F6A}, +{0x001FA3, 0x001F6B}, +{0x001FA4, 0x001F6C}, +{0x001FA5, 0x001F6D}, +{0x001FA6, 0x001F6E}, +{0x001FA7, 0x001F6F}, +{0x001FA8, 0x001F68}, +{0x001FA9, 0x001F69}, +{0x001FAA, 0x001F6A}, +{0x001FAB, 0x001F6B}, +{0x001FAC, 0x001F6C}, +{0x001FAD, 0x001F6D}, +{0x001FAE, 0x001F6E}, +{0x001FAF, 0x001F6F}, +{0x001FB0, 0x001FB8}, +{0x001FB1, 0x001FB9}, +{0x001FB2, 0x001FBA}, +{0x001FB3, 0x000391}, +{0x001FB4, 0x000386}, +{0x001FB6, 0x000391}, +{0x001FB7, 0x000391}, +{0x001FBC, 0x000391}, +{0x001FBE, 0x000399}, +{0x001FC2, 0x001FCA}, +{0x001FC3, 0x000397}, +{0x001FC4, 0x000389}, +{0x001FC6, 0x000397}, +{0x001FC7, 0x000397}, +{0x001FCC, 0x000397}, +{0x001FD0, 0x001FD8}, +{0x001FD1, 0x001FD9}, +{0x001FD2, 0x000399}, +{0x001FD3, 0x000399}, +{0x001FD6, 0x000399}, +{0x001FD7, 0x000399}, +{0x001FE0, 0x001FE8}, +{0x001FE1, 0x001FE9}, +{0x001FE2, 0x0003A5}, +{0x001FE3, 0x0003A5}, +{0x001FE4, 0x0003A1}, +{0x001FE5, 0x001FEC}, +{0x001FE6, 0x0003A5}, +{0x001FE7, 0x0003A5}, +{0x001FF2, 0x001FFA}, +{0x001FF3, 0x0003A9}, +{0x001FF4, 0x00038F}, +{0x001FF6, 0x0003A9}, +{0x001FF7, 0x0003A9}, +{0x001FFC, 0x0003A9}, +{0x00214E, 0x002132}, +{0x002170, 0x002160}, +{0x002171, 0x002161}, +{0x002172, 0x002162}, +{0x002173, 0x002163}, +{0x002174, 0x002164}, +{0x002175, 0x002165}, +{0x002176, 0x002166}, +{0x002177, 0x002167}, +{0x002178, 0x002168}, +{0x002179, 0x002169}, +{0x00217A, 0x00216A}, +{0x00217B, 0x00216B}, +{0x00217C, 0x00216C}, +{0x00217D, 0x00216D}, +{0x00217E, 0x00216E}, +{0x00217F, 0x00216F}, +{0x002184, 0x002183}, +{0x0024D0, 0x0024B6}, +{0x0024D1, 0x0024B7}, +{0x0024D2, 0x0024B8}, +{0x0024D3, 0x0024B9}, +{0x0024D4, 0x0024BA}, +{0x0024D5, 0x0024BB}, +{0x0024D6, 0x0024BC}, 
+{0x0024D7, 0x0024BD}, +{0x0024D8, 0x0024BE}, +{0x0024D9, 0x0024BF}, +{0x0024DA, 0x0024C0}, +{0x0024DB, 0x0024C1}, +{0x0024DC, 0x0024C2}, +{0x0024DD, 0x0024C3}, +{0x0024DE, 0x0024C4}, +{0x0024DF, 0x0024C5}, +{0x0024E0, 0x0024C6}, +{0x0024E1, 0x0024C7}, +{0x0024E2, 0x0024C8}, +{0x0024E3, 0x0024C9}, +{0x0024E4, 0x0024CA}, +{0x0024E5, 0x0024CB}, +{0x0024E6, 0x0024CC}, +{0x0024E7, 0x0024CD}, +{0x0024E8, 0x0024CE}, +{0x0024E9, 0x0024CF}, +{0x002C30, 0x002C00}, +{0x002C31, 0x002C01}, +{0x002C32, 0x002C02}, +{0x002C33, 0x002C03}, +{0x002C34, 0x002C04}, +{0x002C35, 0x002C05}, +{0x002C36, 0x002C06}, +{0x002C37, 0x002C07}, +{0x002C38, 0x002C08}, +{0x002C39, 0x002C09}, +{0x002C3A, 0x002C0A}, +{0x002C3B, 0x002C0B}, +{0x002C3C, 0x002C0C}, +{0x002C3D, 0x002C0D}, +{0x002C3E, 0x002C0E}, +{0x002C3F, 0x002C0F}, +{0x002C40, 0x002C10}, +{0x002C41, 0x002C11}, +{0x002C42, 0x002C12}, +{0x002C43, 0x002C13}, +{0x002C44, 0x002C14}, +{0x002C45, 0x002C15}, +{0x002C46, 0x002C16}, +{0x002C47, 0x002C17}, +{0x002C48, 0x002C18}, +{0x002C49, 0x002C19}, +{0x002C4A, 0x002C1A}, +{0x002C4B, 0x002C1B}, +{0x002C4C, 0x002C1C}, +{0x002C4D, 0x002C1D}, +{0x002C4E, 0x002C1E}, +{0x002C4F, 0x002C1F}, +{0x002C50, 0x002C20}, +{0x002C51, 0x002C21}, +{0x002C52, 0x002C22}, +{0x002C53, 0x002C23}, +{0x002C54, 0x002C24}, +{0x002C55, 0x002C25}, +{0x002C56, 0x002C26}, +{0x002C57, 0x002C27}, +{0x002C58, 0x002C28}, +{0x002C59, 0x002C29}, +{0x002C5A, 0x002C2A}, +{0x002C5B, 0x002C2B}, +{0x002C5C, 0x002C2C}, +{0x002C5D, 0x002C2D}, +{0x002C5E, 0x002C2E}, +{0x002C61, 0x002C60}, +{0x002C65, 0x00023A}, +{0x002C66, 0x00023E}, +{0x002C68, 0x002C67}, +{0x002C6A, 0x002C69}, +{0x002C6C, 0x002C6B}, +{0x002C73, 0x002C72}, +{0x002C76, 0x002C75}, +{0x002C81, 0x002C80}, +{0x002C83, 0x002C82}, +{0x002C85, 0x002C84}, +{0x002C87, 0x002C86}, +{0x002C89, 0x002C88}, +{0x002C8B, 0x002C8A}, +{0x002C8D, 0x002C8C}, +{0x002C8F, 0x002C8E}, +{0x002C91, 0x002C90}, +{0x002C93, 0x002C92}, +{0x002C95, 0x002C94}, +{0x002C97, 0x002C96}, +{0x002C99, 
0x002C98}, +{0x002C9B, 0x002C9A}, +{0x002C9D, 0x002C9C}, +{0x002C9F, 0x002C9E}, +{0x002CA1, 0x002CA0}, +{0x002CA3, 0x002CA2}, +{0x002CA5, 0x002CA4}, +{0x002CA7, 0x002CA6}, +{0x002CA9, 0x002CA8}, +{0x002CAB, 0x002CAA}, +{0x002CAD, 0x002CAC}, +{0x002CAF, 0x002CAE}, +{0x002CB1, 0x002CB0}, +{0x002CB3, 0x002CB2}, +{0x002CB5, 0x002CB4}, +{0x002CB7, 0x002CB6}, +{0x002CB9, 0x002CB8}, +{0x002CBB, 0x002CBA}, +{0x002CBD, 0x002CBC}, +{0x002CBF, 0x002CBE}, +{0x002CC1, 0x002CC0}, +{0x002CC3, 0x002CC2}, +{0x002CC5, 0x002CC4}, +{0x002CC7, 0x002CC6}, +{0x002CC9, 0x002CC8}, +{0x002CCB, 0x002CCA}, +{0x002CCD, 0x002CCC}, +{0x002CCF, 0x002CCE}, +{0x002CD1, 0x002CD0}, +{0x002CD3, 0x002CD2}, +{0x002CD5, 0x002CD4}, +{0x002CD7, 0x002CD6}, +{0x002CD9, 0x002CD8}, +{0x002CDB, 0x002CDA}, +{0x002CDD, 0x002CDC}, +{0x002CDF, 0x002CDE}, +{0x002CE1, 0x002CE0}, +{0x002CE3, 0x002CE2}, +{0x002CEC, 0x002CEB}, +{0x002CEE, 0x002CED}, +{0x002CF3, 0x002CF2}, +{0x002D00, 0x0010A0}, +{0x002D01, 0x0010A1}, +{0x002D02, 0x0010A2}, +{0x002D03, 0x0010A3}, +{0x002D04, 0x0010A4}, +{0x002D05, 0x0010A5}, +{0x002D06, 0x0010A6}, +{0x002D07, 0x0010A7}, +{0x002D08, 0x0010A8}, +{0x002D09, 0x0010A9}, +{0x002D0A, 0x0010AA}, +{0x002D0B, 0x0010AB}, +{0x002D0C, 0x0010AC}, +{0x002D0D, 0x0010AD}, +{0x002D0E, 0x0010AE}, +{0x002D0F, 0x0010AF}, +{0x002D10, 0x0010B0}, +{0x002D11, 0x0010B1}, +{0x002D12, 0x0010B2}, +{0x002D13, 0x0010B3}, +{0x002D14, 0x0010B4}, +{0x002D15, 0x0010B5}, +{0x002D16, 0x0010B6}, +{0x002D17, 0x0010B7}, +{0x002D18, 0x0010B8}, +{0x002D19, 0x0010B9}, +{0x002D1A, 0x0010BA}, +{0x002D1B, 0x0010BB}, +{0x002D1C, 0x0010BC}, +{0x002D1D, 0x0010BD}, +{0x002D1E, 0x0010BE}, +{0x002D1F, 0x0010BF}, +{0x002D20, 0x0010C0}, +{0x002D21, 0x0010C1}, +{0x002D22, 0x0010C2}, +{0x002D23, 0x0010C3}, +{0x002D24, 0x0010C4}, +{0x002D25, 0x0010C5}, +{0x002D27, 0x0010C7}, +{0x002D2D, 0x0010CD}, +{0x00A641, 0x00A640}, +{0x00A643, 0x00A642}, +{0x00A645, 0x00A644}, +{0x00A647, 0x00A646}, +{0x00A649, 0x00A648}, +{0x00A64B, 0x00A64A}, 
+{0x00A64D, 0x00A64C}, +{0x00A64F, 0x00A64E}, +{0x00A651, 0x00A650}, +{0x00A653, 0x00A652}, +{0x00A655, 0x00A654}, +{0x00A657, 0x00A656}, +{0x00A659, 0x00A658}, +{0x00A65B, 0x00A65A}, +{0x00A65D, 0x00A65C}, +{0x00A65F, 0x00A65E}, +{0x00A661, 0x00A660}, +{0x00A663, 0x00A662}, +{0x00A665, 0x00A664}, +{0x00A667, 0x00A666}, +{0x00A669, 0x00A668}, +{0x00A66B, 0x00A66A}, +{0x00A66D, 0x00A66C}, +{0x00A681, 0x00A680}, +{0x00A683, 0x00A682}, +{0x00A685, 0x00A684}, +{0x00A687, 0x00A686}, +{0x00A689, 0x00A688}, +{0x00A68B, 0x00A68A}, +{0x00A68D, 0x00A68C}, +{0x00A68F, 0x00A68E}, +{0x00A691, 0x00A690}, +{0x00A693, 0x00A692}, +{0x00A695, 0x00A694}, +{0x00A697, 0x00A696}, +{0x00A699, 0x00A698}, +{0x00A69B, 0x00A69A}, +{0x00A723, 0x00A722}, +{0x00A725, 0x00A724}, +{0x00A727, 0x00A726}, +{0x00A729, 0x00A728}, +{0x00A72B, 0x00A72A}, +{0x00A72D, 0x00A72C}, +{0x00A72F, 0x00A72E}, +{0x00A733, 0x00A732}, +{0x00A735, 0x00A734}, +{0x00A737, 0x00A736}, +{0x00A739, 0x00A738}, +{0x00A73B, 0x00A73A}, +{0x00A73D, 0x00A73C}, +{0x00A73F, 0x00A73E}, +{0x00A741, 0x00A740}, +{0x00A743, 0x00A742}, +{0x00A745, 0x00A744}, +{0x00A747, 0x00A746}, +{0x00A749, 0x00A748}, +{0x00A74B, 0x00A74A}, +{0x00A74D, 0x00A74C}, +{0x00A74F, 0x00A74E}, +{0x00A751, 0x00A750}, +{0x00A753, 0x00A752}, +{0x00A755, 0x00A754}, +{0x00A757, 0x00A756}, +{0x00A759, 0x00A758}, +{0x00A75B, 0x00A75A}, +{0x00A75D, 0x00A75C}, +{0x00A75F, 0x00A75E}, +{0x00A761, 0x00A760}, +{0x00A763, 0x00A762}, +{0x00A765, 0x00A764}, +{0x00A767, 0x00A766}, +{0x00A769, 0x00A768}, +{0x00A76B, 0x00A76A}, +{0x00A76D, 0x00A76C}, +{0x00A76F, 0x00A76E}, +{0x00A77A, 0x00A779}, +{0x00A77C, 0x00A77B}, +{0x00A77F, 0x00A77E}, +{0x00A781, 0x00A780}, +{0x00A783, 0x00A782}, +{0x00A785, 0x00A784}, +{0x00A787, 0x00A786}, +{0x00A78C, 0x00A78B}, +{0x00A791, 0x00A790}, +{0x00A793, 0x00A792}, +{0x00A794, 0x00A7C4}, +{0x00A797, 0x00A796}, +{0x00A799, 0x00A798}, +{0x00A79B, 0x00A79A}, +{0x00A79D, 0x00A79C}, +{0x00A79F, 0x00A79E}, +{0x00A7A1, 0x00A7A0}, +{0x00A7A3, 
0x00A7A2}, +{0x00A7A5, 0x00A7A4}, +{0x00A7A7, 0x00A7A6}, +{0x00A7A9, 0x00A7A8}, +{0x00A7B5, 0x00A7B4}, +{0x00A7B7, 0x00A7B6}, +{0x00A7B9, 0x00A7B8}, +{0x00A7BB, 0x00A7BA}, +{0x00A7BD, 0x00A7BC}, +{0x00A7BF, 0x00A7BE}, +{0x00A7C3, 0x00A7C2}, +{0x00A7C8, 0x00A7C7}, +{0x00A7CA, 0x00A7C9}, +{0x00A7F6, 0x00A7F5}, +{0x00AB53, 0x00A7B3}, +{0x00AB70, 0x0013A0}, +{0x00AB71, 0x0013A1}, +{0x00AB72, 0x0013A2}, +{0x00AB73, 0x0013A3}, +{0x00AB74, 0x0013A4}, +{0x00AB75, 0x0013A5}, +{0x00AB76, 0x0013A6}, +{0x00AB77, 0x0013A7}, +{0x00AB78, 0x0013A8}, +{0x00AB79, 0x0013A9}, +{0x00AB7A, 0x0013AA}, +{0x00AB7B, 0x0013AB}, +{0x00AB7C, 0x0013AC}, +{0x00AB7D, 0x0013AD}, +{0x00AB7E, 0x0013AE}, +{0x00AB7F, 0x0013AF}, +{0x00AB80, 0x0013B0}, +{0x00AB81, 0x0013B1}, +{0x00AB82, 0x0013B2}, +{0x00AB83, 0x0013B3}, +{0x00AB84, 0x0013B4}, +{0x00AB85, 0x0013B5}, +{0x00AB86, 0x0013B6}, +{0x00AB87, 0x0013B7}, +{0x00AB88, 0x0013B8}, +{0x00AB89, 0x0013B9}, +{0x00AB8A, 0x0013BA}, +{0x00AB8B, 0x0013BB}, +{0x00AB8C, 0x0013BC}, +{0x00AB8D, 0x0013BD}, +{0x00AB8E, 0x0013BE}, +{0x00AB8F, 0x0013BF}, +{0x00AB90, 0x0013C0}, +{0x00AB91, 0x0013C1}, +{0x00AB92, 0x0013C2}, +{0x00AB93, 0x0013C3}, +{0x00AB94, 0x0013C4}, +{0x00AB95, 0x0013C5}, +{0x00AB96, 0x0013C6}, +{0x00AB97, 0x0013C7}, +{0x00AB98, 0x0013C8}, +{0x00AB99, 0x0013C9}, +{0x00AB9A, 0x0013CA}, +{0x00AB9B, 0x0013CB}, +{0x00AB9C, 0x0013CC}, +{0x00AB9D, 0x0013CD}, +{0x00AB9E, 0x0013CE}, +{0x00AB9F, 0x0013CF}, +{0x00ABA0, 0x0013D0}, +{0x00ABA1, 0x0013D1}, +{0x00ABA2, 0x0013D2}, +{0x00ABA3, 0x0013D3}, +{0x00ABA4, 0x0013D4}, +{0x00ABA5, 0x0013D5}, +{0x00ABA6, 0x0013D6}, +{0x00ABA7, 0x0013D7}, +{0x00ABA8, 0x0013D8}, +{0x00ABA9, 0x0013D9}, +{0x00ABAA, 0x0013DA}, +{0x00ABAB, 0x0013DB}, +{0x00ABAC, 0x0013DC}, +{0x00ABAD, 0x0013DD}, +{0x00ABAE, 0x0013DE}, +{0x00ABAF, 0x0013DF}, +{0x00ABB0, 0x0013E0}, +{0x00ABB1, 0x0013E1}, +{0x00ABB2, 0x0013E2}, +{0x00ABB3, 0x0013E3}, +{0x00ABB4, 0x0013E4}, +{0x00ABB5, 0x0013E5}, +{0x00ABB6, 0x0013E6}, +{0x00ABB7, 0x0013E7}, 
+{0x00ABB8, 0x0013E8}, +{0x00ABB9, 0x0013E9}, +{0x00ABBA, 0x0013EA}, +{0x00ABBB, 0x0013EB}, +{0x00ABBC, 0x0013EC}, +{0x00ABBD, 0x0013ED}, +{0x00ABBE, 0x0013EE}, +{0x00ABBF, 0x0013EF}, +{0x00FB00, 0x000046}, +{0x00FB01, 0x000046}, +{0x00FB02, 0x000046}, +{0x00FB03, 0x000046}, +{0x00FB04, 0x000046}, +{0x00FB05, 0x000053}, +{0x00FB06, 0x000053}, +{0x00FB13, 0x000544}, +{0x00FB14, 0x000544}, +{0x00FB15, 0x000544}, +{0x00FB16, 0x00054E}, +{0x00FB17, 0x000544}, +{0x00FF41, 0x00FF21}, +{0x00FF42, 0x00FF22}, +{0x00FF43, 0x00FF23}, +{0x00FF44, 0x00FF24}, +{0x00FF45, 0x00FF25}, +{0x00FF46, 0x00FF26}, +{0x00FF47, 0x00FF27}, +{0x00FF48, 0x00FF28}, +{0x00FF49, 0x00FF29}, +{0x00FF4A, 0x00FF2A}, +{0x00FF4B, 0x00FF2B}, +{0x00FF4C, 0x00FF2C}, +{0x00FF4D, 0x00FF2D}, +{0x00FF4E, 0x00FF2E}, +{0x00FF4F, 0x00FF2F}, +{0x00FF50, 0x00FF30}, +{0x00FF51, 0x00FF31}, +{0x00FF52, 0x00FF32}, +{0x00FF53, 0x00FF33}, +{0x00FF54, 0x00FF34}, +{0x00FF55, 0x00FF35}, +{0x00FF56, 0x00FF36}, +{0x00FF57, 0x00FF37}, +{0x00FF58, 0x00FF38}, +{0x00FF59, 0x00FF39}, +{0x00FF5A, 0x00FF3A}, +{0x010428, 0x010400}, +{0x010429, 0x010401}, +{0x01042A, 0x010402}, +{0x01042B, 0x010403}, +{0x01042C, 0x010404}, +{0x01042D, 0x010405}, +{0x01042E, 0x010406}, +{0x01042F, 0x010407}, +{0x010430, 0x010408}, +{0x010431, 0x010409}, +{0x010432, 0x01040A}, +{0x010433, 0x01040B}, +{0x010434, 0x01040C}, +{0x010435, 0x01040D}, +{0x010436, 0x01040E}, +{0x010437, 0x01040F}, +{0x010438, 0x010410}, +{0x010439, 0x010411}, +{0x01043A, 0x010412}, +{0x01043B, 0x010413}, +{0x01043C, 0x010414}, +{0x01043D, 0x010415}, +{0x01043E, 0x010416}, +{0x01043F, 0x010417}, +{0x010440, 0x010418}, +{0x010441, 0x010419}, +{0x010442, 0x01041A}, +{0x010443, 0x01041B}, +{0x010444, 0x01041C}, +{0x010445, 0x01041D}, +{0x010446, 0x01041E}, +{0x010447, 0x01041F}, +{0x010448, 0x010420}, +{0x010449, 0x010421}, +{0x01044A, 0x010422}, +{0x01044B, 0x010423}, +{0x01044C, 0x010424}, +{0x01044D, 0x010425}, +{0x01044E, 0x010426}, +{0x01044F, 0x010427}, +{0x0104D8, 
0x0104B0}, +{0x0104D9, 0x0104B1}, +{0x0104DA, 0x0104B2}, +{0x0104DB, 0x0104B3}, +{0x0104DC, 0x0104B4}, +{0x0104DD, 0x0104B5}, +{0x0104DE, 0x0104B6}, +{0x0104DF, 0x0104B7}, +{0x0104E0, 0x0104B8}, +{0x0104E1, 0x0104B9}, +{0x0104E2, 0x0104BA}, +{0x0104E3, 0x0104BB}, +{0x0104E4, 0x0104BC}, +{0x0104E5, 0x0104BD}, +{0x0104E6, 0x0104BE}, +{0x0104E7, 0x0104BF}, +{0x0104E8, 0x0104C0}, +{0x0104E9, 0x0104C1}, +{0x0104EA, 0x0104C2}, +{0x0104EB, 0x0104C3}, +{0x0104EC, 0x0104C4}, +{0x0104ED, 0x0104C5}, +{0x0104EE, 0x0104C6}, +{0x0104EF, 0x0104C7}, +{0x0104F0, 0x0104C8}, +{0x0104F1, 0x0104C9}, +{0x0104F2, 0x0104CA}, +{0x0104F3, 0x0104CB}, +{0x0104F4, 0x0104CC}, +{0x0104F5, 0x0104CD}, +{0x0104F6, 0x0104CE}, +{0x0104F7, 0x0104CF}, +{0x0104F8, 0x0104D0}, +{0x0104F9, 0x0104D1}, +{0x0104FA, 0x0104D2}, +{0x0104FB, 0x0104D3}, +{0x010CC0, 0x010C80}, +{0x010CC1, 0x010C81}, +{0x010CC2, 0x010C82}, +{0x010CC3, 0x010C83}, +{0x010CC4, 0x010C84}, +{0x010CC5, 0x010C85}, +{0x010CC6, 0x010C86}, +{0x010CC7, 0x010C87}, +{0x010CC8, 0x010C88}, +{0x010CC9, 0x010C89}, +{0x010CCA, 0x010C8A}, +{0x010CCB, 0x010C8B}, +{0x010CCC, 0x010C8C}, +{0x010CCD, 0x010C8D}, +{0x010CCE, 0x010C8E}, +{0x010CCF, 0x010C8F}, +{0x010CD0, 0x010C90}, +{0x010CD1, 0x010C91}, +{0x010CD2, 0x010C92}, +{0x010CD3, 0x010C93}, +{0x010CD4, 0x010C94}, +{0x010CD5, 0x010C95}, +{0x010CD6, 0x010C96}, +{0x010CD7, 0x010C97}, +{0x010CD8, 0x010C98}, +{0x010CD9, 0x010C99}, +{0x010CDA, 0x010C9A}, +{0x010CDB, 0x010C9B}, +{0x010CDC, 0x010C9C}, +{0x010CDD, 0x010C9D}, +{0x010CDE, 0x010C9E}, +{0x010CDF, 0x010C9F}, +{0x010CE0, 0x010CA0}, +{0x010CE1, 0x010CA1}, +{0x010CE2, 0x010CA2}, +{0x010CE3, 0x010CA3}, +{0x010CE4, 0x010CA4}, +{0x010CE5, 0x010CA5}, +{0x010CE6, 0x010CA6}, +{0x010CE7, 0x010CA7}, +{0x010CE8, 0x010CA8}, +{0x010CE9, 0x010CA9}, +{0x010CEA, 0x010CAA}, +{0x010CEB, 0x010CAB}, +{0x010CEC, 0x010CAC}, +{0x010CED, 0x010CAD}, +{0x010CEE, 0x010CAE}, +{0x010CEF, 0x010CAF}, +{0x010CF0, 0x010CB0}, +{0x010CF1, 0x010CB1}, +{0x010CF2, 0x010CB2}, 
+{0x0118C0, 0x0118A0}, +{0x0118C1, 0x0118A1}, +{0x0118C2, 0x0118A2}, +{0x0118C3, 0x0118A3}, +{0x0118C4, 0x0118A4}, +{0x0118C5, 0x0118A5}, +{0x0118C6, 0x0118A6}, +{0x0118C7, 0x0118A7}, +{0x0118C8, 0x0118A8}, +{0x0118C9, 0x0118A9}, +{0x0118CA, 0x0118AA}, +{0x0118CB, 0x0118AB}, +{0x0118CC, 0x0118AC}, +{0x0118CD, 0x0118AD}, +{0x0118CE, 0x0118AE}, +{0x0118CF, 0x0118AF}, +{0x0118D0, 0x0118B0}, +{0x0118D1, 0x0118B1}, +{0x0118D2, 0x0118B2}, +{0x0118D3, 0x0118B3}, +{0x0118D4, 0x0118B4}, +{0x0118D5, 0x0118B5}, +{0x0118D6, 0x0118B6}, +{0x0118D7, 0x0118B7}, +{0x0118D8, 0x0118B8}, +{0x0118D9, 0x0118B9}, +{0x0118DA, 0x0118BA}, +{0x0118DB, 0x0118BB}, +{0x0118DC, 0x0118BC}, +{0x0118DD, 0x0118BD}, +{0x0118DE, 0x0118BE}, +{0x0118DF, 0x0118BF}, +{0x016E60, 0x016E40}, +{0x016E61, 0x016E41}, +{0x016E62, 0x016E42}, +{0x016E63, 0x016E43}, +{0x016E64, 0x016E44}, +{0x016E65, 0x016E45}, +{0x016E66, 0x016E46}, +{0x016E67, 0x016E47}, +{0x016E68, 0x016E48}, +{0x016E69, 0x016E49}, +{0x016E6A, 0x016E4A}, +{0x016E6B, 0x016E4B}, +{0x016E6C, 0x016E4C}, +{0x016E6D, 0x016E4D}, +{0x016E6E, 0x016E4E}, +{0x016E6F, 0x016E4F}, +{0x016E70, 0x016E50}, +{0x016E71, 0x016E51}, +{0x016E72, 0x016E52}, +{0x016E73, 0x016E53}, +{0x016E74, 0x016E54}, +{0x016E75, 0x016E55}, +{0x016E76, 0x016E56}, +{0x016E77, 0x016E57}, +{0x016E78, 0x016E58}, +{0x016E79, 0x016E59}, +{0x016E7A, 0x016E5A}, +{0x016E7B, 0x016E5B}, +{0x016E7C, 0x016E5C}, +{0x016E7D, 0x016E5D}, +{0x016E7E, 0x016E5E}, +{0x016E7F, 0x016E5F}, +{0x01E922, 0x01E900}, +{0x01E923, 0x01E901}, +{0x01E924, 0x01E902}, +{0x01E925, 0x01E903}, +{0x01E926, 0x01E904}, +{0x01E927, 0x01E905}, +{0x01E928, 0x01E906}, +{0x01E929, 0x01E907}, +{0x01E92A, 0x01E908}, +{0x01E92B, 0x01E909}, +{0x01E92C, 0x01E90A}, +{0x01E92D, 0x01E90B}, +{0x01E92E, 0x01E90C}, +{0x01E92F, 0x01E90D}, +{0x01E930, 0x01E90E}, +{0x01E931, 0x01E90F}, +{0x01E932, 0x01E910}, +{0x01E933, 0x01E911}, +{0x01E934, 0x01E912}, +{0x01E935, 0x01E913}, +{0x01E936, 0x01E914}, +{0x01E937, 0x01E915}, +{0x01E938, 
0x01E916}, +{0x01E939, 0x01E917}, +{0x01E93A, 0x01E918}, +{0x01E93B, 0x01E919}, +{0x01E93C, 0x01E91A}, +{0x01E93D, 0x01E91B}, +{0x01E93E, 0x01E91C}, +{0x01E93F, 0x01E91D}, +{0x01E940, 0x01E91E}, +{0x01E941, 0x01E91F}, +{0x01E942, 0x01E920}, +{0x01E943, 0x01E921}, }; -const std::multimap unicode_map_nfd = { -{0x000000C0, 0x00000041}, {0x000000C0, 0x00000300}, {0x000000C1, 0x00000041}, {0x000000C1, 0x00000301}, -{0x000000C2, 0x00000041}, {0x000000C2, 0x00000302}, {0x000000C3, 0x00000041}, {0x000000C3, 0x00000303}, -{0x000000C4, 0x00000041}, {0x000000C4, 0x00000308}, {0x000000C5, 0x00000041}, {0x000000C5, 0x0000030A}, -{0x000000C7, 0x00000043}, {0x000000C7, 0x00000327}, {0x000000C8, 0x00000045}, {0x000000C8, 0x00000300}, -{0x000000C9, 0x00000045}, {0x000000C9, 0x00000301}, {0x000000CA, 0x00000045}, {0x000000CA, 0x00000302}, -{0x000000CB, 0x00000045}, {0x000000CB, 0x00000308}, {0x000000CC, 0x00000049}, {0x000000CC, 0x00000300}, -{0x000000CD, 0x00000049}, {0x000000CD, 0x00000301}, {0x000000CE, 0x00000049}, {0x000000CE, 0x00000302}, -{0x000000CF, 0x00000049}, {0x000000CF, 0x00000308}, {0x000000D1, 0x0000004E}, {0x000000D1, 0x00000303}, -{0x000000D2, 0x0000004F}, {0x000000D2, 0x00000300}, {0x000000D3, 0x0000004F}, {0x000000D3, 0x00000301}, -{0x000000D4, 0x0000004F}, {0x000000D4, 0x00000302}, {0x000000D5, 0x0000004F}, {0x000000D5, 0x00000303}, -{0x000000D6, 0x0000004F}, {0x000000D6, 0x00000308}, {0x000000D9, 0x00000055}, {0x000000D9, 0x00000300}, -{0x000000DA, 0x00000055}, {0x000000DA, 0x00000301}, {0x000000DB, 0x00000055}, {0x000000DB, 0x00000302}, -{0x000000DC, 0x00000055}, {0x000000DC, 0x00000308}, {0x000000DD, 0x00000059}, {0x000000DD, 0x00000301}, -{0x000000E0, 0x00000061}, {0x000000E0, 0x00000300}, {0x000000E1, 0x00000061}, {0x000000E1, 0x00000301}, -{0x000000E2, 0x00000061}, {0x000000E2, 0x00000302}, {0x000000E3, 0x00000061}, {0x000000E3, 0x00000303}, -{0x000000E4, 0x00000061}, {0x000000E4, 0x00000308}, {0x000000E5, 0x00000061}, {0x000000E5, 0x0000030A}, 
-{0x000000E7, 0x00000063}, {0x000000E7, 0x00000327}, {0x000000E8, 0x00000065}, {0x000000E8, 0x00000300}, -{0x000000E9, 0x00000065}, {0x000000E9, 0x00000301}, {0x000000EA, 0x00000065}, {0x000000EA, 0x00000302}, -{0x000000EB, 0x00000065}, {0x000000EB, 0x00000308}, {0x000000EC, 0x00000069}, {0x000000EC, 0x00000300}, -{0x000000ED, 0x00000069}, {0x000000ED, 0x00000301}, {0x000000EE, 0x00000069}, {0x000000EE, 0x00000302}, -{0x000000EF, 0x00000069}, {0x000000EF, 0x00000308}, {0x000000F1, 0x0000006E}, {0x000000F1, 0x00000303}, -{0x000000F2, 0x0000006F}, {0x000000F2, 0x00000300}, {0x000000F3, 0x0000006F}, {0x000000F3, 0x00000301}, -{0x000000F4, 0x0000006F}, {0x000000F4, 0x00000302}, {0x000000F5, 0x0000006F}, {0x000000F5, 0x00000303}, -{0x000000F6, 0x0000006F}, {0x000000F6, 0x00000308}, {0x000000F9, 0x00000075}, {0x000000F9, 0x00000300}, -{0x000000FA, 0x00000075}, {0x000000FA, 0x00000301}, {0x000000FB, 0x00000075}, {0x000000FB, 0x00000302}, -{0x000000FC, 0x00000075}, {0x000000FC, 0x00000308}, {0x000000FD, 0x00000079}, {0x000000FD, 0x00000301}, -{0x000000FF, 0x00000079}, {0x000000FF, 0x00000308}, {0x00000100, 0x00000041}, {0x00000100, 0x00000304}, -{0x00000101, 0x00000061}, {0x00000101, 0x00000304}, {0x00000102, 0x00000041}, {0x00000102, 0x00000306}, -{0x00000103, 0x00000061}, {0x00000103, 0x00000306}, {0x00000104, 0x00000041}, {0x00000104, 0x00000328}, -{0x00000105, 0x00000061}, {0x00000105, 0x00000328}, {0x00000106, 0x00000043}, {0x00000106, 0x00000301}, -{0x00000107, 0x00000063}, {0x00000107, 0x00000301}, {0x00000108, 0x00000043}, {0x00000108, 0x00000302}, -{0x00000109, 0x00000063}, {0x00000109, 0x00000302}, {0x0000010A, 0x00000043}, {0x0000010A, 0x00000307}, -{0x0000010B, 0x00000063}, {0x0000010B, 0x00000307}, {0x0000010C, 0x00000043}, {0x0000010C, 0x0000030C}, -{0x0000010D, 0x00000063}, {0x0000010D, 0x0000030C}, {0x0000010E, 0x00000044}, {0x0000010E, 0x0000030C}, -{0x0000010F, 0x00000064}, {0x0000010F, 0x0000030C}, {0x00000112, 0x00000045}, {0x00000112, 0x00000304}, 
-{0x00000113, 0x00000065}, {0x00000113, 0x00000304}, {0x00000114, 0x00000045}, {0x00000114, 0x00000306}, -{0x00000115, 0x00000065}, {0x00000115, 0x00000306}, {0x00000116, 0x00000045}, {0x00000116, 0x00000307}, -{0x00000117, 0x00000065}, {0x00000117, 0x00000307}, {0x00000118, 0x00000045}, {0x00000118, 0x00000328}, -{0x00000119, 0x00000065}, {0x00000119, 0x00000328}, {0x0000011A, 0x00000045}, {0x0000011A, 0x0000030C}, -{0x0000011B, 0x00000065}, {0x0000011B, 0x0000030C}, {0x0000011C, 0x00000047}, {0x0000011C, 0x00000302}, -{0x0000011D, 0x00000067}, {0x0000011D, 0x00000302}, {0x0000011E, 0x00000047}, {0x0000011E, 0x00000306}, -{0x0000011F, 0x00000067}, {0x0000011F, 0x00000306}, {0x00000120, 0x00000047}, {0x00000120, 0x00000307}, -{0x00000121, 0x00000067}, {0x00000121, 0x00000307}, {0x00000122, 0x00000047}, {0x00000122, 0x00000327}, -{0x00000123, 0x00000067}, {0x00000123, 0x00000327}, {0x00000124, 0x00000048}, {0x00000124, 0x00000302}, -{0x00000125, 0x00000068}, {0x00000125, 0x00000302}, {0x00000128, 0x00000049}, {0x00000128, 0x00000303}, -{0x00000129, 0x00000069}, {0x00000129, 0x00000303}, {0x0000012A, 0x00000049}, {0x0000012A, 0x00000304}, -{0x0000012B, 0x00000069}, {0x0000012B, 0x00000304}, {0x0000012C, 0x00000049}, {0x0000012C, 0x00000306}, -{0x0000012D, 0x00000069}, {0x0000012D, 0x00000306}, {0x0000012E, 0x00000049}, {0x0000012E, 0x00000328}, -{0x0000012F, 0x00000069}, {0x0000012F, 0x00000328}, {0x00000130, 0x00000049}, {0x00000130, 0x00000307}, -{0x00000134, 0x0000004A}, {0x00000134, 0x00000302}, {0x00000135, 0x0000006A}, {0x00000135, 0x00000302}, -{0x00000136, 0x0000004B}, {0x00000136, 0x00000327}, {0x00000137, 0x0000006B}, {0x00000137, 0x00000327}, -{0x00000139, 0x0000004C}, {0x00000139, 0x00000301}, {0x0000013A, 0x0000006C}, {0x0000013A, 0x00000301}, -{0x0000013B, 0x0000004C}, {0x0000013B, 0x00000327}, {0x0000013C, 0x0000006C}, {0x0000013C, 0x00000327}, -{0x0000013D, 0x0000004C}, {0x0000013D, 0x0000030C}, {0x0000013E, 0x0000006C}, {0x0000013E, 0x0000030C}, 
-{0x00000143, 0x0000004E}, {0x00000143, 0x00000301}, {0x00000144, 0x0000006E}, {0x00000144, 0x00000301}, -{0x00000145, 0x0000004E}, {0x00000145, 0x00000327}, {0x00000146, 0x0000006E}, {0x00000146, 0x00000327}, -{0x00000147, 0x0000004E}, {0x00000147, 0x0000030C}, {0x00000148, 0x0000006E}, {0x00000148, 0x0000030C}, -{0x0000014C, 0x0000004F}, {0x0000014C, 0x00000304}, {0x0000014D, 0x0000006F}, {0x0000014D, 0x00000304}, -{0x0000014E, 0x0000004F}, {0x0000014E, 0x00000306}, {0x0000014F, 0x0000006F}, {0x0000014F, 0x00000306}, -{0x00000150, 0x0000004F}, {0x00000150, 0x0000030B}, {0x00000151, 0x0000006F}, {0x00000151, 0x0000030B}, -{0x00000154, 0x00000052}, {0x00000154, 0x00000301}, {0x00000155, 0x00000072}, {0x00000155, 0x00000301}, -{0x00000156, 0x00000052}, {0x00000156, 0x00000327}, {0x00000157, 0x00000072}, {0x00000157, 0x00000327}, -{0x00000158, 0x00000052}, {0x00000158, 0x0000030C}, {0x00000159, 0x00000072}, {0x00000159, 0x0000030C}, -{0x0000015A, 0x00000053}, {0x0000015A, 0x00000301}, {0x0000015B, 0x00000073}, {0x0000015B, 0x00000301}, -{0x0000015C, 0x00000053}, {0x0000015C, 0x00000302}, {0x0000015D, 0x00000073}, {0x0000015D, 0x00000302}, -{0x0000015E, 0x00000053}, {0x0000015E, 0x00000327}, {0x0000015F, 0x00000073}, {0x0000015F, 0x00000327}, -{0x00000160, 0x00000053}, {0x00000160, 0x0000030C}, {0x00000161, 0x00000073}, {0x00000161, 0x0000030C}, -{0x00000162, 0x00000054}, {0x00000162, 0x00000327}, {0x00000163, 0x00000074}, {0x00000163, 0x00000327}, -{0x00000164, 0x00000054}, {0x00000164, 0x0000030C}, {0x00000165, 0x00000074}, {0x00000165, 0x0000030C}, -{0x00000168, 0x00000055}, {0x00000168, 0x00000303}, {0x00000169, 0x00000075}, {0x00000169, 0x00000303}, -{0x0000016A, 0x00000055}, {0x0000016A, 0x00000304}, {0x0000016B, 0x00000075}, {0x0000016B, 0x00000304}, -{0x0000016C, 0x00000055}, {0x0000016C, 0x00000306}, {0x0000016D, 0x00000075}, {0x0000016D, 0x00000306}, -{0x0000016E, 0x00000055}, {0x0000016E, 0x0000030A}, {0x0000016F, 0x00000075}, {0x0000016F, 0x0000030A}, 
-{0x00000170, 0x00000055}, {0x00000170, 0x0000030B}, {0x00000171, 0x00000075}, {0x00000171, 0x0000030B}, -{0x00000172, 0x00000055}, {0x00000172, 0x00000328}, {0x00000173, 0x00000075}, {0x00000173, 0x00000328}, -{0x00000174, 0x00000057}, {0x00000174, 0x00000302}, {0x00000175, 0x00000077}, {0x00000175, 0x00000302}, -{0x00000176, 0x00000059}, {0x00000176, 0x00000302}, {0x00000177, 0x00000079}, {0x00000177, 0x00000302}, -{0x00000178, 0x00000059}, {0x00000178, 0x00000308}, {0x00000179, 0x0000005A}, {0x00000179, 0x00000301}, -{0x0000017A, 0x0000007A}, {0x0000017A, 0x00000301}, {0x0000017B, 0x0000005A}, {0x0000017B, 0x00000307}, -{0x0000017C, 0x0000007A}, {0x0000017C, 0x00000307}, {0x0000017D, 0x0000005A}, {0x0000017D, 0x0000030C}, -{0x0000017E, 0x0000007A}, {0x0000017E, 0x0000030C}, {0x000001A0, 0x0000004F}, {0x000001A0, 0x0000031B}, -{0x000001A1, 0x0000006F}, {0x000001A1, 0x0000031B}, {0x000001AF, 0x00000055}, {0x000001AF, 0x0000031B}, -{0x000001B0, 0x00000075}, {0x000001B0, 0x0000031B}, {0x000001CD, 0x00000041}, {0x000001CD, 0x0000030C}, -{0x000001CE, 0x00000061}, {0x000001CE, 0x0000030C}, {0x000001CF, 0x00000049}, {0x000001CF, 0x0000030C}, -{0x000001D0, 0x00000069}, {0x000001D0, 0x0000030C}, {0x000001D1, 0x0000004F}, {0x000001D1, 0x0000030C}, -{0x000001D2, 0x0000006F}, {0x000001D2, 0x0000030C}, {0x000001D3, 0x00000055}, {0x000001D3, 0x0000030C}, -{0x000001D4, 0x00000075}, {0x000001D4, 0x0000030C}, {0x000001D5, 0x00000055}, {0x000001D5, 0x00000308}, -{0x000001D5, 0x00000304}, {0x000001D6, 0x00000075}, {0x000001D6, 0x00000308}, {0x000001D6, 0x00000304}, -{0x000001D7, 0x00000055}, {0x000001D7, 0x00000308}, {0x000001D7, 0x00000301}, {0x000001D8, 0x00000075}, -{0x000001D8, 0x00000308}, {0x000001D8, 0x00000301}, {0x000001D9, 0x00000055}, {0x000001D9, 0x00000308}, -{0x000001D9, 0x0000030C}, {0x000001DA, 0x00000075}, {0x000001DA, 0x00000308}, {0x000001DA, 0x0000030C}, -{0x000001DB, 0x00000055}, {0x000001DB, 0x00000308}, {0x000001DB, 0x00000300}, {0x000001DC, 0x00000075}, 
-{0x000001DC, 0x00000308}, {0x000001DC, 0x00000300}, {0x000001DE, 0x00000041}, {0x000001DE, 0x00000308}, -{0x000001DE, 0x00000304}, {0x000001DF, 0x00000061}, {0x000001DF, 0x00000308}, {0x000001DF, 0x00000304}, -{0x000001E0, 0x00000041}, {0x000001E0, 0x00000307}, {0x000001E0, 0x00000304}, {0x000001E1, 0x00000061}, -{0x000001E1, 0x00000307}, {0x000001E1, 0x00000304}, {0x000001E2, 0x000000C6}, {0x000001E2, 0x00000304}, -{0x000001E3, 0x000000E6}, {0x000001E3, 0x00000304}, {0x000001E6, 0x00000047}, {0x000001E6, 0x0000030C}, -{0x000001E7, 0x00000067}, {0x000001E7, 0x0000030C}, {0x000001E8, 0x0000004B}, {0x000001E8, 0x0000030C}, -{0x000001E9, 0x0000006B}, {0x000001E9, 0x0000030C}, {0x000001EA, 0x0000004F}, {0x000001EA, 0x00000328}, -{0x000001EB, 0x0000006F}, {0x000001EB, 0x00000328}, {0x000001EC, 0x0000004F}, {0x000001EC, 0x00000328}, -{0x000001EC, 0x00000304}, {0x000001ED, 0x0000006F}, {0x000001ED, 0x00000328}, {0x000001ED, 0x00000304}, -{0x000001EE, 0x000001B7}, {0x000001EE, 0x0000030C}, {0x000001EF, 0x00000292}, {0x000001EF, 0x0000030C}, -{0x000001F0, 0x0000006A}, {0x000001F0, 0x0000030C}, {0x000001F4, 0x00000047}, {0x000001F4, 0x00000301}, -{0x000001F5, 0x00000067}, {0x000001F5, 0x00000301}, {0x000001F8, 0x0000004E}, {0x000001F8, 0x00000300}, -{0x000001F9, 0x0000006E}, {0x000001F9, 0x00000300}, {0x000001FA, 0x00000041}, {0x000001FA, 0x0000030A}, -{0x000001FA, 0x00000301}, {0x000001FB, 0x00000061}, {0x000001FB, 0x0000030A}, {0x000001FB, 0x00000301}, -{0x000001FC, 0x000000C6}, {0x000001FC, 0x00000301}, {0x000001FD, 0x000000E6}, {0x000001FD, 0x00000301}, -{0x000001FE, 0x000000D8}, {0x000001FE, 0x00000301}, {0x000001FF, 0x000000F8}, {0x000001FF, 0x00000301}, -{0x00000200, 0x00000041}, {0x00000200, 0x0000030F}, {0x00000201, 0x00000061}, {0x00000201, 0x0000030F}, -{0x00000202, 0x00000041}, {0x00000202, 0x00000311}, {0x00000203, 0x00000061}, {0x00000203, 0x00000311}, -{0x00000204, 0x00000045}, {0x00000204, 0x0000030F}, {0x00000205, 0x00000065}, {0x00000205, 0x0000030F}, 
-{0x00000206, 0x00000045}, {0x00000206, 0x00000311}, {0x00000207, 0x00000065}, {0x00000207, 0x00000311}, -{0x00000208, 0x00000049}, {0x00000208, 0x0000030F}, {0x00000209, 0x00000069}, {0x00000209, 0x0000030F}, -{0x0000020A, 0x00000049}, {0x0000020A, 0x00000311}, {0x0000020B, 0x00000069}, {0x0000020B, 0x00000311}, -{0x0000020C, 0x0000004F}, {0x0000020C, 0x0000030F}, {0x0000020D, 0x0000006F}, {0x0000020D, 0x0000030F}, -{0x0000020E, 0x0000004F}, {0x0000020E, 0x00000311}, {0x0000020F, 0x0000006F}, {0x0000020F, 0x00000311}, -{0x00000210, 0x00000052}, {0x00000210, 0x0000030F}, {0x00000211, 0x00000072}, {0x00000211, 0x0000030F}, -{0x00000212, 0x00000052}, {0x00000212, 0x00000311}, {0x00000213, 0x00000072}, {0x00000213, 0x00000311}, -{0x00000214, 0x00000055}, {0x00000214, 0x0000030F}, {0x00000215, 0x00000075}, {0x00000215, 0x0000030F}, -{0x00000216, 0x00000055}, {0x00000216, 0x00000311}, {0x00000217, 0x00000075}, {0x00000217, 0x00000311}, -{0x00000218, 0x00000053}, {0x00000218, 0x00000326}, {0x00000219, 0x00000073}, {0x00000219, 0x00000326}, -{0x0000021A, 0x00000054}, {0x0000021A, 0x00000326}, {0x0000021B, 0x00000074}, {0x0000021B, 0x00000326}, -{0x0000021E, 0x00000048}, {0x0000021E, 0x0000030C}, {0x0000021F, 0x00000068}, {0x0000021F, 0x0000030C}, -{0x00000226, 0x00000041}, {0x00000226, 0x00000307}, {0x00000227, 0x00000061}, {0x00000227, 0x00000307}, -{0x00000228, 0x00000045}, {0x00000228, 0x00000327}, {0x00000229, 0x00000065}, {0x00000229, 0x00000327}, -{0x0000022A, 0x0000004F}, {0x0000022A, 0x00000308}, {0x0000022A, 0x00000304}, {0x0000022B, 0x0000006F}, -{0x0000022B, 0x00000308}, {0x0000022B, 0x00000304}, {0x0000022C, 0x0000004F}, {0x0000022C, 0x00000303}, -{0x0000022C, 0x00000304}, {0x0000022D, 0x0000006F}, {0x0000022D, 0x00000303}, {0x0000022D, 0x00000304}, -{0x0000022E, 0x0000004F}, {0x0000022E, 0x00000307}, {0x0000022F, 0x0000006F}, {0x0000022F, 0x00000307}, -{0x00000230, 0x0000004F}, {0x00000230, 0x00000307}, {0x00000230, 0x00000304}, {0x00000231, 0x0000006F}, 
-{0x00000231, 0x00000307}, {0x00000231, 0x00000304}, {0x00000232, 0x00000059}, {0x00000232, 0x00000304}, -{0x00000233, 0x00000079}, {0x00000233, 0x00000304}, {0x00000340, 0x00000300}, {0x00000341, 0x00000301}, -{0x00000343, 0x00000313}, {0x00000344, 0x00000308}, {0x00000344, 0x00000301}, {0x00000374, 0x000002B9}, -{0x0000037E, 0x0000003B}, {0x00000385, 0x000000A8}, {0x00000385, 0x00000301}, {0x00000386, 0x00000391}, -{0x00000386, 0x00000301}, {0x00000387, 0x000000B7}, {0x00000388, 0x00000395}, {0x00000388, 0x00000301}, -{0x00000389, 0x00000397}, {0x00000389, 0x00000301}, {0x0000038A, 0x00000399}, {0x0000038A, 0x00000301}, -{0x0000038C, 0x0000039F}, {0x0000038C, 0x00000301}, {0x0000038E, 0x000003A5}, {0x0000038E, 0x00000301}, -{0x0000038F, 0x000003A9}, {0x0000038F, 0x00000301}, {0x00000390, 0x000003B9}, {0x00000390, 0x00000308}, -{0x00000390, 0x00000301}, {0x000003AA, 0x00000399}, {0x000003AA, 0x00000308}, {0x000003AB, 0x000003A5}, -{0x000003AB, 0x00000308}, {0x000003AC, 0x000003B1}, {0x000003AC, 0x00000301}, {0x000003AD, 0x000003B5}, -{0x000003AD, 0x00000301}, {0x000003AE, 0x000003B7}, {0x000003AE, 0x00000301}, {0x000003AF, 0x000003B9}, -{0x000003AF, 0x00000301}, {0x000003B0, 0x000003C5}, {0x000003B0, 0x00000308}, {0x000003B0, 0x00000301}, -{0x000003CA, 0x000003B9}, {0x000003CA, 0x00000308}, {0x000003CB, 0x000003C5}, {0x000003CB, 0x00000308}, -{0x000003CC, 0x000003BF}, {0x000003CC, 0x00000301}, {0x000003CD, 0x000003C5}, {0x000003CD, 0x00000301}, -{0x000003CE, 0x000003C9}, {0x000003CE, 0x00000301}, {0x000003D3, 0x000003D2}, {0x000003D3, 0x00000301}, -{0x000003D4, 0x000003D2}, {0x000003D4, 0x00000308}, {0x00000400, 0x00000415}, {0x00000400, 0x00000300}, -{0x00000401, 0x00000415}, {0x00000401, 0x00000308}, {0x00000403, 0x00000413}, {0x00000403, 0x00000301}, -{0x00000407, 0x00000406}, {0x00000407, 0x00000308}, {0x0000040C, 0x0000041A}, {0x0000040C, 0x00000301}, -{0x0000040D, 0x00000418}, {0x0000040D, 0x00000300}, {0x0000040E, 0x00000423}, {0x0000040E, 0x00000306}, 
-{0x00000419, 0x00000418}, {0x00000419, 0x00000306}, {0x00000439, 0x00000438}, {0x00000439, 0x00000306}, -{0x00000450, 0x00000435}, {0x00000450, 0x00000300}, {0x00000451, 0x00000435}, {0x00000451, 0x00000308}, -{0x00000453, 0x00000433}, {0x00000453, 0x00000301}, {0x00000457, 0x00000456}, {0x00000457, 0x00000308}, -{0x0000045C, 0x0000043A}, {0x0000045C, 0x00000301}, {0x0000045D, 0x00000438}, {0x0000045D, 0x00000300}, -{0x0000045E, 0x00000443}, {0x0000045E, 0x00000306}, {0x00000476, 0x00000474}, {0x00000476, 0x0000030F}, -{0x00000477, 0x00000475}, {0x00000477, 0x0000030F}, {0x000004C1, 0x00000416}, {0x000004C1, 0x00000306}, -{0x000004C2, 0x00000436}, {0x000004C2, 0x00000306}, {0x000004D0, 0x00000410}, {0x000004D0, 0x00000306}, -{0x000004D1, 0x00000430}, {0x000004D1, 0x00000306}, {0x000004D2, 0x00000410}, {0x000004D2, 0x00000308}, -{0x000004D3, 0x00000430}, {0x000004D3, 0x00000308}, {0x000004D6, 0x00000415}, {0x000004D6, 0x00000306}, -{0x000004D7, 0x00000435}, {0x000004D7, 0x00000306}, {0x000004DA, 0x000004D8}, {0x000004DA, 0x00000308}, -{0x000004DB, 0x000004D9}, {0x000004DB, 0x00000308}, {0x000004DC, 0x00000416}, {0x000004DC, 0x00000308}, -{0x000004DD, 0x00000436}, {0x000004DD, 0x00000308}, {0x000004DE, 0x00000417}, {0x000004DE, 0x00000308}, -{0x000004DF, 0x00000437}, {0x000004DF, 0x00000308}, {0x000004E2, 0x00000418}, {0x000004E2, 0x00000304}, -{0x000004E3, 0x00000438}, {0x000004E3, 0x00000304}, {0x000004E4, 0x00000418}, {0x000004E4, 0x00000308}, -{0x000004E5, 0x00000438}, {0x000004E5, 0x00000308}, {0x000004E6, 0x0000041E}, {0x000004E6, 0x00000308}, -{0x000004E7, 0x0000043E}, {0x000004E7, 0x00000308}, {0x000004EA, 0x000004E8}, {0x000004EA, 0x00000308}, -{0x000004EB, 0x000004E9}, {0x000004EB, 0x00000308}, {0x000004EC, 0x0000042D}, {0x000004EC, 0x00000308}, -{0x000004ED, 0x0000044D}, {0x000004ED, 0x00000308}, {0x000004EE, 0x00000423}, {0x000004EE, 0x00000304}, -{0x000004EF, 0x00000443}, {0x000004EF, 0x00000304}, {0x000004F0, 0x00000423}, {0x000004F0, 0x00000308}, 
-{0x000004F1, 0x00000443}, {0x000004F1, 0x00000308}, {0x000004F2, 0x00000423}, {0x000004F2, 0x0000030B}, -{0x000004F3, 0x00000443}, {0x000004F3, 0x0000030B}, {0x000004F4, 0x00000427}, {0x000004F4, 0x00000308}, -{0x000004F5, 0x00000447}, {0x000004F5, 0x00000308}, {0x000004F8, 0x0000042B}, {0x000004F8, 0x00000308}, -{0x000004F9, 0x0000044B}, {0x000004F9, 0x00000308}, {0x00000622, 0x00000627}, {0x00000622, 0x00000653}, -{0x00000623, 0x00000627}, {0x00000623, 0x00000654}, {0x00000624, 0x00000648}, {0x00000624, 0x00000654}, -{0x00000625, 0x00000627}, {0x00000625, 0x00000655}, {0x00000626, 0x0000064A}, {0x00000626, 0x00000654}, -{0x000006C0, 0x000006D5}, {0x000006C0, 0x00000654}, {0x000006C2, 0x000006C1}, {0x000006C2, 0x00000654}, -{0x000006D3, 0x000006D2}, {0x000006D3, 0x00000654}, {0x00000929, 0x00000928}, {0x00000929, 0x0000093C}, -{0x00000931, 0x00000930}, {0x00000931, 0x0000093C}, {0x00000934, 0x00000933}, {0x00000934, 0x0000093C}, -{0x00000958, 0x00000915}, {0x00000958, 0x0000093C}, {0x00000959, 0x00000916}, {0x00000959, 0x0000093C}, -{0x0000095A, 0x00000917}, {0x0000095A, 0x0000093C}, {0x0000095B, 0x0000091C}, {0x0000095B, 0x0000093C}, -{0x0000095C, 0x00000921}, {0x0000095C, 0x0000093C}, {0x0000095D, 0x00000922}, {0x0000095D, 0x0000093C}, -{0x0000095E, 0x0000092B}, {0x0000095E, 0x0000093C}, {0x0000095F, 0x0000092F}, {0x0000095F, 0x0000093C}, -{0x000009CB, 0x000009C7}, {0x000009CB, 0x000009BE}, {0x000009CC, 0x000009C7}, {0x000009CC, 0x000009D7}, -{0x000009DC, 0x000009A1}, {0x000009DC, 0x000009BC}, {0x000009DD, 0x000009A2}, {0x000009DD, 0x000009BC}, -{0x000009DF, 0x000009AF}, {0x000009DF, 0x000009BC}, {0x00000A33, 0x00000A32}, {0x00000A33, 0x00000A3C}, -{0x00000A36, 0x00000A38}, {0x00000A36, 0x00000A3C}, {0x00000A59, 0x00000A16}, {0x00000A59, 0x00000A3C}, -{0x00000A5A, 0x00000A17}, {0x00000A5A, 0x00000A3C}, {0x00000A5B, 0x00000A1C}, {0x00000A5B, 0x00000A3C}, -{0x00000A5E, 0x00000A2B}, {0x00000A5E, 0x00000A3C}, {0x00000B48, 0x00000B47}, {0x00000B48, 0x00000B56}, 
-{0x00000B4B, 0x00000B47}, {0x00000B4B, 0x00000B3E}, {0x00000B4C, 0x00000B47}, {0x00000B4C, 0x00000B57}, -{0x00000B5C, 0x00000B21}, {0x00000B5C, 0x00000B3C}, {0x00000B5D, 0x00000B22}, {0x00000B5D, 0x00000B3C}, -{0x00000B94, 0x00000B92}, {0x00000B94, 0x00000BD7}, {0x00000BCA, 0x00000BC6}, {0x00000BCA, 0x00000BBE}, -{0x00000BCB, 0x00000BC7}, {0x00000BCB, 0x00000BBE}, {0x00000BCC, 0x00000BC6}, {0x00000BCC, 0x00000BD7}, -{0x00000C48, 0x00000C46}, {0x00000C48, 0x00000C56}, {0x00000CC0, 0x00000CBF}, {0x00000CC0, 0x00000CD5}, -{0x00000CC7, 0x00000CC6}, {0x00000CC7, 0x00000CD5}, {0x00000CC8, 0x00000CC6}, {0x00000CC8, 0x00000CD6}, -{0x00000CCA, 0x00000CC6}, {0x00000CCA, 0x00000CC2}, {0x00000CCB, 0x00000CC6}, {0x00000CCB, 0x00000CC2}, -{0x00000CCB, 0x00000CD5}, {0x00000D4A, 0x00000D46}, {0x00000D4A, 0x00000D3E}, {0x00000D4B, 0x00000D47}, -{0x00000D4B, 0x00000D3E}, {0x00000D4C, 0x00000D46}, {0x00000D4C, 0x00000D57}, {0x00000DDA, 0x00000DD9}, -{0x00000DDA, 0x00000DCA}, {0x00000DDC, 0x00000DD9}, {0x00000DDC, 0x00000DCF}, {0x00000DDD, 0x00000DD9}, -{0x00000DDD, 0x00000DCF}, {0x00000DDD, 0x00000DCA}, {0x00000DDE, 0x00000DD9}, {0x00000DDE, 0x00000DDF}, -{0x00000F43, 0x00000F42}, {0x00000F43, 0x00000FB7}, {0x00000F4D, 0x00000F4C}, {0x00000F4D, 0x00000FB7}, -{0x00000F52, 0x00000F51}, {0x00000F52, 0x00000FB7}, {0x00000F57, 0x00000F56}, {0x00000F57, 0x00000FB7}, -{0x00000F5C, 0x00000F5B}, {0x00000F5C, 0x00000FB7}, {0x00000F69, 0x00000F40}, {0x00000F69, 0x00000FB5}, -{0x00000F73, 0x00000F71}, {0x00000F73, 0x00000F72}, {0x00000F75, 0x00000F71}, {0x00000F75, 0x00000F74}, -{0x00000F76, 0x00000FB2}, {0x00000F76, 0x00000F80}, {0x00000F78, 0x00000FB3}, {0x00000F78, 0x00000F80}, -{0x00000F81, 0x00000F71}, {0x00000F81, 0x00000F80}, {0x00000F93, 0x00000F92}, {0x00000F93, 0x00000FB7}, -{0x00000F9D, 0x00000F9C}, {0x00000F9D, 0x00000FB7}, {0x00000FA2, 0x00000FA1}, {0x00000FA2, 0x00000FB7}, -{0x00000FA7, 0x00000FA6}, {0x00000FA7, 0x00000FB7}, {0x00000FAC, 0x00000FAB}, {0x00000FAC, 0x00000FB7}, 
-{0x00000FB9, 0x00000F90}, {0x00000FB9, 0x00000FB5}, {0x00001026, 0x00001025}, {0x00001026, 0x0000102E}, -{0x00001B06, 0x00001B05}, {0x00001B06, 0x00001B35}, {0x00001B08, 0x00001B07}, {0x00001B08, 0x00001B35}, -{0x00001B0A, 0x00001B09}, {0x00001B0A, 0x00001B35}, {0x00001B0C, 0x00001B0B}, {0x00001B0C, 0x00001B35}, -{0x00001B0E, 0x00001B0D}, {0x00001B0E, 0x00001B35}, {0x00001B12, 0x00001B11}, {0x00001B12, 0x00001B35}, -{0x00001B3B, 0x00001B3A}, {0x00001B3B, 0x00001B35}, {0x00001B3D, 0x00001B3C}, {0x00001B3D, 0x00001B35}, -{0x00001B40, 0x00001B3E}, {0x00001B40, 0x00001B35}, {0x00001B41, 0x00001B3F}, {0x00001B41, 0x00001B35}, -{0x00001B43, 0x00001B42}, {0x00001B43, 0x00001B35}, {0x00001E00, 0x00000041}, {0x00001E00, 0x00000325}, -{0x00001E01, 0x00000061}, {0x00001E01, 0x00000325}, {0x00001E02, 0x00000042}, {0x00001E02, 0x00000307}, -{0x00001E03, 0x00000062}, {0x00001E03, 0x00000307}, {0x00001E04, 0x00000042}, {0x00001E04, 0x00000323}, -{0x00001E05, 0x00000062}, {0x00001E05, 0x00000323}, {0x00001E06, 0x00000042}, {0x00001E06, 0x00000331}, -{0x00001E07, 0x00000062}, {0x00001E07, 0x00000331}, {0x00001E08, 0x00000043}, {0x00001E08, 0x00000327}, -{0x00001E08, 0x00000301}, {0x00001E09, 0x00000063}, {0x00001E09, 0x00000327}, {0x00001E09, 0x00000301}, -{0x00001E0A, 0x00000044}, {0x00001E0A, 0x00000307}, {0x00001E0B, 0x00000064}, {0x00001E0B, 0x00000307}, -{0x00001E0C, 0x00000044}, {0x00001E0C, 0x00000323}, {0x00001E0D, 0x00000064}, {0x00001E0D, 0x00000323}, -{0x00001E0E, 0x00000044}, {0x00001E0E, 0x00000331}, {0x00001E0F, 0x00000064}, {0x00001E0F, 0x00000331}, -{0x00001E10, 0x00000044}, {0x00001E10, 0x00000327}, {0x00001E11, 0x00000064}, {0x00001E11, 0x00000327}, -{0x00001E12, 0x00000044}, {0x00001E12, 0x0000032D}, {0x00001E13, 0x00000064}, {0x00001E13, 0x0000032D}, -{0x00001E14, 0x00000045}, {0x00001E14, 0x00000304}, {0x00001E14, 0x00000300}, {0x00001E15, 0x00000065}, -{0x00001E15, 0x00000304}, {0x00001E15, 0x00000300}, {0x00001E16, 0x00000045}, {0x00001E16, 0x00000304}, 
-{0x00001E16, 0x00000301}, {0x00001E17, 0x00000065}, {0x00001E17, 0x00000304}, {0x00001E17, 0x00000301}, -{0x00001E18, 0x00000045}, {0x00001E18, 0x0000032D}, {0x00001E19, 0x00000065}, {0x00001E19, 0x0000032D}, -{0x00001E1A, 0x00000045}, {0x00001E1A, 0x00000330}, {0x00001E1B, 0x00000065}, {0x00001E1B, 0x00000330}, -{0x00001E1C, 0x00000045}, {0x00001E1C, 0x00000327}, {0x00001E1C, 0x00000306}, {0x00001E1D, 0x00000065}, -{0x00001E1D, 0x00000327}, {0x00001E1D, 0x00000306}, {0x00001E1E, 0x00000046}, {0x00001E1E, 0x00000307}, -{0x00001E1F, 0x00000066}, {0x00001E1F, 0x00000307}, {0x00001E20, 0x00000047}, {0x00001E20, 0x00000304}, -{0x00001E21, 0x00000067}, {0x00001E21, 0x00000304}, {0x00001E22, 0x00000048}, {0x00001E22, 0x00000307}, -{0x00001E23, 0x00000068}, {0x00001E23, 0x00000307}, {0x00001E24, 0x00000048}, {0x00001E24, 0x00000323}, -{0x00001E25, 0x00000068}, {0x00001E25, 0x00000323}, {0x00001E26, 0x00000048}, {0x00001E26, 0x00000308}, -{0x00001E27, 0x00000068}, {0x00001E27, 0x00000308}, {0x00001E28, 0x00000048}, {0x00001E28, 0x00000327}, -{0x00001E29, 0x00000068}, {0x00001E29, 0x00000327}, {0x00001E2A, 0x00000048}, {0x00001E2A, 0x0000032E}, -{0x00001E2B, 0x00000068}, {0x00001E2B, 0x0000032E}, {0x00001E2C, 0x00000049}, {0x00001E2C, 0x00000330}, -{0x00001E2D, 0x00000069}, {0x00001E2D, 0x00000330}, {0x00001E2E, 0x00000049}, {0x00001E2E, 0x00000308}, -{0x00001E2E, 0x00000301}, {0x00001E2F, 0x00000069}, {0x00001E2F, 0x00000308}, {0x00001E2F, 0x00000301}, -{0x00001E30, 0x0000004B}, {0x00001E30, 0x00000301}, {0x00001E31, 0x0000006B}, {0x00001E31, 0x00000301}, -{0x00001E32, 0x0000004B}, {0x00001E32, 0x00000323}, {0x00001E33, 0x0000006B}, {0x00001E33, 0x00000323}, -{0x00001E34, 0x0000004B}, {0x00001E34, 0x00000331}, {0x00001E35, 0x0000006B}, {0x00001E35, 0x00000331}, -{0x00001E36, 0x0000004C}, {0x00001E36, 0x00000323}, {0x00001E37, 0x0000006C}, {0x00001E37, 0x00000323}, -{0x00001E38, 0x0000004C}, {0x00001E38, 0x00000323}, {0x00001E38, 0x00000304}, {0x00001E39, 0x0000006C}, 
-{0x00001E39, 0x00000323}, {0x00001E39, 0x00000304}, {0x00001E3A, 0x0000004C}, {0x00001E3A, 0x00000331}, -{0x00001E3B, 0x0000006C}, {0x00001E3B, 0x00000331}, {0x00001E3C, 0x0000004C}, {0x00001E3C, 0x0000032D}, -{0x00001E3D, 0x0000006C}, {0x00001E3D, 0x0000032D}, {0x00001E3E, 0x0000004D}, {0x00001E3E, 0x00000301}, -{0x00001E3F, 0x0000006D}, {0x00001E3F, 0x00000301}, {0x00001E40, 0x0000004D}, {0x00001E40, 0x00000307}, -{0x00001E41, 0x0000006D}, {0x00001E41, 0x00000307}, {0x00001E42, 0x0000004D}, {0x00001E42, 0x00000323}, -{0x00001E43, 0x0000006D}, {0x00001E43, 0x00000323}, {0x00001E44, 0x0000004E}, {0x00001E44, 0x00000307}, -{0x00001E45, 0x0000006E}, {0x00001E45, 0x00000307}, {0x00001E46, 0x0000004E}, {0x00001E46, 0x00000323}, -{0x00001E47, 0x0000006E}, {0x00001E47, 0x00000323}, {0x00001E48, 0x0000004E}, {0x00001E48, 0x00000331}, -{0x00001E49, 0x0000006E}, {0x00001E49, 0x00000331}, {0x00001E4A, 0x0000004E}, {0x00001E4A, 0x0000032D}, -{0x00001E4B, 0x0000006E}, {0x00001E4B, 0x0000032D}, {0x00001E4C, 0x0000004F}, {0x00001E4C, 0x00000303}, -{0x00001E4C, 0x00000301}, {0x00001E4D, 0x0000006F}, {0x00001E4D, 0x00000303}, {0x00001E4D, 0x00000301}, -{0x00001E4E, 0x0000004F}, {0x00001E4E, 0x00000303}, {0x00001E4E, 0x00000308}, {0x00001E4F, 0x0000006F}, -{0x00001E4F, 0x00000303}, {0x00001E4F, 0x00000308}, {0x00001E50, 0x0000004F}, {0x00001E50, 0x00000304}, -{0x00001E50, 0x00000300}, {0x00001E51, 0x0000006F}, {0x00001E51, 0x00000304}, {0x00001E51, 0x00000300}, -{0x00001E52, 0x0000004F}, {0x00001E52, 0x00000304}, {0x00001E52, 0x00000301}, {0x00001E53, 0x0000006F}, -{0x00001E53, 0x00000304}, {0x00001E53, 0x00000301}, {0x00001E54, 0x00000050}, {0x00001E54, 0x00000301}, -{0x00001E55, 0x00000070}, {0x00001E55, 0x00000301}, {0x00001E56, 0x00000050}, {0x00001E56, 0x00000307}, -{0x00001E57, 0x00000070}, {0x00001E57, 0x00000307}, {0x00001E58, 0x00000052}, {0x00001E58, 0x00000307}, -{0x00001E59, 0x00000072}, {0x00001E59, 0x00000307}, {0x00001E5A, 0x00000052}, {0x00001E5A, 0x00000323}, 
-{0x00001E5B, 0x00000072}, {0x00001E5B, 0x00000323}, {0x00001E5C, 0x00000052}, {0x00001E5C, 0x00000323}, -{0x00001E5C, 0x00000304}, {0x00001E5D, 0x00000072}, {0x00001E5D, 0x00000323}, {0x00001E5D, 0x00000304}, -{0x00001E5E, 0x00000052}, {0x00001E5E, 0x00000331}, {0x00001E5F, 0x00000072}, {0x00001E5F, 0x00000331}, -{0x00001E60, 0x00000053}, {0x00001E60, 0x00000307}, {0x00001E61, 0x00000073}, {0x00001E61, 0x00000307}, -{0x00001E62, 0x00000053}, {0x00001E62, 0x00000323}, {0x00001E63, 0x00000073}, {0x00001E63, 0x00000323}, -{0x00001E64, 0x00000053}, {0x00001E64, 0x00000301}, {0x00001E64, 0x00000307}, {0x00001E65, 0x00000073}, -{0x00001E65, 0x00000301}, {0x00001E65, 0x00000307}, {0x00001E66, 0x00000053}, {0x00001E66, 0x0000030C}, -{0x00001E66, 0x00000307}, {0x00001E67, 0x00000073}, {0x00001E67, 0x0000030C}, {0x00001E67, 0x00000307}, -{0x00001E68, 0x00000053}, {0x00001E68, 0x00000323}, {0x00001E68, 0x00000307}, {0x00001E69, 0x00000073}, -{0x00001E69, 0x00000323}, {0x00001E69, 0x00000307}, {0x00001E6A, 0x00000054}, {0x00001E6A, 0x00000307}, -{0x00001E6B, 0x00000074}, {0x00001E6B, 0x00000307}, {0x00001E6C, 0x00000054}, {0x00001E6C, 0x00000323}, -{0x00001E6D, 0x00000074}, {0x00001E6D, 0x00000323}, {0x00001E6E, 0x00000054}, {0x00001E6E, 0x00000331}, -{0x00001E6F, 0x00000074}, {0x00001E6F, 0x00000331}, {0x00001E70, 0x00000054}, {0x00001E70, 0x0000032D}, -{0x00001E71, 0x00000074}, {0x00001E71, 0x0000032D}, {0x00001E72, 0x00000055}, {0x00001E72, 0x00000324}, -{0x00001E73, 0x00000075}, {0x00001E73, 0x00000324}, {0x00001E74, 0x00000055}, {0x00001E74, 0x00000330}, -{0x00001E75, 0x00000075}, {0x00001E75, 0x00000330}, {0x00001E76, 0x00000055}, {0x00001E76, 0x0000032D}, -{0x00001E77, 0x00000075}, {0x00001E77, 0x0000032D}, {0x00001E78, 0x00000055}, {0x00001E78, 0x00000303}, -{0x00001E78, 0x00000301}, {0x00001E79, 0x00000075}, {0x00001E79, 0x00000303}, {0x00001E79, 0x00000301}, -{0x00001E7A, 0x00000055}, {0x00001E7A, 0x00000304}, {0x00001E7A, 0x00000308}, {0x00001E7B, 0x00000075}, 
-{0x00001E7B, 0x00000304}, {0x00001E7B, 0x00000308}, {0x00001E7C, 0x00000056}, {0x00001E7C, 0x00000303}, -{0x00001E7D, 0x00000076}, {0x00001E7D, 0x00000303}, {0x00001E7E, 0x00000056}, {0x00001E7E, 0x00000323}, -{0x00001E7F, 0x00000076}, {0x00001E7F, 0x00000323}, {0x00001E80, 0x00000057}, {0x00001E80, 0x00000300}, -{0x00001E81, 0x00000077}, {0x00001E81, 0x00000300}, {0x00001E82, 0x00000057}, {0x00001E82, 0x00000301}, -{0x00001E83, 0x00000077}, {0x00001E83, 0x00000301}, {0x00001E84, 0x00000057}, {0x00001E84, 0x00000308}, -{0x00001E85, 0x00000077}, {0x00001E85, 0x00000308}, {0x00001E86, 0x00000057}, {0x00001E86, 0x00000307}, -{0x00001E87, 0x00000077}, {0x00001E87, 0x00000307}, {0x00001E88, 0x00000057}, {0x00001E88, 0x00000323}, -{0x00001E89, 0x00000077}, {0x00001E89, 0x00000323}, {0x00001E8A, 0x00000058}, {0x00001E8A, 0x00000307}, -{0x00001E8B, 0x00000078}, {0x00001E8B, 0x00000307}, {0x00001E8C, 0x00000058}, {0x00001E8C, 0x00000308}, -{0x00001E8D, 0x00000078}, {0x00001E8D, 0x00000308}, {0x00001E8E, 0x00000059}, {0x00001E8E, 0x00000307}, -{0x00001E8F, 0x00000079}, {0x00001E8F, 0x00000307}, {0x00001E90, 0x0000005A}, {0x00001E90, 0x00000302}, -{0x00001E91, 0x0000007A}, {0x00001E91, 0x00000302}, {0x00001E92, 0x0000005A}, {0x00001E92, 0x00000323}, -{0x00001E93, 0x0000007A}, {0x00001E93, 0x00000323}, {0x00001E94, 0x0000005A}, {0x00001E94, 0x00000331}, -{0x00001E95, 0x0000007A}, {0x00001E95, 0x00000331}, {0x00001E96, 0x00000068}, {0x00001E96, 0x00000331}, -{0x00001E97, 0x00000074}, {0x00001E97, 0x00000308}, {0x00001E98, 0x00000077}, {0x00001E98, 0x0000030A}, -{0x00001E99, 0x00000079}, {0x00001E99, 0x0000030A}, {0x00001E9B, 0x0000017F}, {0x00001E9B, 0x00000307}, -{0x00001EA0, 0x00000041}, {0x00001EA0, 0x00000323}, {0x00001EA1, 0x00000061}, {0x00001EA1, 0x00000323}, -{0x00001EA2, 0x00000041}, {0x00001EA2, 0x00000309}, {0x00001EA3, 0x00000061}, {0x00001EA3, 0x00000309}, -{0x00001EA4, 0x00000041}, {0x00001EA4, 0x00000302}, {0x00001EA4, 0x00000301}, {0x00001EA5, 0x00000061}, 
-{0x00001EA5, 0x00000302}, {0x00001EA5, 0x00000301}, {0x00001EA6, 0x00000041}, {0x00001EA6, 0x00000302}, -{0x00001EA6, 0x00000300}, {0x00001EA7, 0x00000061}, {0x00001EA7, 0x00000302}, {0x00001EA7, 0x00000300}, -{0x00001EA8, 0x00000041}, {0x00001EA8, 0x00000302}, {0x00001EA8, 0x00000309}, {0x00001EA9, 0x00000061}, -{0x00001EA9, 0x00000302}, {0x00001EA9, 0x00000309}, {0x00001EAA, 0x00000041}, {0x00001EAA, 0x00000302}, -{0x00001EAA, 0x00000303}, {0x00001EAB, 0x00000061}, {0x00001EAB, 0x00000302}, {0x00001EAB, 0x00000303}, -{0x00001EAC, 0x00000041}, {0x00001EAC, 0x00000323}, {0x00001EAC, 0x00000302}, {0x00001EAD, 0x00000061}, -{0x00001EAD, 0x00000323}, {0x00001EAD, 0x00000302}, {0x00001EAE, 0x00000041}, {0x00001EAE, 0x00000306}, -{0x00001EAE, 0x00000301}, {0x00001EAF, 0x00000061}, {0x00001EAF, 0x00000306}, {0x00001EAF, 0x00000301}, -{0x00001EB0, 0x00000041}, {0x00001EB0, 0x00000306}, {0x00001EB0, 0x00000300}, {0x00001EB1, 0x00000061}, -{0x00001EB1, 0x00000306}, {0x00001EB1, 0x00000300}, {0x00001EB2, 0x00000041}, {0x00001EB2, 0x00000306}, -{0x00001EB2, 0x00000309}, {0x00001EB3, 0x00000061}, {0x00001EB3, 0x00000306}, {0x00001EB3, 0x00000309}, -{0x00001EB4, 0x00000041}, {0x00001EB4, 0x00000306}, {0x00001EB4, 0x00000303}, {0x00001EB5, 0x00000061}, -{0x00001EB5, 0x00000306}, {0x00001EB5, 0x00000303}, {0x00001EB6, 0x00000041}, {0x00001EB6, 0x00000323}, -{0x00001EB6, 0x00000306}, {0x00001EB7, 0x00000061}, {0x00001EB7, 0x00000323}, {0x00001EB7, 0x00000306}, -{0x00001EB8, 0x00000045}, {0x00001EB8, 0x00000323}, {0x00001EB9, 0x00000065}, {0x00001EB9, 0x00000323}, -{0x00001EBA, 0x00000045}, {0x00001EBA, 0x00000309}, {0x00001EBB, 0x00000065}, {0x00001EBB, 0x00000309}, -{0x00001EBC, 0x00000045}, {0x00001EBC, 0x00000303}, {0x00001EBD, 0x00000065}, {0x00001EBD, 0x00000303}, -{0x00001EBE, 0x00000045}, {0x00001EBE, 0x00000302}, {0x00001EBE, 0x00000301}, {0x00001EBF, 0x00000065}, -{0x00001EBF, 0x00000302}, {0x00001EBF, 0x00000301}, {0x00001EC0, 0x00000045}, {0x00001EC0, 0x00000302}, 
-{0x00001EC0, 0x00000300}, {0x00001EC1, 0x00000065}, {0x00001EC1, 0x00000302}, {0x00001EC1, 0x00000300}, -{0x00001EC2, 0x00000045}, {0x00001EC2, 0x00000302}, {0x00001EC2, 0x00000309}, {0x00001EC3, 0x00000065}, -{0x00001EC3, 0x00000302}, {0x00001EC3, 0x00000309}, {0x00001EC4, 0x00000045}, {0x00001EC4, 0x00000302}, -{0x00001EC4, 0x00000303}, {0x00001EC5, 0x00000065}, {0x00001EC5, 0x00000302}, {0x00001EC5, 0x00000303}, -{0x00001EC6, 0x00000045}, {0x00001EC6, 0x00000323}, {0x00001EC6, 0x00000302}, {0x00001EC7, 0x00000065}, -{0x00001EC7, 0x00000323}, {0x00001EC7, 0x00000302}, {0x00001EC8, 0x00000049}, {0x00001EC8, 0x00000309}, -{0x00001EC9, 0x00000069}, {0x00001EC9, 0x00000309}, {0x00001ECA, 0x00000049}, {0x00001ECA, 0x00000323}, -{0x00001ECB, 0x00000069}, {0x00001ECB, 0x00000323}, {0x00001ECC, 0x0000004F}, {0x00001ECC, 0x00000323}, -{0x00001ECD, 0x0000006F}, {0x00001ECD, 0x00000323}, {0x00001ECE, 0x0000004F}, {0x00001ECE, 0x00000309}, -{0x00001ECF, 0x0000006F}, {0x00001ECF, 0x00000309}, {0x00001ED0, 0x0000004F}, {0x00001ED0, 0x00000302}, -{0x00001ED0, 0x00000301}, {0x00001ED1, 0x0000006F}, {0x00001ED1, 0x00000302}, {0x00001ED1, 0x00000301}, -{0x00001ED2, 0x0000004F}, {0x00001ED2, 0x00000302}, {0x00001ED2, 0x00000300}, {0x00001ED3, 0x0000006F}, -{0x00001ED3, 0x00000302}, {0x00001ED3, 0x00000300}, {0x00001ED4, 0x0000004F}, {0x00001ED4, 0x00000302}, -{0x00001ED4, 0x00000309}, {0x00001ED5, 0x0000006F}, {0x00001ED5, 0x00000302}, {0x00001ED5, 0x00000309}, -{0x00001ED6, 0x0000004F}, {0x00001ED6, 0x00000302}, {0x00001ED6, 0x00000303}, {0x00001ED7, 0x0000006F}, -{0x00001ED7, 0x00000302}, {0x00001ED7, 0x00000303}, {0x00001ED8, 0x0000004F}, {0x00001ED8, 0x00000323}, -{0x00001ED8, 0x00000302}, {0x00001ED9, 0x0000006F}, {0x00001ED9, 0x00000323}, {0x00001ED9, 0x00000302}, -{0x00001EDA, 0x0000004F}, {0x00001EDA, 0x0000031B}, {0x00001EDA, 0x00000301}, {0x00001EDB, 0x0000006F}, -{0x00001EDB, 0x0000031B}, {0x00001EDB, 0x00000301}, {0x00001EDC, 0x0000004F}, {0x00001EDC, 0x0000031B}, 
-{0x00001EDC, 0x00000300}, {0x00001EDD, 0x0000006F}, {0x00001EDD, 0x0000031B}, {0x00001EDD, 0x00000300}, -{0x00001EDE, 0x0000004F}, {0x00001EDE, 0x0000031B}, {0x00001EDE, 0x00000309}, {0x00001EDF, 0x0000006F}, -{0x00001EDF, 0x0000031B}, {0x00001EDF, 0x00000309}, {0x00001EE0, 0x0000004F}, {0x00001EE0, 0x0000031B}, -{0x00001EE0, 0x00000303}, {0x00001EE1, 0x0000006F}, {0x00001EE1, 0x0000031B}, {0x00001EE1, 0x00000303}, -{0x00001EE2, 0x0000004F}, {0x00001EE2, 0x0000031B}, {0x00001EE2, 0x00000323}, {0x00001EE3, 0x0000006F}, -{0x00001EE3, 0x0000031B}, {0x00001EE3, 0x00000323}, {0x00001EE4, 0x00000055}, {0x00001EE4, 0x00000323}, -{0x00001EE5, 0x00000075}, {0x00001EE5, 0x00000323}, {0x00001EE6, 0x00000055}, {0x00001EE6, 0x00000309}, -{0x00001EE7, 0x00000075}, {0x00001EE7, 0x00000309}, {0x00001EE8, 0x00000055}, {0x00001EE8, 0x0000031B}, -{0x00001EE8, 0x00000301}, {0x00001EE9, 0x00000075}, {0x00001EE9, 0x0000031B}, {0x00001EE9, 0x00000301}, -{0x00001EEA, 0x00000055}, {0x00001EEA, 0x0000031B}, {0x00001EEA, 0x00000300}, {0x00001EEB, 0x00000075}, -{0x00001EEB, 0x0000031B}, {0x00001EEB, 0x00000300}, {0x00001EEC, 0x00000055}, {0x00001EEC, 0x0000031B}, -{0x00001EEC, 0x00000309}, {0x00001EED, 0x00000075}, {0x00001EED, 0x0000031B}, {0x00001EED, 0x00000309}, -{0x00001EEE, 0x00000055}, {0x00001EEE, 0x0000031B}, {0x00001EEE, 0x00000303}, {0x00001EEF, 0x00000075}, -{0x00001EEF, 0x0000031B}, {0x00001EEF, 0x00000303}, {0x00001EF0, 0x00000055}, {0x00001EF0, 0x0000031B}, -{0x00001EF0, 0x00000323}, {0x00001EF1, 0x00000075}, {0x00001EF1, 0x0000031B}, {0x00001EF1, 0x00000323}, -{0x00001EF2, 0x00000059}, {0x00001EF2, 0x00000300}, {0x00001EF3, 0x00000079}, {0x00001EF3, 0x00000300}, -{0x00001EF4, 0x00000059}, {0x00001EF4, 0x00000323}, {0x00001EF5, 0x00000079}, {0x00001EF5, 0x00000323}, -{0x00001EF6, 0x00000059}, {0x00001EF6, 0x00000309}, {0x00001EF7, 0x00000079}, {0x00001EF7, 0x00000309}, -{0x00001EF8, 0x00000059}, {0x00001EF8, 0x00000303}, {0x00001EF9, 0x00000079}, {0x00001EF9, 0x00000303}, 
-{0x00001F00, 0x000003B1}, {0x00001F00, 0x00000313}, {0x00001F01, 0x000003B1}, {0x00001F01, 0x00000314}, -{0x00001F02, 0x000003B1}, {0x00001F02, 0x00000313}, {0x00001F02, 0x00000300}, {0x00001F03, 0x000003B1}, -{0x00001F03, 0x00000314}, {0x00001F03, 0x00000300}, {0x00001F04, 0x000003B1}, {0x00001F04, 0x00000313}, -{0x00001F04, 0x00000301}, {0x00001F05, 0x000003B1}, {0x00001F05, 0x00000314}, {0x00001F05, 0x00000301}, -{0x00001F06, 0x000003B1}, {0x00001F06, 0x00000313}, {0x00001F06, 0x00000342}, {0x00001F07, 0x000003B1}, -{0x00001F07, 0x00000314}, {0x00001F07, 0x00000342}, {0x00001F08, 0x00000391}, {0x00001F08, 0x00000313}, -{0x00001F09, 0x00000391}, {0x00001F09, 0x00000314}, {0x00001F0A, 0x00000391}, {0x00001F0A, 0x00000313}, -{0x00001F0A, 0x00000300}, {0x00001F0B, 0x00000391}, {0x00001F0B, 0x00000314}, {0x00001F0B, 0x00000300}, -{0x00001F0C, 0x00000391}, {0x00001F0C, 0x00000313}, {0x00001F0C, 0x00000301}, {0x00001F0D, 0x00000391}, -{0x00001F0D, 0x00000314}, {0x00001F0D, 0x00000301}, {0x00001F0E, 0x00000391}, {0x00001F0E, 0x00000313}, -{0x00001F0E, 0x00000342}, {0x00001F0F, 0x00000391}, {0x00001F0F, 0x00000314}, {0x00001F0F, 0x00000342}, -{0x00001F10, 0x000003B5}, {0x00001F10, 0x00000313}, {0x00001F11, 0x000003B5}, {0x00001F11, 0x00000314}, -{0x00001F12, 0x000003B5}, {0x00001F12, 0x00000313}, {0x00001F12, 0x00000300}, {0x00001F13, 0x000003B5}, -{0x00001F13, 0x00000314}, {0x00001F13, 0x00000300}, {0x00001F14, 0x000003B5}, {0x00001F14, 0x00000313}, -{0x00001F14, 0x00000301}, {0x00001F15, 0x000003B5}, {0x00001F15, 0x00000314}, {0x00001F15, 0x00000301}, -{0x00001F18, 0x00000395}, {0x00001F18, 0x00000313}, {0x00001F19, 0x00000395}, {0x00001F19, 0x00000314}, -{0x00001F1A, 0x00000395}, {0x00001F1A, 0x00000313}, {0x00001F1A, 0x00000300}, {0x00001F1B, 0x00000395}, -{0x00001F1B, 0x00000314}, {0x00001F1B, 0x00000300}, {0x00001F1C, 0x00000395}, {0x00001F1C, 0x00000313}, -{0x00001F1C, 0x00000301}, {0x00001F1D, 0x00000395}, {0x00001F1D, 0x00000314}, {0x00001F1D, 0x00000301}, 
-{0x00001F20, 0x000003B7}, {0x00001F20, 0x00000313}, {0x00001F21, 0x000003B7}, {0x00001F21, 0x00000314}, -{0x00001F22, 0x000003B7}, {0x00001F22, 0x00000313}, {0x00001F22, 0x00000300}, {0x00001F23, 0x000003B7}, -{0x00001F23, 0x00000314}, {0x00001F23, 0x00000300}, {0x00001F24, 0x000003B7}, {0x00001F24, 0x00000313}, -{0x00001F24, 0x00000301}, {0x00001F25, 0x000003B7}, {0x00001F25, 0x00000314}, {0x00001F25, 0x00000301}, -{0x00001F26, 0x000003B7}, {0x00001F26, 0x00000313}, {0x00001F26, 0x00000342}, {0x00001F27, 0x000003B7}, -{0x00001F27, 0x00000314}, {0x00001F27, 0x00000342}, {0x00001F28, 0x00000397}, {0x00001F28, 0x00000313}, -{0x00001F29, 0x00000397}, {0x00001F29, 0x00000314}, {0x00001F2A, 0x00000397}, {0x00001F2A, 0x00000313}, -{0x00001F2A, 0x00000300}, {0x00001F2B, 0x00000397}, {0x00001F2B, 0x00000314}, {0x00001F2B, 0x00000300}, -{0x00001F2C, 0x00000397}, {0x00001F2C, 0x00000313}, {0x00001F2C, 0x00000301}, {0x00001F2D, 0x00000397}, -{0x00001F2D, 0x00000314}, {0x00001F2D, 0x00000301}, {0x00001F2E, 0x00000397}, {0x00001F2E, 0x00000313}, -{0x00001F2E, 0x00000342}, {0x00001F2F, 0x00000397}, {0x00001F2F, 0x00000314}, {0x00001F2F, 0x00000342}, -{0x00001F30, 0x000003B9}, {0x00001F30, 0x00000313}, {0x00001F31, 0x000003B9}, {0x00001F31, 0x00000314}, -{0x00001F32, 0x000003B9}, {0x00001F32, 0x00000313}, {0x00001F32, 0x00000300}, {0x00001F33, 0x000003B9}, -{0x00001F33, 0x00000314}, {0x00001F33, 0x00000300}, {0x00001F34, 0x000003B9}, {0x00001F34, 0x00000313}, -{0x00001F34, 0x00000301}, {0x00001F35, 0x000003B9}, {0x00001F35, 0x00000314}, {0x00001F35, 0x00000301}, -{0x00001F36, 0x000003B9}, {0x00001F36, 0x00000313}, {0x00001F36, 0x00000342}, {0x00001F37, 0x000003B9}, -{0x00001F37, 0x00000314}, {0x00001F37, 0x00000342}, {0x00001F38, 0x00000399}, {0x00001F38, 0x00000313}, -{0x00001F39, 0x00000399}, {0x00001F39, 0x00000314}, {0x00001F3A, 0x00000399}, {0x00001F3A, 0x00000313}, -{0x00001F3A, 0x00000300}, {0x00001F3B, 0x00000399}, {0x00001F3B, 0x00000314}, {0x00001F3B, 0x00000300}, 
-{0x00001F3C, 0x00000399}, {0x00001F3C, 0x00000313}, {0x00001F3C, 0x00000301}, {0x00001F3D, 0x00000399}, -{0x00001F3D, 0x00000314}, {0x00001F3D, 0x00000301}, {0x00001F3E, 0x00000399}, {0x00001F3E, 0x00000313}, -{0x00001F3E, 0x00000342}, {0x00001F3F, 0x00000399}, {0x00001F3F, 0x00000314}, {0x00001F3F, 0x00000342}, -{0x00001F40, 0x000003BF}, {0x00001F40, 0x00000313}, {0x00001F41, 0x000003BF}, {0x00001F41, 0x00000314}, -{0x00001F42, 0x000003BF}, {0x00001F42, 0x00000313}, {0x00001F42, 0x00000300}, {0x00001F43, 0x000003BF}, -{0x00001F43, 0x00000314}, {0x00001F43, 0x00000300}, {0x00001F44, 0x000003BF}, {0x00001F44, 0x00000313}, -{0x00001F44, 0x00000301}, {0x00001F45, 0x000003BF}, {0x00001F45, 0x00000314}, {0x00001F45, 0x00000301}, -{0x00001F48, 0x0000039F}, {0x00001F48, 0x00000313}, {0x00001F49, 0x0000039F}, {0x00001F49, 0x00000314}, -{0x00001F4A, 0x0000039F}, {0x00001F4A, 0x00000313}, {0x00001F4A, 0x00000300}, {0x00001F4B, 0x0000039F}, -{0x00001F4B, 0x00000314}, {0x00001F4B, 0x00000300}, {0x00001F4C, 0x0000039F}, {0x00001F4C, 0x00000313}, -{0x00001F4C, 0x00000301}, {0x00001F4D, 0x0000039F}, {0x00001F4D, 0x00000314}, {0x00001F4D, 0x00000301}, -{0x00001F50, 0x000003C5}, {0x00001F50, 0x00000313}, {0x00001F51, 0x000003C5}, {0x00001F51, 0x00000314}, -{0x00001F52, 0x000003C5}, {0x00001F52, 0x00000313}, {0x00001F52, 0x00000300}, {0x00001F53, 0x000003C5}, -{0x00001F53, 0x00000314}, {0x00001F53, 0x00000300}, {0x00001F54, 0x000003C5}, {0x00001F54, 0x00000313}, -{0x00001F54, 0x00000301}, {0x00001F55, 0x000003C5}, {0x00001F55, 0x00000314}, {0x00001F55, 0x00000301}, -{0x00001F56, 0x000003C5}, {0x00001F56, 0x00000313}, {0x00001F56, 0x00000342}, {0x00001F57, 0x000003C5}, -{0x00001F57, 0x00000314}, {0x00001F57, 0x00000342}, {0x00001F59, 0x000003A5}, {0x00001F59, 0x00000314}, -{0x00001F5B, 0x000003A5}, {0x00001F5B, 0x00000314}, {0x00001F5B, 0x00000300}, {0x00001F5D, 0x000003A5}, -{0x00001F5D, 0x00000314}, {0x00001F5D, 0x00000301}, {0x00001F5F, 0x000003A5}, {0x00001F5F, 0x00000314}, 
-{0x00001F5F, 0x00000342}, {0x00001F60, 0x000003C9}, {0x00001F60, 0x00000313}, {0x00001F61, 0x000003C9}, -{0x00001F61, 0x00000314}, {0x00001F62, 0x000003C9}, {0x00001F62, 0x00000313}, {0x00001F62, 0x00000300}, -{0x00001F63, 0x000003C9}, {0x00001F63, 0x00000314}, {0x00001F63, 0x00000300}, {0x00001F64, 0x000003C9}, -{0x00001F64, 0x00000313}, {0x00001F64, 0x00000301}, {0x00001F65, 0x000003C9}, {0x00001F65, 0x00000314}, -{0x00001F65, 0x00000301}, {0x00001F66, 0x000003C9}, {0x00001F66, 0x00000313}, {0x00001F66, 0x00000342}, -{0x00001F67, 0x000003C9}, {0x00001F67, 0x00000314}, {0x00001F67, 0x00000342}, {0x00001F68, 0x000003A9}, -{0x00001F68, 0x00000313}, {0x00001F69, 0x000003A9}, {0x00001F69, 0x00000314}, {0x00001F6A, 0x000003A9}, -{0x00001F6A, 0x00000313}, {0x00001F6A, 0x00000300}, {0x00001F6B, 0x000003A9}, {0x00001F6B, 0x00000314}, -{0x00001F6B, 0x00000300}, {0x00001F6C, 0x000003A9}, {0x00001F6C, 0x00000313}, {0x00001F6C, 0x00000301}, -{0x00001F6D, 0x000003A9}, {0x00001F6D, 0x00000314}, {0x00001F6D, 0x00000301}, {0x00001F6E, 0x000003A9}, -{0x00001F6E, 0x00000313}, {0x00001F6E, 0x00000342}, {0x00001F6F, 0x000003A9}, {0x00001F6F, 0x00000314}, -{0x00001F6F, 0x00000342}, {0x00001F70, 0x000003B1}, {0x00001F70, 0x00000300}, {0x00001F71, 0x000003B1}, -{0x00001F71, 0x00000301}, {0x00001F72, 0x000003B5}, {0x00001F72, 0x00000300}, {0x00001F73, 0x000003B5}, -{0x00001F73, 0x00000301}, {0x00001F74, 0x000003B7}, {0x00001F74, 0x00000300}, {0x00001F75, 0x000003B7}, -{0x00001F75, 0x00000301}, {0x00001F76, 0x000003B9}, {0x00001F76, 0x00000300}, {0x00001F77, 0x000003B9}, -{0x00001F77, 0x00000301}, {0x00001F78, 0x000003BF}, {0x00001F78, 0x00000300}, {0x00001F79, 0x000003BF}, -{0x00001F79, 0x00000301}, {0x00001F7A, 0x000003C5}, {0x00001F7A, 0x00000300}, {0x00001F7B, 0x000003C5}, -{0x00001F7B, 0x00000301}, {0x00001F7C, 0x000003C9}, {0x00001F7C, 0x00000300}, {0x00001F7D, 0x000003C9}, -{0x00001F7D, 0x00000301}, {0x00001F80, 0x000003B1}, {0x00001F80, 0x00000313}, {0x00001F80, 0x00000345}, 
-{0x00001F81, 0x000003B1}, {0x00001F81, 0x00000314}, {0x00001F81, 0x00000345}, {0x00001F82, 0x000003B1}, -{0x00001F82, 0x00000313}, {0x00001F82, 0x00000300}, {0x00001F82, 0x00000345}, {0x00001F83, 0x000003B1}, -{0x00001F83, 0x00000314}, {0x00001F83, 0x00000300}, {0x00001F83, 0x00000345}, {0x00001F84, 0x000003B1}, -{0x00001F84, 0x00000313}, {0x00001F84, 0x00000301}, {0x00001F84, 0x00000345}, {0x00001F85, 0x000003B1}, -{0x00001F85, 0x00000314}, {0x00001F85, 0x00000301}, {0x00001F85, 0x00000345}, {0x00001F86, 0x000003B1}, -{0x00001F86, 0x00000313}, {0x00001F86, 0x00000342}, {0x00001F86, 0x00000345}, {0x00001F87, 0x000003B1}, -{0x00001F87, 0x00000314}, {0x00001F87, 0x00000342}, {0x00001F87, 0x00000345}, {0x00001F88, 0x00000391}, -{0x00001F88, 0x00000313}, {0x00001F88, 0x00000345}, {0x00001F89, 0x00000391}, {0x00001F89, 0x00000314}, -{0x00001F89, 0x00000345}, {0x00001F8A, 0x00000391}, {0x00001F8A, 0x00000313}, {0x00001F8A, 0x00000300}, -{0x00001F8A, 0x00000345}, {0x00001F8B, 0x00000391}, {0x00001F8B, 0x00000314}, {0x00001F8B, 0x00000300}, -{0x00001F8B, 0x00000345}, {0x00001F8C, 0x00000391}, {0x00001F8C, 0x00000313}, {0x00001F8C, 0x00000301}, -{0x00001F8C, 0x00000345}, {0x00001F8D, 0x00000391}, {0x00001F8D, 0x00000314}, {0x00001F8D, 0x00000301}, -{0x00001F8D, 0x00000345}, {0x00001F8E, 0x00000391}, {0x00001F8E, 0x00000313}, {0x00001F8E, 0x00000342}, -{0x00001F8E, 0x00000345}, {0x00001F8F, 0x00000391}, {0x00001F8F, 0x00000314}, {0x00001F8F, 0x00000342}, -{0x00001F8F, 0x00000345}, {0x00001F90, 0x000003B7}, {0x00001F90, 0x00000313}, {0x00001F90, 0x00000345}, -{0x00001F91, 0x000003B7}, {0x00001F91, 0x00000314}, {0x00001F91, 0x00000345}, {0x00001F92, 0x000003B7}, -{0x00001F92, 0x00000313}, {0x00001F92, 0x00000300}, {0x00001F92, 0x00000345}, {0x00001F93, 0x000003B7}, -{0x00001F93, 0x00000314}, {0x00001F93, 0x00000300}, {0x00001F93, 0x00000345}, {0x00001F94, 0x000003B7}, -{0x00001F94, 0x00000313}, {0x00001F94, 0x00000301}, {0x00001F94, 0x00000345}, {0x00001F95, 0x000003B7}, 
-{0x00001F95, 0x00000314}, {0x00001F95, 0x00000301}, {0x00001F95, 0x00000345}, {0x00001F96, 0x000003B7}, -{0x00001F96, 0x00000313}, {0x00001F96, 0x00000342}, {0x00001F96, 0x00000345}, {0x00001F97, 0x000003B7}, -{0x00001F97, 0x00000314}, {0x00001F97, 0x00000342}, {0x00001F97, 0x00000345}, {0x00001F98, 0x00000397}, -{0x00001F98, 0x00000313}, {0x00001F98, 0x00000345}, {0x00001F99, 0x00000397}, {0x00001F99, 0x00000314}, -{0x00001F99, 0x00000345}, {0x00001F9A, 0x00000397}, {0x00001F9A, 0x00000313}, {0x00001F9A, 0x00000300}, -{0x00001F9A, 0x00000345}, {0x00001F9B, 0x00000397}, {0x00001F9B, 0x00000314}, {0x00001F9B, 0x00000300}, -{0x00001F9B, 0x00000345}, {0x00001F9C, 0x00000397}, {0x00001F9C, 0x00000313}, {0x00001F9C, 0x00000301}, -{0x00001F9C, 0x00000345}, {0x00001F9D, 0x00000397}, {0x00001F9D, 0x00000314}, {0x00001F9D, 0x00000301}, -{0x00001F9D, 0x00000345}, {0x00001F9E, 0x00000397}, {0x00001F9E, 0x00000313}, {0x00001F9E, 0x00000342}, -{0x00001F9E, 0x00000345}, {0x00001F9F, 0x00000397}, {0x00001F9F, 0x00000314}, {0x00001F9F, 0x00000342}, -{0x00001F9F, 0x00000345}, {0x00001FA0, 0x000003C9}, {0x00001FA0, 0x00000313}, {0x00001FA0, 0x00000345}, -{0x00001FA1, 0x000003C9}, {0x00001FA1, 0x00000314}, {0x00001FA1, 0x00000345}, {0x00001FA2, 0x000003C9}, -{0x00001FA2, 0x00000313}, {0x00001FA2, 0x00000300}, {0x00001FA2, 0x00000345}, {0x00001FA3, 0x000003C9}, -{0x00001FA3, 0x00000314}, {0x00001FA3, 0x00000300}, {0x00001FA3, 0x00000345}, {0x00001FA4, 0x000003C9}, -{0x00001FA4, 0x00000313}, {0x00001FA4, 0x00000301}, {0x00001FA4, 0x00000345}, {0x00001FA5, 0x000003C9}, -{0x00001FA5, 0x00000314}, {0x00001FA5, 0x00000301}, {0x00001FA5, 0x00000345}, {0x00001FA6, 0x000003C9}, -{0x00001FA6, 0x00000313}, {0x00001FA6, 0x00000342}, {0x00001FA6, 0x00000345}, {0x00001FA7, 0x000003C9}, -{0x00001FA7, 0x00000314}, {0x00001FA7, 0x00000342}, {0x00001FA7, 0x00000345}, {0x00001FA8, 0x000003A9}, -{0x00001FA8, 0x00000313}, {0x00001FA8, 0x00000345}, {0x00001FA9, 0x000003A9}, {0x00001FA9, 0x00000314}, 
-{0x00001FA9, 0x00000345}, {0x00001FAA, 0x000003A9}, {0x00001FAA, 0x00000313}, {0x00001FAA, 0x00000300}, -{0x00001FAA, 0x00000345}, {0x00001FAB, 0x000003A9}, {0x00001FAB, 0x00000314}, {0x00001FAB, 0x00000300}, -{0x00001FAB, 0x00000345}, {0x00001FAC, 0x000003A9}, {0x00001FAC, 0x00000313}, {0x00001FAC, 0x00000301}, -{0x00001FAC, 0x00000345}, {0x00001FAD, 0x000003A9}, {0x00001FAD, 0x00000314}, {0x00001FAD, 0x00000301}, -{0x00001FAD, 0x00000345}, {0x00001FAE, 0x000003A9}, {0x00001FAE, 0x00000313}, {0x00001FAE, 0x00000342}, -{0x00001FAE, 0x00000345}, {0x00001FAF, 0x000003A9}, {0x00001FAF, 0x00000314}, {0x00001FAF, 0x00000342}, -{0x00001FAF, 0x00000345}, {0x00001FB0, 0x000003B1}, {0x00001FB0, 0x00000306}, {0x00001FB1, 0x000003B1}, -{0x00001FB1, 0x00000304}, {0x00001FB2, 0x000003B1}, {0x00001FB2, 0x00000300}, {0x00001FB2, 0x00000345}, -{0x00001FB3, 0x000003B1}, {0x00001FB3, 0x00000345}, {0x00001FB4, 0x000003B1}, {0x00001FB4, 0x00000301}, -{0x00001FB4, 0x00000345}, {0x00001FB6, 0x000003B1}, {0x00001FB6, 0x00000342}, {0x00001FB7, 0x000003B1}, -{0x00001FB7, 0x00000342}, {0x00001FB7, 0x00000345}, {0x00001FB8, 0x00000391}, {0x00001FB8, 0x00000306}, -{0x00001FB9, 0x00000391}, {0x00001FB9, 0x00000304}, {0x00001FBA, 0x00000391}, {0x00001FBA, 0x00000300}, -{0x00001FBB, 0x00000391}, {0x00001FBB, 0x00000301}, {0x00001FBC, 0x00000391}, {0x00001FBC, 0x00000345}, -{0x00001FBE, 0x000003B9}, {0x00001FC1, 0x000000A8}, {0x00001FC1, 0x00000342}, {0x00001FC2, 0x000003B7}, -{0x00001FC2, 0x00000300}, {0x00001FC2, 0x00000345}, {0x00001FC3, 0x000003B7}, {0x00001FC3, 0x00000345}, -{0x00001FC4, 0x000003B7}, {0x00001FC4, 0x00000301}, {0x00001FC4, 0x00000345}, {0x00001FC6, 0x000003B7}, -{0x00001FC6, 0x00000342}, {0x00001FC7, 0x000003B7}, {0x00001FC7, 0x00000342}, {0x00001FC7, 0x00000345}, -{0x00001FC8, 0x00000395}, {0x00001FC8, 0x00000300}, {0x00001FC9, 0x00000395}, {0x00001FC9, 0x00000301}, -{0x00001FCA, 0x00000397}, {0x00001FCA, 0x00000300}, {0x00001FCB, 0x00000397}, {0x00001FCB, 0x00000301}, 
-{0x00001FCC, 0x00000397}, {0x00001FCC, 0x00000345}, {0x00001FCD, 0x00001FBF}, {0x00001FCD, 0x00000300}, -{0x00001FCE, 0x00001FBF}, {0x00001FCE, 0x00000301}, {0x00001FCF, 0x00001FBF}, {0x00001FCF, 0x00000342}, -{0x00001FD0, 0x000003B9}, {0x00001FD0, 0x00000306}, {0x00001FD1, 0x000003B9}, {0x00001FD1, 0x00000304}, -{0x00001FD2, 0x000003B9}, {0x00001FD2, 0x00000308}, {0x00001FD2, 0x00000300}, {0x00001FD3, 0x000003B9}, -{0x00001FD3, 0x00000308}, {0x00001FD3, 0x00000301}, {0x00001FD6, 0x000003B9}, {0x00001FD6, 0x00000342}, -{0x00001FD7, 0x000003B9}, {0x00001FD7, 0x00000308}, {0x00001FD7, 0x00000342}, {0x00001FD8, 0x00000399}, -{0x00001FD8, 0x00000306}, {0x00001FD9, 0x00000399}, {0x00001FD9, 0x00000304}, {0x00001FDA, 0x00000399}, -{0x00001FDA, 0x00000300}, {0x00001FDB, 0x00000399}, {0x00001FDB, 0x00000301}, {0x00001FDD, 0x00001FFE}, -{0x00001FDD, 0x00000300}, {0x00001FDE, 0x00001FFE}, {0x00001FDE, 0x00000301}, {0x00001FDF, 0x00001FFE}, -{0x00001FDF, 0x00000342}, {0x00001FE0, 0x000003C5}, {0x00001FE0, 0x00000306}, {0x00001FE1, 0x000003C5}, -{0x00001FE1, 0x00000304}, {0x00001FE2, 0x000003C5}, {0x00001FE2, 0x00000308}, {0x00001FE2, 0x00000300}, -{0x00001FE3, 0x000003C5}, {0x00001FE3, 0x00000308}, {0x00001FE3, 0x00000301}, {0x00001FE4, 0x000003C1}, -{0x00001FE4, 0x00000313}, {0x00001FE5, 0x000003C1}, {0x00001FE5, 0x00000314}, {0x00001FE6, 0x000003C5}, -{0x00001FE6, 0x00000342}, {0x00001FE7, 0x000003C5}, {0x00001FE7, 0x00000308}, {0x00001FE7, 0x00000342}, -{0x00001FE8, 0x000003A5}, {0x00001FE8, 0x00000306}, {0x00001FE9, 0x000003A5}, {0x00001FE9, 0x00000304}, -{0x00001FEA, 0x000003A5}, {0x00001FEA, 0x00000300}, {0x00001FEB, 0x000003A5}, {0x00001FEB, 0x00000301}, -{0x00001FEC, 0x000003A1}, {0x00001FEC, 0x00000314}, {0x00001FED, 0x000000A8}, {0x00001FED, 0x00000300}, -{0x00001FEE, 0x000000A8}, {0x00001FEE, 0x00000301}, {0x00001FEF, 0x00000060}, {0x00001FF2, 0x000003C9}, -{0x00001FF2, 0x00000300}, {0x00001FF2, 0x00000345}, {0x00001FF3, 0x000003C9}, {0x00001FF3, 0x00000345}, 
-{0x00001FF4, 0x000003C9}, {0x00001FF4, 0x00000301}, {0x00001FF4, 0x00000345}, {0x00001FF6, 0x000003C9}, -{0x00001FF6, 0x00000342}, {0x00001FF7, 0x000003C9}, {0x00001FF7, 0x00000342}, {0x00001FF7, 0x00000345}, -{0x00001FF8, 0x0000039F}, {0x00001FF8, 0x00000300}, {0x00001FF9, 0x0000039F}, {0x00001FF9, 0x00000301}, -{0x00001FFA, 0x000003A9}, {0x00001FFA, 0x00000300}, {0x00001FFB, 0x000003A9}, {0x00001FFB, 0x00000301}, -{0x00001FFC, 0x000003A9}, {0x00001FFC, 0x00000345}, {0x00001FFD, 0x000000B4}, {0x00002000, 0x00002002}, -{0x00002001, 0x00002003}, {0x00002126, 0x000003A9}, {0x0000212A, 0x0000004B}, {0x0000212B, 0x00000041}, -{0x0000212B, 0x0000030A}, {0x0000219A, 0x00002190}, {0x0000219A, 0x00000338}, {0x0000219B, 0x00002192}, -{0x0000219B, 0x00000338}, {0x000021AE, 0x00002194}, {0x000021AE, 0x00000338}, {0x000021CD, 0x000021D0}, -{0x000021CD, 0x00000338}, {0x000021CE, 0x000021D4}, {0x000021CE, 0x00000338}, {0x000021CF, 0x000021D2}, -{0x000021CF, 0x00000338}, {0x00002204, 0x00002203}, {0x00002204, 0x00000338}, {0x00002209, 0x00002208}, -{0x00002209, 0x00000338}, {0x0000220C, 0x0000220B}, {0x0000220C, 0x00000338}, {0x00002224, 0x00002223}, -{0x00002224, 0x00000338}, {0x00002226, 0x00002225}, {0x00002226, 0x00000338}, {0x00002241, 0x0000223C}, -{0x00002241, 0x00000338}, {0x00002244, 0x00002243}, {0x00002244, 0x00000338}, {0x00002247, 0x00002245}, -{0x00002247, 0x00000338}, {0x00002249, 0x00002248}, {0x00002249, 0x00000338}, {0x00002260, 0x0000003D}, -{0x00002260, 0x00000338}, {0x00002262, 0x00002261}, {0x00002262, 0x00000338}, {0x0000226D, 0x0000224D}, -{0x0000226D, 0x00000338}, {0x0000226E, 0x0000003C}, {0x0000226E, 0x00000338}, {0x0000226F, 0x0000003E}, -{0x0000226F, 0x00000338}, {0x00002270, 0x00002264}, {0x00002270, 0x00000338}, {0x00002271, 0x00002265}, -{0x00002271, 0x00000338}, {0x00002274, 0x00002272}, {0x00002274, 0x00000338}, {0x00002275, 0x00002273}, -{0x00002275, 0x00000338}, {0x00002278, 0x00002276}, {0x00002278, 0x00000338}, {0x00002279, 0x00002277}, 
-{0x00002279, 0x00000338}, {0x00002280, 0x0000227A}, {0x00002280, 0x00000338}, {0x00002281, 0x0000227B}, -{0x00002281, 0x00000338}, {0x00002284, 0x00002282}, {0x00002284, 0x00000338}, {0x00002285, 0x00002283}, -{0x00002285, 0x00000338}, {0x00002288, 0x00002286}, {0x00002288, 0x00000338}, {0x00002289, 0x00002287}, -{0x00002289, 0x00000338}, {0x000022AC, 0x000022A2}, {0x000022AC, 0x00000338}, {0x000022AD, 0x000022A8}, -{0x000022AD, 0x00000338}, {0x000022AE, 0x000022A9}, {0x000022AE, 0x00000338}, {0x000022AF, 0x000022AB}, -{0x000022AF, 0x00000338}, {0x000022E0, 0x0000227C}, {0x000022E0, 0x00000338}, {0x000022E1, 0x0000227D}, -{0x000022E1, 0x00000338}, {0x000022E2, 0x00002291}, {0x000022E2, 0x00000338}, {0x000022E3, 0x00002292}, -{0x000022E3, 0x00000338}, {0x000022EA, 0x000022B2}, {0x000022EA, 0x00000338}, {0x000022EB, 0x000022B3}, -{0x000022EB, 0x00000338}, {0x000022EC, 0x000022B4}, {0x000022EC, 0x00000338}, {0x000022ED, 0x000022B5}, -{0x000022ED, 0x00000338}, {0x00002329, 0x00003008}, {0x0000232A, 0x00003009}, {0x00002ADC, 0x00002ADD}, -{0x00002ADC, 0x00000338}, {0x0000304C, 0x0000304B}, {0x0000304C, 0x00003099}, {0x0000304E, 0x0000304D}, -{0x0000304E, 0x00003099}, {0x00003050, 0x0000304F}, {0x00003050, 0x00003099}, {0x00003052, 0x00003051}, -{0x00003052, 0x00003099}, {0x00003054, 0x00003053}, {0x00003054, 0x00003099}, {0x00003056, 0x00003055}, -{0x00003056, 0x00003099}, {0x00003058, 0x00003057}, {0x00003058, 0x00003099}, {0x0000305A, 0x00003059}, -{0x0000305A, 0x00003099}, {0x0000305C, 0x0000305B}, {0x0000305C, 0x00003099}, {0x0000305E, 0x0000305D}, -{0x0000305E, 0x00003099}, {0x00003060, 0x0000305F}, {0x00003060, 0x00003099}, {0x00003062, 0x00003061}, -{0x00003062, 0x00003099}, {0x00003065, 0x00003064}, {0x00003065, 0x00003099}, {0x00003067, 0x00003066}, -{0x00003067, 0x00003099}, {0x00003069, 0x00003068}, {0x00003069, 0x00003099}, {0x00003070, 0x0000306F}, -{0x00003070, 0x00003099}, {0x00003071, 0x0000306F}, {0x00003071, 0x0000309A}, {0x00003073, 0x00003072}, 
-{0x00003073, 0x00003099}, {0x00003074, 0x00003072}, {0x00003074, 0x0000309A}, {0x00003076, 0x00003075}, -{0x00003076, 0x00003099}, {0x00003077, 0x00003075}, {0x00003077, 0x0000309A}, {0x00003079, 0x00003078}, -{0x00003079, 0x00003099}, {0x0000307A, 0x00003078}, {0x0000307A, 0x0000309A}, {0x0000307C, 0x0000307B}, -{0x0000307C, 0x00003099}, {0x0000307D, 0x0000307B}, {0x0000307D, 0x0000309A}, {0x00003094, 0x00003046}, -{0x00003094, 0x00003099}, {0x0000309E, 0x0000309D}, {0x0000309E, 0x00003099}, {0x000030AC, 0x000030AB}, -{0x000030AC, 0x00003099}, {0x000030AE, 0x000030AD}, {0x000030AE, 0x00003099}, {0x000030B0, 0x000030AF}, -{0x000030B0, 0x00003099}, {0x000030B2, 0x000030B1}, {0x000030B2, 0x00003099}, {0x000030B4, 0x000030B3}, -{0x000030B4, 0x00003099}, {0x000030B6, 0x000030B5}, {0x000030B6, 0x00003099}, {0x000030B8, 0x000030B7}, -{0x000030B8, 0x00003099}, {0x000030BA, 0x000030B9}, {0x000030BA, 0x00003099}, {0x000030BC, 0x000030BB}, -{0x000030BC, 0x00003099}, {0x000030BE, 0x000030BD}, {0x000030BE, 0x00003099}, {0x000030C0, 0x000030BF}, -{0x000030C0, 0x00003099}, {0x000030C2, 0x000030C1}, {0x000030C2, 0x00003099}, {0x000030C5, 0x000030C4}, -{0x000030C5, 0x00003099}, {0x000030C7, 0x000030C6}, {0x000030C7, 0x00003099}, {0x000030C9, 0x000030C8}, -{0x000030C9, 0x00003099}, {0x000030D0, 0x000030CF}, {0x000030D0, 0x00003099}, {0x000030D1, 0x000030CF}, -{0x000030D1, 0x0000309A}, {0x000030D3, 0x000030D2}, {0x000030D3, 0x00003099}, {0x000030D4, 0x000030D2}, -{0x000030D4, 0x0000309A}, {0x000030D6, 0x000030D5}, {0x000030D6, 0x00003099}, {0x000030D7, 0x000030D5}, -{0x000030D7, 0x0000309A}, {0x000030D9, 0x000030D8}, {0x000030D9, 0x00003099}, {0x000030DA, 0x000030D8}, -{0x000030DA, 0x0000309A}, {0x000030DC, 0x000030DB}, {0x000030DC, 0x00003099}, {0x000030DD, 0x000030DB}, -{0x000030DD, 0x0000309A}, {0x000030F4, 0x000030A6}, {0x000030F4, 0x00003099}, {0x000030F7, 0x000030EF}, -{0x000030F7, 0x00003099}, {0x000030F8, 0x000030F0}, {0x000030F8, 0x00003099}, {0x000030F9, 0x000030F1}, 
-{0x000030F9, 0x00003099}, {0x000030FA, 0x000030F2}, {0x000030FA, 0x00003099}, {0x000030FE, 0x000030FD}, -{0x000030FE, 0x00003099}, {0x0000F900, 0x00008C48}, {0x0000F901, 0x000066F4}, {0x0000F902, 0x00008ECA}, -{0x0000F903, 0x00008CC8}, {0x0000F904, 0x00006ED1}, {0x0000F905, 0x00004E32}, {0x0000F906, 0x000053E5}, -{0x0000F907, 0x00009F9C}, {0x0000F908, 0x00009F9C}, {0x0000F909, 0x00005951}, {0x0000F90A, 0x000091D1}, -{0x0000F90B, 0x00005587}, {0x0000F90C, 0x00005948}, {0x0000F90D, 0x000061F6}, {0x0000F90E, 0x00007669}, -{0x0000F90F, 0x00007F85}, {0x0000F910, 0x0000863F}, {0x0000F911, 0x000087BA}, {0x0000F912, 0x000088F8}, -{0x0000F913, 0x0000908F}, {0x0000F914, 0x00006A02}, {0x0000F915, 0x00006D1B}, {0x0000F916, 0x000070D9}, -{0x0000F917, 0x000073DE}, {0x0000F918, 0x0000843D}, {0x0000F919, 0x0000916A}, {0x0000F91A, 0x000099F1}, -{0x0000F91B, 0x00004E82}, {0x0000F91C, 0x00005375}, {0x0000F91D, 0x00006B04}, {0x0000F91E, 0x0000721B}, -{0x0000F91F, 0x0000862D}, {0x0000F920, 0x00009E1E}, {0x0000F921, 0x00005D50}, {0x0000F922, 0x00006FEB}, -{0x0000F923, 0x000085CD}, {0x0000F924, 0x00008964}, {0x0000F925, 0x000062C9}, {0x0000F926, 0x000081D8}, -{0x0000F927, 0x0000881F}, {0x0000F928, 0x00005ECA}, {0x0000F929, 0x00006717}, {0x0000F92A, 0x00006D6A}, -{0x0000F92B, 0x000072FC}, {0x0000F92C, 0x000090CE}, {0x0000F92D, 0x00004F86}, {0x0000F92E, 0x000051B7}, -{0x0000F92F, 0x000052DE}, {0x0000F930, 0x000064C4}, {0x0000F931, 0x00006AD3}, {0x0000F932, 0x00007210}, -{0x0000F933, 0x000076E7}, {0x0000F934, 0x00008001}, {0x0000F935, 0x00008606}, {0x0000F936, 0x0000865C}, -{0x0000F937, 0x00008DEF}, {0x0000F938, 0x00009732}, {0x0000F939, 0x00009B6F}, {0x0000F93A, 0x00009DFA}, -{0x0000F93B, 0x0000788C}, {0x0000F93C, 0x0000797F}, {0x0000F93D, 0x00007DA0}, {0x0000F93E, 0x000083C9}, -{0x0000F93F, 0x00009304}, {0x0000F940, 0x00009E7F}, {0x0000F941, 0x00008AD6}, {0x0000F942, 0x000058DF}, -{0x0000F943, 0x00005F04}, {0x0000F944, 0x00007C60}, {0x0000F945, 0x0000807E}, {0x0000F946, 0x00007262}, 
-{0x0000F947, 0x000078CA}, {0x0000F948, 0x00008CC2}, {0x0000F949, 0x000096F7}, {0x0000F94A, 0x000058D8}, -{0x0000F94B, 0x00005C62}, {0x0000F94C, 0x00006A13}, {0x0000F94D, 0x00006DDA}, {0x0000F94E, 0x00006F0F}, -{0x0000F94F, 0x00007D2F}, {0x0000F950, 0x00007E37}, {0x0000F951, 0x0000964B}, {0x0000F952, 0x000052D2}, -{0x0000F953, 0x0000808B}, {0x0000F954, 0x000051DC}, {0x0000F955, 0x000051CC}, {0x0000F956, 0x00007A1C}, -{0x0000F957, 0x00007DBE}, {0x0000F958, 0x000083F1}, {0x0000F959, 0x00009675}, {0x0000F95A, 0x00008B80}, -{0x0000F95B, 0x000062CF}, {0x0000F95C, 0x00006A02}, {0x0000F95D, 0x00008AFE}, {0x0000F95E, 0x00004E39}, -{0x0000F95F, 0x00005BE7}, {0x0000F960, 0x00006012}, {0x0000F961, 0x00007387}, {0x0000F962, 0x00007570}, -{0x0000F963, 0x00005317}, {0x0000F964, 0x000078FB}, {0x0000F965, 0x00004FBF}, {0x0000F966, 0x00005FA9}, -{0x0000F967, 0x00004E0D}, {0x0000F968, 0x00006CCC}, {0x0000F969, 0x00006578}, {0x0000F96A, 0x00007D22}, -{0x0000F96B, 0x000053C3}, {0x0000F96C, 0x0000585E}, {0x0000F96D, 0x00007701}, {0x0000F96E, 0x00008449}, -{0x0000F96F, 0x00008AAA}, {0x0000F970, 0x00006BBA}, {0x0000F971, 0x00008FB0}, {0x0000F972, 0x00006C88}, -{0x0000F973, 0x000062FE}, {0x0000F974, 0x000082E5}, {0x0000F975, 0x000063A0}, {0x0000F976, 0x00007565}, -{0x0000F977, 0x00004EAE}, {0x0000F978, 0x00005169}, {0x0000F979, 0x000051C9}, {0x0000F97A, 0x00006881}, -{0x0000F97B, 0x00007CE7}, {0x0000F97C, 0x0000826F}, {0x0000F97D, 0x00008AD2}, {0x0000F97E, 0x000091CF}, -{0x0000F97F, 0x000052F5}, {0x0000F980, 0x00005442}, {0x0000F981, 0x00005973}, {0x0000F982, 0x00005EEC}, -{0x0000F983, 0x000065C5}, {0x0000F984, 0x00006FFE}, {0x0000F985, 0x0000792A}, {0x0000F986, 0x000095AD}, -{0x0000F987, 0x00009A6A}, {0x0000F988, 0x00009E97}, {0x0000F989, 0x00009ECE}, {0x0000F98A, 0x0000529B}, -{0x0000F98B, 0x000066C6}, {0x0000F98C, 0x00006B77}, {0x0000F98D, 0x00008F62}, {0x0000F98E, 0x00005E74}, -{0x0000F98F, 0x00006190}, {0x0000F990, 0x00006200}, {0x0000F991, 0x0000649A}, {0x0000F992, 0x00006F23}, 
-{0x0000F993, 0x00007149}, {0x0000F994, 0x00007489}, {0x0000F995, 0x000079CA}, {0x0000F996, 0x00007DF4}, -{0x0000F997, 0x0000806F}, {0x0000F998, 0x00008F26}, {0x0000F999, 0x000084EE}, {0x0000F99A, 0x00009023}, -{0x0000F99B, 0x0000934A}, {0x0000F99C, 0x00005217}, {0x0000F99D, 0x000052A3}, {0x0000F99E, 0x000054BD}, -{0x0000F99F, 0x000070C8}, {0x0000F9A0, 0x000088C2}, {0x0000F9A1, 0x00008AAA}, {0x0000F9A2, 0x00005EC9}, -{0x0000F9A3, 0x00005FF5}, {0x0000F9A4, 0x0000637B}, {0x0000F9A5, 0x00006BAE}, {0x0000F9A6, 0x00007C3E}, -{0x0000F9A7, 0x00007375}, {0x0000F9A8, 0x00004EE4}, {0x0000F9A9, 0x000056F9}, {0x0000F9AA, 0x00005BE7}, -{0x0000F9AB, 0x00005DBA}, {0x0000F9AC, 0x0000601C}, {0x0000F9AD, 0x000073B2}, {0x0000F9AE, 0x00007469}, -{0x0000F9AF, 0x00007F9A}, {0x0000F9B0, 0x00008046}, {0x0000F9B1, 0x00009234}, {0x0000F9B2, 0x000096F6}, -{0x0000F9B3, 0x00009748}, {0x0000F9B4, 0x00009818}, {0x0000F9B5, 0x00004F8B}, {0x0000F9B6, 0x000079AE}, -{0x0000F9B7, 0x000091B4}, {0x0000F9B8, 0x000096B8}, {0x0000F9B9, 0x000060E1}, {0x0000F9BA, 0x00004E86}, -{0x0000F9BB, 0x000050DA}, {0x0000F9BC, 0x00005BEE}, {0x0000F9BD, 0x00005C3F}, {0x0000F9BE, 0x00006599}, -{0x0000F9BF, 0x00006A02}, {0x0000F9C0, 0x000071CE}, {0x0000F9C1, 0x00007642}, {0x0000F9C2, 0x000084FC}, -{0x0000F9C3, 0x0000907C}, {0x0000F9C4, 0x00009F8D}, {0x0000F9C5, 0x00006688}, {0x0000F9C6, 0x0000962E}, -{0x0000F9C7, 0x00005289}, {0x0000F9C8, 0x0000677B}, {0x0000F9C9, 0x000067F3}, {0x0000F9CA, 0x00006D41}, -{0x0000F9CB, 0x00006E9C}, {0x0000F9CC, 0x00007409}, {0x0000F9CD, 0x00007559}, {0x0000F9CE, 0x0000786B}, -{0x0000F9CF, 0x00007D10}, {0x0000F9D0, 0x0000985E}, {0x0000F9D1, 0x0000516D}, {0x0000F9D2, 0x0000622E}, -{0x0000F9D3, 0x00009678}, {0x0000F9D4, 0x0000502B}, {0x0000F9D5, 0x00005D19}, {0x0000F9D6, 0x00006DEA}, -{0x0000F9D7, 0x00008F2A}, {0x0000F9D8, 0x00005F8B}, {0x0000F9D9, 0x00006144}, {0x0000F9DA, 0x00006817}, -{0x0000F9DB, 0x00007387}, {0x0000F9DC, 0x00009686}, {0x0000F9DD, 0x00005229}, {0x0000F9DE, 0x0000540F}, 
-{0x0000F9DF, 0x00005C65}, {0x0000F9E0, 0x00006613}, {0x0000F9E1, 0x0000674E}, {0x0000F9E2, 0x000068A8}, -{0x0000F9E3, 0x00006CE5}, {0x0000F9E4, 0x00007406}, {0x0000F9E5, 0x000075E2}, {0x0000F9E6, 0x00007F79}, -{0x0000F9E7, 0x000088CF}, {0x0000F9E8, 0x000088E1}, {0x0000F9E9, 0x000091CC}, {0x0000F9EA, 0x000096E2}, -{0x0000F9EB, 0x0000533F}, {0x0000F9EC, 0x00006EBA}, {0x0000F9ED, 0x0000541D}, {0x0000F9EE, 0x000071D0}, -{0x0000F9EF, 0x00007498}, {0x0000F9F0, 0x000085FA}, {0x0000F9F1, 0x000096A3}, {0x0000F9F2, 0x00009C57}, -{0x0000F9F3, 0x00009E9F}, {0x0000F9F4, 0x00006797}, {0x0000F9F5, 0x00006DCB}, {0x0000F9F6, 0x000081E8}, -{0x0000F9F7, 0x00007ACB}, {0x0000F9F8, 0x00007B20}, {0x0000F9F9, 0x00007C92}, {0x0000F9FA, 0x000072C0}, -{0x0000F9FB, 0x00007099}, {0x0000F9FC, 0x00008B58}, {0x0000F9FD, 0x00004EC0}, {0x0000F9FE, 0x00008336}, -{0x0000F9FF, 0x0000523A}, {0x0000FA00, 0x00005207}, {0x0000FA01, 0x00005EA6}, {0x0000FA02, 0x000062D3}, -{0x0000FA03, 0x00007CD6}, {0x0000FA04, 0x00005B85}, {0x0000FA05, 0x00006D1E}, {0x0000FA06, 0x000066B4}, -{0x0000FA07, 0x00008F3B}, {0x0000FA08, 0x0000884C}, {0x0000FA09, 0x0000964D}, {0x0000FA0A, 0x0000898B}, -{0x0000FA0B, 0x00005ED3}, {0x0000FA0C, 0x00005140}, {0x0000FA0D, 0x000055C0}, {0x0000FA10, 0x0000585A}, -{0x0000FA12, 0x00006674}, {0x0000FA15, 0x000051DE}, {0x0000FA16, 0x0000732A}, {0x0000FA17, 0x000076CA}, -{0x0000FA18, 0x0000793C}, {0x0000FA19, 0x0000795E}, {0x0000FA1A, 0x00007965}, {0x0000FA1B, 0x0000798F}, -{0x0000FA1C, 0x00009756}, {0x0000FA1D, 0x00007CBE}, {0x0000FA1E, 0x00007FBD}, {0x0000FA20, 0x00008612}, -{0x0000FA22, 0x00008AF8}, {0x0000FA25, 0x00009038}, {0x0000FA26, 0x000090FD}, {0x0000FA2A, 0x000098EF}, -{0x0000FA2B, 0x000098FC}, {0x0000FA2C, 0x00009928}, {0x0000FA2D, 0x00009DB4}, {0x0000FA2E, 0x000090DE}, -{0x0000FA2F, 0x000096B7}, {0x0000FA30, 0x00004FAE}, {0x0000FA31, 0x000050E7}, {0x0000FA32, 0x0000514D}, -{0x0000FA33, 0x000052C9}, {0x0000FA34, 0x000052E4}, {0x0000FA35, 0x00005351}, {0x0000FA36, 0x0000559D}, 
-{0x0000FA37, 0x00005606}, {0x0000FA38, 0x00005668}, {0x0000FA39, 0x00005840}, {0x0000FA3A, 0x000058A8}, -{0x0000FA3B, 0x00005C64}, {0x0000FA3C, 0x00005C6E}, {0x0000FA3D, 0x00006094}, {0x0000FA3E, 0x00006168}, -{0x0000FA3F, 0x0000618E}, {0x0000FA40, 0x000061F2}, {0x0000FA41, 0x0000654F}, {0x0000FA42, 0x000065E2}, -{0x0000FA43, 0x00006691}, {0x0000FA44, 0x00006885}, {0x0000FA45, 0x00006D77}, {0x0000FA46, 0x00006E1A}, -{0x0000FA47, 0x00006F22}, {0x0000FA48, 0x0000716E}, {0x0000FA49, 0x0000722B}, {0x0000FA4A, 0x00007422}, -{0x0000FA4B, 0x00007891}, {0x0000FA4C, 0x0000793E}, {0x0000FA4D, 0x00007949}, {0x0000FA4E, 0x00007948}, -{0x0000FA4F, 0x00007950}, {0x0000FA50, 0x00007956}, {0x0000FA51, 0x0000795D}, {0x0000FA52, 0x0000798D}, -{0x0000FA53, 0x0000798E}, {0x0000FA54, 0x00007A40}, {0x0000FA55, 0x00007A81}, {0x0000FA56, 0x00007BC0}, -{0x0000FA57, 0x00007DF4}, {0x0000FA58, 0x00007E09}, {0x0000FA59, 0x00007E41}, {0x0000FA5A, 0x00007F72}, -{0x0000FA5B, 0x00008005}, {0x0000FA5C, 0x000081ED}, {0x0000FA5D, 0x00008279}, {0x0000FA5E, 0x00008279}, -{0x0000FA5F, 0x00008457}, {0x0000FA60, 0x00008910}, {0x0000FA61, 0x00008996}, {0x0000FA62, 0x00008B01}, -{0x0000FA63, 0x00008B39}, {0x0000FA64, 0x00008CD3}, {0x0000FA65, 0x00008D08}, {0x0000FA66, 0x00008FB6}, -{0x0000FA67, 0x00009038}, {0x0000FA68, 0x000096E3}, {0x0000FA69, 0x000097FF}, {0x0000FA6A, 0x0000983B}, -{0x0000FA6B, 0x00006075}, {0x0000FA6C, 0x000242EE}, {0x0000FA6D, 0x00008218}, {0x0000FA70, 0x00004E26}, -{0x0000FA71, 0x000051B5}, {0x0000FA72, 0x00005168}, {0x0000FA73, 0x00004F80}, {0x0000FA74, 0x00005145}, -{0x0000FA75, 0x00005180}, {0x0000FA76, 0x000052C7}, {0x0000FA77, 0x000052FA}, {0x0000FA78, 0x0000559D}, -{0x0000FA79, 0x00005555}, {0x0000FA7A, 0x00005599}, {0x0000FA7B, 0x000055E2}, {0x0000FA7C, 0x0000585A}, -{0x0000FA7D, 0x000058B3}, {0x0000FA7E, 0x00005944}, {0x0000FA7F, 0x00005954}, {0x0000FA80, 0x00005A62}, -{0x0000FA81, 0x00005B28}, {0x0000FA82, 0x00005ED2}, {0x0000FA83, 0x00005ED9}, {0x0000FA84, 0x00005F69}, 
-{0x0000FA85, 0x00005FAD}, {0x0000FA86, 0x000060D8}, {0x0000FA87, 0x0000614E}, {0x0000FA88, 0x00006108}, -{0x0000FA89, 0x0000618E}, {0x0000FA8A, 0x00006160}, {0x0000FA8B, 0x000061F2}, {0x0000FA8C, 0x00006234}, -{0x0000FA8D, 0x000063C4}, {0x0000FA8E, 0x0000641C}, {0x0000FA8F, 0x00006452}, {0x0000FA90, 0x00006556}, -{0x0000FA91, 0x00006674}, {0x0000FA92, 0x00006717}, {0x0000FA93, 0x0000671B}, {0x0000FA94, 0x00006756}, -{0x0000FA95, 0x00006B79}, {0x0000FA96, 0x00006BBA}, {0x0000FA97, 0x00006D41}, {0x0000FA98, 0x00006EDB}, -{0x0000FA99, 0x00006ECB}, {0x0000FA9A, 0x00006F22}, {0x0000FA9B, 0x0000701E}, {0x0000FA9C, 0x0000716E}, -{0x0000FA9D, 0x000077A7}, {0x0000FA9E, 0x00007235}, {0x0000FA9F, 0x000072AF}, {0x0000FAA0, 0x0000732A}, -{0x0000FAA1, 0x00007471}, {0x0000FAA2, 0x00007506}, {0x0000FAA3, 0x0000753B}, {0x0000FAA4, 0x0000761D}, -{0x0000FAA5, 0x0000761F}, {0x0000FAA6, 0x000076CA}, {0x0000FAA7, 0x000076DB}, {0x0000FAA8, 0x000076F4}, -{0x0000FAA9, 0x0000774A}, {0x0000FAAA, 0x00007740}, {0x0000FAAB, 0x000078CC}, {0x0000FAAC, 0x00007AB1}, -{0x0000FAAD, 0x00007BC0}, {0x0000FAAE, 0x00007C7B}, {0x0000FAAF, 0x00007D5B}, {0x0000FAB0, 0x00007DF4}, -{0x0000FAB1, 0x00007F3E}, {0x0000FAB2, 0x00008005}, {0x0000FAB3, 0x00008352}, {0x0000FAB4, 0x000083EF}, -{0x0000FAB5, 0x00008779}, {0x0000FAB6, 0x00008941}, {0x0000FAB7, 0x00008986}, {0x0000FAB8, 0x00008996}, -{0x0000FAB9, 0x00008ABF}, {0x0000FABA, 0x00008AF8}, {0x0000FABB, 0x00008ACB}, {0x0000FABC, 0x00008B01}, -{0x0000FABD, 0x00008AFE}, {0x0000FABE, 0x00008AED}, {0x0000FABF, 0x00008B39}, {0x0000FAC0, 0x00008B8A}, -{0x0000FAC1, 0x00008D08}, {0x0000FAC2, 0x00008F38}, {0x0000FAC3, 0x00009072}, {0x0000FAC4, 0x00009199}, -{0x0000FAC5, 0x00009276}, {0x0000FAC6, 0x0000967C}, {0x0000FAC7, 0x000096E3}, {0x0000FAC8, 0x00009756}, -{0x0000FAC9, 0x000097DB}, {0x0000FACA, 0x000097FF}, {0x0000FACB, 0x0000980B}, {0x0000FACC, 0x0000983B}, -{0x0000FACD, 0x00009B12}, {0x0000FACE, 0x00009F9C}, {0x0000FACF, 0x0002284A}, {0x0000FAD0, 0x00022844}, 
-{0x0000FAD1, 0x000233D5}, {0x0000FAD2, 0x00003B9D}, {0x0000FAD3, 0x00004018}, {0x0000FAD4, 0x00004039}, -{0x0000FAD5, 0x00025249}, {0x0000FAD6, 0x00025CD0}, {0x0000FAD7, 0x00027ED3}, {0x0000FAD8, 0x00009F43}, -{0x0000FAD9, 0x00009F8E}, {0x0000FB1D, 0x000005D9}, {0x0000FB1D, 0x000005B4}, {0x0000FB1F, 0x000005F2}, -{0x0000FB1F, 0x000005B7}, {0x0000FB2A, 0x000005E9}, {0x0000FB2A, 0x000005C1}, {0x0000FB2B, 0x000005E9}, -{0x0000FB2B, 0x000005C2}, {0x0000FB2C, 0x000005E9}, {0x0000FB2C, 0x000005BC}, {0x0000FB2C, 0x000005C1}, -{0x0000FB2D, 0x000005E9}, {0x0000FB2D, 0x000005BC}, {0x0000FB2D, 0x000005C2}, {0x0000FB2E, 0x000005D0}, -{0x0000FB2E, 0x000005B7}, {0x0000FB2F, 0x000005D0}, {0x0000FB2F, 0x000005B8}, {0x0000FB30, 0x000005D0}, -{0x0000FB30, 0x000005BC}, {0x0000FB31, 0x000005D1}, {0x0000FB31, 0x000005BC}, {0x0000FB32, 0x000005D2}, -{0x0000FB32, 0x000005BC}, {0x0000FB33, 0x000005D3}, {0x0000FB33, 0x000005BC}, {0x0000FB34, 0x000005D4}, -{0x0000FB34, 0x000005BC}, {0x0000FB35, 0x000005D5}, {0x0000FB35, 0x000005BC}, {0x0000FB36, 0x000005D6}, -{0x0000FB36, 0x000005BC}, {0x0000FB38, 0x000005D8}, {0x0000FB38, 0x000005BC}, {0x0000FB39, 0x000005D9}, -{0x0000FB39, 0x000005BC}, {0x0000FB3A, 0x000005DA}, {0x0000FB3A, 0x000005BC}, {0x0000FB3B, 0x000005DB}, -{0x0000FB3B, 0x000005BC}, {0x0000FB3C, 0x000005DC}, {0x0000FB3C, 0x000005BC}, {0x0000FB3E, 0x000005DE}, -{0x0000FB3E, 0x000005BC}, {0x0000FB40, 0x000005E0}, {0x0000FB40, 0x000005BC}, {0x0000FB41, 0x000005E1}, -{0x0000FB41, 0x000005BC}, {0x0000FB43, 0x000005E3}, {0x0000FB43, 0x000005BC}, {0x0000FB44, 0x000005E4}, -{0x0000FB44, 0x000005BC}, {0x0000FB46, 0x000005E6}, {0x0000FB46, 0x000005BC}, {0x0000FB47, 0x000005E7}, -{0x0000FB47, 0x000005BC}, {0x0000FB48, 0x000005E8}, {0x0000FB48, 0x000005BC}, {0x0000FB49, 0x000005E9}, -{0x0000FB49, 0x000005BC}, {0x0000FB4A, 0x000005EA}, {0x0000FB4A, 0x000005BC}, {0x0000FB4B, 0x000005D5}, -{0x0000FB4B, 0x000005B9}, {0x0000FB4C, 0x000005D1}, {0x0000FB4C, 0x000005BF}, {0x0000FB4D, 0x000005DB}, 
-{0x0000FB4D, 0x000005BF}, {0x0000FB4E, 0x000005E4}, {0x0000FB4E, 0x000005BF}, {0x0001109A, 0x00011099}, -{0x0001109A, 0x000110BA}, {0x0001109C, 0x0001109B}, {0x0001109C, 0x000110BA}, {0x000110AB, 0x000110A5}, -{0x000110AB, 0x000110BA}, {0x0001112E, 0x00011131}, {0x0001112E, 0x00011127}, {0x0001112F, 0x00011132}, -{0x0001112F, 0x00011127}, {0x0001134B, 0x00011347}, {0x0001134B, 0x0001133E}, {0x0001134C, 0x00011347}, -{0x0001134C, 0x00011357}, {0x000114BB, 0x000114B9}, {0x000114BB, 0x000114BA}, {0x000114BC, 0x000114B9}, -{0x000114BC, 0x000114B0}, {0x000114BE, 0x000114B9}, {0x000114BE, 0x000114BD}, {0x000115BA, 0x000115B8}, -{0x000115BA, 0x000115AF}, {0x000115BB, 0x000115B9}, {0x000115BB, 0x000115AF}, {0x0001D15E, 0x0001D157}, -{0x0001D15E, 0x0001D165}, {0x0001D15F, 0x0001D158}, {0x0001D15F, 0x0001D165}, {0x0001D160, 0x0001D158}, -{0x0001D160, 0x0001D165}, {0x0001D160, 0x0001D16E}, {0x0001D161, 0x0001D158}, {0x0001D161, 0x0001D165}, -{0x0001D161, 0x0001D16F}, {0x0001D162, 0x0001D158}, {0x0001D162, 0x0001D165}, {0x0001D162, 0x0001D170}, -{0x0001D163, 0x0001D158}, {0x0001D163, 0x0001D165}, {0x0001D163, 0x0001D171}, {0x0001D164, 0x0001D158}, -{0x0001D164, 0x0001D165}, {0x0001D164, 0x0001D172}, {0x0001D1BB, 0x0001D1B9}, {0x0001D1BB, 0x0001D165}, -{0x0001D1BC, 0x0001D1BA}, {0x0001D1BC, 0x0001D165}, {0x0001D1BD, 0x0001D1B9}, {0x0001D1BD, 0x0001D165}, -{0x0001D1BD, 0x0001D16E}, {0x0001D1BE, 0x0001D1BA}, {0x0001D1BE, 0x0001D165}, {0x0001D1BE, 0x0001D16E}, -{0x0001D1BF, 0x0001D1B9}, {0x0001D1BF, 0x0001D165}, {0x0001D1BF, 0x0001D16F}, {0x0001D1C0, 0x0001D1BA}, -{0x0001D1C0, 0x0001D165}, {0x0001D1C0, 0x0001D16F}, {0x0002F800, 0x00004E3D}, {0x0002F801, 0x00004E38}, -{0x0002F802, 0x00004E41}, {0x0002F803, 0x00020122}, {0x0002F804, 0x00004F60}, {0x0002F805, 0x00004FAE}, -{0x0002F806, 0x00004FBB}, {0x0002F807, 0x00005002}, {0x0002F808, 0x0000507A}, {0x0002F809, 0x00005099}, -{0x0002F80A, 0x000050E7}, {0x0002F80B, 0x000050CF}, {0x0002F80C, 0x0000349E}, {0x0002F80D, 0x0002063A}, 
-{0x0002F80E, 0x0000514D}, {0x0002F80F, 0x00005154}, {0x0002F810, 0x00005164}, {0x0002F811, 0x00005177}, -{0x0002F812, 0x0002051C}, {0x0002F813, 0x000034B9}, {0x0002F814, 0x00005167}, {0x0002F815, 0x0000518D}, -{0x0002F816, 0x0002054B}, {0x0002F817, 0x00005197}, {0x0002F818, 0x000051A4}, {0x0002F819, 0x00004ECC}, -{0x0002F81A, 0x000051AC}, {0x0002F81B, 0x000051B5}, {0x0002F81C, 0x000291DF}, {0x0002F81D, 0x000051F5}, -{0x0002F81E, 0x00005203}, {0x0002F81F, 0x000034DF}, {0x0002F820, 0x0000523B}, {0x0002F821, 0x00005246}, -{0x0002F822, 0x00005272}, {0x0002F823, 0x00005277}, {0x0002F824, 0x00003515}, {0x0002F825, 0x000052C7}, -{0x0002F826, 0x000052C9}, {0x0002F827, 0x000052E4}, {0x0002F828, 0x000052FA}, {0x0002F829, 0x00005305}, -{0x0002F82A, 0x00005306}, {0x0002F82B, 0x00005317}, {0x0002F82C, 0x00005349}, {0x0002F82D, 0x00005351}, -{0x0002F82E, 0x0000535A}, {0x0002F82F, 0x00005373}, {0x0002F830, 0x0000537D}, {0x0002F831, 0x0000537F}, -{0x0002F832, 0x0000537F}, {0x0002F833, 0x0000537F}, {0x0002F834, 0x00020A2C}, {0x0002F835, 0x00007070}, -{0x0002F836, 0x000053CA}, {0x0002F837, 0x000053DF}, {0x0002F838, 0x00020B63}, {0x0002F839, 0x000053EB}, -{0x0002F83A, 0x000053F1}, {0x0002F83B, 0x00005406}, {0x0002F83C, 0x0000549E}, {0x0002F83D, 0x00005438}, -{0x0002F83E, 0x00005448}, {0x0002F83F, 0x00005468}, {0x0002F840, 0x000054A2}, {0x0002F841, 0x000054F6}, -{0x0002F842, 0x00005510}, {0x0002F843, 0x00005553}, {0x0002F844, 0x00005563}, {0x0002F845, 0x00005584}, -{0x0002F846, 0x00005584}, {0x0002F847, 0x00005599}, {0x0002F848, 0x000055AB}, {0x0002F849, 0x000055B3}, -{0x0002F84A, 0x000055C2}, {0x0002F84B, 0x00005716}, {0x0002F84C, 0x00005606}, {0x0002F84D, 0x00005717}, -{0x0002F84E, 0x00005651}, {0x0002F84F, 0x00005674}, {0x0002F850, 0x00005207}, {0x0002F851, 0x000058EE}, -{0x0002F852, 0x000057CE}, {0x0002F853, 0x000057F4}, {0x0002F854, 0x0000580D}, {0x0002F855, 0x0000578B}, -{0x0002F856, 0x00005832}, {0x0002F857, 0x00005831}, {0x0002F858, 0x000058AC}, {0x0002F859, 0x000214E4}, 
-{0x0002F85A, 0x000058F2}, {0x0002F85B, 0x000058F7}, {0x0002F85C, 0x00005906}, {0x0002F85D, 0x0000591A}, -{0x0002F85E, 0x00005922}, {0x0002F85F, 0x00005962}, {0x0002F860, 0x000216A8}, {0x0002F861, 0x000216EA}, -{0x0002F862, 0x000059EC}, {0x0002F863, 0x00005A1B}, {0x0002F864, 0x00005A27}, {0x0002F865, 0x000059D8}, -{0x0002F866, 0x00005A66}, {0x0002F867, 0x000036EE}, {0x0002F868, 0x000036FC}, {0x0002F869, 0x00005B08}, -{0x0002F86A, 0x00005B3E}, {0x0002F86B, 0x00005B3E}, {0x0002F86C, 0x000219C8}, {0x0002F86D, 0x00005BC3}, -{0x0002F86E, 0x00005BD8}, {0x0002F86F, 0x00005BE7}, {0x0002F870, 0x00005BF3}, {0x0002F871, 0x00021B18}, -{0x0002F872, 0x00005BFF}, {0x0002F873, 0x00005C06}, {0x0002F874, 0x00005F53}, {0x0002F875, 0x00005C22}, -{0x0002F876, 0x00003781}, {0x0002F877, 0x00005C60}, {0x0002F878, 0x00005C6E}, {0x0002F879, 0x00005CC0}, -{0x0002F87A, 0x00005C8D}, {0x0002F87B, 0x00021DE4}, {0x0002F87C, 0x00005D43}, {0x0002F87D, 0x00021DE6}, -{0x0002F87E, 0x00005D6E}, {0x0002F87F, 0x00005D6B}, {0x0002F880, 0x00005D7C}, {0x0002F881, 0x00005DE1}, -{0x0002F882, 0x00005DE2}, {0x0002F883, 0x0000382F}, {0x0002F884, 0x00005DFD}, {0x0002F885, 0x00005E28}, -{0x0002F886, 0x00005E3D}, {0x0002F887, 0x00005E69}, {0x0002F888, 0x00003862}, {0x0002F889, 0x00022183}, -{0x0002F88A, 0x0000387C}, {0x0002F88B, 0x00005EB0}, {0x0002F88C, 0x00005EB3}, {0x0002F88D, 0x00005EB6}, -{0x0002F88E, 0x00005ECA}, {0x0002F88F, 0x0002A392}, {0x0002F890, 0x00005EFE}, {0x0002F891, 0x00022331}, -{0x0002F892, 0x00022331}, {0x0002F893, 0x00008201}, {0x0002F894, 0x00005F22}, {0x0002F895, 0x00005F22}, -{0x0002F896, 0x000038C7}, {0x0002F897, 0x000232B8}, {0x0002F898, 0x000261DA}, {0x0002F899, 0x00005F62}, -{0x0002F89A, 0x00005F6B}, {0x0002F89B, 0x000038E3}, {0x0002F89C, 0x00005F9A}, {0x0002F89D, 0x00005FCD}, -{0x0002F89E, 0x00005FD7}, {0x0002F89F, 0x00005FF9}, {0x0002F8A0, 0x00006081}, {0x0002F8A1, 0x0000393A}, -{0x0002F8A2, 0x0000391C}, {0x0002F8A3, 0x00006094}, {0x0002F8A4, 0x000226D4}, {0x0002F8A5, 0x000060C7}, 
-{0x0002F8A6, 0x00006148}, {0x0002F8A7, 0x0000614C}, {0x0002F8A8, 0x0000614E}, {0x0002F8A9, 0x0000614C}, -{0x0002F8AA, 0x0000617A}, {0x0002F8AB, 0x0000618E}, {0x0002F8AC, 0x000061B2}, {0x0002F8AD, 0x000061A4}, -{0x0002F8AE, 0x000061AF}, {0x0002F8AF, 0x000061DE}, {0x0002F8B0, 0x000061F2}, {0x0002F8B1, 0x000061F6}, -{0x0002F8B2, 0x00006210}, {0x0002F8B3, 0x0000621B}, {0x0002F8B4, 0x0000625D}, {0x0002F8B5, 0x000062B1}, -{0x0002F8B6, 0x000062D4}, {0x0002F8B7, 0x00006350}, {0x0002F8B8, 0x00022B0C}, {0x0002F8B9, 0x0000633D}, -{0x0002F8BA, 0x000062FC}, {0x0002F8BB, 0x00006368}, {0x0002F8BC, 0x00006383}, {0x0002F8BD, 0x000063E4}, -{0x0002F8BE, 0x00022BF1}, {0x0002F8BF, 0x00006422}, {0x0002F8C0, 0x000063C5}, {0x0002F8C1, 0x000063A9}, -{0x0002F8C2, 0x00003A2E}, {0x0002F8C3, 0x00006469}, {0x0002F8C4, 0x0000647E}, {0x0002F8C5, 0x0000649D}, -{0x0002F8C6, 0x00006477}, {0x0002F8C7, 0x00003A6C}, {0x0002F8C8, 0x0000654F}, {0x0002F8C9, 0x0000656C}, -{0x0002F8CA, 0x0002300A}, {0x0002F8CB, 0x000065E3}, {0x0002F8CC, 0x000066F8}, {0x0002F8CD, 0x00006649}, -{0x0002F8CE, 0x00003B19}, {0x0002F8CF, 0x00006691}, {0x0002F8D0, 0x00003B08}, {0x0002F8D1, 0x00003AE4}, -{0x0002F8D2, 0x00005192}, {0x0002F8D3, 0x00005195}, {0x0002F8D4, 0x00006700}, {0x0002F8D5, 0x0000669C}, -{0x0002F8D6, 0x000080AD}, {0x0002F8D7, 0x000043D9}, {0x0002F8D8, 0x00006717}, {0x0002F8D9, 0x0000671B}, -{0x0002F8DA, 0x00006721}, {0x0002F8DB, 0x0000675E}, {0x0002F8DC, 0x00006753}, {0x0002F8DD, 0x000233C3}, -{0x0002F8DE, 0x00003B49}, {0x0002F8DF, 0x000067FA}, {0x0002F8E0, 0x00006785}, {0x0002F8E1, 0x00006852}, -{0x0002F8E2, 0x00006885}, {0x0002F8E3, 0x0002346D}, {0x0002F8E4, 0x0000688E}, {0x0002F8E5, 0x0000681F}, -{0x0002F8E6, 0x00006914}, {0x0002F8E7, 0x00003B9D}, {0x0002F8E8, 0x00006942}, {0x0002F8E9, 0x000069A3}, -{0x0002F8EA, 0x000069EA}, {0x0002F8EB, 0x00006AA8}, {0x0002F8EC, 0x000236A3}, {0x0002F8ED, 0x00006ADB}, -{0x0002F8EE, 0x00003C18}, {0x0002F8EF, 0x00006B21}, {0x0002F8F0, 0x000238A7}, {0x0002F8F1, 0x00006B54}, 
-{0x0002F8F2, 0x00003C4E}, {0x0002F8F3, 0x00006B72}, {0x0002F8F4, 0x00006B9F}, {0x0002F8F5, 0x00006BBA}, -{0x0002F8F6, 0x00006BBB}, {0x0002F8F7, 0x00023A8D}, {0x0002F8F8, 0x00021D0B}, {0x0002F8F9, 0x00023AFA}, -{0x0002F8FA, 0x00006C4E}, {0x0002F8FB, 0x00023CBC}, {0x0002F8FC, 0x00006CBF}, {0x0002F8FD, 0x00006CCD}, -{0x0002F8FE, 0x00006C67}, {0x0002F8FF, 0x00006D16}, {0x0002F900, 0x00006D3E}, {0x0002F901, 0x00006D77}, -{0x0002F902, 0x00006D41}, {0x0002F903, 0x00006D69}, {0x0002F904, 0x00006D78}, {0x0002F905, 0x00006D85}, -{0x0002F906, 0x00023D1E}, {0x0002F907, 0x00006D34}, {0x0002F908, 0x00006E2F}, {0x0002F909, 0x00006E6E}, -{0x0002F90A, 0x00003D33}, {0x0002F90B, 0x00006ECB}, {0x0002F90C, 0x00006EC7}, {0x0002F90D, 0x00023ED1}, -{0x0002F90E, 0x00006DF9}, {0x0002F90F, 0x00006F6E}, {0x0002F910, 0x00023F5E}, {0x0002F911, 0x00023F8E}, -{0x0002F912, 0x00006FC6}, {0x0002F913, 0x00007039}, {0x0002F914, 0x0000701E}, {0x0002F915, 0x0000701B}, -{0x0002F916, 0x00003D96}, {0x0002F917, 0x0000704A}, {0x0002F918, 0x0000707D}, {0x0002F919, 0x00007077}, -{0x0002F91A, 0x000070AD}, {0x0002F91B, 0x00020525}, {0x0002F91C, 0x00007145}, {0x0002F91D, 0x00024263}, -{0x0002F91E, 0x0000719C}, {0x0002F91F, 0x000243AB}, {0x0002F920, 0x00007228}, {0x0002F921, 0x00007235}, -{0x0002F922, 0x00007250}, {0x0002F923, 0x00024608}, {0x0002F924, 0x00007280}, {0x0002F925, 0x00007295}, -{0x0002F926, 0x00024735}, {0x0002F927, 0x00024814}, {0x0002F928, 0x0000737A}, {0x0002F929, 0x0000738B}, -{0x0002F92A, 0x00003EAC}, {0x0002F92B, 0x000073A5}, {0x0002F92C, 0x00003EB8}, {0x0002F92D, 0x00003EB8}, -{0x0002F92E, 0x00007447}, {0x0002F92F, 0x0000745C}, {0x0002F930, 0x00007471}, {0x0002F931, 0x00007485}, -{0x0002F932, 0x000074CA}, {0x0002F933, 0x00003F1B}, {0x0002F934, 0x00007524}, {0x0002F935, 0x00024C36}, -{0x0002F936, 0x0000753E}, {0x0002F937, 0x00024C92}, {0x0002F938, 0x00007570}, {0x0002F939, 0x0002219F}, -{0x0002F93A, 0x00007610}, {0x0002F93B, 0x00024FA1}, {0x0002F93C, 0x00024FB8}, {0x0002F93D, 0x00025044}, 
-{0x0002F93E, 0x00003FFC}, {0x0002F93F, 0x00004008}, {0x0002F940, 0x000076F4}, {0x0002F941, 0x000250F3}, -{0x0002F942, 0x000250F2}, {0x0002F943, 0x00025119}, {0x0002F944, 0x00025133}, {0x0002F945, 0x0000771E}, -{0x0002F946, 0x0000771F}, {0x0002F947, 0x0000771F}, {0x0002F948, 0x0000774A}, {0x0002F949, 0x00004039}, -{0x0002F94A, 0x0000778B}, {0x0002F94B, 0x00004046}, {0x0002F94C, 0x00004096}, {0x0002F94D, 0x0002541D}, -{0x0002F94E, 0x0000784E}, {0x0002F94F, 0x0000788C}, {0x0002F950, 0x000078CC}, {0x0002F951, 0x000040E3}, -{0x0002F952, 0x00025626}, {0x0002F953, 0x00007956}, {0x0002F954, 0x0002569A}, {0x0002F955, 0x000256C5}, -{0x0002F956, 0x0000798F}, {0x0002F957, 0x000079EB}, {0x0002F958, 0x0000412F}, {0x0002F959, 0x00007A40}, -{0x0002F95A, 0x00007A4A}, {0x0002F95B, 0x00007A4F}, {0x0002F95C, 0x0002597C}, {0x0002F95D, 0x00025AA7}, -{0x0002F95E, 0x00025AA7}, {0x0002F95F, 0x00007AEE}, {0x0002F960, 0x00004202}, {0x0002F961, 0x00025BAB}, -{0x0002F962, 0x00007BC6}, {0x0002F963, 0x00007BC9}, {0x0002F964, 0x00004227}, {0x0002F965, 0x00025C80}, -{0x0002F966, 0x00007CD2}, {0x0002F967, 0x000042A0}, {0x0002F968, 0x00007CE8}, {0x0002F969, 0x00007CE3}, -{0x0002F96A, 0x00007D00}, {0x0002F96B, 0x00025F86}, {0x0002F96C, 0x00007D63}, {0x0002F96D, 0x00004301}, -{0x0002F96E, 0x00007DC7}, {0x0002F96F, 0x00007E02}, {0x0002F970, 0x00007E45}, {0x0002F971, 0x00004334}, -{0x0002F972, 0x00026228}, {0x0002F973, 0x00026247}, {0x0002F974, 0x00004359}, {0x0002F975, 0x000262D9}, -{0x0002F976, 0x00007F7A}, {0x0002F977, 0x0002633E}, {0x0002F978, 0x00007F95}, {0x0002F979, 0x00007FFA}, -{0x0002F97A, 0x00008005}, {0x0002F97B, 0x000264DA}, {0x0002F97C, 0x00026523}, {0x0002F97D, 0x00008060}, -{0x0002F97E, 0x000265A8}, {0x0002F97F, 0x00008070}, {0x0002F980, 0x0002335F}, {0x0002F981, 0x000043D5}, -{0x0002F982, 0x000080B2}, {0x0002F983, 0x00008103}, {0x0002F984, 0x0000440B}, {0x0002F985, 0x0000813E}, -{0x0002F986, 0x00005AB5}, {0x0002F987, 0x000267A7}, {0x0002F988, 0x000267B5}, {0x0002F989, 0x00023393}, 
-{0x0002F98A, 0x0002339C}, {0x0002F98B, 0x00008201}, {0x0002F98C, 0x00008204}, {0x0002F98D, 0x00008F9E}, -{0x0002F98E, 0x0000446B}, {0x0002F98F, 0x00008291}, {0x0002F990, 0x0000828B}, {0x0002F991, 0x0000829D}, -{0x0002F992, 0x000052B3}, {0x0002F993, 0x000082B1}, {0x0002F994, 0x000082B3}, {0x0002F995, 0x000082BD}, -{0x0002F996, 0x000082E6}, {0x0002F997, 0x00026B3C}, {0x0002F998, 0x000082E5}, {0x0002F999, 0x0000831D}, -{0x0002F99A, 0x00008363}, {0x0002F99B, 0x000083AD}, {0x0002F99C, 0x00008323}, {0x0002F99D, 0x000083BD}, -{0x0002F99E, 0x000083E7}, {0x0002F99F, 0x00008457}, {0x0002F9A0, 0x00008353}, {0x0002F9A1, 0x000083CA}, -{0x0002F9A2, 0x000083CC}, {0x0002F9A3, 0x000083DC}, {0x0002F9A4, 0x00026C36}, {0x0002F9A5, 0x00026D6B}, -{0x0002F9A6, 0x00026CD5}, {0x0002F9A7, 0x0000452B}, {0x0002F9A8, 0x000084F1}, {0x0002F9A9, 0x000084F3}, -{0x0002F9AA, 0x00008516}, {0x0002F9AB, 0x000273CA}, {0x0002F9AC, 0x00008564}, {0x0002F9AD, 0x00026F2C}, -{0x0002F9AE, 0x0000455D}, {0x0002F9AF, 0x00004561}, {0x0002F9B0, 0x00026FB1}, {0x0002F9B1, 0x000270D2}, -{0x0002F9B2, 0x0000456B}, {0x0002F9B3, 0x00008650}, {0x0002F9B4, 0x0000865C}, {0x0002F9B5, 0x00008667}, -{0x0002F9B6, 0x00008669}, {0x0002F9B7, 0x000086A9}, {0x0002F9B8, 0x00008688}, {0x0002F9B9, 0x0000870E}, -{0x0002F9BA, 0x000086E2}, {0x0002F9BB, 0x00008779}, {0x0002F9BC, 0x00008728}, {0x0002F9BD, 0x0000876B}, -{0x0002F9BE, 0x00008786}, {0x0002F9BF, 0x000045D7}, {0x0002F9C0, 0x000087E1}, {0x0002F9C1, 0x00008801}, -{0x0002F9C2, 0x000045F9}, {0x0002F9C3, 0x00008860}, {0x0002F9C4, 0x00008863}, {0x0002F9C5, 0x00027667}, -{0x0002F9C6, 0x000088D7}, {0x0002F9C7, 0x000088DE}, {0x0002F9C8, 0x00004635}, {0x0002F9C9, 0x000088FA}, -{0x0002F9CA, 0x000034BB}, {0x0002F9CB, 0x000278AE}, {0x0002F9CC, 0x00027966}, {0x0002F9CD, 0x000046BE}, -{0x0002F9CE, 0x000046C7}, {0x0002F9CF, 0x00008AA0}, {0x0002F9D0, 0x00008AED}, {0x0002F9D1, 0x00008B8A}, -{0x0002F9D2, 0x00008C55}, {0x0002F9D3, 0x00027CA8}, {0x0002F9D4, 0x00008CAB}, {0x0002F9D5, 0x00008CC1}, 
-{0x0002F9D6, 0x00008D1B}, {0x0002F9D7, 0x00008D77}, {0x0002F9D8, 0x00027F2F}, {0x0002F9D9, 0x00020804}, -{0x0002F9DA, 0x00008DCB}, {0x0002F9DB, 0x00008DBC}, {0x0002F9DC, 0x00008DF0}, {0x0002F9DD, 0x000208DE}, -{0x0002F9DE, 0x00008ED4}, {0x0002F9DF, 0x00008F38}, {0x0002F9E0, 0x000285D2}, {0x0002F9E1, 0x000285ED}, -{0x0002F9E2, 0x00009094}, {0x0002F9E3, 0x000090F1}, {0x0002F9E4, 0x00009111}, {0x0002F9E5, 0x0002872E}, -{0x0002F9E6, 0x0000911B}, {0x0002F9E7, 0x00009238}, {0x0002F9E8, 0x000092D7}, {0x0002F9E9, 0x000092D8}, -{0x0002F9EA, 0x0000927C}, {0x0002F9EB, 0x000093F9}, {0x0002F9EC, 0x00009415}, {0x0002F9ED, 0x00028BFA}, -{0x0002F9EE, 0x0000958B}, {0x0002F9EF, 0x00004995}, {0x0002F9F0, 0x000095B7}, {0x0002F9F1, 0x00028D77}, -{0x0002F9F2, 0x000049E6}, {0x0002F9F3, 0x000096C3}, {0x0002F9F4, 0x00005DB2}, {0x0002F9F5, 0x00009723}, -{0x0002F9F6, 0x00029145}, {0x0002F9F7, 0x0002921A}, {0x0002F9F8, 0x00004A6E}, {0x0002F9F9, 0x00004A76}, -{0x0002F9FA, 0x000097E0}, {0x0002F9FB, 0x0002940A}, {0x0002F9FC, 0x00004AB2}, {0x0002F9FD, 0x00029496}, -{0x0002F9FE, 0x0000980B}, {0x0002F9FF, 0x0000980B}, {0x0002FA00, 0x00009829}, {0x0002FA01, 0x000295B6}, -{0x0002FA02, 0x000098E2}, {0x0002FA03, 0x00004B33}, {0x0002FA04, 0x00009929}, {0x0002FA05, 0x000099A7}, -{0x0002FA06, 0x000099C2}, {0x0002FA07, 0x000099FE}, {0x0002FA08, 0x00004BCE}, {0x0002FA09, 0x00029B30}, -{0x0002FA0A, 0x00009B12}, {0x0002FA0B, 0x00009C40}, {0x0002FA0C, 0x00009CFD}, {0x0002FA0D, 0x00004CCE}, -{0x0002FA0E, 0x00004CED}, {0x0002FA0F, 0x00009D67}, {0x0002FA10, 0x0002A0CE}, {0x0002FA11, 0x00004CF8}, -{0x0002FA12, 0x0002A105}, {0x0002FA13, 0x0002A20E}, {0x0002FA14, 0x0002A291}, {0x0002FA15, 0x00009EBB}, -{0x0002FA16, 0x00004D56}, {0x0002FA17, 0x00009EF9}, {0x0002FA18, 0x00009EFE}, {0x0002FA19, 0x00009F05}, -{0x0002FA1A, 0x00009F0F}, {0x0002FA1B, 0x00009F16}, {0x0002FA1D, 0x0002A600}, +const std::vector unicode_ranges_nfd = { // start, last, nfd +{0x000000, 0x000000, 0x000000}, +{0x0000C0, 0x0000C5, 0x000041}, 
+{0x0000C7, 0x0000C7, 0x000043}, +{0x0000C8, 0x0000CB, 0x000045}, +{0x0000CC, 0x0000CF, 0x000049}, +{0x0000D1, 0x0000D1, 0x00004E}, +{0x0000D2, 0x0000D6, 0x00004F}, +{0x0000D9, 0x0000DC, 0x000055}, +{0x0000DD, 0x0000DD, 0x000059}, +{0x0000E0, 0x0000E5, 0x000061}, +{0x0000E7, 0x0000E7, 0x000063}, +{0x0000E8, 0x0000EB, 0x000065}, +{0x0000EC, 0x0000EF, 0x000069}, +{0x0000F1, 0x0000F1, 0x00006E}, +{0x0000F2, 0x0000F6, 0x00006F}, +{0x0000F9, 0x0000FC, 0x000075}, +{0x0000FD, 0x0000FD, 0x000079}, +{0x0000FF, 0x0000FF, 0x000079}, +{0x000100, 0x000100, 0x000041}, +{0x000101, 0x000101, 0x000061}, +{0x000102, 0x000102, 0x000041}, +{0x000103, 0x000103, 0x000061}, +{0x000104, 0x000104, 0x000041}, +{0x000105, 0x000105, 0x000061}, +{0x000106, 0x000106, 0x000043}, +{0x000107, 0x000107, 0x000063}, +{0x000108, 0x000108, 0x000043}, +{0x000109, 0x000109, 0x000063}, +{0x00010A, 0x00010A, 0x000043}, +{0x00010B, 0x00010B, 0x000063}, +{0x00010C, 0x00010C, 0x000043}, +{0x00010D, 0x00010D, 0x000063}, +{0x00010E, 0x00010E, 0x000044}, +{0x00010F, 0x00010F, 0x000064}, +{0x000112, 0x000112, 0x000045}, +{0x000113, 0x000113, 0x000065}, +{0x000114, 0x000114, 0x000045}, +{0x000115, 0x000115, 0x000065}, +{0x000116, 0x000116, 0x000045}, +{0x000117, 0x000117, 0x000065}, +{0x000118, 0x000118, 0x000045}, +{0x000119, 0x000119, 0x000065}, +{0x00011A, 0x00011A, 0x000045}, +{0x00011B, 0x00011B, 0x000065}, +{0x00011C, 0x00011C, 0x000047}, +{0x00011D, 0x00011D, 0x000067}, +{0x00011E, 0x00011E, 0x000047}, +{0x00011F, 0x00011F, 0x000067}, +{0x000120, 0x000120, 0x000047}, +{0x000121, 0x000121, 0x000067}, +{0x000122, 0x000122, 0x000047}, +{0x000123, 0x000123, 0x000067}, +{0x000124, 0x000124, 0x000048}, +{0x000125, 0x000125, 0x000068}, +{0x000128, 0x000128, 0x000049}, +{0x000129, 0x000129, 0x000069}, +{0x00012A, 0x00012A, 0x000049}, +{0x00012B, 0x00012B, 0x000069}, +{0x00012C, 0x00012C, 0x000049}, +{0x00012D, 0x00012D, 0x000069}, +{0x00012E, 0x00012E, 0x000049}, +{0x00012F, 0x00012F, 0x000069}, +{0x000130, 
0x000130, 0x000049}, +{0x000134, 0x000134, 0x00004A}, +{0x000135, 0x000135, 0x00006A}, +{0x000136, 0x000136, 0x00004B}, +{0x000137, 0x000137, 0x00006B}, +{0x000139, 0x000139, 0x00004C}, +{0x00013A, 0x00013A, 0x00006C}, +{0x00013B, 0x00013B, 0x00004C}, +{0x00013C, 0x00013C, 0x00006C}, +{0x00013D, 0x00013D, 0x00004C}, +{0x00013E, 0x00013E, 0x00006C}, +{0x000143, 0x000143, 0x00004E}, +{0x000144, 0x000144, 0x00006E}, +{0x000145, 0x000145, 0x00004E}, +{0x000146, 0x000146, 0x00006E}, +{0x000147, 0x000147, 0x00004E}, +{0x000148, 0x000148, 0x00006E}, +{0x00014C, 0x00014C, 0x00004F}, +{0x00014D, 0x00014D, 0x00006F}, +{0x00014E, 0x00014E, 0x00004F}, +{0x00014F, 0x00014F, 0x00006F}, +{0x000150, 0x000150, 0x00004F}, +{0x000151, 0x000151, 0x00006F}, +{0x000154, 0x000154, 0x000052}, +{0x000155, 0x000155, 0x000072}, +{0x000156, 0x000156, 0x000052}, +{0x000157, 0x000157, 0x000072}, +{0x000158, 0x000158, 0x000052}, +{0x000159, 0x000159, 0x000072}, +{0x00015A, 0x00015A, 0x000053}, +{0x00015B, 0x00015B, 0x000073}, +{0x00015C, 0x00015C, 0x000053}, +{0x00015D, 0x00015D, 0x000073}, +{0x00015E, 0x00015E, 0x000053}, +{0x00015F, 0x00015F, 0x000073}, +{0x000160, 0x000160, 0x000053}, +{0x000161, 0x000161, 0x000073}, +{0x000162, 0x000162, 0x000054}, +{0x000163, 0x000163, 0x000074}, +{0x000164, 0x000164, 0x000054}, +{0x000165, 0x000165, 0x000074}, +{0x000168, 0x000168, 0x000055}, +{0x000169, 0x000169, 0x000075}, +{0x00016A, 0x00016A, 0x000055}, +{0x00016B, 0x00016B, 0x000075}, +{0x00016C, 0x00016C, 0x000055}, +{0x00016D, 0x00016D, 0x000075}, +{0x00016E, 0x00016E, 0x000055}, +{0x00016F, 0x00016F, 0x000075}, +{0x000170, 0x000170, 0x000055}, +{0x000171, 0x000171, 0x000075}, +{0x000172, 0x000172, 0x000055}, +{0x000173, 0x000173, 0x000075}, +{0x000174, 0x000174, 0x000057}, +{0x000175, 0x000175, 0x000077}, +{0x000176, 0x000176, 0x000059}, +{0x000177, 0x000177, 0x000079}, +{0x000178, 0x000178, 0x000059}, +{0x000179, 0x000179, 0x00005A}, +{0x00017A, 0x00017A, 0x00007A}, +{0x00017B, 0x00017B, 
0x00005A}, +{0x00017C, 0x00017C, 0x00007A}, +{0x00017D, 0x00017D, 0x00005A}, +{0x00017E, 0x00017E, 0x00007A}, +{0x0001A0, 0x0001A0, 0x00004F}, +{0x0001A1, 0x0001A1, 0x00006F}, +{0x0001AF, 0x0001AF, 0x000055}, +{0x0001B0, 0x0001B0, 0x000075}, +{0x0001CD, 0x0001CD, 0x000041}, +{0x0001CE, 0x0001CE, 0x000061}, +{0x0001CF, 0x0001CF, 0x000049}, +{0x0001D0, 0x0001D0, 0x000069}, +{0x0001D1, 0x0001D1, 0x00004F}, +{0x0001D2, 0x0001D2, 0x00006F}, +{0x0001D3, 0x0001D3, 0x000055}, +{0x0001D4, 0x0001D4, 0x000075}, +{0x0001D5, 0x0001D5, 0x000055}, +{0x0001D6, 0x0001D6, 0x000075}, +{0x0001D7, 0x0001D7, 0x000055}, +{0x0001D8, 0x0001D8, 0x000075}, +{0x0001D9, 0x0001D9, 0x000055}, +{0x0001DA, 0x0001DA, 0x000075}, +{0x0001DB, 0x0001DB, 0x000055}, +{0x0001DC, 0x0001DC, 0x000075}, +{0x0001DE, 0x0001DE, 0x000041}, +{0x0001DF, 0x0001DF, 0x000061}, +{0x0001E0, 0x0001E0, 0x000041}, +{0x0001E1, 0x0001E1, 0x000061}, +{0x0001E2, 0x0001E2, 0x0000C6}, +{0x0001E3, 0x0001E3, 0x0000E6}, +{0x0001E6, 0x0001E6, 0x000047}, +{0x0001E7, 0x0001E7, 0x000067}, +{0x0001E8, 0x0001E8, 0x00004B}, +{0x0001E9, 0x0001E9, 0x00006B}, +{0x0001EA, 0x0001EA, 0x00004F}, +{0x0001EB, 0x0001EB, 0x00006F}, +{0x0001EC, 0x0001EC, 0x00004F}, +{0x0001ED, 0x0001ED, 0x00006F}, +{0x0001EE, 0x0001EE, 0x0001B7}, +{0x0001EF, 0x0001EF, 0x000292}, +{0x0001F0, 0x0001F0, 0x00006A}, +{0x0001F4, 0x0001F4, 0x000047}, +{0x0001F5, 0x0001F5, 0x000067}, +{0x0001F8, 0x0001F8, 0x00004E}, +{0x0001F9, 0x0001F9, 0x00006E}, +{0x0001FA, 0x0001FA, 0x000041}, +{0x0001FB, 0x0001FB, 0x000061}, +{0x0001FC, 0x0001FC, 0x0000C6}, +{0x0001FD, 0x0001FD, 0x0000E6}, +{0x0001FE, 0x0001FE, 0x0000D8}, +{0x0001FF, 0x0001FF, 0x0000F8}, +{0x000200, 0x000200, 0x000041}, +{0x000201, 0x000201, 0x000061}, +{0x000202, 0x000202, 0x000041}, +{0x000203, 0x000203, 0x000061}, +{0x000204, 0x000204, 0x000045}, +{0x000205, 0x000205, 0x000065}, +{0x000206, 0x000206, 0x000045}, +{0x000207, 0x000207, 0x000065}, +{0x000208, 0x000208, 0x000049}, +{0x000209, 0x000209, 0x000069}, 
+{0x00020A, 0x00020A, 0x000049}, +{0x00020B, 0x00020B, 0x000069}, +{0x00020C, 0x00020C, 0x00004F}, +{0x00020D, 0x00020D, 0x00006F}, +{0x00020E, 0x00020E, 0x00004F}, +{0x00020F, 0x00020F, 0x00006F}, +{0x000210, 0x000210, 0x000052}, +{0x000211, 0x000211, 0x000072}, +{0x000212, 0x000212, 0x000052}, +{0x000213, 0x000213, 0x000072}, +{0x000214, 0x000214, 0x000055}, +{0x000215, 0x000215, 0x000075}, +{0x000216, 0x000216, 0x000055}, +{0x000217, 0x000217, 0x000075}, +{0x000218, 0x000218, 0x000053}, +{0x000219, 0x000219, 0x000073}, +{0x00021A, 0x00021A, 0x000054}, +{0x00021B, 0x00021B, 0x000074}, +{0x00021E, 0x00021E, 0x000048}, +{0x00021F, 0x00021F, 0x000068}, +{0x000226, 0x000226, 0x000041}, +{0x000227, 0x000227, 0x000061}, +{0x000228, 0x000228, 0x000045}, +{0x000229, 0x000229, 0x000065}, +{0x00022A, 0x00022A, 0x00004F}, +{0x00022B, 0x00022B, 0x00006F}, +{0x00022C, 0x00022C, 0x00004F}, +{0x00022D, 0x00022D, 0x00006F}, +{0x00022E, 0x00022E, 0x00004F}, +{0x00022F, 0x00022F, 0x00006F}, +{0x000230, 0x000230, 0x00004F}, +{0x000231, 0x000231, 0x00006F}, +{0x000232, 0x000232, 0x000059}, +{0x000233, 0x000233, 0x000079}, +{0x000340, 0x000340, 0x000300}, +{0x000341, 0x000341, 0x000301}, +{0x000343, 0x000343, 0x000313}, +{0x000344, 0x000344, 0x000308}, +{0x000374, 0x000374, 0x0002B9}, +{0x00037E, 0x00037E, 0x00003B}, +{0x000385, 0x000385, 0x0000A8}, +{0x000386, 0x000386, 0x000391}, +{0x000387, 0x000387, 0x0000B7}, +{0x000388, 0x000388, 0x000395}, +{0x000389, 0x000389, 0x000397}, +{0x00038A, 0x00038A, 0x000399}, +{0x00038C, 0x00038C, 0x00039F}, +{0x00038E, 0x00038E, 0x0003A5}, +{0x00038F, 0x00038F, 0x0003A9}, +{0x000390, 0x000390, 0x0003B9}, +{0x0003AA, 0x0003AA, 0x000399}, +{0x0003AB, 0x0003AB, 0x0003A5}, +{0x0003AC, 0x0003AC, 0x0003B1}, +{0x0003AD, 0x0003AD, 0x0003B5}, +{0x0003AE, 0x0003AE, 0x0003B7}, +{0x0003AF, 0x0003AF, 0x0003B9}, +{0x0003B0, 0x0003B0, 0x0003C5}, +{0x0003CA, 0x0003CA, 0x0003B9}, +{0x0003CB, 0x0003CB, 0x0003C5}, +{0x0003CC, 0x0003CC, 0x0003BF}, +{0x0003CD, 
0x0003CD, 0x0003C5}, +{0x0003CE, 0x0003CE, 0x0003C9}, +{0x0003D3, 0x0003D4, 0x0003D2}, +{0x000400, 0x000401, 0x000415}, +{0x000403, 0x000403, 0x000413}, +{0x000407, 0x000407, 0x000406}, +{0x00040C, 0x00040C, 0x00041A}, +{0x00040D, 0x00040D, 0x000418}, +{0x00040E, 0x00040E, 0x000423}, +{0x000419, 0x000419, 0x000418}, +{0x000439, 0x000439, 0x000438}, +{0x000450, 0x000451, 0x000435}, +{0x000453, 0x000453, 0x000433}, +{0x000457, 0x000457, 0x000456}, +{0x00045C, 0x00045C, 0x00043A}, +{0x00045D, 0x00045D, 0x000438}, +{0x00045E, 0x00045E, 0x000443}, +{0x000476, 0x000476, 0x000474}, +{0x000477, 0x000477, 0x000475}, +{0x0004C1, 0x0004C1, 0x000416}, +{0x0004C2, 0x0004C2, 0x000436}, +{0x0004D0, 0x0004D0, 0x000410}, +{0x0004D1, 0x0004D1, 0x000430}, +{0x0004D2, 0x0004D2, 0x000410}, +{0x0004D3, 0x0004D3, 0x000430}, +{0x0004D6, 0x0004D6, 0x000415}, +{0x0004D7, 0x0004D7, 0x000435}, +{0x0004DA, 0x0004DA, 0x0004D8}, +{0x0004DB, 0x0004DB, 0x0004D9}, +{0x0004DC, 0x0004DC, 0x000416}, +{0x0004DD, 0x0004DD, 0x000436}, +{0x0004DE, 0x0004DE, 0x000417}, +{0x0004DF, 0x0004DF, 0x000437}, +{0x0004E2, 0x0004E2, 0x000418}, +{0x0004E3, 0x0004E3, 0x000438}, +{0x0004E4, 0x0004E4, 0x000418}, +{0x0004E5, 0x0004E5, 0x000438}, +{0x0004E6, 0x0004E6, 0x00041E}, +{0x0004E7, 0x0004E7, 0x00043E}, +{0x0004EA, 0x0004EA, 0x0004E8}, +{0x0004EB, 0x0004EB, 0x0004E9}, +{0x0004EC, 0x0004EC, 0x00042D}, +{0x0004ED, 0x0004ED, 0x00044D}, +{0x0004EE, 0x0004EE, 0x000423}, +{0x0004EF, 0x0004EF, 0x000443}, +{0x0004F0, 0x0004F0, 0x000423}, +{0x0004F1, 0x0004F1, 0x000443}, +{0x0004F2, 0x0004F2, 0x000423}, +{0x0004F3, 0x0004F3, 0x000443}, +{0x0004F4, 0x0004F4, 0x000427}, +{0x0004F5, 0x0004F5, 0x000447}, +{0x0004F8, 0x0004F8, 0x00042B}, +{0x0004F9, 0x0004F9, 0x00044B}, +{0x000622, 0x000623, 0x000627}, +{0x000624, 0x000624, 0x000648}, +{0x000625, 0x000625, 0x000627}, +{0x000626, 0x000626, 0x00064A}, +{0x0006C0, 0x0006C0, 0x0006D5}, +{0x0006C2, 0x0006C2, 0x0006C1}, +{0x0006D3, 0x0006D3, 0x0006D2}, +{0x000929, 0x000929, 
0x000928}, +{0x000931, 0x000931, 0x000930}, +{0x000934, 0x000934, 0x000933}, +{0x000958, 0x000958, 0x000915}, +{0x000959, 0x000959, 0x000916}, +{0x00095A, 0x00095A, 0x000917}, +{0x00095B, 0x00095B, 0x00091C}, +{0x00095C, 0x00095C, 0x000921}, +{0x00095D, 0x00095D, 0x000922}, +{0x00095E, 0x00095E, 0x00092B}, +{0x00095F, 0x00095F, 0x00092F}, +{0x0009CB, 0x0009CC, 0x0009C7}, +{0x0009DC, 0x0009DC, 0x0009A1}, +{0x0009DD, 0x0009DD, 0x0009A2}, +{0x0009DF, 0x0009DF, 0x0009AF}, +{0x000A33, 0x000A33, 0x000A32}, +{0x000A36, 0x000A36, 0x000A38}, +{0x000A59, 0x000A59, 0x000A16}, +{0x000A5A, 0x000A5A, 0x000A17}, +{0x000A5B, 0x000A5B, 0x000A1C}, +{0x000A5E, 0x000A5E, 0x000A2B}, +{0x000B48, 0x000B48, 0x000B47}, +{0x000B4B, 0x000B4C, 0x000B47}, +{0x000B5C, 0x000B5C, 0x000B21}, +{0x000B5D, 0x000B5D, 0x000B22}, +{0x000B94, 0x000B94, 0x000B92}, +{0x000BCA, 0x000BCA, 0x000BC6}, +{0x000BCB, 0x000BCB, 0x000BC7}, +{0x000BCC, 0x000BCC, 0x000BC6}, +{0x000C48, 0x000C48, 0x000C46}, +{0x000CC0, 0x000CC0, 0x000CBF}, +{0x000CC7, 0x000CC8, 0x000CC6}, +{0x000CCA, 0x000CCB, 0x000CC6}, +{0x000D4A, 0x000D4A, 0x000D46}, +{0x000D4B, 0x000D4B, 0x000D47}, +{0x000D4C, 0x000D4C, 0x000D46}, +{0x000DDA, 0x000DDA, 0x000DD9}, +{0x000DDC, 0x000DDE, 0x000DD9}, +{0x000F43, 0x000F43, 0x000F42}, +{0x000F4D, 0x000F4D, 0x000F4C}, +{0x000F52, 0x000F52, 0x000F51}, +{0x000F57, 0x000F57, 0x000F56}, +{0x000F5C, 0x000F5C, 0x000F5B}, +{0x000F69, 0x000F69, 0x000F40}, +{0x000F73, 0x000F73, 0x000F71}, +{0x000F75, 0x000F75, 0x000F71}, +{0x000F76, 0x000F76, 0x000FB2}, +{0x000F78, 0x000F78, 0x000FB3}, +{0x000F81, 0x000F81, 0x000F71}, +{0x000F93, 0x000F93, 0x000F92}, +{0x000F9D, 0x000F9D, 0x000F9C}, +{0x000FA2, 0x000FA2, 0x000FA1}, +{0x000FA7, 0x000FA7, 0x000FA6}, +{0x000FAC, 0x000FAC, 0x000FAB}, +{0x000FB9, 0x000FB9, 0x000F90}, +{0x001026, 0x001026, 0x001025}, +{0x001B06, 0x001B06, 0x001B05}, +{0x001B08, 0x001B08, 0x001B07}, +{0x001B0A, 0x001B0A, 0x001B09}, +{0x001B0C, 0x001B0C, 0x001B0B}, +{0x001B0E, 0x001B0E, 0x001B0D}, 
+{0x001B12, 0x001B12, 0x001B11}, +{0x001B3B, 0x001B3B, 0x001B3A}, +{0x001B3D, 0x001B3D, 0x001B3C}, +{0x001B40, 0x001B40, 0x001B3E}, +{0x001B41, 0x001B41, 0x001B3F}, +{0x001B43, 0x001B43, 0x001B42}, +{0x001E00, 0x001E00, 0x000041}, +{0x001E01, 0x001E01, 0x000061}, +{0x001E02, 0x001E02, 0x000042}, +{0x001E03, 0x001E03, 0x000062}, +{0x001E04, 0x001E04, 0x000042}, +{0x001E05, 0x001E05, 0x000062}, +{0x001E06, 0x001E06, 0x000042}, +{0x001E07, 0x001E07, 0x000062}, +{0x001E08, 0x001E08, 0x000043}, +{0x001E09, 0x001E09, 0x000063}, +{0x001E0A, 0x001E0A, 0x000044}, +{0x001E0B, 0x001E0B, 0x000064}, +{0x001E0C, 0x001E0C, 0x000044}, +{0x001E0D, 0x001E0D, 0x000064}, +{0x001E0E, 0x001E0E, 0x000044}, +{0x001E0F, 0x001E0F, 0x000064}, +{0x001E10, 0x001E10, 0x000044}, +{0x001E11, 0x001E11, 0x000064}, +{0x001E12, 0x001E12, 0x000044}, +{0x001E13, 0x001E13, 0x000064}, +{0x001E14, 0x001E14, 0x000045}, +{0x001E15, 0x001E15, 0x000065}, +{0x001E16, 0x001E16, 0x000045}, +{0x001E17, 0x001E17, 0x000065}, +{0x001E18, 0x001E18, 0x000045}, +{0x001E19, 0x001E19, 0x000065}, +{0x001E1A, 0x001E1A, 0x000045}, +{0x001E1B, 0x001E1B, 0x000065}, +{0x001E1C, 0x001E1C, 0x000045}, +{0x001E1D, 0x001E1D, 0x000065}, +{0x001E1E, 0x001E1E, 0x000046}, +{0x001E1F, 0x001E1F, 0x000066}, +{0x001E20, 0x001E20, 0x000047}, +{0x001E21, 0x001E21, 0x000067}, +{0x001E22, 0x001E22, 0x000048}, +{0x001E23, 0x001E23, 0x000068}, +{0x001E24, 0x001E24, 0x000048}, +{0x001E25, 0x001E25, 0x000068}, +{0x001E26, 0x001E26, 0x000048}, +{0x001E27, 0x001E27, 0x000068}, +{0x001E28, 0x001E28, 0x000048}, +{0x001E29, 0x001E29, 0x000068}, +{0x001E2A, 0x001E2A, 0x000048}, +{0x001E2B, 0x001E2B, 0x000068}, +{0x001E2C, 0x001E2C, 0x000049}, +{0x001E2D, 0x001E2D, 0x000069}, +{0x001E2E, 0x001E2E, 0x000049}, +{0x001E2F, 0x001E2F, 0x000069}, +{0x001E30, 0x001E30, 0x00004B}, +{0x001E31, 0x001E31, 0x00006B}, +{0x001E32, 0x001E32, 0x00004B}, +{0x001E33, 0x001E33, 0x00006B}, +{0x001E34, 0x001E34, 0x00004B}, +{0x001E35, 0x001E35, 0x00006B}, +{0x001E36, 
0x001E36, 0x00004C}, +{0x001E37, 0x001E37, 0x00006C}, +{0x001E38, 0x001E38, 0x00004C}, +{0x001E39, 0x001E39, 0x00006C}, +{0x001E3A, 0x001E3A, 0x00004C}, +{0x001E3B, 0x001E3B, 0x00006C}, +{0x001E3C, 0x001E3C, 0x00004C}, +{0x001E3D, 0x001E3D, 0x00006C}, +{0x001E3E, 0x001E3E, 0x00004D}, +{0x001E3F, 0x001E3F, 0x00006D}, +{0x001E40, 0x001E40, 0x00004D}, +{0x001E41, 0x001E41, 0x00006D}, +{0x001E42, 0x001E42, 0x00004D}, +{0x001E43, 0x001E43, 0x00006D}, +{0x001E44, 0x001E44, 0x00004E}, +{0x001E45, 0x001E45, 0x00006E}, +{0x001E46, 0x001E46, 0x00004E}, +{0x001E47, 0x001E47, 0x00006E}, +{0x001E48, 0x001E48, 0x00004E}, +{0x001E49, 0x001E49, 0x00006E}, +{0x001E4A, 0x001E4A, 0x00004E}, +{0x001E4B, 0x001E4B, 0x00006E}, +{0x001E4C, 0x001E4C, 0x00004F}, +{0x001E4D, 0x001E4D, 0x00006F}, +{0x001E4E, 0x001E4E, 0x00004F}, +{0x001E4F, 0x001E4F, 0x00006F}, +{0x001E50, 0x001E50, 0x00004F}, +{0x001E51, 0x001E51, 0x00006F}, +{0x001E52, 0x001E52, 0x00004F}, +{0x001E53, 0x001E53, 0x00006F}, +{0x001E54, 0x001E54, 0x000050}, +{0x001E55, 0x001E55, 0x000070}, +{0x001E56, 0x001E56, 0x000050}, +{0x001E57, 0x001E57, 0x000070}, +{0x001E58, 0x001E58, 0x000052}, +{0x001E59, 0x001E59, 0x000072}, +{0x001E5A, 0x001E5A, 0x000052}, +{0x001E5B, 0x001E5B, 0x000072}, +{0x001E5C, 0x001E5C, 0x000052}, +{0x001E5D, 0x001E5D, 0x000072}, +{0x001E5E, 0x001E5E, 0x000052}, +{0x001E5F, 0x001E5F, 0x000072}, +{0x001E60, 0x001E60, 0x000053}, +{0x001E61, 0x001E61, 0x000073}, +{0x001E62, 0x001E62, 0x000053}, +{0x001E63, 0x001E63, 0x000073}, +{0x001E64, 0x001E64, 0x000053}, +{0x001E65, 0x001E65, 0x000073}, +{0x001E66, 0x001E66, 0x000053}, +{0x001E67, 0x001E67, 0x000073}, +{0x001E68, 0x001E68, 0x000053}, +{0x001E69, 0x001E69, 0x000073}, +{0x001E6A, 0x001E6A, 0x000054}, +{0x001E6B, 0x001E6B, 0x000074}, +{0x001E6C, 0x001E6C, 0x000054}, +{0x001E6D, 0x001E6D, 0x000074}, +{0x001E6E, 0x001E6E, 0x000054}, +{0x001E6F, 0x001E6F, 0x000074}, +{0x001E70, 0x001E70, 0x000054}, +{0x001E71, 0x001E71, 0x000074}, +{0x001E72, 0x001E72, 
0x000055}, +{0x001E73, 0x001E73, 0x000075}, +{0x001E74, 0x001E74, 0x000055}, +{0x001E75, 0x001E75, 0x000075}, +{0x001E76, 0x001E76, 0x000055}, +{0x001E77, 0x001E77, 0x000075}, +{0x001E78, 0x001E78, 0x000055}, +{0x001E79, 0x001E79, 0x000075}, +{0x001E7A, 0x001E7A, 0x000055}, +{0x001E7B, 0x001E7B, 0x000075}, +{0x001E7C, 0x001E7C, 0x000056}, +{0x001E7D, 0x001E7D, 0x000076}, +{0x001E7E, 0x001E7E, 0x000056}, +{0x001E7F, 0x001E7F, 0x000076}, +{0x001E80, 0x001E80, 0x000057}, +{0x001E81, 0x001E81, 0x000077}, +{0x001E82, 0x001E82, 0x000057}, +{0x001E83, 0x001E83, 0x000077}, +{0x001E84, 0x001E84, 0x000057}, +{0x001E85, 0x001E85, 0x000077}, +{0x001E86, 0x001E86, 0x000057}, +{0x001E87, 0x001E87, 0x000077}, +{0x001E88, 0x001E88, 0x000057}, +{0x001E89, 0x001E89, 0x000077}, +{0x001E8A, 0x001E8A, 0x000058}, +{0x001E8B, 0x001E8B, 0x000078}, +{0x001E8C, 0x001E8C, 0x000058}, +{0x001E8D, 0x001E8D, 0x000078}, +{0x001E8E, 0x001E8E, 0x000059}, +{0x001E8F, 0x001E8F, 0x000079}, +{0x001E90, 0x001E90, 0x00005A}, +{0x001E91, 0x001E91, 0x00007A}, +{0x001E92, 0x001E92, 0x00005A}, +{0x001E93, 0x001E93, 0x00007A}, +{0x001E94, 0x001E94, 0x00005A}, +{0x001E95, 0x001E95, 0x00007A}, +{0x001E96, 0x001E96, 0x000068}, +{0x001E97, 0x001E97, 0x000074}, +{0x001E98, 0x001E98, 0x000077}, +{0x001E99, 0x001E99, 0x000079}, +{0x001E9B, 0x001E9B, 0x00017F}, +{0x001EA0, 0x001EA0, 0x000041}, +{0x001EA1, 0x001EA1, 0x000061}, +{0x001EA2, 0x001EA2, 0x000041}, +{0x001EA3, 0x001EA3, 0x000061}, +{0x001EA4, 0x001EA4, 0x000041}, +{0x001EA5, 0x001EA5, 0x000061}, +{0x001EA6, 0x001EA6, 0x000041}, +{0x001EA7, 0x001EA7, 0x000061}, +{0x001EA8, 0x001EA8, 0x000041}, +{0x001EA9, 0x001EA9, 0x000061}, +{0x001EAA, 0x001EAA, 0x000041}, +{0x001EAB, 0x001EAB, 0x000061}, +{0x001EAC, 0x001EAC, 0x000041}, +{0x001EAD, 0x001EAD, 0x000061}, +{0x001EAE, 0x001EAE, 0x000041}, +{0x001EAF, 0x001EAF, 0x000061}, +{0x001EB0, 0x001EB0, 0x000041}, +{0x001EB1, 0x001EB1, 0x000061}, +{0x001EB2, 0x001EB2, 0x000041}, +{0x001EB3, 0x001EB3, 0x000061}, 
+{0x001EB4, 0x001EB4, 0x000041}, +{0x001EB5, 0x001EB5, 0x000061}, +{0x001EB6, 0x001EB6, 0x000041}, +{0x001EB7, 0x001EB7, 0x000061}, +{0x001EB8, 0x001EB8, 0x000045}, +{0x001EB9, 0x001EB9, 0x000065}, +{0x001EBA, 0x001EBA, 0x000045}, +{0x001EBB, 0x001EBB, 0x000065}, +{0x001EBC, 0x001EBC, 0x000045}, +{0x001EBD, 0x001EBD, 0x000065}, +{0x001EBE, 0x001EBE, 0x000045}, +{0x001EBF, 0x001EBF, 0x000065}, +{0x001EC0, 0x001EC0, 0x000045}, +{0x001EC1, 0x001EC1, 0x000065}, +{0x001EC2, 0x001EC2, 0x000045}, +{0x001EC3, 0x001EC3, 0x000065}, +{0x001EC4, 0x001EC4, 0x000045}, +{0x001EC5, 0x001EC5, 0x000065}, +{0x001EC6, 0x001EC6, 0x000045}, +{0x001EC7, 0x001EC7, 0x000065}, +{0x001EC8, 0x001EC8, 0x000049}, +{0x001EC9, 0x001EC9, 0x000069}, +{0x001ECA, 0x001ECA, 0x000049}, +{0x001ECB, 0x001ECB, 0x000069}, +{0x001ECC, 0x001ECC, 0x00004F}, +{0x001ECD, 0x001ECD, 0x00006F}, +{0x001ECE, 0x001ECE, 0x00004F}, +{0x001ECF, 0x001ECF, 0x00006F}, +{0x001ED0, 0x001ED0, 0x00004F}, +{0x001ED1, 0x001ED1, 0x00006F}, +{0x001ED2, 0x001ED2, 0x00004F}, +{0x001ED3, 0x001ED3, 0x00006F}, +{0x001ED4, 0x001ED4, 0x00004F}, +{0x001ED5, 0x001ED5, 0x00006F}, +{0x001ED6, 0x001ED6, 0x00004F}, +{0x001ED7, 0x001ED7, 0x00006F}, +{0x001ED8, 0x001ED8, 0x00004F}, +{0x001ED9, 0x001ED9, 0x00006F}, +{0x001EDA, 0x001EDA, 0x00004F}, +{0x001EDB, 0x001EDB, 0x00006F}, +{0x001EDC, 0x001EDC, 0x00004F}, +{0x001EDD, 0x001EDD, 0x00006F}, +{0x001EDE, 0x001EDE, 0x00004F}, +{0x001EDF, 0x001EDF, 0x00006F}, +{0x001EE0, 0x001EE0, 0x00004F}, +{0x001EE1, 0x001EE1, 0x00006F}, +{0x001EE2, 0x001EE2, 0x00004F}, +{0x001EE3, 0x001EE3, 0x00006F}, +{0x001EE4, 0x001EE4, 0x000055}, +{0x001EE5, 0x001EE5, 0x000075}, +{0x001EE6, 0x001EE6, 0x000055}, +{0x001EE7, 0x001EE7, 0x000075}, +{0x001EE8, 0x001EE8, 0x000055}, +{0x001EE9, 0x001EE9, 0x000075}, +{0x001EEA, 0x001EEA, 0x000055}, +{0x001EEB, 0x001EEB, 0x000075}, +{0x001EEC, 0x001EEC, 0x000055}, +{0x001EED, 0x001EED, 0x000075}, +{0x001EEE, 0x001EEE, 0x000055}, +{0x001EEF, 0x001EEF, 0x000075}, +{0x001EF0, 
0x001EF0, 0x000055}, +{0x001EF1, 0x001EF1, 0x000075}, +{0x001EF2, 0x001EF2, 0x000059}, +{0x001EF3, 0x001EF3, 0x000079}, +{0x001EF4, 0x001EF4, 0x000059}, +{0x001EF5, 0x001EF5, 0x000079}, +{0x001EF6, 0x001EF6, 0x000059}, +{0x001EF7, 0x001EF7, 0x000079}, +{0x001EF8, 0x001EF8, 0x000059}, +{0x001EF9, 0x001EF9, 0x000079}, +{0x001F00, 0x001F07, 0x0003B1}, +{0x001F08, 0x001F0F, 0x000391}, +{0x001F10, 0x001F15, 0x0003B5}, +{0x001F18, 0x001F1D, 0x000395}, +{0x001F20, 0x001F27, 0x0003B7}, +{0x001F28, 0x001F2F, 0x000397}, +{0x001F30, 0x001F37, 0x0003B9}, +{0x001F38, 0x001F3F, 0x000399}, +{0x001F40, 0x001F45, 0x0003BF}, +{0x001F48, 0x001F4D, 0x00039F}, +{0x001F50, 0x001F57, 0x0003C5}, +{0x001F59, 0x001F59, 0x0003A5}, +{0x001F5B, 0x001F5B, 0x0003A5}, +{0x001F5D, 0x001F5D, 0x0003A5}, +{0x001F5F, 0x001F5F, 0x0003A5}, +{0x001F60, 0x001F67, 0x0003C9}, +{0x001F68, 0x001F6F, 0x0003A9}, +{0x001F70, 0x001F71, 0x0003B1}, +{0x001F72, 0x001F73, 0x0003B5}, +{0x001F74, 0x001F75, 0x0003B7}, +{0x001F76, 0x001F77, 0x0003B9}, +{0x001F78, 0x001F79, 0x0003BF}, +{0x001F7A, 0x001F7B, 0x0003C5}, +{0x001F7C, 0x001F7D, 0x0003C9}, +{0x001F80, 0x001F87, 0x0003B1}, +{0x001F88, 0x001F8F, 0x000391}, +{0x001F90, 0x001F97, 0x0003B7}, +{0x001F98, 0x001F9F, 0x000397}, +{0x001FA0, 0x001FA7, 0x0003C9}, +{0x001FA8, 0x001FAF, 0x0003A9}, +{0x001FB0, 0x001FB4, 0x0003B1}, +{0x001FB6, 0x001FB7, 0x0003B1}, +{0x001FB8, 0x001FBC, 0x000391}, +{0x001FBE, 0x001FBE, 0x0003B9}, +{0x001FC1, 0x001FC1, 0x0000A8}, +{0x001FC2, 0x001FC4, 0x0003B7}, +{0x001FC6, 0x001FC7, 0x0003B7}, +{0x001FC8, 0x001FC9, 0x000395}, +{0x001FCA, 0x001FCC, 0x000397}, +{0x001FCD, 0x001FCF, 0x001FBF}, +{0x001FD0, 0x001FD3, 0x0003B9}, +{0x001FD6, 0x001FD7, 0x0003B9}, +{0x001FD8, 0x001FDB, 0x000399}, +{0x001FDD, 0x001FDF, 0x001FFE}, +{0x001FE0, 0x001FE3, 0x0003C5}, +{0x001FE4, 0x001FE5, 0x0003C1}, +{0x001FE6, 0x001FE7, 0x0003C5}, +{0x001FE8, 0x001FEB, 0x0003A5}, +{0x001FEC, 0x001FEC, 0x0003A1}, +{0x001FED, 0x001FEE, 0x0000A8}, +{0x001FEF, 0x001FEF, 
0x000060}, +{0x001FF2, 0x001FF4, 0x0003C9}, +{0x001FF6, 0x001FF7, 0x0003C9}, +{0x001FF8, 0x001FF9, 0x00039F}, +{0x001FFA, 0x001FFC, 0x0003A9}, +{0x001FFD, 0x001FFD, 0x0000B4}, +{0x002000, 0x002000, 0x002002}, +{0x002001, 0x002001, 0x002003}, +{0x002126, 0x002126, 0x0003A9}, +{0x00212A, 0x00212A, 0x00004B}, +{0x00212B, 0x00212B, 0x000041}, +{0x00219A, 0x00219A, 0x002190}, +{0x00219B, 0x00219B, 0x002192}, +{0x0021AE, 0x0021AE, 0x002194}, +{0x0021CD, 0x0021CD, 0x0021D0}, +{0x0021CE, 0x0021CE, 0x0021D4}, +{0x0021CF, 0x0021CF, 0x0021D2}, +{0x002204, 0x002204, 0x002203}, +{0x002209, 0x002209, 0x002208}, +{0x00220C, 0x00220C, 0x00220B}, +{0x002224, 0x002224, 0x002223}, +{0x002226, 0x002226, 0x002225}, +{0x002241, 0x002241, 0x00223C}, +{0x002244, 0x002244, 0x002243}, +{0x002247, 0x002247, 0x002245}, +{0x002249, 0x002249, 0x002248}, +{0x002260, 0x002260, 0x00003D}, +{0x002262, 0x002262, 0x002261}, +{0x00226D, 0x00226D, 0x00224D}, +{0x00226E, 0x00226E, 0x00003C}, +{0x00226F, 0x00226F, 0x00003E}, +{0x002270, 0x002270, 0x002264}, +{0x002271, 0x002271, 0x002265}, +{0x002274, 0x002274, 0x002272}, +{0x002275, 0x002275, 0x002273}, +{0x002278, 0x002278, 0x002276}, +{0x002279, 0x002279, 0x002277}, +{0x002280, 0x002280, 0x00227A}, +{0x002281, 0x002281, 0x00227B}, +{0x002284, 0x002284, 0x002282}, +{0x002285, 0x002285, 0x002283}, +{0x002288, 0x002288, 0x002286}, +{0x002289, 0x002289, 0x002287}, +{0x0022AC, 0x0022AC, 0x0022A2}, +{0x0022AD, 0x0022AD, 0x0022A8}, +{0x0022AE, 0x0022AE, 0x0022A9}, +{0x0022AF, 0x0022AF, 0x0022AB}, +{0x0022E0, 0x0022E0, 0x00227C}, +{0x0022E1, 0x0022E1, 0x00227D}, +{0x0022E2, 0x0022E2, 0x002291}, +{0x0022E3, 0x0022E3, 0x002292}, +{0x0022EA, 0x0022EA, 0x0022B2}, +{0x0022EB, 0x0022EB, 0x0022B3}, +{0x0022EC, 0x0022EC, 0x0022B4}, +{0x0022ED, 0x0022ED, 0x0022B5}, +{0x002329, 0x002329, 0x003008}, +{0x00232A, 0x00232A, 0x003009}, +{0x002ADC, 0x002ADC, 0x002ADD}, +{0x00304C, 0x00304C, 0x00304B}, +{0x00304E, 0x00304E, 0x00304D}, +{0x003050, 0x003050, 0x00304F}, 
+{0x003052, 0x003052, 0x003051}, +{0x003054, 0x003054, 0x003053}, +{0x003056, 0x003056, 0x003055}, +{0x003058, 0x003058, 0x003057}, +{0x00305A, 0x00305A, 0x003059}, +{0x00305C, 0x00305C, 0x00305B}, +{0x00305E, 0x00305E, 0x00305D}, +{0x003060, 0x003060, 0x00305F}, +{0x003062, 0x003062, 0x003061}, +{0x003065, 0x003065, 0x003064}, +{0x003067, 0x003067, 0x003066}, +{0x003069, 0x003069, 0x003068}, +{0x003070, 0x003071, 0x00306F}, +{0x003073, 0x003074, 0x003072}, +{0x003076, 0x003077, 0x003075}, +{0x003079, 0x00307A, 0x003078}, +{0x00307C, 0x00307D, 0x00307B}, +{0x003094, 0x003094, 0x003046}, +{0x00309E, 0x00309E, 0x00309D}, +{0x0030AC, 0x0030AC, 0x0030AB}, +{0x0030AE, 0x0030AE, 0x0030AD}, +{0x0030B0, 0x0030B0, 0x0030AF}, +{0x0030B2, 0x0030B2, 0x0030B1}, +{0x0030B4, 0x0030B4, 0x0030B3}, +{0x0030B6, 0x0030B6, 0x0030B5}, +{0x0030B8, 0x0030B8, 0x0030B7}, +{0x0030BA, 0x0030BA, 0x0030B9}, +{0x0030BC, 0x0030BC, 0x0030BB}, +{0x0030BE, 0x0030BE, 0x0030BD}, +{0x0030C0, 0x0030C0, 0x0030BF}, +{0x0030C2, 0x0030C2, 0x0030C1}, +{0x0030C5, 0x0030C5, 0x0030C4}, +{0x0030C7, 0x0030C7, 0x0030C6}, +{0x0030C9, 0x0030C9, 0x0030C8}, +{0x0030D0, 0x0030D1, 0x0030CF}, +{0x0030D3, 0x0030D4, 0x0030D2}, +{0x0030D6, 0x0030D7, 0x0030D5}, +{0x0030D9, 0x0030DA, 0x0030D8}, +{0x0030DC, 0x0030DD, 0x0030DB}, +{0x0030F4, 0x0030F4, 0x0030A6}, +{0x0030F7, 0x0030F7, 0x0030EF}, +{0x0030F8, 0x0030F8, 0x0030F0}, +{0x0030F9, 0x0030F9, 0x0030F1}, +{0x0030FA, 0x0030FA, 0x0030F2}, +{0x0030FE, 0x0030FE, 0x0030FD}, +{0x00AC00, 0x00AE4B, 0x001100}, +{0x00AE4C, 0x00B097, 0x001101}, +{0x00B098, 0x00B2E3, 0x001102}, +{0x00B2E4, 0x00B52F, 0x001103}, +{0x00B530, 0x00B77B, 0x001104}, +{0x00B77C, 0x00B9C7, 0x001105}, +{0x00B9C8, 0x00BC13, 0x001106}, +{0x00BC14, 0x00BE5F, 0x001107}, +{0x00BE60, 0x00C0AB, 0x001108}, +{0x00C0AC, 0x00C2F7, 0x001109}, +{0x00C2F8, 0x00C543, 0x00110A}, +{0x00C544, 0x00C78F, 0x00110B}, +{0x00C790, 0x00C9DB, 0x00110C}, +{0x00C9DC, 0x00CC27, 0x00110D}, +{0x00CC28, 0x00CE73, 0x00110E}, +{0x00CE74, 
0x00D0BF, 0x00110F}, +{0x00D0C0, 0x00D30B, 0x001110}, +{0x00D30C, 0x00D557, 0x001111}, +{0x00D558, 0x00D7A3, 0x001112}, +{0x00F900, 0x00F900, 0x008C48}, +{0x00F901, 0x00F901, 0x0066F4}, +{0x00F902, 0x00F902, 0x008ECA}, +{0x00F903, 0x00F903, 0x008CC8}, +{0x00F904, 0x00F904, 0x006ED1}, +{0x00F905, 0x00F905, 0x004E32}, +{0x00F906, 0x00F906, 0x0053E5}, +{0x00F907, 0x00F908, 0x009F9C}, +{0x00F909, 0x00F909, 0x005951}, +{0x00F90A, 0x00F90A, 0x0091D1}, +{0x00F90B, 0x00F90B, 0x005587}, +{0x00F90C, 0x00F90C, 0x005948}, +{0x00F90D, 0x00F90D, 0x0061F6}, +{0x00F90E, 0x00F90E, 0x007669}, +{0x00F90F, 0x00F90F, 0x007F85}, +{0x00F910, 0x00F910, 0x00863F}, +{0x00F911, 0x00F911, 0x0087BA}, +{0x00F912, 0x00F912, 0x0088F8}, +{0x00F913, 0x00F913, 0x00908F}, +{0x00F914, 0x00F914, 0x006A02}, +{0x00F915, 0x00F915, 0x006D1B}, +{0x00F916, 0x00F916, 0x0070D9}, +{0x00F917, 0x00F917, 0x0073DE}, +{0x00F918, 0x00F918, 0x00843D}, +{0x00F919, 0x00F919, 0x00916A}, +{0x00F91A, 0x00F91A, 0x0099F1}, +{0x00F91B, 0x00F91B, 0x004E82}, +{0x00F91C, 0x00F91C, 0x005375}, +{0x00F91D, 0x00F91D, 0x006B04}, +{0x00F91E, 0x00F91E, 0x00721B}, +{0x00F91F, 0x00F91F, 0x00862D}, +{0x00F920, 0x00F920, 0x009E1E}, +{0x00F921, 0x00F921, 0x005D50}, +{0x00F922, 0x00F922, 0x006FEB}, +{0x00F923, 0x00F923, 0x0085CD}, +{0x00F924, 0x00F924, 0x008964}, +{0x00F925, 0x00F925, 0x0062C9}, +{0x00F926, 0x00F926, 0x0081D8}, +{0x00F927, 0x00F927, 0x00881F}, +{0x00F928, 0x00F928, 0x005ECA}, +{0x00F929, 0x00F929, 0x006717}, +{0x00F92A, 0x00F92A, 0x006D6A}, +{0x00F92B, 0x00F92B, 0x0072FC}, +{0x00F92C, 0x00F92C, 0x0090CE}, +{0x00F92D, 0x00F92D, 0x004F86}, +{0x00F92E, 0x00F92E, 0x0051B7}, +{0x00F92F, 0x00F92F, 0x0052DE}, +{0x00F930, 0x00F930, 0x0064C4}, +{0x00F931, 0x00F931, 0x006AD3}, +{0x00F932, 0x00F932, 0x007210}, +{0x00F933, 0x00F933, 0x0076E7}, +{0x00F934, 0x00F934, 0x008001}, +{0x00F935, 0x00F935, 0x008606}, +{0x00F936, 0x00F936, 0x00865C}, +{0x00F937, 0x00F937, 0x008DEF}, +{0x00F938, 0x00F938, 0x009732}, +{0x00F939, 0x00F939, 
0x009B6F}, +{0x00F93A, 0x00F93A, 0x009DFA}, +{0x00F93B, 0x00F93B, 0x00788C}, +{0x00F93C, 0x00F93C, 0x00797F}, +{0x00F93D, 0x00F93D, 0x007DA0}, +{0x00F93E, 0x00F93E, 0x0083C9}, +{0x00F93F, 0x00F93F, 0x009304}, +{0x00F940, 0x00F940, 0x009E7F}, +{0x00F941, 0x00F941, 0x008AD6}, +{0x00F942, 0x00F942, 0x0058DF}, +{0x00F943, 0x00F943, 0x005F04}, +{0x00F944, 0x00F944, 0x007C60}, +{0x00F945, 0x00F945, 0x00807E}, +{0x00F946, 0x00F946, 0x007262}, +{0x00F947, 0x00F947, 0x0078CA}, +{0x00F948, 0x00F948, 0x008CC2}, +{0x00F949, 0x00F949, 0x0096F7}, +{0x00F94A, 0x00F94A, 0x0058D8}, +{0x00F94B, 0x00F94B, 0x005C62}, +{0x00F94C, 0x00F94C, 0x006A13}, +{0x00F94D, 0x00F94D, 0x006DDA}, +{0x00F94E, 0x00F94E, 0x006F0F}, +{0x00F94F, 0x00F94F, 0x007D2F}, +{0x00F950, 0x00F950, 0x007E37}, +{0x00F951, 0x00F951, 0x00964B}, +{0x00F952, 0x00F952, 0x0052D2}, +{0x00F953, 0x00F953, 0x00808B}, +{0x00F954, 0x00F954, 0x0051DC}, +{0x00F955, 0x00F955, 0x0051CC}, +{0x00F956, 0x00F956, 0x007A1C}, +{0x00F957, 0x00F957, 0x007DBE}, +{0x00F958, 0x00F958, 0x0083F1}, +{0x00F959, 0x00F959, 0x009675}, +{0x00F95A, 0x00F95A, 0x008B80}, +{0x00F95B, 0x00F95B, 0x0062CF}, +{0x00F95C, 0x00F95C, 0x006A02}, +{0x00F95D, 0x00F95D, 0x008AFE}, +{0x00F95E, 0x00F95E, 0x004E39}, +{0x00F95F, 0x00F95F, 0x005BE7}, +{0x00F960, 0x00F960, 0x006012}, +{0x00F961, 0x00F961, 0x007387}, +{0x00F962, 0x00F962, 0x007570}, +{0x00F963, 0x00F963, 0x005317}, +{0x00F964, 0x00F964, 0x0078FB}, +{0x00F965, 0x00F965, 0x004FBF}, +{0x00F966, 0x00F966, 0x005FA9}, +{0x00F967, 0x00F967, 0x004E0D}, +{0x00F968, 0x00F968, 0x006CCC}, +{0x00F969, 0x00F969, 0x006578}, +{0x00F96A, 0x00F96A, 0x007D22}, +{0x00F96B, 0x00F96B, 0x0053C3}, +{0x00F96C, 0x00F96C, 0x00585E}, +{0x00F96D, 0x00F96D, 0x007701}, +{0x00F96E, 0x00F96E, 0x008449}, +{0x00F96F, 0x00F96F, 0x008AAA}, +{0x00F970, 0x00F970, 0x006BBA}, +{0x00F971, 0x00F971, 0x008FB0}, +{0x00F972, 0x00F972, 0x006C88}, +{0x00F973, 0x00F973, 0x0062FE}, +{0x00F974, 0x00F974, 0x0082E5}, +{0x00F975, 0x00F975, 0x0063A0}, 
+{0x00F976, 0x00F976, 0x007565}, +{0x00F977, 0x00F977, 0x004EAE}, +{0x00F978, 0x00F978, 0x005169}, +{0x00F979, 0x00F979, 0x0051C9}, +{0x00F97A, 0x00F97A, 0x006881}, +{0x00F97B, 0x00F97B, 0x007CE7}, +{0x00F97C, 0x00F97C, 0x00826F}, +{0x00F97D, 0x00F97D, 0x008AD2}, +{0x00F97E, 0x00F97E, 0x0091CF}, +{0x00F97F, 0x00F97F, 0x0052F5}, +{0x00F980, 0x00F980, 0x005442}, +{0x00F981, 0x00F981, 0x005973}, +{0x00F982, 0x00F982, 0x005EEC}, +{0x00F983, 0x00F983, 0x0065C5}, +{0x00F984, 0x00F984, 0x006FFE}, +{0x00F985, 0x00F985, 0x00792A}, +{0x00F986, 0x00F986, 0x0095AD}, +{0x00F987, 0x00F987, 0x009A6A}, +{0x00F988, 0x00F988, 0x009E97}, +{0x00F989, 0x00F989, 0x009ECE}, +{0x00F98A, 0x00F98A, 0x00529B}, +{0x00F98B, 0x00F98B, 0x0066C6}, +{0x00F98C, 0x00F98C, 0x006B77}, +{0x00F98D, 0x00F98D, 0x008F62}, +{0x00F98E, 0x00F98E, 0x005E74}, +{0x00F98F, 0x00F98F, 0x006190}, +{0x00F990, 0x00F990, 0x006200}, +{0x00F991, 0x00F991, 0x00649A}, +{0x00F992, 0x00F992, 0x006F23}, +{0x00F993, 0x00F993, 0x007149}, +{0x00F994, 0x00F994, 0x007489}, +{0x00F995, 0x00F995, 0x0079CA}, +{0x00F996, 0x00F996, 0x007DF4}, +{0x00F997, 0x00F997, 0x00806F}, +{0x00F998, 0x00F998, 0x008F26}, +{0x00F999, 0x00F999, 0x0084EE}, +{0x00F99A, 0x00F99A, 0x009023}, +{0x00F99B, 0x00F99B, 0x00934A}, +{0x00F99C, 0x00F99C, 0x005217}, +{0x00F99D, 0x00F99D, 0x0052A3}, +{0x00F99E, 0x00F99E, 0x0054BD}, +{0x00F99F, 0x00F99F, 0x0070C8}, +{0x00F9A0, 0x00F9A0, 0x0088C2}, +{0x00F9A1, 0x00F9A1, 0x008AAA}, +{0x00F9A2, 0x00F9A2, 0x005EC9}, +{0x00F9A3, 0x00F9A3, 0x005FF5}, +{0x00F9A4, 0x00F9A4, 0x00637B}, +{0x00F9A5, 0x00F9A5, 0x006BAE}, +{0x00F9A6, 0x00F9A6, 0x007C3E}, +{0x00F9A7, 0x00F9A7, 0x007375}, +{0x00F9A8, 0x00F9A8, 0x004EE4}, +{0x00F9A9, 0x00F9A9, 0x0056F9}, +{0x00F9AA, 0x00F9AA, 0x005BE7}, +{0x00F9AB, 0x00F9AB, 0x005DBA}, +{0x00F9AC, 0x00F9AC, 0x00601C}, +{0x00F9AD, 0x00F9AD, 0x0073B2}, +{0x00F9AE, 0x00F9AE, 0x007469}, +{0x00F9AF, 0x00F9AF, 0x007F9A}, +{0x00F9B0, 0x00F9B0, 0x008046}, +{0x00F9B1, 0x00F9B1, 0x009234}, +{0x00F9B2, 
0x00F9B2, 0x0096F6}, +{0x00F9B3, 0x00F9B3, 0x009748}, +{0x00F9B4, 0x00F9B4, 0x009818}, +{0x00F9B5, 0x00F9B5, 0x004F8B}, +{0x00F9B6, 0x00F9B6, 0x0079AE}, +{0x00F9B7, 0x00F9B7, 0x0091B4}, +{0x00F9B8, 0x00F9B8, 0x0096B8}, +{0x00F9B9, 0x00F9B9, 0x0060E1}, +{0x00F9BA, 0x00F9BA, 0x004E86}, +{0x00F9BB, 0x00F9BB, 0x0050DA}, +{0x00F9BC, 0x00F9BC, 0x005BEE}, +{0x00F9BD, 0x00F9BD, 0x005C3F}, +{0x00F9BE, 0x00F9BE, 0x006599}, +{0x00F9BF, 0x00F9BF, 0x006A02}, +{0x00F9C0, 0x00F9C0, 0x0071CE}, +{0x00F9C1, 0x00F9C1, 0x007642}, +{0x00F9C2, 0x00F9C2, 0x0084FC}, +{0x00F9C3, 0x00F9C3, 0x00907C}, +{0x00F9C4, 0x00F9C4, 0x009F8D}, +{0x00F9C5, 0x00F9C5, 0x006688}, +{0x00F9C6, 0x00F9C6, 0x00962E}, +{0x00F9C7, 0x00F9C7, 0x005289}, +{0x00F9C8, 0x00F9C8, 0x00677B}, +{0x00F9C9, 0x00F9C9, 0x0067F3}, +{0x00F9CA, 0x00F9CA, 0x006D41}, +{0x00F9CB, 0x00F9CB, 0x006E9C}, +{0x00F9CC, 0x00F9CC, 0x007409}, +{0x00F9CD, 0x00F9CD, 0x007559}, +{0x00F9CE, 0x00F9CE, 0x00786B}, +{0x00F9CF, 0x00F9CF, 0x007D10}, +{0x00F9D0, 0x00F9D0, 0x00985E}, +{0x00F9D1, 0x00F9D1, 0x00516D}, +{0x00F9D2, 0x00F9D2, 0x00622E}, +{0x00F9D3, 0x00F9D3, 0x009678}, +{0x00F9D4, 0x00F9D4, 0x00502B}, +{0x00F9D5, 0x00F9D5, 0x005D19}, +{0x00F9D6, 0x00F9D6, 0x006DEA}, +{0x00F9D7, 0x00F9D7, 0x008F2A}, +{0x00F9D8, 0x00F9D8, 0x005F8B}, +{0x00F9D9, 0x00F9D9, 0x006144}, +{0x00F9DA, 0x00F9DA, 0x006817}, +{0x00F9DB, 0x00F9DB, 0x007387}, +{0x00F9DC, 0x00F9DC, 0x009686}, +{0x00F9DD, 0x00F9DD, 0x005229}, +{0x00F9DE, 0x00F9DE, 0x00540F}, +{0x00F9DF, 0x00F9DF, 0x005C65}, +{0x00F9E0, 0x00F9E0, 0x006613}, +{0x00F9E1, 0x00F9E1, 0x00674E}, +{0x00F9E2, 0x00F9E2, 0x0068A8}, +{0x00F9E3, 0x00F9E3, 0x006CE5}, +{0x00F9E4, 0x00F9E4, 0x007406}, +{0x00F9E5, 0x00F9E5, 0x0075E2}, +{0x00F9E6, 0x00F9E6, 0x007F79}, +{0x00F9E7, 0x00F9E7, 0x0088CF}, +{0x00F9E8, 0x00F9E8, 0x0088E1}, +{0x00F9E9, 0x00F9E9, 0x0091CC}, +{0x00F9EA, 0x00F9EA, 0x0096E2}, +{0x00F9EB, 0x00F9EB, 0x00533F}, +{0x00F9EC, 0x00F9EC, 0x006EBA}, +{0x00F9ED, 0x00F9ED, 0x00541D}, +{0x00F9EE, 0x00F9EE, 
0x0071D0}, +{0x00F9EF, 0x00F9EF, 0x007498}, +{0x00F9F0, 0x00F9F0, 0x0085FA}, +{0x00F9F1, 0x00F9F1, 0x0096A3}, +{0x00F9F2, 0x00F9F2, 0x009C57}, +{0x00F9F3, 0x00F9F3, 0x009E9F}, +{0x00F9F4, 0x00F9F4, 0x006797}, +{0x00F9F5, 0x00F9F5, 0x006DCB}, +{0x00F9F6, 0x00F9F6, 0x0081E8}, +{0x00F9F7, 0x00F9F7, 0x007ACB}, +{0x00F9F8, 0x00F9F8, 0x007B20}, +{0x00F9F9, 0x00F9F9, 0x007C92}, +{0x00F9FA, 0x00F9FA, 0x0072C0}, +{0x00F9FB, 0x00F9FB, 0x007099}, +{0x00F9FC, 0x00F9FC, 0x008B58}, +{0x00F9FD, 0x00F9FD, 0x004EC0}, +{0x00F9FE, 0x00F9FE, 0x008336}, +{0x00F9FF, 0x00F9FF, 0x00523A}, +{0x00FA00, 0x00FA00, 0x005207}, +{0x00FA01, 0x00FA01, 0x005EA6}, +{0x00FA02, 0x00FA02, 0x0062D3}, +{0x00FA03, 0x00FA03, 0x007CD6}, +{0x00FA04, 0x00FA04, 0x005B85}, +{0x00FA05, 0x00FA05, 0x006D1E}, +{0x00FA06, 0x00FA06, 0x0066B4}, +{0x00FA07, 0x00FA07, 0x008F3B}, +{0x00FA08, 0x00FA08, 0x00884C}, +{0x00FA09, 0x00FA09, 0x00964D}, +{0x00FA0A, 0x00FA0A, 0x00898B}, +{0x00FA0B, 0x00FA0B, 0x005ED3}, +{0x00FA0C, 0x00FA0C, 0x005140}, +{0x00FA0D, 0x00FA0D, 0x0055C0}, +{0x00FA10, 0x00FA10, 0x00585A}, +{0x00FA12, 0x00FA12, 0x006674}, +{0x00FA15, 0x00FA15, 0x0051DE}, +{0x00FA16, 0x00FA16, 0x00732A}, +{0x00FA17, 0x00FA17, 0x0076CA}, +{0x00FA18, 0x00FA18, 0x00793C}, +{0x00FA19, 0x00FA19, 0x00795E}, +{0x00FA1A, 0x00FA1A, 0x007965}, +{0x00FA1B, 0x00FA1B, 0x00798F}, +{0x00FA1C, 0x00FA1C, 0x009756}, +{0x00FA1D, 0x00FA1D, 0x007CBE}, +{0x00FA1E, 0x00FA1E, 0x007FBD}, +{0x00FA20, 0x00FA20, 0x008612}, +{0x00FA22, 0x00FA22, 0x008AF8}, +{0x00FA25, 0x00FA25, 0x009038}, +{0x00FA26, 0x00FA26, 0x0090FD}, +{0x00FA2A, 0x00FA2A, 0x0098EF}, +{0x00FA2B, 0x00FA2B, 0x0098FC}, +{0x00FA2C, 0x00FA2C, 0x009928}, +{0x00FA2D, 0x00FA2D, 0x009DB4}, +{0x00FA2E, 0x00FA2E, 0x0090DE}, +{0x00FA2F, 0x00FA2F, 0x0096B7}, +{0x00FA30, 0x00FA30, 0x004FAE}, +{0x00FA31, 0x00FA31, 0x0050E7}, +{0x00FA32, 0x00FA32, 0x00514D}, +{0x00FA33, 0x00FA33, 0x0052C9}, +{0x00FA34, 0x00FA34, 0x0052E4}, +{0x00FA35, 0x00FA35, 0x005351}, +{0x00FA36, 0x00FA36, 0x00559D}, 
+{0x00FA37, 0x00FA37, 0x005606}, +{0x00FA38, 0x00FA38, 0x005668}, +{0x00FA39, 0x00FA39, 0x005840}, +{0x00FA3A, 0x00FA3A, 0x0058A8}, +{0x00FA3B, 0x00FA3B, 0x005C64}, +{0x00FA3C, 0x00FA3C, 0x005C6E}, +{0x00FA3D, 0x00FA3D, 0x006094}, +{0x00FA3E, 0x00FA3E, 0x006168}, +{0x00FA3F, 0x00FA3F, 0x00618E}, +{0x00FA40, 0x00FA40, 0x0061F2}, +{0x00FA41, 0x00FA41, 0x00654F}, +{0x00FA42, 0x00FA42, 0x0065E2}, +{0x00FA43, 0x00FA43, 0x006691}, +{0x00FA44, 0x00FA44, 0x006885}, +{0x00FA45, 0x00FA45, 0x006D77}, +{0x00FA46, 0x00FA46, 0x006E1A}, +{0x00FA47, 0x00FA47, 0x006F22}, +{0x00FA48, 0x00FA48, 0x00716E}, +{0x00FA49, 0x00FA49, 0x00722B}, +{0x00FA4A, 0x00FA4A, 0x007422}, +{0x00FA4B, 0x00FA4B, 0x007891}, +{0x00FA4C, 0x00FA4C, 0x00793E}, +{0x00FA4D, 0x00FA4D, 0x007949}, +{0x00FA4E, 0x00FA4E, 0x007948}, +{0x00FA4F, 0x00FA4F, 0x007950}, +{0x00FA50, 0x00FA50, 0x007956}, +{0x00FA51, 0x00FA51, 0x00795D}, +{0x00FA52, 0x00FA52, 0x00798D}, +{0x00FA53, 0x00FA53, 0x00798E}, +{0x00FA54, 0x00FA54, 0x007A40}, +{0x00FA55, 0x00FA55, 0x007A81}, +{0x00FA56, 0x00FA56, 0x007BC0}, +{0x00FA57, 0x00FA57, 0x007DF4}, +{0x00FA58, 0x00FA58, 0x007E09}, +{0x00FA59, 0x00FA59, 0x007E41}, +{0x00FA5A, 0x00FA5A, 0x007F72}, +{0x00FA5B, 0x00FA5B, 0x008005}, +{0x00FA5C, 0x00FA5C, 0x0081ED}, +{0x00FA5D, 0x00FA5E, 0x008279}, +{0x00FA5F, 0x00FA5F, 0x008457}, +{0x00FA60, 0x00FA60, 0x008910}, +{0x00FA61, 0x00FA61, 0x008996}, +{0x00FA62, 0x00FA62, 0x008B01}, +{0x00FA63, 0x00FA63, 0x008B39}, +{0x00FA64, 0x00FA64, 0x008CD3}, +{0x00FA65, 0x00FA65, 0x008D08}, +{0x00FA66, 0x00FA66, 0x008FB6}, +{0x00FA67, 0x00FA67, 0x009038}, +{0x00FA68, 0x00FA68, 0x0096E3}, +{0x00FA69, 0x00FA69, 0x0097FF}, +{0x00FA6A, 0x00FA6A, 0x00983B}, +{0x00FA6B, 0x00FA6B, 0x006075}, +{0x00FA6C, 0x00FA6C, 0x0242EE}, +{0x00FA6D, 0x00FA6D, 0x008218}, +{0x00FA70, 0x00FA70, 0x004E26}, +{0x00FA71, 0x00FA71, 0x0051B5}, +{0x00FA72, 0x00FA72, 0x005168}, +{0x00FA73, 0x00FA73, 0x004F80}, +{0x00FA74, 0x00FA74, 0x005145}, +{0x00FA75, 0x00FA75, 0x005180}, +{0x00FA76, 
0x00FA76, 0x0052C7}, +{0x00FA77, 0x00FA77, 0x0052FA}, +{0x00FA78, 0x00FA78, 0x00559D}, +{0x00FA79, 0x00FA79, 0x005555}, +{0x00FA7A, 0x00FA7A, 0x005599}, +{0x00FA7B, 0x00FA7B, 0x0055E2}, +{0x00FA7C, 0x00FA7C, 0x00585A}, +{0x00FA7D, 0x00FA7D, 0x0058B3}, +{0x00FA7E, 0x00FA7E, 0x005944}, +{0x00FA7F, 0x00FA7F, 0x005954}, +{0x00FA80, 0x00FA80, 0x005A62}, +{0x00FA81, 0x00FA81, 0x005B28}, +{0x00FA82, 0x00FA82, 0x005ED2}, +{0x00FA83, 0x00FA83, 0x005ED9}, +{0x00FA84, 0x00FA84, 0x005F69}, +{0x00FA85, 0x00FA85, 0x005FAD}, +{0x00FA86, 0x00FA86, 0x0060D8}, +{0x00FA87, 0x00FA87, 0x00614E}, +{0x00FA88, 0x00FA88, 0x006108}, +{0x00FA89, 0x00FA89, 0x00618E}, +{0x00FA8A, 0x00FA8A, 0x006160}, +{0x00FA8B, 0x00FA8B, 0x0061F2}, +{0x00FA8C, 0x00FA8C, 0x006234}, +{0x00FA8D, 0x00FA8D, 0x0063C4}, +{0x00FA8E, 0x00FA8E, 0x00641C}, +{0x00FA8F, 0x00FA8F, 0x006452}, +{0x00FA90, 0x00FA90, 0x006556}, +{0x00FA91, 0x00FA91, 0x006674}, +{0x00FA92, 0x00FA92, 0x006717}, +{0x00FA93, 0x00FA93, 0x00671B}, +{0x00FA94, 0x00FA94, 0x006756}, +{0x00FA95, 0x00FA95, 0x006B79}, +{0x00FA96, 0x00FA96, 0x006BBA}, +{0x00FA97, 0x00FA97, 0x006D41}, +{0x00FA98, 0x00FA98, 0x006EDB}, +{0x00FA99, 0x00FA99, 0x006ECB}, +{0x00FA9A, 0x00FA9A, 0x006F22}, +{0x00FA9B, 0x00FA9B, 0x00701E}, +{0x00FA9C, 0x00FA9C, 0x00716E}, +{0x00FA9D, 0x00FA9D, 0x0077A7}, +{0x00FA9E, 0x00FA9E, 0x007235}, +{0x00FA9F, 0x00FA9F, 0x0072AF}, +{0x00FAA0, 0x00FAA0, 0x00732A}, +{0x00FAA1, 0x00FAA1, 0x007471}, +{0x00FAA2, 0x00FAA2, 0x007506}, +{0x00FAA3, 0x00FAA3, 0x00753B}, +{0x00FAA4, 0x00FAA4, 0x00761D}, +{0x00FAA5, 0x00FAA5, 0x00761F}, +{0x00FAA6, 0x00FAA6, 0x0076CA}, +{0x00FAA7, 0x00FAA7, 0x0076DB}, +{0x00FAA8, 0x00FAA8, 0x0076F4}, +{0x00FAA9, 0x00FAA9, 0x00774A}, +{0x00FAAA, 0x00FAAA, 0x007740}, +{0x00FAAB, 0x00FAAB, 0x0078CC}, +{0x00FAAC, 0x00FAAC, 0x007AB1}, +{0x00FAAD, 0x00FAAD, 0x007BC0}, +{0x00FAAE, 0x00FAAE, 0x007C7B}, +{0x00FAAF, 0x00FAAF, 0x007D5B}, +{0x00FAB0, 0x00FAB0, 0x007DF4}, +{0x00FAB1, 0x00FAB1, 0x007F3E}, +{0x00FAB2, 0x00FAB2, 
0x008005}, +{0x00FAB3, 0x00FAB3, 0x008352}, +{0x00FAB4, 0x00FAB4, 0x0083EF}, +{0x00FAB5, 0x00FAB5, 0x008779}, +{0x00FAB6, 0x00FAB6, 0x008941}, +{0x00FAB7, 0x00FAB7, 0x008986}, +{0x00FAB8, 0x00FAB8, 0x008996}, +{0x00FAB9, 0x00FAB9, 0x008ABF}, +{0x00FABA, 0x00FABA, 0x008AF8}, +{0x00FABB, 0x00FABB, 0x008ACB}, +{0x00FABC, 0x00FABC, 0x008B01}, +{0x00FABD, 0x00FABD, 0x008AFE}, +{0x00FABE, 0x00FABE, 0x008AED}, +{0x00FABF, 0x00FABF, 0x008B39}, +{0x00FAC0, 0x00FAC0, 0x008B8A}, +{0x00FAC1, 0x00FAC1, 0x008D08}, +{0x00FAC2, 0x00FAC2, 0x008F38}, +{0x00FAC3, 0x00FAC3, 0x009072}, +{0x00FAC4, 0x00FAC4, 0x009199}, +{0x00FAC5, 0x00FAC5, 0x009276}, +{0x00FAC6, 0x00FAC6, 0x00967C}, +{0x00FAC7, 0x00FAC7, 0x0096E3}, +{0x00FAC8, 0x00FAC8, 0x009756}, +{0x00FAC9, 0x00FAC9, 0x0097DB}, +{0x00FACA, 0x00FACA, 0x0097FF}, +{0x00FACB, 0x00FACB, 0x00980B}, +{0x00FACC, 0x00FACC, 0x00983B}, +{0x00FACD, 0x00FACD, 0x009B12}, +{0x00FACE, 0x00FACE, 0x009F9C}, +{0x00FACF, 0x00FACF, 0x02284A}, +{0x00FAD0, 0x00FAD0, 0x022844}, +{0x00FAD1, 0x00FAD1, 0x0233D5}, +{0x00FAD2, 0x00FAD2, 0x003B9D}, +{0x00FAD3, 0x00FAD3, 0x004018}, +{0x00FAD4, 0x00FAD4, 0x004039}, +{0x00FAD5, 0x00FAD5, 0x025249}, +{0x00FAD6, 0x00FAD6, 0x025CD0}, +{0x00FAD7, 0x00FAD7, 0x027ED3}, +{0x00FAD8, 0x00FAD8, 0x009F43}, +{0x00FAD9, 0x00FAD9, 0x009F8E}, +{0x00FB1D, 0x00FB1D, 0x0005D9}, +{0x00FB1F, 0x00FB1F, 0x0005F2}, +{0x00FB2A, 0x00FB2D, 0x0005E9}, +{0x00FB2E, 0x00FB30, 0x0005D0}, +{0x00FB31, 0x00FB31, 0x0005D1}, +{0x00FB32, 0x00FB32, 0x0005D2}, +{0x00FB33, 0x00FB33, 0x0005D3}, +{0x00FB34, 0x00FB34, 0x0005D4}, +{0x00FB35, 0x00FB35, 0x0005D5}, +{0x00FB36, 0x00FB36, 0x0005D6}, +{0x00FB38, 0x00FB38, 0x0005D8}, +{0x00FB39, 0x00FB39, 0x0005D9}, +{0x00FB3A, 0x00FB3A, 0x0005DA}, +{0x00FB3B, 0x00FB3B, 0x0005DB}, +{0x00FB3C, 0x00FB3C, 0x0005DC}, +{0x00FB3E, 0x00FB3E, 0x0005DE}, +{0x00FB40, 0x00FB40, 0x0005E0}, +{0x00FB41, 0x00FB41, 0x0005E1}, +{0x00FB43, 0x00FB43, 0x0005E3}, +{0x00FB44, 0x00FB44, 0x0005E4}, +{0x00FB46, 0x00FB46, 0x0005E6}, 
+{0x00FB47, 0x00FB47, 0x0005E7}, +{0x00FB48, 0x00FB48, 0x0005E8}, +{0x00FB49, 0x00FB49, 0x0005E9}, +{0x00FB4A, 0x00FB4A, 0x0005EA}, +{0x00FB4B, 0x00FB4B, 0x0005D5}, +{0x00FB4C, 0x00FB4C, 0x0005D1}, +{0x00FB4D, 0x00FB4D, 0x0005DB}, +{0x00FB4E, 0x00FB4E, 0x0005E4}, +{0x01109A, 0x01109A, 0x011099}, +{0x01109C, 0x01109C, 0x01109B}, +{0x0110AB, 0x0110AB, 0x0110A5}, +{0x01112E, 0x01112E, 0x011131}, +{0x01112F, 0x01112F, 0x011132}, +{0x01134B, 0x01134C, 0x011347}, +{0x0114BB, 0x0114BC, 0x0114B9}, +{0x0114BE, 0x0114BE, 0x0114B9}, +{0x0115BA, 0x0115BA, 0x0115B8}, +{0x0115BB, 0x0115BB, 0x0115B9}, +{0x011938, 0x011938, 0x011935}, +{0x01D15E, 0x01D15E, 0x01D157}, +{0x01D15F, 0x01D164, 0x01D158}, +{0x01D1BB, 0x01D1BB, 0x01D1B9}, +{0x01D1BC, 0x01D1BC, 0x01D1BA}, +{0x01D1BD, 0x01D1BD, 0x01D1B9}, +{0x01D1BE, 0x01D1BE, 0x01D1BA}, +{0x01D1BF, 0x01D1BF, 0x01D1B9}, +{0x01D1C0, 0x01D1C0, 0x01D1BA}, +{0x02F800, 0x02F800, 0x004E3D}, +{0x02F801, 0x02F801, 0x004E38}, +{0x02F802, 0x02F802, 0x004E41}, +{0x02F803, 0x02F803, 0x020122}, +{0x02F804, 0x02F804, 0x004F60}, +{0x02F805, 0x02F805, 0x004FAE}, +{0x02F806, 0x02F806, 0x004FBB}, +{0x02F807, 0x02F807, 0x005002}, +{0x02F808, 0x02F808, 0x00507A}, +{0x02F809, 0x02F809, 0x005099}, +{0x02F80A, 0x02F80A, 0x0050E7}, +{0x02F80B, 0x02F80B, 0x0050CF}, +{0x02F80C, 0x02F80C, 0x00349E}, +{0x02F80D, 0x02F80D, 0x02063A}, +{0x02F80E, 0x02F80E, 0x00514D}, +{0x02F80F, 0x02F80F, 0x005154}, +{0x02F810, 0x02F810, 0x005164}, +{0x02F811, 0x02F811, 0x005177}, +{0x02F812, 0x02F812, 0x02051C}, +{0x02F813, 0x02F813, 0x0034B9}, +{0x02F814, 0x02F814, 0x005167}, +{0x02F815, 0x02F815, 0x00518D}, +{0x02F816, 0x02F816, 0x02054B}, +{0x02F817, 0x02F817, 0x005197}, +{0x02F818, 0x02F818, 0x0051A4}, +{0x02F819, 0x02F819, 0x004ECC}, +{0x02F81A, 0x02F81A, 0x0051AC}, +{0x02F81B, 0x02F81B, 0x0051B5}, +{0x02F81C, 0x02F81C, 0x0291DF}, +{0x02F81D, 0x02F81D, 0x0051F5}, +{0x02F81E, 0x02F81E, 0x005203}, +{0x02F81F, 0x02F81F, 0x0034DF}, +{0x02F820, 0x02F820, 0x00523B}, +{0x02F821, 
0x02F821, 0x005246}, +{0x02F822, 0x02F822, 0x005272}, +{0x02F823, 0x02F823, 0x005277}, +{0x02F824, 0x02F824, 0x003515}, +{0x02F825, 0x02F825, 0x0052C7}, +{0x02F826, 0x02F826, 0x0052C9}, +{0x02F827, 0x02F827, 0x0052E4}, +{0x02F828, 0x02F828, 0x0052FA}, +{0x02F829, 0x02F829, 0x005305}, +{0x02F82A, 0x02F82A, 0x005306}, +{0x02F82B, 0x02F82B, 0x005317}, +{0x02F82C, 0x02F82C, 0x005349}, +{0x02F82D, 0x02F82D, 0x005351}, +{0x02F82E, 0x02F82E, 0x00535A}, +{0x02F82F, 0x02F82F, 0x005373}, +{0x02F830, 0x02F830, 0x00537D}, +{0x02F831, 0x02F833, 0x00537F}, +{0x02F834, 0x02F834, 0x020A2C}, +{0x02F835, 0x02F835, 0x007070}, +{0x02F836, 0x02F836, 0x0053CA}, +{0x02F837, 0x02F837, 0x0053DF}, +{0x02F838, 0x02F838, 0x020B63}, +{0x02F839, 0x02F839, 0x0053EB}, +{0x02F83A, 0x02F83A, 0x0053F1}, +{0x02F83B, 0x02F83B, 0x005406}, +{0x02F83C, 0x02F83C, 0x00549E}, +{0x02F83D, 0x02F83D, 0x005438}, +{0x02F83E, 0x02F83E, 0x005448}, +{0x02F83F, 0x02F83F, 0x005468}, +{0x02F840, 0x02F840, 0x0054A2}, +{0x02F841, 0x02F841, 0x0054F6}, +{0x02F842, 0x02F842, 0x005510}, +{0x02F843, 0x02F843, 0x005553}, +{0x02F844, 0x02F844, 0x005563}, +{0x02F845, 0x02F846, 0x005584}, +{0x02F847, 0x02F847, 0x005599}, +{0x02F848, 0x02F848, 0x0055AB}, +{0x02F849, 0x02F849, 0x0055B3}, +{0x02F84A, 0x02F84A, 0x0055C2}, +{0x02F84B, 0x02F84B, 0x005716}, +{0x02F84C, 0x02F84C, 0x005606}, +{0x02F84D, 0x02F84D, 0x005717}, +{0x02F84E, 0x02F84E, 0x005651}, +{0x02F84F, 0x02F84F, 0x005674}, +{0x02F850, 0x02F850, 0x005207}, +{0x02F851, 0x02F851, 0x0058EE}, +{0x02F852, 0x02F852, 0x0057CE}, +{0x02F853, 0x02F853, 0x0057F4}, +{0x02F854, 0x02F854, 0x00580D}, +{0x02F855, 0x02F855, 0x00578B}, +{0x02F856, 0x02F856, 0x005832}, +{0x02F857, 0x02F857, 0x005831}, +{0x02F858, 0x02F858, 0x0058AC}, +{0x02F859, 0x02F859, 0x0214E4}, +{0x02F85A, 0x02F85A, 0x0058F2}, +{0x02F85B, 0x02F85B, 0x0058F7}, +{0x02F85C, 0x02F85C, 0x005906}, +{0x02F85D, 0x02F85D, 0x00591A}, +{0x02F85E, 0x02F85E, 0x005922}, +{0x02F85F, 0x02F85F, 0x005962}, +{0x02F860, 0x02F860, 
0x0216A8}, +{0x02F861, 0x02F861, 0x0216EA}, +{0x02F862, 0x02F862, 0x0059EC}, +{0x02F863, 0x02F863, 0x005A1B}, +{0x02F864, 0x02F864, 0x005A27}, +{0x02F865, 0x02F865, 0x0059D8}, +{0x02F866, 0x02F866, 0x005A66}, +{0x02F867, 0x02F867, 0x0036EE}, +{0x02F868, 0x02F868, 0x0036FC}, +{0x02F869, 0x02F869, 0x005B08}, +{0x02F86A, 0x02F86B, 0x005B3E}, +{0x02F86C, 0x02F86C, 0x0219C8}, +{0x02F86D, 0x02F86D, 0x005BC3}, +{0x02F86E, 0x02F86E, 0x005BD8}, +{0x02F86F, 0x02F86F, 0x005BE7}, +{0x02F870, 0x02F870, 0x005BF3}, +{0x02F871, 0x02F871, 0x021B18}, +{0x02F872, 0x02F872, 0x005BFF}, +{0x02F873, 0x02F873, 0x005C06}, +{0x02F874, 0x02F874, 0x005F53}, +{0x02F875, 0x02F875, 0x005C22}, +{0x02F876, 0x02F876, 0x003781}, +{0x02F877, 0x02F877, 0x005C60}, +{0x02F878, 0x02F878, 0x005C6E}, +{0x02F879, 0x02F879, 0x005CC0}, +{0x02F87A, 0x02F87A, 0x005C8D}, +{0x02F87B, 0x02F87B, 0x021DE4}, +{0x02F87C, 0x02F87C, 0x005D43}, +{0x02F87D, 0x02F87D, 0x021DE6}, +{0x02F87E, 0x02F87E, 0x005D6E}, +{0x02F87F, 0x02F87F, 0x005D6B}, +{0x02F880, 0x02F880, 0x005D7C}, +{0x02F881, 0x02F881, 0x005DE1}, +{0x02F882, 0x02F882, 0x005DE2}, +{0x02F883, 0x02F883, 0x00382F}, +{0x02F884, 0x02F884, 0x005DFD}, +{0x02F885, 0x02F885, 0x005E28}, +{0x02F886, 0x02F886, 0x005E3D}, +{0x02F887, 0x02F887, 0x005E69}, +{0x02F888, 0x02F888, 0x003862}, +{0x02F889, 0x02F889, 0x022183}, +{0x02F88A, 0x02F88A, 0x00387C}, +{0x02F88B, 0x02F88B, 0x005EB0}, +{0x02F88C, 0x02F88C, 0x005EB3}, +{0x02F88D, 0x02F88D, 0x005EB6}, +{0x02F88E, 0x02F88E, 0x005ECA}, +{0x02F88F, 0x02F88F, 0x02A392}, +{0x02F890, 0x02F890, 0x005EFE}, +{0x02F891, 0x02F892, 0x022331}, +{0x02F893, 0x02F893, 0x008201}, +{0x02F894, 0x02F895, 0x005F22}, +{0x02F896, 0x02F896, 0x0038C7}, +{0x02F897, 0x02F897, 0x0232B8}, +{0x02F898, 0x02F898, 0x0261DA}, +{0x02F899, 0x02F899, 0x005F62}, +{0x02F89A, 0x02F89A, 0x005F6B}, +{0x02F89B, 0x02F89B, 0x0038E3}, +{0x02F89C, 0x02F89C, 0x005F9A}, +{0x02F89D, 0x02F89D, 0x005FCD}, +{0x02F89E, 0x02F89E, 0x005FD7}, +{0x02F89F, 0x02F89F, 0x005FF9}, 
+{0x02F8A0, 0x02F8A0, 0x006081}, +{0x02F8A1, 0x02F8A1, 0x00393A}, +{0x02F8A2, 0x02F8A2, 0x00391C}, +{0x02F8A3, 0x02F8A3, 0x006094}, +{0x02F8A4, 0x02F8A4, 0x0226D4}, +{0x02F8A5, 0x02F8A5, 0x0060C7}, +{0x02F8A6, 0x02F8A6, 0x006148}, +{0x02F8A7, 0x02F8A7, 0x00614C}, +{0x02F8A8, 0x02F8A8, 0x00614E}, +{0x02F8A9, 0x02F8A9, 0x00614C}, +{0x02F8AA, 0x02F8AA, 0x00617A}, +{0x02F8AB, 0x02F8AB, 0x00618E}, +{0x02F8AC, 0x02F8AC, 0x0061B2}, +{0x02F8AD, 0x02F8AD, 0x0061A4}, +{0x02F8AE, 0x02F8AE, 0x0061AF}, +{0x02F8AF, 0x02F8AF, 0x0061DE}, +{0x02F8B0, 0x02F8B0, 0x0061F2}, +{0x02F8B1, 0x02F8B1, 0x0061F6}, +{0x02F8B2, 0x02F8B2, 0x006210}, +{0x02F8B3, 0x02F8B3, 0x00621B}, +{0x02F8B4, 0x02F8B4, 0x00625D}, +{0x02F8B5, 0x02F8B5, 0x0062B1}, +{0x02F8B6, 0x02F8B6, 0x0062D4}, +{0x02F8B7, 0x02F8B7, 0x006350}, +{0x02F8B8, 0x02F8B8, 0x022B0C}, +{0x02F8B9, 0x02F8B9, 0x00633D}, +{0x02F8BA, 0x02F8BA, 0x0062FC}, +{0x02F8BB, 0x02F8BB, 0x006368}, +{0x02F8BC, 0x02F8BC, 0x006383}, +{0x02F8BD, 0x02F8BD, 0x0063E4}, +{0x02F8BE, 0x02F8BE, 0x022BF1}, +{0x02F8BF, 0x02F8BF, 0x006422}, +{0x02F8C0, 0x02F8C0, 0x0063C5}, +{0x02F8C1, 0x02F8C1, 0x0063A9}, +{0x02F8C2, 0x02F8C2, 0x003A2E}, +{0x02F8C3, 0x02F8C3, 0x006469}, +{0x02F8C4, 0x02F8C4, 0x00647E}, +{0x02F8C5, 0x02F8C5, 0x00649D}, +{0x02F8C6, 0x02F8C6, 0x006477}, +{0x02F8C7, 0x02F8C7, 0x003A6C}, +{0x02F8C8, 0x02F8C8, 0x00654F}, +{0x02F8C9, 0x02F8C9, 0x00656C}, +{0x02F8CA, 0x02F8CA, 0x02300A}, +{0x02F8CB, 0x02F8CB, 0x0065E3}, +{0x02F8CC, 0x02F8CC, 0x0066F8}, +{0x02F8CD, 0x02F8CD, 0x006649}, +{0x02F8CE, 0x02F8CE, 0x003B19}, +{0x02F8CF, 0x02F8CF, 0x006691}, +{0x02F8D0, 0x02F8D0, 0x003B08}, +{0x02F8D1, 0x02F8D1, 0x003AE4}, +{0x02F8D2, 0x02F8D2, 0x005192}, +{0x02F8D3, 0x02F8D3, 0x005195}, +{0x02F8D4, 0x02F8D4, 0x006700}, +{0x02F8D5, 0x02F8D5, 0x00669C}, +{0x02F8D6, 0x02F8D6, 0x0080AD}, +{0x02F8D7, 0x02F8D7, 0x0043D9}, +{0x02F8D8, 0x02F8D8, 0x006717}, +{0x02F8D9, 0x02F8D9, 0x00671B}, +{0x02F8DA, 0x02F8DA, 0x006721}, +{0x02F8DB, 0x02F8DB, 0x00675E}, +{0x02F8DC, 
0x02F8DC, 0x006753}, +{0x02F8DD, 0x02F8DD, 0x0233C3}, +{0x02F8DE, 0x02F8DE, 0x003B49}, +{0x02F8DF, 0x02F8DF, 0x0067FA}, +{0x02F8E0, 0x02F8E0, 0x006785}, +{0x02F8E1, 0x02F8E1, 0x006852}, +{0x02F8E2, 0x02F8E2, 0x006885}, +{0x02F8E3, 0x02F8E3, 0x02346D}, +{0x02F8E4, 0x02F8E4, 0x00688E}, +{0x02F8E5, 0x02F8E5, 0x00681F}, +{0x02F8E6, 0x02F8E6, 0x006914}, +{0x02F8E7, 0x02F8E7, 0x003B9D}, +{0x02F8E8, 0x02F8E8, 0x006942}, +{0x02F8E9, 0x02F8E9, 0x0069A3}, +{0x02F8EA, 0x02F8EA, 0x0069EA}, +{0x02F8EB, 0x02F8EB, 0x006AA8}, +{0x02F8EC, 0x02F8EC, 0x0236A3}, +{0x02F8ED, 0x02F8ED, 0x006ADB}, +{0x02F8EE, 0x02F8EE, 0x003C18}, +{0x02F8EF, 0x02F8EF, 0x006B21}, +{0x02F8F0, 0x02F8F0, 0x0238A7}, +{0x02F8F1, 0x02F8F1, 0x006B54}, +{0x02F8F2, 0x02F8F2, 0x003C4E}, +{0x02F8F3, 0x02F8F3, 0x006B72}, +{0x02F8F4, 0x02F8F4, 0x006B9F}, +{0x02F8F5, 0x02F8F5, 0x006BBA}, +{0x02F8F6, 0x02F8F6, 0x006BBB}, +{0x02F8F7, 0x02F8F7, 0x023A8D}, +{0x02F8F8, 0x02F8F8, 0x021D0B}, +{0x02F8F9, 0x02F8F9, 0x023AFA}, +{0x02F8FA, 0x02F8FA, 0x006C4E}, +{0x02F8FB, 0x02F8FB, 0x023CBC}, +{0x02F8FC, 0x02F8FC, 0x006CBF}, +{0x02F8FD, 0x02F8FD, 0x006CCD}, +{0x02F8FE, 0x02F8FE, 0x006C67}, +{0x02F8FF, 0x02F8FF, 0x006D16}, +{0x02F900, 0x02F900, 0x006D3E}, +{0x02F901, 0x02F901, 0x006D77}, +{0x02F902, 0x02F902, 0x006D41}, +{0x02F903, 0x02F903, 0x006D69}, +{0x02F904, 0x02F904, 0x006D78}, +{0x02F905, 0x02F905, 0x006D85}, +{0x02F906, 0x02F906, 0x023D1E}, +{0x02F907, 0x02F907, 0x006D34}, +{0x02F908, 0x02F908, 0x006E2F}, +{0x02F909, 0x02F909, 0x006E6E}, +{0x02F90A, 0x02F90A, 0x003D33}, +{0x02F90B, 0x02F90B, 0x006ECB}, +{0x02F90C, 0x02F90C, 0x006EC7}, +{0x02F90D, 0x02F90D, 0x023ED1}, +{0x02F90E, 0x02F90E, 0x006DF9}, +{0x02F90F, 0x02F90F, 0x006F6E}, +{0x02F910, 0x02F910, 0x023F5E}, +{0x02F911, 0x02F911, 0x023F8E}, +{0x02F912, 0x02F912, 0x006FC6}, +{0x02F913, 0x02F913, 0x007039}, +{0x02F914, 0x02F914, 0x00701E}, +{0x02F915, 0x02F915, 0x00701B}, +{0x02F916, 0x02F916, 0x003D96}, +{0x02F917, 0x02F917, 0x00704A}, +{0x02F918, 0x02F918, 
0x00707D}, +{0x02F919, 0x02F919, 0x007077}, +{0x02F91A, 0x02F91A, 0x0070AD}, +{0x02F91B, 0x02F91B, 0x020525}, +{0x02F91C, 0x02F91C, 0x007145}, +{0x02F91D, 0x02F91D, 0x024263}, +{0x02F91E, 0x02F91E, 0x00719C}, +{0x02F91F, 0x02F91F, 0x0243AB}, +{0x02F920, 0x02F920, 0x007228}, +{0x02F921, 0x02F921, 0x007235}, +{0x02F922, 0x02F922, 0x007250}, +{0x02F923, 0x02F923, 0x024608}, +{0x02F924, 0x02F924, 0x007280}, +{0x02F925, 0x02F925, 0x007295}, +{0x02F926, 0x02F926, 0x024735}, +{0x02F927, 0x02F927, 0x024814}, +{0x02F928, 0x02F928, 0x00737A}, +{0x02F929, 0x02F929, 0x00738B}, +{0x02F92A, 0x02F92A, 0x003EAC}, +{0x02F92B, 0x02F92B, 0x0073A5}, +{0x02F92C, 0x02F92D, 0x003EB8}, +{0x02F92E, 0x02F92E, 0x007447}, +{0x02F92F, 0x02F92F, 0x00745C}, +{0x02F930, 0x02F930, 0x007471}, +{0x02F931, 0x02F931, 0x007485}, +{0x02F932, 0x02F932, 0x0074CA}, +{0x02F933, 0x02F933, 0x003F1B}, +{0x02F934, 0x02F934, 0x007524}, +{0x02F935, 0x02F935, 0x024C36}, +{0x02F936, 0x02F936, 0x00753E}, +{0x02F937, 0x02F937, 0x024C92}, +{0x02F938, 0x02F938, 0x007570}, +{0x02F939, 0x02F939, 0x02219F}, +{0x02F93A, 0x02F93A, 0x007610}, +{0x02F93B, 0x02F93B, 0x024FA1}, +{0x02F93C, 0x02F93C, 0x024FB8}, +{0x02F93D, 0x02F93D, 0x025044}, +{0x02F93E, 0x02F93E, 0x003FFC}, +{0x02F93F, 0x02F93F, 0x004008}, +{0x02F940, 0x02F940, 0x0076F4}, +{0x02F941, 0x02F941, 0x0250F3}, +{0x02F942, 0x02F942, 0x0250F2}, +{0x02F943, 0x02F943, 0x025119}, +{0x02F944, 0x02F944, 0x025133}, +{0x02F945, 0x02F945, 0x00771E}, +{0x02F946, 0x02F947, 0x00771F}, +{0x02F948, 0x02F948, 0x00774A}, +{0x02F949, 0x02F949, 0x004039}, +{0x02F94A, 0x02F94A, 0x00778B}, +{0x02F94B, 0x02F94B, 0x004046}, +{0x02F94C, 0x02F94C, 0x004096}, +{0x02F94D, 0x02F94D, 0x02541D}, +{0x02F94E, 0x02F94E, 0x00784E}, +{0x02F94F, 0x02F94F, 0x00788C}, +{0x02F950, 0x02F950, 0x0078CC}, +{0x02F951, 0x02F951, 0x0040E3}, +{0x02F952, 0x02F952, 0x025626}, +{0x02F953, 0x02F953, 0x007956}, +{0x02F954, 0x02F954, 0x02569A}, +{0x02F955, 0x02F955, 0x0256C5}, +{0x02F956, 0x02F956, 0x00798F}, 
+{0x02F957, 0x02F957, 0x0079EB}, +{0x02F958, 0x02F958, 0x00412F}, +{0x02F959, 0x02F959, 0x007A40}, +{0x02F95A, 0x02F95A, 0x007A4A}, +{0x02F95B, 0x02F95B, 0x007A4F}, +{0x02F95C, 0x02F95C, 0x02597C}, +{0x02F95D, 0x02F95E, 0x025AA7}, +{0x02F95F, 0x02F95F, 0x007AEE}, +{0x02F960, 0x02F960, 0x004202}, +{0x02F961, 0x02F961, 0x025BAB}, +{0x02F962, 0x02F962, 0x007BC6}, +{0x02F963, 0x02F963, 0x007BC9}, +{0x02F964, 0x02F964, 0x004227}, +{0x02F965, 0x02F965, 0x025C80}, +{0x02F966, 0x02F966, 0x007CD2}, +{0x02F967, 0x02F967, 0x0042A0}, +{0x02F968, 0x02F968, 0x007CE8}, +{0x02F969, 0x02F969, 0x007CE3}, +{0x02F96A, 0x02F96A, 0x007D00}, +{0x02F96B, 0x02F96B, 0x025F86}, +{0x02F96C, 0x02F96C, 0x007D63}, +{0x02F96D, 0x02F96D, 0x004301}, +{0x02F96E, 0x02F96E, 0x007DC7}, +{0x02F96F, 0x02F96F, 0x007E02}, +{0x02F970, 0x02F970, 0x007E45}, +{0x02F971, 0x02F971, 0x004334}, +{0x02F972, 0x02F972, 0x026228}, +{0x02F973, 0x02F973, 0x026247}, +{0x02F974, 0x02F974, 0x004359}, +{0x02F975, 0x02F975, 0x0262D9}, +{0x02F976, 0x02F976, 0x007F7A}, +{0x02F977, 0x02F977, 0x02633E}, +{0x02F978, 0x02F978, 0x007F95}, +{0x02F979, 0x02F979, 0x007FFA}, +{0x02F97A, 0x02F97A, 0x008005}, +{0x02F97B, 0x02F97B, 0x0264DA}, +{0x02F97C, 0x02F97C, 0x026523}, +{0x02F97D, 0x02F97D, 0x008060}, +{0x02F97E, 0x02F97E, 0x0265A8}, +{0x02F97F, 0x02F97F, 0x008070}, +{0x02F980, 0x02F980, 0x02335F}, +{0x02F981, 0x02F981, 0x0043D5}, +{0x02F982, 0x02F982, 0x0080B2}, +{0x02F983, 0x02F983, 0x008103}, +{0x02F984, 0x02F984, 0x00440B}, +{0x02F985, 0x02F985, 0x00813E}, +{0x02F986, 0x02F986, 0x005AB5}, +{0x02F987, 0x02F987, 0x0267A7}, +{0x02F988, 0x02F988, 0x0267B5}, +{0x02F989, 0x02F989, 0x023393}, +{0x02F98A, 0x02F98A, 0x02339C}, +{0x02F98B, 0x02F98B, 0x008201}, +{0x02F98C, 0x02F98C, 0x008204}, +{0x02F98D, 0x02F98D, 0x008F9E}, +{0x02F98E, 0x02F98E, 0x00446B}, +{0x02F98F, 0x02F98F, 0x008291}, +{0x02F990, 0x02F990, 0x00828B}, +{0x02F991, 0x02F991, 0x00829D}, +{0x02F992, 0x02F992, 0x0052B3}, +{0x02F993, 0x02F993, 0x0082B1}, +{0x02F994, 
0x02F994, 0x0082B3}, +{0x02F995, 0x02F995, 0x0082BD}, +{0x02F996, 0x02F996, 0x0082E6}, +{0x02F997, 0x02F997, 0x026B3C}, +{0x02F998, 0x02F998, 0x0082E5}, +{0x02F999, 0x02F999, 0x00831D}, +{0x02F99A, 0x02F99A, 0x008363}, +{0x02F99B, 0x02F99B, 0x0083AD}, +{0x02F99C, 0x02F99C, 0x008323}, +{0x02F99D, 0x02F99D, 0x0083BD}, +{0x02F99E, 0x02F99E, 0x0083E7}, +{0x02F99F, 0x02F99F, 0x008457}, +{0x02F9A0, 0x02F9A0, 0x008353}, +{0x02F9A1, 0x02F9A1, 0x0083CA}, +{0x02F9A2, 0x02F9A2, 0x0083CC}, +{0x02F9A3, 0x02F9A3, 0x0083DC}, +{0x02F9A4, 0x02F9A4, 0x026C36}, +{0x02F9A5, 0x02F9A5, 0x026D6B}, +{0x02F9A6, 0x02F9A6, 0x026CD5}, +{0x02F9A7, 0x02F9A7, 0x00452B}, +{0x02F9A8, 0x02F9A8, 0x0084F1}, +{0x02F9A9, 0x02F9A9, 0x0084F3}, +{0x02F9AA, 0x02F9AA, 0x008516}, +{0x02F9AB, 0x02F9AB, 0x0273CA}, +{0x02F9AC, 0x02F9AC, 0x008564}, +{0x02F9AD, 0x02F9AD, 0x026F2C}, +{0x02F9AE, 0x02F9AE, 0x00455D}, +{0x02F9AF, 0x02F9AF, 0x004561}, +{0x02F9B0, 0x02F9B0, 0x026FB1}, +{0x02F9B1, 0x02F9B1, 0x0270D2}, +{0x02F9B2, 0x02F9B2, 0x00456B}, +{0x02F9B3, 0x02F9B3, 0x008650}, +{0x02F9B4, 0x02F9B4, 0x00865C}, +{0x02F9B5, 0x02F9B5, 0x008667}, +{0x02F9B6, 0x02F9B6, 0x008669}, +{0x02F9B7, 0x02F9B7, 0x0086A9}, +{0x02F9B8, 0x02F9B8, 0x008688}, +{0x02F9B9, 0x02F9B9, 0x00870E}, +{0x02F9BA, 0x02F9BA, 0x0086E2}, +{0x02F9BB, 0x02F9BB, 0x008779}, +{0x02F9BC, 0x02F9BC, 0x008728}, +{0x02F9BD, 0x02F9BD, 0x00876B}, +{0x02F9BE, 0x02F9BE, 0x008786}, +{0x02F9BF, 0x02F9BF, 0x0045D7}, +{0x02F9C0, 0x02F9C0, 0x0087E1}, +{0x02F9C1, 0x02F9C1, 0x008801}, +{0x02F9C2, 0x02F9C2, 0x0045F9}, +{0x02F9C3, 0x02F9C3, 0x008860}, +{0x02F9C4, 0x02F9C4, 0x008863}, +{0x02F9C5, 0x02F9C5, 0x027667}, +{0x02F9C6, 0x02F9C6, 0x0088D7}, +{0x02F9C7, 0x02F9C7, 0x0088DE}, +{0x02F9C8, 0x02F9C8, 0x004635}, +{0x02F9C9, 0x02F9C9, 0x0088FA}, +{0x02F9CA, 0x02F9CA, 0x0034BB}, +{0x02F9CB, 0x02F9CB, 0x0278AE}, +{0x02F9CC, 0x02F9CC, 0x027966}, +{0x02F9CD, 0x02F9CD, 0x0046BE}, +{0x02F9CE, 0x02F9CE, 0x0046C7}, +{0x02F9CF, 0x02F9CF, 0x008AA0}, +{0x02F9D0, 0x02F9D0, 
0x008AED}, +{0x02F9D1, 0x02F9D1, 0x008B8A}, +{0x02F9D2, 0x02F9D2, 0x008C55}, +{0x02F9D3, 0x02F9D3, 0x027CA8}, +{0x02F9D4, 0x02F9D4, 0x008CAB}, +{0x02F9D5, 0x02F9D5, 0x008CC1}, +{0x02F9D6, 0x02F9D6, 0x008D1B}, +{0x02F9D7, 0x02F9D7, 0x008D77}, +{0x02F9D8, 0x02F9D8, 0x027F2F}, +{0x02F9D9, 0x02F9D9, 0x020804}, +{0x02F9DA, 0x02F9DA, 0x008DCB}, +{0x02F9DB, 0x02F9DB, 0x008DBC}, +{0x02F9DC, 0x02F9DC, 0x008DF0}, +{0x02F9DD, 0x02F9DD, 0x0208DE}, +{0x02F9DE, 0x02F9DE, 0x008ED4}, +{0x02F9DF, 0x02F9DF, 0x008F38}, +{0x02F9E0, 0x02F9E0, 0x0285D2}, +{0x02F9E1, 0x02F9E1, 0x0285ED}, +{0x02F9E2, 0x02F9E2, 0x009094}, +{0x02F9E3, 0x02F9E3, 0x0090F1}, +{0x02F9E4, 0x02F9E4, 0x009111}, +{0x02F9E5, 0x02F9E5, 0x02872E}, +{0x02F9E6, 0x02F9E6, 0x00911B}, +{0x02F9E7, 0x02F9E7, 0x009238}, +{0x02F9E8, 0x02F9E8, 0x0092D7}, +{0x02F9E9, 0x02F9E9, 0x0092D8}, +{0x02F9EA, 0x02F9EA, 0x00927C}, +{0x02F9EB, 0x02F9EB, 0x0093F9}, +{0x02F9EC, 0x02F9EC, 0x009415}, +{0x02F9ED, 0x02F9ED, 0x028BFA}, +{0x02F9EE, 0x02F9EE, 0x00958B}, +{0x02F9EF, 0x02F9EF, 0x004995}, +{0x02F9F0, 0x02F9F0, 0x0095B7}, +{0x02F9F1, 0x02F9F1, 0x028D77}, +{0x02F9F2, 0x02F9F2, 0x0049E6}, +{0x02F9F3, 0x02F9F3, 0x0096C3}, +{0x02F9F4, 0x02F9F4, 0x005DB2}, +{0x02F9F5, 0x02F9F5, 0x009723}, +{0x02F9F6, 0x02F9F6, 0x029145}, +{0x02F9F7, 0x02F9F7, 0x02921A}, +{0x02F9F8, 0x02F9F8, 0x004A6E}, +{0x02F9F9, 0x02F9F9, 0x004A76}, +{0x02F9FA, 0x02F9FA, 0x0097E0}, +{0x02F9FB, 0x02F9FB, 0x02940A}, +{0x02F9FC, 0x02F9FC, 0x004AB2}, +{0x02F9FD, 0x02F9FD, 0x029496}, +{0x02F9FE, 0x02F9FF, 0x00980B}, +{0x02FA00, 0x02FA00, 0x009829}, +{0x02FA01, 0x02FA01, 0x0295B6}, +{0x02FA02, 0x02FA02, 0x0098E2}, +{0x02FA03, 0x02FA03, 0x004B33}, +{0x02FA04, 0x02FA04, 0x009929}, +{0x02FA05, 0x02FA05, 0x0099A7}, +{0x02FA06, 0x02FA06, 0x0099C2}, +{0x02FA07, 0x02FA07, 0x0099FE}, +{0x02FA08, 0x02FA08, 0x004BCE}, +{0x02FA09, 0x02FA09, 0x029B30}, +{0x02FA0A, 0x02FA0A, 0x009B12}, +{0x02FA0B, 0x02FA0B, 0x009C40}, +{0x02FA0C, 0x02FA0C, 0x009CFD}, +{0x02FA0D, 0x02FA0D, 0x004CCE}, 
+{0x02FA0E, 0x02FA0E, 0x004CED}, +{0x02FA0F, 0x02FA0F, 0x009D67}, +{0x02FA10, 0x02FA10, 0x02A0CE}, +{0x02FA11, 0x02FA11, 0x004CF8}, +{0x02FA12, 0x02FA12, 0x02A105}, +{0x02FA13, 0x02FA13, 0x02A20E}, +{0x02FA14, 0x02FA14, 0x02A291}, +{0x02FA15, 0x02FA15, 0x009EBB}, +{0x02FA16, 0x02FA16, 0x004D56}, +{0x02FA17, 0x02FA17, 0x009EF9}, +{0x02FA18, 0x02FA18, 0x009EFE}, +{0x02FA19, 0x02FA19, 0x009F05}, +{0x02FA1A, 0x02FA1A, 0x009F0F}, +{0x02FA1B, 0x02FA1B, 0x009F16}, +{0x02FA1C, 0x02FA1C, 0x009F3B}, +{0x02FA1D, 0x02FA1D, 0x02A600}, }; -const std::map unicode_map_lowercase = { -{0x00041, 0x00061}, {0x00042, 0x00062}, {0x00043, 0x00063}, {0x00044, 0x00064}, {0x00045, 0x00065}, {0x00046, 0x00066}, -{0x00047, 0x00067}, {0x00048, 0x00068}, {0x00049, 0x00069}, {0x0004A, 0x0006A}, {0x0004B, 0x0006B}, {0x0004C, 0x0006C}, -{0x0004D, 0x0006D}, {0x0004E, 0x0006E}, {0x0004F, 0x0006F}, {0x00050, 0x00070}, {0x00051, 0x00071}, {0x00052, 0x00072}, -{0x00053, 0x00073}, {0x00054, 0x00074}, {0x00055, 0x00075}, {0x00056, 0x00076}, {0x00057, 0x00077}, {0x00058, 0x00078}, -{0x00059, 0x00079}, {0x0005A, 0x0007A}, {0x000C0, 0x000E0}, {0x000C1, 0x000E1}, {0x000C2, 0x000E2}, {0x000C3, 0x000E3}, -{0x000C4, 0x000E4}, {0x000C5, 0x000E5}, {0x000C6, 0x000E6}, {0x000C7, 0x000E7}, {0x000C8, 0x000E8}, {0x000C9, 0x000E9}, -{0x000CA, 0x000EA}, {0x000CB, 0x000EB}, {0x000CC, 0x000EC}, {0x000CD, 0x000ED}, {0x000CE, 0x000EE}, {0x000CF, 0x000EF}, -{0x000D0, 0x000F0}, {0x000D1, 0x000F1}, {0x000D2, 0x000F2}, {0x000D3, 0x000F3}, {0x000D4, 0x000F4}, {0x000D5, 0x000F5}, -{0x000D6, 0x000F6}, {0x000D8, 0x000F8}, {0x000D9, 0x000F9}, {0x000DA, 0x000FA}, {0x000DB, 0x000FB}, {0x000DC, 0x000FC}, -{0x000DD, 0x000FD}, {0x000DE, 0x000FE}, {0x00100, 0x00101}, {0x00102, 0x00103}, {0x00104, 0x00105}, {0x00106, 0x00107}, -{0x00108, 0x00109}, {0x0010A, 0x0010B}, {0x0010C, 0x0010D}, {0x0010E, 0x0010F}, {0x00110, 0x00111}, {0x00112, 0x00113}, -{0x00114, 0x00115}, {0x00116, 0x00117}, {0x00118, 0x00119}, {0x0011A, 0x0011B}, {0x0011C, 
0x0011D}, {0x0011E, 0x0011F}, -{0x00120, 0x00121}, {0x00122, 0x00123}, {0x00124, 0x00125}, {0x00126, 0x00127}, {0x00128, 0x00129}, {0x0012A, 0x0012B}, -{0x0012C, 0x0012D}, {0x0012E, 0x0012F}, {0x00130, 0x00069}, {0x00132, 0x00133}, {0x00134, 0x00135}, {0x00136, 0x00137}, -{0x00139, 0x0013A}, {0x0013B, 0x0013C}, {0x0013D, 0x0013E}, {0x0013F, 0x00140}, {0x00141, 0x00142}, {0x00143, 0x00144}, -{0x00145, 0x00146}, {0x00147, 0x00148}, {0x0014A, 0x0014B}, {0x0014C, 0x0014D}, {0x0014E, 0x0014F}, {0x00150, 0x00151}, -{0x00152, 0x00153}, {0x00154, 0x00155}, {0x00156, 0x00157}, {0x00158, 0x00159}, {0x0015A, 0x0015B}, {0x0015C, 0x0015D}, -{0x0015E, 0x0015F}, {0x00160, 0x00161}, {0x00162, 0x00163}, {0x00164, 0x00165}, {0x00166, 0x00167}, {0x00168, 0x00169}, -{0x0016A, 0x0016B}, {0x0016C, 0x0016D}, {0x0016E, 0x0016F}, {0x00170, 0x00171}, {0x00172, 0x00173}, {0x00174, 0x00175}, -{0x00176, 0x00177}, {0x00178, 0x000FF}, {0x00179, 0x0017A}, {0x0017B, 0x0017C}, {0x0017D, 0x0017E}, {0x00181, 0x00253}, -{0x00182, 0x00183}, {0x00184, 0x00185}, {0x00186, 0x00254}, {0x00187, 0x00188}, {0x00189, 0x00256}, {0x0018A, 0x00257}, -{0x0018B, 0x0018C}, {0x0018E, 0x001DD}, {0x0018F, 0x00259}, {0x00190, 0x0025B}, {0x00191, 0x00192}, {0x00193, 0x00260}, -{0x00194, 0x00263}, {0x00196, 0x00269}, {0x00197, 0x00268}, {0x00198, 0x00199}, {0x0019C, 0x0026F}, {0x0019D, 0x00272}, -{0x0019F, 0x00275}, {0x001A0, 0x001A1}, {0x001A2, 0x001A3}, {0x001A4, 0x001A5}, {0x001A6, 0x00280}, {0x001A7, 0x001A8}, -{0x001A9, 0x00283}, {0x001AC, 0x001AD}, {0x001AE, 0x00288}, {0x001AF, 0x001B0}, {0x001B1, 0x0028A}, {0x001B2, 0x0028B}, -{0x001B3, 0x001B4}, {0x001B5, 0x001B6}, {0x001B7, 0x00292}, {0x001B8, 0x001B9}, {0x001BC, 0x001BD}, {0x001C4, 0x001C6}, -{0x001C5, 0x001C6}, {0x001C7, 0x001C9}, {0x001C8, 0x001C9}, {0x001CA, 0x001CC}, {0x001CB, 0x001CC}, {0x001CD, 0x001CE}, -{0x001CF, 0x001D0}, {0x001D1, 0x001D2}, {0x001D3, 0x001D4}, {0x001D5, 0x001D6}, {0x001D7, 0x001D8}, {0x001D9, 0x001DA}, -{0x001DB, 0x001DC}, {0x001DE, 
0x001DF}, {0x001E0, 0x001E1}, {0x001E2, 0x001E3}, {0x001E4, 0x001E5}, {0x001E6, 0x001E7}, -{0x001E8, 0x001E9}, {0x001EA, 0x001EB}, {0x001EC, 0x001ED}, {0x001EE, 0x001EF}, {0x001F1, 0x001F3}, {0x001F2, 0x001F3}, -{0x001F4, 0x001F5}, {0x001F6, 0x00195}, {0x001F7, 0x001BF}, {0x001F8, 0x001F9}, {0x001FA, 0x001FB}, {0x001FC, 0x001FD}, -{0x001FE, 0x001FF}, {0x00200, 0x00201}, {0x00202, 0x00203}, {0x00204, 0x00205}, {0x00206, 0x00207}, {0x00208, 0x00209}, -{0x0020A, 0x0020B}, {0x0020C, 0x0020D}, {0x0020E, 0x0020F}, {0x00210, 0x00211}, {0x00212, 0x00213}, {0x00214, 0x00215}, -{0x00216, 0x00217}, {0x00218, 0x00219}, {0x0021A, 0x0021B}, {0x0021C, 0x0021D}, {0x0021E, 0x0021F}, {0x00220, 0x0019E}, -{0x00222, 0x00223}, {0x00224, 0x00225}, {0x00226, 0x00227}, {0x00228, 0x00229}, {0x0022A, 0x0022B}, {0x0022C, 0x0022D}, -{0x0022E, 0x0022F}, {0x00230, 0x00231}, {0x00232, 0x00233}, {0x0023A, 0x02C65}, {0x0023B, 0x0023C}, {0x0023D, 0x0019A}, -{0x0023E, 0x02C66}, {0x00241, 0x00242}, {0x00243, 0x00180}, {0x00244, 0x00289}, {0x00245, 0x0028C}, {0x00246, 0x00247}, -{0x00248, 0x00249}, {0x0024A, 0x0024B}, {0x0024C, 0x0024D}, {0x0024E, 0x0024F}, {0x00370, 0x00371}, {0x00372, 0x00373}, -{0x00376, 0x00377}, {0x0037F, 0x003F3}, {0x00386, 0x003AC}, {0x00388, 0x003AD}, {0x00389, 0x003AE}, {0x0038A, 0x003AF}, -{0x0038C, 0x003CC}, {0x0038E, 0x003CD}, {0x0038F, 0x003CE}, {0x00391, 0x003B1}, {0x00392, 0x003B2}, {0x00393, 0x003B3}, -{0x00394, 0x003B4}, {0x00395, 0x003B5}, {0x00396, 0x003B6}, {0x00397, 0x003B7}, {0x00398, 0x003B8}, {0x00399, 0x003B9}, -{0x0039A, 0x003BA}, {0x0039B, 0x003BB}, {0x0039C, 0x003BC}, {0x0039D, 0x003BD}, {0x0039E, 0x003BE}, {0x0039F, 0x003BF}, -{0x003A0, 0x003C0}, {0x003A1, 0x003C1}, {0x003A3, 0x003C3}, {0x003A4, 0x003C4}, {0x003A5, 0x003C5}, {0x003A6, 0x003C6}, -{0x003A7, 0x003C7}, {0x003A8, 0x003C8}, {0x003A9, 0x003C9}, {0x003AA, 0x003CA}, {0x003AB, 0x003CB}, {0x003CF, 0x003D7}, -{0x003D8, 0x003D9}, {0x003DA, 0x003DB}, {0x003DC, 0x003DD}, {0x003DE, 0x003DF}, {0x003E0, 
0x003E1}, {0x003E2, 0x003E3}, -{0x003E4, 0x003E5}, {0x003E6, 0x003E7}, {0x003E8, 0x003E9}, {0x003EA, 0x003EB}, {0x003EC, 0x003ED}, {0x003EE, 0x003EF}, -{0x003F4, 0x003B8}, {0x003F7, 0x003F8}, {0x003F9, 0x003F2}, {0x003FA, 0x003FB}, {0x003FD, 0x0037B}, {0x003FE, 0x0037C}, -{0x003FF, 0x0037D}, {0x00400, 0x00450}, {0x00401, 0x00451}, {0x00402, 0x00452}, {0x00403, 0x00453}, {0x00404, 0x00454}, -{0x00405, 0x00455}, {0x00406, 0x00456}, {0x00407, 0x00457}, {0x00408, 0x00458}, {0x00409, 0x00459}, {0x0040A, 0x0045A}, -{0x0040B, 0x0045B}, {0x0040C, 0x0045C}, {0x0040D, 0x0045D}, {0x0040E, 0x0045E}, {0x0040F, 0x0045F}, {0x00410, 0x00430}, -{0x00411, 0x00431}, {0x00412, 0x00432}, {0x00413, 0x00433}, {0x00414, 0x00434}, {0x00415, 0x00435}, {0x00416, 0x00436}, -{0x00417, 0x00437}, {0x00418, 0x00438}, {0x00419, 0x00439}, {0x0041A, 0x0043A}, {0x0041B, 0x0043B}, {0x0041C, 0x0043C}, -{0x0041D, 0x0043D}, {0x0041E, 0x0043E}, {0x0041F, 0x0043F}, {0x00420, 0x00440}, {0x00421, 0x00441}, {0x00422, 0x00442}, -{0x00423, 0x00443}, {0x00424, 0x00444}, {0x00425, 0x00445}, {0x00426, 0x00446}, {0x00427, 0x00447}, {0x00428, 0x00448}, -{0x00429, 0x00449}, {0x0042A, 0x0044A}, {0x0042B, 0x0044B}, {0x0042C, 0x0044C}, {0x0042D, 0x0044D}, {0x0042E, 0x0044E}, -{0x0042F, 0x0044F}, {0x00460, 0x00461}, {0x00462, 0x00463}, {0x00464, 0x00465}, {0x00466, 0x00467}, {0x00468, 0x00469}, -{0x0046A, 0x0046B}, {0x0046C, 0x0046D}, {0x0046E, 0x0046F}, {0x00470, 0x00471}, {0x00472, 0x00473}, {0x00474, 0x00475}, -{0x00476, 0x00477}, {0x00478, 0x00479}, {0x0047A, 0x0047B}, {0x0047C, 0x0047D}, {0x0047E, 0x0047F}, {0x00480, 0x00481}, -{0x0048A, 0x0048B}, {0x0048C, 0x0048D}, {0x0048E, 0x0048F}, {0x00490, 0x00491}, {0x00492, 0x00493}, {0x00494, 0x00495}, -{0x00496, 0x00497}, {0x00498, 0x00499}, {0x0049A, 0x0049B}, {0x0049C, 0x0049D}, {0x0049E, 0x0049F}, {0x004A0, 0x004A1}, -{0x004A2, 0x004A3}, {0x004A4, 0x004A5}, {0x004A6, 0x004A7}, {0x004A8, 0x004A9}, {0x004AA, 0x004AB}, {0x004AC, 0x004AD}, -{0x004AE, 0x004AF}, {0x004B0, 
0x004B1}, {0x004B2, 0x004B3}, {0x004B4, 0x004B5}, {0x004B6, 0x004B7}, {0x004B8, 0x004B9}, -{0x004BA, 0x004BB}, {0x004BC, 0x004BD}, {0x004BE, 0x004BF}, {0x004C0, 0x004CF}, {0x004C1, 0x004C2}, {0x004C3, 0x004C4}, -{0x004C5, 0x004C6}, {0x004C7, 0x004C8}, {0x004C9, 0x004CA}, {0x004CB, 0x004CC}, {0x004CD, 0x004CE}, {0x004D0, 0x004D1}, -{0x004D2, 0x004D3}, {0x004D4, 0x004D5}, {0x004D6, 0x004D7}, {0x004D8, 0x004D9}, {0x004DA, 0x004DB}, {0x004DC, 0x004DD}, -{0x004DE, 0x004DF}, {0x004E0, 0x004E1}, {0x004E2, 0x004E3}, {0x004E4, 0x004E5}, {0x004E6, 0x004E7}, {0x004E8, 0x004E9}, -{0x004EA, 0x004EB}, {0x004EC, 0x004ED}, {0x004EE, 0x004EF}, {0x004F0, 0x004F1}, {0x004F2, 0x004F3}, {0x004F4, 0x004F5}, -{0x004F6, 0x004F7}, {0x004F8, 0x004F9}, {0x004FA, 0x004FB}, {0x004FC, 0x004FD}, {0x004FE, 0x004FF}, {0x00500, 0x00501}, -{0x00502, 0x00503}, {0x00504, 0x00505}, {0x00506, 0x00507}, {0x00508, 0x00509}, {0x0050A, 0x0050B}, {0x0050C, 0x0050D}, -{0x0050E, 0x0050F}, {0x00510, 0x00511}, {0x00512, 0x00513}, {0x00514, 0x00515}, {0x00516, 0x00517}, {0x00518, 0x00519}, -{0x0051A, 0x0051B}, {0x0051C, 0x0051D}, {0x0051E, 0x0051F}, {0x00520, 0x00521}, {0x00522, 0x00523}, {0x00524, 0x00525}, -{0x00526, 0x00527}, {0x00528, 0x00529}, {0x0052A, 0x0052B}, {0x0052C, 0x0052D}, {0x0052E, 0x0052F}, {0x00531, 0x00561}, -{0x00532, 0x00562}, {0x00533, 0x00563}, {0x00534, 0x00564}, {0x00535, 0x00565}, {0x00536, 0x00566}, {0x00537, 0x00567}, -{0x00538, 0x00568}, {0x00539, 0x00569}, {0x0053A, 0x0056A}, {0x0053B, 0x0056B}, {0x0053C, 0x0056C}, {0x0053D, 0x0056D}, -{0x0053E, 0x0056E}, {0x0053F, 0x0056F}, {0x00540, 0x00570}, {0x00541, 0x00571}, {0x00542, 0x00572}, {0x00543, 0x00573}, -{0x00544, 0x00574}, {0x00545, 0x00575}, {0x00546, 0x00576}, {0x00547, 0x00577}, {0x00548, 0x00578}, {0x00549, 0x00579}, -{0x0054A, 0x0057A}, {0x0054B, 0x0057B}, {0x0054C, 0x0057C}, {0x0054D, 0x0057D}, {0x0054E, 0x0057E}, {0x0054F, 0x0057F}, -{0x00550, 0x00580}, {0x00551, 0x00581}, {0x00552, 0x00582}, {0x00553, 0x00583}, {0x00554, 
0x00584}, {0x00555, 0x00585}, -{0x00556, 0x00586}, {0x010A0, 0x02D00}, {0x010A1, 0x02D01}, {0x010A2, 0x02D02}, {0x010A3, 0x02D03}, {0x010A4, 0x02D04}, -{0x010A5, 0x02D05}, {0x010A6, 0x02D06}, {0x010A7, 0x02D07}, {0x010A8, 0x02D08}, {0x010A9, 0x02D09}, {0x010AA, 0x02D0A}, -{0x010AB, 0x02D0B}, {0x010AC, 0x02D0C}, {0x010AD, 0x02D0D}, {0x010AE, 0x02D0E}, {0x010AF, 0x02D0F}, {0x010B0, 0x02D10}, -{0x010B1, 0x02D11}, {0x010B2, 0x02D12}, {0x010B3, 0x02D13}, {0x010B4, 0x02D14}, {0x010B5, 0x02D15}, {0x010B6, 0x02D16}, -{0x010B7, 0x02D17}, {0x010B8, 0x02D18}, {0x010B9, 0x02D19}, {0x010BA, 0x02D1A}, {0x010BB, 0x02D1B}, {0x010BC, 0x02D1C}, -{0x010BD, 0x02D1D}, {0x010BE, 0x02D1E}, {0x010BF, 0x02D1F}, {0x010C0, 0x02D20}, {0x010C1, 0x02D21}, {0x010C2, 0x02D22}, -{0x010C3, 0x02D23}, {0x010C4, 0x02D24}, {0x010C5, 0x02D25}, {0x010C7, 0x02D27}, {0x010CD, 0x02D2D}, {0x013A0, 0x0AB70}, -{0x013A1, 0x0AB71}, {0x013A2, 0x0AB72}, {0x013A3, 0x0AB73}, {0x013A4, 0x0AB74}, {0x013A5, 0x0AB75}, {0x013A6, 0x0AB76}, -{0x013A7, 0x0AB77}, {0x013A8, 0x0AB78}, {0x013A9, 0x0AB79}, {0x013AA, 0x0AB7A}, {0x013AB, 0x0AB7B}, {0x013AC, 0x0AB7C}, -{0x013AD, 0x0AB7D}, {0x013AE, 0x0AB7E}, {0x013AF, 0x0AB7F}, {0x013B0, 0x0AB80}, {0x013B1, 0x0AB81}, {0x013B2, 0x0AB82}, -{0x013B3, 0x0AB83}, {0x013B4, 0x0AB84}, {0x013B5, 0x0AB85}, {0x013B6, 0x0AB86}, {0x013B7, 0x0AB87}, {0x013B8, 0x0AB88}, -{0x013B9, 0x0AB89}, {0x013BA, 0x0AB8A}, {0x013BB, 0x0AB8B}, {0x013BC, 0x0AB8C}, {0x013BD, 0x0AB8D}, {0x013BE, 0x0AB8E}, -{0x013BF, 0x0AB8F}, {0x013C0, 0x0AB90}, {0x013C1, 0x0AB91}, {0x013C2, 0x0AB92}, {0x013C3, 0x0AB93}, {0x013C4, 0x0AB94}, -{0x013C5, 0x0AB95}, {0x013C6, 0x0AB96}, {0x013C7, 0x0AB97}, {0x013C8, 0x0AB98}, {0x013C9, 0x0AB99}, {0x013CA, 0x0AB9A}, -{0x013CB, 0x0AB9B}, {0x013CC, 0x0AB9C}, {0x013CD, 0x0AB9D}, {0x013CE, 0x0AB9E}, {0x013CF, 0x0AB9F}, {0x013D0, 0x0ABA0}, -{0x013D1, 0x0ABA1}, {0x013D2, 0x0ABA2}, {0x013D3, 0x0ABA3}, {0x013D4, 0x0ABA4}, {0x013D5, 0x0ABA5}, {0x013D6, 0x0ABA6}, -{0x013D7, 0x0ABA7}, {0x013D8, 
0x0ABA8}, {0x013D9, 0x0ABA9}, {0x013DA, 0x0ABAA}, {0x013DB, 0x0ABAB}, {0x013DC, 0x0ABAC}, -{0x013DD, 0x0ABAD}, {0x013DE, 0x0ABAE}, {0x013DF, 0x0ABAF}, {0x013E0, 0x0ABB0}, {0x013E1, 0x0ABB1}, {0x013E2, 0x0ABB2}, -{0x013E3, 0x0ABB3}, {0x013E4, 0x0ABB4}, {0x013E5, 0x0ABB5}, {0x013E6, 0x0ABB6}, {0x013E7, 0x0ABB7}, {0x013E8, 0x0ABB8}, -{0x013E9, 0x0ABB9}, {0x013EA, 0x0ABBA}, {0x013EB, 0x0ABBB}, {0x013EC, 0x0ABBC}, {0x013ED, 0x0ABBD}, {0x013EE, 0x0ABBE}, -{0x013EF, 0x0ABBF}, {0x013F0, 0x013F8}, {0x013F1, 0x013F9}, {0x013F2, 0x013FA}, {0x013F3, 0x013FB}, {0x013F4, 0x013FC}, -{0x013F5, 0x013FD}, {0x01C90, 0x010D0}, {0x01C91, 0x010D1}, {0x01C92, 0x010D2}, {0x01C93, 0x010D3}, {0x01C94, 0x010D4}, -{0x01C95, 0x010D5}, {0x01C96, 0x010D6}, {0x01C97, 0x010D7}, {0x01C98, 0x010D8}, {0x01C99, 0x010D9}, {0x01C9A, 0x010DA}, -{0x01C9B, 0x010DB}, {0x01C9C, 0x010DC}, {0x01C9D, 0x010DD}, {0x01C9E, 0x010DE}, {0x01C9F, 0x010DF}, {0x01CA0, 0x010E0}, -{0x01CA1, 0x010E1}, {0x01CA2, 0x010E2}, {0x01CA3, 0x010E3}, {0x01CA4, 0x010E4}, {0x01CA5, 0x010E5}, {0x01CA6, 0x010E6}, -{0x01CA7, 0x010E7}, {0x01CA8, 0x010E8}, {0x01CA9, 0x010E9}, {0x01CAA, 0x010EA}, {0x01CAB, 0x010EB}, {0x01CAC, 0x010EC}, -{0x01CAD, 0x010ED}, {0x01CAE, 0x010EE}, {0x01CAF, 0x010EF}, {0x01CB0, 0x010F0}, {0x01CB1, 0x010F1}, {0x01CB2, 0x010F2}, -{0x01CB3, 0x010F3}, {0x01CB4, 0x010F4}, {0x01CB5, 0x010F5}, {0x01CB6, 0x010F6}, {0x01CB7, 0x010F7}, {0x01CB8, 0x010F8}, -{0x01CB9, 0x010F9}, {0x01CBA, 0x010FA}, {0x01CBD, 0x010FD}, {0x01CBE, 0x010FE}, {0x01CBF, 0x010FF}, {0x01E00, 0x01E01}, -{0x01E02, 0x01E03}, {0x01E04, 0x01E05}, {0x01E06, 0x01E07}, {0x01E08, 0x01E09}, {0x01E0A, 0x01E0B}, {0x01E0C, 0x01E0D}, -{0x01E0E, 0x01E0F}, {0x01E10, 0x01E11}, {0x01E12, 0x01E13}, {0x01E14, 0x01E15}, {0x01E16, 0x01E17}, {0x01E18, 0x01E19}, -{0x01E1A, 0x01E1B}, {0x01E1C, 0x01E1D}, {0x01E1E, 0x01E1F}, {0x01E20, 0x01E21}, {0x01E22, 0x01E23}, {0x01E24, 0x01E25}, -{0x01E26, 0x01E27}, {0x01E28, 0x01E29}, {0x01E2A, 0x01E2B}, {0x01E2C, 0x01E2D}, {0x01E2E, 
0x01E2F}, {0x01E30, 0x01E31}, -{0x01E32, 0x01E33}, {0x01E34, 0x01E35}, {0x01E36, 0x01E37}, {0x01E38, 0x01E39}, {0x01E3A, 0x01E3B}, {0x01E3C, 0x01E3D}, -{0x01E3E, 0x01E3F}, {0x01E40, 0x01E41}, {0x01E42, 0x01E43}, {0x01E44, 0x01E45}, {0x01E46, 0x01E47}, {0x01E48, 0x01E49}, -{0x01E4A, 0x01E4B}, {0x01E4C, 0x01E4D}, {0x01E4E, 0x01E4F}, {0x01E50, 0x01E51}, {0x01E52, 0x01E53}, {0x01E54, 0x01E55}, -{0x01E56, 0x01E57}, {0x01E58, 0x01E59}, {0x01E5A, 0x01E5B}, {0x01E5C, 0x01E5D}, {0x01E5E, 0x01E5F}, {0x01E60, 0x01E61}, -{0x01E62, 0x01E63}, {0x01E64, 0x01E65}, {0x01E66, 0x01E67}, {0x01E68, 0x01E69}, {0x01E6A, 0x01E6B}, {0x01E6C, 0x01E6D}, -{0x01E6E, 0x01E6F}, {0x01E70, 0x01E71}, {0x01E72, 0x01E73}, {0x01E74, 0x01E75}, {0x01E76, 0x01E77}, {0x01E78, 0x01E79}, -{0x01E7A, 0x01E7B}, {0x01E7C, 0x01E7D}, {0x01E7E, 0x01E7F}, {0x01E80, 0x01E81}, {0x01E82, 0x01E83}, {0x01E84, 0x01E85}, -{0x01E86, 0x01E87}, {0x01E88, 0x01E89}, {0x01E8A, 0x01E8B}, {0x01E8C, 0x01E8D}, {0x01E8E, 0x01E8F}, {0x01E90, 0x01E91}, -{0x01E92, 0x01E93}, {0x01E94, 0x01E95}, {0x01E9E, 0x000DF}, {0x01EA0, 0x01EA1}, {0x01EA2, 0x01EA3}, {0x01EA4, 0x01EA5}, -{0x01EA6, 0x01EA7}, {0x01EA8, 0x01EA9}, {0x01EAA, 0x01EAB}, {0x01EAC, 0x01EAD}, {0x01EAE, 0x01EAF}, {0x01EB0, 0x01EB1}, -{0x01EB2, 0x01EB3}, {0x01EB4, 0x01EB5}, {0x01EB6, 0x01EB7}, {0x01EB8, 0x01EB9}, {0x01EBA, 0x01EBB}, {0x01EBC, 0x01EBD}, -{0x01EBE, 0x01EBF}, {0x01EC0, 0x01EC1}, {0x01EC2, 0x01EC3}, {0x01EC4, 0x01EC5}, {0x01EC6, 0x01EC7}, {0x01EC8, 0x01EC9}, -{0x01ECA, 0x01ECB}, {0x01ECC, 0x01ECD}, {0x01ECE, 0x01ECF}, {0x01ED0, 0x01ED1}, {0x01ED2, 0x01ED3}, {0x01ED4, 0x01ED5}, -{0x01ED6, 0x01ED7}, {0x01ED8, 0x01ED9}, {0x01EDA, 0x01EDB}, {0x01EDC, 0x01EDD}, {0x01EDE, 0x01EDF}, {0x01EE0, 0x01EE1}, -{0x01EE2, 0x01EE3}, {0x01EE4, 0x01EE5}, {0x01EE6, 0x01EE7}, {0x01EE8, 0x01EE9}, {0x01EEA, 0x01EEB}, {0x01EEC, 0x01EED}, -{0x01EEE, 0x01EEF}, {0x01EF0, 0x01EF1}, {0x01EF2, 0x01EF3}, {0x01EF4, 0x01EF5}, {0x01EF6, 0x01EF7}, {0x01EF8, 0x01EF9}, -{0x01EFA, 0x01EFB}, {0x01EFC, 
0x01EFD}, {0x01EFE, 0x01EFF}, {0x01F08, 0x01F00}, {0x01F09, 0x01F01}, {0x01F0A, 0x01F02}, -{0x01F0B, 0x01F03}, {0x01F0C, 0x01F04}, {0x01F0D, 0x01F05}, {0x01F0E, 0x01F06}, {0x01F0F, 0x01F07}, {0x01F18, 0x01F10}, -{0x01F19, 0x01F11}, {0x01F1A, 0x01F12}, {0x01F1B, 0x01F13}, {0x01F1C, 0x01F14}, {0x01F1D, 0x01F15}, {0x01F28, 0x01F20}, -{0x01F29, 0x01F21}, {0x01F2A, 0x01F22}, {0x01F2B, 0x01F23}, {0x01F2C, 0x01F24}, {0x01F2D, 0x01F25}, {0x01F2E, 0x01F26}, -{0x01F2F, 0x01F27}, {0x01F38, 0x01F30}, {0x01F39, 0x01F31}, {0x01F3A, 0x01F32}, {0x01F3B, 0x01F33}, {0x01F3C, 0x01F34}, -{0x01F3D, 0x01F35}, {0x01F3E, 0x01F36}, {0x01F3F, 0x01F37}, {0x01F48, 0x01F40}, {0x01F49, 0x01F41}, {0x01F4A, 0x01F42}, -{0x01F4B, 0x01F43}, {0x01F4C, 0x01F44}, {0x01F4D, 0x01F45}, {0x01F59, 0x01F51}, {0x01F5B, 0x01F53}, {0x01F5D, 0x01F55}, -{0x01F5F, 0x01F57}, {0x01F68, 0x01F60}, {0x01F69, 0x01F61}, {0x01F6A, 0x01F62}, {0x01F6B, 0x01F63}, {0x01F6C, 0x01F64}, -{0x01F6D, 0x01F65}, {0x01F6E, 0x01F66}, {0x01F6F, 0x01F67}, {0x01F88, 0x01F80}, {0x01F89, 0x01F81}, {0x01F8A, 0x01F82}, -{0x01F8B, 0x01F83}, {0x01F8C, 0x01F84}, {0x01F8D, 0x01F85}, {0x01F8E, 0x01F86}, {0x01F8F, 0x01F87}, {0x01F98, 0x01F90}, -{0x01F99, 0x01F91}, {0x01F9A, 0x01F92}, {0x01F9B, 0x01F93}, {0x01F9C, 0x01F94}, {0x01F9D, 0x01F95}, {0x01F9E, 0x01F96}, -{0x01F9F, 0x01F97}, {0x01FA8, 0x01FA0}, {0x01FA9, 0x01FA1}, {0x01FAA, 0x01FA2}, {0x01FAB, 0x01FA3}, {0x01FAC, 0x01FA4}, -{0x01FAD, 0x01FA5}, {0x01FAE, 0x01FA6}, {0x01FAF, 0x01FA7}, {0x01FB8, 0x01FB0}, {0x01FB9, 0x01FB1}, {0x01FBA, 0x01F70}, -{0x01FBB, 0x01F71}, {0x01FBC, 0x01FB3}, {0x01FC8, 0x01F72}, {0x01FC9, 0x01F73}, {0x01FCA, 0x01F74}, {0x01FCB, 0x01F75}, -{0x01FCC, 0x01FC3}, {0x01FD8, 0x01FD0}, {0x01FD9, 0x01FD1}, {0x01FDA, 0x01F76}, {0x01FDB, 0x01F77}, {0x01FE8, 0x01FE0}, -{0x01FE9, 0x01FE1}, {0x01FEA, 0x01F7A}, {0x01FEB, 0x01F7B}, {0x01FEC, 0x01FE5}, {0x01FF8, 0x01F78}, {0x01FF9, 0x01F79}, -{0x01FFA, 0x01F7C}, {0x01FFB, 0x01F7D}, {0x01FFC, 0x01FF3}, {0x02126, 0x003C9}, {0x0212A, 
0x0006B}, {0x0212B, 0x000E5}, -{0x02132, 0x0214E}, {0x02160, 0x02170}, {0x02161, 0x02171}, {0x02162, 0x02172}, {0x02163, 0x02173}, {0x02164, 0x02174}, -{0x02165, 0x02175}, {0x02166, 0x02176}, {0x02167, 0x02177}, {0x02168, 0x02178}, {0x02169, 0x02179}, {0x0216A, 0x0217A}, -{0x0216B, 0x0217B}, {0x0216C, 0x0217C}, {0x0216D, 0x0217D}, {0x0216E, 0x0217E}, {0x0216F, 0x0217F}, {0x02183, 0x02184}, -{0x024B6, 0x024D0}, {0x024B7, 0x024D1}, {0x024B8, 0x024D2}, {0x024B9, 0x024D3}, {0x024BA, 0x024D4}, {0x024BB, 0x024D5}, -{0x024BC, 0x024D6}, {0x024BD, 0x024D7}, {0x024BE, 0x024D8}, {0x024BF, 0x024D9}, {0x024C0, 0x024DA}, {0x024C1, 0x024DB}, -{0x024C2, 0x024DC}, {0x024C3, 0x024DD}, {0x024C4, 0x024DE}, {0x024C5, 0x024DF}, {0x024C6, 0x024E0}, {0x024C7, 0x024E1}, -{0x024C8, 0x024E2}, {0x024C9, 0x024E3}, {0x024CA, 0x024E4}, {0x024CB, 0x024E5}, {0x024CC, 0x024E6}, {0x024CD, 0x024E7}, -{0x024CE, 0x024E8}, {0x024CF, 0x024E9}, {0x02C00, 0x02C30}, {0x02C01, 0x02C31}, {0x02C02, 0x02C32}, {0x02C03, 0x02C33}, -{0x02C04, 0x02C34}, {0x02C05, 0x02C35}, {0x02C06, 0x02C36}, {0x02C07, 0x02C37}, {0x02C08, 0x02C38}, {0x02C09, 0x02C39}, -{0x02C0A, 0x02C3A}, {0x02C0B, 0x02C3B}, {0x02C0C, 0x02C3C}, {0x02C0D, 0x02C3D}, {0x02C0E, 0x02C3E}, {0x02C0F, 0x02C3F}, -{0x02C10, 0x02C40}, {0x02C11, 0x02C41}, {0x02C12, 0x02C42}, {0x02C13, 0x02C43}, {0x02C14, 0x02C44}, {0x02C15, 0x02C45}, -{0x02C16, 0x02C46}, {0x02C17, 0x02C47}, {0x02C18, 0x02C48}, {0x02C19, 0x02C49}, {0x02C1A, 0x02C4A}, {0x02C1B, 0x02C4B}, -{0x02C1C, 0x02C4C}, {0x02C1D, 0x02C4D}, {0x02C1E, 0x02C4E}, {0x02C1F, 0x02C4F}, {0x02C20, 0x02C50}, {0x02C21, 0x02C51}, -{0x02C22, 0x02C52}, {0x02C23, 0x02C53}, {0x02C24, 0x02C54}, {0x02C25, 0x02C55}, {0x02C26, 0x02C56}, {0x02C27, 0x02C57}, -{0x02C28, 0x02C58}, {0x02C29, 0x02C59}, {0x02C2A, 0x02C5A}, {0x02C2B, 0x02C5B}, {0x02C2C, 0x02C5C}, {0x02C2D, 0x02C5D}, -{0x02C2E, 0x02C5E}, {0x02C2F, 0x02C5F}, {0x02C60, 0x02C61}, {0x02C62, 0x0026B}, {0x02C63, 0x01D7D}, {0x02C64, 0x0027D}, -{0x02C67, 0x02C68}, {0x02C69, 
0x02C6A}, {0x02C6B, 0x02C6C}, {0x02C6D, 0x00251}, {0x02C6E, 0x00271}, {0x02C6F, 0x00250}, -{0x02C70, 0x00252}, {0x02C72, 0x02C73}, {0x02C75, 0x02C76}, {0x02C7E, 0x0023F}, {0x02C7F, 0x00240}, {0x02C80, 0x02C81}, -{0x02C82, 0x02C83}, {0x02C84, 0x02C85}, {0x02C86, 0x02C87}, {0x02C88, 0x02C89}, {0x02C8A, 0x02C8B}, {0x02C8C, 0x02C8D}, -{0x02C8E, 0x02C8F}, {0x02C90, 0x02C91}, {0x02C92, 0x02C93}, {0x02C94, 0x02C95}, {0x02C96, 0x02C97}, {0x02C98, 0x02C99}, -{0x02C9A, 0x02C9B}, {0x02C9C, 0x02C9D}, {0x02C9E, 0x02C9F}, {0x02CA0, 0x02CA1}, {0x02CA2, 0x02CA3}, {0x02CA4, 0x02CA5}, -{0x02CA6, 0x02CA7}, {0x02CA8, 0x02CA9}, {0x02CAA, 0x02CAB}, {0x02CAC, 0x02CAD}, {0x02CAE, 0x02CAF}, {0x02CB0, 0x02CB1}, -{0x02CB2, 0x02CB3}, {0x02CB4, 0x02CB5}, {0x02CB6, 0x02CB7}, {0x02CB8, 0x02CB9}, {0x02CBA, 0x02CBB}, {0x02CBC, 0x02CBD}, -{0x02CBE, 0x02CBF}, {0x02CC0, 0x02CC1}, {0x02CC2, 0x02CC3}, {0x02CC4, 0x02CC5}, {0x02CC6, 0x02CC7}, {0x02CC8, 0x02CC9}, -{0x02CCA, 0x02CCB}, {0x02CCC, 0x02CCD}, {0x02CCE, 0x02CCF}, {0x02CD0, 0x02CD1}, {0x02CD2, 0x02CD3}, {0x02CD4, 0x02CD5}, -{0x02CD6, 0x02CD7}, {0x02CD8, 0x02CD9}, {0x02CDA, 0x02CDB}, {0x02CDC, 0x02CDD}, {0x02CDE, 0x02CDF}, {0x02CE0, 0x02CE1}, -{0x02CE2, 0x02CE3}, {0x02CEB, 0x02CEC}, {0x02CED, 0x02CEE}, {0x02CF2, 0x02CF3}, {0x0A640, 0x0A641}, {0x0A642, 0x0A643}, -{0x0A644, 0x0A645}, {0x0A646, 0x0A647}, {0x0A648, 0x0A649}, {0x0A64A, 0x0A64B}, {0x0A64C, 0x0A64D}, {0x0A64E, 0x0A64F}, -{0x0A650, 0x0A651}, {0x0A652, 0x0A653}, {0x0A654, 0x0A655}, {0x0A656, 0x0A657}, {0x0A658, 0x0A659}, {0x0A65A, 0x0A65B}, -{0x0A65C, 0x0A65D}, {0x0A65E, 0x0A65F}, {0x0A660, 0x0A661}, {0x0A662, 0x0A663}, {0x0A664, 0x0A665}, {0x0A666, 0x0A667}, -{0x0A668, 0x0A669}, {0x0A66A, 0x0A66B}, {0x0A66C, 0x0A66D}, {0x0A680, 0x0A681}, {0x0A682, 0x0A683}, {0x0A684, 0x0A685}, -{0x0A686, 0x0A687}, {0x0A688, 0x0A689}, {0x0A68A, 0x0A68B}, {0x0A68C, 0x0A68D}, {0x0A68E, 0x0A68F}, {0x0A690, 0x0A691}, -{0x0A692, 0x0A693}, {0x0A694, 0x0A695}, {0x0A696, 0x0A697}, {0x0A698, 0x0A699}, {0x0A69A, 
0x0A69B}, {0x0A722, 0x0A723}, -{0x0A724, 0x0A725}, {0x0A726, 0x0A727}, {0x0A728, 0x0A729}, {0x0A72A, 0x0A72B}, {0x0A72C, 0x0A72D}, {0x0A72E, 0x0A72F}, -{0x0A732, 0x0A733}, {0x0A734, 0x0A735}, {0x0A736, 0x0A737}, {0x0A738, 0x0A739}, {0x0A73A, 0x0A73B}, {0x0A73C, 0x0A73D}, -{0x0A73E, 0x0A73F}, {0x0A740, 0x0A741}, {0x0A742, 0x0A743}, {0x0A744, 0x0A745}, {0x0A746, 0x0A747}, {0x0A748, 0x0A749}, -{0x0A74A, 0x0A74B}, {0x0A74C, 0x0A74D}, {0x0A74E, 0x0A74F}, {0x0A750, 0x0A751}, {0x0A752, 0x0A753}, {0x0A754, 0x0A755}, -{0x0A756, 0x0A757}, {0x0A758, 0x0A759}, {0x0A75A, 0x0A75B}, {0x0A75C, 0x0A75D}, {0x0A75E, 0x0A75F}, {0x0A760, 0x0A761}, -{0x0A762, 0x0A763}, {0x0A764, 0x0A765}, {0x0A766, 0x0A767}, {0x0A768, 0x0A769}, {0x0A76A, 0x0A76B}, {0x0A76C, 0x0A76D}, -{0x0A76E, 0x0A76F}, {0x0A779, 0x0A77A}, {0x0A77B, 0x0A77C}, {0x0A77D, 0x01D79}, {0x0A77E, 0x0A77F}, {0x0A780, 0x0A781}, -{0x0A782, 0x0A783}, {0x0A784, 0x0A785}, {0x0A786, 0x0A787}, {0x0A78B, 0x0A78C}, {0x0A78D, 0x00265}, {0x0A790, 0x0A791}, -{0x0A792, 0x0A793}, {0x0A796, 0x0A797}, {0x0A798, 0x0A799}, {0x0A79A, 0x0A79B}, {0x0A79C, 0x0A79D}, {0x0A79E, 0x0A79F}, -{0x0A7A0, 0x0A7A1}, {0x0A7A2, 0x0A7A3}, {0x0A7A4, 0x0A7A5}, {0x0A7A6, 0x0A7A7}, {0x0A7A8, 0x0A7A9}, {0x0A7AA, 0x00266}, -{0x0A7AB, 0x0025C}, {0x0A7AC, 0x00261}, {0x0A7AD, 0x0026C}, {0x0A7AE, 0x0026A}, {0x0A7B0, 0x0029E}, {0x0A7B1, 0x00287}, -{0x0A7B2, 0x0029D}, {0x0A7B3, 0x0AB53}, {0x0A7B4, 0x0A7B5}, {0x0A7B6, 0x0A7B7}, {0x0A7B8, 0x0A7B9}, {0x0A7BA, 0x0A7BB}, -{0x0A7BC, 0x0A7BD}, {0x0A7BE, 0x0A7BF}, {0x0A7C0, 0x0A7C1}, {0x0A7C2, 0x0A7C3}, {0x0A7C4, 0x0A794}, {0x0A7C5, 0x00282}, -{0x0A7C6, 0x01D8E}, {0x0A7C7, 0x0A7C8}, {0x0A7C9, 0x0A7CA}, {0x0A7D0, 0x0A7D1}, {0x0A7D6, 0x0A7D7}, {0x0A7D8, 0x0A7D9}, -{0x0A7F5, 0x0A7F6}, {0x0FF21, 0x0FF41}, {0x0FF22, 0x0FF42}, {0x0FF23, 0x0FF43}, {0x0FF24, 0x0FF44}, {0x0FF25, 0x0FF45}, -{0x0FF26, 0x0FF46}, {0x0FF27, 0x0FF47}, {0x0FF28, 0x0FF48}, {0x0FF29, 0x0FF49}, {0x0FF2A, 0x0FF4A}, {0x0FF2B, 0x0FF4B}, -{0x0FF2C, 0x0FF4C}, {0x0FF2D, 
0x0FF4D}, {0x0FF2E, 0x0FF4E}, {0x0FF2F, 0x0FF4F}, {0x0FF30, 0x0FF50}, {0x0FF31, 0x0FF51}, -{0x0FF32, 0x0FF52}, {0x0FF33, 0x0FF53}, {0x0FF34, 0x0FF54}, {0x0FF35, 0x0FF55}, {0x0FF36, 0x0FF56}, {0x0FF37, 0x0FF57}, -{0x0FF38, 0x0FF58}, {0x0FF39, 0x0FF59}, {0x0FF3A, 0x0FF5A}, {0x10400, 0x10428}, {0x10401, 0x10429}, {0x10402, 0x1042A}, -{0x10403, 0x1042B}, {0x10404, 0x1042C}, {0x10405, 0x1042D}, {0x10406, 0x1042E}, {0x10407, 0x1042F}, {0x10408, 0x10430}, -{0x10409, 0x10431}, {0x1040A, 0x10432}, {0x1040B, 0x10433}, {0x1040C, 0x10434}, {0x1040D, 0x10435}, {0x1040E, 0x10436}, -{0x1040F, 0x10437}, {0x10410, 0x10438}, {0x10411, 0x10439}, {0x10412, 0x1043A}, {0x10413, 0x1043B}, {0x10414, 0x1043C}, -{0x10415, 0x1043D}, {0x10416, 0x1043E}, {0x10417, 0x1043F}, {0x10418, 0x10440}, {0x10419, 0x10441}, {0x1041A, 0x10442}, -{0x1041B, 0x10443}, {0x1041C, 0x10444}, {0x1041D, 0x10445}, {0x1041E, 0x10446}, {0x1041F, 0x10447}, {0x10420, 0x10448}, -{0x10421, 0x10449}, {0x10422, 0x1044A}, {0x10423, 0x1044B}, {0x10424, 0x1044C}, {0x10425, 0x1044D}, {0x10426, 0x1044E}, -{0x10427, 0x1044F}, {0x104B0, 0x104D8}, {0x104B1, 0x104D9}, {0x104B2, 0x104DA}, {0x104B3, 0x104DB}, {0x104B4, 0x104DC}, -{0x104B5, 0x104DD}, {0x104B6, 0x104DE}, {0x104B7, 0x104DF}, {0x104B8, 0x104E0}, {0x104B9, 0x104E1}, {0x104BA, 0x104E2}, -{0x104BB, 0x104E3}, {0x104BC, 0x104E4}, {0x104BD, 0x104E5}, {0x104BE, 0x104E6}, {0x104BF, 0x104E7}, {0x104C0, 0x104E8}, -{0x104C1, 0x104E9}, {0x104C2, 0x104EA}, {0x104C3, 0x104EB}, {0x104C4, 0x104EC}, {0x104C5, 0x104ED}, {0x104C6, 0x104EE}, -{0x104C7, 0x104EF}, {0x104C8, 0x104F0}, {0x104C9, 0x104F1}, {0x104CA, 0x104F2}, {0x104CB, 0x104F3}, {0x104CC, 0x104F4}, -{0x104CD, 0x104F5}, {0x104CE, 0x104F6}, {0x104CF, 0x104F7}, {0x104D0, 0x104F8}, {0x104D1, 0x104F9}, {0x104D2, 0x104FA}, -{0x104D3, 0x104FB}, {0x10570, 0x10597}, {0x10571, 0x10598}, {0x10572, 0x10599}, {0x10573, 0x1059A}, {0x10574, 0x1059B}, -{0x10575, 0x1059C}, {0x10576, 0x1059D}, {0x10577, 0x1059E}, {0x10578, 0x1059F}, {0x10579, 
0x105A0}, {0x1057A, 0x105A1}, -{0x1057C, 0x105A3}, {0x1057D, 0x105A4}, {0x1057E, 0x105A5}, {0x1057F, 0x105A6}, {0x10580, 0x105A7}, {0x10581, 0x105A8}, -{0x10582, 0x105A9}, {0x10583, 0x105AA}, {0x10584, 0x105AB}, {0x10585, 0x105AC}, {0x10586, 0x105AD}, {0x10587, 0x105AE}, -{0x10588, 0x105AF}, {0x10589, 0x105B0}, {0x1058A, 0x105B1}, {0x1058C, 0x105B3}, {0x1058D, 0x105B4}, {0x1058E, 0x105B5}, -{0x1058F, 0x105B6}, {0x10590, 0x105B7}, {0x10591, 0x105B8}, {0x10592, 0x105B9}, {0x10594, 0x105BB}, {0x10595, 0x105BC}, -{0x10C80, 0x10CC0}, {0x10C81, 0x10CC1}, {0x10C82, 0x10CC2}, {0x10C83, 0x10CC3}, {0x10C84, 0x10CC4}, {0x10C85, 0x10CC5}, -{0x10C86, 0x10CC6}, {0x10C87, 0x10CC7}, {0x10C88, 0x10CC8}, {0x10C89, 0x10CC9}, {0x10C8A, 0x10CCA}, {0x10C8B, 0x10CCB}, -{0x10C8C, 0x10CCC}, {0x10C8D, 0x10CCD}, {0x10C8E, 0x10CCE}, {0x10C8F, 0x10CCF}, {0x10C90, 0x10CD0}, {0x10C91, 0x10CD1}, -{0x10C92, 0x10CD2}, {0x10C93, 0x10CD3}, {0x10C94, 0x10CD4}, {0x10C95, 0x10CD5}, {0x10C96, 0x10CD6}, {0x10C97, 0x10CD7}, -{0x10C98, 0x10CD8}, {0x10C99, 0x10CD9}, {0x10C9A, 0x10CDA}, {0x10C9B, 0x10CDB}, {0x10C9C, 0x10CDC}, {0x10C9D, 0x10CDD}, -{0x10C9E, 0x10CDE}, {0x10C9F, 0x10CDF}, {0x10CA0, 0x10CE0}, {0x10CA1, 0x10CE1}, {0x10CA2, 0x10CE2}, {0x10CA3, 0x10CE3}, -{0x10CA4, 0x10CE4}, {0x10CA5, 0x10CE5}, {0x10CA6, 0x10CE6}, {0x10CA7, 0x10CE7}, {0x10CA8, 0x10CE8}, {0x10CA9, 0x10CE9}, -{0x10CAA, 0x10CEA}, {0x10CAB, 0x10CEB}, {0x10CAC, 0x10CEC}, {0x10CAD, 0x10CED}, {0x10CAE, 0x10CEE}, {0x10CAF, 0x10CEF}, -{0x10CB0, 0x10CF0}, {0x10CB1, 0x10CF1}, {0x10CB2, 0x10CF2}, {0x118A0, 0x118C0}, {0x118A1, 0x118C1}, {0x118A2, 0x118C2}, -{0x118A3, 0x118C3}, {0x118A4, 0x118C4}, {0x118A5, 0x118C5}, {0x118A6, 0x118C6}, {0x118A7, 0x118C7}, {0x118A8, 0x118C8}, -{0x118A9, 0x118C9}, {0x118AA, 0x118CA}, {0x118AB, 0x118CB}, {0x118AC, 0x118CC}, {0x118AD, 0x118CD}, {0x118AE, 0x118CE}, -{0x118AF, 0x118CF}, {0x118B0, 0x118D0}, {0x118B1, 0x118D1}, {0x118B2, 0x118D2}, {0x118B3, 0x118D3}, {0x118B4, 0x118D4}, -{0x118B5, 0x118D5}, {0x118B6, 
0x118D6}, {0x118B7, 0x118D7}, {0x118B8, 0x118D8}, {0x118B9, 0x118D9}, {0x118BA, 0x118DA}, -{0x118BB, 0x118DB}, {0x118BC, 0x118DC}, {0x118BD, 0x118DD}, {0x118BE, 0x118DE}, {0x118BF, 0x118DF}, {0x16E40, 0x16E60}, -{0x16E41, 0x16E61}, {0x16E42, 0x16E62}, {0x16E43, 0x16E63}, {0x16E44, 0x16E64}, {0x16E45, 0x16E65}, {0x16E46, 0x16E66}, -{0x16E47, 0x16E67}, {0x16E48, 0x16E68}, {0x16E49, 0x16E69}, {0x16E4A, 0x16E6A}, {0x16E4B, 0x16E6B}, {0x16E4C, 0x16E6C}, -{0x16E4D, 0x16E6D}, {0x16E4E, 0x16E6E}, {0x16E4F, 0x16E6F}, {0x16E50, 0x16E70}, {0x16E51, 0x16E71}, {0x16E52, 0x16E72}, -{0x16E53, 0x16E73}, {0x16E54, 0x16E74}, {0x16E55, 0x16E75}, {0x16E56, 0x16E76}, {0x16E57, 0x16E77}, {0x16E58, 0x16E78}, -{0x16E59, 0x16E79}, {0x16E5A, 0x16E7A}, {0x16E5B, 0x16E7B}, {0x16E5C, 0x16E7C}, {0x16E5D, 0x16E7D}, {0x16E5E, 0x16E7E}, -{0x16E5F, 0x16E7F}, {0x1E900, 0x1E922}, {0x1E901, 0x1E923}, {0x1E902, 0x1E924}, {0x1E903, 0x1E925}, {0x1E904, 0x1E926}, -{0x1E905, 0x1E927}, {0x1E906, 0x1E928}, {0x1E907, 0x1E929}, {0x1E908, 0x1E92A}, {0x1E909, 0x1E92B}, {0x1E90A, 0x1E92C}, -{0x1E90B, 0x1E92D}, {0x1E90C, 0x1E92E}, {0x1E90D, 0x1E92F}, {0x1E90E, 0x1E930}, {0x1E90F, 0x1E931}, {0x1E910, 0x1E932}, -{0x1E911, 0x1E933}, {0x1E912, 0x1E934}, {0x1E913, 0x1E935}, {0x1E914, 0x1E936}, {0x1E915, 0x1E937}, {0x1E916, 0x1E938}, -{0x1E917, 0x1E939}, {0x1E918, 0x1E93A}, {0x1E919, 0x1E93B}, {0x1E91A, 0x1E93C}, {0x1E91B, 0x1E93D}, {0x1E91C, 0x1E93E}, -{0x1E91D, 0x1E93F}, {0x1E91E, 0x1E940}, {0x1E91F, 0x1E941}, {0x1E920, 0x1E942}, {0x1E921, 0x1E943}, -}; diff --git a/cpp/unicode-data.h b/cpp/unicode-data.h index cb9dd8aa..e27fe177 100644 --- a/cpp/unicode-data.h +++ b/cpp/unicode-data.h @@ -1,16 +1,20 @@ #pragma once #include -#include -#include #include +#include +#include -extern const std::vector> unicode_ranges_digit; -extern const std::vector> unicode_ranges_letter; -extern const std::vector> unicode_ranges_whitespace; -extern const std::vector> unicode_ranges_accent_mark; -extern const std::vector> 
unicode_ranges_punctuation; -extern const std::vector> unicode_ranges_symbol; -extern const std::vector> unicode_ranges_control; -extern const std::multimap unicode_map_nfd; -extern const std::map unicode_map_lowercase; +struct range_nfd { + uint32_t first; + uint32_t last; + uint32_t nfd; +}; + +static const uint32_t MAX_CODEPOINTS = 0x110000; + +extern const std::vector> unicode_ranges_flags; +extern const std::unordered_set unicode_set_whitespace; +extern const std::unordered_map unicode_map_lowercase; +extern const std::unordered_map unicode_map_uppercase; +extern const std::vector unicode_ranges_nfd; diff --git a/cpp/unicode.cpp b/cpp/unicode.cpp index f2ccda05..056a4c74 100644 --- a/cpp/unicode.cpp +++ b/cpp/unicode.cpp @@ -1,4 +1,4 @@ -#include "unicode.h" +#include "unicode.h" #include "unicode-data.h" #include @@ -9,6 +9,7 @@ #include #include #include +#include #include #include #include @@ -108,57 +109,49 @@ static uint32_t unicode_cpt_from_utf8(const std::string & utf8, size_t & offset) // return result; //} -static std::unordered_map unicode_cpt_type_map() { - std::unordered_map cpt_types; - for (auto p : unicode_ranges_digit) { - for (auto i = p.first; i <= p.second; ++ i) { - cpt_types[i] = CODEPOINT_TYPE_DIGIT; - } - } - for (auto p : unicode_ranges_letter) { - for (auto i = p.first; i <= p.second; ++ i) { - cpt_types[i] = CODEPOINT_TYPE_LETTER; - } - } - for (auto p : unicode_ranges_whitespace) { - for (auto i = p.first; i <= p.second; ++ i) { - cpt_types[i] = CODEPOINT_TYPE_WHITESPACE; +static std::vector unicode_cpt_flags_array() { + std::vector cpt_flags(MAX_CODEPOINTS, codepoint_flags::UNDEFINED); + + assert (unicode_ranges_flags.front().first == 0); + assert (unicode_ranges_flags.back().first == MAX_CODEPOINTS); + for (size_t i = 1; i < unicode_ranges_flags.size(); ++i) { + const auto range_ini = unicode_ranges_flags[i-1]; // codepoint_ini, flags + const auto range_end = unicode_ranges_flags[i]; // codepoint_end, flags + for (uint32_t cpt = 
range_ini.first; cpt < range_end.first; ++cpt) { + cpt_flags[cpt] = range_ini.second; } } - for (auto p : unicode_ranges_accent_mark) { - for (auto i = p.first; i <= p.second; ++ i) { - cpt_types[i] = CODEPOINT_TYPE_ACCENT_MARK; - } + + for (auto cpt : unicode_set_whitespace) { + cpt_flags[cpt].is_whitespace = true; } - for (auto p : unicode_ranges_punctuation) { - for (auto i = p.first; i <= p.second; ++ i) { - cpt_types[i] = CODEPOINT_TYPE_PUNCTUATION; - } + + for (auto p : unicode_map_lowercase) { + cpt_flags[p.second].is_lowercase = true; } - for (auto p : unicode_ranges_symbol) { - for (auto i = p.first; i <= p.second; ++i) { - cpt_types[i] = CODEPOINT_TYPE_SYMBOL; - } + + for (auto p : unicode_map_uppercase) { + cpt_flags[p.second].is_uppercase = true; } - for (auto p : unicode_ranges_control) { - for (auto i = p.first; i <= p.second; ++ i) { - cpt_types[i] = CODEPOINT_TYPE_CONTROL; - } + + for (auto &range : unicode_ranges_nfd) { // start, last, nfd + cpt_flags[range.nfd].is_nfd = true; } - return cpt_types; + + return cpt_flags; } static std::unordered_map unicode_byte_to_utf8_map() { std::unordered_map map; - for (int ch = u'!'; ch <= u'~'; ++ch) { + for (int ch = 0x21; ch <= 0x7E; ++ch) { // u'!' to u'~' assert(0 <= ch && ch < 256); map[ch] = unicode_cpt_to_utf8(ch); } - for (int ch = u'¡'; ch <= u'¬'; ++ch) { + for (int ch = 0xA1; ch <= 0xAC; ++ch) { // u'¡' to u'¬' assert(0 <= ch && ch < 256); map[ch] = unicode_cpt_to_utf8(ch); } - for (int ch = u'®'; ch <= u'ÿ'; ++ch) { + for (int ch = 0xAE; ch <= 0xFF; ++ch) { // u'®' to u'ÿ' assert(0 <= ch && ch < 256); map[ch] = unicode_cpt_to_utf8(ch); } @@ -174,15 +167,15 @@ static std::unordered_map unicode_byte_to_utf8_map() { static std::unordered_map unicode_utf8_to_byte_map() { std::unordered_map map; - for (int ch = u'!'; ch <= u'~'; ++ch) { + for (int ch = 0x21; ch <= 0x7E; ++ch) { // u'!' 
to u'~' assert(0 <= ch && ch < 256); map[unicode_cpt_to_utf8(ch)] = ch; } - for (int ch = u'¡'; ch <= u'¬'; ++ch) { + for (int ch = 0xA1; ch <= 0xAC; ++ch) { // u'¡' to u'¬' assert(0 <= ch && ch < 256); map[unicode_cpt_to_utf8(ch)] = ch; } - for (int ch = u'®'; ch <= u'ÿ'; ++ch) { + for (int ch = 0xAE; ch <= 0xFF; ++ch) { // u'®' to u'ÿ' assert(0 <= ch && ch < 256); map[unicode_cpt_to_utf8(ch)] = ch; } @@ -224,138 +217,255 @@ static std::vector unicode_regex_split_custom_gpt2(const std::string & t std::vector bpe_offsets; // store the offset of each word bpe_offsets.reserve(offsets.size()); // Reserve memory for the approximate size - size_t start = 0; - const auto cpts = unicode_cpts_from_utf8(text); + size_t start = 0; for (auto offset : offsets) { - std::string token; + const size_t offset_ini = start; + const size_t offset_end = start + offset; + assert(offset_end <= cpts.size()); + start = offset_end; + + auto _get_cpt = [&] (const size_t pos) -> char32_t { + return (offset_ini <= pos && pos < offset_end) ? cpts[pos] : 0; + }; + + auto _get_flags = [&] (const size_t pos) -> codepoint_flags { + static const codepoint_flags undef(codepoint_flags::UNDEFINED); + return (offset_ini <= pos && pos < offset_end) ? 
unicode_cpt_flags(cpts[pos]) : undef; + }; + + size_t _prev_end = offset_ini; + auto _add_token = [&] (const size_t end) -> size_t { + assert(_prev_end <= end && end <= offset_end); + size_t len = end - _prev_end; + if (len > 0) { + bpe_offsets.push_back(len); + } + _prev_end = end; + //if (len > 0) { + // std::string s = ""; + // for(size_t p = end-len; p < end; p++) + // s += unicode_cpt_to_utf8(cpts[p]); + // printf(">>> '%s'\n", s.c_str()); + //} + return len; + }; + + for (size_t pos = offset_ini; pos < offset_end; /*pos++*/ ) { + const char32_t cpt = _get_cpt(pos); + const auto flags = _get_flags(pos); + + // regex: 's|'t|'re|'ve|'m|'ll|'d + if (cpt == '\'' && pos+1 < offset_end) { + char32_t cpt_next = _get_cpt(pos+1); + if (cpt_next == 's' || cpt_next == 't' || cpt_next == 'm' || cpt_next == 'd') { + pos += _add_token(pos+2); + continue; + } + if (pos+2 < offset_end) { + char32_t cpt_next_next = _get_cpt(pos+2); + if ((cpt_next == 'r' && cpt_next_next == 'e') || + (cpt_next == 'v' && cpt_next_next == 'e') || + (cpt_next == 'l' && cpt_next_next == 'l')) { + pos += _add_token(pos+3); + continue; + } + } + } + + auto flags2 = (cpt == ' ' ? 
_get_flags(pos+1) : flags); + // regex: ?\p{L}+ + if (flags2.is_letter) { + pos += (cpt == ' '); + while (flags2.is_letter) { + flags2 = _get_flags(++pos); + } + _add_token(pos); + continue; + } + // regex: ?\p{N}+ + if (flags2.is_number) { + pos += (cpt == ' '); + while (flags2.is_number) { + flags2 = _get_flags(++pos); + } + _add_token(pos); + continue; + } + // regex: ?[^\s\p{L}\p{N}]+ + if (!(flags2.is_whitespace || flags2.is_letter || flags2.is_number || flags2.is_undefined)) { + pos += (cpt == ' '); + while (!(flags2.is_whitespace || flags2.is_letter || flags2.is_number || flags2.is_undefined)) { + flags2 = _get_flags(++pos); + } + _add_token(pos); + continue; + } + + size_t num_whitespaces = 0; + while (_get_flags(pos+num_whitespaces).is_whitespace) { + num_whitespaces++; + } - bool collecting_numeric = false; - bool collecting_letter = false; - bool collecting_special = false; - bool collecting_whitespace_lookahead = false; - bool collecting = false; + // regex: \s+(?!\S) + if (num_whitespaces > 1 && _get_cpt(pos+num_whitespaces) != 0) { + pos += num_whitespaces - 1; + _add_token(pos); + continue; + } - std::vector text_utf; - text_utf.reserve(offset); + // regex: \s+ + if (num_whitespaces > 0) { + pos += num_whitespaces; + _add_token(pos); + continue; + } - for (size_t i = start; i < start + offset; ++i) { - text_utf.emplace_back(unicode_cpt_to_utf8(cpts[i])); + // no matches + _add_token(++pos); } + } - for (int i = 0; i < (int)text_utf.size(); i++) { - const std::string & utf_char = text_utf[i]; - bool split_condition = false; - int bytes_remain = text_utf.size() - i; + return bpe_offsets; +} + +// LLAMA3 system regex: "(?i:'s|'t|'re|'ve|'m|'ll|'d)|[^\r\n\p{L}\p{N}]?\p{L}+|\p{N}{1,3}| ?[^\s\p{L}\p{N}]+[\r\n]*|\s*[\r\n]+|\s+(?!\S)|\s+" +static std::vector unicode_regex_split_custom_llama3(const std::string & text, const std::vector & offsets) { + std::vector bpe_offsets; // store the offset of each word + bpe_offsets.reserve(offsets.size()); // Reserve 
memory for the approximate size - // forward backward lookups - const std::string & utf_char_next = (i + 1 < (int)text_utf.size()) ? text_utf[i + 1] : ""; - const std::string & utf_char_next_next = (i + 2 < (int)text_utf.size()) ? text_utf[i + 2] : ""; + const auto cpts = unicode_cpts_from_utf8(text); - // handling contractions - if (!split_condition && bytes_remain >= 2) { - // 's|'t|'m|'d - if (utf_char == "\'" && (utf_char_next == "s" || utf_char_next == "t" || utf_char_next == "m" || utf_char_next == "d")) { - split_condition = true; + size_t start = 0; + for (auto offset : offsets) { + const size_t offset_ini = start; + const size_t offset_end = start + offset; + assert(offset_end <= cpts.size()); + start = offset_end; + + auto _get_cpt = [&] (const size_t pos) -> char32_t { + return (offset_ini <= pos && pos < offset_end) ? cpts[pos] : 0; + }; + + auto _get_flags = [&] (const size_t pos) -> codepoint_flags { + static const codepoint_flags undef(codepoint_flags::UNDEFINED); + return (offset_ini <= pos && pos < offset_end) ? 
unicode_cpt_flags(cpts[pos]) : undef; + }; + + size_t _prev_end = offset_ini; + auto _add_token = [&] (const size_t end) -> size_t { + assert(_prev_end <= end && end <= offset_end); + size_t len = end - _prev_end; + if (len > 0) { + bpe_offsets.push_back(len); + } + _prev_end = end; + //if (len > 0) { + // std::string s = ""; + // for(size_t p = end-len; p < end; p++) + // s += unicode_cpt_to_utf8(cpts[p]); + // printf(">>> '%s'\n", s.c_str()); + //} + return len; + }; + + for (size_t pos = offset_ini; pos < offset_end; /*pos++*/ ) { + const char32_t cpt = _get_cpt(pos); + const auto flags = _get_flags(pos); + + // regex: (?i:'s|'t|'re|'ve|'m|'ll|'d) // case insensitive + if (cpt == '\'' && pos+1 < offset_end) { + char32_t cpt_next = unicode_tolower(_get_cpt(pos+1)); + if (cpt_next == 's' || cpt_next == 't' || cpt_next == 'm' || cpt_next == 'd') { + pos += _add_token(pos+2); + continue; } - if (split_condition) { - if (token.size()) { - bpe_offsets.emplace_back(unicode_cpts_from_utf8(token).size()); + if (pos+2 < offset_end) { + char32_t cpt_next_next = unicode_tolower(_get_cpt(pos+2)); + if ((cpt_next == 'r' && cpt_next_next == 'e') || + (cpt_next == 'v' && cpt_next_next == 'e') || + (cpt_next == 'l' && cpt_next_next == 'l')) { + pos += _add_token(pos+3); + continue; } - token = utf_char + utf_char_next; - bpe_offsets.emplace_back(unicode_cpts_from_utf8(token).size()); - token = ""; - i++; - continue; } } - if (!split_condition && bytes_remain >= 3) { - // 're|'ve|'ll - if (utf_char == "\'" && ( - (utf_char_next == "r" && utf_char_next_next == "e") || - (utf_char_next == "v" && utf_char_next_next == "e") || - (utf_char_next == "l" && utf_char_next_next == "l")) - ) { - split_condition = true; - } - if (split_condition) { - // current token + next token can be defined - if (token.size()) { - bpe_offsets.emplace_back(unicode_cpts_from_utf8(token).size()); - } - token = utf_char; - token += utf_char_next; - token += utf_char_next_next; - 
bpe_offsets.emplace_back(unicode_cpts_from_utf8(token).size()); - token = ""; - i += 2; + // regex: [^\r\n\p{L}\p{N}]?\p{L}+ //####FIXME: the first \p{L} is correct? + if (!(cpt == '\r' || cpt == '\n' || /*flags.is_letter |*/ flags.is_number)) { + if (flags.is_letter || _get_flags(pos+1).is_letter) { // one or more letters + pos++; + while (_get_flags(pos).is_letter) { + pos++; + } + _add_token(pos); continue; } } - if (!split_condition && !collecting) { - if (unicode_cpt_type(utf_char) == CODEPOINT_TYPE_LETTER || (token.empty() && utf_char == " " && unicode_cpt_type(utf_char_next) == CODEPOINT_TYPE_LETTER)) { - collecting_letter = true; - collecting = true; - } - else if (unicode_cpt_type(utf_char) == CODEPOINT_TYPE_DIGIT || (token.empty() && utf_char == " " && unicode_cpt_type(utf_char_next) == CODEPOINT_TYPE_DIGIT)) { - collecting_numeric = true; - collecting = true; - } - else if ( - ((unicode_cpt_type(utf_char) != CODEPOINT_TYPE_LETTER && unicode_cpt_type(utf_char) != CODEPOINT_TYPE_DIGIT) && (unicode_cpt_type(utf_char) != CODEPOINT_TYPE_WHITESPACE)) || - (token.empty() && utf_char == " " && unicode_cpt_type(utf_char_next) != CODEPOINT_TYPE_LETTER && unicode_cpt_type(utf_char_next) != CODEPOINT_TYPE_DIGIT && unicode_cpt_type(utf_char_next) != CODEPOINT_TYPE_WHITESPACE) - ) { - collecting_special = true; - collecting = true; - } - else if (unicode_cpt_type(utf_char) == CODEPOINT_TYPE_WHITESPACE && unicode_cpt_type(utf_char_next) == CODEPOINT_TYPE_WHITESPACE) { - collecting_whitespace_lookahead = true; - collecting = true; - } - else if (unicode_cpt_type(utf_char) == CODEPOINT_TYPE_WHITESPACE) { - split_condition = true; + // regex: \p{N}{1,3} + if (flags.is_number) { + size_t ini = pos; + while (_get_flags(pos).is_number) { + if (++pos - ini >= 3 ) { + _add_token(pos); + ini = pos; + } } + _add_token(pos); + continue; } - else if (!split_condition && collecting) { - if (collecting_letter && unicode_cpt_type(utf_char) != CODEPOINT_TYPE_LETTER) { - 
split_condition = true; - } - else if (collecting_numeric && unicode_cpt_type(utf_char) != CODEPOINT_TYPE_DIGIT) { - split_condition = true; + + // regex: ?[^\s\p{L}\p{N}]+[\r\n]* + auto flags2 = (cpt == ' ' ? _get_flags(pos+1) : flags); + if (!(flags2.is_whitespace || flags2.is_letter || flags2.is_number || flags2.is_undefined)) { + pos += (cpt == ' '); + while (!(flags2.is_whitespace || flags2.is_letter || flags2.is_number || flags2.is_undefined)) { + flags2 = _get_flags(++pos); } - else if (collecting_special && (unicode_cpt_type(utf_char) == CODEPOINT_TYPE_LETTER || unicode_cpt_type(utf_char) == CODEPOINT_TYPE_DIGIT || unicode_cpt_type(utf_char) == CODEPOINT_TYPE_WHITESPACE)) { - split_condition = true; + char32_t cpt2 = _get_cpt(pos); + while (cpt2 == '\r' || cpt2 == '\n') { + cpt2 = _get_cpt(++pos); } - else if (collecting_whitespace_lookahead && (unicode_cpt_type(utf_char_next) == CODEPOINT_TYPE_LETTER || unicode_cpt_type(utf_char_next) == CODEPOINT_TYPE_DIGIT)) { - split_condition = true; + _add_token(pos); + continue; + } + + size_t num_whitespaces = 0; + size_t last_end_r_or_n = 0; + while (_get_flags(pos+num_whitespaces).is_whitespace) { + char32_t cpt2 = _get_cpt(pos+num_whitespaces); + if (cpt2 == '\r' || cpt2 == '\n') { + last_end_r_or_n = pos + num_whitespaces + 1; } + num_whitespaces++; } - if (utf_char_next == "") { - split_condition = true; // final - token += utf_char; + // regex: \s*[\r\n]+ + if (last_end_r_or_n > 0) { + pos = last_end_r_or_n; + _add_token(pos); + continue; } - if (split_condition) { - if (token.size()) { - bpe_offsets.emplace_back(unicode_cpts_from_utf8(token).size()); - } - token = utf_char; - collecting = false; - collecting_letter = false; - collecting_numeric = false; - collecting_special = false; - collecting_whitespace_lookahead = false; + // regex: \s+(?!\S) + if (num_whitespaces > 1 && _get_cpt(pos+num_whitespaces) != 0) { + pos += num_whitespaces - 1; + _add_token(pos); + continue; } - else { - token += utf_char; + + 
// regex: \s+ + if (num_whitespaces > 0) { + pos += num_whitespaces; + _add_token(pos); + continue; } - } - start += offset; + // no matches + _add_token(++pos); + } } return bpe_offsets; @@ -424,14 +534,14 @@ static std::vector unicode_regex_split_stl(const std::string & text, con static std::vector unicode_regex_split_custom(const std::string & text, const std::string & regex_expr, const std::vector & offsets) { std::vector bpe_offsets; - (void)(text); - (void)(regex_expr); - (void)(offsets); - // TODO: this implementation is actually wrong, uncomment and run: - // make -j && ./bin/test-tokenizer-0 ../models/ggml-vocab-gpt-2.gguf - //if (regex_expr == "'s|'t|'re|'ve|'m|'ll|'d| ?\\p{L}+| ?\\p{N}+| ?[^\\s\\p{L}\\p{N}]+|\\s+(?!\\S)") { - // bpe_offsets = unicode_regex_split_custom_gpt2(text, offsets); - //} + if (regex_expr == "'s|'t|'re|'ve|'m|'ll|'d| ?\\p{L}+| ?\\p{N}+| ?[^\\s\\p{L}\\p{N}]+|\\s+(?!\\S)") { + bpe_offsets = unicode_regex_split_custom_gpt2(text, offsets); + } else if ( + regex_expr == "(?i:'s|'t|'re|'ve|'m|'ll|'d)|[^\\r\\n\\p{L}\\p{N}]?\\p{L}+|\\p{N}{1,3}| ?[^\\s\\p{L}\\p{N}]+[\\r\\n]*|\\s*[\\r\\n]+|\\s+(?!\\S)|\\s+" || + regex_expr == "(?:'[sS]|'[tT]|'[rR][eE]|'[vV][eE]|'[mM]|'[lL][lL]|'[dD])|[^\\r\\n\\p{L}\\p{N}]?\\p{L}+|\\p{N}{1,3}| ?[^\\s\\p{L}\\p{N}]+[\\r\\n]*|\\s*[\\r\\n]+|\\s+(?!\\S)|\\s+") { + + bpe_offsets = unicode_regex_split_custom_llama3(text, offsets); + } return bpe_offsets; } @@ -470,15 +580,14 @@ std::string unicode_cpt_to_utf8(uint32_t cp) { } std::vector unicode_cpts_normalize_nfd(const std::vector & cpts) { - std::vector result; - result.reserve(cpts.size()); + auto comp = [] (const uint32_t cpt, const range_nfd & range) { + return cpt < range.first; + }; + std::vector result(cpts.size()); for (size_t i = 0; i < cpts.size(); ++i) { - auto it = unicode_map_nfd.find(cpts[i]); - if (it == unicode_map_nfd.end()) { - result.push_back(cpts[i]); - } else { - result.push_back(it->second); - } + const uint32_t cpt = cpts[i]; + auto it = 
std::upper_bound(unicode_ranges_nfd.cbegin(), unicode_ranges_nfd.cend(), cpt, comp) - 1; + result[i] = (it->first <= cpt && cpt <= it->last) ? it->nfd : cpt; } return result; } @@ -492,18 +601,19 @@ std::vector unicode_cpts_from_utf8(const std::string & utf8) { return result; } -int unicode_cpt_type(uint32_t cp) { - static std::unordered_map cpt_types = unicode_cpt_type_map(); - const auto it = cpt_types.find(cp); - return it == cpt_types.end() ? CODEPOINT_TYPE_UNIDENTIFIED : it->second; +codepoint_flags unicode_cpt_flags(const uint32_t cp) { + static const codepoint_flags undef(codepoint_flags::UNDEFINED); + static const auto cpt_flags = unicode_cpt_flags_array(); + return cp < cpt_flags.size() ? cpt_flags[cp] : undef; } -int unicode_cpt_type(const std::string & utf8) { - if (utf8.length() == 0) { - return CODEPOINT_TYPE_UNIDENTIFIED; +codepoint_flags unicode_cpt_flags(const std::string & utf8) { + static const codepoint_flags undef(codepoint_flags::UNDEFINED); + if (utf8.empty()) { + return undef; // undefined } size_t offset = 0; - return unicode_cpt_type(unicode_cpt_from_utf8(utf8, offset)); + return unicode_cpt_flags(unicode_cpt_from_utf8(utf8, offset)); } std::string unicode_byte_to_utf8(uint8_t byte) { @@ -524,21 +634,21 @@ char32_t unicode_tolower(char32_t cp) { std::vector unicode_regex_split(const std::string & text, const std::vector & regex_exprs) { // unicode categories static const std::map k_ucat_enum = { - { "\\p{N}", CODEPOINT_TYPE_DIGIT }, - { "\\p{L}", CODEPOINT_TYPE_LETTER }, - { "\\p{P}", CODEPOINT_TYPE_PUNCTUATION }, + { "\\p{N}", codepoint_flags::NUMBER }, + { "\\p{L}", codepoint_flags::LETTER }, + { "\\p{P}", codepoint_flags::PUNCTUATION }, }; static const std::map k_ucat_cpt = { - { CODEPOINT_TYPE_DIGIT, 0xD1 }, - { CODEPOINT_TYPE_LETTER, 0xD2 }, - { CODEPOINT_TYPE_PUNCTUATION, 0xD3 }, + { codepoint_flags::NUMBER, 0xD1 }, + { codepoint_flags::LETTER, 0xD2 }, + { codepoint_flags::PUNCTUATION, 0xD3 }, }; static const std::map k_ucat_map = { - 
{ CODEPOINT_TYPE_DIGIT, "\x30-\x39" }, // 0-9 - { CODEPOINT_TYPE_LETTER, "\x41-\x5A\x61-\x7A" }, // A-Za-z - { CODEPOINT_TYPE_PUNCTUATION, "\x21-\x23\x25-\x2A\x2C-\x2F\x3A-\x3B\x3F-\x40\\\x5B-\\\x5D\x5F\\\x7B\\\x7D" }, // !-#%-*,-/:-;?-@\[-\]_\{\} + { codepoint_flags::NUMBER, "\x30-\x39" }, // 0-9 + { codepoint_flags::LETTER, "\x41-\x5A\x61-\x7A" }, // A-Za-z + { codepoint_flags::PUNCTUATION, "\x21-\x23\x25-\x2A\x2C-\x2F\x3A-\x3B\x3F-\x40\\\x5B-\\\x5D\x5F\\\x7B\\\x7D" }, // !-#%-*,-/:-;?-@\[-\]_\{\} }; // compute collapsed codepoints only if needed by at least one regex @@ -569,10 +679,10 @@ std::vector unicode_regex_split(const std::string & text, const std continue; } - const int cpt_type = unicode_cpt_type(cpts[i]); + const int cpt_flag = unicode_cpt_flags(cpts[i]).category_flag(); - if (k_ucat_cpt.find(cpt_type) != k_ucat_cpt.end()) { - text_collapsed[i] = k_ucat_cpt.at(cpt_type); + if (k_ucat_cpt.find(cpt_flag) != k_ucat_cpt.end()) { + text_collapsed[i] = k_ucat_cpt.at(cpt_flag); } else { text_collapsed[i] = (char) 0xD0; // fallback } diff --git a/cpp/unicode.h b/cpp/unicode.h index ce2bcef5..7513be4a 100644 --- a/cpp/unicode.h +++ b/cpp/unicode.h @@ -4,22 +4,56 @@ #include #include -#define CODEPOINT_TYPE_UNIDENTIFIED 0 -#define CODEPOINT_TYPE_DIGIT 1 -#define CODEPOINT_TYPE_LETTER 2 -#define CODEPOINT_TYPE_WHITESPACE 3 -#define CODEPOINT_TYPE_ACCENT_MARK 4 -#define CODEPOINT_TYPE_PUNCTUATION 5 -#define CODEPOINT_TYPE_SYMBOL 6 -#define CODEPOINT_TYPE_CONTROL 7 +struct codepoint_flags { + enum { + UNDEFINED = 0x0001, + NUMBER = 0x0002, // regex: \p{N} + LETTER = 0x0004, // regex: \p{L} + SEPARATOR = 0x0008, // regex: \p{Z} + ACCENT_MARK = 0x0010, // regex: \p{M} + PUNCTUATION = 0x0020, // regex: \p{P} + SYMBOL = 0x0040, // regex: \p{S} + CONTROL = 0x0080, // regex: \p{C} + MASK_CATEGORIES = 0x00FF, + }; + + // codepoint type + uint16_t is_undefined : 1; + uint16_t is_number : 1; // regex: \p{N} + uint16_t is_letter : 1; // regex: \p{L} + uint16_t is_separator 
: 1; // regex: \p{Z} + uint16_t is_accent_mark : 1; // regex: \p{M} + uint16_t is_punctuation : 1; // regex: \p{P} + uint16_t is_symbol : 1; // regex: \p{S} + uint16_t is_control : 1; // regex: \p{C} + // helper flags + uint16_t is_whitespace : 1; // regex: \s + uint16_t is_lowercase : 1; + uint16_t is_uppercase : 1; + uint16_t is_nfd : 1; + + // decode from uint16 + inline codepoint_flags(const uint16_t flags=0) { + *reinterpret_cast(this) = flags; + } + + inline uint16_t as_uint() const { + return *reinterpret_cast(this); + } + + inline uint16_t category_flag() const { + return this->as_uint() & MASK_CATEGORIES; + } +}; + std::string unicode_cpt_to_utf8(uint32_t cp); std::vector unicode_cpts_from_utf8(const std::string & utf8); std::vector unicode_cpts_normalize_nfd(const std::vector & cpts); -int unicode_cpt_type(uint32_t cp); -int unicode_cpt_type(const std::string & utf8); +codepoint_flags unicode_cpt_flags(const uint32_t cp); +codepoint_flags unicode_cpt_flags(const std::string & utf8); std::string unicode_byte_to_utf8(uint8_t byte); uint8_t unicode_utf8_to_byte(const std::string & utf8); diff --git a/example/ios/.xcode.env.local b/example/ios/.xcode.env.local index 10b8a2a5..1864b351 100644 --- a/example/ios/.xcode.env.local +++ b/example/ios/.xcode.env.local @@ -1 +1 @@ -export NODE_BINARY=/var/folders/4z/1d45cfts3936kdm7v9jl349r0000gn/T/yarn--1714015495137-0.8507344017914271/node +export NODE_BINARY=/var/folders/4z/1d45cfts3936kdm7v9jl349r0000gn/T/yarn--1716945418336-0.012674683762205596/node diff --git a/llama.cpp b/llama.cpp index a2ac89d6..b864b50c 160000 --- a/llama.cpp +++ b/llama.cpp @@ -1 +1 @@ -Subproject commit a2ac89d6efb41b535778bfeaecaae8fe295b6ed3 +Subproject commit b864b50ce5e2beefc8c2fd31733e4e1a978b7754 diff --git a/scripts/bootstrap.sh b/scripts/bootstrap.sh index 10b895b8..47cf44bb 100755 --- a/scripts/bootstrap.sh +++ b/scripts/bootstrap.sh @@ -56,6 +56,7 @@ files=( "./cpp/ggml-impl.h" "./cpp/ggml-common.h" "./cpp/sgemm.cpp" + 
"./cpp/json-schema-to-grammar.h" ) # Loop through each file and run the sed commands diff --git a/scripts/common.cpp.patch b/scripts/common.cpp.patch index da0a851a..0d03d38a 100644 --- a/scripts/common.cpp.patch +++ b/scripts/common.cpp.patch @@ -1,6 +1,6 @@ ---- common.cpp.orig 2024-04-25 11:10:50 -+++ common.cpp 2024-04-25 11:10:51 -@@ -45,6 +45,12 @@ +--- common.cpp.orig 2024-05-29 09:16:58 ++++ common.cpp 2024-05-29 09:16:59 +@@ -47,6 +47,12 @@ #include #include #endif diff --git a/scripts/common.h.patch b/scripts/common.h.patch index 966a1fdb..f8c9be2a 100644 --- a/scripts/common.h.patch +++ b/scripts/common.h.patch @@ -1,8 +1,8 @@ ---- common.h.orig 2024-05-04 13:24:18 -+++ common.h 2024-05-04 13:24:19 -@@ -44,6 +44,17 @@ - int get_math_cpu_count(); - int32_t get_num_physical_cores(); +--- common.h.orig 2024-05-29 09:16:58 ++++ common.h 2024-05-29 09:16:59 +@@ -41,6 +41,17 @@ + + struct llama_control_vector_load_info; +#define print_build_info() do { \ + fprintf(stderr, "%s: build = %d (%s)\n", __func__, LLAMA_BUILD_NUMBER, LLAMA_COMMIT); \ @@ -16,5 +16,5 @@ +extern char const *LLAMA_BUILD_TARGET; + // - // CLI argument parsing + // CPU utils // diff --git a/scripts/ggml-metal.m.patch b/scripts/ggml-metal.m.patch index c123fc37..a269d0fd 100644 --- a/scripts/ggml-metal.m.patch +++ b/scripts/ggml-metal.m.patch @@ -1,6 +1,6 @@ ---- ggml-metal.m.orig 2024-05-04 13:24:18 -+++ ggml-metal.m 2024-05-04 13:24:19 -@@ -321,7 +321,7 @@ +--- ggml-metal.m.orig 2024-05-29 09:16:58 ++++ ggml-metal.m 2024-05-29 09:16:59 +@@ -334,7 +334,7 @@ const bool try_metallib = true; #endif diff --git a/scripts/llama.cpp.patch b/scripts/llama.cpp.patch index ac51a0f3..816ad178 100644 --- a/scripts/llama.cpp.patch +++ b/scripts/llama.cpp.patch @@ -1,6 +1,6 @@ ---- llama.cpp.orig 2024-05-04 13:24:18 -+++ llama.cpp 2024-05-04 13:24:19 -@@ -120,6 +120,17 @@ +--- llama.cpp.orig 2024-05-29 09:16:58 ++++ llama.cpp 2024-05-29 09:16:59 +@@ -117,6 +117,17 @@ #define LLAMA_LOG_WARN(...) 
llama_log_internal(LM_GGML_LOG_LEVEL_WARN , __VA_ARGS__) #define LLAMA_LOG_ERROR(...) llama_log_internal(LM_GGML_LOG_LEVEL_ERROR, __VA_ARGS__) @@ -18,7 +18,7 @@ // // helpers // -@@ -1303,16 +1314,16 @@ +@@ -1384,16 +1395,16 @@ if (prefetch > 0) { // advise the kernel to preload the mapped memory