Skip to content

Commit

Permalink
feat: sync llama.cpp
Browse files Browse the repository at this point in the history
  • Loading branch information
jhen0409 committed May 29, 2024
1 parent 7fbcebc commit 5804e02
Show file tree
Hide file tree
Showing 29 changed files with 17,745 additions and 8,727 deletions.
1,503 changes: 752 additions & 751 deletions cpp/common.cpp

Large diffs are not rendered by default.

93 changes: 51 additions & 42 deletions cpp/common.h
Original file line number Diff line number Diff line change
Expand Up @@ -27,23 +27,20 @@
#define die_fmt(fmt, ...) do { fprintf(stderr, "error: " fmt "\n", __VA_ARGS__); exit(1); } while (0)

#define print_build_info() do { \
fprintf(stderr, "%s: build = %d (%s)\n", __func__, LLAMA_BUILD_NUMBER, LLAMA_COMMIT); \
fprintf(stderr, "%s: build = %d (%s)\n", __func__, LLAMA_BUILD_NUMBER, LLAMA_COMMIT); \
fprintf(stderr, "%s: built with %s for %s\n", __func__, LLAMA_COMPILER, LLAMA_BUILD_TARGET); \
} while(0)

#define DEFAULT_MODEL_PATH "models/7B/ggml-model-f16.gguf"

// build info
extern int LLAMA_BUILD_NUMBER;
extern char const *LLAMA_COMMIT;
extern char const *LLAMA_COMPILER;
extern char const *LLAMA_BUILD_TARGET;
extern char const * LLAMA_COMMIT;
extern char const * LLAMA_COMPILER;
extern char const * LLAMA_BUILD_TARGET;

struct llama_control_vector_load_info;

int get_math_cpu_count();
int32_t get_num_physical_cores();

#define print_build_info() do { \
fprintf(stderr, "%s: build = %d (%s)\n", __func__, LLAMA_BUILD_NUMBER, LLAMA_COMMIT); \
fprintf(stderr, "%s: built with %s for %s\n", __func__, LLAMA_COMPILER, LLAMA_BUILD_TARGET); \
Expand All @@ -55,14 +52,21 @@ extern char const *LLAMA_COMMIT;
extern char const *LLAMA_COMPILER;
extern char const *LLAMA_BUILD_TARGET;

//
// CPU utils
//

int32_t cpu_get_num_physical_cores();
int32_t cpu_get_num_math();

//
// CLI argument parsing
//

struct gpt_params {
uint32_t seed = LLAMA_DEFAULT_SEED; // RNG seed

int32_t n_threads = get_math_cpu_count();
int32_t n_threads = cpu_get_num_math();
int32_t n_threads_draft = -1;
int32_t n_threads_batch = -1; // number of threads to use for batch processing (-1 = use n_threads)
int32_t n_threads_batch_draft = -1;
Expand Down Expand Up @@ -93,6 +97,7 @@ struct gpt_params {
float yarn_beta_slow = 1.0f; // YaRN high correction dim
int32_t yarn_orig_ctx = 0; // YaRN original context length
float defrag_thold = -1.0f; // KV cache defragmentation threshold
std::string rpc_servers = ""; // comma separated list of RPC servers

lm_ggml_backend_sched_eval_callback cb_eval = nullptr;
void * cb_eval_user_data = nullptr;
Expand Down Expand Up @@ -151,6 +156,9 @@ struct gpt_params {
bool random_prompt = false; // do not randomize prompt if none provided
bool use_color = false; // use color to distinguish generations and inputs
bool interactive = false; // interactive mode
bool interactive_specials = false; // whether to allow special tokens from user, during interactive mode
bool special = false; // enable special token output
bool conversation = false; // conversation mode (does not print special tokens and suffix/prefix)
bool chatml = false; // chatml mode (used for models trained on chatml syntax)
bool prompt_cache_all = false; // save user input and generations to prompt cache
bool prompt_cache_ro = false; // open the prompt cache read-only and do not update it
Expand Down Expand Up @@ -187,33 +195,34 @@ struct gpt_params {

void gpt_params_handle_model_default(gpt_params & params);

bool parse_kv_override(const char * data, std::vector<llama_model_kv_override> & overrides);

bool gpt_params_parse_ex(int argc, char ** argv, gpt_params & params);
bool gpt_params_parse_ex (int argc, char ** argv, gpt_params & params);
bool gpt_params_parse (int argc, char ** argv, gpt_params & params);
bool gpt_params_find_arg (int argc, char ** argv, const std::string & arg, gpt_params & params, int & i, bool & invalid_param);
void gpt_params_print_usage(int argc, char ** argv, const gpt_params & params);

bool gpt_params_parse(int argc, char ** argv, gpt_params & params);
std::string gpt_params_get_system_info(const gpt_params & params);

void gpt_print_usage(int argc, char ** argv, const gpt_params & params);

bool gpt_params_find_arg(int argc, char ** argv, const std::string & arg, gpt_params & params, int & i, bool & invalid_param);

std::string get_system_info(const gpt_params & params);
//
// String utils
//

std::string gpt_random_prompt(std::mt19937 & rng);
std::vector<std::string> string_split(std::string input, char separator);

void process_escapes(std::string& input);
std::string string_strip(const std::string & str);
std::string string_get_sortable_timestamp();
std::string string_random_prompt(std::mt19937 & rng);

bool validate_file_name(const std::string & filename);
bool string_parse_kv_override(const char * data, std::vector<llama_model_kv_override> & overrides);
void string_process_escapes(std::string & input);

//
// String utils
// Filesystem utils
//

std::vector<llama_sampler_type> sampler_types_from_names(const std::vector<std::string> & names, bool allow_alt_names);
std::vector<llama_sampler_type> sampler_types_from_chars(const std::string & names_string);
std::vector<std::string> string_split(std::string input, char separator);
std::string string_strip(const std::string & str);
std::string sampler_type_to_name_string(llama_sampler_type sampler_type);
bool fs_validate_filename(const std::string & filename);
bool fs_create_directory_with_parents(const std::string & path);

std::string fs_get_cache_directory();

//
// Model utils
Expand Down Expand Up @@ -284,29 +293,15 @@ std::string llama_detokenize_bpe(
// defaults to true when model type is SPM, otherwise false.
bool llama_should_add_bos_token(const llama_model * model);

//
// YAML utils
//

bool create_directory_with_parents(const std::string & path);
void dump_vector_float_yaml(FILE * stream, const char * prop_name, const std::vector<float> & data);
void dump_vector_int_yaml(FILE * stream, const char * prop_name, const std::vector<int> & data);
void dump_string_yaml_multiline(FILE * stream, const char * prop_name, const char * data);
std::string get_sortable_timestamp();

void dump_non_result_info_yaml(
FILE * stream, const gpt_params & params, const llama_context * lctx,
const std::string & timestamp, const std::vector<int> & prompt_tokens, const char * model_desc);

//
// KV cache utils
//

// Dump the KV cache view with the number of sequences per cell.
void dump_kv_cache_view(const llama_kv_cache_view & view, int row_size = 80);
void llama_kv_cache_dump_view(const llama_kv_cache_view & view, int row_size = 80);

// Dump the KV cache view showing individual sequences in each cell (long output).
void dump_kv_cache_view_seqs(const llama_kv_cache_view & view, int row_size = 40);
void llama_kv_cache_dump_view_seqs(const llama_kv_cache_view & view, int row_size = 40);

//
// Embedding utils
Expand Down Expand Up @@ -340,6 +335,20 @@ llama_control_vector_data llama_control_vector_load(const std::vector<llama_cont
//
// Split utils
//

static const char * const LLM_KV_SPLIT_NO = "split.no";
static const char * const LLM_KV_SPLIT_COUNT = "split.count";
static const char * const LLM_KV_SPLIT_TENSORS_COUNT = "split.tensors.count";

//
// YAML utils
//

void yaml_dump_vector_float (FILE * stream, const char * prop_name, const std::vector<float> & data);
void yaml_dump_vector_int (FILE * stream, const char * prop_name, const std::vector<int> & data);
void yaml_dump_string_multiline(FILE * stream, const char * prop_name, const char * data);

void yaml_dump_non_result_info(
FILE * stream, const gpt_params & params, const llama_context * lctx,
const std::string & timestamp, const std::vector<int> & prompt_tokens, const char * model_desc);

5 changes: 2 additions & 3 deletions cpp/ggml-backend.c
Original file line number Diff line number Diff line change
Expand Up @@ -1182,9 +1182,9 @@ static int lm_ggml_backend_sched_backend_id_from_cur(lm_ggml_backend_sched_t sch
static char * fmt_size(size_t size) {
static char buffer[128];
if (size >= 1024*1024) {
sprintf(buffer, "%zuM", size/1024/1024);
snprintf(buffer, sizeof(buffer), "%zuM", size/1024/1024);
} else {
sprintf(buffer, "%zuK", size/1024);
snprintf(buffer, sizeof(buffer), "%zuK", size/1024);
}
return buffer;
}
Expand Down Expand Up @@ -1895,7 +1895,6 @@ void lm_ggml_backend_view_init(lm_ggml_backend_buffer_t buffer, struct lm_ggml_t

tensor->buffer = buffer;
tensor->data = (char *)tensor->view_src->data + tensor->view_offs;
tensor->backend = tensor->view_src->backend;
lm_ggml_backend_buffer_init_tensor(buffer, tensor);
}

Expand Down
54 changes: 0 additions & 54 deletions cpp/ggml-common.h
Original file line number Diff line number Diff line change
Expand Up @@ -65,13 +65,8 @@ typedef sycl::half2 lm_ggml_half2;
// QK = number of values after dequantization
// QK_K = super-block size

#ifdef LM_GGML_QKK_64
#define QK_K 64
#define K_SCALE_SIZE 4
#else
#define QK_K 256
#define K_SCALE_SIZE 12
#endif // LM_GGML_QKK_64

#if defined(LM_GGML_COMMON_DECL_CUDA) || defined(LM_GGML_COMMON_DECL_HIP) || defined(LM_GGML_COMMON_DECL_SYCL)
// QR = QK / number of values before dequantization
Expand Down Expand Up @@ -131,13 +126,8 @@ typedef sycl::half2 lm_ggml_half2;
#define QI4_NL (QK4_NL / (4*QR4_NL))
#define QR4_NL 2

#if QK_K == 64
#define QI4_XS QI4_NL
#define QR4_XS QR4_NL
#else
#define QI4_XS (QK_K / (4*QR4_XS))
#define QR4_XS 8
#endif

#endif // LM_GGML_COMMON_DECL_CUDA || LM_GGML_COMMON_DECL_HIP

Expand Down Expand Up @@ -228,36 +218,18 @@ static_assert(sizeof(block_q2_K) == 2*sizeof(lm_ggml_half) + QK_K/16 + QK_K/4, "
// weight is represented as x = a * q
// 16 blocks of 16 elements each
// Effectively 3.4375 bits per weight
#ifdef LM_GGML_QKK_64
typedef struct {
uint8_t hmask[QK_K/8]; // quants - high bit
uint8_t qs[QK_K/4]; // quants - low 2 bits
uint8_t scales[2];
lm_ggml_half d; // super-block scale
} block_q3_K;
static_assert(sizeof(block_q3_K) == sizeof(lm_ggml_half) + QK_K / 4 + QK_K / 8 + 2, "wrong q3_K block size/padding");
#else
typedef struct {
uint8_t hmask[QK_K/8]; // quants - high bit
uint8_t qs[QK_K/4]; // quants - low 2 bits
uint8_t scales[12]; // scales, quantized with 6 bits
lm_ggml_half d; // super-block scale
} block_q3_K;
static_assert(sizeof(block_q3_K) == sizeof(lm_ggml_half) + QK_K / 4 + QK_K / 8 + 12, "wrong q3_K block size/padding");
#endif

// 4-bit quantization
// 8 blocks of 32 elements each
// weight is represented as x = a * q + b
// Effectively 4.5 bits per weight
#ifdef LM_GGML_QKK_64
typedef struct {
lm_ggml_half d[2]; // super-block scales/mins
uint8_t scales[2]; // 4-bit block scales/mins
uint8_t qs[QK_K/2]; // 4--bit quants
} block_q4_K;
static_assert(sizeof(block_q4_K) == 2*sizeof(lm_ggml_half) + QK_K/2 + 2, "wrong q4_K block size/padding");
#else
typedef struct {
union {
struct {
Expand All @@ -270,21 +242,11 @@ typedef struct {
uint8_t qs[QK_K/2]; // 4--bit quants
} block_q4_K;
static_assert(sizeof(block_q4_K) == 2*sizeof(lm_ggml_half) + K_SCALE_SIZE + QK_K/2, "wrong q4_K block size/padding");
#endif

// 5-bit quantization
// 8 blocks of 32 elements each
// weight is represented as x = a * q + b
// Effectively 5.5 bits per weight
#ifdef LM_GGML_QKK_64
typedef struct {
lm_ggml_half d; // super-block scale
int8_t scales[QK_K/16]; // 8-bit block scales
uint8_t qh[QK_K/8]; // quants, high bit
uint8_t qs[QK_K/2]; // quants, low 4 bits
} block_q5_K;
static_assert(sizeof(block_q5_K) == sizeof(lm_ggml_half) + QK_K/2 + QK_K/8 + QK_K/16, "wrong q5_K block size/padding");
#else
typedef struct {
union {
struct {
Expand All @@ -298,7 +260,6 @@ typedef struct {
uint8_t qs[QK_K/2]; // quants, low 4 bits
} block_q5_K;
static_assert(sizeof(block_q5_K) == 2*sizeof(lm_ggml_half) + K_SCALE_SIZE + QK_K/2 + QK_K/8, "wrong q5_K block size/padding");
#endif

// 6-bit quantization
// weight is represented as x = a * q
Expand Down Expand Up @@ -356,11 +317,7 @@ typedef struct {
static_assert(sizeof(block_iq3_xxs) == sizeof(lm_ggml_half) + 3*(QK_K/8), "wrong iq3_xxs block size/padding");

// 3.4375 bpw
#if QK_K == 64
#define IQ3S_N_SCALE 2
#else
#define IQ3S_N_SCALE QK_K/64
#endif
typedef struct {
lm_ggml_half d;
uint8_t qs[QK_K/4];
Expand All @@ -381,16 +338,9 @@ static_assert(sizeof(block_iq1_s) == sizeof(lm_ggml_half) + QK_K/8 + QK_K/16, "w
typedef struct {
uint8_t qs[QK_K/8]; // grid index, low 8 bits
uint8_t qh[QK_K/16]; // grid index, high 3 bits + grid shift bit (for two groups of 8)
#if QK_K == 64
lm_ggml_half d;
#endif
uint8_t scales[QK_K/32]; // 3-bit block scales (4-bit if QK_K == 64)
} block_iq1_m;
#if QK_K == 64
static_assert(sizeof(block_iq1_m) == QK_K/8 + QK_K/16 + QK_K/32 + sizeof(lm_ggml_half), "wrong iq1_m block size/padding");
#else
static_assert(sizeof(block_iq1_m) == QK_K/8 + QK_K/16 + QK_K/32, "wrong iq1_m block size/padding");
#endif

// Used by IQ1_M quants
typedef union {
Expand All @@ -406,17 +356,13 @@ typedef struct {
} block_iq4_nl;
static_assert(sizeof(block_iq4_nl) == sizeof(lm_ggml_half) + QK4_NL/2, "wrong iq4_nl block size/padding");

#if QK_K == 64
#define block_iq4_xs block_iq4_nl
#else
typedef struct {
lm_ggml_half d;
uint16_t scales_h;
uint8_t scales_l[QK_K/64];
uint8_t qs[QK_K/2];
} block_iq4_xs;
static_assert(sizeof(block_iq4_xs) == sizeof(lm_ggml_half) + sizeof(uint16_t) + QK_K/64 + QK_K/2, "wrong iq4_xs block size/padding");
#endif

#endif // LM_GGML_COMMON_DECL
#endif // LM_GGML_COMMON_DECL
Expand Down
Loading

0 comments on commit 5804e02

Please sign in to comment.