From aaff4a38f8efbcc0953b8eb46fac3965dbf8e837 Mon Sep 17 00:00:00 2001 From: jhen Date: Fri, 10 Nov 2023 13:35:53 +0800 Subject: [PATCH] feat: sync llama.cpp --- cpp/ggml-alloc.c | 21 +-- cpp/ggml.c | 330 ++++++++++------------------------------ cpp/ggml.h | 5 + cpp/llama.cpp | 179 +++++++++++++--------- llama.cpp | 2 +- scripts/llama.cpp.patch | 10 +- 6 files changed, 206 insertions(+), 341 deletions(-) diff --git a/cpp/ggml-alloc.c b/cpp/ggml-alloc.c index 23278bff..2dbd6b51 100644 --- a/cpp/ggml-alloc.c +++ b/cpp/ggml-alloc.c @@ -378,9 +378,13 @@ static bool lm_ggml_op_can_inplace(enum lm_ggml_op op) { } } -static void init_view(struct lm_ggml_allocr * alloc, struct lm_ggml_tensor * view) { +static void init_view(struct lm_ggml_allocr * alloc, struct lm_ggml_tensor * view, bool update_backend) { assert(view->view_src != NULL && view->view_src->data != NULL); - view->backend = view->view_src->backend; + + if (update_backend) { + view->backend = view->view_src->backend; + } + view->buffer = view->view_src->buffer; view->data = (char *)view->view_src->data + view->view_offs; @@ -394,7 +398,7 @@ static void allocate_node(struct lm_ggml_allocr * alloc, struct lm_ggml_tensor * struct hash_node * ht = alloc->hash_table; if (node->data == NULL) { if (lm_ggml_is_view(node)) { - init_view(alloc, node); + init_view(alloc, node, true); } else { // see if we can reuse a parent's buffer (inplace) if (lm_ggml_op_can_inplace(node->op)) { @@ -424,15 +428,14 @@ static void allocate_node(struct lm_ggml_allocr * alloc, struct lm_ggml_tensor * AT_PRINTF("reusing view parent %s (%s) for %s\n", parent->name, view_src->name, node->name); node->view_src = view_src; view_src_hn->n_views += 1; - init_view(alloc, node); + init_view(alloc, node, false); return; } - } - else { + } else { AT_PRINTF("reusing parent %s for %s\n", parent->name, node->name); node->view_src = parent; p_hn->n_views += 1; - init_view(alloc, node); + init_view(alloc, node, false); return; } } @@ -463,7 +466,7 @@ size_t lm_ggml_allocr_alloc_graph_n( hash_get(ht, view_src)->n_views += 1; if (node->buffer == NULL && node->data != NULL) { // view of a pre-allocated tensor, didn't call init_view() yet - init_view(alloc, node); + init_view(alloc, node, true); } } @@ -474,7 +477,7 @@ size_t lm_ggml_allocr_alloc_graph_n( } hash_get(ht, parent)->n_children += 1; if (lm_ggml_is_view(parent) && parent->buffer == NULL && parent->data != NULL) { - init_view(alloc, parent); + init_view(alloc, parent, true); } } } diff --git a/cpp/ggml.c b/cpp/ggml.c index 9688880a..c0d6ff31 100644 --- a/cpp/ggml.c +++ b/cpp/ggml.c @@ -4970,8 +4970,13 @@ struct lm_ggml_tensor * lm_ggml_rope_back( int n_dims, int mode, int n_ctx, + int n_orig_ctx, float freq_base, float freq_scale, + float ext_factor, + float attn_factor, + float beta_fast, + float beta_slow, float xpos_base, bool xpos_down) { LM_GGML_ASSERT(lm_ggml_is_vector(b)); @@ -4988,11 +4993,15 @@ struct lm_ggml_tensor * lm_ggml_rope_back( struct lm_ggml_tensor * result = lm_ggml_dup_tensor(ctx, a); - int32_t params[8] = { /*n_past*/ 0, n_dims, mode, n_ctx }; - memcpy(params + 4, &freq_base, sizeof(float)); - memcpy(params + 5, &freq_scale, sizeof(float)); - memcpy(params + 6, &xpos_base, sizeof(float)); - memcpy(params + 7, &xpos_down, sizeof(bool)); + int32_t params[13] = { /*n_past*/ 0, n_dims, mode, n_ctx, n_orig_ctx }; + memcpy(params + 5, &freq_base, sizeof(float)); + memcpy(params + 6, &freq_scale, sizeof(float)); + memcpy(params + 7, &ext_factor, sizeof(float)); + memcpy(params + 8, &attn_factor, 
sizeof(float)); + memcpy(params + 9, &beta_fast, sizeof(float)); + memcpy(params + 10, &beta_slow, sizeof(float)); + memcpy(params + 11, &xpos_base, sizeof(float)); + memcpy(params + 12, &xpos_down, sizeof(bool)); lm_ggml_set_op_params(result, params, sizeof(params)); result->op = LM_GGML_OP_ROPE_BACK; @@ -10974,7 +10983,8 @@ static void lm_ggml_compute_forward_rope_f32( const struct lm_ggml_compute_params * params, const struct lm_ggml_tensor * src0, const struct lm_ggml_tensor * src1, - struct lm_ggml_tensor * dst) { + struct lm_ggml_tensor * dst, + const bool forward) { if (params->type == LM_GGML_TASK_INIT || params->type == LM_GGML_TASK_FINALIZE) { return; } @@ -11033,6 +11043,11 @@ static void lm_ggml_compute_forward_rope_f32( const bool is_neox = mode & 2; const bool is_glm = mode & 4; + // backward process uses inverse rotation by cos and sin. + // cos and sin build a rotation matrix, where the inverse is the transpose. + // this essentially just switches the sign of sin. + const float sin_sign = forward ? 1.0f : -1.0f; + const int32_t * pos = (const int32_t *) src1->data; for (int64_t i3 = 0; i3 < ne3; i3++) { @@ -11049,9 +11064,9 @@ static void lm_ggml_compute_forward_rope_f32( float block_theta = MAX(p - (n_ctx - 2), 0); for (int64_t i0 = 0; i0 < ne0 / 4; i0++) { const float cos_theta = cosf(theta_base); - const float sin_theta = sinf(theta_base); + const float sin_theta = sinf(theta_base) * sin_sign; const float cos_block_theta = cosf(block_theta); - const float sin_block_theta = sinf(block_theta); + const float sin_block_theta = sinf(block_theta) * sin_sign; theta_base *= theta_scale; block_theta *= theta_scale; @@ -11075,6 +11090,7 @@ static void lm_ggml_compute_forward_rope_f32( rope_yarn( theta_base, freq_scale, corr_dims, i0, ext_factor, attn_factor, &cos_theta, &sin_theta ); + sin_theta *= sin_sign; // zeta scaling for xPos only: float zeta = xpos_base != 0.0f ? powf((i0 + 0.4f * ne0) / (1.4f * ne0), p / xpos_base) : 1.0f; @@ -11105,6 +11121,7 @@ static void lm_ggml_compute_forward_rope_f32( theta_base, freq_scale, corr_dims, cur_rot, ext_factor, attn_factor, &cos_theta, &sin_theta ); + sin_theta *= sin_sign; theta_base *= theta_scale; @@ -11130,7 +11147,8 @@ static void lm_ggml_compute_forward_rope_f16( const struct lm_ggml_compute_params * params, const struct lm_ggml_tensor * src0, const struct lm_ggml_tensor * src1, - struct lm_ggml_tensor * dst) { + struct lm_ggml_tensor * dst, + const bool forward) { if (params->type == LM_GGML_TASK_INIT || params->type == LM_GGML_TASK_FINALIZE) { return; } @@ -11182,6 +11200,11 @@ static void lm_ggml_compute_forward_rope_f16( const bool is_neox = mode & 2; const bool is_glm = mode & 4; + // backward process uses inverse rotation by cos and sin. + // cos and sin build a rotation matrix, where the inverse is the transpose. + // this essentially just switches the sign of sin. + const float sin_sign = forward ? 
1.0f : -1.0f; + const int32_t * pos = (const int32_t *) src1->data; for (int64_t i3 = 0; i3 < ne3; i3++) { @@ -11198,9 +11221,9 @@ static void lm_ggml_compute_forward_rope_f16( float block_theta = MAX(p - (n_ctx - 2), 0); for (int64_t i0 = 0; i0 < ne0 / 4; i0++) { const float cos_theta = cosf(theta_base); - const float sin_theta = sinf(theta_base); + const float sin_theta = sinf(theta_base) * sin_sign; const float cos_block_theta = cosf(block_theta); - const float sin_block_theta = sinf(block_theta); + const float sin_block_theta = sinf(block_theta) * sin_sign; theta_base *= theta_scale; block_theta *= theta_scale; @@ -11224,6 +11247,7 @@ static void lm_ggml_compute_forward_rope_f16( rope_yarn( theta_base, freq_scale, corr_dims, i0, ext_factor, attn_factor, &cos_theta, &sin_theta ); + sin_theta *= sin_sign; theta_base *= theta_scale; @@ -11250,6 +11274,7 @@ static void lm_ggml_compute_forward_rope_f16( theta_base, freq_scale, corr_dims, cur_rot, ext_factor, attn_factor, &cos_theta, &sin_theta ); + sin_theta *= sin_sign; theta_base *= theta_scale; @@ -11279,11 +11304,11 @@ static void lm_ggml_compute_forward_rope( switch (src0->type) { case LM_GGML_TYPE_F16: { - lm_ggml_compute_forward_rope_f16(params, src0, src1, dst); + lm_ggml_compute_forward_rope_f16(params, src0, src1, dst, true); } break; case LM_GGML_TYPE_F32: { - lm_ggml_compute_forward_rope_f32(params, src0, src1, dst); + lm_ggml_compute_forward_rope_f32(params, src0, src1, dst, true); } break; default: { @@ -11294,216 +11319,6 @@ static void lm_ggml_compute_forward_rope( // lm_ggml_compute_forward_rope_back -static void lm_ggml_compute_forward_rope_back_f32( - const struct lm_ggml_compute_params * params, - const struct lm_ggml_tensor * src0, - const struct lm_ggml_tensor * src1, - struct lm_ggml_tensor * dst) { - - if (params->type == LM_GGML_TASK_INIT || params->type == LM_GGML_TASK_FINALIZE) { - return; - } - - // y = rope(x, src1) - // dx = rope_back(dy, src1) - // src0 is dy, src1 contains options - - float freq_base; - float freq_scale; - - // these two only relevant for xPos RoPE: - float xpos_base; - bool xpos_down; - - //const int n_past = ((int32_t *) dst->op_params)[0]; - const int n_dims = ((int32_t *) dst->op_params)[1]; - const int mode = ((int32_t *) dst->op_params)[2]; - const int n_ctx = ((int32_t *) dst->op_params)[3]; UNUSED(n_ctx); - memcpy(&freq_base, (int32_t *) dst->op_params + 4, sizeof(float)); - memcpy(&freq_scale, (int32_t *) dst->op_params + 5, sizeof(float)); - memcpy(&xpos_base, (int32_t *) dst->op_params + 6, sizeof(float)); - memcpy(&xpos_down, (int32_t *) dst->op_params + 7, sizeof(bool)); - - LM_GGML_TENSOR_UNARY_OP_LOCALS - - //printf("ne0: %d, ne1: %d, ne2: %d, ne3: %d\n", ne0, ne1, ne2, ne3); - //printf("n_past = %d, ne2 = %d\n", n_past, ne2); - - assert(nb0 == sizeof(float)); - - const int ith = params->ith; - const int nth = params->nth; - - const int nr = lm_ggml_nrows(dst); - - // rows per thread - const int dr = (nr + nth - 1)/nth; - - // row range for this thread - const int ir0 = dr*ith; - const int ir1 = MIN(ir0 + dr, nr); - - // row index used to determine which thread to use - int ir = 0; - - const float theta_scale = powf(freq_base, -2.0f/n_dims); - - const bool is_neox = mode & 2; - - const int32_t * pos = (const int32_t *) src1->data; - - for (int64_t i3 = 0; i3 < ne3; i3++) { - for (int64_t i2 = 0; i2 < ne2; i2++) { - const int64_t p = pos[i2]; - for (int64_t i1 = 0; i1 < ne1; i1++) { - if (ir++ < ir0) continue; - if (ir > ir1) break; - - float theta_base = freq_scale * (float)p; 
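/*
 * Minimal standalone sketch (not code from this patch): why the new `sin_sign`
 * flag lets the forward RoPE kernels replace the dedicated *_rope_back_* ones
 * removed below. Each pair is rotated by theta; a 2-D rotation matrix is
 * orthogonal, so its inverse is its transpose, i.e. rotation by -theta, which
 * is the same formula with the sign of sin flipped. Values here are arbitrary
 * placeholders; compile with -lm.
 */
#include <math.h>
#include <stdio.h>

/* rotate the pair (v[0], v[1]) by theta; sin_sign = +1 forward, -1 backward */
static void rope_pair(float * v, float theta, float sin_sign) {
    const float cos_theta = cosf(theta);
    const float sin_theta = sinf(theta) * sin_sign;
    const float x0 = v[0];
    const float x1 = v[1];
    v[0] = x0*cos_theta - x1*sin_theta;
    v[1] = x0*sin_theta + x1*cos_theta;
}

int main(void) {
    float v[2] = { 0.25f, -1.5f };
    const float theta = 0.7f;
    rope_pair(v, theta,  1.0f);   /* forward rotation            */
    rope_pair(v, theta, -1.0f);   /* backward = inverse rotation */
    printf("recovered: %f %f\n", v[0], v[1]);   /* ~0.25, -1.5 */
    return 0;
}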
- - if (!is_neox) { - for (int64_t i0 = 0; i0 < ne0; i0 += 2) { - const float cos_theta = cosf(theta_base); - const float sin_theta = sinf(theta_base); - - // zeta scaling for xPos only: - float zeta = xpos_base != 0.0f ? powf((i0 + 0.4f * ne0) / (1.4f * ne0), p / xpos_base) : 1.0f; - if (xpos_down) zeta = 1.0f / zeta; - - theta_base *= theta_scale; - - const float * const dy = (float *)((char *) src0->data + i3*nb03 + i2*nb02 + i1*nb01 + i0*nb00); - float * dx = (float *)((char *) dst->data + i3*nb3 + i2*nb2 + i1*nb1 + i0*nb0); - - const float dy0 = dy[0]; - const float dy1 = dy[1]; - - dx[0] = dy0*cos_theta*zeta + dy1*sin_theta*zeta; - dx[1] = - dy0*sin_theta*zeta + dy1*cos_theta*zeta; - } - } else { - for (int64_t ib = 0; ib < ne0/n_dims; ++ib) { - for (int64_t ic = 0; ic < n_dims; ic += 2) { - const float cos_theta = cosf(theta_base); - const float sin_theta = sinf(theta_base); - - theta_base *= theta_scale; - - const int64_t i0 = ib*n_dims + ic/2; - - const float * const dy = (float *)((char *) src0->data + i3*nb03 + i2*nb02 + i1*nb01 + i0*nb00); - float * dx = (float *)((char *) dst->data + i3*nb3 + i2*nb2 + i1*nb1 + i0*nb0); - - const float dy0 = dy[0]; - const float dy1 = dy[n_dims/2]; - - dx[0] = dy0*cos_theta + dy1*sin_theta; - dx[n_dims/2] = - dy0*sin_theta + dy1*cos_theta; - } - } - } - } - } - } -} - -static void lm_ggml_compute_forward_rope_back_f16( - const struct lm_ggml_compute_params * params, - const struct lm_ggml_tensor * src0, - const struct lm_ggml_tensor * src1, - struct lm_ggml_tensor * dst) { - - if (params->type == LM_GGML_TASK_INIT || params->type == LM_GGML_TASK_FINALIZE) { - return; - } - - // y = rope(x, src1) - // dx = rope_back(dy, src1) - // src0 is dy, src1 contains options - - //const int n_past = ((int32_t *) dst->op_params)[0]; - const int n_dims = ((int32_t *) dst->op_params)[1]; - const int mode = ((int32_t *) dst->op_params)[2]; - - LM_GGML_TENSOR_UNARY_OP_LOCALS - - //printf("ne0: %d, ne1: %d, ne2: %d, ne3: %d\n", ne0, ne1, ne2, ne3); - //printf("n_past = %d, ne2 = %d\n", n_past, ne2); - - assert(nb0 == sizeof(lm_ggml_fp16_t)); - - const int ith = params->ith; - const int nth = params->nth; - - const int nr = lm_ggml_nrows(dst); - - // rows per thread - const int dr = (nr + nth - 1)/nth; - - // row range for this thread - const int ir0 = dr*ith; - const int ir1 = MIN(ir0 + dr, nr); - - // row index used to determine which thread to use - int ir = 0; - - const float theta_scale = powf(10000.0, -2.0f/n_dims); - - const bool is_neox = mode & 2; - - const int32_t * pos = (const int32_t *) src1->data; - - for (int64_t i3 = 0; i3 < ne3; i3++) { - for (int64_t i2 = 0; i2 < ne2; i2++) { - const int64_t p = pos[i2]; - for (int64_t i1 = 0; i1 < ne1; i1++) { - if (ir++ < ir0) continue; - if (ir > ir1) break; - - float theta_base = (float)p; - - if (!is_neox) { - for (int64_t i0 = 0; i0 < ne0; i0 += 2) { - const float cos_theta = cosf(theta_base); - const float sin_theta = sinf(theta_base); - - theta_base *= theta_scale; - - const lm_ggml_fp16_t * const dy = (lm_ggml_fp16_t *)((char *) src0->data + i3*nb03 + i2*nb02 + i1*nb01 + i0*nb00); - lm_ggml_fp16_t * dx = (lm_ggml_fp16_t *)((char *) dst->data + i3*nb3 + i2*nb2 + i1*nb1 + i0*nb0); - - const float dy0 = LM_GGML_FP16_TO_FP32(dy[0]); - const float dy1 = LM_GGML_FP16_TO_FP32(dy[1]); - - dx[0] = LM_GGML_FP32_TO_FP16( dy0*cos_theta + dy1*sin_theta); - dx[1] = LM_GGML_FP32_TO_FP16(-dy0*sin_theta + dy1*cos_theta); - } - } else { - for (int64_t ib = 0; ib < ne0/n_dims; ++ib) { - for (int64_t ic = 0; ic < n_dims; 
ic += 2) { - const float cos_theta = cosf(theta_base); - const float sin_theta = sinf(theta_base); - - theta_base *= theta_scale; - - const int64_t i0 = ib*n_dims + ic/2; - - const lm_ggml_fp16_t * const dy = (lm_ggml_fp16_t *)((char *) src0->data + i3*nb03 + i2*nb02 + i1*nb01 + i0*nb00); - lm_ggml_fp16_t * dx = (lm_ggml_fp16_t *)((char *) dst->data + i3*nb3 + i2*nb2 + i1*nb1 + i0*nb0); - - const float dy0 = LM_GGML_FP16_TO_FP32(dy[0]); - const float dy1 = LM_GGML_FP16_TO_FP32(dy[n_dims/2]); - - dx[0] = LM_GGML_FP32_TO_FP16( dy0*cos_theta + dy1*sin_theta); - dx[n_dims/2] = LM_GGML_FP32_TO_FP16(-dy0*sin_theta + dy1*cos_theta); - } - } - } - } - } - } -} - static void lm_ggml_compute_forward_rope_back( const struct lm_ggml_compute_params * params, const struct lm_ggml_tensor * src0, @@ -11512,11 +11327,11 @@ static void lm_ggml_compute_forward_rope_back( switch (src0->type) { case LM_GGML_TYPE_F16: { - lm_ggml_compute_forward_rope_back_f16(params, src0, src1, dst); + lm_ggml_compute_forward_rope_f16(params, src0, src1, dst, false); } break; case LM_GGML_TYPE_F32: { - lm_ggml_compute_forward_rope_back_f32(params, src0, src1, dst); + lm_ggml_compute_forward_rope_f32(params, src0, src1, dst, false); } break; default: { @@ -15559,17 +15374,20 @@ static void lm_ggml_compute_backward(struct lm_ggml_context * ctx, struct lm_ggm // necessary for llama if (src0->grad) { //const int n_past = ((int32_t *) tensor->op_params)[0]; - const int n_dims = ((int32_t *) tensor->op_params)[1]; - const int mode = ((int32_t *) tensor->op_params)[2]; - const int n_ctx = ((int32_t *) tensor->op_params)[3]; - float freq_base; - float freq_scale; - float xpos_base; - bool xpos_down; - memcpy(&freq_base, (int32_t *) tensor->op_params + 4, sizeof(float)); - memcpy(&freq_scale, (int32_t *) tensor->op_params + 5, sizeof(float)); - memcpy(&xpos_base, (int32_t *) tensor->op_params + 6, sizeof(float)); - memcpy(&xpos_down, (int32_t *) tensor->op_params + 7, sizeof(bool)); + const int n_dims = ((int32_t *) tensor->op_params)[1]; + const int mode = ((int32_t *) tensor->op_params)[2]; + const int n_ctx = ((int32_t *) tensor->op_params)[3]; + const int n_orig_ctx = ((int32_t *) tensor->op_params)[4]; + float freq_base, freq_scale, ext_factor, attn_factor, beta_fast, beta_slow, xpos_base, xpos_down; + + memcpy(&freq_base, (int32_t *) tensor->op_params + 5, sizeof(float)); + memcpy(&freq_scale, (int32_t *) tensor->op_params + 6, sizeof(float)); + memcpy(&ext_factor, (int32_t *) tensor->op_params + 7, sizeof(float)); + memcpy(&attn_factor, (int32_t *) tensor->op_params + 8, sizeof(float)); + memcpy(&beta_fast, (int32_t *) tensor->op_params + 9, sizeof(float)); + memcpy(&beta_slow, (int32_t *) tensor->op_params + 10, sizeof(float)); + memcpy(&xpos_base, (int32_t *) tensor->op_params + 11, sizeof(float)); + memcpy(&xpos_down, (int32_t *) tensor->op_params + 12, sizeof(bool)); src0->grad = lm_ggml_add_or_set(ctx, src0->grad, @@ -15579,8 +15397,13 @@ static void lm_ggml_compute_backward(struct lm_ggml_context * ctx, struct lm_ggm n_dims, mode, n_ctx, + n_orig_ctx, freq_base, freq_scale, + ext_factor, + attn_factor, + beta_fast, + beta_slow, xpos_base, xpos_down), zero_table); @@ -15590,17 +15413,20 @@ static void lm_ggml_compute_backward(struct lm_ggml_context * ctx, struct lm_ggm { if (src0->grad) { //const int n_past = ((int32_t *) tensor->op_params)[0]; - const int n_dims = ((int32_t *) tensor->op_params)[1]; - const int mode = ((int32_t *) tensor->op_params)[2]; - const int n_ctx = ((int32_t *) tensor->op_params)[3]; - float 
freq_base; - float freq_scale; - float xpos_base; - bool xpos_down; - memcpy(&freq_base, (int32_t *) tensor->op_params + 4, sizeof(float)); - memcpy(&freq_scale, (int32_t *) tensor->op_params + 5, sizeof(float)); - memcpy(&xpos_base, (int32_t *) tensor->op_params + 6, sizeof(float)); - memcpy(&xpos_down, (int32_t *) tensor->op_params + 7, sizeof(bool)); + const int n_dims = ((int32_t *) tensor->op_params)[1]; + const int mode = ((int32_t *) tensor->op_params)[2]; + const int n_ctx = ((int32_t *) tensor->op_params)[3]; + const int n_orig_ctx = ((int32_t *) tensor->op_params)[4]; + float freq_base, freq_scale, ext_factor, attn_factor, beta_fast, beta_slow, xpos_base, xpos_down; + + memcpy(&freq_base, (int32_t *) tensor->op_params + 5, sizeof(float)); + memcpy(&freq_scale, (int32_t *) tensor->op_params + 6, sizeof(float)); + memcpy(&ext_factor, (int32_t *) tensor->op_params + 7, sizeof(float)); + memcpy(&attn_factor, (int32_t *) tensor->op_params + 8, sizeof(float)); + memcpy(&beta_fast, (int32_t *) tensor->op_params + 9, sizeof(float)); + memcpy(&beta_slow, (int32_t *) tensor->op_params + 10, sizeof(float)); + memcpy(&xpos_base, (int32_t *) tensor->op_params + 11, sizeof(float)); + memcpy(&xpos_down, (int32_t *) tensor->op_params + 12, sizeof(bool)); src0->grad = lm_ggml_add_or_set(ctx, src0->grad, @@ -15609,14 +15435,14 @@ static void lm_ggml_compute_backward(struct lm_ggml_context * ctx, struct lm_ggm src1, n_dims, mode, - 0, n_ctx, + n_orig_ctx, freq_base, freq_scale, - 0.0f, - 1.0f, - 0.0f, - 0.0f, + ext_factor, + attn_factor, + beta_fast, + beta_slow, xpos_base, xpos_down, false), diff --git a/cpp/ggml.h b/cpp/ggml.h index d13ee9b0..3cbd5d8e 100644 --- a/cpp/ggml.h +++ b/cpp/ggml.h @@ -1372,8 +1372,13 @@ extern "C" { int n_dims, int mode, int n_ctx, + int n_orig_ctx, float freq_base, float freq_scale, + float ext_factor, + float attn_factor, + float beta_fast, + float beta_slow, float xpos_base, bool xpos_down); diff --git a/cpp/llama.cpp b/cpp/llama.cpp index 6abc8fc7..b62b25a0 100644 --- a/cpp/llama.cpp +++ b/cpp/llama.cpp @@ -607,19 +607,37 @@ static void lm_ggml_graph_compute_helper(std::vector & buf, lm_ggml_cgr // llama helpers // +inline void * llama_host_malloc(size_t n) { #ifdef LM_GGML_USE_CUBLAS -# define llama_host_malloc(n) lm_ggml_cuda_host_malloc(n) -# define llama_host_free(data) lm_ggml_cuda_host_free(data) + if (lm_ggml_cublas_loaded()) { + return lm_ggml_cuda_host_malloc(n); + } else { + return malloc(n); + } #elif LM_GGML_USE_METAL -# define llama_host_malloc(n) lm_ggml_metal_host_malloc(n) -# define llama_host_free(data) lm_ggml_metal_host_free(data) + return lm_ggml_metal_host_malloc(n); #elif LM_GGML_USE_CPU_HBM -# define llama_host_malloc(n) hbw_malloc(n) -# define llama_host_free(data) if (data != NULL) hbw_free(data) + return hbw_malloc(n); #else -# define llama_host_malloc(n) malloc(n) -# define llama_host_free(data) free(data) + return malloc(n); #endif +} + +inline void llama_host_free(void * ptr) { +#ifdef LM_GGML_USE_CUBLAS + if (lm_ggml_cublas_loaded()) { + return lm_ggml_cuda_host_free(ptr); + } else { + return free(ptr); + } +#elif LM_GGML_USE_METAL + return lm_ggml_metal_host_free(ptr); +#elif LM_GGML_USE_CPU_HBM + return hbw_free(ptr); +#else + return free(ptr); +#endif +} #if defined(_WIN32) static std::string llama_format_win_err(DWORD err) { @@ -1211,9 +1229,11 @@ struct llama_kv_cache { } #ifdef LM_GGML_USE_CUBLAS - lm_ggml_cuda_free_data(k); - lm_ggml_cuda_free_data(v); -#endif // LM_GGML_USE_CUBLAS + if (lm_ggml_cublas_loaded()) { + 
lm_ggml_cuda_free_data(k); + lm_ggml_cuda_free_data(v); + } +#endif } }; @@ -1313,11 +1333,15 @@ struct llama_model { } #ifdef LM_GGML_USE_CUBLAS - for (size_t i = 0; i < tensors_by_name.size(); ++i) { - lm_ggml_cuda_free_data(tensors_by_name[i].second); + if (lm_ggml_cublas_loaded()) { + for (size_t i = 0; i < tensors_by_name.size(); ++i) { + lm_ggml_cuda_free_data(tensors_by_name[i].second); + } + lm_ggml_cuda_free_scratch(); } - lm_ggml_cuda_free_scratch(); -#elif defined(LM_GGML_USE_CLBLAST) +#endif + +#if defined(LM_GGML_USE_CLBLAST) for (size_t i = 0; i < tensors_by_name.size(); ++i) { lm_ggml_cl_free_data(tensors_by_name[i].second); } @@ -1429,23 +1453,26 @@ static bool llama_kv_cache_init( lm_ggml_set_name(cache.v, "cache_v"); (void) n_gpu_layers; + #ifdef LM_GGML_USE_CUBLAS - size_t vram_kv_cache = 0; + if (lm_ggml_cublas_loaded()) { + size_t vram_kv_cache = 0; - if (n_gpu_layers > (int)n_layer + 1) { - lm_ggml_cuda_assign_buffers_no_scratch(cache.v); - LLAMA_LOG_INFO("%s: offloading v cache to GPU\n", __func__); - vram_kv_cache += lm_ggml_nbytes(cache.v); - } - if (n_gpu_layers > (int)n_layer + 2) { - lm_ggml_cuda_assign_buffers_no_scratch(cache.k); - LLAMA_LOG_INFO("%s: offloading k cache to GPU\n", __func__); - vram_kv_cache += lm_ggml_nbytes(cache.k); - } - if (vram_kv_cache > 0) { - LLAMA_LOG_INFO("%s: VRAM kv self = %.2f MB\n", __func__, vram_kv_cache / 1024.0 / 1024.0); + if (n_gpu_layers > (int)n_layer + 1) { + lm_ggml_cuda_assign_buffers_no_scratch(cache.v); + LLAMA_LOG_INFO("%s: offloading v cache to GPU\n", __func__); + vram_kv_cache += lm_ggml_nbytes(cache.v); + } + if (n_gpu_layers > (int)n_layer + 2) { + lm_ggml_cuda_assign_buffers_no_scratch(cache.k); + LLAMA_LOG_INFO("%s: offloading k cache to GPU\n", __func__); + vram_kv_cache += lm_ggml_nbytes(cache.k); + } + if (vram_kv_cache > 0) { + LLAMA_LOG_INFO("%s: VRAM kv self = %.2f MB\n", __func__, vram_kv_cache / 1024.0 / 1024.0); + } } -#endif // LM_GGML_USE_CUBLAS +#endif return true; } @@ -2532,18 +2559,22 @@ static void llm_load_tensors( } (void) main_gpu; + + enum lm_ggml_backend_type llama_backend_offload = LM_GGML_BACKEND_CPU; + enum lm_ggml_backend_type llama_backend_offload_split = LM_GGML_BACKEND_CPU; + #ifdef LM_GGML_USE_CUBLAS - LLAMA_LOG_INFO("%s: using " LM_GGML_CUDA_NAME " for GPU acceleration\n", __func__); - lm_ggml_cuda_set_main_device(main_gpu); -#define LLAMA_BACKEND_OFFLOAD LM_GGML_BACKEND_GPU -#define LLAMA_BACKEND_OFFLOAD_SPLIT LM_GGML_BACKEND_GPU_SPLIT + if (lm_ggml_cublas_loaded()) { + LLAMA_LOG_INFO("%s: using " LM_GGML_CUDA_NAME " for GPU acceleration\n", __func__); + lm_ggml_cuda_set_main_device(main_gpu); + + llama_backend_offload = LM_GGML_BACKEND_GPU; + llama_backend_offload_split = LM_GGML_BACKEND_GPU_SPLIT; + } #elif defined(LM_GGML_USE_CLBLAST) - LLAMA_LOG_INFO("%s: using OpenCL for GPU acceleration\n", __func__); -#define LLAMA_BACKEND_OFFLOAD LM_GGML_BACKEND_GPU -#define LLAMA_BACKEND_OFFLOAD_SPLIT LM_GGML_BACKEND_GPU -#else -#define LLAMA_BACKEND_OFFLOAD LM_GGML_BACKEND_CPU -#define LLAMA_BACKEND_OFFLOAD_SPLIT LM_GGML_BACKEND_CPU + LLAMA_LOG_INFO("%s: using OpenCL for GPU acceleration\n", __func__); + llama_backend_offload = LM_GGML_BACKEND_GPU; + llama_backend_offload_split = LM_GGML_BACKEND_GPU; #endif // prepare memory for the weights @@ -2570,12 +2601,12 @@ static void llm_load_tensors( // norm is not performance relevant on its own but keeping it in VRAM reduces data copying // on Windows however this is detrimental unless everything is on the GPU #ifndef _WIN32 - backend_norm 
= LLAMA_BACKEND_OFFLOAD; + backend_norm = llama_backend_offload; #else - backend_norm = n_gpu_layers <= (int) n_layer + 2 ? LM_GGML_BACKEND_CPU : LLAMA_BACKEND_OFFLOAD; + backend_norm = n_gpu_layers <= (int) n_layer + 2 ? LM_GGML_BACKEND_CPU : llama_backend_offload; #endif // _WIN32 - backend_output = LLAMA_BACKEND_OFFLOAD_SPLIT; + backend_output = llama_backend_offload_split; } else { backend_norm = LM_GGML_BACKEND_CPU; backend_output = LM_GGML_BACKEND_CPU; @@ -2599,8 +2630,8 @@ static void llm_load_tensors( model.layers.resize(n_layer); for (uint32_t i = 0; i < n_layer; ++i) { - const lm_ggml_backend_type backend = int(i) < i_gpu_start ? LM_GGML_BACKEND_CPU : LLAMA_BACKEND_OFFLOAD; // NOLINT - const lm_ggml_backend_type backend_split = int(i) < i_gpu_start ? LM_GGML_BACKEND_CPU : LLAMA_BACKEND_OFFLOAD_SPLIT; // NOLINT + const lm_ggml_backend_type backend = int(i) < i_gpu_start ? LM_GGML_BACKEND_CPU : llama_backend_offload; // NOLINT + const lm_ggml_backend_type backend_split = int(i) < i_gpu_start ? LM_GGML_BACKEND_CPU : llama_backend_offload_split; // NOLINT auto & layer = model.layers[i]; @@ -2636,12 +2667,12 @@ static void llm_load_tensors( // norm is not performance relevant on its own but keeping it in VRAM reduces data copying // on Windows however this is detrimental unless everything is on the GPU #ifndef _WIN32 - backend_norm = LLAMA_BACKEND_OFFLOAD; + backend_norm = llama_backend_offload; #else - backend_norm = n_gpu_layers <= (int) n_layer + 2 ? LM_GGML_BACKEND_CPU : LLAMA_BACKEND_OFFLOAD; + backend_norm = n_gpu_layers <= (int) n_layer + 2 ? LM_GGML_BACKEND_CPU : llama_backend_offload; #endif // _WIN32 - backend_output = LLAMA_BACKEND_OFFLOAD_SPLIT; + backend_output = llama_backend_offload_split; } else { backend_norm = LM_GGML_BACKEND_CPU; backend_output = LM_GGML_BACKEND_CPU; @@ -2665,8 +2696,8 @@ static void llm_load_tensors( model.layers.resize(n_layer); for (uint32_t i = 0; i < n_layer; ++i) { - const lm_ggml_backend_type backend = int(i) < i_gpu_start ? LM_GGML_BACKEND_CPU : LLAMA_BACKEND_OFFLOAD; // NOLINT - const lm_ggml_backend_type backend_split = int(i) < i_gpu_start ? LM_GGML_BACKEND_CPU : LLAMA_BACKEND_OFFLOAD_SPLIT; // NOLINT + const lm_ggml_backend_type backend = int(i) < i_gpu_start ? LM_GGML_BACKEND_CPU : llama_backend_offload; // NOLINT + const lm_ggml_backend_type backend_split = int(i) < i_gpu_start ? LM_GGML_BACKEND_CPU : llama_backend_offload_split; // NOLINT auto & layer = model.layers[i]; @@ -2706,12 +2737,12 @@ static void llm_load_tensors( // norm is not performance relevant on its own but keeping it in VRAM reduces data copying // on Windows however this is detrimental unless everything is on the GPU #ifndef _WIN32 - backend_norm = LLAMA_BACKEND_OFFLOAD; + backend_norm = llama_backend_offload; #else - backend_norm = n_gpu_layers <= (int) n_layer + 2 ? LM_GGML_BACKEND_CPU : LLAMA_BACKEND_OFFLOAD; + backend_norm = n_gpu_layers <= (int) n_layer + 2 ? LM_GGML_BACKEND_CPU : llama_backend_offload; #endif // _WIN32 - backend_output = LLAMA_BACKEND_OFFLOAD_SPLIT; + backend_output = llama_backend_offload_split; } else { backend_norm = LM_GGML_BACKEND_CPU; backend_output = LM_GGML_BACKEND_CPU; @@ -2737,8 +2768,8 @@ static void llm_load_tensors( model.layers.resize(n_layer); for (uint32_t i = 0; i < n_layer; ++i) { - const lm_ggml_backend_type backend = int(i) < i_gpu_start ? LM_GGML_BACKEND_CPU : LLAMA_BACKEND_OFFLOAD; // NOLINT - const lm_ggml_backend_type backend_split = int(i) < i_gpu_start ? 
LM_GGML_BACKEND_CPU : LLAMA_BACKEND_OFFLOAD_SPLIT; // NOLINT + const lm_ggml_backend_type backend = int(i) < i_gpu_start ? LM_GGML_BACKEND_CPU : llama_backend_offload; // NOLINT + const lm_ggml_backend_type backend_split = int(i) < i_gpu_start ? LM_GGML_BACKEND_CPU : llama_backend_offload_split; // NOLINT auto & layer = model.layers[i]; @@ -2783,12 +2814,12 @@ static void llm_load_tensors( // norm is not performance relevant on its own but keeping it in VRAM reduces data copying // on Windows however this is detrimental unless everything is on the GPU #ifndef _WIN32 - backend_norm = LLAMA_BACKEND_OFFLOAD; + backend_norm = llama_backend_offload; #else - backend_norm = n_gpu_layers <= (int) n_layer + 2 ? LM_GGML_BACKEND_CPU : LLAMA_BACKEND_OFFLOAD; + backend_norm = n_gpu_layers <= (int) n_layer + 2 ? LM_GGML_BACKEND_CPU : llama_backend_offload; #endif // _WIN32 - backend_output = LLAMA_BACKEND_OFFLOAD_SPLIT; + backend_output = llama_backend_offload_split; } else { backend_norm = LM_GGML_BACKEND_CPU; backend_output = LM_GGML_BACKEND_CPU; @@ -2814,8 +2845,8 @@ static void llm_load_tensors( model.layers.resize(n_layer); for (uint32_t i = 0; i < n_layer; ++i) { - const lm_ggml_backend_type backend = int(i) < i_gpu_start ? LM_GGML_BACKEND_CPU : LLAMA_BACKEND_OFFLOAD; // NOLINT - const lm_ggml_backend_type backend_split = int(i) < i_gpu_start ? LM_GGML_BACKEND_CPU : LLAMA_BACKEND_OFFLOAD_SPLIT; // NOLINT + const lm_ggml_backend_type backend = int(i) < i_gpu_start ? LM_GGML_BACKEND_CPU : llama_backend_offload; // NOLINT + const lm_ggml_backend_type backend_split = int(i) < i_gpu_start ? LM_GGML_BACKEND_CPU : llama_backend_offload_split; // NOLINT auto & layer = model.layers[i]; @@ -2860,12 +2891,12 @@ static void llm_load_tensors( // norm is not performance relevant on its own but keeping it in VRAM reduces data copying // on Windows however this is detrimental unless everything is on the GPU #ifndef _WIN32 - backend_norm = LLAMA_BACKEND_OFFLOAD; + backend_norm = llama_backend_offload; #else - backend_norm = n_gpu_layers <= (int) n_layer + 2 ? LM_GGML_BACKEND_CPU : LLAMA_BACKEND_OFFLOAD; + backend_norm = n_gpu_layers <= (int) n_layer + 2 ? LM_GGML_BACKEND_CPU : llama_backend_offload; #endif // _WIN32 - backend_output = LLAMA_BACKEND_OFFLOAD_SPLIT; + backend_output = llama_backend_offload_split; } else { backend_norm = LM_GGML_BACKEND_CPU; backend_output = LM_GGML_BACKEND_CPU; @@ -2888,8 +2919,8 @@ static void llm_load_tensors( const int i_gpu_start = n_layer - n_gpu_layers; model.layers.resize(n_layer); for (uint32_t i = 0; i < n_layer; ++i) { - const lm_ggml_backend_type backend = int(i) < i_gpu_start ? LM_GGML_BACKEND_CPU : LLAMA_BACKEND_OFFLOAD; - const lm_ggml_backend_type backend_split = int(i) < i_gpu_start ? LM_GGML_BACKEND_CPU : LLAMA_BACKEND_OFFLOAD_SPLIT; + const lm_ggml_backend_type backend = int(i) < i_gpu_start ? LM_GGML_BACKEND_CPU : llama_backend_offload; + const lm_ggml_backend_type backend_split = int(i) < i_gpu_start ? 
LM_GGML_BACKEND_CPU : llama_backend_offload_split; auto & layer = model.layers[i]; layer.attn_norm = ml.create_tensor(ctx, tn(LLM_TENSOR_ATTN_NORM, "weight", i), {n_embd}, backend); layer.attn_norm_b = ml.create_tensor(ctx, tn(LLM_TENSOR_ATTN_NORM, "bias", i), {n_embd}, backend); @@ -2926,12 +2957,12 @@ static void llm_load_tensors( // norm is not performance relevant on its own but keeping it in VRAM reduces data copying // on Windows however this is detrimental unless everything is on the GPU #ifndef _WIN32 - backend_norm = LLAMA_BACKEND_OFFLOAD; + backend_norm = llama_backend_offload; #else - backend_norm = n_gpu_layers <= (int) n_layer + 2 ? LM_GGML_BACKEND_CPU : LLAMA_BACKEND_OFFLOAD; + backend_norm = n_gpu_layers <= (int) n_layer + 2 ? LM_GGML_BACKEND_CPU : llama_backend_offload; #endif // _WIN32 - backend_output = LLAMA_BACKEND_OFFLOAD_SPLIT; + backend_output = llama_backend_offload_split; } else { backend_norm = LM_GGML_BACKEND_CPU; backend_output = LM_GGML_BACKEND_CPU; @@ -2957,8 +2988,8 @@ static void llm_load_tensors( model.layers.resize(n_layer); for (uint32_t i = 0; i < n_layer; ++i) { - const lm_ggml_backend_type backend = int(i) < i_gpu_start ? LM_GGML_BACKEND_CPU : LLAMA_BACKEND_OFFLOAD; // NOLINT - const lm_ggml_backend_type backend_split = int(i) < i_gpu_start ? LM_GGML_BACKEND_CPU : LLAMA_BACKEND_OFFLOAD_SPLIT; // NOLINT + const lm_ggml_backend_type backend = int(i) < i_gpu_start ? LM_GGML_BACKEND_CPU : llama_backend_offload; // NOLINT + const lm_ggml_backend_type backend_split = int(i) < i_gpu_start ? LM_GGML_BACKEND_CPU : llama_backend_offload_split; // NOLINT auto & layer = model.layers[i]; @@ -3004,12 +3035,12 @@ static void llm_load_tensors( // norm is not performance relevant on its own but keeping it in VRAM reduces data copying // on Windows however this is detrimental unless everything is on the GPU #ifndef _WIN32 - backend_norm = LLAMA_BACKEND_OFFLOAD; + backend_norm = llama_backend_offload; #else - backend_norm = n_gpu_layers <= (int) n_layer + 2 ? LM_GGML_BACKEND_CPU : LLAMA_BACKEND_OFFLOAD; + backend_norm = n_gpu_layers <= (int) n_layer + 2 ? LM_GGML_BACKEND_CPU : llama_backend_offload; #endif // _WIN32 - backend_output = LLAMA_BACKEND_OFFLOAD_SPLIT; + backend_output = llama_backend_offload_split; } else { backend_norm = LM_GGML_BACKEND_CPU; backend_output = LM_GGML_BACKEND_CPU; @@ -3033,8 +3064,8 @@ static void llm_load_tensors( model.layers.resize(n_layer); for (uint32_t i = 0; i < n_layer; ++i) { - const lm_ggml_backend_type backend = int(i) < i_gpu_start ? LM_GGML_BACKEND_CPU : LLAMA_BACKEND_OFFLOAD; // NOLINT - const lm_ggml_backend_type backend_split = int(i) < i_gpu_start ? LM_GGML_BACKEND_CPU : LLAMA_BACKEND_OFFLOAD_SPLIT; // NOLINT + const lm_ggml_backend_type backend = int(i) < i_gpu_start ? LM_GGML_BACKEND_CPU : llama_backend_offload; // NOLINT + const lm_ggml_backend_type backend_split = int(i) < i_gpu_start ? 
LM_GGML_BACKEND_CPU : llama_backend_offload_split; // NOLINT auto & layer = model.layers[i]; diff --git a/llama.cpp b/llama.cpp index 381efbf4..a75fa576 160000 --- a/llama.cpp +++ b/llama.cpp @@ -1 +1 @@ -Subproject commit 381efbf480959bb6d1e247a8b0c2328f22e350f8 +Subproject commit a75fa576abba9d37f463580c379e4bbf1e1ad03c diff --git a/scripts/llama.cpp.patch b/scripts/llama.cpp.patch index 028665bf..649b1868 100644 --- a/scripts/llama.cpp.patch +++ b/scripts/llama.cpp.patch @@ -1,9 +1,9 @@ ---- llama.cpp.orig 2023-11-03 14:26:21 -+++ llama.cpp 2023-11-03 14:26:22 +--- llama.cpp.orig 2023-11-10 13:36:00 ++++ llama.cpp 2023-11-10 13:36:02 @@ -103,6 +103,17 @@ #define LLAMA_LOG_WARN(...) llama_log_internal(LM_GGML_LOG_LEVEL_WARN , __VA_ARGS__) #define LLAMA_LOG_ERROR(...) llama_log_internal(LM_GGML_LOG_LEVEL_ERROR, __VA_ARGS__) - + +#if defined(__ANDROID__) && defined(RNLLAMA_ANDROID_ENABLE_LOGGING) +#include +#define LLAMA_ANDROID_TAG "RNLLAMA_LOG_ANDROID" @@ -18,8 +18,8 @@ // // helpers // -@@ -761,16 +772,16 @@ - +@@ -779,16 +790,16 @@ + if (prefetch > 0) { // Advise the kernel to preload the mapped memory - if (posix_madvise(addr, std::min(file->size, prefetch), POSIX_MADV_WILLNEED)) {
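/*
 * Minimal standalone sketch of the widened op_params layout that
 * lm_ggml_rope_back now writes and lm_ggml_compute_backward reads back:
 * thirteen 32-bit slots, the five integer arguments stored directly in
 * slots 0..4 and the float/bool arguments bit-copied into slots 5..12 with
 * memcpy. The numeric values below are placeholders for illustration only;
 * it assumes sizeof(float) == sizeof(int32_t), as the packing itself does.
 */
#include <stdbool.h>
#include <stdint.h>
#include <stdio.h>
#include <string.h>

int main(void) {
    int32_t params[13] = { /*n_past*/ 0, /*n_dims*/ 128, /*mode*/ 0, /*n_ctx*/ 4096, /*n_orig_ctx*/ 4096 };

    const float freq_base   = 10000.0f;
    const float freq_scale  = 1.0f;
    const float ext_factor  = 0.0f;
    const float attn_factor = 1.0f;
    const float beta_fast   = 32.0f;
    const float beta_slow   = 1.0f;
    const float xpos_base   = 0.0f;
    const bool  xpos_down   = false;

    /* pack, mirroring lm_ggml_rope_back */
    memcpy(params +  5, &freq_base,   sizeof(float));
    memcpy(params +  6, &freq_scale,  sizeof(float));
    memcpy(params +  7, &ext_factor,  sizeof(float));
    memcpy(params +  8, &attn_factor, sizeof(float));
    memcpy(params +  9, &beta_fast,   sizeof(float));
    memcpy(params + 10, &beta_slow,   sizeof(float));
    memcpy(params + 11, &xpos_base,   sizeof(float));
    memcpy(params + 12, &xpos_down,   sizeof(bool));

    /* unpack, mirroring the backward pass */
    float fb;
    memcpy(&fb, params + 5, sizeof(float));
    printf("n_orig_ctx = %d, freq_base = %.1f\n", (int) params[4], fb);
    return 0;
}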