diff --git a/README.md b/README.md index a7101525648b0..6302ac977aa94 100644 --- a/README.md +++ b/README.md @@ -99,6 +99,7 @@ Instructions for adding support for new models: [HOWTO-add-model.md](docs/develo - [x] [Jais](https://huggingface.co/inceptionai/jais-13b-chat) - [x] [Bielik-11B-v2.3](https://huggingface.co/collections/speakleash/bielik-11b-v23-66ee813238d9b526a072408a) - [x] [RWKV-6](https://github.com/BlinkDL/RWKV-LM) +- [x] [QRWKV-6](https://huggingface.co/recursal/QRWKV6-32B-Instruct-Preview-v0.1) - [x] [GigaChat-20B-A3B](https://huggingface.co/ai-sage/GigaChat-20B-A3B-instruct) #### Multimodal diff --git a/convert_hf_to_gguf.py b/convert_hf_to_gguf.py index 5562499aa4925..cf317eeae608a 100755 --- a/convert_hf_to_gguf.py +++ b/convert_hf_to_gguf.py @@ -326,6 +326,7 @@ def prepare_tensors(self): gguf.MODEL_TENSOR.TIME_MIX_W2, gguf.MODEL_TENSOR.TIME_MIX_DECAY_W1, gguf.MODEL_TENSOR.TIME_MIX_DECAY_W2, + gguf.MODEL_TENSOR.TIME_MIX_LERP_FUSED, gguf.MODEL_TENSOR.POSNET_NORM1, gguf.MODEL_TENSOR.POSNET_NORM2, ) @@ -3316,6 +3317,8 @@ def set_gguf_parameters(self): # required by llama.cpp, unused self.gguf_writer.add_head_count(0) + lerp_weights: dict[int, dict[str, Tensor]] = {} + def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iterable[tuple[str, Tensor]]: new_name = self.map_tensor_name(name) @@ -3331,14 +3334,84 @@ def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iter if new_name.endswith("time_mix_decay.weight") or "lerp" in new_name: data_torch = data_torch.squeeze() - rescale_every_n_layers = self.hparams["rescale_every"] - if rescale_every_n_layers > 0: - if new_name.endswith("time_mix_output.weight") or new_name.endswith("channel_mix_value.weight"): - data_torch = data_torch.div_(2 ** int(bid // rescale_every_n_layers)) + try: + rescale_every_n_layers = self.hparams["rescale_every"] + if rescale_every_n_layers > 0: + if new_name.endswith("time_mix_output.weight") or new_name.endswith("channel_mix_value.weight"): + data_torch = data_torch.div_(2 ** int(bid // rescale_every_n_layers)) + except KeyError: + pass + + # concat time_mix_lerp weights to reduce some cpu overhead + # also reduces the number of tensors in the model + if bid is not None and "time_mix_lerp" in new_name and "time_mix_lerp_x" not in new_name: + try: + self.lerp_weights[bid][new_name] = data_torch + except KeyError: + self.lerp_weights[bid] = {new_name: data_torch} + if all(f"blk.{bid}.time_mix_lerp_{i}.weight" in self.lerp_weights[bid].keys() for i in ["w", "k", "v", "r", "g"]): + new_name = f"blk.{bid}.time_mix_lerp_fused.weight" + data = torch.stack([self.lerp_weights[bid][f"blk.{bid}.time_mix_lerp_{i}.weight"].unsqueeze(0) for i in ["w", "k", "v", "r", "g"]], dim=0).unsqueeze(1) + yield (new_name, data) + return yield (new_name, data_torch) +@Model.register("RWKV6Qwen2ForCausalLM") +class RWKV6Qwen2Model(Rwkv6Model): + model_arch = gguf.MODEL_ARCH.RWKV6QWEN2 + + def set_vocab(self): + try: + self._set_vocab_sentencepiece() + except FileNotFoundError: + self._set_vocab_gpt2() + + def set_gguf_parameters(self): + block_count = self.hparams["num_hidden_layers"] + num_attention_heads = self.hparams["num_attention_heads"] + num_key_value_heads = self.hparams["num_key_value_heads"] + hidden_size = self.hparams["hidden_size"] + head_size = hidden_size // num_attention_heads + rms_norm_eps = self.hparams["rms_norm_eps"] + intermediate_size = self.hparams["intermediate_size"] + time_mix_extra_dim = 64 if hidden_size >= 4096 else 32 + time_decay_extra_dim = 128 
if hidden_size >= 4096 else 64 + + # RWKV isn't context limited + self.gguf_writer.add_context_length(1048576) + self.gguf_writer.add_embedding_length(hidden_size) + self.gguf_writer.add_block_count(block_count) + self.gguf_writer.add_wkv_head_size(head_size) + self.gguf_writer.add_time_mix_extra_dim(time_mix_extra_dim) + self.gguf_writer.add_time_decay_extra_dim(time_decay_extra_dim) + self.gguf_writer.add_feed_forward_length(intermediate_size) + self.gguf_writer.add_file_type(self.ftype) + + # special parameters for time_mixing in RWKV6QWEN2 + self.gguf_writer.add_layer_norm_rms_eps(rms_norm_eps) + self.gguf_writer.add_token_shift_count(1) + # RWKV6QWEN2 use grouped key/value like GQA + self.gguf_writer.add_head_count_kv(num_key_value_heads) + + # required by llama.cpp, unused + self.gguf_writer.add_head_count(0) + + def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iterable[tuple[str, Tensor]]: + for new_name, data in super().modify_tensors(data_torch, name, bid): + if "time_mix_w1" in new_name or "time_mix_w2" in new_name: + data = data.view(5, -1, data.shape[-1]) + # rwkv6qwen2 has a different order of rkvwg instead of the original wkvrg + # permute them here to avoid code changes + data = torch.stack([data[3], data[1], data[2], data[0], data[4]], dim=0).view(-1, data.shape[-1]) + if "w2" in new_name: + data = data.view(5, -1, data.shape[-1]) + yield (new_name, data) + continue + yield (new_name, data) + + @Model.register("MambaForCausalLM", "MambaLMHeadModel", "FalconMambaForCausalLM") class MambaModel(Model): model_arch = gguf.MODEL_ARCH.MAMBA diff --git a/ggml/include/ggml.h b/ggml/include/ggml.h index 8630d92c5c6a4..8f8cb9e1aa140 100644 --- a/ggml/include/ggml.h +++ b/ggml/include/ggml.h @@ -501,6 +501,7 @@ extern "C" { GGML_OP_GET_REL_POS, GGML_OP_ADD_REL_POS, GGML_OP_RWKV_WKV6, + GGML_OP_GATED_LINEAR_ATTN, GGML_OP_UNARY, @@ -1859,6 +1860,15 @@ extern "C" { struct ggml_tensor * td, struct ggml_tensor * state); + GGML_API struct ggml_tensor * ggml_gated_linear_attn( + struct ggml_context * ctx, + struct ggml_tensor * k, + struct ggml_tensor * v, + struct ggml_tensor * q, + struct ggml_tensor * g, + struct ggml_tensor * state, + float scale); + // custom operators typedef void (*ggml_unary_op_f32_t) (const int, float *, const float *); diff --git a/ggml/src/ggml-cpu/ggml-cpu.c b/ggml/src/ggml-cpu/ggml-cpu.c index b7fefb9ddfd89..2966ff7682de2 100644 --- a/ggml/src/ggml-cpu/ggml-cpu.c +++ b/ggml/src/ggml-cpu/ggml-cpu.c @@ -11803,9 +11803,9 @@ static void ggml_compute_forward_add_rel_pos( static void ggml_compute_forward_rwkv_wkv6_f32( const struct ggml_compute_params * params, struct ggml_tensor * dst) { - const int64_t T = dst->src[1]->ne[3]; + const int64_t T = dst->src[1]->ne[2]; const int64_t C = dst->ne[0]; - const int64_t HEADS = dst->src[1]->ne[2]; + const int64_t HEADS = dst->src[1]->ne[1]; const int64_t n_seqs = dst->src[5]->ne[1]; const int64_t head_size = C / HEADS; @@ -12000,6 +12000,197 @@ static void ggml_compute_forward_rwkv_wkv6( } } +// ggml_compute_forward_gla + +static void ggml_compute_forward_gla_f32( + const struct ggml_compute_params * params, + struct ggml_tensor * dst) { + const int64_t T = dst->src[1]->ne[2]; + const int64_t C = dst->ne[0]; + const int64_t HEADS = dst->src[1]->ne[1]; + const int64_t n_seqs = dst->src[4]->ne[1]; + const int64_t head_size = C / HEADS; + const float scale = ggml_get_op_params_f32(dst, 0); + + float * dst_data = (float *) dst->data; + float * state = ((float *) dst->data) + C * T; + + const int ith = 
params->ith; + const int nth = params->nth; + + if (ith >= HEADS) { + return; + } + + const int h_start = (HEADS * ith) / nth; + const int h_end = ((HEADS * (ith + 1)) / nth < HEADS) ? + (HEADS * (ith + 1)) / nth : HEADS; + + float * k = (float *) dst->src[0]->data; + float * v = (float *) dst->src[1]->data; + float * q = (float *) dst->src[2]->data; + float * g = (float *) dst->src[3]->data; + + size_t t_stride = HEADS * head_size; // Same to C + + size_t h_stride = C / HEADS; + GGML_ASSERT(C % HEADS == 0); // C must be divisible by HEADS + size_t h_stride_2d = head_size * head_size; + + if (ith == 0) { + memset(dst_data, 0, T * C * sizeof(float)); + } + ggml_barrier(params->threadpool); + + + #if defined(__AVX__) && !defined(__AVX512F__) + #define GGML_F32X GGML_F32x8 + #define GGML_F32X_SET1 GGML_F32x8_SET1 + #define GGML_F32X_LOAD GGML_F32x8_LOAD + #define GGML_F32X_STORE GGML_F32x8_STORE + #define GGML_F32X_MUL GGML_F32x8_MUL + #define GGML_F32X_FMA GGML_F32x8_FMA + #define GLA_VECTOR_SIZE 8 + #elif defined(__AVX512F__) + #define GGML_F32X GGML_F32x16 + #define GGML_F32X_SET1 GGML_F32x16_SET1 + #define GGML_F32X_LOAD GGML_F32x16_LOAD + #define GGML_F32X_STORE GGML_F32x16_STORE + #define GGML_F32X_MUL GGML_F32x16_MUL + #define GGML_F32X_FMA GGML_F32x16_FMA + #define GLA_VECTOR_SIZE 16 + #elif defined(__ARM_NEON) && defined(__aarch64__) + #define GGML_F32X GGML_F32x4 + #define GGML_F32X_SET1 GGML_F32x4_SET1 + #define GGML_F32X_LOAD GGML_F32x4_LOAD + #define GGML_F32X_STORE GGML_F32x4_STORE + #define GGML_F32X_MUL GGML_F32x4_MUL + #define GGML_F32X_FMA GGML_F32x4_FMA + #define GLA_VECTOR_SIZE 4 + #endif + + #ifdef GLA_VECTOR_SIZE + const int64_t vec_count = head_size / GLA_VECTOR_SIZE; + + for (int64_t t = 0; t < T; t++) { + size_t t_offset = t * t_stride; + size_t state_offset = head_size * C * (t / (T / n_seqs)); + float * state_cur = state + state_offset; + float * state_prev = t % (T / n_seqs) ? state_cur : (float*)dst->src[4]->data + state_offset; + + for (int64_t h = h_start; h < h_end; h++) { + size_t h_offset = h * h_stride; + size_t t_h_offset = t_offset + h_offset; + size_t h_2d_offset = h * h_stride_2d; + + for (int64_t i = 0; i < head_size; i++) { + size_t t_h_i_offset = t_h_offset + i; + size_t h_2d_i_offset = h_2d_offset + i * h_stride; + + float k_val = k[t_h_i_offset]; + float q_val = q[t_h_i_offset] * scale; + float g_val = g[t_h_i_offset]; + + // Broadcast scalar values to vectors + GGML_F32X k_vec = GGML_F32X_SET1(k_val); + GGML_F32X q_vec = GGML_F32X_SET1(q_val); + GGML_F32X g_vec = GGML_F32X_SET1(g_val); + + for (int64_t j = 0; j < vec_count; j++) { + size_t base_j = j * GLA_VECTOR_SIZE; + size_t t_h_j_offset = t_h_offset + base_j; + size_t h_2d_i_j_offset = h_2d_i_offset + base_j; + + // Load x elements at once + GGML_F32X v_vec = GGML_F32X_LOAD(&v[t_h_j_offset]); + GGML_F32X prev_state_vec = GGML_F32X_LOAD(&state_prev[h_2d_i_j_offset]); + GGML_F32X dst_vec = GGML_F32X_LOAD(&dst_data[t_h_j_offset]); + + // Compute kv = v * k + GGML_F32X kv_vec = GGML_F32X_MUL(v_vec, k_vec); + + // Compute temp = prev_state * g + kv + GGML_F32X temp_vec = GGML_F32X_FMA(kv_vec, prev_state_vec, g_vec); + + // Update dst: dst += temp * q + dst_vec = GGML_F32X_FMA(dst_vec, temp_vec, q_vec); + GGML_F32X_STORE(&dst_data[t_h_j_offset], dst_vec); + + // Update state + GGML_F32X_STORE(&state_cur[h_2d_i_j_offset], temp_vec); + } + + // Handle remaining elements, this will not be used. 
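+                // (Note: head_size is 64 or 128 for the models currently using this op — see the
+                //  matching CUDA kernel's GGML_ASSERT(C / H == 64 || C / H == 128) — and both are
+                //  multiples of every GLA_VECTOR_SIZE defined above, so vec_count * GLA_VECTOR_SIZE
+                //  equals head_size and this scalar tail is not expected to execute.)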
+ for (int64_t j = vec_count * GLA_VECTOR_SIZE; j < head_size; j++) { + size_t t_h_j_offset = t_h_offset + j; + size_t h_2d_i_j_offset = h_2d_i_offset + j; + float v_val = v[t_h_j_offset]; + float kv_val = v_val * k_val; + float prev_state_val = state_prev[h_2d_i_j_offset]; + float temp_val = kv_val + prev_state_val * g_val; + dst_data[t_h_j_offset] += temp_val * q_val; + state_cur[h_2d_i_j_offset] = temp_val; + } + } + } + } + + #else + for (int64_t t = 0; t < T; t++) { + size_t t_offset = t * t_stride; + size_t state_offset = head_size * C * (t / (T / n_seqs)); + float * state_cur = state + state_offset; + float * state_prev = t % (T / n_seqs) ? state_cur : (float*)dst->src[4]->data + state_offset; + + for (int64_t h = h_start; h < h_end; h++) { + size_t h_offset = h * h_stride; + size_t t_h_offset = t_offset + h_offset; + size_t h_2d_offset = h * h_stride_2d; + + for (int64_t i = 0; i < head_size; i++) { + size_t t_h_i_offset = t_h_offset + i; + size_t h_2d_i_offset = h_2d_offset + i * h_stride; + + float k_val = k[t_h_i_offset]; + float q_val = q[t_h_i_offset] * scale; + float g_val = g[t_h_i_offset]; + + for (int64_t j = 0; j < head_size; j++) { + size_t t_h_j_offset = t_h_offset + j; + size_t h_2d_i_j_offset = h_2d_i_offset + j; + + float v_val = v[t_h_j_offset]; + float kv_val = v_val * k_val; + float prev_state_val = state_prev[h_2d_i_j_offset]; + float temp_val = prev_state_val * g_val + kv_val; + dst_data[t_h_j_offset] += temp_val * q_val; + state_cur[h_2d_i_j_offset] = temp_val; + } + } + } + } + #endif +} + + +static void ggml_compute_forward_gla( + const struct ggml_compute_params * params, + struct ggml_tensor * dst) { + + const struct ggml_tensor * src0 = dst->src[0]; + + switch (src0->type) { + case GGML_TYPE_F32: + { + ggml_compute_forward_gla_f32(params, dst); + } break; + default: + { + GGML_ABORT("fatal error"); + } + } +} + // ggml_compute_forward_map_unary static void ggml_compute_forward_map_unary_f32( @@ -12749,6 +12940,10 @@ static void ggml_compute_forward(struct ggml_compute_params * params, struct ggm { ggml_compute_forward_rwkv_wkv6(params, tensor); } break; + case GGML_OP_GATED_LINEAR_ATTN: + { + ggml_compute_forward_gla(params, tensor); + } break; case GGML_OP_MAP_UNARY: { ggml_unary_op_f32_t fun; @@ -13047,6 +13242,7 @@ static int ggml_get_n_tasks(struct ggml_tensor * node, int n_threads) { case GGML_OP_WIN_UNPART: case GGML_OP_GET_REL_POS: case GGML_OP_RWKV_WKV6: + case GGML_OP_GATED_LINEAR_ATTN: case GGML_OP_MAP_UNARY: case GGML_OP_MAP_BINARY: case GGML_OP_MAP_CUSTOM1_F32: diff --git a/ggml/src/ggml-cuda/ggml-cuda.cu b/ggml/src/ggml-cuda/ggml-cuda.cu index 0b06be729864e..8476ee1bca50c 100644 --- a/ggml/src/ggml-cuda/ggml-cuda.cu +++ b/ggml/src/ggml-cuda/ggml-cuda.cu @@ -37,6 +37,7 @@ #include "ggml-cuda/unary.cuh" #include "ggml-cuda/upscale.cuh" #include "ggml-cuda/wkv6.cuh" +#include "ggml-cuda/gla.cuh" #include #include @@ -2167,6 +2168,9 @@ static bool ggml_cuda_compute_forward(ggml_backend_cuda_context & ctx, struct gg case GGML_OP_RWKV_WKV6: ggml_cuda_op_rwkv_wkv6(ctx, dst); break; + case GGML_OP_GATED_LINEAR_ATTN: + ggml_cuda_op_gated_linear_attn(ctx, dst); + break; case GGML_OP_CROSS_ENTROPY_LOSS_BACK: ggml_cuda_cross_entropy_loss_back(ctx, dst); break; @@ -3011,6 +3015,7 @@ static bool ggml_backend_cuda_device_supports_op(ggml_backend_dev_t dev, const g case GGML_OP_TIMESTEP_EMBEDDING: case GGML_OP_LEAKY_RELU: case GGML_OP_RWKV_WKV6: + case GGML_OP_GATED_LINEAR_ATTN: return true; case GGML_OP_FLASH_ATTN_EXT: { #ifndef FLASH_ATTN_AVAILABLE diff 
--git a/ggml/src/ggml-cuda/gla.cu b/ggml/src/ggml-cuda/gla.cu
new file mode 100644
index 0000000000000..f7d615a8282fc
--- /dev/null
+++ b/ggml/src/ggml-cuda/gla.cu
@@ -0,0 +1,93 @@
+#include "common.cuh"
+#include "gla.cuh"
+
+template <int HEAD_SIZE>
+static __global__ void gated_linear_attn_f32(const int B, const int T, const int C, const int H, const float scale,
+    const float * k, const float * v, const float * r, const float * td, const float * s, float * dst) {
+    const int tid = threadIdx.x;
+    const int bid = blockIdx.x;
+
+    const int head_size = HEAD_SIZE;
+    const int batch_i = bid / H;
+    const int head_i = bid % H;
+    const int state_size = C * head_size;
+    const int n_seq_tokens = T / B;
+
+    float state[head_size];
+    __shared__ float _k[head_size], _r[head_size], _td[head_size];
+
+    #pragma unroll
+    for (int i = 0; i < head_size; i++) {
+        state[i] = s[batch_i * state_size + head_i * head_size * head_size + i * head_size + tid];
+    }
+
+    for (int t = batch_i * n_seq_tokens * C + head_i * head_size + tid; t < (batch_i + 1) * n_seq_tokens * C + head_i * head_size + tid; t += C) {
+        __syncthreads();
+        _k[tid] = k[t];
+        _r[tid] = r[t];
+        _td[tid] = td[t];
+        __syncthreads();
+
+        const float _v = v[t];
+        float y = 0;
+        for (int j = 0; j < head_size; j += 4) {
+            const float4 & k = (float4 &)(_k[j]);
+            const float4 & r = (float4 &)(_r[j]);
+            const float4 & td = (float4 &)(_td[j]);
+            float4 & s = (float4 &)(state[j]);
+            float4 kv;
+
+            kv.x = k.x * _v;
+            kv.y = k.y * _v;
+            kv.z = k.z * _v;
+            kv.w = k.w * _v;
+
+            s.x = s.x * td.x + kv.x;
+            s.y = s.y * td.y + kv.y;
+            s.z = s.z * td.z + kv.z;
+            s.w = s.w * td.w + kv.w;
+
+            y += r.x * s.x;
+            y += r.y * s.y;
+            y += r.z * s.z;
+            y += r.w * s.w;
+        }
+        dst[t] = y * scale;
+    }
+
+    #pragma unroll
+    for (int i = 0; i < head_size; i++) {
+        dst[T * C + batch_i * state_size + head_i * head_size * head_size + i * head_size + tid] = state[i];
+    }
+}
+
+void ggml_cuda_op_gated_linear_attn(ggml_backend_cuda_context & ctx, ggml_tensor * dst) {
+    const float * k_d = (const float *)dst->src[0]->data;
+    const float * v_d = (const float *)dst->src[1]->data;
+    const float * r_d = (const float *)dst->src[2]->data;
+    const float * td_d = (const float *)dst->src[3]->data;
+    const float * s_d = (const float *)dst->src[4]->data;
+
+    const int64_t B = dst->src[4]->ne[1];
+    const int64_t T = dst->src[0]->ne[2];
+    const int64_t C = dst->ne[0];
+    const int64_t H = dst->src[0]->ne[1];
+
+    float scale;
+    memcpy(&scale, (float*)dst->op_params, sizeof(float));
+
+    float * dst_d = (float *)dst->data;
+
+    cudaStream_t stream = ctx.stream();
+
+    GGML_ASSERT(dst->src[4]->type == GGML_TYPE_F32);
+    GGML_ASSERT(C % H == 0);
+    GGML_ASSERT(C / H == 64 || C / H == 128);
+
+
+    if (C / H == 64) {
+        gated_linear_attn_f32<64><<<B * H, C / H, 0, stream>>>(B, T, C, H, scale, k_d, v_d, r_d, td_d, s_d, dst_d);
+    } else {
+        gated_linear_attn_f32<128><<<B * H, C / H, 0, stream>>>(B, T, C, H, scale, k_d, v_d, r_d, td_d, s_d, dst_d);
+    }
+}
diff --git a/ggml/src/ggml-cuda/gla.cuh b/ggml/src/ggml-cuda/gla.cuh
new file mode 100644
index 0000000000000..2c82ad7dd7229
--- /dev/null
+++ b/ggml/src/ggml-cuda/gla.cuh
@@ -0,0 +1,3 @@
+#include "common.cuh"
+
+void ggml_cuda_op_gated_linear_attn(ggml_backend_cuda_context & ctx, ggml_tensor * dst);
diff --git a/ggml/src/ggml-cuda/wkv6.cu b/ggml/src/ggml-cuda/wkv6.cu
index 42578341a386b..bbdafbee5818b 100644
--- a/ggml/src/ggml-cuda/wkv6.cu
+++ b/ggml/src/ggml-cuda/wkv6.cu
@@ -73,9 +73,9 @@ void ggml_cuda_op_rwkv_wkv6(ggml_backend_cuda_context & ctx, ggml_tensor * dst)
     const float * s_d = (const float *)dst->src[5]->data;

    const 
int64_t B = dst->src[5]->ne[1]; - const int64_t T = dst->src[0]->ne[3]; + const int64_t T = dst->src[0]->ne[2]; const int64_t C = dst->ne[0]; - const int64_t H = dst->src[0]->ne[2]; + const int64_t H = dst->src[0]->ne[1]; float * dst_d = (float *)dst->data; diff --git a/ggml/src/ggml-sycl/common.cpp b/ggml/src/ggml-sycl/common.cpp index 88314a5cd73af..022e7b7637bd3 100644 --- a/ggml/src/ggml-sycl/common.cpp +++ b/ggml/src/ggml-sycl/common.cpp @@ -51,6 +51,10 @@ void ggml_sycl_host_free(void* ptr) try { std::exit(1); } +bool gpu_has_xmx(sycl::device &dev) { + return dev.has(sycl::aspect::ext_intel_matrix); +} + int64_t downsample_sycl_global_range(int64_t accumulate_block_num, int64_t block_size) { const int64_t max_range = std::numeric_limits::max(); int64_t sycl_down_blk_size = block_size; diff --git a/ggml/src/ggml-sycl/common.hpp b/ggml/src/ggml-sycl/common.hpp index 62b4cea3ada85..e9500f3a1682b 100644 --- a/ggml/src/ggml-sycl/common.hpp +++ b/ggml/src/ggml-sycl/common.hpp @@ -662,6 +662,7 @@ inline void ggml_sycl_op_bin_bcast(ggml_backend_sycl_context & ctx, const ggml_t } } +bool gpu_has_xmx(sycl::device &dev); void ggml_sycl_op_flatten(ggml_backend_sycl_context & ctx, const ggml_tensor *src0, const ggml_tensor *src1, ggml_tensor *dst, diff --git a/ggml/src/ggml-sycl/concat.cpp b/ggml/src/ggml-sycl/concat.cpp index a240968ad2e48..d41cfd3a6ec88 100644 --- a/ggml/src/ggml-sycl/concat.cpp +++ b/ggml/src/ggml-sycl/concat.cpp @@ -158,8 +158,9 @@ static void concat_f32_sycl_non_cont( }); } -void ggml_sycl_op_concat(ggml_backend_sycl_context & ctx, const ggml_tensor *src0, - const ggml_tensor *src1, ggml_tensor *dst) { +void ggml_sycl_op_concat(ggml_backend_sycl_context & ctx, ggml_tensor *dst) { + const ggml_tensor *src0 = dst->src[0]; + const ggml_tensor *src1 = dst->src[1]; queue_ptr stream = ctx.stream(); const int32_t dim = ((int32_t *)dst->op_params)[0]; diff --git a/ggml/src/ggml-sycl/concat.hpp b/ggml/src/ggml-sycl/concat.hpp index 5a04feaab6b0a..e5cb7314c9f33 100644 --- a/ggml/src/ggml-sycl/concat.hpp +++ b/ggml/src/ggml-sycl/concat.hpp @@ -15,7 +15,6 @@ #include "common.hpp" -void ggml_sycl_op_concat(ggml_backend_sycl_context & ctx, const ggml_tensor *src0, - const ggml_tensor *src1, ggml_tensor *dst); +void ggml_sycl_op_concat(ggml_backend_sycl_context & ctx, ggml_tensor *dst); #endif // GGML_SYCL_CONCAT_HPP diff --git a/ggml/src/ggml-sycl/conv.cpp b/ggml/src/ggml-sycl/conv.cpp index bc4ab1ddbadf0..ddba601e10fcc 100644 --- a/ggml/src/ggml-sycl/conv.cpp +++ b/ggml/src/ggml-sycl/conv.cpp @@ -71,8 +71,9 @@ static void conv_transpose_1d_f32_f32_sycl( }); } -void ggml_sycl_op_conv_transpose_1d(ggml_backend_sycl_context & ctx, const ggml_tensor *src0, - const ggml_tensor *src1, ggml_tensor *dst) { +void ggml_sycl_op_conv_transpose_1d(ggml_backend_sycl_context & ctx, ggml_tensor *dst) { + const ggml_tensor *src0 = dst->src[0]; + const ggml_tensor *src1 = dst->src[1]; const float * src0_d = (const float *)src0->data; const float * src1_d = (const float *)src1->data; diff --git a/ggml/src/ggml-sycl/conv.hpp b/ggml/src/ggml-sycl/conv.hpp index eb20730f904a6..f9e60dc758029 100644 --- a/ggml/src/ggml-sycl/conv.hpp +++ b/ggml/src/ggml-sycl/conv.hpp @@ -15,7 +15,6 @@ #include "common.hpp" -void ggml_sycl_op_conv_transpose_1d(ggml_backend_sycl_context & ctx, const ggml_tensor *src0, - const ggml_tensor *src1, ggml_tensor *dst); +void ggml_sycl_op_conv_transpose_1d(ggml_backend_sycl_context & ctx, ggml_tensor *dst); #endif // GGML_SYCL_CONV_HPP diff --git a/ggml/src/ggml-sycl/element_wise.cpp 
b/ggml/src/ggml-sycl/element_wise.cpp index d05a51f807c20..4bcd74376eaac 100644 --- a/ggml/src/ggml-sycl/element_wise.cpp +++ b/ggml/src/ggml-sycl/element_wise.cpp @@ -882,149 +882,149 @@ inline void ggml_sycl_op_div(ggml_backend_sycl_context & ctx, const ggml_tensor } -void ggml_sycl_sqrt(ggml_backend_sycl_context & ctx, const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) { +void ggml_sycl_sqrt(ggml_backend_sycl_context & ctx, ggml_tensor * dst) { GGML_SYCL_DEBUG("call %s\n", __func__); - ggml_sycl_op_flatten(ctx, src0, src1, dst, ggml_sycl_op_sqrt); + ggml_sycl_op_flatten(ctx, dst->src[0], dst->src[1], dst, ggml_sycl_op_sqrt); GGML_SYCL_DEBUG("call %s done\n", __func__); } -void ggml_sycl_sin(ggml_backend_sycl_context & ctx, const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) { +void ggml_sycl_sin(ggml_backend_sycl_context & ctx, ggml_tensor * dst) { GGML_SYCL_DEBUG("call %s\n", __func__); - ggml_sycl_op_flatten(ctx, src0, src1, dst, ggml_sycl_op_sin); + ggml_sycl_op_flatten(ctx, dst->src[0], dst->src[1], dst, ggml_sycl_op_sin); GGML_SYCL_DEBUG("call %s done\n", __func__); } -void ggml_sycl_cos(ggml_backend_sycl_context & ctx, const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) { +void ggml_sycl_cos(ggml_backend_sycl_context & ctx, ggml_tensor * dst) { GGML_SYCL_DEBUG("call %s\n", __func__); - ggml_sycl_op_flatten(ctx, src0, src1, dst, ggml_sycl_op_cos); + ggml_sycl_op_flatten(ctx, dst->src[0], dst->src[1], dst, ggml_sycl_op_cos); GGML_SYCL_DEBUG("call %s done\n", __func__); } -void ggml_sycl_acc(ggml_backend_sycl_context & ctx, const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) { +void ggml_sycl_acc(ggml_backend_sycl_context & ctx, ggml_tensor * dst) { GGML_SYCL_DEBUG("call %s\n", __func__); - ggml_sycl_op_flatten(ctx, src0, src1, dst, ggml_sycl_op_acc); + ggml_sycl_op_flatten(ctx, dst->src[0], dst->src[1], dst, ggml_sycl_op_acc); GGML_SYCL_DEBUG("call %s done\n", __func__); } -void ggml_sycl_gelu(ggml_backend_sycl_context & ctx, const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) { +void ggml_sycl_gelu(ggml_backend_sycl_context & ctx, ggml_tensor * dst) { GGML_SYCL_DEBUG("call %s\n", __func__); - ggml_sycl_op_flatten(ctx, src0, src1, dst, ggml_sycl_op_gelu); + ggml_sycl_op_flatten(ctx, dst->src[0], dst->src[1], dst, ggml_sycl_op_gelu); GGML_SYCL_DEBUG("call %s done\n", __func__); } -void ggml_sycl_silu(ggml_backend_sycl_context & ctx, const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) { +void ggml_sycl_silu(ggml_backend_sycl_context & ctx, ggml_tensor * dst) { GGML_SYCL_DEBUG("call %s\n", __func__); - ggml_sycl_op_flatten(ctx, src0, src1, dst, ggml_sycl_op_silu); + ggml_sycl_op_flatten(ctx, dst->src[0], dst->src[1], dst, ggml_sycl_op_silu); GGML_SYCL_DEBUG("call %s done\n", __func__); } -void ggml_sycl_gelu_quick(ggml_backend_sycl_context & ctx, const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) { +void ggml_sycl_gelu_quick(ggml_backend_sycl_context & ctx, ggml_tensor * dst) { GGML_SYCL_DEBUG("call %s\n", __func__); - ggml_sycl_op_flatten(ctx, src0, src1, dst, ggml_sycl_op_gelu_quick); + ggml_sycl_op_flatten(ctx, dst->src[0], dst->src[1], dst, ggml_sycl_op_gelu_quick); GGML_SYCL_DEBUG("call %s done\n", __func__); } -void ggml_sycl_tanh(ggml_backend_sycl_context & ctx, const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) { +void ggml_sycl_tanh(ggml_backend_sycl_context & ctx, ggml_tensor * dst) { GGML_SYCL_DEBUG("call %s\n", __func__); 
- ggml_sycl_op_flatten(ctx, src0, src1, dst, ggml_sycl_op_tanh); + ggml_sycl_op_flatten(ctx, dst->src[0], dst->src[1], dst, ggml_sycl_op_tanh); GGML_SYCL_DEBUG("call %s done\n", __func__); } -void ggml_sycl_relu(ggml_backend_sycl_context & ctx, const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) { +void ggml_sycl_relu(ggml_backend_sycl_context & ctx, ggml_tensor * dst) { GGML_SYCL_DEBUG("call %s\n", __func__); - ggml_sycl_op_flatten(ctx, src0, src1, dst, ggml_sycl_op_relu); + ggml_sycl_op_flatten(ctx, dst->src[0], dst->src[1], dst, ggml_sycl_op_relu); GGML_SYCL_DEBUG("call %s done\n", __func__); } -void ggml_sycl_sigmoid(ggml_backend_sycl_context & ctx, const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) { +void ggml_sycl_sigmoid(ggml_backend_sycl_context & ctx, ggml_tensor * dst) { GGML_SYCL_DEBUG("call %s\n", __func__); - ggml_sycl_op_flatten(ctx, src0, src1, dst, ggml_sycl_op_sigmoid); + ggml_sycl_op_flatten(ctx, dst->src[0], dst->src[1], dst, ggml_sycl_op_sigmoid); GGML_SYCL_DEBUG("call %s done\n", __func__); } -void ggml_sycl_hardsigmoid(ggml_backend_sycl_context & ctx, const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) { +void ggml_sycl_hardsigmoid(ggml_backend_sycl_context & ctx, ggml_tensor * dst) { GGML_SYCL_DEBUG("call %s\n", __func__); - ggml_sycl_op_flatten(ctx, src0, src1, dst, ggml_sycl_op_hardsigmoid); + ggml_sycl_op_flatten(ctx, dst->src[0], dst->src[1], dst, ggml_sycl_op_hardsigmoid); GGML_SYCL_DEBUG("call %s done\n", __func__); } -void ggml_sycl_hardswish(ggml_backend_sycl_context & ctx, const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) { +void ggml_sycl_hardswish(ggml_backend_sycl_context & ctx, ggml_tensor * dst) { GGML_SYCL_DEBUG("call %s\n", __func__); - ggml_sycl_op_flatten(ctx, src0, src1, dst, ggml_sycl_op_hardswish); + ggml_sycl_op_flatten(ctx, dst->src[0], dst->src[1], dst, ggml_sycl_op_hardswish); GGML_SYCL_DEBUG("call %s done\n", __func__); } -void ggml_sycl_exp(ggml_backend_sycl_context & ctx, const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) { +void ggml_sycl_exp(ggml_backend_sycl_context & ctx, ggml_tensor * dst) { GGML_SYCL_DEBUG("call %s\n", __func__); - ggml_sycl_op_flatten(ctx, src0, src1, dst, ggml_sycl_op_exp); + ggml_sycl_op_flatten(ctx, dst->src[0], dst->src[1], dst, ggml_sycl_op_exp); GGML_SYCL_DEBUG("call %s done\n", __func__); } -void ggml_sycl_log(ggml_backend_sycl_context & ctx, const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) { +void ggml_sycl_log(ggml_backend_sycl_context & ctx, ggml_tensor * dst) { GGML_SYCL_DEBUG("call %s\n", __func__); - ggml_sycl_op_flatten(ctx, src0, src1, dst, ggml_sycl_op_log); + ggml_sycl_op_flatten(ctx, dst->src[0], dst->src[1], dst, ggml_sycl_op_log); GGML_SYCL_DEBUG("call %s done\n", __func__); } -void ggml_sycl_neg(ggml_backend_sycl_context & ctx, const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) { +void ggml_sycl_neg(ggml_backend_sycl_context & ctx, ggml_tensor * dst) { GGML_SYCL_DEBUG("call %s\n", __func__); - ggml_sycl_op_flatten(ctx, src0, src1, dst, ggml_sycl_op_neg); + ggml_sycl_op_flatten(ctx, dst->src[0], dst->src[1], dst, ggml_sycl_op_neg); GGML_SYCL_DEBUG("call %s done\n", __func__); } -void ggml_sycl_step(ggml_backend_sycl_context & ctx, const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) { +void ggml_sycl_step(ggml_backend_sycl_context & ctx, ggml_tensor * dst) { GGML_SYCL_DEBUG("call %s\n", __func__); - ggml_sycl_op_flatten(ctx, src0, src1, 
dst, ggml_sycl_op_step); + ggml_sycl_op_flatten(ctx, dst->src[0], dst->src[1], dst, ggml_sycl_op_step); GGML_SYCL_DEBUG("call %s done\n", __func__); } -void ggml_sycl_leaky_relu(ggml_backend_sycl_context & ctx, const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) { +void ggml_sycl_leaky_relu(ggml_backend_sycl_context & ctx, ggml_tensor * dst) { GGML_SYCL_DEBUG("call %s\n", __func__); - ggml_sycl_op_flatten(ctx, src0, src1, dst, ggml_sycl_op_leaky_relu); + ggml_sycl_op_flatten(ctx, dst->src[0], dst->src[1], dst, ggml_sycl_op_leaky_relu); GGML_SYCL_DEBUG("call %s done\n", __func__); } -void ggml_sycl_sqr(ggml_backend_sycl_context & ctx, const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) { +void ggml_sycl_sqr(ggml_backend_sycl_context & ctx, ggml_tensor * dst) { GGML_SYCL_DEBUG("call %s\n", __func__); - ggml_sycl_op_flatten(ctx, src0, src1, dst, ggml_sycl_op_sqr); + ggml_sycl_op_flatten(ctx, dst->src[0], dst->src[1], dst, ggml_sycl_op_sqr); GGML_SYCL_DEBUG("call %s done\n", __func__); } -void ggml_sycl_upscale(ggml_backend_sycl_context & ctx, const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) { +void ggml_sycl_upscale(ggml_backend_sycl_context & ctx, ggml_tensor * dst) { GGML_SYCL_DEBUG("call %s\n", __func__); - ggml_sycl_op_flatten(ctx, src0, src1, dst, ggml_sycl_op_upscale); + ggml_sycl_op_flatten(ctx, dst->src[0], dst->src[1], dst, ggml_sycl_op_upscale); GGML_SYCL_DEBUG("call %s done\n", __func__); } -void ggml_sycl_pad(ggml_backend_sycl_context & ctx, const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) { +void ggml_sycl_pad(ggml_backend_sycl_context & ctx, ggml_tensor * dst) { GGML_SYCL_DEBUG("call %s\n", __func__); - ggml_sycl_op_flatten(ctx, src0, src1, dst, ggml_sycl_op_pad); + ggml_sycl_op_flatten(ctx, dst->src[0], dst->src[1], dst, ggml_sycl_op_pad); GGML_SYCL_DEBUG("call %s done\n", __func__); } -void ggml_sycl_add(ggml_backend_sycl_context & ctx, const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) { +void ggml_sycl_add(ggml_backend_sycl_context & ctx, ggml_tensor * dst) { GGML_SYCL_DEBUG("call %s\n", __func__); - ggml_sycl_op_flatten(ctx, src0, src1, dst, ggml_sycl_op_add); + ggml_sycl_op_flatten(ctx, dst->src[0], dst->src[1], dst, ggml_sycl_op_add); GGML_SYCL_DEBUG("call %s done\n", __func__); } -void ggml_sycl_sub(ggml_backend_sycl_context & ctx, const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) { +void ggml_sycl_sub(ggml_backend_sycl_context & ctx, ggml_tensor * dst) { GGML_SYCL_DEBUG("call %s\n", __func__); - ggml_sycl_op_flatten(ctx, src0, src1, dst, ggml_sycl_op_sub); + ggml_sycl_op_flatten(ctx, dst->src[0], dst->src[1], dst, ggml_sycl_op_sub); GGML_SYCL_DEBUG("call %s done\n", __func__); } -void ggml_sycl_mul(ggml_backend_sycl_context & ctx, const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) { +void ggml_sycl_mul(ggml_backend_sycl_context & ctx, ggml_tensor * dst) { GGML_SYCL_DEBUG("call %s\n", __func__); - ggml_sycl_op_flatten(ctx, src0, src1, dst, ggml_sycl_op_mul); + ggml_sycl_op_flatten(ctx, dst->src[0], dst->src[1], dst, ggml_sycl_op_mul); GGML_SYCL_DEBUG("call %s done\n", __func__); } -void ggml_sycl_div(ggml_backend_sycl_context & ctx, const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) { +void ggml_sycl_div(ggml_backend_sycl_context & ctx, ggml_tensor * dst) { GGML_SYCL_DEBUG("call %s\n", __func__); - ggml_sycl_op_flatten(ctx, src0, src1, dst, ggml_sycl_op_div); + ggml_sycl_op_flatten(ctx, dst->src[0], 
dst->src[1], dst, ggml_sycl_op_div); GGML_SYCL_DEBUG("call %s done\n", __func__); } diff --git a/ggml/src/ggml-sycl/element_wise.hpp b/ggml/src/ggml-sycl/element_wise.hpp index 8152edf583863..46443264505cc 100644 --- a/ggml/src/ggml-sycl/element_wise.hpp +++ b/ggml/src/ggml-sycl/element_wise.hpp @@ -25,52 +25,52 @@ static __dpct_inline__ float op_div(const float a, const float b) { } -void ggml_sycl_sqrt(ggml_backend_sycl_context & ctx, const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst); +void ggml_sycl_sqrt(ggml_backend_sycl_context & ctx, ggml_tensor * dst); -void ggml_sycl_sin(ggml_backend_sycl_context & ctx, const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst); +void ggml_sycl_sin(ggml_backend_sycl_context & ctx, ggml_tensor * dst); -void ggml_sycl_cos(ggml_backend_sycl_context & ctx, const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst); +void ggml_sycl_cos(ggml_backend_sycl_context & ctx, ggml_tensor * dst); -void ggml_sycl_acc(ggml_backend_sycl_context & ctx, const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst); +void ggml_sycl_acc(ggml_backend_sycl_context & ctx, ggml_tensor * dst); -void ggml_sycl_gelu(ggml_backend_sycl_context & ctx, const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst); +void ggml_sycl_gelu(ggml_backend_sycl_context & ctx, ggml_tensor * dst); -void ggml_sycl_silu(ggml_backend_sycl_context & ctx, const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst); +void ggml_sycl_silu(ggml_backend_sycl_context & ctx, ggml_tensor * dst); -void ggml_sycl_gelu_quick(ggml_backend_sycl_context & ctx, const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst); +void ggml_sycl_gelu_quick(ggml_backend_sycl_context & ctx, ggml_tensor * dst); -void ggml_sycl_tanh(ggml_backend_sycl_context & ctx, const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst); +void ggml_sycl_tanh(ggml_backend_sycl_context & ctx, ggml_tensor * dst); -void ggml_sycl_relu(ggml_backend_sycl_context & ctx, const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst); +void ggml_sycl_relu(ggml_backend_sycl_context & ctx, ggml_tensor * dst); -void ggml_sycl_sigmoid(ggml_backend_sycl_context & ctx, const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst); +void ggml_sycl_sigmoid(ggml_backend_sycl_context & ctx, ggml_tensor * dst); -void ggml_sycl_hardsigmoid(ggml_backend_sycl_context & ctx, const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst); +void ggml_sycl_hardsigmoid(ggml_backend_sycl_context & ctx, ggml_tensor * dst); -void ggml_sycl_hardswish(ggml_backend_sycl_context & ctx, const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst); +void ggml_sycl_hardswish(ggml_backend_sycl_context & ctx, ggml_tensor * dst); -void ggml_sycl_exp(ggml_backend_sycl_context & ctx, const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst); +void ggml_sycl_exp(ggml_backend_sycl_context & ctx, ggml_tensor * dst); -void ggml_sycl_log(ggml_backend_sycl_context & ctx, const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst); +void ggml_sycl_log(ggml_backend_sycl_context & ctx, ggml_tensor * dst); -void ggml_sycl_neg(ggml_backend_sycl_context & ctx, const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst); +void ggml_sycl_neg(ggml_backend_sycl_context & ctx, ggml_tensor * dst); -void ggml_sycl_step(ggml_backend_sycl_context & ctx, const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst); +void 
ggml_sycl_step(ggml_backend_sycl_context & ctx, ggml_tensor * dst); -void ggml_sycl_leaky_relu(ggml_backend_sycl_context & ctx, const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst); +void ggml_sycl_leaky_relu(ggml_backend_sycl_context & ctx, ggml_tensor * dst); -void ggml_sycl_sqr(ggml_backend_sycl_context & ctx, const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst); +void ggml_sycl_sqr(ggml_backend_sycl_context & ctx, ggml_tensor * dst); -void ggml_sycl_upscale(ggml_backend_sycl_context & ctx, const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst); +void ggml_sycl_upscale(ggml_backend_sycl_context & ctx, ggml_tensor * dst); -void ggml_sycl_pad(ggml_backend_sycl_context & ctx, const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst); +void ggml_sycl_pad(ggml_backend_sycl_context & ctx, ggml_tensor * dst); -void ggml_sycl_add(ggml_backend_sycl_context & ctx, const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst); +void ggml_sycl_add(ggml_backend_sycl_context & ctx, ggml_tensor * dst); -void ggml_sycl_sub(ggml_backend_sycl_context & ctx, const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst); +void ggml_sycl_sub(ggml_backend_sycl_context & ctx, ggml_tensor * dst); -void ggml_sycl_mul(ggml_backend_sycl_context & ctx, const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst); +void ggml_sycl_mul(ggml_backend_sycl_context & ctx, ggml_tensor * dst); -void ggml_sycl_div(ggml_backend_sycl_context & ctx, const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst); +void ggml_sycl_div(ggml_backend_sycl_context & ctx, ggml_tensor * dst); #endif // GGML_SYCL_ELEMENTWISE_HPP diff --git a/ggml/src/ggml-sycl/ggml-sycl.cpp b/ggml/src/ggml-sycl/ggml-sycl.cpp index 312ccfeb85359..037c8093eef30 100644 --- a/ggml/src/ggml-sycl/ggml-sycl.cpp +++ b/ggml/src/ggml-sycl/ggml-sycl.cpp @@ -54,18 +54,12 @@ static ggml_sycl_device_info ggml_sycl_init() { GGML_ASSERT(info.device_count <= GGML_SYCL_MAX_DEVICES); int64_t total_vram = 0; -#if defined(GGML_SYCL_FORCE_MMQ) - GGML_LOG_INFO("%s: GGML_SYCL_FORCE_MMQ: yes\n", __func__); -#else - GGML_LOG_INFO("%s: GGML_SYCL_FORCE_MMQ: no\n", __func__); -#endif -#if defined(SYCL_USE_XMX) - GGML_LOG_INFO("%s: SYCL_USE_XMX: yes\n", __func__); -#else - GGML_LOG_INFO("%s: SYCL_USE_XMX: no\n", __func__); -#endif - GGML_LOG_INFO("%s: found %d %s devices:\n", __func__, info.device_count, GGML_SYCL_NAME); - +/* This is a bit misleading; reserved for later */ +// #if defined(SYCL_USE_XMX) +// GGML_LOG_INFO("%s: SYCL_USE_XMX: yes\n", __func__); +// #else +// GGML_LOG_INFO("%s: SYCL_USE_XMX: no\n", __func__); +// #endif for (int i = 0; i < info.device_count; ++i) { info.devices[i].vmm = 0; dpct::device_info prop; @@ -109,11 +103,11 @@ void print_device_detail(int id, sycl::device &device, std::string device_type) name = std::regex_replace(name, std::regex("\\(TM\\)"), ""); auto global_mem_size = prop.get_global_mem_size()/1000000; - - GGML_LOG_INFO("|%2d|%19s|%39s|%7s|%7d|%8d|%5d|%6luM|%21s|\n", id, device_type.c_str(), + std::string xmx = gpu_has_xmx(device) ? 
"yes" : "no"; + GGML_LOG_INFO("|%2d|%19s|%39s|%7s|%7d|%8d|%5d|%6luM|%21s|%14s|\n", id, device_type.c_str(), name.c_str(), version.c_str(), prop.get_max_compute_units(), prop.get_max_work_group_size(), prop.get_max_sub_group_size(), - global_mem_size, device.get_info().c_str()); + global_mem_size, device.get_info().c_str(), xmx.c_str()); } void ggml_backend_sycl_print_sycl_devices() { @@ -124,16 +118,16 @@ void ggml_backend_sycl_print_sycl_devices() { GGML_LOG_INFO( "| | | | " - " |Max | |Max |Global | |\n"); + " |Max | |Max |Global | | XMX |\n"); GGML_LOG_INFO( "| | | | " - " |compute|Max work|sub |mem | |\n"); + " |compute|Max work|sub |mem | | or |\n"); GGML_LOG_INFO( "|ID| Device Type| " - "Name|Version|units |group |group|size | Driver version|\n"); + "Name|Version|units |group |group|size | Driver version| Tensor Cores |\n"); GGML_LOG_INFO( "|--|-------------------|---------------------------------------|------" - "-|-------|--------|-----|-------|---------------------|\n"); + "-|-------|--------|-----|-------|---------------------|--------------|\n"); for (int id = 0; id < device_count; ++id) { sycl::device device = dpct::dev_mgr::instance().get_device(id); @@ -164,14 +158,18 @@ static void ggml_check_sycl() try { static bool initialized = false; if (!initialized) { - GGML_LOG_INFO("[SYCL] call ggml_check_sycl\n"); + GGML_SYCL_DEBUG("[SYCL] call ggml_check_sycl\n"); g_ggml_sycl_debug = get_sycl_env("GGML_SYCL_DEBUG", 0); - GGML_LOG_INFO("%s: GGML_SYCL_DEBUG: %d\n", __func__, g_ggml_sycl_debug); - + GGML_LOG_INFO("GGML_SYCL_DEBUG: %d\n", g_ggml_sycl_debug); +#if defined(GGML_SYCL_FORCE_MMQ) + GGML_LOG_INFO("GGML_SYCL_FORCE_MMQ: yes\n"); +#else + GGML_LOG_INFO("GGML_SYCL_FORCE_MMQ: no\n"); +#endif #if defined(GGML_SYCL_F16) - GGML_LOG_INFO("%s: GGML_SYCL_F16: yes\n", __func__); + GGML_LOG_INFO("GGML_SYCL_F16: yes\n"); #else - GGML_LOG_INFO("%s: GGML_SYCL_F16: no\n", __func__); + GGML_LOG_INFO("GGML_SYCL_F16: no\n"); #endif /* NOT REMOVE, keep it for next optimize for XMX. 
@@ -1189,7 +1187,6 @@ std::unique_ptr ggml_backend_sycl_context::new_pool_for_device(q /// kernels typedef void (*cpy_kernel_t)(const char * cx, char * cdst); -typedef void (*ggml_sycl_func_t)(ggml_backend_sycl_context & ctx, const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst); typedef void (*ggml_sycl_op_mul_mat_t)( ggml_backend_sycl_context & ctx, const ggml_tensor *src0, const ggml_tensor *src1, ggml_tensor *dst, @@ -3171,33 +3168,33 @@ catch (sycl::exception const &exc) { } -static void ggml_sycl_repeat(ggml_backend_sycl_context & ctx, const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) { +static void ggml_sycl_repeat(ggml_backend_sycl_context & ctx, ggml_tensor * dst) { GGML_SYCL_DEBUG("call %s\n", __func__); - ggml_sycl_op_flatten(ctx, src0, src1, dst, ggml_sycl_op_repeat); + ggml_sycl_op_flatten(ctx, dst->src[0], dst->src[1], dst, ggml_sycl_op_repeat); GGML_SYCL_DEBUG("call %s done\n", __func__); } -static void ggml_sycl_get_rows(ggml_backend_sycl_context & ctx, const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) { +static void ggml_sycl_get_rows(ggml_backend_sycl_context & ctx, ggml_tensor * dst) { GGML_SYCL_DEBUG("call %s\n", __func__); - ggml_sycl_op_flatten(ctx, src0, src1, dst, ggml_sycl_op_get_rows); + ggml_sycl_op_flatten(ctx, dst->src[0], dst->src[1], dst, ggml_sycl_op_get_rows); GGML_SYCL_DEBUG("call %s done\n", __func__); } -static void ggml_sycl_norm(ggml_backend_sycl_context & ctx, const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) { +static void ggml_sycl_norm(ggml_backend_sycl_context & ctx, ggml_tensor * dst) { GGML_SYCL_DEBUG("call %s\n", __func__); - ggml_sycl_op_flatten(ctx, src0, src1, dst, ggml_sycl_op_norm); + ggml_sycl_op_flatten(ctx, dst->src[0], dst->src[1], dst, ggml_sycl_op_norm); GGML_SYCL_DEBUG("call %s done\n", __func__); } -static void ggml_sycl_rms_norm(ggml_backend_sycl_context & ctx, const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) { +static void ggml_sycl_rms_norm(ggml_backend_sycl_context & ctx, ggml_tensor * dst) { GGML_SYCL_DEBUG("call %s\n", __func__); - ggml_sycl_op_flatten(ctx, src0, src1, dst, ggml_sycl_op_rms_norm); + ggml_sycl_op_flatten(ctx, dst->src[0], dst->src[1], dst, ggml_sycl_op_rms_norm); GGML_SYCL_DEBUG("call %s done\n", __func__); } -static void ggml_sycl_group_norm(ggml_backend_sycl_context & ctx, const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) { +static void ggml_sycl_group_norm(ggml_backend_sycl_context & ctx, ggml_tensor * dst) { GGML_SYCL_DEBUG("call %s\n", __func__); - ggml_sycl_op_flatten(ctx, src0, src1, dst, ggml_sycl_op_group_norm); + ggml_sycl_op_flatten(ctx, dst->src[0], dst->src[1], dst, ggml_sycl_op_group_norm); GGML_SYCL_DEBUG("call %s done\n", __func__); } @@ -3572,9 +3569,10 @@ __dpct_inline__ static void k_copy_dst_from_contiguous( } } -static void ggml_sycl_mul_mat_id(ggml_backend_sycl_context & ctx, const ggml_tensor *src0, - const ggml_tensor *src1, +static void ggml_sycl_mul_mat_id(ggml_backend_sycl_context & ctx, ggml_tensor *dst) try { + const ggml_tensor *src0 = dst->src[0]; + const ggml_tensor *src1 = dst->src[1]; GGML_ASSERT(!ggml_backend_buffer_is_sycl_split(src0->buffer) && "mul_mat_id does not support split buffers"); const ggml_tensor *ids = dst->src[2]; @@ -3740,12 +3738,12 @@ catch (sycl::exception const &exc) { std::exit(1); } -static void ggml_sycl_scale(ggml_backend_sycl_context & ctx, const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) { - 
ggml_sycl_op_flatten(ctx, src0, src1, dst, ggml_sycl_op_scale); +static void ggml_sycl_scale(ggml_backend_sycl_context & ctx, ggml_tensor * dst) { + ggml_sycl_op_flatten(ctx, dst->src[0], dst->src[1], dst, ggml_sycl_op_scale); } -static void ggml_sycl_clamp(ggml_backend_sycl_context & ctx, const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) { - ggml_sycl_op_flatten(ctx, src0, src1, dst, ggml_sycl_op_clamp); +static void ggml_sycl_clamp(ggml_backend_sycl_context & ctx, ggml_tensor * dst) { + ggml_sycl_op_flatten(ctx, dst->src[0], dst->src[1], dst, ggml_sycl_op_clamp); } static void ggml_sycl_cpy(ggml_backend_sycl_context & ctx, const ggml_tensor *src0, const ggml_tensor *src1, @@ -3787,7 +3785,6 @@ static void ggml_sycl_cpy(ggml_backend_sycl_context & ctx, const ggml_tensor *sr ggml_type_name(src0->type), ggml_type_name(src1->type)); GGML_ABORT("fatal error"); } - GGML_UNUSED(dst); } catch (sycl::exception const &exc) { @@ -3796,59 +3793,52 @@ catch (sycl::exception const &exc) { std::exit(1); } -static void ggml_sycl_dup(ggml_backend_sycl_context & ctx, const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) { +static void ggml_sycl_dup(ggml_backend_sycl_context & ctx, ggml_tensor * dst) { // TODO: why do we pass dst as src1 here? - ggml_sycl_cpy(ctx, src0, dst, nullptr); - GGML_UNUSED(src1); + ggml_sycl_cpy(ctx, dst->src[0], dst, nullptr); } -static void ggml_sycl_diag_mask_inf(ggml_backend_sycl_context & ctx, const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) { - ggml_sycl_op_flatten(ctx, src0, src1, dst, ggml_sycl_op_diag_mask_inf); +static void ggml_sycl_diag_mask_inf(ggml_backend_sycl_context & ctx, ggml_tensor * dst) { + ggml_sycl_op_flatten(ctx, dst->src[0], dst->src[1], dst, ggml_sycl_op_diag_mask_inf); } -static void ggml_sycl_soft_max(ggml_backend_sycl_context & ctx, const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) { - ggml_sycl_op_flatten(ctx, src0, src1, dst, ggml_sycl_op_soft_max); +static void ggml_sycl_soft_max(ggml_backend_sycl_context & ctx, ggml_tensor * dst) { + ggml_sycl_op_flatten(ctx, dst->src[0], dst->src[1], dst, ggml_sycl_op_soft_max); } -static void ggml_sycl_rope(ggml_backend_sycl_context & ctx, const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) { - GGML_ASSERT(ggml_is_contiguous(src0)); // TODO: this restriction is temporary until non-cont support is implemented - ggml_sycl_op_flatten(ctx, src0, src1, dst, ggml_sycl_op_rope); +static void ggml_sycl_rope(ggml_backend_sycl_context & ctx, ggml_tensor * dst) { + GGML_ASSERT(ggml_is_contiguous(dst->src[0])); // TODO: this restriction is temporary until non-cont support is implemented + ggml_sycl_op_flatten(ctx, dst->src[0], dst->src[1], dst, ggml_sycl_op_rope); } -static void ggml_sycl_pool2d(ggml_backend_sycl_context & ctx, const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) { - ggml_sycl_op_flatten(ctx, src0, src1, dst, ggml_sycl_op_pool2d); +static void ggml_sycl_pool2d(ggml_backend_sycl_context & ctx, ggml_tensor * dst) { + ggml_sycl_op_flatten(ctx, dst->src[0], dst->src[1], dst, ggml_sycl_op_pool2d); } -static void ggml_sycl_im2col(ggml_backend_sycl_context & ctx, const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) { - ggml_sycl_op_flatten(ctx, src0, src1, dst, ggml_sycl_op_im2col); +static void ggml_sycl_im2col(ggml_backend_sycl_context & ctx, ggml_tensor * dst) { + ggml_sycl_op_flatten(ctx, dst->src[0], dst->src[1], dst, ggml_sycl_op_im2col); } -static void 
ggml_sycl_sum(ggml_backend_sycl_context & ctx, const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) { - GGML_ASSERT(ggml_is_contiguous(src0)); - ggml_sycl_op_flatten(ctx, src0, src1, dst, ggml_sycl_op_sum); +static void ggml_sycl_sum(ggml_backend_sycl_context & ctx, ggml_tensor * dst) { + GGML_ASSERT(ggml_is_contiguous(dst->src[0])); + ggml_sycl_op_flatten(ctx, dst->src[0], dst->src[1], dst, ggml_sycl_op_sum); } -static void ggml_sycl_sum_rows(ggml_backend_sycl_context & ctx, const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) { - GGML_ASSERT(ggml_is_contiguous(src0)); - ggml_sycl_op_flatten(ctx, src0, src1, dst, ggml_sycl_op_sum_rows); +static void ggml_sycl_sum_rows(ggml_backend_sycl_context & ctx, ggml_tensor * dst) { + GGML_ASSERT(ggml_is_contiguous(dst->src[0])); + ggml_sycl_op_flatten(ctx, dst->src[0], dst->src[1], dst, ggml_sycl_op_sum_rows); } -static void ggml_sycl_argsort(ggml_backend_sycl_context & ctx, const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) { - GGML_ASSERT(ggml_is_contiguous(src0)); - ggml_sycl_op_flatten(ctx, src0, src1, dst, ggml_sycl_op_argsort); +static void ggml_sycl_argsort(ggml_backend_sycl_context & ctx, ggml_tensor * dst) { + GGML_ASSERT(ggml_is_contiguous(dst->src[0])); + ggml_sycl_op_flatten(ctx, dst->src[0], dst->src[1], dst, ggml_sycl_op_argsort); } -static void ggml_sycl_argmax(ggml_backend_sycl_context & ctx, const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) { - GGML_ASSERT(ggml_is_contiguous(src0)); - ggml_sycl_op_flatten(ctx, src0, src1, dst, ggml_sycl_op_argmax); +static void ggml_sycl_argmax(ggml_backend_sycl_context & ctx, ggml_tensor * dst) { + GGML_ASSERT(ggml_is_contiguous(dst->src[0])); + ggml_sycl_op_flatten(ctx, dst->src[0], dst->src[1], dst, ggml_sycl_op_argmax); } -static void ggml_sycl_nop(ggml_backend_sycl_context & ctx, const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) { - GGML_UNUSED(src0); - GGML_UNUSED(src1); - GGML_UNUSED(dst); - GGML_UNUSED(ctx); -} void ggml_sycl_set_main_device(const int main_device) try { if (dpct::get_current_device_id() == static_cast (main_device)) { @@ -3871,191 +3861,189 @@ catch (sycl::exception const &exc) { std::exit(1); } -bool ggml_sycl_compute_forward(ggml_backend_sycl_context & ctx, struct ggml_tensor * tensor) { +bool ggml_sycl_compute_forward(ggml_backend_sycl_context & ctx, struct ggml_tensor * dst) { if (!g_sycl_loaded) return false; - ggml_sycl_func_t func; + if (dst->src[0] != nullptr && ggml_backend_buffer_is_sycl_split(dst->src[0]->buffer)) { + ggml_sycl_set_peer_access(dst->src[1]->ne[1], ctx.device); + } - switch (tensor->op) { + switch (dst->op) { case GGML_OP_ARGMAX: - func = ggml_sycl_argmax; + ggml_sycl_argmax(ctx, dst); break; case GGML_OP_CONV_TRANSPOSE_1D: - func = ggml_sycl_op_conv_transpose_1d; + ggml_sycl_op_conv_transpose_1d(ctx, dst); break; case GGML_OP_REPEAT: - func = ggml_sycl_repeat; + ggml_sycl_repeat(ctx, dst); break; case GGML_OP_GET_ROWS: - func = ggml_sycl_get_rows; + ggml_sycl_get_rows(ctx, dst); break; case GGML_OP_DUP: - func = ggml_sycl_dup; + ggml_sycl_dup(ctx, dst); break; case GGML_OP_ADD: case GGML_OP_ADD1: // TODO: more efficient implementation - func = ggml_sycl_add; + ggml_sycl_add(ctx, dst); break; case GGML_OP_SUB: - func = ggml_sycl_sub; + ggml_sycl_sub(ctx, dst); break; case GGML_OP_ACC: - func = ggml_sycl_acc; + ggml_sycl_acc(ctx, dst); break; case GGML_OP_MUL: - func = ggml_sycl_mul; + ggml_sycl_mul(ctx, dst); break; case GGML_OP_LOG: - func = 
ggml_sycl_log; + ggml_sycl_log(ctx, dst); break; case GGML_OP_DIV: - func = ggml_sycl_div; + ggml_sycl_div(ctx, dst); break; case GGML_OP_UNARY: - switch (ggml_get_unary_op(tensor)) { + switch (ggml_get_unary_op(dst)) { case GGML_UNARY_OP_NEG: - func = ggml_sycl_neg; + ggml_sycl_neg(ctx, dst); break; case GGML_UNARY_OP_STEP: - func = ggml_sycl_step; + ggml_sycl_step(ctx, dst); break; case GGML_UNARY_OP_GELU: - func = ggml_sycl_gelu; + ggml_sycl_gelu(ctx, dst); break; case GGML_UNARY_OP_SILU: - func = ggml_sycl_silu; + ggml_sycl_silu(ctx, dst); break; case GGML_UNARY_OP_GELU_QUICK: - func = ggml_sycl_gelu_quick; + ggml_sycl_gelu_quick(ctx, dst); break; case GGML_UNARY_OP_TANH: - func = ggml_sycl_tanh; + ggml_sycl_tanh(ctx, dst); break; case GGML_UNARY_OP_RELU: - func = ggml_sycl_relu; + ggml_sycl_relu(ctx, dst); break; case GGML_UNARY_OP_SIGMOID: - func = ggml_sycl_sigmoid; + ggml_sycl_sigmoid(ctx, dst); break; case GGML_UNARY_OP_HARDSIGMOID: - func = ggml_sycl_hardsigmoid; + ggml_sycl_hardsigmoid(ctx, dst); break; case GGML_UNARY_OP_HARDSWISH: - func = ggml_sycl_hardswish; + ggml_sycl_hardswish(ctx, dst); break; case GGML_UNARY_OP_EXP: - func = ggml_sycl_exp; + ggml_sycl_exp(ctx, dst); break; default: return false; } break; case GGML_OP_NORM: - func = ggml_sycl_norm; + ggml_sycl_norm(ctx, dst); break; case GGML_OP_GROUP_NORM: - func = ggml_sycl_group_norm; + ggml_sycl_group_norm(ctx, dst); break; case GGML_OP_CONCAT: - func = ggml_sycl_op_concat; + ggml_sycl_op_concat(ctx, dst); break; case GGML_OP_UPSCALE: - func = ggml_sycl_upscale; + ggml_sycl_upscale(ctx, dst); break; case GGML_OP_PAD: - func = ggml_sycl_pad; + ggml_sycl_pad(ctx, dst); break; case GGML_OP_LEAKY_RELU: - func = ggml_sycl_leaky_relu; + ggml_sycl_leaky_relu(ctx, dst); break; case GGML_OP_RMS_NORM: - func = ggml_sycl_rms_norm; + ggml_sycl_rms_norm(ctx, dst); break; case GGML_OP_MUL_MAT: - if (tensor->src[0]->ne[3] != tensor->src[1]->ne[3]) { + if (dst->src[0]->ne[3] != dst->src[1]->ne[3]) { return false; } - func = ggml_sycl_mul_mat; + /* ggml_sycl_mul_mat_id is dependent on ggml_sycl_mul_mat */ + ggml_sycl_mul_mat(ctx, dst->src[0], dst->src[1], dst); break; case GGML_OP_MUL_MAT_ID: - if (tensor->src[0]->ne[3] != tensor->src[1]->ne[3]) { + if (dst->src[0]->ne[3] != dst->src[1]->ne[3]) { return false; } - func = ggml_sycl_mul_mat_id; + ggml_sycl_mul_mat_id(ctx, dst); break; case GGML_OP_OUT_PROD: - func = ggml_sycl_op_out_prod; + ggml_sycl_op_out_prod(ctx, dst); break; case GGML_OP_SCALE: - func = ggml_sycl_scale; + ggml_sycl_scale(ctx, dst); break; case GGML_OP_SQR: - func = ggml_sycl_sqr; + ggml_sycl_sqr(ctx, dst); break; case GGML_OP_SQRT: - func = ggml_sycl_sqrt; + ggml_sycl_sqrt(ctx, dst); break; case GGML_OP_SIN: - func = ggml_sycl_sin; + ggml_sycl_sin(ctx, dst); break; case GGML_OP_COS: - func = ggml_sycl_cos; + ggml_sycl_cos(ctx, dst); break; case GGML_OP_CLAMP: - func = ggml_sycl_clamp; + ggml_sycl_clamp(ctx, dst); break; case GGML_OP_CPY: - func = ggml_sycl_cpy; + ggml_sycl_cpy(ctx, dst->src[0], dst->src[1], dst); break; case GGML_OP_CONT: - func = ggml_sycl_dup; + ggml_sycl_dup(ctx, dst); break; case GGML_OP_NONE: case GGML_OP_RESHAPE: case GGML_OP_VIEW: case GGML_OP_PERMUTE: case GGML_OP_TRANSPOSE: - func = ggml_sycl_nop; + GGML_SYCL_DEBUG("%s: Tensor NO-OP\n", __func__); break; case GGML_OP_DIAG_MASK_INF: - func = ggml_sycl_diag_mask_inf; + ggml_sycl_diag_mask_inf(ctx, dst); break; case GGML_OP_SOFT_MAX: - func = ggml_sycl_soft_max; + ggml_sycl_soft_max(ctx, dst); break; case GGML_OP_ROPE: - func = 
ggml_sycl_rope; + ggml_sycl_rope(ctx, dst); break; case GGML_OP_IM2COL: - func = ggml_sycl_im2col; + ggml_sycl_im2col(ctx, dst); break; case GGML_OP_POOL_2D: - func = ggml_sycl_pool2d; + ggml_sycl_pool2d(ctx, dst); break; case GGML_OP_SUM: - func = ggml_sycl_sum; + ggml_sycl_sum(ctx, dst); break; case GGML_OP_SUM_ROWS: - func = ggml_sycl_sum_rows; + ggml_sycl_sum_rows(ctx, dst); break; case GGML_OP_ARGSORT: - func = ggml_sycl_argsort; + ggml_sycl_argsort(ctx, dst); break; case GGML_OP_TIMESTEP_EMBEDDING: - func = ggml_sycl_op_timestep_embedding; + ggml_sycl_op_timestep_embedding(ctx, dst); break; case GGML_OP_RWKV_WKV6: - func = ggml_sycl_op_rwkv_wkv6; + ggml_sycl_op_rwkv_wkv6(ctx, dst); break; default: return false; } - if (tensor->src[0] != nullptr && ggml_backend_buffer_is_sycl_split(tensor->src[0]->buffer)) { - ggml_sycl_set_peer_access(tensor->src[1]->ne[1], ctx.device); - } - - func(ctx, tensor->src[0], tensor->src[1], tensor); return true; } diff --git a/ggml/src/ggml-sycl/outprod.cpp b/ggml/src/ggml-sycl/outprod.cpp index ef9af0b7633ab..8e8347ff4f95e 100644 --- a/ggml/src/ggml-sycl/outprod.cpp +++ b/ggml/src/ggml-sycl/outprod.cpp @@ -3,9 +3,9 @@ #include "outprod.hpp" -void ggml_sycl_op_out_prod(ggml_backend_sycl_context& ctx, const ggml_tensor* src0, - const ggml_tensor* src1, ggml_tensor* dst) { - +void ggml_sycl_op_out_prod(ggml_backend_sycl_context& ctx, ggml_tensor* dst) { + const ggml_tensor *src0 = dst->src[0]; + const ggml_tensor *src1 = dst->src[1]; GGML_ASSERT(src0->type == GGML_TYPE_F32); GGML_ASSERT(src1->type == GGML_TYPE_F32); diff --git a/ggml/src/ggml-sycl/outprod.hpp b/ggml/src/ggml-sycl/outprod.hpp index 9c042738a480e..f50413d3f7a28 100644 --- a/ggml/src/ggml-sycl/outprod.hpp +++ b/ggml/src/ggml-sycl/outprod.hpp @@ -3,8 +3,7 @@ #include "common.hpp" -void ggml_sycl_op_out_prod(ggml_backend_sycl_context& ctx, const ggml_tensor* src0, - const ggml_tensor* src1, ggml_tensor* dst); +void ggml_sycl_op_out_prod(ggml_backend_sycl_context& ctx, ggml_tensor* dst); #endif // GGML_SYCL_OUTPROD_HPP diff --git a/ggml/src/ggml-sycl/tsembd.cpp b/ggml/src/ggml-sycl/tsembd.cpp index 2ffe3cca91725..b877d18c1730a 100644 --- a/ggml/src/ggml-sycl/tsembd.cpp +++ b/ggml/src/ggml-sycl/tsembd.cpp @@ -55,8 +55,9 @@ static void timestep_embedding_f32_sycl( }); } -void ggml_sycl_op_timestep_embedding(ggml_backend_sycl_context & ctx, const ggml_tensor *src0, - const ggml_tensor *src1, ggml_tensor * dst) { +void ggml_sycl_op_timestep_embedding(ggml_backend_sycl_context & ctx, ggml_tensor * dst) { + const ggml_tensor *src0 = dst->src[0]; + const ggml_tensor *src1 = dst->src[1]; const float * src0_d = (const float *)src0->data; float * dst_d = (float *)dst->data; dpct::queue_ptr stream = ctx.stream(); diff --git a/ggml/src/ggml-sycl/tsembd.hpp b/ggml/src/ggml-sycl/tsembd.hpp index ff854c337c344..4c18748bbffc2 100644 --- a/ggml/src/ggml-sycl/tsembd.hpp +++ b/ggml/src/ggml-sycl/tsembd.hpp @@ -15,7 +15,6 @@ #include "common.hpp" -void ggml_sycl_op_timestep_embedding(ggml_backend_sycl_context & ctx, const ggml_tensor *src0, - const ggml_tensor *src1, ggml_tensor * dst); +void ggml_sycl_op_timestep_embedding(ggml_backend_sycl_context & ctx, ggml_tensor * dst); #endif // GGML_SYCL_TSEMBD_HPP diff --git a/ggml/src/ggml-sycl/wkv6.cpp b/ggml/src/ggml-sycl/wkv6.cpp index 105db6f030c59..b54c20964ed5d 100644 --- a/ggml/src/ggml-sycl/wkv6.cpp +++ b/ggml/src/ggml-sycl/wkv6.cpp @@ -95,8 +95,10 @@ static void rwkv_wkv_f32_kernel( } } -void ggml_sycl_op_rwkv_wkv6(ggml_backend_sycl_context& ctx, const 
ggml_tensor* src0, - const ggml_tensor* src1, ggml_tensor* dst) { +void ggml_sycl_op_rwkv_wkv6(ggml_backend_sycl_context& ctx, ggml_tensor* dst) { + + const ggml_tensor *src0 = dst->src[0]; + const ggml_tensor *src1 = dst->src[1]; const float* k_d = (const float*)dst->src[0]->data; const float* v_d = (const float*)dst->src[1]->data; @@ -107,9 +109,9 @@ void ggml_sycl_op_rwkv_wkv6(ggml_backend_sycl_context& ctx, const ggml_tensor* s float* dst_d = (float*)dst->data; const int64_t B = dst->src[5]->ne[1]; - const int64_t T = dst->src[0]->ne[3]; + const int64_t T = dst->src[0]->ne[2]; const int64_t C = dst->ne[0]; - const int64_t H = dst->src[0]->ne[2]; + const int64_t H = dst->src[0]->ne[1]; GGML_ASSERT(dst->src[5]->type == GGML_TYPE_F32); GGML_ASSERT(C % H == 0); diff --git a/ggml/src/ggml-sycl/wkv6.hpp b/ggml/src/ggml-sycl/wkv6.hpp index ddfa3377b4824..8c596a9972220 100644 --- a/ggml/src/ggml-sycl/wkv6.hpp +++ b/ggml/src/ggml-sycl/wkv6.hpp @@ -3,8 +3,7 @@ #include "common.hpp" -void ggml_sycl_op_rwkv_wkv6(ggml_backend_sycl_context & ctx, const ggml_tensor *src0, - const ggml_tensor *src1, ggml_tensor * dst); +void ggml_sycl_op_rwkv_wkv6(ggml_backend_sycl_context & ctx, ggml_tensor * dst); #endif // GGML_SYCL_WKV6_HPP diff --git a/ggml/src/ggml-vulkan/ggml-vulkan.cpp b/ggml/src/ggml-vulkan/ggml-vulkan.cpp index 0774524242a6b..649146d7b4530 100644 --- a/ggml/src/ggml-vulkan/ggml-vulkan.cpp +++ b/ggml/src/ggml-vulkan/ggml-vulkan.cpp @@ -2277,6 +2277,7 @@ static vk_device ggml_vk_get_device(size_t idx) { if (device->subgroup_size_control) { device->subgroup_min_size = subgroup_size_control_props.minSubgroupSize; device->subgroup_max_size = subgroup_size_control_props.maxSubgroupSize; + device_extensions.push_back("VK_EXT_subgroup_size_control"); } device->subgroup_size_control = device->subgroup_size_control && @@ -2285,7 +2286,6 @@ static vk_device ggml_vk_get_device(size_t idx) { if (device->subgroup_size_control) { device->subgroup_require_full_support = subgroup_size_control_features.computeFullSubgroups; - device_extensions.push_back("VK_EXT_subgroup_size_control"); } #if defined(VK_KHR_cooperative_matrix) @@ -5633,9 +5633,9 @@ static void ggml_vk_op_f32_rwkv6(ggml_backend_vk_context * ctx, vk_context& subc } static void ggml_vk_rwkv_wkv6(ggml_backend_vk_context * ctx, vk_context& subctx, ggml_tensor * dst, bool dryrun = false) { - const size_t seq_length = dst->src[0]->ne[3]; + const size_t seq_length = dst->src[0]->ne[2]; const size_t n_embed = dst->ne[0]; - const size_t n_heads = dst->src[0]->ne[2]; + const size_t n_heads = dst->src[0]->ne[1]; const size_t n_seqs = dst->src[5]->ne[1]; ggml_vk_op_f32_rwkv6( diff --git a/ggml/src/ggml-vulkan/vulkan-shaders/mul_mat_vec.comp b/ggml/src/ggml-vulkan/vulkan-shaders/mul_mat_vec.comp index 24875cdcf4c98..53902858de7da 100644 --- a/ggml/src/ggml-vulkan/vulkan-shaders/mul_mat_vec.comp +++ b/ggml/src/ggml-vulkan/vulkan-shaders/mul_mat_vec.comp @@ -1,9 +1,6 @@ #version 450 -#ifdef FLOAT16 -#extension GL_EXT_shader_explicit_arithmetic_types_float16 : require -#endif -#extension GL_EXT_shader_explicit_arithmetic_types : require +#extension GL_EXT_shader_explicit_arithmetic_types_int32 : require #include "mul_mat_vec_base.comp" @@ -27,8 +24,8 @@ void iter(inout FLOAT_TYPE temp[NUM_COLS][NUM_ROWS], const uint first_row, const #if K_PER_ITER == 8 #if QUANT_R == 2 - const B_TYPE_VEC4 bv02 = data_b_v4[(j*p.batch_stride_b + b_offset + iybs + iqs) / 4]; - const B_TYPE_VEC4 bv13 = data_b_v4[(j*p.batch_stride_b + b_offset + iybs + iqs + y_offset) / 4]; + const 
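Note on the shape change driving the SYCL and Vulkan WKV6 edits above: the inputs to GGML_OP_RWKV_WKV6 drop their dummy leading dimension, so the sequence length moves from ne[3] to ne[2] and the head count from ne[2] to ne[1] (the matching asserts in ggml.c appear further down). The following is a minimal numpy sketch of the recurrence these kernels compute, written with an (n_tokens, n_heads, head_size) array layout for readability rather than ggml's actual memory order; the function name and layout are illustrative, not part of the patch.

```python
import numpy as np

def rwkv_wkv6_ref(k, v, r, tf, td, state):
    """Per-head WKV6 recurrence (illustrative layout, not ggml memory order).

    k, v, r, td : (n_tokens, n_heads, head_size)
    tf          : (n_heads, head_size)             # "time_first" bonus
    state       : (n_heads, head_size, head_size)  # rows: key dim, cols: value dim
    returns     : out (n_tokens, n_heads, head_size) and the updated state
    """
    n_tokens, n_heads, head_size = k.shape
    out = np.empty_like(v)
    for t in range(n_tokens):
        for h in range(n_heads):
            kv = np.outer(k[t, h], v[t, h])                      # rank-1 update
            out[t, h] = r[t, h] @ (tf[h][:, None] * kv + state[h])
            state[h] = td[t, h][:, None] * state[h] + kv         # decay, then accumulate
    return out, state
```

The op packs the token outputs and the updated per-sequence state into a single result tensor, which is why the backends read the sequence length and head count from the source shapes rather than from dst.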
vec4 bv02 = vec4(data_b_v4[(j*p.batch_stride_b + b_offset + iybs + iqs) / 4]); + const vec4 bv13 = vec4(data_b_v4[(j*p.batch_stride_b + b_offset + iybs + iqs + y_offset) / 4]); const vec4 bv0 = vec4(bv02.x, bv13.x, bv02.y, bv13.y); const vec4 bv1 = vec4(bv02.z, bv13.z, bv02.w, bv13.w); #else diff --git a/ggml/src/ggml-vulkan/vulkan-shaders/mul_mat_vec_q2_k.comp b/ggml/src/ggml-vulkan/vulkan-shaders/mul_mat_vec_q2_k.comp index 9342134462416..6a9b9b2d132a0 100644 --- a/ggml/src/ggml-vulkan/vulkan-shaders/mul_mat_vec_q2_k.comp +++ b/ggml/src/ggml-vulkan/vulkan-shaders/mul_mat_vec_q2_k.comp @@ -1,5 +1,5 @@ #version 450 -#extension GL_EXT_shader_explicit_arithmetic_types : require +#extension GL_EXT_shader_explicit_arithmetic_types_int32 : require #include "mul_mat_vec_base.comp" @@ -40,9 +40,9 @@ void compute_outputs(const uint32_t first_row, const uint32_t num_rows) { [[unroll]] for (uint n = 0; n < num_rows; ++n) { const uint ib0 = a_offset / QUANT_K + (first_row+n)*num_blocks_per_row; - f16vec2 d = data_a[ib0 + i].d; - const FLOAT_TYPE dall = d.x; - const FLOAT_TYPE dmin = d.y; + vec2 d = vec2(data_a[ib0 + i].d); + const FLOAT_TYPE dall = FLOAT_TYPE(d.x); + const FLOAT_TYPE dmin = FLOAT_TYPE(d.y); uint32_t s0_u32 = data_a_packed32[ib0 + i].scales[s_offset / 4 + 0]; uint32_t s4_u32 = data_a_packed32[ib0 + i].scales[s_offset / 4 + 1]; @@ -63,14 +63,14 @@ void compute_outputs(const uint32_t first_row, const uint32_t num_rows) { uvec2 qs16 = uvec2(unpack8(qs16_u16)); [[unroll]] for (uint j = 0; j < NUM_COLS; ++j) { - B_TYPE_VEC2 b0 = data_b_v2[(j*p.batch_stride_b + b_offset + y_idx) / 2 + 0]; - B_TYPE_VEC2 b16 = data_b_v2[(j*p.batch_stride_b + b_offset + y_idx) / 2 + 8]; - B_TYPE_VEC2 b32 = data_b_v2[(j*p.batch_stride_b + b_offset + y_idx) / 2 + 16]; - B_TYPE_VEC2 b48 = data_b_v2[(j*p.batch_stride_b + b_offset + y_idx) / 2 + 24]; - B_TYPE_VEC2 b64 = data_b_v2[(j*p.batch_stride_b + b_offset + y_idx) / 2 + 32]; - B_TYPE_VEC2 b80 = data_b_v2[(j*p.batch_stride_b + b_offset + y_idx) / 2 + 40]; - B_TYPE_VEC2 b96 = data_b_v2[(j*p.batch_stride_b + b_offset + y_idx) / 2 + 48]; - B_TYPE_VEC2 b112 = data_b_v2[(j*p.batch_stride_b + b_offset + y_idx) / 2 + 56]; + vec2 b0 = vec2(data_b_v2[(j*p.batch_stride_b + b_offset + y_idx) / 2 + 0]); + vec2 b16 = vec2(data_b_v2[(j*p.batch_stride_b + b_offset + y_idx) / 2 + 8]); + vec2 b32 = vec2(data_b_v2[(j*p.batch_stride_b + b_offset + y_idx) / 2 + 16]); + vec2 b48 = vec2(data_b_v2[(j*p.batch_stride_b + b_offset + y_idx) / 2 + 24]); + vec2 b64 = vec2(data_b_v2[(j*p.batch_stride_b + b_offset + y_idx) / 2 + 32]); + vec2 b80 = vec2(data_b_v2[(j*p.batch_stride_b + b_offset + y_idx) / 2 + 40]); + vec2 b96 = vec2(data_b_v2[(j*p.batch_stride_b + b_offset + y_idx) / 2 + 48]); + vec2 b112 = vec2(data_b_v2[(j*p.batch_stride_b + b_offset + y_idx) / 2 + 56]); FLOAT_TYPE sum1 = FLOAT_TYPE(0.0); FLOAT_TYPE sum2 = FLOAT_TYPE(0.0); diff --git a/ggml/src/ggml-vulkan/vulkan-shaders/mul_mat_vec_q3_k.comp b/ggml/src/ggml-vulkan/vulkan-shaders/mul_mat_vec_q3_k.comp index 86b0159d97a89..96ef50fdda2a3 100644 --- a/ggml/src/ggml-vulkan/vulkan-shaders/mul_mat_vec_q3_k.comp +++ b/ggml/src/ggml-vulkan/vulkan-shaders/mul_mat_vec_q3_k.comp @@ -1,5 +1,5 @@ #version 450 -#extension GL_EXT_shader_explicit_arithmetic_types : require +#extension GL_EXT_shader_explicit_arithmetic_types_int32 : require #include "mul_mat_vec_base.comp" @@ -60,14 +60,14 @@ void compute_outputs(const uint32_t first_row, const uint32_t num_rows) { [[unroll]] for (uint j = 0; j < NUM_COLS; ++j) { - B_TYPE_VEC2 b0 = 
data_b_v2[(j*p.batch_stride_b + b_offset + y_idx) / 2 + 0]; - B_TYPE_VEC2 b16 = data_b_v2[(j*p.batch_stride_b + b_offset + y_idx) / 2 + 8]; - B_TYPE_VEC2 b32 = data_b_v2[(j*p.batch_stride_b + b_offset + y_idx) / 2 + 16]; - B_TYPE_VEC2 b48 = data_b_v2[(j*p.batch_stride_b + b_offset + y_idx) / 2 + 24]; - B_TYPE_VEC2 b64 = data_b_v2[(j*p.batch_stride_b + b_offset + y_idx) / 2 + 32]; - B_TYPE_VEC2 b80 = data_b_v2[(j*p.batch_stride_b + b_offset + y_idx) / 2 + 40]; - B_TYPE_VEC2 b96 = data_b_v2[(j*p.batch_stride_b + b_offset + y_idx) / 2 + 48]; - B_TYPE_VEC2 b112 = data_b_v2[(j*p.batch_stride_b + b_offset + y_idx) / 2 + 56]; + vec2 b0 = vec2(data_b_v2[(j*p.batch_stride_b + b_offset + y_idx) / 2 + 0]); + vec2 b16 = vec2(data_b_v2[(j*p.batch_stride_b + b_offset + y_idx) / 2 + 8]); + vec2 b32 = vec2(data_b_v2[(j*p.batch_stride_b + b_offset + y_idx) / 2 + 16]); + vec2 b48 = vec2(data_b_v2[(j*p.batch_stride_b + b_offset + y_idx) / 2 + 24]); + vec2 b64 = vec2(data_b_v2[(j*p.batch_stride_b + b_offset + y_idx) / 2 + 32]); + vec2 b80 = vec2(data_b_v2[(j*p.batch_stride_b + b_offset + y_idx) / 2 + 40]); + vec2 b96 = vec2(data_b_v2[(j*p.batch_stride_b + b_offset + y_idx) / 2 + 48]); + vec2 b112 = vec2(data_b_v2[(j*p.batch_stride_b + b_offset + y_idx) / 2 + 56]); FLOAT_TYPE sum = FLOAT_TYPE(0.0); [[unroll]] for (int l = 0; l < 2; ++l) { diff --git a/ggml/src/ggml-vulkan/vulkan-shaders/mul_mat_vec_q4_k.comp b/ggml/src/ggml-vulkan/vulkan-shaders/mul_mat_vec_q4_k.comp index cd1dd8e89c21e..f97eb8744fb18 100644 --- a/ggml/src/ggml-vulkan/vulkan-shaders/mul_mat_vec_q4_k.comp +++ b/ggml/src/ggml-vulkan/vulkan-shaders/mul_mat_vec_q4_k.comp @@ -1,6 +1,6 @@ #version 450 -#extension GL_EXT_shader_explicit_arithmetic_types : require +#extension GL_EXT_shader_explicit_arithmetic_types_int32 : require #include "mul_mat_vec_base.comp" @@ -45,7 +45,7 @@ void compute_outputs(const uint32_t first_row, const uint32_t num_rows) { [[unroll]] for (uint n = 0; n < num_rows; ++n) { const uint ib0 = a_offset / QUANT_K + (first_row+n)*num_blocks_per_row; - f16vec2 d = data_a[ib0 + i].d; + vec2 d = vec2(data_a[ib0 + i].d); const FLOAT_TYPE dall = FLOAT_TYPE(d.x); const FLOAT_TYPE dmin = FLOAT_TYPE(d.y); @@ -96,10 +96,10 @@ void compute_outputs(const uint32_t first_row, const uint32_t num_rows) { const uint32_t q4_15 = qs64_hi4.w; [[unroll]] for (uint j = 0; j < NUM_COLS; ++j) { - B_TYPE_VEC4 by10 = data_b_v4[(j*p.batch_stride_b + b_offset + y1_idx) / 4]; - B_TYPE_VEC4 by132 = data_b_v4[(j*p.batch_stride_b + b_offset + y1_idx) / 4 + 8]; - B_TYPE_VEC4 by20 = data_b_v4[(j*p.batch_stride_b + b_offset + y2_idx) / 4]; - B_TYPE_VEC4 by232 = data_b_v4[(j*p.batch_stride_b + b_offset + y2_idx) / 4 + 8]; + vec4 by10 = vec4(data_b_v4[(j*p.batch_stride_b + b_offset + y1_idx) / 4 ]); + vec4 by132 = vec4(data_b_v4[(j*p.batch_stride_b + b_offset + y1_idx) / 4 + 8]); + vec4 by20 = vec4(data_b_v4[(j*p.batch_stride_b + b_offset + y2_idx) / 4 ]); + vec4 by232 = vec4(data_b_v4[(j*p.batch_stride_b + b_offset + y2_idx) / 4 + 8]); const FLOAT_TYPE sx = fma(FLOAT_TYPE(by10.x), q4_0, fma(FLOAT_TYPE(by10.y), q4_1, fma(FLOAT_TYPE(by10.z), q4_2, FLOAT_TYPE(by10.w) * q4_3))); const FLOAT_TYPE sy = fma(FLOAT_TYPE(by132.x), q4_4, fma(FLOAT_TYPE(by132.y), q4_5, fma(FLOAT_TYPE(by132.z), q4_6, FLOAT_TYPE(by132.w) * q4_7))); diff --git a/ggml/src/ggml-vulkan/vulkan-shaders/mul_mat_vec_q5_k.comp b/ggml/src/ggml-vulkan/vulkan-shaders/mul_mat_vec_q5_k.comp index 0a68891c35a5f..79d7db0e3e64b 100644 --- a/ggml/src/ggml-vulkan/vulkan-shaders/mul_mat_vec_q5_k.comp +++ 
b/ggml/src/ggml-vulkan/vulkan-shaders/mul_mat_vec_q5_k.comp @@ -1,6 +1,6 @@ #version 450 -#extension GL_EXT_shader_explicit_arithmetic_types : require +#extension GL_EXT_shader_explicit_arithmetic_types_int32 : require #include "mul_mat_vec_base.comp" @@ -42,7 +42,7 @@ void compute_outputs(const uint32_t first_row, const uint32_t num_rows) { [[unroll]] for (uint n = 0; n < num_rows; ++n) { const uint ib0 = a_offset / QUANT_K + (first_row+n)*num_blocks_per_row; - f16vec2 d = data_a[ib0 + i].d; + vec2 d = vec2(data_a[ib0 + i].d); const FLOAT_TYPE dall = FLOAT_TYPE(d.x); const FLOAT_TYPE dmin = FLOAT_TYPE(d.y); @@ -105,14 +105,14 @@ void compute_outputs(const uint32_t first_row, const uint32_t num_rows) { const uint32_t q4_15 = qs64_80_hi4.w; [[unroll]] for (uint j = 0; j < NUM_COLS; ++j) { - B_TYPE_VEC2 by10 = data_b_v2[(j*p.batch_stride_b + b_offset + y1_idx) / 2]; - B_TYPE_VEC2 by116 = data_b_v2[(j*p.batch_stride_b + b_offset + y1_idx) / 2 + 8]; - B_TYPE_VEC2 by132 = data_b_v2[(j*p.batch_stride_b + b_offset + y1_idx) / 2 + 16]; - B_TYPE_VEC2 by148 = data_b_v2[(j*p.batch_stride_b + b_offset + y1_idx) / 2 + 24]; - B_TYPE_VEC2 by20 = data_b_v2[(j*p.batch_stride_b + b_offset + y2_idx) / 2]; - B_TYPE_VEC2 by216 = data_b_v2[(j*p.batch_stride_b + b_offset + y2_idx) / 2 + 8]; - B_TYPE_VEC2 by232 = data_b_v2[(j*p.batch_stride_b + b_offset + y2_idx) / 2 + 16]; - B_TYPE_VEC2 by248 = data_b_v2[(j*p.batch_stride_b + b_offset + y2_idx) / 2 + 24]; + vec2 by10 = vec2(data_b_v2[(j*p.batch_stride_b + b_offset + y1_idx) / 2 ]); + vec2 by116 = vec2(data_b_v2[(j*p.batch_stride_b + b_offset + y1_idx) / 2 + 8]); + vec2 by132 = vec2(data_b_v2[(j*p.batch_stride_b + b_offset + y1_idx) / 2 + 16]); + vec2 by148 = vec2(data_b_v2[(j*p.batch_stride_b + b_offset + y1_idx) / 2 + 24]); + vec2 by20 = vec2(data_b_v2[(j*p.batch_stride_b + b_offset + y2_idx) / 2 ]); + vec2 by216 = vec2(data_b_v2[(j*p.batch_stride_b + b_offset + y2_idx) / 2 + 8]); + vec2 by232 = vec2(data_b_v2[(j*p.batch_stride_b + b_offset + y2_idx) / 2 + 16]); + vec2 by248 = vec2(data_b_v2[(j*p.batch_stride_b + b_offset + y2_idx) / 2 + 24]); const FLOAT_TYPE sx = fma(FLOAT_TYPE(by10.x), q4_0, diff --git a/ggml/src/ggml-vulkan/vulkan-shaders/mul_mat_vec_q6_k.comp b/ggml/src/ggml-vulkan/vulkan-shaders/mul_mat_vec_q6_k.comp index 70e13a56bd730..041fd27c12b54 100644 --- a/ggml/src/ggml-vulkan/vulkan-shaders/mul_mat_vec_q6_k.comp +++ b/ggml/src/ggml-vulkan/vulkan-shaders/mul_mat_vec_q6_k.comp @@ -1,6 +1,6 @@ #version 450 -#extension GL_EXT_shader_explicit_arithmetic_types : require +#extension GL_EXT_shader_explicit_arithmetic_types_int32 : require #include "mul_mat_vec_base.comp" @@ -77,10 +77,10 @@ void compute_outputs(const uint32_t first_row, const uint32_t num_rows) { uvec4 q3 = uvec4(unpack8(q3_u32)); [[unroll]] for (uint j = 0; j < NUM_COLS; ++j) { - B_TYPE_VEC4 by0 = data_b_v4[(j*p.batch_stride_b + b_offset + y_idx) / 4]; - B_TYPE_VEC4 by32 = data_b_v4[(j*p.batch_stride_b + b_offset + y_idx) / 4 + 8]; - B_TYPE_VEC4 by64 = data_b_v4[(j*p.batch_stride_b + b_offset + y_idx) / 4 + 16]; - B_TYPE_VEC4 by96 = data_b_v4[(j*p.batch_stride_b + b_offset + y_idx) / 4 + 24]; + vec4 by0 = vec4(data_b_v4[(j*p.batch_stride_b + b_offset + y_idx) / 4 ]); + vec4 by32 = vec4(data_b_v4[(j*p.batch_stride_b + b_offset + y_idx) / 4 + 8]); + vec4 by64 = vec4(data_b_v4[(j*p.batch_stride_b + b_offset + y_idx) / 4 + 16]); + vec4 by96 = vec4(data_b_v4[(j*p.batch_stride_b + b_offset + y_idx) / 4 + 24]); FLOAT_TYPE sum = FLOAT_TYPE(0.0); [[unroll]] for (int l = 0; l < 4; ++l) { diff 
--git a/ggml/src/ggml-vulkan/vulkan-shaders/soft_max.comp b/ggml/src/ggml-vulkan/vulkan-shaders/soft_max.comp index a25808e16568a..51fc2dc7ed406 100644 --- a/ggml/src/ggml-vulkan/vulkan-shaders/soft_max.comp +++ b/ggml/src/ggml-vulkan/vulkan-shaders/soft_max.comp @@ -1,6 +1,5 @@ #version 450 -#extension GL_EXT_shader_explicit_arithmetic_types_float16 : require #extension GL_EXT_control_flow_attributes : enable layout (push_constant) uniform parameter diff --git a/ggml/src/ggml-vulkan/vulkan-shaders/types.comp b/ggml/src/ggml-vulkan/vulkan-shaders/types.comp index eecc47f3a9764..f12e61bbe1052 100644 --- a/ggml/src/ggml-vulkan/vulkan-shaders/types.comp +++ b/ggml/src/ggml-vulkan/vulkan-shaders/types.comp @@ -2,7 +2,10 @@ #if !defined(GGML_TYPES_COMP) #define GGML_TYPES_COMP -#extension GL_EXT_shader_explicit_arithmetic_types : require +#extension GL_EXT_shader_explicit_arithmetic_types_int32 : require +#extension GL_EXT_shader_explicit_arithmetic_types_int16 : require +#extension GL_EXT_shader_explicit_arithmetic_types_int8 : require +#extension GL_EXT_shader_16bit_storage : require #if defined(DATA_A_F32) #define QUANT_K 1 diff --git a/ggml/src/ggml.c b/ggml/src/ggml.c index 90abc6ad45233..da5b817e15637 100644 --- a/ggml/src/ggml.c +++ b/ggml/src/ggml.c @@ -968,6 +968,7 @@ static const char * GGML_OP_NAME[GGML_OP_COUNT] = { "GET_REL_POS", "ADD_REL_POS", "RWKV_WKV6", + "GATED_LINEAR_ATTN", "UNARY", @@ -987,7 +988,7 @@ static const char * GGML_OP_NAME[GGML_OP_COUNT] = { "OPT_STEP_ADAMW", }; -static_assert(GGML_OP_COUNT == 82, "GGML_OP_COUNT != 82"); +static_assert(GGML_OP_COUNT == 83, "GGML_OP_COUNT != 83"); static const char * GGML_OP_SYMBOL[GGML_OP_COUNT] = { "none", @@ -1064,6 +1065,7 @@ static const char * GGML_OP_SYMBOL[GGML_OP_COUNT] = { "get_rel_pos(x)", "add_rel_pos(x)", "rwkv_wkv6(k, v, r, tf, td, s)", + "gated_linear_attn(k, v, q, gate, s)", "unary(x)", @@ -1083,7 +1085,7 @@ static const char * GGML_OP_SYMBOL[GGML_OP_COUNT] = { "adamw(x)", }; -static_assert(GGML_OP_COUNT == 82, "GGML_OP_COUNT != 82"); +static_assert(GGML_OP_COUNT == 83, "GGML_OP_COUNT != 83"); static_assert(GGML_OP_POOL_COUNT == 2, "GGML_OP_POOL_COUNT != 2"); @@ -4629,15 +4631,13 @@ struct ggml_tensor * ggml_rwkv_wkv6( GGML_ASSERT(ggml_is_contiguous(state)); const int64_t S = k->ne[0]; - const int64_t H = k->ne[2]; - const int64_t n_tokens = k->ne[3]; + const int64_t H = k->ne[1]; + const int64_t n_tokens = k->ne[2]; const int64_t n_seqs = state->ne[1]; { - GGML_ASSERT(k->ne[1] == 1); - GGML_ASSERT(v->ne[0] == 1 && v->ne[1] == S && v->ne[2] == H && v->ne[3] == n_tokens); - GGML_ASSERT(r->ne[0] == 1 && r->ne[1] == S && r->ne[2] == H && r->ne[3] == n_tokens); - // TODO: RWKV v4 and v5 - GGML_ASSERT(td->ne[0] == 1 && td->ne[1] == S && td->ne[2] == H && td->ne[3] == n_tokens); + GGML_ASSERT(v->ne[0] == S && v->ne[1] == H && v->ne[2] == n_tokens); + GGML_ASSERT(r->ne[0] == S && r->ne[1] == H && r->ne[2] == n_tokens); + GGML_ASSERT(td->ne[0] == S && td->ne[1] == H && td->ne[2] == n_tokens); GGML_ASSERT(ggml_nelements(state) == S * S * H * n_seqs); } @@ -4656,6 +4656,49 @@ struct ggml_tensor * ggml_rwkv_wkv6( return result; } +// ggml_gated_linear_attn + +struct ggml_tensor * ggml_gated_linear_attn( + struct ggml_context * ctx, + struct ggml_tensor * k, + struct ggml_tensor * v, + struct ggml_tensor * q, + struct ggml_tensor * g, + struct ggml_tensor * state, + float scale) { + GGML_ASSERT(ggml_is_contiguous(k)); + GGML_ASSERT(ggml_is_contiguous(v)); + GGML_ASSERT(ggml_is_contiguous(q)); + 
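The new GGML_OP_GATED_LINEAR_ATTN registered above ("gated_linear_attn(k, v, q, gate, s)") is a linear-attention recurrence in which the per-head state is decayed by an elementwise gate rather than receiving WKV6's "time_first" bonus, and the query is multiplied by a scale factor. Below is a rough numpy sketch of the assumed semantics, using the same illustrative layout as the WKV6 sketch earlier; the exact kernels live in the backends and may order operations differently, so treat this as a reading of the op, not a transcription of it.

```python
import numpy as np

def gated_linear_attn_ref(k, v, q, g, state, scale):
    """Sketch of the gated-linear-attention recurrence (assumed semantics).

    k, v, q, g : (n_tokens, n_heads, head_size)
    state      : (n_heads, head_size, head_size)
    scale      : float applied to the query (the graph code passes head_size ** -0.5)
    """
    n_tokens, n_heads, head_size = k.shape
    out = np.empty_like(v)
    for t in range(n_tokens):
        for h in range(n_heads):
            # the gate decays the running state elementwise along the key dimension,
            # then a rank-1 (k, v) update is accumulated before the query readout
            state[h] = g[t, h][:, None] * state[h] + np.outer(k[t, h], v[t, h])
            out[t, h] = (scale * q[t, h]) @ state[h]
    return out, state
```

As with rwkv_wkv6, the constructor below concatenates the token outputs and the new state into one result tensor of ne = { S * H, n_tokens + S * n_seqs, 1, 1 }.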
GGML_ASSERT(ggml_is_contiguous(g)); + GGML_ASSERT(ggml_is_contiguous(state)); + + const int64_t S = k->ne[0]; + const int64_t H = k->ne[1]; + const int64_t n_tokens = k->ne[2]; + const int64_t n_seqs = state->ne[1]; + { + GGML_ASSERT(v->ne[0] == S && v->ne[1] == H && v->ne[2] == n_tokens); + GGML_ASSERT(q->ne[0] == S && q->ne[1] == H && q->ne[2] == n_tokens); + GGML_ASSERT(g->ne[0] == S && g->ne[1] == H && g->ne[2] == n_tokens); + GGML_ASSERT(ggml_nelements(state) == S * S * H * n_seqs); + } + + // concat output and new_state + const int64_t ne[4] = { S * H, n_tokens + S * n_seqs, 1, 1 }; + struct ggml_tensor * result = ggml_new_tensor(ctx, GGML_TYPE_F32, 4, ne); + + ggml_set_op_params_f32(result, 0, scale); + + result->op = GGML_OP_GATED_LINEAR_ATTN; + result->src[0] = k; + result->src[1] = v; + result->src[2] = q; + result->src[3] = g; + result->src[4] = state; + + return result; +} + // ggml_unary static struct ggml_tensor * ggml_unary_impl( diff --git a/gguf-py/gguf/constants.py b/gguf-py/gguf/constants.py index cf05bf47ece08..56aa9288dad40 100644 --- a/gguf-py/gguf/constants.py +++ b/gguf-py/gguf/constants.py @@ -115,6 +115,7 @@ class LLM: TIME_DECAY_EXTRA_DIM = "{arch}.time_decay_extra_dim" RESIDUAL_SCALE = "{arch}.residual_scale" EMBEDDING_SCALE = "{arch}.embedding_scale" + TOKEN_SHIFT_COUNT = "{arch}.token_shift_count" class Attention: HEAD_COUNT = "{arch}.attention.head_count" @@ -255,6 +256,7 @@ class MODEL_ARCH(IntEnum): GEMMA2 = auto() STARCODER2 = auto() RWKV6 = auto() + RWKV6QWEN2 = auto() MAMBA = auto() XVERSE = auto() COMMAND_R = auto() @@ -334,6 +336,7 @@ class MODEL_TENSOR(IntEnum): TIME_MIX_LERP_V = auto() TIME_MIX_LERP_R = auto() TIME_MIX_LERP_G = auto() + TIME_MIX_LERP_FUSED = auto() TIME_MIX_LERP_W = auto() TIME_MIX_FIRST = auto() TIME_MIX_DECAY = auto() @@ -440,6 +443,7 @@ class MODEL_TENSOR(IntEnum): MODEL_ARCH.GEMMA2: "gemma2", MODEL_ARCH.STARCODER2: "starcoder2", MODEL_ARCH.RWKV6: "rwkv6", + MODEL_ARCH.RWKV6QWEN2: "rwkv6qwen2", MODEL_ARCH.MAMBA: "mamba", MODEL_ARCH.XVERSE: "xverse", MODEL_ARCH.COMMAND_R: "command-r", @@ -519,6 +523,7 @@ class MODEL_TENSOR(IntEnum): MODEL_TENSOR.TIME_MIX_LERP_V: "blk.{bid}.time_mix_lerp_v", MODEL_TENSOR.TIME_MIX_LERP_R: "blk.{bid}.time_mix_lerp_r", MODEL_TENSOR.TIME_MIX_LERP_G: "blk.{bid}.time_mix_lerp_g", + MODEL_TENSOR.TIME_MIX_LERP_FUSED: "blk.{bid}.time_mix_lerp_fused", MODEL_TENSOR.TIME_MIX_LERP_W: "blk.{bid}.time_mix_lerp_w", MODEL_TENSOR.TIME_MIX_FIRST: "blk.{bid}.time_mix_first", MODEL_TENSOR.TIME_MIX_DECAY: "blk.{bid}.time_mix_decay", @@ -1103,6 +1108,7 @@ class MODEL_TENSOR(IntEnum): MODEL_TENSOR.TIME_MIX_LERP_R, MODEL_TENSOR.TIME_MIX_LERP_G, MODEL_TENSOR.TIME_MIX_LERP_W, + MODEL_TENSOR.TIME_MIX_LERP_FUSED, MODEL_TENSOR.TIME_MIX_FIRST, MODEL_TENSOR.TIME_MIX_DECAY, MODEL_TENSOR.TIME_MIX_DECAY_W1, @@ -1119,6 +1125,35 @@ class MODEL_TENSOR(IntEnum): MODEL_TENSOR.CHANNEL_MIX_RECEPTANCE, MODEL_TENSOR.CHANNEL_MIX_VALUE, ], + MODEL_ARCH.RWKV6QWEN2: [ + MODEL_TENSOR.TOKEN_EMBD, + MODEL_TENSOR.OUTPUT_NORM, + MODEL_TENSOR.OUTPUT, + MODEL_TENSOR.ATTN_NORM, + MODEL_TENSOR.TIME_MIX_W1, + MODEL_TENSOR.TIME_MIX_W2, + MODEL_TENSOR.TIME_MIX_LERP_X, + MODEL_TENSOR.TIME_MIX_LERP_K, + MODEL_TENSOR.TIME_MIX_LERP_V, + MODEL_TENSOR.TIME_MIX_LERP_R, + MODEL_TENSOR.TIME_MIX_LERP_G, + MODEL_TENSOR.TIME_MIX_LERP_W, + MODEL_TENSOR.TIME_MIX_LERP_FUSED, + MODEL_TENSOR.TIME_MIX_FIRST, + MODEL_TENSOR.TIME_MIX_DECAY, + MODEL_TENSOR.TIME_MIX_DECAY_W1, + MODEL_TENSOR.TIME_MIX_DECAY_W2, + MODEL_TENSOR.TIME_MIX_KEY, + MODEL_TENSOR.TIME_MIX_VALUE, + 
MODEL_TENSOR.TIME_MIX_RECEPTANCE, + MODEL_TENSOR.TIME_MIX_GATE, + MODEL_TENSOR.TIME_MIX_LN, + MODEL_TENSOR.TIME_MIX_OUTPUT, + MODEL_TENSOR.FFN_NORM, + MODEL_TENSOR.FFN_GATE, + MODEL_TENSOR.FFN_DOWN, + MODEL_TENSOR.FFN_UP, + ], MODEL_ARCH.MAMBA: [ MODEL_TENSOR.TOKEN_EMBD, MODEL_TENSOR.OUTPUT_NORM, diff --git a/gguf-py/gguf/gguf_writer.py b/gguf-py/gguf/gguf_writer.py index 4a0a65e3cc33e..bf851c92ca548 100644 --- a/gguf-py/gguf/gguf_writer.py +++ b/gguf-py/gguf/gguf_writer.py @@ -743,6 +743,9 @@ def add_embedding_scale(self, value: float) -> None: def add_wkv_head_size(self, size: int) -> None: self.add_uint32(Keys.WKV.HEAD_SIZE.format(arch=self.arch), size) + def add_token_shift_count(self, count: int) -> None: + self.add_uint32(Keys.LLM.TOKEN_SHIFT_COUNT.format(arch=self.arch), count) + def add_layer_norm_eps(self, value: float) -> None: self.add_float32(Keys.Attention.LAYERNORM_EPS.format(arch=self.arch), value) diff --git a/gguf-py/gguf/tensor_mapping.py b/gguf-py/gguf/tensor_mapping.py index 7616c468a5301..617791e240b60 100644 --- a/gguf-py/gguf/tensor_mapping.py +++ b/gguf-py/gguf/tensor_mapping.py @@ -13,7 +13,7 @@ class TensorNameMap: "transformer.wte", # gpt2 gpt-j mpt refact qwen dbrx jais exaone "transformer.word_embeddings", # falcon "word_embeddings", # bloom - "model.embed_tokens", # llama-hf nemotron olmoe olmo2 + "model.embed_tokens", # llama-hf nemotron olmoe olmo2 rwkv6qwen2 "tok_embeddings", # llama-pth "embeddings.word_embeddings", # bert nomic-bert "language_model.embedding.word_embeddings", # persimmon @@ -464,34 +464,42 @@ class TensorNameMap: MODEL_TENSOR.TIME_MIX_W1: ( "rwkv.blocks.{bid}.attention.time_maa_w1", # rwkv v6 + "model.layers.{bid}.self_attn.time_maa_w1", # rwkv6qwen2 ), MODEL_TENSOR.TIME_MIX_W2: ( "rwkv.blocks.{bid}.attention.time_maa_w2", # rwkv v6 + "model.layers.{bid}.self_attn.time_maa_w2", # rwkv6qwen2 ), MODEL_TENSOR.TIME_MIX_LERP_X: ( "rwkv.blocks.{bid}.attention.time_maa_x", # rwkv v6 + "model.layers.{bid}.self_attn.time_maa_x", # rwkv6qwen2 ), MODEL_TENSOR.TIME_MIX_LERP_K: ( "rwkv.blocks.{bid}.attention.time_maa_k", # rwkv v6 + "model.layers.{bid}.self_attn.time_maa_k", # rwkv6qwen2 ), MODEL_TENSOR.TIME_MIX_LERP_V: ( "rwkv.blocks.{bid}.attention.time_maa_v", # rwkv v6 + "model.layers.{bid}.self_attn.time_maa_v", # rwkv6qwen2 ), MODEL_TENSOR.TIME_MIX_LERP_R: ( "rwkv.blocks.{bid}.attention.time_maa_r", # rwkv v6 + "model.layers.{bid}.self_attn.time_maa_r", # rwkv6qwen2 ), MODEL_TENSOR.TIME_MIX_LERP_G: ( "rwkv.blocks.{bid}.attention.time_maa_g", # rwkv v6 + "model.layers.{bid}.self_attn.time_maa_g", # rwkv6qwen2 ), MODEL_TENSOR.TIME_MIX_LERP_W: ( "rwkv.blocks.{bid}.attention.time_maa_w", # rwkv v6 + "model.layers.{bid}.self_attn.time_maa_w", # rwkv6qwen2 ), MODEL_TENSOR.TIME_MIX_FIRST: ( @@ -500,30 +508,37 @@ class TensorNameMap: MODEL_TENSOR.TIME_MIX_DECAY: ( "rwkv.blocks.{bid}.attention.time_decay", # rwkv v6 + "model.layers.{bid}.self_attn.time_decay", # rwkv6qwen2 ), MODEL_TENSOR.TIME_MIX_DECAY_W1: ( "rwkv.blocks.{bid}.attention.time_decay_w1", # rwkv v6 + "model.layers.{bid}.self_attn.time_decay_w1", # rwkv6qwen2 ), MODEL_TENSOR.TIME_MIX_DECAY_W2: ( "rwkv.blocks.{bid}.attention.time_decay_w2", # rwkv v6 + "model.layers.{bid}.self_attn.time_decay_w2", # rwkv6qwen2 ), MODEL_TENSOR.TIME_MIX_KEY: ( - "rwkv.blocks.{bid}.attention.key", # rwkv + "rwkv.blocks.{bid}.attention.key", # rwkv + "model.layers.{bid}.self_attn.k_proj", # rwkv6qwen2 ), MODEL_TENSOR.TIME_MIX_VALUE: ( - "rwkv.blocks.{bid}.attention.value", # rwkv + 
"rwkv.blocks.{bid}.attention.value", # rwkv + "model.layers.{bid}.self_attn.v_proj", # rwkv6qwen2 ), MODEL_TENSOR.TIME_MIX_RECEPTANCE: ( "rwkv.blocks.{bid}.attention.receptance", # rwkv + "model.layers.{bid}.self_attn.q_proj", # rwkv6qwen2 ), MODEL_TENSOR.TIME_MIX_GATE: ( - "rwkv.blocks.{bid}.attention.gate", # rwkv + "rwkv.blocks.{bid}.attention.gate", # rwkv + "model.layers.{bid}.self_attn.gate", # rwkv6qwen2 ), MODEL_TENSOR.TIME_MIX_LN: ( @@ -531,7 +546,8 @@ class TensorNameMap: ), MODEL_TENSOR.TIME_MIX_OUTPUT: ( - "rwkv.blocks.{bid}.attention.output", # rwkv + "rwkv.blocks.{bid}.attention.output", # rwkv + "model.layers.{bid}.self_attn.o_proj", # rwkv6qwen2 ), MODEL_TENSOR.CHANNEL_MIX_LERP_K: ( diff --git a/src/llama-arch.cpp b/src/llama-arch.cpp index eef66ed311d7a..7300bd26a907f 100644 --- a/src/llama-arch.cpp +++ b/src/llama-arch.cpp @@ -57,6 +57,7 @@ static const std::map LLM_ARCH_NAMES = { { LLM_ARCH_NEMOTRON, "nemotron" }, { LLM_ARCH_EXAONE, "exaone" }, { LLM_ARCH_RWKV6, "rwkv6" }, + { LLM_ARCH_RWKV6QWEN2, "rwkv6qwen2" }, { LLM_ARCH_GRANITE, "granite" }, { LLM_ARCH_GRANITE_MOE, "granitemoe" }, { LLM_ARCH_CHAMELEON, "chameleon" }, @@ -106,6 +107,7 @@ static const std::map LLM_KV_NAMES = { { LLM_KV_TIME_DECAY_EXTRA_DIM, "%s.time_decay_extra_dim" }, { LLM_KV_RESIDUAL_SCALE, "%s.residual_scale" }, { LLM_KV_EMBEDDING_SCALE, "%s.embedding_scale" }, + { LLM_KV_TOKEN_SHIFT_COUNT, "%s.token_shift_count" }, { LLM_KV_ATTENTION_HEAD_COUNT, "%s.attention.head_count" }, { LLM_KV_ATTENTION_HEAD_COUNT_KV, "%s.attention.head_count_kv" }, @@ -1166,6 +1168,7 @@ static const std::map> LLM_TENSOR_N { LLM_TENSOR_TIME_MIX_LERP_V, "blk.%d.time_mix_lerp_v" }, { LLM_TENSOR_TIME_MIX_LERP_R, "blk.%d.time_mix_lerp_r" }, { LLM_TENSOR_TIME_MIX_LERP_G, "blk.%d.time_mix_lerp_g" }, + { LLM_TENSOR_TIME_MIX_LERP_FUSED, "blk.%d.time_mix_lerp_fused" }, { LLM_TENSOR_TIME_MIX_FIRST, "blk.%d.time_mix_first" }, { LLM_TENSOR_TIME_MIX_DECAY, "blk.%d.time_mix_decay" }, { LLM_TENSOR_TIME_MIX_DECAY_W1, "blk.%d.time_mix_decay_w1" }, @@ -1183,6 +1186,32 @@ static const std::map> LLM_TENSOR_N { LLM_TENSOR_CHANNEL_MIX_RECEPTANCE, "blk.%d.channel_mix_receptance" }, }, }, + { + LLM_ARCH_RWKV6QWEN2, + { + { LLM_TENSOR_TOKEN_EMBD, "token_embd" }, + { LLM_TENSOR_OUTPUT_NORM, "output_norm" }, + { LLM_TENSOR_OUTPUT, "output" }, + { LLM_TENSOR_ATTN_NORM, "blk.%d.attn_norm" }, + { LLM_TENSOR_TIME_MIX_W1, "blk.%d.time_mix_w1" }, + { LLM_TENSOR_TIME_MIX_W2, "blk.%d.time_mix_w2" }, + { LLM_TENSOR_TIME_MIX_LERP_X, "blk.%d.time_mix_lerp_x" }, + { LLM_TENSOR_TIME_MIX_LERP_FUSED, "blk.%d.time_mix_lerp_fused" }, + { LLM_TENSOR_TIME_MIX_FIRST, "blk.%d.time_mix_first" }, + { LLM_TENSOR_TIME_MIX_DECAY, "blk.%d.time_mix_decay" }, + { LLM_TENSOR_TIME_MIX_DECAY_W1, "blk.%d.time_mix_decay_w1" }, + { LLM_TENSOR_TIME_MIX_DECAY_W2, "blk.%d.time_mix_decay_w2" }, + { LLM_TENSOR_TIME_MIX_KEY, "blk.%d.time_mix_key" }, + { LLM_TENSOR_TIME_MIX_VALUE, "blk.%d.time_mix_value" }, + { LLM_TENSOR_TIME_MIX_RECEPTANCE, "blk.%d.time_mix_receptance" }, + { LLM_TENSOR_TIME_MIX_GATE, "blk.%d.time_mix_gate" }, + { LLM_TENSOR_TIME_MIX_OUTPUT, "blk.%d.time_mix_output" }, + { LLM_TENSOR_FFN_NORM, "blk.%d.ffn_norm" }, + { LLM_TENSOR_FFN_GATE, "blk.%d.ffn_gate" }, + { LLM_TENSOR_FFN_DOWN, "blk.%d.ffn_down" }, + { LLM_TENSOR_FFN_UP, "blk.%d.ffn_up" }, + }, + }, { LLM_ARCH_GRANITE, { @@ -1365,6 +1394,7 @@ static const std::map LLM_TENSOR_INFOS = { {LLM_TENSOR_TIME_MIX_LERP_V, {LLM_TENSOR_LAYER_REPEATING, GGML_OP_ADD}}, {LLM_TENSOR_TIME_MIX_LERP_R, {LLM_TENSOR_LAYER_REPEATING, 
GGML_OP_ADD}}, {LLM_TENSOR_TIME_MIX_LERP_G, {LLM_TENSOR_LAYER_REPEATING, GGML_OP_ADD}}, + {LLM_TENSOR_TIME_MIX_LERP_FUSED, {LLM_TENSOR_LAYER_REPEATING, GGML_OP_ADD}}, {LLM_TENSOR_TIME_MIX_DECAY, {LLM_TENSOR_LAYER_REPEATING, GGML_OP_ADD}}, {LLM_TENSOR_TIME_MIX_FIRST, {LLM_TENSOR_LAYER_REPEATING, GGML_OP_RWKV_WKV6}}, {LLM_TENSOR_ATTN_NORM, {LLM_TENSOR_LAYER_REPEATING, GGML_OP_MUL}}, diff --git a/src/llama-arch.h b/src/llama-arch.h index 2e5f97b771d0e..79909f03fc522 100644 --- a/src/llama-arch.h +++ b/src/llama-arch.h @@ -61,6 +61,7 @@ enum llm_arch { LLM_ARCH_NEMOTRON, LLM_ARCH_EXAONE, LLM_ARCH_RWKV6, + LLM_ARCH_RWKV6QWEN2, LLM_ARCH_GRANITE, LLM_ARCH_GRANITE_MOE, LLM_ARCH_CHAMELEON, @@ -110,6 +111,7 @@ enum llm_kv { LLM_KV_TIME_DECAY_EXTRA_DIM, LLM_KV_RESIDUAL_SCALE, LLM_KV_EMBEDDING_SCALE, + LLM_KV_TOKEN_SHIFT_COUNT, LLM_KV_ATTENTION_HEAD_COUNT, LLM_KV_ATTENTION_HEAD_COUNT_KV, @@ -253,6 +255,7 @@ enum llm_tensor { LLM_TENSOR_TIME_MIX_LERP_V, LLM_TENSOR_TIME_MIX_LERP_R, LLM_TENSOR_TIME_MIX_LERP_G, + LLM_TENSOR_TIME_MIX_LERP_FUSED, LLM_TENSOR_TIME_MIX_FIRST, LLM_TENSOR_TIME_MIX_DECAY, LLM_TENSOR_TIME_MIX_DECAY_W1, diff --git a/src/llama-hparams.cpp b/src/llama-hparams.cpp index c40534696b65f..ea87b2953d9dd 100644 --- a/src/llama-hparams.cpp +++ b/src/llama-hparams.cpp @@ -52,7 +52,7 @@ uint32_t llama_hparams::n_embd_v_gqa(uint32_t il) const { uint32_t llama_hparams::n_embd_k_s() const { if (wkv_head_size != 0) { // for RWKV models - return 2 * n_embd; + return token_shift_count * n_embd; } // TODO: maybe support other convolution strides than 1 diff --git a/src/llama-hparams.h b/src/llama-hparams.h index a29f20ec49665..3542bef499eac 100644 --- a/src/llama-hparams.h +++ b/src/llama-hparams.h @@ -76,6 +76,7 @@ struct llama_hparams { uint32_t time_mix_extra_dim = 0; uint32_t time_decay_extra_dim = 0; uint32_t wkv_head_size = 0; + uint32_t token_shift_count = 2; float rope_attn_factor = 1.0f; float rope_freq_base_train; diff --git a/src/llama-model.cpp b/src/llama-model.cpp index 7260cb155261b..c056204b0995e 100644 --- a/src/llama-model.cpp +++ b/src/llama-model.cpp @@ -1054,12 +1054,15 @@ void llm_load_hparams(llama_model_loader & ml, llama_model & model) { } } break; case LLM_ARCH_RWKV6: + case LLM_ARCH_RWKV6QWEN2: { - ml.get_key(LLM_KV_ATTENTION_LAYERNORM_EPS, hparams.f_norm_eps); + ml.get_key(LLM_KV_ATTENTION_LAYERNORM_EPS, hparams.f_norm_eps, false); + ml.get_key(LLM_KV_ATTENTION_LAYERNORM_RMS_EPS, hparams.f_norm_rms_eps, false); ml.get_key(LLM_KV_WKV_HEAD_SIZE, hparams.wkv_head_size); ml.get_key(LLM_KV_TIME_MIX_EXTRA_DIM, hparams.time_mix_extra_dim); ml.get_key(LLM_KV_TIME_DECAY_EXTRA_DIM, hparams.time_decay_extra_dim); ml.get_key(LLM_KV_RESCALE_EVERY_N_LAYERS, hparams.rescale_every_n_layers, false); + ml.get_key(LLM_KV_TOKEN_SHIFT_COUNT, hparams.token_shift_count, false); switch (hparams.n_layer) { case 24: model.type = e_model::MODEL_1_6B; break; @@ -1070,6 +1073,7 @@ void llm_load_hparams(llama_model_loader & ml, llama_model & model) { default: model.type = e_model::MODEL_UNKNOWN; } break; case 61: model.type = e_model::MODEL_14B; break; + case 64: model.type = e_model::MODEL_32B; break; default: model.type = e_model::MODEL_UNKNOWN; } } break; @@ -2064,6 +2068,7 @@ enum llama_rope_type llama_rope_type(const struct llama_model * model) { case LLM_ARCH_T5ENCODER: case LLM_ARCH_JAIS: case LLM_ARCH_RWKV6: + case LLM_ARCH_RWKV6QWEN2: case LLM_ARCH_WAVTOKENIZER_DEC: return LLAMA_ROPE_TYPE_NONE; @@ -2208,6 +2213,7 @@ bool llama_model_is_recurrent(const struct llama_model * model) { switch 
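Context for the token_shift_count change just above: RWKV6 carries two token-shift vectors per layer (one for the time mix, one for the channel mix), while an rwkv6qwen2 block only needs the time-mix shift because its feed-forward half is a plain Qwen2 MLP; the build_rwkv6qwen2 graph further down asserts n_embd == n_embd_k_s(), i.e. a single shift vector per layer. A small sketch of how the per-sequence recurrent-state sizes fall out of these hparams; it assumes n_embd_v_s() keeps its existing n_embd * wkv_head_size form for WKV models, and the example numbers are hypothetical, not taken from a specific checkpoint.

```python
def rwkv_state_sizes(n_embd, wkv_head_size, token_shift_count, n_layer):
    """Per-sequence recurrent-state element counts, mirroring llama_hparams.

    token_shift_count: 2 for RWKV6 (time-mix + channel-mix shift),
                       1 for a model with only the time-mix shift.
    """
    n_embd_k_s = token_shift_count * n_embd   # token-shift vectors ("K cache" slice)
    n_embd_v_s = n_embd * wkv_head_size       # per-head (S x S) WKV/GLA state ("V cache" slice)
    return {
        "token_shift_per_layer": n_embd_k_s,
        "wkv_state_per_layer":   n_embd_v_s,
        "total":                 n_layer * (n_embd_k_s + n_embd_v_s),
    }

# hypothetical dimensions, for illustration only
print(rwkv_state_sizes(n_embd=4096, wkv_head_size=64, token_shift_count=1, n_layer=32))
```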
(model->arch) { case LLM_ARCH_MAMBA: return true; case LLM_ARCH_RWKV6: return true; + case LLM_ARCH_RWKV6QWEN2: return true; default: return false; } } diff --git a/src/llama-model.h b/src/llama-model.h index 424cb0f521943..565d2dbdf6ff1 100644 --- a/src/llama-model.h +++ b/src/llama-model.h @@ -241,15 +241,19 @@ struct llama_layer { struct ggml_tensor * time_mix_lerp_v = nullptr; struct ggml_tensor * time_mix_lerp_r = nullptr; struct ggml_tensor * time_mix_lerp_g = nullptr; - - struct ggml_tensor * time_mix_first = nullptr; - struct ggml_tensor * time_mix_decay = nullptr; - struct ggml_tensor * time_mix_decay_w1 = nullptr; - struct ggml_tensor * time_mix_decay_w2 = nullptr; - struct ggml_tensor * time_mix_key = nullptr; - struct ggml_tensor * time_mix_value = nullptr; - struct ggml_tensor * time_mix_receptance = nullptr; - struct ggml_tensor * time_mix_gate = nullptr; + struct ggml_tensor * time_mix_lerp_fused = nullptr; + + struct ggml_tensor * time_mix_first = nullptr; + struct ggml_tensor * time_mix_decay = nullptr; + struct ggml_tensor * time_mix_decay_w1 = nullptr; + struct ggml_tensor * time_mix_decay_w2 = nullptr; + struct ggml_tensor * time_mix_key = nullptr; + struct ggml_tensor * time_mix_key_b = nullptr; + struct ggml_tensor * time_mix_value = nullptr; + struct ggml_tensor * time_mix_value_b = nullptr; + struct ggml_tensor * time_mix_receptance = nullptr; + struct ggml_tensor * time_mix_receptance_b = nullptr; + struct ggml_tensor * time_mix_gate = nullptr; struct ggml_tensor * time_mix_ln = nullptr; struct ggml_tensor * time_mix_ln_b = nullptr; diff --git a/src/llama-quant.cpp b/src/llama-quant.cpp index 466e7bc61b559..a45044f306254 100644 --- a/src/llama-quant.cpp +++ b/src/llama-quant.cpp @@ -620,7 +620,8 @@ static void llama_model_quantize_impl(const std::string & fname_inp, const std:: qs.n_ffn_down = qs.n_ffn_gate = qs.n_ffn_up = (int)model.hparams.n_layer; - // sanity checks + // sanity checks for models that have attention layers + if (qs.n_attention_wv != 0) { const auto & n_head_kv_iter = model.hparams.n_head_kv_arr.begin(); // attention layers have a non-zero number of kv heads @@ -758,6 +759,7 @@ static void llama_model_quantize_impl(const std::string & fname_inp, const std:: quantize &= name.find("time_mix_w2.weight") == std::string::npos; quantize &= name.find("time_mix_decay_w1.weight") == std::string::npos; quantize &= name.find("time_mix_decay_w2.weight") == std::string::npos; + quantize &= name.find("time_mix_lerp_fused.weight") == std::string::npos; // do not quantize relative position bias (T5) quantize &= name.find("attn_rel_b.weight") == std::string::npos; diff --git a/src/llama.cpp b/src/llama.cpp index ae375bcd3c8b1..a364861d3c803 100644 --- a/src/llama.cpp +++ b/src/llama.cpp @@ -134,11 +134,11 @@ static bool weight_buft_supported(const llama_hparams & hparams, ggml_tensor * w const int64_t H = 123; const int64_t n_tokens = 123; const int64_t n_seqs = 123; - ggml_tensor * k = ggml_new_tensor_4d(ctx, GGML_TYPE_F32, S, 1, H, n_tokens); - ggml_tensor * v = ggml_new_tensor_4d(ctx, GGML_TYPE_F32, 1, S, H, n_tokens); - ggml_tensor * r = ggml_new_tensor_4d(ctx, GGML_TYPE_F32, 1, S, H, n_tokens); + ggml_tensor * k = ggml_new_tensor_3d(ctx, GGML_TYPE_F32, S, H, n_tokens); + ggml_tensor * v = ggml_new_tensor_3d(ctx, GGML_TYPE_F32, S, H, n_tokens); + ggml_tensor * r = ggml_new_tensor_3d(ctx, GGML_TYPE_F32, S, H, n_tokens); ggml_tensor * tf = w; - ggml_tensor * td = ggml_new_tensor_4d(ctx, GGML_TYPE_F32, 1, S, H, n_tokens); + ggml_tensor * td = 
ggml_new_tensor_3d(ctx, GGML_TYPE_F32, S, H, n_tokens); ggml_tensor * state = ggml_new_tensor_4d(ctx, GGML_TYPE_F32, S, n_seqs, S, H); op_tensor = ggml_rwkv_wkv6(ctx, k, v, r, tf, td, state); } break; @@ -2186,11 +2186,13 @@ static bool llm_load_tensors( layer.time_mix_w2 = create_tensor(tn(LLM_TENSOR_TIME_MIX_W2, "weight", i), {time_mix_extra_dim, n_embd, 5}, 0); layer.time_mix_lerp_x = create_tensor(tn(LLM_TENSOR_TIME_MIX_LERP_X, "weight", i), {n_embd, 1, 1}, 0); - layer.time_mix_lerp_w = create_tensor(tn(LLM_TENSOR_TIME_MIX_LERP_W, "weight", i), {n_embd, 1, 1}, 0); - layer.time_mix_lerp_k = create_tensor(tn(LLM_TENSOR_TIME_MIX_LERP_K, "weight", i), {n_embd, 1, 1}, 0); - layer.time_mix_lerp_v = create_tensor(tn(LLM_TENSOR_TIME_MIX_LERP_V, "weight", i), {n_embd, 1, 1}, 0); - layer.time_mix_lerp_r = create_tensor(tn(LLM_TENSOR_TIME_MIX_LERP_R, "weight", i), {n_embd, 1, 1}, 0); - layer.time_mix_lerp_g = create_tensor(tn(LLM_TENSOR_TIME_MIX_LERP_G, "weight", i), {n_embd, 1, 1}, 0); + layer.time_mix_lerp_w = create_tensor(tn(LLM_TENSOR_TIME_MIX_LERP_W, "weight", i), {n_embd, 1, 1}, llama_model_loader::TENSOR_NOT_REQUIRED); + layer.time_mix_lerp_k = create_tensor(tn(LLM_TENSOR_TIME_MIX_LERP_K, "weight", i), {n_embd, 1, 1}, llama_model_loader::TENSOR_NOT_REQUIRED); + layer.time_mix_lerp_v = create_tensor(tn(LLM_TENSOR_TIME_MIX_LERP_V, "weight", i), {n_embd, 1, 1}, llama_model_loader::TENSOR_NOT_REQUIRED); + layer.time_mix_lerp_r = create_tensor(tn(LLM_TENSOR_TIME_MIX_LERP_R, "weight", i), {n_embd, 1, 1}, llama_model_loader::TENSOR_NOT_REQUIRED); + layer.time_mix_lerp_g = create_tensor(tn(LLM_TENSOR_TIME_MIX_LERP_G, "weight", i), {n_embd, 1, 1}, llama_model_loader::TENSOR_NOT_REQUIRED); + layer.time_mix_lerp_fused = create_tensor(tn(LLM_TENSOR_TIME_MIX_LERP_FUSED, "weight", i), {n_embd, 1, 1, 5}, llama_model_loader::TENSOR_NOT_REQUIRED); + GGML_ASSERT(!(layer.time_mix_lerp_fused == NULL && layer.time_mix_lerp_w == NULL)); layer.time_mix_first = create_tensor(tn(LLM_TENSOR_TIME_MIX_FIRST, "weight", i), {head_size, n_embd / head_size}, 0); layer.time_mix_decay = create_tensor(tn(LLM_TENSOR_TIME_MIX_DECAY, "weight", i), {n_embd}, 0); @@ -2214,6 +2216,59 @@ static bool llm_load_tensors( } } break; + case LLM_ARCH_RWKV6QWEN2: + { + model.tok_embd = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, 0); + + model.output_norm = create_tensor(tn(LLM_TENSOR_OUTPUT_NORM, "weight"), {n_embd}, 0); + model.output_norm_b = create_tensor(tn(LLM_TENSOR_OUTPUT_NORM, "bias"), {n_embd}, llama_model_loader::TENSOR_NOT_REQUIRED); + model.output = create_tensor(tn(LLM_TENSOR_OUTPUT, "weight"), {n_embd, n_vocab}, 0); + + const int time_mix_extra_dim = hparams.time_mix_extra_dim; + const int time_decay_extra_dim = hparams.time_decay_extra_dim; + const int head_size = hparams.wkv_head_size; + const int attn_hidden_size = n_embd; + const int n_head_kv = hparams.n_head_kv(); + int attn_key_value_size; + if (n_head_kv == 0 || attn_hidden_size / head_size == n_head_kv) { + attn_key_value_size = attn_hidden_size; + } else { + attn_key_value_size = n_head_kv * head_size; + } + + for (int i = 0; i < n_layer; ++i) { + auto & layer = model.layers[i]; + + layer.attn_norm = create_tensor(tn(LLM_TENSOR_ATTN_NORM, "weight", i), {n_embd}, 0); + + layer.time_mix_w1 = create_tensor(tn(LLM_TENSOR_TIME_MIX_W1, "weight", i), {n_embd, time_mix_extra_dim * 5}, 0); + layer.time_mix_w2 = create_tensor(tn(LLM_TENSOR_TIME_MIX_W2, "weight", i), {time_mix_extra_dim, n_embd, 5}, 0); + + layer.time_mix_lerp_x = 
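Unlike the original RWKV6 blocks, the converted Qwen2 attention can use grouped key/value heads, so the k/v projections may be narrower than the hidden size; attn_key_value_size above picks between the full width and n_head_kv * head_size. A tiny sketch of that decision with made-up dimensions (the numbers are illustrative, not read from the QRWKV6 checkpoint):

```python
def attn_key_value_size(n_embd, head_size, n_head_kv):
    """Width of the k/v projections for rwkv6qwen2, mirroring llm_load_tensors."""
    n_head = n_embd // head_size
    if n_head_kv == 0 or n_head_kv == n_head:
        return n_embd                  # full multi-head case: full-width projections
    return n_head_kv * head_size       # grouped-KV case: narrower k/v projections

# illustrative only
print(attn_key_value_size(n_embd=4096, head_size=128, n_head_kv=8))   # -> 1024
print(attn_key_value_size(n_embd=4096, head_size=128, n_head_kv=0))   # -> 4096
```

The time-mix graph later repeats these grouped k/v heads back up to the full head count before the recurrence (the ggml_repeat further down).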
create_tensor(tn(LLM_TENSOR_TIME_MIX_LERP_X, "weight", i), {n_embd, 1, 1}, 0); + layer.time_mix_lerp_fused = create_tensor(tn(LLM_TENSOR_TIME_MIX_LERP_FUSED, "weight", i), {n_embd, 1, 1, 5}, 0); + + layer.time_mix_first = create_tensor(tn(LLM_TENSOR_TIME_MIX_FIRST, "weight", i), {head_size, n_embd / head_size}, llama_model_loader::TENSOR_NOT_REQUIRED); + layer.time_mix_decay = create_tensor(tn(LLM_TENSOR_TIME_MIX_DECAY, "weight", i), {n_embd}, 0); + layer.time_mix_decay_w1 = create_tensor(tn(LLM_TENSOR_TIME_MIX_DECAY_W1, "weight", i), {n_embd, time_decay_extra_dim}, 0); + layer.time_mix_decay_w2 = create_tensor(tn(LLM_TENSOR_TIME_MIX_DECAY_W2, "weight", i), {time_decay_extra_dim, attn_hidden_size}, 0); + layer.time_mix_key = create_tensor(tn(LLM_TENSOR_TIME_MIX_KEY, "weight", i), {n_embd, attn_key_value_size}, 0); + layer.time_mix_value = create_tensor(tn(LLM_TENSOR_TIME_MIX_VALUE, "weight", i), {n_embd, attn_key_value_size}, 0); + layer.time_mix_receptance = create_tensor(tn(LLM_TENSOR_TIME_MIX_RECEPTANCE, "weight", i), {attn_hidden_size, n_embd}, 0); + layer.time_mix_gate = create_tensor(tn(LLM_TENSOR_TIME_MIX_GATE, "weight", i), {attn_hidden_size, n_embd}, 0); + // optional bias tensors + layer.time_mix_key_b = create_tensor(tn(LLM_TENSOR_TIME_MIX_KEY, "bias", i), {attn_key_value_size}, llama_model_loader::TENSOR_NOT_REQUIRED); + layer.time_mix_value_b = create_tensor(tn(LLM_TENSOR_TIME_MIX_VALUE, "bias", i), {attn_key_value_size}, llama_model_loader::TENSOR_NOT_REQUIRED); + layer.time_mix_receptance_b = create_tensor(tn(LLM_TENSOR_TIME_MIX_RECEPTANCE, "bias", i), {attn_hidden_size}, llama_model_loader::TENSOR_NOT_REQUIRED); + + layer.time_mix_output = create_tensor(tn(LLM_TENSOR_TIME_MIX_OUTPUT, "weight", i), {n_embd, attn_hidden_size}, 0); + + layer.ffn_norm = create_tensor(tn(LLM_TENSOR_FFN_NORM, "weight", i), {n_embd}, 0); + + layer.ffn_gate = create_tensor(tn(LLM_TENSOR_FFN_GATE, "weight", i), {n_embd, n_ff}, 0); + layer.ffn_down = create_tensor(tn(LLM_TENSOR_FFN_DOWN, "weight", i), { n_ff, n_embd}, 0); + layer.ffn_up = create_tensor(tn(LLM_TENSOR_FFN_UP, "weight", i), {n_embd, n_ff}, 0); + } + } break; case LLM_ARCH_CHAMELEON: { model.tok_embd = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, 0); @@ -3337,16 +3392,20 @@ static struct ggml_tensor * llm_build_rwkv6_time_mix( const struct llama_layer * layer, struct ggml_tensor * cur, struct ggml_tensor * x_prev, - struct ggml_tensor ** wkv_state) { + struct ggml_tensor ** wkv_state, + size_t wkv_head_size, + size_t head_count_kv) { size_t n_embd = cur->ne[0]; size_t n_seq_tokens = cur->ne[1]; size_t n_seqs = cur->ne[2]; - size_t head_size = layer->time_mix_first->ne[0]; - size_t head_count = layer->time_mix_first->ne[1]; + size_t head_size = wkv_head_size; + size_t head_count = n_embd / head_size; size_t n_tokens = n_seqs * n_seq_tokens; + bool is_qrwkv = layer->time_mix_first == nullptr; + struct ggml_tensor * sx = ggml_sub(ctx, x_prev, cur); sx = ggml_reshape_2d(ctx, sx, n_embd, n_tokens); @@ -3375,69 +3434,64 @@ static struct ggml_tensor * llm_build_rwkv6_time_mix( xxx ); - struct ggml_tensor *mw = ggml_view_2d(ctx, xxx, n_embd, n_tokens, xxx->nb[1], 0); - struct ggml_tensor *mk = ggml_view_2d(ctx, xxx, n_embd, n_tokens, xxx->nb[1], n_embd * n_tokens * sizeof(float)); - struct ggml_tensor *mv = ggml_view_2d(ctx, xxx, n_embd, n_tokens, xxx->nb[1], n_embd * n_tokens * 2 * sizeof(float)); - struct ggml_tensor *mr = ggml_view_2d(ctx, xxx, n_embd, n_tokens, xxx->nb[1], n_embd * n_tokens * 3 * sizeof(float)); - 
struct ggml_tensor *mg = ggml_view_2d(ctx, xxx, n_embd, n_tokens, xxx->nb[1], n_embd * n_tokens * 4 * sizeof(float)); - - struct ggml_tensor * xw = ggml_add( - ctx, - ggml_mul( - ctx, - ggml_add(ctx, mw, layer->time_mix_lerp_w), - sx - ), - cur - ); + struct ggml_tensor *xw, *xk, *xv, *xr, *xg; + if (layer->time_mix_lerp_fused) { + // fusing these weights makes some performance improvement + sx = ggml_reshape_3d(ctx, sx, n_embd, 1, n_tokens); + cur = ggml_reshape_3d(ctx, cur, n_embd, 1, n_tokens); + xxx = ggml_add(ctx, ggml_mul(ctx, ggml_add(ctx, xxx, layer->time_mix_lerp_fused), sx), cur); + xw = ggml_view_2d(ctx, xxx, n_embd, n_tokens, xxx->nb[1], 0); + xk = ggml_view_2d(ctx, xxx, n_embd, n_tokens, xxx->nb[1], n_embd * n_tokens * sizeof(float)); + xv = ggml_view_2d(ctx, xxx, n_embd, n_tokens, xxx->nb[1], n_embd * n_tokens * 2 * sizeof(float)); + xr = ggml_view_2d(ctx, xxx, n_embd, n_tokens, xxx->nb[1], n_embd * n_tokens * 3 * sizeof(float)); + xg = ggml_view_2d(ctx, xxx, n_embd, n_tokens, xxx->nb[1], n_embd * n_tokens * 4 * sizeof(float)); + } else { + // for backward compatibility + xw = ggml_view_2d(ctx, xxx, n_embd, n_tokens, xxx->nb[1], 0); + xk = ggml_view_2d(ctx, xxx, n_embd, n_tokens, xxx->nb[1], n_embd * n_tokens * sizeof(float)); + xv = ggml_view_2d(ctx, xxx, n_embd, n_tokens, xxx->nb[1], n_embd * n_tokens * 2 * sizeof(float)); + xr = ggml_view_2d(ctx, xxx, n_embd, n_tokens, xxx->nb[1], n_embd * n_tokens * 3 * sizeof(float)); + xg = ggml_view_2d(ctx, xxx, n_embd, n_tokens, xxx->nb[1], n_embd * n_tokens * 4 * sizeof(float)); - struct ggml_tensor * xk = ggml_add( - ctx, - ggml_mul( - ctx, - ggml_add(ctx, mk, layer->time_mix_lerp_k), - sx - ), - cur - ); + xw = ggml_add(ctx, ggml_mul(ctx, ggml_add(ctx, xw, layer->time_mix_lerp_w), sx), cur); + xk = ggml_add(ctx, ggml_mul(ctx, ggml_add(ctx, xk, layer->time_mix_lerp_k), sx), cur); + xv = ggml_add(ctx, ggml_mul(ctx, ggml_add(ctx, xv, layer->time_mix_lerp_v), sx), cur); + xr = ggml_add(ctx, ggml_mul(ctx, ggml_add(ctx, xr, layer->time_mix_lerp_r), sx), cur); + xg = ggml_add(ctx, ggml_mul(ctx, ggml_add(ctx, xg, layer->time_mix_lerp_g), sx), cur); + } - struct ggml_tensor * xv = ggml_add( - ctx, - ggml_mul( - ctx, - ggml_add(ctx, mv, layer->time_mix_lerp_v), - sx - ), - cur - ); + struct ggml_tensor * r = llm_build_lora_mm(lctx, ctx, layer->time_mix_receptance, xr); + struct ggml_tensor * k = llm_build_lora_mm(lctx, ctx, layer->time_mix_key, xk); + struct ggml_tensor * v = llm_build_lora_mm(lctx, ctx, layer->time_mix_value, xv); + if (layer->time_mix_receptance_b) { + r = ggml_add(ctx, r, layer->time_mix_receptance_b); + } + if (layer->time_mix_key_b) { + k = ggml_add(ctx, k, layer->time_mix_key_b); + } + if (layer->time_mix_value_b) { + v = ggml_add(ctx, v, layer->time_mix_value_b); + } - struct ggml_tensor * xr = ggml_add( - ctx, - ggml_mul( - ctx, - ggml_add(ctx, mr, layer->time_mix_lerp_r), - sx - ), - cur - ); + struct ggml_tensor * g = llm_build_lora_mm(lctx, ctx, layer->time_mix_gate, xg); + if (is_qrwkv) { + g = ggml_sigmoid(ctx, g); + } else { + g = ggml_silu(ctx, g); + } - struct ggml_tensor * xg = ggml_add( - ctx, - ggml_mul( - ctx, - ggml_add(ctx, mg, layer->time_mix_lerp_g), - sx - ), - cur - ); + if (head_count_kv != head_count) { + GGML_ASSERT(head_count % head_count_kv == 0); + k = ggml_reshape_4d(ctx, k, head_size, 1, head_count_kv, n_tokens); + v = ggml_reshape_4d(ctx, v, head_size, 1, head_count_kv, n_tokens); + struct ggml_tensor * tmp = ggml_new_tensor_4d(ctx, GGML_TYPE_F32, head_size, head_count / head_count_kv, 
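With time_mix_lerp_fused stored as one {n_embd, 1, 1, 5} tensor, the five token-shift interpolations above collapse into a single broadcasted multiply-add over the stacked xxx tensor instead of five separate adds and muls per layer; the unfused branch is kept for backward compatibility with GGUFs that still carry the five separate lerp vectors. A numpy sketch of the equivalence, with illustrative shapes rather than ggml's broadcast layout:

```python
import numpy as np

rng = np.random.default_rng(0)
n_embd, n_tokens = 8, 3

cur  = rng.standard_normal((n_tokens, n_embd))
sx   = rng.standard_normal((n_tokens, n_embd))      # x_prev - cur
xxx  = rng.standard_normal((5, n_tokens, n_embd))   # w1/w2 projection, one slab per w,k,v,r,g
lerp = rng.standard_normal((5, 1, n_embd))          # the five lerp vectors, stacked

# unfused: five separate interpolations
unfused = [(xxx[i] + lerp[i]) * sx + cur for i in range(5)]

# fused: one broadcasted op over the whole stack, then take the five slices
fused = (xxx + lerp) * sx[None, :, :] + cur[None, :, :]

assert all(np.allclose(unfused[i], fused[i]) for i in range(5))
```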
head_count_kv, n_tokens); + k = ggml_repeat(ctx, k, tmp); + v = ggml_repeat(ctx, v, tmp); + } - struct ggml_tensor * r = ggml_reshape_4d(ctx, llm_build_lora_mm(lctx, ctx, layer->time_mix_receptance, xr), head_size, 1, head_count, n_tokens); - struct ggml_tensor * k = ggml_reshape_4d(ctx, llm_build_lora_mm(lctx, ctx, layer->time_mix_key, xk), 1, head_size, head_count, n_tokens); - struct ggml_tensor * v = ggml_reshape_4d(ctx, llm_build_lora_mm(lctx, ctx, layer->time_mix_value, xv), head_size, 1, head_count, n_tokens); - struct ggml_tensor * g = ggml_silu( - ctx, - llm_build_lora_mm(lctx, ctx, layer->time_mix_gate, xg) - ); + k = ggml_reshape_3d(ctx, k, head_size, head_count, n_tokens); + v = ggml_reshape_3d(ctx, v, head_size, head_count, n_tokens); + r = ggml_reshape_3d(ctx, r, head_size, head_count, n_tokens); struct ggml_tensor * w = ggml_mul_mat( ctx, @@ -3448,25 +3502,35 @@ static struct ggml_tensor * llm_build_rwkv6_time_mix( ) ); - w = ggml_add(ctx, w, ggml_reshape_1d(ctx, layer->time_mix_decay, n_embd)); + w = ggml_add(ctx, w, layer->time_mix_decay); w = ggml_exp(ctx, ggml_neg(ctx, ggml_exp(ctx, w))); - w = ggml_reshape_4d(ctx, w, 1, head_size, head_count, n_tokens); + w = ggml_reshape_3d(ctx, w, head_size, head_count, n_tokens); - k = ggml_transpose(ctx, k); - v = ggml_transpose(ctx, v); - r = ggml_transpose(ctx, r); + if (is_qrwkv) { + // k = k * (1 - w) + k = ggml_sub(ctx, k, ggml_mul(ctx, k, w)); + } - struct ggml_tensor * wkv_output = ggml_rwkv_wkv6(ctx, k, v, r, layer->time_mix_first, w, *wkv_state); + struct ggml_tensor * wkv_output; + if (!layer->time_mix_first) { + wkv_output = ggml_gated_linear_attn(ctx, k, v, r, w, *wkv_state, pow(head_size, -0.5f)); + } else { + wkv_output = ggml_rwkv_wkv6(ctx, k, v, r, layer->time_mix_first, w, *wkv_state); + } cur = ggml_view_1d(ctx, wkv_output, n_embd * n_tokens, 0); *wkv_state = ggml_view_1d(ctx, wkv_output, n_embd * head_size * n_seqs, n_embd * n_tokens * sizeof(float)); - // group norm with head_count groups - cur = ggml_reshape_3d(ctx, cur, n_embd / head_count, head_count, n_tokens); - cur = ggml_norm(ctx, cur, 64e-5f); + if (!is_qrwkv) { + // group norm with head_count groups + cur = ggml_reshape_3d(ctx, cur, n_embd / head_count, head_count, n_tokens); + cur = ggml_norm(ctx, cur, 64e-5f); - // Convert back to regular vectors. - cur = ggml_reshape_2d(ctx, cur, n_embd, n_tokens); - cur = ggml_add(ctx, ggml_mul(ctx, cur, layer->time_mix_ln), layer->time_mix_ln_b); + // Convert back to regular vectors. 
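On the QRWKV path (no time_mix_first tensor) the code above squashes the per-channel decay through exp(-exp(·)), which keeps it strictly inside (0, 1), pre-scales the keys by (1 - w), swaps the SiLU gate for a sigmoid, and runs the recurrence through the new gated-linear-attention op with a head_size ** -0.5 query scale. A small numpy sketch of the decay/key preprocessing on that branch; the helper names are made up for illustration and the low-rank decay projection is reduced to a single callable for brevity.

```python
import numpy as np

def qrwkv_decay_branch(xw, k, decay_lora, time_mix_decay):
    """Decay/key preprocessing used on the is_qrwkv path.

    xw             : (n_tokens, n_embd)  token-shift-mixed input for the decay
    k              : (n_tokens, n_embd)  key projection output
    decay_lora     : callable standing in for the time_decay_w1 / time_decay_w2 pair
    time_mix_decay : (n_embd,)           per-channel decay bias
    """
    w = decay_lora(xw) + time_mix_decay
    w = np.exp(-np.exp(w))     # decay lands in (0, 1)
    k = k * (1.0 - w)          # QRWKV: keys are complementary-scaled by the decay
    return w, k                # w and k then feed the gated-linear-attn step

rng = np.random.default_rng(0)
n_tokens, n_embd = 4, 16
lora = lambda x: np.tanh(x @ rng.standard_normal((n_embd, 8))) @ rng.standard_normal((8, n_embd))
w, k = qrwkv_decay_branch(rng.standard_normal((n_tokens, n_embd)),
                          rng.standard_normal((n_tokens, n_embd)),
                          lora, rng.standard_normal(n_embd))
assert np.all((w > 0) & (w < 1))
```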
+ cur = ggml_reshape_2d(ctx, cur, n_embd, n_tokens); + cur = ggml_add(ctx, ggml_mul(ctx, cur, layer->time_mix_ln), layer->time_mix_ln_b); + } else { + cur = ggml_reshape_2d(ctx, cur, n_embd, n_tokens); + } cur = ggml_mul(ctx, cur, g); cur = llm_build_lora_mm(lctx, ctx, layer->time_mix_output, cur); @@ -10048,7 +10112,7 @@ struct llm_build_context { 1 ); - cur = ggml_add(ctx0, cur, llm_build_rwkv6_time_mix(lctx, ctx0, layer, x_norm_att, x_prev, &wkv_states)); + cur = ggml_add(ctx0, cur, llm_build_rwkv6_time_mix(lctx, ctx0, layer, x_norm_att, x_prev, &wkv_states, hparams.wkv_head_size, n_embd / hparams.wkv_head_size)); ggml_build_forward_expand(gf, cur); ggml_build_forward_expand( gf, @@ -10115,6 +10179,118 @@ struct llm_build_context { return gf; } + // ref: https://huggingface.co/recursal/QRWKV6-32B-Instruct-Preview-v0.1/blob/main/modeling_rwkv6qwen2.py + ggml_cgraph * build_rwkv6qwen2() { + ggml_cgraph *gf = ggml_new_graph_custom(ctx0, llama_model_max_nodes(model), false); + + GGML_ASSERT(n_embd == hparams.n_embd_k_s()); + + const int64_t n_seqs = ubatch.n_seqs; + const int64_t n_seq_tokens = ubatch.n_seq_tokens; + const int64_t n_tokens = ubatch.n_tokens; + GGML_ASSERT(n_seqs != 0); + GGML_ASSERT(ubatch.equal_seqs); + GGML_ASSERT(n_tokens == n_seq_tokens * n_seqs); + + struct ggml_tensor * cur; + struct ggml_tensor * inpL; + struct ggml_tensor * state_copy = build_inp_s_copy(); + struct ggml_tensor * state_mask = build_inp_s_mask(); + + inpL = llm_build_inp_embd(ctx0, lctx, hparams, ubatch, model.tok_embd, cb); + + for (int il = 0; il < n_layer; ++il) { + const llama_layer * layer = &model.layers[il]; + + // (ab)using the KV cache to store the states + struct ggml_tensor * token_shift = llm_build_copy_mask_state(ctx0, + gf, kv_self.k_l[il], state_copy, state_mask, + hparams.n_embd_k_s(), kv_self.size, kv_head, n_kv, n_seqs); + struct ggml_tensor * wkv_states = llm_build_copy_mask_state(ctx0, + gf, kv_self.v_l[il], state_copy, state_mask, + hparams.n_embd_v_s(), kv_self.size, kv_head, n_kv, n_seqs); + + cur = ggml_reshape_3d(ctx0, inpL, n_embd, n_seq_tokens, n_seqs); + token_shift = ggml_reshape_3d(ctx0, token_shift, n_embd, 1, n_seqs); + + struct ggml_tensor * x_norm_att = llm_build_norm(ctx0, cur, hparams, layer->attn_norm, layer->attn_norm_b, LLM_NORM_RMS, cb, il); + struct ggml_tensor * x_prev = ggml_concat( + ctx0, + token_shift, + ggml_view_3d(ctx0, x_norm_att, n_embd, n_seq_tokens - 1, n_seqs, x_norm_att->nb[1], x_norm_att->nb[2], 0), + 1 + ); + + ggml_build_forward_expand( + gf, + ggml_cpy( + ctx0, + wkv_states, + ggml_view_1d( + ctx0, + kv_self.v_l[il], + hparams.n_embd_v_s() * n_seqs, + hparams.n_embd_v_s() * kv_head * ggml_element_size(kv_self.v_l[il]) + ) + ) + ); + + struct ggml_tensor * ffn_inp = ggml_add(ctx0, cur, llm_build_rwkv6_time_mix(lctx, ctx0, layer, x_norm_att, x_prev, &wkv_states, hparams.wkv_head_size, hparams.n_head_kv())); + ggml_build_forward_expand(gf, ffn_inp); + ggml_build_forward_expand( + gf, + ggml_cpy( + ctx0, + wkv_states, + ggml_view_1d( + ctx0, + kv_self.v_l[il], + hparams.n_embd_v_s() * n_seqs, + hparams.n_embd_v_s() * kv_head * ggml_element_size(kv_self.v_l[il]) + ) + ) + ); + + cb(ffn_inp, "ffn_inp", il); + + // feed-forward network + cur = llm_build_norm(ctx0, ffn_inp, hparams, + model.layers[il].ffn_norm, NULL, + LLM_NORM_RMS, cb, il); + cb(cur, "ffn_norm", il); + + cur = llm_build_ffn(ctx0, lctx, cur, + model.layers[il].ffn_up, NULL, NULL, + model.layers[il].ffn_gate, NULL, NULL, + model.layers[il].ffn_down, NULL, NULL, + NULL, + LLM_FFN_SILU, 
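Each rwkv6qwen2 layer in the graph above reads its token-shift vector out of the K cache and its WKV/GLA state out of the V cache, builds x_prev as the previous token's normalized activations (with the stored shift vector standing in for token -1), and the rest of the block is a standard Qwen2-style gated SiLU FFN with a residual add. Below is a compact numpy sketch of just the token-shift bookkeeping for a single sequence, which is the piece the ggml_concat/ggml_view_3d calls implement; tensor names are illustrative.

```python
import numpy as np

def token_shift(x_norm, shift_state):
    """x_prev[t] = x_norm[t-1], with the saved shift vector standing in for t = -1.

    x_norm      : (n_seq_tokens, n_embd)  normalized activations of this sequence
    shift_state : (n_embd,)               last token carried over from the previous ubatch
    returns     : x_prev with the same shape as x_norm, plus the shift state to store back
    """
    x_prev = np.concatenate([shift_state[None, :], x_norm[:-1]], axis=0)
    new_shift_state = x_norm[-1]
    return x_prev, new_shift_state
```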
LLM_FFN_PAR, cb, il); + cb(cur, "ffn_out", il); + + cur = ggml_add(ctx0, cur, ffn_inp); + cur = lctx.cvec.apply_to(ctx0, cur, il); + cb(cur, "l_out", il); + + // input for next layer + inpL = cur; + } + + cur = inpL; + struct ggml_tensor * inp_out_ids = build_inp_out_ids(); + cur = ggml_reshape_2d(ctx0, cur, n_embd, n_tokens); + cur = ggml_get_rows(ctx0, cur, inp_out_ids); + + cur = llm_build_norm(ctx0, cur, hparams, model.output_norm, model.output_norm_b, LLM_NORM_RMS, cb, -1); + cb(cur, "result_norm", -1); + + cur = llm_build_lora_mm(lctx, ctx0, model.output, cur); + cb(cur, "result_output", -1); + + ggml_build_forward_expand(gf, cur); + + return gf; + } + // ref: https://github.com/facebookresearch/chameleon // based on the original build_llama() function, changes: // * qk-norm @@ -10724,6 +10900,10 @@ static struct ggml_cgraph * llama_build_graph( { result = llm.build_rwkv6(); } break; + case LLM_ARCH_RWKV6QWEN2: + { + result = llm.build_rwkv6qwen2(); + } break; case LLM_ARCH_CHAMELEON: { result = llm.build_chameleon(); diff --git a/tests/test-backend-ops.cpp b/tests/test-backend-ops.cpp index 1e892f66365e0..3834e0f84aa72 100644 --- a/tests/test-backend-ops.cpp +++ b/tests/test-backend-ops.cpp @@ -1659,17 +1659,46 @@ struct test_rwkv_wkv6 : public test_case { ggml_tensor * build_graph(ggml_context * ctx) override { const int64_t n_tokens = n_seq_tokens * n_seqs; - ggml_tensor * r = ggml_new_tensor(ctx, type, 4, std::vector{ 1, head_size, head_count, n_tokens }.data()); - ggml_tensor * k = ggml_new_tensor(ctx, type, 4, std::vector{ head_size, 1, head_count, n_tokens }.data()); - ggml_tensor * v = ggml_new_tensor(ctx, type, 4, std::vector{ 1, head_size, head_count, n_tokens }.data()); + ggml_tensor * r = ggml_new_tensor(ctx, type, 3, std::vector{ head_size, head_count, n_tokens }.data()); + ggml_tensor * k = ggml_new_tensor(ctx, type, 3, std::vector{ head_size, head_count, n_tokens }.data()); + ggml_tensor * v = ggml_new_tensor(ctx, type, 3, std::vector{ head_size, head_count, n_tokens }.data()); ggml_tensor * tf = ggml_new_tensor(ctx, type, 2, std::vector{ head_size, head_count }.data()); - ggml_tensor * td = ggml_new_tensor(ctx, type, 4, std::vector{ 1, head_size, head_count, n_tokens }.data()); + ggml_tensor * td = ggml_new_tensor(ctx, type, 3, std::vector{ head_size, head_count, n_tokens }.data()); ggml_tensor * s = ggml_new_tensor(ctx, type, 2, std::vector{ head_size * head_size * head_count, n_seqs }.data()); ggml_tensor * out = ggml_rwkv_wkv6(ctx, k, v, r, tf, td, s); return out; } }; +// GGML_OP_GATED_LINEAR_ATTN +struct test_gla : public test_case { + const ggml_type type; + + const int64_t head_count; + const int64_t head_size; + const int64_t n_seq_tokens; + const int64_t n_seqs; + + std::string vars() override { + return VARS_TO_STR5(type, head_count, head_size, n_seq_tokens, n_seqs); + } + + test_gla(ggml_type type = GGML_TYPE_F32, + int64_t head_count = 32, int64_t head_size = 64, int64_t n_seq_tokens = 32, int64_t n_seqs = 32) + : type(type), head_count(head_count), head_size(head_size), n_seq_tokens(n_seq_tokens), n_seqs(n_seqs) {} + + ggml_tensor * build_graph(ggml_context * ctx) override { + const int64_t n_tokens = n_seq_tokens * n_seqs; + ggml_tensor * q = ggml_new_tensor(ctx, type, 3, std::vector{ head_size, head_count, n_tokens }.data()); + ggml_tensor * k = ggml_new_tensor(ctx, type, 3, std::vector{ head_size, head_count, n_tokens }.data()); + ggml_tensor * v = ggml_new_tensor(ctx, type, 3, std::vector{ head_size, head_count, n_tokens }.data()); + ggml_tensor * g = 
ggml_new_tensor(ctx, type, 3, std::vector{ head_size, head_count, n_tokens }.data()); + ggml_tensor * s = ggml_new_tensor(ctx, type, 2, std::vector{ head_size * head_size * head_count, n_seqs }.data()); + ggml_tensor * out = ggml_gated_linear_attn(ctx, k, v, q, g, s, pow(head_size, -0.5)); + return out; + } +}; + // GGML_OP_MUL_MAT struct test_mul_mat : public test_case { const ggml_type type_a; @@ -3626,6 +3655,11 @@ static std::vector> make_test_cases_eval() { test_cases.emplace_back(new test_rwkv_wkv6(GGML_TYPE_F32, 32, 64, 32, 4)); test_cases.emplace_back(new test_rwkv_wkv6(GGML_TYPE_F32, 32, 64, 128, 4)); + test_cases.emplace_back(new test_gla(GGML_TYPE_F32, 32, 64, 1, 1)); + test_cases.emplace_back(new test_gla(GGML_TYPE_F32, 32, 64, 32, 1)); + test_cases.emplace_back(new test_gla(GGML_TYPE_F32, 32, 64, 32, 4)); + test_cases.emplace_back(new test_gla(GGML_TYPE_F32, 32, 64, 128, 4)); + for (int i = 1; i < 9; ++i) { test_cases.emplace_back(new test_mul_mat(GGML_TYPE_F16, GGML_TYPE_F32, 16, i, 256, { 1, 1}, {1, 1})); test_cases.emplace_back(new test_mul_mat(GGML_TYPE_Q4_0, GGML_TYPE_F32, 16, i, 256, { 1, 1}, {1, 1}));