From ec2029986cdea53eaedb2ac28ca9d8be58d30e76 Mon Sep 17 00:00:00 2001 From: donderom <274926+donderom@users.noreply.github.com> Date: Thu, 6 Feb 2025 15:58:43 +0100 Subject: [PATCH] Update to support b4599 libllama release --- src/main/scala/com/donderom/llm4s/Llama.scala | 785 +++++++++++------- src/main/scala/com/donderom/llm4s/Llm.scala | 64 +- .../scala/com/donderom/llm4s/Params.scala | 119 ++- .../scala/com/donderom/llm4s/SlincLlm.scala | 261 +++--- 4 files changed, 701 insertions(+), 528 deletions(-) diff --git a/src/main/scala/com/donderom/llm4s/Llama.scala b/src/main/scala/com/donderom/llm4s/Llama.scala index aa13a9f..1eb575d 100644 --- a/src/main/scala/com/donderom/llm4s/Llama.scala +++ b/src/main/scala/com/donderom/llm4s/Llama.scala @@ -7,26 +7,72 @@ object Llama: type Pos = CInt type Token = CInt type SeqId = CInt - type Ctx = Ptr[Any] + + type Vocab = Ptr[Any] type Model = Ptr[Any] - type Grammar = Ptr[Any] + type Ctx = Ptr[Any] + type Sampler = Ptr[Any] + + type LoraAdapter = Ptr[Any] enum VocabType: - case NONE, SPM, BPE, WPM + case NONE, SPM, BPE, WPM, UGM, RWKV given Transform[VocabType, CInt](VocabType.fromOrdinal, _.ordinal) - enum TokenType: - case UNDEFINED, NORMAL, UNKNOWN, CONTROL, USER_DEFINED, UNUSED, BYTE + enum RopeType(val code: CInt): + case NONE extends RopeType(-1) + case NORM extends RopeType(0) + case NEOX extends RopeType(2) + case MROPE extends RopeType(8) + case VISION extends RopeType(24) - given Transform[TokenType, CInt](TokenType.fromOrdinal, _.ordinal) + given Transform[RopeType, CInt]( + _ match + case RopeType.NONE.code => RopeType.NONE + case RopeType.NORM.code => RopeType.NORM + case RopeType.NEOX.code => RopeType.NEOX + case RopeType.MROPE.code => RopeType.MROPE + case RopeType.VISION.code => RopeType.VISION + , + _.code + ) + + enum TokenAttr(val code: CInt): + case UNDEFINED extends TokenAttr(0) + case UNKNOWN extends TokenAttr(1 << 0) + case UNUSED extends TokenAttr(1 << 1) + case NORMAL extends TokenAttr(1 << 2) + case CONTROL extends TokenAttr(1 << 3) + case USER_DEFINED extends TokenAttr(1 << 4) + case BYTE extends TokenAttr(1 << 5) + case NORMALIZED extends TokenAttr(1 << 6) + case LSTRIP extends TokenAttr(1 << 7) + case RSTRIP extends TokenAttr(1 << 8) + case SINGLE_WORD extends TokenAttr(1 << 9) + + given Transform[TokenAttr, CInt]( + _ match + case TokenAttr.UNDEFINED.code => TokenAttr.UNDEFINED + case TokenAttr.UNKNOWN.code => TokenAttr.UNKNOWN + case TokenAttr.UNUSED.code => TokenAttr.UNUSED + case TokenAttr.NORMAL.code => TokenAttr.NORMAL + case TokenAttr.CONTROL.code => TokenAttr.CONTROL + case TokenAttr.USER_DEFINED.code => TokenAttr.USER_DEFINED + case TokenAttr.BYTE.code => TokenAttr.BYTE + case TokenAttr.NORMALIZED.code => TokenAttr.NORMALIZED + case TokenAttr.LSTRIP.code => TokenAttr.LSTRIP + case TokenAttr.RSTRIP.code => TokenAttr.RSTRIP + case TokenAttr.SINGLE_WORD.code => TokenAttr.SINGLE_WORD + , + _.code + ) enum Ftype(val code: CInt): case ALL_F32 extends Ftype(0) case MOSTLY_F16 extends Ftype(1) case MOSTLY_Q4_0 extends Ftype(2) case MOSTLY_Q4_1 extends Ftype(3) - case MOSTLY_Q4_1_SOME_F16 extends Ftype(4) case MOSTLY_Q8_0 extends Ftype(7) case MOSTLY_Q5_0 extends Ftype(8) case MOSTLY_Q5_1 extends Ftype(9) @@ -53,23 +99,64 @@ object Llama: case MOSTLY_IQ4_XS extends Ftype(30) case MOSTLY_IQ1_M extends Ftype(31) case MOSTLY_BF16 extends Ftype(32) + case MOSTLY_TQ1_0 extends Ftype(36) + case MOSTLY_TQ2_0 extends Ftype(37) case GUESSED extends Ftype(1024) - given Transform[Ftype, CInt](Ftype.fromOrdinal, _.code) + given 
Transform[Ftype, CInt]( + _ match + case Ftype.ALL_F32.code => Ftype.ALL_F32 + case Ftype.MOSTLY_F16.code => Ftype.MOSTLY_F16 + case Ftype.MOSTLY_Q4_0.code => Ftype.MOSTLY_Q4_0 + case Ftype.MOSTLY_Q4_1.code => Ftype.MOSTLY_Q4_1 + case Ftype.MOSTLY_Q8_0.code => Ftype.MOSTLY_Q8_0 + case Ftype.MOSTLY_Q5_0.code => Ftype.MOSTLY_Q5_0 + case Ftype.MOSTLY_Q5_1.code => Ftype.MOSTLY_Q5_1 + case Ftype.MOSTLY_Q2_K.code => Ftype.MOSTLY_Q2_K + case Ftype.MOSTLY_Q3_K_S.code => Ftype.MOSTLY_Q3_K_S + case Ftype.MOSTLY_Q3_K_M.code => Ftype.MOSTLY_Q3_K_M + case Ftype.MOSTLY_Q3_K_L.code => Ftype.MOSTLY_Q3_K_L + case Ftype.MOSTLY_Q4_K_S.code => Ftype.MOSTLY_Q4_K_S + case Ftype.MOSTLY_Q4_K_M.code => Ftype.MOSTLY_Q4_K_M + case Ftype.MOSTLY_Q5_K_S.code => Ftype.MOSTLY_Q5_K_S + case Ftype.MOSTLY_Q5_K_M.code => Ftype.MOSTLY_Q5_K_M + case Ftype.MOSTLY_Q6_K.code => Ftype.MOSTLY_Q6_K + case Ftype.MOSTLY_IQ2_XXS.code => Ftype.MOSTLY_IQ2_XXS + case Ftype.MOSTLY_IQ2_XS.code => Ftype.MOSTLY_IQ2_XS + case Ftype.MOSTLY_Q2_K_S.code => Ftype.MOSTLY_Q2_K_S + case Ftype.MOSTLY_IQ3_XS.code => Ftype.MOSTLY_IQ3_XS + case Ftype.MOSTLY_IQ3_XXS.code => Ftype.MOSTLY_IQ3_XXS + case Ftype.MOSTLY_IQ1_S.code => Ftype.MOSTLY_IQ1_S + case Ftype.MOSTLY_IQ4_NL.code => Ftype.MOSTLY_IQ4_NL + case Ftype.MOSTLY_IQ3_S.code => Ftype.MOSTLY_IQ3_S + case Ftype.MOSTLY_IQ3_M.code => Ftype.MOSTLY_IQ3_M + case Ftype.MOSTLY_IQ2_S.code => Ftype.MOSTLY_IQ2_S + case Ftype.MOSTLY_IQ2_M.code => Ftype.MOSTLY_IQ2_M + case Ftype.MOSTLY_IQ4_XS.code => Ftype.MOSTLY_IQ4_XS + case Ftype.MOSTLY_IQ1_M.code => Ftype.MOSTLY_IQ1_M + case Ftype.MOSTLY_BF16.code => Ftype.MOSTLY_BF16 + case Ftype.MOSTLY_TQ1_0.code => Ftype.MOSTLY_TQ1_0 + case Ftype.MOSTLY_TQ2_0.code => Ftype.MOSTLY_TQ2_0 + case Ftype.GUESSED.code => Ftype.GUESSED + , + _.code + ) enum RopeScalingType(val code: CInt): case UNSPECIFIED extends RopeScalingType(-1) case NONE extends RopeScalingType(0) case LINEAR extends RopeScalingType(1) case YARN extends RopeScalingType(2) - case MAX_VALUE extends RopeScalingType(2) + case LONGROPE extends RopeScalingType(3) + case MAX_VALUE extends RopeScalingType(3) given Transform[RopeScalingType, CInt]( _ match - case 0 => RopeScalingType.NONE - case 1 => RopeScalingType.LINEAR - case 2 => RopeScalingType.YARN - case _ => RopeScalingType.UNSPECIFIED + case RopeScalingType.NONE.code => RopeScalingType.NONE + case RopeScalingType.LINEAR.code => RopeScalingType.LINEAR + case RopeScalingType.YARN.code => RopeScalingType.YARN + case RopeScalingType.LONGROPE.code => RopeScalingType.LONGROPE + case _ => RopeScalingType.UNSPECIFIED , _.code ) @@ -79,13 +166,31 @@ object Llama: case NONE extends PoolingType(0) case MEAN extends PoolingType(1) case CLS extends PoolingType(2) + case LAST extends PoolingType(3) + case RANK extends PoolingType(4) given Transform[PoolingType, CInt]( _ match - case 0 => PoolingType.NONE - case 1 => PoolingType.MEAN - case 2 => PoolingType.CLS - case _ => PoolingType.UNSPECIFIED + case PoolingType.NONE.code => PoolingType.NONE + case PoolingType.MEAN.code => PoolingType.MEAN + case PoolingType.CLS.code => PoolingType.CLS + case PoolingType.LAST.code => PoolingType.LAST + case PoolingType.RANK.code => PoolingType.RANK + case _ => PoolingType.UNSPECIFIED , + _.code + ) + + enum AttentionType(val code: CInt): + case UNSPECIFIED extends AttentionType(-1) + case CAUSAL extends AttentionType(0) + case NON_CAUSAL extends AttentionType(1) + + given Transform[AttentionType, CInt]( + _ match + case AttentionType.UNSPECIFIED.code => AttentionType.UNSPECIFIED +
case AttentionType.CAUSAL.code => AttentionType.CAUSAL + case AttentionType.NON_CAUSAL.code => AttentionType.NON_CAUSAL , _.code ) @@ -100,6 +205,7 @@ object Llama: final case class TokenDataArray( data: Ptr[TokenData], size: SizeT, + selected: CInt, sorted: CBool ) derives Struct @@ -110,10 +216,7 @@ object Llama: pos: Ptr[Pos], n_seq_id: Ptr[CInt], seq_id: Ptr[Ptr[SeqId]], - logits: Ptr[CInt], - all_pos_0: Pos, - all_pos_1: Pos, - all_seq_id: SeqId + logits: Ptr[CInt] ) derives Struct enum ModelKvOverrideType: @@ -131,6 +234,7 @@ object Llama: ) derives Struct final case class ModelParams( + devices: Ptr[Any], n_gpu_layers: CInt, split_mode: SplitMode, main_gpu: CInt, @@ -174,46 +278,49 @@ object Llama: case F64 extends GgmlType(28) case IQ1_M extends GgmlType(29) case BF16 extends GgmlType(30) - case COUNT extends GgmlType(31) + case TQ1_0 extends GgmlType(34) + case TQ2_0 extends GgmlType(35) + case COUNT extends GgmlType(39) given Transform[GgmlType, CInt]( _ match - case 0 => GgmlType.F32 - case 1 => GgmlType.F16 - case 2 => GgmlType.Q4_0 - case 3 => GgmlType.Q4_1 - case 6 => GgmlType.Q5_0 - case 7 => GgmlType.Q5_1 - case 8 => GgmlType.Q8_0 - case 9 => GgmlType.Q8_1 - case 10 => GgmlType.Q2_K - case 11 => GgmlType.Q3_K - case 12 => GgmlType.Q4_K - case 13 => GgmlType.Q5_K - case 14 => GgmlType.Q6_K - case 15 => GgmlType.Q8_K - case 16 => GgmlType.IQ2_XXS - case 17 => GgmlType.IQ2_XS - case 18 => GgmlType.IQ3_XXS - case 19 => GgmlType.IQ1_S - case 20 => GgmlType.IQ4_NL - case 21 => GgmlType.IQ3_S - case 22 => GgmlType.IQ2_S - case 23 => GgmlType.IQ4_XS - case 24 => GgmlType.I8 - case 25 => GgmlType.I16 - case 26 => GgmlType.I32 - case 27 => GgmlType.I64 - case 28 => GgmlType.F64 - case 29 => GgmlType.IQ1_M - case 30 => GgmlType.BF16 - case 31 => GgmlType.COUNT + case GgmlType.F32.code => GgmlType.F32 + case GgmlType.F16.code => GgmlType.F16 + case GgmlType.Q4_0.code => GgmlType.Q4_0 + case GgmlType.Q4_1.code => GgmlType.Q4_1 + case GgmlType.Q5_0.code => GgmlType.Q5_0 + case GgmlType.Q5_1.code => GgmlType.Q5_1 + case GgmlType.Q8_0.code => GgmlType.Q8_0 + case GgmlType.Q8_1.code => GgmlType.Q8_1 + case GgmlType.Q2_K.code => GgmlType.Q2_K + case GgmlType.Q3_K.code => GgmlType.Q3_K + case GgmlType.Q4_K.code => GgmlType.Q4_K + case GgmlType.Q5_K.code => GgmlType.Q5_K + case GgmlType.Q6_K.code => GgmlType.Q6_K + case GgmlType.Q8_K.code => GgmlType.Q8_K + case GgmlType.IQ2_XXS.code => GgmlType.IQ2_XXS + case GgmlType.IQ2_XS.code => GgmlType.IQ2_XS + case GgmlType.IQ3_XXS.code => GgmlType.IQ3_XXS + case GgmlType.IQ1_S.code => GgmlType.IQ1_S + case GgmlType.IQ4_NL.code => GgmlType.IQ4_NL + case GgmlType.IQ3_S.code => GgmlType.IQ3_S + case GgmlType.IQ2_S.code => GgmlType.IQ2_S + case GgmlType.IQ4_XS.code => GgmlType.IQ4_XS + case GgmlType.I8.code => GgmlType.I8 + case GgmlType.I16.code => GgmlType.I16 + case GgmlType.I32.code => GgmlType.I32 + case GgmlType.I64.code => GgmlType.I64 + case GgmlType.F64.code => GgmlType.F64 + case GgmlType.IQ1_M.code => GgmlType.IQ1_M + case GgmlType.BF16.code => GgmlType.BF16 + case GgmlType.TQ1_0.code => GgmlType.TQ1_0 + case GgmlType.TQ2_0.code => GgmlType.TQ2_0 + case GgmlType.COUNT.code => GgmlType.COUNT , _.code ) final case class ContextParams( - seed: CInt, n_ctx: CInt, n_batch: CInt, n_ubatch: CInt, @@ -222,6 +329,7 @@ object Llama: n_threads_batch: CInt, rope_scaling_type: RopeScalingType, pooling_type: PoolingType, + attention_type: AttentionType, rope_freq_base: CFloat, rope_freq_scale: CFloat, yarn_ext_factor: CFloat, @@ -238,6 +346,7 @@ object 
Llama: embeddings: CBool, offload_kqv: CBool, flash_attn: CBool, + no_perf: CBool, abort_callback: Ptr[Any], abort_callback_data: Ptr[Any] ) derives Struct @@ -261,103 +370,85 @@ object Llama: given Transform[NumaStrategy, CInt](NumaStrategy.fromOrdinal, _.ordinal) - enum Gretype: - case END, ALT, RULE_REF, CHAR, CHAR_NOT, CHAR_RNG_UPPER, CHAR_ALT - - given Transform[Gretype, CInt](Gretype.fromOrdinal, _.ordinal) - - final case class GrammarElement(gretype: Gretype, value: CInt) derives Struct - - final case class Timings( - t_start_ms: CDouble, - t_end_ms: CDouble, - t_load_ms: CDouble, - t_sample_ms: CDouble, - t_p_eval_ms: CDouble, - t_eval_ms: CDouble, - n_sample: CInt, - n_p_eval: CInt, - n_eval: CInt - ) derives Struct - - // Information associated with an individual cell in the KV cache view. - final case class KvCacheViewCell(pos: Pos) derives Struct + final case class LogitBias(token: Token, bias: CFloat) derives Struct - final case class KvCacheView( - n_cells: CInt, - n_seq_max: CInt, - token_count: CInt, - used_cells: CInt, - max_contiguous: CInt, - max_contiguous_idx: CInt, - cells: Ptr[KvCacheViewCell], - cells_sequences: Ptr[SeqId] - ) derives Struct + final case class SamplerChainParams(no_perf: CBool) derives Struct - final case class BeamView( - tokens: Ptr[Token], - n_tokens: SizeT, - p: CFloat, - eob: CBool - ) derives Struct - - final case class BeamsState( - beam_views: Ptr[BeamView], - n_beams: SizeT, - common_prefix_length: SizeT, - last_call: CBool - ) + final case class ChatMessage(role: Ptr[CChar], content: Ptr[CChar]) + derives Struct trait Llama derives FSet: import Llama.* def llama_model_default_params(): ModelParams def llama_context_default_params(): ContextParams + def llama_sampler_chain_default_params(): SamplerChainParams def llama_model_quantize_default_params(): ModelQuantizeParams + // Initialize the llama + ggml backend + // If numa is true, use NUMA optimizations + // Call once at the start of the program def llama_backend_init(): Unit - def llama_numa_init(strategy: NumaStrategy): Unit - + // Call once at the end of the program - currently only used for MPI def llama_backend_free(): Unit - def llama_load_model_from_file( + def llama_numa_init(strategy: NumaStrategy): Unit + + // Load the model from a file + // If the file is split into multiple parts, the file name must follow this pattern: -%05d-of-%05d.gguf + // If the split file name does not follow this pattern, use llama_model_load_from_splits + def llama_model_load_from_file( path_model: Ptr[CChar], params: ModelParams ): Model - def llama_free_model(model: Model): Unit + // Load the model from multiple splits (support custom naming scheme) + // The paths must be in the correct order + def llama_model_load_from_splits( + paths: Ptr[Ptr[CChar]], + n_paths: SizeT, + params: ModelParams + ): Model - def llama_new_context_with_model(model: Model, params: ContextParams): Ctx + def llama_model_free(model: Model): Unit + def llama_init_from_model(model: Model, params: ContextParams): Ctx + + // Frees all allocated memory def llama_free(ctx: Ctx): Unit def llama_time_us(): CInt - def llama_max_devices(): CInt + def llama_max_devices(): SizeT def llama_supports_mmap(): CBool def llama_supports_mlock(): CBool def llama_supports_gpu_offload(): CBool - - def llama_get_model(ctx: Ctx): Model + def llama_supports_rpc(): CBool def llama_n_ctx(ctx: Ctx): CInt def llama_n_batch(ctx: Ctx): CInt def llama_n_ubatch(ctx: Ctx): CInt def llama_n_seq_max(ctx: Ctx): CInt + def llama_get_model(ctx: Ctx): Model def 
llama_pooling_type(ctx: Ctx): PoolingType - def llama_vocab_type(model: Model): VocabType + def llama_model_get_vocab(model: Model): Vocab + def llama_model_rope_type(model: Model): RopeType - def llama_n_vocab(model: Model): CInt - def llama_n_ctx_train(model: Model): CInt - def llama_n_embd(model: Model): CInt - def llama_n_layer(model: Model): CInt + def llama_model_n_ctx_train(model: Model): CInt + def llama_model_n_embd(model: Model): CInt + def llama_model_n_layer(model: Model): CInt + def llama_model_n_head(model: Model): CInt // Get the model's RoPE frequency scaling factor - def llama_rope_freq_scale_train(model: Model): CFloat + def llama_model_rope_freq_scale_train(model: Model): CFloat + + def llama_vocab_type(vocab: Vocab): VocabType + + def llama_vocab_n_tokens(vocab: Vocab): CInt // Get metadata value as a string by key name def llama_model_meta_val_str( @@ -392,26 +483,61 @@ trait Llama derives FSet: // Returns the total size of all the tensors in the model in bytes def llama_model_size(model: Model): CInt + // Get the default chat template. Returns nullptr if not available + // If name is NULL, returns the default chat template + def llama_model_chat_template(model: Model, name: Ptr[CChar]): Ptr[CChar] + // Returns the total number of parameters in the model def llama_model_n_params(model: Model): CInt + // Returns true if the model contains an encoder that requires llama_encode() call + def llama_model_has_encoder(model: Model): CBool + + // Returns true if the model contains a decoder that requires llama_decode() call + def llama_model_has_decoder(model: Model): CBool + + // For encoder-decoder models, this function returns id of the token that must be provided + // to the decoder to start generating output sequence. For other models, it returns -1. + def llama_model_decoder_start_token(model: Model): Token + + // Returns true if the model is recurrent (like Mamba, RWKV, etc.) + def llama_model_is_recurrent(model: Model): CBool + + // Returns 0 on success def llama_model_quantize( fname_inp: Ptr[CChar], fname_out: Ptr[CChar], params: Ptr[ModelQuantizeParams] ): CInt - // Apply a LoRA adapter to a loaded model - def llama_model_apply_lora_from_file( - model: Model, - path_lora: Ptr[CChar], - scale: CFloat, - path_base_model: Ptr[CChar], - n_threads: CInt - ): CInt + // Adapters - def llama_control_vector_apply( - lctx: Ctx, + // Load a LoRA adapter from file + def llama_adapter_lora_init(model: Model, path_lora: Ptr[CChar]): LoraAdapter + + // Manually free a LoRA adapter + // Note: loaded adapters will be free when the associated model is deleted + def llama_adapter_lora_free(adapter: LoraAdapter): Unit + + // Add a loaded LoRA adapter to given context + // This will not modify model's weight + def llama_set_adapter_lora(ctx: Ctx, adapter: LoraAdapter, scale: Float): CInt + + // Remove a specific LoRA adapter from given context + // Return -1 if the adapter is not present in the context + def llama_rm_adapter_lora(ctx: Ctx, adapter: LoraAdapter): CInt + + // Remove all LoRA adapters from given context + def llama_clear_adapter_lora(ctx: Ctx): Unit + + // Apply a loaded control vector to a llama_context, or if data is NULL, clear + // the currently loaded vector. + // n_embd should be the size of a single layer's control, and data should point + // to an n_embd x n_layers buffer starting from layer 1. + // il_start and il_end are the layer range the vector should apply to (both inclusive) + // See llama_control_vector_load in common to load a control vector. 
+ def llama_apply_adapter_cvec( + ctx: Ctx, data: Ptr[Float], len: SizeT, n_embd: CInt, @@ -419,16 +545,7 @@ trait Llama derives FSet: il_end: CInt ): CInt - // KV - - // Create an empty KV cache view. (use only for debugging purposes) - def llama_kv_cache_view_init(ctx: Ctx, n_seq_max: CInt): KvCacheView - - // Free a KV cache view. (use only for debugging purposes) - def llama_kv_cache_view_free(view: Ptr[KvCacheView]): Unit - - // Update the KV cache view structure with the current state of the KV cache. (use only for debugging purposes) - def llama_kv_cache_view_update(ctx: Ctx, view: Ptr[KvCacheView]): Unit + // KV cache // Returns the number of tokens in the KV cache (slow, use only for debug) // If a KV cell has multiple sequences assigned to it, it will be counted multiple times @@ -437,10 +554,11 @@ trait Llama derives FSet: // Returns the number of used KV cells (i.e. have at least one sequence assigned to them) def llama_get_kv_cache_used_cells(ctx: Ctx): CInt - // Clear the KV cache + // Clear the KV cache - both cell info is erased and KV data is zeroed def llama_kv_cache_clear(ctx: Ctx): Unit // Removes all tokens that belong to the specified sequence and have positions in [p0, p1) + // Returns false if a partial sequence cannot be removed. Removing a whole sequence never fails // seq_id < 0 : match any sequence // p0 < 0 : [0, p1] // p1 < 0 : [p0, inf) @@ -462,7 +580,9 @@ trait Llama derives FSet: def llama_kv_cache_seq_keep(ctx: Ctx, seq_id: SeqId): Unit // Adds relative position "delta" to all tokens that belong to the specified sequence and have positions in [p0, p1) - // If the KV cache is RoPEd, the KV data is updated accordingly + // If the KV cache is RoPEd, the KV data is updated accordingly: + // - lazily on next llama_decode() + // - explicitly with llama_kv_cache_update() // p0 < 0 : [0, p1] // p1 < 0 : [p0, inf) def llama_kv_cache_seq_add( @@ -474,7 +594,9 @@ trait Llama derives FSet: ): Unit // Integer division of the positions by factor of `d > 1` - // If the KV cache is RoPEd, the KV data is updated accordingly + // If the KV cache is RoPEd, the KV data is updated accordingly: + // - lazily on next llama_decode() + // - explicitly with llama_kv_cache_update() // p0 < 0 : [0, p1] // p1 < 0 : [p0, inf) def llama_kv_cache_seq_div( @@ -497,17 +619,17 @@ trait Llama derives FSet: // Apply the KV cache updates (such as K-shifts, defragmentation, etc.) def llama_kv_cache_update(ctx: Ctx): Unit + // Check if the context supports KV cache shifting + def llama_kv_cache_can_shift(ctx: Ctx): CBool + // Decoding - // Return batch for single sequence of tokens starting at pos_0 + // Return batch for single sequence of tokens + // The sequence ID will be fixed to 0 + // The position of the tokens will be tracked automatically by llama_decode // // NOTE: this is a helper function to facilitate transition to the new batch API - avoid using it - def llama_batch_get_one( - tokens: Ptr[Token], - n_tokens: CInt, - pos_0: Pos, - seq_id: SeqId - ): Batch + def llama_batch_get_one(tokens: Ptr[Token], n_tokens: CInt): Batch // Allocates a batch of tokens on the heap that can hold a maximum of n_tokens // Each token can be assigned up to n_seq_max sequence ids @@ -521,10 +643,16 @@ trait Llama derives FSet: // Frees a batch of tokens allocated with llama_batch_init() def llama_batch_free(batch: Batch): Unit + // Processes a batch of tokens with the ecoder part of the encoder-decoder model. + // Stores the encoder output internally for later use by the decoder cross-attention layers. 
+ // 0 - success + // < 0 - error. the KV cache state is restored to the state before this call + def llama_encode(ctx: Ctx, batch: Batch): CInt + // Positive return values does not mean a fatal error, but rather a warning. // 0 - success // 1 - could not find a KV slot for the batch (try reducing the size of the batch or increase the context) - // < 0 - error + // < 0 - error. the KV cache state is restored to the state before this call def llama_decode(ctx: Ctx, batch: Batch): CInt // Set the number of threads used for decoding @@ -536,6 +664,16 @@ trait Llama derives FSet: n_threads_batch: CInt ): Unit + // Get the number of threads used for generation of a single token. + def llama_n_threads(ctx: Ctx): CInt + + // Get the number of threads used for prompt and batch processing (multiple token). + def llama_n_threads_batch(ctx: Ctx): CInt + + // Set whether the model is in embeddings mode or not + // If true, embeddings will be returned but logits will not + def llama_set_embeddings(ctx: Ctx, embeddings: CBool): Unit + // Set whether to use causal attention or not // If set to true, the model will only attend to the past tokens def llama_set_causal_attn(ctx: Ctx, causal_attn: CBool): Unit @@ -582,36 +720,41 @@ trait Llama derives FSet: // Get the embeddings for a sequence id // Returns NULL if pooling_type is LLAMA_POOLING_TYPE_NONE - // shape: [n_embd] (1-dimensional) + // when pooling_type == LLAMA_POOLING_TYPE_RANK, returns float[1] with the rank of the sequence + // otherwise: float[n_embd] (1-dimensional) def llama_get_embeddings_seq(ctx: Ctx, seq_id: SeqId): Ptr[Float] // Vocab - def llama_token_get_text(model: Model, token: Token): Ptr[CChar] + def llama_vocab_get_text(vocab: Vocab, token: Token): Ptr[CChar] - def llama_token_get_score(model: Model, token: Token): CFloat + def llama_vocab_get_score(vocab: Vocab, token: Token): CFloat - def llama_token_get_type(model: Model, token: Token): TokenType + def llama_vocab_get_attr(vocab: Vocab, token: Token): TokenAttr // Check if the token is supposed to end generation (end-of-generation, eg. EOS, EOT, etc.) - def llama_token_is_eog(model: Model, token: Token): CBool + def llama_vocab_is_eog(vocab: Vocab, token: Token): CBool - def llama_token_bos(model: Model): Token - def llama_token_eos(model: Model): Token - def llama_token_cls(model: Model): Token - def llama_token_sep(model: Model): Token - def llama_token_nl(model: Model): Token + // Identify if Token Id is a control token or a render-able token + def llama_vocab_is_control(vocab: Vocab, token: Token): CBool - // Returns -1 if unknown, 1 for true or 0 for false. - def llama_add_bos_token(model: Model): CInt + // Special tokens + def llama_vocab_bos(vocab: Vocab): Token + def llama_vocab_eos(vocab: Vocab): Token + def llama_vocab_eot(vocab: Vocab): Token + def llama_vocab_sep(vocab: Vocab): Token + def llama_vocab_nl(vocab: Vocab): Token + def llama_vocab_pad(vocab: Vocab): Token - // Returns -1 if unknown, 1 for true or 0 for false. 
- def llama_add_eos_token(model: Model): CInt + def llama_vocab_get_add_bos(vocab: Vocab): CBool + def llama_vocab_get_add_eos(vocab: Vocab): CBool - def llama_token_prefix(model: Model): Token - def llama_token_middle(model: Model): Token - def llama_token_suffix(model: Model): Token - def llama_token_eot(model: Model): Token + def llama_vocab_fim_pre(vocab: Vocab): Token + def llama_vocab_fim_suf(vocab: Vocab): Token + def llama_vocab_fim_mid(vocab: Vocab): Token + def llama_vocab_fim_pad(vocab: Vocab): Token + def llama_vocab_fim_rep(vocab: Vocab): Token + def llama_vocab_fim_sep(vocab: Vocab): Token // Tokenization @@ -619,10 +762,11 @@ trait Llama derives FSet: /// @param tokens The tokens pointer must be large enough to hold the resulting tokens. /// @return Returns the number of tokens on success, no more than n_tokens_max /// @return Returns a negative number on failure - the number of tokens that would have been returned + /// @param add_special Allow to add BOS and EOS tokens if model is configured to do so. /// @param parse_special Allow tokenizing special and/or control tokens which otherwise are not exposed and treated /// as plaintext. Does not insert a leading space. def llama_tokenize( - model: Model, + vocab: Vocab, text: Ptr[CChar], text_len: CInt, tokens: Ptr[Token], @@ -634,112 +778,114 @@ trait Llama derives FSet: // Token Id -> Piece. // Uses the vocabulary in the provided context. // Does not write null terminator to the buffer. - // User code is responsible to remove the leading whitespace of the first non-BOS token when decoding multiple tokens. + // User can skip up to 'lstrip' leading spaces before copying (useful when encoding/decoding multiple tokens with 'add_space_prefix') // @param special If true, special tokens are rendered in the output. def llama_token_to_piece( - model: Model, + vocab: Vocab, token: Token, buf: Ptr[CChar], length: CInt, + lstrip: CInt, special: CBool ): CInt - // Grammar + /// @details Convert the provided tokens into text (inverse of llama_tokenize()). + /// @param text The char pointer must be large enough to hold the resulting text. + /// @return Returns the number of chars/bytes on success, no more than text_len_max. + /// @return Returns a negative number on failure - the number of chars/bytes that would have been returned. + /// @param remove_special Allow to remove BOS and EOS tokens if model is configured to do so. + /// @param unparse_special If true, special tokens are rendered in the output. + def llama_detokenize( + vocab: Vocab, + tokens: Ptr[Token], + n_tokens: CInt, + text: Ptr[CChar], + text_len_max: CInt, + remove_special: CBool, + unparse_special: CBool + ): CInt - def llama_grammar_init( - rules: Ptr[Ptr[GrammarElement]], - n_rules: SizeT, - start_rule_index: SizeT - ): Grammar + // Chat templates + + /// Apply chat template. Inspired by hf apply_chat_template() on python. + /// Both "model" and "custom_template" are optional, but at least one is required. "custom_template" has higher precedence than "model" + /// NOTE: This function does not use a jinja parser. It only support a pre-defined list of template. See more: https://github.com/ggerganov/llama.cpp/wiki/Templates-supported-by-llama_chat_apply_template + /// @param tmpl A Jinja template to use for this chat. If this is nullptr, the model’s default chat template will be used instead. 
+ /// @param chat Pointer to a list of multiple llama_chat_message + /// @param n_msg Number of llama_chat_message in this chat + /// @param add_ass Whether to end the prompt with the token(s) that indicate the start of an assistant message. + /// @param buf A buffer to hold the output formatted prompt. The recommended alloc size is 2 * (total number of characters of all messages) + /// @param length The size of the allocated buffer + /// @return The total number of bytes of the formatted prompt. If is it larger than the size of buffer, you may need to re-alloc it and then re-apply the template. + def llama_chat_apply_template( + tmpl: Ptr[CChar], + chat: Ptr[ChatMessage], + n_msg: SizeT, + add_ass: CBool, + buf: Ptr[CChar], + length: CInt + ): CInt - def llama_grammar_free(grammar: Grammar): Unit + // Get list of built-in chat templates + def llama_chat_builtin_templates(output: Ptr[Ptr[CChar]], len: SizeT): CInt - def llama_grammar_copy(grammar: Grammar): Grammar + // Sampling API - // Sampling functions + def llama_sampler_name(sampler: Sampler): Ptr[CChar] + def llama_sampler_accept(sampler: Sampler, token: Token): Unit + def llama_sampler_apply( + sampler: Sampler, + candidates: Ptr[TokenDataArray] + ): Unit + def llama_sampler_reset(sampler: Sampler): Unit + def llama_sampler_clone(sampler: Sampler): Sampler + // important: do not free if the sampler has been added to a llama_sampler_chain (via llama_sampler_chain_add) + def llama_sampler_free(sampler: Sampler): Unit - // Sets the current rng seed. - def llama_set_rng_seed(ctx: Ctx, seed: CInt): Unit + def llama_sampler_chain_init(params: SamplerChainParams): Sampler - /// @details Repetition penalty described in CTRL academic paper https://arxiv.org/abs/1909.05858, with negative logit fix. - /// @details Frequency and presence penalties described in OpenAI API https://platform.openai.com/docs/api-reference/parameter-details. - def llama_sample_repetition_penalties( - ctx: Ctx, - candidates: Ptr[TokenDataArray], - last_tokens: Ptr[Token], - penalty_last_n: SizeT, - penalty_repeat: CFloat, - penalty_freq: CFloat, - penalty_present: CFloat - ): Unit + // important: takes ownership of the sampler object and will free it when llama_sampler_free is called + def llama_sampler_chain_add(chain: Sampler, smpl: Sampler): Unit + def llama_sampler_chain_get(chain: Sampler, i: CInt): Sampler + def llama_sampler_chain_n(chain: Sampler): CInt - /// @details Sorts candidate tokens by their logits in descending order and calculate probabilities based on logits. 
- def llama_sample_softmax( - ctx: Ctx, - candidates: Ptr[TokenDataArray] - ): Unit + // after removing a sampler, the chain will no longer own it, and it will not be freed when the chain is freed + def llama_sampler_chain_remove(chain: Sampler, i: CInt): Sampler + + // Available samplers: + + def llama_sampler_init_greedy(): Sampler + def llama_sampler_init_dist(seed: CInt): Sampler /// @details Top-K sampling described in academic paper "The Curious Case of Neural Text Degeneration" https://arxiv.org/abs/1904.09751 - def llama_sample_top_k( - ctx: Ctx, - candidates: Ptr[TokenDataArray], - k: CInt, - min_keep: SizeT - ): Unit + def llama_sampler_init_top_k(k: CInt): Sampler /// @details Nucleus sampling described in academic paper "The Curious Case of Neural Text Degeneration" https://arxiv.org/abs/1904.09751 - def llama_sample_top_p( - ctx: Ctx, - candidates: Ptr[TokenDataArray], - p: CFloat, - min_keep: SizeT - ): Unit + def llama_sampler_init_top_p(p: CFloat, min_keep: SizeT): Sampler /// @details Minimum P sampling as described in https://github.com/ggerganov/llama.cpp/pull/3841 - def llama_sample_min_p( - ctx: Ctx, - candidates: Ptr[TokenDataArray], - p: CFloat, - min_keep: SizeT - ): Unit - - /// @details Tail Free Sampling described in https://www.trentonbricken.com/Tail-Free-Sampling/. - def llama_sample_tail_free( - ctx: Ctx, - candidates: Ptr[TokenDataArray], - z: CFloat, - min_keep: SizeT - ): Unit + def llama_sampler_init_min_p(p: CFloat, min_keep: SizeT): Sampler /// @details Locally Typical Sampling implementation described in the paper https://arxiv.org/abs/2202.00666. - def llama_sample_typical( - ctx: Ctx, - candidates: Ptr[TokenDataArray], - p: CFloat, - min_keep: SizeT - ): Unit + def llama_sampler_init_typical(p: CFloat, min_keep: SizeT): Sampler - /// @details Dynamic temperature implementation described in the paper https://arxiv.org/abs/2309.02772. - def llama_sample_entropy( - ctx: Ctx, - candidates_p: Ptr[TokenDataArray], - min_temp: Float, - max_temp: Float, - exponent_val: Float - ): Unit + /// #details Updates the logits l_i` = l_i/t. When t <= 0.0f, the maximum logit is kept at it's original value, the rest are set to -inf + def llama_sampler_init_temp(t: CFloat): Sampler - def llama_sample_temp( - ctx: Ctx, - candidates: Ptr[TokenDataArray], - temp: CFloat - ): Unit + /// @details Dynamic temperature implementation (a.k.a. entropy) described in the paper https://arxiv.org/abs/2309.02772. + def llama_sampler_init_temp_ext( + t: CFloat, + delta: CFloat, + exponent: CFloat + ): Sampler - /// @details Apply constraints from grammar - def llama_sample_grammar( - ctx: Ctx, - candidates: Ptr[TokenDataArray], - grammar: Grammar - ): Unit + /// @details XTC sampler as described in https://github.com/oobabooga/text-generation-webui/pull/6335 + def llama_sampler_init_xtc( + p: CFloat, + t: CFloat, + min_keep: SizeT, + seed: CInt + ): Sampler /// @details Mirostat 1.0 algorithm described in the paper https://arxiv.org/abs/2007.14966. Uses tokens instead of words. /// @param candidates A vector of `llama_token_data` containing the candidate tokens, their probabilities (p), and log-odds (logit) for the current position in the generated text. @@ -747,61 +893,130 @@ trait Llama derives FSet: /// @param eta The learning rate used to update `mu` based on the error between the target and observed surprisal of the sampled word. A larger learning rate will cause `mu` to be updated more quickly, while a smaller learning rate will result in slower updates. 
/// @param m The number of tokens considered in the estimation of `s_hat`. This is an arbitrary value that is used to calculate `s_hat`, which in turn helps to calculate the value of `k`. In the paper, they use `m = 100`, but you can experiment with different values to see how it affects the performance of the algorithm. /// @param mu Maximum cross-entropy. This value is initialized to be twice the target cross-entropy (`2 * tau`) and is updated in the algorithm based on the error between the target and observed surprisal. - def llama_sample_token_mirostat( - ctx: Ctx, - candidates: Ptr[TokenDataArray], + def llama_sampler_init_mirostat( + n_vocab: CInt, + seed: CInt, tau: CFloat, eta: CFloat, - m: CInt, - mu: Ptr[CFloat] - ): Token + m: CInt + ): Sampler /// @details Mirostat 2.0 algorithm described in the paper https://arxiv.org/abs/2007.14966. Uses tokens instead of words. /// @param candidates A vector of `llama_token_data` containing the candidate tokens, their probabilities (p), and log-odds (logit) for the current position in the generated text. /// @param tau The target cross-entropy (or surprise) value you want to achieve for the generated text. A higher value corresponds to more surprising or less predictable text, while a lower value corresponds to less surprising or more predictable text. /// @param eta The learning rate used to update `mu` based on the error between the target and observed surprisal of the sampled word. A larger learning rate will cause `mu` to be updated more quickly, while a smaller learning rate will result in slower updates. /// @param mu Maximum cross-entropy. This value is initialized to be twice the target cross-entropy (`2 * tau`) and is updated in the algorithm based on the error between the target and observed surprisal. - def llama_sample_token_mirostat_v2( - ctx: Ctx, - candidates: Ptr[TokenDataArray], + def llama_sampler_init_mirostat_v2( + seed: CInt, tau: CFloat, - eta: CFloat, - mu: Ptr[CFloat] - ): Token - - /// @details Selects the token with the highest probability. - /// Does not compute the token probabilities. Use llama_sample_softmax() instead. - def llama_sample_token_greedy( - ctx: Ctx, - candidates: Ptr[TokenDataArray] - ): Token - - /// @details Randomly selects a token from the candidates based on their probabilities. - def llama_sample_token(ctx: Ctx, candidates: Ptr[TokenDataArray]): Token - - /// @details Accepts the sampled token into the grammar - def llama_grammar_accept_token( - ctx: Ctx, - grammar: Grammar, - token: Token - ): Unit - - // Beam search - - def llama_beam_search( - ctx: Ctx, - callback: Ptr[(Ptr[Any], BeamsState) => Unit], - callback_data: Ptr[Any], - n_beams: SizeT, - n_past: CInt, - n_predict: CInt - ): Unit + eta: CFloat + ): Sampler + + def llama_sampler_init_grammar( + vocab: Vocab, + grammar_str: Ptr[CChar], + grammar_root: Ptr[CChar] + ): Sampler + + /// @details Lazy grammar sampler, introduced in https://github.com/ggerganov/llama.cpp/pull/9639 + /// @param trigger_words A list of words that will trigger the grammar sampler. This may be updated to a loose regex syntax (w/ ^) in a near future. + /// @param trigger_tokens A list of tokens that will trigger the grammar sampler. 
+ def llama_sampler_init_grammar_lazy( + vocab: Vocab, + grammar_str: Ptr[CChar], + grammar_root: Ptr[CChar], + trigger_words: Ptr[Ptr[CChar]], + num_trigger_words: SizeT, + trigger_tokens: Ptr[Token], + num_trigger_tokens: SizeT + ): Sampler + + /// NOTE: Avoid using on the full vocabulary as searching for repeated tokens can become slow. For example, apply top-k or top-p sampling first. + def llama_sampler_init_penalties( + penalty_last_n: CInt, + penalty_repeat: CFloat, + penalty_freq: CFloat, + penalty_present: CFloat + ): Sampler + + /// @details DRY sampler, designed by p-e-w, as described in: https://github.com/oobabooga/text-generation-webui/pull/5677, porting Koboldcpp implementation authored by pi6am: https://github.com/LostRuins/koboldcpp/pull/982 + def llama_sampler_init_dry( + vocab: Vocab, + n_ctx_train: CInt, + dry_multiplier: CFloat, + dry_base: CFloat, + dry_allowed_length: CInt, + dry_penalty_last_n: CInt, + seq_breakers: Ptr[Ptr[CChar]], + num_breakers: SizeT + ): Sampler + + def llama_sampler_init_logit_bias( + n_vocab: CInt, + n_logit_bias: CInt, + logit_bias: Ptr[LogitBias] + ): Sampler + + // this sampler is meant to be used for fill-in-the-middle infilling + // it's supposed to be used after top_k + top_p sampling + // + // 1. if the sum of the EOG probs times the number of candidates is higher than the sum of the other probs -> pick EOG + // 2. combine probs of tokens that have the same prefix + // + // example: + // + // - before: + // "hel": 0.5 + // "hell": 0.2 + // "hello": 0.1 + // "dummy": 0.1 + // + // - after: + // "hel": 0.8 + // "dummy": 0.1 + // + // 3. discard non-EOG tokens with low prob + // 4. if no tokens are left -> pick EOT + def llama_sampler_init_infill(vocab: Vocab): Sampler - // Performance and system information + // Returns the seed used by the sampler if applicable, LLAMA_DEFAULT_SEED otherwise + def llama_sampler_get_seed(smpl: Sampler): CInt - def llama_get_timings(ctx: Ctx): Timings + /// @details Sample and accept a token from the idx-th output of the last evaluation + // + // Shorthand for: + // const auto * logits = llama_get_logits_ith(ctx, idx); + // llama_token_data_array cur_p = { ... init from logits ... }; + // llama_sampler_apply(smpl, &cur_p); + // auto token = cur_p.data[cur_p.selected].id; + // llama_sampler_accept(smpl, token); + // return token; + // Returns the sampled token + def llama_sampler_sample(smpl: Sampler, ctx: Ctx, idx: CInt): Token + + // Model split + + /// @details Build a split GGUF final path for this chunk. + /// llama_split_path(split_path, sizeof(split_path), "/models/ggml-model-q4_0", 2, 4) => split_path = "/models/ggml-model-q4_0-00002-of-00004.gguf" + // Returns the split_path length. + def llama_split_path( + split_path: Ptr[CChar], + maxlen: SizeT, + path_prefix: Ptr[CChar], + split_no: CInt, + split_count: CInt + ): CInt - def llama_print_timings(ctx: Ctx): Unit - def llama_reset_timings(ctx: Ctx): Unit + /// @details Extract the path prefix from the split_path if and only if the split_no and split_count match. + /// llama_split_prefix(split_prefix, 64, "/models/ggml-model-q4_0-00002-of-00004.gguf", 2, 4) => split_prefix = "/models/ggml-model-q4_0" + // Returns the split_prefix length. 
+ def llama_split_prefix( + split_prefix: Ptr[CChar], + maxlen: SizeT, + split_path: Ptr[CChar], + split_no: CInt, + split_count: CInt + ): CInt + // Print system information def llama_print_system_info(): Ptr[CChar] diff --git a/src/main/scala/com/donderom/llm4s/Llm.scala b/src/main/scala/com/donderom/llm4s/Llm.scala index e73db9e..324a0e7 100644 --- a/src/main/scala/com/donderom/llm4s/Llm.scala +++ b/src/main/scala/com/donderom/llm4s/Llm.scala @@ -5,6 +5,7 @@ import java.nio.file.Path import scala.util.Try import fr.hammons.slinc.runtime.given +import fr.hammons.slinc.types.SizeT import fr.hammons.slinc.{FSet, Ptr, Scope, Slinc} final case class Logprob(token: String, value: Double) @@ -34,11 +35,16 @@ object Llm: val llm = createModel(model, params) def generate(prompt: String, params: LlmParams): Try[Usage] = - for ctx <- createContext(llm, params.context, false) + for + llm <- llm + ctx <- createContext(llm, params.context, false) + _ <- loadLora(llm, ctx, params.lora) yield SlincLlm(ctx).generate(prompt, params) def embeddings(prompt: String, params: ContextParams): Try[Array[Float]] = - for ctx <- createContext(llm, params, true) + for + llm <- llm + ctx <- createContext(llm, params, true) yield SlincLlm(ctx).embeddings(prompt, params.batch) def close(): Unit = @@ -46,20 +52,18 @@ object Llm: llama <- binding llm <- llm do - llama.llama_free_model(llm) + llama.llama_model_free(llm) llama.llama_backend_free() private def createModel( model: Path, params: ModelParams ): Try[Llama.Model] = - binding.foreach: llama => + binding.map: llama => llama.llama_backend_init() llama.llama_numa_init(params.numa) - - Scope.global: - val baseModel = binding.map: llama => - llama.llama_load_model_from_file( + Scope.confined: + llama.llama_model_load_from_file( path_model = Ptr.copy(model.toAbsolutePath.toString), params = llama.llama_model_default_params().copy( n_gpu_layers = params.gpuLayers, @@ -69,31 +73,14 @@ object Llm: ) ) - params.lora.adapter.fold(baseModel): loraAdapter => - val err = - for - llama <- binding - llm <- baseModel - loraBase = params.lora.base.fold(Slinc.getRuntime().Null): - base => Ptr.copy(base.toAbsolutePath.toString) - yield llama.llama_model_apply_lora_from_file( - model = llm, - path_lora = Ptr.copy(loraAdapter.toAbsolutePath.toString), - scale = params.lora.scale, - path_base_model = loraBase, - n_threads = params.lora.threads - ) - err.filter(_ == 0).flatMap(_ => baseModel) - private def createContext( - llm: Try[Llama.Model], + llm: Llama.Model, contextParams: ContextParams, embedding: Boolean ): Try[Llama.Ctx] = for llama <- binding - llm <- llm - ctx = llama.llama_new_context_with_model( + ctx = llama.llama_init_from_model( model = llm, params = llamaParams( llama.llama_context_default_params(), @@ -103,15 +90,34 @@ object Llm: ) if ctx != Slinc.getRuntime().Null yield ctx + private def loadLora( + llm: Llama.Model, + ctx: Llama.Ctx, + loraParams: Option[LoraParams] + ): Try[Unit] = + loraParams.fold(Try(())): params => + Scope.confined: + for + llama <- binding + adapter <- Try( + llama.llama_adapter_lora_init( + llm, + Ptr.copy(params.path.toAbsolutePath.toString) + ) + ) + if adapter != Slinc.getRuntime().Null + _ <- Try(llama.llama_set_adapter_lora(ctx, adapter, params.scale)) + yield () + private def llamaParams( defaultParams: Llama.ContextParams, params: ContextParams, embedding: Boolean ): Llama.ContextParams = defaultParams.copy( - seed = params.seed, n_ctx = params.size, - n_batch = params.batch.size, + n_batch = params.batch.logical, + n_ubatch = 
params.batch.physical, n_threads = params.threads, n_threads_batch = params.batch.threads, rope_scaling_type = params.rope.scalingType, diff --git a/src/main/scala/com/donderom/llm4s/Params.scala b/src/main/scala/com/donderom/llm4s/Params.scala index 83be14d..4f26010 100644 --- a/src/main/scala/com/donderom/llm4s/Params.scala +++ b/src/main/scala/com/donderom/llm4s/Params.scala @@ -7,9 +7,8 @@ import Llama.{NumaStrategy, RopeScalingType} object Default: val threads = Runtime.getRuntime.availableProcessors - val penalty: Penalty = Penalty() - val repeatLastTokens: Int = 64 val logprobs: Int = 0 + val seed: Int = 0xfffffff val temp: Float = .8f object Mirostat: val tau: Float = 5.0f @@ -17,19 +16,16 @@ object Default: val muCoef: Float = 2.0f final case class LoraParams( - adapter: Option[Path] = None, - base: Option[Path] = None, - scale: Float = 1.0f, - threads: Int = Default.threads + path: Path, + scale: Float = 1.0f ) final case class ModelParams( - gpuLayers: Int = 0, + gpuLayers: Int = -1, mainGpu: Int = 0, mmap: Boolean = true, mlock: Boolean = false, - numa: NumaStrategy = NumaStrategy.DISABLED, - lora: LoraParams = LoraParams() + numa: NumaStrategy = NumaStrategy.DISABLED ) final case class RopeParams( @@ -46,13 +42,16 @@ final case class YarnParams( origCtx: Int = 0 ) -final case class BatchParams(size: Int = 512, threads: Int = Default.threads) +final case class BatchParams( + logical: Int = 2048, + physical: Int = 512, + threads: Int = Default.threads +) final case class GroupAttention(factor: Int = 1, width: Int = 512) final case class ContextParams( - seed: Int = -1, - size: Int = 512, + size: Int = 4096, threads: Int = Default.threads, batch: BatchParams = BatchParams(), rope: RopeParams = RopeParams(), @@ -60,10 +59,23 @@ final case class ContextParams( ) final case class Penalty( - repeat: Float = 1.10f, + lastN: Int = 64, + repeat: Float = 1.0f, frequency: Float = .0f, - presence: Float = .0f, - penalizeNewLines: Boolean = true + presence: Float = .0f +) + +final case class Dry( + multiplier: Float = .0f, + base: Float = 1.75f, + allowedLength: Int = 2, + penaltyLastN: Int = -1, + seqBreakers: Seq[Char] = Seq[Char]('\n', ':', '"', '*') +) + +final case class Xtc( + probability: Float = .0f, + threshold: Float = 0.10f ) final case class Dynatemp( @@ -71,63 +83,50 @@ final case class Dynatemp( exponent: Float = 1.0f ) -enum Sampler: - case TOP_K, TAIL_FREE, TYPICAL, TOP_P, MIN_P, TEMPERATURE - -enum Sampling( - val penalty: Penalty, - val repeatLastTokens: Int, - val logprobs: Int -): - case Greedy( - override val penalty: Penalty = Default.penalty, - override val repeatLastTokens: Int = Default.repeatLastTokens, - override val logprobs: Int = Default.logprobs - ) extends Sampling(penalty, repeatLastTokens, logprobs) - - case MirostatV1( - override val penalty: Penalty = Default.penalty, - override val repeatLastTokens: Int = Default.repeatLastTokens, - override val logprobs: Int = Default.logprobs, +enum SamplerType: + case PENALTIES, DRY, TOP_K, TYPICAL_P, TOP_P, MIN_P, XTC, TEMPERATURE + +enum Sampling: + case Dist( + greedy: Boolean = false, + samplers: List[SamplerType] = SamplerType.values.toList, + seed: Int = Default.seed, + logitBias: Map[Int, Float] = Map(), + penalty: Penalty = Penalty(), + dry: Dry = Dry(), + minKeep: Short = 0, + topK: Int = 40, + typicalP: Float = 1.0f, + topP: Float = 0.95f, + minP: Float = 0.05f, + xtc: Xtc = Xtc(), temp: Float = Default.temp, - tau: Float = Default.Mirostat.tau, - eta: Float = Default.Mirostat.eta, - m: Int = 100, - muCoef: 
Float = Default.Mirostat.muCoef - ) extends Sampling(penalty, repeatLastTokens, logprobs) - - case MirostatV2( - override val penalty: Penalty = Default.penalty, - override val repeatLastTokens: Int = Default.repeatLastTokens, - override val logprobs: Int = Default.logprobs, + dynatemp: Dynatemp = Dynatemp() + ) + + case Mirostat1( + seed: Int = Default.seed, temp: Float = Default.temp, tau: Float = Default.Mirostat.tau, eta: Float = Default.Mirostat.eta, - muCoef: Float = Default.Mirostat.muCoef - ) extends Sampling(penalty, repeatLastTokens, logprobs) + m: Int = 100 + ) - case Random( - override val penalty: Penalty = Default.penalty, - override val repeatLastTokens: Int = Default.repeatLastTokens, - override val logprobs: Int = Default.logprobs, + case Mirostat2( + seed: Int = Default.seed, temp: Float = Default.temp, - topK: Option[Int] = Some(40), - tfsZ: Float = 1.0f, - typicalP: Float = 1.0f, - topP: Float = .95f, - minP: Float = .05f, - dynatemp: Dynatemp = Dynatemp(), - samplers: List[Sampler] = Sampler.values.toList - ) extends Sampling(penalty, repeatLastTokens, logprobs) + tau: Float = Default.Mirostat.tau, + eta: Float = Default.Mirostat.eta + ) final case class LlmParams( context: ContextParams = ContextParams(), - sampling: Sampling = Sampling.Random(), + sampling: Sampling = Sampling.Dist(), predictTokens: Int = -1, keepTokens: Int = 0, - logitBias: Map[Int, Float] = Map(), suffix: Option[String] = None, echo: Boolean = true, stopSeqs: List[String] = Nil, - groupAttention: GroupAttention = GroupAttention() + groupAttention: GroupAttention = GroupAttention(), + lora: Option[LoraParams] = None ) diff --git a/src/main/scala/com/donderom/llm4s/SlincLlm.scala b/src/main/scala/com/donderom/llm4s/SlincLlm.scala index fe66aa2..aab06f3 100644 --- a/src/main/scala/com/donderom/llm4s/SlincLlm.scala +++ b/src/main/scala/com/donderom/llm4s/SlincLlm.scala @@ -12,6 +12,7 @@ import fr.hammons.slinc.{FSet, Ptr, Scope} import State.* private class SlincLlm private[llm4s] (private[llm4s] val ctx: Llama.Ctx): + // Logprobs are None until a better solution is implemented final case class Sample(id: Int, prob: Option[Probability]) lazy val llama = FSet.instance[Llama] @@ -20,6 +21,7 @@ private class SlincLlm private[llm4s] (private[llm4s] val ctx: Llama.Ctx): lazy val decoder = StandardCharsets.UTF_8.newDecoder def generate(prompt: String, params: LlmParams): Usage = + val sampler = createSampler(params.sampling) val lastTokens = new ArrayDeque[Int](ctxSize) val stop = Stop.Acc[Token](params.stopSeqs) @@ -87,18 +89,13 @@ private class SlincLlm private[llm4s] (private[llm4s] val ctx: Llama.Ctx): evaluate(ids, past, params.context.batch) end eval - def repeatTokens(): Array[Int] = - val repeatLastTokens = - if params.sampling.repeatLastTokens < 0 then ctxSize - else params.sampling.repeatLastTokens - val lastRepeat = math.min(lastTokens.size, repeatLastTokens) - val padding = Array.fill(repeatLastTokens - lastRepeat)(0) - padding ++ lastTokens.takeRight(lastRepeat).toArray - def tokens(state: State[Token]): LazyList[Token] = if !state.remaining.none then val newPast = eval(state.evaluated) - val smpl = sample(repeatTokens(), params.sampling, params.logitBias) + + val tokenId = llama.llama_sampler_sample(sampler, ctx, -1) + llama.llama_sampler_accept(sampler, tokenId) + val smpl = Sample(tokenId, None) if lastTokens.size == ctxSize then lastTokens.remove(0) lastTokens.append(smpl.id) @@ -119,8 +116,8 @@ private class SlincLlm private[llm4s] (private[llm4s] val ctx: Llama.Ctx): LazyList.from(chunk) 
#::: gen(st) case stop.Action.Stop(chunk) => LazyList.from(params.suffix.fold(chunk)(chunk :+ _.token)) - else close(state.stop.deferred(params.suffix)) - else close(state.stop.deferred(params.suffix)) + else close(state.stop.deferred(params.suffix), sampler) + else close(state.stop.deferred(params.suffix), sampler) end tokens val ids = encode(prompt) @@ -143,21 +140,18 @@ private class SlincLlm private[llm4s] (private[llm4s] val ctx: Llama.Ctx): def embeddings(prompt: String, params: BatchParams): Array[Float] = val ids = encode(prompt) val _ = evaluate(ids, Evaluated.none, params) - val size = llama.llama_n_embd(model) + val size = llama.llama_model_n_embd(model) val embeddings = llama.llama_get_embeddings(ctx).asArray(size).unsafeArray llama.llama_free(ctx) embeddings lazy val ctxSize: Int = llama.llama_n_ctx(ctx) - lazy val vocabSize: Int = llama.llama_n_vocab(model) - lazy val addBosToken: Int = llama.llama_add_bos_token(model) - lazy val addBos: Boolean = - if addBosToken != -1 then addBosToken != 0 - else llama.llama_vocab_type(model) == Llama.VocabType.SPM - lazy val newLineToken: Int = llama.llama_token_nl(model) + lazy val vocab: Llama.Vocab = llama.llama_model_get_vocab(model) + lazy val vocabSize: Int = llama.llama_vocab_n_tokens(vocab) + lazy val addBos: Boolean = llama.llama_vocab_get_add_bos(vocab) def keepGenerating(token: Int): Boolean = - !llama.llama_token_is_eog(model, token) + !llama.llama_vocab_is_eog(vocab, token) def encode(text: String): Array[Int] = val bos = if addBos then 1 else 0 @@ -166,7 +160,7 @@ private class SlincLlm private[llm4s] (private[llm4s] val ctx: Llama.Ctx): Scope.confined: val tokens = Ptr.copy(res) val numTokens = llama.llama_tokenize( - model = model, + vocab = vocab, text = Ptr.copy(bytes), text_len = bytes.size, tokens = tokens, @@ -188,10 +182,11 @@ private class SlincLlm private[llm4s] (private[llm4s] val ctx: Llama.Ctx): Scope.confined: val tokens = Ptr.copy(res) val numTokens = llama.llama_token_to_piece( - model = model, + vocab = vocab, token = token, buf = tokens, length = res.size, + lstrip = 0, special = false ) if numTokens < 0 then decode(token, pending, math.abs(numTokens)) @@ -203,160 +198,118 @@ private class SlincLlm private[llm4s] (private[llm4s] val ctx: Llama.Ctx): def evaluate( ids: Array[Int], past: Evaluated, - params: BatchParams + batch: BatchParams ): Evaluated = if ids.isEmpty then past else - val batches = ids.grouped(params.size) + val batches = ids.grouped(batch.logical) Scope.confined: for (batch, n) <- batches.zipWithIndex do llama.llama_decode( ctx = ctx, batch = llama.llama_batch_get_one( tokens = Ptr.copy(batch), - n_tokens = batch.size, - pos_0 = (past + n * params.size).toInt, - seq_id = 0 + n_tokens = batch.size ) ) past + ids.size - def sample( - repeatTokens: Array[Int], - sampling: Sampling, - logitBias: Map[Int, Float], - idx: Int = 0 - ): Sample = - import Sampling.* + def createSampler(params: Sampling): Llama.Sampler = + val sparams = llama.llama_sampler_chain_default_params() + val chain = llama.llama_sampler_chain_init(sparams) + val add = llama.llama_sampler_chain_add(chain, _) + params match + case config: Sampling.Dist => + Scope.confined: + if !config.logitBias.isEmpty then + val logitBias = config.logitBias.map(Llama.LogitBias(_, _)) + add( + llama.llama_sampler_init_logit_bias( + vocabSize, + config.logitBias.size, + Ptr.copy(logitBias.toArray) + ) + ) - Scope.confined: - val logits = llama.llama_get_logits_ith(ctx, idx).asArray(vocabSize) - .unsafeArray - logitBias.foreach((token, bias) => 
logits(token) = bias) + for sampler <- config.samplers do + val minKeep = SizeT(config.minKeep) + sampler match + case SamplerType.DRY => + val seqBreakers = config.dry.seqBreakers.map(_.toByte) + add( + llama.llama_sampler_init_dry( + llama.llama_model_get_vocab(model), + llama.llama_model_n_ctx_train(model), + config.dry.multiplier, + config.dry.base, + config.dry.allowedLength, + config.dry.penaltyLastN, + Ptr.copy(Ptr.copy(seqBreakers.toArray)), + SizeT(seqBreakers.size.toShort) + ) + ) - val tokenData = Array.tabulate[Llama.TokenData](vocabSize): tokenId => - Llama.TokenData(id = tokenId, logit = logits(tokenId), p = .0) + case SamplerType.TOP_K => + add(llama.llama_sampler_init_top_k(config.topK)) - val data = Ptr.copy(tokenData) + case SamplerType.TOP_P => + add(llama.llama_sampler_init_top_p(config.topP, minKeep)) - val candidates = Ptr.copy( - Llama.TokenDataArray( - data = data, - size = SizeT(tokenData.size.toShort), - sorted = false - ) - ) + case SamplerType.MIN_P => + add(llama.llama_sampler_init_min_p(config.minP, minKeep)) - val repeatLastTokens = Ptr.copy(repeatTokens) - val repeatTokensSize = SizeT(repeatTokens.size.toShort) - llama.llama_sample_repetition_penalties( - ctx = ctx, - candidates = candidates, - last_tokens = repeatLastTokens, - penalty_last_n = repeatTokensSize, - penalty_repeat = sampling.penalty.repeat, - penalty_freq = sampling.penalty.frequency, - penalty_present = sampling.penalty.presence - ) + case SamplerType.XTC => + add( + llama.llama_sampler_init_xtc( + config.xtc.probability, + config.xtc.threshold, + minKeep, + config.seed + ) + ) - if !sampling.penalty.penalizeNewLines then - val newLineLogit = logits(newLineToken) - val newLineIndex = tokenData.indexWhere(_.id == newLineToken) - if newLineIndex != -1 then - !data(newLineIndex) = (!data(newLineIndex)).copy(logit = newLineLogit) - - val tokenId = sampling match - case Greedy(_, _, logprobs) => - if logprobs > 0 then - llama.llama_sample_softmax(ctx, candidates) - (!data).id - else llama.llama_sample_token_greedy(ctx, candidates) - - case MirostatV1(_, _, _, temp, tau, eta, m, muCoef) => - llama.llama_sample_temp(ctx, candidates, temp) - llama.llama_sample_token_mirostat( - ctx = ctx, - candidates = candidates, - tau = tau, - eta = eta, - m = m, - mu = Ptr.copy(muCoef * tau) - ) + case SamplerType.TYPICAL_P => + add( + llama.llama_sampler_init_typical( + config.typicalP, + minKeep + ) + ) - case MirostatV2(_, _, _, temp, tau, eta, muCoef) => - llama.llama_sample_temp(ctx, candidates, temp) - llama.llama_sample_token_mirostat_v2( - ctx = ctx, - candidates = candidates, - tau = tau, - eta = eta, - mu = Ptr.copy(muCoef * tau) - ) + case SamplerType.TEMPERATURE => + add( + llama.llama_sampler_init_temp_ext( + config.temp, + config.dynatemp.range, + config.dynatemp.exponent + ) + ) - case Random( - _, - _, - logprobs, - temp, - topK, - tfsZ, - typicalP, - topP, - minP, - dynatemp, - samplers - ) => - val topk = topK.filter(_ > 0).getOrElse(vocabSize) - val minKeep = SizeT(math.max(1, logprobs).toShort) - samplers.foreach: - case Sampler.TOP_K => - llama.llama_sample_top_k(ctx, candidates, topk, minKeep) - case Sampler.TAIL_FREE => - llama.llama_sample_tail_free(ctx, candidates, tfsZ, minKeep) - case Sampler.TYPICAL => - llama.llama_sample_typical(ctx, candidates, typicalP, minKeep) - case Sampler.TOP_P => - llama.llama_sample_top_p(ctx, candidates, topP, minKeep) - case Sampler.MIN_P => - llama.llama_sample_min_p(ctx, candidates, minP, minKeep) - case Sampler.TEMPERATURE => - if dynatemp.range > 0 
then - val dynatemp_min = math.max(.0f, temp - dynatemp.range) - val dynatemp_max = math.max(.0f, temp + dynatemp.range) - llama.llama_sample_entropy( - ctx = ctx, - candidates_p = candidates, - min_temp = dynatemp_min, - max_temp = dynatemp_max, - exponent_val = dynatemp.exponent + case SamplerType.PENALTIES => + add( + llama.llama_sampler_init_penalties( + config.penalty.lastN, + config.penalty.repeat, + config.penalty.frequency, + config.penalty.presence + ) ) - else llama.llama_sample_temp(ctx, candidates, temp) - llama.llama_sample_token(ctx, candidates) - - Sample(tokenId, logprob(tokenId, data, sampling.logprobs)) - end sample - - def logprob( - id: Int, - data: Ptr[Llama.TokenData], - num: Int - ): Option[Probability] = - def tokenValue(tokenId: Int): String = - decode(tokenId) match - case token: String => token - case bytes: Array[Byte] => - bytes.map(b => s"\\\\x${String.format("%02x", b)}").mkString - - if num > 0 then - val log = (td: Llama.TokenData) => math.log(td.p) - val cap = math.min(num, vocabSize) - val logprobs = data.asArray(cap).unsafeArray.map: td => - Logprob(tokenValue(td.id), log(td)) - val current = LazyList.range(0, vocabSize).map(!data(_)).find(_.id == id) - val logprob = Logprob(tokenValue(id), current.fold(.0)(log)) - Some(Probability(logprob, logprobs)) - else None - - def close(suffix: Vector[Token]): LazyList[Token] = + + if config.greedy then add(llama.llama_sampler_init_greedy()) + else add(llama.llama_sampler_init_dist(config.seed)) + + case Sampling.Mirostat1(seed, temp, tau, eta, m) => + add(llama.llama_sampler_init_temp(temp)) + add(llama.llama_sampler_init_mirostat(vocabSize, seed, tau, eta, m)) + + case Sampling.Mirostat2(seed, temp, tau, eta) => + add(llama.llama_sampler_init_temp(temp)) + add(llama.llama_sampler_init_mirostat_v2(seed, tau, eta)) + + chain + end createSampler + + def close(suffix: Vector[Token], sampler: Llama.Sampler): LazyList[Token] = + llama.llama_sampler_free(sampler) llama.llama_free(ctx) LazyList.from(suffix)
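A minimal usage sketch of the parameter surface introduced above. It only relies on case classes and enums defined in Params.scala in this diff (ContextParams, BatchParams, Sampling.Dist, SamplerType, Penalty, LlmParams, LoraParams); the Llm factory, the model/adapter paths, and the shape of the generation result are assumptions based on the pre-existing llm4s API and are not defined by this change.

import java.nio.file.Paths
import com.donderom.llm4s.*

@main def samplingSketch(): Unit =
  // Context settings: n_batch and n_ubatch are now configured separately via BatchParams.
  val context = ContextParams(
    size = 4096,
    batch = BatchParams(logical = 2048, physical = 512)
  )

  // The b4599 sampler chain is described declaratively and assembled by createSampler:
  // each SamplerType is appended to a llama_sampler_chain, followed by either the
  // greedy sampler or the seeded distribution sampler.
  val sampling = Sampling.Dist(
    samplers = List(
      SamplerType.PENALTIES,
      SamplerType.TOP_K,
      SamplerType.TOP_P,
      SamplerType.TEMPERATURE
    ),
    penalty = Penalty(lastN = 64, repeat = 1.1f),
    topK = 40,
    topP = 0.95f,
    temp = 0.8f
  )

  val params = LlmParams(
    context = context,
    sampling = sampling,
    // LoRA is now attached to the context per generation call
    // (llama_adapter_lora_init + llama_set_adapter_lora) instead of patching model weights.
    // The adapter path below is a placeholder.
    lora = Some(LoraParams(Paths.get("adapters/example.gguf"), scale = 0.75f))
  )

  // Hypothetical wiring; Llm construction and result handling follow the existing
  // llm4s API, which is unchanged context not shown in this diff:
  // val llm = Llm(Paths.get("models/example.gguf"), ModelParams())
  // llm.generate("Deep learning is", params)
  // llm.close()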