From ec2029986cdea53eaedb2ac28ca9d8be58d30e76 Mon Sep 17 00:00:00 2001 From: donderom <274926+donderom@users.noreply.github.com> Date: Thu, 6 Feb 2025 15:58:43 +0100 Subject: [PATCH] Update to support b4599 libllama release --- src/main/scala/com/donderom/llm4s/Llama.scala | 785 +++++++++++------- src/main/scala/com/donderom/llm4s/Llm.scala | 64 +- .../scala/com/donderom/llm4s/Params.scala | 119 ++- .../scala/com/donderom/llm4s/SlincLlm.scala | 261 +++--- 4 files changed, 701 insertions(+), 528 deletions(-) diff --git a/src/main/scala/com/donderom/llm4s/Llama.scala b/src/main/scala/com/donderom/llm4s/Llama.scala index aa13a9f..1eb575d 100644 --- a/src/main/scala/com/donderom/llm4s/Llama.scala +++ b/src/main/scala/com/donderom/llm4s/Llama.scala @@ -7,26 +7,72 @@ object Llama: type Pos = CInt type Token = CInt type SeqId = CInt - type Ctx = Ptr[Any] + + type Vocab = Ptr[Any] type Model = Ptr[Any] - type Grammar = Ptr[Any] + type Ctx = Ptr[Any] + type Sampler = Ptr[Any] + + type LoraAdapter = Ptr[Any] enum VocabType: - case NONE, SPM, BPE, WPM + case NONE, SPM, BPE, WPM, UGM, RWKV given Transform[VocabType, CInt](VocabType.fromOrdinal, _.ordinal) - enum TokenType: - case UNDEFINED, NORMAL, UNKNOWN, CONTROL, USER_DEFINED, UNUSED, BYTE + enum RopeType(val code: CInt): + case NONE extends RopeType(-1) + case NORM extends RopeType(0) + case NEOX extends RopeType(2) + case MROPE extends RopeType(8) + case VISION extends RopeType(24) - given Transform[TokenType, CInt](TokenType.fromOrdinal, _.ordinal) + given Transform[RopeType, CInt]( + _ match + case RopeType.NONE.code => RopeType.NONE + case RopeType.NORM.code => RopeType.NORM + case RopeType.NEOX.code => RopeType.NEOX + case RopeType.MROPE.code => RopeType.MROPE + case RopeType.VISION.code => RopeType.VISION + , + _.code + ) + + enum TokenAttr(val code: CInt): + case UNDEFINED extends TokenAttr(0) + case UNKNOWN extends TokenAttr(1 << 0) + case UNUSED extends TokenAttr(1 << 1) + case NORMAL extends TokenAttr(1 << 2) + case CONTROL extends TokenAttr(1 << 3) + case USER_DEFINED extends TokenAttr(1 << 4) + case BYTE extends TokenAttr(1 << 5) + case NORMALIZED extends TokenAttr(1 << 6) + case LSTRIP extends TokenAttr(1 << 7) + case RSTRIP extends TokenAttr(1 << 8) + case SINGLE_WORD extends TokenAttr(1 << 9) + + given Transform[TokenAttr, CInt]( + _ match + case TokenAttr.UNDEFINED.code => TokenAttr.UNDEFINED + case TokenAttr.UNKNOWN.code => TokenAttr.UNKNOWN + case TokenAttr.UNUSED.code => TokenAttr.UNUSED + case TokenAttr.NORMAL.code => TokenAttr.NORMAL + case TokenAttr.CONTROL.code => TokenAttr.CONTROL + case TokenAttr.USER_DEFINED.code => TokenAttr.USER_DEFINED + case TokenAttr.BYTE.code => TokenAttr.BYTE + case TokenAttr.NORMALIZED.code => TokenAttr.NORMALIZED + case TokenAttr.LSTRIP.code => TokenAttr.LSTRIP + case TokenAttr.RSTRIP.code => TokenAttr.RSTRIP + case TokenAttr.SINGLE_WORD.code => TokenAttr.SINGLE_WORD + , + _.code + ) enum Ftype(val code: CInt): case ALL_F32 extends Ftype(0) case MOSTLY_F16 extends Ftype(1) case MOSTLY_Q4_0 extends Ftype(2) case MOSTLY_Q4_1 extends Ftype(3) - case MOSTLY_Q4_1_SOME_F16 extends Ftype(4) case MOSTLY_Q8_0 extends Ftype(7) case MOSTLY_Q5_0 extends Ftype(8) case MOSTLY_Q5_1 extends Ftype(9) @@ -53,23 +99,64 @@ object Llama: case MOSTLY_IQ4_XS extends Ftype(30) case MOSTLY_IQ1_M extends Ftype(31) case MOSTLY_BF16 extends Ftype(32) + case MOSTLY_TQ1_0 extends Ftype(36) + case MOSTLY_TQ2_0 extends Ftype(37) case GUESSED extends Ftype(1024) - given Transform[Ftype, CInt](Ftype.fromOrdinal, _.code) + given 
Transform[Ftype, CInt]( + _ match + case Ftype.ALL_F32.code => Ftype.ALL_F32 + case Ftype.MOSTLY_F16.code => Ftype.MOSTLY_F16 + case Ftype.MOSTLY_Q4_0.code => Ftype.MOSTLY_Q4_0 + case Ftype.MOSTLY_Q4_1.code => Ftype.MOSTLY_Q4_1 + case Ftype.MOSTLY_Q8_0.code => Ftype.MOSTLY_Q8_0 + case Ftype.MOSTLY_Q5_0.code => Ftype.MOSTLY_Q5_0 + case Ftype.MOSTLY_Q5_1.code => Ftype.MOSTLY_Q5_1 + case Ftype.MOSTLY_Q2_K.code => Ftype.MOSTLY_Q2_K + case Ftype.MOSTLY_Q3_K_S.code => Ftype.MOSTLY_Q3_K_S + case Ftype.MOSTLY_Q3_K_M.code => Ftype.MOSTLY_Q3_K_M + case Ftype.MOSTLY_Q3_K_L.code => Ftype.MOSTLY_Q3_K_L + case Ftype.MOSTLY_Q4_K_S.code => Ftype.MOSTLY_Q4_K_S + case Ftype.MOSTLY_Q4_K_M.code => Ftype.MOSTLY_Q4_K_M + case Ftype.MOSTLY_Q5_K_S.code => Ftype.MOSTLY_Q5_K_S + case Ftype.MOSTLY_Q5_K_M.code => Ftype.MOSTLY_Q5_K_M + case Ftype.MOSTLY_Q6_K.code => Ftype.MOSTLY_Q6_K + case Ftype.MOSTLY_IQ2_XXS.code => Ftype.MOSTLY_IQ2_XXS + case Ftype.MOSTLY_IQ2_XS.code => Ftype.MOSTLY_IQ2_XS + case Ftype.MOSTLY_Q2_K_S.code => Ftype.MOSTLY_Q2_K_S + case Ftype.MOSTLY_IQ3_XS.code => Ftype.MOSTLY_IQ3_XS + case Ftype.MOSTLY_IQ3_XXS.code => Ftype.MOSTLY_IQ3_XXS + case Ftype.MOSTLY_IQ1_S.code => Ftype.MOSTLY_IQ1_S + case Ftype.MOSTLY_IQ4_NL.code => Ftype.MOSTLY_IQ4_NL + case Ftype.MOSTLY_IQ3_S.code => Ftype.MOSTLY_IQ3_S + case Ftype.MOSTLY_IQ3_M.code => Ftype.MOSTLY_IQ3_M + case Ftype.MOSTLY_IQ2_S.code => Ftype.MOSTLY_IQ2_S + case Ftype.MOSTLY_IQ2_M.code => Ftype.MOSTLY_IQ2_M + case Ftype.MOSTLY_IQ4_XS.code => Ftype.MOSTLY_IQ4_XS + case Ftype.MOSTLY_IQ1_M.code => Ftype.MOSTLY_IQ1_M + case Ftype.MOSTLY_BF16.code => Ftype.MOSTLY_BF16 + case Ftype.MOSTLY_TQ1_0.code => Ftype.MOSTLY_TQ1_0 + case Ftype.MOSTLY_TQ2_0.code => Ftype.MOSTLY_TQ2_0 + case Ftype.GUESSED.code => Ftype.GUESSED + , + _.code + ) enum RopeScalingType(val code: CInt): case UNSPECIFIED extends RopeScalingType(-1) case NONE extends RopeScalingType(0) case LINEAR extends RopeScalingType(1) case YARN extends RopeScalingType(2) - case MAX_VALUE extends RopeScalingType(2) + case LONGROPE extends RopeScalingType(3) + case MAX_VALUE extends RopeScalingType(3) given Transform[RopeScalingType, CInt]( _ match - case 0 => RopeScalingType.NONE - case 1 => RopeScalingType.LINEAR - case 2 => RopeScalingType.YARN - case _ => RopeScalingType.UNSPECIFIED + case RopeScalingType.NONE.code => RopeScalingType.NONE + case RopeScalingType.LINEAR.code => RopeScalingType.LINEAR + case RopeScalingType.YARN.code => RopeScalingType.YARN + case RopeScalingType.LONGROPE.code => RopeScalingType.LONGROPE + case _ => RopeScalingType.UNSPECIFIED , _.code ) @@ -79,13 +166,31 @@ object Llama: case NONE extends PoolingType(0) case MEAN extends PoolingType(1) case CLS extends PoolingType(2) + case LAST extends PoolingType(3) + case RANK extends PoolingType(4) given Transform[PoolingType, CInt]( _ match - case 0 => PoolingType.NONE - case 1 => PoolingType.MEAN - case 2 => PoolingType.CLS - case _ => PoolingType.UNSPECIFIED + case PoolingType.NONE.code => PoolingType.NONE + case PoolingType.MEAN.code => PoolingType.MEAN + case PoolingType.CLS.code => PoolingType.CLS + case PoolingType.LAST.code => PoolingType.LAST + case PoolingType.RANK.code => PoolingType.RANK + case _ => PoolingType.UNSPECIFIED , + _.code + ) + + enum AttentionType(val code: CInt): + case UNSPECIFIED extends AttentionType(-1) + case CAUSAL extends AttentionType(0) + case NON_CAUSAL extends AttentionType(1) + + given Transform[AttentionType, CInt]( + _ match + case AttentionType.UNSPECIFIED.code => AttentionType.UNSPECIFIED +
case AttentionType.CAUSAL.code => AttentionType.CAUSAL + case AttentionType.NON_CAUSAL.code => AttentionType.NON_CAUSAL , _.code ) @@ -100,6 +205,7 @@ object Llama: final case class TokenDataArray( data: Ptr[TokenData], size: SizeT, + selected: CInt, sorted: CBool ) derives Struct @@ -110,10 +216,7 @@ object Llama: pos: Ptr[Pos], n_seq_id: Ptr[CInt], seq_id: Ptr[Ptr[SeqId]], - logits: Ptr[CInt], - all_pos_0: Pos, - all_pos_1: Pos, - all_seq_id: SeqId + logits: Ptr[CInt] ) derives Struct enum ModelKvOverrideType: @@ -131,6 +234,7 @@ object Llama: ) derives Struct final case class ModelParams( + devices: Ptr[Any], n_gpu_layers: CInt, split_mode: SplitMode, main_gpu: CInt, @@ -174,46 +278,49 @@ object Llama: case F64 extends GgmlType(28) case IQ1_M extends GgmlType(29) case BF16 extends GgmlType(30) - case COUNT extends GgmlType(31) + case TQ1_0 extends GgmlType(34) + case TQ2_0 extends GgmlType(35) + case COUNT extends GgmlType(39) given Transform[GgmlType, CInt]( _ match - case 0 => GgmlType.F32 - case 1 => GgmlType.F16 - case 2 => GgmlType.Q4_0 - case 3 => GgmlType.Q4_1 - case 6 => GgmlType.Q5_0 - case 7 => GgmlType.Q5_1 - case 8 => GgmlType.Q8_0 - case 9 => GgmlType.Q8_1 - case 10 => GgmlType.Q2_K - case 11 => GgmlType.Q3_K - case 12 => GgmlType.Q4_K - case 13 => GgmlType.Q5_K - case 14 => GgmlType.Q6_K - case 15 => GgmlType.Q8_K - case 16 => GgmlType.IQ2_XXS - case 17 => GgmlType.IQ2_XS - case 18 => GgmlType.IQ3_XXS - case 19 => GgmlType.IQ1_S - case 20 => GgmlType.IQ4_NL - case 21 => GgmlType.IQ3_S - case 22 => GgmlType.IQ2_S - case 23 => GgmlType.IQ4_XS - case 24 => GgmlType.I8 - case 25 => GgmlType.I16 - case 26 => GgmlType.I32 - case 27 => GgmlType.I64 - case 28 => GgmlType.F64 - case 29 => GgmlType.IQ1_M - case 30 => GgmlType.BF16 - case 31 => GgmlType.COUNT + case GgmlType.F32.code => GgmlType.F32 + case GgmlType.F16.code => GgmlType.F16 + case GgmlType.Q4_0.code => GgmlType.Q4_0 + case GgmlType.Q4_1.code => GgmlType.Q4_1 + case GgmlType.Q5_0.code => GgmlType.Q5_0 + case GgmlType.Q5_1.code => GgmlType.Q5_1 + case GgmlType.Q8_0.code => GgmlType.Q8_0 + case GgmlType.Q8_1.code => GgmlType.Q8_1 + case GgmlType.Q2_K.code => GgmlType.Q2_K + case GgmlType.Q3_K.code => GgmlType.Q3_K + case GgmlType.Q4_K.code => GgmlType.Q4_K + case GgmlType.Q5_K.code => GgmlType.Q5_K + case GgmlType.Q6_K.code => GgmlType.Q6_K + case GgmlType.Q8_K.code => GgmlType.Q8_K + case GgmlType.IQ2_XXS.code => GgmlType.IQ2_XXS + case GgmlType.IQ2_XS.code => GgmlType.IQ2_XS + case GgmlType.IQ3_XXS.code => GgmlType.IQ3_XXS + case GgmlType.IQ1_S.code => GgmlType.IQ1_S + case GgmlType.IQ4_NL.code => GgmlType.IQ4_NL + case GgmlType.IQ3_S.code => GgmlType.IQ3_S + case GgmlType.IQ2_S.code => GgmlType.IQ2_S + case GgmlType.IQ4_XS.code => GgmlType.IQ4_XS + case GgmlType.I8.code => GgmlType.I8 + case GgmlType.I16.code => GgmlType.I16 + case GgmlType.I32.code => GgmlType.I32 + case GgmlType.I64.code => GgmlType.I64 + case GgmlType.F64.code => GgmlType.F64 + case GgmlType.IQ1_M.code => GgmlType.IQ1_M + case GgmlType.BF16.code => GgmlType.BF16 + case GgmlType.TQ1_0.code => GgmlType.TQ1_0 + case GgmlType.TQ2_0.code => GgmlType.TQ2_0 + case GgmlType.COUNT.code => GgmlType.COUNT , _.code ) final case class ContextParams( - seed: CInt, n_ctx: CInt, n_batch: CInt, n_ubatch: CInt, @@ -222,6 +329,7 @@ object Llama: n_threads_batch: CInt, rope_scaling_type: RopeScalingType, pooling_type: PoolingType, + attention_type: AttentionType, rope_freq_base: CFloat, rope_freq_scale: CFloat, yarn_ext_factor: CFloat, @@ -238,6 +346,7 @@ object 
Llama: embeddings: CBool, offload_kqv: CBool, flash_attn: CBool, + no_perf: CBool, abort_callback: Ptr[Any], abort_callback_data: Ptr[Any] ) derives Struct @@ -261,103 +370,85 @@ object Llama: given Transform[NumaStrategy, CInt](NumaStrategy.fromOrdinal, _.ordinal) - enum Gretype: - case END, ALT, RULE_REF, CHAR, CHAR_NOT, CHAR_RNG_UPPER, CHAR_ALT - - given Transform[Gretype, CInt](Gretype.fromOrdinal, _.ordinal) - - final case class GrammarElement(gretype: Gretype, value: CInt) derives Struct - - final case class Timings( - t_start_ms: CDouble, - t_end_ms: CDouble, - t_load_ms: CDouble, - t_sample_ms: CDouble, - t_p_eval_ms: CDouble, - t_eval_ms: CDouble, - n_sample: CInt, - n_p_eval: CInt, - n_eval: CInt - ) derives Struct - - // Information associated with an individual cell in the KV cache view. - final case class KvCacheViewCell(pos: Pos) derives Struct + final case class LogitBias(token: Token, bias: CFloat) derives Struct - final case class KvCacheView( - n_cells: CInt, - n_seq_max: CInt, - token_count: CInt, - used_cells: CInt, - max_contiguous: CInt, - max_contiguous_idx: CInt, - cells: Ptr[KvCacheViewCell], - cells_sequences: Ptr[SeqId] - ) derives Struct + final case class SamplerChainParams(no_perf: CBool) derives Struct - final case class BeamView( - tokens: Ptr[Token], - n_tokens: SizeT, - p: CFloat, - eob: CBool - ) derives Struct - - final case class BeamsState( - beam_views: Ptr[BeamView], - n_beams: SizeT, - common_prefix_length: SizeT, - last_call: CBool - ) + final case class ChatMessage(role: Ptr[CChar], content: Ptr[CChar]) + derives Struct trait Llama derives FSet: import Llama.* def llama_model_default_params(): ModelParams def llama_context_default_params(): ContextParams + def llama_sampler_chain_default_params(): SamplerChainParams def llama_model_quantize_default_params(): ModelQuantizeParams + // Initialize the llama + ggml backend + // If numa is true, use NUMA optimizations + // Call once at the start of the program def llama_backend_init(): Unit - def llama_numa_init(strategy: NumaStrategy): Unit - + // Call once at the end of the program - currently only used for MPI def llama_backend_free(): Unit - def llama_load_model_from_file( + def llama_numa_init(strategy: NumaStrategy): Unit + + // Load the model from a file + // If the file is split into multiple parts, the file name must follow this pattern: -%05d-of-%05d.gguf + // If the split file name does not follow this pattern, use llama_model_load_from_splits + def llama_model_load_from_file( path_model: Ptr[CChar], params: ModelParams ): Model - def llama_free_model(model: Model): Unit + // Load the model from multiple splits (support custom naming scheme) + // The paths must be in the correct order + def llama_model_load_from_splits( + paths: Ptr[Ptr[CChar]], + n_paths: SizeT, + params: ModelParams + ): Model - def llama_new_context_with_model(model: Model, params: ContextParams): Ctx + def llama_model_free(model: Model): Unit + def llama_init_from_model(model: Model, params: ContextParams): Ctx + + // Frees all allocated memory def llama_free(ctx: Ctx): Unit def llama_time_us(): CInt - def llama_max_devices(): CInt + def llama_max_devices(): SizeT def llama_supports_mmap(): CBool def llama_supports_mlock(): CBool def llama_supports_gpu_offload(): CBool - - def llama_get_model(ctx: Ctx): Model + def llama_supports_rpc(): CBool def llama_n_ctx(ctx: Ctx): CInt def llama_n_batch(ctx: Ctx): CInt def llama_n_ubatch(ctx: Ctx): CInt def llama_n_seq_max(ctx: Ctx): CInt + def llama_get_model(ctx: Ctx): Model def 
llama_pooling_type(ctx: Ctx): PoolingType - def llama_vocab_type(model: Model): VocabType + def llama_model_get_vocab(model: Model): Vocab + def llama_model_rope_type(model: Model): RopeType - def llama_n_vocab(model: Model): CInt - def llama_n_ctx_train(model: Model): CInt - def llama_n_embd(model: Model): CInt - def llama_n_layer(model: Model): CInt + def llama_model_n_ctx_train(model: Model): CInt + def llama_model_n_embd(model: Model): CInt + def llama_model_n_layer(model: Model): CInt + def llama_model_n_head(model: Model): CInt // Get the model's RoPE frequency scaling factor - def llama_rope_freq_scale_train(model: Model): CFloat + def llama_model_rope_freq_scale_train(model: Model): CFloat + + def llama_vocab_type(vocab: Vocab): VocabType + + def llama_vocab_n_tokens(vocab: Vocab): CInt // Get metadata value as a string by key name def llama_model_meta_val_str( @@ -392,26 +483,61 @@ trait Llama derives FSet: // Returns the total size of all the tensors in the model in bytes def llama_model_size(model: Model): CInt + // Get the default chat template. Returns nullptr if not available + // If name is NULL, returns the default chat template + def llama_model_chat_template(model: Model, name: Ptr[CChar]): Ptr[CChar] + // Returns the total number of parameters in the model def llama_model_n_params(model: Model): CInt + // Returns true if the model contains an encoder that requires llama_encode() call + def llama_model_has_encoder(model: Model): CBool + + // Returns true if the model contains a decoder that requires llama_decode() call + def llama_model_has_decoder(model: Model): CBool + + // For encoder-decoder models, this function returns id of the token that must be provided + // to the decoder to start generating output sequence. For other models, it returns -1. + def llama_model_decoder_start_token(model: Model): Token + + // Returns true if the model is recurrent (like Mamba, RWKV, etc.) + def llama_model_is_recurrent(model: Model): CBool + + // Returns 0 on success def llama_model_quantize( fname_inp: Ptr[CChar], fname_out: Ptr[CChar], params: Ptr[ModelQuantizeParams] ): CInt - // Apply a LoRA adapter to a loaded model - def llama_model_apply_lora_from_file( - model: Model, - path_lora: Ptr[CChar], - scale: CFloat, - path_base_model: Ptr[CChar], - n_threads: CInt - ): CInt + // Adapters - def llama_control_vector_apply( - lctx: Ctx, + // Load a LoRA adapter from file + def llama_adapter_lora_init(model: Model, path_lora: Ptr[CChar]): LoraAdapter + + // Manually free a LoRA adapter + // Note: loaded adapters will be free when the associated model is deleted + def llama_adapter_lora_free(adapter: LoraAdapter): Unit + + // Add a loaded LoRA adapter to given context + // This will not modify model's weight + def llama_set_adapter_lora(ctx: Ctx, adapter: LoraAdapter, scale: Float): CInt + + // Remove a specific LoRA adapter from given context + // Return -1 if the adapter is not present in the context + def llama_rm_adapter_lora(ctx: Ctx, adapter: LoraAdapter): CInt + + // Remove all LoRA adapters from given context + def llama_clear_adapter_lora(ctx: Ctx): Unit + + // Apply a loaded control vector to a llama_context, or if data is NULL, clear + // the currently loaded vector. + // n_embd should be the size of a single layer's control, and data should point + // to an n_embd x n_layers buffer starting from layer 1. + // il_start and il_end are the layer range the vector should apply to (both inclusive) + // See llama_control_vector_load in common to load a control vector. 
+ def llama_apply_adapter_cvec( + ctx: Ctx, data: Ptr[Float], len: SizeT, n_embd: CInt, @@ -419,16 +545,7 @@ trait Llama derives FSet: il_end: CInt ): CInt - // KV - - // Create an empty KV cache view. (use only for debugging purposes) - def llama_kv_cache_view_init(ctx: Ctx, n_seq_max: CInt): KvCacheView - - // Free a KV cache view. (use only for debugging purposes) - def llama_kv_cache_view_free(view: Ptr[KvCacheView]): Unit - - // Update the KV cache view structure with the current state of the KV cache. (use only for debugging purposes) - def llama_kv_cache_view_update(ctx: Ctx, view: Ptr[KvCacheView]): Unit + // KV cache // Returns the number of tokens in the KV cache (slow, use only for debug) // If a KV cell has multiple sequences assigned to it, it will be counted multiple times @@ -437,10 +554,11 @@ trait Llama derives FSet: // Returns the number of used KV cells (i.e. have at least one sequence assigned to them) def llama_get_kv_cache_used_cells(ctx: Ctx): CInt - // Clear the KV cache + // Clear the KV cache - both cell info is erased and KV data is zeroed def llama_kv_cache_clear(ctx: Ctx): Unit // Removes all tokens that belong to the specified sequence and have positions in [p0, p1) + // Returns false if a partial sequence cannot be removed. Removing a whole sequence never fails // seq_id < 0 : match any sequence // p0 < 0 : [0, p1] // p1 < 0 : [p0, inf) @@ -462,7 +580,9 @@ trait Llama derives FSet: def llama_kv_cache_seq_keep(ctx: Ctx, seq_id: SeqId): Unit // Adds relative position "delta" to all tokens that belong to the specified sequence and have positions in [p0, p1) - // If the KV cache is RoPEd, the KV data is updated accordingly + // If the KV cache is RoPEd, the KV data is updated accordingly: + // - lazily on next llama_decode() + // - explicitly with llama_kv_cache_update() // p0 < 0 : [0, p1] // p1 < 0 : [p0, inf) def llama_kv_cache_seq_add( @@ -474,7 +594,9 @@ trait Llama derives FSet: ): Unit // Integer division of the positions by factor of `d > 1` - // If the KV cache is RoPEd, the KV data is updated accordingly + // If the KV cache is RoPEd, the KV data is updated accordingly: + // - lazily on next llama_decode() + // - explicitly with llama_kv_cache_update() // p0 < 0 : [0, p1] // p1 < 0 : [p0, inf) def llama_kv_cache_seq_div( @@ -497,17 +619,17 @@ trait Llama derives FSet: // Apply the KV cache updates (such as K-shifts, defragmentation, etc.) def llama_kv_cache_update(ctx: Ctx): Unit + // Check if the context supports KV cache shifting + def llama_kv_cache_can_shift(ctx: Ctx): CBool + // Decoding - // Return batch for single sequence of tokens starting at pos_0 + // Return batch for single sequence of tokens + // The sequence ID will be fixed to 0 + // The position of the tokens will be tracked automatically by llama_decode // // NOTE: this is a helper function to facilitate transition to the new batch API - avoid using it - def llama_batch_get_one( - tokens: Ptr[Token], - n_tokens: CInt, - pos_0: Pos, - seq_id: SeqId - ): Batch + def llama_batch_get_one(tokens: Ptr[Token], n_tokens: CInt): Batch // Allocates a batch of tokens on the heap that can hold a maximum of n_tokens // Each token can be assigned up to n_seq_max sequence ids @@ -521,10 +643,16 @@ trait Llama derives FSet: // Frees a batch of tokens allocated with llama_batch_init() def llama_batch_free(batch: Batch): Unit + // Processes a batch of tokens with the ecoder part of the encoder-decoder model. + // Stores the encoder output internally for later use by the decoder cross-attention layers. 
+ // 0 - success + // < 0 - error. the KV cache state is restored to the state before this call + def llama_encode(ctx: Ctx, batch: Batch): CInt + // Positive return values does not mean a fatal error, but rather a warning. // 0 - success // 1 - could not find a KV slot for the batch (try reducing the size of the batch or increase the context) - // < 0 - error + // < 0 - error. the KV cache state is restored to the state before this call def llama_decode(ctx: Ctx, batch: Batch): CInt // Set the number of threads used for decoding @@ -536,6 +664,16 @@ trait Llama derives FSet: n_threads_batch: CInt ): Unit + // Get the number of threads used for generation of a single token. + def llama_n_threads(ctx: Ctx): CInt + + // Get the number of threads used for prompt and batch processing (multiple token). + def llama_n_threads_batch(ctx: Ctx): CInt + + // Set whether the model is in embeddings mode or not + // If true, embeddings will be returned but logits will not + def llama_set_embeddings(ctx: Ctx, embeddings: CBool): Unit + // Set whether to use causal attention or not // If set to true, the model will only attend to the past tokens def llama_set_causal_attn(ctx: Ctx, causal_attn: CBool): Unit @@ -582,36 +720,41 @@ trait Llama derives FSet: // Get the embeddings for a sequence id // Returns NULL if pooling_type is LLAMA_POOLING_TYPE_NONE - // shape: [n_embd] (1-dimensional) + // when pooling_type == LLAMA_POOLING_TYPE_RANK, returns float[1] with the rank of the sequence + // otherwise: float[n_embd] (1-dimensional) def llama_get_embeddings_seq(ctx: Ctx, seq_id: SeqId): Ptr[Float] // Vocab - def llama_token_get_text(model: Model, token: Token): Ptr[CChar] + def llama_vocab_get_text(vocab: Vocab, token: Token): Ptr[CChar] - def llama_token_get_score(model: Model, token: Token): CFloat + def llama_vocab_get_score(vocab: Vocab, token: Token): CFloat - def llama_token_get_type(model: Model, token: Token): TokenType + def llama_vocab_get_attr(vocab: Vocab, token: Token): TokenAttr // Check if the token is supposed to end generation (end-of-generation, eg. EOS, EOT, etc.) - def llama_token_is_eog(model: Model, token: Token): CBool + def llama_vocab_is_eog(vocab: Vocab, token: Token): CBool - def llama_token_bos(model: Model): Token - def llama_token_eos(model: Model): Token - def llama_token_cls(model: Model): Token - def llama_token_sep(model: Model): Token - def llama_token_nl(model: Model): Token + // Identify if Token Id is a control token or a render-able token + def llama_vocab_is_control(vocab: Vocab, token: Token): CBool - // Returns -1 if unknown, 1 for true or 0 for false. - def llama_add_bos_token(model: Model): CInt + // Special tokens + def llama_vocab_bos(vocab: Vocab): Token + def llama_vocab_eos(vocab: Vocab): Token + def llama_vocab_eot(vocab: Vocab): Token + def llama_vocab_sep(vocab: Vocab): Token + def llama_vocab_nl(vocab: Vocab): Token + def llama_vocab_pad(vocab: Vocab): Token - // Returns -1 if unknown, 1 for true or 0 for false. 
- def llama_add_eos_token(model: Model): CInt + def llama_vocab_get_add_bos(vocab: Vocab): CBool + def llama_vocab_get_add_eos(vocab: Vocab): CBool - def llama_token_prefix(model: Model): Token - def llama_token_middle(model: Model): Token - def llama_token_suffix(model: Model): Token - def llama_token_eot(model: Model): Token + def llama_vocab_fim_pre(vocab: Vocab): Token + def llama_vocab_fim_suf(vocab: Vocab): Token + def llama_vocab_fim_mid(vocab: Vocab): Token + def llama_vocab_fim_pad(vocab: Vocab): Token + def llama_vocab_fim_rep(vocab: Vocab): Token + def llama_vocab_fim_sep(vocab: Vocab): Token // Tokenization @@ -619,10 +762,11 @@ trait Llama derives FSet: /// @param tokens The tokens pointer must be large enough to hold the resulting tokens. /// @return Returns the number of tokens on success, no more than n_tokens_max /// @return Returns a negative number on failure - the number of tokens that would have been returned + /// @param add_special Allow to add BOS and EOS tokens if model is configured to do so. /// @param parse_special Allow tokenizing special and/or control tokens which otherwise are not exposed and treated /// as plaintext. Does not insert a leading space. def llama_tokenize( - model: Model, + vocab: Vocab, text: Ptr[CChar], text_len: CInt, tokens: Ptr[Token], @@ -634,112 +778,114 @@ trait Llama derives FSet: // Token Id -> Piece. // Uses the vocabulary in the provided context. // Does not write null terminator to the buffer. - // User code is responsible to remove the leading whitespace of the first non-BOS token when decoding multiple tokens. + // User can skip up to 'lstrip' leading spaces before copying (useful when encoding/decoding multiple tokens with 'add_space_prefix') // @param special If true, special tokens are rendered in the output. def llama_token_to_piece( - model: Model, + vocab: Vocab, token: Token, buf: Ptr[CChar], length: CInt, + lstrip: CInt, special: CBool ): CInt - // Grammar + /// @details Convert the provided tokens into text (inverse of llama_tokenize()). + /// @param text The char pointer must be large enough to hold the resulting text. + /// @return Returns the number of chars/bytes on success, no more than text_len_max. + /// @return Returns a negative number on failure - the number of chars/bytes that would have been returned. + /// @param remove_special Allow to remove BOS and EOS tokens if model is configured to do so. + /// @param unparse_special If true, special tokens are rendered in the output. + def llama_detokenize( + vocab: Vocab, + tokens: Ptr[Token], + n_tokens: CInt, + text: Ptr[CChar], + text_len_max: CInt, + remove_special: CBool, + unparse_special: CBool + ): CInt - def llama_grammar_init( - rules: Ptr[Ptr[GrammarElement]], - n_rules: SizeT, - start_rule_index: SizeT - ): Grammar + // Chat templates + + /// Apply chat template. Inspired by hf apply_chat_template() on python. + /// Both "model" and "custom_template" are optional, but at least one is required. "custom_template" has higher precedence than "model" + /// NOTE: This function does not use a jinja parser. It only support a pre-defined list of template. See more: https://github.com/ggerganov/llama.cpp/wiki/Templates-supported-by-llama_chat_apply_template + /// @param tmpl A Jinja template to use for this chat. If this is nullptr, the model’s default chat template will be used instead. 
+ /// @param chat Pointer to a list of multiple llama_chat_message + /// @param n_msg Number of llama_chat_message in this chat + /// @param add_ass Whether to end the prompt with the token(s) that indicate the start of an assistant message. + /// @param buf A buffer to hold the output formatted prompt. The recommended alloc size is 2 * (total number of characters of all messages) + /// @param length The size of the allocated buffer + /// @return The total number of bytes of the formatted prompt. If is it larger than the size of buffer, you may need to re-alloc it and then re-apply the template. + def llama_chat_apply_template( + tmpl: Ptr[CChar], + chat: Ptr[ChatMessage], + n_msg: SizeT, + add_ass: CBool, + buf: Ptr[CChar], + length: CInt + ): CInt - def llama_grammar_free(grammar: Grammar): Unit + // Get list of built-in chat templates + def llama_chat_builtin_templates(output: Ptr[Ptr[CChar]], len: SizeT): CInt - def llama_grammar_copy(grammar: Grammar): Grammar + // Sampling API - // Sampling functions + def llama_sampler_name(sampler: Sampler): Ptr[CChar] + def llama_sampler_accept(sampler: Sampler, token: Token): Unit + def llama_sampler_apply( + sampler: Sampler, + candidates: Ptr[TokenDataArray] + ): Unit + def llama_sampler_reset(sampler: Sampler): Unit + def llama_sampler_clone(sampler: Sampler): Sampler + // important: do not free if the sampler has been added to a llama_sampler_chain (via llama_sampler_chain_add) + def llama_sampler_free(sampler: Sampler): Unit - // Sets the current rng seed. - def llama_set_rng_seed(ctx: Ctx, seed: CInt): Unit + def llama_sampler_chain_init(params: SamplerChainParams): Sampler - /// @details Repetition penalty described in CTRL academic paper https://arxiv.org/abs/1909.05858, with negative logit fix. - /// @details Frequency and presence penalties described in OpenAI API https://platform.openai.com/docs/api-reference/parameter-details. - def llama_sample_repetition_penalties( - ctx: Ctx, - candidates: Ptr[TokenDataArray], - last_tokens: Ptr[Token], - penalty_last_n: SizeT, - penalty_repeat: CFloat, - penalty_freq: CFloat, - penalty_present: CFloat - ): Unit + // important: takes ownership of the sampler object and will free it when llama_sampler_free is called + def llama_sampler_chain_add(chain: Sampler, smpl: Sampler): Unit + def llama_sampler_chain_get(chain: Sampler, i: CInt): Sampler + def llama_sampler_chain_n(chain: Sampler): CInt - /// @details Sorts candidate tokens by their logits in descending order and calculate probabilities based on logits. 
- def llama_sample_softmax( - ctx: Ctx, - candidates: Ptr[TokenDataArray] - ): Unit + // after removing a sampler, the chain will no longer own it, and it will not be freed when the chain is freed + def llama_sampler_chain_remove(chain: Sampler, i: CInt): Sampler + + // Available samplers: + + def llama_sampler_init_greedy(): Sampler + def llama_sampler_init_dist(seed: CInt): Sampler /// @details Top-K sampling described in academic paper "The Curious Case of Neural Text Degeneration" https://arxiv.org/abs/1904.09751 - def llama_sample_top_k( - ctx: Ctx, - candidates: Ptr[TokenDataArray], - k: CInt, - min_keep: SizeT - ): Unit + def llama_sampler_init_top_k(k: CInt): Sampler /// @details Nucleus sampling described in academic paper "The Curious Case of Neural Text Degeneration" https://arxiv.org/abs/1904.09751 - def llama_sample_top_p( - ctx: Ctx, - candidates: Ptr[TokenDataArray], - p: CFloat, - min_keep: SizeT - ): Unit + def llama_sampler_init_top_p(p: CFloat, min_keep: SizeT): Sampler /// @details Minimum P sampling as described in https://github.com/ggerganov/llama.cpp/pull/3841 - def llama_sample_min_p( - ctx: Ctx, - candidates: Ptr[TokenDataArray], - p: CFloat, - min_keep: SizeT - ): Unit - - /// @details Tail Free Sampling described in https://www.trentonbricken.com/Tail-Free-Sampling/. - def llama_sample_tail_free( - ctx: Ctx, - candidates: Ptr[TokenDataArray], - z: CFloat, - min_keep: SizeT - ): Unit + def llama_sampler_init_min_p(p: CFloat, min_keep: SizeT): Sampler /// @details Locally Typical Sampling implementation described in the paper https://arxiv.org/abs/2202.00666. - def llama_sample_typical( - ctx: Ctx, - candidates: Ptr[TokenDataArray], - p: CFloat, - min_keep: SizeT - ): Unit + def llama_sampler_init_typical(p: CFloat, min_keep: SizeT): Sampler - /// @details Dynamic temperature implementation described in the paper https://arxiv.org/abs/2309.02772. - def llama_sample_entropy( - ctx: Ctx, - candidates_p: Ptr[TokenDataArray], - min_temp: Float, - max_temp: Float, - exponent_val: Float - ): Unit + /// #details Updates the logits l_i` = l_i/t. When t <= 0.0f, the maximum logit is kept at it's original value, the rest are set to -inf + def llama_sampler_init_temp(t: CFloat): Sampler - def llama_sample_temp( - ctx: Ctx, - candidates: Ptr[TokenDataArray], - temp: CFloat - ): Unit + /// @details Dynamic temperature implementation (a.k.a. entropy) described in the paper https://arxiv.org/abs/2309.02772. + def llama_sampler_init_temp_ext( + t: CFloat, + delta: CFloat, + exponent: CFloat + ): Sampler - /// @details Apply constraints from grammar - def llama_sample_grammar( - ctx: Ctx, - candidates: Ptr[TokenDataArray], - grammar: Grammar - ): Unit + /// @details XTC sampler as described in https://github.com/oobabooga/text-generation-webui/pull/6335 + def llama_sampler_init_xtc( + p: CFloat, + t: CFloat, + min_keep: SizeT, + seed: CInt + ): Sampler /// @details Mirostat 1.0 algorithm described in the paper https://arxiv.org/abs/2007.14966. Uses tokens instead of words. /// @param candidates A vector of `llama_token_data` containing the candidate tokens, their probabilities (p), and log-odds (logit) for the current position in the generated text. @@ -747,61 +893,130 @@ trait Llama derives FSet: /// @param eta The learning rate used to update `mu` based on the error between the target and observed surprisal of the sampled word. A larger learning rate will cause `mu` to be updated more quickly, while a smaller learning rate will result in slower updates. 
/// @param m The number of tokens considered in the estimation of `s_hat`. This is an arbitrary value that is used to calculate `s_hat`, which in turn helps to calculate the value of `k`. In the paper, they use `m = 100`, but you can experiment with different values to see how it affects the performance of the algorithm. /// @param mu Maximum cross-entropy. This value is initialized to be twice the target cross-entropy (`2 * tau`) and is updated in the algorithm based on the error between the target and observed surprisal. - def llama_sample_token_mirostat( - ctx: Ctx, - candidates: Ptr[TokenDataArray], + def llama_sampler_init_mirostat( + n_vocab: CInt, + seed: CInt, tau: CFloat, eta: CFloat, - m: CInt, - mu: Ptr[CFloat] - ): Token + m: CInt + ): Sampler /// @details Mirostat 2.0 algorithm described in the paper https://arxiv.org/abs/2007.14966. Uses tokens instead of words. /// @param candidates A vector of `llama_token_data` containing the candidate tokens, their probabilities (p), and log-odds (logit) for the current position in the generated text. /// @param tau The target cross-entropy (or surprise) value you want to achieve for the generated text. A higher value corresponds to more surprising or less predictable text, while a lower value corresponds to less surprising or more predictable text. /// @param eta The learning rate used to update `mu` based on the error between the target and observed surprisal of the sampled word. A larger learning rate will cause `mu` to be updated more quickly, while a smaller learning rate will result in slower updates. /// @param mu Maximum cross-entropy. This value is initialized to be twice the target cross-entropy (`2 * tau`) and is updated in the algorithm based on the error between the target and observed surprisal. - def llama_sample_token_mirostat_v2( - ctx: Ctx, - candidates: Ptr[TokenDataArray], + def llama_sampler_init_mirostat_v2( + seed: CInt, tau: CFloat, - eta: CFloat, - mu: Ptr[CFloat] - ): Token - - /// @details Selects the token with the highest probability. - /// Does not compute the token probabilities. Use llama_sample_softmax() instead. - def llama_sample_token_greedy( - ctx: Ctx, - candidates: Ptr[TokenDataArray] - ): Token - - /// @details Randomly selects a token from the candidates based on their probabilities. - def llama_sample_token(ctx: Ctx, candidates: Ptr[TokenDataArray]): Token - - /// @details Accepts the sampled token into the grammar - def llama_grammar_accept_token( - ctx: Ctx, - grammar: Grammar, - token: Token - ): Unit - - // Beam search - - def llama_beam_search( - ctx: Ctx, - callback: Ptr[(Ptr[Any], BeamsState) => Unit], - callback_data: Ptr[Any], - n_beams: SizeT, - n_past: CInt, - n_predict: CInt - ): Unit + eta: CFloat + ): Sampler + + def llama_sampler_init_grammar( + vocab: Vocab, + grammar_str: Ptr[CChar], + grammar_root: Ptr[CChar] + ): Sampler + + /// @details Lazy grammar sampler, introduced in https://github.com/ggerganov/llama.cpp/pull/9639 + /// @param trigger_words A list of words that will trigger the grammar sampler. This may be updated to a loose regex syntax (w/ ^) in a near future. + /// @param trigger_tokens A list of tokens that will trigger the grammar sampler. 
+ def llama_sampler_init_grammar_lazy( + vocab: Vocab, + grammar_str: Ptr[CChar], + grammar_root: Ptr[CChar], + trigger_words: Ptr[Ptr[CChar]], + num_trigger_words: SizeT, + trigger_tokens: Ptr[Token], + num_trigger_tokens: SizeT + ): Sampler + + /// NOTE: Avoid using on the full vocabulary as searching for repeated tokens can become slow. For example, apply top-k or top-p sampling first. + def llama_sampler_init_penalties( + penalty_last_n: CInt, + penalty_repeat: CFloat, + penalty_freq: CFloat, + penalty_present: CFloat + ): Sampler + + /// @details DRY sampler, designed by p-e-w, as described in: https://github.com/oobabooga/text-generation-webui/pull/5677, porting Koboldcpp implementation authored by pi6am: https://github.com/LostRuins/koboldcpp/pull/982 + def llama_sampler_init_dry( + vocab: Vocab, + n_ctx_train: CInt, + dry_multiplier: CFloat, + dry_base: CFloat, + dry_allowed_length: CInt, + dry_penalty_last_n: CInt, + seq_breakers: Ptr[Ptr[CChar]], + num_breakers: SizeT + ): Sampler + + def llama_sampler_init_logit_bias( + n_vocab: CInt, + n_logit_bias: CInt, + logit_bias: Ptr[LogitBias] + ): Sampler + + // this sampler is meant to be used for fill-in-the-middle infilling + // it's supposed to be used after top_k + top_p sampling + // + // 1. if the sum of the EOG probs times the number of candidates is higher than the sum of the other probs -> pick EOG + // 2. combine probs of tokens that have the same prefix + // + // example: + // + // - before: + // "hel": 0.5 + // "hell": 0.2 + // "hello": 0.1 + // "dummy": 0.1 + // + // - after: + // "hel": 0.8 + // "dummy": 0.1 + // + // 3. discard non-EOG tokens with low prob + // 4. if no tokens are left -> pick EOT + def llama_sampler_init_infill(vocab: Vocab): Sampler - // Performance and system information + // Returns the seed used by the sampler if applicable, LLAMA_DEFAULT_SEED otherwise + def llama_sampler_get_seed(smpl: Sampler): CInt - def llama_get_timings(ctx: Ctx): Timings + /// @details Sample and accept a token from the idx-th output of the last evaluation + // + // Shorthand for: + // const auto * logits = llama_get_logits_ith(ctx, idx); + // llama_token_data_array cur_p = { ... init from logits ... }; + // llama_sampler_apply(smpl, &cur_p); + // auto token = cur_p.data[cur_p.selected].id; + // llama_sampler_accept(smpl, token); + // return token; + // Returns the sampled token + def llama_sampler_sample(smpl: Sampler, ctx: Ctx, idx: CInt): Token + + // Model split + + /// @details Build a split GGUF final path for this chunk. + /// llama_split_path(split_path, sizeof(split_path), "/models/ggml-model-q4_0", 2, 4) => split_path = "/models/ggml-model-q4_0-00002-of-00004.gguf" + // Returns the split_path length. + def llama_split_path( + split_path: Ptr[CChar], + maxlen: SizeT, + path_prefix: Ptr[CChar], + split_no: CInt, + split_count: CInt + ): CInt - def llama_print_timings(ctx: Ctx): Unit - def llama_reset_timings(ctx: Ctx): Unit + /// @details Extract the path prefix from the split_path if and only if the split_no and split_count match. + /// llama_split_prefix(split_prefix, 64, "/models/ggml-model-q4_0-00002-of-00004.gguf", 2, 4) => split_prefix = "/models/ggml-model-q4_0" + // Returns the split_prefix length. 
+ def llama_split_prefix( + split_prefix: Ptr[CChar], + maxlen: SizeT, + split_path: Ptr[CChar], + split_no: CInt, + split_count: CInt + ): CInt + // Print system information def llama_print_system_info(): Ptr[CChar] diff --git a/src/main/scala/com/donderom/llm4s/Llm.scala b/src/main/scala/com/donderom/llm4s/Llm.scala index e73db9e..324a0e7 100644 --- a/src/main/scala/com/donderom/llm4s/Llm.scala +++ b/src/main/scala/com/donderom/llm4s/Llm.scala @@ -5,6 +5,7 @@ import java.nio.file.Path import scala.util.Try import fr.hammons.slinc.runtime.given +import fr.hammons.slinc.types.SizeT import fr.hammons.slinc.{FSet, Ptr, Scope, Slinc} final case class Logprob(token: String, value: Double) @@ -34,11 +35,16 @@ object Llm: val llm = createModel(model, params) def generate(prompt: String, params: LlmParams): Try[Usage] = - for ctx <- createContext(llm, params.context, false) + for + llm <- llm + ctx <- createContext(llm, params.context, false) + _ <- loadLora(llm, ctx, params.lora) yield SlincLlm(ctx).generate(prompt, params) def embeddings(prompt: String, params: ContextParams): Try[Array[Float]] = - for ctx <- createContext(llm, params, true) + for + llm <- llm + ctx <- createContext(llm, params, true) yield SlincLlm(ctx).embeddings(prompt, params.batch) def close(): Unit = @@ -46,20 +52,18 @@ object Llm: llama <- binding llm <- llm do - llama.llama_free_model(llm) + llama.llama_model_free(llm) llama.llama_backend_free() private def createModel( model: Path, params: ModelParams ): Try[Llama.Model] = - binding.foreach: llama => + binding.map: llama => llama.llama_backend_init() llama.llama_numa_init(params.numa) - - Scope.global: - val baseModel = binding.map: llama => - llama.llama_load_model_from_file( + Scope.confined: + llama.llama_model_load_from_file( path_model = Ptr.copy(model.toAbsolutePath.toString), params = llama.llama_model_default_params().copy( n_gpu_layers = params.gpuLayers, @@ -69,31 +73,14 @@ object Llm: ) ) - params.lora.adapter.fold(baseModel): loraAdapter => - val err = - for - llama <- binding - llm <- baseModel - loraBase = params.lora.base.fold(Slinc.getRuntime().Null): - base => Ptr.copy(base.toAbsolutePath.toString) - yield llama.llama_model_apply_lora_from_file( - model = llm, - path_lora = Ptr.copy(loraAdapter.toAbsolutePath.toString), - scale = params.lora.scale, - path_base_model = loraBase, - n_threads = params.lora.threads - ) - err.filter(_ == 0).flatMap(_ => baseModel) - private def createContext( - llm: Try[Llama.Model], + llm: Llama.Model, contextParams: ContextParams, embedding: Boolean ): Try[Llama.Ctx] = for llama <- binding - llm <- llm - ctx = llama.llama_new_context_with_model( + ctx = llama.llama_init_from_model( model = llm, params = llamaParams( llama.llama_context_default_params(), @@ -103,15 +90,34 @@ object Llm: ) if ctx != Slinc.getRuntime().Null yield ctx + private def loadLora( + llm: Llama.Model, + ctx: Llama.Ctx, + loraParams: Option[LoraParams] + ): Try[Unit] = + loraParams.fold(Try(())): params => + Scope.confined: + for + llama <- binding + adapter <- Try( + llama.llama_adapter_lora_init( + llm, + Ptr.copy(params.path.toAbsolutePath.toString) + ) + ) + if adapter != Slinc.getRuntime().Null + _ <- Try(llama.llama_set_adapter_lora(ctx, adapter, params.scale)) + yield () + private def llamaParams( defaultParams: Llama.ContextParams, params: ContextParams, embedding: Boolean ): Llama.ContextParams = defaultParams.copy( - seed = params.seed, n_ctx = params.size, - n_batch = params.batch.size, + n_batch = params.batch.logical, + n_ubatch = 
params.batch.physical, n_threads = params.threads, n_threads_batch = params.batch.threads, rope_scaling_type = params.rope.scalingType, diff --git a/src/main/scala/com/donderom/llm4s/Params.scala b/src/main/scala/com/donderom/llm4s/Params.scala index 83be14d..4f26010 100644 --- a/src/main/scala/com/donderom/llm4s/Params.scala +++ b/src/main/scala/com/donderom/llm4s/Params.scala @@ -7,9 +7,8 @@ import Llama.{NumaStrategy, RopeScalingType} object Default: val threads = Runtime.getRuntime.availableProcessors - val penalty: Penalty = Penalty() - val repeatLastTokens: Int = 64 val logprobs: Int = 0 + val seed: Int = 0xfffffff val temp: Float = .8f object Mirostat: val tau: Float = 5.0f @@ -17,19 +16,16 @@ object Default: val muCoef: Float = 2.0f final case class LoraParams( - adapter: Option[Path] = None, - base: Option[Path] = None, - scale: Float = 1.0f, - threads: Int = Default.threads + path: Path, + scale: Float = 1.0f ) final case class ModelParams( - gpuLayers: Int = 0, + gpuLayers: Int = -1, mainGpu: Int = 0, mmap: Boolean = true, mlock: Boolean = false, - numa: NumaStrategy = NumaStrategy.DISABLED, - lora: LoraParams = LoraParams() + numa: NumaStrategy = NumaStrategy.DISABLED ) final case class RopeParams( @@ -46,13 +42,16 @@ final case class YarnParams( origCtx: Int = 0 ) -final case class BatchParams(size: Int = 512, threads: Int = Default.threads) +final case class BatchParams( + logical: Int = 2048, + physical: Int = 512, + threads: Int = Default.threads +) final case class GroupAttention(factor: Int = 1, width: Int = 512) final case class ContextParams( - seed: Int = -1, - size: Int = 512, + size: Int = 4096, threads: Int = Default.threads, batch: BatchParams = BatchParams(), rope: RopeParams = RopeParams(), @@ -60,10 +59,23 @@ final case class ContextParams( ) final case class Penalty( - repeat: Float = 1.10f, + lastN: Int = 64, + repeat: Float = 1.0f, frequency: Float = .0f, - presence: Float = .0f, - penalizeNewLines: Boolean = true + presence: Float = .0f +) + +final case class Dry( + multiplier: Float = .0f, + base: Float = 1.75f, + allowedLength: Int = 2, + penaltyLastN: Int = -1, + seqBreakers: Seq[Char] = Seq[Char]('\n', ':', '"', '*') +) + +final case class Xtc( + probability: Float = .0f, + threshold: Float = 0.10f ) final case class Dynatemp( @@ -71,63 +83,50 @@ final case class Dynatemp( exponent: Float = 1.0f ) -enum Sampler: - case TOP_K, TAIL_FREE, TYPICAL, TOP_P, MIN_P, TEMPERATURE - -enum Sampling( - val penalty: Penalty, - val repeatLastTokens: Int, - val logprobs: Int -): - case Greedy( - override val penalty: Penalty = Default.penalty, - override val repeatLastTokens: Int = Default.repeatLastTokens, - override val logprobs: Int = Default.logprobs - ) extends Sampling(penalty, repeatLastTokens, logprobs) - - case MirostatV1( - override val penalty: Penalty = Default.penalty, - override val repeatLastTokens: Int = Default.repeatLastTokens, - override val logprobs: Int = Default.logprobs, +enum SamplerType: + case PENALTIES, DRY, TOP_K, TYPICAL_P, TOP_P, MIN_P, XTC, TEMPERATURE + +enum Sampling: + case Dist( + greedy: Boolean = false, + samplers: List[SamplerType] = SamplerType.values.toList, + seed: Int = Default.seed, + logitBias: Map[Int, Float] = Map(), + penalty: Penalty = Penalty(), + dry: Dry = Dry(), + minKeep: Short = 0, + topK: Int = 40, + typicalP: Float = 1.0f, + topP: Float = 0.95f, + minP: Float = 0.05f, + xtc: Xtc = Xtc(), temp: Float = Default.temp, - tau: Float = Default.Mirostat.tau, - eta: Float = Default.Mirostat.eta, - m: Int = 100, - muCoef: 
Float = Default.Mirostat.muCoef - ) extends Sampling(penalty, repeatLastTokens, logprobs) - - case MirostatV2( - override val penalty: Penalty = Default.penalty, - override val repeatLastTokens: Int = Default.repeatLastTokens, - override val logprobs: Int = Default.logprobs, + dynatemp: Dynatemp = Dynatemp() + ) + + case Mirostat1( + seed: Int = Default.seed, temp: Float = Default.temp, tau: Float = Default.Mirostat.tau, eta: Float = Default.Mirostat.eta, - muCoef: Float = Default.Mirostat.muCoef - ) extends Sampling(penalty, repeatLastTokens, logprobs) + m: Int = 100 + ) - case Random( - override val penalty: Penalty = Default.penalty, - override val repeatLastTokens: Int = Default.repeatLastTokens, - override val logprobs: Int = Default.logprobs, + case Mirostat2( + seed: Int = Default.seed, temp: Float = Default.temp, - topK: Option[Int] = Some(40), - tfsZ: Float = 1.0f, - typicalP: Float = 1.0f, - topP: Float = .95f, - minP: Float = .05f, - dynatemp: Dynatemp = Dynatemp(), - samplers: List[Sampler] = Sampler.values.toList - ) extends Sampling(penalty, repeatLastTokens, logprobs) + tau: Float = Default.Mirostat.tau, + eta: Float = Default.Mirostat.eta + ) final case class LlmParams( context: ContextParams = ContextParams(), - sampling: Sampling = Sampling.Random(), + sampling: Sampling = Sampling.Dist(), predictTokens: Int = -1, keepTokens: Int = 0, - logitBias: Map[Int, Float] = Map(), suffix: Option[String] = None, echo: Boolean = true, stopSeqs: List[String] = Nil, - groupAttention: GroupAttention = GroupAttention() + groupAttention: GroupAttention = GroupAttention(), + lora: Option[LoraParams] = None ) diff --git a/src/main/scala/com/donderom/llm4s/SlincLlm.scala b/src/main/scala/com/donderom/llm4s/SlincLlm.scala index fe66aa2..aab06f3 100644 --- a/src/main/scala/com/donderom/llm4s/SlincLlm.scala +++ b/src/main/scala/com/donderom/llm4s/SlincLlm.scala @@ -12,6 +12,7 @@ import fr.hammons.slinc.{FSet, Ptr, Scope} import State.* private class SlincLlm private[llm4s] (private[llm4s] val ctx: Llama.Ctx): + // Logprobs are None until a better solution is implemented final case class Sample(id: Int, prob: Option[Probability]) lazy val llama = FSet.instance[Llama] @@ -20,6 +21,7 @@ private class SlincLlm private[llm4s] (private[llm4s] val ctx: Llama.Ctx): lazy val decoder = StandardCharsets.UTF_8.newDecoder def generate(prompt: String, params: LlmParams): Usage = + val sampler = createSampler(params.sampling) val lastTokens = new ArrayDeque[Int](ctxSize) val stop = Stop.Acc[Token](params.stopSeqs) @@ -87,18 +89,13 @@ private class SlincLlm private[llm4s] (private[llm4s] val ctx: Llama.Ctx): evaluate(ids, past, params.context.batch) end eval - def repeatTokens(): Array[Int] = - val repeatLastTokens = - if params.sampling.repeatLastTokens < 0 then ctxSize - else params.sampling.repeatLastTokens - val lastRepeat = math.min(lastTokens.size, repeatLastTokens) - val padding = Array.fill(repeatLastTokens - lastRepeat)(0) - padding ++ lastTokens.takeRight(lastRepeat).toArray - def tokens(state: State[Token]): LazyList[Token] = if !state.remaining.none then val newPast = eval(state.evaluated) - val smpl = sample(repeatTokens(), params.sampling, params.logitBias) + + val tokenId = llama.llama_sampler_sample(sampler, ctx, -1) + llama.llama_sampler_accept(sampler, tokenId) + val smpl = Sample(tokenId, None) if lastTokens.size == ctxSize then lastTokens.remove(0) lastTokens.append(smpl.id) @@ -119,8 +116,8 @@ private class SlincLlm private[llm4s] (private[llm4s] val ctx: Llama.Ctx): LazyList.from(chunk) 
#::: gen(st) case stop.Action.Stop(chunk) => LazyList.from(params.suffix.fold(chunk)(chunk :+ _.token)) - else close(state.stop.deferred(params.suffix)) - else close(state.stop.deferred(params.suffix)) + else close(state.stop.deferred(params.suffix), sampler) + else close(state.stop.deferred(params.suffix), sampler) end tokens val ids = encode(prompt) @@ -143,21 +140,18 @@ private class SlincLlm private[llm4s] (private[llm4s] val ctx: Llama.Ctx): def embeddings(prompt: String, params: BatchParams): Array[Float] = val ids = encode(prompt) val _ = evaluate(ids, Evaluated.none, params) - val size = llama.llama_n_embd(model) + val size = llama.llama_model_n_embd(model) val embeddings = llama.llama_get_embeddings(ctx).asArray(size).unsafeArray llama.llama_free(ctx) embeddings lazy val ctxSize: Int = llama.llama_n_ctx(ctx) - lazy val vocabSize: Int = llama.llama_n_vocab(model) - lazy val addBosToken: Int = llama.llama_add_bos_token(model) - lazy val addBos: Boolean = - if addBosToken != -1 then addBosToken != 0 - else llama.llama_vocab_type(model) == Llama.VocabType.SPM - lazy val newLineToken: Int = llama.llama_token_nl(model) + lazy val vocab: Llama.Vocab = llama.llama_model_get_vocab(model) + lazy val vocabSize: Int = llama.llama_vocab_n_tokens(vocab) + lazy val addBos: Boolean = llama.llama_vocab_get_add_bos(vocab) def keepGenerating(token: Int): Boolean = - !llama.llama_token_is_eog(model, token) + !llama.llama_vocab_is_eog(vocab, token) def encode(text: String): Array[Int] = val bos = if addBos then 1 else 0 @@ -166,7 +160,7 @@ private class SlincLlm private[llm4s] (private[llm4s] val ctx: Llama.Ctx): Scope.confined: val tokens = Ptr.copy(res) val numTokens = llama.llama_tokenize( - model = model, + vocab = vocab, text = Ptr.copy(bytes), text_len = bytes.size, tokens = tokens, @@ -188,10 +182,11 @@ private class SlincLlm private[llm4s] (private[llm4s] val ctx: Llama.Ctx): Scope.confined: val tokens = Ptr.copy(res) val numTokens = llama.llama_token_to_piece( - model = model, + vocab = vocab, token = token, buf = tokens, length = res.size, + lstrip = 0, special = false ) if numTokens < 0 then decode(token, pending, math.abs(numTokens)) @@ -203,160 +198,118 @@ private class SlincLlm private[llm4s] (private[llm4s] val ctx: Llama.Ctx): def evaluate( ids: Array[Int], past: Evaluated, - params: BatchParams + batch: BatchParams ): Evaluated = if ids.isEmpty then past else - val batches = ids.grouped(params.size) + val batches = ids.grouped(batch.logical) Scope.confined: for (batch, n) <- batches.zipWithIndex do llama.llama_decode( ctx = ctx, batch = llama.llama_batch_get_one( tokens = Ptr.copy(batch), - n_tokens = batch.size, - pos_0 = (past + n * params.size).toInt, - seq_id = 0 + n_tokens = batch.size ) ) past + ids.size - def sample( - repeatTokens: Array[Int], - sampling: Sampling, - logitBias: Map[Int, Float], - idx: Int = 0 - ): Sample = - import Sampling.* + def createSampler(params: Sampling): Llama.Sampler = + val sparams = llama.llama_sampler_chain_default_params() + val chain = llama.llama_sampler_chain_init(sparams) + val add = llama.llama_sampler_chain_add(chain, _) + params match + case config: Sampling.Dist => + Scope.confined: + if !config.logitBias.isEmpty then + val logitBias = config.logitBias.map(Llama.LogitBias(_, _)) + add( + llama.llama_sampler_init_logit_bias( + vocabSize, + config.logitBias.size, + Ptr.copy(logitBias.toArray) + ) + ) - Scope.confined: - val logits = llama.llama_get_logits_ith(ctx, idx).asArray(vocabSize) - .unsafeArray - logitBias.foreach((token, bias) => 
logits(token) = bias) + for sampler <- config.samplers do + val minKeep = SizeT(config.minKeep) + sampler match + case SamplerType.DRY => + val seqBreakers = config.dry.seqBreakers.map(_.toByte) + add( + llama.llama_sampler_init_dry( + llama.llama_model_get_vocab(model), + llama.llama_model_n_ctx_train(model), + config.dry.multiplier, + config.dry.base, + config.dry.allowedLength, + config.dry.penaltyLastN, + Ptr.copy(Ptr.copy(seqBreakers.toArray)), + SizeT(seqBreakers.size.toShort) + ) + ) - val tokenData = Array.tabulate[Llama.TokenData](vocabSize): tokenId => - Llama.TokenData(id = tokenId, logit = logits(tokenId), p = .0) + case SamplerType.TOP_K => + add(llama.llama_sampler_init_top_k(config.topK)) - val data = Ptr.copy(tokenData) + case SamplerType.TOP_P => + add(llama.llama_sampler_init_top_p(config.topP, minKeep)) - val candidates = Ptr.copy( - Llama.TokenDataArray( - data = data, - size = SizeT(tokenData.size.toShort), - sorted = false - ) - ) + case SamplerType.MIN_P => + add(llama.llama_sampler_init_min_p(config.minP, minKeep)) - val repeatLastTokens = Ptr.copy(repeatTokens) - val repeatTokensSize = SizeT(repeatTokens.size.toShort) - llama.llama_sample_repetition_penalties( - ctx = ctx, - candidates = candidates, - last_tokens = repeatLastTokens, - penalty_last_n = repeatTokensSize, - penalty_repeat = sampling.penalty.repeat, - penalty_freq = sampling.penalty.frequency, - penalty_present = sampling.penalty.presence - ) + case SamplerType.XTC => + add( + llama.llama_sampler_init_xtc( + config.xtc.probability, + config.xtc.threshold, + minKeep, + config.seed + ) + ) - if !sampling.penalty.penalizeNewLines then - val newLineLogit = logits(newLineToken) - val newLineIndex = tokenData.indexWhere(_.id == newLineToken) - if newLineIndex != -1 then - !data(newLineIndex) = (!data(newLineIndex)).copy(logit = newLineLogit) - - val tokenId = sampling match - case Greedy(_, _, logprobs) => - if logprobs > 0 then - llama.llama_sample_softmax(ctx, candidates) - (!data).id - else llama.llama_sample_token_greedy(ctx, candidates) - - case MirostatV1(_, _, _, temp, tau, eta, m, muCoef) => - llama.llama_sample_temp(ctx, candidates, temp) - llama.llama_sample_token_mirostat( - ctx = ctx, - candidates = candidates, - tau = tau, - eta = eta, - m = m, - mu = Ptr.copy(muCoef * tau) - ) + case SamplerType.TYPICAL_P => + add( + llama.llama_sampler_init_typical( + config.typicalP, + minKeep + ) + ) - case MirostatV2(_, _, _, temp, tau, eta, muCoef) => - llama.llama_sample_temp(ctx, candidates, temp) - llama.llama_sample_token_mirostat_v2( - ctx = ctx, - candidates = candidates, - tau = tau, - eta = eta, - mu = Ptr.copy(muCoef * tau) - ) + case SamplerType.TEMPERATURE => + add( + llama.llama_sampler_init_temp_ext( + config.temp, + config.dynatemp.range, + config.dynatemp.exponent + ) + ) - case Random( - _, - _, - logprobs, - temp, - topK, - tfsZ, - typicalP, - topP, - minP, - dynatemp, - samplers - ) => - val topk = topK.filter(_ > 0).getOrElse(vocabSize) - val minKeep = SizeT(math.max(1, logprobs).toShort) - samplers.foreach: - case Sampler.TOP_K => - llama.llama_sample_top_k(ctx, candidates, topk, minKeep) - case Sampler.TAIL_FREE => - llama.llama_sample_tail_free(ctx, candidates, tfsZ, minKeep) - case Sampler.TYPICAL => - llama.llama_sample_typical(ctx, candidates, typicalP, minKeep) - case Sampler.TOP_P => - llama.llama_sample_top_p(ctx, candidates, topP, minKeep) - case Sampler.MIN_P => - llama.llama_sample_min_p(ctx, candidates, minP, minKeep) - case Sampler.TEMPERATURE => - if dynatemp.range > 0 
then - val dynatemp_min = math.max(.0f, temp - dynatemp.range) - val dynatemp_max = math.max(.0f, temp + dynatemp.range) - llama.llama_sample_entropy( - ctx = ctx, - candidates_p = candidates, - min_temp = dynatemp_min, - max_temp = dynatemp_max, - exponent_val = dynatemp.exponent + case SamplerType.PENALTIES => + add( + llama.llama_sampler_init_penalties( + config.penalty.lastN, + config.penalty.repeat, + config.penalty.frequency, + config.penalty.presence + ) ) - else llama.llama_sample_temp(ctx, candidates, temp) - llama.llama_sample_token(ctx, candidates) - - Sample(tokenId, logprob(tokenId, data, sampling.logprobs)) - end sample - - def logprob( - id: Int, - data: Ptr[Llama.TokenData], - num: Int - ): Option[Probability] = - def tokenValue(tokenId: Int): String = - decode(tokenId) match - case token: String => token - case bytes: Array[Byte] => - bytes.map(b => s"\\\\x${String.format("%02x", b)}").mkString - - if num > 0 then - val log = (td: Llama.TokenData) => math.log(td.p) - val cap = math.min(num, vocabSize) - val logprobs = data.asArray(cap).unsafeArray.map: td => - Logprob(tokenValue(td.id), log(td)) - val current = LazyList.range(0, vocabSize).map(!data(_)).find(_.id == id) - val logprob = Logprob(tokenValue(id), current.fold(.0)(log)) - Some(Probability(logprob, logprobs)) - else None - - def close(suffix: Vector[Token]): LazyList[Token] = + + if config.greedy then add(llama.llama_sampler_init_greedy()) + else add(llama.llama_sampler_init_dist(config.seed)) + + case Sampling.Mirostat1(seed, temp, tau, eta, m) => + add(llama.llama_sampler_init_temp(temp)) + add(llama.llama_sampler_init_mirostat(vocabSize, seed, tau, eta, m)) + + case Sampling.Mirostat2(seed, temp, tau, eta) => + add(llama.llama_sampler_init_temp(temp)) + add(llama.llama_sampler_init_mirostat_v2(seed, tau, eta)) + + chain + end createSampler + + def close(suffix: Vector[Token], sampler: Llama.Sampler): LazyList[Token] = + llama.llama_sampler_free(sampler) llama.llama_free(ctx) LazyList.from(suffix)
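A minimal usage sketch of the parameter surface introduced above. It only relies on case classes and enums defined in Params.scala in this diff (ContextParams, BatchParams, Sampling.Dist, SamplerType, Penalty, LlmParams, LoraParams); the Llm factory, the model/adapter paths, and the shape of the generation result are assumptions based on the pre-existing llm4s API and are not defined by this change.

import java.nio.file.Paths
import com.donderom.llm4s.*

@main def samplingSketch(): Unit =
  // Context settings: n_batch and n_ubatch are now configured separately via BatchParams.
  val context = ContextParams(
    size = 4096,
    batch = BatchParams(logical = 2048, physical = 512)
  )

  // The b4599 sampler chain is described declaratively and assembled by createSampler:
  // each SamplerType is appended to a llama_sampler_chain, followed by either the
  // greedy sampler or the seeded distribution sampler.
  val sampling = Sampling.Dist(
    samplers = List(
      SamplerType.PENALTIES,
      SamplerType.TOP_K,
      SamplerType.TOP_P,
      SamplerType.TEMPERATURE
    ),
    penalty = Penalty(lastN = 64, repeat = 1.1f),
    topK = 40,
    topP = 0.95f,
    temp = 0.8f
  )

  val params = LlmParams(
    context = context,
    sampling = sampling,
    // LoRA is now attached to the context per generation call
    // (llama_adapter_lora_init + llama_set_adapter_lora) instead of patching model weights.
    // The adapter path below is a placeholder.
    lora = Some(LoraParams(Paths.get("adapters/example.gguf"), scale = 0.75f))
  )

  // Hypothetical wiring; Llm construction and result handling follow the existing
  // llm4s API, which is unchanged context not shown in this diff:
  // val llm = Llm(Paths.get("models/example.gguf"), ModelParams())
  // llm.generate("Deep learning is", params)
  // llm.close()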