# Config for web-rwkv-axum
# It specifies which model to use, which tokenizer vocabulary to use, etc.
[axum]
# Directory where dumped model states are stored.
state_dump = "states"
[model]
# Path to the model file.
# Must be a safetensors (.st) file, not a .pth.
path = "assets/RWKV-5-World-7B-v2-OnlyForTest_49%_trained-20231114-ctx4096.st"
# Max batch count is the maximum number of model states kept
# as a pool for batch inference. A larger pool reduces the need
# for buffer swapping at the cost of increased GPU memory
# usage. Default: 32.
max_batch_count = 32
# Max state size is the relative size of the model states
# initialized by web-rwkv. Decrease this if the model crashes
# while loading with a panic message like:
# Buffer size 2214592512 is greater than the maximum buffer size (268435456)
max_state_size = 2
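# For scale: the panic above reports a requested buffer of
# 2214592512 bytes (2112 MiB) against an adapter limit of
# 268435456 bytes (256 MiB); lowering max_state_size shrinks
# the requested buffer below that limit.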
# Max concurrency is the maximum number of states that can be
# inferred at the same time. A larger value reduces per-request
# response performance but increases GPU utilization up to a
# point. Default: 8.
max_concurrency = 8
# Max chunk count is the maximum number of tokens in a chunk.
# Larger chunks increase inference speed for long prompts at
# the cost of resource consumption. If the chunk size is too
# large, performance may degrade due to bandwidth limits.
max_chunk_count = 128
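# As a worked example (assuming prompts are split into chunks of
# at most max_chunk_count tokens): with the value above, a
# 1000-token prompt is fed in ceil(1000 / 128) = 8 chunks.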
# Adapter preference. Can be HighPerformance or LowPower.
# If omitted, the adapter index below is used instead.
preference = "HighPerformance"
# Adapter index to select, usually 0.
# There might be a tool to check for adapters later.
# adapter = 0
# Per-layer quantization, given as a string of flags.
# If omitted, no quantization is done.
# Each character in the string sets one layer's quantization:
# 1 = int8, 2 = nf4, anything else = none (fp16).
# The example below quantizes the first 28 layers with nf4 and
# leaves the last 4 layers in fp16.
quantization = "22222222222222222222222222220000"
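# A hypothetical alternative for the same 32-layer layout:
# int8 on every layer instead of nf4.
# quantization = "11111111111111111111111111111111"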
# Max tokens that can be inferred in one infer request.
# This sets a hard cap on inference, so requests with
# potentially unending stop conditions still return.
max_infer_tokens = 256
# Add LoRAs to the model. Multiple LoRAs can be specified,
# and each LoRA can have its own blends; see the sketch below.
# lora_configs = [{ path = "path", blends = [{ pattern = '.*', alpha = 0.1 }] }]
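# A sketch of two LoRAs with different blend strengths, following
# the shape of the example above (paths and alphas are placeholders):
# lora_configs = [
#     { path = "assets/lora-a.st", blends = [{ pattern = '.*', alpha = 0.1 }] },
#     { path = "assets/lora-b.st", blends = [{ pattern = '.*', alpha = 0.5 }] },
# ]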
[tokenizer]
# Path to the vocab JSON.
# Refer to https://github.com/cryscan/web-rwkv/blob/main/assets/rwkv_vocab_v20230424.json
path = "assets/rwkv_vocab_v20230424.json"