feat(vllm): expose 'load_format' (#3943)

Signed-off-by: Ettore Di Giacinto <[email protected]>
mudler · Oct 23, 2024 · ae1ec4e · ae1ec4e
1 parent c75ecfa
commit ae1ec4e
Show file tree

Hide file tree

Showing 3 changed files with 4 additions and 0 deletions.
diff --git a/backend/python/vllm/backend.py b/backend/python/vllm/backend.py
@@ -95,6 +95,8 @@ async def LoadModel(self, request, context):
 
         if request.Quantization != "":
             engine_args.quantization = request.Quantization
+        if request.LoadFormat != "":
+            engine_args.load_format = request.LoadFormat
         if request.GPUMemoryUtilization != 0:
             engine_args.gpu_memory_utilization = request.GPUMemoryUtilization
         if request.TrustRemoteCode:

diff --git a/core/backend/options.go b/core/backend/options.go
@@ -139,6 +139,7 @@ func grpcModelOpts(c config.BackendConfig) *pb.ModelOptions {
 		DraftModel:           c.DraftModel,
 		AudioPath:            c.VallE.AudioPath,
 		Quantization:         c.Quantization,
+		LoadFormat:           c.LoadFormat,
 		GPUMemoryUtilization: c.GPUMemoryUtilization,
 		TrustRemoteCode:      c.TrustRemoteCode,
 		EnforceEager:         c.EnforceEager,

diff --git a/core/config/backend_config.go b/core/config/backend_config.go
@@ -143,6 +143,7 @@ type LLMConfig struct {
 	DraftModel           string  `yaml:"draft_model"`
 	NDraft               int32   `yaml:"n_draft"`
 	Quantization         string  `yaml:"quantization"`
+	LoadFormat           string  `yaml:"load_format"`
 	GPUMemoryUtilization float32 `yaml:"gpu_memory_utilization"` // vLLM
 	TrustRemoteCode      bool    `yaml:"trust_remote_code"`      // vLLM
 	EnforceEager         bool    `yaml:"enforce_eager"`          // vLLM