diff --git a/serving.proto b/serving.proto
index b2d8280..ef4f995 100644
--- a/serving.proto
+++ b/serving.proto
@@ -69,6 +69,45 @@ message SummarizationRequest {
   // https://docs.vectara.com/docs/prompts/vectara-prompt-engine
   string prompt_text = 200;
 
+  // Vectara manages both system and user roles and prompts for the generative
+  // LLM out of the box by default. However, Scale customers can override the
+  // prompt_text via this variable. The prompt_text is in the form of an
+  // Apache Velocity template. For more details on how to configure the
+  // prompt_text, see the long-form documentation at
+  // https://docs.vectara.com/docs/prompts/vectara-prompt-engine
+  // See https://vectara.com/pricing/ for more details on becoming a Scale customer.
+  string prompt_text = 200;
+
+  // Debugging the generative prompt is currently a Scale-only feature.
+  // See https://vectara.com/pricing/ for more details on becoming a Scale customer.
+  bool debug = 205;
+
+  // Controls the length of the summary.
+  // This is a rough estimate and not a hard limit: the end summary can be longer or shorter
+  // than this value. This is currently a Scale-only feature.
+  // See https://vectara.com/pricing/ for more details on becoming a Scale customer.
+  uint32 response_chars = 210;
+
+  // Parameters for the summarizer model. These are currently a Scale-only feature.
+  // See https://vectara.com/pricing/ for more details on becoming a Scale customer.
+  // WARNING: This is an experimental feature, and breakable at any point with virtually no
+  // notice. It is meant for experimentation to converge on optimal parameters that can then
+  // be set in the prompt definitions.
+  message ModelParams {
+    optional uint32 max_tokens = 5;
+    // The sampling temperature to use. Higher values make the summary more random, while lower
+    // values make it more focused and deterministic.
+    optional float temperature = 10;
+    // Higher values penalize new tokens based on their existing frequency in the text so far,
+    // decreasing the model's likelihood to repeat the same line verbatim.
+    optional float frequency_penalty = 15;
+    // Higher values penalize new tokens based on whether they appear in the text so far,
+    // increasing the model's likelihood to talk about new topics.
+    optional float presence_penalty = 20;
+  }
+  ModelParams model_params = 215;
+
+  // If present, the query will be treated as a chat query.
   // When using chat, only one summarization request is allowed per query.
   ChatRequest chat = 225;
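
For reference, the new fields added above might be populated as in the following SummarizationRequest fragment (proto text format). This is an illustrative sketch only: the values are arbitrary assumptions chosen for demonstration, not defaults, and fields of SummarizationRequest defined outside this hunk are omitted.

  debug: true                # Scale-only: return generative-prompt debug output
  response_chars: 300        # rough target length for the summary; not a hard limit
  model_params {
    max_tokens: 512
    temperature: 0.2         # lower values: more focused and deterministic
    frequency_penalty: 0.0   # raise to discourage verbatim repetition
    presence_penalty: 0.0    # raise to encourage new topics
  }

A low temperature with zero penalties keeps the summary close to the retrieved results; raising any of these values trades determinism for variety, per the field comments above.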