Back to DB-GPT

LlamaServerParameters Configuration

docs/docs/config-reference/llm/llama_cpp_adapter_llamaserverparameters_421f40.mdx

0.8.1 · 9.2 KB
Original Source

import { ConfigDetail } from "@site/src/components/mdx/ConfigDetail";

<ConfigDetail config={{ "name": "LlamaServerParameters", "description": "LlamaServerParameters(name: str, provider: str = 'llama.cpp.server', verbose: Optional[bool] = False, concurrency: Optional[int] = 20, backend: Optional[str] = None, prompt_template: Optional[str] = None, context_length: Optional[int] = None, reasoning_model: Optional[bool] = None, path: Optional[str] = None, model_hf_repo: Optional[str] = None, model_hf_file: Optional[str] = None, device: Optional[str] = None, server_bin_path: Optional[str] = None, server_host: str = '127.0.0.1', server_port: int = 0, temperature: float = 0.8, seed: int = 42, debug: bool = False, model_url: Optional[str] = None, model_draft: Optional[str] = None, threads: Optional[int] = None, n_gpu_layers: Optional[int] = None, batch_size: Optional[int] = None, ubatch_size: Optional[int] = None, ctx_size: Optional[int] = None, grp_attn_n: Optional[int] = None, grp_attn_w: Optional[int] = None, n_predict: Optional[int] = None, slot_save_path: Optional[str] = None, n_slots: Optional[int] = None, cont_batching: bool = False, embedding: bool = False, reranking: bool = False, metrics: bool = False, slots: bool = False, draft: Optional[int] = None, draft_max: Optional[int] = None, draft_min: Optional[int] = None, api_key: Optional[str] = None, lora_files: List[str] = <factory>, no_context_shift: bool = False, no_webui: Optional[bool] = None, startup_timeout: Optional[int] = None)", "documentationUrl": "", "parameters": [ { "name": "name", "type": "string", "required": true, "description": "The name of the model." }, { "name": "path", "type": "string", "required": false, "description": "Local model file path" }, { "name": "backend", "type": "string", "required": false, "description": "The real model name to pass to the provider, default is None. If backend is None, use name as the real model name." }, { "name": "device", "type": "string", "required": false, "description": "Device to run model. If None, the device is automatically determined" },
{ "name": "provider", "type": "string", "required": false, "description": "The provider of the model. If model is deployed in local, this is the inference type. If model is deployed in third-party service, this is platform name('proxy/<platform>')", "defaultValue": "llama.cpp.server" }, { "name": "verbose", "type": "boolean", "required": false, "description": "Show verbose output.", "defaultValue": "False" }, { "name": "concurrency", "type": "integer", "required": false, "description": "Model concurrency limit", "defaultValue": "20" }, { "name": "prompt_template", "type": "string", "required": false, "description": "Prompt template. If None, the prompt template is automatically determined from model. Just for local deployment." }, { "name": "context_length", "type": "integer", "required": false, "description": "The context length of the model. If None, it is automatically determined from model." }, { "name": "reasoning_model", "type": "boolean", "required": false, "description": "Whether the model is a reasoning model. If None, it is automatically determined from model." }, { "name": "model_hf_repo", "type": "string", "required": false, "description": "Hugging Face repository for model download" }, { "name": "model_hf_file", "type": "string", "required": false, "description": "Model file name in the Hugging Face repository" }, { "name": "server_bin_path", "type": "string", "required": false, "description": "Path to the server binary executable" }, { "name": "server_host", "type": "string", "required": false, "description": "Host address to bind the server", "defaultValue": "127.0.0.1" }, { "name": "server_port", "type": "integer", "required": false, "description": "Port to bind the server. 0 for random available port", "defaultValue": "0" },
{ "name": "temperature", "type": "number", "required": false, "description": "Sampling temperature for text generation", "defaultValue": "0.8" }, { "name": "seed", "type": "integer", "required": false, "description": "Random seed for reproducibility", "defaultValue": "42" }, { "name": "debug", "type": "boolean", "required": false, "description": "Enable debug mode", "defaultValue": "False" }, { "name": "model_url", "type": "string", "required": false, "description": "Model download URL (env: LLAMA_ARG_MODEL_URL)" }, { "name": "model_draft", "type": "string", "required": false, "description": "Draft model file path" }, { "name": "threads", "type": "integer", "required": false, "description": "Number of threads to use during generation (default: -1) (env: LLAMA_ARG_THREADS)" }, { "name": "n_gpu_layers", "type": "integer", "required": false, "description": "Number of layers to store in VRAM (env: LLAMA_ARG_N_GPU_LAYERS), set 1000000000 to use all layers" }, { "name": "batch_size", "type": "integer", "required": false, "description": "Logical maximum batch size (default: 2048) (env: LLAMA_ARG_BATCH)" }, { "name": "ubatch_size", "type": "integer", "required": false, "description": "Physical maximum batch size (default: 512) (env: LLAMA_ARG_UBATCH)" }, { "name": "ctx_size", "type": "integer", "required": false, "description": "Size of the prompt context (default: 4096, 0 = loaded from model) (env: LLAMA_ARG_CTX_SIZE)" }, { "name": "grp_attn_n", "type": "integer", "required": false, "description": "Group-attention factor (default: 1)" }, { "name": "grp_attn_w", "type": "integer", "required": false, "description": "Group-attention width (default: 512)" }, { "name": "n_predict", "type": "integer", "required": false, "description": "Number of tokens to predict (default: -1, -1 = infinity, -2 = until context filled) (env: LLAMA_ARG_N_PREDICT)" }, { "name": "slot_save_path", "type": "string", "required": false,
"description": "Path to save slot kv cache (default: disabled)" }, { "name": "n_slots", "type": "integer", "required": false, "description": "Number of slots for KV cache" }, { "name": "cont_batching", "type": "boolean", "required": false, "description": "Enable continuous batching (a.k.a dynamic batching)", "defaultValue": "False" }, { "name": "embedding", "type": "boolean", "required": false, "description": "Restrict to only support embedding use case; use only with dedicated embedding models (env: LLAMA_ARG_EMBEDDINGS)", "defaultValue": "False" }, { "name": "reranking", "type": "boolean", "required": false, "description": "Enable reranking endpoint on server (env: LLAMA_ARG_RERANKING)", "defaultValue": "False" }, { "name": "metrics", "type": "boolean", "required": false, "description": "Enable prometheus compatible metrics endpoint (env: LLAMA_ARG_ENDPOINT_METRICS)", "defaultValue": "False" }, { "name": "slots", "type": "boolean", "required": false, "description": "Enable slots monitoring endpoint (env: LLAMA_ARG_ENDPOINT_SLOTS)", "defaultValue": "False" }, { "name": "draft", "type": "integer", "required": false, "description": "Number of tokens to draft for speculative decoding (default: 16) (env: LLAMA_ARG_DRAFT_MAX)" }, { "name": "draft_max", "type": "integer", "required": false, "description": "Same as draft" }, { "name": "draft_min", "type": "integer", "required": false, "description": "Minimum number of draft tokens to use for speculative decoding (default: 5)" }, { "name": "api_key", "type": "string", "required": false, "description": "API key to use for authentication (env: LLAMA_API_KEY)" }, { "name": "lora_files", "type": "string", "required": false, "description": "Path to LoRA adapter (can be repeated to use multiple adapters)", "defaultValue": "[]" }, { "name": "no_context_shift", "type": "boolean", "required": false, "description": "Disables context shift on infinite text generation", "defaultValue": "False" }, { "name": "no_webui", "type": 
"boolean", "required": false, "description": "Disable web UI" }, { "name": "startup_timeout", "type": "integer", "required": false, "description": "Server startup timeout in seconds" } ] }} />