Upload 6 files

- .gitignore +7 -0
- Dockerfile +50 -0
- LICENSE +21 -0
- README.md +405 -10
- docker-bake.hcl +26 -0
- worker-config.json +1514 -0
.gitignore
ADDED

runpod.toml
*.pyc
.env
test/*
vllm-base/vllm-*
.DS_Store
Dockerfile
ADDED

FROM nvidia/cuda:12.1.0-base-ubuntu22.04

RUN apt-get update -y \
    && apt-get install -y python3-pip

RUN ldconfig /usr/local/cuda-12.1/compat/

# Install Python dependencies
COPY builder/requirements.txt /requirements.txt
RUN --mount=type=cache,target=/root/.cache/pip \
    python3 -m pip install --upgrade pip && \
    python3 -m pip install --upgrade -r /requirements.txt

# Install vLLM (back to pip installs now that the issues requiring a forked build are fixed and image-size optimization matters less with layer caching) and FlashInfer
RUN python3 -m pip install vllm==0.10.0 && \
    python3 -m pip install flashinfer -i https://flashinfer.ai/whl/cu121/torch2.3

# Setup for Option 2: Building the Image with the Model included
ARG MODEL_NAME=""
ARG TOKENIZER_NAME=""
ARG BASE_PATH="/runpod-volume"
ARG QUANTIZATION=""
ARG MODEL_REVISION=""
ARG TOKENIZER_REVISION=""

ENV MODEL_NAME=$MODEL_NAME \
    MODEL_REVISION=$MODEL_REVISION \
    TOKENIZER_NAME=$TOKENIZER_NAME \
    TOKENIZER_REVISION=$TOKENIZER_REVISION \
    BASE_PATH=$BASE_PATH \
    QUANTIZATION=$QUANTIZATION \
    HF_DATASETS_CACHE="${BASE_PATH}/huggingface-cache/datasets" \
    HUGGINGFACE_HUB_CACHE="${BASE_PATH}/huggingface-cache/hub" \
    HF_HOME="${BASE_PATH}/huggingface-cache/hub" \
    HF_HUB_ENABLE_HF_TRANSFER=0

ENV PYTHONPATH="/:/vllm-workspace"

COPY src /src
RUN --mount=type=secret,id=HF_TOKEN,required=false \
    if [ -f /run/secrets/HF_TOKEN ]; then \
        export HF_TOKEN=$(cat /run/secrets/HF_TOKEN); \
    fi && \
    if [ -n "$MODEL_NAME" ]; then \
        python3 /src/download_model.py; \
    fi

# Start the handler
CMD ["python3", "/src/handler.py"]
LICENSE
ADDED

MIT License

Copyright (c) 2025 Runpod

Permission is hereby granted, free of charge, to any person obtaining a copy
of this software and associated documentation files (the "Software"), to deal
in the Software without restriction, including without limitation the rights
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
copies of the Software, and to permit persons to whom the Software is
furnished to do so, subject to the following conditions:

The above copyright notice and this permission notice shall be included in all
copies or substantial portions of the Software.

THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
SOFTWARE.
README.md
CHANGED

<div align="center">

# OpenAI-Compatible vLLM Serverless Endpoint Worker

Deploy OpenAI-Compatible Blazing-Fast LLM Endpoints powered by the [vLLM](https://github.com/vllm-project/vllm) Inference Engine on RunPod Serverless with just a few clicks.

</div>

## Table of Contents

- [Setting up the Serverless Worker](#setting-up-the-serverless-worker)
  - [Option 1: Deploy Any Model Using Pre-Built Docker Image [Recommended]](#option-1-deploy-any-model-using-pre-built-docker-image-recommended)
    - [Configuration](#configuration)
  - [Option 2: Build Docker Image with Model Inside](#option-2-build-docker-image-with-model-inside)
    - [Prerequisites](#prerequisites)
    - [Arguments](#arguments)
    - [Example: Building an image with OpenChat-3.5](#example-building-an-image-with-openchat-35)
    - [(Optional) Including Huggingface Token](#optional-including-huggingface-token)
- [Compatible Model Architectures](#compatible-model-architectures)
- [Usage: OpenAI Compatibility](#usage-openai-compatibility)
  - [Modifying your OpenAI Codebase to use your deployed vLLM Worker](#modifying-your-openai-codebase-to-use-your-deployed-vllm-worker)
  - [OpenAI Request Input Parameters](#openai-request-input-parameters)
    - [Chat Completions [RECOMMENDED]](#chat-completions-recommended)
  - [Examples: Using your RunPod endpoint with OpenAI](#examples-using-your-runpod-endpoint-with-openai)
    - [Chat Completions](#chat-completions)
    - [Getting a list of names for available models](#getting-a-list-of-names-for-available-models)
- [Usage: Standard (Non-OpenAI)](#usage-standard-non-openai)
  - [Request Input Parameters](#request-input-parameters)
  - [Sampling Parameters](#sampling-parameters)
  - [Text Input Formats](#text-input-formats)

# Setting up the Serverless Worker

## Option 1: Deploy Any Model Using Pre-Built Docker Image [Recommended]

**🚀 Deploy Guide**: Follow our [step-by-step deployment guide](https://docs.runpod.io/serverless/vllm/get-started) to deploy using the RunPod Console.

**📦 Docker Image**: `runpod/worker-v1-vllm:<version>`

- **Available Versions**: See [GitHub Releases](https://github.com/runpod-workers/worker-vllm/releases)
- **CUDA Compatibility**: Requires CUDA >= 12.1

### Configuration

Configure worker-vllm using environment variables:

| Environment Variable | Description | Default | Options |
| --- | --- | --- | --- |
| `MODEL_NAME` | Path of the model weights | "facebook/opt-125m" | Local folder or Hugging Face repo ID |
| `HF_TOKEN` | HuggingFace access token for gated/private models | | Your HuggingFace access token |
| `MAX_MODEL_LEN` | Model's maximum context length | | Integer (e.g., 4096) |
| `QUANTIZATION` | Quantization method | | "awq", "gptq", "squeezellm", "bitsandbytes" |
| `TENSOR_PARALLEL_SIZE` | Number of GPUs | 1 | Integer |
| `GPU_MEMORY_UTILIZATION` | Fraction of GPU memory to use | 0.95 | Float between 0.0 and 1.0 |
| `MAX_NUM_SEQS` | Maximum number of sequences per iteration | 256 | Integer |
| `CUSTOM_CHAT_TEMPLATE` | Custom chat template override | | Jinja2 template string |
| `ENABLE_AUTO_TOOL_CHOICE` | Enable automatic tool selection | false | boolean (true or false) |
| `TOOL_CALL_PARSER` | Parser for tool calls | | "mistral", "hermes", "llama3_json", "granite", "deepseek_v3", etc. |
| `OPENAI_SERVED_MODEL_NAME_OVERRIDE` | Override served model name in API | | String |
| `MAX_CONCURRENCY` | Maximum concurrent requests | 300 | Integer |

For the complete list of all available environment variables, examples, and detailed descriptions: **[Configuration](docs/configuration.md)**
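
For a quick local sanity check, the same variables can be passed to `docker run`; a minimal sketch (the model and version tag here are placeholders — on a RunPod Serverless endpoint you would set these in the endpoint's environment-variable settings instead):

```bash
# Run the worker locally with a small set of configuration overrides
docker run --gpus all \
  -e MODEL_NAME="mistralai/Mistral-7B-Instruct-v0.2" \
  -e MAX_MODEL_LEN="4096" \
  -e GPU_MEMORY_UTILIZATION="0.95" \
  runpod/worker-v1-vllm:<version>
```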

## Option 2: Build Docker Image with Model Inside

To build an image with the model baked in, you must specify the following Docker arguments when building the image.

### Prerequisites

- Docker

### Arguments

- **Required**
  - `MODEL_NAME`
- **Optional**
  - `MODEL_REVISION`: Model revision to load (default: `main`).
  - `BASE_PATH`: Storage directory for the Hugging Face cache and model (default: `/runpod-volume`, which uses network storage if you attach it, or creates a local directory within the image if you don't). If you intend to bake the model into the image, set this to something like `/models` to avoid issues if you accidentally attach network storage.
  - `QUANTIZATION`
  - `WORKER_CUDA_VERSION`: `12.1.0` (recommended for optimal performance).
  - `TOKENIZER_NAME`: Tokenizer repository, if you would like to use a different tokenizer than the one that comes with the model (default: `None`, which uses the model's tokenizer).
  - `TOKENIZER_REVISION`: Tokenizer revision to load (default: `main`).

For the remaining settings, you may apply them as environment variables when running the container. Supported environment variables are listed in the [Configuration](#configuration) section.

### Example: Building an image with OpenChat-3.5

```bash
docker build -t username/image:tag --build-arg MODEL_NAME="openchat/openchat_3.5" --build-arg BASE_PATH="/models" .
```

### (Optional) Including Huggingface Token

If the model you would like to deploy is private or gated, you will need to include your token at build time as a Docker secret, which protects it from being exposed in the image and on DockerHub.

1. Enable Docker BuildKit (required for secrets):

   ```bash
   export DOCKER_BUILDKIT=1
   ```

2. Export your Hugging Face token as an environment variable:

   ```bash
   export HF_TOKEN="your_token_here"
   ```

3. Add the token as a secret when building:

   ```bash
   docker build -t username/image:tag --secret id=HF_TOKEN --build-arg MODEL_NAME="openchat/openchat_3.5" .
   ```

# Compatible Model Architectures

You can deploy **any model on Hugging Face** that is supported by vLLM. For the complete and up-to-date list of supported model architectures, see the [vLLM Supported Models documentation](https://docs.vllm.ai/en/latest/models/supported_models.html#list-of-text-only-language-models).

# Usage: OpenAI Compatibility

The vLLM Worker is fully compatible with OpenAI's API, and you can use it with any OpenAI codebase by changing only 3 lines in total. The supported routes are <ins>Chat Completions</ins> and <ins>Models</ins> - with both streaming and non-streaming.

## Modifying your OpenAI Codebase to use your deployed vLLM Worker

**Python** (similar to Node.js, etc.):

1. When initializing the OpenAI Client in your code, change the `api_key` to your RunPod API Key and the `base_url` to your RunPod Serverless Endpoint URL in the following format: `https://api.runpod.ai/v2/<YOUR ENDPOINT ID>/openai/v1`, filling in your deployed endpoint ID. For example, if your Endpoint ID is `abc1234`, the URL would be `https://api.runpod.ai/v2/abc1234/openai/v1`.

   - Before:

     ```python
     from openai import OpenAI

     client = OpenAI(api_key=os.environ.get("OPENAI_API_KEY"))
     ```

   - After:

     ```python
     from openai import OpenAI

     client = OpenAI(
         api_key=os.environ.get("RUNPOD_API_KEY"),
         base_url="https://api.runpod.ai/v2/<YOUR ENDPOINT ID>/openai/v1",
     )
     ```

2. Change the `model` parameter to your deployed model's name whenever using Completions or Chat Completions.

   - Before:

     ```python
     response = client.chat.completions.create(
         model="gpt-3.5-turbo",
         messages=[{"role": "user", "content": "Why is RunPod the best platform?"}],
         temperature=0,
         max_tokens=100,
     )
     ```

   - After:

     ```python
     response = client.chat.completions.create(
         model="<YOUR DEPLOYED MODEL REPO/NAME>",
         messages=[{"role": "user", "content": "Why is RunPod the best platform?"}],
         temperature=0,
         max_tokens=100,
     )
     ```

**Using HTTP requests**:

1. Change the `Authorization` header to your RunPod API Key and the `url` to your RunPod Serverless Endpoint URL in the following format: `https://api.runpod.ai/v2/<YOUR ENDPOINT ID>/openai/v1`

   - Before:

     ```bash
     curl https://api.openai.com/v1/chat/completions \
       -H "Content-Type: application/json" \
       -H "Authorization: Bearer $OPENAI_API_KEY" \
       -d '{
         "model": "gpt-4",
         "messages": [
           {
             "role": "user",
             "content": "Why is RunPod the best platform?"
           }
         ],
         "temperature": 0,
         "max_tokens": 100
       }'
     ```

   - After:

     ```bash
     curl https://api.runpod.ai/v2/<YOUR ENDPOINT ID>/openai/v1/chat/completions \
       -H "Content-Type: application/json" \
       -H "Authorization: Bearer <YOUR RUNPOD API KEY>" \
       -d '{
         "model": "<YOUR DEPLOYED MODEL REPO/NAME>",
         "messages": [
           {
             "role": "user",
             "content": "Why is RunPod the best platform?"
           }
         ],
         "temperature": 0,
         "max_tokens": 100
       }'
     ```

## OpenAI Request Input Parameters

When using the chat completion feature of the vLLM Serverless Endpoint Worker, you can customize your requests with the following parameters:

### Chat Completions [RECOMMENDED]

<details>
<summary>Supported Chat Completions Inputs and Descriptions</summary>

| Parameter | Type | Default Value | Description |
| --- | --- | --- | --- |
| `messages` | Union[str, List[Dict[str, str]]] | | List of messages, where each message is a dictionary with a `role` and `content`. The model's chat template will be applied to the messages automatically, so the model must have one, or it should be specified via the `CUSTOM_CHAT_TEMPLATE` env var. |
| `model` | str | | The model repo that you've deployed on your RunPod Serverless Endpoint. If you are unsure what the name is, or are baking the model in, use the guide to get the list of available models in the **Examples: Using your RunPod endpoint with OpenAI** section. |
| `temperature` | Optional[float] | 0.7 | Float that controls the randomness of the sampling. Lower values make the model more deterministic, while higher values make the model more random. Zero means greedy sampling. |
| `top_p` | Optional[float] | 1.0 | Float that controls the cumulative probability of the top tokens to consider. Must be in (0, 1]. Set to 1 to consider all tokens. |
| `n` | Optional[int] | 1 | Number of output sequences to return for the given prompt. |
| `max_tokens` | Optional[int] | None | Maximum number of tokens to generate per output sequence. |
| `seed` | Optional[int] | None | Random seed to use for the generation. |
| `stop` | Optional[Union[str, List[str]]] | list | List of strings that stop the generation when they are generated. The returned output will not contain the stop strings. |
| `stream` | Optional[bool] | False | Whether to stream the output or not. |
| `presence_penalty` | Optional[float] | 0.0 | Float that penalizes new tokens based on whether they appear in the generated text so far. Values > 0 encourage the model to use new tokens, while values < 0 encourage the model to repeat tokens. |
| `frequency_penalty` | Optional[float] | 0.0 | Float that penalizes new tokens based on their frequency in the generated text so far. Values > 0 encourage the model to use new tokens, while values < 0 encourage the model to repeat tokens. |
| `logit_bias` | Optional[Dict[str, float]] | None | Unsupported by vLLM |
| `user` | Optional[str] | None | Unsupported by vLLM |

Additional parameters supported by vLLM:

| Parameter | Type | Default Value | Description |
| --- | --- | --- | --- |
| `best_of` | Optional[int] | None | Number of output sequences that are generated from the prompt. From these `best_of` sequences, the top `n` sequences are returned. `best_of` must be greater than or equal to `n`. This is treated as the beam width when `use_beam_search` is True. By default, `best_of` is set to `n`. |
| `top_k` | Optional[int] | -1 | Integer that controls the number of top tokens to consider. Set to -1 to consider all tokens. |
| `ignore_eos` | Optional[bool] | False | Whether to ignore the EOS token and continue generating tokens after the EOS token is generated. |
| `use_beam_search` | Optional[bool] | False | Whether to use beam search instead of sampling. |
| `stop_token_ids` | Optional[List[int]] | list | List of tokens that stop the generation when they are generated. The returned output will contain the stop tokens unless the stop tokens are special tokens. |
| `skip_special_tokens` | Optional[bool] | True | Whether to skip special tokens in the output. |
| `spaces_between_special_tokens` | Optional[bool] | True | Whether to add spaces between special tokens in the output. Defaults to True. |
| `add_generation_prompt` | Optional[bool] | True | Read more [here](https://huggingface.co/docs/transformers/main/en/chat_templating#what-are-generation-prompts). |
| `echo` | Optional[bool] | False | Echo back the prompt in addition to the completion. |
| `repetition_penalty` | Optional[float] | 1.0 | Float that penalizes new tokens based on whether they appear in the prompt and the generated text so far. Values > 1 encourage the model to use new tokens, while values < 1 encourage the model to repeat tokens. |
| `min_p` | Optional[float] | 0.0 | Float that represents the minimum probability for a token to be considered, relative to the probability of the most likely token. Must be in [0, 1]. Set to 0 to disable. |
| `length_penalty` | Optional[float] | 1.0 | Float that penalizes sequences based on their length. Used in beam search. |
| `include_stop_str_in_output` | Optional[bool] | False | Whether to include the stop strings in the output text. Defaults to False. |

</details>
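
The vLLM-specific fields above go in the request body alongside the standard OpenAI fields. A sketch of the raw HTTP form (the parameter values are only illustrative):

```bash
curl https://api.runpod.ai/v2/<YOUR ENDPOINT ID>/openai/v1/chat/completions \
  -H "Content-Type: application/json" \
  -H "Authorization: Bearer <YOUR RUNPOD API KEY>" \
  -d '{
    "model": "<YOUR DEPLOYED MODEL REPO/NAME>",
    "messages": [{"role": "user", "content": "Why is RunPod the best platform?"}],
    "max_tokens": 100,
    "top_k": 40,
    "repetition_penalty": 1.1
  }'
```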

### Examples: Using your RunPod endpoint with OpenAI

First, initialize the OpenAI Client with your RunPod API Key and Endpoint URL:

```python
from openai import OpenAI
import os

# Initialize the OpenAI Client with your RunPod API Key and Endpoint URL
client = OpenAI(
    api_key=os.environ.get("RUNPOD_API_KEY"),
    base_url="https://api.runpod.ai/v2/<YOUR ENDPOINT ID>/openai/v1",
)
```

### Chat Completions

This is the format used for GPT-4, focused on instruction-following and chat. Examples of open-source chat/instruct models include `meta-llama/Llama-2-7b-chat-hf`, `mistralai/Mixtral-8x7B-Instruct-v0.1`, `openchat/openchat-3.5-0106`, `NousResearch/Nous-Hermes-2-Mistral-7B-DPO`, and more. However, if your model is a completion-style model with no chat/instruct fine-tune and/or does not have a chat template, you can still use this if you provide a chat template with the environment variable `CUSTOM_CHAT_TEMPLATE`.

- **Streaming**:

  ```python
  # Create a chat completion stream
  response_stream = client.chat.completions.create(
      model="<YOUR DEPLOYED MODEL REPO/NAME>",
      messages=[{"role": "user", "content": "Why is RunPod the best platform?"}],
      temperature=0,
      max_tokens=100,
      stream=True,
  )
  # Stream the response
  for chunk in response_stream:
      print(chunk.choices[0].delta.content or "", end="", flush=True)
  ```

- **Non-Streaming**:

  ```python
  # Create a chat completion
  response = client.chat.completions.create(
      model="<YOUR DEPLOYED MODEL REPO/NAME>",
      messages=[{"role": "user", "content": "Why is RunPod the best platform?"}],
      temperature=0,
      max_tokens=100,
  )
  # Print the response
  print(response.choices[0].message.content)
  ```

### Getting a list of names for available models

In the case of baking the model into the image, the repo name may sometimes not be accepted as the `model` in the request. In this case, you can list the available models as shown below and use that name.

```python
models_response = client.models.list()
list_of_models = [model.id for model in models_response]
print(list_of_models)
```
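
The Models route is also reachable over plain HTTP, which can be handy for scripting; a sketch assuming `RUNPOD_API_KEY` is exported:

```bash
curl https://api.runpod.ai/v2/<YOUR ENDPOINT ID>/openai/v1/models \
  -H "Authorization: Bearer $RUNPOD_API_KEY"
```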

# Usage: Standard (Non-OpenAI)

## Request Input Parameters

<details>
<summary>Click to expand table</summary>

You may either use a `prompt` or a list of `messages` as input. If you use `messages`, the model's chat template will be applied to the messages automatically, so the model must have one. If you use `prompt`, you may optionally apply the model's chat template to the prompt by setting `apply_chat_template` to `true`.

| Argument | Type | Default | Description |
| --- | --- | --- | --- |
| `prompt` | str | | Prompt string to generate text based on. |
| `messages` | list[dict[str, str]] | | List of messages, which will automatically have the model's chat template applied. Overrides `prompt`. |
| `apply_chat_template` | bool | False | Whether to apply the model's chat template to the `prompt`. |
| `sampling_params` | dict | {} | Sampling parameters to control the generation, like temperature, top_p, etc. You can find all available parameters in the [Sampling Parameters](#sampling-parameters) section below. |
| `stream` | bool | False | Whether to enable streaming of output. If True, responses are streamed as they are generated. |
| `max_batch_size` | int | env var `DEFAULT_BATCH_SIZE` | The maximum number of tokens to stream per HTTP POST call. |
| `min_batch_size` | int | env var `DEFAULT_MIN_BATCH_SIZE` | The minimum number of tokens to stream per HTTP POST call. |
| `batch_size_growth_factor` | int | env var `DEFAULT_BATCH_SIZE_GROWTH_FACTOR` | The growth factor by which `min_batch_size` will be multiplied for each call until `max_batch_size` is reached. |

</details>

### Sampling Parameters

Below are all available sampling parameters that you can specify in the `sampling_params` dictionary. If you do not specify any of these parameters, the default values will be used.

<details>
<summary>Click to expand table</summary>

| Argument | Type | Default | Description |
| --- | --- | --- | --- |
| `n` | int | 1 | Number of output sequences generated from the prompt. The top `n` sequences are returned. |
| `best_of` | Optional[int] | `n` | Number of output sequences generated from the prompt. The top `n` sequences are returned from these `best_of` sequences. Must be ≥ `n`. Treated as beam width in beam search. Default is `n`. |
| `presence_penalty` | float | 0.0 | Penalizes new tokens based on their presence in the generated text so far. Values > 0 encourage new tokens, values < 0 encourage repetition. |
| `frequency_penalty` | float | 0.0 | Penalizes new tokens based on their frequency in the generated text so far. Values > 0 encourage new tokens, values < 0 encourage repetition. |
| `repetition_penalty` | float | 1.0 | Penalizes new tokens based on their appearance in the prompt and generated text. Values > 1 encourage new tokens, values < 1 encourage repetition. |
| `temperature` | float | 1.0 | Controls the randomness of sampling. Lower values make it more deterministic, higher values make it more random. Zero means greedy sampling. |
| `top_p` | float | 1.0 | Controls the cumulative probability of top tokens to consider. Must be in (0, 1]. Set to 1 to consider all tokens. |
| `top_k` | int | -1 | Controls the number of top tokens to consider. Set to -1 to consider all tokens. |
| `min_p` | float | 0.0 | Represents the minimum probability for a token to be considered, relative to the most likely token. Must be in [0, 1]. Set to 0 to disable. |
| `use_beam_search` | bool | False | Whether to use beam search instead of sampling. |
| `length_penalty` | float | 1.0 | Penalizes sequences based on their length. Used in beam search. |
| `early_stopping` | Union[bool, str] | False | Controls the stopping condition in beam search. Can be `True`, `False`, or `"never"`. |
| `stop` | Union[None, str, List[str]] | None | List of strings that stop generation when produced. The output will not contain these strings. |
| `stop_token_ids` | Optional[List[int]] | None | List of token IDs that stop generation when produced. Output contains these tokens unless they are special tokens. |
| `ignore_eos` | bool | False | Whether to ignore the End-Of-Sequence token and continue generating tokens after its generation. |
| `max_tokens` | int | 16 | Maximum number of tokens to generate per output sequence. |
| `skip_special_tokens` | bool | True | Whether to skip special tokens in the output. |
| `spaces_between_special_tokens` | bool | True | Whether to add spaces between special tokens in the output. |

</details>

### Text Input Formats

You may either use a `prompt` or a list of `messages` as input.

1. `prompt`

   The prompt string can be any string, and the model's chat template will not be applied to it unless `apply_chat_template` is set to `true`, in which case it will be treated as a user message.

   Example:

   ```json
   {
     "input": {
       "prompt": "Why is the sky blue?",
       "sampling_params": {
         "temperature": 0.7,
         "max_tokens": 100
       }
     }
   }
   ```

2. `messages`

   Your list can contain any number of messages, and each message usually can have any role from the following list: `user`, `assistant`, `system`.

   However, some models may have different roles, so you should check the model's chat template to see which roles are required.

   The model's chat template will be applied to the messages automatically, so the model must have one.

   Example:

   ```json
   {
     "input": {
       "messages": [
         {
           "role": "system",
           "content": "You are a helpful AI assistant that provides clear and concise responses."
         },
         {
           "role": "user",
           "content": "Can you explain the difference between supervised and unsupervised learning?"
         },
         {
           "role": "assistant",
           "content": "Sure! Supervised learning uses labeled data, meaning each input has a corresponding correct output. The model learns by mapping inputs to known outputs. In contrast, unsupervised learning works with unlabeled data, where the model identifies patterns, structures, or clusters without predefined answers."
         }
       ],
       "sampling_params": {
         "temperature": 0.7,
         "max_tokens": 100
       }
     }
   }
   ```
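
To actually invoke the endpoint with such a payload, send it to your endpoint's run route. A minimal sketch using `curl` against the synchronous `/runsync` route (assumes `RUNPOD_API_KEY` is exported; for long generations, use the asynchronous `/run` route and poll the returned job ID instead):

```bash
curl https://api.runpod.ai/v2/<YOUR ENDPOINT ID>/runsync \
  -H "Content-Type: application/json" \
  -H "Authorization: Bearer $RUNPOD_API_KEY" \
  -d '{
    "input": {
      "prompt": "Why is the sky blue?",
      "sampling_params": {
        "temperature": 0.7,
        "max_tokens": 100
      }
    }
  }'
```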
docker-bake.hcl
ADDED

variable "DOCKERHUB_REPO" {
  default = "runpod"
}

variable "DOCKERHUB_IMG" {
  default = "worker-v1-vllm"
}

variable "RELEASE_VERSION" {
  default = "latest"
}

variable "HUGGINGFACE_ACCESS_TOKEN" {
  default = ""
}

group "default" {
  targets = ["worker-vllm"]
}

target "worker-vllm" {
  tags = ["${DOCKERHUB_REPO}/${DOCKERHUB_IMG}:${RELEASE_VERSION}"]
  context = "."
  dockerfile = "Dockerfile"
  platforms = ["linux/amd64"]
}
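
A typical build invocation for this bake file is sketched below; Buildx lets you override the variables above with same-named environment variables (or `--set`):

```bash
# Build the default group ("worker-vllm") with the default tag
docker buildx bake

# Override the release tag for a versioned build
RELEASE_VERSION=2.8.0 docker buildx bake worker-vllm
```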
worker-config.json
ADDED

{
  "versions": {
    "0.10.0": {
      "imageName": "runpod/worker-v1-vllm:v2.8.0stable-cuda12.1.0",
      "minimumCudaVersion": "12.1",
      "categories": [
        {
          "title": "LLM Settings",
          "settings": [
            "TOKENIZER", "TOKENIZER_MODE", "SKIP_TOKENIZER_INIT", "TRUST_REMOTE_CODE",
            "DOWNLOAD_DIR", "LOAD_FORMAT", "DTYPE", "KV_CACHE_DTYPE", "QUANTIZATION_PARAM_PATH",
            "MAX_MODEL_LEN", "GUIDED_DECODING_BACKEND", "DISTRIBUTED_EXECUTOR_BACKEND",
            "WORKER_USE_RAY", "RAY_WORKERS_USE_NSIGHT", "PIPELINE_PARALLEL_SIZE",
            "TENSOR_PARALLEL_SIZE", "MAX_PARALLEL_LOADING_WORKERS", "ENABLE_PREFIX_CACHING",
            "DISABLE_SLIDING_WINDOW", "NUM_LOOKAHEAD_SLOTS",
            "SEED", "NUM_GPU_BLOCKS_OVERRIDE", "MAX_NUM_BATCHED_TOKENS", "MAX_NUM_SEQS",
            "MAX_LOGPROBS", "DISABLE_LOG_STATS", "QUANTIZATION", "ROPE_SCALING", "ROPE_THETA",
            "TOKENIZER_POOL_SIZE", "TOKENIZER_POOL_TYPE", "TOKENIZER_POOL_EXTRA_CONFIG",
            "ENABLE_LORA", "MAX_LORAS", "MAX_LORA_RANK", "LORA_EXTRA_VOCAB_SIZE",
            "LORA_DTYPE", "LONG_LORA_SCALING_FACTORS", "MAX_CPU_LORAS", "FULLY_SHARDED_LORAS",
            "DEVICE", "SCHEDULER_DELAY_FACTOR", "ENABLE_CHUNKED_PREFILL", "SPECULATIVE_MODEL",
            "NUM_SPECULATIVE_TOKENS", "SPECULATIVE_DRAFT_TENSOR_PARALLEL_SIZE",
            "SPECULATIVE_MAX_MODEL_LEN", "SPECULATIVE_DISABLE_BY_BATCH_SIZE",
            "NGRAM_PROMPT_LOOKUP_MAX", "NGRAM_PROMPT_LOOKUP_MIN", "SPEC_DECODING_ACCEPTANCE_METHOD",
            "TYPICAL_ACCEPTANCE_SAMPLER_POSTERIOR_THRESHOLD", "TYPICAL_ACCEPTANCE_SAMPLER_POSTERIOR_ALPHA",
            "MODEL_LOADER_EXTRA_CONFIG", "PREEMPTION_MODE", "PREEMPTION_CHECK_PERIOD",
            "PREEMPTION_CPU_CAPACITY", "MAX_LOG_LEN", "DISABLE_LOGGING_REQUEST",
            "ENABLE_AUTO_TOOL_CHOICE", "TOOL_CALL_PARSER"
          ]
        },
        {
          "title": "Tokenizer Settings",
          "settings": [
            "TOKENIZER_NAME", "TOKENIZER_REVISION", "CUSTOM_CHAT_TEMPLATE"
          ]
        },
        {
          "title": "System Settings",
          "settings": [
            "GPU_MEMORY_UTILIZATION", "MAX_PARALLEL_LOADING_WORKERS", "BLOCK_SIZE",
            "SWAP_SPACE", "ENFORCE_EAGER", "MAX_SEQ_LEN_TO_CAPTURE", "DISABLE_CUSTOM_ALL_REDUCE"
          ]
        },
        {
          "title": "Streaming Settings",
          "settings": [
            "DEFAULT_BATCH_SIZE", "DEFAULT_MIN_BATCH_SIZE", "DEFAULT_BATCH_SIZE_GROWTH_FACTOR"
          ]
        },
        {
          "title": "OpenAI Settings",
          "settings": [
            "RAW_OPENAI_OUTPUT", "OPENAI_RESPONSE_ROLE", "OPENAI_SERVED_MODEL_NAME_OVERRIDE"
          ]
        },
        {
          "title": "Serverless Settings",
          "settings": [
            "MAX_CONCURRENCY", "DISABLE_LOG_STATS", "DISABLE_LOG_REQUESTS"
          ]
        }
      ]
    },
    "0.9.1": {
      "imageName": "runpod/worker-v1-vllm:v2.7.0stable-cuda12.1.0",
      "minimumCudaVersion": "12.1",
      "categories": [
        {
          "title": "LLM Settings",
          "settings": [
            "TOKENIZER", "TOKENIZER_MODE", "SKIP_TOKENIZER_INIT", "TRUST_REMOTE_CODE",
            "DOWNLOAD_DIR", "LOAD_FORMAT", "DTYPE", "KV_CACHE_DTYPE", "QUANTIZATION_PARAM_PATH",
            "MAX_MODEL_LEN", "GUIDED_DECODING_BACKEND", "DISTRIBUTED_EXECUTOR_BACKEND",
            "WORKER_USE_RAY", "RAY_WORKERS_USE_NSIGHT", "PIPELINE_PARALLEL_SIZE",
            "TENSOR_PARALLEL_SIZE", "MAX_PARALLEL_LOADING_WORKERS", "ENABLE_PREFIX_CACHING",
            "DISABLE_SLIDING_WINDOW", "NUM_LOOKAHEAD_SLOTS",
            "SEED", "NUM_GPU_BLOCKS_OVERRIDE", "MAX_NUM_BATCHED_TOKENS", "MAX_NUM_SEQS",
            "MAX_LOGPROBS", "DISABLE_LOG_STATS", "QUANTIZATION", "ROPE_SCALING", "ROPE_THETA",
            "TOKENIZER_POOL_SIZE", "TOKENIZER_POOL_TYPE", "TOKENIZER_POOL_EXTRA_CONFIG",
            "ENABLE_LORA", "MAX_LORAS", "MAX_LORA_RANK", "LORA_EXTRA_VOCAB_SIZE",
            "LORA_DTYPE", "LONG_LORA_SCALING_FACTORS", "MAX_CPU_LORAS", "FULLY_SHARDED_LORAS",
            "DEVICE", "SCHEDULER_DELAY_FACTOR", "ENABLE_CHUNKED_PREFILL", "SPECULATIVE_MODEL",
            "NUM_SPECULATIVE_TOKENS", "SPECULATIVE_DRAFT_TENSOR_PARALLEL_SIZE",
            "SPECULATIVE_MAX_MODEL_LEN", "SPECULATIVE_DISABLE_BY_BATCH_SIZE",
            "NGRAM_PROMPT_LOOKUP_MAX", "NGRAM_PROMPT_LOOKUP_MIN", "SPEC_DECODING_ACCEPTANCE_METHOD",
            "TYPICAL_ACCEPTANCE_SAMPLER_POSTERIOR_THRESHOLD", "TYPICAL_ACCEPTANCE_SAMPLER_POSTERIOR_ALPHA",
            "MODEL_LOADER_EXTRA_CONFIG", "PREEMPTION_MODE", "PREEMPTION_CHECK_PERIOD",
            "PREEMPTION_CPU_CAPACITY", "MAX_LOG_LEN", "DISABLE_LOGGING_REQUEST",
            "ENABLE_AUTO_TOOL_CHOICE", "TOOL_CALL_PARSER"
          ]
        },
        {
          "title": "Tokenizer Settings",
          "settings": [
            "TOKENIZER_NAME", "TOKENIZER_REVISION", "CUSTOM_CHAT_TEMPLATE"
          ]
        },
        {
          "title": "System Settings",
          "settings": [
            "GPU_MEMORY_UTILIZATION", "MAX_PARALLEL_LOADING_WORKERS", "BLOCK_SIZE",
            "SWAP_SPACE", "ENFORCE_EAGER", "MAX_SEQ_LEN_TO_CAPTURE", "DISABLE_CUSTOM_ALL_REDUCE"
          ]
        },
        {
          "title": "Streaming Settings",
          "settings": [
            "DEFAULT_BATCH_SIZE", "DEFAULT_MIN_BATCH_SIZE", "DEFAULT_BATCH_SIZE_GROWTH_FACTOR"
          ]
        },
        {
          "title": "OpenAI Settings",
          "settings": [
            "RAW_OPENAI_OUTPUT", "OPENAI_RESPONSE_ROLE", "OPENAI_SERVED_MODEL_NAME_OVERRIDE"
          ]
        },
        {
          "title": "Serverless Settings",
          "settings": [
            "MAX_CONCURRENCY", "DISABLE_LOG_STATS", "DISABLE_LOG_REQUESTS"
          ]
        }
      ]
    },
    "0.9.0": {
      "imageName": "runpod/worker-v1-vllm:v2.6.0stable-cuda12.1.0",
      "minimumCudaVersion": "12.1",
      "categories": [
        {
          "title": "LLM Settings",
          "settings": [
            "TOKENIZER", "TOKENIZER_MODE", "SKIP_TOKENIZER_INIT", "TRUST_REMOTE_CODE",
            "DOWNLOAD_DIR", "LOAD_FORMAT", "DTYPE", "KV_CACHE_DTYPE", "QUANTIZATION_PARAM_PATH",
            "MAX_MODEL_LEN", "GUIDED_DECODING_BACKEND", "DISTRIBUTED_EXECUTOR_BACKEND",
            "WORKER_USE_RAY", "RAY_WORKERS_USE_NSIGHT", "PIPELINE_PARALLEL_SIZE",
            "TENSOR_PARALLEL_SIZE", "MAX_PARALLEL_LOADING_WORKERS", "ENABLE_PREFIX_CACHING",
            "DISABLE_SLIDING_WINDOW", "NUM_LOOKAHEAD_SLOTS",
            "SEED", "NUM_GPU_BLOCKS_OVERRIDE", "MAX_NUM_BATCHED_TOKENS", "MAX_NUM_SEQS",
            "MAX_LOGPROBS", "DISABLE_LOG_STATS", "QUANTIZATION", "ROPE_SCALING", "ROPE_THETA",
            "TOKENIZER_POOL_SIZE", "TOKENIZER_POOL_TYPE", "TOKENIZER_POOL_EXTRA_CONFIG",
            "ENABLE_LORA", "MAX_LORAS", "MAX_LORA_RANK", "LORA_EXTRA_VOCAB_SIZE",
            "LORA_DTYPE", "LONG_LORA_SCALING_FACTORS", "MAX_CPU_LORAS", "FULLY_SHARDED_LORAS",
            "DEVICE", "SCHEDULER_DELAY_FACTOR", "ENABLE_CHUNKED_PREFILL", "SPECULATIVE_MODEL",
            "NUM_SPECULATIVE_TOKENS", "SPECULATIVE_DRAFT_TENSOR_PARALLEL_SIZE",
            "SPECULATIVE_MAX_MODEL_LEN", "SPECULATIVE_DISABLE_BY_BATCH_SIZE",
            "NGRAM_PROMPT_LOOKUP_MAX", "NGRAM_PROMPT_LOOKUP_MIN", "SPEC_DECODING_ACCEPTANCE_METHOD",
            "TYPICAL_ACCEPTANCE_SAMPLER_POSTERIOR_THRESHOLD", "TYPICAL_ACCEPTANCE_SAMPLER_POSTERIOR_ALPHA",
            "MODEL_LOADER_EXTRA_CONFIG", "PREEMPTION_MODE", "PREEMPTION_CHECK_PERIOD",
            "PREEMPTION_CPU_CAPACITY", "MAX_LOG_LEN", "DISABLE_LOGGING_REQUEST",
            "ENABLE_AUTO_TOOL_CHOICE", "TOOL_CALL_PARSER"
          ]
        },
        {
          "title": "Tokenizer Settings",
          "settings": [
            "TOKENIZER_NAME", "TOKENIZER_REVISION", "CUSTOM_CHAT_TEMPLATE"
          ]
        },
        {
          "title": "System Settings",
          "settings": [
            "GPU_MEMORY_UTILIZATION", "MAX_PARALLEL_LOADING_WORKERS", "BLOCK_SIZE",
            "SWAP_SPACE", "ENFORCE_EAGER", "MAX_SEQ_LEN_TO_CAPTURE", "DISABLE_CUSTOM_ALL_REDUCE"
          ]
        },
        {
          "title": "Streaming Settings",
          "settings": [
            "DEFAULT_BATCH_SIZE", "DEFAULT_MIN_BATCH_SIZE", "DEFAULT_BATCH_SIZE_GROWTH_FACTOR"
          ]
        },
        {
          "title": "OpenAI Settings",
          "settings": [
            "RAW_OPENAI_OUTPUT", "OPENAI_RESPONSE_ROLE", "OPENAI_SERVED_MODEL_NAME_OVERRIDE"
          ]
        },
        {
          "title": "Serverless Settings",
          "settings": [
            "MAX_CONCURRENCY", "DISABLE_LOG_STATS", "DISABLE_LOG_REQUESTS"
          ]
        }
      ]
    },
    "0.8.5": {
      "imageName": "runpod/worker-v1-vllm:v2.5.0stable-cuda12.1.0",
      "minimumCudaVersion": "12.1",
      "categories": [
        {
          "title": "LLM Settings",
          "settings": [
            "TOKENIZER", "TOKENIZER_MODE", "SKIP_TOKENIZER_INIT", "TRUST_REMOTE_CODE",
            "DOWNLOAD_DIR", "LOAD_FORMAT", "DTYPE", "KV_CACHE_DTYPE", "QUANTIZATION_PARAM_PATH",
            "MAX_MODEL_LEN", "GUIDED_DECODING_BACKEND", "DISTRIBUTED_EXECUTOR_BACKEND",
            "WORKER_USE_RAY", "RAY_WORKERS_USE_NSIGHT", "PIPELINE_PARALLEL_SIZE",
            "TENSOR_PARALLEL_SIZE", "MAX_PARALLEL_LOADING_WORKERS", "ENABLE_PREFIX_CACHING",
            "DISABLE_SLIDING_WINDOW", "NUM_LOOKAHEAD_SLOTS",
            "SEED", "NUM_GPU_BLOCKS_OVERRIDE", "MAX_NUM_BATCHED_TOKENS", "MAX_NUM_SEQS",
            "MAX_LOGPROBS", "DISABLE_LOG_STATS", "QUANTIZATION", "ROPE_SCALING", "ROPE_THETA",
            "TOKENIZER_POOL_SIZE", "TOKENIZER_POOL_TYPE", "TOKENIZER_POOL_EXTRA_CONFIG",
            "ENABLE_LORA", "MAX_LORAS", "MAX_LORA_RANK", "LORA_EXTRA_VOCAB_SIZE",
            "LORA_DTYPE", "LONG_LORA_SCALING_FACTORS", "MAX_CPU_LORAS", "FULLY_SHARDED_LORAS",
            "DEVICE", "SCHEDULER_DELAY_FACTOR", "ENABLE_CHUNKED_PREFILL", "SPECULATIVE_MODEL",
            "NUM_SPECULATIVE_TOKENS", "SPECULATIVE_DRAFT_TENSOR_PARALLEL_SIZE",
            "SPECULATIVE_MAX_MODEL_LEN", "SPECULATIVE_DISABLE_BY_BATCH_SIZE",
            "NGRAM_PROMPT_LOOKUP_MAX", "NGRAM_PROMPT_LOOKUP_MIN", "SPEC_DECODING_ACCEPTANCE_METHOD",
            "TYPICAL_ACCEPTANCE_SAMPLER_POSTERIOR_THRESHOLD", "TYPICAL_ACCEPTANCE_SAMPLER_POSTERIOR_ALPHA",
            "MODEL_LOADER_EXTRA_CONFIG", "PREEMPTION_MODE", "PREEMPTION_CHECK_PERIOD",
            "PREEMPTION_CPU_CAPACITY", "MAX_LOG_LEN", "DISABLE_LOGGING_REQUEST",
            "ENABLE_AUTO_TOOL_CHOICE", "TOOL_CALL_PARSER"
          ]
        },
        {
          "title": "Tokenizer Settings",
          "settings": [
            "TOKENIZER_NAME", "TOKENIZER_REVISION", "CUSTOM_CHAT_TEMPLATE"
          ]
        },
        {
          "title": "System Settings",
          "settings": [
            "GPU_MEMORY_UTILIZATION", "MAX_PARALLEL_LOADING_WORKERS", "BLOCK_SIZE",
            "SWAP_SPACE", "ENFORCE_EAGER", "MAX_SEQ_LEN_TO_CAPTURE", "DISABLE_CUSTOM_ALL_REDUCE"
          ]
        },
        {
          "title": "Streaming Settings",
          "settings": [
            "DEFAULT_BATCH_SIZE", "DEFAULT_MIN_BATCH_SIZE", "DEFAULT_BATCH_SIZE_GROWTH_FACTOR"
          ]
        },
        {
          "title": "OpenAI Settings",
          "settings": [
            "RAW_OPENAI_OUTPUT", "OPENAI_RESPONSE_ROLE", "OPENAI_SERVED_MODEL_NAME_OVERRIDE"
          ]
        },
        {
          "title": "Serverless Settings",
          "settings": [
            "MAX_CONCURRENCY", "DISABLE_LOG_STATS", "DISABLE_LOG_REQUESTS"
          ]
        }
      ]
    },
    "0.8.4": {
      "imageName": "runpod/worker-v1-vllm:v2.4.0stable-cuda12.1.0",
      "minimumCudaVersion": "12.1",
      "categories": [
        {
          "title": "LLM Settings",
          "settings": [
            "TOKENIZER", "TOKENIZER_MODE", "SKIP_TOKENIZER_INIT", "TRUST_REMOTE_CODE",
            "DOWNLOAD_DIR", "LOAD_FORMAT", "DTYPE", "KV_CACHE_DTYPE", "QUANTIZATION_PARAM_PATH",
            "MAX_MODEL_LEN", "GUIDED_DECODING_BACKEND", "DISTRIBUTED_EXECUTOR_BACKEND",
            "WORKER_USE_RAY", "RAY_WORKERS_USE_NSIGHT", "PIPELINE_PARALLEL_SIZE",
            "TENSOR_PARALLEL_SIZE", "MAX_PARALLEL_LOADING_WORKERS", "ENABLE_PREFIX_CACHING",
            "DISABLE_SLIDING_WINDOW", "NUM_LOOKAHEAD_SLOTS",
            "SEED", "NUM_GPU_BLOCKS_OVERRIDE", "MAX_NUM_BATCHED_TOKENS", "MAX_NUM_SEQS",
            "MAX_LOGPROBS", "DISABLE_LOG_STATS", "QUANTIZATION", "ROPE_SCALING", "ROPE_THETA",
            "TOKENIZER_POOL_SIZE", "TOKENIZER_POOL_TYPE", "TOKENIZER_POOL_EXTRA_CONFIG",
            "ENABLE_LORA", "MAX_LORAS", "MAX_LORA_RANK", "LORA_EXTRA_VOCAB_SIZE",
            "LORA_DTYPE", "LONG_LORA_SCALING_FACTORS", "MAX_CPU_LORAS", "FULLY_SHARDED_LORAS",
            "DEVICE", "SCHEDULER_DELAY_FACTOR", "ENABLE_CHUNKED_PREFILL", "SPECULATIVE_MODEL",
            "NUM_SPECULATIVE_TOKENS", "SPECULATIVE_DRAFT_TENSOR_PARALLEL_SIZE",
            "SPECULATIVE_MAX_MODEL_LEN", "SPECULATIVE_DISABLE_BY_BATCH_SIZE",
            "NGRAM_PROMPT_LOOKUP_MAX", "NGRAM_PROMPT_LOOKUP_MIN", "SPEC_DECODING_ACCEPTANCE_METHOD",
            "TYPICAL_ACCEPTANCE_SAMPLER_POSTERIOR_THRESHOLD", "TYPICAL_ACCEPTANCE_SAMPLER_POSTERIOR_ALPHA",
            "MODEL_LOADER_EXTRA_CONFIG", "PREEMPTION_MODE", "PREEMPTION_CHECK_PERIOD",
            "PREEMPTION_CPU_CAPACITY", "MAX_LOG_LEN", "DISABLE_LOGGING_REQUEST",
            "ENABLE_AUTO_TOOL_CHOICE", "TOOL_CALL_PARSER"
          ]
        },
        {
          "title": "Tokenizer Settings",
          "settings": [
            "TOKENIZER_NAME", "TOKENIZER_REVISION", "CUSTOM_CHAT_TEMPLATE"
          ]
        },
        {
          "title": "System Settings",
          "settings": [
"GPU_MEMORY_UTILIZATION", "MAX_PARALLEL_LOADING_WORKERS", "BLOCK_SIZE",
|
| 346 |
+
"SWAP_SPACE", "ENFORCE_EAGER", "MAX_SEQ_LEN_TO_CAPTURE", "DISABLE_CUSTOM_ALL_REDUCE"
|
| 347 |
+
]
|
| 348 |
+
},
|
| 349 |
+
{
|
| 350 |
+
"title": "Streaming Settings",
|
| 351 |
+
"settings": [
|
| 352 |
+
"DEFAULT_BATCH_SIZE", "DEFAULT_MIN_BATCH_SIZE", "DEFAULT_BATCH_SIZE_GROWTH_FACTOR"
|
| 353 |
+
]
|
| 354 |
+
},
|
| 355 |
+
{
|
| 356 |
+
"title": "OpenAI Settings",
|
| 357 |
+
"settings": [
|
| 358 |
+
"RAW_OPENAI_OUTPUT", "OPENAI_RESPONSE_ROLE", "OPENAI_SERVED_MODEL_NAME_OVERRIDE"
|
| 359 |
+
]
|
| 360 |
+
},
|
| 361 |
+
{
|
| 362 |
+
"title": "Serverless Settings",
|
| 363 |
+
"settings": [
|
| 364 |
+
"MAX_CONCURRENCY", "DISABLE_LOG_STATS", "DISABLE_LOG_REQUESTS"
|
| 365 |
+
]
|
| 366 |
+
}
|
| 367 |
+
]
|
| 368 |
+
},
|
| 369 |
+
"0.8.3": {
|
| 370 |
+
"imageName": "runpod/worker-v1-vllm:v2.3.0stable-cuda12.1.0",
|
| 371 |
+
"minimumCudaVersion": "12.1",
|
| 372 |
+
"categories": [
|
| 373 |
+
{
|
| 374 |
+
"title": "LLM Settings",
|
| 375 |
+
"settings": [
|
| 376 |
+
"TOKENIZER", "TOKENIZER_MODE", "SKIP_TOKENIZER_INIT", "TRUST_REMOTE_CODE",
|
| 377 |
+
"DOWNLOAD_DIR", "LOAD_FORMAT", "DTYPE", "KV_CACHE_DTYPE", "QUANTIZATION_PARAM_PATH",
|
| 378 |
+
"MAX_MODEL_LEN", "GUIDED_DECODING_BACKEND", "DISTRIBUTED_EXECUTOR_BACKEND",
|
| 379 |
+
"WORKER_USE_RAY", "RAY_WORKERS_USE_NSIGHT", "PIPELINE_PARALLEL_SIZE",
|
| 380 |
+
"TENSOR_PARALLEL_SIZE", "MAX_PARALLEL_LOADING_WORKERS", "ENABLE_PREFIX_CACHING",
|
| 381 |
+
"DISABLE_SLIDING_WINDOW", "NUM_LOOKAHEAD_SLOTS",
|
| 382 |
+
"SEED", "NUM_GPU_BLOCKS_OVERRIDE", "MAX_NUM_BATCHED_TOKENS", "MAX_NUM_SEQS",
|
| 383 |
+
"MAX_LOGPROBS", "DISABLE_LOG_STATS", "QUANTIZATION", "ROPE_SCALING", "ROPE_THETA",
|
| 384 |
+
"TOKENIZER_POOL_SIZE", "TOKENIZER_POOL_TYPE", "TOKENIZER_POOL_EXTRA_CONFIG",
|
| 385 |
+
"ENABLE_LORA", "MAX_LORAS", "MAX_LORA_RANK", "LORA_EXTRA_VOCAB_SIZE",
|
| 386 |
+
"LORA_DTYPE", "LONG_LORA_SCALING_FACTORS", "MAX_CPU_LORAS", "FULLY_SHARDED_LORAS",
|
| 387 |
+
"DEVICE", "SCHEDULER_DELAY_FACTOR", "ENABLE_CHUNKED_PREFILL", "SPECULATIVE_MODEL",
|
| 388 |
+
"NUM_SPECULATIVE_TOKENS", "SPECULATIVE_DRAFT_TENSOR_PARALLEL_SIZE",
|
| 389 |
+
"SPECULATIVE_MAX_MODEL_LEN", "SPECULATIVE_DISABLE_BY_BATCH_SIZE",
|
| 390 |
+
"NGRAM_PROMPT_LOOKUP_MAX", "NGRAM_PROMPT_LOOKUP_MIN", "SPEC_DECODING_ACCEPTANCE_METHOD",
|
| 391 |
+
"TYPICAL_ACCEPTANCE_SAMPLER_POSTERIOR_THRESHOLD", "TYPICAL_ACCEPTANCE_SAMPLER_POSTERIOR_ALPHA",
|
| 392 |
+
"MODEL_LOADER_EXTRA_CONFIG", "PREEMPTION_MODE", "PREEMPTION_CHECK_PERIOD",
|
| 393 |
+
"PREEMPTION_CPU_CAPACITY", "MAX_LOG_LEN", "DISABLE_LOGGING_REQUEST",
|
| 394 |
+
"ENABLE_AUTO_TOOL_CHOICE", "TOOL_CALL_PARSER"
|
| 395 |
+
]
|
| 396 |
+
},
|
| 397 |
+
{
|
| 398 |
+
"title": "Tokenizer Settings",
|
| 399 |
+
"settings": [
|
| 400 |
+
"TOKENIZER_NAME", "TOKENIZER_REVISION", "CUSTOM_CHAT_TEMPLATE"
|
| 401 |
+
]
|
| 402 |
+
},
|
| 403 |
+
{
|
| 404 |
+
"title": "System Settings",
|
| 405 |
+
"settings": [
|
| 406 |
+
"GPU_MEMORY_UTILIZATION", "MAX_PARALLEL_LOADING_WORKERS", "BLOCK_SIZE",
|
| 407 |
+
"SWAP_SPACE", "ENFORCE_EAGER", "MAX_SEQ_LEN_TO_CAPTURE", "DISABLE_CUSTOM_ALL_REDUCE"
|
| 408 |
+
]
|
| 409 |
+
},
|
| 410 |
+
{
|
| 411 |
+
"title": "Streaming Settings",
|
| 412 |
+
"settings": [
|
| 413 |
+
"DEFAULT_BATCH_SIZE", "DEFAULT_MIN_BATCH_SIZE", "DEFAULT_BATCH_SIZE_GROWTH_FACTOR"
|
| 414 |
+
]
|
| 415 |
+
},
|
| 416 |
+
{
|
| 417 |
+
"title": "OpenAI Settings",
|
| 418 |
+
"settings": [
|
| 419 |
+
"RAW_OPENAI_OUTPUT", "OPENAI_RESPONSE_ROLE", "OPENAI_SERVED_MODEL_NAME_OVERRIDE"
|
| 420 |
+
]
|
| 421 |
+
},
|
| 422 |
+
{
|
| 423 |
+
"title": "Serverless Settings",
|
| 424 |
+
"settings": [
|
| 425 |
+
"MAX_CONCURRENCY", "DISABLE_LOG_STATS", "DISABLE_LOG_REQUESTS"
|
| 426 |
+
]
|
| 427 |
+
}
|
| 428 |
+
]
|
| 429 |
+
},
|
| 430 |
+
"0.8.2": {
|
| 431 |
+
"imageName": "runpod/worker-v1-vllm:v2.2.0stable-cuda12.1.0",
|
| 432 |
+
"minimumCudaVersion": "12.1",
|
| 433 |
+
"categories": [
|
| 434 |
+
{
|
| 435 |
+
"title": "LLM Settings",
|
| 436 |
+
"settings": [
|
| 437 |
+
"TOKENIZER", "TOKENIZER_MODE", "SKIP_TOKENIZER_INIT", "TRUST_REMOTE_CODE",
|
| 438 |
+
"DOWNLOAD_DIR", "LOAD_FORMAT", "DTYPE", "KV_CACHE_DTYPE", "QUANTIZATION_PARAM_PATH",
|
| 439 |
+
"MAX_MODEL_LEN", "GUIDED_DECODING_BACKEND", "DISTRIBUTED_EXECUTOR_BACKEND",
|
| 440 |
+
"WORKER_USE_RAY", "RAY_WORKERS_USE_NSIGHT", "PIPELINE_PARALLEL_SIZE",
|
| 441 |
+
"TENSOR_PARALLEL_SIZE", "MAX_PARALLEL_LOADING_WORKERS", "ENABLE_PREFIX_CACHING",
|
| 442 |
+
"DISABLE_SLIDING_WINDOW", "NUM_LOOKAHEAD_SLOTS",
|
| 443 |
+
"SEED", "NUM_GPU_BLOCKS_OVERRIDE", "MAX_NUM_BATCHED_TOKENS", "MAX_NUM_SEQS",
|
| 444 |
+
"MAX_LOGPROBS", "DISABLE_LOG_STATS", "QUANTIZATION", "ROPE_SCALING", "ROPE_THETA",
|
| 445 |
+
"TOKENIZER_POOL_SIZE", "TOKENIZER_POOL_TYPE", "TOKENIZER_POOL_EXTRA_CONFIG",
|
| 446 |
+
"ENABLE_LORA", "MAX_LORAS", "MAX_LORA_RANK", "LORA_EXTRA_VOCAB_SIZE",
|
| 447 |
+
"LORA_DTYPE", "LONG_LORA_SCALING_FACTORS", "MAX_CPU_LORAS", "FULLY_SHARDED_LORAS",
|
| 448 |
+
"DEVICE", "SCHEDULER_DELAY_FACTOR", "ENABLE_CHUNKED_PREFILL", "SPECULATIVE_MODEL",
|
| 449 |
+
"NUM_SPECULATIVE_TOKENS", "SPECULATIVE_DRAFT_TENSOR_PARALLEL_SIZE",
|
| 450 |
+
"SPECULATIVE_MAX_MODEL_LEN", "SPECULATIVE_DISABLE_BY_BATCH_SIZE",
|
| 451 |
+
"NGRAM_PROMPT_LOOKUP_MAX", "NGRAM_PROMPT_LOOKUP_MIN", "SPEC_DECODING_ACCEPTANCE_METHOD",
|
| 452 |
+
"TYPICAL_ACCEPTANCE_SAMPLER_POSTERIOR_THRESHOLD", "TYPICAL_ACCEPTANCE_SAMPLER_POSTERIOR_ALPHA",
|
| 453 |
+
"MODEL_LOADER_EXTRA_CONFIG", "PREEMPTION_MODE", "PREEMPTION_CHECK_PERIOD",
|
| 454 |
+
"PREEMPTION_CPU_CAPACITY", "MAX_LOG_LEN", "DISABLE_LOGGING_REQUEST",
|
| 455 |
+
"ENABLE_AUTO_TOOL_CHOICE", "TOOL_CALL_PARSER"
|
| 456 |
+
]
|
| 457 |
+
},
|
| 458 |
+
{
|
| 459 |
+
"title": "Tokenizer Settings",
|
| 460 |
+
"settings": [
|
| 461 |
+
"TOKENIZER_NAME", "TOKENIZER_REVISION", "CUSTOM_CHAT_TEMPLATE"
|
| 462 |
+
]
|
| 463 |
+
},
|
| 464 |
+
{
|
| 465 |
+
"title": "System Settings",
|
| 466 |
+
"settings": [
|
| 467 |
+
"GPU_MEMORY_UTILIZATION", "MAX_PARALLEL_LOADING_WORKERS", "BLOCK_SIZE",
|
| 468 |
+
"SWAP_SPACE", "ENFORCE_EAGER", "MAX_SEQ_LEN_TO_CAPTURE", "DISABLE_CUSTOM_ALL_REDUCE"
|
| 469 |
+
]
|
| 470 |
+
},
|
| 471 |
+
{
|
| 472 |
+
"title": "Streaming Settings",
|
| 473 |
+
"settings": [
|
| 474 |
+
"DEFAULT_BATCH_SIZE", "DEFAULT_MIN_BATCH_SIZE", "DEFAULT_BATCH_SIZE_GROWTH_FACTOR"
|
| 475 |
+
]
|
| 476 |
+
},
|
| 477 |
+
{
|
| 478 |
+
"title": "OpenAI Settings",
|
| 479 |
+
"settings": [
|
| 480 |
+
"RAW_OPENAI_OUTPUT", "OPENAI_RESPONSE_ROLE", "OPENAI_SERVED_MODEL_NAME_OVERRIDE"
|
| 481 |
+
]
|
| 482 |
+
},
|
| 483 |
+
{
|
| 484 |
+
"title": "Serverless Settings",
|
| 485 |
+
"settings": [
|
| 486 |
+
"MAX_CONCURRENCY", "DISABLE_LOG_STATS", "DISABLE_LOG_REQUESTS"
|
| 487 |
+
]
|
| 488 |
+
}
|
| 489 |
+
]
|
| 490 |
+
},
|
| 491 |
+
"0.7.3": {
|
| 492 |
+
"imageName": "runpod/worker-v1-vllm:v2.1.0stable-cuda12.1.0",
|
| 493 |
+
"minimumCudaVersion": "12.1",
|
| 494 |
+
"categories": [
|
| 495 |
+
{
|
| 496 |
+
"title": "LLM Settings",
|
| 497 |
+
"settings": [
|
| 498 |
+
"TOKENIZER", "TOKENIZER_MODE", "SKIP_TOKENIZER_INIT", "TRUST_REMOTE_CODE",
|
| 499 |
+
"DOWNLOAD_DIR", "LOAD_FORMAT", "DTYPE", "KV_CACHE_DTYPE", "QUANTIZATION_PARAM_PATH",
|
| 500 |
+
"MAX_MODEL_LEN", "GUIDED_DECODING_BACKEND", "DISTRIBUTED_EXECUTOR_BACKEND",
|
| 501 |
+
"WORKER_USE_RAY", "RAY_WORKERS_USE_NSIGHT", "PIPELINE_PARALLEL_SIZE",
|
| 502 |
+
"TENSOR_PARALLEL_SIZE", "MAX_PARALLEL_LOADING_WORKERS", "ENABLE_PREFIX_CACHING",
|
| 503 |
+
"DISABLE_SLIDING_WINDOW", "NUM_LOOKAHEAD_SLOTS",
|
| 504 |
+
"SEED", "NUM_GPU_BLOCKS_OVERRIDE", "MAX_NUM_BATCHED_TOKENS", "MAX_NUM_SEQS",
|
| 505 |
+
"MAX_LOGPROBS", "DISABLE_LOG_STATS", "QUANTIZATION", "ROPE_SCALING", "ROPE_THETA",
|
| 506 |
+
"TOKENIZER_POOL_SIZE", "TOKENIZER_POOL_TYPE", "TOKENIZER_POOL_EXTRA_CONFIG",
|
| 507 |
+
"ENABLE_LORA", "MAX_LORAS", "MAX_LORA_RANK", "LORA_EXTRA_VOCAB_SIZE",
|
| 508 |
+
"LORA_DTYPE", "LONG_LORA_SCALING_FACTORS", "MAX_CPU_LORAS", "FULLY_SHARDED_LORAS",
|
| 509 |
+
"DEVICE", "SCHEDULER_DELAY_FACTOR", "ENABLE_CHUNKED_PREFILL", "SPECULATIVE_MODEL",
|
| 510 |
+
"NUM_SPECULATIVE_TOKENS", "SPECULATIVE_DRAFT_TENSOR_PARALLEL_SIZE",
|
| 511 |
+
"SPECULATIVE_MAX_MODEL_LEN", "SPECULATIVE_DISABLE_BY_BATCH_SIZE",
|
| 512 |
+
"NGRAM_PROMPT_LOOKUP_MAX", "NGRAM_PROMPT_LOOKUP_MIN", "SPEC_DECODING_ACCEPTANCE_METHOD",
|
| 513 |
+
"TYPICAL_ACCEPTANCE_SAMPLER_POSTERIOR_THRESHOLD", "TYPICAL_ACCEPTANCE_SAMPLER_POSTERIOR_ALPHA",
|
| 514 |
+
"MODEL_LOADER_EXTRA_CONFIG", "PREEMPTION_MODE", "PREEMPTION_CHECK_PERIOD",
|
| 515 |
+
"PREEMPTION_CPU_CAPACITY", "MAX_LOG_LEN", "DISABLE_LOGGING_REQUEST",
|
| 516 |
+
"ENABLE_AUTO_TOOL_CHOICE", "TOOL_CALL_PARSER"
|
| 517 |
+
]
|
| 518 |
+
},
|
| 519 |
+
{
|
| 520 |
+
"title": "Tokenizer Settings",
|
| 521 |
+
"settings": [
|
| 522 |
+
"TOKENIZER_NAME", "TOKENIZER_REVISION", "CUSTOM_CHAT_TEMPLATE"
|
| 523 |
+
]
|
| 524 |
+
},
|
| 525 |
+
{
|
| 526 |
+
"title": "System Settings",
|
| 527 |
+
"settings": [
|
| 528 |
+
"GPU_MEMORY_UTILIZATION", "MAX_PARALLEL_LOADING_WORKERS", "BLOCK_SIZE",
|
| 529 |
+
"SWAP_SPACE", "ENFORCE_EAGER", "MAX_SEQ_LEN_TO_CAPTURE", "DISABLE_CUSTOM_ALL_REDUCE"
|
| 530 |
+
]
|
| 531 |
+
},
|
| 532 |
+
{
|
| 533 |
+
"title": "Streaming Settings",
|
| 534 |
+
"settings": [
|
| 535 |
+
"DEFAULT_BATCH_SIZE", "DEFAULT_MIN_BATCH_SIZE", "DEFAULT_BATCH_SIZE_GROWTH_FACTOR"
|
| 536 |
+
]
|
| 537 |
+
},
|
| 538 |
+
{
|
| 539 |
+
"title": "OpenAI Settings",
|
| 540 |
+
"settings": [
|
| 541 |
+
"RAW_OPENAI_OUTPUT", "OPENAI_RESPONSE_ROLE", "OPENAI_SERVED_MODEL_NAME_OVERRIDE"
|
| 542 |
+
]
|
| 543 |
+
},
|
| 544 |
+
{
|
| 545 |
+
"title": "Serverless Settings",
|
| 546 |
+
"settings": [
|
| 547 |
+
"MAX_CONCURRENCY", "DISABLE_LOG_STATS", "DISABLE_LOG_REQUESTS"
|
| 548 |
+
]
|
| 549 |
+
}
|
| 550 |
+
]
|
| 551 |
+
},
|
| 552 |
+
"0.6.6": {
|
| 553 |
+
"imageName": "runpod/worker-v1-vllm:v1.8.0stable-cuda12.1.0",
|
| 554 |
+
"minimumCudaVersion": "12.1",
|
| 555 |
+
"categories": [
|
| 556 |
+
{
|
| 557 |
+
"title": "LLM Settings",
|
| 558 |
+
"settings": [
|
| 559 |
+
"TOKENIZER", "TOKENIZER_MODE", "SKIP_TOKENIZER_INIT", "TRUST_REMOTE_CODE",
|
| 560 |
+
"DOWNLOAD_DIR", "LOAD_FORMAT", "DTYPE", "KV_CACHE_DTYPE", "QUANTIZATION_PARAM_PATH",
|
| 561 |
+
"MAX_MODEL_LEN", "GUIDED_DECODING_BACKEND", "DISTRIBUTED_EXECUTOR_BACKEND",
|
| 562 |
+
"WORKER_USE_RAY", "RAY_WORKERS_USE_NSIGHT", "PIPELINE_PARALLEL_SIZE",
|
| 563 |
+
"TENSOR_PARALLEL_SIZE", "MAX_PARALLEL_LOADING_WORKERS", "ENABLE_PREFIX_CACHING",
|
| 564 |
+
"DISABLE_SLIDING_WINDOW", "NUM_LOOKAHEAD_SLOTS",
|
| 565 |
+
"SEED", "NUM_GPU_BLOCKS_OVERRIDE", "MAX_NUM_BATCHED_TOKENS", "MAX_NUM_SEQS",
|
| 566 |
+
"MAX_LOGPROBS", "DISABLE_LOG_STATS", "QUANTIZATION", "ROPE_SCALING", "ROPE_THETA",
|
| 567 |
+
"TOKENIZER_POOL_SIZE", "TOKENIZER_POOL_TYPE", "TOKENIZER_POOL_EXTRA_CONFIG",
|
| 568 |
+
"ENABLE_LORA", "MAX_LORAS", "MAX_LORA_RANK", "LORA_EXTRA_VOCAB_SIZE",
|
| 569 |
+
"LORA_DTYPE", "LONG_LORA_SCALING_FACTORS", "MAX_CPU_LORAS", "FULLY_SHARDED_LORAS",
|
| 570 |
+
"DEVICE", "SCHEDULER_DELAY_FACTOR", "ENABLE_CHUNKED_PREFILL", "SPECULATIVE_MODEL",
|
| 571 |
+
"NUM_SPECULATIVE_TOKENS", "SPECULATIVE_DRAFT_TENSOR_PARALLEL_SIZE",
|
| 572 |
+
"SPECULATIVE_MAX_MODEL_LEN", "SPECULATIVE_DISABLE_BY_BATCH_SIZE",
|
| 573 |
+
"NGRAM_PROMPT_LOOKUP_MAX", "NGRAM_PROMPT_LOOKUP_MIN", "SPEC_DECODING_ACCEPTANCE_METHOD",
|
| 574 |
+
"TYPICAL_ACCEPTANCE_SAMPLER_POSTERIOR_THRESHOLD", "TYPICAL_ACCEPTANCE_SAMPLER_POSTERIOR_ALPHA",
|
| 575 |
+
"MODEL_LOADER_EXTRA_CONFIG", "PREEMPTION_MODE", "PREEMPTION_CHECK_PERIOD",
|
| 576 |
+
"PREEMPTION_CPU_CAPACITY", "MAX_LOG_LEN", "DISABLE_LOGGING_REQUEST",
|
| 577 |
+
"ENABLE_AUTO_TOOL_CHOICE", "TOOL_CALL_PARSER"
|
| 578 |
+
]
|
| 579 |
+
},
|
| 580 |
+
{
|
| 581 |
+
"title": "Tokenizer Settings",
|
| 582 |
+
"settings": [
|
| 583 |
+
"TOKENIZER_NAME", "TOKENIZER_REVISION", "CUSTOM_CHAT_TEMPLATE"
|
| 584 |
+
]
|
| 585 |
+
},
|
| 586 |
+
{
|
| 587 |
+
"title": "System Settings",
|
| 588 |
+
"settings": [
|
| 589 |
+
"GPU_MEMORY_UTILIZATION", "MAX_PARALLEL_LOADING_WORKERS", "BLOCK_SIZE",
|
| 590 |
+
"SWAP_SPACE", "ENFORCE_EAGER", "MAX_SEQ_LEN_TO_CAPTURE", "DISABLE_CUSTOM_ALL_REDUCE"
|
| 591 |
+
]
|
| 592 |
+
},
|
| 593 |
+
{
|
| 594 |
+
"title": "Streaming Settings",
|
| 595 |
+
"settings": [
|
| 596 |
+
"DEFAULT_BATCH_SIZE", "DEFAULT_MIN_BATCH_SIZE", "DEFAULT_BATCH_SIZE_GROWTH_FACTOR"
|
| 597 |
+
]
|
| 598 |
+
},
|
| 599 |
+
{
|
| 600 |
+
"title": "OpenAI Settings",
|
| 601 |
+
"settings": [
|
| 602 |
+
"RAW_OPENAI_OUTPUT", "OPENAI_RESPONSE_ROLE", "OPENAI_SERVED_MODEL_NAME_OVERRIDE"
|
| 603 |
+
]
|
| 604 |
+
},
|
| 605 |
+
{
|
| 606 |
+
"title": "Serverless Settings",
|
| 607 |
+
"settings": [
|
| 608 |
+
"MAX_CONCURRENCY", "DISABLE_LOG_STATS", "DISABLE_LOG_REQUESTS"
|
| 609 |
+
]
|
| 610 |
+
}
|
| 611 |
+
]
|
| 612 |
+
},
|
| 613 |
+
"0.7.0": {
|
| 614 |
+
"imageName": "runpod/worker-v1-vllm:v1.9.0stable-cuda12.1.0",
|
| 615 |
+
"minimumCudaVersion": "12.1",
|
| 616 |
+
"categories": [
|
| 617 |
+
{
|
| 618 |
+
"title": "LLM Settings",
|
| 619 |
+
"settings": [
|
| 620 |
+
"TOKENIZER", "TOKENIZER_MODE", "SKIP_TOKENIZER_INIT", "TRUST_REMOTE_CODE",
|
| 621 |
+
"DOWNLOAD_DIR", "LOAD_FORMAT", "DTYPE", "KV_CACHE_DTYPE", "QUANTIZATION_PARAM_PATH",
|
| 622 |
+
"MAX_MODEL_LEN", "GUIDED_DECODING_BACKEND", "DISTRIBUTED_EXECUTOR_BACKEND",
|
| 623 |
+
"WORKER_USE_RAY", "RAY_WORKERS_USE_NSIGHT", "PIPELINE_PARALLEL_SIZE",
|
| 624 |
+
"TENSOR_PARALLEL_SIZE", "MAX_PARALLEL_LOADING_WORKERS", "ENABLE_PREFIX_CACHING",
|
| 625 |
+
"DISABLE_SLIDING_WINDOW", "NUM_LOOKAHEAD_SLOTS",
|
| 626 |
+
"SEED", "NUM_GPU_BLOCKS_OVERRIDE", "MAX_NUM_BATCHED_TOKENS", "MAX_NUM_SEQS",
|
| 627 |
+
"MAX_LOGPROBS", "DISABLE_LOG_STATS", "QUANTIZATION", "ROPE_SCALING", "ROPE_THETA",
|
| 628 |
+
"TOKENIZER_POOL_SIZE", "TOKENIZER_POOL_TYPE", "TOKENIZER_POOL_EXTRA_CONFIG",
|
| 629 |
+
"ENABLE_LORA", "MAX_LORAS", "MAX_LORA_RANK", "LORA_EXTRA_VOCAB_SIZE",
|
| 630 |
+
"LORA_DTYPE", "LONG_LORA_SCALING_FACTORS", "MAX_CPU_LORAS", "FULLY_SHARDED_LORAS",
|
| 631 |
+
"DEVICE", "SCHEDULER_DELAY_FACTOR", "ENABLE_CHUNKED_PREFILL", "SPECULATIVE_MODEL",
|
| 632 |
+
"NUM_SPECULATIVE_TOKENS", "SPECULATIVE_DRAFT_TENSOR_PARALLEL_SIZE",
|
| 633 |
+
"SPECULATIVE_MAX_MODEL_LEN", "SPECULATIVE_DISABLE_BY_BATCH_SIZE",
|
| 634 |
+
"NGRAM_PROMPT_LOOKUP_MAX", "NGRAM_PROMPT_LOOKUP_MIN", "SPEC_DECODING_ACCEPTANCE_METHOD",
|
| 635 |
+
"TYPICAL_ACCEPTANCE_SAMPLER_POSTERIOR_THRESHOLD", "TYPICAL_ACCEPTANCE_SAMPLER_POSTERIOR_ALPHA",
|
| 636 |
+
"MODEL_LOADER_EXTRA_CONFIG", "PREEMPTION_MODE", "PREEMPTION_CHECK_PERIOD",
|
| 637 |
+
"PREEMPTION_CPU_CAPACITY", "MAX_LOG_LEN", "DISABLE_LOGGING_REQUEST",
|
| 638 |
+
"ENABLE_AUTO_TOOL_CHOICE", "TOOL_CALL_PARSER"
|
| 639 |
+
]
|
| 640 |
+
},
|
| 641 |
+
{
|
| 642 |
+
"title": "Tokenizer Settings",
|
| 643 |
+
"settings": [
|
| 644 |
+
"TOKENIZER_NAME", "TOKENIZER_REVISION", "CUSTOM_CHAT_TEMPLATE"
|
| 645 |
+
]
|
| 646 |
+
},
|
| 647 |
+
{
|
| 648 |
+
"title": "System Settings",
|
| 649 |
+
"settings": [
|
| 650 |
+
"GPU_MEMORY_UTILIZATION", "MAX_PARALLEL_LOADING_WORKERS", "BLOCK_SIZE",
|
| 651 |
+
"SWAP_SPACE", "ENFORCE_EAGER", "MAX_SEQ_LEN_TO_CAPTURE", "DISABLE_CUSTOM_ALL_REDUCE"
|
| 652 |
+
]
|
| 653 |
+
},
|
| 654 |
+
{
|
| 655 |
+
"title": "Streaming Settings",
|
| 656 |
+
"settings": [
|
| 657 |
+
"DEFAULT_BATCH_SIZE", "DEFAULT_MIN_BATCH_SIZE", "DEFAULT_BATCH_SIZE_GROWTH_FACTOR"
|
| 658 |
+
]
|
| 659 |
+
},
|
| 660 |
+
{
|
| 661 |
+
"title": "OpenAI Settings",
|
| 662 |
+
"settings": [
|
| 663 |
+
"RAW_OPENAI_OUTPUT", "OPENAI_RESPONSE_ROLE", "OPENAI_SERVED_MODEL_NAME_OVERRIDE"
|
| 664 |
+
]
|
| 665 |
+
},
|
| 666 |
+
{
|
| 667 |
+
"title": "Serverless Settings",
|
| 668 |
+
"settings": [
|
| 669 |
+
"MAX_CONCURRENCY", "DISABLE_LOG_STATS", "DISABLE_LOG_REQUESTS"
|
| 670 |
+
]
|
| 671 |
+
}
|
| 672 |
+
]
|
| 673 |
+
},
|
| 674 |
+
"0.6.4": {
|
| 675 |
+
"imageName": "runpod/worker-v1-vllm:v1.7.0stable-cuda12.1.0",
|
| 676 |
+
"minimumCudaVersion": "12.1",
|
| 677 |
+
"categories": [
|
| 678 |
+
{
|
| 679 |
+
"title": "LLM Settings",
|
| 680 |
+
"settings": [
|
| 681 |
+
"TOKENIZER", "TOKENIZER_MODE", "SKIP_TOKENIZER_INIT", "TRUST_REMOTE_CODE",
|
| 682 |
+
"DOWNLOAD_DIR", "LOAD_FORMAT", "DTYPE", "KV_CACHE_DTYPE", "QUANTIZATION_PARAM_PATH",
|
| 683 |
+
"MAX_MODEL_LEN", "GUIDED_DECODING_BACKEND", "DISTRIBUTED_EXECUTOR_BACKEND",
|
| 684 |
+
"WORKER_USE_RAY", "RAY_WORKERS_USE_NSIGHT", "PIPELINE_PARALLEL_SIZE",
|
| 685 |
+
"TENSOR_PARALLEL_SIZE", "MAX_PARALLEL_LOADING_WORKERS", "ENABLE_PREFIX_CACHING",
|
| 686 |
+
"DISABLE_SLIDING_WINDOW", "NUM_LOOKAHEAD_SLOTS",
|
| 687 |
+
"SEED", "NUM_GPU_BLOCKS_OVERRIDE", "MAX_NUM_BATCHED_TOKENS", "MAX_NUM_SEQS",
|
| 688 |
+
"MAX_LOGPROBS", "DISABLE_LOG_STATS", "QUANTIZATION", "ROPE_SCALING", "ROPE_THETA",
|
| 689 |
+
"TOKENIZER_POOL_SIZE", "TOKENIZER_POOL_TYPE", "TOKENIZER_POOL_EXTRA_CONFIG",
|
| 690 |
+
"ENABLE_LORA", "MAX_LORAS", "MAX_LORA_RANK", "LORA_EXTRA_VOCAB_SIZE",
|
| 691 |
+
"LORA_DTYPE", "LONG_LORA_SCALING_FACTORS", "MAX_CPU_LORAS", "FULLY_SHARDED_LORAS",
|
| 692 |
+
"DEVICE", "SCHEDULER_DELAY_FACTOR", "ENABLE_CHUNKED_PREFILL", "SPECULATIVE_MODEL",
|
| 693 |
+
"NUM_SPECULATIVE_TOKENS", "SPECULATIVE_DRAFT_TENSOR_PARALLEL_SIZE",
|
| 694 |
+
"SPECULATIVE_MAX_MODEL_LEN", "SPECULATIVE_DISABLE_BY_BATCH_SIZE",
|
| 695 |
+
"NGRAM_PROMPT_LOOKUP_MAX", "NGRAM_PROMPT_LOOKUP_MIN", "SPEC_DECODING_ACCEPTANCE_METHOD",
|
| 696 |
+
"TYPICAL_ACCEPTANCE_SAMPLER_POSTERIOR_THRESHOLD", "TYPICAL_ACCEPTANCE_SAMPLER_POSTERIOR_ALPHA",
|
| 697 |
+
"MODEL_LOADER_EXTRA_CONFIG", "PREEMPTION_MODE", "PREEMPTION_CHECK_PERIOD",
|
| 698 |
+
"PREEMPTION_CPU_CAPACITY", "MAX_LOG_LEN", "DISABLE_LOGGING_REQUEST",
|
| 699 |
+
"ENABLE_AUTO_TOOL_CHOICE", "TOOL_CALL_PARSER"
|
| 700 |
+
]
|
| 701 |
+
},
|
| 702 |
+
{
|
| 703 |
+
"title": "Tokenizer Settings",
|
| 704 |
+
"settings": [
|
| 705 |
+
"TOKENIZER_NAME", "TOKENIZER_REVISION", "CUSTOM_CHAT_TEMPLATE"
|
| 706 |
+
]
|
| 707 |
+
},
|
| 708 |
+
{
|
| 709 |
+
"title": "System Settings",
|
| 710 |
+
"settings": [
|
| 711 |
+
"GPU_MEMORY_UTILIZATION", "MAX_PARALLEL_LOADING_WORKERS", "BLOCK_SIZE",
|
| 712 |
+
"SWAP_SPACE", "ENFORCE_EAGER", "MAX_SEQ_LEN_TO_CAPTURE", "DISABLE_CUSTOM_ALL_REDUCE"
|
| 713 |
+
]
|
| 714 |
+
},
|
| 715 |
+
{
|
| 716 |
+
"title": "Streaming Settings",
|
| 717 |
+
"settings": [
|
| 718 |
+
"DEFAULT_BATCH_SIZE", "DEFAULT_MIN_BATCH_SIZE", "DEFAULT_BATCH_SIZE_GROWTH_FACTOR"
|
| 719 |
+
]
|
| 720 |
+
},
|
| 721 |
+
{
|
| 722 |
+
"title": "OpenAI Settings",
|
| 723 |
+
"settings": [
|
| 724 |
+
"RAW_OPENAI_OUTPUT", "OPENAI_RESPONSE_ROLE", "OPENAI_SERVED_MODEL_NAME_OVERRIDE"
|
| 725 |
+
]
|
| 726 |
+
},
|
| 727 |
+
{
|
| 728 |
+
"title": "Serverless Settings",
|
| 729 |
+
"settings": [
|
| 730 |
+
"MAX_CONCURRENCY", "DISABLE_LOG_STATS", "DISABLE_LOG_REQUESTS"
|
| 731 |
+
]
|
| 732 |
+
}
|
| 733 |
+
]
|
| 734 |
+
},
|
| 735 |
+
"0.6.3": {
|
| 736 |
+
"imageName": "runpod/worker-v1-vllm:v1.6.0stable-cuda12.1.0",
|
| 737 |
+
"minimumCudaVersion": "12.1",
|
| 738 |
+
"categories": [
|
| 739 |
+
{
|
| 740 |
+
"title": "LLM Settings",
|
| 741 |
+
"settings": [
|
| 742 |
+
"TOKENIZER", "TOKENIZER_MODE", "SKIP_TOKENIZER_INIT", "TRUST_REMOTE_CODE",
|
| 743 |
+
"DOWNLOAD_DIR", "LOAD_FORMAT", "DTYPE", "KV_CACHE_DTYPE", "QUANTIZATION_PARAM_PATH",
|
| 744 |
+
"MAX_MODEL_LEN", "GUIDED_DECODING_BACKEND", "DISTRIBUTED_EXECUTOR_BACKEND",
|
| 745 |
+
"WORKER_USE_RAY", "RAY_WORKERS_USE_NSIGHT", "PIPELINE_PARALLEL_SIZE",
|
| 746 |
+
"TENSOR_PARALLEL_SIZE", "MAX_PARALLEL_LOADING_WORKERS", "ENABLE_PREFIX_CACHING",
|
| 747 |
+
"DISABLE_SLIDING_WINDOW", "NUM_LOOKAHEAD_SLOTS",
|
| 748 |
+
"SEED", "NUM_GPU_BLOCKS_OVERRIDE", "MAX_NUM_BATCHED_TOKENS", "MAX_NUM_SEQS",
|
| 749 |
+
"MAX_LOGPROBS", "DISABLE_LOG_STATS", "QUANTIZATION", "ROPE_SCALING", "ROPE_THETA",
|
| 750 |
+
"TOKENIZER_POOL_SIZE", "TOKENIZER_POOL_TYPE", "TOKENIZER_POOL_EXTRA_CONFIG",
|
| 751 |
+
"ENABLE_LORA", "MAX_LORAS", "MAX_LORA_RANK", "LORA_EXTRA_VOCAB_SIZE",
|
| 752 |
+
"LORA_DTYPE", "LONG_LORA_SCALING_FACTORS", "MAX_CPU_LORAS", "FULLY_SHARDED_LORAS",
|
| 753 |
+
"DEVICE", "SCHEDULER_DELAY_FACTOR", "ENABLE_CHUNKED_PREFILL", "SPECULATIVE_MODEL",
|
| 754 |
+
"NUM_SPECULATIVE_TOKENS", "SPECULATIVE_DRAFT_TENSOR_PARALLEL_SIZE",
|
| 755 |
+
"SPECULATIVE_MAX_MODEL_LEN", "SPECULATIVE_DISABLE_BY_BATCH_SIZE",
|
| 756 |
+
"NGRAM_PROMPT_LOOKUP_MAX", "NGRAM_PROMPT_LOOKUP_MIN", "SPEC_DECODING_ACCEPTANCE_METHOD",
|
| 757 |
+
"TYPICAL_ACCEPTANCE_SAMPLER_POSTERIOR_THRESHOLD", "TYPICAL_ACCEPTANCE_SAMPLER_POSTERIOR_ALPHA",
|
| 758 |
+
"MODEL_LOADER_EXTRA_CONFIG", "PREEMPTION_MODE", "PREEMPTION_CHECK_PERIOD",
|
| 759 |
+
"PREEMPTION_CPU_CAPACITY", "MAX_LOG_LEN", "DISABLE_LOGGING_REQUEST",
|
| 760 |
+
"ENABLE_AUTO_TOOL_CHOICE", "TOOL_CALL_PARSER"
|
| 761 |
+
]
|
| 762 |
+
},
|
| 763 |
+
{
|
| 764 |
+
"title": "Tokenizer Settings",
|
| 765 |
+
"settings": [
|
| 766 |
+
"TOKENIZER_NAME", "TOKENIZER_REVISION", "CUSTOM_CHAT_TEMPLATE"
|
| 767 |
+
]
|
| 768 |
+
},
|
| 769 |
+
{
|
| 770 |
+
"title": "System Settings",
|
| 771 |
+
"settings": [
|
| 772 |
+
"GPU_MEMORY_UTILIZATION", "MAX_PARALLEL_LOADING_WORKERS", "BLOCK_SIZE",
|
| 773 |
+
"SWAP_SPACE", "ENFORCE_EAGER", "MAX_SEQ_LEN_TO_CAPTURE", "DISABLE_CUSTOM_ALL_REDUCE"
|
| 774 |
+
]
|
| 775 |
+
},
|
| 776 |
+
{
|
| 777 |
+
"title": "Streaming Settings",
|
| 778 |
+
"settings": [
|
| 779 |
+
"DEFAULT_BATCH_SIZE", "DEFAULT_MIN_BATCH_SIZE", "DEFAULT_BATCH_SIZE_GROWTH_FACTOR"
|
| 780 |
+
]
|
| 781 |
+
},
|
| 782 |
+
{
|
| 783 |
+
"title": "OpenAI Settings",
|
| 784 |
+
"settings": [
|
| 785 |
+
"RAW_OPENAI_OUTPUT", "OPENAI_RESPONSE_ROLE", "OPENAI_SERVED_MODEL_NAME_OVERRIDE"
|
| 786 |
+
]
|
| 787 |
+
},
|
| 788 |
+
{
|
| 789 |
+
"title": "Serverless Settings",
|
| 790 |
+
"settings": [
|
| 791 |
+
"MAX_CONCURRENCY", "DISABLE_LOG_STATS", "DISABLE_LOG_REQUESTS"
|
| 792 |
+
]
|
| 793 |
+
}
|
| 794 |
+
]
|
| 795 |
+
}
|
| 796 |
+
},
|
| 797 |
+
"schema": {
|
| 798 |
+
"TOKENIZER": {
|
| 799 |
+
"env_var_name": "TOKENIZER",
|
| 800 |
+
"value": "",
|
| 801 |
+
"title": "Tokenizer",
|
| 802 |
+
"description": "Name or path of the Hugging Face tokenizer to use.",
|
| 803 |
+
"required": false,
|
| 804 |
+
"type": "text"
|
| 805 |
+
},
|
| 806 |
+
"TOKENIZER_MODE": {
|
| 807 |
+
"env_var_name": "TOKENIZER_MODE",
|
| 808 |
+
"value": "auto",
|
| 809 |
+
"title": "Tokenizer Mode",
|
| 810 |
+
"description": "The tokenizer mode.",
|
| 811 |
+
"required": false,
|
| 812 |
+
"type": "select",
|
| 813 |
+
"options": [
|
| 814 |
+
{ "value": "auto", "label": "auto" },
|
| 815 |
+
{ "value": "slow", "label": "slow" }
|
| 816 |
+
]
|
| 817 |
+
},
|
| 818 |
+
"SKIP_TOKENIZER_INIT": {
|
| 819 |
+
"env_var_name": "SKIP_TOKENIZER_INIT",
|
| 820 |
+
"value": false,
|
| 821 |
+
"title": "Skip Tokenizer Init",
|
| 822 |
+
"description": "Skip initialization of tokenizer and detokenizer.",
|
| 823 |
+
"required": false,
|
| 824 |
+
"type": "toggle"
|
| 825 |
+
},
|
| 826 |
+
"TRUST_REMOTE_CODE": {
|
| 827 |
+
"env_var_name": "TRUST_REMOTE_CODE",
|
| 828 |
+
"value": false,
|
| 829 |
+
"title": "Trust Remote Code",
|
| 830 |
+
"description": "Trust remote code from Hugging Face.",
|
| 831 |
+
"required": false,
|
| 832 |
+
"type": "toggle"
|
| 833 |
+
},
|
| 834 |
+
"DOWNLOAD_DIR": {
|
| 835 |
+
"env_var_name": "DOWNLOAD_DIR",
|
| 836 |
+
"value": "",
|
| 837 |
+
"title": "Download Directory",
|
| 838 |
+
"description": "Directory to download and load the weights.",
|
| 839 |
+
"required": false,
|
| 840 |
+
"type": "text"
|
| 841 |
+
},
|
| 842 |
+
"LOAD_FORMAT": {
|
| 843 |
+
"env_var_name": "LOAD_FORMAT",
|
| 844 |
+
"value": "auto",
|
| 845 |
+
"title": "Load Format",
|
| 846 |
+
"description": "The format of the model weights to load.",
|
| 847 |
+
"required": false,
|
| 848 |
+
"type": "select",
|
| 849 |
+
"options": [
|
| 850 |
+
{ "value": "auto", "label": "auto" },
|
| 851 |
+
{ "value": "pt", "label": "pt" },
|
| 852 |
+
{ "value": "safetensors", "label": "safetensors" },
|
| 853 |
+
{ "value": "npcache", "label": "npcache" },
|
| 854 |
+
{ "value": "dummy", "label": "dummy" },
|
| 855 |
+
{ "value": "tensorizer", "label": "tensorizer" },
|
| 856 |
+
{ "value": "bitsandbytes", "label": "bitsandbytes" }
|
| 857 |
+
]
|
| 858 |
+
},
|
| 859 |
+
"DTYPE": {
|
| 860 |
+
"env_var_name": "DTYPE",
|
| 861 |
+
"value": "auto",
|
| 862 |
+
"title": "Data Type",
|
| 863 |
+
"description": "Data type for model weights and activations.",
|
| 864 |
+
"required": false,
|
| 865 |
+
"type": "select",
|
| 866 |
+
"options": [
|
| 867 |
+
{ "value": "auto", "label": "auto" },
|
| 868 |
+
{ "value": "half", "label": "half" },
|
| 869 |
+
{ "value": "float16", "label": "float16" },
|
| 870 |
+
{ "value": "bfloat16", "label": "bfloat16" },
|
| 871 |
+
{ "value": "float", "label": "float" },
|
| 872 |
+
{ "value": "float32", "label": "float32" }
|
| 873 |
+
]
|
| 874 |
+
},
|
| 875 |
+
"KV_CACHE_DTYPE": {
|
| 876 |
+
"env_var_name": "KV_CACHE_DTYPE",
|
| 877 |
+
"value": "auto",
|
| 878 |
+
"title": "KV Cache Data Type",
|
| 879 |
+
"description": "Data type for KV cache storage.",
|
| 880 |
+
"required": false,
|
| 881 |
+
"type": "select",
|
| 882 |
+
"options": [
|
| 883 |
+
{ "value": "auto", "label": "auto" },
|
| 884 |
+
{ "value": "fp8", "label": "fp8" }
|
| 885 |
+
]
|
| 886 |
+
},
|
| 887 |
+
"QUANTIZATION_PARAM_PATH": {
|
| 888 |
+
"env_var_name": "QUANTIZATION_PARAM_PATH",
|
| 889 |
+
"value": "",
|
| 890 |
+
"title": "Quantization Param Path",
|
| 891 |
+
"description": "Path to the JSON file containing the KV cache scaling factors.",
|
| 892 |
+
"required": false,
|
| 893 |
+
"type": "text"
|
| 894 |
+
},
|
| 895 |
+
"MAX_MODEL_LEN": {
|
| 896 |
+
"env_var_name": "MAX_MODEL_LEN",
|
| 897 |
+
"value": "",
|
| 898 |
+
"title": "Max Model Length",
|
| 899 |
+
"description": "Model context length.",
|
| 900 |
+
"required": false,
|
| 901 |
+
"type": "number"
|
| 902 |
+
},
|
| 903 |
+
"GUIDED_DECODING_BACKEND": {
|
| 904 |
+
"env_var_name": "GUIDED_DECODING_BACKEND",
|
| 905 |
+
"value": "outlines",
|
| 906 |
+
"title": "Guided Decoding Backend",
|
| 907 |
+
"description": "Which engine will be used for guided decoding by default.",
|
| 908 |
+
"required": false,
|
| 909 |
+
"type": "select",
|
| 910 |
+
"options": [
|
| 911 |
+
{ "value": "outlines", "label": "outlines" },
|
| 912 |
+
{ "value": "lm-format-enforcer", "label": "lm-format-enforcer" }
|
| 913 |
+
]
|
| 914 |
+
},
|
| 915 |
+
"DISTRIBUTED_EXECUTOR_BACKEND": {
|
| 916 |
+
"env_var_name": "DISTRIBUTED_EXECUTOR_BACKEND",
|
| 917 |
+
"value": "",
|
| 918 |
+
"title": "Distributed Executor Backend",
|
| 919 |
+
"description": "Backend to use for distributed serving.",
|
| 920 |
+
"required": false,
|
| 921 |
+
"type": "select",
|
| 922 |
+
"options": [
|
| 923 |
+
{ "value": "ray", "label": "ray" },
|
| 924 |
+
{ "value": "mp", "label": "mp" }
|
| 925 |
+
]
|
| 926 |
+
},
|
| 927 |
+
"WORKER_USE_RAY": {
|
| 928 |
+
"env_var_name": "WORKER_USE_RAY",
|
| 929 |
+
"value": false,
|
| 930 |
+
"title": "Worker Use Ray",
|
| 931 |
+
"description": "Deprecated, use --distributed-executor-backend=ray.",
|
| 932 |
+
"required": false,
|
| 933 |
+
"type": "toggle"
|
| 934 |
+
},
|
| 935 |
+
"RAY_WORKERS_USE_NSIGHT": {
|
| 936 |
+
"env_var_name": "RAY_WORKERS_USE_NSIGHT",
|
| 937 |
+
"value": false,
|
| 938 |
+
"title": "Ray Workers Use Nsight",
|
| 939 |
+
"description": "If specified, use nsight to profile Ray workers.",
|
| 940 |
+
"required": false,
|
| 941 |
+
"type": "toggle"
|
| 942 |
+
},
|
| 943 |
+
"PIPELINE_PARALLEL_SIZE": {
|
| 944 |
+
"env_var_name": "PIPELINE_PARALLEL_SIZE",
|
| 945 |
+
"value": 1,
|
| 946 |
+
"title": "Pipeline Parallel Size",
|
| 947 |
+
"description": "Number of pipeline stages.",
|
| 948 |
+
"required": false,
|
| 949 |
+
"type": "number"
|
| 950 |
+
},
|
| 951 |
+
"TENSOR_PARALLEL_SIZE": {
|
| 952 |
+
"env_var_name": "TENSOR_PARALLEL_SIZE",
|
| 953 |
+
"value": 1,
|
| 954 |
+
"title": "Tensor Parallel Size",
|
| 955 |
+
"description": "Number of tensor parallel replicas.",
|
| 956 |
+
"required": false,
|
| 957 |
+
"type": "number"
|
| 958 |
+
},
|
| 959 |
+
"MAX_PARALLEL_LOADING_WORKERS": {
|
| 960 |
+
"env_var_name": "MAX_PARALLEL_LOADING_WORKERS",
|
| 961 |
+
"value": "",
|
| 962 |
+
"title": "Max Parallel Loading Workers",
|
| 963 |
+
"description": "Load model sequentially in multiple batches.",
|
| 964 |
+
"required": false,
|
| 965 |
+
"type": "number"
|
| 966 |
+
},
|
| 967 |
+
"ENABLE_PREFIX_CACHING": {
|
| 968 |
+
"env_var_name": "ENABLE_PREFIX_CACHING",
|
| 969 |
+
"value": false,
|
| 970 |
+
"title": "Enable Prefix Caching",
|
| 971 |
+
"description": "Enables automatic prefix caching.",
|
| 972 |
+
"required": false,
|
| 973 |
+
"type": "toggle"
|
| 974 |
+
},
|
| 975 |
+
"DISABLE_SLIDING_WINDOW": {
|
| 976 |
+
"env_var_name": "DISABLE_SLIDING_WINDOW",
|
| 977 |
+
"value": false,
|
| 978 |
+
"title": "Disable Sliding Window",
|
| 979 |
+
"description": "Disables sliding window, capping to sliding window size.",
|
| 980 |
+
"required": false,
|
| 981 |
+
"type": "toggle"
|
| 982 |
+
},
|
| 983 |
+
"USE_V2_BLOCK_MANAGER": {
|
| 984 |
+
"env_var_name": "USE_V2_BLOCK_MANAGER",
|
| 985 |
+
"value": false,
|
| 986 |
+
"title": "Use V2 Block Manager",
|
| 987 |
+
"description": "Use BlockSpaceMangerV2.",
|
| 988 |
+
"required": false,
|
| 989 |
+
"type": "toggle"
|
| 990 |
+
},
|
| 991 |
+
"NUM_LOOKAHEAD_SLOTS": {
|
| 992 |
+
"env_var_name": "NUM_LOOKAHEAD_SLOTS",
|
| 993 |
+
"value": 0,
|
| 994 |
+
"title": "Num Lookahead Slots",
|
| 995 |
+
"description": "Experimental scheduling config necessary for speculative decoding.",
|
| 996 |
+
"required": false,
|
| 997 |
+
"type": "number"
|
| 998 |
+
},
|
| 999 |
+
"SEED": {
|
| 1000 |
+
"env_var_name": "SEED",
|
| 1001 |
+
"value": 0,
|
| 1002 |
+
"title": "Seed",
|
| 1003 |
+
"description": "Random seed for operations.",
|
| 1004 |
+
"required": false,
|
| 1005 |
+
"type": "number"
|
| 1006 |
+
},
|
| 1007 |
+
"NUM_GPU_BLOCKS_OVERRIDE": {
|
| 1008 |
+
"env_var_name": "NUM_GPU_BLOCKS_OVERRIDE",
|
| 1009 |
+
"value": "",
|
| 1010 |
+
"title": "Num GPU Blocks Override",
|
| 1011 |
+
"description": "If specified, ignore GPU profiling result and use this number of GPU blocks.",
|
| 1012 |
+
"required": false,
|
| 1013 |
+
"type": "number"
|
| 1014 |
+
},
|
| 1015 |
+
"MAX_NUM_BATCHED_TOKENS": {
|
| 1016 |
+
"env_var_name": "MAX_NUM_BATCHED_TOKENS",
|
| 1017 |
+
"value": "",
|
| 1018 |
+
"title": "Max Num Batched Tokens",
|
| 1019 |
+
"description": "Maximum number of batched tokens per iteration.",
|
| 1020 |
+
"required": false,
|
| 1021 |
+
"type": "number"
|
| 1022 |
+
},
|
| 1023 |
+
"MAX_NUM_SEQS": {
|
| 1024 |
+
"env_var_name": "MAX_NUM_SEQS",
|
| 1025 |
+
"value": 256,
|
| 1026 |
+
"title": "Max Num Seqs",
|
| 1027 |
+
"description": "Maximum number of sequences per iteration.",
|
| 1028 |
+
"required": false,
|
| 1029 |
+
"type": "number"
|
| 1030 |
+
},
|
| 1031 |
+
"MAX_LOGPROBS": {
|
| 1032 |
+
"env_var_name": "MAX_LOGPROBS",
|
| 1033 |
+
"value": 20,
|
| 1034 |
+
"title": "Max Logprobs",
|
| 1035 |
+
"description": "Max number of log probs to return when logprobs is specified in SamplingParams.",
|
| 1036 |
+
"required": false,
|
| 1037 |
+
"type": "number"
|
| 1038 |
+
},
|
| 1039 |
+
"DISABLE_LOG_STATS": {
|
| 1040 |
+
"env_var_name": "DISABLE_LOG_STATS",
|
| 1041 |
+
"value": false,
|
| 1042 |
+
"title": "Disable Log Stats",
|
| 1043 |
+
"description": "Disable logging statistics.",
|
| 1044 |
+
"required": false,
|
| 1045 |
+
"type": "toggle"
|
| 1046 |
+
},
|
| 1047 |
+
"QUANTIZATION": {
|
| 1048 |
+
"env_var_name": "QUANTIZATION",
|
| 1049 |
+
"value": "",
|
| 1050 |
+
"title": "Quantization",
|
| 1051 |
+
"description": "Method used to quantize the weights.\nif the `Load Format` is 'bitsandbytes' then `Quantization` will be forced to 'bitsandbytes'",
|
| 1052 |
+
"required": false,
|
| 1053 |
+
"type": "select",
|
| 1054 |
+
"options": [
|
| 1055 |
+
{ "value": "None", "label": "None" },
|
| 1056 |
+
{ "value": "awq", "label": "AWQ" },
|
| 1057 |
+
{ "value": "squeezellm", "label": "SqueezeLLM" },
|
| 1058 |
+
{ "value": "gptq", "label": "GPTQ" },
|
| 1059 |
+
{ "value": "bitsandbytes", "label": "bitsandbytes" }
|
| 1060 |
+
]
|
| 1061 |
+
},
|
| 1062 |
+
"ROPE_SCALING": {
|
| 1063 |
+
"env_var_name": "ROPE_SCALING",
|
| 1064 |
+
"value": "",
|
| 1065 |
+
"title": "RoPE Scaling",
|
| 1066 |
+
"description": "RoPE scaling configuration in JSON format.",
|
| 1067 |
+
"required": false,
|
| 1068 |
+
"type": "text"
|
| 1069 |
+
},
|
| 1070 |
+
"ROPE_THETA": {
|
| 1071 |
+
"env_var_name": "ROPE_THETA",
|
| 1072 |
+
"value": "",
|
| 1073 |
+
"title": "RoPE Theta",
|
| 1074 |
+
"description": "RoPE theta. Use with rope_scaling.",
|
| 1075 |
+
"required": false,
|
| 1076 |
+
"type": "number"
|
| 1077 |
+
},
|
| 1078 |
+
"TOKENIZER_POOL_SIZE": {
|
| 1079 |
+
"env_var_name": "TOKENIZER_POOL_SIZE",
|
| 1080 |
+
"value": 0,
|
| 1081 |
+
"title": "Tokenizer Pool Size",
|
| 1082 |
+
"description": "Size of tokenizer pool to use for asynchronous tokenization.",
|
| 1083 |
+
"required": false,
|
| 1084 |
+
"type": "number"
|
| 1085 |
+
},
|
| 1086 |
+
"TOKENIZER_POOL_TYPE": {
|
| 1087 |
+
"env_var_name": "TOKENIZER_POOL_TYPE",
|
| 1088 |
+
"value": "ray",
|
| 1089 |
+
"title": "Tokenizer Pool Type",
|
| 1090 |
+
"description": "Type of tokenizer pool to use for asynchronous tokenization.",
|
| 1091 |
+
"required": false,
|
| 1092 |
+
"type": "text"
|
| 1093 |
+
},
|
| 1094 |
+
"TOKENIZER_POOL_EXTRA_CONFIG": {
|
| 1095 |
+
"env_var_name": "TOKENIZER_POOL_EXTRA_CONFIG",
|
| 1096 |
+
"value": "",
|
| 1097 |
+
"title": "Tokenizer Pool Extra Config",
|
| 1098 |
+
"description": "Extra config for tokenizer pool.",
|
| 1099 |
+
"required": false,
|
| 1100 |
+
"type": "text"
|
| 1101 |
+
},
|
| 1102 |
+
"ENABLE_LORA": {
|
| 1103 |
+
"env_var_name": "ENABLE_LORA",
|
| 1104 |
+
"value": false,
|
| 1105 |
+
"title": "Enable LoRA",
|
| 1106 |
+
"description": "If True, enable handling of LoRA adapters.",
|
| 1107 |
+
"required": false,
|
| 1108 |
+
"type": "toggle"
|
| 1109 |
+
},
|
| 1110 |
+
"MAX_LORAS": {
|
| 1111 |
+
"env_var_name": "MAX_LORAS",
|
| 1112 |
+
"value": 1,
|
| 1113 |
+
"title": "Max LoRAs",
|
| 1114 |
+
"description": "Max number of LoRAs in a single batch.",
|
| 1115 |
+
"required": false,
|
| 1116 |
+
"type": "number"
|
| 1117 |
+
},
|
| 1118 |
+
"MAX_LORA_RANK": {
|
| 1119 |
+
"env_var_name": "MAX_LORA_RANK",
|
| 1120 |
+
"value": 16,
|
| 1121 |
+
"title": "Max LoRA Rank",
|
| 1122 |
+
"description": "Max LoRA rank.",
|
| 1123 |
+
"required": false,
|
| 1124 |
+
"type": "number"
|
| 1125 |
+
},
|
| 1126 |
+
"LORA_EXTRA_VOCAB_SIZE": {
|
| 1127 |
+
"env_var_name": "LORA_EXTRA_VOCAB_SIZE",
|
| 1128 |
+
"value": 256,
|
| 1129 |
+
"title": "LoRA Extra Vocab Size",
|
| 1130 |
+
"description": "Maximum size of extra vocabulary for LoRA adapters.",
|
| 1131 |
+
"required": false,
|
| 1132 |
+
"type": "number"
|
| 1133 |
+
},
|
| 1134 |
+
"LORA_DTYPE": {
|
| 1135 |
+
"env_var_name": "LORA_DTYPE",
|
| 1136 |
+
"value": "auto",
|
| 1137 |
+
"title": "LoRA Data Type",
|
| 1138 |
+
"description": "Data type for LoRA.",
|
| 1139 |
+
"required": false,
|
| 1140 |
+
"type": "select",
|
| 1141 |
+
"options": [
|
| 1142 |
+
{ "value": "auto", "label": "auto" },
|
| 1143 |
+
{ "value": "float16", "label": "float16" },
|
| 1144 |
+
{ "value": "bfloat16", "label": "bfloat16" },
|
| 1145 |
+
{ "value": "float32", "label": "float32" }
|
| 1146 |
+
]
|
| 1147 |
+
},
|
| 1148 |
+
"LONG_LORA_SCALING_FACTORS": {
|
| 1149 |
+
"env_var_name": "LONG_LORA_SCALING_FACTORS",
|
| 1150 |
+
"value": "",
|
| 1151 |
+
"title": "Long LoRA Scaling Factors",
|
| 1152 |
+
"description": "Specify multiple scaling factors for LoRA adapters.",
|
| 1153 |
+
"required": false,
|
| 1154 |
+
"type": "text"
|
| 1155 |
+
},
|
| 1156 |
+
"MAX_CPU_LORAS": {
|
| 1157 |
+
"env_var_name": "MAX_CPU_LORAS",
|
| 1158 |
+
"value": "",
|
| 1159 |
+
"title": "Max CPU LoRAs",
|
| 1160 |
+
"description": "Maximum number of LoRAs to store in CPU memory.",
|
| 1161 |
+
"required": false,
|
| 1162 |
+
"type": "number"
|
| 1163 |
+
},
|
| 1164 |
+
"FULLY_SHARDED_LORAS": {
|
| 1165 |
+
"env_var_name": "FULLY_SHARDED_LORAS",
|
| 1166 |
+
"value": false,
|
| 1167 |
+
"title": "Fully Sharded LoRAs",
|
| 1168 |
+
"description": "Enable fully sharded LoRA layers.",
|
| 1169 |
+
"required": false,
|
| 1170 |
+
"type": "toggle"
|
| 1171 |
+
},
|
| 1172 |
+
"DEVICE": {
|
| 1173 |
+
"env_var_name": "DEVICE",
|
| 1174 |
+
"value": "auto",
|
| 1175 |
+
"title": "Device",
|
| 1176 |
+
"description": "Device type for vLLM execution.",
|
| 1177 |
+
"required": false,
|
| 1178 |
+
"type": "select",
|
| 1179 |
+
"options": [
|
| 1180 |
+
{ "value": "auto", "label": "auto" },
|
| 1181 |
+
{ "value": "cuda", "label": "cuda" },
|
| 1182 |
+
{ "value": "neuron", "label": "neuron" },
|
| 1183 |
+
{ "value": "cpu", "label": "cpu" },
|
| 1184 |
+
{ "value": "openvino", "label": "openvino" },
|
| 1185 |
+
{ "value": "tpu", "label": "tpu" },
|
| 1186 |
+
{ "value": "xpu", "label": "xpu" }
|
| 1187 |
+
]
|
| 1188 |
+
},
|
| 1189 |
+
"SCHEDULER_DELAY_FACTOR": {
|
| 1190 |
+
"env_var_name": "SCHEDULER_DELAY_FACTOR",
|
| 1191 |
+
"value": 0.0,
|
| 1192 |
+
"title": "Scheduler Delay Factor",
|
| 1193 |
+
"description": "Apply a delay before scheduling next prompt.",
|
| 1194 |
+
"required": false,
|
| 1195 |
+
"type": "number"
|
| 1196 |
+
},
|
| 1197 |
+
"ENABLE_CHUNKED_PREFILL": {
|
| 1198 |
+
"env_var_name": "ENABLE_CHUNKED_PREFILL",
|
| 1199 |
+
"value": false,
|
| 1200 |
+
"title": "Enable Chunked Prefill",
|
| 1201 |
+
"description": "Enable chunked prefill requests.",
|
| 1202 |
+
"required": false,
|
| 1203 |
+
"type": "toggle"
|
| 1204 |
+
},
|
| 1205 |
+
"SPECULATIVE_MODEL": {
|
| 1206 |
+
"env_var_name": "SPECULATIVE_MODEL",
|
| 1207 |
+
"value": "",
|
| 1208 |
+
"title": "Speculative Model",
|
| 1209 |
+
"description": "The name of the draft model to be used in speculative decoding.",
|
| 1210 |
+
"required": false,
|
| 1211 |
+
"type": "text"
|
| 1212 |
+
},
|
| 1213 |
+
"NUM_SPECULATIVE_TOKENS": {
|
| 1214 |
+
"env_var_name": "NUM_SPECULATIVE_TOKENS",
|
| 1215 |
+
"value": "",
|
| 1216 |
+
"title": "Num Speculative Tokens",
|
| 1217 |
+
"description": "The number of speculative tokens to sample from the draft model.",
|
| 1218 |
+
"required": false,
|
| 1219 |
+
"type": "number"
|
| 1220 |
+
},
|
| 1221 |
+
"SPECULATIVE_DRAFT_TENSOR_PARALLEL_SIZE": {
|
| 1222 |
+
"env_var_name": "SPECULATIVE_DRAFT_TENSOR_PARALLEL_SIZE",
|
| 1223 |
+
"value": "",
|
| 1224 |
+
"title": "Speculative Draft Tensor Parallel Size",
|
| 1225 |
+
"description": "Number of tensor parallel replicas for the draft model.",
|
| 1226 |
+
"required": false,
|
| 1227 |
+
"type": "number"
|
| 1228 |
+
},
|
| 1229 |
+
"SPECULATIVE_MAX_MODEL_LEN": {
|
| 1230 |
+
"env_var_name": "SPECULATIVE_MAX_MODEL_LEN",
|
| 1231 |
+
"value": "",
|
| 1232 |
+
"title": "Speculative Max Model Length",
|
| 1233 |
+
"description": "The maximum sequence length supported by the draft model.",
|
| 1234 |
+
"required": false,
|
| 1235 |
+
"type": "number"
|
| 1236 |
+
},
|
| 1237 |
+
"SPECULATIVE_DISABLE_BY_BATCH_SIZE": {
|
| 1238 |
+
"env_var_name": "SPECULATIVE_DISABLE_BY_BATCH_SIZE",
|
| 1239 |
+
"value": "",
|
| 1240 |
+
"title": "Speculative Disable by Batch Size",
|
| 1241 |
+
"description": "Disable speculative decoding if the number of enqueue requests is larger than this value.",
|
| 1242 |
+
"required": false,
|
| 1243 |
+
"type": "number"
|
| 1244 |
+
},
|
| 1245 |
+
"NGRAM_PROMPT_LOOKUP_MAX": {
|
| 1246 |
+
"env_var_name": "NGRAM_PROMPT_LOOKUP_MAX",
|
| 1247 |
+
"value": "",
|
| 1248 |
+
"title": "Ngram Prompt Lookup Max",
|
| 1249 |
+
"description": "Max size of window for ngram prompt lookup in speculative decoding.",
|
| 1250 |
+
"required": false,
|
| 1251 |
+
"type": "number"
|
| 1252 |
+
},
|
| 1253 |
+
"NGRAM_PROMPT_LOOKUP_MIN": {
|
| 1254 |
+
"env_var_name": "NGRAM_PROMPT_LOOKUP_MIN",
|
| 1255 |
+
"value": "",
|
| 1256 |
+
"title": "Ngram Prompt Lookup Min",
|
| 1257 |
+
"description": "Min size of window for ngram prompt lookup in speculative decoding.",
|
| 1258 |
+
"required": false,
|
| 1259 |
+
"type": "number"
|
| 1260 |
+
},
|
| 1261 |
+
"SPEC_DECODING_ACCEPTANCE_METHOD": {
|
| 1262 |
+
"env_var_name": "SPEC_DECODING_ACCEPTANCE_METHOD",
|
| 1263 |
+
"value": "rejection_sampler",
|
| 1264 |
+
"title": "Speculative Decoding Acceptance Method",
|
| 1265 |
+
"description": "Specify the acceptance method for draft token verification in speculative decoding.",
|
| 1266 |
+
"required": false,
|
| 1267 |
+
"type": "select",
|
| 1268 |
+
"options": [
|
| 1269 |
+
{ "value": "rejection_sampler", "label": "rejection_sampler" },
|
| 1270 |
+
{ "value": "typical_acceptance_sampler", "label": "typical_acceptance_sampler" }
|
| 1271 |
+
]
|
| 1272 |
+
},
|
| 1273 |
+
"TYPICAL_ACCEPTANCE_SAMPLER_POSTERIOR_THRESHOLD": {
|
| 1274 |
+
"env_var_name": "TYPICAL_ACCEPTANCE_SAMPLER_POSTERIOR_THRESHOLD",
|
| 1275 |
+
"value": "",
|
| 1276 |
+
"title": "Typical Acceptance Sampler Posterior Threshold",
|
| 1277 |
+
"description": "Set the lower bound threshold for the posterior probability of a token to be accepted.",
|
| 1278 |
+
"required": false,
|
| 1279 |
+
"type": "number"
|
| 1280 |
+
},
|
| 1281 |
+
"TYPICAL_ACCEPTANCE_SAMPLER_POSTERIOR_ALPHA": {
|
| 1282 |
+
"env_var_name": "TYPICAL_ACCEPTANCE_SAMPLER_POSTERIOR_ALPHA",
|
| 1283 |
+
"value": "",
|
| 1284 |
+
"title": "Typical Acceptance Sampler Posterior Alpha",
|
| 1285 |
+
"description": "A scaling factor for the entropy-based threshold for token acceptance.",
|
| 1286 |
+
"required": false,
|
| 1287 |
+
"type": "number"
|
| 1288 |
+
},
|
| 1289 |
+
"MODEL_LOADER_EXTRA_CONFIG": {
|
| 1290 |
+
"env_var_name": "MODEL_LOADER_EXTRA_CONFIG",
|
| 1291 |
+
"value": "",
|
| 1292 |
+
"title": "Model Loader Extra Config",
|
| 1293 |
+
"description": "Extra config for model loader.",
|
| 1294 |
+
"required": false,
|
| 1295 |
+
"type": "text"
|
| 1296 |
+
},
|
| 1297 |
+
"PREEMPTION_MODE": {
|
| 1298 |
+
"env_var_name": "PREEMPTION_MODE",
|
| 1299 |
+
"value": "",
|
| 1300 |
+
"title": "Preemption Mode",
|
| 1301 |
+
"description": "If 'recompute', the engine performs preemption-aware recomputation. If 'save', the engine saves activations into the CPU memory as preemption happens.",
|
| 1302 |
+
"required": false,
|
| 1303 |
+
"type": "text"
|
| 1304 |
+
},
|
| 1305 |
+
"PREEMPTION_CHECK_PERIOD": {
|
| 1306 |
+
"env_var_name": "PREEMPTION_CHECK_PERIOD",
|
| 1307 |
+
"value": 1.0,
|
| 1308 |
+
"title": "Preemption Check Period",
|
| 1309 |
+
"description": "How frequently the engine checks if a preemption happens.",
|
| 1310 |
+
"required": false,
|
| 1311 |
+
"type": "number"
|
| 1312 |
+
},
|
| 1313 |
+
"PREEMPTION_CPU_CAPACITY": {
|
| 1314 |
+
"env_var_name": "PREEMPTION_CPU_CAPACITY",
|
| 1315 |
+
"value": 2,
|
| 1316 |
+
"title": "Preemption CPU Capacity",
|
| 1317 |
+
"description": "The percentage of CPU memory used for the saved activations.",
|
| 1318 |
+
"required": false,
|
| 1319 |
+
"type": "number"
|
| 1320 |
+
},
|
| 1321 |
+
"MAX_LOG_LEN": {
|
| 1322 |
+
"env_var_name": "MAX_LOG_LEN",
|
| 1323 |
+
"value": "",
|
| 1324 |
+
"title": "Max Log Length",
|
| 1325 |
+
"description": "Max number of characters or ID numbers being printed in log.",
|
| 1326 |
+
"required": false,
|
| 1327 |
+
"type": "number"
|
| 1328 |
+
},
|
| 1329 |
+
"DISABLE_LOGGING_REQUEST": {
|
| 1330 |
+
"env_var_name": "DISABLE_LOGGING_REQUEST",
|
| 1331 |
+
"value": false,
|
| 1332 |
+
"title": "Disable Logging Request",
|
| 1333 |
+
"description": "Disable logging requests.",
|
| 1334 |
+
"required": false,
|
| 1335 |
+
"type": "toggle"
|
| 1336 |
+
},
|
| 1337 |
+
"TOKENIZER_NAME": {
|
| 1338 |
+
"env_var_name": "TOKENIZER_NAME",
|
| 1339 |
+
"value": "",
|
| 1340 |
+
"title": "Tokenizer Name",
|
| 1341 |
+
"description": "Tokenizer repo to use a different tokenizer than the model's default",
|
| 1342 |
+
"required": false,
|
| 1343 |
+
"type": "text"
|
| 1344 |
+
},
|
| 1345 |
+
"TOKENIZER_REVISION": {
|
| 1346 |
+
"env_var_name": "TOKENIZER_REVISION",
|
| 1347 |
+
"value": "",
|
| 1348 |
+
"title": "Tokenizer Revision",
|
| 1349 |
+
"description": "Tokenizer revision to load",
|
| 1350 |
+
"required": false,
|
| 1351 |
+
"type": "text"
|
| 1352 |
+
},
|
| 1353 |
+
"CUSTOM_CHAT_TEMPLATE": {
|
| 1354 |
+
"env_var_name": "CUSTOM_CHAT_TEMPLATE",
|
| 1355 |
+
"value": "",
|
| 1356 |
+
"title": "Custom Chat Template",
|
| 1357 |
+
"description": "Custom chat jinja template",
|
| 1358 |
+
"required": false,
|
| 1359 |
+
"type": "text"
|
| 1360 |
+
},
|
| 1361 |
+
"GPU_MEMORY_UTILIZATION": {
|
| 1362 |
+
"env_var_name": "GPU_MEMORY_UTILIZATION",
|
| 1363 |
+
"value": "0.95",
|
| 1364 |
+
"title": "GPU Memory Utilization",
|
| 1365 |
+
"description": "Sets GPU VRAM utilization",
|
| 1366 |
+
"required": false,
|
| 1367 |
+
"type": "number"
|
| 1368 |
+
},
|
| 1369 |
+
"BLOCK_SIZE": {
|
| 1370 |
+
"env_var_name": "BLOCK_SIZE",
|
| 1371 |
+
"value": "16",
|
| 1372 |
+
"title": "Block Size",
|
| 1373 |
+
"description": "Token block size for contiguous chunks of tokens",
|
| 1374 |
+
"required": false,
|
| 1375 |
+
"type": "number"
|
| 1376 |
+
},
|
| 1377 |
+
"SWAP_SPACE": {
|
| 1378 |
+
"env_var_name": "SWAP_SPACE",
|
| 1379 |
+
"value": "4",
|
| 1380 |
+
"title": "Swap Space",
|
| 1381 |
+
"description": "CPU swap space size (GiB) per GPU",
|
| 1382 |
+
"required": false,
|
| 1383 |
+
"type": "number"
|
| 1384 |
+
},
|
| 1385 |
+
"ENFORCE_EAGER": {
|
| 1386 |
+
"env_var_name": "ENFORCE_EAGER",
|
| 1387 |
+
"value": false,
|
| 1388 |
+
"title": "Enforce Eager",
|
| 1389 |
+
"description": "Always use eager-mode PyTorch. If False (0), will use eager mode and CUDA graph in hybrid for maximal performance and flexibility",
|
| 1390 |
+
"required": false,
|
| 1391 |
+
"type": "toggle"
|
| 1392 |
+
},
|
| 1393 |
+
"MAX_SEQ_LEN_TO_CAPTURE": {
|
| 1394 |
+
"env_var_name": "MAX_SEQ_LEN_TO_CAPTURE",
|
| 1395 |
+
"value": "8192",
|
| 1396 |
+
"title": "CUDA Graph Max Content Length",
|
| 1397 |
+
"description": "Maximum context length covered by CUDA graphs. If a sequence has context length larger than this, we fall back to eager mode",
|
| 1398 |
+
"required": false,
|
| 1399 |
+
"type": "number"
|
| 1400 |
+
},
|
| 1401 |
+
"DISABLE_CUSTOM_ALL_REDUCE": {
|
| 1402 |
+
"env_var_name": "DISABLE_CUSTOM_ALL_REDUCE",
|
| 1403 |
+
"value": false,
|
| 1404 |
+
"title": "Disable Custom All Reduce",
|
| 1405 |
+
"description": "Enables or disables custom all reduce",
|
| 1406 |
+
"required": false,
|
| 1407 |
+
"type": "toggle"
|
| 1408 |
+
},
|
| 1409 |
+
"DEFAULT_BATCH_SIZE": {
|
| 1410 |
+
"env_var_name": "DEFAULT_BATCH_SIZE",
|
| 1411 |
+
"value": "50",
|
| 1412 |
+
"title": "Default Final Batch Size",
|
| 1413 |
+
"description": "Default and Maximum batch size for token streaming to reduce HTTP calls",
|
| 1414 |
+
"required": false,
|
| 1415 |
+
"type": "number"
|
| 1416 |
+
},
|
| 1417 |
+
"DEFAULT_MIN_BATCH_SIZE": {
|
| 1418 |
+
"env_var_name": "DEFAULT_MIN_BATCH_SIZE",
|
| 1419 |
+
"value": "1",
|
| 1420 |
+
"title": "Default Starting Batch Size",
|
| 1421 |
+
"description": "Batch size for the first request, which will be multiplied by the growth factor every subsequent request",
|
| 1422 |
+
"required": false,
|
| 1423 |
+
"type": "number"
|
| 1424 |
+
},
|
| 1425 |
+
"DEFAULT_BATCH_SIZE_GROWTH_FACTOR": {
|
| 1426 |
+
"env_var_name": "DEFAULT_BATCH_SIZE_GROWTH_FACTOR",
|
| 1427 |
+
"value": "3",
|
| 1428 |
+
"title": "Default Batch Size Growth Factor",
|
| 1429 |
+
"description": "Growth factor for dynamic batch size",
|
| 1430 |
+
"required": false,
|
| 1431 |
+
"type": "number"
|
| 1432 |
+
},
|
| 1433 |
+
"RAW_OPENAI_OUTPUT": {
|
| 1434 |
+
"env_var_name": "RAW_OPENAI_OUTPUT",
|
| 1435 |
+
"value": true,
|
| 1436 |
+
"title": "Raw OpenAI Output",
|
| 1437 |
+
"description": "Raw OpenAI output instead of just the text",
|
| 1438 |
+
"required": false,
|
| 1439 |
+
"type": "toggle"
|
| 1440 |
+
},
|
| 1441 |
+
"OPENAI_RESPONSE_ROLE": {
|
| 1442 |
+
"env_var_name": "OPENAI_RESPONSE_ROLE",
|
| 1443 |
+
"value": "assistant",
|
| 1444 |
+
"title": "OpenAI Response Role",
|
| 1445 |
+
"description": "Role of the LLM's Response in OpenAI Chat Completions",
|
| 1446 |
+
"required": false,
|
| 1447 |
+
"type": "text"
|
| 1448 |
+
},
|
| 1449 |
+
"OPENAI_SERVED_MODEL_NAME_OVERRIDE": {
|
| 1450 |
+
"env_var_name": "OPENAI_SERVED_MODEL_NAME_OVERRIDE",
|
| 1451 |
+
"value": "",
|
| 1452 |
+
"title": "OpenAI Served Model Name Override",
|
| 1453 |
+
"description": "Overrides the name of the served model from model repo/path to specified name, which you will then be able to use the value for the `model` parameter when making OpenAI requests",
|
| 1454 |
+
"required": false,
|
| 1455 |
+
"type": "text"
|
| 1456 |
+
},
|
| 1457 |
+
"MAX_CONCURRENCY": {
|
| 1458 |
+
"env_var_name": "MAX_CONCURRENCY",
|
| 1459 |
+
"value": "300",
|
| 1460 |
+
"title": "Max Concurrency",
|
| 1461 |
+
"description": "Max concurrent requests per worker. vLLM has an internal queue, so you don't have to worry about limiting by VRAM, this is for improving scaling/load balancing efficiency",
|
| 1462 |
+
"required": false,
|
| 1463 |
+
"type": "number"
|
| 1464 |
+
},
|
| 1465 |
+
"MODEL_REVISION": {
|
| 1466 |
+
"env_var_name": "MODEL_REVISION",
|
| 1467 |
+
"value": "",
|
| 1468 |
+
"title": "Model Revision",
|
| 1469 |
+
"description": "Model revision (branch) to load",
|
| 1470 |
+
"required": false,
|
| 1471 |
+
"type": "text"
|
| 1472 |
+
},
|
| 1473 |
+
"BASE_PATH": {
|
| 1474 |
+
"env_var_name": "BASE_PATH",
|
| 1475 |
+
"value": "/runpod-volume",
|
| 1476 |
+
"title": "Base Path",
|
| 1477 |
+
"description": "Storage directory for Huggingface cache and model",
|
| 1478 |
+
"required": false,
|
| 1479 |
+
"type": "text"
|
| 1480 |
+
},
|
| 1481 |
+
"DISABLE_LOG_REQUESTS": {
|
| 1482 |
+
"env_var_name": "DISABLE_LOG_REQUESTS",
|
| 1483 |
+
"value": true,
|
| 1484 |
+
"title": "Disable Log Requests",
|
| 1485 |
+
"description": "Enables or disables vLLM request logging",
|
| 1486 |
+
"required": false,
|
| 1487 |
+
"type": "toggle"
|
| 1488 |
+
},
|
| 1489 |
+
"ENABLE_AUTO_TOOL_CHOICE": {
|
| 1490 |
+
"env_var_name": "ENABLE_AUTO_TOOL_CHOICE",
|
| 1491 |
+
"value": false,
|
| 1492 |
+
"title": "Enable Auto Tool Choice",
|
| 1493 |
+
"description": "Enables or disables auto tool choice",
|
| 1494 |
+
"required": false,
|
| 1495 |
+
"type": "toggle"
|
| 1496 |
+
},
|
| 1497 |
+
"TOOL_CALL_PARSER": {
|
| 1498 |
+
"env_var_name": "TOOL_CALL_PARSER",
|
| 1499 |
+
"value": "",
|
| 1500 |
+
"title": "Tool Call Parser",
|
| 1501 |
+
"description": "Tool call parser",
|
| 1502 |
+
"required": false,
|
| 1503 |
+
"type": "select",
|
| 1504 |
+
"options": [
|
| 1505 |
+
{ "value": "", "label": "None" },
|
| 1506 |
+
{ "value": "hermes", "label": "Hermes" },
|
| 1507 |
+
{ "value": "mistral", "label": "Mistral" },
|
| 1508 |
+
{ "value": "llama3_json", "label": "Llama3 JSON" },
|
| 1509 |
+
{ "value": "pythonic", "label": "Pythonic" },
|
| 1510 |
+
{ "value": "internlm", "label": "InternLM" }
|
| 1511 |
+
]
|
| 1512 |
+
}
|
| 1513 |
+
}
|
| 1514 |
+
}
|
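For reference, a minimal sketch of how the "schema" section above can be resolved into typed settings, with environment variables taking precedence over the schema defaults. The function name resolve_settings and the type-coercion rules here are illustrative assumptions, not the worker's actual code (the real logic lives in src/handler.py, which is not part of this file).

import json
import os

# Minimal sketch (assumed helper, not the worker's actual code): resolve the
# "schema" section of worker-config.json into typed values, letting
# environment variables override the schema defaults.
def resolve_settings(config_path="worker-config.json"):
    with open(config_path) as f:
        schema = json.load(f)["schema"]
    resolved = {}
    for name, spec in schema.items():
        raw = os.environ.get(spec["env_var_name"], spec["value"])
        if raw == "":
            # An empty-string default means the setting is unset.
            resolved[name] = None
        elif spec["type"] == "number":
            resolved[name] = float(raw) if "." in str(raw) else int(raw)
        elif spec["type"] == "toggle":
            resolved[name] = str(raw).lower() in ("1", "true")
        else:
            # "text" and "select" values stay as strings.
            resolved[name] = raw
    return resolved

if __name__ == "__main__":
    settings = resolve_settings()
    print(settings["MAX_NUM_SEQS"])  # 256 unless MAX_NUM_SEQS is set in the env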