hanxiao committed · Commit bcecc9a · 1 Parent(s): c967bd1

Rename test_jvlm.py to infer.py, add pyproject.toml, update README with uv install

Files changed (3):
  1. README.md +24 -27
  2. test_jvlm.py → infer.py +4 -3
  3. pyproject.toml +18 -0
README.md CHANGED
@@ -168,26 +168,23 @@ Pair training for single-vector embeddings. Higher is better. Averages are macro
 
 ## Usage
 
- ### Requirements
-
- The following Python packages are required:
+ ### Installation
 
- - `torch>=2.9.0`
- - `torchvision>=0.24.0`
- - `transformers>=4.57.0`
- - `pillow>=12.0.0`
- - `einops>=0.8.1`
-
- Optional but recommended packages:
+ ```bash
+ uv sync
+ ```
 
- - **flash-attention**: Installing [flash-attention](https://github.com/Dao-AILab/flash-attention) is recommended for improved inference speed and efficiency, but not mandatory.
+ For CUDA users with FlashAttention2 support:
+ ```bash
+ uv sync --extra flash-attn
+ ```
 
 ### Using the CLI
 
- You can directly chat with `jina-vlm-v1` using the `test_jvlm.py` CLI.
+ You can directly chat with `jina-vlm-v1` using the `infer.py` CLI.
 
 **Options:**
- - `-m, --model`: Model path (default: `'.'`). Set this to `'jinaai/jina-vlm-v1'` if you are running this script outside this repo.
+ - `-m, --model`: Model path. Auto-detects local repo (if `config.json` exists) or falls back to `jinaai/jina-vlm-v1` from HuggingFace.
 - `-i, --image`: Image path, URL, or glob pattern (can specify multiple times, default: `[]`).
 - `-p, --prompt`: Text prompt (can specify multiple times, default: `'Describe the image for me in 100 words'` or `'Describe the images for me in 100 words'` if multiple images are provided).
 - `--max-crops`: Maximum crops (default: `12`).
@@ -200,49 +197,49 @@ You can directly chat with `jina-vlm-v1` using the `test_jvlm.py` CLI.
 
 ```bash
 # Single image
- python test_jvlm.py -i assets/jvlm_architecture.png -p "What's in this image?"
+ python infer.py -i assets/jvlm_architecture.png -p "What's in this image?"
 
 # Single image with streaming
- python test_jvlm.py -i assets/the_persistence_of_memory.jpg -p "What's in this image?" --stream
+ python infer.py -i assets/the_persistence_of_memory.jpg -p "What's in this image?" --stream
 
 # Remote image URL
- python test_jvlm.py -i https://picsum.photos/id/1025/800/600.jpg -p "Describe this image"
+ python infer.py -i https://picsum.photos/id/1025/800/600.jpg -p "Describe this image"
 
 # Multiple images (local and remote)
- python test_jvlm.py -i https://picsum.photos/id/1015/800/600.jpg -i https://picsum.photos/id/1016/800/600.jpg -i https://picsum.photos/id/1021/800/600.jpg -p "Describe these images"
+ python infer.py -i https://picsum.photos/id/1015/800/600.jpg -i https://picsum.photos/id/1016/800/600.jpg -i https://picsum.photos/id/1021/800/600.jpg -p "Describe these images"
 
 # Text only input
- python test_jvlm.py -p "How many planets are in our solar system?"
+ python infer.py -p "How many planets are in our solar system?"
 
 # Glob pattern support (quote patterns to prevent shell expansion)
- python test_jvlm.py -i "*.jpg" -p "Describe these images"
- python test_jvlm.py -i "photos/*.png" -i "images/*.jpg" -p "What do you see in these images?"
+ python infer.py -i "*.jpg" -p "Describe these images"
+ python infer.py -i "photos/*.png" -i "images/*.jpg" -p "What do you see in these images?"
 
 # Custom max crops, max pixels and max output tokens
 # Reducing max crops and max pixels speeds up inference and lowers mem consumption on large images
- python test_jvlm.py -i photo.jpg -p "Describe this picture in detail" --max-crops 8 --max-pixels 500000 --max-tokens 2048
+ python infer.py -i photo.jpg -p "Describe this picture in detail" --max-crops 8 --max-pixels 500000 --max-tokens 2048
 
 # Prompt position control
- python test_jvlm.py -i photo.jpg -p "What's in this image?" --prompt-first
+ python infer.py -i photo.jpg -p "What's in this image?" --prompt-first
 
 # Map mode: apply one prompt to multiple images
- python test_jvlm.py --map -i "*.jpg" -p "What is this?"
+ python infer.py --map -i "*.jpg" -p "What is this?"
 
 # Map mode: apply multiple prompts to one image
- python test_jvlm.py --map -i photo_of_a_dog.jpg -p "What breed?" -p "What color?" -p "Happy or sad?"
+ python infer.py --map -i photo_of_a_dog.jpg -p "What breed?" -p "What color?" -p "Happy or sad?"
 
 # Batch inference
 # When an equal number of images and prompts (>1) is provided, we assume it is batched inference
 # Generation will run in a batch if streaming is disabled, otherwise sequentially
- python test_jvlm.py -i photo1.jpg -p "What is shown in this image?" -i photo2.jpg -p "Describe this image"
+ python infer.py -i photo1.jpg -p "What is shown in this image?" -i photo2.jpg -p "Describe this image"
 
 # Similarly for no images and multiple prompts
- python test_jvlm.py -p "What is a neural network?" -p "Describe the concept of polymorphism in Computer Science"
+ python infer.py -p "What is a neural network?" -p "Describe the concept of polymorphism in Computer Science"
 ```
 
 Example input:
 ```bash
- python test_jvlm.py -m jinaai/jina-vlm-v1 -i assets/the_persistence_of_memory.jpg -p "Describe this picture"
+ python infer.py -i assets/the_persistence_of_memory.jpg -p "Describe this picture"
 ```
 <p align="center">
 <img src="./assets/the_persistence_of_memory.jpg">
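
With the `pyproject.toml` added in this commit, the CLI can also be launched through uv's managed environment; a minimal sketch, assuming the project was set up with `uv sync` as in the new README section (same asset and flags as the README examples):

```bash
# Run infer.py inside the environment created by `uv sync`
uv run python infer.py -i assets/the_persistence_of_memory.jpg -p "Describe this picture"
```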
test_jvlm.py → infer.py RENAMED
@@ -258,13 +258,14 @@ def test_jvlm():
 parser = argparse.ArgumentParser(
 description='jina-vlm-v1 vision-language model inference.'
 )
+ default_model = '.' if os.path.exists('./config.json') else 'jinaai/jina-vlm-v1'
 parser.add_argument(
 '-m',
 '--model',
- default='.',
+ default=default_model,
 help=(
- 'Model path (default: `"."`). Set this to `"jinaai/jina-vlm-v1"` if you '
- 'are running this script outside this repo.'
+ 'Model path. Auto-detects local repo (if config.json exists) or '
+ 'falls back to "jinaai/jina-vlm-v1" from HuggingFace.'
 ),
 )
 parser.add_argument(
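
In practice the new default means `-m` only needs to be set when the auto-detected behaviour is not what you want; a short sketch of the two invocation styles, reusing flags from the README examples:

```bash
# Inside a clone of this repo (config.json present): local weights are picked up automatically
python infer.py -i photo.jpg -p "What's in this image?"

# Anywhere else: the script falls back to jinaai/jina-vlm-v1 on the Hub,
# and an explicit path or ID can still be passed with -m
python infer.py -m jinaai/jina-vlm-v1 -i photo.jpg -p "What's in this image?"
```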
pyproject.toml ADDED
@@ -0,0 +1,18 @@
+ [project]
+ name = "jina-vlm-v1"
+ version = "1.0.0"
+ description = "Jina VLM v1: Lightweight Vision Language Alignment"
+ readme = "README.md"
+ license = "CC-BY-NC-4.0"
+ requires-python = ">=3.10"
+ dependencies = [
+ "torch>=2.9.0",
+ "torchvision>=0.24.0",
+ "transformers>=4.57.0",
+ "pillow>=12.0.0",
+ "einops>=0.8.1",
+ "accelerate>=1.0.0",
+ ]
+
+ [project.optional-dependencies]
+ flash-attn = ["flash-attn>=2.0.0"]
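
A hedged sketch of how this manifest is typically consumed, reusing the `uv sync --extra flash-attn` command from the README above; the import check is only an illustrative way to confirm the optional wheel landed in the project environment:

```bash
# Pull in the base dependencies plus the optional FlashAttention2 extra declared above
uv sync --extra flash-attn

# Sanity check: the import succeeds only if the flash-attn wheel installed correctly
uv run python -c "import flash_attn"
```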