Rename test_jvlm.py to infer.py, add pyproject.toml, update README with uv install
Files changed:

- README.md (+24 -27)
- test_jvlm.py → infer.py (+4 -3)
- pyproject.toml (+18 -0)
README.md
CHANGED

@@ -168,26 +168,23 @@ Pair training for single-vector embeddings. Higher is better. Averages are macro

Removed from the `## Usage` section: the hand-maintained dependency list ("The following Python packages are required:", which included `pillow>=12.0.0` and `einops>=0.8.1`, followed by "Optional but recommended packages:") and the CLI instructions and example commands that invoked the old `test_jvlm.py` script. The updated section reads:

## Usage

### Installation

```bash
uv sync
```

For CUDA users with FlashAttention2 support:

```bash
uv sync --extra flash-attn
```
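
Once the environment exists, the examples below can also be launched through uv's project runner instead of activating the virtual environment by hand. A minimal usage sketch (not taken from the diff; assumes the default environment created by `uv sync`):

```bash
# Run the CLI inside the uv-managed project environment
uv run python infer.py -p "How many planets are in our solar system?"
```
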
### Using the CLI

You can directly chat with `jina-vlm-v1` using the `infer.py` CLI.

**Options:**

- `-m, --model`: Model path. Auto-detects a local repo (if `config.json` exists) or falls back to `jinaai/jina-vlm-v1` from HuggingFace.
- `-i, --image`: Image path, URL, or glob pattern (can be specified multiple times, default: `[]`).
- `-p, --prompt`: Text prompt (can be specified multiple times, default: `'Describe the image for me in 100 words'`, or `'Describe the images for me in 100 words'` if multiple images are provided).
- `--max-crops`: Maximum crops (default: `12`).

@@ -200,49 +197,49 @@ You can directly chat with `jina-vlm-v1` using the `test_jvlm.py` CLI.

```bash
# Single image
python infer.py -i assets/jvlm_architecture.png -p "What's in this image?"

# Single image with streaming
python infer.py -i assets/the_persistence_of_memory.jpg -p "What's in this image?" --stream

# Remote image URL
python infer.py -i https://picsum.photos/id/1025/800/600.jpg -p "Describe this image"

# Multiple images (local and remote)
python infer.py -i https://picsum.photos/id/1015/800/600.jpg -i https://picsum.photos/id/1016/800/600.jpg -i https://picsum.photos/id/1021/800/600.jpg -p "Describe these images"

# Text-only input
python infer.py -p "How many planets are in our solar system?"

# Glob pattern support (quote patterns to prevent shell expansion)
python infer.py -i "*.jpg" -p "Describe these images"
python infer.py -i "photos/*.png" -i "images/*.jpg" -p "What do you see in these images?"

# Custom max crops, max pixels, and max output tokens
# Reducing max crops and max pixels speeds up inference and lowers memory consumption on large images
python infer.py -i photo.jpg -p "Describe this picture in detail" --max-crops 8 --max-pixels 500000 --max-tokens 2048

# Prompt position control
python infer.py -i photo.jpg -p "What's in this image?" --prompt-first

# Map mode: apply one prompt to multiple images
python infer.py --map -i "*.jpg" -p "What is this?"

# Map mode: apply multiple prompts to one image
python infer.py --map -i photo_of_a_dog.jpg -p "What breed?" -p "What color?" -p "Happy or sad?"

# Batch inference
# When an equal number of images and prompts (>1) is provided, it is treated as batched inference
# Generation runs in a batch if streaming is disabled, otherwise sequentially
python infer.py -i photo1.jpg -p "What is shown in this image?" -i photo2.jpg -p "Describe this image"

# Similarly for no images and multiple prompts
python infer.py -p "What is a neural network?" -p "Describe the concept of polymorphism in Computer Science"
```

Example input:
```bash
python infer.py -i assets/the_persistence_of_memory.jpg -p "Describe this picture"
```
<p align="center">
<img src="./assets/the_persistence_of_memory.jpg">
test_jvlm.py → infer.py
RENAMED

@@ -258,13 +258,14 @@ def test_jvlm():

For `--model`, the old `default=` value and its help text are replaced by auto-detection of a local checkout:

     parser = argparse.ArgumentParser(
         description='jina-vlm-v1 vision-language model inference.'
     )
+    default_model = '.' if os.path.exists('./config.json') else 'jinaai/jina-vlm-v1'
     parser.add_argument(
         '-m',
         '--model',
+        default=default_model,
         help=(
+            'Model path. Auto-detects local repo (if config.json exists) or '
+            'falls back to "jinaai/jina-vlm-v1" from HuggingFace.'
         ),
     )
     parser.add_argument(
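
In practice this means the CLI uses local weights when run from inside a model checkout (any working directory containing `config.json`) and otherwise downloads the model from the Hub. A short sketch of the two cases (prompts are illustrative, not taken from the diff):

```bash
# From a directory containing config.json: -m defaults to '.' and the local checkout is loaded
python infer.py -p "How many planets are in our solar system?"

# From anywhere else: equivalent to passing the Hub repo id explicitly
python infer.py -m jinaai/jina-vlm-v1 -p "How many planets are in our solar system?"
```
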
pyproject.toml
ADDED

@@ -0,0 +1,18 @@

+[project]
+name = "jina-vlm-v1"
+version = "1.0.0"
+description = "Jina VLM v1: Lightweight Vision Language Alignment"
+readme = "README.md"
+license = "CC-BY-NC-4.0"
+requires-python = ">=3.10"
+dependencies = [
+    "torch>=2.9.0",
+    "torchvision>=0.24.0",
+    "transformers>=4.57.0",
+    "pillow>=12.0.0",
+    "einops>=0.8.1",
+    "accelerate>=1.0.0",
+]
+
+[project.optional-dependencies]
+flash-attn = ["flash-attn>=2.0.0"]
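
For readers not using uv, the same dependency set maps onto a plain pip install. A sketch derived from the list above (version pins copied from `pyproject.toml`; the FlashAttention2 extra remains optional and requires a CUDA toolchain):

```bash
pip install "torch>=2.9.0" "torchvision>=0.24.0" "transformers>=4.57.0" \
            "pillow>=12.0.0" "einops>=0.8.1" "accelerate>=1.0.0"

# Optional, mirrors [project.optional-dependencies].flash-attn
pip install "flash-attn>=2.0.0"
```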