hanxiao committed · Commit bcecc9a · 1 Parent(s): c967bd1

Rename test_jvlm.py to infer.py, add pyproject.toml, update README with uv install

Files changed (3):
  1. README.md +24 -27
  2. test_jvlm.py → infer.py +4 -3
  3. pyproject.toml +18 -0
README.md CHANGED
@@ -168,26 +168,23 @@ Pair training for single-vector embeddings. Higher is better. Averages are macro
 
 ## Usage
 
- ### Requirements
-
- The following Python packages are required:
+ ### Installation
 
- - `torch>=2.9.0`
- - `torchvision>=0.24.0`
- - `transformers>=4.57.0`
- - `pillow>=12.0.0`
- - `einops>=0.8.1`
-
- Optional but recommended packages:
+ ```bash
+ uv sync
+ ```
 
- - **flash-attention**: Installing [flash-attention](https://github.com/Dao-AILab/flash-attention) is recommended for improved inference speed and efficiency, but not mandatory.
+ For CUDA users with FlashAttention2 support:
+ ```bash
+ uv sync --extra flash-attn
+ ```
 
 ### Using the CLI
 
- You can directly chat with `jina-vlm-v1` using the `test_jvlm.py` CLI.
+ You can directly chat with `jina-vlm-v1` using the `infer.py` CLI.
 
 **Options:**
- - `-m, --model`: Model path (default: `'.'`). Set this to `'jinaai/jina-vlm-v1'` if you are running this script outside this repo.
+ - `-m, --model`: Model path. Auto-detects local repo (if `config.json` exists) or falls back to `jinaai/jina-vlm-v1` from HuggingFace.
 - `-i, --image`: Image path, URL, or glob pattern (can specify multiple times, default: `[]`).
 - `-p, --prompt`: Text prompt (can specify multiple times, default: `'Describe the image for me in 100 words'` or `'Describe the images for me in 100 words'` if multiple images are provided).
 - `--max-crops`: Maximum crops (default: `12`).
@@ -200,49 +197,49 @@ You can directly chat with `jina-vlm-v1` using the `test_jvlm.py` CLI.
 
 ```bash
 # Single image
- python test_jvlm.py -i assets/jvlm_architecture.png -p "What's in this image?"
+ python infer.py -i assets/jvlm_architecture.png -p "What's in this image?"
 
 # Single image with streaming
- python test_jvlm.py -i assets/the_persistence_of_memory.jpg -p "What's in this image?" --stream
+ python infer.py -i assets/the_persistence_of_memory.jpg -p "What's in this image?" --stream
 
 # Remote image URL
- python test_jvlm.py -i https://picsum.photos/id/1025/800/600.jpg -p "Describe this image"
+ python infer.py -i https://picsum.photos/id/1025/800/600.jpg -p "Describe this image"
 
 # Multiple images (local and remote)
- python test_jvlm.py -i https://picsum.photos/id/1015/800/600.jpg -i https://picsum.photos/id/1016/800/600.jpg -i https://picsum.photos/id/1021/800/600.jpg -p "Describe these images"
+ python infer.py -i https://picsum.photos/id/1015/800/600.jpg -i https://picsum.photos/id/1016/800/600.jpg -i https://picsum.photos/id/1021/800/600.jpg -p "Describe these images"
 
 # Text only input
- python test_jvlm.py -p "How many planets are in our solar system?"
+ python infer.py -p "How many planets are in our solar system?"
 
 # Glob pattern support (quote patterns to prevent shell expansion)
- python test_jvlm.py -i "*.jpg" -p "Describe these images"
- python test_jvlm.py -i "photos/*.png" -i "images/*.jpg" -p "What do you see in these images?"
+ python infer.py -i "*.jpg" -p "Describe these images"
+ python infer.py -i "photos/*.png" -i "images/*.jpg" -p "What do you see in these images?"
 
 # Custom max crops, max pixels and max output tokens
 # Reducing max crops and max pixels speeds up inference and lowers mem consumption on large images
- python test_jvlm.py -i photo.jpg -p "Describe this picture in detail" --max-crops 8 --max-pixels 500000 --max-tokens 2048
+ python infer.py -i photo.jpg -p "Describe this picture in detail" --max-crops 8 --max-pixels 500000 --max-tokens 2048
 
 # Prompt position control
- python test_jvlm.py -i photo.jpg -p "What's in this image?" --prompt-first
+ python infer.py -i photo.jpg -p "What's in this image?" --prompt-first
 
 # Map mode: apply one prompt to multiple images
- python test_jvlm.py --map -i "*.jpg" -p "What is this?"
+ python infer.py --map -i "*.jpg" -p "What is this?"
 
 # Map mode: apply multiple prompts to one image
- python test_jvlm.py --map -i photo_of_a_dog.jpg -p "What breed?" -p "What color?" -p "Happy or sad?"
+ python infer.py --map -i photo_of_a_dog.jpg -p "What breed?" -p "What color?" -p "Happy or sad?"
 
 # Batch inference
 # When an equal number of images and prompts (>1) is provided, we assume it is batched inference
 # Generation will run in a batch if streaming is disabled, otherwise sequentially
- python test_jvlm.py -i photo1.jpg -p "What is shown in this image?" -i photo2.jpg -p "Describe this image"
+ python infer.py -i photo1.jpg -p "What is shown in this image?" -i photo2.jpg -p "Describe this image"
 
 # Similarly for no images and multiple prompts
- python test_jvlm.py -p "What is a neural network?" -p "Describe the concept of polymorphism in Computer Science"
+ python infer.py -p "What is a neural network?" -p "Describe the concept of polymorphism in Computer Science"
 ```
 
 Example input:
 ```bash
- python test_jvlm.py -m jinaai/jina-vlm-v1 -i assets/the_persistence_of_memory.jpg -p "Describe this picture"
+ python infer.py -i assets/the_persistence_of_memory.jpg -p "Describe this picture"
 ```
 <p align="center">
 <img src="./assets/the_persistence_of_memory.jpg">
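
With the `pyproject.toml` added in this commit, the CLI can also be launched through uv's managed environment; a minimal sketch, assuming the project was set up with `uv sync` as in the new README section (same asset and flags as the README examples):

```bash
# Run infer.py inside the environment created by `uv sync`
uv run python infer.py -i assets/the_persistence_of_memory.jpg -p "Describe this picture"
```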
test_jvlm.py → infer.py RENAMED
@@ -258,13 +258,14 @@ def test_jvlm():
 parser = argparse.ArgumentParser(
 description='jina-vlm-v1 vision-language model inference.'
 )
+ default_model = '.' if os.path.exists('./config.json') else 'jinaai/jina-vlm-v1'
 parser.add_argument(
 '-m',
 '--model',
- default='.',
+ default=default_model,
 help=(
- 'Model path (default: `"."`). Set this to `"jinaai/jina-vlm-v1"` if you '
- 'are running this script outside this repo.'
+ 'Model path. Auto-detects local repo (if config.json exists) or '
+ 'falls back to "jinaai/jina-vlm-v1" from HuggingFace.'
 ),
 )
 parser.add_argument(
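
In practice the new default means `-m` only needs to be set when the auto-detected behaviour is not what you want; a short sketch of the two invocation styles, reusing flags from the README examples:

```bash
# Inside a clone of this repo (config.json present): local weights are picked up automatically
python infer.py -i photo.jpg -p "What's in this image?"

# Anywhere else: the script falls back to jinaai/jina-vlm-v1 on the Hub,
# and an explicit path or ID can still be passed with -m
python infer.py -m jinaai/jina-vlm-v1 -i photo.jpg -p "What's in this image?"
```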
pyproject.toml ADDED
@@ -0,0 +1,18 @@
+ [project]
+ name = "jina-vlm-v1"
+ version = "1.0.0"
+ description = "Jina VLM v1: Lightweight Vision Language Alignment"
+ readme = "README.md"
+ license = "CC-BY-NC-4.0"
+ requires-python = ">=3.10"
+ dependencies = [
+ "torch>=2.9.0",
+ "torchvision>=0.24.0",
+ "transformers>=4.57.0",
+ "pillow>=12.0.0",
+ "einops>=0.8.1",
+ "accelerate>=1.0.0",
+ ]
+
+ [project.optional-dependencies]
+ flash-attn = ["flash-attn>=2.0.0"]
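
A hedged sketch of how this manifest is typically consumed, reusing the `uv sync --extra flash-attn` command from the README above; the import check is only an illustrative way to confirm the optional wheel landed in the project environment:

```bash
# Pull in the base dependencies plus the optional FlashAttention2 extra declared above
uv sync --extra flash-attn

# Sanity check: the import succeeds only if the flash-attn wheel installed correctly
uv run python -c "import flash_attn"
```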