gmastrapas committed
Commit 1da93af · verified · 1 Parent(s): 0d648f5

Model update

Files changed (5)
  1. README1.md +114 -0
  2. infer.py +499 -0
  3. infer_utils.py +247 -0
  4. modeling_jvlm.py +1 -1
  5. test_jvlm.py +504 -0
README1.md ADDED
@@ -0,0 +1,114 @@
+ # JVLM - Jina Vision Language Model
+
+ Minimal inference script for JVLM with streaming output and batch processing.
+
+ ## Quick Start
+
+ ```bash
+ python infer.py -i test_image.jpg -p "Describe the image"
+ ```
+
+ ## Requirements
+
+ ```bash
+ uv sync
+ ```
+
+ Optional extras:
+ ```bash
+ uv sync --extra accelerate   # recommended for automatic device selection
+ uv sync --extra tensorflow   # only needed for tensorflow resize methods
+ ```
+
+ ## Usage
+
+ ### CLI
+
+ ```bash
+ # Single image (streaming)
+ python infer.py -i photo.jpg -p "What's in this image?"
+
+ # Remote image URL
+ python infer.py -i https://example.com/image.jpg -p "Describe this"
+
+ # Multiple images (local and remote)
+ python infer.py -i img1.jpg -i https://example.com/img2.jpg -i img3.jpg -p "Compare these images"
+
+ # Glob pattern support (quote patterns to prevent shell expansion)
+ python infer.py -i "*.jpg" -p "Describe"
+ python infer.py -i "photos/*.png" -i "images/*.jpg" -p "What do you see?"
+
+ # Non-streaming
+ python infer.py -i photo.jpg -p "What's in this image?" --no-stream
+
+ # Custom model
+ python infer.py -m /path/to/model -i image.png -p "Describe the scene"
+
+ # Custom max tokens
+ python infer.py -i photo.jpg -p "Explain in detail" --max-tokens 2048
+
+ # Prompt position control
+ python infer.py -i photo.jpg -p "Describe" --prompt-first
+
+ # Map mode: apply one prompt to multiple images
+ python infer.py --map -i "*.jpg" -p "What is this?"
+
+ # Map mode: apply multiple prompts to one image
+ python infer.py --map -i photo.jpg -p "What breed?" -p "What color?" -p "Happy or sad?"
+ ```
+
+ **Options:**
+ - `-i, --image`: image path, URL, or glob pattern (can specify multiple times, default: test_image.jpg)
+ - `-p, --prompt`: text prompt (can specify multiple times with --map, default: "Describe the image for me in 100 words")
+ - `-m, --model`: model path (default: ".")
+ - `--max-tokens`: maximum output tokens (default: 1024)
+ - `--no-stream`: disable streaming (default: stream token-by-token)
+ - `--no-image-labels`: disable ordinal labels for multi-image inputs (default: enabled)
+ - `--prompt-first`: place prompt before images instead of after (may affect output quality)
+ - `--map`: map mode - apply single prompt to multiple images OR multiple prompts to single image
+
+ ### Python
+
+ ```python
+ from PIL import Image
+ import torch
+ from transformers import AutoModelForCausalLM, AutoProcessor
+
+ processor = AutoProcessor.from_pretrained(".", trust_remote_code=True, use_fast=False)
+ model = AutoModelForCausalLM.from_pretrained(
+     ".", trust_remote_code=True, dtype=torch.bfloat16,
+     device_map="auto"
+ )
+
+ device = next(model.parameters()).device
+
+ image = Image.open("test_image.jpg")
+ inputs = [{
+     'role': 'user',
+     'content': [{'type': 'image', 'image': image}, {'type': 'text', 'text': "Describe this"}]
+ }]
+
+ messages, images = processor.apply_chat_template(inputs, add_generation_prompt=True)
+ processed_inputs = processor(messages=messages, images=images)
+ batched_inputs = processor.collate([processed_inputs], max_sequence_length=4096)
+
+ # Move to device
+ device_inputs = {k: v.to(device) if isinstance(v, torch.Tensor) else v
+                  for k, v in batched_inputs.items()}
+
+ # Streaming generation
+ with torch.no_grad(), torch.autocast(device.type, dtype=torch.bfloat16):
+     for token_id in model.stream_generate(
+         input_ids=device_inputs['input_ids'],
+         images=device_inputs['images'],
+         image_masks=device_inputs['image_masks'],
+         image_input_idx=device_inputs['image_input_idx'],
+         max_new_tokens=256,
+     ):
+         text = processor.tokenizer.decode([token_id], skip_special_tokens=True)
+         print(text, end='', flush=True)
+ ```
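For non-streaming generation, here is a minimal sketch that reuses the `processor`, `model`, `device`, and `device_inputs` objects from the example above and mirrors the non-streaming branch of `infer_utils.py` in this commit:

```python
# Non-streaming sketch; assumes the objects built in the example above.
with torch.no_grad(), torch.autocast(device.type, dtype=torch.bfloat16):
    outputs = model.generate(
        input_ids=device_inputs['input_ids'],
        images=device_inputs['images'],
        image_masks=device_inputs['image_masks'],
        image_input_idx=device_inputs['image_input_idx'],
        max_new_tokens=256,
    )

# infer_utils.py reads the generated ids from outputs.token_ids
text = processor.tokenizer.decode(outputs.token_ids[0, 0], skip_special_tokens=True)
print(text)
```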
+
+ ## Notes
+
+ Streaming uses the `stream_generate()` method with a KV cache in `modeling_jvlm.py`.
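`infer_utils.py` additionally buffers the decoded text so that only complete words (or newline-terminated chunks) are flushed while streaming. A trimmed sketch of that loop, assuming the same `device_inputs` as in the Python example above and omitting the CJK-character branch:

```python
# Streaming with a decode buffer, adapted from infer_utils.py (CJK handling omitted).
token_cache, print_len, text = [], 0, ''
with torch.no_grad(), torch.autocast(device.type, dtype=torch.bfloat16):
    for token_id in model.stream_generate(
        input_ids=device_inputs['input_ids'],
        images=device_inputs['images'],
        image_masks=device_inputs['image_masks'],
        image_input_idx=device_inputs['image_input_idx'],
        max_new_tokens=256,
    ):
        token_cache.append(token_id)
        text = processor.tokenizer.decode(token_cache, skip_special_tokens=True)
        if text.endswith('\n'):
            # flush the full line and reset the buffer
            printable, token_cache, print_len = text[print_len:], [], 0
        else:
            # flush only up to the last space so partial words are never printed
            printable = text[print_len:text.rfind(' ') + 1]
            print_len += len(printable)
        print(printable, end='', flush=True)

# flush whatever is still buffered once generation ends
if print_len < len(text):
    print(text[print_len:], end='', flush=True)
print()
```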
infer.py ADDED
@@ -0,0 +1,499 @@
1
+ import argparse
2
+ import glob
3
+ import logging
4
+ import os
5
+ import sys
6
+ from time import perf_counter
7
+ from typing import Any, Dict, List, Optional, Tuple
8
+ from urllib.parse import urlparse
9
+
10
+ import torch
11
+ from transformers import (
12
+ AutoModelForCausalLM, AutoProcessor, GenerationConfig, TextStreamer
13
+ )
14
+ from transformers.utils import is_flash_attn_2_available
15
+
16
+
17
+ os.environ['TOKENIZERS_PARALLELISM'] = 'false'
18
+ TEST_IMAGE = './assets/the_persistence_of_memory.jpg'
19
+
20
+ log = logging.getLogger(__name__)
21
+
22
+
23
+ class Timer:
24
+ def __enter__(self):
25
+ self.start = perf_counter()
26
+ self.readout = None
27
+ return self
28
+
29
+ def __exit__(self, *_, **__):
30
+ self.time = perf_counter() - self.start
31
+ self.readout = f'{self.time:.3f}'
32
+
33
+
34
+ def _resolve_device_dtype_and_attn() -> Tuple[torch.device, torch.dtype, str]:
35
+ if torch.cuda.is_available():
36
+ device = torch.device('cuda')
37
+ if is_flash_attn_2_available():
38
+ dtype = torch.bfloat16
39
+ attn_implementation = 'flash_attention_2'
40
+ else:
41
+ dtype = torch.float16
42
+ attn_implementation = 'sdpa'
43
+ else:
44
+ if torch.backends.mps.is_available():
45
+ device = torch.device('mps')
46
+ else:
47
+ device = torch.device('cpu')
48
+ dtype = torch.float32
49
+ attn_implementation = 'sdpa'
50
+
51
+ return device, dtype, attn_implementation
52
+
53
+
54
+ def _build_conversations(
55
+ images: Optional[List[str]],
56
+ prompts: Optional[List[str]],
57
+ batched: bool = False,
58
+ prompt_first: bool = False,
59
+ image_labels: bool = False,
60
+ ):
61
+ def _is_url(_path: str) -> bool:
62
+ try:
63
+ result = urlparse(_path)
64
+ return result.scheme in ('http', 'https')
65
+ except Exception:
66
+ return False
67
+
68
+ images = images or []
69
+ expanded_image_paths = []
70
+ for path in images:
71
+ if _is_url(path):
72
+ expanded_image_paths.append(path)
73
+ elif any(char in path for char in ['*', '?', '[', ']']):
74
+ matched_files = glob.glob(path)
75
+ if matched_files:
76
+ expanded_image_paths.extend(sorted(matched_files))
77
+ else:
78
+ log.warning(f'No files matched pattern "{path}"')
79
+ else:
80
+ expanded_image_paths.append(path)
81
+ images = expanded_image_paths or [TEST_IMAGE]
82
+ n_images = len(images)
83
+
84
+ if prompts is None:
85
+ prompts = (
86
+ ['Describe the image in 100 words'] if n_images == 1 or batched else
87
+ ['Describe the images in 100 words']
88
+ )
89
+ n_prompts = len(prompts)
90
+
91
+ if n_images == 1 and n_prompts == 1:
92
+ examples = [([images[0]], prompts[0])]
93
+ elif batched:
94
+ if n_images > 1 and n_prompts == 1:
95
+ prompt = prompts[0]
96
+ log.info(f'Batch mode: Applying 1 prompt to {n_images} images')
97
+ examples = [([image], prompt) for image in images]
98
+ elif n_images == 1 and n_prompts > 1:
99
+ image = images[0]
100
+ log.info(f'\nBatch mode: Applying {n_prompts} prompts to 1 image')
101
+ examples = [([image], prompt) for prompt in prompts]
102
+ elif n_images > 1 and n_images == n_prompts:
103
+ log.info(f'\nBatch mode: Applying {n_prompts} prompts to {n_images} images')
104
+ examples = [([image], prompt) for image, prompt in zip(images, prompts)]
105
+ else:
106
+ log.error(
107
+ 'Batch mode requires either (multiple images + 1 prompt) or '
108
+ '(1 image + multiple prompts) or (multiple images + multiple prompts) '
109
+ 'with equal number of images and prompts. Got '
110
+ f'{n_images} images and {n_prompts} prompts'
111
+ )
112
+ sys.exit(1)
113
+ else:
114
+ if n_prompts > 1:
115
+ log.error(
116
+ 'Non-batch mode requires 1+ images and 1 prompt. Got '
117
+ f'{n_images} images and {n_prompts} prompts'
118
+ )
119
+ sys.exit(1)
120
+ examples = [(images, prompts[0])]
121
+
122
+ conversations = []
123
+ allimages = []
124
+ allprompts = []
125
+ ordinals = [
126
+ 'first', 'second', 'third', 'fourth', 'fifth',
127
+ 'sixth', 'seventh', 'eighth', 'ninth', 'tenth',
128
+ ]
129
+ for images, prompt in examples:
130
+ content = []
131
+ allimages.append(images)
132
+ allprompts.append(prompt)
133
+ if prompt_first:
134
+ content.append({'type': 'text', 'text': prompt})
135
+ if len(images) > 1 and image_labels:
136
+ for idx, img in enumerate(images):
137
+ ordinal = ordinals[idx] if idx < len(ordinals) else f'{idx+1}th'
138
+ image = images[idx]
139
+ descriptor = f'url: {image}'
140
+ if os.path.isfile(image):
141
+ descriptor = f'filename: {os.path.basename(image)}'
142
+ content.append({
143
+ 'type': 'text',
144
+ 'text': f'(this is the {ordinal} image, {descriptor})',
145
+ })
146
+ content.append({'type': 'image', 'image': img})
147
+ else:
148
+ content.extend([{'type': 'image', 'image': image} for image in images])
149
+ if not prompt_first:
150
+ content.append({'type': 'text', 'text': prompt})
151
+ conversations.append({'role': 'user', 'content': content})
152
+
153
+ return conversations, allimages, allprompts
154
+
155
+
156
+ def _token_usage_report(
157
+ inputs: Dict[str, Any],
158
+ images: List[Any],
159
+ max_sequence_length: int,
160
+ special_image_token_ids: Dict[str, int],
161
+ ):
162
+ """Report token usage statistics in tree format."""
163
+ n_images = len(images)
164
+ input_ids = inputs['input_ids']
165
+ attention_mask = inputs['attention_mask']
166
+
167
+ # Total tokens in sequence (non-padding)
168
+ total_tokens = attention_mask.sum().item()
169
+
170
+ # Count ALL image-related tokens directly from input_ids
171
+ image_patch_id = special_image_token_ids['image_patch_token_id']
172
+ image_start_id = special_image_token_ids['image_start_token_id']
173
+ image_end_id = special_image_token_ids['image_end_token_id']
174
+ image_col_id = special_image_token_ids['image_col_token_id']
175
+
176
+ num_patch = (input_ids[0] == image_patch_id).sum().item()
177
+ num_start = (input_ids[0] == image_start_id).sum().item()
178
+ num_end = (input_ids[0] == image_end_id).sum().item()
179
+ num_col = (input_ids[0] == image_col_id).sum().item()
180
+
181
+ # Total image tokens = all image-related special tokens
182
+ total_image_tokens = num_patch + num_start + num_end + num_col
183
+
184
+ # Pure text tokens (excluding all image-related tokens)
185
+ text_token_count = total_tokens - total_image_tokens
186
+
187
+ report = [
188
+ f'Input Context Window Layout (max: {max_sequence_length} tokens):',
189
+ f'├── Total: {total_tokens} tokens '
190
+ f'({((total_tokens / max_sequence_length) * 100):.1f}%)',
191
+ ]
192
+ # Count tokens per image by finding img_start and img_end boundaries
193
+ # Each image is delimited by img_start and img_end tokens
194
+ tokens_per_image_list = []
195
+
196
+ # Find all img_start and img_end positions in input_ids
197
+ start_positions = (input_ids[0] == image_start_id).nonzero(
198
+ as_tuple=True
199
+ )[0].tolist()
200
+ end_positions = (input_ids[0] == image_end_id).nonzero(as_tuple=True)[0].tolist()
201
+
202
+ if len(start_positions) > 0 and len(end_positions) > 0:
203
+ # Each image typically has 2 start and 2 end tokens
204
+ # Determine actual number of images in context
205
+ n_starts_per_image = 2 # typical case
206
+ n_images_in_context = len(start_positions) // n_starts_per_image
207
+
208
+ # Warn if not all images fit in context
209
+ if n_images_in_context < n_images:
210
+ log.warning(
211
+ f'Only {n_images_in_context}/{n_images} images fit in context window'
212
+ )
213
+
214
+ for idx in range(n_images):
215
+ if idx < n_images_in_context:
216
+ # Get the start and end indices for this image
217
+ start_idx_begin = idx * n_starts_per_image
218
+ end_idx_end = (idx + 1) * n_starts_per_image
219
+ if (
220
+ start_idx_begin < len(start_positions) and
221
+ end_idx_end <= len(end_positions)
222
+ ):
223
+ # First start position and last end position define the image span
224
+ first_start = start_positions[start_idx_begin]
225
+ last_end = end_positions[end_idx_end - 1]
226
+ # Count tokens from first start to last end (inclusive)
227
+ num_tokens = last_end - first_start + 1
228
+ tokens_per_image_list.append(num_tokens)
229
+ else:
230
+ tokens_per_image_list.append(0)
231
+ else:
232
+ # Image didn't fit in context
233
+ tokens_per_image_list.append(0)
234
+ else:
235
+ # Fallback to uniform division if we can't find boundaries
236
+ tokens_per_image = total_image_tokens // n_images if n_images > 0 else 0
237
+ tokens_per_image_list = [tokens_per_image] * n_images
238
+
239
+ for idx in range(n_images):
240
+ img = images[idx]
241
+ n_tokens = tokens_per_image_list[idx] if idx < len(tokens_per_image_list) else 0
242
+ pct = (n_tokens / max_sequence_length * 100)
243
+ report.append(
244
+ f'├── Image {idx + 1}: {img.width}x{img.height} → {n_tokens} '
245
+ f'tokens ({pct:.1f}%)'
246
+ )
247
+
248
+ text_pct = (text_token_count / max_sequence_length * 100)
249
+ report.append(f'└── Text: {text_token_count} tokens ({text_pct:.1f}%)')
250
+
251
+ return '\n'.join(report)
252
+
253
+
254
+ def test_jvlm():
255
+ parser = argparse.ArgumentParser(
256
+ description='jina-vlm-v1 vision-language model inference.'
257
+ )
258
+ parser.add_argument(
259
+ '-m',
260
+ '--model',
261
+ default='.',
262
+ help=(
263
+ 'Model path. Set this to "jinaai/jina-vlm-v1" if you are running this '
264
+ 'script outside this repo.'
265
+ )
266
+ )
267
+ parser.add_argument(
268
+ '-i',
269
+ '--image',
270
+ action='append',
271
+ help='Image path or glob pattern (can specify multiple times, e.g., "*.jpg").'
272
+ )
273
+ parser.add_argument(
274
+ '-p',
275
+ '--prompt',
276
+ action='append',
277
+ help='Text prompt (can specify multiple times with --map).',
278
+ )
279
+ parser.add_argument(
280
+ '--max-crops',
281
+ type=int,
282
+ default=12,
283
+ help='Maximum crops (default: 12).',
284
+ )
285
+ parser.add_argument(
286
+ '--max-tokens',
287
+ type=int,
288
+ default=1024,
289
+ help='Maximum output tokens (default: 1024).',
290
+ )
291
+ parser.add_argument(
292
+ '--max-pixels',
293
+ type=int,
294
+ default=None,
295
+ help=(
296
+ 'Max pixels per image, bigger images are resized and the aspect ratio is '
297
+ 'preserved (default: None).'
298
+ ),
299
+ )
300
+ parser.add_argument(
301
+ '--no-stream',
302
+ action='store_true',
303
+ help='Disable streaming (default: stream token-by-token)',
304
+ )
305
+ parser.add_argument(
306
+ '--image-labels',
307
+ action='store_true',
308
+ help=(
309
+ 'Enable ordinal text labels after each image '
310
+ '(default: no image labels for multi-image)'
311
+ ),
312
+ )
313
+ parser.add_argument(
314
+ '--prompt-first',
315
+ action='store_true',
316
+ help=(
317
+ 'Place prompt before images instead of after (default: prompt after images)'
318
+ ),
319
+ )
320
+ parser.add_argument(
321
+ '--batched',
322
+ action='store_true',
323
+ help=(
324
+ 'Batch mode: apply single prompt to multiple images (or single image to '
325
+ 'multiple prompts) with KV cache reuse.'
326
+ ),
327
+ )
328
+ args = parser.parse_args()
329
+
330
+ print('Welcome to the jinaai/jina-vlm-v1 playground ✨')
331
+ print('Use this script to test our model!')
332
+ print('- Jina AI')
333
+ print()
334
+
335
+ print('Model path: ', args.model)
336
+ print('Loading the processor ...')
337
+ processor = AutoProcessor.from_pretrained(
338
+ args.model, trust_remote_code=True, use_fast=False,
339
+ )
340
+ print('Done ✅')
341
+ print()
342
+
343
+ print('Specifying device, dtype and attention implementation ...')
344
+ device, dtype, attn_implementation = _resolve_device_dtype_and_attn()
345
+ print(f'Using attention implementation: {attn_implementation}')
346
+ print(f'Using device: {device}')
347
+ print(f'Using dtype: {dtype}')
348
+ print()
349
+
350
+ print('Loading the model ...')
351
+ model = AutoModelForCausalLM.from_pretrained(
352
+ args.model,
353
+ trust_remote_code=True,
354
+ dtype=dtype,
355
+ low_cpu_mem_usage=True,
356
+ device_map=device.type,
357
+ attn_implementation=attn_implementation,
358
+ )
359
+ max_sequence_length = getattr(model.config, 'max_sequence_length', 40960)
360
+ n_params = sum(p.numel() for p in model.parameters())
361
+ print(f'Max sequence length: {max_sequence_length}')
362
+ print(f'Number of parameters: {n_params}')
363
+ print('Done ✅')
364
+ print()
365
+
366
+ print('Let\'s create some conversations ...')
367
+ conversations, images, prompts = _build_conversations(
368
+ args.image,
369
+ args.prompt,
370
+ batched=args.batched,
371
+ prompt_first=args.prompt_first,
372
+ image_labels=args.image_labels
373
+ )
374
+ n_conversations = len(conversations)
375
+ print(f'Built {n_conversations} conversations 🚀')
376
+ print()
377
+
378
+ print('Transforming conversations to numbers ...')
379
+ timer = Timer()
380
+ with timer:
381
+ texts = processor.apply_chat_template(conversations, add_generation_prompt=True)
382
+ inputs = processor(
383
+ text=texts,
384
+ images=images,
385
+ padding='longest',
386
+ max_length=max_sequence_length,
387
+ max_crops=args.max_crops,
388
+ max_pixels=args.max_pixels,
389
+ do_resize=True if args.max_pixels is not None else False,
390
+ return_tensors='pt',
391
+ )
392
+ device_inputs = {}
393
+ for k, v in inputs.items():
394
+ if k == 'labels':
395
+ continue
396
+ if isinstance(v, torch.Tensor):
397
+ if v.is_floating_point():
398
+ device_inputs[k] = v.to(device, dtype=dtype, non_blocking=True)
399
+ else:
400
+ device_inputs[k] = v.to(device, non_blocking=True)
401
+ else:
402
+ device_inputs[k] = v
403
+
404
+ processing_time = timer.readout
405
+ special_image_token_ids = {
406
+ 'image_patch_token_id': processor.image_processor.image_patch_token_id,
407
+ 'image_start_token_id': processor.image_processor.image_start_token_id,
408
+ 'image_end_token_id': processor.image_processor.image_end_token_id,
409
+ 'image_col_token_id': processor.image_processor.image_col_token_id,
410
+ }
411
+ token_usage_reports = []
412
+ for idx in range(n_conversations):
413
+ ith_inputs = {k: v[idx] for k, v in inputs.items()}
414
+ token_usage_report = _token_usage_report(
415
+ ith_inputs,
416
+ images[idx],
417
+ max_sequence_length=max_sequence_length,
418
+ special_image_token_ids=special_image_token_ids,
419
+ )
420
+ token_usage_reports.append(token_usage_report)
421
+ print(f'Processed {n_conversations} conversations in {processing_time}s')
422
+ print('All done 🪄')
423
+ print()
424
+
425
+ print('Running inference ...')
426
+ generated_tokens = 0
427
+ input_prompts = inputs['input_ids']
428
+
429
+ if args.no_stream:
430
+ print('Non-streaming mode')
431
+ print('Inference will run in a batch')
432
+ print()
433
+
434
+ with (
435
+ timer,
436
+ torch.no_grad(),
437
+ torch.autocast(device.type, enabled=(device.type != 'mps'), dtype=dtype),
438
+ ):
439
+ output = model.generate(
440
+ **device_inputs,
441
+ generation_config=GenerationConfig(
442
+ max_new_tokens=args.max_tokens, do_sample=False,
443
+ ),
444
+ return_dict_in_generate=True,
+ )
445
+ generation_time, generation_readout = timer.time, timer.readout
446
+
447
+ for idx in range(n_conversations):
448
+ out = output.sequences[idx][len(input_prompts[idx].tolist()):]
449
+ generated_tokens += len(out)
450
+ response = processor.tokenizer.decode(out, skip_special_tokens=True)
451
+ print(f'Conversation {idx + 1}/{n_conversations}')
452
+ print(f'├── 🖼️Images: {images[idx]}')
453
+ print(f'├── 📜Prompt: {prompts[idx]}')
454
+ print(f'├── 💬Chat: {texts[idx]}')
455
+ print(f'└── 🧠Response: {response}')
456
+ print('Token usage report:')
457
+ print(token_usage_reports[idx])
458
+ print()
459
+ else:
460
+ print('Streaming mode')
461
+ print('Inference will run sequentially')
462
+ print()
463
+
464
+ streamer = TextStreamer(processor.tokenizer, skip_prompt=True, skip_special_tokens=True)
465
+ for idx in range(n_conversations):
466
+ print(f'Conversation {idx + 1}/{n_conversations}')
467
+ print(f'├── 🖼️Images: {images[idx]}')
468
+ print(f'├── 📜Prompt: {prompts[idx]}')
469
+ print(f'├── 💬Chat: {texts[idx]}')
470
+ print(f'└── 🧠Response: ')
471
+ ith_inputs = {k: v[idx].unsqueeze(0) for k, v in device_inputs.items()}
472
+ with (
473
+ timer,
474
+ torch.no_grad(),
475
+ torch.autocast(device.type, enabled=(device.type != 'mps'), dtype=dtype)
476
+ ):
477
+ output = model.generate(
478
+ **ith_inputs,
479
+ streamer=streamer,
480
+ generation_config=GenerationConfig(
481
+ max_new_tokens=args.max_tokens, do_sample=False,
482
+ ),
483
+ return_dict_in_generate=True,
+ )
484
+ out = output.sequences[0][len(input_prompts[idx].tolist()):]
485
+ generated_tokens += len(out)
486
+ print('Token usage report:')
487
+ print(token_usage_reports[idx])
488
+ print()
489
+
490
+ generation_time, generation_readout = timer.time, timer.readout
491
+
492
+ res_per_sec = n_conversations / generation_time if generation_time > 0 else 0
493
+ tok_per_sec = generated_tokens / generation_time if generation_time > 0 else 0
494
+ print('Done ✅')
495
+ print(f'Generated {n_conversations} responses in {generation_readout}s')
496
+ print(f'{res_per_sec:.2f} res/s {tok_per_sec:.2f} tok/s')
497
+
498
+ if __name__ == '__main__':
499
+ test_jvlm()
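For reference, the flags defined by the argument parser in this script are `--batched`, `--image-labels`, `--max-crops`, and `--max-pixels` (README1.md above refers to `--map` and `--no-image-labels` instead). Example invocations based on the definitions above; the image paths are placeholders:

```bash
# one prompt applied to every matching image as a batch
python infer.py --batched -i "assets/*.jpg" -p "What is this?"

# several prompts against a single image as a batch
python infer.py --batched -i photo.jpg -p "What breed?" -p "What color?"

# multiple images in one conversation, with ordinal image labels
python infer.py -i a.jpg -i b.jpg -p "Compare these images" --image-labels

# cap the number of crops and resize very large images
python infer.py -i scan.png -p "Transcribe the text" --max-crops 8 --max-pixels 1000000
```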
infer_utils.py ADDED
@@ -0,0 +1,247 @@
1
+ import os
2
+ import tempfile
3
+ import urllib.request
4
+ from urllib.parse import urlparse
5
+ import time
6
+ import torch
7
+
8
+
9
+ def is_url(path):
10
+ """Check if a path is a URL"""
11
+ try:
12
+ result = urlparse(path)
13
+ return result.scheme in ("http", "https")
14
+ except Exception:
15
+ return False
16
+
17
+
18
+ def download_image(url):
19
+ """Download image from URL to temporary file"""
20
+ try:
21
+ # Create temp file with proper extension
22
+ parsed = urlparse(url)
23
+ ext = os.path.splitext(parsed.path)[1] or ".jpg"
24
+ temp_file = tempfile.NamedTemporaryFile(delete=False, suffix=ext)
25
+ temp_path = temp_file.name
26
+ temp_file.close()
27
+
28
+ # Download image
29
+ urllib.request.urlretrieve(url, temp_path)
30
+ print(f"Downloaded image from: {url}")
31
+ return temp_path
32
+ except Exception as e:
33
+ raise RuntimeError(f"Failed to download image from {url}: {e}")
34
+
35
+
36
+ def print_token_stats(batched_inputs, images_list, model, processor):
37
+ """Print token usage statistics in tree format.
38
+
39
+ Comment out the call to this function if you don't want to see the stats.
40
+ """
41
+ input_ids = batched_inputs['input_ids']
42
+ max_ctx_len = model.config.max_sequence_length
43
+ image_input_idx = batched_inputs.get('image_input_idx')
44
+
45
+ # Total tokens in sequence (non-padding)
46
+ valid_mask = input_ids[0] != -1
47
+ total_tokens = valid_mask.sum().item()
48
+
49
+ # Count ALL image-related tokens directly from input_ids
50
+ image_patch_id = processor.image_preprocessor.image_patch_token_id
51
+ image_start_id = processor.image_preprocessor.image_start_token_id
52
+ image_end_id = processor.image_preprocessor.image_end_token_id
53
+ image_col_id = processor.image_preprocessor.image_col_token_id
54
+
55
+ num_patch = (input_ids[0] == image_patch_id).sum().item()
56
+ num_start = (input_ids[0] == image_start_id).sum().item()
57
+ num_end = (input_ids[0] == image_end_id).sum().item()
58
+ num_col = (input_ids[0] == image_col_id).sum().item()
59
+
60
+ # Total image tokens = all image-related special tokens
61
+ total_image_tokens = num_patch + num_start + num_end + num_col
62
+
63
+ # Pure text tokens (excluding all image-related tokens)
64
+ text_token_count = total_tokens - total_image_tokens
65
+
66
+ print("Input Context Window Layout (max: {} tokens):".format(max_ctx_len))
67
+ print("└── Total: {} tokens ({:.1f}%)".format(
68
+ total_tokens, (total_tokens / max_ctx_len) * 100))
69
+
70
+ # Count tokens per image by finding img_start and img_end boundaries
71
+ # Each image is delimited by img_start and img_end tokens
72
+ tokens_per_image_list = []
73
+
74
+ # Find all img_start and img_end positions in input_ids
75
+ start_positions = (input_ids[0] == image_start_id).nonzero(as_tuple=True)[0].tolist()
76
+ end_positions = (input_ids[0] == image_end_id).nonzero(as_tuple=True)[0].tolist()
77
+
78
+ if len(start_positions) > 0 and len(end_positions) > 0:
79
+ # Each image typically has 2 start and 2 end tokens
80
+ # Determine actual number of images in context
81
+ num_starts_per_image = 2 # typical case
82
+ num_images_in_context = len(start_positions) // num_starts_per_image
83
+
84
+ # Warn if not all images fit in context
85
+ if num_images_in_context < len(images_list):
86
+ print(f"Warning: Only {num_images_in_context}/{len(images_list)} images fit in context window")
87
+
88
+ for img_idx in range(len(images_list)):
89
+ if img_idx < num_images_in_context:
90
+ # Get the start and end indices for this image
91
+ start_idx_begin = img_idx * num_starts_per_image
92
+ start_idx_end = (img_idx + 1) * num_starts_per_image
93
+ end_idx_begin = img_idx * num_starts_per_image
94
+ end_idx_end = (img_idx + 1) * num_starts_per_image
95
+
96
+ if start_idx_begin < len(start_positions) and end_idx_end <= len(end_positions):
97
+ # First start position and last end position define the image span
98
+ first_start = start_positions[start_idx_begin]
99
+ last_end = end_positions[end_idx_end - 1]
100
+ # Count tokens from first start to last end (inclusive)
101
+ num_tokens = last_end - first_start + 1
102
+ tokens_per_image_list.append(num_tokens)
103
+ else:
104
+ tokens_per_image_list.append(0)
105
+ else:
106
+ # Image didn't fit in context
107
+ tokens_per_image_list.append(0)
108
+ else:
109
+ # Fallback to uniform division if we can't find boundaries
110
+ tokens_per_image = total_image_tokens // len(images_list) if len(images_list) > 0 else 0
111
+ tokens_per_image_list = [tokens_per_image] * len(images_list)
112
+
113
+ for img_idx in range(len(images_list)):
114
+ img = images_list[img_idx]
115
+ num_tokens = tokens_per_image_list[img_idx] if img_idx < len(tokens_per_image_list) else 0
116
+ pct = (num_tokens / max_ctx_len * 100)
117
+ if img_idx < len(images_list) - 1:
118
+ print(" ├��─ Image {}: {}x{} → {} tokens ({:.1f}%)".format(
119
+ img_idx + 1, img.width, img.height, num_tokens, pct))
120
+ else:
121
+ print(" ├── Image {}: {}x{} → {} tokens ({:.1f}%)".format(
122
+ img_idx + 1, img.width, img.height, num_tokens, pct))
123
+
124
+ # Show text last
125
+ text_pct = (text_token_count / max_ctx_len * 100)
126
+ print(" └── Text: {} tokens ({:.1f}%)".format(text_token_count, text_pct))
127
+ print()
128
+
129
+
130
+ def is_chinese_char(cp):
131
+ """Check if character is CJK"""
132
+ if (
133
+ (cp >= 0x4E00 and cp <= 0x9FFF)
134
+ or (cp >= 0x3400 and cp <= 0x4DBF)
135
+ or (cp >= 0x20000 and cp <= 0x2A6DF)
136
+ or (cp >= 0x2A700 and cp <= 0x2B73F)
137
+ or (cp >= 0x2B740 and cp <= 0x2B81F)
138
+ or (cp >= 0x2B820 and cp <= 0x2CEAF)
139
+ or (cp >= 0xF900 and cp <= 0xFAFF)
140
+ or (cp >= 0x2F800 and cp <= 0x2FA1F)
141
+ ):
142
+ return True
143
+ return False
144
+
145
+
146
+ def build_content(images, prompt, prompt_first):
147
+ """Build content list with proper ordering."""
148
+ content = []
149
+ if prompt_first:
150
+ content.append({"type": "text", "text": prompt})
151
+ content.extend([{"type": "image", "image": img} for img in images])
152
+ else:
153
+ content.extend([{"type": "image", "image": img} for img in images])
154
+ content.append({"type": "text", "text": prompt})
155
+ return content
156
+
157
+
158
+ def generate_single(model, processor, content, args, device, prefer_mps, max_crops=12):
159
+ """
160
+ Generate output for a single image-prompt pair.
161
+
162
+ Returns:
163
+ text: Generated text
164
+ elapsed_time: Time taken
165
+ num_tokens: Number of tokens generated
166
+ """
167
+ inputs = [{"role": "user", "content": content}]
168
+ messages, images = processor.apply_chat_template(inputs, add_generation_prompt=True)
169
+ processed_inputs = processor(messages=messages, images=images)
170
+
171
+ # Use model's max sequence length from config
172
+ max_seq_len = getattr(model.config, 'max_sequence_length', 40960)
173
+ batched_inputs = processor.collate(
174
+ [processed_inputs], max_sequence_length=max_seq_len, max_crops=max_crops * len(images)
175
+ )
176
+
177
+ device_inputs = {}
178
+ for k, v in batched_inputs.items():
179
+ if isinstance(v, torch.Tensor):
180
+ if prefer_mps and v.is_floating_point():
181
+ device_inputs[k] = v.to(device, dtype=torch.float32, non_blocking=True)
182
+ else:
183
+ device_inputs[k] = v.to(device, non_blocking=True)
184
+ else:
185
+ device_inputs[k] = v
186
+
187
+ with (
188
+ torch.no_grad(),
189
+ torch.autocast(device, enabled=(device != "mps"), dtype=torch.bfloat16),
190
+ ):
191
+ if args.no_stream:
192
+ start_time = time.time()
193
+ outputs = model.generate(
194
+ input_ids=device_inputs["input_ids"],
195
+ images=device_inputs.get("images"),
196
+ image_masks=device_inputs.get("image_masks"),
197
+ image_input_idx=device_inputs["image_input_idx"],
198
+ max_new_tokens=args.max_tokens,
199
+ )
200
+ elapsed_time = time.time() - start_time
201
+ text = processor.tokenizer.decode(
202
+ outputs.token_ids[0, 0], skip_special_tokens=True
203
+ )
204
+ num_tokens = len(outputs.token_ids[0, 0])
205
+
206
+ print(text)
207
+ else:
208
+ # Streaming mode
209
+ token_cache = []
210
+ print_len = 0
211
+ token_count = 0
212
+ start_time = time.time()
213
+ for token_id in model.stream_generate(
214
+ input_ids=device_inputs["input_ids"],
215
+ position_ids=device_inputs.get("position_ids"),
216
+ images=device_inputs.get("images"),
217
+ image_masks=device_inputs.get("image_masks"),
218
+ image_input_idx=device_inputs["image_input_idx"],
219
+ max_new_tokens=args.max_tokens,
220
+ ):
221
+ token_cache.append(token_id)
222
+ token_count += 1
223
+ text = processor.tokenizer.decode(token_cache, skip_special_tokens=True)
224
+
225
+ if text.endswith("\n"):
226
+ printable_text = text[print_len:]
227
+ token_cache = []
228
+ print_len = 0
229
+ elif len(text) > 0 and is_chinese_char(ord(text[-1])):
230
+ printable_text = text[print_len:]
231
+ print_len += len(printable_text)
232
+ else:
233
+ printable_text = text[print_len : text.rfind(" ") + 1]
234
+ print_len += len(printable_text)
235
+
236
+ print(printable_text, end="", flush=True)
237
+
238
+ elapsed_time = time.time() - start_time
239
+ if print_len < len(text):
240
+ print(text[print_len:], end="", flush=True)
241
+ print()
242
+ num_tokens = token_count
243
+
244
+ tok_per_sec = num_tokens / elapsed_time if elapsed_time > 0 else 0
245
+ print(f"{tok_per_sec:.2f} tok/s")
246
+
247
+ return text, elapsed_time, num_tokens
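A hypothetical end-to-end use of the helpers above; the image sources are placeholders, the model is loaded from the current directory as in README1.md, and `device` is passed as a plain string because `generate_single()` compares it against `"mps"`:

```python
from argparse import Namespace

import torch
from PIL import Image
from transformers import AutoModelForCausalLM, AutoProcessor

# Load the model the same way the README's Python example does.
device = 'cuda' if torch.cuda.is_available() else 'cpu'
processor = AutoProcessor.from_pretrained('.', trust_remote_code=True, use_fast=False)
model = AutoModelForCausalLM.from_pretrained(
    '.', trust_remote_code=True, dtype=torch.bfloat16, device_map=device
)

# Placeholder sources: one remote URL, one local file.
sources = ['https://example.com/cat.jpg', 'photo.jpg']
paths = [download_image(s) if is_url(s) else s for s in sources]
images = [Image.open(p) for p in paths]

args = Namespace(no_stream=False, max_tokens=256)   # stands in for the CLI namespace
content = build_content(images, 'Compare these images', prompt_first=False)
text, elapsed, n_tokens = generate_single(
    model, processor, content, args, device, prefer_mps=False, max_crops=12
)
print(f'{n_tokens} tokens in {elapsed:.2f}s')
```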
modeling_jvlm.py CHANGED
@@ -492,7 +492,7 @@ class JinaVLM(JinaPreTrainedModel):
         **kwargs: Unpack[FlashAttentionKwargs],
     ) -> BaseModelOutputWithPast:
         image_features = None
-        if images is not None:
+        if images is not None and images.shape[1] > 0:
             image_out = self.vision_model(images, image_masks)
             image_features = image_out.last_hidden_state
         return self.language_model(
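The new guard skips the vision tower when the batch carries an images tensor with zero crops (for example, a text-only conversation). A small illustration of the condition, with assumed, purely illustrative shapes:

```python
import torch

# (batch, n_crops, ...) with zero crops: nothing for the vision tower to encode.
images = torch.empty(1, 0, 588)

if images is not None and images.shape[1] > 0:
    print('run the vision model')
else:
    print('skip the vision model; image_features stays None')  # this branch runs
```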
test_jvlm.py ADDED
@@ -0,0 +1,504 @@
1
+ import argparse
2
+ import glob
3
+ import logging
4
+ import os
5
+ import sys
6
+ from time import perf_counter
7
+ from typing import Any, Dict, List, Optional, Tuple
8
+ from urllib.parse import urlparse
9
+
10
+ import torch
11
+ from transformers import (
12
+ AutoModelForCausalLM, AutoProcessor, GenerationConfig, TextStreamer
13
+ )
14
+ from transformers.utils import is_flash_attn_2_available
15
+
16
+
17
+ os.environ['TOKENIZERS_PARALLELISM'] = 'false'
18
+ TEST_IMAGE = './assets/the_persistence_of_memory.jpg'
19
+
20
+ log = logging.getLogger(__name__)
21
+
22
+
23
+ class Timer:
24
+ def __enter__(self):
25
+ self.start = perf_counter()
26
+ self.readout = None
27
+ return self
28
+
29
+ def __exit__(self, *_, **__):
30
+ self.time = perf_counter() - self.start
31
+ self.readout = f'{self.time:.3f}'
32
+
33
+
34
+ def _resolve_device_dtype_and_attn() -> Tuple[torch.device, torch.dtype, str]:
35
+ if torch.cuda.is_available():
36
+ device = torch.device('cuda')
37
+ if is_flash_attn_2_available():
38
+ dtype = torch.bfloat16
39
+ attn_implementation = 'flash_attention_2'
40
+ else:
41
+ dtype = torch.float16
42
+ attn_implementation = 'sdpa'
43
+ else:
44
+ if torch.backends.mps.is_available():
45
+ device = torch.device('mps')
46
+ else:
47
+ device = torch.device('cpu')
48
+ dtype = torch.float32
49
+ attn_implementation = 'sdpa'
50
+
51
+ return device, dtype, attn_implementation
52
+
53
+
54
+ def _build_conversations(
55
+ images: Optional[List[str]],
56
+ prompts: Optional[List[str]],
57
+ batched: bool = False,
58
+ prompt_first: bool = False,
59
+ image_labels: bool = False,
60
+ ):
61
+ def _is_url(_path: str) -> bool:
62
+ try:
63
+ result = urlparse(_path)
64
+ return result.scheme in ('http', 'https')
65
+ except Exception:
66
+ return False
67
+
68
+ images = images or []
69
+ expanded_image_paths = []
70
+ for path in images:
71
+ if _is_url(path):
72
+ expanded_image_paths.append(path)
73
+ elif any(char in path for char in ['*', '?', '[', ']']):
74
+ matched_files = glob.glob(path)
75
+ if matched_files:
76
+ expanded_image_paths.extend(sorted(matched_files))
77
+ else:
78
+ log.warning(f'No files matched pattern "{path}"')
79
+ else:
80
+ expanded_image_paths.append(path)
81
+ images = expanded_image_paths
82
+ n_images = len(images)
83
+ if prompts is None:
84
+ if len(images) == 0:
85
+ images = [TEST_IMAGE]
86
+ n_images = len(images)
87
+ prompts = (
88
+ ['Describe the image in 100 words'] if n_images == 1 or batched else
89
+ ['Describe the images in 100 words']
90
+ )
91
+ n_prompts = len(prompts)
92
+
93
+ if n_images == 0:
94
+ examples = [([], prompt) for prompt in prompts]
95
+ elif n_images == 1 and n_prompts == 1:
96
+ examples = [([images[0]], prompts[0])]
97
+ elif batched:
98
+ if n_images > 1 and n_prompts == 1:
99
+ prompt = prompts[0]
100
+ log.info(f'Batch mode: Applying 1 prompt to {n_images} images')
101
+ examples = [([image], prompt) for image in images]
102
+ elif n_images == 1 and n_prompts > 1:
103
+ image = images[0]
104
+ log.info(f'\nBatch mode: Applying {n_prompts} prompts to 1 image')
105
+ examples = [([image], prompt) for prompt in prompts]
106
+ elif n_images > 1 and n_images == n_prompts:
107
+ log.info(f'\nBatch mode: Applying {n_prompts} prompts to {n_images} images')
108
+ examples = [([image], prompt) for image, prompt in zip(images, prompts)]
109
+ else:
110
+ log.error(
111
+ 'Batch mode requires either (multiple images + 1 prompt) or '
112
+ '(1 image + multiple prompts) or (multiple images + multiple prompts) '
113
+ 'with equal number of images and prompts. Got '
114
+ f'{n_images} images and {n_prompts} prompts'
115
+ )
116
+ sys.exit(1)
117
+ else:
118
+ if n_prompts > 1:
119
+ log.error(
120
+ 'Non-batch mode requires 1+ images and 1 prompt. Got '
121
+ f'{n_images} images and {n_prompts} prompts'
122
+ )
123
+ sys.exit(1)
124
+ examples = [(images, prompts[0])]
125
+
126
+ conversations = []
127
+ allimages = []
128
+ allprompts = []
129
+ ordinals = [
130
+ 'first', 'second', 'third', 'fourth', 'fifth',
131
+ 'sixth', 'seventh', 'eighth', 'ninth', 'tenth',
132
+ ]
133
+ for images, prompt in examples:
134
+ content = []
135
+ allimages.append(images)
136
+ allprompts.append(prompt)
137
+ if prompt_first:
138
+ content.append({'type': 'text', 'text': prompt})
139
+ if len(images) > 1 and image_labels:
140
+ for idx, img in enumerate(images):
141
+ ordinal = ordinals[idx] if idx < len(ordinals) else f'{idx+1}th'
142
+ image = images[idx]
143
+ descriptor = f'url: {image}'
144
+ if os.path.isfile(image):
145
+ descriptor = f'filename: {os.path.basename(image)}'
146
+ content.append({
147
+ 'type': 'text',
148
+ 'text': f'(this is the {ordinal} image, {descriptor})',
149
+ })
150
+ content.append({'type': 'image', 'image': img})
151
+ else:
152
+ content.extend([{'type': 'image', 'image': image} for image in images])
153
+ if not prompt_first:
154
+ content.append({'type': 'text', 'text': prompt})
155
+ conversations.append([{'role': 'user', 'content': content}])
156
+
157
+ return conversations, allimages, allprompts
158
+
159
+
160
+ def _token_usage_report(
161
+ inputs: Dict[str, Any],
162
+ n_images: int,
163
+ max_sequence_length: int,
164
+ special_image_token_ids: Dict[str, int],
165
+ ):
166
+ """Report token usage statistics in tree format."""
167
+ input_ids = inputs['input_ids']
168
+ attention_mask = inputs['attention_mask']
169
+
170
+ # Total tokens in sequence (non-padding)
171
+ total_tokens = attention_mask.sum().item()
172
+
173
+ # Count ALL image-related tokens directly from input_ids
174
+ image_patch_id = special_image_token_ids['image_patch_token_id']
175
+ image_start_id = special_image_token_ids['image_start_token_id']
176
+ image_end_id = special_image_token_ids['image_end_token_id']
177
+ image_column_token_id = special_image_token_ids['image_column_token_id']
178
+
179
+ num_patch = (input_ids == image_patch_id).sum().item()
180
+ num_start = (input_ids == image_start_id).sum().item()
181
+ num_end = (input_ids == image_end_id).sum().item()
182
+ num_col = (input_ids == image_column_token_id).sum().item()
183
+
184
+ # Total image tokens = all image-related special tokens
185
+ total_image_tokens = num_patch + num_start + num_end + num_col
186
+
187
+ # Pure text tokens (excluding all image-related tokens)
188
+ text_token_count = total_tokens - total_image_tokens
189
+
190
+ report = [
191
+ f'Input Context Window Layout (max: {max_sequence_length} tokens):',
192
+ f'├── Total: {total_tokens} tokens '
193
+ f'({((total_tokens / max_sequence_length) * 100):.1f}%)',
194
+ ]
195
+ # Count tokens per image by finding img_start and img_end boundaries
196
+ # Each image is delimited by img_start and img_end tokens
197
+ tokens_per_image_list = []
198
+
199
+ # Find all img_start and img_end positions in input_ids
200
+ start_positions = (input_ids == image_start_id).nonzero(
201
+ as_tuple=True
202
+ )[0].tolist()
203
+ end_positions = (input_ids == image_end_id).nonzero(as_tuple=True)[0].tolist()
204
+
205
+ if len(start_positions) > 0 and len(end_positions) > 0:
206
+ # Each image typically has 2 start and 2 end tokens
207
+ # Determine actual number of images in context
208
+ n_starts_per_image = 2 # typical case
209
+ n_images_in_context = len(start_positions) // n_starts_per_image
210
+
211
+ # Warn if not all images fit in context
212
+ if n_images_in_context < n_images:
213
+ log.warning(
214
+ f'Only {n_images_in_context}/{n_images} images fit in context window'
215
+ )
216
+
217
+ for idx in range(n_images):
218
+ if idx < n_images_in_context:
219
+ # Get the start and end indices for this image
220
+ start_idx_begin = idx * n_starts_per_image
221
+ end_idx_end = (idx + 1) * n_starts_per_image
222
+ if (
223
+ start_idx_begin < len(start_positions) and
224
+ end_idx_end <= len(end_positions)
225
+ ):
226
+ # First start position and last end position define the image span
227
+ first_start = start_positions[start_idx_begin]
228
+ last_end = end_positions[end_idx_end - 1]
229
+ # Count tokens from first start to last end (inclusive)
230
+ num_tokens = last_end - first_start + 1
231
+ tokens_per_image_list.append(num_tokens)
232
+ else:
233
+ tokens_per_image_list.append(0)
234
+ else:
235
+ # Image didn't fit in context
236
+ tokens_per_image_list.append(0)
237
+ else:
238
+ # Fallback to uniform division if we can't find boundaries
239
+ tokens_per_image = total_image_tokens // n_images if n_images > 0 else 0
240
+ tokens_per_image_list = [tokens_per_image] * n_images
241
+
242
+ for idx in range(n_images):
243
+ n_tokens = tokens_per_image_list[idx] if idx < len(tokens_per_image_list) else 0
244
+ pct = (n_tokens / max_sequence_length * 100)
245
+ report.append(f'├── Image {idx + 1} → {n_tokens} tokens ({pct:.1f}%)')
246
+
247
+ text_pct = (text_token_count / max_sequence_length * 100)
248
+ report.append(f'└── Text: {text_token_count} tokens ({text_pct:.1f}%)')
249
+
250
+ return '\n'.join(report)
251
+
252
+
253
+ def test_jvlm():
254
+ parser = argparse.ArgumentParser(
255
+ description='jina-vlm-v1 vision-language model inference.'
256
+ )
257
+ parser.add_argument(
258
+ '-m',
259
+ '--model',
260
+ default='.',
261
+ help=(
262
+ 'Model path. Set this to "jinaai/jina-vlm-v1" if you are running this '
263
+ 'script outside this repo.'
264
+ )
265
+ )
266
+ parser.add_argument(
267
+ '-i',
268
+ '--image',
269
+ action='append',
270
+ help='Image path or glob pattern (can specify multiple times, e.g., "*.jpg").'
271
+ )
272
+ parser.add_argument(
273
+ '-p',
274
+ '--prompt',
275
+ action='append',
276
+ help='Text prompt (can specify multiple times with --map).',
277
+ )
278
+ parser.add_argument(
279
+ '--max-crops',
280
+ type=int,
281
+ default=12,
282
+ help='Maximum crops (default: 12).',
283
+ )
284
+ parser.add_argument(
285
+ '--max-tokens',
286
+ type=int,
287
+ default=1024,
288
+ help='Maximum output tokens (default: 1024).',
289
+ )
290
+ parser.add_argument(
291
+ '--max-pixels',
292
+ type=int,
293
+ default=None,
294
+ help=(
295
+ 'Max pixels per image, bigger images are resized and the aspect ratio is '
296
+ 'preserved (default: None).'
297
+ ),
298
+ )
299
+ parser.add_argument(
300
+ '--no-stream',
301
+ action='store_true',
302
+ help='Disable streaming (default: stream token-by-token)',
303
+ )
304
+ parser.add_argument(
305
+ '--image-labels',
306
+ action='store_true',
307
+ help=(
308
+ 'Enable ordinal text labels after each image '
309
+ '(default: no image labels for multi-image)'
310
+ ),
311
+ )
312
+ parser.add_argument(
313
+ '--prompt-first',
314
+ action='store_true',
315
+ help=(
316
+ 'Place prompt before images instead of after (default: prompt after images)'
317
+ ),
318
+ )
319
+ parser.add_argument(
320
+ '--batched',
321
+ action='store_true',
322
+ help=(
323
+ 'Batch mode: apply single prompt to multiple images (or single image to '
324
+ 'multiple prompts) with KV cache reuse.'
325
+ ),
326
+ )
327
+ args = parser.parse_args()
328
+
329
+ print()
330
+ print('Welcome to the jinaai/jina-vlm-v1 playground ✨')
331
+ print('Use this script to test our model!')
332
+ print('- Jina AI')
333
+ print()
334
+ print('--- Loading the model ...')
335
+ print('Specifying device, dtype and attention implementation ...')
336
+ device, dtype, attn_implementation = _resolve_device_dtype_and_attn()
337
+ print(f'Using attention implementation: {attn_implementation}')
338
+ print(f'Using device: {device}')
339
+ print(f'Using dtype: {dtype}')
340
+ print('Model path: ', args.model)
341
+ processor = AutoProcessor.from_pretrained(
342
+ args.model, trust_remote_code=True, use_fast=False,
343
+ )
344
+ model = AutoModelForCausalLM.from_pretrained(
345
+ args.model,
346
+ trust_remote_code=True,
347
+ dtype=dtype,
348
+ low_cpu_mem_usage=True,
349
+ device_map=device.type,
350
+ attn_implementation=attn_implementation,
351
+ )
352
+ max_sequence_length = getattr(model.config, 'max_sequence_length', 40960)
353
+ n_params = sum(p.numel() for p in model.parameters())
354
+ print(f'Max sequence length: {max_sequence_length}')
355
+ print(f'Number of parameters: {n_params}')
356
+ print('Done ✅')
357
+ print()
358
+
359
+ print('--- Let\'s create some conversations ...')
360
+ conversations, images, prompts = _build_conversations(
361
+ args.image,
362
+ args.prompt,
363
+ batched=args.batched,
364
+ prompt_first=args.prompt_first,
365
+ image_labels=args.image_labels
366
+ )
367
+ n_conversations = len(conversations)
368
+ print(f'Built {n_conversations} conversations 🚀')
369
+ print()
370
+
371
+ print('--- Transforming conversations to numbers ...')
372
+ timer = Timer()
373
+ with timer:
374
+ texts = processor.apply_chat_template(conversations, add_generation_prompt=True)
375
+ print(texts)
376
+ print(images)
377
+ inputs = processor(
378
+ text=texts,
379
+ images=images,
380
+ padding='longest',
381
+ max_length=max_sequence_length,
382
+ max_crops=args.max_crops,
383
+ max_pixels=args.max_pixels,
384
+ do_resize=True if args.max_pixels is not None else False,
385
+ return_tensors='pt',
386
+ )
387
+ print(inputs['images'])
388
+ print(inputs['image_input_idx'])
389
+ texts = texts if isinstance(texts, list) else [texts]
390
+ device_inputs = {}
391
+ for k, v in inputs.items():
392
+ if k == 'labels':
393
+ continue
394
+ if isinstance(v, torch.Tensor):
395
+ if v.is_floating_point():
396
+ device_inputs[k] = v.to(device, dtype=dtype, non_blocking=True)
397
+ else:
398
+ device_inputs[k] = v.to(device, non_blocking=True)
399
+ else:
400
+ device_inputs[k] = v
401
+
402
+ processing_time = timer.readout
403
+ special_image_token_ids = {
404
+ 'image_patch_token_id': processor.image_patch_token_id,
405
+ 'image_start_token_id': processor.image_start_token_id,
406
+ 'image_end_token_id': processor.image_end_token_id,
407
+ 'image_column_token_id': processor.image_column_token_id,
408
+ }
409
+ token_usage_reports = []
410
+ for idx in range(n_conversations):
411
+ ith_inputs = {k: v[idx] for k, v in inputs.items()}
412
+ token_usage_report = _token_usage_report(
413
+ ith_inputs,
414
+ len(images[idx]),
415
+ max_sequence_length=max_sequence_length,
416
+ special_image_token_ids=special_image_token_ids,
417
+ )
418
+ token_usage_reports.append(token_usage_report)
419
+ print(f'Processed {n_conversations} conversations in {processing_time}s')
420
+ print('All done 🪄')
421
+ print()
422
+
423
+ print('--- Running inference ...')
424
+ generated_tokens = 0
425
+ input_prompts = inputs['input_ids']
426
+
427
+ if args.no_stream:
428
+ print('Non-streaming mode')
429
+ print('Inference will run in a batch')
430
+ print()
431
+
432
+ with (
433
+ timer,
434
+ torch.no_grad(),
435
+ torch.autocast(device.type, enabled=(device.type != 'mps'), dtype=dtype),
436
+ ):
437
+ output = model.generate(
438
+ **device_inputs,
439
+ generation_config=GenerationConfig(
440
+ max_new_tokens=args.max_tokens, do_sample=False,
441
+ ),
442
+ return_dict_in_generate=True,
443
+ use_model_defaults=True,
444
+ )
445
+ generation_time, generation_readout = timer.time, timer.readout
446
+
447
+ for idx in range(n_conversations):
448
+ out = output.sequences[idx][len(input_prompts[idx].tolist()):]
449
+ generated_tokens += len(out)
450
+ response = processor.tokenizer.decode(out, skip_special_tokens=True)
451
+ print(f'* Conversation {idx + 1}/{n_conversations}')
452
+ print(f'├── 🖼️Images: {images[idx]}')
453
+ print(f'├── 📜Prompt: {prompts[idx]}')
454
+ print(f'├── 💬Chat: {texts[idx]}')
455
+ print(f'└── 🧠Response:{response}')
456
+ print('Token usage report:')
457
+ print(token_usage_reports[idx])
458
+ print()
459
+ else:
460
+ print('Streaming mode')
461
+ print('Inference will run sequentially')
462
+ print()
463
+
464
+ streamer = TextStreamer(
465
+ processor.tokenizer, skip_prompt=True, skip_special_tokens=True
466
+ )
467
+ for idx in range(n_conversations):
468
+ print(f'* Conversation {idx + 1}/{n_conversations}')
469
+ print(f'├── 🖼️Images: {images[idx]}')
470
+ print(f'├── 📜Prompt: {prompts[idx]}')
471
+ print(f'├── 💬Chat: {texts[idx]}')
472
+ print(f'└── 🧠Response:', end='')
473
+ ith_inputs = {k: v[idx].unsqueeze(0) for k, v in device_inputs.items()}
474
+ with (
475
+ timer,
476
+ torch.no_grad(),
477
+ torch.autocast(device.type, enabled=(device.type != 'mps'), dtype=dtype)
478
+ ):
479
+ output = model.generate(
480
+ **ith_inputs,
481
+ streamer=streamer,
482
+ generation_config=GenerationConfig(
483
+ max_new_tokens=args.max_tokens, do_sample=False,
484
+ ),
485
+ return_dict_in_generate=True,
486
+ use_model_defaults=True,
487
+ )
488
+ out = output.sequences[0][len(input_prompts[idx].tolist()):]
489
+ generated_tokens += len(out)
490
+ print('Token usage report:')
491
+ print(token_usage_reports[idx])
492
+ print()
493
+
494
+ generation_time, generation_readout = timer.time, timer.readout
495
+
496
+ res_per_sec = n_conversations / generation_time if generation_time > 0 else 0
497
+ tok_per_sec = generated_tokens / generation_time if generation_time > 0 else 0
498
+ print(f'Generated {n_conversations} responses in {generation_readout}s')
499
+ print(f'{res_per_sec:.2f} res/s {tok_per_sec:.2f} tok/s')
500
+ print('Done ✅')
501
+
502
+
503
+ if __name__ == '__main__':
504
+ test_jvlm()