Use get_input_embeddings() Instead of Accessing .embed_tokens Directly

by Zhenzhao - opened May 26

base: refs/heads/main

←

from: refs/pr/8

Discussion Files changed

+26

-675

This PR is in draft mode

Files changed (7) hide show

1_Pooling/config.json +0 -10
README.md +24 -73
config.json +2 -7
config_sentence_transformers.json +0 -7
custom_st.py +0 -221
modeling_gme_qwen2vl.py +0 -337
modules.json +0 -20

1_Pooling/config.json DELETED Viewed

@@ -1,10 +0,0 @@
-{
-    "word_embedding_dimension": 3584,
-    "pooling_mode_cls_token": false,
-    "pooling_mode_mean_tokens": false,
-    "pooling_mode_max_tokens": false,
-    "pooling_mode_mean_sqrt_len_tokens": false,
-    "pooling_mode_weightedmean_tokens": false,
-    "pooling_mode_lasttoken": true,
-    "include_prompt": true
-}

README.md CHANGED Viewed

@@ -3691,110 +3691,61 @@ The `GME` models support three types of input: **text**, **image**, and **image-
 |[`gme-Qwen2-VL-2B`](https://huggingface.co/Alibaba-NLP/gme-Qwen2-VL-2B-Instruct) | 2.21B | 32768 | 1536 | 65.27 | 68.41 | 64.45 |
 |[`gme-Qwen2-VL-7B`](https://huggingface.co/Alibaba-NLP/gme-Qwen2-VL-7B-Instruct) | 8.29B | 32768 | 3584 | 67.48 | 71.36 | 67.44 |
 ## Usage
-**Transformers**
-The remote code has some issues with `transformers>=4.52.0`; please downgrade (e.g. `pip install transformers==4.51.3`) or use `sentence_transformers` instead.
 ```python
-from transformers import AutoModel
-from transformers.utils.versions import require_version
-require_version(
-    "transformers<4.52.0",
-    "The remote code has some issues with transformers>=4.52.0, please downgrade: pip install transformers==4.51.3"
-)
-t2i_prompt = 'Find an image that matches the given text.'
 texts = [
-    "The Tesla Cybertruck is a battery electric pickup truck built by Tesla, Inc. since 2023.",
-    "Alibaba office.",
 ]
 images = [
-    'https://upload.wikimedia.org/wikipedia/commons/e/e9/Tesla_Cybertruck_damaged_window.jpg',
-    'https://upload.wikimedia.org/wikipedia/commons/e/e0/TaobaoCity_Alibaba_Xixi_Park.jpg',
 ]
-gme = AutoModel.from_pretrained(
-    "Alibaba-NLP/gme-Qwen2-VL-7B-Instruct",
-    torch_dtype="float16", device_map='cuda', trust_remote_code=True
-)
 # Single-modal embedding
 e_text = gme.get_text_embeddings(texts=texts)
 e_image = gme.get_image_embeddings(images=images)
-print('Single-modal', (e_text @ e_image.T).tolist())
-## Single-modal [[0.279296875, 0.0002658367156982422], [0.06427001953125, 0.304443359375]]
 # How to set embedding instruction
-e_query = gme.get_text_embeddings(texts=texts, instruction=t2i_prompt)
 # If is_query=False, we always use the default instruction.
 e_corpus = gme.get_image_embeddings(images=images, is_query=False)
-print('Single-modal with instruction', (e_query @ e_corpus.T).tolist())
-## Single-modal with instruction [[0.32861328125, 0.026336669921875], [0.09466552734375, 0.3134765625]]
 # Fused-modal embedding
 e_fused = gme.get_fused_embeddings(texts=texts, images=images)
-print('Fused-modal', (e_fused @ e_fused.T).tolist())
-## Fused-modal [[1.0, 0.0308685302734375], [0.0308685302734375, 1.0]]
-```
-**sentence_transformers**
-The `encode` function accepts either a `str` or a `dict` with key(s) in `{'text', 'image', 'prompt'}`.
-**Do not pass `prompt` as an argument to `encode`**; instead, pass each input as a `dict` that contains a `prompt` key.
 ```python
-from sentence_transformers import SentenceTransformer
-t2i_prompt = 'Find an image that matches the given text.'
-texts = [
-    "The Tesla Cybertruck is a battery electric pickup truck built by Tesla, Inc. since 2023.",
-    "Alibaba office.",
-]
-images = [
-    'https://upload.wikimedia.org/wikipedia/commons/e/e9/Tesla_Cybertruck_damaged_window.jpg',
-    'https://upload.wikimedia.org/wikipedia/commons/e/e0/TaobaoCity_Alibaba_Xixi_Park.jpg',
-]
-gme_st = SentenceTransformer("Alibaba-NLP/gme-Qwen2-VL-7B-Instruct")
-# Single-modal embedding
-e_text = gme_st.encode(texts, convert_to_tensor=True)
-e_image = gme_st.encode([dict(image=i) for i in images], convert_to_tensor=True)
-print('Single-modal', (e_text @ e_image.T).tolist())
-## Single-modal [[0.27880859375, 0.0005745887756347656], [0.06500244140625, 0.306640625]]
-# How to set embedding instruction
-e_query = gme_st.encode([dict(text=t, prompt=t2i_prompt) for t in texts], convert_to_tensor=True)
-# If no prompt, we always use the default instruction.
-e_corpus = gme_st.encode([dict(image=i) for i in images], convert_to_tensor=True)
-print('Single-modal with instruction', (e_query @ e_corpus.T).tolist())
-## Single-modal with instruction [[0.328369140625, 0.0269927978515625], [0.09521484375, 0.316162109375]]
-# Fused-modal embedding
-e_fused = gme_st.encode([dict(text=t, image=i) for t, i in zip(texts, images)], convert_to_tensor=True)
-print('Fused-modal', (e_fused @ e_fused.T).tolist())
-## Fused-modal [[0.99951171875, 0.0311737060546875], [0.0311737060546875, 1.0009765625]]
 ```
 ## Evaluation
-We validated the performance on our universal multimodal retrieval benchmark (**UMRB**, see [Release UMRB](https://huggingface.co/Alibaba-NLP/gme-Qwen2-VL-7B-Instruct/discussions/2)) among others.
 |                    |      | Single-modal |           | Cross-modal |             |           | Fused-modal |            |            |             |  Avg.      |
 |--------------------|------|:------------:|:---------:|:-----------:|:-----------:|:---------:|:-----------:|:----------:|:----------:|:-----------:|:----------:|

 |[`gme-Qwen2-VL-2B`](https://huggingface.co/Alibaba-NLP/gme-Qwen2-VL-2B-Instruct) | 2.21B | 32768 | 1536 | 65.27 | 68.41 | 64.45 |
 |[`gme-Qwen2-VL-7B`](https://huggingface.co/Alibaba-NLP/gme-Qwen2-VL-7B-Instruct) | 8.29B | 32768 | 3584 | 67.48 | 71.36 | 67.44 |
 ## Usage
+**Use with custom code**
 ```python
+# You can find the script gme_inference.py in https://huggingface.co/Alibaba-NLP/gme-Qwen2-VL-2B-Instruct/blob/main/gme_inference.py
+from gme_inference import GmeQwen2VL
+model = GmeQwen2VL('Alibaba-NLP/gme-Qwen2-VL-7B-Instruct')
 texts = [
+    "What kind of car is this?",
+    "The Tesla Cybertruck is a battery electric pickup truck built by Tesla, Inc. since 2023."
 ]
 images = [
+    'https://en.wikipedia.org/wiki/File:Tesla_Cybertruck_damaged_window.jpg',
+    'https://en.wikipedia.org/wiki/File:2024_Tesla_Cybertruck_Foundation_Series,_front_left_(Greenwich).jpg',
 ]
 # Single-modal embedding
 e_text = gme.get_text_embeddings(texts=texts)
 e_image = gme.get_image_embeddings(images=images)
+print((e_text * e_image).sum(-1))
+## tensor([0.1702, 0.5278], dtype=torch.float16)
 # How to set embedding instruction
+e_query = gme.get_text_embeddings(texts=texts, instruction='Find an image that matches the given text.')
 # If is_query=False, we always use the default instruction.
 e_corpus = gme.get_image_embeddings(images=images, is_query=False)
+print((e_query * e_corpus).sum(-1))
+## tensor([0.2000, 0.5752], dtype=torch.float16)
 # Fused-modal embedding
 e_fused = gme.get_fused_embeddings(texts=texts, images=images)
+print((e_fused[0] * e_fused[1]).sum())
+## tensor(0.6826, dtype=torch.float16)
+```
+<!-- <details>
+<summary>With transformers</summary>
 ```python
+# Requires transformers>=4.46.2
+TODO
+# [[0.3016996383666992, 0.7503870129585266, 0.3203084468841553]]
 ```
+</details>
+ -->
 ## Evaluation
+We validated the performance on our universal multimodal retrieval benchmark (**UMRB**) among others.
 |                    |      | Single-modal |           | Cross-modal |             |           | Fused-modal |            |            |             |  Avg.      |
 |--------------------|------|:------------:|:---------:|:-----------:|:-----------:|:---------:|:-----------:|:----------:|:----------:|:-----------:|:----------:|

config.json CHANGED Viewed

@@ -1,13 +1,8 @@
 {
-  "_name_or_path": "Alibaba-NLP/gme-Qwen2-VL-7B-Instruct",
   "architectures": [
-    "Qwen2VLForConditionalGeneration",
-    "GmeQwen2VL"
   ],
-  "auto_map": {
-    "AutoConfig": "modeling_gme_qwen2vl.GmeQwen2VLConfig",
-    "AutoModel": "modeling_gme_qwen2vl.GmeQwen2VL"
-  },
   "attention_dropout": 0.0,
   "bos_token_id": 151643,
   "eos_token_id": 151645,

 {
+  "_name_or_path": "gme-Qwen2-VL-7B-Instruct",
   "architectures": [
+    "Qwen2VLForConditionalGeneration"
   ],
   "attention_dropout": 0.0,
   "bos_token_id": 151643,
   "eos_token_id": 151645,

config_sentence_transformers.json DELETED Viewed

@@ -1,7 +0,0 @@
-{
-  "prompts": {
-     "query": "<|im_start|>system\nYou are a helpful assistant.<|im_end|>\n"
-  },
-  "default_prompt_name": null,
-  "similarity_fn_name": null
-}

custom_st.py DELETED Viewed

@@ -1,221 +0,0 @@
-from io import BytesIO
-from typing import Any, Dict, Optional, List
-import torch
-from PIL import Image
-from sentence_transformers.models import Transformer as BaseTransformer
-from transformers import AutoModelForVision2Seq, AutoProcessor
-class MultiModalTransformer(BaseTransformer):
-    def __init__(
-        self,
-        model_name_or_path: str,
-        cache_dir: Optional[str] = None,
-        tokenizer_args: Optional[Dict[str, Any]] = None,
-        min_image_tokens: int = 256,
-        max_image_tokens: int = 1280,
-        max_length: int = 1800,
-        **kwargs,
-    ):
-        super().__init__(model_name_or_path, **kwargs)
-        if tokenizer_args is None:
-            tokenizer_args = {}
-        tokenizer_args.pop("trust_remote_code", None)
-        # Initialize processor
-        min_pixels = min_image_tokens * 28 * 28
-        max_pixels = max_image_tokens * 28 * 28
-        self.processor = AutoProcessor.from_pretrained(
-            model_name_or_path, min_pixels=min_pixels, max_pixels=max_pixels, **kwargs
-        )
-        self.processor.tokenizer.padding_side = 'right'
-        self.sep = ' '
-        self.max_length = max_length
-        self.normalize = True
-    def _load_model(
-            self,
-            model_name_or_path: str,
-            config,
-            cache_dir: str,
-            backend: str,
-            is_peft_model: bool,
-            **model_args,
-    ) -> None:
-        model_args.pop("trust_remote_code", None)
-        self.auto_model = AutoModelForVision2Seq.from_pretrained(
-            model_name_or_path, torch_dtype=torch.float16, **model_args
-        )
-    def forward(
-        self, features: Dict[str, torch.Tensor], **kwargs
-    ) -> Dict[str, torch.Tensor]:
-        if features.get("inputs_embeds", None) is None:
-            features["inputs_embeds"] = self.auto_model.base_model.get_input_embeddings()(features["input_ids"])
-            if features.get("pixel_values", None) is not None:
-                features["pixel_values"] = features["pixel_values"].type(self.auto_model.visual.get_dtype())
-                image_embeds = self.auto_model.visual(
-                    features["pixel_values"], grid_thw=features["image_grid_thw"]
-                )
-                image_mask = features["input_ids"] == self.auto_model.config.image_token_id
-                features["inputs_embeds"][image_mask] = image_embeds
-                # features.pop("pixel_values")
-                # features.pop("image_grid_thw")
-        # features.pop("input_ids")
-        inputs = {k: v for k, v in features.items() if k in 'position_ids,attention_mask,inputs_embeds'}
-        outputs = self.auto_model.model(
-            **inputs,
-            return_dict=True,
-            output_hidden_states=True,
-            # **kwargs
-        )
-        # pooling_mask = features["attention_mask"] if features.get("pooling_mask", None) is None else features["pooling_mask"]
-        # left_padding = (pooling_mask[:, -1].sum() == pooling_mask.shape[0])  # TODO
-        # if left_padding:
-        #     embeddings = outputs.last_hidden_state
-        # else:
-        #     sequence_lengths = pooling_mask.sum(dim=1) - 1
-        #     embeddings = outputs.last_hidden_state[torch.arange(
-        #         outputs.last_hidden_state.shape[0], device=outputs.last_hidden_state.device
-        #     ), sequence_lengths]
-        features.update({"token_embeddings": outputs.last_hidden_state})
-        return features
-    def tokenize(self, texts: List[List[Dict[str, Any]]] | List[str]) -> Dict[str, torch.Tensor]:
-        default_instruction = 'You are a helpful assistant.'
-        all_texts, all_images = list(), list()
-        for item in texts:
-            if isinstance(item, str):
-                txt, img, inst = item, None, default_instruction
-            elif isinstance(item, dict):
-                txt = item.get('text', None)
-                img = item.get('image', None)
-                inst = item.get('prompt', default_instruction)
-            else:
-                raise RuntimeError(f'Input format not supported! {item=}')
-            input_str = ''
-            if img is None:
-                all_images = None  # All examples in the same batch are consistent
-                # or will have ValueError: Could not make a flat list of images from xxxx
-            else:
-                input_str += '<|vision_start|><|image_pad|><|vision_end|>'
-                img = fetch_image(img)
-                all_images.append(img)
-            if txt is not None:
-                input_str += txt
-            msg = f'<|im_start|>system\n{inst}<|im_end|>\n<|im_start|>user\n{input_str}<|im_end|>\n<|im_start|>assistant\n<|endoftext|>'
-            all_texts.append(msg)
-        inputs = self.processor(
-            text=all_texts,
-            images=all_images,
-            padding="longest",
-            truncation=True,
-            max_length=self.max_seq_length,
-            return_tensors='pt'
-        )
-        return inputs
-### Copied from qwen_vl_utils.vision_process.py
-import base64
-from io import BytesIO
-import requests
-IMAGE_FACTOR = 28
-MIN_PIXELS = 4 * 28 * 28
-MAX_PIXELS = 16384 * 28 * 28
-MAX_RATIO = 200
-def round_by_factor(number: int, factor: int) -> int:
-    """Returns the closest integer to 'number' that is divisible by 'factor'."""
-    return round(number / factor) * factor
-def ceil_by_factor(number: int, factor: int) -> int:
-    """Returns the smallest integer greater than or equal to 'number' that is divisible by 'factor'."""
-    return math.ceil(number / factor) * factor
-def floor_by_factor(number: int, factor: int) -> int:
-    """Returns the largest integer less than or equal to 'number' that is divisible by 'factor'."""
-    return math.floor(number / factor) * factor
-def smart_resize(
-    height: int, width: int, factor: int = IMAGE_FACTOR, min_pixels: int = MIN_PIXELS, max_pixels: int = MAX_PIXELS
-) -> tuple[int, int]:
-    """
-    Rescales the image so that the following conditions are met:
-    1. Both dimensions (height and width) are divisible by 'factor'.
-    2. The total number of pixels is within the range ['min_pixels', 'max_pixels'].
-    3. The aspect ratio of the image is maintained as closely as possible.
-    """
-    h_bar = max(factor, round_by_factor(height, factor))
-    w_bar = max(factor, round_by_factor(width, factor))
-    if h_bar * w_bar > max_pixels:
-        beta = math.sqrt((height * width) / max_pixels)
-        h_bar = floor_by_factor(height / beta, factor)
-        w_bar = floor_by_factor(width / beta, factor)
-    elif h_bar * w_bar < min_pixels:
-        beta = math.sqrt(min_pixels / (height * width))
-        h_bar = ceil_by_factor(height * beta, factor)
-        w_bar = ceil_by_factor(width * beta, factor)
-    if max(h_bar, w_bar) / min(h_bar, w_bar) > MAX_RATIO:
-        logging.warning(
-            f"Absolute aspect ratio must be smaller than {MAX_RATIO}, got {max(h_bar, w_bar) / min(h_bar, w_bar)}"
-        )
-        if h_bar > w_bar:
-            h_bar = w_bar * MAX_RATIO
-        else:
-            w_bar = h_bar * MAX_RATIO
-    return h_bar, w_bar
-def fetch_image(image: str | Image.Image, size_factor: int = IMAGE_FACTOR) -> Image.Image:
-    image_obj = None
-    if isinstance(image, Image.Image):
-        image_obj = image
-    elif image.startswith("http://") or image.startswith("https://"):
-        image_obj = Image.open(requests.get(image, stream=True).raw)
-    elif image.startswith("file://"):
-        image_obj = Image.open(image[7:])
-    elif image.startswith("data:image"):
-        if "base64," in image:
-            _, base64_data = image.split("base64,", 1)
-            data = base64.b64decode(base64_data)
-            image_obj = Image.open(BytesIO(data))
-    else:
-        image_obj = Image.open(image)
-    if image_obj is None:
-        raise ValueError(f"Unrecognized image input, support local path, http url, base64 and PIL.Image, got {image}")
-    image = image_obj.convert("RGB")
-    ## resize
-    # if "resized_height" in ele and "resized_width" in ele:
-    #     resized_height, resized_width = smart_resize(
-    #         ele["resized_height"],
-    #         ele["resized_width"],
-    #         factor=size_factor,
-    #     )
-    # else:
-    width, height = image.size
-    # min_pixels = ele.get("min_pixels", MIN_PIXELS)
-    # max_pixels = ele.get("max_pixels", MAX_PIXELS)
-    resized_height, resized_width = smart_resize(
-        height,
-        width,
-        factor=size_factor,
-        min_pixels=MIN_PIXELS,
-        max_pixels=MAX_PIXELS,
-    )
-    image = image.resize((resized_width, resized_height))
-    return image
-###

modeling_gme_qwen2vl.py DELETED Viewed

@@ -1,337 +0,0 @@
-from __future__ import annotations
-import base64
-import logging
-import math
-import os
-from io import BytesIO
-from typing import Any, Dict, List, Optional, Union
-import requests
-import torch
-from PIL import Image
-from torch.utils.data import DataLoader
-from tqdm.autonotebook import tqdm
-from transformers import AutoProcessor, PreTrainedModel
-from transformers.models.qwen2_vl.modeling_qwen2_vl import (
-    Qwen2VisionTransformerPretrainedModel,
-    Qwen2VLConfig,
-    Qwen2VLForConditionalGeneration,
-    Qwen2VLModel,
-)
-from transformers.utils.versions import require_version
-require_version(
-    "transformers<4.52.0",
-    "This code has some issues with transformers>=4.52.0, please downgrade: pip install transformers==4.51.3"
-)
-class GmeQwen2VLConfig(Qwen2VLConfig):
-    # model_type = ''
-    def __init__(
-        self,
-        min_image_tokens: int = 256,
-        max_image_tokens: int = 1280,
-        max_length: int = 1800,
-        **kwargs: Any,
-    ) -> None:
-        super().__init__(**kwargs)
-        self.min_image_tokens = min_image_tokens
-        self.max_image_tokens = max_image_tokens
-        self.max_length = max_length
-class GmeQwen2VL(PreTrainedModel):
-    config_class = GmeQwen2VLConfig
-    base_model_prefix = "model"
-    supports_gradient_checkpointing = True
-    _no_split_modules = ["Qwen2VLDecoderLayer", "Qwen2VLVisionBlock"]
-    # _skip_keys_device_placement = "past_key_values"
-    _supports_flash_attn_2 = True
-    _supports_sdpa = True
-    # _supports_cache_class = True
-    _supports_static_cache = False  # TODO (joao): fix. torch.compile failing probably due to `cache_positions`
-    # _tied_weights_keys = ["lm_head.weight"]
-    def __init__(self, config: GmeQwen2VLConfig, **kwargs: Any) -> None:
-        super().__init__(config)
-        self.visual = Qwen2VisionTransformerPretrainedModel._from_config(config.vision_config)
-        self.model = Qwen2VLModel(config)
-        self.vocab_size = config.vocab_size
-        # self.lm_head = torch.nn.Linear(config.hidden_size, config.vocab_size, bias=False)
-        self.rope_deltas = None  # cache rope_deltas here
-        min_pixels: int = config.min_image_tokens * 28 * 28
-        max_pixels: int = config.max_image_tokens * 28 * 28
-        self.processor = AutoProcessor.from_pretrained(
-            config._name_or_path, min_pixels=min_pixels, max_pixels=max_pixels, **kwargs
-        )
-        self.max_length: int = config.max_length
-        self.normalize: bool = True
-        self.processor.tokenizer.padding_side = "right"
-        self.default_instruction: str = "You are a helpful assistant."
-        self.sep: str = " "
-        # Initialize weights and apply final processing
-        self.post_init()
-    def forward(
-        self,
-        input_ids: Optional[torch.LongTensor] = None,
-        attention_mask: Optional[torch.Tensor] = None,
-        position_ids: Optional[torch.LongTensor] = None,
-        past_key_values: Optional[List[torch.FloatTensor]] = None,
-        inputs_embeds: Optional[torch.FloatTensor] = None,
-        pixel_values: Optional[torch.Tensor] = None,
-        # pixel_values_videos: Optional[torch.FloatTensor] = None,
-        image_grid_thw: Optional[torch.LongTensor] = None,
-        # video_grid_thw: Optional[torch.LongTensor] = None,
-        pooling_mask: Optional[torch.LongTensor] = None,
-        **kwargs
-    ) -> torch.Tensor:
-        if inputs_embeds is None:
-            inputs_embeds = self.model.get_input_embeddings()(input_ids)
-            if pixel_values is not None:
-                pixel_values = pixel_values.type(self.visual.get_dtype())
-                image_embeds = self.visual(pixel_values, grid_thw=image_grid_thw).to(inputs_embeds.device)
-                image_mask = input_ids == self.config.image_token_id
-                inputs_embeds[image_mask] = image_embeds
-            # if pixel_values_videos is not None:
-            #     pixel_values_videos = pixel_values_videos.type(self.visual.get_dtype())
-            #     video_embeds = self.visual(pixel_values_videos, grid_thw=video_grid_thw).to(inputs_embeds.device)
-            #     video_mask = input_ids == self.config.video_token_id
-            #     inputs_embeds[video_mask] = video_embeds
-            if attention_mask is not None:
-                attention_mask = attention_mask.to(inputs_embeds.device)
-        outputs = self.model(
-            input_ids=None,
-            position_ids=position_ids,
-            attention_mask=attention_mask,
-            past_key_values=past_key_values,
-            inputs_embeds=inputs_embeds,
-        )
-        pooling_mask = attention_mask if pooling_mask is None else pooling_mask
-        left_padding = (pooling_mask[:, -1].sum() == pooling_mask.shape[0])  # TODO
-        if left_padding:
-            embeddings = outputs.last_hidden_state[:, -1]
-        else:
-            sequence_lengths = pooling_mask.sum(dim=1) - 1
-            batch_size = outputs.last_hidden_state.shape[0]
-            embeddings = outputs.last_hidden_state[torch.arange(
-                batch_size, device=outputs.last_hidden_state.device
-            ), sequence_lengths]
-        if self.normalize:
-            embeddings = torch.nn.functional.normalize(embeddings, p=2, dim=1)
-        return embeddings.contiguous()
-    def embed(self, texts: list[str], images: list[Image.Image], is_query=True, instruction=None, **kwargs):
-        self.eval()
-        # Inputs must be batched
-        input_texts, input_images = list(), list()
-        for t, i in zip(texts, images):
-            if not is_query or instruction is None:
-                instruction = self.default_instruction
-            input_str = ''
-            if i is None:
-                input_images = None  # All examples in the same batch are consistent
-            else:
-                input_str += '<|vision_start|><|image_pad|><|vision_end|>'
-                i = fetch_image(i)
-                input_images.append(i)
-            if t is not None:
-                input_str += t
-            msg = f'<|im_start|>system\n{instruction}<|im_end|>\n<|im_start|>user\n{input_str}<|im_end|>\n<|im_start|>assistant\n<|endoftext|>'
-            input_texts.append(msg)
-        inputs = self.processor(
-            text=input_texts,
-            images=input_images,
-            padding=True,
-            truncation=True,
-            max_length=self.max_length,
-            return_tensors='pt'
-        )
-        inputs = {k: v.to(self.device) for k, v in inputs.items()}  # TODO
-        with torch.inference_mode():
-            embeddings = self.forward(**inputs)
-        return embeddings
-    def encode(self, sentences: list[str], *, prompt_name=None, **kwargs):
-        return self.get_fused_embeddings(texts=sentences, prompt_name=prompt_name, **kwargs)
-    def encode_queries(self, queries: List[str], **kwargs):
-        embeddings = self.encode(queries, **kwargs)
-        return embeddings
-    def encode_corpus(self, corpus: List[Dict[str, str]], **kwargs):
-        if type(corpus) is dict:
-            sentences = [
-                (corpus["title"][i] + self.sep + corpus["text"][i]).strip()
-                if "title" in corpus
-                else corpus["text"][i].strip()
-                for i in range(len(corpus["text"]))
-            ]
-        else:
-            sentences = [
-                (doc["title"] + self.sep + doc["text"]).strip() if "title" in doc else doc["text"].strip()
-                for doc in corpus
-            ]
-        embeddings = self.encode(sentences, is_query=False, **kwargs)
-        return embeddings
-    def get_image_embeddings(self, images: list[Image.Image] | DataLoader, **kwargs):
-        return self.get_fused_embeddings(images=images, **kwargs)
-    def get_text_embeddings(self, texts: list[str], **kwargs):
-        return self.get_fused_embeddings(texts=texts, **kwargs)
-    def get_fused_embeddings(self, texts: list[str] = None, images: list[Image.Image] | DataLoader = None, **kwargs):
-        if isinstance(images, DataLoader):
-            image_loader = images
-            batch_size = image_loader.batch_size
-            image_loader.dataset.transform = None
-        else:
-            batch_size = kwargs.pop('batch_size', 32)
-            if images is None:
-                image_loader = None
-            else:
-                image_loader = DataLoader(
-                    images,
-                    batch_size=batch_size,
-                    shuffle=False,
-                    collate_fn=custom_collate_fn,
-                    num_workers=min(math.floor(os.cpu_count() / 2), 8),
-                )
-        if texts is None:
-            assert image_loader is not None
-            n_batch = len(image_loader)
-        else:
-            n_batch = len(texts) // batch_size + int(len(texts) % batch_size > 0)
-            image_loader = image_loader or [None] * n_batch
-        all_embeddings = list()
-        none_batch = [None] * batch_size
-        show_progress_bar = kwargs.pop('show_progress_bar', False)
-        pbar = tqdm(total=n_batch, disable=not show_progress_bar, mininterval=1, miniters=10, desc='encode')
-        for n, img_batch in zip(range(0, n_batch * batch_size, batch_size), image_loader):
-            text_batch = none_batch if texts is None else texts[n: n+batch_size]
-            img_batch = none_batch if img_batch is None else img_batch
-            embeddings = self.embed(texts=text_batch, images=img_batch, **kwargs)
-            pbar.update(1)
-            all_embeddings.append(embeddings.cpu())
-        pbar.close()
-        all_embeddings = torch.cat(all_embeddings, dim=0)
-        return all_embeddings
-def custom_collate_fn(batch):
-    return batch
-### Copied from qwen_vl_utils.vision_process.py
-import base64
-from io import BytesIO
-import requests
-IMAGE_FACTOR = 28
-MIN_PIXELS = 4 * 28 * 28
-MAX_PIXELS = 16384 * 28 * 28
-MAX_RATIO = 200
-def round_by_factor(number: int, factor: int) -> int:
-    """Returns the closest integer to 'number' that is divisible by 'factor'."""
-    return round(number / factor) * factor
-def ceil_by_factor(number: int, factor: int) -> int:
-    """Returns the smallest integer greater than or equal to 'number' that is divisible by 'factor'."""
-    return math.ceil(number / factor) * factor
-def floor_by_factor(number: int, factor: int) -> int:
-    """Returns the largest integer less than or equal to 'number' that is divisible by 'factor'."""
-    return math.floor(number / factor) * factor
-def smart_resize(
-    height: int, width: int, factor: int = IMAGE_FACTOR, min_pixels: int = MIN_PIXELS, max_pixels: int = MAX_PIXELS
-) -> tuple[int, int]:
-    """
-    Rescales the image so that the following conditions are met:
-    1. Both dimensions (height and width) are divisible by 'factor'.
-    2. The total number of pixels is within the range ['min_pixels', 'max_pixels'].
-    3. The aspect ratio of the image is maintained as closely as possible.
-    """
-    h_bar = max(factor, round_by_factor(height, factor))
-    w_bar = max(factor, round_by_factor(width, factor))
-    if h_bar * w_bar > max_pixels:
-        beta = math.sqrt((height * width) / max_pixels)
-        h_bar = floor_by_factor(height / beta, factor)
-        w_bar = floor_by_factor(width / beta, factor)
-    elif h_bar * w_bar < min_pixels:
-        beta = math.sqrt(min_pixels / (height * width))
-        h_bar = ceil_by_factor(height * beta, factor)
-        w_bar = ceil_by_factor(width * beta, factor)
-    if max(h_bar, w_bar) / min(h_bar, w_bar) > MAX_RATIO:
-        logging.warning(
-            f"Absolute aspect ratio must be smaller than {MAX_RATIO}, got {max(h_bar, w_bar) / min(h_bar, w_bar)}"
-        )
-        if h_bar > w_bar:
-            h_bar = w_bar * MAX_RATIO
-        else:
-            w_bar = h_bar * MAX_RATIO
-    return h_bar, w_bar
-def fetch_image(image: str | Image.Image, size_factor: int = IMAGE_FACTOR) -> Image.Image:
-    image_obj = None
-    if isinstance(image, Image.Image):
-        image_obj = image
-    elif image.startswith("http://") or image.startswith("https://"):
-        image_obj = Image.open(requests.get(image, stream=True).raw)
-    elif image.startswith("file://"):
-        image_obj = Image.open(image[7:])
-    elif image.startswith("data:image"):
-        if "base64," in image:
-            _, base64_data = image.split("base64,", 1)
-            data = base64.b64decode(base64_data)
-            image_obj = Image.open(BytesIO(data))
-    else:
-        image_obj = Image.open(image)
-    if image_obj is None:
-        raise ValueError(f"Unrecognized image input, support local path, http url, base64 and PIL.Image, got {image}")
-    image = image_obj.convert("RGB")
-    ## resize
-    # if "resized_height" in ele and "resized_width" in ele:
-    #     resized_height, resized_width = smart_resize(
-    #         ele["resized_height"],
-    #         ele["resized_width"],
-    #         factor=size_factor,
-    #     )
-    # else:
-    width, height = image.size
-    # min_pixels = ele.get("min_pixels", MIN_PIXELS)
-    # max_pixels = ele.get("max_pixels", MAX_PIXELS)
-    resized_height, resized_width = smart_resize(
-        height,
-        width,
-        factor=size_factor,
-        min_pixels=MIN_PIXELS,
-        max_pixels=MAX_PIXELS,
-    )
-    image = image.resize((resized_width, resized_height))
-    return image
-###

modules.json DELETED Viewed

@@ -1,20 +0,0 @@
-[
-  {
-    "idx": 0,
-    "name": "0",
-    "path": "",
-    "type": "custom_st.MultiModalTransformer"
-  },
-  {
-    "idx": 1,
-    "name": "1",
-    "path": "1_Pooling",
-    "type": "sentence_transformers.models.Pooling"
-  },
-  {
-    "idx": 2,
-    "name": "2",
-    "path": "2_Normalize",
-    "type": "sentence_transformers.models.Normalize"
-  }
-]