Commit 5ea341f
Parent(s): f653a92

bunch of updates

Files changed:
- added_tokens.json +4 -0
- modeling_img2html.py +5 -5
- preprocessor_config.json +2 -2
- processing_img2html.py +28 -21
- special_tokens_map.json +30 -0
- tokenizer.json +0 -0
- tokenizer.model +3 -0
- tokenizer_config.json +56 -0
added_tokens.json ADDED

@@ -0,0 +1,4 @@
+{
+  "<fake_token_around_image>": 32000,
+  "<image>": 32001
+}
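Note: the two added ids sit directly on top of the 32,000-token base Llama vocabulary. A minimal sanity check, assuming the surrounding repo is HuggingFaceM4/img2html and the tokenizer files added in this commit ship with it:

    from transformers import AutoTokenizer

    tokenizer = AutoTokenizer.from_pretrained("HuggingFaceM4/img2html")
    # Base Llama vocabulary covers ids 0..31999; the added tokens extend it.
    assert tokenizer.convert_tokens_to_ids("<fake_token_around_image>") == 32000
    assert tokenizer.convert_tokens_to_ids("<image>") == 32001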
modeling_img2html.py CHANGED

@@ -109,7 +109,7 @@ class Img2HTMLBaseModelOutputWithPast(ModelOutput):
 @dataclass
 class Img2HTMLCausalLMOutputWithPast(ModelOutput):
     """
-    Base class for
+    Base class for Img2HTML causal language model (or autoregressive) outputs.
 
     Args:
         loss (`torch.FloatTensor` of shape `(1,)`, *optional*, returned when `labels` is provided):

@@ -171,10 +171,10 @@ def expand_inputs_for_generation(
     if attention_mask is not None:
         model_kwargs["attention_mask"] = attention_mask.index_select(0, expanded_return_idx)
 
-    if model_kwargs["image_attention_mask"] is not None:
-        model_kwargs["image_attention_mask"] = model_kwargs["image_attention_mask"].index_select(
-            0, expanded_return_idx
-        )
+    # if model_kwargs["image_attention_mask"] is not None:
+    #     model_kwargs["image_attention_mask"] = model_kwargs["image_attention_mask"].index_select(
+    #         0, expanded_return_idx
+    #     )
 
     if model_kwargs["pixel_values"] is not None:
         model_kwargs["pixel_values"] = model_kwargs["pixel_values"].index_select(0, expanded_return_idx)
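Note: the second hunk only comments out the image_attention_mask expansion; pixel_values are still duplicated per expanded batch row via index_select. A minimal sketch of what that expansion does (hypothetical shapes, not taken from the commit):

    import torch

    # Two prompts expanded 3x each (e.g. num_return_sequences=3): rows 0,0,0,1,1,1.
    pixel_values = torch.randn(2, 1, 3, 960, 960)
    expanded_return_idx = torch.tensor([0, 0, 0, 1, 1, 1])

    # Same call the model code makes: select rows along dim 0 to repeat each entry.
    expanded = pixel_values.index_select(0, expanded_return_idx)
    print(expanded.shape)  # torch.Size([6, 1, 3, 960, 960])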
preprocessor_config.json CHANGED

@@ -1,7 +1,7 @@
 {
   "auto_map": {
-    "AutoProcessor": "processing_img2html.Img2HTMLProcessor",
-    "AutoImageProcessor": "image_processing_img2html.Img2HTMLImageProcessor"
+    "AutoProcessor": "HuggingFaceM4/img2html--processing_img2html.Img2HTMLProcessor",
+    "AutoImageProcessor": "HuggingFaceM4/img2html--image_processing_img2html.Img2HTMLImageProcessor"
   },
   "image_num_channels": 3,
   "image_mean": [
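Note: the "repo--module.Class" form pins each auto_map entry to custom code hosted in the HuggingFaceM4/img2html repo rather than a local module. A minimal loading sketch (remote code must be opted into explicitly):

    from transformers import AutoProcessor

    processor = AutoProcessor.from_pretrained(
        "HuggingFaceM4/img2html",
        trust_remote_code=True,  # required, since auto_map points at repo-hosted code
    )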
processing_img2html.py CHANGED

@@ -24,6 +24,7 @@ from transformers.processing_utils import ProcessorMixin
 from transformers.tokenization_utils_base import BatchEncoding, PaddingStrategy, TextInput, TruncationStrategy
 from transformers.utils import TensorType, is_torch_available
 
+from .image_processing_img2html import Img2HTMLImageProcessor
 
 if is_torch_available():
     import torch

@@ -40,7 +41,6 @@ def is_url(string):
     result = urlparse(string)
     return all([result.scheme, result.netloc])
 
-
 class Img2HTMLProcessor(ProcessorMixin):
     r"""
     Constructs a Img2HTML processor which wraps a LLama tokenizer and Img2HTML image processor into a single processor.

@@ -60,7 +60,7 @@ class Img2HTMLProcessor(ProcessorMixin):
     image_processor_class = "Img2HTMLImageProcessor"
     tokenizer_class = "LlamaTokenizerFast"
 
-    def __init__(self, image_processor, tokenizer=None, image_size=
+    def __init__(self, image_processor, tokenizer=None, image_size=960, **kwargs):
         if image_processor is None:
             raise ValueError("You need to specify an `image_processor`.")
         if tokenizer is None:

@@ -76,11 +76,32 @@ class Img2HTMLProcessor(ProcessorMixin):
             self.image_processor.image_size,
         )
 
-
-
-
-
-
+    # @classmethod
+    # def _get_arguments_from_pretrained(cls, pretrained_model_name_or_path, **kwargs):
+    #     # Hack overriding things
+    #     from pathlib import Path
+    #     from transformers.utils import direct_transformers_import
+    #     # Dynamically import the Transformers module to grab the attribute classes of the processor form their names.
+    #     transformers_module = direct_transformers_import(Path(__file__).parent)
+
+    #     args = []
+    #     for attribute_name in cls.attributes:
+    #         class_name = getattr(cls, f"{attribute_name}_class")
+    #         if isinstance(class_name, tuple):
+    #             classes = tuple(getattr(transformers_module, n) if n is not None else None for n in class_name)
+    #             use_fast = kwargs.get("use_fast", True)
+    #             if use_fast and classes[1] is not None:
+    #                 attribute_class = classes[1]
+    #             else:
+    #                 attribute_class = classes[0]
+    #         else:
+    #             if class_name == "Img2HTMLImageProcessor":
+    #                 attribute_class = Img2HTMLImageProcessor
+    #             else:
+    #                 attribute_class = getattr(transformers_module, class_name)
+
+    #         args.append(attribute_class.from_pretrained(pretrained_model_name_or_path, **kwargs))
+    #     return args
 
     def __call__(
         self,

@@ -90,7 +111,6 @@ class Img2HTMLProcessor(ProcessorMixin):
         max_length: Optional[int] = None,
         transform: Callable = None,
         add_eos_token=False,
-        add_end_of_utterance_token=None,
         debug=False,
         return_tensors: Optional[Union[str, TensorType]] = TensorType.PYTORCH,
     ) -> BatchEncoding:

@@ -120,10 +140,6 @@ class Img2HTMLProcessor(ProcessorMixin):
                 set of transforms will be applied to the images
             add_eos_token (`bool`, *optional*, defaults to `False`):
                 Adds `eos_token` at the end of the final prompt if True`
-            add_end_of_utterance_token (`bool`, *optional*)
-                Whether to automatically add `<end_of_utterance>` after each prompt's text input (unless followed by an
-                image). If `None` the tokenizer will be checked instead and if this token is found in
-                `additional_special_tokens` then the value will be `True`.
             debug (`bool`, *optional*, defaults to `False`):
                 `True` value will help debug prompt generation by dumping useful information
             return_tensors (`str` or `TensorType`, *optional*, defaults to `TensorType.PYTORCH`):

@@ -198,18 +214,12 @@ class Img2HTMLProcessor(ProcessorMixin):
         In order to help debug prompt generation enable `debug=True` which will show you what's happening.
 
         """
-
-        # if the value isn't overriden by the user, check if the tokenizer was trained with this token and then use it
-        if add_end_of_utterance_token is None:
-            add_end_of_utterance_token = self.tokenizer_was_trained_with_end_of_utterance_token
-
         # turn non-batched prompts into batched
         if not any(isinstance(i, list) for i in prompts):
             prompts = [prompts]
 
         fake_token = "<fake_token_around_image>"
         image_token = "<image>"
-        end_of_utterance_token = "<end_of_utterance>"
 
         def image_tokens(last_was_image):
             if last_was_image:

@@ -239,9 +249,6 @@ class Img2HTMLProcessor(ProcessorMixin):
                 image_objects.append(image)
                 last_was_image = True
             else:
-                # we add end_of_utterance_token between each subsequent text prompts (but not at the last one!)
-                if add_end_of_utterance_token and last_was_text:
-                    full_text += end_of_utterance_token
                 full_text += item
                 last_was_image = False
         else:
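Note: the `image_tokens(last_was_image)` helper kept by this commit weaves `fake_token` and `image_token` around each image in the prompt. A sketch of that weaving; the exact returned strings are an assumption in the style of IDEFICS-like processors, not taken from the diff:

    def image_tokens(last_was_image: bool) -> str:
        # Assumed convention: consecutive images share one surrounding fake token.
        if last_was_image:
            return "<image><fake_token_around_image>"
        return "<fake_token_around_image><image><fake_token_around_image>"

    # Text followed by a single image:
    full_text = "Describe this page:" + image_tokens(last_was_image=False)
    print(full_text)
    # Describe this page:<fake_token_around_image><image><fake_token_around_image>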
special_tokens_map.json ADDED

@@ -0,0 +1,30 @@
+{
+  "bos_token": {
+    "content": "<s>",
+    "lstrip": false,
+    "normalized": false,
+    "rstrip": false,
+    "single_word": false
+  },
+  "eos_token": {
+    "content": "</s>",
+    "lstrip": false,
+    "normalized": false,
+    "rstrip": false,
+    "single_word": false
+  },
+  "pad_token": {
+    "content": "<unk>",
+    "lstrip": false,
+    "normalized": false,
+    "rstrip": false,
+    "single_word": false
+  },
+  "unk_token": {
+    "content": "<unk>",
+    "lstrip": false,
+    "normalized": false,
+    "rstrip": false,
+    "single_word": false
+  }
+}
tokenizer.json ADDED

The diff for this file is too large to render. See raw diff.
tokenizer.model ADDED

@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:dadfd56d766715c61d2ef780a525ab43b8e6da4de6865bda3d95fdef5e134055
+size 493443
tokenizer_config.json ADDED

@@ -0,0 +1,56 @@
+{
+  "added_tokens_decoder": {
+    "0": {
+      "content": "<unk>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "1": {
+      "content": "<s>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "2": {
+      "content": "</s>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "32000": {
+      "content": "<fake_token_around_image>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "32001": {
+      "content": "<image>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    }
+  },
+  "additional_special_tokens": [],
+  "bos_token": "<s>",
+  "clean_up_tokenization_spaces": false,
+  "eos_token": "</s>",
+  "legacy": false,
+  "model_max_length": 1000000000000000019884624838656,
+  "pad_token": "<unk>",
+  "sp_model_kwargs": {},
+  "spaces_between_special_tokens": false,
+  "tokenizer_class": "LlamaTokenizer",
+  "unk_token": "<unk>",
+  "use_default_system_prompt": true
+}
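Note: with tokenizer.model (the SentencePiece weights, tracked via git-lfs), tokenizer_config.json and special_tokens_map.json all added, the tokenizer is self-contained. A minimal sanity check (repo id assumed, as above):

    from transformers import AutoTokenizer

    tokenizer = AutoTokenizer.from_pretrained("HuggingFaceM4/img2html")
    print(tokenizer.pad_token)  # <unk>  (padding is mapped onto the unk token)
    print(len(tokenizer))       # 32002: 32000 base Llama ids plus the two image tokens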