infly
/

Infinity-Parser-7B

Model card · Files and versions

zuminghuang committed on Oct 21, 2025

Commit

97f3f85

·

verified ·

1 Parent(s): df3c00d

Update README.md

Files changed (1) hide show

README.md +17 -9

README.md CHANGED Viewed

@@ -19,29 +19,36 @@ Overview of Infinity-Parser training framework. Our model is optimized via reinf
 ## Inference
 ```python
 from transformers import Qwen2_5_VLForConditionalGeneration, AutoTokenizer, AutoProcessor
 from qwen_vl_utils import process_vision_info
 model_path = "infly/Infinity-Parser-7B"
 prompt = "Please transform the document’s contents into Markdown format."
-# default: Load the model on the available device(s)
 model = Qwen2_5_VLForConditionalGeneration.from_pretrained(
-    model_path, torch_dtype="auto", device_map="auto"
 )
-# We recommend enabling flash_attention_2 for better acceleration and memory saving, especially in multi-image and video scenarios.
-# model = Qwen2_5_VLForConditionalGeneration.from_pretrained(
-#     model_path,
-#     torch_dtype=torch.bfloat16,
-#     attn_implementation="flash_attention_2",
-#     device_map="auto",
-# )
 min_pixels = 256 * 28 * 28   # 448 * 448
 max_pixels = 2304 * 28 * 28  # 1344 * 1344
 processor = AutoProcessor.from_pretrained(model_path, min_pixels=min_pixels, max_pixels=max_pixels)
 messages = [
     {
         "role": "user",
@@ -68,6 +75,7 @@ inputs = processor(
 )
 inputs = inputs.to("cuda")
 generated_ids = model.generate(**inputs, max_new_tokens=4096)
 generated_ids_trimmed = [
     out_ids[len(in_ids) :] for in_ids, out_ids in zip(inputs.input_ids, generated_ids)

 ## Inference
 ```python
+import torch
 from transformers import Qwen2_5_VLForConditionalGeneration, AutoTokenizer, AutoProcessor
 from qwen_vl_utils import process_vision_info
 model_path = "infly/Infinity-Parser-7B"
 prompt = "Please transform the document’s contents into Markdown format."
+print(f"Loading model and processor...")
+# Default: Load the model on the available device(s)
+# model = Qwen2_5_VLForConditionalGeneration.from_pretrained(
+#     model_path, torch_dtype="auto", device_map="auto"
+# )
+# We recommend enabling flash_attention_2 for better acceleration and memory saving, especially in multi-image and video scenarios.
 model = Qwen2_5_VLForConditionalGeneration.from_pretrained(
+    model_path,
+    torch_dtype=torch.bfloat16,
+    attn_implementation="flash_attention_2",
+    device_map="auto",
 )
+# Default processor
+# processor = AutoProcessor.from_pretrained(model_path)
+# Recommended processor
 min_pixels = 256 * 28 * 28   # 448 * 448
 max_pixels = 2304 * 28 * 28  # 1344 * 1344
 processor = AutoProcessor.from_pretrained(model_path, min_pixels=min_pixels, max_pixels=max_pixels)
+print(f"Preparing messages for inference...")
 messages = [
     {
         "role": "user",
 )
 inputs = inputs.to("cuda")
+print(f"Generating results...")
 generated_ids = model.generate(**inputs, max_new_tokens=4096)
 generated_ids_trimmed = [
     out_ids[len(in_ids) :] for in_ids, out_ids in zip(inputs.input_ids, generated_ids)