Update README.md

This is the official model from the paper:

**[DocReRank: Single-Page Hard Negative Query Generation for Training Multi-Modal RAG Rerankers](https://arxiv.org/abs/2505.22584)**

See the [Project Page](https://navvewas.github.io/DocReRank/) for more information.

---

## Model Overview

```python
from transformers import Qwen2VLForConditionalGeneration, AutoProcessor
from peft import PeftModel
import torch
from PIL import Image

# Load base model
base_model = Qwen2VLForConditionalGeneration.from_pretrained(
    "Qwen/Qwen2-VL-2B-Instruct",
    torch_dtype=torch.bfloat16,
    device_map="cuda"
)

# Load DocReRank adapter
model = PeftModel.from_pretrained(base_model, "DocReRank/DocReRank-Reranker").eval()

# Load processor; constrain each page image to between 448x448 (200704)
# and 768x768 (589824) pixels
processor = AutoProcessor.from_pretrained("Qwen/Qwen2-VL-2B-Instruct")
processor.image_processor.min_pixels = 200704
processor.image_processor.max_pixels = 589824

# Define the query
query_text = "What is the performance of the DocReRank model on the Restaurant and Biomedical benchmarks?"
# query_text = "Are there ablation results for the DocReRank model?"

# Paper pages to rank
image_paths = ["DocReRank_paper_page_2.png", "DocReRank_paper_page_4.png",
               "DocReRank_paper_page_6.png", "DocReRank_paper_page_8.png"]

# Score one page against the query with the reranking prompt template
def compute_score(image_path, query_text):
    image = Image.open(image_path)
    prompt = f"Assert the relevance of the previous image document to the following query, answer True or False. The query is: {query_text}"
    messages = [{"role": "user", "content": [{"type": "image", "image": image}, {"type": "text", "text": prompt}]}]

    # Tokenize
    text = processor.apply_chat_template(messages, tokenize=False, add_generation_prompt=True)
    inputs = processor(text=text, images=image, return_tensors="pt").to(model.device, torch.bfloat16)

    # Compute logits at the last position and compare the "True" and "False" tokens
    with torch.no_grad():
        outputs = model(**inputs)
    logits = outputs.logits[:, -1, :]
    true_id = processor.tokenizer.convert_tokens_to_ids("True")
    false_id = processor.tokenizer.convert_tokens_to_ids("False")
    probs = torch.softmax(logits[:, [true_id, false_id]], dim=-1)
    relevance_score = probs[0, 0].item()  # Probability of "True"

    return relevance_score

# Compute scores for all pages
scores = [(img, compute_score(img, query_text)) for img in image_paths]

# Print results
for img, score in scores:
    print(f"Image: {img} | Relevance Score: {score:.4f}")
```
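
Each score is the probability mass the model assigns to answering "True" rather than "False", renormalized over just those two tokens, so scores are directly comparable across pages. In a RAG pipeline you would typically use them to reorder the candidates returned by a first-stage retriever; here is a minimal sketch building on the `scores` list above (the `top_k` value is an illustrative choice, not from the original README):

```python
# Sort candidate pages by descending relevance score
ranked = sorted(scores, key=lambda item: item[1], reverse=True)

# Keep only the highest-scoring pages for the generation stage
top_k = 2  # illustrative value; tune for your pipeline
for img, score in ranked[:top_k]:
    print(f"Selected page: {img} (score {score:.4f})")
```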

## Citation

If you use this model, please cite:

```bibtex
@article{wasserman2025docrerank,
  title={DocReRank: Single-Page Hard Negative Query Generation for Training Multi-Modal RAG Rerankers},
  author={Wasserman, Navve and Heinimann, Oliver and Golbari, Yuval and Zimbalist, Tal and Schwartz, Eli and Irani, Michal},
  journal={arXiv preprint arXiv:2505.22584},
  year={2025}
}
```
|