Spaces:
Runtime error
Runtime error
Update app.py
Browse files
app.py
CHANGED
|
@@ -74,28 +74,7 @@ async def index(files: List[UploadFile] = File(...)):
|
|
| 74 |
|
| 75 |
return {"message": f"Uploaded and converted {len(images)} pages"}
|
| 76 |
|
| 77 |
-
|
| 78 |
-
async def search(query: str, k: int):
|
| 79 |
-
qs = []
|
| 80 |
-
with torch.no_grad():
|
| 81 |
-
batch_query = process_queries(processor, [query], mock_image)
|
| 82 |
-
batch_query = {k: v.to(device) for k, v in batch_query.items()}
|
| 83 |
-
embeddings_query = model(**batch_query)
|
| 84 |
-
qs.extend(list(torch.unbind(embeddings_query.to("cpu"))))
|
| 85 |
-
|
| 86 |
-
retriever_evaluator = CustomEvaluator(is_multi_vector=True)
|
| 87 |
-
scores = retriever_evaluator.evaluate(qs, ds)
|
| 88 |
-
|
| 89 |
-
top_k_indices = scores.argsort(axis=1)[0][-k:][::-1]
|
| 90 |
-
|
| 91 |
-
results = []
|
| 92 |
-
for idx in top_k_indices:
|
| 93 |
-
img_byte_arr = BytesIO()
|
| 94 |
-
images[idx].save(img_byte_arr, format='PNG')
|
| 95 |
-
img_base64 = base64.b64encode(img_byte_arr.getvalue()).decode('utf-8')
|
| 96 |
-
results.append({"image": img_base64, "page": f"Page {idx}"})
|
| 97 |
-
|
| 98 |
-
# Generate PDF
|
| 99 |
pdf_buffer = BytesIO()
|
| 100 |
c = canvas.Canvas(pdf_buffer, pagesize=letter)
|
| 101 |
width, height = letter
|
|
@@ -118,10 +97,78 @@ async def search(query: str, k: int):
|
|
| 118 |
|
| 119 |
c.save()
|
| 120 |
pdf_buffer.seek(0)
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 121 |
|
| 122 |
# Use StreamingResponse to handle in-memory file
|
| 123 |
response = StreamingResponse(pdf_buffer, media_type='application/pdf')
|
| 124 |
-
response.headers['Content-Disposition'] = 'attachment; filename="
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 125 |
|
| 126 |
return response
|
| 127 |
|
|
|
|
| 74 |
|
| 75 |
return {"message": f"Uploaded and converted {len(images)} pages"}
|
| 76 |
|
| 77 |
+
def generate_pdf(results):
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 78 |
pdf_buffer = BytesIO()
|
| 79 |
c = canvas.Canvas(pdf_buffer, pagesize=letter)
|
| 80 |
width, height = letter
|
|
|
|
| 97 |
|
| 98 |
c.save()
|
| 99 |
pdf_buffer.seek(0)
|
| 100 |
+
return pdf_buffer
|
| 101 |
+
|
| 102 |
+
@app.get("/search")
async def search(query: str, k: int = 1):
    """Return a PDF of the ``k`` indexed pages that score highest for ``query``.

    The text query is embedded with the module-level ``model``/``processor``,
    scored against the module-level ``ds`` embeddings, and the best-scoring
    entries of the module-level ``images`` list are rendered into a PDF by
    ``generate_pdf``.

    Args:
        query: Free-text search query.
        k: Number of top-scoring pages to include; must be at least 1.

    Returns:
        A ``StreamingResponse`` carrying the generated PDF as an attachment
        named ``results.pdf``.

    Raises:
        HTTPException: 422 when ``k`` is smaller than 1.
    """
    # Imported locally because the module's import block is outside this view.
    from fastapi import HTTPException

    if k < 1:
        # ``[-k:]`` with k == 0 is ``[0:]`` (every page) and a negative k
        # slices from an arbitrary offset, so reject both up front.
        raise HTTPException(status_code=422, detail="k must be a positive integer")

    # Embed the query; gradients are not needed for inference.
    qs = []
    with torch.no_grad():
        batch_query = process_queries(processor, [query], mock_image)
        # Distinct names here so the ``k`` parameter is not visually reused.
        batch_query = {name: tensor.to(device) for name, tensor in batch_query.items()}
        embeddings_query = model(**batch_query)
        qs.extend(torch.unbind(embeddings_query.to("cpu")))

    retriever_evaluator = CustomEvaluator(is_multi_vector=True)
    scores = retriever_evaluator.evaluate(qs, ds)

    # Ascending argsort of the single query row: keep the last k entries and
    # reverse them so the best match comes first.
    # NOTE(review): assumes ``scores`` is a NumPy-like 2-D array that supports
    # negative-step slicing - confirm the evaluator's return type.
    top_k_indices = scores.argsort(axis=1)[0][-k:][::-1]

    results = []
    for idx in top_k_indices:
        img_byte_arr = BytesIO()
        images[idx].save(img_byte_arr, format='PNG')
        img_base64 = base64.b64encode(img_byte_arr.getvalue()).decode('utf-8')
        results.append({"image": img_base64, "page": f"Page {idx}"})

    pdf_buffer = generate_pdf(results)

    # Use StreamingResponse to serve the in-memory PDF without a temp file.
    response = StreamingResponse(pdf_buffer, media_type='application/pdf')
    response.headers['Content-Disposition'] = 'attachment; filename="results.pdf"'

    return response
|
| 130 |
+
|
| 131 |
+
# NOTE(review): this route is registered for GET although it takes an uploaded
# file; many HTTP clients do not send a request body with GET - consider
# exposing it as POST.
@app.get("/search_by_cv")
async def search_by_cv(file: UploadFile = File(...), k: int = 10):
    """Return a PDF of the ``k`` indexed pages that score highest for an uploaded PDF.

    The uploaded PDF is converted to page images, each page is embedded with
    the module-level ``model``/``processor``, and the embeddings are scored
    against the module-level ``ds`` embeddings. The best-scoring entries of
    the module-level ``images`` list are rendered into a PDF by
    ``generate_pdf``.

    Args:
        file: Uploaded PDF document used as the query.
        k: Number of top-scoring pages to include; must be at least 1.

    Returns:
        A ``StreamingResponse`` carrying the generated PDF as an attachment
        named ``results.pdf``.

    Raises:
        HTTPException: 422 when ``k`` is smaller than 1.
    """
    # Imported locally because the module's import block is outside this view.
    from fastapi import HTTPException

    if k < 1:
        # ``[-k:]`` with k == 0 is ``[0:]`` (every page) and a negative k
        # slices from an arbitrary offset, so reject both up front.
        raise HTTPException(status_code=422, detail="k must be a positive integer")

    # Read the uploaded PDF and rasterize it into one image per page.
    content = await file.read()
    pdf_image_list = convert_from_bytes(content)

    # Embed the uploaded pages in batches of 4.
    qs = []
    dataloader = DataLoader(
        pdf_image_list,
        batch_size=4,
        shuffle=False,
        collate_fn=lambda x: process_images(processor, x),
    )
    for batch_query in dataloader:
        with torch.no_grad():
            # Distinct names here so the ``k`` parameter is not visually reused.
            batch_query = {name: tensor.to(device) for name, tensor in batch_query.items()}
            embeddings_query = model(**batch_query)
        qs.extend(torch.unbind(embeddings_query.to("cpu")))

    # Score the uploaded document's embeddings against the indexed ones.
    retriever_evaluator = CustomEvaluator(is_multi_vector=True)
    scores = retriever_evaluator.evaluate(qs, ds)

    # Ascending argsort: keep the last k entries and reverse them so the best
    # match comes first.
    # NOTE(review): only row 0 of ``scores`` is ranked, so for a multi-page
    # upload presumably just the first page drives the results - confirm this
    # is intended. Also assumes ``scores`` is a NumPy-like 2-D array that
    # supports negative-step slicing - confirm the evaluator's return type.
    top_k_indices = scores.argsort(axis=1)[0][-k:][::-1]

    # Encode each selected page image as base64 PNG for the PDF generator.
    results = []
    for idx in top_k_indices:
        img_byte_arr = BytesIO()
        images[idx].save(img_byte_arr, format='PNG')
        img_base64 = base64.b64encode(img_byte_arr.getvalue()).decode('utf-8')
        results.append({"image": img_base64, "page": f"Page {idx}"})

    # Build the results PDF.
    pdf_buffer = generate_pdf(results)

    # Use StreamingResponse to return the generated in-memory PDF.
    response = StreamingResponse(pdf_buffer, media_type='application/pdf')
    response.headers['Content-Disposition'] = 'attachment; filename="results.pdf"'

    return response
|
| 174 |
|