jinaai
/

jina-embeddings-v4

Just because a string starts with "http" doesn't mean it's a URL. Added a check for spaces so that text chunks that merely start with "http" are no longer fetched as image URLs (this happened with a chunk from a research paper).

Files changed (1) hide show

custom_st.py +1 -1

custom_st.py CHANGED Viewed

@@ -70,7 +70,7 @@ class Transformer(nn.Module):
                 elif text.startswith("Passage: "):
                     clean_text = text[len("Passage: ") :]
-                if clean_text.startswith("http"):
                     response = requests.get(clean_text)
                     texts[i] = Image.open(BytesIO(response.content)).convert("RGB")
                     image_indices.append(i)

                 elif text.startswith("Passage: "):
                     clean_text = text[len("Passage: ") :]
+                if clean_text.startswith("http") and " " not in clean_text:
                     response = requests.get(clean_text)
                     texts[i] = Image.open(BytesIO(response.content)).convert("RGB")
                     image_indices.append(i)