# NOTE(review): the lines below are HTML-scrape artifacts from the Hugging Face
# Spaces file viewer (status badge, file size, commit hashes, line-number
# gutter). They are not Python and broke the module; preserved as comments.
#   Spaces: Sleeping / Sleeping
#   File size: 2,961 Bytes
#   6accb61 b36ff59 6accb61 b36ff59 6accb61
import os
from supabase import create_client
from sentence_transformers import SentenceTransformer
from huggingface_hub import hf_hub_download
from datasets import load_dataset
from dotenv import load_dotenv
# -----------------------------------------------------------------------------
# Load env vars
# -----------------------------------------------------------------------------
# Required: SUPABASE_URL, SUPABASE_SERVICE_KEY (Supabase project + service-role
# key), HUGGINGFACE_API_TOKEN (GAIA is a gated dataset; the token's account
# must have been granted access). Fail fast with a clear error if any missing.
load_dotenv()
SUPABASE_URL = os.getenv("SUPABASE_URL")
SUPABASE_SERVICE_KEY = os.getenv("SUPABASE_SERVICE_KEY")
HF_TOKEN = os.getenv("HUGGINGFACE_API_TOKEN")
if not SUPABASE_URL or not SUPABASE_SERVICE_KEY:
    raise RuntimeError("Set SUPABASE_URL and SUPABASE_SERVICE_KEY in your .env")
if not HF_TOKEN:
    raise RuntimeError(
        "Set HUGGINGFACE_API_TOKEN in your .env and ensure you've been granted access to the GAIA dataset."
    )
# -----------------------------------------------------------------------------
# Init clients & models
# -----------------------------------------------------------------------------
# NOTE: creating the SentenceTransformer downloads/loads model weights at
# import time of this script.
supabase = create_client(SUPABASE_URL, SUPABASE_SERVICE_KEY)
model = SentenceTransformer("all-mpnet-base-v2")
# -----------------------------------------------------------------------------
# GAIA metadata location on HF
# -----------------------------------------------------------------------------
# Path of the validation-split metadata file inside the GAIA dataset repo.
GAIA_REPO_ID = "gaia-benchmark/GAIA"
GAIA_METADATA_FILE = "2023/validation/metadata.jsonl"
def fetch_gaia_validation_examples():
    """Download GAIA validation metadata and return (question, answer) pairs.

    Downloads ``2023/validation/metadata.jsonl`` from the gated GAIA dataset
    repo on the Hugging Face Hub (authenticated via ``HF_TOKEN``), loads it
    with the ``datasets`` JSON loader, and extracts the "Question" and
    "Final answer" columns.

    Returns:
        list[tuple]: ``(question, final_answer)`` pairs. Rows with a missing
        question, or a missing/empty final answer, are skipped.
    """
    print("🔄 Downloading GAIA metadata.jsonl …")
    metadata_path = hf_hub_download(
        repo_id=GAIA_REPO_ID,
        filename=GAIA_METADATA_FILE,
        token=HF_TOKEN,
        repo_type="dataset",  # GAIA lives in a dataset repo, not a model repo
    )
    print(f"✅ Downloaded to {metadata_path!r}")
    print("🔄 Loading JSONL via Datasets …")
    ds = load_dataset(
        "json",
        data_files=metadata_path,
        split="train",  # the generic "json" loader exposes a single "train" split
    )
    print("Columns in your JSONL:", ds.column_names)
    QUESTION_FIELD = "Question"
    ANSWER_FIELD = "Final answer"
    qa = []
    for row in ds:
        q = row.get(QUESTION_FIELD)
        a = row.get(ANSWER_FIELD)
        # Fix: the previous `if q and a` test dropped rows whose final answer
        # is falsy but valid (e.g. a numeric 0, which the JSON loader can
        # produce). Skip only truly missing or empty answers.
        if q and a is not None and a != "":
            qa.append((q, a))
    print(f"✅ Found {len(qa)} (Question, Final answer) pairs.")
    return qa
def main():
    """Embed GAIA validation Q/A pairs and bulk-insert them into Supabase.

    Fetches ``(question, answer)`` pairs, encodes each as a single
    ``"Q: ... A: ..."`` string with the sentence-transformer model, and
    inserts ``{"page_content", "embedding"}`` records into the ``documents``
    table in one batched insert call.
    """
    qa_pairs = fetch_gaia_validation_examples()
    if not qa_pairs:
        print("⚠️ No QA pairs—abort.")
        return
    # Batch-encode all texts in a single call: SentenceTransformer.encode
    # accepts a list and processes it in internal batches, which is far
    # faster than one forward pass per row, with identical vectors.
    texts = [f"Q: {q} A: {a}" for q, a in qa_pairs]
    embeddings = model.encode(texts)
    to_insert = [
        {"page_content": text, "embedding": emb.tolist()}
        for text, emb in zip(texts, embeddings)
    ]
    print(f"🚀 Inserting {len(to_insert)} records into Supabase…")
    res = supabase.table("documents").insert(to_insert).execute()
    if res.data:
        print(f"🎉 Successfully inserted {len(to_insert)} GAIA examples.")
    else:
        print("❌ Insert appeared to fail. Response:")
        print(res)


if __name__ == "__main__":
    main()