File size: 2,961 Bytes
6accb61
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
b36ff59
6accb61
 
 
b36ff59
6accb61
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
import os
from supabase import create_client
from sentence_transformers import SentenceTransformer
from huggingface_hub import hf_hub_download
from datasets import load_dataset
from dotenv import load_dotenv

# -----------------------------------------------------------------------------
# Load env vars
# -----------------------------------------------------------------------------
# Reads SUPABASE_URL / SUPABASE_SERVICE_KEY / HUGGINGFACE_API_TOKEN from a
# local .env file (via python-dotenv) and fails fast if any is missing, so
# later network calls don't die with opaque auth errors.
load_dotenv()
SUPABASE_URL         = os.getenv("SUPABASE_URL")
SUPABASE_SERVICE_KEY = os.getenv("SUPABASE_SERVICE_KEY")
HF_TOKEN             = os.getenv("HUGGINGFACE_API_TOKEN")

if not SUPABASE_URL or not SUPABASE_SERVICE_KEY:
    raise RuntimeError("Set SUPABASE_URL and SUPABASE_SERVICE_KEY in your .env")

if not HF_TOKEN:
    raise RuntimeError(
        "Set HUGGINGFACE_API_TOKEN in your .env and ensure you've been granted access to the GAIA dataset."
    )

# -----------------------------------------------------------------------------
# Init clients & models
# -----------------------------------------------------------------------------
# NOTE: both lines have import-time side effects — create_client opens a
# Supabase session, and SentenceTransformer downloads/loads model weights
# on first use. Keep them after the env-var validation above.
supabase = create_client(SUPABASE_URL, SUPABASE_SERVICE_KEY)
model     = SentenceTransformer("all-mpnet-base-v2")

# -----------------------------------------------------------------------------
# GAIA metadata location on HF
# -----------------------------------------------------------------------------
# Gated dataset repo on the Hugging Face Hub; HF_TOKEN must have been granted
# access. The metadata file is a JSON-Lines file for the 2023 validation split.
GAIA_REPO_ID       = "gaia-benchmark/GAIA"
GAIA_METADATA_FILE = "2023/validation/metadata.jsonl"

def fetch_gaia_validation_examples():
    """Download the GAIA validation metadata and return QA pairs.

    Pulls ``metadata.jsonl`` from the gated GAIA dataset repo on the
    Hugging Face Hub, loads it with ``datasets``, and returns a list of
    ``(question, final_answer)`` tuples. Rows where either field is
    missing or empty are dropped.
    """
    print("🔄 Downloading GAIA metadata.jsonl …")
    local_path = hf_hub_download(
        repo_id=GAIA_REPO_ID,
        filename=GAIA_METADATA_FILE,
        token=HF_TOKEN,
        repo_type="dataset",
    )
    print(f"✅ Downloaded to {local_path!r}")

    print("🔄 Loading JSONL via Datasets …")
    dataset = load_dataset("json", data_files=local_path, split="train")
    print("Columns in your JSONL:", dataset.column_names)

    # Field names exactly as they appear in the GAIA metadata JSONL.
    question_key = "Question"
    answer_key = "Final answer"

    # Keep only rows that have both a non-empty question and answer.
    pairs = [
        (record.get(question_key), record.get(answer_key))
        for record in dataset
    ]
    pairs = [(question, answer) for question, answer in pairs if question and answer]

    print(f"✅ Found {len(pairs)} (Question, Final answer) pairs.")
    return pairs

def main():
    """Fetch GAIA QA pairs, embed each one, and bulk-insert into Supabase.

    Each pair is rendered as ``"Q: ... A: ..."``, embedded with the
    module-level SentenceTransformer, and written to the ``documents``
    table with its embedding vector. Aborts early if no pairs were found.
    """
    pairs = fetch_gaia_validation_examples()
    if not pairs:
        print("⚠️ No QA pairs—abort.")
        return

    records = []
    for question, answer in pairs:
        combined = f"Q: {question} A: {answer}"
        # .tolist() converts the numpy vector to plain floats for JSON transport.
        records.append({
            "page_content": combined,
            "embedding": model.encode(combined).tolist(),
        })

    print(f"🚀 Inserting {len(records)} records into Supabase…")
    response = supabase.table("documents").insert(records).execute()
    # supabase-py returns the inserted rows in .data on success.
    if response.data:
        print(f"🎉 Successfully inserted {len(records)} GAIA examples.")
    else:
        print("❌ Insert appeared to fail. Response:")
        print(response)

# Run only when executed as a script, not on import.
if __name__ == "__main__":
    main()