"""Seed a Supabase ``documents`` table with embedded GAIA validation Q/A pairs.

The script downloads the GAIA validation ``metadata.jsonl`` from the Hugging
Face Hub, keeps every row that has both a question and a final answer, embeds
each pair as a single text, and inserts the results into Supabase.

Required environment variables (read from the process env or a ``.env`` file):
``SUPABASE_URL``, ``SUPABASE_SERVICE_KEY`` and ``HUGGINGFACE_API_TOKEN``.
"""

import os

from datasets import load_dataset
from dotenv import load_dotenv
from huggingface_hub import hf_hub_download
from sentence_transformers import SentenceTransformer
from supabase import create_client

# -----------------------------------------------------------------------------
# Load env vars
# -----------------------------------------------------------------------------
load_dotenv()

SUPABASE_URL = os.getenv("SUPABASE_URL")
SUPABASE_SERVICE_KEY = os.getenv("SUPABASE_SERVICE_KEY")
HF_TOKEN = os.getenv("HUGGINGFACE_API_TOKEN")

# Fail fast, before any client or model is created, if configuration is missing.
if not SUPABASE_URL or not SUPABASE_SERVICE_KEY:
    raise RuntimeError("Set SUPABASE_URL and SUPABASE_SERVICE_KEY in your .env")
if not HF_TOKEN:
    raise RuntimeError(
        "Set HUGGINGFACE_API_TOKEN in your .env and ensure you've been granted access to the GAIA dataset."
    )

# -----------------------------------------------------------------------------
# Init clients & models
# -----------------------------------------------------------------------------
supabase = create_client(SUPABASE_URL, SUPABASE_SERVICE_KEY)
model = SentenceTransformer("all-mpnet-base-v2")

# -----------------------------------------------------------------------------
# GAIA metadata location on HF
# -----------------------------------------------------------------------------
GAIA_REPO_ID = "gaia-benchmark/GAIA"
GAIA_METADATA_FILE = "2023/validation/metadata.jsonl"


def fetch_gaia_validation_examples() -> list:
    """Download the GAIA validation metadata and extract its Q/A pairs.

    The metadata file is fetched from the ``GAIA_REPO_ID`` dataset repo using
    ``HF_TOKEN`` and loaded through the ``json`` loader of ``datasets``.

    Returns:
        A list of ``(question, final_answer)`` tuples, one per row in which
        both the ``"Question"`` and ``"Final answer"`` fields are truthy.
        Rows missing either field are skipped.
    """
    print("🔄 Downloading GAIA metadata.jsonl …")
    metadata_path = hf_hub_download(
        repo_id=GAIA_REPO_ID,
        filename=GAIA_METADATA_FILE,
        token=HF_TOKEN,
        repo_type="dataset",
    )
    print(f"✅ Downloaded to {metadata_path!r}")

    print("🔄 Loading JSONL via Datasets …")
    ds = load_dataset(
        "json",
        data_files=metadata_path,
        split="train",
    )
    # Printed so a renamed column in the file is easy to spot when no pairs
    # are found below.
    print("Columns in your JSONL:", ds.column_names)

    QUESTION_FIELD = "Question"
    ANSWER_FIELD = "Final answer"

    qa = []
    for row in ds:
        q = row.get(QUESTION_FIELD)
        a = row.get(ANSWER_FIELD)
        if q and a:
            qa.append((q, a))

    print(f"✅ Found {len(qa)} (Question, Final answer) pairs.")
    return qa


def main() -> None:
    """Embed every GAIA Q/A pair and insert the records into Supabase.

    Each record has a ``page_content`` text of the form ``"Q: ... A: ..."``
    and an ``embedding`` list produced by the module-level ``model``. All
    records are sent to the ``documents`` table in a single insert. Returns
    early, without inserting, when no pairs were found.
    """
    qa_pairs = fetch_gaia_validation_examples()
    if not qa_pairs:
        print("⚠️ No QA pairs—abort.")
        return

    texts = [f"Q: {q} A: {a}" for q, a in qa_pairs]

    # Encode all texts in one call: ``encode`` accepts a list and batches
    # internally, avoiding one model invocation per pair. The result has one
    # row per input text, in input order.
    embeddings = model.encode(texts).tolist()

    to_insert = [
        {"page_content": text, "embedding": emb}
        for text, emb in zip(texts, embeddings)
    ]

    print(f"🚀 Inserting {len(to_insert)} records into Supabase…")
    res = supabase.table("documents").insert(to_insert).execute()

    if res.data:
        print(f"🎉 Successfully inserted {len(to_insert)} GAIA examples.")
    else:
        print("❌ Insert appeared to fail. Response:")
        print(res)


if __name__ == "__main__":
    main()