Spaces:
Sleeping
Sleeping
| import os | |
| from supabase import create_client | |
| from sentence_transformers import SentenceTransformer | |
| from huggingface_hub import hf_hub_download | |
| from datasets import load_dataset | |
| from dotenv import load_dotenv | |
# -----------------------------------------------------------------------------
# Configuration: credentials come from the process environment (.env included)
# -----------------------------------------------------------------------------
load_dotenv()

SUPABASE_URL = os.environ.get("SUPABASE_URL")
SUPABASE_SERVICE_KEY = os.environ.get("SUPABASE_SERVICE_KEY")
HF_TOKEN = os.environ.get("HUGGINGFACE_API_TOKEN")

# Fail fast at import time: nothing below can work without these values.
# Unset and empty-string values are both treated as missing.
if not (SUPABASE_URL and SUPABASE_SERVICE_KEY):
    raise RuntimeError("Set SUPABASE_URL and SUPABASE_SERVICE_KEY in your .env")

if not HF_TOKEN:
    raise RuntimeError(
        "Set HUGGINGFACE_API_TOKEN in your .env and ensure you've been granted access to the GAIA dataset."
    )
# -----------------------------------------------------------------------------
# Shared clients and model (created once, at import time)
# -----------------------------------------------------------------------------
# Supabase client authenticated with the service key read from the environment.
supabase = create_client(SUPABASE_URL, SUPABASE_SERVICE_KEY)

# Sentence-embedding model used by main() to vectorise each "Q: ... A: ..." text.
model = SentenceTransformer("all-mpnet-base-v2")

# -----------------------------------------------------------------------------
# Where the GAIA validation metadata lives on the Hugging Face Hub
# -----------------------------------------------------------------------------
GAIA_REPO_ID = "gaia-benchmark/GAIA"
GAIA_METADATA_FILE = "2023/validation/metadata.jsonl"
def fetch_gaia_validation_examples():
    """Download the GAIA validation metadata and extract question/answer pairs.

    The file named by ``GAIA_METADATA_FILE`` is fetched from the dataset repo
    ``GAIA_REPO_ID`` using ``HF_TOKEN``, loaded through the ``json`` dataset
    loader, and reduced to the values of its "Question" and "Final answer"
    columns. Progress is reported on stdout.

    Returns:
        A list of ``(question, answer)`` tuples. Rows in which either value
        is falsy (absent, ``None`` or empty) are left out.
    """
    print("🔄 Downloading GAIA metadata.jsonl …")
    local_file = hf_hub_download(
        repo_id=GAIA_REPO_ID,
        filename=GAIA_METADATA_FILE,
        repo_type="dataset",
        token=HF_TOKEN,
    )
    print(f"✅ Downloaded to {local_file!r}")

    print("🔄 Loading JSONL via Datasets …")
    dataset = load_dataset("json", data_files=local_file, split="train")
    # Printed so a column rename upstream is easy to spot when no pairs match.
    print("Columns in your JSONL:", dataset.column_names)

    question_key = "Question"
    answer_key = "Final answer"

    # Pull both fields per row lazily, then keep only rows where both are truthy.
    candidates = (
        (record.get(question_key), record.get(answer_key)) for record in dataset
    )
    pairs = [
        (question, answer) for question, answer in candidates if question and answer
    ]

    print(f"✅ Found {len(pairs)} (Question, Final answer) pairs.")
    return pairs
def main():
    """Embed every GAIA validation Q/A pair and insert them into Supabase.

    Each pair is rendered as ``"Q: <question> A: <answer>"``, encoded with the
    module-level ``model``, and stored in the ``documents`` table as a row with
    ``page_content`` and ``embedding`` fields. All rows are sent in a single
    insert request. Returns ``None``; the outcome is reported on stdout.
    """
    examples = fetch_gaia_validation_examples()
    if not examples:
        print("⚠️ No QA pairs—abort.")
        return

    # Render all texts first, then encode them one by one into plain lists
    # so the vectors can be serialised in the insert payload.
    contents = [f"Q: {question} A: {answer}" for question, answer in examples]
    records = [
        {"page_content": content, "embedding": model.encode(content).tolist()}
        for content in contents
    ]

    print(f"🚀 Inserting {len(records)} records into Supabase…")
    response = supabase.table("documents").insert(records).execute()

    # An empty/falsy ``data`` payload is treated as a failed insert.
    if not response.data:
        print("❌ Insert appeared to fail. Response:")
        print(response)
        return

    print(f"🎉 Successfully inserted {len(records)} GAIA examples.")
# Run the seeding pipeline only when this file is executed as a script,
# not when it is imported as a module.
if __name__ == "__main__":
    main()