Spaces:
Sleeping
Sleeping
# prepare_data.py
"""Download three public datasets and save each one as a CSV in ``datasets/``.

Outputs (paths relative to the current working directory):

* ``datasets/plant_disease.csv`` -- first CSV file found in the PlantVillage
  Kaggle download (skipped with a warning when the download has no CSV).
* ``datasets/afriqa.csv`` -- one ``text`` column of the form
  ``"Q: <question> A: <answer>"`` built from the AfriQA ``train`` split.
* ``datasets/crisis.csv`` -- one ``text`` column built from the CrisisNLP
  ``train`` split.

The script runs on import/execution; it needs network access and whatever
credentials ``kagglehub`` and the Hugging Face Hub require.
"""
import os

import kagglehub
import pandas as pd
from datasets import load_dataset

os.makedirs("datasets", exist_ok=True)

# -----------------------
# 1. PlantVillage (Kaggle)
# -----------------------
print("Downloading PlantVillage dataset...")
pv_path = kagglehub.dataset_download(
    "dittakavinikhita/plant-disease-prediction-disease-and-healthy"
)

# Pick the metadata CSV if available.
# The download may nest its files in sub-directories, so walk the whole tree.
# os.walk is top-down, so a CSV sitting directly in pv_path is still preferred.
# Directory and file names are sorted to make the choice deterministic, and the
# search stops at the first hit so later CSVs cannot overwrite the output.
src = None
for root, dirs, names in os.walk(pv_path):
    dirs.sort()
    csv_names = sorted(name for name in names if name.endswith(".csv"))
    if csv_names:
        src = os.path.join(root, csv_names[0])
        break

dst = "datasets/plant_disease.csv"
if src is not None:
    # Round-trip through pandas (rather than copying the file) so the output
    # is written with pandas' default CSV formatting, as before.
    pd.read_csv(src).to_csv(dst, index=False)
    print("✅ Saved PlantVillage ->", dst)
else:
    # Previously this case was silent and left no output file behind.
    print("No CSV file found under", pv_path, "- skipped", dst)

# -----------------------
# 2. AfriQA (Hugging Face)
# -----------------------
print("Downloading AfriQA dataset...")
# NOTE(review): called without a configuration name -- confirm against the
# dataset card that this repository can be loaded that way.
afriqa = load_dataset("masakhane/afriqa")
afriqa_df = pd.DataFrame(afriqa["train"])

# Merge question + answer into one text column.
# NOTE(review): assumes the train split exposes "question" and "answer"
# columns; a missing column raises KeyError here -- TODO confirm the schema.
afriqa_df["text"] = (
    "Q: "
    + afriqa_df["question"].astype(str)
    + " A: "
    + afriqa_df["answer"].astype(str)
)
afriqa_df[["text"]].to_csv("datasets/afriqa.csv", index=False)
print("✅ Saved AfriQA -> datasets/afriqa.csv")

# -----------------------
# 3. CrisisNLP (Hugging Face)
# -----------------------
print("Downloading CrisisNLP dataset...")
crisis = load_dataset("QCRI/CrisisBench-all-lang")
crisis_df = pd.DataFrame(crisis["train"])

# Use the tweet body when the column exists; otherwise fall back to joining
# every column of the row (stringified, space-separated) into one text field.
if "tweet_text" in crisis_df.columns:
    crisis_df["text"] = crisis_df["tweet_text"].astype(str)
else:
    crisis_df["text"] = crisis_df.astype(str).agg(" ".join, axis=1)
crisis_df[["text"]].to_csv("datasets/crisis.csv", index=False)
print("✅ Saved CrisisNLP -> datasets/crisis.csv")

print("🎉 All datasets prepared in /datasets")