# prepare_data.py
import os
import kagglehub
import pandas as pd
from datasets import load_dataset
os.makedirs("datasets", exist_ok=True)
# -----------------------
# 1. PlantVillage (Kaggle)
# -----------------------
print("Downloading PlantVillage dataset...")
pv_path = kagglehub.dataset_download("dittakavinikhita/plant-disease-prediction-disease-and-healthy")
# Pick the metadata CSV if available
for file in os.listdir(pv_path):
    if file.endswith(".csv"):
        src = os.path.join(pv_path, file)
        dst = "datasets/plant_disease.csv"
        pd.read_csv(src).to_csv(dst, index=False)
        print("Saved PlantVillage ->", dst)
# -----------------------
# 2. AfriQA (Hugging Face)
# -----------------------
print("Downloading AfriQA dataset...")
afriqa = load_dataset("masakhane/afriqa")
afriqa_df = pd.DataFrame(afriqa["train"])
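# NOTE: the "question" and "answer" column names used below are assumptions;
# check afriqa_df.columns and adjust if the downloaded schema differs.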
# Merge question + answer into one text column
afriqa_df["text"] = "Q: " + afriqa_df["question"].astype(str) + " A: " + afriqa_df["answer"].astype(str)
afriqa_df[["text"]].to_csv("datasets/afriqa.csv", index=False)
print("β
Saved AfriQA -> datasets/afriqa.csv")
# -----------------------
# 3. CrisisNLP (Hugging Face)
# -----------------------
print("Downloading CrisisNLP dataset...")
crisis = load_dataset("QCRI/CrisisBench-all-lang")
crisis_df = pd.DataFrame(crisis["train"])
# Pick relevant columns (tweet_text, label, etc.)
if "tweet_text" in crisis_df.columns:
crisis_df["text"] = crisis_df["tweet_text"].astype(str)
else:
crisis_df["text"] = crisis_df.astype(str).agg(" ".join, axis=1)
crisis_df[["text"]].to_csv("datasets/crisis.csv", index=False)
print("β
Saved CrisisNLP -> datasets/crisis.csv")
print("π All datasets prepared in /datasets")