# Hugging Face Space: tktung/irish_grammatical_test (status: Sleeping)
# File size: 18,015 Bytes

import hashlib  # <-- added
import os
import tempfile
from datetime import datetime
from pathlib import Path

import datasets
import gradio as gr
import pandas as pd
from huggingface_hub import HfApi, hf_hub_download

# ------------------------------------------------------------
# Cloud‑friendly Q/A preference rater for **Hugging Face Spaces**
# ------------------------------------------------------------
# This version swaps local CSV persistence for a tiny remote‑dataset
# workflow that works on Spaces:
#   • Ratings are stored in (and loaded from) a lightweight **dataset
#     repo** on the Hugging Face Hub – no local file system required.
#   • The dataset repo is set via the `RATINGS_REPO` env‑var.
#   • You must pass a write‑enabled token (env‑var `HF_TOKEN`) that has
#     `write` permission on that dataset.
#
# Quick setup guide
# -----------------
# 1.  Create a dataset repository to hold the ratings file, e.g.:
#       https://huggingface.co/datasets/<org>/qa-rater-data
# 2.  Inside **Space Settings ▸ Secrets**, add:
#       • `RATINGS_REPO`  →  <org>/qa-rater-data
#       • `HF_TOKEN`      →  a token with *Write* access to that repo
# 3.  Add `huggingface-hub` to your `requirements.txt` or
#     `pip install huggingface-hub` locally.
# 4.  Deploy / push your updated Space – ratings will now persist in
#     the dataset repo instead of the Space’s ephemeral storage.
# ------------------------------------------------------------


# -----------------------------------------------------------------------------
# Configuration – constants & styling
# -----------------------------------------------------------------------------
# NOTE(review): DATA_PATH is not referenced anywhere in the visible part of this
# file – confirm whether it is still needed.
DATA_PATH = "human_judgement/selected_samples.json"
RATINGS_FILE = (
    "human_judgement_irish_grammatical_test.csv"  # Name *inside* the dataset repo
)
# RATINGS_REPO = os.getenv("RATINGS_REPO")  # e.g. "org/qa-rater-data"
# NOTE(review): the env-var lookup above is disabled and the repo is hard-coded
# to None. While it is None the persistence helpers never contact the Hub:
# downloads return None and saves write RATINGS_FILE into the working directory.
RATINGS_REPO = None
HF_TOKEN = os.getenv("HF_TOKEN")  # write token for that repo
MAX_HEIGHT_PX = 400  # Max visible height for answer Markdown blocks

# Hub client; only created when a token is configured, otherwise None.
api = HfApi(token=HF_TOKEN) if HF_TOKEN else None

# -----------------------------------------------------------------------------
# Helper functions – data I/O
# -----------------------------------------------------------------------------


def user_bucket(user_id: str, buckets: int = 10) -> int:
    """Map ``user_id`` onto a stable bucket number in ``1..buckets``.

    The SHA-256 digest of the UTF-8 encoded identifier is interpreted as one
    big-endian integer and reduced modulo ``buckets``, so the same identifier
    always lands in the same bucket.
    """
    digest = hashlib.sha256(user_id.encode("utf-8")).digest()
    as_integer = int.from_bytes(digest, byteorder="big")
    return as_integer % buckets + 1


def load_data(user_id: str) -> pd.DataFrame:
    """Load the evaluation items as a DataFrame.

    Every user receives the full ``train`` split of
    ``tktung/irish_grammar_test``. Per-user bucketed splits (``part_<bucket>``)
    are not in use, so ``user_id`` is accepted only to keep the call signature
    stable for callers.

    Args:
        user_id: Identifier of the rater (currently unused).

    Returns:
        DataFrame holding at least the columns ``question``, ``response1`` and
        ``response2``.

    Raises:
        ValueError: If any of the required columns is missing.
    """
    ds = datasets.load_dataset("tktung/irish_grammar_test", split="train")
    df = pd.DataFrame(ds)
    # A tuple (not a set) keeps the column order in the error message stable.
    required = ("question", "response1", "response2")
    missing = [col for col in required if col not in df.columns]
    if missing:
        raise ValueError(f"Dataset must contain columns: {', '.join(required)}")
    return df


# ---------- Rating persistence helpers ---------------------------------------


def _download_remote_ratings() -> Path | None:
    """Fetch the current ratings CSV from the Hub dataset repo.

    Returns:
        Path of the downloaded file, or ``None`` when ``RATINGS_REPO`` is not
        set or the download fails for any reason (the file or repo may simply
        not exist yet). Failures are logged so that authentication or network
        problems are not mistaken for "no ratings yet".
    """
    if not RATINGS_REPO:
        return None
    try:
        local_path = hf_hub_download(
            repo_id=RATINGS_REPO,
            filename=RATINGS_FILE,
            repo_type="dataset",
            token=HF_TOKEN,
            cache_dir=tempfile.gettempdir(),
        )
    except Exception as exc:
        # Deliberately best-effort: the caller falls back to a local file or an
        # empty frame. Log the reason instead of hiding it.
        import logging

        logging.getLogger(__name__).warning(
            "Could not download %s from dataset repo %s (%s: %s)",
            RATINGS_FILE,
            RATINGS_REPO,
            type(exc).__name__,
            exc,
        )
        return None
    return Path(local_path)


def load_ratings() -> pd.DataFrame:
    """Return the ratings table, preferring the remote copy.

    Lookup order: the CSV downloaded from the Hub, then a local
    ``RATINGS_FILE`` in the working directory, then a fresh empty frame with
    the full column set.

    ``user_id`` is always read as text. Without this, pandas infers an integer
    column when every stored identifier looks numeric (turning ``"007"`` into
    ``7``), and the string comparisons used for de-duplication and progress
    tracking stop matching.

    Returns:
        DataFrame of ratings; demographic columns and ``user_bucket`` are
        added (filled with ``pd.NA``) when an older file lacks them.
    """
    read_options = {"dtype": {"user_id": str}}
    remote = _download_remote_ratings()
    if remote and remote.exists():
        df = pd.read_csv(remote, **read_options)
    elif RATINGS_FILE and os.path.exists(RATINGS_FILE):
        # Running locally (dev) – load local file if present.
        df = pd.read_csv(RATINGS_FILE, **read_options)
    else:
        df = pd.DataFrame(
            columns=[
                "user_id",
                "user_bucket",
                "row_index",
                "choice",
                "timestamp",
                "proficiency",
                "is_native",
                "studied_second_level",
                "studied_third_level",
                "uses_for_work",
                "usage_frequency",
            ]
        )
    # Backward compatibility: files written before these columns existed get
    # them appended (in this order) with missing values.
    required_cols = [
        "proficiency",
        "is_native",
        "studied_second_level",
        "studied_third_level",
        "uses_for_work",
        "usage_frequency",
        "user_bucket",
    ]
    for col in required_cols:
        if col not in df.columns:
            df[col] = pd.NA
    return df


def _upload_remote_ratings(df: pd.DataFrame):
    """Persist ``df`` as CSV, on the Hub when possible and locally otherwise.

    If ``RATINGS_REPO`` or the Hub client ``api`` is unavailable, the CSV is
    written to ``RATINGS_FILE`` relative to the working directory. Otherwise
    it is staged in a temporary directory and uploaded to the dataset repo,
    producing one commit per call.
    """
    if not RATINGS_REPO or not api:
        # Local/dev mode: keep the ratings next to the app for inspection.
        df.to_csv(RATINGS_FILE, index=False)
        return

    with tempfile.TemporaryDirectory() as staging_dir:
        staged_csv = Path(staging_dir) / RATINGS_FILE
        # RATINGS_FILE may contain sub-directories; create them in staging.
        staged_csv.parent.mkdir(parents=True, exist_ok=True)
        df.to_csv(staged_csv, index=False)
        upload_args = {
            "path_or_fileobj": str(staged_csv),
            "path_in_repo": RATINGS_FILE,
            "repo_id": RATINGS_REPO,
            "repo_type": "dataset",
            "commit_message": "Add/Update rating",
        }
        api.upload_file(**upload_args)


def save_rating(
    user_id: str,
    proficiency: str,
    is_native: str,
    studied_second_level: str,
    studied_third_level: str,
    uses_for_work: str,
    usage_frequency: str,
    row_index: int,
    choice: int,
):
    """Record one rating together with the rater's demographics.

    The current ratings table is loaded, and nothing is written when this
    user already has an entry for ``row_index``. Otherwise a new row is
    appended and the whole table is persisted again. Demographic answers are
    stored stripped and lower-cased; a missing answer becomes ``""``.
    """

    def _clean(value):
        # None/empty answers normalise to "", everything else to trimmed lowercase.
        return (value or "").strip().lower()

    existing = load_ratings()
    same_user = existing.user_id == user_id
    same_item = existing.row_index == row_index
    if (same_user & same_item).any():
        return

    demographics = {
        "proficiency": proficiency,
        "is_native": is_native,
        "studied_second_level": studied_second_level,
        "studied_third_level": studied_third_level,
        "uses_for_work": uses_for_work,
        "usage_frequency": usage_frequency,
    }
    entry = {
        "user_id": user_id,
        "user_bucket": user_bucket(user_id),
        **{field: _clean(answer) for field, answer in demographics.items()},
        "row_index": row_index,
        "choice": choice,
        # Naive UTC timestamp in ISO-8601 form (no offset suffix).
        "timestamp": datetime.utcnow().isoformat(),
    }
    updated = pd.concat([existing, pd.DataFrame([entry])], ignore_index=True)
    _upload_remote_ratings(updated)


def get_next_unrated(df: pd.DataFrame, ratings: pd.DataFrame, user_id: str):
    """Pick the first item of ``df`` that ``user_id`` has not rated yet.

    Returns:
        ``(index_label, question, response1, response2)`` for the first row
        whose index label is absent from the user's ``row_index`` values in
        ``ratings``, or ``None`` when nothing is left.
    """
    mine = ratings.user_id == user_id
    already_done = ratings.loc[mine, "row_index"].tolist()
    still_open = ~df.index.isin(already_done)
    pending = df[still_open]
    if pending.empty:
        return None
    head = pending.iloc[0]
    return (head.name, head["question"], head["response1"], head["response2"])


def user_progress(user_id: str, state_df) -> str:
    """Format how many items ``user_id`` has rated out of ``len(state_df)``.

    Anything that is not a DataFrame (for example ``None`` before the dataset
    has been loaded) and an empty DataFrame both yield ``"Progress: 0 / 0"``.
    The rated count is the number of distinct ``row_index`` values stored for
    the user in the ratings table.
    """
    nothing_yet = "Progress: 0 / 0"
    if not isinstance(state_df, pd.DataFrame):
        return nothing_yet

    all_ratings = load_ratings()
    own_rows = all_ratings.loc[all_ratings.user_id == user_id, "row_index"]
    done = own_rows.nunique()
    total = len(state_df)
    if total == 0:
        return nothing_yet
    return f"Progress: {done} / {total} ({done/total:.1%})"


# -----------------------------------------------------------------------------
# Gradio callbacks
# -----------------------------------------------------------------------------


def _blocked_start(user_id, state_df, progress, message):
    """Build the callback output that keeps the evaluation widgets hidden."""
    return (
        gr.update(value=user_id, visible=True),
        gr.update(visible=False),
        gr.update(visible=False),
        "",
        "",
        "",
        "",
        state_df,
        progress,
        message,
    )


def start_or_resume(
    user_id: str,
    proficiency: str,
    is_native: str,
    studied_second_level: str,
    studied_third_level: str,
    uses_for_work: str,
    usage_frequency: str,
    consent: bool,
    state_df,  # may be None before first load
):
    """Validate the sign-in form and show the user's next unrated item.

    Returns a 10-tuple in the order: identifier textbox update, evaluation
    column update, submit button update, question markdown, answer 1,
    answer 2, row index (as string), dataset state, progress text, info text.
    On any validation failure the evaluation widgets stay hidden and the info
    text explains what is missing.
    """
    # Load the dataset once per session; later clicks reuse the cached frame.
    if not isinstance(state_df, pd.DataFrame):
        try:
            state_df = load_data(user_id)
        except Exception as e:
            progress = user_progress(user_id, state_df)
            return _blocked_start(
                user_id, state_df, progress, f"Dataset load failed: {e}"
            )
    # Nothing below writes ratings, so one progress computation serves every exit.
    progress = user_progress(user_id, state_df)

    if not (user_id or "").strip():
        return _blocked_start(
            user_id,
            state_df,
            progress,
            "Please enter a non-empty identifier to begin.",
        )
    if proficiency not in {"expert", "fluent", "basic"}:
        return _blocked_start(
            user_id, state_df, progress, "Please select your language proficiency."
        )
    # Pairs, not a dict keyed by the answers: equal answers would collapse into
    # one key and the reported label could belong to a different question.
    yes_no_answers = [
        (is_native, "Is Native?"),
        (studied_second_level, "Studied Irish At Second Level?"),
        (studied_third_level, "Studied Irish At Third Level?"),
        (uses_for_work, "Use Irish for work?"),
    ]
    for answer, label in yes_no_answers:
        if answer not in {"Yes", "No"}:
            return _blocked_start(
                user_id, state_df, progress, f"Please answer: {label}"
            )
    if usage_frequency not in {"daily", "weekly", "monthly", "yearly"}:
        return _blocked_start(
            user_id, state_df, progress, "Please select usage frequency."
        )
    if not consent:
        return _blocked_start(
            user_id, state_df, progress, "Please provide consent to proceed."
        )

    ratings = load_ratings()
    record = get_next_unrated(state_df, ratings, user_id)
    if record is None:
        return _blocked_start(
            user_id,
            state_df,
            progress,
            "🎉 You have evaluated every item – thank you!",
        )
    idx, q, a1, a2 = record
    return (
        gr.update(value=user_id, visible=True),
        gr.update(visible=True),
        gr.update(visible=True),
        "**" + q + "**",
        a1,
        a2,
        str(idx),
        state_df,
        progress,
        "",
    )


def _current_item(state_df, row_idx_str):
    """Return (question, answer1, answer2, row_idx) outputs for the shown item.

    Falls back to four empty strings when ``row_idx_str`` does not reference a
    row of ``state_df`` (no dataset loaded, empty or unknown index).
    """
    try:
        row = state_df.loc[int(row_idx_str)]
    except (AttributeError, KeyError, TypeError, ValueError):
        return "", "", "", ""
    return "**" + row.question + "**", row.response1, row.response2, str(row_idx_str)


def submit_preference(
    user_id: str,
    proficiency: str,
    is_native: str,
    studied_second_level: str,
    studied_third_level: str,
    uses_for_work: str,
    usage_frequency: str,
    row_idx_str: str,
    choice: str,
    state_df,
):
    """Store the user's choice for the shown item and advance to the next one.

    Returns a 6-tuple in the order: question markdown, answer 1, answer 2,
    row index (as string), progress text, info text.

    When no sentence has been chosen, the item on screen is re-emitted
    unchanged; blanking it would also reset the stored row index and make the
    following submit fail. When no valid row index is stored (for example
    after the last item was rated), nothing is saved and the next unrated item
    or the completion message is returned.
    """
    if choice not in {"Sentence 1", "Sentence 2"}:
        progress = user_progress(user_id, state_df)
        question, a1, a2, idx = _current_item(state_df, row_idx_str)
        return (
            question,
            a1,
            a2,
            idx,
            progress,
            "Please choose either Sentence 1 or Sentence 2 before submitting.",
        )
    if not isinstance(state_df, pd.DataFrame):
        # No dataset in this session yet, so there is nothing to rate.
        progress = user_progress(user_id, state_df)
        return "", "", "", "", progress, "Please press Start / Resume before submitting."

    try:
        row_idx = int(row_idx_str)
    except (TypeError, ValueError):
        row_idx = None  # no item is currently referenced
    if row_idx is not None:
        save_rating(
            user_id,
            proficiency,
            is_native,
            studied_second_level,
            studied_third_level,
            uses_for_work,
            usage_frequency,
            row_idx,
            1 if choice == "Sentence 1" else 2,
        )
    ratings = load_ratings()
    record = get_next_unrated(state_df, ratings, user_id)
    progress = user_progress(user_id, state_df)
    if record is None:
        return "", "", "", "", progress, "🎉 You have evaluated every item – thank you!"
    idx, q, a1, a2 = record
    return "**" + q + "**", a1, a2, str(idx), progress, ""


# -----------------------------------------------------------------------------
# Build Gradio interface
# -----------------------------------------------------------------------------


def build_demo():
    """Assemble the Gradio Blocks UI and wire its two button callbacks.

    Layout, top to bottom: intro text, the sign-in form (identifier,
    demographic radios, consent), the Start / Resume button, info and progress
    lines, the initially hidden evaluation column, and the initially hidden
    submit button.

    Returns:
        The ``gr.Blocks`` app; the caller is responsible for launching it.
    """
    # The dataset is not loaded here; start_or_resume loads it on first use
    # and keeps it in the per-session state below.

    # CSS to constrain very tall answers
    overflow_css = f"""
    <style>
      .answerbox {{
          max-height: {MAX_HEIGHT_PX}px;
          overflow-y: auto;
          white-space: pre-wrap;
      }}
    </style>
    """

    with gr.Blocks(title="Question/Answer Preference Rater") as demo:
        gr.HTML(overflow_css)

        gr.Markdown(
            """# Irish Grammatical Test
Enter your identifier below to start or resume. Each sample is a pair of two sentences that varied by a grammatical feature. You should choose the one that you think is correct. Your progress is saved automatically so you can return at any time using the same identifier."""
        )

        # Per-session state: the loaded dataset (None until start_or_resume
        # fills it) and the index label of the item currently on screen.
        state_df = gr.State(None)
        state_row_idx = gr.State("")

        # Identifier input
        id_input = gr.Textbox(label="User Identifier", placeholder="e.g. alice")
        # Demographic questions; the option strings here must match the sets
        # that start_or_resume validates against.
        proficiency_radio = gr.Radio(
            ["expert", "fluent", "basic"],
            label="Language proficiency",
            info="Select your Irish language proficiency level.",
        )
        is_native_radio = gr.Radio(
            ["Yes", "No"], label="Is Native?", info="Are you a native Irish speaker?"
        )
        studied_second_radio = gr.Radio(
            ["Yes", "No"],
            label="Studied Irish At Second Level?",
            info="Did you study Irish in school?",
        )
        studied_third_radio = gr.Radio(
            ["Yes", "No"],
            label="Studied Irish At Third Level?",
            info="Did you study Irish at university/third level?",
        )
        uses_for_work_radio = gr.Radio(
            ["Yes", "No"],
            label="Use Irish for work?",
            info="Do you use Irish in your job?",
        )
        usage_frequency_radio = gr.Radio(
            ["daily", "weekly", "monthly", "yearly"],
            label="How often do you use Irish?",
        )
        # Consent is required; start_or_resume refuses to proceed without it.
        consent_checkbox = gr.Checkbox(
            label="I consent to the use of my responses for research purposes."
        )
        start_btn = gr.Button("Start / Resume")

        info_md = gr.Markdown("")
        progress_md = gr.Markdown("Progress: 0 / 0")  # updated by both callbacks

        # Evaluation widgets
        # NOTE(review): the answer panes are labelled "Sentence A/B" while the
        # radio offers "Sentence 1/2" – confirm which wording is intended.
        with gr.Column(visible=False) as eval_col:
            question_md = gr.Markdown("")
            with gr.Row():
                answer1_md = gr.Markdown(label="Sentence A", elem_classes=["answerbox"])
                answer2_md = gr.Markdown(label="Sentence B", elem_classes=["answerbox"])
            choice_radio = gr.Radio(
                ["Sentence 1", "Sentence 2"],
                label="Which sentence is more grammatically correct?",
            )
        submit_btn = gr.Button("Submit Preference", visible=False)

        # Callbacks wiring. The order of `outputs` must match the order of the
        # tuple each callback returns.
        start_btn.click(
            fn=start_or_resume,
            inputs=[
                id_input,
                proficiency_radio,
                is_native_radio,
                studied_second_radio,
                studied_third_radio,
                uses_for_work_radio,
                usage_frequency_radio,
                consent_checkbox,
                state_df,
            ],
            outputs=[
                id_input,
                eval_col,
                submit_btn,
                question_md,
                answer1_md,
                answer2_md,
                state_row_idx,
                state_df,
                progress_md,
                info_md,
            ],
        )

        # NOTE(review): choice_radio is an input but not an output here, so the
        # callback cannot reset the selection between items.
        submit_btn.click(
            fn=submit_preference,
            inputs=[
                id_input,
                proficiency_radio,
                is_native_radio,
                studied_second_radio,
                studied_third_radio,
                uses_for_work_radio,
                usage_frequency_radio,
                state_row_idx,
                choice_radio,
                state_df,
            ],
            outputs=[
                question_md,
                answer1_md,
                answer2_md,
                state_row_idx,
                progress_md,
                info_md,
            ],
        )
    return demo


# Build and launch the app only when this file is executed as a script.
if __name__ == "__main__":
    build_demo().launch()