tung committed on
Commit
8d295df
Β·
0 Parent(s):

initial commit

Browse files
Files changed (2) hide show
  1. app.py +279 -0
  2. human_judgement/selected_samples.json +3 -0
app.py ADDED
@@ -0,0 +1,279 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import os
+ import tempfile
+ from datetime import datetime, timezone
+ from pathlib import Path
+
+ import gradio as gr
+ import pandas as pd
+ from huggingface_hub import HfApi, hf_hub_download
9
+
10
+ # ------------------------------------------------------------
11
+ # Cloud‑friendly Q/A preference rater for **Hugging Face Spaces**
12
+ # ------------------------------------------------------------
13
+ # This version swaps local CSV persistence for a tiny remote‑dataset
14
+ # workflow that works on Spaces:
15
+ # β€’ Ratings are stored in (and loaded from) a lightweight **dataset
16
+ # repo** on the Hugging Face Hub – no local file system required.
17
+ # β€’ The dataset repo is set via the `RATINGS_REPO` env‑var.
18
+ # β€’ You must pass a write‑enabled token (env‑var `HF_TOKEN`) that has
19
+ # `write` permission on that dataset.
20
+ #
21
+ # Quick setup guide
22
+ # -----------------
23
+ # 1. Create a dataset repository to hold the ratings file, e.g.:
24
+ # https://huggingface.co/datasets/<org>/qa-rater-data
25
+ # 2. Inside **Space Settings β–Έ Secrets**, add:
26
+ # • `RATINGS_REPO` → <org>/qa-rater-data
27
+ # β€’ `HF_TOKEN` β†’ a token with *Write* access to that repo
28
+ # 3. Add `huggingface-hub` to your `requirements.txt` or
29
+ # `pip install huggingface-hub` locally.
30
+ # 4. Deploy / push your updated Space – ratings will now persist in
31
+ # the dataset repo instead of the Space’s ephemeral storage.
32
+ # ------------------------------------------------------------
33
+
34
+
35
+ # -----------------------------------------------------------------------------
36
+ # Configuration – constants & styling
37
+ # -----------------------------------------------------------------------------
38
+ DATA_PATH = "human_judgement/selected_samples.json"
39
+ RATINGS_FILE = "human_judgement/human_judgement.csv" # Name *inside* the dataset repo
40
+ RATINGS_REPO = os.getenv("RATINGS_REPO") # e.g. "org/qa‑rater‑data"
41
+ HF_TOKEN = os.getenv("HF_TOKEN") # write token for that repo
42
+ MAX_HEIGHT_PX = 400 # Max visible height for answer Markdown blocks
43
+
44
+ api = HfApi(token=HF_TOKEN) if HF_TOKEN else None
45
+
46
+ # -----------------------------------------------------------------------------
47
+ # Helper functions – data I/O
48
+ # -----------------------------------------------------------------------------
49
+
50
+
51
def load_data(path: str = DATA_PATH) -> pd.DataFrame:
    """Read the static Q/A samples bundled with the Space repo.

    The file is JSON-lines (one record per line), not CSV — the original
    docstring and error message said "CSV", which contradicted the
    ``pd.read_json(path, lines=True)`` call below.

    Parameters
    ----------
    path : str
        Path to the JSON-lines sample file (defaults to ``DATA_PATH``).

    Returns
    -------
    pd.DataFrame
        Must contain at least the columns ``question``, ``response1``,
        ``response2``.

    Raises
    ------
    FileNotFoundError
        If the file is missing from the repo.
    ValueError
        If any required column is absent.
    """
    if not os.path.exists(path):
        raise FileNotFoundError(
            f"Could not find data file at {path} – did you upload it?"
        )
    df = pd.read_json(path, lines=True)
    required = {"question", "response1", "response2"}
    if not required.issubset(df.columns):
        # sorted() keeps the error message deterministic (set order is not).
        raise ValueError(
            f"Data file must contain columns: {', '.join(sorted(required))}"
        )
    return df
62
+
63
+
64
+ # ---------- Rating persistence helpers ---------------------------------------
65
+
66
+
67
def _download_remote_ratings() -> Path | None:
    """Fetch the current ratings CSV from the Hub dataset repo.

    Returns the local cached path, or None when ``RATINGS_REPO`` is unset
    or the download fails (e.g. the file/repo does not exist yet).
    """
    if not RATINGS_REPO:
        return None
    try:
        local_path = hf_hub_download(
            repo_id=RATINGS_REPO,
            filename=RATINGS_FILE,
            repo_type="dataset",
            token=HF_TOKEN,
            cache_dir=tempfile.gettempdir(),
        )
    except Exception:
        # Repo or file may not exist yet – the caller builds an empty frame.
        return None
    return Path(local_path)
84
+
85
+
86
def load_ratings() -> pd.DataFrame:
    """Load the ratings table from the dataset repo.

    Falls back to an empty frame with the canonical schema when no remote
    ratings file is available yet.
    """
    csv_path = _download_remote_ratings()
    if csv_path is not None and csv_path.exists():
        return pd.read_csv(csv_path)
    return pd.DataFrame(columns=["user_id", "row_index", "choice", "timestamp"])
92
+
93
+
94
def _upload_remote_ratings(df: pd.DataFrame):
    """Persist the full ratings table, one Hub commit per save.

    Without a configured repo/token (local development) the CSV is written
    to ``RATINGS_FILE`` on the local filesystem instead, for inspection.
    """
    if not (RATINGS_REPO and api):
        # Dev fallback: no Hub credentials, keep a local copy.
        df.to_csv(RATINGS_FILE, index=False)
        return

    with tempfile.TemporaryDirectory() as workdir:
        staged_csv = Path(workdir) / RATINGS_FILE
        # RATINGS_FILE contains a subdirectory, so create it first.
        staged_csv.parent.mkdir(parents=True, exist_ok=True)
        df.to_csv(staged_csv, index=False)
        api.upload_file(
            path_or_fileobj=str(staged_csv),
            path_in_repo=RATINGS_FILE,
            repo_id=RATINGS_REPO,
            repo_type="dataset",
            commit_message="Add/Update rating",
        )
112
+
113
+
114
def save_rating(user_id: str, row_index: int, choice: int):
    """Append one preference rating and push the table to the Hub.

    Deduplicated per (user_id, row_index): if this user already rated this
    row, the call is a no-op and the first answer is kept.

    Parameters
    ----------
    user_id : str
        Rater identifier as typed in the UI.
    row_index : int
        Index label of the rated row in the samples DataFrame.
    choice : int
        1 for answer1, 2 for answer2.

    NOTE(review): this is a read-modify-write against a shared remote file;
    two concurrent raters saving at the same moment could lose one entry.
    """
    ratings = load_ratings()
    duplicate = (ratings.user_id == user_id) & (ratings.row_index == row_index)
    if duplicate.any():
        return  # already stored

    new_entry = {
        "user_id": user_id,
        "row_index": row_index,
        "choice": choice,
        # Timezone-aware UTC; datetime.utcnow() is deprecated (Python 3.12+)
        # and produced a naive timestamp.
        "timestamp": datetime.now(timezone.utc).isoformat(),
    }
    ratings = pd.concat([ratings, pd.DataFrame([new_entry])], ignore_index=True)
    _upload_remote_ratings(ratings)
129
+
130
+
131
def get_next_unrated(df: pd.DataFrame, ratings: pd.DataFrame, user_id: str):
    """Find the first sample this user has not yet rated.

    Returns a tuple ``(index, question, response1, response2)`` or None
    when the user has rated every row.
    """
    seen = set(ratings.loc[ratings.user_id == user_id, "row_index"])
    remaining = df[~df.index.isin(seen)]
    if remaining.empty:
        return None
    nxt = remaining.iloc[0]
    return nxt.name, nxt.question, nxt.response1, nxt.response2
138
+
139
+
140
+ # -----------------------------------------------------------------------------
141
+ # Gradio callbacks
142
+ # -----------------------------------------------------------------------------
143
+
144
+
145
def start_or_resume(user_id: str, state_df):
    """Begin or resume a rating session for `user_id`.

    Returns updates for: id_input, eval_col, submit_btn, question_md,
    answer1_md, answer2_md, state_row_idx, info_md (in that order).
    """
    if not user_id.strip():
        # No identifier: keep the evaluation UI hidden and show a hint.
        return (
            gr.update(value=user_id, visible=True),
            gr.update(visible=False),  # eval_col
            gr.update(visible=False),  # submit_btn
            "",
            "",
            "",
            "",
            "Please enter a non-empty identifier to begin.",
        )

    record = get_next_unrated(state_df, load_ratings(), user_id)
    if record is None:
        # Nothing left for this user.
        return (
            gr.update(value=user_id, visible=True),
            gr.update(visible=False),
            gr.update(visible=False),
            "",
            "",
            "",
            "",
            "πŸŽ‰ You have evaluated every item – thank you!",
        )

    idx, question, ans1, ans2 = record
    return (
        gr.update(value=user_id, visible=True),
        gr.update(visible=True),  # eval_col
        gr.update(visible=True),  # submit_btn
        f"**{question}**",
        ans1,
        ans2,
        str(idx),
        "",
    )
183
+
184
+
185
def submit_preference(user_id: str, row_idx_str: str, choice: str, state_df):
    """Persist the user's choice for the current row, then show the next one.

    Returns updates for: question_md, answer1_md, answer2_md,
    state_row_idx, info_md (in that order).
    """
    if choice not in ("answer1", "answer2"):
        return (
            "",
            "",
            "",
            "",
            "Please choose either Answer 1 or Answer 2 before submitting.",
        )

    # state_row_idx always holds the str() of a valid index once the
    # evaluation UI is visible, so int() is safe here.
    save_rating(user_id, int(row_idx_str), 1 if choice == "answer1" else 2)

    record = get_next_unrated(state_df, load_ratings(), user_id)
    if record is None:
        return "", "", "", "", "πŸŽ‰ You have evaluated every item – thank you!"

    idx, question, ans1, ans2 = record
    return f"**{question}**", ans1, ans2, str(idx), ""
205
+
206
+
207
+ # -----------------------------------------------------------------------------
208
+ # Build Gradio interface
209
+ # -----------------------------------------------------------------------------
210
+
211
+
212
def build_demo():
    """Assemble the Gradio Blocks UI and wire its callbacks.

    Returns the (unlaunched) ``gr.Blocks`` demo. Raises if the bundled
    sample file is missing (see ``load_data``).
    """
    df = load_data()

    # CSS to constrain very tall answers
    overflow_css = f"""
    <style>
    .answerbox {{
        max-height: {MAX_HEIGHT_PX}px;
        overflow-y: auto;
        white-space: pre-wrap;
    }}
    </style>
    """

    with gr.Blocks(title="Question/Answer Preference Rater") as demo:
        gr.HTML(overflow_css)

        gr.Markdown(
            """# Irish Grammatical Test\nEnter your identifier below to start or resume. Each sample is a pair of two sentences that varied by a grammatical feature. You should choose the one that you think is correct. Your progress is saved automatically so you can return at any time using the same identifier."""
        )

        # Per-session state: the full sample table, and the index (as a
        # string) of the row currently on screen.
        state_df = gr.State(df)
        state_row_idx = gr.State("")

        # Identifier input
        id_input = gr.Textbox(label="User Identifier", placeholder="e.g. alice")
        start_btn = gr.Button("Start / Resume")

        info_md = gr.Markdown("")

        # Evaluation widgets (hidden until a valid identifier is entered)
        with gr.Column(visible=False) as eval_col:
            question_md = gr.Markdown("")
            with gr.Row():
                answer1_md = gr.Markdown(label="Sentence A", elem_classes=["answerbox"])
                answer2_md = gr.Markdown(label="Sentence B", elem_classes=["answerbox"])
            choice_radio = gr.Radio(
                ["answer1", "answer2"], label="Which sentence do you prefer?"
            )
        # NOTE(review): diff formatting lost the nesting here; the button is
        # placed at Blocks level since its visibility is toggled separately
        # from eval_col in the callbacks — confirm against the original layout.
        submit_btn = gr.Button("Submit Preference", visible=False)

        # Callbacks wiring — output order must match the tuples returned by
        # start_or_resume / submit_preference.
        start_btn.click(
            fn=start_or_resume,
            inputs=[id_input, state_df],
            outputs=[
                id_input,
                eval_col,
                submit_btn,
                question_md,
                answer1_md,
                answer2_md,
                state_row_idx,
                info_md,
            ],
        )

        submit_btn.click(
            fn=submit_preference,
            inputs=[id_input, state_row_idx, choice_radio, state_df],
            outputs=[question_md, answer1_md, answer2_md, state_row_idx, info_md],
        )

    return demo
276
+
277
+
278
if __name__ == "__main__":
    # Build the UI and start the Gradio server when run as a script.
    demo = build_demo()
    demo.launch()
human_judgement/selected_samples.json ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:a330651213a408872c7f956545e97a71ca5ba04f6663710d9ccf3138e9f823bb
3
+ size 266536