ZhouChuYue committed on
Commit
0158942
·
1 Parent(s): 9767ffc

Add Ultra-FineWeb Classifier Space with auto model download

Browse files
Files changed (4) hide show
  1. .gitattributes +0 -35
  2. README.md +54 -13
  3. app.py +488 -0
  4. requirements.txt +5 -0
.gitattributes DELETED
@@ -1,35 +0,0 @@
1
- *.7z filter=lfs diff=lfs merge=lfs -text
2
- *.arrow filter=lfs diff=lfs merge=lfs -text
3
- *.bin filter=lfs diff=lfs merge=lfs -text
4
- *.bz2 filter=lfs diff=lfs merge=lfs -text
5
- *.ckpt filter=lfs diff=lfs merge=lfs -text
6
- *.ftz filter=lfs diff=lfs merge=lfs -text
7
- *.gz filter=lfs diff=lfs merge=lfs -text
8
- *.h5 filter=lfs diff=lfs merge=lfs -text
9
- *.joblib filter=lfs diff=lfs merge=lfs -text
10
- *.lfs.* filter=lfs diff=lfs merge=lfs -text
11
- *.mlmodel filter=lfs diff=lfs merge=lfs -text
12
- *.model filter=lfs diff=lfs merge=lfs -text
13
- *.msgpack filter=lfs diff=lfs merge=lfs -text
14
- *.npy filter=lfs diff=lfs merge=lfs -text
15
- *.npz filter=lfs diff=lfs merge=lfs -text
16
- *.onnx filter=lfs diff=lfs merge=lfs -text
17
- *.ot filter=lfs diff=lfs merge=lfs -text
18
- *.parquet filter=lfs diff=lfs merge=lfs -text
19
- *.pb filter=lfs diff=lfs merge=lfs -text
20
- *.pickle filter=lfs diff=lfs merge=lfs -text
21
- *.pkl filter=lfs diff=lfs merge=lfs -text
22
- *.pt filter=lfs diff=lfs merge=lfs -text
23
- *.pth filter=lfs diff=lfs merge=lfs -text
24
- *.rar filter=lfs diff=lfs merge=lfs -text
25
- *.safetensors filter=lfs diff=lfs merge=lfs -text
26
- saved_model/**/* filter=lfs diff=lfs merge=lfs -text
27
- *.tar.* filter=lfs diff=lfs merge=lfs -text
28
- *.tar filter=lfs diff=lfs merge=lfs -text
29
- *.tflite filter=lfs diff=lfs merge=lfs -text
30
- *.tgz filter=lfs diff=lfs merge=lfs -text
31
- *.wasm filter=lfs diff=lfs merge=lfs -text
32
- *.xz filter=lfs diff=lfs merge=lfs -text
33
- *.zip filter=lfs diff=lfs merge=lfs -text
34
- *.zst filter=lfs diff=lfs merge=lfs -text
35
- *tfevents* filter=lfs diff=lfs merge=lfs -text
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
README.md CHANGED
@@ -1,13 +1,54 @@
1
- ---
2
- title: Ultra FineWeb Classifier
3
- emoji: 😻
4
- colorFrom: pink
5
- colorTo: pink
6
- sdk: gradio
7
- sdk_version: 6.3.0
8
- app_file: app.py
9
- pinned: false
10
- license: apache-2.0
11
- ---
12
-
13
- Check out the configuration reference at https://huggingface.co/docs/hub/spaces-config-reference
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ ---
2
+ title: Ultra-FineWeb Classifier
3
+ emoji: ⚡
4
+ colorFrom: green
5
+ colorTo: blue
6
+ sdk: gradio
7
+ sdk_version: 5.9.1
8
+ app_file: app.py
9
+ pinned: false
10
+ license: apache-2.0
11
+ ---
12
+
13
+ # Ultra-FineWeb Classifier
14
+
15
+ A lightweight **fastText-based classifier** for filtering high-quality web data, supporting both **English** and **Chinese**.
16
+
17
+ ## 🌟 Features
18
+
19
+ - **Fast Inference**: Based on fastText for efficient classification
20
+ - **Bilingual Support**: Works with both English (en) and Chinese (zh) content
21
+ - **Quality Scoring**: Returns a quality score from 0 to 1
22
+ - **Easy to Use**: Simple web interface powered by Gradio
23
+
24
+ ## 📊 Quality Score Interpretation
25
+
26
+ | Score Range | Quality Level | Recommendation |
27
+ |-------------|---------------|----------------|
28
+ | ≥ 0.7 | 🌟 High Quality | Suitable for LLM training |
29
+ | 0.4 - 0.7 | 📊 Medium Quality | May need review |
30
+ | < 0.4 | ⚠️ Low Quality | Likely not suitable for LLM training |
31
+
32
+ ## 🔗 Links
33
+
34
+ - 📜 [Technical Report (arXiv)](https://arxiv.org/abs/2505.05427)
35
+ - 🤗 [Model Repository](https://huggingface.co/openbmb/Ultra-FineWeb-classifier)
36
+ - 📦 [Ultra-FineWeb-en Dataset](https://huggingface.co/datasets/openbmb/Ultra-FineWeb-en)
37
+ - 📦 [Ultra-FineWeb-zh Dataset](https://huggingface.co/datasets/openbmb/Ultra-FineWeb-zh)
38
+
39
+ ## 📝 Citation
40
+
41
+ ```bibtex
42
+ @misc{wang2025ultrafineweb,
43
+ title={{Ultra-FineWeb}: Efficient Data Filtering and Verification for High-Quality LLM Training Data},
44
+ author={Yudong Wang and Zixuan Fu and Jie Cai and Peijun Tang and Hongya Lyu and Yewei Fang and Zhi Zheng and Jie Zhou and Guoyang Zeng and Chaojun Xiao and Xu Han and Zhiyuan Liu},
45
+ year={2025},
46
+ eprint={2505.05427},
47
+ archivePrefix={arXiv},
48
+ primaryClass={cs.CL},
49
+ }
50
+ ```
51
+
52
+ ## 📄 License
53
+
54
+ Apache 2.0
app.py ADDED
@@ -0,0 +1,488 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # -*- coding: utf-8 -*-
2
+ """
3
+ Ultra-FineWeb Classifier - Hugging Face Space Demo
4
+ A lightweight fastText-based classifier for filtering high-quality web data.
5
+ """
6
+
7
+ import os
8
+ import re
9
+ import unicodedata
10
+ from typing import Tuple
11
+
12
+ import gradio as gr
13
+ from huggingface_hub import hf_hub_download
14
+
15
+ # Lazy loading for heavy dependencies
16
+ _tokenizer = None
17
+ _fasttext_models = {}
18
+
19
+ MODEL_REPO = "openbmb/Ultra-FineWeb-classifier"
20
+
21
+
22
def get_tokenizer():
    """Return the shared tokenizer, downloading and loading it on first use.

    The loaded tokenizer is kept in the module-level ``_tokenizer`` so later
    calls return the same object without touching the hub again.
    """
    global _tokenizer
    if _tokenizer is not None:
        return _tokenizer

    # Imported here rather than at module top so the heavy dependency is
    # only paid for when a tokenizer is actually needed.
    from transformers import AutoTokenizer

    # The main tokenizer file decides which local directory we load from.
    main_file = hf_hub_download(
        repo_id=MODEL_REPO,
        filename="local_tokenizer/tokenizer.json",
        local_dir="./model_cache",
    )
    load_dir = os.path.dirname(main_file)

    # Fetch the companion files into the same local directory tree.
    companion_files = (
        "local_tokenizer/tokenizer_config.json",
        "local_tokenizer/special_tokens_map.json",
    )
    for companion in companion_files:
        hf_hub_download(
            repo_id=MODEL_REPO,
            filename=companion,
            local_dir="./model_cache",
        )

    _tokenizer = AutoTokenizer.from_pretrained(load_dir)
    return _tokenizer
49
+
50
+
51
# Language codes this app offers a classifier for.
_SUPPORTED_LANGUAGES = ("en", "zh")


def get_fasttext_model(language: str):
    """Return the fastText classifier for ``language``, loading it on first use.

    Loaded models are kept in the module-level ``_fasttext_models`` dict, keyed
    by language code, so each model file is downloaded and loaded only once.

    Args:
        language: Language code; must be one of ``_SUPPORTED_LANGUAGES``.

    Returns:
        The object returned by ``fasttext.load_model`` for that language.

    Raises:
        ValueError: If ``language`` is not a supported code.
    """
    # Validate before the value is interpolated into a repo filename, so bad
    # input fails fast with a clear message instead of after a hub lookup.
    if language not in _SUPPORTED_LANGUAGES:
        raise ValueError(
            f"Unsupported language {language!r}; "
            f"expected one of: {', '.join(_SUPPORTED_LANGUAGES)}"
        )

    if language not in _fasttext_models:
        # Deferred import: fastText is only needed once a model is requested.
        import fasttext

        model_path = hf_hub_download(
            repo_id=MODEL_REPO,
            filename=f"classifiers/ultra_fineweb_{language}.bin",
            local_dir="./model_cache",
        )
        _fasttext_models[language] = fasttext.load_model(model_path)

    return _fasttext_models[language]
67
+
68
+
69
def fasttext_preprocess(content: str, tokenizer) -> str:
    """Normalize raw text into the single-line, space-separated form fed to fastText.

    Steps, in order:
        1. Collapse runs of three or more newlines into exactly two.
        2. Lowercase.
        3. Strip diacritics (NFKD-decompose, then drop ``Mn`` combining marks).
        4. Segment with ``tokenizer``: encode without special tokens, decode
           every token id on its own, and join the pieces with single spaces.
        5. Replace newline, carriage-return and tab characters with the
           literal two-character sequences ``\\n``, ``\\r`` and ``\\t``,
           collapse runs of spaces, and trim both ends.

    Args:
        content: Raw text to normalize.
        tokenizer: Object providing ``encode(text, add_special_tokens=False)``
            returning token ids and ``decode([token_id])`` returning text.

    Returns:
        The normalized text, containing no raw newline, carriage-return or
        tab characters.
    """
    # 1. Remove multiple newlines
    content = re.sub(r'\n{3,}', '\n\n', content)

    # 2. Lowercase
    content = content.lower()

    # 3. Remove diacritics
    content = ''.join(
        ch for ch in unicodedata.normalize('NFKD', content)
        if unicodedata.category(ch) != 'Mn'
    )

    # 4. Word segmentation: decode token-by-token so every token becomes its
    #    own space-delimited "word".
    token_ids = tokenizer.encode(content, add_special_tokens=False)
    content = ' '.join(tokenizer.decode([token_id]) for token_id in token_ids)

    # 5. Handle escape characters: control characters become visible
    #    backslash escapes (a literal backslash followed by a letter).
    content = (
        content.replace('\n', '\\n')
        .replace('\r', '\\r')
        .replace('\t', '\\t')
    )
    content = re.sub(r' +', ' ', content)

    return content.strip()
109
+
110
+
111
def fasttext_infer(norm_content: str, fasttext_model) -> Tuple[str, float]:
    """Score preprocessed text with a fastText model.

    Args:
        norm_content: Text already normalized for fastText.
        fasttext_model: Object whose ``predict(text)`` returns a
            ``(labels, probabilities)`` pair.

    Returns:
        ``(label, score)``: the first predicted label as returned by the
        model, and the probability of the text being high-quality. The
        probability is capped at 1.0 and flipped to ``1 - p`` when the
        predicted label is ``__label__neg``.
    """
    labels, probabilities = fasttext_model.predict(norm_content)
    top_label = labels[0]

    # Cap the reported probability at 1.0.
    confidence = probabilities.tolist()[0]
    if confidence > 1.0:
        confidence = 1.0

    # A confident "neg" prediction means a low positive score.
    if top_label == "__label__neg":
        return top_label, 1 - confidence
    return top_label, confidence
127
+
128
+
129
def classify_text(content: str, language: str) -> Tuple[str, str, str, str]:
    """Classify ``content`` and format the result for the UI.

    Args:
        content: Text to classify.
        language: Language code ("en" or "zh") selecting the fastText model.

    Returns:
        Tuple of ``(quality_label, score_display, normalized_content, details)``.
        ``details`` is a Markdown report. For empty or whitespace-only input,
        or when any step raises, the tuple is
        ``("❌ Error", "N/A", "", <message>)`` instead.
    """
    if not content or not content.strip():
        return "❌ Error", "N/A", "", "请输入文本内容 / Please enter text content"

    try:
        # Get tokenizer and model
        tokenizer = get_tokenizer()
        fasttext_model = get_fasttext_model(language)

        # Preprocess
        norm_content = fasttext_preprocess(content, tokenizer)

        # Inference
        pred_label, score = fasttext_infer(norm_content, fasttext_model)

        # Map the score onto the three quality bands shown to the user.
        if score >= 0.7:
            quality_label = "🌟 High Quality"
        elif score >= 0.4:
            quality_label = "📊 Medium Quality"
        else:
            quality_label = "⚠️ Low Quality"

        score_display = f"{score:.4f}"

        details = f"""**Classification Results**

| Metric | Value |
|--------|-------|
| **Raw Label** | `{pred_label}` |
| **Quality Score** | `{score:.6f}` |
| **Quality Level** | {quality_label} |
| **Language** | `{language}` |
| **Input Length** | `{len(content)}` chars |
| **Normalized Length** | `{len(norm_content)}` chars |

---

**Score Interpretation:**
- 🌟 **High Quality** (≥0.7): Content suitable for LLM training
- 📊 **Medium Quality** (0.4-0.7): Content may need review
- ⚠️ **Low Quality** (<0.4): Content likely not suitable
"""

        return quality_label, score_display, norm_content, details

    except Exception as e:
        # Deliberately broad: this is the UI boundary, so any failure
        # (download, model load, inference) is shown to the user rather
        # than propagating out of the event handler.
        return "❌ Error", "N/A", "", f"**Error:** {e}"
190
+
191
+
192
+ # Example texts
193
+ EXAMPLE_EN = """Machine learning is a subset of artificial intelligence that enables systems to learn and improve from experience without being explicitly programmed. It focuses on developing computer programs that can access data and use it to learn for themselves.
194
+
195
+ The process begins with observations or data, such as examples, direct experience, or instruction, in order to look for patterns in data and make better decisions in the future based on the examples that we provide."""
196
+
197
+ EXAMPLE_ZH = """机器学习是人工智能的一个子集,它使系统能够从经验中学习和改进,而无需显式编程。它专注于开发能够访问数据并使用数据自行学习的计算机程序。
198
+
199
+ 这个过程从观察或数据开始,例如示例、直接经验或指令,以便在数据中寻找模式,并根据我们提供的示例在未来做出更好的决策。"""
200
+
201
+
202
+ # Custom CSS
203
+ custom_css = """
204
+ @import url('https://fonts.googleapis.com/css2?family=IBM+Plex+Mono:wght@400;500&family=Sora:wght@400;500;600;700&display=swap');
205
+
206
+ .gradio-container {
207
+ font-family: 'Sora', sans-serif !important;
208
+ background: linear-gradient(160deg, #0a0a1a 0%, #1a0a2e 40%, #0a1a2e 70%, #0a0a1a 100%) !important;
209
+ min-height: 100vh;
210
+ }
211
+
212
+ .main-title {
213
+ font-family: 'Sora', sans-serif !important;
214
+ font-weight: 700 !important;
215
+ font-size: 2.8rem !important;
216
+ background: linear-gradient(120deg, #00ff88, #00d4ff, #a855f7) !important;
217
+ -webkit-background-clip: text !important;
218
+ -webkit-text-fill-color: transparent !important;
219
+ background-clip: text !important;
220
+ text-align: center !important;
221
+ margin-bottom: 0.3rem !important;
222
+ letter-spacing: -0.02em !important;
223
+ }
224
+
225
+ .subtitle {
226
+ text-align: center !important;
227
+ color: #8892a0 !important;
228
+ font-size: 1.05rem !important;
229
+ margin-bottom: 2rem !important;
230
+ font-weight: 400 !important;
231
+ }
232
+
233
+ .gr-box {
234
+ border-radius: 16px !important;
235
+ border: 1px solid rgba(0, 255, 136, 0.15) !important;
236
+ background: rgba(10, 15, 30, 0.85) !important;
237
+ backdrop-filter: blur(12px) !important;
238
+ }
239
+
240
+ .gr-input, .gr-textarea {
241
+ font-family: 'IBM Plex Mono', monospace !important;
242
+ background: rgba(20, 25, 45, 0.7) !important;
243
+ border: 1px solid rgba(0, 212, 255, 0.25) !important;
244
+ border-radius: 10px !important;
245
+ color: #e8ecf0 !important;
246
+ font-size: 0.95rem !important;
247
+ }
248
+
249
+ .gr-button-primary {
250
+ background: linear-gradient(135deg, #00ff88 0%, #00d4ff 100%) !important;
251
+ border: none !important;
252
+ font-weight: 600 !important;
253
+ font-size: 1.05rem !important;
254
+ padding: 14px 36px !important;
255
+ border-radius: 10px !important;
256
+ color: #0a0a1a !important;
257
+ transition: all 0.3s cubic-bezier(0.4, 0, 0.2, 1) !important;
258
+ text-transform: uppercase !important;
259
+ letter-spacing: 1.5px !important;
260
+ }
261
+
262
+ .gr-button-primary:hover {
263
+ transform: translateY(-3px) !important;
264
+ box-shadow: 0 12px 35px rgba(0, 255, 136, 0.35) !important;
265
+ }
266
+
267
+ .gr-button-secondary {
268
+ background: transparent !important;
269
+ border: 2px solid rgba(0, 212, 255, 0.4) !important;
270
+ color: #00d4ff !important;
271
+ font-weight: 500 !important;
272
+ border-radius: 10px !important;
273
+ transition: all 0.3s ease !important;
274
+ }
275
+
276
+ .gr-button-secondary:hover {
277
+ background: rgba(0, 212, 255, 0.1) !important;
278
+ border-color: #00d4ff !important;
279
+ }
280
+
281
+ .section-header {
282
+ color: #00ff88 !important;
283
+ font-weight: 600 !important;
284
+ font-size: 1.15rem !important;
285
+ margin-bottom: 1rem !important;
286
+ padding-bottom: 0.5rem !important;
287
+ border-bottom: 2px solid rgba(0, 255, 136, 0.2) !important;
288
+ letter-spacing: 0.5px !important;
289
+ }
290
+
291
+ .score-display {
292
+ font-family: 'IBM Plex Mono', monospace !important;
293
+ font-size: 2.5rem !important;
294
+ font-weight: 700 !important;
295
+ text-align: center !important;
296
+ padding: 1rem !important;
297
+ background: linear-gradient(135deg, rgba(0, 255, 136, 0.1), rgba(0, 212, 255, 0.1)) !important;
298
+ border-radius: 12px !important;
299
+ border: 1px solid rgba(0, 255, 136, 0.3) !important;
300
+ }
301
+
302
+ .gr-markdown {
303
+ color: #d0d5dc !important;
304
+ }
305
+
306
+ .gr-markdown code {
307
+ background: rgba(0, 212, 255, 0.15) !important;
308
+ padding: 3px 8px !important;
309
+ border-radius: 5px !important;
310
+ font-family: 'IBM Plex Mono', monospace !important;
311
+ color: #00d4ff !important;
312
+ }
313
+
314
+ .gr-markdown table {
315
+ border-collapse: collapse !important;
316
+ width: 100% !important;
317
+ margin: 1rem 0 !important;
318
+ }
319
+
320
+ .gr-markdown th, .gr-markdown td {
321
+ border: 1px solid rgba(0, 212, 255, 0.2) !important;
322
+ padding: 10px 14px !important;
323
+ text-align: left !important;
324
+ }
325
+
326
+ .gr-markdown th {
327
+ background: rgba(0, 212, 255, 0.1) !important;
328
+ color: #00d4ff !important;
329
+ font-weight: 600 !important;
330
+ }
331
+
332
+ footer {
333
+ display: none !important;
334
+ }
335
+
336
+ .gr-accordion {
337
+ border: 1px solid rgba(168, 85, 247, 0.25) !important;
338
+ border-radius: 10px !important;
339
+ background: rgba(20, 15, 40, 0.5) !important;
340
+ }
341
+
342
+ label {
343
+ color: #a8b0bc !important;
344
+ font-weight: 500 !important;
345
+ }
346
+
347
+ .output-textbox textarea {
348
+ min-height: 200px !important;
349
+ max-height: 300px !important;
350
+ overflow-y: auto !important;
351
+ }
352
+
353
+ /* Custom scrollbar */
354
+ ::-webkit-scrollbar {
355
+ width: 8px;
356
+ height: 8px;
357
+ }
358
+
359
+ ::-webkit-scrollbar-track {
360
+ background: rgba(20, 25, 45, 0.5);
361
+ border-radius: 4px;
362
+ }
363
+
364
+ ::-webkit-scrollbar-thumb {
365
+ background: rgba(0, 212, 255, 0.4);
366
+ border-radius: 4px;
367
+ }
368
+
369
+ ::-webkit-scrollbar-thumb:hover {
370
+ background: rgba(0, 212, 255, 0.6);
371
+ }
372
+ """
373
+
374
+ # Build Gradio interface
375
with gr.Blocks(title="Ultra-FineWeb Classifier", css=custom_css) as demo:
    gr.HTML('<h1 class="main-title">⚡ Ultra-FineWeb Classifier</h1>')
    gr.HTML('<p class="subtitle">Lightweight fastText-based classifier for high-quality web data filtering</p>')

    with gr.Row():
        # Left column: language selector, input text and action buttons.
        with gr.Column(scale=1):
            gr.HTML('<div class="section-header">📥 Input</div>')

            language = gr.Radio(
                choices=[("English", "en"), ("中文", "zh")],
                value="en",
                label="Language / 语言",
                info="Select the language of your content",
            )

            content_input = gr.Textbox(
                label="Content to Classify",
                placeholder="Paste your text content here...",
                lines=12,
                max_lines=20,
                value=EXAMPLE_EN,
            )

            with gr.Row():
                classify_btn = gr.Button("🔍 Classify", variant="primary", size="lg")
                clear_btn = gr.Button("🗑️ Clear", variant="secondary", size="lg")

            with gr.Accordion("📝 Example Texts", open=False):
                example_en_btn = gr.Button("Load English Example", size="sm")
                example_zh_btn = gr.Button("Load Chinese Example", size="sm")

        # Right column: classification outputs.
        with gr.Column(scale=1):
            gr.HTML('<div class="section-header">📤 Results</div>')

            with gr.Row():
                quality_label = gr.Textbox(
                    label="Quality Level",
                    interactive=False,
                    scale=1,
                )
                score_output = gr.Textbox(
                    label="Quality Score",
                    interactive=False,
                    scale=1,
                )

            details_output = gr.Markdown(
                label="Classification Details",
            )

            with gr.Accordion("🔧 Normalized Content", open=False):
                norm_content_output = gr.Textbox(
                    label="Preprocessed Text (for fastText)",
                    lines=8,
                    max_lines=15,
                    interactive=False,
                    elem_classes=["output-textbox"],
                )

    # Event handlers
    classify_btn.click(
        fn=classify_text,
        inputs=[content_input, language],
        outputs=[quality_label, score_output, norm_content_output, details_output],
    )

    def clear_all():
        """Reset the input, the language selector and every output field."""
        return "", "en", "", "", "", ""

    clear_btn.click(
        fn=clear_all,
        outputs=[content_input, language, quality_label, score_output, norm_content_output, details_output],
    )

    def load_english_example():
        """Return the English sample text together with its language code."""
        return EXAMPLE_EN, "en"

    def load_chinese_example():
        """Return the Chinese sample text together with its language code."""
        return EXAMPLE_ZH, "zh"

    example_en_btn.click(
        fn=load_english_example,
        outputs=[content_input, language],
    )

    example_zh_btn.click(
        fn=load_chinese_example,
        outputs=[content_input, language],
    )

    def update_example_on_language_change(lang, current_content=""):
        """Swap the sample text to match ``lang`` without clobbering user text.

        The content is replaced only when the box is blank or still holds one
        of the built-in examples; anything the user typed or pasted is
        returned unchanged.
        """
        holds_user_text = (
            bool(current_content)
            and bool(current_content.strip())
            and current_content not in (EXAMPLE_EN, EXAMPLE_ZH)
        )
        if holds_user_text:
            return current_content
        if lang == "zh":
            return EXAMPLE_ZH
        return EXAMPLE_EN

    # Use ``input`` rather than ``change``: ``change`` also fires when another
    # handler sets the radio programmatically, which made "Clear" refill the
    # textbox with an example whenever it reset the language from "zh" to "en".
    language.input(
        fn=update_example_on_language_change,
        inputs=[language, content_input],
        outputs=[content_input],
    )

    # Footer
    gr.HTML("""
    <div style="text-align: center; margin-top: 2rem; padding: 1.5rem; color: #64748b; font-size: 0.9rem; border-top: 1px solid rgba(0, 212, 255, 0.1);">
        <p>⚡ <strong>Ultra-FineWeb Classifier</strong> - Part of the <a href="https://huggingface.co/openbmb/Ultra-FineWeb-classifier" target="_blank" style="color: #00d4ff;">Ultra-FineWeb</a> Project</p>
        <p style="font-size: 0.85rem; margin-top: 0.5rem;">Based on fastText for efficient web data quality classification. Supports English and Chinese.</p>
        <p style="font-size: 0.8rem; margin-top: 0.5rem; color: #4a5568;">📜 <a href="https://arxiv.org/abs/2505.05427" target="_blank" style="color: #a855f7;">Technical Report</a> | 🤗 <a href="https://huggingface.co/datasets/openbmb/Ultra-FineWeb-en" target="_blank" style="color: #a855f7;">Dataset</a></p>
    </div>
    """)


if __name__ == "__main__":
    demo.launch()
requirements.txt ADDED
@@ -0,0 +1,5 @@
 
 
 
 
 
 
1
+ gradio>=4.0.0
2
+ transformers>=4.30.0
3
+ huggingface_hub>=0.20.0
4
+ fasttext-wheel
5
+ numpy<2.0