ZhouChuYue
committed on
Commit
·
0158942
1
Parent(s):
9767ffc
Add Ultra-FineWeb Classifier Space with auto model download
Browse files- .gitattributes +0 -35
- README.md +54 -13
- app.py +488 -0
- requirements.txt +5 -0
.gitattributes
DELETED
|
@@ -1,35 +0,0 @@
|
|
| 1 |
-
*.7z filter=lfs diff=lfs merge=lfs -text
|
| 2 |
-
*.arrow filter=lfs diff=lfs merge=lfs -text
|
| 3 |
-
*.bin filter=lfs diff=lfs merge=lfs -text
|
| 4 |
-
*.bz2 filter=lfs diff=lfs merge=lfs -text
|
| 5 |
-
*.ckpt filter=lfs diff=lfs merge=lfs -text
|
| 6 |
-
*.ftz filter=lfs diff=lfs merge=lfs -text
|
| 7 |
-
*.gz filter=lfs diff=lfs merge=lfs -text
|
| 8 |
-
*.h5 filter=lfs diff=lfs merge=lfs -text
|
| 9 |
-
*.joblib filter=lfs diff=lfs merge=lfs -text
|
| 10 |
-
*.lfs.* filter=lfs diff=lfs merge=lfs -text
|
| 11 |
-
*.mlmodel filter=lfs diff=lfs merge=lfs -text
|
| 12 |
-
*.model filter=lfs diff=lfs merge=lfs -text
|
| 13 |
-
*.msgpack filter=lfs diff=lfs merge=lfs -text
|
| 14 |
-
*.npy filter=lfs diff=lfs merge=lfs -text
|
| 15 |
-
*.npz filter=lfs diff=lfs merge=lfs -text
|
| 16 |
-
*.onnx filter=lfs diff=lfs merge=lfs -text
|
| 17 |
-
*.ot filter=lfs diff=lfs merge=lfs -text
|
| 18 |
-
*.parquet filter=lfs diff=lfs merge=lfs -text
|
| 19 |
-
*.pb filter=lfs diff=lfs merge=lfs -text
|
| 20 |
-
*.pickle filter=lfs diff=lfs merge=lfs -text
|
| 21 |
-
*.pkl filter=lfs diff=lfs merge=lfs -text
|
| 22 |
-
*.pt filter=lfs diff=lfs merge=lfs -text
|
| 23 |
-
*.pth filter=lfs diff=lfs merge=lfs -text
|
| 24 |
-
*.rar filter=lfs diff=lfs merge=lfs -text
|
| 25 |
-
*.safetensors filter=lfs diff=lfs merge=lfs -text
|
| 26 |
-
saved_model/**/* filter=lfs diff=lfs merge=lfs -text
|
| 27 |
-
*.tar.* filter=lfs diff=lfs merge=lfs -text
|
| 28 |
-
*.tar filter=lfs diff=lfs merge=lfs -text
|
| 29 |
-
*.tflite filter=lfs diff=lfs merge=lfs -text
|
| 30 |
-
*.tgz filter=lfs diff=lfs merge=lfs -text
|
| 31 |
-
*.wasm filter=lfs diff=lfs merge=lfs -text
|
| 32 |
-
*.xz filter=lfs diff=lfs merge=lfs -text
|
| 33 |
-
*.zip filter=lfs diff=lfs merge=lfs -text
|
| 34 |
-
*.zst filter=lfs diff=lfs merge=lfs -text
|
| 35 |
-
*tfevents* filter=lfs diff=lfs merge=lfs -text
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
README.md
CHANGED
|
@@ -1,13 +1,54 @@
|
|
| 1 |
-
---
|
| 2 |
-
title: Ultra
|
| 3 |
-
emoji:
|
| 4 |
-
colorFrom:
|
| 5 |
-
colorTo:
|
| 6 |
-
sdk: gradio
|
| 7 |
-
sdk_version:
|
| 8 |
-
app_file: app.py
|
| 9 |
-
pinned: false
|
| 10 |
-
license: apache-2.0
|
| 11 |
-
---
|
| 12 |
-
|
| 13 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
---
|
| 2 |
+
title: Ultra-FineWeb Classifier
|
| 3 |
+
emoji: ⚡
|
| 4 |
+
colorFrom: green
|
| 5 |
+
colorTo: blue
|
| 6 |
+
sdk: gradio
|
| 7 |
+
sdk_version: 5.9.1
|
| 8 |
+
app_file: app.py
|
| 9 |
+
pinned: false
|
| 10 |
+
license: apache-2.0
|
| 11 |
+
---
|
| 12 |
+
|
| 13 |
+
# ⚡ Ultra-FineWeb Classifier
|
| 14 |
+
|
| 15 |
+
A lightweight **fastText-based classifier** for filtering high-quality web data, supporting both **English** and **Chinese**.
|
| 16 |
+
|
| 17 |
+
## 🌟 Features
|
| 18 |
+
|
| 19 |
+
- **Fast Inference**: Based on fastText for efficient classification
|
| 20 |
+
- **Bilingual Support**: Works with both English (en) and Chinese (zh) content
|
| 21 |
+
- **Quality Scoring**: Returns a quality score from 0 to 1
|
| 22 |
+
- **Easy to Use**: Simple web interface powered by Gradio
|
| 23 |
+
|
| 24 |
+
## 📊 Quality Score Interpretation
|
| 25 |
+
|
| 26 |
+
| Score Range | Quality Level | Recommendation |
|
| 27 |
+
|-------------|---------------|----------------|
|
| 28 |
+
| ≥ 0.7 | 🌟 High Quality | Suitable for LLM training |
|
| 29 |
+
| 0.4 - 0.7 | 📊 Medium Quality | May need review |
|
| 30 |
+
| < 0.4 | ⚠️ Low Quality | Likely not suitable |
|
| 31 |
+
|
| 32 |
+
## 🔗 Links
|
| 33 |
+
|
| 34 |
+
- 📜 [Technical Report (arXiv)](https://arxiv.org/abs/2505.05427)
|
| 35 |
+
- 🤗 [Model Repository](https://huggingface.co/openbmb/Ultra-FineWeb-classifier)
|
| 36 |
+
- 📦 [Ultra-FineWeb-en Dataset](https://huggingface.co/datasets/openbmb/Ultra-FineWeb-en)
|
| 37 |
+
- 📦 [Ultra-FineWeb-zh Dataset](https://huggingface.co/datasets/openbmb/Ultra-FineWeb-zh)
|
| 38 |
+
|
| 39 |
+
## 📝 Citation
|
| 40 |
+
|
| 41 |
+
```bibtex
|
| 42 |
+
@misc{wang2025ultrafineweb,
|
| 43 |
+
title={{Ultra-FineWeb}: Efficient Data Filtering and Verification for High-Quality LLM Training Data},
|
| 44 |
+
author={Yudong Wang and Zixuan Fu and Jie Cai and Peijun Tang and Hongya Lyu and Yewei Fang and Zhi Zheng and Jie Zhou and Guoyang Zeng and Chaojun Xiao and Xu Han and Zhiyuan Liu},
|
| 45 |
+
year={2025},
|
| 46 |
+
eprint={2505.05427},
|
| 47 |
+
archivePrefix={arXiv},
|
| 48 |
+
primaryClass={cs.CL},
|
| 49 |
+
}
|
| 50 |
+
```
|
| 51 |
+
|
| 52 |
+
## 📄 License
|
| 53 |
+
|
| 54 |
+
Apache 2.0
|
app.py
ADDED
|
@@ -0,0 +1,488 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
# -*- coding: utf-8 -*-
|
| 2 |
+
"""
|
| 3 |
+
Ultra-FineWeb Classifier - Hugging Face Space Demo
|
| 4 |
+
A lightweight fastText-based classifier for filtering high-quality web data.
|
| 5 |
+
"""
|
| 6 |
+
|
| 7 |
+
import os
|
| 8 |
+
import re
|
| 9 |
+
import unicodedata
|
| 10 |
+
from typing import Tuple
|
| 11 |
+
|
| 12 |
+
import gradio as gr
|
| 13 |
+
from huggingface_hub import hf_hub_download
|
| 14 |
+
|
| 15 |
+
# Lazy loading for heavy dependencies
|
| 16 |
+
_tokenizer = None
|
| 17 |
+
_fasttext_models = {}
|
| 18 |
+
|
| 19 |
+
MODEL_REPO = "openbmb/Ultra-FineWeb-classifier"
|
| 20 |
+
|
| 21 |
+
|
| 22 |
+
def get_tokenizer():
    """Return the shared tokenizer, downloading its files on first use.

    Fetches the tokenizer artifacts from the classifier repo into
    ``./model_cache`` and caches the loaded tokenizer in the module-level
    ``_tokenizer`` global, so later calls do no network or disk work.
    """
    global _tokenizer
    if _tokenizer is not None:
        return _tokenizer

    # Imported lazily so the Space starts fast even before first use.
    from transformers import AutoTokenizer

    # Download every tokenizer artifact; the first path tells us which
    # local directory to load the tokenizer from.
    artifact_names = (
        "local_tokenizer/tokenizer.json",
        "local_tokenizer/tokenizer_config.json",
        "local_tokenizer/special_tokens_map.json",
    )
    downloaded = [
        hf_hub_download(
            repo_id=MODEL_REPO,
            filename=name,
            local_dir="./model_cache",
        )
        for name in artifact_names
    ]

    _tokenizer = AutoTokenizer.from_pretrained(os.path.dirname(downloaded[0]))
    return _tokenizer
|
| 49 |
+
|
| 50 |
+
|
| 51 |
+
def get_fasttext_model(language: str):
    """Return the cached fastText classifier for *language* ("en" or "zh").

    On the first request for a language, downloads the ``.bin`` model from
    the classifier repo into ``./model_cache`` and memoizes the loaded
    model in the module-level ``_fasttext_models`` dict.
    """
    try:
        return _fasttext_models[language]
    except KeyError:
        # Imported lazily so the Space starts fast even before first use.
        import fasttext

        model_path = hf_hub_download(
            repo_id=MODEL_REPO,
            filename=f"classifiers/ultra_fineweb_{language}.bin",
            local_dir="./model_cache",
        )
        model = fasttext.load_model(model_path)
        _fasttext_models[language] = model
        return model
|
| 67 |
+
|
| 68 |
+
|
| 69 |
+
def fasttext_preprocess(content: str, tokenizer) -> str:
    """Normalize raw text into the single-line form fastText expects.

    Pipeline (order matters — the escape step must run after tokenization
    so the inserted backslash sequences are not split apart):
      1. Collapse runs of 3+ newlines into a single blank line.
      2. Lowercase.
      3. Strip diacritics: NFKD-decompose and drop combining marks.
      4. Re-segment: decode each tokenizer token individually and join the
         pieces with single spaces.
      5. Replace control whitespace with literal escape sequences
         (``\\n``, ``\\r``, ``\\t``) and collapse repeated spaces, yielding
         one fastText "line".

    Args:
        content: Raw input text.
        tokenizer: Any object exposing HF-style ``encode``/``decode``.

    Returns:
        The normalized, space-separated string.
    """
    # 1. Collapse excessive blank lines.
    content = re.sub(r'\n{3,}', '\n\n', content)

    # 2. Lowercase.
    content = content.lower()

    # 3. Remove diacritics (combining marks have Unicode category 'Mn').
    content = ''.join(
        c for c in unicodedata.normalize('NFKD', content)
        if unicodedata.category(c) != 'Mn'
    )

    # 4. Word segmentation: decoding one token id at a time makes the
    #    tokenizer's token boundaries explicit as spaces.
    token_ids = tokenizer.encode(content, add_special_tokens=False)
    content = ' '.join(tokenizer.decode([token_id]) for token_id in token_ids)

    # 5. Make control whitespace visible as escape sequences, then squeeze
    #    repeated spaces so the result is a single fastText input line.
    content = content.replace('\n', '\\n').replace('\r', '\\r').replace('\t', '\\t')
    content = re.sub(r' +', ' ', content)
    return content.strip()
|
| 109 |
+
|
| 110 |
+
|
| 111 |
+
def fasttext_infer(norm_content: str, fasttext_model) -> Tuple[str, float]:
    """Classify one normalized line with fastText.

    Returns:
        Tuple of ``(label, score)``: the raw fastText label (e.g.
        ``__label__pos``) and the probability that the content is
        high-quality. For a negative label the model confidence is flipped
        so the score always reads as P(high quality).
    """
    labels, probs = fasttext_model.predict(norm_content)
    label = labels[0]

    # fastText can report probabilities marginally above 1.0; clamp.
    confidence = min(probs.tolist()[0], 1.0)

    # Express the result uniformly as the probability of being high-quality.
    score = 1 - confidence if label == "__label__neg" else confidence
    return label, score
|
| 127 |
+
|
| 128 |
+
|
| 129 |
+
def classify_text(content: str, language: str) -> Tuple[str, str, str, str]:
    """Classify *content* and format the results for the Gradio UI.

    Args:
        content: Text to classify.
        language: Language code ("en" or "zh") selecting the fastText model.

    Returns:
        Tuple of (quality_label, score_display, normalized_content, details).
        On empty input or any runtime failure an error tuple is returned
        instead of raising, so the UI always receives renderable values.
    """
    if not content or not content.strip():
        return "❌ Error", "N/A", "", "请输入文本内容 / Please enter text content"

    try:
        # Get tokenizer and model (lazy-loaded, cached after first call).
        tokenizer = get_tokenizer()
        fasttext_model = get_fasttext_model(language)

        # Preprocess into fastText's single-line format.
        norm_content = fasttext_preprocess(content, tokenizer)

        # Inference.
        pred_label, score = fasttext_infer(norm_content, fasttext_model)

        # Map the numeric score onto a human-readable quality bucket.
        # (The previous version also assigned an unused `quality_class`
        # local in each branch; that dead code has been removed.)
        if score >= 0.7:
            quality_label = "🌟 High Quality"
        elif score >= 0.4:
            quality_label = "📊 Medium Quality"
        else:
            quality_label = "⚠️ Low Quality"

        score_display = f"{score:.4f}"

        details = f"""**Classification Results**

| Metric | Value |
|--------|-------|
| **Raw Label** | `{pred_label}` |
| **Quality Score** | `{score:.6f}` |
| **Quality Level** | {quality_label} |
| **Language** | `{language}` |
| **Input Length** | `{len(content)}` chars |
| **Normalized Length** | `{len(norm_content)}` chars |

---

**Score Interpretation:**
- 🌟 **High Quality** (≥0.7): Content suitable for LLM training
- 📊 **Medium Quality** (0.4-0.7): Content may need review
- ⚠️ **Low Quality** (<0.4): Content likely not suitable
"""

        return quality_label, score_display, norm_content, details

    except Exception as e:
        # Surface the failure in the UI rather than crashing the Space.
        return "❌ Error", "N/A", "", f"**Error:** {str(e)}"
|
| 190 |
+
|
| 191 |
+
|
| 192 |
+
# Example texts preloaded into the UI so visitors can try the classifier
# without pasting their own content. One per supported language; the UI
# swaps between them when the language radio changes.
EXAMPLE_EN = """Machine learning is a subset of artificial intelligence that enables systems to learn and improve from experience without being explicitly programmed. It focuses on developing computer programs that can access data and use it to learn for themselves.

The process begins with observations or data, such as examples, direct experience, or instruction, in order to look for patterns in data and make better decisions in the future based on the examples that we provide."""

EXAMPLE_ZH = """机器学习是人工智能的一个子集,它使系统能够从经验中学习和改进,而无需显式编程。它专注于开发能够访问数据并使用数据自行学习的计算机程序。

这个过程从观察或数据开始,例如示例、直接经验或指令,以便在数据中寻找模式,并根据我们提供的示例在未来做出更好的决策。"""
|
| 200 |
+
|
| 201 |
+
|
| 202 |
+
# Custom CSS injected into the Gradio app (dark neon theme). The string is
# passed verbatim to gr.Blocks(css=...); class names like .gr-box and
# .gr-button-primary target Gradio's generated DOM, and custom classes
# (.main-title, .subtitle, .section-header, .output-textbox) are attached
# via gr.HTML / elem_classes in the UI definition below.
custom_css = """
@import url('https://fonts.googleapis.com/css2?family=IBM+Plex+Mono:wght@400;500&family=Sora:wght@400;500;600;700&display=swap');

.gradio-container {
    font-family: 'Sora', sans-serif !important;
    background: linear-gradient(160deg, #0a0a1a 0%, #1a0a2e 40%, #0a1a2e 70%, #0a0a1a 100%) !important;
    min-height: 100vh;
}

.main-title {
    font-family: 'Sora', sans-serif !important;
    font-weight: 700 !important;
    font-size: 2.8rem !important;
    background: linear-gradient(120deg, #00ff88, #00d4ff, #a855f7) !important;
    -webkit-background-clip: text !important;
    -webkit-text-fill-color: transparent !important;
    background-clip: text !important;
    text-align: center !important;
    margin-bottom: 0.3rem !important;
    letter-spacing: -0.02em !important;
}

.subtitle {
    text-align: center !important;
    color: #8892a0 !important;
    font-size: 1.05rem !important;
    margin-bottom: 2rem !important;
    font-weight: 400 !important;
}

.gr-box {
    border-radius: 16px !important;
    border: 1px solid rgba(0, 255, 136, 0.15) !important;
    background: rgba(10, 15, 30, 0.85) !important;
    backdrop-filter: blur(12px) !important;
}

.gr-input, .gr-textarea {
    font-family: 'IBM Plex Mono', monospace !important;
    background: rgba(20, 25, 45, 0.7) !important;
    border: 1px solid rgba(0, 212, 255, 0.25) !important;
    border-radius: 10px !important;
    color: #e8ecf0 !important;
    font-size: 0.95rem !important;
}

.gr-button-primary {
    background: linear-gradient(135deg, #00ff88 0%, #00d4ff 100%) !important;
    border: none !important;
    font-weight: 600 !important;
    font-size: 1.05rem !important;
    padding: 14px 36px !important;
    border-radius: 10px !important;
    color: #0a0a1a !important;
    transition: all 0.3s cubic-bezier(0.4, 0, 0.2, 1) !important;
    text-transform: uppercase !important;
    letter-spacing: 1.5px !important;
}

.gr-button-primary:hover {
    transform: translateY(-3px) !important;
    box-shadow: 0 12px 35px rgba(0, 255, 136, 0.35) !important;
}

.gr-button-secondary {
    background: transparent !important;
    border: 2px solid rgba(0, 212, 255, 0.4) !important;
    color: #00d4ff !important;
    font-weight: 500 !important;
    border-radius: 10px !important;
    transition: all 0.3s ease !important;
}

.gr-button-secondary:hover {
    background: rgba(0, 212, 255, 0.1) !important;
    border-color: #00d4ff !important;
}

.section-header {
    color: #00ff88 !important;
    font-weight: 600 !important;
    font-size: 1.15rem !important;
    margin-bottom: 1rem !important;
    padding-bottom: 0.5rem !important;
    border-bottom: 2px solid rgba(0, 255, 136, 0.2) !important;
    letter-spacing: 0.5px !important;
}

.score-display {
    font-family: 'IBM Plex Mono', monospace !important;
    font-size: 2.5rem !important;
    font-weight: 700 !important;
    text-align: center !important;
    padding: 1rem !important;
    background: linear-gradient(135deg, rgba(0, 255, 136, 0.1), rgba(0, 212, 255, 0.1)) !important;
    border-radius: 12px !important;
    border: 1px solid rgba(0, 255, 136, 0.3) !important;
}

.gr-markdown {
    color: #d0d5dc !important;
}

.gr-markdown code {
    background: rgba(0, 212, 255, 0.15) !important;
    padding: 3px 8px !important;
    border-radius: 5px !important;
    font-family: 'IBM Plex Mono', monospace !important;
    color: #00d4ff !important;
}

.gr-markdown table {
    border-collapse: collapse !important;
    width: 100% !important;
    margin: 1rem 0 !important;
}

.gr-markdown th, .gr-markdown td {
    border: 1px solid rgba(0, 212, 255, 0.2) !important;
    padding: 10px 14px !important;
    text-align: left !important;
}

.gr-markdown th {
    background: rgba(0, 212, 255, 0.1) !important;
    color: #00d4ff !important;
    font-weight: 600 !important;
}

footer {
    display: none !important;
}

.gr-accordion {
    border: 1px solid rgba(168, 85, 247, 0.25) !important;
    border-radius: 10px !important;
    background: rgba(20, 15, 40, 0.5) !important;
}

label {
    color: #a8b0bc !important;
    font-weight: 500 !important;
}

.output-textbox textarea {
    min-height: 200px !important;
    max-height: 300px !important;
    overflow-y: auto !important;
}

/* Custom scrollbar */
::-webkit-scrollbar {
    width: 8px;
    height: 8px;
}

::-webkit-scrollbar-track {
    background: rgba(20, 25, 45, 0.5);
    border-radius: 4px;
}

::-webkit-scrollbar-thumb {
    background: rgba(0, 212, 255, 0.4);
    border-radius: 4px;
}

::-webkit-scrollbar-thumb:hover {
    background: rgba(0, 212, 255, 0.6);
}
"""
|
| 373 |
+
|
| 374 |
+
# Build Gradio interface: a two-column layout with input controls on the
# left and classification results on the right, wired to classify_text.
with gr.Blocks(title="Ultra-FineWeb Classifier", css=custom_css) as demo:
    gr.HTML('<h1 class="main-title">⚡ Ultra-FineWeb Classifier</h1>')
    gr.HTML('<p class="subtitle">Lightweight fastText-based classifier for high-quality web data filtering</p>')

    with gr.Row():
        # Left column: language selector, text input, action buttons.
        with gr.Column(scale=1):
            gr.HTML('<div class="section-header">📥 Input</div>')

            language = gr.Radio(
                choices=[("English", "en"), ("中文", "zh")],
                value="en",
                label="Language / 语言",
                info="Select the language of your content",
            )

            content_input = gr.Textbox(
                label="Content to Classify",
                placeholder="Paste your text content here...",
                lines=12,
                max_lines=20,
                value=EXAMPLE_EN,
            )

            with gr.Row():
                classify_btn = gr.Button("🔍 Classify", variant="primary", size="lg")
                clear_btn = gr.Button("🗑️ Clear", variant="secondary", size="lg")

            with gr.Accordion("📝 Example Texts", open=False):
                example_en_btn = gr.Button("Load English Example", size="sm")
                example_zh_btn = gr.Button("Load Chinese Example", size="sm")

        # Right column: quality verdict, score, details, normalized text.
        with gr.Column(scale=1):
            gr.HTML('<div class="section-header">📤 Results</div>')

            with gr.Row():
                quality_label = gr.Textbox(
                    label="Quality Level",
                    interactive=False,
                    scale=1,
                )
                score_output = gr.Textbox(
                    label="Quality Score",
                    interactive=False,
                    scale=1,
                )

            details_output = gr.Markdown(
                label="Classification Details",
            )

            with gr.Accordion("🔧 Normalized Content", open=False):
                norm_content_output = gr.Textbox(
                    label="Preprocessed Text (for fastText)",
                    lines=8,
                    max_lines=15,
                    interactive=False,
                    elem_classes=["output-textbox"],
                )

    # Event handlers
    classify_btn.click(
        fn=classify_text,
        inputs=[content_input, language],
        outputs=[quality_label, score_output, norm_content_output, details_output],
    )

    def clear_all():
        """Reset the input, language selection, and all result fields."""
        return "", "en", "", "", "", ""

    clear_btn.click(
        fn=clear_all,
        outputs=[content_input, language, quality_label, score_output, norm_content_output, details_output],
    )

    def load_english_example():
        """Load the English sample text and switch language to English."""
        return EXAMPLE_EN, "en"

    def load_chinese_example():
        """Load the Chinese sample text and switch language to Chinese."""
        return EXAMPLE_ZH, "zh"

    example_en_btn.click(
        fn=load_english_example,
        outputs=[content_input, language],
    )

    example_zh_btn.click(
        fn=load_chinese_example,
        outputs=[content_input, language],
    )

    # Auto-update example when language changes
    def update_example_on_language_change(lang):
        """Swap the example text in the input box to match the language.

        NOTE(review): this overwrites whatever the user has typed whenever
        the language radio changes — presumably intentional for a demo.
        """
        if lang == "zh":
            return EXAMPLE_ZH
        return EXAMPLE_EN

    language.change(
        fn=update_example_on_language_change,
        inputs=[language],
        outputs=[content_input],
    )

    # Footer
    gr.HTML("""
    <div style="text-align: center; margin-top: 2rem; padding: 1.5rem; color: #64748b; font-size: 0.9rem; border-top: 1px solid rgba(0, 212, 255, 0.1);">
        <p>⚡ <strong>Ultra-FineWeb Classifier</strong> - Part of the <a href="https://huggingface.co/openbmb/Ultra-FineWeb-classifier" target="_blank" style="color: #00d4ff;">Ultra-FineWeb</a> Project</p>
        <p style="font-size: 0.85rem; margin-top: 0.5rem;">Based on fastText for efficient web data quality classification. Supports English and Chinese.</p>
        <p style="font-size: 0.8rem; margin-top: 0.5rem; color: #4a5568;">📜 <a href="https://arxiv.org/abs/2505.05427" target="_blank" style="color: #a855f7;">Technical Report</a> | 🤗 <a href="https://huggingface.co/datasets/openbmb/Ultra-FineWeb-en" target="_blank" style="color: #a855f7;">Dataset</a></p>
    </div>
    """)


if __name__ == "__main__":
    demo.launch()
|
requirements.txt
ADDED
|
@@ -0,0 +1,5 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
gradio>=4.0.0
|
| 2 |
+
transformers>=4.30.0
|
| 3 |
+
huggingface_hub>=0.20.0
|
| 4 |
+
fasttext-wheel
|
| 5 |
+
numpy<2.0
|