# Spaces status: Sleeping
| """Main Gradio application for viewing dataset with text comparison.""" | |
| import gradio as gr | |
| import random | |
| from data_loader import get_dataset_size, get_sample | |
| from diff_utils import generate_html_diff, get_diff_stats | |
def load_sample_data(sample_index: int | float | None = None) -> tuple[str, str, str, str]:
    """Load one dataset sample and prepare everything the UI displays.

    Args:
        sample_index: Position of the sample to load. ``None`` selects a
            random sample. A non-integer number (a numeric input field may
            deliver a float) is truncated to an integer, and out-of-range
            values are clamped to the nearest valid index.

    Returns:
        A ``(original_text, cleaned_text, diff_html, stats_text)`` tuple.
        ``stats_text`` is a Markdown summary built from ``get_diff_stats``.
    """
    # Read the size once so the random pick and the clamp use the same bound.
    last_index = get_dataset_size() - 1

    if sample_index is None:
        sample_index = random.randint(0, last_index)
    else:
        # Coerce to int so the sample lookup and the "Sample #N" heading
        # never receive a float such as 3.0.
        sample_index = int(sample_index)

    # Ensure index is within bounds
    sample_index = max(0, min(sample_index, last_index))

    original_text, cleaned_text = get_sample(sample_index)

    # Generate diff HTML
    diff_html = generate_html_diff(original_text, cleaned_text)

    # Get statistics
    stats = get_diff_stats(original_text, cleaned_text)
    stats_text = (
        "\n"
        f"**Sample #{sample_index}**\n"
        f"- Original length: {stats['original_length']:,} characters\n"
        f"- Cleaned length: {stats['cleaned_length']:,} characters\n"
        f"- Characters removed: {stats['characters_removed']:,}\n"
        f"- Similarity: {stats['similarity_ratio']}%\n"
    )

    return original_text, cleaned_text, diff_html, stats_text
def create_interface() -> gr.Blocks:
    """Create the main Gradio interface.

    The page contains a sample-index field with "Load Sample" and
    "Random Sample" buttons, a statistics panel, side-by-side text boxes for
    the original and cleaned text, and an HTML diff view. Both buttons and
    the initial page load fill the same four outputs via
    ``load_sample_data``.

    Returns:
        The assembled ``gr.Blocks`` object; it is not launched here.
    """
    # NOTE(review): the layout nesting below was reconstructed from a source
    # whose indentation had been stripped — confirm against the running app.
    custom_css = """
    .textbox-container {
        max-height: 400px;
        overflow-y: auto;
        border: 1px solid #e0e0e0;
        border-radius: 8px;
        padding: 12px;
        background: #fafafa;
    }
    .stats-box {
        background: #f8f9fa;
        border: 1px solid #dee2e6;
        border-radius: 8px;
        padding: 16px;
        margin: 8px 0;
    }
    """

    with gr.Blocks(
        title="Dataset Text Comparison Viewer",
        css=custom_css,
    ) as interface:
        gr.Markdown("# Dataset Text Comparison Viewer")
        gr.Markdown("Compare original and cleaned text from **sumuks/essential-web-v1.0-sample-1M-with-cleaned-text**")

        with gr.Row():
            with gr.Column():
                sample_input = gr.Number(
                    label="Sample Index",
                    value=0,
                    minimum=0,
                    maximum=get_dataset_size() - 1,
                    step=1,
                    # precision=0 makes Gradio round the value and pass an
                    # int to the callback instead of a float.
                    precision=0,
                )
                with gr.Row():
                    load_btn = gr.Button("Load Sample", variant="primary")
                    random_btn = gr.Button("Random Sample", variant="secondary")

        # Statistics display
        stats_output = gr.Markdown(label="Statistics", elem_classes=["stats-box"])

        # Text comparison
        with gr.Row():
            with gr.Column():
                gr.Markdown("### Original Text")
                original_output = gr.Textbox(
                    label="Original",
                    lines=15,
                    max_lines=20,
                    show_copy_button=True,
                    elem_classes=["textbox-container"],
                )
            with gr.Column():
                gr.Markdown("### Cleaned Text")
                cleaned_output = gr.Textbox(
                    label="Cleaned",
                    lines=15,
                    max_lines=20,
                    show_copy_button=True,
                    elem_classes=["textbox-container"],
                )

        # Diff visualization
        gr.Markdown("### Diff Visualization")
        diff_output = gr.HTML(label="Diff")

        # Every handler fills the same components, in the order returned by
        # load_sample_data.
        sample_outputs = [original_output, cleaned_output, diff_output, stats_output]

        # Event handlers
        load_btn.click(
            fn=load_sample_data,
            inputs=[sample_input],
            outputs=sample_outputs,
        )
        random_btn.click(
            fn=lambda: load_sample_data(None),
            inputs=[],
            outputs=sample_outputs,
        )

        # Load initial sample
        interface.load(
            fn=lambda: load_sample_data(0),
            inputs=[],
            outputs=sample_outputs,
        )

    return interface
if __name__ == "__main__":
    # Run as a script: build the interface and launch it with sharing
    # disabled and debug=True.
    create_interface().launch(share=False, debug=True)