| """ | |
| LangChain + RAGAS Integrated App | |
| Main application using LangChain for models and RAGAS for evaluation. | |
| """ | |
| import gradio as gr | |
| import pandas as pd | |
| import os | |
| from typing import List, Tuple, Optional | |
| from langchain_evaluator import langchain_evaluator | |
| from langchain_models import langchain_models_registry | |
def get_available_datasets() -> List[str]:
    """Get list of available datasets."""
    datasets = []
    for item in os.listdir("tasks"):
        if os.path.isdir(f"tasks/{item}") and not item.startswith("."):
            datasets.append(item)
    return sorted(datasets)

def get_available_dialects() -> List[str]:
    """Get list of available SQL dialects."""
    return ["presto", "bigquery", "snowflake"]

def get_available_models() -> List[str]:
    """Get list of available models."""
    return langchain_models_registry.get_available_models()

def get_cases_for_dataset(dataset_name: str) -> List[str]:
    """Get list of cases for a dataset."""
    if not dataset_name:
        return []
    try:
        dataset = langchain_evaluator.load_dataset(dataset_name)
        cases = []
        for case in dataset['cases']:
            cases.append(f"{case['id']}: {case['question'][:50]}...")
        return cases
    except Exception as e:
        print(f"Error loading cases for {dataset_name}: {e}")
        return []

def update_case_dropdown(dataset_name: str):
    """Update case dropdown with new choices and reset value."""
    if not dataset_name:
        return gr.Dropdown(choices=[], value=None)
    try:
        dataset = langchain_evaluator.load_dataset(dataset_name)
        cases = []
        for case in dataset['cases']:
            cases.append(f"{case['id']}: {case['question'][:50]}...")
        # Return updated dropdown with new choices and no value
        return gr.Dropdown(choices=cases, value=None)
    except Exception as e:
        print(f"Error loading cases for {dataset_name}: {e}")
        return gr.Dropdown(choices=[], value=None)

def run_evaluation(
    dataset_name: str,
    dialect: str,
    case_selection: str,
    selected_models: List[str]
) -> Tuple[str, pd.DataFrame, dict, str, str, str]:
    """Run evaluation for selected models on a case."""
    print(f"DEBUG - case_selection type: {type(case_selection)}, value: {case_selection}")
    print(f"DEBUG - dataset_name: {dataset_name}, dialect: {dialect}, selected_models: {selected_models}")
    if not all([dataset_name, dialect, case_selection, selected_models]):
        # Return the full six-element tuple so every output component receives a value
        return "Please select all required options.", pd.DataFrame(), {}, "", "", ""
    try:
        # Handle case_selection if it's a list (shouldn't happen, but just in case)
        if isinstance(case_selection, list):
            print("WARNING: case_selection is a list, taking first element")
            case_selection = case_selection[0] if case_selection else ""
        # Extract case ID from selection
        case_id = case_selection.split(":")[0] if ":" in case_selection else case_selection
        print("Starting evaluation:")
        print(f"   Dataset: {dataset_name}")
        print(f"   Dialect: {dialect}")
        print(f"   Case: {case_id}")
        print(f"   Models: {', '.join(selected_models)}")
        # Run evaluation
        results = langchain_evaluator.evaluate_models(
            dataset_name=dataset_name,
            dialect=dialect,
            case_id=case_id,
            model_names=selected_models
        )
        if not results:
            return "No results generated. Check console for errors.", pd.DataFrame(), {}, "", "", ""
        # Update leaderboard
        langchain_evaluator.update_leaderboard(results)
        # Prepare results for display
        results_data = []
        for result in results:
            results_data.append({
                'Model': result.model_name,
                'Reference SQL (Human)': result.reference_sql[:80] + "..." if len(result.reference_sql) > 80 else result.reference_sql,
                'Generated SQL (LLM)': result.generated_sql[:80] + "..." if len(result.generated_sql) > 80 else result.generated_sql,
                'Composite Score': f"{result.composite_score:.3f}",
                'Correctness': f"{result.correctness_exact:.3f}",
                'Result Match F1': f"{result.result_match_f1:.3f}",
                'Exec Success': f"{result.exec_success:.3f}",
                'Latency (ms)': f"{result.latency_ms:.1f}",
                'SQL Quality': f"{result.sql_quality:.3f}",
                'Semantic Similarity': f"{result.semantic_similarity:.3f}"
            })
        results_df = pd.DataFrame(results_data)
        # Detailed results
        detailed_results = {}
        for result in results:
            detailed_results[result.model_name] = {
                'reference_sql_human': result.reference_sql,
                'raw_sql_llm': result.raw_sql,
                'cleaned_sql_llm': result.generated_sql,
                'question': result.question,
                'all_metrics': {
                    'correctness_exact': result.correctness_exact,
                    'result_match_f1': result.result_match_f1,
                    'exec_success': result.exec_success,
                    'latency_ms': result.latency_ms,
                    'readability': result.readability,
                    'dialect_ok': result.dialect_ok,
                    'sql_quality': result.sql_quality,
                    'semantic_similarity': result.semantic_similarity,
                    'structural_similarity': result.structural_similarity,
                    'composite_score': result.composite_score
                }
            }
        status = f"Evaluation completed! {len(results)} models evaluated."
        # Get SQL for display (use first result as example)
        reference_sql = results[0].reference_sql if results else ""
        generated_sql = results[0].generated_sql if results else ""
        return status, results_df, detailed_results, "", reference_sql, generated_sql
    except Exception as e:
        error_msg = f"Error during evaluation: {str(e)}"
        print(error_msg)
        return error_msg, pd.DataFrame(), {}, "", "", ""

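
# Hedged sketch, not called by the app: the composite score reported above is
# computed inside langchain_evaluator. Assuming the weighting described in the
# UI metric help text (0.3 / 0.3 / 0.2 / 0.1 / 0.1) is exact, it reduces to this:
def _example_composite_score(
    correctness: float,
    result_match_f1: float,
    exec_success: float,
    sql_quality: float,
    semantic_similarity: float,
) -> float:
    """Illustrative only: weighted combination described in the UI metric help text."""
    return (
        0.3 * correctness
        + 0.3 * result_match_f1
        + 0.2 * exec_success
        + 0.1 * sql_quality
        + 0.1 * semantic_similarity
    )
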
def get_leaderboard_display() -> pd.DataFrame:
    """Get leaderboard data for display."""
    try:
        summary = langchain_evaluator.get_leaderboard_summary(top_n=50)
        if summary.empty:
            return pd.DataFrame({
                'Rank': ['-'],
                'Model': ['No data available'],
                'Avg Composite Score': ['-'],
                'Avg Correctness': ['-'],
                'Avg Result Match F1': ['-'],
                'Avg Exec Success': ['-'],
                'Avg Latency (ms)': ['-'],
                'Avg SQL Quality': ['-'],
                'Avg Semantic Similarity': ['-'],
                'Avg Structural Similarity': ['-'],
                'Cases Evaluated': ['-']
            })
        # Sort by composite score (highest first) and add ranking
        summary_sorted = summary.sort_values('composite_score_mean', ascending=False)
        # Format for display
        display_data = []
        for rank, (model_name, row) in enumerate(summary_sorted.iterrows(), 1):
            display_row = {
                'Rank': rank,
                'Model': model_name,
                'Avg Composite Score': f"{row['composite_score_mean']:.3f}",
                'Avg Correctness': f"{row['correctness_exact_mean']:.3f}",
                'Avg Result Match F1': f"{row['result_match_f1_mean']:.3f}",
                'Avg Exec Success': f"{row['exec_success_mean']:.3f}",
                'Avg Latency (ms)': f"{row['latency_ms_mean']:.1f}",
                'Cases Evaluated': int(row['composite_score_count'])
            }
            # Add custom metrics columns if they exist
            if 'sql_quality_mean' in row:
                display_row['Avg SQL Quality'] = f"{row['sql_quality_mean']:.3f}"
            if 'semantic_similarity_mean' in row:
                display_row['Avg Semantic Similarity'] = f"{row['semantic_similarity_mean']:.3f}"
            if 'structural_similarity_mean' in row:
                display_row['Avg Structural Similarity'] = f"{row['structural_similarity_mean']:.3f}"
            display_data.append(display_row)
        return pd.DataFrame(display_data)
    except Exception as e:
        print(f"Error loading leaderboard: {e}")
        return pd.DataFrame({
            'Rank': ['-'],
            'Model': ['Error loading data'],
            'Avg Composite Score': ['-'],
            'Avg Correctness': ['-'],
            'Avg Result Match F1': ['-'],
            'Avg Exec Success': ['-'],
            'Avg Latency (ms)': ['-'],
            'Avg SQL Quality': ['-'],
            'Avg Semantic Similarity': ['-'],
            'Avg Structural Similarity': ['-'],
            'Cases Evaluated': ['-']
        })

def run_comprehensive_evaluation(
    dataset_name: str,
    dialect: str,
    selected_models: List[str],
    max_cases: int
) -> Tuple[str, pd.DataFrame, dict, str, str]:
    """Run comprehensive evaluation across multiple cases."""
    if not all([dataset_name, dialect, selected_models]):
        return "Please select dataset, dialect, and models.", pd.DataFrame(), {}, "", ""
    try:
        print("Starting comprehensive evaluation:")
        print(f"   Dataset: {dataset_name}")
        print(f"   Dialect: {dialect}")
        print(f"   Models: {', '.join(selected_models)}")
        print(f"   Max Cases: {max_cases}")
        results = langchain_evaluator.run_comprehensive_evaluation(
            dataset_name=dataset_name,
            dialect=dialect,
            model_names=selected_models,
            max_cases=max_cases if max_cases > 0 else None
        )
        # Update leaderboard
        langchain_evaluator.update_leaderboard(results)
        # Prepare results for display
        results_data = []
        for result in results:
            results_data.append({
                'Model': result.model_name,
                'Case': result.case_id,
                'Reference SQL (Human)': result.reference_sql[:80] + "..." if len(result.reference_sql) > 80 else result.reference_sql,
                'Generated SQL (LLM)': result.generated_sql[:80] + "..." if len(result.generated_sql) > 80 else result.generated_sql,
                'Composite Score': f"{result.composite_score:.3f}",
                'Correctness': f"{result.correctness_exact:.3f}",
                'Result Match F1': f"{result.result_match_f1:.3f}",
                'Exec Success': f"{result.exec_success:.3f}",
                'Latency (ms)': f"{result.latency_ms:.1f}",
                'SQL Quality': f"{result.sql_quality:.3f}",
                'Semantic Similarity': f"{result.semantic_similarity:.3f}"
            })
        results_df = pd.DataFrame(results_data)
        # Detailed results
        detailed_results = {}
        for result in results:
            detailed_results[f"{result.model_name}_{result.case_id}"] = {
                'reference_sql_human': result.reference_sql,
                'raw_sql_llm': result.raw_sql,
                'cleaned_sql_llm': result.generated_sql,
                'question': result.question,
                'all_metrics': {
                    'correctness_exact': result.correctness_exact,
                    'result_match_f1': result.result_match_f1,
                    'exec_success': result.exec_success,
                    'latency_ms': result.latency_ms,
                    'readability': result.readability,
                    'dialect_ok': result.dialect_ok,
                    'sql_quality': result.sql_quality,
                    'semantic_similarity': result.semantic_similarity,
                    'structural_similarity': result.structural_similarity,
                    'composite_score': result.composite_score
                }
            }
        status_msg = f"Comprehensive evaluation completed! {len(results)} evaluations performed."
        # Get SQL for display (use first result as example)
        reference_sql = results[0].reference_sql if results else ""
        generated_sql = results[0].generated_sql if results else ""
        return status_msg, results_df, detailed_results, reference_sql, generated_sql
    except Exception as e:
        error_msg = f"Error during comprehensive evaluation: {str(e)}"
        print(error_msg)
        return error_msg, pd.DataFrame(), {}, "", ""

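
# Hedged sketch, not part of the app's call path: Result Match F1 is computed by
# langchain_evaluator, and the Info tab notes that DuckDB executes the queries.
# Assuming the metric treats each query's output as a set of rows and compares
# them, a minimal version could look like this (row normalization may differ).
def _example_result_match_f1(con, reference_sql: str, generated_sql: str) -> float:
    """Illustrative only: F1 over rows returned by two queries on a DuckDB connection."""
    ref_rows = set(con.execute(reference_sql).fetchall())
    gen_rows = set(con.execute(generated_sql).fetchall())
    if not ref_rows and not gen_rows:
        return 1.0
    if not ref_rows or not gen_rows:
        return 0.0
    overlap = len(ref_rows & gen_rows)
    precision = overlap / len(gen_rows)
    recall = overlap / len(ref_rows)
    return 0.0 if (precision + recall) == 0 else 2 * precision * recall / (precision + recall)
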
def create_interface():
    """Create the Gradio interface."""
    with gr.Blocks(title="NL→SQL Leaderboard (LangChain + RAGAS)", theme=gr.themes.Soft()) as app:
        gr.Markdown("""
        # NL→SQL Leaderboard (LangChain + RAGAS)

        A comprehensive evaluation platform for English → SQL tasks, using LangChain for model management and RAGAS for advanced evaluation metrics.

        Select a dataset, dialect, and test case, then choose models to evaluate. Results are automatically added to the public leaderboard with RAGAS metrics.
        """)
        with gr.Row():
            with gr.Column(scale=10):
                pass  # Empty column for spacing
            with gr.Column(scale=1):
                refresh_button = gr.Button("Refresh Leaderboard", variant="secondary", size="sm")
        with gr.Tabs():
            # Info Tab (moved to first)
            with gr.Tab("Info"):
                gr.Markdown("""
                ## About the NL→SQL Leaderboard (LangChain + Custom Evaluation)

                This platform evaluates natural-language-to-SQL generation using the following tools:

                **Technology Stack:**
                - **LangChain**: Model management and prompt handling
                - **Custom Evaluation**: Comprehensive evaluation metrics without external dependencies
                - **Gradio**: User interface
                - **DuckDB**: SQL execution
                - **sqlglot**: SQL dialect transpilation
                - **HuggingFace Transformers**: Local model inference

                **Features:**
                - **Local-first approach**: All models run locally for privacy and reliability
                - **Advanced metrics**: Custom SQL quality, semantic similarity, structural analysis
                - **Comprehensive evaluation**: Batch processing across multiple cases
                - **Multi-dialect support**: Presto, BigQuery, and Snowflake SQL dialects
                - **Real-time leaderboard**: Track model performance across different datasets

                **Evaluation Metrics:**
                - **Correctness**: Exact match with reference SQL
                - **Result Match F1**: Semantic similarity of query results
                - **Execution Success**: Whether the generated SQL executes without errors
                - **SQL Quality**: Structural and syntactic quality assessment
                - **Semantic Similarity**: Meaning-based comparison with reference
                - **Composite Score**: Weighted combination of all metrics
                """)
            # Evaluation Tab
            with gr.Tab("Evaluate"):
                with gr.Row():
                    with gr.Column(scale=1):
                        dataset_dropdown = gr.Dropdown(
                            choices=get_available_datasets(),
                            label="Dataset",
                            value=None,
                            allow_custom_value=True
                        )
                        dialect_dropdown = gr.Dropdown(
                            choices=get_available_dialects(),
                            label="SQL Dialect",
                            value="presto"
                        )
                        case_dropdown = gr.Dropdown(
                            choices=[],
                            label="Test Case",
                            interactive=True,
                            value=None,
                            allow_custom_value=False,
                            multiselect=False,
                            info="Select a dataset first to load test cases"
                        )
                        models_checkbox = gr.CheckboxGroup(
                            choices=get_available_models(),
                            label="Models to Evaluate",
                            value=[]
                        )
                        run_button = gr.Button("Run Evaluation", variant="primary")
                    with gr.Column(scale=2):
                        status_output = gr.Textbox(label="Status", interactive=False)
                        results_table = gr.Dataframe(label="Run Results", interactive=False)
                        detailed_results = gr.JSON(label="Detailed Metrics", visible=False)
                # SQL Display Section
                with gr.Row():
                    with gr.Column():
                        reference_sql_display = gr.Code(
                            label="Reference SQL (Human)",
                            language="sql",
                            interactive=False,
                            visible=False
                        )
                    with gr.Column():
                        generated_sql_display = gr.Code(
                            label="Generated SQL (LLM)",
                            language="sql",
                            interactive=False,
                            visible=False
                        )
                # Metric Explanations
                with gr.Accordion("How Metrics Are Calculated", open=False):
                    gr.Markdown("""
                    ### Evaluation Metrics Explained

                    **Composite Score (0.0 - 1.0)**
                    - Weighted combination of all metrics: `Correctness × 0.3 + Result Match F1 × 0.3 + Exec Success × 0.2 + SQL Quality × 0.1 + Semantic Similarity × 0.1`
                    - Higher is better (1.0 = perfect)

                    **Correctness (0.0 - 1.0)**
                    - Exact string match between generated SQL and reference SQL
                    - 1.0 = identical, 0.0 = completely different

                    **Result Match F1 (0.0 - 1.0)**
                    - F1 score comparing query results (not SQL text)
                    - Executes both SQLs and compares result sets
                    - 1.0 = identical results, 0.0 = completely different results

                    **Exec Success (0.0 - 1.0)**
                    - Whether the generated SQL executes without errors
                    - 1.0 = executes successfully, 0.0 = execution fails

                    **Latency (milliseconds)**
                    - Time taken to generate and execute the SQL
                    - Lower is better (faster response)

                    **SQL Quality (0.0 - 1.0)**
                    - How well the SQL addresses the question
                    - Based on semantic analysis of question vs SQL intent

                    **Semantic Similarity (0.0 - 1.0)**
                    - Semantic similarity between generated and reference SQL
                    - Uses sentence transformers to compare meaning
                    - 1.0 = identical meaning, 0.0 = completely different meaning
                    """)
                # Event handlers
                dataset_dropdown.change(
                    fn=update_case_dropdown,
                    inputs=[dataset_dropdown],
                    outputs=[case_dropdown]
                )
                run_button.click(
                    fn=run_evaluation,
                    inputs=[dataset_dropdown, dialect_dropdown, case_dropdown, models_checkbox],
                    outputs=[status_output, results_table, detailed_results, gr.State(), reference_sql_display, generated_sql_display]
                )
            # Comprehensive Evaluation Tab
            with gr.Tab("Comprehensive Evaluation"):
                with gr.Row():
                    with gr.Column(scale=1):
                        comp_dataset_dropdown = gr.Dropdown(
                            choices=get_available_datasets(),
                            label="Dataset",
                            value=None,
                            allow_custom_value=True
                        )
                        comp_dialect_dropdown = gr.Dropdown(
                            choices=get_available_dialects(),
                            label="SQL Dialect",
                            value="presto"
                        )
                        comp_models_checkbox = gr.CheckboxGroup(
                            choices=get_available_models(),
                            label="Models to Evaluate",
                            value=[]
                        )
                        max_cases_slider = gr.Slider(
                            minimum=1,
                            maximum=50,
                            value=10,
                            step=1,
                            label="Max Cases to Evaluate"
                        )
                        comp_run_button = gr.Button("Run Comprehensive Evaluation", variant="primary")
                    with gr.Column(scale=2):
                        comp_status_output = gr.Textbox(label="Status", interactive=False)
                        comp_results_table = gr.Dataframe(label="Comprehensive Results", interactive=False)
                        comp_detailed_results = gr.JSON(label="Detailed Metrics", visible=False)
                # SQL Display Section for Comprehensive Results
                with gr.Row():
                    with gr.Column():
                        comp_reference_sql_display = gr.Code(
                            label="Reference SQL (Human)",
                            language="sql",
                            interactive=False,
                            visible=False
                        )
                    with gr.Column():
                        comp_generated_sql_display = gr.Code(
                            label="Generated SQL (LLM)",
                            language="sql",
                            interactive=False,
                            visible=False
                        )
                # Metric Explanations for Comprehensive Evaluation
                with gr.Accordion("How Metrics Are Calculated", open=False):
                    gr.Markdown("""
                    ### Comprehensive Evaluation Metrics

                    **Composite Score (0.0 - 1.0)**
                    - Weighted combination: `Correctness × 0.3 + Result Match F1 × 0.3 + Exec Success × 0.2 + SQL Quality × 0.1 + Semantic Similarity × 0.1`
                    - Higher is better (1.0 = perfect)

                    **Correctness (0.0 - 1.0)**
                    - Exact string match between generated SQL and reference SQL
                    - 1.0 = identical, 0.0 = completely different

                    **Result Match F1 (0.0 - 1.0)**
                    - F1 score comparing query results (not SQL text)
                    - Executes both SQLs and compares result sets
                    - 1.0 = identical results, 0.0 = completely different results

                    **Exec Success (0.0 - 1.0)**
                    - Whether the generated SQL executes without errors
                    - 1.0 = executes successfully, 0.0 = execution fails

                    **Latency (milliseconds)**
                    - Time taken to generate and execute the SQL
                    - Lower is better (faster response)

                    **SQL Quality (0.0 - 1.0)**
                    - How well the SQL addresses the question
                    - Based on semantic analysis of question vs SQL intent

                    **Semantic Similarity (0.0 - 1.0)**
                    - Semantic similarity between generated and reference SQL
                    - Uses sentence transformers to compare meaning
                    - 1.0 = identical meaning, 0.0 = completely different meaning

                    **Comprehensive Evaluation**
                    - Tests models across multiple cases and datasets
                    - Provides average performance metrics
                    - Shows consistency across different SQL complexity levels
                    """)
                comp_run_button.click(
                    fn=run_comprehensive_evaluation,
                    inputs=[comp_dataset_dropdown, comp_dialect_dropdown, comp_models_checkbox, max_cases_slider],
                    outputs=[comp_status_output, comp_results_table, comp_detailed_results, comp_reference_sql_display, comp_generated_sql_display]
                )
            # Leaderboard Tab
            with gr.Tab("Leaderboard"):
                leaderboard_table = gr.Dataframe(
                    label="Global Leaderboard (Top 50)",
                    interactive=False,
                    value=get_leaderboard_display()
                )
                # Metric Explanations for Leaderboard
                with gr.Accordion("How Leaderboard Metrics Are Calculated", open=False):
                    gr.Markdown("""
                    ### Global Leaderboard Metrics

                    **Rank**
                    - Models ranked by average composite score (highest first)
                    - Based on aggregated performance across all evaluations

                    **Avg Composite Score (0.0 - 1.0)**
                    - Average of all composite scores for each model
                    - Weighted combination: `Correctness × 0.3 + Result Match F1 × 0.3 + Exec Success × 0.2 + SQL Quality × 0.1 + Semantic Similarity × 0.1`
                    - Higher is better (1.0 = perfect)

                    **Avg Correctness (0.0 - 1.0)**
                    - Average exact string match between generated SQL and reference SQL
                    - 1.0 = identical, 0.0 = completely different

                    **Avg Result Match F1 (0.0 - 1.0)**
                    - Average F1 score comparing query results (not SQL text)
                    - Executes both SQLs and compares result sets
                    - 1.0 = identical results, 0.0 = completely different results

                    **Avg Exec Success (0.0 - 1.0)**
                    - Average success rate of SQL execution
                    - 1.0 = always executes successfully, 0.0 = always fails

                    **Avg Latency (milliseconds)**
                    - Average time taken to generate and execute SQL
                    - Lower is better (faster response)

                    **Cases Evaluated**
                    - Number of test cases each model has been evaluated on
                    - More cases = more reliable performance metrics

                    **Avg SQL Quality (0.0 - 1.0)**
                    - Average quality score of how well SQL addresses questions
                    - Based on semantic analysis of question vs SQL intent

                    **Avg Semantic Similarity (0.0 - 1.0)**
                    - Average semantic similarity between generated and reference SQL
                    - Uses sentence transformers to compare meaning
                    - 1.0 = identical meaning, 0.0 = completely different meaning

                    **Avg Structural Similarity (0.0 - 1.0)**
                    - Average structural similarity between generated and reference SQL
                    - Compares SQL structure, keywords, and patterns
                    - 1.0 = identical structure, 0.0 = completely different structure
                    """)
        # Add refresh button click event
        refresh_button.click(
            fn=get_leaderboard_display,
            outputs=[leaderboard_table]
        )
    return app


if __name__ == "__main__":
    app = create_interface()
    app.launch(server_name="0.0.0.0", server_port=7860, share=True)