Spaces:

broadfield-dev
/

AMOP

Paused

App Files Files Community

broadfield-dev committed on Sep 1, 2025

Commit

54b40d5

verified ·

1 Parent(s): babbd78

Update app.py

Browse files

Files changed (1) hide show

app.py +71 -90

app.py CHANGED Viewed

@@ -24,13 +24,9 @@ OUTPUT_DIR = "optimized_models"
 os.makedirs(OUTPUT_DIR, exist_ok=True)
-# --- 2. AMOP CORE PIPELINE FUNCTIONS ---
 def stage_1_analyze_model(model_id: str):
-    """
-    Performs Stage 1: Adaptive Model Analysis.
-    Loads the model's configuration and recommends an optimization strategy.
-    """
     log_stream = "[STAGE 1] Analyzing model...\n"
     try:
         config = AutoConfig.from_pretrained(model_id, trust_remote_code=True)
@@ -45,52 +41,40 @@ def stage_1_analyze_model(model_id: str):
         recommendation = ""
         if 'llama' in model_type or 'gpt' in model_type or 'mistral' in model_type:
             recommendation = "**Recommendation:** This is a large language model (LLM). For best CPU performance, a GGUF-based quantization strategy is typically state-of-the-art. This initial version of AMOP focuses on the ONNX pipeline. The recommended path is **Quantization -> ONNX Conversion**."
-        elif 'bert' in model_type or 'roberta' in model_type:
-            recommendation = "**Recommendation:** This is an encoder model. The full AMOP pipeline is recommended for a balance of size and performance: **Pruning -> Quantization -> ONNX Conversion**."
-        elif 'vit' in model_type:
-             recommendation = "**Recommendation:** This is a Vision Transformer. The recommended path is **Quantization -> ONNX Conversion**. Pruning may be less effective."
         else:
-            recommendation = "**Recommendation:** Unrecognized architecture. The standard path of **Quantization -> ONNX Conversion** is a safe starting point."
         log_stream += f"Analysis complete. Architecture: {model_type}.\n"
-        # GRADIO 5 UPDATE: Instead of gr.update(), return a new component object.
-        return log_stream, analysis_report + "\n" + recommendation, gr.Group(visible=True)
     except Exception as e:
         error_msg = f"Failed to analyze model '{model_id}'. Error: {e}"
         logging.error(error_msg)
-        return log_stream + error_msg, "Could not analyze model. Please check the model ID and try again.", gr.Group(visible=False)
 def stage_2_prune_model(model, prune_percentage: float):
     """Run Stage 2: L1 unstructured pruning of every ``torch.nn.Linear`` layer.

     Each linear layer's ``weight`` is pruned in place with
     ``prune.l1_unstructured`` and the pruning is then made permanent with
     ``prune.remove``.

     Args:
         model: A module exposing ``modules()``; it is modified in place.
         prune_percentage: Share of each linear layer's weights to prune,
             expressed as a percentage (divided by 100 before being passed
             on as ``amount``). A value of 0 skips pruning entirely.

     Returns:
         A ``(model, log)`` tuple: the same model object that was passed in
         and a newline-terminated log string describing what was done.
     """
     if prune_percentage == 0:
         # Newline-terminated so that callers concatenating stage logs keep
         # one log entry per line instead of fusing it with the next stage.
         return model, "Skipped pruning as percentage was 0.\n"
     log_stream = "[STAGE 2] Pruning model...\n"
     amount = prune_percentage / 100.0
     for module in model.modules():
         if isinstance(module, torch.nn.Linear):
             prune.l1_unstructured(module, name='weight', amount=amount)
             # Make the pruning permanent: drop the reparametrization so the
             # layer is left with a plain (now sparse) ``weight`` parameter.
             prune.remove(module, 'weight')
     log_stream += "Pruning complete. Note: This version exports the original model to ONNX for maximum compatibility.\n"
     return model, log_stream
 def stage_3_and_4_quantize_and_onnx(model_id: str):
     log_stream = "[STAGE 3 & 4] Converting to ONNX and Quantizing...\n"
     try:
         run_id = datetime.now().strftime("%Y%m%d-%H%M%S")
         onnx_path = os.path.join(OUTPUT_DIR, f"{model_id.replace('/', '_')}-{run_id}-onnx")
-        os.makedirs(onnx_path, exist_ok=True)
         main_export(model_id, output=onnx_path, task="auto", trust_remote_code=True)
         log_stream += f"Successfully exported base model to ONNX at: {onnx_path}\n"
         quantizer = ORTQuantizer.from_pretrained(onnx_path)
         dqconfig = AutoQuantizationConfig.avx512_vnni(is_static=False, per_channel=False)
         quantized_path = os.path.join(onnx_path, "quantized")
         quantizer.quantize(save_dir=quantized_path, quantization_config=dqconfig)
         log_stream += f"Successfully quantized model to: {quantized_path}\n"
         return quantized_path, log_stream
     except Exception as e:
@@ -98,31 +82,20 @@ def stage_3_and_4_quantize_and_onnx(model_id: str):
         logging.error(error_msg, exc_info=True)
         raise RuntimeError(error_msg)
-def stage_5_evaluate_and_package(
-    model_id: str,
-    optimized_model_path: str,
-    pipeline_log: str,
-    options: dict
-):
     log_stream = "[STAGE 5] Evaluating and Packaging...\n"
     try:
         ort_model = ORTModelForCausalLM.from_pretrained(optimized_model_path)
         tokenizer = AutoTokenizer.from_pretrained(model_id, trust_remote_code=True)
         prompt = "My name is Philipp and I"
         inputs = tokenizer(prompt, return_tensors="pt")
         start_time = time.time()
         gen_tokens = ort_model.generate(**inputs, max_new_tokens=20)
         end_time = time.time()
         latency = (end_time - start_time) * 1000
         num_tokens = len(gen_tokens[0]) - inputs.input_ids.shape[1]
         ms_per_token = latency / num_tokens if num_tokens > 0 else float('inf')
-        eval_report = f"- **Inference Latency:** {latency:.2f} ms\n"
-        eval_report += f"- **Speed:** {ms_per_token:.2f} ms/token\n"
         log_stream += "Evaluation complete.\n"
     except Exception as e:
         eval_report = f"- **Evaluation Failed:** Could not run generation. This often happens if the base model is not a text-generation model. Error: {e}\n"
@@ -130,65 +103,56 @@ def stage_5_evaluate_and_package(
     if not HF_TOKEN:
         return "Skipping upload: HF_TOKEN not found.", log_stream + "Skipping upload: HF_TOKEN not found."
     try:
         repo_name = f"{model_id.split('/')[-1]}-amop-cpu"
         repo_url = api.create_repo(repo_id=repo_name, exist_ok=True, token=HF_TOKEN)
-        with open("model_card_template.md", "r", encoding="utf-8") as f:
-            template_content = f.read()
         model_card_content = template_content.format(
-            repo_name=repo_name, model_id=model_id,
-            optimization_date=datetime.now().strftime("%Y-%m-%d %H:%M:%S"),
             eval_report=eval_report, pruning_status="Enabled" if options['prune'] else "Disabled",
-            pruning_percent=options['prune_percent'], repo_id=repo_url.repo_id,
-            pipeline_log=pipeline_log
         )
         readme_path = os.path.join(optimized_model_path, "README.md")
         with open(readme_path, "w", encoding="utf-8") as f: f.write(model_card_content)
         tokenizer = AutoTokenizer.from_pretrained(model_id, trust_remote_code=True)
         tokenizer.save_pretrained(optimized_model_path)
-        api.upload_folder(
-            folder_path=optimized_model_path, repo_id=repo_url.repo_id,
-            repo_type="model", token=HF_TOKEN
-        )
-        final_message = f"✅ Success! Your optimized model is available at: [{repo_url.repo_id}](https://huggingface.co/{repo_url.repo_id})"
         log_stream += "Upload complete.\n"
         return final_message, log_stream
     except Exception as e:
         error_msg = f"Failed to upload to the Hub. Error: {e}"
         logging.error(error_msg, exc_info=True)
-        return f"❌ Error: {error_msg}", log_stream + error_msg
-# --- 3. MAIN WORKFLOW FUNCTION (GENERATOR FOR GRADIO 5+) ---
 def run_amop_pipeline(model_id: str, do_prune: bool, prune_percent: float):
-    """
-    This is now a generator function. It 'yields' updates to the UI
-    at each step, providing a real-time log.
-    """
     if not model_id:
-        yield "Please enter a Model ID.", ""
         return
-    full_log = "[START] AMOP Pipeline Initiated.\n"
-    yield gr.Markdown("🚀 Pipeline is running... Check logs for real-time updates."), full_log
     try:
         # Step 1: Load Model
         full_log += "Loading base model...\n"
-        yield gr.Markdown("🚀 Pipeline is running... (1/5) Loading model"), full_log
         model = AutoModel.from_pretrained(model_id, trust_remote_code=True)
         full_log += f"Successfully loaded base model '{model_id}'.\n"
         # Step 2: Pruning
-        yield gr.Markdown("🚀 Pipeline is running... (2/5) Pruning model"), full_log
         if do_prune:
             model, log = stage_2_prune_model(model, prune_percent)
             full_log += log
@@ -196,66 +160,83 @@ def run_amop_pipeline(model_id: str, do_prune: bool, prune_percent: float):
             full_log += "[STAGE 2] Pruning skipped by user.\n"
         # Step 3 & 4: ONNX Conversion
-        yield gr.Markdown("🚀 Pipeline is running... (3/5) Converting to ONNX & Quantizing"), full_log
         optimized_path, log = stage_3_and_4_quantize_and_onnx(model_id)
         full_log += log
-        # Step 5: Packaging
-        yield gr.Markdown("🚀 Pipeline is running... (4/5) Evaluating and Packaging"), full_log
         options = {'prune': do_prune, 'prune_percent': prune_percent}
-        final_status_msg, log = stage_5_evaluate_and_package(model_id, optimized_path, full_log, options)
         full_log += log
         # Final Step: Done
-        yield gr.Markdown(final_status_msg), full_log
     except Exception as e:
         logging.error(f"AMOP Pipeline failed. Error: {e}", exc_info=True)
         full_log += f"\n[ERROR] Pipeline failed: {e}"
-        yield f"❌ An error occurred during the pipeline. Check the logs for details.", full_log
-# --- 4. GRADIO USER INTERFACE (for Gradio 5+) ---
-with gr.Blocks(theme=gr.themes.Soft()) as demo:
-    gr.Markdown("# AMOP: Adaptive Model Optimization Pipeline")
-    gr.Markdown(
-        "**Turn any Hugging Face Hub model into a CPU-optimized version.** Enter a model ID, choose your optimizations, "
-        "and get a new, smaller, and faster model repository ready for deployment."
-    )
     if not HF_TOKEN:
         gr.Warning("You have not set your HF_TOKEN in the Space secrets! The final 'upload' step will be skipped. Please add a secret with the key `HF_TOKEN` and your Hugging Face write token as the value.")
     with gr.Row():
         with gr.Column(scale=1):
-            model_id_input = gr.Textbox(label="Hugging Face Model ID", placeholder="e.g., gpt2, bert-base-uncased")
-            analyze_button = gr.Button("1. Analyze Model")
-            with gr.Group(visible=False) as optimization_options:
-                gr.Markdown("### 2. Configure Optimization")
                 analysis_report_output = gr.Markdown()
-                prune_checkbox = gr.Checkbox(label="Enable Pruning (Stage 2)", value=False, info="Note: Pruning is applied conceptually; ONNX export uses the original model for wider compatibility in this version.")
                 prune_slider = gr.Slider(minimum=0, maximum=90, value=20, step=5, label="Pruning Percentage (%)")
-                gr.Checkbox(label="Enable Quantization & ONNX (Stages 3 & 4)", value=True, interactive=False)
-                run_button = gr.Button("3. Run Optimization Pipeline", variant="primary")
         with gr.Column(scale=2):
             gr.Markdown("### Pipeline Status & Logs")
-            final_output = gr.Markdown(value="*Pipeline has not been run yet.*", label="Final Result")
-            log_output = gr.Textbox(label="Live Logs", lines=20, interactive=False)
     analyze_button.click(
         fn=stage_1_analyze_model,
         inputs=[model_id_input],
-        outputs=[log_output, analysis_report_output, optimization_options]
     )
     run_button.click(
         fn=run_amop_pipeline,
         inputs=[model_id_input, prune_checkbox, prune_slider],
-        outputs=[final_output, log_output]
     )
 if __name__ == "__main__":

 os.makedirs(OUTPUT_DIR, exist_ok=True)
+# --- 2. AMOP CORE PIPELINE FUNCTIONS (Logic is the same) ---
 def stage_1_analyze_model(model_id: str):
     log_stream = "[STAGE 1] Analyzing model...\n"
     try:
         config = AutoConfig.from_pretrained(model_id, trust_remote_code=True)
         recommendation = ""
         if 'llama' in model_type or 'gpt' in model_type or 'mistral' in model_type:
             recommendation = "**Recommendation:** This is a large language model (LLM). For best CPU performance, a GGUF-based quantization strategy is typically state-of-the-art. This initial version of AMOP focuses on the ONNX pipeline. The recommended path is **Quantization -> ONNX Conversion**."
         else:
+            recommendation = "**Recommendation:** This is an encoder model or similar. The full AMOP pipeline is recommended for a balance of size and performance: **Pruning -> Quantization -> ONNX Conversion**."
         log_stream += f"Analysis complete. Architecture: {model_type}.\n"
+        ## UI/UX UPDATE ##: Return an open Accordion instead of a visible Group
+        return log_stream, analysis_report + "\n" + recommendation, gr.Accordion(open=True)
     except Exception as e:
         error_msg = f"Failed to analyze model '{model_id}'. Error: {e}"
         logging.error(error_msg)
+        return log_stream + error_msg, "Could not analyze model. Please check the model ID and try again.", gr.Accordion(open=False)
 def stage_2_prune_model(model, prune_percentage: float):
     """Run Stage 2: L1 unstructured pruning of every ``torch.nn.Linear`` layer.

     Each linear layer's ``weight`` is pruned in place with
     ``prune.l1_unstructured`` and the pruning is then made permanent with
     ``prune.remove``.

     Args:
         model: A module exposing ``modules()``; it is modified in place.
         prune_percentage: Share of each linear layer's weights to prune,
             expressed as a percentage (divided by 100 before being passed
             on as ``amount``). A value of 0 skips pruning entirely.

     Returns:
         A ``(model, log)`` tuple: the same model object that was passed in
         and a newline-terminated log string describing what was done.
     """
     if prune_percentage == 0:
         # Newline-terminated so that callers concatenating stage logs keep
         # one log entry per line instead of fusing it with the next stage.
         return model, "Skipped pruning as percentage was 0.\n"
     log_stream = "[STAGE 2] Pruning model...\n"
     amount = prune_percentage / 100.0
     for module in model.modules():
         if isinstance(module, torch.nn.Linear):
             prune.l1_unstructured(module, name='weight', amount=amount)
             # Make the pruning permanent: drop the reparametrization so the
             # layer is left with a plain (now sparse) ``weight`` parameter.
             prune.remove(module, 'weight')
     log_stream += "Pruning complete. Note: This version exports the original model to ONNX for maximum compatibility.\n"
     return model, log_stream
 def stage_3_and_4_quantize_and_onnx(model_id: str):
     log_stream = "[STAGE 3 & 4] Converting to ONNX and Quantizing...\n"
     try:
         run_id = datetime.now().strftime("%Y%m%d-%H%M%S")
         onnx_path = os.path.join(OUTPUT_DIR, f"{model_id.replace('/', '_')}-{run_id}-onnx")
         main_export(model_id, output=onnx_path, task="auto", trust_remote_code=True)
         log_stream += f"Successfully exported base model to ONNX at: {onnx_path}\n"
         quantizer = ORTQuantizer.from_pretrained(onnx_path)
         dqconfig = AutoQuantizationConfig.avx512_vnni(is_static=False, per_channel=False)
         quantized_path = os.path.join(onnx_path, "quantized")
         quantizer.quantize(save_dir=quantized_path, quantization_config=dqconfig)
         log_stream += f"Successfully quantized model to: {quantized_path}\n"
         return quantized_path, log_stream
     except Exception as e:
         logging.error(error_msg, exc_info=True)
         raise RuntimeError(error_msg)
+def stage_5_evaluate_and_package(model_id: str, optimized_model_path: str, pipeline_log: str, options: dict):
     log_stream = "[STAGE 5] Evaluating and Packaging...\n"
     try:
         ort_model = ORTModelForCausalLM.from_pretrained(optimized_model_path)
         tokenizer = AutoTokenizer.from_pretrained(model_id, trust_remote_code=True)
         prompt = "My name is Philipp and I"
         inputs = tokenizer(prompt, return_tensors="pt")
         start_time = time.time()
         gen_tokens = ort_model.generate(**inputs, max_new_tokens=20)
         end_time = time.time()
         latency = (end_time - start_time) * 1000
         num_tokens = len(gen_tokens[0]) - inputs.input_ids.shape[1]
         ms_per_token = latency / num_tokens if num_tokens > 0 else float('inf')
+        eval_report = f"- **Inference Latency:** {latency:.2f} ms\n- **Speed:** {ms_per_token:.2f} ms/token\n"
         log_stream += "Evaluation complete.\n"
     except Exception as e:
         eval_report = f"- **Evaluation Failed:** Could not run generation. This often happens if the base model is not a text-generation model. Error: {e}\n"
     if not HF_TOKEN:
         return "Skipping upload: HF_TOKEN not found.", log_stream + "Skipping upload: HF_TOKEN not found."
     try:
         repo_name = f"{model_id.split('/')[-1]}-amop-cpu"
         repo_url = api.create_repo(repo_id=repo_name, exist_ok=True, token=HF_TOKEN)
+        with open("model_card_template.md", "r", encoding="utf-8") as f: template_content = f.read()
         model_card_content = template_content.format(
+            repo_name=repo_name, model_id=model_id, optimization_date=datetime.now().strftime("%Y-%m-%d %H:%M:%S"),
             eval_report=eval_report, pruning_status="Enabled" if options['prune'] else "Disabled",
+            pruning_percent=options['prune_percent'], repo_id=repo_url.repo_id, pipeline_log=pipeline_log
         )
         readme_path = os.path.join(optimized_model_path, "README.md")
         with open(readme_path, "w", encoding="utf-8") as f: f.write(model_card_content)
         tokenizer = AutoTokenizer.from_pretrained(model_id, trust_remote_code=True)
         tokenizer.save_pretrained(optimized_model_path)
+        api.upload_folder(folder_path=optimized_model_path, repo_id=repo_url.repo_id, repo_type="model", token=HF_TOKEN)
+        final_message = f"Success! Your optimized model is available at: huggingface.co/{repo_url.repo_id}"
         log_stream += "Upload complete.\n"
         return final_message, log_stream
     except Exception as e:
         error_msg = f"Failed to upload to the Hub. Error: {e}"
         logging.error(error_msg, exc_info=True)
+        return f"Error: {error_msg}", log_stream + error_msg
+# --- 3. MAIN WORKFLOW GENERATOR (HEAVILY UPDATED FOR UI/UX) ---
 def run_amop_pipeline(model_id: str, do_prune: bool, prune_percent: float):
     if not model_id:
+        yield {log_output: "Please enter a Model ID.", final_output: gr.Label(value="Idle", label="Status")}
         return
+    ## UI/UX UPDATE ##: Yield dictionaries to update multiple components at once.
+    # This provides immediate feedback that the process has started.
+    initial_log = "[START] AMOP Pipeline Initiated.\n"
+    yield {
+        run_button: gr.Button(interactive=False, value="🚀 Running..."),
+        analyze_button: gr.Button(interactive=False),
+        final_output: gr.Label(value={"label": "RUNNING", "confidences": None}, label="Status", show_label=True),
+        log_output: initial_log
+    }
+    full_log = initial_log
     try:
         # Step 1: Load Model
         full_log += "Loading base model...\n"
+        yield {final_output: gr.Label(value={"label": "Loading model (1/5)"}), log_output: full_log}
         model = AutoModel.from_pretrained(model_id, trust_remote_code=True)
         full_log += f"Successfully loaded base model '{model_id}'.\n"
         # Step 2: Pruning
+        yield {final_output: gr.Label(value={"label": "Pruning model (2/5)"}), log_output: full_log}
         if do_prune:
             model, log = stage_2_prune_model(model, prune_percent)
             full_log += log
             full_log += "[STAGE 2] Pruning skipped by user.\n"
         # Step 3 & 4: ONNX Conversion
+        yield {final_output: gr.Label(value={"label": "Converting to ONNX (3/5)"}), log_output: full_log}
         optimized_path, log = stage_3_and_4_quantize_and_onnx(model_id)
         full_log += log
+        # Step 5: Packaging and Evaluation
+        yield {final_output: gr.Label(value={"label": "Packaging & Uploading (4/5)"}), log_output: full_log}
         options = {'prune': do_prune, 'prune_percent': prune_percent}
+        final_message, log = stage_5_evaluate_and_package(model_id, optimized_path, full_log, options)
         full_log += log
         # Final Step: Done
+        yield {
+            final_output: gr.Label(value={"label": "SUCCESS", "confidences": None}, label="Status"),
+            log_output: full_log,
+            ## UI/UX UPDATE ##: Add a markdown component with a clickable link for the final result.
+            success_box: gr.Markdown(f"✅ **Success!** Your optimized model is available here: [{model_id}-amop-cpu](https://huggingface.co/{api.whoami()['name']}/{model_id.split('/')[-1]}-amop-cpu)", visible=True),
+            run_button: gr.Button(interactive=True, value="3. Run Optimization Pipeline", variant="primary"),
+            analyze_button: gr.Button(interactive=True, value="1. Analyze Model")
+        }
     except Exception as e:
         logging.error(f"AMOP Pipeline failed. Error: {e}", exc_info=True)
         full_log += f"\n[ERROR] Pipeline failed: {e}"
+        yield {
+            final_output: gr.Label(value={"label": "ERROR", "confidences": None}, label="Status"),
+            log_output: full_log,
+            success_box: gr.Markdown(f"❌ **An error occurred.** Check the logs for details.", visible=True),
+            run_button: gr.Button(interactive=True, value="3. Run Optimization Pipeline", variant="primary"),
+            analyze_button: gr.Button(interactive=True, value="1. Analyze Model")
+        }
+# --- 4. GRADIO USER INTERFACE (HEAVILY UPDATED FOR UI/UX) ---
+with gr.Blocks(theme=gr.themes.Glass(), css=".gradio-container {background-color: #f5f5f5}") as demo:
+    gr.Markdown("# 🚀 AMOP: Adaptive Model Optimization Pipeline")
+    gr.Markdown("Turn any Hugging Face Hub model into a CPU-optimized ONNX version. Follow the steps below.")
     if not HF_TOKEN:
         gr.Warning("You have not set your HF_TOKEN in the Space secrets! The final 'upload' step will be skipped. Please add a secret with the key `HF_TOKEN` and your Hugging Face write token as the value.")
     with gr.Row():
         with gr.Column(scale=1):
+            gr.Markdown("### 1. Select a Model")
+            model_id_input = gr.Textbox(
+                label="Hugging Face Model ID",
+                placeholder="e.g., gpt2, bert-base-uncased",
+                info="Enter the ID of a model from the Hub."
+            )
+            analyze_button = gr.Button("🔍 Analyze Model", variant="secondary")
+            ## UI/UX UPDATE ##: Use an Accordion. It's closed by default, keeping the UI clean.
+            with gr.Accordion("⚙️ 2. Configure Optimization", open=False) as optimization_accordion:
                 analysis_report_output = gr.Markdown()
+                prune_checkbox = gr.Checkbox(label="Enable Pruning (Stage 2)", value=False, info="Removes redundant weights from the model.")
                 prune_slider = gr.Slider(minimum=0, maximum=90, value=20, step=5, label="Pruning Percentage (%)")
+                run_button = gr.Button("🚀 3. Run Optimization Pipeline", variant="primary")
         with gr.Column(scale=2):
             gr.Markdown("### Pipeline Status & Logs")
+            ## UI/UX UPDATE ##: Use gr.Label for a clean, prominent status indicator.
+            final_output = gr.Label(value="Idle", label="Status", show_label=True)
+            ## UI/UX UPDATE ##: Add a dedicated box for the final success/error message.
+            success_box = gr.Markdown(visible=False)
+            log_output = gr.Textbox(label="Live Logs", lines=20, interactive=False, max_lines=20)
+    # Event Handlers
     analyze_button.click(
         fn=stage_1_analyze_model,
         inputs=[model_id_input],
+        outputs=[log_output, analysis_report_output, optimization_accordion]
     )
     run_button.click(
         fn=run_amop_pipeline,
         inputs=[model_id_input, prune_checkbox, prune_slider],
+        outputs=[run_button, analyze_button, final_output, log_output, success_box]
     )
 if __name__ == "__main__":