broadfield-dev committed on
Commit f074b57 · verified · 1 Parent(s): b0fc6c5

Update app.py

Files changed (1)
  1. app.py +49 -77
app.py CHANGED
@@ -3,7 +3,7 @@ import torch
 import os
 import logging
 from datetime import datetime
-from huggingface_hub import HfApi, HfFolder
+from huggingface_hub import HfApi
 from transformers import AutoConfig, AutoModel, AutoTokenizer
 from optimum.onnxruntime import ORTQuantizer, ORTModelForCausalLM
 from optimum.onnxruntime.configuration import AutoQuantizationConfig
@@ -13,10 +13,8 @@ import time
 
 # --- 1. SETUP AND CONFIGURATION ---
 
-# Setup basic logging
 logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(levelname)s - %(message)s')
 
-# Ensure the user has set their Hugging Face token in the Space secrets
 HF_TOKEN = os.getenv("HF_TOKEN")
 if not HF_TOKEN:
     logging.warning("HF_TOKEN environment variable not set. Packaging and uploading will fail.")
@@ -35,15 +33,13 @@ def stage_1_analyze_model(model_id: str):
     """
     log_stream = "[STAGE 1] Analyzing model...\n"
     try:
-        config = AutoConfig.from_pretrained(model_id)
+        config = AutoConfig.from_pretrained(model_id, trust_remote_code=True)
         model_type = config.model_type
-        num_params = getattr(config, "num_hidden_layers", "N/A") * getattr(config, "hidden_size", 0) / 1e6 # A rough estimate
 
         analysis_report = f"""
 ### Model Analysis Report
 - **Model ID:** `{model_id}`
 - **Architecture:** `{model_type}`
-- **Estimated Parameters:** ~{num_params:.2f}M
 """
 
         recommendation = ""
@@ -57,45 +53,30 @@ def stage_1_analyze_model(model_id: str):
         recommendation = "**Recommendation:** Unrecognized architecture. The standard path of **Quantization -> ONNX Conversion** is a safe starting point."
 
         log_stream += f"Analysis complete. Architecture: {model_type}.\n"
-        return log_stream, analysis_report + "\n" + recommendation, gr.update(visible=True)
+        # GRADIO 5 UPDATE: Instead of gr.update(), return a new component object.
+        return log_stream, analysis_report + "\n" + recommendation, gr.Group(visible=True)
     except Exception as e:
         error_msg = f"Failed to analyze model '{model_id}'. Error: {e}"
         logging.error(error_msg)
-        return log_stream + error_msg, "Could not analyze model. Please check the model ID and try again.", gr.update(visible=False)
+        return log_stream + error_msg, "Could not analyze model. Please check the model ID and try again.", gr.Group(visible=False)
 
 
-def stage_2_prune_model(model, prune_percentage: float, progress):
-    """
-    Performs Stage 2: Structural Reduction via one-shot unstructured pruning.
-    """
+def stage_2_prune_model(model, prune_percentage: float):
     if prune_percentage == 0:
         return model, "Skipped pruning as percentage was 0."
 
     log_stream = "[STAGE 2] Pruning model...\n"
-    progress(0.25, desc="Applying Unstructured Pruning")
-
-    total_params = sum(p.numel() for p in model.parameters())
-
     for name, module in model.named_modules():
         if isinstance(module, torch.nn.Linear):
             prune.l1_unstructured(module, name='weight', amount=prune_percentage / 100.0)
-            prune.remove(module, 'weight') # Makes the pruning permanent
-
-    pruned_params = sum(p.numel() for p in model.parameters())
-    reduction = (total_params - pruned_params) / total_params * 100
+            prune.remove(module, 'weight')
 
-    log_stream += f"Pruning complete. Parameter reduction: ~{reduction:.2f}%\n"
+    log_stream += f"Pruning complete. Note: This version exports the original model to ONNX for maximum compatibility.\n"
     return model, log_stream
 
 
-def stage_3_and_4_quantize_and_onnx(model_id: str, progress):
-    """
-    Performs Stage 3 (Quantization) and Stage 4 (ONNX Conversion).
-    This version uses post-training dynamic quantization.
-    """
+def stage_3_and_4_quantize_and_onnx(model_id: str):
     log_stream = "[STAGE 3 & 4] Converting to ONNX and Quantizing...\n"
-    progress(0.5, desc="Exporting to ONNX")
-
     try:
         run_id = datetime.now().strftime("%Y%m%d-%H%M%S")
         onnx_path = os.path.join(OUTPUT_DIR, f"{model_id.replace('/', '_')}-{run_id}-onnx")
@@ -104,16 +85,14 @@ def stage_3_and_4_quantize_and_onnx(model_id: str, progress):
         main_export(model_id, output=onnx_path, task="auto", trust_remote_code=True)
         log_stream += f"Successfully exported base model to ONNX at: {onnx_path}\n"
 
-        progress(0.7, desc="Applying Dynamic Quantization")
         quantizer = ORTQuantizer.from_pretrained(onnx_path)
-        dqconfig = AutoQuantizationConfig.avx512_vnni(is_static=False, per_channel=False) # Dynamic quantization for CPUs
+        dqconfig = AutoQuantizationConfig.avx512_vnni(is_static=False, per_channel=False)
 
         quantized_path = os.path.join(onnx_path, "quantized")
         quantizer.quantize(save_dir=quantized_path, quantization_config=dqconfig)
 
         log_stream += f"Successfully quantized model to: {quantized_path}\n"
         return quantized_path, log_stream
-
     except Exception as e:
         error_msg = f"Failed during ONNX conversion/quantization. Error: {e}"
         logging.error(error_msg, exc_info=True)
@@ -124,15 +103,9 @@ def stage_5_evaluate_and_package(
     model_id: str,
     optimized_model_path: str,
     pipeline_log: str,
-    options: dict,
-    progress
+    options: dict
 ):
-    """
-    Performs Stage 5: Evaluation, Packaging, and Uploading.
-    """
     log_stream = "[STAGE 5] Evaluating and Packaging...\n"
-    progress(0.9, desc="Evaluating performance")
-
     try:
         ort_model = ORTModelForCausalLM.from_pretrained(optimized_model_path)
         tokenizer = AutoTokenizer.from_pretrained(model_id, trust_remote_code=True)
@@ -145,18 +118,16 @@ def stage_5_evaluate_and_package(
         end_time = time.time()
 
         latency = (end_time - start_time) * 1000
-        num_tokens = len(gen_tokens[0])
-        ms_per_token = latency / num_tokens
+        num_tokens = len(gen_tokens[0]) - inputs.input_ids.shape[1]
+        ms_per_token = latency / num_tokens if num_tokens > 0 else float('inf')
 
         eval_report = f"- **Inference Latency:** {latency:.2f} ms\n"
         eval_report += f"- **Speed:** {ms_per_token:.2f} ms/token\n"
         log_stream += "Evaluation complete.\n"
     except Exception as e:
-        eval_report = f"- **Evaluation Failed:** Could not load and test the ONNX model. This often happens if the base model is not a text-generation model. Error: {e}\n"
+        eval_report = f"- **Evaluation Failed:** Could not run generation. This often happens if the base model is not a text-generation model. Error: {e}\n"
         log_stream += f"Warning: Evaluation failed. {e}\n"
 
-    progress(0.95, desc="Uploading to Hugging Face Hub")
-
     if not HF_TOKEN:
         return "Skipping upload: HF_TOKEN not found.", log_stream + "Skipping upload: HF_TOKEN not found."
 
@@ -164,39 +135,29 @@ def stage_5_evaluate_and_package(
         repo_name = f"{model_id.split('/')[-1]}-amop-cpu"
         repo_url = api.create_repo(repo_id=repo_name, exist_ok=True, token=HF_TOKEN)
 
-        # --- THIS IS THE UPDATED SECTION ---
-        # Read the template file
         with open("model_card_template.md", "r", encoding="utf-8") as f:
             template_content = f.read()
 
-        # Fill in the placeholders
         model_card_content = template_content.format(
-            repo_name=repo_name,
-            model_id=model_id,
+            repo_name=repo_name, model_id=model_id,
             optimization_date=datetime.now().strftime("%Y-%m-%d %H:%M:%S"),
-            eval_report=eval_report,
-            pruning_status="Enabled" if options['prune'] else "Disabled",
-            pruning_percent=options['prune_percent'],
-            repo_id=repo_url.repo_id,
+            eval_report=eval_report, pruning_status="Enabled" if options['prune'] else "Disabled",
+            pruning_percent=options['prune_percent'], repo_id=repo_url.repo_id,
             pipeline_log=pipeline_log
         )
-        # --- END OF UPDATED SECTION ---
 
         readme_path = os.path.join(optimized_model_path, "README.md")
-        with open(readme_path, "w", encoding="utf-8") as f:
-            f.write(model_card_content)
+        with open(readme_path, "w", encoding="utf-8") as f: f.write(model_card_content)
 
         tokenizer = AutoTokenizer.from_pretrained(model_id, trust_remote_code=True)
         tokenizer.save_pretrained(optimized_model_path)
 
         api.upload_folder(
-            folder_path=optimized_model_path,
-            repo_id=repo_url.repo_id,
-            repo_type="model",
-            token=HF_TOKEN
+            folder_path=optimized_model_path, repo_id=repo_url.repo_id,
+            repo_type="model", token=HF_TOKEN
         )
 
-        final_message = f"✅ Success! Your optimized model is available at: {repo_url}"
+        final_message = f"✅ Success! Your optimized model is available at: [{repo_url.repo_id}](https://huggingface.co/{repo_url.repo_id})"
         log_stream += "Upload complete.\n"
         return final_message, log_stream
     except Exception as e:
@@ -205,44 +166,56 @@ def stage_5_evaluate_and_package(
         return f"❌ Error: {error_msg}", log_stream + error_msg
 
 
-# --- 3. MAIN WORKFLOW FUNCTION ---
+# --- 3. MAIN WORKFLOW FUNCTION (GENERATOR FOR GRADIO 5+) ---
 
-def run_amop_pipeline(model_id: str, do_prune: bool, prune_percent: float, progress=gr.Progress(track_tqdm=True)):
+def run_amop_pipeline(model_id: str, do_prune: bool, prune_percent: float):
+    """
+    This is now a generator function. It 'yields' updates to the UI
+    at each step, providing a real-time log.
+    """
     if not model_id:
-        return "Please enter a Model ID.", ""
-
+        yield "Please enter a Model ID.", ""
+        return
+
     full_log = "[START] AMOP Pipeline Initiated.\n"
-    progress(0, desc="Loading Base Model")
+    yield gr.Markdown("🚀 Pipeline is running... Check logs for real-time updates."), full_log
 
     try:
+        # Step 1: Load Model
+        full_log += "Loading base model...\n"
+        yield gr.Markdown("🚀 Pipeline is running... (1/5) Loading model"), full_log
         model = AutoModel.from_pretrained(model_id, trust_remote_code=True)
         full_log += f"Successfully loaded base model '{model_id}'.\n"
 
+        # Step 2: Pruning
+        yield gr.Markdown("🚀 Pipeline is running... (2/5) Pruning model"), full_log
         if do_prune:
-            model, log = stage_2_prune_model(model, prune_percent, progress)
+            model, log = stage_2_prune_model(model, prune_percent)
             full_log += log
         else:
            full_log += "[STAGE 2] Pruning skipped by user.\n"
 
-        # We re-export the pruned model, so it needs to be saved and reloaded by optimum
-        # For simplicity in V1, we will export the original model from the hub
-        # A future version could handle the pruned model state_dict
-        optimized_path, log = stage_3_and_4_quantize_and_onnx(model_id, progress)
+        # Step 3 & 4: ONNX Conversion
+        yield gr.Markdown("🚀 Pipeline is running... (3/5) Converting to ONNX & Quantizing"), full_log
+        optimized_path, log = stage_3_and_4_quantize_and_onnx(model_id)
         full_log += log
 
+        # Step 5: Packaging
+        yield gr.Markdown("🚀 Pipeline is running... (4/5) Evaluating and Packaging"), full_log
         options = {'prune': do_prune, 'prune_percent': prune_percent}
-        final_status, log = stage_5_evaluate_and_package(model_id, optimized_path, full_log, options, progress)
+        final_status_msg, log = stage_5_evaluate_and_package(model_id, optimized_path, full_log, options)
         full_log += log
-
-        return final_status, full_log
+
+        # Final Step: Done
+        yield gr.Markdown(final_status_msg), full_log
 
     except Exception as e:
         logging.error(f"AMOP Pipeline failed. Error: {e}", exc_info=True)
         full_log += f"\n[ERROR] Pipeline failed: {e}"
-        return f"❌ An error occurred during the pipeline. Check the logs for details.", full_log
+        yield f"❌ An error occurred during the pipeline. Check the logs for details.", full_log
 
 
-# --- 4. GRADIO USER INTERFACE ---
+# --- 4. GRADIO USER INTERFACE (for Gradio 5+) ---
 
 with gr.Blocks(theme=gr.themes.Soft()) as demo:
     gr.Markdown("# AMOP: Adaptive Model Optimization Pipeline")
@@ -266,12 +239,11 @@ with gr.Blocks(theme=gr.themes.Soft()) as demo:
             prune_slider = gr.Slider(minimum=0, maximum=90, value=20, step=5, label="Pruning Percentage (%)")
 
             gr.Checkbox(label="Enable Quantization & ONNX (Stages 3 & 4)", value=True, interactive=False)
-
             run_button = gr.Button("3. Run Optimization Pipeline", variant="primary")
 
         with gr.Column(scale=2):
             gr.Markdown("### Pipeline Status & Logs")
-            final_output = gr.Markdown(label="Final Result")
+            final_output = gr.Markdown(value="*Pipeline has not been run yet.*", label="Final Result")
             log_output = gr.Textbox(label="Live Logs", lines=20, interactive=False)
 
     analyze_button.click(
 
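
Note on the pattern this diff adopts: run_amop_pipeline becomes a generator, and Gradio streams each yield to the outputs wired up in run_button.click(...), replacing the old gr.Progress callbacks. Below is a minimal sketch of that generator-handler pattern, assuming only that gradio is installed; the component names and the fake_pipeline function are illustrative stand-ins, not taken from app.py.

import time
import gradio as gr

def fake_pipeline(model_id: str):
    # Illustrative generator handler: each yield updates (status, logs) in the UI.
    log = ""
    if not model_id:
        yield "Please enter a Model ID.", log
        return
    for i, step in enumerate(["Loading model", "Pruning", "Exporting to ONNX"], start=1):
        log += f"[{i}/3] {step}...\n"
        yield f"Running: {step}", log   # streamed to the outputs listed in .click()
        time.sleep(0.5)                 # stand-in for real work
    yield "Done", log

with gr.Blocks() as demo:
    model_box = gr.Textbox(label="Model ID")
    run_btn = gr.Button("Run")
    status = gr.Markdown()
    logs = gr.Textbox(label="Live Logs", lines=8)
    # Because fake_pipeline is a generator, every yield refreshes these two outputs.
    run_btn.click(fake_pipeline, inputs=model_box, outputs=[status, logs])

demo.launch()

Each yield replaces the previous values of the two outputs, which is what gives the app a live status line and a growing log without per-stage progress callbacks.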