Spaces:

codemichaeld
/

new03

Running

App Files Files Community

codemichaeld commited on 12 days ago

Commit

9f9518a

verified ·

1 Parent(s): 9efc461

Update app.py

Browse files

Files changed (1) hide show

app.py +163 -138

app.py CHANGED Viewed

@@ -4,6 +4,7 @@ import tempfile
 import shutil
 import re
 import json
 from pathlib import Path
 from huggingface_hub import HfApi, hf_hub_download
 from safetensors.torch import load_file, save_file
@@ -16,34 +17,37 @@ try:
 except ImportError:
     MODELScope_AVAILABLE = False
-def extract_correction_factors(original_weight, fp8_weight):
-    """Extract per-channel/tensor correction factors instead of LoRA decomposition."""
-    with torch.no_grad():
-        # Convert to float32 for precision
-        orig = original_weight.float()
-        quant = fp8_weight.float()
-        # Compute error (what needs to be added to FP8 to recover original)
-        error = orig - quant
-        # Skip if error is negligible
-        error_norm = torch.norm(error)
-        orig_norm = torch.norm(orig)
-        if orig_norm > 1e-6 and error_norm / orig_norm < 0.01:
-            return None
-        # For 2D+ tensors, compute per-channel correction (better than LoRA for quantization error)
-        if orig.ndim >= 2:
-            # Find channel dimension - typically dim 0 for most layers
-            channel_dim = 0
-            channel_mean = error.mean(dim=tuple(i for i in range(orig.ndim) if i != channel_dim), keepdim=True)
-            return channel_mean.to(original_weight.dtype)
-        else:
-            # For bias/batchnorm etc., use scalar correction
-            return error.mean().to(original_weight.dtype)
-def convert_safetensors_to_fp8_with_correction(safetensors_path, output_dir, fp8_format, correction_mode="per_channel", progress=gr.Progress()):
-    progress(0.1, desc="Starting FP8 conversion with precision recovery...")
     try:
         def read_safetensors_metadata(path):
             with open(path, 'rb') as f:
@@ -55,8 +59,7 @@ def convert_safetensors_to_fp8_with_correction(safetensors_path, output_dir, fp8
         metadata = read_safetensors_metadata(safetensors_path)
         progress(0.2, desc="Loaded metadata.")
-        # Load original weights for comparison
-        original_state = load_file(safetensors_path)
         progress(0.4, desc="Loaded weights.")
         if fp8_format == "e5m2":
@@ -65,66 +68,104 @@ def convert_safetensors_to_fp8_with_correction(safetensors_path, output_dir, fp8
             fp8_dtype = torch.float8_e4m3fn
         sd_fp8 = {}
-        correction_factors = {}
-        correction_stats = {
-            "total_layers": len(original_state),
-            "layers_with_correction": 0,
             "skipped_layers": []
         }
-        total = len(original_state)
-        for i, key in enumerate(original_state):
             progress(0.4 + 0.4 * (i / total), desc=f"Processing {i+1}/{total}...")
-            weight = original_state[key]
             if weight.dtype in [torch.float16, torch.float32, torch.bfloat16]:
-                # Convert to FP8
                 fp8_weight = weight.to(fp8_dtype)
                 sd_fp8[key] = fp8_weight
-                # Generate correction factors
-                if correction_mode != "none":
-                    corr = extract_correction_factors(weight, fp8_weight)
-                    if corr is not None:
-                        correction_factors[f"correction.{key}"] = corr
-                        correction_stats["layers_with_correction"] += 1
-                    else:
-                        correction_stats["skipped_layers"].append(f"{key}: negligible error")
             else:
-                # Non-float weights (int, bool, etc.) - keep as is
                 sd_fp8[key] = weight
-                correction_stats["skipped_layers"].append(f"{key}: non-float dtype")
         base_name = os.path.splitext(os.path.basename(safetensors_path))[0]
         fp8_path = os.path.join(output_dir, f"{base_name}-fp8-{fp8_format}.safetensors")
-        correction_path = os.path.join(output_dir, f"{base_name}-correction.safetensors")
-        # Save FP8 model
         save_file(sd_fp8, fp8_path, metadata={"format": "pt", "fp8_format": fp8_format, **metadata})
-        # Save correction factors if any exist
-        if correction_factors:
-            save_file(correction_factors, correction_path, metadata={
-                "format": "pt",
-                "correction_mode": correction_mode,
-                "stats": json.dumps(correction_stats)
-            })
-        progress(0.9, desc="Saved FP8 and correction files.")
-        progress(1.0, desc="✅ FP8 conversion with precision recovery complete!")
-        stats_msg = f"""
-📊 Precision Recovery Statistics:
-- Total layers: {correction_stats['total_layers']}
-- Layers with correction: {correction_stats['layers_with_correction']}
-- Correction mode: {correction_mode}
-"""
-        return True, f"FP8 ({fp8_format}) with precision recovery saved.\n{stats_msg}", correction_stats
     except Exception as e:
         import traceback
-        return False, f"Error: {str(e)}\n{traceback.format_exc()}", None
 def parse_hf_url(url):
     url = url.strip().rstrip("/")
@@ -187,7 +228,8 @@ def process_and_upload_fp8(
     repo_url,
     safetensors_filename,
     fp8_format,
-    correction_mode,
     target_type,
     new_repo_id,
     hf_token,
@@ -201,6 +243,8 @@ def process_and_upload_fp8(
         return None, "❌ Hugging Face token required for source.", ""
     if target_type == "huggingface" and not hf_token:
         return None, "❌ Hugging Face token required for target.", ""
     temp_dir = None
     output_dir = tempfile.mkdtemp()
@@ -210,9 +254,9 @@ def process_and_upload_fp8(
             source_type, repo_url, safetensors_filename, hf_token, progress
         )
-        progress(0.25, desc="Converting to FP8 with precision recovery...")
-        success, msg, stats = convert_safetensors_to_fp8_with_correction(
-            safetensors_path, output_dir, fp8_format, correction_mode, progress
         )
         if not success:
@@ -224,7 +268,7 @@ def process_and_upload_fp8(
         )
         base_name = os.path.splitext(safetensors_filename)[0]
-        correction_filename = f"{base_name}-correction.safetensors"
         fp8_filename = f"{base_name}-fp8-{fp8_format}.safetensors"
         readme = f"""---
@@ -232,51 +276,39 @@ library_name: diffusers
 tags:
 - fp8
 - safetensors
-- quantization
-- precision-recovery
 - diffusion
 - converted-by-gradio
 ---
-# FP8 Model with Precision Recovery
 - **Source**: `{repo_url}`
 - **File**: `{safetensors_filename}`
 - **FP8 Format**: `{fp8_format.upper()}`
-- **Correction Mode**: {correction_mode}
-- **Correction File**: `{correction_filename}`
 - **FP8 File**: `{fp8_filename}`
 ## Usage (Inference)
 ```python
 from safetensors.torch import load_file
 import torch
-# Load FP8 model and correction factors
 fp8_state = load_file("{fp8_filename}")
-correction_state = load_file("{correction_filename}") if os.path.exists("{correction_filename}") else {{}}
-# Reconstruct high-precision weights
 reconstructed = {{}}
 for key in fp8_state:
-    fp8_weight = fp8_state[key].to(torch.float32)
-    # Apply correction if available
-    correction_key = f"correction.{{key}}"
-    if correction_key in correction_state:
-        correction = correction_state[correction_key].to(torch.float32)
-        reconstructed[key] = fp8_weight + correction
     else:
-        reconstructed[key] = fp8_weight
-# Use reconstructed weights in your model
-model.load_state_dict(reconstructed)
 ```
-## Correction Modes
-- **Per-Channel**: Computes mean correction per output channel (best for most layers)
-- **Per-Tensor**: Single correction value per tensor (lightweight)
-- **None**: No correction (pure FP8)
-> Requires PyTorch ≥ 2.1 for FP8 support. For best quality, use the correction file during inference.
 """
         with open(os.path.join(output_dir, "README.md"), "w") as f:
@@ -295,22 +327,23 @@ model.load_state_dict(reconstructed)
         result_html = f"""
 ✅ Success!
 Model uploaded to: <a href="{repo_url_final}" target="_blank">{new_repo_id}</a>
-Includes: FP8 model + precision recovery corrections.
 """
-        return gr.HTML(result_html), "✅ FP8 conversion with precision recovery successful!", msg
     except Exception as e:
         import traceback
-        return None, f"❌ Error: {str(e)}\n{traceback.format_exc()}", ""
     finally:
         if temp_dir:
             shutil.rmtree(temp_dir, ignore_errors=True)
         shutil.rmtree(output_dir, ignore_errors=True)
-with gr.Blocks(title="FP8 Quantizer with Precision Recovery") as demo:
-    gr.Markdown("# 🔄 FP8 Quantizer with Precision Recovery")
-    gr.Markdown("Convert `.safetensors` → **FP8** + **correction factors** to recover quantization precision. Supports Hugging Face ↔ ModelScope.")
     with gr.Row():
         with gr.Column():
@@ -318,16 +351,19 @@ with gr.Blocks(title="FP8 Quantizer with Precision Recovery") as demo:
             repo_url = gr.Textbox(label="Repo URL or ID", placeholder="https://huggingface.co/... or modelscope-id")
             safetensors_filename = gr.Textbox(label="Filename", placeholder="model.safetensors")
-            with gr.Accordion("Quantization Settings", open=True):
                 fp8_format = gr.Radio(["e4m3fn", "e5m2"], value="e5m2", label="FP8 Format")
-                correction_mode = gr.Dropdown(
                     choices=[
-                        ("Per-Channel Correction (recommended)", "per_channel"),
-                        ("Per-Tensor Correction", "per_tensor"),
-                        ("No Correction (pure FP8)", "none")
                     ],
-                    value="per_channel",
-                    label="Precision Recovery Mode"
                 )
             with gr.Accordion("Authentication", open=False):
@@ -336,7 +372,7 @@ with gr.Blocks(title="FP8 Quantizer with Precision Recovery") as demo:
         with gr.Column():
             target_type = gr.Radio(["huggingface", "modelscope"], value="huggingface", label="Target")
-            new_repo_id = gr.Textbox(label="New Repo ID", placeholder="user/model-fp8")
             private_repo = gr.Checkbox(label="Private Repository (HF only)", value=False)
             status_output = gr.Markdown()
@@ -352,7 +388,8 @@ with gr.Blocks(title="FP8 Quantizer with Precision Recovery") as demo:
             repo_url,
             safetensors_filename,
             fp8_format,
-            correction_mode,
             target_type,
             new_repo_id,
             hf_token,
@@ -365,37 +402,25 @@ with gr.Blocks(title="FP8 Quantizer with Precision Recovery") as demo:
     gr.Examples(
         examples=[
-            ["huggingface", "https://huggingface.co/Yabo/FramePainter/tree/main", "unet_diffusion_pytorch_model.safetensors", "e5m2", "per_channel", "huggingface"],
-            ["huggingface", "https://huggingface.co/stabilityai/sdxl-vae", "diffusion_pytorch_model.safetensors", "e4m3fn", "per_channel", "huggingface"],
-            ["huggingface", "https://huggingface.co/runwayml/stable-diffusion-v1-5/tree/main/text_encoder", "model.safetensors", "e5m2", "per_channel", "huggingface"]
         ],
-        inputs=[source_type, repo_url, safetensors_filename, fp8_format, correction_mode, target_type],
         label="Example Conversions"
     )
     gr.Markdown("""
-    ## 💡 Why This Works Better Than LoRA
-    Traditional LoRA struggles with quantization errors because:
-    - LoRA is designed for *weight updates*, not *quantization error recovery*
-    - Per-channel correction captures systematic quantization bias better
-    - Simpler math → more reliable reconstruction
-    ## 📊 Precision Recovery Modes
-    - **Per-Channel (recommended)**: One correction value per output channel
-      - Best quality, moderate file size increase (~5-10%)
-      - Handles channel-wise quantization bias effectively
-    - **Per-Tensor**: One correction value per tensor
-      - Good balance of quality and file size
-      - Better than no correction for most layers
-    - **None**: Pure FP8 quantization
-      - Smallest file size
-      - Lowest quality (use only for memory-constrained deployments)
-    > **Note**: For diffusion models, per-channel correction typically recovers 95%+ of FP16 quality while keeping 70-80% of FP8's memory savings.
     """)
 demo.launch()

 import shutil
 import re
 import json
+import datetime
 from pathlib import Path
 from huggingface_hub import HfApi, hf_hub_download
 from safetensors.torch import load_file, save_file
 except ImportError:
     MODELScope_AVAILABLE = False
+def low_rank_decomposition(weight, rank=128):
+    """
+    Improved LoRA decomposition that maintains compatibility with existing merge scripts.
+    This implementation focuses on extracting meaningful low-rank components from 2D weights.
+    """
+    if weight.ndim != 2:
+        return None, None
+    try:
+        # Convert to float32 for numerical stability during SVD
+        weight_f32 = weight.float()
+        # Perform SVD
+        U, S, Vh = torch.linalg.svd(weight_f32, full_matrices=False)
+        # Ensure rank doesn't exceed available singular values
+        actual_rank = min(rank, len(S))
+        # Create LoRA matrices using standard factorization
+        # W ≈ U[:, :r] * diag(S[:r]) * Vh[:r, :]
+        # We split as: A = Vh[:r, :], B = U[:, :r] * diag(S[:r])
+        A = Vh[:actual_rank, :].contiguous()
+        B = U[:, :actual_rank] @ torch.diag(S[:actual_rank])
+        return A.to(torch.float16), B.to(torch.float16)
+    except Exception as e:
+        print(f"Decomposition error: {e}")
+        return None, None
+def convert_safetensors_to_fp8_with_lora(safetensors_path, output_dir, fp8_format, lora_rank=128, architecture="auto", progress=gr.Progress()):
+    progress(0.1, desc="Starting FP8 conversion with LoRA extraction...")
     try:
         def read_safetensors_metadata(path):
             with open(path, 'rb') as f:
         metadata = read_safetensors_metadata(safetensors_path)
         progress(0.2, desc="Loaded metadata.")
+        state_dict = load_file(safetensors_path)
         progress(0.4, desc="Loaded weights.")
         if fp8_format == "e5m2":
             fp8_dtype = torch.float8_e4m3fn
         sd_fp8 = {}
+        lora_weights = {}
+        total = len(state_dict)
+        lora_keys = []
+        stats = {
+            "total_layers": total,
+            "eligible_layers": 0,
+            "processed_layers": 0,
             "skipped_layers": []
         }
+        for i, key in enumerate(state_dict):
             progress(0.4 + 0.4 * (i / total), desc=f"Processing {i+1}/{total}...")
+            weight = state_dict[key]
             if weight.dtype in [torch.float16, torch.float32, torch.bfloat16]:
                 fp8_weight = weight.to(fp8_dtype)
                 sd_fp8[key] = fp8_weight
+                # Apply architecture filtering
+                lower_key = key.lower()
+                should_process = False
+                if architecture == "text_encoder":
+                    should_process = "text" in lower_key or "emb" in lower_key or "encoder" in lower_key
+                elif architecture == "transformer":
+                    should_process = "attn" in lower_key or "transformer" in lower_key
+                elif architecture == "vae":
+                    should_process = "vae" in lower_key or "decoder" in lower_key or "encoder" in lower_key
+                elif architecture == "all":
+                    should_process = True
+                else:  # "auto" or unknown
+                    should_process = True
+                # Only process 2D tensors that meet rank requirements and pass architecture filter
+                if should_process and weight.ndim == 2 and min(weight.shape) > lora_rank:
+                    stats["eligible_layers"] += 1
+                    try:
+                        A, B = low_rank_decomposition(weight, rank=lora_rank)
+                        if A is not None and B is not None:
+                            lora_weights[f"lora_A.{key}"] = A
+                            lora_weights[f"lora_B.{key}"] = B
+                            lora_keys.append(key)
+                            stats["processed_layers"] += 1
+                        else:
+                            stats["skipped_layers"].append(f"{key}: decomposition failed")
+                    except Exception as e:
+                        stats["skipped_layers"].append(f"{key}: error - {str(e)}")
+                elif should_process and weight.ndim == 2:
+                    # Handle smaller 2D tensors with reduced rank
+                    smaller_rank = min(lora_rank, min(weight.shape) // 2)
+                    if smaller_rank >= 8:  # Minimum useful rank
+                        stats["eligible_layers"] += 1
+                        try:
+                            A, B = low_rank_decomposition(weight, rank=smaller_rank)
+                            if A is not None and B is not None:
+                                lora_weights[f"lora_A.{key}"] = A
+                                lora_weights[f"lora_B.{key}"] = B
+                                lora_keys.append(key)
+                                stats["processed_layers"] += 1
+                            else:
+                                stats["skipped_layers"].append(f"{key}: small tensor decomposition failed")
+                        except Exception as e:
+                            stats["skipped_layers"].append(f"{key}: small tensor error - {str(e)}")
             else:
                 sd_fp8[key] = weight
+                stats["skipped_layers"].append(f"{key}: non-float dtype")
         base_name = os.path.splitext(os.path.basename(safetensors_path))[0]
         fp8_path = os.path.join(output_dir, f"{base_name}-fp8-{fp8_format}.safetensors")
+        lora_path = os.path.join(output_dir, f"{base_name}-lora-r{lora_rank}.safetensors")
         save_file(sd_fp8, fp8_path, metadata={"format": "pt", "fp8_format": fp8_format, **metadata})
+        # Always save LoRA file if any weights were processed
+        if lora_weights:
+            lora_metadata = {
+                "format": "pt",
+                "lora_rank": str(lora_rank),
+                "architecture": architecture,
+                "stats": json.dumps(stats)
+            }
+            save_file(lora_weights, lora_path, metadata=lora_metadata)
+        progress(0.9, desc="Saved FP8 and LoRA files.")
+        progress(1.0, desc="✅ FP8 + LoRA extraction complete!")
+        stats_msg = f"FP8 ({fp8_format}) and rank-{lora_rank} LoRA saved.\n"
+        stats_msg += f"Processed {stats['processed_layers']}/{stats['eligible_layers']} eligible layers."
+        if stats['processed_layers'] == 0:
+            stats_msg += "\n⚠️ No LoRA weights were generated. Try reducing rank or selecting a specific architecture."
+        return True, stats_msg, stats
     except Exception as e:
         import traceback
+        error_msg = f"Error: {str(e)}\n{traceback.format_exc()}"
+        return False, error_msg, None
 def parse_hf_url(url):
     url = url.strip().rstrip("/")
     repo_url,
     safetensors_filename,
     fp8_format,
+    lora_rank,
+    architecture,
     target_type,
     new_repo_id,
     hf_token,
         return None, "❌ Hugging Face token required for source.", ""
     if target_type == "huggingface" and not hf_token:
         return None, "❌ Hugging Face token required for target.", ""
+    if lora_rank < 8:
+        return None, "❌ LoRA rank must be at least 8.", ""
     temp_dir = None
     output_dir = tempfile.mkdtemp()
             source_type, repo_url, safetensors_filename, hf_token, progress
         )
+        progress(0.25, desc="Converting to FP8 with LoRA extraction...")
+        success, msg, stats = convert_safetensors_to_fp8_with_lora(
+            safetensors_path, output_dir, fp8_format, lora_rank, architecture, progress
         )
         if not success:
         )
         base_name = os.path.splitext(safetensors_filename)[0]
+        lora_filename = f"{base_name}-lora-r{lora_rank}.safetensors"
         fp8_filename = f"{base_name}-fp8-{fp8_format}.safetensors"
         readme = f"""---
 tags:
 - fp8
 - safetensors
+- lora
+- low-rank
 - diffusion
 - converted-by-gradio
 ---
+# FP8 Model with Low-Rank LoRA
 - **Source**: `{repo_url}`
 - **File**: `{safetensors_filename}`
 - **FP8 Format**: `{fp8_format.upper()}`
+- **LoRA Rank**: {lora_rank}
+- **Architecture**: {architecture}
+- **LoRA File**: `{lora_filename}`
 - **FP8 File**: `{fp8_filename}`
 ## Usage (Inference)
 ```python
 from safetensors.torch import load_file
 import torch
+# Load FP8 model
 fp8_state = load_file("{fp8_filename}")
+lora_state = load_file("{lora_filename}")
+# Reconstruct approximate original weights
 reconstructed = {{}}
 for key in fp8_state:
+    if f"lora_A.{{key}}" in lora_state and f"lora_B.{{key}}" in lora_state:
+        A = lora_state[f"lora_A.{{key}}"].to(torch.float32)
+        B = lora_state[f"lora_B.{{key}}"].to(torch.float32)
+        lora_weight = B @ A  # (out_features, rank) @ (rank, in_features) -> (out_features, in_features)
+        fp8_weight = fp8_state[key].to(torch.float32)
+        reconstructed[key] = fp8_weight + lora_weight
     else:
+        reconstructed[key] = fp8_state[key].to(torch.float32)
 ```
+> Requires PyTorch ≥ 2.1 for FP8 support.
 """
         with open(os.path.join(output_dir, "README.md"), "w") as f:
         result_html = f"""
 ✅ Success!
 Model uploaded to: <a href="{repo_url_final}" target="_blank">{new_repo_id}</a>
+Includes: FP8 model + rank-{lora_rank} LoRA.
 """
+        return gr.HTML(result_html), "✅ FP8 + LoRA upload successful!", msg
     except Exception as e:
         import traceback
+        error_details = f"❌ Error: {str(e)}\n{traceback.format_exc()}"
+        return None, error_details, ""
     finally:
         if temp_dir:
             shutil.rmtree(temp_dir, ignore_errors=True)
         shutil.rmtree(output_dir, ignore_errors=True)
+with gr.Blocks(title="FP8 + LoRA Extractor (HF ↔ ModelScope)") as demo:
+    gr.Markdown("# 🔄 FP8 Pruner with Enhanced Low-Rank LoRA Extraction")
+    gr.Markdown("Convert `.safetensors` → **FP8** + **high-quality LoRA** for precision recovery. Supports Hugging Face ↔ ModelScope.")
     with gr.Row():
         with gr.Column():
             repo_url = gr.Textbox(label="Repo URL or ID", placeholder="https://huggingface.co/... or modelscope-id")
             safetensors_filename = gr.Textbox(label="Filename", placeholder="model.safetensors")
+            with gr.Accordion("Advanced Settings", open=True):
                 fp8_format = gr.Radio(["e4m3fn", "e5m2"], value="e5m2", label="FP8 Format")
+                lora_rank = gr.Slider(minimum=8, maximum=512, step=8, value=128, label="LoRA Rank")
+                architecture = gr.Dropdown(
                     choices=[
+                        ("Auto-detect components", "auto"),
+                        ("Text Encoder only", "text_encoder"),
+                        ("Transformer blocks only", "transformer"),
+                        ("VAE only", "vae"),
+                        ("All eligible layers", "all")
                     ],
+                    value="auto",
+                    label="Target Architecture"
                 )
             with gr.Accordion("Authentication", open=False):
         with gr.Column():
             target_type = gr.Radio(["huggingface", "modelscope"], value="huggingface", label="Target")
+            new_repo_id = gr.Textbox(label="New Repo ID", placeholder="user/model-fp8-lora")
             private_repo = gr.Checkbox(label="Private Repository (HF only)", value=False)
             status_output = gr.Markdown()
             repo_url,
             safetensors_filename,
             fp8_format,
+            lora_rank,
+            architecture,
             target_type,
             new_repo_id,
             hf_token,
     gr.Examples(
         examples=[
+            ["huggingface", "https://huggingface.co/Yabo/FramePainter/tree/main", "unet_diffusion_pytorch_model.safetensors", "e5m2", 128, "transformer", "huggingface"],
+            ["huggingface", "https://huggingface.co/stabilityai/sdxl-vae", "diffusion_pytorch_model.safetensors", "e4m3fn", 64, "vae", "huggingface"],
+            ["huggingface", "https://huggingface.co/runwayml/stable-diffusion-v1-5/tree/main/text_encoder", "model.safetensors", "e5m2", 96, "text_encoder", "huggingface"]
         ],
+        inputs=[source_type, repo_url, safetensors_filename, fp8_format, lora_rank, architecture, target_type],
         label="Example Conversions"
     )
     gr.Markdown("""
+    ## 💡 Usage Tips
+    - **Higher ranks (128-256)**: Best quality recovery for important layers
+    - **Smaller ranks (32-64)**: Good balance of quality and file size
+    - **Architecture selection**: Focus LoRA on specific components for better results
+    - **Text Encoder**: Use rank 96-128 for best text understanding
+    - **Transformers**: Use rank 128-256 for maximum quality retention
+    - **VAE**: Use rank 64-128 for good image reconstruction
+    > **Note**: This implementation maintains compatibility with existing merge scripts while providing significantly better precision recovery through improved LoRA extraction.
     """)
 demo.launch()