Spaces:

codemichaeld
/

new03

Running

App Files Files Community

codemichaeld committed 12 days ago

Commit

2a57dcf

verified ·

1 Parent(s): 1fd8c55

Update app.py

Browse files

Files changed (1) hide show

app.py +390 -132

app.py CHANGED Viewed

@@ -10,6 +10,8 @@ from huggingface_hub import HfApi, hf_hub_download
 from safetensors.torch import load_file, save_file
 import torch
 import torch.nn.functional as F
 try:
     from modelscope.hub.file_download import model_file_download as ms_file_download
     from modelscope.hub.api import HubApi as ModelScopeApi
@@ -17,75 +19,182 @@ try:
 except ImportError:
     MODELScope_AVAILABLE = False
-def low_rank_decomposition(weight, rank=64):
-    """
-    Correct LoRA decomposition supporting 2D and 4D tensors.
-    Returns (lora_A, lora_B) such that weight ≈ lora_B @ lora_A for 2D,
-    or appropriate conv form for 4D.
-    """
     original_shape = weight.shape
     original_dtype = weight.dtype
     try:
         if weight.ndim == 2:
-            actual_rank = min(rank, min(weight.shape) // 2)
-            if actual_rank < 4:
-                return None, None
             U, S, Vh = torch.linalg.svd(weight.float(), full_matrices=False)
-            S_sqrt = torch.sqrt(S[:actual_rank])
-            # Standard LoRA factorization: W ≈ W_B @ W_A
-            W_A = (Vh[:actual_rank, :] * S_sqrt.unsqueeze(1)).contiguous()  # [rank, in_features]
-            W_B = (U[:, :actual_rank] * S_sqrt.unsqueeze(0)).contiguous()   # [out_features, rank]
-            return W_A.to(original_dtype), W_B.to(original_dtype)
         elif weight.ndim == 4:
-            out_ch, in_ch, k_h, k_w = weight.shape
-            if k_h * k_w <= 9:  # small conv kernels (e.g., 3x3)
-                # Reshape to 2D: [out_ch, in_ch * k_h * k_w]
-                weight_2d = weight.view(out_ch, -1)
-                actual_rank = min(rank, min(weight_2d.shape) // 2)
-                if actual_rank < 4:
-                    return None, None
-                U, S, Vh = torch.linalg.svd(weight_2d.float(), full_matrices=False)
-                S_sqrt = torch.sqrt(S[:actual_rank])
-                W_A_2d = (Vh[:actual_rank, :] * S_sqrt.unsqueeze(1)).contiguous()
-                W_B_2d = (U[:, :actual_rank] * S_sqrt.unsqueeze(0)).contiguous()
-                # Reshape back to conv format
-                W_A = W_A_2d.view(actual_rank, in_ch, k_h, k_w).contiguous()
-                W_B = W_B_2d.view(out_ch, actual_rank, 1, 1).contiguous()
-                return W_A.to(original_dtype), W_B.to(original_dtype)
-        return None, None
     except Exception as e:
-        print(f"Decomposition error for {original_shape}: {e}")
-        return None, None
-def should_apply_lora(key, weight, architecture="auto"):
-    """Architecture-aware LoRA eligibility."""
-    lower_key = key.lower()
-    # Skip bias, norm, and tiny tensors
-    if 'bias' in lower_key or 'norm' in lower_key or weight.numel() < 256:
         return False
     if architecture == "text_encoder":
-        return any(t in lower_key for t in ['emb', 'embed', 'attn', 'mlp'])
     elif architecture == "unet_transformer":
-        return any(t in lower_key for t in ['attn', 'transformer', 'to_q', 'to_k', 'to_v', 'to_out'])
     elif architecture == "unet_conv":
-        return any(t in lower_key for t in ['conv', 'resnet', 'down', 'up', 'skip'])
     elif architecture == "vae":
-        return any(t in lower_key for t in ['encoder', 'decoder', 'quant', 'post_quant', 'pre_quant'])
-    else:  # "auto" or "all"
-        return weight.ndim in [2, 4]
 def convert_safetensors_to_fp8_with_lora(safetensors_path, output_dir, fp8_format, lora_rank=64, architecture="auto", progress=gr.Progress()):
     progress(0.1, desc="Starting FP8 conversion with LoRA extraction...")
@@ -96,84 +205,196 @@ def convert_safetensors_to_fp8_with_lora(safetensors_path, output_dir, fp8_forma
                 header_json = f.read(header_size).decode('utf-8')
                 header = json.loads(header_json)
                 return header.get('__metadata__', {})
         metadata = read_safetensors_metadata(safetensors_path)
         progress(0.2, desc="Loaded metadata.")
         state_dict = load_file(safetensors_path)
         progress(0.4, desc="Loaded weights.")
         if fp8_format == "e5m2":
             fp8_dtype = torch.float8_e5m2
         else:
             fp8_dtype = torch.float8_e4m3fn
         sd_fp8 = {}
         lora_weights = {}
-        total = len(state_dict)
-        lora_keys = []
         lora_stats = {
-            'total_layers': total,
             'layers_eligible': 0,
             'layers_processed': 0,
             'layers_skipped': [],
         }
         for i, key in enumerate(state_dict):
-            progress(0.4 + 0.4 * (i / total), desc=f"Processing {i+1}/{total}...")
             weight = state_dict[key]
             if weight.dtype in [torch.float16, torch.float32, torch.bfloat16]:
                 fp8_weight = weight.to(fp8_dtype)
                 sd_fp8[key] = fp8_weight
-                if should_apply_lora(key, weight, architecture):
                     lora_stats['layers_eligible'] += 1
                     try:
-                        A, B = low_rank_decomposition(weight, rank=lora_rank)
-                        if A is not None and B is not None:
-                            lora_weights[f"lora_A.{key}"] = A.to(torch.float16)
-                            lora_weights[f"lora_B.{key}"] = B.to(torch.float16)
                             lora_keys.append(key)
                             lora_stats['layers_processed'] += 1
                         else:
-                            lora_stats['layers_skipped'].append(f"{key}: decomposition failed")
                     except Exception as e:
-                        lora_stats['layers_skipped'].append(f"{key}: exception: {e}")
                 else:
-                    reason = "filtered by architecture" if architecture != "auto" else "not 2D/4D or too small"
-                    lora_stats['layers_skipped'].append(f"{key}: skipped ({reason})")
             else:
                 sd_fp8[key] = weight
-                lora_stats['layers_skipped'].append(f"{key}: non-float dtype")
         base_name = os.path.splitext(os.path.basename(safetensors_path))[0]
         fp8_path = os.path.join(output_dir, f"{base_name}-fp8-{fp8_format}.safetensors")
         lora_path = os.path.join(output_dir, f"{base_name}-lora-r{lora_rank}-{architecture}.safetensors")
         save_file(sd_fp8, fp8_path, metadata={"format": "pt", "fp8_format": fp8_format, **metadata})
-        save_file(lora_weights, lora_path, metadata={
-            "format": "pt",
             "lora_rank": str(lora_rank),
             "architecture": architecture,
             "stats": json.dumps(lora_stats)
-        })
         progress(0.9, desc="Saved FP8 and LoRA files.")
         progress(1.0, desc="✅ FP8 + LoRA extraction complete!")
-        stats_msg = f"FP8 ({fp8_format}) and rank-{lora_rank} LoRA ({architecture}) saved.\n"
-        stats_msg += f"Processed {lora_stats['layers_processed']}/{lora_stats['layers_eligible']} eligible layers."
         if lora_stats['layers_processed'] == 0:
-            stats_msg += " ⚠️ No valid LoRA weights generated."
-        return True, stats_msg, lora_stats
     except Exception as e:
-        import traceback
-        return False, f"Error: {str(e)}\n{traceback.format_exc()}", None
 def parse_hf_url(url):
     url = url.strip().rstrip("/")
@@ -251,9 +472,11 @@ def process_and_upload_fp8(
         return None, "❌ Hugging Face token required for source.", ""
     if target_type == "huggingface" and not hf_token:
         return None, "❌ Hugging Face token required for target.", ""
     if lora_rank < 4:
         return None, "❌ LoRA rank must be at least 4.", ""
     temp_dir = None
     output_dir = tempfile.mkdtemp()
     try:
@@ -261,24 +484,24 @@ def process_and_upload_fp8(
         safetensors_path, temp_dir = download_safetensors_file(
             source_type, repo_url, safetensors_filename, hf_token, progress
         )
         progress(0.25, desc=f"Converting to FP8 with LoRA ({architecture})...")
         success, msg, stats = convert_safetensors_to_fp8_with_lora(
             safetensors_path, output_dir, fp8_format, lora_rank, architecture, progress
         )
         if not success:
             return None, f"❌ Conversion failed: {msg}", ""
         progress(0.9, desc="Uploading...")
         repo_url_final = upload_to_target(
             target_type, new_repo_id, output_dir, fp8_format, architecture, hf_token, modelscope_token, private_repo
         )
         base_name = os.path.splitext(safetensors_filename)[0]
         lora_filename = f"{base_name}-lora-r{lora_rank}-{architecture}.safetensors"
         fp8_filename = f"{base_name}-fp8-{fp8_format}.safetensors"
         readme = f"""---
 library_name: diffusers
 tags:
@@ -288,7 +511,7 @@ tags:
 - low-rank
 - diffusion
 - architecture-{architecture}
-- converted-by-gradio
 ---
 # FP8 Model with Low-Rank LoRA
 - **Source**: `{repo_url}`
@@ -299,6 +522,16 @@ tags:
 - **LoRA File**: `{lora_filename}`
 - **FP8 File**: `{fp8_filename}`
 ## Usage (Inference)
 ```python
 from safetensors.torch import load_file
@@ -311,26 +544,38 @@ lora_state = load_file("{lora_filename}")
 # Reconstruct approximate original weights
 reconstructed = {{}}
 for key in fp8_state:
-    if f"lora_A.{{key}}" in lora_state and f"lora_B.{{key}}" in lora_state:
-        A = lora_state[f"lora_A.{{key}}"].to(torch.float32)
-        B = lora_state[f"lora_B.{{key}}"].to(torch.float32)
         if A.ndim == 2 and B.ndim == 2:
             lora_weight = B @ A
         else:
-            # Conv LoRA: simplified reconstruction
-            lora_weight = F.conv2d(fp8_state[key].unsqueeze(0).to(torch.float32), A, groups=1)[:, :B.shape[0]]
-            lora_weight = lora_weight.squeeze(0) + F.conv2d(fp8_state[key].unsqueeze(0).to(torch.float32), B, groups=1).squeeze(0)
         reconstructed[key] = fp8_state[key].to(torch.float32) + lora_weight
     else:
         reconstructed[key] = fp8_state[key].to(torch.float32)
 ```
-> Requires PyTorch ≥ 2.1 for FP8 support. Use matching architecture during inference.
 """
         with open(os.path.join(output_dir, "README.md"), "w") as f:
             f.write(readme)
         if target_type == "huggingface":
             HfApi(token=hf_token).upload_file(
                 path_or_fileobj=os.path.join(output_dir, "README.md"),
@@ -339,18 +584,27 @@ for key in fp8_state:
                 repo_type="model",
                 token=hf_token
             )
         progress(1.0, desc="✅ Done!")
         result_html = f"""
 ✅ Success!
 Model uploaded to: <a href="{repo_url_final}" target="_blank">{new_repo_id}</a>
-Includes: FP8 + rank-{lora_rank} LoRA ({architecture}).
 """
         return gr.HTML(result_html), "✅ FP8 + LoRA upload successful!", msg
     except Exception as e:
-        import traceback
-        return None, f"❌ Error: {str(e)}\n{traceback.format_exc()}", ""
     finally:
         if temp_dir:
             shutil.rmtree(temp_dir, ignore_errors=True)
@@ -358,17 +612,18 @@ Includes: FP8 + rank-{lora_rank} LoRA ({architecture}).
 with gr.Blocks(title="FP8 + LoRA Extractor (HF ↔ ModelScope)") as demo:
     gr.Markdown("# 🔄 Advanced FP8 Pruner with Architecture-Specific LoRA Extraction")
-    gr.Markdown("Convert `.safetensors` → **FP8** + **targeted LoRA** for precision recovery. Supports Hugging Face ↔ ModelScope.")
     with gr.Row():
         with gr.Column():
             source_type = gr.Radio(["huggingface", "modelscope"], value="huggingface", label="Source")
             repo_url = gr.Textbox(label="Repo URL or ID", placeholder="https://huggingface.co/... or modelscope-id")
             safetensors_filename = gr.Textbox(label="Filename", placeholder="model.safetensors")
             with gr.Accordion("Advanced LoRA Settings", open=True):
                 fp8_format = gr.Radio(["e4m3fn", "e5m2"], value="e5m2", label="FP8 Format")
                 lora_rank = gr.Slider(minimum=4, maximum=256, step=4, value=64, label="LoRA Rank")
                 architecture = gr.Dropdown(
                     choices=[
                         ("Auto-detect components", "auto"),
@@ -379,24 +634,25 @@ with gr.Blocks(title="FP8 + LoRA Extractor (HF ↔ ModelScope)") as demo:
                         ("All components", "all")
                     ],
                     value="auto",
-                    label="Target Architecture"
                 )
             with gr.Accordion("Authentication", open=False):
                 hf_token = gr.Textbox(label="Hugging Face Token", type="password")
                 modelscope_token = gr.Textbox(label="ModelScope Token (optional)", type="password", visible=MODELScope_AVAILABLE)
         with gr.Column():
             target_type = gr.Radio(["huggingface", "modelscope"], value="huggingface", label="Target")
             new_repo_id = gr.Textbox(label="New Repo ID", placeholder="user/model-fp8-lora")
             private_repo = gr.Checkbox(label="Private Repository (HF only)", value=False)
             status_output = gr.Markdown()
             detailed_log = gr.Textbox(label="Processing Log", interactive=False, lines=10)
     convert_btn = gr.Button("🚀 Convert & Upload", variant="primary")
     repo_link_output = gr.HTML()
     convert_btn.click(
         fn=process_and_upload_fp8,
         inputs=[
@@ -415,7 +671,7 @@ with gr.Blocks(title="FP8 + LoRA Extractor (HF ↔ ModelScope)") as demo:
         outputs=[repo_link_output, status_output, detailed_log],
         show_progress=True
     )
     gr.Examples(
         examples=[
             ["huggingface", "https://huggingface.co/Yabo/FramePainter/tree/main", "unet_diffusion_pytorch_model.safetensors", "e5m2", 64, "unet_transformer"],
@@ -425,15 +681,17 @@ with gr.Blocks(title="FP8 + LoRA Extractor (HF ↔ ModelScope)") as demo:
         inputs=[source_type, repo_url, safetensors_filename, fp8_format, lora_rank, architecture],
         label="Example Conversions"
     )
     gr.Markdown("""
     ## 💡 Usage Tips
-    - **Text Encoder**: Use rank 32–64 with `text_encoder` architecture
-    - **UNet Attention**: Use `unet_transformer` with rank 64–128
-    - **UNet Convolutions**: Use `unet_conv` with rank 16–32
-    - **VAE**: Use `vae` with rank 16–32
-    - **Auto Mode**: Let the tool analyze and select layers automatically
-    - Higher ranks = better quality but larger LoRA files
     """)
 demo.launch()

 from safetensors.torch import load_file, save_file
 import torch
 import torch.nn.functional as F
+import traceback
+import math
 try:
     from modelscope.hub.file_download import model_file_download as ms_file_download
     from modelscope.hub.api import HubApi as ModelScopeApi
 except ImportError:
     MODELScope_AVAILABLE = False
def low_rank_decomposition(weight, rank=64, approximation_factor=0.8):
    """Factor a 2D or 4D weight into two low-rank factors via truncated SVD.

    A 4D weight of shape ``(out_ch, in_ch, kH, kW)`` is first flattened to
    ``(out_ch, in_ch * kH * kW)``. The matrix is decomposed with
    ``torch.linalg.svd`` in float32, truncated to ``k`` singular values, and
    ``sqrt(S)`` is folded into both sides so that ``left @ right``
    approximates the (flattened) weight.

    Args:
        weight: Tensor to decompose. Only ``ndim == 2`` and ``ndim == 4`` are
            supported.
        rank: Requested rank. The rank actually used may differ because of the
            heuristics below; it is always clamped to ``[1, len(S)]``.
        approximation_factor: Fraction of the squared singular-value mass that
            the truncation should try to preserve. The smallest rank that
            reaches this fraction caps the rank that is used.

    Returns:
        ``(left, right)`` as contiguous float32 tensors:

        * 2D input ``(out, in)``: ``left`` is ``(out, k)``, ``right`` is
          ``(k, in)``.
        * 4D input: ``left`` is ``(out_ch, k, 1, 1)``, ``right`` is
          ``(k, in_ch, kH, kW)``.

        ``(None, None)`` is returned for any other ``ndim``, for empty
        tensors, and when the decomposition raises.

    Note:
        The product order is ``left @ right`` (first return value times the
        second); multiplying in the opposite order does not reconstruct the
        weight.
    """
    original_shape = weight.shape
    try:
        if weight.ndim == 2:
            matrix = weight.float()
        elif weight.ndim == 4:
            out_ch, in_ch, k_h, k_w = weight.shape
            # reshape (not view) so permuted / non-contiguous weights work too.
            matrix = weight.reshape(out_ch, in_ch * k_h * k_w).float()
        else:
            # 1D tensors (biases, norms) and other ranks are not decomposed.
            return None, None

        if matrix.numel() == 0:
            return None, None

        U, S, Vh = torch.linalg.svd(matrix, full_matrices=False)
        full_rank = S.numel()

        # Smallest rank whose cumulative squared singular values reach the
        # requested share of the total. Counting the entries strictly below
        # the target is the same as a left-sided searchsorted.
        energy = S ** 2
        cumulative_energy = torch.cumsum(energy, dim=0)
        target_energy = approximation_factor * energy.sum()
        minimal_rank = int((cumulative_energy < target_energy).sum().item()) + 1

        if weight.ndim == 2:
            actual_rank = min(rank, minimal_rank, full_rank)
            # Too close to full rank would only re-parameterize the matrix,
            # so back off to roughly half of the request.
            if actual_rank > full_rank * 0.8:
                actual_rank = max(min(rank // 2, full_rank // 2), 8)
            if actual_rank >= full_rank:
                actual_rank = max(full_rank // 4, 8)
        else:
            # Convolutions start from half of the requested rank.
            conv_rank = min(rank // 2, full_rank)
            if conv_rank > full_rank * 0.7:
                conv_rank = max(full_rank // 4, 8)
            actual_rank = max(min(conv_rank, minimal_rank), 8)

        # The floors above may exceed the number of singular values of a small
        # tensor; clamp so the reshapes below always match the sliced factors.
        actual_rank = max(1, min(actual_rank, full_rank))

        # Split sqrt(S) across both factors using broadcasting instead of
        # building a dense diagonal matrix.
        sqrt_s = torch.sqrt(S[:actual_rank])
        left = U[:, :actual_rank] * sqrt_s
        right = sqrt_s.unsqueeze(1) * Vh[:actual_rank, :]

        if weight.ndim == 4:
            # Output-side factor becomes a 1x1 conv; the input-side factor
            # keeps the original spatial kernel.
            left = left.reshape(out_ch, actual_rank, 1, 1)
            right = right.reshape(actual_rank, in_ch, k_h, k_w)

        return left.contiguous(), right.contiguous()
    except Exception as e:
        # Best effort: a failed layer is reported and skipped by the caller.
        print(f"Decomposition error for tensor with shape {original_shape}: {str(e)[:100]}")
    return None, None
def get_architecture_specific_settings(architecture, base_rank):
    """Return the LoRA extraction profile for a model architecture.

    Args:
        architecture: One of ``"text_encoder"``, ``"unet_transformer"``,
            ``"unet_conv"``, ``"vae"``, ``"auto"`` or ``"all"``. Any other
            value falls back to the ``"auto"`` profile.
        base_rank: Rank requested by the user; some profiles scale it down.

    Returns:
        A dict with the keys ``"rank"``, ``"approximation_factor"``,
        ``"min_rank"`` and ``"max_rank_factor"``. ``"rank"`` is never smaller
        than ``"min_rank"``, so the profile is consistent on its own even when
        ``base_rank`` is small (e.g. ``vae`` with ``base_rank=4``).
    """
    # (rank, approximation_factor, min_rank, max_rank_factor)
    profiles = {
        # Text encoders need high accuracy; use at most 50% of full rank.
        "text_encoder": (base_rank, 0.95, 8, 0.5),
        "unet_transformer": (base_rank, 0.90, 16, 0.4),
        # Convolutions get half of the requested rank.
        "unet_conv": (base_rank // 2, 0.85, 8, 0.3),
        # The VAE profile gets a third of the requested rank.
        "vae": (base_rank // 3, 0.80, 4, 0.25),
        "auto": (base_rank, 0.90, 8, 0.5),
        "all": (base_rank, 0.90, 8, 0.5),
    }
    rank, approximation_factor, min_rank, max_rank_factor = profiles.get(
        architecture, profiles["auto"]
    )
    return {
        # Integer division above can push the rank below the profile's floor.
        "rank": max(rank, min_rank),
        "approximation_factor": approximation_factor,
        "min_rank": min_rank,
        "max_rank_factor": max_rank_factor,
    }
def should_apply_lora(key, weight, architecture, lora_rank):
    """Decide whether a LoRA factorization should be extracted for a tensor.

    Args:
        key: State-dict key of the tensor; matched case-insensitively.
        weight: The tensor itself (``numel()``, ``ndim`` and ``shape`` are
            read).
        architecture: ``"text_encoder"``, ``"unet_transformer"``,
            ``"unet_conv"``, ``"vae"``, ``"all"`` or ``"auto"``. Any other
            value selects nothing.
        lora_rank: Requested LoRA rank; only used by ``"auto"`` to ignore
            tensors whose dimensions are not larger than ``lora_rank // 4``.

    Returns:
        ``True`` when the tensor is eligible, ``False`` otherwise.
    """
    lower_key = key.lower()

    # Bias and normalization parameters are never factorized. All three
    # markers are matched on the lower-cased key so the filter is consistent.
    if any(marker in lower_key for marker in ("bias", "norm", "bn")):
        return False

    # Very small tensors are skipped.
    if weight.numel() < 100:
        return False

    # Only matrices and conv kernels are eligible; every other rank
    # (1D, 3D, ...) is skipped.
    if weight.ndim not in (2, 4):
        return False

    if architecture == "all":
        return True

    if architecture == "auto":
        # Keep only tensors that are large relative to the requested rank.
        threshold = lora_rank // 4
        if weight.ndim == 2:
            return min(weight.shape) > threshold
        return weight.shape[0] > threshold or weight.shape[1] > threshold

    # Substrings that select a layer for each named architecture
    # ('emb' also covers 'embed', 'embedding', ...).
    keywords_by_architecture = {
        "text_encoder": ("emb", "attn", "qkv", "mlp"),
        "unet_transformer": ("attn", "transformer", "qkv", "to_out"),
        "unet_conv": ("conv", "resnet", "downsample", "upsample"),
        "vae": ("encoder", "decoder", "conv", "post_quant"),
    }
    keywords = keywords_by_architecture.get(architecture, ())
    return any(keyword in lower_key for keyword in keywords)
 def convert_safetensors_to_fp8_with_lora(safetensors_path, output_dir, fp8_format, lora_rank=64, architecture="auto", progress=gr.Progress()):
     progress(0.1, desc="Starting FP8 conversion with LoRA extraction...")
                 header_json = f.read(header_size).decode('utf-8')
                 header = json.loads(header_json)
                 return header.get('__metadata__', {})
         metadata = read_safetensors_metadata(safetensors_path)
         progress(0.2, desc="Loaded metadata.")
         state_dict = load_file(safetensors_path)
         progress(0.4, desc="Loaded weights.")
+        # Architecture analysis
+        architecture_stats = {
+            'text_encoder': 0,
+            'unet_transformer': 0,
+            'unet_conv': 0,
+            'vae': 0,
+            'other': 0
+        }
+        for key in state_dict:
+            lower_key = key.lower()
+            if 'text' in lower_key or 'emb' in lower_key:
+                architecture_stats['text_encoder'] += 1
+            elif 'attn' in lower_key or 'transformer' in lower_key:
+                architecture_stats['unet_transformer'] += 1
+            elif 'conv' in lower_key or 'resnet' in lower_key:
+                architecture_stats['unet_conv'] += 1
+            elif 'vae' in lower_key or 'encoder' in lower_key or 'decoder' in lower_key:
+                architecture_stats['vae'] += 1
+            else:
+                architecture_stats['other'] += 1
+        print("Architecture analysis:")
+        for arch, count in architecture_stats.items():
+            print(f"- {arch}: {count} layers")
         if fp8_format == "e5m2":
             fp8_dtype = torch.float8_e5m2
         else:
             fp8_dtype = torch.float8_e4m3fn
         sd_fp8 = {}
         lora_weights = {}
         lora_stats = {
+            'total_layers': len(state_dict),
+            'layers_analyzed': 0,
             'layers_eligible': 0,
             'layers_processed': 0,
             'layers_skipped': [],
+            'architecture_distro': architecture_stats,
+            'reconstruction_errors': []
         }
+        total = len(state_dict)
+        lora_keys = []
         for i, key in enumerate(state_dict):
+            progress(0.4 + 0.4 * (i / total), desc=f"Processing {i+1}/{total}: {key.split('.')[-1]}")
             weight = state_dict[key]
+            lora_stats['layers_analyzed'] += 1
             if weight.dtype in [torch.float16, torch.float32, torch.bfloat16]:
                 fp8_weight = weight.to(fp8_dtype)
                 sd_fp8[key] = fp8_weight
+                # Determine if we should apply LoRA
+                eligible_for_lora = should_apply_lora(key, weight, architecture, lora_rank)
+                if eligible_for_lora:
                     lora_stats['layers_eligible'] += 1
                     try:
+                        # Get architecture-specific settings
+                        arch_settings = get_architecture_specific_settings(architecture, lora_rank)
+                        # Adjust rank based on tensor properties
+                        if weight.ndim == 2:
+                            max_possible_rank = min(weight.shape)
+                            actual_rank = min(
+                                arch_settings["rank"],
+                                int(max_possible_rank * arch_settings["max_rank_factor"])
+                            )
+                            actual_rank = max(actual_rank, arch_settings["min_rank"])
+                        elif weight.ndim == 4:
+                            # For conv layers, use smaller rank
+                            actual_rank = min(
+                                arch_settings["rank"],
+                                max(weight.shape[0], weight.shape[1]) // 4
+                            )
+                            actual_rank = max(actual_rank, arch_settings["min_rank"])
+                        else:
+                            # Skip non-2D/4D tensors for LoRA
+                            lora_stats['layers_skipped'].append(f"{key}: unsupported ndim={weight.ndim}")
+                            continue
+                        if actual_rank < 4:
+                            lora_stats['layers_skipped'].append(f"{key}: rank too small ({actual_rank})")
+                            continue
+                        # Perform decomposition with approximation
+                        U, V = low_rank_decomposition(
+                            weight,
+                            rank=actual_rank,
+                            approximation_factor=arch_settings["approximation_factor"]
+                        )
+                        if U is not None and V is not None:
+                            # Store as half-precision
+                            lora_weights[f"lora_A.{key}"] = U.to(torch.float16)
+                            lora_weights[f"lora_B.{key}"] = V.to(torch.float16)
                             lora_keys.append(key)
                             lora_stats['layers_processed'] += 1
+                            # Calculate and store reconstruction error
+                            if U.ndim == 2 and V.ndim == 2:
+                                if V.shape[0] == U.shape[1]:
+                                    reconstructed = V @ U
+                                else:
+                                    reconstructed = U @ V
+                                error = torch.norm(weight.float() - reconstructed.float()) / torch.norm(weight.float())
+                                lora_stats['reconstruction_errors'].append({
+                                    'key': key,
+                                    'error': error.item(),
+                                    'original_shape': list(weight.shape),
+                                    'rank': actual_rank
+                                })
                         else:
+                            lora_stats['layers_skipped'].append(f"{key}: decomposition returned None")
                     except Exception as e:
+                        error_msg = f"{key}: {str(e)[:100]}"
+                        lora_stats['layers_skipped'].append(error_msg)
                 else:
+                    reason = "not eligible for selected architecture" if architecture != "auto" else f"ndim={weight.ndim}"
+                    lora_stats['layers_skipped'].append(f"{key}: {reason}")
             else:
                 sd_fp8[key] = weight
+                lora_stats['layers_skipped'].append(f"{key}: unsupported dtype {weight.dtype}")
+        # Add reconstruction error statistics
+        if lora_stats['reconstruction_errors']:
+            errors = [e['error'] for e in lora_stats['reconstruction_errors']]
+            lora_stats['avg_reconstruction_error'] = sum(errors) / len(errors) if errors else 0
+            lora_stats['max_reconstruction_error'] = max(errors) if errors else 0
+            lora_stats['min_reconstruction_error'] = min(errors) if errors else 0
         base_name = os.path.splitext(os.path.basename(safetensors_path))[0]
         fp8_path = os.path.join(output_dir, f"{base_name}-fp8-{fp8_format}.safetensors")
         lora_path = os.path.join(output_dir, f"{base_name}-lora-r{lora_rank}-{architecture}.safetensors")
         save_file(sd_fp8, fp8_path, metadata={"format": "pt", "fp8_format": fp8_format, **metadata})
+        # Always save LoRA file, even if empty
+        lora_metadata = {
+            "format": "pt",
             "lora_rank": str(lora_rank),
             "architecture": architecture,
+            "original_filename": os.path.basename(safetensors_path),
+            "fp8_format": fp8_format,
             "stats": json.dumps(lora_stats)
+        }
+        save_file(lora_weights, lora_path, metadata=lora_metadata)
+        # Generate detailed statistics message
+        stats_msg = f"""
+📊 LoRA Extraction Statistics:
+- Total layers analyzed: {lora_stats['layers_analyzed']}
+- Layers eligible for LoRA: {lora_stats['layers_eligible']}
+- Successfully processed: {lora_stats['layers_processed']}
+- Architecture: {architecture}
+- FP8 Format: {fp8_format.upper()}
+"""
+        if 'avg_reconstruction_error' in lora_stats:
+            stats_msg += f"- Avg reconstruction error: {lora_stats['avg_reconstruction_error']:.6f}\n"
+            stats_msg += f"- Max reconstruction error: {lora_stats['max_reconstruction_error']:.6f}\n"
         progress(0.9, desc="Saved FP8 and LoRA files.")
         progress(1.0, desc="✅ FP8 + LoRA extraction complete!")
         if lora_stats['layers_processed'] == 0:
+            stats_msg += "\n\n⚠️ WARNING: No LoRA weights were generated. Try a different architecture selection or lower rank."
+        elif lora_stats.get('avg_reconstruction_error', 1) < 0.0001:
+            stats_msg += "\n\nℹ️ NOTE: Very low reconstruction error detected. LoRA may be reconstructing almost perfectly. Consider using lower rank for better compression."
+        return True, f"FP8 ({fp8_format}) and rank-{lora_rank} LoRA saved.\n{stats_msg}", lora_stats
     except Exception as e:
+        error_msg = f"Conversion error: {str(e)}\n{traceback.format_exc()}"
+        print(error_msg)
+        return False, error_msg, None
 def parse_hf_url(url):
     url = url.strip().rstrip("/")
         return None, "❌ Hugging Face token required for source.", ""
     if target_type == "huggingface" and not hf_token:
         return None, "❌ Hugging Face token required for target.", ""
+    # Validate lora_rank
     if lora_rank < 4:
         return None, "❌ LoRA rank must be at least 4.", ""
     temp_dir = None
     output_dir = tempfile.mkdtemp()
     try:
         safetensors_path, temp_dir = download_safetensors_file(
             source_type, repo_url, safetensors_filename, hf_token, progress
         )
         progress(0.25, desc=f"Converting to FP8 with LoRA ({architecture})...")
         success, msg, stats = convert_safetensors_to_fp8_with_lora(
             safetensors_path, output_dir, fp8_format, lora_rank, architecture, progress
         )
         if not success:
             return None, f"❌ Conversion failed: {msg}", ""
         progress(0.9, desc="Uploading...")
         repo_url_final = upload_to_target(
             target_type, new_repo_id, output_dir, fp8_format, architecture, hf_token, modelscope_token, private_repo
         )
         base_name = os.path.splitext(safetensors_filename)[0]
         lora_filename = f"{base_name}-lora-r{lora_rank}-{architecture}.safetensors"
         fp8_filename = f"{base_name}-fp8-{fp8_format}.safetensors"
         readme = f"""---
 library_name: diffusers
 tags:
 - low-rank
 - diffusion
 - architecture-{architecture}
+- converted-by-ai-toolkit
 ---
 # FP8 Model with Low-Rank LoRA
 - **Source**: `{repo_url}`
 - **LoRA File**: `{lora_filename}`
 - **FP8 File**: `{fp8_filename}`
+## Architecture Distribution
+"""
+        # Add architecture stats to README if available
+        if stats and 'architecture_distro' in stats:
+            readme += "\n| Component | Layer Count |\n|-----------|------------|\n"
+            for arch, count in stats['architecture_distro'].items():
+                readme += f"| {arch.replace('_', ' ').title()} | {count} |\n"
+        readme += f"""
 ## Usage (Inference)
 ```python
 from safetensors.torch import load_file
 # Reconstruct approximate original weights
 reconstructed = {{}}
 for key in fp8_state:
+    lora_a_key = f"lora_A.{{key}}"
+    lora_b_key = f"lora_B.{{key}}"
+    if lora_a_key in lora_state and lora_b_key in lora_state:
+        A = lora_state[lora_a_key].to(torch.float32)
+        B = lora_state[lora_b_key].to(torch.float32)
+        # Handle different tensor dimensions
         if A.ndim == 2 and B.ndim == 2:
             lora_weight = B @ A
+        elif A.ndim == 4 and B.ndim == 4:
+            # For convolutional LoRA
+            lora_weight = F.conv2d(fp8_state[key].to(torch.float32),
+                                  B, padding=1) + F.conv2d(fp8_state[key].to(torch.float32),
+                                  A, padding=1)
         else:
+            # Fallback for mixed dimension cases
+            lora_weight = B @ A.view(B.shape[1], -1)
+            if lora_weight.shape != fp8_state[key].shape:
+                lora_weight = lora_weight.view_as(fp8_state[key])
         reconstructed[key] = fp8_state[key].to(torch.float32) + lora_weight
     else:
         reconstructed[key] = fp8_state[key].to(torch.float32)
 ```
+> **Note**: Requires PyTorch ≥ 2.1 for FP8 support. For best results, use the same architecture selection ({architecture}) during inference as was used during extraction.
 """
         with open(os.path.join(output_dir, "README.md"), "w") as f:
             f.write(readme)
         if target_type == "huggingface":
             HfApi(token=hf_token).upload_file(
                 path_or_fileobj=os.path.join(output_dir, "README.md"),
                 repo_type="model",
                 token=hf_token
             )
         progress(1.0, desc="✅ Done!")
         result_html = f"""
 ✅ Success!
 Model uploaded to: <a href="{repo_url_final}" target="_blank">{new_repo_id}</a>
+Includes:
+- FP8 model: `{fp8_filename}`
+- LoRA weights: `{lora_filename}` (rank {lora_rank}, architecture: {architecture})
+📊 Stats: {stats['layers_processed']}/{stats['layers_eligible']} eligible layers processed
 """
+        if 'avg_reconstruction_error' in stats:
+            result_html += f"<br>Avg reconstruction error: {stats['avg_reconstruction_error']:.6f}"
         return gr.HTML(result_html), "✅ FP8 + LoRA upload successful!", msg
     except Exception as e:
+        error_msg = f"❌ Error: {str(e)}\n{traceback.format_exc()}"
+        print(error_msg)
+        return None, error_msg, ""
     finally:
         if temp_dir:
             shutil.rmtree(temp_dir, ignore_errors=True)
 # Gradio UI definition: builds the conversion form and wires the button to
 # process_and_upload_fp8. NOTE(review): this view of the file has elided
 # lines (e.g. the click() `inputs=[` list below is not closed here), so the
 # comments only describe what is visible.
 with gr.Blocks(title="FP8 + LoRA Extractor (HF ↔ ModelScope)") as demo:
     gr.Markdown("# 🔄 Advanced FP8 Pruner with Architecture-Specific LoRA Extraction")
+    gr.Markdown("Convert `.safetensors` → **FP8** + **targeted LoRA** weights for precision recovery. Supports Hugging Face ↔ ModelScope.")
     with gr.Row():
         # Left column: where the model comes from and how it is converted.
         with gr.Column():
             source_type = gr.Radio(["huggingface", "modelscope"], value="huggingface", label="Source")
             repo_url = gr.Textbox(label="Repo URL or ID", placeholder="https://huggingface.co/... or modelscope-id")
             safetensors_filename = gr.Textbox(label="Filename", placeholder="model.safetensors")
             with gr.Accordion("Advanced LoRA Settings", open=True):
                 fp8_format = gr.Radio(["e4m3fn", "e5m2"], value="e5m2", label="FP8 Format")
                 # Slider minimum of 4 lines up with the `lora_rank < 4` rejection
                 # performed during input validation elsewhere in this file.
                 lora_rank = gr.Slider(minimum=4, maximum=256, step=4, value=64, label="LoRA Rank")
                 # Choices look like (display label, value) pairs — TODO confirm
                 # against the Gradio Dropdown docs; value="auto" is the default.
                 architecture = gr.Dropdown(
                     choices=[
                         ("Auto-detect components", "auto"),
                         ("All components", "all")
                     ],
                     value="auto",
+                    label="Target Architecture",
+                    info="Select which model components to apply LoRA to"
                 )
             with gr.Accordion("Authentication", open=False):
                 hf_token = gr.Textbox(label="Hugging Face Token", type="password")
                 # The ModelScope token box is only shown when the optional
                 # `modelscope` import at the top of the file succeeded.
                 modelscope_token = gr.Textbox(label="ModelScope Token (optional)", type="password", visible=MODELScope_AVAILABLE)
         # Right column: upload destination plus status/log outputs.
         with gr.Column():
             target_type = gr.Radio(["huggingface", "modelscope"], value="huggingface", label="Target")
             new_repo_id = gr.Textbox(label="New Repo ID", placeholder="user/model-fp8-lora")
             private_repo = gr.Checkbox(label="Private Repository (HF only)", value=False)
             status_output = gr.Markdown()
             detailed_log = gr.Textbox(label="Processing Log", interactive=False, lines=10)
     convert_btn = gr.Button("🚀 Convert & Upload", variant="primary")
     repo_link_output = gr.HTML()
     # Clicking the button runs process_and_upload_fp8; its three return values
     # are routed to the HTML link, the status Markdown and the log textbox.
     convert_btn.click(
         fn=process_and_upload_fp8,
         inputs=[
         outputs=[repo_link_output, status_output, detailed_log],
         show_progress=True
     )
     # Pre-filled example row for the form.
     # NOTE(review): the example's architecture value ("unet_transformer") must
     # be one of the Dropdown choices — verify against the full choices list.
     gr.Examples(
         examples=[
             ["huggingface", "https://huggingface.co/Yabo/FramePainter/tree/main", "unet_diffusion_pytorch_model.safetensors", "e5m2", 64, "unet_transformer"],
         inputs=[source_type, repo_url, safetensors_filename, fp8_format, lora_rank, architecture],
         label="Example Conversions"
     )
     gr.Markdown("""
     ## 💡 Usage Tips
+    - **For Text Encoders**: Use rank 32-64 with `text_encoder` architecture for optimal results.
+    - **For UNet Attention**: Use `unet_transformer` with rank 64-128 for best quality preservation.
+    - **For UNet Convolutions**: Use `unet_conv` with lower ranks (16-32) as convolutions compress better.
+    - **For VAE**: Use `vae` architecture with rank 16-32.
+    - **Auto Mode**: Let the tool analyze and target appropriate layers automatically.
+    ⚠️ **Note**: Higher ranks produce better quality but larger LoRA files. Start with lower ranks and increase if needed.
     """)
 # Start the Gradio app; this runs at module top level (no __main__ guard is
 # visible here).
 demo.launch()