MegaTronX committed · verified
Commit aa3631e · 1 Parent(s): 51ae273

Update app.py

Files changed (1): app.py (+118 -70)

app.py CHANGED
@@ -2,6 +2,7 @@ import gradio as gr
 from transformers import AutoTokenizer, AutoModelForCausalLM
 import torch
 import os
+from compressed_tensors import load_compressed_model
 
 # Set cache directory for Spaces
 os.environ['TRANSFORMERS_CACHE'] = '/tmp/cache'
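One structural risk in this hunk: the new dependency is imported at module scope, so if `compressed-tensors` is missing from the Space's requirements.txt the app dies with an ImportError before the fallback `try/except` inside `_load_model()` ever gets a chance to run. A guarded import keeps the standard-loading path reachable (a sketch; `HAS_COMPRESSED_TENSORS` is a hypothetical name, not in the commit):

```python
# Hypothetical guard: defer the hard dependency so an ImportError
# degrades to the standard loading path instead of killing the app.
try:
    from compressed_tensors import load_compressed_model
    HAS_COMPRESSED_TENSORS = True
except ImportError:
    load_compressed_model = None
    HAS_COMPRESSED_TENSORS = False
```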
@@ -14,32 +15,45 @@ class HunyuanTranslator:
         self._load_model()
 
     def _load_model(self):
-        """Load the pre-quantized FP8 model"""
-        print("Loading pre-quantized Hunyuan-MT FP8 model...")
+        """Load the pre-quantized FP8 model using Compressed Tensors"""
+        print("Loading Hunyuan-MT FP8 model with Compressed Tensors...")
 
         try:
+            # Load tokenizer first
             self.tokenizer = AutoTokenizer.from_pretrained(
                 self.model_name,
                 cache_dir='/tmp/cache',
                 trust_remote_code=True
             )
 
-            # Load the pre-quantized FP8 model - let transformers handle the quantization automatically
-            self.model = AutoModelForCausalLM.from_pretrained(
+            # Load model with Compressed Tensors
+            print("Loading model with compressed_tensors...")
+            self.model = load_compressed_model(
                 self.model_name,
-                device_map="auto",
-                trust_remote_code=True,  # Important for custom models
-                cache_dir='/tmp/cache',
-                torch_dtype=torch.float16,  # Use fp16 as fallback, model will use its native fp8 where available
+                device="auto",  # Automatically use GPU if available
+                torch_dtype=torch.float16,
+                trust_remote_code=True
             )
 
-            print("FP8 model loaded successfully!")
+            print("FP8 model loaded successfully with Compressed Tensors!")
             print(f"Model device: {self.model.device}")
             print(f"Model dtype: {next(self.model.parameters()).dtype}")
 
         except Exception as e:
-            print(f"Error loading model: {e}")
-            raise
+            print(f"Error loading model with Compressed Tensors: {e}")
+            # Fallback to standard loading without compression
+            try:
+                print("Trying standard loading as fallback...")
+                self.model = AutoModelForCausalLM.from_pretrained(
+                    self.model_name,
+                    device_map="auto",
+                    torch_dtype=torch.float16,
+                    trust_remote_code=True,
+                    cache_dir='/tmp/cache'
+                )
+                print("Model loaded successfully with standard method!")
+            except Exception as e2:
+                raise Exception(f"Both Compressed Tensors and standard loading failed: {e2}")
 
     def translate_ja_to_en(self, input_text: str) -> str:
         """Translate Japanese to English using FP8 model"""
@@ -47,63 +61,84 @@
             return "Please enter some Japanese text to translate."
 
         # Limit input length for Spaces
-        if len(input_text) > 2000:
-            return "Input too long. Please keep under 2000 characters for this demo."
+        if len(input_text) > 1500:
+            return "Input too long. Please keep under 1500 characters for this demo."
 
         try:
-            # Japanese to English specific prompt
-            prompt = f"Translate the following Japanese text to English. Provide only the translation without additional explanations:\n\nJapanese: {input_text}\nEnglish:"
-
-            messages = [{"role": "user", "content": prompt}]
-
-            # Apply chat template
-            tokenized_chat = self.tokenizer.apply_chat_template(
-                messages,
-                tokenize=True,
-                add_generation_prompt=True,
-                return_tensors="pt",
+            # Clean and prepare the input text
+            input_text = input_text.strip()
+
+            # Create a clear translation prompt
+            prompt = f"""Translate the following Japanese text to English. Provide only the translation without any additional explanations or notes.
+
+Japanese: {input_text}
+
+English:"""
+
+            # Tokenize the input
+            inputs = self.tokenizer(
+                prompt,
+                return_tensors="pt",
+                truncation=True,
+                max_length=1024
             )
 
-            # Generate with FP8 model
+            # Move inputs to the same device as model
+            inputs = {k: v.to(self.model.device) for k, v in inputs.items()}
+
+            # Generate translation
             with torch.no_grad():
                 outputs = self.model.generate(
-                    tokenized_chat.to(self.model.device),
+                    **inputs,
                     max_new_tokens=512,
                     temperature=0.7,
                     do_sample=True,
                     top_p=0.9,
                     repetition_penalty=1.1,
                     pad_token_id=self.tokenizer.eos_token_id,
-                    eos_token_id=self.tokenizer.eos_token_id
+                    eos_token_id=self.tokenizer.eos_token_id,
+                    num_return_sequences=1
                 )
 
-            # Decode output
-            output_text = self.tokenizer.decode(outputs[0], skip_special_tokens=True)
+            # Decode the output
+            generated_text = self.tokenizer.decode(outputs[0], skip_special_tokens=True)
 
-            # Extract translation (remove prompt and get only the English part)
-            if "English:" in output_text:
-                output_text = output_text.split("English:")[-1].strip()
+            # Extract just the translation part (remove the prompt)
+            if prompt in generated_text:
+                translation = generated_text.replace(prompt, "").strip()
+            else:
+                # If prompt isn't found, try to extract after "English:"
+                if "English:" in generated_text:
+                    translation = generated_text.split("English:")[-1].strip()
+                else:
+                    translation = generated_text.strip()
 
-            # Clean up any remaining special tokens or markers
-            output_text = output_text.replace("<|endoftext|>", "").strip()
-            output_text = output_text.replace("</s>", "").strip()
+            # Clean up the translation
+            translation = translation.split('\n')[0].strip()  # Take first line only
+            translation = translation.replace('"', '').strip()
 
-            return output_text if output_text else "No translation generated. Please try again."
+            return translation if translation else "No translation generated. Please try again."
 
         except Exception as e:
             return f"Error during translation: {str(e)}"
 
 def create_translation_interface():
-    """Create the Gradio interface optimized for Spaces"""
+    """Create the Gradio interface for Japanese to English translation"""
 
     # Initialize translator
-    translator = HunyuanTranslator()
-
-    def translate_function(input_text):
-        """Wrapper function for Gradio"""
-        return translator.translate_ja_to_en(input_text)
+    try:
+        translator = HunyuanTranslator()
+
+        def translate_function(input_text):
+            return translator.translate_ja_to_en(input_text)
+
+    except Exception as e:
+        print(f"Failed to initialize translator: {e}")
+
+        def translate_function(input_text):
+            return f"Model initialization failed: {str(e)}\n\nPlease check that 'compressed-tensors' is installed and try again."
 
-    # Custom CSS for better appearance on Spaces
+    # Custom CSS for better appearance
     custom_css = """
     .gradio-container {
         max-width: 900px !important;
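One bug worth flagging in the new initialization fallback: Python unbinds the `except ... as e` variable when the handler exits, so the inner `translate_function` raises a `NameError` the first time a user actually clicks Translate. A corrected sketch that captures the message first (`init_error` is our name, not the commit's):

```python
    # Initialize translator
    try:
        translator = HunyuanTranslator()

        def translate_function(input_text):
            return translator.translate_ja_to_en(input_text)

    except Exception as e:
        init_error = str(e)  # capture now: `e` is unbound once the handler exits
        print(f"Failed to initialize translator: {init_error}")

        def translate_function(input_text):
            return (f"Model initialization failed: {init_error}\n\n"
                    "Please check that 'compressed-tensors' is installed and try again.")
```

Separately, note the size mismatch this hunk introduces: the UI accepts up to 1500 characters, but the tokenizer truncates at max_length=1024 tokens, and Japanese often tokenizes at roughly a token per character or more, so long inputs can be silently cut mid-sentence.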
@@ -114,13 +149,16 @@ def create_translation_interface():
         margin: auto;
         padding: 20px;
     }
-    .example-text {
-        font-size: 0.9em;
-        color: #666;
+    .japanese-text {
+        font-family: "Hiragino Sans", "Yu Gothic", "Meiryo", sans-serif;
+    }
+    .translation-box {
+        border-left: 3px solid #4CAF50;
+        padding-left: 15px;
     }
     """
 
-    # Create Gradio interface optimized for Spaces
+    # Create Gradio interface
     with gr.Blocks(
         title="Japanese to English Translation - Hunyuan-MT FP8",
         theme=gr.themes.Soft(),
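For readers unfamiliar with how these selectors bind: the classes declared here only match because the textboxes later opt in through `elem_classes`, and the stylesheet itself is supplied via `gr.Blocks(css=custom_css)`. A self-contained sketch (the descendant selector is an assumption about the rendered DOM; some Gradio versions need it to reach the inner textarea):

```python
import gradio as gr

# Stylesheet goes to gr.Blocks(css=...); components opt in via elem_classes.
css = """
.japanese-text textarea { font-family: "Yu Gothic", "Meiryo", sans-serif; }
.translation-box { border-left: 3px solid #4CAF50; }
"""

with gr.Blocks(css=css) as demo:
    box = gr.Textbox(elem_classes=["japanese-text"])
```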
@@ -130,22 +168,22 @@
         gr.Markdown(
             """
             # 🇯🇵 → 🇺🇸 Japanese to English Translation
-            **Model:** `tencent/Hunyuan-MT-7B-fp8` (7B parameters, pre-quantized FP8)
-            **Specialization:** High-quality Japanese → English translation
+            **Model:** `tencent/Hunyuan-MT-7B-fp8` **Technology:** Compressed Tensors FP8 Quantization
 
-            *Enter Japanese text below and click Translate*
+            *Fast, high-quality Japanese to English translation using optimized FP8 model*
             """
         )
 
         with gr.Row(equal_height=False):
             with gr.Column(scale=1):
+                gr.Markdown("### 📥 Japanese Input")
                 input_text = gr.Textbox(
-                    label="Japanese Text Input",
-                    placeholder="日本語のテキストを入力してください... (Enter Japanese text here)",
-                    lines=5,
+                    label="",
+                    placeholder="日本語のテキストを入力してください...\n(Enter Japanese text here)",
+                    lines=6,
                     max_lines=8,
                     show_copy_button=True,
-                    elem_id="input-text"
+                    elem_classes=["japanese-text"]
                 )
 
                 with gr.Row():
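A minor API point in this hunk: now that the heading comes from `gr.Markdown`, the idiomatic way to suppress the built-in label is `show_label=False` rather than `label=""`, which can still reserve empty label markup in some Gradio versions. A sketch of the same component:

```python
import gradio as gr

# Equivalent input box with the built-in label suppressed explicitly.
input_text = gr.Textbox(
    show_label=False,   # the 📥 heading above already labels the field
    placeholder="日本語のテキストを入力してください...",
    lines=6,
    max_lines=8,
    show_copy_button=True,
    elem_classes=["japanese-text"],
)
```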
@@ -163,17 +201,18 @@
                 )
 
             with gr.Column(scale=1):
+                gr.Markdown("### 📤 English Translation")
                 output_text = gr.Textbox(
-                    label="English Translation",
+                    label="",
                     placeholder="Translation will appear here...",
-                    lines=5,
+                    lines=6,
                     max_lines=8,
                     show_copy_button=True,
-                    elem_id="output-text"
+                    elem_classes=["translation-box"]
                 )
 
         # Examples section
-        gr.Markdown("### 💡 Try these examples:")
+        gr.Markdown("### 💡 Example Translations")
         examples = gr.Examples(
             examples=[
                 ["こんにちは、元気ですか?"],
@@ -181,7 +220,9 @@
                 ["機械学習と人工知能は現代技術の重要な分野です。"],
                 ["このレストランの料理はとても美味しいです。"],
                 ["明日の会議は午後二時から始まります。"],
-                ["日本の文化は非常に興味深いと思います。"]
+                ["日本の文化は非常に興味深いと思います。"],
+                ["新しいプロジェクトの提案書を作成しました。"],
+                ["電車の遅延により、到着が30分ほど遅れます。"]
             ],
             inputs=input_text,
             outputs=output_text,
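The event wiring that consumes these components falls between hunks and is unchanged by this commit; for orientation, a self-contained sketch of the usual pattern (component names match the diff, the identity lambda stands in for translate_function, which is defined earlier in app.py):

```python
import gradio as gr

with gr.Blocks() as demo:
    input_text = gr.Textbox(lines=6)
    translate_btn = gr.Button("Translate")
    output_text = gr.Textbox(lines=6)

    # Button click and Enter key both route input -> handler -> output.
    translate_btn.click(fn=lambda t: t, inputs=input_text, outputs=output_text)
    input_text.submit(fn=lambda t: t, inputs=input_text, outputs=output_text)
```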
@@ -211,22 +252,29 @@
             outputs=output_text
         )
 
-        # Additional info
+        # Technical details
         gr.Markdown(
             """
             ---
-            ### ℹ️ Usage Notes:
-            - **Model**: tencent/Hunyuan-MT-7B-fp8 (7B parameters, FP8 quantized)
-            - **Optimized** specifically for Japanese → English translation
-            - **Max input length**: ~2000 characters
-            - **Translation time**: Usually 10-30 seconds
-            - **Memory efficient**: Uses FP8 quantization for faster inference
-
-            ### 🛠️ Technical Details:
-            - Pre-quantized to FP8 (8-bit floating point)
-            - ~3-4GB memory footprint
-            - Optimized for GPU inference
-            - Supports long-form translation
+            ### 🛠️ Technical Information
+
+            **Model Details:**
+            - **Base Model**: Hunyuan-MT 7B
+            - **Quantization**: FP8 (8-bit floating point) via Compressed Tensors
+            - **Memory Usage**: ~3-4GB
+            - **Specialization**: Japanese ↔ English translation
+
+            **Optimization Features:**
+            - FP8 quantization for faster inference
+            - Compressed Tensors for efficient storage
+            - GPU acceleration support
+            - ✅ Batch processing capable
+
+            **Usage Tips:**
+            - Keep inputs under 1500 characters for best results
+            - Translation takes 5-15 seconds typically
+            - Model works best with complete sentences
+            - Handles technical and casual Japanese well
             """
         )
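Back on the extraction logic in translate_ja_to_en: decoding the full sequence and string-replacing the prompt is fragile (a decode round-trip rarely reproduces the prompt byte for byte), and `translation.split('\n')[0]` silently drops every line after the first of a multi-sentence translation. Slicing the prompt tokens off before decoding avoids both problems; a sketch reusing the `inputs`/`outputs` names from the hunk:

```python
# Decode only the freshly generated tokens: the prompt never enters the
# output, so no string replacement or first-line truncation is needed.
prompt_len = inputs["input_ids"].shape[1]
translation = self.tokenizer.decode(
    outputs[0][prompt_len:],
    skip_special_tokens=True,
).strip()
```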
 
 
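A last thought on the generation settings carried through this commit: `do_sample=True` with temperature 0.7 returns a different translation on every click, which is confusing in an MT demo, and translation quality is usually evaluated with deterministic decoding. A drop-in variant assuming the same `inputs` dict (whether Hunyuan-MT recommends specific sampling parameters is worth checking against its model card):

```python
# Deterministic decoding variant for reproducible translations.
outputs = self.model.generate(
    **inputs,
    max_new_tokens=512,
    do_sample=False,        # greedy (or set num_beams>1 for beam search)
    repetition_penalty=1.1,
    pad_token_id=self.tokenizer.eos_token_id,
    eos_token_id=self.tokenizer.eos_token_id,
)
```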