MegaTronX committed on
Commit 3356f4d · verified
1 Parent(s): aa3631e

Update app.py

Files changed (1)
  1. app.py +17 -30
app.py CHANGED
@@ -2,7 +2,6 @@ import gradio as gr
 from transformers import AutoTokenizer, AutoModelForCausalLM
 import torch
 import os
-from compressed_tensors import load_compressed_model

 # Set cache directory for Spaces
 os.environ['TRANSFORMERS_CACHE'] = '/tmp/cache'
@@ -15,8 +14,8 @@ class HunyuanTranslator:
         self._load_model()

     def _load_model(self):
-        """Load the pre-quantized FP8 model using Compressed Tensors"""
-        print("Loading Hunyuan-MT FP8 model with Compressed Tensors...")
+        """Load the pre-quantized FP8 model"""
+        print("Loading Hunyuan-MT FP8 model...")

         try:
             # Load tokenizer first
@@ -26,34 +25,23 @@ class HunyuanTranslator:
                 trust_remote_code=True
             )

-            # Load model with Compressed Tensors
-            print("Loading model with compressed_tensors...")
-            self.model = load_compressed_model(
+            # For Compressed Tensors models, use the standard from_pretrained
+            # The quantization is automatically handled by the model files
+            self.model = AutoModelForCausalLM.from_pretrained(
                 self.model_name,
-                device="auto",  # Automatically use GPU if available
-                torch_dtype=torch.float16,
-                trust_remote_code=True
+                device_map="auto",
+                torch_dtype=torch.float16,  # Use fp16 as base dtype
+                trust_remote_code=True,
+                cache_dir='/tmp/cache'
             )

-            print("FP8 model loaded successfully with Compressed Tensors!")
+            print("FP8 model loaded successfully!")
             print(f"Model device: {self.model.device}")
             print(f"Model dtype: {next(self.model.parameters()).dtype}")

         except Exception as e:
-            print(f"Error loading model with Compressed Tensors: {e}")
-            # Fallback to standard loading without compression
-            try:
-                print("Trying standard loading as fallback...")
-                self.model = AutoModelForCausalLM.from_pretrained(
-                    self.model_name,
-                    device_map="auto",
-                    torch_dtype=torch.float16,
-                    trust_remote_code=True,
-                    cache_dir='/tmp/cache'
-                )
-                print("Model loaded successfully with standard method!")
-            except Exception as e2:
-                raise Exception(f"Both Compressed Tensors and standard loading failed: {e2}")
+            print(f"Error loading model: {e}")
+            raise Exception(f"Could not load the Hunyuan-MT model: {str(e)}")

     def translate_ja_to_en(self, input_text: str) -> str:
         """Translate Japanese to English using FP8 model"""
@@ -136,7 +124,7 @@ def create_translation_interface():
         print(f"Failed to initialize translator: {e}")

         def translate_function(input_text):
-            return f"Model initialization failed: {str(e)}\n\nPlease check that 'compressed-tensors' is installed and try again."
+            return f"Model initialization failed: {str(e)}\n\nPlease check the Space logs for details."

     # Custom CSS for better appearance
     custom_css = """
@@ -168,7 +156,7 @@ def create_translation_interface():
         gr.Markdown(
             """
             # 🇯🇵 → 🇺🇸 Japanese to English Translation
-            **Model:** `tencent/Hunyuan-MT-7B-fp8` • **Technology:** Compressed Tensors FP8 Quantization
+            **Model:** `tencent/Hunyuan-MT-7B-fp8` • **Technology:** FP8 Quantization

             *Fast, high-quality Japanese to English translation using optimized FP8 model*
             """
@@ -227,7 +215,7 @@ def create_translation_interface():
             inputs=input_text,
             outputs=output_text,
             fn=translate_function,
-            cache_examples=True,
+            cache_examples=False,
             label="Click any example to try:"
         )

@@ -260,15 +248,14 @@ def create_translation_interface():

             **Model Details:**
             - **Base Model**: Hunyuan-MT 7B
-            - **Quantization**: FP8 (8-bit floating point) via Compressed Tensors
+            - **Quantization**: FP8 (8-bit floating point)
             - **Memory Usage**: ~3-4GB
             - **Specialization**: Japanese ↔ English translation

             **Optimization Features:**
             - ✅ FP8 quantization for faster inference
-            - ✅ Compressed Tensors for efficient storage
             - ✅ GPU acceleration support
-            - ✅ Batch processing capable
+            - ✅ Efficient memory usage

             **Usage Tips:**
             - Keep inputs under 1500 characters for best results
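Note on the new loading path: `transformers` loads compressed-tensors FP8 checkpoints through the plain `from_pretrained` call, picking the scheme up from the quantization config shipped with the model files, so no separate loader is needed. A minimal standalone sketch of that path, assuming the `compressed-tensors` and `accelerate` packages are installed in the Space and the checkpoint carries its quantization config (illustrative, not part of the commit):

```python
# Standalone sketch of the loading path the commit switches to.
# Assumes `transformers`, `torch`, `accelerate`, and `compressed-tensors` are installed.
import torch
from transformers import AutoModelForCausalLM, AutoTokenizer

MODEL_NAME = "tencent/Hunyuan-MT-7B-fp8"

tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME, trust_remote_code=True)

# No special loader is required: from_pretrained reads the quantization config
# stored alongside the weights and applies the compressed-tensors FP8 scheme.
model = AutoModelForCausalLM.from_pretrained(
    MODEL_NAME,
    device_map="auto",          # place weights on GPU when one is available
    torch_dtype=torch.float16,  # dtype for the non-quantized tensors
    trust_remote_code=True,
)

print(model.device, next(model.parameters()).dtype)
```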
 
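Note on the `cache_examples=True` → `False` change: with caching enabled, Gradio runs every example through `fn` at build time to pre-compute outputs, which here would mean full FP8 generation passes before the Space finishes starting; disabling it defers that work until an example is clicked. A minimal sketch of the wiring (the example sentences and Textbox labels are illustrative; only the argument names mirror app.py):

```python
# Minimal sketch of the examples wiring; translate_function is a stand-in here.
import gradio as gr

def translate_function(text: str) -> str:
    # placeholder for the real FP8 translator call in app.py
    return f"[translated] {text}"

with gr.Blocks() as demo:
    input_text = gr.Textbox(label="Japanese text", lines=4)
    output_text = gr.Textbox(label="English translation", lines=4)

    gr.Examples(
        examples=["こんにちは、元気ですか？", "今日は良い天気ですね。"],
        inputs=input_text,
        outputs=output_text,
        fn=translate_function,
        cache_examples=False,  # don't pre-compute example outputs at startup
        label="Click any example to try:",
    )

demo.launch()
```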