MegaTronX committed on
Commit 3356f4d · verified
1 Parent(s): aa3631e

Update app.py

Files changed (1)
  1. app.py +17 -30
app.py CHANGED
@@ -2,7 +2,6 @@ import gradio as gr
 from transformers import AutoTokenizer, AutoModelForCausalLM
 import torch
 import os
-from compressed_tensors import load_compressed_model

 # Set cache directory for Spaces
 os.environ['TRANSFORMERS_CACHE'] = '/tmp/cache'
@@ -15,8 +14,8 @@ class HunyuanTranslator:
         self._load_model()

     def _load_model(self):
-        """Load the pre-quantized FP8 model using Compressed Tensors"""
-        print("Loading Hunyuan-MT FP8 model with Compressed Tensors...")
+        """Load the pre-quantized FP8 model"""
+        print("Loading Hunyuan-MT FP8 model...")

         try:
             # Load tokenizer first
@@ -26,34 +25,23 @@ class HunyuanTranslator:
                 trust_remote_code=True
             )

-            # Load model with Compressed Tensors
-            print("Loading model with compressed_tensors...")
-            self.model = load_compressed_model(
+            # For Compressed Tensors models, use the standard from_pretrained
+            # The quantization is automatically handled by the model files
+            self.model = AutoModelForCausalLM.from_pretrained(
                 self.model_name,
-                device="auto",  # Automatically use GPU if available
-                torch_dtype=torch.float16,
-                trust_remote_code=True
+                device_map="auto",
+                torch_dtype=torch.float16,  # Use fp16 as base dtype
+                trust_remote_code=True,
+                cache_dir='/tmp/cache'
             )

-            print("FP8 model loaded successfully with Compressed Tensors!")
+            print("FP8 model loaded successfully!")
             print(f"Model device: {self.model.device}")
             print(f"Model dtype: {next(self.model.parameters()).dtype}")

         except Exception as e:
-            print(f"Error loading model with Compressed Tensors: {e}")
-            # Fallback to standard loading without compression
-            try:
-                print("Trying standard loading as fallback...")
-                self.model = AutoModelForCausalLM.from_pretrained(
-                    self.model_name,
-                    device_map="auto",
-                    torch_dtype=torch.float16,
-                    trust_remote_code=True,
-                    cache_dir='/tmp/cache'
-                )
-                print("Model loaded successfully with standard method!")
-            except Exception as e2:
-                raise Exception(f"Both Compressed Tensors and standard loading failed: {e2}")
+            print(f"Error loading model: {e}")
+            raise Exception(f"Could not load the Hunyuan-MT model: {str(e)}")

     def translate_ja_to_en(self, input_text: str) -> str:
         """Translate Japanese to English using FP8 model"""
@@ -136,7 +124,7 @@ def create_translation_interface():
         print(f"Failed to initialize translator: {e}")

         def translate_function(input_text):
-            return f"Model initialization failed: {str(e)}\n\nPlease check that 'compressed-tensors' is installed and try again."
+            return f"Model initialization failed: {str(e)}\n\nPlease check the Space logs for details."

     # Custom CSS for better appearance
     custom_css = """
@@ -168,7 +156,7 @@ def create_translation_interface():
         gr.Markdown(
             """
             # 🇯🇵 → 🇺🇸 Japanese to English Translation
-            **Model:** `tencent/Hunyuan-MT-7B-fp8` • **Technology:** Compressed Tensors FP8 Quantization
+            **Model:** `tencent/Hunyuan-MT-7B-fp8` • **Technology:** FP8 Quantization

             *Fast, high-quality Japanese to English translation using optimized FP8 model*
             """
@@ -227,7 +215,7 @@ def create_translation_interface():
             inputs=input_text,
             outputs=output_text,
             fn=translate_function,
-            cache_examples=True,
+            cache_examples=False,
             label="Click any example to try:"
         )

@@ -260,15 +248,14 @@ def create_translation_interface():

             **Model Details:**
             - **Base Model**: Hunyuan-MT 7B
-            - **Quantization**: FP8 (8-bit floating point) via Compressed Tensors
+            - **Quantization**: FP8 (8-bit floating point)
             - **Memory Usage**: ~3-4GB
             - **Specialization**: Japanese ↔ English translation

             **Optimization Features:**
             - ✅ FP8 quantization for faster inference
-            - ✅ Compressed Tensors for efficient storage
             - ✅ GPU acceleration support
-            - ✅ Batch processing capable
+            - ✅ Efficient memory usage

             **Usage Tips:**
             - Keep inputs under 1500 characters for best results
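Note on the new loading path: `transformers` loads compressed-tensors FP8 checkpoints through the plain `from_pretrained` call, picking the scheme up from the quantization config shipped with the model files, so no separate loader is needed. A minimal standalone sketch of that path, assuming the `compressed-tensors` and `accelerate` packages are installed in the Space and the checkpoint carries its quantization config (illustrative, not part of the commit):

```python
# Standalone sketch of the loading path the commit switches to.
# Assumes `transformers`, `torch`, `accelerate`, and `compressed-tensors` are installed.
import torch
from transformers import AutoModelForCausalLM, AutoTokenizer

MODEL_NAME = "tencent/Hunyuan-MT-7B-fp8"

tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME, trust_remote_code=True)

# No special loader is required: from_pretrained reads the quantization config
# stored alongside the weights and applies the compressed-tensors FP8 scheme.
model = AutoModelForCausalLM.from_pretrained(
    MODEL_NAME,
    device_map="auto",          # place weights on GPU when one is available
    torch_dtype=torch.float16,  # dtype for the non-quantized tensors
    trust_remote_code=True,
)

print(model.device, next(model.parameters()).dtype)
```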
 
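Note on the `cache_examples=True` → `False` change: with caching enabled, Gradio runs every example through `fn` at build time to pre-compute outputs, which here would mean full FP8 generation passes before the Space finishes starting; disabling it defers that work until an example is clicked. A minimal sketch of the wiring (the example sentences and Textbox labels are illustrative; only the argument names mirror app.py):

```python
# Minimal sketch of the examples wiring; translate_function is a stand-in here.
import gradio as gr

def translate_function(text: str) -> str:
    # placeholder for the real FP8 translator call in app.py
    return f"[translated] {text}"

with gr.Blocks() as demo:
    input_text = gr.Textbox(label="Japanese text", lines=4)
    output_text = gr.Textbox(label="English translation", lines=4)

    gr.Examples(
        examples=["こんにちは、元気ですか？", "今日は良い天気ですね。"],
        inputs=input_text,
        outputs=output_text,
        fn=translate_function,
        cache_examples=False,  # don't pre-compute example outputs at startup
        label="Click any example to try:",
    )

demo.launch()
```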