hamxaameer committed
Commit dab6cfd · verified · 1 Parent(s): 06dde32

Update app.py

Files changed (1)
  1. app.py +62 -105
app.py CHANGED
@@ -61,12 +61,11 @@ CONFIG = {
     "max_tokens": 600,  # Allow natural length responses
 }
 
-# Local LLM configuration for Hugging Face Spaces
-# TinyLlama: 1.1B parameters, fast on CPU, reliable generation
-# Alternative: google/flan-t5-base (smaller, faster)
-LOCAL_LLM_MODEL = os.environ.get("LOCAL_LLM_MODEL", "TinyLlama/TinyLlama-1.1B-Chat-v1.0")
-USE_8BIT_QUANTIZATION = False  # Not needed for TinyLlama
-USE_REMOTE_LLM = False
+# LLM configuration - LOCAL ONLY
+# Using Google Flan-T5: fast on CPU, reliable, no timeouts
+LOCAL_LLM_MODEL = os.environ.get("LOCAL_LLM_MODEL", "google/flan-t5-base")
+USE_8BIT_QUANTIZATION = False
+USE_REMOTE_LLM = False  # LOCAL ONLY
 
 # Natural flow mode: No word limits, let model decide length
 MAX_CONTEXT_LENGTH = 400  # Reduced for faster generation
@@ -95,14 +94,15 @@ if HF_INFERENCE_API_KEY:
 # ============================================================================
 
 def initialize_llm():
-    """Initialize TinyLlama model locally with CPU optimizations.
+    """Initialize Flan-T5 for fast local CPU generation.
 
-    TinyLlama is fast, reliable, and works well on CPU without device issues.
+    Flan-T5 is an encoder-decoder model optimized for instruction following.
+    Much faster than decoder-only models like TinyLlama on CPU.
     """
-    global LOCAL_LLM_MODEL, USE_8BIT_QUANTIZATION
+    global LOCAL_LLM_MODEL
 
-    logger.info(f"🔄 Initializing local LLM: {LOCAL_LLM_MODEL}")
-    logger.info("   Using CPU-optimized configuration for Hugging Face Spaces")
+    logger.info(f"🔄 Initializing Flan-T5: {LOCAL_LLM_MODEL}")
+    logger.info("   Optimized for fast CPU inference")
 
     try:
         from transformers import AutoTokenizer, AutoModelForCausalLM
@@ -111,102 +111,71 @@ def initialize_llm():
         device = "cuda" if torch.cuda.is_available() else "cpu"
         logger.info(f"   Target device: {device}")
 
+        from transformers import T5ForConditionalGeneration, T5Tokenizer
+
         # Load tokenizer
         logger.info("   Loading tokenizer...")
-        tokenizer = AutoTokenizer.from_pretrained(
-            LOCAL_LLM_MODEL,
-            trust_remote_code=True
-        )
+        tokenizer = T5Tokenizer.from_pretrained(LOCAL_LLM_MODEL)
+        logger.info(f"   Tokenizer ready")
 
-        # Configure tokenizer
-        if tokenizer.pad_token is None:
-            tokenizer.pad_token = tokenizer.eos_token
-        if tokenizer.pad_token_id is None:
-            tokenizer.pad_token_id = tokenizer.eos_token_id
-
-        logger.info(f"   Tokenizer ready: {len(tokenizer)} tokens")
-
-        # Load model - simple CPU configuration
-        logger.info("   Loading model (20-40 seconds)...")
-        model = AutoModelForCausalLM.from_pretrained(
+        # Load model - Flan-T5 is much lighter
+        logger.info("   Loading model (10-20 seconds)...")
+        model = T5ForConditionalGeneration.from_pretrained(
             LOCAL_LLM_MODEL,
-            trust_remote_code=True,
-            torch_dtype=torch.float32,
-            low_cpu_mem_usage=True
+            torch_dtype=torch.float32
         )
 
-        # Move to CPU explicitly
         model = model.to('cpu')
+        logger.info("   Model loaded on CPU")
 
-        # Apply advanced optimizations for faster inference
-        if hasattr(model, 'config'):
-            # Reduce attention heads computation for speed
-            model.config.use_cache = True  # Enable KV cache for faster generation
-            model.config.output_attentions = False
-            model.config.output_hidden_states = False
-
-        # Move to eval mode to disable dropout and save memory
+        # Optimize for inference
         model.eval()
-
-        # Skip torch.compile - can cause issues on Hugging Face Spaces
-        logger.info("   Model ready for inference")
+        logger.info("   Model ready")
 
         # Store model and tokenizer directly for faster inference
         # We'll use direct generation instead of pipeline
         logger.info("   Configuring direct model inference (faster than pipeline)...")
 
-        # Create a simple wrapper that mimics pipeline interface
-        class FastLLMGenerator:
+        # Flan-T5 generator - simple and fast
+        class FlanT5Generator:
             def __init__(self, model, tokenizer):
                 self.model = model
                 self.tokenizer = tokenizer
 
-            def __call__(self, prompt, max_new_tokens=150, temperature=0.7, top_p=0.9,
-                         do_sample=True, repetition_penalty=1.1, **kwargs):
-                """Direct generation - faster and more reliable"""
+            def __call__(self, prompt, max_new_tokens=128, temperature=0.7, **kwargs):
+                """Generate with Flan-T5 - fast on CPU"""
                 try:
-                    # Tokenize
-                    inputs = self.tokenizer(prompt, return_tensors="pt", truncation=True, max_length=400)
-                    input_ids = inputs["input_ids"].to('cpu')
-                    attention_mask = inputs.get("attention_mask", None)
-                    if attention_mask is not None:
-                        attention_mask = attention_mask.to('cpu')
+                    # Tokenize input
+                    inputs = self.tokenizer(prompt, return_tensors="pt", max_length=512, truncation=True)
+                    inputs = {k: v.to('cpu') for k, v in inputs.items()}
 
-                    # Generate
+                    # Generate - Flan-T5 is fast even on CPU
                     with torch.no_grad():
                         outputs = self.model.generate(
-                            input_ids,
-                            attention_mask=attention_mask,
+                            **inputs,
                             max_new_tokens=max_new_tokens,
-                            temperature=temperature if do_sample else 1.0,
-                            top_p=top_p if do_sample else 1.0,
-                            do_sample=do_sample,
-                            repetition_penalty=repetition_penalty,
-                            pad_token_id=self.tokenizer.pad_token_id,
-                            eos_token_id=self.tokenizer.eos_token_id
+                            num_beams=2,  # Beam search for quality
+                            early_stopping=True,
+                            no_repeat_ngram_size=3
                         )
 
-                    # Decode only the new tokens
-                    generated_ids = outputs[0][input_ids.shape[1]:]
-                    generated_text = self.tokenizer.decode(generated_ids, skip_special_tokens=True)
-
+                    # Decode
+                    generated_text = self.tokenizer.decode(outputs[0], skip_special_tokens=True)
                     return [{"generated_text": generated_text.strip()}]
 
                 except Exception as e:
                     logger.error(f"Generation error: {e}")
-                    import traceback
-                    logger.error(traceback.format_exc())
                     return [{"generated_text": ""}]
 
-        llm_client = FastLLMGenerator(model, tokenizer)
-        llm_client.tokenizer = tokenizer  # Add tokenizer reference for compatibility
+        llm_client = FlanT5Generator(model, tokenizer)
+        llm_client.tokenizer = tokenizer
 
         CONFIG["llm_model"] = LOCAL_LLM_MODEL
-        CONFIG["model_type"] = "tinyllama_local"
+        CONFIG["model_type"] = "flan_t5_local"
 
-        logger.info(f"✅ LLM initialized successfully: {LOCAL_LLM_MODEL}")
-        logger.info(f"   Model size: 1.1B parameters")
-        logger.info(f"   Expected speed: 5-15 seconds per response on CPU")
+        logger.info(f"✅ Flan-T5 initialized: {LOCAL_LLM_MODEL}")
+        logger.info(f"   Size: ~250M parameters (base model)")
+        logger.info(f"   Speed: 3-8 seconds per response")
 
         return llm_client
 
@@ -222,34 +191,30 @@ def initialize_llm():
         raise Exception(f"Failed to initialize LLM: {str(e)}")
 
 
-def remote_generate(prompt: str, max_new_tokens: int = 512, temperature: float = 0.7, top_p: float = 0.9) -> str:
-    """Call the Hugging Face Inference API for remote generation. Requires
-    `HF_INFERENCE_API_KEY` env var to be set and a model name in
-    `REMOTE_LLM_MODEL`.
+def remote_generate(prompt: str, max_new_tokens: int = 200, temperature: float = 0.7, top_p: float = 0.9) -> str:
+    """Call Hugging Face Inference API - fast and reliable.
 
-    PHI models work best with clear instruction formatting. This function
-    handles both the standard HF Inference API and PHI-specific response parsing.
+    Uses Qwen2.5 model optimized for fast inference.
     """
     if not HF_INFERENCE_API_KEY:
         raise Exception("HF_INFERENCE_API_KEY not set for remote generation")
 
-    # Use the HF Inference API endpoint (not router for better PHI compatibility)
+    # Use Inference API
     api_url = f"https://api-inference.huggingface.co/models/{REMOTE_LLM_MODEL}"
    headers = {"Authorization": f"Bearer {HF_INFERENCE_API_KEY}"}
 
-    # PHI models prefer simple parameters; avoid return_full_text which can cause issues
+    # Simple parameters for fast inference
     payload = {
         "inputs": prompt,
         "parameters": {
             "max_new_tokens": max_new_tokens,
             "temperature": temperature,
             "top_p": top_p,
-            "do_sample": True,
-            "repetition_penalty": 1.1
+            "return_full_text": False
         }
     }
 
-    logger.info(f"   → Remote PHI inference to {REMOTE_LLM_MODEL} (tokens={max_new_tokens}, temp={temperature})")
+    logger.info(f"   → Remote inference (tokens={max_new_tokens})")
     try:
         r = requests.post(api_url, headers=headers, json=payload, timeout=90)
     except Exception as e:
@@ -277,30 +242,26 @@ def remote_generate(prompt: str, max_new_tokens: int = 512, temperature: float =
         logger.error(f"   ✗ Remote inference returned error: {result.get('error')}")
         return ""
 
-    # Parse the generated text from various response formats
+    # Extract generated text
     generated_text = ""
 
     if isinstance(result, list) and result:
-        # HF Inference API returns [{"generated_text": "..."}]
         first = result[0]
         if isinstance(first, dict):
             generated_text = first.get("generated_text", "")
         else:
             generated_text = str(first)
-    elif isinstance(result, dict) and "generated_text" in result:
-        generated_text = result["generated_text"]
+    elif isinstance(result, dict):
+        generated_text = result.get("generated_text", str(result))
     else:
         generated_text = str(result)
-
-    # Clean up: PHI may return the prompt + completion, extract only new text
-    generated_text = generated_text.strip()
 
-    # If the response contains the original prompt, extract only the new completion
+    # Clean up
+    generated_text = generated_text.strip()
     if prompt in generated_text:
-        # Find where the prompt ends and new generation begins
-        prompt_end = generated_text.find(prompt) + len(prompt)
-        generated_text = generated_text[prompt_end:].strip()
+        generated_text = generated_text.replace(prompt, "").strip()
 
+    logger.info(f"   ✅ Generated {len(generated_text.split())} words remotely")
     return generated_text
 
 def initialize_embeddings():
@@ -714,7 +675,7 @@ def generate_llm_answer(
     # Ultra-simple prompt
     formatted_prompt = f"{prompt}\n\nAnswer:"
 
-    logger.info(f"   → Generating with TinyLlama (max_tokens={max_new_tokens})")
+    logger.info(f"   → Generating with Flan-T5 (max_tokens={max_new_tokens})")
 
     # MINIMAL settings - most restrictive for speed
     out = llm_client(
@@ -777,20 +738,16 @@
 
     A:"""
 
-    # AGGRESSIVE speed optimization
+    # Flan-T5 optimized parameters
     if attempt == 1:
-        temperature = 0.6  # Lower = faster
-        max_new_tokens = 150  # Much shorter
-        top_p = 0.85
-        repetition_penalty = 1.2
+        max_new_tokens = 128  # Flan-T5 is concise
+        temperature = 0.7
     else:
+        max_new_tokens = 150
         temperature = 0.7
-        max_new_tokens = 180
-        top_p = 0.9
-        repetition_penalty = 1.25
 
     logger.info(f"   → Starting generation with prompt: {base_prompt[:200]}...")
-    initial_output = call_model(base_prompt, max_new_tokens, temperature, top_p, repetition_penalty)
+    initial_output = call_model(base_prompt, max_new_tokens, temperature)
     response = (initial_output or '').strip()
 
     # Basic sanity checks
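
For readers who want to try the new local path on its own: a minimal sketch of the Flan-T5 generation flow that the diff's FlanT5Generator wrapper performs, assuming transformers, sentencepiece, and torch are installed. The prompt text below is illustrative, not taken from the app.

# Standalone sketch of the local Flan-T5 path introduced in this commit.
import torch
from transformers import T5ForConditionalGeneration, T5Tokenizer

model_name = "google/flan-t5-base"  # default LOCAL_LLM_MODEL in the new code
tokenizer = T5Tokenizer.from_pretrained(model_name)
model = T5ForConditionalGeneration.from_pretrained(model_name, torch_dtype=torch.float32)
model.to("cpu").eval()  # CPU inference, dropout disabled

prompt = "Summarize: Flan-T5 is an encoder-decoder model tuned to follow instructions."
inputs = tokenizer(prompt, return_tensors="pt", max_length=512, truncation=True)

with torch.no_grad():
    outputs = model.generate(
        **inputs,
        max_new_tokens=128,     # matches the wrapper's default
        num_beams=2,            # beam search, as in FlanT5Generator
        early_stopping=True,
        no_repeat_ngram_size=3,
    )

print(tokenizer.decode(outputs[0], skip_special_tokens=True))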
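
The remote path is kept in the code but disabled by USE_REMOTE_LLM = False. A minimal sketch of the request shape remote_generate sends to the Hugging Face Inference API follows; the model id and prompt are placeholders, and HF_INFERENCE_API_KEY must be set in the environment.

# Standalone sketch of the remote_generate request (placeholders noted above).
import os
import requests

REMOTE_LLM_MODEL = os.environ.get("REMOTE_LLM_MODEL", "some-org/some-model")  # placeholder id
HF_INFERENCE_API_KEY = os.environ["HF_INFERENCE_API_KEY"]

api_url = f"https://api-inference.huggingface.co/models/{REMOTE_LLM_MODEL}"
headers = {"Authorization": f"Bearer {HF_INFERENCE_API_KEY}"}
payload = {
    "inputs": "Question: What does this Space do?\n\nAnswer:",
    "parameters": {
        "max_new_tokens": 200,
        "temperature": 0.7,
        "top_p": 0.9,
        "return_full_text": False,
    },
}

r = requests.post(api_url, headers=headers, json=payload, timeout=90)
result = r.json()

# The API usually returns [{"generated_text": "..."}]; the app also handles dict and str fallbacks.
if isinstance(result, list) and result and isinstance(result[0], dict):
    print(result[0].get("generated_text", ""))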