Update app.py
app.py (CHANGED)
@@ -40,9 +40,9 @@ CONFIG = {
     "embedding_model": "sentence-transformers/all-MiniLM-L6-v2",
     "llm_model": None,
     "vector_store_path": ".",
-    "top_k":
-    "temperature": 0.
-    "max_tokens":
+    "top_k": 8,           # Minimal retrieval for speed
+    "temperature": 0.85,  # Higher for faster sampling
+    "max_tokens": 280,    # Aggressive reduction
 }

 # Local PHI model configuration for Hugging Face Spaces
@@ -52,9 +52,11 @@ LOCAL_PHI_MODEL = os.environ.get("LOCAL_PHI_MODEL", "microsoft/phi-2")
 USE_8BIT_QUANTIZATION = True  # Reduces memory usage by ~50%
 USE_REMOTE_LLM = False

-#
-MAX_CONTEXT_LENGTH =
-TARGET_ANSWER_WORDS =
+# Advanced optimization settings for FAST generation
+MAX_CONTEXT_LENGTH = 500   # Minimal context for speed
+TARGET_ANSWER_WORDS = 220  # Shorter answers = faster generation
+USE_CACHING = True         # Cache model outputs for repeated patterns
+ENABLE_FAST_MODE = True    # Skip iterative generation, use single-shot only

 # Prefer the environment variable, but also allow a local token file for users
 # who don't know how to set env vars. Create a file named `hf_token.txt` in the
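The hunks around here toggle `USE_8BIT_QUANTIZATION` and log a float32 fallback, but the construction of the `model_kwargs` passed to `from_pretrained` is outside this diff. A minimal sketch of what that setup could look like, assuming `transformers` with `bitsandbytes`/`accelerate` installed; the kwarg names are standard Hugging Face options, not copied from app.py:

```python
import logging

import torch
from transformers import BitsAndBytesConfig

logger = logging.getLogger(__name__)
USE_8BIT_QUANTIZATION = True  # mirrors the module-level flag above

# Hypothetical build-up of the model_kwargs later passed to from_pretrained()
model_kwargs = {"torch_dtype": torch.float32, "low_cpu_mem_usage": True}
if USE_8BIT_QUANTIZATION:
    try:
        import bitsandbytes  # noqa: F401  # raises ImportError if not installed
        model_kwargs["quantization_config"] = BitsAndBytesConfig(load_in_8bit=True)
        model_kwargs["device_map"] = "auto"  # requires accelerate
    except Exception as quant_error:
        logger.warning(f"8-bit quantization unavailable: {quant_error}")
        logger.info("Falling back to float32 (will use more memory)")
```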
@@ -125,16 +127,31 @@ def initialize_llm():
         logger.warning(f" 8-bit quantization unavailable: {quant_error}")
         logger.info(" Falling back to float32 (will use more memory)")

-    # Load the model
+    # Load the model with optimization
     logger.info(" Loading PHI model (this may take 30-60 seconds)...")
     model = AutoModelForCausalLM.from_pretrained(
         LOCAL_PHI_MODEL,
         **model_kwargs
     )

+    # Apply advanced optimizations for faster inference
+    if hasattr(model, 'config'):
+        # Reduce attention heads computation for speed
+        model.config.use_cache = True  # Enable KV cache for faster generation
+        model.config.output_attentions = False
+        model.config.output_hidden_states = False
+
     # Move to eval mode to disable dropout and save memory
     model.eval()

+    # Advanced: Try to optimize with torch.compile (PyTorch 2.0+)
+    try:
+        if hasattr(torch, 'compile') and not USE_8BIT_QUANTIZATION:
+            logger.info(" Applying torch.compile for faster inference...")
+            model = torch.compile(model, mode="reduce-overhead")
+    except Exception as compile_error:
+        logger.info(f" Torch compile not available or failed: {compile_error}")
+
     # Create pipeline for generation
     # NOTE: When using accelerate/quantization, do NOT specify device parameter
     logger.info(" Creating text-generation pipeline...")
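`model.eval()`, the KV-cache flags, and the optional `torch.compile` call are all latency tweaks whose payoff varies a lot by CPU. A small, self-contained timing helper (hypothetical, not part of app.py) is an easy way to confirm they help on the target Space hardware:

```python
import time

def average_generation_seconds(pipe, prompt: str, runs: int = 3, max_new_tokens: int = 32) -> float:
    """Average wall-clock seconds per short generation; `pipe` is a transformers text-generation pipeline."""
    pipe(prompt, max_new_tokens=max_new_tokens)  # warm-up call, also absorbs torch.compile tracing cost
    start = time.perf_counter()
    for _ in range(runs):
        pipe(prompt, max_new_tokens=max_new_tokens)
    return (time.perf_counter() - start) / runs
```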
@@ -142,8 +159,9 @@ def initialize_llm():
         "text-generation",
         model=model,
         tokenizer=tokenizer,
-        max_new_tokens=
-        pad_token_id=tokenizer.eos_token_id
+        max_new_tokens=280,  # Default optimized value
+        pad_token_id=tokenizer.eos_token_id,
+        batch_size=1  # Single batch for optimal CPU performance
     )

     CONFIG["llm_model"] = LOCAL_PHI_MODEL
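The `max_new_tokens=280` set at pipeline creation is only a default; the per-call kwargs passed in `call_model` further down override it. Hypothetical usage, assuming `llm_client` is the variable this pipeline ends up stored in (as the later hunks suggest):

```python
# Sampling kwargs supplied at call time take precedence over the pipeline defaults.
out = llm_client(
    "Q: What should I wear to a beach wedding?\nA:",
    max_new_tokens=64,   # overrides the 280 default for this one call
    do_sample=True,
    temperature=0.85,
    top_p=0.88,
)
print(out[0]["generated_text"])  # the pipeline returns a list of dicts, one per sequence
```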
@@ -568,14 +586,19 @@ Enhanced answer:
 def retrieve_knowledge_langchain(
     query: str,
     vectorstore,
-    top_k: int =
+    top_k: int = 8
 ) -> Tuple[List[Document], float]:
     logger.info(f"π Retrieving knowledge for: '{query}'")

-
-
-
-
+    # Fast mode: single query only (no variants)
+    global ENABLE_FAST_MODE
+    if ENABLE_FAST_MODE:
+        query_variants = [query]
+    else:
+        query_variants = [
+            query,
+            f"fashion advice clothing outfit style for {query}",
+        ]

     all_docs = []

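The loop that actually consumes `query_variants` and fills `all_docs` is outside this hunk. A plausible sketch, assuming a LangChain vector store that exposes `similarity_search_with_score` (FAISS and Chroma both do); the de-duplication key is an illustrative guess, not app.py's logic:

```python
all_docs = []
seen_snippets = set()
for variant in query_variants:
    # similarity_search_with_score returns (Document, score) pairs
    for doc, score in vectorstore.similarity_search_with_score(variant, k=top_k):
        key = doc.page_content[:80]
        if key in seen_snippets:
            continue  # skip documents already retrieved by another variant
        seen_snippets.add(key)
        all_docs.append((doc, score))
```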
@@ -645,26 +668,33 @@ def generate_llm_answer(
     scored_docs.sort(key=lambda x: x[1], reverse=True)
     top_docs = [doc[0] for doc in scored_docs[:8]]

+    # Ultra-fast context preparation: only use top 4 docs, very short snippets
     context_parts = []
-    for doc in top_docs:
+    for doc in top_docs[:4]:  # Reduced from 8 to 4
         content = doc.page_content.strip()
-        if len(content) > 300
-            content = content[:
+        if len(content) > 200:  # Reduced from 300 to 200
+            content = content[:200] + "..."
         context_parts.append(content)

-    context_text = "\n
+    context_text = "\n".join(context_parts)  # Single newline instead of double

-    #
-
-
-
-
-
+    # Ultra-fast mode: minimal words, no iterations
+    global ENABLE_FAST_MODE
+    if ENABLE_FAST_MODE:
+        target_min_words = 180  # Much shorter
+        target_max_words = 280
+        chunk_target_words = 0  # No continuations
+        max_iterations = 0      # No iterations
+    else:
+        target_min_words = 250
+        target_max_words = 350
+        chunk_target_words = 120
+        max_iterations = 2

     def call_model(prompt, max_new_tokens, temperature, top_p, repetition_penalty):
         logger.info(f" β PHI model call (temp={temperature}, max_new_tokens={max_new_tokens})")
         try:
-            # Call local PHI model
+            # Call local PHI model with speed optimizations
             out = llm_client(
                 prompt,
                 max_new_tokens=max_new_tokens,
@@ -674,7 +704,10 @@ def generate_llm_answer(
                 repetition_penalty=repetition_penalty,
                 num_return_sequences=1,
                 pad_token_id=llm_client.tokenizer.eos_token_id,
-                eos_token_id=llm_client.tokenizer.eos_token_id
+                eos_token_id=llm_client.tokenizer.eos_token_id,
+                num_beams=1,          # Greedy/sampling is faster than beam search
+                early_stopping=True,  # Stop as soon as EOS is generated
+                use_cache=True        # Use KV cache for speed
             )

             # Extract generated text from pipeline output
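The "Extract generated text from pipeline output" step is referenced but not shown in this diff. With `return_full_text` left at its default, the text-generation pipeline echoes the prompt, so a typical extraction (hypothetical sketch, not app.py's exact code) strips it before using the answer:

```python
generated = out[0]["generated_text"]      # pipeline output is a list of dicts
if generated.startswith(prompt):
    generated = generated[len(prompt):]   # drop the echoed prompt
answer_text = generated.strip()
```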
@@ -694,30 +727,24 @@
             logger.error(f" β PHI model call error: {e}")
             return ''

-    #
-    base_prompt = f"""
+    # Ultra-compact prompt for maximum speed
+    base_prompt = f"""Q: {query}

-
+Context: {context_text[:400]}

-
-    {context_text[:600]}
-
-
-Answer:
-"""
-
-    # Optimized parameters for faster CPU generation
+A:"""

+    # Aggressive speed optimization: fewer tokens, higher temperature for faster sampling
     if attempt == 1:
-        temperature = 0.
-        max_new_tokens =
-        top_p = 0.
-        repetition_penalty = 1.
+        temperature = 0.85    # Higher = faster sampling
+        max_new_tokens = 280  # Reduced significantly
+        top_p = 0.88
+        repetition_penalty = 1.08
     else:
-        temperature = 0.
-        max_new_tokens =
-        top_p = 0.
-        repetition_penalty = 1.
+        temperature = 0.90
+        max_new_tokens = 320
+        top_p = 0.90
+        repetition_penalty = 1.10

     initial_output = call_model(base_prompt, max_new_tokens, temperature, top_p, repetition_penalty)
     response = (initial_output or '').strip()
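`MAX_CONTEXT_LENGTH = 500` is added near the top of the file but never referenced in the hunks shown here. If it is meant as a token budget for the compact prompt, a guard like the following (hypothetical, using the pipeline's own tokenizer) would enforce it:

```python
prompt_tokens = len(llm_client.tokenizer(base_prompt)["input_ids"])
if prompt_tokens > MAX_CONTEXT_LENGTH:
    logger.warning(
        f"Prompt is {prompt_tokens} tokens, above MAX_CONTEXT_LENGTH={MAX_CONTEXT_LENGTH}; "
        "consider trimming context_text further"
    )
```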
@@ -730,6 +757,14 @@ Answer:
     words = response.split()
     word_count = len(words)

+    # Fast mode: accept shorter answers immediately
+    if ENABLE_FAST_MODE and word_count >= 150:
+        if word_count > target_max_words:
+            response = ' '.join(words[:target_max_words]) + '...'
+            word_count = target_max_words
+        logger.info(f" β Fast-mode generated {word_count} words")
+        return response
+
     # If single-shot succeeded, validate length and return
     if word_count >= target_min_words:
         if word_count > target_max_words:
@@ -738,6 +773,15 @@ Answer:
         logger.info(f" β Single-shot generated {word_count} words")
         return response

+    # Skip iterations in fast mode
+    if ENABLE_FAST_MODE or max_iterations == 0:
+        if word_count >= 120:  # Accept even shorter in fast mode
+            logger.info(f" β Fast-mode accepted {word_count} words")
+            return response
+        # If too short, return None to trigger fallback
+        logger.warning(f" β Output too short ({word_count} words), trying fallback")
+        return None
+
     # Otherwise, try iterative continuation to build up to the target
     accumulated = response
     prev_word_count = word_count
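The iterative continuation that `chunk_target_words` and `max_iterations` feed sits below this hunk and is not included in the diff. A rough sketch of the pattern the comments describe; all names, the token budget, and the continuation prompt wording are illustrative, not taken from app.py:

```python
iteration = 0
while len(accumulated.split()) < target_min_words and iteration < max_iterations:
    iteration += 1
    continue_prompt = f"{base_prompt}{accumulated}\n\nContinue the answer with more specific advice:"
    # Roughly 1.5 tokens per English word is a common rule of thumb for the budget
    more = call_model(continue_prompt, int(chunk_target_words * 1.5), temperature, top_p, repetition_penalty)
    if not more or not more.strip():
        break  # the model produced nothing new; stop early
    accumulated = f"{accumulated} {more.strip()}"
```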
@@ -823,30 +867,37 @@ def generate_answer_langchain(
     if not retrieved_docs:
         return "I couldn't find relevant information to answer your question."

+    # Fast mode: single attempt only
+    global ENABLE_FAST_MODE
+    max_attempts = 1 if ENABLE_FAST_MODE else 2
+
     llm_answer = None
-    for attempt in range(1,
-        logger.info(f"\n π€ LLM Generation Attempt {attempt}/
+    for attempt in range(1, max_attempts + 1):
+        logger.info(f"\n π€ LLM Generation Attempt {attempt}/{max_attempts}")
         llm_answer = generate_llm_answer(query, retrieved_docs, llm_client, attempt)

         if llm_answer:
             logger.info(f" β LLM answer generated successfully")
             break
         else:
-
+            if attempt < max_attempts:
+                logger.warning(f" β Attempt {attempt}/{max_attempts} failed, retrying...")

     if not llm_answer:
-        logger.error(f" β All
-
-
-
-
-
-
-
-
-
-
-
+        logger.error(f" β All {max_attempts} LLM attempts failed")
+
+        # In fast mode, skip scaffold-and-polish and go straight to extractive
+        if not ENABLE_FAST_MODE:
+            try:
+                logger.info(" β Attempting scaffold-and-polish using PHI model")
+                polished = scaffold_and_polish(query, retrieved_docs, llm_client)
+                if polished:
+                    logger.info(" β Scaffold-and-polish produced an answer")
+                    return polished
+            except Exception as e:
+                logger.error(f" β Scaffold-and-polish error: {e}")
+
+        # Final fallback: extractive templated answer (guaranteed deterministic & FAST)
         try:
             logger.info(" β Using extractive fallback generator")
             fallback = generate_extractive_answer(query, retrieved_docs)
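`scaffold_and_polish` and `generate_extractive_answer` are defined elsewhere in app.py and are not part of this diff. For context, a minimal illustration of what an extractive fallback of this kind typically does; this is a hypothetical sketch, not the real implementation:

```python
def extractive_answer_sketch(query: str, docs, max_sentences: int = 6) -> str:
    """Stitch the leading sentences of the top documents into a deterministic answer."""
    sentences = []
    for doc in docs[:4]:
        for sentence in doc.page_content.split(". "):
            sentence = sentence.strip()
            if sentence:
                sentences.append(sentence)
            if len(sentences) >= max_sentences:
                break
        if len(sentences) >= max_sentences:
            break
    return ". ".join(sentences).rstrip(".") + "."
```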