hamxaameer committed
Commit 7ba258a · verified · 1 Parent(s): 45ef96f

Update app.py

Files changed (1)
  1. app.py +117 -7
app.py CHANGED
@@ -110,8 +110,10 @@ def remote_generate(prompt: str, max_new_tokens: int = 512, temperature: float =
     if not HF_INFERENCE_API_KEY:
         raise Exception("HF_INFERENCE_API_KEY not set for remote generation")
 
-    url = f"https://api-inference.huggingface.co/models/{REMOTE_LLM_MODEL}"
-    headers = {"Authorization": f"Bearer {HF_INFERENCE_API_KEY}"}
+    # New router endpoint is required by HF (replaces api-inference.huggingface.co)
+    router_url = f"https://router.huggingface.co/models/{REMOTE_LLM_MODEL}"
+    old_url = f"https://api-inference.huggingface.co/models/{REMOTE_LLM_MODEL}"
+    headers = {"Authorization": f"Bearer {HF_INFERENCE_API_KEY}", "Accept": "application/json"}
     payload = {
         "inputs": prompt,
         "parameters": {
@@ -122,8 +124,19 @@ def remote_generate(prompt: str, max_new_tokens: int = 512, temperature: float =
         }
     }
 
-    logger.info(f" → Remote inference request to {REMOTE_LLM_MODEL} (tokens={max_new_tokens}, temp={temperature})")
-    r = requests.post(url, headers=headers, json=payload, timeout=60)
+    logger.info(f" → Remote inference request to router {REMOTE_LLM_MODEL} (tokens={max_new_tokens}, temp={temperature})")
+    try:
+        r = requests.post(router_url, headers=headers, json=payload, timeout=120)
+    except Exception as e:
+        logger.error(f" ✗ Remote router request failed: {e}")
+        # Try older endpoint as a fallback
+        try:
+            logger.info(" → Attempting legacy api-inference endpoint as fallback")
+            r = requests.post(old_url, headers=headers, json=payload, timeout=120)
+        except Exception as e2:
+            logger.error(f" ✗ Legacy endpoint request failed: {e2}")
+            return ""
+
     if r.status_code != 200:
         logger.error(f" ✗ Remote inference error {r.status_code}: {r.text[:200]}")
         return ""
@@ -368,6 +381,92 @@ def generate_extractive_answer(query: str, retrieved_docs: List[Document]) -> Op
     return answer
 
 
+def scaffold_and_polish(query: str, retrieved_docs: List[Document], llm_client) -> Optional[str]:
+    """Create a concise scaffold (approx 150-220 words) from retrieved docs,
+    then ask the remote (or local) LLM to expand and polish it into a
+    320-420 word expert answer. Returns None if polishing fails.
+    """
+    logger.info(f"🔨 Building scaffold for polish: '{query}'")
+    import re
+
+    # Reuse sentence extraction logic but stop early for a compact scaffold
+    all_text = "\n\n".join([d.page_content for d in retrieved_docs[:12]])
+    sentences = re.split(r'(?<=[.!?])\s+', all_text)
+    sentences = [s.strip() for s in sentences if len(s.strip()) > 30]
+    if not sentences:
+        logger.warning(" ✗ No sentences to build scaffold")
+        return None
+
+    # Score sentences by overlap with query + fashion keywords
+    query_tokens = set(re.findall(r"\w+", query.lower()))
+    fashion_keywords = set(["outfit", "wear", "wardrobe", "style", "colors", "layer", "blazer",
+                            "trousers", "dress", "shoes", "sweater", "jacket", "care", "wool", "fit",
+                            "tailor", "neutral", "accessory", "season", "fall"])
+    keywords = query_tokens.union(fashion_keywords)
+
+    scored = []
+    for s in sentences:
+        s_tokens = set(re.findall(r"\w+", s.lower()))
+        score = len(s_tokens & keywords)
+        score += min(2, len(s.split()) // 30)
+        scored.append((score, s))
+
+    scored.sort(key=lambda x: x[0], reverse=True)
+    scaffold_parts = []
+    word_count = 0
+    for _, s in scored:
+        scaffold_parts.append(s)
+        word_count = len(" ".join(scaffold_parts).split())
+        if word_count >= 180:
+            break
+
+    scaffold = "\n\n".join(scaffold_parts).strip()
+    if not scaffold:
+        logger.warning(" ✗ Scaffold empty after selection")
+        return None
+
+    # Craft polish prompt
+    polish_prompt = f"""Please rewrite and expand the draft below into a clear, expert, natural-flowing answer of about 320-420 words to the question: {query}
+
+Requirements:
+- Keep paragraphs natural and connected.
+- Preserve factual content from the draft and avoid inventing unsupported facts.
+- Use a friendly, expert tone and provide practical, actionable advice.
+
+Draft:
+{scaffold}
+
+Answer:
+"""
+
+    logger.info(" → Polishing scaffold with LLM")
+    try:
+        if USE_REMOTE_LLM:
+            polished = remote_generate(polish_prompt, max_new_tokens=600, temperature=0.72, top_p=0.92)
+        else:
+            out = llm_client(polish_prompt, max_new_tokens=600, temperature=0.72, top_p=0.92, do_sample=True, num_beams=1)
+            polished = out[0].get('generated_text', '') if isinstance(out, list) and out else str(out)
+        polished = polished.strip()
+    except Exception as e:
+        logger.error(f" ✗ Polishing error: {e}")
+        return None
+
+    if not polished:
+        logger.warning(" ✗ Polished output empty")
+        return None
+
+    final_words = polished.split()
+    fw = len(final_words)
+    if fw < 300:
+        logger.warning(f" ✗ Polished output too short ({fw} words)")
+        return None
+    if fw > 460:
+        polished = ' '.join(final_words[:420]) + '...'
+
+    logger.info(f" ✅ Polished answer ready ({len(polished.split())} words)")
+    return polished
+
+
 def retrieve_knowledge_langchain(
     query: str,
     vectorstore,
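
The sentence-scoring and scaffold-selection step in the function above can be checked without any model. Below is a minimal standalone sketch of that step on toy text; the helper name build_scaffold is illustrative (not a function in app.py) and the keyword set is abbreviated, while the length thresholds mirror the diff.

import re


def build_scaffold(query: str, all_text: str, target_words: int = 180) -> str:
    """Pick the highest-scoring sentences until roughly target_words is reached."""
    sentences = [s.strip() for s in re.split(r'(?<=[.!?])\s+', all_text) if len(s.strip()) > 30]
    # Query tokens plus a trimmed version of the fashion keyword list from the diff.
    keywords = set(re.findall(r"\w+", query.lower())) | {"outfit", "wear", "style", "layer", "blazer"}
    scored = []
    for s in sentences:
        tokens = set(re.findall(r"\w+", s.lower()))
        score = len(tokens & keywords) + min(2, len(s.split()) // 30)
        scored.append((score, s))
    scored.sort(key=lambda x: x[0], reverse=True)
    parts = []
    for _, s in scored:
        parts.append(s)
        if len(" ".join(parts).split()) >= target_words:
            break
    return "\n\n".join(parts)


if __name__ == "__main__":
    text = ("A navy blazer over a neutral sweater is an easy way to layer for fall weather. "
            "Wool trousers hold their shape and pair well with leather shoes for most outfits. "
            "Care for wool by airing it out between wears instead of washing it every time.")
    print(build_scaffold("what should I wear in fall", text))
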
@@ -639,9 +738,20 @@ def generate_answer_langchain(
 
     if not llm_answer:
         logger.error(f" ✗ All 2 LLM attempts failed")
-        # Fallback: use an extractive + template-based generator to produce a long,
-        # natural-flowing answer without using the LLM. This helps when the model
-        # repeatedly returns very short outputs or errors.
+        # Next attempt: if remote LLM is available, build a short scaffold from
+        # retrieved documents and ask the remote model to polish/expand it. This
+        # is more reliable than single-shot long generation on some models.
+        if USE_REMOTE_LLM:
+            try:
+                logger.info(" → Attempting scaffold-and-polish using remote LLM")
+                polished = scaffold_and_polish(query, retrieved_docs, llm_client)
+                if polished:
+                    logger.info(" ✅ Scaffold-and-polish produced an answer")
+                    return polished
+            except Exception as e:
+                logger.error(f" ✗ Scaffold-and-polish error: {e}")
+
+        # Final fallback: extractive templated answer (guaranteed deterministic)
         try:
             logger.info(" → Using extractive fallback generator")
             fallback = generate_extractive_answer(query, retrieved_docs)
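
The hunk above gives generate_answer_langchain a three-stage fallback order: the existing two-attempt LLM call, then scaffold_and_polish when a remote LLM is configured, then the deterministic extractive generator. A minimal sketch of that ordering with the stages passed in as callables; the parameter names below are illustrative stand-ins, not functions from app.py.

from typing import Callable, List, Optional


def answer_with_fallbacks(query: str,
                          retrieved_docs: List,
                          try_llm: Callable[..., Optional[str]],
                          polish: Callable[..., Optional[str]],
                          extractive: Callable[..., Optional[str]],
                          use_remote_llm: bool) -> Optional[str]:
    """Mirror the fallback order: direct LLM, scaffold-and-polish, extractive."""
    answer = try_llm(query, retrieved_docs)
    if answer:
        return answer
    if use_remote_llm:
        try:
            answer = polish(query, retrieved_docs)
            if answer:
                return answer
        except Exception:
            pass  # fall through to the deterministic generator, as in the diff
    return extractive(query, retrieved_docs)


if __name__ == "__main__":
    print(answer_with_fallbacks(
        "what should I wear in fall",
        [],
        try_llm=lambda q, docs: None,        # simulate the LLM attempts failing
        polish=lambda q, docs: None,         # simulate scaffold-and-polish failing
        extractive=lambda q, docs: "extractive fallback answer",
        use_remote_llm=True,
    ))

In app.py the second and third stages correspond to scaffold_and_polish and generate_extractive_answer.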
 