Update evidence_retrieval.py
modules/evidence_retrieval.py  CHANGED  (+38 -13)
@@ -8,11 +8,9 @@ combining evidence to support fact-checking operations.
 
 import logging
 import time
-import re
-import random
 import requests
-import json
 import ssl
+import urllib.request
 from urllib.parse import urlencode
 from bs4 import BeautifulSoup
 from SPARQLWrapper import SPARQLWrapper, JSON
@@ -24,6 +22,7 @@ from utils.models import get_nlp_model
 from modules.claim_extraction import shorten_claim_for_evidence
 from modules.rss_feed import retrieve_evidence_from_rss
 from config import NEWS_API_KEY, FACTCHECK_API_KEY
+from modules.category_detection import get_category_specific_rss_feeds, get_fallback_category, detect_claim_category
 # Import the performance tracker
 from utils.performance import PerformanceTracker
 performance_tracker = PerformanceTracker()
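Note: the category_detection import moves to module scope here; the final hunk below removes the matching function-local import from retrieve_combined_evidence.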
@@ -342,11 +341,7 @@ def retrieve_evidence_from_wikidata(claim):
     sparql.addCustomHttpHeader("User-Agent", "MisinformationDetectionResearchBot/1.0")
 
     # Fix SSL issues by disabling SSL verification for this specific request
-    try:
-        # Create a context where we don't verify SSL certs
-        import ssl
-        import urllib.request
-
+    try:
         # Create a context that doesn't verify certificates
         ssl_context = ssl._create_unverified_context()
 
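Note: ssl._create_unverified_context() is a private helper that skips certificate checks, so it should stay scoped to this one request. A minimal sketch of how such a context is typically threaded into a urllib-based SPARQL call; the query and wiring here are illustrative assumptions, not this module's exact code:

import ssl
import urllib.request
from urllib.parse import urlencode

# Illustrative only: the module's real request path may differ.
ssl_context = ssl._create_unverified_context()  # disables cert verification
url = "https://query.wikidata.org/sparql?" + urlencode({"query": "ASK {}", "format": "json"})
req = urllib.request.Request(url, headers={"User-Agent": "MisinformationDetectionResearchBot/1.0"})
with urllib.request.urlopen(req, context=ssl_context) as resp:
    print(resp.status)  # 200 if the endpoint answered despite the unverified TLS context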
@@ -401,10 +396,26 @@ def retrieve_evidence_from_wikidata(claim):
             wikidata_evidence.append(evidence_text)
 
         logger.info(f"Retrieved {len(wikidata_evidence)} Wikidata entities")
+
+        # Log evidence retrieval performance
+        success = bool(wikidata_evidence)
+        source_count = {"wikidata": len(wikidata_evidence)}
+        try:
+            performance_tracker.log_evidence_retrieval(success, source_count)
+        except Exception as e:
+            logger.error(f"Error logging Wikidata evidence retrieval: {e}")
+
         return wikidata_evidence
 
     except Exception as e:
         logger.error(f"Error retrieving from Wikidata: {str(e)}")
+
+        # Log failed evidence retrieval
+        try:
+            performance_tracker.log_evidence_retrieval(False, {"wikidata": 0})
+        except Exception as log_error:
+            logger.error(f"Error logging failed Wikidata evidence retrieval: {log_error}")
+
         return []
 
 @api_error_handler("openalex")
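Note: each new block funnels a boolean success flag and a per-source count dict into performance_tracker.log_evidence_retrieval. utils/performance.py is not shown in this commit; a hypothetical tracker compatible with these call sites could look like:

# Hypothetical sketch; the real PerformanceTracker in utils/performance.py may differ.
class PerformanceTracker:
    def __init__(self):
        self.attempts = 0
        self.successes = 0
        self.source_counts = {}  # cumulative hits per source, e.g. {"wikidata": 12}

    def log_evidence_retrieval(self, success, source_count):
        # success: bool; source_count: dict such as {"wikidata": 5}
        self.attempts += 1
        if success:
            self.successes += 1
        for source, count in source_count.items():
            self.source_counts[source] = self.source_counts.get(source, 0) + count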
@@ -478,10 +489,26 @@ def retrieve_evidence_from_openalex(claim):
             logger.error(f"Unexpected error in OpenAlex request: {str(e)}")
 
         logger.info(f"Retrieved {len(scholarly_evidence)} scholarly papers from OpenAlex")
+
+        # Log evidence retrieval performance
+        success = bool(scholarly_evidence)
+        source_count = {"openalex": len(scholarly_evidence)}
+        try:
+            performance_tracker.log_evidence_retrieval(success, source_count)
+        except Exception as e:
+            logger.error(f"Error logging OpenAlex evidence retrieval: {e}")
+
         return scholarly_evidence
 
     except Exception as e:
         logger.error(f"Fatal error in OpenAlex retrieval: {str(e)}")
+
+        # Log failed evidence retrieval
+        try:
+            performance_tracker.log_evidence_retrieval(False, {"openalex": 0})
+        except Exception as log_error:
+            logger.error(f"Error logging failed OpenAlex evidence retrieval: {log_error}")
+
         return []
 
 @api_error_handler("factcheck")
@@ -702,8 +729,9 @@ def retrieve_news_articles(claim, requires_recent=False):
         news_texts = [item["text"] for item in news_results]
 
         # Log evidence retrieval
+        success = bool(news_texts)
+        source_count = {"news": len(news_texts)}
         try:
-            success = bool(news_texts)
             performance_tracker.log_evidence_retrieval(success, source_count)
         except Exception as log_error:
             logger.error(f"Error logging evidence retrieval: {log_error}")
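Note: success and the new source_count are now computed before the try, so the except guards only the tracker call itself rather than swallowing errors from building its inputs.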
@@ -736,12 +764,9 @@ def retrieve_combined_evidence(claim):
     logger.info(f"Starting evidence retrieval for: {claim}")
     start_time = time.time()
 
-    # Use the category detector to identify the claim category
-    from modules.category_detection import get_category_specific_rss_feeds, get_fallback_category, detect_claim_category
-
     # Extract key claim components for relevance matching
     claim_components = extract_claim_components(claim)
-    logger.info(f"Extracted claim components: entities={claim_components
+    logger.info(f"Extracted claim components: entities={claim_components.get('entities', [])}, verbs={claim_components.get('verbs', [])}")
 
     # Determine if claim has temporal attributes
     requires_recent_evidence = bool(claim_components.get("temporal_words", []))
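Note: the repaired log line and the temporal check imply that extract_claim_components returns a dict with at least "entities", "verbs", and "temporal_words" keys. A hypothetical spaCy-style sketch consistent with those call sites; the module's real implementation may differ:

# Hypothetical sketch; the real extract_claim_components may differ.
from utils.models import get_nlp_model

def extract_claim_components(claim):
    doc = get_nlp_model()(claim)
    temporal = {"today", "yesterday", "recently", "latest", "this year"}
    return {
        "entities": [ent.text for ent in doc.ents],
        "verbs": [tok.lemma_ for tok in doc if tok.pos_ == "VERB"],
        "temporal_words": [w for w in temporal if w in claim.lower()],
    }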