C2MV committed on
Commit 106a4e3 · verified · 1 Parent(s): d416d78

Update app.py

Files changed (1)
  1. app.py +1407 -682
app.py CHANGED
@@ -1,730 +1,1455 @@
1
  import os
2
  import re
3
- import time
4
  import logging
5
  import zipfile
6
- import requests
7
- import bibtexparser
8
- from tqdm import tqdm
9
- from urllib.parse import quote, urlencode
10
- import gradio as gr
11
- from bs4 import BeautifulSoup
12
- import io
13
  import asyncio
14
- import aiohttp
15
-
16
- # Configure logging
17
- logging.basicConfig(level=logging.INFO,
18
- format='%(asctime)s - %(levelname)s: %(message)s')
 
 
19
  logger = logging.getLogger(__name__)
20
 
21
-
22
- class PaperDownloader:
23
- def __init__(self, output_dir='papers'):
24
- self.output_dir = output_dir
25
- os.makedirs(output_dir, exist_ok=True)
26
-
27
- # Updated download sources
28
- self.download_sources = [
29
- 'https://sci-hub.ee/',
30
- 'https://sci-hub.st/',
31
- 'https://sci-hub.ru/',
32
- 'https://sci-hub.ren/',
33
- 'https://sci-hub.mksa.top/',
34
- 'https://sci-hub.se/',
35
- 'https://libgen.rs/scimag/'
 
 
36
  ]
37
-
38
- # Request headers
 
 
39
  self.headers = {
40
- 'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/114.0.0.0 Safari/537.36',
41
- 'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,*/*;q=0.8',
42
- 'Accept-Language': 'en-US,en;q=0.9',
43
  }
44
-
45
- def clean_doi(self, doi):
46
- """Clean and encode DOI for URL"""
47
- if not isinstance(doi, str):
48
- return None
49
- return quote(doi.strip()) if doi else None
50
-
51
- async def fetch_with_headers(self, session, url, timeout=10):
52
- """Utility method to fetch a URL with headers and timeout"""
53
- try:
54
- async with session.get(url, headers=self.headers, timeout=timeout, allow_redirects=True) as response:
55
- response.raise_for_status()
56
- return await response.text(), response.headers
57
- except Exception as e:
58
- logger.debug(f"Error fetching {url}: {e}")
59
- return None, None
60
-
61
-
62
- async def download_paper_direct_doi_async(self, session, doi):
63
- """Attempt to download the pdf from the landing page of the doi"""
64
- if not doi:
65
- return None
66
-
67
- try:
68
- doi_url = f"https://doi.org/{self.clean_doi(doi)}"
69
- text, headers = await self.fetch_with_headers(session, doi_url, timeout=15)
70
- if not text:
71
- return None
72
-
73
- pdf_patterns = [
74
- r'(https?://[^\s<>"]+?\.pdf)',
75
- r'(https?://[^\s<>"]+?download/[^\s<>"]+)',
76
- r'(https?://[^\s<>"]+?\/pdf\/[^\s<>"]+)',
77
- ]
78
-
79
- pdf_urls = []
80
- for pattern in pdf_patterns:
81
- pdf_urls.extend(re.findall(pattern, text))
82
-
83
- for pdf_url in pdf_urls:
84
- try:
85
- pdf_response = await session.get(pdf_url, headers=self.headers, timeout=10)
86
- if 'application/pdf' in pdf_response.headers.get('Content-Type', ''):
87
- logger.debug(f"Found PDF from: {pdf_url}")
88
- return await pdf_response.read()
89
- except Exception as e:
90
- logger.debug(f"Error downloading PDF from {pdf_url}: {e}")
91
-
92
-
93
- except Exception as e:
94
- logger.debug(f"Error trying to get the PDF from {doi}: {e}")
95
-
96
- return None
97
-
98
- async def download_paper_scihub_async(self, session, doi):
99
- """Improved method to download paper from Sci-Hub using async requests"""
100
- if not doi:
101
- logger.warning("DOI not provided")
102
- return None
103
-
104
- for base_url in self.download_sources:
105
- try:
106
- scihub_url = f"{base_url}{self.clean_doi(doi)}"
107
- text, headers = await self.fetch_with_headers(session, scihub_url, timeout=15)
108
- if not text:
109
- continue
110
-
111
- # Search for multiple PDF URL patterns
112
- pdf_patterns = [
113
- r'(https?://[^\s<>"]+?\.pdf)',
114
- r'(https?://[^\s<>"]+?download/[^\s<>"]+)',
115
- r'(https?://[^\s<>"]+?\/pdf\/[^\s<>"]+)',
116
- ]
117
-
118
- pdf_urls = []
119
- for pattern in pdf_patterns:
120
- pdf_urls.extend(re.findall(pattern, text))
121
-
122
- # Try downloading from found URLs
123
- for pdf_url in pdf_urls:
124
- try:
125
- pdf_response = await session.get(pdf_url, headers=self.headers, timeout=10)
126
- # Verify if it's a PDF
127
- if 'application/pdf' in pdf_response.headers.get('Content-Type', ''):
128
- logger.debug(f"Found PDF from: {pdf_url}")
129
- return await pdf_response.read()
130
- except Exception as e:
131
- logger.debug(f"Error downloading PDF from {pdf_url}: {e}")
132
-
133
- except Exception as e:
134
- logger.debug(f"Error trying to download {doi} from {base_url}: {e}")
135
-
136
- return None
137
-
138
- async def download_paper_libgen_async(self, session, doi):
139
- """Download from Libgen, handles the query and the redirection"""
140
- if not doi:
141
- return None
142
-
143
- base_url = 'https://libgen.rs/scimag/'
144
- try:
145
- search_url = f"{base_url}?q={self.clean_doi(doi)}"
146
- text, headers = await self.fetch_with_headers(session, search_url, timeout=10)
147
-
148
- if not text or "No results" in text:
149
- logger.debug(f"No results for DOI: {doi} on libgen")
150
- return None
151
-
152
- soup = BeautifulSoup(text, 'html.parser')
153
-
154
- links = soup.select('table.c > tbody > tr:nth-child(2) > td:nth-child(1) > a')
155
-
156
- if links:
157
- link = links[0]
158
- pdf_url = link['href']
159
- pdf_response = await session.get(pdf_url, headers=self.headers, allow_redirects=True, timeout=10)
160
- if 'application/pdf' in pdf_response.headers.get('Content-Type', ''):
161
- logger.debug(f"Found PDF from: {pdf_url}")
162
- return await pdf_response.read()
163
- except Exception as e:
164
- logger.debug(f"Error trying to download {doi} from libgen: {e}")
165
- return None
166
-
167
- async def download_paper_google_scholar_async(self, session, doi):
168
- """Search google scholar to find an article with the given doi, try to get the pdf"""
169
- if not doi:
170
- return None
171
-
172
  try:
173
- query = f'doi:"{doi}"'
174
- params = {'q': query}
175
- url = f'https://scholar.google.com/scholar?{urlencode(params)}'
176
-
177
- text, headers = await self.fetch_with_headers(session, url, timeout=10)
178
- if not text:
179
- return None
180
-
181
- soup = BeautifulSoup(text, 'html.parser')
182
-
183
- # Find any links with [PDF]
184
- links = soup.find_all('a', string=re.compile(r'\[PDF\]', re.IGNORECASE))
185
-
186
- if links:
187
- pdf_url = links[0]['href']
188
- pdf_response = await session.get(pdf_url, headers=self.headers, timeout=10)
189
- if 'application/pdf' in pdf_response.headers.get('Content-Type', ''):
190
- logger.debug(f"Found PDF from: {pdf_url}")
191
- return await pdf_response.read()
192
  except Exception as e:
193
- logger.debug(f"Google Scholar error for {doi}: {e}")
194
-
195
- return None
196
-
197
- async def download_paper_crossref_async(self, session, doi):
198
- """Alternative search method using Crossref"""
199
- if not doi:
200
- return None
201
-
 
 
202
  try:
203
- # Search for open access link
204
- url = f"https://api.crossref.org/works/{doi}"
205
- response = await session.get(url, headers=self.headers, timeout=10)
206
-
207
- if response.status == 200:
208
- data = await response.json()
209
  work = data.get('message', {})
210
-
211
- # Search for open access links
 
 
212
  links = work.get('link', [])
213
  for link in links:
214
  if link.get('content-type') == 'application/pdf':
215
- pdf_url = link.get('URL')
216
- if pdf_url:
217
- pdf_response = await session.get(pdf_url, headers=self.headers)
218
- if 'application/pdf' in pdf_response.headers.get('Content-Type', ''):
219
- logger.debug(f"Found PDF from: {pdf_url}")
220
- return await pdf_response.read()
221
-
 
 
222
  except Exception as e:
223
- logger.debug(f"Crossref error for {doi}: {e}")
224
-
225
- return None
226
-
227
- async def download_with_retry_async(self, doi, max_retries=3, initial_delay=2):
228
- """Downloads a paper using multiple strategies with exponential backoff and async requests"""
229
- pdf_content = None
230
- retries = 0
231
- delay = initial_delay
232
-
233
- async with aiohttp.ClientSession() as session:
234
- while retries < max_retries and not pdf_content:
235
- try:
236
- pdf_content = (
237
- await self.download_paper_direct_doi_async(session, doi) or
238
- await self.download_paper_scihub_async(session, doi) or
239
- await self.download_paper_libgen_async(session, doi) or
240
- await self.download_paper_google_scholar_async(session, doi) or
241
- await self.download_paper_crossref_async(session, doi)
242
-
243
- )
244
- if pdf_content:
245
- return pdf_content
246
- except Exception as e:
247
- logger.error(f"Error in download attempt {retries + 1} for DOI {doi}: {e}")
248
-
249
- if not pdf_content:
250
- retries += 1
251
- logger.warning(f"Retry attempt {retries} for DOI: {doi} after {delay} seconds")
252
- await asyncio.sleep(delay)
253
- delay *= 2 # Exponential backoff
254
-
255
- return None
256
-
257
- def download_paper_scihub(self, doi):
258
- """Improved method to download paper from Sci-Hub"""
259
- if not doi:
260
- logger.warning("DOI not provided")
261
- return None
262
-
263
- for base_url in self.download_sources:
264
- try:
265
- scihub_url = f"{base_url}{self.clean_doi(doi)}"
266
-
267
- # Request with more tolerance
268
- response = requests.get(scihub_url,
269
- headers=self.headers,
270
- allow_redirects=True,
271
- timeout=15)
272
-
273
- # Search for multiple PDF URL patterns
274
- pdf_patterns = [
275
- r'(https?://[^\s<>"]+?\.pdf)',
276
- r'(https?://[^\s<>"]+?download/[^\s<>"]+)',
277
- r'(https?://[^\s<>"]+?\/pdf\/[^\s<>"]+)',
278
- ]
279
-
280
- pdf_urls = []
281
- for pattern in pdf_patterns:
282
- pdf_urls.extend(re.findall(pattern, response.text))
283
-
284
- # Try downloading from found URLs
285
- for pdf_url in pdf_urls:
286
- try:
287
- pdf_response = requests.get(pdf_url,
288
- headers=self.headers,
289
- timeout=10)
290
-
291
- # Verify if it's a PDF
292
- if 'application/pdf' in pdf_response.headers.get('Content-Type', ''):
293
- logger.debug(f"Found PDF from: {pdf_url}")
294
- return pdf_response.content
295
- except Exception as e:
296
- logger.debug(f"Error downloading PDF from {pdf_url}: {e}")
297
-
298
- except Exception as e:
299
- logger.debug(f"Error trying to download {doi} from {base_url}: {e}")
300
-
301
- return None
302
-
303
- def download_paper_libgen(self, doi):
304
- """Download from Libgen, handles the query and the redirection"""
305
- if not doi:
306
- return None
307
-
308
- base_url = 'https://libgen.rs/scimag/'
309
  try:
310
- search_url = f"{base_url}?q={self.clean_doi(doi)}"
311
- response = requests.get(search_url, headers=self.headers, allow_redirects=True, timeout=10)
312
- response.raise_for_status()
313
-
314
- if "No results" in response.text:
315
- logger.debug(f"No results for DOI: {doi} on libgen")
316
- return None
317
-
318
- soup = BeautifulSoup(response.text, 'html.parser')
319
-
320
- # Find the link using a specific selector
321
- links = soup.select('table.c > tbody > tr:nth-child(2) > td:nth-child(1) > a')
322
-
323
- if links:
324
- link = links[0]
325
- pdf_url = link['href']
326
- pdf_response = requests.get(pdf_url, headers=self.headers, allow_redirects=True, timeout=10)
327
- if 'application/pdf' in pdf_response.headers.get('Content-Type', ''):
328
- logger.debug(f"Found PDF from: {pdf_url}")
329
- return pdf_response.content
330
-
331
  except Exception as e:
332
- logger.debug(f"Error trying to download {doi} from libgen: {e}")
333
- return None
334
-
335
- def download_paper_google_scholar(self, doi):
336
- """Search google scholar to find an article with the given doi, try to get the pdf"""
337
- if not doi:
338
- return None
339
-
 
 
340
  try:
341
- query = f'doi:"{doi}"'
342
- params = {'q': query}
343
- url = f'https://scholar.google.com/scholar?{urlencode(params)}'
344
-
345
- response = requests.get(url, headers=self.headers, timeout=10)
346
- response.raise_for_status()
347
-
348
- soup = BeautifulSoup(response.text, 'html.parser')
349
-
350
- # Find any links with [PDF]
351
- links = soup.find_all('a', string=re.compile(r'\[PDF\]', re.IGNORECASE))
352
-
353
- if links:
354
- pdf_url = links[0]['href']
355
- pdf_response = requests.get(pdf_url, headers=self.headers, timeout=10)
356
- if 'application/pdf' in pdf_response.headers.get('Content-Type', ''):
357
- logger.debug(f"Found PDF from: {pdf_url}")
358
- return pdf_response.content
 
 
359
  except Exception as e:
360
- logger.debug(f"Google Scholar error for {doi}: {e}")
361
-
362
- return None
363
-
364
- def download_paper_crossref(self, doi):
365
- """Alternative search method using Crossref"""
366
- if not doi:
367
- return None
368
-
 
 
369
  try:
370
- # Search for open access link
371
- url = f"https://api.crossref.org/works/{doi}"
372
- response = requests.get(url, headers=self.headers, timeout=10)
373
-
374
  if response.status_code == 200:
375
  data = response.json()
376
- work = data.get('message', {})
377
-
378
- # Search for open access links
379
- links = work.get('link', [])
380
- for link in links:
381
- if link.get('content-type') == 'application/pdf':
382
- pdf_url = link.get('URL')
383
- if pdf_url:
384
- pdf_response = requests.get(pdf_url, headers=self.headers)
385
- if 'application/pdf' in pdf_response.headers.get('Content-Type', ''):
386
- logger.debug(f"Found PDF from: {pdf_url}")
387
- return pdf_response.content
388
-
389
  except Exception as e:
390
- logger.debug(f"Crossref error for {doi}: {e}")
391
-
392
- return None
393
-
394
- def download_with_retry(self, doi, max_retries=3, initial_delay=2):
395
- """Downloads a paper using multiple strategies with exponential backoff"""
396
- pdf_content = None
397
- retries = 0
398
- delay = initial_delay
399
-
400
- while retries < max_retries and not pdf_content:
401
- try:
402
- pdf_content = (
403
- self.download_paper_scihub(doi) or
404
- self.download_paper_libgen(doi) or
405
- self.download_paper_google_scholar(doi) or
406
- self.download_paper_crossref(doi)
407
-
408
- )
409
-
410
- if pdf_content:
411
- return pdf_content
412
- except Exception as e:
413
- logger.error(f"Error in download attempt {retries + 1} for DOI {doi}: {e}")
414
-
415
- if not pdf_content:
416
- retries += 1
417
- logger.warning(f"Retry attempt {retries} for DOI: {doi} after {delay} seconds")
418
- time.sleep(delay)
419
- delay *= 2 # Exponential backoff
420
-
421
- return None
422
-
423
- def download_single_doi(self, doi):
424
- """Downloads a single paper using a DOI"""
425
- if not doi:
426
- return None, "Error: DOI not provided", "Error: DOI not provided"
427
-
428
  try:
429
- pdf_content = self.download_with_retry(doi)
430
-
431
- if pdf_content:
432
- if doi is None:
433
- return None, "Error: DOI not provided", "Error: DOI not provided"
434
- filename = f"{str(doi).replace('/', '_').replace('.', '_')}.pdf"
435
- filepath = os.path.join(self.output_dir, filename)
436
- with open(filepath, 'wb') as f:
437
- f.write(pdf_content)
438
- logger.info(f"Successfully downloaded: {filename}")
439
- return filepath, f'<div style="display: flex; align-items: center;">✓ <a href="https://doi.org/{doi}">{doi}</a> <button onclick="copyLink(this)">Copy</button></div>', ""
440
- else:
441
- logger.warning(f"Could not download: {doi}")
442
- return None, f"Could not download {doi}", f'<div style="display: flex; align-items: center;">❌ <a href="https://doi.org/{doi}">{doi}</a> <button onclick="copyLink(this)">Copy</button></div>'
443
-
444
  except Exception as e:
445
- logger.error(f"Error processing {doi}: {e}")
446
- return None, f"Error processing {doi}: {e}", f"Error processing {doi}: {e}"
447
-
448
- def download_multiple_dois(self, dois_text):
449
- """Downloads multiple papers from a list of DOIs"""
450
- if not dois_text:
451
- return None, "Error: No DOIs provided", "Error: No DOIs provided"
452
-
453
- dois = [doi.strip() for doi in dois_text.split('\n') if doi.strip()]
454
- if not dois:
455
- return None, "Error: No valid DOIs provided", "Error: No valid DOIs provided"
456
-
457
- downloaded_files = []
458
- failed_dois = []
459
- downloaded_links = []
460
- for i, doi in enumerate(tqdm(dois, desc="Downloading papers")):
461
- filepath, success_message, fail_message = self.download_single_doi(doi)
462
- if filepath:
463
- # Unique filename for zip
464
- filename = f"{str(doi).replace('/', '_').replace('.', '_')}_{i}.pdf"
465
- filepath_unique = os.path.join(self.output_dir, filename)
466
- os.rename(filepath, filepath_unique)
467
- downloaded_files.append(filepath_unique)
468
- downloaded_links.append(f'<div style="display: flex; align-items: center;">✓ <a href="https://doi.org/{doi}">{doi}</a> <button onclick="copyLink(this)">Copy</button></div>')
469
-
470
- else:
471
- failed_dois.append(f'<div style="display: flex; align-items: center;">❌ <a href="https://doi.org/{doi}">{doi}</a> <button onclick="copyLink(this)">Copy</button></div>')
472
-
473
- if downloaded_files:
474
- zip_filename = 'papers.zip'
475
- with zipfile.ZipFile(zip_filename, 'w') as zipf:
476
- for file_path in downloaded_files:
477
- zipf.write(file_path, arcname=os.path.basename(file_path))
478
- logger.info(f"ZIP file created: {zip_filename}")
479
-
480
- return zip_filename if downloaded_files else None, "\n".join(downloaded_links), "\n".join(failed_dois)
481
-
482
- def process_bibtex(self, bib_file):
483
- """Process BibTeX file and download papers with multiple strategies"""
484
- # Read BibTeX file content from the uploaded object
485
  try:
486
- with open(bib_file.name, 'r', encoding='utf-8') as f:
487
- bib_content = f.read()
 
 
488
  except Exception as e:
489
- logger.error(f"Error reading uploaded file {bib_file.name}: {e}")
490
- return None, f"Error reading uploaded file {bib_file.name}: {e}", f"Error reading uploaded file {bib_file.name}: {e}", None
491
-
492
- # Parse BibTeX data
 
 
493
  try:
494
- bib_database = bibtexparser.loads(bib_content)
 
 
495
  except Exception as e:
496
- logger.error(f"Error parsing BibTeX data: {e}")
497
- return None, f"Error parsing BibTeX data: {e}", f"Error parsing BibTeX data: {e}", None
498
-
499
- # Extract DOIs
500
- dois = [entry.get('doi') for entry in bib_database.entries if entry.get('doi')]
501
- logger.info(f"Found {len(dois)} DOIs to download")
502
-
503
- # Result lists
504
- downloaded_files = []
505
- failed_dois = []
506
- downloaded_links = []
507
-
508
- # Download PDFs
509
- for doi in tqdm(dois, desc="Downloading papers"):
 
 
510
  try:
511
- # Try to download with multiple methods with retries
512
- pdf_content = self.download_with_retry(doi)
513
-
514
- # Save PDF
515
- if pdf_content:
516
- if doi is None:
517
- return None, "Error: DOI not provided", "Error: DOI not provided", None
518
- filename = f"{str(doi).replace('/', '_').replace('.', '_')}.pdf"
519
- filepath = os.path.join(self.output_dir, filename)
520
-
521
- with open(filepath, 'wb') as f:
522
- f.write(pdf_content)
523
-
524
- downloaded_files.append(filepath)
525
- downloaded_links.append(f'<div style="display: flex; align-items: center;">✓ <a href="https://doi.org/{doi}">{doi}</a> <button onclick="copyLink(this)">Copy</button></div>')
526
- logger.info(f"Successfully downloaded: {filename}")
527
- else:
528
- failed_dois.append(f'<div style="display: flex; align-items: center;">❌ <a href="https://doi.org/{doi}">{doi}</a> <button onclick="copyLink(this)">Copy</button></div>')
529
-
530
- except Exception as e:
531
- failed_dois.append(f'<div style="display: flex; align-items: center;">❌ <a href="https://doi.org/{doi}">{doi}</a> <button onclick="copyLink(this)">Copy</button></div>')
532
- logger.error(f"Error processing {doi}: {e}")
533
-
534
- # Create ZIP of downloaded papers
535
- if downloaded_files:
536
- zip_filename = 'papers.zip'
537
- with zipfile.ZipFile(zip_filename, 'w') as zipf:
538
- for file_path in downloaded_files:
539
- zipf.write(file_path, arcname=os.path.basename(file_path))
540
- logger.info(f"ZIP file created: {zip_filename}")
541
-
542
- return zip_filename, "\n".join(downloaded_links), "\n".join(failed_dois), None
543
-
544
- async def process_bibtex_async(self, bib_file):
545
- """Process BibTeX file and download papers with multiple strategies"""
546
- # Read BibTeX file content from the uploaded object
547
  try:
548
- with open(bib_file.name, 'r', encoding='utf-8') as f:
549
- bib_content = f.read()
 
 
550
  except Exception as e:
551
- logger.error(f"Error reading uploaded file {bib_file.name}: {e}")
552
- return None, f"Error reading uploaded file {bib_file.name}: {e}", f"Error reading uploaded file {bib_file.name}: {e}", None
553
-
554
- # Parse BibTeX data
 
555
  try:
556
- bib_database = bibtexparser.loads(bib_content)
557
- except Exception as e:
558
- logger.error(f"Error parsing BibTeX data: {e}")
559
- return None, f"Error parsing BibTeX data: {e}", f"Error parsing BibTeX data: {e}", None
560
-
561
- # Extract DOIs
562
- dois = [entry.get('doi') for entry in bib_database.entries if entry.get('doi')]
563
- logger.info(f"Found {len(dois)} DOIs to download")
564
-
565
- # Result lists
566
- downloaded_files = []
567
- failed_dois = []
568
- downloaded_links = []
569
-
570
- # Download PDFs
571
- for doi in tqdm(dois, desc="Downloading papers"):
 
 
572
  try:
573
- # Try to download with multiple methods with retries
574
- pdf_content = await self.download_with_retry_async(doi)
575
-
576
- # Save PDF
577
- if pdf_content:
578
- if doi is None:
579
- return None, "Error: DOI not provided", "Error: DOI not provided", None
580
- filename = f"{str(doi).replace('/', '_').replace('.', '_')}.pdf"
581
- filepath = os.path.join(self.output_dir, filename)
582
-
583
- with open(filepath, 'wb') as f:
584
- f.write(pdf_content)
585
-
586
- downloaded_files.append(filepath)
587
- downloaded_links.append(f'<div style="display: flex; align-items: center;">✓ <a href="https://doi.org/{doi}">{doi}</a> <button onclick="copyLink(this)">Copy</button></div>')
588
- logger.info(f"Successfully downloaded: {filename}")
589
- else:
590
- failed_dois.append(f'<div style="display: flex; align-items: center;">❌ <a href="https://doi.org/{doi}">{doi}</a> <button onclick="copyLink(this)">Copy</button></div>')
591
-
592
- except Exception as e:
593
- failed_dois.append(f'<div style="display: flex; align-items: center;">❌ <a href="https://doi.org/{doi}">{doi}</a> <button onclick="copyLink(this)">Copy</button></div>')
594
- logger.error(f"Error processing {doi}: {e}")
595
-
596
- # Create ZIP of downloaded papers
597
- if downloaded_files:
598
- zip_filename = 'papers.zip'
599
- with zipfile.ZipFile(zip_filename, 'w') as zipf:
600
- for file_path in downloaded_files:
601
- zipf.write(file_path, arcname=os.path.basename(file_path))
602
- logger.info(f"ZIP file created: {zip_filename}")
 
 
603
 
604
- return zip_filename, "\n".join(downloaded_links), "\n".join(failed_dois), None
605
 
606
  def create_gradio_interface():
607
- """Create Gradio interface for Paper Downloader"""
608
- downloader = PaperDownloader()
609
-
610
- async def download_papers(bib_file, doi_input, dois_input):
611
- if bib_file:
612
- # Check file type
613
- if not bib_file.name.lower().endswith('.bib'):
614
- return None, "Error: Please upload a .bib file", "Error: Please upload a .bib file", None
615
-
616
- zip_path, downloaded_dois, failed_dois, _ = await downloader.process_bibtex_async(bib_file)
617
- return zip_path, downloaded_dois, failed_dois, None
618
- elif doi_input:
619
- filepath, message, failed_doi = downloader.download_single_doi(doi_input)
620
- return None, message, failed_doi, filepath
621
- elif dois_input:
622
- zip_path, downloaded_dois, failed_dois = downloader.download_multiple_dois(dois_input)
623
- return zip_path, downloaded_dois, failed_dois, None
624
- else:
625
- return None, "Please provide a .bib file, a single DOI, or a list of DOIs", "Please provide a .bib file, a single DOI, or a list of DOIs", None
626
-
627
- # Gradio Interface
628
- interface = gr.Interface(
629
- fn=download_papers,
630
- inputs=[
631
- gr.File(file_types=['.bib'], label="Upload BibTeX File"),
632
- gr.Textbox(label="Enter Single DOI", placeholder="10.xxxx/xxxx"),
633
- gr.Textbox(label="Enter Multiple DOIs (one per line)", placeholder="10.xxxx/xxxx\n10.yyyy/yyyy\n...")
634
- ],
635
- outputs=[
636
- gr.File(label="Download Papers (ZIP) or Single PDF"),
637
- gr.HTML(label="""
638
- <div style='padding-bottom: 5px; font-weight: bold;'>
639
- Found DOIs
640
- </div>
641
- <div style='border: 1px solid #ddd; padding: 5px; border-radius: 5px;'>
642
- <div id="downloaded-dois"></div>
643
- </div>
644
- """),
645
- gr.HTML(label="""
646
- <div style='padding-bottom: 5px; font-weight: bold;'>
647
- Missed DOIs
 
 
648
  </div>
649
- <div style='border: 1px solid #ddd; padding: 5px; border-radius: 5px;'>
650
- <div id="failed-dois"></div>
 
 
651
  </div>
652
- """),
653
- gr.File(label="Downloaded Single PDF")
654
- ],
655
- title="🔬 Academic Paper Batch Downloader",
656
- description="Upload a BibTeX file or enter DOIs to download PDFs. We'll attempt to fetch PDFs from multiple sources like Sci-Hub, Libgen, Google Scholar and Crossref. You can use any of the three inputs at any moment.",
657
- theme="Hev832/Applio",
658
- examples=[
659
- ["example.bib", None, None], # Bibtex File
660
- [None, "10.1038/nature12373", None], # Single DOI
661
- [None, None, "10.1109/5.771073\n10.3390/horticulturae8080677"], # Multiple DOIs
662
- ],
663
- css="""
664
- .gradio-container {
665
- background-color: black;
666
- }
667
- .gr-interface {
668
- max-width: 800px;
669
- margin: 0 auto;
670
- }
671
- .gr-box {
672
- background-color: black;
673
- border-radius: 10px;
674
- box-shadow: 0 4px 6px rgba(0, 0, 0, 0.1);
675
- }
676
- .output-text a {
677
- color: #007bff; /* Blue color for hyperlinks */
678
- }
679
- """,
680
- cache_examples=False,
681
- )
682
-
683
- # Add Javascript to update HTML
684
- interface.load = """
685
- function(downloaded_dois, failed_dois) {
686
- let downloaded_html = '';
687
- downloaded_dois.split('\\n').filter(Boolean).forEach(doi => {
688
- downloaded_html += doi + '<br>';
689
- });
690
- document.querySelector("#downloaded-dois").innerHTML = downloaded_html;
691
-
692
- let failed_html = '';
693
- failed_dois.split('\\n').filter(Boolean).forEach(doi => {
694
- failed_html += doi + '<br>';
695
- });
696
- document.querySelector("#failed-dois").innerHTML = failed_html;
697
- return [downloaded_html, failed_html];
698
- }
699
- """
 
 
700
 
701
- interface.head = """
702
- <script>
703
- function copyLink(button) {
704
- const linkElement = button.previousElementSibling;
705
- const link = linkElement.href;
706
- navigator.clipboard.writeText(link)
707
- .then(() => {
708
- button.innerText = '✓ Copied';
709
- button.style.color = 'green';
710
- setTimeout(() => {
711
- button.innerText = 'Copy';
712
- button.style.color = '';
713
- }, 2000);
714
- })
715
- .catch(err => {
716
- console.error('Failed to copy link: ', err);
717
- });
718
- }
719
- </script>
720
- """
721
  return interface
722
 
 
723
 
724
- def main():
725
- interface = create_gradio_interface()
726
- interface.launch(share=True)
727
-
 
 
728
 
729
  if __name__ == "__main__":
730
- main()
 
 
1
  import os
2
  import re
3
+ import json
4
  import logging
5
  import zipfile
 
 
6
  import asyncio
7
+ import tempfile
8
+ from typing import Dict, List, Optional, Any, Tuple
9
+ from dataclasses import dataclass, field
10
+ from pathlib import Path
11
+ from datetime import datetime
12
+ import gradio as gr
13
+ from enum import Enum
14
+ import hashlib
15
+ import urllib.parse
16
+
17
+ # Import smolagents
18
+ from smolagents import CodeAgent, ToolCallingAgent, LiteLLMModel
19
+ from smolagents.tools import Tool, tool
20
+ from pydantic import BaseModel, Field
21
+
22
+ # Logging configuration
23
+ logging.basicConfig(
24
+ level=logging.INFO,
25
+ format='%(asctime)s - %(name)s - %(levelname)s - %(message)s',
26
+ handlers=[
27
+ logging.FileHandler('bibliography_system.log'),
28
+ logging.StreamHandler()
29
+ ]
30
+ )
31
  logger = logging.getLogger(__name__)
32
 
33
+ # ========== DATA MODELS ==========
34
+
35
+ class ResourceType(str, Enum):
36
+ DOI = "doi"
37
+ ISBN = "isbn"
38
+ ARXIV = "arxiv"
39
+ URL = "url"
40
+ PMID = "pmid"
41
+ BIBTEX = "bibtex"
42
+ CITATION = "citation"
43
+ UNKNOWN = "unknown"
44
+
45
+ class CitationModel(BaseModel):
46
+ id: str
47
+ raw_text: str
48
+ resource_type: ResourceType
49
+ identifier: str
50
+ metadata: Dict[str, Any] = Field(default_factory=dict)
51
+ confidence: float = 0.0
52
+ extracted_from: str
53
+ position: Tuple[int, int] = (0, 0)
54
+
55
+ class VerificationResult(BaseModel):
56
+ citation: CitationModel
57
+ verified: bool
58
+ verification_source: str
59
+ download_url: Optional[str]
60
+ file_format: Optional[str]
61
+ file_size: Optional[int]
62
+ quality_score: float
63
+ notes: List[str] = Field(default_factory=list)
64
+
65
+ class ProcessingReport(BaseModel):
66
+ input_file: str
67
+ total_citations: int
68
+ verified_resources: List[VerificationResult]
69
+ downloaded_files: List[str]
70
+ failed_verifications: List[CitationModel]
71
+ processing_time: float
72
+ summary: Dict[str, Any] = Field(default_factory=dict)
73
+ timestamp: str = Field(default_factory=lambda: datetime.now().isoformat())
74
+
75
+ # ========== AGENT TOOLS ==========
76
+
77
+ class BibliographyExtractionTool(Tool):
78
+ name = "extract_bibliography"
79
+ description = """
80
+ Extract bibliographic references from text. Identifies DOIs, ISBNs, arXiv IDs, URLs,
81
+ and other academic identifiers from unstructured text.
82
+
83
+ Args:
84
+ text (str): The text to analyze
85
+ source_name (str): Name of the source document
86
+
87
+ Returns:
88
+ List[CitationModel]: List of extracted citations
89
+ """
90
+
91
+ def __init__(self):
92
+ super().__init__()
93
+ # Patterns for the different resource types
94
+ self.patterns = {
95
+ ResourceType.DOI: [
96
+ r'\b10\.\d{4,9}/[-._;()/:A-Z0-9]+\b',
97
+ r'doi:\s*(10\.\d{4,9}/[-._;()/:A-Z0-9]+)',
98
+ r'DOI:\s*(10\.\d{4,9}/[-._;()/:A-Z0-9]+)'
99
+ ],
100
+ ResourceType.ISBN: [
101
+ r'ISBN(?:-1[03])?:?\s*(?=[0-9X]{10}|(?=(?:[0-9]+[- ]){3})[- 0-9X]{13}|97[89][0-9]{10}|(?=(?:[0-9]+[- ]){4})[- 0-9]{17})(?:97[89][- ]?)?[0-9]{1,5}[- ]?[0-9]+[- ]?[0-9]+[- ]?[0-9X]'
102
+ ],
103
+ ResourceType.ARXIV: [
104
+ r'arXiv:\s*(\d{4}\.\d{4,5}(v\d+)?)',
105
+ r'arxiv:\s*([a-z\-]+/\d{7})'
106
+ ],
107
+ ResourceType.PMID: [
108
+ r'PMID:\s*(\d+)',
109
+ r'PubMed ID:\s*(\d+)'
110
+ ]
111
+ }
112
+
113
+ def forward(self, text: str, source_name: str = "unknown") -> List[Dict[str, Any]]:
114
+ """Extract citations from text"""
115
+ citations = []
116
+ text_lower = text.lower()
117
+
118
+ # Search by resource type
119
+ for resource_type, patterns in self.patterns.items():
120
+ for pattern in patterns:
121
+ matches = re.finditer(pattern, text, re.IGNORECASE)
122
+ for match in matches:
123
+ identifier = match.group(1) if match.groups() else match.group(0)
124
+
125
+ # Clean the identifier
126
+ identifier = self._clean_identifier(identifier, resource_type)
127
+
128
+ if identifier:
129
+ # Compute confidence based on the context
130
+ confidence = self._calculate_confidence(
131
+ identifier, resource_type, text_lower, match.start()
132
+ )
133
+
134
+ citation = CitationModel(
135
+ id=hashlib.md5(
136
+ f"{identifier}_{source_name}".encode()
137
+ ).hexdigest()[:12],
138
+ raw_text=match.group(0),
139
+ resource_type=resource_type,
140
+ identifier=identifier,
141
+ metadata={
142
+ "found_at": match.start(),
143
+ "context": self._get_context(text, match.start(), match.end())
144
+ },
145
+ confidence=confidence,
146
+ extracted_from=source_name,
147
+ position=(match.start(), match.end())
148
+ )
149
+ citations.append(citation.dict())
150
+
151
+ # Extract general URLs (only if they look academic)
152
+ url_pattern = r'https?://[^\s<>"]+|www\.[^\s<>"]+'
153
+ url_matches = re.finditer(url_pattern, text)
154
+
155
+ for match in url_matches:
156
+ url = match.group(0)
157
+ if self._is_academic_url(url):
158
+ citation = CitationModel(
159
+ id=hashlib.md5(f"{url}_{source_name}".encode()).hexdigest()[:12],
160
+ raw_text=url,
161
+ resource_type=ResourceType.URL,
162
+ identifier=url,
163
+ metadata={
164
+ "found_at": match.start(),
165
+ "context": self._get_context(text, match.start(), match.end())
166
+ },
167
+ confidence=0.6,
168
+ extracted_from=source_name,
169
+ position=(match.start(), match.end())
170
+ )
171
+ citations.append(citation.dict())
172
+
173
+ return citations
174
+
175
+ def _clean_identifier(self, identifier: str, resource_type: ResourceType) -> str:
176
+ """Clean identifier"""
177
+ identifier = identifier.strip()
178
+
179
+ # Remove prefixes
180
+ prefixes = ['doi:', 'DOI:', 'arxiv:', 'arXiv:', 'isbn:', 'ISBN:', 'pmid:', 'PMID:']
181
+ for prefix in prefixes:
182
+ if identifier.startswith(prefix):
183
+ identifier = identifier[len(prefix):].strip()
184
+
185
+ # Strip unwanted characters
186
+ identifier = identifier.strip('"\'<>()[]{}')
187
+
188
+ return identifier
189
+
190
+ def _calculate_confidence(self, identifier: str, resource_type: ResourceType,
191
+ text: str, position: int) -> float:
192
+ """Calculate confidence score for extracted citation"""
193
+ confidence = 0.7 # Base confidence
194
+
195
+ # Check DOI format
196
+ if resource_type == ResourceType.DOI:
197
+ if re.match(r'^10\.\d{4,9}/.+', identifier):
198
+ confidence += 0.2
199
+
200
+ # Check the surrounding context
201
+ context_words = ['paper', 'article', 'journal', 'conference', 'published',
202
+ 'reference', 'bibliography', 'cite', 'doi', 'url']
203
+
204
+ context = text[max(0, position-100):min(len(text), position+100)]
205
+ for word in context_words:
206
+ if word in context.lower():
207
+ confidence += 0.05
208
+
209
+ return min(confidence, 1.0)
210
+
211
+ def _is_academic_url(self, url: str) -> bool:
212
+ """Check if URL looks academic"""
213
+ academic_domains = [
214
+ 'arxiv.org', 'doi.org', 'springer.com', 'ieee.org', 'acm.org',
215
+ 'sciencedirect.com', 'wiley.com', 'tandfonline.com', 'nature.com',
216
+ 'science.org', 'pnas.org', 'plos.org', 'bmc.com', 'frontiersin.org',
217
+ 'mdpi.com', 'researchgate.net', 'semanticscholar.org'
218
  ]
219
+
220
+ url_lower = url.lower()
221
+ return any(domain in url_lower for domain in academic_domains)
222
+
223
+ def _get_context(self, text: str, start: int, end: int, window: int = 50) -> str:
224
+ """Get context around match"""
225
+ context_start = max(0, start - window)
226
+ context_end = min(len(text), end + window)
227
+ return text[context_start:context_end]
228
+
229
+ class ResourceVerificationTool(Tool):
230
+ name = "verify_resource"
231
+ description = """
232
+ Verify the existence and accessibility of academic resources.
233
+
234
+ Args:
235
+ citation (Dict[str, Any]): Citation to verify
236
+ timeout (int): Timeout in seconds
237
+
238
+ Returns:
239
+ VerificationResult: Verification result with metadata
240
+ """
241
+
242
+ def __init__(self):
243
+ super().__init__()
244
  self.headers = {
245
+ 'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36'
246
  }
247
+
248
+ def forward(self, citation: Dict[str, Any], timeout: int = 10) -> Dict[str, Any]:
249
+ """Verify a citation"""
250
+ citation_obj = CitationModel(**citation)
251
+
252
+ # Prepare the result
253
+ result = {
254
+ "citation": citation_obj.dict(),
255
+ "verified": False,
256
+ "verification_source": "none",
257
+ "download_url": None,
258
+ "file_format": None,
259
+ "file_size": None,
260
+ "quality_score": 0.0,
261
+ "notes": []
262
+ }
263
+
 
 
264
  try:
265
+ if citation_obj.resource_type == ResourceType.DOI:
266
+ return self._verify_doi(citation_obj, timeout)
267
+ elif citation_obj.resource_type == ResourceType.ARXIV:
268
+ return self._verify_arxiv(citation_obj, timeout)
269
+ elif citation_obj.resource_type == ResourceType.URL:
270
+ return self._verify_url(citation_obj, timeout)
271
+ elif citation_obj.resource_type == ResourceType.ISBN:
272
+ return self._verify_isbn(citation_obj, timeout)
273
+ elif citation_obj.resource_type == ResourceType.PMID:
274
+ return self._verify_pmid(citation_obj, timeout)
275
+ else:
276
+ result["notes"].append(f"Unsupported resource type: {citation_obj.resource_type}")
277
+
 
 
278
  except Exception as e:
279
+ result["notes"].append(f"Verification error: {str(e)}")
280
+
281
+ return result
282
+
283
+ def _verify_doi(self, citation: CitationModel, timeout: int) -> Dict[str, Any]:
284
+ """Verify DOI"""
285
+ import requests
286
+
287
+ result = {
288
+ "citation": citation.dict(),
289
+ "verified": False,
290
+ "verification_source": "crossref",
291
+ "download_url": None,
292
+ "file_format": None,
293
+ "file_size": None,
294
+ "quality_score": 0.0,
295
+ "notes": []
296
+ }
297
+
298
  try:
299
+ # Try Crossref API
300
+ url = f"https://api.crossref.org/works/{citation.identifier}"
301
+ response = requests.get(url, headers=self.headers, timeout=timeout)
302
+
303
+ if response.status_code == 200:
304
+ data = response.json()
305
  work = data.get('message', {})
306
+
307
+ result["verified"] = True
308
+ result["quality_score"] = 0.9
309
+
310
+ # Check for open access
311
+ if work.get('license'):
312
+ result["notes"].append("Open access available")
313
+ result["quality_score"] += 0.1
314
+
315
+ # Try to find PDF URL
316
  links = work.get('link', [])
317
  for link in links:
318
  if link.get('content-type') == 'application/pdf':
319
+ result["download_url"] = link.get('URL')
320
+ result["file_format"] = "pdf"
321
+ break
322
+
323
+ # Try Unpaywall
324
+ if not result["download_url"]:
325
+ unpaywall_url = f"https://api.unpaywall.org/v2/{citation.identifier}[email protected]"
326
+ unpaywall_response = requests.get(unpaywall_url, timeout=timeout)
327
+ if unpaywall_response.status_code == 200:
328
+ unpaywall_data = unpaywall_response.json()
329
+ if unpaywall_data.get('is_oa'):
330
+ result["download_url"] = unpaywall_data.get('best_oa_location', {}).get('url')
331
+ result["verification_source"] = "unpaywall"
332
+
333
+ else:
334
+ result["notes"].append(f"Crossref API returned {response.status_code}")
335
+
336
  except Exception as e:
337
+ result["notes"].append(f"DOI verification error: {str(e)}")
338
+
339
+ return result
340
+
341
+ def _verify_arxiv(self, citation: CitationModel, timeout: int) -> Dict[str, Any]:
342
+ """Verify arXiv ID"""
343
+ import requests
344
+
345
+ result = {
346
+ "citation": citation.dict(),
347
+ "verified": False,
348
+ "verification_source": "arxiv",
349
+ "download_url": None,
350
+ "file_format": None,
351
+ "file_size": None,
352
+ "quality_score": 0.0,
353
+ "notes": []
354
+ }
355
+
 
 
356
  try:
357
+ # Clean arXiv ID
358
+ arxiv_id = citation.identifier
359
+ if 'arxiv:' in arxiv_id.lower():
360
+ arxiv_id = arxiv_id.split(':')[-1].strip()
361
+
362
+ # Check arXiv API
363
+ api_url = f"http://export.arxiv.org/api/query?id_list={arxiv_id}"
364
+ response = requests.get(api_url, headers=self.headers, timeout=timeout)
365
+
366
+ if response.status_code == 200:
367
+ result["verified"] = True
368
+ result["quality_score"] = 0.95
369
+ result["download_url"] = f"https://arxiv.org/pdf/{arxiv_id}.pdf"
370
+ result["file_format"] = "pdf"
371
+ result["notes"].append("arXiv paper available")
372
+
 
 
373
  except Exception as e:
374
+ result["notes"].append(f"arXiv verification error: {str(e)}")
375
+
376
+ return result
377
+
378
+ def _verify_url(self, citation: CitationModel, timeout: int) -> Dict[str, Any]:
379
+ """Verify URL"""
380
+ import requests
381
+
382
+ result = {
383
+ "citation": citation.dict(),
384
+ "verified": False,
385
+ "verification_source": "direct",
386
+ "download_url": None,
387
+ "file_format": None,
388
+ "file_size": None,
389
+ "quality_score": 0.0,
390
+ "notes": []
391
+ }
392
+
393
  try:
394
+ response = requests.head(
395
+ citation.identifier,
396
+ headers=self.headers,
397
+ timeout=timeout,
398
+ allow_redirects=True
399
+ )
400
+
401
+ if response.status_code == 200:
402
+ content_type = response.headers.get('content-type', '')
403
+
404
+ result["verified"] = True
405
+ result["quality_score"] = 0.7
406
+ result["download_url"] = citation.identifier
407
+
408
+ # Check if it's a PDF
409
+ if 'application/pdf' in content_type:
410
+ result["file_format"] = "pdf"
411
+ result["quality_score"] += 0.2
412
+
413
+ # Try to get file size
414
+ content_length = response.headers.get('content-length')
415
+ if content_length:
416
+ result["file_size"] = int(content_length)
417
+
418
+ result["notes"].append(f"Content-Type: {content_type}")
419
+
420
  except Exception as e:
421
+ result["notes"].append(f"URL verification error: {str(e)}")
422
+
423
+ return result
424
+
425
+ def _verify_isbn(self, citation: CitationModel, timeout: int) -> Dict[str, Any]:
426
+ """Verify ISBN"""
427
+ import requests
428
+
429
+ result = {
430
+ "citation": citation.dict(),
431
+ "verified": False,
432
+ "verification_source": "openlibrary",
433
+ "download_url": None,
434
+ "file_format": None,
435
+ "file_size": None,
436
+ "quality_score": 0.0,
437
+ "notes": []
438
+ }
439
+
440
  try:
441
+ # Try Open Library API
442
+ url = f"https://openlibrary.org/api/books?bibkeys=ISBN:{citation.identifier}&format=json"
443
+ response = requests.get(url, headers=self.headers, timeout=timeout)
444
+
445
  if response.status_code == 200:
446
  data = response.json()
447
+ if data:
448
+ result["verified"] = True
449
+ result["quality_score"] = 0.8
450
+ result["notes"].append("ISBN found in Open Library")
451
+
 
 
452
  except Exception as e:
453
+ result["notes"].append(f"ISBN verification error: {str(e)}")
454
+
455
+ return result
456
+
457
+ def _verify_pmid(self, citation: CitationModel, timeout: int) -> Dict[str, Any]:
458
+ """Verify PMID"""
459
+ import requests
460
+
461
+ result = {
462
+ "citation": citation.dict(),
463
+ "verified": False,
464
+ "verification_source": "pubmed",
465
+ "download_url": None,
466
+ "file_format": None,
467
+ "file_size": None,
468
+ "quality_score": 0.0,
469
+ "notes": []
470
+ }
471
+
 
 
472
  try:
473
+ # Try PubMed API
474
+ url = f"https://eutils.ncbi.nlm.nih.gov/entrez/eutils/esummary.fcgi?db=pubmed&id={citation.identifier}&retmode=json"
475
+ response = requests.get(url, headers=self.headers, timeout=timeout)
476
+
477
+ if response.status_code == 200:
478
+ data = response.json()
479
+ if data.get('result', {}).get(citation.identifier):
480
+ result["verified"] = True
481
+ result["quality_score"] = 0.85
482
+ result["notes"].append("PMID found in PubMed")
483
+
 
 
484
  except Exception as e:
485
+ result["notes"].append(f"PMID verification error: {str(e)}")
486
+
487
+ return result
488
+
489
+ class PaperDownloadTool(Tool):
490
+ name = "download_paper"
491
+ description = """
492
+ Download academic paper from verified source.
493
+
494
+ Args:
495
+ verification_result (Dict[str, Any]): Verified resource to download
496
+ output_dir (str): Directory to save downloaded file
497
+
498
+ Returns:
499
+ Dict[str, Any]: Download result with file path and metadata
500
+ """
501
+
502
+ def __init__(self):
503
+ super().__init__()
504
+ self.headers = {
505
+ 'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36'
506
+ }
507
+
508
+ def forward(self, verification_result: Dict[str, Any],
509
+ output_dir: str = "downloads") -> Dict[str, Any]:
510
+ """Download paper"""
511
+ import requests
512
+ import os
513
+
514
+ result = {
515
+ "success": False,
516
+ "file_path": None,
517
+ "file_size": 0,
518
+ "download_time": 0,
519
+ "error": None,
520
+ "metadata": verification_result
521
+ }
522
+
523
  try:
524
+ # Create output directory
525
+ os.makedirs(output_dir, exist_ok=True)
526
+
527
+ download_url = verification_result.get("download_url")
528
+ if not download_url:
529
+ result["error"] = "No download URL available"
530
+ return result
531
+
532
+ # Generate filename
533
+ citation = verification_result.get("citation", {})
534
+ identifier = citation.get("identifier", "unknown")
535
+ file_ext = verification_result.get("file_format", "pdf")
536
+
537
+ # Clean filename
538
+ filename = re.sub(r'[^\w\-\.]', '_', identifier)
539
+ if not filename.endswith(f'.{file_ext}'):
540
+ filename = f"{filename}.{file_ext}"
541
+
542
+ file_path = os.path.join(output_dir, filename)
543
+
544
+ # Download file
545
+ start_time = datetime.now()
546
+ response = requests.get(
547
+ download_url,
548
+ headers=self.headers,
549
+ stream=True,
550
+ timeout=30
551
+ )
552
+
553
+ if response.status_code == 200:
554
+ with open(file_path, 'wb') as f:
555
+ for chunk in response.iter_content(chunk_size=8192):
556
+ if chunk:
557
+ f.write(chunk)
558
+
559
+ download_time = (datetime.now() - start_time).total_seconds()
560
+ file_size = os.path.getsize(file_path)
561
+
562
+ result["success"] = True
563
+ result["file_path"] = file_path
564
+ result["file_size"] = file_size
565
+ result["download_time"] = download_time
566
+
567
+ logger.info(f"Downloaded {filename} ({file_size} bytes)")
568
+ else:
569
+ result["error"] = f"HTTP {response.status_code}"
570
+
571
  except Exception as e:
572
+ result["error"] = str(e)
573
+ logger.error(f"Download error: {e}")
574
+
575
+ return result
576
+
577
+ class FileProcessingTool(Tool):
578
+ name = "process_file"
579
+ description = """
580
+ Process different file types to extract text for bibliography extraction.
581
+
582
+ Args:
583
+ file_path (str): Path to the file
584
+ file_type (str): Type of file (auto-detected if None)
585
+
586
+ Returns:
587
+ Dict[str, Any]: Extracted text and metadata
588
+ """
589
+
590
+ def __init__(self):
591
+ super().__init__()
592
+
593
+ def forward(self, file_path: str, file_type: str = None) -> Dict[str, Any]:
594
+ """Process file and extract text"""
595
+ import os
596
+
597
+ result = {
598
+ "success": False,
599
+ "text": "",
600
+ "file_type": file_type,
601
+ "file_size": 0,
602
+ "error": None,
603
+ "metadata": {}
604
+ }
605
+
606
  try:
607
+ if not os.path.exists(file_path):
608
+ result["error"] = "File not found"
609
+ return result
610
+
611
+ file_size = os.path.getsize(file_path)
612
+ result["file_size"] = file_size
613
+
614
+ # Determine file type
615
+ if not file_type:
616
+ file_type = self._detect_file_type(file_path)
617
+
618
+ result["file_type"] = file_type
619
+
620
+ # Process based on file type
621
+ if file_type == "txt":
622
+ with open(file_path, 'r', encoding='utf-8', errors='ignore') as f:
623
+ result["text"] = f.read()
624
+ result["success"] = True
625
+
626
+ elif file_type == "pdf":
627
+ result["text"] = self._extract_from_pdf(file_path)
628
+ result["success"] = True
629
+
630
+ elif file_type == "docx":
631
+ result["text"] = self._extract_from_docx(file_path)
632
+ result["success"] = True
633
+
634
+ elif file_type == "html":
635
+ with open(file_path, 'r', encoding='utf-8', errors='ignore') as f:
636
+ html_content = f.read()
637
+ result["text"] = self._extract_from_html(html_content)
638
+ result["success"] = True
639
+
640
+ else:
641
+ # Try as text file
642
+ try:
643
+ with open(file_path, 'r', encoding='utf-8', errors='ignore') as f:
644
+ result["text"] = f.read()
645
+ result["success"] = True
646
+ except:
647
+ result["error"] = f"Unsupported file type: {file_type}"
648
+
649
  except Exception as e:
650
+ result["error"] = str(e)
651
+
652
+ return result
653
+
654
+ def _detect_file_type(self, file_path: str) -> str:
655
+ """Detect file type from extension"""
656
+ ext = os.path.splitext(file_path)[1].lower()
657
+
658
+ type_mapping = {
659
+ '.txt': 'txt',
660
+ '.pdf': 'pdf',
661
+ '.docx': 'docx',
662
+ '.doc': 'doc',
663
+ '.html': 'html',
664
+ '.htm': 'html',
665
+ '.md': 'markdown',
666
+ '.rtf': 'rtf'
667
+ }
668
+
669
+ return type_mapping.get(ext, 'unknown')
670
+
671
+ def _extract_from_pdf(self, file_path: str) -> str:
672
+ """Extract text from PDF"""
673
+ try:
674
+ # Try PyPDF2
675
+ import PyPDF2
676
+ text = ""
677
+ with open(file_path, 'rb') as file:
678
+ pdf_reader = PyPDF2.PdfReader(file)
679
+ for page in pdf_reader.pages:
680
+ text += page.extract_text()
681
+ return text
682
+ except ImportError:
683
+ logger.warning("PyPDF2 not installed, using fallback")
684
+ # Fallback: use pdftotext command if available
685
+ import subprocess
686
  try:
687
+ result = subprocess.run(
688
+ ['pdftotext', file_path, '-'],
689
+ capture_output=True,
690
+ text=True
691
+ )
692
+ if result.returncode == 0:
693
+ return result.stdout
694
+ except:
695
+ pass
696
+ return ""
697
+
698
+ def _extract_from_docx(self, file_path: str) -> str:
699
+ """Extract text from DOCX"""
 
 
700
  try:
701
+ from docx import Document
702
+ doc = Document(file_path)
703
+ return "\n".join([paragraph.text for paragraph in doc.paragraphs])
704
+ except ImportError:
705
+ logger.warning("python-docx not installed")
706
+ return ""
707
  except Exception as e:
708
+ logger.error(f"Error reading DOCX: {e}")
709
+ return ""
710
+
711
+ def _extract_from_html(self, html_content: str) -> str:
712
+ """Extract text from HTML"""
713
  try:
714
+ from bs4 import BeautifulSoup
715
+ soup = BeautifulSoup(html_content, 'html.parser')
716
+ # Remove script and style elements
717
+ for script in soup(["script", "style"]):
718
+ script.decompose()
719
+ return soup.get_text()
720
+ except ImportError:
721
+ # Simple regex-based extraction
722
+ import re
723
+ text = re.sub(r'<[^>]+>', ' ', html_content)
724
+ text = re.sub(r'\s+', ' ', text)
725
+ return text
726
+
727
+ # ========== MAIN AGENTS ==========
728
+
729
+ class BibliographyProcessingSystem:
730
+ """Main bibliography processing system built on smolagents"""
731
+
732
+ def __init__(self, model_config: Dict[str, Any] = None):
733
+ self.model_config = model_config or {
734
+ "model_id": "gpt-4",
735
+ "api_key": os.getenv("OPENAI_API_KEY", ""),
736
+ "provider": "openai"
737
+ }
738
+
739
+ # Initialize tools
740
+ self.extraction_tool = BibliographyExtractionTool()
741
+ self.verification_tool = ResourceVerificationTool()
742
+ self.download_tool = PaperDownloadTool()
743
+ self.file_tool = FileProcessingTool()
744
+
745
+ # Create agents
746
+ self.extraction_agent = self._create_extraction_agent()
747
+ self.verification_agent = self._create_verification_agent()
748
+ self.download_agent = self._create_download_agent()
749
+
750
+ # Directories
751
+ self.output_dir = "bibliography_output"
752
+ self.download_dir = os.path.join(self.output_dir, "downloads")
753
+ self.report_dir = os.path.join(self.output_dir, "reports")
754
+
755
+ # Create directories
756
+ os.makedirs(self.output_dir, exist_ok=True)
757
+ os.makedirs(self.download_dir, exist_ok=True)
758
+ os.makedirs(self.report_dir, exist_ok=True)
759
+
760
+ # State
761
+ self.current_process_id = None
762
+ self.processing_results = {}
763
+
764
+ def _create_extraction_agent(self) -> ToolCallingAgent:
765
+ """Create the extraction agent"""
766
+ model = self._create_model()
767
+
768
+ agent = ToolCallingAgent(
769
+ tools=[self.extraction_tool, self.file_tool],
770
+ model=model,
771
+ name="ExtractionAgent",
772
+ description="Extract bibliographic references from documents",
773
+ max_steps=10
774
+ )
775
+
776
+ return agent
777
+
778
+ def _create_verification_agent(self) -> ToolCallingAgent:
779
+ """Create the verification agent"""
780
+ model = self._create_model()
781
+
782
+ agent = ToolCallingAgent(
783
+ tools=[self.verification_tool],
784
+ model=model,
785
+ name="VerificationAgent",
786
+ description="Verify the existence and accessibility of academic resources",
787
+ max_steps=15
788
+ )
789
+
790
+ return agent
791
+
792
+ def _create_download_agent(self) -> ToolCallingAgent:
793
+ """Create the download agent"""
794
+ model = self._create_model()
795
+
796
+ agent = ToolCallingAgent(
797
+ tools=[self.download_tool],
798
+ model=model,
799
+ name="DownloadAgent",
800
+ description="Download academic papers from verified sources",
801
+ max_steps=20
802
+ )
803
+
804
+ return agent
805
+
806
+ def _create_model(self):
807
+ """Create the model according to the configuration"""
808
+ provider = self.model_config.get("provider", "openai")
809
+
810
+ if provider == "openai":
811
+ return LiteLLMModel(
812
+ model_id=self.model_config.get("model_id", "gpt-4"),
813
+ api_key=self.model_config.get("api_key")
814
+ )
815
+ elif provider == "anthropic":
816
+ return LiteLLMModel(
817
+ model_id="claude-3-opus-20240229",
818
+ api_key=self.model_config.get("api_key")
819
+ )
820
+ elif provider == "huggingface":
821
+ from smolagents import InferenceClientModel
822
+ return InferenceClientModel(
823
+ model_id=self.model_config.get("model_id", "mistralai/Mixtral-8x7B-Instruct-v0.1")
824
+ )
825
+ else:
826
+ # Default to OpenAI
827
+ return LiteLLMModel(model_id="gpt-4")
828
+
829
+ async def process_document(self, file_path: str, process_id: str = None) -> Dict[str, Any]:
830
+ """Process a full document"""
831
+ import time
832
+
833
+ start_time = time.time()
834
+
835
+ # Generate a process ID
836
+ self.current_process_id = process_id or hashlib.md5(
837
+ f"{file_path}_{datetime.now().isoformat()}".encode()
838
+ ).hexdigest()[:8]
839
+
840
+ logger.info(f"Starting process {self.current_process_id} for {file_path}")
841
+
842
+ # 1. Extract text from the file
843
+ extraction_prompt = f"""
844
+ Process the file at {file_path} to extract all text content.
845
+ Focus on extracting any bibliographic references, citations, or academic resources.
846
+
847
+ Steps:
848
+ 1. Use process_file tool to extract text
849
+ 2. Return the extracted text for further analysis
850
+ """
851
+
852
+ try:
853
+ # Run the file-extraction agent
854
+ file_result = await self.extraction_agent.run_async(extraction_prompt)
855
+
856
+ if not file_result or "text" not in str(file_result):
857
+ return {
858
+ "success": False,
859
+ "error": "Failed to extract text from file",
860
+ "process_id": self.current_process_id
861
+ }
862
+
863
+ # 2. Extract bibliographic references
864
+ text_content = str(file_result)
865
+ extraction_prompt2 = f"""
866
+ Analyze the following text and extract all bibliographic references:
867
+
868
+ {text_content[:5000]}... # Limit size for the prompt
869
+
870
+ Extract:
871
+ 1. DOIs (Digital Object Identifiers)
872
+ 2. ISBNs
873
+ 3. arXiv IDs
874
+ 4. PubMed IDs (PMID)
875
+ 5. Academic URLs
876
+ 6. Any other academic references
877
+
878
+ Return a comprehensive list of all found references.
879
+ """
880
+
881
+ extraction_result = await self.extraction_agent.run_async(extraction_prompt2)
882
+
883
+ # Parsear resultado (asumiendo que el agente devuelve texto JSON-like)
884
+ citations = []
885
  try:
886
+ # Intentar extraer JSON del resultado
887
+ import json
888
+ result_str = str(extraction_result)
889
+
890
+ # Buscar patrón JSON
891
+ json_match = re.search(r'\{.*\}', result_str, re.DOTALL)
892
+ if json_match:
893
+ citations_data = json.loads(json_match.group())
894
+ if isinstance(citations_data, list):
895
+ citations = [CitationModel(**c) for c in citations_data]
896
+ except:
897
+ # Fallback: usar la herramienta directamente
898
+ citations_data = self.extraction_tool.forward(text_content, os.path.basename(file_path))
899
+ citations = [CitationModel(**c) for c in citations_data]
900
+
901
+ logger.info(f"Found {len(citations)} citations")
902
+
903
+ # 3. Verificar recursos
904
+ verified_resources = []
905
+ failed_verifications = []
906
+
907
+ for citation in citations:
908
+ verification_prompt = f"""
909
+ Verify the following academic resource:
910
+
911
+ Type: {citation.resource_type}
912
+ Identifier: {citation.identifier}
913
+ Source: {citation.extracted_from}
914
+
915
+ Check if this resource exists and is accessible.
916
+ """
917
+
918
+ try:
919
+ verification_result = await self.verification_agent.run_async(verification_prompt)
920
+
921
+ # Parsear resultado
922
+ if verification_result:
923
+ verification_dict = self.verification_tool.forward(citation.dict())
924
+ verified_resource = VerificationResult(**verification_dict)
925
+
926
+ if verified_resource.verified:
927
+ verified_resources.append(verified_resource)
928
+ else:
929
+ failed_verifications.append(citation)
930
+ except Exception as e:
931
+ logger.error(f"Verification error for {citation.identifier}: {e}")
932
+ failed_verifications.append(citation)
933
+
934
+ # 4. Descargar recursos verificados
935
+ downloaded_files = []
936
+
937
+ for verified_resource in verified_resources:
938
+ if verified_resource.download_url:
939
+ download_prompt = f"""
940
+ Download the academic paper from:
941
+
942
+ URL: {verified_resource.download_url}
943
+ Format: {verified_resource.file_format}
944
+
945
+ Save it to: {self.download_dir}
946
+ """
947
+
948
+ try:
949
+ download_result = await self.download_agent.run_async(download_prompt)
950
+
951
+ if download_result:
952
+ download_dict = self.download_tool.forward(
953
+ verified_resource.dict(),
954
+ self.download_dir
955
+ )
956
+
957
+ if download_dict.get("success"):
958
+ downloaded_files.append(download_dict.get("file_path"))
959
+ except Exception as e:
960
+ logger.error(f"Download error: {e}")
961
+
962
+ # 5. Generar reporte
963
+ processing_time = time.time() - start_time
964
+
965
+ report = ProcessingReport(
966
+ input_file=file_path,
967
+ total_citations=len(citations),
968
+ verified_resources=verified_resources,
969
+ downloaded_files=downloaded_files,
970
+ failed_verifications=failed_verifications,
971
+ processing_time=processing_time,
972
+ summary={
973
+ "success_rate": len(verified_resources) / max(1, len(citations)),
974
+ "download_rate": len(downloaded_files) / max(1, len(verified_resources)),
975
+ "file_count": len(downloaded_files)
976
+ }
977
+ )
978
+
979
+ # Guardar reporte
980
+ report_path = os.path.join(
981
+ self.report_dir,
982
+ f"report_{self.current_process_id}.json"
983
+ )
984
+
985
+ with open(report_path, 'w', encoding='utf-8') as f:
986
+ json.dump(report.dict(), f, indent=2, default=str)
987
+
988
+ # 6. Crear archivo ZIP con resultados
989
+ zip_path = self._create_results_zip(report)
990
+
991
+ # Guardar resultados en estado
992
+ self.processing_results[self.current_process_id] = {
993
+ "report": report.dict(),
994
+ "zip_path": zip_path,
995
+ "timestamp": datetime.now().isoformat()
996
+ }
997
+
998
+ logger.info(f"Process {self.current_process_id} completed in {processing_time:.2f}s")
999
+
1000
+ return {
1001
+ "success": True,
1002
+ "process_id": self.current_process_id,
1003
+ "report": report.dict(),
1004
+ "zip_path": zip_path,
1005
+ "summary": {
1006
+ "citations_found": len(citations),
1007
+ "resources_verified": len(verified_resources),
1008
+ "files_downloaded": len(downloaded_files),
1009
+ "processing_time": processing_time
1010
+ }
1011
+ }
1012
+
1013
+ except Exception as e:
1014
+ logger.error(f"Processing error: {e}")
1015
+ return {
1016
+ "success": False,
1017
+ "error": str(e),
1018
+ "process_id": self.current_process_id
1019
+ }
1020
+
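+    # Sketch of how process_document is meant to be driven (assumes a valid model
+    # configuration and an existing input file; the file name is hypothetical):
+    #
+    #     system = BibliographyProcessingSystem({"provider": "openai",
+    #                                            "model_id": "gpt-4",
+    #                                            "api_key": "..."})
+    #     result = asyncio.run(system.process_document("my_references.pdf"))
+    #     if result["success"]:
+    #         print(result["zip_path"], result["summary"])
+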
+    def _create_results_zip(self, report: ProcessingReport) -> str:
+        """Create a ZIP archive with the results"""
+        import zipfile
+        from datetime import datetime
+
+        timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")
+        zip_filename = f"bibliography_results_{timestamp}.zip"
+        zip_path = os.path.join(self.output_dir, zip_filename)
+
+        with zipfile.ZipFile(zip_path, 'w', zipfile.ZIP_DEFLATED) as zipf:
+            # Add the JSON report
+            report_path = os.path.join(
+                self.report_dir,
+                f"report_{self.current_process_id}.json"
+            )
+            if os.path.exists(report_path):
+                zipf.write(report_path, "report.json")
+
+            # Add the downloaded files
+            for file_path in report.downloaded_files:
+                if os.path.exists(file_path):
+                    arcname = os.path.join("downloads", os.path.basename(file_path))
+                    zipf.write(file_path, arcname)
+
+            # Add a plain-text summary
+            summary_content = self._generate_summary_text(report)
+            zipf.writestr("summary.txt", summary_content)
+
+        return zip_path
+
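+    # Layout of the archive produced above (as written by _create_results_zip):
+    #   report.json   - the full ProcessingReport serialized to JSON
+    #   downloads/    - every file listed in report.downloaded_files
+    #   summary.txt   - the plain-text summary from _generate_summary_text
+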
+    def _generate_summary_text(self, report: ProcessingReport) -> str:
+        """Generate a plain-text summary"""
+        summary = f"""
+BIBLIOGRAPHY PROCESSING REPORT
+==============================
+
+Process ID: {self.current_process_id}
+Input File: {report.input_file}
+Processing Time: {report.processing_time:.2f} seconds
+Timestamp: {report.timestamp}
+
+STATISTICS
+----------
+Total Citations Found: {report.total_citations}
+Resources Verified: {len(report.verified_resources)}
+Files Downloaded: {len(report.downloaded_files)}
+Failed Verifications: {len(report.failed_verifications)}
+
+Success Rate: {(len(report.verified_resources) / max(1, report.total_citations)) * 100:.1f}%
+Download Rate: {(len(report.downloaded_files) / max(1, len(report.verified_resources))) * 100:.1f}%
+
+VERIFIED RESOURCES
+------------------
+"""
+
+        for i, resource in enumerate(report.verified_resources, 1):
+            summary += f"\n{i}. {resource.citation.identifier}"
+            summary += f"\n   Type: {resource.citation.resource_type}"
+            summary += f"\n   Source: {resource.verification_source}"
+            summary += f"\n   Quality: {resource.quality_score:.2f}"
+            if resource.download_url:
+                summary += "\n   Downloaded: Yes"
+                if resource.file_format:
+                    summary += f" ({resource.file_format})"
+            summary += "\n"
+
+        if report.failed_verifications:
+            summary += "\nFAILED VERIFICATIONS\n-------------------\n"
+            for citation in report.failed_verifications:
+                summary += f"- {citation.identifier} ({citation.resource_type})\n"
+
+        summary += "\nFILES DOWNLOADED\n----------------\n"
+        for file_path in report.downloaded_files:
+            file_size = os.path.getsize(file_path) if os.path.exists(file_path) else 0
+            summary += f"- {os.path.basename(file_path)} ({file_size} bytes)\n"
+
+        return summary
+
+    def get_status(self, process_id: str = None) -> Dict[str, Any]:
+        """Return the status of a process"""
+        pid = process_id or self.current_process_id
+        if pid and pid in self.processing_results:
+            return self.processing_results[pid]
+        return {"error": "Process not found"}
+
+    def cleanup(self, process_id: str = None):
+        """Clean up cached results and, optionally, the working directories"""
+        import shutil
+
+        if process_id:
+            # Clear a specific process
+            if process_id in self.processing_results:
+                del self.processing_results[process_id]
+        else:
+            # Clear everything
+            self.processing_results.clear()
+
+        # Clean up the directories (optional; uncomment if needed)
+        # shutil.rmtree(self.download_dir, ignore_errors=True)
+        # shutil.rmtree(self.report_dir, ignore_errors=True)
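+
+    # Sketch: inspecting and clearing a finished run (the process ID shown is
+    # hypothetical; real IDs are the 8-character hashes generated in process_document):
+    #
+    #     status = system.get_status("ab12cd34")
+    #     if "zip_path" in status:
+    #         print(status["zip_path"])
+    #     system.cleanup("ab12cd34")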
 
+# ========== GRADIO INTERFACE ==========
 
 def create_gradio_interface():
+    """Create the Gradio interface for the system"""
+
+    system = None
+
+    def initialize_system(provider, model_id, api_key):
+        """Initialize the system with the given configuration"""
+        nonlocal system
+
+        config = {
+            "provider": provider,
+            "model_id": model_id,
+            "api_key": api_key
+        }
+
+        try:
+            system = BibliographyProcessingSystem(config)
+            return "✅ System initialized successfully"
+        except Exception as e:
+            return f"❌ Error: {str(e)}"
+
+    async def process_file(file_obj, progress=gr.Progress()):
+        """Process an uploaded file"""
+        if not system:
+            return None, "❌ System not initialized", "", ""
+
+        try:
+            progress(0, desc="Starting processing...")
+
+            # Save the upload to a temporary file
+            import tempfile
+            with tempfile.NamedTemporaryFile(delete=False, suffix=os.path.splitext(file_obj.name)[1]) as tmp:
+                with open(file_obj.name, 'rb') as src:
+                    tmp.write(src.read())
+                tmp_path = tmp.name
+
+            progress(0.2, desc="Extracting text...")
+
+            # Process the file
+            result = await system.process_document(tmp_path)
+
+            if not result.get("success"):
+                return None, f"❌ Error: {result.get('error')}", "", ""
+
+            # Get the report
+            report_data = result.get("report", {})
+            summary = result.get("summary", {})
+
+            progress(0.8, desc="Generating results...")
+
+            # Prepare the results for display
+            citations_found = summary.get("citations_found", 0)
+            verified = summary.get("resources_verified", 0)
+            downloaded = summary.get("files_downloaded", 0)
+
+            # Build the HTML view
+            html_output = f"""
+            <div style="font-family: Arial, sans-serif; padding: 20px;">
+                <h2>📊 Processing Results</h2>
+
+                <div style="background: #f5f5f5; padding: 15px; border-radius: 10px; margin: 20px 0;">
+                    <h3>📈 Statistics</h3>
+                    <ul>
+                        <li><strong>References found:</strong> {citations_found}</li>
+                        <li><strong>Resources verified:</strong> {verified}</li>
+                        <li><strong>Files downloaded:</strong> {downloaded}</li>
+                        <li><strong>Success rate:</strong> {(verified/max(1, citations_found))*100:.1f}%</li>
+                        <li><strong>Process ID:</strong> {result.get('process_id')}</li>
+                    </ul>
                 </div>
+            """
+
+            # List of verified resources
+            if verified > 0:
+                html_output += """
+                <div style="background: #e8f5e9; padding: 15px; border-radius: 10px; margin: 20px 0;">
+                    <h3>✅ Verified Resources</h3>
+                    <ul>
+                """
+
+                resources = report_data.get("verified_resources", [])
+                for i, resource in enumerate(resources[:10], 1):  # show the first 10
+                    citation = resource.get("citation", {})
+                    html_output += f"""
+                    <li>
+                        <strong>{citation.get('identifier', 'Unknown')}</strong><br>
+                        <small>Type: {citation.get('resource_type', 'unknown')} |
+                        Source: {resource.get('verification_source', 'unknown')} |
+                        Quality: {resource.get('quality_score', 0):.2f}</small>
+                    </li>
+                    """
+
+                if verified > 10:
+                    html_output += f"<li>... and {verified - 10} more</li>"
+
+                html_output += "</ul></div>"
+
+            # List of failures
+            failed = len(report_data.get("failed_verifications", []))
+            if failed > 0:
+                html_output += f"""
+                <div style="background: #ffebee; padding: 15px; border-radius: 10px; margin: 20px 0;">
+                    <h3>❌ Unverified Resources ({failed})</h3>
+                    <p>Some resources could not be verified. See the ZIP file for details.</p>
                 </div>
+                """
+
+            html_output += "</div>"
+
+            # Plain-text output for export
+            text_output = f"""
+Bibliography Processing
+=======================
+
+File: {file_obj.name}
+Process ID: {result.get('process_id')}
+Date: {datetime.now().strftime('%Y-%m-%d %H:%M:%S')}
+
+Summary:
+- References found: {citations_found}
+- Resources verified: {verified}
+- Files downloaded: {downloaded}
+- Success rate: {(verified/max(1, citations_found))*100:.1f}%
+
+Download the ZIP file for the full report.
+            """
+
+            progress(1.0, desc="Done!")
+
+            # Return the results
+            return (
+                result.get("zip_path"),
+                f"✅ Processing complete. ID: {result.get('process_id')}",
+                html_output,
+                text_output
+            )
+
+        except Exception as e:
+            logger.error(f"Processing error: {e}")
+            return None, f"❌ Error: {str(e)}", "", ""
+
+    def get_status():
+        """Get the system status"""
+        if not system or not system.current_process_id:
+            return "⚠️ No active processes"
+
+        status = system.get_status()
+        if "error" in status:
+            return f"⚠️ {status['error']}"
+
+        return f"""
+📊 System Status
+----------------
+Active process: {system.current_process_id}
+Total processes: {len(system.processing_results)}
+Last report: {status.get('timestamp', 'N/A')}
+        """
+
+    # Build the interface
+    with gr.Blocks(title="Bibliography Collection System", theme=gr.themes.Soft()) as interface:
+        gr.Markdown("# 📚 AI-Powered Bibliography Collection System")
+        gr.Markdown("Process documents and extract bibliographic references automatically")
+
+        with gr.Row():
+            with gr.Column(scale=1):
+                gr.Markdown("### ⚙️ Configuration")
+
+                provider = gr.Dropdown(
+                    choices=["openai", "anthropic", "huggingface"],
+                    label="AI Provider",
+                    value="openai"
+                )
+
+                model_id = gr.Textbox(
+                    label="Model ID",
+                    value="gpt-4",
+                    placeholder="e.g. gpt-4, claude-3-opus-20240229, mistralai/Mixtral-8x7B-Instruct-v0.1"
+                )
+
+                api_key = gr.Textbox(
+                    label="API Key",
+                    type="password",
+                    placeholder="Enter your API key"
+                )
+
+                init_btn = gr.Button("🚀 Initialize System", variant="primary")
+                init_status = gr.Markdown("")
+
+                init_btn.click(
+                    initialize_system,
+                    inputs=[provider, model_id, api_key],
+                    outputs=init_status
+                )
+
+                gr.Markdown("---")
+                status_btn = gr.Button("📊 View Status")
+                system_status = gr.Markdown("")
+                status_btn.click(get_status, outputs=system_status)
+
+            with gr.Column(scale=2):
+                gr.Markdown("### 📄 Process Document")
+
+                file_input = gr.File(
+                    label="Upload your document",
+                    file_types=[".txt", ".pdf", ".docx", ".html", ".md", ".rtf"]
+                )
+
+                process_btn = gr.Button("🔍 Process Document", variant="primary")
+
+                gr.Markdown("### 📊 Results")
+
+                result_file = gr.File(label="Download Results (ZIP)")
+                result_status = gr.Markdown("")
+
+                with gr.Tabs():
+                    with gr.TabItem("📋 HTML View"):
+                        html_output = gr.HTML(label="Detailed Results")
+
+                    with gr.TabItem("📝 Plain Text"):
+                        text_output = gr.Textbox(
+                            label="Summary",
+                            lines=20,
+                            max_lines=50
+                        )
+
+                process_btn.click(
+                    process_file,
+                    inputs=[file_input],
+                    outputs=[result_file, result_status, html_output, text_output]
+                )
+
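+                # Note: the 4-tuple returned by process_file maps positionally onto
+                # the outputs above: (result_file, result_status, html_output, text_output).
+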
+        # Examples
+        gr.Markdown("### 📖 Examples")
+        gr.Examples(
+            examples=[
+                ["ejemplo_referencias.txt"],
+                ["ejemplo_bibliografia.pdf"],
+                ["paper_con_referencias.docx"]
+            ],
+            inputs=[file_input],
+            label="Example files (these need to be created)"
+        )
+
+        # Information
+        gr.Markdown("""
+        ### 📌 Information
+        - **Supported formats**: TXT, PDF, DOCX, HTML, MD, RTF
+        - **Detected resources**: DOI, ISBN, arXiv, PMID, academic URLs
+        - **Output**: a ZIP file with reports and the downloaded documents
+
+        ### ⚠️ Notes
+        1. You need a valid API key for the selected provider
+        2. Large files can take several minutes
+        3. Accuracy depends on the AI model used
+        """)
 
     return interface
 
+# ========== MAIN ENTRY POINT ==========
 
+async def main():
+    """Main entry point"""
+    import argparse
+
+    parser = argparse.ArgumentParser(description="Bibliography Collection System")
+    parser.add_argument("--mode", choices=["gui", "cli"], default="gui",
+                        help="Execution mode")
+    parser.add_argument("--file", type=str, help="File to process (CLI mode)")
+    parser.add_argument("--provider", default="openai", help="AI provider")
+    parser.add_argument("--model", default="gpt-4", help="AI model")
+    parser.add_argument("--api-key", help="API Key")
+
+    args = parser.parse_args()
+
+    if args.mode == "gui":
+        # Launch the Gradio interface
+        interface = create_gradio_interface()
+        interface.launch(
+            server_name="0.0.0.0",
+            server_port=7860,
+            share=True,
+            debug=True
+        )
+
+    elif args.mode == "cli":
+        # Command-line mode
+        if not args.file:
+            print("❌ Error: You must specify a file with --file")
+            return
+
+        if not os.path.exists(args.file):
+            print(f"❌ Error: File not found: {args.file}")
+            return
+
+        # Configure the system
+        config = {
+            "provider": args.provider,
+            "model_id": args.model,
+            "api_key": args.api_key or os.getenv(f"{args.provider.upper()}_API_KEY")
+        }
+
+        if not config["api_key"]:
+            print("❌ Error: You must provide an API key")
+            return
+
+        system = BibliographyProcessingSystem(config)
+
+        print(f"🔍 Processing file: {args.file}")
+        print("⏳ This can take several minutes...")
+
+        result = await system.process_document(args.file)
+
+        if result.get("success"):
+            print("✅ Processing complete!")
+            print(f"📊 Process ID: {result.get('process_id')}")
+
+            summary = result.get("summary", {})
+            print(f"""
+📈 Results:
+- References found: {summary.get('citations_found', 0)}
+- Resources verified: {summary.get('resources_verified', 0)}
+- Files downloaded: {summary.get('files_downloaded', 0)}
+- Processing time: {summary.get('processing_time', 0):.2f}s
+
+📦 ZIP file with the results: {result.get('zip_path')}
+            """)
+        else:
+            print(f"❌ Error: {result.get('error')}")
 
 if __name__ == "__main__":
+    import asyncio
+    asyncio.run(main())
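+
+# Example invocations (sketches; the input file name is hypothetical):
+#   python app.py                 # launch the Gradio UI on 0.0.0.0:7860
+#   python app.py --mode cli --file refs.pdf --provider openai --api-key sk-...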