Spaces:
Sleeping
Sleeping
import os

from fake_useragent import UserAgent
from langchain_community.document_loaders import AsyncChromiumLoader, PyPDFLoader
from langchain_community.document_transformers import BeautifulSoupTransformer
from langchain_core.messages import AIMessage

# Pick a randomized browser User-Agent so loaders are less likely to be
# blocked by sites that reject the default Python UA string.
os.environ["USER_AGENT"] = UserAgent().random
def scraper(url: str, doc_type: str) -> dict:
    """Fetch a document from *url* and wrap its extracted text in an AIMessage.

    Args:
        url: Location of the document to fetch.
        doc_type: Either "html" (rendered via headless Chromium, paragraph
            text only) or "pdf" (loaded and split per page).

    Returns:
        dict with keys "source" (the url) and "content" (an AIMessage holding
        the extracted text, or an error description when loading fails or the
        doc_type is unsupported).
    """
    if doc_type == "html":
        try:
            loader = AsyncChromiumLoader([url])
            html = loader.load()
            # Strip markup, keeping only <p> paragraph text.
            bs_transformer = BeautifulSoupTransformer()
            docs_transformed = bs_transformer.transform_documents(
                html, tags_to_extract=["p"]
            )
            # Guard against pages that yield no paragraph content at all
            # (the original indexed [0] unconditionally and could IndexError).
            text = docs_transformed[0].page_content if docs_transformed else ""
            # Build the result once; the former debug print duplicated this.
            return {"source": url, "content": AIMessage(text)}
        except Exception as e:
            # Best-effort boundary: report the failure instead of raising.
            return {"source": url, "content": AIMessage(f"Error scraping website: {str(e)}")}
    elif doc_type == "pdf":
        try:
            loader = PyPDFLoader(url)
            pages = loader.load_and_split()
            # AIMessage content must be text, not Document objects — the
            # original passed the raw list; join the per-page texts instead.
            text = "\n\n".join(page.page_content for page in pages)
            return {"source": url, "content": AIMessage(text)}
        except Exception as e:
            return {"source": url, "content": AIMessage(f"Error scraping PDF: {str(e)}")}
    else:
        return {"source": url, "content": AIMessage("Unsupported document type, supported types are 'html' and 'pdf'.")}
| if __name__ == "__main__": | |
| scraper("https://python.langchain.com/v0.1/docs/modules/data_connection/document_loaders/pdf/", "html") |