# NOTE(review): the original lines here ("Spaces: Sleeping") were Hugging Face
# Spaces page chrome captured in a copy/paste, not Python source.
import asyncio
import re

from crawl4ai import AsyncWebCrawler
from crawl4ai.async_configs import BrowserConfig, CrawlerRunConfig
def remove_links_and_pics(input_text):
    """Reduce raw page text/markdown to plain alphabetic words.

    Strips, in order: bare and angle-bracketed URLs, markdown image
    syntax, HTML tags, bracketed/parenthesised spans, and finally every
    character that is not an ASCII letter or whitespace.

    Args:
        input_text: Raw text (e.g. crawled page markdown) to clean.

    Returns:
        The cleaned text with leading/trailing whitespace removed.
    """
    # Remove all links: bare URLs and <URL> forms.
    text = re.sub(r'https?:\/\/[^\s<>]+|<https?:\/\/[^\s<>]+>', '', input_text)
    # Remove markdown-style image references: ![alt](src).
    text = re.sub(r'!\[.*?\]\(.*?\)', '', text)
    # Remove HTML tags.
    text = re.sub(r'<[^>]+>', '', text)
    # Remove bracketed/parenthesised spans. This must run BEFORE the
    # punctuation strip below — the original ran it after, by which point
    # every bracket had already been deleted, so the pattern never matched
    # and bracketed content leaked into the output.
    text = re.sub(r'\[.*?\]|\(.*?\)', '', text)
    # Keep only ASCII letters and whitespace.
    text = re.sub(r'[^a-zA-Z\s]', '', text)
    return text.strip()
# Crawling for marketing
async def marketing_crawling(url):
    """Crawl *url* and return its markdown cleaned down to plain words.

    Uses crawl4ai with default browser/run configuration, then passes the
    page markdown through remove_links_and_pics.
    """
    async with AsyncWebCrawler(config=BrowserConfig()) as crawler:
        page = await crawler.arun(url=url, config=CrawlerRunConfig())
        return remove_links_and_pics(page.markdown)  # type: ignore
# Crawling for SEO
async def seo_crawling(url):
    """Crawl *url* and return the page's raw markdown unmodified.

    Uses crawl4ai with default browser/run configuration.
    """
    browser_cfg = BrowserConfig()  # default browser configuration
    crawl_cfg = CrawlerRunConfig()  # default crawl run configuration
    async with AsyncWebCrawler(config=browser_cfg) as crawler:
        outcome = await crawler.arun(url=url, config=crawl_cfg)
        return outcome.markdown  # type: ignore
# Example invocation:
# asyncio.run(marketing_crawling("https://allsolarworks.com/"))