# crawl.py — crawling helpers (update by AhsanRazi, commit 7a5ef29, verified)
import asyncio
from crawl4ai import AsyncWebCrawler
from crawl4ai.async_configs import BrowserConfig, CrawlerRunConfig
import re
def remove_links_and_pics(input_text):
    """Strip links, images, HTML tags and non-letter characters from *input_text*.

    Cleaning passes, applied in order:
      1. bare URLs and angle-bracketed ``<URL>`` forms,
      2. markdown image syntax ``![alt](src)``,
      3. HTML tags,
      4. every character that is not an ASCII letter or whitespace
         (this also drops digits and all punctuation).

    Returns the cleaned text with leading/trailing whitespace removed.
    """
    # Remove all links (bare URLs and <URL> forms)
    text = re.sub(r'https?:\/\/[^\s<>]+|<https?:\/\/[^\s<>]+>', '', input_text)
    # Remove markdown-style image references
    text = re.sub(r'!\[.*?\]\(.*?\)', '', text)
    # Remove HTML tags
    text = re.sub(r'<[^>]+>', '', text)
    # Keep only ASCII letters and whitespace
    text = re.sub(r'[^a-zA-Z\s]', '', text)
    # NOTE: the original ran a final bracket-stripping pass
    # (r'\[.*?\]|\(.*?\)') here, but after the previous step no bracket
    # characters can remain, so it was dead code and has been removed.
    return text.strip()
# Crawling for marketing
async def marketing_crawling(url):
    """Crawl *url* with default browser/run settings and return its markdown
    content cleaned via remove_links_and_pics (links, images, HTML and
    special characters stripped)."""
    async with AsyncWebCrawler(config=BrowserConfig()) as crawler:
        page = await crawler.arun(url=url, config=CrawlerRunConfig())
    return remove_links_and_pics(page.markdown)  # type: ignore
# Crawling for SEO
async def seo_crawling(url):
    """Crawl *url* with default browser/run settings and return the raw
    markdown content (no post-processing, unlike marketing_crawling)."""
    async with AsyncWebCrawler(config=BrowserConfig()) as crawler:
        page = await crawler.arun(url=url, config=CrawlerRunConfig())
    return page.markdown  # type: ignore
# asyncio.run(marketing_crawling("https://allsolarworks.com/"))