# crawl.py — crawling helpers (update by AhsanRazi, commit 7a5ef29, verified)
import asyncio
from crawl4ai import AsyncWebCrawler
from crawl4ai.async_configs import BrowserConfig, CrawlerRunConfig
import re
def remove_links_and_pics(input_text):
    """Strip links, images, HTML tags and non-letter characters from *input_text*.

    Cleaning passes, applied in order:
      1. bare URLs and angle-bracketed ``<URL>`` forms,
      2. markdown image syntax ``![alt](src)``,
      3. HTML tags,
      4. every character that is not an ASCII letter or whitespace
         (this also drops digits and all punctuation).

    Returns the cleaned text with leading/trailing whitespace removed.
    """
    # Remove all links (bare URLs and <URL> forms)
    text = re.sub(r'https?:\/\/[^\s<>]+|<https?:\/\/[^\s<>]+>', '', input_text)
    # Remove markdown-style image references
    text = re.sub(r'!\[.*?\]\(.*?\)', '', text)
    # Remove HTML tags
    text = re.sub(r'<[^>]+>', '', text)
    # Keep only ASCII letters and whitespace
    text = re.sub(r'[^a-zA-Z\s]', '', text)
    # NOTE: the original ran a final bracket-stripping pass
    # (r'\[.*?\]|\(.*?\)') here, but after the previous step no bracket
    # characters can remain, so it was dead code and has been removed.
    return text.strip()
# Crawling for marketing
async def marketing_crawling(url):
    """Crawl *url* with default browser/run settings and return its markdown
    content cleaned via remove_links_and_pics (links, images, HTML and
    special characters stripped)."""
    async with AsyncWebCrawler(config=BrowserConfig()) as crawler:
        page = await crawler.arun(url=url, config=CrawlerRunConfig())
    return remove_links_and_pics(page.markdown)  # type: ignore
# Crawling for SEO
async def seo_crawling(url):
    """Crawl *url* with default browser/run settings and return the raw
    markdown content (no post-processing, unlike marketing_crawling)."""
    async with AsyncWebCrawler(config=BrowserConfig()) as crawler:
        page = await crawler.arun(url=url, config=CrawlerRunConfig())
    return page.markdown  # type: ignore
# asyncio.run(marketing_crawling("https://allsolarworks.com/"))