Spaces:
Runtime error
Runtime error
| # %% | |
| import re | |
| import fitz | |
| import pandas as pd | |
| # %% | |
# PDF that the cells below open and parse.
document_path = "data/PROPUESTA-DE-BORRADOR-CONSTITUCIONAL-14.05.22-1.pdf"
# %%
# Article heading: "<digits>.-", an optional space, the word "Artículo", then
# the shortest run of characters that ends in "." or "-". The whole heading is
# one capture group, so `split()` keeps the headings in its result.
regex_article = re.compile(
    r"(\d+\.- ?"
    r"Artículo.+?"
    r"(?:\.|-))"
)
# Chapter heading: must follow a newline, reads "CAPÍTULO (COM <digits>) ",
# a newline, and then the next line up to (not including) a trailing " \n".
regex_chapters = re.compile(
    r"(?<=\n)"
    r"(CAPÍTULO \(COM \d+\) \n.+?)"
    r"(?= \n)"
)
# Start index used when iterating over `regex.split(...)` results: element 0
# is the text that precedes the first heading, and it is skipped.
skip_header_offset = 1
| # %% | |
# Read the PDF once and produce:
#   document     - the text of every page, concatenated in page order
#   page_article - article heading -> page number on which it was found
page_article = {}
pdf_page_offset = 1  # start value for enumerate(), so pages are numbered from 1
page_texts = []
with fitz.open(document_path) as doc:
    for page_idx, page in enumerate(doc, pdf_page_offset):
        text = page.get_text()
        # Collect the pages and join once at the end instead of growing one
        # string with `+=` on every iteration.
        page_texts.append(text)
        # Headings are searched page by page; if the same heading text is
        # found on several pages, the last page wins.
        for article in regex_article.findall(text):
            page_article[article] = page_idx
document = "".join(page_texts)
len(page_article)
| # %% | |
# Map each chapter heading to the text that follows it.
# `split()` with a capturing pattern returns headings and bodies interleaved;
# a heading updates the current name, any other piece is stored under it.
chapters = {}
chapter_name = "header"
splited_chapters = regex_chapters.split(document)
for chapter in splited_chapters[skip_header_offset:]:
    if not chapter.startswith("CAPÍTULO"):
        # Body text: file it under the most recently seen heading.
        chapters[chapter_name] = chapter
        continue
    # Heading: collapse its internal " \n" line break into ": ".
    chapter_name = chapter.replace(" \n", ": ")
len(chapters), chapters.keys()
| # %% | |
# Fragments of this length or shorter are dropped by format_article().
minimum_article_length = 65


def format_article(article):
    """Return the body of an article as a single cleaned-up sentence.

    Leading "-" and space characters are stripped from ``article``, which is
    then split on ``"\\n \\n"``. Each fragment has its newlines and asterisks
    removed and its surrounding whitespace trimmed. A fragment is kept when
    it is longer than ``minimum_article_length`` characters or when it starts
    with "El Estado"; every other fragment is discarded. The kept fragments
    are joined with single spaces.
    """
    cleaned_fragments = (
        fragment.replace("\n", "").replace("*", "").strip()
        for fragment in article.lstrip("- ").split("\n \n")
    )
    return " ".join(
        text
        for text in cleaned_fragments
        if text.startswith("El Estado") or len(text) > minimum_article_length
    )
| # %% | |
# Build one record per article body found in each chapter.
chapter_articles = []
for chapter_name, chapter in chapters.items():
    # Name used for any body text met before the chapter's first heading
    # that survives the skip below.
    article_name = "header"
    splited_articles = regex_article.split(chapter)
    for article in splited_articles[skip_header_offset:]:
        if regex_article.match(article) is None:
            # Body text: record it under the heading seen just before it.
            chapter_articles.append(
                {
                    "chapter_name": chapter_name,
                    "article_page": page_article.get(article_name),
                    "article_name": article_name,
                    "article": format_article(article),
                }
            )
        else:
            # Heading: remember it for the body piece that follows.
            article_name = article
| # %% | |
# Turn the list of article records into a table.
# `chapter_articles` is a list of dicts, so the plain DataFrame constructor is
# used; `DataFrame.from_dict` is documented for dict input only.
df_document = pd.DataFrame(chapter_articles)
# The leading digits of the raw heading become the article number. This must
# run before `article_name` is overwritten below.
df_document["article_number"] = (
    df_document['article_name']
    .str.extract(r'(^\d+)', expand=False)
)
# Keep only the text after the "<digits>.-" prefix and drop any trailing
# "." / "-" characters.
df_document["article_name"] = (
    df_document['article_name']
    .str.extract(r'^\d+\.- ?(.*)', expand=False)
    .str.rstrip(".-")
)
df_document.head()
# %%
# Write the table to disk without the index column.
df_document.to_csv("data/articles.csv", index=False)
| # %% | |