| import os |
| import gradio as gr |
| from datasets import ClassLabel |
| from datasets import load_dataset |
| import random |
| import pandas as pd |
| from huggingface_hub import login |
|
|
def remove_space(example):
    """Clean the 'Corpus-Delicti' (crime facts) field of one dataset example.

    Processing steps:

    1. Remove every ASCII space character (the corpus uses spaces as
       word-segmentation separators).
    2. Split the text on the list marker '一、' and keep the segment that
       follows its first occurrence (i.e. the text between the first and the
       second marker, or up to the end when there is only one marker).
    3. Strip the heading '犯罪事實:' from the kept segment.

    When the marker '一、' does not occur at all, the whole de-spaced text is
    used instead of raising ``IndexError``, so a single unusual record cannot
    abort ``dataset.map(remove_space)``.

    Args:
        example: Mapping that holds a string under the key 'Corpus-Delicti'.

    Returns:
        A dict with the single key 'Corpus-Delicti' mapped to the cleaned text.
    """
    text = example['Corpus-Delicti'].replace(" ", "")
    segments = text.split('一、')
    # split() yields a single element when the marker is missing; indexing
    # [1] unconditionally would raise IndexError in that case.
    body = segments[1] if len(segments) > 1 else text
    return {'Corpus-Delicti': body.replace('犯罪事實:', '')}
|
|
def download_file(content, filename):
    """Write ``content`` to the file at ``filename`` as UTF-8 text.

    The file is opened in write mode, so an existing file is overwritten.

    Args:
        content: Text to write.
        filename: Path of the file to create or overwrite.
    """
    with open(filename, mode="w", encoding="utf-8") as out:
        out.write(content)
|
|
def random_elements(dataset, num_examples=5):
    """Return ``num_examples`` distinct, randomly chosen rows as a DataFrame.

    Args:
        dataset: Object supporting ``len()``, indexing with a list of row
            positions, and a ``features`` mapping of column name to feature
            type.
        num_examples: Number of distinct rows to draw. Must not exceed
            ``len(dataset)``.

    Returns:
        A ``pandas.DataFrame`` built from the selected rows. Columns whose
        feature type is a ``ClassLabel`` have their integer ids replaced by
        the corresponding label names.

    Raises:
        AssertionError: If ``num_examples`` is larger than the dataset.
    """
    assert num_examples <= len(dataset), "Can't pick more elements than there are in the dataset."
    # random.sample draws without replacement in one call. The previous
    # retry-until-unique loop was quadratic and would spin forever if the
    # assertion above is stripped (python -O) and the request is too large;
    # random.sample raises ValueError in that situation instead.
    # max(..., 0) keeps the old behaviour of returning no rows for a
    # non-positive count.
    picks = random.sample(range(len(dataset)), max(num_examples, 0))

    df = pd.DataFrame(dataset[picks])
    for column, typ in dataset.features.items():
        if isinstance(typ, ClassLabel):
            # Show human-readable label names rather than integer class ids.
            df[column] = df[column].transform(lambda i: typ.names[i])
    return df
|
|
def random_next(num_examples=5):
    """Pick a random case from the training split and describe it.

    ``num_examples`` rows are sampled from ``dataset["train"]`` via
    ``random_elements``; only the first sampled row is used.

    Args:
        num_examples: Number of rows to sample before taking the first one.

    Returns:
        A 4-tuple ``(court_name, case_no, crime_descrip, filename)`` where
        ``filename`` is ``"<court_name>_<case_no>.txt"``.
    """
    sample = random_elements(dataset["train"], num_examples=num_examples)
    court, case, facts = (
        sample[column][0] for column in ('Court', 'CaseNo', 'Corpus-Delicti')
    )
    return (court, case, facts, court + "_" + case + '.txt')
|
|
def gen_template(crime_descrip, element, tag):
    """Assemble the annotation text for one crime description.

    The result consists of an English instruction line followed by
    '### Description:', '### Element:' and '### Tag:' sections and a closing
    '### End' marker, with a blank line between consecutive parts.

    Args:
        crime_descrip: Text placed in the description section.
        element: Text for the element section; a placeholder is used when
            this value is falsy.
        tag: Text for the tag section; a placeholder is used when this value
            is falsy.

    Returns:
        The assembled template string.
    """
    intro = "The following is a description of the crime in the verdict. Write a response for the legal element of crime and its tag that appropriately completes the request."

    # Fall back to the placeholders when nothing was entered or selected.
    element_text = element or "<未填寫構成要件要素>"
    tag_text = tag or "<未選取構成要件要素標籤>"

    sections = (
        f"{intro}\n",
        f"### Description:\n{crime_descrip}\n",
        f"### Element:\n{element_text}\n",
        f"### Tag:\n{tag_text}\n",
        "### End",
    )
    # Each section already ends with a newline, so joining on "\n" leaves
    # exactly one blank line between sections.
    return "\n".join(sections)
|
|
| |
# Front-end JavaScript used as the `js=` handler of the download button.
# It receives the generated text and the target file name, wraps the text in
# a Blob, and triggers a browser download through a temporary <a> element.
# The temporary element is removed and the object URL revoked afterwards so
# repeated downloads do not accumulate DOM nodes or keep Blobs alive.
js_download = '''function downloadFile(result, filename) {
    //藉型別陣列建構的 blob 來建立 URL
    let fileName = filename;
    const data = result;
    let blob = new Blob([data], {
        type: "application/octet-stream",
    });
    var href = URL.createObjectURL(blob);
    // 從 Blob 取出資料
    var link = document.createElement("a");
    document.body.appendChild(link);
    link.href = href;
    link.download = fileName;
    link.click();
    // Clean up: drop the temporary link and release the Blob URL once the
    // click has been dispatched.
    document.body.removeChild(link);
    setTimeout(function () { URL.revokeObjectURL(href); }, 0);
}
'''
|
|
| |
# Authenticate against the Hugging Face Hub with the token from the
# HUB_TOKEN environment variable (KeyError if it is not set), then load the
# dataset and clean the 'Corpus-Delicti' column of every example.
use_auth_token = os.environ['HUB_TOKEN']
login(token=use_auth_token, add_to_git_credential=True)
dataset = load_dataset(
    "jslin09/Fraud_Case_Verdicts",
    token=use_auth_token,
    revision="main",
).map(remove_space)
|
|
| |
# Draw one random case at import time; its fields are used below as the
# initial values of the UI components.
random_selected = random_next()
court_name, case_no, crime_descrip, filename = random_selected
|
|
# Build the Gradio Blocks UI for the annotation tool.
# NOTE(review): the nesting of the `with` blocks below (and the whitespace
# inside the Markdown literal) is a best-effort reconstruction — confirm the
# intended layout before relying on it.
with gr.Blocks() as demo:
    # Page title.
    gr.Markdown(
"""
<h1 style="text-align: center;">Legal Document Annotation</h1>
""")
    with gr.Row():
        with gr.Column():
            with gr.Row():
                # Court name and case number are kept in hidden labels
                # (visible=False); they are still updated by the random-pick
                # button below.
                courtName = gr.Label(label='法院名稱', value=court_name, visible=False)
                caseNo = gr.Label(label='案號', value=case_no, visible=False)
                # The right-hand `filename` is the module-level string; after
                # this line the name `filename` refers to the Textbox
                # component instead.
                filename = gr.components.Textbox(label='案號',value=filename, show_copy_button=True)
            # Crime description of the currently selected case.
            prompt = gr.components.Textbox(lines=5, label='犯罪事實',value=crime_descrip)
            with gr.Row():
                with gr.Column():
                    btn = gr.Button("🎲 隨機選擇")

            with gr.Row():
                # Free-text legal element plus its tag; the dropdown shows the
                # first item of each pair and returns the second
                # (type='value').
                element = gr.components.Textbox(lines=2, label="構成要件要素")
                tag = gr.Dropdown(choices = [("被告(犯罪主體)","<LEO_SOC>"), ("主觀犯意", "<LEO_SLE>"), ("不法行為","<LEO_ACT>"), ("因果關係","<LEO_CAU>"),
                                             ("被害人/告訴人","<LEO_VIC>"), ("危害結果","<LEO_ROH>"), ("未遂","<LEO_ATP>"), ("既遂","<LEO_ACC>"),
                                             ("中止","<LEO_ABA>"), ("預備","<LEO_PRP>")],
                                  label="標籤", info="構成要件要素的標籤", type='value')
        with gr.Column():
            # Output box that receives the generated annotation text.
            result = gr.components.Textbox(lines=5, label="語料內容", show_copy_button=True)
            with gr.Row():
                with gr.Column():
                    with gr.Row():
                        btn2 = gr.Button("📖 產生標註語料內容")
                    with gr.Row():
                        btn3 = gr.Button("💾 下載")
    # random_next returns (court_name, case_no, crime_descrip, filename),
    # matching the four output components in order.
    btn.click(random_next, inputs=[], outputs=[courtName, caseNo, prompt, filename])
    # Build the annotation text from the description, element and tag.
    btn2.click(gen_template, inputs=[prompt, element, tag], outputs=[result])
    # No Python callback: only the JavaScript in js_download is given, with
    # the result text and the file name as inputs.
    btn3.click(None, inputs=[result, filename], js=js_download)

| |
|
|
# Launch the Blocks app only when this file is executed as a script.
if __name__ == "__main__":
    demo.launch()