Spaces:

simonzhang5429
/

breeztest

Sleeping

App Files Files Community

simonzhang5429 commited on Jun 20, 2024

Commit

5e30561

verified ·

1 Parent(s): e0facf2

Create app.py

Browse files

Files changed (1) hide show

app.py +110 -0

app.py ADDED Viewed

	@@ -0,0 +1,110 @@

+import gradio as gr
+import os
+import shutil
+from pypdf import PdfReader
+from transformers import AutoModelForCausalLM, AutoTokenizer
+import torch
+import fitz
+TOKENIZER_REPO = "MediaTek-Research/Breeze-7B-Instruct-v1_0"
+tokenizer = AutoTokenizer.from_pretrained(TOKENIZER_REPO,local_files_only=False,use_fast=True)
+tran_hints = "请将以下的文字转为繁体:"
+start_flag="<s>"
+end_flag="</s>"
+model = AutoModelForCausalLM.from_pretrained(
+    TOKENIZER_REPO,
+    device_map="auto",
+    local_files_only=False,
+    torch_dtype=torch.bfloat16
+)
+def generate(text):
+    chat_data = []
+    text = text.strip()
+    if text:
+       chat_data.append({"role": "system", "content": text})
+    achat=tokenizer.apply_chat_template(chat_data,return_tensors="pt")
+    #achat=tokenizer.encode(chat_data,return_tensors="pt",max_length=2048)
+    outputs = model.generate(achat.to('cuda'),
+                         max_new_tokens=2048,
+                         top_p=0.01,
+                         top_k=85,
+                         repetition_penalty=1.1,
+                         temperature=0)
+    return tokenizer.decode(outputs[0])
+def tran_txt(input_txt):
+  data_txt=tran_hints+"\n"+input_txt.strip()
+  tran_result=generate(data_txt)
+  print("tran_result="+tran_result)
+#  tran_result=tran_result.strip()
+#  index=tran_result.find(start_flag)
+#  if index>=0:
+#     tran_result=tran_result[len(start_flag):]
+#     tran_result=tran_result.strip()
+#     c_index=tran_result.find(data_txt)
+#     if c_index>=0:
+#        tran_result=tran_result[len(data_txt):]
+#     e_index=tran_result.find(end_flag)
+#     if e_index>=0:
+#        tran_result=tran_result[0:e_index]
+  return tran_result
+def exec_tran(file):
+  temp_file=upload_file(file)
+  page_texts=read_paragraphs(temp_file)
+  temp_result_file=file;
+  file_index=temp_result_file.index('.pdf')
+  if file_index!=-1:
+    temp_result_file=temp_result_file[0:file_index]
+    temp_result_file=temp_result_file+"_result.txt"
+  else :
+    temp_result_file=temp_result_file+"_result.txt"
+  tran_file_name=file.name
+  with open(temp_result_file,'w') as fw:
+    tran_result=tran_txt(tran_hints)
+ #   print(tran_result+"\n")
+    for page_content in page_texts:
+      #lines=page_content.split('\n')
+      #for line_content in lines:
+          #print("input="+line_content)
+          tran_result=tran_txt(page_content)
+ #         print("result="+tran_result)
+          fw.write(tran_result+"\n")
+  return temp_result_file
+def upload_file(file):
+  UPLOAD_FOLDER="./data"
+  if not os.path.exists(UPLOAD_FOLDER):
+    os.mkdir(UPLOAD_FOLDER)
+  return shutil.copy(file,UPLOAD_FOLDER)
+def read_paragraphs(pdf_path):
+    document = fitz.open(pdf_path)
+    paragraphs = []
+    for page in document:
+        text = page.get_text("paragraphs")
+        para_list = text.split('。')
+        paragraphs.extend([para for para in para_list if para.strip()])
+    document.close()
+    return paragraphs
+def load_pdf_pages(filename):
+  page_texts=[]
+  reader = PdfReader(filename)
+  for page in reader.pages:
+     page_texts.append(page.extract_text())
+  return page_texts
+def exec_translate(file):
+  upload_file(file)
+  page_texts=load_pdf_pages(file.name)
+with gr.Blocks() as app:
+   file_output=gr.File()
+   upload_button=gr.UploadButton("上传pdf文件",file_types=["pdf"],file_count="single")
+   upload_button.upload(exec_tran,upload_button,file_output)
+app.launch()