Merge branch 'frontier' of https://github.com/Menghuan1918/gpt_academic into Menghuan1918-frontier

This commit is contained in:
binary-husky 2024-12-07 22:45:41 +08:00
commit c288701751
3 changed files with 238 additions and 132 deletions

View File

@ -47,7 +47,7 @@ def 批量翻译PDF文档(txt, llm_kwargs, plugin_kwargs, chatbot, history, syst
yield from 解析PDF_基于DOC2X(file_manifest, project_folder, llm_kwargs, plugin_kwargs, chatbot, history, system_prompt, DOC2X_API_KEY, user_request) yield from 解析PDF_基于DOC2X(file_manifest, project_folder, llm_kwargs, plugin_kwargs, chatbot, history, system_prompt, DOC2X_API_KEY, user_request)
return return
except: except:
chatbot.append([None, f"DOC2X服务不可用现在将执行效果稍差的旧版代码{trimmed_format_exc_markdown()}"]) chatbot.append([None, f"DOC2X服务不可用请检查报错详细{trimmed_format_exc_markdown()}"])
yield from update_ui(chatbot=chatbot, history=history) yield from update_ui(chatbot=chatbot, history=history)
if method == "GROBID": if method == "GROBID":

View File

@ -300,7 +300,8 @@ def Latex精细分解与转化(file_manifest, project_folder, llm_kwargs, plugin
write_html(pfg.sp_file_contents, pfg.sp_file_result, chatbot=chatbot, project_folder=project_folder) write_html(pfg.sp_file_contents, pfg.sp_file_result, chatbot=chatbot, project_folder=project_folder)
# <-------- 写出文件 ----------> # <-------- 写出文件 ---------->
msg = f"当前大语言模型: {llm_kwargs['llm_model']},当前语言模型温度设定: {llm_kwargs['temperature']}" model_name = llm_kwargs['llm_model'].replace('_', '\\_') # 替换LLM模型名称中的下划线为转义字符
msg = f"当前大语言模型: {model_name},当前语言模型温度设定: {llm_kwargs['temperature']}"
final_tex = lps.merge_result(pfg.file_result, mode, msg) final_tex = lps.merge_result(pfg.file_result, mode, msg)
objdump((lps, pfg.file_result, mode, msg), file=pj(project_folder,'merge_result.pkl')) objdump((lps, pfg.file_result, mode, msg), file=pj(project_folder,'merge_result.pkl'))
@ -342,7 +343,6 @@ def remove_buggy_lines(file_path, log_path, tex_name, tex_name_pure, n_fix, work
logger.error("Fatal error occurred, but we cannot identify error, please download zip, read latex log, and compile manually.") logger.error("Fatal error occurred, but we cannot identify error, please download zip, read latex log, and compile manually.")
return False, -1, [-1] return False, -1, [-1]
def 编译Latex(chatbot, history, main_file_original, main_file_modified, work_folder_original, work_folder_modified, work_folder, mode='default'): def 编译Latex(chatbot, history, main_file_original, main_file_modified, work_folder_original, work_folder_modified, work_folder, mode='default'):
import os, time import os, time
n_fix = 1 n_fix = 1
@ -351,6 +351,31 @@ def 编译Latex(chatbot, history, main_file_original, main_file_modified, work_f
chatbot.append([f"正在编译PDF文档", f'编译已经开始。当前工作路径为{work_folder}如果程序停顿5分钟以上请直接去该路径下取回翻译结果或者重启之后再度尝试 ...']); yield from update_ui(chatbot=chatbot, history=history) chatbot.append([f"正在编译PDF文档", f'编译已经开始。当前工作路径为{work_folder}如果程序停顿5分钟以上请直接去该路径下取回翻译结果或者重启之后再度尝试 ...']); yield from update_ui(chatbot=chatbot, history=history)
chatbot.append([f"正在编译PDF文档", '...']); yield from update_ui(chatbot=chatbot, history=history); time.sleep(1); chatbot[-1] = list(chatbot[-1]) # 刷新界面 chatbot.append([f"正在编译PDF文档", '...']); yield from update_ui(chatbot=chatbot, history=history); time.sleep(1); chatbot[-1] = list(chatbot[-1]) # 刷新界面
yield from update_ui_lastest_msg('编译已经开始...', chatbot, history) # 刷新Gradio前端界面 yield from update_ui_lastest_msg('编译已经开始...', chatbot, history) # 刷新Gradio前端界面
# 检查是否需要使用xelatex
def check_if_need_xelatex(tex_path):
    """
    Report whether the TeX file at tex_path mentions an xelatex-related package.

    Only the first 5000 characters are inspected (read as UTF-8, undecodable
    bytes replaced). The check is a plain substring search for the names listed
    below. Returns False when the file cannot be read.
    """
    xelatex_markers = ('fontspec', 'xeCJK', 'xetex', 'unicode-math', 'xltxtra', 'xunicode')
    try:
        with open(tex_path, 'r', encoding='utf-8', errors='replace') as tex_file:
            head = tex_file.read(5000)
    except Exception:
        # Best effort: an unreadable file is treated as not needing xelatex.
        return False
    # Look for any package name associated with xelatex.
    for marker in xelatex_markers:
        if marker in head:
            return True
    return False
# 根据编译器类型返回编译命令
def get_compile_command(compiler, filename):
    """Build the batch-mode command line that compiles `<filename>.tex` with the given compiler."""
    options = '-interaction=batchmode -file-line-error'
    return f'{compiler} {options} {filename}.tex'
# 确定使用的编译器
compiler = 'pdflatex'
if check_if_need_xelatex(pj(work_folder_modified, f'{main_file_modified}.tex')):
logger.info("检测到宏包需要xelatex编译切换至xelatex编译")
# Check if xelatex is installed
try:
import subprocess
subprocess.run(['xelatex', '--version'], capture_output=True, check=True)
compiler = 'xelatex'
except (subprocess.CalledProcessError, FileNotFoundError):
raise RuntimeError("检测到需要使用xelatex编译但系统中未安装xelatex。请先安装texlive或其他提供xelatex的LaTeX发行版。")
while True: while True:
import os import os
@ -361,10 +386,10 @@ def 编译Latex(chatbot, history, main_file_original, main_file_modified, work_f
# https://stackoverflow.com/questions/738755/dont-make-me-manually-abort-a-latex-compile-when-theres-an-error # https://stackoverflow.com/questions/738755/dont-make-me-manually-abort-a-latex-compile-when-theres-an-error
yield from update_ui_lastest_msg(f'尝试第 {n_fix}/{max_try} 次编译, 编译原始PDF ...', chatbot, history) # 刷新Gradio前端界面 yield from update_ui_lastest_msg(f'尝试第 {n_fix}/{max_try} 次编译, 编译原始PDF ...', chatbot, history) # 刷新Gradio前端界面
ok = compile_latex_with_timeout(f'pdflatex -interaction=batchmode -file-line-error {main_file_original}.tex', work_folder_original) ok = compile_latex_with_timeout(get_compile_command(compiler, main_file_original), work_folder_original)
yield from update_ui_lastest_msg(f'尝试第 {n_fix}/{max_try} 次编译, 编译转化后的PDF ...', chatbot, history) # 刷新Gradio前端界面 yield from update_ui_lastest_msg(f'尝试第 {n_fix}/{max_try} 次编译, 编译转化后的PDF ...', chatbot, history) # 刷新Gradio前端界面
ok = compile_latex_with_timeout(f'pdflatex -interaction=batchmode -file-line-error {main_file_modified}.tex', work_folder_modified) ok = compile_latex_with_timeout(get_compile_command(compiler, main_file_modified), work_folder_modified)
if ok and os.path.exists(pj(work_folder_modified, f'{main_file_modified}.pdf')): if ok and os.path.exists(pj(work_folder_modified, f'{main_file_modified}.pdf')):
# 只有第二步成功,才能继续下面的步骤 # 只有第二步成功,才能继续下面的步骤
@ -375,10 +400,10 @@ def 编译Latex(chatbot, history, main_file_original, main_file_modified, work_f
ok = compile_latex_with_timeout(f'bibtex {main_file_modified}.aux', work_folder_modified) ok = compile_latex_with_timeout(f'bibtex {main_file_modified}.aux', work_folder_modified)
yield from update_ui_lastest_msg(f'尝试第 {n_fix}/{max_try} 次编译, 编译文献交叉引用 ...', chatbot, history) # 刷新Gradio前端界面 yield from update_ui_lastest_msg(f'尝试第 {n_fix}/{max_try} 次编译, 编译文献交叉引用 ...', chatbot, history) # 刷新Gradio前端界面
ok = compile_latex_with_timeout(f'pdflatex -interaction=batchmode -file-line-error {main_file_original}.tex', work_folder_original) ok = compile_latex_with_timeout(get_compile_command(compiler, main_file_original), work_folder_original)
ok = compile_latex_with_timeout(f'pdflatex -interaction=batchmode -file-line-error {main_file_modified}.tex', work_folder_modified) ok = compile_latex_with_timeout(get_compile_command(compiler, main_file_modified), work_folder_modified)
ok = compile_latex_with_timeout(f'pdflatex -interaction=batchmode -file-line-error {main_file_original}.tex', work_folder_original) ok = compile_latex_with_timeout(get_compile_command(compiler, main_file_original), work_folder_original)
ok = compile_latex_with_timeout(f'pdflatex -interaction=batchmode -file-line-error {main_file_modified}.tex', work_folder_modified) ok = compile_latex_with_timeout(get_compile_command(compiler, main_file_modified), work_folder_modified)
if mode!='translate_zh': if mode!='translate_zh':
yield from update_ui_lastest_msg(f'尝试第 {n_fix}/{max_try} 次编译, 使用latexdiff生成论文转化前后对比 ...', chatbot, history) # 刷新Gradio前端界面 yield from update_ui_lastest_msg(f'尝试第 {n_fix}/{max_try} 次编译, 使用latexdiff生成论文转化前后对比 ...', chatbot, history) # 刷新Gradio前端界面
@ -386,10 +411,10 @@ def 编译Latex(chatbot, history, main_file_original, main_file_modified, work_f
ok = compile_latex_with_timeout(f'latexdiff --encoding=utf8 --append-safecmd=subfile {work_folder_original}/{main_file_original}.tex {work_folder_modified}/{main_file_modified}.tex --flatten > {work_folder}/merge_diff.tex', os.getcwd()) ok = compile_latex_with_timeout(f'latexdiff --encoding=utf8 --append-safecmd=subfile {work_folder_original}/{main_file_original}.tex {work_folder_modified}/{main_file_modified}.tex --flatten > {work_folder}/merge_diff.tex', os.getcwd())
yield from update_ui_lastest_msg(f'尝试第 {n_fix}/{max_try} 次编译, 正在编译对比PDF ...', chatbot, history) # 刷新Gradio前端界面 yield from update_ui_lastest_msg(f'尝试第 {n_fix}/{max_try} 次编译, 正在编译对比PDF ...', chatbot, history) # 刷新Gradio前端界面
ok = compile_latex_with_timeout(f'pdflatex -interaction=batchmode -file-line-error merge_diff.tex', work_folder) ok = compile_latex_with_timeout(get_compile_command(compiler, 'merge_diff'), work_folder)
ok = compile_latex_with_timeout(f'bibtex merge_diff.aux', work_folder) ok = compile_latex_with_timeout(f'bibtex merge_diff.aux', work_folder)
ok = compile_latex_with_timeout(f'pdflatex -interaction=batchmode -file-line-error merge_diff.tex', work_folder) ok = compile_latex_with_timeout(get_compile_command(compiler, 'merge_diff'), work_folder)
ok = compile_latex_with_timeout(f'pdflatex -interaction=batchmode -file-line-error merge_diff.tex', work_folder) ok = compile_latex_with_timeout(get_compile_command(compiler, 'merge_diff'), work_folder)
# <---------- 检查结果 -----------> # <---------- 检查结果 ----------->
results_ = "" results_ = ""

View File

@ -1,125 +1,175 @@
from toolbox import get_log_folder, gen_time_str, get_conf from toolbox import get_log_folder, gen_time_str, get_conf
from toolbox import update_ui, promote_file_to_downloadzone from toolbox import update_ui, promote_file_to_downloadzone
from toolbox import promote_file_to_downloadzone, extract_archive from toolbox import extract_archive
from toolbox import generate_file_link, zip_folder from toolbox import generate_file_link, zip_folder
from crazy_functions.crazy_utils import get_files_from_everything from crazy_functions.crazy_utils import get_files_from_everything
from shared_utils.colorful import * from shared_utils.colorful import *
from loguru import logger from loguru import logger
import os import os
import requests
import time import time
def refresh_key(doc2x_api_key):
    """
    Exchange a Doc2x API key for a refreshed token.

    Args:
        doc2x_api_key: Current key, sent as a Bearer token.
    Returns:
        The value found at data.token in the JSON response.
    Raises:
        RuntimeError: If the service answers with a status code other than 200.
    """
    import requests, json

    url = "https://api.doc2x.noedgeai.com/api/token/refresh"
    # Bound the wait so an unresponsive server cannot block the caller forever.
    res = requests.post(
        url,
        headers={"Authorization": "Bearer " + doc2x_api_key},
        timeout=15,
    )
    if res.status_code != 200:
        raise RuntimeError("[ERROR] status code: %d, body: %s" % (res.status_code, res.text))
    res_json = json.loads(res.content.decode("utf-8"))
    return res_json['data']['token']
def retry_request(max_retries=3, delay=3):
    """
    Decorator factory that re-invokes the wrapped callable when it raises.

    Args:
        max_retries: Maximum number of attempts in total (not extra retries).
        delay: Delay between attempts in seconds.
    Returns:
        A decorator. The decorated function returns whatever the wrapped call
        returns; when the last attempt fails, its exception is re-raised.
        With max_retries < 1 the wrapped callable is never invoked and None
        is returned.
    """
    import functools

    def decorator(func):
        @functools.wraps(func)  # keep the wrapped function's name and docstring
        def wrapper(*args, **kwargs):
            for attempt in range(max_retries):
                try:
                    return func(*args, **kwargs)
                except Exception as e:
                    if attempt >= max_retries - 1:
                        # Out of attempts: a bare raise keeps the original traceback.
                        raise
                    logger.error(
                        f"Request failed, retrying... ({attempt + 1}/{max_retries}) Error: {e}"
                    )
                    time.sleep(delay)
            return None
        return wrapper
    return decorator
@retry_request()
def make_request(method, url, **kwargs):
    """
    Send an HTTP request through requests.request, wrapped by retry_request().

    Args:
        method: HTTP method name.
        url: Target URL.
        **kwargs: Forwarded unchanged to requests.request.
    Returns:
        The response object returned by requests.request.
    """
    response = requests.request(method, url, **kwargs)
    return response
def 状态检查(response, uid=""):
    """
    Validate a Doc2x API response and return its "data" payload.

    Args:
        response: Response object exposing status_code, headers and json().
        uid: Optional text appended after the trace id in error messages.
    Returns:
        The "data" field of the JSON body (an empty dict when absent or null).
    Raises:
        RuntimeError: On a non-200 status, a body that is not a JSON object,
            a limit-related code, or any code other than "ok"/"success".
    """
    trace_id = response.headers.get("trace-id", "Failed to get trace-id")
    try:
        response_json = response.json()
    except ValueError:
        # Body is not JSON (e.g. an HTML error page); fall back to the raw
        # text so the status check below can still report something useful.
        response_json = None
    if response.status_code != 200 or not isinstance(response_json, dict):
        body = response_json if response_json is not None else getattr(response, "text", "")
        raise RuntimeError(
            f"Doc2x return an error:\nTrace ID: {trace_id} {uid}\n{response.status_code} - {body}"
        )

    response_data = response_json.get("data")
    if response_data is None:
        # A missing or null "data" field is treated as an empty payload.
        response_data = {}
    code = response_json.get("code", "Unknown")
    if isinstance(response_data, dict):
        meg = response_data.get("message", response_json)
    else:
        meg = response_json

    if code in ["parse_page_limit_exceeded", "parse_concurrency_limit"]:
        raise RuntimeError(
            f"Reached the limit of Doc2x:\nTrace ID: {trace_id} {uid}\n{code} - {meg}"
        )
    if code not in ["ok", "success"]:
        raise RuntimeError(
            f"Doc2x return an error:\nTrace ID: {trace_id} {uid}\n{code} - {meg}"
        )
    return response_data
def 解析PDF_DOC2X_转Latex(pdf_file_path): def 解析PDF_DOC2X_转Latex(pdf_file_path):
zip_file_path, unzipped_folder = 解析PDF_DOC2X(pdf_file_path, format='tex') zip_file_path, unzipped_folder = 解析PDF_DOC2X(pdf_file_path, format="tex")
return unzipped_folder return unzipped_folder
def 解析PDF_DOC2X(pdf_file_path, format='tex'): def 解析PDF_DOC2X(pdf_file_path, format="tex"):
""" """
format: 'tex', 'md', 'docx' format: 'tex', 'md', 'docx'
""" """
import requests, json, os
DOC2X_API_KEY = get_conf('DOC2X_API_KEY') DOC2X_API_KEY = get_conf("DOC2X_API_KEY")
latex_dir = get_log_folder(plugin_name="pdf_ocr_latex") latex_dir = get_log_folder(plugin_name="pdf_ocr_latex")
markdown_dir = get_log_folder(plugin_name="pdf_ocr") markdown_dir = get_log_folder(plugin_name="pdf_ocr")
doc2x_api_key = DOC2X_API_KEY doc2x_api_key = DOC2X_API_KEY
# < ------ 第1步预上传获取URL然后上传文件 ------ >
logger.info("Doc2x 上传文件预上传获取URL")
res = make_request(
"POST",
"https://v2.doc2x.noedgeai.com/api/v2/parse/preupload",
headers={"Authorization": "Bearer " + doc2x_api_key},
timeout=15,
)
res_data = 状态检查(res)
upload_url = res_data["url"]
uuid = res_data["uid"]
# < ------ 第1步上传 ------ > logger.info("Doc2x 上传文件:上传文件")
logger.info("Doc2x 第1步上传") with open(pdf_file_path, "rb") as file:
with open(pdf_file_path, 'rb') as file: res = make_request("PUT", upload_url, data=file, timeout=60)
res = requests.post( res.raise_for_status()
"https://v2.doc2x.noedgeai.com/api/v2/parse/pdf",
headers={"Authorization": "Bearer " + doc2x_api_key},
data=file
)
# res_json = []
if res.status_code == 200:
res_json = res.json()
else:
raise RuntimeError(f"Doc2x return an error: {res.json()}")
uuid = res_json['data']['uid']
# < ------ 第2步轮询等待 ------ > # < ------ 第2步轮询等待 ------ >
logger.info("Doc2x 第2步轮询等待") logger.info("Doc2x 处理文件中:轮询等待")
params = {'uid': uuid} params = {"uid": uuid}
while True: max_attempts = 60
res = requests.get( attempt = 0
'https://v2.doc2x.noedgeai.com/api/v2/parse/status', while attempt < max_attempts:
res = make_request(
"GET",
"https://v2.doc2x.noedgeai.com/api/v2/parse/status",
headers={"Authorization": "Bearer " + doc2x_api_key}, headers={"Authorization": "Bearer " + doc2x_api_key},
params=params params=params,
timeout=15,
) )
res_json = res.json() res_data = 状态检查(res)
if res_json['data']['status'] == "success": if res_data["status"] == "success":
break break
elif res_json['data']['status'] == "processing": elif res_data["status"] == "processing":
time.sleep(3) time.sleep(5)
logger.info(f"Doc2x is processing at {res_json['data']['progress']}%") logger.info(f"Doc2x is processing at {res_data['progress']}%")
elif res_json['data']['status'] == "failed": attempt += 1
raise RuntimeError(f"Doc2x return an error: {res_json}") else:
raise RuntimeError(f"Doc2x return an error: {res_data}")
if attempt >= max_attempts:
raise RuntimeError("Doc2x processing timeout after maximum attempts")
# < ------ 第3步提交转化 ------ > # < ------ 第3步提交转化 ------ >
logger.info("Doc2x 第3步提交转化") logger.info("Doc2x 第3步提交转化")
data = { data = {"uid": uuid, "to": format, "formula_mode": "dollar", "filename": "output"}
"uid": uuid, res = make_request(
"to": format, "POST",
"formula_mode": "dollar", "https://v2.doc2x.noedgeai.com/api/v2/convert/parse",
"filename": "output"
}
res = requests.post(
'https://v2.doc2x.noedgeai.com/api/v2/convert/parse',
headers={"Authorization": "Bearer " + doc2x_api_key}, headers={"Authorization": "Bearer " + doc2x_api_key},
json=data json=data,
timeout=15,
) )
if res.status_code == 200: 状态检查(res, uid=f"uid: {uuid}")
res_json = res.json()
else:
raise RuntimeError(f"Doc2x return an error: {res.json()}")
# < ------ 第4步等待结果 ------ > # < ------ 第4步等待结果 ------ >
logger.info("Doc2x 第4步等待结果") logger.info("Doc2x 第4步等待结果")
params = {'uid': uuid} params = {"uid": uuid}
while True: max_attempts = 36
res = requests.get( attempt = 0
'https://v2.doc2x.noedgeai.com/api/v2/convert/parse/result', while attempt < max_attempts:
res = make_request(
"GET",
"https://v2.doc2x.noedgeai.com/api/v2/convert/parse/result",
headers={"Authorization": "Bearer " + doc2x_api_key}, headers={"Authorization": "Bearer " + doc2x_api_key},
params=params params=params,
timeout=15,
) )
res_json = res.json() res_data = 状态检查(res, uid=f"uid: {uuid}")
if res_json['data']['status'] == "success": if res_data["status"] == "success":
break break
elif res_json['data']['status'] == "processing": elif res_data["status"] == "processing":
time.sleep(3) time.sleep(3)
logger.info(f"Doc2x still processing") logger.info("Doc2x still processing to convert file")
elif res_json['data']['status'] == "failed": attempt += 1
raise RuntimeError(f"Doc2x return an error: {res_json}") if attempt >= max_attempts:
raise RuntimeError("Doc2x conversion timeout after maximum attempts")
# < ------ 第5步最后的处理 ------ > # < ------ 第5步最后的处理 ------ >
logger.info("Doc2x 第5步最后的处理") logger.info("Doc2x 第5步下载转换后的文件")
if format=='tex': if format == "tex":
target_path = latex_dir target_path = latex_dir
if format=='md': if format == "md":
target_path = markdown_dir target_path = markdown_dir
os.makedirs(target_path, exist_ok=True) os.makedirs(target_path, exist_ok=True)
@ -127,17 +177,18 @@ def 解析PDF_DOC2X(pdf_file_path, format='tex'):
# < ------ 下载 ------ > # < ------ 下载 ------ >
for attempt in range(max_attempt): for attempt in range(max_attempt):
try: try:
result_url = res_json['data']['url'] result_url = res_data["url"]
res = requests.get(result_url) res = make_request("GET", result_url, timeout=60)
zip_path = os.path.join(target_path, gen_time_str() + '.zip') zip_path = os.path.join(target_path, gen_time_str() + ".zip")
unzip_path = os.path.join(target_path, gen_time_str()) unzip_path = os.path.join(target_path, gen_time_str())
if res.status_code == 200: if res.status_code == 200:
with open(zip_path, "wb") as f: f.write(res.content) with open(zip_path, "wb") as f:
f.write(res.content)
else: else:
raise RuntimeError(f"Doc2x return an error: {res.json()}") raise RuntimeError(f"Doc2x return an error: {res.json()}")
except Exception as e: except Exception as e:
if attempt < max_attempt - 1: if attempt < max_attempt - 1:
logger.error(f"Failed to download latex file, retrying... {e}") logger.error(f"Failed to download uid = {uuid} file, retrying... {e}")
time.sleep(3) time.sleep(3)
continue continue
else: else:
@ -145,22 +196,32 @@ def 解析PDF_DOC2X(pdf_file_path, format='tex'):
# < ------ 解压 ------ > # < ------ 解压 ------ >
import zipfile import zipfile
with zipfile.ZipFile(zip_path, 'r') as zip_ref:
with zipfile.ZipFile(zip_path, "r") as zip_ref:
zip_ref.extractall(unzip_path) zip_ref.extractall(unzip_path)
return zip_path, unzip_path return zip_path, unzip_path
def 解析PDF_DOC2X_单文件(fp, project_folder, llm_kwargs, plugin_kwargs, chatbot, history, system_prompt, DOC2X_API_KEY, user_request): def 解析PDF_DOC2X_单文件(
fp,
project_folder,
llm_kwargs,
plugin_kwargs,
chatbot,
history,
system_prompt,
DOC2X_API_KEY,
user_request,
):
def pdf2markdown(filepath): def pdf2markdown(filepath):
chatbot.append((None, f"Doc2x 解析中")) chatbot.append((None, f"Doc2x 解析中"))
yield from update_ui(chatbot=chatbot, history=history) # 刷新界面 yield from update_ui(chatbot=chatbot, history=history) # 刷新界面
md_zip_path, unzipped_folder = 解析PDF_DOC2X(filepath, format='md') md_zip_path, unzipped_folder = 解析PDF_DOC2X(filepath, format="md")
promote_file_to_downloadzone(md_zip_path, chatbot=chatbot) promote_file_to_downloadzone(md_zip_path, chatbot=chatbot)
chatbot.append((None, f"完成解析 {md_zip_path} ...")) chatbot.append((None, f"完成解析 {md_zip_path} ..."))
yield from update_ui(chatbot=chatbot, history=history) # 刷新界面 yield from update_ui(chatbot=chatbot, history=history) # 刷新界面
return md_zip_path return md_zip_path
def deliver_to_markdown_plugin(md_zip_path, user_request): def deliver_to_markdown_plugin(md_zip_path, user_request):
@ -174,77 +235,97 @@ def 解析PDF_DOC2X_单文件(fp, project_folder, llm_kwargs, plugin_kwargs, cha
os.makedirs(target_path_base, exist_ok=True) os.makedirs(target_path_base, exist_ok=True)
shutil.copyfile(md_zip_path, this_file_path) shutil.copyfile(md_zip_path, this_file_path)
ex_folder = this_file_path + ".extract" ex_folder = this_file_path + ".extract"
extract_archive( extract_archive(file_path=this_file_path, dest_dir=ex_folder)
file_path=this_file_path, dest_dir=ex_folder
)
# edit markdown files # edit markdown files
success, file_manifest, project_folder = get_files_from_everything(ex_folder, type='.md') success, file_manifest, project_folder = get_files_from_everything(
ex_folder, type=".md"
)
for generated_fp in file_manifest: for generated_fp in file_manifest:
# 修正一些公式问题 # 修正一些公式问题
with open(generated_fp, 'r', encoding='utf8') as f: with open(generated_fp, "r", encoding="utf8") as f:
content = f.read() content = f.read()
# 将公式中的\[ \]替换成$$ # 将公式中的\[ \]替换成$$
content = content.replace(r'\[', r'$$').replace(r'\]', r'$$') content = content.replace(r"\[", r"$$").replace(r"\]", r"$$")
# 将公式中的\( \)替换成$ # 将公式中的\( \)替换成$
content = content.replace(r'\(', r'$').replace(r'\)', r'$') content = content.replace(r"\(", r"$").replace(r"\)", r"$")
content = content.replace('```markdown', '\n').replace('```', '\n') content = content.replace("```markdown", "\n").replace("```", "\n")
with open(generated_fp, 'w', encoding='utf8') as f: with open(generated_fp, "w", encoding="utf8") as f:
f.write(content) f.write(content)
promote_file_to_downloadzone(generated_fp, chatbot=chatbot) promote_file_to_downloadzone(generated_fp, chatbot=chatbot)
yield from update_ui(chatbot=chatbot, history=history) # 刷新界面 yield from update_ui(chatbot=chatbot, history=history) # 刷新界面
# 生成在线预览html # 生成在线预览html
file_name = '在线预览翻译(原文)' + gen_time_str() + '.html' file_name = "在线预览翻译(原文)" + gen_time_str() + ".html"
preview_fp = os.path.join(ex_folder, file_name) preview_fp = os.path.join(ex_folder, file_name)
from shared_utils.advanced_markdown_format import markdown_convertion_for_file from shared_utils.advanced_markdown_format import (
markdown_convertion_for_file,
)
with open(generated_fp, "r", encoding="utf-8") as f: with open(generated_fp, "r", encoding="utf-8") as f:
md = f.read() md = f.read()
# # Markdown中使用不标准的表格需要在表格前加上一个emoji以便公式渲染 # # Markdown中使用不标准的表格需要在表格前加上一个emoji以便公式渲染
# md = re.sub(r'^<table>', r'.<table>', md, flags=re.MULTILINE) # md = re.sub(r'^<table>', r'.<table>', md, flags=re.MULTILINE)
html = markdown_convertion_for_file(md) html = markdown_convertion_for_file(md)
with open(preview_fp, "w", encoding="utf-8") as f: f.write(html) with open(preview_fp, "w", encoding="utf-8") as f:
f.write(html)
chatbot.append([None, f"生成在线预览:{generate_file_link([preview_fp])}"]) chatbot.append([None, f"生成在线预览:{generate_file_link([preview_fp])}"])
promote_file_to_downloadzone(preview_fp, chatbot=chatbot) promote_file_to_downloadzone(preview_fp, chatbot=chatbot)
chatbot.append((None, f"调用Markdown插件 {ex_folder} ...")) chatbot.append((None, f"调用Markdown插件 {ex_folder} ..."))
plugin_kwargs['markdown_expected_output_dir'] = ex_folder plugin_kwargs["markdown_expected_output_dir"] = ex_folder
translated_f_name = 'translated_markdown.md' translated_f_name = "translated_markdown.md"
generated_fp = plugin_kwargs['markdown_expected_output_path'] = os.path.join(ex_folder, translated_f_name) generated_fp = plugin_kwargs["markdown_expected_output_path"] = os.path.join(
yield from update_ui(chatbot=chatbot, history=history) # 刷新界面 ex_folder, translated_f_name
yield from Markdown英译中(ex_folder, llm_kwargs, plugin_kwargs, chatbot, history, system_prompt, user_request) )
yield from update_ui(chatbot=chatbot, history=history) # 刷新界面
yield from Markdown英译中(
ex_folder,
llm_kwargs,
plugin_kwargs,
chatbot,
history,
system_prompt,
user_request,
)
if os.path.exists(generated_fp): if os.path.exists(generated_fp):
# 修正一些公式问题 # 修正一些公式问题
with open(generated_fp, 'r', encoding='utf8') as f: content = f.read() with open(generated_fp, "r", encoding="utf8") as f:
content = content.replace('```markdown', '\n').replace('```', '\n') content = f.read()
content = content.replace("```markdown", "\n").replace("```", "\n")
# Markdown中使用不标准的表格需要在表格前加上一个emoji以便公式渲染 # Markdown中使用不标准的表格需要在表格前加上一个emoji以便公式渲染
# content = re.sub(r'^<table>', r'.<table>', content, flags=re.MULTILINE) # content = re.sub(r'^<table>', r'.<table>', content, flags=re.MULTILINE)
with open(generated_fp, 'w', encoding='utf8') as f: f.write(content) with open(generated_fp, "w", encoding="utf8") as f:
f.write(content)
# 生成在线预览html # 生成在线预览html
file_name = '在线预览翻译' + gen_time_str() + '.html' file_name = "在线预览翻译" + gen_time_str() + ".html"
preview_fp = os.path.join(ex_folder, file_name) preview_fp = os.path.join(ex_folder, file_name)
from shared_utils.advanced_markdown_format import markdown_convertion_for_file from shared_utils.advanced_markdown_format import (
markdown_convertion_for_file,
)
with open(generated_fp, "r", encoding="utf-8") as f: with open(generated_fp, "r", encoding="utf-8") as f:
md = f.read() md = f.read()
html = markdown_convertion_for_file(md) html = markdown_convertion_for_file(md)
with open(preview_fp, "w", encoding="utf-8") as f: f.write(html) with open(preview_fp, "w", encoding="utf-8") as f:
f.write(html)
promote_file_to_downloadzone(preview_fp, chatbot=chatbot) promote_file_to_downloadzone(preview_fp, chatbot=chatbot)
# 生成包含图片的压缩包 # 生成包含图片的压缩包
dest_folder = get_log_folder(chatbot.get_user()) dest_folder = get_log_folder(chatbot.get_user())
zip_name = '翻译后的带图文档.zip' zip_name = "翻译后的带图文档.zip"
zip_folder(source_folder=ex_folder, dest_folder=dest_folder, zip_name=zip_name) zip_folder(
source_folder=ex_folder, dest_folder=dest_folder, zip_name=zip_name
)
zip_fp = os.path.join(dest_folder, zip_name) zip_fp = os.path.join(dest_folder, zip_name)
promote_file_to_downloadzone(zip_fp, chatbot=chatbot) promote_file_to_downloadzone(zip_fp, chatbot=chatbot)
yield from update_ui(chatbot=chatbot, history=history) # 刷新界面 yield from update_ui(chatbot=chatbot, history=history) # 刷新界面
md_zip_path = yield from pdf2markdown(fp) md_zip_path = yield from pdf2markdown(fp)
yield from deliver_to_markdown_plugin(md_zip_path, user_request) yield from deliver_to_markdown_plugin(md_zip_path, user_request)
def 解析PDF_基于DOC2X(file_manifest, *args): def 解析PDF_基于DOC2X(file_manifest, *args):
for index, fp in enumerate(file_manifest): for index, fp in enumerate(file_manifest):
yield from 解析PDF_DOC2X_单文件(fp, *args) yield from 解析PDF_DOC2X_单文件(fp, *args)
return return