添加doc2x超时设置并添加对xelatex编译的支持

This commit is contained in:
Menghuan1918 2024-11-29 15:02:58 +08:00
parent c49e896082
commit 05a5add8da
2 changed files with 40 additions and 13 deletions

View File

@ -342,7 +342,6 @@ def remove_buggy_lines(file_path, log_path, tex_name, tex_name_pure, n_fix, work
logger.error("Fatal error occurred, but we cannot identify error, please download zip, read latex log, and compile manually.") logger.error("Fatal error occurred, but we cannot identify error, please download zip, read latex log, and compile manually.")
return False, -1, [-1] return False, -1, [-1]
def 编译Latex(chatbot, history, main_file_original, main_file_modified, work_folder_original, work_folder_modified, work_folder, mode='default'): def 编译Latex(chatbot, history, main_file_original, main_file_modified, work_folder_original, work_folder_modified, work_folder, mode='default'):
import os, time import os, time
n_fix = 1 n_fix = 1
@ -351,6 +350,24 @@ def 编译Latex(chatbot, history, main_file_original, main_file_modified, work_f
chatbot.append([f"正在编译PDF文档", f'编译已经开始。当前工作路径为{work_folder}如果程序停顿5分钟以上请直接去该路径下取回翻译结果或者重启之后再度尝试 ...']); yield from update_ui(chatbot=chatbot, history=history) chatbot.append([f"正在编译PDF文档", f'编译已经开始。当前工作路径为{work_folder}如果程序停顿5分钟以上请直接去该路径下取回翻译结果或者重启之后再度尝试 ...']); yield from update_ui(chatbot=chatbot, history=history)
chatbot.append([f"正在编译PDF文档", '...']); yield from update_ui(chatbot=chatbot, history=history); time.sleep(1); chatbot[-1] = list(chatbot[-1]) # 刷新界面 chatbot.append([f"正在编译PDF文档", '...']); yield from update_ui(chatbot=chatbot, history=history); time.sleep(1); chatbot[-1] = list(chatbot[-1]) # 刷新界面
yield from update_ui_lastest_msg('编译已经开始...', chatbot, history) # 刷新Gradio前端界面 yield from update_ui_lastest_msg('编译已经开始...', chatbot, history) # 刷新Gradio前端界面
# 检查是否需要使用xelatex
def check_if_need_xelatex(tex_path):
try:
with open(tex_path, 'r', encoding='utf-8', errors='replace') as f:
content = f.read(5000)
# 检查是否有使用xelatex的宏包
return any(pkg in content for pkg in ['fontspec', 'xeCJK', 'xetex', 'unicode-math', 'xltxtra', 'xunicode'])
except Exception:
return False
# 根据编译器类型返回编译命令
def get_compile_command(compiler, filename):
return f'{compiler} -interaction=batchmode -file-line-error {filename}.tex'
# 确定使用的编译器
compiler = 'pdflatex'
if check_if_need_xelatex(pj(work_folder_modified, f'{main_file_modified}.tex')):
compiler = 'xelatex'
while True: while True:
import os import os
@ -361,10 +378,10 @@ def 编译Latex(chatbot, history, main_file_original, main_file_modified, work_f
# https://stackoverflow.com/questions/738755/dont-make-me-manually-abort-a-latex-compile-when-theres-an-error # https://stackoverflow.com/questions/738755/dont-make-me-manually-abort-a-latex-compile-when-theres-an-error
yield from update_ui_lastest_msg(f'尝试第 {n_fix}/{max_try} 次编译, 编译原始PDF ...', chatbot, history) # 刷新Gradio前端界面 yield from update_ui_lastest_msg(f'尝试第 {n_fix}/{max_try} 次编译, 编译原始PDF ...', chatbot, history) # 刷新Gradio前端界面
ok = compile_latex_with_timeout(f'pdflatex -interaction=batchmode -file-line-error {main_file_original}.tex', work_folder_original) ok = compile_latex_with_timeout(get_compile_command(compiler, main_file_original), work_folder_original)
yield from update_ui_lastest_msg(f'尝试第 {n_fix}/{max_try} 次编译, 编译转化后的PDF ...', chatbot, history) # 刷新Gradio前端界面 yield from update_ui_lastest_msg(f'尝试第 {n_fix}/{max_try} 次编译, 编译转化后的PDF ...', chatbot, history) # 刷新Gradio前端界面
ok = compile_latex_with_timeout(f'pdflatex -interaction=batchmode -file-line-error {main_file_modified}.tex', work_folder_modified) ok = compile_latex_with_timeout(get_compile_command(compiler, main_file_modified), work_folder_modified)
if ok and os.path.exists(pj(work_folder_modified, f'{main_file_modified}.pdf')): if ok and os.path.exists(pj(work_folder_modified, f'{main_file_modified}.pdf')):
# 只有第二步成功,才能继续下面的步骤 # 只有第二步成功,才能继续下面的步骤
@ -375,10 +392,10 @@ def 编译Latex(chatbot, history, main_file_original, main_file_modified, work_f
ok = compile_latex_with_timeout(f'bibtex {main_file_modified}.aux', work_folder_modified) ok = compile_latex_with_timeout(f'bibtex {main_file_modified}.aux', work_folder_modified)
yield from update_ui_lastest_msg(f'尝试第 {n_fix}/{max_try} 次编译, 编译文献交叉引用 ...', chatbot, history) # 刷新Gradio前端界面 yield from update_ui_lastest_msg(f'尝试第 {n_fix}/{max_try} 次编译, 编译文献交叉引用 ...', chatbot, history) # 刷新Gradio前端界面
ok = compile_latex_with_timeout(f'pdflatex -interaction=batchmode -file-line-error {main_file_original}.tex', work_folder_original) ok = compile_latex_with_timeout(get_compile_command(compiler, main_file_original), work_folder_original)
ok = compile_latex_with_timeout(f'pdflatex -interaction=batchmode -file-line-error {main_file_modified}.tex', work_folder_modified) ok = compile_latex_with_timeout(get_compile_command(compiler, main_file_modified), work_folder_modified)
ok = compile_latex_with_timeout(f'pdflatex -interaction=batchmode -file-line-error {main_file_original}.tex', work_folder_original) ok = compile_latex_with_timeout(get_compile_command(compiler, main_file_original), work_folder_original)
ok = compile_latex_with_timeout(f'pdflatex -interaction=batchmode -file-line-error {main_file_modified}.tex', work_folder_modified) ok = compile_latex_with_timeout(get_compile_command(compiler, main_file_modified), work_folder_modified)
if mode!='translate_zh': if mode!='translate_zh':
yield from update_ui_lastest_msg(f'尝试第 {n_fix}/{max_try} 次编译, 使用latexdiff生成论文转化前后对比 ...', chatbot, history) # 刷新Gradio前端界面 yield from update_ui_lastest_msg(f'尝试第 {n_fix}/{max_try} 次编译, 使用latexdiff生成论文转化前后对比 ...', chatbot, history) # 刷新Gradio前端界面
@ -386,10 +403,10 @@ def 编译Latex(chatbot, history, main_file_original, main_file_modified, work_f
ok = compile_latex_with_timeout(f'latexdiff --encoding=utf8 --append-safecmd=subfile {work_folder_original}/{main_file_original}.tex {work_folder_modified}/{main_file_modified}.tex --flatten > {work_folder}/merge_diff.tex', os.getcwd()) ok = compile_latex_with_timeout(f'latexdiff --encoding=utf8 --append-safecmd=subfile {work_folder_original}/{main_file_original}.tex {work_folder_modified}/{main_file_modified}.tex --flatten > {work_folder}/merge_diff.tex', os.getcwd())
yield from update_ui_lastest_msg(f'尝试第 {n_fix}/{max_try} 次编译, 正在编译对比PDF ...', chatbot, history) # 刷新Gradio前端界面 yield from update_ui_lastest_msg(f'尝试第 {n_fix}/{max_try} 次编译, 正在编译对比PDF ...', chatbot, history) # 刷新Gradio前端界面
ok = compile_latex_with_timeout(f'pdflatex -interaction=batchmode -file-line-error merge_diff.tex', work_folder) ok = compile_latex_with_timeout(get_compile_command(compiler, 'merge_diff'), work_folder)
ok = compile_latex_with_timeout(f'bibtex merge_diff.aux', work_folder) ok = compile_latex_with_timeout(f'bibtex merge_diff.aux', work_folder)
ok = compile_latex_with_timeout(f'pdflatex -interaction=batchmode -file-line-error merge_diff.tex', work_folder) ok = compile_latex_with_timeout(get_compile_command(compiler, 'merge_diff'), work_folder)
ok = compile_latex_with_timeout(f'pdflatex -interaction=batchmode -file-line-error merge_diff.tex', work_folder) ok = compile_latex_with_timeout(get_compile_command(compiler, 'merge_diff'), work_folder)
# <---------- 检查结果 -----------> # <---------- 检查结果 ----------->
results_ = "" results_ = ""

View File

@ -70,7 +70,9 @@ def 解析PDF_DOC2X(pdf_file_path, format="tex"):
# < ------ 第2步轮询等待 ------ > # < ------ 第2步轮询等待 ------ >
logger.info("Doc2x 处理文件中:轮询等待") logger.info("Doc2x 处理文件中:轮询等待")
params = {"uid": uuid} params = {"uid": uuid}
while True: max_attempts = 60
attempt = 0
while attempt < max_attempts:
res = requests.get( res = requests.get(
"https://v2.doc2x.noedgeai.com/api/v2/parse/status", "https://v2.doc2x.noedgeai.com/api/v2/parse/status",
headers={"Authorization": "Bearer " + doc2x_api_key}, headers={"Authorization": "Bearer " + doc2x_api_key},
@ -82,8 +84,11 @@ def 解析PDF_DOC2X(pdf_file_path, format="tex"):
elif res_data["status"] == "processing": elif res_data["status"] == "processing":
time.sleep(5) time.sleep(5)
logger.info(f"Doc2x is processing at {res_data['progress']}%") logger.info(f"Doc2x is processing at {res_data['progress']}%")
attempt += 1
else: else:
raise RuntimeError(f"Doc2x return an error: {res_data}") raise RuntimeError(f"Doc2x return an error: {res_data}")
if attempt >= max_attempts:
raise RuntimeError("Doc2x processing timeout after maximum attempts")
# < ------ 第3步提交转化 ------ > # < ------ 第3步提交转化 ------ >
logger.info("Doc2x 第3步提交转化") logger.info("Doc2x 第3步提交转化")
@ -98,7 +103,9 @@ def 解析PDF_DOC2X(pdf_file_path, format="tex"):
# < ------ 第4步等待结果 ------ > # < ------ 第4步等待结果 ------ >
logger.info("Doc2x 第4步等待结果") logger.info("Doc2x 第4步等待结果")
params = {"uid": uuid} params = {"uid": uuid}
while True: max_attempts = 36
attempt = 0
while attempt < max_attempts:
res = requests.get( res = requests.get(
"https://v2.doc2x.noedgeai.com/api/v2/convert/parse/result", "https://v2.doc2x.noedgeai.com/api/v2/convert/parse/result",
headers={"Authorization": "Bearer " + doc2x_api_key}, headers={"Authorization": "Bearer " + doc2x_api_key},
@ -110,6 +117,9 @@ def 解析PDF_DOC2X(pdf_file_path, format="tex"):
elif res_data["status"] == "processing": elif res_data["status"] == "processing":
time.sleep(3) time.sleep(3)
logger.info("Doc2x still processing to convert file") logger.info("Doc2x still processing to convert file")
attempt += 1
if attempt >= max_attempts:
raise RuntimeError("Doc2x conversion timeout after maximum attempts")
# < ------ 第5步最后的处理 ------ > # < ------ 第5步最后的处理 ------ >
logger.info("Doc2x 第5步最后的处理") logger.info("Doc2x 第5步最后的处理")
@ -124,7 +134,7 @@ def 解析PDF_DOC2X(pdf_file_path, format="tex"):
# < ------ 下载 ------ > # < ------ 下载 ------ >
for attempt in range(max_attempt): for attempt in range(max_attempt):
try: try:
result_url = res_json["data"]["url"] result_url = res_data["url"]
res = requests.get(result_url) res = requests.get(result_url)
zip_path = os.path.join(target_path, gen_time_str() + ".zip") zip_path = os.path.join(target_path, gen_time_str() + ".zip")
unzip_path = os.path.join(target_path, gen_time_str()) unzip_path = os.path.join(target_path, gen_time_str())