添加doc2x超时设置并添加对xelatex编译的支持

2024-11-29 15:02:58 +08:00 · 2024-11-29 15:02:58 +08:00 · 05a5add8da
parent c49e896082
commit 05a5add8da
2 changed files with 40 additions and 13 deletions
--- a/crazy_functions/latex_fns/latex_actions.py
+++ b/crazy_functions/latex_fns/latex_actions.py
@ -342,7 +342,6 @@ def remove_buggy_lines(file_path, log_path, tex_name, tex_name_pure, n_fix, work
        logger.error("Fatal error occurred, but we cannot identify error, please download zip, read latex log, and compile manually.")
        return False, -1, [-1]

-
 def 编译Latex(chatbot, history, main_file_original, main_file_modified, work_folder_original, work_folder_modified, work_folder, mode='default'):
    import os, time
    n_fix = 1
@ -351,6 +350,24 @@ def 编译Latex(chatbot, history, main_file_original, main_file_modified, work_f
    chatbot.append([f"正在编译PDF文档", f'编译已经开始。当前工作路径为{work_folder}，如果程序停顿5分钟以上，请直接去该路径下取回翻译结果，或者重启之后再度尝试 ...']); yield from update_ui(chatbot=chatbot, history=history)
    chatbot.append([f"正在编译PDF文档", '...']); yield from update_ui(chatbot=chatbot, history=history); time.sleep(1); chatbot[-1] = list(chatbot[-1]) # 刷新界面
    yield from update_ui_lastest_msg('编译已经开始...', chatbot, history)   # 刷新Gradio前端界面
+    # 检查是否需要使用xelatex
+    def check_if_need_xelatex(tex_path):
+        try:
+            with open(tex_path, 'r', encoding='utf-8', errors='replace') as f:
+                content = f.read(5000)
+                # 检查是否有使用xelatex的宏包
+                return any(pkg in content for pkg in ['fontspec', 'xeCJK', 'xetex', 'unicode-math', 'xltxtra', 'xunicode'])
+        except Exception:
+            return False
+
+    # 根据编译器类型返回编译命令
+    def get_compile_command(compiler, filename):
+        return f'{compiler} -interaction=batchmode -file-line-error {filename}.tex'
+
+    # 确定使用的编译器
+    compiler = 'pdflatex'
+    if check_if_need_xelatex(pj(work_folder_modified, f'{main_file_modified}.tex')):
+        compiler = 'xelatex'

    while True:
        import os
@ -361,10 +378,10 @@ def 编译Latex(chatbot, history, main_file_original, main_file_modified, work_f

        # https://stackoverflow.com/questions/738755/dont-make-me-manually-abort-a-latex-compile-when-theres-an-error
        yield from update_ui_lastest_msg(f'尝试第 {n_fix}/{max_try} 次编译, 编译原始PDF ...', chatbot, history)   # 刷新Gradio前端界面
-        ok = compile_latex_with_timeout(f'pdflatex -interaction=batchmode -file-line-error {main_file_original}.tex', work_folder_original)
+        ok = compile_latex_with_timeout(get_compile_command(compiler, main_file_original), work_folder_original)

        yield from update_ui_lastest_msg(f'尝试第 {n_fix}/{max_try} 次编译, 编译转化后的PDF ...', chatbot, history)   # 刷新Gradio前端界面
-        ok = compile_latex_with_timeout(f'pdflatex -interaction=batchmode -file-line-error {main_file_modified}.tex', work_folder_modified)
+        ok = compile_latex_with_timeout(get_compile_command(compiler, main_file_modified), work_folder_modified)

        if ok and os.path.exists(pj(work_folder_modified, f'{main_file_modified}.pdf')):
            # 只有第二步成功，才能继续下面的步骤
@ -375,10 +392,10 @@ def 编译Latex(chatbot, history, main_file_original, main_file_modified, work_f
                ok = compile_latex_with_timeout(f'bibtex  {main_file_modified}.aux', work_folder_modified)

            yield from update_ui_lastest_msg(f'尝试第 {n_fix}/{max_try} 次编译, 编译文献交叉引用 ...', chatbot, history)  # 刷新Gradio前端界面
-            ok = compile_latex_with_timeout(f'pdflatex -interaction=batchmode -file-line-error {main_file_original}.tex', work_folder_original)
-            ok = compile_latex_with_timeout(f'pdflatex -interaction=batchmode -file-line-error {main_file_modified}.tex', work_folder_modified)
-            ok = compile_latex_with_timeout(f'pdflatex -interaction=batchmode -file-line-error {main_file_original}.tex', work_folder_original)
-            ok = compile_latex_with_timeout(f'pdflatex -interaction=batchmode -file-line-error {main_file_modified}.tex', work_folder_modified)
+            ok = compile_latex_with_timeout(get_compile_command(compiler, main_file_original), work_folder_original)
+            ok = compile_latex_with_timeout(get_compile_command(compiler, main_file_modified), work_folder_modified)
+            ok = compile_latex_with_timeout(get_compile_command(compiler, main_file_original), work_folder_original)
+            ok = compile_latex_with_timeout(get_compile_command(compiler, main_file_modified), work_folder_modified)

            if mode!='translate_zh':
                yield from update_ui_lastest_msg(f'尝试第 {n_fix}/{max_try} 次编译, 使用latexdiff生成论文转化前后对比 ...', chatbot, history) # 刷新Gradio前端界面
@ -386,10 +403,10 @@ def 编译Latex(chatbot, history, main_file_original, main_file_modified, work_f
                ok = compile_latex_with_timeout(f'latexdiff --encoding=utf8 --append-safecmd=subfile {work_folder_original}/{main_file_original}.tex  {work_folder_modified}/{main_file_modified}.tex --flatten > {work_folder}/merge_diff.tex', os.getcwd())

                yield from update_ui_lastest_msg(f'尝试第 {n_fix}/{max_try} 次编译, 正在编译对比PDF ...', chatbot, history)   # 刷新Gradio前端界面
-                ok = compile_latex_with_timeout(f'pdflatex  -interaction=batchmode -file-line-error merge_diff.tex', work_folder)
+                ok = compile_latex_with_timeout(get_compile_command(compiler, 'merge_diff'), work_folder)
                ok = compile_latex_with_timeout(f'bibtex    merge_diff.aux', work_folder)
-                ok = compile_latex_with_timeout(f'pdflatex  -interaction=batchmode -file-line-error merge_diff.tex', work_folder)
-                ok = compile_latex_with_timeout(f'pdflatex  -interaction=batchmode -file-line-error merge_diff.tex', work_folder)
+                ok = compile_latex_with_timeout(get_compile_command(compiler, 'merge_diff'), work_folder)
+                ok = compile_latex_with_timeout(get_compile_command(compiler, 'merge_diff'), work_folder)

        # <---------- 检查结果 ----------->
        results_ = ""
--- a/crazy_functions/pdf_fns/parse_pdf_via_doc2x.py
+++ b/crazy_functions/pdf_fns/parse_pdf_via_doc2x.py
@ -70,7 +70,9 @@ def 解析PDF_DOC2X(pdf_file_path, format="tex"):
    # < ------ 第2步：轮询等待 ------ >
    logger.info("Doc2x 处理文件中：轮询等待")
    params = {"uid": uuid}
-    while True:
+    max_attempts = 60
+    attempt = 0
+    while attempt < max_attempts:
        res = requests.get(
            "https://v2.doc2x.noedgeai.com/api/v2/parse/status",
            headers={"Authorization": "Bearer " + doc2x_api_key},
@ -82,8 +84,11 @@ def 解析PDF_DOC2X(pdf_file_path, format="tex"):
        elif res_data["status"] == "processing":
            time.sleep(5)
            logger.info(f"Doc2x is processing at {res_data['progress']}%")
+            attempt += 1
        else:
            raise RuntimeError(f"Doc2x return an error: {res_data}")
+    if attempt >= max_attempts:
+        raise RuntimeError("Doc2x processing timeout after maximum attempts")

    # < ------ 第3步：提交转化 ------ >
    logger.info("Doc2x 第3步：提交转化")
@ -98,7 +103,9 @@ def 解析PDF_DOC2X(pdf_file_path, format="tex"):
    # < ------ 第4步：等待结果 ------ >
    logger.info("Doc2x 第4步：等待结果")
    params = {"uid": uuid}
-    while True:
+    max_attempts = 36
+    attempt = 0
+    while attempt < max_attempts:
        res = requests.get(
            "https://v2.doc2x.noedgeai.com/api/v2/convert/parse/result",
            headers={"Authorization": "Bearer " + doc2x_api_key},
@ -110,6 +117,9 @@ def 解析PDF_DOC2X(pdf_file_path, format="tex"):
        elif res_data["status"] == "processing":
            time.sleep(3)
            logger.info("Doc2x still processing to convert file")
+            attempt += 1
+    if attempt >= max_attempts:
+        raise RuntimeError("Doc2x conversion timeout after maximum attempts")

    # < ------ 第5步：最后的处理 ------ >
    logger.info("Doc2x 第5步：最后的处理")
@ -124,7 +134,7 @@ def 解析PDF_DOC2X(pdf_file_path, format="tex"):
    # < ------ 下载 ------ >
    for attempt in range(max_attempt):
        try:
-            result_url = res_json["data"]["url"]
+            result_url = res_data["url"]
            res = requests.get(result_url)
            zip_path = os.path.join(target_path, gen_time_str() + ".zip")
            unzip_path = os.path.join(target_path, gen_time_str())