From e05f105063b4d39a58563247b23a8ce2df8d3e0f Mon Sep 17 00:00:00 2001 From: Menghuan1918 Date: Thu, 28 Nov 2024 14:10:27 +0800 Subject: [PATCH 1/6] =?UTF-8?q?doc2x=E8=AF=B7=E6=B1=82=E5=87=BD=E6=95=B0?= =?UTF-8?q?=E6=A0=BC=E5=BC=8F=E6=B8=85=E7=90=86?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- .../pdf_fns/parse_pdf_via_doc2x.py | 194 ++++++++++-------- 1 file changed, 106 insertions(+), 88 deletions(-) diff --git a/crazy_functions/pdf_fns/parse_pdf_via_doc2x.py b/crazy_functions/pdf_fns/parse_pdf_via_doc2x.py index 97c62fbf..45d5ca1d 100644 --- a/crazy_functions/pdf_fns/parse_pdf_via_doc2x.py +++ b/crazy_functions/pdf_fns/parse_pdf_via_doc2x.py @@ -8,118 +8,105 @@ from loguru import logger import os import time -def refresh_key(doc2x_api_key): - import requests, json - url = "https://api.doc2x.noedgeai.com/api/token/refresh" - res = requests.post( - url, - headers={"Authorization": "Bearer " + doc2x_api_key} - ) - res_json = [] - if res.status_code == 200: - decoded = res.content.decode("utf-8") - res_json = json.loads(decoded) - doc2x_api_key = res_json['data']['token'] - else: - raise RuntimeError(format("[ERROR] status code: %d, body: %s" % (res.status_code, res.text))) - return doc2x_api_key +def 状态检查(code: str, meg: str, trace_id: str): + trace_id = trace_id or "Failed to get trace_id" + if code in ["parse_page_limit_exceeded", "parse_concurrency_limit"]: + raise RuntimeError( + f"Reached the limit of Doc2x:\nTrace ID: {trace_id}\n{code} - {meg}" + ) + if code not in ["ok", "success"]: + raise RuntimeError( + f"Doc2x return an error:\nTrace ID: {trace_id}\n{code} - {meg}" + ) def 解析PDF_DOC2X_转Latex(pdf_file_path): - zip_file_path, unzipped_folder = 解析PDF_DOC2X(pdf_file_path, format='tex') + zip_file_path, unzipped_folder = 解析PDF_DOC2X(pdf_file_path, format="tex") return unzipped_folder -def 解析PDF_DOC2X(pdf_file_path, format='tex'): +def 解析PDF_DOC2X(pdf_file_path, format="tex"): """ - format: 'tex', 'md', 'docx' + format: 'tex', 'md', 'docx' """ import requests, json, os - DOC2X_API_KEY = get_conf('DOC2X_API_KEY') + + DOC2X_API_KEY = get_conf("DOC2X_API_KEY") latex_dir = get_log_folder(plugin_name="pdf_ocr_latex") markdown_dir = get_log_folder(plugin_name="pdf_ocr") doc2x_api_key = DOC2X_API_KEY - # < ------ 第1步:上传 ------ > logger.info("Doc2x 第1步:上传") - with open(pdf_file_path, 'rb') as file: + with open(pdf_file_path, "rb") as file: res = requests.post( "https://v2.doc2x.noedgeai.com/api/v2/parse/pdf", headers={"Authorization": "Bearer " + doc2x_api_key}, - data=file + data=file, ) # res_json = [] if res.status_code == 200: res_json = res.json() else: raise RuntimeError(f"Doc2x return an error: {res.json()}") - uuid = res_json['data']['uid'] + uuid = res_json["data"]["uid"] # < ------ 第2步:轮询等待 ------ > logger.info("Doc2x 第2步:轮询等待") - params = {'uid': uuid} + params = {"uid": uuid} while True: res = requests.get( - 'https://v2.doc2x.noedgeai.com/api/v2/parse/status', + "https://v2.doc2x.noedgeai.com/api/v2/parse/status", headers={"Authorization": "Bearer " + doc2x_api_key}, - params=params + params=params, ) res_json = res.json() - if res_json['data']['status'] == "success": + if res_json["data"]["status"] == "success": break - elif res_json['data']['status'] == "processing": + elif res_json["data"]["status"] == "processing": time.sleep(3) logger.info(f"Doc2x is processing at {res_json['data']['progress']}%") - elif res_json['data']['status'] == "failed": + elif res_json["data"]["status"] == "failed": raise RuntimeError(f"Doc2x return an error: {res_json}") - # < ------ 第3步:提交转化 ------ > logger.info("Doc2x 第3步:提交转化") - data = { - "uid": uuid, - "to": format, - "formula_mode": "dollar", - "filename": "output" - } + data = {"uid": uuid, "to": format, "formula_mode": "dollar", "filename": "output"} res = requests.post( - 'https://v2.doc2x.noedgeai.com/api/v2/convert/parse', + "https://v2.doc2x.noedgeai.com/api/v2/convert/parse", headers={"Authorization": "Bearer " + doc2x_api_key}, - json=data + json=data, ) if res.status_code == 200: res_json = res.json() else: raise RuntimeError(f"Doc2x return an error: {res.json()}") - # < ------ 第4步:等待结果 ------ > logger.info("Doc2x 第4步:等待结果") - params = {'uid': uuid} + params = {"uid": uuid} while True: res = requests.get( - 'https://v2.doc2x.noedgeai.com/api/v2/convert/parse/result', + "https://v2.doc2x.noedgeai.com/api/v2/convert/parse/result", headers={"Authorization": "Bearer " + doc2x_api_key}, - params=params + params=params, ) res_json = res.json() - if res_json['data']['status'] == "success": + if res_json["data"]["status"] == "success": break - elif res_json['data']['status'] == "processing": + elif res_json["data"]["status"] == "processing": time.sleep(3) logger.info(f"Doc2x still processing") - elif res_json['data']['status'] == "failed": + elif res_json["data"]["status"] == "failed": raise RuntimeError(f"Doc2x return an error: {res_json}") - # < ------ 第5步:最后的处理 ------ > logger.info("Doc2x 第5步:最后的处理") - if format=='tex': + if format == "tex": target_path = latex_dir - if format=='md': + if format == "md": target_path = markdown_dir os.makedirs(target_path, exist_ok=True) @@ -127,12 +114,13 @@ def 解析PDF_DOC2X(pdf_file_path, format='tex'): # < ------ 下载 ------ > for attempt in range(max_attempt): try: - result_url = res_json['data']['url'] + result_url = res_json["data"]["url"] res = requests.get(result_url) - zip_path = os.path.join(target_path, gen_time_str() + '.zip') + zip_path = os.path.join(target_path, gen_time_str() + ".zip") unzip_path = os.path.join(target_path, gen_time_str()) if res.status_code == 200: - with open(zip_path, "wb") as f: f.write(res.content) + with open(zip_path, "wb") as f: + f.write(res.content) else: raise RuntimeError(f"Doc2x return an error: {res.json()}") except Exception as e: @@ -145,22 +133,32 @@ def 解析PDF_DOC2X(pdf_file_path, format='tex'): # < ------ 解压 ------ > import zipfile - with zipfile.ZipFile(zip_path, 'r') as zip_ref: + + with zipfile.ZipFile(zip_path, "r") as zip_ref: zip_ref.extractall(unzip_path) return zip_path, unzip_path -def 解析PDF_DOC2X_单文件(fp, project_folder, llm_kwargs, plugin_kwargs, chatbot, history, system_prompt, DOC2X_API_KEY, user_request): - +def 解析PDF_DOC2X_单文件( + fp, + project_folder, + llm_kwargs, + plugin_kwargs, + chatbot, + history, + system_prompt, + DOC2X_API_KEY, + user_request, +): def pdf2markdown(filepath): chatbot.append((None, f"Doc2x 解析中")) - yield from update_ui(chatbot=chatbot, history=history) # 刷新界面 + yield from update_ui(chatbot=chatbot, history=history) # 刷新界面 - md_zip_path, unzipped_folder = 解析PDF_DOC2X(filepath, format='md') + md_zip_path, unzipped_folder = 解析PDF_DOC2X(filepath, format="md") promote_file_to_downloadzone(md_zip_path, chatbot=chatbot) chatbot.append((None, f"完成解析 {md_zip_path} ...")) - yield from update_ui(chatbot=chatbot, history=history) # 刷新界面 + yield from update_ui(chatbot=chatbot, history=history) # 刷新界面 return md_zip_path def deliver_to_markdown_plugin(md_zip_path, user_request): @@ -174,77 +172,97 @@ def 解析PDF_DOC2X_单文件(fp, project_folder, llm_kwargs, plugin_kwargs, cha os.makedirs(target_path_base, exist_ok=True) shutil.copyfile(md_zip_path, this_file_path) ex_folder = this_file_path + ".extract" - extract_archive( - file_path=this_file_path, dest_dir=ex_folder - ) + extract_archive(file_path=this_file_path, dest_dir=ex_folder) # edit markdown files - success, file_manifest, project_folder = get_files_from_everything(ex_folder, type='.md') + success, file_manifest, project_folder = get_files_from_everything( + ex_folder, type=".md" + ) for generated_fp in file_manifest: # 修正一些公式问题 - with open(generated_fp, 'r', encoding='utf8') as f: + with open(generated_fp, "r", encoding="utf8") as f: content = f.read() # 将公式中的\[ \]替换成$$ - content = content.replace(r'\[', r'$$').replace(r'\]', r'$$') + content = content.replace(r"\[", r"$$").replace(r"\]", r"$$") # 将公式中的\( \)替换成$ - content = content.replace(r'\(', r'$').replace(r'\)', r'$') - content = content.replace('```markdown', '\n').replace('```', '\n') - with open(generated_fp, 'w', encoding='utf8') as f: + content = content.replace(r"\(", r"$").replace(r"\)", r"$") + content = content.replace("```markdown", "\n").replace("```", "\n") + with open(generated_fp, "w", encoding="utf8") as f: f.write(content) promote_file_to_downloadzone(generated_fp, chatbot=chatbot) - yield from update_ui(chatbot=chatbot, history=history) # 刷新界面 + yield from update_ui(chatbot=chatbot, history=history) # 刷新界面 # 生成在线预览html - file_name = '在线预览翻译(原文)' + gen_time_str() + '.html' + file_name = "在线预览翻译(原文)" + gen_time_str() + ".html" preview_fp = os.path.join(ex_folder, file_name) - from shared_utils.advanced_markdown_format import markdown_convertion_for_file + from shared_utils.advanced_markdown_format import ( + markdown_convertion_for_file, + ) + with open(generated_fp, "r", encoding="utf-8") as f: md = f.read() # # Markdown中使用不标准的表格,需要在表格前加上一个emoji,以便公式渲染 # md = re.sub(r'^', r'.
', md, flags=re.MULTILINE) html = markdown_convertion_for_file(md) - with open(preview_fp, "w", encoding="utf-8") as f: f.write(html) + with open(preview_fp, "w", encoding="utf-8") as f: + f.write(html) chatbot.append([None, f"生成在线预览:{generate_file_link([preview_fp])}"]) promote_file_to_downloadzone(preview_fp, chatbot=chatbot) - - chatbot.append((None, f"调用Markdown插件 {ex_folder} ...")) - plugin_kwargs['markdown_expected_output_dir'] = ex_folder + plugin_kwargs["markdown_expected_output_dir"] = ex_folder - translated_f_name = 'translated_markdown.md' - generated_fp = plugin_kwargs['markdown_expected_output_path'] = os.path.join(ex_folder, translated_f_name) - yield from update_ui(chatbot=chatbot, history=history) # 刷新界面 - yield from Markdown英译中(ex_folder, llm_kwargs, plugin_kwargs, chatbot, history, system_prompt, user_request) + translated_f_name = "translated_markdown.md" + generated_fp = plugin_kwargs["markdown_expected_output_path"] = os.path.join( + ex_folder, translated_f_name + ) + yield from update_ui(chatbot=chatbot, history=history) # 刷新界面 + yield from Markdown英译中( + ex_folder, + llm_kwargs, + plugin_kwargs, + chatbot, + history, + system_prompt, + user_request, + ) if os.path.exists(generated_fp): # 修正一些公式问题 - with open(generated_fp, 'r', encoding='utf8') as f: content = f.read() - content = content.replace('```markdown', '\n').replace('```', '\n') + with open(generated_fp, "r", encoding="utf8") as f: + content = f.read() + content = content.replace("```markdown", "\n").replace("```", "\n") # Markdown中使用不标准的表格,需要在表格前加上一个emoji,以便公式渲染 # content = re.sub(r'^
', r'.
', content, flags=re.MULTILINE) - with open(generated_fp, 'w', encoding='utf8') as f: f.write(content) + with open(generated_fp, "w", encoding="utf8") as f: + f.write(content) # 生成在线预览html - file_name = '在线预览翻译' + gen_time_str() + '.html' + file_name = "在线预览翻译" + gen_time_str() + ".html" preview_fp = os.path.join(ex_folder, file_name) - from shared_utils.advanced_markdown_format import markdown_convertion_for_file + from shared_utils.advanced_markdown_format import ( + markdown_convertion_for_file, + ) + with open(generated_fp, "r", encoding="utf-8") as f: md = f.read() html = markdown_convertion_for_file(md) - with open(preview_fp, "w", encoding="utf-8") as f: f.write(html) + with open(preview_fp, "w", encoding="utf-8") as f: + f.write(html) promote_file_to_downloadzone(preview_fp, chatbot=chatbot) # 生成包含图片的压缩包 dest_folder = get_log_folder(chatbot.get_user()) - zip_name = '翻译后的带图文档.zip' - zip_folder(source_folder=ex_folder, dest_folder=dest_folder, zip_name=zip_name) + zip_name = "翻译后的带图文档.zip" + zip_folder( + source_folder=ex_folder, dest_folder=dest_folder, zip_name=zip_name + ) zip_fp = os.path.join(dest_folder, zip_name) promote_file_to_downloadzone(zip_fp, chatbot=chatbot) - yield from update_ui(chatbot=chatbot, history=history) # 刷新界面 + yield from update_ui(chatbot=chatbot, history=history) # 刷新界面 + md_zip_path = yield from pdf2markdown(fp) yield from deliver_to_markdown_plugin(md_zip_path, user_request) + def 解析PDF_基于DOC2X(file_manifest, *args): for index, fp in enumerate(file_manifest): yield from 解析PDF_DOC2X_单文件(fp, *args) return - - From c49e89608295c123b807f89a457baa6dd6ff1bd5 Mon Sep 17 00:00:00 2001 From: Menghuan1918 Date: Thu, 28 Nov 2024 20:58:26 +0800 Subject: [PATCH 2/6] =?UTF-8?q?=E6=9B=B4=E6=96=B0=E4=B8=AD=E9=97=B4?= =?UTF-8?q?=E9=83=A8=E5=88=86?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- .../pdf_fns/parse_pdf_via_doc2x.py | 84 +++++++++++-------- 1 file changed, 47 insertions(+), 37 deletions(-) diff --git a/crazy_functions/pdf_fns/parse_pdf_via_doc2x.py b/crazy_functions/pdf_fns/parse_pdf_via_doc2x.py index 45d5ca1d..3a37a530 100644 --- a/crazy_functions/pdf_fns/parse_pdf_via_doc2x.py +++ b/crazy_functions/pdf_fns/parse_pdf_via_doc2x.py @@ -1,24 +1,40 @@ from toolbox import get_log_folder, gen_time_str, get_conf from toolbox import update_ui, promote_file_to_downloadzone -from toolbox import promote_file_to_downloadzone, extract_archive +from toolbox import extract_archive from toolbox import generate_file_link, zip_folder from crazy_functions.crazy_utils import get_files_from_everything from shared_utils.colorful import * from loguru import logger import os +import requests import time +import json -def 状态检查(code: str, meg: str, trace_id: str): - trace_id = trace_id or "Failed to get trace_id" +def 状态检查(response, uid=""): + """ + Check the status of Doc2x API response + Args: + response_data: Response object from Doc2x API + """ + response_json = response.json() + response_data = response_json.get("data", {}) + code = response_data.get("code", "Unknown") + meg = response_data.get("message", "") + trace_id = response.headers.get("trace-id", "Failed to get trace-id") + if response.status_code != 200: + raise RuntimeError( + f"Doc2x return an error:\nTrace ID: {trace_id} {uid}\n{response.status_code} - {response_json}" + ) if code in ["parse_page_limit_exceeded", "parse_concurrency_limit"]: raise RuntimeError( - f"Reached the limit of Doc2x:\nTrace ID: {trace_id}\n{code} - {meg}" + f"Reached the limit of Doc2x:\nTrace ID: {trace_id} {uid}\n{code} - {meg}" ) if code not in ["ok", "success"]: raise RuntimeError( - f"Doc2x return an error:\nTrace ID: {trace_id}\n{code} - {meg}" + f"Doc2x return an error:\nTrace ID: {trace_id} {uid}\n{code} - {meg}" ) + return response_data def 解析PDF_DOC2X_转Latex(pdf_file_path): @@ -30,30 +46,29 @@ def 解析PDF_DOC2X(pdf_file_path, format="tex"): """ format: 'tex', 'md', 'docx' """ - import requests, json, os DOC2X_API_KEY = get_conf("DOC2X_API_KEY") latex_dir = get_log_folder(plugin_name="pdf_ocr_latex") markdown_dir = get_log_folder(plugin_name="pdf_ocr") doc2x_api_key = DOC2X_API_KEY - # < ------ 第1步:上传 ------ > - logger.info("Doc2x 第1步:上传") + # < ------ 第1步:预上传获取URL,然后上传文件 ------ > + logger.info("Doc2x 上传文件:预上传获取URL") + res = requests.post( + "https://v2.doc2x.noedgeai.com/api/v2/parse/preupload", + headers={"Authorization": "Bearer " + doc2x_api_key}, + ) + res_data = 状态检查(res) + upload_url = res_data["url"] + uuid = res_data["uid"] + + logger.info("Doc2x 上传文件:上传文件") with open(pdf_file_path, "rb") as file: - res = requests.post( - "https://v2.doc2x.noedgeai.com/api/v2/parse/pdf", - headers={"Authorization": "Bearer " + doc2x_api_key}, - data=file, - ) - # res_json = [] - if res.status_code == 200: - res_json = res.json() - else: - raise RuntimeError(f"Doc2x return an error: {res.json()}") - uuid = res_json["data"]["uid"] + res = requests.put(upload_url, data=file) + res.raise_for_status() # < ------ 第2步:轮询等待 ------ > - logger.info("Doc2x 第2步:轮询等待") + logger.info("Doc2x 处理文件中:轮询等待") params = {"uid": uuid} while True: res = requests.get( @@ -61,14 +76,14 @@ def 解析PDF_DOC2X(pdf_file_path, format="tex"): headers={"Authorization": "Bearer " + doc2x_api_key}, params=params, ) - res_json = res.json() - if res_json["data"]["status"] == "success": + res_data = 状态检查(res) + if res_data["status"] == "success": break - elif res_json["data"]["status"] == "processing": - time.sleep(3) - logger.info(f"Doc2x is processing at {res_json['data']['progress']}%") - elif res_json["data"]["status"] == "failed": - raise RuntimeError(f"Doc2x return an error: {res_json}") + elif res_data["status"] == "processing": + time.sleep(5) + logger.info(f"Doc2x is processing at {res_data['progress']}%") + else: + raise RuntimeError(f"Doc2x return an error: {res_data}") # < ------ 第3步:提交转化 ------ > logger.info("Doc2x 第3步:提交转化") @@ -78,10 +93,7 @@ def 解析PDF_DOC2X(pdf_file_path, format="tex"): headers={"Authorization": "Bearer " + doc2x_api_key}, json=data, ) - if res.status_code == 200: - res_json = res.json() - else: - raise RuntimeError(f"Doc2x return an error: {res.json()}") + 状态检查(res, uid=f"uid: {uuid}") # < ------ 第4步:等待结果 ------ > logger.info("Doc2x 第4步:等待结果") @@ -92,14 +104,12 @@ def 解析PDF_DOC2X(pdf_file_path, format="tex"): headers={"Authorization": "Bearer " + doc2x_api_key}, params=params, ) - res_json = res.json() - if res_json["data"]["status"] == "success": + res_data = 状态检查(res, uid=f"uid: {uuid}") + if res_data["status"] == "success": break - elif res_json["data"]["status"] == "processing": + elif res_data["status"] == "processing": time.sleep(3) - logger.info(f"Doc2x still processing") - elif res_json["data"]["status"] == "failed": - raise RuntimeError(f"Doc2x return an error: {res_json}") + logger.info("Doc2x still processing to convert file") # < ------ 第5步:最后的处理 ------ > logger.info("Doc2x 第5步:最后的处理") From 05a5add8da2ae76d4063ba87296984ae0ce69df0 Mon Sep 17 00:00:00 2001 From: Menghuan1918 Date: Fri, 29 Nov 2024 15:02:58 +0800 Subject: [PATCH 3/6] =?UTF-8?q?=20=20=E6=B7=BB=E5=8A=A0doc2x=E8=B6=85?= =?UTF-8?q?=E6=97=B6=E8=AE=BE=E7=BD=AE=E5=B9=B6=E6=B7=BB=E5=8A=A0=E5=AF=B9?= =?UTF-8?q?xelatex=E7=BC=96=E8=AF=91=E7=9A=84=E6=94=AF=E6=8C=81?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- crazy_functions/latex_fns/latex_actions.py | 37 ++++++++++++++----- .../pdf_fns/parse_pdf_via_doc2x.py | 16 ++++++-- 2 files changed, 40 insertions(+), 13 deletions(-) diff --git a/crazy_functions/latex_fns/latex_actions.py b/crazy_functions/latex_fns/latex_actions.py index df5135bb..ec1ec3e8 100644 --- a/crazy_functions/latex_fns/latex_actions.py +++ b/crazy_functions/latex_fns/latex_actions.py @@ -342,7 +342,6 @@ def remove_buggy_lines(file_path, log_path, tex_name, tex_name_pure, n_fix, work logger.error("Fatal error occurred, but we cannot identify error, please download zip, read latex log, and compile manually.") return False, -1, [-1] - def 编译Latex(chatbot, history, main_file_original, main_file_modified, work_folder_original, work_folder_modified, work_folder, mode='default'): import os, time n_fix = 1 @@ -351,6 +350,24 @@ def 编译Latex(chatbot, history, main_file_original, main_file_modified, work_f chatbot.append([f"正在编译PDF文档", f'编译已经开始。当前工作路径为{work_folder},如果程序停顿5分钟以上,请直接去该路径下取回翻译结果,或者重启之后再度尝试 ...']); yield from update_ui(chatbot=chatbot, history=history) chatbot.append([f"正在编译PDF文档", '...']); yield from update_ui(chatbot=chatbot, history=history); time.sleep(1); chatbot[-1] = list(chatbot[-1]) # 刷新界面 yield from update_ui_lastest_msg('编译已经开始...', chatbot, history) # 刷新Gradio前端界面 + # 检查是否需要使用xelatex + def check_if_need_xelatex(tex_path): + try: + with open(tex_path, 'r', encoding='utf-8', errors='replace') as f: + content = f.read(5000) + # 检查是否有使用xelatex的宏包 + return any(pkg in content for pkg in ['fontspec', 'xeCJK', 'xetex', 'unicode-math', 'xltxtra', 'xunicode']) + except Exception: + return False + + # 根据编译器类型返回编译命令 + def get_compile_command(compiler, filename): + return f'{compiler} -interaction=batchmode -file-line-error {filename}.tex' + + # 确定使用的编译器 + compiler = 'pdflatex' + if check_if_need_xelatex(pj(work_folder_modified, f'{main_file_modified}.tex')): + compiler = 'xelatex' while True: import os @@ -361,10 +378,10 @@ def 编译Latex(chatbot, history, main_file_original, main_file_modified, work_f # https://stackoverflow.com/questions/738755/dont-make-me-manually-abort-a-latex-compile-when-theres-an-error yield from update_ui_lastest_msg(f'尝试第 {n_fix}/{max_try} 次编译, 编译原始PDF ...', chatbot, history) # 刷新Gradio前端界面 - ok = compile_latex_with_timeout(f'pdflatex -interaction=batchmode -file-line-error {main_file_original}.tex', work_folder_original) + ok = compile_latex_with_timeout(get_compile_command(compiler, main_file_original), work_folder_original) yield from update_ui_lastest_msg(f'尝试第 {n_fix}/{max_try} 次编译, 编译转化后的PDF ...', chatbot, history) # 刷新Gradio前端界面 - ok = compile_latex_with_timeout(f'pdflatex -interaction=batchmode -file-line-error {main_file_modified}.tex', work_folder_modified) + ok = compile_latex_with_timeout(get_compile_command(compiler, main_file_modified), work_folder_modified) if ok and os.path.exists(pj(work_folder_modified, f'{main_file_modified}.pdf')): # 只有第二步成功,才能继续下面的步骤 @@ -375,10 +392,10 @@ def 编译Latex(chatbot, history, main_file_original, main_file_modified, work_f ok = compile_latex_with_timeout(f'bibtex {main_file_modified}.aux', work_folder_modified) yield from update_ui_lastest_msg(f'尝试第 {n_fix}/{max_try} 次编译, 编译文献交叉引用 ...', chatbot, history) # 刷新Gradio前端界面 - ok = compile_latex_with_timeout(f'pdflatex -interaction=batchmode -file-line-error {main_file_original}.tex', work_folder_original) - ok = compile_latex_with_timeout(f'pdflatex -interaction=batchmode -file-line-error {main_file_modified}.tex', work_folder_modified) - ok = compile_latex_with_timeout(f'pdflatex -interaction=batchmode -file-line-error {main_file_original}.tex', work_folder_original) - ok = compile_latex_with_timeout(f'pdflatex -interaction=batchmode -file-line-error {main_file_modified}.tex', work_folder_modified) + ok = compile_latex_with_timeout(get_compile_command(compiler, main_file_original), work_folder_original) + ok = compile_latex_with_timeout(get_compile_command(compiler, main_file_modified), work_folder_modified) + ok = compile_latex_with_timeout(get_compile_command(compiler, main_file_original), work_folder_original) + ok = compile_latex_with_timeout(get_compile_command(compiler, main_file_modified), work_folder_modified) if mode!='translate_zh': yield from update_ui_lastest_msg(f'尝试第 {n_fix}/{max_try} 次编译, 使用latexdiff生成论文转化前后对比 ...', chatbot, history) # 刷新Gradio前端界面 @@ -386,10 +403,10 @@ def 编译Latex(chatbot, history, main_file_original, main_file_modified, work_f ok = compile_latex_with_timeout(f'latexdiff --encoding=utf8 --append-safecmd=subfile {work_folder_original}/{main_file_original}.tex {work_folder_modified}/{main_file_modified}.tex --flatten > {work_folder}/merge_diff.tex', os.getcwd()) yield from update_ui_lastest_msg(f'尝试第 {n_fix}/{max_try} 次编译, 正在编译对比PDF ...', chatbot, history) # 刷新Gradio前端界面 - ok = compile_latex_with_timeout(f'pdflatex -interaction=batchmode -file-line-error merge_diff.tex', work_folder) + ok = compile_latex_with_timeout(get_compile_command(compiler, 'merge_diff'), work_folder) ok = compile_latex_with_timeout(f'bibtex merge_diff.aux', work_folder) - ok = compile_latex_with_timeout(f'pdflatex -interaction=batchmode -file-line-error merge_diff.tex', work_folder) - ok = compile_latex_with_timeout(f'pdflatex -interaction=batchmode -file-line-error merge_diff.tex', work_folder) + ok = compile_latex_with_timeout(get_compile_command(compiler, 'merge_diff'), work_folder) + ok = compile_latex_with_timeout(get_compile_command(compiler, 'merge_diff'), work_folder) # <---------- 检查结果 -----------> results_ = "" diff --git a/crazy_functions/pdf_fns/parse_pdf_via_doc2x.py b/crazy_functions/pdf_fns/parse_pdf_via_doc2x.py index 3a37a530..b770bdab 100644 --- a/crazy_functions/pdf_fns/parse_pdf_via_doc2x.py +++ b/crazy_functions/pdf_fns/parse_pdf_via_doc2x.py @@ -70,7 +70,9 @@ def 解析PDF_DOC2X(pdf_file_path, format="tex"): # < ------ 第2步:轮询等待 ------ > logger.info("Doc2x 处理文件中:轮询等待") params = {"uid": uuid} - while True: + max_attempts = 60 + attempt = 0 + while attempt < max_attempts: res = requests.get( "https://v2.doc2x.noedgeai.com/api/v2/parse/status", headers={"Authorization": "Bearer " + doc2x_api_key}, @@ -82,8 +84,11 @@ def 解析PDF_DOC2X(pdf_file_path, format="tex"): elif res_data["status"] == "processing": time.sleep(5) logger.info(f"Doc2x is processing at {res_data['progress']}%") + attempt += 1 else: raise RuntimeError(f"Doc2x return an error: {res_data}") + if attempt >= max_attempts: + raise RuntimeError("Doc2x processing timeout after maximum attempts") # < ------ 第3步:提交转化 ------ > logger.info("Doc2x 第3步:提交转化") @@ -98,7 +103,9 @@ def 解析PDF_DOC2X(pdf_file_path, format="tex"): # < ------ 第4步:等待结果 ------ > logger.info("Doc2x 第4步:等待结果") params = {"uid": uuid} - while True: + max_attempts = 36 + attempt = 0 + while attempt < max_attempts: res = requests.get( "https://v2.doc2x.noedgeai.com/api/v2/convert/parse/result", headers={"Authorization": "Bearer " + doc2x_api_key}, @@ -110,6 +117,9 @@ def 解析PDF_DOC2X(pdf_file_path, format="tex"): elif res_data["status"] == "processing": time.sleep(3) logger.info("Doc2x still processing to convert file") + attempt += 1 + if attempt >= max_attempts: + raise RuntimeError("Doc2x conversion timeout after maximum attempts") # < ------ 第5步:最后的处理 ------ > logger.info("Doc2x 第5步:最后的处理") @@ -124,7 +134,7 @@ def 解析PDF_DOC2X(pdf_file_path, format="tex"): # < ------ 下载 ------ > for attempt in range(max_attempt): try: - result_url = res_json["data"]["url"] + result_url = res_data["url"] res = requests.get(result_url) zip_path = os.path.join(target_path, gen_time_str() + ".zip") unzip_path = os.path.join(target_path, gen_time_str()) From 3a03e3f3d3ca288cf6318d93030a92038fa639d3 Mon Sep 17 00:00:00 2001 From: Menghuan1918 Date: Fri, 29 Nov 2024 15:42:10 +0800 Subject: [PATCH 4/6] =?UTF-8?q?Bug=E4=BF=AE=E5=A4=8D=E4=BB=A5=E5=8F=8A?= =?UTF-8?q?=E5=A2=9E=E5=8A=A0=E5=AF=B9xelatex=E5=AE=89=E8=A3=85=E7=9A=84?= =?UTF-8?q?=E6=A3=80=E6=B5=8B?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- crazy_functions/PDF_Translate.py | 2 +- crazy_functions/latex_fns/latex_actions.py | 9 ++++++++- crazy_functions/pdf_fns/parse_pdf_via_doc2x.py | 16 ++++++++++------ 3 files changed, 19 insertions(+), 8 deletions(-) diff --git a/crazy_functions/PDF_Translate.py b/crazy_functions/PDF_Translate.py index 0f93c821..a4d10837 100644 --- a/crazy_functions/PDF_Translate.py +++ b/crazy_functions/PDF_Translate.py @@ -47,7 +47,7 @@ def 批量翻译PDF文档(txt, llm_kwargs, plugin_kwargs, chatbot, history, syst yield from 解析PDF_基于DOC2X(file_manifest, project_folder, llm_kwargs, plugin_kwargs, chatbot, history, system_prompt, DOC2X_API_KEY, user_request) return except: - chatbot.append([None, f"DOC2X服务不可用,现在将执行效果稍差的旧版代码。{trimmed_format_exc_markdown()}"]) + chatbot.append([None, f"DOC2X服务不可用,请检查报错详细。{trimmed_format_exc_markdown()}"]) yield from update_ui(chatbot=chatbot, history=history) if method == "GROBID": diff --git a/crazy_functions/latex_fns/latex_actions.py b/crazy_functions/latex_fns/latex_actions.py index ec1ec3e8..a4486e95 100644 --- a/crazy_functions/latex_fns/latex_actions.py +++ b/crazy_functions/latex_fns/latex_actions.py @@ -367,7 +367,14 @@ def 编译Latex(chatbot, history, main_file_original, main_file_modified, work_f # 确定使用的编译器 compiler = 'pdflatex' if check_if_need_xelatex(pj(work_folder_modified, f'{main_file_modified}.tex')): - compiler = 'xelatex' + logger.info("检测到宏包需要xelatex编译,切换至xelatex编译") + # Check if xelatex is installed + try: + import subprocess + subprocess.run(['xelatex', '--version'], capture_output=True, check=True) + compiler = 'xelatex' + except (subprocess.CalledProcessError, FileNotFoundError): + raise RuntimeError("检测到需要使用xelatex编译,但系统中未安装xelatex。请先安装texlive或其他提供xelatex的LaTeX发行版。") while True: import os diff --git a/crazy_functions/pdf_fns/parse_pdf_via_doc2x.py b/crazy_functions/pdf_fns/parse_pdf_via_doc2x.py index b770bdab..7d251066 100644 --- a/crazy_functions/pdf_fns/parse_pdf_via_doc2x.py +++ b/crazy_functions/pdf_fns/parse_pdf_via_doc2x.py @@ -19,8 +19,8 @@ def 状态检查(response, uid=""): """ response_json = response.json() response_data = response_json.get("data", {}) - code = response_data.get("code", "Unknown") - meg = response_data.get("message", "") + code = response_json.get("code", "Unknown") + meg = response_data.get("message", response_json) trace_id = response.headers.get("trace-id", "Failed to get trace-id") if response.status_code != 200: raise RuntimeError( @@ -57,6 +57,7 @@ def 解析PDF_DOC2X(pdf_file_path, format="tex"): res = requests.post( "https://v2.doc2x.noedgeai.com/api/v2/parse/preupload", headers={"Authorization": "Bearer " + doc2x_api_key}, + timeout=15, ) res_data = 状态检查(res) upload_url = res_data["url"] @@ -64,7 +65,7 @@ def 解析PDF_DOC2X(pdf_file_path, format="tex"): logger.info("Doc2x 上传文件:上传文件") with open(pdf_file_path, "rb") as file: - res = requests.put(upload_url, data=file) + res = requests.put(upload_url, data=file, timeout=60) res.raise_for_status() # < ------ 第2步:轮询等待 ------ > @@ -77,6 +78,7 @@ def 解析PDF_DOC2X(pdf_file_path, format="tex"): "https://v2.doc2x.noedgeai.com/api/v2/parse/status", headers={"Authorization": "Bearer " + doc2x_api_key}, params=params, + timeout=15, ) res_data = 状态检查(res) if res_data["status"] == "success": @@ -97,6 +99,7 @@ def 解析PDF_DOC2X(pdf_file_path, format="tex"): "https://v2.doc2x.noedgeai.com/api/v2/convert/parse", headers={"Authorization": "Bearer " + doc2x_api_key}, json=data, + timeout=15, ) 状态检查(res, uid=f"uid: {uuid}") @@ -110,6 +113,7 @@ def 解析PDF_DOC2X(pdf_file_path, format="tex"): "https://v2.doc2x.noedgeai.com/api/v2/convert/parse/result", headers={"Authorization": "Bearer " + doc2x_api_key}, params=params, + timeout=15, ) res_data = 状态检查(res, uid=f"uid: {uuid}") if res_data["status"] == "success": @@ -122,7 +126,7 @@ def 解析PDF_DOC2X(pdf_file_path, format="tex"): raise RuntimeError("Doc2x conversion timeout after maximum attempts") # < ------ 第5步:最后的处理 ------ > - logger.info("Doc2x 第5步:最后的处理") + logger.info("Doc2x 第5步:下载转换后的文件") if format == "tex": target_path = latex_dir @@ -135,7 +139,7 @@ def 解析PDF_DOC2X(pdf_file_path, format="tex"): for attempt in range(max_attempt): try: result_url = res_data["url"] - res = requests.get(result_url) + res = requests.get(result_url, timeout=60) zip_path = os.path.join(target_path, gen_time_str() + ".zip") unzip_path = os.path.join(target_path, gen_time_str()) if res.status_code == 200: @@ -145,7 +149,7 @@ def 解析PDF_DOC2X(pdf_file_path, format="tex"): raise RuntimeError(f"Doc2x return an error: {res.json()}") except Exception as e: if attempt < max_attempt - 1: - logger.error(f"Failed to download latex file, retrying... {e}") + logger.error(f"Failed to download uid = {uuid} file, retrying... {e}") time.sleep(3) continue else: From ef72a00726e8ff85c88089fec9e914de508727e4 Mon Sep 17 00:00:00 2001 From: Menghuan1918 Date: Fri, 29 Nov 2024 15:54:27 +0800 Subject: [PATCH 5/6] =?UTF-8?q?=E5=A2=9E=E5=BC=BA=E5=BC=B1=E7=BD=91?= =?UTF-8?q?=E7=8E=AF=E5=A2=83=E4=B8=8B=E7=9A=84=E7=A8=B3=E5=AE=9A=E6=80=A7?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- .../pdf_fns/parse_pdf_via_doc2x.py | 53 ++++++++++++++++--- 1 file changed, 46 insertions(+), 7 deletions(-) diff --git a/crazy_functions/pdf_fns/parse_pdf_via_doc2x.py b/crazy_functions/pdf_fns/parse_pdf_via_doc2x.py index 7d251066..64627d05 100644 --- a/crazy_functions/pdf_fns/parse_pdf_via_doc2x.py +++ b/crazy_functions/pdf_fns/parse_pdf_via_doc2x.py @@ -8,7 +8,42 @@ from loguru import logger import os import requests import time -import json + + +def retry_request(max_retries=3, delay=3): + """ + Decorator for retrying HTTP requests + Args: + max_retries: Maximum number of retry attempts + delay: Delay between retries in seconds + """ + + def decorator(func): + def wrapper(*args, **kwargs): + for attempt in range(max_retries): + try: + return func(*args, **kwargs) + except Exception as e: + if attempt < max_retries - 1: + logger.error( + f"Request failed, retrying... ({attempt + 1}/{max_retries}) Error: {e}" + ) + time.sleep(delay) + continue + raise e + return None + + return wrapper + + return decorator + + +@retry_request() +def make_request(method, url, **kwargs): + """ + Make HTTP request with retry mechanism + """ + return requests.request(method, url, **kwargs) def 状态检查(response, uid=""): @@ -54,7 +89,8 @@ def 解析PDF_DOC2X(pdf_file_path, format="tex"): # < ------ 第1步:预上传获取URL,然后上传文件 ------ > logger.info("Doc2x 上传文件:预上传获取URL") - res = requests.post( + res = make_request( + "POST", "https://v2.doc2x.noedgeai.com/api/v2/parse/preupload", headers={"Authorization": "Bearer " + doc2x_api_key}, timeout=15, @@ -65,7 +101,7 @@ def 解析PDF_DOC2X(pdf_file_path, format="tex"): logger.info("Doc2x 上传文件:上传文件") with open(pdf_file_path, "rb") as file: - res = requests.put(upload_url, data=file, timeout=60) + res = make_request("PUT", upload_url, data=file, timeout=60) res.raise_for_status() # < ------ 第2步:轮询等待 ------ > @@ -74,7 +110,8 @@ def 解析PDF_DOC2X(pdf_file_path, format="tex"): max_attempts = 60 attempt = 0 while attempt < max_attempts: - res = requests.get( + res = make_request( + "GET", "https://v2.doc2x.noedgeai.com/api/v2/parse/status", headers={"Authorization": "Bearer " + doc2x_api_key}, params=params, @@ -95,7 +132,8 @@ def 解析PDF_DOC2X(pdf_file_path, format="tex"): # < ------ 第3步:提交转化 ------ > logger.info("Doc2x 第3步:提交转化") data = {"uid": uuid, "to": format, "formula_mode": "dollar", "filename": "output"} - res = requests.post( + res = make_request( + "POST", "https://v2.doc2x.noedgeai.com/api/v2/convert/parse", headers={"Authorization": "Bearer " + doc2x_api_key}, json=data, @@ -109,7 +147,8 @@ def 解析PDF_DOC2X(pdf_file_path, format="tex"): max_attempts = 36 attempt = 0 while attempt < max_attempts: - res = requests.get( + res = make_request( + "GET", "https://v2.doc2x.noedgeai.com/api/v2/convert/parse/result", headers={"Authorization": "Bearer " + doc2x_api_key}, params=params, @@ -139,7 +178,7 @@ def 解析PDF_DOC2X(pdf_file_path, format="tex"): for attempt in range(max_attempt): try: result_url = res_data["url"] - res = requests.get(result_url, timeout=60) + res = make_request("GET", result_url, timeout=60) zip_path = os.path.join(target_path, gen_time_str() + ".zip") unzip_path = os.path.join(target_path, gen_time_str()) if res.status_code == 200: From 86ec3e365496093e9f9f7c6abea4ecae8e013e8a Mon Sep 17 00:00:00 2001 From: Menghuan1918 Date: Tue, 3 Dec 2024 23:28:54 +0800 Subject: [PATCH 6/6] =?UTF-8?q?=E4=BF=AE=E5=A4=8D=E6=A8=A1=E5=9E=8B?= =?UTF-8?q?=E4=B8=AD=5F=E6=97=A0=E6=B3=95=E6=98=BE=E7=A4=BA=E7=9A=84?= =?UTF-8?q?=E9=97=AE=E9=A2=98?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- crazy_functions/latex_fns/latex_actions.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/crazy_functions/latex_fns/latex_actions.py b/crazy_functions/latex_fns/latex_actions.py index a4486e95..63c6a644 100644 --- a/crazy_functions/latex_fns/latex_actions.py +++ b/crazy_functions/latex_fns/latex_actions.py @@ -300,7 +300,8 @@ def Latex精细分解与转化(file_manifest, project_folder, llm_kwargs, plugin write_html(pfg.sp_file_contents, pfg.sp_file_result, chatbot=chatbot, project_folder=project_folder) # <-------- 写出文件 ----------> - msg = f"当前大语言模型: {llm_kwargs['llm_model']},当前语言模型温度设定: {llm_kwargs['temperature']}。" + model_name = llm_kwargs['llm_model'].replace('_', '\\_') # 替换LLM模型名称中的下划线为转义字符 + msg = f"当前大语言模型: {model_name},当前语言模型温度设定: {llm_kwargs['temperature']}。" final_tex = lps.merge_result(pfg.file_result, mode, msg) objdump((lps, pfg.file_result, mode, msg), file=pj(project_folder,'merge_result.pkl'))