From a7c50a49970013b68a69e13672723fa52f099bca Mon Sep 17 00:00:00 2001 From: xunge <1063563945@qq.com> Date: Wed, 15 Jan 2025 11:32:46 +0800 Subject: [PATCH] conda_init --- crazy_functional.py | 78 ++++++++++++++++---------------- crazy_functions/PDF_Convert.py | 57 ++++++++++++------------ crazy_functions/crazy_utils.py | 81 +++++++++++++++++++++++++--------- 3 files changed, 126 insertions(+), 90 deletions(-) diff --git a/crazy_functional.py b/crazy_functional.py index 5c96dbb4..d88dd547 100644 --- a/crazy_functional.py +++ b/crazy_functional.py @@ -2,6 +2,7 @@ from toolbox import HotReload # HotReload 的意思是热更新,修改函数 from toolbox import trimmed_format_exc from loguru import logger + def get_crazy_functions(): from crazy_functions.读文章写摘要 import 读文章写摘要 from crazy_functions.生成函数注释 import 批量生成函数注释 @@ -50,7 +51,6 @@ def get_crazy_functions(): from crazy_functions.SourceCode_Comment import 注释Python项目 from crazy_functions.SourceCode_Comment_Wrap import SourceCodeComment_Wrap from crazy_functions.VideoResource_GPT import 多媒体任务 - from crazy_functions.PDF_Convert import 解析PDF文档 function_plugins = { "多媒体智能体": { @@ -106,7 +106,7 @@ def get_crazy_functions(): "Group": "对话", "Color": "stop", "AsButton": False, - "Info" : "基于当前对话或文件生成多种Mermaid图表,图表类型由模型判断", + "Info": "基于当前对话或文件生成多种Mermaid图表,图表类型由模型判断", "Function": None, "Class": Mermaid_Gen }, @@ -116,7 +116,7 @@ def get_crazy_functions(): "AsButton": True, "Info": "Arixv论文精细翻译 | 输入参数arxiv论文的ID,比如1812.10695", "Function": HotReload(Latex翻译中文并重新编译PDF), # 当注册Class后,Function旧接口仅会在“虚空终端”中起作用 - "Class": Arxiv_Localize, # 新一代插件需要注册Class + "Class": Arxiv_Localize, # 新一代插件需要注册Class }, "批量总结Word文档": { "Group": "学术", @@ -230,8 +230,8 @@ def get_crazy_functions(): "Color": "stop", "AsButton": True, "Info": "保存当前的对话 | 不需要输入参数", - "Function": HotReload(对话历史存档), # 当注册Class后,Function旧接口仅会在“虚空终端”中起作用 - "Class": Conversation_To_File_Wrap # 新一代插件需要注册Class + "Function": HotReload(对话历史存档), # 当注册Class后,Function旧接口仅会在“虚空终端”中起作用 + "Class": Conversation_To_File_Wrap # 新一代插件需要注册Class }, "[多线程Demo]解析此项目本身(源码自译解)": { "Group": "对话|编程", @@ -246,7 +246,7 @@ def get_crazy_functions(): "AsButton": True, # 加入下拉菜单中 # "Info": "连接网络回答问题(需要访问谷歌)| 输入参数是一个问题", "Function": HotReload(连接网络回答问题), - "Class": NetworkGPT_Wrap # 新一代插件需要注册Class + "Class": NetworkGPT_Wrap # 新一代插件需要注册Class }, "历史上的今天": { "Group": "对话", @@ -254,14 +254,14 @@ def get_crazy_functions(): "AsButton": False, "Info": "查看历史上的今天事件 (这是一个面向开发者的插件Demo) | 不需要输入参数", "Function": None, - "Class": Demo_Wrap, # 新一代插件需要注册Class + "Class": Demo_Wrap, # 新一代插件需要注册Class }, "精准翻译PDF论文": { "Group": "学术", "Color": "stop", "AsButton": True, "Info": "精准翻译PDF论文为中文 | 输入参数为路径", - "Function": HotReload(批量翻译PDF文档), # 当注册Class后,Function旧接口仅会在“虚空终端”中起作用 + "Function": HotReload(批量翻译PDF文档), # 当注册Class后,Function旧接口仅会在“虚空终端”中起作用 "Class": PDF_Tran, # 新一代插件需要注册Class }, "询问多个GPT模型": { @@ -355,7 +355,7 @@ def get_crazy_functions(): r'If the term "agent" is used in this section, it should be translated to "智能体". ', "Info": "Arixv论文精细翻译 | 输入参数arxiv论文的ID,比如1812.10695", "Function": HotReload(Latex翻译中文并重新编译PDF), # 当注册Class后,Function旧接口仅会在“虚空终端”中起作用 - "Class": Arxiv_Localize, # 新一代插件需要注册Class + "Class": Arxiv_Localize, # 新一代插件需要注册Class }, "📚本地Latex论文精细翻译(上传Latex项目)[需Latex]": { "Group": "学术", @@ -377,16 +377,8 @@ def get_crazy_functions(): r"例如当单词'agent'翻译不准确时, 请尝试把以下指令复制到高级参数区: " r'If the term "agent" is used in this section, it should be translated to "智能体". ', "Info": "PDF翻译中文,并重新编译PDF | 输入参数为路径", - "Function": HotReload(PDF翻译中文并重新编译PDF), # 当注册Class后,Function旧接口仅会在“虚空终端”中起作用 - "Class": PDF_Localize # 新一代插件需要注册Class - }, - "解析PDF文档": { - "Group": "学术", - "Color": "stop", - "AsButton": False, - "AdvancedArgs": True, - "Info": "PDF解析", - "Function": HotReload(解析PDF文档), # 当注册Class后,Function旧接口仅会在“虚空终端”中起作用 + "Function": HotReload(PDF翻译中文并重新编译PDF), # 当注册Class后,Function旧接口仅会在“虚空终端”中起作用 + "Class": PDF_Localize # 新一代插件需要注册Class } } @@ -397,7 +389,7 @@ def get_crazy_functions(): "Color": "stop", "AsButton": False, "Info": "使用 DALLE2/DALLE3 生成图片 | 输入参数字符串,提供图像的内容", - "Function": HotReload(图片生成_DALLE2), # 当注册Class后,Function旧接口仅会在“虚空终端”中起作用 + "Function": HotReload(图片生成_DALLE2), # 当注册Class后,Function旧接口仅会在“虚空终端”中起作用 "Class": ImageGen_Wrap # 新一代插件需要注册Class }, } @@ -416,14 +408,6 @@ def get_crazy_functions(): } ) - - - - - - - - # -=--=- 尚未充分测试的实验性插件 & 需要额外依赖的插件 -=--=- try: from crazy_functions.下载arxiv论文翻译摘要 import 下载arxiv论文并翻译摘要 @@ -484,7 +468,8 @@ def get_crazy_functions(): "Color": "stop", "AsButton": False, "AdvancedArgs": True, # 调用时,唤起高级参数输入区(默认False) - "ArgsReminder": '输入时用逗号隔开, *代表通配符, 加了^代表不匹配; 不输入代表全部匹配。例如: "*.c, ^*.cpp, config.toml, ^*.toml"', # 高级参数输入区的显示提示 + "ArgsReminder": '输入时用逗号隔开, *代表通配符, 加了^代表不匹配; 不输入代表全部匹配。例如: "*.c, ^*.cpp, config.toml, ^*.toml"', + # 高级参数输入区的显示提示 "Function": HotReload(解析任意code项目), }, } @@ -512,8 +497,6 @@ def get_crazy_functions(): logger.error(trimmed_format_exc()) logger.error("Load function plugin failed") - - try: from crazy_functions.总结音视频 import 总结音视频 @@ -626,7 +609,6 @@ def get_crazy_functions(): logger.error(trimmed_format_exc()) logger.error("Load function plugin failed") - try: from toolbox import get_conf @@ -735,6 +717,23 @@ def get_crazy_functions(): logger.error(trimmed_format_exc()) logger.error("Load function plugin failed") + try: + from crazy_functions.PDF_Convert import 解析PDF文档 + + function_plugins.update({ + "解析PDF到md(MinerU)": { + "Group": "学术", + "Color": "stop", + "AsButton": False, + "AdvancedArgs": True, + "ArgsReminder": "请输入Conda环境名称,默认为“MinerU”", + "Info": "上传PDF,并转换为Markdown | 输入参数为Anaconda环境名称", + "Function": HotReload(解析PDF文档), + } + }) + except: + logger.error(trimmed_format_exc()) + logger.error("Load function plugin failed") # try: # from crazy_functions.高级功能函数模板 import 测试图表渲染 @@ -750,7 +749,6 @@ def get_crazy_functions(): # logger.error(trimmed_format_exc()) # print('Load function plugin failed') - """ 设置默认值: - 默认 Group = 对话 @@ -771,8 +769,6 @@ def get_crazy_functions(): return function_plugins - - def get_multiplex_button_functions(): """多路复用主提交按钮的功能映射 """ @@ -780,12 +776,12 @@ def get_multiplex_button_functions(): "常规对话": "", - "多模型对话": - "询问多个GPT模型", # 映射到上面的 `询问多个GPT模型` 插件 + "多模型对话": + "询问多个GPT模型", # 映射到上面的 `询问多个GPT模型` 插件 - "智能召回 RAG": - "Rag智能召回", # 映射到上面的 `Rag智能召回` 插件 + "智能召回 RAG": + "Rag智能召回", # 映射到上面的 `Rag智能召回` 插件 - "多媒体查询": - "多媒体智能体", # 映射到上面的 `多媒体智能体` 插件 + "多媒体查询": + "多媒体智能体", # 映射到上面的 `多媒体智能体` 插件 } diff --git a/crazy_functions/PDF_Convert.py b/crazy_functions/PDF_Convert.py index d01fed82..3bff105d 100644 --- a/crazy_functions/PDF_Convert.py +++ b/crazy_functions/PDF_Convert.py @@ -9,17 +9,25 @@ import copy import os import math import logging - +import time + @CatchException def 解析PDF文档(txt, llm_kwargs, plugin_kwargs, chatbot, history, system_prompt, user_request): - disable_auto_promotion(chatbot) # 基本信息:功能、贡献者 chatbot.append([ "函数插件功能?", - "解析PDF文档。函数插件贡献者: Xunge-Jiang"]) - yield from update_ui(chatbot=chatbot, history=history) # 刷新界面 + "使用`MinerU`解析PDF文档到Markdown。(支持版本-1.0.1)\n\n" + "由于MinerU环境与gpt_academic冲突,需要事先创建好名字为`MinerU`的Conda环境。\n\n" + "安装命令如下:\n\n" + "```sh\n" + "conda create -n MinerU python=3.10\n" + "conda activate MinerU\n" + "pip install -U 'magic-pdf[full]' --extra-index-url https://wheels.myhloli.com\n```\n\n" + "默认使用CPU,使用GPU加速至少需要8GB显存,需要修改 `~/magic-pdf.json` 中的 `device-mode` 为 `cuda`\n\n" + "函数插件贡献者: Xunge-Jiang"]) + yield from update_ui(chatbot=chatbot, history=history) # 刷新界面 # 清空历史,以免输入溢出 history = [] @@ -40,39 +48,32 @@ def 解析PDF文档(txt, llm_kwargs, plugin_kwargs, chatbot, history, system_pro if len(file_manifest) == 0: report_exception(chatbot, history, a=f"解析项目: {txt}", b=f"找不到任何.pdf拓展名的文件: {txt}") - yield from update_ui(chatbot=chatbot, history=history) # 刷新界面 + yield from update_ui(chatbot=chatbot, history=history) # 刷新界面 return # 开始正式执行任务 - yield from 解析PDF_基于MinerU(file_manifest, project_folder, llm_kwargs, plugin_kwargs, chatbot, history, system_prompt) + yield from 解析PDF_基于MinerU(file_manifest, project_folder, llm_kwargs, plugin_kwargs, chatbot, history, + system_prompt) def 解析PDF_基于MinerU(file_manifest, project_folder, llm_kwargs, plugin_kwargs, chatbot, history, system_prompt): - import copy - import tiktoken - TOKEN_LIMIT_PER_FRAGMENT = 1024 - generated_conclusion_files = [] - generated_html_files = [] - DST_LANG = "中文" from crazy_functions.crazy_utils import mineru_interface - from crazy_functions.pdf_fns.report_gen_html import construct_html mineru_handle = mineru_interface() for index, fp in enumerate(file_manifest): if fp.endswith('pdf'): - chatbot.append(["当前进度:", f"正在解析论文,请稍候。"]); yield from update_ui(chatbot=chatbot, history=history) # 刷新界面 - fpp = yield from mineru_handle.mineru_parse_pdf(fp, chatbot, history) - promote_file_to_downloadzone(fpp, rename_file=os.path.basename(fpp)+'.mineru.md', chatbot=chatbot) + chatbot.append(["当前进度:", f"正在解析论文,请稍候。"]) + yield from update_ui(chatbot=chatbot, history=history) # 刷新界面 + if ("advanced_arg" in plugin_kwargs) and (plugin_kwargs["advanced_arg"] == ""): plugin_kwargs.pop( + "advanced_arg") + conda_env = plugin_kwargs.get("advanced_arg", 'MinerU') + md_path, zip_path = yield from mineru_handle.mineru_parse_pdf(fp, chatbot, history, conda_env) + chatbot.append((f"成功啦", '请查收结果...')) + yield from update_ui(chatbot=chatbot, history=history) + time.sleep(1) # 刷新界面 + promote_file_to_downloadzone(md_path, rename_file=None, chatbot=chatbot) + promote_file_to_downloadzone(zip_path, rename_file=None, chatbot=chatbot) else: - chatbot.append(["当前论文无需解析:", fp]); yield from update_ui(chatbot=chatbot, history=history) - fpp = fp - # with open(fpp, 'r', encoding='utf8') as f: - # article_content = f.readlines() - # article_dict = markdown_to_dict(article_content) - # logging.info(article_dict) - # yield from translate_pdf(article_dict, llm_kwargs, chatbot, fp, generated_conclusion_files, TOKEN_LIMIT_PER_FRAGMENT, DST_LANG) - - # chatbot.append(("给出输出文件清单", str(generated_conclusion_files))) - - yield from update_ui(chatbot=chatbot, history=history) # 刷新界面 - + chatbot.append(["当前论文无法解析:", fp]); + yield from update_ui(chatbot=chatbot, history=history) + yield from update_ui(chatbot=chatbot, history=history) # 刷新界面 diff --git a/crazy_functions/crazy_utils.py b/crazy_functions/crazy_utils.py index 22eee624..df6983ca 100644 --- a/crazy_functions/crazy_utils.py +++ b/crazy_functions/crazy_utils.py @@ -637,27 +637,14 @@ class mineru_interface(): def __init__(self): self.threadLock = threading.Lock() - def mineru_with_timeout(self, command, cwd, timeout=3600, conda_env="mineru", conda_init="/home/jiang/anaconda3/etc/profile.d/conda.sh"): + def mineru_with_timeout(self, command, cwd, timeout=3600): import subprocess from toolbox import ProxyNetworkActivate - logger.info(f"正在执行命令 {command} 在 Conda 环境 '{conda_env}' 中") - - import shlex - # 确保命令中的参数安全(转义空格等特殊字符) - safe_command = ' '.join([shlex.quote(arg) for arg in command]) - print('safe_command', safe_command) - - # 构造激活 Conda 环境的命令 - activate_command = ( - f"source {conda_init} && " - f"conda activate {conda_env} && " - f"CUDA_VISIBLE_DEVICES=1 {safe_command}" - ) try: with ProxyNetworkActivate("MinerU"): process = subprocess.Popen( - activate_command, + command, shell=True, cwd=cwd, env=os.environ, @@ -677,8 +664,51 @@ class mineru_interface(): return False return True + def compress_to_zip(self, dst): + """ + 将指定路径 dst 压缩为 zip 文件,并返回压缩文件的路径。 + """ + import shutil + from pathlib import Path - def mineru_parse_pdf(self, fp, chatbot, history): + dst_path = Path(dst).resolve() # 解析为绝对路径 + dst_parent = dst_path.parent # 父目录 + dst_name = dst_path.name # 目录名 + + # 设置压缩文件存储路径(与 dst 的父目录相同) + zip_path = dst_parent / f"{dst_name}.zip" + shutil.make_archive(base_name=str(zip_path.with_suffix('')), + format='zip', + root_dir=str(dst_path)) + + return str(zip_path) + + def get_conda_activate_command(self): + # 构造激活 Conda 环境的命令 + conda_prefix = os.environ.get("CONDA_PREFIX") + conda_path_split = conda_prefix.split('/') + conda_base_path = '/'.join(conda_path_split[:4]) + + import platform + # 检测操作系统 + system = platform.system() + if system == "Windows": + # Windows 下的 conda 初始化脚本 + conda_init = os.path.join(conda_base_path, "condabin", "conda.bat") + if not os.path.exists(conda_init): + self.threadLock.release() + raise FileNotFoundError(f"找不到 conda 初始化脚本: {conda_init}") + activate_command = f'call "{conda_init}" activate ' + else: + # Linux/Mac 下的 conda 初始化脚本 + conda_init = os.path.join(conda_base_path, "etc", "profile.d", "conda.sh") + if not os.path.exists(conda_init): + self.threadLock.release() + raise FileNotFoundError(f"找不到 conda 初始化脚本: {conda_init}") + activate_command = f"source {conda_init} && conda activate " + return activate_command + + def mineru_parse_pdf(self, fp, chatbot, history, conda_env): from toolbox import update_ui_lastest_msg yield from update_ui_lastest_msg("正在解析论文, 请稍候。进度:正在排队, 等待线程锁...", @@ -692,20 +722,29 @@ class mineru_interface(): yield from update_ui_lastest_msg("正在解析论文, 请稍候。进度:正在加载MinerU... ", chatbot=chatbot, history=history, delay=0) command = ['magic-pdf', '-p', os.path.abspath(fp), '-o', os.path.abspath(dst), ] - self.mineru_with_timeout(command, cwd=os.getcwd(), timeout=3600) + import shlex + # 确保命令中的参数安全(转义空格等特殊字符) + safe_command = ' '.join([shlex.quote(arg) for arg in command]) + yield from update_ui_lastest_msg(f"正在执行命令 {safe_command} 在 Conda 环境 '{conda_env}' 中。", + chatbot=chatbot, history=history, delay=0) + activate_command = self.get_conda_activate_command() + activate_command += f"{conda_env} && CUDA_VISIBLE_DEVICES=0 {safe_command}" + logger.info(f"正在执行命令 {activate_command}") + self.mineru_with_timeout(activate_command, cwd=os.getcwd(), timeout=3600) pdf_name = os.path.basename(fp) # 去掉后缀 name_without_ext = os.path.splitext(pdf_name)[0] + new_dst_dir = os.path.join(dst, name_without_ext) - res = glob.glob(os.path.join(dst, name_without_ext, 'auto', '*.md')) - + res = glob.glob(os.path.join(new_dst_dir, 'auto', '*.md')) if len(res) == 0: self.threadLock.release() raise RuntimeError("MinerU解析论文失败。") self.threadLock.release() - return res[0] - + md_path = res[0] + zip_path = self.compress_to_zip(new_dst_dir) + return md_path, zip_path def try_install_deps(deps, reload_m=[]):