Merge 26b8d4d7ee into be83907394
This commit is contained in:
commit
feb9bb622b
|
|
@ -16,7 +16,7 @@ from crazy_functions.latex_fns.latex_pickle_io import objdump, objload
|
||||||
pj = os.path.join
|
pj = os.path.join
|
||||||
|
|
||||||
|
|
||||||
def split_subprocess(txt, project_folder, return_dict, opts):
|
def split_subprocess(txt: str, project_folder: str, return_dict, opts):
|
||||||
"""
|
"""
|
||||||
break down latex file to a linked list,
|
break down latex file to a linked list,
|
||||||
each node use a preserve flag to indicate whether it should
|
each node use a preserve flag to indicate whether it should
|
||||||
|
|
@ -30,26 +30,24 @@ def split_subprocess(txt, project_folder, return_dict, opts):
|
||||||
text, mask = set_forbidden_text(text, mask, r"^(.*?)\\begin{document}", re.DOTALL)
|
text, mask = set_forbidden_text(text, mask, r"^(.*?)\\begin{document}", re.DOTALL)
|
||||||
# 吸收iffalse注释
|
# 吸收iffalse注释
|
||||||
text, mask = set_forbidden_text(text, mask, r"\\iffalse(.*?)\\fi", re.DOTALL)
|
text, mask = set_forbidden_text(text, mask, r"\\iffalse(.*?)\\fi", re.DOTALL)
|
||||||
# 吸收在42行以内的begin-end组合
|
# 吸收在42行以内的begin-end组合 (为什么要吸收啊... 我的theorem基本都被剔除了)
|
||||||
text, mask = set_forbidden_text_begin_end(text, mask, r"\\begin\{([a-z\*]*)\}(.*?)\\end\{\1\}", re.DOTALL, limit_n_lines=42)
|
# text, mask = set_forbidden_text_begin_end(text, mask, r"\\begin\{([a-z\*]*)\}(.*?)\\end\{\1\}", re.DOTALL, limit_n_lines=42)
|
||||||
# 吸收匿名公式
|
# 吸收匿名公式
|
||||||
text, mask = set_forbidden_text(text, mask, [ r"\$\$([^$]+)\$\$", r"\\\[.*?\\\]" ], re.DOTALL)
|
text, mask = set_forbidden_text(text, mask, [ r"\$\$([^$]+)\$\$", r"\\\[.*?\\\]" ], re.DOTALL)
|
||||||
# 吸收其他杂项
|
# 吸收其他杂项
|
||||||
text, mask = set_forbidden_text(text, mask, [ r"\\section\{(.*?)\}", r"\\section\*\{(.*?)\}", r"\\subsection\{(.*?)\}", r"\\subsubsection\{(.*?)\}" ])
|
text, mask = set_forbidden_text(text, mask, [ r"\\section\*?\{(.*?)\}", r"\\subsection\{(.*?)\}", r"\\subsubsection\{(.*?)\}" ])
|
||||||
text, mask = set_forbidden_text(text, mask, [ r"\\bibliography\{(.*?)\}", r"\\bibliographystyle\{(.*?)\}" ])
|
text, mask = set_forbidden_text(text, mask, [ r"\\bibliography\{(.*?)\}", r"\\bibliographystyle\{(.*?)\}" ])
|
||||||
text, mask = set_forbidden_text(text, mask, r"\\begin\{thebibliography\}.*?\\end\{thebibliography\}", re.DOTALL)
|
for environment in [
|
||||||
text, mask = set_forbidden_text(text, mask, r"\\begin\{lstlisting\}(.*?)\\end\{lstlisting\}", re.DOTALL)
|
"thebibliography",
|
||||||
text, mask = set_forbidden_text(text, mask, r"\\begin\{wraptable\}(.*?)\\end\{wraptable\}", re.DOTALL)
|
"wraptable", "table", "tabular",
|
||||||
text, mask = set_forbidden_text(text, mask, r"\\begin\{algorithm\}(.*?)\\end\{algorithm\}", re.DOTALL)
|
"wrapfigure", "figure", "tikzpicture",
|
||||||
text, mask = set_forbidden_text(text, mask, [r"\\begin\{wrapfigure\}(.*?)\\end\{wrapfigure\}", r"\\begin\{wrapfigure\*\}(.*?)\\end\{wrapfigure\*\}"], re.DOTALL)
|
"lstlisting", "algorithm", "algorithmic",
|
||||||
text, mask = set_forbidden_text(text, mask, [r"\\begin\{figure\}(.*?)\\end\{figure\}", r"\\begin\{figure\*\}(.*?)\\end\{figure\*\}"], re.DOTALL)
|
"multline", "align", "equation",
|
||||||
text, mask = set_forbidden_text(text, mask, [r"\\begin\{multline\}(.*?)\\end\{multline\}", r"\\begin\{multline\*\}(.*?)\\end\{multline\*\}"], re.DOTALL)
|
"minipage",
|
||||||
text, mask = set_forbidden_text(text, mask, [r"\\begin\{table\}(.*?)\\end\{table\}", r"\\begin\{table\*\}(.*?)\\end\{table\*\}"], re.DOTALL)
|
]:
|
||||||
text, mask = set_forbidden_text(text, mask, [r"\\begin\{minipage\}(.*?)\\end\{minipage\}", r"\\begin\{minipage\*\}(.*?)\\end\{minipage\*\}"], re.DOTALL)
|
text, mask = set_forbidden_text(text, mask, rf"\\begin\{{({environment}\*?)\}}(.*?)\\end\{{\1\}}", re.DOTALL)
|
||||||
text, mask = set_forbidden_text(text, mask, [r"\\begin\{align\*\}(.*?)\\end\{align\*\}", r"\\begin\{align\}(.*?)\\end\{align\}"], re.DOTALL)
|
|
||||||
text, mask = set_forbidden_text(text, mask, [r"\\begin\{equation\}(.*?)\\end\{equation\}", r"\\begin\{equation\*\}(.*?)\\end\{equation\*\}"], re.DOTALL)
|
|
||||||
text, mask = set_forbidden_text(text, mask, [r"\\includepdf\[(.*?)\]\{(.*?)\}", r"\\clearpage", r"\\newpage", r"\\appendix", r"\\tableofcontents", r"\\include\{(.*?)\}"])
|
text, mask = set_forbidden_text(text, mask, [r"\\includepdf\[(.*?)\]\{(.*?)\}", r"\\clearpage", r"\\newpage", r"\\appendix", r"\\tableofcontents", r"\\include\{(.*?)\}"])
|
||||||
text, mask = set_forbidden_text(text, mask, [r"\\vspace\{(.*?)\}", r"\\hspace\{(.*?)\}", r"\\label\{(.*?)\}", r"\\begin\{(.*?)\}", r"\\end\{(.*?)\}", r"\\item "])
|
text, mask = set_forbidden_text(text, mask, [r"\\vspace\{(.*?)\}", r"\\hspace\{(.*?)\}", r"\\label\{(.*?)\}", r"\\begin\{(.*?)\}", r"\\end\{(.*?)\}", r"\\item\s?"])
|
||||||
text, mask = set_forbidden_text_careful_brace(text, mask, r"\\hl\{(.*?)\}", re.DOTALL)
|
text, mask = set_forbidden_text_careful_brace(text, mask, r"\\hl\{(.*?)\}", re.DOTALL)
|
||||||
# reverse 操作必须放在最后
|
# reverse 操作必须放在最后
|
||||||
text, mask = reverse_forbidden_text_careful_brace(text, mask, r"\\caption\{(.*?)\}", re.DOTALL, forbid_wrapper=True)
|
text, mask = reverse_forbidden_text_careful_brace(text, mask, r"\\caption\{(.*?)\}", re.DOTALL, forbid_wrapper=True)
|
||||||
|
|
@ -97,13 +95,21 @@ class LatexPaperSplit():
|
||||||
self.title = "unknown"
|
self.title = "unknown"
|
||||||
self.abstract = "unknown"
|
self.abstract = "unknown"
|
||||||
|
|
||||||
# NOTE: both functions below are methods of LatexPaperSplit; the class header
# lies outside this excerpt and the excerpt carries no indentation.
def shrink_spaces(self, text: str):
    """Collapse LaTeX line breaks and whitespace runs into single spaces.

    A LaTeX line break ``\\\\`` -- together with its optional ``*`` and
    ``[length]`` suffix, e.g. ``\\\\*`` or ``\\\\[2pt]`` -- is replaced by
    one space. Afterwards every run of whitespace (spaces, tabs, newlines)
    is collapsed into a single space.

    Leading and trailing whitespace is collapsed but not stripped.

    Args:
        text: Raw LaTeX snippet, e.g. the contents of a title or abstract.

    Returns:
        The snippet flattened onto a single line.
    """
    # Remove the whole line-break command, including the optional star and
    # the optional bracketed length, so "[2pt]" does not leak into the text.
    text = re.sub(r"\\\\\*?(?:\[[^\]\n]*\])?", " ", text)
    return re.sub(r"\s+", " ", text)


def read_title_and_abstract(self, txt: str):
    """Extract the title and abstract from ``txt`` and store them on ``self``.

    ``self.title`` / ``self.abstract`` are overwritten only when
    ``find_title_and_abs`` returns a non-None value for them. The lookup is
    best-effort: if it raises, the current values are left untouched.

    Args:
        txt: Full LaTeX source to search.
    """
    try:
        title, abstract = find_title_and_abs(txt)
        if title is not None:
            self.title = self.shrink_spaces(title)
        if abstract is not None:
            self.abstract = self.shrink_spaces(abstract)
    except Exception:
        # Deliberately best-effort: a failed lookup must not abort the split.
        # Unlike a bare ``except``, KeyboardInterrupt/SystemExit still propagate.
        pass
|
||||||
|
|
||||||
|
|
@ -147,7 +153,7 @@ class LatexPaperSplit():
|
||||||
return result_string
|
return result_string
|
||||||
|
|
||||||
|
|
||||||
def split(self, txt, project_folder, opts):
|
def split(self, txt: str, project_folder: str, opts: list):
|
||||||
"""
|
"""
|
||||||
break down latex file to a linked list,
|
break down latex file to a linked list,
|
||||||
each node use a preserve flag to indicate whether it should
|
each node use a preserve flag to indicate whether it should
|
||||||
|
|
@ -215,7 +221,7 @@ class LatexPaperFileGroup():
|
||||||
return manifest
|
return manifest
|
||||||
|
|
||||||
|
|
||||||
def Latex精细分解与转化(file_manifest, project_folder, llm_kwargs, plugin_kwargs, chatbot, history, system_prompt, mode='proofread', switch_prompt=None, opts=[]):
|
def Latex精细分解与转化(file_manifest: list[str], project_folder: str, llm_kwargs: dict[str,str], plugin_kwargs: dict[str,str], chatbot, history:list[str], system_prompt, mode='proofread', switch_prompt=None, opts=[]):
|
||||||
import time, os, re
|
import time, os, re
|
||||||
from ..crazy_utils import request_gpt_model_multi_threads_with_very_awesome_ui_and_high_efficiency
|
from ..crazy_utils import request_gpt_model_multi_threads_with_very_awesome_ui_and_high_efficiency
|
||||||
from .latex_actions import LatexPaperFileGroup, LatexPaperSplit
|
from .latex_actions import LatexPaperFileGroup, LatexPaperSplit
|
||||||
|
|
|
||||||
|
|
@ -153,9 +153,9 @@ Latex segmentation with a binary mask (PRESERVE=0, TRANSFORM=1)
|
||||||
def set_forbidden_text(text, mask, pattern, flags=0):
|
def set_forbidden_text(text, mask, pattern, flags=0):
|
||||||
"""
|
"""
|
||||||
Add a preserve text area in this paper
|
Add a preserve text area in this paper
|
||||||
e.g. with pattern = r"\\begin\{algorithm\}(.*?)\\end\{algorithm\}"
|
e.g. with pattern = r"\\begin\\{algorithm\\}(.*?)\\end\\{algorithm\\}"
|
||||||
you can mask out (mask = PRESERVE so that text become untouchable for GPT)
|
you can mask out (mask = PRESERVE so that text become untouchable for GPT)
|
||||||
everything between "\begin{equation}" and "\end{equation}"
|
everything between "\\begin{equation}" and "\\end{equation}"
|
||||||
"""
|
"""
|
||||||
if isinstance(pattern, list):
|
if isinstance(pattern, list):
|
||||||
pattern = "|".join(pattern)
|
pattern = "|".join(pattern)
|
||||||
|
|
@ -331,18 +331,59 @@ def find_main_tex_file(file_manifest, mode):
|
||||||
return candidates[select]
|
return candidates[select]
|
||||||
|
|
||||||
|
|
||||||
# Group 1 captures an even-length run of backslashes (complete ``\\`` line
# breaks) directly in front of the ``%``; an odd-length run means the percent
# sign is the escaped literal ``\%`` and the pattern does not match at all.
# The optional tail swallows the newline plus the next line's indentation,
# but only when that next line has visible content -- a blank line is a
# paragraph break and has to survive.
_INLINE_COMMENT_RE = re.compile(r"(?<!\\)((?:\\\\)*)%.*(?:\n[ \t\r\f\v]*(?=\S))?")
_COMMENT_ENV_TAG_RE = re.compile(r"\\begin\{comment\}|\\end\{comment\}")


def rm_comments_inline(content: str):
    """Remove every ``%`` comment from ``content``.

    For each unescaped ``%`` the rest of the line is dropped. When the next
    line has visible content, the newline and that line's leading
    indentation are dropped as well, so the two lines are joined. When the
    next line is blank, or the comment is on the last line, only the
    comment text is removed and the line structure is kept.

    Backslashes that precede the ``%`` are always preserved: ``\\%`` is left
    alone, while for a ``\\\\`` line break followed by ``%`` only the comment
    is removed.

    Args:
        content: LaTeX source text.

    Returns:
        The text with ``%`` comments stripped.
    """
    return _INLINE_COMMENT_RE.sub(lambda found: found.group(1), content)


def rm_comments_block(content: str):
    """Remove every ``\\begin{comment} ... \\end{comment}`` region.

    Nested regions are tracked with a depth counter; everything between the
    outermost pair, tags included, is dropped.

    Args:
        content: LaTeX source text.

    Returns:
        The text outside of all comment regions, concatenated in order.

    Raises:
        ValueError: If a ``\\begin{comment}`` or ``\\end{comment}`` tag has
            no matching partner.
    """
    kept: list[str] = []
    depth = 0   # number of currently open comment environments
    cursor = 0  # index just past the most recently seen tag

    for tag in _COMMENT_ENV_TAG_RE.finditer(content):
        # Text in front of this tag is kept only when it lies outside of
        # every comment environment.
        if depth == 0:
            kept.append(content[cursor:tag.start()])

        if tag.group() == r"\begin{comment}":
            depth += 1
        elif depth > 0:
            depth -= 1
        else:
            raise ValueError("\\end{comment}没有匹配的\\begin{comment}")

        cursor = tag.end()

    if depth > 0:
        raise ValueError("\\begin{comment}没有匹配的\\end{comment}")

    # Whatever follows the last tag (or the whole text when no tag exists).
    kept.append(content[cursor:])
    return "".join(kept)


def rm_comments(main_file: str):
    """Remove the comments from a TeX source string.

    Two kinds of comment are handled:

    - ``%`` comments, stripped by ``rm_comments_inline``.
    - ``\\begin{comment} ... \\end{comment}`` regions, stripped afterwards
      by ``rm_comments_block``.

    ``\\iffalse ... \\fi`` regions are not touched here.

    Known limitation: a ``%`` that appears inside a verbatim-like
    environment is still treated as the start of a comment.

    Args:
        main_file: Contents of the TeX file.

    Returns:
        The contents with both comment kinds removed.

    Raises:
        ValueError: Propagated from ``rm_comments_block`` when the comment
            environment tags are unbalanced.
    """
    return rm_comments_block(rm_comments_inline(main_file))
|
||||||
|
|
||||||
|
|
||||||
def find_tex_file_ignore_case(fp):
|
def find_tex_file_ignore_case(fp):
|
||||||
|
|
|
||||||
Loading…
Reference in New Issue