From 70ee810133e9d210f26af80f82aa12507cfa5885 Mon Sep 17 00:00:00 2001
From: qingxu fu <505030475@qq.com>
Date: Sat, 3 Jun 2023 19:39:19 +0800
Subject: [PATCH] improve success rate
---
crazy_functions/latex_utils.py | 69 +++++++++++++++++++++++++++-------
1 file changed, 55 insertions(+), 14 deletions(-)
diff --git a/crazy_functions/latex_utils.py b/crazy_functions/latex_utils.py
index 3fbbf9b..e3d5113 100644
--- a/crazy_functions/latex_utils.py
+++ b/crazy_functions/latex_utils.py
@@ -5,6 +5,10 @@ import re
pj = os.path.join
def 寻找Latex主文件(file_manifest, mode):
+ """
+ 在多Tex文档中,寻找主文件,必须包含documentclass,返回找到的第一个。
+ P.S. 但愿没人把latex模板放在里面传进来
+ """
for texf in file_manifest:
if os.path.basename(texf).startswith('merge'):
continue
@@ -17,6 +21,9 @@ def 寻找Latex主文件(file_manifest, mode):
raise RuntimeError('无法找到一个主Tex文件(包含documentclass关键字)')
def merge_tex_files_(project_foler, main_file, mode):
+ """
+ 递归地把多Tex工程整合为一个Tex文档
+ """
for s in reversed([q for q in re.finditer(r"\\input\{(.*?)\}", main_file, re.M)]):
f = s.group(1)
fp = os.path.join(project_foler, f)
@@ -33,38 +40,56 @@ def merge_tex_files_(project_foler, main_file, mode):
return main_file
def merge_tex_files(project_foler, main_file, mode):
+ """
+ 递归地把多Tex工程整合为一个Tex文档(递归外层)
+ P.S. 顺便把CTEX塞进去以支持中文
+ P.S. 顺便把Latex的注释去除
+ """
main_file = merge_tex_files_(project_foler, main_file, mode)
-
if mode == 'translate_zh':
pattern = re.compile(r'\\documentclass.*\n')
match = pattern.search(main_file)
position = match.end()
main_file = main_file[:position] + '\\usepackage{CTEX}\n\\usepackage{url}\n' + main_file[position:]
-
+ new_file_remove_comment_lines = []
+ for l in main_file.splitlines():
+ # 删除整行的空注释
+ if l.startswith("%") or (l.startswith(" ") and l.lstrip().startswith("%")):
+ pass
+ else:
+ new_file_remove_comment_lines.append(l)
+ main_file = '\n'.join(new_file_remove_comment_lines)
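+ main_file = re.sub(r'(?<!\\)%.*', '', main_file)
return main_file
-class LinkTable():
+class LinkedListNode():
def __init__(self, string, preserve=True) -> None: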
self.string = string
self.preserve = preserve
self.next = None
+
def mod_inbraket(match):
+ """
+ 为啥chatgpt会把cite里面的逗号换成中文逗号呀 艹
+ """
# get the matched string
cmd = match.group(1)
str_to_modify = match.group(2)
-
# modify the matched string
str_to_modify = str_to_modify.replace(':', ':') # 前面是中文冒号,后面是英文冒号
str_to_modify = str_to_modify.replace(',', ',') # 前面是中文逗号,后面是英文逗号
# str_to_modify = 'BOOM'
- # return the modified string as the replacement
return "\\" + cmd + "{" + str_to_modify + "}"
def fix_content(final_tex, node_string):
"""
- fix common GPT errors to increase success rate
+ Fix common GPT errors to increase success rate
"""
final_tex = final_tex.replace('%', r'\%')
final_tex = final_tex.replace(r'\%', r'\\%')
@@ -74,10 +99,19 @@ def fix_content(final_tex, node_string):
return final_tex
class LatexPaperSplit():
+ """
+ 将Latex文档分解到一个链表中,每个链表节点用preserve的标志位提示它是否应当被GPT处理
+ """
def __init__(self) -> None:
+ """
+ root是链表的根节点
+ """
self.root = None
def merge_result(self, arr, mode, msg):
+ """
+ 将GPT处理后的结果融合
+ """
result_string = ""
node = self.root
p = 0
@@ -105,8 +139,10 @@ class LatexPaperSplit():
return result_string
def split(self, txt):
- # def replace_with_hash()
- root = LinkTable(txt, False)
+ """
+ 将Latex文档分解到一个链表中,每个链表节点用preserve的标志位提示它是否应当被GPT处理
+ """
+ root = LinkedListNode(txt, False)
def split_worker(root, pattern, flags=0):
lt = root
cnt = 0
@@ -131,10 +167,10 @@ class LatexPaperSplit():
lt.string = before
tmp = lt.next
# ======
- mid = LinkTable(this, True)
+ mid = LinkedListNode(this, True)
lt.next = mid
# ======
- aft = LinkTable(after, False)
+ aft = LinkedListNode(after, False)
mid.next = aft
aft.next = tmp
# ======
@@ -152,6 +188,8 @@ class LatexPaperSplit():
split_worker(root, r"\\subsubsection\{(.*?)\}")
split_worker(root, r"\\bibliography\{(.*?)\}")
split_worker(root, r"\\bibliographystyle\{(.*?)\}")
+ split_worker(root, r"\\begin\{lstlisting\}(.*?)\\end\{lstlisting\}", re.DOTALL)
+ split_worker(root, r"\\begin\{algorithm\}(.*?)\\end\{algorithm\}", re.DOTALL)
split_worker(root, r"\\begin\{wrapfigure\}(.*?)\\end\{wrapfigure\}", re.DOTALL)
split_worker(root, r"\\begin\{wrapfigure\*\}(.*?)\\end\{wrapfigure\*\}", re.DOTALL)
split_worker(root, r"\\begin\{figure\}(.*?)\\end\{figure\}", re.DOTALL)
@@ -178,13 +216,17 @@ class LatexPaperSplit():
node = node.next
if node is None: break
- with open('debug_log', 'w', encoding='utf8') as f:
+ # 将分解结果返回 res_to_t
+ with open('debug_log.html', 'w', encoding='utf8') as f:
res_to_t = []
node = root
while True:
+ show_html = node.string.replace('\n','<br/>')
if not node.preserve:
res_to_t.append(node.string)
- f.write(node.string)
+ f.write(f'<p style="color:black;">{show_html}</p>')
+ else:
+ f.write(f'<p style="color:red;">{show_html}</p>')
node = node.next
if node is None: break
@@ -260,7 +302,6 @@ def Latex精细分解与转化(file_manifest, project_folder, llm_kwargs, plugin
with open(maintex, 'r', encoding='utf-8', errors='replace') as f:
content = f.read()
merged_content = merge_tex_files(project_folder, content, mode)
- merged_content = re.sub(r'(?<!\\)%.*', '', merged_content)