diff --git a/crazy_functions/Latex输出PDF结果.py b/crazy_functions/Latex输出PDF结果.py
index ecba82b..855cc1c 100644
--- a/crazy_functions/Latex输出PDF结果.py
+++ b/crazy_functions/Latex输出PDF结果.py
@@ -82,7 +82,14 @@ def arxiv_download(chatbot, history, txt):
promote_file_to_downloadzone(target_file)
return target_file
return False
-
+ def is_float(s):
+ try:
+ float(s)
+ return True
+ except ValueError:
+ return False
+ if ('.' in txt) and ('/' not in txt) and is_float(txt):
+ txt = 'https://arxiv.org/abs/' + txt
if not txt.startswith('https://arxiv.org'):
return txt, None
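
Reviewer note: the new is_float guard lets a user paste a bare arXiv ID (e.g. 1706.03762) instead of a full URL; anything that contains a dot, contains no slash, and parses as a float is rewritten to https://arxiv.org/abs/<id>. A minimal standalone sketch of the same check (the function name here is illustrative, not from the patch):

    def looks_like_arxiv_id(txt: str) -> bool:
        # a bare arXiv ID such as "1706.03762" parses as a float
        try:
            float(txt)
        except ValueError:
            return False
        return ('.' in txt) and ('/' not in txt)

    assert looks_like_arxiv_id('1706.03762')
    assert not looks_like_arxiv_id('https://arxiv.org/abs/1706.03762')

One caveat: versioned IDs such as 1706.03762v5 do not parse as floats, so they still require the full URL form.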
@@ -198,7 +205,7 @@ def Latex翻译中文并重新编译PDF(txt, llm_kwargs, plugin_kwargs, chatbot,
# <-------------- information about this plugin ------------->
chatbot.append([
"函数插件功能?",
- "对整个Latex项目进行翻译, 生成中文PDF。函数插件贡献者: Binary-Husky。注意事项: 目前仅支持GPT3.5/GPT4,其他模型转化效果未知。目前对机器学习类文献转化效果最好,其他类型文献转化效果未知。仅在Windows系统进行了测试,其他操作系统表现未知。"])
+ "对整个Latex项目进行翻译, 生成中文PDF。函数插件贡献者: Binary-Husky。注意事项: 目前仅支持GPT3.5/GPT4,其他模型转化效果未知。目前对机器学习类文献转化效果最好,其他类型文献转化效果未知。"])
yield from update_ui(chatbot=chatbot, history=history) # refresh the UI
@@ -221,6 +228,8 @@ def Latex翻译中文并重新编译PDF(txt, llm_kwargs, plugin_kwargs, chatbot,
report_execption(chatbot, history, a = f"解析项目: {txt}", b = f"发现已经存在翻译好的PDF文档")
yield from update_ui(chatbot=chatbot, history=history) # refresh the UI
return
+
+
if os.path.exists(txt):
project_folder = txt
else:
@@ -228,6 +237,7 @@ def Latex翻译中文并重新编译PDF(txt, llm_kwargs, plugin_kwargs, chatbot,
report_execption(chatbot, history, a = f"解析项目: {txt}", b = f"找不到本地项目或无权访问: {txt}")
yield from update_ui(chatbot=chatbot, history=history) # refresh the UI
return
+
file_manifest = [f for f in glob.glob(f'{project_folder}/**/*.tex', recursive=True)]
if len(file_manifest) == 0:
report_execption(chatbot, history, a = f"解析项目: {txt}", b = f"找不到任何.tex文件: {txt}")
@@ -261,5 +271,6 @@ def Latex翻译中文并重新编译PDF(txt, llm_kwargs, plugin_kwargs, chatbot,
chatbot.append((f"失败了", '虽然PDF生成失败了, 但请查收结果(压缩包), 内含已经翻译的Tex文档, 也是可读的, 您可以到Github Issue区, 用该压缩包+对话历史存档进行反馈 ...'))
yield from update_ui(chatbot=chatbot, history=history); time.sleep(1) # refresh the UI
+
# <-------------- we are done ------------->
return success
diff --git a/crazy_functions/crazy_functions_test.py b/crazy_functions/crazy_functions_test.py
index d19d653..e743878 100644
--- a/crazy_functions/crazy_functions_test.py
+++ b/crazy_functions/crazy_functions_test.py
@@ -182,13 +182,13 @@ def test_Langchain知识库读取():
def test_Latex():
from crazy_functions.Latex输出PDF结果 import Latex英文纠错加PDF对比, Latex翻译中文并重新编译PDF
- txt = r"https://arxiv.org/abs/1706.03762"
+ # txt = r"https://arxiv.org/abs/1706.03762"
# txt = r"https://arxiv.org/abs/1902.03185"
# txt = r"https://arxiv.org/abs/2305.18290"
# txt = r"https://arxiv.org/abs/2305.17608"
- # txt = r"https://arxiv.org/abs/2211.16068" # ACE
- # txt = r"C:\Users\fuqingxu\arxiv_cache\2211.16068\workfolder" # ACE
-
+ # txt = r"https://arxiv.org/abs/2211.16068" # ACE
+ # txt = r"C:\Users\x\arxiv_cache\2211.16068\workfolder" # ACE
+ txt = r"https://arxiv.org/abs/2002.09253"
for cookies, cb, hist, msg in (Latex翻译中文并重新编译PDF)(txt, llm_kwargs, plugin_kwargs, chatbot, history, system_prompt, web_port):
cli_printer.print(cb) # print(cb)
diff --git a/crazy_functions/latex_utils.py b/crazy_functions/latex_utils.py
index b490b5c..15dfebc 100644
--- a/crazy_functions/latex_utils.py
+++ b/crazy_functions/latex_utils.py
@@ -61,8 +61,8 @@ class LinkedListNode():
self.string = string
self.preserve = preserve
self.next = None
- self.begin_line = 0
- self.begin_char = 0
+ # self.begin_line = 0
+ # self.begin_char = 0
def convert_to_linklist(text, mask):
root = LinkedListNode("", preserve=True)
@@ -97,11 +97,22 @@ def 寻找Latex主文件(file_manifest, mode):
else:
continue
raise RuntimeError('无法找到一个主Tex文件(包含documentclass关键字)')
-
+def rm_comments(main_file):
+ new_file_remove_comment_lines = []
+ for l in main_file.splitlines():
+ # drop lines that are comments in their entirety
+ if l.startswith("%") or (l.startswith(" ") and l.lstrip().startswith("%")):
+ pass
+ else:
+ new_file_remove_comment_lines.append(l)
+ main_file = '\n'.join(new_file_remove_comment_lines)
+ main_file = re.sub(r'(?<!\\)%.*', '', main_file)  # strip half-line comments; the lookbehind leaves escaped \% intact
+ return main_file

@@ ... @@ class LatexPaperSplit():
def __init__(self) -> None:
- """
- root is the head node of the linked list
- """
- self.root = None
+ self.nodes = None
self.msg = "{\\scriptsize\\textbf{警告:该PDF由GPT-Academic开源项目调用大语言模型+Latex翻译插件一键生成," + \
"版权归原文作者所有。翻译内容可靠性无任何保障,请仔细鉴别并以原文为准。" + \
"项目Github地址 \\url{https://github.com/binary-husky/gpt_academic/}。"
@@ -212,16 +212,13 @@ class LatexPaperSplit():
Merge the result after the GPT process completed
"""
result_string = ""
- node = self.root
p = 0
- while True:
+ for node in self.nodes:
if node.preserve:
result_string += node.string
else:
result_string += fix_content(arr[p], node.string)
p += 1
- node = node.next
- if node is None: break
if mode == 'translate_zh':
pattern = re.compile(r'\\begin\{abstract\}.*\n')
match = pattern.search(result_string)
@@ -229,7 +226,27 @@ class LatexPaperSplit():
result_string = result_string[:position] + self.msg + msg + self.msg_declare + result_string[position:]
return result_string
- def split(self, txt, project_folder):
+ def split(self, txt, project_folder):
+ """
+ break the latex file down into a linked list,
+ in which each node carries a preserve flag indicating whether it
+ should be processed by GPT.
+ P.S. use multiprocessing to avoid timeout errors
+ """
+ import multiprocessing
+ manager = multiprocessing.Manager()
+ return_dict = manager.dict()
+ p = multiprocessing.Process(
+ target=lambda lps, txt, project_folder, return_dict:
+ lps.split_subprocess(txt, project_folder, return_dict),
+ args=(self, txt, project_folder, return_dict))
+ p.start()
+ p.join()
+ self.nodes = return_dict['nodes']
+ self.sp = return_dict['segment_parts_for_gpt']
+ return self.sp
+
+ def split_subprocess(self, txt, project_folder, return_dict):
"""
break the latex file down into a linked list,
in which each node carries a preserve flag indicating whether it should
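
Reviewer note: per the new docstring, the split work is pushed into a child process to avoid timeout errors; the parent blocks on join() and reads the result back out of a multiprocessing.Manager dict. Because the nodes travel back through that dict they must be picklable, which is presumably why split_subprocess severs every next pointer before returning (for n in nodes: n.next = None) and why the class now keeps a flat self.nodes list instead of the old self.root linked list. A minimal sketch of the pattern, with hypothetical names:

    import multiprocessing

    def heavy_parse(text, return_dict):
        # stand-in for split_subprocess: do the slow, hang-prone work here
        return_dict['parts'] = text.split('\n\n')

    if __name__ == '__main__':
        manager = multiprocessing.Manager()
        return_dict = manager.dict()
        p = multiprocessing.Process(target=heavy_parse, args=('a\n\nb', return_dict))
        p.start()
        p.join()
        print(return_dict['parts'])   # ['a', 'b']

One caution: the patch passes a lambda as the Process target, which works under the default fork start method on Linux but is not picklable under spawn (Windows, macOS).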
@@ -318,12 +335,20 @@ class LatexPaperSplit():
node = node.next
if node is None: break
+ # mask blank lines and fragments that are too short
node = root
while True:
if len(node.string.strip('\n').strip(''))==0: node.preserve = True
if len(node.string.strip('\n').strip(''))<42: node.preserve = True
node = node.next
if node is None: break
+ node = root
+ while True:
+ if node.next and node.preserve and node.next.preserve:
+ node.string += node.next.string
+ node.next = node.next.next
+ node = node.next
+ if node is None: break
# detach leading and trailing line breaks
node = root
@@ -345,8 +370,10 @@ class LatexPaperSplit():
with open(pj(project_folder, 'debug_log.html'), 'w', encoding='utf8') as f:
segment_parts_for_gpt = []
+ nodes = []
node = root
while True:
+ nodes.append(node)
+ show_html = node.string.replace('\n','<br/>')
if not node.preserve:
segment_parts_for_gpt.append(node.string)
@@ -355,9 +382,11 @@ class LatexPaperSplit():
f.write(f'<p>{show_html}</p>')
node = node.next
if node is None: break
- self.root = root
- self.sp = segment_parts_for_gpt
- return self.sp
+
+ for n in nodes: n.next = None # break the links so the nodes can be pickled
+ return_dict['nodes'] = nodes
+ return_dict['segment_parts_for_gpt'] = segment_parts_for_gpt
+ return return_dict

class LatexPaperFileGroup():
"""
@@ -439,7 +468,7 @@ def Latex精细分解与转化(file_manifest, project_folder, llm_kwargs, plugin
# <-------- fine-grained split of the latex file ---------->
lps = LatexPaperSplit()
- res = lps.split(merged_content, project_folder)
+ res = lps.split(merged_content, project_folder) # the time-consuming call
# <-------- split latex fragments that are too long ---------->
pfg = LatexPaperFileGroup()
@@ -515,7 +544,8 @@ def remove_buggy_lines(file_path, log_path, tex_name, tex_name_pure, n_fix, work
f.writelines(file_lines)
return True, f"{tex_name_pure}_fix_{n_fix}", buggy_lines
except:
- return False, 0, [0]
+ print("Fatal error occurred, but we cannot identify error, please download zip, read latex log, and compile manually.")
+ return False, -1, [-1]

def compile_latex_with_timeout(command, timeout=60):
diff --git a/docs/Dockerfile+NoLocal+Latex b/docs/Dockerfile+NoLocal+Latex
index 428dbc0..0f9ac8a 100644
--- a/docs/Dockerfile+NoLocal+Latex
+++ b/docs/Dockerfile+NoLocal+Latex
@@ -8,26 +8,17 @@ FROM fuqingxu/python311_texlive_ctex:latest
# set the working directory
WORKDIR /gpt

+ARG useProxyNetwork=''
+
+RUN $useProxyNetwork pip3 install gradio openai numpy arxiv rich -i https://pypi.douban.com/simple/
+RUN $useProxyNetwork pip3 install colorama Markdown pygments pymupdf -i https://pypi.douban.com/simple/
+
# copy project files
COPY . .

-ARG useProxyNetwork=''
-
-
-# # # comment out below if you do not need proxy network | if you do not need a proxy, delete from this line down
-# RUN apt-get update
-# RUN apt-get install -y curl proxychains
-# RUN $useProxyNetwork curl cip.cc
-# RUN sed -i '$ d' /etc/proxychains.conf
-# RUN sed -i '$ d' /etc/proxychains.conf
-# RUN echo "socks5 127.0.0.1 10880" >> /etc/proxychains.conf
-# ARG useProxyNetwork=proxychains
-# # # comment out above if you do not need proxy network | if you do not need a proxy, delete from this line up
-
-
# install dependencies
-RUN $useProxyNetwork pip3 install -r requirements.txt -i https://mirrors.aliyun.com/pypi/simple
+RUN $useProxyNetwork pip3 install -r requirements.txt -i https://pypi.douban.com/simple/

# optional step: warm up modules
RUN python3 -c 'from check_proxy import warm_up_modules; warm_up_modules()'
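
Reviewer note on the Dockerfile: installing the pinned packages before COPY . . means ordinary source edits no longer invalidate the cached dependency layers, so rebuilds only rerun the final requirements.txt install and the warm-up step; the commented-out proxychains scaffolding is dropped, and both installs now point at the douban PyPI mirror instead of the aliyun one.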