latex toolchain

2023-06-02 21:44:11 +08:00 · 2023-06-02 21:44:11 +08:00 · 601712fd0a
commit 601712fd0a
parent e769f831c7
4 changed files with 400 additions and 11 deletions
--- a/crazy_functions/Latex输出PDF结果.py
+++ b/crazy_functions/Latex输出PDF结果.py
@ -1,11 +1,106 @@
 from toolbox import update_ui, trimmed_format_exc
 from toolbox import CatchException, report_execption, write_results_to_file, zip_folder
-import glob
+import glob, copy, os
 def confirm_answer_is_health(bufo, buf, llm_kwargs, default = True):
    """
    Length-based sanity check of a proofreading reply.

    Args:
        bufo: the original text that was sent for proofreading.
        buf: the text that came back.
        llm_kwargs: not used by the active check (only by the disabled
            LLM-based variant kept below).
        default: not used by the active check (only by the disabled variant).

    Returns:
        True when `buf` is at least `len(bufo) // 3` characters long.
    """
    # Disabled variant: ask the model whether the reply is a real revision.
    # from request_llm.bridge_all import predict_no_ui_long_connection
    # inputs  = f"I asked someone to proofread some text \"{bufo}\", this is what he answered: \"{buf}\"."
    # inputs +=  "Did he answer me with proofreaded text (`true`), or did he just tell me the text has no errors (`false`)?"
    # llm_kwargs_ = copy.deepcopy(llm_kwargs); llm_kwargs_['temperature'] = 0
    # result = predict_no_ui_long_connection( inputs=inputs, llm_kwargs=llm_kwargs_,
    #     history=[], sys_prompt="Answer my question with either `true` or `false`.", observe_window=[])
    # if 'false' in result or 'FALSE' in result or 'False' in result:
    #     return False
    # if 'true' in result or 'TRUE' in result or 'True' in result:
    #     return True
    # return default
    shortest_acceptable = len(bufo) // 3
    reply_length = len(buf)
    return not (reply_length < shortest_acceptable)
 def Latex精细分解与转化(file_manifest, project_folder, llm_kwargs, plugin_kwargs, chatbot, history, system_prompt, language='en', mode='polish'):
    """
    Merge a LaTeX project into one file, send it to the LLM for proofreading
    fragment by fragment, and write the reassembled document to disk.

    This is a generator: it yields whatever the multi-threaded request helper
    and `update_ui` yield, and its return value is the proofread file path.

    Args:
        file_manifest: paths of the project's .tex files; the first one that
            contains the documentclass keyword is used as the main file.
        project_folder: project root. `merge.tex` (merged, comment-free source)
            and `merge_proofread.tex` (result) are written into it.
        llm_kwargs: forwarded to the request helper.
        chatbot: chat object; one closing entry is appended.
        history: replaced by the raw request/response collection before the
            final UI refresh.
        plugin_kwargs, system_prompt, language, mode: accepted but not used here.

    Returns:
        project_folder + '/merge_proofread.tex'

    Raises:
        RuntimeError: no file in `file_manifest` contains the documentclass keyword.
    """
    import time, re
    from .crazy_utils import request_gpt_model_multi_threads_with_very_awesome_ui_and_high_efficiency
    from .latex_utils import LatexPaperFileGroup, merge_tex_files, LatexPaperSplit

    #  <-------- read the LaTeX project and strip every comment ---------->
    def 寻找主文件(file_manifest):
        for texf in file_manifest:
            # errors='replace' matches how the main file is read below, so one
            # badly encoded file in the manifest cannot abort the search.
            with open(texf, 'r', encoding='utf8', errors='replace') as f:
                file_content = f.read()
            if r'\documentclass' in file_content:
                return texf
        raise RuntimeError('无法找到一个主Tex文件（包含documentclass关键字）')

    maintex = 寻找主文件(file_manifest)
    with open(maintex, 'r', encoding='utf-8', errors='replace') as f:
        content = f.read()
    merged_content = merge_tex_files(project_folder, content)
    # remove everything from an unescaped % to the end of the line
    merged_content = re.sub(r'(?<!\\)%.*', '', merged_content)

    with open(project_folder + '/merge.tex', 'w', encoding='utf-8', errors='replace') as f:
        f.write(merged_content)

    lps = LatexPaperSplit()
    res = lps.split(merged_content)

    #  <-------- break over-long LaTeX fragments into smaller segments ---------->
    pfg = LatexPaperFileGroup()
    for index, r in enumerate(res):
        # the fragment's position doubles as its "path"/tag
        pfg.file_paths.append(index)
        pfg.file_contents.append(r)

    pfg.run_file_split(max_token_limit=1024)
    n_split = len(pfg.sp_file_contents)

    inputs_array = [r"Below is a section from an academic paper, proofread this section." + 
                    r"Do not modify any latex command such as \section, \cite, \begin, \item and equations. " + 
                    r"Answer me only with the revised text:" + 
                f"\n\n{frag}" for frag in pfg.sp_file_contents]
    sys_prompt_array = ["You are a professional academic paper writer." for _ in range(n_split)]
    inputs_show_user_array = [f"Proofread {f}" for f in pfg.sp_file_tag]

    gpt_response_collection = yield from request_gpt_model_multi_threads_with_very_awesome_ui_and_high_efficiency(
        inputs_array=inputs_array,
        inputs_show_user_array=inputs_show_user_array,
        llm_kwargs=llm_kwargs,
        chatbot=chatbot,
        history_array=[[""] for _ in range(n_split)],
        sys_prompt_array=sys_prompt_array,
        # max_workers=5,  # cap on parallel tasks: at most 5 run at once, the rest queue
        scroller_max_len = 80
    )

    #  <-------- reassemble the text segments into complete tex fragments ---------->
    # The replies are taken from the odd positions of the collection, one per segment.
    pfg.sp_file_result = [gpt_say for gpt_say, _ in zip(gpt_response_collection[1::2], pfg.sp_file_contents)]
    pfg.merge_result()
    # Use the per-fragment results (segments of a split fragment already joined),
    # not the per-segment list: the splitter consumes exactly one entry per
    # non-preserved node, so passing segments misaligns everything after the
    # first fragment that had to be split.
    final_tex = lps.merge_result(pfg.file_result)
    with open(project_folder + '/merge_proofread.tex', 'w', encoding='utf-8', errors='replace') as f:
        f.write(final_tex)

    #  <-------- collect the results and finish ---------->
    create_report_file_name = time.strftime("%Y-%m-%d-%H-%M-%S", time.localtime()) + "-chatgpt.polish.md"
    res = write_results_to_file(gpt_response_collection, file_name=create_report_file_name)
    history = gpt_response_collection
    chatbot.append(("完成了吗？", res))
    yield from update_ui(chatbot=chatbot, history=history) # refresh the UI
    return project_folder + '/merge_proofread.tex'
 def 编译Latex(main_tex, work_folder):
    import os
    current_dir = os.getcwd()
-    os.chdir(work_folder);
+    os.chdir(work_folder)
    main_file = os.path.basename(main_tex)
    assert main_file.endswith('.tex')
    main_file = main_file[:-4]
@ -39,3 +134,102 @@ def Latex预处理(tar_file):
@CatchException
 def Latex英文纠错加PDF对比(txt, llm_kwargs, plugin_kwargs, chatbot, history, system_prompt, web_port):
    """
    Plugin entry point: proofread a whole LaTeX project and compile a PDF that
    highlights the corrections.

    Generator that yields UI refreshes. It stops early (after reporting to the
    chat) when LaTeX cannot be run, when `txt` is not an existing path, or when
    no .tex file is found below it.

    Args:
        txt: path of the local LaTeX project folder.
        llm_kwargs, plugin_kwargs, system_prompt: forwarded to the proofreading step.
        chatbot: chat object that receives status messages.
        history: incoming history; it is cleared before processing.
        web_port: not used here.

    Returns:
        The path returned by the diff compilation step (via generator return).
    """
    # basic information: what the plugin does, and its contributor
    chatbot.append([
        "函数插件功能？",
        "对整个Latex项目进行纠错，用latex编译为PDF对修正处做高亮。函数插件贡献者: Binary-Husky"])
    yield from update_ui(chatbot=chatbot, history=history) # refresh the UI

    # Check that LaTeX is usable. os.system does not raise when the command is
    # missing - it only returns a non-zero status - so the status is inspected.
    try:
        import glob, os
        latex_ready = os.system('pdflatex -version') == 0
    except Exception:
        print(trimmed_format_exc())
        latex_ready = False
    if not latex_ready:
        report_execption(chatbot, history, a=f"解析项目: {txt}",
                         b=f"尝试执行Latex指令失败。Latex没有安装，或者不在环境变量PATH中。")
        yield from update_ui(chatbot=chatbot, history=history) # refresh the UI
        return

    history = []    # clear the history so the model input does not overflow
    if os.path.exists(txt):
        project_folder = txt
    else:
        if txt == "": txt = '空空如也的输入栏'
        report_execption(chatbot, history, a = f"解析项目: {txt}", b = f"找不到本地项目或无权访问: {txt}")
        yield from update_ui(chatbot=chatbot, history=history) # refresh the UI
        return

    file_manifest = glob.glob(f'{project_folder}/**/*.tex', recursive=True)
    if not file_manifest:
        report_execption(chatbot, history, a = f"解析项目: {txt}", b = f"找不到任何.tex文件: {txt}")
        yield from update_ui(chatbot=chatbot, history=history) # refresh the UI
        return

    # An existing merge_proofread.tex is reused, so the LLM pass is skipped on re-runs.
    if not os.path.exists(project_folder + '/merge_proofread.tex'):
        yield from Latex精细分解与转化(file_manifest, project_folder, llm_kwargs, plugin_kwargs, chatbot, history, system_prompt, language='en', mode='proofread_latex')

    res_pdf_path = 编译Latex差别(main_file_original='merge', main_file_modified='merge_proofread', 
                             work_folder_original=project_folder, work_folder_modified=project_folder, work_folder=project_folder)
    return res_pdf_path
 def 编译Latex差别(main_file_original, main_file_modified, work_folder_original, work_folder_modified, work_folder):
    """
    Compile the original and the modified document, generate `diff.tex` with
    latexdiff, and compile that as well.

    Every external command is run through `os.system` from inside the relevant
    folder; the working directory is put back after each command. Exit
    statuses are not inspected.

    Args:
        main_file_original: main tex file name of the original, without `.tex`.
        main_file_modified: main tex file name of the modified version, without `.tex`.
        work_folder_original: folder holding the original main file.
        work_folder_modified: folder holding the modified main file.
        work_folder: folder where `diff.tex` is written and compiled.

    Returns:
        f'{work_folder}/diff.pdf' (the file's existence is not checked).
    """
    import os
    current_dir = os.getcwd()

    def run_in(folder, command):
        # enter the folder, run the shell command, go back where we started
        os.chdir(folder)
        os.system(command)
        os.chdir(current_dir)

    # <--------------------->
    # batchmode keeps pdflatex from waiting for input on errors:
    # https://stackoverflow.com/questions/738755/dont-make-me-manually-abort-a-latex-compile-when-theres-an-error
    run_in(work_folder_original, f'pdflatex -interaction=batchmode {main_file_original}.tex')
    run_in(work_folder_modified, f'pdflatex -interaction=batchmode {main_file_modified}.tex')
    run_in(work_folder_original, f'bibtex  {main_file_original}.aux')
    run_in(work_folder_modified, f'bibtex  {main_file_modified}.aux')

    # latexdiff is launched from the current directory; its output is redirected to diff.tex
    diff_command = f'latexdiff --encoding=utf8 --append-safecmd=subfile {work_folder_original}/{main_file_original}.tex  {work_folder_modified}/{main_file_modified}.tex --flatten > {work_folder}/diff.tex'
    print(diff_command)
    os.system(diff_command)

    # pdflatex, bibtex, then pdflatex twice more on the diff document
    diff_build_steps = (
        'pdflatex  -interaction=batchmode diff.tex',
        'bibtex    diff.aux',
        'pdflatex  -interaction=batchmode diff.tex',
        'pdflatex  -interaction=batchmode diff.tex',
    )
    for step in diff_build_steps:
        run_in(work_folder, step)
    # <--------------------->
    os.chdir(current_dir)
    return f'{work_folder}/diff.pdf'
 def Latex预处理(pfg, project_folder):
    """
    Stage two fresh copies of a LaTeX project (original / modified), overwrite
    the modified copy with the processed file contents, and locate the main
    tex file.

    Args:
        pfg: object exposing `file_paths` and `file_result` (parallel sequences);
            each path is taken relative to `project_folder`.
        project_folder: root folder of the source project.

    Returns:
        Tuple `(main_file_original, main_file_modified, work_folder_original,
        work_folder_modified, work_folder)`. The two main-file entries are the
        paths of the same top-level .tex file inside the original and the
        modified copy.

    Raises:
        RuntimeError: no top-level .tex file of the original copy contains the
            documentclass keyword.
    """
    import glob, os, shutil
    work_folder = 'private_upload/latex_workshop_temp'
    work_folder_original = 'private_upload/latex_workshop_temp/original'
    work_folder_modified = 'private_upload/latex_workshop_temp/modified'

    # Best-effort removal of a previous run; a missing folder is not an error.
    shutil.rmtree(work_folder, ignore_errors=True)
    skip_git = shutil.ignore_patterns('.git')
    shutil.copytree(project_folder, work_folder_original, ignore=skip_git)
    shutil.copytree(project_folder, work_folder_modified, ignore=skip_git)

    # Overwrite the files of the modified copy with the processed contents.
    for path, result in zip(pfg.file_paths, pfg.file_result):
        path_old = os.path.relpath(path, start=project_folder)
        path_new = os.path.join(work_folder_modified, path_old)
        with open(path_new, 'w', encoding='utf-8') as f:
            f.write(result)

    # The main file is the first top-level .tex file that declares a document class.
    for main_file_original in glob.glob('private_upload/latex_workshop_temp/original/*.tex'):
        with open(main_file_original, 'r', encoding='utf8') as f:
            file_content = f.read()
        if r'\documentclass' in file_content:
            path_old = os.path.relpath(main_file_original, start=work_folder_original)
            # Same relative location inside the modified tree. (Previously this
            # was the relative path of the modified *folder* and `path_old`
            # was left unused.)
            main_file_modified = os.path.join(work_folder_modified, path_old)
            return main_file_original, main_file_modified, work_folder_original, work_folder_modified, work_folder
    raise RuntimeError('无法找到一个主Tex文件, 本程序寻找主Tex文件的方法是查找文件中的documentclass关键字。')
--- a/crazy_functions/crazy_functions_test.py
+++ b/crazy_functions/crazy_functions_test.py
@ -180,13 +180,19 @@ def test_Langchain知识库读取():
        cli_printer.print(cb)   #  print(cb)
 def test_Latex():
-    from crazy_functions.Latex输出PDF结果 import Latex预处理, 编译Latex
+    from crazy_functions.Latex输出PDF结果 import Latex预处理, 编译Latex, Latex英文纠错加PDF对比
-    txt = "2302.02948.tar"
+    txt = "C:/Users/fuqingxu/Desktop/proofread"
-    print(txt)
+    for cookies, cb, hist, msg in (Latex英文纠错加PDF对比)(txt, llm_kwargs, plugin_kwargs, chatbot, history, system_prompt, web_port):
-    main_tex, work_folder = Latex预处理(txt)
+        cli_printer.print(cb)   #  print(cb)
-    print('main tex:', main_tex)
+
-    res = 编译Latex(main_tex, work_folder)
+
-    # for cookies, cb, hist, msg in silence_stdout(编译Latex)(txt, llm_kwargs, plugin_kwargs, chatbot, history, system_prompt, web_port):
+
    # txt = "2302.02948.tar"
    # print(txt)
    # main_tex, work_folder = Latex预处理(txt)
    # print('main tex:', main_tex)
    # res = 编译Latex(main_tex, work_folder)
    # # for cookies, cb, hist, msg in silence_stdout(编译Latex)(txt, llm_kwargs, plugin_kwargs, chatbot, history, system_prompt, web_port):
    #     cli_printer.print(cb)   #  print(cb)
--- a/crazy_functions/latex_utils.py
+++ b/crazy_functions/latex_utils.py
@ -0,0 +1,175 @@
 from toolbox import update_ui, trimmed_format_exc
 from toolbox import CatchException, report_execption, write_results_to_file, zip_folder
 import os
 import re
 def merge_tex_files(project_foler, main_file):
    r"""
    Recursively inline every \input{...} of a LaTeX source string.

    Args:
        project_foler: folder against which the \input paths are resolved
            (also for nested inputs).
        main_file: LaTeX source text (the content, not a path).

    Returns:
        The source text with each \input{...} replaced by the content of the
        referenced file, itself expanded the same way.

    Raises:
        FileNotFoundError: a referenced file exists neither under the given
            name nor with `.tex` appended.
    """
    # Walk the matches back to front, so splicing in a file's content does not
    # shift the spans of the matches that are still to be processed.
    for s in reversed(list(re.finditer(r"\\input\{(.*?)\}", main_file, re.M))):
        f = s.group(1)
        fp = os.path.join(project_foler, f)
        # LaTeX allows omitting the extension: \input{intro} means intro.tex.
        if not os.path.isfile(fp) and os.path.isfile(fp + '.tex'):
            fp += '.tex'
        with open(fp, 'r', encoding='utf-8', errors='replace') as fx:
            c = fx.read()
        c = merge_tex_files(project_foler, c)
        start, end = s.span()
        main_file = main_file[:start] + c + main_file[end:]
    return main_file
 class LinkTable():
    """
    Node of a singly linked list of text fragments.

    Attributes set by the constructor:
        string:   the fragment's text.
        preserve: flag stored as given (defaults to True).
        next:     following node; starts out as None.
    """
    def __init__(self, string, preserve=True) -> None:
        # a new node is not chained to anything yet
        self.next = None
        self.preserve = preserve
        self.string = string
 class LatexPaperSplit():
    """
    Split a LaTeX document into a linked list of fragments, some marked
    `preserve` (copied back verbatim) and the rest handed out for rewriting,
    then stitch rewritten fragments back into a full document.

    Usage: `split(txt)` returns the rewritable fragments in document order;
    `merge_result(arr)` takes their replacements in the same order.
    """
    def __init__(self) -> None:
        # head of the fragment list; set by split()
        self.root = None

    def merge_result(self, arr):
        """
        Rebuild the document from the fragment list created by `split`.

        Args:
            arr: replacement texts, one per non-preserved node, in list order.

        Returns:
            The concatenation of preserved node texts and replacement texts.
            Bare `%` characters in replacements are escaped as `\\%`.
        """
        def remove_special_chars(s):
            # A bare % would turn the rest of its line into a LaTeX comment.
            # Escape every % that is not already preceded by a backslash.
            # (The former `s.replace(...)` discarded its result and did nothing.)
            return re.sub(r'(?<!\\)%', r'\\%', s)

        result_string = ""
        node = self.root
        p = 0   # index of the next replacement to consume
        while True:
            if node.preserve:
                result_string += node.string
            else:
                result_string += remove_special_chars(arr[p])
                p += 1
            node = node.next
            if node is None: break
        return result_string

    def split(self, txt):
        """
        Cut `txt` into fragments and return the ones to be rewritten.

        The matches of a fixed series of patterns (preamble up to maketitle,
        section headings, figure/table environments, item markers, begin/end
        tags) become preserved nodes; fragments shorter than 50 characters
        (after trimming newlines at both ends) are preserved as well.
        Progress and the selected fragments are printed to stdout.

        Args:
            txt: the complete LaTeX source.

        Returns:
            List of the non-preserved fragment strings in document order
            (also stored as `self.sp`; the list head is stored as `self.root`).
        """
        root = LinkTable(txt, False)

        def split_worker(root, pattern, flags=0):
            # Walk the list; inside every non-preserved node repeatedly cut out
            # the first match of `pattern` as a new preserved node.
            lt = root
            cnt = 0
            while True:
                if not lt.preserve:
                    while True:
                        res = re.search(pattern, lt.string, flags)
                        if not res: break
                        before = res.string[:res.span()[0]]
                        this = res.group(0)
                        after = res.string[res.span()[1]:]
                        # current node keeps only the text in front of the match
                        lt.string = before
                        tmp  = lt.next
                        # ======
                        if after.startswith('\n'):
                            # keep the newline that directly follows with the preserved text
                            this = this + '\n'
                            after = after[1:]
                        mid = LinkTable(this, True)
                        lt.next = mid
                        # ======
                        aft = LinkTable(after, False)
                        mid.next = aft
                        aft.next = tmp
                        # ======
                        # keep searching in the remainder
                        lt = aft
                lt = lt.next
                cnt += 1
                print(cnt)
                if lt is None: break

        # root is the head of the linked list
        print('正在分解Latex源文件')
        split_worker(root, r"(.*?)\\maketitle", re.DOTALL)
        split_worker(root, r"\\section\{(.*?)\}")
        split_worker(root, r"\\subsection\{(.*?)\}")
        split_worker(root, r"\\subsubsection\{(.*?)\}")
        split_worker(root, r"\\begin\{figure\}(.*?)\\end\{figure\}", re.DOTALL)
        split_worker(root, r"\\begin\{figure\*\}(.*?)\\end\{figure\*\}", re.DOTALL)
        split_worker(root, r"\\begin\{table\}(.*?)\\end\{table\}", re.DOTALL)
        split_worker(root, r"\\begin\{table\*\}(.*?)\\end\{table\*\}", re.DOTALL)
        split_worker(root, r"\\item ")
        split_worker(root, r"\\begin\{(.*?)\}")
        split_worker(root, r"\\end\{(.*?)\}")

        # Fragments shorter than 50 characters (newlines at both ends ignored)
        # are kept as they are; this also covers empty fragments.
        node = root
        while node is not None:
            if len(node.string.strip('\n')) < 50: node.preserve = True
            node = node.next

        print('======================================')
        res_to_t = []
        node = root
        while node is not None:
            if not node.preserve:
                print(node.string)
                res_to_t.append(node.string)
            print('======================================')
            node = node.next
        print('======================================')
        self.root = root
        self.sp = res_to_t
        return self.sp
 class LatexPaperFileGroup():
    """
    Bookkeeping for a set of texts that are cut into token-limited segments,
    processed segment by segment, and merged back per text.

    Attributes:
        file_paths / file_contents: parallel lists, one entry per text. The
            "path" is used as a tag; only write_result/zip_result need it to
            be a real file path.
        sp_file_contents / sp_file_index / sp_file_tag: parallel lists, one
            entry per segment (text, index of the owning text, display tag).
        sp_file_result: per-segment results, filled in by the caller.
        file_result: per-text results, built by merge_result().
    """
    def __init__(self):
        self.file_paths = []
        self.file_contents = []
        self.sp_file_contents = []
        self.sp_file_index = []
        self.sp_file_tag = []
        # initialised here so merge_result()/write_result() work on a fresh object
        self.sp_file_result = []
        self.file_result = []
        # token counter based on the tokenizer registered for gpt-3.5-turbo
        from request_llm.bridge_all import model_info
        enc = model_info["gpt-3.5-turbo"]['tokenizer']
        def get_token_num(txt): return len(enc.encode(txt, disallowed_special=()))
        self.get_token_num = get_token_num

    def run_file_split(self, max_token_limit=1900):
        """
        Break long texts apart.

        Texts below `max_token_limit` tokens become a single segment tagged
        with their path; longer ones are broken down by the project helper
        and every piece is tagged `<path>.part-<j>.tex`.
        """
        for index, file_content in enumerate(self.file_contents):
            if self.get_token_num(file_content) < max_token_limit:
                self.sp_file_contents.append(file_content)
                self.sp_file_index.append(index)
                self.sp_file_tag.append(self.file_paths[index])
            else:
                from .crazy_utils import breakdown_txt_to_satisfy_token_limit_for_pdf
                segments = breakdown_txt_to_satisfy_token_limit_for_pdf(file_content, self.get_token_num, max_token_limit)
                for j, segment in enumerate(segments):
                    self.sp_file_contents.append(segment)
                    self.sp_file_index.append(index)
                    # f-string instead of `+`: the tag may be a non-string
                    # (e.g. an integer index), for which `+` raises TypeError
                    self.sp_file_tag.append(f"{self.file_paths[index]}.part-{j}.tex")
        print('Segmentation: done')

    def merge_result(self):
        """Concatenate the segment results of each text into `file_result`."""
        self.file_result = ["" for _ in range(len(self.file_paths))]
        for r, k in zip(self.sp_file_result, self.sp_file_index):
            self.file_result[k] += r

    def write_result(self):
        """
        Write every merged result to `<path>.polish.tex`.

        Returns:
            List of the written file names.
        """
        manifest = []
        for path, res in zip(self.file_paths, self.file_result):
            with open(path + '.polish.tex', 'w', encoding='utf8') as f:
                manifest.append(path + '.polish.tex')
                f.write(res)
        return manifest

    def zip_result(self):
        """Zip the folder of the first file into ./gpt_log/ under a timestamped name."""
        import os, time
        folder = os.path.dirname(self.file_paths[0])
        t = time.strftime("%Y-%m-%d-%H-%M-%S", time.localtime())
        zip_folder(folder, './gpt_log/', f'{t}-polished.zip')
--- a/toolbox.py
+++ b/toolbox.py
@ -736,6 +736,8 @@ def clip_history(inputs, history, tokenizer, max_token_limit):
 其他小工具:
    - zip_folder:    把某个路径下所有文件压缩，然后转移到指定的另一个路径中（gpt写的）
    - gen_time_str:  生成时间戳
    - ProxyNetworkActivate: 临时地启动代理网络（如果有）
    - objdump/objload: 快捷的调试函数
 ========================================================================
 """
@ -774,7 +776,6 @@ def gen_time_str():
    import time
    return time.strftime("%Y-%m-%d-%H-%M-%S", time.localtime())
 class ProxyNetworkActivate():
    """
    这段代码定义了一个名为TempProxy的空上下文管理器, 用于给一小段代码上代理
@ -793,3 +794,16 @@ class ProxyNetworkActivate():
        if 'HTTP_PROXY' in os.environ: os.environ.pop('HTTP_PROXY')
        if 'HTTPS_PROXY' in os.environ: os.environ.pop('HTTPS_PROXY')
        return
 def objdump(obj, file='objdump.tmp'):
    """
    Debug helper: pickle `obj` to a file.

    Args:
        obj: any picklable object.
        file: destination path; defaults to 'objdump.tmp' in the current
            working directory. An existing file is overwritten.

    Returns:
        None
    """
    import pickle
    with open(file, 'wb+') as f:
        pickle.dump(obj, f)
    return
 def objload(file='objdump.tmp'):
    """
    Debug helper: load a pickled object from a file.

    Args:
        file: path to read; defaults to 'objdump.tmp' in the current working
            directory.

    Returns:
        The unpickled object, or None when the file does not exist.
    """
    import pickle, os
    if not os.path.exists(file):
        return None
    # NOTE: unpickling can execute arbitrary code - only load dumps you created yourself.
    with open(file, 'rb') as f:
        return pickle.load(f)