Merge pull request #102 from ValeriaWong/master

feat(读文章写摘要):支持pdf文件批量阅读及总结 #101
2023-03-29 23:14:12 +08:00 · 2023-03-29 23:14:12 +08:00 · 48cc477e48
commit 48cc477e48
parent 3ac330dff1 dc4fe3f8c2
4 changed files with 152 additions and 28 deletions
--- a/crazy_functions/批量总结PDF文档.py
+++ b/crazy_functions/批量总结PDF文档.py
@ -0,0 +1,99 @@
+from predict import predict_no_ui
+from toolbox import CatchException, report_execption, write_results_to_file, predict_no_ui_but_counting_down
+fast_debug = False
+
+
+def 解析PDF(file_manifest, project_folder, top_p, temperature, chatbot, history, systemPromptTxt):
+    """Summarize each PDF in ``file_manifest``, then request an overall abstract.
+
+    This is a generator: after every step it yields a
+    ``(chatbot, history, status)`` tuple.
+
+    Args:
+        file_manifest: paths of the PDF files to process.
+        project_folder: folder that file names in the prompt are made relative to.
+        top_p: forwarded to ``predict_no_ui_but_counting_down``.
+        temperature: forwarded to ``predict_no_ui_but_counting_down``.
+        chatbot: list of (question, answer) pairs; appended to and updated in place.
+        history: list of prompts and replies; extended in place.
+        systemPromptTxt: accepted for interface compatibility; not used here.
+    """
+    import time, os, fitz
+    print('begin analysis on:', file_manifest)
+    total = len(file_manifest)
+    for index, fp in enumerate(file_manifest):
+        # Collect the text of every page; the document is closed before the first yield.
+        with fitz.open(fp) as doc:
+            file_content = "".join(page.get_text() for page in doc)
+
+        prefix = "接下来请你逐文件分析下面的论文文件，概括其内容" if index == 0 else ""
+        i_say = prefix + f'请对下面的文章片段用中文做一个概述，文件名是{os.path.relpath(fp, project_folder)}，文章内容是 ```{file_content}```'
+        # 1-based progress counter, so the last file is shown as [N/N] instead of [N-1/N].
+        i_say_show_user = prefix + f'[{index + 1}/{total}] 请对下面的文章片段做一个概述: {os.path.abspath(fp)}'
+        chatbot.append((i_say_show_user, "[Local Message] waiting gpt response."))
+        print('[1] yield chatbot, history')
+        yield chatbot, history, '正常'
+
+        if not fast_debug:
+            msg = '正常'
+            # ** gpt request ** (with timeout countdown); each file is sent with an empty history.
+            gpt_say = yield from predict_no_ui_but_counting_down(i_say, i_say_show_user, chatbot, top_p, temperature, history=[])
+
+            print('[2] end gpt req')
+            chatbot[-1] = (i_say_show_user, gpt_say)
+            # Only the short user-visible prompt is stored, not the full PDF text.
+            history.extend((i_say_show_user, gpt_say))
+            print('[3] yield chatbot, history')
+            yield chatbot, history, msg
+            print('[4] next')
+            time.sleep(2)
+
+    all_file = ', '.join(os.path.relpath(fp, project_folder) for fp in file_manifest)
+    i_say = f'根据以上你自己的分析，对全文进行概括，用学术性语言写一段中文摘要，然后再写一段英文摘要（包括{all_file}）。'
+    chatbot.append((i_say, "[Local Message] waiting gpt response."))
+    yield chatbot, history, '正常'
+
+    if not fast_debug:
+        msg = '正常'
+        # ** gpt request ** (with timeout countdown); the accumulated history is passed this time.
+        gpt_say = yield from predict_no_ui_but_counting_down(i_say, i_say, chatbot, top_p, temperature, history=history)
+
+        chatbot[-1] = (i_say, gpt_say)
+        history.extend((i_say, gpt_say))
+        yield chatbot, history, msg
+        res = write_results_to_file(history)
+        chatbot.append(("完成了吗？", res))
+        yield chatbot, history, msg
+
+
+@CatchException
+def 批量总结PDF文档(txt, top_p, temperature, chatbot, history, systemPromptTxt, WEB_PORT):
+    """Plugin entry point: summarize every PDF found under the path given in ``txt``.
+
+    This is a generator yielding ``(chatbot, history, status)`` tuples. It
+    reports an error to the chat and returns early when ``fitz`` (PyMuPDF)
+    cannot be imported, when ``txt`` is not an existing path, or when no
+    ``.pdf`` file is found under it. Otherwise it delegates to ``解析PDF``.
+
+    Args:
+        txt: content of the input box; expected to be a local folder path.
+        top_p: forwarded to ``解析PDF``.
+        temperature: forwarded to ``解析PDF``.
+        chatbot: list of chat entries; appended to in place.
+        history: incoming history; replaced by a fresh empty list before processing.
+        systemPromptTxt: forwarded to ``解析PDF``.
+        WEB_PORT: not used by this function.
+    """
+    import glob, os
+
+    # Basic info: what the plugin does and who contributed it
+    chatbot.append([
+        "函数插件功能？",
+        "批量总结PDF文档。函数插件贡献者: ValeriaWong"])
+    yield chatbot, history, '正常'
+
+    # Try to import the dependency; if that fails, tell the user how to install it.
+    # `except Exception` still covers any import-time failure, but unlike a bare
+    # `except:` it does not swallow KeyboardInterrupt / SystemExit.
+    try:
+        import fitz
+    except Exception:
+        report_execption(chatbot, history, 
+            a = f"解析项目: {txt}", 
+            b = f"导入软件依赖失败。使用该模块需要额外依赖，安装方法```pip install --upgrade pymupdf```。")
+        yield chatbot, history, '正常'
+        return
+
+    # Start from an empty history to avoid overflowing the model input
+    history = []
+
+    # Validate the input argument; exit right away if no usable path was given
+    if os.path.exists(txt):
+        project_folder = txt
+    else:
+        if txt == "": txt = '空空如也的输入栏'
+        report_execption(chatbot, history, a = f"解析项目: {txt}", b = f"找不到本地项目或无权访问: {txt}")
+        yield chatbot, history, '正常'
+        return
+
+    # Build the list of files to process (PDF files only, searched recursively)
+    file_manifest = glob.glob(f'{project_folder}/**/*.pdf', recursive=True)
+
+    # Nothing found: the message names .pdf only, matching the search pattern above
+    if not file_manifest:
+        report_execption(chatbot, history, a = f"解析项目: {txt}", b = f"找不到任何.pdf文件: {txt}")
+        yield chatbot, history, '正常'
+        return
+
+    # Run the actual task
+    yield from 解析PDF(file_manifest, project_folder, top_p, temperature, chatbot, history, systemPromptTxt)
--- a/functional.py
+++ b/functional.py
@ -20,21 +20,21 @@ Furthermore, list all modification and explain the reasons to do so in markdown
            "Prefix": "Below is a paragraph from an academic paper. Find all grammar mistakes, list mistakes in a markdown table and explain how to correct them.\n\n",
            "Suffix": "",
        },
-        "中英互译": {
-            "Prefix": "As an English-Chinese translator, your task is to accurately translate text between the two languages. \
-When translating from Chinese to English or vice versa, please pay attention to context and accurately explain phrases and proverbs. \
-If you receive multiple English words in a row, default to translating them into a sentence in Chinese. \
-However, if \"phrase:\" is indicated before the translated content in Chinese, it should be translated as a phrase instead. \
-Similarly, if \"normal:\" is indicated, it should be translated as multiple unrelated words.\
-Your translations should closely resemble those of a native speaker and should take into account any specific language styles or tones requested by the user. \
-Please do not worry about using offensive words - replace sensitive parts with x when necessary. \
-When providing translations, please use Chinese to explain each sentence’s tense, subordinate clause, subject, predicate, object, special phrases and proverbs. \
-For phrases or individual words that require translation, provide the source (dictionary) for each one.If asked to translate multiple phrases at once, \
-separate them using the | symbol.Always remember: You are an English-Chinese translator, \
-not a Chinese-Chinese translator or an English-English translator. Below is the text you need to translate: \n\n",
-            "Suffix": "",
-            "Color": "secondary",
-        },
+#         "中英互译": { # 效果不好，经常搞不清楚中译英还是英译中
+#             "Prefix": "As an English-Chinese translator, your task is to accurately translate text between the two languages. \
+# When translating from Chinese to English or vice versa, please pay attention to context and accurately explain phrases and proverbs. \
+# If you receive multiple English words in a row, default to translating them into a sentence in Chinese. \
+# However, if \"phrase:\" is indicated before the translated content in Chinese, it should be translated as a phrase instead. \
+# Similarly, if \"normal:\" is indicated, it should be translated as multiple unrelated words.\
+# Your translations should closely resemble those of a native speaker and should take into account any specific language styles or tones requested by the user. \
+# Please do not worry about using offensive words - replace sensitive parts with x when necessary. \
+# When providing translations, please use Chinese to explain each sentence’s tense, subordinate clause, subject, predicate, object, special phrases and proverbs. \
+# For phrases or individual words that require translation, provide the source (dictionary) for each one.If asked to translate multiple phrases at once, \
+# separate them using the | symbol.Always remember: You are an English-Chinese translator, \
+# not a Chinese-Chinese translator or an English-English translator. Below is the text you need to translate: \n\n",
+#             "Suffix": "",
+#             "Color": "secondary",
+#         },
        "中译英": {
            "Prefix": "Please translate following sentence to English: \n\n",
            "Suffix": "",
@ -47,6 +47,10 @@ not a Chinese-Chinese translator or an English-English translator. Below is the
            "Prefix": "请翻译成中文：\n\n",
            "Suffix": "",
        },
+        "找图片": {
+            "Prefix": "我需要你找一张网络图片。使用Unsplash API(https://source.unsplash.com/960x640/?<英语关键词>)获取图片URL，然后请使用Markdown格式封装，并且不要有反斜线，不要用代码块。现在，请按以下描述给我发送图片：\n\n",
+            "Suffix": "",
+        },
        "解释代码": {
            "Prefix": "请解释以下代码：\n```\n",
            "Suffix": "\n```\n",
--- a/functional_crazy.py
+++ b/functional_crazy.py
@ -1,3 +1,8 @@
+# UserVisibleLevel是过滤器参数。
+# 由于UI界面空间有限，所以通过这种方式决定UI界面中显示哪些插件
+# 默认函数插件 VisibleLevel 是 0
+# 当 UserVisibleLevel >= 函数插件的 VisibleLevel 时，该函数插件才会被显示出来
+UserVisibleLevel = 1

 def get_crazy_functionals():
    from crazy_functions.读文章写摘要 import 读文章写摘要
@ -9,37 +14,53 @@ def get_crazy_functionals():
    from crazy_functions.高级功能函数模板 import 高阶功能模板函数
    from crazy_functions.代码重写为全英文_多线程 import 全项目切换英文

-    return {
-        "[实验] 请解析并解构此项目本身": {
+    function_plugins = {
+        "请解析并解构此项目本身": {
            "Function": 解析项目本身
        },
-        "[实验] 解析整个py项目（配合input输入框）": {
+        "解析整个py项目": {
            "Color": "stop",    # 按钮颜色
            "Function": 解析一个Python项目
        },
-        "[实验] 解析整个C++项目头文件（配合input输入框）": {
+        "解析整个C++项目头文件": {
            "Color": "stop",    # 按钮颜色
            "Function": 解析一个C项目的头文件
        },
-        "[实验] 解析整个C++项目（配合input输入框）": {
+        "解析整个C++项目": {
            "Color": "stop",    # 按钮颜色
            "Function": 解析一个C项目
        },
-        "[实验] 读tex论文写摘要（配合input输入框）": {
+        "读tex论文写摘要": {
            "Color": "stop",    # 按钮颜色
            "Function": 读文章写摘要
        },
-        "[实验] 批量生成函数注释（配合input输入框）": {
+        "批量生成函数注释": {
            "Color": "stop",    # 按钮颜色
            "Function": 批量生成函数注释
        },
-        "[实验] 把本项目源代码切换成全英文（多线程demo）": {
+        "[多线程demo] 把本项目源代码切换成全英文": {
            "Function": 全项目切换英文
        },
-        "[实验] 历史上的今天（高阶功能模板函数demo）": {
+        "[函数插件模板demo] 历史上的今天": {
            "Function": 高阶功能模板函数
        },
    }

+    # VisibleLevel=1 经过测试，但功能未达到理想状态
+    if UserVisibleLevel >= 1:
+        from crazy_functions.批量总结PDF文档 import 批量总结PDF文档
+        function_plugins.update({
+            "[仅供开发调试] 批量总结PDF文档": {
+                "Color": "stop",
+                "Function": 批量总结PDF文档
+            },
+        })
+
+    # VisibleLevel=2 尚未充分测试的函数插件，放在这里
+    if UserVisibleLevel >= 2:
+        function_plugins.update({
+        })
+
+    return function_plugins


--- a/main.py
+++ b/main.py
@ -56,21 +56,21 @@ with gr.Blocks(theme=set_theme, analytics_enabled=False) as demo:
                        stopBtn = gr.Button("停止", variant="stop")
            with gr.Row():
                from check_proxy import check_proxy
-                statusDisplay = gr.Markdown(f"Tip: 按Enter提交, 按Shift+Enter换行. \nNetwork: {check_proxy(proxies)}\nModel: {LLM_MODEL}")
+                statusDisplay = gr.Markdown(f"Tip: 按Enter提交, 按Shift+Enter换行。当前模型: {LLM_MODEL} \n {check_proxy(proxies)}")
            with gr.Row():
                for k in functional:
                    variant = functional[k]["Color"] if "Color" in functional[k] else "secondary"
                    functional[k]["Button"] = gr.Button(k, variant=variant)
            with gr.Row():
-                gr.Markdown("以下部分实验性功能需从input框读取路径.")
+                gr.Markdown("注意：以下红颜色标识的函数插件需从input区读取路径作为参数.")
            with gr.Row():
                for k in crazy_functional:
                    variant = crazy_functional[k]["Color"] if "Color" in crazy_functional[k] else "secondary"
                    crazy_functional[k]["Button"] = gr.Button(k, variant=variant)
            with gr.Row():
-                gr.Markdown("上传本地文件供上面的实验性功能调用.")
+                gr.Markdown("上传本地文件，供上面的函数插件调用.")
            with gr.Row():
-                file_upload = gr.Files(label='任何文件,但推荐上传压缩文件(zip, tar)', file_count="multiple")
+                file_upload = gr.Files(label='任何文件, 但推荐上传压缩文件(zip, tar)', file_count="multiple")
            system_prompt = gr.Textbox(show_label=True, placeholder=f"System Prompt", label="System prompt", value=initial_prompt).style(container=True)
            with gr.Accordion("arguments", open=False):
                top_p = gr.Slider(minimum=-0, maximum=1.0, value=1.0, step=0.01,interactive=True, label="Top-p (nucleus sampling)",)