From fc222bf287d90a4a99cee956ce0c6ef222b8ccca Mon Sep 17 00:00:00 2001
From: qingxu fu <505030475@qq.com>
Date: Thu, 13 Apr 2023 12:46:31 +0800
Subject: [PATCH] =?UTF-8?q?Lua=E5=B7=A5=E7=A8=8B=E8=A7=A3=E6=9E=90+?=
 =?UTF-8?q?=E4=BF=AE=E6=AD=A3=E6=B3=A8=E9=87=8A?=
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

---
 crazy_functional.py                | 16 +++++++---------
 crazy_functions/crazy_utils.py     | 26 ++++++++++++++++++--------
 crazy_functions/理解PDF文档内容.py | 13 +++++++------
 3 files changed, 32 insertions(+), 23 deletions(-)

diff --git a/crazy_functional.py b/crazy_functional.py
index 8cfe558..bb04f25 100644
--- a/crazy_functional.py
+++ b/crazy_functional.py
@@ -16,7 +16,7 @@ def get_crazy_functions():
     from crazy_functions.高级功能函数模板 import 高阶功能模板函数
     from crazy_functions.代码重写为全英文_多线程 import 全项目切换英文
     from crazy_functions.Latex全文润色 import Latex英文润色
-
+    from crazy_functions.解析项目源代码 import 解析一个Lua项目
     function_plugins = {
 
         "解析整个Python项目": {
@@ -47,6 +47,11 @@ def get_crazy_functions():
             "AsButton": False,  # 加入下拉菜单中
             "Function": HotReload(解析一个Rect项目)
         },
+        "解析整个Lua项目": {
+            "Color": "stop",    # 按钮颜色
+            "AsButton": False,  # 加入下拉菜单中
+            "Function": HotReload(解析一个Lua项目)
+        },
         "读Tex论文写摘要": {
             "Color": "stop",    # 按钮颜色
             "Function": HotReload(读文章写摘要)
@@ -156,14 +161,7 @@ def get_crazy_functions():
     except Exception as err:
         print(f'[下载arxiv论文并翻译摘要] 插件导入失败 {str(err)}')
         
-    from crazy_functions.解析项目源代码 import 解析一个Lua项目
-    function_plugins.update({
-        "解析整个Lua项目": {
-            "Color": "stop",    # 按钮颜色
-            "AsButton": False,  # 加入下拉菜单中
-            "Function": HotReload(解析一个Lua项目)
-        },
-    })        
+
 
     ###################### 第n组插件 ###########################
     return function_plugins
diff --git a/crazy_functions/crazy_utils.py b/crazy_functions/crazy_utils.py
index 253e167..91bd4af 100644
--- a/crazy_functions/crazy_utils.py
+++ b/crazy_functions/crazy_utils.py
@@ -387,12 +387,15 @@ def read_and_clean_pdf_text(fp):
     import re
     import numpy as np
     from colorful import print亮黄, print亮绿
-    fc = 0
-    fs = 1
-    fb = 2
-    REMOVE_FOOT_NOTE = True
-    REMOVE_FOOT_FFSIZE_PERCENT = 0.95 
+    fc = 0  # Index 0 文本
+    fs = 1  # Index 1 字体
+    fb = 2  # Index 2 框框
+    REMOVE_FOOT_NOTE = True # 是否丢弃掉 不是正文的内容 （比正文字体小，如参考文献、脚注、图注等）
+    REMOVE_FOOT_FFSIZE_PERCENT = 0.95 # 字体大小小于正文主字体的该比例时，判定为不是正文（有些文章的正文部分字体大小不是100%统一的，有肉眼不可见的小变化）
     def primary_ffsize(l):
+        """
+        提取文本块主字体
+        """
         fsize_statiscs = {}
         for wtf in l['spans']:
             if wtf['size'] not in fsize_statiscs: fsize_statiscs[wtf['size']] = 0
@@ -400,14 +403,18 @@ def read_and_clean_pdf_text(fp):
         return max(fsize_statiscs, key=fsize_statiscs.get)
         
     def ffsize_same(a,b):
+        """
+        判断两个字体大小是否近似相等
+        """
         return abs((a-b)/max(a,b)) < 0.02
-    # file_content = ""
+
     with fitz.open(fp) as doc:
         meta_txt = []
         meta_font = []
 
         meta_line = []
         meta_span = []
+        ############################## <第 1 步，搜集初始信息> ##################################
         for index, page in enumerate(doc):
             # file_content += page.get_text()
             text_areas = page.get_text("dict")  # 获取页面上的文本信息
@@ -429,7 +436,8 @@ def read_and_clean_pdf_text(fp):
             if index == 0:
                 page_one_meta = [" ".join(["".join([wtf['text'] for wtf in l['spans']]) for l in t['lines']]).replace(
                     '- ', '') for t in text_areas['blocks'] if 'lines' in t]
-        # 获取正文主字体
+                
+        ############################## <第 2 步，获取正文主字体> ##################################
         fsize_statiscs = {}
         for span in meta_span:
             if span[1] not in fsize_statiscs: fsize_statiscs[span[1]] = 0
@@ -438,7 +446,7 @@ def read_and_clean_pdf_text(fp):
         if REMOVE_FOOT_NOTE:
             give_up_fize_threshold = main_fsize * REMOVE_FOOT_FFSIZE_PERCENT
 
-        # 切分和重新整合
+        ############################## <第 3 步，切分和重新整合> ##################################
         mega_sec = []
         sec = []
         for index, line in enumerate(meta_line):
@@ -480,6 +488,7 @@ def read_and_clean_pdf_text(fp):
             finals.append(final)
         meta_txt = finals
 
+        ############################## <第 4 步，乱七八糟的后处理> ##################################
         def 把字符太少的块清除为回车(meta_txt):
             for index, block_txt in enumerate(meta_txt):
                 if len(block_txt) < 100:
@@ -523,6 +532,7 @@ def read_and_clean_pdf_text(fp):
         # 换行 -> 双换行
         meta_txt = meta_txt.replace('\n', '\n\n')
 
+        ############################## <第 5 步，展示分割效果> ##################################
         for f in finals:
             print亮黄(f)
             print亮绿('***************************')
diff --git a/crazy_functions/理解PDF文档内容.py b/crazy_functions/理解PDF文档内容.py
index f4ba1a6..1440e7c 100644
--- a/crazy_functions/理解PDF文档内容.py
+++ b/crazy_functions/理解PDF文档内容.py
@@ -8,11 +8,12 @@ fast_debug = False
 def 解析PDF(file_name, llm_kwargs, plugin_kwargs, chatbot, history, system_prompt):
     import tiktoken
     print('begin analysis on:', file_name)
-    file_content, page_one = read_and_clean_pdf_text(file_name)
 
-    ############################## <第零步，从摘要中提取高价值信息，放到history中> ##################################
+    ############################## <第 0 步，切割PDF> ##################################
     # 递归地切割PDF文件，每一块（尽量是完整的一个section，比如introduction，experiment等，必要时再进行切割）
     # 的长度必须小于 2500 个 Token
+    file_content, page_one = read_and_clean_pdf_text(file_name) # （尝试）按照章节切割PDF
+
     TOKEN_LIMIT_PER_FRAGMENT = 2500
 
     from .crazy_utils import breakdown_txt_to_satisfy_token_limit_for_pdf
@@ -26,11 +27,11 @@ def 解析PDF(file_name, llm_kwargs, plugin_kwargs, chatbot, history, system_pro
     # 为了更好的效果，我们剥离Introduction之后的部分（如果有）
     paper_meta = page_one_fragments[0].split('introduction')[0].split('Introduction')[0].split('INTRODUCTION')[0]
     
-    ############################## <第一步，从摘要中提取高价值信息，放到history中> ##################################
+    ############################## <第 1 步，从摘要中提取高价值信息，放到history中> ##################################
     final_results = []
     final_results.append(paper_meta)
 
-    ############################## <第二步，迭代地历遍整个文章，提取精炼信息> ##################################
+    ############################## <第 2 步，迭代地遍历整个文章，提取精炼信息> ##################################
     i_say_show_user = f'首先你在英文语境下通读整篇论文。'; gpt_say = "[Local Message] 收到。"           # 用户提示
     chatbot.append([i_say_show_user, gpt_say]); yield from update_ui(chatbot=chatbot, history=[])    # 更新UI
 
@@ -51,14 +52,14 @@ def 解析PDF(file_name, llm_kwargs, plugin_kwargs, chatbot, history, system_pro
         iteration_results.append(gpt_say)
         last_iteration_result = gpt_say
 
-    ############################## <第三步，整理history> ##################################
+    ############################## <第 3 步，整理history> ##################################
     final_results.extend(iteration_results)
     final_results.append(f'接下来，你是一名专业的学术教授，利用以上信息，使用中文回答我的问题。')
     # 接下来两句话只显示在界面上，不起实际作用
     i_say_show_user = f'接下来，你是一名专业的学术教授，利用以上信息，使用中文回答我的问题。'; gpt_say = "[Local Message] 收到。"
     chatbot.append([i_say_show_user, gpt_say])
 
-    ############################## <第四步，设置一个token上限，防止回答时Token溢出> ##################################
+    ############################## <第 4 步，设置一个token上限，防止回答时Token溢出> ##################################
     from .crazy_utils import input_clipping
     _, final_results = input_clipping("", final_results, max_token_limit=3200)
     yield from update_ui(chatbot=chatbot, history=final_results) # 注意这里的历史记录被替代了