Merge branch 'Euclid_Test' of https://github.com/Euclid-Jie/chatgpt_academic into Euclid-Jie-Euclid_Test
commit a71edeea95
@@ -11,6 +11,7 @@ def 解析PDF(file_manifest, project_folder, top_p, temperature, chatbot, history, systemPromptTxt):
         file_content = ""
         for page in doc:
             file_content += page.get_text()
+        file_content = file_content.encode('gbk', 'ignore').decode('gbk')
         print(file_content)
 
         prefix = "接下来请你逐文件分析下面的论文文件,概括其内容" if index==0 else ""
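For reference, a small sketch (not part of the commit) of what the encode/decode round-trip added above does: it silently drops any character that GBK cannot represent, so the extracted PDF text can later be printed or forwarded without encoding errors.

    # Hypothetical example of the gbk round-trip used in 解析PDF above.
    s = "ChatGPT 论文 📄"
    cleaned = s.encode('gbk', 'ignore').decode('gbk')
    print(cleaned)  # the emoji lies outside GBK and is dropped: "ChatGPT 论文 "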
@@ -1,14 +1,19 @@
 from predict import predict_no_ui
-from toolbox import CatchException, report_execption, write_results_to_file, predict_no_ui_but_counting_down
+from toolbox import CatchException, report_execption, write_results_to_file, predict_no_ui_but_counting_down, readPdf
 fast_debug = False
+from bs4 import BeautifulSoup
 
 
 def 解析Paper(file_manifest, project_folder, top_p, temperature, chatbot, history, systemPromptTxt):
     import time, glob, os
     print('begin analysis on:', file_manifest)
     for index, fp in enumerate(file_manifest):
-        with open(fp, 'r', encoding='utf-8') as f:
-            file_content = f.read()
+        if ".tex" in fp:
+            with open(fp, 'r', encoding='utf-8') as f:
+                file_content = f.read()
+        if ".pdf" in fp.lower():
+            file_content = readPdf(fp)
+            file_content = BeautifulSoup(''.join(file_content), features="lxml").body.text.encode('gbk', 'ignore').decode('gbk')
 
         prefix = "接下来请你逐文件分析下面的论文文件,概括其内容" if index==0 else ""
         i_say = prefix + f'请对下面的文章片段用中文做一个概述,文件名是{os.path.relpath(fp, project_folder)},文章内容是 ```{file_content}```'
@@ -17,7 +22,7 @@ def 解析Paper(file_manifest, project_folder, top_p, temperature, chatbot, history, systemPromptTxt):
         print('[1] yield chatbot, history')
         yield chatbot, history, '正常'
 
         if not fast_debug:
             msg = '正常'
             # ** gpt request **
             gpt_say = yield from predict_no_ui_but_counting_down(i_say, i_say_show_user, chatbot, top_p, temperature, history=[])   # 带超时倒计时
@@ -35,7 +40,7 @@ def 解析Paper(file_manifest, project_folder, top_p, temperature, chatbot, history, systemPromptTxt):
         chatbot.append((i_say, "[Local Message] waiting gpt response."))
         yield chatbot, history, '正常'
 
         if not fast_debug:
             msg = '正常'
             # ** gpt request **
             gpt_say = yield from predict_no_ui_but_counting_down(i_say, i_say, chatbot, top_p, temperature, history=history)   # 带超时倒计时
@@ -60,11 +65,12 @@ def 读文章写摘要(txt, top_p, temperature, chatbot, history, systemPromptTxt):
         report_execption(chatbot, history, a = f"解析项目: {txt}", b = f"找不到本地项目或无权访问: {txt}")
         yield chatbot, history, '正常'
         return
-    file_manifest = [f for f in glob.glob(f'{project_folder}/**/*.tex', recursive=True)] # + \
+    file_manifest = [f for f in glob.glob(f'{project_folder}/**/*.tex', recursive=True)] + \
+                    [f for f in glob.glob(f'{project_folder}/**/*.pdf', recursive=True)] # + \
     # [f for f in glob.glob(f'{project_folder}/**/*.cpp', recursive=True)] + \
     # [f for f in glob.glob(f'{project_folder}/**/*.c', recursive=True)]
     if len(file_manifest) == 0:
-        report_execption(chatbot, history, a = f"解析项目: {txt}", b = f"找不到任何.tex文件: {txt}")
+        report_execption(chatbot, history, a = f"解析项目: {txt}", b = f"找不到任何.tex或pdf文件: {txt}")
         yield chatbot, history, '正常'
         return
     yield from 解析Paper(file_manifest, project_folder, top_p, temperature, chatbot, history, systemPromptTxt)
@@ -30,7 +30,7 @@ def get_crazy_functionals():
            "Color": "stop",    # 按钮颜色
            "Function": 解析一个C项目
        },
-        "读tex论文写摘要": {
+        "读tex or pdf论文写摘要": {
            "Color": "stop",    # 按钮颜色
            "Function": 读文章写摘要
        },
@@ -1,3 +1,10 @@
 gradio>=3.23
-requests[socks]
-mdtex2html
+requests[socks]~=2.28.2
+mdtex2html~=1.2.0
+Markdown~=3.4.3
+latex2mathml~=3.75.1
+bs4~=0.0.1
+lxml~=4.6.4
+beautifulsoup4~=4.12.0
+numpy~=1.24.2
+pdfminer.six
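After installing the updated requirements, a quick sanity check (a sketch, not part of the commit) confirms that the new PDF and HTML-parsing dependencies import cleanly:

    # Hypothetical check that the packages added above resolve correctly.
    import pdfminer          # provided by pdfminer.six
    import bs4, lxml         # beautifulsoup4 and the lxml parser used by 解析Paper
    from pdfminer.layout import LAParams
    print('pdfminer', pdfminer.__version__, '| bs4', bs4.__version__)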
toolbox.py
@@ -1,6 +1,14 @@
 import markdown, mdtex2html, threading, importlib, traceback
 from show_math import convert as convert_math
 from functools import wraps
+import pdfminer
+from pdfminer.pdfparser import PDFParser
+from pdfminer.pdfdocument import PDFDocument
+from pdfminer.pdfpage import PDFPage, PDFTextExtractionNotAllowed
+from pdfminer.pdfinterp import PDFResourceManager, PDFPageInterpreter
+from pdfminer.pdfdevice import PDFDevice
+from pdfminer.layout import LAParams
+from pdfminer.converter import PDFPageAggregator
 
 def predict_no_ui_but_counting_down(i_say, i_say_show_user, chatbot, top_p, temperature, history=[], sys_prompt=''):
     """
@@ -235,4 +243,52 @@ def clear_line_break(txt):
     txt = txt.replace('\n', ' ')
     txt = txt.replace('  ', ' ')
     txt = txt.replace('  ', ' ')
     return txt
+
+def readPdf(pdfPath):
+    """
+    读取pdf文件,返回文本内容
+    """
+    fp = open(pdfPath, 'rb')
+
+    # Create a PDF parser object associated with the file object
+    parser = PDFParser(fp)
+
+    # Create a PDF document object that stores the document structure.
+    # Password for initialization as 2nd parameter
+    document = PDFDocument(parser)
+    # Check if the document allows text extraction. If not, abort.
+    if not document.is_extractable:
+        raise PDFTextExtractionNotAllowed
+
+    # Create a PDF resource manager object that stores shared resources.
+    rsrcmgr = PDFResourceManager()
+
+    # Create a PDF device object.
+    # device = PDFDevice(rsrcmgr)
+
+    # BEGIN LAYOUT ANALYSIS.
+    # Set parameters for analysis.
+    laparams = LAParams(
+        char_margin=10.0,
+        line_margin=0.2,
+        boxes_flow=0.2,
+        all_texts=False,
+    )
+    # Create a PDF page aggregator object.
+    device = PDFPageAggregator(rsrcmgr, laparams=laparams)
+    # Create a PDF interpreter object.
+    interpreter = PDFPageInterpreter(rsrcmgr, device)
+
+    # loop over all pages in the document
+    outTextList = []
+    for page in PDFPage.create_pages(document):
+        # read the page into a layout object
+        interpreter.process_page(page)
+        layout = device.get_result()
+        for obj in layout._objs:
+            if isinstance(obj, pdfminer.layout.LTTextBoxHorizontal):
+                # print(obj.get_text())
+                outTextList.append(obj.get_text())
+
+    return outTextList
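For context, a minimal usage sketch of the new helper (not part of the commit): readPdf returns a list of horizontal text-box strings, which 解析Paper joins and cleans with BeautifulSoup before building the prompt. The file path below is hypothetical.

    from toolbox import readPdf
    from bs4 import BeautifulSoup

    boxes = readPdf('paper.pdf')  # hypothetical path; returns a list of LTTextBoxHorizontal strings
    text = BeautifulSoup(''.join(boxes), features="lxml").body.text
    text = text.encode('gbk', 'ignore').decode('gbk')  # drop characters GBK cannot encode, as 解析Paper does
    print(text[:200])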