diff --git a/crazy_functions/批量总结PDF文档.py b/crazy_functions/批量总结PDF文档.py index 102bc9e..bf7fe6f 100644 --- a/crazy_functions/批量总结PDF文档.py +++ b/crazy_functions/批量总结PDF文档.py @@ -11,6 +11,7 @@ def 解析PDF(file_manifest, project_folder, top_p, temperature, chatbot, histor file_content = "" for page in doc: file_content += page.get_text() + file_content = file_content.encode('gbk', 'ignore').decode('gbk') print(file_content) prefix = "接下来请你逐文件分析下面的论文文件,概括其内容" if index==0 else "" diff --git a/crazy_functions/读文章写摘要.py b/crazy_functions/读文章写摘要.py index dc92256..4144d11 100644 --- a/crazy_functions/读文章写摘要.py +++ b/crazy_functions/读文章写摘要.py @@ -1,14 +1,19 @@ from predict import predict_no_ui -from toolbox import CatchException, report_execption, write_results_to_file, predict_no_ui_but_counting_down +from toolbox import CatchException, report_execption, write_results_to_file, predict_no_ui_but_counting_down, readPdf fast_debug = False +from bs4 import BeautifulSoup def 解析Paper(file_manifest, project_folder, top_p, temperature, chatbot, history, systemPromptTxt): import time, glob, os print('begin analysis on:', file_manifest) for index, fp in enumerate(file_manifest): - with open(fp, 'r', encoding='utf-8') as f: - file_content = f.read() + if ".tex" in fp: + with open(fp, 'r', encoding='utf-8') as f: + file_content = f.read() + if ".pdf" in fp.lower(): + file_content = readPdf(fp) + file_content = BeautifulSoup(''.join(file_content), features="lxml").body.text.encode('gbk', 'ignore').decode('gbk') prefix = "接下来请你逐文件分析下面的论文文件,概括其内容" if index==0 else "" i_say = prefix + f'请对下面的文章片段用中文做一个概述,文件名是{os.path.relpath(fp, project_folder)},文章内容是 ```{file_content}```' @@ -17,7 +22,7 @@ def 解析Paper(file_manifest, project_folder, top_p, temperature, chatbot, hist print('[1] yield chatbot, history') yield chatbot, history, '正常' - if not fast_debug: + if not fast_debug: msg = '正常' # ** gpt request ** gpt_say = yield from predict_no_ui_but_counting_down(i_say, i_say_show_user, chatbot, top_p, temperature, history=[]) # 带超时倒计时 @@ -35,7 +40,7 @@ def 解析Paper(file_manifest, project_folder, top_p, temperature, chatbot, hist chatbot.append((i_say, "[Local Message] waiting gpt response.")) yield chatbot, history, '正常' - if not fast_debug: + if not fast_debug: msg = '正常' # ** gpt request ** gpt_say = yield from predict_no_ui_but_counting_down(i_say, i_say, chatbot, top_p, temperature, history=history) # 带超时倒计时 @@ -60,11 +65,12 @@ def 读文章写摘要(txt, top_p, temperature, chatbot, history, systemPromptTx report_execption(chatbot, history, a = f"解析项目: {txt}", b = f"找不到本地项目或无权访问: {txt}") yield chatbot, history, '正常' return - file_manifest = [f for f in glob.glob(f'{project_folder}/**/*.tex', recursive=True)] # + \ + file_manifest = [f for f in glob.glob(f'{project_folder}/**/*.tex', recursive=True)] + \ + [f for f in glob.glob(f'{project_folder}/**/*.pdf', recursive=True)] # + \ # [f for f in glob.glob(f'{project_folder}/**/*.cpp', recursive=True)] + \ # [f for f in glob.glob(f'{project_folder}/**/*.c', recursive=True)] if len(file_manifest) == 0: - report_execption(chatbot, history, a = f"解析项目: {txt}", b = f"找不到任何.tex文件: {txt}") + report_execption(chatbot, history, a = f"解析项目: {txt}", b = f"找不到任何.tex或pdf文件: {txt}") yield chatbot, history, '正常' return yield from 解析Paper(file_manifest, project_folder, top_p, temperature, chatbot, history, systemPromptTxt) diff --git a/functional_crazy.py b/functional_crazy.py index 3f13853..4b90af4 100644 --- a/functional_crazy.py +++ b/functional_crazy.py @@ -30,7 +30,7 @@ def get_crazy_functionals(): "Color": "stop", # 按钮颜色 "Function": 解析一个C项目 }, - "读tex论文写摘要": { + "读tex or pdf论文写摘要": { "Color": "stop", # 按钮颜色 "Function": 读文章写摘要 }, diff --git a/requirements.txt b/requirements.txt index 84ced64..56c5b23 100644 --- a/requirements.txt +++ b/requirements.txt @@ -1,3 +1,10 @@ gradio>=3.23 -requests[socks] -mdtex2html +requests[socks]~=2.28.2 +mdtex2html~=1.2.0 +Markdown~=3.4.3 +latex2mathml~=3.75.1 +bs4~=0.0.1 +lxml~=4.6.4 +beautifulsoup4~=4.12.0 +numpy~=1.24.2 +pdfminer.six \ No newline at end of file diff --git a/toolbox.py b/toolbox.py index d96b3f6..b30c255 100644 --- a/toolbox.py +++ b/toolbox.py @@ -1,6 +1,14 @@ import markdown, mdtex2html, threading, importlib, traceback from show_math import convert as convert_math from functools import wraps +import pdfminer +from pdfminer.pdfparser import PDFParser +from pdfminer.pdfdocument import PDFDocument +from pdfminer.pdfpage import PDFPage, PDFTextExtractionNotAllowed +from pdfminer.pdfinterp import PDFResourceManager, PDFPageInterpreter +from pdfminer.pdfdevice import PDFDevice +from pdfminer.layout import LAParams +from pdfminer.converter import PDFPageAggregator def predict_no_ui_but_counting_down(i_say, i_say_show_user, chatbot, top_p, temperature, history=[], sys_prompt=''): """ @@ -235,4 +243,52 @@ def clear_line_break(txt): txt = txt.replace('\n', ' ') txt = txt.replace(' ', ' ') txt = txt.replace(' ', ' ') - return txt \ No newline at end of file + return txt + +def readPdf(pdfPath): + """ + 读取pdf文件,返回文本内容 + """ + fp = open(pdfPath, 'rb') + + # Create a PDF parser object associated with the file object + parser = PDFParser(fp) + + # Create a PDF document object that stores the document structure. + # Password for initialization as 2nd parameter + document = PDFDocument(parser) + # Check if the document allows text extraction. If not, abort. + if not document.is_extractable: + raise PDFTextExtractionNotAllowed + + # Create a PDF resource manager object that stores shared resources. + rsrcmgr = PDFResourceManager() + + # Create a PDF device object. + # device = PDFDevice(rsrcmgr) + + # BEGIN LAYOUT ANALYSIS. + # Set parameters for analysis. + laparams = LAParams( + char_margin=10.0, + line_margin=0.2, + boxes_flow=0.2, + all_texts=False, + ) + # Create a PDF page aggregator object. + device = PDFPageAggregator(rsrcmgr, laparams=laparams) + # Create a PDF interpreter object. + interpreter = PDFPageInterpreter(rsrcmgr, device) + + # loop over all pages in the document + outTextList = [] + for page in PDFPage.create_pages(document): + # read the page into a layout object + interpreter.process_page(page) + layout = device.get_result() + for obj in layout._objs: + if isinstance(obj, pdfminer.layout.LTTextBoxHorizontal): + # print(obj.get_text()) + outTextList.append(obj.get_text()) + + return outTextList \ No newline at end of file