Merge pull request #174 from Euclid-Jie/Euclid_Test

feature(read pdf paper then write summary)
This commit is contained in:
binary-husky 2023-03-31 21:06:02 +08:00 committed by GitHub
commit fb889cb4ce
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23
3 changed files with 158 additions and 0 deletions

View File

@ -0,0 +1,151 @@
from predict import predict_no_ui
from toolbox import CatchException, report_execption, write_results_to_file, predict_no_ui_but_counting_down
fast_debug = False
def readPdf(pdfPath):
"""
读取pdf文件返回文本内容
"""
import pdfminer
from pdfminer.pdfparser import PDFParser
from pdfminer.pdfdocument import PDFDocument
from pdfminer.pdfpage import PDFPage, PDFTextExtractionNotAllowed
from pdfminer.pdfinterp import PDFResourceManager, PDFPageInterpreter
from pdfminer.pdfdevice import PDFDevice
from pdfminer.layout import LAParams
from pdfminer.converter import PDFPageAggregator
fp = open(pdfPath, 'rb')
# Create a PDF parser object associated with the file object
parser = PDFParser(fp)
# Create a PDF document object that stores the document structure.
# Password for initialization as 2nd parameter
document = PDFDocument(parser)
# Check if the document allows text extraction. If not, abort.
if not document.is_extractable:
raise PDFTextExtractionNotAllowed
# Create a PDF resource manager object that stores shared resources.
rsrcmgr = PDFResourceManager()
# Create a PDF device object.
# device = PDFDevice(rsrcmgr)
# BEGIN LAYOUT ANALYSIS.
# Set parameters for analysis.
laparams = LAParams(
char_margin=10.0,
line_margin=0.2,
boxes_flow=0.2,
all_texts=False,
)
# Create a PDF page aggregator object.
device = PDFPageAggregator(rsrcmgr, laparams=laparams)
# Create a PDF interpreter object.
interpreter = PDFPageInterpreter(rsrcmgr, device)
# loop over all pages in the document
outTextList = []
for page in PDFPage.create_pages(document):
# read the page into a layout object
interpreter.process_page(page)
layout = device.get_result()
for obj in layout._objs:
if isinstance(obj, pdfminer.layout.LTTextBoxHorizontal):
# print(obj.get_text())
outTextList.append(obj.get_text())
return outTextList
def 解析Paper(file_manifest, project_folder, top_p, temperature, chatbot, history, systemPromptTxt):
import time, glob, os
from bs4 import BeautifulSoup
print('begin analysis on:', file_manifest)
for index, fp in enumerate(file_manifest):
if ".tex" in fp:
with open(fp, 'r', encoding='utf-8') as f:
file_content = f.read()
if ".pdf" in fp.lower():
file_content = readPdf(fp)
file_content = BeautifulSoup(''.join(file_content), features="lxml").body.text.encode('gbk', 'ignore').decode('gbk')
prefix = "接下来请你逐文件分析下面的论文文件,概括其内容" if index==0 else ""
i_say = prefix + f'请对下面的文章片段用中文做一个概述,文件名是{os.path.relpath(fp, project_folder)},文章内容是 ```{file_content}```'
i_say_show_user = prefix + f'[{index}/{len(file_manifest)}] 请对下面的文章片段做一个概述: {os.path.abspath(fp)}'
chatbot.append((i_say_show_user, "[Local Message] waiting gpt response."))
print('[1] yield chatbot, history')
yield chatbot, history, '正常'
if not fast_debug:
msg = '正常'
# ** gpt request **
gpt_say = yield from predict_no_ui_but_counting_down(i_say, i_say_show_user, chatbot, top_p, temperature, history=[]) # 带超时倒计时
print('[2] end gpt req')
chatbot[-1] = (i_say_show_user, gpt_say)
history.append(i_say_show_user); history.append(gpt_say)
print('[3] yield chatbot, history')
yield chatbot, history, msg
print('[4] next')
if not fast_debug: time.sleep(2)
all_file = ', '.join([os.path.relpath(fp, project_folder) for index, fp in enumerate(file_manifest)])
i_say = f'根据以上你自己的分析,对全文进行概括,用学术性语言写一段中文摘要,然后再写一段英文摘要(包括{all_file})。'
chatbot.append((i_say, "[Local Message] waiting gpt response."))
yield chatbot, history, '正常'
if not fast_debug:
msg = '正常'
# ** gpt request **
gpt_say = yield from predict_no_ui_but_counting_down(i_say, i_say, chatbot, top_p, temperature, history=history) # 带超时倒计时
chatbot[-1] = (i_say, gpt_say)
history.append(i_say); history.append(gpt_say)
yield chatbot, history, msg
res = write_results_to_file(history)
chatbot.append(("完成了吗?", res))
yield chatbot, history, msg
@CatchException
def 批量总结PDF文档pdfminer(txt, top_p, temperature, chatbot, history, systemPromptTxt, WEB_PORT):
history = [] # 清空历史,以免输入溢出
import glob, os
# 基本信息:功能、贡献者
chatbot.append([
"函数插件功能?",
"批量总结PDF文档此版本使用pdfminer插件带token约简功能。函数插件贡献者: Euclid-Jie。"])
yield chatbot, history, '正常'
# 尝试导入依赖,如果缺少依赖,则给出安装建议
try:
import pdfminer, bs4
except:
report_execption(chatbot, history,
a = f"解析项目: {txt}",
b = f"导入软件依赖失败。使用该模块需要额外依赖,安装方法```pip install --upgrade pdfminer beautifulsoup4```。")
yield chatbot, history, '正常'
return
if os.path.exists(txt):
project_folder = txt
else:
if txt == "": txt = '空空如也的输入栏'
report_execption(chatbot, history, a = f"解析项目: {txt}", b = f"找不到本地项目或无权访问: {txt}")
yield chatbot, history, '正常'
return
file_manifest = [f for f in glob.glob(f'{project_folder}/**/*.tex', recursive=True)] + \
[f for f in glob.glob(f'{project_folder}/**/*.pdf', recursive=True)] # + \
# [f for f in glob.glob(f'{project_folder}/**/*.cpp', recursive=True)] + \
# [f for f in glob.glob(f'{project_folder}/**/*.c', recursive=True)]
if len(file_manifest) == 0:
report_execption(chatbot, history, a = f"解析项目: {txt}", b = f"找不到任何.tex或pdf文件: {txt}")
yield chatbot, history, '正常'
return
yield from 解析Paper(file_manifest, project_folder, top_p, temperature, chatbot, history, systemPromptTxt)

View File

@ -55,12 +55,17 @@ def get_crazy_functionals():
# VisibleLevel=1 经过测试,但功能未达到理想状态
if UserVisibleLevel >= 1:
from crazy_functions.批量总结PDF文档 import 批量总结PDF文档
from crazy_functions.批量总结PDF文档pdfminer import 批量总结PDF文档pdfminer
function_plugins.update({
"[仅供开发调试] 批量总结PDF文档": {
"Color": "stop",
# HotReload 的意思是热更新,修改函数插件代码后,不需要重启程序,代码直接生效
"Function": HotReload(批量总结PDF文档)
},
"[仅供开发调试] 批量总结PDF文档pdfminer": {
"Color": "stop",
"Function": HotReload(批量总结PDF文档pdfminer)
},
})
# VisibleLevel=2 尚未充分测试的函数插件,放在这里

View File

@ -1,3 +1,5 @@
gradio>=3.23
requests[socks]
mdtex2html
Markdown
latex2mathml