Merge pull request #905 from Xminry/master

Update 理解PDF文档内容.py
This commit is contained in:
binary-husky 2023-06-27 23:37:25 +08:00 committed by GitHub
commit d684b4cdb3
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23

View File

@ -13,6 +13,8 @@ def 解析PDF(file_name, llm_kwargs, plugin_kwargs, chatbot, history, system_pro
# 递归地切割PDF文件每一块尽量是完整的一个section比如introductionexperiment等必要时再进行切割
# 的长度必须小于 2500 个 Token
file_content, page_one = read_and_clean_pdf_text(file_name) # 尝试按照章节切割PDF
file_content = file_content.encode('utf-8', 'ignore').decode() # avoid reading non-utf8 chars
page_one = str(page_one).encode('utf-8', 'ignore').decode() # avoid reading non-utf8 chars
TOKEN_LIMIT_PER_FRAGMENT = 2500