From 4290821a504ec2996241c09b262653111c7208b8 Mon Sep 17 00:00:00 2001
From: Xminry <46775500+Xminry@users.noreply.github.com>
Date: Tue, 27 Jun 2023 01:57:31 +0800
Subject: [PATCH] =?UTF-8?q?Update=20=E7=90=86=E8=A7=A3PDF=E6=96=87?=
 =?UTF-8?q?=E6=A1=A3=E5=86=85=E5=AE=B9.py?=
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

---
 crazy_functions/理解PDF文档内容.py | 4 +++-
 1 file changed, 3 insertions(+), 1 deletion(-)

diff --git a/crazy_functions/理解PDF文档内容.py b/crazy_functions/理解PDF文档内容.py
index 5050864..f1a89a7 100644
--- a/crazy_functions/理解PDF文档内容.py
+++ b/crazy_functions/理解PDF文档内容.py
@@ -13,7 +13,9 @@ def 解析PDF(file_name, llm_kwargs, plugin_kwargs, chatbot, history, system_pro
     # 递归地切割PDF文件，每一块（尽量是完整的一个section，比如introduction，experiment等，必要时再进行切割）
     # 的长度必须小于 2500 个 Token
     file_content, page_one = read_and_clean_pdf_text(file_name) # （尝试）按照章节切割PDF
-
+    file_content = file_content.encode('utf-8', 'ignore').decode()   # drop characters that cannot be UTF-8 encoded (e.g. lone surrogates)
+    page_one = str(page_one).encode('utf-8', 'ignore').decode()  # coerce to str, then drop un-encodable characters the same way
+
     TOKEN_LIMIT_PER_FRAGMENT = 2500
 
     from .crazy_utils import breakdown_txt_to_satisfy_token_limit_for_pdf