当无法正常切割PDF文档时,强制切割

This commit is contained in:
qingxu fu 2023-04-14 13:52:56 +08:00
parent f10ea20351
commit cd6a1fd399
2 changed files with 32 additions and 11 deletions

View File

@ -104,7 +104,7 @@ def request_gpt_model_in_new_thread_with_ui_alive(
mutable[0] += f"[Local Message] 警告,在执行过程中遭遇问题, Traceback\n\n{tb_str}\n\n"
if retry_op > 0:
retry_op -= 1
mutable[0] += f"[Local Message] 重试中 {retry_times_at_unknown_error-retry_op}/{retry_times_at_unknown_error}\n\n"
mutable[0] += f"[Local Message] 重试中,请稍等 {retry_times_at_unknown_error-retry_op}/{retry_times_at_unknown_error}\n\n"
if "Rate limit reached" in tb_str:
time.sleep(30)
time.sleep(5)
@ -312,7 +312,6 @@ def breakdown_txt_to_satisfy_token_limit(txt, get_token_fn, limit):
if get_token_fn(prev) < limit:
break
if cnt == 0:
print('what the fuck ?')
raise RuntimeError("存在一行极长的文本!")
# print(len(post))
# 列表递归接龙
@ -325,8 +324,18 @@ def breakdown_txt_to_satisfy_token_limit(txt, get_token_fn, limit):
return cut(txt, must_break_at_empty_line=False)
def force_breakdown(txt, limit, get_token_fn):
"""
当无法用标点空行分割时我们用最暴力的方法切割
"""
for i in reversed(range(len(txt))):
if get_token_fn(txt[:i]) < limit:
return txt[:i], txt[i:]
return "Tiktoken未知错误", "Tiktoken未知错误"
def breakdown_txt_to_satisfy_token_limit_for_pdf(txt, get_token_fn, limit):
def cut(txt_tocut, must_break_at_empty_line): # 递归
# 递归
def cut(txt_tocut, must_break_at_empty_line, break_anyway=False):
if get_token_fn(txt_tocut) <= limit:
return [txt_tocut]
else:
@ -338,28 +347,40 @@ def breakdown_txt_to_satisfy_token_limit_for_pdf(txt, get_token_fn, limit):
if must_break_at_empty_line:
if lines[cnt] != "":
continue
print(cnt)
prev = "\n".join(lines[:cnt])
post = "\n".join(lines[cnt:])
if get_token_fn(prev) < limit:
break
if cnt == 0:
# print('what the fuck ? 存在一行极长的文本!')
raise RuntimeError("存在一行极长的文本!")
if break_anyway:
prev, post = force_breakdown(txt_tocut, limit, get_token_fn)
else:
raise RuntimeError(f"存在一行极长的文本!{txt_tocut}")
# print(len(post))
# 列表递归接龙
result = [prev]
result.extend(cut(post, must_break_at_empty_line))
result.extend(cut(post, must_break_at_empty_line, break_anyway=break_anyway))
return result
try:
# 第1次尝试将双空行\n\n作为切分点
return cut(txt, must_break_at_empty_line=True)
except RuntimeError:
try:
# 第2次尝试将单空行\n作为切分点
return cut(txt, must_break_at_empty_line=False)
except RuntimeError:
# 这个中文的句号是故意的,作为一个标识而存在
res = cut(txt.replace('.', '\n'), must_break_at_empty_line=False)
return [r.replace('\n', '.') for r in res]
try:
# 第3次尝试将英文句号.)作为切分点
res = cut(txt.replace('.', '\n'), must_break_at_empty_line=False) # 这个中文的句号是故意的,作为一个标识而存在
return [r.replace('\n', '.') for r in res]
except RuntimeError as e:
try:
# 第4次尝试将中文句号作为切分点
res = cut(txt.replace('', '。。\n'), must_break_at_empty_line=False)
return [r.replace('。。\n', '') for r in res]
except RuntimeError as e:
# 第5次尝试没办法了随便切一下敷衍吧
return cut(txt, must_break_at_empty_line=False, break_anyway=True)

View File

@ -96,7 +96,7 @@ def predict_no_ui_long_connection(inputs, llm_kwargs, history=[], sys_prompt="",
# 看门狗,如果超过期限没有喂狗,则终止
if len(observe_window) >= 2:
if (time.time()-observe_window[1]) > watch_dog_patience:
raise RuntimeError("程序终止")
raise RuntimeError("用户取消了程序。")
else: raise RuntimeError("意外Json结构"+delta)
if json_data['finish_reason'] == 'length':
raise ConnectionAbortedError("正常结束但显示Token不足导致输出不完整请削减单次输入的文本量。")