当无法正常切割PDF文档时,强制切割
This commit is contained in:
parent
f10ea20351
commit
cd6a1fd399
@ -104,7 +104,7 @@ def request_gpt_model_in_new_thread_with_ui_alive(
|
|||||||
mutable[0] += f"[Local Message] 警告,在执行过程中遭遇问题, Traceback:\n\n{tb_str}\n\n"
|
mutable[0] += f"[Local Message] 警告,在执行过程中遭遇问题, Traceback:\n\n{tb_str}\n\n"
|
||||||
if retry_op > 0:
|
if retry_op > 0:
|
||||||
retry_op -= 1
|
retry_op -= 1
|
||||||
mutable[0] += f"[Local Message] 重试中 {retry_times_at_unknown_error-retry_op}/{retry_times_at_unknown_error}:\n\n"
|
mutable[0] += f"[Local Message] 重试中,请稍等 {retry_times_at_unknown_error-retry_op}/{retry_times_at_unknown_error}:\n\n"
|
||||||
if "Rate limit reached" in tb_str:
|
if "Rate limit reached" in tb_str:
|
||||||
time.sleep(30)
|
time.sleep(30)
|
||||||
time.sleep(5)
|
time.sleep(5)
|
||||||
@ -312,7 +312,6 @@ def breakdown_txt_to_satisfy_token_limit(txt, get_token_fn, limit):
|
|||||||
if get_token_fn(prev) < limit:
|
if get_token_fn(prev) < limit:
|
||||||
break
|
break
|
||||||
if cnt == 0:
|
if cnt == 0:
|
||||||
print('what the fuck ?')
|
|
||||||
raise RuntimeError("存在一行极长的文本!")
|
raise RuntimeError("存在一行极长的文本!")
|
||||||
# print(len(post))
|
# print(len(post))
|
||||||
# 列表递归接龙
|
# 列表递归接龙
|
||||||
@ -325,8 +324,18 @@ def breakdown_txt_to_satisfy_token_limit(txt, get_token_fn, limit):
|
|||||||
return cut(txt, must_break_at_empty_line=False)
|
return cut(txt, must_break_at_empty_line=False)
|
||||||
|
|
||||||
|
|
||||||
|
def force_breakdown(txt, limit, get_token_fn):
    """
    Brute-force split, used when the text cannot be cut at punctuation or blank lines.

    Candidate prefixes ``txt[:i]`` are tried from the longest proper prefix
    (``i = len(txt) - 1``) down to the empty prefix (``i = 0``). The text is
    split at the first candidate whose token count, as reported by
    ``get_token_fn``, is strictly below ``limit``.

    Args:
        txt: The string to split.
        limit: Token budget; the returned prefix measures strictly less than this.
        get_token_fn: Callable taking a string and returning its token count.

    Returns:
        A ``(head, tail)`` tuple with ``head + tail == txt``. ``head`` can be
        the empty string when only the empty prefix is below ``limit``.
        If no prefix qualifies (which includes ``txt`` being empty), both
        elements are the placeholder string "Tiktoken未知错误".
    """
    split_at = len(txt) - 1
    while split_at >= 0:
        head = txt[:split_at]
        if get_token_fn(head) < limit:
            return head, txt[split_at:]
        split_at -= 1
    # No prefix satisfied the limit: hand back the placeholder pair.
    return "Tiktoken未知错误", "Tiktoken未知错误"
|
||||||
|
|
||||||
def breakdown_txt_to_satisfy_token_limit_for_pdf(txt, get_token_fn, limit):
|
def breakdown_txt_to_satisfy_token_limit_for_pdf(txt, get_token_fn, limit):
|
||||||
def cut(txt_tocut, must_break_at_empty_line): # 递归
|
# 递归
|
||||||
|
def cut(txt_tocut, must_break_at_empty_line, break_anyway=False):
|
||||||
if get_token_fn(txt_tocut) <= limit:
|
if get_token_fn(txt_tocut) <= limit:
|
||||||
return [txt_tocut]
|
return [txt_tocut]
|
||||||
else:
|
else:
|
||||||
@ -338,28 +347,40 @@ def breakdown_txt_to_satisfy_token_limit_for_pdf(txt, get_token_fn, limit):
|
|||||||
if must_break_at_empty_line:
|
if must_break_at_empty_line:
|
||||||
if lines[cnt] != "":
|
if lines[cnt] != "":
|
||||||
continue
|
continue
|
||||||
print(cnt)
|
|
||||||
prev = "\n".join(lines[:cnt])
|
prev = "\n".join(lines[:cnt])
|
||||||
post = "\n".join(lines[cnt:])
|
post = "\n".join(lines[cnt:])
|
||||||
if get_token_fn(prev) < limit:
|
if get_token_fn(prev) < limit:
|
||||||
break
|
break
|
||||||
if cnt == 0:
|
if cnt == 0:
|
||||||
# print('what the fuck ? 存在一行极长的文本!')
|
if break_anyway:
|
||||||
raise RuntimeError("存在一行极长的文本!")
|
prev, post = force_breakdown(txt_tocut, limit, get_token_fn)
|
||||||
|
else:
|
||||||
|
raise RuntimeError(f"存在一行极长的文本!{txt_tocut}")
|
||||||
# print(len(post))
|
# print(len(post))
|
||||||
# 列表递归接龙
|
# 列表递归接龙
|
||||||
result = [prev]
|
result = [prev]
|
||||||
result.extend(cut(post, must_break_at_empty_line))
|
result.extend(cut(post, must_break_at_empty_line, break_anyway=break_anyway))
|
||||||
return result
|
return result
|
||||||
try:
|
try:
|
||||||
|
# 第1次尝试,将双空行(\n\n)作为切分点
|
||||||
return cut(txt, must_break_at_empty_line=True)
|
return cut(txt, must_break_at_empty_line=True)
|
||||||
except RuntimeError:
|
except RuntimeError:
|
||||||
try:
|
try:
|
||||||
|
# 第2次尝试,将单空行(\n)作为切分点
|
||||||
return cut(txt, must_break_at_empty_line=False)
|
return cut(txt, must_break_at_empty_line=False)
|
||||||
except RuntimeError:
|
except RuntimeError:
|
||||||
# 这个中文的句号是故意的,作为一个标识而存在
|
try:
|
||||||
res = cut(txt.replace('.', '。\n'), must_break_at_empty_line=False)
|
# 第3次尝试,将英文句号(.)作为切分点
|
||||||
return [r.replace('。\n', '.') for r in res]
|
res = cut(txt.replace('.', '。\n'), must_break_at_empty_line=False) # 这个中文的句号是故意的,作为一个标识而存在
|
||||||
|
return [r.replace('。\n', '.') for r in res]
|
||||||
|
except RuntimeError as e:
|
||||||
|
try:
|
||||||
|
# 第4次尝试,将中文句号(。)作为切分点
|
||||||
|
res = cut(txt.replace('。', '。。\n'), must_break_at_empty_line=False)
|
||||||
|
return [r.replace('。。\n', '。') for r in res]
|
||||||
|
except RuntimeError as e:
|
||||||
|
# 第5次尝试,没办法了,随便切一下敷衍吧
|
||||||
|
return cut(txt, must_break_at_empty_line=False, break_anyway=True)
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
@ -96,7 +96,7 @@ def predict_no_ui_long_connection(inputs, llm_kwargs, history=[], sys_prompt="",
|
|||||||
# 看门狗,如果超过期限没有喂狗,则终止
|
# 看门狗,如果超过期限没有喂狗,则终止
|
||||||
if len(observe_window) >= 2:
|
if len(observe_window) >= 2:
|
||||||
if (time.time()-observe_window[1]) > watch_dog_patience:
|
if (time.time()-observe_window[1]) > watch_dog_patience:
|
||||||
raise RuntimeError("程序终止。")
|
raise RuntimeError("用户取消了程序。")
|
||||||
else: raise RuntimeError("意外Json结构:"+delta)
|
else: raise RuntimeError("意外Json结构:"+delta)
|
||||||
if json_data['finish_reason'] == 'length':
|
if json_data['finish_reason'] == 'length':
|
||||||
raise ConnectionAbortedError("正常结束,但显示Token不足,导致输出不完整,请削减单次输入的文本量。")
|
raise ConnectionAbortedError("正常结束,但显示Token不足,导致输出不完整,请削减单次输入的文本量。")
|
||||||
|
Loading…
x
Reference in New Issue
Block a user