"""
Changelog (recovered from merge-commit messages that were accidentally pasted
above the module source; preserved here verbatim for reference):

* Zhipu sdk update 适配最新的智谱SDK,支持GLM4v (#1502) * 适配 google gemini 优化为从用户input中提取文件 * 适配最新的智谱SDK、支持glm-4v * requirements.txt fix * pending history check --------- Co-authored-by: binary-husky <qingxu.fu@outlook.com> * Update "生成多种Mermaid图表" plugin: Separate out the file reading function (#1520) * Update crazy_functional.py with new functionality deal with PDF * Update crazy_functional.py and Mermaid.py for plugin_kwargs * Update crazy_functional.py with new chart type: mind map * Update SELECT_PROMPT and i_say_show_user messages * Update ArgsReminder message in get_crazy_functions() function * Update with read md file and update PROMPTS * Return the PROMPTS as the test found that the initial version worked best * Update Mermaid chart generation function * version 3.71 * 解决issues #1510 * Remove unnecessary text from sys_prompt in 解析历史输入 function * Remove sys_prompt message in 解析历史输入 function * Update bridge_all.py: supports gpt-4-turbo-preview (#1517) * Update bridge_all.py: supports gpt-4-turbo-preview supports gpt-4-turbo-preview * Update bridge_all.py --------- Co-authored-by: binary-husky <96192199+binary-husky@users.noreply.github.com> * Update config.py: supports gpt-4-turbo-preview (#1516) * Update config.py: supports gpt-4-turbo-preview supports gpt-4-turbo-preview * Update config.py --------- Co-authored-by: binary-husky <96192199+binary-husky@users.noreply.github.com> * Refactor 解析历史输入 function to handle file input * Update Mermaid chart generation functionality * rename files and functions --------- Co-authored-by: binary-husky <qingxu.fu@outlook.com> Co-authored-by: hongyi-zhao <hongyi.zhao@gmail.com> Co-authored-by: binary-husky <96192199+binary-husky@users.noreply.github.com> * 接入mathpix ocr功能 (#1468) * Update Latex输出PDF结果.py 借助mathpix实现了PDF翻译中文并重新编译PDF * Update config.py add mathpix appid & appkey * Add 'PDF翻译中文并重新编译PDF' feature to plugins.
--------- Co-authored-by: binary-husky <96192199+binary-husky@users.noreply.github.com> * fix zhipuai * check picture * remove glm-4 due to bug * 修改config * 检查MATHPIX_APPID * Remove unnecessary code and update function_plugins dictionary * capture non-standard token overflow * bug fix #1524 * change mermaid style * 支持mermaid 滚动放大缩小重置,鼠标滚动和拖拽 (#1530) * 支持mermaid 滚动放大缩小重置,鼠标滚动和拖拽 * 微调未果 先stage一下 * update --------- Co-authored-by: binary-husky <qingxu.fu@outlook.com> Co-authored-by: binary-husky <96192199+binary-husky@users.noreply.github.com> * ver 3.72 * change live2d * save the status of ``clear btn` in cookie * 前端选择保持 * js ui bug fix * reset btn bug fix * update live2d tips * fix missing get_token_num method * fix live2d toggle switch * fix persistent custom btn with cookie * fix zhipuai feedback with core functionality * Refactor button update and clean up functions * tailing space removal * Fix missing MATHPIX_APPID and MATHPIX_APPKEY configuration * Prompt fix、脑图提示词优化 (#1537) * 适配 google gemini 优化为从用户input中提取文件 * 脑图提示词优化 * Fix missing MATHPIX_APPID and MATHPIX_APPKEY configuration --------- Co-authored-by: binary-husky <qingxu.fu@outlook.com> * 优化“PDF翻译中文并重新编译PDF”插件 (#1602) * Add gemini_endpoint to API_URL_REDIRECT (#1560) * Add gemini_endpoint to API_URL_REDIRECT * Update gemini-pro and gemini-pro-vision model_info endpoints * Update to support new claude models (#1606) * Add anthropic library and update claude models * 更新bridge_claude.py文件,添加了对图片输入的支持。修复了一些bug。 * 添加Claude_3_Models变量以限制图片数量 * Refactor code to improve readability and maintainability * minor claude bug fix * more flexible one-api support * reformat config * fix one-api new access bug * dummy * compat non-standard api * version 3.73 --------- Co-authored-by: XIao <46100050+Kilig947@users.noreply.github.com> Co-authored-by: Menghuan1918 <menghuan2003@outlook.com> Co-authored-by: hongyi-zhao <hongyi.zhao@gmail.com> Co-authored-by: Hao Ma <893017927@qq.com> Co-authored-by: zeyuan huang
<599012428@qq.com>
"""
from crazy_functions.ipc_fns.mp import run_in_subprocess_with_timeout
def force_breakdown(txt, limit, get_token_fn):
    """Last-resort splitter used when no punctuation or blank-line cut exists.

    Walk candidate prefix lengths from longest to shortest and return the
    first (head, tail) pair whose head's token count is below ``limit``.
    If no prefix qualifies (e.g. ``txt`` is empty), a pair of error-marker
    strings is returned instead.
    """
    for end in range(len(txt) - 1, -1, -1):
        head = txt[:end]
        if get_token_fn(head) < limit:
            return head, txt[end:]
    # No prefix fit under the limit — signal the failure to the caller.
    return "Tiktoken未知错误", "Tiktoken未知错误"
def maintain_storage(remain_txt_to_cut, remain_txt_to_cut_storage):
    """Speed hack: keep the working text inside a bounded size window.

    When the working text exceeds ``_max`` characters the excess is parked in
    ``remain_txt_to_cut_storage``; when it shrinks below ``_min`` and the
    storage is non-empty, the parked text is appended back.

    Returns the updated ``(working_text, storage)`` pair.
    """
    _min, _max = int(5e4), int(1e5)
    # Refill the working text from storage once it has been mostly consumed.
    if remain_txt_to_cut_storage and len(remain_txt_to_cut) < _min:
        remain_txt_to_cut += remain_txt_to_cut_storage
        remain_txt_to_cut_storage = ""
    # Park any overflow beyond _max at the front of the storage.
    if len(remain_txt_to_cut) > _max:
        overflow = remain_txt_to_cut[_max:]
        remain_txt_to_cut = remain_txt_to_cut[:_max]
        remain_txt_to_cut_storage = overflow + remain_txt_to_cut_storage
    return remain_txt_to_cut, remain_txt_to_cut_storage
def cut(limit, get_token_fn, txt_tocut, must_break_at_empty_line, break_anyway=False):
    """Split txt_tocut into fragments whose token count stays within limit.

    Args:
        limit: maximum token count allowed per fragment.
        get_token_fn: callable mapping a string to its token count.
        txt_tocut: the text to split.
        must_break_at_empty_line: if True, only empty lines (double newline)
            may serve as cut points; if False, any newline may.
        break_anyway: if True, fall back to force_breakdown() when no natural
            cut point is found instead of raising RuntimeError.

    Returns:
        list of fragments, in original order.

    Raises:
        RuntimeError: no cut point fits and break_anyway is False.
    """
    res = []
    total_len = len(txt_tocut)
    fin_len = 0
    remain_txt_to_cut = txt_tocut
    remain_txt_to_cut_storage = ""
    # Speed trick: when the working text grows past `_max`, its tail is parked
    # in remain_txt_to_cut_storage and pulled back in once the working text
    # shrinks below `_min` (see maintain_storage).
    remain_txt_to_cut, remain_txt_to_cut_storage = maintain_storage(remain_txt_to_cut, remain_txt_to_cut_storage)

    while True:
        if get_token_fn(remain_txt_to_cut) <= limit:
            # The remainder already fits within the limit — no further cutting.
            res.append(remain_txt_to_cut); fin_len+=len(remain_txt_to_cut)
            break
        else:
            # The remainder is over the limit: search for a cut point.
            lines = remain_txt_to_cut.split('\n')

            # Rough estimate of the cut line, assuming tokens spread evenly.
            estimated_line_cut = limit / get_token_fn(remain_txt_to_cut) * len(lines)
            estimated_line_cut = int(estimated_line_cut)

            # Scan backwards from the estimate for a feasible cut offset (cnt).
            cnt = 0
            for cnt in reversed(range(estimated_line_cut)):
                if must_break_at_empty_line:
                    # Only an empty line may serve as the cut point.
                    if lines[cnt] != "":
                        continue
                prev = "\n".join(lines[:cnt])
                post = "\n".join(lines[cnt:])
                if get_token_fn(prev) < limit:
                    break

            if cnt == 0:
                # No usable cut point was found. NOTE(review): a genuine break
                # at cnt == 0 (empty `prev`) also lands here; that is harmless
                # because an empty fragment would make no progress anyway.
                if break_anyway:
                    # Brute-force character-level splitting is allowed.
                    prev, post = force_breakdown(remain_txt_to_cut, limit, get_token_fn)
                else:
                    # Not allowed — report the offending text.
                    raise RuntimeError(f"存在一行极长的文本!{remain_txt_to_cut}")

            # Commit this fragment.
            res.append(prev); fin_len+=len(prev)
            # Prepare the next iteration.
            remain_txt_to_cut = post
            remain_txt_to_cut, remain_txt_to_cut_storage = maintain_storage(remain_txt_to_cut, remain_txt_to_cut_storage)
            process = fin_len/total_len
            print(f'正在文本切分 {int(process*100)}%')
            if len(remain_txt_to_cut.strip()) == 0:
                break
    return res
def breakdown_text_to_satisfy_token_limit_(txt, limit, llm_model="gpt-3.5-turbo"):
    """Split ``txt`` into fragments of at most ``limit`` tokens.

    Strategies are attempted from the most natural split point to the least;
    each failure (RuntimeError raised by ``cut``) falls through to the next:
      1. double newline, 2. single newline, 3. English period,
      4. Chinese period, 5. brute-force character split (cannot fail).

    Args:
        txt: the text to split.
        limit: maximum token count per fragment.
        llm_model: model key used to look up the tokenizer in ``model_info``.

    Returns:
        list of fragments, in original order.
    """
    from request_llms.bridge_all import model_info
    enc = model_info[llm_model]['tokenizer']
    def get_token_fn(txt): return len(enc.encode(txt, disallowed_special=()))
    try:
        # 1st attempt: cut at blank lines (double newline).
        return cut(limit, get_token_fn, txt, must_break_at_empty_line=True)
    except RuntimeError:
        try:
            # 2nd attempt: cut at any newline.
            return cut(limit, get_token_fn, txt, must_break_at_empty_line=False)
        except RuntimeError:
            try:
                # 3rd attempt: cut at English periods. The Chinese full stop is a
                # deliberate sentinel marking inserted break points, undone below.
                # NOTE(review): this also rewrites '.' inside numbers/abbreviations;
                # the sentinel round-trip restores them afterwards.
                res = cut(limit, get_token_fn, txt.replace('.', '。\n'), must_break_at_empty_line=False)
                return [r.replace('。\n', '.') for r in res]
            except RuntimeError:
                try:
                    # 4th attempt: cut at Chinese periods, same sentinel trick.
                    res = cut(limit, get_token_fn, txt.replace('。', '。。\n'), must_break_at_empty_line=False)
                    return [r.replace('。。\n', '。') for r in res]
                except RuntimeError:
                    # 5th attempt: give up on natural boundaries, force-split.
                    return cut(limit, get_token_fn, txt, must_break_at_empty_line=False, break_anyway=True)
# Public entry point: runs the splitter through run_in_subprocess_with_timeout
# with a 60-second cap, so a pathological input cannot hang the caller.
breakdown_text_to_satisfy_token_limit = run_in_subprocess_with_timeout(breakdown_text_to_satisfy_token_limit_, timeout=60)
if __name__ == '__main__':
    # Manual smoke test: split the text of a large PDF under a token budget.
    from crazy_functions.crazy_utils import read_and_clean_pdf_text
    file_content, page_one = read_and_clean_pdf_text("build/assets/at.pdf")

    from request_llms.bridge_all import model_info
    # Double the text five times (x32 total) to stress the splitter.
    for _ in range(5):
        file_content += file_content

    print(len(file_content))
    TOKEN_LIMIT_PER_FRAGMENT = 2500
    res = breakdown_text_to_satisfy_token_limit(file_content, TOKEN_LIMIT_PER_FRAGMENT)