disallow special token + limit num of file < 512

This commit is contained in:
505030475 2023-04-14 09:50:14 +08:00
parent a2002ebd85
commit dd648bd446
8 changed files with 10 additions and 9 deletions

View File

@ -14,7 +14,7 @@ class PaperFileGroup():
import tiktoken
from toolbox import get_conf
enc = tiktoken.encoding_for_model(*get_conf('LLM_MODEL'))
def get_token_num(txt): return len(enc.encode(txt))
def get_token_num(txt): return len(enc.encode(txt, disallowed_special=()))
self.get_token_num = get_token_num
def run_file_split(self, max_token_limit=1900):

View File

@ -14,7 +14,7 @@ class PaperFileGroup():
import tiktoken
from toolbox import get_conf
enc = tiktoken.encoding_for_model(*get_conf('LLM_MODEL'))
def get_token_num(txt): return len(enc.encode(txt))
def get_token_num(txt): return len(enc.encode(txt, disallowed_special=()))
self.get_token_num = get_token_num
def run_file_split(self, max_token_limit=1900):

View File

@ -6,7 +6,7 @@ def input_clipping(inputs, history, max_token_limit):
import numpy as np
from toolbox import get_conf
enc = tiktoken.encoding_for_model(*get_conf('LLM_MODEL'))
def get_token_num(txt): return len(enc.encode(txt))
def get_token_num(txt): return len(enc.encode(txt, disallowed_special=()))
mode = 'input-and-history'
# 当 输入部分的token占比 小于 全文的一半时,只裁剪历史
@ -23,7 +23,7 @@ def input_clipping(inputs, history, max_token_limit):
while n_token > max_token_limit:
where = np.argmax(everything_token)
encoded = enc.encode(everything[where])
encoded = enc.encode(everything[where], disallowed_special=())
clipped_encoded = encoded[:len(encoded)-delta]
everything[where] = enc.decode(clipped_encoded)[:-1] # -1 to remove the may-be illegal char
everything_token[where] = get_token_num(everything[where])

View File

@ -62,7 +62,7 @@ def 全项目切换英文(txt, llm_kwargs, plugin_kwargs, chatbot, history, sys_
import tiktoken
from toolbox import get_conf
enc = tiktoken.encoding_for_model(*get_conf('LLM_MODEL'))
def get_token_fn(txt): return len(enc.encode(txt))
def get_token_fn(txt): return len(enc.encode(txt, disallowed_special=()))
# 第6步任务函数

View File

@ -14,7 +14,7 @@ class PaperFileGroup():
import tiktoken
from toolbox import get_conf
enc = tiktoken.encoding_for_model(*get_conf('LLM_MODEL'))
def get_token_num(txt): return len(enc.encode(txt))
def get_token_num(txt): return len(enc.encode(txt, disallowed_special=()))
self.get_token_num = get_token_num
def run_file_split(self, max_token_limit=1900):

View File

@ -70,7 +70,7 @@ def 解析PDF(file_manifest, project_folder, llm_kwargs, plugin_kwargs, chatbot,
from .crazy_utils import breakdown_txt_to_satisfy_token_limit_for_pdf
from toolbox import get_conf
enc = tiktoken.encoding_for_model(*get_conf('LLM_MODEL'))
def get_token_num(txt): return len(enc.encode(txt))
def get_token_num(txt): return len(enc.encode(txt, disallowed_special=()))
paper_fragments = breakdown_txt_to_satisfy_token_limit_for_pdf(
txt=file_content, get_token_fn=get_token_num, limit=TOKEN_LIMIT_PER_FRAGMENT)
page_one_fragments = breakdown_txt_to_satisfy_token_limit_for_pdf(

View File

@ -19,7 +19,7 @@ def 解析PDF(file_name, llm_kwargs, plugin_kwargs, chatbot, history, system_pro
from .crazy_utils import breakdown_txt_to_satisfy_token_limit_for_pdf
from toolbox import get_conf
enc = tiktoken.encoding_for_model(*get_conf('LLM_MODEL'))
def get_token_num(txt): return len(enc.encode(txt))
def get_token_num(txt): return len(enc.encode(txt, disallowed_special=()))
paper_fragments = breakdown_txt_to_satisfy_token_limit_for_pdf(
txt=file_content, get_token_fn=get_token_num, limit=TOKEN_LIMIT_PER_FRAGMENT)
page_one_fragments = breakdown_txt_to_satisfy_token_limit_for_pdf(

View File

@ -12,6 +12,7 @@ def 解析源代码新(file_manifest, project_folder, llm_kwargs, plugin_kwargs,
sys_prompt_array = []
report_part_1 = []
assert len(file_manifest) <= 512, "源文件太多, 请缩减输入文件的数量, 或者删除此行并拆分file_manifest以保证结果能被分批存储。"
############################## <第一步,逐个文件分析,多线程> ##################################
for index, fp in enumerate(file_manifest):
with open(fp, 'r', encoding='utf-8', errors='replace') as f: