diff --git a/crazy_functions/Latex全文润色.py b/crazy_functions/Latex全文润色.py index 77c1102..844513b 100644 --- a/crazy_functions/Latex全文润色.py +++ b/crazy_functions/Latex全文润色.py @@ -14,7 +14,7 @@ class PaperFileGroup(): import tiktoken from toolbox import get_conf enc = tiktoken.encoding_for_model(*get_conf('LLM_MODEL')) - def get_token_num(txt): return len(enc.encode(txt)) + def get_token_num(txt): return len(enc.encode(txt, disallowed_special=())) self.get_token_num = get_token_num def run_file_split(self, max_token_limit=1900): diff --git a/crazy_functions/Latex全文翻译.py b/crazy_functions/Latex全文翻译.py index 9d2035a..a41bdfe 100644 --- a/crazy_functions/Latex全文翻译.py +++ b/crazy_functions/Latex全文翻译.py @@ -14,7 +14,7 @@ class PaperFileGroup(): import tiktoken from toolbox import get_conf enc = tiktoken.encoding_for_model(*get_conf('LLM_MODEL')) - def get_token_num(txt): return len(enc.encode(txt)) + def get_token_num(txt): return len(enc.encode(txt, disallowed_special=())) self.get_token_num = get_token_num def run_file_split(self, max_token_limit=1900): diff --git a/crazy_functions/crazy_utils.py b/crazy_functions/crazy_utils.py index 91bd4af..c73f89f 100644 --- a/crazy_functions/crazy_utils.py +++ b/crazy_functions/crazy_utils.py @@ -6,7 +6,7 @@ def input_clipping(inputs, history, max_token_limit): import numpy as np from toolbox import get_conf enc = tiktoken.encoding_for_model(*get_conf('LLM_MODEL')) - def get_token_num(txt): return len(enc.encode(txt)) + def get_token_num(txt): return len(enc.encode(txt, disallowed_special=())) mode = 'input-and-history' # 当 输入部分的token占比 小于 全文的一半时,只裁剪历史 @@ -23,7 +23,7 @@ def input_clipping(inputs, history, max_token_limit): while n_token > max_token_limit: where = np.argmax(everything_token) - encoded = enc.encode(everything[where]) + encoded = enc.encode(everything[where], disallowed_special=()) clipped_encoded = encoded[:len(encoded)-delta] everything[where] = enc.decode(clipped_encoded)[:-1] # -1 to remove the may-be illegal char everything_token[where] = get_token_num(everything[where]) diff --git a/crazy_functions/代码重写为全英文_多线程.py b/crazy_functions/代码重写为全英文_多线程.py index f74704a..40bc45d 100644 --- a/crazy_functions/代码重写为全英文_多线程.py +++ b/crazy_functions/代码重写为全英文_多线程.py @@ -62,7 +62,7 @@ def 全项目切换英文(txt, llm_kwargs, plugin_kwargs, chatbot, history, sys_ import tiktoken from toolbox import get_conf enc = tiktoken.encoding_for_model(*get_conf('LLM_MODEL')) - def get_token_fn(txt): return len(enc.encode(txt)) + def get_token_fn(txt): return len(enc.encode(txt, disallowed_special=())) # 第6步:任务函数 diff --git a/crazy_functions/批量Markdown翻译.py b/crazy_functions/批量Markdown翻译.py index 184b315..129ae7a 100644 --- a/crazy_functions/批量Markdown翻译.py +++ b/crazy_functions/批量Markdown翻译.py @@ -14,7 +14,7 @@ class PaperFileGroup(): import tiktoken from toolbox import get_conf enc = tiktoken.encoding_for_model(*get_conf('LLM_MODEL')) - def get_token_num(txt): return len(enc.encode(txt)) + def get_token_num(txt): return len(enc.encode(txt, disallowed_special=())) self.get_token_num = get_token_num def run_file_split(self, max_token_limit=1900): diff --git a/crazy_functions/批量翻译PDF文档_多线程.py b/crazy_functions/批量翻译PDF文档_多线程.py index 0aa0b3c..de2a274 100644 --- a/crazy_functions/批量翻译PDF文档_多线程.py +++ b/crazy_functions/批量翻译PDF文档_多线程.py @@ -70,7 +70,7 @@ def 解析PDF(file_manifest, project_folder, llm_kwargs, plugin_kwargs, chatbot, from .crazy_utils import breakdown_txt_to_satisfy_token_limit_for_pdf from toolbox import get_conf enc = tiktoken.encoding_for_model(*get_conf('LLM_MODEL')) - def get_token_num(txt): return len(enc.encode(txt)) + def get_token_num(txt): return len(enc.encode(txt, disallowed_special=())) paper_fragments = breakdown_txt_to_satisfy_token_limit_for_pdf( txt=file_content, get_token_fn=get_token_num, limit=TOKEN_LIMIT_PER_FRAGMENT) page_one_fragments = breakdown_txt_to_satisfy_token_limit_for_pdf( diff --git a/crazy_functions/理解PDF文档内容.py b/crazy_functions/理解PDF文档内容.py index 1440e7c..168b2c9 100644 --- a/crazy_functions/理解PDF文档内容.py +++ b/crazy_functions/理解PDF文档内容.py @@ -19,7 +19,7 @@ def 解析PDF(file_name, llm_kwargs, plugin_kwargs, chatbot, history, system_pro from .crazy_utils import breakdown_txt_to_satisfy_token_limit_for_pdf from toolbox import get_conf enc = tiktoken.encoding_for_model(*get_conf('LLM_MODEL')) - def get_token_num(txt): return len(enc.encode(txt)) + def get_token_num(txt): return len(enc.encode(txt, disallowed_special=())) paper_fragments = breakdown_txt_to_satisfy_token_limit_for_pdf( txt=file_content, get_token_fn=get_token_num, limit=TOKEN_LIMIT_PER_FRAGMENT) page_one_fragments = breakdown_txt_to_satisfy_token_limit_for_pdf( diff --git a/crazy_functions/解析项目源代码.py b/crazy_functions/解析项目源代码.py index 2cb96eb..3a57eee 100644 --- a/crazy_functions/解析项目源代码.py +++ b/crazy_functions/解析项目源代码.py @@ -11,7 +11,8 @@ def 解析源代码新(file_manifest, project_folder, llm_kwargs, plugin_kwargs, history_array = [] sys_prompt_array = [] report_part_1 = [] - + + assert len(file_manifest) <= 512, "源文件太多, 请缩减输入文件的数量, 或者删除此行并拆分file_manifest以保证结果能被分批存储。" ############################## <第一步,逐个文件分析,多线程> ################################## for index, fp in enumerate(file_manifest): with open(fp, 'r', encoding='utf-8', errors='replace') as f: