improve long text breakdown performance
This commit is contained in:
parent 6e9936531d
commit a0bfa7ba1c
@@ -26,8 +26,8 @@ class PaperFileGroup():
                 self.sp_file_index.append(index)
                 self.sp_file_tag.append(self.file_paths[index])
             else:
-                from .crazy_utils import breakdown_txt_to_satisfy_token_limit_for_pdf
-                segments = breakdown_txt_to_satisfy_token_limit_for_pdf(file_content, self.get_token_num, max_token_limit)
+                from crazy_functions.pdf_fns.breakdown_txt import breakdown_text_to_satisfy_token_limit
+                segments = breakdown_text_to_satisfy_token_limit(file_content, max_token_limit)
                 for j, segment in enumerate(segments):
                     self.sp_file_contents.append(segment)
                     self.sp_file_index.append(index)
@@ -26,8 +26,8 @@ class PaperFileGroup():
                 self.sp_file_index.append(index)
                 self.sp_file_tag.append(self.file_paths[index])
             else:
-                from .crazy_utils import breakdown_txt_to_satisfy_token_limit_for_pdf
-                segments = breakdown_txt_to_satisfy_token_limit_for_pdf(file_content, self.get_token_num, max_token_limit)
+                from crazy_functions.pdf_fns.breakdown_txt import breakdown_text_to_satisfy_token_limit
+                segments = breakdown_text_to_satisfy_token_limit(file_content, max_token_limit)
                 for j, segment in enumerate(segments):
                     self.sp_file_contents.append(segment)
                     self.sp_file_index.append(index)
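Note on the API change repeated across these hunks: the removed helper, breakdown_txt_to_satisfy_token_limit_for_pdf, required every call site to build and pass a token-counting callback, while the new breakdown_text_to_satisfy_token_limit resolves the tokenizer internally from a model name (defaulting to "gpt-3.5-turbo"). A minimal migration sketch follows; the wrapper function and variable names around the call are illustrative, not taken from the diff.

# Hedged migration sketch: only the imported function comes from this commit.
from crazy_functions.pdf_fns.breakdown_txt import breakdown_text_to_satisfy_token_limit

def split_for_processing(file_content, max_token_limit=1900):
    # Old call shape: breakdown_txt_to_satisfy_token_limit_for_pdf(file_content, get_token_num, max_token_limit)
    # New call shape: no token-counting callback; the tokenizer is looked up from the llm_model
    # argument, which defaults to "gpt-3.5-turbo" inside the new module.
    return breakdown_text_to_satisfy_token_limit(file_content, max_token_limit)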
@@ -312,95 +312,6 @@ def request_gpt_model_multi_threads_with_very_awesome_ui_and_high_efficiency(
     return gpt_response_collection
 
 
-def breakdown_txt_to_satisfy_token_limit(txt, get_token_fn, limit):
-    def cut(txt_tocut, must_break_at_empty_line):  # recursive
-        if get_token_fn(txt_tocut) <= limit:
-            return [txt_tocut]
-        else:
-            lines = txt_tocut.split('\n')
-            estimated_line_cut = limit / get_token_fn(txt_tocut) * len(lines)
-            estimated_line_cut = int(estimated_line_cut)
-            for cnt in reversed(range(estimated_line_cut)):
-                if must_break_at_empty_line:
-                    if lines[cnt] != "":
-                        continue
-                print(cnt)
-                prev = "\n".join(lines[:cnt])
-                post = "\n".join(lines[cnt:])
-                if get_token_fn(prev) < limit:
-                    break
-            if cnt == 0:
-                raise RuntimeError("存在一行极长的文本!")
-            # print(len(post))
-            # recursively chain the result lists
-            result = [prev]
-            result.extend(cut(post, must_break_at_empty_line))
-            return result
-    try:
-        return cut(txt, must_break_at_empty_line=True)
-    except RuntimeError:
-        return cut(txt, must_break_at_empty_line=False)
-
-
-def force_breakdown(txt, limit, get_token_fn):
-    """
-    When the text cannot be split at punctuation or blank lines, fall back to brute-force cutting
-    """
-    for i in reversed(range(len(txt))):
-        if get_token_fn(txt[:i]) < limit:
-            return txt[:i], txt[i:]
-    return "Tiktoken未知错误", "Tiktoken未知错误"
-
-
-def breakdown_txt_to_satisfy_token_limit_for_pdf(txt, get_token_fn, limit):
-    # recursive
-    def cut(txt_tocut, must_break_at_empty_line, break_anyway=False):
-        if get_token_fn(txt_tocut) <= limit:
-            return [txt_tocut]
-        else:
-            lines = txt_tocut.split('\n')
-            estimated_line_cut = limit / get_token_fn(txt_tocut) * len(lines)
-            estimated_line_cut = int(estimated_line_cut)
-            cnt = 0
-            for cnt in reversed(range(estimated_line_cut)):
-                if must_break_at_empty_line:
-                    if lines[cnt] != "":
-                        continue
-                prev = "\n".join(lines[:cnt])
-                post = "\n".join(lines[cnt:])
-                if get_token_fn(prev) < limit:
-                    break
-            if cnt == 0:
-                if break_anyway:
-                    prev, post = force_breakdown(txt_tocut, limit, get_token_fn)
-                else:
-                    raise RuntimeError(f"存在一行极长的文本!{txt_tocut}")
-            # print(len(post))
-            # recursively chain the result lists
-            result = [prev]
-            result.extend(cut(post, must_break_at_empty_line, break_anyway=break_anyway))
-            return result
-    try:
-        # 1st attempt: split at double blank lines (\n\n)
-        return cut(txt, must_break_at_empty_line=True)
-    except RuntimeError:
-        try:
-            # 2nd attempt: split at single newlines (\n)
-            return cut(txt, must_break_at_empty_line=False)
-        except RuntimeError:
-            try:
-                # 3rd attempt: split at English periods (.)
-                res = cut(txt.replace('.', '。\n'), must_break_at_empty_line=False)  # the Chinese full stop here is intentional; it serves as a marker
-                return [r.replace('。\n', '.') for r in res]
-            except RuntimeError as e:
-                try:
-                    # 4th attempt: split at Chinese full stops (。)
-                    res = cut(txt.replace('。', '。。\n'), must_break_at_empty_line=False)
-                    return [r.replace('。。\n', '。') for r in res]
-                except RuntimeError as e:
-                    # 5th attempt: nothing else works, just cut anywhere
-                    return cut(txt, must_break_at_empty_line=False, break_anyway=True)
-
-
 
 def read_and_clean_pdf_text(fp):
     """
@@ -631,7 +542,6 @@ def get_files_from_everything(txt, type): # type='.md'
 
 
-
 
 @Singleton
 class nougat_interface():
     def __init__(self):
crazy_functions/ipc_fns/mp.py (new file, 37 lines)
@@ -0,0 +1,37 @@
+import platform
+import pickle
+import multiprocessing
+
+def run_in_subprocess_wrapper_func(v_args):
+    func, args, kwargs, return_dict, exception_dict = pickle.loads(v_args)
+    import sys
+    try:
+        result = func(*args, **kwargs)
+        return_dict['result'] = result
+    except Exception as e:
+        exc_info = sys.exc_info()
+        exception_dict['exception'] = exc_info
+
+def run_in_subprocess_with_timeout(func, timeout=60):
+    if platform.system() == 'Linux':
+        def wrapper(*args, **kwargs):
+            return_dict = multiprocessing.Manager().dict()
+            exception_dict = multiprocessing.Manager().dict()
+            v_args = pickle.dumps((func, args, kwargs, return_dict, exception_dict))
+            process = multiprocessing.Process(target=run_in_subprocess_wrapper_func, args=(v_args,))
+            process.start()
+            process.join(timeout)
+            if process.is_alive():
+                process.terminate()
+                raise TimeoutError(f'功能单元{str(func)}未能在规定时间内完成任务')
+            process.close()
+            if 'exception' in exception_dict:
+                # ooops, the subprocess ran into an exception
+                exc_info = exception_dict['exception']
+                raise exc_info[1].with_traceback(exc_info[2])
+            if 'result' in return_dict.keys():
+                # If the subprocess ran successfully, return the result
+                return return_dict['result']
+        return wrapper
+    else:
+        return func
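The new helper above runs a pickled function call in a child process and enforces a timeout, but only on Linux; on other platforms it returns the original function unchanged. A hedged usage sketch follows; the wrapped word-counting function is made up for illustration and is not part of the commit.

from crazy_functions.ipc_fns.mp import run_in_subprocess_with_timeout

def count_words(text):
    # Stand-in for any CPU-heavy, picklable function (illustrative only).
    return len(text.split())

count_words_safely = run_in_subprocess_with_timeout(count_words, timeout=10)

if __name__ == '__main__':
    # On Linux this runs in a separate process and raises TimeoutError after 10 s;
    # elsewhere it simply calls count_words directly.
    print(count_words_safely("some long document " * 1000))

In this commit the wrapper is applied to the new text splitter with timeout=60, so a pathological input can no longer hang the main process indefinitely.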
@@ -175,7 +175,6 @@ class LatexPaperFileGroup():
         self.sp_file_contents = []
         self.sp_file_index = []
         self.sp_file_tag = []
 
         # count_token
         from request_llms.bridge_all import model_info
         enc = model_info["gpt-3.5-turbo"]['tokenizer']
@@ -192,13 +191,12 @@ class LatexPaperFileGroup():
                 self.sp_file_index.append(index)
                 self.sp_file_tag.append(self.file_paths[index])
             else:
-                from ..crazy_utils import breakdown_txt_to_satisfy_token_limit_for_pdf
-                segments = breakdown_txt_to_satisfy_token_limit_for_pdf(file_content, self.get_token_num, max_token_limit)
+                from crazy_functions.pdf_fns.breakdown_txt import breakdown_text_to_satisfy_token_limit
+                segments = breakdown_text_to_satisfy_token_limit(file_content, max_token_limit)
                 for j, segment in enumerate(segments):
                     self.sp_file_contents.append(segment)
                     self.sp_file_index.append(index)
                     self.sp_file_tag.append(self.file_paths[index] + f".part-{j}.tex")
-        print('Segmentation: done')
 
     def merge_result(self):
         self.file_result = ["" for _ in range(len(self.file_paths))]
crazy_functions/pdf_fns/breakdown_txt.py (new file, 125 lines)
@@ -0,0 +1,125 @@
+from crazy_functions.ipc_fns.mp import run_in_subprocess_with_timeout
+
+def force_breakdown(txt, limit, get_token_fn):
+    """ When the text cannot be split at punctuation or blank lines, fall back to brute-force cutting
+    """
+    for i in reversed(range(len(txt))):
+        if get_token_fn(txt[:i]) < limit:
+            return txt[:i], txt[i:]
+    return "Tiktoken未知错误", "Tiktoken未知错误"
+
+
+def maintain_storage(remain_txt_to_cut, remain_txt_to_cut_storage):
+    """ To speed up the computation we use a special trick: when remain_txt_to_cut grows beyond `_max`, the text after _max is parked in remain_txt_to_cut_storage;
+    when remain_txt_to_cut shrinks below `_min`, part of remain_txt_to_cut_storage is pulled back in
+    """
+    _min = int(5e4)
+    _max = int(1e5)
+    # print(len(remain_txt_to_cut), len(remain_txt_to_cut_storage))
+    if len(remain_txt_to_cut) < _min and len(remain_txt_to_cut_storage) > 0:
+        remain_txt_to_cut = remain_txt_to_cut + remain_txt_to_cut_storage
+        remain_txt_to_cut_storage = ""
+    if len(remain_txt_to_cut) > _max:
+        remain_txt_to_cut_storage = remain_txt_to_cut[_max:] + remain_txt_to_cut_storage
+        remain_txt_to_cut = remain_txt_to_cut[:_max]
+    return remain_txt_to_cut, remain_txt_to_cut_storage
+
+
+def cut(limit, get_token_fn, txt_tocut, must_break_at_empty_line, break_anyway=False):
+    """ Split the text
+    """
+    res = []
+    total_len = len(txt_tocut)
+    fin_len = 0
+    remain_txt_to_cut = txt_tocut
+    remain_txt_to_cut_storage = ""
+    # To speed things up: when remain_txt_to_cut grows beyond `_max`, the text after _max is parked in remain_txt_to_cut_storage
+    remain_txt_to_cut, remain_txt_to_cut_storage = maintain_storage(remain_txt_to_cut, remain_txt_to_cut_storage)
+
+    while True:
+        if get_token_fn(remain_txt_to_cut) <= limit:
+            # the remaining text is within the token limit, no more cutting is needed
+            res.append(remain_txt_to_cut); fin_len+=len(remain_txt_to_cut)
+            break
+        else:
+            # the remaining text still exceeds the token limit, keep cutting
+            lines = remain_txt_to_cut.split('\n')
+
+            # estimate a cut point
+            estimated_line_cut = limit / get_token_fn(remain_txt_to_cut) * len(lines)
+            estimated_line_cut = int(estimated_line_cut)
+
+            # search for a suitable cut offset (cnt)
+            cnt = 0
+            for cnt in reversed(range(estimated_line_cut)):
+                if must_break_at_empty_line:
+                    # first try to break at a double blank line (\n\n)
+                    if lines[cnt] != "":
+                        continue
+                prev = "\n".join(lines[:cnt])
+                post = "\n".join(lines[cnt:])
+                if get_token_fn(prev) < limit:
+                    break
+
+            if cnt == 0:
+                # no suitable cut point was found
+                if break_anyway:
+                    # brute-force cutting is allowed
+                    prev, post = force_breakdown(txt_tocut, limit, get_token_fn)
+                else:
+                    # brute-force cutting is not allowed, raise an error
+                    raise RuntimeError(f"存在一行极长的文本!{txt_tocut}")
+
+            # append to the result list
+            res.append(prev); fin_len+=len(prev)
+            # prepare the next iteration
+            remain_txt_to_cut = post
+            remain_txt_to_cut, remain_txt_to_cut_storage = maintain_storage(remain_txt_to_cut, remain_txt_to_cut_storage)
+            process = fin_len/total_len
+            print(f'\r正在文本切分 {int(process*100)}%', end='')
+            if len(remain_txt_to_cut.strip()) == 0:
+                break
+    return res
+
+
+def breakdown_text_to_satisfy_token_limit_(txt, limit, llm_model="gpt-3.5-turbo"):
+    """ Try several strategies in turn to split the text so that it satisfies the token limit
+    """
+    from request_llms.bridge_all import model_info
+    enc = model_info[llm_model]['tokenizer']
+    def get_token_fn(txt): return len(enc.encode(txt, disallowed_special=()))
+    try:
+        # 1st attempt: split at double blank lines (\n\n)
+        return cut(limit, get_token_fn, txt, must_break_at_empty_line=True)
+    except RuntimeError:
+        try:
+            # 2nd attempt: split at single newlines (\n)
+            return cut(limit, get_token_fn, txt, must_break_at_empty_line=False)
+        except RuntimeError:
+            try:
+                # 3rd attempt: split at English periods (.)
+                res = cut(limit, get_token_fn, txt.replace('.', '。\n'), must_break_at_empty_line=False)  # the Chinese full stop here is intentional; it serves as a marker
+                return [r.replace('。\n', '.') for r in res]
+            except RuntimeError as e:
+                try:
+                    # 4th attempt: split at Chinese full stops (。)
+                    res = cut(limit, get_token_fn, txt.replace('。', '。。\n'), must_break_at_empty_line=False)
+                    return [r.replace('。。\n', '。') for r in res]
+                except RuntimeError as e:
+                    # 5th attempt: nothing else works, just cut anywhere
+                    return cut(limit, get_token_fn, txt, must_break_at_empty_line=False, break_anyway=True)
+
+breakdown_text_to_satisfy_token_limit = run_in_subprocess_with_timeout(breakdown_text_to_satisfy_token_limit_, timeout=60)
+
+if __name__ == '__main__':
+    from crazy_functions.crazy_utils import read_and_clean_pdf_text
+    file_content, page_one = read_and_clean_pdf_text("build/assets/at.pdf")
+
+    from request_llms.bridge_all import model_info
+    for i in range(5):
+        file_content += file_content
+
+    print(len(file_content))
+    TOKEN_LIMIT_PER_FRAGMENT = 2500
+    res = breakdown_text_to_satisfy_token_limit(file_content, TOKEN_LIMIT_PER_FRAGMENT)
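Compared with the removed recursive splitter in crazy_utils.py, the new module works iteratively and, through maintain_storage, keeps at most about 1e5 characters in the working buffer (parking the rest and pulling it back once the buffer drops below 5e4), so the tokenizer is not run repeatedly over the entire remaining document; the public entry point is also wrapped in run_in_subprocess_with_timeout with a 60-second limit. A minimal usage sketch, assuming the repository's request_llms environment is importable; the sample text is invented for illustration:

from crazy_functions.pdf_fns.breakdown_txt import breakdown_text_to_satisfy_token_limit

if __name__ == '__main__':
    # Paragraphs separated by blank lines, so the first strategy (split at \n\n) can succeed.
    long_text = ("This is a filler sentence. " * 40 + "\n\n") * 200
    fragments = breakdown_text_to_satisfy_token_limit(long_text, limit=1000, llm_model="gpt-3.5-turbo")
    print(f"{len(fragments)} fragments, longest fragment has {max(len(f) for f in fragments)} characters")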
@@ -74,7 +74,7 @@ def produce_report_markdown(gpt_response_collection, meta, paper_meta_info, chat
 
 def translate_pdf(article_dict, llm_kwargs, chatbot, fp, generated_conclusion_files, TOKEN_LIMIT_PER_FRAGMENT, DST_LANG):
     from crazy_functions.pdf_fns.report_gen_html import construct_html
-    from crazy_functions.crazy_utils import breakdown_txt_to_satisfy_token_limit_for_pdf
+    from crazy_functions.pdf_fns.breakdown_txt import breakdown_text_to_satisfy_token_limit
     from crazy_functions.crazy_utils import request_gpt_model_in_new_thread_with_ui_alive
     from crazy_functions.crazy_utils import request_gpt_model_multi_threads_with_very_awesome_ui_and_high_efficiency
 
@@ -116,7 +116,7 @@ def translate_pdf(article_dict, llm_kwargs, chatbot, fp, generated_conclusion_fi
         # find a smooth token limit to achieve even seperation
         count = int(math.ceil(raw_token_num / TOKEN_LIMIT_PER_FRAGMENT))
         token_limit_smooth = raw_token_num // count + count
-        return breakdown_txt_to_satisfy_token_limit_for_pdf(txt, get_token_fn=get_token_num, limit=token_limit_smooth)
+        return breakdown_text_to_satisfy_token_limit(txt, limit=token_limit_smooth, llm_model=llm_kwargs['llm_model'])
 
     for section in article_dict.get('sections'):
         if len(section['text']) == 0: continue
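Only the splitter call changes in this hunk; the surrounding "smooth token limit" logic is kept. A small worked example of why the limit is recomputed (the numbers are illustrative):

import math

raw_token_num = 9000
TOKEN_LIMIT_PER_FRAGMENT = 2500

count = int(math.ceil(raw_token_num / TOKEN_LIMIT_PER_FRAGMENT))  # 4
token_limit_smooth = raw_token_num // count + count               # 2250 + 4 = 2254

# Cutting at roughly 2254 tokens yields four similar-sized fragments
# instead of three full 2500-token fragments plus a small remainder.
print(count, token_limit_smooth)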
@@ -31,15 +31,11 @@ def 解析docx(file_manifest, project_folder, llm_kwargs, plugin_kwargs, chatbot
 
        print(file_content)
        # file names under private_upload are easily garbled after unzipping (rar and 7z are fine), so only the article content is analyzed and the file name is not passed in
-       from .crazy_utils import breakdown_txt_to_satisfy_token_limit_for_pdf
+       from crazy_functions.pdf_fns.breakdown_txt import breakdown_text_to_satisfy_token_limit
        from request_llms.bridge_all import model_info
        max_token = model_info[llm_kwargs['llm_model']]['max_token']
        TOKEN_LIMIT_PER_FRAGMENT = max_token * 3 // 4
-       paper_fragments = breakdown_txt_to_satisfy_token_limit_for_pdf(
-           txt=file_content,
-           get_token_fn=model_info[llm_kwargs['llm_model']]['token_cnt'],
-           limit=TOKEN_LIMIT_PER_FRAGMENT
-       )
+       paper_fragments = breakdown_text_to_satisfy_token_limit(txt=file_content, limit=TOKEN_LIMIT_PER_FRAGMENT, llm_model=llm_kwargs['llm_model'])
        this_paper_history = []
        for i, paper_frag in enumerate(paper_fragments):
            i_say = f'请对下面的文章片段用中文做概述,文件名是{os.path.relpath(fp, project_folder)},文章内容是 ```{paper_frag}```'
@@ -28,8 +28,8 @@ class PaperFileGroup():
                 self.sp_file_index.append(index)
                 self.sp_file_tag.append(self.file_paths[index])
             else:
-                from .crazy_utils import breakdown_txt_to_satisfy_token_limit_for_pdf
-                segments = breakdown_txt_to_satisfy_token_limit_for_pdf(file_content, self.get_token_num, max_token_limit)
+                from crazy_functions.pdf_fns.breakdown_txt import breakdown_text_to_satisfy_token_limit
+                segments = breakdown_text_to_satisfy_token_limit(file_content, max_token_limit)
                 for j, segment in enumerate(segments):
                     self.sp_file_contents.append(segment)
                     self.sp_file_index.append(index)
@@ -20,14 +20,9 @@ def 解析PDF(file_manifest, project_folder, llm_kwargs, plugin_kwargs, chatbot,
 
     TOKEN_LIMIT_PER_FRAGMENT = 2500
 
-    from .crazy_utils import breakdown_txt_to_satisfy_token_limit_for_pdf
-    from request_llms.bridge_all import model_info
-    enc = model_info["gpt-3.5-turbo"]['tokenizer']
-    def get_token_num(txt): return len(enc.encode(txt, disallowed_special=()))
-    paper_fragments = breakdown_txt_to_satisfy_token_limit_for_pdf(
-        txt=file_content, get_token_fn=get_token_num, limit=TOKEN_LIMIT_PER_FRAGMENT)
-    page_one_fragments = breakdown_txt_to_satisfy_token_limit_for_pdf(
-        txt=str(page_one), get_token_fn=get_token_num, limit=TOKEN_LIMIT_PER_FRAGMENT//4)
+    from crazy_functions.pdf_fns.breakdown_txt import breakdown_text_to_satisfy_token_limit
+    paper_fragments = breakdown_text_to_satisfy_token_limit(txt=file_content, limit=TOKEN_LIMIT_PER_FRAGMENT, llm_model=llm_kwargs['llm_model'])
+    page_one_fragments = breakdown_text_to_satisfy_token_limit(txt=str(page_one), limit=TOKEN_LIMIT_PER_FRAGMENT//4, llm_model=llm_kwargs['llm_model'])
     # for better results, strip the part after the Introduction (if present)
     paper_meta = page_one_fragments[0].split('introduction')[0].split('Introduction')[0].split('INTRODUCTION')[0]
 
@@ -91,14 +91,9 @@ def 解析PDF(file_manifest, project_folder, llm_kwargs, plugin_kwargs, chatbot,
     page_one = str(page_one).encode('utf-8', 'ignore').decode()  # avoid reading non-utf8 chars
 
     # recursively split the PDF file
-    from .crazy_utils import breakdown_txt_to_satisfy_token_limit_for_pdf
-    from request_llms.bridge_all import model_info
-    enc = model_info["gpt-3.5-turbo"]['tokenizer']
-    def get_token_num(txt): return len(enc.encode(txt, disallowed_special=()))
-    paper_fragments = breakdown_txt_to_satisfy_token_limit_for_pdf(
-        txt=file_content, get_token_fn=get_token_num, limit=TOKEN_LIMIT_PER_FRAGMENT)
-    page_one_fragments = breakdown_txt_to_satisfy_token_limit_for_pdf(
-        txt=page_one, get_token_fn=get_token_num, limit=TOKEN_LIMIT_PER_FRAGMENT//4)
+    from crazy_functions.pdf_fns.breakdown_txt import breakdown_text_to_satisfy_token_limit
+    paper_fragments = breakdown_text_to_satisfy_token_limit(txt=file_content, limit=TOKEN_LIMIT_PER_FRAGMENT, llm_model=llm_kwargs['llm_model'])
+    page_one_fragments = breakdown_text_to_satisfy_token_limit(txt=page_one, limit=TOKEN_LIMIT_PER_FRAGMENT//4, llm_model=llm_kwargs['llm_model'])
 
     # for better results, strip the part after the Introduction (if present)
     paper_meta = page_one_fragments[0].split('introduction')[0].split('Introduction')[0].split('INTRODUCTION')[0]
@@ -18,14 +18,9 @@ def 解析PDF(file_name, llm_kwargs, plugin_kwargs, chatbot, history, system_pro
 
     TOKEN_LIMIT_PER_FRAGMENT = 2500
 
-    from .crazy_utils import breakdown_txt_to_satisfy_token_limit_for_pdf
-    from request_llms.bridge_all import model_info
-    enc = model_info["gpt-3.5-turbo"]['tokenizer']
-    def get_token_num(txt): return len(enc.encode(txt, disallowed_special=()))
-    paper_fragments = breakdown_txt_to_satisfy_token_limit_for_pdf(
-        txt=file_content, get_token_fn=get_token_num, limit=TOKEN_LIMIT_PER_FRAGMENT)
-    page_one_fragments = breakdown_txt_to_satisfy_token_limit_for_pdf(
-        txt=str(page_one), get_token_fn=get_token_num, limit=TOKEN_LIMIT_PER_FRAGMENT//4)
+    from crazy_functions.pdf_fns.breakdown_txt import breakdown_text_to_satisfy_token_limit
+    paper_fragments = breakdown_text_to_satisfy_token_limit(txt=file_content, limit=TOKEN_LIMIT_PER_FRAGMENT, llm_model=llm_kwargs['llm_model'])
+    page_one_fragments = breakdown_text_to_satisfy_token_limit(txt=str(page_one), limit=TOKEN_LIMIT_PER_FRAGMENT//4, llm_model=llm_kwargs['llm_model'])
    # for better results, strip the part after the Introduction (if present)
     paper_meta = page_one_fragments[0].split('introduction')[0].split('Introduction')[0].split('INTRODUCTION')[0]
 
@@ -45,7 +40,7 @@ def 解析PDF(file_name, llm_kwargs, plugin_kwargs, chatbot, history, system_pro
     for i in range(n_fragment):
         NUM_OF_WORD = MAX_WORD_TOTAL // n_fragment
         i_say = f"Read this section, recapitulate the content of this section with less than {NUM_OF_WORD} words: {paper_fragments[i]}"
-        i_say_show_user = f"[{i+1}/{n_fragment}] Read this section, recapitulate the content of this section with less than {NUM_OF_WORD} words: {paper_fragments[i][:200]}"
+        i_say_show_user = f"[{i+1}/{n_fragment}] Read this section, recapitulate the content of this section with less than {NUM_OF_WORD} words: {paper_fragments[i][:200]} ...."
         gpt_say = yield from request_gpt_model_in_new_thread_with_ui_alive(i_say, i_say_show_user,  # i_say = the actual prompt sent to chatgpt, i_say_show_user = the prompt shown to the user
                                                                            llm_kwargs, chatbot,
                                                                            history=["The main idea of the previous section is?", last_iteration_result],  # iterate on the result of the previous section
@@ -12,13 +12,6 @@ class PaperFileGroup():
         self.sp_file_index = []
         self.sp_file_tag = []
-
-        # count_token
-        from request_llms.bridge_all import model_info
-        enc = model_info["gpt-3.5-turbo"]['tokenizer']
-        def get_token_num(txt): return len(
-            enc.encode(txt, disallowed_special=()))
-        self.get_token_num = get_token_num
 
     def run_file_split(self, max_token_limit=1900):
         """
        Separate the long text into pieces
@@ -29,9 +22,8 @@ class PaperFileGroup():
                 self.sp_file_index.append(index)
                 self.sp_file_tag.append(self.file_paths[index])
             else:
-                from .crazy_utils import breakdown_txt_to_satisfy_token_limit_for_pdf
-                segments = breakdown_txt_to_satisfy_token_limit_for_pdf(
-                    file_content, self.get_token_num, max_token_limit)
+                from crazy_functions.pdf_fns.breakdown_txt import breakdown_text_to_satisfy_token_limit
+                segments = breakdown_text_to_satisfy_token_limit(file_content, max_token_limit)
                 for j, segment in enumerate(segments):
                     self.sp_file_contents.append(segment)
                     self.sp_file_index.append(index)