improve long text breakdown performance
This commit is contained in:
parent 6e9936531d
commit a0bfa7ba1c
@@ -26,8 +26,8 @@ class PaperFileGroup():
                 self.sp_file_index.append(index)
                 self.sp_file_tag.append(self.file_paths[index])
             else:
-                from .crazy_utils import breakdown_txt_to_satisfy_token_limit_for_pdf
-                segments = breakdown_txt_to_satisfy_token_limit_for_pdf(file_content, self.get_token_num, max_token_limit)
+                from crazy_functions.pdf_fns.breakdown_txt import breakdown_text_to_satisfy_token_limit
+                segments = breakdown_text_to_satisfy_token_limit(file_content, max_token_limit)
                 for j, segment in enumerate(segments):
                     self.sp_file_contents.append(segment)
                     self.sp_file_index.append(index)
@@ -26,8 +26,8 @@ class PaperFileGroup():
                 self.sp_file_index.append(index)
                 self.sp_file_tag.append(self.file_paths[index])
             else:
-                from .crazy_utils import breakdown_txt_to_satisfy_token_limit_for_pdf
-                segments = breakdown_txt_to_satisfy_token_limit_for_pdf(file_content, self.get_token_num, max_token_limit)
+                from crazy_functions.pdf_fns.breakdown_txt import breakdown_text_to_satisfy_token_limit
+                segments = breakdown_text_to_satisfy_token_limit(file_content, max_token_limit)
                 for j, segment in enumerate(segments):
                     self.sp_file_contents.append(segment)
                     self.sp_file_index.append(index)
@@ -312,95 +312,6 @@ def request_gpt_model_multi_threads_with_very_awesome_ui_and_high_efficiency(
     return gpt_response_collection
 
 
-def breakdown_txt_to_satisfy_token_limit(txt, get_token_fn, limit):
-    def cut(txt_tocut, must_break_at_empty_line):  # recursive
-        if get_token_fn(txt_tocut) <= limit:
-            return [txt_tocut]
-        else:
-            lines = txt_tocut.split('\n')
-            estimated_line_cut = limit / get_token_fn(txt_tocut) * len(lines)
-            estimated_line_cut = int(estimated_line_cut)
-            for cnt in reversed(range(estimated_line_cut)):
-                if must_break_at_empty_line:
-                    if lines[cnt] != "":
-                        continue
-                print(cnt)
-                prev = "\n".join(lines[:cnt])
-                post = "\n".join(lines[cnt:])
-                if get_token_fn(prev) < limit:
-                    break
-            if cnt == 0:
-                raise RuntimeError("存在一行极长的文本!")
-            # print(len(post))
-            # recursively chain the list of fragments
-            result = [prev]
-            result.extend(cut(post, must_break_at_empty_line))
-            return result
-    try:
-        return cut(txt, must_break_at_empty_line=True)
-    except RuntimeError:
-        return cut(txt, must_break_at_empty_line=False)
-
-
-def force_breakdown(txt, limit, get_token_fn):
-    """
-    When the text cannot be split at punctuation or blank lines, fall back to brute-force cutting
-    """
-    for i in reversed(range(len(txt))):
-        if get_token_fn(txt[:i]) < limit:
-            return txt[:i], txt[i:]
-    return "Tiktoken未知错误", "Tiktoken未知错误"
-
-def breakdown_txt_to_satisfy_token_limit_for_pdf(txt, get_token_fn, limit):
-    # recursive
-    def cut(txt_tocut, must_break_at_empty_line, break_anyway=False):  
-        if get_token_fn(txt_tocut) <= limit:
-            return [txt_tocut]
-        else:
-            lines = txt_tocut.split('\n')
-            estimated_line_cut = limit / get_token_fn(txt_tocut) * len(lines)
-            estimated_line_cut = int(estimated_line_cut)
-            cnt = 0
-            for cnt in reversed(range(estimated_line_cut)):
-                if must_break_at_empty_line:
-                    if lines[cnt] != "":
-                        continue
-                prev = "\n".join(lines[:cnt])
-                post = "\n".join(lines[cnt:])
-                if get_token_fn(prev) < limit:
-                    break
-            if cnt == 0:
-                if break_anyway:
-                    prev, post = force_breakdown(txt_tocut, limit, get_token_fn)
-                else:
-                    raise RuntimeError(f"存在一行极长的文本!{txt_tocut}")
-            # print(len(post))
-            # recursively chain the list of fragments
-            result = [prev]
-            result.extend(cut(post, must_break_at_empty_line, break_anyway=break_anyway))
-            return result
-    try:
-        # 1st attempt: split at double blank lines (\n\n)
-        return cut(txt, must_break_at_empty_line=True)
-    except RuntimeError:
-        try:
-            # 2nd attempt: split at single blank lines (\n)
-            return cut(txt, must_break_at_empty_line=False)
-        except RuntimeError:
-            try:
-                # 3rd attempt: split at English full stops (.)
-                res = cut(txt.replace('.', '。\n'), must_break_at_empty_line=False) # the Chinese full stop is deliberate, it serves as a marker
-                return [r.replace('。\n', '.') for r in res]
-            except RuntimeError as e:
-                try:
-                    # 4th attempt: split at Chinese full stops (。)
-                    res = cut(txt.replace('。', '。。\n'), must_break_at_empty_line=False)
-                    return [r.replace('。。\n', '。') for r in res]
-                except RuntimeError as e:
-                    # 5th attempt: nothing else works, cut anywhere
-                    return cut(txt, must_break_at_empty_line=False, break_anyway=True)
-
-
 
 def read_and_clean_pdf_text(fp):
     """
@@ -631,7 +542,6 @@ def get_files_from_everything(txt, type): # type='.md'
-
 
 
     
 @Singleton
 class nougat_interface():
     def __init__(self):
crazy_functions/ipc_fns/mp.py (new file, 37 lines)
@@ -0,0 +1,37 @@
+import platform 
+import pickle
+import multiprocessing
+
+def run_in_subprocess_wrapper_func(v_args):
+    func, args, kwargs, return_dict, exception_dict = pickle.loads(v_args)
+    import sys
+    try:
+        result = func(*args, **kwargs)
+        return_dict['result'] = result
+    except Exception as e:
+        exc_info = sys.exc_info()
+        exception_dict['exception'] = exc_info
+
+def run_in_subprocess_with_timeout(func, timeout=60):
+    if platform.system() == 'Linux':
+        def wrapper(*args, **kwargs):
+            return_dict = multiprocessing.Manager().dict()
+            exception_dict = multiprocessing.Manager().dict()
+            v_args = pickle.dumps((func, args, kwargs, return_dict, exception_dict))
+            process = multiprocessing.Process(target=run_in_subprocess_wrapper_func, args=(v_args,))
+            process.start()
+            process.join(timeout)
+            if process.is_alive():
+                process.terminate()
+                raise TimeoutError(f'功能单元{str(func)}未能在规定时间内完成任务')
+            process.close()
+            if 'exception' in exception_dict:
+                # ooops, the subprocess ran into an exception
+                exc_info = exception_dict['exception']
+                raise exc_info[1].with_traceback(exc_info[2])
+            if 'result' in return_dict.keys():
+                # If the subprocess ran successfully, return the result
+                return return_dict['result']
+        return wrapper
+    else:
+        return func
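For context, a minimal usage sketch of the new wrapper (not part of the commit; slow_parse is a hypothetical worker): on Linux the wrapped call runs in a separate process and raises TimeoutError once the timeout elapses, while on other platforms the original function is returned unchanged.

# hedged sketch, assuming the module layout added above
from crazy_functions.ipc_fns.mp import run_in_subprocess_with_timeout

def slow_parse(text):
    # hypothetical worker that may hang on pathological input
    return text.upper()

safe_parse = run_in_subprocess_with_timeout(slow_parse, timeout=60)
result = safe_parse("some long document")  # on Linux, killed and raised as TimeoutError after 60 s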
@@ -175,7 +175,6 @@ class LatexPaperFileGroup():
         self.sp_file_contents = []
         self.sp_file_index = []
         self.sp_file_tag = []
-
         # count_token
         from request_llms.bridge_all import model_info
         enc = model_info["gpt-3.5-turbo"]['tokenizer']
@@ -192,13 +191,12 @@ class LatexPaperFileGroup():
                 self.sp_file_index.append(index)
                 self.sp_file_tag.append(self.file_paths[index])
             else:
-                from ..crazy_utils import breakdown_txt_to_satisfy_token_limit_for_pdf
-                segments = breakdown_txt_to_satisfy_token_limit_for_pdf(file_content, self.get_token_num, max_token_limit)
+                from crazy_functions.pdf_fns.breakdown_txt import breakdown_text_to_satisfy_token_limit
+                segments = breakdown_text_to_satisfy_token_limit(file_content, max_token_limit)
                 for j, segment in enumerate(segments):
                     self.sp_file_contents.append(segment)
                     self.sp_file_index.append(index)
                     self.sp_file_tag.append(self.file_paths[index] + f".part-{j}.tex")
         print('Segmentation: done')
 
     def merge_result(self):
         self.file_result = ["" for _ in range(len(self.file_paths))]
crazy_functions/pdf_fns/breakdown_txt.py (new file, 125 lines)
@@ -0,0 +1,125 @@
+from crazy_functions.ipc_fns.mp import run_in_subprocess_with_timeout
+
+def force_breakdown(txt, limit, get_token_fn):
+    """ When the text cannot be split at punctuation or blank lines, fall back to brute-force cutting
+    """
+    for i in reversed(range(len(txt))):
+        if get_token_fn(txt[:i]) < limit:
+            return txt[:i], txt[i:]
+    return "Tiktoken未知错误", "Tiktoken未知错误"
+
+
+def maintain_storage(remain_txt_to_cut, remain_txt_to_cut_storage):
+    """ To speed up the computation we use a special trick: when remain_txt_to_cut grows beyond `_max`, the text after _max is parked in remain_txt_to_cut_storage;
+    when remain_txt_to_cut falls below `_min`, part of remain_txt_to_cut_storage is moved back
+    """
+    _min = int(5e4)
+    _max = int(1e5)
+    # print(len(remain_txt_to_cut), len(remain_txt_to_cut_storage))
+    if len(remain_txt_to_cut) < _min and len(remain_txt_to_cut_storage) > 0:
+        remain_txt_to_cut = remain_txt_to_cut + remain_txt_to_cut_storage
+        remain_txt_to_cut_storage = ""
+    if len(remain_txt_to_cut) > _max:
+        remain_txt_to_cut_storage = remain_txt_to_cut[_max:] + remain_txt_to_cut_storage
+        remain_txt_to_cut = remain_txt_to_cut[:_max]
+    return remain_txt_to_cut, remain_txt_to_cut_storage
+
+
+def cut(limit, get_token_fn, txt_tocut, must_break_at_empty_line, break_anyway=False):
+    """ Text splitting
+    """
+    res = []
+    total_len = len(txt_tocut)
+    fin_len = 0
+    remain_txt_to_cut = txt_tocut
+    remain_txt_to_cut_storage = ""
+    # to speed up the computation, keep only a bounded window in remain_txt_to_cut and park the rest in remain_txt_to_cut_storage
+    remain_txt_to_cut, remain_txt_to_cut_storage = maintain_storage(remain_txt_to_cut, remain_txt_to_cut_storage)
+    
+    while True:
+        if get_token_fn(remain_txt_to_cut) <= limit:
+            # the remaining text is already within the token limit, no further cutting needed
+            res.append(remain_txt_to_cut); fin_len+=len(remain_txt_to_cut)
+            break
+        else:
+            # the remaining text exceeds the token limit, keep cutting
+            lines = remain_txt_to_cut.split('\n')
+
+            # estimate a cut position
+            estimated_line_cut = limit / get_token_fn(remain_txt_to_cut) * len(lines)
+            estimated_line_cut = int(estimated_line_cut)
+
+            # search for a suitable cut offset (cnt)
+            cnt = 0
+            for cnt in reversed(range(estimated_line_cut)):
+                if must_break_at_empty_line:
+                    # first try to break at a double blank line (\n\n)
+                    if lines[cnt] != "":
+                        continue
+                prev = "\n".join(lines[:cnt])
+                post = "\n".join(lines[cnt:])
+                if get_token_fn(prev) < limit:
+                    break
+
+            if cnt == 0:
+                # no suitable cut position was found
+                if break_anyway:
+                    # brute-force cutting is allowed
+                    prev, post = force_breakdown(txt_tocut, limit, get_token_fn)
+                else:
+                    # not allowed, raise an error instead
+                    raise RuntimeError(f"存在一行极长的文本!{txt_tocut}")
+
+            # append to the result list
+            res.append(prev); fin_len+=len(prev)
+            # prepare the next iteration
+            remain_txt_to_cut = post
+            remain_txt_to_cut, remain_txt_to_cut_storage = maintain_storage(remain_txt_to_cut, remain_txt_to_cut_storage)
+            process = fin_len/total_len
+            print(f'\r正在文本切分 {int(process*100)}%', end='')
+            if len(remain_txt_to_cut.strip()) == 0:
+                break
+    return res
+
+
+def breakdown_text_to_satisfy_token_limit_(txt, limit, llm_model="gpt-3.5-turbo"):
+    """ Try several strategies to split the text so that every fragment satisfies the token limit
+    """
+    from request_llms.bridge_all import model_info
+    enc = model_info[llm_model]['tokenizer']
+    def get_token_fn(txt): return len(enc.encode(txt, disallowed_special=()))
+    try:
+        # 1st attempt: split at double blank lines (\n\n)
+        return cut(limit, get_token_fn, txt, must_break_at_empty_line=True)
+    except RuntimeError:
+        try:
+            # 2nd attempt: split at single blank lines (\n)
+            return cut(limit, get_token_fn, txt, must_break_at_empty_line=False)
+        except RuntimeError:
+            try:
+                # 3rd attempt: split at English full stops (.)
+                res = cut(limit, get_token_fn, txt.replace('.', '。\n'), must_break_at_empty_line=False) # the Chinese full stop is deliberate, it serves as a marker
+                return [r.replace('。\n', '.') for r in res]
+            except RuntimeError as e:
+                try:
+                    # 4th attempt: split at Chinese full stops (。)
+                    res = cut(limit, get_token_fn, txt.replace('。', '。。\n'), must_break_at_empty_line=False)
+                    return [r.replace('。。\n', '。') for r in res]
+                except RuntimeError as e:
+                    # 5th attempt: nothing else works, cut anywhere
+                    return cut(limit, get_token_fn, txt, must_break_at_empty_line=False, break_anyway=True)
+
+breakdown_text_to_satisfy_token_limit = run_in_subprocess_with_timeout(breakdown_text_to_satisfy_token_limit_, timeout=60)
+
+if __name__ == '__main__':
+    from crazy_functions.crazy_utils import read_and_clean_pdf_text
+    file_content, page_one = read_and_clean_pdf_text("build/assets/at.pdf")
+
+    from request_llms.bridge_all import model_info
+    for i in range(5):
+        file_content += file_content
+
+    print(len(file_content))
+    TOKEN_LIMIT_PER_FRAGMENT = 2500
+    res = breakdown_text_to_satisfy_token_limit(file_content, TOKEN_LIMIT_PER_FRAGMENT)
+
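The new helper resolves the tokenizer from the model name itself and runs inside a subprocess with a 60-second timeout, so callers no longer build a get_token_fn. A hedged migration sketch (text and llm_kwargs follow the callers changed in the hunks below; not a verbatim excerpt):

# old call (removed in this commit):
# fragments = breakdown_txt_to_satisfy_token_limit_for_pdf(
#     txt=text, get_token_fn=get_token_num, limit=2500)

# new call, assuming llm_kwargs['llm_model'] is available as in the hunks below:
from crazy_functions.pdf_fns.breakdown_txt import breakdown_text_to_satisfy_token_limit
fragments = breakdown_text_to_satisfy_token_limit(txt=text, limit=2500, llm_model=llm_kwargs['llm_model'])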
@@ -74,7 +74,7 @@ def produce_report_markdown(gpt_response_collection, meta, paper_meta_info, chat
 
 def translate_pdf(article_dict, llm_kwargs, chatbot, fp, generated_conclusion_files, TOKEN_LIMIT_PER_FRAGMENT, DST_LANG):
     from crazy_functions.pdf_fns.report_gen_html import construct_html
-    from crazy_functions.crazy_utils import breakdown_txt_to_satisfy_token_limit_for_pdf
+    from crazy_functions.pdf_fns.breakdown_txt import breakdown_text_to_satisfy_token_limit
     from crazy_functions.crazy_utils import request_gpt_model_in_new_thread_with_ui_alive
     from crazy_functions.crazy_utils import request_gpt_model_multi_threads_with_very_awesome_ui_and_high_efficiency
 
@@ -116,7 +116,7 @@ def translate_pdf(article_dict, llm_kwargs, chatbot, fp, generated_conclusion_fi
             # find a smooth token limit to achieve even seperation
             count = int(math.ceil(raw_token_num / TOKEN_LIMIT_PER_FRAGMENT))
             token_limit_smooth = raw_token_num // count + count
-            return breakdown_txt_to_satisfy_token_limit_for_pdf(txt, get_token_fn=get_token_num, limit=token_limit_smooth)
+            return breakdown_text_to_satisfy_token_limit(txt, limit=token_limit_smooth, llm_model=llm_kwargs['llm_model'])
 
     for section in article_dict.get('sections'):
         if len(section['text']) == 0: continue
@@ -31,15 +31,11 @@ def 解析docx(file_manifest, project_folder, llm_kwargs, plugin_kwargs, chatbot
 
         print(file_content)
         # filenames inside private_upload are often garbled after unzipping (rar and 7z are fine), so only the article content is analysed and the filename is not passed in
-        from .crazy_utils import breakdown_txt_to_satisfy_token_limit_for_pdf
+        from crazy_functions.pdf_fns.breakdown_txt import breakdown_text_to_satisfy_token_limit
         from request_llms.bridge_all import model_info
         max_token = model_info[llm_kwargs['llm_model']]['max_token']
         TOKEN_LIMIT_PER_FRAGMENT = max_token * 3 // 4
-        paper_fragments = breakdown_txt_to_satisfy_token_limit_for_pdf(
-            txt=file_content,  
-            get_token_fn=model_info[llm_kwargs['llm_model']]['token_cnt'], 
-            limit=TOKEN_LIMIT_PER_FRAGMENT
-        )
+        paper_fragments = breakdown_text_to_satisfy_token_limit(txt=file_content, limit=TOKEN_LIMIT_PER_FRAGMENT, llm_model=llm_kwargs['llm_model'])
         this_paper_history = []
         for i, paper_frag in enumerate(paper_fragments):
             i_say = f'请对下面的文章片段用中文做概述,文件名是{os.path.relpath(fp, project_folder)},文章内容是 ```{paper_frag}```'
@@ -28,8 +28,8 @@ class PaperFileGroup():
                 self.sp_file_index.append(index)
                 self.sp_file_tag.append(self.file_paths[index])
             else:
-                from .crazy_utils import breakdown_txt_to_satisfy_token_limit_for_pdf
-                segments = breakdown_txt_to_satisfy_token_limit_for_pdf(file_content, self.get_token_num, max_token_limit)
+                from crazy_functions.pdf_fns.breakdown_txt import breakdown_text_to_satisfy_token_limit
+                segments = breakdown_text_to_satisfy_token_limit(file_content, max_token_limit)
                 for j, segment in enumerate(segments):
                     self.sp_file_contents.append(segment)
                     self.sp_file_index.append(index)
@@ -20,14 +20,9 @@ def 解析PDF(file_manifest, project_folder, llm_kwargs, plugin_kwargs, chatbot,
 
         TOKEN_LIMIT_PER_FRAGMENT = 2500
 
-        from .crazy_utils import breakdown_txt_to_satisfy_token_limit_for_pdf
-        from request_llms.bridge_all import model_info
-        enc = model_info["gpt-3.5-turbo"]['tokenizer']
-        def get_token_num(txt): return len(enc.encode(txt, disallowed_special=()))
-        paper_fragments = breakdown_txt_to_satisfy_token_limit_for_pdf(
-            txt=file_content,  get_token_fn=get_token_num, limit=TOKEN_LIMIT_PER_FRAGMENT)
-        page_one_fragments = breakdown_txt_to_satisfy_token_limit_for_pdf(
-            txt=str(page_one), get_token_fn=get_token_num, limit=TOKEN_LIMIT_PER_FRAGMENT//4)
+        from crazy_functions.pdf_fns.breakdown_txt import breakdown_text_to_satisfy_token_limit
+        paper_fragments = breakdown_text_to_satisfy_token_limit(txt=file_content,  limit=TOKEN_LIMIT_PER_FRAGMENT, llm_model=llm_kwargs['llm_model'])
+        page_one_fragments = breakdown_text_to_satisfy_token_limit(txt=str(page_one), limit=TOKEN_LIMIT_PER_FRAGMENT//4, llm_model=llm_kwargs['llm_model'])
         # to get better results, strip the part after the Introduction (if present)
         paper_meta = page_one_fragments[0].split('introduction')[0].split('Introduction')[0].split('INTRODUCTION')[0]
 
@@ -91,14 +91,9 @@ def 解析PDF(file_manifest, project_folder, llm_kwargs, plugin_kwargs, chatbot,
         page_one = str(page_one).encode('utf-8', 'ignore').decode()      # avoid reading non-utf8 chars
 
         # recursively split the PDF file
-        from .crazy_utils import breakdown_txt_to_satisfy_token_limit_for_pdf
-        from request_llms.bridge_all import model_info
-        enc = model_info["gpt-3.5-turbo"]['tokenizer']
-        def get_token_num(txt): return len(enc.encode(txt, disallowed_special=()))
-        paper_fragments = breakdown_txt_to_satisfy_token_limit_for_pdf(
-            txt=file_content,  get_token_fn=get_token_num, limit=TOKEN_LIMIT_PER_FRAGMENT)
-        page_one_fragments = breakdown_txt_to_satisfy_token_limit_for_pdf(
-            txt=page_one, get_token_fn=get_token_num, limit=TOKEN_LIMIT_PER_FRAGMENT//4)
+        from crazy_functions.pdf_fns.breakdown_txt import breakdown_text_to_satisfy_token_limit
+        paper_fragments = breakdown_text_to_satisfy_token_limit(txt=file_content, limit=TOKEN_LIMIT_PER_FRAGMENT, llm_model=llm_kwargs['llm_model'])
+        page_one_fragments = breakdown_text_to_satisfy_token_limit(txt=page_one, limit=TOKEN_LIMIT_PER_FRAGMENT//4, llm_model=llm_kwargs['llm_model'])
 
         # to get better results, strip the part after the Introduction (if present)
         paper_meta = page_one_fragments[0].split('introduction')[0].split('Introduction')[0].split('INTRODUCTION')[0]
@@ -18,14 +18,9 @@ def 解析PDF(file_name, llm_kwargs, plugin_kwargs, chatbot, history, system_pro
 
     TOKEN_LIMIT_PER_FRAGMENT = 2500
 
-    from .crazy_utils import breakdown_txt_to_satisfy_token_limit_for_pdf
-    from request_llms.bridge_all import model_info
-    enc = model_info["gpt-3.5-turbo"]['tokenizer']
-    def get_token_num(txt): return len(enc.encode(txt, disallowed_special=()))
-    paper_fragments = breakdown_txt_to_satisfy_token_limit_for_pdf(
-        txt=file_content,  get_token_fn=get_token_num, limit=TOKEN_LIMIT_PER_FRAGMENT)
-    page_one_fragments = breakdown_txt_to_satisfy_token_limit_for_pdf(
-        txt=str(page_one), get_token_fn=get_token_num, limit=TOKEN_LIMIT_PER_FRAGMENT//4)
+    from crazy_functions.pdf_fns.breakdown_txt import breakdown_text_to_satisfy_token_limit
+    paper_fragments = breakdown_text_to_satisfy_token_limit(txt=file_content, limit=TOKEN_LIMIT_PER_FRAGMENT, llm_model=llm_kwargs['llm_model'])
+    page_one_fragments = breakdown_text_to_satisfy_token_limit(txt=str(page_one), limit=TOKEN_LIMIT_PER_FRAGMENT//4, llm_model=llm_kwargs['llm_model'])
     # to get better results, strip the part after the Introduction (if present)
     paper_meta = page_one_fragments[0].split('introduction')[0].split('Introduction')[0].split('INTRODUCTION')[0]
 
@@ -45,7 +40,7 @@ def 解析PDF(file_name, llm_kwargs, plugin_kwargs, chatbot, history, system_pro
     for i in range(n_fragment):
         NUM_OF_WORD = MAX_WORD_TOTAL // n_fragment
         i_say = f"Read this section, recapitulate the content of this section with less than {NUM_OF_WORD} words: {paper_fragments[i]}"
-        i_say_show_user = f"[{i+1}/{n_fragment}] Read this section, recapitulate the content of this section with less than {NUM_OF_WORD} words: {paper_fragments[i][:200]}"
+        i_say_show_user = f"[{i+1}/{n_fragment}] Read this section, recapitulate the content of this section with less than {NUM_OF_WORD} words: {paper_fragments[i][:200]} ...."
         gpt_say = yield from request_gpt_model_in_new_thread_with_ui_alive(i_say, i_say_show_user,  # i_say = the actual question sent to chatgpt, i_say_show_user = the question shown to the user
                                                                            llm_kwargs, chatbot, 
                                                                            history=["The main idea of the previous section is?", last_iteration_result], # iterate on the previous result
@@ -12,13 +12,6 @@ class PaperFileGroup():
         self.sp_file_index = []
         self.sp_file_tag = []
 
-        # count_token
-        from request_llms.bridge_all import model_info
-        enc = model_info["gpt-3.5-turbo"]['tokenizer']
-        def get_token_num(txt): return len(
-            enc.encode(txt, disallowed_special=()))
-        self.get_token_num = get_token_num
-
     def run_file_split(self, max_token_limit=1900):
         """
         Split the long text into pieces
@@ -29,9 +22,8 @@ class PaperFileGroup():
                 self.sp_file_index.append(index)
                 self.sp_file_tag.append(self.file_paths[index])
             else:
-                from .crazy_utils import breakdown_txt_to_satisfy_token_limit_for_pdf
-                segments = breakdown_txt_to_satisfy_token_limit_for_pdf(
-                    file_content, self.get_token_num, max_token_limit)
+                from crazy_functions.pdf_fns.breakdown_txt import breakdown_text_to_satisfy_token_limit
+                segments = breakdown_text_to_satisfy_token_limit(file_content, max_token_limit)
                 for j, segment in enumerate(segments):
                     self.sp_file_contents.append(segment)
                     self.sp_file_index.append(index)