def extract_code_block_carefully(txt):
    """Return the text between the first and the last ``` fence in *txt*.

    If *txt* contains fewer than two fences it is returned unchanged.
    Fences that appear between the first and the last one are kept in the
    returned text.
    """
    segments = txt.split('```')
    fence_count = len(segments) - 1
    if fence_count <= 1:
        return txt
    # Drop whatever precedes the first fence and whatever follows the last.
    # NOTE(review): anything written directly after the opening fence (for
    # example a language tag such as "python") stays in the result -- confirm
    # that callers expect this.
    return '```'.join(segments[1:-1])


def breakdown_txt_to_satisfy_token_limit(txt, limit, must_break_at_empty_line=True,
                                         *, get_token_fn=None):
    """Split *txt* at line breaks into chunks of at most *limit* tokens.

    Joining the returned chunks with a newline reproduces *txt*.

    Args:
        txt: The text to split.
        limit: Maximum number of tokens allowed per chunk.
        must_break_at_empty_line: Kept for interface compatibility; the value
            is not consulted. The function always tries to cut at empty lines
            first and, if that fails, retries allowing a cut at any line.
        get_token_fn: Optional callable mapping a string to its token count.
            When omitted, the GPT-2 tokenizer from ``transformers`` is loaded
            and used.

    Returns:
        A list of strings, each measuring at most *limit* tokens.

    Raises:
        RuntimeError: If no admissible line break exists, e.g. a single line
            is longer than *limit* tokens.
    """
    if get_token_fn is None:
        # Imported lazily so that the heavy dependency is only required when
        # the caller does not supply a counter.
        from transformers import GPT2TokenizerFast
        tokenizer = GPT2TokenizerFast.from_pretrained("gpt2")

        def count_tokens(text):
            return len(tokenizer(text)["input_ids"])
    else:
        count_tokens = get_token_fn

    def split(text, need_empty_line):
        """Cut chunks off the front of *text* until the rest fits."""
        chunks = []
        remaining = text
        while True:
            total = count_tokens(remaining)
            if total <= limit:
                break
            lines = remaining.split('\n')
            # Guess the cut position from the average token density per line,
            # then walk backwards to the nearest admissible line break.
            estimated_line_cut = int(limit / total * len(lines))
            cut_at = 0
            # Index 0 is never tried: cutting there would produce an empty
            # chunk and make no progress.
            for idx in range(estimated_line_cut - 1, 0, -1):
                if need_empty_line and lines[idx] != "":
                    continue
                if count_tokens("\n".join(lines[:idx])) < limit:
                    cut_at = idx
                    break
            if cut_at == 0:
                # Also reached when the estimate itself is 0, so the failure
                # is always reported as RuntimeError.
                raise RuntimeError("存在一行极长的文本!")
            chunks.append("\n".join(lines[:cut_at]))
            remaining = "\n".join(lines[cut_at:])
        chunks.append(remaining)
        return chunks

    try:
        return split(txt, need_empty_line=True)
    except RuntimeError:
        # No empty line was usable as a boundary: restart from scratch and
        # allow a cut at any line break.
        return split(txt, need_empty_line=False)
break_txt_into_half_at_some_linebreak(txt): + lines = txt.split('\n') + n_lines = len(lines) + pre = lines[:(n_lines//2)] + post = lines[(n_lines//2):] + return "\n".join(pre), "\n".join(post) @CatchException def 全项目切换英文(txt, top_p, temperature, chatbot, history, sys_prompt, WEB_PORT): - history = [] # 清空历史,以免输入溢出 - # 集合文件 - import time, glob, os + # 第1步:清空历史,以免输入溢出 + history = [] + + # 第2步:尝试导入依赖,如果缺少依赖,则给出安装建议 + try: + import openai, transformers + except: + report_execption(chatbot, history, + a = f"解析项目: {txt}", + b = f"导入软件依赖失败。使用该模块需要额外依赖,安装方法```pip install --upgrade openai transformers```。") + yield chatbot, history, '正常' + return + + # 第3步:集合文件 + import time, glob, os, shutil, re, openai os.makedirs('gpt_log/generated_english_version', exist_ok=True) os.makedirs('gpt_log/generated_english_version/crazy_functions', exist_ok=True) file_manifest = [f for f in glob.glob('./*.py') if ('test_project' not in f) and ('gpt_log' not in f)] + \ [f for f in glob.glob('./crazy_functions/*.py') if ('test_project' not in f) and ('gpt_log' not in f)] + # file_manifest = ['./toolbox.py'] i_say_show_user_buffer = [] - # 随便显示点什么防止卡顿的感觉 + # 第4步:随便显示点什么防止卡顿的感觉 for index, fp in enumerate(file_manifest): # if 'test_project' in fp: continue with open(fp, 'r', encoding='utf-8') as f: file_content = f.read() - i_say_show_user =f'[{index}/{len(file_manifest)}] 接下来请将以下代码中包含的所有中文转化为英文,只输出代码: {os.path.abspath(fp)}' + i_say_show_user =f'[{index}/{len(file_manifest)}] 接下来请将以下代码中包含的所有中文转化为英文,只输出转化后的英文代码,请用代码块输出代码: {os.path.abspath(fp)}' i_say_show_user_buffer.append(i_say_show_user) chatbot.append((i_say_show_user, "[Local Message] 等待多线程操作,中间过程不予显示.")) yield chatbot, history, '正常' - # 任务函数 + + # 第5步:Token限制下的截断与处理 + MAX_TOKEN = 2500 + # from transformers import GPT2TokenizerFast + # print('加载tokenizer中') + # tokenizer = GPT2TokenizerFast.from_pretrained("gpt2") + # get_token_cnt = lambda txt: len(tokenizer(txt)["input_ids"]) + # print('加载tokenizer结束') + + + # 第6步:任务函数 mutable_return = 
[None for _ in file_manifest] + observe_window = [[""] for _ in file_manifest] def thread_worker(fp,index): + if index > 10: + time.sleep(60) + print('Openai 限制免费用户每分钟20次请求,降低请求频率中。') with open(fp, 'r', encoding='utf-8') as f: file_content = f.read() - i_say = f'接下来请将以下代码中包含的所有中文转化为英文,只输出代码,文件名是{fp},文件代码是 ```{file_content}```' - # ** gpt request ** - gpt_say = predict_no_ui_long_connection(inputs=i_say, top_p=top_p, temperature=temperature, history=history, sys_prompt=sys_prompt) - mutable_return[index] = gpt_say + i_say_template = lambda fp, file_content: f'接下来请将以下代码中包含的所有中文转化为英文,只输出代码,文件名是{fp},文件代码是 ```{file_content}```' + try: + gpt_say = "" + # 分解代码文件 + file_content_breakdown = breakdown_txt_to_satisfy_token_limit(file_content, MAX_TOKEN) + for file_content_partial in file_content_breakdown: + i_say = i_say_template(fp, file_content_partial) + # # ** gpt request ** + gpt_say_partial = predict_no_ui_long_connection(inputs=i_say, top_p=top_p, temperature=temperature, history=[], sys_prompt=sys_prompt, observe_window=observe_window[index]) + gpt_say_partial = extract_code_block_carefully(gpt_say_partial) + gpt_say += gpt_say_partial + mutable_return[index] = gpt_say + except ConnectionAbortedError as token_exceed_err: + print('至少一个线程任务Token溢出而失败', e) + except Exception as e: + print('至少一个线程任务意外失败', e) - # 所有线程同时开始执行任务函数 + # 第7步:所有线程同时开始执行任务函数 handles = [threading.Thread(target=thread_worker, args=(fp,index)) for index, fp in enumerate(file_manifest)] for h in handles: h.daemon = True @@ -43,19 +128,23 @@ def 全项目切换英文(txt, top_p, temperature, chatbot, history, sys_prompt, chatbot.append(('开始了吗?', f'多线程操作已经开始')) yield chatbot, history, '正常' - # 循环轮询各个线程是否执行完毕 + # 第8步:循环轮询各个线程是否执行完毕 cnt = 0 while True: - time.sleep(1) + cnt += 1 + time.sleep(0.2) th_alive = [h.is_alive() for h in handles] if not any(th_alive): break - stat = ['执行中' if alive else '已完成' for alive in th_alive] - stat_str = '|'.join(stat) - cnt += 1 - chatbot[-1] = (chatbot[-1][0], f'多线程操作已经开始,完成情况: 
{stat_str}' + ''.join(['.']*(cnt%4))) + # 更好的UI视觉效果 + observe_win = [] + for thread_index, alive in enumerate(th_alive): + observe_win.append("[ ..."+observe_window[thread_index][0][-60:].replace('\n','').replace('```','...').replace(' ','.').replace('
','.....').replace('$','.')+"... ]") + stat = [f'执行中: {obs}\n\n' if alive else '已完成\n\n' for alive, obs in zip(th_alive, observe_win)] + stat_str = ''.join(stat) + chatbot[-1] = (chatbot[-1][0], f'多线程操作已经开始,完成情况: \n\n{stat_str}' + ''.join(['.']*(cnt%10+1))) yield chatbot, history, '正常' - # 把结果写入文件 + # 第9步:把结果写入文件 for index, h in enumerate(handles): h.join() # 这里其实不需要join了,肯定已经都结束了 fp = file_manifest[index] @@ -63,13 +152,17 @@ def 全项目切换英文(txt, top_p, temperature, chatbot, history, sys_prompt, i_say_show_user = i_say_show_user_buffer[index] where_to_relocate = f'gpt_log/generated_english_version/{fp}' - with open(where_to_relocate, 'w+', encoding='utf-8') as f: f.write(gpt_say.lstrip('```').rstrip('```')) + if gpt_say is not None: + with open(where_to_relocate, 'w+', encoding='utf-8') as f: + f.write(gpt_say) + else: # 失败 + shutil.copyfile(file_manifest[index], where_to_relocate) chatbot.append((i_say_show_user, f'[Local Message] 已完成{os.path.abspath(fp)}的转化,\n\n存入{os.path.abspath(where_to_relocate)}')) history.append(i_say_show_user); history.append(gpt_say) yield chatbot, history, '正常' time.sleep(1) - # 备份一个文件 + # 第10步:备份一个文件 res = write_results_to_file(history) chatbot.append(("生成一份任务执行报告", res)) yield chatbot, history, '正常' diff --git a/predict.py b/predict.py index f4c87cc..2a1ef4d 100644 --- a/predict.py +++ b/predict.py @@ -71,9 +71,10 @@ def predict_no_ui(inputs, top_p, temperature, history=[], sys_prompt=""): raise ConnectionAbortedError("Json解析不合常规,可能是文本过长" + response.text) -def predict_no_ui_long_connection(inputs, top_p, temperature, history=[], sys_prompt=""): +def predict_no_ui_long_connection(inputs, top_p, temperature, history=[], sys_prompt="", observe_window=None): """ 发送至chatGPT,等待回复,一次性完成,不显示中间过程。但内部用stream的方法避免有人中途掐网线。 + observe_window:用于负责跨越线程传递已经输出的部分,大部分时候仅仅为了fancy的视觉效果,留空即可 """ headers, payload = generate_payload(inputs, top_p, temperature, history, system_prompt=sys_prompt, stream=True) @@ -105,7 +106,10 @@ def 
predict_no_ui_long_connection(inputs, top_p, temperature, history=[], sys_pr delta = json_data["delta"] if len(delta) == 0: break if "role" in delta: continue - if "content" in delta: result += delta["content"]; print(delta["content"], end='') + if "content" in delta: + result += delta["content"] + print(delta["content"], end='') + if observe_window is not None: observe_window[0] += delta["content"] else: raise RuntimeError("意外Json结构:"+delta) if json_data['finish_reason'] == 'length': raise ConnectionAbortedError("正常结束,但显示Token不足。") diff --git a/requirements.txt b/requirements.txt index d71b498..bdafbe3 100644 --- a/requirements.txt +++ b/requirements.txt @@ -3,3 +3,5 @@ requests[socks] mdtex2html Markdown latex2mathml +openai +transformers