diff --git a/config.py b/config.py
index 6fff280..0d67983 100644
--- a/config.py
+++ b/config.py
@@ -62,8 +62,6 @@ AUTO_CLEAR_TXT = False
 # 色彩主体,可选: "Default", "Green"
 THEME = "Default"
 
-ENABLE_AUDIO = True
-
 # 加一个live2d装饰
 ADD_WAIFU = False
 
@@ -91,9 +89,13 @@ your bing cookies here
 SLACK_CLAUDE_BOT_ID = ''
 SLACK_CLAUDE_USER_TOKEN = ''
 
-
 # 如果需要使用AZURE 详情请见额外文档 docs\use_azure.md
 AZURE_ENDPOINT = "https://你的api名称.openai.azure.com/"
 AZURE_API_KEY = "填入azure openai api的密钥"
 AZURE_API_VERSION = "填入api版本"
 AZURE_ENGINE = "填入ENGINE"
+
+# Alibaba Cloud real-time speech recognition (involved setup, advanced users only), see https://help.aliyun.com/document_detail/450255.html
+ENABLE_AUDIO = True
+ALIYUN_TOKEN = ""   # e.g. f37f30e0f9934c34a992f6f64f7eba4f
+ALIYUN_APPKEY = ""  # e.g. RoPlZrM88DnAFkZK
diff --git a/crazy_functional.py b/crazy_functional.py
index 4677e01..84f73f3 100644
--- a/crazy_functional.py
+++ b/crazy_functional.py
@@ -396,7 +396,7 @@ def get_crazy_functions():
         function_plugins.update({
             "面试助手 [实时音频采集]": {
                 "Color": "stop",
-                "AsButton": False,
+                "AsButton": True,
                 "Function": HotReload(辅助面试)
             }
         })
diff --git a/crazy_functions/live_audio/aliyunASR.py b/crazy_functions/live_audio/aliyunASR.py
new file mode 100644
index 0000000..aa8ecc9
--- /dev/null
+++ b/crazy_functions/live_audio/aliyunASR.py
@@ -0,0 +1,82 @@
+import time, threading, json
+
+
+class AliyunASR():
+
+    def test_on_sentence_begin(self, message, *args):
+        print("test_on_sentence_begin:{}".format(message))
+
+    def test_on_sentence_end(self, message, *args):
+        print("test_on_sentence_end:{}".format(message))
+        message = json.loads(message)
+        self.parsed_sentence = message['payload']['result']
+        self.event_on_sentence_end.set()
+
+    def test_on_start(self, message, *args):
+        print("test_on_start:{}".format(message))
+
+    def test_on_error(self, message, *args):
+        print("on_error args=>{}".format(args))
+
+    def test_on_close(self, *args):
+        print("on_close: args=>{}".format(args))
+
+    def test_on_result_chg(self, message, *args):
+        print("test_on_chg:{}".format(message))
+        message = json.loads(message)
+        self.parsed_text = message['payload']['result']
+        self.event_on_result_chg.set()
+
+    def test_on_completed(self, message, *args):
+        print("on_completed:args=>{} message=>{}".format(args, message))
+
+
+    def audio_conversion_thread(self, uuid):
+        # capture audio in a background thread, resample it, and stream it to the recognizer
+        import nls  # pip install git+https://github.com/aliyun/alibabacloud-nls-python-sdk.git
+        import tempfile
+        from scipy import io
+        from toolbox import get_conf
+        from .audio_io import change_sample_rate
+        from .audio_io import RealtimeAudioDistribution
+        NEW_SAMPLERATE = 16000
+        rad = RealtimeAudioDistribution()
+        temp_folder = tempfile.gettempdir()
+        TOKEN, APPKEY = get_conf('ALIYUN_TOKEN', 'ALIYUN_APPKEY')
+
+        URL = "wss://nls-gateway.cn-shanghai.aliyuncs.com/ws/v1"
+        sr = nls.NlsSpeechTranscriber(
+                    url=URL,
+                    token=TOKEN,
+                    appkey=APPKEY,
+                    on_sentence_begin=self.test_on_sentence_begin,
+                    on_sentence_end=self.test_on_sentence_end,
+                    on_start=self.test_on_start,
+                    on_result_changed=self.test_on_result_chg,
+                    on_completed=self.test_on_completed,
+                    on_error=self.test_on_error,
+                    on_close=self.test_on_close,
+                    callback_args=[uuid.hex]
+                )
+
+        r = sr.start(aformat="pcm",
+                enable_intermediate_result=True,
+                enable_punctuation_prediction=True,
+                enable_inverse_text_normalization=True)
+
+        while not self.stop:
+            audio = rad.read(uuid.hex)
+            if audio is not None:
+                # resample to 16 kHz and dump to a temporary file
+                temp_file = f'{temp_folder}/{uuid.hex}.pcm'
+                dsdata = change_sample_rate(audio, rad.rate, NEW_SAMPLERATE)  # 48000 --> 16000
+                io.wavfile.write(temp_file, NEW_SAMPLERATE, dsdata)  # note: scipy prepends a RIFF/WAV header
+                # read the bytes back and stream them to the recognizer
+                with open(temp_file, "rb") as f: data = f.read()
+                print('audio len:', len(audio), '\t ds len:', len(dsdata), '\t frames to send:', len(data)//640)
+                slices = zip(*(iter(data),) * 640)  # 640 bytes per frame
+                for i in slices: sr.send_audio(bytes(i))
+            else:
+                time.sleep(0.1)
+        r = sr.stop()
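Note on the framing above: at 16 kHz, 16-bit mono PCM, 640 bytes is exactly 16000 × 2 × 0.02 = 20 ms of audio per frame. The `zip(*(iter(data),) * 640)` idiom groups a byte string into fixed-size frames and silently drops any trailing partial frame; a minimal standalone sketch (fake bytes, no SDK required):

```python
# Stand-in for the PCM bytes read from the temp file above (exactly 4 full frames).
data = bytes(2560)

# zip over 640 references to one shared iterator -> tuples of 640 ints each;
# a trailing partial frame (< 640 bytes) would be dropped.
frames = [bytes(chunk) for chunk in zip(*(iter(data),) * 640)]

assert len(frames) == 4 and all(len(f) == 640 for f in frames)
```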
diff --git a/crazy_functions/live_audio/audio_io.py b/crazy_functions/live_audio/audio_io.py
index f343b02..943bd52 100644
--- a/crazy_functions/live_audio/audio_io.py
+++ b/crazy_functions/live_audio/audio_io.py
@@ -1,4 +1,5 @@
 import numpy as np
+from scipy import interpolate
 
 def Singleton(cls):
     _instance = {}
@@ -15,12 +16,12 @@ def Singleton(cls):
 class RealtimeAudioDistribution():
     def __init__(self) -> None:
         self.data = {}
-        self.max_len = 1024*64
+        self.max_len = 1024*1024
         self.rate = 48000 # 只读,每秒采样数量
 
     def feed(self, uuid, audio):
-        print('feed')
         self.rate, audio_ = audio
+        print('feed', len(audio_), audio_[-25:])
         if uuid not in self.data:
             self.data[uuid] = audio_
         else:
@@ -31,7 +32,17 @@ class RealtimeAudioDistribution():
     def read(self, uuid):
         if uuid in self.data:
             res = self.data.pop(uuid)
-            print('read', len(res))
+            print('read', len(res), res)
         else:
             res = None
-        return res
\ No newline at end of file
+        return res
+
+def change_sample_rate(audio, old_sr, new_sr):
+    duration = audio.shape[0] / old_sr
+
+    time_old = np.linspace(0, duration, audio.shape[0])
+    time_new = np.linspace(0, duration, int(audio.shape[0] * new_sr / old_sr))
+
+    interpolator = interpolate.interp1d(time_old, audio.T)
+    new_audio = interpolator(time_new).T
+    return new_audio.astype(np.int16)
\ No newline at end of file
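`change_sample_rate` resamples by pure linear interpolation; there is no anti-aliasing low-pass, so content above the new Nyquist frequency (8 kHz here) will alias, which is usually tolerable for speech. A quick self-check sketch, assuming the module is importable under the path shown in this diff:

```python
import numpy as np
from crazy_functions.live_audio.audio_io import change_sample_rate

# One second of a 440 Hz tone at 48 kHz, int16 -- the format RealtimeAudioDistribution feeds.
t = np.linspace(0, 1, 48000)
tone_48k = (np.sin(2 * np.pi * 440 * t) * 32767).astype(np.int16)

tone_16k = change_sample_rate(tone_48k, old_sr=48000, new_sr=16000)
assert tone_16k.dtype == np.int16     # int16 in, int16 out
assert tone_16k.shape[0] == 16000     # 48000 * 16000 / 48000 samples
```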
diff --git a/crazy_functions/辅助面试.py b/crazy_functions/辅助面试.py
index 54ea010..9a70987 100644
--- a/crazy_functions/辅助面试.py
+++ b/crazy_functions/辅助面试.py
@@ -1,22 +1,52 @@
 from toolbox import update_ui
-from toolbox import CatchException, report_execption, write_results_to_file
+from toolbox import CatchException, get_conf, write_results_to_file
 from crazy_functions.crazy_utils import request_gpt_model_in_new_thread_with_ui_alive
+from request_llm.bridge_all import predict_no_ui_long_connection
 import threading, time
 import numpy as np
+from .live_audio.aliyunASR import AliyunASR
+import json
 
-def take_audio_sentence_flagment(captured_audio):
-    """
-    判断音频是否到达句尾,如果到了,截取片段
-    """
-    ready_part = None
-    other_part = captured_audio
-    return ready_part, other_part
 
-class InterviewAssistent():
+
+class AsyncGptTask():
+    def __init__(self) -> None:
+        self.observe_future = []
+        self.observe_future_chatbot_index = []
+
+    def gpt_thread_worker(self, i_say, llm_kwargs, history, sys_prompt, observe_window, index):
+        try:
+            # predict_no_ui_long_connection streams the partial reply into observe_window[index][0]
+            gpt_say_partial = predict_no_ui_long_connection(inputs=i_say, llm_kwargs=llm_kwargs, history=[], sys_prompt=sys_prompt, observe_window=observe_window[index])
+        except ConnectionAbortedError as token_exceed_err:
+            print('至少一个线程任务Token溢出而失败', token_exceed_err)
+        except Exception as e:
+            print('至少一个线程任务意外失败', e)
+
+    def add_async_gpt_task(self, i_say, chatbot_index, llm_kwargs, history, system_prompt):
+        self.observe_future.append([""])
+        self.observe_future_chatbot_index.append(chatbot_index)
+        cur_index = len(self.observe_future)-1
+        th_new = threading.Thread(target=self.gpt_thread_worker, args=(i_say, llm_kwargs, history, system_prompt, self.observe_future, cur_index))
+        th_new.daemon = True
+        th_new.start()
+
+    def update_chatbot(self, chatbot):
+        # copy each worker's newest partial reply into its chatbot row; on a stale index, drop all observers
+        for of, ofci in zip(self.observe_future, self.observe_future_chatbot_index):
+            try:
+                chatbot[ofci] = list(chatbot[ofci])
+                chatbot[ofci][1] = of[0]
+            except:
+                self.observe_future = []
+                self.observe_future_chatbot_index = []
+        return chatbot
+
+class InterviewAssistant(AliyunASR):
     def __init__(self):
-        self.capture_interval = 1.0 # second
+        self.capture_interval = 0.5  # second
         self.stop = False
-        pass
+        self.parsed_text = ""
+        self.event_on_result_chg = threading.Event()
+        self.event_on_sentence_end = threading.Event()
 
     def init(self, chatbot):
         # 初始化音频采集线程
@@ -24,64 +54,76 @@
         self.keep_latest_n_second = 10
         self.ready_audio_flagment = None
         self.stop = False
-        th1 = threading.Thread(target=self.audio_capture_thread, args=(chatbot._cookies['uuid'],))
+        th1 = threading.Thread(target=self.audio_conversion_thread, args=(chatbot._cookies['uuid'],))
         th1.daemon = True
         th1.start()
-        th2 = threading.Thread(target=self.audio2txt_thread, args=(chatbot._cookies['uuid'],))
-        th2.daemon = True
-        th2.start()
-
-    def audio_capture_thread(self, uuid):
-        # 在一个异步线程中采集音频
-        from .live_audio.audio_io import RealtimeAudioDistribution
-        rad = RealtimeAudioDistribution()
-        while not self.stop:
-            time.sleep(self.capture_interval)
-            self.captured_audio = np.concatenate((self.captured_audio, rad.read(uuid.hex)))
-            if len(self.captured_audio) > self.keep_latest_n_second * rad.rate:
-                self.captured_audio = self.captured_audio[-self.keep_latest_n_second * rad.rate:]
-
-    def audio2txt_thread(self, llm_kwargs):
-        import whisper
-        # 在一个异步线程中音频转文字
-        while not self.stop:
-            time.sleep(1)
-            if len(self.captured_audio) > 0:
-                model = whisper.load_model("base")
-                result = model.transcribe("audio.mp3", language='Chinese')
+        # th2 = threading.Thread(target=self.audio2txt_thread, args=(chatbot._cookies['uuid'],))
+        # th2.daemon = True
+        # th2.start()
 
     def gpt_answer(self, text, chatbot, history, llm_kwargs):
         i_say = inputs_show_user = text
         gpt_say = yield from request_gpt_model_in_new_thread_with_ui_alive(
             inputs=i_say, inputs_show_user=inputs_show_user,
             llm_kwargs=llm_kwargs, chatbot=chatbot, history=history,
-            sys_prompt="你是求职者,正在参加面试,请回答问题。"
+            sys_prompt="请回答问题。"  # previously: "你是求职者,正在参加面试,请回答问题。"
         )
         yield from update_ui(chatbot=chatbot, history=history) # 刷新界面
         history.extend([i_say, gpt_say])
 
-    def begin(self, llm_kwargs, plugin_kwargs, chatbot, history):
-        # 面试插件主函数
+    def begin(self, llm_kwargs, plugin_kwargs, chatbot, history, system_prompt):
+        # main plugin function
         self.init(chatbot)
+        chatbot.append(["", ""])
+        yield from update_ui(chatbot=chatbot, history=history)  # refresh UI
+        self.agt = AsyncGptTask()
+
         while True:
-            time.sleep(self.capture_interval)
-            if self.ready_audio_flagment:
-                audio_for_whisper = self.ready_audio_flagment
-                text = self.audio2txt(audio_for_whisper, llm_kwargs)
-                yield from self.gpt_answer(text, chatbot, history, llm_kwargs)
-                self.ready_audio_flagment = None
+            self.event_on_result_chg.wait(timeout=0.25)  # wake at least once every 0.25 s
+            chatbot = self.agt.update_chatbot(chatbot)   # write gpt results from the worker threads into the chatbot
+            yield from update_ui(chatbot=chatbot, history=history)  # refresh UI
+
+            if self.event_on_result_chg.is_set():
+                # update the intermediate audio decode result
+                self.event_on_result_chg.clear()
+                chatbot[-1] = list(chatbot[-1])
+                chatbot[-1][0] = self.parsed_text
+                yield from update_ui(chatbot=chatbot, history=history)  # refresh UI
+
+            if self.event_on_sentence_end.is_set():
+                # a sentence has ended: freeze it and hand it to gpt
+                self.event_on_sentence_end.clear()
+                chatbot[-1] = [self.parsed_sentence, "[waiting gpt reply]"]
+                yield from update_ui(chatbot=chatbot, history=history)  # refresh UI
+                # spawn a worker thread for the gpt request so this loop never blocks
+                self.agt.add_async_gpt_task(self.parsed_sentence, len(chatbot)-1, llm_kwargs, history, system_prompt)
+                chatbot.append(["", ""])
+                yield from update_ui(chatbot=chatbot, history=history)  # refresh UI
+
 
 @CatchException
 def 辅助面试(txt, llm_kwargs, plugin_kwargs, chatbot, history, system_prompt, web_port):
-    # pip install -U openai-whisper
-    chatbot.append(["函数插件功能:辅助面试", "正在预热本地音频转文字模型 ..."])
+    chatbot.append(["函数插件功能:辅助面试", "辅助面试助手, 正在监听音频 ..."])
     yield from update_ui(chatbot=chatbot, history=history) # 刷新界面
 
-    import whisper
-    whisper.load_model("base")
-    chatbot.append(["预热本地音频转文字模型完成", "辅助面试助手, 正在监听音频 ..."])
+    # try to import the extra dependencies; if missing, suggest how to install them
+    try:
+        import nls
+        from scipy import io
+    except:
+        chatbot.append(["导入依赖失败", "使用该模块需要额外依赖, 安装方法:```pip install scipy git+https://github.com/aliyun/alibabacloud-nls-python-sdk.git```"])
+        yield from update_ui(chatbot=chatbot, history=history) # 刷新界面
+        return
+
+    TOKEN, APPKEY = get_conf('ALIYUN_TOKEN', 'ALIYUN_APPKEY')
+    if TOKEN == "" or APPKEY == "":
+        chatbot.append(["缺少配置", "没有阿里云语音识别APPKEY和TOKEN, 详情见https://help.aliyun.com/document_detail/450255.html"])
+        yield from update_ui(chatbot=chatbot, history=history) # 刷新界面
+        return
+
    yield from update_ui(chatbot=chatbot, history=history) # 刷新界面
 
-    ia = InterviewAssistent()
-    yield from ia.begin(llm_kwargs, plugin_kwargs, chatbot, history)
+    ia = InterviewAssistant()
+    yield from ia.begin(llm_kwargs, plugin_kwargs, chatbot, history, system_prompt)
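The `begin()` loop above leans on a `threading.Event` wait/clear pattern: `wait(timeout=0.25)` returns either when a callback fires `set()` or after 0.25 s, so the UI refresh keeps a steady cadence, and `is_set()` plus `clear()` distinguishes the two wake-up reasons. A standalone sketch of just that pattern (a toy timer stands in for the ASR callback):

```python
import threading

evt = threading.Event()
threading.Timer(0.3, evt.set).start()  # toy stand-in for on_result_changed

for _ in range(4):
    evt.wait(timeout=0.25)  # returns on set() or after 0.25 s, whichever comes first
    if evt.is_set():
        evt.clear()         # consume the notification
        print("result changed -> rewrite the transcript row")
    else:
        print("timed out -> just refresh the UI")
```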
diff --git a/toolbox.py b/toolbox.py
index 8e89100..256d99c 100644
--- a/toolbox.py
+++ b/toolbox.py
@@ -40,7 +40,6 @@ def ArgsGeneralWrapper(f):
     """
     装饰器函数,用于重组输入参数,改变输入参数的顺序与结构。
     """
-    ENABLE_AUDIO, = get_conf('ENABLE_AUDIO')
     def decorated(cookies, max_length, llm_model, txt, txt2, top_p, temperature, chatbot, history, system_prompt, plugin_advanced_arg, *args):
         txt_passon = txt
         if txt == "" and txt2 != "": txt_passon = txt2
@@ -59,7 +58,6 @@ def ArgsGeneralWrapper(f):
         plugin_kwargs = {
             "advanced_arg": plugin_advanced_arg,
         }
-        if ENABLE_AUDIO: plugin_kwargs.update({'audio': args[0]})
         chatbot_with_cookie = ChatBotWithCookies(cookies)
         chatbot_with_cookie.write_list(chatbot)
         yield from f(txt_passon, llm_kwargs, plugin_kwargs, chatbot_with_cookie, history, system_prompt, *args)
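For reference, the two new config keys are read through `toolbox.get_conf`, which returns one value per requested key (the same call the plugin's guard uses); a minimal fail-fast check in that style:

```python
from toolbox import get_conf

TOKEN, APPKEY = get_conf('ALIYUN_TOKEN', 'ALIYUN_APPKEY')
if TOKEN == "" or APPKEY == "":
    raise RuntimeError("Fill ALIYUN_TOKEN and ALIYUN_APPKEY in config.py, "
                       "see https://help.aliyun.com/document_detail/450255.html")
```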