From a330d6636ebd42283997a1ebef8eb2a3da6460ad Mon Sep 17 00:00:00 2001
From: 505030475
Date: Sun, 2 Jul 2023 22:54:05 +0800
Subject: [PATCH] error

---
 crazy_functional.py                     |  2 +-
 crazy_functions/live_audio/aliyunASR.py | 81 +++++++++++++++++++++++++
 crazy_functions/live_audio/audio_io.py  | 19 ++++--
 crazy_functions/辅助面试.py             | 62 ++++++-------------
 main.py                                 |  2 +-
 5 files changed, 115 insertions(+), 51 deletions(-)
 create mode 100644 crazy_functions/live_audio/aliyunASR.py

diff --git a/crazy_functional.py b/crazy_functional.py
index 4677e01..84f73f3 100644
--- a/crazy_functional.py
+++ b/crazy_functional.py
@@ -396,7 +396,7 @@ def get_crazy_functions():
         function_plugins.update({
             "面试助手 [实时音频采集]": {
                 "Color": "stop",
-                "AsButton": False,
+                "AsButton": True,
                 "Function": HotReload(辅助面试)
             }
         })
diff --git a/crazy_functions/live_audio/aliyunASR.py b/crazy_functions/live_audio/aliyunASR.py
new file mode 100644
index 0000000..ec34ac0
--- /dev/null
+++ b/crazy_functions/live_audio/aliyunASR.py
@@ -0,0 +1,81 @@
+import time, threading
+
+
+class AliyunASR():
+    def __init__(self):
+        self.event_on_result_chg = threading.Event()
+        self.event_on_sentence_end = threading.Event()
+
+    def test_on_sentence_begin(self, message, *args):
+        print("test_on_sentence_begin:{}".format(message))
+
+    def test_on_sentence_end(self, message, *args):
+        print("test_on_sentence_end:{}".format(message))
+        self.event_on_sentence_end.set()
+
+    def test_on_start(self, message, *args):
+        print("test_on_start:{}".format(message))
+
+    def test_on_error(self, message, *args):
+        print("on_error args=>{}".format(args))
+
+    def test_on_close(self, *args):
+        print("on_close: args=>{}".format(args))
+
+    def test_on_result_chg(self, message, *args):
+        print("test_on_chg:{}".format(message))
+        self.parsed_text = message['payload']['result']
+        self.event_on_result_chg.set()
+
+    def test_on_completed(self, message, *args):
+        print("on_completed:args=>{} message=>{}".format(args, message))
+
+    def audio_conversion_thread(self, uuid):
+        # capture and convert audio in an asynchronous thread
+        import nls  # pip install git+https://github.com/aliyun/alibabacloud-nls-python-sdk.git
+        from scipy import io
+        from .audio_io import change_sample_rate
+        NEW_SAMPLERATE = 16000
+        from .audio_io import RealtimeAudioDistribution
+        rad = RealtimeAudioDistribution()
+        import tempfile
+        temp_folder = tempfile.gettempdir()
+
+        URL="wss://nls-gateway.cn-shanghai.aliyuncs.com/ws/v1"
+        TOKEN="f37f30e0f9934c34a992f6f64f7eba4f"  # see https://help.aliyun.com/document_detail/450255.html for how to obtain a token
+        APPKEY="RoPlZrM88DnAFkZK"  # get an Appkey from the console: https://nls-portal.console.aliyun.com/applist
+        sr = nls.NlsSpeechTranscriber(
+                    url=URL,
+                    token=TOKEN,
+                    appkey=APPKEY,
+                    on_sentence_begin=self.test_on_sentence_begin,
+                    on_sentence_end=self.test_on_sentence_end,
+                    on_start=self.test_on_start,
+                    on_result_changed=self.test_on_result_chg,
+                    on_completed=self.test_on_completed,
+                    on_error=self.test_on_error,
+                    on_close=self.test_on_close,
+                    callback_args=[uuid.hex]
+                )
+
+        r = sr.start(aformat="pcm",
+                     enable_intermediate_result=True,
+                     enable_punctuation_prediction=True,
+                     enable_inverse_text_normalization=True)
+
+        while not self.stop:
+            # time.sleep(self.capture_interval)
+            audio = rad.read(uuid.hex)
+            if audio is not None:
+                # convert the captured block to a pcm file
+                temp_file = f'{temp_folder}/{uuid.hex}.pcm'
+                dsdata = change_sample_rate(audio, rad.rate, NEW_SAMPLERATE)  # 48000 --> 16000
+                io.wavfile.write(temp_file, NEW_SAMPLERATE, dsdata)
+                # read the pcm binary back
+                with open(temp_file, "rb") as f: data = f.read()
+                print('audio len:', len(audio), '\t ds len:', len(dsdata), '\t need n send:', len(data)//640)
+                slices = zip(*(iter(data),) * 640)  # groups of 640 bytes each
+                for i in slices: sr.send_audio(bytes(i))
+            else:
+                time.sleep(0.1)
+        r = sr.stop()
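Two details worth knowing about the send loop above: 640 bytes of 16-bit mono PCM at 16 kHz is exactly 320 samples, i.e. one 20 ms frame, which is why the payload is sliced into 640-byte groups. Also, scipy's `io.wavfile.write` emits a RIFF/WAV header even when the target file is named `.pcm`, so the bytes read back include a ~44-byte header that gets sent to the recognizer as if it were audio. A minimal header-free sketch (the helper `pcm_chunks` is hypothetical, not part of this patch):

    import numpy as np

    def pcm_chunks(samples, chunk_bytes=640):
        # serialize int16 samples to raw little-endian PCM (no RIFF header),
        # then yield fixed 640-byte frames (320 samples = 20 ms at 16 kHz);
        # a tail shorter than one frame is dropped, like the zip() trick above
        raw = np.asarray(samples, dtype='<i2').tobytes()
        for i in range(0, len(raw) - len(raw) % chunk_bytes, chunk_bytes):
            yield raw[i:i + chunk_bytes]

    # usage inside the while-loop, replacing the temp-file round trip:
    #   for frame in pcm_chunks(dsdata): sr.send_audio(frame)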
diff --git a/crazy_functions/live_audio/audio_io.py b/crazy_functions/live_audio/audio_io.py
index f343b02..943bd52 100644
--- a/crazy_functions/live_audio/audio_io.py
+++ b/crazy_functions/live_audio/audio_io.py
@@ -1,4 +1,5 @@
 import numpy as np
+from scipy import interpolate
 
 def Singleton(cls):
     _instance = {}
@@ -15,12 +16,12 @@ def Singleton(cls):
 class RealtimeAudioDistribution():
     def __init__(self) -> None:
         self.data = {}
-        self.max_len = 1024*64
+        self.max_len = 1024*1024
         self.rate = 48000 # 只读,每秒采样数量
 
     def feed(self, uuid, audio):
-        print('feed')
         self.rate, audio_ = audio
+        print('feed', len(audio_), audio_[-25:])
         if uuid not in self.data:
             self.data[uuid] = audio_
         else:
@@ -31,7 +32,17 @@ class RealtimeAudioDistribution():
     def read(self, uuid):
         if uuid in self.data:
             res = self.data.pop(uuid)
-            print('read', len(res))
+            print('read', len(res), res)
         else:
             res = None
-        return res
\ No newline at end of file
+        return res
+
+def change_sample_rate(audio, old_sr, new_sr):
+    duration = audio.shape[0] / old_sr
+
+    time_old = np.linspace(0, duration, audio.shape[0])
+    time_new = np.linspace(0, duration, int(audio.shape[0] * new_sr / old_sr))
+
+    interpolator = interpolate.interp1d(time_old, audio.T)
+    new_audio = interpolator(time_new).T
+    return new_audio.astype(np.int16)
\ No newline at end of file
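The new `change_sample_rate` resamples the 48 kHz capture to 16 kHz by pure linear interpolation (`interp1d`), which applies no low-pass filter: content above the new Nyquist limit of 8 kHz will alias into the band the recognizer hears. If that ever becomes audible in the transcripts, a polyphase resampler is a near drop-in alternative; a sketch, not part of the patch (`scipy.signal.resample_poly` filters before decimating):

    import numpy as np
    from scipy.signal import resample_poly

    def change_sample_rate_poly(audio, old_sr, new_sr):
        # reduce the ratio to lowest terms: 48000 -> 16000 gives up=1, down=3
        g = np.gcd(old_sr, new_sr)
        resampled = resample_poly(audio.astype(np.float32), new_sr // g, old_sr // g)
        return resampled.astype(np.int16)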
diff --git a/crazy_functions/辅助面试.py b/crazy_functions/辅助面试.py
index 54ea010..69a9985 100644
--- a/crazy_functions/辅助面试.py
+++ b/crazy_functions/辅助面试.py
@@ -3,20 +3,15 @@ from toolbox import CatchException, report_execption, write_results_to_file
 from crazy_functions.crazy_utils import request_gpt_model_in_new_thread_with_ui_alive
 import threading, time
 import numpy as np
+from .live_audio.aliyunASR import AliyunASR
 
-def take_audio_sentence_flagment(captured_audio):
-    """
-    判断音频是否到达句尾,如果到了,截取片段
-    """
-    ready_part = None
-    other_part = captured_audio
-    return ready_part, other_part
 
-class InterviewAssistent():
+class InterviewAssistant(AliyunASR):
     def __init__(self):
-        self.capture_interval = 1.0 # second
+        super(InterviewAssistant, self).__init__()
+        self.capture_interval = 0.5 # second
         self.stop = False
-        pass
+        self.parsed_text = ""
 
     def init(self, chatbot):
         # 初始化音频采集线程
@@ -24,31 +19,9 @@
         self.keep_latest_n_second = 10
         self.ready_audio_flagment = None
         self.stop = False
-        th1 = threading.Thread(target=self.audio_capture_thread, args=(chatbot._cookies['uuid'],))
+        th1 = threading.Thread(target=self.audio_conversion_thread, args=(chatbot._cookies['uuid'],))
         th1.daemon = True
        th1.start()
-        th2 = threading.Thread(target=self.audio2txt_thread, args=(chatbot._cookies['uuid'],))
-        th2.daemon = True
-        th2.start()
-
-    def audio_capture_thread(self, uuid):
-        # 在一个异步线程中采集音频
-        from .live_audio.audio_io import RealtimeAudioDistribution
-        rad = RealtimeAudioDistribution()
-        while not self.stop:
-            time.sleep(self.capture_interval)
-            self.captured_audio = np.concatenate((self.captured_audio, rad.read(uuid.hex)))
-            if len(self.captured_audio) > self.keep_latest_n_second * rad.rate:
-                self.captured_audio = self.captured_audio[-self.keep_latest_n_second * rad.rate:]
-
-    def audio2txt_thread(self, llm_kwargs):
-        import whisper
-        # 在一个异步线程中音频转文字
-        while not self.stop:
-            time.sleep(1)
-            if len(self.captured_audio) > 0:
-                model = whisper.load_model("base")
-                result = model.transcribe("audio.mp3", language='Chinese')
 
     def gpt_answer(self, text, chatbot, history, llm_kwargs):
         i_say = inputs_show_user = text
@@ -63,25 +36,24 @@
     def begin(self, llm_kwargs, plugin_kwargs, chatbot, history):
         # 面试插件主函数
         self.init(chatbot)
+        chatbot.append(["", ""])
+        yield from update_ui(chatbot=chatbot, history=history)  # refresh the UI
         while True:
-            time.sleep(self.capture_interval)
-            if self.ready_audio_flagment:
-                audio_for_whisper = self.ready_audio_flagment
-                text = self.audio2txt(audio_for_whisper, llm_kwargs)
-                yield from self.gpt_answer(text, chatbot, history, llm_kwargs)
-                self.ready_audio_flagment = None
+            self.event_on_result_chg.wait(); self.event_on_result_chg.clear()  # clear, so the next wait() blocks until a fresh result
+            chatbot[-1][0] = self.parsed_text
+            yield from update_ui(chatbot=chatbot, history=history)  # refresh the UI
+            # if self.event_on_sentence_end
+
+            # yield from self.gpt_answer(text, chatbot, history, llm_kwargs)
+            # self.ready_audio_flagment = None
 
 
 @CatchException
 def 辅助面试(txt, llm_kwargs, plugin_kwargs, chatbot, history, system_prompt, web_port):
     # pip install -U openai-whisper
-    chatbot.append(["函数插件功能:辅助面试", "正在预热本地音频转文字模型 ..."])
+    chatbot.append(["函数插件功能:辅助面试", "辅助面试助手, 正在监听音频 ..."])
     yield from update_ui(chatbot=chatbot, history=history) # 刷新界面
-    import whisper
-    whisper.load_model("base")
-    chatbot.append(["预热本地音频转文字模型完成", "辅助面试助手, 正在监听音频 ..."])
-    yield from update_ui(chatbot=chatbot, history=history) # 刷新界面
-    ia = InterviewAssistent()
+    ia = InterviewAssistant()
     yield from ia.begin(llm_kwargs, plugin_kwargs, chatbot, history)
diff --git a/main.py b/main.py
index ef68fd8..1d250a2 100644
--- a/main.py
+++ b/main.py
@@ -58,7 +58,7 @@ def main():
     with gr_L1():
         with gr_L2(scale=2):
             if ENABLE_AUDIO:
-                audio_mic = gr.Audio(source="microphone", type="numpy", streaming=True)
+                audio_mic = gr.Audio(source="microphone", type="numpy")
             chatbot = gr.Chatbot(label=f"当前模型:{LLM_MODEL}")
             chatbot.style(height=CHATBOT_HEIGHT)
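The rewritten `begin` loop is driven by the two `threading.Event` flags the ASR callbacks set: `event_on_result_chg` streams partial transcripts into the last chat row, and the commented-out branch suggests `event_on_sentence_end` will eventually trigger `gpt_answer`. One way that wiring could look, sketched under the patch's own names (the sentence-end handling is an assumption about intent, not code from this patch):

    while True:
        self.event_on_result_chg.wait()
        self.event_on_result_chg.clear()             # re-arm: next wait() blocks until a new partial result
        chatbot[-1][0] = self.parsed_text            # live partial transcript
        yield from update_ui(chatbot=chatbot, history=history)
        if self.event_on_sentence_end.is_set():      # the recognizer closed a sentence
            self.event_on_sentence_end.clear()
            yield from self.gpt_answer(self.parsed_text, chatbot, history, llm_kwargs)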