diff --git a/crazy_functions/live_audio/aliyunASR.py b/crazy_functions/live_audio/aliyunASR.py
index ed67fcd..cba4c01 100644
--- a/crazy_functions/live_audio/aliyunASR.py
+++ b/crazy_functions/live_audio/aliyunASR.py
@@ -1,4 +1,106 @@
-import time, logging, json
+import time, logging, json, sys, struct
+import numpy as np
+from scipy.io.wavfile import WAVE_FORMAT
+
+def write_numpy_to_wave(filename, rate, data, add_header=False):
+    """
+    Write a NumPy array as a WAV file.
+    """
+    def _array_tofile(fid, data):
+        # ravel gives a c-contiguous buffer
+        fid.write(data.ravel().view('b').data)
+
+    if hasattr(filename, 'write'):
+        fid = filename
+    else:
+        fid = open(filename, 'wb')
+
+    fs = rate
+
+    try:
+        dkind = data.dtype.kind
+        if not (dkind == 'i' or dkind == 'f' or (dkind == 'u' and
+                                                 data.dtype.itemsize == 1)):
+            raise ValueError("Unsupported data type '%s'" % data.dtype)
+
+        header_data = b''
+
+        header_data += b'RIFF'
+        header_data += b'\x00\x00\x00\x00'
+        header_data += b'WAVE'
+
+        # fmt chunk
+        header_data += b'fmt '
+        if dkind == 'f':
+            format_tag = WAVE_FORMAT.IEEE_FLOAT
+        else:
+            format_tag = WAVE_FORMAT.PCM
+        if data.ndim == 1:
+            channels = 1
+        else:
+            channels = data.shape[1]
+        bit_depth = data.dtype.itemsize * 8
+        bytes_per_second = fs*(bit_depth // 8)*channels
+        block_align = channels * (bit_depth // 8)
+
+        fmt_chunk_data = struct.pack('<HHIIHH', format_tag, channels, fs,
+                                     bytes_per_second, block_align, bit_depth)
+        if not (dkind == 'i' or dkind == 'u'):
+            # add cbSize field for non-PCM files
+            fmt_chunk_data += b'\x00\x00'
+
+        header_data += struct.pack('<I', len(fmt_chunk_data))
+        header_data += fmt_chunk_data
+
+        # fact chunk (non-PCM files)
+        if not (dkind == 'i' or dkind == 'u'):
+            header_data += b'fact'
+            header_data += struct.pack('<II', 4, data.shape[0])
+
+        # check data size (needs to be immediately before the data chunk)
+        if ((len(header_data)-4-4) + (4+4+data.nbytes)) > 0xFFFFFFFF:
+            raise ValueError("Data exceeds wave file size limit")
+        if add_header:
+            fid.write(header_data)
+            # data chunk
+            fid.write(b'data')
+            fid.write(struct.pack('<I', data.nbytes))
+        if data.dtype.byteorder == '>' or (data.dtype.byteorder == '=' and
+                                           sys.byteorder == 'big'):
+            data = data.byteswap()
+        _array_tofile(fid, data)
+
+        if add_header:
+            # Determine file size and place it in correct
+            # position at start of the file.
+            size = fid.tell()
+            fid.seek(4)
+            fid.write(struct.pack('<I', size-8))
+
+    finally:
+        if not hasattr(filename, 'write'):
+            fid.close()
+        else:
+            fid.seek(0)
+
+def is_speaker_speaking(vad, data, sample_rate):
+    # Function to detect if the speaker is speaking
+    # The WebRTC VAD only accepts 16-bit mono PCM audio,
+    # sampled at 8000, 16000, 32000 or 48000 Hz.
+    # A frame must be either 10, 20, or 30 ms in duration
+    frame_duration = 30
+    n_bit_each = int(sample_rate * frame_duration / 1000)*2 # x2 because audio is 16 bit (2 bytes)
+    res_list = []
+    for t in range(len(data)):
+        if t!=0 and t % n_bit_each == 0:
+            res_list.append(vad.is_speech(data[t-n_bit_each:t], sample_rate))
+
+    info = ''.join(['^' if r else '.' for r in res_list])
+    info = info[:10]
+    if any(res_list):
+        return True, info
+    else:
+        return False, info
 
 
 class AliyunASR():
@@ -71,22 +173,53 @@ class AliyunASR():
                     callback_args=[uuid.hex]
                 )
+        timeout_limit_second = 20
         r = sr.start(aformat="pcm",
+                timeout=timeout_limit_second,
                 enable_intermediate_result=True,
                 enable_punctuation_prediction=True,
                 enable_inverse_text_normalization=True)
 
+        import webrtcvad
+        vad = webrtcvad.Vad()
+        vad.set_mode(1)
+
+        is_previous_frame_transmitted = False   # 上一帧是否有人说话
+        previous_frame_data = None
+        echo_cnt = 0            # 在没有声音之后,继续向服务器发送n次音频数据
+        echo_cnt_max = 4        # 在没有声音之后,继续向服务器发送n次音频数据
+        keep_alive_last_send_time = time.time()
         while not self.stop:
            # time.sleep(self.capture_interval)
            audio = rad.read(uuid.hex)
            if audio is not None:
                # convert to pcm file
                temp_file = f'{temp_folder}/{uuid.hex}.pcm' #
                dsdata = change_sample_rate(audio, rad.rate, NEW_SAMPLERATE) # 48000 --> 16000
-                io.wavfile.write(temp_file, NEW_SAMPLERATE, dsdata)
+                write_numpy_to_wave(temp_file, NEW_SAMPLERATE, dsdata)
                # read pcm binary
                with open(temp_file, "rb") as f: data = f.read()
-                # print('audio len:', len(audio), '\t ds len:', len(dsdata), '\t need n send:', len(data)//640)
-                slices = zip(*(iter(data),) * 640) # 640个字节为一组
-                for i in slices: sr.send_audio(bytes(i))
+                is_speaking, info = is_speaker_speaking(vad, data, NEW_SAMPLERATE)
+
+                if is_speaking or echo_cnt > 0:
+                    # 如果话筒激活 / 如果处于回声收尾阶段
+                    echo_cnt -= 1
+                    if not is_previous_frame_transmitted: # 上一帧没有人声,但是我们把上一帧同样加上
+                        if previous_frame_data is not None: data = previous_frame_data + data
+                    if is_speaking:
+                        echo_cnt = echo_cnt_max
+                    slices = zip(*(iter(data),) * 640) # 640个字节为一组
+                    for i in slices: sr.send_audio(bytes(i))
+                    keep_alive_last_send_time = time.time()
+                    is_previous_frame_transmitted = True
+                else:
+                    is_previous_frame_transmitted = False
+                    echo_cnt = 0
+                    # 保持链接激活,即使没有声音,也根据时间间隔,发送一些音频片段给服务器
+                    if time.time() - keep_alive_last_send_time > timeout_limit_second/2:
+                        slices = zip(*(iter(data),) * 640) # 640个字节为一组
+                        for i in slices: sr.send_audio(bytes(i))
+                        keep_alive_last_send_time = time.time()
+                        is_previous_frame_transmitted = True
+                self.audio_shape = info
            else:
                time.sleep(0.1)
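Note on the frame arithmetic above: webrtcvad only accepts 16-bit mono PCM at 8000/16000/32000/48000 Hz in 10/20/30 ms frames, so `is_speaker_speaking()` walks the buffer in 960-byte windows (16000 Hz x 0.030 s x 2 bytes), while `sr.send_audio()` keeps receiving 640-byte (20 ms) groups. A quick sanity check of both sizes, assuming `webrtcvad` and `numpy` are installed (hypothetical snippet, not part of the patch):

```python
import webrtcvad
import numpy as np

SAMPLE_RATE = 16000
vad = webrtcvad.Vad()
vad.set_mode(1)  # same aggressiveness the patch uses

pcm = np.zeros(SAMPLE_RATE, dtype=np.int16).tobytes()      # 1 s of silence, 32000 bytes
n_bit_each = int(SAMPLE_RATE * 30 / 1000) * 2              # 960 bytes per 30 ms VAD frame
frames = [pcm[t - n_bit_each:t] for t in range(len(pcm))
          if t != 0 and t % n_bit_each == 0]
print(any(vad.is_speech(f, SAMPLE_RATE) for f in frames))  # False: silence is not speech

send_slices = list(zip(*(iter(pcm),) * 640))               # 640-byte (20 ms) groups, tail dropped
print(len(send_slices))                                    # 32000 // 640 = 50
```

Silence never trips the VAD, which is what lets the `else` branch above throttle traffic down to periodic keep-alive sends.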
         else:
             res = None
         return res
diff --git a/crazy_functions/语音助手.py b/crazy_functions/语音助手.py
index b1c8c41..50f7725 100644
--- a/crazy_functions/语音助手.py
+++ b/crazy_functions/语音助手.py
@@ -6,6 +6,7 @@ import threading, time
 import numpy as np
 from .live_audio.aliyunASR import AliyunASR
 import json
+import re
 
 class WatchDog():
     def __init__(self, timeout, bark_fn, interval=3, msg="") -> None:
@@ -38,10 +39,22 @@ def chatbot2history(chatbot):
     history = []
     for c in chatbot:
         for q in c:
-            if q not in ["[请讲话]", "[等待GPT响应]", "[正在等您说完问题]"]:
+            if q in ["[ 请讲话 ]", "[ 等待GPT响应 ]", "[ 正在等您说完问题 ]"]:
+                continue
+            elif q.startswith("[ 正在等您说完问题 ]"):
+                continue
+            else:
                 history.append(q.strip('<div class="markdown-body">').strip('</div>').strip('<p>').strip('</p>'))
     return history
 
+def visualize_audio(chatbot, audio_shape):
+    if len(chatbot) == 0: chatbot.append(["[ 请讲话 ]", "[ 正在等您说完问题 ]"])
+    chatbot[-1] = list(chatbot[-1])
+    p1 = '「'
+    p2 = '」'
+    chatbot[-1][-1] = re.sub(p1+r'(.*)'+p2, '', chatbot[-1][-1])
+    chatbot[-1][-1] += (p1+f"`{audio_shape}`"+p2)
+
 class AsyncGptTask():
     def __init__(self) -> None:
         self.observe_future = []
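The rewritten filter matters because `visualize_audio()` appends a 「...」 audio bar to the `[ 正在等您说完问题 ]` placeholder, so an exact-match test alone would leak decorated placeholders into the GPT history; the added `startswith` branch catches them. A condensed check (strip chain omitted, not part of the patch):

```python
def placeholders_removed(chatbot):
    # condensed version of chatbot2history(): keep only real utterances
    history = []
    for c in chatbot:
        for q in c:
            if q in ["[ 请讲话 ]", "[ 等待GPT响应 ]", "[ 正在等您说完问题 ]"]:
                continue          # exact placeholder
            elif q.startswith("[ 正在等您说完问题 ]"):
                continue          # placeholder with the 「...」 audio bar appended
            else:
                history.append(q)
    return history

chatbot = [["你好", "你好,请讲"],
           ["[ 请讲话 ]", "[ 正在等您说完问题 ]「`..^^^..`」"]]
print(placeholders_removed(chatbot))   # ['你好', '你好,请讲']
```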
@@ -81,8 +94,9 @@ class InterviewAssistant(AliyunASR):
         self.capture_interval = 0.5 # second
         self.stop = False
         self.parsed_text = "" # 下个句子中已经说完的部分, 由 test_on_result_chg() 写入
-        self.parsed_sentence = "" # 某段话的整个句子,由 test_on_sentence_end() 写入
+        self.parsed_sentence = "" # 某段话的整个句子, 由 test_on_sentence_end() 写入
         self.buffered_sentence = "" #
+        self.audio_shape = "" # 音频的可视化表现, 由 audio_convertion_thread() 写入
         self.event_on_result_chg = threading.Event()
         self.event_on_entence_end = threading.Event()
         self.event_on_commit_question = threading.Event()
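`self.audio_shape` is written by the audio thread and only read by the UI loop in `begin()`. A plain `str` attribute is enough for this one-way handoff: rebinding an attribute is atomic under the GIL, and a stale read just redraws the previous bar for one tick. A minimal sketch of the pattern (hypothetical names, not from the patch):

```python
import threading, time

class Sketch:
    def __init__(self):
        self.audio_shape = ""          # written by one thread, read by another

    def audio_thread(self):
        for info in ["..........", "..^^^^....", "^^^^^^^^^^"]:
            self.audio_shape = info    # atomic rebind; no lock needed
            time.sleep(0.01)

s = Sketch()
t = threading.Thread(target=s.audio_thread)
t.start(); t.join()
print(s.audio_shape)                   # '^^^^^^^^^^' - the last value wins
```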
@@ -117,7 +131,7 @@ class InterviewAssistant(AliyunASR):
     def begin(self, llm_kwargs, plugin_kwargs, chatbot, history, system_prompt):
         # main plugin function
         self.init(chatbot)
-        chatbot.append(["[请讲话]", "[正在等您说完问题]"])
+        chatbot.append(["[ 请讲话 ]", "[ 正在等您说完问题 ]"])
         yield from update_ui(chatbot=chatbot, history=history) # 刷新界面
         self.plugin_wd.begin_watch()
         self.agt = AsyncGptTask()
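Each placeholder row appended above is the line that `visualize_audio()` decorates. Its `re.sub` deletes any existing `「...」` span before the new one is appended, so repeated refreshes repaint the bar instead of stacking copies. A standalone demonstration (not part of the patch):

```python
import re

p1, p2 = '「', '」'
msg = "[ 正在等您说完问题 ]"

for audio_shape in ["....^^^^..", "^^^^^^^^^^"]:
    msg = re.sub(p1 + r'(.*)' + p2, '', msg)   # drop the stale bar, if any
    msg += p1 + f"`{audio_shape}`" + p2
    print(msg)
# [ 正在等您说完问题 ]「`....^^^^..`」
# [ 正在等您说完问题 ]「`^^^^^^^^^^`」
```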
@@ -157,14 +171,18 @@
                 self.commit_wd.begin_watch()
 
                 chatbot[-1] = list(chatbot[-1])
-                chatbot[-1] = [self.buffered_sentence, "[等待GPT响应]"]
+                chatbot[-1] = [self.buffered_sentence, "[ 等待GPT响应 ]"]
                 yield from update_ui(chatbot=chatbot, history=history) # 刷新界面
                 # add gpt task 创建子线程请求gpt,避免线程阻塞
                 history = chatbot2history(chatbot)
                 self.agt.add_async_gpt_task(self.buffered_sentence, len(chatbot)-1, llm_kwargs, history, system_prompt)
 
                 self.buffered_sentence = ""
-                chatbot.append(["[请讲话]", "[正在等您说完问题]"])
+                chatbot.append(["[ 请讲话 ]", "[ 正在等您说完问题 ]"])
+                yield from update_ui(chatbot=chatbot, history=history) # 刷新界面
+
+            if not self.event_on_result_chg.is_set() and not self.event_on_entence_end.is_set() and not self.event_on_commit_question.is_set():
+                visualize_audio(chatbot, self.audio_shape)
                 yield from update_ui(chatbot=chatbot, history=history) # 刷新界面
 
             if len(self.stop_msg) != 0:
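The final hunk repaints the audio bar only on idle ticks: when any of the three ASR events is pending, that event's handler updates the chat row instead. A minimal sketch of the gate, using standalone stand-ins for the instance attributes (names as in the source, including the `event_on_entence_end` spelling; not from the patch):

```python
import threading

event_on_result_chg = threading.Event()       # partial transcription arrived
event_on_entence_end = threading.Event()      # sentence finished
event_on_commit_question = threading.Event()  # question ready for GPT

def idle_tick():
    return (not event_on_result_chg.is_set()
            and not event_on_entence_end.is_set()
            and not event_on_commit_question.is_set())

print(idle_tick())         # True: nothing pending, safe to redraw the 「...」 bar
event_on_result_chg.set()
print(idle_tick())         # False: the transcription update takes priority this tick
```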