Substantially improve the voice assistant
This commit is contained in:
parent
02c270410c
commit
500a0cbd16
@@ -1,4 +1,106 @@
import time, logging, json
import time, logging, json, sys, struct
import numpy as np
from scipy.io.wavfile import WAVE_FORMAT

def write_numpy_to_wave(filename, rate, data, add_header=False):
    """
    Write a NumPy array as a WAV file.
    """
    def _array_tofile(fid, data):
        # ravel gives a c-contiguous buffer
        fid.write(data.ravel().view('b').data)

    if hasattr(filename, 'write'):
        fid = filename
    else:
        fid = open(filename, 'wb')

    fs = rate

    try:
        dkind = data.dtype.kind
        if not (dkind == 'i' or dkind == 'f' or (dkind == 'u' and
                                                 data.dtype.itemsize == 1)):
            raise ValueError("Unsupported data type '%s'" % data.dtype)

        header_data = b''

        header_data += b'RIFF'
        header_data += b'\x00\x00\x00\x00'
        header_data += b'WAVE'

        # fmt chunk
        header_data += b'fmt '
        if dkind == 'f':
            format_tag = WAVE_FORMAT.IEEE_FLOAT
        else:
            format_tag = WAVE_FORMAT.PCM
        if data.ndim == 1:
            channels = 1
        else:
            channels = data.shape[1]
        bit_depth = data.dtype.itemsize * 8
        bytes_per_second = fs*(bit_depth // 8)*channels
        block_align = channels * (bit_depth // 8)

        fmt_chunk_data = struct.pack('<HHIIHH', format_tag, channels, fs,
                                     bytes_per_second, block_align, bit_depth)
        if not (dkind == 'i' or dkind == 'u'):
            # add cbSize field for non-PCM files
            fmt_chunk_data += b'\x00\x00'

        header_data += struct.pack('<I', len(fmt_chunk_data))
        header_data += fmt_chunk_data

        # fact chunk (non-PCM files)
        if not (dkind == 'i' or dkind == 'u'):
            header_data += b'fact'
            header_data += struct.pack('<II', 4, data.shape[0])

        # check data size (needs to be immediately before the data chunk)
        if ((len(header_data)-4-4) + (4+4+data.nbytes)) > 0xFFFFFFFF:
            raise ValueError("Data exceeds wave file size limit")
        if add_header:
            fid.write(header_data)
        # data chunk
        fid.write(b'data')
        fid.write(struct.pack('<I', data.nbytes))
        if data.dtype.byteorder == '>' or (data.dtype.byteorder == '=' and
                                           sys.byteorder == 'big'):
            data = data.byteswap()
        _array_tofile(fid, data)

        if add_header:
            # Determine file size and place it in correct
            # position at start of the file.
            size = fid.tell()
            fid.seek(4)
            fid.write(struct.pack('<I', size-8))

    finally:
        if not hasattr(filename, 'write'):
            fid.close()
        else:
            fid.seek(0)

def is_speaker_speaking(vad, data, sample_rate):
    # Function to detect if the speaker is speaking
    # The WebRTC VAD only accepts 16-bit mono PCM audio,
    # sampled at 8000, 16000, 32000 or 48000 Hz.
    # A frame must be either 10, 20, or 30 ms in duration:
    frame_duration = 30
    n_bit_each = int(sample_rate * frame_duration / 1000)*2  # x2 because audio is 16 bit (2 bytes)
    res_list = []
    for t in range(len(data)):
        if t!=0 and t % n_bit_each == 0:
            res_list.append(vad.is_speech(data[t-n_bit_each:t], sample_rate))

    info = ''.join(['^' if r else '.' for r in res_list])
    info = info[:10]
    if any(res_list):
        return True, info
    else:
        return False, info


class AliyunASR():
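The two helpers above can be exercised on their own. Below is a minimal sketch, not part of this commit; the 16 kHz sample rate, the synthetic one-second test tone, and the file name test_tone.pcm are assumptions for illustration, and the VAD aggressiveness level matches the value the plugin configures further down in this diff.

import numpy as np
import webrtcvad
# write_numpy_to_wave and is_speaker_speaking as defined above

sample_rate = 16000                    # assumed: the WebRTC VAD only accepts 8000/16000/32000/48000 Hz
t = np.linspace(0, 1.0, sample_rate, endpoint=False)
tone = (0.3 * np.sin(2 * np.pi * 440 * t) * 32767).astype(np.int16)   # 1 s of int16 mono audio (illustrative)

# add_header=False writes only the 'data' chunk (headerless PCM), the same mode the plugin uses below
write_numpy_to_wave('test_tone.pcm', sample_rate, tone, add_header=False)

vad = webrtcvad.Vad()
vad.set_mode(1)                        # aggressiveness 0-3; 1 matches the plugin's setting
with open('test_tone.pcm', 'rb') as f:
    pcm_bytes = f.read()
is_speaking, info = is_speaker_speaking(vad, pcm_bytes, sample_rate)
print(is_speaking, info)               # info is a '^'/'.' pattern, one symbol per 30 ms frame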
@@ -66,12 +168,22 @@ class AliyunASR():
            on_close=self.test_on_close,
            callback_args=[uuid.hex]
        )

        timeout_limit_second = 20
        r = sr.start(aformat="pcm",
                     timeout=timeout_limit_second,
                     enable_intermediate_result=True,
                     enable_punctuation_prediction=True,
                     enable_inverse_text_normalization=True)

        import webrtcvad
        vad = webrtcvad.Vad()
        vad.set_mode(1)

        is_previous_frame_transmitted = False  # whether there was speech in the previous frame
        previous_frame_data = None
        echo_cnt = 0      # after the voice stops, keep sending audio to the server for n more frames
        echo_cnt_max = 4  # after the voice stops, keep sending audio to the server for n more frames
        keep_alive_last_send_time = time.time()
        while not self.stop:
            # time.sleep(self.capture_interval)
            audio = rad.read(uuid.hex)
@@ -79,12 +191,32 @@
                # convert to pcm file
                temp_file = f'{temp_folder}/{uuid.hex}.pcm' #
                dsdata = change_sample_rate(audio, rad.rate, NEW_SAMPLERATE) # 48000 --> 16000
                io.wavfile.write(temp_file, NEW_SAMPLERATE, dsdata)
                write_numpy_to_wave(temp_file, NEW_SAMPLERATE, dsdata)
                # read pcm binary
                with open(temp_file, "rb") as f: data = f.read()
                # print('audio len:', len(audio), '\t ds len:', len(dsdata), '\t need n send:', len(data)//640)
                is_speaking, info = is_speaker_speaking(vad, data, NEW_SAMPLERATE)

                if is_speaking or echo_cnt > 0:
                    # the microphone is active, or we are still in the echo tail phase
                    echo_cnt -= 1
                    if not is_previous_frame_transmitted:  # no speech in the previous frame, but transmit that frame as well
                        if previous_frame_data is not None: data = previous_frame_data + data
                    if is_speaking:
                        echo_cnt = echo_cnt_max
                    slices = zip(*(iter(data),) * 640)  # groups of 640 bytes
                    for i in slices: sr.send_audio(bytes(i))
                    keep_alive_last_send_time = time.time()
                    is_previous_frame_transmitted = True
                else:
                    is_previous_frame_transmitted = False
                    echo_cnt = 0
                    # keep the connection alive: even when there is no speech, periodically send some audio to the server
                    if time.time() - keep_alive_last_send_time > timeout_limit_second/2:
                        slices = zip(*(iter(data),) * 640)  # groups of 640 bytes
                        for i in slices: sr.send_audio(bytes(i))
                        keep_alive_last_send_time = time.time()
                        is_previous_frame_transmitted = True
                self.audio_shape = info
            else:
                time.sleep(0.1)
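For reference, the zip(*(iter(data),) * 640) idiom above groups the PCM byte string into 640-byte packets before each sr.send_audio() call; at the 16 kHz, 16-bit mono rate used here that is 320 samples, i.e. roughly 20 ms of audio per packet, and any trailing partial packet is silently dropped. A standalone sketch with dummy data, not part of this commit:

data = bytes(range(256)) * 10 + b'\x00' * 10  # 2570 bytes of dummy PCM

slices = zip(*(iter(data),) * 640)            # tuples of exactly 640 ints; the 10-byte remainder is dropped
frames = [bytes(chunk) for chunk in slices]
assert len(frames) == 4 and all(len(f) == 640 for f in frames)

# an equivalent, arguably more explicit, slicing form:
usable = len(data) - len(data) % 640
frames2 = [data[i:i+640] for i in range(0, usable, 640)]
assert frames == frames2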
@@ -35,7 +35,7 @@ class RealtimeAudioDistribution():
    def read(self, uuid):
        if uuid in self.data:
            res = self.data.pop(uuid)
            print('\r read-', len(res), '-', max(res), end='', flush=True)
            # print('\r read-', len(res), '-', max(res), end='', flush=True)
        else:
            res = None
        return res
@@ -6,6 +6,7 @@ import threading, time
import numpy as np
from .live_audio.aliyunASR import AliyunASR
import json
import re

class WatchDog():
    def __init__(self, timeout, bark_fn, interval=3, msg="") -> None:
@@ -38,10 +39,22 @@ def chatbot2history(chatbot):
    history = []
    for c in chatbot:
        for q in c:
            if q not in ["[请讲话]", "[等待GPT响应]", "[正在等您说完问题]"]:
            if q in ["[ 请讲话 ]", "[ 等待GPT响应 ]", "[ 正在等您说完问题 ]"]:
                continue
            elif q.startswith("[ 正在等您说完问题 ]"):
                continue
            else:
                history.append(q.strip('<div class="markdown-body">').strip('</div>').strip('<p>').strip('</p>'))
    return history

def visualize_audio(chatbot, audio_shape):
    if len(chatbot) == 0: chatbot.append(["[ 请讲话 ]", "[ 正在等您说完问题 ]"])
    chatbot[-1] = list(chatbot[-1])
    p1 = '「'
    p2 = '」'
    chatbot[-1][-1] = re.sub(p1+r'(.*)'+p2, '', chatbot[-1][-1])
    chatbot[-1][-1] += (p1+f"`{audio_shape}`"+p2)

class AsyncGptTask():
    def __init__(self) -> None:
        self.observe_future = []
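For reference, a small standalone demonstration, not part of this commit, of the replace-then-append idiom that visualize_audio uses: the previous 「...」 marker is stripped with re.sub before the fresh VAD pattern is appended, so the last chat message always carries exactly one audio indicator. The sample patterns below are invented.

import re

p1, p2 = '「', '」'
msg = "[ 正在等您说完问题 ]"

for pattern in ('^^^.......', '..^^^^^...'):
    msg = re.sub(p1 + r'(.*)' + p2, '', msg)   # remove any previous 「...」 marker
    msg += p1 + f"`{pattern}`" + p2            # append the latest 10-frame '^'/'.' pattern

print(msg)   # [ 正在等您说完问题 ]「`..^^^^^...`」 -- only the most recent pattern remains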
@@ -81,8 +94,9 @@ class InterviewAssistant(AliyunASR):
        self.capture_interval = 0.5 # second
        self.stop = False
        self.parsed_text = "" # the already-spoken part of the next sentence, written by test_on_result_chg()
        self.parsed_sentence = "" # the whole sentence of an utterance,written by test_on_sentence_end()
        self.parsed_sentence = "" # the whole sentence of an utterance, written by test_on_sentence_end()
        self.buffered_sentence = "" #
        self.audio_shape = "" # visual representation of the audio, written by audio_convertion_thread()
        self.event_on_result_chg = threading.Event()
        self.event_on_entence_end = threading.Event()
        self.event_on_commit_question = threading.Event()
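The threading.Event fields added above are the handshake between the Aliyun ASR callback thread and the UI loop. A generic sketch of that set/wait/clear pattern, not part of this commit; the fake_asr_callback helper is invented for illustration.

import threading, time

event_on_result_chg = threading.Event()

def fake_asr_callback():
    time.sleep(0.2)               # pretend an intermediate recognition result just arrived
    event_on_result_chg.set()     # the real test_on_result_chg() presumably sets the event after writing parsed_text

threading.Thread(target=fake_asr_callback, daemon=True).start()

if event_on_result_chg.wait(timeout=1.0):   # the UI loop notices the change ...
    event_on_result_chg.clear()             # ... refreshes the chatbot, then clears the flag
    print("intermediate result received")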
@@ -117,7 +131,7 @@ class InterviewAssistant(AliyunASR):
    def begin(self, llm_kwargs, plugin_kwargs, chatbot, history, system_prompt):
        # main plugin function
        self.init(chatbot)
        chatbot.append(["[请讲话]", "[正在等您说完问题]"])
        chatbot.append(["[ 请讲话 ]", "[ 正在等您说完问题 ]"])
        yield from update_ui(chatbot=chatbot, history=history) # refresh the UI
        self.plugin_wd.begin_watch()
        self.agt = AsyncGptTask()
@@ -157,14 +171,18 @@ class InterviewAssistant(AliyunASR):

                self.commit_wd.begin_watch()
                chatbot[-1] = list(chatbot[-1])
                chatbot[-1] = [self.buffered_sentence, "[等待GPT响应]"]
                chatbot[-1] = [self.buffered_sentence, "[ 等待GPT响应 ]"]
                yield from update_ui(chatbot=chatbot, history=history) # refresh the UI
                # add gpt task: spawn a worker thread to query GPT so this thread does not block
                history = chatbot2history(chatbot)
                self.agt.add_async_gpt_task(self.buffered_sentence, len(chatbot)-1, llm_kwargs, history, system_prompt)

                self.buffered_sentence = ""
                chatbot.append(["[请讲话]", "[正在等您说完问题]"])
                chatbot.append(["[ 请讲话 ]", "[ 正在等您说完问题 ]"])
                yield from update_ui(chatbot=chatbot, history=history) # refresh the UI

            if not self.event_on_result_chg.is_set() and not self.event_on_entence_end.is_set() and not self.event_on_commit_question.is_set():
                visualize_audio(chatbot, self.audio_shape)
                yield from update_ui(chatbot=chatbot, history=history) # refresh the UI

            if len(self.stop_msg) != 0: