Significantly optimize the voice assistant

binary-husky 2023-10-09 01:18:05 +08:00
parent 02c270410c
commit 500a0cbd16
3 changed files with 162 additions and 12 deletions

View File

@@ -1,4 +1,106 @@
import time, logging, json, sys, struct
import numpy as np
from scipy.io.wavfile import WAVE_FORMAT

def write_numpy_to_wave(filename, rate, data, add_header=False):
    """
    Write a NumPy array as a WAV file.
    """
    def _array_tofile(fid, data):
        # ravel gives a c-contiguous buffer
        fid.write(data.ravel().view('b').data)

    if hasattr(filename, 'write'):
        fid = filename
    else:
        fid = open(filename, 'wb')
    fs = rate

    try:
        dkind = data.dtype.kind
        if not (dkind == 'i' or dkind == 'f' or (dkind == 'u' and
                                                 data.dtype.itemsize == 1)):
            raise ValueError("Unsupported data type '%s'" % data.dtype)

        header_data = b''
        header_data += b'RIFF'
        header_data += b'\x00\x00\x00\x00'
        header_data += b'WAVE'

        # fmt chunk
        header_data += b'fmt '
        if dkind == 'f':
            format_tag = WAVE_FORMAT.IEEE_FLOAT
        else:
            format_tag = WAVE_FORMAT.PCM
        if data.ndim == 1:
            channels = 1
        else:
            channels = data.shape[1]
        bit_depth = data.dtype.itemsize * 8
        bytes_per_second = fs*(bit_depth // 8)*channels
        block_align = channels * (bit_depth // 8)

        fmt_chunk_data = struct.pack('<HHIIHH', format_tag, channels, fs,
                                     bytes_per_second, block_align, bit_depth)
        if not (dkind == 'i' or dkind == 'u'):
            # add cbSize field for non-PCM files
            fmt_chunk_data += b'\x00\x00'

        header_data += struct.pack('<I', len(fmt_chunk_data))
        header_data += fmt_chunk_data

        # fact chunk (non-PCM files)
        if not (dkind == 'i' or dkind == 'u'):
            header_data += b'fact'
            header_data += struct.pack('<II', 4, data.shape[0])

        # check data size (needs to be immediately before the data chunk)
        if ((len(header_data)-4-4) + (4+4+data.nbytes)) > 0xFFFFFFFF:
            raise ValueError("Data exceeds wave file size limit")
        if add_header:
            fid.write(header_data)

        # data chunk
        fid.write(b'data')
        fid.write(struct.pack('<I', data.nbytes))
        if data.dtype.byteorder == '>' or (data.dtype.byteorder == '=' and
                                           sys.byteorder == 'big'):
            data = data.byteswap()
        _array_tofile(fid, data)

        if add_header:
            # Determine file size and place it in correct
            # position at start of the file.
            size = fid.tell()
            fid.seek(4)
            fid.write(struct.pack('<I', size-8))

    finally:
        if not hasattr(filename, 'write'):
            fid.close()
        else:
            fid.seek(0)

def is_speaker_speaking(vad, data, sample_rate):
    # Function to detect if the speaker is speaking
    # The WebRTC VAD only accepts 16-bit mono PCM audio,
    # sampled at 8000, 16000, 32000 or 48000 Hz.
    # A frame must be either 10, 20, or 30 ms in duration:
    frame_duration = 30
    n_bit_each = int(sample_rate * frame_duration / 1000) * 2  # x2 because audio is 16 bit (2 bytes)
    res_list = []
    for t in range(len(data)):
        if t != 0 and t % n_bit_each == 0:
            res_list.append(vad.is_speech(data[t-n_bit_each:t], sample_rate))

    info = ''.join(['^' if r else '.' for r in res_list])
    info = info[:10]
    if any(res_list):
        return True, info
    else:
        return False, info
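# A minimal usage sketch (illustration, not part of the commit): it assumes webrtcvad is
# installed and fabricates one second of digital silence as 16-bit mono PCM at 16 kHz,
# just to show how write_numpy_to_wave() and is_speaker_speaking() fit together.
if __name__ == "__main__":
    import io, webrtcvad
    sample_rate = 16000
    fake_audio = np.zeros(sample_rate, dtype=np.int16)  # one second of silence
    buf = io.BytesIO()
    write_numpy_to_wave(buf, sample_rate, fake_audio)   # add_header=False: no RIFF/fmt header
    pcm_bytes = buf.read()
    vad = webrtcvad.Vad(1)                              # mode 1: mildly aggressive filtering
    speaking, info = is_speaker_speaking(vad, pcm_bytes, sample_rate)
    print(speaking, info)                               # silence is expected to give False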
class AliyunASR():
@@ -66,12 +168,22 @@ class AliyunASR():
                on_close=self.test_on_close,
                callback_args=[uuid.hex]
            )
        timeout_limit_second = 20
        r = sr.start(aformat="pcm",
                     timeout=timeout_limit_second,
                     enable_intermediate_result=True,
                     enable_punctuation_prediction=True,
                     enable_inverse_text_normalization=True)

        import webrtcvad
        vad = webrtcvad.Vad()
        vad.set_mode(1)
        is_previous_frame_transmitted = False  # whether someone was speaking in the previous frame
        previous_frame_data = None
        echo_cnt = 0      # after silence, keep sending audio to the server for n more frames
        echo_cnt_max = 4  # after silence, keep sending audio to the server for n more frames
        keep_alive_last_send_time = time.time()
        while not self.stop:
            # time.sleep(self.capture_interval)
            audio = rad.read(uuid.hex)
@@ -79,12 +191,32 @@ class AliyunASR():
                # convert to pcm file
                temp_file = f'{temp_folder}/{uuid.hex}.pcm' #
                dsdata = change_sample_rate(audio, rad.rate, NEW_SAMPLERATE)  # 48000 --> 16000
                write_numpy_to_wave(temp_file, NEW_SAMPLERATE, dsdata)
                # read pcm binary
                with open(temp_file, "rb") as f: data = f.read()
                is_speaking, info = is_speaker_speaking(vad, data, NEW_SAMPLERATE)

                if is_speaking or echo_cnt > 0:
                    # the microphone is active / we are in the echo tail phase
                    echo_cnt -= 1
                    if not is_previous_frame_transmitted:  # no speech in the previous frame, but send that frame as well
                        if previous_frame_data is not None: data = previous_frame_data + data
                    if is_speaking:
                        echo_cnt = echo_cnt_max
                    slices = zip(*(iter(data),) * 640)  # groups of 640 bytes
                    for i in slices: sr.send_audio(bytes(i))
                    keep_alive_last_send_time = time.time()
                    is_previous_frame_transmitted = True
                else:
                    is_previous_frame_transmitted = False
                    echo_cnt = 0
                    # keep the connection alive: even during silence, periodically send some audio to the server
                    if time.time() - keep_alive_last_send_time > timeout_limit_second/2:
                        slices = zip(*(iter(data),) * 640)  # groups of 640 bytes
                        for i in slices: sr.send_audio(bytes(i))
                        keep_alive_last_send_time = time.time()
                        is_previous_frame_transmitted = True
                self.audio_shape = info
            else:
                time.sleep(0.1)
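# Illustration (not part of the commit) of the framing trick used above: the expression
# zip(*(iter(data),) * 640) advances one shared iterator 640 times per step, so it yields
# 640-byte groups and silently drops any tail shorter than 640 bytes. At 16 kHz, 16-bit
# mono, 640 bytes is 320 samples, i.e. a 20 ms frame for the streaming recognizer.
def iter_frames(data: bytes, frame_bytes: int = 640):
    # equivalent, more explicit version of the zip trick (trailing remainder dropped)
    for offset in range(0, len(data) - len(data) % frame_bytes, frame_bytes):
        yield data[offset:offset + frame_bytes]

# e.g. list(iter_frames(b'\x00' * 1300)) yields two 640-byte frames; 20 bytes are dropped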

View File

@@ -35,7 +35,7 @@ class RealtimeAudioDistribution():
    def read(self, uuid):
        if uuid in self.data:
            res = self.data.pop(uuid)
            # print('\r read-', len(res), '-', max(res), end='', flush=True)
        else:
            res = None
        return res

View File

@@ -6,6 +6,7 @@ import threading, time
import numpy as np
from .live_audio.aliyunASR import AliyunASR
import json
import re

class WatchDog():
    def __init__(self, timeout, bark_fn, interval=3, msg="") -> None:
@@ -38,10 +39,22 @@ def chatbot2history(chatbot):
    history = []
    for c in chatbot:
        for q in c:
            if q in ["[ 请讲话 ]", "[ 等待GPT响应 ]", "[ 正在等您说完问题 ]"]:
                continue
            elif q.startswith("[ 正在等您说完问题 ]"):
                continue
            else:
                history.append(q.strip('<div class="markdown-body">').strip('</div>').strip('<p>').strip('</p>'))
    return history
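# Observation (comment only, not part of the commit): str.strip() takes a *set of
# characters*, not a literal prefix/suffix, so the chained .strip(...) calls above remove
# any leading/trailing characters drawn from the given strings. That does peel off the
# HTML wrappers, but it can also nibble ordinary characters at the edges of a message,
# e.g. 'dashed answer'.strip('<div class="markdown-body">') == 'hed answe'.
# A stricter alternative would be str.removeprefix()/str.removesuffix() (Python 3.9+)
# or a regex anchored to the exact tags.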
def visualize_audio(chatbot, audio_shape):
    if len(chatbot) == 0: chatbot.append(["[ 请讲话 ]", "[ 正在等您说完问题 ]"])
    chatbot[-1] = list(chatbot[-1])
    p1 = ''
    p2 = ''
    chatbot[-1][-1] = re.sub(p1+r'(.*)'+p2, '', chatbot[-1][-1])
    chatbot[-1][-1] += (p1+f"`{audio_shape}`"+p2)
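# Illustrative sketch (not part of the commit): self.audio_shape carries the per-frame VAD
# string produced by is_speaker_speaking(), e.g. "..^^^^^..." where '^' marks a frame with
# detected speech. visualize_audio() rewrites the last chatbot reply so that it ends with
# the fresh marker wrapped in backticks, giving the waiting bubble a tiny live view of
# microphone activity.
if __name__ == "__main__":
    demo_chatbot = []
    visualize_audio(demo_chatbot, "..^^^^^...")
    print(demo_chatbot[-1][-1])  # ends with `..^^^^^...`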
class AsyncGptTask():
    def __init__(self) -> None:
        self.observe_future = []
@@ -81,8 +94,9 @@ class InterviewAssistant(AliyunASR):
        self.capture_interval = 0.5  # second
        self.stop = False
        self.parsed_text = ""       # the already-finished part of the next sentence, written by test_on_result_chg()
        self.parsed_sentence = ""   # the complete sentence of an utterance, written by test_on_sentence_end()
        self.buffered_sentence = "" #
        self.audio_shape = ""       # visual representation of the audio, written by audio_convertion_thread()
        self.event_on_result_chg = threading.Event()
        self.event_on_entence_end = threading.Event()
        self.event_on_commit_question = threading.Event()
@@ -117,7 +131,7 @@ class InterviewAssistant(AliyunASR):
    def begin(self, llm_kwargs, plugin_kwargs, chatbot, history, system_prompt):
        # main plugin function
        self.init(chatbot)
        chatbot.append(["[ 请讲话 ]", "[ 正在等您说完问题 ]"])
        yield from update_ui(chatbot=chatbot, history=history)  # refresh the UI
        self.plugin_wd.begin_watch()
        self.agt = AsyncGptTask()
@@ -157,14 +171,18 @@ class InterviewAssistant(AliyunASR):
                self.commit_wd.begin_watch()
                chatbot[-1] = list(chatbot[-1])
                chatbot[-1] = [self.buffered_sentence, "[ 等待GPT响应 ]"]
                yield from update_ui(chatbot=chatbot, history=history)  # refresh the UI
                # add gpt task: spawn a worker thread for the GPT request to avoid blocking
                history = chatbot2history(chatbot)
                self.agt.add_async_gpt_task(self.buffered_sentence, len(chatbot)-1, llm_kwargs, history, system_prompt)
                self.buffered_sentence = ""
                chatbot.append(["[ 请讲话 ]", "[ 正在等您说完问题 ]"])
                yield from update_ui(chatbot=chatbot, history=history)  # refresh the UI

            if not self.event_on_result_chg.is_set() and not self.event_on_entence_end.is_set() and not self.event_on_commit_question.is_set():
                visualize_audio(chatbot, self.audio_shape)
                yield from update_ui(chatbot=chatbot, history=history)  # refresh the UI

            if len(self.stop_msg) != 0: