Browse Source

解决语音的识别和转换兼容性

master
zwssunny 1 year ago
parent
commit
24de670c2c
3 changed files with 37 additions and 83 deletions
  1. +18
    -10
      channel/wechat/wechat_channel.py
  2. +16
    -66
      channel/wechat/wechaty_channel.py
  3. +3
    -7
      voice/google/google_voice.py

+ 18
- 10
channel/wechat/wechat_channel.py View File

@@ -5,6 +5,9 @@ wechat channel
"""


import os
import requests
import io
import time
from lib import itchat
import json
from lib.itchat.content import *
@@ -17,9 +20,7 @@ from common.tmp_dir import TmpDir
from config import conf
from common.time_check import time_checker
from plugins import *
import requests
import io
import time
from voice.audio_convert import mp3_to_wav




thread_pool = ThreadPoolExecutor(max_workers=8)
@@ -28,8 +29,7 @@ thread_pool = ThreadPoolExecutor(max_workers=8)
def thread_pool_callback(worker):
worker_exception = worker.exception()
if worker_exception:
logger.exception(
"Worker return exception: {}".format(worker_exception))
logger.exception("Worker return exception: {}".format(worker_exception))




@itchat.msg_register(TEXT)
@@ -247,9 +247,16 @@ class WechatChannel(Channel):
reply = super().build_reply_content(context.content, context)
elif context.type == ContextType.VOICE:
msg = context['msg']
file_name = TmpDir().path() + context.content
msg.download(file_name)
reply = super().build_voice_to_text(file_name)
mp3_path = TmpDir().path() + context.content
msg.download(mp3_path)
# mp3转wav
wav_path = os.path.splitext(mp3_path)[0] + '.wav'
mp3_to_wav(mp3_path=mp3_path, wav_path=wav_path)
# 语音识别
reply = super().build_voice_to_text(wav_path)
# 删除临时文件
os.remove(wav_path)
os.remove(mp3_path)
if reply.type != ReplyType.ERROR and reply.type != ReplyType.INFO:
context.content = reply.content  # 语音转文字后,将文字内容作为新的context
context.type = ContextType.TEXT
@@ -263,12 +270,13 @@ class WechatChannel(Channel):
prefixes = conf().get('group_chat_prefix')
for prefix in prefixes:
if context.content.startswith(prefix):
context.content = context.content.replace(prefix, '', 1).strip()
context.content = context.content.replace(
prefix, '', 1).strip()
break
else:
logger.info("[WX]receive voice check prefix: " + 'False')
return
reply = super().build_reply_content(context.content, context)
if reply.type == ReplyType.TEXT:
if conf().get('voice_reply_voice'):


+ 16
- 66
channel/wechat/wechaty_channel.py View File

@@ -4,25 +4,19 @@
wechaty channel
Python Wechaty - https://github.com/wechaty/python-wechaty
"""
import io
import os
import json
import time
import asyncio
import requests
import pysilk
import wave
from pydub import AudioSegment
from typing import Optional, Union
from bridge.context import Context, ContextType
from wechaty_puppet import MessageType, FileBox, ScanStatus  # type: ignore
from wechaty import Wechaty, Contact
from wechaty.user import Message, Room, MiniProgram, UrlLink
from wechaty.user import Message, MiniProgram, UrlLink
from channel.channel import Channel
from common.log import logger
from common.tmp_dir import TmpDir
from config import conf
from voice.audio_convert import sil_to_wav, mp3_to_sil


class WechatyChannel(Channel):


@@ -50,8 +44,8 @@ class WechatyChannel(Channel):


async def on_scan(self, status: ScanStatus, qr_code: Optional[str] = None,
data: Optional[str] = None):
contact = self.Contact.load(self.contact_id)
logger.info('[WX] scan user={}, scan status={}, scan qr_code={}'.format(contact, status.name, qr_code))
# contact = self.Contact.load(self.contact_id)
# logger.info('[WX] scan user={}, scan status={}, scan qr_code={}'.format(contact, status.name, qr_code))
# print(f'user <{contact}> scan status: {status.name} , 'f'qr_code: {qr_code}')


async def on_message(self, msg: Message):
@@ -67,7 +61,7 @@ class WechatyChannel(Channel):
content = msg.text()
mention_content = await msg.mention_text()  # 返回过滤掉@name后的消息
match_prefix = self.check_prefix(content, conf().get('single_chat_prefix'))
conversation: Union[Room, Contact] = from_contact if room is None else room
# conversation: Union[Room, Contact] = from_contact if room is None else room


if room is None and msg.type() == MessageType.MESSAGE_TYPE_TEXT:
if not msg.is_self() and match_prefix is not None:
@@ -102,21 +96,8 @@ class WechatyChannel(Channel):
await voice_file.to_file(silk_file)
logger.info("[WX]receive voice file: " + silk_file)
# 将文件转成wav格式音频
wav_file = silk_file.replace(".slk", ".wav")
with open(silk_file, 'rb') as f:
silk_data = f.read()
pcm_data = pysilk.decode(silk_data)

with wave.open(wav_file, 'wb') as wav_data:
wav_data.setnchannels(1)
wav_data.setsampwidth(2)
wav_data.setframerate(24000)
wav_data.writeframes(pcm_data)
if os.path.exists(wav_file):
converter_state = "true" # 转换wav成功
else:
converter_state = "false" # 转换wav失败
logger.info("[WX]receive voice converter: " + converter_state)
wav_file = os.path.splitext(silk_file)[0] + '.wav'
sil_to_wav(silk_file, wav_file)
# 语音识别为文本
query = super().build_voice_to_text(wav_file).content
# 交验关键字
@@ -183,21 +164,8 @@ class WechatyChannel(Channel):
await voice_file.to_file(silk_file)
logger.info("[WX]receive voice file: " + silk_file)
# 将文件转成wav格式音频
wav_file = silk_file.replace(".slk", ".wav")
with open(silk_file, 'rb') as f:
silk_data = f.read()
pcm_data = pysilk.decode(silk_data)

with wave.open(wav_file, 'wb') as wav_data:
wav_data.setnchannels(1)
wav_data.setsampwidth(2)
wav_data.setframerate(24000)
wav_data.writeframes(pcm_data)
if os.path.exists(wav_file):
converter_state = "true" # 转换wav成功
else:
converter_state = "false" # 转换wav失败
logger.info("[WX]receive voice converter: " + converter_state)
wav_file = os.path.splitext(silk_file)[0] + '.wav'
sil_to_wav(silk_file, wav_file)
# 语音识别为文本
query = super().build_voice_to_text(wav_file).content
# 校验关键字
@@ -260,21 +228,12 @@ class WechatyChannel(Channel):
if reply_text:
# 转换 mp3 文件为 silk 格式
mp3_file = super().build_text_to_voice(reply_text).content
silk_file = mp3_file.replace(".mp3", ".silk")
# Load the MP3 file
audio = AudioSegment.from_file(mp3_file, format="mp3")
# Convert to WAV format
audio = audio.set_frame_rate(24000).set_channels(1)
wav_data = audio.raw_data
sample_width = audio.sample_width
# Encode to SILK format
silk_data = pysilk.encode(wav_data, 24000)
# Save the silk file
with open(silk_file, "wb") as f:
f.write(silk_data)
silk_file = os.path.splitext(mp3_file)[0] + '.sil'
voiceLength = mp3_to_sil(mp3_file, silk_file)
# 发送语音
t = int(time.time())
file_box = FileBox.from_file(silk_file, name=str(t) + '.silk')
file_box = FileBox.from_file(silk_file, name=str(t) + '.sil')
file_box.metadata = {'voiceLength': voiceLength}
await self.send(file_box, reply_user_id)
# 清除缓存文件
os.remove(mp3_file)
@@ -337,21 +296,12 @@ class WechatyChannel(Channel):
reply_text = '@' + group_user_name + ' ' + reply_text.strip()
# 转换 mp3 文件为 silk 格式
mp3_file = super().build_text_to_voice(reply_text).content
silk_file = mp3_file.replace(".mp3", ".silk")
# Load the MP3 file
audio = AudioSegment.from_file(mp3_file, format="mp3")
# Convert to WAV format
audio = audio.set_frame_rate(24000).set_channels(1)
wav_data = audio.raw_data
sample_width = audio.sample_width
# Encode to SILK format
silk_data = pysilk.encode(wav_data, 24000)
# Save the silk file
with open(silk_file, "wb") as f:
f.write(silk_data)
silk_file = os.path.splitext(mp3_file)[0] + '.sil'
voiceLength = mp3_to_sil(mp3_file, silk_file)
# 发送语音
t = int(time.time())
file_box = FileBox.from_file(silk_file, name=str(t) + '.silk')
file_box.metadata = {'voiceLength': voiceLength}
await self.send_group(file_box, group_id)
# 清除缓存文件
os.remove(mp3_file)


+ 3
- 7
voice/google/google_voice.py View File

@@ -3,17 +3,14 @@
google voice service
"""


import pathlib
import subprocess
import time
from bridge.reply import Reply, ReplyType
import speech_recognition
import pyttsx3
from gtts import gTTS
from bridge.reply import Reply, ReplyType
from common.log import logger
from common.tmp_dir import TmpDir
from voice.voice import Voice
from voice.audio_convert import mp3_to_wav




class GoogleVoice(Voice):
@@ -30,11 +27,10 @@ class GoogleVoice(Voice):
self.engine.setProperty('voice', voices[1].id)


def voiceToText(self, voice_file):
new_file = voice_file.replace('.mp3', '.wav')
# new_file = voice_file.replace('.mp3', '.wav')
# subprocess.call('ffmpeg -i ' + voice_file +
#                 ' -acodec pcm_s16le -ac 1 -ar 16000 ' + new_file, shell=True)
mp3_to_wav(voice_file, new_file)
with speech_recognition.AudioFile(new_file) as source:
with speech_recognition.AudioFile(voice_file) as source:
audio = self.recognizer.record(source)
try:
text = self.recognizer.recognize_google(audio, language='zh-CN')


Loading…
Cancel
Save