
Resolve compatibility issues in voice recognition and conversion

Branch: develop
Author: zwssunny, 1 year ago
Commit: 24de670c2c
3 changed files with 37 additions and 83 deletions
  1. channel/wechat/wechat_channel.py (+18, -10)
  2. channel/wechat/wechaty_channel.py (+16, -66)
  3. voice/google/google_voice.py (+3, -7)

channel/wechat/wechat_channel.py (+18, -10)

@@ -5,6 +5,9 @@ wechat channel
"""

import os
import requests
import io
import time
from lib import itchat
import json
from lib.itchat.content import *
@@ -17,9 +20,7 @@ from common.tmp_dir import TmpDir
from config import conf
from common.time_check import time_checker
from plugins import *
import requests
import io
import time
from voice.audio_convert import mp3_to_wav


thread_pool = ThreadPoolExecutor(max_workers=8)
@@ -28,8 +29,7 @@ thread_pool = ThreadPoolExecutor(max_workers=8)
def thread_pool_callback(worker):
worker_exception = worker.exception()
if worker_exception:
logger.exception(
"Worker return exception: {}".format(worker_exception))
logger.exception("Worker return exception: {}".format(worker_exception))


@itchat.msg_register(TEXT)
@@ -247,9 +247,16 @@ class WechatChannel(Channel):
reply = super().build_reply_content(context.content, context)
elif context.type == ContextType.VOICE:
msg = context['msg']
file_name = TmpDir().path() + context.content
msg.download(file_name)
reply = super().build_voice_to_text(file_name)
mp3_path = TmpDir().path() + context.content
msg.download(mp3_path)
# Convert mp3 to wav
wav_path = os.path.splitext(mp3_path)[0] + '.wav'
mp3_to_wav(mp3_path=mp3_path, wav_path=wav_path)
# Speech recognition
reply = super().build_voice_to_text(wav_path)
# Remove temporary files
os.remove(wav_path)
os.remove(mp3_path)
if reply.type != ReplyType.ERROR and reply.type != ReplyType.INFO:
context.content = reply.content # after speech-to-text, use the text as the new context
context.type = ContextType.TEXT
@@ -263,12 +270,13 @@ class WechatChannel(Channel):
prefixes = conf().get('group_chat_prefix')
for prefix in prefixes:
if context.content.startswith(prefix):
context.content = context.content.replace(prefix, '', 1).strip()
context.content = context.content.replace(
prefix, '', 1).strip()
break
else:
logger.info("[WX]receive voice check prefix: " + 'False')
return
reply = super().build_reply_content(context.content, context)
if reply.type == ReplyType.TEXT:
if conf().get('voice_reply_voice'):
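
Note: the new mp3_to_wav helper comes from the voice/audio_convert module introduced alongside this change, which is not shown in this commit view. Below is a minimal sketch of what such a helper could look like, assuming a pydub-based implementation and the same mono / 16-bit / 16 kHz output as the ffmpeg call removed from google_voice.py; it is illustrative only, not the actual module.

# voice/audio_convert.py -- illustrative sketch, not part of this diff
from pydub import AudioSegment

def mp3_to_wav(mp3_path, wav_path):
    # Decode the mp3 (pydub shells out to ffmpeg under the hood).
    audio = AudioSegment.from_file(mp3_path, format="mp3")
    # Mono, 16-bit, 16 kHz matches the flags of the ffmpeg command
    # removed from google_voice.py (-acodec pcm_s16le -ac 1 -ar 16000).
    audio = audio.set_channels(1).set_sample_width(2).set_frame_rate(16000)
    audio.export(wav_path, format="wav")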


channel/wechat/wechaty_channel.py (+16, -66)

@@ -4,25 +4,19 @@
wechaty channel
Python Wechaty - https://github.com/wechaty/python-wechaty
"""
import io
import os
import json
import time
import asyncio
import requests
import pysilk
import wave
from pydub import AudioSegment
from typing import Optional, Union
from bridge.context import Context, ContextType
from wechaty_puppet import MessageType, FileBox, ScanStatus # type: ignore
from wechaty import Wechaty, Contact
from wechaty.user import Message, Room, MiniProgram, UrlLink
from wechaty.user import Message, MiniProgram, UrlLink
from channel.channel import Channel
from common.log import logger
from common.tmp_dir import TmpDir
from config import conf
from voice.audio_convert import sil_to_wav, mp3_to_sil

class WechatyChannel(Channel):

@@ -50,8 +44,8 @@ class WechatyChannel(Channel):

async def on_scan(self, status: ScanStatus, qr_code: Optional[str] = None,
data: Optional[str] = None):
contact = self.Contact.load(self.contact_id)
logger.info('[WX] scan user={}, scan status={}, scan qr_code={}'.format(contact, status.name, qr_code))
# contact = self.Contact.load(self.contact_id)
# logger.info('[WX] scan user={}, scan status={}, scan qr_code={}'.format(contact, status.name, qr_code))
# print(f'user <{contact}> scan status: {status.name} , 'f'qr_code: {qr_code}')

async def on_message(self, msg: Message):
@@ -67,7 +61,7 @@ class WechatyChannel(Channel):
content = msg.text()
mention_content = await msg.mention_text() # returns the message with the @name mention stripped
match_prefix = self.check_prefix(content, conf().get('single_chat_prefix'))
conversation: Union[Room, Contact] = from_contact if room is None else room
# conversation: Union[Room, Contact] = from_contact if room is None else room

if room is None and msg.type() == MessageType.MESSAGE_TYPE_TEXT:
if not msg.is_self() and match_prefix is not None:
@@ -102,21 +96,8 @@ class WechatyChannel(Channel):
await voice_file.to_file(silk_file)
logger.info("[WX]receive voice file: " + silk_file)
# Convert the file to wav audio
wav_file = silk_file.replace(".slk", ".wav")
with open(silk_file, 'rb') as f:
silk_data = f.read()
pcm_data = pysilk.decode(silk_data)

with wave.open(wav_file, 'wb') as wav_data:
wav_data.setnchannels(1)
wav_data.setsampwidth(2)
wav_data.setframerate(24000)
wav_data.writeframes(pcm_data)
if os.path.exists(wav_file):
converter_state = "true" # wav conversion succeeded
else:
converter_state = "false" # wav conversion failed
logger.info("[WX]receive voice converter: " + converter_state)
wav_file = os.path.splitext(silk_file)[0] + '.wav'
sil_to_wav(silk_file, wav_file)
# Recognize the voice as text
query = super().build_voice_to_text(wav_file).content
# Check keywords
@@ -183,21 +164,8 @@ class WechatyChannel(Channel):
await voice_file.to_file(silk_file)
logger.info("[WX]receive voice file: " + silk_file)
# Convert the file to wav audio
wav_file = silk_file.replace(".slk", ".wav")
with open(silk_file, 'rb') as f:
silk_data = f.read()
pcm_data = pysilk.decode(silk_data)

with wave.open(wav_file, 'wb') as wav_data:
wav_data.setnchannels(1)
wav_data.setsampwidth(2)
wav_data.setframerate(24000)
wav_data.writeframes(pcm_data)
if os.path.exists(wav_file):
converter_state = "true" # wav conversion succeeded
else:
converter_state = "false" # wav conversion failed
logger.info("[WX]receive voice converter: " + converter_state)
wav_file = os.path.splitext(silk_file)[0] + '.wav'
sil_to_wav(silk_file, wav_file)
# Recognize the voice as text
query = super().build_voice_to_text(wav_file).content
# Check keywords
@@ -260,21 +228,12 @@ class WechatyChannel(Channel):
if reply_text:
# Convert the mp3 file to silk format
mp3_file = super().build_text_to_voice(reply_text).content
silk_file = mp3_file.replace(".mp3", ".silk")
# Load the MP3 file
audio = AudioSegment.from_file(mp3_file, format="mp3")
# Convert to WAV format
audio = audio.set_frame_rate(24000).set_channels(1)
wav_data = audio.raw_data
sample_width = audio.sample_width
# Encode to SILK format
silk_data = pysilk.encode(wav_data, 24000)
# Save the silk file
with open(silk_file, "wb") as f:
f.write(silk_data)
silk_file = os.path.splitext(mp3_file)[0] + '.sil'
voiceLength = mp3_to_sil(mp3_file, silk_file)
# Send the voice message
t = int(time.time())
file_box = FileBox.from_file(silk_file, name=str(t) + '.silk')
file_box = FileBox.from_file(silk_file, name=str(t) + '.sil')
file_box.metadata = {'voiceLength': voiceLength}
await self.send(file_box, reply_user_id)
# Remove cached files
os.remove(mp3_file)
@@ -337,21 +296,12 @@ class WechatyChannel(Channel):
reply_text = '@' + group_user_name + ' ' + reply_text.strip()
# Convert the mp3 file to silk format
mp3_file = super().build_text_to_voice(reply_text).content
silk_file = mp3_file.replace(".mp3", ".silk")
# Load the MP3 file
audio = AudioSegment.from_file(mp3_file, format="mp3")
# Convert to WAV format
audio = audio.set_frame_rate(24000).set_channels(1)
wav_data = audio.raw_data
sample_width = audio.sample_width
# Encode to SILK format
silk_data = pysilk.encode(wav_data, 24000)
# Save the silk file
with open(silk_file, "wb") as f:
f.write(silk_data)
silk_file = os.path.splitext(mp3_file)[0] + '.sil'
voiceLength = mp3_to_sil(mp3_file, silk_file)
# Send the voice message
t = int(time.time())
file_box = FileBox.from_file(silk_file, name=str(t) + '.silk')
file_box.metadata = {'voiceLength': voiceLength}
await self.send_group(file_box, group_id)
# Remove cached files
os.remove(mp3_file)
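
Note: sil_to_wav and mp3_to_sil replace the inline pysilk/pydub code deleted above; the voice/audio_convert module itself is not shown in this view. Below is a rough sketch that mirrors the removed logic (silk decoded to a 24 kHz mono 16-bit wav, and mp3 re-encoded to silk at 24 kHz). The assumption that mp3_to_sil returns the audio length in milliseconds comes from pydub's duration unit and the voiceLength metadata set on the FileBox, not from the commit itself.

# voice/audio_convert.py -- illustrative sketch, not part of this diff
import wave
import pysilk
from pydub import AudioSegment

def sil_to_wav(silk_path, wav_path):
    # Decode the silk payload to raw PCM, as the removed inline code did.
    with open(silk_path, 'rb') as f:
        pcm_data = pysilk.decode(f.read())
    # Wrap the PCM data in a 24 kHz, mono, 16-bit wav container.
    with wave.open(wav_path, 'wb') as wav_data:
        wav_data.setnchannels(1)
        wav_data.setsampwidth(2)
        wav_data.setframerate(24000)
        wav_data.writeframes(pcm_data)

def mp3_to_sil(mp3_path, silk_path):
    # Resample the mp3 to 24 kHz mono and encode the raw PCM with pysilk,
    # matching the removed pysilk.encode(wav_data, 24000) call.
    audio = AudioSegment.from_file(mp3_path, format="mp3")
    audio = audio.set_frame_rate(24000).set_channels(1)
    silk_data = pysilk.encode(audio.raw_data, 24000)
    with open(silk_path, 'wb') as f:
        f.write(silk_data)
    # Assumed return value: duration in milliseconds (len() of a pydub segment).
    return len(audio)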


voice/google/google_voice.py (+3, -7)

@@ -3,17 +3,14 @@
google voice service
"""

import pathlib
import subprocess
import time
from bridge.reply import Reply, ReplyType
import speech_recognition
import pyttsx3
from gtts import gTTS
from bridge.reply import Reply, ReplyType
from common.log import logger
from common.tmp_dir import TmpDir
from voice.voice import Voice
from voice.audio_convert import mp3_to_wav


class GoogleVoice(Voice):
@@ -30,11 +27,10 @@ class GoogleVoice(Voice):
self.engine.setProperty('voice', voices[1].id)

def voiceToText(self, voice_file):
new_file = voice_file.replace('.mp3', '.wav')
# new_file = voice_file.replace('.mp3', '.wav')
# subprocess.call('ffmpeg -i ' + voice_file +
# ' -acodec pcm_s16le -ac 1 -ar 16000 ' + new_file, shell=True)
mp3_to_wav(voice_file, new_file)
with speech_recognition.AudioFile(new_file) as source:
with speech_recognition.AudioFile(voice_file) as source:
audio = self.recognizer.record(source)
try:
text = self.recognizer.recognize_google(audio, language='zh-CN')
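
With this change, GoogleVoice.voiceToText expects a wav file that the calling channel has already produced, instead of converting the mp3 itself. A hypothetical call sequence is sketched below; the file paths and the direct GoogleVoice() instantiation are illustrative, not taken from the commit.

# Channel side: convert first, then hand the wav to the voice service.
from voice.audio_convert import mp3_to_wav
from voice.google.google_voice import GoogleVoice

mp3_path = 'tmp/demo-voice.mp3'   # hypothetical downloaded voice message
wav_path = 'tmp/demo-voice.wav'
mp3_to_wav(mp3_path=mp3_path, wav_path=wav_path)

reply = GoogleVoice().voiceToText(wav_path)  # speech_recognition reads the wav directly
print(reply.content)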

