From 6fed719e09fb3b0182167c1b725b5274481c714c Mon Sep 17 00:00:00 2001 From: njnuko Date: Mon, 20 May 2024 15:04:23 +0800 Subject: [PATCH] add Xunfei Voice Signed-off-by: njnuko --- voice/factory.py | 4 + voice/xunfei/config.json.template | 7 + voice/xunfei/xunfei_asr.py | 209 ++++++++++++++++++++++++++++++ voice/xunfei/xunfei_tts.py | 163 +++++++++++++++++++++++ voice/xunfei/xunfei_voice.py | 77 +++++++++++ 5 files changed, 460 insertions(+) create mode 100644 voice/xunfei/config.json.template create mode 100644 voice/xunfei/xunfei_asr.py create mode 100644 voice/xunfei/xunfei_tts.py create mode 100644 voice/xunfei/xunfei_voice.py diff --git a/voice/factory.py b/voice/factory.py index bc9c9c3..fa8b79e 100644 --- a/voice/factory.py +++ b/voice/factory.py @@ -46,4 +46,8 @@ def create_voice(voice_type): from voice.edge.edge_voice import EdgeVoice return EdgeVoice() + elif voice_type == "xunfei": + from voice.xunfei.xunfei_voice import XunfeiVoice + + return XunfeiVoice() raise RuntimeError diff --git a/voice/xunfei/config.json.template b/voice/xunfei/config.json.template new file mode 100644 index 0000000..1a02f28 --- /dev/null +++ b/voice/xunfei/config.json.template @@ -0,0 +1,7 @@ +{ + "APPID":"xxx71xxx", #讯飞xfyun.cn控制台中应用的ID + "APIKey":"xxxx69058exxxxxx", #讯飞xfyun.cn控制台语音合成或者听写界面的APIKey + "APISecret":"xxxx697f0xxxxxx", #讯飞xfyun.cn控制台语音合成或者听写界面的APISecret + "BusinessArgsTTS":{"aue": "lame", "sfl": 1, "auf": "audio/L16;rate=16000", "vcn": "xiaoyan", "tte": "utf8"}, #语音合成的参数,具体可以参考xfyun.cn的文档 + "BusinessArgsASR":{"domain": "iat", "language": "zh_cn", "accent": "mandarin", "vad_eos":10000, "dwa": "wpgs"} #语音听写的参数,具体可以参考xfyun.cn的文档 +} diff --git a/voice/xunfei/xunfei_asr.py b/voice/xunfei/xunfei_asr.py new file mode 100644 index 0000000..f3cc9d5 --- /dev/null +++ b/voice/xunfei/xunfei_asr.py @@ -0,0 +1,209 @@ +# -*- coding:utf-8 -*- +# +# Author: njnuko +# Email: njnuko@163.com +# +# 这个文档是基于官方的demo来改的,具体官方demo文档请参考官网 +# +# 语音听写流式 WebAPI 接口调用示例 
# API documentation (must read): https://doc.xfyun.cn/rest_api/语音听写(流式版).html
# Reference thread for the WebAPI dictation service (must read): http://bbs.xfyun.cn/forum.php?mod=viewthread&tid=38947&extra=
# Hot words for the streaming dictation WebAPI: log in to the open platform at https://www.xfyun.cn/, then go to
# Console -> My Apps -> Speech Dictation (streaming) -> Service Management -> Personalized Hot Words
# and configure the hot words there.
# Note: hot words only raise the recognition weight of the listed terms. They improve the hit rate of
# those terms but do not guarantee it; verify the actual effect with your own tests.
# Dialects for the streaming dictation WebAPI: log in to https://www.xfyun.cn/, then go to
# Console -> My Apps -> Speech Dictation (streaming) -> Service Management -> Recognition Language List.
# Languages or dialects can be added there; once added, the parameter value for that dialect is shown.
# Error codes: https://www.xfyun.cn/document/error-code (must read when `code` reports an error)
# # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # #

import websocket
import datetime
import hashlib
import base64
import hmac
import json
from urllib.parse import urlencode
import time
import ssl
from wsgiref.handlers import format_date_time
from datetime import datetime
from time import mktime
import _thread as thread
import os
import wave


STATUS_FIRST_FRAME = 0  # marker for the first audio frame
STATUS_CONTINUE_FRAME = 1  # marker for an intermediate audio frame
STATUS_LAST_FRAME = 2  # marker for the last audio frame

#############
# whole_dict stores the recognition results keyed by sentence number (sn).
# Because the service may correct earlier results, entries are popped when
# they are superseded and the remaining values are concatenated at the end.
# It is initialised here so on_message never sees an undefined name.
whole_dict = {}
# wsParam holds the Ws_Param of the request in flight; this module is adapted
# from the official demo and the websocket callbacks read it as a global.
wsParam = None
##############


class Ws_Param(object):
    """Request parameters for one streaming-dictation (IAT) websocket session.

    Attributes set by __init__:
        APPID, APIKey, APISecret: credentials, stored as given.
        AudioFile: path of the audio file to stream.
        BusinessArgs: dict sent as the "business" section of the first frame.
        CommonArgs: {"app_id": APPID}, sent as the "common" section.
    """

    # Initialisation
    def __init__(self, APPID, APIKey, APISecret, BusinessArgs, AudioFile):
        self.APPID = APPID
        self.APIKey = APIKey
        self.APISecret = APISecret
        self.AudioFile = AudioFile
        self.BusinessArgs = BusinessArgs
        # Common parameters (common)
        self.CommonArgs = {"app_id": self.APPID}
        # Business parameters (business) are supplied by the caller; more
        # options are listed on the official site. An example value is:
        # {"domain": "iat", "language": "zh_cn", "accent": "mandarin", "vinfo":1,"vad_eos":10000}

    # Build the request URL
    def create_url(self):
        """Return the wss:// URL carrying the HMAC-SHA256 authorization query.

        The signature covers the host, an RFC 1123 date and the request line,
        is keyed with APISecret, and is sent base64-encoded together with the
        APIKey in the `authorization` query parameter.
        """
        url = 'wss://ws-api.xfyun.cn/v2/iat'
        # Generate an RFC 1123 formatted timestamp
        now = datetime.now()
        date = format_date_time(mktime(now.timetuple()))

        # Assemble the string to sign
        signature_origin = "host: " + "ws-api.xfyun.cn" + "\n"
        signature_origin += "date: " + date + "\n"
        signature_origin += "GET " + "/v2/iat " + "HTTP/1.1"
        # Sign it with hmac-sha256
        signature_sha = hmac.new(self.APISecret.encode('utf-8'), signature_origin.encode('utf-8'),
                                 digestmod=hashlib.sha256).digest()
        signature_sha = base64.b64encode(signature_sha).decode(encoding='utf-8')

        authorization_origin = "api_key=\"%s\", algorithm=\"%s\", headers=\"%s\", signature=\"%s\"" % (
            self.APIKey, "hmac-sha256", "host date request-line", signature_sha)
        authorization = base64.b64encode(authorization_origin.encode('utf-8')).decode(encoding='utf-8')
        # Collect the authentication parameters into a dict
        v = {
            "authorization": authorization,
            "date": date,
            "host": "ws-api.xfyun.cn"
        }
        # Append the authentication parameters to form the final URL
        url = url + '?' + urlencode(v)
        # When debugging against the official demo, print `date`, `v` and `url`
        # here and compare them with the URL generated by your own code.
        return url


# Handler for incoming websocket messages
def on_message(ws, message):
    """Parse one server message and merge its text into whole_dict.

    On a non-zero `code` the error is printed and nothing is stored. On
    success the words of this result are concatenated and stored under the
    result's own sentence number `sn`. When the result carries an `rg` range,
    the entries for every sentence number in that inclusive range are dropped
    first, because this result replaces them.

    Any exception raised while parsing is printed, never propagated.
    """
    global whole_dict
    try:
        payload = json.loads(message)  # decode once instead of per field
        code = payload["code"]
        sid = payload["sid"]
        if code != 0:
            errMsg = payload["message"]
            print("sid:%s call error:%s code is:%s" % (sid, errMsg, code))
            return

        result = payload["data"]["result"]
        data = result["ws"]
        sn = result["sn"]
        if "rg" in result:
            rep_start, rep_end = result["rg"][0], result["rg"][1]
            # Use a separate loop variable: reusing `sn` here would overwrite
            # this result's sentence number and file the text under rep_end.
            for stale_sn in range(rep_start, rep_end + 1):
                whole_dict.pop(stale_sn, None)
        whole_dict[sn] = "".join(w["w"] for i in data for w in i["cw"])
        print("sid:%s call success!,data is:%s" % (sid, json.dumps(data, ensure_ascii=False)))
    except Exception as e:
        print("receive msg,but parse exception:", e)


# Handler for websocket errors
def on_error(ws, error):
    """Print the error reported by the websocket client."""
    print("### error:", error)


# Handler for websocket close
def on_close(ws, a=None, b=None):
    """Print a notice when the connection closes.

    `a` and `b` receive whatever extra positional arguments the websocket
    client passes (ignored here); they default to None so the callback also
    works when it is invoked with the socket only.
    """
    print("### closed ###")
# Handler invoked once the websocket connection is established.
def on_open(ws):
    """Start a background thread that streams wsParam.AudioFile to the server.

    The file is opened with the `wave` module and sent in chunks of 8000 wave
    frames. The first message also carries the `common` and `business`
    sections; an empty read marks the end of the file and is sent as the last
    frame. The socket is closed when streaming ends, including when it ends
    with an exception, so that run_forever() in the caller can return.
    """
    global wsParam

    def build_frame(status, chunk):
        # Serialise one audio frame; only the first frame carries the
        # app id (common) and the business parameters.
        frame = {}
        if status == STATUS_FIRST_FRAME:
            frame["common"] = wsParam.CommonArgs
            frame["business"] = wsParam.BusinessArgs
        # NOTE(review): the format is hard-coded, which presumably requires
        # the input to be 16 kHz 16-bit PCM - confirm against the callers.
        frame["data"] = {"status": status, "format": "audio/L16;rate=16000",
                         "audio": str(base64.b64encode(chunk), 'utf-8'),
                         "encoding": "raw"}
        return json.dumps(frame)

    def run(*args):
        frameSize = 8000  # number of wave frames read per message
        intervel = 0.04  # pause between messages, in seconds
        status = STATUS_FIRST_FRAME  # first / intermediate / last frame marker

        try:
            with wave.open(wsParam.AudioFile, "rb") as fp:
                while True:
                    buf = fp.readframes(frameSize)
                    # End of file: whatever is sent next is the last frame
                    if not buf:
                        status = STATUS_LAST_FRAME
                    ws.send(build_frame(status, buf))
                    if status == STATUS_LAST_FRAME:
                        # presumably leaves time for the final result to
                        # arrive before the socket is closed - TODO confirm
                        time.sleep(1)
                        break
                    status = STATUS_CONTINUE_FRAME
                    # Simulate the audio sampling interval
                    time.sleep(intervel)
        finally:
            # Close even when reading or sending failed; otherwise the caller
            # stays blocked in run_forever() on a connection nobody feeds.
            ws.close()

    thread.start_new_thread(run, ())


# Entry point called by xunfei_voice
def xunfei_asr(APPID, APISecret, APIKey, BusinessArgsASR, AudioFile):
    """Recognise the speech in AudioFile and return the text.

    Opens a websocket session, blocks until it is closed, then concatenates
    the collected results in ascending sentence-number order. Returns an
    empty string when no result was collected.

    The request state lives in the module globals `whole_dict` and `wsParam`,
    which every call overwrites.
    """
    global whole_dict
    global wsParam
    whole_dict = {}
    # wsParam is a global because the on_open callback reads it
    wsParam = Ws_Param(APPID=APPID, APISecret=APISecret,
                       APIKey=APIKey, BusinessArgs=BusinessArgsASR,
                       AudioFile=AudioFile)
    # Tracing dumps the signed handshake URL and every audio frame, so it is
    # kept off, matching the TTS module.
    websocket.enableTrace(False)
    wsUrl = wsParam.create_url()
    ws = websocket.WebSocketApp(wsUrl, on_message=on_message, on_error=on_error, on_close=on_close)
    ws.on_open = on_open
    # NOTE(review): CERT_NONE disables TLS certificate verification; kept as
    # in the original, but consider verifying the server certificate.
    ws.run_forever(sslopt={"cert_reqs": ssl.CERT_NONE})
    # Merge the per-sentence results into the final recognition output
    return "".join(whole_dict[sn] for sn in sorted(whole_dict))


# (patch metadata carried over from the collapsed source line)
# diff --git a/voice/xunfei/xunfei_tts.py b/voice/xunfei/xunfei_tts.py new
# (patch metadata carried over from the collapsed source line)
# file mode 100644 index 0000000..d289fc8 --- /dev/null +++ b/voice/xunfei/xunfei_tts.py @@ -0,0 +1,163 @@
# -*- coding:utf-8 -*-
#
# Author: njnuko
# Email: njnuko@163.com
#
# This module is adapted from the official demo; see the official site for the original demo.
#
# NOTE(review): the notes below mention the streaming dictation service and appear to have been
# copied from the ASR module, while this module calls the /v2/tts endpoint - confirm and update.
#
# Streaming speech dictation WebAPI call example. API documentation (must read): https://doc.xfyun.cn/rest_api/语音听写(流式版).html
# Reference thread for the WebAPI dictation service (must read): http://bbs.xfyun.cn/forum.php?mod=viewthread&tid=38947&extra=
# Hot words for the streaming dictation WebAPI: log in to the open platform at https://www.xfyun.cn/, then go to
# Console -> My Apps -> Speech Dictation (streaming) -> Service Management -> Personalized Hot Words
# and configure the hot words there.
# Note: hot words only raise the recognition weight of the listed terms. They improve the hit rate of
# those terms but do not guarantee it; verify the actual effect with your own tests.
# Dialects for the streaming dictation WebAPI: log in to https://www.xfyun.cn/, then go to
# Console -> My Apps -> Speech Dictation (streaming) -> Service Management -> Recognition Language List.
# Languages or dialects can be added there; once added, the parameter value for that dialect is shown.
# Error codes: https://www.xfyun.cn/document/error-code (must read when `code` reports an error)
# # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # #
import websocket
import datetime
import hashlib
import base64
import hmac
import json
from urllib.parse import urlencode
import time
import ssl
from wsgiref.handlers import format_date_time
from datetime import datetime
from time import mktime
import _thread as thread
import os



STATUS_FIRST_FRAME = 0  # marker for the first frame
STATUS_CONTINUE_FRAME = 1  # marker for an intermediate frame
STATUS_LAST_FRAME = 2  # marker for the last frame

#############
# outfile is the path the synthesised audio is written to
outfile = None
# wsParam holds the Ws_Param of the request in flight; this module is adapted
# from the official demo and the websocket callbacks read it as a global.
wsParam = None
##############


class Ws_Param(object):
    """Request parameters for one text-to-speech websocket session.

    Attributes set by __init__:
        APPID, APIKey, APISecret: credentials, stored as given.
        BusinessArgs: dict sent as the "business" section of the request.
        Text: the text to synthesise.
        CommonArgs: {"app_id": APPID}, sent as the "common" section.
        Data: {"status": 2, "text": <base64 of the UTF-8 encoded Text>}.
    """

    # Initialisation
    def __init__(self, APPID, APIKey, APISecret, BusinessArgs, Text):
        self.APPID = APPID
        self.APIKey = APIKey
        self.APISecret = APISecret
        self.BusinessArgs = BusinessArgs
        self.Text = Text

        # Common parameters (common)
        self.CommonArgs = {"app_id": self.APPID}
        # Business parameters (business) are supplied by the caller; more
        # options are listed on the official site. An example value is:
        # {"aue": "raw", "auf": "audio/L16;rate=16000", "vcn": "xiaoyan", "tte": "utf8"}
        self.Data = {"status": 2, "text": str(base64.b64encode(self.Text.encode('utf-8')), "UTF8")}
        # For minority languages the demo uses the form below instead; "unicode"
        # there means UTF-16 little-endian, i.e. "UTF-16LE":
        # self.Data = {"status": 2, "text": str(base64.b64encode(self.Text.encode('utf-16')), "UTF8")}

    # Build the request URL
    def create_url(self):
        """Return the wss:// URL carrying the HMAC-SHA256 authorization query.

        The signature covers the host, an RFC 1123 date and the request line,
        is keyed with APISecret, and is sent base64-encoded together with the
        APIKey in the `authorization` query parameter.
        """
        url = 'wss://tts-api.xfyun.cn/v2/tts'
        # Generate an RFC 1123 formatted timestamp
        now = datetime.now()
        date = format_date_time(mktime(now.timetuple()))

        # Assemble the string to sign
        # NOTE(review): the signed host is ws-api.xfyun.cn while the URL host
        # is tts-api.xfyun.cn; kept exactly as in the original - confirm
        # against the service documentation before changing either.
        signature_origin = "host: " + "ws-api.xfyun.cn" + "\n"
        signature_origin += "date: " + date + "\n"
        signature_origin += "GET " + "/v2/tts " + "HTTP/1.1"
        # Sign it with hmac-sha256
        signature_sha = hmac.new(self.APISecret.encode('utf-8'), signature_origin.encode('utf-8'),
                                 digestmod=hashlib.sha256).digest()
        signature_sha = base64.b64encode(signature_sha).decode(encoding='utf-8')

        authorization_origin = "api_key=\"%s\", algorithm=\"%s\", headers=\"%s\", signature=\"%s\"" % (
            self.APIKey, "hmac-sha256", "host date request-line", signature_sha)
        authorization = base64.b64encode(authorization_origin.encode('utf-8')).decode(encoding='utf-8')
        # Collect the authentication parameters into a dict
        v = {
            "authorization": authorization,
            "date": date,
            "host": "ws-api.xfyun.cn"
        }
        # Append the authentication parameters to form the final URL
        url = url + '?' + urlencode(v)
        # When debugging against the official demo, print `date`, `v` and `url`
        # here and compare them with the URL generated by your own code.
        return url


def on_message(ws, message):
    """Append the audio carried by one server message to the output file.

    The `code` field is checked before the payload is touched, so a server
    error is reported with its own message; the socket is then closed. On
    success the decoded audio is appended to `outfile`, and the socket is
    closed once the message with status 2 has been written.

    Any exception raised while handling the message is printed, never
    propagated.
    """
    # output file
    global outfile
    try:
        message = json.loads(message)
        code = message["code"]
        sid = message["sid"]
        if code != 0:
            errMsg = message["message"]
            print("sid:%s call error:%s code is:%s" % (sid, errMsg, code))
            # Close so run_forever() in xunfei_tts can return after a failure.
            ws.close()
            return

        data = message["data"]
        audio = data.get("audio")
        if audio:
            with open(outfile, 'ab') as f:
                f.write(base64.b64decode(audio))
        # Close only after the final chunk has been written.
        if data["status"] == 2:
            print("ws is closed")
            ws.close()
    except Exception as e:
        print("receive msg,but parse exception:", e)



# Handler invoked once the websocket connection is established
def on_open(ws):
    """Send the synthesis request from a background thread.

    A leftover output file is removed before the request is sent: audio is
    appended to the file as replies arrive, so deleting it after sending
    could discard chunks that were already written.
    """
    global outfile
    global wsParam

    def run(*args):
        d = {"common": wsParam.CommonArgs,
             "business": wsParam.BusinessArgs,
             "data": wsParam.Data,
             }
        d = json.dumps(d)
        if os.path.exists(outfile):
            os.remove(outfile)
        print("------>开始发送文本数据")
        ws.send(d)

    thread.start_new_thread(run, ())


# Handler for websocket errors
def on_error(ws, error):
    """Print the error reported by the websocket client."""
    print("### error:", error)



# Handler for websocket close
def on_close(ws, close_status_code=None, close_msg=None):
    """Print a notice when the connection closes.

    The two optional arguments receive whatever extra positional arguments
    the websocket client passes (ignored here), matching the three-argument
    handler used by the ASR module while still accepting the socket alone.
    """
    print("### closed ###")



def xunfei_tts(APPID, APIKey, APISecret, BusinessArgsTTS, Text, OutFile):
    """Synthesise Text into the file OutFile and return that path.

    Opens a websocket session and blocks until it is closed. The path is
    returned unconditionally; nothing here checks that audio was written.

    The request state lives in the module globals `outfile` and `wsParam`,
    which every call overwrites.
    """
    global outfile
    global wsParam
    outfile = OutFile
    wsParam = Ws_Param(APPID, APIKey, APISecret, BusinessArgsTTS, Text)
    websocket.enableTrace(False)
    wsUrl = wsParam.create_url()
    ws = websocket.WebSocketApp(wsUrl, on_message=on_message, on_error=on_error, on_close=on_close)
    ws.on_open = on_open
    # NOTE(review): CERT_NONE disables TLS certificate verification; kept as
    # in the original, but consider verifying the server certificate.
    ws.run_forever(sslopt={"cert_reqs": ssl.CERT_NONE})
    return outfile


# (patch metadata carried over from the collapsed source line)
# diff --git a/voice/xunfei/xunfei_voice.py b/voice/xunfei/xunfei_voice.py new file mode 100644 index 0000000..f16714d --- /dev/null +++ b/voice/xunfei/xunfei_voice.py @@ -0,0 +1,77 @@
#####################################################################
# xunfei voice service
# Auth: njnuko
# Email: njnuko@163.com
#
# To use this module, first register a developer account at xfyun.cn and
# create a new application. The APPID, API Key and Secret Key are shown next
# to speech recognition / speech synthesis on the application management page.
# Then fill these three values into config.json.
#####################################################################

import json
import os
import time

from bridge.reply import Reply, ReplyType
from common.log import logger
from common.tmp_dir import TmpDir
from config import conf
from voice.voice import Voice
from .xunfei_asr import xunfei_asr
from .xunfei_tts import xunfei_tts
from voice.audio_convert import any_to_mp3
import shutil
from pydub import AudioSegment


class XunfeiVoice(Voice):
    """Voice backend that delegates to xunfei_asr and xunfei_tts.

    Credentials and request parameters are read from the `config.json` file
    located next to this module.
    """

    def __init__(self):
        """Load APPID, APIKey, APISecret and the two BusinessArgs dicts.

        A failure to read or parse the file is logged and ignored, in which
        case the attributes are left unset. The file is parsed with
        json.load, so it must be strict JSON.
        """
        try:
            curdir = os.path.dirname(__file__)
            config_path = os.path.join(curdir, "config.json")
            # Local name chosen so the imported `config.conf` is not shadowed.
            with open(config_path, "r", encoding="utf-8") as fr:
                voice_conf = json.load(fr)
            # The loaded config holds the API key and secret, so it is
            # deliberately not printed or logged.
            self.APPID = str(voice_conf.get("APPID"))
            self.APIKey = str(voice_conf.get("APIKey"))
            self.APISecret = str(voice_conf.get("APISecret"))
            self.BusinessArgsTTS = voice_conf.get("BusinessArgsTTS")
            self.BusinessArgsASR = voice_conf.get("BusinessArgsASR")

        except Exception as e:
            logger.warn("XunfeiVoice init failed: %s, ignore " % e)

    def voiceToText(self, voice_file):
        """Recognise the local audio file `voice_file`.

        Returns a TEXT Reply with the recognised text, or an ERROR Reply
        that includes the exception when recognition fails.
        """
        # Recognise a local file
        try:
            logger.debug("[Xunfei] voice file name={}".format(voice_file))
            text = xunfei_asr(self.APPID, self.APISecret, self.APIKey, self.BusinessArgsASR, voice_file)
            logger.info("讯飞语音识别到了: {}".format(text))
            reply = Reply(ReplyType.TEXT, text)
        except Exception as e:
            logger.warn("[Xunfei] voiceToText error: %s" % e)
            # Fill the placeholder; previously the literal "{0}" was returned.
            reply = Reply(ReplyType.ERROR, "讯飞语音识别出错了;{0}".format(e))
        return reply

    def textToVoice(self, text):
        """Synthesise `text` into an mp3 file in the temporary directory.

        Returns a VOICE Reply carrying the file name, or an ERROR Reply when
        synthesis raises.
        """
        try:
            # Avoid the same filename under multithreading
            fileName = TmpDir().path() + "reply-" + str(int(time.time())) + "-" + str(hash(text) & 0x7FFFFFFF) + ".mp3"
            xunfei_tts(self.APPID, self.APIKey, self.APISecret, self.BusinessArgsTTS, text, fileName)
            logger.info("[Xunfei] textToVoice text={} voice file name={}".format(text, fileName))
            reply = Reply(ReplyType.VOICE, fileName)
        except Exception as e:
            # Log the exception itself; fileName may not be bound yet here.
            logger.error("[Xunfei] textToVoice error={}".format(e))
            reply = Reply(ReplyType.ERROR, "抱歉,讯飞语音合成失败")
        return reply