From d38fc6104398810fe50282b56bebca043bdca34f Mon Sep 17 00:00:00 2001
From: wanggang <mail@ender.wang>
Date: Tue, 7 Mar 2023 14:29:59 +0800
Subject: [PATCH 1/8] [voice] add google voice support

---
 .gitignore                       |  1 +
 README.md                        | 11 +++++++++-
 bridge/bridge.py                 |  4 ++++
 channel/channel.py               |  5 ++++-
 channel/wechat/wechat_channel.py | 34 ++++++++++++++++++++++++++-----
 config-template.json             |  1 +
 voice/google/google_voice.py     | 21 +++++++++++++++++++
 voice/voice.py                   | 10 +++++++++
 voice/voice_factory.py           | 17 ++++++++++++++++
 voice/xfyun/xfyun_voice.py       | 35 ++++++++++++++++++++++++++++++++
 10 files changed, 132 insertions(+), 7 deletions(-)
 create mode 100644 voice/google/google_voice.py
 create mode 100644 voice/voice.py
 create mode 100644 voice/voice_factory.py
 create mode 100644 voice/xfyun/xfyun_voice.py

diff --git a/.gitignore b/.gitignore
index c4d7bdc..8bc62f3 100644
--- a/.gitignore
+++ b/.gitignore
@@ -6,3 +6,4 @@ venv*
 config.json
 QR.png
 nohup.out
+tmp
diff --git a/README.md b/README.md
index a750378..4ff7cea 100644
--- a/README.md
+++ b/README.md
@@ -71,6 +71,12 @@ cd chatgpt-on-wechat/
 ```bash
 pip3 install itchat-uos==1.5.0.dev0
 pip3 install --upgrade openai
+
+如果使用google的语音识别，需要安装speech_recognition和依赖的ffmpeg
+pip3 install speech_recognition
+--在MacOS中安装ffmpeg，brew install ffmpeg
+--在Windows中安装ffmpeg，下载ffmpeg.exe
+--在Linux中安装ffmpeg，apt-get install ffmpeg
 ```
 注：`itchat-uos`使用指定版本1.5.0.dev0，`openai`使用最新版本，需高于0.27.0。
 
@@ -112,7 +118,10 @@ cp config-template.json config.json
 + 默认只要被人 @ 就会触发机器人自动回复；另外群聊天中只要检测到以 "@bot" 开头的内容，同样会自动回复（方便自己触发），这对应配置项 `group_chat_prefix`
 + 可选配置: `group_name_keyword_white_list`配置项支持模糊匹配群名称，`group_chat_keyword`配置项则支持模糊匹配群消息内容，用法与上述两个配置项相同。（Contributed by [evolay](https://github.com/evolay))
 
-**3.其他配置**
+**3.语音识别**
++ 配置`speech_recognition=true`开启语音识别
+
+**4.其他配置**
 
 + `proxy`：由于目前 `openai` 接口国内无法访问，需配置代理客户端的地址，详情参考  [#351](https://github.com/zhayujie/chatgpt-on-wechat/issues/351)
 + 对于图像生成，在满足个人或群组触发条件外，还需要额外的关键词前缀来触发，对应配置 `image_create_prefix `
diff --git a/bridge/bridge.py b/bridge/bridge.py
index 6c164e8..78d950a 100644
--- a/bridge/bridge.py
+++ b/bridge/bridge.py
@@ -1,4 +1,5 @@
 from bot import bot_factory
+from voice import voice_factory
 
 
 class Bridge(object):
@@ -7,3 +8,6 @@ class Bridge(object):
 
     def fetch_reply_content(self, query, context):
         return bot_factory.create_bot("chatGPT").reply(query, context)
+
+    def fetch_voice_to_text(self, voiceFile):
+        return voice_factory.create_voice("google").voiceToText(voiceFile)
diff --git a/channel/channel.py b/channel/channel.py
index e2617d1..d4c0fc5 100644
--- a/channel/channel.py
+++ b/channel/channel.py
@@ -11,7 +11,7 @@ class Channel(object):
         """
         raise NotImplementedError
 
-    def handle(self, msg):
+    def handle_text(self, msg):
         """
         process received msg
         :param msg: message object
@@ -29,3 +29,6 @@ class Channel(object):
 
     def build_reply_content(self, query, context=None):
         return Bridge().fetch_reply_content(query, context)
+
+    def build_void_text(self, voice_file):
+        return Bridge().fetch_voice_to_text(voice_file)
diff --git a/channel/wechat/wechat_channel.py b/channel/wechat/wechat_channel.py
index 66778f4..b773010 100644
--- a/channel/wechat/wechat_channel.py
+++ b/channel/wechat/wechat_channel.py
@@ -3,6 +3,8 @@
 """
 wechat channel
 """
+
+import os
 import itchat
 import json
 from itchat.content import *
@@ -18,7 +20,7 @@ thread_pool = ThreadPoolExecutor(max_workers=8)
 
 @itchat.msg_register(TEXT)
 def handler_single_msg(msg):
-    WechatChannel().handle(msg)
+    WechatChannel().handle_text(msg)
     return None
 
 
@@ -28,9 +30,19 @@ def handler_group_msg(msg):
     return None
 
 
+@itchat.msg_register(VOICE)
+def handler_single_voice(msg):
+    WechatChannel().handle_voice(msg)
+    return None
+
+
 class WechatChannel(Channel):
+    tmpFilePath = './tmp/'
+
     def __init__(self):
-        pass
+        isExists = os.path.exists(self.tmpFilePath)
+        if not isExists: 
+            os.makedirs(self.tmpFilePath)
 
     def startup(self):
         # login by scan QRCode
@@ -39,12 +51,24 @@ class WechatChannel(Channel):
         # start message listener
         itchat.run()
 
-    def handle(self, msg):
-        logger.debug("[WX]receive msg: " + json.dumps(msg, ensure_ascii=False))
+    def handle_voice(self, msg):
+        if conf().get('speech_recognition') != True :
+            return
+        logger.debug("[WX]receive voice msg: ", msg['FileName'])
+        fileName = msg['FileName']
+        msg.download(self.tmpFilePath+fileName)
+        content = super().build_void_text(self.tmpFilePath+fileName)
+        self._handle_single_msg(msg, content)
+
+    def handle_text(self, msg):
+        logger.debug("[WX]receive text msg: " + json.dumps(msg, ensure_ascii=False))
+        content = msg['Text']
+        self._handle_single_msg(msg, content)
+
+    def _handle_single_msg(self, msg, content):
         from_user_id = msg['FromUserName']
         to_user_id = msg['ToUserName']              # 接收人id
         other_user_id = msg['User']['UserName']     # 对手方id
-        content = msg['Text']
         match_prefix = self.check_prefix(content, conf().get('single_chat_prefix'))
         if "」\n- - - - - - - - - - - - - - -" in content:
             logger.debug("[WX]reference query skipped")
diff --git a/config-template.json b/config-template.json
index fd6d46a..9ad9f5d 100644
--- a/config-template.json
+++ b/config-template.json
@@ -7,6 +7,7 @@
   "group_name_white_list": ["ChatGPT测试群", "ChatGPT测试群2"],
   "image_create_prefix": ["画", "看", "找"],
   "conversation_max_tokens": 1000,
+  "speech_recognition": false,
   "character_desc": "你是ChatGPT, 一个由OpenAI训练的大型语言模型, 你旨在回答并解决人们的任何问题，并且可以使用多种语言与人交流。",
   "expires_in_seconds": 3600
 }
diff --git a/voice/google/google_voice.py b/voice/google/google_voice.py
new file mode 100644
index 0000000..7af3880
--- /dev/null
+++ b/voice/google/google_voice.py
@@ -0,0 +1,21 @@
+
+"""
+google voice service
+"""
+
+import subprocess
+import speech_recognition 
+from voice.voice import Voice
+
+class GoogleVoice(Voice):
+    recognizer = speech_recognition.Recognizer()
+
+    def __init__(self):
+        pass
+
+    def voiceToText(self, voice_file):
+        new_file = voice_file.replace('.mp3', '.wav')
+        subprocess.call('ffmpeg -i ' + voice_file + ' -acodec pcm_s16le -ac 1 -ar 16000 ' + new_file, shell=True)
+        with speech_recognition.AudioFile(new_file) as source:
+            audio = self.recognizer.record(source)
+        return self.recognizer.recognize_google(audio, language='zh-CN')
diff --git a/voice/voice.py b/voice/voice.py
new file mode 100644
index 0000000..2f66dab
--- /dev/null
+++ b/voice/voice.py
@@ -0,0 +1,10 @@
+"""
+Voice service abstract class
+"""
+
+class Voice(object):
+    def voiceToText(self, voice_file):
+        """
+        Send voice to voice service and get text
+        """
+        raise NotImplementedError
diff --git a/voice/voice_factory.py b/voice/voice_factory.py
new file mode 100644
index 0000000..5457d14
--- /dev/null
+++ b/voice/voice_factory.py
@@ -0,0 +1,17 @@
+"""
+voice factory
+"""
+
+def create_voice(voice_type):
+    """
+    create a voice instance
+    :param voice_type: voice type code
+    :return: voice instance
+    """
+    if voice_type == 'xfyun':
+        from voice.xfyun.xfyun_voice import XfyunVoice
+        return XfyunVoice()
+    elif voice_type == 'google':
+        from voice.google.google_voice import GoogleVoice
+        return GoogleVoice()
+    raise RuntimeError
diff --git a/voice/xfyun/xfyun_voice.py b/voice/xfyun/xfyun_voice.py
new file mode 100644
index 0000000..74b27b2
--- /dev/null
+++ b/voice/xfyun/xfyun_voice.py
@@ -0,0 +1,35 @@
+
+"""
+科大讯飞 voice service
+"""
+
+from voice.voice import Voice
+
+# 科大讯飞语音识别
+lfasr_host = 'http://raasr.xfyun.cn/api'
+# 请求的接口名
+api_prepare = '/prepare'
+api_upload = '/upload'
+api_merge = '/merge'
+api_get_progress = '/getProgress'
+api_get_result = '/getResult'
+# 文件分片大小10M
+file_piece_sice = 10485760
+# ——————————————————转写可配置参数————————————————
+# 参数可在官网界面（https://doc.xfyun.cn/rest_api/%E8%AF%AD%E9%9F%B3%E8%BD%AC%E5%86%99.html）查看，根据需求可自行在gene_params方法里添加修改
+# 转写类型
+lfasr_type = 0
+# 是否开启分词
+has_participle = 'false'
+has_seperate = 'true'
+# 多候选词个数
+max_alternatives = 0
+# 子用户标识
+suid = ''
+
+class XfyunVoice(Voice):
+    def __init__(self):
+        pass
+
+    def voiceToText(self, voice_file):
+        pass
\ No newline at end of file

From 1711a5c0640163aa11b6bbd2a4571a71cc508900 Mon Sep 17 00:00:00 2001
From: wanggang <mail@ender.wang>
Date: Tue, 7 Mar 2023 14:42:06 +0800
Subject: [PATCH 2/8] [voice] fix google voice exception issue

---
 README.md                    | 2 +-
 voice/google/google_voice.py | 7 ++++++-
 2 files changed, 7 insertions(+), 2 deletions(-)

diff --git a/README.md b/README.md
index 4ff7cea..54fbf83 100644
--- a/README.md
+++ b/README.md
@@ -73,7 +73,7 @@ pip3 install itchat-uos==1.5.0.dev0
 pip3 install --upgrade openai
 
 如果使用google的语音识别，需要安装speech_recognition和依赖的ffmpeg
-pip3 install speech_recognition
+pip3 install SpeechRecognition
 --在MacOS中安装ffmpeg，brew install ffmpeg
 --在Windows中安装ffmpeg，下载ffmpeg.exe
 --在Linux中安装ffmpeg，apt-get install ffmpeg
diff --git a/voice/google/google_voice.py b/voice/google/google_voice.py
index 7af3880..97597b3 100644
--- a/voice/google/google_voice.py
+++ b/voice/google/google_voice.py
@@ -18,4 +18,9 @@ class GoogleVoice(Voice):
         subprocess.call('ffmpeg -i ' + voice_file + ' -acodec pcm_s16le -ac 1 -ar 16000 ' + new_file, shell=True)
         with speech_recognition.AudioFile(new_file) as source:
             audio = self.recognizer.record(source)
-        return self.recognizer.recognize_google(audio, language='zh-CN')
+        try:
+            return self.recognizer.recognize_google(audio, language='zh-CN')
+        except speech_recognition.UnknownValueError:
+            return "抱歉，我听不懂。"
+        except speech_recognition.RequestError as e:
+            return "抱歉，无法连接到 Google 语音识别服务；{0}".format(e)

From cc19017c01c7e8b7be7eb6363d64e7865826f6b8 Mon Sep 17 00:00:00 2001
From: wanggang <mail@ender.wang>
Date: Tue, 7 Mar 2023 23:28:57 +0800
Subject: [PATCH 3/8] [voice] add text to voice

---
 bridge/bridge.py                 |  3 ++
 channel/channel.py               |  5 +++-
 channel/wechat/wechat_channel.py | 47 +++++++++++++++++++++++---------
 voice/google/google_voice.py     | 32 +++++++++++++++++++---
 voice/voice.py                   |  6 ++++
 5 files changed, 75 insertions(+), 18 deletions(-)

diff --git a/bridge/bridge.py b/bridge/bridge.py
index 78d950a..9d00bfe 100644
--- a/bridge/bridge.py
+++ b/bridge/bridge.py
@@ -11,3 +11,6 @@ class Bridge(object):
 
     def fetch_voice_to_text(self, voiceFile):
         return voice_factory.create_voice("google").voiceToText(voiceFile)
+
+    def fetch_text_to_voice(self, text):
+        return voice_factory.create_voice("google").textToVoice(text)
\ No newline at end of file
diff --git a/channel/channel.py b/channel/channel.py
index d4c0fc5..a1395c4 100644
--- a/channel/channel.py
+++ b/channel/channel.py
@@ -30,5 +30,8 @@ class Channel(object):
     def build_reply_content(self, query, context=None):
         return Bridge().fetch_reply_content(query, context)
 
-    def build_void_text(self, voice_file):
+    def build_voice_to_text(self, voice_file):
         return Bridge().fetch_voice_to_text(voice_file)
+    
+    def build_text_to_voice(self, text):
+        return Bridge().fetch_text_to_voice(text)
diff --git a/channel/wechat/wechat_channel.py b/channel/wechat/wechat_channel.py
index b773010..b3d3658 100644
--- a/channel/wechat/wechat_channel.py
+++ b/channel/wechat/wechat_channel.py
@@ -40,6 +40,7 @@ class WechatChannel(Channel):
     tmpFilePath = './tmp/'
 
     def __init__(self):
+        voices = self.engine.getProperty('voices')
         isExists = os.path.exists(self.tmpFilePath)
         if not isExists: 
             os.makedirs(self.tmpFilePath)
@@ -55,17 +56,20 @@ class WechatChannel(Channel):
         if conf().get('speech_recognition') != True :
             return
         logger.debug("[WX]receive voice msg: ", msg['FileName'])
-        fileName = msg['FileName']
-        msg.download(self.tmpFilePath+fileName)
-        content = super().build_void_text(self.tmpFilePath+fileName)
-        self._handle_single_msg(msg, content)
+        thread_pool.submit(self._do_handle_voice, msg)
+
+    def _do_handle_voice(self, msg):
+        fileName = self.tmpFilePath+msg['FileName']
+        msg.download(fileName)
+        content = super().build_voice_to_text(fileName)
+        self._handle_single_msg(msg, content, True)
 
     def handle_text(self, msg):
         logger.debug("[WX]receive text msg: " + json.dumps(msg, ensure_ascii=False))
         content = msg['Text']
-        self._handle_single_msg(msg, content)
+        self._handle_single_msg(msg, content, False)
 
-    def _handle_single_msg(self, msg, content):
+    def _handle_single_msg(self, msg, content, is_voice):
         from_user_id = msg['FromUserName']
         to_user_id = msg['ToUserName']              # 接收人id
         other_user_id = msg['User']['UserName']     # 对手方id
@@ -84,9 +88,10 @@ class WechatChannel(Channel):
             if img_match_prefix:
                 content = content.split(img_match_prefix, 1)[1].strip()
                 thread_pool.submit(self._do_send_img, content, from_user_id)
-            else:
-                thread_pool.submit(self._do_send, content, from_user_id)
-
+            elif is_voice:
+                thread_pool.submit(self._do_send_voice, content, from_user_id)
+            else :
+                thread_pool.submit(self._do_send_text, content, from_user_id)
         elif to_user_id == other_user_id and match_prefix:
             # 自己给好友发送消息
             str_list = content.split(match_prefix, 1)
@@ -96,8 +101,10 @@ class WechatChannel(Channel):
             if img_match_prefix:
                 content = content.split(img_match_prefix, 1)[1].strip()
                 thread_pool.submit(self._do_send_img, content, to_user_id)
+            elif is_voice:
+                thread_pool.submit(self._do_send_voice, content, to_user_id)
             else:
-                thread_pool.submit(self._do_send, content, to_user_id)
+                thread_pool.submit(self._do_send_text, content, to_user_id)
 
 
     def handle_group(self, msg):
@@ -129,10 +136,24 @@ class WechatChannel(Channel):
                 thread_pool.submit(self._do_send_group, content, msg)
 
     def send(self, msg, receiver):
-        logger.info('[WX] sendMsg={}, receiver={}'.format(msg, receiver))
         itchat.send(msg, toUserName=receiver)
+        logger.info('[WX] sendMsg={}, receiver={}'.format(msg, receiver))
 
-    def _do_send(self, query, reply_user_id):
+    def _do_send_voice(self, query, reply_user_id):
+        try:
+            if not query:
+                return
+            context = dict()
+            context['from_user_id'] = reply_user_id
+            reply_text = super().build_reply_content(query, context)
+            if reply_text:
+                replyFile = super().build_text_to_voice(reply_text)
+                itchat.send_file(replyFile, toUserName=reply_user_id)
+                logger.info('[WX] sendFile={}, receiver={}'.format(replyFile, reply_user_id))
+        except Exception as e:
+            logger.exception(e)
+
+    def _do_send_text(self, query, reply_user_id):
         try:
             if not query:
                 return
@@ -162,8 +183,8 @@ class WechatChannel(Channel):
             image_storage.seek(0)
 
             # 图片发送
-            logger.info('[WX] sendImage, receiver={}'.format(reply_user_id))
             itchat.send_image(image_storage, reply_user_id)
+            logger.info('[WX] sendImage, receiver={}'.format(reply_user_id))
         except Exception as e:
             logger.exception(e)
 
diff --git a/voice/google/google_voice.py b/voice/google/google_voice.py
index 97597b3..58955f4 100644
--- a/voice/google/google_voice.py
+++ b/voice/google/google_voice.py
@@ -4,23 +4,47 @@ google voice service
 """
 
 import subprocess
-import speech_recognition 
+import time
+import speech_recognition
+import pyttsx3
+from common.log import logger
 from voice.voice import Voice
 
+
 class GoogleVoice(Voice):
+    tmpFilePath = './tmp/'
     recognizer = speech_recognition.Recognizer()
+    engine = pyttsx3.init()
 
     def __init__(self):
-        pass
+        # 语速
+        self.engine.setProperty('rate', 125)
+        # 音量
+        self.engine.setProperty('volume', 1.0)
+        # 0为男声，1为女声
+        voices = self.engine.getProperty('voices')
+        self.engine.setProperty('voice', voices[1].id)
 
     def voiceToText(self, voice_file):
         new_file = voice_file.replace('.mp3', '.wav')
-        subprocess.call('ffmpeg -i ' + voice_file + ' -acodec pcm_s16le -ac 1 -ar 16000 ' + new_file, shell=True)
+        subprocess.call('ffmpeg -i ' + voice_file +
+                        ' -acodec pcm_s16le -ac 1 -ar 16000 ' + new_file, shell=True)
         with speech_recognition.AudioFile(new_file) as source:
             audio = self.recognizer.record(source)
         try:
-            return self.recognizer.recognize_google(audio, language='zh-CN')
+            text = self.recognizer.recognize_google(audio, language='zh-CN')
+            logger.info(
+                '[Google] voiceToText text={} voice file name={}'.format(text, voice_file))
+            return text
         except speech_recognition.UnknownValueError:
             return "抱歉，我听不懂。"
         except speech_recognition.RequestError as e:
             return "抱歉，无法连接到 Google 语音识别服务；{0}".format(e)
+
+    def textToVoice(self, text):
+        textFile = self.tmpFilePath + '语音回复_' + str(int(time.time())) + '.mp3'
+        self.engine.save_to_file(text, textFile)
+        self.engine.runAndWait()
+        logger.info(
+            '[Google] textToVoice text={} voice file name={}'.format(text, textFile))
+        return textFile
diff --git a/voice/voice.py b/voice/voice.py
index 2f66dab..52d8aaa 100644
--- a/voice/voice.py
+++ b/voice/voice.py
@@ -8,3 +8,9 @@ class Voice(object):
         Send voice to voice service and get text
         """
         raise NotImplementedError
+
+    def textToVoice(self, text):
+        """
+        Send text to voice service and get voice
+        """
+        raise NotImplementedError
\ No newline at end of file

From 720ad07f83cbfa73dfa27c90849073f7a615fa7c Mon Sep 17 00:00:00 2001
From: wanggang <mail@ender.wang>
Date: Tue, 7 Mar 2023 23:33:25 +0800
Subject: [PATCH 4/8] [voice] remove stray engine call from WechatChannel and document espeak dependency

---
 README.md                        | 6 +++---
 channel/wechat/wechat_channel.py | 1 -
 2 files changed, 3 insertions(+), 4 deletions(-)

diff --git a/README.md b/README.md
index 54fbf83..1676395 100644
--- a/README.md
+++ b/README.md
@@ -72,11 +72,11 @@ cd chatgpt-on-wechat/
 pip3 install itchat-uos==1.5.0.dev0
 pip3 install --upgrade openai
 
-如果使用google的语音识别，需要安装speech_recognition和依赖的ffmpeg
+如果使用google的语音识别，需要安装speech_recognition和依赖的ffmpeg和espeak
 pip3 install SpeechRecognition
---在MacOS中安装ffmpeg，brew install ffmpeg
+--在MacOS中安装ffmpeg，brew install ffmpeg espeak
 --在Windows中安装ffmpeg，下载ffmpeg.exe
---在Linux中安装ffmpeg，apt-get install ffmpeg
+--在Linux中安装ffmpeg，apt-get install ffmpeg espeak
 ```
 注：`itchat-uos`使用指定版本1.5.0.dev0，`openai`使用最新版本，需高于0.27.0。
 
diff --git a/channel/wechat/wechat_channel.py b/channel/wechat/wechat_channel.py
index b3d3658..3fdc94f 100644
--- a/channel/wechat/wechat_channel.py
+++ b/channel/wechat/wechat_channel.py
@@ -40,7 +40,6 @@ class WechatChannel(Channel):
     tmpFilePath = './tmp/'
 
     def __init__(self):
-        voices = self.engine.getProperty('voices')
         isExists = os.path.exists(self.tmpFilePath)
         if not isExists: 
             os.makedirs(self.tmpFilePath)

From 882e6c35762bd805a6bf9a320f78ee5fa8ec7362 Mon Sep 17 00:00:00 2001
From: wanggang <mail@ender.wang>
Date: Wed, 8 Mar 2023 11:02:01 +0800
Subject: [PATCH 5/8] [voice] add support for whisper

---
 bridge/bridge.py                 |  4 ++--
 channel/wechat/wechat_channel.py |  4 ++--
 config-template.json             |  3 +++
 voice/baidu/baidu_voice.py       | 22 ++++++++++++++++++++
 voice/openai/openai_voice.py     | 25 +++++++++++++++++++++++
 voice/voice_factory.py           |  9 +++++---
 voice/xfyun/xfyun_voice.py       | 35 --------------------------------
 7 files changed, 60 insertions(+), 42 deletions(-)
 create mode 100644 voice/baidu/baidu_voice.py
 create mode 100644 voice/openai/openai_voice.py
 delete mode 100644 voice/xfyun/xfyun_voice.py

diff --git a/bridge/bridge.py b/bridge/bridge.py
index 9d00bfe..e739a7f 100644
--- a/bridge/bridge.py
+++ b/bridge/bridge.py
@@ -10,7 +10,7 @@ class Bridge(object):
         return bot_factory.create_bot("chatGPT").reply(query, context)
 
     def fetch_voice_to_text(self, voiceFile):
-        return voice_factory.create_voice("google").voiceToText(voiceFile)
+        return voice_factory.create_voice("openai").voiceToText(voiceFile)
 
     def fetch_text_to_voice(self, text):
-        return voice_factory.create_voice("google").textToVoice(text)
\ No newline at end of file
+        return voice_factory.create_voice("baidu").textToVoice(text)
\ No newline at end of file
diff --git a/channel/wechat/wechat_channel.py b/channel/wechat/wechat_channel.py
index 3fdc94f..2282455 100644
--- a/channel/wechat/wechat_channel.py
+++ b/channel/wechat/wechat_channel.py
@@ -54,14 +54,14 @@ class WechatChannel(Channel):
     def handle_voice(self, msg):
         if conf().get('speech_recognition') != True :
             return
-        logger.debug("[WX]receive voice msg: ", msg['FileName'])
+        logger.debug("[WX]receive voice msg: " + msg['FileName'])
         thread_pool.submit(self._do_handle_voice, msg)
 
     def _do_handle_voice(self, msg):
         fileName = self.tmpFilePath+msg['FileName']
         msg.download(fileName)
         content = super().build_voice_to_text(fileName)
-        self._handle_single_msg(msg, content, True)
+        self._handle_single_msg(msg, content, False)
 
     def handle_text(self, msg):
         logger.debug("[WX]receive text msg: " + json.dumps(msg, ensure_ascii=False))
diff --git a/config-template.json b/config-template.json
index 9ad9f5d..f7549d4 100644
--- a/config-template.json
+++ b/config-template.json
@@ -8,6 +8,9 @@
   "image_create_prefix": ["画", "看", "找"],
   "conversation_max_tokens": 1000,
   "speech_recognition": false,
+  "baidu_app_id": "YOUR BAIDU APP ID",
+  "baidu_api_key": "YOUR BAIDU API KEY",
+  "baidu_secret_key": "YOUR BAIDU SERVICE KEY",
   "character_desc": "你是ChatGPT, 一个由OpenAI训练的大型语言模型, 你旨在回答并解决人们的任何问题，并且可以使用多种语言与人交流。",
   "expires_in_seconds": 3600
 }
diff --git a/voice/baidu/baidu_voice.py b/voice/baidu/baidu_voice.py
new file mode 100644
index 0000000..8534c2b
--- /dev/null
+++ b/voice/baidu/baidu_voice.py
@@ -0,0 +1,22 @@
+
+"""
+baidu voice service
+"""
+from aip import AipSpeech
+from voice.voice import Voice
+from config import conf
+
+class BaiduVoice(Voice):
+    APP_ID = conf().get('baidu_app_id')
+    API_KEY = conf().get('baidu_api_key')
+    SECRET_KEY = conf().get('baidu_secret_key')
+    client = AipSpeech(APP_ID, API_KEY, SECRET_KEY)
+    
+    def __init__(self):
+        pass
+
+    def voiceToText(self, voice_file):
+        pass
+
+    def textToVoice(self, text):
+        pass
diff --git a/voice/openai/openai_voice.py b/voice/openai/openai_voice.py
new file mode 100644
index 0000000..8cc28b7
--- /dev/null
+++ b/voice/openai/openai_voice.py
@@ -0,0 +1,25 @@
+
+"""
+google voice service
+"""
+import json
+import openai
+from common.log import logger
+from voice.voice import Voice
+
+
+class OpenaiVoice(Voice):
+    def __init__(self):
+        pass
+
+    def voiceToText(self, voice_file):
+        file = open(voice_file, "rb")
+        reply = openai.Audio.transcribe("whisper-1", file)
+        json_dict = json.loads(reply)
+        text = json_dict['text']
+        logger.info(
+            '[Openai] voiceToText text={} voice file name={}'.format(text, voice_file))
+        return text
+
+    def textToVoice(self, text):
+        pass
diff --git a/voice/voice_factory.py b/voice/voice_factory.py
index 5457d14..053840e 100644
--- a/voice/voice_factory.py
+++ b/voice/voice_factory.py
@@ -8,10 +8,13 @@ def create_voice(voice_type):
     :param voice_type: voice type code
     :return: voice instance
     """
-    if voice_type == 'xfyun':
-        from voice.xfyun.xfyun_voice import XfyunVoice
-        return XfyunVoice()
+    if voice_type == 'baidu':
+        from voice.baidu.baidu_voice import BaiduVoice
+        return BaiduVoice()
     elif voice_type == 'google':
         from voice.google.google_voice import GoogleVoice
         return GoogleVoice()
+    elif voice_type == 'openai':
+        from voice.openai.openai_voice import OpenaiVoice
+        return OpenaiVoice()
     raise RuntimeError
diff --git a/voice/xfyun/xfyun_voice.py b/voice/xfyun/xfyun_voice.py
deleted file mode 100644
index 74b27b2..0000000
--- a/voice/xfyun/xfyun_voice.py
+++ /dev/null
@@ -1,35 +0,0 @@
-
-"""
-科大讯飞 voice service
-"""
-
-from voice.voice import Voice
-
-# 科大讯飞语音识别
-lfasr_host = 'http://raasr.xfyun.cn/api'
-# 请求的接口名
-api_prepare = '/prepare'
-api_upload = '/upload'
-api_merge = '/merge'
-api_get_progress = '/getProgress'
-api_get_result = '/getResult'
-# 文件分片大小10M
-file_piece_sice = 10485760
-# ——————————————————转写可配置参数————————————————
-# 参数可在官网界面（https://doc.xfyun.cn/rest_api/%E8%AF%AD%E9%9F%B3%E8%BD%AC%E5%86%99.html）查看，根据需求可自行在gene_params方法里添加修改
-# 转写类型
-lfasr_type = 0
-# 是否开启分词
-has_participle = 'false'
-has_seperate = 'true'
-# 多候选词个数
-max_alternatives = 0
-# 子用户标识
-suid = ''
-
-class XfyunVoice(Voice):
-    def __init__(self):
-        pass
-
-    def voiceToText(self, voice_file):
-        pass
\ No newline at end of file

From d7a8854fa14fe075e59b363159bfb83c84403b4a Mon Sep 17 00:00:00 2001
From: wanggang <mail@ender.wang>
Date: Wed, 8 Mar 2023 11:32:27 +0800
Subject: [PATCH 6/8] [voice] add support for whisper-1 model

---
 README.md                        | 3 +++
 channel/wechat/wechat_channel.py | 7 ++++---
 voice/google/google_voice.py     | 2 +-
 voice/openai/openai_voice.py     | 8 +++++---
 4 files changed, 13 insertions(+), 7 deletions(-)

diff --git a/README.md b/README.md
index 1676395..93660e8 100644
--- a/README.md
+++ b/README.md
@@ -72,6 +72,9 @@ cd chatgpt-on-wechat/
 pip3 install itchat-uos==1.5.0.dev0
 pip3 install --upgrade openai
 
+默认使用openai的whisper-1模型
+如果使用百度的语音识别，需要安装百度的pythonSDK
+pip3 install baidu-aip
 如果使用google的语音识别，需要安装speech_recognition和依赖的ffmpeg和espeak
 pip3 install SpeechRecognition
 --在MacOS中安装ffmpeg，brew install ffmpeg espeak
diff --git a/channel/wechat/wechat_channel.py b/channel/wechat/wechat_channel.py
index 2282455..0f20613 100644
--- a/channel/wechat/wechat_channel.py
+++ b/channel/wechat/wechat_channel.py
@@ -5,6 +5,7 @@ wechat channel
 """
 
 import os
+import pathlib
 import itchat
 import json
 from itchat.content import *
@@ -37,11 +38,11 @@ def handler_single_voice(msg):
 
 
 class WechatChannel(Channel):
-    tmpFilePath = './tmp/'
+    tmpFilePath = pathlib.Path('./tmp/')
 
     def __init__(self):
-        isExists = os.path.exists(self.tmpFilePath)
-        if not isExists: 
+        pathExists = os.path.exists(self.tmpFilePath)
+        if not pathExists and conf().get('speech_recognition') == True: 
             os.makedirs(self.tmpFilePath)
 
     def startup(self):
diff --git a/voice/google/google_voice.py b/voice/google/google_voice.py
index 58955f4..3fff9d7 100644
--- a/voice/google/google_voice.py
+++ b/voice/google/google_voice.py
@@ -3,6 +3,7 @@
 google voice service
 """
 
+import pathlib
 import subprocess
 import time
 import speech_recognition
@@ -12,7 +13,6 @@ from voice.voice import Voice
 
 
 class GoogleVoice(Voice):
-    tmpFilePath = './tmp/'
     recognizer = speech_recognition.Recognizer()
     engine = pyttsx3.init()
 
diff --git a/voice/openai/openai_voice.py b/voice/openai/openai_voice.py
index 8cc28b7..475aac6 100644
--- a/voice/openai/openai_voice.py
+++ b/voice/openai/openai_voice.py
@@ -4,19 +4,21 @@ google voice service
 """
 import json
 import openai
+from config import conf
 from common.log import logger
 from voice.voice import Voice
 
 
 class OpenaiVoice(Voice):
     def __init__(self):
-        pass
+        openai.api_key = conf().get('open_ai_api_key')
 
     def voiceToText(self, voice_file):
+        logger.debug(
+            '[Openai] voice file name={}'.format(voice_file))
         file = open(voice_file, "rb")
         reply = openai.Audio.transcribe("whisper-1", file)
-        json_dict = json.loads(reply)
-        text = json_dict['text']
+        text = reply["text"]
         logger.info(
             '[Openai] voiceToText text={} voice file name={}'.format(text, voice_file))
         return text

From 3db452ef71940d8cc0d0c4b76ab08955b0738d2a Mon Sep 17 00:00:00 2001
From: wanggang <mail@ender.wang>
Date: Wed, 8 Mar 2023 15:22:46 +0800
Subject: [PATCH 7/8] [voice] use baidu service to generate reply voice

---
 README.md                        |  6 +++---
 channel/wechat/wechat_channel.py | 19 +++++++------------
 common/tmp_dir.py                | 20 ++++++++++++++++++++
 config-template.json             |  1 +
 voice/baidu/baidu_voice.py       | 16 +++++++++++++++-
 voice/google/google_voice.py     |  3 ++-
 6 files changed, 48 insertions(+), 17 deletions(-)
 create mode 100644 common/tmp_dir.py

diff --git a/README.md b/README.md
index 93660e8..8fe3b30 100644
--- a/README.md
+++ b/README.md
@@ -72,9 +72,8 @@ cd chatgpt-on-wechat/
 pip3 install itchat-uos==1.5.0.dev0
 pip3 install --upgrade openai
 
-默认使用openai的whisper-1模型
 如果使用百度的语音识别，需要安装百度的pythonSDK
-pip3 install baidu-aip
+pip3 install baidu-aip chardet
 如果使用google的语音识别，需要安装speech_recognition和依赖的ffmpeg和espeak
 pip3 install SpeechRecognition
 --在MacOS中安装ffmpeg，brew install ffmpeg espeak
@@ -122,7 +121,8 @@ cp config-template.json config.json
 + 可选配置: `group_name_keyword_white_list`配置项支持模糊匹配群名称，`group_chat_keyword`配置项则支持模糊匹配群消息内容，用法与上述两个配置项相同。（Contributed by [evolay](https://github.com/evolay))
 
 **3.语音识别**
-+ 配置`speech_recognition=true`开启语音识别
++ 配置`speech_recognition=true`开启语音识别，默认使用openai的whisper模型
++ 配置`voice_reply_voice=true`语音回复语音，但是需要配置对应语音合成平台的key
 
 **4.其他配置**
 
diff --git a/channel/wechat/wechat_channel.py b/channel/wechat/wechat_channel.py
index 0f20613..b861e35 100644
--- a/channel/wechat/wechat_channel.py
+++ b/channel/wechat/wechat_channel.py
@@ -4,14 +4,13 @@
 wechat channel
 """
 
-import os
-import pathlib
 import itchat
 import json
 from itchat.content import *
 from channel.channel import Channel
 from concurrent.futures import ThreadPoolExecutor
 from common.log import logger
+from common.tmp_dir import TmpDir
 from config import conf
 import requests
 import io
@@ -38,12 +37,8 @@ def handler_single_voice(msg):
 
 
 class WechatChannel(Channel):
-    tmpFilePath = pathlib.Path('./tmp/')
-
     def __init__(self):
-        pathExists = os.path.exists(self.tmpFilePath)
-        if not pathExists and conf().get('speech_recognition') == True: 
-            os.makedirs(self.tmpFilePath)
+        pass
 
     def startup(self):
         # login by scan QRCode
@@ -59,17 +54,17 @@ class WechatChannel(Channel):
         thread_pool.submit(self._do_handle_voice, msg)
 
     def _do_handle_voice(self, msg):
-        fileName = self.tmpFilePath+msg['FileName']
+        fileName = TmpDir().path() + msg['FileName']
         msg.download(fileName)
         content = super().build_voice_to_text(fileName)
-        self._handle_single_msg(msg, content, False)
+        self._handle_single_msg(msg, content, conf().get('voice_reply_voice'))
 
     def handle_text(self, msg):
         logger.debug("[WX]receive text msg: " + json.dumps(msg, ensure_ascii=False))
         content = msg['Text']
         self._handle_single_msg(msg, content, False)
 
-    def _handle_single_msg(self, msg, content, is_voice):
+    def _handle_single_msg(self, msg, content, reply_voice=False):
         from_user_id = msg['FromUserName']
         to_user_id = msg['ToUserName']              # 接收人id
         other_user_id = msg['User']['UserName']     # 对手方id
@@ -88,7 +83,7 @@ class WechatChannel(Channel):
             if img_match_prefix:
                 content = content.split(img_match_prefix, 1)[1].strip()
                 thread_pool.submit(self._do_send_img, content, from_user_id)
-            elif is_voice:
+            elif reply_voice:
                 thread_pool.submit(self._do_send_voice, content, from_user_id)
             else :
                 thread_pool.submit(self._do_send_text, content, from_user_id)
@@ -101,7 +96,7 @@ class WechatChannel(Channel):
             if img_match_prefix:
                 content = content.split(img_match_prefix, 1)[1].strip()
                 thread_pool.submit(self._do_send_img, content, to_user_id)
-            elif is_voice:
+            elif reply_voice:
                 thread_pool.submit(self._do_send_voice, content, to_user_id)
             else:
                 thread_pool.submit(self._do_send_text, content, to_user_id)
diff --git a/common/tmp_dir.py b/common/tmp_dir.py
new file mode 100644
index 0000000..1738022
--- /dev/null
+++ b/common/tmp_dir.py
@@ -0,0 +1,20 @@
+
+import os
+import pathlib
+from config import conf
+
+
+class TmpDir(object):
+    """Helper for the shared ./tmp/ directory: created in __init__ if missing and speech_recognition is enabled; never deleted.
+    """
+
+    tmpFilePath = pathlib.Path('./tmp/')
+    
+    def __init__(self):
+        pathExists = os.path.exists(self.tmpFilePath)
+        if not pathExists and conf().get('speech_recognition') == True:
+            os.makedirs(self.tmpFilePath)
+
+    def path(self):
+        return str(self.tmpFilePath) + '/'
+    
\ No newline at end of file
diff --git a/config-template.json b/config-template.json
index f7549d4..7e693f6 100644
--- a/config-template.json
+++ b/config-template.json
@@ -8,6 +8,7 @@
   "image_create_prefix": ["画", "看", "找"],
   "conversation_max_tokens": 1000,
   "speech_recognition": false,
+  "voice_reply_voice": false,
   "baidu_app_id": "YOUR BAIDU APP ID",
   "baidu_api_key": "YOUR BAIDU API KEY",
   "baidu_secret_key": "YOUR BAIDU SERVICE KEY",
diff --git a/voice/baidu/baidu_voice.py b/voice/baidu/baidu_voice.py
index 8534c2b..d99db37 100644
--- a/voice/baidu/baidu_voice.py
+++ b/voice/baidu/baidu_voice.py
@@ -2,7 +2,10 @@
 """
 baidu voice service
 """
+import time
 from aip import AipSpeech
+from common.log import logger
+from common.tmp_dir import TmpDir
 from voice.voice import Voice
 from config import conf
 
@@ -19,4 +22,15 @@ class BaiduVoice(Voice):
         pass
 
     def textToVoice(self, text):
-        pass
+        result = self.client.synthesis(text, 'zh', 1, {
+            'spd': 5, 'pit': 5, 'vol': 5, 'per': 111
+        })
+        if not isinstance(result, dict):
+            fileName = TmpDir().path() + '语音回复_' + str(int(time.time())) + '.mp3'
+            with open(fileName, 'wb') as f:
+                f.write(result)
+            logger.info('[Baidu] textToVoice text={} voice file name={}'.format(text, fileName))
+            return fileName
+        else:
+            logger.error('[Baidu] textToVoice error={}'.format(result))
+            return None
diff --git a/voice/google/google_voice.py b/voice/google/google_voice.py
index 3fff9d7..8e339f2 100644
--- a/voice/google/google_voice.py
+++ b/voice/google/google_voice.py
@@ -9,6 +9,7 @@ import time
 import speech_recognition
 import pyttsx3
 from common.log import logger
+from common.tmp_dir import TmpDir
 from voice.voice import Voice
 
 
@@ -42,7 +43,7 @@ class GoogleVoice(Voice):
             return "抱歉，无法连接到 Google 语音识别服务；{0}".format(e)
 
     def textToVoice(self, text):
-        textFile = self.tmpFilePath + '语音回复_' + str(int(time.time())) + '.mp3'
+        textFile = TmpDir().path() + '语音回复_' + str(int(time.time())) + '.mp3'
         self.engine.save_to_file(text, textFile)
         self.engine.runAndWait()
         logger.info(

From d02508df413f222893042805e065667c8a4596f3 Mon Sep 17 00:00:00 2001
From: wanggang <mail@ender.wang>
Date: Wed, 8 Mar 2023 16:39:25 +0800
Subject: [PATCH 8/8] [voice] Readme modify

---
 README.md | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/README.md b/README.md
index 8fe3b30..09ca878 100644
--- a/README.md
+++ b/README.md
@@ -122,7 +122,7 @@ cp config-template.json config.json
 
 **3.语音识别**
 + 配置`speech_recognition=true`开启语音识别，默认使用openai的whisper模型
-+ 配置`voice_reply_voice=true`语音回复语音，但是需要配置对应语音合成平台的key
++ 配置`voice_reply_voice=true`后，收到语音消息时将以语音进行回复，需要配置对应语音合成平台的key。由于itchat协议的限制，只能以mp3语音文件的形式发送；使用wechaty时，回复的则是微信语音。
 
 **4.其他配置**