Compare commits

...

6 Commits

Author SHA1 Message Date
lanvent
d9ef5a6612 fix: 无前缀触发bug 2023-03-30 18:26:44 +08:00
lanvent
66a81cd47c fix: 修复群语音触发bug 2023-03-30 16:26:01 +08:00
lanvent
81edd13470 Merge branch 'master' of https://github.com/zhayujie/chatgpt-on-wechat into master-dev 2023-03-30 16:07:29 +08:00
lanvent
7a94745b8a fix: group chat bug 2023-03-30 16:06:57 +08:00
zhanws
06b02f5df8 解决百度语音合成的一些问题和参数化设置 (#676)
* 解决百度语音合成的一些问题和参数化设置

* 补充百度语音说明
2023-03-30 14:59:52 +08:00
lanvent
83136e3142 feat: refactor handle function 2023-03-30 14:44:45 +08:00
6 changed files with 249 additions and 132 deletions

View File

@@ -14,6 +14,15 @@ class Context:
self.type = type
self.content = content
self.kwargs = kwargs
def __contains__(self, key):
    """Support ``key in context``.

    The built-in keys ``'type'`` and ``'content'`` count as present only
    when the corresponding attribute is not ``None``; every other key is
    looked up in ``self.kwargs``.
    """
    if key == 'type':
        return self.type is not None
    if key == 'content':
        return self.content is not None
    return key in self.kwargs
def __getitem__(self, key):
if key == 'type':
return self.type
@@ -21,6 +30,12 @@ class Context:
return self.content
else:
return self.kwargs[key]
def get(self, key, default=None):
    """Return ``self[key]``, or ``default`` when the lookup raises ``KeyError``.

    Only ``KeyError`` is translated into the default; any other exception
    raised by the item lookup propagates to the caller.
    """
    try:
        value = self[key]
    except KeyError:
        value = default
    return value
def __setitem__(self, key, value):
if key == 'type':

View File

@@ -19,6 +19,7 @@ from common.log import logger
from common.tmp_dir import TmpDir
from config import conf
from common.time_check import time_checker
from common.expired_dict import ExpiredDict
from plugins import *
try:
from voice.audio_convert import mp3_to_wav
@@ -53,12 +54,26 @@ def handler_group_voice(msg):
WechatChannel().handle_group_voice(msg)
return None
def _check(func):
    """Decorate a message handler with duplicate and stale-message filtering.

    The returned wrapper has the signature ``(self, msg)`` and:

    * drops the message (returns ``None``) when ``msg['MsgId']`` is already
      present in ``self.receivedMsgs``; otherwise records it there;
    * drops the message when the ``hot_reload`` setting is enabled and
      ``msg['CreateTime']`` is more than 60 seconds in the past;
    * otherwise returns ``func(self, msg)``.

    ``functools.wraps`` keeps the handler's ``__name__``/``__doc__`` on the
    wrapper so logs and outer decorators see the real handler, not
    ``wrapper``.
    """
    from functools import wraps  # local import: keeps this block self-contained

    @wraps(func)
    def wrapper(self, msg):
        msgId = msg['MsgId']
        if msgId in self.receivedMsgs:
            logger.info("Wechat message {} already received, ignore".format(msgId))
            return
        # Remember the message id before any further filtering so that a
        # re-delivery of the same message is also ignored.
        self.receivedMsgs[msgId] = msg
        create_time = msg['CreateTime']  # message timestamp
        # ``== True`` is kept deliberately: only a real boolean-true setting
        # enables the skip of messages older than one minute.
        if conf().get('hot_reload') == True and int(create_time) < int(time.time()) - 60:  # noqa: E712
            logger.debug("[WX]history message {} skipped".format(msgId))
            return
        return func(self, msg)

    return wrapper
class WechatChannel(Channel):
def __init__(self):
    """Initialise per-channel state.

    ``userName`` and ``nickName`` start as ``None``; ``receivedMsgs`` is an
    ``ExpiredDict`` used to remember message ids already seen.
    """
    self.userName = self.nickName = None
    # presumably a time-to-live in seconds (24 hours) -- confirm against ExpiredDict
    one_day = 60 * 60 * 24
    self.receivedMsgs = ExpiredDict(one_day)
def startup(self):
@@ -90,7 +105,11 @@ class WechatChannel(Channel):
# isgroup: 是否是群聊
# receiver: 需要回复的对象
# msg: itchat的原始消息对象
# origin_ctype: 原始消息类型,语音转文字后,私聊时如果匹配前缀失败,会根据初始消息是否是语音来放宽触发规则
# desire_rtype: 希望回复类型默认是文本回复设置为ReplyType.VOICE是语音回复
@time_checker
@_check
def handle_voice(self, msg):
if conf().get('speech_recognition') != True:
return
@@ -106,11 +125,12 @@ class WechatChannel(Channel):
else:
other_user_id = from_user_id
if from_user_id == other_user_id:
context = Context(ContextType.VOICE,msg['FileName'])
context.kwargs = {'isgroup': False, 'msg': msg, 'receiver': other_user_id, 'session_id': other_user_id}
thread_pool.submit(self.handle, context).add_done_callback(thread_pool_callback)
context = self._compose_context(ContextType.VOICE, msg['FileName'], isgroup=False, msg=msg, receiver=other_user_id, session_id=other_user_id)
if context:
thread_pool.submit(self.handle, context).add_done_callback(thread_pool_callback)
@time_checker
@_check
def handle_text(self, msg):
logger.debug("[WX]receive text msg: " + json.dumps(msg, ensure_ascii=False))
content = msg['Text']
@@ -124,41 +144,20 @@ class WechatChannel(Channel):
other_user_id = to_user_id
else:
other_user_id = from_user_id
create_time = msg['CreateTime'] # 消息时间
match_prefix = check_prefix(content, conf().get('single_chat_prefix'))
if conf().get('hot_reload') == True and int(create_time) < int(time.time()) - 60: # 跳过1分钟前的历史消息
logger.debug("[WX]history message skipped")
return
if "\n- - - - - - - - - - - - - - -" in content:
logger.debug("[WX]reference query skipped")
return
if match_prefix:
content = content.replace(match_prefix, '', 1).strip()
elif match_prefix is None:
return
context = Context()
context.kwargs = {'isgroup': False, 'msg': msg,
'receiver': other_user_id, 'session_id': other_user_id}
img_match_prefix = check_prefix(content, conf().get('image_create_prefix'))
if img_match_prefix:
content = content.replace(img_match_prefix, '', 1).strip()
context.type = ContextType.IMAGE_CREATE
else:
context.type = ContextType.TEXT
context.content = content
thread_pool.submit(self.handle, context).add_done_callback(thread_pool_callback)
context = self._compose_context(ContextType.TEXT, content, isgroup=False, msg=msg, receiver=other_user_id, session_id=other_user_id)
if context:
thread_pool.submit(self.handle, context).add_done_callback(thread_pool_callback)
@time_checker
@_check
def handle_group(self, msg):
logger.debug("[WX]receive group msg: " + json.dumps(msg, ensure_ascii=False))
group_name = msg['User'].get('NickName', None)
group_id = msg['User'].get('UserName', None)
create_time = msg['CreateTime'] # 消息时间
if conf().get('hot_reload') == True and int(create_time) < int(time.time()) - 60: # 跳过1分钟前的历史消息
logger.debug("[WX]history group message skipped")
return
if not group_name:
return ""
origin_content = msg['Content']
@@ -172,13 +171,75 @@ class WechatChannel(Channel):
if "\n- - - - - - - - - - - - - - -" in content:
logger.debug("[WX]reference query skipped")
return ""
config = conf()
match_prefix = (msg['IsAt'] and not config.get("group_at_off", False)) or check_prefix(origin_content, config.get('group_chat_prefix')) \
or check_contain(origin_content, config.get('group_chat_keyword'))
if ('ALL_GROUP' in config.get('group_name_white_list') or group_name in config.get('group_name_white_list') or check_contain(group_name, config.get('group_name_keyword_white_list'))) and match_prefix:
context = Context()
context.kwargs = { 'isgroup': True, 'msg': msg, 'receiver': group_id}
config = conf()
group_name_white_list = config.get('group_name_white_list', [])
group_name_keyword_white_list = config.get('group_name_keyword_white_list', [])
if any([group_name in group_name_white_list, 'ALL_GROUP' in group_name_white_list, check_contain(group_name, group_name_keyword_white_list)]):
group_chat_in_one_session = conf().get('group_chat_in_one_session', [])
session_id = msg['ActualUserName']
if any([group_name in group_chat_in_one_session, 'ALL_GROUP' in group_chat_in_one_session]):
session_id = group_id
context = self._compose_context(ContextType.TEXT, content, isgroup=True, msg=msg, receiver=group_id, session_id=session_id)
if context:
thread_pool.submit(self.handle, context).add_done_callback(thread_pool_callback)
@time_checker
@_check
def handle_group_voice(self, msg):
    """Handle a voice message received in a group chat.

    Does nothing unless the ``group_speech_recognition`` setting is
    ``True``. The group must have a name and be whitelisted (by exact name,
    by the ``'ALL_GROUP'`` wildcard, or by a name keyword). A context of
    type ``ContextType.VOICE`` is then composed and, if one is returned,
    submitted to the thread pool for ``self.handle``.

    Returns ``None``, or ``""`` when the group has no name.
    """
    if conf().get('group_speech_recognition', False) != True:
        return
    logger.debug("[WX]receive voice for group msg: " + msg['FileName'])

    group = msg['User']
    group_name = group.get('NickName', None)
    group_id = group.get('UserName', None)
    # Validate the group name.
    if not group_name:
        return ""

    config = conf()
    name_white_list = config.get('group_name_white_list', [])
    keyword_white_list = config.get('group_name_keyword_white_list', [])
    # All three checks are evaluated up front, exactly like the list passed
    # to any() in the previous formulation.
    listed_by_name = group_name in name_white_list
    all_groups_allowed = 'ALL_GROUP' in name_white_list
    listed_by_keyword = check_contain(group_name, keyword_white_list)
    if not (listed_by_name or all_groups_allowed or listed_by_keyword):
        return

    shared_session_groups = conf().get('group_chat_in_one_session', [])
    sender_id = msg['ActualUserName']
    # Groups configured to share one session use the group id as the
    # session id; otherwise each sender gets a separate session.
    if group_name in shared_session_groups or 'ALL_GROUP' in shared_session_groups:
        session_id = group_id
    else:
        session_id = sender_id

    context = self._compose_context(ContextType.VOICE, msg['FileName'], isgroup=True, msg=msg, receiver=group_id, session_id=session_id)
    if context:
        thread_pool.submit(self.handle, context).add_done_callback(thread_pool_callback)
# 根据消息构造context消息内容相关的触发项写在这里
def _compose_context(self, ctype: ContextType, content, **kwargs):
context = Context(ctype, content)
context.kwargs = kwargs
if 'origin_ctype' not in context:
context['origin_ctype'] = ctype
if ctype == ContextType.TEXT:
if context["isgroup"]: # 群聊
# 校验关键字
match_prefix = check_prefix(content, conf().get('group_chat_prefix'))
match_contain = check_contain(content, conf().get('group_chat_keyword'))
if match_prefix is not None or match_contain is not None:
# 判断如果匹配到自定义前缀,则返回过滤掉前缀+空格后的内容,用于实现类似自定义+前缀触发生成AI图片的功能
if match_prefix:
content = content.replace(match_prefix, '', 1).strip()
elif context['msg']['IsAt'] and not conf().get("group_at_off", False):
logger.info("[WX]receive group at, continue")
elif context["origin_ctype"] == ContextType.VOICE:
logger.info("[WX]receive group voice, checkprefix didn't match")
return None
else:
return None
else: # 单聊
match_prefix = check_prefix(content, conf().get('single_chat_prefix'))
if match_prefix is not None: # 判断如果匹配到自定义前缀,则返回过滤掉前缀+空格后的内容
content = content.replace(match_prefix, '', 1).strip()
elif context["origin_ctype"] == ContextType.VOICE: # 如果源消息是私聊的语音消息,允许不匹配前缀,放宽条件
pass
else:
return None
img_match_prefix = check_prefix(content, conf().get('image_create_prefix'))
if img_match_prefix:
content = content.replace(img_match_prefix, '', 1).strip()
@@ -186,44 +247,11 @@ class WechatChannel(Channel):
else:
context.type = ContextType.TEXT
context.content = content
group_chat_in_one_session = conf().get('group_chat_in_one_session', [])
if ('ALL_GROUP' in group_chat_in_one_session or
group_name in group_chat_in_one_session or
check_contain(group_name, group_chat_in_one_session)):
context['session_id'] = group_id
else:
context['session_id'] = msg['ActualUserName']
thread_pool.submit(self.handle, context).add_done_callback(thread_pool_callback)
def handle_group_voice(self, msg):
if conf().get('group_speech_recognition', False) != True:
return
logger.debug("[WX]receive voice for group msg: " + msg['FileName'])
group_name = msg['User'].get('NickName', None)
group_id = msg['User'].get('UserName', None)
create_time = msg['CreateTime'] # 消息时间
if conf().get('hot_reload') == True and int(create_time) < int(time.time()) - 60: #跳过1分钟前的历史消息
logger.debug("[WX]history group voice skipped")
return
# 验证群名
if not group_name:
return ""
if ('ALL_GROUP' in conf().get('group_name_white_list') or group_name in conf().get('group_name_white_list') or check_contain(group_name, conf().get('group_name_keyword_white_list'))):
context = Context(ContextType.VOICE,msg['FileName'])
context.kwargs = {'isgroup': True, 'msg': msg, 'receiver': group_id}
group_chat_in_one_session = conf().get('group_chat_in_one_session', [])
if ('ALL_GROUP' in group_chat_in_one_session or
group_name in group_chat_in_one_session or
check_contain(group_name, group_chat_in_one_session)):
context['session_id'] = group_id
else:
context['session_id'] = msg['ActualUserName']
thread_pool.submit(self.handle, context).add_done_callback(thread_pool_callback)
elif context.type == ContextType.VOICE:
if 'desire_rtype' not in context and conf().get('voice_reply_voice'):
context['desire_rtype'] = ReplyType.VOICE
return context
# 统一的发送函数每个Channel自行实现根据reply的type字段发送不同类型的消息
def send(self, reply: Reply, receiver, retry_cnt = 0):
try:
@@ -257,23 +285,29 @@ class WechatChannel(Channel):
self.send(reply, receiver, retry_cnt + 1)
# 处理消息 TODO: 如果wechaty解耦此处逻辑可以放置到父类
def handle(self, context):
if not context.content:
return
reply = Reply()
def handle(self, context: Context):
    """Run one context through the generate -> decorate -> send pipeline.

    Returns immediately when ``context`` is ``None`` or has empty content.
    """
    if context is None:
        return
    if not context.content:
        return
    logger.debug('[WX] ready to handle context: {}'.format(context))
    # Step 1: build the reply.
    generated = self._generate_reply(context)
    logger.debug('[WX] ready to decorate reply: {}'.format(generated))
    # Step 2: decorate the reply.
    decorated = self._decorate_reply(context, generated)
    # Step 3: send the reply.
    self._send_reply(context, decorated)
def _generate_reply(self, context: Context, reply: Reply = Reply()) -> Reply:
e_context = PluginManager().emit_event(EventContext(Event.ON_HANDLE_CONTEXT, {
'channel': self, 'context': context, 'reply': reply}))
reply = e_context['reply']
if not e_context.is_pass():
logger.debug('[WX] ready to handle context: type={}, content={}'.format(context.type, context.content))
if context.type == ContextType.TEXT or context.type == ContextType.IMAGE_CREATE: # 文字和图片消息
if context.type == ContextType.TEXT or context.type == ContextType.IMAGE_CREATE: # 文字和图片消息
reply = super().build_reply_content(context.content, context)
elif context.type == ContextType.VOICE: # 语音消息
elif context.type == ContextType.VOICE: # 语音消息
msg = context['msg']
mp3_path = TmpDir().path() + context.content
msg.download(mp3_path)
@@ -281,7 +315,7 @@ class WechatChannel(Channel):
wav_path = os.path.splitext(mp3_path)[0] + '.wav'
try:
mp3_to_wav(mp3_path=mp3_path, wav_path=wav_path)
except Exception as e: # 转换失败直接使用mp3对于某些apimp3也可以识别
except Exception as e: # 转换失败直接使用mp3对于某些apimp3也可以识别
logger.warning("[WX]mp3 to wav error, use mp3 path. " + str(e))
wav_path = mp3_path
# 语音识别
@@ -293,50 +327,30 @@ class WechatChannel(Channel):
except Exception as e:
logger.warning("[WX]delete temp file error: " + str(e))
if reply.type != ReplyType.ERROR and reply.type != ReplyType.INFO:
content = reply.content # 语音转文字后,将文字内容作为新的context
context.type = ContextType.TEXT
if context["isgroup"]: # 群聊
# 校验关键字
match_prefix = check_prefix(content, conf().get('group_chat_prefix'))
match_contain = check_contain(content, conf().get('group_chat_keyword'))
if match_prefix is not None or match_contain is not None:
# 判断如果匹配到自定义前缀,则返回过滤掉前缀+空格后的内容,用于实现类似自定义+前缀触发生成AI图片的功能
if match_prefix:
content = content.replace(match_prefix, '', 1).strip()
else:
logger.info("[WX]receive voice, checkprefix didn't match")
return
else: # 单聊
match_prefix = check_prefix(content, conf().get('single_chat_prefix'))
if match_prefix: # 判断如果匹配到自定义前缀,则返回过滤掉前缀+空格后的内容
content = content.replace(match_prefix, '', 1).strip()
img_match_prefix = check_prefix(content, conf().get('image_create_prefix'))
if img_match_prefix:
content = content.replace(img_match_prefix, '', 1).strip()
context.type = ContextType.IMAGE_CREATE
if reply.type == ReplyType.TEXT:
new_context = self._compose_context(
ContextType.TEXT, reply.content, **context.kwargs)
if new_context:
reply = self._generate_reply(new_context)
else:
context.type = ContextType.TEXT
context.content = content
reply = super().build_reply_content(context.content, context)
if reply.type == ReplyType.TEXT:
if conf().get('voice_reply_voice'):
reply = super().build_text_to_voice(reply.content)
return
else:
logger.error('[WX] unknown context type: {}'.format(context.type))
return
return reply
logger.debug('[WX] ready to decorate reply: {}'.format(reply))
# reply的包装步骤
def _decorate_reply(self, context: Context, reply: Reply) -> Reply:
if reply and reply.type:
e_context = PluginManager().emit_event(EventContext(Event.ON_DECORATE_REPLY, {
'channel': self, 'context': context, 'reply': reply}))
reply = e_context['reply']
desire_rtype = context.get('desire_rtype')
if not e_context.is_pass() and reply and reply.type:
if reply.type == ReplyType.TEXT:
reply_text = reply.content
if desire_rtype == ReplyType.VOICE:
reply = super().build_text_to_voice(reply.content)
return self._decorate_reply(context, reply)
if context['isgroup']:
reply_text = '@' + context['msg']['ActualNickName'] + ' ' + reply_text.strip()
reply_text = conf().get("group_chat_reply_prefix", "")+reply_text
@@ -350,8 +364,11 @@ class WechatChannel(Channel):
else:
logger.error('[WX] unknown reply type: {}'.format(reply.type))
return
if desire_rtype and desire_rtype != reply.type and reply.type not in [ReplyType.ERROR, ReplyType.INFO]:
logger.warning('[WX] desire_rtype: {}, but reply type: {}'.format(context.get('desire_rtype'), reply.type))
return reply
# reply的发送步骤
def _send_reply(self, context: Context, reply: Reply):
if reply and reply.type:
e_context = PluginManager().emit_event(EventContext(Event.ON_SEND_REPLY, {
'channel': self, 'context': context, 'reply': reply}))
@@ -360,6 +377,7 @@ class WechatChannel(Channel):
logger.debug('[WX] ready to send reply: {} to {}'.format(reply, context['receiver']))
self.send(reply, context['receiver'])
def check_prefix(content, prefix_list):
for prefix in prefix_list:
if content.startswith(prefix):

View File

@@ -70,6 +70,8 @@ available_setting = {
# chatgpt指令自定义触发词
"clear_memory_commands": ['#清除记忆'], # 重置会话指令
# channel配置
"channel_type": "wx", # 通道类型支持wx,wxy和terminal

55
voice/baidu/README.md Normal file
View File

@@ -0,0 +1,55 @@
## 说明
百度语音识别与合成参数说明
百度语音依赖,经常会出现问题,可能就是缺少依赖:
pip install baidu-aip
pip install pydub
pip install pysilk
还有ffmpeg,不同系统安装方式不同。
系统中收到的语音文件为mp3格式(wx)或者sil格式(wxy),如果要识别,需要转换为pcm格式,转换后的文件为16k采样率、单声道、16bit的pcm文件。
发送时又需要wx转换为mp3格式,转换后的文件为16k采样率、单声道、16bit的pcm文件,wxy转换为sil格式,还要计算声音长度,发送时需要带上声音长度。
这些事情都在audio_convert.py中封装了,直接调用即可。
参数说明
识别参数
https://ai.baidu.com/ai-doc/SPEECH/Vk38lxily
合成参数
https://ai.baidu.com/ai-doc/SPEECH/Gk38y8lzk
## 使用说明
分两个地方配置
1、对于def voiceToText(self, filename)函数中调用的百度语音识别API,其中接口调用asr的参数配置见CHATGPT-ON-WECHAT工程目录下的`config.json`文件和config.py文件。
参数 可需 描述
app_id 必填 应用的APPID
api_key 必填 应用的APIKey
secret_key 必填 应用的SecretKey
dev_pid 必填 语言选择,填写语言对应的dev_pid值
2、对于def textToVoice(self, text)函数中调用的百度语音合成API,其中接口调用synthesis的参数在本目录下的`config.json`文件中进行配置。
参数 可需 描述
tex 必填 合成的文本,使用UTF-8编码。请注意文本长度必须小于1024字节
lan 必填 固定值zh。语言选择,目前只有中英文混合模式,填写固定值zh(对应config.json中的lang)
spd 选填 语速,取值0-15,默认为5中语速
pit 选填 音调,取值0-15,默认为5中语调
vol 选填 音量,取值0-15,默认为5中音量(取值为0时为音量最小值,并非为无声)
per(基础音库) 选填 度小宇=1,度小美=0,度逍遥(基础)=3,度丫丫=4
per(精品音库) 选填 度逍遥(精品)=5003,度小鹿=5118,度博文=106,度小童=110,度小萌=111,度米朵=103,度小娇=5
aue 选填 3为mp3格式(默认);4为pcm-16k;5为pcm-8k;6为wav(内容同pcm-16k)。注意aue=4或者6是语音识别要求的格式,但是音频内容不是语音识别要求的自然人发音,所以识别效果会受影响。
关于per参数的说明:注意您购买的哪个音库,就填写哪个音库的参数,否则会报错。如果您购买的是基础音库,那么per参数只能填写0到4;如果您购买的是精品音库,那么per参数只能填写5003,5118,106,110,111,103,5,其他的都会报错。
### 配置文件
将文件夹中`config.json.template`复制为`config.json`
``` json
{
"lang": "zh",
"ctp": 1,
"spd": 5,
"pit": 5,
"vol": 5,
"per": 0
}
```

View File

@@ -2,6 +2,8 @@
"""
baidu voice service
"""
import json
import os
import time
from aip import AipSpeech
from bridge.reply import Reply, ReplyType
@@ -21,29 +23,47 @@ from config import conf
- 1837四川话
要使用本模块, 首先到 yuyin.baidu.com 注册一个开发者账号,
之后创建一个新应用, 然后在应用管理的"查看key"中获得 API Key 和 Secret Key
填入 config.json 中.
baidu_app_id: ''
baidu_api_key: ''
baidu_secret_key: ''
baidu_dev_pid: '1536'
"""
然后在 config.json 中填入这两个值, 以及 app_id, dev_pid
"""
class BaiduVoice(Voice):
APP_ID = conf().get('baidu_app_id')
API_KEY = conf().get('baidu_api_key')
SECRET_KEY = conf().get('baidu_secret_key')
DEV_ID = conf().get('baidu_dev_pid')
client = AipSpeech(APP_ID, API_KEY, SECRET_KEY)
def __init__(self):
    """Load credentials and synthesis settings, then create the Baidu client.

    Synthesis settings (``lang``, ``ctp``, ``spd``, ``pit``, ``vol``,
    ``per``) are read from ``config.json`` next to this module. When the
    file does not exist it is created with the default values. Keys missing
    from an existing file fall back to the same defaults instead of
    aborting initialisation.

    Credentials (``baidu_app_id``, ``baidu_api_key``, ``baidu_secret_key``)
    and ``baidu_dev_pid`` come from the project configuration via ``conf()``.

    Initialisation is best-effort: any exception is logged as a warning and
    swallowed, in which case attributes assigned after the failing step
    (possibly including ``self.client``) are not set.
    """
    defaults = {"lang": "zh", "ctp": 1, "spd": 5,
                "pit": 5, "vol": 5, "per": 0}
    try:
        curdir = os.path.dirname(__file__)
        config_path = os.path.join(curdir, "config.json")
        if not os.path.exists(config_path):
            # No local config yet: write the defaults so users can edit them.
            bconf = dict(defaults)
            with open(config_path, "w", encoding="utf-8") as fw:
                json.dump(bconf, fw, indent=4)
        else:
            with open(config_path, "r", encoding="utf-8") as fr:
                # Values from the file win; defaults fill in missing keys.
                bconf = {**defaults, **json.load(fr)}

        self.app_id = conf().get('baidu_app_id')
        self.api_key = conf().get('baidu_api_key')
        self.secret_key = conf().get('baidu_secret_key')
        self.dev_id = conf().get('baidu_dev_pid')
        self.lang = bconf["lang"]
        self.ctp = bconf["ctp"]
        self.spd = bconf["spd"]
        self.pit = bconf["pit"]
        self.vol = bconf["vol"]
        self.per = bconf["per"]
        self.client = AipSpeech(self.app_id, self.api_key, self.secret_key)
    except Exception as e:
        # logger.warn is a deprecated alias (removed in Python 3.13);
        # the formatted message is unchanged.
        logger.warning("BaiduVoice init failed: %s, ignore ", e)
def voiceToText(self, voice_file):
# 识别本地文件
logger.debug('[Baidu] voice file name={}'.format(voice_file))
pcm = get_pcm_from_wav(voice_file)
res = self.client.asr(pcm, "pcm", 16000, {"dev_pid": self.DEV_ID})
res = self.client.asr(pcm, "pcm", 16000, {"dev_pid": self.dev_id})
if res["err_no"] == 0:
logger.info("百度语音识别到了:{}".format(res["result"]))
text = "".join(res["result"])
@@ -57,9 +77,8 @@ class BaiduVoice(Voice):
return reply
def textToVoice(self, text):
result = self.client.synthesis(text, 'zh', 1, {
'spd': 5, 'pit': 5, 'vol': 5, 'per': 111
})
result = self.client.synthesis(text, self.lang, self.ctp, {
'spd': self.spd, 'pit': self.pit, 'vol': self.vol, 'per': self.per})
if not isinstance(result, dict):
fileName = TmpDir().path() + '语音回复_' + str(int(time.time())) + '.mp3'
with open(fileName, 'wb') as f:

View File

@@ -0,0 +1,8 @@
{
"lang": "zh",
"ctp": 1,
"spd": 5,
"pit": 5,
"vol": 5,
"per": 0
}