fix: get correct audio format in pytts

2026-07-17 11:07:11 +08:00 · 2023-04-01 20:58:06 +08:00
parent 1545a9f262
commit 18aa5ce75c
6 changed files with 49 additions and 39 deletions
--- a/app.py
+++ b/app.py
@@ -1,5 +1,6 @@
 # encoding:utf-8
 import os
 from config import conf, load_config
 from channel import channel_factory
 from common.log import logger
@@ -13,6 +14,10 @@ def run():
        # create channel
        channel_name=conf().get('channel_type', 'wx')
        if channel_name == 'wxy':
            os.environ['WECHATY_LOG']="warn"
            # os.environ['WECHATY_PUPPET_SERVICE_ENDPOINT'] = '127.0.0.1:9001'
        channel = channel_factory.create_channel(channel_name)
        if channel_name in ['wx','wxy']:
            PluginManager().load_plugins()
--- a/channel/chat_channel.py
+++ b/channel/chat_channel.py
@@ -194,14 +194,17 @@ class ChatChannel(Channel):
                'channel': self, 'context': context, 'reply': reply}))
            reply = e_context['reply']
            if not e_context.is_pass() and reply and reply.type:
-                logger.debug('[WX] ready to send reply: {} to {}'.format(reply, context))
+                logger.debug('[WX] ready to send reply: {}, context: {}'.format(reply, context))
                self._send(reply, context)
    def _send(self, reply: Reply, context: Context, retry_cnt = 0):
        try:
            self.send(reply, context)
        except Exception as e:
-            logger.error('[WX] sendMsg error: {}'.format(e))
+            logger.error('[WX] sendMsg error: {}'.format(str(e)))
            if isinstance(e, NotImplementedError):
                return
            logger.exception(e)
            if retry_cnt < 2:
                time.sleep(3+3*retry_cnt)
                self._send(reply, context, retry_cnt+1)
--- a/channel/wechat/wechaty_channel.py
+++ b/channel/wechat/wechaty_channel.py
@@ -20,7 +20,7 @@ from channel.wechat.wechaty_message import WechatyMessage
 from common.log import logger
 from config import conf
 try:
-    from voice.audio_convert import mp3_to_sil
+    from voice.audio_convert import any_to_sil
 except Exception as e:
    pass
@@ -35,14 +35,12 @@ class WechatyChannel(ChatChannel):
        pass
    def startup(self):
        asyncio.run(self.main())
    async def main(self):
        config = conf()
        token = config.get('wechaty_puppet_service_token')
        os.environ['WECHATY_PUPPET_SERVICE_TOKEN'] = token
-        os.environ['WECHATY_LOG']="warn"
+        asyncio.run(self.main())
-        # os.environ['WECHATY_PUPPET_SERVICE_ENDPOINT'] = '127.0.0.1:9001'
+
    async def main(self):
        self.bot = Wechaty()
        self.bot.on('login', self.on_login)
        self.bot.on('message', self.on_message)
@@ -72,18 +70,9 @@ class WechatyChannel(ChatChannel):
            logger.info('[WX] sendMsg={}, receiver={}'.format(reply, receiver))
        elif reply.type == ReplyType.VOICE:
            voiceLength = None
-            if reply.content.endswith('.mp3'):
+            file_path = reply.content
-                mp3_file = reply.content
+            sil_file = os.path.splitext(file_path)[0] + '.sil'
-                sil_file = os.path.splitext(mp3_file)[0] + '.sil'
+            voiceLength = any_to_sil(file_path, sil_file)
                voiceLength = mp3_to_sil(mp3_file, sil_file)
                try:
                    os.remove(mp3_file)
                except Exception as e:
                    pass
            elif reply.content.endswith('.sil'):
                sil_file = reply.content
            else:
                raise Exception('voice file must be mp3 or sil format')
            # 发送语音
            t = int(time.time())
            msg = FileBox.from_file(sil_file, name=str(t) + '.sil')
@@ -91,6 +80,7 @@ class WechatyChannel(ChatChannel):
                msg.metadata['voiceLength'] = voiceLength
            asyncio.run_coroutine_threadsafe(receiver.say(msg),loop).result()
            try:
                os.remove(file_path)
                os.remove(sil_file)
            except Exception as e:
                pass
--- a/voice/audio_convert.py
+++ b/voice/audio_convert.py
@@ -1,8 +1,8 @@
 import shutil
 import wave
 import pysilk
 from pydub import AudioSegment
 def get_pcm_from_wav(wav_path):
    """
    从 wav 文件中读取 pcm
@@ -13,6 +13,30 @@ def get_pcm_from_wav(wav_path):
    wav = wave.open(wav_path, "rb")
    return wav.readframes(wav.getnframes())
 def any_to_wav(any_path, wav_path):
    """
    Convert an audio file of any supported format into a wav file.

    The input format is chosen from the extension of ``any_path``.

    :param any_path: path of the input audio file
    :param wav_path: path the wav output is written to
    :return: None for the wav-copy and pydub branches; for silk input,
             whatever sil_to_wav returns
    """
    # Input is already wav: copy it unchanged to the requested output path.
    # NOTE(review): shutil.copy2 raises SameFileError when both paths refer to
    # the same file -- confirm callers always pass distinct paths.
    if any_path.endswith('.wav'):
        shutil.copy2(any_path, wav_path)
        return
    # Silk-encoded input (.sil / .silk / .slk) is delegated to sil_to_wav.
    if any_path.endswith('.sil') or any_path.endswith('.silk') or any_path.endswith('.slk'):
        return sil_to_wav(any_path, wav_path)
    # Any other extension is loaded through pydub and exported as wav.
    audio = AudioSegment.from_file(any_path)
    audio.export(wav_path, format="wav")
 def any_to_sil(any_path, sil_path):
    """
    Convert an audio file into a sil (silk) file.

    The input format is chosen from the extension of ``any_path``; the
    handled extensions are .sil / .silk / .slk, .wav and .mp3.

    :param any_path: path of the input audio file
    :param sil_path: path the sil output is written to
    :return: 10000 for silk input; otherwise the return value of
             pcm_to_sil (wav input) or mp3_to_sil (mp3 input)
    :raises NotImplementedError: if the extension is none of the above
    """
    # Already silk: copy the file as-is instead of re-encoding it.
    if any_path.endswith('.sil') or any_path.endswith('.silk') or any_path.endswith('.slk'):
        shutil.copy2(any_path, sil_path)
        # NOTE(review): fixed value, not measured from the file -- presumably a
        # placeholder voice length in milliseconds; confirm against callers.
        return 10000
    # wav input is passed straight to pcm_to_sil.
    if any_path.endswith('.wav'):
        return pcm_to_sil(any_path, sil_path)
    if any_path.endswith('.mp3'):
        return mp3_to_sil(any_path, sil_path)
    # Unlike any_to_wav there is no generic fallback: other formats are rejected.
    raise NotImplementedError("Not support file type: {}".format(any_path))
 def mp3_to_wav(mp3_path, wav_path):
    """
@@ -21,18 +45,7 @@ def mp3_to_wav(mp3_path, wav_path):
    audio = AudioSegment.from_mp3(mp3_path)
    audio.export(wav_path, format="wav")
-def any_to_wav(any_path, wav_path):
+def pcm_to_sil(pcm_path, silk_path):
    """
    把任意格式转成wav文件
    """
    if any_path.endswith('.wav'):
        return
    if any_path.endswith('.sil') or any_path.endswith('.silk') or any_path.endswith('.slk'):
        return sil_to_wav(any_path, wav_path)
    audio = AudioSegment.from_file(any_path)
    audio.export(wav_path, format="wav")
 def pcm_to_silk(pcm_path, silk_path):
    """
    wav 文件转成 silk
    return 声音长度，毫秒
@@ -60,7 +73,6 @@ def mp3_to_sil(mp3_path, silk_path):
        f.write(silk_data)
    return audio.duration_seconds * 1000
 def sil_to_wav(silk_path, wav_path, rate: int = 24000):
    """
    silk 文件转 wav
--- a/voice/azure/azure_voice.py
+++ b/voice/azure/azure_voice.py
@@ -56,7 +56,7 @@ class AzureVoice(Voice):
        return reply
    def textToVoice(self, text):
-        fileName = TmpDir().path() + '语音回复_' + str(int(time.time())) + '.mp3'
+        fileName = TmpDir().path() + '语音回复_' + str(int(time.time())) + '.wav'
        audio_config = speechsdk.AudioConfig(filename=fileName)
        speech_synthesizer = speechsdk.SpeechSynthesizer(speech_config=self.speech_config, audio_config=audio_config)
        result = speech_synthesizer.speak_text(text)
--- a/voice/pytts/pytts_voice.py
+++ b/voice/pytts/pytts_voice.py
@@ -25,12 +25,12 @@ class PyttsVoice(Voice):
    def textToVoice(self, text):
        try:
-            mp3File = TmpDir().path() + '语音回复_' + str(int(time.time())) + '.mp3'
+            wavFile = TmpDir().path() + '语音回复_' + str(int(time.time())) + '.wav'
-            self.engine.save_to_file(text, mp3File)
+            self.engine.save_to_file(text, wavFile)
            self.engine.runAndWait()
            logger.info(
-                '[Pytts] textToVoice text={} voice file name={}'.format(text, mp3File))
+                '[Pytts] textToVoice text={} voice file name={}'.format(text, wavFile))
-            reply = Reply(ReplyType.VOICE, mp3File)
+            reply = Reply(ReplyType.VOICE, wavFile)
        except Exception as e:
            reply = Reply(ReplyType.ERROR, str(e))
        finally: