Files
chatgpt-on-wechat/voice/audio_convert.py
zhayujie b80c3fe5a8 feat(feishu): enhance #2791 with cardkit streaming + ASR fixes
- rewrite streaming reply to official cardkit v2.0 API (default on, auto-fallback)
- fix Whisper hallucination: bump ASR sample rate to 16k, pass language=zh
- fix lock-over-IO and tmp file cleanup from #2791
- drop deprecated feishu_bot_name; quiet unknown-key warnings
- docs: cardkit permission and feishu_stream_reply usage
2026-05-05 14:15:25 +08:00

157 lines
5.4 KiB
Python
Raw Permalink Blame History

This file contains ambiguous Unicode characters
This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.
import shutil
import wave
from common.log import logger
try:
import pysilk
except ImportError:
logger.debug("import pysilk failed, silk voice format will not be supported.")
try:
from pydub import AudioSegment
_pydub_available = True
except ImportError:
logger.debug("import pydub failed, voice conversion features will not be supported.")
AudioSegment = None
_pydub_available = False
sil_supports = [8000, 12000, 16000, 24000, 32000, 44100, 48000]  # sample rates supported when converting slk to wav
def find_closest_sil_supports(sample_rate):
    """
    Return the entry of ``sil_supports`` that is closest to ``sample_rate``.

    :param sample_rate: the sample rate to match
    :returns: ``sample_rate`` itself when it is already listed in
        ``sil_supports``; otherwise the listed rate with the smallest
        absolute difference. On a tie the entry that appears first in
        ``sil_supports`` wins.
    """
    if sample_rate in sil_supports:
        return sample_rate
    # min() has no upper bound on the distance, so an unusually large rate
    # still maps to the nearest supported value instead of falling through
    # to a 0 placeholder.
    return min(sil_supports, key=lambda rate: abs(rate - sample_rate))
def get_pcm_from_wav(wav_path):
    """
    Read the PCM frames stored in a wav file.

    :param wav_path: path of the wav file
    :returns: all audio frames of the file as bytes
    """
    # The context manager closes the underlying file handle even if reading
    # fails, so the handle is not leaked.
    with wave.open(wav_path, "rb") as wav:
        return wav.readframes(wav.getnframes())
def any_to_mp3(any_path, mp3_path):
    """
    Convert an audio file of any supported format to an mp3 file.

    A ``.mp3`` input is copied unchanged. A silk input (``.sil``/``.silk``/
    ``.slk``) is first decoded to a temporary wav file and then exported as
    mp3. Every other input is loaded with pydub directly.

    :param any_path: path of the source audio file (left untouched)
    :param mp3_path: path of the mp3 file to write
    :raises ImportError: if pydub is not installed
    """
    import os
    import tempfile

    if not _pydub_available:
        raise ImportError("pydub is required for audio conversion. Please install it with: pip install pydub")
    if any_path.endswith(".mp3"):
        shutil.copy2(any_path, mp3_path)
        return
    if any_path.endswith((".sil", ".silk", ".slk")):
        # Decode into a scratch wav file: writing the decoded data over the
        # silk input would destroy it, and mp3_path does not hold any audio
        # yet, so neither of them can serve as the intermediate file.
        fd, tmp_wav_path = tempfile.mkstemp(suffix=".wav")
        os.close(fd)
        try:
            sil_to_wav(any_path, tmp_wav_path)
            audio = AudioSegment.from_file(tmp_wav_path)
            audio.export(mp3_path, format="mp3")
        finally:
            os.remove(tmp_wav_path)
        return
    audio = AudioSegment.from_file(any_path)
    audio.export(mp3_path, format="mp3")
def any_to_wav(any_path, wav_path):
    """
    Convert an audio file of any supported format to a wav file.

    A ``.wav`` input is copied unchanged and a silk input (``.sil``/``.silk``/
    ``.slk``) is decoded through ``sil_to_wav``. Anything else is loaded with
    pydub, resampled to 16000 Hz, reduced to one channel and exported as
    ``pcm_s16le`` wav.

    :param any_path: path of the source audio file
    :param wav_path: path of the wav file to write
    :raises ImportError: if pydub is not installed
    """
    if not _pydub_available:
        raise ImportError("pydub is required for audio conversion. Please install it with: pip install pydub")

    silk_suffixes = (".sil", ".silk", ".slk")
    if any_path.endswith(silk_suffixes):
        return sil_to_wav(any_path, wav_path)
    if any_path.endswith(".wav"):
        shutil.copy2(any_path, wav_path)
        return

    # pydub 0.23.0+ appends `parameters` after the output file `-` of the
    # ffmpeg command line, so -nostdin may be handled as a trailing option and
    # whether it takes effect depends on the ffmpeg version. The intent is to
    # keep the ffmpeg child process from inheriting the parent's stdin when
    # running as a background service, which avoids a deadlock.
    source = AudioSegment.from_file(any_path, parameters=["-nostdin"])
    # AudioSegment is immutable: set_frame_rate/set_channels return new
    # objects, so the result of the chain is what has to be exported.
    mono_16k = source.set_frame_rate(16000).set_channels(1)
    mono_16k.export(wav_path, format="wav", codec="pcm_s16le")
def any_to_sil(any_path, sil_path):
    """
    Convert an audio file of any supported format to a silk file.

    :param any_path: path of the source audio file
    :param sil_path: path of the silk file to write
    :returns: the fixed value 10000 when the input already has a silk
        extension and is merely copied; otherwise the duration of the source
        audio in milliseconds
    :raises ImportError: if pydub is not installed
    """
    if not _pydub_available:
        raise ImportError("pydub is required for audio conversion. Please install it with: pip install pydub")

    if any_path.endswith((".sil", ".silk", ".slk")):
        shutil.copy2(any_path, sil_path)
        return 10000

    source = AudioSegment.from_file(any_path)
    target_rate = find_closest_sil_supports(source.frame_rate)
    # 16-bit samples at the chosen rate are what gets handed to the encoder.
    pcm_bytes = source.set_sample_width(2).set_frame_rate(target_rate).raw_data
    encoded = pysilk.encode(pcm_bytes, data_rate=target_rate, sample_rate=target_rate)
    with open(sil_path, "wb") as sil_file:
        sil_file.write(encoded)
    return source.duration_seconds * 1000
def any_to_amr(any_path, amr_path):
    """
    Convert an audio file of any supported format to an amr file.

    :param any_path: path of the source audio file
    :param amr_path: path of the amr file to write
    :returns: ``None`` when an ``.amr`` input is merely copied; otherwise the
        duration of the resampled audio in milliseconds
    :raises ImportError: if pydub is not installed
    :raises NotImplementedError: if the input has a silk extension
        (``.sil``/``.silk``/``.slk``)
    """
    if not _pydub_available:
        raise ImportError("pydub is required for audio conversion. Please install it with: pip install pydub")

    if any_path.endswith((".sil", ".silk", ".slk")):
        raise NotImplementedError("Not support file type: {}".format(any_path))
    if any_path.endswith(".amr"):
        shutil.copy2(any_path, amr_path)
        return

    # The amr export only supports an 8000 frame rate.
    resampled = AudioSegment.from_file(any_path).set_frame_rate(8000)
    resampled.export(amr_path, format="amr")
    return resampled.duration_seconds * 1000
def sil_to_wav(silk_path, wav_path, rate: int = 24000):
    """
    Decode a silk file and write the result as a wav file.

    :param silk_path: path of the silk file to decode
    :param wav_path: path of the wav file to write
    :param rate: sample rate passed to the silk decoder
    """
    # Decode before opening the destination so that a decoding failure does
    # not create or truncate wav_path.
    decoded = pysilk.decode_file(silk_path, to_wav=True, sample_rate=rate)
    with open(wav_path, "wb") as wav_file:
        wav_file.write(decoded)
def split_audio(file_path, max_segment_length_ms=60000):
    """
    Split an audio file into segments of at most ``max_segment_length_ms``.

    When the audio is not longer than the limit, nothing is written and the
    original path is returned as the only entry. Otherwise each segment is
    exported next to the source as ``<name>_<index>.<ext>`` (index starting
    at 1), using the source file extension as the export format.

    :param file_path: path of the audio file to split
    :param max_segment_length_ms: maximum length of one segment in milliseconds
    :returns: a tuple ``(audio_length_ms, files)`` holding the total length in
        milliseconds and the list of segment file paths
    :raises ImportError: if pydub is not installed
    """
    if not _pydub_available:
        raise ImportError("pydub is required for audio conversion. Please install it with: pip install pydub")

    audio = AudioSegment.from_file(file_path)
    total_ms = len(audio)
    if total_ms <= max_segment_length_ms:
        return total_ms, [file_path]

    chunks = [
        audio[begin_ms:min(total_ms, begin_ms + max_segment_length_ms)]
        for begin_ms in range(0, total_ms, max_segment_length_ms)
    ]

    # Split the path at its last dot into the stem and the extension.
    dot_index = file_path.rindex(".")
    stem = file_path[:dot_index]
    extension = file_path[dot_index + 1:]

    files = []
    for number, chunk in enumerate(chunks, start=1):
        chunk_path = f"{stem}_{number}" + f".{extension}"
        chunk.export(chunk_path, format=extension)
        files.append(chunk_path)
    return total_ms, files