feat: 接入 Edge TTS 免费中文语音合成

设备不支持内置中文 TTS,改用微软 Edge TTS(WebSocket 协议)。
- 新增 EdgeTtsManager:WebSocket 调用、MP3 缓存、MediaPlayer 播放
- 调试模式(设置页点头像6次)触发 TTS 测试
- 缓存机制:相同文本不重复请求,上限50条自动清理

Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>
This commit is contained in:
dongliang
2026-05-08 12:33:59 +09:30
parent 557363c80d
commit 5a386a6060
2 changed files with 355 additions and 38 deletions

View File

@@ -0,0 +1,343 @@
package com.xiaoqu.watch.service.manager
import android.content.Context
import android.media.MediaPlayer
import dagger.hilt.android.qualifiers.ApplicationContext
import kotlinx.coroutines.*
import okhttp3.*
import okio.ByteString
import timber.log.Timber
import java.io.ByteArrayOutputStream
import java.io.File
import java.security.MessageDigest
import java.text.SimpleDateFormat
import java.util.*
import javax.inject.Inject
import javax.inject.Singleton
/**
* Edge TTS 语音合成管理器
* 使用微软 Edge 浏览器的免费 TTS 服务,支持高质量中文语音合成。
* 通过 WebSocket 连接,发送文本,接收 MP3 音频并播放。
*/
@Singleton
class EdgeTtsManager @Inject constructor(
@ApplicationContext private val context: Context,
private val okHttpClient: OkHttpClient
) {
companion object {
// Prefix used in every Timber log message emitted by this class.
private const val TAG = "EdgeTTS"
/** Base WebSocket URL of the Edge read-aloud synthesis endpoint. */
private const val WSS_BASE = "wss://speech.platform.bing.com/consumer/speech/synthesize/readaloud/edge/v1"
/** Fixed trusted-client token; sent as the TrustedClientToken query parameter and hashed into the Sec-MS-GEC token. */
private const val TRUSTED_TOKEN = "6A5AA1D4EAFF4E9FB37E23D68491D6F4"
/** Value sent as the Sec-MS-GEC-Version query parameter. */
private const val GEC_VERSION = "1-143.0.3650.75"
/** Seconds added to Unix time to move from the Unix epoch to the Windows file-time epoch. */
private const val WINDOWS_EPOCH_OFFSET = 11644473600L
/** Default Chinese female voice (Xiaoxiao, neural voice). */
const val VOICE_XIAOXIAO = "zh-CN-XiaoxiaoNeural"
/** Chinese male voice (Yunxi). */
const val VOICE_YUNXI = "zh-CN-YunxiNeural"
/** Audio output format requested in speech.config: 24 kHz, 48 kbps, mono MP3. */
private const val OUTPUT_FORMAT = "audio-24khz-48kbitrate-mono-mp3"
/** Name of the sub-directory (under the app cache dir) holding synthesized MP3 files. */
private const val CACHE_DIR = "tts_cache"
/** Maximum number of cached files kept before the oldest ones are deleted. */
private const val MAX_CACHE_SIZE = 50
}
/** MediaPlayer used for the current playback, or null when none is held. */
private var mediaPlayer: MediaPlayer? = null
/** Whether playback is in progress; updated by playAudio, stop and the player listeners. */
var isPlaying: Boolean = false
private set
/** Clock offset in seconds added to the local time when generating the GEC token; updated by adjustClockSkew. */
private var clockSkewSeconds: Long = 0
/** Callback invoked from the MediaPlayer completion listener when playback finishes. */
var onComplete: (() -> Unit)? = null
/** Request started by the most recent [speak] call; cancelled when a newer request arrives. */
private var speakJob: Job? = null

/**
 * Synthesizes [text] and plays it, using the cached MP3 when one exists.
 *
 * Any playback in progress is stopped and any request still being synthesized is cancelled,
 * so an older, slower request can never start playing over a newer one.
 *
 * @param text text to read aloud; blank text is ignored
 * @param voice voice name, defaults to [VOICE_XIAOXIAO]
 * @param onError invoked on the main thread with a message when synthesis fails
 */
fun speak(text: String, voice: String = VOICE_XIAOXIAO, onError: ((String) -> Unit)? = null) {
    if (text.isBlank()) return
    // Abandon the previous request before starting a new one.
    speakJob?.cancel()
    stop()
    speakJob = CoroutineScope(Dispatchers.IO).launch {
        try {
            // Cache lookup
            val cacheFile = getCacheFile(text, voice)
            if (cacheFile.exists()) {
                Timber.d("$TAG: 命中缓存 ${cacheFile.name}")
                playAudio(cacheFile)
                return@launch
            }
            // Call the Edge TTS API
            val skewBefore = clockSkewSeconds
            var audioData = synthesize(text, voice)
            if (audioData == null && clockSkewSeconds != skewBefore) {
                // The failure handler corrected the clock offset from the server's Date header.
                // The GEC token depends on that offset, so retry once with the corrected value.
                audioData = synthesize(text, voice)
            }
            if (audioData != null && audioData.isNotEmpty()) {
                // Store in the cache, then play from the cached file
                saveToCacheDir(cacheFile, audioData)
                playAudio(cacheFile)
            } else {
                Timber.w("$TAG: 合成返回空数据")
                withContext(Dispatchers.Main) { onError?.invoke("语音合成失败") }
            }
        } catch (e: CancellationException) {
            // Cancellation is not an error; let it propagate so the coroutine ends normally.
            throw e
        } catch (e: Exception) {
            Timber.e(e, "$TAG: 语音合成异常")
            withContext(Dispatchers.Main) { onError?.invoke("语音合成异常: ${e.message}") }
        }
    }
}
/**
 * Stops playback, if any, and releases the current MediaPlayer.
 *
 * Safe to call when nothing is playing. Afterwards [mediaPlayer] is null and [isPlaying] is false.
 * The player is released even when stopping it throws, so it cannot leak.
 */
fun stop() {
    val player = mediaPlayer
    mediaPlayer = null
    isPlaying = false
    if (player == null) return
    try {
        // Explicit receiver: this is the player's own state, not the manager's flag.
        if (player.isPlaying) player.stop()
    } catch (e: Exception) {
        // Best effort: the player may be in a state where stop() is not allowed.
        Timber.w(e, "$TAG: 停止播放异常")
    }
    try {
        player.release()
    } catch (e: Exception) {
        Timber.w(e, "$TAG: 释放播放器异常")
    }
}
/**
 * Synthesizes speech through the Edge TTS WebSocket endpoint.
 *
 * Opens a WebSocket, sends the speech.config message followed by the SSML request, and
 * collects the audio payload of every binary frame until a text frame containing
 * "Path:turn.end" arrives.
 *
 * @param text text to synthesize
 * @param voice voice name placed in the SSML voice element
 * @return the collected MP3 bytes once the turn has ended; null when the connection failed
 *         or was closed before "turn.end" was received (so truncated audio is never returned)
 */
private suspend fun synthesize(text: String, voice: String): ByteArray? {
    return suspendCancellableCoroutine { continuation ->
        val connectionId = UUID.randomUUID().toString().replace("-", "")
        val requestId = UUID.randomUUID().toString().replace("-", "")
        val gecToken = generateGecToken()
        val url = "$WSS_BASE?TrustedClientToken=$TRUSTED_TOKEN" +
            "&ConnectionId=$connectionId" +
            "&Sec-MS-GEC=$gecToken" +
            "&Sec-MS-GEC-Version=$GEC_VERSION"
        val request = Request.Builder()
            .url(url)
            .header("User-Agent", "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/143.0.0.0 Safari/537.36 Edg/143.0.0.0")
            .header("Origin", "chrome-extension://jdiccldimpdaibmpdkjnbmckianbfold")
            .build()
        val audioBuffer = ByteArrayOutputStream()
        // Guards the single resume. Atomic because listener callbacks are delivered on
        // OkHttp's threads, not on the coroutine's thread.
        val resumed = java.util.concurrent.atomic.AtomicBoolean(false)

        // Resumes the coroutine at most once, whichever callback gets there first.
        fun finish(result: ByteArray?) {
            if (resumed.compareAndSet(false, true)) {
                continuation.resumeWith(Result.success(result))
            }
        }

        val ws = okHttpClient.newWebSocket(request, object : WebSocketListener() {
            override fun onOpen(webSocket: WebSocket, response: Response) {
                Timber.d("$TAG: WebSocket 已连接")
                // 1. Send speech.config
                val configMsg = buildConfigMessage()
                webSocket.send(configMsg)
                // 2. Send the SSML synthesis request
                val ssmlMsg = buildSsmlMessage(requestId, text, voice)
                webSocket.send(ssmlMsg)
            }

            override fun onMessage(webSocket: WebSocket, text: String) {
                // Text frame: only the end-of-turn marker matters here
                if (text.contains("Path:turn.end")) {
                    Timber.d("$TAG: 合成完成,音频大小 ${audioBuffer.size()} 字节")
                    webSocket.close(1000, "done")
                    finish(audioBuffer.toByteArray())
                }
            }

            override fun onMessage(webSocket: WebSocket, bytes: ByteString) {
                // Binary frame: append the audio payload, if this is an audio frame
                val audioPayload = parseBinaryFrame(bytes.toByteArray())
                if (audioPayload != null) {
                    audioBuffer.write(audioPayload)
                }
            }

            override fun onClosing(webSocket: WebSocket, code: Int, reason: String) {
                // The server started closing. Close our side too, otherwise the socket stays
                // half-open and onClosed is never delivered. Reaching this point before
                // turn.end means the audio is incomplete, so report failure.
                webSocket.close(1000, null)
                finish(null)
            }

            override fun onFailure(webSocket: WebSocket, t: Throwable, response: Response?) {
                Timber.e(t, "$TAG: WebSocket 连接失败")
                // Try to correct the clock offset from the error response's Date header
                response?.header("Date")?.let { adjustClockSkew(it) }
                finish(null)
            }

            override fun onClosed(webSocket: WebSocket, code: Int, reason: String) {
                // No-op after a normal turn.end; otherwise the stream ended early.
                finish(null)
            }
        })
        continuation.invokeOnCancellation {
            ws.cancel()
        }
    }
}
/**
 * Builds the speech.config text message: three header lines, a blank line, then the JSON
 * body that disables boundary metadata and selects [OUTPUT_FORMAT].
 */
private fun buildConfigMessage(): String {
    val configJson =
        """{"context":{"synthesis":{"audio":{"metadataoptions":{"sentenceBoundaryEnabled":"false","wordBoundaryEnabled":"false"},"outputFormat":"$OUTPUT_FORMAT"}}}}"""
    val headerLines = listOf(
        "X-Timestamp:${formatTimestamp()}",
        "Content-Type:application/json; charset=utf-8",
        "Path:speech.config"
    )
    // Every header line ends with CRLF; an extra CRLF separates headers from the body.
    return headerLines.joinToString(separator = "") { line -> "$line\r\n" } + "\r\n" + configJson
}
/**
 * Builds the SSML synthesis request: header lines, a blank line, then the SSML document.
 *
 * Both [text] and [voice] are XML-escaped before being embedded. Control characters that
 * XML 1.0 does not allow (everything below U+0020 except tab, LF and CR) are replaced with
 * a space so they cannot produce a malformed document.
 *
 * @param requestId value of the X-RequestId header
 * @param text text to be spoken
 * @param voice voice name written into the voice element's name attribute
 */
private fun buildSsmlMessage(requestId: String, text: String, voice: String): String {
    // Escapes the five XML special characters; '&' must be handled first.
    fun escapeXml(raw: String): String = raw
        .replace("&", "&amp;")
        .replace("<", "&lt;")
        .replace(">", "&gt;")
        .replace("\"", "&quot;")
        .replace("'", "&apos;")

    val speakable = text
        .map { ch -> if (ch < ' ' && ch != '\t' && ch != '\n' && ch != '\r') ' ' else ch }
        .joinToString("")
    val escapedText = escapeXml(speakable)
    val escapedVoice = escapeXml(voice)
    val timestamp = formatTimestamp()
    // NOTE(review): the trailing "Z" after the timestamp is kept exactly as in the original
    // request format (the speech.config message has none) — presumably it mirrors what the
    // Edge client sends; confirm before "fixing" it.
    return "X-RequestId:$requestId\r\n" +
        "Content-Type:application/ssml+xml\r\n" +
        "X-Timestamp:${timestamp}Z\r\n" +
        "Path:ssml\r\n" +
        "\r\n" +
        "<speak version='1.0' xmlns='http://www.w3.org/2001/10/synthesis' xml:lang='zh-CN'>" +
        "<voice name='$escapedVoice'>" +
        "<prosody pitch='+0Hz' rate='+0%' volume='+0%'>$escapedText</prosody>" +
        "</voice></speak>"
}
/**
 * Extracts the audio payload from a binary WebSocket frame.
 *
 * Frame layout: a 2-byte big-endian header length, the header text, then the audio bytes.
 *
 * @return the bytes following the header when the header contains "Path:audio"; null when
 *         the frame is too short, carries no payload, or is not an audio frame
 */
private fun parseBinaryFrame(data: ByteArray): ByteArray? {
    if (data.size < 2) return null
    val highByte = data[0].toInt() and 0xFF
    val lowByte = data[1].toInt() and 0xFF
    val headerLength = (highByte shl 8) or lowByte
    val payloadOffset = headerLength + 2
    // Nothing after the header means there is no audio to return.
    if (payloadOffset >= data.size) return null
    val header = String(data, 2, headerLength, Charsets.US_ASCII)
    return if (header.contains("Path:audio")) {
        data.copyOfRange(payloadOffset, data.size)
    } else {
        null
    }
}
/**
 * Generates the Sec-MS-GEC token: an upper-case hex SHA-256 of the current time followed by
 * [TRUSTED_TOKEN].
 *
 * The time is the local clock plus [clockSkewSeconds], shifted by [WINDOWS_EPOCH_OFFSET],
 * rounded down to a 5-minute boundary and expressed in 100-nanosecond units.
 */
private fun generateGecToken(): String {
    val unixSeconds = (System.currentTimeMillis() / 1000.0) + clockSkewSeconds
    val windowsSeconds = unixSeconds + WINDOWS_EPOCH_OFFSET
    // Round down to a multiple of 300 seconds (5 minutes).
    val alignedSeconds = windowsSeconds - windowsSeconds % 300
    // Convert seconds to 100-nanosecond intervals.
    val fileTimeTicks = (alignedSeconds * 10_000_000).toLong()
    val hashInput = "$fileTimeTicks$TRUSTED_TOKEN"
    val sha256 = MessageDigest.getInstance("SHA-256")
    val hashBytes = sha256.digest(hashInput.toByteArray(Charsets.US_ASCII))
    return hashBytes.joinToString(separator = "") { byte -> "%02X".format(byte) }
}
/**
 * Formats the current UTC time for the X-Timestamp header, e.g.
 * "Fri May 08 2026 03:03:59 GMT+0000 (Coordinated Universal Time)".
 */
private fun formatTimestamp(): String =
    SimpleDateFormat("EEE MMM dd yyyy HH:mm:ss 'GMT+0000 (Coordinated Universal Time)'", Locale.US)
        .apply { timeZone = TimeZone.getTimeZone("UTC") }
        .format(Date())
/**
 * Updates [clockSkewSeconds] from a server "Date" response header.
 *
 * The offset is the server time minus the local time, in whole seconds. A header that
 * cannot be parsed leaves the current offset untouched and is logged rather than ignored.
 *
 * @param serverDateHeader header value in the form "EEE, dd MMM yyyy HH:mm:ss z"
 */
private fun adjustClockSkew(serverDateHeader: String) {
    try {
        val sdf = SimpleDateFormat("EEE, dd MMM yyyy HH:mm:ss z", Locale.US)
        val serverTime = sdf.parse(serverDateHeader)?.time ?: return
        clockSkewSeconds = (serverTime - System.currentTimeMillis()) / 1000
        Timber.d("$TAG: 时钟偏移修正为 ${clockSkewSeconds}s")
    } catch (e: Exception) {
        // Best effort: keep the previous offset, but leave a trace of why it was not updated.
        Timber.w(e, "$TAG: 无法解析服务器时间 $serverDateHeader")
    }
}
/**
 * Plays [file] with a fresh MediaPlayer on the main thread.
 *
 * Any current playback is stopped first. The player is released when playback completes,
 * when it reports an error, or when preparing it throws, so no player instance is leaked.
 * A file that cannot be prepared is deleted so that a damaged cache entry is synthesized
 * again on the next request instead of failing forever.
 */
private suspend fun playAudio(file: File) {
    withContext(Dispatchers.Main) {
        stop()
        val player = MediaPlayer()
        try {
            player.setDataSource(file.absolutePath)
            player.setOnCompletionListener { finished ->
                isPlaying = false
                // Drop our reference only if a newer player has not replaced this one.
                if (mediaPlayer === finished) mediaPlayer = null
                finished.release()
                onComplete?.invoke()
                Timber.d("$TAG: 播放完成")
            }
            player.setOnErrorListener { failed, what, extra ->
                Timber.e("$TAG: 播放错误 what=$what extra=$extra")
                isPlaying = false
                if (mediaPlayer === failed) mediaPlayer = null
                failed.release()
                true
            }
            player.prepare()
            player.start()
            mediaPlayer = player
            isPlaying = true
            Timber.d("$TAG: 开始播放")
        } catch (e: Exception) {
            Timber.e(e, "$TAG: 播放异常")
            isPlaying = false
            // The player never became the current one; release it here or it leaks.
            try {
                player.release()
            } catch (releaseError: Exception) {
                Timber.w(releaseError, "$TAG: 释放播放器异常")
            }
            // Remove the unplayable file so it is not served as a cache hit again.
            if (file.exists() && !file.delete()) {
                Timber.w("$TAG: 无法删除损坏的缓存 ${file.name}")
            }
        }
    }
}
// ===== 缓存管理 =====
/**
 * Returns the cache file for a text/voice pair, creating the cache directory if needed.
 * The file name is the lower-case hex MD5 of "voice:text" with an ".mp3" extension.
 */
private fun getCacheFile(text: String, voice: String): File {
    val directory = File(context.cacheDir, CACHE_DIR)
    directory.mkdirs()
    val md5 = MessageDigest.getInstance("MD5")
    val hashBytes = md5.digest("$voice:$text".toByteArray())
    val hexName = hashBytes.joinToString(separator = "") { byte -> "%02x".format(byte) }
    return File(directory, "$hexName.mp3")
}
/**
 * Writes [data] to [file] and trims the cache directory to [MAX_CACHE_SIZE] entries,
 * deleting the least recently modified files first.
 *
 * The bytes are written to a temporary sibling file and then renamed, so a write that is
 * interrupted half-way never leaves a truncated file under the final cache name.
 */
private fun saveToCacheDir(file: File, data: ByteArray) {
    val cacheDir = file.parentFile
    val tempFile = File(cacheDir, "${file.name}.tmp")
    try {
        tempFile.writeBytes(data)
        if (!tempFile.renameTo(file)) {
            // Rename is not guaranteed on every file system; fall back to a direct write.
            file.writeBytes(data)
        }
    } finally {
        // Gone already after a successful rename; otherwise remove the leftover.
        if (tempFile.exists()) tempFile.delete()
    }
    // Evict the oldest entries beyond the limit
    if (cacheDir == null) return
    val cachedFiles = cacheDir.listFiles()
        ?.filter { it.name.endsWith(".mp3") }
        ?.sortedBy { it.lastModified() }
        ?: return
    if (cachedFiles.size > MAX_CACHE_SIZE) {
        cachedFiles.take(cachedFiles.size - MAX_CACHE_SIZE).forEach { it.delete() }
    }
}
/** Deletes the whole TTS cache directory, including every cached MP3 file. */
fun clearCache() {
    val cacheDirectory = File(context.cacheDir, CACHE_DIR)
    cacheDirectory.deleteRecursively()
    Timber.d("$TAG: 缓存已清除")
}
}

View File

@@ -28,7 +28,7 @@ import com.xiaoqu.watch.ui.punch.PunchResult
import com.xiaoqu.watch.ui.punch.PunchViewModel
import com.xiaoqu.watch.ui.widget.StatusBarView
import com.xiaoqu.watch.util.DateUtil
import android.speech.tts.TextToSpeech
import com.xiaoqu.watch.service.manager.EdgeTtsManager
import dagger.hilt.android.AndroidEntryPoint
import kotlinx.coroutines.delay
import kotlinx.coroutines.isActive
@@ -52,6 +52,7 @@ class HomeFragment : BaseFragment<FragmentHomeBinding>() {
@Inject lateinit var bluetoothScanManager: com.xiaoqu.watch.service.manager.BluetoothScanManager
@Inject lateinit var notificationManager: com.xiaoqu.watch.service.manager.NotificationManager
@Inject lateinit var vibrationConfigManager: com.xiaoqu.watch.device.sensor.VibrationConfigManager
@Inject lateinit var edgeTtsManager: EdgeTtsManager
/** 考勤打卡 ViewModel */
private val punchViewModel: PunchViewModel by viewModels()
@@ -85,7 +86,6 @@ class HomeFragment : BaseFragment<FragmentHomeBinding>() {
private var lastTapTime = 0L
// ===== TTS 语音测试 =====
private var tts: TextToSpeech? = null
override fun createBinding(inflater: LayoutInflater, container: ViewGroup?): FragmentHomeBinding {
return FragmentHomeBinding.inflate(inflater, container, false)
@@ -183,9 +183,8 @@ class HomeFragment : BaseFragment<FragmentHomeBinding>() {
it.onBackKeyPressed = null
it.notificationBanner.onClick = null
}
// 释放 TTS 资源
tts?.shutdown()
tts = null
// 停止 TTS 播放
edgeTtsManager.stop()
}
// ===== 打卡面板 =====
@@ -561,43 +560,18 @@ class HomeFragment : BaseFragment<FragmentHomeBinding>() {
}
/**
* TTS 语音测试:验证设备是否支持中文语音合成
* 测试内容:初始化 TTS → 设置中文 → 播放测试语音
* Edge TTS 语音测试:通过微软 Edge TTS 合成中文语音
* 测试内容:调用 Edge TTS API → 接收 MP3 → 播放
* 结果通过 Logcat 和 Toast 反馈
*/
private fun testTts() {
tts?.shutdown()
tts = TextToSpeech(requireContext()) { status ->
if (status == TextToSpeech.SUCCESS) {
val result = tts?.setLanguage(java.util.Locale.CHINESE)
when {
result == TextToSpeech.LANG_MISSING_DATA -> {
Timber.w("TTS: 中文语音包缺失")
activity?.runOnUiThread {
Toast.makeText(requireContext(), "TTS: 中文语音包缺失", Toast.LENGTH_LONG).show()
}
}
result == TextToSpeech.LANG_NOT_SUPPORTED -> {
Timber.w("TTS: 不支持中文")
activity?.runOnUiThread {
Toast.makeText(requireContext(), "TTS: 不支持中文", Toast.LENGTH_LONG).show()
}
}
else -> {
Timber.d("TTS: 中文语音可用,开始播放测试")
activity?.runOnUiThread {
Toast.makeText(requireContext(), "TTS 测试播放中...", Toast.LENGTH_SHORT).show()
}
tts?.speak("您有3条新任务待处理", TextToSpeech.QUEUE_FLUSH, null, "tts_test")
}
}
} else {
Timber.e("TTS: 初始化失败, status=$status")
activity?.runOnUiThread {
Toast.makeText(requireContext(), "TTS: 初始化失败", Toast.LENGTH_LONG).show()
}
Toast.makeText(requireContext(), "Edge TTS 测试中...", Toast.LENGTH_SHORT).show()
edgeTtsManager.speak(
text = "您有3条新任务待处理请及时查看",
onError = { msg ->
Toast.makeText(requireContext(), "TTS 失败: $msg", Toast.LENGTH_LONG).show()
}
}
)
}
// ===== 事件监听 =====