CosyVoice
重要:cosyvoice-v3.5-plus 和 cosyvoice-v3.5-flash 模型目前僅在北京地區可用,且專門用於聲音設計和聲音複刻情境(無系統音色)。在使用它們進行語音合成之前,請先參見CosyVoice聲音複刻/設計API建立目標音色。建立完成後,只需將代碼中的 voice 欄位更新為您的音色 ID,並將 model 欄位指定為對應模型,即可正常運行。
使用系統音色進行語音合成以下樣本示範如何使用系統音色(參見音色列表)進行語音合成。 將合成音頻儲存為檔案Python# coding=utf-8
import os
import dashscope
from dashscope.audio.tts_v2 import *
# API keys differ between the Singapore and Beijing regions.
# Obtain an API key: https://www.alibabacloud.com/help/zh/model-studio/get-api-key
# If the environment variable is not configured, replace the next line with your
# Model Studio API key: dashscope.api_key = "sk-xxx"
dashscope.api_key = os.environ.get('DASHSCOPE_API_KEY')
# Singapore-region URL below; for models in the Beijing region use:
# wss://dashscope.aliyuncs.com/api-ws/v1/inference
dashscope.base_websocket_api_url='wss://dashscope-intl.aliyuncs.com/api-ws/v1/inference'
# Model selection.
# Each model version requires voices of the matching version:
# cosyvoice-v3-flash / cosyvoice-v3-plus: use voices such as "longanyang".
# cosyvoice-v2: use voices such as "longxiaochun_v2".
# Voices support different languages; when synthesizing Japanese, Korean, or other
# non-Chinese text, choose a voice that supports that language. See the CosyVoice voice list.
model = "cosyvoice-v3-flash"
# Voice (timbre) ID
voice = "longanyang"
# Instantiate SpeechSynthesizer, passing the model and voice request parameters
synthesizer = SpeechSynthesizer(model=model, voice=voice)
# Send the text to synthesize and receive the binary audio
audio = synthesizer.call("今天天氣怎麼樣?")
# The first text sent establishes the WebSocket connection, so the first-packet
# delay includes the connection-setup time
print('[Metric] requestId為:{},首包延遲為:{}毫秒'.format(
    synthesizer.get_last_request_id(),
    synthesizer.get_first_package_delay()))
# Save the audio to a local file
with open('output.mp3', 'wb') as f:
    f.write(audio)
Javaimport com.alibaba.dashscope.audio.ttsv2.SpeechSynthesisParam;
import com.alibaba.dashscope.audio.ttsv2.SpeechSynthesizer;
import com.alibaba.dashscope.utils.Constants;
import java.io.File;
import java.io.FileOutputStream;
import java.io.IOException;
import java.nio.ByteBuffer;
public class Main {
    // Model selection.
    // Each model version requires voices of the matching version:
    // cosyvoice-v3-flash / cosyvoice-v3-plus: use voices such as "longanyang".
    // cosyvoice-v2: use voices such as "longxiaochun_v2".
    // Voices support different languages; when synthesizing Japanese, Korean, or other
    // non-Chinese text, choose a voice that supports that language. See the CosyVoice voice list.
    private static String model = "cosyvoice-v3-flash";
    // Voice (timbre) ID
    private static String voice = "longanyang";

    /**
     * Synthesizes one sentence synchronously and writes the returned audio
     * to the local file "output.mp3".
     */
    public static void streamAudioDataToSpeaker() {
        // Request parameters
        SpeechSynthesisParam param =
            SpeechSynthesisParam.builder()
                // API keys differ between the Singapore and Beijing regions.
                // Obtain one: https://www.alibabacloud.com/help/zh/model-studio/get-api-key
                // If no environment variable is configured, replace the next line with: .apiKey("sk-xxx")
                .apiKey(System.getenv("DASHSCOPE_API_KEY"))
                .model(model) // model
                .voice(voice) // voice
                .build();
        // Synchronous mode: callbacks disabled (second argument is null)
        SpeechSynthesizer synthesizer = new SpeechSynthesizer(param, null);
        ByteBuffer audio = null;
        try {
            // Block until the audio is returned
            audio = synthesizer.call("今天天氣怎麼樣?");
        } catch (Exception e) {
            throw new RuntimeException(e);
        } finally {
            // Close the WebSocket connection when the task finishes
            synthesizer.getDuplexApi().close(1000, "bye");
        }
        if (audio != null) {
            // Save the audio data to the local file "output.mp3"
            File file = new File("output.mp3");
            // The first text sent establishes the WebSocket connection, so the
            // first-packet delay includes the connection-setup time
            System.out.println(
                "[Metric] requestId為:"
                    + synthesizer.getLastRequestId()
                    + "首包延遲(毫秒)為:"
                    + synthesizer.getFirstPackageDelay());
            try (FileOutputStream fos = new FileOutputStream(file)) {
                fos.write(audio.array());
            } catch (IOException e) {
                throw new RuntimeException(e);
            }
        }
    }

    public static void main(String[] args) {
        // Singapore-region URL below; for models in the Beijing region use:
        // wss://dashscope.aliyuncs.com/api-ws/v1/inference
        Constants.baseWebsocketApiUrl = "wss://dashscope-intl.aliyuncs.com/api-ws/v1/inference";
        streamAudioDataToSpeaker();
        System.exit(0);
    }
}
將LLM產生的文本即時轉成語音並通過擴音器播放以下代碼展示通過本地裝置播放千問大語言模型(qwen-turbo)即時返回的常值內容。 Python運行Python樣本前,需要通過pip安裝第三方音頻播放庫。 # coding=utf-8
# Installation instructions for pyaudio:
# APPLE Mac OS X
# brew install portaudio
# pip install pyaudio
# Debian/Ubuntu
# sudo apt-get install python-pyaudio python3-pyaudio
# or
# pip install pyaudio
# CentOS
# sudo yum install -y portaudio portaudio-devel && pip install pyaudio
# Microsoft Windows
# python -m pip install pyaudio
import os
import pyaudio
import dashscope
from dashscope.audio.tts_v2 import *
from http import HTTPStatus
from dashscope import Generation
# API keys differ between the Singapore and Beijing regions.
# Obtain an API key: https://www.alibabacloud.com/help/zh/model-studio/get-api-key
# If the environment variable is not configured, replace the next line with:
# dashscope.api_key = "sk-xxx"
dashscope.api_key = os.environ.get('DASHSCOPE_API_KEY')
# Singapore-region URL below; for models in the Beijing region use:
# wss://dashscope.aliyuncs.com/api-ws/v1/inference
dashscope.base_websocket_api_url='wss://dashscope-intl.aliyuncs.com/api-ws/v1/inference'
# Each model version requires voices of the matching version:
# cosyvoice-v3-flash / cosyvoice-v3-plus: use voices such as "longanyang".
# cosyvoice-v2: use voices such as "longxiaochun_v2".
# Voices support different languages; when synthesizing Japanese, Korean, or other
# non-Chinese text, choose a voice that supports that language. See the CosyVoice voice list.
model = "cosyvoice-v3-flash"
voice = "longanyang"
class Callback(ResultCallback):
    """Plays synthesized PCM audio through the local speaker as it streams in."""

    _pa = None
    _out = None

    def on_open(self):
        # WebSocket is up: open an output stream matching the requested
        # PCM_22050HZ_MONO_16BIT audio format.
        print("websocket is open.")
        self._pa = pyaudio.PyAudio()
        self._out = self._pa.open(
            format=pyaudio.paInt16,
            channels=1,
            rate=22050,
            output=True,
        )

    def on_complete(self):
        print("speech synthesis task complete successfully.")

    def on_error(self, message: str):
        print(f"speech synthesis task failed, {message}")

    def on_close(self):
        # Connection closed: drain and release the audio device.
        print("websocket is closed.")
        self._out.stop_stream()
        self._out.close()
        self._pa.terminate()

    def on_event(self, message):
        print(f"recv speech synthsis message {message}")

    def on_data(self, data: bytes) -> None:
        # Each audio chunk is written straight to the speaker stream.
        print("audio result length:", len(data))
        self._out.write(data)
def synthesizer_with_llm():
    # Stream qwen-turbo's incremental text output into CosyVoice and play the
    # synthesized audio through the local speaker in real time.
    callback = Callback()
    synthesizer = SpeechSynthesizer(
        model=model,
        voice=voice,
        format=AudioFormat.PCM_22050HZ_MONO_16BIT,
        callback=callback,
    )
    messages = [{"role": "user", "content": "請介紹一下你自己"}]
    responses = Generation.call(
        model="qwen-turbo",
        messages=messages,
        result_format="message",  # set result format as 'message'
        stream=True,  # enable stream output
        incremental_output=True,  # enable incremental output
    )
    for response in responses:
        if response.status_code == HTTPStatus.OK:
            print(response.output.choices[0]["message"]["content"], end="")
            # Feed each incremental text chunk to the synthesizer as it arrives
            synthesizer.streaming_call(response.output.choices[0]["message"]["content"])
        else:
            print(
                "Request id: %s, Status code: %s, error code: %s, error message: %s"
                % (
                    response.request_id,
                    response.status_code,
                    response.code,
                    response.message,
                )
            )
    # Signal that no more text will be sent; waits for synthesis to finish
    synthesizer.streaming_complete()
    print('requestId: ', synthesizer.get_last_request_id())
if __name__ == "__main__":
    synthesizer_with_llm()
使用聲音複刻音色進行語音合成聲音複刻與語音合成是緊密關聯的兩個獨立步驟,遵循“先建立,後使用”的流程: 準備錄音檔案 將符合聲音複刻:輸入音頻格式的音頻檔案上傳至公網可訪問的位置,如阿里雲Object Storage Service,並確保URL可公開訪問。 建立音色 調用建立音色介面。此步驟必須指定target_model/targetModel,聲明建立的音色將由哪個語音合成模型驅動。 若已有建立好的音色(調用查詢音色列表介面查看),可跳過這一步直接進行下一步。 使用音色進行語音合成 使用建立音色介面建立音色成功後,系統會返回一個voice_id/voiceID: 該 voice_id/voiceID 可直接作為語音合成介面或各語言 SDK 中的 voice 參數使用,用於後續的文本轉語音。 支援多種調用形態,包括非流式、單向流式以及雙向流式合成。 合成時指定的語音合成模型必須與建立音色時的 target_model/targetModel 保持一致,否則合成會失敗。
範例程式碼: import os
import time
import dashscope
from dashscope.audio.tts_v2 import VoiceEnrollmentService, SpeechSynthesizer
# 1. Environment setup
# Configuring the API key via an environment variable is recommended.
# API keys differ between the Singapore and Beijing regions.
# Obtain an API key: https://www.alibabacloud.com/help/zh/model-studio/get-api-key
# If the environment variable is not configured, replace the next line with:
# dashscope.api_key = "sk-xxx"
dashscope.api_key = os.getenv("DASHSCOPE_API_KEY")
if not dashscope.api_key:
    raise ValueError("DASHSCOPE_API_KEY environment variable not set.")
# Singapore-region WebSocket URL below; for models in the Beijing region use:
# wss://dashscope.aliyuncs.com/api-ws/v1/inference
dashscope.base_websocket_api_url='wss://dashscope-intl.aliyuncs.com/api-ws/v1/inference'
# Singapore-region HTTP URL below; for models in the Beijing region use:
# https://dashscope.aliyuncs.com/api/v1
dashscope.base_http_api_url = 'https://dashscope-intl.aliyuncs.com/api/v1'
# 2. Voice-cloning parameters
# The model that will drive the cloned voice; later synthesis must use the same model.
TARGET_MODEL = "cosyvoice-v3.5-plus"
# A meaningful prefix for the voice ID
VOICE_PREFIX = "myvoice"  # digits and lowercase letters only, fewer than ten characters
# Publicly accessible audio URL used as the cloning sample
AUDIO_URL = "https://dashscope.oss-cn-beijing.aliyuncs.com/samples/audio/cosyvoice/cosyvoice-zeroshot-sample.wav"  # sample URL; replace with your own
# 3. Create the voice (asynchronous task)
print("--- Step 1: Creating voice enrollment ---")
service = VoiceEnrollmentService()
try:
    voice_id = service.create_voice(
        target_model=TARGET_MODEL,
        prefix=VOICE_PREFIX,
        url=AUDIO_URL
    )
    print(f"Voice enrollment submitted successfully. Request ID: {service.get_last_request_id()}")
    print(f"Generated Voice ID: {voice_id}")
except Exception as e:
    print(f"Error during voice creation: {e}")
    raise e
# 4. Poll the voice status until it is ready for synthesis.
print("\n--- Step 2: Polling for voice status ---")
max_attempts = 30
poll_interval = 10  # seconds
for attempt in range(max_attempts):
    try:
        voice_info = service.query_voice(voice_id=voice_id)
        status = voice_info.get("status")
        print(f"Attempt {attempt + 1}/{max_attempts}: Voice status is '{status}'")
        if status == "OK":
            print("Voice is ready for synthesis.")
            break
        elif status == "UNDEPLOYED":
            # Terminal failure state: abort immediately instead of polling further.
            print(f"Voice processing failed with status: {status}. Please check audio quality or contact support.")
            raise RuntimeError(f"Voice processing failed with status: {status}")
        # Intermediate states such as "DEPLOYING": keep waiting.
        time.sleep(poll_interval)
    except RuntimeError:
        # BUG FIX: the broad handler below previously swallowed the deliberate
        # abort above and kept polling until timeout; re-raise so a terminal
        # failure actually stops the script.
        raise
    except Exception as e:
        # Transient query errors: log and retry after the poll interval.
        print(f"Error during status polling: {e}")
        time.sleep(poll_interval)
else:
    # Loop exhausted without a break: the voice never became ready.
    print("Polling timed out. The voice is not ready after several attempts.")
    raise RuntimeError("Polling timed out. The voice is not ready after several attempts.")
# 5. Synthesize speech with the cloned voice
print("\n--- Step 3: Synthesizing speech with the new voice ---")
try:
    # The model here must match the target_model used when the voice was created
    synthesizer = SpeechSynthesizer(model=TARGET_MODEL, voice=voice_id)
    text_to_synthesize = "恭喜,已成功複刻併合成了屬於自己的聲音!"
    # call() returns the binary audio data
    audio_data = synthesizer.call(text_to_synthesize)
    print(f"Speech synthesis successful. Request ID: {synthesizer.get_last_request_id()}")
    # 6. Save the audio file
    output_file = "my_custom_voice_output.mp3"
    with open(output_file, "wb") as f:
        f.write(audio_data)
    print(f"Audio saved to {output_file}")
except Exception as e:
    print(f"Error during speech synthesis: {e}")
使用聲音設計音色進行語音合成聲音設計與語音合成是緊密關聯的兩個獨立步驟,遵循“先建立,後使用”的流程: 準備聲音設計所需的聲音描述與試聽文本。 調用建立音色介面,建立一個專屬音色,擷取音色名和預覽音頻。 此步驟必須指定target_model,聲明建立的音色將由哪個語音合成模型驅動 試聽擷取預覽音頻來判斷是否符合預期;若符合要求,繼續下一步,否則,重新設計。 若已有建立好的音色(調用查詢音色列表介面查看),可跳過這一步直接進行下一步。 使用音色進行語音合成 使用建立音色介面建立音色成功後,系統會返回一個voice_id/voiceID: 該 voice_id/voiceID 可直接作為語音合成介面或各語言 SDK 中的 voice 參數使用,用於後續的文本轉語音。 支援多種調用形態,包括非流式、單向流式以及雙向流式合成。 合成時指定的語音合成模型必須與建立音色時的 target_model/targetModel 保持一致,否則合成會失敗。
範例程式碼: 產生專屬音色並試聽效果,若對效果滿意,進行下一步;否則重建。 Pythonimport requests
import base64
import os
def create_voice_and_play():
    """Create a designed voice via the DashScope HTTP customization API and
    save the returned preview audio to a local WAV file.

    Returns:
        (voice_id, audio_bytes, filename) on success; (None, None, None) on any failure.
    """
    # API keys differ between the Singapore and Beijing regions.
    # Obtain an API key: https://www.alibabacloud.com/help/zh/model-studio/get-api-key
    # If the environment variable is not configured, replace with: api_key = "sk-xxx"
    api_key = os.getenv("DASHSCOPE_API_KEY")
    if not api_key:
        print("錯誤: 未找到DASHSCOPE_API_KEY環境變數,請先設定API Key")
        return None, None, None
    # Prepare the request headers and body
    headers = {
        "Authorization": f"Bearer {api_key}",
        "Content-Type": "application/json"
    }
    data = {
        "model": "voice-enrollment",
        "input": {
            "action": "create_voice",
            "target_model": "cosyvoice-v3.5-plus",
            "voice_prompt": "A composed middle-aged male announcer with a deep, rich and magnetic voice, a steady speaking speed and clear articulation, is suitable for news broadcasting or documentary commentary.",
            "preview_text": "Dear listeners, hello everyone. Welcome to the evening news.",
            "prefix": "announcer"
        },
        "parameters": {
            "sample_rate": 24000,
            "response_format": "wav"
        }
    }
    # Singapore-region URL below; for models in the Beijing region use:
    # https://dashscope.aliyuncs.com/api/v1/services/audio/tts/customization
    url = "https://dashscope-intl.aliyuncs.com/api/v1/services/audio/tts/customization"
    try:
        # Send the request
        response = requests.post(
            url,
            headers=headers,
            json=data,
            timeout=60  # request timeout in seconds
        )
        if response.status_code == 200:
            result = response.json()
            # Extract the voice ID
            voice_id = result["output"]["voice_id"]
            print(f"音色ID: {voice_id}")
            # Extract the Base64-encoded preview audio
            base64_audio = result["output"]["preview_audio"]["data"]
            # Decode the Base64 audio data
            audio_bytes = base64.b64decode(base64_audio)
            # Save the audio to a local file
            filename = f"{voice_id}_preview.wav"
            with open(filename, 'wb') as f:
                f.write(audio_bytes)
            # BUG FIX: the message previously printed a literal placeholder
            # instead of interpolating the actual file name.
            print(f"音頻已儲存到本地檔案: {filename}")
            print(f"檔案路徑: {os.path.abspath(filename)}")
            return voice_id, audio_bytes, filename
        else:
            print(f"請求失敗,狀態代碼: {response.status_code}")
            print(f"響應內容: {response.text}")
            return None, None, None
    except requests.exceptions.RequestException as e:
        print(f"網路請求發生錯誤: {e}")
        return None, None, None
    except KeyError as e:
        print(f"響應資料格式錯誤,缺少必要的欄位: {e}")
        print(f"響應內容: {response.text if 'response' in locals() else 'No response'}")
        return None, None, None
    except Exception as e:
        print(f"發生未知錯誤: {e}")
        return None, None, None
if __name__ == "__main__":
print("開始建立語音...")
voice_id, audio_data, saved_filename = create_voice_and_play()
if voice_id:
print(f"\n成功建立音色 '{voice_id}'")
print(f"音頻檔案已儲存: '{saved_filename}'")
print(f"檔案大小: {os.path.getsize(saved_filename)} 位元組")
else:
print("\n音色建立失敗")
Java需要匯入Gson依賴,若是使用Maven或者Gradle,添加依賴方式如下: Maven在pom.xml中添加如下內容: <!-- https://mvnrepository.com/artifact/com.google.code.gson/gson -->
<dependency>
<groupId>com.google.code.gson</groupId>
<artifactId>gson</artifactId>
<version>2.13.1</version>
</dependency>
Gradle在build.gradle中添加如下內容: // https://mvnrepository.com/artifact/com.google.code.gson/gson
implementation("com.google.code.gson:gson:2.13.1")
import com.google.gson.JsonObject;
import com.google.gson.JsonParser;
import java.io.*;
import java.net.HttpURLConnection;
import java.net.URL;
import java.util.Base64;
public class Main {
    public static void main(String[] args) {
        Main example = new Main();
        example.createVoice();
    }

    /**
     * Creates a designed voice through the DashScope voice-customization HTTP API
     * and saves the Base64-encoded preview audio it returns to a local WAV file.
     */
    public void createVoice() {
        // API keys differ between the Singapore and Beijing regions.
        // Obtain one: https://www.alibabacloud.com/help/zh/model-studio/get-api-key
        // If no environment variable is configured, replace the next line with:
        // String apiKey = "sk-xxx"
        String apiKey = System.getenv("DASHSCOPE_API_KEY");
        // Build the JSON request body as a string
        String jsonBody = "{\n" +
            " \"model\": \"voice-enrollment\",\n" +
            " \"input\": {\n" +
            " \"action\": \"create_voice\",\n" +
            " \"target_model\": \"cosyvoice-v3.5-plus\",\n" +
            " \"voice_prompt\": \"A composed middle-aged male announcer with a deep, rich and magnetic voice, a steady speaking speed and clear articulation, is suitable for news broadcasting or documentary commentary.\",\n" +
            " \"preview_text\": \"Dear listeners, hello everyone. Welcome to the evening news.\",\n" +
            " \"prefix\": \"announcer\"\n" +
            " },\n" +
            " \"parameters\": {\n" +
            " \"sample_rate\": 24000,\n" +
            " \"response_format\": \"wav\"\n" +
            " }\n" +
            "}";
        HttpURLConnection connection = null;
        try {
            // Singapore-region URL below; for models in the Beijing region use:
            // https://dashscope.aliyuncs.com/api/v1/services/audio/tts/customization
            URL url = new URL("https://dashscope-intl.aliyuncs.com/api/v1/services/audio/tts/customization");
            connection = (HttpURLConnection) url.openConnection();
            // Set the request method and headers
            connection.setRequestMethod("POST");
            connection.setRequestProperty("Authorization", "Bearer " + apiKey);
            connection.setRequestProperty("Content-Type", "application/json");
            connection.setDoOutput(true);
            connection.setDoInput(true);
            // Send the request body
            try (OutputStream os = connection.getOutputStream()) {
                byte[] input = jsonBody.getBytes("UTF-8");
                os.write(input, 0, input.length);
                os.flush();
            }
            // Read the response
            int responseCode = connection.getResponseCode();
            if (responseCode == HttpURLConnection.HTTP_OK) {
                // Read the response body
                StringBuilder response = new StringBuilder();
                try (BufferedReader br = new BufferedReader(
                        new InputStreamReader(connection.getInputStream(), "UTF-8"))) {
                    String responseLine;
                    while ((responseLine = br.readLine()) != null) {
                        response.append(responseLine.trim());
                    }
                }
                // Parse the JSON response
                JsonObject jsonResponse = JsonParser.parseString(response.toString()).getAsJsonObject();
                JsonObject outputObj = jsonResponse.getAsJsonObject("output");
                JsonObject previewAudioObj = outputObj.getAsJsonObject("preview_audio");
                // Extract the voice ID
                String voiceId = outputObj.get("voice_id").getAsString();
                System.out.println("音色ID: " + voiceId);
                // Extract the Base64-encoded audio data
                String base64Audio = previewAudioObj.get("data").getAsString();
                // Decode the Base64 audio data
                byte[] audioBytes = Base64.getDecoder().decode(base64Audio);
                // Save the audio to a local file
                String filename = voiceId + "_preview.wav";
                saveAudioToFile(audioBytes, filename);
                System.out.println("音頻已儲存到本地檔案: " + filename);
            } else {
                // Read the error response
                StringBuilder errorResponse = new StringBuilder();
                try (BufferedReader br = new BufferedReader(
                        new InputStreamReader(connection.getErrorStream(), "UTF-8"))) {
                    String responseLine;
                    while ((responseLine = br.readLine()) != null) {
                        errorResponse.append(responseLine.trim());
                    }
                }
                System.out.println("請求失敗,狀態代碼: " + responseCode);
                System.out.println("錯誤響應: " + errorResponse.toString());
            }
        } catch (Exception e) {
            System.err.println("請求發生錯誤: " + e.getMessage());
            e.printStackTrace();
        } finally {
            if (connection != null) {
                connection.disconnect();
            }
        }
    }

    /**
     * Writes the given audio bytes to a local file and prints its absolute path.
     */
    private void saveAudioToFile(byte[] audioBytes, String filename) {
        try {
            File file = new File(filename);
            try (FileOutputStream fos = new FileOutputStream(file)) {
                fos.write(audioBytes);
            }
            System.out.println("音頻已儲存到: " + file.getAbsolutePath());
        } catch (IOException e) {
            System.err.println("儲存音頻檔案時發生錯誤: " + e.getMessage());
            e.printStackTrace();
        }
    }
}
使用上一步產生的專屬音色進行語音合成。 這裡參考了非流式調用範例程式碼,將voice參數替換為聲音設計產生的專屬音色進行語音合成。 關鍵原則:聲音設計時使用的模型 (target_model) 必須與後續進行語音合成時使用的模型 (model) 保持一致,否則會導致合成失敗。 Python# coding=utf-8
import dashscope
from dashscope.audio.tts_v2 import *
import os
# API keys differ between the Singapore and Beijing regions.
# Obtain an API key: https://www.alibabacloud.com/help/zh/model-studio/get-api-key
# If the environment variable is not configured, replace the next line with:
# dashscope.api_key = "sk-xxx"
dashscope.api_key = os.environ.get('DASHSCOPE_API_KEY')
# Singapore-region URL below; for models in the Beijing region use:
# wss://dashscope.aliyuncs.com/api-ws/v1/inference
dashscope.base_websocket_api_url='wss://dashscope-intl.aliyuncs.com/api-ws/v1/inference'
# Voice design and speech synthesis must use the same model
model = "cosyvoice-v3.5-plus"
# Replace the voice parameter with the custom voice produced by voice design
voice = "your_voice"
# Instantiate SpeechSynthesizer, passing the model and voice request parameters
synthesizer = SpeechSynthesizer(model=model, voice=voice)
# Send the text to synthesize and receive the binary audio
audio = synthesizer.call("今天天氣怎麼樣?")
# The first text sent establishes the WebSocket connection, so the first-packet
# delay includes the connection-setup time
print('[Metric] requestId為:{},首包延遲為:{}毫秒'.format(
    synthesizer.get_last_request_id(),
    synthesizer.get_first_package_delay()))
# Save the audio to a local file
with open('output.mp3', 'wb') as f:
    f.write(audio)
Javaimport com.alibaba.dashscope.audio.ttsv2.SpeechSynthesisParam;
import com.alibaba.dashscope.audio.ttsv2.SpeechSynthesizer;
import com.alibaba.dashscope.utils.Constants;
import java.io.File;
import java.io.FileOutputStream;
import java.io.IOException;
import java.nio.ByteBuffer;
public class Main {
    // Voice design and speech synthesis must use the same model
    private static String model = "cosyvoice-v3.5-plus";
    // Replace the voice parameter with the custom voice produced by voice design
    private static String voice = "your_voice_id";

    /**
     * Synthesizes one sentence synchronously with the designed voice and writes
     * the returned audio to the local file "output.mp3".
     */
    public static void streamAudioDataToSpeaker() {
        // Request parameters
        SpeechSynthesisParam param =
            SpeechSynthesisParam.builder()
                // API keys differ between the Singapore and Beijing regions.
                // Obtain one: https://www.alibabacloud.com/help/zh/model-studio/get-api-key
                // If no environment variable is configured, replace the next line with: .apiKey("sk-xxx")
                .apiKey(System.getenv("DASHSCOPE_API_KEY"))
                .model(model) // model
                .voice(voice) // voice
                .build();
        // Synchronous mode: callbacks disabled (second argument is null)
        SpeechSynthesizer synthesizer = new SpeechSynthesizer(param, null);
        ByteBuffer audio = null;
        try {
            // Block until the audio is returned
            audio = synthesizer.call("今天天氣怎麼樣?");
        } catch (Exception e) {
            throw new RuntimeException(e);
        } finally {
            // Close the WebSocket connection when the task finishes
            synthesizer.getDuplexApi().close(1000, "bye");
        }
        if (audio != null) {
            // Save the audio data to the local file "output.mp3"
            File file = new File("output.mp3");
            // The first text sent establishes the WebSocket connection, so the
            // first-packet delay includes the connection-setup time
            System.out.println(
                "[Metric] requestId為:"
                    + synthesizer.getLastRequestId()
                    + "首包延遲(毫秒)為:"
                    + synthesizer.getFirstPackageDelay());
            try (FileOutputStream fos = new FileOutputStream(file)) {
                fos.write(audio.array());
            } catch (IOException e) {
                throw new RuntimeException(e);
            }
        }
    }

    public static void main(String[] args) {
        // Singapore-region URL below; for models in the Beijing region use:
        // wss://dashscope.aliyuncs.com/api-ws/v1/inference
        Constants.baseWebsocketApiUrl = "wss://dashscope-intl.aliyuncs.com/api-ws/v1/inference";
        streamAudioDataToSpeaker();
        System.exit(0);
    }
}
|