Request body

Non-streaming output

Python

The SpeechSynthesizer interface in the DashScope Python SDK is now unified under MultiModalConversation. Its usage and parameters remain fully consistent.

# Install the latest version of the DashScope SDK
import os
import dashscope
# This is the URL for the Singapore region. If using a model in the Beijing region, replace the URL with: https://dashscope.aliyuncs.com/api/v1
dashscope.base_http_api_url = 'https://dashscope-intl.aliyuncs.com/api/v1'
text = "Let me recommend a T-shirt to everyone. This one is really super nice. The color is very elegant, and it's also a perfect item to match. Everyone can buy it without hesitation. It's truly beautiful and very forgiving on the figure. No matter what body type you have, it will look great. I recommend everyone to place an order."
# Legacy interface (replaced by MultiModalConversation): dashscope.audio.qwen_tts.SpeechSynthesizer.call(...)
response = dashscope.MultiModalConversation.call(
    # To use the instruction control feature, replace the model with qwen3-tts-instruct-flash
    model="qwen3-tts-flash",
    # The API keys for the Singapore and Beijing regions are different. Get your API key: https://www.alibabacloud.com/help/en/model-studio/get-api-key
    # If the environment variable is not configured, replace the following line with your Model Studio API key: api_key="sk-xxx"
    api_key=os.getenv("DASHSCOPE_API_KEY"),
    text=text,
    voice="Cherry",
    # To use the instruction control feature, uncomment the following lines and replace the model with qwen3-tts-instruct-flash
    # instructions='Fast speech rate, with a clear rising intonation, suitable for introducing fashion products.',
    # optimize_instructions=True,
)
print(response)
Java

// Install the latest version of the DashScope SDK
import com.alibaba.dashscope.aigc.multimodalconversation.AudioParameters;
import com.alibaba.dashscope.aigc.multimodalconversation.MultiModalConversation;
import com.alibaba.dashscope.aigc.multimodalconversation.MultiModalConversationParam;
import com.alibaba.dashscope.aigc.multimodalconversation.MultiModalConversationResult;
import com.alibaba.dashscope.exception.ApiException;
import com.alibaba.dashscope.exception.NoApiKeyException;
import com.alibaba.dashscope.exception.UploadFileException;
import com.alibaba.dashscope.utils.JsonUtils;
import com.alibaba.dashscope.utils.Constants;
public class Main {
    // To use the instruction control feature, replace MODEL with qwen3-tts-instruct-flash
    private static final String MODEL = "qwen3-tts-flash";

    public static void call() throws ApiException, NoApiKeyException, UploadFileException {
        MultiModalConversation conv = new MultiModalConversation();
        MultiModalConversationParam param = MultiModalConversationParam.builder()
                .model(MODEL)
                // The API keys for the Singapore and Beijing regions are different. Get your API key: https://www.alibabacloud.com/help/en/model-studio/get-api-key
                // If the environment variable is not configured, replace the following line with your Model Studio API key: apiKey("sk-xxx")
                .apiKey(System.getenv("DASHSCOPE_API_KEY"))
                .text("Today is a wonderful day to build something people love!")
                .voice(AudioParameters.Voice.CHERRY)
                .languageType("English")
                // To use the instruction control feature, uncomment the following lines and replace the model with qwen3-tts-instruct-flash
                // .parameter("instructions", "Fast speech rate, with a clear rising intonation, suitable for introducing fashion products.")
                // .parameter("optimize_instructions", true)
                .build();
        MultiModalConversationResult result = conv.call(param);
        System.out.println(JsonUtils.toJson(result));
    }

    public static void main(String[] args) {
        // This is the URL for the Singapore region. If using a model in the Beijing region, replace the URL with: https://dashscope.aliyuncs.com/api/v1
        Constants.baseHttpApiUrl = "https://dashscope-intl.aliyuncs.com/api/v1";
        try {
            call();
        } catch (ApiException | NoApiKeyException | UploadFileException e) {
            System.out.println(e.getMessage());
        }
        System.exit(0);
    }
}
curl

# ======= IMPORTANT NOTE =======
# This is the URL for the Singapore region. If using a model in the Beijing region, replace the URL with: https://dashscope.aliyuncs.com/api/v1/services/aigc/multimodal-generation/generation
# The API keys for Singapore and Beijing regions are different. Get your API Key: https://www.alibabacloud.com/help/en/model-studio/get-api-key
# If the environment variable is not configured, replace $DASHSCOPE_API_KEY with your Model Studio API key: sk-xxx.
# === DELETE THIS COMMENT WHEN EXECUTING ===
curl -X POST 'https://dashscope-intl.aliyuncs.com/api/v1/services/aigc/multimodal-generation/generation' \
-H "Authorization: Bearer $DASHSCOPE_API_KEY" \
-H 'Content-Type: application/json' \
-d '{
    "model": "qwen3-tts-flash",
    "input": {
        "text": "Let me recommend a T-shirt to everyone. This one is really super nice. The color is very elegant, and it'\''s also a perfect item to match. Everyone can buy it without hesitation. It'\''s truly beautiful and very forgiving on the figure. No matter what body type you have, it will look great. I recommend everyone to place an order.",
        "voice": "Cherry",
        "language_type": "English"
    }
}'
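In non-streaming mode, the response carries a URL to the generated audio file rather than raw audio. Continuing from the Python example above, the following is a minimal sketch of downloading that file; the field path response.output.audio["url"] is an assumption, so verify it against the response the example prints:

import requests

# Hypothetical sketch: save the synthesized audio from a non-streaming response.
# The field path response.output.audio["url"] is an assumption; inspect the
# printed response if your SDK version structures it differently.
audio_url = response.output.audio["url"]
resp = requests.get(audio_url, timeout=30)
resp.raise_for_status()
with open("output.wav", "wb") as f:
    f.write(resp.content)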
Streaming output

Python

The SpeechSynthesizer interface in the DashScope Python SDK is now unified under MultiModalConversation. To use the new interface, replace only the interface name. All other parameters remain fully compatible.

# DashScope SDK version 1.24.5 or later
import os
import dashscope
# The following URL is for the Singapore region. If you use models in the Beijing region, replace the URL with: https://dashscope.aliyuncs.com/api/v1
dashscope.base_http_api_url = 'https://dashscope-intl.aliyuncs.com/api/v1'
text = "Let me recommend a T-shirt to you. This one is truly stunning. Its color highlights your elegance and makes it an ideal match for any outfit. You can buy it without hesitation - it looks great on everyone. It flatters all body types. Whether you're tall, short, slim, or curvy, this T-shirt suits you perfectly. We highly recommend ordering it."
# Legacy interface (replaced by MultiModalConversation): dashscope.audio.qwen_tts.SpeechSynthesizer.call(...)
response = dashscope.MultiModalConversation.call(
    # To use instruction control, set model to qwen3-tts-instruct-flash
    model="qwen3-tts-flash",
    # API keys differ between the Singapore and Beijing regions. Get your API key: https://www.alibabacloud.com/help/en/model-studio/get-api-key
    # If you have not set an environment variable, replace the next line with: api_key="sk-xxx"
    api_key=os.getenv("DASHSCOPE_API_KEY"),
    text=text,
    voice="Cherry",
    # To use instruction control, uncomment the lines below and set model to qwen3-tts-instruct-flash
    # instructions='Speak quickly with a clear rising intonation, suitable for promoting fashion items.',
    # optimize_instructions=True,
    stream=True
)
for chunk in response:
    print(chunk)
Java

// DashScope SDK version 2.19.0 or later
import com.alibaba.dashscope.aigc.multimodalconversation.AudioParameters;
import com.alibaba.dashscope.aigc.multimodalconversation.MultiModalConversation;
import com.alibaba.dashscope.aigc.multimodalconversation.MultiModalConversationParam;
import com.alibaba.dashscope.aigc.multimodalconversation.MultiModalConversationResult;
import com.alibaba.dashscope.exception.ApiException;
import com.alibaba.dashscope.exception.NoApiKeyException;
import com.alibaba.dashscope.exception.UploadFileException;
import com.alibaba.dashscope.utils.JsonUtils;
import io.reactivex.Flowable;
import com.alibaba.dashscope.utils.Constants;
public class Main {
    // To use instruction control, set MODEL to qwen3-tts-instruct-flash
    private static final String MODEL = "qwen3-tts-flash";

    public static void streamCall() throws ApiException, NoApiKeyException, UploadFileException {
        MultiModalConversation conv = new MultiModalConversation();
        MultiModalConversationParam param = MultiModalConversationParam.builder()
                .model(MODEL)
                // API keys differ between the Singapore and Beijing regions. Get your API key: https://www.alibabacloud.com/help/en/model-studio/get-api-key
                // If you have not set an environment variable, replace the next line with: .apiKey("sk-xxx")
                .apiKey(System.getenv("DASHSCOPE_API_KEY"))
                .text("Today is a wonderful day to build something people love!")
                .voice(AudioParameters.Voice.CHERRY)
                .languageType("English")
                // To use instruction control, uncomment the lines below and set model to qwen3-tts-instruct-flash
                // .parameter("instructions", "Speak quickly with a clear rising intonation, suitable for promoting fashion items.")
                // .parameter("optimize_instructions", true)
                .build();
        Flowable<MultiModalConversationResult> result = conv.streamCall(param);
        result.blockingForEach(r -> System.out.println(JsonUtils.toJson(r)));
    }

    public static void main(String[] args) {
        // The following URL is for the Singapore region. If you use models in the Beijing region, replace the URL with: https://dashscope.aliyuncs.com/api/v1
        Constants.baseHttpApiUrl = "https://dashscope-intl.aliyuncs.com/api/v1";
        try {
            streamCall();
        } catch (ApiException | NoApiKeyException | UploadFileException e) {
            System.out.println(e.getMessage());
        }
        System.exit(0);
    }
}
curl

# ======= Important notice =======
# The following URL is for the Singapore region. If you use models in the Beijing region, replace the URL with: https://dashscope.aliyuncs.com/api/v1/services/aigc/multimodal-generation/generation
# API keys differ between the Singapore and Beijing regions. Get your API key: https://www.alibabacloud.com/help/en/model-studio/get-api-key
# If you have not set an environment variable, replace $DASHSCOPE_API_KEY with your Model Studio API key: sk-xxx.
# === Remove this comment before running ===
curl -X POST 'https://dashscope-intl.aliyuncs.com/api/v1/services/aigc/multimodal-generation/generation' \
-H "Authorization: Bearer $DASHSCOPE_API_KEY" \
-H 'Content-Type: application/json' \
-H 'X-DashScope-SSE: enable' \
-d '{
    "model": "qwen3-tts-flash",
    "input": {
        "text": "Let me recommend a T-shirt to you. This one is truly stunning. Its color highlights your elegance and makes it an ideal match for any outfit. You can buy it without hesitation - it looks great on everyone. It flatters all body types. Whether you'\''re tall, short, slim, or curvy, this T-shirt suits you perfectly. We highly recommend ordering it.",
        "voice": "Cherry",
        "language_type": "English"
    }
}'
To play Base64-encoded audio in real time, see Speech synthesis - Qwen.
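As a rough illustration of handling the streamed output, the sketch below decodes the Base64 audio segments from the chunks yielded by the streaming Python example and reassembles them into a single file. The field path chunk.output.audio["data"] is an assumption; confirm it against a printed chunk from your SDK version:

import base64

# Hypothetical sketch: reassemble streamed Base64 audio segments into one file.
# Assumes each chunk exposes its segment at chunk.output.audio["data"]; empty
# segments (for example, a final chunk without audio data) are skipped.
audio_bytes = bytearray()
for chunk in response:
    segment = chunk.output.audio["data"]
    if segment:
        audio_bytes.extend(base64.b64decode(segment))
with open("output.pcm", "wb") as f:
    f.write(audio_bytes)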
model string (Required)
The model name. For details, see Supported models.
text string (Required)
The text to synthesize. Mixed-language input is supported. Qwen-TTS accepts a maximum input of 512 tokens; other models accept a maximum of 600 characters.
voice string (Required)
The voice to use. See Supported system voices.
language_type string (Optional)
The language of the synthesized audio. Default value: Auto.
Auto: Use this when the text language is uncertain or the text mixes multiple languages. The model automatically matches pronunciation for each language segment in the text but cannot guarantee perfectly accurate pronunciation.
A specific language: Use this when the text is in a single language. Specifying the exact language significantly improves synthesis quality and usually outperforms Auto. Supported values: Chinese, English, German, Italian, Portuguese, Spanish, Japanese, Korean, French, Russian. (For a usage sketch, see the example after this parameter list.)
instructions string (Optional)
Instructions that guide speech synthesis. See Real-time speech synthesis - Qwen. Default value: None; the parameter has no effect if not set.
Length limit: must not exceed 1600 tokens.
Supported languages: Chinese and English only.
Scope: applies only to the Qwen3-TTS-Instruct-Flash model series.
optimize_instructions boolean (Optional)
Optimizes the instructions to improve the naturalness and expressiveness of speech synthesis. Default value: false.
Behavior: When set to true, the system semantically enhances and rewrites the content of instructions to generate internal instructions better suited to speech synthesis.
Scenarios: Enable this when you need high-quality, fine-grained speech expression.
Dependency: Requires instructions to be set; if instructions is empty, this parameter has no effect. (See the sketch after this parameter list.)
Scope: applies only to the Qwen3-TTS-Instruct-Flash model series.
stream boolean (Optional)
Whether to stream the response. Default value: false.
false: The URL of the complete audio file is returned after the model finishes generating.
true: Base64-encoded audio data is output as it is generated. Read these segments one by one in real time to obtain the complete result. See Speech synthesis - Qwen.
This parameter is supported only by the Python SDK. For streaming output with the Java SDK, call the streamCall interface. For streaming output over HTTP, set the X-DashScope-SSE header to enable.
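To tie the optional parameters together, here is a minimal Python sketch that pins the synthesis language and enables instruction control in one call. It assumes the qwen3-tts-instruct-flash model from the examples above, and that the Python SDK accepts language_type as a keyword argument mirroring the language_type field shown in the curl examples:

import os
import dashscope

# Singapore region endpoint, as in the examples above
dashscope.base_http_api_url = 'https://dashscope-intl.aliyuncs.com/api/v1'

response = dashscope.MultiModalConversation.call(
    model="qwen3-tts-instruct-flash",  # instruction control requires this model series
    api_key=os.getenv("DASHSCOPE_API_KEY"),
    text="Welcome to today's flash sale - these T-shirts will not last long!",
    voice="Cherry",
    language_type="English",  # pin the language instead of relying on Auto
    instructions="Fast speech rate with a bright, enthusiastic tone.",
    optimize_instructions=True,  # has no effect unless instructions is set
)
print(response)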