This topic describes the key interfaces and request parameters for calling real-time speech synthesis (Qwen) using the DashScope Python SDK.
User guide: For model descriptions and selection recommendations, see Real-time Text-to-Speech - Qwen or Speech synthesis - Qwen.
Prerequisites
Your DashScope Python SDK version must be 1.25.11 or later.
Getting started
Server commit mode
import os
import base64
import threading
import time
import dashscope
from dashscope.audio.qwen_tts_realtime import *
# Global handle to the realtime TTS session; assigned in __main__ below.
qwen_tts_realtime: QwenTtsRealtime = None
# Text chunks streamed to the server one by one to simulate incremental input.
text_to_synthesize = [
'Right? I love supermarkets like this.',
'Especially during Chinese New Year,',
'I go shopping at supermarkets.',
'And I feel',
'absolutely thrilled!',
'I want to buy so many things!'
]
# NOTE(review): DO_VIDEO_TEST is never read in this sample — confirm before removing.
DO_VIDEO_TEST = False
def init_dashscope_api_key():
    """
    Configure dashscope.api_key for subsequent SDK calls.

    More information:
    https://github.com/aliyun/alibabacloud-bailian-speech-demo/blob/master/PREREQUISITES.md
    """
    # API keys differ between the Singapore and Beijing regions.
    # Get an API key: https://www.alibabacloud.com/help/zh/model-studio/get-api-key
    env_key = os.environ.get('DASHSCOPE_API_KEY')
    if env_key is not None:
        # Load API key from environment variable DASHSCOPE_API_KEY
        dashscope.api_key = env_key
    else:
        dashscope.api_key = 'your-dashscope-api-key'  # Set API key manually
class MyCallback(QwenTtsRealtimeCallback):
    """Callback for server-commit mode.

    Writes each incoming base64 audio delta to 'result_24k.pcm' and
    signals completion when the server reports 'session.finished'.
    """

    def __init__(self):
        # Initialize the base class (consistent with the commit-mode sample).
        super().__init__()
        # Set when the 'session.finished' event arrives.
        self.complete_event = threading.Event()
        # Raw PCM output: 24 kHz, mono, 16-bit (matches the session format below).
        self.file = open('result_24k.pcm', 'wb')

    def on_open(self) -> None:
        # Invoked once the WebSocket connection is established.
        print('connection opened, init player')

    def on_close(self, close_status_code, close_msg) -> None:
        # Release the output file when the server closes the connection.
        self.file.close()
        print('connection closed with code: {}, msg: {}, destroy player'.format(close_status_code, close_msg))

    def on_event(self, response: dict) -> None:
        """Handle one server event (a decoded JSON object, not a str)."""
        try:
            global qwen_tts_realtime
            event_type = response['type']  # renamed: do not shadow builtin `type`
            if event_type == 'session.created':
                print('start session: {}'.format(response['session']['id']))
            elif event_type == 'response.audio.delta':
                # Audio arrives base64-encoded; decode before writing.
                self.file.write(base64.b64decode(response['delta']))
            elif event_type == 'response.done':
                print(f'response {qwen_tts_realtime.get_last_response_id()} done')
            elif event_type == 'session.finished':
                print('session finished')
                self.complete_event.set()
        except Exception as e:
            # Keep the callback thread alive; just report the error.
            print('[Error] {}'.format(e))
            return

    def wait_for_finished(self):
        """Block the caller until 'session.finished' has been received."""
        self.complete_event.wait()
if __name__ == '__main__':
    init_dashscope_api_key()
    print('Initializing ...')

    handler = MyCallback()
    qwen_tts_realtime = QwenTtsRealtime(
        # To use instruction control, replace the model with qwen3-tts-instruct-flash-realtime
        model='qwen3-tts-flash-realtime',
        callback=handler,
        # This URL is for the Singapore region. If you use the Beijing region, replace it with: wss://dashscope.aliyuncs.com/api-ws/v1/realtime
        url='wss://dashscope-intl.aliyuncs.com/api-ws/v1/realtime',
    )
    qwen_tts_realtime.connect()

    # Configure the session before sending any text.
    qwen_tts_realtime.update_session(
        voice='Cherry',
        response_format=AudioFormat.PCM_24000HZ_MONO_16BIT,
        # To use instruction control, uncomment the following lines and replace the model with qwen3-tts-instruct-flash-realtime
        # instructions='Speak quickly with a rising intonation, suitable for introducing fashion products.',
        # optimize_instructions=True,
        mode='server_commit',
    )

    # Stream the text chunks with a short pause to simulate incremental input;
    # in server-commit mode the server decides when to synthesize.
    for chunk in text_to_synthesize:
        print(f'send text: {chunk}')
        qwen_tts_realtime.append_text(chunk)
        time.sleep(0.1)

    qwen_tts_realtime.finish()
    handler.wait_for_finished()
    print(f'[Metric] session: {qwen_tts_realtime.get_session_id()}, '
          f'first audio delay: {qwen_tts_realtime.get_first_audio_delay()}')
Commit mode
import base64
import os
import threading
import dashscope
from dashscope.audio.qwen_tts_realtime import *
# Global handle to the realtime TTS session; assigned in __main__ below.
qwen_tts_realtime: QwenTtsRealtime = None
# In commit mode each sentence is synthesized as its own response.
text_to_synthesize = [
'This is the first sentence.',
'This is the second sentence.',
'This is the third sentence.',
]
# NOTE(review): DO_VIDEO_TEST is never read in this sample — confirm before removing.
DO_VIDEO_TEST = False
def init_dashscope_api_key():
    """
    Configure dashscope.api_key for subsequent SDK calls.

    More information:
    https://github.com/aliyun/alibabacloud-bailian-speech-demo/blob/master/PREREQUISITES.md
    """
    # API keys differ between the Singapore and Beijing regions.
    # Get an API key: https://www.alibabacloud.com/help/zh/model-studio/get-api-key
    env_key = os.environ.get('DASHSCOPE_API_KEY')
    if env_key is not None:
        # Load API key from environment variable DASHSCOPE_API_KEY
        dashscope.api_key = env_key
    else:
        dashscope.api_key = 'your-dashscope-api-key'  # Set API key manually
class MyCallback(QwenTtsRealtimeCallback):
    """Callback for commit mode.

    Writes each response's audio to its own file ('result_<n>_24k.pcm')
    and signals when a response — or the whole session — has finished.
    """

    def __init__(self):
        super().__init__()
        # Index of the current response; used to name the output files.
        self.response_counter = 0
        # Set on 'response.done' and on 'session.finished'.
        self.complete_event = threading.Event()
        self.file = open(f'result_{self.response_counter}_24k.pcm', 'wb')

    def reset_event(self):
        """Rotate to a fresh output file and completion event for the next response."""
        self.response_counter += 1
        if not self.file.closed:
            # Normally closed in on_event on 'response.done'; close defensively
            # here as well so a missed event cannot leak the file handle.
            self.file.close()
        self.file = open(f'result_{self.response_counter}_24k.pcm', 'wb')
        self.complete_event = threading.Event()

    def on_open(self) -> None:
        # Invoked once the WebSocket connection is established.
        print('connection opened, init player')

    def on_close(self, close_status_code, close_msg) -> None:
        # Ensure the last output file is released (consistent with the
        # server-commit sample, which closes its file here).
        if not self.file.closed:
            self.file.close()
        print('connection closed with code: {}, msg: {}, destroy player'.format(close_status_code, close_msg))

    def on_event(self, response: dict) -> None:
        """Handle one server event (a decoded JSON object, not a str)."""
        try:
            global qwen_tts_realtime
            event_type = response['type']  # renamed: do not shadow builtin `type`
            if event_type == 'session.created':
                print('start session: {}'.format(response['session']['id']))
            elif event_type == 'response.audio.delta':
                # Audio arrives base64-encoded; decode before writing.
                self.file.write(base64.b64decode(response['delta']))
            elif event_type == 'response.done':
                print(f'response {qwen_tts_realtime.get_last_response_id()} done')
                self.complete_event.set()
                self.file.close()
            elif event_type == 'session.finished':
                print('session finished')
                self.complete_event.set()
        except Exception as e:
            # Keep the callback thread alive; just report the error.
            print('[Error] {}'.format(e))
            return

    def wait_for_response_done(self):
        """Block until the current response (or the session) completes."""
        self.complete_event.wait()
if __name__ == '__main__':
    init_dashscope_api_key()
    print('Initializing ...')

    callback = MyCallback()
    qwen_tts_realtime = QwenTtsRealtime(
        # To use instruction control, replace the model with qwen3-tts-instruct-flash-realtime
        model='qwen3-tts-flash-realtime',
        callback=callback,
        # This URL is for the Singapore region. If you use the Beijing region, replace it with: wss://dashscope.aliyuncs.com/api-ws/v1/realtime
        url='wss://dashscope-intl.aliyuncs.com/api-ws/v1/realtime'
    )
    qwen_tts_realtime.connect()

    # Configure the session before sending any text.
    qwen_tts_realtime.update_session(
        voice='Cherry',
        response_format=AudioFormat.PCM_24000HZ_MONO_16BIT,
        # To use instruction control, uncomment the following lines and replace the model with qwen3-tts-instruct-flash-realtime
        # instructions='Speak quickly with a rising intonation, suitable for introducing fashion products.',
        # optimize_instructions=True,
        mode='commit'
    )

    # One response per sentence: append the text, commit it, then wait for
    # 'response.done' before preparing the next response. This loop replaces
    # the original copy-pasted three-step sequence; the call order is identical.
    for index, text_chunk in enumerate(text_to_synthesize):
        if index > 0:
            # Rotate to a fresh output file and completion event.
            callback.reset_event()
        print(f'send text: {text_chunk}')
        qwen_tts_realtime.append_text(text_chunk)
        qwen_tts_realtime.commit()
        callback.wait_for_response_done()

    qwen_tts_realtime.finish()
    print('[Metric] session: {}, first audio delay: {}'.format(
        qwen_tts_realtime.get_session_id(),
        qwen_tts_realtime.get_first_audio_delay(),
    ))
Visit GitHub to download more sample code.
Request parameters
Set the following request parameters in the QwenTtsRealtime constructor.
|
Parameter |
Type |
Required |
Description |
|
model |
str |
Yes |
Model name. See Supported models. |
|
url |
str |
Yes |
Chinese Mainland (Beijing): wss://dashscope.aliyuncs.com/api-ws/v1/realtime International (Singapore): wss://dashscope-intl.aliyuncs.com/api-ws/v1/realtime |
Configure the following request parameters using the update_session method.
|
Parameter |
Type |
Required |
Description |
|
voice |
str |
Yes |
The voice for speech synthesis. For more information, see Supported voices. System voices and custom voices are supported:
|
|
language_type |
str |
No |
Specifies the language of the synthesized audio. The default value is
|
|
mode |
str |
No |
The interaction mode. Valid values: server_commit and commit (see the code examples above).
|
|
format |
str |
No |
The format of the audio output from the model. Supported formats:
Qwen-TTS-Realtime (see Supported models) supports only pcm. |
|
sample_rate |
int |
No |
The sample rate of the audio output from the model, in Hz. Supported sample rates:
Qwen-TTS-Realtime (see Supported models) supports only 24000. |
|
speech_rate |
float |
No |
The speech rate of the audio. A value of 1.0 is the normal speed. A value less than 1.0 is slower, and a value greater than 1.0 is faster. Default value: 1.0. Valid range: [0.5, 2.0]. Qwen-TTS-Realtime (see Supported models) does not support this parameter. |
|
volume |
int |
No |
The volume of the audio. Default value: 50. Valid range: [0, 100]. Qwen-TTS-Realtime (see Supported models) does not support this parameter. |
|
pitch_rate |
float |
No |
The pitch of the synthesized audio. Default value: 1.0. Valid range: [0.5, 2.0]. Qwen-TTS-Realtime (see Supported models) does not support this parameter. |
|
bit_rate |
int |
No |
Specifies the bitrate of the audio in kbps. A higher bitrate results in better audio quality and a larger file size. This parameter is available only when the audio format ( Default value: 128. Valid range: [6, 510]. Qwen-TTS-Realtime (see Supported models) does not support this parameter. |
|
instructions |
str |
No |
Sets the instructions. For more information, see Real-time speech synthesis - Qwen. Default value: None. The parameter is not active if not set. Length limit: The length cannot exceed 1600 tokens. Supported languages: Chinese and English only. Scope: This feature is available only for the Qwen3-TTS-Instruct-Flash-Realtime model series. |
|
optimize_instructions |
bool |
No |
Specifies whether to optimize the Default value: False Behavior: When set to True, the system enhances and rewrites the Scenarios: Recommended for scenarios that require high-quality, fine-grained voice expression. Dependency: This parameter depends on the Scope: This feature is available only for the Qwen3-TTS-Instruct-Flash-Realtime model series. |
Key interfaces
QwenTtsRealtime class
Import QwenTtsRealtime using from dashscope.audio.qwen_tts_realtime import QwenTtsRealtime.
|
Method signature |
Server response events (delivered via callback) |
Description |
|
connect() |
Session created Session configuration updated |
Connect to the server. |
|
update_session() |
Session configuration updated |
Update default session configurations. For parameter details, see the Request parameters section. After you establish a connection, the server returns default input and output configurations for the session. We recommend that you call this method immediately after connecting to update these defaults. When the server receives a session.update event, it validates the parameters. If any parameter is invalid, the server returns an error. Otherwise, it updates the session configuration on the server side. |
|
append_text() |
None |
Append a text chunk to the cloud input text buffer. The buffer is temporary storage where you write text before submitting it.
|
|
Clear text received by the server |
Delete all text in the current cloud buffer. |
|
commit() |
Submit text and trigger speech synthesis New output content added New content added to assistant message Incremental audio generated by the model Audio generation completed Streaming of audio content for assistant message completed Streaming of entire output item for assistant message completed Response completed |
Submit all text previously appended to the cloud buffer and synthesize it immediately. An error occurs if the buffer is empty.
|
|
finish() |
Response completed |
Terminate the task. |
|
None |
Close the connection. |
|
get_session_id() |
None |
Get the session ID for the current task. |
|
get_last_response_id() |
None |
Get the response ID of the most recent response. |
|
get_first_audio_delay() |
None |
Get the delay before the first audio packet arrives. |
Callback interface (QwenTtsRealtimeCallback)
The server sends responses and data to the client through callbacks. Implement callback methods to handle server responses and data.
Import QwenTtsRealtimeCallback using from dashscope.audio.qwen_tts_realtime import QwenTtsRealtimeCallback.
|
Method |
Parameters |
Return value |
Description |
|
on_open() |
None |
None |
Called immediately after the connection to the server is established. |
|
on_event() |
response: Server response event (a parsed JSON object). |
None |
Includes responses to API calls and model-generated text and audio. For details, see Server events. |
|
on_close() |
close_status_code: WebSocket close status code. close_msg: WebSocket close message. |
None |
Called after the server closes the connection. |