All Products
Search
Document Center

Alibaba Cloud Model Studio: Python SDK

Last Updated: Feb 25, 2026

This topic describes the key interfaces and request parameters for calling real-time speech synthesis (Qwen) using the DashScope Python SDK.

User guide: For model descriptions and selection recommendations, see Real-time Text-to-Speech - Qwen or Speech synthesis - Qwen.

Prerequisites

Your DashScope Python SDK version must be 1.25.11 or later.

Getting started

Server commit mode

import os
import base64
import threading
import time
import dashscope
from dashscope.audio.qwen_tts_realtime import *


# Global handle to the realtime TTS session; assigned in the __main__ block below.
qwen_tts_realtime: QwenTtsRealtime = None
# Text chunks streamed to the synthesizer one at a time.
text_to_synthesize = [
    'Right? I love supermarkets like this.',
    'Especially during Chinese New Year,',
    'I go shopping at supermarkets.',
    'And I feel',
    'absolutely thrilled!',
    'I want to buy so many things!'
]

# Not referenced in this sample; retained from the original demo code.
DO_VIDEO_TEST = False

def init_dashscope_api_key():
    """Configure the DashScope API key.

    Prefers the DASHSCOPE_API_KEY environment variable and falls back to a
    hard-coded placeholder. More information:
    https://github.com/aliyun/alibabacloud-bailian-speech-demo/blob/master/PREREQUISITES.md
    """
    # API keys differ between the Singapore and Beijing regions.
    # Get an API key: https://www.alibabacloud.com/help/zh/model-studio/get-api-key
    env_key = os.environ.get('DASHSCOPE_API_KEY')
    if env_key is not None:
        dashscope.api_key = env_key  # loaded from environment variable DASHSCOPE_API_KEY
    else:
        dashscope.api_key = 'your-dashscope-api-key'  # set API key manually



class MyCallback(QwenTtsRealtimeCallback):
    """Callback for server_commit mode.

    Writes all synthesized audio to a single raw PCM file
    (result_24k.pcm, 24 kHz 16-bit mono per the session format) and
    signals completion when the server reports 'session.finished'.
    """

    def __init__(self):
        super().__init__()  # initialize the SDK base class (matches the commit-mode sample)
        self.complete_event = threading.Event()  # set on 'session.finished'
        self.file = open('result_24k.pcm', 'wb')

    def on_open(self) -> None:
        print('connection opened, init player')

    def on_close(self, close_status_code, close_msg) -> None:
        self.file.close()
        print('connection closed with code: {}, msg: {}, destroy player'.format(close_status_code, close_msg))

    def on_event(self, response: dict) -> None:
        """Handle one server event; `response` is a decoded event payload
        carrying at least a 'type' field."""
        try:
            global qwen_tts_realtime
            # Event types are mutually exclusive, so dispatch with elif;
            # use a local name to avoid shadowing the builtin `type`.
            event_type = response['type']
            if event_type == 'session.created':
                print('start session: {}'.format(response['session']['id']))
            elif event_type == 'response.audio.delta':
                # Audio arrives base64-encoded; decode and append to the file.
                self.file.write(base64.b64decode(response['delta']))
            elif event_type == 'response.done':
                print(f'response {qwen_tts_realtime.get_last_response_id()} done')
            elif event_type == 'session.finished':
                print('session finished')
                self.complete_event.set()
        except Exception as e:
            print('[Error] {}'.format(e))
            return

    def wait_for_finished(self):
        """Block until the server reports 'session.finished'."""
        self.complete_event.wait()


if __name__  == '__main__':
    init_dashscope_api_key()

    print('Initializing ...')

    callback = MyCallback()

    # Create the realtime TTS client for the server_commit demo.
    qwen_tts_realtime = QwenTtsRealtime(
        # To use instruction control, replace the model with qwen3-tts-instruct-flash-realtime
        model='qwen3-tts-flash-realtime',
        callback=callback,
        # This URL is for the Singapore region. If you use the Beijing region, replace it with: wss://dashscope.aliyuncs.com/api-ws/v1/realtime
        url='wss://dashscope-intl.aliyuncs.com/api-ws/v1/realtime'
        )

    qwen_tts_realtime.connect()
    # Configure voice/format. In 'server_commit' mode the server decides
    # when to synthesize the buffered text.
    qwen_tts_realtime.update_session(
        voice = 'Cherry',
        response_format = AudioFormat.PCM_24000HZ_MONO_16BIT,
        # To use instruction control, uncomment the following lines and replace the model with qwen3-tts-instruct-flash-realtime
        # instructions='Speak quickly with a rising intonation, suitable for introducing fashion products.',
        # optimize_instructions=True,
        mode = 'server_commit'
    )
    # Stream the text chunks; the short sleep simulates incremental text
    # arrival (e.g. tokens from an LLM).
    for text_chunk in text_to_synthesize:
        print(f'send text: {text_chunk}')
        qwen_tts_realtime.append_text(text_chunk)
        time.sleep(0.1)
    qwen_tts_realtime.finish()  # signal end of input
    callback.wait_for_finished()  # block until 'session.finished'
    print('[Metric] session: {}, first audio delay: {}'.format(
                    qwen_tts_realtime.get_session_id(), 
                    qwen_tts_realtime.get_first_audio_delay(),
                    ))

Commit mode

import base64
import os
import threading
import dashscope
from dashscope.audio.qwen_tts_realtime import *


# Global handle to the realtime TTS session; assigned in the __main__ block below.
qwen_tts_realtime: QwenTtsRealtime = None
# Sentences synthesized one per commit; each produces its own output file.
text_to_synthesize = [
    'This is the first sentence.',
    'This is the second sentence.',
    'This is the third sentence.',
]

# Not referenced in this sample; retained from the original demo code.
DO_VIDEO_TEST = False

def init_dashscope_api_key():
    """Configure the DashScope API key.

    Prefers the DASHSCOPE_API_KEY environment variable and falls back to a
    hard-coded placeholder. More information:
    https://github.com/aliyun/alibabacloud-bailian-speech-demo/blob/master/PREREQUISITES.md
    """
    # API keys differ between the Singapore and Beijing regions.
    # Get an API key: https://www.alibabacloud.com/help/zh/model-studio/get-api-key
    env_key = os.environ.get('DASHSCOPE_API_KEY')
    if env_key is not None:
        dashscope.api_key = env_key  # loaded from environment variable DASHSCOPE_API_KEY
    else:
        dashscope.api_key = 'your-dashscope-api-key'  # set API key manually



class MyCallback(QwenTtsRealtimeCallback):
    """Callback for commit mode.

    Writes each response's audio to its own PCM file
    (result_<n>_24k.pcm) and signals the main thread when a response
    completes so the next chunk can be committed.
    """

    def __init__(self):
        super().__init__()
        self.response_counter = 0  # index of the current response / output file
        self.complete_event = threading.Event()  # set on 'response.done' or 'session.finished'
        self.file = open(f'result_{self.response_counter}_24k.pcm', 'wb')

    def reset_event(self):
        """Prepare for the next response: open a new output file and
        replace the completion event with a fresh one."""
        self.response_counter += 1
        self.file = open(f'result_{self.response_counter}_24k.pcm', 'wb')
        self.complete_event = threading.Event()

    def on_open(self) -> None:
        print('connection opened, init player')

    def on_close(self, close_status_code, close_msg) -> None:
        # Normally the file is closed by the 'response.done' handler; close
        # it here too so an abnormal disconnect does not leak the handle.
        if not self.file.closed:
            self.file.close()
        print('connection closed with code: {}, msg: {}, destroy player'.format(close_status_code, close_msg))

    def on_event(self, response: dict) -> None:
        """Handle one server event; `response` is a decoded event payload
        carrying at least a 'type' field."""
        try:
            global qwen_tts_realtime
            # Event types are mutually exclusive, so dispatch with elif;
            # use a local name to avoid shadowing the builtin `type`.
            event_type = response['type']
            if event_type == 'session.created':
                print('start session: {}'.format(response['session']['id']))
            elif event_type == 'response.audio.delta':
                # Audio arrives base64-encoded; decode and append to the file.
                self.file.write(base64.b64decode(response['delta']))
            elif event_type == 'response.done':
                print(f'response {qwen_tts_realtime.get_last_response_id()} done')
                self.complete_event.set()
                self.file.close()
            elif event_type == 'session.finished':
                print('session finished')
                self.complete_event.set()
        except Exception as e:
            print('[Error] {}'.format(e))
            return

    def wait_for_response_done(self):
        """Block until the current response completes (or the session finishes)."""
        self.complete_event.wait()


if __name__  == '__main__':
    init_dashscope_api_key()

    print('Initializing ...')

    callback = MyCallback()

    # Create the realtime TTS client for the commit-mode demo.
    qwen_tts_realtime = QwenTtsRealtime(
        # To use instruction control, replace the model with qwen3-tts-instruct-flash-realtime
        model='qwen3-tts-flash-realtime',
        callback=callback,
        # This URL is for the Singapore region. If you use the Beijing region, replace it with: wss://dashscope.aliyuncs.com/api-ws/v1/realtime
        url='wss://dashscope-intl.aliyuncs.com/api-ws/v1/realtime'
        )

    qwen_tts_realtime.connect()
    qwen_tts_realtime.update_session(
        voice = 'Cherry',
        response_format = AudioFormat.PCM_24000HZ_MONO_16BIT,
        # To use instruction control, uncomment the following lines and replace the model with qwen3-tts-instruct-flash-realtime
        # instructions='Speak quickly with a rising intonation, suitable for introducing fashion products.',
        # optimize_instructions=True,
        mode = 'commit'
    )

    # In 'commit' mode the client triggers synthesis explicitly for each
    # chunk. Loop over the chunks instead of repeating the identical
    # send/commit/wait sequence with hard-coded indices; this also works
    # for any number of sentences.
    for i, text_chunk in enumerate(text_to_synthesize):
        if i > 0:
            # Open a new output file and a fresh completion event for
            # every response after the first.
            callback.reset_event()
        print(f'send text: {text_chunk}')
        qwen_tts_realtime.append_text(text_chunk)
        qwen_tts_realtime.commit()  # submit the buffer and synthesize now
        callback.wait_for_response_done()

    qwen_tts_realtime.finish()  # terminate the session
    print('[Metric] session: {}, first audio delay: {}'.format(
                    qwen_tts_realtime.get_session_id(),
                    qwen_tts_realtime.get_first_audio_delay(),
                    ))

Visit GitHub to download more sample code.

Request parameters

Set the following request parameters in the QwenTtsRealtime constructor.

Parameter

Type

Required

Description

model

str

Yes

Model name. See Supported models.

url

str

Yes

Chinese Mainland: wss://dashscope.aliyuncs.com/api-ws/v1/realtime

International: wss://dashscope-intl.aliyuncs.com/api-ws/v1/realtime

Configure the following request parameters using the update_session method.

Parameter

Type

Required

Description

voice

str

Yes

The voice for speech synthesis. For more information, see Supported voices.

System voices and custom voices are supported:

  • System voices: Available only for the Qwen3-TTS-Instruct-Flash-Realtime, Qwen3-TTS-Flash-Realtime, and Qwen-TTS-Realtime model series. For voice samples, see Supported voices.

  • Custom voices

    • Voices customized by the Voice Cloning (Qwen) feature: Available only for the Qwen3-TTS-VC-Realtime model series.

    • Voices customized by the Voice Design (Qwen) feature: Available only for the Qwen3-TTS-VD-Realtime model series.

language_type

str

No

Specifies the language of the synthesized audio. The default value is Auto.

  • Auto: Use this value when the language of the text is uncertain or contains multiple languages. The model automatically matches the pronunciation for different language segments in the text, but cannot guarantee perfect accuracy.

  • Specific language: Use this for single-language text. Specifying a language significantly improves synthesis quality and typically yields better results than Auto. Valid values include the following:

    • Chinese

    • English

    • German

    • Italian

    • Portuguese

    • Spanish

    • Japanese

    • Korean

    • French

    • Russian

mode

str

No

The interaction mode. Valid values:

  • server_commit (default): The server automatically determines when to synthesize speech, balancing latency and quality. This mode is recommended for most scenarios.

  • commit: The client manually triggers synthesis. This mode has the lowest latency, but you must manage sentence integrity yourself.

format

str

No

The format of the audio output from the model.

Supported formats:

  • pcm (default)

  • wav

  • mp3

  • opus

Qwen-TTS-Realtime (see Supported models) supports only pcm.

sample_rate

int

No

The sample rate of the audio output from the model, in Hz.

Supported sample rates:

  • 8000

  • 16000

  • 24000 (default)

  • 48000

Qwen-TTS-Realtime (see Supported models) supports only 24000.

speech_rate

float

No

The speech rate of the audio. A value of 1.0 is the normal speed. A value less than 1.0 is slower, and a value greater than 1.0 is faster.

Default value: 1.0.

Valid range: [0.5, 2.0].

Qwen-TTS-Realtime (see Supported models) does not support this parameter.

volume

int

No

The volume of the audio.

Default value: 50.

Valid range: [0, 100].

Qwen-TTS-Realtime (see Supported models) does not support this parameter.

pitch_rate

float

No

The pitch of the synthesized audio.

Default value: 1.0.

Valid range: [0.5, 2.0].

Qwen-TTS-Realtime (see Supported models) does not support this parameter.

bit_rate

int

No

Specifies the bitrate of the audio in kbps. A higher bitrate results in better audio quality and a larger file size. This parameter is available only when the audio format (response_format) is set to opus.

Default value: 128.

Valid range: [6, 510].

Qwen-TTS-Realtime (see Supported models) does not support this parameter.

instructions

str

No

Sets the instructions. For more information, see Real-time speech synthesis - Qwen.

Default value: None. The parameter is not active if not set.

Length limit: The length cannot exceed 1600 tokens.

Supported languages: Chinese and English only.

Scope: This feature is available only for the Qwen3-TTS-Instruct-Flash-Realtime model series.

optimize_instructions

bool

No

Specifies whether to optimize the instructions to improve the naturalness and expressiveness of the speech synthesis.

Default value: False

Behavior: When set to True, the system enhances and rewrites the instructions semantically to generate internal instructions optimized for speech synthesis.

Scenarios: Recommended for scenarios that require high-quality, fine-grained voice expression.

Dependency: This parameter depends on the instructions parameter being set. If instructions is empty, this parameter has no effect.

Scope: This feature is available only for the Qwen3-TTS-Instruct-Flash-Realtime model series.

Key interfaces

QwenTtsRealtime class

Import QwenTtsRealtime using from dashscope.audio.qwen_tts_realtime import QwenTtsRealtime.

Method signature

Server response events (delivered via callback)

Description

def connect(self) -> None

session.created

Session created

session.updated

Session configuration updated

Connect to the server.

    def update_session(self,
                       voice: str,
                       response_format: AudioFormat = AudioFormat.
                       PCM_24000HZ_MONO_16BIT,
                       mode: str = 'server_commit',
                       language_type : str = "Chinese",
                       **kwargs) -> None

session.updated

Session configuration updated

Update default session configurations. For parameter details, see the Request parameters section.

After you establish a connection, the server returns default input and output configurations for the session. We recommend that you call this method immediately after connecting to update these defaults.

When the server receives a session.update event, it validates the parameters. If any parameter is invalid, the server returns an error. Otherwise, it updates the session configuration on the server side.

def append_text(self, text: str) -> None

None

Append a text chunk to the cloud input text buffer. The buffer is temporary storage where you write text before submitting it.

  • In server_commit mode, the server decides when to submit and synthesize text in the buffer.

  • In commit mode, the client must trigger synthesis by calling commit.

def clear_appended_text(self, ) -> None

input_text_buffer.cleared

Clear text received by the server

Delete all text in the current cloud buffer.

def commit(self, ) -> None

input_text_buffer.committed

Submit text and trigger speech synthesis

response.output_item.added

New output content added

response.content_part.added

New content added to assistant message

response.audio.delta

Incremental audio generated by the model

response.audio.done

Audio generation completed

response.content_part.done

Streaming of audio content for assistant message completed

response.output_item.done

Streaming of entire output item for assistant message completed

response.done

Response completed

Submit all text previously appended to the cloud buffer and synthesize it immediately. An error occurs if the buffer is empty.

  • In server_commit mode, clients do not need to send this event. The server submits the buffer automatically.

  • In commit mode, clients must call commit to trigger speech synthesis.

def finish(self, ) -> None

session.finished

Session finished

Terminate the task.

def close(self, ) -> None

None

Close the connection.

def get_session_id(self) -> str

None

Get the session ID for the current task.

def get_last_response_id(self) -> str

None

Get the response ID of the most recent response.

def get_first_audio_delay(self)

None

Get the delay before the first audio packet arrives.

Callback interface (QwenTtsRealtimeCallback)

The server sends responses and data to the client through callbacks. Implement callback methods to handle server responses and data.

Import QwenTtsRealtimeCallback using from dashscope.audio.qwen_tts_realtime import QwenTtsRealtimeCallback.

Method

Parameters

Return value

Description

def on_open(self) -> None

None

None

Called immediately after the connection to the server is established.

def on_event(self, message: str) -> None

message: Server response event.

None

Includes responses to API calls and model-generated text and audio. For details, see Server events.

def on_close(self, close_status_code, close_msg) -> None

close_status_code: WebSocket close status code.

close_msg: WebSocket close message.

None

Called after the server closes the connection.