All Products
Search
Document Center

Alibaba Cloud Model Studio: Python SDK

Last Updated: Feb 25, 2026

This topic describes the key interfaces and request parameters for calling real-time speech synthesis (Qwen) using the DashScope Python SDK.

User guide: For model descriptions and selection recommendations, see Real-time Text-to-Speech - Qwen or Speech synthesis - Qwen.

Prerequisites

Your DashScope Python SDK version must be 1.25.11 or later.

Getting started

Server commit mode

import os
import base64
import threading
import time
import dashscope
from dashscope.audio.qwen_tts_realtime import *


# Global handle to the realtime TTS session; assigned in the __main__ block below.
qwen_tts_realtime: QwenTtsRealtime = None
# Text chunks streamed to the synthesizer one at a time.
text_to_synthesize = [
    'Right? I love supermarkets like this.',
    'Especially during Chinese New Year,',
    'I go shopping at supermarkets.',
    'And I feel',
    'absolutely thrilled!',
    'I want to buy so many things!'
]

# Not referenced in this sample; retained from the original demo code.
DO_VIDEO_TEST = False

def init_dashscope_api_key():
    """Configure the DashScope API key.

    Prefers the DASHSCOPE_API_KEY environment variable and falls back to a
    hard-coded placeholder. More information:
    https://github.com/aliyun/alibabacloud-bailian-speech-demo/blob/master/PREREQUISITES.md
    """
    # API keys differ between the Singapore and Beijing regions.
    # Get an API key: https://www.alibabacloud.com/help/zh/model-studio/get-api-key
    env_key = os.environ.get('DASHSCOPE_API_KEY')
    if env_key is not None:
        dashscope.api_key = env_key  # loaded from environment variable DASHSCOPE_API_KEY
    else:
        dashscope.api_key = 'your-dashscope-api-key'  # set API key manually



class MyCallback(QwenTtsRealtimeCallback):
    """Callback for server_commit mode.

    Writes all synthesized audio to a single raw PCM file
    (result_24k.pcm, 24 kHz 16-bit mono per the session format) and
    signals completion when the server reports 'session.finished'.
    """

    def __init__(self):
        super().__init__()  # initialize the SDK base class (matches the commit-mode sample)
        self.complete_event = threading.Event()  # set on 'session.finished'
        self.file = open('result_24k.pcm', 'wb')

    def on_open(self) -> None:
        print('connection opened, init player')

    def on_close(self, close_status_code, close_msg) -> None:
        self.file.close()
        print('connection closed with code: {}, msg: {}, destroy player'.format(close_status_code, close_msg))

    def on_event(self, response: dict) -> None:
        """Handle one server event; `response` is a decoded event payload
        carrying at least a 'type' field."""
        try:
            global qwen_tts_realtime
            # Event types are mutually exclusive, so dispatch with elif;
            # use a local name to avoid shadowing the builtin `type`.
            event_type = response['type']
            if event_type == 'session.created':
                print('start session: {}'.format(response['session']['id']))
            elif event_type == 'response.audio.delta':
                # Audio arrives base64-encoded; decode and append to the file.
                self.file.write(base64.b64decode(response['delta']))
            elif event_type == 'response.done':
                print(f'response {qwen_tts_realtime.get_last_response_id()} done')
            elif event_type == 'session.finished':
                print('session finished')
                self.complete_event.set()
        except Exception as e:
            print('[Error] {}'.format(e))
            return

    def wait_for_finished(self):
        """Block until the server reports 'session.finished'."""
        self.complete_event.wait()


if __name__  == '__main__':
    init_dashscope_api_key()

    print('Initializing ...')

    callback = MyCallback()

    # Create the realtime TTS client for the server_commit demo.
    qwen_tts_realtime = QwenTtsRealtime(
        # To use instruction control, replace the model with qwen3-tts-instruct-flash-realtime
        model='qwen3-tts-flash-realtime',
        callback=callback,
        # This URL is for the Singapore region. If you use the Beijing region, replace it with: wss://dashscope.aliyuncs.com/api-ws/v1/realtime
        url='wss://dashscope-intl.aliyuncs.com/api-ws/v1/realtime'
        )

    qwen_tts_realtime.connect()
    # Configure voice/format. In 'server_commit' mode the server decides
    # when to synthesize the buffered text.
    qwen_tts_realtime.update_session(
        voice = 'Cherry',
        response_format = AudioFormat.PCM_24000HZ_MONO_16BIT,
        # To use instruction control, uncomment the following lines and replace the model with qwen3-tts-instruct-flash-realtime
        # instructions='Speak quickly with a rising intonation, suitable for introducing fashion products.',
        # optimize_instructions=True,
        mode = 'server_commit'
    )
    # Stream the text chunks; the short sleep simulates incremental text
    # arrival (e.g. tokens from an LLM).
    for text_chunk in text_to_synthesize:
        print(f'send text: {text_chunk}')
        qwen_tts_realtime.append_text(text_chunk)
        time.sleep(0.1)
    qwen_tts_realtime.finish()  # signal end of input
    callback.wait_for_finished()  # block until 'session.finished'
    print('[Metric] session: {}, first audio delay: {}'.format(
                    qwen_tts_realtime.get_session_id(), 
                    qwen_tts_realtime.get_first_audio_delay(),
                    ))

Commit mode

import base64
import os
import threading
import dashscope
from dashscope.audio.qwen_tts_realtime import *


# Global handle to the realtime TTS session; assigned in the __main__ block below.
qwen_tts_realtime: QwenTtsRealtime = None
# Sentences synthesized one per commit; each produces its own output file.
text_to_synthesize = [
    'This is the first sentence.',
    'This is the second sentence.',
    'This is the third sentence.',
]

# Not referenced in this sample; retained from the original demo code.
DO_VIDEO_TEST = False

def init_dashscope_api_key():
    """Configure the DashScope API key.

    Prefers the DASHSCOPE_API_KEY environment variable and falls back to a
    hard-coded placeholder. More information:
    https://github.com/aliyun/alibabacloud-bailian-speech-demo/blob/master/PREREQUISITES.md
    """
    # API keys differ between the Singapore and Beijing regions.
    # Get an API key: https://www.alibabacloud.com/help/zh/model-studio/get-api-key
    env_key = os.environ.get('DASHSCOPE_API_KEY')
    if env_key is not None:
        dashscope.api_key = env_key  # loaded from environment variable DASHSCOPE_API_KEY
    else:
        dashscope.api_key = 'your-dashscope-api-key'  # set API key manually



class MyCallback(QwenTtsRealtimeCallback):
    """Callback for commit mode.

    Writes each response's audio to its own PCM file
    (result_<n>_24k.pcm) and signals the main thread when a response
    completes so the next chunk can be committed.
    """

    def __init__(self):
        super().__init__()
        self.response_counter = 0  # index of the current response / output file
        self.complete_event = threading.Event()  # set on 'response.done' or 'session.finished'
        self.file = open(f'result_{self.response_counter}_24k.pcm', 'wb')

    def reset_event(self):
        """Prepare for the next response: open a new output file and
        replace the completion event with a fresh one."""
        self.response_counter += 1
        self.file = open(f'result_{self.response_counter}_24k.pcm', 'wb')
        self.complete_event = threading.Event()

    def on_open(self) -> None:
        print('connection opened, init player')

    def on_close(self, close_status_code, close_msg) -> None:
        # Normally the file is closed by the 'response.done' handler; close
        # it here too so an abnormal disconnect does not leak the handle.
        if not self.file.closed:
            self.file.close()
        print('connection closed with code: {}, msg: {}, destroy player'.format(close_status_code, close_msg))

    def on_event(self, response: dict) -> None:
        """Handle one server event; `response` is a decoded event payload
        carrying at least a 'type' field."""
        try:
            global qwen_tts_realtime
            # Event types are mutually exclusive, so dispatch with elif;
            # use a local name to avoid shadowing the builtin `type`.
            event_type = response['type']
            if event_type == 'session.created':
                print('start session: {}'.format(response['session']['id']))
            elif event_type == 'response.audio.delta':
                # Audio arrives base64-encoded; decode and append to the file.
                self.file.write(base64.b64decode(response['delta']))
            elif event_type == 'response.done':
                print(f'response {qwen_tts_realtime.get_last_response_id()} done')
                self.complete_event.set()
                self.file.close()
            elif event_type == 'session.finished':
                print('session finished')
                self.complete_event.set()
        except Exception as e:
            print('[Error] {}'.format(e))
            return

    def wait_for_response_done(self):
        """Block until the current response completes (or the session finishes)."""
        self.complete_event.wait()


if __name__  == '__main__':
    init_dashscope_api_key()

    print('Initializing ...')

    callback = MyCallback()

    # Create the realtime TTS client for the commit-mode demo.
    qwen_tts_realtime = QwenTtsRealtime(
        # To use instruction control, replace the model with qwen3-tts-instruct-flash-realtime
        model='qwen3-tts-flash-realtime',
        callback=callback,
        # This URL is for the Singapore region. If you use the Beijing region, replace it with: wss://dashscope.aliyuncs.com/api-ws/v1/realtime
        url='wss://dashscope-intl.aliyuncs.com/api-ws/v1/realtime'
        )

    qwen_tts_realtime.connect()
    qwen_tts_realtime.update_session(
        voice = 'Cherry',
        response_format = AudioFormat.PCM_24000HZ_MONO_16BIT,
        # To use instruction control, uncomment the following lines and replace the model with qwen3-tts-instruct-flash-realtime
        # instructions='Speak quickly with a rising intonation, suitable for introducing fashion products.',
        # optimize_instructions=True,
        mode = 'commit'
    )

    # In 'commit' mode the client triggers synthesis explicitly for each
    # chunk. Loop over the chunks instead of repeating the identical
    # send/commit/wait sequence with hard-coded indices; this also works
    # for any number of sentences.
    for i, text_chunk in enumerate(text_to_synthesize):
        if i > 0:
            # Open a new output file and a fresh completion event for
            # every response after the first.
            callback.reset_event()
        print(f'send text: {text_chunk}')
        qwen_tts_realtime.append_text(text_chunk)
        qwen_tts_realtime.commit()  # submit the buffer and synthesize now
        callback.wait_for_response_done()

    qwen_tts_realtime.finish()  # terminate the session
    print('[Metric] session: {}, first audio delay: {}'.format(
                    qwen_tts_realtime.get_session_id(),
                    qwen_tts_realtime.get_first_audio_delay(),
                    ))

Visit GitHub to download more sample code.

Request parameters

Set the following request parameters in the QwenTtsRealtime constructor.

Parameter

Type

Required

Description

model

str

Yes

Model name. See Supported models.

url

str

Yes

Chinese Mainland: wss://dashscope.aliyuncs.com/api-ws/v1/realtime

International: wss://dashscope-intl.aliyuncs.com/api-ws/v1/realtime

Configure the following request parameters using the update_session method.

Parameter

Type

Required

Description

voice

str

Yes

The voice for speech synthesis. For more information, see Supported voices.

System voices and custom voices are supported:

  • System voices: Available only for the Qwen3-TTS-Instruct-Flash-Realtime, Qwen3-TTS-Flash-Realtime, and Qwen-TTS-Realtime model series. For voice samples, see Supported voices.

  • Custom voices

    • Voices customized by the Voice Cloning (Qwen) feature: Available only for the Qwen3-TTS-VC-Realtime model series.

    • Voices customized by the Voice Design (Qwen) feature: Available only for the Qwen3-TTS-VD-Realtime model series.

language_type

str

No

Specifies the language of the synthesized audio. The default value is Auto.

  • Auto: Use this value when the language of the text is uncertain or contains multiple languages. The model automatically matches the pronunciation for different language segments in the text, but cannot guarantee perfect accuracy.

  • Specific language: Use this for single-language text. Specifying a language significantly improves synthesis quality and typically yields better results than Auto. Valid values include the following:

    • Chinese

    • English

    • German

    • Italian

    • Portuguese

    • Spanish

    • Japanese

    • Korean

    • French

    • Russian

mode

str

No

The interaction mode. Valid values:

  • server_commit (default): The server automatically determines when to synthesize speech, balancing latency and quality. This mode is recommended for most scenarios.

  • commit: The client manually triggers synthesis. This mode has the lowest latency, but you must manage sentence integrity yourself.

format

str

No

The format of the audio output from the model.

Supported formats:

  • pcm (default)

  • wav

  • mp3

  • opus

Qwen-TTS-Realtime (see Supported models) supports only pcm.

sample_rate

int

No

The sample rate of the audio output from the model, in Hz.

Supported sample rates:

  • 8000

  • 16000

  • 24000 (default)

  • 48000

Qwen-TTS-Realtime (see Supported models) supports only 24000.

speech_rate

float

No

The speech rate of the audio. A value of 1.0 is the normal speed. A value less than 1.0 is slower, and a value greater than 1.0 is faster.

Default value: 1.0.

Valid range: [0.5, 2.0].

Qwen-TTS-Realtime (see Supported models) does not support this parameter.

volume

int

No

The volume of the audio.

Default value: 50.

Valid range: [0, 100].

Qwen-TTS-Realtime (see Supported models) does not support this parameter.

pitch_rate

float

No

The pitch of the synthesized audio.

Default value: 1.0.

Valid range: [0.5, 2.0].

Qwen-TTS-Realtime (see Supported models) does not support this parameter.

bit_rate

int

No

Specifies the bitrate of the audio in kbps. A higher bitrate results in better audio quality and a larger file size. This parameter is available only when the audio format (response_format) is set to opus.

Default value: 128.

Valid range: [6, 510].

Qwen-TTS-Realtime (see Supported models) does not support this parameter.

instructions

str

No

Sets the instructions. For more information, see Real-time speech synthesis - Qwen.

Default value: None. The parameter is not active if not set.

Length limit: The length cannot exceed 1600 tokens.

Supported languages: Chinese and English only.

Scope: This feature is available only for the Qwen3-TTS-Instruct-Flash-Realtime model series.

optimize_instructions

bool

No

Specifies whether to optimize the instructions to improve the naturalness and expressiveness of the speech synthesis.

Default value: False

Behavior: When set to True, the system enhances and rewrites the instructions semantically to generate internal instructions optimized for speech synthesis.

Scenarios: Recommended for scenarios that require high-quality, fine-grained voice expression.

Dependency: This parameter depends on the instructions parameter being set. If instructions is empty, this parameter has no effect.

Scope: This feature is available only for the Qwen3-TTS-Instruct-Flash-Realtime model series.

Key interfaces

QwenTtsRealtime class

Import QwenTtsRealtime using from dashscope.audio.qwen_tts_realtime import QwenTtsRealtime.

Method signature

Server response events (delivered via callback)

Description

def connect(self) -> None

session.created

Session created

session.updated

Session configuration updated

Connect to the server.

    def update_session(self,
                       voice: str,
                       response_format: AudioFormat = AudioFormat.
                       PCM_24000HZ_MONO_16BIT,
                       mode: str = 'server_commit',
                       language_type : str = "Chinese",
                       **kwargs) -> None

session.updated

Session configuration updated

Update default session configurations. For parameter details, see the Request parameters section.

After you establish a connection, the server returns default input and output configurations for the session. We recommend that you call this method immediately after connecting to update these defaults.

When the server receives a session.update event, it validates the parameters. If any parameter is invalid, the server returns an error. Otherwise, it updates the session configuration on the server side.

def append_text(self, text: str) -> None

None

Append a text chunk to the cloud input text buffer. The buffer is temporary storage where you write text before submitting it.

  • In server_commit mode, the server decides when to submit and synthesize text in the buffer.

  • In commit mode, the client must trigger synthesis by calling commit.

def clear_appended_text(self, ) -> None

input_text_buffer.cleared

Clear text received by the server

Delete all text in the current cloud buffer.

def commit(self, ) -> None

input_text_buffer.committed

Submit text and trigger speech synthesis

response.output_item.added

New output content added

response.content_part.added

New content added to assistant message

response.audio.delta

Incremental audio generated by the model

response.audio.done

Audio generation completed

response.content_part.done

Streaming of audio content for assistant message completed

response.output_item.done

Streaming of entire output item for assistant message completed

response.done

Response completed

Submit all text previously appended to the cloud buffer and synthesize it immediately. An error occurs if the buffer is empty.

  • In server_commit mode, clients do not need to send this event. The server submits the buffer automatically.

  • In commit mode, clients must call commit to trigger speech synthesis.

def finish(self, ) -> None

session.finished

Session finished

Terminate the task.

def close(self, ) -> None

None

Close the connection.

def get_session_id(self) -> str

None

Get the session ID for the current task.

def get_last_response_id(self) -> str

None

Get the response ID of the most recent response.

def get_first_audio_delay(self)

None

Get the delay before the first audio packet arrives.

Callback interface (QwenTtsRealtimeCallback)

The server sends responses and data to the client through callbacks. Implement callback methods to handle server responses and data.

Import QwenTtsRealtimeCallback using from dashscope.audio.qwen_tts_realtime import QwenTtsRealtimeCallback.

Method

Parameters

Return value

Description

def on_open(self) -> None

None

None

Called immediately after the connection to the server is established.

def on_event(self, message: str) -> None

message: Server response event.

None

Includes responses to API calls and model-generated text and audio. For details, see Server events.

def on_close(self, close_status_code, close_msg) -> None

close_status_code: WebSocket close status code.

close_msg: WebSocket close message.

None

Called after the server closes the connection.