All Products
Search
Document Center

Alibaba Cloud Model Studio: Java SDK

Last Updated: Apr 01, 2026

Key interfaces and request parameters of the DashScope Java SDK for Qwen real-time speech synthesis.

User guide: For model introductions and selection recommendations, see Real-time speech synthesis – Qwen or Speech synthesis – Qwen.

Prerequisites

Requires DashScope Java SDK 2.22.7 or later.

Getting started

Server commit mode

appendText()

import com.alibaba.dashscope.audio.qwen_tts_realtime.*;
import com.alibaba.dashscope.exception.NoApiKeyException;
import com.google.gson.JsonObject;
import javax.sound.sampled.LineUnavailableException;
import javax.sound.sampled.SourceDataLine;
import javax.sound.sampled.AudioFormat;
import javax.sound.sampled.DataLine;
import javax.sound.sampled.AudioSystem;
import java.io.*;
import java.util.Base64;
import java.util.Queue;
import java.util.concurrent.CountDownLatch;
import java.util.concurrent.atomic.AtomicReference;
import java.util.concurrent.ConcurrentLinkedQueue;
import java.util.concurrent.atomic.AtomicBoolean;

public class Main {
    // Text segments appended to the server-side buffer one by one.
    static String[] textToSynthesize = {
            "Right? I really love this kind of supermarket.",
            "Especially during the Chinese New Year.",
            "Going to the supermarket.",
            "It just makes me feel.",
            "Super, super happy!",
            "I want to buy so many things!"
    };
    // Audio format requested from the service: 24 kHz, mono, 16-bit PCM.
    public static QwenTtsRealtimeAudioFormat ttsFormat = QwenTtsRealtimeAudioFormat.PCM_24000HZ_MONO_16BIT;

    /**
     * Real-time PCM audio player.
     *
     * <p>Base64-encoded chunks are queued via {@link #write(String)}. A decoder thread
     * decodes them to raw PCM (also archiving the complete audio for saving on
     * shutdown), and a player thread feeds decoded chunks to the audio line.
     */
    public static class RealtimePcmPlayer {
        private int sampleRate;
        private SourceDataLine line;
        private AudioFormat audioFormat;
        private Thread decoderThread;
        private Thread playerThread;
        private AtomicBoolean stopped = new AtomicBoolean(false);
        // Base64 chunks waiting to be decoded.
        private Queue<String> b64AudioBuffer = new ConcurrentLinkedQueue<>();
        // Decoded PCM chunks waiting to be played.
        private Queue<byte[]> rawAudioBuffer = new ConcurrentLinkedQueue<>();
        // Accumulates every decoded chunk so the full audio can be saved on shutdown.
        private ByteArrayOutputStream totalAudioStream = new ByteArrayOutputStream();

        /**
         * Initializes the audio format and audio line, then starts the decoder and
         * player threads.
         *
         * @param sampleRate PCM sample rate in Hz; must match the service output format
         * @throws LineUnavailableException if no matching audio line is available
         */
        public RealtimePcmPlayer(int sampleRate) throws LineUnavailableException {
            this.sampleRate = sampleRate;
            // 16-bit, mono, signed, little-endian PCM.
            this.audioFormat = new AudioFormat(this.sampleRate, 16, 1, true, false);
            DataLine.Info info = new DataLine.Info(SourceDataLine.class, audioFormat);
            line = (SourceDataLine) AudioSystem.getLine(info);
            line.open(audioFormat);
            line.start();
            // Decoder thread: Base64 -> raw PCM; also archives audio for saving.
            decoderThread = new Thread(() -> {
                while (!stopped.get()) {
                    String b64Audio = b64AudioBuffer.poll();
                    if (b64Audio == null) {
                        sleepQuietly(100);
                        continue;
                    }
                    byte[] rawAudio = Base64.getDecoder().decode(b64Audio);
                    rawAudioBuffer.add(rawAudio);
                    // Write audio data to totalAudioStream.
                    try {
                        totalAudioStream.write(rawAudio);
                    } catch (IOException e) {
                        throw new RuntimeException(e);
                    }
                }
            });
            // Player thread: feeds decoded PCM chunks to the audio line.
            playerThread = new Thread(() -> {
                while (!stopped.get()) {
                    byte[] rawAudio = rawAudioBuffer.poll();
                    if (rawAudio == null) {
                        sleepQuietly(100);
                        continue;
                    }
                    try {
                        playChunk(rawAudio);
                    } catch (IOException | InterruptedException e) {
                        throw new RuntimeException(e);
                    }
                }
            });
            decoderThread.start();
            playerThread.start();
        }

        // Sleeps for the given duration; restores the interrupt flag before rethrowing.
        private static void sleepQuietly(long millis) {
            try {
                Thread.sleep(millis);
            } catch (InterruptedException e) {
                Thread.currentThread().interrupt();
                throw new RuntimeException(e);
            }
        }

        // Plays an audio chunk and blocks until playback approximately completes.
        private void playChunk(byte[] chunk) throws IOException, InterruptedException {
            if (chunk == null || chunk.length == 0) return;

            int bytesWritten = 0;
            while (bytesWritten < chunk.length) {
                bytesWritten += line.write(chunk, bytesWritten, chunk.length - bytesWritten);
            }
            // Chunk duration in ms = bytes / (bytes per millisecond of 16-bit mono PCM).
            int audioLength = chunk.length / (this.sampleRate * 2 / 1000);
            // Wait for the buffered audio to finish playing. Clamp to zero: for chunks
            // shorter than 10 ms the expression goes negative and Thread.sleep would
            // throw IllegalArgumentException.
            Thread.sleep(Math.max(0, audioLength - 10));
        }

        /** Queues a Base64-encoded audio chunk for decoding and playback. */
        public void write(String b64Audio) {
            b64AudioBuffer.add(b64Audio);
        }

        /** Discards all queued, not-yet-played audio. */
        public void cancel() {
            b64AudioBuffer.clear();
            rawAudioBuffer.clear();
        }

        /** Blocks until all queued audio has been decoded and played out. */
        public void waitForComplete() throws InterruptedException {
            while (!b64AudioBuffer.isEmpty() || !rawAudioBuffer.isEmpty()) {
                Thread.sleep(100);
            }
            line.drain();
        }

        /**
         * Stops the worker threads, saves the complete audio to a file named after the
         * configured format, and closes the audio line.
         */
        public void shutdown() throws InterruptedException, IOException {
            stopped.set(true);
            decoderThread.join();
            playerThread.join();

            // Save the complete audio file.
            File file = new File("TotalAudio_" + ttsFormat.getSampleRate() + "." + ttsFormat.getFormat());
            try (FileOutputStream fos = new FileOutputStream(file)) {
                fos.write(totalAudioStream.toByteArray());
            }

            if (line != null && line.isRunning()) {
                line.drain();
                line.close();
            }
        }
    }

    public static void main(String[] args) throws InterruptedException, LineUnavailableException, IOException {
        QwenTtsRealtimeParam param = QwenTtsRealtimeParam.builder()
                // To use instruction control, replace the model with qwen3-tts-instruct-flash-realtime.
                .model("qwen3-tts-flash-realtime")
                // Singapore endpoint. For China (Beijing), use wss://dashscope.aliyuncs.com/api-ws/v1/realtime.
                .url("wss://dashscope-intl.aliyuncs.com/api-ws/v1/realtime")
                // API keys differ between Singapore and China (Beijing). See https://www.alibabacloud.com/help/zh/model-studio/get-api-key.
                .apikey(System.getenv("DASHSCOPE_API_KEY"))
                .build();
        // Released when the server reports session.finished.
        AtomicReference<CountDownLatch> completeLatch = new AtomicReference<>(new CountDownLatch(1));
        final AtomicReference<QwenTtsRealtime> qwenTtsRef = new AtomicReference<>(null);

        // Create a real-time audio player instance (24 kHz to match ttsFormat).
        RealtimePcmPlayer audioPlayer = new RealtimePcmPlayer(24000);

        QwenTtsRealtime qwenTtsRealtime = new QwenTtsRealtime(param, new QwenTtsRealtimeCallback() {
            @Override
            public void onOpen() {
                // Handle connection establishment.
            }
            @Override
            public void onEvent(JsonObject message) {
                String type = message.get("type").getAsString();
                switch (type) {
                    case "session.created":
                        // Handle session creation.
                        if (message.has("session")) {
                            String eventId = message.get("event_id").getAsString();
                            String sessionId = message.get("session").getAsJsonObject().get("id").getAsString();
                            System.out.println("[onEvent] session.created, session_id: "
                                    + sessionId + ", event_id: " + eventId);
                        }
                        break;
                    case "response.audio.delta":
                        String recvAudioB64 = message.get("delta").getAsString();
                        // Play audio in real time.
                        audioPlayer.write(recvAudioB64);
                        break;
                    case "response.done":
                        // Handle response completion.
                        break;
                    case "session.finished":
                        // Handle session termination.
                        completeLatch.get().countDown();
                        break;
                    default:
                        break;
                }
            }
            @Override
            public void onClose(int code, String reason) {
                // Handle connection closure.
            }
        });
        qwenTtsRef.set(qwenTtsRealtime);
        try {
            qwenTtsRealtime.connect();
        } catch (NoApiKeyException e) {
            throw new RuntimeException(e);
        }
        QwenTtsRealtimeConfig config = QwenTtsRealtimeConfig.builder()
                .voice("Cherry")
                .responseFormat(ttsFormat)
                .mode("server_commit")
                // To use instruction control, uncomment the following lines and replace the model with qwen3-tts-instruct-flash-realtime.
                // .instructions("")
                // .optimizeInstructions(true)
                .build();
        qwenTtsRealtime.updateSession(config);
        for (String text : textToSynthesize) {
            qwenTtsRealtime.appendText(text);
            Thread.sleep(100);
        }
        qwenTtsRealtime.finish();
        // Wait until the server reports session.finished before closing the connection.
        completeLatch.get().await();
        qwenTtsRealtime.close();

        // Wait for audio playback to complete, then shut down the player.
        audioPlayer.waitForComplete();
        audioPlayer.shutdown();
        System.exit(0);
    }
}

Commit mode

commit()

import com.alibaba.dashscope.audio.qwen_tts_realtime.*;
import com.alibaba.dashscope.exception.NoApiKeyException;
import com.google.gson.JsonObject;
import javax.sound.sampled.LineUnavailableException;
import javax.sound.sampled.SourceDataLine;
import javax.sound.sampled.AudioFormat;
import javax.sound.sampled.DataLine;
import javax.sound.sampled.AudioSystem;
import java.io.*;
import java.util.Base64;
import java.util.Queue;
import java.util.Scanner;
import java.util.concurrent.CountDownLatch;
import java.util.concurrent.atomic.AtomicReference;
import java.util.concurrent.ConcurrentLinkedQueue;
import java.util.concurrent.atomic.AtomicBoolean;

public class Main {
    // Audio format requested from the service: 24 kHz, mono, 16-bit PCM.
    public static QwenTtsRealtimeAudioFormat ttsFormat = QwenTtsRealtimeAudioFormat.PCM_24000HZ_MONO_16BIT;

    /**
     * Real-time PCM audio player.
     *
     * <p>Base64-encoded chunks are queued via {@link #write(String)}. A decoder thread
     * decodes them to raw PCM (also archiving the complete audio for saving on
     * shutdown), and a player thread feeds decoded chunks to the audio line.
     */
    public static class RealtimePcmPlayer {
        private int sampleRate;
        private SourceDataLine line;
        private AudioFormat audioFormat;
        private Thread decoderThread;
        private Thread playerThread;
        private AtomicBoolean stopped = new AtomicBoolean(false);
        // Base64 chunks waiting to be decoded.
        private Queue<String> b64AudioBuffer = new ConcurrentLinkedQueue<>();
        // Decoded PCM chunks waiting to be played.
        private Queue<byte[]> rawAudioBuffer = new ConcurrentLinkedQueue<>();
        // Accumulates every decoded chunk so the full audio can be saved on shutdown.
        private ByteArrayOutputStream totalAudioStream = new ByteArrayOutputStream();

        /**
         * Initializes the audio format and audio line, then starts the decoder and
         * player threads.
         *
         * @param sampleRate PCM sample rate in Hz; must match the service output format
         * @throws LineUnavailableException if no matching audio line is available
         */
        public RealtimePcmPlayer(int sampleRate) throws LineUnavailableException {
            this.sampleRate = sampleRate;
            // 16-bit, mono, signed, little-endian PCM.
            this.audioFormat = new AudioFormat(this.sampleRate, 16, 1, true, false);
            DataLine.Info info = new DataLine.Info(SourceDataLine.class, audioFormat);
            line = (SourceDataLine) AudioSystem.getLine(info);
            line.open(audioFormat);
            line.start();
            // Decoder thread: Base64 -> raw PCM; also archives audio for saving.
            decoderThread = new Thread(() -> {
                while (!stopped.get()) {
                    String b64Audio = b64AudioBuffer.poll();
                    if (b64Audio == null) {
                        sleepQuietly(100);
                        continue;
                    }
                    byte[] rawAudio = Base64.getDecoder().decode(b64Audio);
                    rawAudioBuffer.add(rawAudio);
                    // Write audio data to totalAudioStream.
                    try {
                        totalAudioStream.write(rawAudio);
                    } catch (IOException e) {
                        throw new RuntimeException(e);
                    }
                }
            });
            // Player thread: feeds decoded PCM chunks to the audio line.
            playerThread = new Thread(() -> {
                while (!stopped.get()) {
                    byte[] rawAudio = rawAudioBuffer.poll();
                    if (rawAudio == null) {
                        sleepQuietly(100);
                        continue;
                    }
                    try {
                        playChunk(rawAudio);
                    } catch (IOException | InterruptedException e) {
                        throw new RuntimeException(e);
                    }
                }
            });
            decoderThread.start();
            playerThread.start();
        }

        // Sleeps for the given duration; restores the interrupt flag before rethrowing.
        private static void sleepQuietly(long millis) {
            try {
                Thread.sleep(millis);
            } catch (InterruptedException e) {
                Thread.currentThread().interrupt();
                throw new RuntimeException(e);
            }
        }

        // Plays an audio chunk and blocks until playback approximately completes.
        private void playChunk(byte[] chunk) throws IOException, InterruptedException {
            if (chunk == null || chunk.length == 0) return;

            int bytesWritten = 0;
            while (bytesWritten < chunk.length) {
                bytesWritten += line.write(chunk, bytesWritten, chunk.length - bytesWritten);
            }
            // Chunk duration in ms = bytes / (bytes per millisecond of 16-bit mono PCM).
            int audioLength = chunk.length / (this.sampleRate * 2 / 1000);
            // Wait for the buffered audio to finish playing. Clamp to zero: for chunks
            // shorter than 10 ms the expression goes negative and Thread.sleep would
            // throw IllegalArgumentException.
            Thread.sleep(Math.max(0, audioLength - 10));
        }

        /** Queues a Base64-encoded audio chunk for decoding and playback. */
        public void write(String b64Audio) {
            b64AudioBuffer.add(b64Audio);
        }

        /** Discards all queued, not-yet-played audio. */
        public void cancel() {
            b64AudioBuffer.clear();
            rawAudioBuffer.clear();
        }

        /** Blocks until all queued audio has been decoded and played out. */
        public void waitForComplete() throws InterruptedException {
            // Wait for all buffered audio data to finish playing.
            while (!b64AudioBuffer.isEmpty() || !rawAudioBuffer.isEmpty()) {
                Thread.sleep(100);
            }
            // Wait for the audio line to drain.
            line.drain();
        }

        /**
         * Stops the worker threads, saves the complete audio to a file named after the
         * configured format, and closes the audio line.
         */
        public void shutdown() throws InterruptedException {
            stopped.set(true);
            decoderThread.join();
            playerThread.join();
            // Save the complete audio file. FileNotFoundException is a subclass of
            // IOException, so a single catch clause covers both.
            File file = new File("TotalAudio_" + ttsFormat.getSampleRate() + "." + ttsFormat.getFormat());
            try (FileOutputStream fos = new FileOutputStream(file)) {
                fos.write(totalAudioStream.toByteArray());
            } catch (IOException e) {
                throw new RuntimeException(e);
            }
            if (line != null && line.isRunning()) {
                line.drain();
                line.close();
            }
        }
    }

    public static void main(String[] args) throws InterruptedException, LineUnavailableException, FileNotFoundException {
        Scanner scanner = new Scanner(System.in);

        QwenTtsRealtimeParam param = QwenTtsRealtimeParam.builder()
                // To use instruction control, replace the model with qwen3-tts-instruct-flash-realtime.
                .model("qwen3-tts-flash-realtime")
                // Singapore endpoint. For China (Beijing), use wss://dashscope.aliyuncs.com/api-ws/v1/realtime.
                .url("wss://dashscope-intl.aliyuncs.com/api-ws/v1/realtime")
                // API keys differ between Singapore and China (Beijing). See https://www.alibabacloud.com/help/zh/model-studio/get-api-key.
                .apikey(System.getenv("DASHSCOPE_API_KEY"))
                .build();

        // Released once per response (commit mode) and again on session.finished.
        AtomicReference<CountDownLatch> completeLatch = new AtomicReference<>(new CountDownLatch(1));

        // Create a real-time player instance (24 kHz to match ttsFormat).
        RealtimePcmPlayer audioPlayer = new RealtimePcmPlayer(24000);

        final AtomicReference<QwenTtsRealtime> qwenTtsRef = new AtomicReference<>(null);
        QwenTtsRealtime qwenTtsRealtime = new QwenTtsRealtime(param, new QwenTtsRealtimeCallback() {
            @Override
            public void onOpen() {
                System.out.println("connection opened");
                System.out.println("Enter text and press Enter to send. Enter 'quit' to exit the program.");
            }
            @Override
            public void onEvent(JsonObject message) {
                String type = message.get("type").getAsString();
                switch (type) {
                    case "session.created":
                        System.out.println("start session: " + message.get("session").getAsJsonObject().get("id").getAsString());
                        break;
                    case "response.audio.delta":
                        String recvAudioB64 = message.get("delta").getAsString();
                        // Play audio in real time. The player decodes the Base64 chunk itself.
                        audioPlayer.write(recvAudioB64);
                        break;
                    case "response.done":
                        System.out.println("response done");
                        // Wait for audio playback to complete.
                        try {
                            audioPlayer.waitForComplete();
                        } catch (InterruptedException e) {
                            throw new RuntimeException(e);
                        }
                        // Prepare for the next input.
                        completeLatch.get().countDown();
                        break;
                    case "session.finished":
                        System.out.println("session finished");
                        if (qwenTtsRef.get() != null) {
                            System.out.println("[Metric] response: " + qwenTtsRef.get().getResponseId() +
                                    ", first audio delay: " + qwenTtsRef.get().getFirstAudioDelay() + " ms");
                        }
                        completeLatch.get().countDown();
                        break;
                    default:
                        break;
                }
            }
            @Override
            public void onClose(int code, String reason) {
                System.out.println("connection closed code: " + code + ", reason: " + reason);
                try {
                    // Wait for playback to complete, then shut down the player.
                    audioPlayer.waitForComplete();
                    audioPlayer.shutdown();
                } catch (InterruptedException e) {
                    throw new RuntimeException(e);
                }
            }
        });
        qwenTtsRef.set(qwenTtsRealtime);
        try {
            qwenTtsRealtime.connect();
        } catch (NoApiKeyException e) {
            throw new RuntimeException(e);
        }
        QwenTtsRealtimeConfig config = QwenTtsRealtimeConfig.builder()
                .voice("Cherry")
                .responseFormat(ttsFormat)
                .mode("commit")
                // To use instruction control, uncomment the following lines and replace the model with qwen3-tts-instruct-flash-realtime.
                // .instructions("")
                // .optimizeInstructions(true)
                .build();
        qwenTtsRealtime.updateSession(config);

        // Read user input in a loop.
        while (true) {
            System.out.print("Enter the text to synthesize: ");
            String text = scanner.nextLine();

            // Exit when the user enters 'quit'.
            if ("quit".equalsIgnoreCase(text.trim())) {
                System.out.println("Closing the connection...");
                qwenTtsRealtime.finish();
                completeLatch.get().await();
                break;
            }

            // Skip empty input.
            if (text.trim().isEmpty()) {
                continue;
            }

            // Re-initialize the countdown latch for this synthesis round.
            completeLatch.set(new CountDownLatch(1));

            // Send the text; in commit mode the client must trigger synthesis explicitly.
            qwenTtsRealtime.appendText(text);
            qwenTtsRealtime.commit();

            // Wait for the current synthesis to complete.
            completeLatch.get().await();
        }

        // Clean up resources.
        audioPlayer.waitForComplete();
        audioPlayer.shutdown();
        scanner.close();
        System.exit(0);
    }
}

For more samples, see the GitHub repository.

Request parameters

Configure the following request parameters with the chaining methods or setters of a QwenTtsRealtimeParam object, then pass the object to the QwenTtsRealtime constructor.

Parameter

Type

Required

Description

model

String

Yes

Model name (see Supported models).

url

String

Yes

Chinese Mainland: wss://dashscope.aliyuncs.com/api-ws/v1/realtime

International: wss://dashscope-intl.aliyuncs.com/api-ws/v1/realtime

Configure the following request parameters with the chaining methods or setters of a QwenTtsRealtimeConfig object, then pass the object to the updateSession method.

Parameter

Type

Required

Description

voice

String

Yes

The voice for speech synthesis. For more information, see Supported voices.

System voices and custom voices are supported:

  • System voices: Available only for the Qwen3-TTS-Instruct-Flash-Realtime, Qwen3-TTS-Flash-Realtime, and Qwen-TTS-Realtime model series. For voice samples, see Supported voices.

  • Custom voices

    • Voices customized by the Voice Cloning (Qwen) feature: Available only for the Qwen3-TTS-VC-Realtime model series.

    • Voices customized by the Voice Design (Qwen) feature: Available only for the Qwen3-TTS-VD-Realtime model series.

languageType

String

No

Specifies the language of the synthesized audio. The default value is Auto.

  • Auto: Use this value when the language of the text is uncertain or contains multiple languages. The model automatically matches the pronunciation for different language segments in the text, but cannot guarantee perfect accuracy.

  • Specific language: Use this for single-language text. Specifying a language significantly improves synthesis quality and typically yields better results than Auto. Valid values include the following:

    • Chinese

    • English

    • German

    • Italian

    • Portuguese

    • Spanish

    • Japanese

    • Korean

    • French

    • Russian

mode

String

No

The interaction mode. Valid values:

  • server_commit (default): The server automatically determines when to synthesize speech, balancing latency and quality. This mode is recommended for most scenarios.

  • commit: The client manually triggers synthesis. This mode has the lowest latency, but you must manage sentence integrity yourself.

format

String

No

The format of the audio output from the model.

Supported formats:

  • pcm (default)

  • wav

  • mp3

  • opus

Qwen-TTS-Realtime (see Supported models) supports only pcm.

sampleRate

int

No

The sample rate of the audio output from the model, in Hz.

Supported sample rates:

  • 8000

  • 16000

  • 24000 (default)

  • 48000

Qwen-TTS-Realtime (see Supported models) supports only 24000.

speechRate

float

No

The speech rate of the audio. A value of 1.0 is the normal speed. A value less than 1.0 is slower, and a value greater than 1.0 is faster.

Default value: 1.0.

Valid range: [0.5, 2.0].

Qwen-TTS-Realtime (see Supported models) does not support this parameter.

volume

int

No

The volume of the audio.

Default value: 50.

Valid range: [0, 100].

Qwen-TTS-Realtime (see Supported models) does not support this parameter.

pitchRate

float

No

The pitch of the synthesized audio.

Default value: 1.0.

Valid range: [0.5, 2.0].

Qwen-TTS-Realtime (see Supported models) does not support this parameter.

bitRate

int

No

Specifies the bitrate of the audio in kbps. A higher bitrate results in better audio quality and a larger file size. This parameter is available only when the audio format (response_format) is set to opus.

Default value: 128.

Valid range: [6, 510].

Qwen-TTS-Realtime (see Supported models) does not support this parameter.

instructions

String

No

Sets the instructions. For more information, see Real-time speech synthesis - Qwen.

Default value: None. The parameter is not active if not set.

Length limit: The length cannot exceed 1600 tokens.

Supported languages: Chinese and English only.

Scope: This feature is available only for the Qwen3-TTS-Instruct-Flash-Realtime model series.

optimizeInstructions

boolean

No

Specifies whether to optimize the instructions to improve the naturalness and expressiveness of the speech synthesis.

Default value: false.

Behavior: When set to true, the system enhances and rewrites the content of instructions to generate internal instructions that are better suited for speech synthesis.

Scenarios: Recommended for scenarios that require high-quality, fine-grained voice expression.

Dependency: This parameter depends on the instructions parameter being set. If instructions is empty, this parameter has no effect.

Scope: This feature is available only for the Qwen3-TTS-Instruct-Flash-Realtime model series.

Key interfaces

QwenTtsRealtime class

To import:

import com.alibaba.dashscope.audio.qwen_tts_realtime.QwenTtsRealtime;

Method

Signature

Server events

Description

connect

public void connect() throws NoApiKeyException, InterruptedException

session.created

Session created

session.updated

Session configuration updated

Opens a WebSocket connection to the server.

updateSession

public void updateSession(QwenTtsRealtimeConfig config)

session.updated

Session configuration updated

Updates the session configuration. See Request parameters.

After connecting, the server returns default input and output configurations for the session. Call this method immediately after connect() to override defaults.

The server validates parameters upon receiving a session.update event. If any parameter is invalid, the server returns an error. Otherwise, it updates the session configuration on the server side.

appendText

public void appendText(String text)

None

Appends a text segment to the server-side input buffer. The buffer stores text until you submit it.

  • In server_commit mode, the server decides when to commit and synthesize buffered text.

  • In commit mode, the client must trigger synthesis by calling commit.

clearAppendedText

public void clearAppendedText() 

input_text_buffer.cleared

Clears text received by the server

Clears all text in the server-side input buffer.

commit

public void commit()

input_text_buffer.committed

Commits text and triggers speech synthesis

response.output_item.added

New output content appears in the response

response.content_part.added

New output content added to the assistant message item

response.audio.delta

Model generates audio incrementally

response.audio.done

Audio generation completed

response.content_part.done

Streaming of audio content for the assistant message is complete

response.output_item.done

Streaming of the entire output item for the assistant message is complete

response.done

Response completed

Commits text previously appended to the server-side buffer and synthesizes all text immediately. Returns an error if the buffer is empty.

  • In server_commit mode, the client does not need to call this method. The server commits automatically.

  • In commit mode, the client must call commit to trigger synthesis.

finish

public void finish()

session.finished

Session finished

Stops the current task.

close

public void close()

None

Closes the connection.

getSessionId

public String getSessionId()

None

Returns the session ID of the current task.

getResponseId

public String getResponseId() 

None

Returns the response ID of the most recent response.

getFirstAudioDelay

public long getFirstAudioDelay()

None

Returns the latency of the first audio packet in milliseconds.

Callback interface (QwenTtsRealtimeCallback)

Method

Parameters

Return value

Description

public void onOpen()

None

None

Called immediately after the WebSocket connection is established.

public abstract void onEvent(JsonObject message)

message: server-side response event.

None

Called when the server sends an event, including API call responses and model-generated audio. See Server-side events.

public abstract void onClose(int code, String reason)

code: WebSocket close status code.

reason: Close reason.

None

Called after the server closes the connection.