All Products
Search
Document Center

Alibaba Cloud Model Studio: Java SDK

Last Updated:Feb 06, 2026

This topic describes the key interfaces and request parameters for the Qwen real-time speech synthesis DashScope Java SDK.

User guide: For a model overview and selection suggestions, see Real-time speech synthesis - Qwen.

Prerequisites

Use DashScope Java SDK version 2.22.7 or later.

Getting started

Server commit mode

import com.alibaba.dashscope.audio.qwen_tts_realtime.*;
import com.alibaba.dashscope.exception.NoApiKeyException;
import com.google.gson.JsonObject;
import javax.sound.sampled.LineUnavailableException;
import javax.sound.sampled.SourceDataLine;
import javax.sound.sampled.AudioFormat;
import javax.sound.sampled.DataLine;
import javax.sound.sampled.AudioSystem;
import java.io.FileNotFoundException;
import java.io.IOException;
import java.util.Base64;
import java.util.Queue;
import java.util.concurrent.CountDownLatch;
import java.util.concurrent.atomic.AtomicReference;
import java.util.concurrent.ConcurrentLinkedQueue;
import java.util.concurrent.atomic.AtomicBoolean;

public class Main {
    static String[] textToSynthesize = {
            "Right? I just really love this kind of supermarket",
            "Especially during the New Year",
            "Going to the supermarket",
            "Makes me feel",
            "Super, super happy!",
            "I want to buy so many things!"
    };

    /**
     * Real-time PCM audio player.
     *
     * <p>One worker thread decodes Base64 audio chunks into raw PCM bytes;
     * a second worker thread writes the raw PCM to the audio output line.
     * Chunks are handed between threads via concurrent queues.
     */
    public static class RealtimePcmPlayer {
        private final int sampleRate;
        private SourceDataLine line;
        private AudioFormat audioFormat;
        private Thread decoderThread;
        private Thread playerThread;
        private final AtomicBoolean stopped = new AtomicBoolean(false);
        private final Queue<String> b64AudioBuffer = new ConcurrentLinkedQueue<>();
        private final Queue<byte[]> rawAudioBuffer = new ConcurrentLinkedQueue<>();

        /**
         * Opens a 16-bit mono PCM output line at the given sample rate and
         * starts the decoder and player worker threads.
         *
         * @param sampleRate sample rate in Hz (e.g. 24000)
         * @throws LineUnavailableException if no matching audio line is available
         */
        public RealtimePcmPlayer(int sampleRate) throws LineUnavailableException {
            this.sampleRate = sampleRate;
            this.audioFormat = new AudioFormat(this.sampleRate, 16, 1, true, false);
            DataLine.Info info = new DataLine.Info(SourceDataLine.class, audioFormat);
            line = (SourceDataLine) AudioSystem.getLine(info);
            line.open(audioFormat);
            line.start();
            // Decoder thread: Base64 string -> raw PCM bytes.
            decoderThread = new Thread(() -> {
                while (!stopped.get()) {
                    String b64Audio = b64AudioBuffer.poll();
                    if (b64Audio != null) {
                        rawAudioBuffer.add(Base64.getDecoder().decode(b64Audio));
                    } else {
                        try {
                            Thread.sleep(100);
                        } catch (InterruptedException e) {
                            // Restore the interrupt flag and exit cleanly.
                            Thread.currentThread().interrupt();
                            return;
                        }
                    }
                }
            });
            // Player thread: raw PCM bytes -> audio output line.
            playerThread = new Thread(() -> {
                while (!stopped.get()) {
                    byte[] rawAudio = rawAudioBuffer.poll();
                    if (rawAudio != null) {
                        try {
                            playChunk(rawAudio);
                        } catch (IOException e) {
                            throw new RuntimeException(e);
                        } catch (InterruptedException e) {
                            Thread.currentThread().interrupt();
                            return;
                        }
                    } else {
                        try {
                            Thread.sleep(100);
                        } catch (InterruptedException e) {
                            Thread.currentThread().interrupt();
                            return;
                        }
                    }
                }
            });
            decoderThread.start();
            playerThread.start();
        }

        /**
         * Writes one chunk to the audio line and blocks roughly for the
         * chunk's playback duration so the player keeps pace with real time.
         */
        private void playChunk(byte[] chunk) throws IOException, InterruptedException {
            if (chunk == null || chunk.length == 0) return;

            int bytesWritten = 0;
            while (bytesWritten < chunk.length) {
                bytesWritten += line.write(chunk, bytesWritten, chunk.length - bytesWritten);
            }
            // Duration in ms for 16-bit mono PCM: bytes / (bytes per millisecond).
            int audioLength = chunk.length / (this.sampleRate * 2 / 1000);
            // Sleep slightly less than the chunk duration. Clamp at 0: for chunks
            // shorter than 10 ms, a negative argument would make Thread.sleep
            // throw IllegalArgumentException.
            Thread.sleep(Math.max(0, audioLength - 10));
        }

        /** Queues a Base64-encoded PCM chunk for decoding and playback. */
        public void write(String b64Audio) {
            b64AudioBuffer.add(b64Audio);
        }

        /** Discards all buffered audio that has not been played yet. */
        public void cancel() {
            b64AudioBuffer.clear();
            rawAudioBuffer.clear();
        }

        /** Blocks until both buffers are empty and the line has drained. */
        public void waitForComplete() throws InterruptedException {
            while (!b64AudioBuffer.isEmpty() || !rawAudioBuffer.isEmpty()) {
                Thread.sleep(100);
            }
            line.drain();
        }

        /** Stops both worker threads and releases the audio line. */
        public void shutdown() throws InterruptedException {
            stopped.set(true);
            // Interrupt the workers so they do not finish out a sleep(100).
            decoderThread.interrupt();
            playerThread.interrupt();
            decoderThread.join();
            playerThread.join();
            if (line != null && line.isRunning()) {
                line.drain();
                line.close();
            }
        }
    }

    public static void main(String[] args) throws InterruptedException, LineUnavailableException, FileNotFoundException {
        QwenTtsRealtimeParam param = QwenTtsRealtimeParam.builder()
                // To use the instruction control feature, replace the model with qwen3-tts-instruct-flash-realtime.
                .model("qwen3-tts-flash-realtime")
                // The following URL is for the Singapore region. If you use a model in the China (Beijing) region, replace the URL with wss://dashscope.aliyuncs.com/api-ws/v1/realtime.
                .url("wss://dashscope-intl.aliyuncs.com/api-ws/v1/realtime")
                // The API keys for the Singapore and China (Beijing) regions are different. To obtain an API key, visit https://www.alibabacloud.com/help/en/model-studio/get-api-key.
                .apikey(System.getenv("DASHSCOPE_API_KEY"))
                .build();
        AtomicReference<CountDownLatch> completeLatch = new AtomicReference<>(new CountDownLatch(1));
        final AtomicReference<QwenTtsRealtime> qwenTtsRef = new AtomicReference<>(null);

        // Creates a real-time audio player instance (24 kHz matches the PCM format requested below).
        RealtimePcmPlayer audioPlayer = new RealtimePcmPlayer(24000);

        QwenTtsRealtime qwenTtsRealtime = new QwenTtsRealtime(param, new QwenTtsRealtimeCallback() {
            @Override
            public void onOpen() {
                // Handles the event when the connection is established.
            }
            @Override
            public void onEvent(JsonObject message) {
                String type = message.get("type").getAsString();
                switch (type) {
                    case "session.created":
                        // Handles the event when the session is created.
                        break;
                    case "response.audio.delta":
                        String recvAudioB64 = message.get("delta").getAsString();
                        // Plays the audio in real time.
                        audioPlayer.write(recvAudioB64);
                        break;
                    case "response.done":
                        // Handles the event when the response is complete.
                        break;
                    case "session.finished":
                        // Handles the event when the session is finished.
                        completeLatch.get().countDown();
                        break;
                    default:
                        break;
                }
            }
            @Override
            public void onClose(int code, String reason) {
                // Handles the event when the connection is closed.
            }
        });
        qwenTtsRef.set(qwenTtsRealtime);
        try {
            qwenTtsRealtime.connect();
        } catch (NoApiKeyException e) {
            throw new RuntimeException(e);
        }
        QwenTtsRealtimeConfig config = QwenTtsRealtimeConfig.builder()
                .voice("Cherry")
                .responseFormat(QwenTtsRealtimeAudioFormat.PCM_24000HZ_MONO_16BIT)
                .mode("server_commit")
                // To use the instruction control feature, uncomment the following lines and replace the model with qwen3-tts-instruct-flash-realtime.
                // .instructions("")
                // .optimizeInstructions(true)
                .build();
        qwenTtsRealtime.updateSession(config);
        for (String text : textToSynthesize) {
            qwenTtsRealtime.appendText(text);
            Thread.sleep(100);
        }
        qwenTtsRealtime.finish();
        completeLatch.get().await();
        qwenTtsRealtime.close();

        // Waits for audio playback to complete and then shuts down the player.
        audioPlayer.waitForComplete();
        audioPlayer.shutdown();
        System.exit(0);
    }
}

Commit mode

import com.alibaba.dashscope.audio.qwen_tts_realtime.*;
import com.alibaba.dashscope.exception.NoApiKeyException;
import com.google.gson.JsonObject;
import javax.sound.sampled.LineUnavailableException;
import javax.sound.sampled.SourceDataLine;
import javax.sound.sampled.AudioFormat;
import javax.sound.sampled.DataLine;
import javax.sound.sampled.AudioSystem;
import java.io.File;
import java.io.FileNotFoundException;
import java.io.FileOutputStream;
import java.io.IOException;
import java.util.Base64;
import java.util.Queue;
import java.util.Scanner;
import java.util.concurrent.CountDownLatch;
import java.util.concurrent.atomic.AtomicReference;
import java.util.concurrent.ConcurrentLinkedQueue;
import java.util.concurrent.atomic.AtomicBoolean;

public class commit {
    /**
     * Real-time PCM audio player.
     *
     * <p>One worker thread decodes Base64 audio chunks into raw PCM bytes;
     * a second worker thread writes the raw PCM to the audio output line.
     * Chunks are handed between threads via concurrent queues.
     */
    public static class RealtimePcmPlayer {
        private final int sampleRate;
        private SourceDataLine line;
        private AudioFormat audioFormat;
        private Thread decoderThread;
        private Thread playerThread;
        private final AtomicBoolean stopped = new AtomicBoolean(false);
        private final Queue<String> b64AudioBuffer = new ConcurrentLinkedQueue<>();
        private final Queue<byte[]> rawAudioBuffer = new ConcurrentLinkedQueue<>();

        /**
         * Opens a 16-bit mono PCM output line at the given sample rate and
         * starts the decoder and player worker threads.
         *
         * @param sampleRate sample rate in Hz (e.g. 24000)
         * @throws LineUnavailableException if no matching audio line is available
         */
        public RealtimePcmPlayer(int sampleRate) throws LineUnavailableException {
            this.sampleRate = sampleRate;
            this.audioFormat = new AudioFormat(this.sampleRate, 16, 1, true, false);
            DataLine.Info info = new DataLine.Info(SourceDataLine.class, audioFormat);
            line = (SourceDataLine) AudioSystem.getLine(info);
            line.open(audioFormat);
            line.start();
            // Decoder thread: Base64 string -> raw PCM bytes.
            decoderThread = new Thread(() -> {
                while (!stopped.get()) {
                    String b64Audio = b64AudioBuffer.poll();
                    if (b64Audio != null) {
                        rawAudioBuffer.add(Base64.getDecoder().decode(b64Audio));
                    } else {
                        try {
                            Thread.sleep(100);
                        } catch (InterruptedException e) {
                            // Restore the interrupt flag and exit cleanly.
                            Thread.currentThread().interrupt();
                            return;
                        }
                    }
                }
            });
            // Player thread: raw PCM bytes -> audio output line.
            playerThread = new Thread(() -> {
                while (!stopped.get()) {
                    byte[] rawAudio = rawAudioBuffer.poll();
                    if (rawAudio != null) {
                        try {
                            playChunk(rawAudio);
                        } catch (IOException e) {
                            throw new RuntimeException(e);
                        } catch (InterruptedException e) {
                            Thread.currentThread().interrupt();
                            return;
                        }
                    } else {
                        try {
                            Thread.sleep(100);
                        } catch (InterruptedException e) {
                            Thread.currentThread().interrupt();
                            return;
                        }
                    }
                }
            });
            decoderThread.start();
            playerThread.start();
        }

        /**
         * Writes one chunk to the audio line and blocks roughly for the
         * chunk's playback duration so the player keeps pace with real time.
         */
        private void playChunk(byte[] chunk) throws IOException, InterruptedException {
            if (chunk == null || chunk.length == 0) return;

            int bytesWritten = 0;
            while (bytesWritten < chunk.length) {
                bytesWritten += line.write(chunk, bytesWritten, chunk.length - bytesWritten);
            }
            // Duration in ms for 16-bit mono PCM: bytes / (bytes per millisecond).
            int audioLength = chunk.length / (this.sampleRate * 2 / 1000);
            // Sleep slightly less than the chunk duration. Clamp at 0: for chunks
            // shorter than 10 ms, a negative argument would make Thread.sleep
            // throw IllegalArgumentException.
            Thread.sleep(Math.max(0, audioLength - 10));
        }

        /** Queues a Base64-encoded PCM chunk for decoding and playback. */
        public void write(String b64Audio) {
            b64AudioBuffer.add(b64Audio);
        }

        /** Discards all buffered audio that has not been played yet. */
        public void cancel() {
            b64AudioBuffer.clear();
            rawAudioBuffer.clear();
        }

        /** Blocks until both buffers are empty and the line has drained. */
        public void waitForComplete() throws InterruptedException {
            // Waits for all audio data in the buffers to finish playing.
            while (!b64AudioBuffer.isEmpty() || !rawAudioBuffer.isEmpty()) {
                Thread.sleep(100);
            }
            // Waits for the audio line to finish playing.
            line.drain();
        }

        /** Stops both worker threads and releases the audio line. */
        public void shutdown() throws InterruptedException {
            stopped.set(true);
            // Interrupt the workers so they do not finish out a sleep(100).
            decoderThread.interrupt();
            playerThread.interrupt();
            decoderThread.join();
            playerThread.join();
            if (line != null && line.isRunning()) {
                line.drain();
                line.close();
            }
        }
    }

    public static void main(String[] args) throws InterruptedException, LineUnavailableException, FileNotFoundException {
        Scanner scanner = new Scanner(System.in);

        QwenTtsRealtimeParam param = QwenTtsRealtimeParam.builder()
                // To use the instruction control feature, replace the model with qwen3-tts-instruct-flash-realtime.
                .model("qwen3-tts-flash-realtime")
                // The following URL is for the Singapore region. If you use a model in the China (Beijing) region, replace the URL with wss://dashscope.aliyuncs.com/api-ws/v1/realtime.
                .url("wss://dashscope-intl.aliyuncs.com/api-ws/v1/realtime")
                // The API keys for the Singapore and China (Beijing) regions are different. To obtain an API key, visit https://www.alibabacloud.com/help/en/model-studio/get-api-key.
                .apikey(System.getenv("DASHSCOPE_API_KEY"))
                .build();

        AtomicReference<CountDownLatch> completeLatch = new AtomicReference<>(new CountDownLatch(1));

        // Creates a real-time player instance (24 kHz matches the PCM format requested below).
        RealtimePcmPlayer audioPlayer = new RealtimePcmPlayer(24000);

        final AtomicReference<QwenTtsRealtime> qwenTtsRef = new AtomicReference<>(null);
        QwenTtsRealtime qwenTtsRealtime = new QwenTtsRealtime(param, new QwenTtsRealtimeCallback() {
            @Override
            public void onOpen() {
                System.out.println("connection opened");
                System.out.println("Enter text and press Enter to send. Enter 'quit' to exit the program.");
            }
            @Override
            public void onEvent(JsonObject message) {
                String type = message.get("type").getAsString();
                switch (type) {
                    case "session.created":
                        System.out.println("start session: " + message.get("session").getAsJsonObject().get("id").getAsString());
                        break;
                    case "response.audio.delta":
                        String recvAudioB64 = message.get("delta").getAsString();
                        // Plays the audio in real time. (The player decodes the
                        // Base64 payload itself; no need to decode it here.)
                        audioPlayer.write(recvAudioB64);
                        break;
                    case "response.done":
                        System.out.println("response done");
                        // Waits for the audio playback to complete. Note: this
                        // blocks the SDK callback thread until playback finishes.
                        try {
                            audioPlayer.waitForComplete();
                        } catch (InterruptedException e) {
                            throw new RuntimeException(e);
                        }
                        // Prepares for the next input.
                        completeLatch.get().countDown();
                        break;
                    case "session.finished":
                        System.out.println("session finished");
                        if (qwenTtsRef.get() != null) {
                            System.out.println("[Metric] response: " + qwenTtsRef.get().getResponseId() +
                                    ", first audio delay: " + qwenTtsRef.get().getFirstAudioDelay() + " ms");
                        }
                        completeLatch.get().countDown();
                        break;
                    default:
                        break;
                }
            }
            @Override
            public void onClose(int code, String reason) {
                System.out.println("connection closed code: " + code + ", reason: " + reason);
                try {
                    // Waits for playback to complete and then shuts down the player.
                    audioPlayer.waitForComplete();
                    audioPlayer.shutdown();
                } catch (InterruptedException e) {
                    throw new RuntimeException(e);
                }
            }
        });
        qwenTtsRef.set(qwenTtsRealtime);
        try {
            qwenTtsRealtime.connect();
        } catch (NoApiKeyException e) {
            throw new RuntimeException(e);
        }
        QwenTtsRealtimeConfig config = QwenTtsRealtimeConfig.builder()
                .voice("Cherry")
                .responseFormat(QwenTtsRealtimeAudioFormat.PCM_24000HZ_MONO_16BIT)
                .mode("commit")
                // To use the instruction control feature, uncomment the following lines and replace the model with qwen3-tts-instruct-flash-realtime.
                // .instructions("")
                // .optimizeInstructions(true)
                .build();
        qwenTtsRealtime.updateSession(config);

        // Reads user input in a loop.
        while (true) {
            System.out.print("Enter the text to synthesize: ");
            String text = scanner.nextLine();

            // If the user enters 'quit', exit the program.
            if ("quit".equalsIgnoreCase(text.trim())) {
                System.out.println("Closing the connection...");
                // Use a fresh latch: the previous one was already counted down
                // by response.done, so awaiting it would return immediately
                // instead of waiting for the session.finished event.
                completeLatch.set(new CountDownLatch(1));
                qwenTtsRealtime.finish();
                completeLatch.get().await();
                break;
            }

            // If the user input is empty, skip.
            if (text.trim().isEmpty()) {
                continue;
            }

            // Re-initializes the countdown latch.
            completeLatch.set(new CountDownLatch(1));

            // Sends the text and manually triggers synthesis (commit mode).
            qwenTtsRealtime.appendText(text);
            qwenTtsRealtime.commit();

            // Waits for the current synthesis to complete.
            completeLatch.get().await();
        }

        // Cleans up resources.
        audioPlayer.waitForComplete();
        audioPlayer.shutdown();
        scanner.close();
        System.exit(0);
    }
}

You can download additional sample code from GitHub.

Request parameters

Set the following request parameters using the chained methods or setters of the QwenTtsRealtimeParam object. Then, pass the object to the QwenTtsRealtime constructor.

Parameter

Type

Required

Description

model

String

Yes

Model name. See Supported models.

url

String

Yes

Mainland China: wss://dashscope.aliyuncs.com/api-ws/v1/realtime

International: wss://dashscope-intl.aliyuncs.com/api-ws/v1/realtime

Set the request parameters using the chained methods or setters of the QwenTtsRealtimeConfig object. Then, pass the object as a parameter to the updateSession method.

Parameter

Type

Required

Description

voice

String

Yes

The voice used for speech synthesis. For more information, see Supported voices.

System voices and custom voices are supported:

  • System voices: Available only for the Qwen3-TTS-Instruct-Flash-Realtime, Qwen3-TTS-Flash-Realtime, and Qwen-TTS-Realtime model series. For voice samples, see Supported voices.

  • Custom voices

    • Voices customized using Voice cloning (Qwen): Available only for the Qwen3-TTS-VC-Realtime series.

    • Voices customized using Voice design (Qwen): Available only for the Qwen3-TTS-VD-Realtime series.

languageType

String

No

The language of the synthesized audio. The default value is Auto.

  • Auto: Use this value when the language of the text is uncertain or the text contains multiple languages. The model automatically matches the pronunciation for different language segments in the text, but cannot guarantee perfect accuracy.

  • Specific language: Use this for single-language text. Specifying a language significantly improves synthesis quality and usually performs better than Auto. Valid values include the following:

    • Chinese

    • English

    • German

    • Italian

    • Portuguese

    • Spanish

    • Japanese

    • Korean

    • French

    • Russian

mode

String

No

The interaction pattern. Valid values:

  • server_commit (default): The server automatically determines when to synthesize, balancing latency and quality. This pattern is recommended for most scenarios.

  • commit: The client manually triggers synthesis. This pattern provides the lowest latency but requires you to manage sentence integrity.

format

String

No

The format of the audio output from the model.

Supported formats:

  • pcm (default)

  • wav

  • mp3

  • opus

Qwen-TTS-Realtime (see Supported models) supports only pcm.

sampleRate

int

No

The sample rate (in Hz) of the audio output from the model.

Supported sample rates:

  • 8000

  • 16000

  • 24000 (default)

  • 48000

Qwen-TTS-Realtime (see Supported models) supports only 24000.

speechRate

float

No

The speech rate of the audio. A value of 1.0 is the normal speed. A value less than 1.0 is slower, and a value greater than 1.0 is faster.

Default value: 1.0.

Valid range: [0.5, 2.0].

Qwen-TTS-Realtime (see Supported models) does not support this parameter.

volume

int

No

The volume of the audio.

Default value: 50.

Valid range: [0, 100].

Qwen-TTS-Realtime (see Supported models) does not support this parameter.

pitchRate

float

No

The pitch of the synthesized audio.

Default value: 1.0.

Valid range: [0.5, 2.0].

Qwen-TTS-Realtime (see Supported models) does not support this parameter.

bitRate

int

No

The bitrate (in kbps) of the audio. A higher bitrate results in better audio quality and a larger file size. This parameter is available only when the audio format (response_format) is set to opus.

Default value: 128.

Valid range: [6, 510].

Qwen-TTS-Realtime (see Supported models) does not support this parameter.

instructions

String

No

Sets the instructions that control the synthesized speech. For more information, see Real-time speech synthesis - Qwen.

Default value: None. The parameter has no effect if not set.

Length limit: The length cannot exceed 1600 tokens.

Supported languages: Only Chinese and English are supported.

Scope: This feature is available only for the Qwen3-TTS-Instruct-Flash-Realtime model series.

optimizeInstructions

boolean

No

Specifies whether to optimize the instructions to improve the naturalness and expressiveness of the speech synthesis.

Default value: false.

Behavior: When set to true, the system enhances the semantics and rewrites the content of instructions to generate internal instructions that are better suited for speech synthesis.

Scenarios: Enable this feature in scenarios that require high-quality, fine-grained vocal expression.

Dependency: This parameter depends on the instructions parameter being set. If instructions is empty, this parameter has no effect.

Scope: This feature is available only for the Qwen3-TTS-Instruct-Flash-Realtime model series.

Key interfaces

QwenTtsRealtime class

Import method:

import com.alibaba.dashscope.audio.qwen_tts_realtime.QwenTtsRealtime;

Member method

Method signature

Server-side response events (delivered through callbacks)

Description

connect

public void connect() throws NoApiKeyException, InterruptedException

session.created

The session is created.

session.updated

The session configuration is updated.

Creates a connection with the server.

updateSession

public void updateSession(QwenTtsRealtimeConfig config)

session.updated

The session configuration is updated.

Updates the default configuration for the current session interaction. For parameter settings, see the "Request parameters" section.

After you establish a connection, the server promptly returns the default input and output configurations for the session. To update the default session configuration, call this interface immediately after establishing the connection.

After the server receives the session.update event, it performs parameter verification. If the parameters are invalid, an error is returned. Otherwise, the server-side session configuration is updated.

appendText

public void appendText(String text)

None

Appends a text segment to the cloud-side input text buffer. The buffer is temporary storage where you can write text and commit it later.

  • In "server_commit" mode, the server decides when to commit and synthesize the text in the text buffer.

  • In "commit" mode, the client must actively trigger speech synthesis using the commit method.

clearAppendedText

public void clearAppendedText() 

input_text_buffer.cleared

The text received by the server is cleared.

Deletes the text in the current cloud-side buffer.

commit

public void commit()

input_text_buffer.committed

The text is committed and speech synthesis is triggered.

response.output_item.added

New output content is available in the response.

response.content_part.added

New output content is added to the assistant message item.

response.audio.delta

Incrementally generated audio from the model.

response.audio.done

Audio generation is complete.

response.content_part.done

The streaming output of the audio content for the assistant message is complete.

response.output_item.done

The streaming of the entire output item for the assistant message is complete.

response.done

The response is complete.

Commits the text previously appended to the cloud-side buffer and immediately synthesizes all the text. An error occurs if the input text buffer is empty.

  • In "server_commit" mode, the client does not need to send this event. The server automatically commits the text buffer.

  • In "commit" mode, the client must trigger speech synthesis using the commit method.

finish

public void finish()

session.finished

The session is finished.

Stops the task.

close

public void close()

None

Closes the connection.

getSessionId

public String getSessionId()

None

Gets the session_id of the current task.

getResponseId

public String getResponseId() 

None

Gets the response_id of the most recent response.

getFirstAudioDelay

public long getFirstAudioDelay()

None

Gets the first audio packet latency.

Callback interface (QwenTtsRealtimeCallback)

Method

Parameters

Return value

Description

public void onOpen()

None

None

This method is immediately called back after a connection is established with the server.

public abstract void onEvent(JsonObject message)

message: The server-side response event.

None

Includes response replies to interface calls and the text and audio generated by the model. For more information, see Server-side events.

public abstract void onClose(int code, String reason)

code: The status code for closing the WebSocket.

reason: The closing information for the WebSocket.

None

This method is called back after the service has closed the connection.