This topic describes the key interfaces and request parameters for the Qwen real-time speech synthesis DashScope Java SDK.
User guide: For a model overview and selection suggestions, see Real-time speech synthesis - Qwen.
Prerequisites
Use DashScope Java SDK version 2.22.7 or later.
Getting started
Server commit mode
import com.alibaba.dashscope.audio.qwen_tts_realtime.*;
import com.alibaba.dashscope.exception.NoApiKeyException;
import com.google.gson.JsonObject;
import javax.sound.sampled.LineUnavailableException;
import javax.sound.sampled.SourceDataLine;
import javax.sound.sampled.AudioFormat;
import javax.sound.sampled.DataLine;
import javax.sound.sampled.AudioSystem;
import java.io.FileNotFoundException;
import java.io.IOException;
import java.util.Base64;
import java.util.Queue;
import java.util.concurrent.CountDownLatch;
import java.util.concurrent.atomic.AtomicReference;
import java.util.concurrent.ConcurrentLinkedQueue;
import java.util.concurrent.atomic.AtomicBoolean;
public class Main {
static String[] textToSynthesize = {
"Right? I just really love this kind of supermarket",
"Especially during the New Year",
"Going to the supermarket",
"Makes me feel",
"Super, super happy!",
"I want to buy so many things!"
};
// Real-time PCM audio player class
// Streams raw PCM audio to the system audio output in real time.
// Base64-encoded chunks queued via write() are decoded on one background
// thread and played on a second, so the caller never blocks on audio I/O.
public static class RealtimePcmPlayer {
    private final int sampleRate;
    private SourceDataLine line;
    private AudioFormat audioFormat;
    private Thread decoderThread;
    private Thread playerThread;
    private final AtomicBoolean stopped = new AtomicBoolean(false);
    private final Queue<String> b64AudioBuffer = new ConcurrentLinkedQueue<>();
    private final Queue<byte[]> rawAudioBuffer = new ConcurrentLinkedQueue<>();

    // Opens a 16-bit, mono, signed, little-endian PCM line at the given
    // sample rate and starts the decoder and player worker threads.
    public RealtimePcmPlayer(int sampleRate) throws LineUnavailableException {
        this.sampleRate = sampleRate;
        this.audioFormat = new AudioFormat(this.sampleRate, 16, 1, true, false);
        DataLine.Info info = new DataLine.Info(SourceDataLine.class, audioFormat);
        line = (SourceDataLine) AudioSystem.getLine(info);
        line.open(audioFormat);
        line.start();
        decoderThread = new Thread(() -> {
            // Decode queued Base64 chunks into raw PCM until shutdown().
            while (!stopped.get()) {
                String b64Audio = b64AudioBuffer.poll();
                if (b64Audio != null) {
                    rawAudioBuffer.add(Base64.getDecoder().decode(b64Audio));
                } else {
                    sleepQuietly(100);
                }
            }
        });
        playerThread = new Thread(() -> {
            // Feed decoded PCM to the audio line until shutdown().
            while (!stopped.get()) {
                byte[] rawAudio = rawAudioBuffer.poll();
                if (rawAudio != null) {
                    try {
                        playChunk(rawAudio);
                    } catch (IOException | InterruptedException e) {
                        throw new RuntimeException(e);
                    }
                } else {
                    sleepQuietly(100);
                }
            }
        });
        decoderThread.start();
        playerThread.start();
    }

    // Sleeps for the given number of milliseconds; converts interruption
    // into an unchecked exception while preserving the interrupt flag.
    private static void sleepQuietly(long millis) {
        try {
            Thread.sleep(millis);
        } catch (InterruptedException e) {
            Thread.currentThread().interrupt();
            throw new RuntimeException(e);
        }
    }

    // Writes one PCM chunk to the line and blocks for roughly its playback
    // duration so successive chunks are paced in real time.
    private void playChunk(byte[] chunk) throws IOException, InterruptedException {
        if (chunk == null || chunk.length == 0) return;
        int bytesWritten = 0;
        while (bytesWritten < chunk.length) {
            bytesWritten += line.write(chunk, bytesWritten, chunk.length - bytesWritten);
        }
        // Chunk duration in ms: bytes / (bytes per millisecond), 16-bit mono PCM.
        int audioLength = chunk.length / (this.sampleRate * 2 / 1000);
        // Sleep slightly less than the chunk duration. Clamp at zero: for
        // chunks shorter than ~10 ms the original (audioLength - 10) was
        // negative and Thread.sleep would throw IllegalArgumentException.
        Thread.sleep(Math.max(0, audioLength - 10));
    }

    // Queues a Base64-encoded PCM chunk for decoding and playback.
    public void write(String b64Audio) {
        b64AudioBuffer.add(b64Audio);
    }

    // Drops all audio that has been queued but not yet played.
    public void cancel() {
        b64AudioBuffer.clear();
        rawAudioBuffer.clear();
    }

    // Blocks until both queues are empty and the line has drained.
    public void waitForComplete() throws InterruptedException {
        while (!b64AudioBuffer.isEmpty() || !rawAudioBuffer.isEmpty()) {
            Thread.sleep(100);
        }
        line.drain();
    }

    // Stops the worker threads and releases the audio line. The line is
    // closed even when it is no longer running (the original only closed
    // it inside the isRunning() branch, leaking the device otherwise).
    public void shutdown() throws InterruptedException {
        stopped.set(true);
        decoderThread.join();
        playerThread.join();
        if (line != null) {
            if (line.isRunning()) {
                line.drain();
            }
            line.close();
        }
    }
}
// Demonstrates server_commit mode: text segments are appended to the
// server-side buffer and the server decides when to synthesize audio.
public static void main(String[] args) throws InterruptedException, LineUnavailableException, FileNotFoundException {
    QwenTtsRealtimeParam param = QwenTtsRealtimeParam.builder()
            // To use the instruction control feature, replace the model with qwen3-tts-instruct-flash-realtime.
            .model("qwen3-tts-flash-realtime")
            // The following URL is for the Singapore region. If you use a model in the China (Beijing) region, replace the URL with wss://dashscope.aliyuncs.com/api-ws/v1/realtime.
            .url("wss://dashscope-intl.aliyuncs.com/api-ws/v1/realtime")
            // The API keys for the Singapore and China (Beijing) regions are different. To obtain an API key, visit https://www.alibabacloud.com/help/en/model-studio/get-api-key.
            .apikey(System.getenv("DASHSCOPE_API_KEY"))
            .build();
    // Released once the server reports session.finished.
    AtomicReference<CountDownLatch> completeLatch = new AtomicReference<>(new CountDownLatch(1));
    final AtomicReference<QwenTtsRealtime> qwenTtsRef = new AtomicReference<>(null);
    // Creates a real-time audio player instance (24 kHz matches PCM_24000HZ_MONO_16BIT below).
    RealtimePcmPlayer audioPlayer = new RealtimePcmPlayer(24000);
    QwenTtsRealtime qwenTtsRealtime = new QwenTtsRealtime(param, new QwenTtsRealtimeCallback() {
        @Override
        public void onOpen() {
            // Handles the event when the connection is established.
        }
        @Override
        public void onEvent(JsonObject message) {
            String type = message.get("type").getAsString();
            switch (type) {
                case "session.created":
                    // Handles the event when the session is created.
                    break;
                case "response.audio.delta":
                    String recvAudioB64 = message.get("delta").getAsString();
                    // Plays the audio in real time.
                    audioPlayer.write(recvAudioB64);
                    break;
                case "response.done":
                    // Handles the event when the response is complete.
                    break;
                case "session.finished":
                    // Handles the event when the session is finished.
                    completeLatch.get().countDown();
                    break; // was missing: control previously fell through to default
                default:
                    break;
            }
        }
        @Override
        public void onClose(int code, String reason) {
            // Handles the event when the connection is closed.
        }
    });
    qwenTtsRef.set(qwenTtsRealtime);
    try {
        qwenTtsRealtime.connect();
    } catch (NoApiKeyException e) {
        throw new RuntimeException(e);
    }
    QwenTtsRealtimeConfig config = QwenTtsRealtimeConfig.builder()
            .voice("Cherry")
            .responseFormat(QwenTtsRealtimeAudioFormat.PCM_24000HZ_MONO_16BIT)
            .mode("server_commit")
            // To use the instruction control feature, uncomment the following lines and replace the model with qwen3-tts-instruct-flash-realtime.
            // .instructions("")
            // .optimizeInstructions(true)
            .build();
    qwenTtsRealtime.updateSession(config);
    for (String text : textToSynthesize) {
        qwenTtsRealtime.appendText(text);
        Thread.sleep(100);
    }
    // Signals that no more text will be sent; the server finishes the session.
    qwenTtsRealtime.finish();
    completeLatch.get().await();
    qwenTtsRealtime.close();
    // Waits for audio playback to complete and then shuts down the player.
    audioPlayer.waitForComplete();
    audioPlayer.shutdown();
    System.exit(0);
}
}Commit mode
import com.alibaba.dashscope.audio.qwen_tts_realtime.*;
import com.alibaba.dashscope.exception.NoApiKeyException;
import com.google.gson.JsonObject;
import javax.sound.sampled.LineUnavailableException;
import javax.sound.sampled.SourceDataLine;
import javax.sound.sampled.AudioFormat;
import javax.sound.sampled.DataLine;
import javax.sound.sampled.AudioSystem;
import java.io.File;
import java.io.FileNotFoundException;
import java.io.FileOutputStream;
import java.io.IOException;
import java.util.Base64;
import java.util.Queue;
import java.util.Scanner;
import java.util.concurrent.CountDownLatch;
import java.util.concurrent.atomic.AtomicReference;
import java.util.concurrent.ConcurrentLinkedQueue;
import java.util.concurrent.atomic.AtomicBoolean;
public class commit {
// Real-time PCM audio player class
// Streams raw PCM audio to the system audio output in real time.
// Base64-encoded chunks queued via write() are decoded on one background
// thread and played on a second, so the caller never blocks on audio I/O.
public static class RealtimePcmPlayer {
    private final int sampleRate;
    private SourceDataLine line;
    private AudioFormat audioFormat;
    private Thread decoderThread;
    private Thread playerThread;
    private final AtomicBoolean stopped = new AtomicBoolean(false);
    private final Queue<String> b64AudioBuffer = new ConcurrentLinkedQueue<>();
    private final Queue<byte[]> rawAudioBuffer = new ConcurrentLinkedQueue<>();

    // Opens a 16-bit, mono, signed, little-endian PCM line at the given
    // sample rate and starts the decoder and player worker threads.
    public RealtimePcmPlayer(int sampleRate) throws LineUnavailableException {
        this.sampleRate = sampleRate;
        this.audioFormat = new AudioFormat(this.sampleRate, 16, 1, true, false);
        DataLine.Info info = new DataLine.Info(SourceDataLine.class, audioFormat);
        line = (SourceDataLine) AudioSystem.getLine(info);
        line.open(audioFormat);
        line.start();
        decoderThread = new Thread(() -> {
            // Decode queued Base64 chunks into raw PCM until shutdown().
            while (!stopped.get()) {
                String b64Audio = b64AudioBuffer.poll();
                if (b64Audio != null) {
                    rawAudioBuffer.add(Base64.getDecoder().decode(b64Audio));
                } else {
                    sleepQuietly(100);
                }
            }
        });
        playerThread = new Thread(() -> {
            // Feed decoded PCM to the audio line until shutdown().
            while (!stopped.get()) {
                byte[] rawAudio = rawAudioBuffer.poll();
                if (rawAudio != null) {
                    try {
                        playChunk(rawAudio);
                    } catch (IOException | InterruptedException e) {
                        throw new RuntimeException(e);
                    }
                } else {
                    sleepQuietly(100);
                }
            }
        });
        decoderThread.start();
        playerThread.start();
    }

    // Sleeps for the given number of milliseconds; converts interruption
    // into an unchecked exception while preserving the interrupt flag.
    private static void sleepQuietly(long millis) {
        try {
            Thread.sleep(millis);
        } catch (InterruptedException e) {
            Thread.currentThread().interrupt();
            throw new RuntimeException(e);
        }
    }

    // Writes one PCM chunk to the line and blocks for roughly its playback
    // duration so successive chunks are paced in real time.
    private void playChunk(byte[] chunk) throws IOException, InterruptedException {
        if (chunk == null || chunk.length == 0) return;
        int bytesWritten = 0;
        while (bytesWritten < chunk.length) {
            bytesWritten += line.write(chunk, bytesWritten, chunk.length - bytesWritten);
        }
        // Chunk duration in ms: bytes / (bytes per millisecond), 16-bit mono PCM.
        int audioLength = chunk.length / (this.sampleRate * 2 / 1000);
        // Sleep slightly less than the chunk duration. Clamp at zero: for
        // chunks shorter than ~10 ms the original (audioLength - 10) was
        // negative and Thread.sleep would throw IllegalArgumentException.
        Thread.sleep(Math.max(0, audioLength - 10));
    }

    // Queues a Base64-encoded PCM chunk for decoding and playback.
    public void write(String b64Audio) {
        b64AudioBuffer.add(b64Audio);
    }

    // Drops all audio that has been queued but not yet played.
    public void cancel() {
        b64AudioBuffer.clear();
        rawAudioBuffer.clear();
    }

    // Blocks until both queues are empty and the line has drained.
    public void waitForComplete() throws InterruptedException {
        // Waits for all audio data in the buffers to finish playing.
        while (!b64AudioBuffer.isEmpty() || !rawAudioBuffer.isEmpty()) {
            Thread.sleep(100);
        }
        // Waits for the audio line to finish playing.
        line.drain();
    }

    // Stops the worker threads and releases the audio line. The line is
    // closed even when it is no longer running (the original only closed
    // it inside the isRunning() branch, leaking the device otherwise).
    public void shutdown() throws InterruptedException {
        stopped.set(true);
        decoderThread.join();
        playerThread.join();
        if (line != null) {
            if (line.isRunning()) {
                line.drain();
            }
            line.close();
        }
    }
}
// Demonstrates commit mode: the user types text interactively, and each
// segment is appended and explicitly committed to trigger synthesis.
public static void main(String[] args) throws InterruptedException, LineUnavailableException, FileNotFoundException {
    Scanner scanner = new Scanner(System.in);
    QwenTtsRealtimeParam param = QwenTtsRealtimeParam.builder()
            // To use the instruction control feature, replace the model with qwen3-tts-instruct-flash-realtime.
            .model("qwen3-tts-flash-realtime")
            // The following URL is for the Singapore region. If you use a model in the China (Beijing) region, replace the URL with wss://dashscope.aliyuncs.com/api-ws/v1/realtime.
            .url("wss://dashscope-intl.aliyuncs.com/api-ws/v1/realtime")
            // The API keys for the Singapore and China (Beijing) regions are different. To obtain an API key, visit https://www.alibabacloud.com/help/en/model-studio/get-api-key.
            .apikey(System.getenv("DASHSCOPE_API_KEY"))
            .build();
    // Re-armed before each commit and released on response.done / session.finished.
    AtomicReference<CountDownLatch> completeLatch = new AtomicReference<>(new CountDownLatch(1));
    // Creates a real-time player instance (24 kHz matches PCM_24000HZ_MONO_16BIT below).
    RealtimePcmPlayer audioPlayer = new RealtimePcmPlayer(24000);
    final AtomicReference<QwenTtsRealtime> qwenTtsRef = new AtomicReference<>(null);
    QwenTtsRealtime qwenTtsRealtime = new QwenTtsRealtime(param, new QwenTtsRealtimeCallback() {
        // Uncomment to also save the raw PCM audio to a file:
        // File file = new File("result_24k.pcm");
        // FileOutputStream fos = new FileOutputStream(file);
        @Override
        public void onOpen() {
            System.out.println("connection opened");
            System.out.println("Enter text and press Enter to send. Enter 'quit' to exit the program.");
        }
        @Override
        public void onEvent(JsonObject message) {
            String type = message.get("type").getAsString();
            switch (type) {
                case "session.created":
                    System.out.println("start session: " + message.get("session").getAsJsonObject().get("id").getAsString());
                    break;
                case "response.audio.delta":
                    String recvAudioB64 = message.get("delta").getAsString();
                    // To save to a file as well, decode and write the chunk:
                    // byte[] rawAudio = Base64.getDecoder().decode(recvAudioB64);
                    // fos.write(rawAudio);
                    // Plays the audio in real time.
                    audioPlayer.write(recvAudioB64);
                    break;
                case "response.done":
                    System.out.println("response done");
                    // Waits for the audio playback to complete.
                    try {
                        audioPlayer.waitForComplete();
                    } catch (InterruptedException e) {
                        throw new RuntimeException(e);
                    }
                    // Prepares for the next input.
                    completeLatch.get().countDown();
                    break;
                case "session.finished":
                    System.out.println("session finished");
                    if (qwenTtsRef.get() != null) {
                        System.out.println("[Metric] response: " + qwenTtsRef.get().getResponseId() +
                                ", first audio delay: " + qwenTtsRef.get().getFirstAudioDelay() + " ms");
                    }
                    completeLatch.get().countDown();
                    break; // was missing: control previously fell through to default
                default:
                    break;
            }
        }
        @Override
        public void onClose(int code, String reason) {
            System.out.println("connection closed code: " + code + ", reason: " + reason);
            try {
                // fos.close();
                // Waits for playback to complete and then shuts down the player.
                audioPlayer.waitForComplete();
                audioPlayer.shutdown();
            } catch (InterruptedException e) {
                throw new RuntimeException(e);
            }
        }
    });
    qwenTtsRef.set(qwenTtsRealtime);
    try {
        qwenTtsRealtime.connect();
    } catch (NoApiKeyException e) {
        throw new RuntimeException(e);
    }
    QwenTtsRealtimeConfig config = QwenTtsRealtimeConfig.builder()
            .voice("Cherry")
            .responseFormat(QwenTtsRealtimeAudioFormat.PCM_24000HZ_MONO_16BIT)
            .mode("commit")
            // To use the instruction control feature, uncomment the following lines and replace the model with qwen3-tts-instruct-flash-realtime.
            // .instructions("")
            // .optimizeInstructions(true)
            .build();
    qwenTtsRealtime.updateSession(config);
    // Reads user input in a loop.
    while (true) {
        System.out.print("Enter the text to synthesize: ");
        String text = scanner.nextLine();
        // If the user enters 'quit', exit the program.
        if ("quit".equalsIgnoreCase(text.trim())) {
            System.out.println("Closing the connection...");
            qwenTtsRealtime.finish();
            completeLatch.get().await();
            break;
        }
        // If the user input is empty, skip.
        if (text.trim().isEmpty()) {
            continue;
        }
        // Re-initializes the countdown latch.
        completeLatch.set(new CountDownLatch(1));
        // Sends the text and commits it to trigger synthesis.
        qwenTtsRealtime.appendText(text);
        qwenTtsRealtime.commit();
        // Waits for the current synthesis to complete.
        completeLatch.get().await();
    }
    // Cleans up resources.
    audioPlayer.waitForComplete();
    audioPlayer.shutdown();
    scanner.close();
    System.exit(0);
}
}You can download additional sample code from GitHub.
Request parameters
Set the following request parameters using the chained methods or setters of the QwenTtsRealtimeParam object. Then, pass the object to the QwenTtsRealtime constructor.
Parameter | Type | Required | Description |
model | String | Yes | Model name. See Supported models. |
url | String | Yes | Mainland China: wss://dashscope.aliyuncs.com/api-ws/v1/realtime International (Singapore): wss://dashscope-intl.aliyuncs.com/api-ws/v1/realtime |
Set the request parameters using the chained methods or setters of the QwenTtsRealtimeConfig object. Then, pass the object as a parameter to the updateSession method.
Parameter | Type | Required | Description |
voice | String | Yes | The voice used for speech synthesis. For more information, see Supported voices. System voices and custom voices are supported:
|
languageType | String | No | The language of the synthesized audio. The default value is
|
mode | String | No | The interaction pattern. Valid values:
|
format | String | No | The format of the audio output from the model. Supported formats:
Qwen-TTS-Realtime (see Supported models) supports only |
sampleRate | int | No | The sample rate (in Hz) of the audio output from the model. Supported sample rates:
Qwen-TTS-Realtime (see Supported models) supports only 24000. |
speechRate | float | No | The speech rate of the audio. A value of 1.0 is the normal speed. A value less than 1.0 is slower, and a value greater than 1.0 is faster. Default value: 1.0. Valid range: [0.5, 2.0]. Qwen-TTS-Realtime (see Supported models) does not support this parameter. |
volume | int | No | The volume of the audio. Default value: 50. Valid range: [0, 100]. Qwen-TTS-Realtime (see Supported models) does not support this parameter. |
pitchRate | float | No | The pitch of the synthesized audio. Default value: 1.0. Valid range: [0.5, 2.0]. Qwen-TTS-Realtime (see Supported models) does not support this parameter. |
bitRate | int | No | The bitrate (in kbps) of the audio. A higher bitrate results in better audio quality and a larger file size. This parameter is available only when the audio format ( Default value: 128. Valid range: [6, 510]. Qwen-TTS-Realtime (see Supported models) does not support this parameter. |
instructions | String | No | Sets the synthesis instruction text. For details, see Real-time speech synthesis - Qwen. Default value: None. The parameter has no effect if not set. Length limit: The length cannot exceed 1600 tokens. Supported languages: Only Chinese and English are supported. Scope: This feature is available only for the Qwen3-TTS-Instruct-Flash-Realtime model series. |
optimizeInstructions | boolean | No | Specifies whether to optimize the Default value: false. Behavior: When set to true, the system enhances the semantics and rewrites the content of Scenarios: Enable this feature in scenarios that require high-quality, fine-grained vocal expression. Dependency: This parameter depends on the Scope: This feature is available only for the Qwen3-TTS-Instruct-Flash-Realtime model series. |
Key interfaces
QwenTtsRealtime class
Import method:
import com.alibaba.dashscope.audio.qwen_tts_realtime.QwenTtsRealtime;
Member method | Method signature | Server-side response events (delivered through callbacks) | Description |
connect | | The session is created. The session configuration is updated. | Creates a connection with the server. |
updateSession | | The session configuration is updated. | Updates the default configuration for the current session interaction. For parameter settings, see the "Request parameters" section. After you establish a connection, the server promptly returns the default input and output configurations for the session. To update the default session configuration, call this interface immediately after establishing the connection. After the server receives the session.update event, it performs parameter verification. If the parameters are invalid, an error is returned. Otherwise, the server-side session configuration is updated. |
appendText | | None | Appends a text segment to the cloud-side input text buffer. The buffer is temporary storage where you can write text and commit it later.
|
clearAppendedText | | The text received by the server is cleared. | Deletes the text in the current cloud-side buffer. |
commit | | The text is committed and speech synthesis is triggered. New output content is available in the response. New output content is added to the assistant message item. Incrementally generated audio from the model. Audio generation is complete. The streaming output of the audio content for the assistant message is complete. The streaming of the entire output item for the assistant message is complete. The response is complete. | Commits the text previously appended to the cloud-side buffer and immediately synthesizes all the text. An error occurs if the input text buffer is empty.
|
finish | | Response completed | Stops the task. |
close | | None | Closes the connection. |
getSessionId | | None | Gets the session_id of the current task. |
getResponseId | | None | Gets the response_id of the most recent response. |
getFirstAudioDelay | | None | Gets the first audio packet latency. |
Callback interface (QwenTtsRealtimeCallback)
Method | Parameters | Return value | Description |
| None | None | This method is immediately called back after a connection is established with the server. |
| message: The server-side response event. | None | Includes response replies to interface calls and the text and audio generated by the model. For more information, see Server-side events. |
| code: The status code for closing the WebSocket. reason: The closing information for the WebSocket. | None | This method is called back after the service has closed the connection. |