Key interfaces and request parameters for Qwen real-time speech synthesis DashScope Java SDK.
User guide: For model introductions and selection recommendations, see Real-time speech synthesis – Qwen or Speech synthesis – Qwen.
Prerequisites
Requires DashScope Java SDK 2.22.7 or later.
Getting started
Server commit mode
appendText()
import com.alibaba.dashscope.audio.qwen_tts_realtime.*;
import com.alibaba.dashscope.exception.NoApiKeyException;
import com.google.gson.JsonObject;
import javax.sound.sampled.LineUnavailableException;
import javax.sound.sampled.SourceDataLine;
import javax.sound.sampled.AudioFormat;
import javax.sound.sampled.DataLine;
import javax.sound.sampled.AudioSystem;
import java.io.*;
import java.util.Base64;
import java.util.Queue;
import java.util.concurrent.CountDownLatch;
import java.util.concurrent.atomic.AtomicReference;
import java.util.concurrent.ConcurrentLinkedQueue;
import java.util.concurrent.atomic.AtomicBoolean;
public class Main {
// Text segments streamed one-by-one to the realtime TTS session in main().
static String[] textToSynthesize = {
"Right? I really love this kind of supermarket.",
"Especially during the Chinese New Year.",
"Going to the supermarket.",
"It just makes me feel.",
"Super, super happy!",
"I want to buy so many things!"
};
// Audio format requested from the service: 24 kHz, mono, 16-bit PCM.
// Also used by RealtimePcmPlayer.shutdown() to name the saved audio file.
public static QwenTtsRealtimeAudioFormat ttsFormat = QwenTtsRealtimeAudioFormat.PCM_24000HZ_MONO_16BIT;
// Real-time PCM audio player
public static class RealtimePcmPlayer {
    private final int sampleRate;
    private SourceDataLine line;
    private AudioFormat audioFormat;
    private final Thread decoderThread;
    private final Thread playerThread;
    private final AtomicBoolean stopped = new AtomicBoolean(false);
    // Incoming base64-encoded PCM chunks, filled by write().
    private final Queue<String> b64AudioBuffer = new ConcurrentLinkedQueue<>();
    // Decoded raw PCM chunks, consumed by the player thread.
    private final Queue<byte[]> rawAudioBuffer = new ConcurrentLinkedQueue<>();
    // Accumulates every decoded chunk so shutdown() can dump the full audio to a file.
    private final ByteArrayOutputStream totalAudioStream = new ByteArrayOutputStream();

    /**
     * Opens a 16-bit, mono, signed, little-endian audio line at the given
     * sample rate and starts the decoder and player worker threads.
     *
     * @param sampleRate output sample rate in Hz (must match the TTS format)
     * @throws LineUnavailableException if no matching audio line can be opened
     */
    public RealtimePcmPlayer(int sampleRate) throws LineUnavailableException {
        this.sampleRate = sampleRate;
        this.audioFormat = new AudioFormat(this.sampleRate, 16, 1, true, false);
        DataLine.Info info = new DataLine.Info(SourceDataLine.class, audioFormat);
        line = (SourceDataLine) AudioSystem.getLine(info);
        line.open(audioFormat);
        line.start();
        // Decoder thread: base64 -> raw PCM, also archiving each chunk.
        decoderThread = new Thread(() -> {
            while (!stopped.get()) {
                String b64Audio = b64AudioBuffer.poll();
                if (b64Audio != null) {
                    byte[] rawAudio = Base64.getDecoder().decode(b64Audio);
                    rawAudioBuffer.add(rawAudio);
                    try {
                        totalAudioStream.write(rawAudio);
                    } catch (IOException e) {
                        throw new RuntimeException(e);
                    }
                } else {
                    // Nothing buffered yet; back off briefly. Exit on interrupt.
                    if (!sleepQuietly(100)) return;
                }
            }
        });
        // Player thread: writes raw PCM to the audio line chunk by chunk.
        playerThread = new Thread(() -> {
            while (!stopped.get()) {
                byte[] rawAudio = rawAudioBuffer.poll();
                if (rawAudio != null) {
                    try {
                        playChunk(rawAudio);
                    } catch (IOException e) {
                        throw new RuntimeException(e);
                    } catch (InterruptedException e) {
                        // Restore the interrupt flag and exit cleanly instead
                        // of killing the thread with an unchecked exception.
                        Thread.currentThread().interrupt();
                        return;
                    }
                } else {
                    if (!sleepQuietly(100)) return;
                }
            }
        });
        decoderThread.start();
        playerThread.start();
    }

    // Sleeps for the given milliseconds; returns false if interrupted,
    // restoring the interrupt flag so the caller can exit its loop.
    private static boolean sleepQuietly(long millis) {
        try {
            Thread.sleep(millis);
            return true;
        } catch (InterruptedException e) {
            Thread.currentThread().interrupt();
            return false;
        }
    }

    // Plays one PCM chunk and blocks roughly until it has finished playing.
    private void playChunk(byte[] chunk) throws IOException, InterruptedException {
        if (chunk == null || chunk.length == 0) return;
        int bytesWritten = 0;
        while (bytesWritten < chunk.length) {
            bytesWritten += line.write(chunk, bytesWritten, chunk.length - bytesWritten);
        }
        // Chunk duration in ms for 16-bit mono PCM: bytes / (bytes per ms).
        int audioLength = chunk.length / (this.sampleRate * 2 / 1000);
        // Wait for the buffered audio to finish playing. Clamp to zero: for
        // chunks shorter than ~10 ms the original (audioLength - 10) went
        // negative, making Thread.sleep throw IllegalArgumentException.
        Thread.sleep(Math.max(0, audioLength - 10));
    }

    /** Queues a base64-encoded PCM chunk for decoding and playback. */
    public void write(String b64Audio) {
        b64AudioBuffer.add(b64Audio);
    }

    /** Drops all pending audio, both undecoded and decoded. */
    public void cancel() {
        b64AudioBuffer.clear();
        rawAudioBuffer.clear();
    }

    /** Blocks until both buffers are drained and the line finishes playing. */
    public void waitForComplete() throws InterruptedException {
        while (!b64AudioBuffer.isEmpty() || !rawAudioBuffer.isEmpty()) {
            Thread.sleep(100);
        }
        line.drain();
    }

    /**
     * Stops the worker threads, saves all received audio to a file named
     * after the configured TTS format, and releases the audio line.
     */
    public void shutdown() throws InterruptedException, IOException {
        stopped.set(true);
        decoderThread.join();
        playerThread.join();
        // Save the complete audio file.
        File file = new File("TotalAudio_" + ttsFormat.getSampleRate() + "." + ttsFormat.getFormat());
        try (FileOutputStream fos = new FileOutputStream(file)) {
            fos.write(totalAudioStream.toByteArray());
        }
        if (line != null) {
            // Close unconditionally so the line is released even when idle;
            // the original leaked the line when isRunning() was false.
            if (line.isRunning()) {
                line.drain();
            }
            line.close();
        }
    }
}
/**
 * Server-commit-mode sample: appends text segments and lets the server
 * decide when to synthesize; plays audio as deltas arrive.
 */
public static void main(String[] args) throws InterruptedException, LineUnavailableException, IOException {
    QwenTtsRealtimeParam param = QwenTtsRealtimeParam.builder()
            // To use instruction control, replace the model with qwen3-tts-instruct-flash-realtime.
            .model("qwen3-tts-flash-realtime")
            // Singapore endpoint. For China (Beijing), use wss://dashscope.aliyuncs.com/api-ws/v1/realtime.
            .url("wss://dashscope-intl.aliyuncs.com/api-ws/v1/realtime")
            // API keys differ between Singapore and China (Beijing). See https://www.alibabacloud.com/help/zh/model-studio/get-api-key.
            .apikey(System.getenv("DASHSCOPE_API_KEY"))
            .build();
    // Released when the server reports the session has finished.
    AtomicReference<CountDownLatch> completeLatch = new AtomicReference<>(new CountDownLatch(1));
    // Create a real-time audio player instance (24 kHz matches ttsFormat).
    RealtimePcmPlayer audioPlayer = new RealtimePcmPlayer(24000);
    QwenTtsRealtime qwenTtsRealtime = new QwenTtsRealtime(param, new QwenTtsRealtimeCallback() {
        @Override
        public void onOpen() {
            // Handle connection establishment.
        }

        @Override
        public void onEvent(JsonObject message) {
            String type = message.get("type").getAsString();
            switch (type) {
                case "session.created":
                    // Log the new session's identifiers.
                    if (message.has("session")) {
                        String eventId = message.get("event_id").getAsString();
                        String sessionId = message.get("session").getAsJsonObject().get("id").getAsString();
                        System.out.println("[onEvent] session.created, session_id: "
                                + sessionId + ", event_id: " + eventId);
                    }
                    break;
                case "response.audio.delta":
                    // Base64-encoded PCM chunk; queue it for real-time playback.
                    String recvAudioB64 = message.get("delta").getAsString();
                    audioPlayer.write(recvAudioB64);
                    break;
                case "response.done":
                    // Handle response completion.
                    break;
                case "session.finished":
                    // Unblock main() once the server terminates the session.
                    completeLatch.get().countDown();
                    break; // was missing: previously fell through to default
                default:
                    break;
            }
        }

        @Override
        public void onClose(int code, String reason) {
            // Handle connection closure.
        }
    });
    try {
        qwenTtsRealtime.connect();
    } catch (NoApiKeyException e) {
        throw new RuntimeException(e);
    }
    // Override the server's default session configuration right after connect().
    QwenTtsRealtimeConfig config = QwenTtsRealtimeConfig.builder()
            .voice("Cherry")
            .responseFormat(ttsFormat)
            .mode("server_commit")
            // To use instruction control, uncomment the following lines and replace the model with qwen3-tts-instruct-flash-realtime.
            // .instructions("")
            // .optimizeInstructions(true)
            .build();
    qwenTtsRealtime.updateSession(config);
    // In server_commit mode the server decides when to synthesize appended text.
    for (String text : textToSynthesize) {
        qwenTtsRealtime.appendText(text);
        Thread.sleep(100);
    }
    qwenTtsRealtime.finish();
    completeLatch.get().await();
    qwenTtsRealtime.close();
    // Wait for audio playback to complete, then shut down the player.
    audioPlayer.waitForComplete();
    audioPlayer.shutdown();
    System.exit(0);
}
}
Commit mode
commit()
import com.alibaba.dashscope.audio.qwen_tts_realtime.*;
import com.alibaba.dashscope.exception.NoApiKeyException;
import com.google.gson.JsonObject;
import javax.sound.sampled.LineUnavailableException;
import javax.sound.sampled.SourceDataLine;
import javax.sound.sampled.AudioFormat;
import javax.sound.sampled.DataLine;
import javax.sound.sampled.AudioSystem;
import java.io.*;
import java.util.Base64;
import java.util.Queue;
import java.util.Scanner;
import java.util.concurrent.CountDownLatch;
import java.util.concurrent.atomic.AtomicReference;
import java.util.concurrent.ConcurrentLinkedQueue;
import java.util.concurrent.atomic.AtomicBoolean;
public class Main {
// Audio format requested from the service: 24 kHz, mono, 16-bit PCM.
// Also used by RealtimePcmPlayer.shutdown() to name the saved audio file.
public static QwenTtsRealtimeAudioFormat ttsFormat = QwenTtsRealtimeAudioFormat.PCM_24000HZ_MONO_16BIT;
// Real-time PCM audio player
public static class RealtimePcmPlayer {
    private final int sampleRate;
    private SourceDataLine line;
    private AudioFormat audioFormat;
    private final Thread decoderThread;
    private final Thread playerThread;
    private final AtomicBoolean stopped = new AtomicBoolean(false);
    // Incoming base64-encoded PCM chunks, filled by write().
    private final Queue<String> b64AudioBuffer = new ConcurrentLinkedQueue<>();
    // Decoded raw PCM chunks, consumed by the player thread.
    private final Queue<byte[]> rawAudioBuffer = new ConcurrentLinkedQueue<>();
    // Accumulates every decoded chunk so shutdown() can dump the full audio to a file.
    private final ByteArrayOutputStream totalAudioStream = new ByteArrayOutputStream();

    /**
     * Opens a 16-bit, mono, signed, little-endian audio line at the given
     * sample rate and starts the decoder and player worker threads.
     *
     * @param sampleRate output sample rate in Hz (must match the TTS format)
     * @throws LineUnavailableException if no matching audio line can be opened
     */
    public RealtimePcmPlayer(int sampleRate) throws LineUnavailableException {
        this.sampleRate = sampleRate;
        this.audioFormat = new AudioFormat(this.sampleRate, 16, 1, true, false);
        DataLine.Info info = new DataLine.Info(SourceDataLine.class, audioFormat);
        line = (SourceDataLine) AudioSystem.getLine(info);
        line.open(audioFormat);
        line.start();
        // Decoder thread: base64 -> raw PCM, also archiving each chunk.
        decoderThread = new Thread(() -> {
            while (!stopped.get()) {
                String b64Audio = b64AudioBuffer.poll();
                if (b64Audio != null) {
                    byte[] rawAudio = Base64.getDecoder().decode(b64Audio);
                    rawAudioBuffer.add(rawAudio);
                    try {
                        totalAudioStream.write(rawAudio);
                    } catch (IOException e) {
                        throw new RuntimeException(e);
                    }
                } else {
                    // Nothing buffered yet; back off briefly. Exit on interrupt.
                    if (!sleepQuietly(100)) return;
                }
            }
        });
        // Player thread: writes raw PCM to the audio line chunk by chunk.
        playerThread = new Thread(() -> {
            while (!stopped.get()) {
                byte[] rawAudio = rawAudioBuffer.poll();
                if (rawAudio != null) {
                    try {
                        playChunk(rawAudio);
                    } catch (IOException e) {
                        throw new RuntimeException(e);
                    } catch (InterruptedException e) {
                        // Restore the interrupt flag and exit cleanly instead
                        // of killing the thread with an unchecked exception.
                        Thread.currentThread().interrupt();
                        return;
                    }
                } else {
                    if (!sleepQuietly(100)) return;
                }
            }
        });
        decoderThread.start();
        playerThread.start();
    }

    // Sleeps for the given milliseconds; returns false if interrupted,
    // restoring the interrupt flag so the caller can exit its loop.
    private static boolean sleepQuietly(long millis) {
        try {
            Thread.sleep(millis);
            return true;
        } catch (InterruptedException e) {
            Thread.currentThread().interrupt();
            return false;
        }
    }

    // Plays one PCM chunk and blocks roughly until it has finished playing.
    private void playChunk(byte[] chunk) throws IOException, InterruptedException {
        if (chunk == null || chunk.length == 0) return;
        int bytesWritten = 0;
        while (bytesWritten < chunk.length) {
            bytesWritten += line.write(chunk, bytesWritten, chunk.length - bytesWritten);
        }
        // Chunk duration in ms for 16-bit mono PCM: bytes / (bytes per ms).
        int audioLength = chunk.length / (this.sampleRate * 2 / 1000);
        // Wait for the buffered audio to finish playing. Clamp to zero: for
        // chunks shorter than ~10 ms the original (audioLength - 10) went
        // negative, making Thread.sleep throw IllegalArgumentException.
        Thread.sleep(Math.max(0, audioLength - 10));
    }

    /** Queues a base64-encoded PCM chunk for decoding and playback. */
    public void write(String b64Audio) {
        b64AudioBuffer.add(b64Audio);
    }

    /** Drops all pending audio, both undecoded and decoded. */
    public void cancel() {
        b64AudioBuffer.clear();
        rawAudioBuffer.clear();
    }

    /** Blocks until both buffers are drained and the line finishes playing. */
    public void waitForComplete() throws InterruptedException {
        // Wait for all buffered audio data to finish playing.
        while (!b64AudioBuffer.isEmpty() || !rawAudioBuffer.isEmpty()) {
            Thread.sleep(100);
        }
        // Wait for the audio line to drain.
        line.drain();
    }

    /**
     * Stops the worker threads, saves all received audio to a file named
     * after the configured TTS format, and releases the audio line.
     */
    public void shutdown() throws InterruptedException {
        stopped.set(true);
        decoderThread.join();
        playerThread.join();
        // Save the complete audio file.
        File file = new File("TotalAudio_" + ttsFormat.getSampleRate() + "." + ttsFormat.getFormat());
        try (FileOutputStream fos = new FileOutputStream(file)) {
            fos.write(totalAudioStream.toByteArray());
        } catch (IOException e) {
            // FileNotFoundException is an IOException, so one catch covers both
            // (the original had a redundant separate FileNotFoundException catch).
            throw new RuntimeException(e);
        }
        if (line != null) {
            // Close unconditionally so the line is released even when idle;
            // the original leaked the line when isRunning() was false.
            if (line.isRunning()) {
                line.drain();
            }
            line.close();
        }
    }
}
/**
 * Commit-mode sample: reads lines from stdin, appends and commits each one
 * explicitly, and waits for synthesis to complete before the next prompt.
 */
public static void main(String[] args) throws InterruptedException, LineUnavailableException, FileNotFoundException {
    Scanner scanner = new Scanner(System.in);
    QwenTtsRealtimeParam param = QwenTtsRealtimeParam.builder()
            // To use instruction control, replace the model with qwen3-tts-instruct-flash-realtime.
            .model("qwen3-tts-flash-realtime")
            // Singapore endpoint. For China (Beijing), use wss://dashscope.aliyuncs.com/api-ws/v1/realtime.
            .url("wss://dashscope-intl.aliyuncs.com/api-ws/v1/realtime")
            // API keys differ between Singapore and China (Beijing). See https://www.alibabacloud.com/help/zh/model-studio/get-api-key.
            .apikey(System.getenv("DASHSCOPE_API_KEY"))
            .build();
    // Released once per response (and again on session.finished).
    AtomicReference<CountDownLatch> completeLatch = new AtomicReference<>(new CountDownLatch(1));
    // Create a real-time player instance (24 kHz matches ttsFormat).
    RealtimePcmPlayer audioPlayer = new RealtimePcmPlayer(24000);
    // Lets the callback read metrics from the client instance created below.
    final AtomicReference<QwenTtsRealtime> qwenTtsRef = new AtomicReference<>(null);
    QwenTtsRealtime qwenTtsRealtime = new QwenTtsRealtime(param, new QwenTtsRealtimeCallback() {
        @Override
        public void onOpen() {
            System.out.println("connection opened");
            System.out.println("Enter text and press Enter to send. Enter 'quit' to exit the program.");
        }

        @Override
        public void onEvent(JsonObject message) {
            String type = message.get("type").getAsString();
            switch (type) {
                case "session.created":
                    System.out.println("start session: " + message.get("session").getAsJsonObject().get("id").getAsString());
                    break;
                case "response.audio.delta":
                    // Base64-encoded PCM chunk; queue it for real-time playback.
                    // (The original also decoded it into an unused local here.)
                    String recvAudioB64 = message.get("delta").getAsString();
                    audioPlayer.write(recvAudioB64);
                    break;
                case "response.done":
                    System.out.println("response done");
                    // NOTE(review): this blocks the event-callback thread until
                    // playback finishes — fine for a sample, avoid in production.
                    try {
                        audioPlayer.waitForComplete();
                    } catch (InterruptedException e) {
                        throw new RuntimeException(e);
                    }
                    // Prepare for the next input.
                    completeLatch.get().countDown();
                    break;
                case "session.finished":
                    System.out.println("session finished");
                    if (qwenTtsRef.get() != null) {
                        System.out.println("[Metric] response: " + qwenTtsRef.get().getResponseId() +
                                ", first audio delay: " + qwenTtsRef.get().getFirstAudioDelay() + " ms");
                    }
                    completeLatch.get().countDown();
                    break; // was missing: previously fell through to default
                default:
                    break;
            }
        }

        @Override
        public void onClose(int code, String reason) {
            System.out.println("connection closed code: " + code + ", reason: " + reason);
            try {
                // Wait for playback to complete, then shut down the player.
                // NOTE(review): main() also shuts the player down after its loop;
                // the duplicate call is tolerated here to cover the case where
                // the server closes the connection first.
                audioPlayer.waitForComplete();
                audioPlayer.shutdown();
            } catch (InterruptedException e) {
                throw new RuntimeException(e);
            }
        }
    });
    qwenTtsRef.set(qwenTtsRealtime);
    try {
        qwenTtsRealtime.connect();
    } catch (NoApiKeyException e) {
        throw new RuntimeException(e);
    }
    // Override the server's default session configuration right after connect().
    QwenTtsRealtimeConfig config = QwenTtsRealtimeConfig.builder()
            .voice("Cherry")
            .responseFormat(ttsFormat)
            .mode("commit")
            // To use instruction control, uncomment the following lines and replace the model with qwen3-tts-instruct-flash-realtime.
            // .instructions("")
            // .optimizeInstructions(true)
            .build();
    qwenTtsRealtime.updateSession(config);
    // Read user input in a loop; each line is appended and committed explicitly.
    while (true) {
        System.out.print("Enter the text to synthesize: ");
        String text = scanner.nextLine();
        // Exit when the user enters 'quit'.
        if ("quit".equalsIgnoreCase(text.trim())) {
            System.out.println("Closing the connection...");
            qwenTtsRealtime.finish();
            completeLatch.get().await();
            break;
        }
        // Skip empty input.
        if (text.trim().isEmpty()) {
            continue;
        }
        // Re-initialize the countdown latch for this utterance.
        completeLatch.set(new CountDownLatch(1));
        // Send the text and trigger synthesis immediately.
        qwenTtsRealtime.appendText(text);
        qwenTtsRealtime.commit();
        // Wait for the current synthesis to complete.
        completeLatch.get().await();
    }
    // Clean up resources.
    audioPlayer.waitForComplete();
    audioPlayer.shutdown();
    scanner.close();
    System.exit(0);
}
}
For more samples, see the GitHub repository.
Request parameters
Configure the following request parameters with the chaining methods or setters of a QwenTtsRealtimeParam object, then pass the object to the QwenTtsRealtime constructor.
Parameter | Type | Required | Description |
model | String | Yes | Model name (see Supported models). |
url | String | Yes | Chinese Mainland: wss://dashscope.aliyuncs.com/api-ws/v1/realtime International: wss://dashscope-intl.aliyuncs.com/api-ws/v1/realtime |
Configure the following request parameters with the chaining methods or setters of a QwenTtsRealtimeConfig object, then pass the object to the updateSession method.
Parameter | Type | Required | Description |
voice | String | Yes | The voice for speech synthesis. For more information, see Supported voices. System voices and custom voices are supported:
|
languageType | String | No | Specifies the language of the synthesized audio. The default value is
|
mode | String | No | The interaction mode. Valid values:
|
format | String | No | The format of the audio output from the model. Supported formats:
Qwen-TTS-Realtime (see Supported models) supports only |
sampleRate | int | No | The sample rate of the audio output from the model, in Hz. Supported sample rates:
Qwen-TTS-Realtime (see Supported models) supports only 24000. |
speechRate | float | No | The speech rate of the audio. A value of 1.0 is the normal speed. A value less than 1.0 is slower, and a value greater than 1.0 is faster. Default value: 1.0. Valid range: [0.5, 2.0]. Qwen-TTS-Realtime (see Supported models) does not support this parameter. |
volume | int | No | The volume of the audio. Default value: 50. Valid range: [0, 100]. Qwen-TTS-Realtime (see Supported models) does not support this parameter. |
pitchRate | float | No | The pitch of the synthesized audio. Default value: 1.0. Valid range: [0.5, 2.0]. Qwen-TTS-Realtime (see Supported models) does not support this parameter. |
bitRate | int | No | Specifies the bitrate of the audio in kbps. A higher bitrate results in better audio quality and a larger file size. This parameter is available only when the audio format ( Default value: 128. Valid range: [6, 510]. Qwen-TTS-Realtime (see Supported models) does not support this parameter. |
instructions | String | No | Sets the instructions. For more information, see Real-time speech synthesis - Qwen. Default value: None. The parameter is not active if not set. Length limit: The length cannot exceed 1600 tokens. Supported languages: Chinese and English only. Scope: This feature is available only for the Qwen3-TTS-Instruct-Flash-Realtime model series. |
optimizeInstructions | boolean | No | Specifies whether to optimize the Default value: false. Behavior: When set to true, the system enhances and rewrites the content of Scenarios: Recommended for scenarios that require high-quality, fine-grained voice expression. Dependency: This parameter depends on the Scope: This feature is available only for the Qwen3-TTS-Instruct-Flash-Realtime model series. |
Key interfaces
QwenTtsRealtime class
To import:
import com.alibaba.dashscope.audio.qwen_tts_realtime.QwenTtsRealtime;
Method | Signature | Server events | Description |
connect | | Session created Session configuration updated | Opens a WebSocket connection to the server. |
updateSession | | Session configuration updated | Updates the session configuration. See Request parameters. After connecting, the server returns default input and output configurations for the session. Call this method immediately after connect() to override defaults. The server validates parameters upon receiving a session.update event. If any parameter is invalid, the server returns an error. Otherwise, it updates the session configuration on the server side. |
appendText | | None | Appends a text segment to the server-side input buffer. The buffer stores text until you submit it.
|
clearAppendedText | | Clears text received by the server | Clears all text in the server-side input buffer. |
commit | | Commits text and triggers speech synthesis New output content appears in the response New output content added to the assistant message item Model generates audio incrementally Audio generation completed Streaming of audio content for the assistant message is complete Streaming of the entire output item for the assistant message is complete Response completed | Commits text previously appended to the server-side buffer and synthesizes all text immediately. Returns an error if the buffer is empty.
|
finish | | Response completed | Stops the current task. |
close | | None | Closes the connection. |
getSessionId | | None | Returns the session ID of the current task. |
getResponseId | | None | Returns the response ID of the most recent response. |
getFirstAudioDelay | | None | Returns the latency of the first audio packet in milliseconds. |
Callback interface (QwenTtsRealtimeCallback)
Method | Parameters | Return value | Description |
| None | None | Called immediately after the WebSocket connection is established. |
| message: server-side response event. | None | Called when the server sends an event, including API call responses and model-generated audio. See Server-side events. |
| code: WebSocket close status code. reason: Close reason. | None | Called after the server closes the connection. |