This topic describes the key interfaces and request parameters of the DashScope Java SDK for Qwen real-time speech synthesis.
User guide: For model overview and selection, see Real-time speech synthesis - Qwen.
Prerequisites
Use DashScope Java SDK version 2.21.16 or later.
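If you manage dependencies with Maven, a declaration like the following pulls in the SDK. The coordinates are the SDK's published Maven coordinates; the version shown is the minimum required by this topic, so check your repository for the latest release:
<dependency>
    <groupId>com.alibaba</groupId>
    <artifactId>dashscope-sdk-java</artifactId>
    <version>2.21.16</version>
</dependency>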
Getting started
Server commit mode
// The DashScope SDK version must be 2.21.16 or later.
import com.alibaba.dashscope.audio.qwen_tts_realtime.*;
import com.alibaba.dashscope.exception.NoApiKeyException;
import com.google.gson.JsonObject;
import javax.sound.sampled.LineUnavailableException;
import javax.sound.sampled.SourceDataLine;
import javax.sound.sampled.AudioFormat;
import javax.sound.sampled.DataLine;
import javax.sound.sampled.AudioSystem;
import java.io.IOException;
import java.util.Base64;
import java.util.Queue;
import java.util.concurrent.CountDownLatch;
import java.util.concurrent.atomic.AtomicReference;
import java.util.concurrent.ConcurrentLinkedQueue;
import java.util.concurrent.atomic.AtomicBoolean;
public class Main {
static String[] textToSynthesize = {
"Right? I especially love this kind of supermarket.",
"Especially during the New Year.",
"Going to the supermarket.",
"It just makes me feel.",
"Super, super happy!",
"I want to buy so many things!"
};
// Real-time PCM audio player class
public static class RealtimePcmPlayer {
private int sampleRate;
private SourceDataLine line;
private AudioFormat audioFormat;
private Thread decoderThread;
private Thread playerThread;
private AtomicBoolean stopped = new AtomicBoolean(false);
private Queue<String> b64AudioBuffer = new ConcurrentLinkedQueue<>();
private Queue<byte[]> rawAudioBuffer = new ConcurrentLinkedQueue<>();
// The constructor initializes the audio format and audio line.
public RealtimePcmPlayer(int sampleRate) throws LineUnavailableException {
this.sampleRate = sampleRate;
this.audioFormat = new AudioFormat(this.sampleRate, 16, 1, true, false);
DataLine.Info info = new DataLine.Info(SourceDataLine.class, audioFormat);
line = (SourceDataLine) AudioSystem.getLine(info);
line.open(audioFormat);
line.start();
decoderThread = new Thread(new Runnable() {
@Override
public void run() {
while (!stopped.get()) {
String b64Audio = b64AudioBuffer.poll();
if (b64Audio != null) {
byte[] rawAudio = Base64.getDecoder().decode(b64Audio);
rawAudioBuffer.add(rawAudio);
} else {
try {
Thread.sleep(100);
} catch (InterruptedException e) {
throw new RuntimeException(e);
}
}
}
}
});
playerThread = new Thread(new Runnable() {
@Override
public void run() {
while (!stopped.get()) {
byte[] rawAudio = rawAudioBuffer.poll();
if (rawAudio != null) {
try {
playChunk(rawAudio);
} catch (IOException e) {
throw new RuntimeException(e);
} catch (InterruptedException e) {
throw new RuntimeException(e);
}
} else {
try {
Thread.sleep(100);
} catch (InterruptedException e) {
throw new RuntimeException(e);
}
}
}
}
});
decoderThread.start();
playerThread.start();
}
// Play an audio chunk and block until playback is complete.
private void playChunk(byte[] chunk) throws IOException, InterruptedException {
if (chunk == null || chunk.length == 0) return;
int bytesWritten = 0;
while (bytesWritten < chunk.length) {
bytesWritten += line.write(chunk, bytesWritten, chunk.length - bytesWritten);
}
// Duration of the chunk in milliseconds: 16-bit mono PCM is sampleRate * 2 bytes per second.
int audioLengthMs = chunk.length / (this.sampleRate * 2 / 1000);
// Wait for the audio in the buffer to finish playing; guard against a negative sleep for very short chunks.
Thread.sleep(Math.max(0, audioLengthMs - 10));
}
public void write(String b64Audio) {
b64AudioBuffer.add(b64Audio);
}
public void cancel() {
b64AudioBuffer.clear();
rawAudioBuffer.clear();
}
public void waitForComplete() throws InterruptedException {
while (!b64AudioBuffer.isEmpty() || !rawAudioBuffer.isEmpty()) {
Thread.sleep(100);
}
line.drain();
}
public void shutdown() throws InterruptedException {
stopped.set(true);
decoderThread.join();
playerThread.join();
if (line != null && line.isRunning()) {
line.drain();
line.close();
}
}
}
public static void main(String[] args) throws InterruptedException, LineUnavailableException {
QwenTtsRealtimeParam param = QwenTtsRealtimeParam.builder()
.model("qwen3-tts-flash-realtime")
// The following URL is for the Singapore region. If you use a model in the China (Beijing) region, replace the URL with wss://dashscope.aliyuncs.com/api-ws/v1/realtime.
.url("wss://dashscope-intl.aliyuncs.com/api-ws/v1/realtime")
// The API keys for the Singapore and China (Beijing) regions are different. To obtain an API key, see https://www.alibabacloud.com/help/en/model-studio/get-api-key.
.apikey(System.getenv("DASHSCOPE_API_KEY"))
.build();
AtomicReference<CountDownLatch> completeLatch = new AtomicReference<>(new CountDownLatch(1));
final AtomicReference<QwenTtsRealtime> qwenTtsRef = new AtomicReference<>(null);
// Create a real-time audio player instance.
RealtimePcmPlayer audioPlayer = new RealtimePcmPlayer(24000);
QwenTtsRealtime qwenTtsRealtime = new QwenTtsRealtime(param, new QwenTtsRealtimeCallback() {
@Override
public void onOpen() {
// Handle the event when the connection is established.
}
@Override
public void onEvent(JsonObject message) {
String type = message.get("type").getAsString();
switch(type) {
case "session.created":
// Handle the event when the session is created.
break;
case "response.audio.delta":
String recvAudioB64 = message.get("delta").getAsString();
// Play the audio in real time.
audioPlayer.write(recvAudioB64);
break;
case "response.done":
// Handle the event when the response is complete.
break;
case "session.finished":
// Handle the event when the session is finished.
completeLatch.get().countDown();
break;
default:
break;
}
}
@Override
public void onClose(int code, String reason) {
// Handle the event when the connection is closed.
}
});
qwenTtsRef.set(qwenTtsRealtime);
try {
qwenTtsRealtime.connect();
} catch (NoApiKeyException e) {
throw new RuntimeException(e);
}
QwenTtsRealtimeConfig config = QwenTtsRealtimeConfig.builder()
.voice("Cherry")
.responseFormat(QwenTtsRealtimeAudioFormat.PCM_24000HZ_MONO_16BIT)
.mode("server_commit")
.build();
qwenTtsRealtime.updateSession(config);
for (String text:textToSynthesize) {
qwenTtsRealtime.appendText(text);
Thread.sleep(100);
}
qwenTtsRealtime.finish();
completeLatch.get().await();
qwenTtsRealtime.close();
// Wait for the audio to finish playing and then shut down the player.
audioPlayer.waitForComplete();
audioPlayer.shutdown();
System.exit(0);
}
}
Commit mode
// The DashScope SDK version must be 2.21.16 or later.
import com.alibaba.dashscope.audio.qwen_tts_realtime.*;
import com.alibaba.dashscope.exception.NoApiKeyException;
import com.google.gson.JsonObject;
import javax.sound.sampled.LineUnavailableException;
import javax.sound.sampled.SourceDataLine;
import javax.sound.sampled.AudioFormat;
import javax.sound.sampled.DataLine;
import javax.sound.sampled.AudioSystem;
import java.io.File;
import java.io.FileNotFoundException;
import java.io.FileOutputStream;
import java.io.IOException;
import java.util.Base64;
import java.util.Queue;
import java.util.Scanner;
import java.util.concurrent.CountDownLatch;
import java.util.concurrent.atomic.AtomicReference;
import java.util.concurrent.ConcurrentLinkedQueue;
import java.util.concurrent.atomic.AtomicBoolean;
public class Commit {
// Real-time PCM audio player class
public static class RealtimePcmPlayer {
private int sampleRate;
private SourceDataLine line;
private AudioFormat audioFormat;
private Thread decoderThread;
private Thread playerThread;
private AtomicBoolean stopped = new AtomicBoolean(false);
private Queue<String> b64AudioBuffer = new ConcurrentLinkedQueue<>();
private Queue<byte[]> rawAudioBuffer = new ConcurrentLinkedQueue<>();
// The constructor initializes the audio format and audio line.
public RealtimePcmPlayer(int sampleRate) throws LineUnavailableException {
this.sampleRate = sampleRate;
this.audioFormat = new AudioFormat(this.sampleRate, 16, 1, true, false);
DataLine.Info info = new DataLine.Info(SourceDataLine.class, audioFormat);
line = (SourceDataLine) AudioSystem.getLine(info);
line.open(audioFormat);
line.start();
decoderThread = new Thread(new Runnable() {
@Override
public void run() {
while (!stopped.get()) {
String b64Audio = b64AudioBuffer.poll();
if (b64Audio != null) {
byte[] rawAudio = Base64.getDecoder().decode(b64Audio);
rawAudioBuffer.add(rawAudio);
} else {
try {
Thread.sleep(100);
} catch (InterruptedException e) {
throw new RuntimeException(e);
}
}
}
}
});
playerThread = new Thread(new Runnable() {
@Override
public void run() {
while (!stopped.get()) {
byte[] rawAudio = rawAudioBuffer.poll();
if (rawAudio != null) {
try {
playChunk(rawAudio);
} catch (IOException e) {
throw new RuntimeException(e);
} catch (InterruptedException e) {
throw new RuntimeException(e);
}
} else {
try {
Thread.sleep(100);
} catch (InterruptedException e) {
throw new RuntimeException(e);
}
}
}
}
});
decoderThread.start();
playerThread.start();
}
// Play an audio chunk and block until playback is complete.
private void playChunk(byte[] chunk) throws IOException, InterruptedException {
if (chunk == null || chunk.length == 0) return;
int bytesWritten = 0;
while (bytesWritten < chunk.length) {
bytesWritten += line.write(chunk, bytesWritten, chunk.length - bytesWritten);
}
// Duration of the chunk in milliseconds: 16-bit mono PCM is sampleRate * 2 bytes per second.
int audioLengthMs = chunk.length / (this.sampleRate * 2 / 1000);
// Wait for the audio in the buffer to finish playing; guard against a negative sleep for very short chunks.
Thread.sleep(Math.max(0, audioLengthMs - 10));
}
public void write(String b64Audio) {
b64AudioBuffer.add(b64Audio);
}
public void cancel() {
b64AudioBuffer.clear();
rawAudioBuffer.clear();
}
public void waitForComplete() throws InterruptedException {
// Wait for all audio data in the buffers to finish playing.
while (!b64AudioBuffer.isEmpty() || !rawAudioBuffer.isEmpty()) {
Thread.sleep(100);
}
// Wait for the audio line to finish playing.
line.drain();
}
public void shutdown() throws InterruptedException {
stopped.set(true);
decoderThread.join();
playerThread.join();
if (line != null && line.isRunning()) {
line.drain();
line.close();
}
}
}
public static void main(String[] args) throws InterruptedException, LineUnavailableException, FileNotFoundException {
Scanner scanner = new Scanner(System.in);
QwenTtsRealtimeParam param = QwenTtsRealtimeParam.builder()
.model("qwen3-tts-flash-realtime")
// The following URL is for the Singapore region. If you use a model in the China (Beijing) region, replace the URL with wss://dashscope.aliyuncs.com/api-ws/v1/realtime.
.url("wss://dashscope-intl.aliyuncs.com/api-ws/v1/realtime")
// The API keys for the Singapore and China (Beijing) regions are different. To obtain an API key, see https://www.alibabacloud.com/help/en/model-studio/get-api-key.
.apikey(System.getenv("DASHSCOPE_API_KEY"))
.build();
AtomicReference<CountDownLatch> completeLatch = new AtomicReference<>(new CountDownLatch(1));
// Create a real-time player instance.
RealtimePcmPlayer audioPlayer = new RealtimePcmPlayer(24000);
final AtomicReference<QwenTtsRealtime> qwenTtsRef = new AtomicReference<>(null);
QwenTtsRealtime qwenTtsRealtime = new QwenTtsRealtime(param, new QwenTtsRealtimeCallback() {
// File file = new File("result_24k.pcm");
// FileOutputStream fos = new FileOutputStream(file);
@Override
public void onOpen() {
System.out.println("connection opened");
System.out.println("Enter text and press Enter to send. Enter 'quit' to exit the program.");
}
@Override
public void onEvent(JsonObject message) {
String type = message.get("type").getAsString();
switch(type) {
case "session.created":
System.out.println("start session: " + message.get("session").getAsJsonObject().get("id").getAsString());
break;
case "response.audio.delta":
String recvAudioB64 = message.get("delta").getAsString();
// To also save the audio to a file, decode it and write it to the file output stream:
// byte[] rawAudio = Base64.getDecoder().decode(recvAudioB64);
// fos.write(rawAudio);
// Play the audio in real time.
audioPlayer.write(recvAudioB64);
break;
case "response.done":
System.out.println("response done");
// Wait for the audio to finish playing.
try {
audioPlayer.waitForComplete();
} catch (InterruptedException e) {
throw new RuntimeException(e);
}
// Prepare for the next input.
completeLatch.get().countDown();
break;
case "session.finished":
System.out.println("session finished");
if (qwenTtsRef.get() != null) {
System.out.println("[Metric] response: " + qwenTtsRef.get().getResponseId() +
", first audio delay: " + qwenTtsRef.get().getFirstAudioDelay() + " ms");
}
completeLatch.get().countDown();
break;
default:
break;
}
}
@Override
public void onClose(int code, String reason) {
System.out.println("connection closed code: " + code + ", reason: " + reason);
try {
// fos.close();
// Wait for playback to complete and then shut down the player.
audioPlayer.waitForComplete();
audioPlayer.shutdown();
} catch (InterruptedException e) {
throw new RuntimeException(e);
}
}
});
qwenTtsRef.set(qwenTtsRealtime);
try {
qwenTtsRealtime.connect();
} catch (NoApiKeyException e) {
throw new RuntimeException(e);
}
QwenTtsRealtimeConfig config = QwenTtsRealtimeConfig.builder()
.voice("Cherry")
.responseFormat(QwenTtsRealtimeAudioFormat.PCM_24000HZ_MONO_16BIT)
.mode("commit")
.build();
qwenTtsRealtime.updateSession(config);
// Loop to read user input.
while (true) {
System.out.print("Enter the text to synthesize: ");
String text = scanner.nextLine();
// If the user enters 'quit', exit the program.
if ("quit".equalsIgnoreCase(text.trim())) {
System.out.println("Closing the connection...");
qwenTtsRealtime.finish();
completeLatch.get().await();
break;
}
// If the user input is empty, skip.
if (text.trim().isEmpty()) {
continue;
}
// Reinitialize the countdown latch.
completeLatch.set(new CountDownLatch(1));
// Send the text.
qwenTtsRealtime.appendText(text);
qwenTtsRealtime.commit();
// Wait for the current synthesis to complete.
completeLatch.get().await();
}
// Clean up resources.
audioPlayer.waitForComplete();
audioPlayer.shutdown();
scanner.close();
System.exit(0);
}
}
To download more sample code, visit GitHub.
Request parameters
Configure the following request parameters using the chained methods or setters of the `QwenTtsRealtimeParam` object, then pass the object to the `QwenTtsRealtime` constructor. See the example after the table.
Parameter | Type | Description |
model | String | The model name. For more information, see Supported models. |
url | String | The WebSocket endpoint. China (Beijing): wss://dashscope.aliyuncs.com/api-ws/v1/realtime. International (Singapore): wss://dashscope-intl.aliyuncs.com/api-ws/v1/realtime. |
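For example, a minimal parameter object mirroring the samples above (the Singapore endpoint and the DASHSCOPE_API_KEY environment variable are the same assumptions those samples make):
QwenTtsRealtimeParam param = QwenTtsRealtimeParam.builder()
        .model("qwen3-tts-flash-realtime")
        .url("wss://dashscope-intl.aliyuncs.com/api-ws/v1/realtime")
        .apikey(System.getenv("DASHSCOPE_API_KEY"))
        .build();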
Configure the following request parameters using the chained methods or setters of the `QwenTtsRealtimeConfig` object, then pass the object to the `updateSession` method. See the example after the table.
Parameter | Type | Description |
voice | String | The voice used for speech synthesis. Both system voices and custom voices are supported. For more information, see Supported voices. |
languageType | String | Specifies the language of the synthesized audio. |
mode | String | The interaction mode. Valid values: "server_commit" and "commit". The default value is "server_commit". In server_commit mode, the server decides when to commit appended text and start synthesis. In commit mode, you must call commit() to trigger synthesis of the appended text. |
format | String | The format of the audio output from the model. Qwen-TTS-Realtime (see Supported models) supports only pcm. |
sampleRate | int | The sample rate (in Hz) of the audio output from the model. Qwen-TTS-Realtime (see Supported models) supports only 24000. |
speechRate | float | The speech rate of the audio. A value of 1.0 indicates a normal speed. A value less than 1.0 indicates a slower speed, and a value greater than 1.0 indicates a faster speed. Default value: 1.0. Valid values: [0.5, 2.0]. Qwen-TTS-Realtime (see Supported models) does not support this parameter. |
volume | int | The volume of the audio. Default value: 50. Value range: [0, 100]. Qwen-TTS-Realtime (see Supported models) does not support this parameter. |
pitchRate | float | The pitch of the synthesized audio. Default value: 1.0. Value range: [0.5, 2.0]. Qwen-TTS-Realtime (see Supported models) does not support this parameter. |
bitRate | int | Specifies the bitrate (in kbps) of the audio. A higher bitrate results in better audio quality and a larger file size. This parameter takes effect only when the audio format (format) is a compressed format such as opus. Default value: 128. Value range: [6, 510]. Qwen-TTS-Realtime (see Supported models) does not support this parameter. |
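A minimal session configuration, using only the builder methods that appear in the samples above (voice, responseFormat, and mode); the remaining parameters in the table are assumed to follow the same builder pattern:
QwenTtsRealtimeConfig config = QwenTtsRealtimeConfig.builder()
        .voice("Cherry")
        .responseFormat(QwenTtsRealtimeAudioFormat.PCM_24000HZ_MONO_16BIT)
        .mode("server_commit")
        .build();
qwenTtsRealtime.updateSession(config);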
Key interfaces
QwenTtsRealtime class
Import method:
import com.alibaba.dashscope.audio.qwen_tts_realtime.QwenTtsRealtime;
Member method | Method signature | Server response event (sent via callback) | Description |
connect | | The session is created. The session configuration is updated. | Creates a connection with the server. |
updateSession | | The session configuration is updated. | Updates the default configuration for the current session interaction. For parameter settings, see the "Request parameters" section. After you establish a connection, the server returns the default configurations for the session. To update these configurations, call this interface right after the connection is established. After the server receives the session.update event, it performs parameter verification. If the parameters are invalid, an error is returned. Otherwise, the session configuration on the server is updated. |
appendText | | None | Appends a text segment to the input text buffer on the cloud. The buffer is temporary storage where you can write text and commit it later. |
clearAppendedText | | The text received by the server is cleared. | Deletes the text in the current cloud buffer. |
commit | | The text is committed and speech synthesis is triggered. A new output item is added to the response. New output content is added to the assistant message item. The model incrementally generates audio. Audio generation is complete. The streaming output of the audio content for the assistant message is complete. The streaming of the entire output item for the assistant message is complete. The response is complete. | Commits the text previously added to the cloud buffer using `appendText` and immediately synthesizes all the text. An error occurs if the input text buffer is empty. |
finish | | The response is complete. | Stops the task. |
close | | None | Closes the connection. |
getSessionId | | None | Gets the session_id of the current task. |
getResponseId | | None | Gets the response_id of the most recent response. |
getFirstAudioDelay | | None | Gets the first audio packet delay. |
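Taken together, a typical server-commit call sequence looks like the following sketch, distilled from the full sample above. It assumes param, config, and a callback are defined as in the samples; error handling beyond connect() (which declares NoApiKeyException) is omitted:
QwenTtsRealtime tts = new QwenTtsRealtime(param, callback);
try {
    tts.connect();             // the server replies with session.created via the callback
} catch (NoApiKeyException e) {
    throw new RuntimeException(e);
}
tts.updateSession(config);     // apply voice, format, and mode settings for this session
tts.appendText("Hello,");      // stream text into the cloud input buffer
tts.appendText(" world!");
tts.finish();                  // no more text; the server synthesizes the remaining buffer
// ... wait for session.finished in the callback, then:
tts.close();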
Callback interface (QwenTtsRealtimeCallback)
Method | Parameters | Return value | Description |
onOpen | None | None | This method is called immediately after a connection is established with the server. |
onEvent | message: The server response event. | None | Includes responses to interface calls and the text and audio generated by the model. For more information, see Server events. |
onClose | code: The status code for closing the WebSocket. reason: The reason for closing the WebSocket. | None | This method is called after the server closes the connection. |
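A minimal callback sketch, handling only the event types that appear in the samples above; it assumes the same imports as the full samples (JsonObject, Base64, and the qwen_tts_realtime package):
QwenTtsRealtimeCallback callback = new QwenTtsRealtimeCallback() {
    @Override
    public void onOpen() {
        System.out.println("connection opened");
    }
    @Override
    public void onEvent(JsonObject message) {
        String type = message.get("type").getAsString();
        if ("response.audio.delta".equals(type)) {
            // Each delta carries a Base64-encoded PCM chunk: decode it, then hand it to your player or file writer.
            byte[] pcm = Base64.getDecoder().decode(message.get("delta").getAsString());
        }
    }
    @Override
    public void onClose(int code, String reason) {
        System.out.println("connection closed, code: " + code + ", reason: " + reason);
    }
};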