Qwen-Omni-Realtime processes streaming audio and image inputs (including video frames) and generates text and audio responses in real time.
Supported regions: Singapore, China (Beijing). Each region requires its own API key.
How to use
1. Establish connection
Qwen-Omni-Realtime supports WebSocket and WebRTC. WebSocket suits server-side integration with quick setup. WebRTC targets browser-based low-latency voice scenarios, transmitting audio over UDP with built-in echo cancellation and noise reduction.
WebSocket
Native WebSocket
Connection parameters:
|
Parameter |
Description |
|
Endpoint |
China (Beijing) region: wss://dashscope.aliyuncs.com/api-ws/v1/realtime Singapore region: wss://{WorkspaceId}.ap-southeast-1.maas.aliyuncs.com/api-ws/v1/realtime. Replace {WorkspaceId} with your actual workspace ID. |
|
Query parameter |
Use the model query parameter to specify the model name, for example ?model=qwen3.5-omni-plus-realtime. |
|
Request header |
Use a Bearer token for authentication: Authorization: Bearer DASHSCOPE_API_KEY
|
# pip install websocket-client
import json
import websocket
import os
# Read the key from the environment and stop early with a readable message:
# building the Authorization header from None would otherwise raise a bare TypeError.
API_KEY = os.getenv("DASHSCOPE_API_KEY")
if not API_KEY:
    raise RuntimeError("Set the DASHSCOPE_API_KEY environment variable before running this example.")

# Singapore endpoint: replace {WorkspaceId} with your actual workspace ID.
# The model is selected through the `model` query parameter.
API_URL = "wss://{WorkspaceId}.ap-southeast-1.maas.aliyuncs.com/api-ws/v1/realtime?model=qwen3.5-omni-plus-realtime"

# Extra handshake headers are passed as "Name: value" strings.
headers = [f"Authorization: Bearer {API_KEY}"]


def on_open(ws):
    """Report that the WebSocket connection has been opened."""
    print(f"Connected to server: {API_URL}")


def on_message(ws, message):
    """Decode one server event from JSON and pretty-print it."""
    data = json.loads(message)
    print("Received event:", json.dumps(data, indent=2))


def on_error(ws, error):
    """Print any error reported by the WebSocket client."""
    print("Error:", error)


ws = websocket.WebSocketApp(
    API_URL,
    header=headers,
    on_open=on_open,
    on_message=on_message,
    on_error=on_error,
)
# Runs the client event loop; returns when the connection is closed.
ws.run_forever()
DashScope Python SDK
# SDK version 1.23.9 or later is required.
import os
import json
from dashscope.audio.qwen_omni import OmniRealtimeConversation,OmniRealtimeCallback
import dashscope
# The Singapore and China (Beijing) regions use different API keys. To get an API key, see https://www.alibabacloud.com/help/en/model-studio/get-api-key.
# Without a configured environment variable, replace the next line with dashscope.api_key = "sk-xxx".
dashscope.api_key = os.getenv("DASHSCOPE_API_KEY")


class PrintCallback(OmniRealtimeCallback):
    """Callback that echoes connection state changes and every server event."""

    def on_open(self) -> None:
        """Announce that the connection is up."""
        print("Connected Successfully")

    def on_event(self, response: dict) -> None:
        """Dump a server event as indented JSON, leaving non-ASCII text unescaped."""
        print("Received event:")
        rendered = json.dumps(response, indent=2, ensure_ascii=False)
        print(rendered)

    def on_close(self, close_status_code: int, close_msg: str) -> None:
        """Report the close code and message supplied by the SDK."""
        print(f"Connection closed (code={close_status_code}, msg={close_msg}).")


callback = PrintCallback()
conversation = OmniRealtimeConversation(
    model="qwen3.5-omni-plus-realtime",
    callback=callback,
    # Singapore-region URL. Replace WorkspaceId with your actual workspace ID; URLs vary by region.
    url="wss://{WorkspaceId}.ap-southeast-1.maas.aliyuncs.com/api-ws/v1/realtime",
)

try:
    conversation.connect()
    print("Conversation started. Press Ctrl+C to exit.")
    # Keep the main thread alive for as long as the SDK's connection thread runs.
    conversation.thread.join()
except KeyboardInterrupt:
    conversation.close()
DashScope Java SDK
// SDK version 2.20.9 or later is required.
import com.alibaba.dashscope.audio.omni.*;
import com.alibaba.dashscope.exception.NoApiKeyException;
import com.google.gson.JsonObject;
import java.util.concurrent.CountDownLatch;
/**
 * Minimal connectivity check: opens a realtime conversation, prints every
 * server event, and exits once the server closes the connection.
 */
public class Main {
    public static void main(String[] args) throws InterruptedException, NoApiKeyException {
        // Released by onClose so that main can block until the session ends.
        final CountDownLatch latch = new CountDownLatch(1);

        // Singapore-region URL. Replace WorkspaceId with your actual workspace ID; URLs vary by region.
        final String endpoint = "wss://{WorkspaceId}.ap-southeast-1.maas.aliyuncs.com/api-ws/v1/realtime";
        OmniRealtimeParam param = OmniRealtimeParam.builder()
                .model("qwen3.5-omni-plus-realtime")
                .apikey(System.getenv("DASHSCOPE_API_KEY"))
                .url(endpoint)
                .build();

        // Logs the connection lifecycle and echoes each event as raw JSON.
        OmniRealtimeCallback eventPrinter = new OmniRealtimeCallback() {
            @Override
            public void onOpen() {
                System.out.println("Connected Successfully");
            }

            @Override
            public void onEvent(JsonObject message) {
                System.out.println(message);
            }

            @Override
            public void onClose(int code, String reason) {
                System.out.println("connection closed code: " + code + ", reason: " + reason);
                latch.countDown();
            }
        };

        OmniRealtimeConversation conversation = new OmniRealtimeConversation(param, eventPrinter);
        conversation.connect();

        // Wait for onClose, then shut the conversation down and end the process.
        latch.await();
        conversation.close(1000, "bye");
        System.exit(0);
    }
}
WebRTC
Establishing a WebRTC connection involves two stages:
-
SDP exchange (HTTP): The client sends its media capabilities and network addresses (Offer SDP) to the server via HTTP POST. The server returns its information (Answer SDP) to complete the capability negotiation.
-
Connection (automatic): After the negotiation, the WebRTC layer automatically establishes the audio transport channel.
SDP exchange configuration:
|
Parameter |
Description |
|
Request URL |
POST https://{endpoint}/api/v1/webrtc/realtime The WebRTC feature is currently available by allowlist only. Contact your sales manager to get the endpoint. |
|
Query parameter |
Use the model query parameter to specify the model name, for example ?model=qwen3.5-omni-plus-realtime. |
|
Content-Type |
application/sdp |
|
Request header |
Authorization: Bearer DASHSCOPE_API_KEY |
|
Request body |
The client-generated Offer SDP string |
|
Response |
Success: HTTP 200 with the server Answer SDP string. Failure: HTTP 4xx with a JSON error message. |
Connection code examples:
# pip install aiortc aiohttp certifi
import asyncio, aiohttp, ssl, certifi
from aiortc import RTCPeerConnection, RTCConfiguration, RTCSessionDescription
from aiortc.mediastreams import AudioStreamTrack
API_KEY = "your-api-key"
MODEL = "qwen3.5-omni-plus-realtime"
SIGNALING_URL = f"https://{{endpoint}}/api/v1/webrtc/realtime?model={MODEL}"
async def connect():
pc = RTCPeerConnection(RTCConfiguration(iceServers=[]))
# Add an audio track to ensure the Offer SDP contains m=audio (required by the server)
pc.addTrack(AudioStreamTrack())
# Create a DataChannel to trigger SDP negotiation (name is customizable; the server pushes events through a channel named "txt")
pc.createDataChannel("oai-events")
# SDP exchange: create an Offer and send it to the server
offer = await pc.createOffer()
await pc.setLocalDescription(offer)
async with aiohttp.ClientSession() as session:
async with session.post(
SIGNALING_URL,
ssl=ssl.create_default_context(cafile=certifi.where()),
data=offer.sdp.encode("utf-8"),
headers={
"Content-Type": "application/sdp",
"Authorization": f"Bearer {API_KEY}",
},
) as resp:
if not resp.ok:
raise Exception(f"SDP exchange failed: {resp.status} {await resp.text()}")
answer_sdp = await resp.text()
print("=== Offer SDP ===")
print(offer.sdp)
print("=== Answer SDP ===")
print(answer_sdp)
# ICE connection is established automatically
await pc.setRemoteDescription(RTCSessionDescription(sdp=answer_sdp, type="answer"))
print("WebRTC connection established")
return pcconst API_KEY = 'your-api-key';
const API_URL = 'https://{endpoint}/api/v1/webrtc/realtime?model=qwen3.5-omni-plus-realtime';
async function connect() {
const pc = new RTCPeerConnection({ iceServers: [] });
// Add an audio track to ensure the Offer SDP contains m=audio (required by the server)
const stream = await navigator.mediaDevices.getUserMedia({ audio: true });
stream.getAudioTracks().forEach(t => pc.addTrack(t, stream));
// Create a DataChannel to trigger SDP negotiation (name is customizable; the server pushes events through a channel named "txt")
pc.createDataChannel('oai-events');
// Wait for ICE gathering to complete before sending the Offer to get the Answer
pc.onicegatheringstatechange = async () => {
if (pc.iceGatheringState !== 'complete') return;
const resp = await fetch(API_URL, {
method: 'POST',
headers: {
'Content-Type': 'application/sdp',
'Authorization': `Bearer ${API_KEY}`,
},
body: pc.localDescription.sdp,
});
if (!resp.ok) throw new Error('SDP exchange failed: ' + resp.status);
const answerSdp = await resp.text();
// ICE connection is established automatically
await pc.setRemoteDescription({ type: 'answer', sdp: answerSdp });
console.log('WebRTC connection established');
};
// Create the Offer
const offer = await pc.createOffer();
await pc.setLocalDescription(offer);
return pc;
}2. Configure session
Send the session.update client event:
{
// A client-generated event ID.
"event_id": "event_ToPZqeobitzUJnt3QqtWg",
// The event type. Must be "session.update".
"type": "session.update",
// The session configuration.
"session": {
// The output modality. Set this to ["text"] for text-only output, or ["text", "audio"] for both text and audio output.
"modalities": [
"text",
"audio"
],
// The voice for the audio output.
"voice": "Ethan",
// The input audio format. Only "pcm" is supported. The input audio must be a PCM audio stream at a 16 kHz sample rate.
"input_audio_format": "pcm",
// The output audio format. Only "pcm" is supported. The output audio is a PCM audio stream at a 24 kHz sample rate.
"output_audio_format": "pcm",
// A system instruction to define the model's goal or role.
"instructions": "You are an AI customer service agent for a five-star hotel. Answer customer inquiries about room types, facilities, prices, and booking policies accurately and in a friendly manner. Always respond with a professional and helpful attitude. Do not provide unconfirmed information or information beyond the scope of the hotel's services.",
// Enables server-side voice activity detection (VAD). If enabled, the server automatically detects the start and end of speech.
// If null, the client controls when to trigger model responses.
"turn_detection": {
// The VAD type. Valid values: "server_vad" and "semantic_vad". We recommend "semantic_vad" for the qwen3.5-omni-realtime model.
"type": "semantic_vad",
// The VAD detection threshold. We recommend increasing this value in noisy environments and decreasing it in quiet environments.
"threshold": 0.5,
// The silence duration in milliseconds (ms) that signals the end of an utterance. The model triggers a response if this duration is exceeded.
"silence_duration_ms": 800
}
}
}
3. Input audio and images
Audio input is required; image input is optional. The input method depends on the protocol.
WebSocket
Send Base64-encoded audio and image data to the server buffer using the input_audio_buffer.append and input_image_buffer.append events.
Images can come from local files or real-time video stream captures.
With server-side VAD enabled, the server automatically submits data and triggers a response at end-of-utterance. With VAD disabled (manual mode), call the input_audio_buffer.commit event to submit data after sending.
WebRTC
Audio and video tracks (RTP media channels) added during connection establishment transmit data to the server automatically.
-
Audio: Transmitted directly through the audio track (RTP). No input_audio_buffer.append events are needed.
-
Images: Sent as video frames through the video track (RTP). input_image_buffer.append is not supported.
WebRTC only supports server-side VAD mode (server_vad or semantic_vad). Manual mode is not supported.
4. Receive model responses
The response format depends on the configured output modality.
WebSocket
-
Text only
Receive streaming text with the response.text.delta event, and the complete text with the response.text.done event.
-
Text and audio
-
Text: Receive streaming text with the response.audio_transcript.delta event, and the complete text with the response.audio_transcript.done event.
-
Audio: Receive Base64-encoded streaming audio with the response.audio.delta event. The response.audio.done event indicates that audio generation is complete.
-
WebRTC
-
Text only
Same as WebSocket. Receive streaming text events through the DataChannel.
-
Text and audio
-
Text: Received through the DataChannel as streaming text events, same as WebSocket.
-
Audio: Received and played in real time through RTP tracks. No response.audio.delta events are needed.
-
Model selection
Qwen3.5-Omni-Realtime improves over Qwen3-Omni-Flash-Realtime in the following areas:
-
Intelligence level
On par with Qwen3.5-Plus.
-
Web search
Built-in web search — the model autonomously searches to answer real-time questions. For details, see Web search.
-
Tool calling
Function calling — the model autonomously invokes external tools. For details, see Qwen-Omni-Realtime series.
-
Semantic interruption
Identifies conversational intent to prevent interruptions from backchanneling and background noise.
-
Voice control
Control volume, speaking rate, and emotion via voice commands (e.g., "speak faster", "louder", "in a happy tone").
-
Supported languages
Supports speech recognition for 113 languages and dialects and speech generation for 36 languages and dialects.
-
Supported voices
Supports 55 voices, including 47 multilingual voices and 8 dialectal voices. For a complete list, see Voice list.
-
Voice cloning
Use a custom cloned voice for real-time conversations (Qwen3.5-omni-plus-realtime and Qwen3.5-omni-flash-realtime). For details, see Voice cloning.
Check the Model Studio console for model names, context, pricing, and snapshot versions. For concurrency rate limits, see Rate limits.
Limitations
-
Web search and tool calling are mutually exclusive.
-
A single WebSocket session can last up to 120 minutes. The connection closes automatically at this limit.
-
The model retains conversation history up to the following turn and duration limits. When exceeded, the oldest history is discarded. Max duration is the cumulative audio or video (image frame) duration retained in context.
Video is input as extracted frames (recommended: 1 fps). Video max duration is the cumulative frame duration retained — for example, 240 s means only frames from the last 240 seconds are kept.
The qwen3-omni-flash-realtime model has a limit of 8 dialog turns (typically reached first). Its duration limit depends on the model's context length and is not listed separately.
Model
Audio max turns
Video max turns
Audio max duration
Video max duration
qwen3.5-omni-plus-realtime
100 turns
50 turns
600 seconds
240 seconds
qwen3.5-omni-flash-realtime
80 turns
50 turns
480 seconds
120 seconds
qwen3-omni-flash-realtime
8 turns
8 turns
—
—
Getting started
Get an API key and set it as an environment variable.
Select a programming language and follow the steps to start a real-time chat.
WebSocket
DashScope Python SDK
-
Runtime environment
Ensure Python 3.10 or later is installed.
Install PyAudio for your operating system.
macOS
brew install portaudio && pip install pyaudio
Debian/Ubuntu
-
If you are not using a virtual environment, you can install it directly using the system package manager:
sudo apt-get install python3-pyaudio -
If you are using a virtual environment, first install the build dependencies:
sudo apt update
sudo apt install -y python3-dev portaudio19-dev
Then, install it with pip in the activated virtual environment:
pip install pyaudio
CentOS
sudo yum install -y portaudio portaudio-devel && pip install pyaudio
Windows
pip install pyaudio
Install the other dependencies:
pip install websocket-client dashscope
-
Interaction mode
-
VAD mode (Voice Activity Detection, automatically detects the start and end of speech)
The server responds after detecting the end of the user's speech.
-
Manual mode (press to speak, release to send)
The client controls the start and end of speech. After speaking, your application must notify the server.
VAD mode
Create a Python file named vad_dash.py and copy the following code into the file:
Run vad_dash.py to start a real-time conversation through your microphone. The system detects speech and streams audio to the server.
Manual mode
Create a Python file named manual_dash.py and copy the following code into the file:
Run manual_dash.py. Press Enter to start recording, and press Enter again to stop and send. The model's audio response plays automatically.
-
DashScope Java SDK
Select an interaction mode
-
VAD mode (Voice Activity Detection, automatically detects the start and end of speech)
The Realtime API detects when you start and stop speaking and responds.
-
Manual mode (press to talk, release to send)
The client controls the start and end of speech. After speaking, the client must send a message to the server.
VAD mode
Run OmniServerVad.main() to start a real-time conversation through your microphone. The system detects speech and sends audio to the server.
Manual mode
Run OmniWithoutServerVad.main(). Press Enter to start recording, and press it again to stop and send. The model's response plays automatically.
WebSocket (Python)
-
Prepare the runtime environment
Ensure Python 3.10 or later is installed.
Install pyaudio for your operating system.
macOS
brew install portaudio && pip install pyaudio
Debian/Ubuntu
sudo apt-get install python3-pyaudio or pip install pyaudio
We recommend using pip install pyaudio. If the installation fails, first install the portaudio dependency for your operating system.
CentOS
sudo yum install -y portaudio portaudio-devel && pip install pyaudio
Windows
pip install pyaudio
Install the WebSocket dependency:
pip install websockets==15.0.1
-
Create the client
Create a file named omni_realtime_client.py and copy the following code into it:
-
Select an interaction mode
-
VAD mode (Voice Activity Detection, automatically detects the start and end of speech)
The Realtime API detects when you start and stop speaking and generates a response.
-
Manual mode (press to speak, release to send)
You control when to start and stop sending audio. After speaking, the client must send a message to the server to generate a response.
VAD mode
In the same directory as omni_realtime_client.py, create a file named vad_mode.py and copy the following code into it:
Run vad_mode.py to start a real-time conversation through your microphone. The system detects speech and streams audio to the server.
Manual mode
In the same directory as omni_realtime_client.py, create a file named manual_mode.py and copy the following code into it:
Run manual_mode.py. Press Enter to start recording, and press Enter again to stop and send.
-
WebRTC
Python
-
Runtime environment
Python 3.10 or later is required. Install the following dependencies:
pip install aiortc aiohttp sounddevice numpy certifi av -
Run the demo
Create a Python file named webrtc_demo.py and paste the following code:
Run webrtc_demo.py to start a real-time conversation with the Qwen-Omni-Realtime model through your microphone. The system detects the start of your speech and sends audio to the server automatically.
JavaScript
-
Prerequisites
-
Use a modern browser that supports WebRTC (Chrome, Edge, Firefox, Safari, etc.).
-
The browser requires microphone permission.
-
Due to browser cross-origin security policies, the browser cannot directly send the connection request to the server. You need to run a curl command in the terminal to complete the connection setup.
-
-
Run the demo
Create an HTML file named webrtc_demo.html and paste the following code:
Open this file in a browser and follow these steps:
-
Click Start Session. The page automatically generates the Offer SDP and the corresponding curl command.
-
Click Copy curl Command and run it in your terminal. The output is the Answer SDP.
-
Paste the Answer SDP into the Answer SDP text box, then click Set Answer to establish the connection and start the voice chat.
-
Interaction flow
VAD mode
Set session.turn_detection.type in session.update to "server_vad" or "semantic_vad" to enable VAD mode. Suitable for voice call scenarios. Both WebSocket and WebRTC support VAD mode with the same server events; they differ only in how audio and images are transmitted.
WebRTC only supports VAD mode and does not support Manual mode. With WebRTC, audio is transmitted directly via RTP without sending input_audio_buffer.append events; images are transmitted via video tracks, and input_image_buffer.append events are not supported. Control commands and server events are transmitted via DataChannel with the same event types as WebSocket.
The interaction flow is as follows:
-
The client sends audio data. WebSocket sends it via input_audio_buffer.append events; WebRTC transmits it automatically via the audio track (RTP) without manually sending events.
-
The server detects the start of speech and sends the input_audio_buffer.speech_started event via DataChannel (WebRTC) or WebSocket.
-
The server detects the end of speech and sends the input_audio_buffer.speech_stopped event.
-
The server commits the audio buffer and sends the input_audio_buffer.committed event.
-
The server begins generating a response, sending conversation.item.created and other events. Audio responses are returned incrementally via the WebSocket response.audio.delta event, or transmitted directly via the WebRTC audio track (RTP).
-
During the response, the server returns incremental text transcription via response.audio_transcript.delta events, and sends the response.done event when the response is complete.
|
Lifecycle |
Client events |
Server events |
|
Session initialization |
Session configuration |
Session created. Session configuration updated. |
|
User audio input |
WebSocket: input_audio_buffer.append appends audio to the buffer. WebSocket: input_image_buffer.append appends an image to the buffer. WebRTC: Audio is transmitted automatically via the RTP audio track, and images are transmitted via the video track, so these append events are not needed. |
input_audio_buffer.speech_started Speech start detected. input_audio_buffer.speech_stopped Speech end detected. input_audio_buffer.committed Audio buffer committed. |
|
Server audio output |
None |
Response generation started. New output item added to the response. Conversation item created. New content part added to the assistant message. response.audio_transcript.delta Incrementally generated transcribed text. response.audio.delta WebSocket: Incrementally generated audio from the model is returned via this event. WebRTC: Audio is transmitted directly via the RTP audio track; this event is not returned. response.audio_transcript.done Text transcription completed. Audio generation completed. Streaming of the assistant's text or audio content is complete. The assistant's entire output item has finished streaming. Response completed.
User audio input transcription completed (requires enabling input_audio_transcription in session.update). |
Manual mode
Set session.turn_detection in session.update to null for manual mode. The client sends input_audio_buffer.commit and response.create to request a response. Suitable for push-to-talk scenarios such as voice messages in chat apps.
The interaction flow is as follows:
-
The client can send the input_audio_buffer.append and input_image_buffer.append events at any time to append audio and images to the buffer.
You must send at least one input_audio_buffer.append event before you send an input_image_buffer.append event.
-
The client sends the input_audio_buffer.commit event to commit the audio and image buffers, signaling to the server that all user input (audio and images) for the current turn has been sent.
-
The server responds with an input_audio_buffer.committed event.
-
The client sends the response.create event and waits for the server to return the model's output.
-
The server responds with a conversation.item.created event.
|
Lifecycle |
Client events |
Server events |
|
Session initialization |
Session configuration |
Session created. Session configuration updated. |
|
User audio input |
Appends audio to the buffer. Appends an image to the buffer. Commits the audio and image buffers. Requests a model response. |
Audio buffer committed. |
|
Server audio output |
Clears audio from the buffer. |
Response generation started. New output item added to the response. Conversation item created. New content part added to the assistant message item. response.audio_transcript.delta Incrementally generated transcribed text. Incrementally generated audio from the model. response.audio_transcript.done Text transcription completed. Audio generation completed. Streaming of the assistant's text or audio content is complete. The assistant's entire output item has finished streaming. Response completed. |
Web search
Web search lets the model use real-time data to answer questions about timely information such as stock prices and weather. The model automatically determines if a search is needed.
Only qwen3.5-omni-plus-realtime supports web search. It is disabled by default; enable it with session.update.
For billing, see the agent policy in billing rules.
Enable web search
Add the following parameters to the session.update event:
-
enable_search: Set to true to enable the web search feature.
-
search_options.enable_source: Set to true to include the sources of the search results in the response.
For more parameters, see session.update.
Response format
When web search is enabled, the usage object in response.done includes a plugins field with search metering information:
{
"usage": {
"total_tokens": 2937,
"input_tokens": 2554,
"output_tokens": 383,
"input_tokens_details": {
"text_tokens": 2512,
"audio_tokens": 42
},
"output_tokens_details": {
"text_tokens": 90,
"audio_tokens": 293
},
"plugins": {
"search": {
"count": 1,
"strategy": "agent"
}
}
}
}
Code example
Enable web search in a real-time conversation:
DashScope Python SDK
Pass the enable_search and search_options parameters in the update_session call:
import os
import base64
import time
import json
import pyaudio
from dashscope.audio.qwen_omni import MultiModality, AudioFormat, OmniRealtimeCallback, OmniRealtimeConversation
import dashscope
dashscope.api_key = os.getenv('DASHSCOPE_API_KEY')
# Singapore endpoint: replace {WorkspaceId} with your actual workspace ID.
url = 'wss://{WorkspaceId}.ap-southeast-1.maas.aliyuncs.com/api-ws/v1/realtime'
model = 'qwen3.5-omni-plus-realtime'
voice = 'Tina'


class SearchCallback(OmniRealtimeCallback):
    """Plays the model's audio and prints transcripts and web-search usage."""

    def __init__(self, pya):
        # Shared PyAudio instance used to open the playback stream.
        self.pya = pya
        # Playback stream; stays None until on_open runs.
        self.out = None

    def on_open(self):
        """Open the 24 kHz, 16-bit mono playback stream."""
        self.out = self.pya.open(format=pyaudio.paInt16, channels=1, rate=24000, output=True)

    def on_event(self, response):
        """Handle one server event, dispatching on its 'type' field."""
        event_type = response['type']
        if event_type == 'response.audio.delta':
            # Audio arrives Base64-encoded; decode and play it immediately.
            self.out.write(base64.b64decode(response['delta']))
        elif event_type == 'conversation.item.input_audio_transcription.delta':
            # Overwrite the current console line with the partial user transcript.
            preview = response.get('text', '') + response.get('stash', '')
            print(f"\r[User] {preview}", end='', flush=True)
        elif event_type == 'conversation.item.input_audio_transcription.completed':
            print(f"\r[User] {response['transcript']}")
        elif event_type == 'response.audio_transcript.done':
            print(f"[LLM] {response['transcript']}")
        elif event_type == 'response.done':
            # Search metering, when present, is reported under usage.plugins.search.
            usage = response.get('response', {}).get('usage', {})
            plugins = usage.get('plugins', {})
            if plugins.get('search'):
                print(f"[Search] count={plugins['search']['count']}, strategy={plugins['search']['strategy']}")


pya = pyaudio.PyAudio()
callback = SearchCallback(pya)
conv = OmniRealtimeConversation(model=model, callback=callback, url=url)
conv.connect()
conv.update_session(
    output_modalities=[MultiModality.AUDIO, MultiModality.TEXT],
    voice=voice,
    instructions="You are Xiaoyun, a personal assistant.",
    enable_search=True,
    search_options={'enable_source': True},
)
mic = pya.open(format=pyaudio.paInt16, channels=1, rate=16000, input=True)
print("Web search is enabled. Speak into the microphone (Ctrl+C to exit)...")
try:
    while True:
        audio_data = mic.read(3200, exception_on_overflow=False)
        conv.append_audio(base64.b64encode(audio_data).decode())
        time.sleep(0.01)
except KeyboardInterrupt:
    # Ctrl+C is the expected way to end the session.
    pass
finally:
    # Release everything on any exit path, not only on Ctrl+C, so a dropped
    # connection or audio error does not leak the session or the audio devices.
    conv.close()
    mic.close()
    if callback.out is not None:
        callback.out.close()
    pya.terminate()
print("\nConversation ended.")
DashScope Java SDK
In the updateSession method, pass the web search configuration in the parameters argument:
import com.alibaba.dashscope.audio.omni.*;
import com.alibaba.dashscope.exception.NoApiKeyException;
import com.google.gson.JsonObject;
import javax.sound.sampled.*;
import java.nio.ByteBuffer;
import java.util.*;
import java.util.concurrent.ConcurrentLinkedQueue;
import java.util.concurrent.atomic.AtomicBoolean;
/**
 * Real-time voice conversation with web search enabled: streams microphone
 * audio to the model, plays the audio reply, and prints the transcript and
 * the search metering reported in response.done.
 */
public class OmniSearch {

    /** Plays Base64-encoded audio chunks in arrival order on a dedicated thread. */
    static class SequentialAudioPlayer {
        private final SourceDataLine line;
        private final Queue<byte[]> audioQueue = new ConcurrentLinkedQueue<>();
        private final Thread playerThread;
        private final AtomicBoolean shouldStop = new AtomicBoolean(false);

        /**
         * Opens the output line (24 kHz, 16-bit, mono) and starts the playback thread.
         *
         * @throws LineUnavailableException if the output line cannot be opened
         */
        public SequentialAudioPlayer() throws LineUnavailableException {
            AudioFormat format = new AudioFormat(24000, 16, 1, true, false);
            line = AudioSystem.getSourceDataLine(format);
            line.open(format);
            line.start();
            playerThread = new Thread(() -> {
                while (!shouldStop.get()) {
                    byte[] audio = audioQueue.poll();
                    if (audio != null) {
                        line.write(audio, 0, audio.length);
                    } else {
                        // Nothing queued yet; back off briefly instead of spinning.
                        try { Thread.sleep(10); } catch (InterruptedException ignored) {}
                    }
                }
            }, "AudioPlayer");
            playerThread.start();
        }

        /** Decodes a Base64 audio chunk and queues it for playback. */
        public void play(String base64Audio) {
            audioQueue.add(Base64.getDecoder().decode(base64Audio));
        }

        /** Stops the playback thread, then drains and closes the output line. */
        public void close() {
            shouldStop.set(true);
            try { playerThread.join(1000); } catch (InterruptedException ignored) {}
            line.drain();
            line.close();
        }
    }

    public static void main(String[] args) {
        SequentialAudioPlayer player = null;
        try {
            player = new SequentialAudioPlayer();
            runConversation(player);
        } catch (NoApiKeyException e) {
            System.err.println("API key not found: Set the DASHSCOPE_API_KEY environment variable.");
        } catch (Exception e) {
            e.printStackTrace();
        } finally {
            // The playback thread is not a daemon thread, so it must be stopped on
            // every exit path or the JVM would keep running after a failure.
            if (player != null) {
                player.close();
            }
        }
    }

    /**
     * Connects, enables web search on the session, and streams microphone
     * audio until the connection closes.
     *
     * @param player sink for the audio chunks returned by the model
     * @throws Exception if the connection, session update, or audio capture fails
     */
    private static void runConversation(SequentialAudioPlayer player) throws Exception {
        // Set by onClose to end the capture loop.
        AtomicBoolean shouldStop = new AtomicBoolean(false);
        OmniRealtimeParam param = OmniRealtimeParam.builder()
                .model("qwen3.5-omni-plus-realtime")
                .apikey(System.getenv("DASHSCOPE_API_KEY"))
                .url("wss://{WorkspaceId}.ap-southeast-1.maas.aliyuncs.com/api-ws/v1/realtime")
                .build();
        OmniRealtimeConversation conversation = new OmniRealtimeConversation(param, new OmniRealtimeCallback() {
            @Override public void onOpen() {
                System.out.println("Connection established.");
            }
            @Override public void onClose(int code, String reason) {
                System.out.println("Connection closed.");
                shouldStop.set(true);
            }
            @Override public void onEvent(JsonObject event) {
                String type = event.get("type").getAsString();
                if ("response.audio.delta".equals(type)) {
                    player.play(event.get("delta").getAsString());
                } else if ("response.audio_transcript.done".equals(type)) {
                    System.out.println("[LLM] " + event.get("transcript").getAsString());
                } else if ("response.done".equals(type)) {
                    // Search metering, when present, is nested under response.usage.plugins.search.
                    JsonObject response = event.getAsJsonObject("response");
                    if (response != null && response.has("usage")) {
                        JsonObject usage = response.getAsJsonObject("usage");
                        if (usage.has("plugins")) {
                            JsonObject plugins = usage.getAsJsonObject("plugins");
                            if (plugins.has("search")) {
                                JsonObject search = plugins.getAsJsonObject("search");
                                System.out.println("[Search] count=" + search.get("count").getAsInt()
                                        + ", strategy=" + search.get("strategy").getAsString());
                            }
                        }
                    }
                }
            }
        });
        conversation.connect();

        TargetDataLine mic = null;
        try {
            conversation.updateSession(OmniRealtimeConfig.builder()
                    .modalities(Arrays.asList(OmniRealtimeModality.AUDIO, OmniRealtimeModality.TEXT))
                    .voice("Tina")
                    .enableTurnDetection(true)
                    .enableInputAudioTranscription(true)
                    .parameters(Map.of(
                            "instructions", "You are Xiaoyun, a personal assistant.",
                            "enable_search", true,
                            "search_options", Map.of("enable_source", true)
                    ))
                    .build()
            );
            System.out.println("Web search is enabled. Start speaking (press Ctrl+C to exit)...");

            AudioFormat format = new AudioFormat(16000, 16, 1, true, false);
            mic = AudioSystem.getTargetDataLine(format);
            mic.open(format);
            mic.start();

            byte[] chunk = new byte[3200];
            while (!shouldStop.get()) {
                int bytesRead = mic.read(chunk, 0, chunk.length);
                if (bytesRead > 0) {
                    // Encode only the bytes actually read; a short read must not
                    // resend stale data left in the rest of the buffer.
                    conversation.appendAudio(
                            Base64.getEncoder().encodeToString(Arrays.copyOf(chunk, bytesRead)));
                }
                Thread.sleep(20);
            }
        } finally {
            // Release the microphone and the session even when an error interrupts the loop.
            if (mic != null) {
                mic.close();
            }
            conversation.close(1000, "Normal termination");
        }
    }
}
WebSocket (Python)
Add the enable_search and search_options fields to the JSON payload of the session.update event:
import json
import os
import websocket
import base64
import pyaudio
import threading
# Stop early with a readable message; building the Authorization header from
# None would otherwise raise a bare TypeError.
API_KEY = os.getenv("DASHSCOPE_API_KEY")
if not API_KEY:
    raise RuntimeError("Set the DASHSCOPE_API_KEY environment variable before running this example.")
# Singapore endpoint: replace {WorkspaceId} with your actual workspace ID.
API_URL = "wss://{WorkspaceId}.ap-southeast-1.maas.aliyuncs.com/api-ws/v1/realtime?model=qwen3.5-omni-plus-realtime"

pya = pyaudio.PyAudio()
# Playback stream for the model's audio (24 kHz, 16-bit mono).
out_stream = pya.open(format=pyaudio.paInt16, channels=1, rate=24000, output=True)

# Signals the microphone thread(s) to stop once the connection has ended.
stop_streaming = threading.Event()
sender_threads = []


def on_open(ws):
    """Enable web search on the session and start streaming microphone audio."""
    ws.send(json.dumps({
        "type": "session.update",
        "session": {
            "modalities": ["text", "audio"],
            "voice": "Tina",
            "instructions": "You are Xiaoyun, a personal assistant.",
            "input_audio_format": "pcm",
            "output_audio_format": "pcm",
            "enable_search": True,
            "search_options": {
                "enable_source": True
            }
        }
    }))
    print("Web search is enabled. Speak into the microphone...")

    def send_audio():
        """Read microphone audio (16 kHz, 16-bit mono) and append it to the server buffer."""
        mic = pya.open(format=pyaudio.paInt16, channels=1, rate=16000, input=True)
        try:
            while not stop_streaming.is_set():
                audio = mic.read(3200, exception_on_overflow=False)
                ws.send(json.dumps({
                    "type": "input_audio_buffer.append",
                    "audio": base64.b64encode(audio).decode()
                }))
        except Exception as exc:
            # Sending fails once the socket is closed; report it instead of
            # silently swallowing the reason the stream ended.
            print(f"Audio streaming stopped: {exc}")
        finally:
            mic.close()

    sender = threading.Thread(target=send_audio, daemon=True)
    sender_threads.append(sender)
    sender.start()


def on_message(ws, message):
    """Play audio deltas and print the transcript and search metering."""
    event = json.loads(message)
    event_type = event["type"]
    if event_type == "response.audio.delta":
        out_stream.write(base64.b64decode(event["delta"]))
    elif event_type == "response.audio_transcript.done":
        print(f"[LLM] {event['transcript']}")
    elif event_type == "response.done":
        # Search metering, when present, is reported under usage.plugins.search.
        usage = event.get("response", {}).get("usage", {})
        plugins = usage.get("plugins", {})
        if plugins.get("search"):
            print(f"[Search] count={plugins['search']['count']}, strategy={plugins['search']['strategy']}")


def on_error(ws, error):
    """Print any error reported by the WebSocket client."""
    print(f"Error: {error}")


headers = [f"Authorization: Bearer {API_KEY}"]
ws = websocket.WebSocketApp(API_URL, header=headers, on_open=on_open, on_message=on_message, on_error=on_error)
try:
    ws.run_forever()
finally:
    # Let the microphone thread finish before the audio backend is shut down,
    # then release the playback stream and PyAudio.
    stop_streaming.set()
    for sender in sender_threads:
        sender.join(timeout=1)
    out_stream.close()
    pya.terminate()
API reference
Billing and rate limits
Billing
Billing is token-based, metered by modality (audio, image, text). Check the Model Studio console for pricing.
Rate limiting
For model rate limits, see Rate limits.
Error codes
If the model call fails and returns an error message, see Error codes for resolution.
Voice list
For a list of voices available for the Qwen-Omni-Realtime model, see Voices.