Qwen-Omni - Alibaba Cloud Model Studio - Alibaba Cloud Documentation Center

Qwen-Omni accepts text combined with a single additional modality, image, audio, or video, and generates text or speech responses. It delivers human-like voices and supports multilingual and dialect-based speech output. You can use it for content moderation, text creation, visual recognition, and audio-video interaction assistants.

Supported regions: Singapore, Beijing. Use the API key for your region.

Getting started

Prerequisites

You have configured the API key and set the API key as an environment variable.
The Qwen-Omni model supports only OpenAI-compatible invocation. Install the latest SDK: OpenAI Python SDK 1.52.0 or later, Node.js SDK 4.68.0 or later.

This example sends a text prompt to the Qwen-Omni API and returns a streaming response with both text and audio.

Python

import os
import base64
import soundfile as sf
import numpy as np
from openai import OpenAI

# 1. Initialize the client
client = OpenAI(
    api_key=os.getenv("DASHSCOPE_API_KEY"),  # Confirm that the environment variable is set
    base_url="https://dashscope-intl.aliyuncs.com/compatible-mode/v1",
)

# 2. Send the request
try:
    completion = client.chat.completions.create(
        model="qwen3.5-omni-plus",
        messages=[{"role": "user", "content": "Who are you?"}],
        modalities=["text", "audio"],  # Specify text and audio output
        audio={"voice": "Tina", "format": "wav"},
        stream=True,  # Must be set to True
        stream_options={"include_usage": True},
    )

    # 3. Process the streaming response and decode the audio
    print("Model response:")
    audio_base64_string = ""
    for chunk in completion:
        # Process the text part
        if chunk.choices and chunk.choices[0].delta.content:
            print(chunk.choices[0].delta.content, end="")

        # Collect the audio part
        if chunk.choices and hasattr(chunk.choices[0].delta, "audio") and chunk.choices[0].delta.audio:
            audio_base64_string += chunk.choices[0].delta.audio.get("data", "")

    # 4. Save the audio file
    if audio_base64_string:
        wav_bytes = base64.b64decode(audio_base64_string)
        audio_np = np.frombuffer(wav_bytes, dtype=np.int16)
        sf.write("audio_assistant.wav", audio_np, samplerate=24000)
        print("\nAudio file saved to: audio_assistant.wav")

except Exception as e:
    print(f"Request failed: {e}")

Node.js

// Before running:
// 1. Ensure Node.js version >= 14 is installed.
// 2. Run: npm install openai wav

import OpenAI from "openai";
import { createWriteStream } from 'node:fs';
import { Writer } from 'wav';

// Convert a Base64 string and save it as a WAV audio file
async function convertAudio(audioString, audioPath) {
    try {
        // Decode the Base64 string into a Buffer
        const wavBuffer = Buffer.from(audioString, 'base64');
        // Create a WAV file write stream
        const writer = new Writer({
            sampleRate: 24000,  // Sample rate
            channels: 1,        // Mono
            bitDepth: 16        // 16-bit depth
        });
        // Create an output file stream and establish a pipe connection
        const outputStream = createWriteStream(audioPath);
        writer.pipe(outputStream);

        // Write PCM data and end writing
        writer.write(wavBuffer);
        writer.end();

        // Wait for the file to finish writing
        await new Promise((resolve, reject) => {
            outputStream.on('finish', resolve);
            outputStream.on('error', reject);
        });

        // Add extra wait time to ensure audio integrity
        await new Promise(resolve => setTimeout(resolve, 800));

        console.log(`\nAudio file saved to: ${audioPath}`);
    } catch (error) {
        console.error('Error during audio processing:', error);
    }
}

// 1. Initialize the client
const openai = new OpenAI(
    {
        // If no environment variable is set, replace the next line with your Alibaba Cloud Model Studio API key: apiKey: "sk-xxx"
        apiKey: process.env.DASHSCOPE_API_KEY,
        // The following URL is for the Singapore region. If you use a model in the Beijing region, replace it with: https://dashscope.aliyuncs.com/compatible-mode/v1
        baseURL: "https://dashscope-intl.aliyuncs.com/compatible-mode/v1"
    }
);
// 2. Send the request
const completion = await openai.chat.completions.create({
    model: "qwen3.5-omni-plus",  
    messages: [
        {
            "role": "user",
            "content": "Who are you?"
        }],
    stream: true,
    stream_options: {
        include_usage: true
    },
    modalities: ["text", "audio"],
    audio: { voice: "Tina", format: "wav" }
});

let audioString = "";
console.log("Model response:")

// 3. Process the streaming response and decode the audio
for await (const chunk of completion) {
    if (Array.isArray(chunk.choices) && chunk.choices.length > 0) {
        // Process text content
        if (chunk.choices[0].delta.content) {
            process.stdout.write(chunk.choices[0].delta.content);
        }
        // Process audio content
        if (chunk.choices[0].delta.audio) {
            if (chunk.choices[0].delta.audio["data"]) {
                audioString += chunk.choices[0].delta.audio["data"];
            }
        }
    }
}
// 4. Save the audio file
convertAudio(audioString, "audio_assistant.wav");

curl

# ======= Important note =======
# API keys differ between Singapore and Beijing regions. Get an API key: https://www.alibabacloud.com/help/zh/model-studio/get-api-key
# The following URL is for the Singapore region. If you use a model in the Beijing region, replace it with: https://dashscope.aliyuncs.com/compatible-mode/v1/chat/completions
# === Delete this comment before execution ===

curl -X POST https://dashscope-intl.aliyuncs.com/compatible-mode/v1/chat/completions \
-H "Authorization: Bearer $DASHSCOPE_API_KEY" \
-H "Content-Type: application/json" \
-d '{
    "model": "qwen3.5-omni-plus",
    "messages": [
        {
            "role": "user",
            "content": "Who are you?"
        }
    ],
    "stream":true,
    "stream_options":{
        "include_usage":true
    },
    "modalities":["text","audio"],
    "audio":{"voice":"Tina","format":"wav"}
}'

Response

After you run the Python or Node.js code, the text response appears in the console and an audio file named audio_assistant.wav is saved in the same directory as your code file.

Model response:
I am a large language model developed by Alibaba Cloud. My name is Qwen. How can I help you?

Running HTTP code returns text and Base64-encoded audio data directly in the audio field.

data: {"choices":[{"delta":{"content":"I"},"finish_reason":null,"index":0,"logprobs":null}],"object":"chat.completion.chunk","usage":null,"created":1757647879,"system_fingerprint":null,"model":"qwen3.5-omni-plus","id":"chatcmpl-a68eca3b-c67e-4666-a72f-73c0b4919860"}
data: {"choices":[{"delta":{"content":"am"},"finish_reason":null,"index":0,"logprobs":null}],"object":"chat.completion.chunk","usage":null,"created":1757647879,"system_fingerprint":null,"model":"qwen3.5-omni-plus","id":"chatcmpl-a68eca3b-c67e-4666-a72f-73c0b4919860"}
......
data: {"choices":[{"delta":{"audio":{"data":"/v8AAAAAAAAAAAAAAA...","expires_at":1757647879,"id":"audio_a68eca3b-c67e-4666-a72f-73c0b4919860"}},"finish_reason":null,"index":0,"logprobs":null}],"object":"chat.completion.chunk","usage":null,"created":1757647879,"system_fingerprint":null,"model":"qwen3.5-omni-plus","id":"chatcmpl-a68eca3b-c67e-4666-a72f-73c0b4919860"}
data: {"choices":[{"finish_reason":"stop","delta":{"content":""},"index":0,"logprobs":null}],"object":"chat.completion.chunk","usage":null,"created":1764763585,"system_fingerprint":null,"model":"qwen3.5-omni-plus","id":"chatcmpl-e8c82e9e-073e-4289-a786-a20eb444ac9c"}
data: {"choices":[],"object":"chat.completion.chunk","usage":{"prompt_tokens":207,"completion_tokens":103,"total_tokens":310,"completion_tokens_details":{"audio_tokens":83,"text_tokens":20},"prompt_tokens_details":{"text_tokens":207}},"created":1757940330,"system_fingerprint":null,"model":"qwen3.5-omni-plus","id":"chatcmpl-9cdd5a26-f9e9-4eff-9dcc-93a878165afc"}

Model selection

Note

Qwen3.5-Omni is in preview. Model invocation is temporarily free, but tool calling still incurs fees, see Billing details.

Qwen3.5-Omni series: Best for long video analysis, meeting summaries, caption generation, content moderation, and audio-video interaction.
- Input limits: Up to 3 hours of audio or 1 hour of video
- Audio control: Supports adjusting volume, speaking rate, and emotion via instructions
- Visual capability: Matches Qwen3.5's level. Understands images, speech, ambient sounds, and other multimodal information
Qwen3-Omni-Flash series: Best for short video analysis and cost-sensitive scenarios.
- Input limits: Audio-video input under 150 seconds
- Thinking mode: The only Qwen-Omni series that supports thinking mode
Qwen-Omni-Turbo series
This series is no longer updated and has limited features. Migrate to the Qwen3.5-Omni or Qwen3-Omni-Flash series.

Series	Audio-video description	Deep thinking	Web search	Input languages	Output audio languages	Supported voices
Qwen3.5-Omni Latest-generation omni-modal model	Strong	Not supported	Supported	113 92 languages and 21 dialects Languages: Chinese, English, German, French, Italian, Czech, Indonesian, Thai, Korean, Polish, Japanese, Vietnamese, Finnish, Portuguese, Spanish, Dutch, Russian, Malay, Catalan, Swedish, Turkish, Ukrainian, Romanian, Slovak, Danish, Icelandic, Norwegian (Bokmål), Macedonian, Greek, Hungarian, Galician, Filipino, Croatian, Bosnian, Slovenian, Bulgarian, Kazakh, Belarusian, Latvian, Estonian, Azerbaijani, Uyghur, Swahili, Hindi, Esperanto, Kyrgyz, Tajik, Cebuano, Afrikaans, Arabic, Lithuanian, Javanese, Bengali, Persian, Hebrew, Punjabi, Gujarati, Mongolian, Asturian, Kannada, Marathi, Interlingua, Malayalam, Maltese, Norwegian Nynorsk, Telugu, Urdu, Georgian, Basque, Tamil, Odia, Serbian, Maori Dialects: Northeastern Mandarin, Guizhou dialect, Cantonese, Henan dialect, Hong Kong Cantonese, Shanghainese, Shaanxi dialect, Tianjin dialect, Taiwanese Hokkien, Yunnan dialect, Anhui dialect, Fujian dialect, Gansu dialect, Guangdong dialect, Hubei dialect, Hunan dialect, Jiangxi dialect, Shandong dialect, Shanxi dialect, Sichuan dialect, Guangxi dialect, Hainan dialect, Chongqing dialect, Changsha dialect, Hangzhou dialect, Hefei dialect, Yinchuan dialect, Zhengzhou dialect, Shenyang dialect, Wenzhou dialect, Wuhan dialect, Kunming dialect, Taiyuan dialect, Nanchang dialect, Jinan dialect, Lanzhou dialect, Nanjing dialect, Hakka, Southern Min	36 29 languages and 7 dialects Languages: Chinese, English, German, Italian, Portuguese, Spanish, Japanese, Korean, French, Russian, Thai, Indonesian, Arabic, Vietnamese, Turkish, Finnish, Polish, Hindi, Dutch, Czech, Urdu, Tagalog, Swedish, Danish, Hebrew, Icelandic, Malay, Norwegian, Persian Dialects: Sichuan dialect, Beijing dialect, Tianjin dialect, Nanjing dialect, Shaanxi dialect, Cantonese, Southern Min	55
Qwen3-Omni-Flash Hybrid thinking model	Weaker	Supported	Not supported	19 10 languages and 9 dialects Languages: Chinese, English, French, German, Russian, Italian, Spanish, Portuguese, Japanese, Korean Dialects: Sichuan dialect, Shanghainese, Cantonese, Southern Min, Shaanxi dialect, Nanjing dialect, Tianjin dialect, Beijing dialect	19 10 languages and 9 dialects Languages: Chinese, English, French, German, Russian, Italian, Spanish, Portuguese, Japanese, Korean Dialects: Sichuanese, Shanghainese, Cantonese, Hokkien, Shaanxi dialect, Nanjing dialect, Tianjin dialect, Beijing dialect	17 to 49 Varies by version
Qwen-Omni-Turbo No longer updated	None	Not supported	Not supported	Chinese, English	Chinese, English	4

For model names, context windows, pricing, and snapshot versions, see the Model list. For rate limits, see Rate limiting.

Model performance

Audio and video content analysis

Generate a comprehensive timestamped description of this video.

00:00.000 – 00:02.500

A rain-soaked city street fills the widescreen frame. Long-exposure photography creates streaks of red and blue car lights on wet pavement. A solitary man in a dark knee-length trench coat and light shirt with a tie walks toward the camera along the right sidewalk. Raindrops cling to his shoulders and hair; each step makes a muffled plop on damp concrete. A low, ominous electronic hum underscores ambient sounds, while steady rain crackles all around. Colorful graffiti covers the brick wall beside him, and neon signs glow in the distance—one clearly displays orange-red cursive script reading "Nice," while another vertical sign shows the letters "AT" at its bottom.

......

00:12.300 – 00:14.533

The camera tilts down. Inside a flashlight beam, a yellowed, aged single sheet lies in a shallow puddle, edges charred and corners curled. Inked paragraphs cover most of its surface, but moisture and scorch marks make the text unreadable. The man kneels, extending gloved fingers toward the document; ripples spread outward as his hand nears. Sirens peak then begin to fade slightly, while the ambient hum remains steady. The scene freezes just before the trembling paper is touched, ending the clip.

Only partial timestamps shown.

Multidialect understanding

Describe the vocal characteristics, emotional state, and full lyrics for singers in different Chinese dialects.

This audio features rap performances by singers from various regions of China, showcasing rich dialect diversity. Below are detailed descriptions of each singer's vocal traits, emotional state, and corresponding lyrics:

Hangzhou dialect (Wu)

Vocal traits:

The male singer's voice is soft and gentle, with the distinctive smoothness and warmth of Wu dialect. His intonation rises and falls gently, and his rhythm is light and quick, creating a relaxed storytelling effect.

Emotional state:

Relaxed, comfortable, and full of everyday life. He describes Hangzhou's slow-paced lifestyle and urban changes, expressing affection for his hometown and reflection on modern development. Overall, he feels cozy and familiar.

Lyrics:

> Hangzhou dialect, listen up! Our culture includes local erhua pronunciation.

> Like flowers blooming by West Lake, no cultural survey needed—we understand it best.

> Outsiders don't get it. "Ge," "lao'er," "fen'er," "ya'er"—can outsiders tell them apart?

> Let's go! Transport is fast—subways everywhere. We'll check it out.

> Welcoming Southeast Asia and Thailand, watching "Thirteen Ta" shows. Proud of our city, direct in personality.

> So you only came for the "bridge-crossing" experience, and because the flavor is different, you're not returning?

Only partial results shown.

Lyric caption generation

Transcribe the song lyrics and provide timestamps for each line in this format:[00:00:15,020 --> 00:00:28,085] : When you walk through a storm, hold your head up high.[00:00:28,085 --> 00:00:40,200] And don't be afraid of the dark.......

[00:00:12,680 --> 00:00:16,960] Cat thread sways past moonlight on trees.

[00:00:18,400 --> 00:00:22,800] Radiators hum 1998 chart hits.

[00:00:24.160 → 00:00:28.080] Time parts the mist-like heat waves.

[00:00:28,920 --> 00:00:33,000] Neon from the screen shines on my nose bridge.

......

[00:03:16,720 --> 00:03:21,680] We nestle in the softest ring of the tree trunk.

[00:03:22,400 --> 00:03:27,000] Breathing turns residual warmth into honey-sugar.

[00:03:28,160 --> 00:03:33,200] The sofa sinks into cloud-fluff shape.

[00:03:34,000 --> 00:03:38,800] Every pore soaks in sunshine.

[00:04:09,000 --> 00:04:10,020] (End)

Only partial results shown.

Audio-video programming

Usage

Streaming output

All requests to Qwen-Omni must set stream=True.

Model configuration

Configure parameters, prompts, and audio-video lengths based on your use case to balance cost, speed, and quality.

Audio-video understanding

Use case	Recommended video length	Recommended prompt	Recommended max_pixels
Fast review, low cost	≤60 minutes	Simple prompt within 50 words	230,400
Content extraction (long video segmentation)	≤60 minutes	Simple prompt within 50 words	921,600–2,073,600
Standard analysis (short video tagging)	≤4 minutes	Use the structured prompt below Recommended prompt Provide a detailed description of the video. It should explicitly include three sections: 1. A structured chronological storyline of every noticeable audio and visual details 2. A structured list of all visible text. For each text element, include start timestamp, end timestamp, the exact text content, the appearance characteristics. If no text appears, explicitly state so. 3. A structured speech-to-text transcription, include speaker（Corresponding to the character or voice‑over in Section 1, including their accent and tone）, exact spoken content, start timestamp, end timestamp, and speaking state (prosody, emotion, and style). If no speech appears, explicitly state so. Aside from these three required sections, you are free to organize any additional content in any way you find helpful. This additional content can include global information about the entire video or localized information about specific moments. You may choose the topic of this extra content freely. Output Format: ``` ## Storyline <xx:xx.xxx> - <xx:xx.xxx> <an unstructured long paragraph in natural language describing what happened during this period, blending both audio and video details.> <xx:xx.xxx> - <xx:xx.xxx> <an unstructured long paragraph in natural language describing what happened during this period, blending both audio and video details.> <xx:xx.xxx> - <xx:xx.xxx> <an unstructured long paragraph in natural language describing what happened during this period, blending both audio and video details.> ... ## Visible Text <xx:xx.xxx> - <xx:xx.xxx> "<element>": <appearance> "<element>": <appearance> <xx:xx.xxx> - <xx:xx.xxx> "<element>": <appearance> "<element>": <appearance> "<element>": <appearance> <xx:xx.xxx> - <xx:xx.xxx> "<element>": <appearance> ... ## Speakers and Transcript Speaker profiles: <speaker> - <profile> <speaker> - <profile> <speaker> - <profile> ... <xx:xx.xxx> - <xx:xx.xxx> Speaker: <speaker> State: <description> Content: "<content>" <xx:xx.xxx> - <xx:xx.xxx> Speaker: <speaker> State: <description> Content: "<content>" <xx:xx.xxx> - <xx:xx.xxx> Speaker: <speaker> State: <description> Content: "<content>" ... ## <another section> <paragraphs> ## <another section> <paragraphs> ... ```	921,600–2,073,600
Fine-grained analysis (multiple speakers/complex scenes)	≤2 minutes		2,073,600

Note

For fine-grained descriptions of long videos, segment them first.

Audio understanding

Balance cost and quality by controlling audio length and prompt complexity.

Use case	Recommended audio length	Recommended prompt
Fast review, low cost	≤60 minutes	Simple prompt within 50 words
Content extraction (segment long audio)	≤60 minutes	Simple prompt within 50 words
Standard analysis (audio tagging)	≤2 minutes	Use a structured prompt Structured prompt Provide a detailed description of the audio. It should explicitly include two sections: 1. A structured chronological storyline of every noticeable audio details 2. A structured speech-to-text transcription, include speaker（Corresponding to the character or voice‑over in Section 1, including their accent and tone）, exact spoken content, start timestamp, end timestamp, and speaking state (prosody, emotion, and style). If no speech appears, explicitly state so. Aside from these two required components, you are free to organize any additional content in any way you find helpful. This additional content can include global information about the entire audio or localized information about specific moments. You may choose the topic of this extra content freely. Output Format: ``` ## Storyline <xx:xx.xxx> - <xx:xx.xxx> <an unstructured long paragraph in natural language describing what happened during this period, blending both audio details.> <xx:xx.xxx> - <xx:xx.xxx> <an unstructured long paragraph in natural language describing what happened during this period, blending both audio details.> <xx:xx.xxx> - <xx:xx.xxx> <an unstructured long paragraph in natural language describing what happened during this period, blending both audio details.> ... ... ## Speakers and Transcript Speaker profiles: <speaker> - <profile> <speaker> - <profile> <speaker> - <profile> ... <xx:xx.xxx> - <xx:xx.xxx> Speaker: <speaker> State: <description> Content: "<content>" <xx:xx.xxx> - <xx:xx.xxx> Speaker: <speaker> State: <description> Content: "<content>" <xx:xx.xxx> - <xx:xx.xxx> Speaker: <speaker> State: <description> Content: "<content>" ... ## <another section> <paragraphs> ## <another section> <paragraphs> ... ```
Fine-grained analysis (multiple speakers/complex scenes)	≤1 minute

Note

For fine-grained descriptions of long audio, segment it first.

Multimodal input

Video and text input

Provide video as an image list or a video file (with audio support).

Video file (supports audio in the video)

Number of files:
- Qwen3.5-Omni series: Up to 512 files using public URLs; up to 250 files using Base64 encoding.
- Qwen3-Omni-Flash and Qwen-Omni-Turbo series: Only one file allowed.
File size:
- Qwen3.5-Omni: Up to 2 GB, up to 1 hour duration.
- Qwen3-Omni-Flash: Up to 256 MB, up to 150 seconds duration.
- Qwen-Omni-Turbo: Up to 150 MB, up to 40 seconds duration.
File formats: MP4, AVI, MKV, MOV, FLV, WMV, etc.
Visual and audio information in the video file are billed separately.

OpenAI compatible

Python

import os
from openai import OpenAI

client = OpenAI(
    # API keys differ between Singapore and Beijing regions. Get an API key: https://www.alibabacloud.com/help/zh/model-studio/get-api-key
    api_key=os.getenv("DASHSCOPE_API_KEY"),
    # The following URL is for the Singapore region. If you use a model in the Beijing region, replace it with: https://dashscope.aliyuncs.com/compatible-mode/v1
    base_url="https://dashscope-intl.aliyuncs.com/compatible-mode/v1",
)

completion = client.chat.completions.create(
    model="qwen3.5-omni-plus", # For Qwen3-Omni-Flash, run in non-thinking mode.
    messages=[
        {
            "role": "user",
            "content": [
                {
                    "type": "video_url",
                    "video_url": {
                        "url": "https://help-static-aliyun-doc.aliyuncs.com/file-manage-files/zh-CN/20241115/cqqkru/1.mp4"
                    },
                },
                {"type": "text", "text": "What is the video about?"},
            ],
        },
    ],
    # Set the output modality. Two options are currently supported: ["text","audio"] and ["text"]
    modalities=["text", "audio"],
    audio={"voice": "Tina", "format": "wav"},
    # stream must be set to True, otherwise an error occurs.
    stream=True,
    stream_options={"include_usage": True},
)

for chunk in completion:
    if chunk.choices:
        print(chunk.choices[0].delta)
    else:
        print(chunk.usage)

Node.js

import OpenAI from "openai";

const openai = new OpenAI(
    {
        // API keys differ between Singapore and Beijing regions. Get an API key: https://www.alibabacloud.com/help/zh/model-studio/get-api-key
        apiKey: process.env.DASHSCOPE_API_KEY,
        // The following URL is for the Singapore region. If you use a model in the Beijing region, replace it with: https://dashscope.aliyuncs.com/compatible-mode/v1
        baseURL: "https://dashscope-intl.aliyuncs.com/compatible-mode/v1"
    }
);
const completion = await openai.chat.completions.create({
    model: "qwen3.5-omni-plus", // For Qwen3-Omni-Flash, run in non-thinking mode.
    messages: [
        {
            "role": "user",
            "content": [{
                "type": "video_url",
                "video_url": { "url": "https://help-static-aliyun-doc.aliyuncs.com/file-manage-files/zh-CN/20241115/cqqkru/1.mp4" },
            },
            { "type": "text", "text": "What is the video about?" }]
        }],
    stream: true,
    stream_options: {
        include_usage: true
    },
    modalities: ["text", "audio"],
    audio: { voice: "Tina", format: "wav" }
});


for await (const chunk of completion) {
    if (Array.isArray(chunk.choices) && chunk.choices.length > 0) {
        console.log(chunk.choices[0].delta);
    } else {
        console.log(chunk.usage);
    }
}

curl

# ======= Important note =======
# API keys differ between Singapore and Beijing regions. Get an API key: https://www.alibabacloud.com/help/zh/model-studio/get-api-key
# The following URL is for the Singapore region. If you use a model in the Beijing region, replace it with: https://dashscope.aliyuncs.com/compatible-mode/v1/chat/completions
# === Delete this comment before execution ===

curl -X POST https://dashscope-intl.aliyuncs.com/compatible-mode/v1/chat/completions \
-H "Authorization: Bearer $DASHSCOPE_API_KEY" \
-H "Content-Type: application/json" \
-d '{
    "model": "qwen3.5-omni-plus",
    "messages": [
    {
      "role": "user",
      "content": [
        {
          "type": "video_url",
          "video_url": {
            "url": "https://help-static-aliyun-doc.aliyuncs.com/file-manage-files/zh-CN/20241115/cqqkru/1.mp4"
          }
        },
        {
          "type": "text",
          "text": "What is the video about"
        }
      ]
    }
  ],
    "stream":true,
    "stream_options": {
        "include_usage": true
    },
    "modalities":["text","audio"],
    "audio":{"voice":"Tina","format":"wav"}
}'

Image list format

Number of images

Qwen3.5-Omni: Minimum 2 images, maximum 2048 images
Qwen3-Omni-Flash: Minimum 2 images, maximum 128 images
Qwen-Omni-Turbo: Minimum 4 images, maximum 80 images

OpenAI compatible

Python

import os
from openai import OpenAI

client = OpenAI(
    # API keys differ between Singapore and Beijing regions. Get an API key: https://www.alibabacloud.com/help/zh/model-studio/get-api-key
    api_key=os.getenv("DASHSCOPE_API_KEY"),
    # The following URL is for the Singapore region. If you use a model in the Beijing region, replace it with: https://dashscope.aliyuncs.com/compatible-mode/v1
    base_url="https://dashscope-intl.aliyuncs.com/compatible-mode/v1",
)

completion = client.chat.completions.create(
    model="qwen3.5-omni-plus", # For Qwen3-Omni-Flash, run in non-thinking mode.
    messages=[
        {
            "role": "user",
            "content": [
                {
                    "type": "video",
                    "video": [
                        "https://help-static-aliyun-doc.aliyuncs.com/file-manage-files/zh-CN/20241108/xzsgiz/football1.jpg",
                        "https://help-static-aliyun-doc.aliyuncs.com/file-manage-files/zh-CN/20241108/tdescd/football2.jpg",
                        "https://help-static-aliyun-doc.aliyuncs.com/file-manage-files/zh-CN/20241108/zefdja/football3.jpg",
                        "https://help-static-aliyun-doc.aliyuncs.com/file-manage-files/zh-CN/20241108/aedbqh/football4.jpg",
                    ],
                },
                {"type": "text", "text": "Describe the process shown in this video"},
            ],
        }
    ],
    # Set the output modality. Two options are currently supported: ["text","audio"] and ["text"]
    modalities=["text", "audio"],
    audio={"voice": "Tina", "format": "wav"},
    # stream must be set to True, otherwise an error occurs.
    stream=True,
    stream_options={"include_usage": True},
)

for chunk in completion:
    if chunk.choices:
        print(chunk.choices[0].delta)
    else:
        print(chunk.usage)

Node.js

import OpenAI from "openai";

const openai = new OpenAI(
    {
        // API keys differ between Singapore and Beijing regions. Get an API key: https://www.alibabacloud.com/help/zh/model-studio/get-api-key
        apiKey: process.env.DASHSCOPE_API_KEY,
        // The following URL is for the Singapore region. If you use a model in the Beijing region, replace it with: https://dashscope.aliyuncs.com/compatible-mode/v1
        baseURL: "https://dashscope-intl.aliyuncs.com/compatible-mode/v1"
    }
);
const completion = await openai.chat.completions.create({
    model: "qwen3.5-omni-plus", // For Qwen3-Omni-Flash, run in non-thinking mode.
    messages: [{
        role: "user",
        content: [
            {
                type: "video",
                video: [
                    "https://help-static-aliyun-doc.aliyuncs.com/file-manage-files/zh-CN/20241108/xzsgiz/football1.jpg",
                    "https://help-static-aliyun-doc.aliyuncs.com/file-manage-files/zh-CN/20241108/tdescd/football2.jpg",
                    "https://help-static-aliyun-doc.aliyuncs.com/file-manage-files/zh-CN/20241108/zefdja/football3.jpg",
                    "https://help-static-aliyun-doc.aliyuncs.com/file-manage-files/zh-CN/20241108/aedbqh/football4.jpg"
                ]
            },
            {
                type: "text",
                text: "Describe the process shown in this video"
            }
        ]
    }],
    stream: true,
    stream_options: {
        include_usage: true
    },
    modalities: ["text", "audio"],
    audio: { voice: "Tina", format: "wav" }
});

for await (const chunk of completion) {
    if (Array.isArray(chunk.choices) && chunk.choices.length > 0) {
        console.log(chunk.choices[0].delta);
    } else {
        console.log(chunk.usage);
    }
}

curl

# ======= Important note =======
# API keys differ between Singapore and Beijing regions. Get an API key: https://www.alibabacloud.com/help/zh/model-studio/get-api-key
# The following URL is for the Singapore region. If you use a model in the Beijing region, replace it with: https://dashscope.aliyuncs.com/compatible-mode/v1/chat/completions
# === Delete this comment before execution ===

curl -X POST https://dashscope-intl.aliyuncs.com/compatible-mode/v1/chat/completions \
-H "Authorization: Bearer $DASHSCOPE_API_KEY" \
-H "Content-Type: application/json" \
-d '{
    "model": "qwen3.5-omni-plus",
    "messages": [
        {
            "role": "user",
            "content": [
                {
                    "type": "video",
                    "video": [
                        "https://help-static-aliyun-doc.aliyuncs.com/file-manage-files/zh-CN/20241108/xzsgiz/football1.jpg",
                        "https://help-static-aliyun-doc.aliyuncs.com/file-manage-files/zh-CN/20241108/tdescd/football2.jpg",
                        "https://help-static-aliyun-doc.aliyuncs.com/file-manage-files/zh-CN/20241108/zefdja/football3.jpg",
                        "https://help-static-aliyun-doc.aliyuncs.com/file-manage-files/zh-CN/20241108/aedbqh/football4.jpg"
                    ]
                },
                {
                    "type": "text",
                    "text": "Describe the process shown in this video"
                }
            ]
        }
    ],
    "stream": true,
    "stream_options": {
        "include_usage": true
    },
    "modalities": ["text", "audio"],
    "audio": {
        "voice": "Tina",
        "format": "wav"
    }
}'

Audio and text input

Number of files:
- Qwen3.5-Omni series: Up to 2048 files using public URLs; up to 250 files using Base64 encoding.
- Qwen3-Omni-Flash and Qwen-Omni-Turbo series: Only one file allowed.
File size:
- Qwen3.5-Omni: Up to 2 GB, up to 3 hours duration
- Qwen3-Omni-Flash: Up to 100 MB, up to 20 minutes duration
- Qwen-Omni-Turbo: Up to 10 MB, up to 3 minutes duration
File formats: AMR, WAV, 3GP, 3GPP, AAC, MP3, etc.

These examples use a public audio URL. To use a local file, see Send local files with Base64 encoding. Streaming output is required.

OpenAI compatible

Python

import os
from openai import OpenAI

client = OpenAI(
    # API keys differ between Singapore and Beijing regions. Get an API key: https://www.alibabacloud.com/help/zh/model-studio/get-api-key
    api_key=os.getenv("DASHSCOPE_API_KEY"),
    # The following URL is for the Singapore region. If you use a model in the Beijing region, replace it with: https://dashscope.aliyuncs.com/compatible-mode/v1
    base_url="https://dashscope-intl.aliyuncs.com/compatible-mode/v1",
)

completion = client.chat.completions.create(
    model="qwen3.5-omni-plus",# For Qwen3-Omni-Flash, run in non-thinking mode.
    messages=[
        {
            "role": "user",
            "content": [
                {
                    "type": "input_audio",
                    "input_audio": {
                        "data": "https://help-static-aliyun-doc.aliyuncs.com/file-manage-files/zh-CN/20250211/tixcef/cherry.wav",
                        "format": "wav",
                    },
                },
                {"type": "text", "text": "What is this audio about"},
            ],
        },
    ],
    # Set the output modality. Two options are currently supported: ["text","audio"] and ["text"]
    modalities=["text", "audio"],
    audio={"voice": "Tina", "format": "wav"},
    # stream must be set to True, otherwise an error occurs.
    stream=True,
    stream_options={"include_usage": True},
)

for chunk in completion:
    if chunk.choices:
        print(chunk.choices[0].delta)
    else:
        print(chunk.usage)

Node.js

import OpenAI from "openai";

const openai = new OpenAI(
    {
        // API keys differ between Singapore and Beijing regions. Get an API key: https://www.alibabacloud.com/help/zh/model-studio/get-api-key
        apiKey: process.env.DASHSCOPE_API_KEY,
        // The following URL is for the Singapore region. If you use a model in the Beijing region, replace it with: https://dashscope.aliyuncs.com/compatible-mode/v1
        baseURL: "https://dashscope-intl.aliyuncs.com/compatible-mode/v1"
    }
);
const completion = await openai.chat.completions.create({
    model: "qwen3.5-omni-plus", // For Qwen3-Omni-Flash, run in non-thinking mode.
    messages: [
        {
            "role": "user",
            "content": [{
                "type": "input_audio",
                "input_audio": { "data": "https://help-static-aliyun-doc.aliyuncs.com/file-manage-files/zh-CN/20250211/tixcef/cherry.wav", "format": "wav" },
            },
            { "type": "text", "text": "What is this audio about" }]
        }],
    stream: true,
    stream_options: {
        include_usage: true
    },
    modalities: ["text", "audio"],
    audio: { voice: "Tina", format: "wav" }
});

for await (const chunk of completion) {
    if (Array.isArray(chunk.choices) && chunk.choices.length > 0) {
        console.log(chunk.choices[0].delta);
    } else {
        console.log(chunk.usage);
    }
}

curl

# ======= Important note =======
# API keys differ between Singapore and Beijing regions. Get an API key: https://www.alibabacloud.com/help/zh/model-studio/get-api-key
# The following URL is for the Singapore region. If you use a model in the Beijing region, replace it with: https://dashscope.aliyuncs.com/compatible-mode/v1/chat/completions
# === Delete this comment before execution ===

curl -X POST https://dashscope-intl.aliyuncs.com/compatible-mode/v1/chat/completions \
-H "Authorization: Bearer $DASHSCOPE_API_KEY" \
-H "Content-Type: application/json" \
-d '{
    "model": "qwen3.5-omni-plus",
    "messages": [
    {
      "role": "user",
      "content": [
        {
          "type": "input_audio",
          "input_audio": {
            "data": "https://help-static-aliyun-doc.aliyuncs.com/file-manage-files/zh-CN/20250211/tixcef/cherry.wav",
            "format": "wav"
          }
        },
        {
          "type": "text",
          "text": "What is this audio about"
        }
      ]
    }
  ],
    "stream":true,
    "stream_options":{
        "include_usage":true
    },
    "modalities":["text","audio"],
    "audio":{"voice":"Tina","format":"wav"}
}'

Image and text input

Qwen-Omni supports multiple images per request. Image requirements:

Number of images:
- Public URL: Up to 2048 images
- Base64 encoding: Up to 250 images
Image size:
- Qwen3.5 series: Each image file must be ≤20 MB
- Qwen3-Omni-Flash and Qwen-Omni-Turbo series: Each image file must be ≤10 MB
Both width and height must exceed 10 pixels. Aspect ratio must not exceed 200:1 or 1:200.
Supported image types: See Image and video understanding.

These examples use a public image URL. To use a local file, see Send local files with Base64 encoding. Streaming output is required.

OpenAI compatible

Python

import os
from openai import OpenAI

client = OpenAI(
    # API keys differ between Singapore and Beijing regions. Get an API key: https://www.alibabacloud.com/help/zh/model-studio/get-api-key
    api_key=os.getenv("DASHSCOPE_API_KEY"),
    # The following URL is for the Singapore region. If you use a model in the Beijing region, replace it with: https://dashscope.aliyuncs.com/compatible-mode/v1
    base_url="https://dashscope-intl.aliyuncs.com/compatible-mode/v1",
)

completion = client.chat.completions.create(
    model="qwen3.5-omni-plus", # For Qwen3-Omni-Flash, run in non-thinking mode.
    messages=[
        {
            "role": "user",
            "content": [
                {
                    "type": "image_url",
                    "image_url": {
                        "url": "https://help-static-aliyun-doc.aliyuncs.com/file-manage-files/zh-CN/20241022/emyrja/dog_and_girl.jpeg"
                    },
                },
                {"type": "text", "text": "What scene is depicted in the image?"},
            ],
        },
    ],
    # Set the output modality. Two options are currently supported: ["text","audio"] and ["text"]
    modalities=["text", "audio"],
    audio={"voice": "Tina", "format": "wav"},
    # stream must be set to True, otherwise an error occurs.
    stream=True,
    stream_options={
        "include_usage": True
    }
)

for chunk in completion:
    if chunk.choices:
        print(chunk.choices[0].delta)
    else:
        print(chunk.usage)

Node.js

import OpenAI from "openai";

const openai = new OpenAI(
    {
        // API keys differ between Singapore and Beijing regions. Get an API key: https://www.alibabacloud.com/help/zh/model-studio/get-api-key
        apiKey: process.env.DASHSCOPE_API_KEY,
        // The following URL is for the Singapore region. If you use a model in the Beijing region, replace it with: https://dashscope.aliyuncs.com/compatible-mode/v1
        baseURL: "https://dashscope-intl.aliyuncs.com/compatible-mode/v1"
    }
);
const completion = await openai.chat.completions.create({
    model: "qwen3.5-omni-plus", // For Qwen3-Omni-Flash, run in non-thinking mode.
    messages: [
        {
            "role": "user",
            "content": [{
                "type": "image_url",
                "image_url": { "url": "https://help-static-aliyun-doc.aliyuncs.com/file-manage-files/zh-CN/20241022/emyrja/dog_and_girl.jpeg" },
            },
            { "type": "text", "text": "What scene is depicted in the image?" }]
        }],
    stream: true,
    stream_options: {
        include_usage: true
    },
    modalities: ["text", "audio"],
    audio: { voice: "Tina", format: "wav" }
});

for await (const chunk of completion) {
    if (Array.isArray(chunk.choices) && chunk.choices.length > 0) {
        console.log(chunk.choices[0].delta);
    } else {
        console.log(chunk.usage);
    }
}

curl

# ======= Important note =======
# API keys differ between Singapore and Beijing regions. Get an API key: https://www.alibabacloud.com/help/zh/model-studio/get-api-key
# The following URL is for the Singapore region. If you use a model in the Beijing region, replace it with: https://dashscope.aliyuncs.com/compatible-mode/v1/chat/completions
# === Delete this comment before execution ===


curl -X POST https://dashscope-intl.aliyuncs.com/compatible-mode/v1/chat/completions \
-H "Authorization: Bearer $DASHSCOPE_API_KEY" \
-H "Content-Type: application/json" \
-d '{
    "model": "qwen3.5-omni-plus",
    "messages": [
    {
      "role": "user",
      "content": [
        {
          "type": "image_url",
          "image_url": {
            "url": "https://help-static-aliyun-doc.aliyuncs.com/file-manage-files/zh-CN/20241022/emyrja/dog_and_girl.jpeg"
          }
        },
        {
          "type": "text",
          "text": "What scene is depicted in the image?"
        }
      ]
    }
  ],
    "stream":true,
    "stream_options":{
        "include_usage":true
    },
    "modalities":["text","audio"],
    "audio":{"voice":"Tina","format":"wav"}
}'

Web search

The Qwen3.5-Omni series supports web search to retrieve real-time information and perform reasoning.

Enable web search using the enable_search parameter and set search_strategy to agent.

OpenAI compatible

Python

# Before you run this code:
# pip install openai

import os
from openai import OpenAI

# Initialize the client
client = OpenAI(
    # API keys differ between Singapore and Beijing regions. Get an API key: https://www.alibabacloud.com/help/zh/model-studio/get-api-key
    api_key=os.getenv("DASHSCOPE_API_KEY"),
    # The following URL is for the Singapore region. If you use a model in the Beijing region, replace it with: https://dashscope.aliyuncs.com/compatible-mode/v1
    base_url="https://dashscope-intl.aliyuncs.com/compatible-mode/v1",
)

# Make the request (enable web search)
try:
    completion = client.chat.completions.create(
        model="qwen3.5-omni-plus",
        messages=[{
            "role": "user", 
            "content": "Please look up today's date and day of the week, and tell me what major holidays fall on this date."
        }],
        stream=True,
        stream_options={"include_usage": True},
        # Enable web search
        extra_body={
            "enable_search": True,
            "search_options": {
                # Web search strategy. Only "agent" is supported.
                "search_strategy": "agent"
            }
        }
    )
    
    print("Model response (includes real-time information):")
    for chunk in completion:
        if chunk.choices and chunk.choices[0].delta.content:
            print(chunk.choices[0].delta.content, end="")
    print()
    
except Exception as e:
    print(f"Request failed: {e}")

Node.js

// Before you run this code:
// npm install openai

import OpenAI from "openai";

// Initialize the client
const openai = new OpenAI({
    // API keys differ between Singapore and Beijing regions. Get an API key: https://www.alibabacloud.com/help/zh/model-studio/get-api-key
    apiKey: process.env.DASHSCOPE_API_KEY,
    // The following URL is for the Singapore region. If you use a model in the Beijing region, replace it with: https://dashscope.aliyuncs.com/compatible-mode/v1
    baseURL: "https://dashscope-intl.aliyuncs.com/compatible-mode/v1"
});

// Make the request (enable web search)
const completion = await openai.chat.completions.create({
    model: "qwen3.5-omni-plus",
    messages: [{
        "role": "user",
        "content": "Please look up today's date and day of the week, and tell me what major holidays fall on this date."
    }],
    stream: true,
    stream_options: {
        include_usage: true
    },
    // Enable web search
    extra_body: {
        enable_search: true,
        search_options: {
            // Web search strategy. Only "agent" is supported.
            search_strategy: "agent"
        }
    }
});

console.log("Model response (includes real-time information):");

for await (const chunk of completion) {
    if (Array.isArray(chunk.choices) && chunk.choices.length > 0) {
        if (chunk.choices[0].delta.content) {
            process.stdout.write(chunk.choices[0].delta.content);
        }
    }
}
console.log();

curl

# ======= Important note =======
# API keys differ between Singapore and Beijing regions. Get an API key: https://www.alibabacloud.com/help/zh/model-studio/get-api-key
# The following URL is for the Singapore region. If you use a model in the Beijing region, replace it with: https://dashscope.aliyuncs.com/compatible-mode/v1/chat/completions
# === Delete this comment before execution ===

curl -X POST https://dashscope-intl.aliyuncs.com/compatible-mode/v1/chat/completions \
-H "Authorization: Bearer $DASHSCOPE_API_KEY" \
-H "Content-Type: application/json" \
-d '{
    "model": "qwen3.5-omni-plus",
    "messages": [
        {
            "role": "user", 
            "content": "Please look up today's date and day of the week, and tell me what major holidays fall on this date."
        }
    ],
    "stream": true,
    "stream_options": {
        "include_usage": true
    },
    "enable_search": true,
    "search_options": {
        "search_strategy": "agent"
    }
}'

Notes

Web search is supported only in the Qwen3.5-Omni series. The search_strategy parameter only accepts agent.
See Billing details for billing rules related to the agent strategy.

Enable/disable thinking mode

Only the Qwen3-Omni-Flash model supports hybrid thinking. Control thinking mode using the enable_thinking parameter:

true
false (default)

OpenAI compatible

Python

import os
from openai import OpenAI

client = OpenAI(
    # API keys differ between Singapore and Beijing regions. Get an API key: https://www.alibabacloud.com/help/zh/model-studio/get-api-key
    api_key=os.getenv("DASHSCOPE_API_KEY"),
    # The following URL is for the Singapore region. If you use a model in the Beijing region, replace it with: https://dashscope.aliyuncs.com/compatible-mode/v1
    base_url="https://dashscope-intl.aliyuncs.com/compatible-mode/v1",
)

completion = client.chat.completions.create(
    model="qwen3-omni-flash",
    messages=[{"role": "user", "content": "Who are you?"}],

    # Enable or disable thinking mode. Audio output is not supported in thinking mode. Qwen-Omni-Turbo does not support enable_thinking.
    extra_body={'enable_thinking': True},

    # Set the output modality. Two options are supported in non-thinking mode: ["text","audio"] and ["text"]. Only ["text"] is supported in thinking mode.
    modalities=["text"],

    # Set the voice. The audio parameter is not supported in thinking mode.
    # audio={"voice": "Tina", "format": "wav"},
    # stream must be set to True, otherwise an error occurs.
    stream=True,
    stream_options={"include_usage": True},
)

for chunk in completion:
    if chunk.choices:
        print(chunk.choices[0].delta)
    else:
        print(chunk.usage)

Node.js

import OpenAI from "openai";

const openai = new OpenAI(
    {
        // API keys differ between Singapore and Beijing regions. Get an API key: https://www.alibabacloud.com/help/zh/model-studio/get-api-key
        apiKey: process.env.DASHSCOPE_API_KEY,
        // The following URL is for the Singapore region. If you use a model in the Beijing region, replace it with: https://dashscope.aliyuncs.com/compatible-mode/v1
        baseURL: "https://dashscope-intl.aliyuncs.com/compatible-mode/v1"
    }
);
const completion = await openai.chat.completions.create({
    model: "qwen3-omni-flash",
    messages: [
        { role: "user", content: "Who are you?" }
    ],

    // stream must be set to True, otherwise an error occurs.
    stream: true,
    stream_options: {
        include_usage: true
    },
    // Enable or disable thinking mode. Audio output is not supported in thinking mode. Qwen-Omni-Turbo does not support enable_thinking.
    extra_body:{'enable_thinking': true},
    // Set the output modality. Two options are supported in non-thinking mode: ["text","audio"] and ["text"]. Only ["text"] is supported in thinking mode.
    modalities: ["text"],
    // Set the voice. The audio parameter is not supported in thinking mode.
    //audio: { voice: "Tina", format: "wav" }
});

for await (const chunk of completion) {
    if (Array.isArray(chunk.choices) && chunk.choices.length > 0) {
        console.log(chunk.choices[0].delta);
    } else {
        console.log(chunk.usage);
    }
}

curl

# ======= Important note =======
# API keys differ between Singapore and Beijing regions. Get an API key: https://www.alibabacloud.com/help/zh/model-studio/get-api-key
# The following URL is for the Singapore region. If you use a model in the Beijing region, replace it with: https://dashscope.aliyuncs.com/compatible-mode/v1/chat/completions
# === Delete this comment before execution ===

curl -X POST https://dashscope-intl.aliyuncs.com/compatible-mode/v1/chat/completions \
-H "Authorization: Bearer $DASHSCOPE_API_KEY" \
-H "Content-Type: application/json" \
-d '{
    "model": "qwen3-omni-flash",
    "messages": [
        {
            "role": "user",
            "content": "Who are you?"
        }
    ],
    "stream":true,
    "stream_options":{
        "include_usage":true
    },
    "modalities":["text"],
    "enable_thinking": true
}'

Response

data: {"choices":[{"delta":{"content":null,"role":"assistant","reasoning_content":""},"index":0,"logprobs":null,"finish_reason":null}],"object":"chat.completion.chunk","usage":null,"created":1757937336,"system_fingerprint":null,"model":"qwen3-omni-flash","id":"chatcmpl-ce3d6fe5-e717-4b7e-8b40-3aef12288d4c"}
data: {"choices":[{"finish_reason":null,"logprobs":null,"delta":{"content":null,"reasoning_content":"Hmm"},"index":0}],"object":"chat.completion.chunk","usage":null,"reated":1757937336,"system_fingerprint":null,"model":"qwen3-omni-flash","id":"chatcmpl-ce3d6fe5-e717-4b7e-8b40-3aef12288d4c"}
data: {"choices":[{"delta":{"content":null,"reasoning_content":","},"finish_reason":null,"index":0,"logprobs":null}],"object":"chat.completion.chunk","usage":null,"reated":1757937336,"system_fingerprint":null,"model":"qwen3-omni-flash","id":"chatcmpl-ce3d6fe5-e717-4b7e-8b40-3aef12288d4c"}
......
data: {"choices":[{"delta":{"content":"Tell me"},"finish_reason":null,"index":0,"logprobs":null}],"object":"chat.completion.chunk","usage":null,"created":1757937336,"tem_fingerprint":null,"model":"qwen3-omni-flash","id":"chatcmpl-ce3d6fe5-e717-4b7e-8b40-3aef12288d4c"}
data: {"choices":[{"delta":{"content":"!"},"finish_reason":null,"index":0,"logprobs":null}],"object":"chat.completion.chunk","usage":null,"created":1757937336,"systm_fingerprint":null,"model":"qwen3-omni-flash","id":"chatcmpl-ce3d6fe5-e717-4b7e-8b40-3aef12288d4c"}
data: {"choices":[{"finish_reason":"stop","delta":{"content":"","reasoning_content":null},"index":0,"logprobs":null}],"object":"chat.completion.chunk","usage":null,"created":1757937336,"system_fingerprint":null,"model":"qwen3-omni-flash","id":"chatcmpl-ce3d6fe5-e717-4b7e-8b40-3aef12288d4c"}
data: {"choices":[],"object":"chat.completion.chunk","usage":{"prompt_tokens":11,"completion_tokens":363,"total_tokens":374,"completion_tokens_details":{"reasoning_tokens":195,"text_tokens":168},"prompt_tokens_details":{"text_tokens":11}},"created":1757937336,"system_fingerprint":null,"model":"qwen3-omni-flash","id":"chatcmpl-ce3d6fe5-e717-4b7e-8b40-3aef12288d4c"}

Multi-turn conversation

When using Qwen-Omni for multi-turn conversations, note the following:

Assistant Message
Assistant messages added to the messages array can contain only text data.
User Message
A user message can contain text and one other modality. In multi-turn conversations, you can input different modalities in different user messages.

OpenAI compatible

Python

import os
from openai import OpenAI

client = OpenAI(
    # API keys differ between Singapore and Beijing regions. Get an API key: https://www.alibabacloud.com/help/zh/model-studio/get-api-key
    api_key=os.getenv("DASHSCOPE_API_KEY"),
    # The following URL is for the Singapore region. If you use a model in the Beijing region, replace it with: https://dashscope.aliyuncs.com/compatible-mode/v1
    base_url="https://dashscope-intl.aliyuncs.com/compatible-mode/v1",
)

completion = client.chat.completions.create(
    model="qwen3.5-omni-plus", # For Qwen3-Omni-Flash, run in non-thinking mode.
    messages=[
        {
            "role": "user",
            "content": [
                {
                    "type": "input_audio",
                    "input_audio": {
                        "data": "https://dashscope.oss-cn-beijing.aliyuncs.com/audios/welcome.mp3",
                        "format": "mp3",
                    },
                },
                {"type": "text", "text": "What is this audio about"},
            ],
        },
        {
            "role": "assistant",
            "content": [{"type": "text", "text": "This audio says: Welcome to Alibaba Cloud"}],
        },
        {
            "role": "user",
            "content": [{"type": "text", "text": "Tell me about this company."}],
        },
    ],
    # Set the output modality. Two options are currently supported: ["text","audio"] and ["text"]
    modalities=["text"],
    # stream must be set to True, otherwise an error occurs.
    stream=True,
    stream_options={"include_usage": True},
)

for chunk in completion:
    if chunk.choices:
        print(chunk.choices[0].delta)
    else:
        print(chunk.usage)

Node.js

import OpenAI from "openai";

const openai = new OpenAI(
    {
        // API keys differ between Singapore and Beijing regions. Get an API key: https://www.alibabacloud.com/help/zh/model-studio/get-api-key
        apiKey: process.env.DASHSCOPE_API_KEY,
        // The following URL is for the Singapore region. If you use a model in the Beijing region, replace it with: https://dashscope.aliyuncs.com/compatible-mode/v1
        baseURL: "https://dashscope-intl.aliyuncs.com/compatible-mode/v1"
    }
);
const completion = await openai.chat.completions.create({
    model: "qwen3.5-omni-plus", // For Qwen3-Omni-Flash, run in non-thinking mode.
    messages: [
        {
            "role": "user",
            "content": [
                {
                    "type": "input_audio",
                    "input_audio": {
                        "data": "https://dashscope.oss-cn-beijing.aliyuncs.com/audios/welcome.mp3",
                        "format": "mp3",
                    },
                },
                { "type": "text", "text": "What is this audio about" },
            ],
        },
        {
            "role": "assistant",
            "content": [{ "type": "text", "text": "This audio says: Welcome to Alibaba Cloud" }],
        },
        {
            "role": "user",
            "content": [{ "type": "text", "text": "Tell me about this company." }]
        }],
    stream: true,
    stream_options: {
        include_usage: true
    },
    modalities: ["text"]
});


for await (const chunk of completion) {
    if (Array.isArray(chunk.choices) && chunk.choices.length > 0) {
        console.log(chunk.choices[0].delta);
    } else {
        console.log(chunk.usage);
    }
}

curl

# ======= Important note =======
# API keys differ between Singapore and Beijing regions. Get an API key: https://www.alibabacloud.com/help/zh/model-studio/get-api-key
# The following URL is for the Singapore region. If you use a model in the Beijing region, replace it with: https://dashscope.aliyuncs.com/compatible-mode/v1/chat/completions
# === Delete this comment before execution ===

curl -X POST https://dashscope-intl.aliyuncs.com/compatible-mode/v1/chat/completions \
-H "Authorization: Bearer $DASHSCOPE_API_KEY" \
-H "Content-Type: application/json" \
-d '{
  "model": "qwen3.5-omni-plus",
  "messages": [
    {
      "role": "user",
      "content": [
        {
          "type": "input_audio",
          "input_audio": {
            "data": "https://dashscope.oss-cn-beijing.aliyuncs.com/audios/welcome.mp3"
          }
        },
        {
          "type": "text",
          "text": "What is this audio about"
        }
      ]
    },
    {
      "role": "assistant",
      "content": [
        {
          "type": "text",
          "text": "This audio says: Welcome to Alibaba Cloud"
        }
      ]
    },
    {
      "role": "user",
      "content": [
        {
          "type": "text",
          "text": "Tell me about this company."
        }
      ]
    }
  ],
  "stream": true,
  "stream_options": {
    "include_usage": true
  },
  "modalities": ["text"]
}'

Parsing output Base64-encoded audio data

Method 1: Decode after completion

Qwen-Omni outputs audio as streaming Base64-encoded data. Maintain a string variable during generation and append the Base64-encoded data from each chunk. After generation completes, Base64-decode the complete string to get the audio file. Alternatively, decode and play each chunk in real time.

Python

# Installation instructions for pyaudio:
# APPLE Mac OS X
#   brew install portaudio
#   pip install pyaudio
# Debian/Ubuntu
#   sudo apt-get install python-pyaudio python3-pyaudio
#   or
#   pip install pyaudio
# CentOS
#   sudo yum install -y portaudio portaudio-devel && pip install pyaudio
# Microsoft Windows
#   python -m pip install pyaudio

import os
from openai import OpenAI
import base64
import numpy as np
import soundfile as sf

client = OpenAI(
    # API keys differ between Singapore and Beijing regions. Get an API key: https://www.alibabacloud.com/help/zh/model-studio/get-api-key
    api_key=os.getenv("DASHSCOPE_API_KEY"),
    # The following URL is for the Singapore region. If you use a model in the Beijing region, replace it with: https://dashscope-intl.aliyuncs.com/compatible-mode/v1
    base_url="https://dashscope-intl.aliyuncs.com/compatible-mode/v1",
)

completion = client.chat.completions.create(
    model="qwen3.5-omni-plus", # For Qwen3-Omni-Flash, run in non-thinking mode.
    messages=[{"role": "user", "content": "Who are you?"}],
    # Set the output modality. Two options are currently supported: ["text","audio"] and ["text"]
    modalities=["text", "audio"],
    audio={"voice": "Tina", "format": "wav"},
    # stream must be set to True, otherwise an error occurs.
    stream=True,
    stream_options={"include_usage": True},
)

# Method 1: Decode after the generation is complete
audio_string = ""
for chunk in completion:
    if chunk.choices:
        if hasattr(chunk.choices[0].delta, "audio"):
            try:
                audio_string += chunk.choices[0].delta.audio["data"]
            except Exception as e:
                print(chunk.choices[0].delta.content)
    else:
        print(chunk.usage)

wav_bytes = base64.b64decode(audio_string)
audio_np = np.frombuffer(wav_bytes, dtype=np.int16)
sf.write("audio_assistant_py.wav", audio_np, samplerate=24000)

# Method 2: Decode while generating (comment out the code for Method 1 to use Method 2)
# # Initialize PyAudio
# import pyaudio
# import time
# p = pyaudio.PyAudio()
# # Create an audio stream
# stream = p.open(format=pyaudio.paInt16,
#                 channels=1,
#                 rate=24000,
#                 output=True)

# for chunk in completion:
#     if chunk.choices:
#         if hasattr(chunk.choices[0].delta, "audio"):
#             try:
#                 audio_string = chunk.choices[0].delta.audio["data"]
#                 wav_bytes = base64.b64decode(audio_string)
#                 audio_np = np.frombuffer(wav_bytes, dtype=np.int16)
#                 # Play the audio data directly
#                 stream.write(audio_np.tobytes())
#             except Exception as e:
#                 print(chunk.choices[0].delta.content)

# time.sleep(0.8)
# # Clean up resources
# stream.stop_stream()
# stream.close()
# p.terminate()

Node.js

// Before running:
// 1. Ensure Node.js version >= 14 is installed.
// 2. Run: npm install openai wav
// 
// To use the real-time playback feature (Method 2), you also need:
// Windows:
//    npm install speaker
// Mac:
//    brew install portaudio
//    npm install speaker
// Linux (Ubuntu/Debian):
//    sudo apt-get install libasound2-dev
//    npm install speaker

import OpenAI from "openai";

const openai = new OpenAI(
    {
        // API keys differ between Singapore and Beijing regions. Get an API key: https://www.alibabacloud.com/help/zh/model-studio/get-api-key
        apiKey: process.env.DASHSCOPE_API_KEY,
        // The following URL is for the Singapore region. If you use a model in the Beijing region, replace it with: https://dashscope-intl.aliyuncs.com/compatible-mode/v1
        baseURL: "https://dashscope-intl.aliyuncs.com/compatible-mode/v1"
    }
);
const completion = await openai.chat.completions.create({
    model: "qwen3.5-omni-plus", // For Qwen3-Omni-Flash, run in non-thinking mode.
    messages: [
        {
            "role": "user",
            "content": "Who are you?"
        }],
    stream: true,
    stream_options: {
        include_usage: true
    },
    modalities: ["text", "audio"],
    audio: { voice: "Tina", format: "wav" }
});

// Method 1: Decode after the generation is complete
// Requires installation: npm install wav
import { createWriteStream } from 'node:fs';  // node:fs is a built-in Node.js module, no installation required
import { Writer } from 'wav';

async function convertAudio(audioString, audioPath) {
    try {
        // Decode the Base64 string into a Buffer
        const wavBuffer = Buffer.from(audioString, 'base64');
        // Create a WAV file write stream
        const writer = new Writer({
            sampleRate: 24000,  // Sample rate
            channels: 1,        // Mono
            bitDepth: 16        // 16-bit depth
        });
        // Create an output file stream and establish a pipe connection
        const outputStream = createWriteStream(audioPath);
        writer.pipe(outputStream);

        // Write PCM data and end writing
        writer.write(wavBuffer);
        writer.end();

        // Wait for the file to finish writing
        await new Promise((resolve, reject) => {
            outputStream.on('finish', resolve);
            outputStream.on('error', reject);
        });

        // Add extra wait time to ensure audio integrity
        await new Promise(resolve => setTimeout(resolve, 800));

        console.log(`Audio file successfully saved as ${audioPath}`);
    } catch (error) {
        console.error('An error occurred during processing:', error);
    }
}

let audioString = "";
for await (const chunk of completion) {
    if (Array.isArray(chunk.choices) && chunk.choices.length > 0) {
        if (chunk.choices[0].delta.audio) {
            if (chunk.choices[0].delta.audio["data"]) {
                audioString += chunk.choices[0].delta.audio["data"];
            }
        }
    } else {
        console.log(chunk.usage);
    }
}
// Execute the conversion
convertAudio(audioString, "audio_assistant_mjs.wav");


// Method 2: Generate and play in real time
// Install necessary components according to your system's instructions above.
// import Speaker from 'speaker'; // Import the audio playback library

// // Create a speaker instance (configuration matches WAV file parameters)
// const speaker = new Speaker({
//     sampleRate: 24000,  // Sample rate
//     channels: 1,        // Number of sound channels
//     bitDepth: 16,       // Bit depth
//     signed: true        // Signed PCM
// });
// for await (const chunk of completion) {
//     if (Array.isArray(chunk.choices) && chunk.choices.length > 0) {
//         if (chunk.choices[0].delta.audio) {
//             if (chunk.choices[0].delta.audio["data"]) {
//                 const pcmBuffer = Buffer.from(chunk.choices[0].delta.audio.data, 'base64');
//                 // Write directly to the speaker for playback
//                 speaker.write(pcmBuffer);
//             }
//         }
//     } else {
//         console.log(chunk.usage);
//     }
// }
// speaker.on('finish', () => console.log('Playback complete'));
// speaker.end(); // Call based on the actual end of the API stream

Send local files with Base64 encoding

Images

This example uses the locally saved file eagle.png.

Python

import os
from openai import OpenAI
import base64

client = OpenAI(
    # API keys differ between Singapore and Beijing regions. Get an API key: https://www.alibabacloud.com/help/zh/model-studio/get-api-key
    api_key=os.getenv("DASHSCOPE_API_KEY"),
    # The following URL is for the Singapore region. If you use a model in the Beijing region, replace it with: https://dashscope-intl.aliyuncs.com/compatible-mode/v1
    base_url="https://dashscope-intl.aliyuncs.com/compatible-mode/v1",
)


# Base64 encoding format
def encode_image(image_path):
    with open(image_path, "rb") as image_file:
        return base64.b64encode(image_file.read()).decode("utf-8")


base64_image = encode_image("eagle.png")

completion = client.chat.completions.create(
    model="qwen3.5-omni-plus", # For Qwen3-Omni-Flash, run in non-thinking mode.
    messages=[
        {
            "role": "user",
            "content": [
                {
                    "type": "image_url",
                    "image_url": {"url": f"data:image/png;base64,{base64_image}"},
                },
                {"type": "text", "text": "What scene is depicted in the image?"},
            ],
        },
    ],
    # Set the output modality. Two options are currently supported: ["text","audio"] and ["text"]
    modalities=["text", "audio"],
    audio={"voice": "Tina", "format": "wav"},
    # stream must be set to True, otherwise an error occurs.
    stream=True,
    stream_options={"include_usage": True},
)

for chunk in completion:
    if chunk.choices:
        print(chunk.choices[0].delta)
    else:
        print(chunk.usage)

Node.js

import OpenAI from "openai";
import { readFileSync } from 'fs';

const openai = new OpenAI(
    {
        // API keys differ between Singapore and Beijing regions. Get an API key: https://www.alibabacloud.com/help/zh/model-studio/get-api-key
        apiKey: process.env.DASHSCOPE_API_KEY,
        // The following URL is for the Singapore region. If you use a model in the Beijing region, replace it with: https://dashscope.aliyuncs.com/compatible-mode/v1
        baseURL: "https://dashscope-intl.aliyuncs.com/compatible-mode/v1"
    }
);

const encodeImage = (imagePath) => {
    const imageFile = readFileSync(imagePath);
    return imageFile.toString('base64');
};
const base64Image = encodeImage("eagle.png")

const completion = await openai.chat.completions.create({
    model: "qwen3.5-omni-plus",// For Qwen3-Omni-Flash, run in non-thinking mode.
    messages: [
        {
            "role": "user",
            "content": [{
                "type": "image_url",
                "image_url": { "url": `data:image/png;base64,${base64Image}` },
            },
            { "type": "text", "text": "What scene is depicted in the image?" }]
        }],
    stream: true,
    stream_options: {
        include_usage: true
    },
    modalities: ["text", "audio"],
    audio: { voice: "Tina", format: "wav" }
});

for await (const chunk of completion) {
    if (Array.isArray(chunk.choices) && chunk.choices.length > 0) {
        console.log(chunk.choices[0].delta);
    } else {
        console.log(chunk.usage);
    }
}

Audio

This example uses the locally saved file welcome.mp3.

Python

import os
from openai import OpenAI
import base64
import numpy as np
import soundfile as sf
import requests

client = OpenAI(
    # API keys differ between Singapore and Beijing regions. Get an API key: https://www.alibabacloud.com/help/zh/model-studio/get-api-key
    api_key=os.getenv("DASHSCOPE_API_KEY"),
    # The following URL is for the Singapore region. If you use a model in the Beijing region, replace it with: https://dashscope-intl.aliyuncs.com/compatible-mode/v1
    base_url="https://dashscope-intl.aliyuncs.com/compatible-mode/v1",
)


def encode_audio(audio_path):
    with open(audio_path, "rb") as audio_file:
        return base64.b64encode(audio_file.read()).decode("utf-8")


base64_audio = encode_audio("welcome.mp3")

completion = client.chat.completions.create(
    model="qwen3.5-omni-plus", # For Qwen3-Omni-Flash, run in non-thinking mode.
    messages=[
        {
            "role": "user",
            "content": [
                {
                    "type": "input_audio",
                    "input_audio": {
                        "data": f"data:;base64,{base64_audio}",
                        "format": "mp3",
                    },
                },
                {"type": "text", "text": "What is this audio about"},
            ],
        },
    ],
    # Set the output modality. Two options are currently supported: ["text","audio"] and ["text"]
    modalities=["text", "audio"],
    audio={"voice": "Tina", "format": "wav"},
    # stream must be set to True, otherwise an error occurs.
    stream=True,
    stream_options={"include_usage": True},
)

for chunk in completion:
    if chunk.choices:
        print(chunk.choices[0].delta)
    else:
        print(chunk.usage)

Node.js

import OpenAI from "openai";
import { readFileSync } from 'fs';

const openai = new OpenAI(
    {
        // API keys differ between Singapore and Beijing regions. Get an API key: https://www.alibabacloud.com/help/zh/model-studio/get-api-key
        apiKey: process.env.DASHSCOPE_API_KEY,
        // The following URL is for the Singapore region. If you use a model in the Beijing region, replace it with: https://dashscope.aliyuncs.com/compatible-mode/v1
        baseURL: "https://dashscope-intl.aliyuncs.com/compatible-mode/v1"
    }
);

const encodeAudio = (audioPath) => {
    const audioFile = readFileSync(audioPath);
    return audioFile.toString('base64');
};
const base64Audio = encodeAudio("welcome.mp3")

const completion = await openai.chat.completions.create({
    model: "qwen3.5-omni-plus", // For Qwen3-Omni-Flash, run in non-thinking mode.
    messages: [
        {
            "role": "user",
            "content": [{
                "type": "input_audio",
                "input_audio": { "data": `data:;base64,${base64Audio}`, "format": "mp3" },
            },
            { "type": "text", "text": "What is this audio about" }]
        }],
    stream: true,
    stream_options: {
        include_usage: true
    },
    modalities: ["text", "audio"],
    audio: { voice: "Tina", format: "wav" }
});

for await (const chunk of completion) {
    if (Array.isArray(chunk.choices) && chunk.choices.length > 0) {
        console.log(chunk.choices[0].delta);
    } else {
        console.log(chunk.usage);
    }
}

Video

Video file

This example uses the locally saved file spring_mountain.mp4.

Python

import os
from openai import OpenAI
import base64
import numpy as np
import soundfile as sf

client = OpenAI(
    # API keys differ between Singapore and Beijing regions. Get an API key: https://www.alibabacloud.com/help/zh/model-studio/get-api-key
    api_key=os.getenv("DASHSCOPE_API_KEY"),
    # The following URL is for the Singapore region. If you use a model in the Beijing region, replace it with: https://dashscope-intl.aliyuncs.com/compatible-mode/v1
    base_url="https://dashscope-intl.aliyuncs.com/compatible-mode/v1",
)

# Base64 encoding format
def encode_video(video_path):
    with open(video_path, "rb") as video_file:
        return base64.b64encode(video_file.read()).decode("utf-8")


base64_video = encode_video("spring_mountain.mp4")

completion = client.chat.completions.create(
    model="qwen3.5-omni-plus", # For Qwen3-Omni-Flash, run in non-thinking mode.
    messages=[
        {
            "role": "user",
            "content": [
                {
                    "type": "video_url",
                    "video_url": {"url": f"data:;base64,{base64_video}"},
                },
                {"type": "text", "text": "What is she singing?"},
            ],
        },
    ],
    # Set the output modality. Two options are currently supported: ["text","audio"] and ["text"]
    modalities=["text", "audio"],
    audio={"voice": "Tina", "format": "wav"},
    # stream must be set to True, otherwise an error occurs.
    stream=True,
    stream_options={"include_usage": True},
)

for chunk in completion:
    if chunk.choices:
        print(chunk.choices[0].delta)
    else:
        print(chunk.usage)

Node.js

import OpenAI from "openai";
import { readFileSync } from 'fs';

const openai = new OpenAI(
    {
        // API keys differ between Singapore and Beijing regions. Get an API key: https://www.alibabacloud.com/help/zh/model-studio/get-api-key
        apiKey: process.env.DASHSCOPE_API_KEY,
        // The following URL is for the Singapore region. If you use a model in the Beijing region, replace it with: https://dashscope-intl.aliyuncs.com/compatible-mode/v1
        baseURL: "https://dashscope-intl.aliyuncs.com/compatible-mode/v1"
    }
);

const encodeVideo = (videoPath) => {
    const videoFile = readFileSync(videoPath);
    return videoFile.toString('base64');
};
const base64Video = encodeVideo("spring_mountain.mp4")

const completion = await openai.chat.completions.create({
    model: "qwen3.5-omni-plus", // For Qwen3-Omni-Flash, run in non-thinking mode.
    messages: [
        {
            "role": "user",
            "content": [{
                "type": "video_url",
                "video_url": { "url": `data:;base64,${base64Video}` },
            },
            { "type": "text", "text": "What is she singing?" }]
        }],
    stream: true,
    stream_options: {
        include_usage: true
    },
    modalities: ["text", "audio"],
    audio: { voice: "Tina", format: "wav" }
});

for await (const chunk of completion) {
    if (Array.isArray(chunk.choices) && chunk.choices.length > 0) {
        console.log(chunk.choices[0].delta);
    } else {
        console.log(chunk.usage);
    }
}

Image list

For example, consider the locally saved files football1.jpg, football2.jpg, football3.jpg, and football4.jpg.

Python

import os
from openai import OpenAI
import base64
import numpy as np
import soundfile as sf

client = OpenAI(
    # API keys differ between Singapore and Beijing regions. Get an API key: https://www.alibabacloud.com/help/zh/model-studio/get-api-key
    api_key=os.getenv("DASHSCOPE_API_KEY"),
    # The following URL is for the Singapore region. If you use a model in the Beijing region, replace it with: https://dashscope-intl.aliyuncs.com/compatible-mode/v1
    base_url="https://dashscope-intl.aliyuncs.com/compatible-mode/v1",
)


# Base64 encoding format
def encode_image(image_path):
    with open(image_path, "rb") as image_file:
        return base64.b64encode(image_file.read()).decode("utf-8")


base64_image_1 = encode_image("football1.jpg")
base64_image_2 = encode_image("football2.jpg")
base64_image_3 = encode_image("football3.jpg")
base64_image_4 = encode_image("football4.jpg")

completion = client.chat.completions.create(
    model="qwen3.5-omni-plus", # For Qwen3-Omni-Flash, run in non-thinking mode.
    messages=[
        {
            "role": "user",
            "content": [
                {
                    "type": "video",
                    "video": [
                        f"data:image/jpeg;base64,{base64_image_1}",
                        f"data:image/jpeg;base64,{base64_image_2}",
                        f"data:image/jpeg;base64,{base64_image_3}",
                        f"data:image/jpeg;base64,{base64_image_4}",
                    ],
                },
                {"type": "text", "text": "Describe the process shown in this video"},
            ],
        }
    ],
    # Set the output modality. Two options are currently supported: ["text","audio"] and ["text"]
    modalities=["text", "audio"],
    audio={"voice": "Tina", "format": "wav"},
    # stream must be set to True, otherwise an error occurs.
    stream=True,
    stream_options={"include_usage": True},
)

for chunk in completion:
    if chunk.choices:
        print(chunk.choices[0].delta)
    else:
        print(chunk.usage)

Node.js

import OpenAI from "openai";
import { readFileSync } from 'fs';

const openai = new OpenAI(
    {
        // The API key differs between the Singapore and Beijing regions. To obtain an API key, see https://www.alibabacloud.com/help/zh/model-studio/get-api-key
        apiKey: process.env.DASHSCOPE_API_KEY,
        // The following is the URL for the Singapore region. If you use a model in the Beijing region, replace the URL with: https://dashscope.aliyuncs.com/compatible-mode/v1
        baseURL: "https://dashscope-intl.aliyuncs.com/compatible-mode/v1"
    }
);

const encodeImage = (imagePath) => {
    const imageFile = readFileSync(imagePath);
    return imageFile.toString('base64');
  };
const base64Image1 = encodeImage("football1.jpg")
const base64Image2 = encodeImage("football2.jpg")
const base64Image3 = encodeImage("football3.jpg")
const base64Image4 = encodeImage("football4.jpg")

const completion = await openai.chat.completions.create({
    model: "qwen3.5-omni-plus", // When the model is Qwen3-Omni-Flash, run it in non-thinking mode.
    messages: [{
        role: "user",
        content: [
            {
                type: "video",
                video: [
                    `data:image/jpeg;base64,${base64Image1}`,
                    `data:image/jpeg;base64,${base64Image2}`,
                    `data:image/jpeg;base64,${base64Image3}`,
                    `data:image/jpeg;base64,${base64Image4}`
                ]
            },
            {
                type: "text",
                text: "Describe the specific procedure of this video."
            }
        ]
    }],
    stream: true,
    stream_options: {
        include_usage: true
    },
    modalities: ["text", "audio"],
    audio: { voice: "Tina", format: "wav" }
});

for await (const chunk of completion) {
    if (Array.isArray(chunk.choices) && chunk.choices.length > 0) {
        console.log(chunk.choices[0].delta);
    } else {
        console.log(chunk.usage);
    }

API reference

For input and output parameters, see the Qwen API reference.

Billing and rate limits

Billing rules

Qwen-Omni is billed based on tokens consumed across modalities (audio, image, and video). For pricing, see Models.

Token conversion rules for audio, images, and videos

Audio

Qwen3.5-Omni: Total tokens = Audio duration (in seconds) × 7
Qwen3-Omni-Flash: Total tokens = Audio duration (seconds) × 12.5
Qwen-Omni-Turbo: Total tokens = Audio duration (seconds) × 25

Audio shorter than 1 second is calculated as 1 second.

Images

Qwen3.5-Omni and Qwen3-Omni-Flash: 1 token per 32x32 pixels
Qwen-Omni-Turbo: 1 token per 28x28 pixels

Each image requires a minimum of 4 tokens and a maximum of 1280 tokens. Use the following code to estimate the token count for a single image:

import math
# Install the Pillow library: pip install Pillow
from PIL import Image

# For Qwen-Omni-Turbo, the factor is 28.
# factor = 28
# For Qwen3-Omni-Flash, the factor is 32.
factor = 32

def token_calculate(image_path=''):
    """
    param image_path: The path of the image.
    return: The number of tokens for a single image.
    """
    if len(image_path) > 0:
        # Open the specified PNG image file.
        image = Image.open(image_path)
        # Get the original dimensions of the image.
        height = image.height
        width = image.width
        print(f"Image dimensions before scaling: Height={height}, Width={width}")
        # Adjust the height to be a multiple of the factor.
        h_bar = round(height / factor) * factor
        # Adjust the width to be a multiple of the factor.
        w_bar = round(width / factor) * factor
        # Lower limit for image tokens: 4 tokens.
        min_pixels = 4 * factor * factor
        # Upper limit for image tokens: 1280 tokens.
        max_pixels = 1280 * factor * factor
        # Scale the image to adjust the total number of pixels to be within the [min_pixels, max_pixels] range.
        if h_bar * w_bar > max_pixels:
            # Calculate the scaling factor beta so that the total pixels of the scaled image do not exceed max_pixels.
            beta = math.sqrt((height * width) / max_pixels)
            # Recalculate the adjusted height to ensure it is a multiple of the factor.
            h_bar = math.floor(height / beta / factor) * factor
            # Recalculate the adjusted width to ensure it is a multiple of the factor.
            w_bar = math.floor(width / beta / factor) * factor
        elif h_bar * w_bar < min_pixels:
            # Calculate the scaling factor beta so that the total pixels of the scaled image are not less than min_pixels.
            beta = math.sqrt(min_pixels / (height * width))
            # Recalculate the adjusted height to ensure it is a multiple of the factor.
            h_bar = math.ceil(height * beta / factor) * factor
            # Recalculate the adjusted width to ensure it is a multiple of the factor.
            w_bar = math.ceil(width * beta / factor) * factor
        print(f"Image dimensions after scaling: Height={h_bar}, Width={w_bar}")
        # Calculate the number of tokens for the image: total pixels / (factor * factor).
        token = int((h_bar * w_bar) / (factor * factor)) + 2
        print(f"Number of tokens after scaling: {token}")
        return token
    else:
        raise ValueError("Image path cannot be empty. Provide a valid image file path")
    
if __name__ == "__main__":
    token = token_calculate(image_path="xxx/test.jpg")

Video

Video files produce two types of tokens: video_tokens and audio_tokens. The total token count is the sum of both.

video_tokens

The calculation process is complex. See the following code:

# Before use, install: pip install opencv-python
import math
import os
import logging
import cv2

# Fixed parameters
FRAME_FACTOR = 2

# For Qwen3-Omni-Flash, IMAGE_FACTOR is 32
IMAGE_FACTOR = 32

# For Qwen-Omni-Turbo, IMAGE_FACTOR is 28
# IMAGE_FACTOR = 28

# Aspect ratio of video frames
MAX_RATIO = 200

# Lower limit for video frame pixels. For Qwen3-Omni-Flash: 128 * 32 * 32
VIDEO_MIN_PIXELS = 128 * 32 * 32
# For Qwen-Omni-Turbo
# VIDEO_MIN_PIXELS = 128 * 28 * 28

# Upper limit for video frame pixels. For Qwen3-Omni-Flash: 768 * 32 * 32
VIDEO_MAX_PIXELS = 768 * 32 * 32
# For Qwen-Omni-Turbo:
# VIDEO_MAX_PIXELS = 768 * 28 * 28

FPS = 2
# Minimum number of extracted frames
FPS_MIN_FRAMES = 4

# Maximum number of extracted frames
# Maximum number of extracted frames for Qwen3-Omni-Flash: 128
# Maximum number of extracted frames for Qwen-Omni-Turbo: 80
FPS_MAX_FRAMES = 128

# Maximum pixel value for video input. For Qwen3-Omni-Flash: 16384 * 32 * 32
VIDEO_TOTAL_PIXELS = 16384 * 32 * 32
# For Qwen-Omni-Turbo:
# VIDEO_TOTAL_PIXELS = 16384 * 28 * 28

def round_by_factor(number, factor):
    return round(number / factor) * factor

def ceil_by_factor(number, factor):
    return math.ceil(number / factor) * factor

def floor_by_factor(number, factor):
    return math.floor(number / factor) * factor

def get_video(video_path):
    cap = cv2.VideoCapture(video_path)
    frame_width = int(cap.get(cv2.CAP_PROP_FRAME_WIDTH))
    frame_height = int(cap.get(cv2.CAP_PROP_FRAME_HEIGHT))
    total_frames = int(cap.get(cv2.CAP_PROP_FRAME_COUNT))
    video_fps = cap.get(cv2.CAP_PROP_FPS)
    cap.release()
    return frame_height, frame_width, total_frames, video_fps

def smart_nframes(total_frames, video_fps):
    min_frames = ceil_by_factor(FPS_MIN_FRAMES, FRAME_FACTOR)
    max_frames = floor_by_factor(min(FPS_MAX_FRAMES, total_frames), FRAME_FACTOR)
    duration = total_frames / video_fps if video_fps != 0 else 0
    if duration - int(duration) > (1 / FPS):
        total_frames = math.ceil(duration * video_fps)
    else:
        total_frames = math.ceil(int(duration) * video_fps)
    nframes = total_frames / video_fps * FPS
    nframes = int(min(min(max(nframes, min_frames), max_frames), total_frames))
    if not (FRAME_FACTOR <= nframes <= total_frames):
        raise ValueError(f"nframes should in interval [{FRAME_FACTOR}, {total_frames}], but got {nframes}.")
    return nframes

def smart_resize(height, width, nframes, factor=IMAGE_FACTOR):
    min_pixels = VIDEO_MIN_PIXELS
    total_pixels = VIDEO_TOTAL_PIXELS
    max_pixels = max(min(VIDEO_MAX_PIXELS, total_pixels / nframes * FRAME_FACTOR), int(min_pixels * 1.05))
    if max(height, width) / min(height, width) > MAX_RATIO:
        raise ValueError(f"absolute aspect ratio must be smaller than {MAX_RATIO}, got {max(height, width) / min(height, width)}")
    h_bar = max(factor, round_by_factor(height, factor))
    w_bar = max(factor, round_by_factor(width, factor))
    if h_bar * w_bar > max_pixels:
        beta = math.sqrt((height * width) / max_pixels)
        h_bar = floor_by_factor(height / beta, factor)
        w_bar = floor_by_factor(width / beta, factor)
    elif h_bar * w_bar < min_pixels:
        beta = math.sqrt(min_pixels / (height * width))
        h_bar = ceil_by_factor(height * beta, factor)
        w_bar = ceil_by_factor(width * beta, factor)
    return h_bar, w_bar

def video_token_calculate(video_path):
    height, width, total_frames, video_fps = get_video(video_path)
    nframes = smart_nframes(total_frames, video_fps)
    resized_height, resized_width = smart_resize(height, width, nframes)
    video_token = int(math.ceil(nframes / FPS) * resized_height / 32 * resized_width / 32)
    video_token += 2  # Visual marks
    return video_token

if __name__ == "__main__":
    video_path = "spring_mountain.mp4"  # Your video path
    video_token = video_token_calculate(video_path)
    print("video_tokens:", video_token)

audio_tokens
- Qwen3.5-Omni: Total tokens = Audio duration (in seconds) × 7
- Qwen3-Omni-Flash: Total tokens = Audio duration (in seconds) × 12.5
- Qwen-Omni-Turbo: Total tokens = Audio duration (in seconds) × 25
Audio shorter than 1 second is calculated as 1 second.

Free quota

To claim, query, and use your free quota, see Free quota for new users.

Rate limits

For rate limit rules and FAQ, see Rate limiting.

Error codes

If the model call fails and returns an error message, see Error messages for resolution.

Voice list

See Voice list.