from openai import OpenAI
import os

PROMPT_TICKET_EXTRACTION = """
Please extract the invoice number, train number, departure station, arrival station, departure date and time, seat number, seat class, ticket price, ID card number, and passenger name from the train ticket image.
You must accurately extract the key information. Do not omit or fabricate information. Replace any single character that is blurry or obscured by strong light with an English question mark (?).
Return the data in JSON format as follows: {'invoice_number': 'xxx', 'train_number': 'xxx', 'departure_station': 'xxx', 'arrival_station': 'xxx', 'departure_date_and_time': 'xxx', 'seat_number': 'xxx', 'seat_class': 'xxx', 'ticket_price': 'xxx', 'id_card_number': 'xxx', 'passenger_name': 'xxx'}
"""

try:
    client = OpenAI(
        # API keys for the Singapore and Beijing regions are different. To obtain an API key, see https://www.alibabacloud.com/help/en/model-studio/get-api-key.
        # If the environment variable is not configured, replace the following line with your Model Studio API key: api_key="sk-xxx",
        api_key=os.getenv("DASHSCOPE_API_KEY"),
        # The following is the URL for the Singapore region. If you use a model in the Beijing region, replace the URL with: https://dashscope.aliyuncs.com/compatible-mode/v1
        base_url="https://dashscope-intl.aliyuncs.com/compatible-mode/v1",
    )
    completion = client.chat.completions.create(
        model="qwen-vl-ocr-2025-11-20",
        messages=[
            {
                "role": "user",
                "content": [
                    {
                        "type": "image_url",
                        "image_url": {"url": "https://img.alicdn.com/imgextra/i2/O1CN01ktT8451iQutqReELT_!!6000000004408-0-tps-689-487.jpg"},
                        # The minimum pixel threshold for the input image. If the image has fewer pixels than this value, it is enlarged until its total pixel count exceeds min_pixels.
                        "min_pixels": 32 * 32 * 3,
                        # The maximum pixel threshold for the input image. If the image has more pixels than this value, it is reduced until its total pixel count is below max_pixels.
                        "max_pixels": 32 * 32 * 8192
                    },
                    # The model supports passing a prompt in the following text field. If no prompt is passed, the default prompt is used: Please output only the text content from the image without any additional descriptions or formatting.
                    {"type": "text", "text": PROMPT_TICKET_EXTRACTION}
                ]
            }
        ]
    )
    print(completion.choices[0].message.content)
except Exception as e:
    print(f"Error message: {e}")
Node.js
import OpenAI from 'openai';

// Define the prompt for extracting train ticket information.
const PROMPT_TICKET_EXTRACTION = `
Please extract the invoice number, train number, departure station, arrival station, departure date and time, seat number, seat class, ticket price, ID card number, and passenger name from the train ticket image.
You must accurately extract the key information. Do not omit or fabricate information. Replace any single character that is blurry or obscured by strong light with an English question mark (?).
Return the data in JSON format as follows: {'invoice_number': 'xxx', 'train_number': 'xxx', 'departure_station': 'xxx', 'arrival_station': 'xxx', 'departure_date_and_time': 'xxx', 'seat_number': 'xxx', 'seat_class': 'xxx', 'ticket_price': 'xxx', 'id_card_number': 'xxx', 'passenger_name': 'xxx'}
`;

const client = new OpenAI({
    // API keys for the Singapore and Beijing regions are different. To obtain an API key, see https://www.alibabacloud.com/help/en/model-studio/get-api-key.
    // If the environment variable is not configured, replace the following line with your Model Studio API key: apiKey: "sk-xxx",
    apiKey: process.env.DASHSCOPE_API_KEY,
    // The following is the URL for the Singapore region. If you use a model in the Beijing region, replace the URL with: https://dashscope.aliyuncs.com/compatible-mode/v1
    baseURL: 'https://dashscope-intl.aliyuncs.com/compatible-mode/v1',
});

async function main() {
    const response = await client.chat.completions.create({
        model: 'qwen-vl-ocr-2025-11-20',
        messages: [
            {
                role: 'user',
                content: [
                    // The model supports passing a prompt in the text field. If no prompt is passed, the default prompt is used: Please output only the text content from the image without any additional descriptions or formatting.
                    { type: 'text', text: PROMPT_TICKET_EXTRACTION },
                    {
                        type: 'image_url',
                        image_url: {
                            url: 'https://img.alicdn.com/imgextra/i2/O1CN01ktT8451iQutqReELT_!!6000000004408-0-tps-689-487.jpg',
                        },
                        // The minimum pixel threshold for the input image. If the image has fewer pixels than this value, it is enlarged until its total pixel count exceeds min_pixels.
                        min_pixels: 32 * 32 * 3,
                        // The maximum pixel threshold for the input image. If the image has more pixels than this value, it is reduced until its total pixel count is below max_pixels.
                        max_pixels: 32 * 32 * 8192
                    }
                ]
            }
        ],
    });
    console.log(response.choices[0].message.content);
}

main();
curl
# ======= Important =======
# API keys for the Singapore and Beijing regions are different. To obtain an API key, see https://www.alibabacloud.com/help/en/model-studio/get-api-key.
# The following is the URL for the Singapore region. If you use a model in the Beijing region, replace the URL with: https://dashscope.aliyuncs.com/compatible-mode/v1/chat/completions
# === Delete this comment before execution ===
curl -X POST https://dashscope-intl.aliyuncs.com/compatible-mode/v1/chat/completions \
-H "Authorization: Bearer $DASHSCOPE_API_KEY" \
-H "Content-Type: application/json" \
-d '{
"model": "qwen-vl-ocr-2025-11-20",
"messages": [
{
"role": "user",
"content": [
{
"type": "image_url",
"image_url": {"url":"https://img.alicdn.com/imgextra/i2/O1CN01ktT8451iQutqReELT_!!6000000004408-0-tps-689-487.jpg"},
"min_pixels": 3072,
"max_pixels": 8388608
},
{"type": "text", "text": "Please extract the invoice number, train number, departure station, arrival station, departure date and time, seat number, seat class, ticket price, ID card number, and passenger name from the train ticket image. You must accurately extract the key information. Do not omit or fabricate information. Replace any single character that is blurry or obscured by strong light with an English question mark (?). Return the data in JSON format as follows: {\'invoice_number\': \'xxx\', \'train_number\': \'xxx\', \'departure_station\': \'xxx\', \'arrival_station\': \'xxx\', \'departure_date_and_time\':\'xxx\', \'seat_number\': \'xxx\', \'seat_class\': \'xxx\',\'ticket_price\':\'xxx\', \'id_card_number\': \'xxx\', \'passenger_name\': \'xxx\'}"}
]
}
]
}'
Streaming output
Python
import os
from openai import OpenAI

PROMPT_TICKET_EXTRACTION = """
Please extract the invoice number, train number, departure station, arrival station, departure date and time, seat number, seat class, ticket price, ID card number, and passenger name from the train ticket image.
You must accurately extract the key information. Do not omit or fabricate information. Replace any single character that is blurry or obscured by strong light with an English question mark (?).
Return the data in JSON format as follows: {'invoice_number': 'xxx', 'train_number': 'xxx', 'departure_station': 'xxx', 'arrival_station': 'xxx', 'departure_date_and_time': 'xxx', 'seat_number': 'xxx', 'seat_class': 'xxx', 'ticket_price': 'xxx', 'id_card_number': 'xxx', 'passenger_name': 'xxx'}
"""

client = OpenAI(
    # API keys for the Singapore and Beijing regions are different. To obtain an API key, see https://www.alibabacloud.com/help/en/model-studio/get-api-key.
    # If the environment variable is not configured, replace the following line with your Model Studio API key: api_key="sk-xxx",
    api_key=os.getenv("DASHSCOPE_API_KEY"),
    # The following is the URL for the Singapore region. If you use a model in the Beijing region, replace the URL with: https://dashscope.aliyuncs.com/compatible-mode/v1
    base_url="https://dashscope-intl.aliyuncs.com/compatible-mode/v1",
)
completion = client.chat.completions.create(
    model="qwen-vl-ocr-2025-11-20",
    messages=[
        {
            "role": "user",
            "content": [
                {
                    "type": "image_url",
                    "image_url": {"url": "https://img.alicdn.com/imgextra/i2/O1CN01ktT8451iQutqReELT_!!6000000004408-0-tps-689-487.jpg"},
                    # The minimum pixel threshold for the input image. If the image has fewer pixels than this value, it is enlarged until its total pixel count exceeds min_pixels.
                    "min_pixels": 32 * 32 * 3,
                    # The maximum pixel threshold for the input image. If the image has more pixels than this value, it is reduced until its total pixel count is below max_pixels.
                    "max_pixels": 32 * 32 * 8192
                },
                # The model supports passing a prompt in the following text field. If no prompt is passed, the default prompt is used: Please output only the text content from the image without any additional descriptions or formatting.
                {"type": "text", "text": PROMPT_TICKET_EXTRACTION}
            ]
        }
    ],
    stream=True,
    stream_options={"include_usage": True}
)
for chunk in completion:
    print(chunk.model_dump_json())
Node.js
import OpenAI from 'openai';

// Define the prompt for extracting train ticket information.
const PROMPT_TICKET_EXTRACTION = `
Please extract the invoice number, train number, departure station, arrival station, departure date and time, seat number, seat class, ticket price, ID card number, and passenger name from the train ticket image.
You must accurately extract the key information. Do not omit or fabricate information. Replace any single character that is blurry or obscured by strong light with an English question mark (?).
Return the data in JSON format as follows: {'invoice_number': 'xxx', 'train_number': 'xxx', 'departure_station': 'xxx', 'arrival_station': 'xxx', 'departure_date_and_time': 'xxx', 'seat_number': 'xxx', 'seat_class': 'xxx', 'ticket_price': 'xxx', 'id_card_number': 'xxx', 'passenger_name': 'xxx'}
`;

const openai = new OpenAI({
    // API keys for the Singapore and Beijing regions are different. To obtain an API key, see https://www.alibabacloud.com/help/en/model-studio/get-api-key.
    // If the environment variable is not configured, replace the following line with your Model Studio API key: apiKey: "sk-xxx",
    apiKey: process.env.DASHSCOPE_API_KEY,
    // The following is the URL for the Singapore region. If you use a model in the Beijing region, replace the URL with: https://dashscope.aliyuncs.com/compatible-mode/v1
    baseURL: 'https://dashscope-intl.aliyuncs.com/compatible-mode/v1',
});

async function main() {
    const response = await openai.chat.completions.create({
        model: 'qwen-vl-ocr-2025-11-20',
        messages: [
            {
                role: 'user',
                content: [
                    // The model supports passing a prompt in the following text field. If no prompt is passed, the default prompt is used: Please output only the text content from the image without any additional descriptions or formatting.
                    { type: 'text', text: PROMPT_TICKET_EXTRACTION },
                    {
                        type: 'image_url',
                        image_url: {
                            url: 'https://img.alicdn.com/imgextra/i2/O1CN01ktT8451iQutqReELT_!!6000000004408-0-tps-689-487.jpg',
                        },
                        // The minimum pixel threshold for the input image. If the image has fewer pixels than this value, it is enlarged until its total pixel count exceeds min_pixels.
                        min_pixels: 32 * 32 * 3,
                        // The maximum pixel threshold for the input image. If the image has more pixels than this value, it is reduced until its total pixel count is below max_pixels.
                        max_pixels: 32 * 32 * 8192
                    }
                ]
            }
        ],
        stream: true,
        stream_options: { include_usage: true }
    });
    let fullContent = '';
    console.log('Streaming output content:');
    for await (const chunk of response) {
        if (chunk.choices[0] && chunk.choices[0].delta.content != null) {
            fullContent += chunk.choices[0].delta.content;
            console.log(chunk.choices[0].delta.content);
        }
    }
    console.log(`Full output content: ${fullContent}`);
}

main();
curl
# ======= Important =======
# API keys for the Singapore and Beijing regions are different. To obtain an API key, see https://www.alibabacloud.com/help/en/model-studio/get-api-key.
# The following is the URL for the Singapore region. If you use a model in the Beijing region, replace the URL with: https://dashscope.aliyuncs.com/compatible-mode/v1/chat/completions
# === Delete this comment before execution ===
curl -X POST https://dashscope-intl.aliyuncs.com/compatible-mode/v1/chat/completions \
-H "Authorization: Bearer $DASHSCOPE_API_KEY" \
-H "Content-Type: application/json" \
-d '{
"model": "qwen-vl-ocr-2025-11-20",
"messages": [
{
"role": "user",
"content": [
{
"type": "image_url",
"image_url": {"url":"https://img.alicdn.com/imgextra/i2/O1CN01ktT8451iQutqReELT_!!6000000004408-0-tps-689-487.jpg"},
"min_pixels": 3072,
"max_pixels": 8388608
},
{"type": "text", "text": "Please extract the invoice number, train number, departure station, arrival station, departure date and time, seat number, seat class, ticket price, ID card number, and passenger name from the train ticket image. You must accurately extract the key information. Do not omit or fabricate information. Replace any single character that is blurry or obscured by strong light with an English question mark (?). Return the data in JSON format as follows: {\'invoice_number\': \'xxx\', \'train_number\': \'xxx\', \'departure_station\': \'xxx\', \'arrival_station\': \'xxx\', \'departure_date_and_time\':\'xxx\', \'seat_number\': \'xxx\', \'seat_class\': \'xxx\',\'ticket_price\':\'xxx\', \'id_card_number\': \'xxx\', \'passenger_name\': \'xxx\'}"}
]
}
],
"stream": true,
"stream_options": {"include_usage": true}
}'
model string(Required)
Specifies the model name. For a list of supported models, see Qwen-OCR.
messages array(Required)
The context to pass to the model, as a sequence of messages in conversational order.
Message types
User Message object(Required)
A user message that provides instructions and the image for the model to process.
Properties
content array (Required)
The message content.
Properties
type string(Required)
Valid values:
text
Use text for text input.
image_url
Use image_url for image input.
text string(Optional)
The input text.
The default value is: Please output only the text content from the image without any additional descriptions or formatting. This means the model extracts all text from the image by default.
image_url object
The input image information. This parameter is required when type is image_url.
Properties
url string(Required)
The URL or Base64 Data URL of the image. For more information about passing a local file, see Text extraction.
min_pixels integer(Optional)
The minimum pixel threshold for the input image, in pixels.
If the input image has fewer pixels than min_pixels, the image is enlarged until its total pixel count exceeds min_pixels.
Image tokens and pixels
The number of pixels per image token varies by model:
qwen-vl-ocr-latest, qwen-vl-ocr-2025-11-20: Each token corresponds to 32×32 pixels.
qwen-vl-ocr, qwen-vl-ocr-2025-08-28, and earlier models: Each token corresponds to 28×28 pixels.
Value range for min_pixels
qwen-vl-ocr-latest, qwen-vl-ocr-2025-11-20: The default and minimum value is 3072 (that is, 3×32×32).
qwen-vl-ocr, qwen-vl-ocr-2025-08-28, and earlier models: The default and minimum value is 3136 (that is, 4×28×28).
max_pixels integer(Optional)
The maximum pixel threshold for the input image, in pixels.
If the pixel count of the input image is within the [min_pixels, max_pixels] range, the model processes the original image without resizing. If the pixel count of the input image is greater than max_pixels, the image is scaled down until the total pixel count is less than max_pixels.
Image tokens and pixels
The number of pixels per image token varies by model:
qwen-vl-ocr-latest, qwen-vl-ocr-2025-11-20: Each token corresponds to 32×32 pixels.
qwen-vl-ocr, qwen-vl-ocr-2025-08-28, and earlier models: Each token corresponds to 28×28 pixels.
Value range for max_pixels
qwen-vl-ocr-latest, qwen-vl-ocr-2025-11-20
Default value: 8388608 (that is, 8192×32×32)
Maximum value: 30720000 (that is, 30000×32×32)
qwen-vl-ocr, qwen-vl-ocr-2025-08-28, and earlier models
Default value: 6422528 (that is, 8192×28×28)
Maximum value: 23520000 (that is, 30000×28×28)
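To make the min_pixels and max_pixels thresholds above concrete, the following standalone Python sketch mirrors the documented resizing rules for the 32×32-pixels-per-token models. It is an illustration only, using the default thresholds; the service performs the actual resizing, and its exact rounding may differ.
# Illustrative only: mirrors the documented min_pixels/max_pixels rules for
# qwen-vl-ocr-latest and qwen-vl-ocr-2025-11-20 (32x32 pixels per token).
PIXELS_PER_TOKEN = 32 * 32
MIN_PIXELS = 3 * PIXELS_PER_TOKEN      # default min_pixels: 3072
MAX_PIXELS = 8192 * PIXELS_PER_TOKEN   # default max_pixels: 8388608

def describe_image(width: int, height: int) -> str:
    total = width * height
    if total < MIN_PIXELS:
        return f"{total} px: below min_pixels, scaled up before processing"
    if total > MAX_PIXELS:
        return f"{total} px: above max_pixels, scaled down before processing"
    # Within [min_pixels, max_pixels], the original image is processed as-is.
    return f"{total} px: kept as-is, about {total // PIXELS_PER_TOKEN} image tokens"

print(describe_image(689, 487))    # the sample train ticket image above
print(describe_image(6000, 3000))  # larger than max_pixels, so scaled down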
role string(Required)
The role of the user message. The value is fixed to user.
stream boolean(Optional) Defaults to: false
Specifies whether to return the response in streaming mode.
Valid values:
false: Waits for the model to generate the complete response and returns it all at once.
true: Returns data blocks as the model generates them. The client must read the blocks sequentially to reconstruct the complete response.
stream_options object(Optional)
Configuration items for streaming output. This parameter takes effect only when stream is true.
Properties
include_usage boolean(Optional) Defaults to: false
Specifies whether to include token usage information in the last data block of the stream.
Valid values:
true: The last data block includes token usage statistics.
false: Token usage statistics are not returned.
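When include_usage is true, the final data block arrives with an empty choices list and a populated usage field. A minimal sketch of consuming such a stream with the OpenAI Python SDK, assuming completion was created with stream=True and stream_options={"include_usage": True} as in the streaming example above:
full_text = ""
for chunk in completion:
    if chunk.choices:  # regular data blocks carry incremental content
        full_text += chunk.choices[0].delta.content or ""
    elif chunk.usage:  # the final block has empty choices and the usage stats
        print("prompt tokens:", chunk.usage.prompt_tokens)
        print("completion tokens:", chunk.usage.completion_tokens)
print(full_text)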
max_tokens integer(Optional)
The maximum number of tokens to generate in the output. If the generated content exceeds this value, the response is truncated.
For qwen-vl-ocr-latest, qwen-vl-ocr-2025-11-20, and qwen-vl-ocr-2024-10-28, the default and maximum values are the same as the model's maximum output length. For more information, see Models and pricing.
For qwen-vl-ocr, qwen-vl-ocr-2025-04-13, and qwen-vl-ocr-2025-08-28, the default and maximum value is 4096.
To increase this parameter's value to a number between 4097 and 8192, send an email to modelstudio@service.aliyun.com. Your email must include the following information: your Alibaba Cloud account ID, the image type (such as document, e-commerce, or contract), the model name, your estimated queries per second (QPS) and total daily requests, and the percentage of requests where the model output exceeds 4096 tokens.
logprobs boolean(Optional) Defaults to: false
Specifies whether to return the log probabilities of the output tokens. Valid values:
true
false
top_logprobs integer(Optional) Defaults to: 0
Specifies the number of most likely tokens to return at each generation step.
Value range: [0, 5]
This parameter takes effect only when logprobs is true.
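A minimal sketch of requesting log probabilities with the OpenAI Python SDK; it assumes client is configured as in the examples above and messages is the same image-and-prompt payload:
completion = client.chat.completions.create(
    model="qwen-vl-ocr-2025-11-20",
    messages=messages,  # the same image-and-prompt payload as above
    logprobs=True,      # return log probabilities for each output token
    top_logprobs=2,     # also return the 2 most likely alternatives per step
)
for item in completion.choices[0].logprobs.content:
    print(item.token, item.logprob, [alt.token for alt in item.top_logprobs])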
temperature float(Optional) Defaults to: 0.01
The sampling temperature, which controls the diversity of the text generated by the model.
A higher temperature results in more diverse text, while a lower temperature results in more deterministic text.
Value range: [0, 2)
Because both temperature and top_p control the diversity of the generated text, you should set only one of them.
We recommend that you use the default value.
top_p float(Optional) Defaults to: 0.001
The probability threshold for nucleus sampling, which controls the diversity of the text generated by the model.
A higher top_p results in more diverse text. A lower top_p results in more deterministic text.
Value range: (0, 1.0]
Because both temperature and top_p control the diversity of the generated text, you should set only one of them.
We recommend that you use the default value.
top_k integer(Optional) Defaults to: 1
The size of the candidate set for sampling during generation. For example, if the value is 50, only the 50 tokens with the highest scores in a single generation form the candidate set for random sampling. A larger value increases randomness, while a smaller value increases determinism. If the value is None or greater than 100, the top_k policy is not enabled. In this case, only the top_p policy takes effect.
The value must be greater than or equal to 0.
This parameter is not a standard OpenAI parameter. When calling with the Python SDK, place it in the extra_body object. For example: extra_body={"top_k": xxx}. When calling with the Node.js SDK or by HTTP, pass it as a top-level parameter.
We recommend that you use the default value.
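For example, the extra_body placement with the Python SDK looks as follows; this is a sketch that reuses the client and messages from the examples above:
completion = client.chat.completions.create(
    model="qwen-vl-ocr-2025-11-20",
    messages=messages,
    # top_k is not a standard OpenAI parameter, so the Python SDK requires it
    # inside extra_body. With the Node.js SDK or raw HTTP, pass top_k at the
    # top level of the request body instead.
    extra_body={"top_k": 1},
)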
repetition_penalty float(Optional) Defaults to: 1.0
The repetition penalty for consecutive sequences during model generation. Increasing repetition_penalty reduces repetition in the generated text. A value of 1.0 means no penalty.
We recommend that you use the default value because this parameter significantly affects the model's performance.
presence_penalty float(Optional) Defaults to: 0.0
Controls the repetition of content in the text generated by the model.
Value range: [-2.0, 2.0]. A positive value reduces repetition, while a negative value increases it.
Increase this value for scenarios that require diversity, creativity, or brainstorming, such as creative writing. Decrease this value for scenarios that emphasize consistency and terminological accuracy, such as technical documents or formal texts.
How it works
If the parameter value is positive, the model applies a penalty to tokens that already exist in the current text. The penalty is independent of the number of times the token appears. This reduces the likelihood of these tokens reappearing, thereby decreasing content repetition and increasing word diversity.
We recommend that you use the default value.
seed integer(Optional)
A random number seed. Using a seed ensures reproducible results for the same input and parameters. If you pass the same seed in a call and keep other parameters unchanged, the model returns a deterministic result.
Value range: [0, 2³¹−1].
We recommend that you use the default value.
stop string or array(Optional)
Specifies stop words. When a string or token_id specified by stop appears in the generated text, generation stops immediately.
You can pass sensitive words to control the model's output.
If stop is an array, all elements must be of the same type, either all strings or all token IDs. You cannot mix them. For example, you cannot specify ["Hello",104307].
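A minimal sketch of passing stop words with the Python SDK, again assuming the client and messages from the examples above; the stop word here is hypothetical and only for illustration:
completion = client.chat.completions.create(
    model="qwen-vl-ocr-2025-11-20",
    messages=messages,
    # Generation halts as soon as any listed string appears in the output.
    # All elements must be the same type: all strings or all token IDs.
    stop=["passenger_name"],  # hypothetical stop word for illustration
)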
High-precision recognition
The following code provides an example of how to call the built-in high-precision recognition task. For more information, see Call a built-in task.
Python
import os
import dashscope
# The following is the URL for the Singapore region. If you use a model in the Beijing region, replace the URL with: https://dashscope.aliyuncs.com/api/v1
dashscope.base_http_api_url = 'https://dashscope-intl.aliyuncs.com/api/v1'
messages = [{
"role": "user",
"content": [{
"image": "https://help-static-aliyun-doc.aliyuncs.com/file-manage-files/zh-CN/20241108/ctdzex/biaozhun.jpg",
# The minimum pixel threshold for the input image. If the image is smaller than this value, it is scaled up until the total pixels exceed min_pixels.
"min_pixels": 32 * 32 * 3,
# The maximum pixel threshold for the input image. If the image is larger than this value, it is scaled down until the total pixels are below max_pixels.
"max_pixels": 32 * 32 * 8192,
# Specifies whether to automatically rotate the image (disabled here).
"enable_rotate": False}]
}]
response = dashscope.MultiModalConversation.call(
# If you have not configured the environment variable, replace the following line with your Model Studio API key: api_key="sk-xxx",
api_key=os.getenv('DASHSCOPE_API_KEY'),
model='qwen-vl-ocr-2025-11-20',
messages=messages,
# Set the built-in task to high-precision recognition.
ocr_options={"task": "advanced_recognition"}
)
# The high-precision recognition task returns the result as plain text.
print(response["output"]["choices"][0]["message"].content[0]["text"])
Java
// dashscope SDK version >= 2.21.8
import java.util.Arrays;
import java.util.Collections;
import java.util.Map;
import java.util.HashMap;
import com.alibaba.dashscope.aigc.multimodalconversation.MultiModalConversation;
import com.alibaba.dashscope.aigc.multimodalconversation.MultiModalConversationParam;
import com.alibaba.dashscope.aigc.multimodalconversation.MultiModalConversationResult;
import com.alibaba.dashscope.aigc.multimodalconversation.OcrOptions;
import com.alibaba.dashscope.common.MultiModalMessage;
import com.alibaba.dashscope.common.Role;
import com.alibaba.dashscope.exception.ApiException;
import com.alibaba.dashscope.exception.NoApiKeyException;
import com.alibaba.dashscope.exception.UploadFileException;
import com.alibaba.dashscope.utils.Constants;
public class Main {
static {
// The following is the URL for the Singapore region. If you use a model in the Beijing region, replace the URL with: https://dashscope.aliyuncs.com/api/v1
Constants.baseHttpApiUrl="https://dashscope-intl.aliyuncs.com/api/v1";
}
public static void simpleMultiModalConversationCall()
throws ApiException, NoApiKeyException, UploadFileException {
MultiModalConversation conv = new MultiModalConversation();
Map<String, Object> map = new HashMap<>();
map.put("image", "https://help-static-aliyun-doc.aliyuncs.com/file-manage-files/zh-CN/20241108/ctdzex/biaozhun.jpg");
// The maximum pixel threshold for the input image. If the image is larger than this value, it is scaled down until the total pixels are below max_pixels.
map.put("max_pixels", 8388608);
// The minimum pixel threshold for the input image. If the image is smaller than this value, it is scaled up until the total pixels exceed min_pixels.
map.put("min_pixels", 3072);
// Specifies whether to automatically rotate the image (disabled here).
map.put("enable_rotate", false);
// Configure the built-in OCR task.
OcrOptions ocrOptions = OcrOptions.builder()
.task(OcrOptions.Task.ADVANCED_RECOGNITION)
.build();
MultiModalMessage userMessage = MultiModalMessage.builder().role(Role.USER.getValue())
.content(Arrays.asList(
map
)).build();
MultiModalConversationParam param = MultiModalConversationParam.builder()
// If you have not configured the environment variable, replace the following line with your Model Studio API key: .apiKey("sk-xxx")
.apiKey(System.getenv("DASHSCOPE_API_KEY"))
.model("qwen-vl-ocr-2025-11-20")
.message(userMessage)
.ocrOptions(ocrOptions)
.build();
MultiModalConversationResult result = conv.call(param);
System.out.println(result.getOutput().getChoices().get(0).getMessage().getContent().get(0).get("text"));
}
public static void main(String[] args) {
try {
simpleMultiModalConversationCall();
} catch (ApiException | NoApiKeyException | UploadFileException e) {
System.out.println(e.getMessage());
}
System.exit(0);
}
}
curl
# ======= Important =======
# The API keys for the Singapore and Beijing regions are different. For more information, see https://www.alibabacloud.com/help/model-studio/get-api-key.
# The following is the URL for the Singapore region. If you use a model in the Beijing region, replace the URL with: https://dashscope.aliyuncs.com/api/v1/services/aigc/multimodal-generation/generation
# === Delete this comment before running ===
curl --location 'https://dashscope-intl.aliyuncs.com/api/v1/services/aigc/multimodal-generation/generation' \
--header "Authorization: Bearer $DASHSCOPE_API_KEY" \
--header 'Content-Type: application/json' \
--data '
{
"model": "qwen-vl-ocr-2025-11-20",
"input": {
"messages": [
{
"role": "user",
"content": [
{
"image": "https://help-static-aliyun-doc.aliyuncs.com/file-manage-files/zh-CN/20241108/ctdzex/biaozhun.jpg",
"min_pixels": 3072,
"max_pixels": 8388608,
"enable_rotate": false
}
]
}
]
},
"parameters": {
"ocr_options": {
"task": "advanced_recognition"
}
}
}
'
Information extraction
The following code provides an example of how to call the built-in information extraction task. For more information, see Call a built-in task.
Python
# Run pip install -U dashscope to update the SDK.
import os
import dashscope
# The following is the URL for the Singapore region. If you use a model in the Beijing region, replace the URL with: https://dashscope.aliyuncs.com/api/v1
dashscope.base_http_api_url = 'https://dashscope-intl.aliyuncs.com/api/v1'
messages = [
{
"role":"user",
"content":[
{
"image":"http://duguang-labelling.oss-cn-shanghai.aliyuncs.com/demo_ocr/receipt_zh_demo.jpg",
"min_pixels": 3072,
"max_pixels": 8388608,
"enable_rotate": False
}
]
}
]
params = {
"ocr_options":{
"task": "key_information_extraction",
"task_config": {
"result_schema": {
"Ride Date": "Corresponds to the ride date and time in the image, in the format YYYY-MM-DD, for example, 2025-03-05",
"Invoice Code": "Extract the invoice code from the image, usually a combination of numbers or letters",
"Invoice Number": "Extract the number from the invoice, usually composed of only digits."
}
}
}
}
response = dashscope.MultiModalConversation.call(
# The API keys for the Singapore and Beijing regions are different. For more information, see https://www.alibabacloud.com/help/model-studio/get-api-key.
api_key=os.getenv('DASHSCOPE_API_KEY'),
model='qwen-vl-ocr-2025-11-20',
messages=messages,
**params)
print(response.output.choices[0].message.content[0]["ocr_result"])
Java
import java.util.Arrays;
import java.util.Collections;
import java.util.Map;
import java.util.HashMap;
import com.alibaba.dashscope.aigc.multimodalconversation.MultiModalConversation;
import com.alibaba.dashscope.aigc.multimodalconversation.MultiModalConversationParam;
import com.alibaba.dashscope.aigc.multimodalconversation.MultiModalConversationResult;
import com.alibaba.dashscope.aigc.multimodalconversation.OcrOptions;
import com.alibaba.dashscope.common.MultiModalMessage;
import com.alibaba.dashscope.common.Role;
import com.alibaba.dashscope.exception.ApiException;
import com.alibaba.dashscope.exception.NoApiKeyException;
import com.alibaba.dashscope.exception.UploadFileException;
import com.google.gson.JsonObject;
import com.alibaba.dashscope.utils.Constants;
public class Main {
static {
// The following is the URL for the Singapore region. If you use a model in the Beijing region, replace the URL with: https://dashscope.aliyuncs.com/api/v1
Constants.baseHttpApiUrl="https://dashscope-intl.aliyuncs.com/api/v1";
}
public static void simpleMultiModalConversationCall()
throws ApiException, NoApiKeyException, UploadFileException {
MultiModalConversation conv = new MultiModalConversation();
Map<String, Object> map = new HashMap<>();
map.put("image", "http://duguang-labelling.oss-cn-shanghai.aliyuncs.com/demo_ocr/receipt_zh_demo.jpg");
// The maximum pixel threshold for the input image. If the image is larger than this value, it is scaled down until the total pixels are below max_pixels.
map.put("max_pixels", 8388608);
// The minimum pixel threshold for the input image. If the image is smaller than this value, it is scaled up until the total pixels exceed min_pixels.
map.put("min_pixels", 3072);
// Specifies whether to automatically rotate the image (disabled here).
map.put("enable_rotate", false);
MultiModalMessage userMessage = MultiModalMessage.builder().role(Role.USER.getValue())
.content(Arrays.asList(
map
)).build();
// Create the main JSON object.
JsonObject resultSchema = new JsonObject();
resultSchema.addProperty("Ride Date", "Corresponds to the ride date and time in the image, in the format YYYY-MM-DD, for example, 2025-03-05");
resultSchema.addProperty("Invoice Code", "Extract the invoice code from the image, usually a combination of numbers or letters");
resultSchema.addProperty("Invoice Number", "Extract the number from the invoice, usually composed of only digits.");
// Configure the built-in OCR task.
OcrOptions ocrOptions = OcrOptions.builder()
.task(OcrOptions.Task.KEY_INFORMATION_EXTRACTION)
.taskConfig(OcrOptions.TaskConfig.builder()
.resultSchema(resultSchema)
.build())
.build();
MultiModalConversationParam param = MultiModalConversationParam.builder()
// The API keys for the Singapore and Beijing regions are different. For more information, see https://www.alibabacloud.com/help/model-studio/get-api-key.
// If you have not configured the environment variable, replace the following line with your Model Studio API key: .apiKey("sk-xxx")
.apiKey(System.getenv("DASHSCOPE_API_KEY"))
.model("qwen-vl-ocr-2025-11-20")
.message(userMessage)
.ocrOptions(ocrOptions)
.build();
MultiModalConversationResult result = conv.call(param);
System.out.println(result.getOutput().getChoices().get(0).getMessage().getContent().get(0).get("ocr_result"));
}
public static void main(String[] args) {
try {
simpleMultiModalConversationCall();
} catch (ApiException | NoApiKeyException | UploadFileException e) {
System.out.println(e.getMessage());
}
System.exit(0);
}
}
curl
# ======= Important =======
# The API keys for the Singapore and Beijing regions are different. For more information, see https://www.alibabacloud.com/help/model-studio/get-api-key.
# The following is the URL for the Singapore region. If you use a model in the Beijing region, replace the URL with: https://dashscope.aliyuncs.com/api/v1/services/aigc/multimodal-generation/generation
# === Delete this comment before running ===
curl --location 'https://dashscope-intl.aliyuncs.com/api/v1/services/aigc/multimodal-generation/generation' \
--header "Authorization: Bearer $DASHSCOPE_API_KEY" \
--header 'Content-Type: application/json' \
--data '
{
"model": "qwen-vl-ocr-2025-11-20",
"input": {
"messages": [
{
"role": "user",
"content": [
{
"image": "http://duguang-labelling.oss-cn-shanghai.aliyuncs.com/demo_ocr/receipt_zh_demo.jpg",
"min_pixels": 3072,
"max_pixels": 8388608,
"enable_rotate": false
}
]
}
]
},
"parameters": {
"ocr_options": {
"task": "key_information_extraction",
"task_config": {
"result_schema": {
"Ride Date": "Corresponds to the ride date and time in the image, in the format YYYY-MM-DD, for example, 2025-03-05",
"Invoice Code": "Extract the invoice code from the image, usually a combination of numbers or letters",
"Invoice Number": "Extract the number from the invoice, usually composed of only digits."
}
}
}
}
}
'
Table parsing
The following code provides an example of how to call the built-in table parsing task. For more information, see Call a built-in task.
Python
import os
import dashscope
# The following is the URL for the Singapore region. If you use a model in the Beijing region, replace the URL with: https://dashscope.aliyuncs.com/api/v1
dashscope.base_http_api_url = 'https://dashscope-intl.aliyuncs.com/api/v1'
messages = [{
"role": "user",
"content": [{
"image": "http://duguang-llm.oss-cn-hangzhou.aliyuncs.com/llm_data_keeper/data/doc_parsing/tables/photo/eng/17.jpg",
# The minimum pixel threshold for the input image. If the image is smaller than this value, it is scaled up until the total pixels exceed min_pixels.
"min_pixels": 32 * 32 * 3,
# The maximum pixel threshold for the input image. If the image is larger than this value, it is scaled down until the total pixels are below max_pixels.
"max_pixels": 32 * 32 * 8192,
# Specifies whether to automatically rotate the image (disabled here).
"enable_rotate": False}]
}]
response = dashscope.MultiModalConversation.call(
# The API keys for the Singapore and Beijing regions are different. For more information, see https://www.alibabacloud.com/help/model-studio/get-api-key.
# If you have not configured the environment variable, replace the following line with your Model Studio API key: api_key="sk-xxx",
api_key=os.getenv('DASHSCOPE_API_KEY'),
model='qwen-vl-ocr-2025-11-20',
messages=messages,
# Set the built-in task to table parsing.
ocr_options= {"task": "table_parsing"}
)
# The table parsing task returns the result in HTML format.
print(response["output"]["choices"][0]["message"].content[0]["text"])
Java
import java.util.Arrays;
import java.util.Collections;
import java.util.Map;
import java.util.HashMap;
import com.alibaba.dashscope.aigc.multimodalconversation.MultiModalConversation;
import com.alibaba.dashscope.aigc.multimodalconversation.MultiModalConversationParam;
import com.alibaba.dashscope.aigc.multimodalconversation.MultiModalConversationResult;
import com.alibaba.dashscope.aigc.multimodalconversation.OcrOptions;
import com.alibaba.dashscope.common.MultiModalMessage;
import com.alibaba.dashscope.common.Role;
import com.alibaba.dashscope.exception.ApiException;
import com.alibaba.dashscope.exception.NoApiKeyException;
import com.alibaba.dashscope.exception.UploadFileException;
import com.alibaba.dashscope.utils.Constants;
public class Main {
static {
// The following is the URL for the Singapore region. If you use a model in the Beijing region, replace the URL with: https://dashscope.aliyuncs.com/api/v1
Constants.baseHttpApiUrl="https://dashscope-intl.aliyuncs.com/api/v1";
}
public static void simpleMultiModalConversationCall()
throws ApiException, NoApiKeyException, UploadFileException {
MultiModalConversation conv = new MultiModalConversation();
Map<String, Object> map = new HashMap<>();
map.put("image", "https://duguang-llm.oss-cn-hangzhou.aliyuncs.com/llm_data_keeper/data/doc_parsing/tables/photo/eng/17.jpg");
// The maximum pixel threshold for the input image. If the image is larger than this value, it is scaled down until the total pixels are below max_pixels.
map.put("max_pixels", 8388608);
// The minimum pixel threshold for the input image. If the image is smaller than this value, it is scaled up until the total pixels exceed min_pixels.
map.put("min_pixels",3072);
// Specifies whether to automatically rotate the image (disabled here).
map.put("enable_rotate", false);
// Configure the built-in OCR task.
OcrOptions ocrOptions = OcrOptions.builder()
.task(OcrOptions.Task.TABLE_PARSING)
.build();
MultiModalMessage userMessage = MultiModalMessage.builder().role(Role.USER.getValue())
.content(Arrays.asList(
map
)).build();
MultiModalConversationParam param = MultiModalConversationParam.builder()
// The API keys for the Singapore and Beijing regions are different. For more information, see https://www.alibabacloud.com/help/model-studio/get-api-key.
// If you have not configured the environment variable, replace the following line with your Model Studio API key: .apiKey("sk-xxx")
.apiKey(System.getenv("DASHSCOPE_API_KEY"))
.model("qwen-vl-ocr-2025-11-20")
.message(userMessage)
.ocrOptions(ocrOptions)
.build();
MultiModalConversationResult result = conv.call(param);
System.out.println(result.getOutput().getChoices().get(0).getMessage().getContent().get(0).get("text"));
}
public static void main(String[] args) {
try {
simpleMultiModalConversationCall();
} catch (ApiException | NoApiKeyException | UploadFileException e) {
System.out.println(e.getMessage());
}
System.exit(0);
}
}
curl
# ======= Important =======
# The API keys for the Singapore and Beijing regions are different. For more information, see https://www.alibabacloud.com/help/model-studio/get-api-key.
# The following is the URL for the Singapore region. If you use a model in the Beijing region, replace the URL with: https://dashscope.aliyuncs.com/api/v1/services/aigc/multimodal-generation/generation
# === Delete this comment before running ===
curl --location 'https://dashscope-intl.aliyuncs.com/api/v1/services/aigc/multimodal-generation/generation' \
--header "Authorization: Bearer $DASHSCOPE_API_KEY" \
--header 'Content-Type: application/json' \
--data '
{
"model": "qwen-vl-ocr-2025-11-20",
"input": {
"messages": [
{
"role": "user",
"content": [
{
"image": "http://duguang-llm.oss-cn-hangzhou.aliyuncs.com/llm_data_keeper/data/doc_parsing/tables/photo/eng/17.jpg",
"min_pixels": 3072,
"max_pixels": 8388608,
"enable_rotate": false
}
]
}
]
},
"parameters": {
"ocr_options": {
"task": "table_parsing"
}
}
}
'
Document parsing
The following code provides an example of how to call the built-in document parsing task. For more information, see Call a built-in task.
Python
import os
import dashscope
# The following is the URL for the Singapore region. If you use a model in the Beijing region, replace the URL with: https://dashscope.aliyuncs.com/api/v1
dashscope.base_http_api_url = 'https://dashscope-intl.aliyuncs.com/api/v1'
messages = [{
"role": "user",
"content": [{
"image": "https://img.alicdn.com/imgextra/i1/O1CN01ukECva1cisjyK6ZDK_!!6000000003635-0-tps-1500-1734.jpg",
# The minimum pixel threshold for the input image. If the image is smaller than this value, it is scaled up until the total pixels exceed min_pixels.
"min_pixels": 32 * 32 * 3,
# The maximum pixel threshold for the input image. If the image is larger than this value, it is scaled down until the total pixels are below max_pixels.
"max_pixels": 32 * 32 * 8192,
# Specifies whether to automatically rotate the image (disabled here).
"enable_rotate": False}]
}]
response = dashscope.MultiModalConversation.call(
# The API keys for the Singapore and Beijing regions are different. For more information, see https://www.alibabacloud.com/help/model-studio/get-api-key.
# If you have not configured the environment variable, replace the following line with your Model Studio API key: api_key="sk-xxx",
api_key=os.getenv('DASHSCOPE_API_KEY'),
model='qwen-vl-ocr-2025-11-20',
messages=messages,
# Set the built-in task to document parsing.
ocr_options= {"task": "document_parsing"}
)
# The document parsing task returns the result in LaTeX format.
print(response["output"]["choices"][0]["message"].content[0]["text"])
Java
import java.util.Arrays;
import java.util.Collections;
import java.util.Map;
import java.util.HashMap;
import com.alibaba.dashscope.aigc.multimodalconversation.MultiModalConversation;
import com.alibaba.dashscope.aigc.multimodalconversation.MultiModalConversationParam;
import com.alibaba.dashscope.aigc.multimodalconversation.MultiModalConversationResult;
import com.alibaba.dashscope.aigc.multimodalconversation.OcrOptions;
import com.alibaba.dashscope.common.MultiModalMessage;
import com.alibaba.dashscope.common.Role;
import com.alibaba.dashscope.exception.ApiException;
import com.alibaba.dashscope.exception.NoApiKeyException;
import com.alibaba.dashscope.exception.UploadFileException;
import com.alibaba.dashscope.utils.Constants;
public class Main {
static {
// The following is the URL for the Singapore region. If you use a model in the Beijing region, replace the URL with: https://dashscope.aliyuncs.com/api/v1
Constants.baseHttpApiUrl="https://dashscope-intl.aliyuncs.com/api/v1";
}
public static void simpleMultiModalConversationCall()
throws ApiException, NoApiKeyException, UploadFileException {
MultiModalConversation conv = new MultiModalConversation();
Map<String, Object> map = new HashMap<>();
map.put("image", "https://img.alicdn.com/imgextra/i1/O1CN01ukECva1cisjyK6ZDK_!!6000000003635-0-tps-1500-1734.jpg");
// The maximum pixel threshold for the input image. If the image is larger than this value, it is scaled down until the total pixels are below max_pixels.
map.put("max_pixels", 8388608);
// The minimum pixel threshold for the input image. If the image is smaller than this value, it is scaled up until the total pixels exceed min_pixels.
map.put("min_pixels", 3072);
// Specifies whether to automatically rotate the image (disabled here).
map.put("enable_rotate", false);
// Configure the built-in OCR task.
OcrOptions ocrOptions = OcrOptions.builder()
.task(OcrOptions.Task.DOCUMENT_PARSING)
.build();
MultiModalMessage userMessage = MultiModalMessage.builder().role(Role.USER.getValue())
.content(Arrays.asList(
map
)).build();
MultiModalConversationParam param = MultiModalConversationParam.builder()
// The API keys for the Singapore and Beijing regions are different. For more information, see https://www.alibabacloud.com/help/model-studio/get-api-key.
// If you have not configured the environment variable, replace the following line with your Model Studio API key: .apiKey("sk-xxx")
.apiKey(System.getenv("DASHSCOPE_API_KEY"))
.model("qwen-vl-ocr-2025-11-20")
.message(userMessage)
.ocrOptions(ocrOptions)
.build();
MultiModalConversationResult result = conv.call(param);
System.out.println(result.getOutput().getChoices().get(0).getMessage().getContent().get(0).get("text"));
}
public static void main(String[] args) {
try {
simpleMultiModalConversationCall();
} catch (ApiException | NoApiKeyException | UploadFileException e) {
System.out.println(e.getMessage());
}
System.exit(0);
}
}
curl
# ======= Important =======
# The API keys for the Singapore and Beijing regions are different. For more information, see https://www.alibabacloud.com/help/model-studio/get-api-key.
# The following is the URL for the Singapore region. If you use a model in the Beijing region, replace the URL with: https://dashscope.aliyuncs.com/api/v1/services/aigc/multimodal-generation/generation
# === Delete this comment before running ===
curl --location 'https://dashscope-intl.aliyuncs.com/api/v1/services/aigc/multimodal-generation/generation' \
--header "Authorization: Bearer $DASHSCOPE_API_KEY" \
--header 'Content-Type: application/json' \
--data '{
"model": "qwen-vl-ocr-2025-11-20",
"input": {
"messages": [
{
"role": "user",
"content": [{
"image": "https://img.alicdn.com/imgextra/i1/O1CN01ukECva1cisjyK6ZDK_!!6000000003635-0-tps-1500-1734.jpg",
"min_pixels": 3072,
"max_pixels": 8388608,
"enable_rotate": false
}
]
}
]
},
"parameters": {
"ocr_options": {
"task": "document_parsing"
}
}
}
'
Formula recognition
The following code provides an example of how to call the built-in formula recognition task. For more information, see Call a built-in task.
Python
import os
import dashscope
# The following is the URL for the Singapore region. If you use a model in the Beijing region, replace the URL with: https://dashscope.aliyuncs.com/api/v1
dashscope.base_http_api_url = 'https://dashscope-intl.aliyuncs.com/api/v1'
messages = [{
"role": "user",
"content": [{
"image": "http://duguang-llm.oss-cn-hangzhou.aliyuncs.com/llm_data_keeper/data/formula_handwriting/test/inline_5_4.jpg",
# The minimum pixel threshold for the input image. If the image is smaller than this value, it is scaled up until the total pixels exceed min_pixels.
"min_pixels": 32 * 32 * 3,
# The maximum pixel threshold for the input image. If the image is larger than this value, it is scaled down until the total pixels are below max_pixels.
"max_pixels": 32 * 32 * 8192,
# Specifies whether to automatically rotate the image (disabled here).
"enable_rotate": False}]
}]
response = dashscope.MultiModalConversation.call(
# The API keys for the Singapore and Beijing regions are different. For more information, see https://www.alibabacloud.com/help/model-studio/get-api-key.
# If you have not configured the environment variable, replace the following line with your Model Studio API key: api_key="sk-xxx",
api_key=os.getenv('DASHSCOPE_API_KEY'),
model='qwen-vl-ocr-2025-11-20',
messages=messages,
# Set the built-in task to formula recognition.
ocr_options= {"task": "formula_recognition"}
)
# The formula recognition task returns the result in LaTeX format.
print(response["output"]["choices"][0]["message"].content[0]["text"])
Java
import java.util.Arrays;
import java.util.Collections;
import java.util.Map;
import java.util.HashMap;
import com.alibaba.dashscope.aigc.multimodalconversation.MultiModalConversation;
import com.alibaba.dashscope.aigc.multimodalconversation.MultiModalConversationParam;
import com.alibaba.dashscope.aigc.multimodalconversation.MultiModalConversationResult;
import com.alibaba.dashscope.aigc.multimodalconversation.OcrOptions;
import com.alibaba.dashscope.common.MultiModalMessage;
import com.alibaba.dashscope.common.Role;
import com.alibaba.dashscope.exception.ApiException;
import com.alibaba.dashscope.exception.NoApiKeyException;
import com.alibaba.dashscope.exception.UploadFileException;
import com.alibaba.dashscope.utils.Constants;
public class Main {
static {
// The following is the URL for the Singapore region. If you use a model in the Beijing region, replace the URL with: https://dashscope.aliyuncs.com/api/v1
Constants.baseHttpApiUrl="https://dashscope-intl.aliyuncs.com/api/v1";
}
public static void simpleMultiModalConversationCall()
throws ApiException, NoApiKeyException, UploadFileException {
MultiModalConversation conv = new MultiModalConversation();
Map<String, Object> map = new HashMap<>();
map.put("image", "http://duguang-llm.oss-cn-hangzhou.aliyuncs.com/llm_data_keeper/data/formula_handwriting/test/inline_5_4.jpg");
// The maximum pixel threshold for the input image. If the image is larger than this value, it is scaled down until the total pixels are below max_pixels.
map.put("max_pixels", 8388608);
// The minimum pixel threshold for the input image. If the image is smaller than this value, it is scaled up until the total pixels exceed min_pixels.
map.put("min_pixels", 3072);
// Specifies whether to automatically rotate the image (disabled here).
map.put("enable_rotate", false);
// Configure the built-in OCR task.
OcrOptions ocrOptions = OcrOptions.builder()
.task(OcrOptions.Task.FORMULA_RECOGNITION)
.build();
MultiModalMessage userMessage = MultiModalMessage.builder().role(Role.USER.getValue())
.content(Arrays.asList(
map
)).build();
MultiModalConversationParam param = MultiModalConversationParam.builder()
// The API keys for the Singapore and Beijing regions are different. For more information, see https://www.alibabacloud.com/help/model-studio/get-api-key.
// If you have not configured the environment variable, replace the following line with your Model Studio API key: .apiKey("sk-xxx")
.apiKey(System.getenv("DASHSCOPE_API_KEY"))
.model("qwen-vl-ocr-2025-11-20")
.message(userMessage)
.ocrOptions(ocrOptions)
.build();
MultiModalConversationResult result = conv.call(param);
System.out.println(result.getOutput().getChoices().get(0).getMessage().getContent().get(0).get("text"));
}
public static void main(String[] args) {
try {
simpleMultiModalConversationCall();
} catch (ApiException | NoApiKeyException | UploadFileException e) {
System.out.println(e.getMessage());
}
System.exit(0);
}
}
curl
# ======= Important =======
# The API keys for the Singapore and Beijing regions are different. For more information, see https://www.alibabacloud.com/help/model-studio/get-api-key.
# The following is the URL for the Singapore region. If you use a model in the Beijing region, replace the URL with: https://dashscope.aliyuncs.com/api/v1/services/aigc/multimodal-generation/generation
# === Delete this comment before running ===
curl --location 'https://dashscope-intl.aliyuncs.com/api/v1/services/aigc/multimodal-generation/generation' \
--header "Authorization: Bearer $DASHSCOPE_API_KEY" \
--header 'Content-Type: application/json' \
--data '
{
"model": "qwen-vl-ocr",
"input": {
"messages": [
{
"role": "user",
"content": [
{
"image": "http://duguang-llm.oss-cn-hangzhou.aliyuncs.com/llm_data_keeper/data/formula_handwriting/test/inline_5_4.jpg",
"min_pixels": 3072,
"max_pixels": 8388608,
"enable_rotate": false
}
]
}
]
},
"parameters": {
"ocr_options": {
"task": "formula_recognition"
}
}
}
'
General text recognition
The following code provides an example of how to call the built-in general text recognition task. For more information, see Call a built-in task.
Python
import os
import dashscope
# The following is the URL for the Singapore region. If you use a model in the Beijing region, replace the URL with: https://dashscope.aliyuncs.com/api/v1
dashscope.base_http_api_url = 'https://dashscope-intl.aliyuncs.com/api/v1'
messages = [{
"role": "user",
"content": [{
"image": "https://help-static-aliyun-doc.aliyuncs.com/file-manage-files/zh-CN/20241108/ctdzex/biaozhun.jpg",
# The minimum pixel threshold for the input image. If the image is smaller than this value, it is scaled up until the total pixels exceed min_pixels.
"min_pixels": 32 * 32 * 3,
# The maximum pixel threshold for the input image. If the image is larger than this value, it is scaled down until the total pixels are below max_pixels.
"max_pixels": 32 * 32 * 8192,
# Specifies whether to automatically rotate the image (disabled here).
"enable_rotate": False}]
}]
response = dashscope.MultiModalConversation.call(
# The API keys for the Singapore and Beijing regions are different. For more information, see https://www.alibabacloud.com/help/model-studio/get-api-key.
# If you have not configured the environment variable, replace the following line with your Model Studio API key: api_key="sk-xxx",
api_key=os.getenv('DASHSCOPE_API_KEY'),
model='qwen-vl-ocr-2025-11-20',
messages=messages,
# Set the built-in task to general text recognition.
ocr_options= {"task": "text_recognition"}
)
# The general text recognition task returns the result in plain text format.
print(response["output"]["choices"][0]["message"].content[0]["text"])
Java
import java.util.Arrays;
import java.util.Collections;
import java.util.Map;
import java.util.HashMap;
import com.alibaba.dashscope.aigc.multimodalconversation.MultiModalConversation;
import com.alibaba.dashscope.aigc.multimodalconversation.MultiModalConversationParam;
import com.alibaba.dashscope.aigc.multimodalconversation.MultiModalConversationResult;
import com.alibaba.dashscope.aigc.multimodalconversation.OcrOptions;
import com.alibaba.dashscope.common.MultiModalMessage;
import com.alibaba.dashscope.common.Role;
import com.alibaba.dashscope.exception.ApiException;
import com.alibaba.dashscope.exception.NoApiKeyException;
import com.alibaba.dashscope.exception.UploadFileException;
import com.alibaba.dashscope.utils.Constants;
public class Main {
static {
// The following is the URL for the Singapore region. If you use a model in the Beijing region, replace the URL with: https://dashscope.aliyuncs.com/api/v1
Constants.baseHttpApiUrl="https://dashscope-intl.aliyuncs.com/api/v1";
}
public static void simpleMultiModalConversationCall()
throws ApiException, NoApiKeyException, UploadFileException {
MultiModalConversation conv = new MultiModalConversation();
Map<String, Object> map = new HashMap<>();
map.put("image", "https://help-static-aliyun-doc.aliyuncs.com/file-manage-files/zh-CN/20241108/ctdzex/biaozhun.jpg");
// The maximum pixel threshold for the input image. If the image is larger than this value, it is scaled down until the total pixels are below max_pixels.
map.put("max_pixels", 8388608);
// The minimum pixel threshold for the input image. If the image is smaller than this value, it is scaled up until the total pixels exceed min_pixels.
map.put("min_pixels", 3072);
// Specifies whether to automatically rotate the image (disabled here).
map.put("enable_rotate", false);
// Configure the built-in task.
OcrOptions ocrOptions = OcrOptions.builder()
.task(OcrOptions.Task.TEXT_RECOGNITION)
.build();
MultiModalMessage userMessage = MultiModalMessage.builder().role(Role.USER.getValue())
.content(Arrays.asList(
map
)).build();
MultiModalConversationParam param = MultiModalConversationParam.builder()
// The API keys for the Singapore and Beijing regions are different. For more information, see https://www.alibabacloud.com/help/model-studio/get-api-key.
// If you have not configured the environment variable, replace the following line with your Model Studio API key: .apiKey("sk-xxx")
.apiKey(System.getenv("DASHSCOPE_API_KEY"))
.model("qwen-vl-ocr-2025-11-20")
.message(userMessage)
.ocrOptions(ocrOptions)
.build();
MultiModalConversationResult result = conv.call(param);
System.out.println(result.getOutput().getChoices().get(0).getMessage().getContent().get(0).get("text"));
}
public static void main(String[] args) {
try {
simpleMultiModalConversationCall();
} catch (ApiException | NoApiKeyException | UploadFileException e) {
System.out.println(e.getMessage());
}
System.exit(0);
}
}
curl
# ======= Important =======
# The API keys for the Singapore and Beijing regions are different. For more information, see https://www.alibabacloud.com/help/model-studio/get-api-key.
# The following is the URL for the Singapore region. If you use a model in the Beijing region, replace the URL with: https://dashscope.aliyuncs.com/api/v1/services/aigc/multimodal-generation/generation
# === Delete this comment before running ===
curl --location 'https://dashscope-intl.aliyuncs.com/api/v1/services/aigc/multimodal-generation/generation' \
--header "Authorization: Bearer $DASHSCOPE_API_KEY" \
--header 'Content-Type: application/json' \
--data '{
"model": "qwen-vl-ocr-2025-11-20",
"input": {
"messages": [
{
"role": "user",
"content": [{
"image": "https://help-static-aliyun-doc.aliyuncs.com/file-manage-files/zh-CN/20241108/ctdzex/biaozhun.jpg",
"min_pixels": 3072,
"max_pixels": 8388608,
"enable_rotate": false
}
]
}
]
},
"parameters": {
"ocr_options": {
"task": "text_recognition"
}
}
}'
Multilingual recognition
The following code provides an example of how to call the built-in general multilingual recognition task. For more information, see Call a built-in task.
Python
import os
import dashscope
# The following is the URL for the Singapore region. If you use a model in the Beijing region, replace the URL with: https://dashscope.aliyuncs.com/api/v1
dashscope.base_http_api_url = 'https://dashscope-intl.aliyuncs.com/api/v1'
messages = [{
"role": "user",
"content": [{
"image": "https://img.alicdn.com/imgextra/i2/O1CN01VvUMNP1yq8YvkSDFY_!!6000000006629-2-tps-6000-3000.png",
# The minimum pixel threshold for the input image. If an image is smaller than this threshold, it is scaled up until its total number of pixels exceeds `min_pixels`.
"min_pixels": 32 * 32 * 3,
# The maximum pixel threshold for the input image. If an image is larger than this threshold, it is scaled down until its total number of pixels is below `max_pixels`.
"max_pixels": 32 * 32 * 8192,
# Specifies whether to automatically rotate the image (disabled here).
"enable_rotate": False}]
}]
response = dashscope.MultiModalConversation.call(
# API keys for the Singapore and Beijing regions are different. To get an API key, see https://www.alibabacloud.com/help/en/model-studio/get-api-key
# If you have not configured the environment variable, replace the following line with your Model Studio API key: api_key="sk-xxx",
api_key=os.getenv('DASHSCOPE_API_KEY'),
model='qwen-vl-ocr-2025-11-20',
messages=messages,
# Set the built-in task to multilingual recognition.
ocr_options={"task": "multi_lan"}
)
# The multilingual recognition task returns results in plain text.
print(response["output"]["choices"][0]["message"].content[0]["text"])
Java
import java.util.Arrays;
import java.util.Collections;
import java.util.Map;
import java.util.HashMap;
import com.alibaba.dashscope.aigc.multimodalconversation.MultiModalConversation;
import com.alibaba.dashscope.aigc.multimodalconversation.MultiModalConversationParam;
import com.alibaba.dashscope.aigc.multimodalconversation.MultiModalConversationResult;
import com.alibaba.dashscope.aigc.multimodalconversation.OcrOptions;
import com.alibaba.dashscope.common.MultiModalMessage;
import com.alibaba.dashscope.common.Role;
import com.alibaba.dashscope.exception.ApiException;
import com.alibaba.dashscope.exception.NoApiKeyException;
import com.alibaba.dashscope.exception.UploadFileException;
import com.alibaba.dashscope.utils.Constants;
public class Main {
static {
// The following is the URL for the Singapore region. If you use a model in the Beijing region, replace the URL with: https://dashscope.aliyuncs.com/api/v1
Constants.baseHttpApiUrl="https://dashscope-intl.aliyuncs.com/api/v1";
}
public static void simpleMultiModalConversationCall()
throws ApiException, NoApiKeyException, UploadFileException {
MultiModalConversation conv = new MultiModalConversation();
Map<String, Object> map = new HashMap<>();
map.put("image", "https://img.alicdn.com/imgextra/i2/O1CN01VvUMNP1yq8YvkSDFY_!!6000000006629-2-tps-6000-3000.png");
// The maximum pixel threshold for the input image. If the image is larger, it is scaled down until its total pixels are below max_pixels.
map.put("max_pixels", 8388608);
// The minimum pixel threshold for the input image. If the image is smaller, it is scaled up until its total pixels exceed min_pixels.
map.put("min_pixels", 3072);
// Enable the automatic image rotation feature.
map.put("enable_rotate", false);
// Configure the built-in OCR task.
OcrOptions ocrOptions = OcrOptions.builder()
.task(OcrOptions.Task.MULTI_LAN)
.build();
MultiModalMessage userMessage = MultiModalMessage.builder().role(Role.USER.getValue())
.content(Arrays.asList(
map
)).build();
MultiModalConversationParam param = MultiModalConversationParam.builder()
// API keys for the Singapore and Beijing regions are different. To get an API key, see https://www.alibabacloud.com/help/en/model-studio/get-api-key
// If you have not configured the environment variable, replace the following line with your Model Studio API key: .apiKey("sk-xxx")
.apiKey(System.getenv("DASHSCOPE_API_KEY"))
.model("qwen-vl-ocr-2025-11-20")
.message(userMessage)
.ocrOptions(ocrOptions)
.build();
MultiModalConversationResult result = conv.call(param);
System.out.println(result.getOutput().getChoices().get(0).getMessage().getContent().get(0).get("text"));
}
public static void main(String[] args) {
try {
simpleMultiModalConversationCall();
} catch (ApiException | NoApiKeyException | UploadFileException e) {
System.out.println(e.getMessage());
}
System.exit(0);
}
}
curl
# ======= Important =======
# API keys for the Singapore and Beijing regions are different. To obtain an API key, see https://www.alibabacloud.com/help/en/model-studio/get-api-key
# The following is the URL for the Singapore region. If you use a model in the Beijing region, replace the URL with: https://dashscope.aliyuncs.com/api/v1/services/aigc/multimodal-generation/generation
# === Delete this comment before execution ===
curl --location 'https://dashscope-intl.aliyuncs.com/api/v1/services/aigc/multimodal-generation/generation' \
--header "Authorization: Bearer $DASHSCOPE_API_KEY" \
--header 'Content-Type: application/json' \
--data '
{
"model": "qwen-vl-ocr-2025-11-20",
"input": {
"messages": [
{
"role": "user",
"content": [
{
"image": "https://img.alicdn.com/imgextra/i2/O1CN01VvUMNP1yq8YvkSDFY_!!6000000006629-2-tps-6000-3000.png",
"min_pixels": 3072,
"max_pixels": 8388608,
"enable_rotate": false
}
]
}
]
},
"parameters": {
"ocr_options": {
"task": "multi_lan"
}
}
}
'
Streaming output
Python
import os
import dashscope
PROMPT_TICKET_EXTRACTION = """
Please extract the invoice number, train number, departure station, arrival station, departure date and time, seat number, seat class, ticket price, ID card number, and passenger name from the train ticket image.
You must accurately extract the key information. Do not omit or fabricate information. Replace any single character that is blurry or obscured by strong light with an English question mark (?).
Return the data in JSON format as follows: {'invoice_number': 'xxx', 'train_number': 'xxx', 'departure_station': 'xxx', 'arrival_station': 'xxx', 'departure_date_and_time':'xxx', 'seat_number': 'xxx', 'seat_class': 'xxx','ticket_price':'xxx', 'id_card_number': 'xxx', 'passenger_name': 'xxx'}
"""
# The following is the URL for the Singapore region. If you use a model in the Beijing region, replace the URL with: https://dashscope.aliyuncs.com/api/v1
dashscope.base_http_api_url = 'https://dashscope-intl.aliyuncs.com/api/v1'
messages = [
{
"role": "user",
"content": [
{
"image": "https://img.alicdn.com/imgextra/i2/O1CN01ktT8451iQutqReELT_!!6000000004408-0-tps-689-487.jpg",
# The minimum pixel threshold for the input image. If the image has fewer pixels than this value, it is enlarged until its total pixel count exceeds min_pixels.
"min_pixels": 32 * 32 * 3,
# The maximum pixel threshold for the input image. If the image has more pixels than this value, it is reduced until its total pixel count is below max_pixels.
"max_pixels": 32 * 32 * 8192},
# When no built-in task is set, you can pass a prompt in the text field. If no prompt is passed, the default prompt is used: Please output only the text content from the image without any additional descriptions or formatting.
{
"type": "text",
"text": PROMPT_TICKET_EXTRACTION,
},
],
}
]
response = dashscope.MultiModalConversation.call(
# API keys for the Singapore and Beijing regions are different. To obtain an API key, see https://www.alibabacloud.com/help/en/model-studio/get-api-key.
# If the environment variable is not configured, replace the following line with your Model Studio API key: api_key="sk-xxx",
api_key=os.getenv("DASHSCOPE_API_KEY"),
model="qwen-vl-ocr-2025-11-20",
messages=messages,
stream=True,
incremental_output=True,
)
full_content = ""
print("Streaming output content:")
for response in response:
try:
print(response["output"]["choices"][0]["message"].content[0]["text"])
full_content += response["output"]["choices"][0]["message"].content[0]["text"]
except:
pass
print(f"Full content: {full_content}")
Java
import java.util.*;
import com.alibaba.dashscope.aigc.multimodalconversation.MultiModalConversation;
import com.alibaba.dashscope.aigc.multimodalconversation.MultiModalConversationParam;
import com.alibaba.dashscope.aigc.multimodalconversation.MultiModalConversationResult;
import com.alibaba.dashscope.common.MultiModalMessage;
import com.alibaba.dashscope.common.Role;
import com.alibaba.dashscope.exception.ApiException;
import com.alibaba.dashscope.exception.NoApiKeyException;
import com.alibaba.dashscope.exception.UploadFileException;
import io.reactivex.Flowable;
import com.alibaba.dashscope.utils.Constants;
public class Main {
static {
// The following is the URL for the Singapore region. If you use a model in the Beijing region, replace the URL with: https://dashscope.aliyuncs.com/api/v1
Constants.baseHttpApiUrl="https://dashscope-intl.aliyuncs.com/api/v1";
}
public static void simpleMultiModalConversationCall()
throws ApiException, NoApiKeyException, UploadFileException {
MultiModalConversation conv = new MultiModalConversation();
Map<String, Object> map = new HashMap<>();
map.put("image", "https://img.alicdn.com/imgextra/i2/O1CN01ktT8451iQutqReELT_!!6000000004408-0-tps-689-487.jpg");
// The maximum pixel threshold for the input image. If the image has more pixels than this value, it is reduced until its total pixel count is below max_pixels.
map.put("max_pixels", 8388608);
// The minimum pixel threshold for the input image. If the image has fewer pixels than this value, it is enlarged until its total pixel count exceeds min_pixels.
map.put("min_pixels", 3072);
MultiModalMessage userMessage = MultiModalMessage.builder().role(Role.USER.getValue())
.content(Arrays.asList(
map,
// When no built-in task is set, you can pass a prompt in the text field. If no prompt is passed, the default prompt is used: Please output only the text content from the image without any additional descriptions or formatting.
Collections.singletonMap("text", "Please extract the invoice number, train number, departure station, arrival station, departure date and time, seat number, seat class, ticket price, ID card number, and passenger name from the train ticket image. You must accurately extract the key information. Do not omit or fabricate information. Replace any single character that is blurry or obscured by strong light with an English question mark (?). Return the data in JSON format as follows: {\'invoice_number\': \'xxx\', \'train_number\': \'xxx\', \'departure_station\': \'xxx\', \'arrival_station\': \'xxx\', \'departure_date_and_time\':\'xxx\', \'seat_number\': \'xxx\', \'seat_class\': \'xxx\',\'ticket_price\':\'xxx\', \'id_card_number\': \'xxx\', \'passenger_name\': \'xxx\'"))).build();
MultiModalConversationParam param = MultiModalConversationParam.builder()
// API keys for the Singapore and Beijing regions are different. To obtain an API key, see https://www.alibabacloud.com/help/en/model-studio/get-api-key.
// If the environment variable is not configured, replace the following line with your Model Studio API key: .apiKey("sk-xxx")
.apiKey(System.getenv("DASHSCOPE_API_KEY"))
.model("qwen-vl-ocr-2025-11-20")
.message(userMessage)
.incrementalOutput(true)
.build();
Flowable<MultiModalConversationResult> result = conv.streamCall(param);
result.blockingForEach(item -> {
try {
List<Map<String, Object>> contentList = item.getOutput().getChoices().get(0).getMessage().getContent();
if (!contentList.isEmpty()){
System.out.println(contentList.get(0).get("text"));
}
} catch (Exception e){
System.exit(0);
}
});
}
public static void main(String[] args) {
try {
simpleMultiModalConversationCall();
} catch (ApiException | NoApiKeyException | UploadFileException e) {
System.out.println(e.getMessage());
}
System.exit(0);
}
}
curl
# ======= Important =======
# API keys for the Singapore and Beijing regions are different. To obtain an API key, see https://www.alibabacloud.com/help/en/model-studio/get-api-key.
# The following is the URL for the Singapore region. If you use a model in the Beijing region, replace the URL with: https://dashscope.aliyuncs.com/api/v1/services/aigc/multimodal-generation/generation
# === Delete this comment before execution ===
curl --location 'https://dashscope-intl.aliyuncs.com/api/v1/services/aigc/multimodal-generation/generation' \
--header "Authorization: Bearer $DASHSCOPE_API_KEY" \
--header 'Content-Type: application/json' \
--header 'X-DashScope-SSE: enable' \
--data '{
"model": "qwen-vl-ocr-2025-11-20",
"input":{
"messages":[
{
"role": "user",
"content": [
{
"image": "https://img.alicdn.com/imgextra/i2/O1CN01ktT8451iQutqReELT_!!6000000004408-0-tps-689-487.jpg",
"min_pixels": 3072,
"max_pixels": 8388608
},
{"type": "text", "text": "Please extract the invoice number, train number, departure station, arrival station, departure date and time, seat number, seat class, ticket price, ID card number, and passenger name from the train ticket image. You must accurately extract the key information. Do not omit or fabricate information. Replace any single character that is blurry or obscured by strong light with an English question mark (?). Return the data in JSON format as follows: {\'invoice_number\': \'xxx\', \'train_number\': \'xxx\', \'departure_station\': \'xxx\', \'arrival_station\': \'xxx\', \'departure_date_and_time\':\'xxx\', \'seat_number\': \'xxx\', \'seat_class\': \'xxx\',\'ticket_price\':\'xxx\', \'id_card_number\': \'xxx\', \'passenger_name\': \'xxx\'}"}
]
}
]
},
"parameters": {
"incremental_output": true
}
}'
model string(Required)
Specifies the model name. For a list of supported models, see Qwen-OCR.
messages array(Required)
The context for the model, as a sequence of messages in conversational order.
When you call the API over HTTP, place the messages object inside the input object.
Message types
User Message object (Required)
A user message that passes questions, instructions, or context to the model.
Properties
content string or array (Required)
The message content. Use a string for text-only input. Use an array if the input includes image data.
Properties
text string(Optional)
The input text.
The default value is: Please output only the text content from the image without any additional descriptions or formatting. This means the model extracts all text from the image by default.
image string (Optional)
The URL, Base64 Data URL, or local path of the image. For more information about passing a local file, see Passing local files.
Example: {"image":"https://xxxx.jpeg"}
enable_rotate boolean(Optional) Defaults to: false
Specifies whether to correct skewed images.
Valid values:
true: The model automatically corrects the image orientation.
false: The model does not correct the image orientation.
max_pixels integer (Optional)
The maximum pixel threshold for the input image, in pixels.
If an image's pixel count is within the [min_pixels, max_pixels] range, the model processes it at its original size. If the pixel count exceeds max_pixels, the model scales down the image until its total pixel count is less than max_pixels.
Image tokens and pixels
The number of pixels per image token varies by model:
qwen-vl-ocr-latest, qwen-vl-ocr-2025-11-20: Each token corresponds to 32×32 pixels.
qwen-vl-ocr, qwen-vl-ocr-2025-08-28, and earlier models: Each token corresponds to 28×28 pixels.
Value range for max_pixels
qwen-vl-ocr-latest, qwen-vl-ocr-2025-11-20
Default value: 8388608 (that is, 8192×32×32)
Maximum value: 30720000 (that is, 30000×32×32)
qwen-vl-ocr, qwen-vl-ocr-2025-08-28, and earlier models
Default value: 6422528 (that is, 8192×28×28)
Maximum value: 23520000 (that is, 30000×28×28)
For a rough token estimate based on these figures, see the sketch after this property list.
role string (Required)
The role for a user message. The value must be user.
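The pixel-per-token figures above lend themselves to a quick back-of-the-envelope token estimate. The following Python sketch is illustrative only: the helper function is ours, and the simple clamp to [min_pixels, max_pixels] approximates the service's aspect-ratio-preserving resize, so actual billed counts may differ slightly.
def estimate_image_tokens(width, height, patch=32,
                          min_pixels=32 * 32 * 3,
                          max_pixels=32 * 32 * 8192):
    """Rough image-token estimate for qwen-vl-ocr-2025-11-20 (32x32 pixels per token).

    Pass patch=28 and 28x28-based thresholds for qwen-vl-ocr-2025-08-28
    and earlier models.
    """
    pixels = width * height
    # Mirror the documented rule: images outside [min_pixels, max_pixels]
    # are scaled until their total pixel count falls inside the range.
    pixels = max(min_pixels, min(pixels, max_pixels))
    # Each token corresponds to one patch x patch block of pixels.
    return pixels // (patch * patch)

print(estimate_image_tokens(689, 487))    # ticket sample image: about 327 tokens
print(estimate_image_tokens(6000, 3000))  # capped at max_pixels: 8192 tokens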
max_tokens integer(Optional)
The maximum number of tokens to generate in the output. If the generated content exceeds this value, the response is truncated.
For qwen-vl-ocr-latest, qwen-vl-ocr-2025-11-20, and qwen-vl-ocr-2024-10-28, the default and maximum values are the same as the model's maximum output length. For more information, see Models and pricing.
For qwen-vl-ocr, qwen-vl-ocr-2025-04-13, and qwen-vl-ocr-2025-08-28, the default and maximum value is 4096.
To increase this parameter's value to a number between 4097 and 8192, send an email to modelstudio@service.aliyun.com. Your email must include the following information: your Alibaba Cloud account ID, the image type (such as document, e-commerce, or contract), the model name, your estimated queries per second (QPS) and total daily requests, and the percentage of requests where the model output exceeds 4096 tokens.
In the Java SDK, the parameter is maxTokens. For HTTP calls, set max_tokens in the parameters object.
ocr_options object (Optional)
The parameters to configure when you call a built-in task with the Qwen-OCR model. When you call a built-in task, you do not need to pass a User Message. The model uses the default prompt for that task. For more information, see Call a built-in task.
Properties
task string (Required)
The name of the built-in task. Valid values are:
text_recognition: General OCR
key_information_extraction: Information extraction
document_parsing: Document parsing
table_parsing: Table parsing
formula_recognition: Formula recognition
multi_lan: Multilingual recognition
advanced_recognition: Advanced recognition
task_config object (Optional)
When task is set to key_information_extraction (Information extraction), this parameter specifies the fields to extract. If you do not specify task_config, the model extracts all fields from the image by default.
Property
result_schema object (Optional)
Specifies the fields for the model to extract. The value must be a JSON object. You can nest JSON objects up to three layers deep.
Specify the name of the field to extract in the JSON object's key. The corresponding value can be empty. For higher extraction accuracy, provide a field description or format requirement in the value.
Example:
"result_schema": {
"invoice_number": "The unique identification number of the invoice, usually a combination of numbers and letters.",
"issue_date": "The date the invoice was issued. Extract it in YYYY-MM-DD format, for example, 2023-10-26.",
"seller_name": "The full company name of the seller shown on the invoice.",
"total_amount": "The total amount on the invoice, including tax. Extract the numerical value and keep two decimal places, for example, 123.45."
}
In the Java SDK, this parameter is named OcrOptions. The minimum required version for the DashScope Python SDK is 1.22.2, and for the Java SDK is 2.18.4.
For HTTP, place ocr_options in the parameters object.
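Putting this together, a DashScope Python SDK call that runs the built-in information extraction task with a custom result_schema might look like the following sketch. The image URL and the schema fields are placeholders, and the result is printed raw because the extracted fields may be returned in the text field, in ocr_result.kv_result, or both.
import os
import dashscope
# The following is the URL for the Singapore region. If you use a model in the Beijing region, replace the URL with: https://dashscope.aliyuncs.com/api/v1
dashscope.base_http_api_url = 'https://dashscope-intl.aliyuncs.com/api/v1'
messages = [{
    "role": "user",
    # Placeholder image URL: replace with your own invoice image.
    "content": [{"image": "https://example.com/invoice.jpg"}]
}]
response = dashscope.MultiModalConversation.call(
    api_key=os.getenv("DASHSCOPE_API_KEY"),
    model="qwen-vl-ocr-2025-11-20",
    messages=messages,
    # Built-in information extraction task with a custom result_schema.
    ocr_options={
        "task": "key_information_extraction",
        "task_config": {
            "result_schema": {
                "invoice_number": "The unique identification number of the invoice.",
                "total_amount": "The total amount including tax, with two decimal places."
            }
        }
    }
)
# Print the raw content; see the Chat response object section below for its structure.
print(response["output"]["choices"][0]["message"].content)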
seed integer(Optional)
A random number seed. Using a seed ensures reproducible results for the same input and parameters. If you pass the same seed in a call and keep other parameters unchanged, the model returns a deterministic result.
Value range: [0, 2³¹-1].
We recommend that you use the default value.
When making an HTTP call, place seed in the parameters object.
temperature float(Optional) Defaults to: 0.01
The sampling temperature, which controls the diversity of the text generated by the model.
A higher temperature results in more diverse text, while a lower temperature results in more deterministic text.
Value range: [0, 2)
Because both `temperature` and `top_p` control the diversity of the generated text, you should set only one of them.
We recommend that you use the default value.
When making an HTTP call, place temperature in the parameters object.
top_p float(Optional) Defaults to: 0.001
The probability threshold for nucleus sampling, which controls the diversity of the text generated by the model.
A higher top_p results in more diverse text. A lower top_p results in more deterministic text.
Value range: (0, 1.0]
Because both `temperature` and `top_p` control the diversity of the generated text, you should set only one of them.
We recommend that you use the default value.
In the Java SDK, the parameter is topP. For HTTP calls, place top_p in the parameters object.
top_k integer(Optional) Defaults to: 1
The size of the candidate set for sampling during generation. For example, if the value is 50, only the 50 tokens with the highest scores in a single generation form the candidate set for random sampling. A larger value increases randomness, while a smaller value increases determinism. If the value is None or greater than 100, the top_k policy is not enabled. In this case, only the top_p policy takes effect.
The value must be greater than or equal to 0.
In the Java SDK, the parameter is topK. For HTTP calls, place top_k in the parameters object.
We recommend that you use the default value.
repetition_penalty float(Optional) Defaults to: 1.0
The repetition penalty for consecutive sequences during model generation. Increasing repetition_penalty can reduce repetition in the generated text. A value of 1.0 means no penalty. We recommend that you use the default value because this parameter significantly affects the model's performance.
In the Java SDK, the parameter is repetitionPenalty. When making an HTTP call, add repetition_penalty to the parameters object.
presence_penalty float (Optional) Defaults to: 0.0
Controls the repetition of content in the text generated by the model.
Value range: [-2.0, 2.0]. A positive value reduces repetition, while a negative value increases it.
Increase this value for scenarios that require diversity, creativity, or brainstorming, such as creative writing. Decrease this value for scenarios that emphasize consistency and terminological accuracy, such as technical documents or formal texts.
How it works
If the parameter value is positive, the model applies a penalty to tokens that already exist in the current text. The penalty is independent of the number of times the token appears. This reduces the likelihood of these tokens reappearing, thereby decreasing content repetition and increasing word diversity.
We recommend that you use the default value.
stream boolean(Optional) Defaults to: false
Specifies whether to stream the response. Valid values:
false: The model returns the result at once after all content is generated.
true: The model outputs content in chunks as it is generated.
This parameter is supported only by the Python SDK. To use streaming output with the Java SDK, call the streamCall interface. To use streaming output over HTTP, set X-DashScope-SSE to enable in the header.
incremental_output boolean(Optional) Defaults to: false
Specifies whether to enable incremental output in streaming output mode. The recommended setting is true.
Valid values:
false: Each output is the entire sequence generated so far. The final output is the complete result.
I
I like
I like apple
I like apple.
true (Recommended): Enables incremental output. Subsequent outputs contain only the newly generated content. You must concatenate these segments to obtain the complete result.
I
like
apple
.
In the Java SDK, the parameter is incrementalOutput. For HTTP calls, add incremental_output to the parameters object.
stop string or array(Optional)
Specifies stop words. When a string or token_id specified by stop appears in the generated text, generation stops immediately.
You can pass sensitive words to control the model's output.
If stop is an array, all elements must be of the same type, either all strings or all token IDs. You cannot mix them. For example, you cannot specify ["Hello",104307].
logprobs boolean (Optional) Defaults to: false
Specifies whether to return the log probabilities of output tokens. Valid values:
true
false
Supported models: qwen-vl-ocr-2025-04-13 and later models.
When calling over HTTP, place logprobs in the parameters object.
top_logprobs integer (Optional) Defaults to: 0
Specifies the number of most likely tokens to return at each generation step. This parameter applies only when logprobs is set to true.
The value must be an integer from 0 to 5.
In the Java SDK, the parameter is named topLogprobs. For HTTP calls, set the top_logprobs parameter in the parameters object.
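As a sketch of how the two parameters combine, the following assumes your DashScope Python SDK version forwards logprobs and top_logprobs into the parameters object as keyword arguments (an assumption; verify against your SDK release notes). The image URL is a placeholder.
import os
import dashscope
# The following is the URL for the Singapore region. If you use a model in the Beijing region, replace the URL with: https://dashscope.aliyuncs.com/api/v1
dashscope.base_http_api_url = 'https://dashscope-intl.aliyuncs.com/api/v1'
messages = [{"role": "user",
             # Placeholder image URL.
             "content": [{"image": "https://example.com/sample.jpg"}]}]
response = dashscope.MultiModalConversation.call(
    api_key=os.getenv("DASHSCOPE_API_KEY"),
    # logprobs requires qwen-vl-ocr-2025-04-13 or later.
    model="qwen-vl-ocr-2025-11-20",
    messages=messages,
    logprobs=True,     # assumed to be forwarded to the parameters object
    top_logprobs=2,    # return the 2 most likely tokens per position
)
choice = response["output"]["choices"][0]
print(choice["message"].content[0]["text"])
# Token-level log probability information, if the server returned it.
print(choice.get("logprobs"))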
Chat response object (same format for streaming and non-streaming outputs)
status_code integer
The status code of the request. A value of 200 indicates that the request is successful. Otherwise, the request failed.
The Java SDK does not return this parameter. If a call fails, an exception is thrown. The exception message contains the `status_code` and `message`.
request_id string
The unique identifier for this call.
The Java SDK returns this parameter as requestId.
code string
The error code. This parameter is empty if the call is successful.
Only the Python SDK returns this parameter.
output object
Information about the call result.
Properties
text string
This parameter is reserved and is currently always null.
finish_reason string
The reason that the model stopped generating. Possible values:
null: Generation is in progress.
stop: The model generated a complete response.
length: The output was truncated because it reached the maximum length.
choices array
The output information from the model.
Properties
finish_reason string
The reason that the model stopped generating. Possible values:
null: Generation is in progress.
stop: The model generated a complete response.
length: The output was truncated because it reached the maximum length.
message object
The message object output by the model.
Properties
role string
The role of the output message. This is fixed to assistant.
content array
The content of the output message.
Properties
ocr_result object
When you use a Qwen-OCR model to call a built-in information extraction or advanced recognition task, this parameter contains the task result information.
Properties
kv_result array
The output result of the information extraction task.
words_info array
The output result of the advanced recognition task.
The rotated rectangle representation of the text box:
center_x and center_y are the coordinates of the text box centroid.
width is the width of the text box, and height is the height.
angle is the rotation angle of the text box relative to the horizontal direction. The value range is [-90, 90].
location array
The coordinates of the four vertices of the text box, arranged in clockwise order starting from the top-left vertex: top-left → top-right → bottom-right → bottom-left.
Example: [x1, y1, x2, y2, x3, y3, x4, y4]
text string
The content of the text line.
text string
The text content of the output message.
logprobs object
The probability information for the current `choices` object.
Properties
content array
An array of tokens with log probability information.
Properties
token string
The current token.
bytes array
A list of the original UTF-8 bytes of the current token. This helps accurately reconstruct the output content, especially when handling emojis and Chinese characters.
logprob float
The log probability of the current token. A return value of null indicates an extremely low probability.
top_logprobs array
The most likely tokens at the current token's position and their log probabilities. The number of elements is the same as the value of the top_logprobs input parameter.
Properties
token string
The current token.
bytes array
A list of the original UTF-8 bytes of the current token. This helps accurately reconstruct the output content, especially when handling emojis and Chinese characters.
logprob float
The log probability of the current token. A return value of null indicates an extremely low probability.
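Reading these nested fields back out is ordinary dictionary and list access. The following sketch, which assumes response comes from one of the built-in task calls above, walks kv_result and words_info; the field paths follow the structure documented here and should be treated as illustrative.
content = response["output"]["choices"][0]["message"].content[0]
ocr_result = content.get("ocr_result") or {}
# Information extraction task: one entry per extracted key-value pair.
for kv in ocr_result.get("kv_result", []):
    print(kv)
# Advanced recognition task: one entry per recognized text line.
for word in ocr_result.get("words_info", []):
    # location lists the four vertices clockwise from the top-left:
    # [x1, y1, x2, y2, x3, y3, x4, y4]
    x1, y1, x2, y2, x3, y3, x4, y4 = word["location"]
    print(f"{word['text']}: top-left=({x1}, {y1}), bottom-right=({x3}, {y3})")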
usage object
Information about the tokens used in this request.
Properties
input_tokens integer
The number of input tokens.
output_tokens integer
The number of output tokens.
characters integer
This parameter is reserved and is currently always 0.
input_tokens_details object
A fine-grained classification of input tokens.
Properties
image_tokens integer
The number of tokens that correspond to the image input to the model.
text_tokens integer
The number of tokens that correspond to the text input to the model.
output_tokens_details object
A fine-grained classification of output tokens.
Properties
text_tokens integer
The number of tokens that correspond to the text output by the model.
total_tokens integer
The total number of tokens consumed. This is the sum of input_tokens and output_tokens.
image_tokens integer
This field is returned if the input includes an image. It represents the number of tokens that correspond to the image input.
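A short sketch of reading the usage accounting, again assuming response comes from one of the calls above:
usage = response["usage"]
print("input tokens: ", usage["input_tokens"])
print("output tokens:", usage["output_tokens"])
print("total tokens: ", usage["total_tokens"])
# Fine-grained input breakdown; image_tokens is present when an image was passed.
details = usage.get("input_tokens_details", {})
print("image tokens: ", details.get("image_tokens"))
print("text tokens:  ", details.get("text_tokens"))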
Error codes
If a model call fails and returns an error message, see Error messages to resolve the issue.