Python
from openai import OpenAI
import os
PROMPT_TICKET_EXTRACTION = """
Please extract the invoice number, train number, departure station, arrival station, departure date and time, seat number, seat class, ticket price, ID card number, and passenger name from the train ticket image.
You must accurately extract the key information. Do not omit or fabricate information. Replace any single character that is blurry or obscured by strong light with an English question mark (?).
Return the data in JSON format as follows: {'invoice_number': 'xxx', 'train_number': 'xxx', 'departure_station': 'xxx', 'arrival_station': 'xxx', 'departure_date_and_time': 'xxx', 'seat_number': 'xxx', 'seat_class': 'xxx', 'ticket_price': 'xxx', 'id_card_number': 'xxx', 'passenger_name': 'xxx'}
"""
try:
    client = OpenAI(
        # API keys for different regions are different. To obtain an API key, see https://www.alibabacloud.com/help/en/model-studio/get-api-key.
        # If the environment variable is not configured, replace the following line with your Model Studio API key: api_key="sk-xxx",
        api_key=os.getenv("DASHSCOPE_API_KEY"),
        # The following is the base URL for the Singapore region. If you use a model in the US (Virginia) region, change the base_url to https://dashscope-us.aliyuncs.com/compatible-mode/v1
        # If you use a model in the China (Beijing) region, replace the base_url with: https://dashscope.aliyuncs.com/compatible-mode/v1
        base_url="https://dashscope-intl.aliyuncs.com/compatible-mode/v1",
    )
    completion = client.chat.completions.create(
        model="qwen-vl-ocr-2025-11-20",
        messages=[
            {
                "role": "user",
                "content": [
                    {
                        "type": "image_url",
                        "image_url": {"url": "https://img.alicdn.com/imgextra/i2/O1CN01ktT8451iQutqReELT_!!6000000004408-0-tps-689-487.jpg"},
                        # The minimum pixel threshold for the input image. If the image has fewer pixels than this value, it is enlarged until its total pixel count exceeds min_pixels.
                        "min_pixels": 32 * 32 * 3,
                        # The maximum pixel threshold for the input image. If the image has more pixels than this value, it is reduced until its total pixel count is below max_pixels.
                        "max_pixels": 32 * 32 * 8192
                    },
                    # The model supports passing a prompt in the following text field. If no prompt is passed, the default prompt is used: Please output only the text content from the image without any additional descriptions or formatting.
                    {"type": "text", "text": PROMPT_TICKET_EXTRACTION}
                ]
            }
        ]
    )
    print(completion.choices[0].message.content)
except Exception as e:
    print(f"Error message: {e}")
Node.js
import OpenAI from 'openai';
// Define the prompt for extracting train ticket information.
const PROMPT_TICKET_EXTRACTION = `
Please extract the invoice number, train number, departure station, arrival station, departure date and time, seat number, seat class, ticket price, ID card number, and passenger name from the train ticket image.
You must accurately extract the key information. Do not omit or fabricate information. Replace any single character that is blurry or obscured by strong light with an English question mark (?).
Return the data in JSON format as follows: {'invoice_number': 'xxx', 'train_number': 'xxx', 'departure_station': 'xxx', 'arrival_station': 'xxx', 'departure_date_and_time': 'xxx', 'seat_number': 'xxx', 'seat_class': 'xxx', 'ticket_price': 'xxx', 'id_card_number': 'xxx', 'passenger_name': 'xxx'}
`;
const client = new OpenAI({
// API keys for different regions are different. To obtain an API key, see https://www.alibabacloud.com/help/en/model-studio/get-api-key.
// If the environment variable is not configured, replace the following line with your Model Studio API key: apiKey: "sk-xxx",
apiKey: process.env.DASHSCOPE_API_KEY,
// If you use a model in the China (Beijing) region, replace the base_url with: https://dashscope.aliyuncs.com/compatible-mode/v1
baseURL: 'https://dashscope-intl.aliyuncs.com/compatible-mode/v1',
});
async function main() {
const response = await client.chat.completions.create({
model: 'qwen-vl-ocr-2025-11-20',
messages: [
{
role: 'user',
content: [
// The model supports passing a prompt in the text field. If no prompt is passed, the default prompt is used: Please output only the text content from the image without any additional descriptions or formatting.
{ type: 'text', text: PROMPT_TICKET_EXTRACTION},
{
type: 'image_url',
image_url: {
url: 'https://img.alicdn.com/imgextra/i2/O1CN01ktT8451iQutqReELT_!!6000000004408-0-tps-689-487.jpg',
},
// The minimum pixel threshold for the input image. If the image has fewer pixels than this value, it is enlarged until its total pixel count exceeds min_pixels.
"min_pixels": 32 * 32 * 3,
// The maximum pixel threshold for the input image. If the image has more pixels than this value, it is reduced until its total pixel count is below max_pixels.
"max_pixels": 32 * 32 * 8192
}
]
}
],
});
console.log(response.choices[0].message.content)
}
main();
curl
# ======= Important =======
# API keys for different regions are different. To obtain an API key, see https://www.alibabacloud.com/help/en/model-studio/get-api-key.
# If you use a model in the China (Beijing) region, replace the base_url with: https://dashscope.aliyuncs.com/compatible-mode/v1/chat/completions
# === Delete this comment before execution ===
curl -X POST https://dashscope-intl.aliyuncs.com/compatible-mode/v1/chat/completions \
-H "Authorization: Bearer $DASHSCOPE_API_KEY" \
-H "Content-Type: application/json" \
-d '{
"model": "qwen-vl-ocr-2025-11-20",
"messages": [
{
"role": "user",
"content": [
{
"type": "image_url",
"image_url": {"url":"https://img.alicdn.com/imgextra/i2/O1CN01ktT8451iQutqReELT_!!6000000004408-0-tps-689-487.jpg"},
"min_pixels": 3072,
"max_pixels": 8388608
},
{"type": "text", "text": "Please extract the invoice number, train number, departure station, arrival station, departure date and time, seat number, seat class, ticket price, ID card number, and passenger name from the train ticket image. You must accurately extract the key information. Do not omit or fabricate information. Replace any single character that is blurry or obscured by strong light with an English question mark (?). Return the data in JSON format as follows: {\"invoice_number\": \"xxx\", \"train_number\": \"xxx\", \"departure_station\": \"xxx\", \"arrival_station\": \"xxx\", \"departure_date_and_time\": \"xxx\", \"seat_number\": \"xxx\", \"seat_class\": \"xxx\", \"ticket_price\": \"xxx\", \"id_card_number\": \"xxx\", \"passenger_name\": \"xxx\"}"}
]
}
]
}'
Streaming output
Python
import os
from openai import OpenAI
PROMPT_TICKET_EXTRACTION = """
Please extract the invoice number, train number, departure station, arrival station, departure date and time, seat number, seat class, ticket price, ID card number, and passenger name from the train ticket image.
You must accurately extract the key information. Do not omit or fabricate information. Replace any single character that is blurry or obscured by strong light with an English question mark (?).
Return the data in JSON format as follows: {'invoice_number': 'xxx', 'train_number': 'xxx', 'departure_station': 'xxx', 'arrival_station': 'xxx', 'departure_date_and_time': 'xxx', 'seat_number': 'xxx', 'seat_class': 'xxx', 'ticket_price': 'xxx', 'id_card_number': 'xxx', 'passenger_name': 'xxx'}
"""
client = OpenAI(
    # API keys for different regions are different. To obtain an API key, see https://www.alibabacloud.com/help/en/model-studio/get-api-key.
    # If the environment variable is not configured, replace the following line with your Model Studio API key: api_key="sk-xxx",
    api_key=os.getenv("DASHSCOPE_API_KEY"),
    # If you use a model in the China (Beijing) region, replace the base_url with: https://dashscope.aliyuncs.com/compatible-mode/v1
    base_url="https://dashscope-intl.aliyuncs.com/compatible-mode/v1",
)
completion = client.chat.completions.create(
    model="qwen-vl-ocr-2025-11-20",
    messages=[
        {
            "role": "user",
            "content": [
                {
                    "type": "image_url",
                    "image_url": {"url": "https://img.alicdn.com/imgextra/i2/O1CN01ktT8451iQutqReELT_!!6000000004408-0-tps-689-487.jpg"},
                    # The minimum pixel threshold for the input image. If the image has fewer pixels than this value, it is enlarged until its total pixel count exceeds min_pixels.
                    "min_pixels": 32 * 32 * 3,
                    # The maximum pixel threshold for the input image. If the image has more pixels than this value, it is reduced until its total pixel count is below max_pixels.
                    "max_pixels": 32 * 32 * 8192
                },
                # The model supports passing a prompt in the following text field. If no prompt is passed, the default prompt is used: Please output only the text content from the image without any additional descriptions or formatting.
                {"type": "text", "text": PROMPT_TICKET_EXTRACTION}
            ]
        }
    ],
    stream=True,
    stream_options={"include_usage": True}
)
for chunk in completion:
    print(chunk.model_dump_json())
Node.js
import OpenAI from 'openai';
// Define the prompt for extracting train ticket information.
const PROMPT_TICKET_EXTRACTION = `
Please extract the invoice number, train number, departure station, arrival station, departure date and time, seat number, seat class, ticket price, ID card number, and passenger name from the train ticket image.
You must accurately extract the key information. Do not omit or fabricate information. Replace any single character that is blurry or obscured by strong light with an English question mark (?).
Return the data in JSON format as follows: {'invoice_number': 'xxx', 'train_number': 'xxx', 'departure_station': 'xxx', 'arrival_station': 'xxx', 'departure_date_and_time': 'xxx', 'seat_number': 'xxx', 'seat_class': 'xxx', 'ticket_price': 'xxx', 'id_card_number': 'xxx', 'passenger_name': 'xxx'}
`;
const openai = new OpenAI({
// API keys for different regions are different. To obtain an API key, see https://www.alibabacloud.com/help/en/model-studio/get-api-key.
// If the environment variable is not configured, replace the following line with your Model Studio API key: apiKey: "sk-xxx",
apiKey: process.env.DASHSCOPE_API_KEY,
// If you use a model in the China (Beijing) region, replace the base_url with: https://dashscope.aliyuncs.com/compatible-mode/v1
baseURL: 'https://dashscope-intl.aliyuncs.com/compatible-mode/v1',
});
async function main() {
const response = await openai.chat.completions.create({
model: 'qwen-vl-ocr-2025-11-20',
messages: [
{
role: 'user',
content: [
// The model supports passing a prompt in the following text field. If no prompt is passed, the default prompt is used: Please output only the text content from the image without any additional descriptions or formatting.
{ type: 'text', text: PROMPT_TICKET_EXTRACTION},
{
type: 'image_url',
image_url: {
url: 'https://img.alicdn.com/imgextra/i2/O1CN01ktT8451iQutqReELT_!!6000000004408-0-tps-689-487.jpg',
},
// The minimum pixel threshold for the input image. If the image has fewer pixels than this value, it is enlarged until its total pixel count exceeds min_pixels.
"min_pixels": 32 * 32 * 3,
// The maximum pixel threshold for the input image. If the image has more pixels than this value, it is reduced until its total pixel count is below max_pixels.
"max_pixels": 32 * 32 * 8192
}
]
}
],
stream: true,
stream_options:{"include_usage": true}
});
let fullContent = ""
console.log("Streaming output content:")
for await (const chunk of response) {
if (chunk.choices[0] && chunk.choices[0].delta.content != null) {
fullContent += chunk.choices[0].delta.content;
console.log(chunk.choices[0].delta.content);
}
}
console.log(`Full output content: ${fullContent}`)
}
main();
curl
# ======= Important =======
# API keys for different regions are different. To obtain an API key, see https://www.alibabacloud.com/help/en/model-studio/get-api-key.
# If you use a model in the China (Beijing) region, replace the base_url with: https://dashscope.aliyuncs.com/compatible-mode/v1/chat/completions
# === Delete this comment before execution ===
curl -X POST https://dashscope-intl.aliyuncs.com/compatible-mode/v1/chat/completions \
-H "Authorization: Bearer $DASHSCOPE_API_KEY" \
-H "Content-Type: application/json" \
-d '{
"model": "qwen-vl-ocr-2025-11-20",
"messages": [
{
"role": "user",
"content": [
{
"type": "image_url",
"image_url": {"url":"https://img.alicdn.com/imgextra/i2/O1CN01ktT8451iQutqReELT_!!6000000004408-0-tps-689-487.jpg"},
"min_pixels": 3072,
"max_pixels": 8388608
},
{"type": "text", "text": "Please extract the invoice number, train number, departure station, arrival station, departure date and time, seat number, seat class, ticket price, ID card number, and passenger name from the train ticket image. You must accurately extract the key information. Do not omit or fabricate information. Replace any single character that is blurry or obscured by strong light with an English question mark (?). Return the data in JSON format as follows: {\"invoice_number\": \"xxx\", \"train_number\": \"xxx\", \"departure_station\": \"xxx\", \"arrival_station\": \"xxx\", \"departure_date_and_time\": \"xxx\", \"seat_number\": \"xxx\", \"seat_class\": \"xxx\", \"ticket_price\": \"xxx\", \"id_card_number\": \"xxx\", \"passenger_name\": \"xxx\"}"}
]
}
],
"stream": true,
"stream_options": {"include_usage": true}
}'
model string (Required)
The name of the model. See Qwen-OCR for a list of supported models.
messages array (Required)
A sequence of messages that provides context to the model in conversational order.
Message types
User Message object (Required)
A user message that provides instructions and an image for the model to process.
Properties
content array (Required)
The content of the message.
Properties
type string (Required)
Valid values:
text
Set the type to text for text input.
image_url
Use image_url to specify the input image.
text string (Optional)
The input text.
The default value is Please output only the text content from the image without any additional descriptions or formatting. This default behavior instructs the model to extract all text from the image.
image_url object
Information about the input image. This parameter is required if the type is set to image_url.
Properties
url string (Required)
The URL or Base64-encoded Data URL of the image. For more information about passing a local file, see Text extraction. A minimal Base64 sketch appears at the end of this parameter list.
min_pixels integer (Optional)
The minimum pixel threshold for the input image in pixels.
If an input image has a pixel count below min_pixels, it is enlarged until its total pixel count exceeds min_pixels.
Conversion between image tokens and pixels
The number of pixels per image token varies by model:
qwen-vl-ocr-latest, qwen-vl-ocr-2025-11-20: Each token corresponds to 32×32 pixels.
qwen-vl-ocr, qwen-vl-ocr-2025-08-28, and earlier models: Each token corresponds to 28×28 pixels.
Value range for min_pixels
qwen-vl-ocr-latest, qwen-vl-ocr-2025-11-20: The default and minimum value is 3072 (that is, 3×32×32).
qwen-vl-ocr, qwen-vl-ocr-2025-08-28, and earlier models: The default and minimum value is 3136 (that is, 4×28×28).
max_pixels integer (Optional)
The maximum pixel threshold for the input image in pixels.
If the pixel count of the input image is within the [min_pixels, max_pixels] range, the model processes the original image without resizing. If the pixel count of the input image exceeds max_pixels, the image is scaled down until its pixel count is less than max_pixels.
Conversion between image tokens and pixels
The number of pixels per image token varies by model:
qwen-vl-ocr-latest, qwen-vl-ocr-2025-11-20: Each token corresponds to 32×32 pixels.
qwen-vl-ocr, qwen-vl-ocr-2025-08-28, and earlier models: Each token corresponds to 28×28 pixels.
Value range for max_pixels
qwen-vl-ocr-latest, qwen-vl-ocr-2025-11-20
Default value: 8388608 (that is, 8192×32×32)
Maximum value: 30720000 (that is, 30000×32×32)
qwen-vl-ocr, qwen-vl-ocr-2025-08-28, and earlier models
Default value: 6422528 (that is, 8192×28×28)
Maximum value: 23520000 (that is, 30000×28×28)
role string (Required)
The role of the user message. The value must be user.
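The min_pixels and max_pixels values above are easiest to derive from image-token budgets. A minimal sketch of the conversion arithmetic, using only the token counts documented above (no API call is made):
# Each image token covers a square patch of pixels; the edge length depends on the model.
PATCH_EDGE = 32  # qwen-vl-ocr-latest and qwen-vl-ocr-2025-11-20 (use 28 for earlier models)

def tokens_to_pixels(tokens: int, edge: int = PATCH_EDGE) -> int:
    """Convert an image-token budget into a pixel threshold."""
    return tokens * edge * edge

print(tokens_to_pixels(3))      # 3072: default and minimum min_pixels
print(tokens_to_pixels(8192))   # 8388608: default max_pixels
print(tokens_to_pixels(30000))  # 30720000: maximum max_pixels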
stream boolean (Optional) Default: false
Specifies whether to return the response in streaming mode.
Valid values:
false: Returns the complete response at once after the model finishes generation.
true: Returns data blocks as the model generates them. The client must read the blocks sequentially to reconstruct the complete response.
stream_options object (Optional)
The configuration settings for streaming output. This parameter applies only when the stream parameter is set to true.
Properties
include_usage boolean (Optional) Default: false
Specifies whether to include token usage information in the last data block of the stream.
Valid values:
true
false
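When include_usage is true, the usage statistics arrive only in the final data block, whose choices list is empty. A minimal sketch of reading both content and usage with the OpenAI Python SDK, assuming completion is a stream created with stream=True as in the streaming examples above:
for chunk in completion:
    if chunk.choices:
        # Regular data blocks carry incremental content.
        print(chunk.choices[0].delta.content or "", end="")
    elif chunk.usage:
        # The final block carries token usage when include_usage is true.
        print("\nTotal tokens:", chunk.usage.total_tokens)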
max_tokens integer (Optional)
The maximum number of tokens to generate in the output. If the generated content exceeds this value, the response is truncated.
For qwen-vl-ocr-latest, qwen-vl-ocr-2025-11-20, and qwen-vl-ocr-2024-10-28, the default and maximum values are the same as the model's maximum output length. For more information, see Availability.
For qwen-vl-ocr, qwen-vl-ocr-2025-04-13, and qwen-vl-ocr-2025-08-28, the default and maximum values are 4096.
To increase this parameter's value to a number between 4097 and 8192, send an email to modelstudio@service.aliyun.com. Your email must include the following information: your Alibaba Cloud account ID, the image type (such as document, e-commerce, or contract), the model name, your estimated queries per second (QPS) and total daily requests, and the percentage of requests where the model output exceeds 4096 tokens.
logprobs boolean (Optional) Default: false
Specifies whether to return the log probabilities of the output tokens. Valid values:
true
false
top_logprobs integer (Optional) Default: 0
Specifies the number of most likely tokens to return at each generation step.
Value range: [0, 5]
This parameter takes effect only when logprobs is true.
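A minimal sketch of requesting log probabilities with the OpenAI Python SDK; messages is assumed to be built as in the examples above:
completion = client.chat.completions.create(
    model="qwen-vl-ocr-2025-11-20",
    messages=messages,
    logprobs=True,    # return log probabilities for the output tokens
    top_logprobs=2,   # also return the 2 most likely candidates at each step
)
print(completion.choices[0].logprobs)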
temperature float (Optional) Default: 0.01
The sampling temperature controls the diversity of the text generated by the model.
A higher temperature results in more diverse text, while a lower temperature results in more deterministic text.
Value range: [0, 2)
Because both temperature and top_p control the diversity of the generated text, you can set only one of them.
We recommend that you use the default value.
top_p float (Optional) Default: 0.001
This parameter is the probability threshold for nucleus sampling, which controls the diversity of the text that the model generates.
A higher value results in more diverse text. A lower value results in more deterministic text.
Value range: (0, 1.0]
Because both temperature and top_p control text diversity, you should set only one of them.
We recommend that you use the default value.
top_k integer (Optional) Default: 1
Specifies the size of the candidate set for sampling during generation. For example, if you set the value to 50, only the 50 tokens with the highest scores are used as the candidate set for random sampling. A larger value increases randomness, while a smaller value increases determinism. If the value is None or greater than 100, the top_k policy is not enabled. In this case, only the top_p policy takes effect.
The value must be greater than or equal to 0.
This parameter is not a standard OpenAI parameter. When using the Python SDK, place this parameter in the extra_body object. For example: extra_body={"top_k": xxx}. When using the Node.js SDK or HTTP, pass this parameter at the top level.
We recommend that you use the default value.
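For example, a minimal sketch of passing top_k through extra_body with the OpenAI Python SDK; messages is assumed to be built as in the examples above:
completion = client.chat.completions.create(
    model="qwen-vl-ocr-2025-11-20",
    messages=messages,
    # top_k is not a standard OpenAI parameter, so the Python SDK forwards it in extra_body.
    extra_body={"top_k": 1},
)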
repetition_penalty float (Optional) Default: 1.0
The penalty for repeated sequences during model generation. A higher value can reduce repetition in the generated text. A value of 1.0 means no penalty is applied.
We recommend using the default value.
presence_penalty float (Optional) Default: 0.0
Controls the repetition of content in the text generated by the model.
The value must be within the range of -2.0 to 2.0. A positive value reduces repetition, and a negative value increases it.
Increase this value for scenarios that require diversity, creativity, or brainstorming, such as creative writing. Decrease this value for scenarios that emphasize consistency and terminological accuracy, such as technical documents or formal texts.
How it works
If the value of this parameter is positive, the model applies a penalty to tokens that already exist in the text. The penalty is applied regardless of the number of times a token appears. This reduces the likelihood of these tokens reappearing, which decreases content repetition and increases word diversity.
We recommend that you use the default value.
seed integer (Optional)
A random number seed. Using a seed ensures reproducible results. If you pass the same seed value in a call and keep other parameters unchanged, the model returns a deterministic result.
Value range: [0, 2³¹ − 1].
We recommend that you use the default value.
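A minimal sketch of a reproducible call; messages is assumed to be identical between runs:
completion = client.chat.completions.create(
    model="qwen-vl-ocr-2025-11-20",
    messages=messages,  # identical inputs between runs
    seed=42,            # same seed, inputs, and parameters return a deterministic result
)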
stop string or array (Optional)
Specifies the stop words. When a string or token_id specified in stop appears in the generated text, generation stops immediately.
You can use this parameter to specify sensitive words and control the model's output.
If stop is an array, you cannot mix token_ids and strings as elements. For example, you cannot specify ["Hello",104307].
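As noted for the url property above, a local image can be passed as a Base64-encoded Data URL instead of a public URL. A minimal sketch; the file name and MIME type here are illustrative:
import base64

# Illustrative local file; adjust the path and MIME type to match your image.
with open("ticket.jpg", "rb") as f:
    encoded = base64.b64encode(f.read()).decode("utf-8")

# Pass this string as the "url" value inside the image_url object.
data_url = f"data:image/jpeg;base64,{encoded}"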
High-precision recognition
The following code provides an example of how to call the built-in high-precision recognition task. For more information, see Call a built-in task.
Python
import os
import dashscope
# The following is the base URL for the Singapore region. If you use a model in the US (Virginia) region, change the base_url to https://dashscope-us.aliyuncs.com/api/v1.
# If you use a model in the China (Beijing) region, change the base_url to https://dashscope.aliyuncs.com/api/v1.
dashscope.base_http_api_url = 'https://dashscope-intl.aliyuncs.com/api/v1'
messages = [{
"role": "user",
"content": [{
"image": "https://help-static-aliyun-doc.aliyuncs.com/file-manage-files/zh-CN/20241108/ctdzex/biaozhun.jpg",
# The minimum pixel threshold for the input image. If the image is smaller than this value, it is scaled up until the total pixels are greater than min_pixels.
"min_pixels": 32 * 32 * 3,
# The maximum pixel threshold for the input image. If the image is larger than this value, it is scaled down until the total pixels are less than max_pixels.
"max_pixels": 32 * 32 * 8192,
# Specifies whether to enable automatic image rotation.
"enable_rotate": False}]
}]
response = dashscope.MultiModalConversation.call(
# If you have not configured an environment variable, replace the following line with your Model Studio API key: api_key="sk-xxx",
api_key=os.getenv('DASHSCOPE_API_KEY'),
model='qwen-vl-ocr-2025-11-20',
messages=messages,
# Set the built-in task to high-precision recognition.
ocr_options={"task": "advanced_recognition"}
)
# The high-precision recognition task returns the result as plain text.
print(response["output"]["choices"][0]["message"].content[0]["text"])
Java
// Requires DashScope SDK version 2.21.8 or later.
import java.util.Arrays;
import java.util.Collections;
import java.util.Map;
import java.util.HashMap;
import com.alibaba.dashscope.aigc.multimodalconversation.MultiModalConversation;
import com.alibaba.dashscope.aigc.multimodalconversation.MultiModalConversationParam;
import com.alibaba.dashscope.aigc.multimodalconversation.MultiModalConversationResult;
import com.alibaba.dashscope.aigc.multimodalconversation.OcrOptions;
import com.alibaba.dashscope.common.MultiModalMessage;
import com.alibaba.dashscope.common.Role;
import com.alibaba.dashscope.exception.ApiException;
import com.alibaba.dashscope.exception.NoApiKeyException;
import com.alibaba.dashscope.exception.UploadFileException;
import com.alibaba.dashscope.utils.Constants;
public class Main {
static {
// The following is the base URL for the Singapore region. If you use a model in the US (Virginia) region, change the base_url to https://dashscope-us.aliyuncs.com/api/v1.
// If you use a model in the China (Beijing) region, change the base_url to https://dashscope.aliyuncs.com/api/v1.
Constants.baseHttpApiUrl="https://dashscope-intl.aliyuncs.com/api/v1";
}
public static void simpleMultiModalConversationCall()
throws ApiException, NoApiKeyException, UploadFileException {
MultiModalConversation conv = new MultiModalConversation();
Map<String, Object> map = new HashMap<>();
map.put("image", "https://help-static-aliyun-doc.aliyuncs.com/file-manage-files/zh-CN/20241108/ctdzex/biaozhun.jpg");
// The maximum pixel threshold for the input image. If the image is larger than this value, it is scaled down until the total pixels are less than max_pixels.
map.put("max_pixels", 8388608);
// The minimum pixel threshold for the input image. If the image is smaller than this value, it is scaled up until the total pixels are greater than min_pixels.
map.put("min_pixels", 3072);
// Specifies whether to enable automatic image rotation.
map.put("enable_rotate", false);
// Configure the built-in OCR task.
OcrOptions ocrOptions = OcrOptions.builder()
.task(OcrOptions.Task.ADVANCED_RECOGNITION)
.build();
MultiModalMessage userMessage = MultiModalMessage.builder().role(Role.USER.getValue())
.content(Arrays.asList(
map
)).build();
MultiModalConversationParam param = MultiModalConversationParam.builder()
// If you have not configured an environment variable, replace the following line with your Model Studio API key: .apiKey("sk-xxx")
.apiKey(System.getenv("DASHSCOPE_API_KEY"))
.model("qwen-vl-ocr-2025-11-20")
.message(userMessage)
.ocrOptions(ocrOptions)
.build();
MultiModalConversationResult result = conv.call(param);
System.out.println(result.getOutput().getChoices().get(0).getMessage().getContent().get(0).get("text"));
}
public static void main(String[] args) {
try {
simpleMultiModalConversationCall();
} catch (ApiException | NoApiKeyException | UploadFileException e) {
System.out.println(e.getMessage());
}
System.exit(0);
}
}
curl
# ======= Important =======
# API keys vary by region. To get an API key, see https://www.alibabacloud.com/help/model-studio/get-api-key.
# The following is the base URL for the Singapore region. If you use a model in the US (Virginia) region, replace the base_url with https://dashscope-us.aliyuncs.com/api/v1/services/aigc/multimodal-generation/generation.
# If you use a model in the China (Beijing) region, replace the base_url with https://dashscope.aliyuncs.com/api/v1/services/aigc/multimodal-generation/generation.
# === Delete this comment before running ===
curl --location 'https://dashscope-intl.aliyuncs.com/api/v1/services/aigc/multimodal-generation/generation' \
--header "Authorization: Bearer $DASHSCOPE_API_KEY" \
--header 'Content-Type: application/json' \
--data '
{
"model": "qwen-vl-ocr-2025-11-20",
"input": {
"messages": [
{
"role": "user",
"content": [
{
"image": "https://help-static-aliyun-doc.aliyuncs.com/file-manage-files/zh-CN/20241108/ctdzex/biaozhun.jpg",
"min_pixels": 3072,
"max_pixels": 8388608,
"enable_rotate": false
}
]
}
]
},
"parameters": {
"ocr_options": {
"task": "advanced_recognition"
}
}
}
'
Information extraction
The following code provides an example of how to call the built-in information extraction task. For more information, see Call a built-in task.
Python
# Update the SDK before running: pip install -U dashscope
import os
import dashscope
# The following is the base URL for the Singapore region. If you use a model in the US (Virginia) region, change the base_url to https://dashscope-us.aliyuncs.com/api/v1.
# If you use a model in the China (Beijing) region, change the base_url to https://dashscope.aliyuncs.com/api/v1.
dashscope.base_http_api_url = 'https://dashscope-intl.aliyuncs.com/api/v1'
messages = [
{
"role":"user",
"content":[
{
"image":"http://duguang-labelling.oss-cn-shanghai.aliyuncs.com/demo_ocr/receipt_zh_demo.jpg",
"min_pixels": 3072,
"max_pixels": 8388608,
"enable_rotate": False
}
]
}
]
params = {
"ocr_options":{
"task": "key_information_extraction",
"task_config": {
"result_schema": {
"Ride Date": "Corresponds to the ride date and time in the image, in the format YYYY-MM-DD, for example, 2025-03-05",
"Invoice Code": "Extract the invoice code from the image, usually a combination of numbers or letters",
"Invoice Number": "Extract the number from the invoice, usually composed of only digits."
}
}
}
}
response = dashscope.MultiModalConversation.call(
# API keys vary by region. To get an API key, see https://www.alibabacloud.com/help/model-studio/get-api-key.
api_key=os.getenv('DASHSCOPE_API_KEY'),
model='qwen-vl-ocr-2025-11-20',
messages=messages,
**params)
print(response.output.choices[0].message.content[0]["ocr_result"])
Java
import java.util.Arrays;
import java.util.Collections;
import java.util.Map;
import java.util.HashMap;
import com.alibaba.dashscope.aigc.multimodalconversation.MultiModalConversation;
import com.alibaba.dashscope.aigc.multimodalconversation.MultiModalConversationParam;
import com.alibaba.dashscope.aigc.multimodalconversation.MultiModalConversationResult;
import com.alibaba.dashscope.aigc.multimodalconversation.OcrOptions;
import com.alibaba.dashscope.common.MultiModalMessage;
import com.alibaba.dashscope.common.Role;
import com.alibaba.dashscope.exception.ApiException;
import com.alibaba.dashscope.exception.NoApiKeyException;
import com.alibaba.dashscope.exception.UploadFileException;
import com.google.gson.JsonObject;
import com.alibaba.dashscope.utils.Constants;
public class Main {
static {
// The following is the base URL for the Singapore region. If you use a model in the US (Virginia) region, change the base_url to https://dashscope-us.aliyuncs.com/api/v1.
// If you use a model in the China (Beijing) region, change the base_url to https://dashscope.aliyuncs.com/api/v1.
Constants.baseHttpApiUrl="https://dashscope-intl.aliyuncs.com/api/v1";
}
public static void simpleMultiModalConversationCall()
throws ApiException, NoApiKeyException, UploadFileException {
MultiModalConversation conv = new MultiModalConversation();
Map<String, Object> map = new HashMap<>();
map.put("image", "http://duguang-labelling.oss-cn-shanghai.aliyuncs.com/demo_ocr/receipt_zh_demo.jpg");
// The maximum pixel threshold for the input image. If the image is larger than this value, it is scaled down until the total pixels are less than max_pixels.
map.put("max_pixels", 8388608);
// The minimum pixel threshold for the input image. If the image is smaller than this value, it is scaled up until the total pixels are greater than min_pixels.
map.put("min_pixels", 3072);
// Specifies whether to enable automatic image rotation.
map.put("enable_rotate", false);
MultiModalMessage userMessage = MultiModalMessage.builder().role(Role.USER.getValue())
.content(Arrays.asList(
map
)).build();
// Create the main JSON object.
JsonObject resultSchema = new JsonObject();
resultSchema.addProperty("Ride Date", "Corresponds to the ride date and time in the image, in the format YYYY-MM-DD, for example, 2025-03-05");
resultSchema.addProperty("Invoice Code", "Extract the invoice code from the image, usually a combination of numbers or letters");
resultSchema.addProperty("Invoice Number", "Extract the number from the invoice, usually composed of only digits.");
// Configure the built-in OCR task.
OcrOptions ocrOptions = OcrOptions.builder()
.task(OcrOptions.Task.KEY_INFORMATION_EXTRACTION)
.taskConfig(OcrOptions.TaskConfig.builder()
.resultSchema(resultSchema)
.build())
.build();
MultiModalConversationParam param = MultiModalConversationParam.builder()
// API keys vary by region. To get an API key, see https://www.alibabacloud.com/help/model-studio/get-api-key.
// If you have not configured an environment variable, replace the following line with your Model Studio API key: .apiKey("sk-xxx")
.apiKey(System.getenv("DASHSCOPE_API_KEY"))
.model("qwen-vl-ocr-2025-11-20")
.message(userMessage)
.ocrOptions(ocrOptions)
.build();
MultiModalConversationResult result = conv.call(param);
System.out.println(result.getOutput().getChoices().get(0).getMessage().getContent().get(0).get("ocr_result"));
}
public static void main(String[] args) {
try {
simpleMultiModalConversationCall();
} catch (ApiException | NoApiKeyException | UploadFileException e) {
System.out.println(e.getMessage());
}
System.exit(0);
}
}
curl
# ======= Important =======
# API keys vary by region. To get an API key, see https://www.alibabacloud.com/help/model-studio/get-api-key.
# The following is the base URL for the Singapore region. If you use a model in the US (Virginia) region, replace the base_url with https://dashscope-us.aliyuncs.com/api/v1/services/aigc/multimodal-generation/generation.
# If you use a model in the China (Beijing) region, replace the base_url with https://dashscope.aliyuncs.com/api/v1/services/aigc/multimodal-generation/generation.
# === Delete this comment before running ===
curl --location 'https://dashscope-intl.aliyuncs.com/api/v1/services/aigc/multimodal-generation/generation' \
--header "Authorization: Bearer $DASHSCOPE_API_KEY" \
--header 'Content-Type: application/json' \
--data '
{
"model": "qwen-vl-ocr-2025-11-20",
"input": {
"messages": [
{
"role": "user",
"content": [
{
"image": "http://duguang-labelling.oss-cn-shanghai.aliyuncs.com/demo_ocr/receipt_zh_demo.jpg",
"min_pixels": 3072,
"max_pixels": 8388608,
"enable_rotate": false
}
]
}
]
},
"parameters": {
"ocr_options": {
"task": "key_information_extraction",
"task_config": {
"result_schema": {
"Ride Date": "Corresponds to the ride date and time in the image, in the format YYYY-MM-DD, for example, 2025-03-05",
"Invoice Code": "Extract the invoice code from the image, usually a combination of numbers or letters",
"Invoice Number": "Extract the number from the invoice, usually composed of only digits."
}
}
}
}
}
'
Table parsing
The following code provides an example of how to call the built-in table parsing task. For more information, see Call a built-in task.
Python
import os
import dashscope
# The following is the base URL for the Singapore region. If you use a model in the US (Virginia) region, change the base_url to https://dashscope-us.aliyuncs.com/api/v1.
# If you use a model in the China (Beijing) region, change the base_url to https://dashscope.aliyuncs.com/api/v1.
dashscope.base_http_api_url = 'https://dashscope-intl.aliyuncs.com/api/v1'
messages = [{
"role": "user",
"content": [{
"image": "http://duguang-llm.oss-cn-hangzhou.aliyuncs.com/llm_data_keeper/data/doc_parsing/tables/photo/eng/17.jpg",
# The minimum pixel threshold for the input image. If the image is smaller than this value, it is scaled up until the total pixels are greater than min_pixels.
"min_pixels": 32 * 32 * 3,
# The maximum pixel threshold for the input image. If the image is larger than this value, it is scaled down until the total pixels are less than max_pixels.
"max_pixels": 32 * 32 * 8192,
# Specifies whether to enable automatic image rotation.
"enable_rotate": False}]
}]
response = dashscope.MultiModalConversation.call(
# API keys vary by region. To get an API key, see https://www.alibabacloud.com/help/model-studio/get-api-key.
# If you have not configured an environment variable, replace the following line with your Model Studio API key: api_key="sk-xxx",
api_key=os.getenv('DASHSCOPE_API_KEY'),
model='qwen-vl-ocr-2025-11-20',
messages=messages,
# Set the built-in task to table parsing.
ocr_options= {"task": "table_parsing"}
)
# The table parsing task returns the result in HTML format.
print(response["output"]["choices"][0]["message"].content[0]["text"])
Java
import java.util.Arrays;
import java.util.Collections;
import java.util.Map;
import java.util.HashMap;
import com.alibaba.dashscope.aigc.multimodalconversation.MultiModalConversation;
import com.alibaba.dashscope.aigc.multimodalconversation.MultiModalConversationParam;
import com.alibaba.dashscope.aigc.multimodalconversation.MultiModalConversationResult;
import com.alibaba.dashscope.aigc.multimodalconversation.OcrOptions;
import com.alibaba.dashscope.common.MultiModalMessage;
import com.alibaba.dashscope.common.Role;
import com.alibaba.dashscope.exception.ApiException;
import com.alibaba.dashscope.exception.NoApiKeyException;
import com.alibaba.dashscope.exception.UploadFileException;
import com.alibaba.dashscope.utils.Constants;
public class Main {
static {
// The following is the base URL for the Singapore region. If you use a model in the US (Virginia) region, change the base_url to https://dashscope-us.aliyuncs.com/api/v1.
// If you use a model in the China (Beijing) region, change the base_url to https://dashscope.aliyuncs.com/api/v1.
Constants.baseHttpApiUrl="https://dashscope-intl.aliyuncs.com/api/v1";
}
public static void simpleMultiModalConversationCall()
throws ApiException, NoApiKeyException, UploadFileException {
MultiModalConversation conv = new MultiModalConversation();
Map<String, Object> map = new HashMap<>();
map.put("image", "https://duguang-llm.oss-cn-hangzhou.aliyuncs.com/llm_data_keeper/data/doc_parsing/tables/photo/eng/17.jpg");
// The maximum pixel threshold for the input image. If the image is larger than this value, it is scaled down until the total pixels are less than max_pixels.
map.put("max_pixels", 8388608);
// The minimum pixel threshold for the input image. If the image is smaller than this value, it is scaled up until the total pixels are greater than min_pixels.
map.put("min_pixels",3072);
// Specifies whether to enable automatic image rotation.
map.put("enable_rotate", false);
// Configure the built-in OCR task.
OcrOptions ocrOptions = OcrOptions.builder()
.task(OcrOptions.Task.TABLE_PARSING)
.build();
MultiModalMessage userMessage = MultiModalMessage.builder().role(Role.USER.getValue())
.content(Arrays.asList(
map
)).build();
MultiModalConversationParam param = MultiModalConversationParam.builder()
// API keys vary by region. To get an API key, see https://www.alibabacloud.com/help/model-studio/get-api-key.
// If you have not configured an environment variable, replace the following line with your Model Studio API key: .apiKey("sk-xxx")
.apiKey(System.getenv("DASHSCOPE_API_KEY"))
.model("qwen-vl-ocr-2025-11-20")
.message(userMessage)
.ocrOptions(ocrOptions)
.build();
MultiModalConversationResult result = conv.call(param);
System.out.println(result.getOutput().getChoices().get(0).getMessage().getContent().get(0).get("text"));
}
public static void main(String[] args) {
try {
simpleMultiModalConversationCall();
} catch (ApiException | NoApiKeyException | UploadFileException e) {
System.out.println(e.getMessage());
}
System.exit(0);
}
}
curl
# ======= Important =======
# API keys vary by region. To get an API key, see https://www.alibabacloud.com/help/model-studio/get-api-key.
# The following is the base URL for the Singapore region. If you use a model in the US (Virginia) region, replace the base_url with https://dashscope-us.aliyuncs.com/api/v1/services/aigc/multimodal-generation/generation.
# If you use a model in the China (Beijing) region, replace the base_url with https://dashscope.aliyuncs.com/api/v1/services/aigc/multimodal-generation/generation.
# === Delete this comment before running ===
curl --location 'https://dashscope-intl.aliyuncs.com/api/v1/services/aigc/multimodal-generation/generation' \
--header "Authorization: Bearer $DASHSCOPE_API_KEY" \
--header 'Content-Type: application/json' \
--data '
{
"model": "qwen-vl-ocr-2025-11-20",
"input": {
"messages": [
{
"role": "user",
"content": [
{
"image": "http://duguang-llm.oss-cn-hangzhou.aliyuncs.com/llm_data_keeper/data/doc_parsing/tables/photo/eng/17.jpg",
"min_pixels": 3072,
"max_pixels": 8388608,
"enable_rotate": false
}
]
}
]
},
"parameters": {
"ocr_options": {
"task": "table_parsing"
}
}
}
'
Document parsing
The following code provides an example of how to call the built-in document parsing task. For more information, see Call a built-in task.
Python
import os
import dashscope
# The following is the base URL for the Singapore region. If you use a model in the US (Virginia) region, change the base_url to https://dashscope-us.aliyuncs.com/api/v1.
# If you use a model in the China (Beijing) region, change the base_url to https://dashscope.aliyuncs.com/api/v1.
dashscope.base_http_api_url = 'https://dashscope-intl.aliyuncs.com/api/v1'
messages = [{
"role": "user",
"content": [{
"image": "https://img.alicdn.com/imgextra/i1/O1CN01ukECva1cisjyK6ZDK_!!6000000003635-0-tps-1500-1734.jpg",
# The minimum pixel threshold for the input image. If the image is smaller than this value, it is scaled up until the total pixels are greater than min_pixels.
"min_pixels": 32 * 32 * 3,
# The maximum pixel threshold for the input image. If the image is larger than this value, it is scaled down until the total pixels are less than max_pixels.
"max_pixels": 32 * 32 * 8192,
# Specifies whether to enable automatic image rotation.
"enable_rotate": False}]
}]
response = dashscope.MultiModalConversation.call(
# API keys vary by region. To get an API key, see https://www.alibabacloud.com/help/model-studio/get-api-key.
# If you have not configured an environment variable, replace the following line with your Model Studio API key: api_key="sk-xxx",
api_key=os.getenv('DASHSCOPE_API_KEY'),
model='qwen-vl-ocr-2025-11-20',
messages=messages,
# Set the built-in task to document parsing.
ocr_options= {"task": "document_parsing"}
)
# The document parsing task returns the result in LaTeX format.
print(response["output"]["choices"][0]["message"].content[0]["text"])
Java
import java.util.Arrays;
import java.util.Collections;
import java.util.Map;
import java.util.HashMap;
import com.alibaba.dashscope.aigc.multimodalconversation.MultiModalConversation;
import com.alibaba.dashscope.aigc.multimodalconversation.MultiModalConversationParam;
import com.alibaba.dashscope.aigc.multimodalconversation.MultiModalConversationResult;
import com.alibaba.dashscope.aigc.multimodalconversation.OcrOptions;
import com.alibaba.dashscope.common.MultiModalMessage;
import com.alibaba.dashscope.common.Role;
import com.alibaba.dashscope.exception.ApiException;
import com.alibaba.dashscope.exception.NoApiKeyException;
import com.alibaba.dashscope.exception.UploadFileException;
import com.alibaba.dashscope.utils.Constants;
public class Main {
static {
// The following is the base URL for the Singapore region. If you use a model in the US (Virginia) region, change the base_url to https://dashscope-us.aliyuncs.com/api/v1.
// If you use a model in the China (Beijing) region, change the base_url to https://dashscope.aliyuncs.com/api/v1.
Constants.baseHttpApiUrl="https://dashscope-intl.aliyuncs.com/api/v1";
}
public static void simpleMultiModalConversationCall()
throws ApiException, NoApiKeyException, UploadFileException {
MultiModalConversation conv = new MultiModalConversation();
Map<String, Object> map = new HashMap<>();
map.put("image", "https://img.alicdn.com/imgextra/i1/O1CN01ukECva1cisjyK6ZDK_!!6000000003635-0-tps-1500-1734.jpg");
// The maximum pixel threshold for the input image. If the image is larger than this value, it is scaled down until the total pixels are less than max_pixels.
map.put("max_pixels", 8388608);
// The minimum pixel threshold for the input image. If the image is smaller than this value, it is scaled up until the total pixels are greater than min_pixels.
map.put("min_pixels", 3072);
// Specifies whether to enable automatic image rotation.
map.put("enable_rotate", false);
// Configure the built-in OCR task.
OcrOptions ocrOptions = OcrOptions.builder()
.task(OcrOptions.Task.DOCUMENT_PARSING)
.build();
MultiModalMessage userMessage = MultiModalMessage.builder().role(Role.USER.getValue())
.content(Arrays.asList(
map
)).build();
MultiModalConversationParam param = MultiModalConversationParam.builder()
// API keys vary by region. To get an API key, see https://www.alibabacloud.com/help/model-studio/get-api-key.
// If you have not configured an environment variable, replace the following line with your Model Studio API key: .apiKey("sk-xxx")
.apiKey(System.getenv("DASHSCOPE_API_KEY"))
.model("qwen-vl-ocr-2025-11-20")
.message(userMessage)
.ocrOptions(ocrOptions)
.build();
MultiModalConversationResult result = conv.call(param);
System.out.println(result.getOutput().getChoices().get(0).getMessage().getContent().get(0).get("text"));
}
public static void main(String[] args) {
try {
simpleMultiModalConversationCall();
} catch (ApiException | NoApiKeyException | UploadFileException e) {
System.out.println(e.getMessage());
}
System.exit(0);
}
}
curl
# ======= Important =======
# API keys vary by region. To get an API key, see https://www.alibabacloud.com/help/model-studio/get-api-key.
# The following is the base URL for the Singapore region. If you use a model in the US (Virginia) region, replace the base_url with https://dashscope-us.aliyuncs.com/api/v1/services/aigc/multimodal-generation/generation.
# If you use a model in the China (Beijing) region, replace the base_url with https://dashscope.aliyuncs.com/api/v1/services/aigc/multimodal-generation/generation.
# === Delete this comment before running ===
curl --location 'https://dashscope-intl.aliyuncs.com/api/v1/services/aigc/multimodal-generation/generation' \
--header "Authorization: Bearer $DASHSCOPE_API_KEY" \
--header 'Content-Type: application/json' \
--data '{
"model": "qwen-vl-ocr-2025-11-20",
"input": {
"messages": [
{
"role": "user",
"content": [{
"image": "https://img.alicdn.com/imgextra/i1/O1CN01ukECva1cisjyK6ZDK_!!6000000003635-0-tps-1500-1734.jpg",
"min_pixels": 3072,
"max_pixels": 8388608,
"enable_rotate": false
}
]
}
]
},
"parameters": {
"ocr_options": {
"task": "document_parsing"
}
}
}
'
Formula recognition
The following code provides an example of how to call the built-in formula recognition task. For more information, see Call a built-in task.
Python
import os
import dashscope
# The following is the base URL for the Singapore region. If you use a model in the US (Virginia) region, change the base_url to https://dashscope-us.aliyuncs.com/api/v1.
# If you use a model in the China (Beijing) region, change the base_url to https://dashscope.aliyuncs.com/api/v1.
dashscope.base_http_api_url = 'https://dashscope-intl.aliyuncs.com/api/v1'
messages = [{
"role": "user",
"content": [{
"image": "http://duguang-llm.oss-cn-hangzhou.aliyuncs.com/llm_data_keeper/data/formula_handwriting/test/inline_5_4.jpg",
# The minimum pixel threshold for the input image. If the image is smaller than this value, it is scaled up until the total pixels are greater than min_pixels.
"min_pixels": 32 * 32 * 3,
# The maximum pixel threshold for the input image. If the image is larger than this value, it is scaled down until the total pixels are less than max_pixels.
"max_pixels": 32 * 32 * 8192,
# Specifies whether to enable automatic image rotation.
"enable_rotate": False
}]
}]
response = dashscope.MultiModalConversation.call(
# API keys vary by region. To get an API key, see https://www.alibabacloud.com/help/model-studio/get-api-key.
# If you have not configured an environment variable, replace the following line with your Model Studio API key: api_key="sk-xxx",
api_key=os.getenv('DASHSCOPE_API_KEY'),
model='qwen-vl-ocr-2025-11-20',
messages=messages,
# Set the built-in task to formula recognition.
ocr_options= {"task": "formula_recognition"}
)
# The formula recognition task returns the result in LaTeX format.
print(response["output"]["choices"][0]["message"].content[0]["text"])
Java
import java.util.Arrays;
import java.util.Collections;
import java.util.Map;
import java.util.HashMap;
import com.alibaba.dashscope.aigc.multimodalconversation.MultiModalConversation;
import com.alibaba.dashscope.aigc.multimodalconversation.MultiModalConversationParam;
import com.alibaba.dashscope.aigc.multimodalconversation.MultiModalConversationResult;
import com.alibaba.dashscope.aigc.multimodalconversation.OcrOptions;
import com.alibaba.dashscope.common.MultiModalMessage;
import com.alibaba.dashscope.common.Role;
import com.alibaba.dashscope.exception.ApiException;
import com.alibaba.dashscope.exception.NoApiKeyException;
import com.alibaba.dashscope.exception.UploadFileException;
import com.alibaba.dashscope.utils.Constants;
public class Main {
static {
// The following is the base URL for the Singapore region. If you use a model in the US (Virginia) region, change the base_url to https://dashscope-us.aliyuncs.com/api/v1.
// If you use a model in the China (Beijing) region, change the base_url to https://dashscope.aliyuncs.com/api/v1.
Constants.baseHttpApiUrl="https://dashscope-intl.aliyuncs.com/api/v1";
}
public static void simpleMultiModalConversationCall()
throws ApiException, NoApiKeyException, UploadFileException {
MultiModalConversation conv = new MultiModalConversation();
Map<String, Object> map = new HashMap<>();
map.put("image", "http://duguang-llm.oss-cn-hangzhou.aliyuncs.com/llm_data_keeper/data/formula_handwriting/test/inline_5_4.jpg");
// The maximum pixel threshold for the input image. If the image is larger than this value, it is scaled down until the total pixels are less than max_pixels.
map.put("max_pixels", 8388608);
// The minimum pixel threshold for the input image. If the image is smaller than this value, it is scaled up until the total pixels are greater than min_pixels.
map.put("min_pixels", 3072);
// Specifies whether to enable automatic image rotation.
map.put("enable_rotate", false);
// Configure the built-in OCR task.
OcrOptions ocrOptions = OcrOptions.builder()
.task(OcrOptions.Task.FORMULA_RECOGNITION)
.build();
MultiModalMessage userMessage = MultiModalMessage.builder().role(Role.USER.getValue())
.content(Arrays.asList(
map
)).build();
MultiModalConversationParam param = MultiModalConversationParam.builder()
// API keys vary by region. To get an API key, see https://www.alibabacloud.com/help/model-studio/get-api-key.
// If you have not configured an environment variable, replace the following line with your Model Studio API key: .apiKey("sk-xxx")
.apiKey(System.getenv("DASHSCOPE_API_KEY"))
.model("qwen-vl-ocr-2025-11-20")
.message(userMessage)
.ocrOptions(ocrOptions)
.build();
MultiModalConversationResult result = conv.call(param);
System.out.println(result.getOutput().getChoices().get(0).getMessage().getContent().get(0).get("text"));
}
public static void main(String[] args) {
try {
simpleMultiModalConversationCall();
} catch (ApiException | NoApiKeyException | UploadFileException e) {
System.out.println(e.getMessage());
}
System.exit(0);
}
}
curl
# ======= Important =======
# API keys vary by region. To get an API key, see https://www.alibabacloud.com/help/model-studio/get-api-key.
# The following is the base URL for the Singapore region. If you use a model in the US (Virginia) region, replace the base_url with https://dashscope-us.aliyuncs.com/api/v1/services/aigc/multimodal-generation/generation.
# If you use a model in the China (Beijing) region, replace the base_url with https://dashscope.aliyuncs.com/api/v1/services/aigc/multimodal-generation/generation.
# === Delete this comment before running ===
curl --location 'https://dashscope-intl.aliyuncs.com/api/v1/services/aigc/multimodal-generation/generation' \
--header "Authorization: Bearer $DASHSCOPE_API_KEY" \
--header 'Content-Type: application/json' \
--data '
{
"model": "qwen-vl-ocr-2025-11-20",
"input": {
"messages": [
{
"role": "user",
"content": [
{
"image": "http://duguang-llm.oss-cn-hangzhou.aliyuncs.com/llm_data_keeper/data/formula_handwriting/test/inline_5_4.jpg",
"min_pixels": 3072,
"max_pixels": 8388608,
"enable_rotate": false
}
]
}
]
},
"parameters": {
"ocr_options": {
"task": "formula_recognition"
}
}
}
'
General text recognition
The following code provides an example of how to call the built-in general text recognition task. For more information, see Call a built-in task.
Python
import os
import dashscope
# The following is the base URL for the Singapore region. If you use a model in the US (Virginia) region, change the base_url to https://dashscope-us.aliyuncs.com/api/v1.
# If you use a model in the China (Beijing) region, change the base_url to https://dashscope.aliyuncs.com/api/v1.
dashscope.base_http_api_url = 'https://dashscope-intl.aliyuncs.com/api/v1'
messages = [{
"role": "user",
"content": [{
"image": "https://help-static-aliyun-doc.aliyuncs.com/file-manage-files/zh-CN/20241108/ctdzex/biaozhun.jpg",
# The minimum pixel threshold for the input image. If the image is smaller than this value, it is scaled up until the total pixels are greater than min_pixels.
"min_pixels": 32 * 32 * 3,
# The maximum pixel threshold for the input image. If the image is larger than this value, it is scaled down until the total pixels are less than max_pixels.
"max_pixels": 32 * 32 * 8192,
# Specifies whether to enable automatic image rotation.
"enable_rotate": False}]
}]
response = dashscope.MultiModalConversation.call(
# API keys vary by region. To get an API key, see https://www.alibabacloud.com/help/model-studio/get-api-key.
# If you have not configured an environment variable, replace the following line with your Model Studio API key: api_key="sk-xxx",
api_key=os.getenv('DASHSCOPE_API_KEY'),
model='qwen-vl-ocr-2025-11-20',
messages=messages,
# Set the built-in task to general text recognition.
ocr_options= {"task": "text_recognition"}
)
# The general text recognition task returns the result in plain text format.
print(response["output"]["choices"][0]["message"].content[0]["text"])
Java
import java.util.Arrays;
import java.util.Collections;
import java.util.Map;
import java.util.HashMap;
import com.alibaba.dashscope.aigc.multimodalconversation.MultiModalConversation;
import com.alibaba.dashscope.aigc.multimodalconversation.MultiModalConversationParam;
import com.alibaba.dashscope.aigc.multimodalconversation.MultiModalConversationResult;
import com.alibaba.dashscope.aigc.multimodalconversation.OcrOptions;
import com.alibaba.dashscope.common.MultiModalMessage;
import com.alibaba.dashscope.common.Role;
import com.alibaba.dashscope.exception.ApiException;
import com.alibaba.dashscope.exception.NoApiKeyException;
import com.alibaba.dashscope.exception.UploadFileException;
import com.alibaba.dashscope.utils.Constants;
public class Main {
static {
// The following is the base URL for the Singapore region. If you use a model in the US (Virginia) region, change the base_url to https://dashscope-us.aliyuncs.com/api/v1.
// If you use a model in the China (Beijing) region, change the base_url to https://dashscope.aliyuncs.com/api/v1.
Constants.baseHttpApiUrl="https://dashscope-intl.aliyuncs.com/api/v1";
}
public static void simpleMultiModalConversationCall()
throws ApiException, NoApiKeyException, UploadFileException {
MultiModalConversation conv = new MultiModalConversation();
Map<String, Object> map = new HashMap<>();
map.put("image", "https://help-static-aliyun-doc.aliyuncs.com/file-manage-files/zh-CN/20241108/ctdzex/biaozhun.jpg");
// The maximum pixel threshold for the input image. If the image is larger than this value, it is scaled down until the total pixels are less than max_pixels.
map.put("max_pixels", 8388608);
// The minimum pixel threshold for the input image. If the image is smaller than this value, it is scaled up until the total pixels are greater than min_pixels.
map.put("min_pixels", 3072);
// Specifies whether to enable automatic image rotation.
map.put("enable_rotate", false);
// Configure the built-in task.
OcrOptions ocrOptions = OcrOptions.builder()
.task(OcrOptions.Task.TEXT_RECOGNITION)
.build();
MultiModalMessage userMessage = MultiModalMessage.builder().role(Role.USER.getValue())
.content(Arrays.asList(
map
)).build();
MultiModalConversationParam param = MultiModalConversationParam.builder()
// API keys vary by region. To get an API key, see https://www.alibabacloud.com/help/model-studio/get-api-key.
// If you have not configured an environment variable, replace the following line with your Model Studio API key: .apiKey("sk-xxx")
.apiKey(System.getenv("DASHSCOPE_API_KEY"))
.model("qwen-vl-ocr-2025-11-20")
.message(userMessage)
.ocrOptions(ocrOptions)
.build();
MultiModalConversationResult result = conv.call(param);
System.out.println(result.getOutput().getChoices().get(0).getMessage().getContent().get(0).get("text"));
}
public static void main(String[] args) {
try {
simpleMultiModalConversationCall();
} catch (ApiException | NoApiKeyException | UploadFileException e) {
System.out.println(e.getMessage());
}
System.exit(0);
}
}
curl
# ======= Important =======
# API keys vary by region. To get an API key, see https://www.alibabacloud.com/help/model-studio/get-api-key.
# The following is the base URL for the Singapore region. If you use a model in the US (Virginia) region, replace the base_url with https://dashscope-us.aliyuncs.com/api/v1/services/aigc/multimodal-generation/generation.
# If you use a model in the China (Beijing) region, replace the base_url with https://dashscope.aliyuncs.com/api/v1/services/aigc/multimodal-generation/generation.
# === Delete this comment before running ===
curl --location 'https://dashscope-intl.aliyuncs.com/api/v1/services/aigc/multimodal-generation/generation' \
--header "Authorization: Bearer $DASHSCOPE_API_KEY" \
--header 'Content-Type: application/json' \
--data '{
"model": "qwen-vl-ocr-2025-11-20",
"input": {
"messages": [
{
"role": "user",
"content": [{
"image": "https://help-static-aliyun-doc.aliyuncs.com/file-manage-files/zh-CN/20241108/ctdzex/biaozhun.jpg",
"min_pixels": 3072,
"max_pixels": 8388608,
"enable_rotate": false
}
]
}
]
},
"parameters": {
"ocr_options": {
"task": "text_recognition"
}
}
}'
Multilingual recognition
The following code provides an example of how to call the built-in general multilingual recognition task. For more information, see Call a built-in task.
import os
import dashscope
# The following is the base URL for the Singapore region. If you use a model in the US (Virginia) region, change the base_url to https://dashscope-us.aliyuncs.com/api/v1.
# If you use a model in the China (Beijing) region, change the base_url to https://dashscope.aliyuncs.com/api/v1.
dashscope.base_http_api_url = 'https://dashscope-intl.aliyuncs.com/api/v1'
messages = [{
"role": "user",
"content": [{
"image": "https://img.alicdn.com/imgextra/i2/O1CN01VvUMNP1yq8YvkSDFY_!!6000000006629-2-tps-6000-3000.png",
# The minimum pixel threshold for the input image. If the image is smaller than this value, it is scaled up until the total pixels are greater than min_pixels.
"min_pixels": 32 * 32 * 3,
# The maximum pixel threshold for the input image. If the image is larger than this value, it is scaled down until the total pixels are less than max_pixels.
"max_pixels": 32 * 32 * 8192,
# Specifies whether to enable automatic image rotation.
"enable_rotate": False}]
}]
response = dashscope.MultiModalConversation.call(
# API keys vary by region. To get an API key, see https://www.alibabacloud.com/help/model-studio/get-api-key.
# If you have not configured an environment variable, replace the following line with your Model Studio API key: api_key="sk-xxx",
api_key=os.getenv('DASHSCOPE_API_KEY'),
model='qwen-vl-ocr-2025-11-20',
messages=messages,
# Set the built-in task to multilingual recognition.
ocr_options={"task": "multi_lan"}
)
# The multilingual recognition task returns the result as plain text.
print(response["output"]["choices"][0]["message"].content[0]["text"])
import java.util.Arrays;
import java.util.Collections;
import java.util.Map;
import java.util.HashMap;
import com.alibaba.dashscope.aigc.multimodalconversation.MultiModalConversation;
import com.alibaba.dashscope.aigc.multimodalconversation.MultiModalConversationParam;
import com.alibaba.dashscope.aigc.multimodalconversation.MultiModalConversationResult;
import com.alibaba.dashscope.aigc.multimodalconversation.OcrOptions;
import com.alibaba.dashscope.common.MultiModalMessage;
import com.alibaba.dashscope.common.Role;
import com.alibaba.dashscope.exception.ApiException;
import com.alibaba.dashscope.exception.NoApiKeyException;
import com.alibaba.dashscope.exception.UploadFileException;
import com.alibaba.dashscope.utils.Constants;
public class Main {
static {
// The following is the base URL for the Singapore region. If you use a model in the US (Virginia) region, change the base_url to https://dashscope-us.aliyuncs.com/api/v1.
// If you use a model in the China (Beijing) region, change the base_url to https://dashscope.aliyuncs.com/api/v1.
Constants.baseHttpApiUrl="https://dashscope-intl.aliyuncs.com/api/v1";
}
public static void simpleMultiModalConversationCall()
throws ApiException, NoApiKeyException, UploadFileException {
MultiModalConversation conv = new MultiModalConversation();
Map<String, Object> map = new HashMap<>();
map.put("image", "https://img.alicdn.com/imgextra/i2/O1CN01VvUMNP1yq8YvkSDFY_!!6000000006629-2-tps-6000-3000.png");
// The maximum pixel threshold for the input image. If the image is larger than this value, it is scaled down until the total pixels are less than max_pixels.
map.put("max_pixels", 8388608);
// The minimum pixel threshold for the input image. If the image is smaller than this value, it is scaled up until the total pixels are greater than min_pixels.
map.put("min_pixels", 3072);
// Specifies whether to enable automatic image rotation.
map.put("enable_rotate", false);
// Configure the built-in OCR task.
OcrOptions ocrOptions = OcrOptions.builder()
.task(OcrOptions.Task.MULTI_LAN)
.build();
MultiModalMessage userMessage = MultiModalMessage.builder().role(Role.USER.getValue())
.content(Arrays.asList(
map
)).build();
MultiModalConversationParam param = MultiModalConversationParam.builder()
// API keys vary by region. To get an API key, see https://www.alibabacloud.com/help/model-studio/get-api-key.
// If you have not configured an environment variable, replace the following line with your Model Studio API key: .apiKey("sk-xxx")
.apiKey(System.getenv("DASHSCOPE_API_KEY"))
.model("qwen-vl-ocr-2025-11-20")
.message(userMessage)
.ocrOptions(ocrOptions)
.build();
MultiModalConversationResult result = conv.call(param);
System.out.println(result.getOutput().getChoices().get(0).getMessage().getContent().get(0).get("text"));
}
public static void main(String[] args) {
try {
simpleMultiModalConversationCall();
} catch (ApiException | NoApiKeyException | UploadFileException e) {
System.out.println(e.getMessage());
}
System.exit(0);
}
}
# ======= Important =======
# API keys vary by region. To get an API key, see https://www.alibabacloud.com/help/model-studio/get-api-key.
# The following is the base URL for the Singapore region. If you use a model in the US (Virginia) region, replace the base_url with https://dashscope-us.aliyuncs.com/api/v1/services/aigc/multimodal-generation/generation.
# If you use a model in the China (Beijing) region, replace the base_url with https://dashscope.aliyuncs.com/api/v1/services/aigc/multimodal-generation/generation.
# === Delete this comment before running ===
curl --location 'https://dashscope-intl.aliyuncs.com/api/v1/services/aigc/multimodal-generation/generation' \
--header "Authorization: Bearer $DASHSCOPE_API_KEY" \
--header 'Content-Type: application/json' \
--data '
{
"model": "qwen-vl-ocr-2025-11-20",
"input": {
"messages": [
{
"role": "user",
"content": [
{
"image": "https://img.alicdn.com/imgextra/i2/O1CN01VvUMNP1yq8YvkSDFY_!!6000000006629-2-tps-6000-3000.png",
"min_pixels": 3072,
"max_pixels": 8388608,
"enable_rotate": false
}
]
}
]
},
"parameters": {
"ocr_options": {
"task": "multi_lan"
}
}
}
'
Streaming output
Python
import os
import dashscope
PROMPT_TICKET_EXTRACTION = """
Please extract the invoice number, train number, departure station, arrival station, departure date and time, seat number, seat class, ticket price, ID card number, and passenger name from the train ticket image.
You must accurately extract the key information. Do not omit or fabricate information. Replace any single character that is blurry or obscured by strong light with an English question mark (?).
Return the data in JSON format as follows: {'invoice_number': 'xxx', 'departure_station': 'xxx', 'arrival_station': 'xxx', 'departure_date_and_time':'xxx', 'seat_number': 'xxx','ticket_price':'xxx', 'id_card_number': 'xxx', 'passenger_name': 'xxx'},
"""
# The following is the base URL for the Singapore region. If you use a model in the US (Virginia) region, change the base_url to https://dashscope-us.aliyuncs.com/api/v1
# If you use a model in the China (Beijing) region, replace the base_url with: https://dashscope.aliyuncs.com/api/v1
dashscope.base_http_api_url = 'https://dashscope-intl.aliyuncs.com/api/v1'
messages = [
{
"role": "user",
"content": [
{
"image": "https://img.alicdn.com/imgextra/i2/O1CN01ktT8451iQutqReELT_!!6000000004408-0-tps-689-487.jpg",
# The minimum pixel threshold for the input image. If the image has fewer pixels than this value, it is enlarged until its total pixel count exceeds min_pixels.
"min_pixels": 32 * 32 * 3,
# The maximum pixel threshold for the input image. If the image has more pixels than this value, it is reduced until its total pixel count is below max_pixels.
"max_pixels": 32 * 32 * 8192},
# When no built-in task is set, you can pass a prompt in the text field. If no prompt is passed, the default prompt is used: Please output only the text content from the image without any additional descriptions or formatting.
{
"type": "text",
"text": PROMPT_TICKET_EXTRACTION,
},
],
}
]
response = dashscope.MultiModalConversation.call(
# API keys for different regions are different. To obtain an API key, see https://www.alibabacloud.com/help/en/model-studio/get-api-key.
# If the environment variable is not configured, replace the following line with your Model Studio API key: api_key="sk-xxx",
api_key=os.getenv("DASHSCOPE_API_KEY"),
model="qwen-vl-ocr-2025-11-20",
messages=messages,
stream=True,
incremental_output=True,
)
full_content = ""
print("Streaming output content:")
for chunk in response:
    try:
        text = chunk["output"]["choices"][0]["message"].content[0]["text"]
        print(text)
        full_content += text
    except (IndexError, KeyError, TypeError):
        # Skip chunks that carry no text content, such as the final status chunk.
        pass
print(f"Full content: {full_content}")
Java
import java.util.*;
import com.alibaba.dashscope.aigc.multimodalconversation.MultiModalConversation;
import com.alibaba.dashscope.aigc.multimodalconversation.MultiModalConversationParam;
import com.alibaba.dashscope.aigc.multimodalconversation.MultiModalConversationResult;
import com.alibaba.dashscope.common.MultiModalMessage;
import com.alibaba.dashscope.common.Role;
import com.alibaba.dashscope.exception.ApiException;
import com.alibaba.dashscope.exception.NoApiKeyException;
import com.alibaba.dashscope.exception.UploadFileException;
import io.reactivex.Flowable;
import com.alibaba.dashscope.utils.Constants;
public class Main {
// The following is the base URL for the Singapore region. If you use a model in the US (Virginia) region, change the base_url to https://dashscope-us.aliyuncs.com/api/v1
// If you use a model in the China (Beijing) region, replace the base_url with: https://dashscope.aliyuncs.com/api/v1
static {
Constants.baseHttpApiUrl="https://dashscope-intl.aliyuncs.com/api/v1";
}
public static void simpleMultiModalConversationCall()
throws ApiException, NoApiKeyException, UploadFileException {
MultiModalConversation conv = new MultiModalConversation();
Map<String, Object> map = new HashMap<>();
map.put("image", "https://img.alicdn.com/imgextra/i2/O1CN01ktT8451iQutqReELT_!!6000000004408-0-tps-689-487.jpg");
// The maximum pixel threshold for the input image. If the image has more pixels than this value, it is reduced until its total pixel count is below max_pixels.
map.put("max_pixels", 8388608);
// The minimum pixel threshold for the input image. If the image has fewer pixels than this value, it is enlarged until its total pixel count exceeds min_pixels.
map.put("min_pixels", 3072);
MultiModalMessage userMessage = MultiModalMessage.builder().role(Role.USER.getValue())
.content(Arrays.asList(
map,
// When no built-in task is set, you can pass a prompt in the text field. If no prompt is passed, the default prompt is used: Please output only the text content from the image without any additional descriptions or formatting.
Collections.singletonMap("text", "Please extract the invoice number, train number, departure station, arrival station, departure date and time, seat number, seat class, ticket price, ID card number, and passenger name from the train ticket image. You must accurately extract the key information. Do not omit or fabricate information. Replace any single character that is blurry or obscured by strong light with an English question mark (?). Return the data in JSON format as follows: {\'invoice_number\': \'xxx\', \'departure_station\': \'xxx\', \'arrival_station\': \'xxx\', \'departure_date_and_time\':\'xxx\', \'seat_number\': \'xxx\',\'ticket_price\':\'xxx\', \'id_card_number\': \'xxx\', \'passenger_name\': \'xxx\'"))).build();
MultiModalConversationParam param = MultiModalConversationParam.builder()
// API keys for different regions are different. To obtain an API key, see https://www.alibabacloud.com/help/en/model-studio/get-api-key.
// If the environment variable is not configured, replace the following line with your Model Studio API key: .apiKey("sk-xxx")
.apiKey(System.getenv("DASHSCOPE_API_KEY"))
.model("qwen-vl-ocr-2025-11-20")
.message(userMessage)
.incrementalOutput(true)
.build();
Flowable<MultiModalConversationResult> result = conv.streamCall(param);
result.blockingForEach(item -> {
try {
List<Map<String, Object>> contentList = item.getOutput().getChoices().get(0).getMessage().getContent();
if (!contentList.isEmpty()){
System.out.println(contentList.get(0).get("text"));
}
} catch (Exception e){
System.exit(0);
}
});
}
public static void main(String[] args) {
try {
simpleMultiModalConversationCall();
} catch (ApiException | NoApiKeyException | UploadFileException e) {
System.out.println(e.getMessage());
}
System.exit(0);
}
}
curl
# ======= Important =======
# API keys for different regions are different. To obtain an API key, see https://www.alibabacloud.com/help/en/model-studio/get-api-key.
# The following is the base URL for the Singapore region. If you use a model in the US (Virginia) region, change the base_url to: https://dashscope-us.aliyuncs.com/api/v1/services/aigc/multimodal-generation/generation
# If you use a model in the China (Beijing) region, change the base_url to: https://dashscope.aliyuncs.com/api/v1/services/aigc/multimodal-generation/generation
# === Delete this comment before execution ===
curl --location 'https://dashscope-intl.aliyuncs.com/api/v1/services/aigc/multimodal-generation/generation' \
--header "Authorization: Bearer $DASHSCOPE_API_KEY" \
--header 'Content-Type: application/json' \
-H 'X-DashScope-SSE: enable' \
--data '{
"model": "qwen-vl-ocr-2025-11-20",
"input":{
"messages":[
{
"role": "user",
"content": [
{
"image": "https://img.alicdn.com/imgextra/i2/O1CN01ktT8451iQutqReELT_!!6000000004408-0-tps-689-487.jpg",
"min_pixels": 3072,
"max_pixels": 8388608
},
{"type": "text", "text": "Please extract the invoice number, train number, departure station, arrival station, departure date and time, seat number, seat class, ticket price, ID card number, and passenger name from the train ticket image. You must accurately extract the key information. Do not omit or fabricate information. Replace any single character that is blurry or obscured by strong light with an English question mark (?). Return the data in JSON format as follows: {\'invoice_number\': \'xxx\', \'departure_station\': \'xxx\', \'arrival_station\': \'xxx\', \'departure_date_and_time\':\'xxx\', \'seat_number\': \'xxx\',\'ticket_price\':\'xxx\', \'id_card_number\': \'xxx\', \'passenger_name\': \'xxx\'}"}
]
}
]
},
"parameters": {
"incremental_output": true
}
}'
model string (Required)
The name of the model. See Qwen-OCR for a list of supported models.
messages array (Required)
The context for the model, provided as a sequence of messages in conversational order.
When you call the API over HTTP, place the messages object inside the input object.
Message types
User Message object (Required)
A user message that passes questions, instructions, or context to the model.
Properties
content string or array (Required)
The message content. Use a string for text-only input. Use an array if the input includes image data.
Properties
text string (Optional)
The input text.
The default value is Please output only the text content from the image without any additional descriptions or formatting. This default behavior instructs the model to extract all text from the image.
image string (Optional)
The URL, Base64 Data URL, or local path of the image. For more information about passing a local file, see Passing local files.
If an image's pixel count is within the [min_pixels, max_pixels] range, the model processes it at its original size. If the pixel count exceeds max_pixels, the model scales down the image until its total pixel count is less than or equal to max_pixels. If the pixel count is below min_pixels, the model scales up the image until its total pixel count is greater than or equal to min_pixels.
Conversion between image tokens and pixels
The number of pixels per image token varies by model:
qwen-vl-ocr-latest, qwen-vl-ocr-2025-11-20: Each token corresponds to 32×32 pixels.
qwen-vl-ocr, qwen-vl-ocr-2025-08-28, and earlier models: Each token corresponds to 28×28 pixels.
Value range for max_pixels
qwen-vl-ocr-latest, qwen-vl-ocr-2025-11-20
Default value: 8388608 (that is, 8192×32×32)
Maximum value: 30720000 (that is, 30000×32×32)
qwen-vl-ocr, qwen-vl-ocr-2025-08-28, and earlier models
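To make the scaling rules and token math above concrete, the following is a minimal sketch in Python. It assumes simple proportional scaling and the 32×32 pixels-per-token rule for qwen-vl-ocr-2025-11-20; the service's actual resizing may round differently, and estimate_image_tokens is a hypothetical helper, not part of any SDK.
import math

# Hypothetical helper: rough estimate of the image tokens a width x height
# image yields for qwen-vl-ocr-2025-11-20 (32x32 pixels per token).
def estimate_image_tokens(width, height,
                          min_pixels=32 * 32 * 3,
                          max_pixels=32 * 32 * 8192):
    pixels = width * height
    if pixels > max_pixels:
        # Scaled down until the total pixel count is <= max_pixels.
        scale = math.sqrt(max_pixels / pixels)
        width, height = int(width * scale), int(height * scale)
    elif pixels < min_pixels:
        # Scaled up until the total pixel count is >= min_pixels.
        scale = math.sqrt(min_pixels / pixels)
        width, height = math.ceil(width * scale), math.ceil(height * scale)
    # Each 32x32 block of the (possibly resized) image is one token.
    return math.ceil(width / 32) * math.ceil(height / 32)

# A 6000x3000 image exceeds the default max_pixels (8388608), is scaled
# down to roughly 4096x2048, and therefore costs about 8192 tokens.
print(estimate_image_tokens(6000, 3000))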
role string (Required)
The role for a user message. The value must be user.
max_tokens integer (Optional)
The maximum number of tokens to generate in the output. If the generated content exceeds this value, the response is truncated.
For qwen-vl-ocr-latest, qwen-vl-ocr-2025-11-20, and qwen-vl-ocr-2024-10-28, the default and maximum values are the same as the model's maximum output length. For more information, see Availability.
For qwen-vl-ocr, qwen-vl-ocr-2025-04-13, and qwen-vl-ocr-2025-08-28, the default and maximum values are 4096.
To increase this parameter's value to a number between 4097 and 8192, send an email to modelstudio@service.aliyun.com. Your email must include the following information: your Alibaba Cloud account ID, the image type (such as document, e-commerce, or contract), the model name, your estimated queries per second (QPS) and total daily requests, and the percentage of requests where the model output exceeds 4096 tokens.
In the Java SDK, the parameter is maxTokens. For HTTP calls, set max_tokens in the parameters object.
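For illustration, the following sketch caps the output length in a DashScope Python SDK call. It assumes that extra keyword arguments such as max_tokens are forwarded into the request's parameters object as described above; the image URL is a placeholder.
import os
import dashscope

# A minimal sketch: cap the response at 2048 output tokens.
response = dashscope.MultiModalConversation.call(
    api_key=os.getenv("DASHSCOPE_API_KEY"),
    model="qwen-vl-ocr-2025-11-20",
    # Placeholder image URL for illustration only.
    messages=[{"role": "user", "content": [{"image": "https://example.com/sample.jpg"}]}],
    max_tokens=2048,
)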
ocr_options object (Optional)
The parameters to configure when you call a built-in task with the Qwen-OCR model. When you call a built-in task, you do not need to pass a text prompt in the User Message because the model uses the default prompt for that task. For more information, see Call a built-in task.
Properties
task string (Required)
The name of the built-in task. Valid values are:
text_recognition: General text recognition
key_information_extraction: Information extraction
document_parsing: Document parsing
table_parsing: Table parsing
formula_recognition: Formula recognition
multi_lan: Multilingual recognition
advanced_recognition: High-precision recognition
task_config object (Optional)
When task is set to key_information_extraction (Information extraction), this parameter specifies the fields to extract. If you do not specify task_config, the model extracts all fields from the image by default.
Properties
result_schema object (Optional)
Specifies the fields for the model to extract. The value must be a JSON object. You can nest JSON objects up to three layers deep.
Specify the name of the field to extract in the JSON object's key. The corresponding value can be empty. To achieve higher extraction accuracy, you can provide a field description or format requirement in the value.
Example:
"result_schema": {
"invoice_number": "The unique identification number of the invoice, usually a combination of numbers and letters.",
"issue_date": "The date the invoice was issued. Extract it in YYYY-MM-DD format, for example, 2023-10-26.",
"seller_name": "The full company name of the seller shown on the invoice.",
"total_amount": "The total amount on the invoice, including tax. Extract the numerical value and keep two decimal places, for example, 123.45."
}
In the Java SDK, this parameter is named OcrOptions. The minimum required version for the DashScope Python SDK is 1.22.2, and for the Java SDK is 2.18.4.
For HTTP calls, place ocr_options in the parameters object.
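Putting task, task_config, and result_schema together, a call to the built-in information extraction task might look like the following sketch. The schema entries reuse the illustrative fields from the example above, and the image URL is a placeholder.
import os
import dashscope

dashscope.base_http_api_url = 'https://dashscope-intl.aliyuncs.com/api/v1'

response = dashscope.MultiModalConversation.call(
    api_key=os.getenv("DASHSCOPE_API_KEY"),
    model="qwen-vl-ocr-2025-11-20",
    # No text prompt is needed: the built-in task supplies its default prompt.
    messages=[{"role": "user", "content": [{"image": "https://example.com/invoice.jpg"}]}],
    ocr_options={
        "task": "key_information_extraction",
        "task_config": {
            "result_schema": {
                "invoice_number": "",
                "issue_date": "The date the invoice was issued, in YYYY-MM-DD format.",
            }
        },
    },
)
print(response["output"]["choices"][0]["message"].content[0]["text"])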
seed integer (Optional)
A random number seed. Using a seed ensures reproducible results. If you pass the same seed value in a call and keep other parameters unchanged, the model returns a deterministic result.
Value range: [0, 2³¹−1].
We recommend that you use the default value.
When you make an HTTP call, place seed in the parameters object.
temperature float (Optional) Default: 0.01
The sampling temperature controls the diversity of the text generated by the model.
A higher temperature results in more diverse text, while a lower temperature results in more deterministic text.
Value range: [0, 2)
Because both temperature and top_p control the diversity of the generated text, you can set only one of them.
We recommend that you use the default value.
When you make an HTTP call, place temperature in the parameters object.
top_p float (Optional) Default: 0.001
This parameter is the probability threshold for nucleus sampling, which controls the diversity of the text that the model generates.
A higher value results in more diverse text. A lower value results in more deterministic text.
Value range: (0, 1.0]
Because both temperature and top_p control text diversity, you should set only one of them.
We recommend that you use the default value.
In the Java SDK, the parameter is topP. For HTTP calls, place top_p in the parameters object.
top_k integer (Optional) Default: 1
Specifies the size of the candidate set for sampling during generation. For example, if you set the value to 50, only the 50 tokens with the highest scores are used as the candidate set for random sampling. A larger value increases randomness, while a smaller value increases determinism. If the value is None or greater than 100, the top_k policy is not enabled. In this case, only the top_p policy takes effect.
The value must be greater than or equal to 0.
In the Java SDK, the parameter is topK. For HTTP calls, place top_k in the parameters object.
We recommend that you use the default value.
repetition_penalty float (Optional) Default: 1.0
The penalty for repeated sequences during model generation. A higher value can reduce repetition in the generated text. A value of 1.0 means no penalty is applied.
We recommend using the default value.
In the Java SDK, the parameter is repetitionPenalty. For HTTP calls, add repetition_penalty to the parameters object.
presence_penalty float (Optional) Default: 0.0
Controls the repetition of content in the text generated by the model.
The value must be within the range of -2.0 to 2.0. A positive value reduces repetition, and a negative value increases it.
Increase this value for scenarios that require diversity, creativity, or brainstorming, such as creative writing. Decrease this value for scenarios that emphasize consistency and terminological accuracy, such as technical documents or formal texts.
How it works
If the value of this parameter is positive, the model applies a penalty to tokens that already exist in the text. The penalty is applied regardless of the number of times a token appears. This reduces the likelihood of these tokens reappearing, which decreases content repetition and increases word diversity.
We recommend that you use the default value.
stream boolean (Optional) Default: false
Specifies whether to stream the response. Valid values:
false: The model returns the result at once after all content is generated.
true: The model outputs content in chunks as it is generated.
This parameter is supported only by the Python SDK. To use streaming output with the Java SDK, you can call the streamCall interface. To use streaming output over HTTP, set X-DashScope-SSE to enable in the header.
incremental_output boolean (Optional) Default: false
Specifies whether to enable incremental output in streaming output mode. The recommended setting is true.
Valid values:
false: Each output contains the entire sequence generated so far. The final output is the complete result.
I
I like
I like apple
I like apple.
true (Recommended): Enables incremental output. Subsequent outputs contain only the newly generated content. You must concatenate these segments to retrieve the complete result.
I
like
apple
.
In the Java SDK, the parameter is incrementalOutput. For HTTP calls, add incremental_output to the parameters object.
stop string or array (Optional)
Specifies the stop words. When a string or token_id specified in stop appears in the generated text, generation stops immediately.
You can use this parameter to specify sensitive words and control the model's output.
If stop is an array, you cannot mix token_ids and strings as elements. For example, you cannot specify ["Hello",104307].
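As a brief illustration, the sketch below passes stop as an array of strings; per the constraint above, strings and token_ids are not mixed in one array. The stop words and image URL are placeholders, and the sketch assumes the SDK forwards stop into the request's parameters.
import os
import dashscope

# A minimal sketch: generation halts as soon as either stop string appears.
response = dashscope.MultiModalConversation.call(
    api_key=os.getenv("DASHSCOPE_API_KEY"),
    model="qwen-vl-ocr-2025-11-20",
    messages=[{"role": "user", "content": [{"image": "https://example.com/sample.jpg"}]}],
    stop=["CONFIDENTIAL", "###"],
)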
logprobs boolean (Optional) Default: false
Specifies whether to return the log probabilities of output tokens. Valid values:
true
false
Supported models: qwen-vl-ocr-2025-04-13 and later models.
For HTTP calls, place logprobs in the parameters object.
top_logprobs integer (Optional) Default: 0
Specifies the number of most likely tokens to return at each generation step. This parameter applies only when logprobs is set to true.
The value must be an integer from 0 to 5.
In the Java SDK, the parameter is named topLogprobs. For HTTP calls, set the top_logprobs parameter in the parameters object.
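The sketch below enables log probabilities and then reads the structure documented in the response section that follows. The exact location of the logprobs object on the SDK response is an assumption based on this reference, which describes it as attached to the choices object; the image URL is a placeholder.
import os
import dashscope

response = dashscope.MultiModalConversation.call(
    api_key=os.getenv("DASHSCOPE_API_KEY"),
    model="qwen-vl-ocr-2025-11-20",
    messages=[{"role": "user", "content": [{"image": "https://example.com/sample.jpg"}]}],
    logprobs=True,    # return log probabilities for output tokens
    top_logprobs=2,   # also return the 2 most likely alternatives per position
)

# Assumed path: logprobs hangs off each element of choices, as described below.
choice = response["output"]["choices"][0]
for item in choice["logprobs"]["content"]:
    print(item["token"], item["logprob"])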
Chat response object (same format for streaming and non-streaming outputs)
The rotated rectangle representation of the text box:
center_x and center_y are the coordinates of the text box's centroid.
width is the width of the text box, and height is the height.
angle is the rotation angle of the text box relative to the horizontal direction. The value range is [-90, 90].
location array
Example: [x1, y1, x2, y2, x3, y3, x4, y4]
The coordinates of the four vertices of the text box. The coordinates are arranged in clockwise order, starting from the top-left vertex: top-left → top-right → bottom-right → bottom-left.
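Both text-box representations can be turned into vertex coordinates for drawing or cropping. The sketch below is illustrative only; the helper names are hypothetical and the field layout follows the descriptions above.
import math

# Flat location array -> list of (x, y) vertices, clockwise from the top-left.
def location_to_vertices(location):
    return list(zip(location[0::2], location[1::2]))

# Rotated-rectangle form -> the same four vertices, for comparison.
def rotated_rect_to_vertices(center_x, center_y, width, height, angle):
    rad = math.radians(angle)
    cos_a, sin_a = math.cos(rad), math.sin(rad)
    corners = [(-width / 2, -height / 2), (width / 2, -height / 2),
               (width / 2, height / 2), (-width / 2, height / 2)]
    return [(center_x + x * cos_a - y * sin_a, center_y + x * sin_a + y * cos_a)
            for x, y in corners]

print(location_to_vertices([10, 20, 110, 20, 110, 60, 10, 60]))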
text string
The content of the text line.
text string
The content of the output message.
logprobs object
The probability information for the current `choices` object.
Properties
content array
An array of tokens that have log probability information.
Properties
token string
The current token.
bytes array
A list of the original UTF-8 bytes of the current token. This helps accurately reconstruct the output content, especially when you handle emojis and Chinese characters.
logprob float
The log probability of the current token. A `null` return value indicates an extremely low probability.
top_logprobs array
The most likely tokens at the current token's position and their log probabilities. The number of elements is the same as the value of the top_logprobs input parameter.
Properties
token string
The current token.
bytes array
A list of the original UTF-8 bytes of the current token. This helps accurately reconstruct the output content, especially when you handle emojis and Chinese characters.
logprob float
The log probability of the current token. A `null` return value indicates an extremely low probability.
usage object
Information about the tokens that are used in this request.
Properties
input_tokens integer
The number of input tokens.
output_tokens integer
The number of output tokens.
characters integer
This parameter is currently fixed to 0.
input_tokens_details object
A fine-grained classification of input tokens.
Properties
image_tokens integer
The number of tokens that correspond to the image that is input to the model.
text_tokens integer
The number of tokens that correspond to the text that is input to the model.
output_tokens_details object
A fine-grained classification of output tokens.
Properties
text_tokens integer
The number of text tokens in the model output.
total_tokens integer
The total number of tokens consumed. This is the sum of input_tokens and output_tokens.
image_tokens integer
This field is returned if the input includes an image. It represents the number of tokens that correspond to the image input.
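As a small sanity check on the usage object, total_tokens should equal input_tokens plus output_tokens. The sketch below assumes dictionary-style access to a response, as in the Python examples above; print_usage is a hypothetical helper.
def print_usage(response):
    # Print the token breakdown documented above; total = input + output.
    usage = response["usage"]
    details = usage.get("input_tokens_details", {})
    print(f"input: {usage['input_tokens']} "
          f"(image: {details.get('image_tokens', 0)}, "
          f"text: {details.get('text_tokens', 0)}), "
          f"output: {usage['output_tokens']}, "
          f"total: {usage['total_tokens']}")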
Error codes
If a model call returns an error, see Error messages to resolve the issue.