from openai import OpenAI
import os

# Prompt that instructs the model to extract structured train-ticket fields
# and return them as JSON; blurry/obscured characters become "?".
PROMPT_TICKET_EXTRACTION = """
Please extract the invoice number, train number, departure station, arrival station, departure date and time, seat number, seat class, ticket price, ID card number, and passenger name from the train ticket image.
You must accurately extract the key information. Do not omit or fabricate information. Replace any single character that is blurry or obscured by strong light with an English question mark (?).
Return the data in JSON format as follows: {'invoice_number': 'xxx', 'departure_station': 'xxx', 'arrival_station': 'xxx', 'departure_date_and_time':'xxx', 'seat_number': 'xxx','ticket_price':'xxx', 'id_card_number': 'xxx', 'passenger_name': 'xxx'},
"""

try:
    client = OpenAI(
        # API keys for different regions are different. To obtain an API key, see https://www.alibabacloud.com/help/en/model-studio/get-api-key.
        # If the environment variable is not configured, replace the following line with your Model Studio API key: api_key="sk-xxx",
        api_key=os.getenv("DASHSCOPE_API_KEY"),
        # The following is the base URL for the Singapore region. If you use a model in the US (Virginia) region, change the base_url to https://dashscope-us.aliyuncs.com/compatible-mode/v1
        # If you use a model in the China (Beijing) region, replace the base_url with: https://dashscope.aliyuncs.com/compatible-mode/v1
        base_url="https://dashscope-intl.aliyuncs.com/compatible-mode/v1",
    )
    completion = client.chat.completions.create(
        model="qwen-vl-ocr-2025-11-20",
        messages=[
            {
                "role": "user",
                "content": [
                    {
                        "type": "image_url",
                        "image_url": {"url": "https://img.alicdn.com/imgextra/i2/O1CN01ktT8451iQutqReELT_!!6000000004408-0-tps-689-487.jpg"},
                        # Minimum pixel threshold for the input image. Smaller images are
                        # enlarged until their total pixel count exceeds min_pixels.
                        "min_pixels": 32 * 32 * 3,
                        # Maximum pixel threshold for the input image. Larger images are
                        # reduced until their total pixel count is below max_pixels.
                        "max_pixels": 32 * 32 * 8192,
                    },
                    # The model supports passing a prompt in the following text field. If no
                    # prompt is passed, the default prompt is used: "Please output only the text
                    # content from the image without any additional descriptions or formatting."
                    {"type": "text", "text": PROMPT_TICKET_EXTRACTION},
                ],
            }
        ],
    )
    print(completion.choices[0].message.content)
except Exception as e:
    print(f"Error message: {e}")
Node.js
import OpenAI from 'openai';

// Define the prompt for extracting train ticket information.
const PROMPT_TICKET_EXTRACTION = `
Please extract the invoice number, train number, departure station, arrival station, departure date and time, seat number, seat class, ticket price, ID card number, and passenger name from the train ticket image.
You must accurately extract the key information. Do not omit or fabricate information. Replace any single character that is blurry or obscured by strong light with an English question mark (?).
Return the data in JSON format as follows: {'invoice_number': 'xxx', 'departure_station': 'xxx', 'arrival_station': 'xxx', 'departure_date_and_time':'xxx', 'seat_number': 'xxx','ticket_price':'xxx', 'id_card_number': 'xxx', 'passenger_name': 'xxx'}
`;

const client = new OpenAI({
  // API keys for different regions are different. To obtain an API key, see https://www.alibabacloud.com/help/en/model-studio/get-api-key.
  // If the environment variable is not configured, replace the following line with your Model Studio API key: apiKey: "sk-xxx",
  apiKey: process.env.DASHSCOPE_API_KEY,
  // If you use a model in the China (Beijing) region, replace the base_url with: https://dashscope.aliyuncs.com/compatible-mode/v1
  baseURL: 'https://dashscope-intl.aliyuncs.com/compatible-mode/v1',
});

async function main() {
  const response = await client.chat.completions.create({
    model: 'qwen-vl-ocr-2025-11-20',
    messages: [
      {
        role: 'user',
        content: [
          // The model supports passing a prompt in the text field. If no prompt is passed,
          // the default prompt is used: "Please output only the text content from the image
          // without any additional descriptions or formatting."
          { type: 'text', text: PROMPT_TICKET_EXTRACTION },
          {
            type: 'image_url',
            image_url: {
              url: 'https://img.alicdn.com/imgextra/i2/O1CN01ktT8451iQutqReELT_!!6000000004408-0-tps-689-487.jpg',
            },
            // Minimum pixel threshold: smaller images are enlarged above this total pixel count.
            min_pixels: 32 * 32 * 3,
            // Maximum pixel threshold: larger images are reduced below this total pixel count.
            max_pixels: 32 * 32 * 8192,
          },
        ],
      },
    ],
  });
  console.log(response.choices[0].message.content);
}

// Surface API/network failures instead of leaving the promise floating.
main().catch((err) => {
  console.error(`Error message: ${err}`);
});
curl
# ======= Important =======
# API keys for different regions are different. To obtain an API key, see https://www.alibabacloud.com/help/en/model-studio/get-api-key.
# If you use a model in the China (Beijing) region, replace the base_url with: https://dashscope.aliyuncs.com/compatible-mode/v1/chat/completions
# === Delete this comment before execution ===
# NOTE: single quotes inside the single-quoted -d payload are written as '\'' (close quote,
# escaped quote, reopen quote) — the previous \' form breaks POSIX shell quoting.
# min_pixels (3072 = 3x32x32) and max_pixels (8388608 = 8192x32x32) bound how the image is resized.
curl -X POST https://dashscope-intl.aliyuncs.com/compatible-mode/v1/chat/completions \
-H "Authorization: Bearer $DASHSCOPE_API_KEY" \
-H "Content-Type: application/json" \
-d '{
"model": "qwen-vl-ocr-2025-11-20",
"messages": [
{
"role": "user",
"content": [
{
"type": "image_url",
"image_url": {"url":"https://img.alicdn.com/imgextra/i2/O1CN01ktT8451iQutqReELT_!!6000000004408-0-tps-689-487.jpg"},
"min_pixels": 3072,
"max_pixels": 8388608
},
{"type": "text", "text": "Please extract the invoice number, train number, departure station, arrival station, departure date and time, seat number, seat class, ticket price, ID card number, and passenger name from the train ticket image. You must accurately extract the key information. Do not omit or fabricate information. Replace any single character that is blurry or obscured by strong light with an English question mark (?). Return the data in JSON format as follows: {'\''invoice_number'\'': '\''xxx'\'', '\''departure_station'\'': '\''xxx'\'', '\''arrival_station'\'': '\''xxx'\'', '\''departure_date_and_time'\'':'\''xxx'\'', '\''seat_number'\'': '\''xxx'\'','\''ticket_price'\'':'\''xxx'\'', '\''id_card_number'\'': '\''xxx'\'', '\''passenger_name'\'': '\''xxx'\''}"}
]
}
]
}'
Keluaran streaming
Python
import os
from openai import OpenAI

# Prompt that instructs the model to extract structured train-ticket fields
# and return them as JSON; blurry/obscured characters become "?".
PROMPT_TICKET_EXTRACTION = """
Please extract the invoice number, train number, departure station, arrival station, departure date and time, seat number, seat class, ticket price, ID card number, and passenger name from the train ticket image.
You must accurately extract the key information. Do not omit or fabricate information. Replace any single character that is blurry or obscured by strong light with an English question mark (?).
Return the data in JSON format as follows: {'invoice_number': 'xxx','departure_station': 'xxx', 'arrival_station': 'xxx', 'departure_date_and_time':'xxx', 'seat_number': 'xxx','ticket_price':'xxx', 'id_card_number': 'xxx', 'passenger_name': 'xxx'},
"""

client = OpenAI(
    # API keys for different regions are different. To obtain an API key, see https://www.alibabacloud.com/help/en/model-studio/get-api-key.
    # If the environment variable is not configured, replace the following line with your Model Studio API key: api_key="sk-xxx",
    api_key=os.getenv("DASHSCOPE_API_KEY"),
    # If you use a model in the China (Beijing) region, replace the base_url with: https://dashscope.aliyuncs.com/compatible-mode/v1
    base_url="https://dashscope-intl.aliyuncs.com/compatible-mode/v1",
)

completion = client.chat.completions.create(
    model="qwen-vl-ocr-2025-11-20",
    messages=[
        {
            "role": "user",
            "content": [
                {
                    "type": "image_url",
                    "image_url": {"url": "https://img.alicdn.com/imgextra/i2/O1CN01ktT8451iQutqReELT_!!6000000004408-0-tps-689-487.jpg"},
                    # Minimum pixel threshold for the input image. Smaller images are
                    # enlarged until their total pixel count exceeds min_pixels.
                    "min_pixels": 32 * 32 * 3,
                    # Maximum pixel threshold for the input image. Larger images are
                    # reduced until their total pixel count is below max_pixels.
                    "max_pixels": 32 * 32 * 8192,
                },
                # The model supports passing a prompt in the following text field. If no
                # prompt is passed, the default prompt is used: "Please output only the text
                # content from the image without any additional descriptions or formatting."
                {"type": "text", "text": PROMPT_TICKET_EXTRACTION},
            ],
        }
    ],
    stream=True,
    # Include token usage statistics in the final streamed chunk.
    stream_options={"include_usage": True},
)

for chunk in completion:
    print(chunk.model_dump_json())
Node.js
import OpenAI from 'openai';

// Define the prompt for extracting train ticket information.
const PROMPT_TICKET_EXTRACTION = `
Please extract the invoice number, train number, departure station, arrival station, departure date and time, seat number, seat class, ticket price, ID card number, and passenger name from the train ticket image.
You must accurately extract the key information. Do not omit or fabricate information. Replace any single character that is blurry or obscured by strong light with an English question mark (?).
Return the data in JSON format as follows: {'invoice_number': 'xxx', 'departure_station': 'xxx', 'arrival_station': 'xxx', 'departure_date_and_time':'xxx', 'seat_number': 'xxx','ticket_price':'xxx', 'id_card_number': 'xxx', 'passenger_name': 'xxx'}
`;

const openai = new OpenAI({
  // API keys for different regions are different. To obtain an API key, see https://www.alibabacloud.com/help/en/model-studio/get-api-key.
  // If the environment variable is not configured, replace the following line with your Model Studio API key: apiKey: "sk-xxx",
  apiKey: process.env.DASHSCOPE_API_KEY,
  // If you use a model in the China (Beijing) region, replace the base_url with: https://dashscope.aliyuncs.com/compatible-mode/v1
  baseURL: 'https://dashscope-intl.aliyuncs.com/compatible-mode/v1',
});

async function main() {
  const response = await openai.chat.completions.create({
    model: 'qwen-vl-ocr-2025-11-20',
    messages: [
      {
        role: 'user',
        content: [
          // The model supports passing a prompt in the text field. If no prompt is passed,
          // the default prompt is used: "Please output only the text content from the image
          // without any additional descriptions or formatting."
          { type: 'text', text: PROMPT_TICKET_EXTRACTION },
          {
            type: 'image_url',
            image_url: {
              url: 'https://img.alicdn.com/imgextra/i2/O1CN01ktT8451iQutqReELT_!!6000000004408-0-tps-689-487.jpg',
            },
            // Minimum pixel threshold: smaller images are enlarged above this total pixel count.
            min_pixels: 32 * 32 * 3,
            // Maximum pixel threshold: larger images are reduced below this total pixel count.
            max_pixels: 32 * 32 * 8192,
          },
        ],
      },
    ],
    stream: true,
    // Include token usage statistics in the final streamed chunk.
    stream_options: { include_usage: true },
  });

  let fullContent = '';
  console.log('Streaming output content:');
  for await (const chunk of response) {
    // The final usage-only chunk carries an empty choices array; skip it.
    const delta = chunk.choices[0]?.delta?.content;
    if (delta != null) {
      fullContent += delta;
      console.log(delta);
    }
  }
  console.log(`Full output content: ${fullContent}`);
}

// Surface API/network failures instead of leaving the promise floating.
main().catch((err) => {
  console.error(`Error message: ${err}`);
});
curl
# ======= Important =======
# API keys for different regions are different. To obtain an API key, see https://www.alibabacloud.com/help/en/model-studio/get-api-key.
# If you use a model in the China (Beijing) region, replace the base_url with: https://dashscope.aliyuncs.com/compatible-mode/v1/chat/completions
# === Delete this comment before execution ===
# NOTE: single quotes inside the single-quoted -d payload are written as '\'' (close quote,
# escaped quote, reopen quote) — the previous \' form breaks POSIX shell quoting.
# "stream": true returns chunks as they are generated; include_usage adds token usage to the final chunk.
curl -X POST https://dashscope-intl.aliyuncs.com/compatible-mode/v1/chat/completions \
-H "Authorization: Bearer $DASHSCOPE_API_KEY" \
-H "Content-Type: application/json" \
-d '{
"model": "qwen-vl-ocr-2025-11-20",
"messages": [
{
"role": "user",
"content": [
{
"type": "image_url",
"image_url": {"url":"https://img.alicdn.com/imgextra/i2/O1CN01ktT8451iQutqReELT_!!6000000004408-0-tps-689-487.jpg"},
"min_pixels": 3072,
"max_pixels": 8388608
},
{"type": "text", "text": "Please extract the invoice number, train number, departure station, arrival station, departure date and time, seat number, seat class, ticket price, ID card number, and passenger name from the train ticket image. You must accurately extract the key information. Do not omit or fabricate information. Replace any single character that is blurry or obscured by strong light with an English question mark (?). Return the data in JSON format as follows: {'\''invoice_number'\'': '\''xxx'\'', '\''departure_station'\'': '\''xxx'\'', '\''arrival_station'\'': '\''xxx'\'', '\''departure_date_and_time'\'':'\''xxx'\'', '\''seat_number'\'': '\''xxx'\'','\''ticket_price'\'':'\''xxx'\'', '\''id_card_number'\'': '\''xxx'\'', '\''passenger_name'\'': '\''xxx'\''}"}
]
}
],
"stream": true,
"stream_options": {"include_usage": true}
}'
modelstring(Wajib)
Nama model. Lihat <a baseurl="t3183345_v2_1_0.xdita" data-node="5088954" data-root="85177" data-tag="xref" href="t2751232.xdita#8442951a76clt" id="b9f5555f63h80">Qwen-OCR</a> untuk daftar model yang didukung.
messagesarray(Wajib)
Urutan pesan yang menyediakan konteks untuk model dalam urutan percakapan.
Jenis pesan
Pesan Penggunaobject(Wajib)
Pesan pengguna yang memberikan instruksi dan gambar untuk diproses oleh model.
Properti
contentarray(Wajib)
Konten pesan.
Properti
typestring(Wajib)
Nilai yang valid:
text
Atur tipe ke text untuk input teks.
image_url
Gunakan image_url untuk menentukan gambar input.
textstring(Opsional)
Teks input.
Nilai default adalah Please output only the text content from the image without any additional descriptions or formatting. Perilaku default ini menginstruksikan model untuk mengekstrak seluruh teks dari gambar.
image_urlobject
Informasi tentang gambar input. Parameter ini wajib jika type diatur ke image_url.
Properti
urlstring(Wajib)
URL atau Data URL Base64 dari gambar. Untuk informasi lebih lanjut tentang melewatkan file lokal, lihat Ekstraksi teks.
min_pixelsinteger(Opsional)
Ambang batas piksel minimum untuk gambar input dalam piksel.
Jika jumlah piksel gambar input berada di bawah min_pixels, gambar tersebut diperbesar hingga jumlah total pikselnya melebihi min_pixels.
Konversi antara token gambar dan piksel
Jumlah piksel per token gambar bervariasi tergantung model:
qwen-vl-ocr-latest, qwen-vl-ocr-2025-11-20: Setiap token sesuai dengan 32×32 piksel.
qwen-vl-ocr, qwen-vl-ocr-2025-08-28, dan model sebelumnya: Setiap token sesuai dengan 28×28 piksel.
Rentang nilai untuk min_pixels
qwen-vl-ocr-latest, qwen-vl-ocr-2025-11-20: Nilai default dan minimum adalah 3072 (yaitu, 3×32×32).
qwen-vl-ocr, qwen-vl-ocr-2025-08-28, dan model sebelumnya: Nilai default dan minimum adalah 3136 (yaitu, 4×28×28).
max_pixels integer(Opsional)
Ambang batas piksel maksimum untuk gambar input dalam piksel.
Jika jumlah piksel gambar input berada dalam rentang [min_pixels, max_pixels], model memproses gambar aslinya tanpa mengubah ukuran. Jika jumlah piksel gambar input melebihi max_pixels, gambar tersebut diperkecil hingga jumlah pikselnya kurang dari max_pixels.
Konversi antara token gambar dan piksel
Jumlah piksel per token gambar bervariasi tergantung model:
qwen-vl-ocr-latest, qwen-vl-ocr-2025-11-20: Setiap token sesuai dengan 32×32 piksel.
qwen-vl-ocr, qwen-vl-ocr-2025-08-28, dan model sebelumnya: Setiap token sesuai dengan 28×28 piksel.
Rentang nilai untuk max_pixels
qwen-vl-ocr-latest, qwen-vl-ocr-2025-11-20
Nilai default: 8388608 (yaitu, 8192×32×32)
Nilai maksimum: 30720000 (yaitu, 30000×32×32)
qwen-vl-ocr, qwen-vl-ocr-2025-08-28, dan model sebelumnya
stream boolean(Opsional) Default: false
Menentukan apakah respons dikembalikan dalam mode streaming.
Nilai yang valid:
false: Mengembalikan respons lengkap sekaligus setelah model selesai menghasilkan.
true: Mengembalikan blok data saat model menghasilkannya. Klien harus membaca blok secara berurutan untuk merekonstruksi respons lengkap.
stream_options object(Opsional)
Pengaturan konfigurasi untuk keluaran streaming. Parameter ini hanya berlaku ketika parameter stream diatur ke true.
Properti
include_usage boolean(Opsional) Default: false
Menentukan apakah informasi penggunaan token disertakan dalam blok data terakhir dari aliran.
Nilai yang valid:
true
false
max_tokens integer(Opsional)
Jumlah maksimum token yang dihasilkan dalam output. Jika konten yang dihasilkan melebihi nilai ini, respons akan dipotong.
Untuk qwen-vl-ocr-latest, qwen-vl-ocr-2025-11-20, dan qwen-vl-ocr-2024-10-28, nilai default dan maksimum sama dengan panjang output maksimum model. Untuk informasi lebih lanjut, lihat Ketersediaan.
Untuk qwen-vl-ocr, qwen-vl-ocr-2025-04-13, dan qwen-vl-ocr-2025-08-28, nilai default dan maksimum adalah 4096.
Untuk meningkatkan nilai parameter ini menjadi angka antara 4097 dan 8192, kirim email ke modelstudio@service.aliyun.com. Email Anda harus mencantumkan informasi berikut: ID akun Alibaba Cloud Anda, jenis gambar (misalnya dokumen, e-commerce, atau kontrak), nama model, perkiraan queries per second (QPS) dan total permintaan harian, serta persentase permintaan di mana output model melebihi 4096 token.
logprobsboolean (Opsional) Default: false
Menentukan apakah probabilitas log dari token output dikembalikan. Nilai yang valid:
true
false
top_logprobsinteger (Opsional) Default: 0
Menentukan jumlah token paling mungkin yang dikembalikan pada setiap langkah generasi.
Rentang nilai: [0, 5]
Parameter ini hanya berlaku ketika logprobs bernilai true.
temperaturefloat (Opsional) Default: 0.01
Suhu pengambilan sampel mengontrol keragaman teks yang dihasilkan oleh model.
Suhu yang lebih tinggi menghasilkan teks yang lebih beragam, sedangkan suhu yang lebih rendah menghasilkan teks yang lebih deterministik.
Rentang nilai: [0, 2)
Karena baik temperature maupun top_p mengontrol keragaman teks yang dihasilkan, Anda hanya dapat mengatur salah satunya.
Kami menyarankan Anda menggunakan nilai default.
top_pfloat(Opsional) Default: 0.001
Parameter ini adalah ambang batas probabilitas untuk pengambilan sampel inti, yang mengontrol keragaman teks yang dihasilkan oleh model.
Nilai yang lebih tinggi menghasilkan teks yang lebih beragam. Nilai yang lebih rendah menghasilkan teks yang lebih deterministik.
Rentang nilai: (0, 1.0]
Karena baik temperature maupun top_p mengontrol keragaman teks, Anda hanya boleh mengatur salah satunya.
Kami menyarankan Anda menggunakan nilai default.
top_kinteger (Opsional) Default: 1
Menentukan ukuran set kandidat untuk pengambilan sampel selama generasi. Misalnya, jika Anda mengatur nilainya ke 50, hanya 50 token dengan skor tertinggi yang digunakan sebagai set kandidat untuk pengambilan sampel acak. Nilai yang lebih besar meningkatkan keacakan, sedangkan nilai yang lebih kecil meningkatkan determinisme. Jika nilainya None atau lebih besar dari 100, kebijakan top_k tidak diaktifkan. Dalam hal ini, hanya kebijakan top_p yang berlaku.
Nilainya harus lebih besar atau sama dengan 0.
Parameter ini bukan parameter standar OpenAI. Saat menggunakan SDK Python, letakkan parameter ini dalam objek extra_body. Contoh: extra_body={"top_k": xxx}. Saat menggunakan SDK Node.js atau HTTP, lewatkan parameter ini di tingkat atas.
Kami menyarankan Anda menggunakan nilai default.
repetition_penalty float(Opsional) Default: 1.0
Hukuman untuk urutan berulang selama generasi model. Nilai yang lebih tinggi dapat mengurangi pengulangan dalam teks yang dihasilkan. Nilai 1.0 berarti tidak ada hukuman yang diterapkan.
Kami menyarankan menggunakan nilai default.
presence_penaltyfloat(Opsional) Default: 0.0
Mengontrol pengulangan konten dalam teks yang dihasilkan oleh model.
Nilainya harus berada dalam rentang -2.0 hingga 2.0. Nilai positif mengurangi pengulangan, dan nilai negatif meningkatkannya.
Tingkatkan nilai ini untuk skenario yang membutuhkan keragaman, kreativitas, atau curah pendapat, seperti penulisan kreatif. Turunkan nilai ini untuk skenario yang menekankan konsistensi dan akurasi terminologi, seperti dokumen teknis atau teks formal.
Cara kerja
Jika nilai parameter ini positif, model menerapkan hukuman pada token yang sudah ada dalam teks. Hukuman diterapkan terlepas dari jumlah kemunculan token tersebut. Hal ini mengurangi kemungkinan token tersebut muncul kembali, sehingga mengurangi pengulangan konten dan meningkatkan keragaman kata.
Kami menyarankan Anda menggunakan nilai default.
seed integer(Opsional)
Seed bilangan acak. Menggunakan seed memastikan hasil yang dapat direproduksi. Jika Anda melewatkan nilai seed yang sama dalam panggilan dan menjaga parameter lain tetap tidak berubah, model akan mengembalikan hasil yang deterministik.
Rentang nilai: [0,2<sup>31</sup>−1].
Kami menyarankan Anda menggunakan nilai default.
stop string atau array(Opsional)
Menentukan kata berhenti. Ketika string atau token_id yang ditentukan dalam stop muncul dalam teks yang dihasilkan, generasi segera dihentikan.
Anda dapat menggunakan parameter ini untuk menentukan kata sensitif dan mengontrol output model.
Jika stop berupa array, Anda tidak boleh mencampur token_id dan string sebagai elemen. Misalnya, Anda tidak dapat menentukan ["Hello",104307].
Kode berikut memberikan contoh cara memanggil tugas bawaan pengenalan presisi tinggi. Untuk informasi lebih lanjut, lihat Memanggil tugas bawaan.
import os
import dashscope

# The following is the base URL for the Singapore region. If you use a model in the US (Virginia) region, change the base_url to https://dashscope-us.aliyuncs.com/api/v1.
# If you use a model in the China (Beijing) region, change the base_url to https://dashscope.aliyuncs.com/api/v1.
dashscope.base_http_api_url = 'https://dashscope-intl.aliyuncs.com/api/v1'

messages = [{
    "role": "user",
    "content": [{
        "image": "https://help-static-aliyun-doc.aliyuncs.com/file-manage-files/zh-CN/20241108/ctdzex/biaozhun.jpg",
        # Minimum pixel threshold: smaller images are scaled up above this total pixel count.
        "min_pixels": 32 * 32 * 3,
        # Maximum pixel threshold: larger images are scaled down below this total pixel count.
        "max_pixels": 32 * 32 * 8192,
        # Specifies whether to enable automatic image rotation.
        "enable_rotate": False,
    }],
}]

response = dashscope.MultiModalConversation.call(
    # If you have not configured an environment variable, replace the following line with your Model Studio API key: api_key="sk-xxx",
    api_key=os.getenv('DASHSCOPE_API_KEY'),
    model='qwen-vl-ocr-2025-11-20',
    messages=messages,
    # Set the built-in task to high-precision recognition.
    ocr_options={"task": "advanced_recognition"},
)

# The high-precision recognition task returns the result as plain text.
print(response["output"]["choices"][0]["message"].content[0]["text"])
// dashscope SDK version >= 2.21.8
import java.util.Arrays;
import java.util.Collections;
import java.util.Map;
import java.util.HashMap;
import com.alibaba.dashscope.aigc.multimodalconversation.MultiModalConversation;
import com.alibaba.dashscope.aigc.multimodalconversation.MultiModalConversationParam;
import com.alibaba.dashscope.aigc.multimodalconversation.MultiModalConversationResult;
import com.alibaba.dashscope.aigc.multimodalconversation.OcrOptions;
import com.alibaba.dashscope.common.MultiModalMessage;
import com.alibaba.dashscope.common.Role;
import com.alibaba.dashscope.exception.ApiException;
import com.alibaba.dashscope.exception.NoApiKeyException;
import com.alibaba.dashscope.exception.UploadFileException;
import com.alibaba.dashscope.utils.Constants;

/**
 * Calls the built-in high-precision recognition (advanced_recognition) OCR task
 * of qwen-vl-ocr and prints the recognized plain text.
 */
public class Main {
    static {
        // The following is the base URL for the Singapore region. If you use a model in the US (Virginia) region, change the base_url to https://dashscope-us.aliyuncs.com/api/v1.
        // If you use a model in the China (Beijing) region, change the base_url to https://dashscope.aliyuncs.com/api/v1.
        Constants.baseHttpApiUrl = "https://dashscope-intl.aliyuncs.com/api/v1";
    }

    public static void simpleMultiModalConversationCall()
            throws ApiException, NoApiKeyException, UploadFileException {
        MultiModalConversation conv = new MultiModalConversation();

        // Image input plus resize thresholds and the rotation flag.
        Map<String, Object> map = new HashMap<>();
        map.put("image", "https://help-static-aliyun-doc.aliyuncs.com/file-manage-files/zh-CN/20241108/ctdzex/biaozhun.jpg");
        // Maximum pixel threshold: larger images are scaled down below this total pixel count.
        map.put("max_pixels", 8388608);
        // Minimum pixel threshold: smaller images are scaled up above this total pixel count.
        map.put("min_pixels", 3072);
        // Specifies whether to enable automatic image rotation.
        map.put("enable_rotate", false);

        // Configure the built-in OCR task: high-precision recognition.
        OcrOptions ocrOptions = OcrOptions.builder()
                .task(OcrOptions.Task.ADVANCED_RECOGNITION)
                .build();

        MultiModalMessage userMessage = MultiModalMessage.builder()
                .role(Role.USER.getValue())
                .content(Arrays.asList(map))
                .build();

        MultiModalConversationParam param = MultiModalConversationParam.builder()
                // If you have not configured an environment variable, replace the following line with your Model Studio API key: .apiKey("sk-xxx")
                .apiKey(System.getenv("DASHSCOPE_API_KEY"))
                .model("qwen-vl-ocr-2025-11-20")
                .message(userMessage)
                .ocrOptions(ocrOptions)
                .build();

        MultiModalConversationResult result = conv.call(param);
        // The high-precision recognition task returns the result as plain text.
        System.out.println(result.getOutput().getChoices().get(0).getMessage().getContent().get(0).get("text"));
    }

    public static void main(String[] args) {
        try {
            simpleMultiModalConversationCall();
        } catch (ApiException | NoApiKeyException | UploadFileException e) {
            System.out.println(e.getMessage());
        }
        System.exit(0);
    }
}
# ======= Important =======
# API keys vary by region. To get an API key, see https://www.alibabacloud.com/help/model-studio/get-api-key.
# The following is the base URL for the Singapore region. If you use a model in the US (Virginia) region, replace the base_url with https://dashscope-us.aliyuncs.com/api/v1/services/aigc/multimodal-generation/generation.
# If you use a model in the China (Beijing) region, replace the base_url with https://dashscope.aliyuncs.com/api/v1/services/aigc/multimodal-generation/generation.
# === Delete this comment before running ===
# The "ocr_options" parameter selects the built-in high-precision recognition task.
# min_pixels (3072 = 3x32x32) and max_pixels (8388608 = 8192x32x32) bound how the image is resized.
curl --location 'https://dashscope-intl.aliyuncs.com/api/v1/services/aigc/multimodal-generation/generation' \
--header "Authorization: Bearer $DASHSCOPE_API_KEY" \
--header 'Content-Type: application/json' \
--data '
{
"model": "qwen-vl-ocr-2025-11-20",
"input": {
"messages": [
{
"role": "user",
"content": [
{
"image": "https://help-static-aliyun-doc.aliyuncs.com/file-manage-files/zh-CN/20241108/ctdzex/biaozhun.jpg",
"min_pixels": 3072,
"max_pixels": 8388608,
"enable_rotate": false
}
]
}
]
},
"parameters": {
"ocr_options": {
"task": "advanced_recognition"
}
}
}
'
Ekstraksi informasi
Kode berikut memberikan contoh cara memanggil tugas bawaan ekstraksi informasi. Untuk informasi lebih lanjut, lihat Memanggil tugas bawaan.
# use [pip install -U dashscope] to update sdk
import os
import dashscope

# The following is the base URL for the Singapore region. If you use a model in the US (Virginia) region, change the base_url to https://dashscope-us.aliyuncs.com/api/v1.
# If you use a model in the China (Beijing) region, change the base_url to https://dashscope.aliyuncs.com/api/v1.
dashscope.base_http_api_url = 'https://dashscope-intl.aliyuncs.com/api/v1'

messages = [
    {
        "role": "user",
        "content": [
            {
                "image": "http://duguang-labelling.oss-cn-shanghai.aliyuncs.com/demo_ocr/receipt_zh_demo.jpg",
                # Minimum pixel threshold: smaller images are scaled up above this total pixel count.
                "min_pixels": 3072,
                # Maximum pixel threshold: larger images are scaled down below this total pixel count.
                "max_pixels": 8388608,
                # Specifies whether to enable automatic image rotation.
                "enable_rotate": False,
            }
        ],
    }
]

# Built-in key-information-extraction task: result_schema lists each field to
# extract and a description telling the model how to find and format it.
params = {
    "ocr_options": {
        "task": "key_information_extraction",
        "task_config": {
            "result_schema": {
                "Ride Date": "Corresponds to the ride date and time in the image, in the format YYYY-MM-DD, for example, 2025-03-05",
                "Invoice Code": "Extract the invoice code from the image, usually a combination of numbers or letters",
                "Invoice Number": "Extract the number from the invoice, usually composed of only digits."
            }
        }
    }
}

response = dashscope.MultiModalConversation.call(
    # API keys vary by region. To get an API key, see https://www.alibabacloud.com/help/model-studio/get-api-key.
    api_key=os.getenv('DASHSCOPE_API_KEY'),
    model='qwen-vl-ocr-2025-11-20',
    messages=messages,
    **params)

print(response.output.choices[0].message.content[0]["ocr_result"])
import java.util.Arrays;
import java.util.Collections;
import java.util.Map;
import java.util.HashMap;
import com.alibaba.dashscope.aigc.multimodalconversation.MultiModalConversation;
import com.alibaba.dashscope.aigc.multimodalconversation.MultiModalConversationParam;
import com.alibaba.dashscope.aigc.multimodalconversation.MultiModalConversationResult;
import com.alibaba.dashscope.aigc.multimodalconversation.OcrOptions;
import com.alibaba.dashscope.common.MultiModalMessage;
import com.alibaba.dashscope.common.Role;
import com.alibaba.dashscope.exception.ApiException;
import com.alibaba.dashscope.exception.NoApiKeyException;
import com.alibaba.dashscope.exception.UploadFileException;
import com.google.gson.JsonObject;
import com.alibaba.dashscope.utils.Constants;

/**
 * Calls the built-in key-information-extraction OCR task of qwen-vl-ocr with a
 * result schema describing the fields to extract, and prints the ocr_result.
 */
public class Main {
    static {
        // The following is the base URL for the Singapore region. If you use a model in the US (Virginia) region, change the base_url to https://dashscope-us.aliyuncs.com/api/v1.
        // If you use a model in the China (Beijing) region, change the base_url to https://dashscope.aliyuncs.com/api/v1.
        Constants.baseHttpApiUrl = "https://dashscope-intl.aliyuncs.com/api/v1";
    }

    public static void simpleMultiModalConversationCall()
            throws ApiException, NoApiKeyException, UploadFileException {
        MultiModalConversation conv = new MultiModalConversation();

        // Image input plus resize thresholds and the rotation flag.
        Map<String, Object> map = new HashMap<>();
        map.put("image", "http://duguang-labelling.oss-cn-shanghai.aliyuncs.com/demo_ocr/receipt_zh_demo.jpg");
        // Maximum pixel threshold: larger images are scaled down below this total pixel count.
        map.put("max_pixels", 8388608);
        // Minimum pixel threshold: smaller images are scaled up above this total pixel count.
        map.put("min_pixels", 3072);
        // Specifies whether to enable automatic image rotation.
        map.put("enable_rotate", false);

        MultiModalMessage userMessage = MultiModalMessage.builder()
                .role(Role.USER.getValue())
                .content(Arrays.asList(map))
                .build();

        // Schema describing each field the model should extract and how to format it.
        JsonObject resultSchema = new JsonObject();
        resultSchema.addProperty("Ride Date", "Corresponds to the ride date and time in the image, in the format YYYY-MM-DD, for example, 2025-03-05");
        resultSchema.addProperty("Invoice Code", "Extract the invoice code from the image, usually a combination of numbers or letters");
        resultSchema.addProperty("Invoice Number", "Extract the number from the invoice, usually composed of only digits.");

        // Configure the built-in OCR task: key information extraction with the schema above.
        OcrOptions ocrOptions = OcrOptions.builder()
                .task(OcrOptions.Task.KEY_INFORMATION_EXTRACTION)
                .taskConfig(OcrOptions.TaskConfig.builder()
                        .resultSchema(resultSchema)
                        .build())
                .build();

        MultiModalConversationParam param = MultiModalConversationParam.builder()
                // API keys vary by region. To get an API key, see https://www.alibabacloud.com/help/model-studio/get-api-key.
                // If you have not configured an environment variable, replace the following line with your Model Studio API key: .apiKey("sk-xxx")
                .apiKey(System.getenv("DASHSCOPE_API_KEY"))
                .model("qwen-vl-ocr-2025-11-20")
                .message(userMessage)
                .ocrOptions(ocrOptions)
                .build();

        MultiModalConversationResult result = conv.call(param);
        System.out.println(result.getOutput().getChoices().get(0).getMessage().getContent().get(0).get("ocr_result"));
    }

    public static void main(String[] args) {
        try {
            simpleMultiModalConversationCall();
        } catch (ApiException | NoApiKeyException | UploadFileException e) {
            System.out.println(e.getMessage());
        }
        System.exit(0);
    }
}
# ======= Important =======
# API keys vary by region. To get an API key, see https://www.alibabacloud.com/help/model-studio/get-api-key.
# The URL below is the endpoint for the Singapore region. For the US (Virginia) region, use https://dashscope-us.aliyuncs.com/api/v1/services/aigc/multimodal-generation/generation instead.
# For the China (Beijing) region, use https://dashscope.aliyuncs.com/api/v1/services/aigc/multimodal-generation/generation.
# This request runs the built-in key_information_extraction task: result_schema names the fields
# to extract from the receipt image; min_pixels / max_pixels bound the pixel count the image is
# scaled to, and enable_rotate controls automatic image rotation.
# === Delete this comment before running ===
curl --location 'https://dashscope-intl.aliyuncs.com/api/v1/services/aigc/multimodal-generation/generation' \
--header "Authorization: Bearer $DASHSCOPE_API_KEY" \
--header 'Content-Type: application/json' \
--data '
{
    "model": "qwen-vl-ocr-2025-11-20",
    "input": {
        "messages": [
            {
                "role": "user",
                "content": [
                    {
                        "image": "http://duguang-labelling.oss-cn-shanghai.aliyuncs.com/demo_ocr/receipt_zh_demo.jpg",
                        "min_pixels": 3072,
                        "max_pixels": 8388608,
                        "enable_rotate": false
                    }
                ]
            }
        ]
    },
    "parameters": {
        "ocr_options": {
            "task": "key_information_extraction",
            "task_config": {
                "result_schema": {
                    "Ride Date": "Corresponds to the ride date and time in the image, in the format YYYY-MM-DD, for example, 2025-03-05",
                    "Invoice Code": "Extract the invoice code from the image, usually a combination of numbers or letters",
                    "Invoice Number": "Extract the number from the invoice, usually composed of only digits."
                }
            }
        }
    }
}
'
Penguraian tabel
Kode berikut memberikan contoh cara memanggil tugas bawaan penguraian tabel. Untuk informasi lebih lanjut, lihat Memanggil tugas bawaan.
import os
import dashscope

# Base URL for the Singapore region. For the US (Virginia) region use
# https://dashscope-us.aliyuncs.com/api/v1; for the China (Beijing) region use
# https://dashscope.aliyuncs.com/api/v1.
dashscope.base_http_api_url = 'https://dashscope-intl.aliyuncs.com/api/v1'

# Single image item for the user turn.
image_item = {
    "image": "http://duguang-llm.oss-cn-hangzhou.aliyuncs.com/llm_data_keeper/data/doc_parsing/tables/photo/eng/17.jpg",
    # Lower pixel bound: smaller images are scaled up past min_pixels.
    "min_pixels": 32 * 32 * 3,
    # Upper pixel bound: larger images are scaled down below max_pixels.
    "max_pixels": 32 * 32 * 8192,
    # Automatic image rotation is disabled.
    "enable_rotate": False,
}
messages = [{"role": "user", "content": [image_item]}]

response = dashscope.MultiModalConversation.call(
    # API keys vary by region; see https://www.alibabacloud.com/help/model-studio/get-api-key.
    # Without the environment variable configured, pass api_key="sk-xxx" instead.
    api_key=os.getenv('DASHSCOPE_API_KEY'),
    model='qwen-vl-ocr-2025-11-20',
    messages=messages,
    # Built-in task: table parsing.
    ocr_options={"task": "table_parsing"},
)

# The table parsing task returns the result in HTML format.
print(response["output"]["choices"][0]["message"].content[0]["text"])
import java.util.Arrays;
import java.util.Collections;
import java.util.Map;
import java.util.HashMap;
import com.alibaba.dashscope.aigc.multimodalconversation.MultiModalConversation;
import com.alibaba.dashscope.aigc.multimodalconversation.MultiModalConversationParam;
import com.alibaba.dashscope.aigc.multimodalconversation.MultiModalConversationResult;
import com.alibaba.dashscope.aigc.multimodalconversation.OcrOptions;
import com.alibaba.dashscope.common.MultiModalMessage;
import com.alibaba.dashscope.common.Role;
import com.alibaba.dashscope.exception.ApiException;
import com.alibaba.dashscope.exception.NoApiKeyException;
import com.alibaba.dashscope.exception.UploadFileException;
import com.alibaba.dashscope.utils.Constants;
public class Main {
    static {
        // Base URL for the Singapore region. Use
        // https://dashscope-us.aliyuncs.com/api/v1 for the US (Virginia) region and
        // https://dashscope.aliyuncs.com/api/v1 for the China (Beijing) region.
        Constants.baseHttpApiUrl = "https://dashscope-intl.aliyuncs.com/api/v1";
    }

    /**
     * Sends one table image to the qwen-vl-ocr model with the built-in
     * table-parsing task and prints the HTML result.
     */
    public static void simpleMultiModalConversationCall()
            throws ApiException, NoApiKeyException, UploadFileException {
        MultiModalConversation conversation = new MultiModalConversation();

        Map<String, Object> imageContent = new HashMap<>();
        imageContent.put("image", "https://duguang-llm.oss-cn-hangzhou.aliyuncs.com/llm_data_keeper/data/doc_parsing/tables/photo/eng/17.jpg");
        // Upper pixel bound: larger images are scaled down below max_pixels.
        imageContent.put("max_pixels", 8388608);
        // Lower pixel bound: smaller images are scaled up past min_pixels.
        imageContent.put("min_pixels", 3072);
        // Automatic image rotation is disabled.
        imageContent.put("enable_rotate", false);

        // Select the built-in table-parsing OCR task.
        OcrOptions options = OcrOptions.builder()
                .task(OcrOptions.Task.TABLE_PARSING)
                .build();

        MultiModalMessage message = MultiModalMessage.builder()
                .role(Role.USER.getValue())
                .content(Arrays.asList(imageContent))
                .build();

        MultiModalConversationParam callParam = MultiModalConversationParam.builder()
                // API keys vary by region; see https://www.alibabacloud.com/help/model-studio/get-api-key.
                // Without the environment variable configured, use .apiKey("sk-xxx") instead.
                .apiKey(System.getenv("DASHSCOPE_API_KEY"))
                .model("qwen-vl-ocr-2025-11-20")
                .message(message)
                .ocrOptions(options)
                .build();

        MultiModalConversationResult callResult = conversation.call(callParam);
        // The table-parsing task returns its result in the "text" field as HTML.
        System.out.println(callResult.getOutput().getChoices().get(0).getMessage().getContent().get(0).get("text"));
    }

    public static void main(String[] args) {
        try {
            simpleMultiModalConversationCall();
        } catch (ApiException | NoApiKeyException | UploadFileException ex) {
            System.out.println(ex.getMessage());
        }
        System.exit(0);
    }
}
# ======= Important =======
# API keys vary by region. To get an API key, see https://www.alibabacloud.com/help/model-studio/get-api-key.
# The URL below is the endpoint for the Singapore region. For the US (Virginia) region, use https://dashscope-us.aliyuncs.com/api/v1/services/aigc/multimodal-generation/generation instead.
# For the China (Beijing) region, use https://dashscope.aliyuncs.com/api/v1/services/aigc/multimodal-generation/generation.
# This request runs the built-in table_parsing task; the service returns the table as HTML.
# min_pixels / max_pixels bound the pixel count the image is scaled to; enable_rotate controls automatic rotation.
# === Delete this comment before running ===
curl --location 'https://dashscope-intl.aliyuncs.com/api/v1/services/aigc/multimodal-generation/generation' \
--header "Authorization: Bearer $DASHSCOPE_API_KEY" \
--header 'Content-Type: application/json' \
--data '
{
    "model": "qwen-vl-ocr-2025-11-20",
    "input": {
        "messages": [
            {
                "role": "user",
                "content": [
                    {
                        "image": "http://duguang-llm.oss-cn-hangzhou.aliyuncs.com/llm_data_keeper/data/doc_parsing/tables/photo/eng/17.jpg",
                        "min_pixels": 3072,
                        "max_pixels": 8388608,
                        "enable_rotate": false
                    }
                ]
            }
        ]
    },
    "parameters": {
        "ocr_options": {
            "task": "table_parsing"
        }
    }
}
'
Penguraian dokumen
Kode berikut memberikan contoh cara memanggil tugas bawaan penguraian dokumen. Untuk informasi lebih lanjut, lihat Memanggil tugas bawaan.
import os
import dashscope

# Base URL for the Singapore region. For the US (Virginia) region use
# https://dashscope-us.aliyuncs.com/api/v1; for the China (Beijing) region use
# https://dashscope.aliyuncs.com/api/v1.
dashscope.base_http_api_url = 'https://dashscope-intl.aliyuncs.com/api/v1'

# Single image item for the user turn.
image_item = {
    "image": "https://img.alicdn.com/imgextra/i1/O1CN01ukECva1cisjyK6ZDK_!!6000000003635-0-tps-1500-1734.jpg",
    # Lower pixel bound: smaller images are scaled up past min_pixels.
    "min_pixels": 32 * 32 * 3,
    # Upper pixel bound: larger images are scaled down below max_pixels.
    "max_pixels": 32 * 32 * 8192,
    # Automatic image rotation is disabled.
    "enable_rotate": False,
}
messages = [{"role": "user", "content": [image_item]}]

response = dashscope.MultiModalConversation.call(
    # API keys vary by region; see https://www.alibabacloud.com/help/model-studio/get-api-key.
    # Without the environment variable configured, pass api_key="sk-xxx" instead.
    api_key=os.getenv('DASHSCOPE_API_KEY'),
    model='qwen-vl-ocr-2025-11-20',
    messages=messages,
    # Built-in task: document parsing.
    ocr_options={"task": "document_parsing"},
)

# The document parsing task returns the result in LaTeX format.
print(response["output"]["choices"][0]["message"].content[0]["text"])
import java.util.Arrays;
import java.util.Collections;
import java.util.Map;
import java.util.HashMap;
import com.alibaba.dashscope.aigc.multimodalconversation.MultiModalConversation;
import com.alibaba.dashscope.aigc.multimodalconversation.MultiModalConversationParam;
import com.alibaba.dashscope.aigc.multimodalconversation.MultiModalConversationResult;
import com.alibaba.dashscope.aigc.multimodalconversation.OcrOptions;
import com.alibaba.dashscope.common.MultiModalMessage;
import com.alibaba.dashscope.common.Role;
import com.alibaba.dashscope.exception.ApiException;
import com.alibaba.dashscope.exception.NoApiKeyException;
import com.alibaba.dashscope.exception.UploadFileException;
import com.alibaba.dashscope.utils.Constants;
public class Main {
    static {
        // Base URL for the Singapore region. Use
        // https://dashscope-us.aliyuncs.com/api/v1 for the US (Virginia) region and
        // https://dashscope.aliyuncs.com/api/v1 for the China (Beijing) region.
        Constants.baseHttpApiUrl = "https://dashscope-intl.aliyuncs.com/api/v1";
    }

    /**
     * Sends one document image to the qwen-vl-ocr model with the built-in
     * document-parsing task and prints the LaTeX result.
     */
    public static void simpleMultiModalConversationCall()
            throws ApiException, NoApiKeyException, UploadFileException {
        MultiModalConversation conversation = new MultiModalConversation();

        Map<String, Object> imageContent = new HashMap<>();
        imageContent.put("image", "https://img.alicdn.com/imgextra/i1/O1CN01ukECva1cisjyK6ZDK_!!6000000003635-0-tps-1500-1734.jpg");
        // Upper pixel bound: larger images are scaled down below max_pixels.
        imageContent.put("max_pixels", 8388608);
        // Lower pixel bound: smaller images are scaled up past min_pixels.
        imageContent.put("min_pixels", 3072);
        // Automatic image rotation is disabled.
        imageContent.put("enable_rotate", false);

        // Select the built-in document-parsing OCR task.
        OcrOptions options = OcrOptions.builder()
                .task(OcrOptions.Task.DOCUMENT_PARSING)
                .build();

        MultiModalMessage message = MultiModalMessage.builder()
                .role(Role.USER.getValue())
                .content(Arrays.asList(imageContent))
                .build();

        MultiModalConversationParam callParam = MultiModalConversationParam.builder()
                // API keys vary by region; see https://www.alibabacloud.com/help/model-studio/get-api-key.
                // Without the environment variable configured, use .apiKey("sk-xxx") instead.
                .apiKey(System.getenv("DASHSCOPE_API_KEY"))
                .model("qwen-vl-ocr-2025-11-20")
                .message(message)
                .ocrOptions(options)
                .build();

        MultiModalConversationResult callResult = conversation.call(callParam);
        // The document-parsing task returns its result in the "text" field as LaTeX.
        System.out.println(callResult.getOutput().getChoices().get(0).getMessage().getContent().get(0).get("text"));
    }

    public static void main(String[] args) {
        try {
            simpleMultiModalConversationCall();
        } catch (ApiException | NoApiKeyException | UploadFileException ex) {
            System.out.println(ex.getMessage());
        }
        System.exit(0);
    }
}
# ======= Important =======
# API keys vary by region. To get an API key, see https://www.alibabacloud.com/help/model-studio/get-api-key.
# The URL below is the endpoint for the Singapore region. For the US (Virginia) region, use https://dashscope-us.aliyuncs.com/api/v1/services/aigc/multimodal-generation/generation instead.
# For the China (Beijing) region, use https://dashscope.aliyuncs.com/api/v1/services/aigc/multimodal-generation/generation.
# This request runs the built-in document_parsing task; the service returns the document as LaTeX.
# NOTE: a space is required before each trailing backslash; without it the shell can join the
# continued lines into a single word and the command fails.
# === Delete this comment before running ===
curl --location 'https://dashscope-intl.aliyuncs.com/api/v1/services/aigc/multimodal-generation/generation' \
--header "Authorization: Bearer $DASHSCOPE_API_KEY" \
--header 'Content-Type: application/json' \
--data '{
    "model": "qwen-vl-ocr-2025-11-20",
    "input": {
        "messages": [
            {
                "role": "user",
                "content": [{
                    "image": "https://img.alicdn.com/imgextra/i1/O1CN01ukECva1cisjyK6ZDK_!!6000000003635-0-tps-1500-1734.jpg",
                    "min_pixels": 3072,
                    "max_pixels": 8388608,
                    "enable_rotate": false
                }
                ]
            }
        ]
    },
    "parameters": {
        "ocr_options": {
            "task": "document_parsing"
        }
    }
}
'
Pengenalan rumus
Kode berikut memberikan contoh cara memanggil tugas bawaan pengenalan rumus. Untuk informasi lebih lanjut, lihat Memanggil tugas bawaan.
import os
import dashscope

# Base URL for the Singapore region. For the US (Virginia) region use
# https://dashscope-us.aliyuncs.com/api/v1; for the China (Beijing) region use
# https://dashscope.aliyuncs.com/api/v1.
dashscope.base_http_api_url = 'https://dashscope-intl.aliyuncs.com/api/v1'

# Single image item for the user turn.
image_item = {
    "image": "http://duguang-llm.oss-cn-hangzhou.aliyuncs.com/llm_data_keeper/data/formula_handwriting/test/inline_5_4.jpg",
    # Lower pixel bound: smaller images are scaled up past min_pixels.
    "min_pixels": 32 * 32 * 3,
    # Upper pixel bound: larger images are scaled down below max_pixels.
    "max_pixels": 32 * 32 * 8192,
    # Automatic image rotation is disabled.
    "enable_rotate": False,
}
messages = [{"role": "user", "content": [image_item]}]

response = dashscope.MultiModalConversation.call(
    # API keys vary by region; see https://www.alibabacloud.com/help/model-studio/get-api-key.
    # Without the environment variable configured, pass api_key="sk-xxx" instead.
    api_key=os.getenv('DASHSCOPE_API_KEY'),
    model='qwen-vl-ocr-2025-11-20',
    messages=messages,
    # Built-in task: formula recognition.
    ocr_options={"task": "formula_recognition"},
)

# The formula recognition task returns the result in LaTeX format.
print(response["output"]["choices"][0]["message"].content[0]["text"])
import java.util.Arrays;
import java.util.Collections;
import java.util.Map;
import java.util.HashMap;
import com.alibaba.dashscope.aigc.multimodalconversation.MultiModalConversation;
import com.alibaba.dashscope.aigc.multimodalconversation.MultiModalConversationParam;
import com.alibaba.dashscope.aigc.multimodalconversation.MultiModalConversationResult;
import com.alibaba.dashscope.aigc.multimodalconversation.OcrOptions;
import com.alibaba.dashscope.common.MultiModalMessage;
import com.alibaba.dashscope.common.Role;
import com.alibaba.dashscope.exception.ApiException;
import com.alibaba.dashscope.exception.NoApiKeyException;
import com.alibaba.dashscope.exception.UploadFileException;
import com.alibaba.dashscope.utils.Constants;
public class Main {
    static {
        // Base URL for the Singapore region. Use
        // https://dashscope-us.aliyuncs.com/api/v1 for the US (Virginia) region and
        // https://dashscope.aliyuncs.com/api/v1 for the China (Beijing) region.
        Constants.baseHttpApiUrl = "https://dashscope-intl.aliyuncs.com/api/v1";
    }

    /**
     * Sends one handwritten-formula image to the qwen-vl-ocr model with the
     * built-in formula-recognition task and prints the LaTeX result.
     */
    public static void simpleMultiModalConversationCall()
            throws ApiException, NoApiKeyException, UploadFileException {
        MultiModalConversation conversation = new MultiModalConversation();

        Map<String, Object> imageContent = new HashMap<>();
        imageContent.put("image", "http://duguang-llm.oss-cn-hangzhou.aliyuncs.com/llm_data_keeper/data/formula_handwriting/test/inline_5_4.jpg");
        // Upper pixel bound: larger images are scaled down below max_pixels.
        imageContent.put("max_pixels", 8388608);
        // Lower pixel bound: smaller images are scaled up past min_pixels.
        imageContent.put("min_pixels", 3072);
        // Automatic image rotation is disabled.
        imageContent.put("enable_rotate", false);

        // Select the built-in formula-recognition OCR task.
        OcrOptions options = OcrOptions.builder()
                .task(OcrOptions.Task.FORMULA_RECOGNITION)
                .build();

        MultiModalMessage message = MultiModalMessage.builder()
                .role(Role.USER.getValue())
                .content(Arrays.asList(imageContent))
                .build();

        MultiModalConversationParam callParam = MultiModalConversationParam.builder()
                // API keys vary by region; see https://www.alibabacloud.com/help/model-studio/get-api-key.
                // Without the environment variable configured, use .apiKey("sk-xxx") instead.
                .apiKey(System.getenv("DASHSCOPE_API_KEY"))
                .model("qwen-vl-ocr-2025-11-20")
                .message(message)
                .ocrOptions(options)
                .build();

        MultiModalConversationResult callResult = conversation.call(callParam);
        // The formula-recognition task returns its result in the "text" field as LaTeX.
        System.out.println(callResult.getOutput().getChoices().get(0).getMessage().getContent().get(0).get("text"));
    }

    public static void main(String[] args) {
        try {
            simpleMultiModalConversationCall();
        } catch (ApiException | NoApiKeyException | UploadFileException ex) {
            System.out.println(ex.getMessage());
        }
        System.exit(0);
    }
}
# ======= Important =======
# API keys vary by region. To get an API key, see https://www.alibabacloud.com/help/model-studio/get-api-key.
# The URL below is the endpoint for the Singapore region. For the US (Virginia) region, use https://dashscope-us.aliyuncs.com/api/v1/services/aigc/multimodal-generation/generation instead.
# For the China (Beijing) region, use https://dashscope.aliyuncs.com/api/v1/services/aigc/multimodal-generation/generation.
# This request runs the built-in formula_recognition task; the service returns the formula as LaTeX.
# The model name is pinned to the dated snapshot used by the other samples in this guide.
# === Delete this comment before running ===
curl --location 'https://dashscope-intl.aliyuncs.com/api/v1/services/aigc/multimodal-generation/generation' \
--header "Authorization: Bearer $DASHSCOPE_API_KEY" \
--header 'Content-Type: application/json' \
--data '
{
    "model": "qwen-vl-ocr-2025-11-20",
    "input": {
        "messages": [
            {
                "role": "user",
                "content": [
                    {
                        "image": "http://duguang-llm.oss-cn-hangzhou.aliyuncs.com/llm_data_keeper/data/formula_handwriting/test/inline_5_4.jpg",
                        "min_pixels": 3072,
                        "max_pixels": 8388608,
                        "enable_rotate": false
                    }
                ]
            }
        ]
    },
    "parameters": {
        "ocr_options": {
            "task": "formula_recognition"
        }
    }
}
'
Pengenalan teks umum
Kode berikut memberikan contoh cara memanggil tugas bawaan pengenalan teks umum. Untuk informasi lebih lanjut, lihat Memanggil tugas bawaan.
import os
import dashscope

# Base URL for the Singapore region. For the US (Virginia) region use
# https://dashscope-us.aliyuncs.com/api/v1; for the China (Beijing) region use
# https://dashscope.aliyuncs.com/api/v1.
dashscope.base_http_api_url = 'https://dashscope-intl.aliyuncs.com/api/v1'

# Single image item for the user turn.
image_item = {
    "image": "https://help-static-aliyun-doc.aliyuncs.com/file-manage-files/zh-CN/20241108/ctdzex/biaozhun.jpg",
    # Lower pixel bound: smaller images are scaled up past min_pixels.
    "min_pixels": 32 * 32 * 3,
    # Upper pixel bound: larger images are scaled down below max_pixels.
    "max_pixels": 32 * 32 * 8192,
    # Automatic image rotation is disabled.
    "enable_rotate": False,
}
messages = [{"role": "user", "content": [image_item]}]

response = dashscope.MultiModalConversation.call(
    # API keys vary by region; see https://www.alibabacloud.com/help/model-studio/get-api-key.
    # Without the environment variable configured, pass api_key="sk-xxx" instead.
    api_key=os.getenv('DASHSCOPE_API_KEY'),
    model='qwen-vl-ocr-2025-11-20',
    messages=messages,
    # Built-in task: general text recognition.
    ocr_options={"task": "text_recognition"},
)

# The general text recognition task returns the result in plain text format.
print(response["output"]["choices"][0]["message"].content[0]["text"])
import java.util.Arrays;
import java.util.Collections;
import java.util.Map;
import java.util.HashMap;
import com.alibaba.dashscope.aigc.multimodalconversation.MultiModalConversation;
import com.alibaba.dashscope.aigc.multimodalconversation.MultiModalConversationParam;
import com.alibaba.dashscope.aigc.multimodalconversation.MultiModalConversationResult;
import com.alibaba.dashscope.aigc.multimodalconversation.OcrOptions;
import com.alibaba.dashscope.common.MultiModalMessage;
import com.alibaba.dashscope.common.Role;
import com.alibaba.dashscope.exception.ApiException;
import com.alibaba.dashscope.exception.NoApiKeyException;
import com.alibaba.dashscope.exception.UploadFileException;
import com.alibaba.dashscope.utils.Constants;
public class Main {
    static {
        // Base URL for the Singapore region. Use
        // https://dashscope-us.aliyuncs.com/api/v1 for the US (Virginia) region and
        // https://dashscope.aliyuncs.com/api/v1 for the China (Beijing) region.
        Constants.baseHttpApiUrl = "https://dashscope-intl.aliyuncs.com/api/v1";
    }

    /**
     * Sends one image to the qwen-vl-ocr model with the built-in general
     * text-recognition task and prints the plain-text result.
     */
    public static void simpleMultiModalConversationCall()
            throws ApiException, NoApiKeyException, UploadFileException {
        MultiModalConversation conversation = new MultiModalConversation();

        Map<String, Object> imageContent = new HashMap<>();
        imageContent.put("image", "https://help-static-aliyun-doc.aliyuncs.com/file-manage-files/zh-CN/20241108/ctdzex/biaozhun.jpg");
        // Upper pixel bound: larger images are scaled down below max_pixels.
        imageContent.put("max_pixels", 8388608);
        // Lower pixel bound: smaller images are scaled up past min_pixels.
        imageContent.put("min_pixels", 3072);
        // Automatic image rotation is disabled.
        imageContent.put("enable_rotate", false);

        // Select the built-in general text-recognition OCR task.
        OcrOptions options = OcrOptions.builder()
                .task(OcrOptions.Task.TEXT_RECOGNITION)
                .build();

        MultiModalMessage message = MultiModalMessage.builder()
                .role(Role.USER.getValue())
                .content(Arrays.asList(imageContent))
                .build();

        MultiModalConversationParam callParam = MultiModalConversationParam.builder()
                // API keys vary by region; see https://www.alibabacloud.com/help/model-studio/get-api-key.
                // Without the environment variable configured, use .apiKey("sk-xxx") instead.
                .apiKey(System.getenv("DASHSCOPE_API_KEY"))
                .model("qwen-vl-ocr-2025-11-20")
                .message(message)
                .ocrOptions(options)
                .build();

        MultiModalConversationResult callResult = conversation.call(callParam);
        // The text-recognition task returns its result in the "text" field as plain text.
        System.out.println(callResult.getOutput().getChoices().get(0).getMessage().getContent().get(0).get("text"));
    }

    public static void main(String[] args) {
        try {
            simpleMultiModalConversationCall();
        } catch (ApiException | NoApiKeyException | UploadFileException ex) {
            System.out.println(ex.getMessage());
        }
        System.exit(0);
    }
}
# ======= Important =======
# API keys vary by region. To get an API key, see https://www.alibabacloud.com/help/model-studio/get-api-key.
# The URL below is the endpoint for the Singapore region. For the US (Virginia) region, use https://dashscope-us.aliyuncs.com/api/v1/services/aigc/multimodal-generation/generation instead.
# For the China (Beijing) region, use https://dashscope.aliyuncs.com/api/v1/services/aigc/multimodal-generation/generation.
# This request runs the built-in text_recognition task; the service returns plain text.
# NOTE: a space is required before each trailing backslash; without it the shell can join the
# continued lines into a single word and the command fails.
# === Delete this comment before running ===
curl --location 'https://dashscope-intl.aliyuncs.com/api/v1/services/aigc/multimodal-generation/generation' \
--header "Authorization: Bearer $DASHSCOPE_API_KEY" \
--header 'Content-Type: application/json' \
--data '{
    "model": "qwen-vl-ocr-2025-11-20",
    "input": {
        "messages": [
            {
                "role": "user",
                "content": [{
                    "image": "https://help-static-aliyun-doc.aliyuncs.com/file-manage-files/zh-CN/20241108/ctdzex/biaozhun.jpg",
                    "min_pixels": 3072,
                    "max_pixels": 8388608,
                    "enable_rotate": false
                }
                ]
            }
        ]
    },
    "parameters": {
        "ocr_options": {
            "task": "text_recognition"
        }
    }
}'
Pengenalan multibahasa
Kode berikut memberikan contoh cara memanggil tugas bawaan pengenalan multibahasa umum. Untuk informasi lebih lanjut, lihat Memanggil tugas bawaan.
import os
import dashscope

# Base URL for the Singapore region. For the US (Virginia) region use
# https://dashscope-us.aliyuncs.com/api/v1; for the China (Beijing) region use
# https://dashscope.aliyuncs.com/api/v1.
dashscope.base_http_api_url = 'https://dashscope-intl.aliyuncs.com/api/v1'

# Single image item for the user turn.
image_item = {
    "image": "https://img.alicdn.com/imgextra/i2/O1CN01VvUMNP1yq8YvkSDFY_!!6000000006629-2-tps-6000-3000.png",
    # Lower pixel bound: smaller images are scaled up past min_pixels.
    "min_pixels": 32 * 32 * 3,
    # Upper pixel bound: larger images are scaled down below max_pixels.
    "max_pixels": 32 * 32 * 8192,
    # Automatic image rotation is disabled.
    "enable_rotate": False,
}
messages = [{"role": "user", "content": [image_item]}]

response = dashscope.MultiModalConversation.call(
    # API keys vary by region; see https://www.alibabacloud.com/help/model-studio/get-api-key.
    # Without the environment variable configured, pass api_key="sk-xxx" instead.
    api_key=os.getenv('DASHSCOPE_API_KEY'),
    model='qwen-vl-ocr-2025-11-20',
    messages=messages,
    # Built-in task: multilingual recognition.
    ocr_options={"task": "multi_lan"},
)

# The multilingual recognition task returns the result as plain text.
print(response["output"]["choices"][0]["message"].content[0]["text"])
import java.util.Arrays;
import java.util.Collections;
import java.util.Map;
import java.util.HashMap;
import com.alibaba.dashscope.aigc.multimodalconversation.MultiModalConversation;
import com.alibaba.dashscope.aigc.multimodalconversation.MultiModalConversationParam;
import com.alibaba.dashscope.aigc.multimodalconversation.MultiModalConversationResult;
import com.alibaba.dashscope.aigc.multimodalconversation.OcrOptions;
import com.alibaba.dashscope.common.MultiModalMessage;
import com.alibaba.dashscope.common.Role;
import com.alibaba.dashscope.exception.ApiException;
import com.alibaba.dashscope.exception.NoApiKeyException;
import com.alibaba.dashscope.exception.UploadFileException;
import com.alibaba.dashscope.utils.Constants;
public class Main {
    static {
        // Base URL for the Singapore region. Use
        // https://dashscope-us.aliyuncs.com/api/v1 for the US (Virginia) region and
        // https://dashscope.aliyuncs.com/api/v1 for the China (Beijing) region.
        Constants.baseHttpApiUrl = "https://dashscope-intl.aliyuncs.com/api/v1";
    }

    /**
     * Sends one image to the qwen-vl-ocr model with the built-in multilingual
     * recognition task and prints the plain-text result.
     */
    public static void simpleMultiModalConversationCall()
            throws ApiException, NoApiKeyException, UploadFileException {
        MultiModalConversation conversation = new MultiModalConversation();

        Map<String, Object> imageContent = new HashMap<>();
        imageContent.put("image", "https://img.alicdn.com/imgextra/i2/O1CN01VvUMNP1yq8YvkSDFY_!!6000000006629-2-tps-6000-3000.png");
        // Upper pixel bound: larger images are scaled down below max_pixels.
        imageContent.put("max_pixels", 8388608);
        // Lower pixel bound: smaller images are scaled up past min_pixels.
        imageContent.put("min_pixels", 3072);
        // Automatic image rotation is disabled.
        imageContent.put("enable_rotate", false);

        // Select the built-in multilingual recognition OCR task.
        OcrOptions options = OcrOptions.builder()
                .task(OcrOptions.Task.MULTI_LAN)
                .build();

        MultiModalMessage message = MultiModalMessage.builder()
                .role(Role.USER.getValue())
                .content(Arrays.asList(imageContent))
                .build();

        MultiModalConversationParam callParam = MultiModalConversationParam.builder()
                // API keys vary by region; see https://www.alibabacloud.com/help/model-studio/get-api-key.
                // Without the environment variable configured, use .apiKey("sk-xxx") instead.
                .apiKey(System.getenv("DASHSCOPE_API_KEY"))
                .model("qwen-vl-ocr-2025-11-20")
                .message(message)
                .ocrOptions(options)
                .build();

        MultiModalConversationResult callResult = conversation.call(callParam);
        // The multilingual recognition task returns its result in the "text" field as plain text.
        System.out.println(callResult.getOutput().getChoices().get(0).getMessage().getContent().get(0).get("text"));
    }

    public static void main(String[] args) {
        try {
            simpleMultiModalConversationCall();
        } catch (ApiException | NoApiKeyException | UploadFileException ex) {
            System.out.println(ex.getMessage());
        }
        System.exit(0);
    }
}
# ======= Important =======
# API keys vary by region. To get an API key, see https://www.alibabacloud.com/help/model-studio/get-api-key.
# The URL below is the endpoint for the Singapore region. For the US (Virginia) region, use https://dashscope-us.aliyuncs.com/api/v1/services/aigc/multimodal-generation/generation instead.
# For the China (Beijing) region, use https://dashscope.aliyuncs.com/api/v1/services/aigc/multimodal-generation/generation.
# This request runs the built-in multi_lan task; the service returns the recognized text as plain text.
# min_pixels / max_pixels bound the pixel count the image is scaled to; enable_rotate controls automatic rotation.
# === Delete this comment before running ===
curl --location 'https://dashscope-intl.aliyuncs.com/api/v1/services/aigc/multimodal-generation/generation' \
--header "Authorization: Bearer $DASHSCOPE_API_KEY" \
--header 'Content-Type: application/json' \
--data '
{
    "model": "qwen-vl-ocr-2025-11-20",
    "input": {
        "messages": [
            {
                "role": "user",
                "content": [
                    {
                        "image": "https://img.alicdn.com/imgextra/i2/O1CN01VvUMNP1yq8YvkSDFY_!!6000000006629-2-tps-6000-3000.png",
                        "min_pixels": 3072,
                        "max_pixels": 8388608,
                        "enable_rotate": false
                    }
                ]
            }
        ]
    },
    "parameters": {
        "ocr_options": {
            "task": "multi_lan"
        }
    }
}
'
Keluaran streaming
Python
import os
import dashscope

# Prompt for structured key-information extraction from a train-ticket image.
# NOTE(review): the prompt asks for train number and seat class, but the example
# JSON template below omits those keys (and carries a trailing comma) — confirm
# the intended output schema before relying on them.
PROMPT_TICKET_EXTRACTION = """
Please extract the invoice number, train number, departure station, arrival station, departure date and time, seat number, seat class, ticket price, ID card number, and passenger name from the train ticket image.
You must accurately extract the key information. Do not omit or fabricate information. Replace any single character that is blurry or obscured by strong light with an English question mark (?).
Return the data in JSON format as follows: {'invoice_number': 'xxx', 'departure_station': 'xxx', 'arrival_station': 'xxx', 'departure_date_and_time':'xxx', 'seat_number': 'xxx','ticket_price':'xxx', 'id_card_number': 'xxx', 'passenger_name': 'xxx'},
"""

# Base URL for the Singapore region.
# US (Virginia) region: https://dashscope-us.aliyuncs.com/api/v1
# China (Beijing) region: https://dashscope.aliyuncs.com/api/v1
dashscope.base_http_api_url = 'https://dashscope-intl.aliyuncs.com/api/v1'

messages = [
    {
        "role": "user",
        "content": [
            {
                "image": "https://img.alicdn.com/imgextra/i2/O1CN01ktT8451iQutqReELT_!!6000000004408-0-tps-689-487.jpg",
                # Minimum pixel threshold: smaller images are enlarged until
                # their total pixel count exceeds min_pixels.
                "min_pixels": 32 * 32 * 3,
                # Maximum pixel threshold: larger images are reduced until
                # their total pixel count is below max_pixels.
                "max_pixels": 32 * 32 * 8192,
            },
            # Without a built-in OCR task, the text field carries the prompt.
            # If omitted, the model defaults to plain full-text extraction.
            {
                "type": "text",
                "text": PROMPT_TICKET_EXTRACTION,
            },
        ],
    }
]

response = dashscope.MultiModalConversation.call(
    # API keys differ per region; see https://www.alibabacloud.com/help/en/model-studio/get-api-key.
    # If the environment variable is not configured, replace the following line
    # with your Model Studio API key: api_key="sk-xxx",
    api_key=os.getenv("DASHSCOPE_API_KEY"),
    model="qwen-vl-ocr-2025-11-20",
    messages=messages,
    stream=True,
    incremental_output=True,
)

full_content = ""
print("Streaming output content:")
# Fix: the original iterated `for response in response` (shadowing the stream
# object) and used a bare `except: pass`, which silently swallowed every error,
# including KeyboardInterrupt. Catch only the narrow lookup errors raised by
# chunks that carry no text content (e.g. the final chunk).
for chunk in response:
    try:
        text = chunk["output"]["choices"][0]["message"].content[0]["text"]
    except (KeyError, IndexError, TypeError):
        continue
    print(text)
    full_content += text
print(f"Full content: {full_content}")
Java
import java.util.*;
import com.alibaba.dashscope.aigc.multimodalconversation.MultiModalConversation;
import com.alibaba.dashscope.aigc.multimodalconversation.MultiModalConversationParam;
import com.alibaba.dashscope.aigc.multimodalconversation.MultiModalConversationResult;
import com.alibaba.dashscope.common.MultiModalMessage;
import com.alibaba.dashscope.common.Role;
import com.alibaba.dashscope.exception.ApiException;
import com.alibaba.dashscope.exception.NoApiKeyException;
import com.alibaba.dashscope.exception.UploadFileException;
import io.reactivex.Flowable;
import com.alibaba.dashscope.utils.Constants;
public class Main {
    // Base URL for the Singapore region.
    // US (Virginia) region: https://dashscope-us.aliyuncs.com/api/v1
    // China (Beijing) region: https://dashscope.aliyuncs.com/api/v1
    static {
        Constants.baseHttpApiUrl = "https://dashscope-intl.aliyuncs.com/api/v1";
    }

    // Prompt for structured ticket-field extraction.
    // Fix: the original inline string was missing the closing '}' of the example
    // JSON template.
    private static final String TICKET_PROMPT =
            "Please extract the invoice number, train number, departure station, arrival station, departure date and time, seat number, seat class, ticket price, ID card number, and passenger name from the train ticket image. You must accurately extract the key information. Do not omit or fabricate information. Replace any single character that is blurry or obscured by strong light with an English question mark (?). Return the data in JSON format as follows: {\'invoice_number\': \'xxx\', \'departure_station\': \'xxx\', \'arrival_station\': \'xxx\', \'departure_date_and_time\':\'xxx\', \'seat_number\': \'xxx\',\'ticket_price\':\'xxx\', \'id_card_number\': \'xxx\', \'passenger_name\': \'xxx\'}";

    /**
     * Streams an OCR key-information-extraction request for a train-ticket image
     * and prints each incremental text chunk as it arrives.
     *
     * @throws ApiException        if the DashScope service rejects the request
     * @throws NoApiKeyException   if no API key is configured
     * @throws UploadFileException if the image cannot be uploaded
     */
    public static void simpleMultiModalConversationCall()
            throws ApiException, NoApiKeyException, UploadFileException {
        MultiModalConversation conv = new MultiModalConversation();

        Map<String, Object> imagePart = new HashMap<>();
        imagePart.put("image", "https://img.alicdn.com/imgextra/i2/O1CN01ktT8451iQutqReELT_!!6000000004408-0-tps-689-487.jpg");
        // Images above max_pixels are reduced until their pixel count falls below it.
        imagePart.put("max_pixels", 8388608);
        // Images below min_pixels are enlarged until their pixel count exceeds it.
        imagePart.put("min_pixels", 3072);

        MultiModalMessage userMessage = MultiModalMessage.builder().role(Role.USER.getValue())
                .content(Arrays.asList(
                        imagePart,
                        // Without a built-in OCR task, the text entry carries the prompt;
                        // if omitted, the model defaults to plain full-text extraction.
                        Collections.singletonMap("text", TICKET_PROMPT)))
                .build();

        MultiModalConversationParam param = MultiModalConversationParam.builder()
                // API keys differ per region; see https://www.alibabacloud.com/help/en/model-studio/get-api-key.
                // If the environment variable is not configured, replace the following
                // line with your Model Studio API key: .apiKey("sk-xxx")
                .apiKey(System.getenv("DASHSCOPE_API_KEY"))
                .model("qwen-vl-ocr-2025-11-20")
                .message(userMessage)
                .incrementalOutput(true)
                .build();

        Flowable<MultiModalConversationResult> result = conv.streamCall(param);
        result.blockingForEach(item -> {
            try {
                List<Map<String, Object>> contentList =
                        item.getOutput().getChoices().get(0).getMessage().getContent();
                // Some chunks (e.g. the final one) may carry no text content.
                if (contentList != null && !contentList.isEmpty()) {
                    System.out.println(contentList.get(0).get("text"));
                }
            } catch (Exception e) {
                // Fix: the original called System.exit(0) here, which both masked
                // the error and reported success. Surface the problem instead and
                // let the stream continue.
                System.err.println("Failed to read stream chunk: " + e.getMessage());
            }
        });
    }

    public static void main(String[] args) {
        try {
            simpleMultiModalConversationCall();
        } catch (ApiException | NoApiKeyException | UploadFileException e) {
            System.out.println(e.getMessage());
        }
        // Explicit exit so background SDK threads do not keep the JVM alive.
        System.exit(0);
    }
}
curl
# ======= Important =======
# API keys differ between regions. To obtain one, see https://www.alibabacloud.com/help/en/model-studio/get-api-key.
# The endpoint below targets the Singapore region. Substitute as needed:
#   US (Virginia):    https://dashscope-us.aliyuncs.com/api/v1/services/aigc/multimodal-generation/generation
#   China (Beijing):  https://dashscope.aliyuncs.com/api/v1/services/aigc/multimodal-generation/generation
# === Delete this comment block before execution ===
# Fix: the original prompt embedded \' inside a single-quoted shell string;
# since backslash does not escape anything inside single quotes, each \'
# terminated the quote and word-split the --data argument. The example JSON in
# the prompt now uses JSON-escaped double quotes (\"), which are inert inside
# shell single quotes and valid JSON.
curl --location 'https://dashscope-intl.aliyuncs.com/api/v1/services/aigc/multimodal-generation/generation' \
--header "Authorization: Bearer $DASHSCOPE_API_KEY" \
--header 'Content-Type: application/json' \
--header 'X-DashScope-SSE: enable' \
--data '{
"model": "qwen-vl-ocr-2025-11-20",
"input":{
"messages":[
{
"role": "user",
"content": [
{
"image": "https://img.alicdn.com/imgextra/i2/O1CN01ktT8451iQutqReELT_!!6000000004408-0-tps-689-487.jpg",
"min_pixels": 3072,
"max_pixels": 8388608
},
{"type": "text", "text": "Please extract the invoice number, train number, departure station, arrival station, departure date and time, seat number, seat class, ticket price, ID card number, and passenger name from the train ticket image. You must accurately extract the key information. Do not omit or fabricate information. Replace any single character that is blurry or obscured by strong light with an English question mark (?). Return the data in JSON format as follows: {\"invoice_number\": \"xxx\", \"departure_station\": \"xxx\", \"arrival_station\": \"xxx\", \"departure_date_and_time\":\"xxx\", \"seat_number\": \"xxx\",\"ticket_price\":\"xxx\", \"id_card_number\": \"xxx\", \"passenger_name\": \"xxx\"}"}
]
}
]
},
"parameters": {
"incremental_output": true
}
}'
modelstring(Wajib)
Nama model. Lihat Qwen-OCR untuk daftar model yang didukung.
messagesarray(Wajib)
Konteks untuk model, disediakan sebagai urutan pesan dalam urutan percakapan.
Saat Anda memanggil API melalui HTTP, letakkan objek messages di dalam objek input.
Jenis pesan
Pesan Penggunaobject(Wajib)
Pesan pengguna yang mengirimkan pertanyaan, instruksi, atau konteks ke model.
Properti
contentstring atau array(Wajib)
Konten pesan. Gunakan string untuk input teks saja. Gunakan array jika input mencakup data gambar.
Properti
textstring(Opsional)
Teks input.
Nilai default adalah Please output only the text content from the image without any additional descriptions or formatting. Perilaku default ini menginstruksikan model untuk mengekstrak seluruh teks dari gambar.
imagestring (Opsional)
URL, Data URL Base64, atau jalur lokal gambar. Untuk informasi lebih lanjut tentang melewatkan file lokal, lihat Melewatkan file lokal.
Contoh: {"image":"https://xxxx.jpeg"}
enable_rotateboolean (Opsional) Default: false
Menentukan apakah akan mengoreksi gambar yang miring.
min_pixelsinteger (Opsional)
Jika gambar input memiliki lebih sedikit piksel daripada min_pixels, model memperbesar gambar hingga jumlah total pikselnya lebih besar dari min_pixels.
Konversi antara token gambar dan piksel
Jumlah piksel per token gambar bervariasi tergantung model:
qwen-vl-ocr-latest, qwen-vl-ocr-2025-11-20: Setiap token sesuai dengan 32×32 piksel.
qwen-vl-ocr, qwen-vl-ocr-2025-08-28, dan model sebelumnya: Setiap token sesuai dengan 28×28 piksel.
Rentang nilai untuk min_pixels
qwen-vl-ocr-latest, qwen-vl-ocr-2025-11-20: Nilai default dan minimum adalah 3072 (yaitu, 3×32×32).
qwen-vl-ocr, qwen-vl-ocr-2025-08-28, dan model sebelumnya: Nilai default dan minimum adalah 3136 (yaitu, 4×28×28).
max_pixelsinteger (Opsional)
Jika jumlah piksel gambar berada dalam rentang [min_pixels, max_pixels], model memprosesnya dalam ukuran aslinya. Jika jumlah piksel melebihi max_pixels, model memperkecil gambar hingga jumlah total pikselnya kurang dari atau sama dengan max_pixels.
Konversi antara token gambar dan piksel
Jumlah piksel per token gambar bervariasi tergantung model:
qwen-vl-ocr-latest, qwen-vl-ocr-2025-11-20: Setiap token sesuai dengan 32×32 piksel.
qwen-vl-ocr, qwen-vl-ocr-2025-08-28, dan model sebelumnya: Setiap token sesuai dengan 28×28 piksel.
Rentang nilai untuk max_pixels
qwen-vl-ocr-latest, qwen-vl-ocr-2025-11-20
Nilai default: 8388608 (yaitu, 8192×32×32)
Nilai maksimum: 30720000 (yaitu, 30000×32×32)
qwen-vl-ocr, qwen-vl-ocr-2025-08-28, dan model sebelumnya
Jumlah maksimum token yang dihasilkan dalam output. Jika konten yang dihasilkan melebihi nilai ini, respons akan dipotong.
Untuk qwen-vl-ocr-latest, qwen-vl-ocr-2025-11-20, dan qwen-vl-ocr-2024-10-28, nilai default dan maksimum sama dengan panjang output maksimum model. Untuk informasi lebih lanjut, lihat Ketersediaan.
Untuk qwen-vl-ocr, qwen-vl-ocr-2025-04-13, dan qwen-vl-ocr-2025-08-28, nilai default dan maksimum adalah 4096.
Untuk meningkatkan nilai parameter ini menjadi angka antara 4097 dan 8192, kirim email ke modelstudio@service.aliyun.com. Email Anda harus mencantumkan informasi berikut: ID akun Alibaba Cloud Anda, jenis gambar (misalnya dokumen, e-commerce, atau kontrak), nama model, perkiraan queries per second (QPS) dan total permintaan harian, serta persentase permintaan di mana output model melebihi 4096 token.
Dalam SDK Java, parameter ini adalah maxTokens. Untuk panggilan HTTP, atur max_tokens dalam objek parameters.
ocr_optionsobject (Opsional)
Parameter untuk dikonfigurasi saat Anda memanggil tugas bawaan dengan model Qwen-OCR. Saat Anda memanggil tugas bawaan, Anda tidak perlu melewatkan Pesan Pengguna karena model menggunakan Prompt default untuk tugas tersebut. Untuk informasi lebih lanjut, lihat Memanggil tugas bawaan.
Properti
taskstring (Wajib)
Nama tugas bawaan. Nilai yang valid adalah:
text_recognition: Pengenalan teks umum
key_information_extraction: Ekstraksi informasi
document_parsing: Penguraian dokumen
table_parsing: Penguraian tabel
formula_recognition: Pengenalan rumus
multi_lan: Pengenalan multibahasa
advanced_recognition: Pengenalan presisi tinggi
task_configobject (Opsional)
Saat task diatur ke key_information_extraction (Ekstraksi informasi), parameter ini menentukan bidang yang akan diekstrak. Jika Anda tidak menentukan task_config, model mengekstrak seluruh bidang dari gambar secara default.
Properti
result_schemaobject (Opsional)
Menentukan bidang yang akan diekstrak oleh model. Nilainya harus berupa objek JSON. Anda dapat menyusun objek JSON hingga tiga lapisan dalam.
Tentukan nama bidang yang akan diekstrak dalam key objek JSON. value yang sesuai dapat dikosongkan. Untuk mencapai akurasi ekstraksi yang lebih tinggi, Anda dapat memberikan deskripsi bidang atau persyaratan format dalam value tersebut.
Contoh:
"result_schema": {
"invoice_number": "The unique identification number of the invoice, usually a combination of numbers and letters.",
"issue_date": "The date the invoice was issued. Extract it in YYYY-MM-DD format, for example, 2023-10-26.",
"seller_name": "The full company name of the seller shown on the invoice.",
"total_amount": "The total amount on the invoice, including tax. Extract the numerical value and keep two decimal places, for example, 123.45."
}
Dalam SDK Java, parameter ini bernama OcrOptions. Versi minimum yang diperlukan untuk SDK Python DashScope adalah 1.22.2, dan untuk SDK Java adalah 2.18.4.
Untuk panggilan HTTP, letakkan ocr_options dalam objek parameters.
seed integer(Opsional)
Seed bilangan acak. Menggunakan seed memastikan hasil yang dapat direproduksi. Jika Anda melewatkan nilai seed yang sama dalam panggilan dan menjaga parameter lain tetap tidak berubah, model akan mengembalikan hasil yang deterministik.
Rentang nilai: [0, 2^31−1].
Kami menyarankan Anda menggunakan nilai default.
Saat Anda melakukan panggilan HTTP, letakkan seed dalam objek parameters.
temperaturefloat (Opsional) Default: 0.01
Suhu pengambilan sampel mengontrol keragaman teks yang dihasilkan oleh model.
Suhu yang lebih tinggi menghasilkan teks yang lebih beragam, sedangkan suhu yang lebih rendah menghasilkan teks yang lebih deterministik.
Rentang nilai: [0, 2)
Karena baik temperature maupun top_p mengontrol keragaman teks yang dihasilkan, Anda hanya dapat mengatur salah satunya.
Kami menyarankan Anda menggunakan nilai default.
Saat Anda melakukan panggilan HTTP, letakkan temperature dalam objek parameters.
top_pfloat(Opsional) Default: 0.001
Parameter ini adalah ambang batas probabilitas untuk pengambilan sampel inti, yang mengontrol keragaman teks yang dihasilkan oleh model.
Nilai yang lebih tinggi menghasilkan teks yang lebih beragam. Nilai yang lebih rendah menghasilkan teks yang lebih deterministik.
Rentang nilai: (0, 1.0]
Karena baik temperature maupun top_p mengontrol keragaman teks, Anda hanya boleh mengatur salah satunya.
Kami menyarankan Anda menggunakan nilai default.
Dalam SDK Java, parameter ini adalah topP. Untuk panggilan HTTP, letakkan top_p dalam objek parameters.
top_kinteger (Opsional) Default: 1
Menentukan ukuran set kandidat untuk pengambilan sampel selama generasi. Misalnya, jika Anda mengatur nilainya ke 50, hanya 50 token dengan skor tertinggi yang digunakan sebagai set kandidat untuk pengambilan sampel acak. Nilai yang lebih besar meningkatkan keacakan, sedangkan nilai yang lebih kecil meningkatkan determinisme. Jika nilainya None atau lebih besar dari 100, kebijakan top_k tidak diaktifkan. Dalam hal ini, hanya kebijakan top_p yang berlaku.
Nilainya harus lebih besar atau sama dengan 0.
Parameter ini bukan parameter standar OpenAI. Saat menggunakan SDK Python, letakkan parameter ini dalam objek extra_body. Contoh: extra_body={"top_k": xxx}. Saat menggunakan SDK Node.js atau HTTP, lewatkan parameter ini di tingkat atas.
Kami menyarankan Anda menggunakan nilai default.
repetition_penalty float(Opsional) Default: 1.0
Hukuman untuk urutan berulang selama generasi model. Nilai yang lebih tinggi dapat mengurangi pengulangan dalam teks yang dihasilkan. Nilai 1.0 berarti tidak ada hukuman yang diterapkan.
Kami menyarankan menggunakan nilai default.
Dalam SDK Java, parameter ini adalah repetitionPenalty. Untuk panggilan HTTP, tambahkan repetition_penalty ke objek parameters.
presence_penaltyfloat(Opsional) Default: 0.0
Mengontrol pengulangan konten dalam teks yang dihasilkan oleh model.
Nilainya harus berada dalam rentang -2.0 hingga 2.0. Nilai positif mengurangi pengulangan, dan nilai negatif meningkatkannya.
Tingkatkan nilai ini untuk skenario yang membutuhkan keragaman, kreativitas, atau curah pendapat, seperti penulisan kreatif. Turunkan nilai ini untuk skenario yang menekankan konsistensi dan akurasi terminologi, seperti dokumen teknis atau teks formal.
Cara kerja
Jika nilai parameter ini positif, model menerapkan hukuman pada token yang sudah ada dalam teks. Hukuman diterapkan terlepas dari jumlah kemunculan token tersebut. Hal ini mengurangi kemungkinan token tersebut muncul kembali, sehingga mengurangi pengulangan konten dan meningkatkan keragaman kata.
Kami menyarankan Anda menggunakan nilai default.
streamboolean (Opsional) Default: false
Menentukan apakah respons dialirkan. Nilai yang valid:
false: Model mengembalikan hasil sekaligus setelah seluruh konten dihasilkan.
true: Model mengeluarkan konten dalam potongan saat dihasilkan.
Parameter ini hanya didukung oleh SDK Python. Untuk menggunakan keluaran streaming dengan SDK Java, Anda dapat memanggil antarmuka streamCall. Untuk menggunakan keluaran streaming melalui HTTP, atur X-DashScope-SSE ke enable dalam header.
incremental_outputboolean (Opsional) Default: false
Menentukan apakah akan mengaktifkan keluaran inkremental dalam mode keluaran streaming. Pengaturan yang disarankan adalah true.
Nilai yang valid:
false: Setiap keluaran berisi seluruh urutan yang dihasilkan sejauh ini. Keluaran akhir adalah hasil lengkapnya.
I
I like
I like apple
I like apple.
true (Direkomendasikan): Mengaktifkan keluaran inkremental. Keluaran berikutnya hanya berisi konten yang baru dihasilkan. Anda harus menggabungkan segmen-segmen ini untuk mendapatkan hasil lengkapnya.
I
like
apple
.
Dalam SDK Java, parameter ini adalah incrementalOutput. Untuk panggilan HTTP, tambahkan incremental_output ke objek parameters.
stop string atau array(Opsional)
Menentukan kata berhenti. Ketika string atau token_id yang ditentukan dalam stop muncul dalam teks yang dihasilkan, generasi segera dihentikan.
Anda dapat menggunakan parameter ini untuk menentukan kata sensitif dan mengontrol output model.
Jika stop berupa array, Anda tidak boleh mencampur token_id dan string sebagai elemen. Misalnya, Anda tidak dapat menentukan ["Hello",104307].
logprobsboolean (Opsional) Default: false
Menentukan apakah probabilitas log dari token output dikembalikan. Nilai yang valid:
true
false
Model yang didukung: qwen-vl-ocr-2025-04-13 dan model setelahnya.
Untuk panggilan HTTP, letakkan logprobs dalam objek parameters.
top_logprobsinteger (Opsional) Default: 0
Menentukan jumlah token paling mungkin yang dikembalikan pada setiap langkah generasi. Parameter ini hanya berlaku ketika logprobs diatur ke true.
Nilainya harus berupa bilangan bulat dari 0 hingga 5.
Dalam SDK Java, parameter ini bernama topLogprobs. Untuk panggilan HTTP, atur parameter top_logprobs dalam objek parameters.
Objek respons chat (format sama untuk keluaran streaming dan non-streaming)
status_codeinteger
Kode status permintaan. Nilai 200 menunjukkan bahwa permintaan berhasil. Jika tidak, permintaan gagal.
SDK Java tidak mengembalikan parameter ini. Jika panggilan gagal, exception dilemparkan. Pesan exception berisi `status_code` dan `message`.
request_idstring
Pengidentifikasi unik untuk panggilan ini.
SDK Java mengembalikan parameter ini sebagai requestId
codestring
Kode kesalahan. Parameter ini kosong jika panggilan berhasil.
Hanya SDK Python yang mengembalikan parameter ini.
outputobject
Informasi tentang hasil panggilan.
Properti
textstring
Parameter ini saat ini tetap null.
finish_reasonstring
Alasan model berhenti menghasilkan. Nilai yang valid:
Nilainya adalah null selama generasi.
Nilai stop menunjukkan bahwa output model telah berakhir secara alami.
Generasi dihentikan karena output terlalu panjang. Alasan berhentinya adalah length.
choicesarray
Informasi output dari model.
Properti
finish_reasonstring
Berlaku untuk skenario berikut:
Nilainya adalah null selama generasi.
Generasi output model selesai secara alami, yang ditunjukkan oleh stop.
length: Output dipotong karena mencapai panjang maksimum.
messageobject
Objek pesan yang dikeluarkan oleh model.
Properti
rolestring
Peran dari pesan output. Nilainya tetap assistant.
contentobject
Konten dari pesan output.
Properti
ocr_resultobject
Saat Anda menggunakan model Qwen-OCR untuk memanggil tugas bawaan ekstraksi informasi atau pengenalan presisi tinggi, parameter ini berisi hasil tugas tersebut.
Representasi persegi panjang yang diputar dari kotak teks:
center_x dan center_y adalah koordinat pusat kotak teks.
width adalah lebar kotak teks, dan height adalah tingginya.
angle adalah sudut rotasi kotak teks relatif terhadap arah horizontal. Rentang nilainya adalah [-90, 90].
locationarray
Contoh: [x1, y1, x2, y2, x3, y3, x4, y4]
Koordinat empat titik sudut kotak teks. Koordinat diatur secara searah jarum jam, dimulai dari titik sudut kiri atas: kiri atas → kanan atas → kanan bawah → kiri bawah.
textstring
Konten baris teks.
textstring
Konten dari pesan output.
logprobsobject
Informasi probabilitas untuk objek `choices` saat ini.
Properti
contentarray
Array token yang memiliki informasi probabilitas log.
Properti
tokenstring
Token saat ini.
bytesarray
Daftar byte UTF-8 asli dari token saat ini. Ini membantu merekonstruksi konten output secara akurat, terutama saat menangani emoji dan karakter Tionghoa.
logprobfloat
Probabilitas log dari token saat ini. Nilai pengembalian `null` menunjukkan probabilitas yang sangat rendah.
top_logprobsarray
Token paling mungkin pada posisi token saat ini dan probabilitas log-nya. Jumlah elemennya sama dengan nilai parameter input top_logprobs.
Properti
tokenstring
Token saat ini.
bytesarray
Daftar byte UTF-8 asli dari token saat ini. Ini membantu merekonstruksi konten output secara akurat, terutama saat menangani emoji dan karakter Tionghoa.
logprobfloat
Probabilitas log dari token saat ini. Nilai pengembalian `null` menunjukkan probabilitas yang sangat rendah.
usageobject
Informasi tentang token yang digunakan dalam permintaan ini.
Properti
input_tokensinteger
Jumlah token input.
output_tokensinteger
Jumlah token output.
charactersinteger
Parameter ini saat ini tetap 0.
input_tokens_detailsobject
Klasifikasi detail halus dari token input.
Properti
image_tokensinteger
Jumlah token yang sesuai dengan gambar yang diinput ke model.
text_tokensinteger
Jumlah token yang sesuai dengan teks yang diinput ke model.
output_tokens_detailsobject
Klasifikasi detail halus dari token output.
Properti
text_tokensinteger
Jumlah token teks dalam output model.
total_tokensinteger
Jumlah total token yang dikonsumsi. Ini adalah jumlah dari input_tokens dan output_tokens.
image_tokensinteger
Bidang ini dikembalikan jika input mencakup image. Ini merepresentasikan jumlah token yang sesuai dengan input gambar.
Kode kesalahan
Jika pemanggilan model mengembalikan kesalahan, lihat Pesan kesalahan untuk menyelesaikan masalah.