Use the WebSocket protocol to integrate directly with the Fun-ASR real-time speech recognition service from any programming language that supports WebSocket. Although we offer higher-level SDKs for Python and Java that simplify integration, the generic protocol offers maximum flexibility.
For model descriptions and selection guidance, see real-time speech recognition - Fun-ASR/Paraformer.
Getting started
Prerequisites
Get an API key. For security, we recommend storing the API key in an environment variable (see the example after this list).
Download the sample audio file: asr_example.wav.
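The sample programs in this topic read the key from the DASHSCOPE_API_KEY environment variable. On Linux or macOS, for example, you can set it as follows:
export DASHSCOPE_API_KEY="sk-xxx"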
Sample code
Node.js
Install the required dependencies:
npm install ws
npm install uuid
Use the following sample code:
const fs = require('fs');
const WebSocket = require('ws');
const { v4: uuidv4 } = require('uuid'); // Used to generate a UUID
// API keys for the Singapore and China (Beijing) regions are different. Get an API key: https://www.alibabacloud.com/help/en/model-studio/get-api-key
// If you have not set an environment variable, replace the following line with your Model Studio API key: const apiKey = "sk-xxx"
const apiKey = process.env.DASHSCOPE_API_KEY;
// This is the endpoint for the Singapore region. To use a model in the China (Beijing) region, replace the URL with: wss://dashscope.aliyuncs.com/api-ws/v1/inference/
const url = 'wss://dashscope-intl.aliyuncs.com/api-ws/v1/inference/'; // WebSocket server endpoint
const audioFile = 'asr_example.wav'; // Replace with the path to your audio file
// Generate a 32-character random ID
const TASK_ID = uuidv4().replace(/-/g, '').slice(0, 32);
// Create a WebSocket client
const ws = new WebSocket(url, {
headers: {
Authorization: `bearer ${apiKey}`
}
});
let taskStarted = false; // A flag to indicate whether the task has started
// Once the connection opens, send the run-task command
ws.on('open', () => {
console.log('Connected to the server');
sendRunTask();
});
// Handle incoming messages
ws.on('message', (data) => {
const message = JSON.parse(data);
switch (message.header.event) {
case 'task-started':
console.log('Task started');
taskStarted = true;
sendAudioStream();
break;
case 'result-generated':
console.log('Recognition result:', message.payload.output.sentence.text);
if (message.payload.usage) {
console.log('Billable duration (seconds):', message.payload.usage.duration);
}
break;
case 'task-finished':
console.log('Task finished');
ws.close();
break;
case 'task-failed':
console.error('Task failed:', message.header.error_message);
ws.close();
break;
default:
console.log('Unknown event:', message.header.event);
}
});
// When the connection closes, report whether the task ever started
ws.on('close', () => {
if (!taskStarted) {
console.error('The task did not start before the connection closed.');
}
});
// Send the run-task command
function sendRunTask() {
const runTaskMessage = {
header: {
action: 'run-task',
task_id: TASK_ID,
streaming: 'duplex'
},
payload: {
task_group: 'audio',
task: 'asr',
function: 'recognition',
model: 'fun-asr-realtime',
parameters: {
sample_rate: 16000,
format: 'wav'
},
input: {}
}
};
ws.send(JSON.stringify(runTaskMessage));
}
// Send the audio stream
function sendAudioStream() {
const audioStream = fs.createReadStream(audioFile);
let chunkCount = 0;
function sendNextChunk() {
const chunk = audioStream.read();
if (chunk) {
ws.send(chunk);
chunkCount++;
setTimeout(sendNextChunk, 100); // Send a chunk every 100 ms
}
}
audioStream.on('readable', () => {
sendNextChunk();
});
audioStream.on('end', () => {
console.log('Audio stream ended');
sendFinishTask();
});
audioStream.on('error', (err) => {
console.error('Error reading the audio file:', err);
ws.close();
});
}
// Send the finish-task command
function sendFinishTask() {
const finishTaskMessage = {
header: {
action: 'finish-task',
task_id: TASK_ID,
streaming: 'duplex'
},
payload: {
input: {}
}
};
ws.send(JSON.stringify(finishTaskMessage));
}
// Handle errors
ws.on('error', (error) => {
console.error('WebSocket error:', error);
});
C#
Use the following sample code:
using System.Net.WebSockets;
using System.Text;
using System.Text.Json;
using System.Text.Json.Nodes;
class Program {
private static ClientWebSocket _webSocket = new ClientWebSocket();
private static CancellationTokenSource _cancellationTokenSource = new CancellationTokenSource();
private static bool _taskStartedReceived = false;
private static bool _taskFinishedReceived = false;
// API keys for the Singapore and China (Beijing) regions are different. Get an API key: https://www.alibabacloud.com/help/en/model-studio/get-api-key
// If you have not set an environment variable, replace the following line with your Model Studio API key: private static readonly string ApiKey = "sk-xxx"
private static readonly string ApiKey = Environment.GetEnvironmentVariable("DASHSCOPE_API_KEY") ?? throw new InvalidOperationException("DASHSCOPE_API_KEY environment variable is not set.");
// This is the endpoint for the Singapore region. To use a model in the China (Beijing) region, replace the URL with: wss://dashscope.aliyuncs.com/api-ws/v1/inference/
private const string WebSocketUrl = "wss://dashscope-intl.aliyuncs.com/api-ws/v1/inference/";
// Replace with the path to your audio file
private const string AudioFilePath = "asr_example.wav";
static async Task Main(string[] args) {
// Establish a WebSocket connection and configure headers for authentication.
_webSocket.Options.SetRequestHeader("Authorization", $"bearer {ApiKey}");
await _webSocket.ConnectAsync(new Uri(WebSocketUrl), _cancellationTokenSource.Token);
// Start a task to receive WebSocket messages asynchronously.
var receiveTask = ReceiveMessagesAsync();
// Send the run-task command.
string _taskId = Guid.NewGuid().ToString("N"); // Generate a 32-character random ID.
var runTaskJson = GenerateRunTaskJson(_taskId);
await SendAsync(runTaskJson);
// Wait for the task-started event.
while (!_taskStartedReceived) {
await Task.Delay(100, _cancellationTokenSource.Token);
}
// Read the local file and stream the audio to the server for recognition.
await SendAudioStreamAsync(AudioFilePath);
// Send the finish-task command to end the task.
var finishTaskJson = GenerateFinishTaskJson(_taskId);
await SendAsync(finishTaskJson);
// Wait for the task-finished event.
while (!_taskFinishedReceived && !_cancellationTokenSource.IsCancellationRequested) {
try {
await Task.Delay(100, _cancellationTokenSource.Token);
} catch (OperationCanceledException) {
// The task was canceled, so exit the loop.
break;
}
}
// Close the connection.
if (!_cancellationTokenSource.IsCancellationRequested) {
await _webSocket.CloseAsync(WebSocketCloseStatus.NormalClosure, "Closing", _cancellationTokenSource.Token);
}
_cancellationTokenSource.Cancel();
try {
await receiveTask;
} catch (OperationCanceledException) {
// Ignore operation canceled exceptions.
}
}
private static async Task ReceiveMessagesAsync() {
try {
while (_webSocket.State == WebSocketState.Open && !_cancellationTokenSource.IsCancellationRequested) {
var message = await ReceiveMessageAsync(_cancellationTokenSource.Token);
if (message != null) {
var eventValue = message["header"]?["event"]?.GetValue<string>();
switch (eventValue) {
case "task-started":
Console.WriteLine("Task started successfully.");
_taskStartedReceived = true;
break;
case "result-generated":
Console.WriteLine($"Recognition result: {message["payload"]?["output"]?["sentence"]?["text"]?.GetValue<string>()}");
if (message["payload"]?["usage"] != null && message["payload"]?["usage"]?["duration"] != null) {
Console.WriteLine($"Billable duration (seconds): {message["payload"]?["usage"]?["duration"]?.GetValue<int>()}");
}
break;
case "task-finished":
Console.WriteLine("Task finished.");
_taskFinishedReceived = true;
_cancellationTokenSource.Cancel();
break;
case "task-failed":
Console.WriteLine($"Task failed: {message["header"]?["error_message"]?.GetValue<string>()}");
_cancellationTokenSource.Cancel();
break;
}
}
}
} catch (OperationCanceledException) {
// Ignore operation canceled exceptions.
}
}
private static async Task<JsonNode?> ReceiveMessageAsync(CancellationToken cancellationToken) {
var buffer = new byte[1024 * 4];
var segment = new ArraySegment<byte>(buffer);
var result = await _webSocket.ReceiveAsync(segment, cancellationToken);
if (result.MessageType == WebSocketMessageType.Close) {
await _webSocket.CloseAsync(WebSocketCloseStatus.NormalClosure, "Closing", cancellationToken);
return null;
}
var message = Encoding.UTF8.GetString(buffer, 0, result.Count);
return JsonNode.Parse(message);
}
private static async Task SendAsync(string message) {
var buffer = Encoding.UTF8.GetBytes(message);
var segment = new ArraySegment<byte>(buffer);
await _webSocket.SendAsync(segment, WebSocketMessageType.Text, true, _cancellationTokenSource.Token);
}
private static async Task SendAudioStreamAsync(string filePath) {
using (var audioStream = File.OpenRead(filePath)) {
var buffer = new byte[1024]; // Create a buffer for audio chunks.
int bytesRead;
while ((bytesRead = await audioStream.ReadAsync(buffer, 0, buffer.Length)) > 0) {
var segment = new ArraySegment<byte>(buffer, 0, bytesRead);
await _webSocket.SendAsync(segment, WebSocketMessageType.Binary, true, _cancellationTokenSource.Token);
await Task.Delay(100); // Wait 100 ms between chunks.
}
}
}
private static string GenerateRunTaskJson(string taskId) {
var runTask = new JsonObject {
["header"] = new JsonObject {
["action"] = "run-task",
["task_id"] = taskId,
["streaming"] = "duplex"
},
["payload"] = new JsonObject {
["task_group"] = "audio",
["task"] = "asr",
["function"] = "recognition",
["model"] = "fun-asr-realtime",
["parameters"] = new JsonObject {
["format"] = "wav",
["sample_rate"] = 16000,
},
["input"] = new JsonObject()
}
};
return JsonSerializer.Serialize(runTask);
}
private static string GenerateFinishTaskJson(string taskId) {
var finishTask = new JsonObject {
["header"] = new JsonObject {
["action"] = "finish-task",
["task_id"] = taskId,
["streaming"] = "duplex"
},
["payload"] = new JsonObject {
["input"] = new JsonObject()
}
};
return JsonSerializer.Serialize(finishTask);
}
}
PHP
The sample code uses the following directory structure:
my-php-project/
├── composer.json
├── vendor/
└── index.php
Use the following content for the composer.json file. You can specify the versions of the dependencies based on your needs.
{
"require": {
"react/event-loop": "^1.3",
"react/socket": "^1.11",
"react/stream": "^1.2",
"react/http": "^1.1",
"ratchet/pawl": "^0.4"
},
"autoload": {
"psr-4": {
"App\\": "src/"
}
}
}
The index.php file contains the following code:
<?php
require __DIR__ . '/vendor/autoload.php';
use Ratchet\Client\Connector;
use React\EventLoop\Loop;
use React\Socket\Connector as SocketConnector;
use Ratchet\rfc6455\Messaging\Frame;
// API keys for the Singapore and China (Beijing) regions are different. Get an API key: https://www.alibabacloud.com/help/en/model-studio/get-api-key
// If you have not set an environment variable, replace the following line with your Model Studio API key: $api_key = "sk-xxx"
$api_key = getenv("DASHSCOPE_API_KEY");
// This is the endpoint for the Singapore region. To use a model in the China (Beijing) region, replace the URL with: wss://dashscope.aliyuncs.com/api-ws/v1/inference/
$websocket_url = 'wss://dashscope-intl.aliyuncs.com/api-ws/v1/inference/';
$audio_file_path = 'asr_example.wav'; // Replace with the path to your audio file
$loop = Loop::get();
// Create a custom connector.
$socketConnector = new SocketConnector($loop, [
'tcp' => [
'bindto' => '0.0.0.0:0',
],
'tls' => [
'verify_peer' => false,
'verify_peer_name' => false,
],
]);
$connector = new Connector($loop, $socketConnector);
$headers = [
'Authorization' => 'bearer ' . $api_key
];
$connector($websocket_url, [], $headers)->then(function ($conn) use ($loop, $audio_file_path) {
echo "Connected to the WebSocket server\n";
// Generate the task ID up front so that the run-task command, the finish-task command, and the event handler all use the same ID.
$taskId = generateTaskId();
// Start listening for WebSocket messages asynchronously.
$conn->on('message', function($msg) use ($conn, $loop, $audio_file_path, $taskId) {
$response = json_decode($msg, true);
if (isset($response['header']['event'])) {
handleEvent($conn, $response, $loop, $audio_file_path, $taskId);
} else {
echo "Unknown message format\n";
}
});
// Listen for the connection close event.
$conn->on('close', function($code = null, $reason = null) {
echo "Connection closed\n";
if ($code !== null) {
echo "Close code: " . $code . "\n";
}
if ($reason !== null) {
echo "Close reason: " . $reason . "\n";
}
});
// Send the run-task command.
sendRunTaskMessage($conn, $taskId);
}, function ($e) {
echo "Could not connect: {$e->getMessage()}\n";
});
$loop->run();
/**
* Generate a task ID.
* @return string
*/
function generateTaskId(): string {
return bin2hex(random_bytes(16));
}
/**
* Send the run-task command.
* @param $conn
* @param $taskId
*/
function sendRunTaskMessage($conn, $taskId) {
$runTaskMessage = json_encode([
"header" => [
"action" => "run-task",
"task_id" => $taskId,
"streaming" => "duplex"
],
"payload" => [
"task_group" => "audio",
"task" => "asr",
"function" => "recognition",
"model" => "fun-asr-realtime",
"parameters" => [
"format" => "wav",
"sample_rate" => 16000
],
"input" => []
]
]);
echo "Preparing to send the run-task command: " . $runTaskMessage . "\n";
$conn->send($runTaskMessage);
echo "run-task command sent\n";
}
/**
* Read the audio file.
* @param string $filePath
* @return bool|string
*/
function readAudioFile(string $filePath) {
$voiceData = file_get_contents($filePath);
if ($voiceData === false) {
echo "Failed to read the audio file\n";
}
return $voiceData;
}
/**
* Split the audio data into chunks.
* @param string $data
* @param int $chunkSize
* @return array
*/
function splitAudioData(string $data, int $chunkSize): array {
return str_split($data, $chunkSize);
}
/**
* Send the finish-task command.
* @param $conn
* @param $taskId
*/
function sendFinishTaskMessage($conn, $taskId) {
$finishTaskMessage = json_encode([
"header" => [
"action" => "finish-task",
"task_id" => $taskId,
"streaming" => "duplex"
],
"payload" => [
"input" => []
]
]);
echo "Preparing to send the finish-task command: " . $finishTaskMessage . "\n";
$conn->send($finishTaskMessage);
echo "finish-task command sent\n";
}
/**
 * Handle events.
 * @param $conn
 * @param $response
 * @param $loop
 * @param $audio_file_path
 * @param $taskId
 */
function handleEvent($conn, $response, $loop, $audio_file_path, $taskId) {
static $chunks;
static $allChunksSent = false;
switch ($response['header']['event']) {
case 'task-started':
echo "Task started, sending audio data...\n";
// Read the audio file.
$voiceData = readAudioFile($audio_file_path);
if ($voiceData === false) {
echo "Failed to read the audio file\n";
$conn->close();
return;
}
// Split the audio data into chunks.
$chunks = splitAudioData($voiceData, 1024);
// Define the sending function.
$sendChunk = function() use ($conn, &$chunks, $loop, &$sendChunk, &$allChunksSent, $taskId) {
if (!empty($chunks)) {
$chunk = array_shift($chunks);
$binaryMsg = new Frame($chunk, true, Frame::OP_BINARY);
$conn->send($binaryMsg);
// Send the next chunk after 100 ms.
$loop->addTimer(0.1, $sendChunk);
} else {
echo "All chunks have been sent\n";
$allChunksSent = true;
// Send the finish-task command.
sendFinishTaskMessage($conn, $taskId);
}
};
// Start sending audio data.
$sendChunk();
break;
case 'result-generated':
$result = $response['payload']['output']['sentence'];
echo "Recognition result: " . $result['text'] . "\n";
if (isset($response['payload']['usage']['duration'])) {
echo "Billable duration (seconds): " . $response['payload']['usage']['duration'] . "\n";
}
break;
case 'task-finished':
echo "Task finished\n";
$conn->close();
break;
case 'task-failed':
echo "Task failed\n";
echo "Error code: " . $response['header']['error_code'] . "\n";
echo "Error message: " . $response['header']['error_message'] . "\n";
$conn->close();
break;
case 'error':
echo "Error: " . $response['payload']['message'] . "\n";
break;
default:
echo "Unknown event: " . $response['header']['event'] . "\n";
break;
}
// If all chunks are sent and the task is finished, close the connection.
if ($allChunksSent && $response['header']['event'] == 'task-finished') {
// Wait 1 second to ensure all data has been transmitted.
$loop->addTimer(1, function() use ($conn) {
$conn->close();
echo "Client closed the connection\n";
});
}
}
Go
Use the following sample code:
package main
import (
"encoding/json"
"fmt"
"io"
"log"
"net/http"
"os"
"time"
"github.com/google/uuid"
"github.com/gorilla/websocket"
)
const (
// This is the endpoint for the Singapore region. To use a model in the China (Beijing) region, replace the URL with: wss://dashscope.aliyuncs.com/api-ws/v1/inference/
wsURL = "wss://dashscope-intl.aliyuncs.com/api-ws/v1/inference/" // WebSocket server endpoint
audioFile = "asr_example.wav" // Replace with the path to your audio file
)
var dialer = websocket.DefaultDialer
func main() {
// API keys for the Singapore and China (Beijing) regions are different. Get an API key: https://www.alibabacloud.com/help/en/model-studio/get-api-key
// If you have not set an environment variable, replace the following line with your Model Studio API key: apiKey := "sk-xxx"
apiKey := os.Getenv("DASHSCOPE_API_KEY")
// Connect to the WebSocket service.
conn, err := connectWebSocket(apiKey)
if err != nil {
log.Fatal("Failed to connect to WebSocket: ", err)
}
defer closeConnection(conn)
// Start a goroutine to receive results.
taskStarted := make(chan bool)
taskDone := make(chan bool)
startResultReceiver(conn, taskStarted, taskDone)
// Send the run-task command.
taskID, err := sendRunTaskCmd(conn)
if err != nil {
log.Fatal("Failed to send run-task command: ", err)
}
// Wait for the task-started event.
waitForTaskStarted(taskStarted)
// Send the audio stream for recognition.
if err := sendAudioData(conn); err != nil {
log.Fatal("Failed to send audio: ", err)
}
// Send the finish-task command.
if err := sendFinishTaskCmd(conn, taskID); err != nil {
log.Fatal("Failed to send finish-task command: ", err)
}
// Wait for the task to finish or fail.
<-taskDone
}
// Define structs to represent the JSON data.
type Header struct {
Action string `json:"action"`
TaskID string `json:"task_id"`
Streaming string `json:"streaming"`
Event string `json:"event"`
ErrorCode string `json:"error_code,omitempty"`
ErrorMessage string `json:"error_message,omitempty"`
Attributes map[string]interface{} `json:"attributes"`
}
type Output struct {
Sentence struct {
BeginTime int64 `json:"begin_time"`
EndTime *int64 `json:"end_time"`
Text string `json:"text"`
Words []struct {
BeginTime int64 `json:"begin_time"`
EndTime *int64 `json:"end_time"`
Text string `json:"text"`
Punctuation string `json:"punctuation"`
} `json:"words"`
} `json:"sentence"`
}
type Payload struct {
TaskGroup string `json:"task_group"`
Task string `json:"task"`
Function string `json:"function"`
Model string `json:"model"`
Parameters Params `json:"parameters"`
Input Input `json:"input"`
Output Output `json:"output,omitempty"`
Usage *struct {
Duration int `json:"duration"`
} `json:"usage,omitempty"`
}
type Params struct {
Format string `json:"format"`
SampleRate int `json:"sample_rate"`
DisfluencyRemovalEnabled bool `json:"disfluency_removal_enabled"`
}
type Input struct {
}
type Event struct {
Header Header `json:"header"`
Payload Payload `json:"payload"`
}
// Connect to the WebSocket service.
func connectWebSocket(apiKey string) (*websocket.Conn, error) {
header := make(http.Header)
header.Add("Authorization", fmt.Sprintf("bearer %s", apiKey))
conn, _, err := dialer.Dial(wsURL, header)
return conn, err
}
// Start a goroutine to asynchronously receive WebSocket messages.
func startResultReceiver(conn *websocket.Conn, taskStarted chan<- bool, taskDone chan<- bool) {
go func() {
for {
_, message, err := conn.ReadMessage()
if err != nil {
log.Println("Failed to read server message: ", err)
return
}
var event Event
err = json.Unmarshal(message, &event)
if err != nil {
log.Println("Failed to parse event: ", err)
continue
}
if handleEvent(conn, event, taskStarted, taskDone) {
return
}
}
}()
}
// Send the run-task command.
func sendRunTaskCmd(conn *websocket.Conn) (string, error) {
runTaskCmd, taskID, err := generateRunTaskCmd()
if err != nil {
return "", err
}
err = conn.WriteMessage(websocket.TextMessage, []byte(runTaskCmd))
return taskID, err
}
// Generate the run-task command.
func generateRunTaskCmd() (string, string, error) {
taskID := uuid.New().String()
runTaskCmd := Event{
Header: Header{
Action: "run-task",
TaskID: taskID,
Streaming: "duplex",
},
Payload: Payload{
TaskGroup: "audio",
Task: "asr",
Function: "recognition",
Model: "fun-asr-realtime",
Parameters: Params{
Format: "wav",
SampleRate: 16000,
},
Input: Input{},
},
}
runTaskCmdJSON, err := json.Marshal(runTaskCmd)
return string(runTaskCmdJSON), taskID, err
}
// Wait for the task-started event.
func waitForTaskStarted(taskStarted chan bool) {
select {
case <-taskStarted:
fmt.Println("Task started successfully.")
case <-time.After(10 * time.Second):
log.Fatal("Timeout waiting for task-started event; the task failed to start.")
}
}
// Send the audio data.
func sendAudioData(conn *websocket.Conn) error {
file, err := os.Open(audioFile)
if err != nil {
return err
}
defer file.Close()
buf := make([]byte, 1024)
for {
n, err := file.Read(buf)
if n == 0 {
break
}
if err != nil && err != io.EOF {
return err
}
err = conn.WriteMessage(websocket.BinaryMessage, buf[:n])
if err != nil {
return err
}
time.Sleep(100 * time.Millisecond)
}
return nil
}
// Send the finish-task command.
func sendFinishTaskCmd(conn *websocket.Conn, taskID string) error {
finishTaskCmd, err := generateFinishTaskCmd(taskID)
if err != nil {
return err
}
err = conn.WriteMessage(websocket.TextMessage, []byte(finishTaskCmd))
return err
}
// Generate the finish-task command.
func generateFinishTaskCmd(taskID string) (string, error) {
finishTaskCmd := Event{
Header: Header{
Action: "finish-task",
TaskID: taskID,
Streaming: "duplex",
},
Payload: Payload{
Input: Input{},
},
}
finishTaskCmdJSON, err := json.Marshal(finishTaskCmd)
return string(finishTaskCmdJSON), err
}
// Handle events.
func handleEvent(conn *websocket.Conn, event Event, taskStarted chan<- bool, taskDone chan<- bool) bool {
switch event.Header.Event {
case "task-started":
fmt.Println("Received task-started event.")
taskStarted <- true
case "result-generated":
if event.Payload.Output.Sentence.Text != "" {
fmt.Println("Recognition result:", event.Payload.Output.Sentence.Text)
}
if event.Payload.Usage != nil {
fmt.Println("Billable duration (seconds):", event.Payload.Usage.Duration)
}
case "task-finished":
fmt.Println("Task finished.")
taskDone <- true
return true
case "task-failed":
handleTaskFailed(event, conn)
taskDone <- true
return true
default:
log.Printf("Unexpected event: %v", event)
}
return false
}
// Handle the task-failed event.
func handleTaskFailed(event Event, conn *websocket.Conn) {
if event.Header.ErrorMessage != "" {
log.Fatalf("Task failed: %s", event.Header.ErrorMessage)
} else {
log.Fatal("Task failed due to an unknown reason.")
}
}
// Close the connection.
func closeConnection(conn *websocket.Conn) {
if conn != nil {
conn.Close()
}
}
Key concepts
Interaction sequence
The client and server follow a strict interaction sequence to ensure proper task execution.
Establish a connection: The client initiates a WebSocket connection request to the server and includes authentication information in the request headers.
Start the task: After the connection is established, the client sends a run-task command to specify the model and audio parameters to use.
Confirm the task: The server returns a task-started event, indicating that it is ready to receive audio.
Transfer data:
The client continuously sends binary audio.
During recognition, the server returns multiple result-generated events in real time, containing intermediate and final recognition results.
End the task: After sending all audio, the client sends a finish-task command.
Confirm task completion: After processing any remaining audio, the server returns a task-finished event, signaling that the task completed successfully.
Close the connection: The client or server closes the WebSocket connection.
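In practice, the exchange for a single task looks like this (messages abridged from the command and event examples later in this topic):
Client → Server: {"header": {"action": "run-task", "task_id": "...", "streaming": "duplex"}, "payload": {...}}
Server → Client: {"header": {"event": "task-started", ...}}
Client → Server: binary audio chunks, sent continuously
Server → Client: {"header": {"event": "result-generated", ...}, "payload": {"output": {"sentence": {...}}}} (repeated)
Client → Server: {"header": {"action": "finish-task", "task_id": "...", "streaming": "duplex"}, "payload": {...}}
Server → Client: {"header": {"event": "task-finished", ...}}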
Audio stream specifications
Channels: The binary audio sent to the server must be mono.
Format and encoding: Supported formats include pcm, wav, mp3, opus, speex, aac, and amr.
WAV files must be PCM-encoded.
Opus or Speex files must be encapsulated in an Ogg container.
For the amr format, only the AMR-NB type is supported.
Sample rate: The sample rate must match the sample_rate parameter specified in the run-task command and the requirements of the selected model.
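If your source audio is stereo or recorded at a different sample rate, convert it before streaming (see also Convert audio format in the FAQ). For example, the following FFmpeg command produces 16 kHz, mono, 16-bit PCM WAV:
ffmpeg -i input.mp3 -ac 1 -ar 16000 -c:a pcm_s16le output.wav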
Available models
International
In the international deployment mode, endpoints and data storage are in the Singapore region. Model inference compute resources are dynamically scheduled globally, excluding Chinese Mainland.
Model | Version | Unit price | Free quota (Note)
fun-asr-realtime (currently points to fun-asr-realtime-2025-11-07) | Stable | $0.00009/second | 36,000 seconds (10 hours), valid for 90 days
fun-asr-realtime-2025-11-07 | Snapshot | $0.00009/second | 36,000 seconds (10 hours), valid for 90 days
Languages supported: Mandarin, Cantonese, Wu, Minnan, Hakka, Gan, Xiang, and Jin. Also supports Mandarin accents from Zhongyuan, Southwest, Jilu, Jianghuai, Lanyin, Jiaoliao, Northeast, Beijing, and Hong Kong–Taiwan regions—including Henan, Shaanxi, Hubei, Sichuan, Chongqing, Yunnan, Guizhou, Guangdong, Guangxi, Hebei, Tianjin, Shandong, Anhui, Nanjing, Jiangsu, Hangzhou, Gansu, and Ningxia. Also supports English and Japanese.
Sample rates supported: 16 kHz
Audio formats supported: pcm, wav, mp3, opus, speex, aac, amr
Chinese Mainland
In the Chinese Mainland deployment mode, endpoints and data storage are in the Beijing region. Model inference compute resources are limited to Chinese Mainland.
Model | Version | Unit price | Free quota (Note)
fun-asr-realtime (currently points to fun-asr-realtime-2025-11-07) | Stable | $0.000047/second | No free quota
fun-asr-realtime-2026-02-28 | Snapshot | $0.000047/second | No free quota
fun-asr-realtime-2025-11-07 | Snapshot | $0.000047/second | No free quota
fun-asr-realtime-2025-09-15 | Snapshot | $0.000047/second | No free quota
fun-asr-flash-8k-realtime (currently points to fun-asr-flash-8k-realtime-2026-01-28) | Stable | $0.000032/second | No free quota
fun-asr-flash-8k-realtime-2026-01-28 | Snapshot | $0.000032/second | No free quota
Languages supported:
fun-asr-realtime, fun-asr-realtime-2026-02-28, fun-asr-realtime-2025-11-07: Chinese (Mandarin, Cantonese, Wu, Minnan, Hakka, Gan, Xiang, and Jin. Also supports Mandarin accents from Zhongyuan, Southwest, Jilu, Jianghuai, Lanyin, Jiaoliao, Northeast, Beijing, and Hong Kong–Taiwan regions—including Henan, Shaanxi, Hubei, Sichuan, Chongqing, Yunnan, Guizhou, Guangdong, Guangxi, Hebei, Tianjin, Shandong, Anhui, Nanjing, Jiangsu, Hangzhou, Gansu, and Ningxia), English, and Japanese.
fun-asr-realtime-2025-09-15: Chinese (Mandarin), English
Sample rates supported:
fun-asr-flash-8k-realtime and fun-asr-flash-8k-realtime-2026-01-28: 8 kHz
All other models: 16 kHz
Audio formats supported: pcm, wav, mp3, opus, speex, aac, amr
API reference
Endpoint
Singapore: wss://dashscope-intl.aliyuncs.com/api-ws/v1/inference/
China (Beijing): wss://dashscope.aliyuncs.com/api-ws/v1/inference/
Request headers
Parameter | Type | Required | Description |
Authorization | string | Yes | The authentication token, in the format bearer <your_api_key>. |
user-agent | string | No | The client identifier. It helps the server track the request source. |
X-DashScope-WorkSpace | string | No | Model Studio workspace ID. |
X-DashScope-DataInspection | string | No | Specifies whether to enable the data compliance check. Disabled by default; set to enable to turn it on. |
Commands (client to server)
Commands are JSON-formatted text messages sent from the client to manage a speech recognition task.
1. run-task command: Start a task
Purpose: After establishing a connection, send this command to start a speech recognition task and configure its parameters.
Example:
{
"header": {
"action": "run-task",
"task_id": "2bf83b9a-baeb-4fda-8d9a-xxxxxxxxxxxx",
"streaming": "duplex"
},
"payload": {
"task_group": "audio",
"task": "asr",
"function": "recognition",
"model": "fun-asr-realtime",
"parameters": {
"format": "pcm",
"sample_rate": 16000
},
"input": {}
}
}
header parameters:
Parameter | Type | Required | Description |
header.action | string | Yes | The command type. Must be run-task. |
header.task_id | string | Yes | A unique identifier for the task. The subsequent finish-task command must use the same task_id. |
header.streaming | string | Yes | The communication mode. Must be duplex. |
payload parameters:
Parameter | Type | Required | Description |
payload.task_group | string | Yes | The task group. Must be audio. |
payload.task | string | Yes | The task type. Must be asr. |
payload.function | string | Yes | The function type. Must be recognition. |
payload.model | string | Yes | Specifies the model to use. For details, see the model list. |
payload.input | object | Yes | The input configuration. Must be an empty object, {}. |
payload.parameters | |||
format | string | Yes | The audio format. Supported formats include pcm, wav, mp3, opus, speex, aac, and amr. |
sample_rate | integer | Yes | The audio sample rate in Hz. The value must match the actual sample rate of the audio and the requirements of the selected model (see Available models). |
semantic_punctuation_enabled | boolean | No | Specifies whether to enable semantic punctuation. Default value: false. When false, sentences are segmented by voice activity detection (VAD); when true, sentences are segmented by semantic punctuation. Semantic punctuation provides more precise sentence boundaries, while VAD offers a faster response. You can adjust this parameter to switch between the two segmentation strategies. |
max_sentence_silence | integer | No | The silence duration threshold for VAD. When using Voice Activity Detection (VAD) for segmentation, a silence period exceeding this value marks the end of a sentence. Default value: 1300. Valid range: [200, 6000]. Condition: This parameter takes effect only when semantic_punctuation_enabled is false. |
multi_threshold_mode_enabled | boolean | No | Specifies whether to prevent excessively long sentences from being generated by VAD segmentation. Default value: false. Condition: This parameter takes effect only when semantic_punctuation_enabled is false. |
heartbeat | boolean | No | Specifies whether to maintain a persistent connection. Default value: false. When set to true and the client continuously sends silent audio, the connection is kept alive. Note: To keep the connection alive, you must continuously send silent audio; see Maintain a long-lived connection in the FAQ. |
language_hints | array[string] | No | Sets the language codes for recognition. If the language is unknown in advance, leave this parameter unset and the model will identify it automatically. The system reads only the first value in the array and ignores all other values. Supported language codes by model: fun-asr-realtime, fun-asr-realtime-2026-02-28, and fun-asr-realtime-2025-11-07 support zh (Chinese), en (English), and ja (Japanese); fun-asr-realtime-2025-09-15 supports zh (Chinese) and en (English). |
speech_noise_threshold | float | No | Adjusts the speech-noise detection threshold to control VAD sensitivity. Range: [-1.0, 1.0]. Guidelines: the closer the value is to -1.0, the more likely audio is classified as speech (fewer missed words, but more background noise may be transcribed); the closer it is to 1.0, the more likely audio is classified as noise (less noise is transcribed, but quiet speech may be missed). Important: This is an advanced parameter. Adjustments can significantly affect recognition quality. |
2. finish-task command: End a task
Purpose: After sending all audio data, the client sends this command to signal that the transmission is complete.
Example:
{
"header": {
"action": "finish-task",
"task_id": "2bf83b9a-baeb-4fda-8d9a-xxxxxxxxxxxx",
"streaming": "duplex"
},
"payload": {
"input": {}
}
}
header parameters:
Parameter | Type | Required | Description |
header.action | string | Yes | The command type. Must be finish-task. |
header.task_id | string | Yes | The task ID. This must match the task_id of the corresponding run-task command. |
header.streaming | string | Yes | The communication mode. Must be duplex. |
payload parameters:
Parameter | Type | Required | Description |
payload.input | object | Yes | The input configuration. Must be an empty object, {}. |
Events (server to client)
Events are JSON-formatted text messages sent from the server to provide the client with task status updates and recognition results.
1. task-started
Trigger: Occurs after the server successfully processes a run-task command.
Purpose: Notifies the client that the task has started and that the client can begin sending audio data.
Example:
{
"header": {
"task_id": "2bf83b9a-baeb-4fda-8d9a-xxxxxxxxxxxx",
"event": "task-started",
"attributes": {}
},
"payload": {}
}
header parameters:
Parameter | Type | Description |
header.event | string | The event type. Must be task-started. |
header.task_id | string | The task ID. |
2. result-generated
Trigger: Occurs during the recognition process whenever the server generates a new result.
Purpose: Returns real-time recognition results, including both intermediate and final sentence results.
Example:
{
"header": {
"task_id": "2bf83b9a-baeb-4fda-8d9a-xxxxxxxxxxxx",
"event": "result-generated",
"attributes": {}
},
"payload": {
"output": {
"sentence": {
"begin_time": 170,
"end_time": 920,
"text": "Okay, I got it.",
"heartbeat": false,
"sentence_end": true,
"words": [
{
"begin_time": 170,
"end_time": 295,
"text": "Okay",
"punctuation": ","
},
{
"begin_time": 295,
"end_time": 503,
"text": "I",
"punctuation": ""
},
{
"begin_time": 503,
"end_time": 711,
"text": "got",
"punctuation": ""
},
{
"begin_time": 711,
"end_time": 920,
"text": "it",
"punctuation": "."
}
]
}
},
"usage": {
"duration": 3
}
}
}
header parameters:
Parameter | Type | Description |
header.event | string | The event type. Must be result-generated. |
header.task_id | string | The task ID. |
payload parameters:
Parameter | Type | Description |
output | object | Contains the recognition result in the output.sentence object. See details below. |
usage | object | The usage information. This value is null for intermediate results. For final sentence results (when sentence_end is true), it contains the billable duration. |
The payload.usage object has the following format:
Parameter | Type | Description |
duration | integer | The billing duration for the task, in seconds. |
The payload.output.sentence object has the following format:
Parameter | Type | Description |
begin_time | integer | The start time of the sentence in milliseconds (ms). |
end_time | integer | null | The end time of the sentence in milliseconds (ms). This value is null while the sentence is still being recognized and is set when the sentence ends. |
text | string | The transcribed text content. |
words | array | An array of word timestamp objects. |
heartbeat | boolean | null | If this value is true, the message is a heartbeat response to silent audio and contains no new recognition content; clients can ignore it. |
sentence_end | boolean | Indicates whether the current sentence is complete. |
The payload.output.sentence.words array contains word timestamp objects, each with the following format:
Parameter | Type | Description |
begin_time | integer | The start time of the word in milliseconds (ms). |
end_time | integer | The end time of the word in milliseconds (ms). |
text | string | The transcribed word. |
punctuation | string | The punctuation that follows the word. |
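In practice, a client usually separates intermediate results from final sentences by checking sentence_end. A minimal Node.js sketch, following the message-handler pattern from the sample above:
// Distinguish partial results from final sentences.
ws.on('message', (data) => {
  const message = JSON.parse(data);
  if (message.header.event === 'result-generated') {
    const sentence = message.payload.output.sentence;
    if (sentence.sentence_end) {
      console.log('Final sentence:', sentence.text); // complete; will not change
    } else {
      console.log('Partial result:', sentence.text); // may still be revised
    }
  }
});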
3. task-finished
Trigger: Occurs after the server receives a finish-task command and processes all cached audio.
Purpose: Signals that the recognition task is complete.
Example:
{
"header": {
"task_id": "2bf83b9a-baeb-4fda-8d9a-xxxxxxxxxxxx",
"event": "task-finished",
"attributes": {}
},
"payload": {
"output": {}
}
}
header parameters:
Parameter | Type | Description |
header.event | string | The event type. Must be task-finished. |
header.task_id | string | The task ID. |
4. task-failed
Trigger: Occurs if any error is encountered during task processing.
Purpose: Notifies the client that the task has failed and provides an error message. After receiving this event, close the WebSocket connection and handle the error.
Example:
{
"header": {
"task_id": "2bf83b9a-baeb-4fda-8d9a-xxxxxxxxxxxx",
"event": "task-failed",
"error_code": "CLIENT_ERROR",
"error_message": "request timeout after 23 seconds.",
"attributes": {}
},
"payload": {}
}
header parameters:
Parameter | Type | Description |
header.event | string | The event type. Must be task-failed. |
header.task_id | string | The task ID. |
header.error_code | string | The error code. |
header.error_message | string | A detailed error message. |
Connection overhead and connection reuse
The WebSocket service supports connection reuse to improve resource efficiency and avoid connection overhead.
The client starts a task by sending a run-task command and ends it by sending a finish-task command. The server confirms the task's completion by returning a task-finished event, after which the client can reuse the connection to start another task by sending a new run-task command.
Each task on a reused connection must have a unique task_id.
If a task fails, the service returns a task-failed event and closes the connection. This connection cannot be reused.
If the client does not start a new task within 60 seconds after a task finishes, the connection times out.
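For example, building on the Node.js sample above (with ws and uuidv4 as imported there), the client can start the next task on the same open connection after receiving task-finished. A minimal sketch; the model and audio parameters are illustrative:
// Start another task on the already-open connection instead of closing it.
function startNextTask(ws) {
  // Each task on a reused connection needs its own unique 32-character task_id.
  const nextTaskId = uuidv4().replace(/-/g, '').slice(0, 32);
  ws.send(JSON.stringify({
    header: { action: 'run-task', task_id: nextTaskId, streaming: 'duplex' },
    payload: {
      task_group: 'audio', task: 'asr', function: 'recognition',
      model: 'fun-asr-realtime',
      parameters: { format: 'wav', sample_rate: 16000 },
      input: {}
    }
  }));
  return nextTaskId; // use this ID in the matching finish-task command
}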
Error codes
For troubleshooting, see error messages.
FAQ
Features
Maintain a long-lived connection
Set the heartbeat request parameter to true and continuously send silent audio to the server.
Note:
Silent audio is any part of an audio file or stream with no audible signal.
You can create silent audio using audio editing software (such as Audacity or Adobe Audition) or command-line tools (such as FFmpeg).
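For example, the following FFmpeg command generates ten seconds of 16 kHz, mono, 16-bit PCM silence, which you can send in a loop between utterances:
ffmpeg -f lavfi -i anullsrc=r=16000:cl=mono -t 10 -c:a pcm_s16le silence.wav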
Convert audio format
You can use the FFmpeg tool. For more information, see the official FFmpeg website.
# Basic conversion command (universal template)
# -i: Specifies the input file path. Example: audio.wav
# -c:a: Specifies the audio encoder. Examples: aac, libmp3lame, pcm_s16le
# -b:a: Specifies the bit rate (audio quality). Examples: 192k, 320k
# -ar: Specifies the sample rate.
# -ac: Specifies the number of channels. Examples: 1 (mono), 2 (stereo)
# -y: Overwrites the output file if it exists (no value required).
ffmpeg -i input_audio.ext -c:a encoder_name -b:a bit_rate -ar sample_rate -ac channels output.ext
# Example: Convert WAV to MP3 (maintaining original quality)
ffmpeg -i input.wav -c:a libmp3lame -q:a 0 output.mp3
# Example: Convert MP3 to WAV (standard 16-bit PCM format)
ffmpeg -i input.mp3 -c:a pcm_s16le -ar 16000 -ac 2 output.wav
# Example: Convert M4A to AAC (extract/convert Apple audio)
ffmpeg -i input.m4a -c:a copy output.aac # Extract without re-encoding
ffmpeg -i input.m4a -c:a aac -b:a 256k output.aac # Re-encode for higher quality
# Example: Convert lossless FLAC to Opus (high compression)
ffmpeg -i input.flac -c:a libopus -b:a 128k -vbr on output.opus
WebSocket vs. HTTP/RESTful API
The speech service uses the WebSocket protocol because it requires full-duplex communication. The WebSocket protocol allows the server and client to exchange data simultaneously, such as sending real-time updates on speech synthesis or recognition progress. In contrast, RESTful APIs, which are based on the HTTP/HTTPS protocol, only support a one-way request-response model initiated by the client. This model does not support real-time interaction.
Troubleshooting
If you receive an error code, refer to Error codes for troubleshooting.
Audio not recognized
Check that the audio format (format) and sample rate (sample_rate) in your request parameters are set correctly and meet the parameter constraints. The following are common errors:
An audio file has a .wav extension but is actually in MP3 format; in this case, setting the format request parameter to wav based on the file extension is incorrect. It should be mp3.
The audio's sample rate is 3600 Hz, but the sample_rate request parameter is incorrectly set to 48000.
You can use the ffprobe tool to check the audio container, encoding, sample rate, and channels:
ffprobe -v error -show_entries format=format_name -show_entries stream=codec_name,sample_rate,channels -of default=noprint_wrappers=1 input.xxx