Alibaba Cloud Model Studio:WebSocket API

Last Updated: Mar 25, 2026

Use the WebSocket protocol to integrate directly with the Fun-ASR real-time speech recognition service from any programming language that supports WebSocket. Higher-level SDKs for Python and Java simplify integration, but the generic protocol offers maximum flexibility.

For model descriptions and selection guidance, see real-time speech recognition - Fun-ASR/Paraformer.

Getting started

Prerequisites

  1. Get an API key. For security, we recommend storing the API key in an environment variable.

  2. Download the sample audio file: asr_example.wav.

Sample code

Node.js

Install the required dependencies:

npm install ws uuid

Use the following sample code:

const fs = require('fs');
const WebSocket = require('ws');
const { v4: uuidv4 } = require('uuid'); // Used to generate a UUID

// API keys for the Singapore and China (Beijing) regions are different. Get an API key: https://www.alibabacloud.com/help/en/model-studio/get-api-key
// If you have not set an environment variable, replace the following line with your Model Studio API key: const apiKey = "sk-xxx"
const apiKey = process.env.DASHSCOPE_API_KEY;
// This is the endpoint for the Singapore region. To use a model in the China (Beijing) region, replace the URL with: wss://dashscope.aliyuncs.com/api-ws/v1/inference/
const url = 'wss://dashscope-intl.aliyuncs.com/api-ws/v1/inference/'; // WebSocket server endpoint
const audioFile = 'asr_example.wav'; // Replace with the path to your audio file

// Generate a 32-character random ID
const TASK_ID = uuidv4().replace(/-/g, '').slice(0, 32);

// Create a WebSocket client
const ws = new WebSocket(url, {
  headers: {
    Authorization: `bearer ${apiKey}`
  }
});

let taskStarted = false; // A flag to indicate whether the task has started

// Once the connection opens, send the run-task command
ws.on('open', () => {
  console.log('Connected to the server');
  sendRunTask();
});

// Handle incoming messages
ws.on('message', (data) => {
  const message = JSON.parse(data);
  switch (message.header.event) {
    case 'task-started':
      console.log('Task started');
      taskStarted = true;
      sendAudioStream();
      break;
    case 'result-generated':
      console.log('Recognition result:', message.payload.output.sentence.text);
      if (message.payload.usage) {
        console.log('Billable duration (seconds):', message.payload.usage.duration);
      }
      break;
    case 'task-finished':
      console.log('Task finished');
      ws.close();
      break;
    case 'task-failed':
      console.error('Task failed:', message.header.error_message);
      ws.close();
      break;
    default:
      console.log('Unknown event:', message.header.event);
  }
});

// On close, report whether the task ever started
ws.on('close', () => {
  if (!taskStarted) {
    console.error('Connection closed before the task started.');
  }
});

// Send the run-task command
function sendRunTask() {
  const runTaskMessage = {
    header: {
      action: 'run-task',
      task_id: TASK_ID,
      streaming: 'duplex'
    },
    payload: {
      task_group: 'audio',
      task: 'asr',
      function: 'recognition',
      model: 'fun-asr-realtime',
      parameters: {
        sample_rate: 16000,
        format: 'wav'
      },
      input: {}
    }
  };
  ws.send(JSON.stringify(runTaskMessage));
}

// Send the audio stream
function sendAudioStream() {
  // Read in 1 KB chunks so each send matches the ~1 KB / 100 ms pacing used by the other samples
  const audioStream = fs.createReadStream(audioFile, { highWaterMark: 1024 });
  let chunkCount = 0;

  function sendNextChunk() {
    const chunk = audioStream.read();
    if (chunk) {
      ws.send(chunk);
      chunkCount++;
      setTimeout(sendNextChunk, 100); // Send a chunk every 100 ms
    }
  }

  audioStream.on('readable', () => {
    sendNextChunk();
  });

  audioStream.on('end', () => {
    console.log('Audio stream ended');
    sendFinishTask();
  });

  audioStream.on('error', (err) => {
    console.error('Error reading the audio file:', err);
    ws.close();
  });
}

// Send the finish-task command
function sendFinishTask() {
  const finishTaskMessage = {
    header: {
      action: 'finish-task',
      task_id: TASK_ID,
      streaming: 'duplex'
    },
    payload: {
      input: {}
    }
  };
  ws.send(JSON.stringify(finishTaskMessage));
}

// Handle errors
ws.on('error', (error) => {
  console.error('WebSocket error:', error);
});

C#

Use the following sample code:

using System.Net.WebSockets;
using System.Text;
using System.Text.Json;
using System.Text.Json.Nodes;

class Program {
    private static ClientWebSocket _webSocket = new ClientWebSocket();
    private static CancellationTokenSource _cancellationTokenSource = new CancellationTokenSource();
    private static bool _taskStartedReceived = false;
    private static bool _taskFinishedReceived = false;
    // API keys for the Singapore and China (Beijing) regions are different. Get an API key: https://www.alibabacloud.com/help/en/model-studio/get-api-key
    // If you have not set an environment variable, replace the following line with your Model Studio API key: private static readonly string ApiKey = "sk-xxx"
    private static readonly string ApiKey = Environment.GetEnvironmentVariable("DASHSCOPE_API_KEY") ?? throw new InvalidOperationException("DASHSCOPE_API_KEY environment variable is not set.");

    // This is the endpoint for the Singapore region. To use a model in the China (Beijing) region, replace the URL with: wss://dashscope.aliyuncs.com/api-ws/v1/inference/
    private const string WebSocketUrl = "wss://dashscope-intl.aliyuncs.com/api-ws/v1/inference/";
    // Replace with the path to your audio file
    private const string AudioFilePath = "asr_example.wav";

    static async Task Main(string[] args) {
        // Establish a WebSocket connection and configure headers for authentication.
        _webSocket.Options.SetRequestHeader("Authorization", $"bearer {ApiKey}");

        await _webSocket.ConnectAsync(new Uri(WebSocketUrl), _cancellationTokenSource.Token);

        // Start a task to receive WebSocket messages asynchronously.
        var receiveTask = ReceiveMessagesAsync();

        // Send the run-task command.
        string _taskId = Guid.NewGuid().ToString("N"); // Generate a 32-character random ID.
        var runTaskJson = GenerateRunTaskJson(_taskId);
        await SendAsync(runTaskJson);

        // Wait for the task-started event.
        while (!_taskStartedReceived) {
            await Task.Delay(100, _cancellationTokenSource.Token);
        }

        // Read the local file and stream the audio to the server for recognition.
        await SendAudioStreamAsync(AudioFilePath);

        // Send the finish-task command to end the task.
        var finishTaskJson = GenerateFinishTaskJson(_taskId);
        await SendAsync(finishTaskJson);

        // Wait for the task-finished event.
        while (!_taskFinishedReceived && !_cancellationTokenSource.IsCancellationRequested) {
            try {
                await Task.Delay(100, _cancellationTokenSource.Token);
            } catch (OperationCanceledException) {
                // The task was canceled, so exit the loop.
                break;
            }
        }

        // Close the connection.
        if (!_cancellationTokenSource.IsCancellationRequested) {
            await _webSocket.CloseAsync(WebSocketCloseStatus.NormalClosure, "Closing", _cancellationTokenSource.Token);
        }

        _cancellationTokenSource.Cancel();
        try {
            await receiveTask;
        } catch (OperationCanceledException) {
            // Ignore operation canceled exceptions.
        }
    }

    private static async Task ReceiveMessagesAsync() {
        try {
            while (_webSocket.State == WebSocketState.Open && !_cancellationTokenSource.IsCancellationRequested) {
                var message = await ReceiveMessageAsync(_cancellationTokenSource.Token);
                if (message != null) {
                    var eventValue = message["header"]?["event"]?.GetValue<string>();
                    switch (eventValue) {
                        case "task-started":
                            Console.WriteLine("Task started successfully.");
                            _taskStartedReceived = true;
                            break;
                        case "result-generated":
                            Console.WriteLine($"Recognition result: {message["payload"]?["output"]?["sentence"]?["text"]?.GetValue<string>()}");
                            if (message["payload"]?["usage"] != null && message["payload"]?["usage"]?["duration"] != null) {
                                Console.WriteLine($"Billable duration (seconds): {message["payload"]?["usage"]?["duration"]?.GetValue<int>()}");
                            }
                            break;
                        case "task-finished":
                            Console.WriteLine("Task finished.");
                            _taskFinishedReceived = true;
                            _cancellationTokenSource.Cancel();
                            break;
                        case "task-failed":
                            Console.WriteLine($"Task failed: {message["header"]?["error_message"]?.GetValue<string>()}");
                            _cancellationTokenSource.Cancel();
                            break;
                    }
                }
            }
        } catch (OperationCanceledException) {
            // Ignore operation canceled exceptions.
        }
    }

    private static async Task<JsonNode?> ReceiveMessageAsync(CancellationToken cancellationToken) {
        var buffer = new byte[1024 * 4];
        using var messageStream = new MemoryStream();
        WebSocketReceiveResult result;

        // A single JSON event can span multiple WebSocket frames, so read until EndOfMessage.
        do {
            result = await _webSocket.ReceiveAsync(new ArraySegment<byte>(buffer), cancellationToken);
            if (result.MessageType == WebSocketMessageType.Close) {
                await _webSocket.CloseAsync(WebSocketCloseStatus.NormalClosure, "Closing", cancellationToken);
                return null;
            }
            messageStream.Write(buffer, 0, result.Count);
        } while (!result.EndOfMessage);

        var message = Encoding.UTF8.GetString(messageStream.ToArray());
        return JsonNode.Parse(message);
    }

    private static async Task SendAsync(string message) {
        var buffer = Encoding.UTF8.GetBytes(message);
        var segment = new ArraySegment<byte>(buffer);
        await _webSocket.SendAsync(segment, WebSocketMessageType.Text, true, _cancellationTokenSource.Token);
    }

    private static async Task SendAudioStreamAsync(string filePath) {
        using (var audioStream = File.OpenRead(filePath)) {
            var buffer = new byte[1024]; // Create a buffer for audio chunks.
            int bytesRead;

            while ((bytesRead = await audioStream.ReadAsync(buffer, 0, buffer.Length)) > 0) {
                var segment = new ArraySegment<byte>(buffer, 0, bytesRead);
                await _webSocket.SendAsync(segment, WebSocketMessageType.Binary, true, _cancellationTokenSource.Token);
                await Task.Delay(100); // Wait 100 ms between chunks.
            }
        }
    }

    private static string GenerateRunTaskJson(string taskId) {
        var runTask = new JsonObject {
            ["header"] = new JsonObject {
                ["action"] = "run-task",
                ["task_id"] = taskId,
                ["streaming"] = "duplex"
            },
            ["payload"] = new JsonObject {
                ["task_group"] = "audio",
                ["task"] = "asr",
                ["function"] = "recognition",
                ["model"] = "fun-asr-realtime",
                ["parameters"] = new JsonObject {
                    ["format"] = "wav",
                    ["sample_rate"] = 16000,
                },
                ["input"] = new JsonObject()
            }
        };
        return JsonSerializer.Serialize(runTask);
    }

    private static string GenerateFinishTaskJson(string taskId) {
        var finishTask = new JsonObject {
            ["header"] = new JsonObject {
                ["action"] = "finish-task",
                ["task_id"] = taskId,
                ["streaming"] = "duplex"
            },
            ["payload"] = new JsonObject {
                ["input"] = new JsonObject()
            }
        };
        return JsonSerializer.Serialize(finishTask);
    }
}

PHP

The sample code uses the following directory structure:

my-php-project/
├── composer.json
├── vendor/
└── index.php

Use the following content for the composer.json file. You can specify the versions of the dependencies based on your needs.

{
    "require": {
        "react/event-loop": "^1.3",
        "react/socket": "^1.11",
        "react/stream": "^1.2",
        "react/http": "^1.1",
        "ratchet/pawl": "^0.4"
    },
    "autoload": {
        "psr-4": {
            "App\\": "src/"
        }
    }
}

The index.php file contains the following code:

<?php

require __DIR__ . '/vendor/autoload.php';

use Ratchet\Client\Connector;
use React\EventLoop\Loop;
use React\Socket\Connector as SocketConnector;
use Ratchet\rfc6455\Messaging\Frame;

// API keys for the Singapore and China (Beijing) regions are different. Get an API key: https://www.alibabacloud.com/help/en/model-studio/get-api-key
// If you have not set an environment variable, replace the following line with your Model Studio API key: $api_key = "sk-xxx"
$api_key = getenv("DASHSCOPE_API_KEY");
// This is the endpoint for the Singapore region. To use a model in the China (Beijing) region, replace the URL with: wss://dashscope.aliyuncs.com/api-ws/v1/inference/
$websocket_url = 'wss://dashscope-intl.aliyuncs.com/api-ws/v1/inference/';
$audio_file_path = 'asr_example.wav'; // Replace with the path to your audio file

$loop = Loop::get();

// Create a custom connector.
$socketConnector = new SocketConnector($loop, [
    'tcp' => [
        'bindto' => '0.0.0.0:0',
    ],
    'tls' => [
        // Disabling certificate verification is for local testing only. Keep verification enabled in production.
        'verify_peer' => false,
        'verify_peer_name' => false,
    ],
]);

$connector = new Connector($loop, $socketConnector);

$headers = [
    'Authorization' => 'bearer ' . $api_key
];

$connector($websocket_url, [], $headers)->then(function ($conn) use ($loop, $audio_file_path) {
    echo "Connected to the WebSocket server\n";

    // Generate a task ID. The finish-task command must reuse this same ID.
    $taskId = generateTaskId();

    // Start listening for WebSocket messages asynchronously.
    $conn->on('message', function($msg) use ($conn, $loop, $audio_file_path, $taskId) {
        $response = json_decode($msg, true);

        if (isset($response['header']['event'])) {
            handleEvent($conn, $response, $loop, $audio_file_path, $taskId);
        } else {
            echo "Unknown message format\n";
        }
    });

    // Listen for the connection close event.
    $conn->on('close', function($code = null, $reason = null) {
        echo "Connection closed\n";
        if ($code !== null) {
            echo "Close code: " . $code . "\n";
        }
        if ($reason !== null) {
            echo "Close reason: " . $reason . "\n";
        }
    });

    // Send the run-task command.
    sendRunTaskMessage($conn, $taskId);

}, function ($e) {
    echo "Could not connect: {$e->getMessage()}\n";
});

$loop->run();

/**
 * Generate a task ID.
 * @return string
 */
function generateTaskId(): string {
    return bin2hex(random_bytes(16));
}

/**
 * Send the run-task command.
 * @param $conn
 * @param $taskId
 */
function sendRunTaskMessage($conn, $taskId) {
    $runTaskMessage = json_encode([
        "header" => [
            "action" => "run-task",
            "task_id" => $taskId,
            "streaming" => "duplex"
        ],
        "payload" => [
            "task_group" => "audio",
            "task" => "asr",
            "function" => "recognition",
            "model" => "fun-asr-realtime",
            "parameters" => [
                "format" => "wav",
                "sample_rate" => 16000
            ],
            "input" => []
        ]
    ]);
    echo "Preparing to send the run-task command: " . $runTaskMessage . "\n";
    $conn->send($runTaskMessage);
    echo "run-task command sent\n";
}

/**
 * Read the audio file.
 * @param string $filePath
 * @return bool|string
 */
function readAudioFile(string $filePath) {
    $voiceData = file_get_contents($filePath);
    if ($voiceData === false) {
        echo "Failed to read the audio file\n";
    }
    return $voiceData;
}

/**
 * Split the audio data into chunks.
 * @param string $data
 * @param int $chunkSize
 * @return array
 */
function splitAudioData(string $data, int $chunkSize): array {
    return str_split($data, $chunkSize);
}

/**
 * Send the finish-task command.
 * @param $conn
 * @param $taskId
 */
function sendFinishTaskMessage($conn, $taskId) {
    $finishTaskMessage = json_encode([
        "header" => [
            "action" => "finish-task",
            "task_id" => $taskId,
            "streaming" => "duplex"
        ],
        "payload" => [
            "input" => []
        ]
    ]);
    echo "Preparing to send the finish-task command: " . $finishTaskMessage . "\n";
    $conn->send($finishTaskMessage);
    echo "finish-task command sent\n";
}

/**
 * Handle events.
 * @param $conn
 * @param $response
 * @param $loop
 * @param $audio_file_path
 * @param string $taskId The task ID used in the run-task command.
 */
function handleEvent($conn, $response, $loop, $audio_file_path, $taskId) {
    static $chunks;
    static $allChunksSent = false;

    switch ($response['header']['event']) {
        case 'task-started':
            echo "Task started, sending audio data...\n";
            // Read the audio file.
            $voiceData = readAudioFile($audio_file_path);
            if ($voiceData === false) {
                echo "Failed to read the audio file\n";
                $conn->close();
                return;
            }

            // Split the audio data into chunks.
            $chunks = splitAudioData($voiceData, 1024);

            // Define the sending function.
            $sendChunk = function() use ($conn, &$chunks, $loop, &$sendChunk, &$allChunksSent, $taskId) {
                if (!empty($chunks)) {
                    $chunk = array_shift($chunks);
                    $binaryMsg = new Frame($chunk, true, Frame::OP_BINARY);
                    $conn->send($binaryMsg);
                    // Send the next chunk after 100 ms.
                    $loop->addTimer(0.1, $sendChunk);
                } else {
                    echo "All chunks have been sent\n";
                    $allChunksSent = true;

                    // Send the finish-task command.
                    sendFinishTaskMessage($conn, $taskId);
                }
            };

            // Start sending audio data.
            $sendChunk();
            break;
        case 'result-generated':
            $result = $response['payload']['output']['sentence'];
            echo "Recognition result: " . $result['text'] . "\n";
            if (isset($response['payload']['usage']['duration'])) {
                echo "Billable duration (seconds): " . $response['payload']['usage']['duration'] . "\n";
            }
            break;
        case 'task-finished':
            echo "Task finished\n";
            $conn->close();
            break;
        case 'task-failed':
            echo "Task failed\n";
            echo "Error code: " . $response['header']['error_code'] . "\n";
            echo "Error message: " . $response['header']['error_message'] . "\n";
            $conn->close();
            break;
        case 'error':
            echo "Error: " . $response['payload']['message'] . "\n";
            break;
        default:
            echo "Unknown event: " . $response['header']['event'] . "\n";
            break;
    }

}

Go

package main

import (
	"encoding/json"
	"fmt"
	"io"
	"log"
	"net/http"
	"os"
	"time"

	"github.com/google/uuid"
	"github.com/gorilla/websocket"
)

const (
	// This is the endpoint for the Singapore region. To use a model in the China (Beijing) region, replace the URL with: wss://dashscope.aliyuncs.com/api-ws/v1/inference/
	wsURL     = "wss://dashscope-intl.aliyuncs.com/api-ws/v1/inference/" // WebSocket server endpoint
	audioFile = "asr_example.wav"                                   // Replace with the path to your audio file
)

var dialer = websocket.DefaultDialer

func main() {
	// API keys for the Singapore and China (Beijing) regions are different. Get an API key: https://www.alibabacloud.com/help/en/model-studio/get-api-key
	// If you have not set an environment variable, replace the following line with your Model Studio API key: apiKey := "sk-xxx"
	apiKey := os.Getenv("DASHSCOPE_API_KEY")

	// Connect to the WebSocket service.
	conn, err := connectWebSocket(apiKey)
	if err != nil {
		log.Fatal("Failed to connect to WebSocket: ", err)
	}
	defer closeConnection(conn)

	// Start a goroutine to receive results.
	taskStarted := make(chan bool)
	taskDone := make(chan bool)
	startResultReceiver(conn, taskStarted, taskDone)

	// Send the run-task command.
	taskID, err := sendRunTaskCmd(conn)
	if err != nil {
		log.Fatal("Failed to send run-task command: ", err)
	}

	// Wait for the task-started event.
	waitForTaskStarted(taskStarted)

	// Send the audio stream for recognition.
	if err := sendAudioData(conn); err != nil {
		log.Fatal("Failed to send audio: ", err)
	}

	// Send the finish-task command.
	if err := sendFinishTaskCmd(conn, taskID); err != nil {
		log.Fatal("Failed to send finish-task command: ", err)
	}

	// Wait for the task to finish or fail.
	<-taskDone
}

// Define structs to represent the JSON data.
type Header struct {
	Action       string                 `json:"action"`
	TaskID       string                 `json:"task_id"`
	Streaming    string                 `json:"streaming"`
	Event        string                 `json:"event"`
	ErrorCode    string                 `json:"error_code,omitempty"`
	ErrorMessage string                 `json:"error_message,omitempty"`
	Attributes   map[string]interface{} `json:"attributes"`
}

type Output struct {
	Sentence struct {
		BeginTime int64  `json:"begin_time"`
		EndTime   *int64 `json:"end_time"`
		Text      string `json:"text"`
		Words     []struct {
			BeginTime   int64  `json:"begin_time"`
			EndTime     *int64 `json:"end_time"`
			Text        string `json:"text"`
			Punctuation string `json:"punctuation"`
		} `json:"words"`
	} `json:"sentence"`
}

type Payload struct {
	TaskGroup  string `json:"task_group"`
	Task       string `json:"task"`
	Function   string `json:"function"`
	Model      string `json:"model"`
	Parameters Params `json:"parameters"`
	Input      Input  `json:"input"`
	Output     Output `json:"output,omitempty"`
	Usage      *struct {
		Duration int `json:"duration"`
	} `json:"usage,omitempty"`
}

type Params struct {
	Format                   string `json:"format"`
	SampleRate               int    `json:"sample_rate"`
	DisfluencyRemovalEnabled bool   `json:"disfluency_removal_enabled"`
}

type Input struct {
}

type Event struct {
	Header  Header  `json:"header"`
	Payload Payload `json:"payload"`
}

// Connect to the WebSocket service.
func connectWebSocket(apiKey string) (*websocket.Conn, error) {
	header := make(http.Header)
	header.Add("Authorization", fmt.Sprintf("bearer %s", apiKey))
	conn, _, err := dialer.Dial(wsURL, header)
	return conn, err
}

// Start a goroutine to asynchronously receive WebSocket messages.
func startResultReceiver(conn *websocket.Conn, taskStarted chan<- bool, taskDone chan<- bool) {
	go func() {
		for {
			_, message, err := conn.ReadMessage()
			if err != nil {
				log.Println("Failed to read server message: ", err)
				return
			}
			var event Event
			err = json.Unmarshal(message, &event)
			if err != nil {
				log.Println("Failed to parse event: ", err)
				continue
			}
			if handleEvent(conn, event, taskStarted, taskDone) {
				return
			}
		}
	}()
}

// Send the run-task command.
func sendRunTaskCmd(conn *websocket.Conn) (string, error) {
	runTaskCmd, taskID, err := generateRunTaskCmd()
	if err != nil {
		return "", err
	}
	err = conn.WriteMessage(websocket.TextMessage, []byte(runTaskCmd))
	return taskID, err
}

// Generate the run-task command.
func generateRunTaskCmd() (string, string, error) {
	taskID := uuid.New().String()
	runTaskCmd := Event{
		Header: Header{
			Action:    "run-task",
			TaskID:    taskID,
			Streaming: "duplex",
		},
		Payload: Payload{
			TaskGroup: "audio",
			Task:      "asr",
			Function:  "recognition",
			Model:     "fun-asr-realtime",
			Parameters: Params{
				Format:     "wav",
				SampleRate: 16000,
			},
			Input: Input{},
		},
	}
	runTaskCmdJSON, err := json.Marshal(runTaskCmd)
	return string(runTaskCmdJSON), taskID, err
}

// Wait for the task-started event.
func waitForTaskStarted(taskStarted chan bool) {
	select {
	case <-taskStarted:
		fmt.Println("Task started successfully.")
	case <-time.After(10 * time.Second):
		log.Fatal("Timeout waiting for task-started event; the task failed to start.")
	}
}

// Send the audio data.
func sendAudioData(conn *websocket.Conn) error {
	file, err := os.Open(audioFile)
	if err != nil {
		return err
	}
	defer file.Close()

	buf := make([]byte, 1024)
	for {
		n, err := file.Read(buf)
		if n == 0 {
			break
		}
		if err != nil && err != io.EOF {
			return err
		}
		err = conn.WriteMessage(websocket.BinaryMessage, buf[:n])
		if err != nil {
			return err
		}
		time.Sleep(100 * time.Millisecond)
	}
	return nil
}

// Send the finish-task command.
func sendFinishTaskCmd(conn *websocket.Conn, taskID string) error {
	finishTaskCmd, err := generateFinishTaskCmd(taskID)
	if err != nil {
		return err
	}
	err = conn.WriteMessage(websocket.TextMessage, []byte(finishTaskCmd))
	return err
}

// Generate the finish-task command.
func generateFinishTaskCmd(taskID string) (string, error) {
	finishTaskCmd := Event{
		Header: Header{
			Action:    "finish-task",
			TaskID:    taskID,
			Streaming: "duplex",
		},
		Payload: Payload{
			Input: Input{},
		},
	}
	finishTaskCmdJSON, err := json.Marshal(finishTaskCmd)
	return string(finishTaskCmdJSON), err
}

// Handle events.
func handleEvent(conn *websocket.Conn, event Event, taskStarted chan<- bool, taskDone chan<- bool) bool {
	switch event.Header.Event {
	case "task-started":
		fmt.Println("Received task-started event.")
		taskStarted <- true
	case "result-generated":
		if event.Payload.Output.Sentence.Text != "" {
			fmt.Println("Recognition result:", event.Payload.Output.Sentence.Text)
		}
		if event.Payload.Usage != nil {
			fmt.Println("Billable duration (seconds):", event.Payload.Usage.Duration)
		}
	case "task-finished":
		fmt.Println("Task finished.")
		taskDone <- true
		return true
	case "task-failed":
		handleTaskFailed(event, conn)
		taskDone <- true
		return true
	default:
		log.Printf("Unexpected event: %v", event)
	}
	return false
}

// Handle the task-failed event. Use log.Printf instead of log.Fatal so that
// the deferred connection cleanup in main still runs.
func handleTaskFailed(event Event, conn *websocket.Conn) {
	if event.Header.ErrorMessage != "" {
		log.Printf("Task failed: %s", event.Header.ErrorMessage)
	} else {
		log.Println("Task failed due to an unknown reason.")
	}
}

// Close the connection.
func closeConnection(conn *websocket.Conn) {
	if conn != nil {
		conn.Close()
	}
}

Key concepts

Interaction sequence

The client and server follow a strict interaction sequence to ensure proper task execution.

  1. Establish a connection: The client initiates a WebSocket connection request to the server and includes authentication information in the request headers.

  2. Start the task: After the connection is established, the client sends a run-task command to specify the model and audio parameters to use.

  3. Confirm the task: The server returns a task-started event, indicating it is ready to receive audio.

  4. Transfer data:

    • The client continuously sends binary audio.

    • During the recognition process, the server returns multiple result-generated events in real time, containing intermediate and final recognition results.

  5. End the task: After sending all audio, the client sends a finish-task command.

  6. Confirm task completion: After processing any remaining audio, the server returns a task-finished event, signaling that the task completed successfully.

  7. Close the connection: The client or server closes the WebSocket connection.
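
The condensed Node.js sketch below maps these steps onto code. For brevity it sends the whole file in a single binary frame and omits error handling; production code should pace audio in small chunks, as in the full samples above.

const fs = require('fs');
const crypto = require('crypto');
const WebSocket = require('ws');

const taskId = crypto.randomUUID().replace(/-/g, ''); // 32-character task ID
const ws = new WebSocket('wss://dashscope-intl.aliyuncs.com/api-ws/v1/inference/', {
  headers: { Authorization: `bearer ${process.env.DASHSCOPE_API_KEY}` } // Step 1: connect and authenticate
});

ws.on('open', () => ws.send(JSON.stringify({ // Step 2: start the task
  header: { action: 'run-task', task_id: taskId, streaming: 'duplex' },
  payload: {
    task_group: 'audio', task: 'asr', function: 'recognition',
    model: 'fun-asr-realtime',
    parameters: { format: 'wav', sample_rate: 16000 },
    input: {}
  }
})));

ws.on('message', (data) => {
  const { header, payload } = JSON.parse(data);
  if (header.event === 'task-started') {            // Step 3: server is ready
    ws.send(fs.readFileSync('asr_example.wav'));    // Step 4: send binary audio (single frame for brevity)
    ws.send(JSON.stringify({                        // Step 5: end the task
      header: { action: 'finish-task', task_id: taskId, streaming: 'duplex' },
      payload: { input: {} }
    }));
  } else if (header.event === 'result-generated') { // Step 4: real-time results
    console.log(payload.output.sentence.text);
  } else if (header.event === 'task-finished' || header.event === 'task-failed') {
    ws.close();                                     // Steps 6-7: close the connection
  }
});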

Audio stream specifications

  • Channels: The binary audio sent to the server must be mono.

  • Format and encoding: Supported formats include pcm, wav, mp3, opus, speex, aac, and amr.

    • WAV files must be PCM-encoded.

    • Opus or Speex files must be encapsulated in an Ogg container.

    • For the amr format, only the AMR-NB type is supported.

  • Sample rate: The sample rate must match the sample_rate parameter specified in the run-task command and the requirements of the selected model.

Available models

International

In the international deployment mode, endpoints and data storage are in the Singapore region. Model inference compute resources are dynamically scheduled globally, excluding Chinese Mainland.

| Model | Version | Unit price | Free quota |
| --- | --- | --- | --- |
| fun-asr-realtime (currently fun-asr-realtime-2025-11-07) | Stable | $0.00009/second | 36,000 seconds (10 hours), valid for 90 days |
| fun-asr-realtime-2025-11-07 | Snapshot | $0.00009/second | 36,000 seconds (10 hours), valid for 90 days |

  • Languages supported: Mandarin, Cantonese, Wu, Minnan, Hakka, Gan, Xiang, and Jin. Also supports Mandarin accents from Zhongyuan, Southwest, Jilu, Jianghuai, Lanyin, Jiaoliao, Northeast, Beijing, and Hong Kong–Taiwan regions—including Henan, Shaanxi, Hubei, Sichuan, Chongqing, Yunnan, Guizhou, Guangdong, Guangxi, Hebei, Tianjin, Shandong, Anhui, Nanjing, Jiangsu, Hangzhou, Gansu, and Ningxia. Also supports English and Japanese.

  • Sample rates supported: 16 kHz

  • Audio formats supported: pcm, wav, mp3, opus, speex, aac, amr

Chinese Mainland

In the Chinese Mainland deployment mode, endpoints and data storage are in the Beijing region. Model inference compute resources are limited to Chinese Mainland.

| Model | Version | Unit price | Free quota |
| --- | --- | --- | --- |
| fun-asr-realtime (currently fun-asr-realtime-2025-11-07) | Stable | $0.000047/second | No free quota |
| fun-asr-realtime-2026-02-28 | Snapshot | $0.000047/second | No free quota |
| fun-asr-realtime-2025-11-07 | Snapshot | $0.000047/second | No free quota |
| fun-asr-realtime-2025-09-15 | Snapshot | $0.000047/second | No free quota |
| fun-asr-flash-8k-realtime (currently fun-asr-flash-8k-realtime-2026-01-28) | Stable | $0.000032/second | No free quota |
| fun-asr-flash-8k-realtime-2026-01-28 | Snapshot | $0.000032/second | No free quota |

  • Languages supported:

    • fun-asr-realtime, fun-asr-realtime-2026-02-28, fun-asr-realtime-2025-11-07: Chinese (Mandarin, Cantonese, Wu, Minnan, Hakka, Gan, Xiang, and Jin. Also supports Mandarin accents from Zhongyuan, Southwest, Jilu, Jianghuai, Lanyin, Jiaoliao, Northeast, Beijing, and Hong Kong–Taiwan regions—including Henan, Shaanxi, Hubei, Sichuan, Chongqing, Yunnan, Guizhou, Guangdong, Guangxi, Hebei, Tianjin, Shandong, Anhui, Nanjing, Jiangsu, Hangzhou, Gansu, and Ningxia), English, and Japanese.

    • fun-asr-realtime-2025-09-15: Chinese (Mandarin), English

  • Sample rates supported:

    • fun-asr-flash-8k-realtime and fun-asr-flash-8k-realtime-2026-01-28: 8 kHz

    • All other models: 16 kHz

  • Audio formats supported: pcm, wav, mp3, opus, speex, aac, amr

API reference

Endpoint

  • Singapore: wss://dashscope-intl.aliyuncs.com/api-ws/v1/inference/

  • China (Beijing): wss://dashscope.aliyuncs.com/api-ws/v1/inference/

Request headers

  • Authorization (string, required): The authentication token, in the format Bearer <your_api_key>. Replace <your_api_key> with your actual API key.

  • user-agent (string, optional): The client identifier. It helps the server track the request source.

  • X-DashScope-WorkSpace (string, optional): The Model Studio workspace ID.

  • X-DashScope-DataInspection (string, optional): Specifies whether to enable the data compliance check. Default: enable. Do not enable this parameter unless necessary.
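
For example, a Node.js client sets these headers when opening the connection (the optional header values below are placeholders):

const WebSocket = require('ws');

const ws = new WebSocket('wss://dashscope-intl.aliyuncs.com/api-ws/v1/inference/', {
  headers: {
    Authorization: `bearer ${process.env.DASHSCOPE_API_KEY}`, // required
    'user-agent': 'my-asr-client/1.0',                        // optional; placeholder identifier
    'X-DashScope-WorkSpace': '<your_workspace_id>',           // optional; placeholder workspace ID
    'X-DashScope-DataInspection': 'enable'                    // optional
  }
});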

Commands (client to server)

Commands are JSON-formatted text messages sent from the client to manage a speech recognition task.

1. run-task command: Start a task

Purpose: After establishing a connection, send this command to start a speech recognition task and configure its parameters.

Example:

{
    "header": {
        "action": "run-task",
        "task_id": "2bf83b9a-baeb-4fda-8d9a-xxxxxxxxxxxx",
        "streaming": "duplex"
    },
    "payload": {
        "task_group": "audio",
        "task": "asr",
        "function": "recognition",
        "model": "fun-asr-realtime",
        "parameters": {
            "format": "pcm",
            "sample_rate": 16000
        },
        "input": {}
    }
}

header parameters:

  • header.action (string, required): The command type. Must be run-task.

  • header.task_id (string, required): A unique identifier for the task. The subsequent finish-task command must use the same task_id.

  • header.streaming (string, required): The communication mode. Must be duplex.

payload parameters:

  • payload.task_group (string, required): The task group. Must be audio.

  • payload.task (string, required): The task type. Must be asr.

  • payload.function (string, required): The function type. Must be recognition.

  • payload.model (string, required): The model to use. For details, see the model list.

  • payload.input (object, required): The input configuration. Must be an empty object, {}.

payload.parameters:

  • format (string, required): The audio format. Supported formats include pcm, wav, mp3, opus, speex, aac, and amr. For detailed constraints, see Audio stream specifications.

  • sample_rate (integer, required): The audio sample rate in Hz. The fun-asr-realtime models require 16000 Hz; the fun-asr-flash-8k-realtime models require 8000 Hz.

  • semantic_punctuation_enabled (boolean, optional): Specifies whether to enable semantic punctuation. Default value: false.

    • true: Enables semantic punctuation and disables sentence splitting based on Voice Activity Detection (VAD). Suitable for meeting transcription; provides high accuracy.

    • false: Uses VAD to split sentences and disables semantic punctuation. Suitable for low-latency interactive scenarios.

    Semantic punctuation provides more precise sentence boundaries, while VAD offers a faster response. Adjust this parameter to switch between segmentation methods based on your use case.

  • max_sentence_silence (integer, optional): The silence duration threshold for VAD segmentation: a silence period longer than this value marks the end of a sentence. Unit: milliseconds (ms). Default value: 1300. Valid range: [200, 6000]. Takes effect only when semantic_punctuation_enabled is false (VAD segmentation).

  • multi_threshold_mode_enabled (boolean, optional): Specifies whether to limit the segment length of VAD segmentation to prevent excessively long sentences. Default value: false (disabled). Takes effect only when semantic_punctuation_enabled is false (VAD segmentation).

  • heartbeat (boolean, optional): Specifies whether to maintain a persistent connection. Default value: false.

    • true: The connection to the server remains active while silent audio is continuously sent.

    • false: Even if silent audio is continuously sent, the connection is closed after 60 seconds due to a timeout.

    Note: Silent audio is any part of an audio file or stream with no audible signal. You can create silent audio using audio editing software (such as Audacity or Adobe Audition) or command-line tools (such as FFmpeg).

  • language_hints (array[string], optional): The language codes for recognition. If the language is unknown in advance, leave this parameter unset and the model identifies it automatically. The system reads only the first value in the array and ignores all other values.

    Supported language codes by model:

    • fun-asr-realtime, fun-asr-realtime-2025-11-07: zh (Chinese), en (English), ja (Japanese)

    • fun-asr-realtime-2025-09-15: zh (Chinese), en (English)

  • speech_noise_threshold (float, optional): Adjusts the speech-noise detection threshold to control VAD sensitivity. Range: [-1.0, 1.0].

    • Near -1: Lowers the noise threshold; more noise may be transcribed as speech.

    • Near +1: Raises the noise threshold; some speech may be filtered out as noise.

    Important: This is an advanced parameter, and adjustments can significantly affect recognition quality. Test thoroughly before adjusting, and make small adjustments (step size 0.1) based on your audio environment.
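
For illustration, a run-task command that tunes VAD segmentation might look like the following Node.js snippet. It assumes the ws connection and taskId from the samples above; the parameter values are examples, not recommendations.

const runTaskMessage = {
  header: { action: 'run-task', task_id: taskId, streaming: 'duplex' },
  payload: {
    task_group: 'audio',
    task: 'asr',
    function: 'recognition',
    model: 'fun-asr-realtime',
    parameters: {
      format: 'pcm',
      sample_rate: 16000,
      semantic_punctuation_enabled: false, // keep VAD-based sentence splitting
      max_sentence_silence: 800,           // end a sentence after 800 ms of silence
      multi_threshold_mode_enabled: true,  // cap overly long VAD segments
      language_hints: ['zh']               // only the first value is read
    },
    input: {}
  }
};
ws.send(JSON.stringify(runTaskMessage));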

2. finish-task command: End a task

Purpose: After sending all audio data, the client sends this command to signal that the transmission is complete.

Example:

{
    "header": {
        "action": "finish-task",
        "task_id": "2bf83b9a-baeb-4fda-8d9a-xxxxxxxxxxxx",
        "streaming": "duplex"
    },
    "payload": {
        "input": {}
    }
}

header parameters:

  • header.action (string, required): The command type. Must be finish-task.

  • header.task_id (string, required): The task ID. This must match the task_id from the run-task command.

  • header.streaming (string, required): The communication mode. Must be duplex.

payload parameters:

  • payload.input (object, required): The input configuration. Must be an empty object, {}.

Events (server to client)

Events are JSON-formatted text messages sent from the server to provide the client with task status updates and recognition results.

1. task-started

Trigger: Occurs after the server successfully processes a run-task command.
Purpose: Notifies the client that the task has started and that the client can begin sending audio data.

Example:

{
    "header": {
        "task_id": "2bf83b9a-baeb-4fda-8d9a-xxxxxxxxxxxx",
        "event": "task-started",
        "attributes": {}
    },
    "payload": {}
}

header parameters:

  • header.event (string): The event type. Must be task-started.

  • header.task_id (string): The task ID.

2. result-generated

Trigger: Occurs during the recognition process whenever the server generates a new result.
Purpose: Returns real-time recognition results, including both intermediate and final sentence results.

Example:

{
  "header": {
    "task_id": "2bf83b9a-baeb-4fda-8d9a-xxxxxxxxxxxx",
    "event": "result-generated",
    "attributes": {}
  },
  "payload": {
    "output": {
      "sentence": {
        "begin_time": 170,
        "end_time": 920,
        "text": "Okay, I got it.",
        "heartbeat": false,
        "sentence_end": true,
        "words": [
          {
            "begin_time": 170,
            "end_time": 295,
            "text": "Okay",
            "punctuation": ","
          },
          {
            "begin_time": 295,
            "end_time": 503,
            "text": "I",
            "punctuation": ""
          },
          {
            "begin_time": 503,
            "end_time": 711,
            "text": "got",
            "punctuation": ""
          },
          {
            "begin_time": 711,
            "end_time": 920,
            "text": "it",
            "punctuation": "."
          }
        ]
      }
    },
    "usage": {
      "duration": 3
    }
  }
}

header parameters:

  • header.event (string): The event type. Must be result-generated.

  • header.task_id (string): The task ID.

payload parameters:

  • output (object): Contains the recognition result in the output.sentence object. See below.

  • usage (object | null): Billing information. For intermediate results (payload.output.sentence.sentence_end is false), usage is null. For final sentence results (sentence_end is true), usage.duration indicates the task's billable duration in seconds.

The payload.usage object has the following format:

  • duration (integer): The billing duration for the task, in seconds.

The payload.output.sentence object has the following format:

  • begin_time (integer): The start time of the sentence in milliseconds (ms).

  • end_time (integer | null): The end time of the sentence in milliseconds (ms). This value is null for an intermediate result.

  • text (string): The transcribed text content.

  • words (array): An array of word timestamp objects.

  • heartbeat (boolean | null): If true, you can skip processing this recognition result. Corresponds to the heartbeat parameter in the run-task command.

  • sentence_end (boolean): Indicates whether the current sentence is complete.

The payload.output.sentence.words array contains word timestamp objects, each with the following format:

  • begin_time (integer): The start time of the word in milliseconds (ms).

  • end_time (integer): The end time of the word in milliseconds (ms).

  • text (string): The transcribed word.

  • punctuation (string): The punctuation that follows the word.
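
Putting these fields together, a consumer can separate intermediate from final results roughly as follows (a Node.js sketch; ws is the connected client from the samples above):

ws.on('message', (data) => {
  const message = JSON.parse(data);
  if (message.header.event !== 'result-generated') return;

  const sentence = message.payload.output.sentence;
  if (sentence.heartbeat) return;                    // no new content; safe to skip

  if (sentence.sentence_end) {
    console.log('Final sentence:', sentence.text);   // end_time is set
    if (message.payload.usage) {
      console.log('Billable seconds:', message.payload.usage.duration);
    }
  } else {
    console.log('Partial result:', sentence.text);   // end_time is null here
  }
});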

3. task-finished

Trigger: Occurs after the server receives a finish-task command and processes all cached audio.
Purpose: Signals that the recognition task is complete.

Example:

{
    "header": {
        "task_id": "2bf83b9a-baeb-4fda-8d9a-xxxxxxxxxxxx",
        "event": "task-finished",
        "attributes": {}
    },
    "payload": {
        "output": {}
    }
}

header parameters:

  • header.event (string): The event type. Must be task-finished.

  • header.task_id (string): The task ID.

4. task-failed

Trigger: Occurs if any error is encountered during task processing.
Purpose: Notifies the client that the task has failed and provides an error message. After receiving this event, close the WebSocket connection and handle the error.

Example:

{
    "header": {
        "task_id": "2bf83b9a-baeb-4fda-8d9a-xxxxxxxxxxxx",
        "event": "task-failed",
        "error_code": "CLIENT_ERROR",
        "error_message": "request timeout after 23 seconds.",
        "attributes": {}
    },
    "payload": {}
}

header parameters:

  • header.event (string): The event type. Must be task-failed.

  • header.task_id (string): The task ID.

  • header.error_code (string): The error code.

  • header.error_message (string): A detailed error message.

Connection overhead and connection reuse

The WebSocket service supports connection reuse to improve resource efficiency and avoid connection overhead.

The client starts a task by sending a run-task command and ends it by sending a finish-task command. The server confirms the task's completion by returning a task-finished event, after which the client can reuse the connection to start another task by sending a new run-task command.

Important
  1. Each task on a reused connection must have a unique task_id.

  2. If a task fails, the service returns a task-failed event and closes the connection. This connection cannot be reused.

  3. If the client does not start a new task within 60 seconds after a task finishes, the connection times out.
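
A reuse loop might look like the following Node.js sketch. runTask, streamAudio, and finishTask are hypothetical helpers that wrap the commands shown earlier and resolve when the matching event arrives; they are not part of the API.

const crypto = require('crypto');

// Run several files sequentially over one WebSocket connection.
async function recognizeAll(ws, files) {
  for (const file of files) {
    const taskId = crypto.randomUUID().replace(/-/g, ''); // fresh task_id for every task
    await runTask(ws, taskId);     // hypothetical: send run-task, resolve on task-started
    await streamAudio(ws, file);   // hypothetical: send binary audio in ~1 KB chunks every 100 ms
    await finishTask(ws, taskId);  // hypothetical: send finish-task, resolve on task-finished
    // Start the next run-task within 60 seconds, or the idle connection times out.
  }
  ws.close();
}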

Error codes

For troubleshooting, see error messages.

FAQ

Features

Maintain a long-lived connection

Set the heartbeat request parameter to true and continuously send silent audio to the server.

Note:

  • Silent audio is any part of an audio file or stream with no audible signal.

  • You can create silent audio using audio editing software (such as Audacity or Adobe Audition) or command-line tools (such as FFmpeg).
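
For example, with heartbeat set to true and pcm audio at 16 kHz, a Node.js client might keep the connection alive like this (a sketch; ws is the open connection from the samples above):

// 100 ms of silent 16-bit mono PCM at 16 kHz:
// 16000 samples/s x 2 bytes x 0.1 s = 3200 bytes of zeros.
const SILENCE_100MS = Buffer.alloc(3200);

// While no real audio is available, send silence every 100 ms.
const keepAlive = setInterval(() => ws.send(SILENCE_100MS), 100);

// Call clearInterval(keepAlive) when real audio resumes.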

Convert audio format

You can use the FFmpeg tool. For more information, see the official FFmpeg website.

# Basic conversion command (universal template)
# -i: Specifies the input file path. Example: audio.wav
# -c:a: Specifies the audio encoder. Examples: aac, libmp3lame, pcm_s16le
# -b:a: Specifies the bit rate (audio quality). Examples: 192k, 320k
# -ar: Specifies the sample rate.
# -ac: Specifies the number of channels. Examples: 1 (mono), 2 (stereo)
# -y: Overwrites the output file if it exists (no value required).
ffmpeg -i input_audio.ext -c:a encoder_name -b:a bit_rate -ar sample_rate -ac channels output.ext

# Example: Convert WAV to MP3 (maintaining original quality)
ffmpeg -i input.wav -c:a libmp3lame -q:a 0 output.mp3
# Example: Convert MP3 to WAV (standard 16-bit PCM format)
ffmpeg -i input.mp3 -c:a pcm_s16le -ar 16000 -ac 2 output.wav
# Example: Convert M4A to AAC (extract/convert Apple audio)
ffmpeg -i input.m4a -c:a copy output.aac  # Extract without re-encoding
ffmpeg -i input.m4a -c:a aac -b:a 256k output.aac  # Re-encode for higher quality
# Example: Convert lossless FLAC to Opus (high compression)
ffmpeg -i input.flac -c:a libopus -b:a 128k -vbr on output.opus

WebSocket vs. HTTP/RESTful API

The speech service uses the WebSocket protocol because it requires full-duplex communication. The WebSocket protocol allows the server and client to exchange data simultaneously, such as sending real-time updates on speech synthesis or recognition progress. In contrast, RESTful APIs, which are based on the HTTP/HTTPS protocol, only support a one-way request-response model initiated by the client. This model does not support real-time interaction.

Troubleshooting

If you receive an error code, refer to Error codes for troubleshooting.

Audio not recognized

  1. Check that the audio format (format) and sample rate (sample_rate) in your request parameters are set correctly and meet the parameter constraints. The following are common errors:

    • An audio file has a .wav extension but is actually in MP3 format; in this case, setting the format request parameter to wav based on the extension is incorrect. It should be mp3.

    • The audio's sample rate is 3600 Hz, but the sample_rate request parameter is incorrectly set to 48000.

    You can use the ffprobe tool to check the audio container, encoding, sample rate, and channels:

    ffprobe -v error -show_entries format=format_name -show_entries stream=codec_name,sample_rate,channels -of default=noprint_wrappers=1 input.xxx