Demo code for pushing structured documents - OpenSearch - Alibaba Cloud Documentation Center

To upload data in push mode, you must first generate datasets in the valid format and upload the datasets to the client buffer. Then, call the push method to submit all the buffered datasets to the application at one time.

Dependencies

To use OpenSearch SDK to upload files, you must specify the following dependencies:

For information about BaseRequest, see Demo code for using the Python client.

Java

<dependency>
    <groupId>com.aliyun.opensearch</groupId>
    <artifactId>aliyun-sdk-opensearch</artifactId>
    <version>4.0.0</version>
</dependency>

Python

pip install alibabacloud_tea_util 
pip install alibabacloud_opensearch_util
pip install alibabacloud_credentials

PHP

V3.4.1 (2021-05-11)
Download URL: https://help-static-aliyun-doc.aliyuncs.com/file-manage-files/zh-CN/20230719/mxik/opensearch-sdk-php-release-v3.4.1.zip

Demo code

Usage notes

Datasets that can be pushed must be in the valid format. To view the valid format, log on to the OpenSearch console, go to the Instance Management page, choose More > Upload File in the Actions column for an application, and then download the sample file. You can use the sample file as a template to generate your datasets.
You can also generate datasets by using the JSONObject and JSONArray objects and then call the push method to submit the datasets to the application at one time.
If the number of documents to be pushed at a time exceeds the limit, an error occurs and the push fails. For more information, see Limits.

ADD operation:

Java

import java.util.HashMap;
import java.util.Map;


import com.alibaba.fastjson.JSONArray;
import com.alibaba.fastjson.JSONObject;


import com.aliyun.opensearch.OpenSearchClient;
import com.aliyun.opensearch.sdk.generated.OpenSearch;
import com.aliyun.opensearch.sdk.generated.commons.OpenSearchClientException;
import com.aliyun.opensearch.sdk.generated.commons.OpenSearchException;
import com.aliyun.opensearch.sdk.generated.commons.OpenSearchResult;


/**
 * Demo for adding or updating documents.
 *
 * <p>Builds one {@code ADD} command as JSON, wraps it in a JSON array, and posts the array
 * to the {@code knowledge-bulk} action of the application.
 */
public class testPushDemo {

    private static final String appName = "The name of the OpenSearch application to which you want to push documents";
    // Read the AccessKey pair from environment variables, as the Python and PHP demos do,
    // so that no secret is hard-coded in the source file.
    private static final String accesskey = System.getenv("ALIBABA_CLOUD_ACCESS_KEY_ID");
    private static final String secret = System.getenv("ALIBABA_CLOUD_ACCESS_KEY_SECRET");
    private static final String host = "The endpoint of the OpenSearch API in your region";
    private static final String path = "/apps/%s/actions/knowledge-bulk";

    /**
     * Pushes a single test document and prints the returned result.
     *
     * @param args command-line arguments (not used)
     */
    public static void main(String[] args) {
        // System.getenv returns null for a variable that is not set; stop early with a clear message.
        if (accesskey == null || secret == null) {
            System.err.println("Set the ALIBABA_CLOUD_ACCESS_KEY_ID and ALIBABA_CLOUD_ACCESS_KEY_SECRET "
                    + "environment variables before you run this demo.");
            return;
        }

        String appPath = String.format(path, appName);

        // Create an OpenSearch object.
        OpenSearch openSearch = new OpenSearch(accesskey, secret, host);
        // Use the OpenSearch object as a parameter to create an OpenSearchClient object.
        OpenSearchClient openSearchClient = new OpenSearchClient(openSearch);

        // Create a JSON object for adding a single document.
        JSONObject fields = new JSONObject();
        fields.put("id", "The ID of the test document");
        fields.put("title", "The title of the test document");
        fields.put("url", "The URL of the test document");
        fields.put("content", "The content of the test document");
        fields.put("category", "The category of the test document");

        JSONObject oneRequest = new JSONObject();
        oneRequest.put("cmd", "ADD");
        oneRequest.put("fields", fields);

        // Create a JSON array. You can use the JSON array to add multiple documents at a time.
        JSONArray request = new JSONArray();
        request.add(oneRequest);

        // A plain HashMap is used instead of double-brace initialization, which would create
        // an anonymous HashMap subclass for every call site.
        Map<String, String> params = new HashMap<String, String>();
        params.put("format", "full_json");
        params.put("_POST_BODY", request.toJSONString());

        try {
            OpenSearchResult openSearchResult = openSearchClient.callAndDecodeResult(appPath, params, "POST");
            // Display the returned result.
            System.out.println(openSearchResult.getResult());
        } catch (OpenSearchException e) {
            e.printStackTrace();
        } catch (OpenSearchClientException e) {
            e.printStackTrace();
        }
    }
}

Python

# -*- coding: utf-8 -*-

import time, os
from typing import Dict, Any
from Tea.exceptions import TeaException
from Tea.request import TeaRequest
from alibabacloud_tea_util import models as util_models
from BaseRequest import Config, Client


class LLMDocumentPush:
    """Thin wrapper that posts document commands to the knowledge-bulk action of an application."""

    def __init__(self, config: Config):
        # Runtime options handed to every request issued through this object.
        runtime_settings = dict(
            connect_timeout=10000,
            read_timeout=10000,
            autoretry=False,
            ignore_ssl=False,
            max_idle_conns=50,
            max_attempts=3,
        )
        self.Clients = Client(config=config)
        self.runtime = util_models.RuntimeOptions(**runtime_settings)
        self.header = {}

    def docBulk(self, app_name: str, doc_content: list) -> Dict[str, Any]:
        """POST ``doc_content`` to the knowledge-bulk action of ``app_name``.

        Returns whatever the underlying client returns. If the request raises,
        the exception is printed and ``None`` is returned.
        """
        try:
            bulk_path = '/v3/openapi/apps/{}/actions/knowledge-bulk'.format(app_name)
            return self.Clients._request(
                method="POST",
                pathname=bulk_path,
                query={},
                headers=self.header,
                body=doc_content,
                runtime=self.runtime,
            )
        except Exception as err:
            print(err)
            return None

if __name__ == "__main__":
    # The AccessKey pair is taken from environment variables, which must be
    # configured before this code is run.
    access_key_id = os.environ.get("ALIBABA_CLOUD_ACCESS_KEY_ID")
    access_key_secret = os.environ.get("ALIBABA_CLOUD_ACCESS_KEY_SECRET")

    # Endpoint of the OpenSearch API, without the http:// prefix.
    endpoint = "<endpoint>"
    # Request protocol. Valid values: HTTPS and HTTP.
    endpoint_protocol = "HTTP"

    # Authentication method. Valid values: sts and access_key (default).
    # sts selects authentication based on Resource Access Management (RAM) and Security Token Service (STS).
    auth_type = "access_key"
    # Required for RAM/STS authentication. An STS token can be obtained by calling
    # the AssumeRole operation of Alibaba Cloud RAM.
    security_token = "<security_token>"

    # Common request parameters.
    # Note: security_token and type are required only if you use the SDK as a RAM user.
    client_config = Config(
        endpoint=endpoint,
        access_key_id=access_key_id,
        access_key_secret=access_key_secret,
        security_token=security_token,
        type=auth_type,
        protocol=endpoint_protocol,
    )

    # Client object and target OpenSearch LLM-Based Conversational Search Edition instance.
    ops = LLMDocumentPush(client_config)
    app_name = "The name of the OpenSearch LLM-Based Conversational Search Edition instance"

    # ---------------  Push structured documents to an OpenSearch LLM-Based Conversational Search Edition instance---------------

    # Documents to push. Each element is one command: "cmd" names the operation and
    # "fields" carries the document fields.
    # The long "content" values are written as adjacent string literals inside parentheses.
    # Python joins them into one string; a plain quoted string cannot span several lines.
    document = [
        {
            "fields": {
                "id": "1",
                "title": "Benefits",
                "url": "https://help.aliyun.com/document_detail/464900.html",
                "content": (
                    "Industry Algorithm Edition: Intelligence: Industry Algorithm Edition provides rich built-in and customized algorithm models and introduces industry retrieval and sorting algorithms based on the search needs of different industries. This way, optimal search results are ensured. Flexibility and customization: Industry Algorithm Edition allows you to customize configurations such as algorithm models, application schema, data processing, query analysis, and sorting to meet personalized search requirements. This improves the click-through rate of search results, accelerates service iteration, and greatly shortens the rollout cycle. Security and stability: O&M services are available on a 24/7 basis. You can get technical support by submitting tickets online or using the telephone. A series of complete fault emergency response mechanisms are provided, such as fault monitoring, automatic alerting, and rapid troubleshooting. AccessKey IDs and AccessKey secrets assigned by Alibaba Cloud control permissions to access OpenSearch. This ensures data security by isolating data of different users. Multiple copies of data are backed up to implement data redundancy, which ensures data security. Auto scaling: The auto scaling capability allows you to scale up or down the resources based on your needs. Rich extended features: OpenSearch supports a variety of extended search features, such as top searches, hints, drop-down suggestions, and report statistics. This helps you view and analyze search results. Out-of-the-box service: You do not need to deploy or perform O&M operations on clusters before you access OpenSearch. "
                    "High-performance Search Edition: High throughput: A single table supports tens of thousands of write transactions per second (TPS) and data updates within seconds. Security and stability: O&M services are available on a 24/7 basis. You can get technical support by submitting tickets online or using the telephone. A series of complete fault emergency response mechanisms are provided, such as fault monitoring, automatic alerting, and rapid troubleshooting. AccessKey IDs and AccessKey secrets assigned by Alibaba Cloud control permissions to access OpenSearch. This ensures data security by isolating data of different users. Multiple copies of data are backed up to implement data redundancy, which ensures data security. Auto scaling: The auto scaling capability allows you to scale up or down the resources based on your needs. Out-of-the-box service: You do not need to deploy or perform O&M operations on clusters before you access OpenSearch. "
                    "Vector Search Edition: Stability: The underlying layer of Vector Search Edition is developed by using the C++ programming language. After more than ten years of development, Vector Search Edition provides stable search services for various core business systems. Vector Search Edition is suitable for core search scenarios that require high stability. High efficiency: Vector Search Edition provides a distributed search engine that allows you to retrieve large amounts of data. Vector Search Edition supports real-time data updates within seconds. Therefore, Vector Search Edition is applicable to query and search scenarios that are time-sensitive. Cost-effectiveness: Vector Search Edition supports multiple policies for index compression and multi-value index loading tests. You can use Vector Search Edition to meet your query requirements at low costs. Vector algorithm: Vector Search Edition supports vector searches for various types of unstructured data, such as voice data, images, videos, natural languages, and behavior data. SQL query: Vector Search Edition allows you to use SQL syntax and join tables online and provides a variety of built-in user-defined functions (UDFs) and function customization mechanisms to meet different requirements for data retrieval. To facilitate SQL development and testing, an SQL studio is integrated into the O&M system of Vector Search Edition. "
                    "Retrieval Engine Edition: Stability: The underlying layer of Retrieval Engine Edition is developed by using the C++ programming language. After more than ten years of development, Retrieval Engine Edition provides stable search services for various core business systems. Retrieval Engine Edition is suitable for core search scenarios that require high stability. High efficiency: Retrieval Engine Edition provides a distributed search engine that allows you to retrieve large amounts of data. Retrieval Engine Edition supports real-time data updates within seconds. Therefore, Retrieval Engine Edition is suitable for query and search scenarios that are time-sensitive. Cost-effectiveness: Retrieval Engine Edition supports multiple policies for index compression and multi-value index loading tests. You can use Retrieval Engine Edition to meet your query requirements at low costs. Enriched features: Retrieval Engine Edition supports multiple types of analyzers and indexes and powerful query syntax. This service can meet your data retrieval requirements. Retrieval Engine Edition also supports plug-ins. This way, you can customize your own business logic. SQL query: Retrieval Engine Edition allows you to use SQL syntax and join tables online, and provides a variety of built-in UDFs and function customization mechanisms to meet different requirements for data retrieval. To facilitate SQL development and testing, an SQL studio will be integrated into the O&M system of Retrieval Engine Edition in later versions."
                ),
                "category": "opensearch",
                "timestamp": 1691722088645,
                "score": 0.8821945219723084
            },
            "cmd": "ADD"
        },
        {
            "fields": {
                "id": "2",
                "title": "Scenarios",
                "url": "https://help.aliyun.com/document_detail/464901.html",
                "content": (
                    "Industry Algorithm Edition: Features: provides industry built-in capabilities such as semantic understanding and machine learning-based algorithms, and supports lightweight custom models and search guidance. This helps you build intelligent search services in a quick manner. <br/><img src=\"https://help-static-aliyun-doc.aliyuncs.com/assets/img/zh-CN/4685770861/p622804.png\" width=300>Business scenarios: intelligent searches in industries such as e-commerce, content communities, and games, and educational Q&A searches. Target customers: Industry Algorithm Edition is out-of-the-box and suitable for small and medium-sized enterprises and developers that have intelligent search requirements. "
                    "High-performance Search Edition: Features: Deep optimization is performed for big data search performance. OpenSearch supports quick response within seconds and real-time queries, and provides a one-stop solution for you to build big data search services in various scenarios such as searches for orders, coupons, logistics, and insurance policies. <br/><img src=\"https://help-static-aliyun-doc.aliyuncs.com/assets/img/zh-CN/3685770861/p622799.png\" width=300>Business scenarios: searches for orders, coupons, logistics, and insurance policies. Target customers: High-performance Search Edition is out-of-the-box and suitable for small and medium-sized enterprises and developers that have high requirements for search performance. "
                    "Vector Search Edition: Features: provides a large-scale distributed and high-performance vector search solution in Alibaba Cloud. Vector Search Edition supports multiple search algorithms to achieve a balance between precision and performance. Other features such as building index in streaming mode and instant queries are also supported. <br/><img src=\"https://help-static-aliyun-doc.aliyuncs.com/assets/img/zh-CN/4685770861/p622805.png\" width=300>Business scenarios: graph searches, audio or video searches, natural language processing (NLP) vector searches, and intelligent Q&A. Target customers: enterprises and developers that face large-scale vectors and require flexible development. "
                    "Retrieval Engine Edition: Features: provides you with high-performance, low-cost, easy-to-use, and large-scale online search services. Retrieval Engine Edition supports customized development based on your business requirements and fast iteration of search algorithms. <br/><img src=\"https://help-static-aliyun-doc.aliyuncs.com/assets/img/zh-CN/4685770861/p622806.png\" width=300>Business scenarios: searches for enterprise information, tags, and financial research reports, and intelligent searches. Target customers: enterprises and developers that face a large amount of data and require flexible data development."
                ),
                "category": "opensearch",
                "timestamp": 1691722088646,
                "score": 0.8993507402088953
            },
            "cmd": "ADD"
        }
    ]

    # Push the ADD commands prepared above in a single bulk request and print the response.
    # docBulk prints the exception and returns None if the request fails.
    documents = document
    res5 = ops.docBulk(app_name=app_name, doc_content=documents)
    print(res5)

PHP

<?php

// Directory that contains the "OpenSearch" folder of the SDK package that you downloaded.
$path = '<The directory in which the OpenSearch SDK for PHP is stored>';
require_once($path . "/OpenSearch/Autoloader/Autoloader.php");

use OpenSearch\Client\OpenSearchClient;

// Specify your AccessKey pair.
// Obtain the AccessKey ID and AccessKey secret from environment variables.
// You must configure environment variables before you run this code.
// Specify the AccessKey ID.
$accessKeyId = getenv('ALIBABA_CLOUD_ACCESS_KEY_ID');
// Specify the AccessKey secret.
$secret = getenv('ALIBABA_CLOUD_ACCESS_KEY_SECRET');
$endPoint = '<The endpoint of the OpenSearch API in your region>';
$appName = '<The application name>';
$options = array('debug' => true);

// Describe the documents as PHP arrays and let json_encode() produce the request body.
// The "content" value contains double quotation marks (the <img> tag). json_encode()
// escapes them, whereas a hand-written JSON string would be invalid JSON.
$documents = array(
    array(
        'fields' => array(
            'id' => '15739',
            'title' => 'Benefits',
            'url' => 'https://help.aliyun.com/document_detail/464900.html',
            'content' => 'Industry Algorithm Edition: Features: provides industry built-in capabilities such as semantic understanding and machine learning-based algorithms, and supports lightweight custom models and search guidance. This helps you build intelligent search services in a quick manner. <br/><img src="https://help-static-aliyun-doc.aliyuncs.com/assets/img/zh-CN/4685770861/p622804.png"width=300>Business scenarios: intelligent searches in industries such as e-commerce, content communities, and games, and educational Q&A searches. Target customers: Industry Algorithm Edition is out-of-the-box and suitable for small and medium-sized enterprises and developers that have intelligent search requirements. High-performance Search Edition: Features: Deep optimization is performed for big data search performance. OpenSearch supports quick response within seconds, real-time queries, and provides a one-stop solution for you to build big data search services in various scenarios such as searches for orders, coupons, logistics, and insurance policies. ',
            'category' => 'opensearch',
            'timestamp' => 1691722088646,
            'score' => 0.8993507402088953,
        ),
        'cmd' => 'ADD',
    ),
);
$requestBody = json_encode($documents);

$client = new OpenSearchClient($accessKeyId, $secret, $endPoint, $options);

$uri = "/apps/{$appName}/actions/knowledge-bulk";

try {
    $ret = $client->post($uri, $requestBody);
    // Decode the JSON response into an associative array and print it.
    print_r(json_decode($ret->result, true));
} catch (\Throwable $e) {
    print_r($e);
}

Note

No separate operation is provided for updating documents. To update a document, you must use the ADD operation and specify all fields in the request. Otherwise, the fields that are not specified are left empty in the updated document. This is because the ADD operation overwrites existing data with new data.

DELETE operation:

Java

import java.util.HashMap;
import java.util.Map;


import com.alibaba.fastjson.JSONArray;
import com.alibaba.fastjson.JSONObject;


import com.aliyun.opensearch.OpenSearchClient;
import com.aliyun.opensearch.sdk.generated.OpenSearch;
import com.aliyun.opensearch.sdk.generated.commons.OpenSearchClientException;
import com.aliyun.opensearch.sdk.generated.commons.OpenSearchException;
import com.aliyun.opensearch.sdk.generated.commons.OpenSearchResult;


/**
 * Demo for deleting documents.
 *
 * <p>Builds one {@code DELETE} command as JSON, wraps it in a JSON array, and posts the array
 * to the {@code knowledge-bulk} action of the application.
 */
public class testDeleteDemo {

    private static final String appName = "The name of the OpenSearch application from which you want to delete data";
    // Read the AccessKey pair from environment variables, as the Python and PHP demos do,
    // so that no secret is hard-coded in the source file.
    private static final String accesskey = System.getenv("ALIBABA_CLOUD_ACCESS_KEY_ID");
    private static final String secret = System.getenv("ALIBABA_CLOUD_ACCESS_KEY_SECRET");
    private static final String host = "The endpoint of the OpenSearch API in your region";
    private static final String path = "/apps/%s/actions/knowledge-bulk";

    /**
     * Deletes a single test document and prints the returned result.
     *
     * @param args command-line arguments (not used)
     */
    public static void main(String[] args) {
        // System.getenv returns null for a variable that is not set; stop early with a clear message.
        if (accesskey == null || secret == null) {
            System.err.println("Set the ALIBABA_CLOUD_ACCESS_KEY_ID and ALIBABA_CLOUD_ACCESS_KEY_SECRET "
                    + "environment variables before you run this demo.");
            return;
        }

        String appPath = String.format(path, appName);

        // Create an OpenSearch object.
        OpenSearch openSearch = new OpenSearch(accesskey, secret, host);
        // Use the OpenSearch object as a parameter to create an OpenSearchClient object.
        OpenSearchClient openSearchClient = new OpenSearchClient(openSearch);

        // Create a JSON object for deleting a single document. Only the ID field is specified.
        JSONObject fields = new JSONObject();
        fields.put("id", "The ID of the test document");

        JSONObject oneRequest = new JSONObject();
        oneRequest.put("cmd", "DELETE");
        oneRequest.put("fields", fields);

        // Create a JSON array. You can use the JSON array to delete multiple documents at a time.
        JSONArray request = new JSONArray();
        request.add(oneRequest);

        // A plain HashMap is used instead of double-brace initialization, which would create
        // an anonymous HashMap subclass for every call site.
        Map<String, String> params = new HashMap<String, String>();
        params.put("format", "full_json");
        params.put("_POST_BODY", request.toJSONString());

        try {
            OpenSearchResult openSearchResult = openSearchClient.callAndDecodeResult(appPath, params, "POST");
            // Display the returned result.
            System.out.println(openSearchResult.getResult());
        } catch (OpenSearchException e) {
            e.printStackTrace();
        } catch (OpenSearchClientException e) {
            e.printStackTrace();
        }
    }
}

Python

# -*- coding: utf-8 -*-

import time, os
from typing import Dict, Any
from Tea.exceptions import TeaException
from Tea.request import TeaRequest
from alibabacloud_tea_util import models as util_models
from BaseRequest import Config, Client


class LLMDocumentPush:
    """Thin wrapper that posts document commands to the knowledge-bulk action of an application."""

    def __init__(self, config: Config):
        # Runtime options handed to every request issued through this object.
        runtime_settings = dict(
            connect_timeout=10000,
            read_timeout=10000,
            autoretry=False,
            ignore_ssl=False,
            max_idle_conns=50,
            max_attempts=3,
        )
        self.Clients = Client(config=config)
        self.runtime = util_models.RuntimeOptions(**runtime_settings)
        self.header = {}

    def docBulk(self, app_name: str, doc_content: list) -> Dict[str, Any]:
        """POST ``doc_content`` to the knowledge-bulk action of ``app_name``.

        Returns whatever the underlying client returns. If the request raises,
        the exception is printed and ``None`` is returned.
        """
        try:
            bulk_path = '/v3/openapi/apps/{}/actions/knowledge-bulk'.format(app_name)
            return self.Clients._request(
                method="POST",
                pathname=bulk_path,
                query={},
                headers=self.header,
                body=doc_content,
                runtime=self.runtime,
            )
        except Exception as err:
            print(err)
            return None

if __name__ == "__main__":
    # Specify the endpoint of the OpenSearch API. The value does not contain the http:// prefix.
    endpoint = "<endpoint>"
    # Specify the request protocol. Valid values: HTTPS and HTTP.
    endpoint_protocol = "HTTP"
    # Specify your AccessKey pair.
    # Obtain the AccessKey ID and AccessKey secret from environment variables.
    # You must configure environment variables before you run this code.
    access_key_id = os.environ.get("ALIBABA_CLOUD_ACCESS_KEY_ID")
    access_key_secret = os.environ.get("ALIBABA_CLOUD_ACCESS_KEY_SECRET")
    # Specify the authentication method. Default value: access_key. A value of sts specifies authentication based on RAM and STS.
    # Valid values: sts and access_key.
    auth_type = "access_key"
    # If you use authentication based on RAM and STS, you must specify the security_token parameter. You can call the AssumeRole operation of Alibaba Cloud RAM to obtain an STS token.
    security_token = "<security_token>"
    # Specify common request parameters.
    # Note: The security_token and type parameters are required only if you use the SDK as a RAM user.
    Configs = Config(endpoint=endpoint, access_key_id=access_key_id, access_key_secret=access_key_secret,
                     security_token=security_token, type=auth_type, protocol=endpoint_protocol)
    # Create the client object for the OpenSearch LLM-Based Conversational Search Edition instance.
    ops = LLMDocumentPush(Configs)
    app_name = "The name of the OpenSearch LLM-Based Conversational Search Edition instance"

    # Delete documents. docBulk expects a list of commands, so the single DELETE
    # command is wrapped in a list before it is sent.
    deletedocument = {"cmd": "DELETE", "fields": {"id": 2}}
    documents = [deletedocument]
    res5 = ops.docBulk(app_name=app_name, doc_content=documents)
    print(res5)