Data tables support writing vector data in string and binary formats. The string format is more readable and simplifies troubleshooting, while the binary format reduces storage costs.
Prerequisites
You have converted content, such as images, videos, and text, into vector data using a Large Language Model (LLM). For more information, see Generate vectors.
Binary format
Storing vector data in binary format uses less disk space and reduces storage costs. For cost-sensitive scenarios with high vector dimensions, write vector data in binary format.
When you write vector data in binary format, you must convert the vectors to binary data using the Tablestore SDK or a tool.
Writing in binary format does not change the element type: each vector element is still a Float32 value, encoded as 4 bytes in little-endian byte order.
Vectors are stored in the data table in binary format and are also read as binary data. To improve readability, you can use a utility class to convert them to string format.
Convert to binary using the Tablestore SDK
Starting with Java SDK version 5.17.6 and Python SDK version 6.2.1, Tablestore supports binary conversion of vector data using the VectorUtils utility class.
import com.alicloud.openservices.tablestore.SyncClient;
import com.alicloud.openservices.tablestore.model.*;
import com.alicloud.openservices.tablestore.model.search.vector.VectorUtils;
import java.util.Random;
import java.util.UUID;
/**
 * Builds a vector of pseudo-random components to use as sample data.
 *
 * @param length number of components in the returned vector
 * @param random generator that supplies one {@code nextFloat()} value per component
 * @return a new array of {@code length} floats, filled in index order
 */
private static float[] generateRandomFloats(int length, Random random) {
    float[] values = new float[length];
    int index = 0;
    while (index < values.length) {
        values[index++] = random.nextFloat();
    }
    return values;
}
/**
 * Writes 1,000 sample rows to the data table in 10 batches of 100 rows, storing each
 * vector in binary format.
 *
 * <p>After every batch the method prints whether all rows succeeded and, if not, the
 * primary key and error of each failed row.
 *
 * @param tableStoreClient client used to send the {@code BatchWriteRow} requests
 * @throws Exception if an error is propagated while building or sending a batch
 */
private static void batchWriteRow(SyncClient tableStoreClient) throws Exception {
    Random random = new Random();
    // Write 1,000 rows of data in batches of 100 rows.
    for (int i = 0; i < 10; i++) {
        BatchWriteRowRequest batchWriteRowRequest = new BatchWriteRowRequest();
        for (int j = 0; j < 100; j++) {
            // Your business data.
            String text = "A string for full-text search. An embedding vector is generated from this field and written to the field_vector field below for vector semantic similarity search";
            // The embedding vector. This sample uses random values; in a real application,
            // generate the vector from the text with your embedding model.
            float[] vector = generateRandomFloats(1024, random);
            RowPutChange rowPutChange = new RowPutChange("TABLE_NAME");
            // Set the primary key.
            rowPutChange.setPrimaryKey(PrimaryKeyBuilder.createPrimaryKeyBuilder().addPrimaryKeyColumn("PK_1", PrimaryKeyValue.fromString(UUID.randomUUID().toString())).build());
            // Set attribute columns.
            // field_string holds a string value (the batch number as text), so that the
            // stored type matches the column name.
            rowPutChange.addColumn("field_string", ColumnValue.fromString(String.valueOf(i)));
            rowPutChange.addColumn("field_long", ColumnValue.fromLong(i * 100 + j));
            rowPutChange.addColumn("field_text", ColumnValue.fromString(text));
            // Write vector data in binary format.
            rowPutChange.addColumn("field_vector", ColumnValue.fromBinary(VectorUtils.toBytes(vector)));
            batchWriteRowRequest.addRowChange(rowPutChange);
        }
        BatchWriteRowResponse batchWriteRowResponse = tableStoreClient.batchWriteRow(batchWriteRowRequest);
        System.out.println("Batch write successful: " + batchWriteRowResponse.isAllSucceed());
        if (!batchWriteRowResponse.isAllSucceed()) {
            // Report every row that failed in this batch.
            for (BatchWriteRowResponse.RowResult rowResult : batchWriteRowResponse.getFailedRows()) {
                System.out.println("Failed row: " + batchWriteRowRequest.getRowChange(rowResult.getTableName(), rowResult.getIndex()).getPrimaryKey());
                System.out.println("Failure reason: " + rowResult.getError());
            }
        }
    }
}
import time
import tablestore.utils
from tablestore import *
def batch_write_vector(rows_count):
print('Begin prepare data: %d' % rows_count)
batch_write_row_reqs = BatchWriteRowRequest()
put_row_items = []
for i in range(rows_count):
pk = [('PK1', i)]
cols = [('field_string', 'key%03d' % i),
('field_long', i),
('field_text', 'some text'),
('field_vector', tablestore.utils.VectorUtils.floats_to_bytes([0.1, 0.2, 0.3, 0.4]))]
put_row_item = PutRowItem(Row(pk,cols),Condition(RowExistenceExpectation.IGNORE))
put_row_items.append(put_row_item)
batch_write_row_reqs.add(TableInBatchWriteRowItem(table_name, put_row_items))
client.batch_write_row(batch_write_row_reqs)
print('End prepare data.')
print('Wait for data sync to search index.')
time.sleep(60)Convert to binary using a tool
/**
 * Converts Float32 vectors to and from their binary representation.
 *
 * <p>Each float occupies 4 bytes and is encoded in little-endian byte order.
 */
public class VectorUtils {
    /** Byte order used for every encoded float. */
    private static final ByteOrder BYTE_ORDER = ByteOrder.LITTLE_ENDIAN;

    /**
     * Converts a float[] array to binary format.
     *
     * @param vector The vector to convert.
     * @return The data in binary format, 4 bytes per element.
     * @throws ClientException if {@code vector} is null or empty.
     */
    public static byte[] toBytes(float[] vector) {
        if (vector == null || vector.length == 0) {
            throw new ClientException("vector is null or empty");
        }
        ByteBuffer buffer = ByteBuffer.allocate(vector.length * Float.BYTES);
        buffer.order(BYTE_ORDER);
        for (float value : vector) {
            buffer.putFloat(value);
        }
        return buffer.array();
    }

    /**
     * Converts data from binary format back to a float[] array.
     *
     * @param bytes The data in binary format.
     * @return The decoded vector, one element per 4 input bytes.
     * @throws ClientException if {@code bytes} is null, empty, or its length is not a
     *         multiple of 4.
     */
    public static float[] toFloats(byte[] bytes) {
        // Reject null explicitly, as toBytes does, instead of failing with a
        // NullPointerException on bytes.length.
        if (bytes == null) {
            throw new ClientException("bytes is null");
        }
        if (bytes.length == 0 || bytes.length % Float.BYTES != 0) {
            throw new ClientException("bytes length is not multiple of 4(SIZE_OF_FLOAT32) or length is 0");
        }
        float[] vector = new float[bytes.length / Float.BYTES];
        ByteBuffer.wrap(bytes).order(BYTE_ORDER).asFloatBuffer().get(vector);
        return vector;
    }
}
// Float32ToBytes converts a []float32 slice to a byte array.
// Each element is written as 4 bytes in little-endian order.
// An empty (or nil) vector is rejected with an error.
func Float32ToBytes(vector []float32) ([]byte, error) {
	count := len(vector)
	if count == 0 {
		return nil, errors.New("vector is null or empty")
	}
	const width = 4 // size of one float32 in bytes
	out := make([]byte, width*count)
	for idx := 0; idx < count; idx++ {
		offset := idx * width
		binary.LittleEndian.PutUint32(out[offset:offset+width], math.Float32bits(vector[idx]))
	}
	return out, nil
}
// ToFloat32 converts a byte array back to a []float32 slice.
func ToFloat32(data []byte) ([]float32, error) {
if data == nil {
return nil, errors.New("bytes is null")
}
if len(data)%4 != 0 || len(data) == 0 {
return nil, errors.New("bytes length is not multiple of 4(SIZE_OF_FLOAT32) or length is 0")
}
floats := make([]float32, len(data)/4)
buf := bytes.NewReader(data)
for i := range floats {
if err := binary.Read(buf, binary.LittleEndian, &floats[i]); err != nil {
return nil, err
}
}
return floats, nil
}class VectorUtils:
@staticmethod
def floats_to_bytes(floats):
    """Convert a list or tuple of floats to a bytearray.

    Each value is packed as a 4-byte little-endian float ('<f').

    Raises:
        TypeError: if ``floats`` is not a list/tuple or contains a non-float item.
        ValueError: if ``floats`` is empty.
    """
    if not isinstance(floats, (list, tuple)) or not all(isinstance(f, float) for f in floats):
        raise TypeError("Input must be a list/tuple of floats")
    if len(floats) == 0:
        raise ValueError("vector is empty")
    return bytearray(struct.pack('<' + 'f' * len(floats), *floats))
# Converts a bytearray back to floats.
@staticmethod
def bytes_to_floats(byte_data):
if not isinstance(byte_data, bytearray):
raise TypeError("Input must be a bytearray object")
num_floats = len(byte_data) // 4
if len(byte_data) % 4 != 0 or num_floats == 0:
raise ValueError("bytes length is not multiple of 4(SIZE_OF_FLOAT32) or length is 0")
floats = struct.unpack('<' + 'f' * num_floats, byte_data)
return list(floats)Verify the conversion
This section uses the Java SDK to show how to verify the conversion between vector data and binary data. The conversion is successful if the converted floating-point number array is identical to the original array.
/** Demonstrates a round trip between a float vector and its binary form. */
public class VectorUtilsTest {
    /**
     * Converts a small vector to bytes and back, prints both results, and prints whether
     * the decoded array is identical to the original.
     *
     * @param args unused
     */
    public static void main(String[] args) {
        float[] vector = new float[] { 1, 2, 3, 4 };
        byte[] bytes = VectorUtils.toBytes(vector);
        System.out.println("Converted binary data: " + Arrays.toString(bytes));
        float[] newVector = VectorUtils.toFloats(bytes);
        System.out.println("Converted floating-point number array: " + Arrays.toString(newVector));
        // The conversion is successful only if every element survived the round trip.
        System.out.println("Round trip matches original: " + Arrays.equals(vector, newVector));
    }
}
String format
Storing vectors in string format uses more disk space but is more readable. When you write vector data in string format, convert the Float32 array into a JSON string, such as [0.1,0.2,0.3,0.4].
This section uses the Java SDK to show how to write vector data in string format.
/**
 * Writes 1,000 sample rows to the data table in 10 batches of 100 rows, storing each
 * vector in string format.
 *
 * <p>After every batch the method prints whether all rows succeeded and, if not, the
 * primary key and error of each failed row.
 *
 * @param tableStoreClient client used to send the {@code BatchWriteRow} requests
 * @throws Exception if an error is propagated while building or sending a batch
 */
private static void batchWriteRow(SyncClient tableStoreClient) throws Exception {
    // Write 1,000 rows of data in batches of 100 rows.
    for (int i = 0; i < 10; i++) {
        BatchWriteRowRequest batchWriteRowRequest = new BatchWriteRowRequest();
        for (int j = 0; j < 100; j++) {
            // Your business data.
            String text = "A string for full-text search. An embedding vector is generated from this field and written to the field_vector field below for vector semantic similarity search";
            // The embedding vector as a JSON array string. This sample uses a fixed value;
            // in a real application, generate the vector from the text with your embedding model.
            String vector = "[1, 2, 3, 4]";
            RowPutChange rowPutChange = new RowPutChange("TABLE_NAME");
            // Set the primary key.
            rowPutChange.setPrimaryKey(PrimaryKeyBuilder.createPrimaryKeyBuilder().addPrimaryKeyColumn("PK_1", PrimaryKeyValue.fromString(UUID.randomUUID().toString())).build());
            // Set attribute columns.
            // field_string holds a string value (the batch number as text), so that the
            // stored type matches the column name.
            rowPutChange.addColumn("field_string", ColumnValue.fromString(String.valueOf(i)));
            rowPutChange.addColumn("field_long", ColumnValue.fromLong(i * 100 + j));
            rowPutChange.addColumn("field_text", ColumnValue.fromString(text));
            // The vector format is a string of a float32 array, for example, [1, 5.1, 4.7, 0.08 ].
            rowPutChange.addColumn("field_vector", ColumnValue.fromString(vector));
            batchWriteRowRequest.addRowChange(rowPutChange);
        }
        BatchWriteRowResponse batchWriteRowResponse = tableStoreClient.batchWriteRow(batchWriteRowRequest);
        System.out.println("Batch write successful: " + batchWriteRowResponse.isAllSucceed());
        if (!batchWriteRowResponse.isAllSucceed()) {
            // Report every row that failed in this batch.
            for (BatchWriteRowResponse.RowResult rowResult : batchWriteRowResponse.getFailedRows()) {
                System.out.println("Failed row: " + batchWriteRowRequest.getRowChange(rowResult.getTableName(), rowResult.getIndex()).getPrimaryKey());
                System.out.println("Failure reason: " + rowResult.getError());
            }
        }
    }
}