本文透過以下範例,說明如何配置特徵組建(Feature Generation,FG)組態檔案 fg.json 和模型設定檔 config。
樣本資料
以下為樣本資料,欄位包括了常見的資料類型,其中:
ID特徵:user_id、item_id
類別特徵:gender、category
數值特徵:age
lookup特徵:user__kv_category_click_1d
多值特徵:tags
文本特徵:description
行為序列特徵:click_10_seq
樣本的label:is_click
Lookup特徵是一種透過外部預先計算的映射表(如KV儲存、快取或資料庫表)來擷取特徵值的機制。它允許模型在訓練或預測時快速查詢已預先計算好的特徵值,而無需進行即時計算或複雜的資料處理邏輯。這種方式在推薦系統、點擊率預估等情境中很常見,例如使用者的歷史行為統計特徵(如使用者對某類商品的點擊次數)、物品的流行度等。
欄位名稱 | 樣本資料1 | 樣本資料2 | 樣本資料3 |
request_id | 101 | 102 | 103 |
user_id | 1 | 2 | 3 |
item_id | 4 | 5 | 10 |
event_unix_time | 1672502400 | 1672502400 | 1672502400 |
is_click | 0 | 1 | 1 |
age | 25 | 30 | 22 |
gender | 男 | 女 | 女 |
user__kv_category_click_1d | 電子產品:10\u001D家電:1\u001D飾品:2 | 電子產品:1\u001D家電:5\u001D飾品:1 | 電子產品:1\u001D家電:2\u001D飾品:11 |
category | 電子產品 | 家電 | 飾品 |
tags | 科技\u001D電腦\u001D便攜 | 家居\u001D電器\u001D冷藏 | 時尚\u001D眼鏡\u001D防晒 |
description | 便攜高效能筆記本 | 大容量冷藏冰箱 | 時尚防晒太陽鏡 |
click_10_seq | item__item_id:4#item__category:電子產品#user__ts:21041;item__item_id:5#item__category:家電#user__ts:168139;item__item_id:10#item__category:飾品#user__ts:168284 | ||
ds | 20230101 | 20230101 | 20230101 |
特徵組建組態檔案 fg.json
以下是配置好的fg.json樣本:
{
"features": [
{
"feature_name": "user_id",
"feature_type": "id_feature",
"value_type": "String",
"expression": "user:user_id",
"default_value": "-1024",
"combiner": "mean",
"need_prefix": false,
"is_multi": false
},
{
"feature_name": "item_id",
"feature_type": "id_feature",
"value_type": "String",
"expression": "item:item_id",
"default_value": "-1024",
"combiner": "mean",
"need_prefix": false,
"is_multi": false
},
{
"feature_name": "age",
"feature_type": "raw_feature",
"value_type": "Double",
"expression": "user:age",
"default_value": "-1024",
"combiner": "mean",
"need_prefix": false
},
{
"feature_name": "gender",
"feature_type": "id_feature",
"value_type": "String",
"expression": "user:gender",
"default_value": "-1024",
"combiner": "mean",
"need_prefix": false,
"is_multi": false
},
{
"feature_name": "user__kv_category_click_1d",
"feature_type": "lookup_feature",
"value_type": "Double",
"map": "user:user__kv_category_click_1d",
"key": "item:category",
"needDiscrete": false,
"needWeighting": false,
"needKey": false,
"default_value": "0",
"combiner": "mean",
"need_prefix": false
},
{
"feature_name": "category",
"feature_type": "id_feature",
"value_type": "String",
"expression": "item:category",
"default_value": "-1024",
"combiner": "mean",
"need_prefix": false,
"is_multi": false
},
{
"feature_name": "tags",
"feature_type": "id_feature",
"value_type": "String",
"expression": "item:tags",
"default_value": "-1024",
"combiner": "mean",
"need_prefix": false,
"is_multi": true
},
{
"feature_name": "description",
"feature_type": "id_feature",
"value_type": "String",
"expression": "item:description",
"default_value": "-1024",
"combiner": "mean",
"need_prefix": false,
"is_multi": true
},
{
"sequence_name": "click_10_seq",
"sequence_column": "click_10_seq",
"sequence_length": 10,
"sequence_delim": ";",
"attribute_delim": "#",
"sequence_table": "item",
"sequence_pk": "user:click_10_seq",
"features": [
{
"feature_name": "item_id",
"feature_type": "id_feature",
"value_type": "String",
"expression": "item:item_id",
"default_value": "-1024",
"combiner": "mean",
"need_prefix": false,
"is_multi": false,
"group": "click_10_seq_feature"
},
{
"feature_name": "category",
"feature_type": "id_feature",
"value_type": "String",
"expression": "item:category",
"default_value": "-1024",
"combiner": "mean",
"need_prefix": false,
"is_multi": false,
"group": "click_10_seq_feature"
},
{
"feature_name": "ts",
"feature_type": "raw_feature",
"value_type": "Double",
"expression": "user:ts",
"default_value": "-1024",
"combiner": "mean",
"need_prefix": false,
"group": "click_10_seq_feature"
}
]
}
],
"reserves": [
"request_id",
"user_id",
"item_id",
"is_click"
]
}
模型設定檔 config
以下是配置好的模型設定檔config樣本。其中包括了組合特徵(combo_age_gender)和運算式特徵(age_satisfy1)的配置,詳情請參見easy_rec。
train_config {
optimizer_config {
use_moving_average: false
adam_optimizer {
learning_rate {
exponential_decay_learning_rate {
initial_learning_rate: 0.001
decay_steps: 1
decay_factor: 0.5
min_learning_rate: 1e-06
}
}
}
}
num_steps: 1
sync_replicas: true
save_summary_steps: 100
log_step_count_steps: 100
}
eval_config {
metrics_set {
auc {
}
}
}
data_config {
batch_size: 1024
label_fields: "is_click"
shuffle: false
num_epochs: 10000
input_type: OdpsRTPInput
separator: ""
selected_cols: "is_click,features"
input_fields {
input_name: "is_click"
input_type: INT32
default_val: "0"
}
input_fields {
input_name: "user_id"
input_type: STRING
default_val: "-1024"
}
input_fields {
input_name: "item_id"
input_type: STRING
default_val: "-1024"
}
input_fields {
input_name: "age"
input_type: DOUBLE
default_val: "-1024"
}
input_fields {
input_name: "gender"
input_type: STRING
default_val: "-1024"
}
input_fields {
input_name: "user__kv_category_click_1d"
input_type: DOUBLE
default_val: "0"
}
input_fields {
input_name: "category"
input_type: STRING
default_val: "-1024"
}
input_fields {
input_name: "tags"
input_type: STRING
default_val: "-1024"
}
input_fields {
input_name: "description"
input_type: STRING
default_val: "-1024"
}
input_fields {
input_name: "click_10_seq__item_id"
input_type: STRING
}
input_fields {
input_name: "click_10_seq__category"
input_type: STRING
}
input_fields {
input_name: "click_10_seq__ts"
input_type: STRING
}
pai_worker_queue: true
}
feature_configs {
input_names: "user_id"
feature_type: IdFeature
embedding_dim: 8
hash_bucket_size: 48000
separator: ""
combiner: "mean"
}
feature_configs {
input_names: "item_id"
feature_type: IdFeature
embedding_dim: 8
hash_bucket_size: 27000
separator: ""
combiner: "mean"
}
feature_configs {
input_names: "age"
feature_type: RawFeature
embedding_dim: 4
separator: ""
boundaries: 1e-08
boundaries: 10
boundaries: 20
boundaries: 30
boundaries: 40
boundaries: 50
boundaries: 60
}
feature_configs {
input_names: "gender"
feature_type: IdFeature
embedding_dim: 4
hash_bucket_size: 10
separator: ""
combiner: "mean"
}
feature_configs {
input_names: "user__kv_category_click_1d"
feature_type: RawFeature
embedding_dim: 4
separator: ""
boundaries: 1e-08
boundaries: 1.0
boundaries: 2.0
boundaries: 3.0
boundaries: 4.0
boundaries: 5.0
boundaries: 6.0
}
feature_configs {
input_names: "category"
feature_type: IdFeature
embedding_dim: 4
hash_bucket_size: 100
separator: ""
combiner: "mean"
}
feature_configs {
input_names: "tags"
feature_type: TagFeature
embedding_dim: 4
hash_bucket_size: 1000
separator: ""
combiner: "mean"
}
feature_configs {
input_names: "description"
feature_type: SequenceFeature
embedding_dim: 4
hash_bucket_size: 10
separator: ""
sequence_combiner {
text_cnn {
filter_sizes: 2
filter_sizes: 3
filter_sizes: 4
num_filters: 16
num_filters: 8
num_filters: 8
}
}
}
feature_configs {
input_names: "click_10_seq__item_id"
feature_type: SequenceFeature
embedding_dim: 8
hash_bucket_size: 27000
separator: ";"
combiner: "mean"
sub_feature_type: IdFeature
}
feature_configs {
input_names: "click_10_seq__category"
feature_type: SequenceFeature
embedding_dim: 4
hash_bucket_size: 10000
separator: ";"
combiner: "mean"
sub_feature_type: IdFeature
}
feature_configs {
input_names: "click_10_seq__ts"
feature_type: SequenceFeature
embedding_dim: 4
separator: ";"
sub_feature_type: RawFeature
}
feature_configs {
input_names: ["age", "gender"]
feature_name: "combo_age_gender"
feature_type: ComboFeature
embedding_dim: 16
hash_bucket_size: 1000
}
feature_configs {
input_names: "age"
feature_name: "age_satisfy1"
feature_type: ExprFeature
expression: "age>=18"
}
model_config {
model_class: "MultiTower"
feature_groups {
group_name: "all"
feature_names: "user_id"
feature_names: "item_id"
feature_names: "age"
feature_names: "gender"
feature_names: "user__kv_category_click_1d"
feature_names: "category"
feature_names: "tags"
feature_names: "description"
feature_names: "combo_age_gender"
feature_names: "age_satisfy1"
wide_deep: DEEP
sequence_features {
group_name: "click_10_seq"
seq_att_map {
key: "item_id"
key: "category"
hist_seq: "click_10_seq__item_id"
hist_seq: "click_10_seq__category"
hist_seq: "click_10_seq__ts"
}
tf_summary: false
allow_key_search: false
allow_key_transform: true
}
}
embedding_regularization: 5e-06
multi_tower {
towers {
input: "all"
dnn {
hidden_units: 256
hidden_units: 128
}
}
final_dnn {
hidden_units: 64
hidden_units: 32
}
l2_regularization: 1e-06
}
}
export_config {
multi_placeholder: true
}