Topik ini menjelaskan cara menguji konfigurasi Feature Generator (FG) Anda secara lokal.
Dokumen ini menjelaskan logika waktu proses FG. Di lingkungan Linux, Anda dapat menguji apakah file konfigurasi, input, dan output sesuai dengan ekspektasi Anda.
Instal pyfg
Di lingkungan Python 3.11, jalankan perintah berikut:
pip install http://tzrec.oss-cn-beijing.aliyuncs.com/third_party/pyfg-1.0.1-cp311-cp311-linux_x86_64.whl
Jalur paket untuk Python 3.10 adalah: http://tzrec.oss-cn-beijing.aliyuncs.com/third_party/pyfg-1.0.1-cp310-cp310-linux_x86_64.whl
Jalur paket untuk Python 3.12 adalah: http://tzrec.oss-cn-beijing.aliyuncs.com/third_party/pyfg-1.0.1-cp312-cp312-linux_x86_64.whl
Catatan: Perubahan utama pada versi 1.0.1 dibandingkan versi sebelumnya adalah peningkatan serialisasi untuk operator fitur. Versi ini mendukung semua operator fitur sebagai sub-fitur dari sequence features.
Jalankan FG
1. Untuk layanan inferensi model yang menggunakan EasyRec processor (TensorFlow)
#!/usr/bin/env python
import os
import pyfg
# Feature Generator (FG) configuration: one dict per generated feature.
# In "expression"/"map"/"key"/"query"/"title", the "user:" / "item:" prefix
# names the side the raw input field comes from.
config = {
    "features": [
        {
            "feature_name": "goods_id",
            "feature_type": "id_feature",
            "value_type": "string",
            "expression": "item:goods_id",
            "default_value": "-1024",
            "need_prefix": False,
            "value_dimension": 1
        },
        {
            "feature_name": "color_pair",
            "feature_type": "combo_feature",
            "value_type": "string",
            "expression": ["user:query_color", "item:color"],
            "default_value": "",
            "need_prefix": False,
            "value_dimension": 1
        },
        {
            "feature_name": "current_price",
            "feature_type": "raw_feature",
            "value_type": "double",
            "expression": "item:current_price",
            "default_value": "0",
            "need_prefix": False
        },
        {
            "feature_name": "usr_cate1_clk_cnt_1d",
            "feature_type": "lookup_feature",
            "map": "user:usr_cate1_clk_cnt_1d",
            "key": "item:cate1",
            "need_discrete": False,
            "need_key": False,
            "default_value": "0",
            "combiner": "max",
            "need_prefix": False,
            "value_type": "double"
        },
        {
            "feature_name": "recommend_match",
            "feature_type": "overlap_feature",
            "method": "is_contain",
            "query": "user:query_recommend",
            "title": "item:recommend",
            "default_value": "0"
        },
        {
            "feature_name": "query_title_match_ratio",
            "feature_type": "overlap_feature",
            "method": "query_common_ratio",
            "query": "user:query_terms",
            "title": "item:title_terms",
            "default_value": "0"
        },
        {
            "feature_name": "title_term_match_ratio",
            "feature_type": "overlap_feature",
            "method": "title_common_ratio",
            "query": "user:query_terms",
            "title": "item:title_terms",
            "default_value": "0"
        },
        {
            "feature_name": "term_proximity_min_cover",
            "feature_type": "overlap_feature",
            "method": "proximity_min_cover",
            "query": "user:query_terms",
            "title": "item:title_terms",
            "default_value": "0"
        },
        # The sample output of this example lists an `edit_distance` feature
        # computed from the `query` and `title` inputs, so the feature is
        # declared here with the same settings as the PyTorch example.
        {
            "feature_name": "edit_distance",
            "feature_type": "custom_feature",
            "operator_name": "EditDistance",
            "operator_lib_file": "pyfg/lib/libedit_distance.so",
            "expression": ["user:query", "item:title"],
            "default_value": "0",
            "value_type": "int32",
            "value_dimension": 1,
            "encoding": "utf-8",
            "normalizer": "method=expression,expr=x+10"
        }
    ]
}

if __name__ == '__main__':
    handler = pyfg.FgHandler(config)

    # Meta information derived from the configuration alone.
    print("------------------------ meta info ---------------------------")
    print("user side inputs:", handler.user_inputs())
    print("item side inputs:", handler.item_inputs())
    print("context side inputs:", handler.context_inputs())
    print("offline table schema:", handler.table_schema())
    features = handler.all_feature_names()
    print("all generated features:", features)

    # A batch of three samples: every input field is a list of length 3.
    # Term lists are encoded as one string joined by the \035 (0x1d) character.
    inputs = {
        "goods_id": ["110", "111", "112"],
        "query_color": ["red", "pink", "gray"],
        "color": ["white", "black", "pink"],
        "current_price": [0.5, 0.25, 0.78],
        "usr_cate1_clk_cnt_1d": [
            {"c1": 1, "c2": 13, "c3": 5},
            {"c1": 5, "c2": 3, "c4": 4.5},
            {"c7": 7, "c5": 9, "c3": 5}
        ],
        "cate1": ["c1", "c2", "c3"],
        "query_recommend": ["High-quality", "Brand", "Premium"],
        "recommend": ["High-quality", "Brand", "Carefully-selected"],
        "title_terms": [
            "Clear\035Men\035Shampoo\035Anti-dandruff\035Refreshing\035Shampoo-cream\035Men\035Vitality\035Sport\035100G",
            "Master-Kong\035Jasmine\035Honey-Tea\035330ml*12\035bottles",
            "Diao-Brand\035Detergent\035Household\035Large-barrel\035Food-grade\035Dishwashing-liquid\035Fruit-and-vegetable\035Cleaner\035Value-pack\035Dish-soap"
        ],
        "query_terms": [
            "Clear\035Shampoo",
            "Jasmine\035Green-Tea",
            "Detergent\035Household"
        ],
        # Inputs consumed by the edit_distance feature.
        "query": ["Republic of China", "Feature|Generation", "The tool is very useful"],
        "title": ["China", "Feature|Transformation", "The tool is useful"]
    }
    print("inputs", inputs)

    # Calling the handler directly returns the generated features and a status.
    outputs, status = handler(inputs)
    print("status:", status.ok())
    print("outputs:", outputs)

    # debug log: input data & generated features
    print("------------------------ debug log ---------------------------")
    input_str = handler.to_input_str(inputs)
    print("input data:", input_str)
    print()
    generated_str = handler.to_debug_str_v2(outputs)
    print("generated feature:", generated_str)
Output-nya sebagai berikut. Parameter value_dimension mengontrol dimensi output.
------------------------ meta info ---------------------------
user side inputs: {'usr_cate1_clk_cnt_1d', 'query_terms', 'query_recommend', 'query_color', 'query'}
item side inputs: {'title_terms', 'title', 'recommend', 'goods_id', 'color', 'cate1', 'current_price'}
context side inputs: set()
offline table schema: {'usr_cate1_clk_cnt_1d': 'double', 'query_title_match_ratio': 'float', 'current_price': 'double', 'term_proximity_min_cover': 'float', 'title_term_match_ratio': 'float', 'edit_distance': 'int', 'recommend_match': 'float', 'goods_id': 'string', 'color_pair': 'string'}
all generated features: ['goods_id', 'color_pair', 'current_price', 'usr_cate1_clk_cnt_1d', 'recommend_match', 'query_title_match_ratio', 'title_term_match_ratio', 'term_proximity_min_cover', 'edit_distance']
inputs {'goods_id': ['110', '111', '112'], 'query_color': ['red', 'pink', 'gray'], 'color': ['white', 'black', 'pink'], 'current_price': [0.5, 0.25, 0.78], 'usr_cate1_clk_cnt_1d': [{'c1': 1, 'c2': 13, 'c3': 5}, {'c1': 5, 'c2': 3, 'c4': 4.5}, {'c7': 7, 'c5': 9, 'c3': 5}], 'cate1': ['c1', 'c2', 'c3'], 'query_recommend': ['High-quality', 'Brand', 'Premium'], 'recommend': ['High-quality', 'Brand', 'Carefully-selected'], 'title_terms': ['Clear\x1dMen\x1dShampoo\x1dAnti-dandruff\x1dRefreshing\x1dShampoo-cream\x1dMen\x1dVitality\x1dSport\x1d100G', 'Master-Kong\x1dJasmine\x1dHoney-Tea\x1d330ml*12\x1dbottles', 'Diao-Brand\x1dDetergent\x1dHousehold\x1dLarge-barrel\x1dFood-grade\x1dDishwashing-liquid\x1dFruit-and-vegetable\x1dCleaner\x1dValue-pack\x1dDish-soap'], 'query_terms': ['Clear\x1dShampoo', 'Jasmine\x1dGreen-Tea', 'Detergent\x1dHousehold'], 'query': ['Republic of China', 'Feature|Generation', 'The tool is very useful'], 'title': ['China', 'Feature|Transformation', 'The tool is useful']}
status: True
outputs: {'title_term_match_ratio': [0.20000000298023224, 0.20000000298023224, 0.20000000298023224], 'term_proximity_min_cover': [3.0, 0.0, 2.0], 'edit_distance': [12, 12, 11], 'query_title_match_ratio': [1.0, 0.5, 1.0], 'color_pair': ['red_white', 'pink_black', 'gray_pink'], 'goods_id': ['110', '111', '112'], 'current_price': [0.5, 0.25, 0.7799999713897705], 'usr_cate1_clk_cnt_1d': [1.0, 3.0, 5.0], 'recommend_match': [1.0, 1.0, 0.0]}
------------------------ debug log ---------------------------
input data: ['cate1:c1 | color:white | current_price:0.5 | goods_id:110 | query:Republic of China | query_color:red | query_recommend:High-quality | query_terms:Clear\x1dShampoo | recommend:High-quality | title:China | title_terms:Clear\x1dMen\x1dShampoo\x1dAnti-dandruff\x1dRefreshing\x1dShampoo-cream\x1dMen\x1dVitality\x1dSport\x1d100G | usr_cate1_clk_cnt_1d:c1:1\x1dc2:13\x1dc3:5', 'cate1:c2 | color:black | current_price:0.25 | goods_id:111 | query:Feature|Generation | query_color:pink | query_recommend:Brand | query_terms:Jasmine\x1dGreen-Tea | recommend:Brand | title:Feature|Transformation | title_terms:Master-Kong\x1dJasmine\x1dHoney-Tea\x1d330ml*12\x1dbottles | usr_cate1_clk_cnt_1d:c1:5\x1dc2:3\x1dc4:4.5', 'cate1:c3 | color:pink | current_price:0.78 | goods_id:112 | query:The tool is very useful | query_color:gray | query_recommend:Premium | query_terms:Detergent\x1dHousehold | recommend:Carefully-selected | title:The tool is useful | title_terms:Diao-Brand\x1dDetergent\x1dHousehold\x1dLarge-barrel\x1dFood-grade\x1dDishwashing-liquid\x1dFruit-and-vegetable\x1dCleaner\x1dValue-pack\x1dDish-soap | usr_cate1_clk_cnt_1d:c3:5\x1dc5:9\x1dc7:7']
generated feature: ['goods_id:110 | color_pair:red_white | current_price:0.5 | usr_cate1_clk_cnt_1d:1 | recommend_match:1 | query_title_match_ratio:1 | title_term_match_ratio:0.2 | term_proximity_min_cover:3 | edit_distance:12', 'goods_id:111 | color_pair:pink_black | current_price:0.25 | usr_cate1_clk_cnt_1d:3 | recommend_match:1 | query_title_match_ratio:0.5 | title_term_match_ratio:0.2 | term_proximity_min_cover:0 | edit_distance:12', 'goods_id:112 | color_pair:gray_pink | current_price:0.78 | usr_cate1_clk_cnt_1d:5 | recommend_match:0 | query_title_match_ratio:1 | title_term_match_ratio:0.2 | term_proximity_min_cover:2 | edit_distance:11']
2. Untuk layanan inferensi model yang menggunakan TorchEasyRec processor (PyTorch)
#!/usr/bin/env python
import pyfg
# Feature Generator (FG) configuration for the TorchEasyRec processor.
# Compared with a plain string id configuration, the id/combo features here
# also carry "hash_bucket_size", and numeric features use "float".
config = {
    "features": [
        {
            "feature_name": "goods_id",
            "feature_type": "id_feature",
            "value_type": "string",
            "expression": "item:goods_id",
            "default_value": "-1024",
            "need_prefix": False,
            "hash_bucket_size": 100000,
            "value_dimension": 1
        },
        {
            "feature_name": "color_pair",
            "feature_type": "combo_feature",
            "value_type": "string",
            "expression": ["user:query_color", "item:color"],
            "default_value": "",
            "need_prefix": False,
            "hash_bucket_size": 100000,
            "value_dimension": 1
        },
        {
            "feature_name": "current_price",
            "feature_type": "raw_feature",
            "value_type": "float",
            "expression": "item:current_price",
            "default_value": "0",
            "need_prefix": False
        },
        {
            "feature_name": "usr_cate1_clk_cnt_1d",
            "feature_type": "lookup_feature",
            "map": "user:usr_cate1_clk_cnt_1d",
            "key": "item:cate1",
            "need_discrete": False,
            "need_key": False,
            "default_value": "0",
            "combiner": "max",
            "need_prefix": False,
            "value_type": "float"
        },
        {
            "feature_name": "recommend_match",
            "feature_type": "overlap_feature",
            "method": "is_contain",
            "query": "user:query_recommend",
            "title": "item:recommend",
            "default_value": "0"
        },
        {
            "feature_name": "query_title_match_ratio",
            "feature_type": "overlap_feature",
            "method": "query_common_ratio",
            "query": "user:query_terms",
            "title": "item:title_terms",
            "default_value": "0"
        },
        {
            "feature_name": "title_term_match_ratio",
            "feature_type": "overlap_feature",
            "method": "title_common_ratio",
            "query": "user:query_terms",
            "title": "item:title_terms",
            "default_value": "0"
        },
        {
            "feature_name": "term_proximity_min_cover",
            "feature_type": "overlap_feature",
            "method": "proximity_min_cover",
            "query": "user:query_terms",
            "title": "item:title_terms",
            "default_value": "0"
        },
        {
            "feature_name": "edit_distance",
            "feature_type": "custom_feature",
            "operator_name": "EditDistance",
            "operator_lib_file": "pyfg/lib/libedit_distance.so",
            "expression": ["user:query", "item:title"],
            "default_value": "0",
            "value_type": "int32",
            "value_dimension": 1,
            "encoding": "utf-8",
            "normalizer": "method=expression,expr=x+10"
        }
    ]
}

if __name__ == '__main__':
    handler = pyfg.FgHandler(config)

    # Meta information derived from the configuration alone.
    print("------------------------ meta info ---------------------------")
    print("user side inputs:", handler.user_inputs())
    print("item side inputs:", handler.item_inputs())
    print("context side inputs:", handler.context_inputs())
    print("offline table schema:", handler.table_schema())
    features = handler.all_feature_names()
    print("all generated features:", features)

    # A batch of three samples: every input field is a list of length 3.
    # Term lists are encoded as one string joined by the \035 (0x1d) character.
    inputs = {
        "goods_id": ["110", "111", "112"],
        "query_color": ["red", "pink", "gray"],
        "color": ["white", "black", "pink"],
        "current_price": [0.5, 0.25, 0.78],
        "usr_cate1_clk_cnt_1d": [
            {"c1": 1, "c2": 13, "c3": 5},
            {"c1": 5, "c2": 3, "c4": 4.5},
            {"c7": 7, "c5": 9, "c3": 5}
        ],
        "cate1": ["c1", "c2", "c3"],
        "query_recommend": ["High-quality", "Brand", "Premium"],
        "recommend": ["High-quality", "Brand", "Carefully-selected"],
        "title_terms": [
            "Clear\035Men\035Shampoo\035Anti-dandruff\035Refreshing\035Shampoo-cream\035Men\035Vitality\035Sport\035100G",
            "Master-Kong\035Jasmine\035Honey-Tea\035330ml*12\035bottles",
            "Diao-Brand\035Detergent\035Household\035Large-barrel\035Food-grade\035Dishwashing-liquid\035Fruit-and-vegetable\035Cleaner\035Value-pack\035Dish-soap"
        ],
        "query_terms": [
            "Clear\035Shampoo",
            "Jasmine\035Green-Tea",
            "Detergent\035Household"
        ],
        "query": ["Republic of China", "Feature|Generation", "The tool is very useful"],
        "title": ["China", "Feature|Transformation", "The tool is useful"]
    }
    print("inputs", inputs)
    outputs, status = handler.process(inputs)  # For TorchRecProcessor
    print("status:", status.ok())

    # Sparse features expose `values` + `lengths`; dense ones expose
    # `dense_values`. Features in any other mode are not printed.
    print("------------------------ outputs ---------------------------")
    for feature in features:
        feat = outputs[feature]
        if feat.feat_mode in (pyfg.FeatMode.Sparse, pyfg.FeatMode.SeqSparse):
            print(feature, "values:", feat.values)
            print(feature, "lengths:", feat.lengths)
        elif feat.feat_mode in (pyfg.FeatMode.Dense, pyfg.FeatMode.SeqDense):
            print(feature, "values:", feat.dense_values)

    # debug log: input data & generated features
    print("------------------------ debug log ---------------------------")
    input_str = handler.to_input_str(inputs)
    print("input data:", input_str)
    print()
    generated_str = handler.to_debug_str(outputs, ',')
    print("generated feature:", generated_str)
Jalankan kode di atas. Output-nya sebagai berikut:
------------------------ meta info ---------------------------
user side inputs: {'query_terms', 'usr_cate1_clk_cnt_1d', 'query_color', 'query', 'query_recommend'}
item side inputs: {'cate1', 'color', 'goods_id', 'title_terms', 'title', 'recommend', 'current_price'}
context side inputs: set()
offline table schema: {'goods_id': 'bigint', 'color_pair': 'bigint', 'title_term_match_ratio': 'float', 'usr_cate1_clk_cnt_1d': 'float', 'term_proximity_min_cover': 'float', 'recommend_match': 'float', 'query_title_match_ratio': 'float', 'current_price': 'float', 'edit_distance': 'int'}
all generated features: ['goods_id', 'color_pair', 'current_price', 'usr_cate1_clk_cnt_1d', 'recommend_match', 'query_title_match_ratio', 'title_term_match_ratio', 'term_proximity_min_cover', 'edit_distance']
inputs {'goods_id': ['110', '111', '112'], 'query_color': ['red', 'pink', 'gray'], 'color': ['white', 'black', 'pink'], 'current_price': [0.5, 0.25, 0.78], 'usr_cate1_clk_cnt_1d': [{'c1': 1, 'c2': 13, 'c3': 5}, {'c1': 5, 'c2': 3, 'c4': 4.5}, {'c7': 7, 'c5': 9, 'c3': 5}], 'cate1': ['c1', 'c2', 'c3'], 'query_recommend': ['High-quality', 'Brand', 'Premium'], 'recommend': ['High-quality', 'Brand', 'Carefully-selected'], 'title_terms': ['Clear\x1dMen\x1dShampoo\x1dAnti-dandruff\x1dRefreshing\x1dShampoo-cream\x1dMen\x1dVitality\x1dSport\x1d100G', 'Master-Kong\x1dJasmine\x1dHoney-Tea\x1d330ml*12\x1dbottles', 'Diao-Brand\x1dDetergent\x1dHousehold\x1dLarge-barrel\x1dFood-grade\x1dDishwashing-liquid\x1dFruit-and-vegetable\x1dCleaner\x1dValue-pack\x1dDish-soap'], 'query_terms': ['Clear\x1dShampoo', 'Jasmine\x1dGreen-Tea', 'Detergent\x1dHousehold'], 'query': ['Republic of China', 'Feature|Generation', 'The tool is very useful'], 'title': ['China', 'Feature|Transformation', 'The tool is useful']}
status: True
------------------------ outputs ---------------------------
goods_id values: [89031, 84826, 50041]
goods_id lengths: [1, 1, 1]
color_pair values: [82277, 85822, 86290]
color_pair lengths: [1, 1, 1]
current_price values: [[0.5 ]
[0.25]
[0.78]]
usr_cate1_clk_cnt_1d values: [[1.]
[3.]
[5.]]
recommend_match values: [[1.]
[1.]
[0.]]
query_title_match_ratio values: [[1. ]
[0.5]
[1. ]]
title_term_match_ratio values: [[0.2]
[0.2]
[0.2]]
term_proximity_min_cover values: [[3.]
[0.]
[2.]]
edit_distance values: [[12.]
[12.]
[11.]]
------------------------ debug log ---------------------------
input data: ['cate1:c1 | color:white | current_price:0.5 | goods_id:110 | query:Republic of China | query_color:red | query_recommend:High-quality | query_terms:Clear\x1dShampoo | recommend:High-quality | title:China | title_terms:Clear\x1dMen\x1dShampoo\x1dAnti-dandruff\x1dRefreshing\x1dShampoo-cream\x1dMen\x1dVitality\x1dSport\x1d100G | usr_cate1_clk_cnt_1d:c1:1\x1dc2:13\x1dc3:5', 'cate1:c2 | color:black | current_price:0.25 | goods_id:111 | query:Feature|Generation | query_color:pink | query_recommend:Brand | query_terms:Jasmine\x1dGreen-Tea | recommend:Brand | title:Feature|Transformation | title_terms:Master-Kong\x1dJasmine\x1dHoney-Tea\x1d330ml*12\x1dbottles | usr_cate1_clk_cnt_1d:c1:5\x1dc2:3\x1dc4:4.5', 'cate1:c3 | color:pink | current_price:0.78 | goods_id:112 | query:The tool is very useful | query_color:gray | query_recommend:Premium | query_terms:Detergent\x1dHousehold | recommend:Carefully-selected | title:The tool is useful | title_terms:Diao-Brand\x1dDetergent\x1dHousehold\x1dLarge-barrel\x1dFood-grade\x1dDishwashing-liquid\x1dFruit-and-vegetable\x1dCleaner\x1dValue-pack\x1dDish-soap | usr_cate1_clk_cnt_1d:c3:5\x1dc5:9\x1dc7:7']
generated feature: ['goods_id:89031 | color_pair:82277 | current_price:0.5 | usr_cate1_clk_cnt_1d:1 | recommend_match:1 | query_title_match_ratio:1 | title_term_match_ratio:0.2 | term_proximity_min_cover:3 | edit_distance:12', 'goods_id:84826 | color_pair:85822 | current_price:0.25 | usr_cate1_clk_cnt_1d:3 | recommend_match:1 | query_title_match_ratio:0.5 | title_term_match_ratio:0.2 | term_proximity_min_cover:0 | edit_distance:12', 'goods_id:50041 | color_pair:86290 | current_price:0.78 | usr_cate1_clk_cnt_1d:5 | recommend_match:0 | query_title_match_ratio:1 | title_term_match_ratio:0.2 | term_proximity_min_cover:2 | edit_distance:11']
Jalankan pyfg di MaxCompute
Untuk informasi selengkapnya tentang cara menjalankan pyfg di DataWorks, lihat Gunakan FG dalam tugas offline.
Ikuti langkah-langkah berikut untuk mengirimkan tugas secara lokal ke platform MaxCompute:
Instal paket pyfg101
Di lingkungan Python 3.7, jalankan perintah berikut:
pip install http://tzrec.oss-cn-beijing.aliyuncs.com/third_party/pyfg101-1.0.1-cp37-cp37m-linux_x86_64.whl
Jalankan pyfg
#!/usr/bin/env python
from pyfg101 import run_on_odps
# Offline FG task definition. The first three positional arguments are
# passed as written; '${input_table}' and '${output_table}' look like
# placeholders to be replaced with real table names, and 'fg.json' is
# presumably the FG configuration file -- confirm against the pyfg101 docs.
fg_task = run_on_odps.FgTask(
    '${input_table}',
    '${output_table}',
    'fg.json',
    batch_size=128,
    force_delete_output_table=True,
    force_update_resource=False,
    output_merged_str=False,
    debug=False)
# Extra SQL setting attached to the task before it is run.
fg_task.add_sql_setting('odps.stage.mapper.split.size', 256)

if __name__ == '__main__':
    import os
    from odps import ODPS

    # Credentials are read from environment variables instead of being
    # hard-coded in the script.
    odps = ODPS(
        os.getenv('ALIBABA_CLOUD_ACCESS_KEY_ID'),
        os.getenv('ALIBABA_CLOUD_ACCESS_KEY_SECRET'),
        project='pai_rec_test_dev',
        endpoint='http://service.cn-beijing.maxcompute.aliyun.com/api',
    )
    fg_task.run(odps)