使用本文介绍的方法来本地测试你的FG配置是否正确。
以下文档是帮助用户理解FG的运行逻辑,可以在Linux环境中根据自己的业务需求来测试配置文件、输入输出是否符合预期。
安装pyfg
在Python 3.11环境下,执行如下命令:
pip install http://tzrec.oss-cn-beijing.aliyuncs.com/third_party/pyfg-1.0.1-cp311-cp311-linux_x86_64.whl
python 3.10 的安装包路径:http://tzrec.oss-cn-beijing.aliyuncs.com/third_party/pyfg-1.0.1-cp310-cp310-linux_x86_64.whl
python 3.12 的安装包路径:http://tzrec.oss-cn-beijing.aliyuncs.com/third_party/pyfg-1.0.1-cp312-cp312-linux_x86_64.whl
执行FG
1. 对应使用EasyRec Processor的模型推理服务(TensorFlow)
#!/usr/bin/env python
import os
import pyfg
# Feature-generation (FG) configuration used by this example.
# Each entry of "features" describes one generated feature; references such as
# "user:<field>" / "item:<field>" name the input field the feature reads.
config = {
    "features": [
        {
            "feature_name": "goods_id",
            "feature_type": "id_feature",
            "value_type": "string",
            "expression": "item:goods_id",
            "default_value": "-1024",
            "need_prefix": False,
            "value_dimension": 1
        },
        {
            "feature_name": "color_pair",
            "feature_type": "combo_feature",
            "value_type": "string",
            "expression": ["user:query_color", "item:color"],
            "default_value": "",
            "need_prefix": False,
            "value_dimension": 1
        },
        {
            "feature_name": "current_price",
            "feature_type": "raw_feature",
            "value_type": "double",
            "expression": "item:current_price",
            "default_value": "0",
            "need_prefix": False
        },
        {
            "feature_name": "usr_cate1_clk_cnt_1d",
            "feature_type": "lookup_feature",
            "map": "user:usr_cate1_clk_cnt_1d",
            "key": "item:cate1",
            "need_discrete": False,
            "need_key": False,
            "default_value": "0",
            "combiner": "max",
            "need_prefix": False,
            "value_type": "double"
        },
        {
            "feature_name": "recommend_match",
            "feature_type": "overlap_feature",
            "method": "is_contain",
            "query": "user:query_recommend",
            "title": "item:recommend",
            "default_value": "0"
        },
        {
            "feature_name": "query_title_match_ratio",
            "feature_type": "overlap_feature",
            "method": "query_common_ratio",
            "query": "user:query_terms",
            "title": "item:title_terms",
            "default_value": "0"
        },
        {
            "feature_name": "title_term_match_ratio",
            "feature_type": "overlap_feature",
            "method": "title_common_ratio",
            "query": "user:query_terms",
            "title": "item:title_terms",
            "default_value": "0"
        },
        {
            "feature_name": "term_proximity_min_cover",
            "feature_type": "overlap_feature",
            "method": "proximity_min_cover",
            "query": "user:query_terms",
            "title": "item:title_terms",
            "default_value": "0"
        },
        # Custom operator feature. It is required for the documented sample
        # output, which lists `edit_distance` among the generated features and
        # `query` / `title` among the inputs. The definition mirrors the one
        # used in the TorchEasyRec example of this document.
        {
            "feature_name": "edit_distance",
            "feature_type": "custom_feature",
            "operator_name": "EditDistance",
            "operator_lib_file": "pyfg/lib/libedit_distance.so",
            "expression": ["user:query", "item:title"],
            "default_value": "0",
            "value_type": "int32",
            "value_dimension": 1,
            "encoding": "utf-8",
            "normalizer": "method=expression,expr=x+10"
        }
    ]
}

if __name__ == '__main__':
    handler = pyfg.FgHandler(config)

    # Meta information the handler derives from the config.
    print("------------------------ meta info ---------------------------")
    print("user side inputs:", handler.user_inputs())
    print("item side inputs:", handler.item_inputs())
    print("context side inputs:", handler.context_inputs())
    print("offline table schema:", handler.table_schema())
    features = handler.all_feature_names()
    print("all generated features:", features)

    # A batch of three samples: every input field maps to a list holding one
    # value per sample. "\035" separates the terms of a multi-value string.
    inputs = {
        "goods_id": ["110", "111", "112"],
        "query_color": ["red", "pink", "gray"],
        "color": ["white", "black", "pink"],
        "current_price": [0.5, 0.25, 0.78],
        "usr_cate1_clk_cnt_1d": [
            {"c1": 1, "c2": 13, "c3": 5},
            {"c1": 5, "c2": 3, "c4": 4.5},
            {"c7": 7, "c5": 9, "c3": 5}
        ],
        "cate1": ["c1", "c2", "c3"],
        "query_recommend": ["精品", "品牌", "优质"],
        "recommend": ["精品", "品牌", "严选"],
        "title_terms": [
            "清扬\035男士\035洗发水\035去屑\035清爽\035洗发膏\035男士\035活力\035运动\035100G",
            "康师傅\035茉莉\035蜜茶\035330ml*12\035瓶",
            "雕牌\035洗洁精\035家用\035大桶\035食品级\035洗碗液\035果蔬\035清洗剂\035实惠装\035洗碟精"
        ],
        "query_terms": [
            "清扬\035洗发水",
            "茉莉\035清茶",
            "洗洁精\035家用"
        ],
        # Inputs consumed by the `edit_distance` feature.
        "query": ["中华民国", "特征|生成", "工具真好用"],
        "title": ["中国", "特征|变换", "工具好用"]
    }
    # Echo the raw inputs; the documented sample output contains this line.
    print("inputs", inputs)

    # Calling the handler directly returns the generated features together
    # with a status object.
    outputs, status = handler(inputs)
    print("status:", status.ok())
    print("outputs:", outputs)

    # debug log: input data & generated features
    print("------------------------ debug log ---------------------------")
    input_str = handler.to_input_str(inputs)
    print("input data:", input_str)
    print()
    generated_str = handler.to_debug_str_v2(outputs)
    print("generated feature:", generated_str)
输出结果如下,其中,value_dimension用来控制输出的维度。
------------------------ meta info ---------------------------
user side inputs: {'usr_cate1_clk_cnt_1d', 'query_terms', 'query_recommend', 'query_color', 'query'}
item side inputs: {'title_terms', 'title', 'recommend', 'goods_id', 'color', 'cate1', 'current_price'}
context side inputs: set()
offline table schema: {'usr_cate1_clk_cnt_1d': 'double', 'query_title_match_ratio': 'float', 'current_price': 'double', 'term_proximity_min_cover': 'float', 'title_term_match_ratio': 'float', 'edit_distance': 'int', 'recommend_match': 'float', 'goods_id': 'string', 'color_pair': 'string'}
all generated features: ['goods_id', 'color_pair', 'current_price', 'usr_cate1_clk_cnt_1d', 'recommend_match', 'query_title_match_ratio', 'title_term_match_ratio', 'term_proximity_min_cover', 'edit_distance']
inputs {'goods_id': ['110', '111', '112'], 'query_color': ['red', 'pink', 'gray'], 'color': ['white', 'black', 'pink'], 'current_price': [0.5, 0.25, 0.78], 'usr_cate1_clk_cnt_1d': [{'c1': 1, 'c2': 13, 'c3': 5}, {'c1': 5, 'c2': 3, 'c4': 4.5}, {'c7': 7, 'c5': 9, 'c3': 5}], 'cate1': ['c1', 'c2', 'c3'], 'query_recommend': ['精品', '品牌', '优质'], 'recommend': ['精品', '品牌', '严选'], 'title_terms': ['清扬\x1d男士\x1d洗发水\x1d去屑\x1d清爽\x1d洗发膏\x1d男士\x1d活力\x1d运动\x1d100G', '康师傅\x1d茉莉\x1d蜜茶\x1d330ml*12\x1d瓶', '雕牌\x1d洗洁精\x1d家用\x1d大桶\x1d食品级\x1d洗碗液\x1d果蔬\x1d清洗剂\x1d实惠装\x1d洗碟精'], 'query_terms': ['清扬\x1d洗发水', '茉莉\x1d清茶', '洗洁精\x1d家用'], 'query': ['中华民国', '特征|生成', '工具真好用'], 'title': ['中国', '特征|变换', '工具好用']}
status: True
outputs: {'title_term_match_ratio': [0.20000000298023224, 0.20000000298023224, 0.20000000298023224], 'term_proximity_min_cover': [3.0, 0.0, 2.0], 'edit_distance': [12, 12, 11], 'query_title_match_ratio': [1.0, 0.5, 1.0], 'color_pair': ['red_white', 'pink_black', 'gray_pink'], 'goods_id': ['110', '111', '112'], 'current_price': [0.5, 0.25, 0.7799999713897705], 'usr_cate1_clk_cnt_1d': [1.0, 3.0, 5.0], 'recommend_match': [1.0, 1.0, 0.0]}
------------------------ debug log ---------------------------
input data: ['cate1:c1 | color:white | current_price:0.5 | goods_id:110 | query:中华民国 | query_color:red | query_recommend:精品 | query_terms:清扬\x1d洗发水 | recommend:精品 | title:中国 | title_terms:清扬\x1d男士\x1d洗发水\x1d去屑\x1d清爽\x1d洗发膏\x1d男士\x1d活力\x1d运动\x1d100G | usr_cate1_clk_cnt_1d:c1:1\x1dc2:13\x1dc3:5', 'cate1:c2 | color:black | current_price:0.25 | goods_id:111 | query:特征|生成 | query_color:pink | query_recommend:品牌 | query_terms:茉莉\x1d清茶 | recommend:品牌 | title:特征|变换 | title_terms:康师傅\x1d茉莉\x1d蜜茶\x1d330ml*12\x1d瓶 | usr_cate1_clk_cnt_1d:c1:5\x1dc2:3\x1dc4:4.5', 'cate1:c3 | color:pink | current_price:0.78 | goods_id:112 | query:工具真好用 | query_color:gray | query_recommend:优质 | query_terms:洗洁精\x1d家用 | recommend:严选 | title:工具好用 | title_terms:雕牌\x1d洗洁精\x1d家用\x1d大桶\x1d食品级\x1d洗碗液\x1d果蔬\x1d清洗剂\x1d实惠装\x1d洗碟精 | usr_cate1_clk_cnt_1d:c3:5\x1dc5:9\x1dc7:7']
generated feature: ['goods_id:110 | color_pair:red_white | current_price:0.5 | usr_cate1_clk_cnt_1d:1 | recommend_match:1 | query_title_match_ratio:1 | title_term_match_ratio:0.2 | term_proximity_min_cover:3 | edit_distance:12', 'goods_id:111 | color_pair:pink_black | current_price:0.25 | usr_cate1_clk_cnt_1d:3 | recommend_match:1 | query_title_match_ratio:0.5 | title_term_match_ratio:0.2 | term_proximity_min_cover:0 | edit_distance:12', 'goods_id:112 | color_pair:gray_pink | current_price:0.78 | usr_cate1_clk_cnt_1d:5 | recommend_match:0 | query_title_match_ratio:1 | title_term_match_ratio:0.2 | term_proximity_min_cover:2 | edit_distance:11']
2. 对应使用TorchEasyRec Processor的模型推理服务(PyTorch)
#!/usr/bin/env python
import pyfg
# Feature-generation (FG) configuration for the TorchEasyRec example.
# Each entry of "features" describes one generated feature; references such as
# "user:<field>" / "item:<field>" name the input field the feature reads.
config = {
    "features": [
        {
            "feature_name": "goods_id",
            "feature_type": "id_feature",
            "value_type": "string",
            "expression": "item:goods_id",
            "default_value": "-1024",
            "need_prefix": False,
            "hash_bucket_size": 100000,
            "value_dimension": 1
        },
        {
            "feature_name": "color_pair",
            "feature_type": "combo_feature",
            "value_type": "string",
            "expression": ["user:query_color", "item:color"],
            "default_value": "",
            "need_prefix": False,
            "hash_bucket_size": 100000,
            "value_dimension": 1
        },
        {
            "feature_name": "current_price",
            "feature_type": "raw_feature",
            "value_type": "float",
            "expression": "item:current_price",
            "default_value": "0",
            "need_prefix": False
        },
        {
            "feature_name": "usr_cate1_clk_cnt_1d",
            "feature_type": "lookup_feature",
            "map": "user:usr_cate1_clk_cnt_1d",
            "key": "item:cate1",
            "need_discrete": False,
            "need_key": False,
            "default_value": "0",
            "combiner": "max",
            "need_prefix": False,
            "value_type": "float"
        },
        {
            "feature_name": "recommend_match",
            "feature_type": "overlap_feature",
            "method": "is_contain",
            "query": "user:query_recommend",
            "title": "item:recommend",
            "default_value": "0"
        },
        {
            "feature_name": "query_title_match_ratio",
            "feature_type": "overlap_feature",
            "method": "query_common_ratio",
            "query": "user:query_terms",
            "title": "item:title_terms",
            "default_value": "0"
        },
        {
            "feature_name": "title_term_match_ratio",
            "feature_type": "overlap_feature",
            "method": "title_common_ratio",
            "query": "user:query_terms",
            "title": "item:title_terms",
            "default_value": "0"
        },
        {
            "feature_name": "term_proximity_min_cover",
            "feature_type": "overlap_feature",
            "method": "proximity_min_cover",
            "query": "user:query_terms",
            "title": "item:title_terms",
            "default_value": "0"
        },
        # Custom operator loaded from the shared library named below.
        {
            "feature_name": "edit_distance",
            "feature_type": "custom_feature",
            "operator_name": "EditDistance",
            "operator_lib_file": "pyfg/lib/libedit_distance.so",
            "expression": ["user:query", "item:title"],
            "default_value": "0",
            "value_type": "int32",
            "value_dimension": 1,
            "encoding": "utf-8",
            "normalizer": "method=expression,expr=x+10"
        }
    ]
}

if __name__ == '__main__':
    handler = pyfg.FgHandler(config)

    # Meta information the handler derives from the config.
    print("------------------------ meta info ---------------------------")
    print("user side inputs:", handler.user_inputs())
    print("item side inputs:", handler.item_inputs())
    print("context side inputs:", handler.context_inputs())
    print("offline table schema:", handler.table_schema())
    features = handler.all_feature_names()
    print("all generated features:", features)

    # A batch of three samples: every input field maps to a list holding one
    # value per sample. "\035" separates the terms of a multi-value string.
    inputs = {
        "goods_id": ["110", "111", "112"],
        "query_color": ["red", "pink", "gray"],
        "color": ["white", "black", "pink"],
        "current_price": [0.5, 0.25, 0.78],
        "usr_cate1_clk_cnt_1d": [
            {"c1": 1, "c2": 13, "c3": 5},
            {"c1": 5, "c2": 3, "c4": 4.5},
            {"c7": 7, "c5": 9, "c3": 5}
        ],
        "cate1": ["c1", "c2", "c3"],
        "query_recommend": ["精品", "品牌", "优质"],
        "recommend": ["精品", "品牌", "严选"],
        "title_terms": [
            "清扬\035男士\035洗发水\035去屑\035清爽\035洗发膏\035男士\035活力\035运动\035100G",
            "康师傅\035茉莉\035蜜茶\035330ml*12\035瓶",
            "雕牌\035洗洁精\035家用\035大桶\035食品级\035洗碗液\035果蔬\035清洗剂\035实惠装\035洗碟精"
        ],
        "query_terms": [
            "清扬\035洗发水",
            "茉莉\035清茶",
            "洗洁精\035家用"
        ],
        "query": ["中华民国", "特征|生成", "工具真好用"],
        "title": ["中国", "特征|变换", "工具好用"]
    }
    print("inputs", inputs)

    outputs, status = handler.process(inputs)  # corresponds to TorchRecProcessor
    print("status:", status.ok())

    # Sparse-mode features expose `values` plus per-sample `lengths`;
    # dense-mode features expose `dense_values`.
    print("------------------------ outputs ---------------------------")
    for feature in features:
        feat = outputs[feature]
        if feat.feat_mode in (pyfg.FeatMode.Sparse, pyfg.FeatMode.SeqSparse):
            print(feature, "values:", feat.values)
            print(feature, "lengths:", feat.lengths)
        elif feat.feat_mode in (pyfg.FeatMode.Dense, pyfg.FeatMode.SeqDense):
            print(feature, "values:", feat.dense_values)

    # debug log: input data & generated features
    print("------------------------ debug log ---------------------------")
    input_str = handler.to_input_str(inputs)
    print("input data:", input_str)
    print()
    generated_str = handler.to_debug_str(outputs, ',')
    print("generated feature:", generated_str)
执行上述代码,输出如下:
------------------------ meta info ---------------------------
user side inputs: {'query_terms', 'usr_cate1_clk_cnt_1d', 'query_color', 'query', 'query_recommend'}
item side inputs: {'cate1', 'color', 'goods_id', 'title_terms', 'title', 'recommend', 'current_price'}
context side inputs: set()
offline table schema: {'goods_id': 'bigint', 'color_pair': 'bigint', 'title_term_match_ratio': 'float', 'usr_cate1_clk_cnt_1d': 'float', 'term_proximity_min_cover': 'float', 'recommend_match': 'float', 'query_title_match_ratio': 'float', 'current_price': 'float', 'edit_distance': 'int'}
all generated features: ['goods_id', 'color_pair', 'current_price', 'usr_cate1_clk_cnt_1d', 'recommend_match', 'query_title_match_ratio', 'title_term_match_ratio', 'term_proximity_min_cover', 'edit_distance']
inputs {'goods_id': ['110', '111', '112'], 'query_color': ['red', 'pink', 'gray'], 'color': ['white', 'black', 'pink'], 'current_price': [0.5, 0.25, 0.78], 'usr_cate1_clk_cnt_1d': [{'c1': 1, 'c2': 13, 'c3': 5}, {'c1': 5, 'c2': 3, 'c4': 4.5}, {'c7': 7, 'c5': 9, 'c3': 5}], 'cate1': ['c1', 'c2', 'c3'], 'query_recommend': ['精品', '品牌', '优质'], 'recommend': ['精品', '品牌', '严选'], 'title_terms': ['清扬\x1d男士\x1d洗发水\x1d去屑\x1d清爽\x1d洗发膏\x1d男士\x1d活力\x1d运动\x1d100G', '康师傅\x1d茉莉\x1d蜜茶\x1d330ml*12\x1d瓶', '雕牌\x1d洗洁精\x1d家用\x1d大桶\x1d食品级\x1d洗碗液\x1d果蔬\x1d清洗剂\x1d实惠装\x1d洗碟精'], 'query_terms': ['清扬\x1d洗发水', '茉莉\x1d清茶', '洗洁精\x1d家用'], 'query': ['中华民国', '特征|生成', '工具真好用'], 'title': ['中国', '特征|变换', '工具好用']}
status: True
------------------------ outputs ---------------------------
goods_id values: [89031, 84826, 50041]
goods_id lengths: [1, 1, 1]
color_pair values: [82277, 85822, 86290]
color_pair lengths: [1, 1, 1]
current_price values: [[0.5 ]
[0.25]
[0.78]]
usr_cate1_clk_cnt_1d values: [[1.]
[3.]
[5.]]
recommend_match values: [[1.]
[1.]
[0.]]
query_title_match_ratio values: [[1. ]
[0.5]
[1. ]]
title_term_match_ratio values: [[0.2]
[0.2]
[0.2]]
term_proximity_min_cover values: [[3.]
[0.]
[2.]]
edit_distance values: [[12.]
[12.]
[11.]]
------------------------ debug log ---------------------------
input data: ['cate1:c1 | color:white | current_price:0.5 | goods_id:110 | query:中华民国 | query_color:red | query_recommend:精品 | query_terms:清扬\x1d洗发水 | recommend:精品 | title:中国 | title_terms:清扬\x1d男士\x1d洗发水\x1d去屑\x1d清爽\x1d洗发膏\x1d男士\x1d活力\x1d运动\x1d100G | usr_cate1_clk_cnt_1d:c1:1\x1dc2:13\x1dc3:5', 'cate1:c2 | color:black | current_price:0.25 | goods_id:111 | query:特征|生成 | query_color:pink | query_recommend:品牌 | query_terms:茉莉\x1d清茶 | recommend:品牌 | title:特征|变换 | title_terms:康师傅\x1d茉莉\x1d蜜茶\x1d330ml*12\x1d瓶 | usr_cate1_clk_cnt_1d:c1:5\x1dc2:3\x1dc4:4.5', 'cate1:c3 | color:pink | current_price:0.78 | goods_id:112 | query:工具真好用 | query_color:gray | query_recommend:优质 | query_terms:洗洁精\x1d家用 | recommend:严选 | title:工具好用 | title_terms:雕牌\x1d洗洁精\x1d家用\x1d大桶\x1d食品级\x1d洗碗液\x1d果蔬\x1d清洗剂\x1d实惠装\x1d洗碟精 | usr_cate1_clk_cnt_1d:c3:5\x1dc5:9\x1dc7:7']
generated feature: ['goods_id:89031 | color_pair:82277 | current_price:0.5 | usr_cate1_clk_cnt_1d:1 | recommend_match:1 | query_title_match_ratio:1 | title_term_match_ratio:0.2 | term_proximity_min_cover:3 | edit_distance:12', 'goods_id:84826 | color_pair:85822 | current_price:0.25 | usr_cate1_clk_cnt_1d:3 | recommend_match:1 | query_title_match_ratio:0.5 | title_term_match_ratio:0.2 | term_proximity_min_cover:0 | edit_distance:12', 'goods_id:50041 | color_pair:86290 | current_price:0.78 | usr_cate1_clk_cnt_1d:5 | recommend_match:0 | query_title_match_ratio:1 | title_term_match_ratio:0.2 | term_proximity_min_cover:2 | edit_distance:11']
在MaxCompute上执行pyfg
在DataWorks上执行pyfg的方法请参考在离线任务中使用FG。
下面介绍在本地提交任务到MaxCompute平台的步骤:
安装 pyfg101 包
在 Python 3.7 环境下,执行如下命令:
pip install http://tzrec.oss-cn-beijing.aliyuncs.com/third_party/pyfg101-1.0.1-cp37-cp37m-linux_x86_64.whl
执行pyfg
#!/usr/bin/env python
from pyfg101 import run_on_odps
# FG job to be submitted to MaxCompute.
# The three positional arguments are placeholders to replace before running;
# presumably the input table, the output table and the FG JSON config file
# (confirm against the pyfg101 documentation).
fg_task = run_on_odps.FgTask(
    '${input_table}',
    '${output_table}',
    'fg.json',
    batch_size=128,
    force_delete_output_table=True,
    force_update_resource=False,
    output_merged_str=False,
    debug=False)
# Extra SQL setting attached to the job.
fg_task.add_sql_setting('odps.stage.mapper.split.size', 256)

if __name__ == '__main__':
    import os

    from odps import ODPS

    # Credentials are read from environment variables so that they never
    # appear in the script itself.
    odps = ODPS(
        os.getenv('ALIBABA_CLOUD_ACCESS_KEY_ID'),
        os.getenv('ALIBABA_CLOUD_ACCESS_KEY_SECRET'),
        project='pai_rec_test_dev',
        endpoint='http://service.cn-beijing.maxcompute.aliyun.com/api',
    )
    fg_task.run(odps)