# -*- coding: utf-8 -*-
"""local integration test case, used to test the correctness of an algorithm and recognize handwritten digits by using scikit-learn."""
from __future__ import absolute_import
from __future__ import division
from __future__ import print_function
from __future__ import unicode_literals
import argparse
import logging
import os
import numpy as np
# Import datasets, classifiers and performance metrics
from sklearn import datasets
from sklearn import ensemble
from sklearn import metrics
import joblib
logger = logging.getLogger()
def run_sk_digits(inputs):
    """Recognize handwritten digits with a scikit-learn gradient-boosting classifier.

    Args:
        inputs: parsed argparse namespace carrying the hyper-parameters
            (n_estimators, max_depth, min_samples_split, learning_rate),
            trial bookkeeping (trial_id, checkpoint_id), data controls
            (ratio, used_data_percent) and output paths
            (dump_file, metric_file).

    Raises:
        ValueError: if ``used_data_percent`` is outside (0, 1].

    Side effects:
        Dumps the fitted model to ``inputs.dump_file`` (joblib) and writes
        one per-boosting-stage metric value per line to ``inputs.metric_file``.
    """
    # Fake checkpoint restore: only logs the intended copy, no real I/O.
    if inputs.checkpoint_id >= 0:
        logger.info('copy model from {} to {}'.format(inputs.checkpoint_id,
                                                      inputs.trial_id))
    np.random.seed(0)
    # The digits dataset
    digits = datasets.load_digits()
    # To apply a classifier on this data, we need to flatten the image, to
    # turn the data in a (samples, feature) matrix:
    n_samples = len(digits.images)
    data = digits.images.reshape((n_samples, -1))
    # Keep only the leading `ratio` fraction of the dataset.
    n_samples = int(n_samples * inputs.ratio)
    data = data[:n_samples]
    digits.target = digits.target[:n_samples]
    # First half trains, second half evaluates.
    offset = n_samples // 2
    x_train, y_train = data[:offset], digits.target[:offset]
    x_test, y_test = data[offset:], digits.target[offset:]
    params = {
        'n_estimators': inputs.n_estimators,
        'max_depth': inputs.max_depth,
        'min_samples_split': inputs.min_samples_split,
        'learning_rate': inputs.learning_rate,
        # NOTE(review): 'deviance' was renamed 'log_loss' in scikit-learn>=1.1
        # and removed in 1.3 — confirm the pinned sklearn version.
        'loss': 'deviance'
    }
    clf = ensemble.GradientBoostingClassifier(**params)
    # Validate with a real exception: `assert` is stripped under `python -O`.
    if not 0.0 < inputs.used_data_percent <= 1.0:
        raise ValueError('used_data_percent must be in (0, 1], got {}'.format(
            inputs.used_data_percent))
    used_data = int(np.ceil(inputs.used_data_percent * len(x_train)))
    # BUG FIX: `used_data` was computed but never applied, so
    # --used_data_percent silently had no effect; train only on the
    # requested fraction of the training set.
    clf.fit(x_train[:used_data], y_train[:used_data])
    # TODO(weidan): We should allocate dirs (name) for each trial in the TrialManager.
    metric_file = inputs.metric_file
    dump_file = inputs.dump_file
    # Create parent dirs for both outputs. Skip makedirs('') when the path
    # has no directory component; exist_ok avoids a create/create race
    # between concurrent trials.
    for out_path in (metric_file, dump_file):
        dirname = os.path.dirname(out_path)
        if dirname:
            os.makedirs(dirname, exist_ok=True)
    logger.info('saving to: {}, {}'.format(metric_file, dump_file))
    joblib.dump(clf, dump_file)
    #{"iteration": 0,"metric":"metrics,string","total_processed_sample":10000}
    with open(metric_file, 'w') as f:
        # One metric line per boosting stage: the second-to-last token of the
        # report's last row, e.g.
        #   "avg / total  0.76  0.76  0.76  899"  ->  "0.76"
        for y_pred in clf.staged_predict(x_test):
            result = metrics.classification_report(y_test, y_pred, digits=4)
            result = result.rstrip().split('\n')[-1].split()[-2]
            f.write('%s\n' % result.strip())
if __name__ == '__main__':
    # Flag specs in help-text order: (flag, argparse keyword options).
    arg_specs = [
        ('--n_estimators', {'type': int}),
        ('--max_depth', {'type': int}),
        ('--min_samples_split', {'type': int}),
        ('--learning_rate', {'type': float}),
        ('--trial_id', {'type': str}),
        ('--used_data_percent', {'type': float, 'default': 1.0}),
        ('--checkpoint_id', {'type': int, 'default': -1}),
        ('--dump_file', {'type': str}),
        ('--metric_file', {'type': str}),
        ('--ratio', {'type': float, 'default': 1.0}),
    ]
    cli = argparse.ArgumentParser()
    for flag, opts in arg_specs:
        cli.add_argument(flag, **opts)
    parsed_args = cli.parse_args()
    logging.basicConfig(level=logging.INFO)
    run_sk_digits(parsed_args)