File indexing completed on 2026-04-09 07:58:21
0001
0002 import json
0003 import hashlib
0004 import os
0005 import sys
0006 import time
0007 import traceback
0008
0009 import numpy as np
0010
0011 from sklearn.metrics import roc_curve, auc, confusion_matrix, brier_score_loss, mean_squared_error, log_loss, roc_auc_score
0012 from sklearn import metrics
0013 from sklearn.preprocessing import label_binarize
0014 from sklearn.neighbors import KNeighborsClassifier
0015
0016 from sklearn.model_selection import cross_val_score
0017 from sklearn.model_selection import KFold
0018 from sklearn.preprocessing import LabelEncoder
0019 from sklearn import model_selection
0020 from sklearn.metrics import roc_curve, auc, confusion_matrix
0021 from sklearn.model_selection import train_test_split
0022
0023 from sklearn.preprocessing import StandardScaler
0024 from sklearn.model_selection import StratifiedKFold
0025
0026 from tabulate import tabulate
0027
0028 import xgboost as xgb
0029
0030 from bayes_opt import BayesianOptimization, UtilityFunction
0031
0032
def load_data(workdir='/opt/bdt_0409/ttHyyML_had_single', analysis_type='had'):
    """Load training data from an external analysis working directory.

    Temporarily switches cwd and sys.path to *workdir* so the project-local
    module ``load_data_real_auto`` can be imported, then restores both.

    Parameters:
        workdir: directory containing ``load_data_real_auto``.
        analysis_type: 'had' for hadronic or 'lep' for leptonic data.

    Returns:
        (data, label) as produced by the project loader.

    Raises:
        ValueError: for an unknown *analysis_type* (previously this fell
        through and crashed with an unrelated NameError on ``data``).
    """
    currentDir = os.path.dirname(os.path.realpath(__file__))

    print("CurrentDir: %s" % currentDir)
    print("WorkDir: %s" % workdir)
    workdir = os.path.abspath(workdir)
    print("Absolute WorkDir: %s" % workdir)
    os.chdir(workdir)
    sys.path.insert(0, workdir)
    # The project loader parses sys.argv; give it a neutral argv.
    sys.argv = ['test']

    try:
        if analysis_type == 'had':
            from load_data_real_auto import load_data_real_hadronic
            data, label = load_data_real_hadronic()
        elif analysis_type == 'lep':
            from load_data_real_auto import load_data_real_leptonic
            data, label = load_data_real_leptonic()
        else:
            raise ValueError("Unknown analysis_type: %r (expected 'had' or 'lep')" % analysis_type)
    finally:
        # Always restore sys.path and cwd, even if the loader raises.
        sys.path.remove(workdir)
        os.chdir(currentDir)
    return data, label
0053
0054
def get_param(params, name, default):
    """Return ``params[name]`` when available, otherwise *default*.

    A falsy *params* (None or empty dict) always yields *default*.
    """
    if not params:
        return default
    return params.get(name, default)
0059
0060
def getAUC(y_test, score, y_val_weight=None):
    """Return the (optionally sample-weighted) ROC AUC of *score* vs *y_test*.

    Bug fix: ``if y_val_weight:`` raised "truth value of an array is
    ambiguous" whenever the weights were a numpy array; test against None
    explicitly instead.
    """
    print(y_test.shape)
    print(score.shape)
    if y_val_weight is not None:
        print(y_val_weight.shape)
    return roc_auc_score(y_test, score, sample_weight=y_val_weight)
0069
0070
def getBrierScore(y_test, score):
    """Return ``1 - brier_score_loss`` so that larger values are better."""
    loss = brier_score_loss(y_test, score)
    return 1 - loss
0073
0074
def evaluateBrierScore(y_pred, data):
    """Custom xgboost eval metric: (name, 1 - Brier score loss).

    *y_pred* are the predicted probabilities; *data* is the DMatrix whose
    ``get_label()`` gives the true labels.

    Bug fix: arguments to ``brier_score_loss`` were swapped — the sklearn
    signature is ``brier_score_loss(y_true, y_prob)`` (as used correctly in
    ``getBrierScore``), so the true labels must come first.
    """
    label = data.get_label()
    return 'brierLoss', 1 - brier_score_loss(label, y_pred)
0078
0079
def getRMSE(y_test, score):
    """Return the root-mean-squared error between *y_test* and *score*."""
    mse = mean_squared_error(y_test, score)
    return mse ** 0.5
0082
0083
def getLogLoss(y_test, score):
    """Return the logistic (cross-entropy) loss of *score* vs *y_test*."""
    value = log_loss(y_test, score)
    return value
0086
0087
def xgb_callback_save_model(model_name, period=1000):
    """Build an xgboost training callback that checkpoints the booster.

    The returned callback saves the model to *model_name* every *period*
    iterations; any failure is logged (traceback printed) and swallowed so
    training is never interrupted by a checkpointing problem.
    """
    def callback(env):
        try:
            booster = env.model
            iteration = env.iteration
            _ = env.end_iteration
            if iteration % period == 0:
                booster.save_model(model_name)
        except Exception:
            print(traceback.format_exc())
    return callback
0097
0098
def train_bdt(input_x, input_y, params=None, retMethod=None, hist=True, saveModel=False, input_weight=None):
    """Train an XGBoost BDT and evaluate it on the validation split.

    Parameters:
        input_x: (train, val) pair of feature arrays; each is flattened
            to 2-D (n_samples, n_features) before building the DMatrix.
        input_y: (y_train, y_val) pair of label arrays.
        params: optional xgboost parameter dict; defaults are filled in
            for 'num_boost_round', 'objective' and 'silent'.
        retMethod: if one of 'auc'/'rmse'/'brier'/'logloss', return only
            that scalar validation metric; otherwise return (score, history).
        hist: use the fast histogram tree method with lossguide growth.
        saveModel: if True, train with a periodic checkpoint callback.
        input_weight: optional (train_weight, val_weight) sample weights.

    Returns:
        A scalar metric when *retMethod* is set, else a tuple of the
        validation score array and a Keras-style history dict.

    Raises:
        RuntimeError: if training is interrupted before any booster is
        produced (previously this crashed with a NameError on ``bst``).
    """
    train, val = input_x
    y_train_cat, y_val_cat = input_y
    if input_weight:
        y_train_weight, y_val_weight = input_weight
    else:
        y_train_weight = None
        y_val_weight = None

    # xgboost expects 2-D input: collapse any trailing feature dimensions.
    train = train.reshape((train.shape[0], -1))
    val = val.reshape((val.shape[0], -1))

    dTrain = xgb.DMatrix(train, label=y_train_cat, weight=y_train_weight)
    dVal = xgb.DMatrix(val, label=y_val_cat, weight=y_val_weight)

    print('Train model.')

    # Fill in defaults without clobbering caller-supplied values.
    if not params:
        params = {}
    params.setdefault('num_boost_round', 100000)
    params.setdefault('objective', 'binary:logistic')
    params.setdefault('silent', 0)

    if hist:
        params['tree_method'] = 'hist'
        params['grow_policy'] = 'lossguide'
    params['nthread'] = 4
    params['booster'] = 'gbtree'

    start = time.time()
    evallist = [(dTrain, 'train'), (dVal, 'eval')]
    evals_result = {}

    bst = None
    try:
        # Checkpoint path: "<model>temp" when a model path was supplied,
        # otherwise the default location.
        checkpoint = params['model'] + "temp" if 'model' in params else 'models/default_bdt_temp.h5'
        save_model_callback = xgb_callback_save_model(checkpoint)

        if not saveModel:
            bst = xgb.train(params, dTrain, params['num_boost_round'], evals=evallist, early_stopping_rounds=10, evals_result=evals_result, verbose_eval=True)
        else:
            bst = xgb.train(params, dTrain, params['num_boost_round'], evals=evallist, early_stopping_rounds=10, evals_result=evals_result, callbacks=[save_model_callback])
    except KeyboardInterrupt:
        print('Finishing on SIGINT.')
    print("CPU Training time: %s" % (time.time() - start))

    if bst is None:
        # Interrupted before xgb.train returned a booster; fail clearly
        # instead of letting the evaluation below raise a NameError.
        raise RuntimeError('Training was interrupted before a model was built.')

    print('Test model.')

    score = bst.predict(dVal)
    rmse = None
    logloss = None
    if params.get('num_class') == 3:
        # 3-class case: report one-vs-rest metrics per class.
        y_val_cat_binary = label_binarize(y_val_cat, classes=[0, 1, 2])
        aucValue = getAUC(y_val_cat_binary[:, 0], score[:, 0])
        aucValue1 = getAUC(y_val_cat_binary[:, 1], score[:, 1])
        aucValue2 = getAUC(y_val_cat_binary[:, 2], score[:, 2])

        print("AUC: %s, %s, %s" % (aucValue, aucValue1, aucValue2))

        bslValue = getBrierScore(y_val_cat_binary[:, 0], score[:, 0])
        bslValue1 = getBrierScore(y_val_cat_binary[:, 1], score[:, 1])
        bslValue2 = getBrierScore(y_val_cat_binary[:, 2], score[:, 2])

        print("BrierScoreLoss: %s, %s, %s" % (bslValue, bslValue1, bslValue2))

        rmse = getRMSE(y_val_cat_binary[:, 0], score[:, 0])
        logloss = getLogLoss(y_val_cat_binary[:, 0], score[:, 0])
    else:
        aucValue = getAUC(y_val_cat, score)
        bslValue = getBrierScore(y_val_cat, score)
        # Metrics as last recorded by xgboost itself; use .get() so an
        # interrupted/empty evals_result doesn't raise a KeyError.  The
        # local is named xgb_auc to avoid shadowing sklearn.metrics.auc.
        eval_hist = evals_result.get('eval', {})
        xgb_rmse = eval_hist['rmse'][-1] if 'rmse' in eval_hist else None
        xgb_logloss = eval_hist['logloss'][-1] if 'logloss' in eval_hist else None
        xgb_auc = eval_hist['auc'][-1] if 'auc' in eval_hist else None
        print("params: %s #, Val AUC: %s, BrierScoreLoss: %s, xgboost rmse: %s, xgboost logloss: %s, xgboost auc: %s" % (params, aucValue, bslValue, xgb_rmse, xgb_logloss, xgb_auc))
        # Recompute rmse/logloss directly from the predictions; these are
        # the values returned when retMethod asks for them.
        rmse = getRMSE(y_val_cat, score)
        logloss = getLogLoss(y_val_cat, score)
        print("params: %s #, Val AUC: %s, BrierScoreLoss: %s, Val rmse: %s, Val logloss: %s" % (params, aucValue, bslValue, rmse, logloss))

    # Feature importance by split count.
    print(bst.get_fscore())

    try:
        history = {'loss': evals_result['train']['logloss'], 'val_loss': evals_result['eval']['logloss'],
                   'acc': evals_result['train']['auc'], 'val_acc': evals_result['eval']['auc']}
    except Exception:
        # 'auc'/'logloss' may be absent depending on eval_metric; fall back
        # to an empty history rather than failing the whole run.
        print(traceback.format_exc())
        history = {}

    if retMethod == 'auc':
        return aucValue
    if retMethod == 'rmse':
        return rmse
    if retMethod == 'brier':
        return bslValue
    if retMethod == 'logloss':
        return logloss
    return score, history
0237
0238
def evaluate_bdt(input_x, input_y, opt_params, retMethod=None, hist=True, saveModel=False, input_weight=None, **kwargs):
    """Adapter used by the Bayesian optimizer: train one BDT and return its metric.

    Keyword arguments proposed by the optimizer arrive as floats; parameters
    that xgboost requires to be integral are cast, fixed *opt_params* are
    merged in (without overriding proposals), and an eval_metric list is
    chosen so that *retMethod* is the last (early-stopping) metric.
    """
    params = kwargs if kwargs else {}

    # The optimizer proposes floats; these must be integers for xgboost.
    for int_key in ('max_depth', 'num_boost_round', 'seed', 'max_bin'):
        if params and int_key in params:
            params[int_key] = int(params[int_key])

    # Merge fixed parameters, never overriding optimizer proposals.
    if opt_params:
        for opt in opt_params:
            params.setdefault(opt, opt_params[opt])

    # The target metric goes last so early stopping tracks it.
    metric_lists = {
        'auc': ['rmse', 'logloss', 'auc'],
        'logloss': ['rmse', 'auc', 'logloss'],
        'rmse': ['logloss', 'auc', 'rmse'],
    }
    if retMethod:
        params['eval_metric'] = metric_lists.get(retMethod, [retMethod])

    print(params)
    result = train_bdt(input_x, input_y, params=params, retMethod=retMethod, hist=hist, saveModel=saveModel, input_weight=input_weight)
    print("params: %s, ret: %s" % (params, result))
    return result
0271
0272
def optimize_bdt(input_x, input_y, opt_params, opt_method='auc', opt_ranges=None, hist=True, input_weight=None):
    """Bayesian-optimize BDT hyper-parameters with ``BayesianOptimization.maximize``.

    Parameters:
        input_x, input_y: (train, val) feature and label pairs, forwarded
            to ``evaluate_bdt``.
        opt_params: fixed xgboost parameters merged into every trial.
        opt_method: metric to maximize ('auc', 'logloss', ...).
        opt_ranges: optional {name: (lo, hi)} search bounds overriding the
            defaults.  Bug fix: this argument was previously accepted but
            silently ignored, so caller-supplied bounds never took effect.
        hist: use the histogram tree method.
        input_weight: optional sample weights forwarded to ``evaluate_bdt``.

    Returns:
        The optimizer's best point (``bayesopt.max``).
    """
    # Default search bounds for each tunable hyper-parameter.
    eval_params = {
        'colsample_bytree': (0.1, 1),
        'scale_pos_weight': (0, 10),
        'max_delta_step': (0, 10),
        'seed': (1, 50),
        'min_child_weight': (0, 100),
        'subsample': (0.1, 1),
        'eta': (0, 0.1),
        'alpha': (0, 1),
        'max_depth': (0, 50),
        'gamma': (0, 1),
    }
    if opt_ranges:
        # Caller-supplied bounds override the defaults.
        for name, bounds in opt_ranges.items():
            eval_params[name] = tuple(bounds)

    print("Eval: %s" % eval_params)
    optFunc = lambda **z: evaluate_bdt(input_x, input_y, opt_params, opt_method, hist=hist, saveModel=False, input_weight=input_weight, **z)
    bayesopt = BayesianOptimization(optFunc, eval_params)

    bayesopt.maximize(init_points=3, n_iter=5)

    p = bayesopt.max
    print("Best params: %s" % p)
    return p
0332
0333
def optimize():
    """Load the analysis data and run the maximize()-based optimisation."""
    (train, val), (y_train_cat, y_val_cat) = load_data()

    search_ranges = {
        "subsample": [0.10131415926000001, 1],
        "eta": [0.0100131415926, 0.03],
        "colsample_bytree": [0.10131415926000001, 1],
        "gamma": [0.00131415926, 1],
        "alpha": [0.00131415926, 1],
        "max_delta_step": [0.00131415926, 10],
        "max_depth": [5.00131415926, 50],
        "min_child_weight": [0.00131415926, 100],
    }
    fixed_params = {'num_boost_round': 10}

    optimize_bdt(input_x=[train, val], input_y=[y_train_cat, y_val_cat],
                 opt_params=fixed_params, opt_ranges=search_ranges,
                 opt_method='auc', hist=True, input_weight=None)
0352
0353
def get_unique_id_for_dict(dict_):
    """Return a stable SHA-1 hex digest identifying *dict_*.

    Keys are sorted during JSON serialization, so two dicts with the same
    contents map to the same id regardless of insertion order.
    """
    canonical = json.dumps(dict_, sort_keys=True)
    return hashlib.sha1(canonical.encode()).hexdigest()
0357
0358
def optimize_bdt1(input_x, input_y, opt_params, opt_method='auc', opt_ranges=None, hist=True, input_weight=None):
    """Manual suggest/evaluate/register Bayesian optimisation loop.

    Unlike ``optimize_bdt`` this drives the optimizer step by step
    (suggest -> evaluate -> register), which allows the evaluations to be
    tracked (and potentially distributed) individually.

    Parameters:
        input_x, input_y: (train, val) feature and label pairs.
        opt_params: fixed xgboost parameters merged into every trial.
        opt_method: metric to maximize.
        opt_ranges: optional {name: (lo, hi)} search bounds overriding the
            defaults.  Bug fix: this argument was previously accepted but
            silently ignored.
        hist: use the histogram tree method.
        input_weight: optional sample weights.

    Returns:
        The optimizer's best point (``bayesopt.max``).
    """
    # Default search bounds for each tunable hyper-parameter.
    eval_params = {
        'colsample_bytree': (0.1, 1),
        'scale_pos_weight': (0, 10),
        'max_delta_step': (0, 10),
        'seed': (1, 50),
        'min_child_weight': (0, 100),
        'subsample': (0.1, 1),
        'eta': (0, 0.1),
        'alpha': (0, 1),
        'max_depth': (0, 50),
        'gamma': (0, 1),
    }
    if opt_ranges:
        # Caller-supplied bounds override the defaults.
        for name, bounds in opt_ranges.items():
            eval_params[name] = tuple(bounds)

    print("Eval: %s" % eval_params)
    optFunc = lambda **z: evaluate_bdt(input_x, input_y, opt_params, opt_method, hist=hist, saveModel=False, input_weight=input_weight, **z)
    # Reuse the shared constructor instead of duplicating the UtilityFunction setup.
    bayesopt, util = get_bayesian_optimizer_and_util(optFunc, eval_params)

    n_iterations, n_points_per_iteration = 3, 5
    for _ in range(n_iterations):
        points = {}
        for _ in range(n_points_per_iteration):
            x_probe = bayesopt.suggest(util)
            u_id = get_unique_id_for_dict(x_probe)
            print('x_probe (%s): %s' % (u_id, x_probe))
            points[u_id] = {'kwargs': x_probe}
            ret = evaluate_bdt(input_x, input_y, opt_params, opt_method, hist=hist, saveModel=False, input_weight=input_weight, **x_probe)
            print('ret :%s' % ret)
            points[u_id]['ret'] = ret
            # Feed the observation back so the next suggest() improves.
            bayesopt.register(x_probe, ret)

    print(bayesopt.res)
    p = bayesopt.max
    print("Best params: %s" % p)
    return p
0404
0405
def get_bayesian_optimizer_and_util(func, opt_params):
    """Construct a BayesianOptimization instance plus a UCB acquisition function.

    Parameters:
        func: objective callable accepting the search parameters as kwargs.
        opt_params: {name: (lo, hi)} bounds for each search dimension.

    Returns:
        (optimizer, utility) pair ready for suggest()/register() loops.
    """
    utility = UtilityFunction(kind='ucb', kappa=2.576, xi=0.0,
                              kappa_decay=1, kappa_decay_delay=0)
    optimizer = BayesianOptimization(func, opt_params)
    return optimizer, utility
0414
0415
def optimize1():
    """Load the analysis data and run the manual suggest/register optimisation."""
    (train, val), (y_train_cat, y_val_cat) = load_data()

    search_ranges = {
        "subsample": [0.10131415926000001, 1],
        "eta": [0.0100131415926, 0.03],
        "colsample_bytree": [0.10131415926000001, 1],
        "gamma": [0.00131415926, 1],
        "alpha": [0.00131415926, 1],
        "max_delta_step": [0.00131415926, 10],
        "max_depth": [5.00131415926, 50],
        "min_child_weight": [0.00131415926, 100],
    }
    fixed_params = {'num_boost_round': 10}

    optimize_bdt1(input_x=[train, val], input_y=[y_train_cat, y_val_cat],
                  opt_params=fixed_params, opt_ranges=search_ranges,
                  opt_method='auc', hist=True, input_weight=None)
0434
0435
# Script entry point: run the suggest/register-loop optimisation workflow.
if __name__ == '__main__':
    optimize1()