# File indexing completed on 2026-04-10 07:58:46
0002 import json
0003 import hashlib
0004 import os
0005 import sys
0006 import time
0007 import traceback
0008
0009 import numpy as np
0010
0011 from sklearn.metrics import roc_curve, auc, confusion_matrix, brier_score_loss, mean_squared_error, log_loss, roc_auc_score
0012 from sklearn import metrics
0013 from sklearn.preprocessing import label_binarize
0014 from sklearn.neighbors import KNeighborsClassifier
0015
0016 from sklearn.model_selection import cross_val_score
0017 from sklearn.model_selection import KFold
0018 from sklearn.preprocessing import LabelEncoder
0019 from sklearn import model_selection
0020 from sklearn.metrics import roc_curve, auc, confusion_matrix
0021 from sklearn.model_selection import train_test_split
0022
0023 from sklearn.preprocessing import StandardScaler
0024 from sklearn.model_selection import StratifiedKFold
0025
0026 from tabulate import tabulate
0027
0028 import xgboost as xgb
0029
0030 from bayes_opt import BayesianOptimization, UtilityFunction
0031
0032
def load_data(workdir='/opt/bdt_0409/ttHyyML_had_single', analysis_type='had'):
    """Load (data, label) for the ttH-yy analysis from a project work directory.

    Temporarily chdirs into *workdir* and prepends it to ``sys.path`` so the
    project-local ``load_data_real_auto`` module can be imported, then restores
    both — now in a ``finally`` block so the process state is repaired even if
    the loader raises.

    Parameters
    ----------
    workdir : directory containing ``load_data_real_auto.py``.
    analysis_type : 'had' for hadronic or 'lep' for leptonic selection.

    Returns
    -------
    (data, label) as produced by the project loader.

    Raises
    ------
    ValueError
        For an unknown *analysis_type* (previously this fell through and
        crashed with a NameError on the undefined ``data``).
    """
    currentDir = os.path.dirname(os.path.realpath(__file__))

    print("CurrentDir: %s" % currentDir)
    print("WorkDir: %s" % workdir)
    workdir = os.path.abspath(workdir)
    print("Absolute WorkDir: %s" % workdir)
    os.chdir(workdir)
    sys.path.insert(0, workdir)
    # The project loader parses sys.argv; give it a harmless dummy argv.
    sys.argv = ['test']

    try:
        if analysis_type == 'had':
            from load_data_real_auto import load_data_real_hadronic
            data, label = load_data_real_hadronic()
        elif analysis_type == 'lep':
            from load_data_real_auto import load_data_real_leptonic
            data, label = load_data_real_leptonic()
        else:
            raise ValueError("Unknown analysis_type: %r (expected 'had' or 'lep')" % analysis_type)
    finally:
        # Always undo the path/cwd mutations, even when loading fails.
        sys.path.remove(workdir)
        os.chdir(currentDir)
    return data, label
0053
0054
def get_param(params, name, default):
    """Look up *name* in the mapping *params*.

    Falls back to *default* when *params* is falsy (None/empty) or does not
    contain the key.
    """
    if not params:
        return default
    return params.get(name, default)
0059
0060
def getAUC(y_test, score, y_val_weight=None):
    """Return the (optionally sample-weighted) ROC AUC of *score* vs *y_test*.

    Both inputs are expected to be numpy arrays (their ``.shape`` is printed
    for debugging).

    Fix: the weight check is now ``is not None`` — the previous truthiness
    test raised "truth value of an array is ambiguous" whenever a numpy
    weight array was supplied.
    """
    print(y_test.shape)
    print(score.shape)
    if y_val_weight is not None:
        print(y_val_weight.shape)
    return roc_auc_score(y_test, score, sample_weight=y_val_weight)
0069
0070
def getBrierScore(y_test, score):
    """Return 1 minus the Brier score loss, so that higher is better."""
    loss = brier_score_loss(y_test, score)
    return 1 - loss
0073
0074
def evaluateBrierScore(y_pred, data):
    """Custom xgboost eval function: 1 - Brier score loss (higher is better).

    *data* is an ``xgb.DMatrix``; its labels are the ground truth.

    Fix: ``brier_score_loss`` expects ``(y_true, y_prob)`` — the arguments
    were previously passed swapped as ``(y_pred, label)``.
    """
    label = data.get_label()
    return 'brierLoss', 1 - brier_score_loss(label, y_pred)
0078
0079
def getRMSE(y_test, score):
    """Return the root-mean-squared error between *y_test* and *score*."""
    mse = mean_squared_error(y_test, score)
    return mse ** 0.5
0082
0083
def getLogLoss(y_test, score):
    """Return the cross-entropy (logistic) loss of *score* against *y_test*."""
    return log_loss(y_test, score)
0086
0087
def xgb_callback_save_model(model_name, period=1000):
    """Build a legacy-style xgboost training callback that periodically
    snapshots the booster.

    The returned callback saves ``env.model`` to *model_name* whenever the
    current iteration is a multiple of *period*. Any failure is logged and
    swallowed so that training itself is never aborted by a snapshot error.

    Fix: the unused unpack of ``env.end_iteration`` has been dropped, so the
    callback no longer requires that attribute on the environment object.
    """
    def callback(env):
        try:
            booster = env.model
            iteration = env.iteration
            if iteration % period == 0:
                booster.save_model(model_name)
        except Exception:
            # Best-effort snapshot: log and keep training.
            print(traceback.format_exc())

    return callback
0097
0098
def train_bdt(input_x, input_y, params=None, retMethod=None, hist=True, saveModel=False, input_weight=None):
    """Train an xgboost BDT on pre-split data and evaluate it on the validation set.

    Parameters
    ----------
    input_x : pair of (train, val) feature arrays; flattened to 2-D below.
    input_y : pair of (train, val) label arrays.
    params : optional dict of xgboost parameters; defaults filled in below.
    retMethod : when set, return just the named validation metric
        ('auc', 'rmse', 'brier' or 'logloss') instead of (score, history).
    hist : enable histogram tree method with loss-guided growth.
    saveModel : train with the periodic save-model callback instead of
        verbose per-round evaluation output.
    input_weight : optional pair of (train, val) per-event weight arrays.

    Returns
    -------
    A single float when *retMethod* is given, otherwise a tuple of
    (validation prediction scores, keras-style history dict).
    """

    train, val = input_x
    y_train_cat, y_val_cat = input_y
    # NOTE(review): truthiness test assumes input_weight is a list/tuple or
    # None — a bare numpy array here would raise on bool().
    if input_weight:
        y_train_weight, y_val_weight = input_weight
    else:
        y_train_weight = None
        y_val_weight = None

    # Flatten trailing feature dimensions: (n_events, ...) -> (n_events, n_features).
    train = train.reshape((train.shape[0], -1))
    val = val.reshape((val.shape[0], -1))

    dTrain = xgb.DMatrix(train, label=y_train_cat, weight=y_train_weight)
    dVal = xgb.DMatrix(val, label=y_val_cat, weight=y_val_weight)

    print('Train model.')

    # Placeholder for extra defaults merged into params below (currently empty).
    def_params = {}
    if not params:
        params = {}
    if 'num_boost_round' not in params:
        # Effectively "until early stopping": training halts 10 rounds after
        # the eval metric stops improving.
        params['num_boost_round'] = 100000
    if 'objective' not in params:
        params['objective'] = 'binary:logistic'

    for key in def_params:
        if key not in params:
            params[key] = def_params[key]

    if 'silent' not in params:
        params['silent'] = 0

    if hist:
        # Histogram-based split finding with loss-guided (leaf-wise) growth.
        params['tree_method'] = 'hist'
        params['booster'] = 'gbtree'
        params['grow_policy'] = 'lossguide'
    params['nthread'] = 4
    # NOTE(review): unconditionally forces gbtree, overriding any
    # caller-supplied 'booster' value.
    params['booster'] = 'gbtree'

    start = time.time()
    evallist = [(dTrain, 'train'), (dVal, 'eval')]
    evals_result = {}  # filled by xgb.train with per-set metric histories

    try:
        save_model_callback = xgb_callback_save_model(params['model']+"temp" if params and 'model' in params else 'models/default_bdt_temp.h5')

        if not saveModel:
            bst = xgb.train(params, dTrain, params['num_boost_round'], evals=evallist, early_stopping_rounds=10, evals_result=evals_result, verbose_eval=True)
        else:
            bst = xgb.train(params, dTrain, params['num_boost_round'], evals=evallist, early_stopping_rounds=10, evals_result=evals_result, callbacks=[save_model_callback])

    except KeyboardInterrupt:
        # Allow Ctrl-C to end training early. NOTE(review): if interrupted
        # before xgb.train returns, `bst` is undefined and predict() below
        # will raise NameError.
        print('Finishing on SIGINT.')
    print("CPU Training time: %s" % (time.time() - start))

    print('Test model.')

    score = bst.predict(dVal)
    rmse = None
    logloss = None
    if 'num_class' in params and params['num_class'] == 3:
        # Three-class case: one-vs-rest metrics per class on binarized labels.
        y_val_cat_binary = label_binarize(y_val_cat, classes=[0, 1, 2])
        aucValue = getAUC(y_val_cat_binary[:, 0], score[:, 0])
        aucValue1 = getAUC(y_val_cat_binary[:, 1], score[:, 1])
        aucValue2 = getAUC(y_val_cat_binary[:, 2], score[:, 2])

        print("AUC: %s, %s, %s" % (aucValue, aucValue1, aucValue2))

        bslValue = getBrierScore(y_val_cat_binary[:, 0], score[:, 0])
        bslValue1 = getBrierScore(y_val_cat_binary[:, 1], score[:, 1])
        bslValue2 = getBrierScore(y_val_cat_binary[:, 2], score[:, 2])

        print("BrierScoreLoss: %s, %s, %s" % (bslValue, bslValue1, bslValue2))

        # Scalar rmse/logloss reported for class 0 only.
        rmse = getRMSE(y_val_cat_binary[:, 0], score[:, 0])
        logloss = getLogLoss(y_val_cat_binary[:, 0], score[:, 0])
    else:
        # Binary case: metrics straight from validation predictions.
        aucValue = getAUC(y_val_cat, score)
        bslValue = getBrierScore(y_val_cat, score)
        rmse = None
        logloss = None
        # NOTE(review): local `auc` shadows sklearn's `auc` imported at
        # module level (harmless here, but confusing).
        auc = None
        if 'auc' in evals_result['eval']:
            auc = evals_result['eval']['auc'][-1]
        if 'rmse' in evals_result['eval']:
            rmse = evals_result['eval']['rmse'][-1]
        if 'logloss' in evals_result['eval']:
            logloss = evals_result['eval']['logloss'][-1]
        print("params: %s #, Val AUC: %s, BrierScoreLoss: %s, xgboost rmse: %s, xgboost logloss: %s, xgboost auc: %s" % (params, aucValue, bslValue, rmse, logloss, auc))
        # Recompute rmse/logloss with sklearn on the final predictions; these
        # overwrite the xgboost-logged values and feed the retMethod returns.
        rmse = getRMSE(y_val_cat, score)
        logloss = getLogLoss(y_val_cat, score)
        print("params: %s #, Val AUC: %s, BrierScoreLoss: %s, Val rmse: %s, Val logloss: %s" % (params, aucValue, bslValue, rmse, logloss))

    # Feature importance by split count.
    print(bst.get_fscore())

    try:
        pass  # placeholder (e.g. for plotting/exporting), intentionally empty

    except Exception:
        print(traceback.format_exc())

    try:
        # Keras-style history from the logged metric curves; requires that
        # both 'logloss' and 'auc' were among the eval metrics, otherwise
        # the KeyError is caught and history stays empty.
        history = {'loss': evals_result['train']['logloss'], 'val_loss': evals_result['eval']['logloss'],
                   'acc': evals_result['train']['auc'], 'val_acc': evals_result['eval']['auc']}
    except Exception:
        print(traceback.format_exc())
        history = {}

    if retMethod:
        if retMethod == 'auc':
            return aucValue
        if retMethod == 'rmse':
            return rmse
        if retMethod == 'brier':
            return bslValue
        if retMethod == 'logloss':
            return logloss
    return score, history
0235
0236
def evaluate_bdt(input_x, input_y, opt_params, retMethod=None, hist=True, saveModel=False, input_weight=None, **kwargs):
    """Evaluate one hyperparameter point by training a BDT and returning its metric.

    *kwargs* is the sampled point from the Bayesian optimizer; *opt_params*
    carries fixed settings that are merged in without overriding sampled
    values. Returns whatever train_bdt returns for *retMethod*.
    """
    params = kwargs or {}

    # The optimizer proposes floats; xgboost needs these to be integers.
    for int_key in ('max_depth', 'num_boost_round', 'seed', 'max_bin'):
        if int_key in params:
            params[int_key] = int(params[int_key])

    # Merge fixed settings; sampled values take precedence.
    for name, value in (opt_params or {}).items():
        params.setdefault(name, value)

    # Log all three metrics, with the one we optimize listed last (xgboost
    # uses the last metric for early stopping).
    metric_order = {
        'auc': ['rmse', 'logloss', 'auc'],
        'logloss': ['rmse', 'auc', 'logloss'],
        'rmse': ['logloss', 'auc', 'rmse'],
    }
    if retMethod:
        params['eval_metric'] = metric_order.get(retMethod, [retMethod])

    print(params)
    result = train_bdt(input_x, input_y, params=params, retMethod=retMethod, hist=hist, saveModel=saveModel, input_weight=input_weight)
    print("params: %s, ret: %s" % (params, result))
    return result
0269
0270
def optimize_bdt(input_x, input_y, opt_params, opt_method='auc', opt_ranges=None, hist=True, input_weight=None):
    """Bayesian-optimize xgboost hyperparameters with bayes_opt's built-in loop.

    Parameters
    ----------
    input_x, input_y : (train, val) feature and label pairs passed through to
        evaluate_bdt.
    opt_params : fixed xgboost settings merged into every sampled point.
    opt_method : metric to maximize ('auc', 'logloss', 'rmse', ...).
    opt_ranges : optional dict of parameter -> (low, high) search bounds.
        Fix: this argument was previously accepted but silently ignored;
        it now overrides the built-in default ranges when provided.
    hist : use the histogram tree method.
    input_weight : optional per-event weights.

    Returns
    -------
    The optimizer's best point (``bayesopt.max``) — previously only printed.
    """
    # Default search space, used when the caller supplies no ranges.
    default_ranges = {
        'colsample_bytree': (0.1, 1),
        'scale_pos_weight': (0, 10),
        'max_delta_step': (0, 10),
        'seed': (1, 50),
        'min_child_weight': (0, 100),
        'subsample': (0.1, 1),
        'eta': (0, 0.1),
        'alpha': (0, 1),
        'max_depth': (0, 50),
        'gamma': (0, 1),
    }
    eval_params = dict(opt_ranges) if opt_ranges else default_ranges
    # (Two large hard-coded "explore" dicts were removed: they were never used.)

    print("Eval: %s" % eval_params)

    def opt_func(**point):
        # Objective seen by the optimizer: train once, return the metric.
        return evaluate_bdt(input_x, input_y, opt_params, opt_method, hist=hist, saveModel=False, input_weight=input_weight, **point)

    bayesopt = BayesianOptimization(opt_func, eval_params)
    bayesopt.maximize(init_points=3, n_iter=5)

    best = bayesopt.max
    print("Best params: %s" % best)
    return best
0330
0331
def optimize():
    """Entry point: load the hadronic-analysis data and run the
    BayesianOptimization.maximize-based hyperparameter scan."""
    (train, val), (y_train_cat, y_val_cat) = load_data()

    ranges = {
        "subsample": [0.10131415926000001, 1],
        "eta": [0.0100131415926, 0.03],
        "colsample_bytree": [0.10131415926000001, 1],
        "gamma": [0.00131415926, 1],
        "alpha": [0.00131415926, 1],
        "max_delta_step": [0.00131415926, 10],
        "max_depth": [5.00131415926, 50],
        "min_child_weight": [0.00131415926, 100],
    }

    optimize_bdt(input_x=[train, val], input_y=[y_train_cat, y_val_cat],
                 opt_params={'num_boost_round': 10}, opt_ranges=ranges,
                 opt_method='auc', hist=True, input_weight=None)
0350
0351
def get_unique_id_for_dict(dict_):
    """Return a stable SHA-1 hex digest identifying *dict_*.

    Keys are sorted before serialization, so two dicts with the same
    key/value pairs always map to the same id.
    """
    canonical = json.dumps(dict_, sort_keys=True)
    return hashlib.sha1(canonical.encode()).hexdigest()
0355
0356
def optimize_bdt1(input_x, input_y, opt_params, opt_method='auc', opt_ranges=None, hist=True, input_weight=None):
    """Bayesian-optimize xgboost hyperparameters with a manual
    suggest/evaluate/register loop (instead of ``maximize``), which allows
    inspecting each probed point.

    Parameters mirror optimize_bdt.

    Fixes: *opt_ranges* was previously accepted but silently ignored — it now
    overrides the default search bounds; the unused per-iteration ``points``
    bookkeeping dict was removed; the best point is returned instead of only
    printed.
    """
    default_ranges = {
        'colsample_bytree': (0.1, 1),
        'scale_pos_weight': (0, 10),
        'max_delta_step': (0, 10),
        'seed': (1, 50),
        'min_child_weight': (0, 100),
        'subsample': (0.1, 1),
        'eta': (0, 0.1),
        'alpha': (0, 1),
        'max_depth': (0, 50),
        'gamma': (0, 1),
    }
    eval_params = dict(opt_ranges) if opt_ranges else default_ranges

    print("Eval: %s" % eval_params)

    def opt_func(**point):
        return evaluate_bdt(input_x, input_y, opt_params, opt_method, hist=hist, saveModel=False, input_weight=input_weight, **point)

    bayesopt = BayesianOptimization(opt_func, eval_params)
    # Upper-confidence-bound acquisition with the library's standard kappa.
    util = UtilityFunction(kind='ucb',
                           kappa=2.576,
                           xi=0.0,
                           kappa_decay=1,
                           kappa_decay_delay=0)

    n_iterations, n_points_per_iteration = 3, 5
    for _ in range(n_iterations):
        for _ in range(n_points_per_iteration):
            x_probe = bayesopt.suggest(util)
            u_id = get_unique_id_for_dict(x_probe)
            print('x_probe (%s): %s' % (u_id, x_probe))
            ret = evaluate_bdt(input_x, input_y, opt_params, retMethod=opt_method, hist=hist, saveModel=False, input_weight=input_weight, **x_probe)
            print('ret :%s' % ret)
            # Feed the observation back so the next suggest() improves.
            bayesopt.register(x_probe, ret)

    print(bayesopt.res)
    best = bayesopt.max
    print("Best params: %s" % best)
    return best
0402
0403
def optimize1():
    """Entry point: load the hadronic-analysis data and run the manual
    suggest/evaluate/register optimization loop."""
    (train, val), (y_train_cat, y_val_cat) = load_data()

    ranges = {
        "subsample": [0.10131415926000001, 1],
        "eta": [0.0100131415926, 0.03],
        "colsample_bytree": [0.10131415926000001, 1],
        "gamma": [0.00131415926, 1],
        "alpha": [0.00131415926, 1],
        "max_delta_step": [0.00131415926, 10],
        "max_depth": [5.00131415926, 50],
        "min_child_weight": [0.00131415926, 100],
    }

    optimize_bdt1(input_x=[train, val], input_y=[y_train_cat, y_val_cat],
                  opt_params={'num_boost_round': 10}, opt_ranges=ranges,
                  opt_method='auc', hist=True, input_weight=None)
0422
0423
# Script entry point: run the manual suggest/register optimization loop.
if __name__ == '__main__':
    optimize1()