accumulators/statistics/peaks_over_threshold.hpp

0001 ///////////////////////////////////////////////////////////////////////////////
0002 // peaks_over_threshold.hpp
0003 //
0004 //  Copyright 2006 Daniel Egloff, Olivier Gygi. Distributed under the Boost
0005 //  Software License, Version 1.0. (See accompanying file
0006 //  LICENSE_1_0.txt or copy at http://www.boost.org/LICENSE_1_0.txt)
0007
0008 #ifndef BOOST_ACCUMULATORS_STATISTICS_PEAKS_OVER_THRESHOLD_HPP_DE_01_01_2006
0009 #define BOOST_ACCUMULATORS_STATISTICS_PEAKS_OVER_THRESHOLD_HPP_DE_01_01_2006
0010
0011 #include <vector>
0012 #include <limits>
0013 #include <numeric>
0014 #include <functional>
0015 #include <boost/config/no_tr1/cmath.hpp> // pow
0016 #include <sstream> // stringstream
0017 #include <stdexcept> // runtime_error
0018 #include <boost/throw_exception.hpp>
0019 #include <boost/range.hpp>
0020 #include <boost/mpl/if.hpp>
0021 #include <boost/mpl/int.hpp>
0022 #include <boost/mpl/placeholders.hpp>
0023 #include <boost/parameter/keyword.hpp>
0024 #include <boost/tuple/tuple.hpp>
0025 #include <boost/accumulators/accumulators_fwd.hpp>
0026 #include <boost/accumulators/framework/accumulator_base.hpp>
0027 #include <boost/accumulators/framework/extractor.hpp>
0028 #include <boost/accumulators/numeric/functional.hpp>
0029 #include <boost/accumulators/framework/parameters/sample.hpp>
0030 #include <boost/accumulators/framework/depends_on.hpp>
0031 #include <boost/accumulators/statistics_fwd.hpp>
0032 #include <boost/accumulators/statistics/parameters/quantile_probability.hpp>
0033 #include <boost/accumulators/statistics/count.hpp>
0034 #include <boost/accumulators/statistics/tail.hpp>
0035
0036 #ifdef _MSC_VER
0037 # pragma warning(push)
0038 # pragma warning(disable: 4127) // conditional expression is constant
0039 #endif
0040
0041 namespace boost { namespace accumulators
0042 {
0043
0044 ///////////////////////////////////////////////////////////////////////////////
0045 // threshold_probability and threshold named parameters
0046 //
0047 BOOST_PARAMETER_NESTED_KEYWORD(tag, pot_threshold_value, threshold_value)
0048 BOOST_PARAMETER_NESTED_KEYWORD(tag, pot_threshold_probability, threshold_probability)
0049
0050 BOOST_ACCUMULATORS_IGNORE_GLOBAL(pot_threshold_value)
0051 BOOST_ACCUMULATORS_IGNORE_GLOBAL(pot_threshold_probability)
0052
0053 namespace impl
0054 {
0055     ///////////////////////////////////////////////////////////////////////////////
0056     // peaks_over_threshold_impl
0057     //  works with an explicit threshold value and does not depend on order statistics
0058     /**
0059         @brief Peaks over Threshold Method for Quantile and Tail Mean Estimation
0060
0061         According to the theorem of Pickands-Balkema-de Haan, the distribution function \f$F_u(x)\f$ of
0062         the excesses \f$x\f$ over some sufficiently high threshold \f$u\f$ of a distribution function \f$F(x)\f$
0063         may be approximated by a generalized Pareto distribution
0064         \f[
0065             G_{\xi,\beta}(x) =
0066             \left\{
0067             \begin{array}{ll}
0068                 \beta^{-1}\left(1+\frac{\xi x}{\beta}\right)^{-1/\xi-1} & \textrm{if }\xi\neq0\\
0069                 \beta^{-1}\exp\left(-\frac{x}{\beta}\right) & \textrm{if }\xi=0,
0070             \end{array}
0071             \right.
0072         \f]
0073         with suitable parameters \f$\xi\f$ and \f$\beta\f$ that can be estimated, e.g., with the method of moments, cf.
0074         Hosking and Wallis (1987),
0075         \f[
0076             \begin{array}{lll}
0077             \hat{\xi} & = & \frac{1}{2}\left[1-\frac{(\hat{\mu}-u)^2}{\hat{\sigma}^2}\right]\\
0078             \hat{\beta} & = & \frac{\hat{\mu}-u}{2}\left[\frac{(\hat{\mu}-u)^2}{\hat{\sigma}^2}+1\right],
0079             \end{array}
0080         \f]
0081         \f$\hat{\mu}\f$ and \f$\hat{\sigma}^2\f$ being the empirical mean and variance of the samples over
0082         the threshold \f$u\f$. Equivalently, the distribution function
0083         \f$F_u(x-u)\f$ of the exceedances \f$x-u\f$ can be approximated by
0084         \f$G_{\xi,\beta}(x-u)=G_{\xi,\beta,u}(x)\f$. Since for \f$x\geq u\f$ the distribution function \f$F(x)\f$
0085         can be written as
0086         \f[
0087             F(x) = [1 - \P(X \leq u)]F_u(x - u) + \P(X \leq u)
0088         \f]
0089         and the probability \f$\P(X \leq u)\f$ can be approximated by the empirical distribution function
0090         \f$F_n(u)\f$ evaluated at \f$u\f$, an estimator of \f$F(x)\f$ is given by
0091         \f[
0092             \widehat{F}(x) = [1 - F_n(u)]G_{\xi,\beta,u}(x) + F_n(u).
0093         \f]
0094         It can be shown that \f$\widehat{F}(x)\f$ is a generalized
0095         Pareto distribution \f$G_{\xi,\bar{\beta},\bar{u}}(x)\f$ with \f$\bar{\beta}=\beta[1-F_n(u)]^{\xi}\f$
0096         and \f$\bar{u}=u-\bar{\beta}\left\{[1-F_n(u)]^{-\xi}-1\right\}/\xi\f$. By inverting \f$\widehat{F}(x)\f$,
0097         one obtains an estimator for the \f$\alpha\f$-quantile,
0098         \f[
0099             \hat{q}_{\alpha} = \bar{u} + \frac{\bar{\beta}}{\xi}\left[(1-\alpha)^{-\xi}-1\right],
0100         \f]
0101         and similarly an estimator for the (coherent) tail mean,
0102         \f[
0103             \widehat{CTM}_{\alpha} = \hat{q}_{\alpha} - \frac{\bar{\beta}}{\xi-1}(1-\alpha)^{-\xi},
0104         \f]
0105         cf. McNeil and Frey (2000).
0106
0107         Note that in case extreme values of the left tail are fitted, the distribution is mirrored with respect to the
0108         \f$y\f$ axis such that the left tail can be treated as a right tail. The computed fit parameters thus define
0109         the Pareto distribution that fits the mirrored left tail. When quantities like a quantile or a tail mean are
0110         computed using the fit parameters obtained from the mirrored data, the result is mirrored back, yielding the
0111         correct result.
0112
0113         For further details, see
0114
0115         J. R. M. Hosking and J. R. Wallis, Parameter and quantile estimation for the generalized Pareto distribution,
0116         Technometrics, Volume 29, 1987, p. 339-349
0117
0118         A. J. McNeil and R. Frey, Estimation of Tail-Related Risk Measures for Heteroscedastic Financial Time Series:
0119         an Extreme Value Approach, Journal of Empirical Finance, Volume 7, 2000, p. 271-300
0120
0121         @param quantile_probability
0122         @param pot_threshold_value
0123     */
0124     template<typename Sample, typename LeftRight>
0125     struct peaks_over_threshold_impl
0126       : accumulator_base
0127     {
0128         typedef typename numeric::functional::fdiv<Sample, std::size_t>::result_type float_type;
0129         // for boost::result_of
0130         typedef boost::tuple<float_type, float_type, float_type> result_type;
0131         // for left tail fitting, mirror the extreme values
0132         typedef mpl::int_<is_same<LeftRight, left>::value ? -1 : 1> sign;
0133
0134         template<typename Args>
0135         peaks_over_threshold_impl(Args const &args)
0136           : Nu_(0)
0137           , mu_(sign::value * numeric::fdiv(args[sample | Sample()], (std::size_t)1))
0138           , sigma2_(numeric::fdiv(args[sample | Sample()], (std::size_t)1))
0139           , threshold_(sign::value * args[pot_threshold_value])
0140           , fit_parameters_(boost::make_tuple(0., 0., 0.))
0141           , is_dirty_(true)
0142         {
0143         }
0144
0145         template<typename Args>
0146         void operator ()(Args const &args)
0147         {
0148             this->is_dirty_ = true;
0149
0150             if (sign::value * args[sample] > this->threshold_)
0151             {
0152                 this->mu_ += args[sample];
0153                 this->sigma2_ += args[sample] * args[sample];
0154                 ++this->Nu_;
0155             }
0156         }
0157
0158         template<typename Args>
0159         result_type result(Args const &args) const
0160         {
0161             if (this->is_dirty_)
0162             {
0163                 this->is_dirty_ = false;
0164
0165                 std::size_t cnt = count(args);
0166
0167                 this->mu_ = sign::value * numeric::fdiv(this->mu_, this->Nu_);
0168                 this->sigma2_ = numeric::fdiv(this->sigma2_, this->Nu_);
0169                 this->sigma2_ -= this->mu_ * this->mu_;
0170
0171                 float_type threshold_probability = numeric::fdiv(cnt - this->Nu_, cnt);
0172
0173                 float_type tmp = numeric::fdiv(( this->mu_ - this->threshold_ )*( this->mu_ - this->threshold_ ), this->sigma2_);
0174                 float_type xi_hat = 0.5 * ( 1. - tmp );
0175                 float_type beta_hat = 0.5 * ( this->mu_ - this->threshold_ ) * ( 1. + tmp );
0176                 float_type beta_bar = beta_hat * std::pow(1. - threshold_probability, xi_hat);
0177                 float_type u_bar = this->threshold_ - beta_bar * ( std::pow(1. - threshold_probability, -xi_hat) - 1.)/xi_hat;
0178                 this->fit_parameters_ = boost::make_tuple(u_bar, beta_bar, xi_hat);
0179             }
0180
0181             return this->fit_parameters_;
0182         }
0183
0184         // make this accumulator serializeable
0185         // TODO: do we need to split to load/save and verify that threshold did not change?
0186         template<class Archive>
0187         void serialize(Archive & ar, const unsigned int file_version)
0188         {
0189             ar & Nu_;
0190             ar & mu_;
0191             ar & sigma2_;
0192             ar & threshold_;
0193             ar & get<0>(fit_parameters_);
0194             ar & get<1>(fit_parameters_);
0195             ar & get<2>(fit_parameters_);
0196             ar & is_dirty_;
0197         }
0198
0199     private:
0200         std::size_t Nu_;                     // number of samples larger than threshold
0201         mutable float_type mu_;              // mean of Nu_ largest samples
0202         mutable float_type sigma2_;          // variance of Nu_ largest samples
0203         float_type threshold_;
0204         mutable result_type fit_parameters_; // boost::tuple that stores fit parameters
0205         mutable bool is_dirty_;
0206     };
0207
0208     ///////////////////////////////////////////////////////////////////////////////
0209     // peaks_over_threshold_prob_impl
0210     //  determines threshold from a given threshold probability using order statistics
0211     /**
0212         @brief Peaks over Threshold Method for Quantile and Tail Mean Estimation
0213
0214         @sa peaks_over_threshold_impl
0215
0216         @param quantile_probability
0217         @param pot_threshold_probability
0218     */
0219     template<typename Sample, typename LeftRight>
0220     struct peaks_over_threshold_prob_impl
0221       : accumulator_base
0222     {
0223         typedef typename numeric::functional::fdiv<Sample, std::size_t>::result_type float_type;
0224         // for boost::result_of
0225         typedef boost::tuple<float_type, float_type, float_type> result_type;
0226         // for left tail fitting, mirror the extreme values
0227         typedef mpl::int_<is_same<LeftRight, left>::value ? -1 : 1> sign;
0228
0229         template<typename Args>
0230         peaks_over_threshold_prob_impl(Args const &args)
0231           : mu_(sign::value * numeric::fdiv(args[sample | Sample()], (std::size_t)1))
0232           , sigma2_(numeric::fdiv(args[sample | Sample()], (std::size_t)1))
0233           , threshold_probability_(args[pot_threshold_probability])
0234           , fit_parameters_(boost::make_tuple(0., 0., 0.))
0235           , is_dirty_(true)
0236         {
0237         }
0238
0239         void operator ()(dont_care)
0240         {
0241             this->is_dirty_ = true;
0242         }
0243
0244         template<typename Args>
0245         result_type result(Args const &args) const
0246         {
0247             if (this->is_dirty_)
0248             {
0249                 this->is_dirty_ = false;
0250
0251                 std::size_t cnt = count(args);
0252
0253                 // the n'th cached sample provides an approximate threshold value u
0254                 std::size_t n = static_cast<std::size_t>(
0255                     std::ceil(
0256                         cnt * ( ( is_same<LeftRight, left>::value ) ? this->threshold_probability_ : 1. - this->threshold_probability_ )
0257                     )
0258                 );
0259
0260                 // If n is in a valid range, return result, otherwise return NaN or throw exception
0261                 if ( n >= static_cast<std::size_t>(tail(args).size()))
0262                 {
0263                     if (std::numeric_limits<float_type>::has_quiet_NaN)
0264                     {
0265                         return boost::make_tuple(
0266                             std::numeric_limits<float_type>::quiet_NaN()
0267                           , std::numeric_limits<float_type>::quiet_NaN()
0268                           , std::numeric_limits<float_type>::quiet_NaN()
0269                         );
0270                     }
0271                     else
0272                     {
0273                         std::ostringstream msg;
0274                         msg << "index n = " << n << " is not in valid range [0, " << tail(args).size() << ")";
0275                         boost::throw_exception(std::runtime_error(msg.str()));
0276                         return boost::make_tuple(Sample(0), Sample(0), Sample(0));
0277                     }
0278                 }
0279                 else
0280                 {
0281                     float_type u = *(tail(args).begin() + n - 1) * sign::value;
0282
0283                     // compute mean and variance of samples above/under threshold value u
0284                     for (std::size_t i = 0; i < n; ++i)
0285                     {
0286                         mu_ += *(tail(args).begin() + i);
0287                         sigma2_ += *(tail(args).begin() + i) * (*(tail(args).begin() + i));
0288                     }
0289
0290                     this->mu_ = sign::value * numeric::fdiv(this->mu_, n);
0291                     this->sigma2_ = numeric::fdiv(this->sigma2_, n);
0292                     this->sigma2_ -= this->mu_ * this->mu_;
0293
0294                     if (is_same<LeftRight, left>::value)
0295                         this->threshold_probability_ = 1. - this->threshold_probability_;
0296
0297                     float_type tmp = numeric::fdiv(( this->mu_ - u )*( this->mu_ - u ), this->sigma2_);
0298                     float_type xi_hat = 0.5 * ( 1. - tmp );
0299                     float_type beta_hat = 0.5 * ( this->mu_ - u ) * ( 1. + tmp );
0300                     float_type beta_bar = beta_hat * std::pow(1. - threshold_probability_, xi_hat);
0301                     float_type u_bar = u - beta_bar * ( std::pow(1. - threshold_probability_, -xi_hat) - 1.)/xi_hat;
0302                     this->fit_parameters_ = boost::make_tuple(u_bar, beta_bar, xi_hat);
0303                 }
0304             }
0305
0306             return this->fit_parameters_;
0307         }
0308
0309         // make this accumulator serializeable
0310         // TODO: do we need to split to load/save and verify that threshold did not change?
0311         template<class Archive>
0312         void serialize(Archive & ar, const unsigned int file_version)
0313         {
0314             ar & mu_;
0315             ar & sigma2_;
0316             ar & threshold_probability_;
0317             ar & get<0>(fit_parameters_);
0318             ar & get<1>(fit_parameters_);
0319             ar & get<2>(fit_parameters_);
0320             ar & is_dirty_;
0321         }
0322
0323     private:
0324         mutable float_type mu_;                     // mean of samples above threshold u
0325         mutable float_type sigma2_;                 // variance of samples above threshold u
0326         mutable float_type threshold_probability_;
0327         mutable result_type fit_parameters_;        // boost::tuple that stores fit parameters
0328         mutable bool is_dirty_;
0329     };
0330
0331 } // namespace impl
0332
0333 ///////////////////////////////////////////////////////////////////////////////
0334 // tag::peaks_over_threshold
0335 //
0336 namespace tag
0337 {
0338     template<typename LeftRight>
0339     struct peaks_over_threshold
0340       : depends_on<count>
0341       , pot_threshold_value
0342     {
0343         /// INTERNAL ONLY
0344         ///
0345         typedef accumulators::impl::peaks_over_threshold_impl<mpl::_1, LeftRight> impl;
0346     };
0347
0348     template<typename LeftRight>
0349     struct peaks_over_threshold_prob
0350       : depends_on<count, tail<LeftRight> >
0351       , pot_threshold_probability
0352     {
0353         /// INTERNAL ONLY
0354         ///
0355         typedef accumulators::impl::peaks_over_threshold_prob_impl<mpl::_1, LeftRight> impl;
0356     };
0357
0358     struct abstract_peaks_over_threshold
0359       : depends_on<>
0360     {
0361     };
0362 }
0363
0364 ///////////////////////////////////////////////////////////////////////////////
0365 // extract::peaks_over_threshold
0366 //
0367 namespace extract
0368 {
0369     extractor<tag::abstract_peaks_over_threshold> const peaks_over_threshold = {};
0370
0371     BOOST_ACCUMULATORS_IGNORE_GLOBAL(peaks_over_threshold)
0372 }
0373
0374 using extract::peaks_over_threshold;
0375
0376 // peaks_over_threshold<LeftRight>(with_threshold_value) -> peaks_over_threshold<LeftRight>
0377 template<typename LeftRight>
0378 struct as_feature<tag::peaks_over_threshold<LeftRight>(with_threshold_value)>
0379 {
0380     typedef tag::peaks_over_threshold<LeftRight> type;
0381 };
0382
0383 // peaks_over_threshold<LeftRight>(with_threshold_probability) -> peaks_over_threshold_prob<LeftRight>
0384 template<typename LeftRight>
0385 struct as_feature<tag::peaks_over_threshold<LeftRight>(with_threshold_probability)>
0386 {
0387     typedef tag::peaks_over_threshold_prob<LeftRight> type;
0388 };
0389
0390 template<typename LeftRight>
0391 struct feature_of<tag::peaks_over_threshold<LeftRight> >
0392   : feature_of<tag::abstract_peaks_over_threshold>
0393 {
0394 };
0395
0396 template<typename LeftRight>
0397 struct feature_of<tag::peaks_over_threshold_prob<LeftRight> >
0398   : feature_of<tag::abstract_peaks_over_threshold>
0399 {
0400 };
0401
0402 // So that peaks_over_threshold can be automatically substituted
0403 // with weighted_peaks_over_threshold when the weight parameter is non-void.
0404 template<typename LeftRight>
0405 struct as_weighted_feature<tag::peaks_over_threshold<LeftRight> >
0406 {
0407     typedef tag::weighted_peaks_over_threshold<LeftRight> type;
0408 };
0409
0410 template<typename LeftRight>
0411 struct feature_of<tag::weighted_peaks_over_threshold<LeftRight> >
0412   : feature_of<tag::peaks_over_threshold<LeftRight> >
0413 {};
0414
0415 // So that peaks_over_threshold_prob can be automatically substituted
0416 // with weighted_peaks_over_threshold_prob when the weight parameter is non-void.
0417 template<typename LeftRight>
0418 struct as_weighted_feature<tag::peaks_over_threshold_prob<LeftRight> >
0419 {
0420     typedef tag::weighted_peaks_over_threshold_prob<LeftRight> type;
0421 };
0422
0423 template<typename LeftRight>
0424 struct feature_of<tag::weighted_peaks_over_threshold_prob<LeftRight> >
0425   : feature_of<tag::peaks_over_threshold_prob<LeftRight> >
0426 {};
0427
0428 }} // namespace boost::accumulators
0429
0430 #ifdef _MSC_VER
0431 # pragma warning(pop)
0432 #endif
0433
0434 #endif