math/tools/univariate_statistics.hpp

0001 //  (C) Copyright Nick Thompson 2018.
0002 //  Use, modification and distribution are subject to the
0003 //  Boost Software License, Version 1.0. (See accompanying file
0004 //  LICENSE_1_0.txt or copy at http://www.boost.org/LICENSE_1_0.txt)
0005
0006 #ifndef BOOST_MATH_TOOLS_UNIVARIATE_STATISTICS_HPP
0007 #define BOOST_MATH_TOOLS_UNIVARIATE_STATISTICS_HPP
0008
0009 #include <algorithm>
0010 #include <iterator>
0011 #include <tuple>
0012 #include <boost/math/tools/assert.hpp>
0013 #include <boost/math/tools/header_deprecated.hpp>
0014
0015 #include <boost/math/tools/is_standalone.hpp>
0016 #ifndef BOOST_MATH_STANDALONE
0017 #include <boost/config.hpp>
0018 #ifdef BOOST_MATH_NO_CXX17_IF_CONSTEXPR
0019 #error "The header <boost/math/norms.hpp> can only be used in C++17 and later."
0020 #endif
0021 #endif
0022
0023 BOOST_MATH_HEADER_DEPRECATED("<boost/math/statistics/univariate_statistics.hpp>");
0024
0025 namespace boost::math::tools {
0026
0027 template<class ForwardIterator>
0028 auto mean(ForwardIterator first, ForwardIterator last)
0029 {
0030     using Real = typename std::iterator_traits<ForwardIterator>::value_type;
0031     BOOST_MATH_ASSERT_MSG(first != last, "At least one sample is required to compute the mean.");
0032     if constexpr (std::is_integral<Real>::value)
0033     {
0034         double mu = 0;
0035         double i = 1;
0036         for(auto it = first; it != last; ++it) {
0037             mu = mu + (*it - mu)/i;
0038             i += 1;
0039         }
0040         return mu;
0041     }
0042     else if constexpr (std::is_same_v<typename std::iterator_traits<ForwardIterator>::iterator_category, std::random_access_iterator_tag>)
0043     {
0044         size_t elements = std::distance(first, last);
0045         Real mu0 = 0;
0046         Real mu1 = 0;
0047         Real mu2 = 0;
0048         Real mu3 = 0;
0049         Real i = 1;
0050         auto end = last - (elements % 4);
0051         for(auto it = first; it != end;  it += 4) {
0052             Real inv = Real(1)/i;
0053             Real tmp0 = (*it - mu0);
0054             Real tmp1 = (*(it+1) - mu1);
0055             Real tmp2 = (*(it+2) - mu2);
0056             Real tmp3 = (*(it+3) - mu3);
0057             // please generate a vectorized fma here
0058             mu0 += tmp0*inv;
0059             mu1 += tmp1*inv;
0060             mu2 += tmp2*inv;
0061             mu3 += tmp3*inv;
0062             i += 1;
0063         }
0064         Real num1 = Real(elements  - (elements %4))/Real(4);
0065         Real num2 = num1 + Real(elements % 4);
0066
0067         for (auto it = end; it != last; ++it)
0068         {
0069             mu3 += (*it-mu3)/i;
0070             i += 1;
0071         }
0072
0073         return (num1*(mu0+mu1+mu2) + num2*mu3)/Real(elements);
0074     }
0075     else
0076     {
0077         auto it = first;
0078         Real mu = *it;
0079         Real i = 2;
0080         while(++it != last)
0081         {
0082             mu += (*it - mu)/i;
0083             i += 1;
0084         }
0085         return mu;
0086     }
0087 }
0088
0089 template<class Container>
0090 inline auto mean(Container const & v)
0091 {
0092     return mean(v.cbegin(), v.cend());
0093 }
0094
0095 template<class ForwardIterator>
0096 auto variance(ForwardIterator first, ForwardIterator last)
0097 {
0098     using Real = typename std::iterator_traits<ForwardIterator>::value_type;
0099     BOOST_MATH_ASSERT_MSG(first != last, "At least one sample is required to compute mean and variance.");
0100     // Higham, Accuracy and Stability, equation 1.6a and 1.6b:
0101     if constexpr (std::is_integral<Real>::value)
0102     {
0103         double M = *first;
0104         double Q = 0;
0105         double k = 2;
0106         for (auto it = std::next(first); it != last; ++it)
0107         {
0108             double tmp = *it - M;
0109             Q = Q + ((k-1)*tmp*tmp)/k;
0110             M = M + tmp/k;
0111             k += 1;
0112         }
0113         return Q/(k-1);
0114     }
0115     else
0116     {
0117         Real M = *first;
0118         Real Q = 0;
0119         Real k = 2;
0120         for (auto it = std::next(first); it != last; ++it)
0121         {
0122             Real tmp = (*it - M)/k;
0123             Q += k*(k-1)*tmp*tmp;
0124             M += tmp;
0125             k += 1;
0126         }
0127         return Q/(k-1);
0128     }
0129 }
0130
0131 template<class Container>
0132 inline auto variance(Container const & v)
0133 {
0134     return variance(v.cbegin(), v.cend());
0135 }
0136
0137 template<class ForwardIterator>
0138 auto sample_variance(ForwardIterator first, ForwardIterator last)
0139 {
0140     size_t n = std::distance(first, last);
0141     BOOST_MATH_ASSERT_MSG(n > 1, "At least two samples are required to compute the sample variance.");
0142     return n*variance(first, last)/(n-1);
0143 }
0144
0145 template<class Container>
0146 inline auto sample_variance(Container const & v)
0147 {
0148     return sample_variance(v.cbegin(), v.cend());
0149 }
0150
0151
0152 // Follows equation 1.5 of:
0153 // https://prod.sandia.gov/techlib-noauth/access-control.cgi/2008/086212.pdf
0154 template<class ForwardIterator>
0155 auto skewness(ForwardIterator first, ForwardIterator last)
0156 {
0157     using Real = typename std::iterator_traits<ForwardIterator>::value_type;
0158     BOOST_MATH_ASSERT_MSG(first != last, "At least one sample is required to compute skewness.");
0159     if constexpr (std::is_integral<Real>::value)
0160     {
0161         double M1 = *first;
0162         double M2 = 0;
0163         double M3 = 0;
0164         double n = 2;
0165         for (auto it = std::next(first); it != last; ++it)
0166         {
0167             double delta21 = *it - M1;
0168             double tmp = delta21/n;
0169             M3 = M3 + tmp*((n-1)*(n-2)*delta21*tmp - 3*M2);
0170             M2 = M2 + tmp*(n-1)*delta21;
0171             M1 = M1 + tmp;
0172             n += 1;
0173         }
0174
0175         double var = M2/(n-1);
0176         if (var == 0)
0177         {
0178             // The limit is technically undefined, but the interpretation here is clear:
0179             // A constant dataset has no skewness.
0180             return static_cast<double>(0);
0181         }
0182         double skew = M3/(M2*sqrt(var));
0183         return skew;
0184     }
0185     else
0186     {
0187         Real M1 = *first;
0188         Real M2 = 0;
0189         Real M3 = 0;
0190         Real n = 2;
0191         for (auto it = std::next(first); it != last; ++it)
0192         {
0193             Real delta21 = *it - M1;
0194             Real tmp = delta21/n;
0195             M3 += tmp*((n-1)*(n-2)*delta21*tmp - 3*M2);
0196             M2 += tmp*(n-1)*delta21;
0197             M1 += tmp;
0198             n += 1;
0199         }
0200
0201         Real var = M2/(n-1);
0202         if (var == 0)
0203         {
0204             // The limit is technically undefined, but the interpretation here is clear:
0205             // A constant dataset has no skewness.
0206             return Real(0);
0207         }
0208         Real skew = M3/(M2*sqrt(var));
0209         return skew;
0210     }
0211 }
0212
0213 template<class Container>
0214 inline auto skewness(Container const & v)
0215 {
0216     return skewness(v.cbegin(), v.cend());
0217 }
0218
0219 // Follows equation 1.5/1.6 of:
0220 // https://prod.sandia.gov/techlib-noauth/access-control.cgi/2008/086212.pdf
0221 template<class ForwardIterator>
0222 auto first_four_moments(ForwardIterator first, ForwardIterator last)
0223 {
0224     using Real = typename std::iterator_traits<ForwardIterator>::value_type;
0225     BOOST_MATH_ASSERT_MSG(first != last, "At least one sample is required to compute the first four moments.");
0226     if constexpr (std::is_integral<Real>::value)
0227     {
0228         double M1 = *first;
0229         double M2 = 0;
0230         double M3 = 0;
0231         double M4 = 0;
0232         double n = 2;
0233         for (auto it = std::next(first); it != last; ++it)
0234         {
0235             double delta21 = *it - M1;
0236             double tmp = delta21/n;
0237             M4 = M4 + tmp*(tmp*tmp*delta21*((n-1)*(n*n-3*n+3)) + 6*tmp*M2 - 4*M3);
0238             M3 = M3 + tmp*((n-1)*(n-2)*delta21*tmp - 3*M2);
0239             M2 = M2 + tmp*(n-1)*delta21;
0240             M1 = M1 + tmp;
0241             n += 1;
0242         }
0243
0244         return std::make_tuple(M1, M2/(n-1), M3/(n-1), M4/(n-1));
0245     }
0246     else
0247     {
0248         Real M1 = *first;
0249         Real M2 = 0;
0250         Real M3 = 0;
0251         Real M4 = 0;
0252         Real n = 2;
0253         for (auto it = std::next(first); it != last; ++it)
0254         {
0255             Real delta21 = *it - M1;
0256             Real tmp = delta21/n;
0257             M4 = M4 + tmp*(tmp*tmp*delta21*((n-1)*(n*n-3*n+3)) + 6*tmp*M2 - 4*M3);
0258             M3 = M3 + tmp*((n-1)*(n-2)*delta21*tmp - 3*M2);
0259             M2 = M2 + tmp*(n-1)*delta21;
0260             M1 = M1 + tmp;
0261             n += 1;
0262         }
0263
0264         return std::make_tuple(M1, M2/(n-1), M3/(n-1), M4/(n-1));
0265     }
0266 }
0267
0268 template<class Container>
0269 inline auto first_four_moments(Container const & v)
0270 {
0271     return first_four_moments(v.cbegin(), v.cend());
0272 }
0273
0274
0275 // Follows equation 1.6 of:
0276 // https://prod.sandia.gov/techlib-noauth/access-control.cgi/2008/086212.pdf
0277 template<class ForwardIterator>
0278 auto kurtosis(ForwardIterator first, ForwardIterator last)
0279 {
0280     auto [M1, M2, M3, M4] = first_four_moments(first, last);
0281     if (M2 == 0)
0282     {
0283         return M2;
0284     }
0285     return M4/(M2*M2);
0286 }
0287
0288 template<class Container>
0289 inline auto kurtosis(Container const & v)
0290 {
0291     return kurtosis(v.cbegin(), v.cend());
0292 }
0293
0294 template<class ForwardIterator>
0295 auto excess_kurtosis(ForwardIterator first, ForwardIterator last)
0296 {
0297     return kurtosis(first, last) - 3;
0298 }
0299
0300 template<class Container>
0301 inline auto excess_kurtosis(Container const & v)
0302 {
0303     return excess_kurtosis(v.cbegin(), v.cend());
0304 }
0305
0306
0307 template<class RandomAccessIterator>
0308 auto median(RandomAccessIterator first, RandomAccessIterator last)
0309 {
0310     size_t num_elems = std::distance(first, last);
0311     BOOST_MATH_ASSERT_MSG(num_elems > 0, "The median of a zero length vector is undefined.");
0312     if (num_elems & 1)
0313     {
0314         auto middle = first + (num_elems - 1)/2;
0315         std::nth_element(first, middle, last);
0316         return *middle;
0317     }
0318     else
0319     {
0320         auto middle = first + num_elems/2 - 1;
0321         std::nth_element(first, middle, last);
0322         std::nth_element(middle, middle+1, last);
0323         return (*middle + *(middle+1))/2;
0324     }
0325 }
0326
0327
0328 template<class RandomAccessContainer>
0329 inline auto median(RandomAccessContainer & v)
0330 {
0331     return median(v.begin(), v.end());
0332 }
0333
0334 template<class RandomAccessIterator>
0335 auto gini_coefficient(RandomAccessIterator first, RandomAccessIterator last)
0336 {
0337     using Real = typename std::iterator_traits<RandomAccessIterator>::value_type;
0338     BOOST_MATH_ASSERT_MSG(first != last && std::next(first) != last, "Computation of the Gini coefficient requires at least two samples.");
0339
0340     std::sort(first, last);
0341     if constexpr (std::is_integral<Real>::value)
0342     {
0343         double i = 1;
0344         double num = 0;
0345         double denom = 0;
0346         for (auto it = first; it != last; ++it)
0347         {
0348             num += *it*i;
0349             denom += *it;
0350             ++i;
0351         }
0352
0353         // If the l1 norm is zero, all elements are zero, so every element is the same.
0354         if (denom == 0)
0355         {
0356             return static_cast<double>(0);
0357         }
0358
0359         return ((2*num)/denom - i)/(i-1);
0360     }
0361     else
0362     {
0363         Real i = 1;
0364         Real num = 0;
0365         Real denom = 0;
0366         for (auto it = first; it != last; ++it)
0367         {
0368             num += *it*i;
0369             denom += *it;
0370             ++i;
0371         }
0372
0373         // If the l1 norm is zero, all elements are zero, so every element is the same.
0374         if (denom == 0)
0375         {
0376             return Real(0);
0377         }
0378
0379         return ((2*num)/denom - i)/(i-1);
0380     }
0381 }
0382
0383 template<class RandomAccessContainer>
0384 inline auto gini_coefficient(RandomAccessContainer & v)
0385 {
0386     return gini_coefficient(v.begin(), v.end());
0387 }
0388
0389 template<class RandomAccessIterator>
0390 inline auto sample_gini_coefficient(RandomAccessIterator first, RandomAccessIterator last)
0391 {
0392     size_t n = std::distance(first, last);
0393     return n*gini_coefficient(first, last)/(n-1);
0394 }
0395
0396 template<class RandomAccessContainer>
0397 inline auto sample_gini_coefficient(RandomAccessContainer & v)
0398 {
0399     return sample_gini_coefficient(v.begin(), v.end());
0400 }
0401
0402 template<class RandomAccessIterator>
0403 auto median_absolute_deviation(RandomAccessIterator first, RandomAccessIterator last, typename std::iterator_traits<RandomAccessIterator>::value_type center=std::numeric_limits<typename std::iterator_traits<RandomAccessIterator>::value_type>::quiet_NaN())
0404 {
0405     using std::abs;
0406     using Real = typename std::iterator_traits<RandomAccessIterator>::value_type;
0407     using std::isnan;
0408     if (isnan(center))
0409     {
0410         center = boost::math::tools::median(first, last);
0411     }
0412     size_t num_elems = std::distance(first, last);
0413     BOOST_MATH_ASSERT_MSG(num_elems > 0, "The median of a zero-length vector is undefined.");
0414     auto comparator = [&center](Real a, Real b) { return abs(a-center) < abs(b-center);};
0415     if (num_elems & 1)
0416     {
0417         auto middle = first + (num_elems - 1)/2;
0418         std::nth_element(first, middle, last, comparator);
0419         return abs(*middle);
0420     }
0421     else
0422     {
0423         auto middle = first + num_elems/2 - 1;
0424         std::nth_element(first, middle, last, comparator);
0425         std::nth_element(middle, middle+1, last, comparator);
0426         return (abs(*middle) + abs(*(middle+1)))/abs(static_cast<Real>(2));
0427     }
0428 }
0429
0430 template<class RandomAccessContainer>
0431 inline auto median_absolute_deviation(RandomAccessContainer & v, typename RandomAccessContainer::value_type center=std::numeric_limits<typename RandomAccessContainer::value_type>::quiet_NaN())
0432 {
0433     return median_absolute_deviation(v.begin(), v.end(), center);
0434 }
0435
0436 }
0437 #endif