UnitTests/Benchmarks/BenchmarkTools.cpp

0001 // This file is part of the ACTS project.
0002 //
0003 // Copyright (C) 2016 CERN for the benefit of the ACTS project
0004 //
0005 // This Source Code Form is subject to the terms of the Mozilla Public
0006 // License, v. 2.0. If a copy of the MPL was not distributed with this
0007 // file, You can obtain one at https://mozilla.org/MPL/2.0/.
0008
0009 #include <boost/test/unit_test.hpp>
0010
0011 #include "ActsTests/CommonHelpers/BenchmarkTools.hpp"
0012
0013 #include "ActsTests/CommonHelpers/FloatComparisons.hpp"
0014
0015 #include <cmath>
0016 #include <complex>
0017 #include <iostream>
0018 #include <sstream>
0019 #include <tuple>
0020
0021 namespace ActsTests {
0022
0023 // Basic non-timing tests do not validate the core performance aspects of the
0024 // benchmark tools, but have the advantage of being runnable on any system.
0025 BOOST_AUTO_TEST_SUITE(benchmark_tools)
0026
0027 BOOST_AUTO_TEST_CASE(assume_accessed) {
0028   int x = 42;
0029   assumeAccessed(x);
0030   BOOST_CHECK_EQUAL(x, 42);
0031 }
0032
0033 BOOST_AUTO_TEST_CASE(assume_read) {
0034   float x = 4.2f;
0035   assumeRead(x);
0036   BOOST_CHECK_EQUAL(x, 4.2f);
0037
0038   const std::string y = "LOL";
0039   assumeRead(x);
0040   BOOST_CHECK_EQUAL(y, "LOL");
0041
0042   assumeRead(std::make_tuple(1, false, 3.5));
0043 }
0044
0045 BOOST_AUTO_TEST_CASE(assume_written) {
0046   std::complex c(1.2, 3.4);
0047   assumeWritten(c);
0048   BOOST_CHECK_EQUAL(c, std::complex(1.2, 3.4));
0049 }
0050
0051 BOOST_AUTO_TEST_CASE(micro_benchmark_result) {
0052   MicroBenchmarkResult res;
0053   res.iters_per_run = 42;
0054   res.run_timings = {
0055       std::chrono::microseconds(420), std::chrono::microseconds(21),
0056       std::chrono::milliseconds(4),   std::chrono::microseconds(84),
0057       std::chrono::microseconds(294), std::chrono::microseconds(378),
0058       std::chrono::microseconds(126), std::chrono::milliseconds(42)};
0059
0060   CHECK_CLOSE_REL(res.totalTime().count() / 1'000'000., 47.323, 1e-6);
0061
0062   const auto sorted = res.sortedRunTimes();
0063   BOOST_CHECK_EQUAL(sorted.size(), res.run_timings.size());
0064   BOOST_CHECK_EQUAL(sorted[0].count(), 21'000.);
0065   BOOST_CHECK_EQUAL(sorted[1].count(), 84'000.);
0066   BOOST_CHECK_EQUAL(sorted[2].count(), 126'000.);
0067   BOOST_CHECK_EQUAL(sorted[3].count(), 294'000.);
0068   BOOST_CHECK_EQUAL(sorted[4].count(), 378'000.);
0069   BOOST_CHECK_EQUAL(sorted[5].count(), 420'000.);
0070   BOOST_CHECK_EQUAL(sorted[6].count(), 4'000'000.);
0071   BOOST_CHECK_EQUAL(sorted[7].count(), 42'000'000.);
0072
0073   CHECK_CLOSE_REL(res.runTimeMedian().count() / 1000., (294. + 378.) / 2.,
0074                   1e-6);
0075
0076   const auto [firstq, thirdq] = res.runTimeQuartiles();
0077   CHECK_CLOSE_REL(firstq.count() / 1000., (84. + 126.) / 2., 1e-6);
0078   CHECK_CLOSE_REL(thirdq.count() / 1000., (420. + 4000.) / 2., 1e-6);
0079
0080   const auto robustRTStddev = res.runTimeRobustStddev();
0081   CHECK_CLOSE_REL(robustRTStddev.count(), (thirdq - firstq).count() / 1.349,
0082                   1e-3);
0083
0084   const auto runTimeError = res.runTimeError();
0085   CHECK_CLOSE_REL(
0086       runTimeError.count(),
0087       1.2533 * robustRTStddev.count() / std::sqrt(res.run_timings.size()),
0088       1e-3);
0089
0090   CHECK_CLOSE_REL(res.iterTimeAverage().count(),
0091                   res.runTimeMedian().count() / res.iters_per_run, 1e-6);
0092
0093   CHECK_CLOSE_REL(res.iterTimeError().count(),
0094                   runTimeError.count() / std::sqrt(res.iters_per_run), 1e-6);
0095
0096   std::ostringstream os;
0097   os << res;
0098   BOOST_CHECK_EQUAL(os.str(),
0099                     "8 runs of 42 iteration(s), 47.3ms total, "
0100                     "336.0000+/-1355.2296µs per run, "
0101                     "8000.000+/-209116.462ns per iteration");
0102 }
0103
0104 BOOST_AUTO_TEST_CASE(micro_benchmark) {
0105   int counter = 0;
0106   microBenchmark([&] { ++counter; }, 15, 7, std::chrono::milliseconds(0));
0107   BOOST_CHECK_EQUAL(counter, 15 * 7);
0108
0109   counter = 0;
0110   microBenchmark(
0111       [&] {
0112         ++counter;
0113         return counter;
0114       },
0115       17, 11, std::chrono::milliseconds(0));
0116   BOOST_CHECK_EQUAL(counter, 17 * 11);
0117
0118   counter = 0;
0119   int previous = 64;
0120   std::vector<int> ints{1, 2, 4, 8, 16, 32, 64};
0121   microBenchmark(
0122       [&](int input) {
0123         if (input == 1) {
0124           BOOST_CHECK_EQUAL(previous, 64);
0125           counter = 1;
0126         } else {
0127           BOOST_CHECK_EQUAL(input, previous * 2);
0128           counter += input;
0129         }
0130         previous = input;
0131       },
0132       ints, 123, std::chrono::milliseconds(3));
0133   BOOST_CHECK_EQUAL(counter, 127);
0134
0135   counter = 0;
0136   previous = -81;
0137   std::vector<char> chars{-1, 3, -9, 27, -81};
0138   microBenchmark(
0139       [&](int input) {
0140         if (input == -1) {
0141           BOOST_CHECK_EQUAL(previous, -81);
0142           counter = -1;
0143         } else {
0144           BOOST_CHECK_EQUAL(input, -previous * 3);
0145           counter += input;
0146         }
0147         previous = input;
0148         return &previous;
0149       },
0150       chars, 456, std::chrono::milliseconds(8));
0151   BOOST_CHECK_EQUAL(counter, -61);
0152 }
0153
0154 BOOST_AUTO_TEST_SUITE_END()
0155
0156 // Timing tests are perhaps the most important ones for validation of
0157 // benchmarking tools, but they cannot be run by default for two reasons:
0158 // - They take a while to run, and therefore slow down the testing cycle
0159 // - They require a quiet system to succeed, and will likely fail when invoked
0160 //   by a parallel run of CTest or when run on a continuous integration VM.
0161 //
0162 // If you can ensure both of these preconditions, you can run the test with
0163 // ./BenchmarkTools --run_test=benchmark_timings
0164 BOOST_AUTO_TEST_SUITE(benchmark_timings, *boost::unit_test::disabled())
0165
// Base iteration count from which the timing tests of this suite derive the
// iteration count they hand to microBenchmark.
constexpr std::size_t bench_iters{1'000};
0167
0168 BOOST_AUTO_TEST_CASE(micro_benchmark) {
0169   using namespace std::literals::chrono_literals;
0170
0171   // For simple microbenchmarking needs, plain use of microBenchmark is enough.
0172   //
0173   // For example, here, the microbenchmark loop isn't optimized out even though
0174   // each iteration does literally nothing. If it were optimized out, the time
0175   // per iteration would change, since we wouldn't get linear scaling anymore.
0176   auto nop = [] {};
0177   const auto nop_x10 = microBenchmark(nop, 10 * bench_iters);
0178   std::cout << "nop (10x iters): " << nop_x10 << std::endl;
0179   const auto nop_x100 = microBenchmark(nop, 100 * bench_iters);
0180   std::cout << "nop (100x iters): " << nop_x100 << std::endl;
0181   const double nop_x10_iter_ns = nop_x10.iterTimeAverage().count();
0182   const double nop_x100_iter_ns = nop_x100.iterTimeAverage().count();
0183   CHECK_CLOSE_REL(nop_x10_iter_ns, nop_x100_iter_ns, 0.1);
0184
0185 // These tests reason about the performance characteristics of _optimized_ code,
0186 // and should therefore be compiled out of debug/coverage builds.
0187 #ifdef __OPTIMIZE__
0188   // The microbenchmarking harness is super low overhead, less than 1
0189   // nanosecond per iteration on a modern CPU.
0190   BOOST_CHECK_LT(nop_x100_iter_ns, 1.0);
0191
0192   // With a well-chosen iteration count that keeps per-run times under the OS
0193   // scheduling quantum (typically 1ms), the noise is also super low.
0194   BOOST_CHECK_LT(nop_x100.iterTimeError().count(), 0.1);
0195
0196   // You can measure the overhead of any operation as long as it's not
0197   // _obnoxiously_ amenable to compiler const-propagation or dead code
0198   // elimination. For example, this sqrt throughput microbenchmark works,
0199   // because microBenchmark forces the compiler to assume that "x", "y" and "z"
0200   // are modified on every benchmark iteration...
0201   const double x = 1.2, y = 3.4, z = 5.6;
0202   auto sqrt = microBenchmark(
0203       [&] { return std::sqrt(x * y) + std::sqrt(y * z) + std::sqrt(z * x); },
0204       bench_iters);
0205   std::cout << "sqrt (correct): " << sqrt << std::endl;
0206   BOOST_CHECK_GT(sqrt.iterTimeAverage().count(), 10. * nop_x100_iter_ns);
0207
0208   // ...but this variant doesn't work, because the compiler can trivially
0209   // precompute the square root when optimizing the inner lambda...
0210   const auto sqrt_constprop = microBenchmark(
0211       [] {
0212         return std::sqrt(1.2 * 3.4) + std::sqrt(3.4 * 5.6) +
0213                std::sqrt(5.6 * 1.2);
0214       },
0215       bench_iters * 20);
0216   std::cout << "sqrt (constprop'd): " << sqrt_constprop << std::endl;
0217   BOOST_CHECK_LT(sqrt_constprop.iterTimeAverage().count(),
0218                  sqrt.iterTimeAverage().count() / 5.);
0219
0220   // ...and this one doesn't work either, because the compiler can trivially
0221   // infer that the result of the computation is unused and stop computing it.
0222   //
0223   // The lower tolerance of this test is needed because current GCC doesn't
0224   // optimize _everything_ out in its default configuration, as sqrt could still
0225   // have side-effects like setting the errno thread-local variable...
0226   const auto sqrt_deadcode = microBenchmark(
0227       [&] { (void)(std::sqrt(x * y) + std::sqrt(y * z) + std::sqrt(z * x)); },
0228       bench_iters * 10);
0229   std::cout << "sqrt (deadcode'd): " << sqrt_deadcode << std::endl;
0230   BOOST_CHECK_LT(sqrt_deadcode.iterTimeAverage().count(),
0231                  sqrt.iterTimeAverage().count() / 3.);
0232 #endif
0233 }
0234
0235 // These tests reason about the performance characteristics of _optimized_ code,
0236 // and should therefore be compiled out of debug/coverage builds.
0237 #ifdef __OPTIMIZE__
0238 BOOST_AUTO_TEST_CASE(assume_read) {
0239   // You can use assumeRead when you want the compiler to assume that the result
0240   // of some computation has been read and therefore the computation shouldn't
0241   // be optimized out. This is what microBenchmark implicitly does to the value
0242   // returned by the benchmark iteration function, if any.
0243   //
0244   // For example, these two computations are almost equivalent. Notice that
0245   // assumeRead can be used on temporaries.
0246   const double x = 1.2, y = 3.4, z = 5.6;
0247   const auto tuple_return = microBenchmark(
0248       [&] {
0249         return std::make_tuple(
0250             std::sqrt(x * y), std::complex(std::sqrt(y * z), std::sqrt(z * x)));
0251       },
0252       bench_iters);
0253   std::cout << "tuple return: " << tuple_return << std::endl;
0254   const auto assumeread = microBenchmark(
0255       [&] {
0256         assumeRead(std::sqrt(x * y));
0257         assumeRead(std::complex(std::sqrt(y * z), std::sqrt(z * x)));
0258       },
0259       bench_iters);
0260   std::cout << "assumeRead: " << assumeread << std::endl;
0261   const double tuple_return_iter_ns = tuple_return.iterTimeAverage().count();
0262   const double assumeRead_iter_ns = assumeread.iterTimeAverage().count();
0263   CHECK_CLOSE_REL(tuple_return_iter_ns, assumeRead_iter_ns, 1e-2);
0264 }
0265 #endif
0266
0267 BOOST_AUTO_TEST_CASE(assume_written) {
0268   // You can use assumeWritten when you want the compiler to assume that some
0269   // variables have been written to, and every dependent computation must
0270   // therefore be recomputed. This is what microBenchmark implicitly does to
0271   // every variable captured by the benchmark iteration lambda.
0272   //
0273   // Since assumeWritten operates on variables in memory, it cannot be used on
0274   // temporaries, but only on mutable variables.
0275   double x = 1.2, y = 3.4, z = 5.6;
0276   auto sqrt_sum = microBenchmark(
0277       [&] { return std::sqrt(x * y) + std::sqrt(y * z) + std::sqrt(z * x); },
0278       bench_iters);
0279   std::cout << "sqrt sum: " << sqrt_sum << std::endl;
0280   auto sqrt_2sums = microBenchmark(
0281       [&] {
0282         double tmp = std::sqrt(x * y) + std::sqrt(y * z) + std::sqrt(z * x);
0283         assumeWritten(x);
0284         assumeWritten(y);
0285         assumeWritten(z);
0286         return tmp + std::sqrt(x * y) + std::sqrt(y * z) + std::sqrt(z * x);
0287       },
0288       bench_iters);
0289   std::cout << "2x(sqrt sum): " << sqrt_2sums << std::endl;
0290   const double sqrt_sum_iter_ns = sqrt_sum.iterTimeAverage().count();
0291   const double sqrt_2sums_iter_ns = sqrt_2sums.iterTimeAverage().count();
0292   CHECK_CLOSE_REL(2. * sqrt_sum_iter_ns, sqrt_2sums_iter_ns, 1e-2);
0293 }
0294
0295 BOOST_AUTO_TEST_SUITE_END()
0296
0297 }  // namespace ActsTests