// Licensed to the Apache Software Foundation (ASF) under one
// or more contributor license agreements.  See the NOTICE file
// distributed with this work for additional information
// regarding copyright ownership.  The ASF licenses this file
// to you under the Apache License, Version 2.0 (the
// "License"); you may not use this file except in compliance
// with the License.  You may obtain a copy of the License at
//
//   http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing,
// software distributed under the License is distributed on an
// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
// KIND, either express or implied.  See the License for the
// specific language governing permissions and limitations
// under the License.

#include <algorithm>
#include <cstdint>
#include <string>

#include "benchmark/benchmark.h"

#include "arrow/memory_pool.h"
#include "arrow/type_fwd.h"
#include "arrow/util/cpu_info.h"
#include "arrow/util/logging.h"  // IWYU pragma: keep

namespace arrow {

// Google Benchmark changed the parameter type of Benchmark::Args() between
// releases from int to int64_t. As it doesn't provide version macros, we
// detect the actual type with a bit of C++ template magic.

template <typename Func>
struct BenchmarkArgsType;

// Pattern matching that extracts the vector element type of Benchmark::Args()
template <typename Values>
struct BenchmarkArgsType<benchmark::internal::Benchmark* (
    benchmark::internal::Benchmark::*)(const std::vector<Values>&)> {
  using type = Values;
};

using ArgsType =
    typename BenchmarkArgsType<decltype(&benchmark::internal::Benchmark::Args)>::type;

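// For illustration: with Google Benchmark releases where Args() takes a
// const std::vector<int64_t>&, ArgsType resolves to int64_t; with older
// releases taking const std::vector<int>&, it resolves to int. A sketch of
// how this could be checked at compile time (requires <type_traits>):
//
//   static_assert(std::is_same<ArgsType, int64_t>::value ||
//                     std::is_same<ArgsType, int>::value,
//                 "unexpected Benchmark::Args() element type");
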
using internal::CpuInfo;

static const CpuInfo* cpu_info = CpuInfo::GetInstance();

static const int64_t kL1Size = cpu_info->CacheSize(CpuInfo::CacheLevel::L1);
static const int64_t kL2Size = cpu_info->CacheSize(CpuInfo::CacheLevel::L2);
static const int64_t kL3Size = cpu_info->CacheSize(CpuInfo::CacheLevel::L3);
static const int64_t kCantFitInL3Size = kL3Size * 4;
static const std::vector<int64_t> kMemorySizes = {kL1Size, kL2Size, kL3Size,
                                                  kCantFitInL3Size};
// 0 is treated as "no nulls"
static const std::vector<ArgsType> kInverseNullProportions = {10000, 100, 10, 2, 1, 0};
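// Worked example of the mapping applied below (null_proportion is
// min(1.0, 1.0 / value), with 0 meaning no nulls):
//   10000 -> 0.01% nulls, 100 -> 1%, 10 -> 10%, 2 -> 50%, 1 -> 100%, 0 -> 0%.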

struct GenericItemsArgs {
  // number of items processed per iteration
  const int64_t size;

  // proportion of nulls in generated arrays
  double null_proportion;

  explicit GenericItemsArgs(benchmark::State& state)
      : size(state.range(0)), state_(state) {
    if (state.range(1) == 0) {
      this->null_proportion = 0.0;
    } else {
      this->null_proportion = std::min(1., 1. / static_cast<double>(state.range(1)));
    }
  }

  ~GenericItemsArgs() {
    state_.counters["size"] = static_cast<double>(size);
    state_.counters["null_percent"] = null_proportion * 100;
    state_.SetItemsProcessed(state_.iterations() * size);
  }

 private:
  benchmark::State& state_;
};
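
// A usage sketch (the benchmark name, array generation and kernel call below
// are hypothetical, not part of this header):
//
//   static void BM_SomeKernel(benchmark::State& state) {
//     GenericItemsArgs args(state);
//     // Generate an array of args.size items with args.null_proportion nulls,
//     // then time the kernel on it; items/s is reported on destruction.
//     for (auto _ : state) {
//       // RunKernel(array);
//     }
//   }
//   BENCHMARK(BM_SomeKernel)->Apply(BenchmarkSetArgs);  // declared below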

void BenchmarkSetArgsWithSizes(benchmark::internal::Benchmark* bench,
                               const std::vector<int64_t>& sizes = kMemorySizes) {
  bench->Unit(benchmark::kMicrosecond);

  for (const auto size : sizes) {
    for (const auto inverse_null_proportion : kInverseNullProportions) {
      bench->Args({static_cast<ArgsType>(size), inverse_null_proportion});
    }
  }
}

void BenchmarkSetArgs(benchmark::internal::Benchmark* bench) {
  BenchmarkSetArgsWithSizes(bench, kMemorySizes);
}

void RegressionSetArgs(benchmark::internal::Benchmark* bench) {
  // Regression benchmarks do not need to account for the cache hierarchy,
  // thus we optimize for the best case.
  BenchmarkSetArgsWithSizes(bench, {kL1Size});
}

// RAII struct to handle some of the boilerplate in regression benchmarks
struct RegressionArgs {
  // size of memory tested per iteration, in bytes (or number of items,
  // see size_is_bytes)
  int64_t size;

  // proportion of nulls in generated arrays
  double null_proportion;

  // If size_is_bytes is true, `size` is interpreted as a number of bytes,
  // otherwise as the number of items processed (for reporting)
  explicit RegressionArgs(benchmark::State& state, bool size_is_bytes = true)
      : size(state.range(0)), state_(state), size_is_bytes_(size_is_bytes) {
    if (state.range(1) == 0) {
      this->null_proportion = 0.0;
    } else {
      this->null_proportion = std::min(1., 1. / static_cast<double>(state.range(1)));
    }
  }

  ~RegressionArgs() {
    state_.counters["size"] = static_cast<double>(size);
    state_.counters["null_percent"] = null_proportion * 100;
    if (size_is_bytes_) {
      state_.SetBytesProcessed(state_.iterations() * size);
    } else {
      state_.SetItemsProcessed(state_.iterations() * size);
    }
  }

 private:
  benchmark::State& state_;
  bool size_is_bytes_;
};
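
// A usage sketch (the benchmark name and buffer processing below are
// hypothetical, not part of this header):
//
//   static void BM_SomeByteKernel(benchmark::State& state) {
//     RegressionArgs args(state);  // size_is_bytes defaults to true
//     // Allocate a buffer of args.size bytes with args.null_proportion nulls,
//     // then time the kernel; bytes/s is reported on destruction.
//     for (auto _ : state) {
//       // RunKernel(buffer);
//     }
//   }
//   BENCHMARK(BM_SomeByteKernel)->Apply(RegressionSetArgs);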

class MemoryPoolMemoryManager : public benchmark::MemoryManager {
  void Start() override {
    memory_pool = std::make_shared<ProxyMemoryPool>(default_memory_pool());

    MemoryPool* default_pool = default_memory_pool();
    global_allocations_start = default_pool->num_allocations();
  }

// BENCHMARK_DONT_OPTIMIZE is used here to detect Google Benchmark
// 1.8.0. We can remove this Stop(Result*) when we require Google
// Benchmark 1.8.0 or later.
#ifndef BENCHMARK_DONT_OPTIMIZE
  void Stop(Result* result) override { Stop(*result); }
#endif

  void Stop(benchmark::MemoryManager::Result& result) override {
    // Compute how many allocations were made on the default pool during the
    // benchmark. If the proxy pool saw no allocations at all, we assume the
    // memory pool wasn't passed down to the benchmark and record nothing.
    MemoryPool* default_pool = default_memory_pool();
    int64_t new_default_allocations =
        default_pool->num_allocations() - global_allocations_start;

    // Only record metrics if (1) there were allocations and (2) we
    // recorded at least one.
    if (new_default_allocations > 0 && memory_pool->num_allocations() > 0) {
      if (new_default_allocations > memory_pool->num_allocations()) {
        // If we missed some, let's report that.
        int64_t missed_allocations =
            new_default_allocations - memory_pool->num_allocations();
        ARROW_LOG(WARNING) << "BenchmarkMemoryTracker recorded some allocations "
                           << "for a benchmark, but missed " << missed_allocations
                           << " allocations.\n";
      }

      result.max_bytes_used = memory_pool->max_memory();
      result.total_allocated_bytes = memory_pool->total_bytes_allocated();
      result.num_allocs = memory_pool->num_allocations();
    }
  }

 public:
  std::shared_ptr<::arrow::ProxyMemoryPool> memory_pool;

 protected:
  int64_t global_allocations_start;
};

/// \brief Track memory pool allocations in benchmarks.
///
/// Instantiate as a global variable to register the hooks into Google Benchmark
/// that collect memory metrics. Before each benchmark, a new ProxyMemoryPool is
/// created; it can then be accessed with memory_pool(). Once the benchmark is
/// complete, the hook will record the maximum memory used, the total bytes
/// allocated, and the total number of allocations. If no allocations were seen
/// (for example, if you forgot to pass down the memory pool), these metrics
/// will not be saved.
///
/// Since this is used as one global variable, this will not work if multiple
/// benchmarks are run concurrently or for multi-threaded benchmarks (ones
/// that use `->ThreadRange(...)`).
class BenchmarkMemoryTracker {
 public:
  BenchmarkMemoryTracker() : manager_() { ::benchmark::RegisterMemoryManager(&manager_); }
  ::arrow::MemoryPool* memory_pool() const { return manager_.memory_pool.get(); }

 protected:
  ::arrow::MemoryPoolMemoryManager manager_;
};
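
// A usage sketch (the benchmark name and kernel call below are hypothetical,
// not part of this header):
//
//   static BenchmarkMemoryTracker memory_tracker;
//
//   static void BM_TrackedKernel(benchmark::State& state) {
//     GenericItemsArgs args(state);
//     // Pass memory_tracker.memory_pool() to any allocating code so the
//     // proxy pool can observe the allocations.
//     for (auto _ : state) {
//       // RunKernel(memory_tracker.memory_pool(), ...);
//     }
//   }
//   BENCHMARK(BM_TrackedKernel)->Apply(BenchmarkSetArgs);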

}  // namespace arrow