
EIC code displayed by LXR


// Licensed to the Apache Software Foundation (ASF) under one
// or more contributor license agreements.  See the NOTICE file
// distributed with this work for additional information
// regarding copyright ownership.  The ASF licenses this file
// to you under the Apache License, Version 2.0 (the
// "License"); you may not use this file except in compliance
// with the License.  You may obtain a copy of the License at
//
//   http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing,
// software distributed under the License is distributed on an
// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
// KIND, either express or implied.  See the License for the
// specific language governing permissions and limitations
// under the License.

#pragma once

#include <algorithm>  // for std::min / std::max used below
#include <atomic>
#include <cstdint>
#include <optional>
#include <thread>
#include <unordered_map>
#include <vector>

#include "arrow/compute/expression.h"
#include "arrow/compute/type_fwd.h"
#include "arrow/result.h"
#include "arrow/util/bit_util.h"  // for bit_util::BytesForBits / bit_util::CeilDiv
#include "arrow/util/cpu_info.h"
#include "arrow/util/simd.h"

#if defined(__clang__) || defined(__GNUC__)
#  define BYTESWAP(x) __builtin_bswap64(x)
#  define ROTL(x, n) (((x) << (n)) | ((x) >> ((-(n)) & 31)))
#  define ROTL64(x, n) (((x) << (n)) | ((x) >> ((-(n)) & 63)))
#elif defined(_MSC_VER)
#  include <intrin.h>
#  define BYTESWAP(x) _byteswap_uint64(x)
#  define ROTL(x, n) _rotl((x), (n))
#  define ROTL64(x, n) _rotl64((x), (n))
#endif
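
// For illustration (sketch added here, not part of the original header): ROTL
// rotates a 32-bit value left by n bits and ROTL64 does the same for 64-bit
// values, e.g.
//
//   uint32_t x = 0x80000001u;
//   uint32_t y = ROTL(x, 1);  // y == 0x00000003u
//   uint64_t z = ROTL64(UINT64_C(0x8000000000000001), 1);  // z == 0x3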

namespace arrow {
namespace util {

// Some platforms typedef int64_t as long int instead of long long int, which
// breaks the _mm256_i64gather_epi64 and _mm256_i32gather_epi64 intrinsics,
// which expect long long. Casting to the type below inside these intrinsics
// makes the code compile in all cases.
//
using int64_for_gather_t = const long long int;  // NOLINT runtime-int
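
// For illustration (hypothetical sketch, AVX2 only): the cast is used like
//
//   auto base = reinterpret_cast<arrow::util::int64_for_gather_t*>(values);
//   __m256i gathered = _mm256_i64gather_epi64(base, indices, 8);
//
// where `values` points to int64_t data and `indices` holds four 64-bit offsets.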

// All MiniBatch... classes use TempVectorStack for vector allocations and can
// only work with vectors up to 1024 elements.
//
// They should only be allocated on the stack to guarantee the right sequence
// of allocation and deallocation of vectors from TempVectorStack.
//
class MiniBatch {
 public:
  static constexpr int kLogMiniBatchLength = 10;
  static constexpr int kMiniBatchLength = 1 << kLogMiniBatchLength;
};
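
// For illustration (hypothetical sketch, not part of the original header):
// callers typically process long arrays in chunks of kMiniBatchLength rows so
// that temporary vectors never exceed 1024 elements, e.g.
//
//   for (int64_t start = 0; start < num_rows; start += MiniBatch::kMiniBatchLength) {
//     int64_t batch_size =
//         std::min<int64_t>(MiniBatch::kMiniBatchLength, num_rows - start);
//     // ... process rows [start, start + batch_size) ...
//   }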

namespace bit_util {

// Write into `indexes` the positions (relative to `bit_offset`) of the bits in
// `bits` that are equal to `bit_to_search`, and store the count in `num_indexes`.
ARROW_EXPORT void bits_to_indexes(int bit_to_search, int64_t hardware_flags,
                                  const int num_bits, const uint8_t* bits,
                                  int* num_indexes, uint16_t* indexes,
                                  int bit_offset = 0);
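
// For illustration (hypothetical sketch): collecting the positions of set bits
// in a validity bitmap for one mini batch.
//
//   uint16_t indexes[MiniBatch::kMiniBatchLength];
//   int num_indexes = 0;
//   bits_to_indexes(/*bit_to_search=*/1, hardware_flags, num_bits, validity_bits,
//                   &num_indexes, indexes);
//   // indexes[0..num_indexes) now holds the offsets of the set bits.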

ARROW_EXPORT void bits_filter_indexes(int bit_to_search, int64_t hardware_flags,
                                      const int num_bits, const uint8_t* bits,
                                      const uint16_t* input_indexes, int* num_indexes,
                                      uint16_t* indexes, int bit_offset = 0);

// Input and output indexes may point to the same data (in-place filtering).
ARROW_EXPORT void bits_split_indexes(int64_t hardware_flags, const int num_bits,
                                     const uint8_t* bits, int* num_indexes_bit0,
                                     uint16_t* indexes_bit0, uint16_t* indexes_bit1,
                                     int bit_offset = 0);

// Expand each bit into a byte: bit 1 becomes 0xFF, bit 0 becomes 0x00.
ARROW_EXPORT void bits_to_bytes(int64_t hardware_flags, const int num_bits,
                                const uint8_t* bits, uint8_t* bytes, int bit_offset = 0);

// Pack the highest bit of each byte into the output bitmap.
ARROW_EXPORT void bytes_to_bits(int64_t hardware_flags, const int num_bits,
                                const uint8_t* bytes, uint8_t* bits, int bit_offset = 0);
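
// For illustration (hypothetical sketch): bits_to_bytes and bytes_to_bits are
// inverses for 0x00/0xFF byte values, e.g.
//
//   uint8_t bits[1] = {0b00000101};  // bits 0 and 2 are set
//   uint8_t bytes[8];
//   bits_to_bytes(hardware_flags, /*num_bits=*/8, bits, bytes);
//   // bytes == {0xFF, 0x00, 0xFF, 0x00, 0x00, 0x00, 0x00, 0x00}
//   uint8_t packed[1] = {0};
//   bytes_to_bits(hardware_flags, /*num_bits=*/8, bytes, packed);
//   // packed[0] == 0b00000101 again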

ARROW_EXPORT bool are_all_bytes_zero(int64_t hardware_flags, const uint8_t* bytes,
                                     uint32_t num_bytes);

#if defined(ARROW_HAVE_RUNTIME_AVX2) && defined(ARROW_HAVE_RUNTIME_BMI2)
// The functions below use BMI2 instructions; verify CPU support before calling!

namespace avx2 {
ARROW_EXPORT void bits_filter_indexes_avx2(int bit_to_search, const int num_bits,
                                           const uint8_t* bits,
                                           const uint16_t* input_indexes,
                                           int* num_indexes, uint16_t* indexes);
ARROW_EXPORT void bits_to_indexes_avx2(int bit_to_search, const int num_bits,
                                       const uint8_t* bits, int* num_indexes,
                                       uint16_t* indexes, uint16_t base_index = 0);
ARROW_EXPORT void bits_to_bytes_avx2(const int num_bits, const uint8_t* bits,
                                     uint8_t* bytes);
ARROW_EXPORT void bytes_to_bits_avx2(const int num_bits, const uint8_t* bytes,
                                     uint8_t* bits);
ARROW_EXPORT bool are_all_bytes_zero_avx2(const uint8_t* bytes, uint32_t num_bytes);
}  // namespace avx2

#endif
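
// For illustration (hypothetical sketch): callers normally go through the
// dispatching wrappers above (bits_to_indexes, bits_to_bytes, ...), which select
// the AVX2/BMI2 path based on the hardware_flags argument, e.g.
//
//   int64_t flags = arrow::internal::CpuInfo::GetInstance()->hardware_flags();
//   bits_to_bytes(flags, num_bits, bits, bytes);
//
// (This assumes the CpuInfo helper from the included arrow/util/cpu_info.h.)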

}  // namespace bit_util
}  // namespace util

namespace compute {

/// Modify an Expression with pre-order and post-order visitation.
/// `pre` will be invoked on each Expression. `pre` will visit Calls before their
/// arguments, `post_call` will visit Calls (and no other Expressions) after their
/// arguments. Visitors should return an identical expression to indicate no change;
/// this prevents unnecessary reconstruction in the common case where a modification
/// is not possible or necessary. (A usage sketch follows the function body.)
///
/// If an argument was modified, `post_call` visits a reconstructed Call with the
/// modified arguments but also receives a pointer to the unmodified Expression as a
/// second argument. If no arguments were modified, the unmodified Expression* will be
/// nullptr.
template <typename PreVisit, typename PostVisitCall>
Result<Expression> ModifyExpression(Expression expr, const PreVisit& pre,
                                    const PostVisitCall& post_call) {
  ARROW_ASSIGN_OR_RAISE(expr, Result<Expression>(pre(std::move(expr))));

  auto call = expr.call();
  if (!call) return expr;

  bool at_least_one_modified = false;
  std::vector<Expression> modified_arguments;

  for (size_t i = 0; i < call->arguments.size(); ++i) {
    ARROW_ASSIGN_OR_RAISE(auto modified_argument,
                          ModifyExpression(call->arguments[i], pre, post_call));

    // Skip arguments that came back unchanged (as determined by Identical()).
    if (Identical(modified_argument, call->arguments[i])) {
      continue;
    }

    // Copy the argument list lazily, only once a modification is actually seen.
    if (!at_least_one_modified) {
      modified_arguments = call->arguments;
      at_least_one_modified = true;
    }

    modified_arguments[i] = std::move(modified_argument);
  }

  if (at_least_one_modified) {
    // Reconstruct the call expression with the modified arguments.
    auto modified_call = *call;
    modified_call.arguments = std::move(modified_arguments);
    return post_call(Expression(std::move(modified_call)), &expr);
  }

  return post_call(std::move(expr), NULLPTR);
}
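
// For illustration (hypothetical sketch): a no-op rewrite pass that visits every
// sub-expression. `pre` returns its input unchanged; `post_call` receives each
// Call after its arguments have been visited, plus a pointer to the original
// Call when any argument changed (nullptr otherwise).
//
//   ARROW_ASSIGN_OR_RAISE(
//       Expression rewritten,
//       ModifyExpression(
//           expr,
//           /*pre=*/[](Expression e) { return e; },
//           /*post_call=*/[](Expression e, const Expression* old) { return e; }));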

// Helper class to calculate the modified number of rows to process using SIMD.
//
// Some array elements at the end will be skipped in order to avoid buffer
// overrun when doing memory loads and stores with a word size larger than a
// single array element. (A worked example follows the class.)
//
class TailSkipForSIMD {
 public:
  static int64_t FixBitAccess(int num_bytes_accessed_together, int64_t num_rows,
                              int bit_offset) {
    int64_t num_bytes = bit_util::BytesForBits(num_rows + bit_offset);
    int64_t num_bytes_safe =
        std::max(static_cast<int64_t>(0LL), num_bytes - num_bytes_accessed_together + 1);
    int64_t num_rows_safe =
        std::max(static_cast<int64_t>(0LL), 8 * num_bytes_safe - bit_offset);
    return std::min(num_rows_safe, num_rows);
  }
  static int64_t FixBinaryAccess(int num_bytes_accessed_together, int64_t num_rows,
                                 int64_t length) {
    int64_t num_rows_to_skip = bit_util::CeilDiv(length, num_bytes_accessed_together);
    int64_t num_rows_safe =
        std::max(static_cast<int64_t>(0LL), num_rows - num_rows_to_skip);
    return num_rows_safe;
  }
  static int64_t FixVarBinaryAccess(int num_bytes_accessed_together, int64_t num_rows,
                                    const uint32_t* offsets) {
    // Do not process rows that could read past the end of the buffer using N
    // byte loads/stores.
    //
    int64_t num_rows_safe = num_rows;
    while (num_rows_safe > 0 &&
           offsets[num_rows_safe] + num_bytes_accessed_together > offsets[num_rows]) {
      --num_rows_safe;
    }
    return num_rows_safe;
  }
  static int FixSelection(int64_t num_rows_safe, int num_selected,
                          const uint16_t* selection) {
    int num_selected_safe = num_selected;
    while (num_selected_safe > 0 && selection[num_selected_safe - 1] >= num_rows_safe) {
      --num_selected_safe;
    }
    return num_selected_safe;
  }
};
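
// For illustration (worked example added here): with num_rows = 35, bit_offset = 0
// and 8-byte loads, FixBitAccess computes
//   num_bytes      = BytesForBits(35)   = 5
//   num_bytes_safe = max(0, 5 - 8 + 1)  = 0
//   num_rows_safe  = max(0, 8 * 0 - 0)  = 0
// so all 35 rows fall in the scalar tail. With num_rows = 1000 instead,
//   num_bytes = 125, num_bytes_safe = 118, num_rows_safe = 944,
// meaning rows [0, 944) can use 8-byte loads and rows [944, 1000) take the
// scalar path.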

}  // namespace compute
}  // namespace arrow