|
|
|||
File indexing completed on 2026-04-17 08:28:53
0001 // Licensed to the Apache Software Foundation (ASF) under one 0002 // or more contributor license agreements. See the NOTICE file 0003 // distributed with this work for additional information 0004 // regarding copyright ownership. The ASF licenses this file 0005 // to you under the Apache License, Version 2.0 (the 0006 // "License"); you may not use this file except in compliance 0007 // with the License. You may obtain a copy of the License at 0008 // 0009 // http://www.apache.org/licenses/LICENSE-2.0 0010 // 0011 // Unless required by applicable law or agreed to in writing, 0012 // software distributed under the License is distributed on an 0013 // "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY 0014 // KIND, either express or implied. See the License for the 0015 // specific language governing permissions and limitations 0016 // under the License. 0017 0018 #pragma once 0019 0020 #include <cmath> 0021 #include <cstdint> 0022 #include <memory> 0023 0024 #include "arrow/util/bit_util.h" 0025 #include "arrow/util/logging.h" 0026 #include "parquet/hasher.h" 0027 #include "parquet/platform.h" 0028 #include "parquet/types.h" 0029 0030 namespace parquet { 0031 0032 // A Bloom filter is a compact structure to indicate whether an item is not in a set or 0033 // probably in a set. The Bloom filter usually consists of a bit set that represents a 0034 // set of elements, a hash strategy and a Bloom filter algorithm. 0035 class PARQUET_EXPORT BloomFilter { 0036 public: 0037 // Maximum Bloom filter size, it sets to HDFS default block size 128MB 0038 // This value will be reconsidered when implementing Bloom filter producer. 0039 static constexpr uint32_t kMaximumBloomFilterBytes = 128 * 1024 * 1024; 0040 0041 /// Determine whether an element exist in set or not. 0042 /// 0043 /// @param hash the element to contain. 0044 /// @return false if value is definitely not in set, and true means PROBABLY 0045 /// in set. 0046 virtual bool FindHash(uint64_t hash) const = 0; 0047 0048 /// Insert element to set represented by Bloom filter bitset. 0049 /// @param hash the hash of value to insert into Bloom filter. 0050 virtual void InsertHash(uint64_t hash) = 0; 0051 0052 /// Insert elements to set represented by Bloom filter bitset. 0053 /// @param hashes the hash values to insert into Bloom filter. 0054 /// @param num_values the number of hash values to insert. 0055 virtual void InsertHashes(const uint64_t* hashes, int num_values) = 0; 0056 0057 /// Write this Bloom filter to an output stream. A Bloom filter structure should 0058 /// include bitset length, hash strategy, algorithm, and bitset. 0059 /// 0060 /// @param sink the output stream to write 0061 virtual void WriteTo(ArrowOutputStream* sink) const = 0; 0062 0063 /// Get the number of bytes of bitset 0064 virtual uint32_t GetBitsetSize() const = 0; 0065 0066 /// Compute hash for 32 bits value by using its plain encoding result. 0067 /// 0068 /// @param value the value to hash. 0069 /// @return hash result. 0070 virtual uint64_t Hash(int32_t value) const = 0; 0071 0072 /// Compute hash for 64 bits value by using its plain encoding result. 0073 /// 0074 /// @param value the value to hash. 0075 /// @return hash result. 0076 virtual uint64_t Hash(int64_t value) const = 0; 0077 0078 /// Compute hash for float value by using its plain encoding result. 0079 /// 0080 /// @param value the value to hash. 0081 /// @return hash result. 0082 virtual uint64_t Hash(float value) const = 0; 0083 0084 /// Compute hash for double value by using its plain encoding result. 0085 /// 0086 /// @param value the value to hash. 0087 /// @return hash result. 0088 virtual uint64_t Hash(double value) const = 0; 0089 0090 /// Compute hash for Int96 value by using its plain encoding result. 0091 /// 0092 /// @param value the value to hash. 0093 /// @return hash result. 0094 virtual uint64_t Hash(const Int96* value) const = 0; 0095 0096 /// Compute hash for ByteArray value by using its plain encoding result. 0097 /// 0098 /// @param value the value to hash. 0099 /// @return hash result. 0100 virtual uint64_t Hash(const ByteArray* value) const = 0; 0101 0102 /// Compute hash for fixed byte array value by using its plain encoding result. 0103 /// 0104 /// @param value the value address. 0105 /// @param len the value length. 0106 /// @return hash result. 0107 virtual uint64_t Hash(const FLBA* value, uint32_t len) const = 0; 0108 0109 /// Batch compute hashes for 32 bits values by using its plain encoding result. 0110 /// 0111 /// @param values values a pointer to the values to hash. 0112 /// @param num_values the number of values to hash. 0113 /// @param hashes a pointer to the output hash values, its length should be equal to 0114 /// num_values. 0115 virtual void Hashes(const int32_t* values, int num_values, uint64_t* hashes) const = 0; 0116 0117 /// Batch compute hashes for 64 bits values by using its plain encoding result. 0118 /// 0119 /// @param values values a pointer to the values to hash. 0120 /// @param num_values the number of values to hash. 0121 /// @param hashes a pointer to the output hash values, its length should be equal to 0122 /// num_values. 0123 virtual void Hashes(const int64_t* values, int num_values, uint64_t* hashes) const = 0; 0124 0125 /// Batch compute hashes for float values by using its plain encoding result. 0126 /// 0127 /// @param values values a pointer to the values to hash. 0128 /// @param num_values the number of values to hash. 0129 /// @param hashes a pointer to the output hash values, its length should be equal to 0130 /// num_values. 0131 virtual void Hashes(const float* values, int num_values, uint64_t* hashes) const = 0; 0132 0133 /// Batch compute hashes for double values by using its plain encoding result. 0134 /// 0135 /// @param values values a pointer to the values to hash. 0136 /// @param num_values the number of values to hash. 0137 /// @param hashes a pointer to the output hash values, its length should be equal to 0138 /// num_values. 0139 virtual void Hashes(const double* values, int num_values, uint64_t* hashes) const = 0; 0140 0141 /// Batch compute hashes for Int96 values by using its plain encoding result. 0142 /// 0143 /// @param values values a pointer to the values to hash. 0144 /// @param num_values the number of values to hash. 0145 /// @param hashes a pointer to the output hash values, its length should be equal to 0146 /// num_values. 0147 virtual void Hashes(const Int96* values, int num_values, uint64_t* hashes) const = 0; 0148 0149 /// Batch compute hashes for ByteArray values by using its plain encoding result. 0150 /// 0151 /// @param values values a pointer to the values to hash. 0152 /// @param num_values the number of values to hash. 0153 /// @param hashes a pointer to the output hash values, its length should be equal to 0154 /// num_values. 0155 virtual void Hashes(const ByteArray* values, int num_values, 0156 uint64_t* hashes) const = 0; 0157 0158 /// Batch compute hashes for fixed byte array values by using its plain encoding result. 0159 /// 0160 /// @param values values a pointer to the values to hash. 0161 /// @param type_len the value length. 0162 /// @param num_values the number of values to hash. 0163 /// @param hashes a pointer to the output hash values, its length should be equal to 0164 /// num_values. 0165 virtual void Hashes(const FLBA* values, uint32_t type_len, int num_values, 0166 uint64_t* hashes) const = 0; 0167 0168 virtual ~BloomFilter() = default; 0169 0170 protected: 0171 // Hash strategy available for Bloom filter. 0172 enum class HashStrategy : uint32_t { XXHASH = 0 }; 0173 0174 // Bloom filter algorithm. 0175 enum class Algorithm : uint32_t { BLOCK = 0 }; 0176 0177 enum class CompressionStrategy : uint32_t { UNCOMPRESSED = 0 }; 0178 }; 0179 0180 /// The BlockSplitBloomFilter is implemented using block-based Bloom filters from 0181 /// Putze et al.'s "Cache-,Hash- and Space-Efficient Bloom filters". The basic idea is to 0182 /// hash the item to a tiny Bloom filter which size fit a single cache line or smaller. 0183 /// 0184 /// This implementation sets 8 bits in each tiny Bloom filter. Each tiny Bloom 0185 /// filter is 32 bytes to take advantage of 32-byte SIMD instructions. 0186 class PARQUET_EXPORT BlockSplitBloomFilter : public BloomFilter { 0187 public: 0188 /// The constructor of BlockSplitBloomFilter. It uses XXH64 as hash function. 0189 /// 0190 /// \param pool memory pool to use. 0191 explicit BlockSplitBloomFilter( 0192 ::arrow::MemoryPool* pool = ::arrow::default_memory_pool()); 0193 0194 /// Initialize the BlockSplitBloomFilter. The range of num_bytes should be within 0195 /// [kMinimumBloomFilterBytes, kMaximumBloomFilterBytes], it will be 0196 /// rounded up/down to lower/upper bound if num_bytes is out of range and also 0197 /// will be rounded up to a power of 2. 0198 /// 0199 /// @param num_bytes The number of bytes to store Bloom filter bitset. 0200 void Init(uint32_t num_bytes); 0201 0202 /// Initialize the BlockSplitBloomFilter. It copies the bitset as underlying 0203 /// bitset because the given bitset may not satisfy the 32-byte alignment requirement 0204 /// which may lead to segfault when performing SIMD instructions. It is the caller's 0205 /// responsibility to free the bitset passed in. This is used when reconstructing 0206 /// a Bloom filter from a parquet file. 0207 /// 0208 /// @param bitset The given bitset to initialize the Bloom filter. 0209 /// @param num_bytes The number of bytes of given bitset. 0210 void Init(const uint8_t* bitset, uint32_t num_bytes); 0211 0212 /// Minimum Bloom filter size, it sets to 32 bytes to fit a tiny Bloom filter. 0213 static constexpr uint32_t kMinimumBloomFilterBytes = 32; 0214 0215 /// Calculate optimal size according to the number of distinct values and false 0216 /// positive probability. 0217 /// 0218 /// @param ndv The number of distinct values. 0219 /// @param fpp The false positive probability. 0220 /// @return it always return a value between kMinimumBloomFilterBytes and 0221 /// kMaximumBloomFilterBytes, and the return value is always a power of 2 0222 static uint32_t OptimalNumOfBytes(uint32_t ndv, double fpp) { 0223 uint32_t optimal_num_of_bits = OptimalNumOfBits(ndv, fpp); 0224 ARROW_DCHECK(::arrow::bit_util::IsMultipleOf8(optimal_num_of_bits)); 0225 return optimal_num_of_bits >> 3; 0226 } 0227 0228 /// Calculate optimal size according to the number of distinct values and false 0229 /// positive probability. 0230 /// 0231 /// @param ndv The number of distinct values. 0232 /// @param fpp The false positive probability. 0233 /// @return it always return a value between kMinimumBloomFilterBytes * 8 and 0234 /// kMaximumBloomFilterBytes * 8, and the return value is always a power of 16 0235 static uint32_t OptimalNumOfBits(uint32_t ndv, double fpp) { 0236 ARROW_DCHECK(fpp > 0.0 && fpp < 1.0); 0237 const double m = -8.0 * ndv / log(1 - pow(fpp, 1.0 / 8)); 0238 uint32_t num_bits; 0239 0240 // Handle overflow. 0241 if (m < 0 || m > kMaximumBloomFilterBytes << 3) { 0242 num_bits = static_cast<uint32_t>(kMaximumBloomFilterBytes << 3); 0243 } else { 0244 num_bits = static_cast<uint32_t>(m); 0245 } 0246 0247 // Round up to lower bound 0248 if (num_bits < kMinimumBloomFilterBytes << 3) { 0249 num_bits = kMinimumBloomFilterBytes << 3; 0250 } 0251 0252 // Get next power of 2 if bits is not power of 2. 0253 if ((num_bits & (num_bits - 1)) != 0) { 0254 num_bits = static_cast<uint32_t>(::arrow::bit_util::NextPower2(num_bits)); 0255 } 0256 0257 // Round down to upper bound 0258 if (num_bits > kMaximumBloomFilterBytes << 3) { 0259 num_bits = kMaximumBloomFilterBytes << 3; 0260 } 0261 0262 return num_bits; 0263 } 0264 0265 bool FindHash(uint64_t hash) const override; 0266 void InsertHash(uint64_t hash) override; 0267 void InsertHashes(const uint64_t* hashes, int num_values) override; 0268 void WriteTo(ArrowOutputStream* sink) const override; 0269 uint32_t GetBitsetSize() const override { return num_bytes_; } 0270 0271 uint64_t Hash(int32_t value) const override { return hasher_->Hash(value); } 0272 uint64_t Hash(int64_t value) const override { return hasher_->Hash(value); } 0273 uint64_t Hash(float value) const override { return hasher_->Hash(value); } 0274 uint64_t Hash(double value) const override { return hasher_->Hash(value); } 0275 uint64_t Hash(const Int96* value) const override { return hasher_->Hash(value); } 0276 uint64_t Hash(const ByteArray* value) const override { return hasher_->Hash(value); } 0277 uint64_t Hash(const FLBA* value, uint32_t len) const override { 0278 return hasher_->Hash(value, len); 0279 } 0280 0281 void Hashes(const int32_t* values, int num_values, uint64_t* hashes) const override { 0282 hasher_->Hashes(values, num_values, hashes); 0283 } 0284 void Hashes(const int64_t* values, int num_values, uint64_t* hashes) const override { 0285 hasher_->Hashes(values, num_values, hashes); 0286 } 0287 void Hashes(const float* values, int num_values, uint64_t* hashes) const override { 0288 hasher_->Hashes(values, num_values, hashes); 0289 } 0290 void Hashes(const double* values, int num_values, uint64_t* hashes) const override { 0291 hasher_->Hashes(values, num_values, hashes); 0292 } 0293 void Hashes(const Int96* values, int num_values, uint64_t* hashes) const override { 0294 hasher_->Hashes(values, num_values, hashes); 0295 } 0296 void Hashes(const ByteArray* values, int num_values, uint64_t* hashes) const override { 0297 hasher_->Hashes(values, num_values, hashes); 0298 } 0299 void Hashes(const FLBA* values, uint32_t type_len, int num_values, 0300 uint64_t* hashes) const override { 0301 hasher_->Hashes(values, type_len, num_values, hashes); 0302 } 0303 0304 uint64_t Hash(const int32_t* value) const { return hasher_->Hash(*value); } 0305 uint64_t Hash(const int64_t* value) const { return hasher_->Hash(*value); } 0306 uint64_t Hash(const float* value) const { return hasher_->Hash(*value); } 0307 uint64_t Hash(const double* value) const { return hasher_->Hash(*value); } 0308 0309 /// Deserialize the Bloom filter from an input stream. It is used when reconstructing 0310 /// a Bloom filter from a parquet filter. 0311 /// 0312 /// @param properties The parquet reader properties. 0313 /// @param input_stream The input stream from which to construct the bloom filter. 0314 /// @param bloom_filter_length The length of the serialized bloom filter including 0315 /// header. 0316 /// @return The BlockSplitBloomFilter. 0317 static BlockSplitBloomFilter Deserialize( 0318 const ReaderProperties& properties, ArrowInputStream* input_stream, 0319 std::optional<int64_t> bloom_filter_length = std::nullopt); 0320 0321 private: 0322 inline void InsertHashImpl(uint64_t hash); 0323 0324 // Bytes in a tiny Bloom filter block. 0325 static constexpr int kBytesPerFilterBlock = 32; 0326 0327 // The number of bits to be set in each tiny Bloom filter 0328 static constexpr int kBitsSetPerBlock = 8; 0329 0330 // A mask structure used to set bits in each tiny Bloom filter. 0331 struct BlockMask { 0332 uint32_t item[kBitsSetPerBlock]; 0333 }; 0334 0335 // The block-based algorithm needs eight odd SALT values to calculate eight indexes 0336 // of bit to set, one bit in each 32-bit word. 0337 static constexpr uint32_t SALT[kBitsSetPerBlock] = { 0338 0x47b6137bU, 0x44974d91U, 0x8824ad5bU, 0xa2b7289dU, 0339 0x705495c7U, 0x2df1424bU, 0x9efc4947U, 0x5c6bfb31U}; 0340 0341 // Memory pool to allocate aligned buffer for bitset 0342 ::arrow::MemoryPool* pool_; 0343 0344 // The underlying buffer of bitset. 0345 std::shared_ptr<Buffer> data_; 0346 0347 // The number of bytes of Bloom filter bitset. 0348 uint32_t num_bytes_; 0349 0350 // Hash strategy used in this Bloom filter. 0351 HashStrategy hash_strategy_; 0352 0353 // Algorithm used in this Bloom filter. 0354 Algorithm algorithm_; 0355 0356 // Compression used in this Bloom filter. 0357 CompressionStrategy compression_strategy_; 0358 0359 // The hash pointer points to actual hash class used. 0360 std::unique_ptr<Hasher> hasher_; 0361 }; 0362 0363 } // namespace parquet
| [ Source navigation ] | [ Diff markup ] | [ Identifier search ] | [ general search ] |
|
This page was automatically generated by the 2.3.7 LXR engine. The LXR team |
|