include/parquet/bloom_filter.h

0001 // Licensed to the Apache Software Foundation (ASF) under one
0002 // or more contributor license agreements.  See the NOTICE file
0003 // distributed with this work for additional information
0004 // regarding copyright ownership.  The ASF licenses this file
0005 // to you under the Apache License, Version 2.0 (the
0006 // "License"); you may not use this file except in compliance
0007 // with the License.  You may obtain a copy of the License at
0008 //
0009 //   http://www.apache.org/licenses/LICENSE-2.0
0010 //
0011 // Unless required by applicable law or agreed to in writing,
0012 // software distributed under the License is distributed on an
0013 // "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
0014 // KIND, either express or implied.  See the License for the
0015 // specific language governing permissions and limitations
0016 // under the License.
0017
0018 #pragma once
0019
0020 #include <cmath>
0021 #include <cstdint>
0022 #include <memory>
0023
0024 #include "arrow/util/bit_util.h"
0025 #include "arrow/util/logging.h"
0026 #include "parquet/hasher.h"
0027 #include "parquet/platform.h"
0028 #include "parquet/types.h"
0029
0030 namespace parquet {
0031
0032 // A Bloom filter is a compact structure to indicate whether an item is not in a set or
0033 // probably in a set. The Bloom filter usually consists of a bit set that represents a
0034 // set of elements, a hash strategy and a Bloom filter algorithm.
0035 class PARQUET_EXPORT BloomFilter {
0036  public:
0037   // Maximum Bloom filter size, it sets to HDFS default block size 128MB
0038   // This value will be reconsidered when implementing Bloom filter producer.
0039   static constexpr uint32_t kMaximumBloomFilterBytes = 128 * 1024 * 1024;
0040
0041   /// Determine whether an element exist in set or not.
0042   ///
0043   /// @param hash the element to contain.
0044   /// @return false if value is definitely not in set, and true means PROBABLY
0045   /// in set.
0046   virtual bool FindHash(uint64_t hash) const = 0;
0047
0048   /// Insert element to set represented by Bloom filter bitset.
0049   /// @param hash the hash of value to insert into Bloom filter.
0050   virtual void InsertHash(uint64_t hash) = 0;
0051
0052   /// Insert elements to set represented by Bloom filter bitset.
0053   /// @param hashes the hash values to insert into Bloom filter.
0054   /// @param num_values the number of hash values to insert.
0055   virtual void InsertHashes(const uint64_t* hashes, int num_values) = 0;
0056
0057   /// Write this Bloom filter to an output stream. A Bloom filter structure should
0058   /// include bitset length, hash strategy, algorithm, and bitset.
0059   ///
0060   /// @param sink the output stream to write
0061   virtual void WriteTo(ArrowOutputStream* sink) const = 0;
0062
0063   /// Get the number of bytes of bitset
0064   virtual uint32_t GetBitsetSize() const = 0;
0065
0066   /// Compute hash for 32 bits value by using its plain encoding result.
0067   ///
0068   /// @param value the value to hash.
0069   /// @return hash result.
0070   virtual uint64_t Hash(int32_t value) const = 0;
0071
0072   /// Compute hash for 64 bits value by using its plain encoding result.
0073   ///
0074   /// @param value the value to hash.
0075   /// @return hash result.
0076   virtual uint64_t Hash(int64_t value) const = 0;
0077
0078   /// Compute hash for float value by using its plain encoding result.
0079   ///
0080   /// @param value the value to hash.
0081   /// @return hash result.
0082   virtual uint64_t Hash(float value) const = 0;
0083
0084   /// Compute hash for double value by using its plain encoding result.
0085   ///
0086   /// @param value the value to hash.
0087   /// @return hash result.
0088   virtual uint64_t Hash(double value) const = 0;
0089
0090   /// Compute hash for Int96 value by using its plain encoding result.
0091   ///
0092   /// @param value the value to hash.
0093   /// @return hash result.
0094   virtual uint64_t Hash(const Int96* value) const = 0;
0095
0096   /// Compute hash for ByteArray value by using its plain encoding result.
0097   ///
0098   /// @param value the value to hash.
0099   /// @return hash result.
0100   virtual uint64_t Hash(const ByteArray* value) const = 0;
0101
0102   /// Compute hash for fixed byte array value by using its plain encoding result.
0103   ///
0104   /// @param value the value address.
0105   /// @param len the value length.
0106   /// @return hash result.
0107   virtual uint64_t Hash(const FLBA* value, uint32_t len) const = 0;
0108
0109   /// Batch compute hashes for 32 bits values by using its plain encoding result.
0110   ///
0111   /// @param values values a pointer to the values to hash.
0112   /// @param num_values the number of values to hash.
0113   /// @param hashes a pointer to the output hash values, its length should be equal to
0114   /// num_values.
0115   virtual void Hashes(const int32_t* values, int num_values, uint64_t* hashes) const = 0;
0116
0117   /// Batch compute hashes for 64 bits values by using its plain encoding result.
0118   ///
0119   /// @param values values a pointer to the values to hash.
0120   /// @param num_values the number of values to hash.
0121   /// @param hashes a pointer to the output hash values, its length should be equal to
0122   /// num_values.
0123   virtual void Hashes(const int64_t* values, int num_values, uint64_t* hashes) const = 0;
0124
0125   /// Batch compute hashes for float values by using its plain encoding result.
0126   ///
0127   /// @param values values a pointer to the values to hash.
0128   /// @param num_values the number of values to hash.
0129   /// @param hashes a pointer to the output hash values, its length should be equal to
0130   /// num_values.
0131   virtual void Hashes(const float* values, int num_values, uint64_t* hashes) const = 0;
0132
0133   /// Batch compute hashes for double values by using its plain encoding result.
0134   ///
0135   /// @param values values a pointer to the values to hash.
0136   /// @param num_values the number of values to hash.
0137   /// @param hashes a pointer to the output hash values, its length should be equal to
0138   /// num_values.
0139   virtual void Hashes(const double* values, int num_values, uint64_t* hashes) const = 0;
0140
0141   /// Batch compute hashes for Int96 values by using its plain encoding result.
0142   ///
0143   /// @param values values a pointer to the values to hash.
0144   /// @param num_values the number of values to hash.
0145   /// @param hashes a pointer to the output hash values, its length should be equal to
0146   /// num_values.
0147   virtual void Hashes(const Int96* values, int num_values, uint64_t* hashes) const = 0;
0148
0149   /// Batch compute hashes for ByteArray values by using its plain encoding result.
0150   ///
0151   /// @param values values a pointer to the values to hash.
0152   /// @param num_values the number of values to hash.
0153   /// @param hashes a pointer to the output hash values, its length should be equal to
0154   /// num_values.
0155   virtual void Hashes(const ByteArray* values, int num_values,
0156                       uint64_t* hashes) const = 0;
0157
0158   /// Batch compute hashes for fixed byte array values by using its plain encoding result.
0159   ///
0160   /// @param values values a pointer to the values to hash.
0161   /// @param type_len the value length.
0162   /// @param num_values the number of values to hash.
0163   /// @param hashes a pointer to the output hash values, its length should be equal to
0164   /// num_values.
0165   virtual void Hashes(const FLBA* values, uint32_t type_len, int num_values,
0166                       uint64_t* hashes) const = 0;
0167
0168   virtual ~BloomFilter() = default;
0169
0170  protected:
0171   // Hash strategy available for Bloom filter.
0172   enum class HashStrategy : uint32_t { XXHASH = 0 };
0173
0174   // Bloom filter algorithm.
0175   enum class Algorithm : uint32_t { BLOCK = 0 };
0176
0177   enum class CompressionStrategy : uint32_t { UNCOMPRESSED = 0 };
0178 };
0179
0180 /// The BlockSplitBloomFilter is implemented using block-based Bloom filters from
0181 /// Putze et al.'s "Cache-,Hash- and Space-Efficient Bloom filters". The basic idea is to
0182 /// hash the item to a tiny Bloom filter which size fit a single cache line or smaller.
0183 ///
0184 /// This implementation sets 8 bits in each tiny Bloom filter. Each tiny Bloom
0185 /// filter is 32 bytes to take advantage of 32-byte SIMD instructions.
0186 class PARQUET_EXPORT BlockSplitBloomFilter : public BloomFilter {
0187  public:
0188   /// The constructor of BlockSplitBloomFilter. It uses XXH64 as hash function.
0189   ///
0190   /// \param pool memory pool to use.
0191   explicit BlockSplitBloomFilter(
0192       ::arrow::MemoryPool* pool = ::arrow::default_memory_pool());
0193
0194   /// Initialize the BlockSplitBloomFilter. The range of num_bytes should be within
0195   /// [kMinimumBloomFilterBytes, kMaximumBloomFilterBytes], it will be
0196   /// rounded up/down to lower/upper bound if num_bytes is out of range and also
0197   /// will be rounded up to a power of 2.
0198   ///
0199   /// @param num_bytes The number of bytes to store Bloom filter bitset.
0200   void Init(uint32_t num_bytes);
0201
0202   /// Initialize the BlockSplitBloomFilter. It copies the bitset as underlying
0203   /// bitset because the given bitset may not satisfy the 32-byte alignment requirement
0204   /// which may lead to segfault when performing SIMD instructions. It is the caller's
0205   /// responsibility to free the bitset passed in. This is used when reconstructing
0206   /// a Bloom filter from a parquet file.
0207   ///
0208   /// @param bitset The given bitset to initialize the Bloom filter.
0209   /// @param num_bytes  The number of bytes of given bitset.
0210   void Init(const uint8_t* bitset, uint32_t num_bytes);
0211
0212   /// Minimum Bloom filter size, it sets to 32 bytes to fit a tiny Bloom filter.
0213   static constexpr uint32_t kMinimumBloomFilterBytes = 32;
0214
0215   /// Calculate optimal size according to the number of distinct values and false
0216   /// positive probability.
0217   ///
0218   /// @param ndv The number of distinct values.
0219   /// @param fpp The false positive probability.
0220   /// @return it always return a value between kMinimumBloomFilterBytes and
0221   /// kMaximumBloomFilterBytes, and the return value is always a power of 2
0222   static uint32_t OptimalNumOfBytes(uint32_t ndv, double fpp) {
0223     uint32_t optimal_num_of_bits = OptimalNumOfBits(ndv, fpp);
0224     ARROW_DCHECK(::arrow::bit_util::IsMultipleOf8(optimal_num_of_bits));
0225     return optimal_num_of_bits >> 3;
0226   }
0227
0228   /// Calculate optimal size according to the number of distinct values and false
0229   /// positive probability.
0230   ///
0231   /// @param ndv The number of distinct values.
0232   /// @param fpp The false positive probability.
0233   /// @return it always return a value between kMinimumBloomFilterBytes * 8 and
0234   /// kMaximumBloomFilterBytes * 8, and the return value is always a power of 16
0235   static uint32_t OptimalNumOfBits(uint32_t ndv, double fpp) {
0236     ARROW_DCHECK(fpp > 0.0 && fpp < 1.0);
0237     const double m = -8.0 * ndv / log(1 - pow(fpp, 1.0 / 8));
0238     uint32_t num_bits;
0239
0240     // Handle overflow.
0241     if (m < 0 || m > kMaximumBloomFilterBytes << 3) {
0242       num_bits = static_cast<uint32_t>(kMaximumBloomFilterBytes << 3);
0243     } else {
0244       num_bits = static_cast<uint32_t>(m);
0245     }
0246
0247     // Round up to lower bound
0248     if (num_bits < kMinimumBloomFilterBytes << 3) {
0249       num_bits = kMinimumBloomFilterBytes << 3;
0250     }
0251
0252     // Get next power of 2 if bits is not power of 2.
0253     if ((num_bits & (num_bits - 1)) != 0) {
0254       num_bits = static_cast<uint32_t>(::arrow::bit_util::NextPower2(num_bits));
0255     }
0256
0257     // Round down to upper bound
0258     if (num_bits > kMaximumBloomFilterBytes << 3) {
0259       num_bits = kMaximumBloomFilterBytes << 3;
0260     }
0261
0262     return num_bits;
0263   }
0264
0265   bool FindHash(uint64_t hash) const override;
0266   void InsertHash(uint64_t hash) override;
0267   void InsertHashes(const uint64_t* hashes, int num_values) override;
0268   void WriteTo(ArrowOutputStream* sink) const override;
0269   uint32_t GetBitsetSize() const override { return num_bytes_; }
0270
0271   uint64_t Hash(int32_t value) const override { return hasher_->Hash(value); }
0272   uint64_t Hash(int64_t value) const override { return hasher_->Hash(value); }
0273   uint64_t Hash(float value) const override { return hasher_->Hash(value); }
0274   uint64_t Hash(double value) const override { return hasher_->Hash(value); }
0275   uint64_t Hash(const Int96* value) const override { return hasher_->Hash(value); }
0276   uint64_t Hash(const ByteArray* value) const override { return hasher_->Hash(value); }
0277   uint64_t Hash(const FLBA* value, uint32_t len) const override {
0278     return hasher_->Hash(value, len);
0279   }
0280
0281   void Hashes(const int32_t* values, int num_values, uint64_t* hashes) const override {
0282     hasher_->Hashes(values, num_values, hashes);
0283   }
0284   void Hashes(const int64_t* values, int num_values, uint64_t* hashes) const override {
0285     hasher_->Hashes(values, num_values, hashes);
0286   }
0287   void Hashes(const float* values, int num_values, uint64_t* hashes) const override {
0288     hasher_->Hashes(values, num_values, hashes);
0289   }
0290   void Hashes(const double* values, int num_values, uint64_t* hashes) const override {
0291     hasher_->Hashes(values, num_values, hashes);
0292   }
0293   void Hashes(const Int96* values, int num_values, uint64_t* hashes) const override {
0294     hasher_->Hashes(values, num_values, hashes);
0295   }
0296   void Hashes(const ByteArray* values, int num_values, uint64_t* hashes) const override {
0297     hasher_->Hashes(values, num_values, hashes);
0298   }
0299   void Hashes(const FLBA* values, uint32_t type_len, int num_values,
0300               uint64_t* hashes) const override {
0301     hasher_->Hashes(values, type_len, num_values, hashes);
0302   }
0303
0304   uint64_t Hash(const int32_t* value) const { return hasher_->Hash(*value); }
0305   uint64_t Hash(const int64_t* value) const { return hasher_->Hash(*value); }
0306   uint64_t Hash(const float* value) const { return hasher_->Hash(*value); }
0307   uint64_t Hash(const double* value) const { return hasher_->Hash(*value); }
0308
0309   /// Deserialize the Bloom filter from an input stream. It is used when reconstructing
0310   /// a Bloom filter from a parquet filter.
0311   ///
0312   /// @param properties The parquet reader properties.
0313   /// @param input_stream The input stream from which to construct the bloom filter.
0314   /// @param bloom_filter_length The length of the serialized bloom filter including
0315   /// header.
0316   /// @return The BlockSplitBloomFilter.
0317   static BlockSplitBloomFilter Deserialize(
0318       const ReaderProperties& properties, ArrowInputStream* input_stream,
0319       std::optional<int64_t> bloom_filter_length = std::nullopt);
0320
0321  private:
0322   inline void InsertHashImpl(uint64_t hash);
0323
0324   // Bytes in a tiny Bloom filter block.
0325   static constexpr int kBytesPerFilterBlock = 32;
0326
0327   // The number of bits to be set in each tiny Bloom filter
0328   static constexpr int kBitsSetPerBlock = 8;
0329
0330   // A mask structure used to set bits in each tiny Bloom filter.
0331   struct BlockMask {
0332     uint32_t item[kBitsSetPerBlock];
0333   };
0334
0335   // The block-based algorithm needs eight odd SALT values to calculate eight indexes
0336   // of bit to set, one bit in each 32-bit word.
0337   static constexpr uint32_t SALT[kBitsSetPerBlock] = {
0338       0x47b6137bU, 0x44974d91U, 0x8824ad5bU, 0xa2b7289dU,
0339       0x705495c7U, 0x2df1424bU, 0x9efc4947U, 0x5c6bfb31U};
0340
0341   // Memory pool to allocate aligned buffer for bitset
0342   ::arrow::MemoryPool* pool_;
0343
0344   // The underlying buffer of bitset.
0345   std::shared_ptr<Buffer> data_;
0346
0347   // The number of bytes of Bloom filter bitset.
0348   uint32_t num_bytes_;
0349
0350   // Hash strategy used in this Bloom filter.
0351   HashStrategy hash_strategy_;
0352
0353   // Algorithm used in this Bloom filter.
0354   Algorithm algorithm_;
0355
0356   // Compression used in this Bloom filter.
0357   CompressionStrategy compression_strategy_;
0358
0359   // The hash pointer points to actual hash class used.
0360   std::unique_ptr<Hasher> hasher_;
0361 };
0362
0363 }  // namespace parquet