File indexing completed on 2026-04-17 08:28:54
0001
0002
0003
0004
0005
0006
0007
0008
0009
0010
0011
0012
0013
0014
0015
0016
0017 #pragma once
0018
0019 #include "parquet/level_conversion.h"
0020
0021 #include <algorithm>
0022 #include <cstdint>
0023 #include <limits>
0024
0025 #include "arrow/util/bit_run_reader.h"
0026 #include "arrow/util/bit_util.h"
0027 #include "arrow/util/bitmap_writer.h"
0028 #include "arrow/util/logging.h"
0029 #include "arrow/util/simd.h"
0030 #include "parquet/exception.h"
0031 #include "parquet/level_comparison.h"
0032
0033 #ifndef PARQUET_IMPL_NAMESPACE
0034 # error "PARQUET_IMPL_NAMESPACE must be defined"
0035 #endif
0036
0037 namespace parquet::internal::PARQUET_IMPL_NAMESPACE {
0038
0039
0040
0041
0042
0043
0044
0045
0046
0047
0048
0049
0050
0051
0052
0053
0054
0055
0056
0057
0058
0059
0060
0061
0062
0063
0064
0065
0066
0067
0068
0069
0070
0071
0072 constexpr int kLookupBits = 5;
0073 constexpr uint8_t kPextTable[1 << kLookupBits][1 << kLookupBits] = {
0074 {
0075 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
0076 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
0077 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
0078 },
0079 {
0080 0x00, 0x01, 0x00, 0x01, 0x00, 0x01, 0x00, 0x01, 0x00, 0x01, 0x00,
0081 0x01, 0x00, 0x01, 0x00, 0x01, 0x00, 0x01, 0x00, 0x01, 0x00, 0x01,
0082 0x00, 0x01, 0x00, 0x01, 0x00, 0x01, 0x00, 0x01, 0x00, 0x01,
0083 },
0084 {
0085 0x00, 0x00, 0x01, 0x01, 0x00, 0x00, 0x01, 0x01, 0x00, 0x00, 0x01,
0086 0x01, 0x00, 0x00, 0x01, 0x01, 0x00, 0x00, 0x01, 0x01, 0x00, 0x00,
0087 0x01, 0x01, 0x00, 0x00, 0x01, 0x01, 0x00, 0x00, 0x01, 0x01,
0088 },
0089 {
0090 0x00, 0x01, 0x02, 0x03, 0x00, 0x01, 0x02, 0x03, 0x00, 0x01, 0x02,
0091 0x03, 0x00, 0x01, 0x02, 0x03, 0x00, 0x01, 0x02, 0x03, 0x00, 0x01,
0092 0x02, 0x03, 0x00, 0x01, 0x02, 0x03, 0x00, 0x01, 0x02, 0x03,
0093 },
0094 {
0095 0x00, 0x00, 0x00, 0x00, 0x01, 0x01, 0x01, 0x01, 0x00, 0x00, 0x00,
0096 0x00, 0x01, 0x01, 0x01, 0x01, 0x00, 0x00, 0x00, 0x00, 0x01, 0x01,
0097 0x01, 0x01, 0x00, 0x00, 0x00, 0x00, 0x01, 0x01, 0x01, 0x01,
0098 },
0099 {
0100 0x00, 0x01, 0x00, 0x01, 0x02, 0x03, 0x02, 0x03, 0x00, 0x01, 0x00,
0101 0x01, 0x02, 0x03, 0x02, 0x03, 0x00, 0x01, 0x00, 0x01, 0x02, 0x03,
0102 0x02, 0x03, 0x00, 0x01, 0x00, 0x01, 0x02, 0x03, 0x02, 0x03,
0103 },
0104 {
0105 0x00, 0x00, 0x01, 0x01, 0x02, 0x02, 0x03, 0x03, 0x00, 0x00, 0x01,
0106 0x01, 0x02, 0x02, 0x03, 0x03, 0x00, 0x00, 0x01, 0x01, 0x02, 0x02,
0107 0x03, 0x03, 0x00, 0x00, 0x01, 0x01, 0x02, 0x02, 0x03, 0x03,
0108 },
0109 {
0110 0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07, 0x00, 0x01, 0x02,
0111 0x03, 0x04, 0x05, 0x06, 0x07, 0x00, 0x01, 0x02, 0x03, 0x04, 0x05,
0112 0x06, 0x07, 0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07,
0113 },
0114 {
0115 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x01, 0x01, 0x01,
0116 0x01, 0x01, 0x01, 0x01, 0x01, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
0117 0x00, 0x00, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01,
0118 },
0119 {
0120 0x00, 0x01, 0x00, 0x01, 0x00, 0x01, 0x00, 0x01, 0x02, 0x03, 0x02,
0121 0x03, 0x02, 0x03, 0x02, 0x03, 0x00, 0x01, 0x00, 0x01, 0x00, 0x01,
0122 0x00, 0x01, 0x02, 0x03, 0x02, 0x03, 0x02, 0x03, 0x02, 0x03,
0123 },
0124 {
0125 0x00, 0x00, 0x01, 0x01, 0x00, 0x00, 0x01, 0x01, 0x02, 0x02, 0x03,
0126 0x03, 0x02, 0x02, 0x03, 0x03, 0x00, 0x00, 0x01, 0x01, 0x00, 0x00,
0127 0x01, 0x01, 0x02, 0x02, 0x03, 0x03, 0x02, 0x02, 0x03, 0x03,
0128 },
0129 {
0130 0x00, 0x01, 0x02, 0x03, 0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06,
0131 0x07, 0x04, 0x05, 0x06, 0x07, 0x00, 0x01, 0x02, 0x03, 0x00, 0x01,
0132 0x02, 0x03, 0x04, 0x05, 0x06, 0x07, 0x04, 0x05, 0x06, 0x07,
0133 },
0134 {
0135 0x00, 0x00, 0x00, 0x00, 0x01, 0x01, 0x01, 0x01, 0x02, 0x02, 0x02,
0136 0x02, 0x03, 0x03, 0x03, 0x03, 0x00, 0x00, 0x00, 0x00, 0x01, 0x01,
0137 0x01, 0x01, 0x02, 0x02, 0x02, 0x02, 0x03, 0x03, 0x03, 0x03,
0138 },
0139 {
0140 0x00, 0x01, 0x00, 0x01, 0x02, 0x03, 0x02, 0x03, 0x04, 0x05, 0x04,
0141 0x05, 0x06, 0x07, 0x06, 0x07, 0x00, 0x01, 0x00, 0x01, 0x02, 0x03,
0142 0x02, 0x03, 0x04, 0x05, 0x04, 0x05, 0x06, 0x07, 0x06, 0x07,
0143 },
0144 {
0145 0x00, 0x00, 0x01, 0x01, 0x02, 0x02, 0x03, 0x03, 0x04, 0x04, 0x05,
0146 0x05, 0x06, 0x06, 0x07, 0x07, 0x00, 0x00, 0x01, 0x01, 0x02, 0x02,
0147 0x03, 0x03, 0x04, 0x04, 0x05, 0x05, 0x06, 0x06, 0x07, 0x07,
0148 },
0149 {
0150 0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07, 0x08, 0x09, 0x0A,
0151 0x0B, 0x0C, 0x0D, 0x0E, 0x0F, 0x00, 0x01, 0x02, 0x03, 0x04, 0x05,
0152 0x06, 0x07, 0x08, 0x09, 0x0A, 0x0B, 0x0C, 0x0D, 0x0E, 0x0F,
0153 },
0154 {
0155 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
0156 0x00, 0x00, 0x00, 0x00, 0x00, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01,
0157 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01,
0158 },
0159 {
0160 0x00, 0x01, 0x00, 0x01, 0x00, 0x01, 0x00, 0x01, 0x00, 0x01, 0x00,
0161 0x01, 0x00, 0x01, 0x00, 0x01, 0x02, 0x03, 0x02, 0x03, 0x02, 0x03,
0162 0x02, 0x03, 0x02, 0x03, 0x02, 0x03, 0x02, 0x03, 0x02, 0x03,
0163 },
0164 {
0165 0x00, 0x00, 0x01, 0x01, 0x00, 0x00, 0x01, 0x01, 0x00, 0x00, 0x01,
0166 0x01, 0x00, 0x00, 0x01, 0x01, 0x02, 0x02, 0x03, 0x03, 0x02, 0x02,
0167 0x03, 0x03, 0x02, 0x02, 0x03, 0x03, 0x02, 0x02, 0x03, 0x03,
0168 },
0169 {
0170 0x00, 0x01, 0x02, 0x03, 0x00, 0x01, 0x02, 0x03, 0x00, 0x01, 0x02,
0171 0x03, 0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07, 0x04, 0x05,
0172 0x06, 0x07, 0x04, 0x05, 0x06, 0x07, 0x04, 0x05, 0x06, 0x07,
0173 },
0174 {
0175 0x00, 0x00, 0x00, 0x00, 0x01, 0x01, 0x01, 0x01, 0x00, 0x00, 0x00,
0176 0x00, 0x01, 0x01, 0x01, 0x01, 0x02, 0x02, 0x02, 0x02, 0x03, 0x03,
0177 0x03, 0x03, 0x02, 0x02, 0x02, 0x02, 0x03, 0x03, 0x03, 0x03,
0178 },
0179 {
0180 0x00, 0x01, 0x00, 0x01, 0x02, 0x03, 0x02, 0x03, 0x00, 0x01, 0x00,
0181 0x01, 0x02, 0x03, 0x02, 0x03, 0x04, 0x05, 0x04, 0x05, 0x06, 0x07,
0182 0x06, 0x07, 0x04, 0x05, 0x04, 0x05, 0x06, 0x07, 0x06, 0x07,
0183 },
0184 {
0185 0x00, 0x00, 0x01, 0x01, 0x02, 0x02, 0x03, 0x03, 0x00, 0x00, 0x01,
0186 0x01, 0x02, 0x02, 0x03, 0x03, 0x04, 0x04, 0x05, 0x05, 0x06, 0x06,
0187 0x07, 0x07, 0x04, 0x04, 0x05, 0x05, 0x06, 0x06, 0x07, 0x07,
0188 },
0189 {
0190 0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07, 0x00, 0x01, 0x02,
0191 0x03, 0x04, 0x05, 0x06, 0x07, 0x08, 0x09, 0x0A, 0x0B, 0x0C, 0x0D,
0192 0x0E, 0x0F, 0x08, 0x09, 0x0A, 0x0B, 0x0C, 0x0D, 0x0E, 0x0F,
0193 },
0194 {
0195 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x01, 0x01, 0x01,
0196 0x01, 0x01, 0x01, 0x01, 0x01, 0x02, 0x02, 0x02, 0x02, 0x02, 0x02,
0197 0x02, 0x02, 0x03, 0x03, 0x03, 0x03, 0x03, 0x03, 0x03, 0x03,
0198 },
0199 {
0200 0x00, 0x01, 0x00, 0x01, 0x00, 0x01, 0x00, 0x01, 0x02, 0x03, 0x02,
0201 0x03, 0x02, 0x03, 0x02, 0x03, 0x04, 0x05, 0x04, 0x05, 0x04, 0x05,
0202 0x04, 0x05, 0x06, 0x07, 0x06, 0x07, 0x06, 0x07, 0x06, 0x07,
0203 },
0204 {
0205 0x00, 0x00, 0x01, 0x01, 0x00, 0x00, 0x01, 0x01, 0x02, 0x02, 0x03,
0206 0x03, 0x02, 0x02, 0x03, 0x03, 0x04, 0x04, 0x05, 0x05, 0x04, 0x04,
0207 0x05, 0x05, 0x06, 0x06, 0x07, 0x07, 0x06, 0x06, 0x07, 0x07,
0208 },
0209 {
0210 0x00, 0x01, 0x02, 0x03, 0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06,
0211 0x07, 0x04, 0x05, 0x06, 0x07, 0x08, 0x09, 0x0A, 0x0B, 0x08, 0x09,
0212 0x0A, 0x0B, 0x0C, 0x0D, 0x0E, 0x0F, 0x0C, 0x0D, 0x0E, 0x0F,
0213 },
0214 {
0215 0x00, 0x00, 0x00, 0x00, 0x01, 0x01, 0x01, 0x01, 0x02, 0x02, 0x02,
0216 0x02, 0x03, 0x03, 0x03, 0x03, 0x04, 0x04, 0x04, 0x04, 0x05, 0x05,
0217 0x05, 0x05, 0x06, 0x06, 0x06, 0x06, 0x07, 0x07, 0x07, 0x07,
0218 },
0219 {
0220 0x00, 0x01, 0x00, 0x01, 0x02, 0x03, 0x02, 0x03, 0x04, 0x05, 0x04,
0221 0x05, 0x06, 0x07, 0x06, 0x07, 0x08, 0x09, 0x08, 0x09, 0x0A, 0x0B,
0222 0x0A, 0x0B, 0x0C, 0x0D, 0x0C, 0x0D, 0x0E, 0x0F, 0x0E, 0x0F,
0223 },
0224 {
0225 0x00, 0x00, 0x01, 0x01, 0x02, 0x02, 0x03, 0x03, 0x04, 0x04, 0x05,
0226 0x05, 0x06, 0x06, 0x07, 0x07, 0x08, 0x08, 0x09, 0x09, 0x0A, 0x0A,
0227 0x0B, 0x0B, 0x0C, 0x0C, 0x0D, 0x0D, 0x0E, 0x0E, 0x0F, 0x0F,
0228 },
0229 {
0230 0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07, 0x08, 0x09, 0x0A,
0231 0x0B, 0x0C, 0x0D, 0x0E, 0x0F, 0x10, 0x11, 0x12, 0x13, 0x14, 0x15,
0232 0x16, 0x17, 0x18, 0x19, 0x1A, 0x1B, 0x1C, 0x1D, 0x1E, 0x1F,
0233 },
0234 };
0235
0236 inline uint64_t ExtractBitsSoftware(uint64_t bitmap, uint64_t select_bitmap) {
0237
0238
0239
0240 if (select_bitmap == ~uint64_t{0}) {
0241 return bitmap;
0242 } else if (select_bitmap == 0) {
0243 return 0;
0244 }
0245
0246
0247 uint64_t bit_value = 0;
0248 int bit_len = 0;
0249 constexpr uint8_t kLookupMask = (1U << kLookupBits) - 1;
0250 while (select_bitmap != 0) {
0251 const auto mask_len = ARROW_POPCOUNT32(select_bitmap & kLookupMask);
0252 const uint64_t value = kPextTable[select_bitmap & kLookupMask][bitmap & kLookupMask];
0253 bit_value |= (value << bit_len);
0254 bit_len += mask_len;
0255 bitmap >>= kLookupBits;
0256 select_bitmap >>= kLookupBits;
0257 }
0258 return bit_value;
0259 }
0260
0261 #ifdef ARROW_HAVE_BMI2
0262
0263
0264 # if UINTPTR_MAX == 0xFFFFFFFF
0265
0266 using extract_bitmap_t = uint32_t;
0267 inline extract_bitmap_t ExtractBits(extract_bitmap_t bitmap,
0268 extract_bitmap_t select_bitmap) {
0269 return _pext_u32(bitmap, select_bitmap);
0270 }
0271
0272 # else
0273
0274 using extract_bitmap_t = uint64_t;
0275 inline extract_bitmap_t ExtractBits(extract_bitmap_t bitmap,
0276 extract_bitmap_t select_bitmap) {
0277 return _pext_u64(bitmap, select_bitmap);
0278 }
0279
0280 # endif
0281
0282 #else
0283
0284
0285 using extract_bitmap_t = uint64_t;
0286 inline extract_bitmap_t ExtractBits(extract_bitmap_t bitmap,
0287 extract_bitmap_t select_bitmap) {
0288 return ExtractBitsSoftware(bitmap, select_bitmap);
0289 }
0290
0291 #endif
0292
0293 static constexpr int64_t kExtractBitsSize = 8 * sizeof(extract_bitmap_t);
0294
0295 template <bool has_repeated_parent>
0296 int64_t DefLevelsBatchToBitmap(const int16_t* def_levels, const int64_t batch_size,
0297 int64_t upper_bound_remaining, LevelInfo level_info,
0298 ::arrow::internal::FirstTimeBitmapWriter* writer) {
0299 ARROW_DCHECK_LE(batch_size, kExtractBitsSize);
0300
0301
0302 auto defined_bitmap = static_cast<extract_bitmap_t>(
0303 internal::GreaterThanBitmap(def_levels, batch_size, level_info.def_level - 1));
0304
0305 if (has_repeated_parent) {
0306
0307
0308 auto present_bitmap = static_cast<extract_bitmap_t>(internal::GreaterThanBitmap(
0309 def_levels, batch_size, level_info.repeated_ancestor_def_level - 1));
0310 auto selected_bits = ExtractBits(defined_bitmap, present_bitmap);
0311 int64_t selected_count = ::arrow::bit_util::PopCount(present_bitmap);
0312 if (ARROW_PREDICT_FALSE(selected_count > upper_bound_remaining)) {
0313 throw ParquetException("Values read exceeded upper bound");
0314 }
0315 writer->AppendWord(selected_bits, selected_count);
0316 return ::arrow::bit_util::PopCount(selected_bits);
0317 } else {
0318 if (ARROW_PREDICT_FALSE(batch_size > upper_bound_remaining)) {
0319 std::stringstream ss;
0320 ss << "Values read exceeded upper bound";
0321 throw ParquetException(ss.str());
0322 }
0323
0324 writer->AppendWord(defined_bitmap, batch_size);
0325 return ::arrow::bit_util::PopCount(defined_bitmap);
0326 }
0327 }
0328
0329 template <bool has_repeated_parent>
0330 void DefLevelsToBitmapSimd(const int16_t* def_levels, int64_t num_def_levels,
0331 LevelInfo level_info, ValidityBitmapInputOutput* output) {
0332 ::arrow::internal::FirstTimeBitmapWriter writer(
0333 output->valid_bits,
0334 output->valid_bits_offset,
0335 output->values_read_upper_bound);
0336 int64_t set_count = 0;
0337 output->values_read = 0;
0338 int64_t values_read_remaining = output->values_read_upper_bound;
0339 while (num_def_levels > kExtractBitsSize) {
0340 set_count += DefLevelsBatchToBitmap<has_repeated_parent>(
0341 def_levels, kExtractBitsSize, values_read_remaining, level_info, &writer);
0342 def_levels += kExtractBitsSize;
0343 num_def_levels -= kExtractBitsSize;
0344 values_read_remaining = output->values_read_upper_bound - writer.position();
0345 }
0346 set_count += DefLevelsBatchToBitmap<has_repeated_parent>(
0347 def_levels, num_def_levels, values_read_remaining, level_info, &writer);
0348
0349 output->values_read = writer.position();
0350 output->null_count += output->values_read - set_count;
0351 writer.Finish();
0352 }
0353
0354 }