Back to home page

EIC code displayed by LXR

 
 

    


File indexing completed on 2026-04-17 08:28:54

0001 // Licensed to the Apache Software Foundation (ASF) under one
0002 // or more contributor license agreements.  See the NOTICE file
0003 // distributed with this work for additional information
0004 // regarding copyright ownership.  The ASF licenses this file
0005 // to you under the Apache License, Version 2.0 (the
0006 // "License"); you may not use this file except in compliance
0007 // with the License.  You may obtain a copy of the License at
0008 //
0009 //   http://www.apache.org/licenses/LICENSE-2.0
0010 //
0011 // Unless required by applicable law or agreed to in writing,
0012 // software distributed under the License is distributed on an
0013 // "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
0014 // KIND, either express or implied.  See the License for the
0015 // specific language governing permissions and limitations
0016 // under the License.
0017 
0018 #pragma once
0019 
0020 #include <cstdint>
0021 
0022 #include "arrow/util/endian.h"
0023 #include "parquet/platform.h"
0024 #include "parquet/schema.h"
0025 
0026 namespace parquet::internal {
0027 
0028 struct PARQUET_EXPORT LevelInfo {
0029   LevelInfo()
0030       : null_slot_usage(1), def_level(0), rep_level(0), repeated_ancestor_def_level(0) {}
0031   LevelInfo(int32_t null_slots, int32_t definition_level, int32_t repetition_level,
0032             int32_t repeated_ancestor_definition_level)
0033       : null_slot_usage(null_slots),
0034         def_level(static_cast<int16_t>(definition_level)),
0035         rep_level(static_cast<int16_t>(repetition_level)),
0036         repeated_ancestor_def_level(
0037             static_cast<int16_t>(repeated_ancestor_definition_level)) {}
0038 
0039   bool operator==(const LevelInfo& b) const {
0040     return null_slot_usage == b.null_slot_usage && def_level == b.def_level &&
0041            rep_level == b.rep_level &&
0042            repeated_ancestor_def_level == b.repeated_ancestor_def_level;
0043   }
0044 
0045   bool HasNullableValues() const { return repeated_ancestor_def_level < def_level; }
0046 
0047   // How many slots an undefined but present (i.e. null) element in
0048   // parquet consumes when decoding to Arrow.
0049   // "Slot" is used in the same context as the Arrow specification
0050   // (i.e. a value holder).
0051   // This is only ever >1 for descendents of FixedSizeList.
0052   int32_t null_slot_usage = 1;
0053 
0054   // The definition level at which the value for the field
0055   // is considered not null (definition levels greater than
0056   // or equal to this value indicate a not-null
0057   // value for the field). For list fields definition levels
0058   // greater than or equal to this field indicate a present,
0059   // possibly null, child value.
0060   int16_t def_level = 0;
0061 
0062   // The repetition level corresponding to this element
0063   // or the closest repeated ancestor.  Any repetition
0064   // level less than this indicates either a new list OR
0065   // an empty list (which is determined in conjunction
0066   // with definition levels).
0067   int16_t rep_level = 0;
0068 
0069   // The definition level indicating the level at which the closest
0070   // repeated ancestor is not empty.  This is used to discriminate
0071   // between a value less than |def_level| being null or excluded entirely.
0072   // For instance if we have an arrow schema like:
0073   // list(struct(f0: int)).  Then then there are the following
0074   // definition levels:
0075   //   0 = null list
0076   //   1 = present but empty list.
0077   //   2 = a null value in the list
0078   //   3 = a non null struct but null integer.
0079   //   4 = a present integer.
0080   // When reconstructing, the struct and integer arrays'
0081   // repeated_ancestor_def_level would be 2.  Any
0082   // def_level < 2 indicates that there isn't a corresponding
0083   // child value in the list.
0084   // i.e. [null, [], [null], [{f0: null}], [{f0: 1}]]
0085   // has the def levels [0, 1, 2, 3, 4].  The actual
0086   // struct array is only of length 3: [not-set, set, set] and
0087   // the int array is also of length 3: [N/A, null, 1].
0088   //
0089   int16_t repeated_ancestor_def_level = 0;
0090 
0091   /// Increments levels according to the cardinality of node.
0092   void Increment(const schema::Node& node) {
0093     if (node.is_repeated()) {
0094       IncrementRepeated();
0095       return;
0096     }
0097     if (node.is_optional()) {
0098       IncrementOptional();
0099       return;
0100     }
0101   }
0102 
0103   /// Increments level for a optional node.
0104   void IncrementOptional() { def_level++; }
0105 
0106   /// Increments levels for the repeated node.  Returns
0107   /// the previous ancestor_list_def_level.
0108   int16_t IncrementRepeated() {
0109     int16_t last_repeated_ancestor = repeated_ancestor_def_level;
0110 
0111     // Repeated fields add both a repetition and definition level. This is used
0112     // to distinguish between an empty list and a list with an item in it.
0113     ++rep_level;
0114     ++def_level;
0115     // For levels >= repeated_ancestor_def_level it indicates the list was
0116     // non-null and had at least one element.  This is important
0117     // for later decoding because we need to add a slot for these
0118     // values.  for levels < current_def_level no slots are added
0119     // to arrays.
0120     repeated_ancestor_def_level = def_level;
0121     return last_repeated_ancestor;
0122   }
0123 
0124   // Calculates and returns LevelInfo for a column descriptor.
0125   static LevelInfo ComputeLevelInfo(const ColumnDescriptor* descr) {
0126     LevelInfo level_info;
0127     level_info.def_level = descr->max_definition_level();
0128     level_info.rep_level = descr->max_repetition_level();
0129 
0130     int16_t min_spaced_def_level = descr->max_definition_level();
0131     const ::parquet::schema::Node* node = descr->schema_node().get();
0132     while (node && !node->is_repeated()) {
0133       if (node->is_optional()) {
0134         min_spaced_def_level--;
0135       }
0136       node = node->parent();
0137     }
0138     level_info.repeated_ancestor_def_level = min_spaced_def_level;
0139     return level_info;
0140   }
0141 
0142   friend std::ostream& operator<<(std::ostream& os, const LevelInfo& levels) {
0143     // This print method is to silence valgrind issues.  What's printed
0144     // is not important because all asserts happen directly on
0145     // members.
0146     os << "{def=" << levels.def_level << ", rep=" << levels.rep_level
0147        << ", repeated_ancestor_def=" << levels.repeated_ancestor_def_level;
0148     if (levels.null_slot_usage > 1) {
0149       os << ", null_slot_usage=" << levels.null_slot_usage;
0150     }
0151     os << "}";
0152     return os;
0153   }
0154 };
0155 
0156 // Input/Output structure for reconstructed validity bitmaps.
0157 struct PARQUET_EXPORT ValidityBitmapInputOutput {
0158   // Input only.
0159   // The maximum number of values_read expected (actual
0160   // values read must be less than or equal to this value).
0161   // If this number is exceeded methods will throw a
0162   // ParquetException. Exceeding this limit indicates
0163   // either a corrupt or incorrectly written file.
0164   int64_t values_read_upper_bound = 0;
0165   // Output only. The number of values added to the encountered
0166   // (this is logically the count of the number of elements
0167   // for an Arrow array).
0168   int64_t values_read = 0;
0169   // Input/Output. The number of nulls encountered.
0170   int64_t null_count = 0;
0171   // Output only. The validity bitmap to populate. Maybe be null only
0172   // for DefRepLevelsToListInfo (if all that is needed is list offsets).
0173   uint8_t* valid_bits = NULLPTR;
0174   // Input only, offset into valid_bits to start at.
0175   int64_t valid_bits_offset = 0;
0176 };
0177 
0178 // Converts def_levels to a validity bitmap for non-list arrays and for structs that
0179 // have at least one member that is not a list and has no list descendants.
0180 // For lists use DefRepLevelsToList; for structs where every member is a list or has
0181 // a list descendant use DefRepLevelsToBitmap.  Results are reported through |output|.
0182 void PARQUET_EXPORT DefLevelsToBitmap(const int16_t* def_levels, int64_t num_def_levels,
0183                                       LevelInfo level_info,
0184                                       ValidityBitmapInputOutput* output);
0185 
0186 // Reconstructs a validity bitmap and list offsets for a list array based on
0187 // def/rep levels. The first element of offsets will not be modified if rep_levels
0188 // starts with a new list.  The first element of offsets will be used when calculating
0189 // the next offset.  See the documentation on DefLevelsToBitmap for when to use this
0190 // method vs the other ones in this file for reconstruction.
0191 // The two overloads differ only in the element type of offsets (int32_t or int64_t).
0192 // Offsets must be sized to 1 + values_read_upper_bound.
0193 void PARQUET_EXPORT DefRepLevelsToList(const int16_t* def_levels,
0194                                        const int16_t* rep_levels, int64_t num_def_levels,
0195                                        LevelInfo level_info,
0196                                        ValidityBitmapInputOutput* output,
0197                                        int32_t* offsets);
0198 void PARQUET_EXPORT DefRepLevelsToList(const int16_t* def_levels,
0199                                        const int16_t* rep_levels, int64_t num_def_levels,
0200                                        LevelInfo level_info,
0201                                        ValidityBitmapInputOutput* output,
0202                                        int64_t* offsets);
0203 
0204 // Reconstructs a validity bitmap for a struct where every member is a list or has
0205 // a list descendant.  See the documentation on DefLevelsToBitmap for more details
0206 // on when to use this method rather than the other ones declared above.
0207 void PARQUET_EXPORT DefRepLevelsToBitmap(const int16_t* def_levels,
0208                                          const int16_t* rep_levels,
0209                                          int64_t num_def_levels, LevelInfo level_info,
0210                                          ValidityBitmapInputOutput* output);
0211 
0212 // Test-only entry point.  It is exposed so that the software-simulated pext function
0213 // can be tested directly (i.e. so that it isn't hidden by runtime dispatch).
0214 uint64_t PARQUET_EXPORT TestOnlyExtractBitsSoftware(uint64_t bitmap, uint64_t selection);
0215 
0216 }  // namespace parquet::internal