|
|
|||
File indexing completed on 2026-04-17 08:28:54
0001 // Licensed to the Apache Software Foundation (ASF) under one 0002 // or more contributor license agreements. See the NOTICE file 0003 // distributed with this work for additional information 0004 // regarding copyright ownership. The ASF licenses this file 0005 // to you under the Apache License, Version 2.0 (the 0006 // "License"); you may not use this file except in compliance 0007 // with the License. You may obtain a copy of the License at 0008 // 0009 // http://www.apache.org/licenses/LICENSE-2.0 0010 // 0011 // Unless required by applicable law or agreed to in writing, 0012 // software distributed under the License is distributed on an 0013 // "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY 0014 // KIND, either express or implied. See the License for the 0015 // specific language governing permissions and limitations 0016 // under the License. 0017 0018 #pragma once 0019 0020 #include <cstdint> 0021 0022 #include "arrow/util/endian.h" 0023 #include "parquet/platform.h" 0024 #include "parquet/schema.h" 0025 0026 namespace parquet::internal { 0027 0028 struct PARQUET_EXPORT LevelInfo { 0029 LevelInfo() 0030 : null_slot_usage(1), def_level(0), rep_level(0), repeated_ancestor_def_level(0) {} 0031 LevelInfo(int32_t null_slots, int32_t definition_level, int32_t repetition_level, 0032 int32_t repeated_ancestor_definition_level) 0033 : null_slot_usage(null_slots), 0034 def_level(static_cast<int16_t>(definition_level)), 0035 rep_level(static_cast<int16_t>(repetition_level)), 0036 repeated_ancestor_def_level( 0037 static_cast<int16_t>(repeated_ancestor_definition_level)) {} 0038 0039 bool operator==(const LevelInfo& b) const { 0040 return null_slot_usage == b.null_slot_usage && def_level == b.def_level && 0041 rep_level == b.rep_level && 0042 repeated_ancestor_def_level == b.repeated_ancestor_def_level; 0043 } 0044 0045 bool HasNullableValues() const { return repeated_ancestor_def_level < def_level; } 0046 0047 // How many slots an undefined but present (i.e. null) element in 0048 // parquet consumes when decoding to Arrow. 0049 // "Slot" is used in the same context as the Arrow specification 0050 // (i.e. a value holder). 0051 // This is only ever >1 for descendents of FixedSizeList. 0052 int32_t null_slot_usage = 1; 0053 0054 // The definition level at which the value for the field 0055 // is considered not null (definition levels greater than 0056 // or equal to this value indicate a not-null 0057 // value for the field). For list fields definition levels 0058 // greater than or equal to this field indicate a present, 0059 // possibly null, child value. 0060 int16_t def_level = 0; 0061 0062 // The repetition level corresponding to this element 0063 // or the closest repeated ancestor. Any repetition 0064 // level less than this indicates either a new list OR 0065 // an empty list (which is determined in conjunction 0066 // with definition levels). 0067 int16_t rep_level = 0; 0068 0069 // The definition level indicating the level at which the closest 0070 // repeated ancestor is not empty. This is used to discriminate 0071 // between a value less than |def_level| being null or excluded entirely. 0072 // For instance if we have an arrow schema like: 0073 // list(struct(f0: int)). Then then there are the following 0074 // definition levels: 0075 // 0 = null list 0076 // 1 = present but empty list. 0077 // 2 = a null value in the list 0078 // 3 = a non null struct but null integer. 0079 // 4 = a present integer. 0080 // When reconstructing, the struct and integer arrays' 0081 // repeated_ancestor_def_level would be 2. Any 0082 // def_level < 2 indicates that there isn't a corresponding 0083 // child value in the list. 0084 // i.e. [null, [], [null], [{f0: null}], [{f0: 1}]] 0085 // has the def levels [0, 1, 2, 3, 4]. The actual 0086 // struct array is only of length 3: [not-set, set, set] and 0087 // the int array is also of length 3: [N/A, null, 1]. 0088 // 0089 int16_t repeated_ancestor_def_level = 0; 0090 0091 /// Increments levels according to the cardinality of node. 0092 void Increment(const schema::Node& node) { 0093 if (node.is_repeated()) { 0094 IncrementRepeated(); 0095 return; 0096 } 0097 if (node.is_optional()) { 0098 IncrementOptional(); 0099 return; 0100 } 0101 } 0102 0103 /// Increments level for a optional node. 0104 void IncrementOptional() { def_level++; } 0105 0106 /// Increments levels for the repeated node. Returns 0107 /// the previous ancestor_list_def_level. 0108 int16_t IncrementRepeated() { 0109 int16_t last_repeated_ancestor = repeated_ancestor_def_level; 0110 0111 // Repeated fields add both a repetition and definition level. This is used 0112 // to distinguish between an empty list and a list with an item in it. 0113 ++rep_level; 0114 ++def_level; 0115 // For levels >= repeated_ancestor_def_level it indicates the list was 0116 // non-null and had at least one element. This is important 0117 // for later decoding because we need to add a slot for these 0118 // values. for levels < current_def_level no slots are added 0119 // to arrays. 0120 repeated_ancestor_def_level = def_level; 0121 return last_repeated_ancestor; 0122 } 0123 0124 // Calculates and returns LevelInfo for a column descriptor. 0125 static LevelInfo ComputeLevelInfo(const ColumnDescriptor* descr) { 0126 LevelInfo level_info; 0127 level_info.def_level = descr->max_definition_level(); 0128 level_info.rep_level = descr->max_repetition_level(); 0129 0130 int16_t min_spaced_def_level = descr->max_definition_level(); 0131 const ::parquet::schema::Node* node = descr->schema_node().get(); 0132 while (node && !node->is_repeated()) { 0133 if (node->is_optional()) { 0134 min_spaced_def_level--; 0135 } 0136 node = node->parent(); 0137 } 0138 level_info.repeated_ancestor_def_level = min_spaced_def_level; 0139 return level_info; 0140 } 0141 0142 friend std::ostream& operator<<(std::ostream& os, const LevelInfo& levels) { 0143 // This print method is to silence valgrind issues. What's printed 0144 // is not important because all asserts happen directly on 0145 // members. 0146 os << "{def=" << levels.def_level << ", rep=" << levels.rep_level 0147 << ", repeated_ancestor_def=" << levels.repeated_ancestor_def_level; 0148 if (levels.null_slot_usage > 1) { 0149 os << ", null_slot_usage=" << levels.null_slot_usage; 0150 } 0151 os << "}"; 0152 return os; 0153 } 0154 }; 0155 0156 // Input/Output structure for reconstructed validity bitmaps. 0157 struct PARQUET_EXPORT ValidityBitmapInputOutput { 0158 // Input only. 0159 // The maximum number of values_read expected (actual 0160 // values read must be less than or equal to this value). 0161 // If this number is exceeded methods will throw a 0162 // ParquetException. Exceeding this limit indicates 0163 // either a corrupt or incorrectly written file. 0164 int64_t values_read_upper_bound = 0; 0165 // Output only. The number of values added to the encountered 0166 // (this is logically the count of the number of elements 0167 // for an Arrow array). 0168 int64_t values_read = 0; 0169 // Input/Output. The number of nulls encountered. 0170 int64_t null_count = 0; 0171 // Output only. The validity bitmap to populate. Maybe be null only 0172 // for DefRepLevelsToListInfo (if all that is needed is list offsets). 0173 uint8_t* valid_bits = NULLPTR; 0174 // Input only, offset into valid_bits to start at. 0175 int64_t valid_bits_offset = 0; 0176 }; 0177 0178 // Converts def_levels to validity bitmaps for non-list arrays and structs that have 0179 // at least one member that is not a list and has no list descendents. 0180 // For lists use DefRepLevelsToList and structs where all descendants contain 0181 // a list use DefRepLevelsToBitmap. 0182 void PARQUET_EXPORT DefLevelsToBitmap(const int16_t* def_levels, int64_t num_def_levels, 0183 LevelInfo level_info, 0184 ValidityBitmapInputOutput* output); 0185 0186 // Reconstructs a validity bitmap and list offsets for a list arrays based on 0187 // def/rep levels. The first element of offsets will not be modified if rep_levels 0188 // starts with a new list. The first element of offsets will be used when calculating 0189 // the next offset. See documentation onf DefLevelsToBitmap for when to use this 0190 // method vs the other ones in this file for reconstruction. 0191 // 0192 // Offsets must be sized to 1 + values_read_upper_bound. 0193 void PARQUET_EXPORT DefRepLevelsToList(const int16_t* def_levels, 0194 const int16_t* rep_levels, int64_t num_def_levels, 0195 LevelInfo level_info, 0196 ValidityBitmapInputOutput* output, 0197 int32_t* offsets); 0198 void PARQUET_EXPORT DefRepLevelsToList(const int16_t* def_levels, 0199 const int16_t* rep_levels, int64_t num_def_levels, 0200 LevelInfo level_info, 0201 ValidityBitmapInputOutput* output, 0202 int64_t* offsets); 0203 0204 // Reconstructs a validity bitmap for a struct every member is a list or has 0205 // a list descendant. See documentation on DefLevelsToBitmap for when more 0206 // details on this method compared to the other ones defined above. 0207 void PARQUET_EXPORT DefRepLevelsToBitmap(const int16_t* def_levels, 0208 const int16_t* rep_levels, 0209 int64_t num_def_levels, LevelInfo level_info, 0210 ValidityBitmapInputOutput* output); 0211 0212 // This is exposed to ensure we can properly test a software simulated pext function 0213 // (i.e. it isn't hidden by runtime dispatch). 0214 uint64_t PARQUET_EXPORT TestOnlyExtractBitsSoftware(uint64_t bitmap, uint64_t selection); 0215 0216 } // namespace parquet::internal
| [ Source navigation ] | [ Diff markup ] | [ Identifier search ] | [ general search ] |
|
This page was automatically generated by the 2.3.7 LXR engine. The LXR team |
|