![]() |
|
|||
File indexing completed on 2025-08-28 08:27:08
0001 // Licensed to the Apache Software Foundation (ASF) under one 0002 // or more contributor license agreements. See the NOTICE file 0003 // distributed with this work for additional information 0004 // regarding copyright ownership. The ASF licenses this file 0005 // to you under the Apache License, Version 2.0 (the 0006 // "License"); you may not use this file except in compliance 0007 // with the License. You may obtain a copy of the License at 0008 // 0009 // http://www.apache.org/licenses/LICENSE-2.0 0010 // 0011 // Unless required by applicable law or agreed to in writing, 0012 // software distributed under the License is distributed on an 0013 // "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY 0014 // KIND, either express or implied. See the License for the 0015 // specific language governing permissions and limitations 0016 // under the License. 0017 0018 #pragma once 0019 0020 #include <cstdint> 0021 #include <memory> 0022 #include <string_view> 0023 0024 #include "arrow/status.h" 0025 #include "arrow/util/macros.h" 0026 #include "arrow/util/visibility.h" 0027 0028 namespace arrow { 0029 0030 class Buffer; 0031 0032 class ARROW_EXPORT BoundaryFinder { 0033 public: 0034 BoundaryFinder() = default; 0035 0036 virtual ~BoundaryFinder(); 0037 0038 /// \brief Find the position of the first delimiter inside block 0039 /// 0040 /// `partial` is taken to be the beginning of the block, and `block` 0041 /// its continuation. Also, `partial` doesn't contain a delimiter. 0042 /// 0043 /// The returned `out_pos` is relative to `block`'s start and should point 0044 /// to the first character after the first delimiter. 0045 /// `out_pos` will be -1 if no delimiter is found. 0046 virtual Status FindFirst(std::string_view partial, std::string_view block, 0047 int64_t* out_pos) = 0; 0048 0049 /// \brief Find the position of the last delimiter inside block 0050 /// 0051 /// The returned `out_pos` is relative to `block`'s start and should point 0052 /// to the first character after the last delimiter. 0053 /// `out_pos` will be -1 if no delimiter is found. 0054 virtual Status FindLast(std::string_view block, int64_t* out_pos) = 0; 0055 0056 /// \brief Find the position of the Nth delimiter inside the block 0057 /// 0058 /// `partial` is taken to be the beginning of the block, and `block` 0059 /// its continuation. Also, `partial` doesn't contain a delimiter. 0060 /// 0061 /// The returned `out_pos` is relative to `block`'s start and should point 0062 /// to the first character after the first delimiter. 0063 /// `out_pos` will be -1 if no delimiter is found. 0064 /// 0065 /// The returned `num_found` is the number of delimiters actually found 0066 virtual Status FindNth(std::string_view partial, std::string_view block, int64_t count, 0067 int64_t* out_pos, int64_t* num_found) = 0; 0068 0069 static constexpr int64_t kNoDelimiterFound = -1; 0070 0071 protected: 0072 ARROW_DISALLOW_COPY_AND_ASSIGN(BoundaryFinder); 0073 }; 0074 0075 ARROW_EXPORT 0076 std::shared_ptr<BoundaryFinder> MakeNewlineBoundaryFinder(); 0077 0078 /// \brief A reusable block-based chunker for delimited data 0079 /// 0080 /// The chunker takes a block of delimited data and helps carve a sub-block 0081 /// which begins and ends on delimiters (suitable for consumption by parsers 0082 /// which can only parse whole objects). 0083 class ARROW_EXPORT Chunker { 0084 public: 0085 explicit Chunker(std::shared_ptr<BoundaryFinder> delimiter); 0086 ~Chunker(); 0087 0088 /// \brief Carve up a chunk in a block of data to contain only whole objects 0089 /// 0090 /// Pre-conditions: 0091 /// - `block` is the start of a valid block of delimited data 0092 /// (i.e. starts just after a delimiter) 0093 /// 0094 /// Post-conditions: 0095 /// - block == whole + partial 0096 /// - `whole` is a valid block of delimited data 0097 /// (i.e. starts just after a delimiter and ends with a delimiter) 0098 /// - `partial` doesn't contain an entire delimited object 0099 /// (IOW: `partial` is generally small) 0100 /// 0101 /// This method will look for the last delimiter in `block` and may 0102 /// therefore be costly. 0103 /// 0104 /// \param[in] block data to be chunked 0105 /// \param[out] whole subrange of block containing whole delimited objects 0106 /// \param[out] partial subrange of block starting with a partial delimited object 0107 Status Process(std::shared_ptr<Buffer> block, std::shared_ptr<Buffer>* whole, 0108 std::shared_ptr<Buffer>* partial); 0109 0110 /// \brief Carve the completion of a partial object out of a block 0111 /// 0112 /// Pre-conditions: 0113 /// - `partial` is the start of a valid block of delimited data 0114 /// (i.e. starts just after a delimiter) 0115 /// - `block` follows `partial` in file order 0116 /// 0117 /// Post-conditions: 0118 /// - block == completion + rest 0119 /// - `partial + completion` is a valid block of delimited data 0120 /// (i.e. starts just after a delimiter and ends with a delimiter) 0121 /// - `completion` doesn't contain an entire delimited object 0122 /// (IOW: `completion` is generally small) 0123 /// 0124 /// This method will look for the first delimiter in `block` and should 0125 /// therefore be reasonably cheap. 0126 /// 0127 /// \param[in] partial incomplete delimited data 0128 /// \param[in] block delimited data following partial 0129 /// \param[out] completion subrange of block containing the completion of partial 0130 /// \param[out] rest subrange of block containing what completion does not cover 0131 Status ProcessWithPartial(std::shared_ptr<Buffer> partial, 0132 std::shared_ptr<Buffer> block, 0133 std::shared_ptr<Buffer>* completion, 0134 std::shared_ptr<Buffer>* rest); 0135 0136 /// \brief Like ProcessWithPartial, but for the last block of a file 0137 /// 0138 /// This method allows for a final delimited object without a trailing delimiter 0139 /// (ProcessWithPartial would return an error in that case). 0140 /// 0141 /// Pre-conditions: 0142 /// - `partial` is the start of a valid block of delimited data 0143 /// - `block` follows `partial` in file order and is the last data block 0144 /// 0145 /// Post-conditions: 0146 /// - block == completion + rest 0147 /// - `partial + completion` is a valid block of delimited data 0148 /// - `completion` doesn't contain an entire delimited object 0149 /// (IOW: `completion` is generally small) 0150 /// 0151 Status ProcessFinal(std::shared_ptr<Buffer> partial, std::shared_ptr<Buffer> block, 0152 std::shared_ptr<Buffer>* completion, std::shared_ptr<Buffer>* rest); 0153 0154 /// \brief Skip count number of rows 0155 /// Pre-conditions: 0156 /// - `partial` is the start of a valid block of delimited data 0157 /// (i.e. starts just after a delimiter) 0158 /// - `block` follows `partial` in file order 0159 /// 0160 /// Post-conditions: 0161 /// - `count` is updated to indicate the number of rows that still need to be skipped 0162 /// - If `count` is > 0 then `rest` is an incomplete block that should be a future 0163 /// `partial` 0164 /// - Else `rest` could be one or more valid blocks of delimited data which need to be 0165 /// parsed 0166 /// 0167 /// \param[in] partial incomplete delimited data 0168 /// \param[in] block delimited data following partial 0169 /// \param[in] final whether this is the final chunk 0170 /// \param[in,out] count number of rows that need to be skipped 0171 /// \param[out] rest subrange of block containing what was not skipped 0172 Status ProcessSkip(std::shared_ptr<Buffer> partial, std::shared_ptr<Buffer> block, 0173 bool final, int64_t* count, std::shared_ptr<Buffer>* rest); 0174 0175 protected: 0176 ARROW_DISALLOW_COPY_AND_ASSIGN(Chunker); 0177 0178 std::shared_ptr<BoundaryFinder> boundary_finder_; 0179 }; 0180 0181 } // namespace arrow
[ Source navigation ] | [ Diff markup ] | [ Identifier search ] | [ general search ] |
This page was automatically generated by the 2.3.7 LXR engine. The LXR team |
![]() ![]() |