Back to home page

EIC code displayed by LXR

 
 

    


File indexing completed on 2025-08-28 08:27:08

0001 // Licensed to the Apache Software Foundation (ASF) under one
0002 // or more contributor license agreements.  See the NOTICE file
0003 // distributed with this work for additional information
0004 // regarding copyright ownership.  The ASF licenses this file
0005 // to you under the Apache License, Version 2.0 (the
0006 // "License"); you may not use this file except in compliance
0007 // with the License.  You may obtain a copy of the License at
0008 //
0009 //   http://www.apache.org/licenses/LICENSE-2.0
0010 //
0011 // Unless required by applicable law or agreed to in writing,
0012 // software distributed under the License is distributed on an
0013 // "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
0014 // KIND, either express or implied.  See the License for the
0015 // specific language governing permissions and limitations
0016 // under the License.
0017 
0018 #pragma once
0019 
0020 #include <cstdint>
0021 #include <memory>
0022 #include <string_view>
0023 
0024 #include "arrow/status.h"
0025 #include "arrow/util/macros.h"
0026 #include "arrow/util/visibility.h"
0027 
0028 namespace arrow {
0029 
0030 class Buffer;
0031 /// \brief Abstract interface for finding delimiter positions within a block of data
0032 class ARROW_EXPORT BoundaryFinder {
0033  public:
0034   BoundaryFinder() = default;
0035 
0036   virtual ~BoundaryFinder();
0037 
0038   /// \brief Find the position of the first delimiter inside block
0039   ///
0040   /// `partial` is taken to be the beginning of the block, and `block`
0041   /// its continuation.  Also, `partial` doesn't contain a delimiter.
0042   ///
0043   /// The returned `out_pos` is relative to `block`'s start and should point
0044   /// to the first character after the first delimiter.
0045   /// `out_pos` will be kNoDelimiterFound (-1) if no delimiter is found.
0046   virtual Status FindFirst(std::string_view partial, std::string_view block,
0047                            int64_t* out_pos) = 0;
0048 
0049   /// \brief Find the position of the last delimiter inside block
0050   ///
0051   /// The returned `out_pos` is relative to `block`'s start and should point
0052   /// to the first character after the last delimiter.
0053   /// `out_pos` will be kNoDelimiterFound (-1) if no delimiter is found.
0054   virtual Status FindLast(std::string_view block, int64_t* out_pos) = 0;
0055 
0056   /// \brief Find the position of the Nth delimiter inside the block
0057   ///
0058   /// `partial` is taken to be the beginning of the block, and `block`
0059   /// its continuation.  Also, `partial` doesn't contain a delimiter.
0060   ///
0061   /// The returned `out_pos` is relative to `block`'s start and should point
0062   /// to the first character after the Nth delimiter (N being `count`).
0063   /// `out_pos` will be kNoDelimiterFound (-1) if no delimiter is found.
0064   ///
0065   /// The returned `num_found` is the number of delimiters actually found.
0066   virtual Status FindNth(std::string_view partial, std::string_view block, int64_t count,
0067                          int64_t* out_pos, int64_t* num_found) = 0;
0068 
0069   static constexpr int64_t kNoDelimiterFound = -1;
0070 
0071  protected:
0072   ARROW_DISALLOW_COPY_AND_ASSIGN(BoundaryFinder);
0073 };
0074 /// \brief Create a BoundaryFinder which (judging by the name) delimits on newlines
0075 ARROW_EXPORT
0076 std::shared_ptr<BoundaryFinder> MakeNewlineBoundaryFinder();
0077 
0078 /// \brief A reusable block-based chunker for delimited data
0079 ///
0080 /// The chunker takes a block of delimited data and helps carve a sub-block
0081 /// which begins and ends on delimiters (suitable for consumption by parsers
0082 /// which can only parse whole objects).
0083 class ARROW_EXPORT Chunker {
0084  public:
0085   explicit Chunker(std::shared_ptr<BoundaryFinder> delimiter);
0086   ~Chunker();
0087 
0088   /// \brief Carve up a chunk in a block of data to contain only whole objects
0089   ///
0090   /// Pre-conditions:
0091   /// - `block` is the start of a valid block of delimited data
0092   ///   (i.e. starts just after a delimiter)
0093   ///
0094   /// Post-conditions:
0095   /// - block == whole + partial
0096   /// - `whole` is a valid block of delimited data
0097   ///   (i.e. starts just after a delimiter and ends with a delimiter)
0098   /// - `partial` doesn't contain an entire delimited object
0099   ///   (IOW: `partial` is generally small)
0100   ///
0101   /// This method will look for the last delimiter in `block` and may
0102   /// therefore be costly.
0103   ///
0104   /// \param[in] block data to be chunked
0105   /// \param[out] whole subrange of block containing whole delimited objects
0106   /// \param[out] partial subrange of block starting with a partial delimited object
0107   Status Process(std::shared_ptr<Buffer> block, std::shared_ptr<Buffer>* whole,
0108                  std::shared_ptr<Buffer>* partial);
0109 
0110   /// \brief Carve the completion of a partial object out of a block
0111   ///
0112   /// Pre-conditions:
0113   /// - `partial` is the start of a valid block of delimited data
0114   ///   (i.e. starts just after a delimiter)
0115   /// - `block` follows `partial` in file order
0116   ///
0117   /// Post-conditions:
0118   /// - block == completion + rest
0119   /// - `partial + completion` is a valid block of delimited data
0120   ///   (i.e. starts just after a delimiter and ends with a delimiter)
0121   /// - `completion` doesn't contain an entire delimited object
0122   ///   (IOW: `completion` is generally small)
0123   ///
0124   /// This method will look for the first delimiter in `block` and should
0125   /// therefore be reasonably cheap.
0126   ///
0127   /// \param[in] partial incomplete delimited data
0128   /// \param[in] block delimited data following partial
0129   /// \param[out] completion subrange of block containing the completion of partial
0130   /// \param[out] rest subrange of block containing what completion does not cover
0131   Status ProcessWithPartial(std::shared_ptr<Buffer> partial,
0132                             std::shared_ptr<Buffer> block,
0133                             std::shared_ptr<Buffer>* completion,
0134                             std::shared_ptr<Buffer>* rest);
0135 
0136   /// \brief Like ProcessWithPartial, but for the last block of a file
0137   ///
0138   /// This method allows for a final delimited object without a trailing delimiter
0139   /// (ProcessWithPartial would return an error in that case).
0140   ///
0141   /// Pre-conditions:
0142   /// - `partial` is the start of a valid block of delimited data
0143   /// - `block` follows `partial` in file order and is the last data block
0144   ///
0145   /// Post-conditions:
0146   /// - block == completion + rest
0147   /// - `partial + completion` is a valid block of delimited data
0148   /// - `completion` doesn't contain an entire delimited object (IOW: generally small)
0149   ///
0150   /// The parameters have the same names and types as in ProcessWithPartial.
0151   Status ProcessFinal(std::shared_ptr<Buffer> partial, std::shared_ptr<Buffer> block,
0152                       std::shared_ptr<Buffer>* completion, std::shared_ptr<Buffer>* rest);
0153 
0154   /// \brief Skip `count` number of rows
0155   ///
0156   /// Pre-conditions:
0157   /// - `partial` is the start of a valid block of delimited data
0158   ///   (i.e. starts just after a delimiter)
0159   /// - `block` follows `partial` in file order
0160   ///
0161   /// Post-conditions:
0162   /// - `count` is updated to indicate the number of rows that still need to be skipped
0163   /// - If `count` is > 0 then `rest` is an incomplete block (a future `partial`)
0164   /// - Else `rest` could be one or more valid blocks of delimited data which need
0165   ///   to be parsed
0166   ///
0167   /// \param[in] partial incomplete delimited data
0168   /// \param[in] block delimited data following partial
0169   /// \param[in] final whether this is the final chunk
0170   /// \param[in,out] count number of rows that need to be skipped
0171   /// \param[out] rest subrange of block containing what was not skipped
0172   Status ProcessSkip(std::shared_ptr<Buffer> partial, std::shared_ptr<Buffer> block,
0173                      bool final, int64_t* count, std::shared_ptr<Buffer>* rest);
0174 
0175  protected:
0176   ARROW_DISALLOW_COPY_AND_ASSIGN(Chunker);
0177 
0178   std::shared_ptr<BoundaryFinder> boundary_finder_;
0179 };
0180 
0181 }  // namespace arrow