root/ROOT/RRawFile.hxx

0001 // @(#)root/io:$Id$
0002 // Author: Jakob Blomer
0003
0004 /*************************************************************************
0005  * Copyright (C) 1995-2018, Rene Brun and Fons Rademakers.               *
0006  * All rights reserved.                                                  *
0007  *                                                                       *
0008  * For the licensing terms see $ROOTSYS/LICENSE.                         *
0009  * For the list of contributors see $ROOTSYS/README/CREDITS.             *
0010  *************************************************************************/
0011
0012 #ifndef ROOT_RRawFile
0013 #define ROOT_RRawFile
0014
0015 #include <string_view>
0016
0017 #include <cstddef>
0018 #include <cstdint>
0019 #include <memory>
0020 #include <string>
0021
0022 namespace ROOT {
0023 namespace Internal {
0024
0025 /**
0026  * \class RRawFile RRawFile.hxx
0027  * \ingroup IO
0028  *
0029  * The RRawFile provides read-only access to local and remote files. Data can be read either byte-wise or line-wise.
0030  * The RRawFile base class provides line-wise access and buffering for byte-wise access. Derived classes provide the
0031  * low-level read operations, e.g. from a local file system or from a web server. The RRawFile is used for non-ROOT
0032  * RDataSource implementations and for RNTuple.
0033  *
0034  * Files are addressed by URL consisting of a transport protocol part and a location, like file:///path/to/data
0035  * If the transport protocol part and the :// separator are missing, the default protocol is local file. Files are
0036  * opened when required (on reading, getting file size) and closed on object destruction.
0037  *
0038  * RRawFiles manage system resources and are therefore made non-copyable. They can be explicitly cloned though.
0039  *
0040  * RRawFile objects are conditionally thread safe. See the user manual for further details:
0041  * https://root.cern/manual/thread_safety/
0042  */
0043 class RRawFile {
0044 public:
0045    /// kAuto detects the line break from the first line, kSystem picks the system's default
0046    enum class ELineBreaks { kAuto, kSystem, kUnix, kWindows };
0047
0048    /// On construction, an ROptions parameter can customize the RRawFile behavior
0049    struct ROptions {
0050       static constexpr size_t kUseDefaultBlockSize = std::size_t(-1); ///< Use protocol-dependent default block size
0051
0052       ELineBreaks fLineBreak = ELineBreaks::kAuto;
0053       /// Read at least fBlockSize bytes at a time. A value of zero turns off I/O buffering.
0054       size_t fBlockSize = kUseDefaultBlockSize;
0055       // Define an empty constructor to work around a bug in Clang: https://github.com/llvm/llvm-project/issues/36032
0056       ROptions() {}
0057    };
0058
0059    /// Used for vector reads from multiple offsets into multiple buffers. This is unlike readv(), which scatters a
0060    /// single byte range from disk into multiple buffers.
0061    struct RIOVec {
0062       /// The destination for reading
0063       void *fBuffer = nullptr;
0064       /// The file offset
0065       std::uint64_t fOffset = 0;
0066       /// The number of desired bytes
0067       std::size_t fSize = 0;
0068       /// The number of actually read bytes, set by ReadV()
0069       std::size_t fOutBytes = 0;
0070    };
0071
0072    /// Implementations may enforce limits on the use of vector reads. These limits can depend on the server or
0073    /// the specific file opened and can be queried per RRawFile object through GetReadVLimits().
0074    /// Note that due to such limits, a vector read with a single request can behave differently from a Read() call.
0075    struct RIOVecLimits {
0076       /// Maximum number of elements in a ReadV request vector
0077       std::size_t fMaxReqs = static_cast<std::size_t>(-1);
0078       /// Maximum size in bytes of any single request in the request vector
0079       std::size_t fMaxSingleSize = static_cast<std::size_t>(-1);
0080       /// Maximum size in bytes of the sum of requests in the vector
0081       std::uint64_t fMaxTotalSize = static_cast<std::uint64_t>(-1);
0082
0083       bool HasReqsLimit() const { return fMaxReqs != static_cast<std::size_t>(-1); }
0084       bool HasSizeLimit() const
0085       {
0086          return fMaxSingleSize != static_cast<std::size_t>(-1) || fMaxTotalSize != static_cast<std::uint64_t>(-1);
0087       }
0088    };
0089
0090 private:
0091    /// Don't change without adapting ReadAt()
0092    static constexpr unsigned int kNumBlockBuffers = 2;
0093    struct RBlockBuffer {
0094       /// Where in the open file does fBuffer start
0095       std::uint64_t fBufferOffset = 0;
0096       /// The number of currently buffered bytes in fBuffer
0097       size_t fBufferSize = 0;
0098       /// Points into the I/O buffer with data from the file, not owned.
0099       unsigned char *fBuffer = nullptr;
0100
0101       RBlockBuffer() = default;
0102       RBlockBuffer(const RBlockBuffer &) = delete;
0103       RBlockBuffer &operator=(const RBlockBuffer &) = delete;
0104       ~RBlockBuffer() = default;
0105
0106       /// Tries to copy up to nbytes starting at offset from fBuffer into buffer.  Returns number of bytes copied.
0107       size_t CopyTo(void *buffer, size_t nbytes, std::uint64_t offset);
0108    };
0109    /// To be used modulo kNumBlockBuffers, points to the last used block buffer in fBlockBuffers
0110    unsigned int fBlockBufferIdx = 0;
0111    /// An active buffer and a shadow buffer, which supports "jumping back" to a previously used location in the file
0112    RBlockBuffer fBlockBuffers[kNumBlockBuffers];
0113    /// Memory block containing the block buffers consecutively
0114    std::unique_ptr<unsigned char[]> fBufferSpace;
0115    /// Used as a marker that the file size was not yet queried
0116    static constexpr std::uint64_t kUnknownFileSize = std::uint64_t(-1);
0117    /// The cached file size
0118    std::uint64_t fFileSize = kUnknownFileSize;
0119    /// Files are opened lazily and only when required; the open state is kept by this flag
0120    bool fIsOpen = false;
0121    /// Runtime switch to decide if reads are buffered or directly sent to ReadAtImpl()
0122    bool fIsBuffering = true;
0123
0124 protected:
0125    std::string fUrl;
0126    ROptions fOptions; ///< fBlockSize may be adjusted by OpenImpl(), see below
0127    /// The current position in the file, which can be changed by Seek, Read, and Readln
0128    std::uint64_t fFilePos = 0;
0129
0130    /// OpenImpl() is called at most once and before any call to either ReadAtImpl() or GetSizeImpl(). If
0131    /// fOptions.fBlockSize is ROptions::kUseDefaultBlockSize, derived classes are responsible to set a sensible value.
0132    /// After a call to OpenImpl(), fOptions.fBlockSize must be an actual block size (zero or larger), not that marker.
0133    virtual void OpenImpl() = 0;
0134    /// Derived classes should implement low-level reading without buffering. Short reads indicate the end of the file,
0135    /// therefore derived classes should return nbytes bytes if available.
0136    virtual size_t ReadAtImpl(void *buffer, size_t nbytes, std::uint64_t offset) = 0;
0137    /// Derived classes should return the file size
0138    virtual std::uint64_t GetSizeImpl() = 0;
0139
0140    /// By default implemented as a loop of ReadAt calls but can be overridden, e.g. by XRootD or DAVIX implementations
0141    virtual void ReadVImpl(RIOVec *ioVec, unsigned int nReq);
0142
0143    /// Open the file if not already open. Otherwise noop.
0144    void EnsureOpen();
0145
0146 public:
0147    RRawFile(std::string_view url, ROptions options);
0148    RRawFile(const RRawFile &) = delete;
0149    RRawFile &operator=(const RRawFile &) = delete;
0150    virtual ~RRawFile() = default;
0151
0152    /// Create a new RRawFile that accesses the same resource.  The file pointer is reset to zero.
0153    virtual std::unique_ptr<RRawFile> Clone() const = 0;
0154
0155    /// Factory method that returns a suitable concrete implementation according to the transport in the url
0156    static std::unique_ptr<RRawFile> Create(std::string_view url, ROptions options = ROptions());
0157    /// Returns only the file location, e.g. "server/file" for http://server/file
0158    static std::string GetLocation(std::string_view url);
0159    /// Returns only the transport protocol in lower case, e.g. "http" for HTTP://server/file
0160    static std::string GetTransport(std::string_view url);
0161
0162    /// Buffered read from a random position. Returns the actual number of bytes read.
0163    /// Short reads indicate the end of the file.
0164    size_t ReadAt(void *buffer, size_t nbytes, std::uint64_t offset);
0165    /// Read from fFilePos offset. Returns the actual number of bytes read.
0166    size_t Read(void *buffer, size_t nbytes);
0167    /// Change the cursor fFilePos
0168    void Seek(std::uint64_t offset);
0169    /// Returns the offset for the next Read/Readln call
0170    std::uint64_t GetFilePos() const { return fFilePos; }
0171    /// Returns the size of the file
0172    std::uint64_t GetSize();
0173    /// Returns the url of the file
0174    std::string GetUrl() const;
0175
0176    /// Opens the file if necessary and calls ReadVImpl
0177    void ReadV(RIOVec *ioVec, unsigned int nReq);
0178    /// Returns the limits regarding the ioVec input to ReadV for this specific file; may open the file as a side-effect.
0179    virtual RIOVecLimits GetReadVLimits() { return RIOVecLimits(); }
0180
0181    /// Turn off buffered reads; all scalar read requests go directly to the implementation. Buffering can be turned
0182    /// back on.
0183    void SetBuffering(bool value);
0184    bool IsBuffering() const { return fIsBuffering; } ///< Returns the current state of the buffering switch
0185
0186    /// Read the next line starting from the current value of fFilePos. Returns false if the end of the file is reached.
0187    bool Readln(std::string &line);
0188
0189    /// Once opened, the file stays open until destruction of the RRawFile object
0190    bool IsOpen() const { return fIsOpen; }
0191 }; // class RRawFile
0192
0193 } // namespace Internal
0194 } // namespace ROOT
0195
0196 #endif