root/ROOT/RNTupleWriteOptions.hxx

0001 /// \file ROOT/RNTupleWriteOptions.hxx
0002 /// \ingroup NTuple
0003 /// \author Jakob Blomer <jblomer@cern.ch>
0004 /// \date 2024-02-22
0005
0006 /*************************************************************************
0007  * Copyright (C) 1995-2019, Rene Brun and Fons Rademakers.               *
0008  * All rights reserved.                                                  *
0009  *                                                                       *
0010  * For the licensing terms see $ROOTSYS/LICENSE.                         *
0011  * For the list of contributors see $ROOTSYS/README/CREDITS.             *
0012  *************************************************************************/
0013
0014 #ifndef ROOT_RNTupleWriteOptions
0015 #define ROOT_RNTupleWriteOptions
0016
0017 #include <Compression.h>
0018
0019 #include <cstdint>
0020 #include <cstddef>
0021 #include <memory>
0022
0023 namespace ROOT {
0024
0025 class RNTupleWriteOptions;
0026
0027 namespace Internal {
0028
0029 class RNTupleWriteOptionsManip final {
0030 public:
0031    static void SetMaxKeySize(RNTupleWriteOptions &options, std::uint64_t maxKeySize);
0032 };
0033
0034 } // namespace Internal
0035
0036 // clang-format off
0037 /**
0038 \class ROOT::RNTupleWriteOptions
0039 \ingroup NTuple
0040 \brief Common user-tunable settings for storing RNTuples
0041
0042 All page sink classes need to support the common options.
0043
0044 <table>
0045 <tr>
0046 <th>Option name</th>
0047 <th>Type</th>
0048 <th>Default</th>
0049 <th>Description</th>
0050 </tr>
0051
0052 <tr>
0053 <td>`Compression`</td>
0054 <td>`std::uint32_t`</td>
0055 <td>RCompressionSetting::EDefaults::kUseGeneralPurpose</td>
0056 <td>
0057 The compression settings for this RNTuple
0058 </td>
0059 </tr>
0060
0061 <tr>
0062 <td>`ApproxZippedClusterSize`</td>
0063 <td>`std::size_t`</td>
0064 <td>128 MiB</td>
0065 <td>
0066 Approximation of the target compressed cluster size
0067 </td>
0068 </tr>
0069
0070 <tr>
0071 <td>`MaxUnzippedClusterSize`</td>
0072 <td>`std::size_t`</td>
0073 <td>1280 MiB</td>
0074 <td>
0075 Memory limit for committing a cluster: with very high compression ratio, we need a limit
0076 on how large the I/O buffer can grow during writing.
0077 </td>
0078 </tr>
0079
0080 <tr>
0081 <td>`InitialUnzippedPageSize`</td>
0082 <td>`std::size_t`</td>
0083 <td>256</td>
0084 <td>
0085 Initially, columns start with a page of this size. The default value is chosen to accommodate at least 32 elements
0086 of 64 bits, or 64 elements of 32 bits. If more elements are needed, pages are increased up until the byte limit
0087 given by the option `MaxUnzippedPageSize` or until the total page buffer limit is reached (as a sum of all page buffers).
0088 The total write buffer limit needs to be large enough to hold the initial pages of all columns.
0089 </td>
0090 </tr>
0091
0092 <tr>
0093 <td>`MaxUnzippedPageSize`</td>
0094 <td>`std::size_t`</td>
0095 <td>1 MiB</td>
0096 <td>
0097 Pages can grow only to the given limit in bytes.
0098 </td>
0099 </tr>
0100
0101 <tr>
0102 <td>`PageBufferBudget`</td>
0103 <td>`std::size_t`</td>
0104 <td>0 / auto</td>
0105 <td>
0106 The maximum size that the sum of all page buffers used for writing into a persistent sink is allowed to use.
0107 If set to zero, RNTuple will auto-adjust the budget based on the value of `ApproxZippedClusterSize`.
0108 If set manually, the size needs to be large enough to hold all initial page buffers.
0109 The total amount of memory for writing is larger, e.g. for the additional compressed buffers etc.
0110 Use RNTupleModel::EstimateWriteMemoryUsage() for the total estimated memory use for writing.
0111 The default values are tuned for a total write memory of around 400 MiB per fill context.
0112 </td>
0113 </tr>
0114
0115 <tr>
0116 <td>`UseBufferedWrite`</td>
0117 <td>`bool`</td>
0118 <td>`true`</td>
0119 <td>
0120 Whether to use buffered writing (with RPageSinkBuf). This buffers compressed pages in memory, reorders them
0121 to keep pages of the same column adjacent, and coalesces the writes when committing a cluster.
0122 </td>
0123 </tr>
0124
0125 <tr>
0126 <td>`UseDirectIO`</td>
0127 <td>`bool`</td>
0128 <td>`false`</td>
0129 <td>
0130 Whether to use Direct I/O for writing. Note that this introduces alignment requirements that may vary between
0131 filesystems and platforms.
0132 </td>
0133 </tr>
0134
0135 <tr>
0136 <td>`WriteBufferSize`</td>
0137 <td>`std::size_t`</td>
0138 <td>4 MiB</td>
0139 <td>
0140 Buffer size to use for writing to files, must be a multiple of 4096 bytes. Testing suggests that 4MiB gives best
0141 performance (with Direct I/O) at a reasonable memory consumption.
0142 </td>
0143 </tr>
0144
0145 <tr>
0146 <td>`UseImplicitMT`</td>
0147 <td>EImplicitMT</td>
0148 <td>EImplicitMT::kDefault</td>
0149 <td>
0150 Whether to use implicit multi-threading to compress pages. Only has an effect if buffered writing is turned on.
0151 The meaning of EImplicitMT::kDefault depends on the used writer: For the (sequential) RNTupleWriter, it translates
0152 to EImplicitMT::kOn and the user has to manually disable the use of implicit multi-threading if it is not wanted.
0153 For the RNTupleParallelWriter, the implementation defaults to EImplicitMT::kOff in order to avoid interference with
0154 explicit parallelism that might create one RNTupleFillContext per thread. If implicit multi-threading is wanted on
0155 top of this, the user has to explicitly request EImplicitMT::kOn.
0156 </td>
0157 </tr>
0158
0159 <tr>
0160 <td>`EnablePageChecksums`</td>
0161 <td>`bool`</td>
0162 <td>`true`</td>
0163 <td>
0164 If set, checksums will be calculated and written for every page.
0165 If turned off, will also turn off `EnableSamePageMerging`.
0166 </td>
0167 </tr>
0168
0169 <tr>
0170 <td>`EnableSamePageMerging`</td>
0171 <td>`bool`</td>
0172 <td>`true`</td>
0173 <td>
0174 If set, identical pages are deduplicated and aliased on disk.
0175 Requires `EnablePageChecksums` and will throw if previously disabled.
0176 </td>
0177 </tr>
0178
0179 </table>
0180 */
0181 // clang-format on
0182 class RNTupleWriteOptions {
0183 public:
0184    enum class EImplicitMT {
0185       kOff,
0186       kOn,
0187       kDefault,
0188    };
0189
0190    // clang-format off
0191    static constexpr std::uint64_t kDefaultMaxKeySize = 0x4000'0000; // 1 GiB
0192    // clang-format on
0193
0194    friend Internal::RNTupleWriteOptionsManip;
0195
0196 protected:
0197    std::uint32_t fCompression{RCompressionSetting::EDefaults::kUseGeneralPurpose};
0198    std::size_t fApproxZippedClusterSize = 128 * 1024 * 1024;
0199    std::size_t fMaxUnzippedClusterSize = 10 * fApproxZippedClusterSize;
0200    std::size_t fInitialUnzippedPageSize = 256;
0201    std::size_t fMaxUnzippedPageSize = 1024 * 1024;
0202    std::size_t fPageBufferBudget = 0;
0203    bool fUseBufferedWrite = true;
0204    bool fUseDirectIO = false;
0205    std::size_t fWriteBufferSize = 4 * 1024 * 1024;
0206    EImplicitMT fUseImplicitMT = EImplicitMT::kDefault;
0207    bool fEnablePageChecksums = true;
0208    bool fEnableSamePageMerging = true;
0209    /// Specifies the max size of a payload storeable into a single TKey. When writing an RNTuple to a ROOT file,
0210    /// any payload whose size exceeds this will be split into multiple keys.
0211    std::uint64_t fMaxKeySize = kDefaultMaxKeySize;
0212
0213 public:
0214
0215    virtual ~RNTupleWriteOptions() = default;
0216    virtual std::unique_ptr<RNTupleWriteOptions> Clone() const;
0217
0218    std::uint32_t GetCompression() const { return fCompression; }
0219    void SetCompression(std::uint32_t val) { fCompression = val; }
0220    void SetCompression(RCompressionSetting::EAlgorithm::EValues algorithm, int compressionLevel)
0221    {
0222       fCompression = CompressionSettings(algorithm, compressionLevel);
0223    }
0224
0225    std::size_t GetApproxZippedClusterSize() const { return fApproxZippedClusterSize; }
0226    void SetApproxZippedClusterSize(std::size_t val);
0227
0228    std::size_t GetMaxUnzippedClusterSize() const { return fMaxUnzippedClusterSize; }
0229    void SetMaxUnzippedClusterSize(std::size_t val);
0230
0231    std::size_t GetInitialUnzippedPageSize() const { return fInitialUnzippedPageSize; }
0232    void SetInitialUnzippedPageSize(std::size_t val);
0233
0234    std::size_t GetMaxUnzippedPageSize() const { return fMaxUnzippedPageSize; }
0235    void SetMaxUnzippedPageSize(std::size_t val);
0236
0237    std::size_t GetPageBufferBudget() const;
0238    void SetPageBufferBudget(std::size_t val) { fPageBufferBudget = val; }
0239
0240    bool GetUseBufferedWrite() const { return fUseBufferedWrite; }
0241    void SetUseBufferedWrite(bool val) { fUseBufferedWrite = val; }
0242
0243    bool GetUseDirectIO() const { return fUseDirectIO; }
0244    void SetUseDirectIO(bool val) { fUseDirectIO = val; }
0245
0246    std::size_t GetWriteBufferSize() const { return fWriteBufferSize; }
0247    void SetWriteBufferSize(std::size_t val) { fWriteBufferSize = val; }
0248
0249    EImplicitMT GetUseImplicitMT() const { return fUseImplicitMT; }
0250    void SetUseImplicitMT(EImplicitMT val) { fUseImplicitMT = val; }
0251
0252    bool GetEnablePageChecksums() const { return fEnablePageChecksums; }
0253    /// Note that turning off page checksums will also turn off the same page merging optimization (see tuning.md)
0254    void SetEnablePageChecksums(bool val)
0255    {
0256       fEnablePageChecksums = val;
0257       if (!fEnablePageChecksums) {
0258          fEnableSamePageMerging = false;
0259       }
0260    }
0261
0262    bool GetEnableSamePageMerging() const { return fEnableSamePageMerging; }
0263    void SetEnableSamePageMerging(bool val);
0264
0265    std::uint64_t GetMaxKeySize() const { return fMaxKeySize; }
0266
0267    friend bool operator==(const RNTupleWriteOptions &lhs, const RNTupleWriteOptions &rhs)
0268    {
0269       return lhs.fCompression == rhs.fCompression && lhs.fApproxZippedClusterSize == rhs.fApproxZippedClusterSize &&
0270              lhs.fMaxUnzippedClusterSize == rhs.fMaxUnzippedClusterSize &&
0271              lhs.fInitialUnzippedPageSize == rhs.fInitialUnzippedPageSize &&
0272              lhs.fMaxUnzippedPageSize == rhs.fMaxUnzippedPageSize && lhs.fPageBufferBudget == rhs.fPageBufferBudget &&
0273              lhs.fUseBufferedWrite == rhs.fUseBufferedWrite && lhs.fUseDirectIO == rhs.fUseDirectIO &&
0274              lhs.fWriteBufferSize == rhs.fWriteBufferSize && lhs.fUseImplicitMT == rhs.fUseImplicitMT &&
0275              lhs.fEnablePageChecksums == rhs.fEnablePageChecksums &&
0276              lhs.fEnableSamePageMerging == rhs.fEnableSamePageMerging && lhs.fMaxKeySize == rhs.fMaxKeySize;
0277    }
0278
0279    friend bool operator!=(const RNTupleWriteOptions &lhs, const RNTupleWriteOptions &rhs) { return !(lhs == rhs); }
0280 };
0281
0282 namespace Internal {
0283 inline void RNTupleWriteOptionsManip::SetMaxKeySize(RNTupleWriteOptions &options, std::uint64_t maxKeySize)
0284 {
0285    options.fMaxKeySize = maxKeySize;
0286 }
0287
0288 } // namespace Internal
0289 } // namespace ROOT
0290
0291 #endif // ROOT_RNTupleWriteOptions