Back to home page

EIC code displayed by LXR

 
 

    


File indexing completed on 2025-01-18 10:10:46

0001 /// \file ROOT/RNTupleSerialize.hxx
0002 /// \ingroup NTuple ROOT7
0003 /// \author Jakob Blomer <jblomer@cern.ch>
0004 /// \author Javier Lopez-Gomez <javier.lopez.gomez@cern.ch>
0005 /// \date 2021-08-02
0006 /// \warning This is part of the ROOT 7 prototype! It will change without notice. It might trigger earthquakes. Feedback
0007 /// is welcome!
0008 
0009 /*************************************************************************
0010  * Copyright (C) 1995-2021, Rene Brun and Fons Rademakers.               *
0011  * All rights reserved.                                                  *
0012  *                                                                       *
0013  * For the licensing terms see $ROOTSYS/LICENSE.                         *
0014  * For the list of contributors see $ROOTSYS/README/CREDITS.             *
0015  *************************************************************************/
0016 
0017 #ifndef ROOT7_RNTupleSerialize
0018 #define ROOT7_RNTupleSerialize
0019 
0020 #include <ROOT/RError.hxx>
0021 #include <ROOT/RNTupleUtil.hxx>
0022 #include <ROOT/RSpan.hxx>
0023 
0024 #include <cstdint>
0025 #include <map>
0026 #include <string>
0027 #include <vector>
0028 
0029 namespace ROOT {
0030 namespace Experimental {
0031 
0032 enum class EColumnType;
0033 class RClusterDescriptor;
0034 class RNTupleDescriptor;
0035 
0036 namespace Internal {
0037 
0038 class RClusterDescriptorBuilder;
0039 class RNTupleDescriptorBuilder;
0040 
0041 // clang-format off
0042 /**
0043 \class ROOT::Experimental::Internal::RNTupleSerializer
0044 \ingroup NTuple
0045 \brief A helper class for serialization and deserialization of the RNTuple binary format
0046 
0047 All serialization and deserialization routines return the number of bytes processed (written or read).
0048 
0049 The serialization routines can be called with a nullptr buffer, in which case only the size required to perform
0050 a serialization is returned. Deserialization routines must be called with a buffer that is sufficiently large.
0051 
0052 Deserialization errors throw exceptions. Only when indicated or when passed as a parameter is the buffer size checked.
0053 */
0054 // clang-format on
0055 class RNTupleSerializer {
0056 public:
0057    static constexpr std::uint16_t kEnvelopeTypeHeader = 0x01;
0058    static constexpr std::uint16_t kEnvelopeTypeFooter = 0x02;
0059    static constexpr std::uint16_t kEnvelopeTypePageList = 0x03;
0060 
0061    static constexpr std::uint16_t kFlagRepetitiveField = 0x01;
0062 
0063    static constexpr std::uint32_t kFlagSortAscColumn     = 0x01;
0064    static constexpr std::uint32_t kFlagSortDesColumn     = 0x02;
0065    static constexpr std::uint32_t kFlagNonNegativeColumn = 0x04;
0066    static constexpr std::uint32_t kFlagDeferredColumn    = 0x08;
0067 
0068    static constexpr DescriptorId_t kZeroFieldId = std::uint64_t(-2);
0069 
0070    struct REnvelopeLink {
0071       std::uint64_t fLength = 0;
0072       RNTupleLocator fLocator;
0073    };
0074 
0075    struct RClusterSummary {
0076       std::uint64_t fFirstEntry = 0;
0077       std::uint64_t fNEntries = 0;
0078       /// -1 for "all columns"
0079       std::int32_t fColumnGroupID = -1;
0080    };
0081 
0082    struct RClusterGroup {
0083       std::uint64_t fMinEntry = 0;
0084       std::uint64_t fEntrySpan = 0;
0085       std::uint32_t fNClusters = 0;
0086       REnvelopeLink fPageListEnvelopeLink;
0087    };
0088 
0089    /// The serialization context is used for the piecewise serialization of a descriptor.  During header serialization,
0090    /// the mapping of in-memory field and column IDs to on-disk IDs is built so that it can be used for the
0091    /// footer serialization in a second step.
0092    class RContext {
0093    private:
0094       std::uint64_t fHeaderSize = 0;
0095       std::uint64_t fHeaderXxHash3 = 0;
0096       std::map<DescriptorId_t, DescriptorId_t> fMem2OnDiskFieldIDs;
0097       std::map<DescriptorId_t, DescriptorId_t> fMem2OnDiskColumnIDs;
0098       std::map<DescriptorId_t, DescriptorId_t> fMem2OnDiskClusterIDs;
0099       std::map<DescriptorId_t, DescriptorId_t> fMem2OnDiskClusterGroupIDs;
0100       std::vector<DescriptorId_t> fOnDisk2MemFieldIDs;
0101       std::vector<DescriptorId_t> fOnDisk2MemColumnIDs;
0102       std::vector<DescriptorId_t> fOnDisk2MemClusterIDs;
0103       std::vector<DescriptorId_t> fOnDisk2MemClusterGroupIDs;
0104       std::size_t fHeaderExtensionOffset = -1U;
0105 
0106    public:
0107       void SetHeaderSize(std::uint64_t size) { fHeaderSize = size; }
0108       std::uint64_t GetHeaderSize() const { return fHeaderSize; }
0109       void SetHeaderXxHash3(std::uint64_t xxhash3) { fHeaderXxHash3 = xxhash3; }
0110       std::uint64_t GetHeaderXxHash3() const { return fHeaderXxHash3; }
0111       /// Map an in-memory field ID to its on-disk counterpart. It is allowed to call this function multiple times for
0112       /// the same `memId`, in which case the return value is the on-disk ID assigned on the first call.
0113       DescriptorId_t MapFieldId(DescriptorId_t memId) {
0114          auto onDiskId = fOnDisk2MemFieldIDs.size();
0115          const auto &p = fMem2OnDiskFieldIDs.try_emplace(memId, onDiskId);
0116          if (p.second)
0117             fOnDisk2MemFieldIDs.push_back(memId);
0118          return (*p.first).second;
0119       }
0120       /// Map an in-memory column ID to its on-disk counterpart. It is allowed to call this function multiple times for
0121       /// the same `memId`, in which case the return value is the on-disk ID assigned on the first call.
0122       DescriptorId_t MapColumnId(DescriptorId_t memId) {
0123          auto onDiskId = fOnDisk2MemColumnIDs.size();
0124          const auto &p = fMem2OnDiskColumnIDs.try_emplace(memId, onDiskId);
0125          if (p.second)
0126             fOnDisk2MemColumnIDs.push_back(memId);
0127          return (*p.first).second;
0128       }
0129       DescriptorId_t MapClusterId(DescriptorId_t memId) {
0130          auto onDiskId = fOnDisk2MemClusterIDs.size();
0131          fMem2OnDiskClusterIDs[memId] = onDiskId;
0132          fOnDisk2MemClusterIDs.push_back(memId);
0133          return onDiskId;
0134       }
0135       DescriptorId_t MapClusterGroupId(DescriptorId_t memId)
0136       {
0137          auto onDiskId = fOnDisk2MemClusterGroupIDs.size();
0138          fMem2OnDiskClusterGroupIDs[memId] = onDiskId;
0139          fOnDisk2MemClusterGroupIDs.push_back(memId);
0140          return onDiskId;
0141       }
0142       /// Map in-memory field and column IDs to their on-disk counterparts. This function is unconditionally called
0143       /// during header serialization.  This function must be manually called after an incremental schema update as page
0144       /// list serialization requires all columns to be mapped.
0145       void MapSchema(const RNTupleDescriptor &desc, bool forHeaderExtension);
0146 
0147       DescriptorId_t GetOnDiskFieldId(DescriptorId_t memId) const { return fMem2OnDiskFieldIDs.at(memId); }
0148       DescriptorId_t GetOnDiskColumnId(DescriptorId_t memId) const { return fMem2OnDiskColumnIDs.at(memId); }
0149       DescriptorId_t GetOnDiskClusterId(DescriptorId_t memId) const { return fMem2OnDiskClusterIDs.at(memId); }
0150       DescriptorId_t GetOnDiskClusterGroupId(DescriptorId_t memId) const
0151       {
0152          return fMem2OnDiskClusterGroupIDs.at(memId);
0153       }
0154       DescriptorId_t GetMemFieldId(DescriptorId_t onDiskId) const { return fOnDisk2MemFieldIDs[onDiskId]; }
0155       DescriptorId_t GetMemColumnId(DescriptorId_t onDiskId) const { return fOnDisk2MemColumnIDs[onDiskId]; }
0156       DescriptorId_t GetMemClusterId(DescriptorId_t onDiskId) const { return fOnDisk2MemClusterIDs[onDiskId]; }
0157       DescriptorId_t GetMemClusterGroupId(DescriptorId_t onDiskId) const
0158       {
0159          return fOnDisk2MemClusterGroupIDs[onDiskId];
0160       }
0161 
0162       /// Return a vector containing the in-memory field ID for each on-disk counterpart, in order, i.e. the `i`-th
0163       /// value corresponds to the in-memory field ID for `i`-th on-disk ID
0164       const std::vector<DescriptorId_t> &GetOnDiskFieldList() const { return fOnDisk2MemFieldIDs; }
0165       /// Mark the first on-disk field ID that is part of the schema extension
0166       void BeginHeaderExtension() { fHeaderExtensionOffset = fOnDisk2MemFieldIDs.size(); }
0167       /// Return the offset of the first element in `fOnDisk2MemFieldIDs` that is part of the schema extension
0168       std::size_t GetHeaderExtensionOffset() const { return fHeaderExtensionOffset; }
0169    };
0170 
0171    /// Writes a XxHash-3 64bit checksum of the byte range given by data and length.
0172    static std::uint32_t
0173    SerializeXxHash3(const unsigned char *data, std::uint64_t length, std::uint64_t &xxhash3, void *buffer);
0174    /// Expects an xxhash3 checksum in the 8 bytes following data + length and verifies it.
0175    static RResult<void> VerifyXxHash3(const unsigned char *data, std::uint64_t length, std::uint64_t &xxhash3);
0176    static RResult<void> VerifyXxHash3(const unsigned char *data, std::uint64_t length);
0177 
0178    static std::uint32_t SerializeInt16(std::int16_t val, void *buffer);
0179    static std::uint32_t DeserializeInt16(const void *buffer, std::int16_t &val);
0180    static std::uint32_t SerializeUInt16(std::uint16_t val, void *buffer);
0181    static std::uint32_t DeserializeUInt16(const void *buffer, std::uint16_t &val);
0182 
0183    static std::uint32_t SerializeInt32(std::int32_t val, void *buffer);
0184    static std::uint32_t DeserializeInt32(const void *buffer, std::int32_t &val);
0185    static std::uint32_t SerializeUInt32(std::uint32_t val, void *buffer);
0186    static std::uint32_t DeserializeUInt32(const void *buffer, std::uint32_t &val);
0187 
0188    static std::uint32_t SerializeInt64(std::int64_t val, void *buffer);
0189    static std::uint32_t DeserializeInt64(const void *buffer, std::int64_t &val);
0190    static std::uint32_t SerializeUInt64(std::uint64_t val, void *buffer);
0191    static std::uint32_t DeserializeUInt64(const void *buffer, std::uint64_t &val);
0192 
0193    static std::uint32_t SerializeString(const std::string &val, void *buffer);
0194    static RResult<std::uint32_t> DeserializeString(const void *buffer, std::uint64_t bufSize, std::string &val);
0195 
0196    /// While we could just interpret the enums as ints, we make the translation explicit
0197    /// in order to avoid accidentally changing the on-disk numbers when adjusting the enum classes.
0198    static std::uint16_t SerializeFieldStructure(ROOT::Experimental::ENTupleStructure structure, void *buffer);
0199    static std::uint16_t SerializeColumnType(ROOT::Experimental::EColumnType type, void *buffer);
0200    static RResult<std::uint16_t> DeserializeFieldStructure(const void *buffer, ROOT::Experimental::ENTupleStructure &structure);
0201    static RResult<std::uint16_t> DeserializeColumnType(const void *buffer, ROOT::Experimental::EColumnType &type);
0202 
0203    static std::uint32_t SerializeEnvelopePreamble(std::uint16_t envelopeType, void *buffer);
0204    static std::uint32_t SerializeEnvelopePostscript(unsigned char *envelope, std::uint64_t size);
0205    static std::uint32_t
0206    SerializeEnvelopePostscript(unsigned char *envelope, std::uint64_t size, std::uint64_t &xxhash3);
0207    // The bufSize must include the 8 bytes for the final xxhash3 checksum.
0208    static RResult<std::uint32_t>
0209    DeserializeEnvelope(const void *buffer, std::uint64_t bufSize, std::uint16_t expectedType);
0210    static RResult<std::uint32_t>
0211    DeserializeEnvelope(const void *buffer, std::uint64_t bufSize, std::uint16_t expectedType, std::uint64_t &xxhash3);
0212 
0213    static std::uint32_t SerializeRecordFramePreamble(void *buffer);
0214    static std::uint32_t SerializeListFramePreamble(std::uint32_t nitems, void *buffer);
0215    static std::uint32_t SerializeFramePostscript(void *frame, std::uint64_t size);
0216    static RResult<std::uint32_t>
0217    DeserializeFrameHeader(const void *buffer, std::uint64_t bufSize, std::uint64_t &frameSize, std::uint32_t &nitems);
0218    static RResult<std::uint32_t>
0219    DeserializeFrameHeader(const void *buffer, std::uint64_t bufSize, std::uint64_t &frameSize);
0220 
0221    // An empty flags vector will be serialized as a single, zero feature flag
0222    // The most significant bit in every flag is reserved and must _not_ be set
0223    static std::uint32_t SerializeFeatureFlags(const std::vector<std::uint64_t> &flags, void *buffer);
0224    static RResult<std::uint32_t>
0225    DeserializeFeatureFlags(const void *buffer, std::uint64_t bufSize, std::vector<std::uint64_t> &flags);
0226 
0227    static std::uint32_t SerializeLocator(const RNTupleLocator &locator, void *buffer);
0228    static std::uint32_t SerializeEnvelopeLink(const REnvelopeLink &envelopeLink, void *buffer);
0229    static RResult<std::uint32_t> DeserializeLocator(const void *buffer, std::uint64_t bufSize, RNTupleLocator &locator);
0230    static RResult<std::uint32_t>
0231    DeserializeEnvelopeLink(const void *buffer, std::uint64_t bufSize, REnvelopeLink &envelopeLink);
0232 
0233    static std::uint32_t SerializeClusterSummary(const RClusterSummary &clusterSummary, void *buffer);
0234    static std::uint32_t SerializeClusterGroup(const RClusterGroup &clusterGroup, void *buffer);
0235    static RResult<std::uint32_t>
0236    DeserializeClusterSummary(const void *buffer, std::uint64_t bufSize, RClusterSummary &clusterSummary);
0237    static RResult<std::uint32_t>
0238    DeserializeClusterGroup(const void *buffer, std::uint64_t bufSize, RClusterGroup &clusterGroup);
0239 
0240    /// Serialize the schema description in `desc` into `buffer`. If `forHeaderExtension` is true, serialize only the
0241    /// fields and columns tagged as part of the header extension (see `RNTupleDescriptorBuilder::BeginHeaderExtension`).
0242    static std::uint32_t SerializeSchemaDescription(void *buffer, const RNTupleDescriptor &desc, const RContext &context,
0243                                                    bool forHeaderExtension = false);
0244    static RResult<std::uint32_t>
0245    DeserializeSchemaDescription(const void *buffer, std::uint64_t bufSize, RNTupleDescriptorBuilder &descBuilder);
0246 
0247    static RContext SerializeHeader(void *buffer, const RNTupleDescriptor &desc);
0248    static std::uint32_t SerializePageList(void *buffer, const RNTupleDescriptor &desc,
0249                                           std::span<DescriptorId_t> physClusterIDs, const RContext &context);
0250    static std::uint32_t SerializeFooter(void *buffer, const RNTupleDescriptor &desc, const RContext &context);
0251 
0252    static RResult<void>
0253    DeserializeHeader(const void *buffer, std::uint64_t bufSize, RNTupleDescriptorBuilder &descBuilder);
0254    static RResult<void>
0255    DeserializeFooter(const void *buffer, std::uint64_t bufSize, RNTupleDescriptorBuilder &descBuilder);
0256    // The clusters vector must be initialized with the cluster summaries corresponding to the page list
0257    static RResult<void> DeserializePageList(const void *buffer, std::uint64_t bufSize, DescriptorId_t clusterGroupId,
0258                                             RNTupleDescriptor &desc);
0259 }; // class RNTupleSerializer
0260 
0261 } // namespace Internal
0262 } // namespace Experimental
0263 } // namespace ROOT
0264 
0265 #endif // ROOT7_RNTupleSerialize