Back to home page

EIC code displayed by LXR

 
 

    


File indexing completed on 2025-09-15 09:11:45

0001 /// \file ROOT/RNTupleSerialize.hxx
0002 /// \ingroup NTuple
0003 /// \author Jakob Blomer <jblomer@cern.ch>
0004 /// \author Javier Lopez-Gomez <javier.lopez.gomez@cern.ch>
0005 /// \date 2021-08-02
0006 
0007 /*************************************************************************
0008  * Copyright (C) 1995-2021, Rene Brun and Fons Rademakers.               *
0009  * All rights reserved.                                                  *
0010  *                                                                       *
0011  * For the licensing terms see $ROOTSYS/LICENSE.                         *
0012  * For the list of contributors see $ROOTSYS/README/CREDITS.             *
0013  *************************************************************************/
0014 
0015 #ifndef ROOT_RNTupleSerialize
0016 #define ROOT_RNTupleSerialize
0017 
0018 #include <ROOT/RError.hxx>
0019 #include <ROOT/RNTupleUtil.hxx>
0020 #include <ROOT/RSpan.hxx>
0021 
0022 #include <Rtypes.h>
0023 
0024 #include <cstdint>
0025 #include <limits>
0026 #include <map>
0027 #include <string>
0028 #include <unordered_map>
0029 #include <vector>
0030 
0031 class TVirtualStreamerInfo;
0032 
0033 namespace ROOT {
0034 
0035 class RNTupleDescriptor;
0036 class RClusterDescriptor;
0037 enum class EExtraTypeInfoIds;
0038 
0039 namespace Internal {
0040 
0041 class RClusterDescriptorBuilder;
0042 class RNTupleDescriptorBuilder;
0043 
0044 // clang-format off
0045 /**
0046 \class ROOT::Internal::RNTupleSerializer
0047 \ingroup NTuple
0048 \brief A helper class for serialization and deserialization of the RNTuple binary format
0049 
0050 All serialization and deserialization routines return the number of bytes processed (written or read).
0051 
0052 The serialization routines can be called with a nullptr buffer, in which case only the size required to perform
0053 a serialization is returned. Deserialization routines must be called with a buffer that is sufficiently large.
0054 
0055 Deserialization errors throw exceptions. Only when indicated or when passed as a parameter is the buffer size checked.
0056 */
0057 // clang-format on
0058 class RNTupleSerializer {
0059    static RResult<std::vector<ROOT::Internal::RClusterDescriptorBuilder>>
0060    DeserializePageListRaw(const void *buffer, std::uint64_t bufSize, ROOT::DescriptorId_t clusterGroupId,
0061                           const RNTupleDescriptor &desc);
0062 
0063 public:
0064    static constexpr std::uint16_t kEnvelopeTypeHeader = 0x01;
0065    static constexpr std::uint16_t kEnvelopeTypeFooter = 0x02;
0066    static constexpr std::uint16_t kEnvelopeTypePageList = 0x03;
0067 
0068    static constexpr std::uint16_t kFlagRepetitiveField = 0x01;
0069    static constexpr std::uint16_t kFlagProjectedField = 0x02;
0070    static constexpr std::uint16_t kFlagHasTypeChecksum = 0x04;
0071 
0072    static constexpr std::uint16_t kFlagDeferredColumn = 0x01;
0073    static constexpr std::uint16_t kFlagHasValueRange = 0x02;
0074 
0075    static constexpr ROOT::DescriptorId_t kZeroFieldId = std::uint64_t(-2);
0076 
0077    static constexpr int64_t kSuppressedColumnMarker = std::numeric_limits<std::int64_t>::min();
0078 
0079    // In the page sink and the streamer field, the seen streamer infos are stored in a map
0080    // with the unique streamer info number being the key. Sorted by unique number.
0081    using StreamerInfoMap_t = std::map<Int_t, TVirtualStreamerInfo *>;
0082 
0083    struct REnvelopeLink {
0084       std::uint64_t fLength = 0;
0085       RNTupleLocator fLocator;
0086    };
0087 
0088    struct RClusterSummary {
0089       std::uint64_t fFirstEntry = 0;
0090       std::uint64_t fNEntries = 0;
0091       std::uint8_t fFlags = 0;
0092    };
0093 
0094    struct RClusterGroup {
0095       std::uint64_t fMinEntry = 0;
0096       std::uint64_t fEntrySpan = 0;
0097       std::uint32_t fNClusters = 0;
0098       REnvelopeLink fPageListEnvelopeLink;
0099    };
0100 
0101    /// The serialization context is used for the piecewise serialization of a descriptor.  During header serialization,
0102    /// the mapping of in-memory field and column IDs to on-disk IDs is built so that it can be used for the
0103    /// footer serialization in a second step.
0104    class RContext {
0105    private:
0106       std::uint64_t fHeaderSize = 0;
0107       std::uint64_t fHeaderXxHash3 = 0;
0108       std::map<ROOT::DescriptorId_t, ROOT::DescriptorId_t> fMem2OnDiskFieldIDs;
0109       std::map<ROOT::DescriptorId_t, ROOT::DescriptorId_t> fMem2OnDiskColumnIDs;
0110       std::map<ROOT::DescriptorId_t, ROOT::DescriptorId_t> fMem2OnDiskClusterIDs;
0111       std::map<ROOT::DescriptorId_t, ROOT::DescriptorId_t> fMem2OnDiskClusterGroupIDs;
0112       std::vector<ROOT::DescriptorId_t> fOnDisk2MemFieldIDs;
0113       std::vector<ROOT::DescriptorId_t> fOnDisk2MemColumnIDs;
0114       std::vector<ROOT::DescriptorId_t> fOnDisk2MemClusterIDs;
0115       std::vector<ROOT::DescriptorId_t> fOnDisk2MemClusterGroupIDs;
0116 
0117    public:
0118       void SetHeaderSize(std::uint64_t size) { fHeaderSize = size; }
0119       std::uint64_t GetHeaderSize() const { return fHeaderSize; }
0120       void SetHeaderXxHash3(std::uint64_t xxhash3) { fHeaderXxHash3 = xxhash3; }
0121       std::uint64_t GetHeaderXxHash3() const { return fHeaderXxHash3; }
0122       /// Map an in-memory field ID to its on-disk counterpart. It is allowed to call this function multiple times for
0123       /// the same `memId`, in which case the return value is the on-disk ID assigned on the first call.
0124       ROOT::DescriptorId_t MapFieldId(ROOT::DescriptorId_t memId)
0125       {
0126          auto onDiskId = fOnDisk2MemFieldIDs.size();
0127          const auto &p = fMem2OnDiskFieldIDs.try_emplace(memId, onDiskId);
0128          if (p.second)
0129             fOnDisk2MemFieldIDs.push_back(memId);
0130          return (*p.first).second;
0131       }
0132       /// Map an in-memory column ID to its on-disk counterpart. It is allowed to call this function multiple times for
0133       /// the same `memId`, in which case the return value is the on-disk ID assigned on the first call.
0134       /// Note that we only map physical column IDs.  Logical column IDs of alias columns are shifted before the
0135       /// serialization of the extension header.  Also, we only need to query physical column IDs for the page list
0136       /// serialization.
0137       ROOT::DescriptorId_t MapPhysicalColumnId(ROOT::DescriptorId_t memId)
0138       {
0139          auto onDiskId = fOnDisk2MemColumnIDs.size();
0140          const auto &p = fMem2OnDiskColumnIDs.try_emplace(memId, onDiskId);
0141          if (p.second)
0142             fOnDisk2MemColumnIDs.push_back(memId);
0143          return (*p.first).second;
0144       }
0145       ROOT::DescriptorId_t MapClusterId(ROOT::DescriptorId_t memId)
0146       {
0147          auto onDiskId = fOnDisk2MemClusterIDs.size();
0148          fMem2OnDiskClusterIDs[memId] = onDiskId;
0149          fOnDisk2MemClusterIDs.push_back(memId);
0150          return onDiskId;
0151       }
0152       ROOT::DescriptorId_t MapClusterGroupId(ROOT::DescriptorId_t memId)
0153       {
0154          auto onDiskId = fOnDisk2MemClusterGroupIDs.size();
0155          fMem2OnDiskClusterGroupIDs[memId] = onDiskId;
0156          fOnDisk2MemClusterGroupIDs.push_back(memId);
0157          return onDiskId;
0158       }
0159       /// Map in-memory field and column IDs to their on-disk counterparts. This function is unconditionally called
0160       /// during header serialization.  This function must be manually called after an incremental schema update as page
0161       /// list serialization requires all columns to be mapped.
0162       void MapSchema(const RNTupleDescriptor &desc, bool forHeaderExtension);
0163 
0164       ROOT::DescriptorId_t GetOnDiskFieldId(ROOT::DescriptorId_t memId) const { return fMem2OnDiskFieldIDs.at(memId); }
0165       ROOT::DescriptorId_t GetOnDiskColumnId(ROOT::DescriptorId_t memId) const
0166       {
0167          return fMem2OnDiskColumnIDs.at(memId);
0168       }
0169       ROOT::DescriptorId_t GetOnDiskClusterId(ROOT::DescriptorId_t memId) const
0170       {
0171          return fMem2OnDiskClusterIDs.at(memId);
0172       }
0173       ROOT::DescriptorId_t GetOnDiskClusterGroupId(ROOT::DescriptorId_t memId) const
0174       {
0175          return fMem2OnDiskClusterGroupIDs.at(memId);
0176       }
0177       ROOT::DescriptorId_t GetMemFieldId(ROOT::DescriptorId_t onDiskId) const { return fOnDisk2MemFieldIDs[onDiskId]; }
0178       ROOT::DescriptorId_t GetMemColumnId(ROOT::DescriptorId_t onDiskId) const
0179       {
0180          return fOnDisk2MemColumnIDs[onDiskId];
0181       }
0182       ROOT::DescriptorId_t GetMemClusterId(ROOT::DescriptorId_t onDiskId) const
0183       {
0184          return fOnDisk2MemClusterIDs[onDiskId];
0185       }
0186       ROOT::DescriptorId_t GetMemClusterGroupId(ROOT::DescriptorId_t onDiskId) const
0187       {
0188          return fOnDisk2MemClusterGroupIDs[onDiskId];
0189       }
0190 
0191       /// Return a vector containing the in-memory field ID for each on-disk counterpart, in order, i.e. the `i`-th
0192       /// value corresponds to the in-memory field ID for `i`-th on-disk ID
0193       const std::vector<ROOT::DescriptorId_t> &GetOnDiskFieldList() const { return fOnDisk2MemFieldIDs; }
0194    };
0195 
0196    /// Writes a XxHash-3 64bit checksum of the byte range given by data and length.
0197    static std::uint32_t
0198    SerializeXxHash3(const unsigned char *data, std::uint64_t length, std::uint64_t &xxhash3, void *buffer);
0199    /// Expects an xxhash3 checksum in the 8 bytes following data + length and verifies it.
0200    static RResult<void> VerifyXxHash3(const unsigned char *data, std::uint64_t length, std::uint64_t &xxhash3);
0201    static RResult<void> VerifyXxHash3(const unsigned char *data, std::uint64_t length);
0202 
0203    static std::uint32_t SerializeInt16(std::int16_t val, void *buffer);
0204    static std::uint32_t DeserializeInt16(const void *buffer, std::int16_t &val);
0205    static std::uint32_t SerializeUInt16(std::uint16_t val, void *buffer);
0206    static std::uint32_t DeserializeUInt16(const void *buffer, std::uint16_t &val);
0207 
0208    static std::uint32_t SerializeInt32(std::int32_t val, void *buffer);
0209    static std::uint32_t DeserializeInt32(const void *buffer, std::int32_t &val);
0210    static std::uint32_t SerializeUInt32(std::uint32_t val, void *buffer);
0211    static std::uint32_t DeserializeUInt32(const void *buffer, std::uint32_t &val);
0212 
0213    static std::uint32_t SerializeInt64(std::int64_t val, void *buffer);
0214    static std::uint32_t DeserializeInt64(const void *buffer, std::int64_t &val);
0215    static std::uint32_t SerializeUInt64(std::uint64_t val, void *buffer);
0216    static std::uint32_t DeserializeUInt64(const void *buffer, std::uint64_t &val);
0217 
0218    static std::uint32_t SerializeString(const std::string &val, void *buffer);
0219    static RResult<std::uint32_t> DeserializeString(const void *buffer, std::uint64_t bufSize, std::string &val);
0220 
0221    /// While we could just interpret the enums as ints, we make the translation explicit
0222    /// in order to avoid accidentally changing the on-disk numbers when adjusting the enum classes.
0223    static RResult<std::uint32_t> SerializeFieldStructure(ROOT::ENTupleStructure structure, void *buffer);
0224    static RResult<std::uint32_t> SerializeColumnType(ROOT::ENTupleColumnType type, void *buffer);
0225    static RResult<std::uint32_t> SerializeExtraTypeInfoId(ROOT::EExtraTypeInfoIds id, void *buffer);
0226    static RResult<std::uint32_t> DeserializeFieldStructure(const void *buffer, ROOT::ENTupleStructure &structure);
0227    static RResult<std::uint32_t> DeserializeColumnType(const void *buffer, ROOT::ENTupleColumnType &type);
0228    static RResult<std::uint32_t> DeserializeExtraTypeInfoId(const void *buffer, ROOT::EExtraTypeInfoIds &id);
0229 
0230    static std::uint32_t SerializeEnvelopePreamble(std::uint16_t envelopeType, void *buffer);
0231    static RResult<std::uint32_t> SerializeEnvelopePostscript(unsigned char *envelope, std::uint64_t size);
0232    static RResult<std::uint32_t>
0233    SerializeEnvelopePostscript(unsigned char *envelope, std::uint64_t size, std::uint64_t &xxhash3);
0234    // The bufSize must include the 8 bytes for the final xxhash3 checksum.
0235    static RResult<std::uint32_t>
0236    DeserializeEnvelope(const void *buffer, std::uint64_t bufSize, std::uint16_t expectedType);
0237    static RResult<std::uint32_t>
0238    DeserializeEnvelope(const void *buffer, std::uint64_t bufSize, std::uint16_t expectedType, std::uint64_t &xxhash3);
0239 
0240    static std::uint32_t SerializeRecordFramePreamble(void *buffer);
0241    static std::uint32_t SerializeListFramePreamble(std::uint32_t nitems, void *buffer);
0242    static RResult<std::uint32_t> SerializeFramePostscript(void *frame, std::uint64_t size);
0243    static RResult<std::uint32_t>
0244    DeserializeFrameHeader(const void *buffer, std::uint64_t bufSize, std::uint64_t &frameSize, std::uint32_t &nitems);
0245    static RResult<std::uint32_t>
0246    DeserializeFrameHeader(const void *buffer, std::uint64_t bufSize, std::uint64_t &frameSize);
0247 
0248    // An empty flags vector will be serialized as a single, zero feature flag
0249    // The most significant bit in every flag is reserved and must _not_ be set
0250    static RResult<std::uint32_t> SerializeFeatureFlags(const std::vector<std::uint64_t> &flags, void *buffer);
0251    static RResult<std::uint32_t>
0252    DeserializeFeatureFlags(const void *buffer, std::uint64_t bufSize, std::vector<std::uint64_t> &flags);
0253 
0254    static RResult<std::uint32_t> SerializeLocator(const RNTupleLocator &locator, void *buffer);
0255    static RResult<std::uint32_t> SerializeEnvelopeLink(const REnvelopeLink &envelopeLink, void *buffer);
0256    static RResult<std::uint32_t> DeserializeLocator(const void *buffer, std::uint64_t bufSize, RNTupleLocator &locator);
0257    static RResult<std::uint32_t>
0258    DeserializeEnvelopeLink(const void *buffer, std::uint64_t bufSize, REnvelopeLink &envelopeLink);
0259 
0260    static RResult<std::uint32_t> SerializeClusterSummary(const RClusterSummary &clusterSummary, void *buffer);
0261    static RResult<std::uint32_t> SerializeClusterGroup(const RClusterGroup &clusterGroup, void *buffer);
0262    static RResult<std::uint32_t>
0263    DeserializeClusterSummary(const void *buffer, std::uint64_t bufSize, RClusterSummary &clusterSummary);
0264    static RResult<std::uint32_t>
0265    DeserializeClusterGroup(const void *buffer, std::uint64_t bufSize, RClusterGroup &clusterGroup);
0266 
0267    /// Serialize the schema description in `desc` into `buffer`. If `forHeaderExtension` is true, serialize only the
0268    /// fields and columns tagged as part of the header extension (see `RNTupleDescriptorBuilder::BeginHeaderExtension`).
0269    static RResult<std::uint32_t> SerializeSchemaDescription(void *buffer, const RNTupleDescriptor &desc,
0270                                                             const RContext &context, bool forHeaderExtension = false);
0271    static RResult<std::uint32_t> DeserializeSchemaDescription(const void *buffer, std::uint64_t bufSize,
0272                                                               ROOT::Internal::RNTupleDescriptorBuilder &descBuilder);
0273 
0274    static RResult<RContext> SerializeHeader(void *buffer, const RNTupleDescriptor &desc);
0275    static RResult<std::uint32_t> SerializePageList(void *buffer, const RNTupleDescriptor &desc,
0276                                                    std::span<ROOT::DescriptorId_t> physClusterIDs,
0277                                                    const RContext &context);
0278    static RResult<std::uint32_t> SerializeFooter(void *buffer, const RNTupleDescriptor &desc, const RContext &context);
0279 
0280    static RResult<void>
0281    DeserializeHeader(const void *buffer, std::uint64_t bufSize, ROOT::Internal::RNTupleDescriptorBuilder &descBuilder);
0282    static RResult<void>
0283    DeserializeFooter(const void *buffer, std::uint64_t bufSize, ROOT::Internal::RNTupleDescriptorBuilder &descBuilder);
0284 
0285    enum class EDescriptorDeserializeMode {
0286       /// Deserializes the descriptor as-is without performing any additional fixup. The produced descriptor is
0287       /// unsuitable for reading or writing, but it's a faithful representation of the on-disk information.
0288       kRaw,
0289       /// Deserializes the descriptor and performs fixup on the suppressed column ranges. This produces a descriptor
0290       /// that is suitable for writing, but not reading.
0291       kForWriting,
0292       /// Deserializes the descriptor and performs fixup on the suppressed column ranges and on clusters, taking
0293       /// into account the header extension. This produces a descriptor that is suitable for reading.
0294       kForReading,
0295    };
0296    // The clusters vector must be initialized with the cluster summaries corresponding to the page list
0297    static RResult<void> DeserializePageList(const void *buffer, std::uint64_t bufSize,
0298                                             ROOT::DescriptorId_t clusterGroupId, RNTupleDescriptor &desc,
0299                                             EDescriptorDeserializeMode mode);
0300 
0301    // Helper functions to (de-)serialize the streamer info type extra information
0302    static std::string SerializeStreamerInfos(const StreamerInfoMap_t &infos);
0303    static RResult<StreamerInfoMap_t> DeserializeStreamerInfos(const std::string &extraTypeInfoContent);
0304 }; // class RNTupleSerializer
0305 
0306 } // namespace Internal
0307 } // namespace ROOT
0308 
0309 #endif // ROOT_RNTupleSerialize