![]() |
|
|||
File indexing completed on 2025-08-28 08:26:55
// Licensed to the Apache Software Foundation (ASF) under one
// or more contributor license agreements. See the NOTICE file
// distributed with this work for additional information
// regarding copyright ownership. The ASF licenses this file
// to you under the Apache License, Version 2.0 (the
// "License"); you may not use this file except in compliance
// with the License. You may obtain a copy of the License at
//
//   http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing,
// software distributed under the License is distributed on an
// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
// KIND, either express or implied. See the License for the
// specific language governing permissions and limitations
// under the License.

/// \file abi.h Arrow C Data Interface
///
/// The Arrow C Data interface defines a very small, stable set
/// of C definitions which can be easily copied into any project's
/// source code and vendored to be used for columnar data interchange
/// in the Arrow format. For non-C/C++ languages and runtimes,
/// it should be almost as easy to translate the C definitions into
/// the corresponding C FFI declarations.
///
/// Applications and libraries can therefore work with Arrow memory
/// without necessarily using the Arrow libraries or reinventing
/// the wheel. Developers can choose between tight integration
/// with the Arrow software project or minimal integration with
/// the Arrow format only.
#pragma once

#include <stdint.h>

// Spec and documentation: https://arrow.apache.org/docs/format/CDataInterface.html

#ifdef __cplusplus
extern "C" {
#endif

#ifndef ARROW_C_DATA_INTERFACE
#  define ARROW_C_DATA_INTERFACE

// Schema flag values (see the spec linked above for their meaning).
#  define ARROW_FLAG_DICTIONARY_ORDERED 1
#  define ARROW_FLAG_NULLABLE 2
#  define ARROW_FLAG_MAP_KEYS_SORTED 4

// Type description of an array. Field semantics are defined by the spec
// linked above.
struct ArrowSchema {
  // Array type description
  const char* format;
  const char* name;
  const char* metadata;
  int64_t flags;
  int64_t n_children;
  struct ArrowSchema** children;
  struct ArrowSchema* dictionary;

  // Release callback
  void (*release)(struct ArrowSchema*);
  // Opaque producer-specific data
  void* private_data;
};

// Data description of an array. Field semantics are defined by the spec
// linked above.
struct ArrowArray {
  // Array data description
  int64_t length;
  int64_t null_count;
  int64_t offset;
  int64_t n_buffers;
  int64_t n_children;
  const void** buffers;
  struct ArrowArray** children;
  struct ArrowArray* dictionary;

  // Release callback
  void (*release)(struct ArrowArray*);
  // Opaque producer-specific data
  void* private_data;
};

// Statistics key names. Each statistic has an ":exact" and an ":approximate"
// variant.
#  define ARROW_STATISTICS_KEY_AVERAGE_BYTE_WIDTH_EXACT "ARROW:average_byte_width:exact"
#  define ARROW_STATISTICS_KEY_AVERAGE_BYTE_WIDTH_APPROXIMATE \
    "ARROW:average_byte_width:approximate"
#  define ARROW_STATISTICS_KEY_DISTINCT_COUNT_EXACT "ARROW:distinct_count:exact"
#  define ARROW_STATISTICS_KEY_DISTINCT_COUNT_APPROXIMATE \
    "ARROW:distinct_count:approximate"
#  define ARROW_STATISTICS_KEY_MAX_BYTE_WIDTH_EXACT "ARROW:max_byte_width:exact"
#  define ARROW_STATISTICS_KEY_MAX_BYTE_WIDTH_APPROXIMATE \
    "ARROW:max_byte_width:approximate"
#  define ARROW_STATISTICS_KEY_MAX_VALUE_EXACT "ARROW:max_value:exact"
#  define ARROW_STATISTICS_KEY_MAX_VALUE_APPROXIMATE "ARROW:max_value:approximate"
#  define ARROW_STATISTICS_KEY_MIN_VALUE_EXACT "ARROW:min_value:exact"
#  define ARROW_STATISTICS_KEY_MIN_VALUE_APPROXIMATE "ARROW:min_value:approximate"
#  define ARROW_STATISTICS_KEY_NULL_COUNT_EXACT "ARROW:null_count:exact"
#  define ARROW_STATISTICS_KEY_NULL_COUNT_APPROXIMATE "ARROW:null_count:approximate"
#  define ARROW_STATISTICS_KEY_ROW_COUNT_EXACT "ARROW:row_count:exact"
#  define ARROW_STATISTICS_KEY_ROW_COUNT_APPROXIMATE "ARROW:row_count:approximate"

#endif  // ARROW_C_DATA_INTERFACE

#ifndef ARROW_C_DEVICE_DATA_INTERFACE
#  define ARROW_C_DEVICE_DATA_INTERFACE

// Spec and Documentation: https://arrow.apache.org/docs/format/CDeviceDataInterface.html

// DeviceType for the allocated memory
typedef int32_t ArrowDeviceType;

// CPU device, same as using ArrowArray directly
#  define ARROW_DEVICE_CPU 1
// CUDA GPU Device
#  define ARROW_DEVICE_CUDA 2
// Pinned CUDA CPU memory by cudaMallocHost
#  define ARROW_DEVICE_CUDA_HOST 3
// OpenCL Device
#  define ARROW_DEVICE_OPENCL 4
// Vulkan buffer for next-gen graphics
#  define ARROW_DEVICE_VULKAN 7
// Metal for Apple GPU
#  define ARROW_DEVICE_METAL 8
// Verilog simulator buffer
#  define ARROW_DEVICE_VPI 9
// ROCm GPUs for AMD GPUs
#  define ARROW_DEVICE_ROCM 10
// Pinned ROCm CPU memory allocated by hipMallocHost
#  define ARROW_DEVICE_ROCM_HOST 11
// Reserved for extension
#  define ARROW_DEVICE_EXT_DEV 12
// CUDA managed/unified memory allocated by cudaMallocManaged
#  define ARROW_DEVICE_CUDA_MANAGED 13
// unified shared memory allocated on a oneAPI non-partitioned device.
#  define ARROW_DEVICE_ONEAPI 14
// GPU support for next-gen WebGPU standard
#  define ARROW_DEVICE_WEBGPU 15
// Qualcomm Hexagon DSP
#  define ARROW_DEVICE_HEXAGON 16

struct ArrowDeviceArray {
  // the Allocated Array
  //
  // the buffers in the array (along with the buffers of any
  // children) are what is allocated on the device.
  struct ArrowArray array;
  // The device id to identify a specific device
  int64_t device_id;
  // The type of device which can access this memory.
  ArrowDeviceType device_type;
  // An event-like object to synchronize on if needed.
  void* sync_event;
  // Reserved bytes for future expansion.
  int64_t reserved[3];
};

#endif  // ARROW_C_DEVICE_DATA_INTERFACE

#ifndef ARROW_C_STREAM_INTERFACE
#  define ARROW_C_STREAM_INTERFACE

struct ArrowArrayStream {
  // Callback to get the stream type
  // (will be the same for all arrays in the stream).
  //
  // Return value: 0 if successful, an `errno`-compatible error code otherwise.
  //
  // If successful, the ArrowSchema must be released independently from the stream.
  int (*get_schema)(struct ArrowArrayStream*, struct ArrowSchema* out);

  // Callback to get the next array
  // (if no error and the array is released, the stream has ended)
  //
  // Return value: 0 if successful, an `errno`-compatible error code otherwise.
  //
  // If successful, the ArrowArray must be released independently from the stream.
  int (*get_next)(struct ArrowArrayStream*, struct ArrowArray* out);

  // Callback to get optional detailed error information.
  // This must only be called if the last stream operation failed
  // with a non-0 return code.
  //
  // Return value: pointer to a null-terminated character array describing
  // the last error, or NULL if no description is available.
  //
  // The returned pointer is only valid until the next operation on this stream
  // (including release).
  const char* (*get_last_error)(struct ArrowArrayStream*);

  // Release callback: release the stream's own resources.
  // Note that arrays returned by `get_next` must be individually released.
  void (*release)(struct ArrowArrayStream*);

  // Opaque producer-specific data
  void* private_data;
};

#endif  // ARROW_C_STREAM_INTERFACE

#ifndef ARROW_C_DEVICE_STREAM_INTERFACE
#  define ARROW_C_DEVICE_STREAM_INTERFACE

// Equivalent to ArrowArrayStream, but for ArrowDeviceArrays.
//
// This stream is intended to provide a stream of data on a single
// device, if a producer wants data to be produced on multiple devices
// then multiple streams should be provided. One per device.
struct ArrowDeviceArrayStream {
  // The device that this stream produces data on.
  ArrowDeviceType device_type;

  // Callback to get the stream schema
  // (will be the same for all arrays in the stream).
  //
  // Return value: 0 if successful, an `errno`-compatible error code otherwise.
  //
  // If successful, the ArrowSchema must be released independently from the stream.
  // The schema should be accessible via CPU memory.
  int (*get_schema)(struct ArrowDeviceArrayStream* self, struct ArrowSchema* out);

  // Callback to get the next array
  // (if no error and the array is released, the stream has ended)
  //
  // Return value: 0 if successful, an `errno`-compatible error code otherwise.
  //
  // If successful, the ArrowDeviceArray must be released independently from the stream.
  int (*get_next)(struct ArrowDeviceArrayStream* self, struct ArrowDeviceArray* out);

  // Callback to get optional detailed error information.
  // This must only be called if the last stream operation failed
  // with a non-0 return code.
  //
  // Return value: pointer to a null-terminated character array describing
  // the last error, or NULL if no description is available.
  //
  // The returned pointer is only valid until the next operation on this stream
  // (including release).
  const char* (*get_last_error)(struct ArrowDeviceArrayStream* self);

  // Release callback: release the stream's own resources.
  // Note that arrays returned by `get_next` must be individually released.
  void (*release)(struct ArrowDeviceArrayStream* self);

  // Opaque producer-specific data
  void* private_data;
};

#endif  // ARROW_C_DEVICE_STREAM_INTERFACE

#ifndef ARROW_C_ASYNC_STREAM_INTERFACE
#  define ARROW_C_ASYNC_STREAM_INTERFACE

// EXPERIMENTAL: ArrowAsyncTask represents available data from a producer that was passed
// to an invocation of `on_next_task` on the ArrowAsyncDeviceStreamHandler.
//
// The reason for this Task approach instead of the Async interface returning
// the Array directly is to allow for more complex thread handling and reducing
// context switching and data transfers between CPU cores (e.g. from one L1/L2
// cache to another) if desired.
//
// For example, the `on_next_task` callback can be called when data is ready, while
// the producer puts potential "decoding" logic in the `ArrowAsyncTask` object. This
// allows for the producer to manage the I/O on one thread which calls `on_next_task`
// and the consumer can determine when the decoding (producer logic in the `extract_data`
// callback of the task) occurs and on which thread, to avoid a CPU core transfer
// (data staying in the L2 cache).
struct ArrowAsyncTask {
  // This callback should populate the ArrowDeviceArray associated with this task.
  // The order of ArrowAsyncTasks provided by the producer enables a consumer to
  // ensure the order of data to process.
  //
  // This function is expected to be synchronous, but should not perform any blocking
  // I/O. Ideally it should be as cheap as possible so as to not tie up the consumer
  // thread unnecessarily.
  //
  // Returns: 0 if successful, errno-compatible error otherwise.
  //
  // If a non-0 value is returned then it should be followed by a call to `on_error`
  // on the appropriate ArrowAsyncDeviceStreamHandler. This is because it's highly
  // likely that whatever is calling this function may be entirely disconnected from
  // the current control flow. Indicating an error here with a non-zero return allows
  // the current flow to be aware of the error occurring, while still allowing any
  // logging or error handling to still be centralized in the `on_error` callback of
  // the original Async handler.
  //
  // Rather than a release callback, any required cleanup should be performed as part
  // of the invocation of `extract_data`. Ownership of the Array is passed to the consumer
  // calling this, and so it must be released separately.
  //
  // It is only valid to call this method exactly once.
  int (*extract_data)(struct ArrowAsyncTask* self, struct ArrowDeviceArray* out);

  // opaque task-specific data
  void* private_data;
};

// EXPERIMENTAL: ArrowAsyncProducer represents a 1-to-1 relationship between an async
// producer and consumer. This object allows the consumer to perform backpressure and flow
// control on the asynchronous stream processing. This object must be owned by the
// producer who creates it, and thus is responsible for cleaning it up.
struct ArrowAsyncProducer {
  // The device type that this stream produces data on.
  ArrowDeviceType device_type;

  // A consumer must call this function to start receiving on_next_task calls.
  //
  // It *must* be valid to call this synchronously from within `on_next_task` or
  // `on_schema`, but this function *must not* immediately call `on_next_task` so as
  // to avoid recursion and reentrant callbacks.
  //
  // After cancel has been called, additional calls to this function must be NOPs,
  // but allowed. While not cancelled, calling this function must register the
  // given number of additional arrays/batches to be produced with the producer.
  // The producer should only call `on_next_task` at most the registered number
  // of arrays before propagating backpressure.
  //
  // Any error encountered by calling request must be propagated by calling the `on_error`
  // callback of the ArrowAsyncDeviceStreamHandler.
  //
  // While not cancelled, any subsequent calls to `on_next_task`, `on_error` or
  // `release` should be scheduled by the producer to be called later.
  //
  // It is invalid for a consumer to call this with a value of n <= 0, producers should
  // error if given such a value.
  void (*request)(struct ArrowAsyncProducer* self, int64_t n);

  // This cancel callback signals a producer that it must eventually stop making calls
  // to on_next_task. It must be idempotent and thread-safe. After calling cancel once,
  // subsequent calls must be NOPs. This must not call any consumer-side handlers other
  // than `on_error`.
  //
  // It is not required that calling cancel affect the producer immediately, only that it
  // must eventually stop calling on_next_task and subsequently call release on the
  // async handler. As such, a consumer must be prepared to receive one or more calls to
  // `on_next_task` even after calling cancel if there are still requested arrays pending.
  //
  // Successful cancellation should *not* result in the producer calling `on_error`, it
  // should finish out any remaining tasks and eventually call `release`.
  //
  // Any error encountered during handling a call to cancel must be reported via the
  // on_error callback on the async stream handler.
  void (*cancel)(struct ArrowAsyncProducer* self);

  // Any additional metadata tied to a specific stream of data. This must either be NULL
  // or a valid pointer to metadata which is encoded in the same way schema metadata
  // would be. Non-null metadata must be valid for the lifetime of this object. As an
  // example a producer could use this to provide the total number of rows and/or batches
  // in the stream if known.
  const char* additional_metadata;

  // producer-specific opaque data.
  void* private_data;
};

// EXPERIMENTAL: Similar to ArrowDeviceArrayStream, except designed for an asynchronous
// style of interaction. While ArrowDeviceArrayStream provides producer
// defined callbacks, this is intended to be created by the consumer instead.
// The consumer passes this handler to the producer, which in turn uses the
// callbacks to inform the consumer of events in the stream.
struct ArrowAsyncDeviceStreamHandler {
  // Handler for receiving a schema. The passed in stream_schema must be
  // released or moved by the handler (producer is giving ownership of the schema to
  // the handler, but not ownership of the top level object itself).
  //
  // With the exception of an error occurring (on_error), this must be the first
  // callback function which is called by a producer and must only be called exactly
  // once. As such, the producer should provide a valid ArrowAsyncProducer instance
  // so the consumer can control the flow. See the documentation on ArrowAsyncProducer
  // for how it works. The ArrowAsyncProducer is owned by the producer who calls this
  // function and thus the producer is responsible for cleaning it up when calling
  // the release callback of this handler.
  //
  // If there is any additional metadata tied to this stream, it will be provided as
  // a non-null value for the `additional_metadata` field of the ArrowAsyncProducer
  // which will be valid at least until the release callback is called.
  //
  // Return value: 0 if successful, `errno`-compatible error otherwise
  //
  // A producer that receives a non-zero return here should stop producing and eventually
  // call release instead.
  int (*on_schema)(struct ArrowAsyncDeviceStreamHandler* self,
                   struct ArrowSchema* stream_schema);

  // Handler for receiving data. This is called when data is available providing an
  // ArrowAsyncTask struct to signify it. The producer indicates the end of the stream
  // by passing NULL as the value for the task rather than a valid pointer to a task.
  // The task object is only valid for the lifetime of this function call, if a consumer
  // wants to utilize it after this function returns, it must copy or move the contents
  // of it to a new ArrowAsyncTask object.
  //
  // The `request` callback of a provided ArrowAsyncProducer must be called in order
  // to start receiving calls to this handler.
  //
  // The metadata argument can be null or can be used by a producer
  // to pass arbitrary extra information to the consumer (such as total number
  // of rows, context info, or otherwise). The data should be passed using the same
  // encoding as the metadata within the ArrowSchema struct itself (defined in
  // the spec at
  // https://arrow.apache.org/docs/format/CDataInterface.html#c.ArrowSchema.metadata)
  //
  // If metadata is non-null then it only needs to exist for the lifetime of this call,
  // a consumer who wants it to live after that must copy it to ensure lifetime.
  //
  // A producer *must not* call this concurrently from multiple different threads.
  //
  // A consumer must be prepared to receive one or more calls to this callback even
  // after calling cancel on the corresponding ArrowAsyncProducer, as cancel does not
  // guarantee it happens immediately.
  //
  // Return value: 0 if successful, `errno`-compatible error otherwise.
  //
  // If the consumer returns a non-zero return from this method, that indicates to the
  // producer that it should stop propagating data as an error occurred. After receiving
  // such a return, the only interaction with this object is for the producer to call
  // the `release` callback.
  int (*on_next_task)(struct ArrowAsyncDeviceStreamHandler* self,
                      struct ArrowAsyncTask* task, const char* metadata);

  // Handler for encountering an error. The producer should call release after
  // this returns to clean up any resources. The `code` passed in can be any error
  // code that a producer wants, but should be errno-compatible for consistency.
  //
  // If the message or metadata are non-null, they will only last as long as this
  // function call. The consumer would need to perform a copy of the data if it is
  // necessary for them to live past the lifetime of this call.
  //
  // Error metadata should be encoded as with metadata in ArrowSchema, defined in
  // the spec at
  // https://arrow.apache.org/docs/format/CDataInterface.html#c.ArrowSchema.metadata
  //
  // It is valid for this to be called by a producer with or without a preceding call
  // to ArrowAsyncProducer.request.
  //
  // This callback must not call any methods of an ArrowAsyncProducer object.
  void (*on_error)(struct ArrowAsyncDeviceStreamHandler* self, int code,
                   const char* message, const char* metadata);

  // Release callback to release any resources for the handler. Should always be
  // called by a producer when it is done utilizing a handler. No callbacks should
  // be called after this is called.
  //
  // It is valid for the release callback to be called by a producer with or without
  // a preceding call to ArrowAsyncProducer.request.
  //
  // The release callback must not call any methods of an ArrowAsyncProducer object.
  void (*release)(struct ArrowAsyncDeviceStreamHandler* self);

  // MUST be populated by the producer BEFORE calling any callbacks other than release.
  // This provides the connection between a handler and its producer, and must exist until
  // the release callback is called.
  struct ArrowAsyncProducer* producer;

  // Opaque handler-specific data
  void* private_data;
};

#endif  // ARROW_C_ASYNC_STREAM_INTERFACE

#ifdef __cplusplus
}
#endif
[ Source navigation ] | [ Diff markup ] | [ Identifier search ] | [ general search ] |
This page was automatically generated by the 2.3.7 LXR engine. The LXR team |
![]() ![]() |