arrow/util/float16.h

0001 // Licensed to the Apache Software Foundation (ASF) under one
0002 // or more contributor license agreements.  See the NOTICE file
0003 // distributed with this work for additional information
0004 // regarding copyright ownership.  The ASF licenses this file
0005 // to you under the Apache License, Version 2.0 (the
0006 // "License"); you may not use this file except in compliance
0007 // with the License.  You may obtain a copy of the License at
0008 //
0009 //   http://www.apache.org/licenses/LICENSE-2.0
0010 //
0011 // Unless required by applicable law or agreed to in writing,
0012 // software distributed under the License is distributed on an
0013 // "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
0014 // KIND, either express or implied.  See the License for the
0015 // specific language governing permissions and limitations
0016 // under the License.
0017
0018 #pragma once
0019
0020 #include <array>
0021 #include <cstdint>
0022 #include <cstring>
0023 #include <iosfwd>
0024 #include <limits>
0025 #include <type_traits>
0026
0027 #include "arrow/util/endian.h"
0028 #include "arrow/util/macros.h"
0029 #include "arrow/util/ubsan.h"
0030 #include "arrow/util/visibility.h"
0031
0032 namespace arrow {
0033 namespace util {
0034
0035 /// \brief Class representing an IEEE half-precision float, encoded as a `uint16_t`
0036 ///
0037 /// The exact format is as follows (from LSB to MSB):
0038 /// - bits 0-10:  mantissa
0039 /// - bits 10-15: exponent
0040 /// - bit 15:     sign
0041 ///
0042 class ARROW_EXPORT Float16 {
0043  public:
0044   Float16() = default;
0045   explicit Float16(float f) : Float16(FromFloat(f)) {}
0046   explicit Float16(double d) : Float16(FromDouble(d)) {}
0047   template <typename T,
0048             typename std::enable_if_t<std::is_convertible_v<T, double>>* = NULLPTR>
0049   explicit Float16(T v) : Float16(static_cast<double>(v)) {}
0050
0051   /// \brief Create a `Float16` from its exact binary representation
0052   constexpr static Float16 FromBits(uint16_t bits) { return Float16{bits, bool{}}; }
0053   /// \brief Create a `Float16` from a 32-bit float (may lose precision)
0054   static Float16 FromFloat(float f);
0055   /// \brief Create a `Float16` from a 64-bit float (may lose precision)
0056   static Float16 FromDouble(double d);
0057
0058   /// \brief Read a `Float16` from memory in native-endian byte order
0059   static Float16 FromBytes(const uint8_t* src) {
0060     return FromBits(SafeLoadAs<uint16_t>(src));
0061   }
0062
0063   /// \brief Read a `Float16` from memory in little-endian byte order
0064   static Float16 FromLittleEndian(const uint8_t* src) {
0065     return FromBits(::arrow::bit_util::FromLittleEndian(SafeLoadAs<uint16_t>(src)));
0066   }
0067
0068   /// \brief Read a `Float16` from memory in big-endian byte order
0069   static Float16 FromBigEndian(const uint8_t* src) {
0070     return FromBits(::arrow::bit_util::FromBigEndian(SafeLoadAs<uint16_t>(src)));
0071   }
0072
0073   /// \brief Return the value's binary representation as a `uint16_t`
0074   constexpr uint16_t bits() const { return bits_; }
0075
0076   /// \brief Return true if the value is negative (sign bit is set)
0077   constexpr bool signbit() const { return (bits_ & 0x8000) != 0; }
0078
0079   /// \brief Return true if the value is NaN
0080   constexpr bool is_nan() const { return (bits_ & 0x7fff) > 0x7c00; }
0081   /// \brief Return true if the value is positive/negative infinity
0082   constexpr bool is_infinity() const { return (bits_ & 0x7fff) == 0x7c00; }
0083   /// \brief Return true if the value is finite and not NaN
0084   constexpr bool is_finite() const { return (bits_ & 0x7c00) != 0x7c00; }
0085   /// \brief Return true if the value is positive/negative zero
0086   constexpr bool is_zero() const { return (bits_ & 0x7fff) == 0; }
0087
0088   /// \brief Convert to a 32-bit float
0089   float ToFloat() const;
0090   /// \brief Convert to a 64-bit float
0091   double ToDouble() const;
0092
0093   explicit operator float() const { return ToFloat(); }
0094   explicit operator double() const { return ToDouble(); }
0095
0096   /// \brief Copy the value's bytes in native-endian byte order
0097   void ToBytes(uint8_t* dest) const { std::memcpy(dest, &bits_, sizeof(bits_)); }
0098   /// \brief Return the value's bytes in native-endian byte order
0099   constexpr std::array<uint8_t, 2> ToBytes() const {
0100 #if ARROW_LITTLE_ENDIAN
0101     return ToLittleEndian();
0102 #else
0103     return ToBigEndian();
0104 #endif
0105   }
0106
0107   /// \brief Copy the value's bytes in little-endian byte order
0108   void ToLittleEndian(uint8_t* dest) const {
0109     const auto bytes = ToLittleEndian();
0110     std::memcpy(dest, bytes.data(), bytes.size());
0111   }
0112   /// \brief Return the value's bytes in little-endian byte order
0113   constexpr std::array<uint8_t, 2> ToLittleEndian() const {
0114     return {uint8_t(bits_ & 0xff), uint8_t(bits_ >> 8)};
0115   }
0116
0117   /// \brief Copy the value's bytes in big-endian byte order
0118   void ToBigEndian(uint8_t* dest) const {
0119     const auto bytes = ToBigEndian();
0120     std::memcpy(dest, bytes.data(), bytes.size());
0121   }
0122   /// \brief Return the value's bytes in big-endian byte order
0123   constexpr std::array<uint8_t, 2> ToBigEndian() const {
0124     return {uint8_t(bits_ >> 8), uint8_t(bits_ & 0xff)};
0125   }
0126
0127   constexpr Float16 operator-() const { return FromBits(bits_ ^ 0x8000); }
0128   constexpr Float16 operator+() const { return FromBits(bits_); }
0129
0130   friend constexpr bool operator==(Float16 lhs, Float16 rhs) {
0131     if (lhs.is_nan() || rhs.is_nan()) return false;
0132     return Float16::CompareEq(lhs, rhs);
0133   }
0134   friend constexpr bool operator!=(Float16 lhs, Float16 rhs) { return !(lhs == rhs); }
0135
0136   friend constexpr bool operator<(Float16 lhs, Float16 rhs) {
0137     if (lhs.is_nan() || rhs.is_nan()) return false;
0138     return Float16::CompareLt(lhs, rhs);
0139   }
0140   friend constexpr bool operator>(Float16 lhs, Float16 rhs) { return rhs < lhs; }
0141
0142   friend constexpr bool operator<=(Float16 lhs, Float16 rhs) {
0143     if (lhs.is_nan() || rhs.is_nan()) return false;
0144     return !Float16::CompareLt(rhs, lhs);
0145   }
0146   friend constexpr bool operator>=(Float16 lhs, Float16 rhs) { return rhs <= lhs; }
0147
0148   ARROW_FRIEND_EXPORT friend std::ostream& operator<<(std::ostream& os, Float16 arg);
0149
0150  protected:
0151   uint16_t bits_;
0152
0153  private:
0154   constexpr Float16(uint16_t bits, bool) : bits_(bits) {}
0155
0156   // Comparison helpers that assume neither operand is NaN
0157   static constexpr bool CompareEq(Float16 lhs, Float16 rhs) {
0158     return (lhs.bits() == rhs.bits()) || (lhs.is_zero() && rhs.is_zero());
0159   }
0160   static constexpr bool CompareLt(Float16 lhs, Float16 rhs) {
0161     if (lhs.signbit()) {
0162       if (rhs.signbit()) {
0163         // Both are negative
0164         return lhs.bits() > rhs.bits();
0165       } else {
0166         // Handle +/-0
0167         return !lhs.is_zero() || rhs.bits() != 0;
0168       }
0169     } else if (rhs.signbit()) {
0170       return false;
0171     } else {
0172       // Both are positive
0173       return lhs.bits() < rhs.bits();
0174     }
0175   }
0176 };
0177
0178 static_assert(std::is_trivial_v<Float16>);
0179
0180 }  // namespace util
0181 }  // namespace arrow
0182
0183 // TODO: Not complete
0184 template <>
0185 class std::numeric_limits<arrow::util::Float16> {
0186   using T = arrow::util::Float16;
0187
0188  public:
0189   static constexpr bool is_specialized = true;
0190   static constexpr bool is_signed = true;
0191   static constexpr bool has_infinity = true;
0192   static constexpr bool has_quiet_NaN = true;
0193
0194   static constexpr T min() { return T::FromBits(0b0000010000000000); }
0195   static constexpr T max() { return T::FromBits(0b0111101111111111); }
0196   static constexpr T lowest() { return -max(); }
0197
0198   static constexpr T infinity() { return T::FromBits(0b0111110000000000); }
0199
0200   static constexpr T quiet_NaN() { return T::FromBits(0b0111111111111111); }
0201 };