arrow/util/value_parsing.h

0001 // Licensed to the Apache Software Foundation (ASF) under one
0002 // or more contributor license agreements.  See the NOTICE file
0003 // distributed with this work for additional information
0004 // regarding copyright ownership.  The ASF licenses this file
0005 // to you under the Apache License, Version 2.0 (the
0006 // "License"); you may not use this file except in compliance
0007 // with the License.  You may obtain a copy of the License at
0008 //
0009 //   http://www.apache.org/licenses/LICENSE-2.0
0010 //
0011 // Unless required by applicable law or agreed to in writing,
0012 // software distributed under the License is distributed on an
0013 // "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
0014 // KIND, either express or implied.  See the License for the
0015 // specific language governing permissions and limitations
0016 // under the License.
0017
0018 // This is a private header for string-to-number parsing utilities
0019
0020 #pragma once
0021
#include <algorithm>
#include <cassert>
#include <chrono>
#include <cstddef>
#include <cstdint>
#include <cstring>
#include <ctime>
#include <limits>
#include <memory>
#include <string>
#include <type_traits>
0030
0031 #include "arrow/type.h"
0032 #include "arrow/type_traits.h"
0033 #include "arrow/util/checked_cast.h"
0034 #include "arrow/util/config.h"
0035 #include "arrow/util/macros.h"
0036 #include "arrow/util/time.h"
0037 #include "arrow/util/visibility.h"
0038 #include "arrow/vendored/datetime.h"
0039 #include "arrow/vendored/strptime.h"
0040
0041 namespace arrow {
0042
/// \brief A virtual string to timestamp parser
///
/// Abstract interface: `operator()` and `kind()` are pure virtual. Instances
/// are obtained through the static factories MakeStrptime() and MakeISO8601().
class ARROW_EXPORT TimestampParser {
 public:
  virtual ~TimestampParser() = default;

  /// \brief Parse the `length` characters starting at `s` as a timestamp
  ///
  /// \param[in] s pointer to the characters to parse
  /// \param[in] length number of characters available at `s`
  /// \param[in] out_unit time unit requested for the parsed value
  /// \param[out] out destination for the parsed timestamp
  /// \param[out] out_zone_offset_present optional flag, defaults to NULLPTR;
  ///   presumably reports whether the input carried a zone offset (to be
  ///   confirmed against the concrete implementations)
  /// \return presumably true on a successful parse; the exact contract is
  ///   defined by the concrete implementations, which are not in this header
  virtual bool operator()(const char* s, size_t length, TimeUnit::type out_unit,
                          int64_t* out,
                          bool* out_zone_offset_present = NULLPTR) const = 0;

  /// \brief Name identifying the kind of parser (supplied by each implementation)
  virtual const char* kind() const = 0;

  /// \brief Format string associated with this parser
  ///
  /// Not pure virtual: a base implementation is defined outside this header.
  virtual const char* format() const;

  /// \brief Create a TimestampParser that recognizes strptime-like format strings
  static std::shared_ptr<TimestampParser> MakeStrptime(std::string format);

  /// \brief Create a TimestampParser that recognizes (locale-agnostic) ISO8601
  /// timestamps
  static std::shared_ptr<TimestampParser> MakeISO8601();
};
0063
0064 namespace internal {
0065
/// \brief The entry point for conversion from strings.
///
/// Specializations of StringConverter for `ARROW_TYPE` must define:
/// - A default constructible member type `value_type` which will be yielded on a
///   successful parse.
/// - The member function `Convert`, callable on a converter instance with signature
///   `(const ARROW_TYPE& t, const char* s, size_t length, value_type* out)`.
///   `Convert` returns truthy for successful parses and assigns the parsed values to
///   `*out`. Parameters required for parsing (for example a timestamp's TimeUnit)
///   are acquired from the type parameter `t`.
///
/// The converter itself must be default constructible: the ParseValue wrappers
/// in this header invoke `StringConverter<T>{}.Convert(...)`.
template <typename ARROW_TYPE, typename Enable = void>
struct StringConverter;
0078
/// \brief Compile-time check for whether `StringConverter<T>` can be used.
///
/// `value` is true when `StringConverter<T>::value_type` names a type, i.e. when
/// a StringConverter specialization providing `value_type` exists for `T`.
template <typename T>
struct is_parseable {
  // Viable only when StringConverter<U>::value_type is well-formed; otherwise
  // this overload is removed from consideration by SFINAE.
  template <typename U, typename = typename StringConverter<U>::value_type>
  static std::true_type Test(U*);

  // Fallback chosen when the overload above is not viable.
  template <typename U>
  static std::false_type Test(...);

  // Overload resolution on a null pointer argument selects one of the two
  // Test declarations; only the return type is inspected.
  static constexpr bool value = decltype(Test<T>(NULLPTR))::value;
};
0089
/// \brief SFINAE helper: `enable_if_t` keyed on `is_parseable<T>::value`, with
/// `R` (default `void`) as the resulting type.
template <typename T, typename R = void>
using enable_if_parseable = enable_if_t<is_parseable<T>::value, R>;
0092
0093 template <>
0094 struct StringConverter<BooleanType> {
0095   using value_type = bool;
0096
0097   bool Convert(const BooleanType&, const char* s, size_t length, value_type* out) {
0098     if (length == 1) {
0099       // "0" or "1"?
0100       if (s[0] == '0') {
0101         *out = false;
0102         return true;
0103       }
0104       if (s[0] == '1') {
0105         *out = true;
0106         return true;
0107       }
0108       return false;
0109     }
0110     if (length == 4) {
0111       // "true"?
0112       *out = true;
0113       return ((s[0] == 't' || s[0] == 'T') && (s[1] == 'r' || s[1] == 'R') &&
0114               (s[2] == 'u' || s[2] == 'U') && (s[3] == 'e' || s[3] == 'E'));
0115     }
0116     if (length == 5) {
0117       // "false"?
0118       *out = false;
0119       return ((s[0] == 'f' || s[0] == 'F') && (s[1] == 'a' || s[1] == 'A') &&
0120               (s[2] == 'l' || s[2] == 'L') && (s[3] == 's' || s[3] == 'S') &&
0121               (s[4] == 'e' || s[4] == 'E'));
0122     }
0123     return false;
0124   }
0125 };
0126
0127 // Ideas for faster float parsing:
0128 // - http://rapidjson.org/md_doc_internals.html#ParsingDouble
0129 // - https://github.com/google/double-conversion [used here]
0130 // - https://github.com/achan001/dtoa-fast
0131
// String-to-floating-point entry points, defined outside this header.
// Each takes `length` characters at `s` plus the character to treat as the
// decimal point, and returns a bool that the converters below forward as
// their parse status.

/// Overload producing a `float` (used by StringConverter<FloatType>).
ARROW_EXPORT
bool StringToFloat(const char* s, size_t length, char decimal_point, float* out);

/// Overload producing a `double` (used by StringConverter<DoubleType>).
ARROW_EXPORT
bool StringToFloat(const char* s, size_t length, char decimal_point, double* out);

/// Overload writing a `uint16_t` (used by StringConverter<HalfFloatType>);
/// presumably the raw half-float bit pattern -- confirm against the definition.
ARROW_EXPORT
bool StringToFloat(const char* s, size_t length, char decimal_point, uint16_t* out);
0140
0141 template <>
0142 struct StringConverter<FloatType> {
0143   using value_type = float;
0144
0145   explicit StringConverter(char decimal_point = '.') : decimal_point(decimal_point) {}
0146
0147   bool Convert(const FloatType&, const char* s, size_t length, value_type* out) {
0148     return ARROW_PREDICT_TRUE(StringToFloat(s, length, decimal_point, out));
0149   }
0150
0151  private:
0152   const char decimal_point;
0153 };
0154
0155 template <>
0156 struct StringConverter<DoubleType> {
0157   using value_type = double;
0158
0159   explicit StringConverter(char decimal_point = '.') : decimal_point(decimal_point) {}
0160
0161   bool Convert(const DoubleType&, const char* s, size_t length, value_type* out) {
0162     return ARROW_PREDICT_TRUE(StringToFloat(s, length, decimal_point, out));
0163   }
0164
0165  private:
0166   const char decimal_point;
0167 };
0168
0169 template <>
0170 struct StringConverter<HalfFloatType> {
0171   using value_type = uint16_t;
0172
0173   explicit StringConverter(char decimal_point = '.') : decimal_point(decimal_point) {}
0174
0175   bool Convert(const HalfFloatType&, const char* s, size_t length, value_type* out) {
0176     return ARROW_PREDICT_TRUE(StringToFloat(s, length, decimal_point, out));
0177   }
0178
0179  private:
0180   const char decimal_point;
0181 };
0182
0183 // NOTE: HalfFloatType would require a half<->float conversion library
0184
// Distance of `c` from the character '0', truncated to 8 bits. Decimal digits
// map to 0..9; any other character yields a value above 9, which callers use
// to detect non-digits.
inline uint8_t ParseDecimalDigit(char c) {
  const int offset_from_zero = c - '0';
  return static_cast<uint8_t>(offset_from_zero);
}
0186
// One unrolled step of decimal parsing, for every digit position except the
// last one a C_TYPE can hold. Expects `s`, `length` and `result` in the
// enclosing scope and must be expanded inside a loop (it uses `break`).
// - If input remains: consume one character, multiply `result` by ten and add
//   the digit. Returns false from the enclosing function on a non-digit.
// - If input is exhausted: `break` out of the enclosing loop.
// No overflow check is made here; the overflow checks live in
// PARSE_UNSIGNED_ITERATION_LAST.
#define PARSE_UNSIGNED_ITERATION(C_TYPE)          \
  if (length > 0) {                               \
    uint8_t digit = ParseDecimalDigit(*s++);      \
    result = static_cast<C_TYPE>(result * 10U);   \
    length--;                                     \
    if (ARROW_PREDICT_FALSE(digit > 9U)) {        \
      /* Non-digit */                             \
      return false;                               \
    }                                             \
    result = static_cast<C_TYPE>(result + digit); \
  } else {                                        \
    break;                                        \
  }
0200
// Final unrolled step of decimal parsing: handles the last digit position of
// a C_TYPE, where overflow becomes possible. Expects `s`, `length` and
// `result` in the enclosing scope. If input remains, the enclosing function
// returns false when:
// - multiplying `result` by ten would exceed the maximum of C_TYPE,
// - characters remain after this one (too many digits),
// - the character is not a decimal digit,
// - adding the digit wraps around (detected by new_result < result).
// Otherwise `result` is updated with the final digit.
#define PARSE_UNSIGNED_ITERATION_LAST(C_TYPE)                                     \
  if (length > 0) {                                                               \
    if (ARROW_PREDICT_FALSE(result > std::numeric_limits<C_TYPE>::max() / 10U)) { \
      /* Overflow */                                                              \
      return false;                                                               \
    }                                                                             \
    uint8_t digit = ParseDecimalDigit(*s++);                                      \
    result = static_cast<C_TYPE>(result * 10U);                                   \
    C_TYPE new_result = static_cast<C_TYPE>(result + digit);                      \
    if (ARROW_PREDICT_FALSE(--length > 0)) {                                      \
      /* Too many digits */                                                       \
      return false;                                                               \
    }                                                                             \
    if (ARROW_PREDICT_FALSE(digit > 9U)) {                                        \
      /* Non-digit */                                                             \
      return false;                                                               \
    }                                                                             \
    if (ARROW_PREDICT_FALSE(new_result < result)) {                               \
      /* Overflow */                                                              \
      return false;                                                               \
    }                                                                             \
    result = new_result;                                                          \
  }
0224
// Parse up to 3 decimal digits at `s` into a uint8_t.
// Returns false on a non-digit character, on more than 3 characters, or when
// the value does not fit; `*out` is only written on success.
// An empty input (length == 0) succeeds and yields 0.
inline bool ParseUnsigned(const char* s, size_t length, uint8_t* out) {
  uint8_t result = 0;

  // The do/while(false) gives the iteration macros a loop to `break` out of
  // once the input is exhausted.
  do {
    PARSE_UNSIGNED_ITERATION(uint8_t);
    PARSE_UNSIGNED_ITERATION(uint8_t);
    PARSE_UNSIGNED_ITERATION_LAST(uint8_t);
  } while (false);
  *out = result;
  return true;
}
0236
// Parse up to 5 decimal digits at `s` into a uint16_t.
// Returns false on a non-digit character, on more than 5 characters, or when
// the value does not fit; `*out` is only written on success.
// An empty input (length == 0) succeeds and yields 0.
inline bool ParseUnsigned(const char* s, size_t length, uint16_t* out) {
  uint16_t result = 0;
  // The do/while(false) gives the iteration macros a loop to `break` out of.
  do {
    PARSE_UNSIGNED_ITERATION(uint16_t);
    PARSE_UNSIGNED_ITERATION(uint16_t);
    PARSE_UNSIGNED_ITERATION(uint16_t);
    PARSE_UNSIGNED_ITERATION(uint16_t);
    PARSE_UNSIGNED_ITERATION_LAST(uint16_t);
  } while (false);
  *out = result;
  return true;
}
0249
// Parse up to 10 decimal digits at `s` into a uint32_t.
// Returns false on a non-digit character, on more than 10 characters, or when
// the value does not fit; `*out` is only written on success.
// An empty input (length == 0) succeeds and yields 0.
inline bool ParseUnsigned(const char* s, size_t length, uint32_t* out) {
  uint32_t result = 0;
  // The do/while(false) gives the iteration macros a loop to `break` out of.
  do {
    PARSE_UNSIGNED_ITERATION(uint32_t);
    PARSE_UNSIGNED_ITERATION(uint32_t);
    PARSE_UNSIGNED_ITERATION(uint32_t);
    PARSE_UNSIGNED_ITERATION(uint32_t);
    PARSE_UNSIGNED_ITERATION(uint32_t);

    PARSE_UNSIGNED_ITERATION(uint32_t);
    PARSE_UNSIGNED_ITERATION(uint32_t);
    PARSE_UNSIGNED_ITERATION(uint32_t);
    PARSE_UNSIGNED_ITERATION(uint32_t);

    PARSE_UNSIGNED_ITERATION_LAST(uint32_t);
  } while (false);
  *out = result;
  return true;
}
0269
// Parse up to 20 decimal digits at `s` into a uint64_t.
// Returns false on a non-digit character, on more than 20 characters, or when
// the value does not fit; `*out` is only written on success.
// An empty input (length == 0) succeeds and yields 0.
inline bool ParseUnsigned(const char* s, size_t length, uint64_t* out) {
  uint64_t result = 0;
  // The do/while(false) gives the iteration macros a loop to `break` out of.
  do {
    PARSE_UNSIGNED_ITERATION(uint64_t);
    PARSE_UNSIGNED_ITERATION(uint64_t);
    PARSE_UNSIGNED_ITERATION(uint64_t);
    PARSE_UNSIGNED_ITERATION(uint64_t);
    PARSE_UNSIGNED_ITERATION(uint64_t);

    PARSE_UNSIGNED_ITERATION(uint64_t);
    PARSE_UNSIGNED_ITERATION(uint64_t);
    PARSE_UNSIGNED_ITERATION(uint64_t);
    PARSE_UNSIGNED_ITERATION(uint64_t);
    PARSE_UNSIGNED_ITERATION(uint64_t);

    PARSE_UNSIGNED_ITERATION(uint64_t);
    PARSE_UNSIGNED_ITERATION(uint64_t);
    PARSE_UNSIGNED_ITERATION(uint64_t);
    PARSE_UNSIGNED_ITERATION(uint64_t);
    PARSE_UNSIGNED_ITERATION(uint64_t);

    PARSE_UNSIGNED_ITERATION(uint64_t);
    PARSE_UNSIGNED_ITERATION(uint64_t);
    PARSE_UNSIGNED_ITERATION(uint64_t);
    PARSE_UNSIGNED_ITERATION(uint64_t);

    PARSE_UNSIGNED_ITERATION_LAST(uint64_t);
  } while (false);
  *out = result;
  return true;
}
0301
0302 #undef PARSE_UNSIGNED_ITERATION
0303 #undef PARSE_UNSIGNED_ITERATION_LAST
0304
0305 template <typename T>
0306 bool ParseHex(const char* s, size_t length, T* out) {
0307   // lets make sure that the length of the string is not too big
0308   if (!ARROW_PREDICT_TRUE(sizeof(T) * 2 >= length && length > 0)) {
0309     return false;
0310   }
0311   T result = 0;
0312   for (size_t i = 0; i < length; i++) {
0313     result = static_cast<T>(result << 4);
0314     if (s[i] >= '0' && s[i] <= '9') {
0315       result = static_cast<T>(result | (s[i] - '0'));
0316     } else if (s[i] >= 'A' && s[i] <= 'F') {
0317       result = static_cast<T>(result | (s[i] - 'A' + 10));
0318     } else if (s[i] >= 'a' && s[i] <= 'f') {
0319       result = static_cast<T>(result | (s[i] - 'a' + 10));
0320     } else {
0321       /* Non-digit */
0322       return false;
0323     }
0324   }
0325   *out = result;
0326   return true;
0327 }
0328
0329 template <class ARROW_TYPE>
0330 struct StringToUnsignedIntConverterMixin {
0331   using value_type = typename ARROW_TYPE::c_type;
0332
0333   bool Convert(const ARROW_TYPE&, const char* s, size_t length, value_type* out) {
0334     if (ARROW_PREDICT_FALSE(length == 0)) {
0335       return false;
0336     }
0337     // If it starts with 0x then its hex
0338     if (length > 2 && s[0] == '0' && ((s[1] == 'x') || (s[1] == 'X'))) {
0339       length -= 2;
0340       s += 2;
0341
0342       return ARROW_PREDICT_TRUE(ParseHex(s, length, out));
0343     }
0344     // Skip leading zeros
0345     while (length > 0 && *s == '0') {
0346       length--;
0347       s++;
0348     }
0349     return ParseUnsigned(s, length, out);
0350   }
0351 };
0352
// StringConverter for UInt8Type: `value_type` and `Convert` come from the
// unsigned-integer mixin; the using-declaration inherits its constructors.
template <>
struct StringConverter<UInt8Type> : public StringToUnsignedIntConverterMixin<UInt8Type> {
  using StringToUnsignedIntConverterMixin<UInt8Type>::StringToUnsignedIntConverterMixin;
};
0357
// StringConverter for UInt16Type: `value_type` and `Convert` come from the
// unsigned-integer mixin; the using-declaration inherits its constructors.
template <>
struct StringConverter<UInt16Type>
    : public StringToUnsignedIntConverterMixin<UInt16Type> {
  using StringToUnsignedIntConverterMixin<UInt16Type>::StringToUnsignedIntConverterMixin;
};
0363
// StringConverter for UInt32Type: `value_type` and `Convert` come from the
// unsigned-integer mixin; the using-declaration inherits its constructors.
template <>
struct StringConverter<UInt32Type>
    : public StringToUnsignedIntConverterMixin<UInt32Type> {
  using StringToUnsignedIntConverterMixin<UInt32Type>::StringToUnsignedIntConverterMixin;
};
0369
// StringConverter for UInt64Type: `value_type` and `Convert` come from the
// unsigned-integer mixin; the using-declaration inherits its constructors.
template <>
struct StringConverter<UInt64Type>
    : public StringToUnsignedIntConverterMixin<UInt64Type> {
  using StringToUnsignedIntConverterMixin<UInt64Type>::StringToUnsignedIntConverterMixin;
};
0375
0376 template <class ARROW_TYPE>
0377 struct StringToSignedIntConverterMixin {
0378   using value_type = typename ARROW_TYPE::c_type;
0379   using unsigned_type = typename std::make_unsigned<value_type>::type;
0380
0381   bool Convert(const ARROW_TYPE&, const char* s, size_t length, value_type* out) {
0382     static constexpr auto max_positive =
0383         static_cast<unsigned_type>(std::numeric_limits<value_type>::max());
0384     // Assuming two's complement
0385     static constexpr unsigned_type max_negative = max_positive + 1;
0386     bool negative = false;
0387     unsigned_type unsigned_value = 0;
0388
0389     if (ARROW_PREDICT_FALSE(length == 0)) {
0390       return false;
0391     }
0392     // If it starts with 0x then its hex
0393     if (length > 2 && s[0] == '0' && ((s[1] == 'x') || (s[1] == 'X'))) {
0394       length -= 2;
0395       s += 2;
0396
0397       if (!ARROW_PREDICT_TRUE(ParseHex(s, length, &unsigned_value))) {
0398         return false;
0399       }
0400       *out = static_cast<value_type>(unsigned_value);
0401       return true;
0402     }
0403
0404     if (*s == '-') {
0405       negative = true;
0406       s++;
0407       if (--length == 0) {
0408         return false;
0409       }
0410     }
0411     // Skip leading zeros
0412     while (length > 0 && *s == '0') {
0413       length--;
0414       s++;
0415     }
0416     if (!ARROW_PREDICT_TRUE(ParseUnsigned(s, length, &unsigned_value))) {
0417       return false;
0418     }
0419     if (negative) {
0420       if (ARROW_PREDICT_FALSE(unsigned_value > max_negative)) {
0421         return false;
0422       }
0423       // To avoid both compiler warnings (with unsigned negation)
0424       // and undefined behaviour (with signed negation overflow),
0425       // use the expanded formula for 2's complement negation.
0426       *out = static_cast<value_type>(~unsigned_value + 1);
0427     } else {
0428       if (ARROW_PREDICT_FALSE(unsigned_value > max_positive)) {
0429         return false;
0430       }
0431       *out = static_cast<value_type>(unsigned_value);
0432     }
0433     return true;
0434   }
0435 };
0436
// StringConverter for Int8Type: `value_type` and `Convert` come from the
// signed-integer mixin; the using-declaration inherits its constructors.
template <>
struct StringConverter<Int8Type> : public StringToSignedIntConverterMixin<Int8Type> {
  using StringToSignedIntConverterMixin<Int8Type>::StringToSignedIntConverterMixin;
};
0441
// StringConverter for Int16Type: `value_type` and `Convert` come from the
// signed-integer mixin; the using-declaration inherits its constructors.
template <>
struct StringConverter<Int16Type> : public StringToSignedIntConverterMixin<Int16Type> {
  using StringToSignedIntConverterMixin<Int16Type>::StringToSignedIntConverterMixin;
};
0446
// StringConverter for Int32Type: `value_type` and `Convert` come from the
// signed-integer mixin; the using-declaration inherits its constructors.
template <>
struct StringConverter<Int32Type> : public StringToSignedIntConverterMixin<Int32Type> {
  using StringToSignedIntConverterMixin<Int32Type>::StringToSignedIntConverterMixin;
};
0451
// StringConverter for Int64Type: `value_type` and `Convert` come from the
// signed-integer mixin; the using-declaration inherits its constructors.
template <>
struct StringConverter<Int64Type> : public StringToSignedIntConverterMixin<Int64Type> {
  using StringToSignedIntConverterMixin<Int64Type>::StringToSignedIntConverterMixin;
};
0456
0457 namespace detail {
0458
0459 // Inline-able ISO-8601 parser
0460
// Underlying C type used to store TimestampType values
using ts_type = TimestampType::c_type;
0462
0463 template <typename Duration>
0464 static inline bool ParseHH(const char* s, Duration* out) {
0465   uint8_t hours = 0;
0466   if (ARROW_PREDICT_FALSE(!ParseUnsigned(s + 0, 2, &hours))) {
0467     return false;
0468   }
0469   if (ARROW_PREDICT_FALSE(hours >= 24)) {
0470     return false;
0471   }
0472   *out = std::chrono::duration_cast<Duration>(std::chrono::hours(hours));
0473   return true;
0474 }
0475
0476 template <typename Duration>
0477 static inline bool ParseHH_MM(const char* s, Duration* out) {
0478   uint8_t hours = 0;
0479   uint8_t minutes = 0;
0480   if (ARROW_PREDICT_FALSE(s[2] != ':')) {
0481     return false;
0482   }
0483   if (ARROW_PREDICT_FALSE(!ParseUnsigned(s + 0, 2, &hours))) {
0484     return false;
0485   }
0486   if (ARROW_PREDICT_FALSE(!ParseUnsigned(s + 3, 2, &minutes))) {
0487     return false;
0488   }
0489   if (ARROW_PREDICT_FALSE(hours >= 24)) {
0490     return false;
0491   }
0492   if (ARROW_PREDICT_FALSE(minutes >= 60)) {
0493     return false;
0494   }
0495   *out = std::chrono::duration_cast<Duration>(std::chrono::hours(hours) +
0496                                               std::chrono::minutes(minutes));
0497   return true;
0498 }
0499
0500 template <typename Duration>
0501 static inline bool ParseHHMM(const char* s, Duration* out) {
0502   uint8_t hours = 0;
0503   uint8_t minutes = 0;
0504   if (ARROW_PREDICT_FALSE(!ParseUnsigned(s + 0, 2, &hours))) {
0505     return false;
0506   }
0507   if (ARROW_PREDICT_FALSE(!ParseUnsigned(s + 2, 2, &minutes))) {
0508     return false;
0509   }
0510   if (ARROW_PREDICT_FALSE(hours >= 24)) {
0511     return false;
0512   }
0513   if (ARROW_PREDICT_FALSE(minutes >= 60)) {
0514     return false;
0515   }
0516   *out = std::chrono::duration_cast<Duration>(std::chrono::hours(hours) +
0517                                               std::chrono::minutes(minutes));
0518   return true;
0519 }
0520
0521 template <typename Duration>
0522 static inline bool ParseHH_MM_SS(const char* s, Duration* out) {
0523   uint8_t hours = 0;
0524   uint8_t minutes = 0;
0525   uint8_t seconds = 0;
0526   if (ARROW_PREDICT_FALSE(s[2] != ':') || ARROW_PREDICT_FALSE(s[5] != ':')) {
0527     return false;
0528   }
0529   if (ARROW_PREDICT_FALSE(!ParseUnsigned(s + 0, 2, &hours))) {
0530     return false;
0531   }
0532   if (ARROW_PREDICT_FALSE(!ParseUnsigned(s + 3, 2, &minutes))) {
0533     return false;
0534   }
0535   if (ARROW_PREDICT_FALSE(!ParseUnsigned(s + 6, 2, &seconds))) {
0536     return false;
0537   }
0538   if (ARROW_PREDICT_FALSE(hours >= 24)) {
0539     return false;
0540   }
0541   if (ARROW_PREDICT_FALSE(minutes >= 60)) {
0542     return false;
0543   }
0544   if (ARROW_PREDICT_FALSE(seconds >= 60)) {
0545     return false;
0546   }
0547   *out = std::chrono::duration_cast<Duration>(std::chrono::hours(hours) +
0548                                               std::chrono::minutes(minutes) +
0549                                               std::chrono::seconds(seconds));
0550   return true;
0551 }
0552
0553 static inline bool ParseSubSeconds(const char* s, size_t length, TimeUnit::type unit,
0554                                    uint32_t* out) {
0555   // The decimal point has been peeled off at this point
0556
0557   // Fail if number of decimal places provided exceeds what the unit can hold.
0558   // Calculate how many trailing decimal places are omitted for the unit
0559   // e.g. if 4 decimal places are provided and unit is MICRO, 2 are missing
0560   size_t omitted = 0;
0561   switch (unit) {
0562     case TimeUnit::MILLI:
0563       if (ARROW_PREDICT_FALSE(length > 3)) {
0564         return false;
0565       }
0566       if (length < 3) {
0567         omitted = 3 - length;
0568       }
0569       break;
0570     case TimeUnit::MICRO:
0571       if (ARROW_PREDICT_FALSE(length > 6)) {
0572         return false;
0573       }
0574       if (length < 6) {
0575         omitted = 6 - length;
0576       }
0577       break;
0578     case TimeUnit::NANO:
0579       if (ARROW_PREDICT_FALSE(length > 9)) {
0580         return false;
0581       }
0582       if (length < 9) {
0583         omitted = 9 - length;
0584       }
0585       break;
0586     default:
0587       return false;
0588   }
0589
0590   if (ARROW_PREDICT_TRUE(omitted == 0)) {
0591     return ParseUnsigned(s, length, out);
0592   } else {
0593     uint32_t subseconds = 0;
0594     bool success = ParseUnsigned(s, length, &subseconds);
0595     if (ARROW_PREDICT_TRUE(success)) {
0596       switch (omitted) {
0597         case 1:
0598           *out = subseconds * 10;
0599           break;
0600         case 2:
0601           *out = subseconds * 100;
0602           break;
0603         case 3:
0604           *out = subseconds * 1000;
0605           break;
0606         case 4:
0607           *out = subseconds * 10000;
0608           break;
0609         case 5:
0610           *out = subseconds * 100000;
0611           break;
0612         case 6:
0613           *out = subseconds * 1000000;
0614           break;
0615         case 7:
0616           *out = subseconds * 10000000;
0617           break;
0618         case 8:
0619           *out = subseconds * 100000000;
0620           break;
0621         default:
0622           // Impossible case
0623           break;
0624       }
0625       return true;
0626     } else {
0627       return false;
0628     }
0629   }
0630 }
0631
0632 }  // namespace detail
0633
0634 template <typename Duration>
0635 static inline bool ParseYYYY_MM_DD(const char* s, Duration* since_epoch) {
0636   uint16_t year = 0;
0637   uint8_t month = 0;
0638   uint8_t day = 0;
0639   if (ARROW_PREDICT_FALSE(s[4] != '-') || ARROW_PREDICT_FALSE(s[7] != '-')) {
0640     return false;
0641   }
0642   if (ARROW_PREDICT_FALSE(!ParseUnsigned(s + 0, 4, &year))) {
0643     return false;
0644   }
0645   if (ARROW_PREDICT_FALSE(!ParseUnsigned(s + 5, 2, &month))) {
0646     return false;
0647   }
0648   if (ARROW_PREDICT_FALSE(!ParseUnsigned(s + 8, 2, &day))) {
0649     return false;
0650   }
0651   arrow_vendored::date::year_month_day ymd{arrow_vendored::date::year{year},
0652                                            arrow_vendored::date::month{month},
0653                                            arrow_vendored::date::day{day}};
0654   if (ARROW_PREDICT_FALSE(!ymd.ok())) return false;
0655
0656   *since_epoch = std::chrono::duration_cast<Duration>(
0657       arrow_vendored::date::sys_days{ymd}.time_since_epoch());
0658   return true;
0659 }
0660
0661 static inline bool ParseTimestampISO8601(const char* s, size_t length,
0662                                          TimeUnit::type unit, TimestampType::c_type* out,
0663                                          bool* out_zone_offset_present = NULLPTR) {
0664   using seconds_type = std::chrono::duration<TimestampType::c_type>;
0665
0666   // We allow the following zone offset formats:
0667   // - (none)
0668   // - Z
0669   // - [+-]HH(:?MM)?
0670   //
0671   // We allow the following formats for all units:
0672   // - "YYYY-MM-DD"
0673   // - "YYYY-MM-DD[ T]hhZ?"
0674   // - "YYYY-MM-DD[ T]hh:mmZ?"
0675   // - "YYYY-MM-DD[ T]hh:mm:ssZ?"
0676   //
0677   // We allow the following formats for unit == MILLI, MICRO, or NANO:
0678   // - "YYYY-MM-DD[ T]hh:mm:ss.s{1,3}Z?"
0679   //
0680   // We allow the following formats for unit == MICRO, or NANO:
0681   // - "YYYY-MM-DD[ T]hh:mm:ss.s{4,6}Z?"
0682   //
0683   // We allow the following formats for unit == NANO:
0684   // - "YYYY-MM-DD[ T]hh:mm:ss.s{7,9}Z?"
0685   //
0686   // UTC is always assumed, and the DataType's timezone is ignored.
0687   //
0688
0689   if (ARROW_PREDICT_FALSE(length < 10)) return false;
0690
0691   seconds_type seconds_since_epoch;
0692   if (ARROW_PREDICT_FALSE(!ParseYYYY_MM_DD(s, &seconds_since_epoch))) {
0693     return false;
0694   }
0695
0696   if (length == 10) {
0697     *out = util::CastSecondsToUnit(unit, seconds_since_epoch.count());
0698     return true;
0699   }
0700
0701   if (ARROW_PREDICT_FALSE(s[10] != ' ') && ARROW_PREDICT_FALSE(s[10] != 'T')) {
0702     return false;
0703   }
0704
0705   if (out_zone_offset_present) {
0706     *out_zone_offset_present = false;
0707   }
0708
0709   seconds_type zone_offset(0);
0710   if (s[length - 1] == 'Z') {
0711     --length;
0712     if (out_zone_offset_present) *out_zone_offset_present = true;
0713   } else if (s[length - 3] == '+' || s[length - 3] == '-') {
0714     // [+-]HH
0715     length -= 3;
0716     if (ARROW_PREDICT_FALSE(!detail::ParseHH(s + length + 1, &zone_offset))) {
0717       return false;
0718     }
0719     if (s[length] == '+') zone_offset *= -1;
0720     if (out_zone_offset_present) *out_zone_offset_present = true;
0721   } else if (s[length - 5] == '+' || s[length - 5] == '-') {
0722     // [+-]HHMM
0723     length -= 5;
0724     if (ARROW_PREDICT_FALSE(!detail::ParseHHMM(s + length + 1, &zone_offset))) {
0725       return false;
0726     }
0727     if (s[length] == '+') zone_offset *= -1;
0728     if (out_zone_offset_present) *out_zone_offset_present = true;
0729   } else if ((s[length - 6] == '+' || s[length - 6] == '-') && (s[length - 3] == ':')) {
0730     // [+-]HH:MM
0731     length -= 6;
0732     if (ARROW_PREDICT_FALSE(!detail::ParseHH_MM(s + length + 1, &zone_offset))) {
0733       return false;
0734     }
0735     if (s[length] == '+') zone_offset *= -1;
0736     if (out_zone_offset_present) *out_zone_offset_present = true;
0737   }
0738
0739   seconds_type seconds_since_midnight;
0740   switch (length) {
0741     case 13:  // YYYY-MM-DD[ T]hh
0742       if (ARROW_PREDICT_FALSE(!detail::ParseHH(s + 11, &seconds_since_midnight))) {
0743         return false;
0744       }
0745       break;
0746     case 16:  // YYYY-MM-DD[ T]hh:mm
0747       if (ARROW_PREDICT_FALSE(!detail::ParseHH_MM(s + 11, &seconds_since_midnight))) {
0748         return false;
0749       }
0750       break;
0751     case 19:  // YYYY-MM-DD[ T]hh:mm:ss
0752     case 21:  // YYYY-MM-DD[ T]hh:mm:ss.s
0753     case 22:  // YYYY-MM-DD[ T]hh:mm:ss.ss
0754     case 23:  // YYYY-MM-DD[ T]hh:mm:ss.sss
0755     case 24:  // YYYY-MM-DD[ T]hh:mm:ss.ssss
0756     case 25:  // YYYY-MM-DD[ T]hh:mm:ss.sssss
0757     case 26:  // YYYY-MM-DD[ T]hh:mm:ss.ssssss
0758     case 27:  // YYYY-MM-DD[ T]hh:mm:ss.sssssss
0759     case 28:  // YYYY-MM-DD[ T]hh:mm:ss.ssssssss
0760     case 29:  // YYYY-MM-DD[ T]hh:mm:ss.sssssssss
0761       if (ARROW_PREDICT_FALSE(!detail::ParseHH_MM_SS(s + 11, &seconds_since_midnight))) {
0762         return false;
0763       }
0764       break;
0765     default:
0766       return false;
0767   }
0768
0769   seconds_since_epoch += seconds_since_midnight;
0770   seconds_since_epoch += zone_offset;
0771
0772   if (length <= 19) {
0773     *out = util::CastSecondsToUnit(unit, seconds_since_epoch.count());
0774     return true;
0775   }
0776
0777   if (ARROW_PREDICT_FALSE(s[19] != '.')) {
0778     return false;
0779   }
0780
0781   uint32_t subseconds = 0;
0782   if (ARROW_PREDICT_FALSE(
0783           !detail::ParseSubSeconds(s + 20, length - 20, unit, &subseconds))) {
0784     return false;
0785   }
0786
0787   *out = util::CastSecondsToUnit(unit, seconds_since_epoch.count()) + subseconds;
0788   return true;
0789 }
0790
// Compile-time flag: false on Windows and when ARROW_WITH_MUSL is defined,
// true everywhere else. Presumably tells callers whether the strptime()
// implementation in use handles zone directives -- confirm against its users.
#if defined(_WIN32) || defined(ARROW_WITH_MUSL)
static constexpr bool kStrptimeSupportsZone = false;
#else
static constexpr bool kStrptimeSupportsZone = true;
#endif
0796
0797 /// \brief Returns time since the UNIX epoch in the requested unit
0798 static inline bool ParseTimestampStrptime(const char* buf, size_t length,
0799                                           const char* format, bool ignore_time_in_day,
0800                                           bool allow_trailing_chars, TimeUnit::type unit,
0801                                           int64_t* out) {
0802   // NOTE: strptime() is more than 10x faster than arrow_vendored::date::parse().
0803   // The buffer may not be nul-terminated
0804   std::string clean_copy(buf, length);
0805   struct tm result;
0806   memset(&result, 0, sizeof(struct tm));
0807 #ifdef _WIN32
0808   char* ret = arrow_strptime(clean_copy.c_str(), format, &result);
0809 #else
0810   char* ret = strptime(clean_copy.c_str(), format, &result);
0811 #endif
0812   if (ret == NULLPTR) {
0813     return false;
0814   }
0815   if (!allow_trailing_chars && static_cast<size_t>(ret - clean_copy.c_str()) != length) {
0816     return false;
0817   }
0818   // ignore the time part
0819   arrow_vendored::date::sys_seconds secs =
0820       arrow_vendored::date::sys_days(arrow_vendored::date::year(result.tm_year + 1900) /
0821                                      (result.tm_mon + 1) / std::max(result.tm_mday, 1));
0822   if (!ignore_time_in_day) {
0823     secs += (std::chrono::hours(result.tm_hour) + std::chrono::minutes(result.tm_min) +
0824              std::chrono::seconds(result.tm_sec));
0825 #ifndef _WIN32
0826     secs -= std::chrono::seconds(result.tm_gmtoff);
0827 #endif
0828   }
0829   *out = util::CastSecondsToUnit(unit, secs.time_since_epoch().count());
0830   return true;
0831 }
0832
0833 template <>
0834 struct StringConverter<TimestampType> {
0835   using value_type = int64_t;
0836
0837   bool Convert(const TimestampType& type, const char* s, size_t length, value_type* out) {
0838     return ParseTimestampISO8601(s, length, type.unit(), out);
0839   }
0840 };
0841
// StringConverter for DurationType: durations are parsed as plain signed
// integers; `value_type` and `Convert` come from the signed-integer mixin.
template <>
struct StringConverter<DurationType>
    : public StringToSignedIntConverterMixin<DurationType> {
  using StringToSignedIntConverterMixin<DurationType>::StringToSignedIntConverterMixin;
};
0847
0848 template <typename DATE_TYPE>
0849 struct StringConverter<DATE_TYPE, enable_if_date<DATE_TYPE>> {
0850   using value_type = typename DATE_TYPE::c_type;
0851
0852   using duration_type =
0853       typename std::conditional<std::is_same<DATE_TYPE, Date32Type>::value,
0854                                 arrow_vendored::date::days,
0855                                 std::chrono::milliseconds>::type;
0856
0857   bool Convert(const DATE_TYPE& type, const char* s, size_t length, value_type* out) {
0858     if (ARROW_PREDICT_FALSE(length != 10)) {
0859       return false;
0860     }
0861
0862     duration_type since_epoch;
0863     if (ARROW_PREDICT_FALSE(!ParseYYYY_MM_DD(s, &since_epoch))) {
0864       return false;
0865     }
0866
0867     *out = static_cast<value_type>(since_epoch.count());
0868     return true;
0869   }
0870 };
0871
0872 template <typename TIME_TYPE>
0873 struct StringConverter<TIME_TYPE, enable_if_time<TIME_TYPE>> {
0874   using value_type = typename TIME_TYPE::c_type;
0875
0876   // We allow the following formats for all units:
0877   // - "hh:mm"
0878   // - "hh:mm:ss"
0879   //
0880   // We allow the following formats for unit == MILLI, MICRO, or NANO:
0881   // - "hh:mm:ss.s{1,3}"
0882   //
0883   // We allow the following formats for unit == MICRO, or NANO:
0884   // - "hh:mm:ss.s{4,6}"
0885   //
0886   // We allow the following formats for unit == NANO:
0887   // - "hh:mm:ss.s{7,9}"
0888
0889   bool Convert(const TIME_TYPE& type, const char* s, size_t length, value_type* out) {
0890     const auto unit = type.unit();
0891     std::chrono::seconds since_midnight;
0892
0893     if (length == 5) {
0894       if (ARROW_PREDICT_FALSE(!detail::ParseHH_MM(s, &since_midnight))) {
0895         return false;
0896       }
0897       *out =
0898           static_cast<value_type>(util::CastSecondsToUnit(unit, since_midnight.count()));
0899       return true;
0900     }
0901
0902     if (ARROW_PREDICT_FALSE(length < 8)) {
0903       return false;
0904     }
0905     if (ARROW_PREDICT_FALSE(!detail::ParseHH_MM_SS(s, &since_midnight))) {
0906       return false;
0907     }
0908
0909     *out = static_cast<value_type>(util::CastSecondsToUnit(unit, since_midnight.count()));
0910
0911     if (length == 8) {
0912       return true;
0913     }
0914
0915     if (ARROW_PREDICT_FALSE(s[8] != '.')) {
0916       return false;
0917     }
0918
0919     uint32_t subseconds_count = 0;
0920     if (ARROW_PREDICT_FALSE(
0921             !detail::ParseSubSeconds(s + 9, length - 9, unit, &subseconds_count))) {
0922       return false;
0923     }
0924
0925     *out += subseconds_count;
0926     return true;
0927   }
0928 };
0929
0930 /// \brief Convenience wrappers around internal::StringConverter.
0931 template <typename T>
0932 bool ParseValue(const T& type, const char* s, size_t length,
0933                 typename StringConverter<T>::value_type* out) {
0934   return StringConverter<T>{}.Convert(type, s, length, out);
0935 }
0936
0937 template <typename T>
0938 enable_if_parameter_free<T, bool> ParseValue(
0939     const char* s, size_t length, typename StringConverter<T>::value_type* out) {
0940   static T type;
0941   return StringConverter<T>{}.Convert(type, s, length, out);
0942 }
0943
0944 }  // namespace internal
0945 }  // namespace arrow