File indexing completed on 2025-01-18 10:02:19
0001
0002
0003
0004
0005
0006
0007
0008
0009 #pragma once
0010
0011 #include <array> // array
0012 #include <clocale> // localeconv
0013 #include <cstddef> // size_t
0014 #include <cstdio> // snprintf
0015 #include <cstdlib> // strtof, strtod, strtold, strtoll, strtoull
0016 #include <initializer_list> // initializer_list
0017 #include <string> // char_traits, string
0018 #include <utility> // move
0019 #include <vector> // vector
0020
0021 #include <nlohmann/detail/input/input_adapters.hpp>
0022 #include <nlohmann/detail/input/position_t.hpp>
0023 #include <nlohmann/detail/macro_scope.hpp>
0024
0025 NLOHMANN_JSON_NAMESPACE_BEGIN
0026 namespace detail
0027 {
0028
0029
0030
0031
0032
0033 template<typename BasicJsonType>
0034 class lexer_base
0035 {
0036 public:
0037
0038 enum class token_type
0039 {
0040 uninitialized,
0041 literal_true,
0042 literal_false,
0043 literal_null,
0044 value_string,
0045 value_unsigned,
0046 value_integer,
0047 value_float,
0048 begin_array,
0049 begin_object,
0050 end_array,
0051 end_object,
0052 name_separator,
0053 value_separator,
0054 parse_error,
0055 end_of_input,
0056 literal_or_value
0057 };
0058
0059
0060 JSON_HEDLEY_RETURNS_NON_NULL
0061 JSON_HEDLEY_CONST
0062 static const char* token_type_name(const token_type t) noexcept
0063 {
0064 switch (t)
0065 {
0066 case token_type::uninitialized:
0067 return "<uninitialized>";
0068 case token_type::literal_true:
0069 return "true literal";
0070 case token_type::literal_false:
0071 return "false literal";
0072 case token_type::literal_null:
0073 return "null literal";
0074 case token_type::value_string:
0075 return "string literal";
0076 case token_type::value_unsigned:
0077 case token_type::value_integer:
0078 case token_type::value_float:
0079 return "number literal";
0080 case token_type::begin_array:
0081 return "'['";
0082 case token_type::begin_object:
0083 return "'{'";
0084 case token_type::end_array:
0085 return "']'";
0086 case token_type::end_object:
0087 return "'}'";
0088 case token_type::name_separator:
0089 return "':'";
0090 case token_type::value_separator:
0091 return "','";
0092 case token_type::parse_error:
0093 return "<parse error>";
0094 case token_type::end_of_input:
0095 return "end of input";
0096 case token_type::literal_or_value:
0097 return "'[', '{', or a literal";
0098
0099 default:
0100 return "unknown token";
0101
0102 }
0103 }
0104 };
0105
0106
0107
0108
0109
0110 template<typename BasicJsonType, typename InputAdapterType>
0111 class lexer : public lexer_base<BasicJsonType>
0112 {
0113 using number_integer_t = typename BasicJsonType::number_integer_t;
0114 using number_unsigned_t = typename BasicJsonType::number_unsigned_t;
0115 using number_float_t = typename BasicJsonType::number_float_t;
0116 using string_t = typename BasicJsonType::string_t;
0117 using char_type = typename InputAdapterType::char_type;
0118 using char_int_type = typename std::char_traits<char_type>::int_type;
0119
0120 public:
0121 using token_type = typename lexer_base<BasicJsonType>::token_type;
0122
0123 explicit lexer(InputAdapterType&& adapter, bool ignore_comments_ = false) noexcept
0124 : ia(std::move(adapter))
0125 , ignore_comments(ignore_comments_)
0126 , decimal_point_char(static_cast<char_int_type>(get_decimal_point()))
0127 {}
0128
0129
0130 lexer(const lexer&) = delete;
0131 lexer(lexer&&) = default;
0132 lexer& operator=(lexer&) = delete;
0133 lexer& operator=(lexer&&) = default;
0134 ~lexer() = default;
0135
0136 private:
0137
0138
0139
0140
0141
0142 JSON_HEDLEY_PURE
0143 static char get_decimal_point() noexcept
0144 {
0145 const auto* loc = localeconv();
0146 JSON_ASSERT(loc != nullptr);
0147 return (loc->decimal_point == nullptr) ? '.' : *(loc->decimal_point);
0148 }
0149
0150
0151
0152
0153
0154
0155
0156
0157
0158
0159
0160
0161
0162
0163
0164
0165
0166
0167
0168
0169 int get_codepoint()
0170 {
0171
0172 JSON_ASSERT(current == 'u');
0173 int codepoint = 0;
0174
0175 const auto factors = { 12u, 8u, 4u, 0u };
0176 for (const auto factor : factors)
0177 {
0178 get();
0179
0180 if (current >= '0' && current <= '9')
0181 {
0182 codepoint += static_cast<int>((static_cast<unsigned int>(current) - 0x30u) << factor);
0183 }
0184 else if (current >= 'A' && current <= 'F')
0185 {
0186 codepoint += static_cast<int>((static_cast<unsigned int>(current) - 0x37u) << factor);
0187 }
0188 else if (current >= 'a' && current <= 'f')
0189 {
0190 codepoint += static_cast<int>((static_cast<unsigned int>(current) - 0x57u) << factor);
0191 }
0192 else
0193 {
0194 return -1;
0195 }
0196 }
0197
0198 JSON_ASSERT(0x0000 <= codepoint && codepoint <= 0xFFFF);
0199 return codepoint;
0200 }
0201
0202
0203
0204
0205
0206
0207
0208
0209
0210
0211
0212
0213
0214
0215
0216
0217 bool next_byte_in_range(std::initializer_list<char_int_type> ranges)
0218 {
0219 JSON_ASSERT(ranges.size() == 2 || ranges.size() == 4 || ranges.size() == 6);
0220 add(current);
0221
0222 for (auto range = ranges.begin(); range != ranges.end(); ++range)
0223 {
0224 get();
0225 if (JSON_HEDLEY_LIKELY(*range <= current && current <= *(++range)))
0226 {
0227 add(current);
0228 }
0229 else
0230 {
0231 error_message = "invalid string: ill-formed UTF-8 byte";
0232 return false;
0233 }
0234 }
0235
0236 return true;
0237 }
0238
0239
0240
0241
0242
0243
0244
0245
0246
0247
0248
0249
0250
0251
0252
0253
0254 token_type scan_string()
0255 {
0256
0257 reset();
0258
0259
0260 JSON_ASSERT(current == '\"');
0261
0262 while (true)
0263 {
0264
0265 switch (get())
0266 {
0267
0268 case std::char_traits<char_type>::eof():
0269 {
0270 error_message = "invalid string: missing closing quote";
0271 return token_type::parse_error;
0272 }
0273
0274
0275 case '\"':
0276 {
0277 return token_type::value_string;
0278 }
0279
0280
0281 case '\\':
0282 {
0283 switch (get())
0284 {
0285
0286 case '\"':
0287 add('\"');
0288 break;
0289
0290 case '\\':
0291 add('\\');
0292 break;
0293
0294 case '/':
0295 add('/');
0296 break;
0297
0298 case 'b':
0299 add('\b');
0300 break;
0301
0302 case 'f':
0303 add('\f');
0304 break;
0305
0306 case 'n':
0307 add('\n');
0308 break;
0309
0310 case 'r':
0311 add('\r');
0312 break;
0313
0314 case 't':
0315 add('\t');
0316 break;
0317
0318
0319 case 'u':
0320 {
0321 const int codepoint1 = get_codepoint();
0322 int codepoint = codepoint1;
0323
0324 if (JSON_HEDLEY_UNLIKELY(codepoint1 == -1))
0325 {
0326 error_message = "invalid string: '\\u' must be followed by 4 hex digits";
0327 return token_type::parse_error;
0328 }
0329
0330
0331 if (0xD800 <= codepoint1 && codepoint1 <= 0xDBFF)
0332 {
0333
0334 if (JSON_HEDLEY_LIKELY(get() == '\\' && get() == 'u'))
0335 {
0336 const int codepoint2 = get_codepoint();
0337
0338 if (JSON_HEDLEY_UNLIKELY(codepoint2 == -1))
0339 {
0340 error_message = "invalid string: '\\u' must be followed by 4 hex digits";
0341 return token_type::parse_error;
0342 }
0343
0344
0345 if (JSON_HEDLEY_LIKELY(0xDC00 <= codepoint2 && codepoint2 <= 0xDFFF))
0346 {
0347
0348 codepoint = static_cast<int>(
0349
0350 (static_cast<unsigned int>(codepoint1) << 10u)
0351
0352 + static_cast<unsigned int>(codepoint2)
0353
0354
0355
0356 - 0x35FDC00u);
0357 }
0358 else
0359 {
0360 error_message = "invalid string: surrogate U+D800..U+DBFF must be followed by U+DC00..U+DFFF";
0361 return token_type::parse_error;
0362 }
0363 }
0364 else
0365 {
0366 error_message = "invalid string: surrogate U+D800..U+DBFF must be followed by U+DC00..U+DFFF";
0367 return token_type::parse_error;
0368 }
0369 }
0370 else
0371 {
0372 if (JSON_HEDLEY_UNLIKELY(0xDC00 <= codepoint1 && codepoint1 <= 0xDFFF))
0373 {
0374 error_message = "invalid string: surrogate U+DC00..U+DFFF must follow U+D800..U+DBFF";
0375 return token_type::parse_error;
0376 }
0377 }
0378
0379
0380 JSON_ASSERT(0x00 <= codepoint && codepoint <= 0x10FFFF);
0381
0382
0383 if (codepoint < 0x80)
0384 {
0385
0386 add(static_cast<char_int_type>(codepoint));
0387 }
0388 else if (codepoint <= 0x7FF)
0389 {
0390
0391 add(static_cast<char_int_type>(0xC0u | (static_cast<unsigned int>(codepoint) >> 6u)));
0392 add(static_cast<char_int_type>(0x80u | (static_cast<unsigned int>(codepoint) & 0x3Fu)));
0393 }
0394 else if (codepoint <= 0xFFFF)
0395 {
0396
0397 add(static_cast<char_int_type>(0xE0u | (static_cast<unsigned int>(codepoint) >> 12u)));
0398 add(static_cast<char_int_type>(0x80u | ((static_cast<unsigned int>(codepoint) >> 6u) & 0x3Fu)));
0399 add(static_cast<char_int_type>(0x80u | (static_cast<unsigned int>(codepoint) & 0x3Fu)));
0400 }
0401 else
0402 {
0403
0404 add(static_cast<char_int_type>(0xF0u | (static_cast<unsigned int>(codepoint) >> 18u)));
0405 add(static_cast<char_int_type>(0x80u | ((static_cast<unsigned int>(codepoint) >> 12u) & 0x3Fu)));
0406 add(static_cast<char_int_type>(0x80u | ((static_cast<unsigned int>(codepoint) >> 6u) & 0x3Fu)));
0407 add(static_cast<char_int_type>(0x80u | (static_cast<unsigned int>(codepoint) & 0x3Fu)));
0408 }
0409
0410 break;
0411 }
0412
0413
0414 default:
0415 error_message = "invalid string: forbidden character after backslash";
0416 return token_type::parse_error;
0417 }
0418
0419 break;
0420 }
0421
0422
0423 case 0x00:
0424 {
0425 error_message = "invalid string: control character U+0000 (NUL) must be escaped to \\u0000";
0426 return token_type::parse_error;
0427 }
0428
0429 case 0x01:
0430 {
0431 error_message = "invalid string: control character U+0001 (SOH) must be escaped to \\u0001";
0432 return token_type::parse_error;
0433 }
0434
0435 case 0x02:
0436 {
0437 error_message = "invalid string: control character U+0002 (STX) must be escaped to \\u0002";
0438 return token_type::parse_error;
0439 }
0440
0441 case 0x03:
0442 {
0443 error_message = "invalid string: control character U+0003 (ETX) must be escaped to \\u0003";
0444 return token_type::parse_error;
0445 }
0446
0447 case 0x04:
0448 {
0449 error_message = "invalid string: control character U+0004 (EOT) must be escaped to \\u0004";
0450 return token_type::parse_error;
0451 }
0452
0453 case 0x05:
0454 {
0455 error_message = "invalid string: control character U+0005 (ENQ) must be escaped to \\u0005";
0456 return token_type::parse_error;
0457 }
0458
0459 case 0x06:
0460 {
0461 error_message = "invalid string: control character U+0006 (ACK) must be escaped to \\u0006";
0462 return token_type::parse_error;
0463 }
0464
0465 case 0x07:
0466 {
0467 error_message = "invalid string: control character U+0007 (BEL) must be escaped to \\u0007";
0468 return token_type::parse_error;
0469 }
0470
0471 case 0x08:
0472 {
0473 error_message = "invalid string: control character U+0008 (BS) must be escaped to \\u0008 or \\b";
0474 return token_type::parse_error;
0475 }
0476
0477 case 0x09:
0478 {
0479 error_message = "invalid string: control character U+0009 (HT) must be escaped to \\u0009 or \\t";
0480 return token_type::parse_error;
0481 }
0482
0483 case 0x0A:
0484 {
0485 error_message = "invalid string: control character U+000A (LF) must be escaped to \\u000A or \\n";
0486 return token_type::parse_error;
0487 }
0488
0489 case 0x0B:
0490 {
0491 error_message = "invalid string: control character U+000B (VT) must be escaped to \\u000B";
0492 return token_type::parse_error;
0493 }
0494
0495 case 0x0C:
0496 {
0497 error_message = "invalid string: control character U+000C (FF) must be escaped to \\u000C or \\f";
0498 return token_type::parse_error;
0499 }
0500
0501 case 0x0D:
0502 {
0503 error_message = "invalid string: control character U+000D (CR) must be escaped to \\u000D or \\r";
0504 return token_type::parse_error;
0505 }
0506
0507 case 0x0E:
0508 {
0509 error_message = "invalid string: control character U+000E (SO) must be escaped to \\u000E";
0510 return token_type::parse_error;
0511 }
0512
0513 case 0x0F:
0514 {
0515 error_message = "invalid string: control character U+000F (SI) must be escaped to \\u000F";
0516 return token_type::parse_error;
0517 }
0518
0519 case 0x10:
0520 {
0521 error_message = "invalid string: control character U+0010 (DLE) must be escaped to \\u0010";
0522 return token_type::parse_error;
0523 }
0524
0525 case 0x11:
0526 {
0527 error_message = "invalid string: control character U+0011 (DC1) must be escaped to \\u0011";
0528 return token_type::parse_error;
0529 }
0530
0531 case 0x12:
0532 {
0533 error_message = "invalid string: control character U+0012 (DC2) must be escaped to \\u0012";
0534 return token_type::parse_error;
0535 }
0536
0537 case 0x13:
0538 {
0539 error_message = "invalid string: control character U+0013 (DC3) must be escaped to \\u0013";
0540 return token_type::parse_error;
0541 }
0542
0543 case 0x14:
0544 {
0545 error_message = "invalid string: control character U+0014 (DC4) must be escaped to \\u0014";
0546 return token_type::parse_error;
0547 }
0548
0549 case 0x15:
0550 {
0551 error_message = "invalid string: control character U+0015 (NAK) must be escaped to \\u0015";
0552 return token_type::parse_error;
0553 }
0554
0555 case 0x16:
0556 {
0557 error_message = "invalid string: control character U+0016 (SYN) must be escaped to \\u0016";
0558 return token_type::parse_error;
0559 }
0560
0561 case 0x17:
0562 {
0563 error_message = "invalid string: control character U+0017 (ETB) must be escaped to \\u0017";
0564 return token_type::parse_error;
0565 }
0566
0567 case 0x18:
0568 {
0569 error_message = "invalid string: control character U+0018 (CAN) must be escaped to \\u0018";
0570 return token_type::parse_error;
0571 }
0572
0573 case 0x19:
0574 {
0575 error_message = "invalid string: control character U+0019 (EM) must be escaped to \\u0019";
0576 return token_type::parse_error;
0577 }
0578
0579 case 0x1A:
0580 {
0581 error_message = "invalid string: control character U+001A (SUB) must be escaped to \\u001A";
0582 return token_type::parse_error;
0583 }
0584
0585 case 0x1B:
0586 {
0587 error_message = "invalid string: control character U+001B (ESC) must be escaped to \\u001B";
0588 return token_type::parse_error;
0589 }
0590
0591 case 0x1C:
0592 {
0593 error_message = "invalid string: control character U+001C (FS) must be escaped to \\u001C";
0594 return token_type::parse_error;
0595 }
0596
0597 case 0x1D:
0598 {
0599 error_message = "invalid string: control character U+001D (GS) must be escaped to \\u001D";
0600 return token_type::parse_error;
0601 }
0602
0603 case 0x1E:
0604 {
0605 error_message = "invalid string: control character U+001E (RS) must be escaped to \\u001E";
0606 return token_type::parse_error;
0607 }
0608
0609 case 0x1F:
0610 {
0611 error_message = "invalid string: control character U+001F (US) must be escaped to \\u001F";
0612 return token_type::parse_error;
0613 }
0614
0615
0616 case 0x20:
0617 case 0x21:
0618 case 0x23:
0619 case 0x24:
0620 case 0x25:
0621 case 0x26:
0622 case 0x27:
0623 case 0x28:
0624 case 0x29:
0625 case 0x2A:
0626 case 0x2B:
0627 case 0x2C:
0628 case 0x2D:
0629 case 0x2E:
0630 case 0x2F:
0631 case 0x30:
0632 case 0x31:
0633 case 0x32:
0634 case 0x33:
0635 case 0x34:
0636 case 0x35:
0637 case 0x36:
0638 case 0x37:
0639 case 0x38:
0640 case 0x39:
0641 case 0x3A:
0642 case 0x3B:
0643 case 0x3C:
0644 case 0x3D:
0645 case 0x3E:
0646 case 0x3F:
0647 case 0x40:
0648 case 0x41:
0649 case 0x42:
0650 case 0x43:
0651 case 0x44:
0652 case 0x45:
0653 case 0x46:
0654 case 0x47:
0655 case 0x48:
0656 case 0x49:
0657 case 0x4A:
0658 case 0x4B:
0659 case 0x4C:
0660 case 0x4D:
0661 case 0x4E:
0662 case 0x4F:
0663 case 0x50:
0664 case 0x51:
0665 case 0x52:
0666 case 0x53:
0667 case 0x54:
0668 case 0x55:
0669 case 0x56:
0670 case 0x57:
0671 case 0x58:
0672 case 0x59:
0673 case 0x5A:
0674 case 0x5B:
0675 case 0x5D:
0676 case 0x5E:
0677 case 0x5F:
0678 case 0x60:
0679 case 0x61:
0680 case 0x62:
0681 case 0x63:
0682 case 0x64:
0683 case 0x65:
0684 case 0x66:
0685 case 0x67:
0686 case 0x68:
0687 case 0x69:
0688 case 0x6A:
0689 case 0x6B:
0690 case 0x6C:
0691 case 0x6D:
0692 case 0x6E:
0693 case 0x6F:
0694 case 0x70:
0695 case 0x71:
0696 case 0x72:
0697 case 0x73:
0698 case 0x74:
0699 case 0x75:
0700 case 0x76:
0701 case 0x77:
0702 case 0x78:
0703 case 0x79:
0704 case 0x7A:
0705 case 0x7B:
0706 case 0x7C:
0707 case 0x7D:
0708 case 0x7E:
0709 case 0x7F:
0710 {
0711 add(current);
0712 break;
0713 }
0714
0715
0716 case 0xC2:
0717 case 0xC3:
0718 case 0xC4:
0719 case 0xC5:
0720 case 0xC6:
0721 case 0xC7:
0722 case 0xC8:
0723 case 0xC9:
0724 case 0xCA:
0725 case 0xCB:
0726 case 0xCC:
0727 case 0xCD:
0728 case 0xCE:
0729 case 0xCF:
0730 case 0xD0:
0731 case 0xD1:
0732 case 0xD2:
0733 case 0xD3:
0734 case 0xD4:
0735 case 0xD5:
0736 case 0xD6:
0737 case 0xD7:
0738 case 0xD8:
0739 case 0xD9:
0740 case 0xDA:
0741 case 0xDB:
0742 case 0xDC:
0743 case 0xDD:
0744 case 0xDE:
0745 case 0xDF:
0746 {
0747 if (JSON_HEDLEY_UNLIKELY(!next_byte_in_range({0x80, 0xBF})))
0748 {
0749 return token_type::parse_error;
0750 }
0751 break;
0752 }
0753
0754
0755 case 0xE0:
0756 {
0757 if (JSON_HEDLEY_UNLIKELY(!(next_byte_in_range({0xA0, 0xBF, 0x80, 0xBF}))))
0758 {
0759 return token_type::parse_error;
0760 }
0761 break;
0762 }
0763
0764
0765
0766 case 0xE1:
0767 case 0xE2:
0768 case 0xE3:
0769 case 0xE4:
0770 case 0xE5:
0771 case 0xE6:
0772 case 0xE7:
0773 case 0xE8:
0774 case 0xE9:
0775 case 0xEA:
0776 case 0xEB:
0777 case 0xEC:
0778 case 0xEE:
0779 case 0xEF:
0780 {
0781 if (JSON_HEDLEY_UNLIKELY(!(next_byte_in_range({0x80, 0xBF, 0x80, 0xBF}))))
0782 {
0783 return token_type::parse_error;
0784 }
0785 break;
0786 }
0787
0788
0789 case 0xED:
0790 {
0791 if (JSON_HEDLEY_UNLIKELY(!(next_byte_in_range({0x80, 0x9F, 0x80, 0xBF}))))
0792 {
0793 return token_type::parse_error;
0794 }
0795 break;
0796 }
0797
0798
0799 case 0xF0:
0800 {
0801 if (JSON_HEDLEY_UNLIKELY(!(next_byte_in_range({0x90, 0xBF, 0x80, 0xBF, 0x80, 0xBF}))))
0802 {
0803 return token_type::parse_error;
0804 }
0805 break;
0806 }
0807
0808
0809 case 0xF1:
0810 case 0xF2:
0811 case 0xF3:
0812 {
0813 if (JSON_HEDLEY_UNLIKELY(!(next_byte_in_range({0x80, 0xBF, 0x80, 0xBF, 0x80, 0xBF}))))
0814 {
0815 return token_type::parse_error;
0816 }
0817 break;
0818 }
0819
0820
0821 case 0xF4:
0822 {
0823 if (JSON_HEDLEY_UNLIKELY(!(next_byte_in_range({0x80, 0x8F, 0x80, 0xBF, 0x80, 0xBF}))))
0824 {
0825 return token_type::parse_error;
0826 }
0827 break;
0828 }
0829
0830
0831 default:
0832 {
0833 error_message = "invalid string: ill-formed UTF-8 byte";
0834 return token_type::parse_error;
0835 }
0836 }
0837 }
0838 }
0839
0840
0841
0842
0843
0844 bool scan_comment()
0845 {
0846 switch (get())
0847 {
0848
0849 case '/':
0850 {
0851 while (true)
0852 {
0853 switch (get())
0854 {
0855 case '\n':
0856 case '\r':
0857 case std::char_traits<char_type>::eof():
0858 case '\0':
0859 return true;
0860
0861 default:
0862 break;
0863 }
0864 }
0865 }
0866
0867
0868 case '*':
0869 {
0870 while (true)
0871 {
0872 switch (get())
0873 {
0874 case std::char_traits<char_type>::eof():
0875 case '\0':
0876 {
0877 error_message = "invalid comment; missing closing '*/'";
0878 return false;
0879 }
0880
0881 case '*':
0882 {
0883 switch (get())
0884 {
0885 case '/':
0886 return true;
0887
0888 default:
0889 {
0890 unget();
0891 continue;
0892 }
0893 }
0894 }
0895
0896 default:
0897 continue;
0898 }
0899 }
0900 }
0901
0902
0903 default:
0904 {
0905 error_message = "invalid comment; expecting '/' or '*' after '/'";
0906 return false;
0907 }
0908 }
0909 }
0910
0911 JSON_HEDLEY_NON_NULL(2)
0912 static void strtof(float& f, const char* str, char** endptr) noexcept
0913 {
0914 f = std::strtof(str, endptr);
0915 }
0916
0917 JSON_HEDLEY_NON_NULL(2)
0918 static void strtof(double& f, const char* str, char** endptr) noexcept
0919 {
0920 f = std::strtod(str, endptr);
0921 }
0922
0923 JSON_HEDLEY_NON_NULL(2)
0924 static void strtof(long double& f, const char* str, char** endptr) noexcept
0925 {
0926 f = std::strtold(str, endptr);
0927 }
0928
0929
0930
0931
0932
0933
0934
0935
0936
0937
0938
0939
0940
0941
0942
0943
0944
0945
0946
0947
0948
0949
0950
0951
0952
0953
0954
0955
0956
0957
0958
0959
0960
0961
0962
0963
0964
0965
0966
0967
0968
0969 token_type scan_number()
0970 {
0971
0972 reset();
0973
0974
0975
0976 token_type number_type = token_type::value_unsigned;
0977
0978
0979 switch (current)
0980 {
0981 case '-':
0982 {
0983 add(current);
0984 goto scan_number_minus;
0985 }
0986
0987 case '0':
0988 {
0989 add(current);
0990 goto scan_number_zero;
0991 }
0992
0993 case '1':
0994 case '2':
0995 case '3':
0996 case '4':
0997 case '5':
0998 case '6':
0999 case '7':
1000 case '8':
1001 case '9':
1002 {
1003 add(current);
1004 goto scan_number_any1;
1005 }
1006
1007
1008 default:
1009 JSON_ASSERT(false);
1010 }
1011
1012 scan_number_minus:
1013
1014 number_type = token_type::value_integer;
1015 switch (get())
1016 {
1017 case '0':
1018 {
1019 add(current);
1020 goto scan_number_zero;
1021 }
1022
1023 case '1':
1024 case '2':
1025 case '3':
1026 case '4':
1027 case '5':
1028 case '6':
1029 case '7':
1030 case '8':
1031 case '9':
1032 {
1033 add(current);
1034 goto scan_number_any1;
1035 }
1036
1037 default:
1038 {
1039 error_message = "invalid number; expected digit after '-'";
1040 return token_type::parse_error;
1041 }
1042 }
1043
1044 scan_number_zero:
1045
1046 switch (get())
1047 {
1048 case '.':
1049 {
1050 add(decimal_point_char);
1051 goto scan_number_decimal1;
1052 }
1053
1054 case 'e':
1055 case 'E':
1056 {
1057 add(current);
1058 goto scan_number_exponent;
1059 }
1060
1061 default:
1062 goto scan_number_done;
1063 }
1064
1065 scan_number_any1:
1066
1067 switch (get())
1068 {
1069 case '0':
1070 case '1':
1071 case '2':
1072 case '3':
1073 case '4':
1074 case '5':
1075 case '6':
1076 case '7':
1077 case '8':
1078 case '9':
1079 {
1080 add(current);
1081 goto scan_number_any1;
1082 }
1083
1084 case '.':
1085 {
1086 add(decimal_point_char);
1087 goto scan_number_decimal1;
1088 }
1089
1090 case 'e':
1091 case 'E':
1092 {
1093 add(current);
1094 goto scan_number_exponent;
1095 }
1096
1097 default:
1098 goto scan_number_done;
1099 }
1100
1101 scan_number_decimal1:
1102
1103 number_type = token_type::value_float;
1104 switch (get())
1105 {
1106 case '0':
1107 case '1':
1108 case '2':
1109 case '3':
1110 case '4':
1111 case '5':
1112 case '6':
1113 case '7':
1114 case '8':
1115 case '9':
1116 {
1117 add(current);
1118 goto scan_number_decimal2;
1119 }
1120
1121 default:
1122 {
1123 error_message = "invalid number; expected digit after '.'";
1124 return token_type::parse_error;
1125 }
1126 }
1127
1128 scan_number_decimal2:
1129
1130 switch (get())
1131 {
1132 case '0':
1133 case '1':
1134 case '2':
1135 case '3':
1136 case '4':
1137 case '5':
1138 case '6':
1139 case '7':
1140 case '8':
1141 case '9':
1142 {
1143 add(current);
1144 goto scan_number_decimal2;
1145 }
1146
1147 case 'e':
1148 case 'E':
1149 {
1150 add(current);
1151 goto scan_number_exponent;
1152 }
1153
1154 default:
1155 goto scan_number_done;
1156 }
1157
1158 scan_number_exponent:
1159
1160 number_type = token_type::value_float;
1161 switch (get())
1162 {
1163 case '+':
1164 case '-':
1165 {
1166 add(current);
1167 goto scan_number_sign;
1168 }
1169
1170 case '0':
1171 case '1':
1172 case '2':
1173 case '3':
1174 case '4':
1175 case '5':
1176 case '6':
1177 case '7':
1178 case '8':
1179 case '9':
1180 {
1181 add(current);
1182 goto scan_number_any2;
1183 }
1184
1185 default:
1186 {
1187 error_message =
1188 "invalid number; expected '+', '-', or digit after exponent";
1189 return token_type::parse_error;
1190 }
1191 }
1192
1193 scan_number_sign:
1194
1195 switch (get())
1196 {
1197 case '0':
1198 case '1':
1199 case '2':
1200 case '3':
1201 case '4':
1202 case '5':
1203 case '6':
1204 case '7':
1205 case '8':
1206 case '9':
1207 {
1208 add(current);
1209 goto scan_number_any2;
1210 }
1211
1212 default:
1213 {
1214 error_message = "invalid number; expected digit after exponent sign";
1215 return token_type::parse_error;
1216 }
1217 }
1218
1219 scan_number_any2:
1220
1221 switch (get())
1222 {
1223 case '0':
1224 case '1':
1225 case '2':
1226 case '3':
1227 case '4':
1228 case '5':
1229 case '6':
1230 case '7':
1231 case '8':
1232 case '9':
1233 {
1234 add(current);
1235 goto scan_number_any2;
1236 }
1237
1238 default:
1239 goto scan_number_done;
1240 }
1241
1242 scan_number_done:
1243
1244
1245 unget();
1246
1247 char* endptr = nullptr;
1248 errno = 0;
1249
1250
1251 if (number_type == token_type::value_unsigned)
1252 {
1253 const auto x = std::strtoull(token_buffer.data(), &endptr, 10);
1254
1255
1256 JSON_ASSERT(endptr == token_buffer.data() + token_buffer.size());
1257
1258 if (errno == 0)
1259 {
1260 value_unsigned = static_cast<number_unsigned_t>(x);
1261 if (value_unsigned == x)
1262 {
1263 return token_type::value_unsigned;
1264 }
1265 }
1266 }
1267 else if (number_type == token_type::value_integer)
1268 {
1269 const auto x = std::strtoll(token_buffer.data(), &endptr, 10);
1270
1271
1272 JSON_ASSERT(endptr == token_buffer.data() + token_buffer.size());
1273
1274 if (errno == 0)
1275 {
1276 value_integer = static_cast<number_integer_t>(x);
1277 if (value_integer == x)
1278 {
1279 return token_type::value_integer;
1280 }
1281 }
1282 }
1283
1284
1285
1286 strtof(value_float, token_buffer.data(), &endptr);
1287
1288
1289 JSON_ASSERT(endptr == token_buffer.data() + token_buffer.size());
1290
1291 return token_type::value_float;
1292 }
1293
1294
1295
1296
1297
1298
1299 JSON_HEDLEY_NON_NULL(2)
1300 token_type scan_literal(const char_type* literal_text, const std::size_t length,
1301 token_type return_type)
1302 {
1303 JSON_ASSERT(std::char_traits<char_type>::to_char_type(current) == literal_text[0]);
1304 for (std::size_t i = 1; i < length; ++i)
1305 {
1306 if (JSON_HEDLEY_UNLIKELY(std::char_traits<char_type>::to_char_type(get()) != literal_text[i]))
1307 {
1308 error_message = "invalid literal";
1309 return token_type::parse_error;
1310 }
1311 }
1312 return return_type;
1313 }
1314
1315
1316
1317
1318
1319
1320 void reset() noexcept
1321 {
1322 token_buffer.clear();
1323 token_string.clear();
1324 token_string.push_back(std::char_traits<char_type>::to_char_type(current));
1325 }
1326
1327
1328
1329
1330
1331
1332
1333
1334
1335
1336
1337 char_int_type get()
1338 {
1339 ++position.chars_read_total;
1340 ++position.chars_read_current_line;
1341
1342 if (next_unget)
1343 {
1344
1345 next_unget = false;
1346 }
1347 else
1348 {
1349 current = ia.get_character();
1350 }
1351
1352 if (JSON_HEDLEY_LIKELY(current != std::char_traits<char_type>::eof()))
1353 {
1354 token_string.push_back(std::char_traits<char_type>::to_char_type(current));
1355 }
1356
1357 if (current == '\n')
1358 {
1359 ++position.lines_read;
1360 position.chars_read_current_line = 0;
1361 }
1362
1363 return current;
1364 }
1365
1366
1367
1368
1369
1370
1371
1372
1373
1374 void unget()
1375 {
1376 next_unget = true;
1377
1378 --position.chars_read_total;
1379
1380
1381 if (position.chars_read_current_line == 0)
1382 {
1383 if (position.lines_read > 0)
1384 {
1385 --position.lines_read;
1386 }
1387 }
1388 else
1389 {
1390 --position.chars_read_current_line;
1391 }
1392
1393 if (JSON_HEDLEY_LIKELY(current != std::char_traits<char_type>::eof()))
1394 {
1395 JSON_ASSERT(!token_string.empty());
1396 token_string.pop_back();
1397 }
1398 }
1399
1400
1401 void add(char_int_type c)
1402 {
1403 token_buffer.push_back(static_cast<typename string_t::value_type>(c));
1404 }
1405
1406 public:
1407
1408
1409
1410
1411
1412 constexpr number_integer_t get_number_integer() const noexcept
1413 {
1414 return value_integer;
1415 }
1416
1417
1418 constexpr number_unsigned_t get_number_unsigned() const noexcept
1419 {
1420 return value_unsigned;
1421 }
1422
1423
1424 constexpr number_float_t get_number_float() const noexcept
1425 {
1426 return value_float;
1427 }
1428
1429
1430 string_t& get_string()
1431 {
1432 return token_buffer;
1433 }
1434
1435
1436
1437
1438
1439
1440 constexpr position_t get_position() const noexcept
1441 {
1442 return position;
1443 }
1444
1445
1446
1447
1448 std::string get_token_string() const
1449 {
1450
1451 std::string result;
1452 for (const auto c : token_string)
1453 {
1454 if (static_cast<unsigned char>(c) <= '\x1F')
1455 {
1456
1457 std::array<char, 9> cs{{}};
1458 static_cast<void>((std::snprintf)(cs.data(), cs.size(), "<U+%.4X>", static_cast<unsigned char>(c)));
1459 result += cs.data();
1460 }
1461 else
1462 {
1463
1464 result.push_back(static_cast<std::string::value_type>(c));
1465 }
1466 }
1467
1468 return result;
1469 }
1470
1471
1472 JSON_HEDLEY_RETURNS_NON_NULL
1473 constexpr const char* get_error_message() const noexcept
1474 {
1475 return error_message;
1476 }
1477
1478
1479
1480
1481
1482
1483
1484
1485
1486 bool skip_bom()
1487 {
1488 if (get() == 0xEF)
1489 {
1490
1491 return get() == 0xBB && get() == 0xBF;
1492 }
1493
1494
1495
1496 unget();
1497 return true;
1498 }
1499
1500 void skip_whitespace()
1501 {
1502 do
1503 {
1504 get();
1505 }
1506 while (current == ' ' || current == '\t' || current == '\n' || current == '\r');
1507 }
1508
1509 token_type scan()
1510 {
1511
1512 if (position.chars_read_total == 0 && !skip_bom())
1513 {
1514 error_message = "invalid BOM; must be 0xEF 0xBB 0xBF if given";
1515 return token_type::parse_error;
1516 }
1517
1518
1519 skip_whitespace();
1520
1521
1522 while (ignore_comments && current == '/')
1523 {
1524 if (!scan_comment())
1525 {
1526 return token_type::parse_error;
1527 }
1528
1529
1530 skip_whitespace();
1531 }
1532
1533 switch (current)
1534 {
1535
1536 case '[':
1537 return token_type::begin_array;
1538 case ']':
1539 return token_type::end_array;
1540 case '{':
1541 return token_type::begin_object;
1542 case '}':
1543 return token_type::end_object;
1544 case ':':
1545 return token_type::name_separator;
1546 case ',':
1547 return token_type::value_separator;
1548
1549
1550 case 't':
1551 {
1552 std::array<char_type, 4> true_literal = {{static_cast<char_type>('t'), static_cast<char_type>('r'), static_cast<char_type>('u'), static_cast<char_type>('e')}};
1553 return scan_literal(true_literal.data(), true_literal.size(), token_type::literal_true);
1554 }
1555 case 'f':
1556 {
1557 std::array<char_type, 5> false_literal = {{static_cast<char_type>('f'), static_cast<char_type>('a'), static_cast<char_type>('l'), static_cast<char_type>('s'), static_cast<char_type>('e')}};
1558 return scan_literal(false_literal.data(), false_literal.size(), token_type::literal_false);
1559 }
1560 case 'n':
1561 {
1562 std::array<char_type, 4> null_literal = {{static_cast<char_type>('n'), static_cast<char_type>('u'), static_cast<char_type>('l'), static_cast<char_type>('l')}};
1563 return scan_literal(null_literal.data(), null_literal.size(), token_type::literal_null);
1564 }
1565
1566
1567 case '\"':
1568 return scan_string();
1569
1570
1571 case '-':
1572 case '0':
1573 case '1':
1574 case '2':
1575 case '3':
1576 case '4':
1577 case '5':
1578 case '6':
1579 case '7':
1580 case '8':
1581 case '9':
1582 return scan_number();
1583
1584
1585
1586 case '\0':
1587 case std::char_traits<char_type>::eof():
1588 return token_type::end_of_input;
1589
1590
1591 default:
1592 error_message = "invalid literal";
1593 return token_type::parse_error;
1594 }
1595 }
1596
1597 private:
1598
1599 InputAdapterType ia;
1600
1601
1602 const bool ignore_comments = false;
1603
1604
1605 char_int_type current = std::char_traits<char_type>::eof();
1606
1607
1608 bool next_unget = false;
1609
1610
1611 position_t position {};
1612
1613
1614 std::vector<char_type> token_string {};
1615
1616
1617 string_t token_buffer {};
1618
1619
1620 const char* error_message = "";
1621
1622
1623 number_integer_t value_integer = 0;
1624 number_unsigned_t value_unsigned = 0;
1625 number_float_t value_float = 0;
1626
1627
1628 const char_int_type decimal_point_char = '.';
1629 };
1630
1631 }
1632 NLOHMANN_JSON_NAMESPACE_END