File indexing completed on 2026-05-03 08:13:51
0001
0002
0003
0004
0005
0006
0007
0008
0009
0010 #ifndef _LIBCPP___FORMAT_UNICODE_H
0011 #define _LIBCPP___FORMAT_UNICODE_H
0012
0013 #include <__assert>
0014 #include <__bit/countl.h>
0015 #include <__concepts/same_as.h>
0016 #include <__config>
0017 #include <__format/extended_grapheme_cluster_table.h>
0018 #include <__format/indic_conjunct_break_table.h>
0019 #include <__iterator/concepts.h>
0020 #include <__iterator/readable_traits.h> // iter_value_t
0021 #include <__utility/unreachable.h>
0022 #include <string_view>
0023
0024 #if !defined(_LIBCPP_HAS_NO_PRAGMA_SYSTEM_HEADER)
0025 # pragma GCC system_header
0026 #endif
0027
0028 _LIBCPP_BEGIN_NAMESPACE_STD
0029
0030 #if _LIBCPP_STD_VER >= 20
0031
0032 namespace __unicode {
0033
0034
0035
0036
0037
0038
0039
0040
0041
0042
/// Result of consuming one code point from a code unit sequence.
///
/// The code point and the status share a single char32_t worth of storage:
/// 31 bits hold the value and 1 bit holds the status. The static_assert below
/// verifies that the packing does not grow the object.
struct __consume_result {
  // The decoded code point. The error results produced in this header store
  // the replacement character here instead.
  char32_t __code_point : 31;

  enum : char32_t {
    __ok,   // == 0, the code unit sequence was accepted.
    __error // == 1, the code unit sequence was rejected.
  } __status : 1 {__ok};
};
static_assert(sizeof(__consume_result) == sizeof(char32_t));
0056
0057 # if _LIBCPP_HAS_UNICODE
0058
0059
0060
0061
0062
0063
0064
0065
0066
0067
0068
0069
0070
0071
0072 inline constexpr char32_t __replacement_character = U'\ufffd';
0073
0074
0075
0076
0077
0078
0079 inline constexpr __consume_result __consume_result_error{__replacement_character, __consume_result::__error};
0080
0081 [[nodiscard]] _LIBCPP_HIDE_FROM_ABI constexpr bool __is_high_surrogate(char32_t __value) {
0082 return __value >= 0xd800 && __value <= 0xdbff;
0083 }
0084
0085 [[nodiscard]] _LIBCPP_HIDE_FROM_ABI constexpr bool __is_low_surrogate(char32_t __value) {
0086 return __value >= 0xdc00 && __value <= 0xdfff;
0087 }
0088
0089
0090 [[nodiscard]] _LIBCPP_HIDE_FROM_ABI inline constexpr bool __is_surrogate(char32_t __value) {
0091 return __value >= 0xd800 && __value <= 0xdfff;
0092 }
0093
0094
0095 [[nodiscard]] _LIBCPP_HIDE_FROM_ABI inline constexpr bool __is_code_point(char32_t __value) {
0096 return __value <= 0x10ffff;
0097 }
0098
0099
0100 [[nodiscard]] _LIBCPP_HIDE_FROM_ABI inline constexpr bool __is_scalar_value(char32_t __value) {
0101 return __unicode::__is_code_point(__value) && !__unicode::__is_surrogate(__value);
0102 }
0103
0104 template <contiguous_iterator _Iterator>
0105 requires same_as<iter_value_t<_Iterator>, char>
0106 _LIBCPP_HIDE_FROM_ABI constexpr bool __is_continuation(_Iterator __char, int __count) {
0107 do {
0108 if ((*__char & 0b1100'0000) != 0b1000'0000)
0109 return false;
0110 --__count;
0111 ++__char;
0112 } while (__count);
0113 return true;
0114 }
0115
0116
0117
0118
0119
// Primary template of the code point view. In this configuration only the
// explicit specializations that follow provide a definition.
template <typename _CharT>
class __code_point_view;
0122
0123
0124 template <>
0125 class __code_point_view<char> {
0126 using _Iterator _LIBCPP_NODEBUG = basic_string_view<char>::const_iterator;
0127
0128 public:
0129 _LIBCPP_HIDE_FROM_ABI constexpr explicit __code_point_view(_Iterator __first, _Iterator __last)
0130 : __first_(__first), __last_(__last) {}
0131
0132 _LIBCPP_HIDE_FROM_ABI constexpr bool __at_end() const noexcept { return __first_ == __last_; }
0133 _LIBCPP_HIDE_FROM_ABI constexpr _Iterator __position() const noexcept { return __first_; }
0134
0135
0136
0137
0138
0139
0140
0141
0142
0143
0144
0145
0146
0147
0148
0149
0150
0151
0152
0153
0154
0155
0156
0157 [[nodiscard]] _LIBCPP_HIDE_FROM_ABI constexpr __consume_result __consume() noexcept {
0158 _LIBCPP_ASSERT_INTERNAL(__first_ != __last_, "can't move beyond the end of input");
0159
0160
0161
0162
0163 switch (std::countl_one(static_cast<unsigned char>(*__first_))) {
0164 case 0:
0165 return {static_cast<unsigned char>(*__first_++)};
0166
0167 case 2: {
0168 if (__last_ - __first_ < 2 || !__unicode::__is_continuation(__first_ + 1, 1)) [[unlikely]]
0169 break;
0170
0171 char32_t __value = static_cast<unsigned char>(*__first_++) & 0x1f;
0172 __value <<= 6;
0173 __value |= static_cast<unsigned char>(*__first_++) & 0x3f;
0174
0175
0176 if (__value < 0x0080) [[unlikely]]
0177 return __consume_result_error;
0178
0179 return {__value};
0180 }
0181
0182 case 3: {
0183 if (__last_ - __first_ < 3 || !__unicode::__is_continuation(__first_ + 1, 2)) [[unlikely]]
0184 break;
0185
0186 char32_t __value = static_cast<unsigned char>(*__first_++) & 0x0f;
0187 __value <<= 6;
0188 __value |= static_cast<unsigned char>(*__first_++) & 0x3f;
0189 __value <<= 6;
0190 __value |= static_cast<unsigned char>(*__first_++) & 0x3f;
0191
0192
0193 if (__value < 0x0800) [[unlikely]]
0194 return __consume_result_error;
0195
0196
0197 if (__unicode::__is_surrogate(__value)) [[unlikely]]
0198 return __consume_result_error;
0199
0200 return {__value};
0201 }
0202
0203 case 4: {
0204 if (__last_ - __first_ < 4 || !__unicode::__is_continuation(__first_ + 1, 3)) [[unlikely]]
0205 break;
0206
0207 char32_t __value = static_cast<unsigned char>(*__first_++) & 0x07;
0208 __value <<= 6;
0209 __value |= static_cast<unsigned char>(*__first_++) & 0x3f;
0210 __value <<= 6;
0211 __value |= static_cast<unsigned char>(*__first_++) & 0x3f;
0212 __value <<= 6;
0213 __value |= static_cast<unsigned char>(*__first_++) & 0x3f;
0214
0215
0216 if (__value < 0x10000) [[unlikely]]
0217 return __consume_result_error;
0218
0219
0220 if (!__unicode::__is_code_point(__value)) [[unlikely]]
0221 return __consume_result_error;
0222
0223 return {__value};
0224 }
0225 }
0226
0227
0228
0229 ++__first_;
0230 return __consume_result_error;
0231 }
0232
0233 private:
0234 _Iterator __first_;
0235 _Iterator __last_;
0236 };
0237
0238 # if _LIBCPP_HAS_WIDE_CHARACTERS
0239 _LIBCPP_HIDE_FROM_ABI constexpr bool __is_surrogate_pair_high(wchar_t __value) {
0240 return __value >= 0xd800 && __value <= 0xdbff;
0241 }
0242
0243 _LIBCPP_HIDE_FROM_ABI constexpr bool __is_surrogate_pair_low(wchar_t __value) {
0244 return __value >= 0xdc00 && __value <= 0xdfff;
0245 }
0246
0247
0248
0249
0250 template <>
0251 class __code_point_view<wchar_t> {
0252 using _Iterator _LIBCPP_NODEBUG = typename basic_string_view<wchar_t>::const_iterator;
0253
0254 public:
0255 static_assert(sizeof(wchar_t) == 2 || sizeof(wchar_t) == 4, "sizeof(wchar_t) has a not implemented value");
0256
0257 _LIBCPP_HIDE_FROM_ABI constexpr explicit __code_point_view(_Iterator __first, _Iterator __last)
0258 : __first_(__first), __last_(__last) {}
0259
0260 _LIBCPP_HIDE_FROM_ABI constexpr _Iterator __position() const noexcept { return __first_; }
0261 _LIBCPP_HIDE_FROM_ABI constexpr bool __at_end() const noexcept { return __first_ == __last_; }
0262
0263 [[nodiscard]] _LIBCPP_HIDE_FROM_ABI constexpr __consume_result __consume() noexcept {
0264 _LIBCPP_ASSERT_INTERNAL(__first_ != __last_, "can't move beyond the end of input");
0265
0266 char32_t __value = static_cast<char32_t>(*__first_++);
0267 if constexpr (sizeof(wchar_t) == 2) {
0268 if (__unicode::__is_low_surrogate(__value)) [[unlikely]]
0269 return __consume_result_error;
0270
0271 if (__unicode::__is_high_surrogate(__value)) {
0272 if (__first_ == __last_ || !__unicode::__is_low_surrogate(static_cast<char32_t>(*__first_))) [[unlikely]]
0273 return __consume_result_error;
0274
0275 __value -= 0xd800;
0276 __value <<= 10;
0277 __value += static_cast<char32_t>(*__first_++) - 0xdc00;
0278 __value += 0x10000;
0279
0280 if (!__unicode::__is_code_point(__value)) [[unlikely]]
0281 return __consume_result_error;
0282 }
0283 } else {
0284 if (!__unicode::__is_scalar_value(__value)) [[unlikely]]
0285 return __consume_result_error;
0286 }
0287
0288 return {__value};
0289 }
0290
0291 private:
0292 _Iterator __first_;
0293 _Iterator __last_;
0294 };
0295 # endif
0296
0297
0298
0299
0300
0301
0302 class __extended_grapheme_cluster_break {
0303 using __EGC_property _LIBCPP_NODEBUG = __extended_grapheme_custer_property_boundary::__property;
0304 using __inCB_property _LIBCPP_NODEBUG = __indic_conjunct_break::__property;
0305
0306 public:
0307 _LIBCPP_HIDE_FROM_ABI constexpr explicit __extended_grapheme_cluster_break(char32_t __first_code_point)
0308 : __prev_code_point_(__first_code_point),
0309 __prev_property_(__extended_grapheme_custer_property_boundary::__get_property(__first_code_point)) {
0310
0311 if (__prev_property_ == __EGC_property::__Extended_Pictographic)
0312 __active_rule_ = __rule::__GB11_emoji;
0313 else if (__prev_property_ == __EGC_property::__Regional_Indicator)
0314 __active_rule_ = __rule::__GB12_GB13_regional_indicator;
0315 else if (__indic_conjunct_break::__get_property(__first_code_point) == __inCB_property::__Consonant)
0316 __active_rule_ = __rule::__GB9c_indic_conjunct_break;
0317 }
0318
0319 [[nodiscard]] _LIBCPP_HIDE_FROM_ABI constexpr bool operator()(char32_t __next_code_point) {
0320 __EGC_property __next_property = __extended_grapheme_custer_property_boundary::__get_property(__next_code_point);
0321 bool __result = __evaluate(__next_code_point, __next_property);
0322 __prev_code_point_ = __next_code_point;
0323 __prev_property_ = __next_property;
0324 return __result;
0325 }
0326
0327
0328
0329 [[nodiscard]] _LIBCPP_HIDE_FROM_ABI constexpr char32_t __current_code_point() const { return __prev_code_point_; }
0330
0331 private:
0332
0333
0334
0335 [[nodiscard]] _LIBCPP_HIDE_FROM_ABI constexpr bool
0336 __evaluate(char32_t __next_code_point, __EGC_property __next_property) {
0337 switch (__active_rule_) {
0338 case __rule::__none:
0339 return __evaluate_none(__next_code_point, __next_property);
0340 case __rule::__GB9c_indic_conjunct_break:
0341 return __evaluate_GB9c_indic_conjunct_break(__next_code_point, __next_property);
0342 case __rule::__GB11_emoji:
0343 return __evaluate_GB11_emoji(__next_code_point, __next_property);
0344 case __rule::__GB12_GB13_regional_indicator:
0345 return __evaluate_GB12_GB13_regional_indicator(__next_code_point, __next_property);
0346 }
0347 __libcpp_unreachable();
0348 }
0349
0350 _LIBCPP_HIDE_FROM_ABI constexpr bool __evaluate_none(char32_t __next_code_point, __EGC_property __next_property) {
0351
0352
0353 _LIBCPP_ASSERT_INTERNAL(__prev_property_ != __EGC_property::__sot, "should be handled in the constructor");
0354 _LIBCPP_ASSERT_INTERNAL(__prev_property_ != __EGC_property::__eot, "should be handled by our caller");
0355
0356
0357 if (__prev_property_ == __EGC_property::__CR && __next_property == __EGC_property::__LF)
0358 return false;
0359
0360 if (__prev_property_ == __EGC_property::__Control || __prev_property_ == __EGC_property::__CR ||
0361 __prev_property_ == __EGC_property::__LF)
0362 return true;
0363
0364 if (__next_property == __EGC_property::__Control || __next_property == __EGC_property::__CR ||
0365 __next_property == __EGC_property::__LF)
0366 return true;
0367
0368
0369 if (__prev_property_ == __EGC_property::__L &&
0370 (__next_property == __EGC_property::__L || __next_property == __EGC_property::__V ||
0371 __next_property == __EGC_property::__LV || __next_property == __EGC_property::__LVT))
0372 return false;
0373
0374 if ((__prev_property_ == __EGC_property::__LV || __prev_property_ == __EGC_property::__V) &&
0375 (__next_property == __EGC_property::__V || __next_property == __EGC_property::__T))
0376 return false;
0377
0378 if ((__prev_property_ == __EGC_property::__LVT || __prev_property_ == __EGC_property::__T) &&
0379 __next_property == __EGC_property::__T)
0380 return false;
0381
0382
0383 if (__next_property == __EGC_property::__Extend || __next_property == __EGC_property::__ZWJ)
0384 return false;
0385
0386
0387 if (__next_property == __EGC_property::__SpacingMark)
0388 return false;
0389
0390 if (__prev_property_ == __EGC_property::__Prepend)
0391 return false;
0392
0393
0394 if (__indic_conjunct_break::__get_property(__next_code_point) == __inCB_property::__Consonant) {
0395 __active_rule_ = __rule::__GB9c_indic_conjunct_break;
0396 __GB9c_indic_conjunct_break_state_ = __GB9c_indic_conjunct_break_state::__Consonant;
0397 return true;
0398 }
0399
0400
0401 if (__next_property == __EGC_property::__Extended_Pictographic) {
0402 __active_rule_ = __rule::__GB11_emoji;
0403 __GB11_emoji_state_ = __GB11_emoji_state::__Extended_Pictographic;
0404 return true;
0405 }
0406
0407
0408
0409
0410
0411 if (__next_property == __EGC_property::__Regional_Indicator) {
0412 __active_rule_ = __rule::__GB12_GB13_regional_indicator;
0413 return true;
0414 }
0415
0416
0417 return true;
0418 }
0419
0420 [[nodiscard]] _LIBCPP_HIDE_FROM_ABI constexpr bool
0421 __evaluate_GB9c_indic_conjunct_break(char32_t __next_code_point, __EGC_property __next_property) {
0422 __inCB_property __break = __indic_conjunct_break::__get_property(__next_code_point);
0423 if (__break == __inCB_property::__none) {
0424 __active_rule_ = __rule::__none;
0425 return __evaluate_none(__next_code_point, __next_property);
0426 }
0427
0428 switch (__GB9c_indic_conjunct_break_state_) {
0429 case __GB9c_indic_conjunct_break_state::__Consonant:
0430 if (__break == __inCB_property::__Extend) {
0431 return false;
0432 }
0433 if (__break == __inCB_property::__Linker) {
0434 __GB9c_indic_conjunct_break_state_ = __GB9c_indic_conjunct_break_state::__Linker;
0435 return false;
0436 }
0437 __active_rule_ = __rule::__none;
0438 return __evaluate_none(__next_code_point, __next_property);
0439
0440 case __GB9c_indic_conjunct_break_state::__Linker:
0441 if (__break == __inCB_property::__Extend) {
0442 return false;
0443 }
0444 if (__break == __inCB_property::__Linker) {
0445 return false;
0446 }
0447 if (__break == __inCB_property::__Consonant) {
0448 __GB9c_indic_conjunct_break_state_ = __GB9c_indic_conjunct_break_state::__Consonant;
0449 return false;
0450 }
0451 __active_rule_ = __rule::__none;
0452 return __evaluate_none(__next_code_point, __next_property);
0453 }
0454 __libcpp_unreachable();
0455 }
0456
0457 [[nodiscard]] _LIBCPP_HIDE_FROM_ABI constexpr bool
0458 __evaluate_GB11_emoji(char32_t __next_code_point, __EGC_property __next_property) {
0459 switch (__GB11_emoji_state_) {
0460 case __GB11_emoji_state::__Extended_Pictographic:
0461 if (__next_property == __EGC_property::__Extend) {
0462 __GB11_emoji_state_ = __GB11_emoji_state::__Extend;
0463 return false;
0464 }
0465 [[fallthrough]];
0466 case __GB11_emoji_state::__Extend:
0467 if (__next_property == __EGC_property::__ZWJ) {
0468 __GB11_emoji_state_ = __GB11_emoji_state::__ZWJ;
0469 return false;
0470 }
0471 if (__next_property == __EGC_property::__Extend)
0472 return false;
0473 __active_rule_ = __rule::__none;
0474 return __evaluate_none(__next_code_point, __next_property);
0475
0476 case __GB11_emoji_state::__ZWJ:
0477 if (__next_property == __EGC_property::__Extended_Pictographic) {
0478 __GB11_emoji_state_ = __GB11_emoji_state::__Extended_Pictographic;
0479 return false;
0480 }
0481 __active_rule_ = __rule::__none;
0482 return __evaluate_none(__next_code_point, __next_property);
0483 }
0484 __libcpp_unreachable();
0485 }
0486
0487 [[nodiscard]] _LIBCPP_HIDE_FROM_ABI constexpr bool
0488 __evaluate_GB12_GB13_regional_indicator(char32_t __next_code_point, __EGC_property __next_property) {
0489 __active_rule_ = __rule::__none;
0490 if (__next_property == __EGC_property::__Regional_Indicator)
0491 return false;
0492 return __evaluate_none(__next_code_point, __next_property);
0493 }
0494
0495 char32_t __prev_code_point_;
0496 __EGC_property __prev_property_;
0497
0498 enum class __rule {
0499 __none,
0500 __GB9c_indic_conjunct_break,
0501 __GB11_emoji,
0502 __GB12_GB13_regional_indicator,
0503 };
0504 __rule __active_rule_ = __rule::__none;
0505
0506 enum class __GB11_emoji_state {
0507 __Extended_Pictographic,
0508 __Extend,
0509 __ZWJ,
0510 };
0511 __GB11_emoji_state __GB11_emoji_state_ = __GB11_emoji_state::__Extended_Pictographic;
0512
0513 enum class __GB9c_indic_conjunct_break_state {
0514 __Consonant,
0515 __Linker,
0516 };
0517
0518 __GB9c_indic_conjunct_break_state __GB9c_indic_conjunct_break_state_ = __GB9c_indic_conjunct_break_state::__Consonant;
0519
0520
0521 };
0522
0523
0524
0525
0526
0527
0528 template <class _CharT>
0529 class __extended_grapheme_cluster_view {
0530 using _Iterator _LIBCPP_NODEBUG = typename basic_string_view<_CharT>::const_iterator;
0531
0532 public:
0533 _LIBCPP_HIDE_FROM_ABI constexpr explicit __extended_grapheme_cluster_view(_Iterator __first, _Iterator __last)
0534 : __code_point_view_(__first, __last), __at_break_(__code_point_view_.__consume().__code_point) {}
0535
0536 struct __cluster {
0537
0538
0539
0540
0541 char32_t __code_point_;
0542
0543
0544
0545
0546
0547 _Iterator __last_;
0548 };
0549
0550 [[nodiscard]] _LIBCPP_HIDE_FROM_ABI constexpr __cluster __consume() {
0551 char32_t __code_point = __at_break_.__current_code_point();
0552 _Iterator __position = __code_point_view_.__position();
0553 while (!__code_point_view_.__at_end()) {
0554 if (__at_break_(__code_point_view_.__consume().__code_point))
0555 break;
0556 __position = __code_point_view_.__position();
0557 }
0558 return {__code_point, __position};
0559 }
0560
0561 private:
0562 __code_point_view<_CharT> __code_point_view_;
0563 __extended_grapheme_cluster_break __at_break_;
0564 };
0565
0566 template <contiguous_iterator _Iterator>
0567 __extended_grapheme_cluster_view(_Iterator, _Iterator) -> __extended_grapheme_cluster_view<iter_value_t<_Iterator>>;
0568
0569 # else
0570
0571
0572
0573 template <class _CharT>
0574 class __code_point_view {
0575 using _Iterator _LIBCPP_NODEBUG = typename basic_string_view<_CharT>::const_iterator;
0576
0577 public:
0578 _LIBCPP_HIDE_FROM_ABI constexpr explicit __code_point_view(_Iterator __first, _Iterator __last)
0579 : __first_(__first), __last_(__last) {}
0580
0581 _LIBCPP_HIDE_FROM_ABI constexpr bool __at_end() const noexcept { return __first_ == __last_; }
0582 _LIBCPP_HIDE_FROM_ABI constexpr _Iterator __position() const noexcept { return __first_; }
0583
0584 [[nodiscard]] _LIBCPP_HIDE_FROM_ABI constexpr __consume_result __consume() noexcept {
0585 _LIBCPP_ASSERT_INTERNAL(__first_ != __last_, "can't move beyond the end of input");
0586 return {static_cast<char32_t>(*__first_++)};
0587 }
0588
0589 private:
0590 _Iterator __first_;
0591 _Iterator __last_;
0592 };
0593
0594 # endif
0595
0596 }
0597
0598 #endif
0599
0600 _LIBCPP_END_NAMESPACE_STD
0601
0602 #endif