File indexing completed on 2025-11-19 09:50:43
0001 #ifndef Py_CPYTHON_UNICODEOBJECT_H
0002 # error "this header file must not be included directly"
0003 #endif
0004
0005
0006
0007
0008
0009 Py_DEPRECATED(3.13) typedef wchar_t PY_UNICODE_TYPE;
0010 Py_DEPRECATED(3.13) typedef wchar_t Py_UNICODE;
0011
0012
0013
0014
0015
0016 static inline int Py_UNICODE_IS_SURROGATE(Py_UCS4 ch) {
0017 return (0xD800 <= ch && ch <= 0xDFFF);
0018 }
0019 static inline int Py_UNICODE_IS_HIGH_SURROGATE(Py_UCS4 ch) {
0020 return (0xD800 <= ch && ch <= 0xDBFF);
0021 }
0022 static inline int Py_UNICODE_IS_LOW_SURROGATE(Py_UCS4 ch) {
0023 return (0xDC00 <= ch && ch <= 0xDFFF);
0024 }
0025
0026
0027 static inline Py_UCS4 Py_UNICODE_JOIN_SURROGATES(Py_UCS4 high, Py_UCS4 low) {
0028 assert(Py_UNICODE_IS_HIGH_SURROGATE(high));
0029 assert(Py_UNICODE_IS_LOW_SURROGATE(low));
0030 return 0x10000 + (((high & 0x03FF) << 10) | (low & 0x03FF));
0031 }
0032
0033
0034
0035 static inline Py_UCS4 Py_UNICODE_HIGH_SURROGATE(Py_UCS4 ch) {
0036 assert(0x10000 <= ch && ch <= 0x10ffff);
0037 return (0xD800 - (0x10000 >> 10) + (ch >> 10));
0038 }
0039
0040
0041
0042 static inline Py_UCS4 Py_UNICODE_LOW_SURROGATE(Py_UCS4 ch) {
0043 assert(0x10000 <= ch && ch <= 0x10ffff);
0044 return (0xDC00 + (ch & 0x3FF));
0045 }
0046
0047
0048
0049
0050
0051
0052
0053
0054 typedef struct {
0055
0056
0057
0058
0059
0060
0061
0062
0063
0064
0065
0066
0067
0068
0069
0070
0071
0072
0073
0074
0075
0076
0077
0078
0079
0080
0081
0082
0083
0084
0085
0086
0087
0088
0089
0090
0091
0092
0093
0094
0095
0096
0097
0098
0099 PyObject_HEAD
0100 Py_ssize_t length;
0101 Py_hash_t hash;
0102 struct {
0103
0104
0105
0106
0107
0108
0109
0110
0111
0112 unsigned int interned:2;
0113
0114
0115
0116
0117
0118
0119
0120
0121
0122
0123
0124
0125
0126
0127
0128
0129
0130
0131
0132
0133
0134
0135 unsigned int kind:3;
0136
0137
0138
0139
0140 unsigned int compact:1;
0141
0142
0143
0144 unsigned int ascii:1;
0145
0146 unsigned int statically_allocated:1;
0147
0148
0149 unsigned int :24;
0150 } state;
0151 } PyASCIIObject;
0152
0153
0154
0155
0156 typedef struct {
0157 PyASCIIObject _base;
0158 Py_ssize_t utf8_length;
0159
0160 char *utf8;
0161 } PyCompactUnicodeObject;
0162
0163
0164 typedef struct {
0165 PyCompactUnicodeObject _base;
0166 union {
0167 void *any;
0168 Py_UCS1 *latin1;
0169 Py_UCS2 *ucs2;
0170 Py_UCS4 *ucs4;
0171 } data;
0172 } PyUnicodeObject;
0173
0174
/* Pointer casts to the three string layouts.

   Each macro is a comma expression: it first asserts PyUnicode_Check(op)
   and then yields `op` converted with _Py_CAST.  Note that `op` appears
   twice in the expansion. */
#define _PyASCIIObject_CAST(op)             \
    (assert(PyUnicode_Check(op)),           \
     _Py_CAST(PyASCIIObject*, (op)))

#define _PyCompactUnicodeObject_CAST(op)    \
    (assert(PyUnicode_Check(op)),           \
     _Py_CAST(PyCompactUnicodeObject*, (op)))

#define _PyUnicodeObject_CAST(op)           \
    (assert(PyUnicode_Check(op)),           \
     _Py_CAST(PyUnicodeObject*, (op)))
0184
0185
0186
0187
0188
0189
0190
/* Named values 0..3.  NOTE(review): presumably the legal contents of the
   two-bit PyASCIIObject `state.interned` field -- confirm. */
#define SSTATE_NOT_INTERNED              0
#define SSTATE_INTERNED_MORTAL           1
#define SSTATE_INTERNED_IMMORTAL         2
#define SSTATE_INTERNED_IMMORTAL_STATIC  3
0195
0196
0197 static inline unsigned int PyUnicode_CHECK_INTERNED(PyObject *op) {
0198 return _PyASCIIObject_CAST(op)->state.interned;
0199 }
0200 #define PyUnicode_CHECK_INTERNED(op) PyUnicode_CHECK_INTERNED(_PyObject_CAST(op))
0201
0202
0203 static inline unsigned int PyUnicode_IS_READY(PyObject* Py_UNUSED(op)) {
0204 return 1;
0205 }
0206 #define PyUnicode_IS_READY(op) PyUnicode_IS_READY(_PyObject_CAST(op))
0207
0208
0209
0210
0211 static inline unsigned int PyUnicode_IS_ASCII(PyObject *op) {
0212 return _PyASCIIObject_CAST(op)->state.ascii;
0213 }
0214 #define PyUnicode_IS_ASCII(op) PyUnicode_IS_ASCII(_PyObject_CAST(op))
0215
0216
0217
0218 static inline unsigned int PyUnicode_IS_COMPACT(PyObject *op) {
0219 return _PyASCIIObject_CAST(op)->state.compact;
0220 }
0221 #define PyUnicode_IS_COMPACT(op) PyUnicode_IS_COMPACT(_PyObject_CAST(op))
0222
0223
0224
0225 static inline int PyUnicode_IS_COMPACT_ASCII(PyObject *op) {
0226 return (_PyASCIIObject_CAST(op)->state.ascii && PyUnicode_IS_COMPACT(op));
0227 }
0228 #define PyUnicode_IS_COMPACT_ASCII(op) PyUnicode_IS_COMPACT_ASCII(_PyObject_CAST(op))
0229
/* Kind codes compared against the `kind` argument/field by the accessors
   in this header.  PyUnicode_WRITE()/PyUnicode_READ() index the data as
   Py_UCS1, Py_UCS2 or Py_UCS4 elements for the values 1, 2 and 4. */
enum PyUnicode_Kind {
    PyUnicode_1BYTE_KIND = 1,   /* data accessed as Py_UCS1 */
    PyUnicode_2BYTE_KIND = 2,   /* data accessed as Py_UCS2 */
    PyUnicode_4BYTE_KIND = 4    /* data accessed as Py_UCS4 */
};
0236
0237
0238
0239
0240
0241
0242
/* Yield the `state.kind` bit-field of `op`, wrapped in _Py_RVALUE.
   The cast macro asserts PyUnicode_Check(op) in debug builds. */
#define PyUnicode_KIND(op) \
    _Py_RVALUE(_PyASCIIObject_CAST(op)->state.kind)
0244
0245
0246 static inline void* _PyUnicode_COMPACT_DATA(PyObject *op) {
0247 if (PyUnicode_IS_ASCII(op)) {
0248 return _Py_STATIC_CAST(void*, (_PyASCIIObject_CAST(op) + 1));
0249 }
0250 return _Py_STATIC_CAST(void*, (_PyCompactUnicodeObject_CAST(op) + 1));
0251 }
0252
0253 static inline void* _PyUnicode_NONCOMPACT_DATA(PyObject *op) {
0254 void *data;
0255 assert(!PyUnicode_IS_COMPACT(op));
0256 data = _PyUnicodeObject_CAST(op)->data.any;
0257 assert(data != NULL);
0258 return data;
0259 }
0260
0261 static inline void* PyUnicode_DATA(PyObject *op) {
0262 if (PyUnicode_IS_COMPACT(op)) {
0263 return _PyUnicode_COMPACT_DATA(op);
0264 }
0265 return _PyUnicode_NONCOMPACT_DATA(op);
0266 }
0267 #define PyUnicode_DATA(op) PyUnicode_DATA(_PyObject_CAST(op))
0268
0269
0270
0271
0272
0273
/* PyUnicode_DATA(op) converted to a typed element pointer
   (Py_UCS1*, Py_UCS2* or Py_UCS4*).  No check of the string's kind is
   performed by these macros. */
#define PyUnicode_1BYTE_DATA(op) \
    _Py_STATIC_CAST(Py_UCS1*, PyUnicode_DATA(op))
#define PyUnicode_2BYTE_DATA(op) \
    _Py_STATIC_CAST(Py_UCS2*, PyUnicode_DATA(op))
#define PyUnicode_4BYTE_DATA(op) \
    _Py_STATIC_CAST(Py_UCS4*, PyUnicode_DATA(op))
0277
0278
0279 static inline Py_ssize_t PyUnicode_GET_LENGTH(PyObject *op) {
0280 return _PyASCIIObject_CAST(op)->length;
0281 }
0282 #define PyUnicode_GET_LENGTH(op) PyUnicode_GET_LENGTH(_PyObject_CAST(op))
0283
0284
0285
0286
0287
0288
0289 static inline void PyUnicode_WRITE(int kind, void *data,
0290 Py_ssize_t index, Py_UCS4 value)
0291 {
0292 assert(index >= 0);
0293 if (kind == PyUnicode_1BYTE_KIND) {
0294 assert(value <= 0xffU);
0295 _Py_STATIC_CAST(Py_UCS1*, data)[index] = _Py_STATIC_CAST(Py_UCS1, value);
0296 }
0297 else if (kind == PyUnicode_2BYTE_KIND) {
0298 assert(value <= 0xffffU);
0299 _Py_STATIC_CAST(Py_UCS2*, data)[index] = _Py_STATIC_CAST(Py_UCS2, value);
0300 }
0301 else {
0302 assert(kind == PyUnicode_4BYTE_KIND);
0303 assert(value <= 0x10ffffU);
0304 _Py_STATIC_CAST(Py_UCS4*, data)[index] = value;
0305 }
0306 }
0307 #define PyUnicode_WRITE(kind, data, index, value) \
0308 PyUnicode_WRITE(_Py_STATIC_CAST(int, kind), _Py_CAST(void*, data), \
0309 (index), _Py_STATIC_CAST(Py_UCS4, value))
0310
0311
0312
0313 static inline Py_UCS4 PyUnicode_READ(int kind,
0314 const void *data, Py_ssize_t index)
0315 {
0316 assert(index >= 0);
0317 if (kind == PyUnicode_1BYTE_KIND) {
0318 return _Py_STATIC_CAST(const Py_UCS1*, data)[index];
0319 }
0320 if (kind == PyUnicode_2BYTE_KIND) {
0321 return _Py_STATIC_CAST(const Py_UCS2*, data)[index];
0322 }
0323 assert(kind == PyUnicode_4BYTE_KIND);
0324 return _Py_STATIC_CAST(const Py_UCS4*, data)[index];
0325 }
0326 #define PyUnicode_READ(kind, data, index) \
0327 PyUnicode_READ(_Py_STATIC_CAST(int, kind), \
0328 _Py_STATIC_CAST(const void*, data), \
0329 (index))
0330
0331
0332
0333
0334
0335 static inline Py_UCS4 PyUnicode_READ_CHAR(PyObject *unicode, Py_ssize_t index)
0336 {
0337 int kind;
0338
0339 assert(index >= 0);
0340
0341 assert(index <= PyUnicode_GET_LENGTH(unicode));
0342
0343 kind = PyUnicode_KIND(unicode);
0344 if (kind == PyUnicode_1BYTE_KIND) {
0345 return PyUnicode_1BYTE_DATA(unicode)[index];
0346 }
0347 if (kind == PyUnicode_2BYTE_KIND) {
0348 return PyUnicode_2BYTE_DATA(unicode)[index];
0349 }
0350 assert(kind == PyUnicode_4BYTE_KIND);
0351 return PyUnicode_4BYTE_DATA(unicode)[index];
0352 }
0353 #define PyUnicode_READ_CHAR(unicode, index) \
0354 PyUnicode_READ_CHAR(_PyObject_CAST(unicode), (index))
0355
0356
0357
0358
0359 static inline Py_UCS4 PyUnicode_MAX_CHAR_VALUE(PyObject *op)
0360 {
0361 int kind;
0362
0363 if (PyUnicode_IS_ASCII(op)) {
0364 return 0x7fU;
0365 }
0366
0367 kind = PyUnicode_KIND(op);
0368 if (kind == PyUnicode_1BYTE_KIND) {
0369 return 0xffU;
0370 }
0371 if (kind == PyUnicode_2BYTE_KIND) {
0372 return 0xffffU;
0373 }
0374 assert(kind == PyUnicode_4BYTE_KIND);
0375 return 0x10ffffU;
0376 }
0377 #define PyUnicode_MAX_CHAR_VALUE(op) \
0378 PyUnicode_MAX_CHAR_VALUE(_PyObject_CAST(op))
0379
0380
0381
0382
0383
0384
0385
0386 PyAPI_FUNC(PyObject*) PyUnicode_New(
0387 Py_ssize_t size,
0388 Py_UCS4 maxchar
0389 );
0390
0391
0392 static inline int PyUnicode_READY(PyObject* Py_UNUSED(op))
0393 {
0394 return 0;
0395 }
0396 #define PyUnicode_READY(op) PyUnicode_READY(_PyObject_CAST(op))
0397
0398
0399
0400
0401
0402
0403
0404
0405
0406
0407
0408
0409
0410
0411
0412
0413
0414
0415
0416 PyAPI_FUNC(Py_ssize_t) PyUnicode_CopyCharacters(
0417 PyObject *to,
0418 Py_ssize_t to_start,
0419 PyObject *from,
0420 Py_ssize_t from_start,
0421 Py_ssize_t how_many
0422 );
0423
0424
0425
0426
0427
0428
0429
0430
0431
0432 PyAPI_FUNC(Py_ssize_t) PyUnicode_Fill(
0433 PyObject *unicode,
0434 Py_ssize_t start,
0435 Py_ssize_t length,
0436 Py_UCS4 fill_char
0437 );
0438
0439
0440
0441 PyAPI_FUNC(PyObject*) PyUnicode_FromKindAndData(
0442 int kind,
0443 const void *buffer,
0444 Py_ssize_t size);
0445
0446
0447
0448
0449 typedef struct {
0450 PyObject *buffer;
0451 void *data;
0452 int kind;
0453 Py_UCS4 maxchar;
0454 Py_ssize_t size;
0455 Py_ssize_t pos;
0456
0457
0458 Py_ssize_t min_length;
0459
0460
0461 Py_UCS4 min_char;
0462
0463
0464 unsigned char overallocate;
0465
0466
0467
0468 unsigned char readonly;
0469 } _PyUnicodeWriter ;
0470
0471
0472
0473
0474
0475
0476 PyAPI_FUNC(void)
0477 _PyUnicodeWriter_Init(_PyUnicodeWriter *writer);
0478
0479
0480
0481
0482
0483 #define _PyUnicodeWriter_Prepare(WRITER, LENGTH, MAXCHAR) \
0484 (((MAXCHAR) <= (WRITER)->maxchar \
0485 && (LENGTH) <= (WRITER)->size - (WRITER)->pos) \
0486 ? 0 \
0487 : (((LENGTH) == 0) \
0488 ? 0 \
0489 : _PyUnicodeWriter_PrepareInternal((WRITER), (LENGTH), (MAXCHAR))))
0490
0491
0492
0493 PyAPI_FUNC(int)
0494 _PyUnicodeWriter_PrepareInternal(_PyUnicodeWriter *writer,
0495 Py_ssize_t length, Py_UCS4 maxchar);
0496
0497
0498
0499
0500
0501
0502 #define _PyUnicodeWriter_PrepareKind(WRITER, KIND) \
0503 ((KIND) <= (WRITER)->kind \
0504 ? 0 \
0505 : _PyUnicodeWriter_PrepareKindInternal((WRITER), (KIND)))
0506
0507
0508
0509 PyAPI_FUNC(int)
0510 _PyUnicodeWriter_PrepareKindInternal(_PyUnicodeWriter *writer,
0511 int kind);
0512
0513
0514
0515 PyAPI_FUNC(int)
0516 _PyUnicodeWriter_WriteChar(_PyUnicodeWriter *writer,
0517 Py_UCS4 ch
0518 );
0519
0520
0521
0522 PyAPI_FUNC(int)
0523 _PyUnicodeWriter_WriteStr(_PyUnicodeWriter *writer,
0524 PyObject *str
0525 );
0526
0527
0528
0529 PyAPI_FUNC(int)
0530 _PyUnicodeWriter_WriteSubstring(_PyUnicodeWriter *writer,
0531 PyObject *str,
0532 Py_ssize_t start,
0533 Py_ssize_t end
0534 );
0535
0536
0537
0538 PyAPI_FUNC(int)
0539 _PyUnicodeWriter_WriteASCIIString(_PyUnicodeWriter *writer,
0540 const char *str,
0541 Py_ssize_t len
0542 );
0543
0544
0545
0546 PyAPI_FUNC(int)
0547 _PyUnicodeWriter_WriteLatin1String(_PyUnicodeWriter *writer,
0548 const char *str,
0549 Py_ssize_t len
0550 );
0551
0552
0553
0554
0555 PyAPI_FUNC(PyObject *)
0556 _PyUnicodeWriter_Finish(_PyUnicodeWriter *writer);
0557
0558
0559 PyAPI_FUNC(void)
0560 _PyUnicodeWriter_Dealloc(_PyUnicodeWriter *writer);
0561
0562
0563
0564
0565
0566
0567
0568
0569
0570
0571
0572
0573
0574
0575
0576
0577
0578 PyAPI_FUNC(const char *) PyUnicode_AsUTF8(PyObject *unicode);
0579
0580
0581 #define _PyUnicode_AsString PyUnicode_AsUTF8
0582
0583
0584
0585
0586
0587
0588
0589
0590
0591
0592
0593 PyAPI_FUNC(int) _PyUnicode_IsLowercase(
0594 Py_UCS4 ch
0595 );
0596
0597 PyAPI_FUNC(int) _PyUnicode_IsUppercase(
0598 Py_UCS4 ch
0599 );
0600
0601 PyAPI_FUNC(int) _PyUnicode_IsTitlecase(
0602 Py_UCS4 ch
0603 );
0604
0605 PyAPI_FUNC(int) _PyUnicode_IsWhitespace(
0606 const Py_UCS4 ch
0607 );
0608
0609 PyAPI_FUNC(int) _PyUnicode_IsLinebreak(
0610 const Py_UCS4 ch
0611 );
0612
0613 PyAPI_FUNC(Py_UCS4) _PyUnicode_ToLowercase(
0614 Py_UCS4 ch
0615 );
0616
0617 PyAPI_FUNC(Py_UCS4) _PyUnicode_ToUppercase(
0618 Py_UCS4 ch
0619 );
0620
0621 PyAPI_FUNC(Py_UCS4) _PyUnicode_ToTitlecase(
0622 Py_UCS4 ch
0623 );
0624
0625 PyAPI_FUNC(int) _PyUnicode_ToDecimalDigit(
0626 Py_UCS4 ch
0627 );
0628
0629 PyAPI_FUNC(int) _PyUnicode_ToDigit(
0630 Py_UCS4 ch
0631 );
0632
0633 PyAPI_FUNC(double) _PyUnicode_ToNumeric(
0634 Py_UCS4 ch
0635 );
0636
0637 PyAPI_FUNC(int) _PyUnicode_IsDecimalDigit(
0638 Py_UCS4 ch
0639 );
0640
0641 PyAPI_FUNC(int) _PyUnicode_IsDigit(
0642 Py_UCS4 ch
0643 );
0644
0645 PyAPI_FUNC(int) _PyUnicode_IsNumeric(
0646 Py_UCS4 ch
0647 );
0648
0649 PyAPI_FUNC(int) _PyUnicode_IsPrintable(
0650 Py_UCS4 ch
0651 );
0652
0653 PyAPI_FUNC(int) _PyUnicode_IsAlpha(
0654 Py_UCS4 ch
0655 );
0656
0657
0658 PyAPI_DATA(const unsigned char) _Py_ascii_whitespace[];
0659
0660
0661
0662
0663
0664 static inline int Py_UNICODE_ISSPACE(Py_UCS4 ch) {
0665 if (ch < 128) {
0666 return _Py_ascii_whitespace[ch];
0667 }
0668 return _PyUnicode_IsWhitespace(ch);
0669 }
0670
/* Macro spellings of the per-character functions.  Each one expands to a
   single call of the _PyUnicode_* function named on its right. */

/* Case and line-break predicates. */
#define Py_UNICODE_ISLOWER(ch)      _PyUnicode_IsLowercase(ch)
#define Py_UNICODE_ISUPPER(ch)      _PyUnicode_IsUppercase(ch)
#define Py_UNICODE_ISTITLE(ch)      _PyUnicode_IsTitlecase(ch)
#define Py_UNICODE_ISLINEBREAK(ch)  _PyUnicode_IsLinebreak(ch)

/* Case conversions. */
#define Py_UNICODE_TOLOWER(ch)      _PyUnicode_ToLowercase(ch)
#define Py_UNICODE_TOUPPER(ch)      _PyUnicode_ToUppercase(ch)
#define Py_UNICODE_TOTITLE(ch)      _PyUnicode_ToTitlecase(ch)

/* Numeric and printable predicates. */
#define Py_UNICODE_ISDECIMAL(ch)    _PyUnicode_IsDecimalDigit(ch)
#define Py_UNICODE_ISDIGIT(ch)      _PyUnicode_IsDigit(ch)
#define Py_UNICODE_ISNUMERIC(ch)    _PyUnicode_IsNumeric(ch)
#define Py_UNICODE_ISPRINTABLE(ch)  _PyUnicode_IsPrintable(ch)

/* Numeric value lookups. */
#define Py_UNICODE_TODECIMAL(ch)    _PyUnicode_ToDecimalDigit(ch)
#define Py_UNICODE_TODIGIT(ch)      _PyUnicode_ToDigit(ch)
#define Py_UNICODE_TONUMERIC(ch)    _PyUnicode_ToNumeric(ch)

/* Alphabetic predicate. */
#define Py_UNICODE_ISALPHA(ch)      _PyUnicode_IsAlpha(ch)
0690
0691 static inline int Py_UNICODE_ISALNUM(Py_UCS4 ch) {
0692 return (Py_UNICODE_ISALPHA(ch)
0693 || Py_UNICODE_ISDECIMAL(ch)
0694 || Py_UNICODE_ISDIGIT(ch)
0695 || Py_UNICODE_ISNUMERIC(ch));
0696 }
0697
0698
0699
0700
0701
0702
0703 PyAPI_FUNC(PyObject*) _PyUnicode_FromId(_Py_Identifier*);