python3.13/cpython/unicodeobject.h

0001 #ifndef Py_CPYTHON_UNICODEOBJECT_H
0002 #  error "this header file must not be included directly"
0003 #endif
0004
0005 /* Py_UNICODE was the native Unicode storage format (code unit) used by
0006    Python and represents a single Unicode element in the Unicode type.
0007    With PEP 393, Py_UNICODE is deprecated and replaced with a
0008    typedef to wchar_t. */
0009 Py_DEPRECATED(3.13) typedef wchar_t PY_UNICODE_TYPE;
0010 Py_DEPRECATED(3.13) typedef wchar_t Py_UNICODE;
0011
0012
0013 /* --- Internal Unicode Operations ---------------------------------------- */
0014
0015 // Static inline functions to work with surrogates
0016 static inline int Py_UNICODE_IS_SURROGATE(Py_UCS4 ch) {
0017     return (0xD800 <= ch && ch <= 0xDFFF);
0018 }
0019 static inline int Py_UNICODE_IS_HIGH_SURROGATE(Py_UCS4 ch) {
0020     return (0xD800 <= ch && ch <= 0xDBFF);
0021 }
0022 static inline int Py_UNICODE_IS_LOW_SURROGATE(Py_UCS4 ch) {
0023     return (0xDC00 <= ch && ch <= 0xDFFF);
0024 }
0025
0026 // Join two surrogate characters and return a single Py_UCS4 value.
0027 static inline Py_UCS4 Py_UNICODE_JOIN_SURROGATES(Py_UCS4 high, Py_UCS4 low)  {
0028     assert(Py_UNICODE_IS_HIGH_SURROGATE(high));
0029     assert(Py_UNICODE_IS_LOW_SURROGATE(low));
0030     return 0x10000 + (((high & 0x03FF) << 10) | (low & 0x03FF));
0031 }
0032
0033 // High surrogate = top 10 bits added to 0xD800.
0034 // The character must be in the range [U+10000; U+10ffff].
0035 static inline Py_UCS4 Py_UNICODE_HIGH_SURROGATE(Py_UCS4 ch) {
0036     assert(0x10000 <= ch && ch <= 0x10ffff);
0037     return (0xD800 - (0x10000 >> 10) + (ch >> 10));
0038 }
0039
0040 // Low surrogate = bottom 10 bits added to 0xDC00.
0041 // The character must be in the range [U+10000; U+10ffff].
0042 static inline Py_UCS4 Py_UNICODE_LOW_SURROGATE(Py_UCS4 ch) {
0043     assert(0x10000 <= ch && ch <= 0x10ffff);
0044     return (0xDC00 + (ch & 0x3FF));
0045 }
0046
0047
0048 /* --- Unicode Type ------------------------------------------------------- */
0049
0050 /* ASCII-only strings created through PyUnicode_New use the PyASCIIObject
0051    structure. state.ascii and state.compact are set, and the data
0052    immediately follow the structure. utf8_length can be found
0053    in the length field; the utf8 pointer is equal to the data pointer. */
0054 typedef struct {
0055     /* There are 4 forms of Unicode strings:
0056
0057        - compact ascii:
0058
0059          * structure = PyASCIIObject
0060          * test: PyUnicode_IS_COMPACT_ASCII(op)
0061          * kind = PyUnicode_1BYTE_KIND
0062          * compact = 1
0063          * ascii = 1
0064          * (length is the length of the utf8)
0065          * (data starts just after the structure)
0066          * (since ASCII is decoded from UTF-8, the utf8 string are the data)
0067
0068        - compact:
0069
0070          * structure = PyCompactUnicodeObject
0071          * test: PyUnicode_IS_COMPACT(op) && !PyUnicode_IS_ASCII(op)
0072          * kind = PyUnicode_1BYTE_KIND, PyUnicode_2BYTE_KIND or
0073            PyUnicode_4BYTE_KIND
0074          * compact = 1
0075          * ascii = 0
0076          * utf8 is not shared with data
0077          * utf8_length = 0 if utf8 is NULL
0078          * (data starts just after the structure)
0079
0080        - legacy string:
0081
0082          * structure = PyUnicodeObject structure
0083          * test: !PyUnicode_IS_COMPACT(op)
0084          * kind = PyUnicode_1BYTE_KIND, PyUnicode_2BYTE_KIND or
0085            PyUnicode_4BYTE_KIND
0086          * compact = 0
0087          * data.any is not NULL
0088          * utf8 is shared and utf8_length = length with data.any if ascii = 1
0089          * utf8_length = 0 if utf8 is NULL
0090
0091        Compact strings use only one memory block (structure + characters),
0092        whereas legacy strings use one block for the structure and one block
0093        for characters.
0094
0095        Legacy strings are created by subclasses of Unicode.
0096
0097        See also _PyUnicode_CheckConsistency().
0098     */
0099     PyObject_HEAD
0100     Py_ssize_t length;          /* Number of code points in the string */
0101     Py_hash_t hash;             /* Hash value; -1 if not set */
0102     struct {
0103         /* If interned is non-zero, the two references from the
0104            dictionary to this object are *not* counted in ob_refcnt.
0105            The possible values here are:
0106                0: Not Interned
0107                1: Interned
0108                2: Interned and Immortal
0109                3: Interned, Immortal, and Static
0110            This categorization allows the runtime to determine the right
0111            cleanup mechanism at runtime shutdown. */
0112         unsigned int interned:2;
0113         /* Character size:
0114
0115            - PyUnicode_1BYTE_KIND (1):
0116
0117              * character type = Py_UCS1 (8 bits, unsigned)
0118              * all characters are in the range U+0000-U+00FF (latin1)
0119              * if ascii is set, all characters are in the range U+0000-U+007F
0120                (ASCII), otherwise at least one character is in the range
0121                U+0080-U+00FF
0122
0123            - PyUnicode_2BYTE_KIND (2):
0124
0125              * character type = Py_UCS2 (16 bits, unsigned)
0126              * all characters are in the range U+0000-U+FFFF (BMP)
0127              * at least one character is in the range U+0100-U+FFFF
0128
0129            - PyUnicode_4BYTE_KIND (4):
0130
0131              * character type = Py_UCS4 (32 bits, unsigned)
0132              * all characters are in the range U+0000-U+10FFFF
0133              * at least one character is in the range U+10000-U+10FFFF
0134          */
0135         unsigned int kind:3;
0136         /* Compact is with respect to the allocation scheme. Compact unicode
0137            objects only require one memory block while non-compact objects use
0138            one block for the PyUnicodeObject struct and another for its data
0139            buffer. */
0140         unsigned int compact:1;
0141         /* The string only contains characters in the range U+0000-U+007F (ASCII)
0142            and the kind is PyUnicode_1BYTE_KIND. If ascii is set and compact is
0143            set, use the PyASCIIObject structure. */
0144         unsigned int ascii:1;
0145         /* The object is statically allocated. */
0146         unsigned int statically_allocated:1;
0147         /* Padding to ensure that PyUnicode_DATA() is always aligned to
0148            4 bytes (see issue #19537 on m68k). */
0149         unsigned int :24;
0150     } state;
0151 } PyASCIIObject;
0152
0153 /* Non-ASCII strings allocated through PyUnicode_New use the
0154    PyCompactUnicodeObject structure. state.compact is set, and the data
0155    immediately follow the structure. */
0156 typedef struct {
0157     PyASCIIObject _base;
0158     Py_ssize_t utf8_length;     /* Number of bytes in utf8, excluding the
0159                                  * terminating \0. */
0160     char *utf8;                 /* UTF-8 representation (null-terminated) */
0161 } PyCompactUnicodeObject;
0162
0163 /* Object format for Unicode subclasses. */
0164 typedef struct {
0165     PyCompactUnicodeObject _base;
0166     union {
0167         void *any;
0168         Py_UCS1 *latin1;
0169         Py_UCS2 *ucs2;
0170         Py_UCS4 *ucs4;
0171     } data;                     /* Canonical, smallest-form Unicode buffer */
0172 } PyUnicodeObject;
0173
0174
/* Casts to the three Unicode object layouts.  Each one first asserts that
   op passes PyUnicode_Check() and then casts it with _Py_CAST. */
#define _PyASCIIObject_CAST(op) \
    (assert(PyUnicode_Check(op)), _Py_CAST(PyASCIIObject*, (op)))

#define _PyCompactUnicodeObject_CAST(op) \
    (assert(PyUnicode_Check(op)), _Py_CAST(PyCompactUnicodeObject*, (op)))

#define _PyUnicodeObject_CAST(op) \
    (assert(PyUnicode_Check(op)), _Py_CAST(PyUnicodeObject*, (op)))
0184
0185
0186 /* --- Flexible String Representation Helper Macros (PEP 393) -------------- */
0187
/* Values for PyASCIIObject.state: */

/* Interning state (stored in state.interned). */
#define SSTATE_NOT_INTERNED              0  /* Not interned */
#define SSTATE_INTERNED_MORTAL           1  /* Interned */
#define SSTATE_INTERNED_IMMORTAL         2  /* Interned and immortal */
#define SSTATE_INTERNED_IMMORTAL_STATIC  3  /* Interned, immortal, and static */
0195
0196 /* Use only if you know it's a string */
0197 static inline unsigned int PyUnicode_CHECK_INTERNED(PyObject *op) {
0198     return _PyASCIIObject_CAST(op)->state.interned;
0199 }
0200 #define PyUnicode_CHECK_INTERNED(op) PyUnicode_CHECK_INTERNED(_PyObject_CAST(op))
0201
0202 /* For backward compatibility */
0203 static inline unsigned int PyUnicode_IS_READY(PyObject* Py_UNUSED(op)) {
0204     return 1;
0205 }
0206 #define PyUnicode_IS_READY(op) PyUnicode_IS_READY(_PyObject_CAST(op))
0207
0208 /* Return true if the string contains only ASCII characters, or 0 if not. The
0209    string may be compact (PyUnicode_IS_COMPACT_ASCII) or not, but must be
0210    ready. */
0211 static inline unsigned int PyUnicode_IS_ASCII(PyObject *op) {
0212     return _PyASCIIObject_CAST(op)->state.ascii;
0213 }
0214 #define PyUnicode_IS_ASCII(op) PyUnicode_IS_ASCII(_PyObject_CAST(op))
0215
0216 /* Return true if the string is compact or 0 if not.
0217    No type checks or Ready calls are performed. */
0218 static inline unsigned int PyUnicode_IS_COMPACT(PyObject *op) {
0219     return _PyASCIIObject_CAST(op)->state.compact;
0220 }
0221 #define PyUnicode_IS_COMPACT(op) PyUnicode_IS_COMPACT(_PyObject_CAST(op))
0222
0223 /* Return true if the string is a compact ASCII string (use PyASCIIObject
0224    structure), or 0 if not.  No type checks or Ready calls are performed. */
0225 static inline int PyUnicode_IS_COMPACT_ASCII(PyObject *op) {
0226     return (_PyASCIIObject_CAST(op)->state.ascii && PyUnicode_IS_COMPACT(op));
0227 }
0228 #define PyUnicode_IS_COMPACT_ASCII(op) PyUnicode_IS_COMPACT_ASCII(_PyObject_CAST(op))
0229
/* Return values of the PyUnicode_KIND() function.  Each constant equals the
   width in bytes of one character of that kind. */
enum PyUnicode_Kind {
    PyUnicode_1BYTE_KIND = 1,
    PyUnicode_2BYTE_KIND = 2,
    PyUnicode_4BYTE_KIND = 4
};
0236
// PyUnicode_KIND(): Return one of the PyUnicode_*_KIND values defined above,
// read from the state.kind field of op.
//
// gh-89653: This deliberately stays a macro.  Turning it into a static
// inline function would introduce new compiler warnings on
// "kind < PyUnicode_KIND(str)" (compare signed and unsigned numbers) where
// kind type is an int, or on "unsigned int kind = PyUnicode_KIND(str)"
// (cast signed to unsigned).
#define PyUnicode_KIND(op) \
    _Py_RVALUE(_PyASCIIObject_CAST(op)->state.kind)
0244
0245 /* Return a void pointer to the raw unicode buffer. */
0246 static inline void* _PyUnicode_COMPACT_DATA(PyObject *op) {
0247     if (PyUnicode_IS_ASCII(op)) {
0248         return _Py_STATIC_CAST(void*, (_PyASCIIObject_CAST(op) + 1));
0249     }
0250     return _Py_STATIC_CAST(void*, (_PyCompactUnicodeObject_CAST(op) + 1));
0251 }
0252
0253 static inline void* _PyUnicode_NONCOMPACT_DATA(PyObject *op) {
0254     void *data;
0255     assert(!PyUnicode_IS_COMPACT(op));
0256     data = _PyUnicodeObject_CAST(op)->data.any;
0257     assert(data != NULL);
0258     return data;
0259 }
0260
0261 static inline void* PyUnicode_DATA(PyObject *op) {
0262     if (PyUnicode_IS_COMPACT(op)) {
0263         return _PyUnicode_COMPACT_DATA(op);
0264     }
0265     return _PyUnicode_NONCOMPACT_DATA(op);
0266 }
0267 #define PyUnicode_DATA(op) PyUnicode_DATA(_PyObject_CAST(op))
0268
/* Return a pointer to the canonical representation cast to Py_UCS1,
   Py_UCS2 or Py_UCS4 for direct character access.
   No checks are performed; call PyUnicode_KIND() first to make sure the
   chosen accessor matches the string's kind. */

#define PyUnicode_1BYTE_DATA(op) \
    _Py_STATIC_CAST(Py_UCS1*, PyUnicode_DATA(op))
#define PyUnicode_2BYTE_DATA(op) \
    _Py_STATIC_CAST(Py_UCS2*, PyUnicode_DATA(op))
#define PyUnicode_4BYTE_DATA(op) \
    _Py_STATIC_CAST(Py_UCS4*, PyUnicode_DATA(op))
0277
0278 /* Returns the length of the unicode string. */
0279 static inline Py_ssize_t PyUnicode_GET_LENGTH(PyObject *op) {
0280     return _PyASCIIObject_CAST(op)->length;
0281 }
0282 #define PyUnicode_GET_LENGTH(op) PyUnicode_GET_LENGTH(_PyObject_CAST(op))
0283
0284 /* Write into the canonical representation, this function does not do any sanity
0285    checks and is intended for usage in loops.  The caller should cache the
0286    kind and data pointers obtained from other function calls.
0287    index is the index in the string (starts at 0) and value is the new
0288    code point value which should be written to that location. */
0289 static inline void PyUnicode_WRITE(int kind, void *data,
0290                                    Py_ssize_t index, Py_UCS4 value)
0291 {
0292     assert(index >= 0);
0293     if (kind == PyUnicode_1BYTE_KIND) {
0294         assert(value <= 0xffU);
0295         _Py_STATIC_CAST(Py_UCS1*, data)[index] = _Py_STATIC_CAST(Py_UCS1, value);
0296     }
0297     else if (kind == PyUnicode_2BYTE_KIND) {
0298         assert(value <= 0xffffU);
0299         _Py_STATIC_CAST(Py_UCS2*, data)[index] = _Py_STATIC_CAST(Py_UCS2, value);
0300     }
0301     else {
0302         assert(kind == PyUnicode_4BYTE_KIND);
0303         assert(value <= 0x10ffffU);
0304         _Py_STATIC_CAST(Py_UCS4*, data)[index] = value;
0305     }
0306 }
0307 #define PyUnicode_WRITE(kind, data, index, value) \
0308     PyUnicode_WRITE(_Py_STATIC_CAST(int, kind), _Py_CAST(void*, data), \
0309                     (index), _Py_STATIC_CAST(Py_UCS4, value))
0310
0311 /* Read a code point from the string's canonical representation.  No checks
0312    or ready calls are performed. */
0313 static inline Py_UCS4 PyUnicode_READ(int kind,
0314                                      const void *data, Py_ssize_t index)
0315 {
0316     assert(index >= 0);
0317     if (kind == PyUnicode_1BYTE_KIND) {
0318         return _Py_STATIC_CAST(const Py_UCS1*, data)[index];
0319     }
0320     if (kind == PyUnicode_2BYTE_KIND) {
0321         return _Py_STATIC_CAST(const Py_UCS2*, data)[index];
0322     }
0323     assert(kind == PyUnicode_4BYTE_KIND);
0324     return _Py_STATIC_CAST(const Py_UCS4*, data)[index];
0325 }
0326 #define PyUnicode_READ(kind, data, index) \
0327     PyUnicode_READ(_Py_STATIC_CAST(int, kind), \
0328                    _Py_STATIC_CAST(const void*, data), \
0329                    (index))
0330
0331 /* PyUnicode_READ_CHAR() is less efficient than PyUnicode_READ() because it
0332    calls PyUnicode_KIND() and might call it twice.  For single reads, use
0333    PyUnicode_READ_CHAR, for multiple consecutive reads callers should
0334    cache kind and use PyUnicode_READ instead. */
0335 static inline Py_UCS4 PyUnicode_READ_CHAR(PyObject *unicode, Py_ssize_t index)
0336 {
0337     int kind;
0338
0339     assert(index >= 0);
0340     // Tolerate reading the NUL character at str[len(str)]
0341     assert(index <= PyUnicode_GET_LENGTH(unicode));
0342
0343     kind = PyUnicode_KIND(unicode);
0344     if (kind == PyUnicode_1BYTE_KIND) {
0345         return PyUnicode_1BYTE_DATA(unicode)[index];
0346     }
0347     if (kind == PyUnicode_2BYTE_KIND) {
0348         return PyUnicode_2BYTE_DATA(unicode)[index];
0349     }
0350     assert(kind == PyUnicode_4BYTE_KIND);
0351     return PyUnicode_4BYTE_DATA(unicode)[index];
0352 }
0353 #define PyUnicode_READ_CHAR(unicode, index) \
0354     PyUnicode_READ_CHAR(_PyObject_CAST(unicode), (index))
0355
0356 /* Return a maximum character value which is suitable for creating another
0357    string based on op.  This is always an approximation but more efficient
0358    than iterating over the string. */
0359 static inline Py_UCS4 PyUnicode_MAX_CHAR_VALUE(PyObject *op)
0360 {
0361     int kind;
0362
0363     if (PyUnicode_IS_ASCII(op)) {
0364         return 0x7fU;
0365     }
0366
0367     kind = PyUnicode_KIND(op);
0368     if (kind == PyUnicode_1BYTE_KIND) {
0369        return 0xffU;
0370     }
0371     if (kind == PyUnicode_2BYTE_KIND) {
0372         return 0xffffU;
0373     }
0374     assert(kind == PyUnicode_4BYTE_KIND);
0375     return 0x10ffffU;
0376 }
0377 #define PyUnicode_MAX_CHAR_VALUE(op) \
0378     PyUnicode_MAX_CHAR_VALUE(_PyObject_CAST(op))
0379
0380
0381 /* === Public API ========================================================= */
0382
0383 /* With PEP 393, this is the recommended way to allocate a new unicode object.
0384    This function will allocate the object and its buffer in a single memory
0385    block.  Objects created using this function are not resizable. */
0386 PyAPI_FUNC(PyObject*) PyUnicode_New(
0387     Py_ssize_t size,            /* Number of code points in the new string */
0388     Py_UCS4 maxchar             /* maximum code point value in the string */
0389     );
0390
0391 /* For backward compatibility */
0392 static inline int PyUnicode_READY(PyObject* Py_UNUSED(op))
0393 {
0394     return 0;
0395 }
0396 #define PyUnicode_READY(op) PyUnicode_READY(_PyObject_CAST(op))
0397
0398 /* Copy character from one unicode object into another, this function performs
0399    character conversion when necessary and falls back to memcpy() if possible.
0400
0401    Fail if to is too small (smaller than *how_many* or smaller than
0402    len(from)-from_start), or if kind(from[from_start:from_start+how_many]) >
0403    kind(to), or if *to* has more than 1 reference.
0404
0405    Return the number of written character, or return -1 and raise an exception
0406    on error.
0407
0408    Pseudo-code:
0409
0410        how_many = min(how_many, len(from) - from_start)
0411        to[to_start:to_start+how_many] = from[from_start:from_start+how_many]
0412        return how_many
0413
0414    Note: The function doesn't write a terminating null character.
0415    */
0416 PyAPI_FUNC(Py_ssize_t) PyUnicode_CopyCharacters(
0417     PyObject *to,
0418     Py_ssize_t to_start,
0419     PyObject *from,
0420     Py_ssize_t from_start,
0421     Py_ssize_t how_many
0422     );
0423
0424 /* Fill a string with a character: write fill_char into
0425    unicode[start:start+length].
0426
0427    Fail if fill_char is bigger than the string maximum character, or if the
0428    string has more than 1 reference.
0429
0430    Return the number of written character, or return -1 and raise an exception
0431    on error. */
0432 PyAPI_FUNC(Py_ssize_t) PyUnicode_Fill(
0433     PyObject *unicode,
0434     Py_ssize_t start,
0435     Py_ssize_t length,
0436     Py_UCS4 fill_char
0437     );
0438
0439 /* Create a new string from a buffer of Py_UCS1, Py_UCS2 or Py_UCS4 characters.
0440    Scan the string to find the maximum character. */
0441 PyAPI_FUNC(PyObject*) PyUnicode_FromKindAndData(
0442     int kind,
0443     const void *buffer,
0444     Py_ssize_t size);
0445
0446
0447 /* --- _PyUnicodeWriter API ----------------------------------------------- */
0448
0449 typedef struct {
0450     PyObject *buffer;
0451     void *data;
0452     int kind;
0453     Py_UCS4 maxchar;
0454     Py_ssize_t size;
0455     Py_ssize_t pos;
0456
0457     /* minimum number of allocated characters (default: 0) */
0458     Py_ssize_t min_length;
0459
0460     /* minimum character (default: 127, ASCII) */
0461     Py_UCS4 min_char;
0462
0463     /* If non-zero, overallocate the buffer (default: 0). */
0464     unsigned char overallocate;
0465
0466     /* If readonly is 1, buffer is a shared string (cannot be modified)
0467        and size is set to 0. */
0468     unsigned char readonly;
0469 } _PyUnicodeWriter ;
0470
0471 // Initialize a Unicode writer.
0472 //
0473 // By default, the minimum buffer size is 0 character and overallocation is
0474 // disabled. Set min_length, min_char and overallocate attributes to control
0475 // the allocation of the buffer.
0476 PyAPI_FUNC(void)
0477 _PyUnicodeWriter_Init(_PyUnicodeWriter *writer);
0478
/* Prepare the buffer to write 'length' characters
   with the specified maximum character.

   Nothing is done when the buffer already has room for LENGTH more
   characters of at most MAXCHAR, or when LENGTH is 0; otherwise the work
   is delegated to _PyUnicodeWriter_PrepareInternal().  The macro arguments
   may be evaluated more than once.

   Return 0 on success, raise an exception and return -1 on error. */
#define _PyUnicodeWriter_Prepare(WRITER, LENGTH, MAXCHAR)             \
    ((((MAXCHAR) <= (WRITER)->maxchar                                 \
       && (LENGTH) <= (WRITER)->size - (WRITER)->pos)                 \
      || (LENGTH) == 0)                                               \
     ? 0                                                              \
     : _PyUnicodeWriter_PrepareInternal((WRITER), (LENGTH), (MAXCHAR)))
0490
0491 /* Don't call this function directly, use the _PyUnicodeWriter_Prepare() macro
0492    instead. */
0493 PyAPI_FUNC(int)
0494 _PyUnicodeWriter_PrepareInternal(_PyUnicodeWriter *writer,
0495                                  Py_ssize_t length, Py_UCS4 maxchar);
0496
/* Prepare the buffer to have at least the kind KIND.
   For example, kind=PyUnicode_2BYTE_KIND ensures that the writer will
   support characters in range U+0000-U+FFFF.

   Nothing is done when the writer's kind is already at least KIND;
   otherwise the work is delegated to
   _PyUnicodeWriter_PrepareKindInternal().  The macro arguments may be
   evaluated more than once.

   Return 0 on success, raise an exception and return -1 on error. */
#define _PyUnicodeWriter_PrepareKind(WRITER, KIND)                    \
    ((KIND) > (WRITER)->kind                                          \
     ? _PyUnicodeWriter_PrepareKindInternal((WRITER), (KIND))         \
     : 0)
0506
0507 /* Don't call this function directly, use the _PyUnicodeWriter_PrepareKind()
0508    macro instead. */
0509 PyAPI_FUNC(int)
0510 _PyUnicodeWriter_PrepareKindInternal(_PyUnicodeWriter *writer,
0511                                      int kind);
0512
0513 /* Append a Unicode character.
0514    Return 0 on success, raise an exception and return -1 on error. */
0515 PyAPI_FUNC(int)
0516 _PyUnicodeWriter_WriteChar(_PyUnicodeWriter *writer,
0517     Py_UCS4 ch
0518     );
0519
0520 /* Append a Unicode string.
0521    Return 0 on success, raise an exception and return -1 on error. */
0522 PyAPI_FUNC(int)
0523 _PyUnicodeWriter_WriteStr(_PyUnicodeWriter *writer,
0524     PyObject *str               /* Unicode string */
0525     );
0526
0527 /* Append a substring of a Unicode string.
0528    Return 0 on success, raise an exception and return -1 on error. */
0529 PyAPI_FUNC(int)
0530 _PyUnicodeWriter_WriteSubstring(_PyUnicodeWriter *writer,
0531     PyObject *str,              /* Unicode string */
0532     Py_ssize_t start,
0533     Py_ssize_t end
0534     );
0535
0536 /* Append an ASCII-encoded byte string.
0537    Return 0 on success, raise an exception and return -1 on error. */
0538 PyAPI_FUNC(int)
0539 _PyUnicodeWriter_WriteASCIIString(_PyUnicodeWriter *writer,
0540     const char *str,           /* ASCII-encoded byte string */
0541     Py_ssize_t len             /* number of bytes, or -1 if unknown */
0542     );
0543
0544 /* Append a latin1-encoded byte string.
0545    Return 0 on success, raise an exception and return -1 on error. */
0546 PyAPI_FUNC(int)
0547 _PyUnicodeWriter_WriteLatin1String(_PyUnicodeWriter *writer,
0548     const char *str,           /* latin1-encoded byte string */
0549     Py_ssize_t len             /* length in bytes */
0550     );
0551
0552 /* Get the value of the writer as a Unicode string. Clear the
0553    buffer of the writer. Raise an exception and return NULL
0554    on error. */
0555 PyAPI_FUNC(PyObject *)
0556 _PyUnicodeWriter_Finish(_PyUnicodeWriter *writer);
0557
0558 /* Deallocate memory of a writer (clear its internal buffer). */
0559 PyAPI_FUNC(void)
0560 _PyUnicodeWriter_Dealloc(_PyUnicodeWriter *writer);
0561
0562
0563 /* --- Manage the default encoding ---------------------------------------- */
0564
0565 /* Returns a pointer to the default encoding (UTF-8) of the
0566    Unicode object unicode.
0567
0568    Like PyUnicode_AsUTF8AndSize(), this also caches the UTF-8 representation
0569    in the unicodeobject.
0570
0571    _PyUnicode_AsString is a #define for PyUnicode_AsUTF8 to
0572    support the previous internal function with the same behaviour.
0573
0574    Use of this API is DEPRECATED since no size information can be
0575    extracted from the returned data.
0576 */
0577
0578 PyAPI_FUNC(const char *) PyUnicode_AsUTF8(PyObject *unicode);
0579
0580 // Alias kept for backward compatibility
0581 #define _PyUnicode_AsString PyUnicode_AsUTF8
0582
0583
0584 /* === Characters Type APIs =============================================== */
0585
0586 /* These should not be used directly. Use the Py_UNICODE_IS* and
0587    Py_UNICODE_TO* macros instead.
0588
0589    These APIs are implemented in Objects/unicodectype.c.
0590
0591 */
0592
0593 PyAPI_FUNC(int) _PyUnicode_IsLowercase(
0594     Py_UCS4 ch       /* Unicode character */
0595     );
0596
0597 PyAPI_FUNC(int) _PyUnicode_IsUppercase(
0598     Py_UCS4 ch       /* Unicode character */
0599     );
0600
0601 PyAPI_FUNC(int) _PyUnicode_IsTitlecase(
0602     Py_UCS4 ch       /* Unicode character */
0603     );
0604
0605 PyAPI_FUNC(int) _PyUnicode_IsWhitespace(
0606     const Py_UCS4 ch         /* Unicode character */
0607     );
0608
0609 PyAPI_FUNC(int) _PyUnicode_IsLinebreak(
0610     const Py_UCS4 ch         /* Unicode character */
0611     );
0612
0613 PyAPI_FUNC(Py_UCS4) _PyUnicode_ToLowercase(
0614     Py_UCS4 ch       /* Unicode character */
0615     );
0616
0617 PyAPI_FUNC(Py_UCS4) _PyUnicode_ToUppercase(
0618     Py_UCS4 ch       /* Unicode character */
0619     );
0620
0621 PyAPI_FUNC(Py_UCS4) _PyUnicode_ToTitlecase(
0622     Py_UCS4 ch       /* Unicode character */
0623     );
0624
0625 PyAPI_FUNC(int) _PyUnicode_ToDecimalDigit(
0626     Py_UCS4 ch       /* Unicode character */
0627     );
0628
0629 PyAPI_FUNC(int) _PyUnicode_ToDigit(
0630     Py_UCS4 ch       /* Unicode character */
0631     );
0632
0633 PyAPI_FUNC(double) _PyUnicode_ToNumeric(
0634     Py_UCS4 ch       /* Unicode character */
0635     );
0636
0637 PyAPI_FUNC(int) _PyUnicode_IsDecimalDigit(
0638     Py_UCS4 ch       /* Unicode character */
0639     );
0640
0641 PyAPI_FUNC(int) _PyUnicode_IsDigit(
0642     Py_UCS4 ch       /* Unicode character */
0643     );
0644
0645 PyAPI_FUNC(int) _PyUnicode_IsNumeric(
0646     Py_UCS4 ch       /* Unicode character */
0647     );
0648
0649 PyAPI_FUNC(int) _PyUnicode_IsPrintable(
0650     Py_UCS4 ch       /* Unicode character */
0651     );
0652
0653 PyAPI_FUNC(int) _PyUnicode_IsAlpha(
0654     Py_UCS4 ch       /* Unicode character */
0655     );
0656
0657 // Helper array used by Py_UNICODE_ISSPACE().
0658 PyAPI_DATA(const unsigned char) _Py_ascii_whitespace[];
0659
0660 // Since splitting on whitespace is an important use case, and
0661 // whitespace in most situations is solely ASCII whitespace, we
0662 // optimize for the common case by using a quick look-up table
0663 // _Py_ascii_whitespace (see below) with an inlined check.
0664 static inline int Py_UNICODE_ISSPACE(Py_UCS4 ch) {
0665     if (ch < 128) {
0666         return _Py_ascii_whitespace[ch];
0667     }
0668     return _PyUnicode_IsWhitespace(ch);
0669 }
0670
// Macro wrappers that forward directly to the _PyUnicode_* functions.

// Case and line-break predicates.
#define Py_UNICODE_ISLOWER(ch)      _PyUnicode_IsLowercase(ch)
#define Py_UNICODE_ISUPPER(ch)      _PyUnicode_IsUppercase(ch)
#define Py_UNICODE_ISTITLE(ch)      _PyUnicode_IsTitlecase(ch)
#define Py_UNICODE_ISLINEBREAK(ch)  _PyUnicode_IsLinebreak(ch)

// Case conversions.
#define Py_UNICODE_TOLOWER(ch)      _PyUnicode_ToLowercase(ch)
#define Py_UNICODE_TOUPPER(ch)      _PyUnicode_ToUppercase(ch)
#define Py_UNICODE_TOTITLE(ch)      _PyUnicode_ToTitlecase(ch)

// Decimal, digit, numeric and printable predicates.
#define Py_UNICODE_ISDECIMAL(ch)    _PyUnicode_IsDecimalDigit(ch)
#define Py_UNICODE_ISDIGIT(ch)      _PyUnicode_IsDigit(ch)
#define Py_UNICODE_ISNUMERIC(ch)    _PyUnicode_IsNumeric(ch)
#define Py_UNICODE_ISPRINTABLE(ch)  _PyUnicode_IsPrintable(ch)

// Conversions to a decimal, digit or numeric value.
#define Py_UNICODE_TODECIMAL(ch)    _PyUnicode_ToDecimalDigit(ch)
#define Py_UNICODE_TODIGIT(ch)      _PyUnicode_ToDigit(ch)
#define Py_UNICODE_TONUMERIC(ch)    _PyUnicode_ToNumeric(ch)

// Alphabetic predicate.
#define Py_UNICODE_ISALPHA(ch)      _PyUnicode_IsAlpha(ch)
0690
0691 static inline int Py_UNICODE_ISALNUM(Py_UCS4 ch) {
0692    return (Py_UNICODE_ISALPHA(ch)
0693            || Py_UNICODE_ISDECIMAL(ch)
0694            || Py_UNICODE_ISDIGIT(ch)
0695            || Py_UNICODE_ISNUMERIC(ch));
0696 }
0697
0698
0699 /* === Misc functions ===================================================== */
0700
0701 // Return an interned Unicode object for an Identifier; may fail if there is no
0702 // memory.
0703 PyAPI_FUNC(PyObject*) _PyUnicode_FromId(_Py_Identifier*);