Back to home page

EIC code displayed by LXR

 
 

    


File indexing completed on 2025-01-18 10:06:42

/* Refuse direct inclusion: unless Py_CPYTHON_UNICODEOBJECT_H is already
   defined when this file is read, compilation stops with the #error below. */
#ifndef Py_CPYTHON_UNICODEOBJECT_H
#  error "this header file must not be included directly"
#endif
0004 
/* Py_UNICODE was the native Unicode storage format (code unit) used by
   Python; it represents a single Unicode element in the Unicode type.
   With PEP 393, Py_UNICODE is deprecated and is now simply a
   typedef for wchar_t. */
#define PY_UNICODE_TYPE wchar_t
/* Py_DEPRECATED(3.3) */ typedef wchar_t Py_UNICODE;
0011 
0012 /* --- Internal Unicode Operations ---------------------------------------- */
0013 
0014 // Static inline functions to work with surrogates
0015 static inline int Py_UNICODE_IS_SURROGATE(Py_UCS4 ch) {
0016     return (0xD800 <= ch && ch <= 0xDFFF);
0017 }
0018 static inline int Py_UNICODE_IS_HIGH_SURROGATE(Py_UCS4 ch) {
0019     return (0xD800 <= ch && ch <= 0xDBFF);
0020 }
0021 static inline int Py_UNICODE_IS_LOW_SURROGATE(Py_UCS4 ch) {
0022     return (0xDC00 <= ch && ch <= 0xDFFF);
0023 }
0024 
0025 // Join two surrogate characters and return a single Py_UCS4 value.
0026 static inline Py_UCS4 Py_UNICODE_JOIN_SURROGATES(Py_UCS4 high, Py_UCS4 low)  {
0027     assert(Py_UNICODE_IS_HIGH_SURROGATE(high));
0028     assert(Py_UNICODE_IS_LOW_SURROGATE(low));
0029     return 0x10000 + (((high & 0x03FF) << 10) | (low & 0x03FF));
0030 }
0031 
0032 // High surrogate = top 10 bits added to 0xD800.
0033 // The character must be in the range [U+10000; U+10ffff].
0034 static inline Py_UCS4 Py_UNICODE_HIGH_SURROGATE(Py_UCS4 ch) {
0035     assert(0x10000 <= ch && ch <= 0x10ffff);
0036     return (0xD800 - (0x10000 >> 10) + (ch >> 10));
0037 }
0038 
0039 // Low surrogate = bottom 10 bits added to 0xDC00.
0040 // The character must be in the range [U+10000; U+10ffff].
0041 static inline Py_UCS4 Py_UNICODE_LOW_SURROGATE(Py_UCS4 ch) {
0042     assert(0x10000 <= ch && ch <= 0x10ffff);
0043     return (0xDC00 + (ch & 0x3FF));
0044 }
0045 
0046 /* --- Unicode Type ------------------------------------------------------- */
0047 
/* ASCII-only strings created through PyUnicode_New use the PyASCIIObject
   structure. state.ascii and state.compact are set, and the data
   immediately follows the structure. utf8_length can be found
   in the length field; the utf8 pointer is equal to the data pointer. */
typedef struct {
    /* There are 3 forms of Unicode strings:

       - compact ascii:

         * structure = PyASCIIObject
         * test: PyUnicode_IS_COMPACT_ASCII(op)
         * kind = PyUnicode_1BYTE_KIND
         * compact = 1
         * ascii = 1
         * (length is the length of the utf8)
         * (data starts just after the structure)
         * (since ASCII is decoded from UTF-8, the utf8 string is the data)

       - compact:

         * structure = PyCompactUnicodeObject
         * test: PyUnicode_IS_COMPACT(op) && !PyUnicode_IS_ASCII(op)
         * kind = PyUnicode_1BYTE_KIND, PyUnicode_2BYTE_KIND or
           PyUnicode_4BYTE_KIND
         * compact = 1
         * ascii = 0
         * utf8 is not shared with data
         * utf8_length = 0 if utf8 is NULL
         * (data starts just after the structure)

       - legacy string:

         * structure = PyUnicodeObject structure
         * test: !PyUnicode_IS_COMPACT(op)
         * kind = PyUnicode_1BYTE_KIND, PyUnicode_2BYTE_KIND or
           PyUnicode_4BYTE_KIND
         * compact = 0
         * data.any is not NULL
         * if ascii = 1, utf8 is shared with data.any and utf8_length = length
         * utf8_length = 0 if utf8 is NULL

       Compact strings use only one memory block (structure + characters),
       whereas legacy strings use one block for the structure and one block
       for characters.

       Legacy strings are created by subclasses of Unicode.

       See also _PyUnicode_CheckConsistency().
    */
    PyObject_HEAD
    Py_ssize_t length;          /* Number of code points in the string */
    Py_hash_t hash;             /* Hash value; -1 if not set */
    struct {
        /* If interned is non-zero, the two references from the
           dictionary to this object are *not* counted in ob_refcnt.
           The possible values here are:
               0: Not Interned
               1: Interned
               2: Interned and Immortal
               3: Interned, Immortal, and Static
           This categorization allows the runtime to determine the right
           cleanup mechanism at runtime shutdown. */
        unsigned int interned:2;
        /* Character size:

           - PyUnicode_1BYTE_KIND (1):

             * character type = Py_UCS1 (8 bits, unsigned)
             * all characters are in the range U+0000-U+00FF (latin1)
             * if ascii is set, all characters are in the range U+0000-U+007F
               (ASCII), otherwise at least one character is in the range
               U+0080-U+00FF

           - PyUnicode_2BYTE_KIND (2):

             * character type = Py_UCS2 (16 bits, unsigned)
             * all characters are in the range U+0000-U+FFFF (BMP)
             * at least one character is in the range U+0100-U+FFFF

           - PyUnicode_4BYTE_KIND (4):

             * character type = Py_UCS4 (32 bits, unsigned)
             * all characters are in the range U+0000-U+10FFFF
             * at least one character is in the range U+10000-U+10FFFF
         */
        unsigned int kind:3;
        /* Compact is with respect to the allocation scheme. Compact unicode
           objects only require one memory block while non-compact objects use
           one block for the PyUnicodeObject struct and another for its data
           buffer. */
        unsigned int compact:1;
        /* The string only contains characters in the range U+0000-U+007F (ASCII)
           and the kind is PyUnicode_1BYTE_KIND. If ascii is set and compact is
           set, use the PyASCIIObject structure. */
        unsigned int ascii:1;
        /* The object is statically allocated. */
        unsigned int statically_allocated:1;
        /* Padding to ensure that PyUnicode_DATA() is always aligned to
           4 bytes (see issue #19537 on m68k).  Together with the named
           fields above (2 + 3 + 1 + 1 + 1 bits) this makes 32 bits. */
        unsigned int :24;
    } state;
} PyASCIIObject;
0150 
/* Non-ASCII strings allocated through PyUnicode_New use the
   PyCompactUnicodeObject structure. state.compact is set, and the data
   immediately follows the structure. */
typedef struct {
    PyASCIIObject _base;        /* Leading part: object head, length, hash
                                 * and state */
    Py_ssize_t utf8_length;     /* Number of bytes in utf8, excluding the
                                 * terminating \0. */
    char *utf8;                 /* UTF-8 representation (null-terminated) */
} PyCompactUnicodeObject;
0160 
/* Object format for Unicode subclasses.  This is the "legacy string" form
   described in the PyASCIIObject comment: the characters are stored in a
   separate block reached through the data union. */
typedef struct {
    PyCompactUnicodeObject _base;
    union {
        void *any;              /* Untyped view of the buffer */
        Py_UCS1 *latin1;
        Py_UCS2 *ucs2;
        Py_UCS4 *ucs4;
    } data;                     /* Canonical, smallest-form Unicode buffer */
} PyUnicodeObject;
0171 
/* Declared here; the definition is not part of this header.
   NOTE(review): presumably verifies the invariants listed in the
   PyASCIIObject comment (which refers here via "See also").  The meaning of
   check_content and of the int result is not visible from this file --
   confirm against the implementation. */
PyAPI_FUNC(int) _PyUnicode_CheckConsistency(
    PyObject *op,
    int check_content);
0175 
0176 
/* Cast op to a pointer to one of the three Unicode object structures.
   Each macro first asserts PyUnicode_Check(op) and then performs the cast.
   op appears both in the assertion and in the cast, so do not pass an
   expression with side effects. */
#define _PyASCIIObject_CAST(op) \
    (assert(PyUnicode_Check(op)), \
     _Py_CAST(PyASCIIObject*, (op)))
#define _PyCompactUnicodeObject_CAST(op) \
    (assert(PyUnicode_Check(op)), \
     _Py_CAST(PyCompactUnicodeObject*, (op)))
#define _PyUnicodeObject_CAST(op) \
    (assert(PyUnicode_Check(op)), \
     _Py_CAST(PyUnicodeObject*, (op)))
0186 
0187 
0188 /* --- Flexible String Representation Helper Macros (PEP 393) -------------- */
0189 
/* Values for PyASCIIObject.state: */

/* Interning state: the possible values of the 2-bit state.interned field
   (0: not interned, 1: interned, 2: interned and immortal,
   3: interned, immortal and static). */
#define SSTATE_NOT_INTERNED 0
#define SSTATE_INTERNED_MORTAL 1
#define SSTATE_INTERNED_IMMORTAL 2
#define SSTATE_INTERNED_IMMORTAL_STATIC 3
0197 
0198 /* Use only if you know it's a string */
0199 static inline unsigned int PyUnicode_CHECK_INTERNED(PyObject *op) {
0200     return _PyASCIIObject_CAST(op)->state.interned;
0201 }
0202 #define PyUnicode_CHECK_INTERNED(op) PyUnicode_CHECK_INTERNED(_PyObject_CAST(op))
0203 
0204 /* For backward compatibility */
0205 static inline unsigned int PyUnicode_IS_READY(PyObject* Py_UNUSED(op)) {
0206     return 1;
0207 }
0208 #define PyUnicode_IS_READY(op) PyUnicode_IS_READY(_PyObject_CAST(op))
0209 
0210 /* Return true if the string contains only ASCII characters, or 0 if not. The
0211    string may be compact (PyUnicode_IS_COMPACT_ASCII) or not, but must be
0212    ready. */
0213 static inline unsigned int PyUnicode_IS_ASCII(PyObject *op) {
0214     return _PyASCIIObject_CAST(op)->state.ascii;
0215 }
0216 #define PyUnicode_IS_ASCII(op) PyUnicode_IS_ASCII(_PyObject_CAST(op))
0217 
0218 /* Return true if the string is compact or 0 if not.
0219    No type checks or Ready calls are performed. */
0220 static inline unsigned int PyUnicode_IS_COMPACT(PyObject *op) {
0221     return _PyASCIIObject_CAST(op)->state.compact;
0222 }
0223 #define PyUnicode_IS_COMPACT(op) PyUnicode_IS_COMPACT(_PyObject_CAST(op))
0224 
0225 /* Return true if the string is a compact ASCII string (use PyASCIIObject
0226    structure), or 0 if not.  No type checks or Ready calls are performed. */
0227 static inline int PyUnicode_IS_COMPACT_ASCII(PyObject *op) {
0228     return (_PyASCIIObject_CAST(op)->state.ascii && PyUnicode_IS_COMPACT(op));
0229 }
0230 #define PyUnicode_IS_COMPACT_ASCII(op) PyUnicode_IS_COMPACT_ASCII(_PyObject_CAST(op))
0231 
enum PyUnicode_Kind {
/* Return values of the PyUnicode_KIND() function.  Each value equals the
   size in bytes of one character of that kind (Py_UCS1, Py_UCS2 and
   Py_UCS4 respectively). */
    PyUnicode_1BYTE_KIND = 1,
    PyUnicode_2BYTE_KIND = 2,
    PyUnicode_4BYTE_KIND = 4
};
0238 
// PyUnicode_KIND(): Return one of the PyUnicode_*_KIND values defined above,
// read from the state.kind field of the string.
//
// gh-89653: Converting this macro to a static inline function would introduce
// new compiler warnings on "kind < PyUnicode_KIND(str)" (compare signed and
// unsigned numbers) where kind type is an int or on
// "unsigned int kind = PyUnicode_KIND(str)" (cast signed to unsigned).
#define PyUnicode_KIND(op) _Py_RVALUE(_PyASCIIObject_CAST(op)->state.kind)
0246 
0247 /* Return a void pointer to the raw unicode buffer. */
0248 static inline void* _PyUnicode_COMPACT_DATA(PyObject *op) {
0249     if (PyUnicode_IS_ASCII(op)) {
0250         return _Py_STATIC_CAST(void*, (_PyASCIIObject_CAST(op) + 1));
0251     }
0252     return _Py_STATIC_CAST(void*, (_PyCompactUnicodeObject_CAST(op) + 1));
0253 }
0254 
0255 static inline void* _PyUnicode_NONCOMPACT_DATA(PyObject *op) {
0256     void *data;
0257     assert(!PyUnicode_IS_COMPACT(op));
0258     data = _PyUnicodeObject_CAST(op)->data.any;
0259     assert(data != NULL);
0260     return data;
0261 }
0262 
0263 static inline void* PyUnicode_DATA(PyObject *op) {
0264     if (PyUnicode_IS_COMPACT(op)) {
0265         return _PyUnicode_COMPACT_DATA(op);
0266     }
0267     return _PyUnicode_NONCOMPACT_DATA(op);
0268 }
0269 #define PyUnicode_DATA(op) PyUnicode_DATA(_PyObject_CAST(op))
0270 
/* Return pointers to the canonical representation cast to Py_UCS1,
   Py_UCS2, or Py_UCS4 for direct character access.
   No checks are performed; call PyUnicode_KIND() first to make sure the
   macro you use matches the string's kind. */

#define PyUnicode_1BYTE_DATA(op) _Py_STATIC_CAST(Py_UCS1*, PyUnicode_DATA(op))
#define PyUnicode_2BYTE_DATA(op) _Py_STATIC_CAST(Py_UCS2*, PyUnicode_DATA(op))
#define PyUnicode_4BYTE_DATA(op) _Py_STATIC_CAST(Py_UCS4*, PyUnicode_DATA(op))
0279 
0280 /* Returns the length of the unicode string. */
0281 static inline Py_ssize_t PyUnicode_GET_LENGTH(PyObject *op) {
0282     return _PyASCIIObject_CAST(op)->length;
0283 }
0284 #define PyUnicode_GET_LENGTH(op) PyUnicode_GET_LENGTH(_PyObject_CAST(op))
0285 
0286 /* Write into the canonical representation, this function does not do any sanity
0287    checks and is intended for usage in loops.  The caller should cache the
0288    kind and data pointers obtained from other function calls.
0289    index is the index in the string (starts at 0) and value is the new
0290    code point value which should be written to that location. */
0291 static inline void PyUnicode_WRITE(int kind, void *data,
0292                                    Py_ssize_t index, Py_UCS4 value)
0293 {
0294     assert(index >= 0);
0295     if (kind == PyUnicode_1BYTE_KIND) {
0296         assert(value <= 0xffU);
0297         _Py_STATIC_CAST(Py_UCS1*, data)[index] = _Py_STATIC_CAST(Py_UCS1, value);
0298     }
0299     else if (kind == PyUnicode_2BYTE_KIND) {
0300         assert(value <= 0xffffU);
0301         _Py_STATIC_CAST(Py_UCS2*, data)[index] = _Py_STATIC_CAST(Py_UCS2, value);
0302     }
0303     else {
0304         assert(kind == PyUnicode_4BYTE_KIND);
0305         assert(value <= 0x10ffffU);
0306         _Py_STATIC_CAST(Py_UCS4*, data)[index] = value;
0307     }
0308 }
0309 #define PyUnicode_WRITE(kind, data, index, value) \
0310     PyUnicode_WRITE(_Py_STATIC_CAST(int, kind), _Py_CAST(void*, data), \
0311                     (index), _Py_STATIC_CAST(Py_UCS4, value))
0312 
0313 /* Read a code point from the string's canonical representation.  No checks
0314    or ready calls are performed. */
0315 static inline Py_UCS4 PyUnicode_READ(int kind,
0316                                      const void *data, Py_ssize_t index)
0317 {
0318     assert(index >= 0);
0319     if (kind == PyUnicode_1BYTE_KIND) {
0320         return _Py_STATIC_CAST(const Py_UCS1*, data)[index];
0321     }
0322     if (kind == PyUnicode_2BYTE_KIND) {
0323         return _Py_STATIC_CAST(const Py_UCS2*, data)[index];
0324     }
0325     assert(kind == PyUnicode_4BYTE_KIND);
0326     return _Py_STATIC_CAST(const Py_UCS4*, data)[index];
0327 }
0328 #define PyUnicode_READ(kind, data, index) \
0329     PyUnicode_READ(_Py_STATIC_CAST(int, kind), \
0330                    _Py_STATIC_CAST(const void*, data), \
0331                    (index))
0332 
0333 /* PyUnicode_READ_CHAR() is less efficient than PyUnicode_READ() because it
0334    calls PyUnicode_KIND() and might call it twice.  For single reads, use
0335    PyUnicode_READ_CHAR, for multiple consecutive reads callers should
0336    cache kind and use PyUnicode_READ instead. */
0337 static inline Py_UCS4 PyUnicode_READ_CHAR(PyObject *unicode, Py_ssize_t index)
0338 {
0339     int kind;
0340 
0341     assert(index >= 0);
0342     // Tolerate reading the NUL character at str[len(str)]
0343     assert(index <= PyUnicode_GET_LENGTH(unicode));
0344 
0345     kind = PyUnicode_KIND(unicode);
0346     if (kind == PyUnicode_1BYTE_KIND) {
0347         return PyUnicode_1BYTE_DATA(unicode)[index];
0348     }
0349     if (kind == PyUnicode_2BYTE_KIND) {
0350         return PyUnicode_2BYTE_DATA(unicode)[index];
0351     }
0352     assert(kind == PyUnicode_4BYTE_KIND);
0353     return PyUnicode_4BYTE_DATA(unicode)[index];
0354 }
0355 #define PyUnicode_READ_CHAR(unicode, index) \
0356     PyUnicode_READ_CHAR(_PyObject_CAST(unicode), (index))
0357 
0358 /* Return a maximum character value which is suitable for creating another
0359    string based on op.  This is always an approximation but more efficient
0360    than iterating over the string. */
0361 static inline Py_UCS4 PyUnicode_MAX_CHAR_VALUE(PyObject *op)
0362 {
0363     int kind;
0364 
0365     if (PyUnicode_IS_ASCII(op)) {
0366         return 0x7fU;
0367     }
0368 
0369     kind = PyUnicode_KIND(op);
0370     if (kind == PyUnicode_1BYTE_KIND) {
0371        return 0xffU;
0372     }
0373     if (kind == PyUnicode_2BYTE_KIND) {
0374         return 0xffffU;
0375     }
0376     assert(kind == PyUnicode_4BYTE_KIND);
0377     return 0x10ffffU;
0378 }
0379 #define PyUnicode_MAX_CHAR_VALUE(op) \
0380     PyUnicode_MAX_CHAR_VALUE(_PyObject_CAST(op))
0381 
0382 /* === Public API ========================================================= */
0383 
0384 /* --- Plain Py_UNICODE --------------------------------------------------- */
0385 
/* With PEP 393, this is the recommended way to allocate a new unicode object.
   This function allocates the object and its character buffer in a single
   memory block.  Objects created using this function are not resizable. */
PyAPI_FUNC(PyObject*) PyUnicode_New(
    Py_ssize_t size,            /* Number of code points in the new string */
    Py_UCS4 maxchar             /* maximum code point value in the string */
    );
0393 
0394 /* For backward compatibility */
0395 static inline int PyUnicode_READY(PyObject* Py_UNUSED(op))
0396 {
0397     return 0;
0398 }
0399 #define PyUnicode_READY(op) PyUnicode_READY(_PyObject_CAST(op))
0400 
/* Get a copy of a Unicode string.
   NOTE(review): ownership of the result and the error return are not stated
   in this header -- presumably a new reference, or NULL on failure;
   confirm against the implementation. */
PyAPI_FUNC(PyObject*) _PyUnicode_Copy(
    PyObject *unicode
    );
0405 
/* Copy characters from one unicode object into another.  This function
   performs character conversion when necessary and falls back to memcpy()
   if possible.

   Fail if to is too small (smaller than *how_many* or smaller than
   len(from)-from_start), or if kind(from[from_start:from_start+how_many]) >
   kind(to), or if *to* has more than 1 reference.

   Return the number of characters written, or return -1 and raise an
   exception on error.

   Pseudo-code:

       how_many = min(how_many, len(from) - from_start)
       to[to_start:to_start+how_many] = from[from_start:from_start+how_many]
       return how_many

   Note: The function doesn't write a terminating null character.
   */
PyAPI_FUNC(Py_ssize_t) PyUnicode_CopyCharacters(
    PyObject *to,
    Py_ssize_t to_start,
    PyObject *from,
    Py_ssize_t from_start,
    Py_ssize_t how_many
    );
0431 
/* Unsafe version of PyUnicode_CopyCharacters(): the arguments are not
   checked, so the call may crash if they are invalid (e.g. if the output
   string is too short).  Returns nothing. */
PyAPI_FUNC(void) _PyUnicode_FastCopyCharacters(
    PyObject *to,
    Py_ssize_t to_start,
    PyObject *from,
    Py_ssize_t from_start,
    Py_ssize_t how_many
    );
0442 
/* Fill a string with a character: write fill_char into
   unicode[start:start+length].

   Fail if fill_char is bigger than the string maximum character, or if the
   string has more than 1 reference.

   Return the number of characters written, or return -1 and raise an
   exception on error. */
PyAPI_FUNC(Py_ssize_t) PyUnicode_Fill(
    PyObject *unicode,
    Py_ssize_t start,
    Py_ssize_t length,
    Py_UCS4 fill_char
    );
0457 
/* Unsafe version of PyUnicode_Fill(): the arguments are not checked, so the
   call may crash if they are invalid (e.g. if length is longer than the
   string).  Returns nothing. */
PyAPI_FUNC(void) _PyUnicode_FastFill(
    PyObject *unicode,
    Py_ssize_t start,
    Py_ssize_t length,
    Py_UCS4 fill_char
    );
0466 
/* Create a new string from a buffer of Py_UCS1, Py_UCS2 or Py_UCS4
   characters; kind tells which of the three the buffer holds.
   The buffer is scanned to find the maximum character. */
PyAPI_FUNC(PyObject*) PyUnicode_FromKindAndData(
    int kind,
    const void *buffer,
    Py_ssize_t size);
0473 
/* Create a new string from a buffer of ASCII characters.
   WARNING: The function does not check whether the buffer contains any
   non-ASCII character. */
PyAPI_FUNC(PyObject*) _PyUnicode_FromASCII(
    const char *buffer,
    Py_ssize_t size);
0479 
/* Compute the maximum character of the substring unicode[start:end].
   Return 127 if the substring is empty. */
PyAPI_FUNC(Py_UCS4) _PyUnicode_FindMaxChar (
    PyObject *unicode,
    Py_ssize_t start,
    Py_ssize_t end);
0486 
0487 /* --- _PyUnicodeWriter API ----------------------------------------------- */
0488 
/* State of a Unicode writer, operated on by the _PyUnicodeWriter_*
   functions and macros declared below. */
typedef struct {
    PyObject *buffer;
    void *data;
    /* kind and maxchar are the capacities tested by the
       _PyUnicodeWriter_PrepareKind() and _PyUnicodeWriter_Prepare() macros. */
    int kind;
    Py_UCS4 maxchar;
    /* size - pos is the room tested by _PyUnicodeWriter_Prepare().
       NOTE(review): presumably size is the allocated length of buffer and
       pos the current write position -- confirm. */
    Py_ssize_t size;
    Py_ssize_t pos;

    /* minimum number of allocated characters (default: 0) */
    Py_ssize_t min_length;

    /* minimum character (default: 127, ASCII) */
    Py_UCS4 min_char;

    /* If non-zero, overallocate the buffer (default: 0). */
    unsigned char overallocate;

    /* If readonly is 1, buffer is a shared string (cannot be modified)
       and size is set to 0. */
    unsigned char readonly;
} _PyUnicodeWriter ;
0510 
/* Initialize a Unicode writer.
 *
 * By default, the minimum buffer size is 0 characters and overallocation is
 * disabled. Set the min_length, min_char and overallocate attributes to
 * control the allocation of the buffer. */
PyAPI_FUNC(void)
_PyUnicodeWriter_Init(_PyUnicodeWriter *writer);
0518 
/* Prepare the buffer to write 'length' characters
   with the specified maximum character.

   Evaluates to 0 without calling anything when the writer already has room
   (MAXCHAR <= maxchar and LENGTH <= size - pos) or when LENGTH is 0;
   otherwise calls _PyUnicodeWriter_PrepareInternal().  The arguments may be
   evaluated more than once, so they must not have side effects.

   Return 0 on success, raise an exception and return -1 on error. */
#define _PyUnicodeWriter_Prepare(WRITER, LENGTH, MAXCHAR)             \
    (((MAXCHAR) <= (WRITER)->maxchar                                  \
      && (LENGTH) <= (WRITER)->size - (WRITER)->pos)                  \
     ? 0                                                              \
     : (((LENGTH) == 0)                                               \
        ? 0                                                           \
        : _PyUnicodeWriter_PrepareInternal((WRITER), (LENGTH), (MAXCHAR))))
0530 
/* Slow path of the _PyUnicodeWriter_Prepare() macro.  Don't call this
   function directly; use the macro instead. */
PyAPI_FUNC(int)
_PyUnicodeWriter_PrepareInternal(_PyUnicodeWriter *writer,
                                 Py_ssize_t length, Py_UCS4 maxchar);
0536 
/* Prepare the buffer to have at least the kind KIND.
   For example, kind=PyUnicode_2BYTE_KIND ensures that the writer will
   support characters in range U+0000-U+FFFF.

   Evaluates to 0 without calling anything when KIND <= writer->kind;
   otherwise calls _PyUnicodeWriter_PrepareKindInternal().  The arguments
   may be evaluated more than once, so they must not have side effects.

   Return 0 on success, raise an exception and return -1 on error. */
#define _PyUnicodeWriter_PrepareKind(WRITER, KIND)                    \
    ((KIND) <= (WRITER)->kind                                         \
     ? 0                                                              \
     : _PyUnicodeWriter_PrepareKindInternal((WRITER), (KIND)))
0546 
/* Slow path of the _PyUnicodeWriter_PrepareKind() macro.  Don't call this
   function directly; use the macro instead. */
PyAPI_FUNC(int)
_PyUnicodeWriter_PrepareKindInternal(_PyUnicodeWriter *writer,
                                     int kind);
0552 
/* Append the Unicode character ch to the writer.
   Return 0 on success, raise an exception and return -1 on error. */
PyAPI_FUNC(int)
_PyUnicodeWriter_WriteChar(_PyUnicodeWriter *writer,
    Py_UCS4 ch
    );
0559 
/* Append the Unicode string str to the writer.
   Return 0 on success, raise an exception and return -1 on error. */
PyAPI_FUNC(int)
_PyUnicodeWriter_WriteStr(_PyUnicodeWriter *writer,
    PyObject *str               /* Unicode string */
    );
0566 
/* Append a substring of the Unicode string str to the writer.
   NOTE(review): start and end presumably delimit str[start:end] -- confirm.
   Return 0 on success, raise an exception and return -1 on error. */
PyAPI_FUNC(int)
_PyUnicodeWriter_WriteSubstring(_PyUnicodeWriter *writer,
    PyObject *str,              /* Unicode string */
    Py_ssize_t start,
    Py_ssize_t end
    );
0575 
/* Append an ASCII-encoded byte string to the writer.
   len may be -1 if the number of bytes is unknown.
   Return 0 on success, raise an exception and return -1 on error. */
PyAPI_FUNC(int)
_PyUnicodeWriter_WriteASCIIString(_PyUnicodeWriter *writer,
    const char *str,           /* ASCII-encoded byte string */
    Py_ssize_t len             /* number of bytes, or -1 if unknown */
    );
0583 
/* Append a latin1-encoded byte string of len bytes to the writer.
   Return 0 on success, raise an exception and return -1 on error. */
PyAPI_FUNC(int)
_PyUnicodeWriter_WriteLatin1String(_PyUnicodeWriter *writer,
    const char *str,           /* latin1-encoded byte string */
    Py_ssize_t len             /* length in bytes */
    );
0591 
/* Get the value of the writer as a Unicode string and clear the
   writer's buffer.  Raise an exception and return NULL
   on error. */
PyAPI_FUNC(PyObject *)
_PyUnicodeWriter_Finish(_PyUnicodeWriter *writer);
0597 
/* Deallocate the memory of a writer (clear its internal buffer).
   Returns nothing. */
PyAPI_FUNC(void)
_PyUnicodeWriter_Dealloc(_PyUnicodeWriter *writer);
0601 
0602 
/* Format the object based on the format_spec, as defined in PEP 3101
   (Advanced String Formatting).
   NOTE(review): presumably the result is appended to writer, start/end
   select a slice of format_spec, and the int result is 0 on success or -1
   on error like the other writer functions -- none of this is stated in
   this header; confirm against the implementation. */
PyAPI_FUNC(int) _PyUnicode_FormatAdvancedWriter(
    _PyUnicodeWriter *writer,
    PyObject *obj,
    PyObject *format_spec,
    Py_ssize_t start,
    Py_ssize_t end);
0611 
0612 /* --- Manage the default encoding ---------------------------------------- */
0613 
/* Returns a pointer to the default encoding (UTF-8) of the
   Unicode object unicode.

   Like PyUnicode_AsUTF8AndSize(), this also caches the UTF-8 representation
   in the unicodeobject.

   _PyUnicode_AsString is a #define for PyUnicode_AsUTF8, kept to
   support the previous internal function with the same behaviour.

   Use of this API is DEPRECATED since no size information can be
   extracted from the returned data.
*/

PyAPI_FUNC(const char *) PyUnicode_AsUTF8(PyObject *unicode);

#define _PyUnicode_AsString PyUnicode_AsUTF8
0630 
0631 /* --- UTF-7 Codecs ------------------------------------------------------- */
0632 
/* NOTE(review): no description is given in this header; presumably encodes
   unicode as UTF-7 and returns the encoded object (NULL on error) --
   confirm against the implementation. */
PyAPI_FUNC(PyObject*) _PyUnicode_EncodeUTF7(
    PyObject *unicode,          /* Unicode object */
    int base64SetO,             /* Encode RFC2152 Set O characters in base64 */
    int base64WhiteSpace,       /* Encode whitespace (sp, ht, nl, cr) in base64 */
    const char *errors          /* error handling */
    );
0639 
0640 /* --- UTF-8 Codecs ------------------------------------------------------- */
0641 
/* NOTE(review): no description is given in this header; presumably encodes
   unicode as UTF-8 using the errors handler and returns the encoded object
   (NULL on error) -- confirm against the implementation. */
PyAPI_FUNC(PyObject*) _PyUnicode_AsUTF8String(
    PyObject *unicode,
    const char *errors);
0645 
0646 /* --- UTF-32 Codecs ------------------------------------------------------ */
0647 
/* NOTE(review): no description is given in this header; presumably the
   UTF-32 counterpart of _PyUnicode_EncodeUTF16() -- confirm against the
   implementation. */
PyAPI_FUNC(PyObject*) _PyUnicode_EncodeUTF32(
    PyObject *object,           /* Unicode object */
    const char *errors,         /* error handling */
    int byteorder               /* byteorder to use 0=BOM+native;-1=LE,1=BE */
    );
0653 
0654 /* --- UTF-16 Codecs ------------------------------------------------------ */
0655 
/* Returns a Python string object holding the UTF-16 encoded value of
   the Unicode data.

   The byteorder argument selects the byte order of the output:

   byteorder == -1: little endian
   byteorder == 0:  native byte order (writes a BOM)
   byteorder == 1:  big endian

   If byteorder is 0, the output string will always start with the
   Unicode BOM (U+FEFF). In the other two modes, no BOM is
   prepended.
*/
PyAPI_FUNC(PyObject*) _PyUnicode_EncodeUTF16(
    PyObject* unicode,          /* Unicode object */
    const char *errors,         /* error handling */
    int byteorder               /* byteorder to use 0=BOM+native;-1=LE,1=BE */
    );
0675 
0676 /* --- Unicode-Escape Codecs ---------------------------------------------- */
0677 
/* Variant of PyUnicode_DecodeUnicodeEscape that supports partial decoding;
   the number of bytes consumed is reported through *consumed. */
PyAPI_FUNC(PyObject*) _PyUnicode_DecodeUnicodeEscapeStateful(
        const char *string,     /* Unicode-Escape encoded string */
        Py_ssize_t length,      /* size of string */
        const char *errors,     /* error handling */
        Py_ssize_t *consumed    /* bytes consumed */
);
/* Helper for PyUnicode_DecodeUnicodeEscape that detects invalid escape
   chars: on return, *first_invalid_escape points to the first invalid
   escaped char in string. */
PyAPI_FUNC(PyObject*) _PyUnicode_DecodeUnicodeEscapeInternal(
        const char *string,     /* Unicode-Escape encoded string */
        Py_ssize_t length,      /* size of string */
        const char *errors,     /* error handling */
        Py_ssize_t *consumed,   /* bytes consumed */
        const char **first_invalid_escape  /* on return, points to first
                                              invalid escaped char in
                                              string. */
);
0696 
0697 /* --- Raw-Unicode-Escape Codecs ---------------------------------------------- */
0698 
/* Variant of PyUnicode_DecodeRawUnicodeEscape that supports partial
   decoding; the number of bytes consumed is reported through *consumed. */
PyAPI_FUNC(PyObject*) _PyUnicode_DecodeRawUnicodeEscapeStateful(
        const char *string,     /* Raw-Unicode-Escape encoded string */
        Py_ssize_t length,      /* size of string */
        const char *errors,     /* error handling */
        Py_ssize_t *consumed    /* bytes consumed */
);
0706 
0707 /* --- Latin-1 Codecs ----------------------------------------------------- */
0708 
/* NOTE(review): no description is given in this header; presumably encodes
   unicode as Latin-1 using the errors handler and returns the encoded
   object (NULL on error) -- confirm against the implementation. */
PyAPI_FUNC(PyObject*) _PyUnicode_AsLatin1String(
    PyObject* unicode,
    const char* errors);
0712 
0713 /* --- ASCII Codecs ------------------------------------------------------- */
0714 
/* NOTE(review): no description is given in this header; presumably encodes
   unicode as ASCII using the errors handler and returns the encoded object
   (NULL on error) -- confirm against the implementation. */
PyAPI_FUNC(PyObject*) _PyUnicode_AsASCIIString(
    PyObject* unicode,
    const char* errors);
0718 
0719 /* --- Character Map Codecs ----------------------------------------------- */
0720 
/* Translate a Unicode object by applying a character mapping table to
   it and return the resulting Unicode object.

   The mapping table must map Unicode ordinal integers to Unicode strings,
   Unicode ordinal integers or None (causing deletion of the character).

   Mapping tables may be dictionaries or sequences. Unmapped character
   ordinals (ones which cause a LookupError) are left untouched and
   are copied as-is.

   NOTE(review): the text above describes a translate operation, while the
   function declared below is named _PyUnicode_EncodeCharmap and takes an
   "encoding mapping" -- confirm that this description really belongs to it.
*/
PyAPI_FUNC(PyObject*) _PyUnicode_EncodeCharmap(
    PyObject *unicode,          /* Unicode object */
    PyObject *mapping,          /* encoding mapping */
    const char *errors          /* error handling */
    );
0736 
0737 /* --- Decimal Encoder ---------------------------------------------------- */
0738 
/* Converts a Unicode object holding a decimal value to an ASCII string
   for use in the int, float and complex parsers.
   Transforms code points that have the decimal digit property to the
   corresponding ASCII digit code points.  Transforms spaces to ASCII.
   Transforms code points starting from the first non-ASCII code point that
   is neither a decimal digit nor a space to the end into '?'. */

PyAPI_FUNC(PyObject*) _PyUnicode_TransformDecimalAndSpaceToASCII(
    PyObject *unicode           /* Unicode object */
    );
0749 
0750 /* --- Methods & Slots ---------------------------------------------------- */
0751 
/* NOTE(review): no description is given in this header; presumably joins
   the seqlen objects in the items array with separator between them and
   returns the resulting string (NULL on error) -- confirm against the
   implementation. */
PyAPI_FUNC(PyObject *) _PyUnicode_JoinArray(
    PyObject *separator,
    PyObject *const *items,
    Py_ssize_t seqlen
    );
0757 
0758 /* Test whether a Unicode object is equal to an ASCII identifier.  Return 1
0759    if true, 0 otherwise.  The right argument must be an ASCII identifier.
0760    Any error that occurs inside is cleared before returning. */
0761 PyAPI_FUNC(int) _PyUnicode_EqualToASCIIId(
0762     PyObject *left,             /* Left string */
0763     _Py_Identifier *right       /* Right identifier */
0764     );
0765 
0766 /* Test whether a Unicode object is equal to an ASCII string.  Return 1 if
0767    true, 0 otherwise.  The right argument must be an ASCII-encoded string.
0768    Any error that occurs inside is cleared before returning. */
0769 PyAPI_FUNC(int) _PyUnicode_EqualToASCIIString(
0770     PyObject *left,
0771     const char *right           /* ASCII-encoded string */
0772     );
0773 
0774 /* Externally visible for str.strip(unicode) */
0775 PyAPI_FUNC(PyObject *) _PyUnicode_XStrip(
0776     PyObject *self,
0777     int striptype,
0778     PyObject *sepobj
0779     );
0780 
0781 /* Using explicitly passed-in values, insert the thousands grouping
0782    into the string pointed to by buffer.  For the argument descriptions,
0783    see Objects/stringlib/localeutil.h */
0784 PyAPI_FUNC(Py_ssize_t) _PyUnicode_InsertThousandsGrouping(
0785     _PyUnicodeWriter *writer,
0786     Py_ssize_t n_buffer,
0787     PyObject *digits,
0788     Py_ssize_t d_pos,
0789     Py_ssize_t n_digits,
0790     Py_ssize_t min_width,
0791     const char *grouping,
0792     PyObject *thousands_sep,
0793     Py_UCS4 *maxchar);
0794 
0795 /* === Characters Type APIs =============================================== */
0796 
0797 /* These should not be used directly. Use the Py_UNICODE_IS* and
0798    Py_UNICODE_TO* macros instead.
0799 
0800    These APIs are implemented in Objects/unicodectype.c.
0801 
0802 */
0803 
0804 PyAPI_FUNC(int) _PyUnicode_IsLowercase(
0805     Py_UCS4 ch       /* Unicode character */
0806     );
0807 
0808 PyAPI_FUNC(int) _PyUnicode_IsUppercase(
0809     Py_UCS4 ch       /* Unicode character */
0810     );
0811 
0812 PyAPI_FUNC(int) _PyUnicode_IsTitlecase(
0813     Py_UCS4 ch       /* Unicode character */
0814     );
0815 
0816 PyAPI_FUNC(int) _PyUnicode_IsXidStart(
0817     Py_UCS4 ch       /* Unicode character */
0818     );
0819 
0820 PyAPI_FUNC(int) _PyUnicode_IsXidContinue(
0821     Py_UCS4 ch       /* Unicode character */
0822     );
0823 
0824 PyAPI_FUNC(int) _PyUnicode_IsWhitespace(
0825     const Py_UCS4 ch         /* Unicode character */
0826     );
0827 
0828 PyAPI_FUNC(int) _PyUnicode_IsLinebreak(
0829     const Py_UCS4 ch         /* Unicode character */
0830     );
0831 
0832 /* Py_DEPRECATED(3.3) */ PyAPI_FUNC(Py_UCS4) _PyUnicode_ToLowercase(
0833     Py_UCS4 ch       /* Unicode character */
0834     );
0835 
0836 /* Py_DEPRECATED(3.3) */ PyAPI_FUNC(Py_UCS4) _PyUnicode_ToUppercase(
0837     Py_UCS4 ch       /* Unicode character */
0838     );
0839 
0840 Py_DEPRECATED(3.3) PyAPI_FUNC(Py_UCS4) _PyUnicode_ToTitlecase(
0841     Py_UCS4 ch       /* Unicode character */
0842     );
0843 
0844 PyAPI_FUNC(int) _PyUnicode_ToLowerFull(
0845     Py_UCS4 ch,       /* Unicode character */
0846     Py_UCS4 *res
0847     );
0848 
0849 PyAPI_FUNC(int) _PyUnicode_ToTitleFull(
0850     Py_UCS4 ch,       /* Unicode character */
0851     Py_UCS4 *res
0852     );
0853 
0854 PyAPI_FUNC(int) _PyUnicode_ToUpperFull(
0855     Py_UCS4 ch,       /* Unicode character */
0856     Py_UCS4 *res
0857     );
0858 
0859 PyAPI_FUNC(int) _PyUnicode_ToFoldedFull(
0860     Py_UCS4 ch,       /* Unicode character */
0861     Py_UCS4 *res
0862     );
0863 
0864 PyAPI_FUNC(int) _PyUnicode_IsCaseIgnorable(
0865     Py_UCS4 ch         /* Unicode character */
0866     );
0867 
0868 PyAPI_FUNC(int) _PyUnicode_IsCased(
0869     Py_UCS4 ch         /* Unicode character */
0870     );
0871 
0872 PyAPI_FUNC(int) _PyUnicode_ToDecimalDigit(
0873     Py_UCS4 ch       /* Unicode character */
0874     );
0875 
0876 PyAPI_FUNC(int) _PyUnicode_ToDigit(
0877     Py_UCS4 ch       /* Unicode character */
0878     );
0879 
0880 PyAPI_FUNC(double) _PyUnicode_ToNumeric(
0881     Py_UCS4 ch       /* Unicode character */
0882     );
0883 
0884 PyAPI_FUNC(int) _PyUnicode_IsDecimalDigit(
0885     Py_UCS4 ch       /* Unicode character */
0886     );
0887 
0888 PyAPI_FUNC(int) _PyUnicode_IsDigit(
0889     Py_UCS4 ch       /* Unicode character */
0890     );
0891 
0892 PyAPI_FUNC(int) _PyUnicode_IsNumeric(
0893     Py_UCS4 ch       /* Unicode character */
0894     );
0895 
0896 PyAPI_FUNC(int) _PyUnicode_IsPrintable(
0897     Py_UCS4 ch       /* Unicode character */
0898     );
0899 
0900 PyAPI_FUNC(int) _PyUnicode_IsAlpha(
0901     Py_UCS4 ch       /* Unicode character */
0902     );
0903 
0904 // Helper array used by Py_UNICODE_ISSPACE().
0905 PyAPI_DATA(const unsigned char) _Py_ascii_whitespace[];
0906 
0907 // Whitespace test with an inlined fast path: splitting on whitespace
0908 // is a common operation and the whitespace met in practice is nearly
0909 // always ASCII, so code points below 128 are answered directly from the
0910 // _Py_ascii_whitespace lookup table (declared above).
0911 static inline int Py_UNICODE_ISSPACE(Py_UCS4 ch) {
0912     if (ch >= 128) {
0913         return _PyUnicode_IsWhitespace(ch);
0914     }
0915     return _Py_ascii_whitespace[ch];
0916 }
0917 
0918 #define Py_UNICODE_ISLOWER(ch) _PyUnicode_IsLowercase(ch)
0919 #define Py_UNICODE_ISUPPER(ch) _PyUnicode_IsUppercase(ch)
0920 #define Py_UNICODE_ISTITLE(ch) _PyUnicode_IsTitlecase(ch)
0921 #define Py_UNICODE_ISLINEBREAK(ch) _PyUnicode_IsLinebreak(ch)
0922 
0923 #define Py_UNICODE_TOLOWER(ch) _PyUnicode_ToLowercase(ch)
0924 #define Py_UNICODE_TOUPPER(ch) _PyUnicode_ToUppercase(ch)
0925 #define Py_UNICODE_TOTITLE(ch) _PyUnicode_ToTitlecase(ch)
0926 
0927 #define Py_UNICODE_ISDECIMAL(ch) _PyUnicode_IsDecimalDigit(ch)
0928 #define Py_UNICODE_ISDIGIT(ch) _PyUnicode_IsDigit(ch)
0929 #define Py_UNICODE_ISNUMERIC(ch) _PyUnicode_IsNumeric(ch)
0930 #define Py_UNICODE_ISPRINTABLE(ch) _PyUnicode_IsPrintable(ch)
0931 
0932 #define Py_UNICODE_TODECIMAL(ch) _PyUnicode_ToDecimalDigit(ch)
0933 #define Py_UNICODE_TODIGIT(ch) _PyUnicode_ToDigit(ch)
0934 #define Py_UNICODE_TONUMERIC(ch) _PyUnicode_ToNumeric(ch)
0935 
0936 #define Py_UNICODE_ISALPHA(ch) _PyUnicode_IsAlpha(ch)
0937 
0938 static inline int Py_UNICODE_ISALNUM(Py_UCS4 ch) {
0939     // 1 if any of the ALPHA, DECIMAL, DIGIT or NUMERIC checks passes, else 0.
0940     if (Py_UNICODE_ISALPHA(ch) || Py_UNICODE_ISDECIMAL(ch)) {
0941         return 1;
0942     }
0943     return (Py_UNICODE_ISDIGIT(ch) || Py_UNICODE_ISNUMERIC(ch));
0944 }
0945 
0946 /* === Misc functions ===================================================== */
0947 
0948 PyAPI_FUNC(PyObject*) _PyUnicode_FormatLong(PyObject *, int, int, int);
0949 
0950 /* Return an interned Unicode object for an Identifier; may fail if there is no memory. */
0951 PyAPI_FUNC(PyObject*) _PyUnicode_FromId(_Py_Identifier*);
0952 
0953 /* Fast equality check when the inputs are known to be exact unicode types
0954    and where the hash values are equal (i.e. a very probable match) */
0955 PyAPI_FUNC(int) _PyUnicode_EQ(PyObject *, PyObject *);
0956 
0957 /* Equality check. */
0958 PyAPI_FUNC(int) _PyUnicode_Equal(PyObject *, PyObject *);
0959 
0960 PyAPI_FUNC(int) _PyUnicode_WideCharString_Converter(PyObject *, void *);
0961 PyAPI_FUNC(int) _PyUnicode_WideCharString_Opt_Converter(PyObject *, void *);
0962 
0963 PyAPI_FUNC(Py_ssize_t) _PyUnicode_ScanIdentifier(PyObject *);