include/python3.13/unicodeobject.h

0001 #ifndef Py_UNICODEOBJECT_H
0002 #define Py_UNICODEOBJECT_H
0003
0004 /*
0005
0006 Unicode implementation based on original code by Fredrik Lundh,
0007 modified by Marc-Andre Lemburg (mal@lemburg.com) according to the
0008 Unicode Integration Proposal. (See
0009 http://www.egenix.com/files/python/unicode-proposal.txt).
0010
0011 Copyright (c) Corporation for National Research Initiatives.
0012
0013
0014  Original header:
0015  --------------------------------------------------------------------
0016
0017  * Yet another Unicode string type for Python.  This type supports the
0018  * 16-bit Basic Multilingual Plane (BMP) only.
0019  *
0020  * Written by Fredrik Lundh, January 1999.
0021  *
0022  * Copyright (c) 1999 by Secret Labs AB.
0023  * Copyright (c) 1999 by Fredrik Lundh.
0024  *
0025  * fredrik@pythonware.com
0026  * http://www.pythonware.com
0027  *
0028  * --------------------------------------------------------------------
0029  * This Unicode String Type is
0030  *
0031  * Copyright (c) 1999 by Secret Labs AB
0032  * Copyright (c) 1999 by Fredrik Lundh
0033  *
0034  * By obtaining, using, and/or copying this software and/or its
0035  * associated documentation, you agree that you have read, understood,
0036  * and will comply with the following terms and conditions:
0037  *
0038  * Permission to use, copy, modify, and distribute this software and its
0039  * associated documentation for any purpose and without fee is hereby
0040  * granted, provided that the above copyright notice appears in all
0041  * copies, and that both that copyright notice and this permission notice
0042  * appear in supporting documentation, and that the name of Secret Labs
0043  * AB or the author not be used in advertising or publicity pertaining to
0044  * distribution of the software without specific, written prior
0045  * permission.
0046  *
0047  * SECRET LABS AB AND THE AUTHOR DISCLAIMS ALL WARRANTIES WITH REGARD TO
0048  * THIS SOFTWARE, INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY AND
0049  * FITNESS.  IN NO EVENT SHALL SECRET LABS AB OR THE AUTHOR BE LIABLE FOR
0050  * ANY SPECIAL, INDIRECT OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
0051  * WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN
0052  * ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT
0053  * OF OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.
0054  * -------------------------------------------------------------------- */
0055
0056 /* === Internal API ======================================================= */
0057
0058 /* --- Internal Unicode Format -------------------------------------------- */
0059
0060 /* Python 3.x requires unicode */
0061 #define Py_USING_UNICODE
0062
0063 #ifndef SIZEOF_WCHAR_T
0064 #error Must define SIZEOF_WCHAR_T
0065 #endif
0066
0067 #define Py_UNICODE_SIZE SIZEOF_WCHAR_T
0068
0069 /* If wchar_t can be used for UCS-4 storage, set Py_UNICODE_WIDE.
0070    Otherwise, Unicode strings are stored as UCS-2 (with limited support
0071    for UTF-16) */
0072
0073 #if Py_UNICODE_SIZE >= 4
0074 #define Py_UNICODE_WIDE
0075 #endif
0076
0077 /* Set these flags if the platform has "wchar.h" and the
0078    wchar_t type is a 16-bit unsigned type */
0079 /* #define HAVE_WCHAR_H */
0080 /* #define HAVE_USABLE_WCHAR_T */
0081
0082 /* If the compiler provides a wchar_t type we try to support it
0083    through the interface functions PyUnicode_FromWideChar(),
0084    PyUnicode_AsWideChar() and PyUnicode_AsWideCharString(). */
0085
0086 #ifdef HAVE_USABLE_WCHAR_T
0087 # ifndef HAVE_WCHAR_H
0088 #  define HAVE_WCHAR_H
0089 # endif
0090 #endif
0091
0092 /* Py_UCS4, Py_UCS2 and Py_UCS1 are typedefs for the respective unicode
0093    representations (unsigned 32-, 16- and 8-bit code units). */
0094 typedef uint32_t Py_UCS4;
0095 typedef uint16_t Py_UCS2;
0096 typedef uint8_t Py_UCS1;
0097
0098 #ifdef __cplusplus
0099 extern "C" {
0100 #endif
0101
0102
0103 PyAPI_DATA(PyTypeObject) PyUnicode_Type;
0104 PyAPI_DATA(PyTypeObject) PyUnicodeIter_Type;
0105
0106 #define PyUnicode_Check(op) \
0107     PyType_FastSubclass(Py_TYPE(op), Py_TPFLAGS_UNICODE_SUBCLASS)
0108 #define PyUnicode_CheckExact(op) Py_IS_TYPE((op), &PyUnicode_Type)
0109
0110 /* --- Constants ---------------------------------------------------------- */
0111
0112 /* This Unicode character will be used as replacement character during
0113    decoding if the errors argument is set to "replace". Note: the
0114    Unicode character U+FFFD is the official REPLACEMENT CHARACTER in
0115    Unicode 3.0. */
0116
0117 #define Py_UNICODE_REPLACEMENT_CHARACTER ((Py_UCS4) 0xFFFD)
0118
0119 /* === Public API ========================================================= */
0120
0121 /* Create a Unicode object from size bytes of the UTF-8 encoded buffer u. */
0122 PyAPI_FUNC(PyObject*) PyUnicode_FromStringAndSize(
0123     const char *u,             /* UTF-8 encoded string */
0124     Py_ssize_t size            /* size of buffer */
0125     );
0126
0127 /* Create a Unicode object from the null-terminated UTF-8 encoded bytes
0128    pointed to by u.  The size is determined with strlen(). */
0129 PyAPI_FUNC(PyObject*) PyUnicode_FromString(
0130     const char *u              /* UTF-8 encoded string */
0131     );
0132
0133 #if !defined(Py_LIMITED_API) || Py_LIMITED_API+0 >= 0x03030000
0134 PyAPI_FUNC(PyObject*) PyUnicode_Substring(
0135     PyObject *str,
0136     Py_ssize_t start,
0137     Py_ssize_t end);
0138 #endif
0139
0140 #if !defined(Py_LIMITED_API) || Py_LIMITED_API+0 >= 0x03030000
0141 /* Copy the string into a UCS4 buffer including the null character if copy_null
0142    is set. Return NULL and raise an exception on error. Raise a SystemError if
0143    the buffer is smaller than the string. Return buffer on success.
0144
0145    buflen is the length of the buffer in (Py_UCS4) characters. */
0146 PyAPI_FUNC(Py_UCS4*) PyUnicode_AsUCS4(
0147     PyObject *unicode,
0148     Py_UCS4* buffer,
0149     Py_ssize_t buflen,
0150     int copy_null);
0151
0152 /* Copy the string into a newly allocated UCS4 buffer. The buffer is
0153    allocated using PyMem_Malloc; if this fails, NULL is returned with a
0154    memory error exception set. */
0155 PyAPI_FUNC(Py_UCS4*) PyUnicode_AsUCS4Copy(PyObject *unicode);
0156 #endif
0157
0158 #if !defined(Py_LIMITED_API) || Py_LIMITED_API+0 >= 0x03030000
0159 /* Get the length of the Unicode object. */
0160
0161 PyAPI_FUNC(Py_ssize_t) PyUnicode_GetLength(
0162     PyObject *unicode
0163 );
0164 #endif
0165
0166 #if !defined(Py_LIMITED_API) || Py_LIMITED_API+0 >= 0x03030000
0167 /* Read a character from the string. */
0168
0169 PyAPI_FUNC(Py_UCS4) PyUnicode_ReadChar(
0170     PyObject *unicode,
0171     Py_ssize_t index
0172     );
0173
0174 /* Write a character to the string. The string must have been created through
0175    PyUnicode_New, must not be shared, and must not have been hashed yet.
0176
0177    Return 0 on success, -1 on error. */
0178
0179 PyAPI_FUNC(int) PyUnicode_WriteChar(
0180     PyObject *unicode,
0181     Py_ssize_t index,
0182     Py_UCS4 character
0183     );
0184 #endif
0185
0186 /* Resize a Unicode object. The length is the number of codepoints.
0187
0188    *unicode is modified to point to the new (resized) object and 0
0189    returned on success.
0190
0191    Try to resize the string in place (which is usually faster than allocating
0192    a new string and copying characters), or create a new string.
0193
0194    Error handling is implemented as follows: an exception is set, -1
0195    is returned and *unicode left untouched.
0196
0197    WARNING: The function doesn't check string content, the result may not be a
0198             string in canonical representation. */
0199
0200 PyAPI_FUNC(int) PyUnicode_Resize(
0201     PyObject **unicode,         /* Pointer to the Unicode object */
0202     Py_ssize_t length           /* New length */
0203     );
0204
0205 /* Decode obj to a Unicode object.
0206
0207    bytes, bytearray and other bytes-like objects are decoded according to the
0208    given encoding and error handler. The encoding and error handler can be
0209    NULL to have the interface use UTF-8 and "strict".
0210
0211    All other objects (including Unicode objects) raise an exception.
0212
0213    The API returns NULL in case of an error. The caller is responsible
0214    for decref'ing the returned objects.
0215
0216 */
0217
0218 PyAPI_FUNC(PyObject*) PyUnicode_FromEncodedObject(
0219     PyObject *obj,              /* Object */
0220     const char *encoding,       /* encoding */
0221     const char *errors          /* error handling */
0222     );
0223
0224 /* Copy an instance of a Unicode subtype to a new true Unicode object if
0225    necessary. If obj is already a true Unicode object (not a subtype), return
0226    the reference with *incremented* refcount.
0227
0228    The API returns NULL in case of an error. The caller is responsible
0229    for decref'ing the returned objects.
0230
0231 */
0232
0233 PyAPI_FUNC(PyObject*) PyUnicode_FromObject(
0234     PyObject *obj      /* Object */
0235     );
0236
0237 PyAPI_FUNC(PyObject *) PyUnicode_FromFormatV(
0238     const char *format,   /* ASCII-encoded string  */
0239     va_list vargs
0240     );
0241 PyAPI_FUNC(PyObject *) PyUnicode_FromFormat(
0242     const char *format,   /* ASCII-encoded string  */
0243     ...
0244     );
0245
0246 PyAPI_FUNC(void) PyUnicode_InternInPlace(PyObject **);
0247 PyAPI_FUNC(PyObject *) PyUnicode_InternFromString(
0248     const char *u              /* UTF-8 encoded string */
0249     );
0250
0251 /* --- wchar_t support for platforms which support it --------------------- */
0252
0253 #ifdef HAVE_WCHAR_H
0254
0255 /* Create a Unicode Object from the wchar_t buffer w of the given
0256    size.
0257
0258    The buffer is copied into the new object. */
0259
0260 PyAPI_FUNC(PyObject*) PyUnicode_FromWideChar(
0261     const wchar_t *w,           /* wchar_t buffer */
0262     Py_ssize_t size             /* size of buffer */
0263     );
0264
0265 /* Copies the Unicode Object contents into the wchar_t buffer w.  At
0266    most size wchar_t characters are copied.
0267
0268    Note that the resulting wchar_t string may or may not be
0269    0-terminated.  It is the responsibility of the caller to make sure
0270    that the wchar_t string is 0-terminated in case this is required by
0271    the application.
0272
0273    Returns the number of wchar_t characters copied (excluding a
0274    possibly trailing 0-termination character) or -1 in case of an
0275    error. */
0276
0277 PyAPI_FUNC(Py_ssize_t) PyUnicode_AsWideChar(
0278     PyObject *unicode,          /* Unicode object */
0279     wchar_t *w,                 /* wchar_t buffer */
0280     Py_ssize_t size             /* size of buffer */
0281     );
0282
0283 /* Convert the Unicode object to a wide character string. The output string
0284    always ends with a null character. If size is not NULL, write the number of
0285    wide characters (excluding the null character) into *size.
0286
0287    Returns a buffer allocated by PyMem_Malloc() (use PyMem_Free() to free it)
0288    on success. On error, returns NULL and raises a MemoryError; *size is
0289    undefined. */
0290
0291 PyAPI_FUNC(wchar_t*) PyUnicode_AsWideCharString(
0292     PyObject *unicode,          /* Unicode object */
0293     Py_ssize_t *size            /* number of characters of the result */
0294     );
0295
0296 #endif
0297
0298 /* --- Unicode ordinals --------------------------------------------------- */
0299
0300 /* Create a Unicode Object from the given Unicode code point ordinal.
0301
0302    The ordinal must be in range(0x110000). A ValueError is
0303    raised in case it is not.
0304
0305 */
0306
0307 PyAPI_FUNC(PyObject*) PyUnicode_FromOrdinal(int ordinal);
0308
0309 /* === Builtin Codecs =====================================================
0310
0311    Many of these APIs take two arguments encoding and errors. These
0312    parameters encoding and errors have the same semantics as the ones
0313    of the builtin str() API.
0314
0315    Setting encoding to NULL causes the default encoding (UTF-8) to be used.
0316
0317    Error handling is set by errors which may also be set to NULL
0318    meaning to use the default handling defined for the codec. Default
0319    error handling for all builtin codecs is "strict" (ValueErrors are
0320    raised).
0321
0322    The codecs all use a similar interface. Only deviations from the
0323    generic ones are documented.
0324
0325 */
0326
0327 /* --- Manage the default encoding ---------------------------------------- */
0328
0329 /* Returns "utf-8".  */
0330 PyAPI_FUNC(const char*) PyUnicode_GetDefaultEncoding(void);
0331
0332 /* --- Generic Codecs ----------------------------------------------------- */
0333
0334 /* Create a Unicode object by decoding the encoded string s of the
0335    given size. */
0336
0337 PyAPI_FUNC(PyObject*) PyUnicode_Decode(
0338     const char *s,              /* encoded string */
0339     Py_ssize_t size,            /* size of buffer */
0340     const char *encoding,       /* encoding */
0341     const char *errors          /* error handling */
0342     );
0343
0344 /* Decode a Unicode object unicode and return the result as Python
0345    object.
0346
0347    This API is DEPRECATED. The only supported standard encoding is rot13.
0348    Use PyCodec_Decode() to decode with rot13 and non-standard codecs
0349    that decode from str. */
0350
0351 Py_DEPRECATED(3.6) PyAPI_FUNC(PyObject*) PyUnicode_AsDecodedObject(
0352     PyObject *unicode,          /* Unicode object */
0353     const char *encoding,       /* encoding */
0354     const char *errors          /* error handling */
0355     );
0356
0357 /* Decode a Unicode object unicode and return the result as Unicode
0358    object.
0359
0360    This API is DEPRECATED. The only supported standard encoding is rot13.
0361    Use PyCodec_Decode() to decode with rot13 and non-standard codecs
0362    that decode from str to str. */
0363
0364 Py_DEPRECATED(3.6) PyAPI_FUNC(PyObject*) PyUnicode_AsDecodedUnicode(
0365     PyObject *unicode,          /* Unicode object */
0366     const char *encoding,       /* encoding */
0367     const char *errors          /* error handling */
0368     );
0369
0370 /* Encodes a Unicode object and returns the result as Python
0371    object.
0372
0373    This API is DEPRECATED.  It is superseded by PyUnicode_AsEncodedString()
0374    since all standard encodings (except rot13) encode str to bytes.
0375    Use PyCodec_Encode() for encoding with rot13 and non-standard codecs
0376    that encode from str to non-bytes. */
0377
0378 Py_DEPRECATED(3.6) PyAPI_FUNC(PyObject*) PyUnicode_AsEncodedObject(
0379     PyObject *unicode,          /* Unicode object */
0380     const char *encoding,       /* encoding */
0381     const char *errors          /* error handling */
0382     );
0383
0384 /* Encodes a Unicode object and returns the result as Python bytes
0385    object. */
0386
0387 PyAPI_FUNC(PyObject*) PyUnicode_AsEncodedString(
0388     PyObject *unicode,          /* Unicode object */
0389     const char *encoding,       /* encoding */
0390     const char *errors          /* error handling */
0391     );
0392
0393 /* Encodes a Unicode object and returns the result as Unicode
0394    object.
0395
0396    This API is DEPRECATED.  The only supported standard encoding is rot13.
0397    Use PyCodec_Encode() to encode with rot13 and non-standard codecs
0398    that encode from str to str. */
0399
0400 Py_DEPRECATED(3.6) PyAPI_FUNC(PyObject*) PyUnicode_AsEncodedUnicode(
0401     PyObject *unicode,          /* Unicode object */
0402     const char *encoding,       /* encoding */
0403     const char *errors          /* error handling */
0404     );
0405
0406 /* Build an encoding map. */
0407
0408 PyAPI_FUNC(PyObject*) PyUnicode_BuildEncodingMap(
0409     PyObject* string            /* 256 character map */
0410    );
0411
0412 /* --- UTF-7 Codecs ------------------------------------------------------- */
0413
0414 PyAPI_FUNC(PyObject*) PyUnicode_DecodeUTF7(
0415     const char *string,         /* UTF-7 encoded string */
0416     Py_ssize_t length,          /* size of string */
0417     const char *errors          /* error handling */
0418     );
0419
0420 PyAPI_FUNC(PyObject*) PyUnicode_DecodeUTF7Stateful(
0421     const char *string,         /* UTF-7 encoded string */
0422     Py_ssize_t length,          /* size of string */
0423     const char *errors,         /* error handling */
0424     Py_ssize_t *consumed        /* bytes consumed */
0425     );
0426
0427 /* --- UTF-8 Codecs ------------------------------------------------------- */
0428
0429 PyAPI_FUNC(PyObject*) PyUnicode_DecodeUTF8(
0430     const char *string,         /* UTF-8 encoded string */
0431     Py_ssize_t length,          /* size of string */
0432     const char *errors          /* error handling */
0433     );
0434
0435 PyAPI_FUNC(PyObject*) PyUnicode_DecodeUTF8Stateful(
0436     const char *string,         /* UTF-8 encoded string */
0437     Py_ssize_t length,          /* size of string */
0438     const char *errors,         /* error handling */
0439     Py_ssize_t *consumed        /* bytes consumed */
0440     );
0441
0442 PyAPI_FUNC(PyObject*) PyUnicode_AsUTF8String(
0443     PyObject *unicode           /* Unicode object */
0444     );
0445
0446 /* Returns a pointer to the UTF-8 (default encoding) representation of the
0447    Unicode object unicode and stores the size of the encoded representation
0448    in bytes in *size.
0449
0450    In case of an error, *size is not set.
0451
0452    This function caches the UTF-8 encoded string in the unicodeobject
0453    and subsequent calls will return the same string.  The memory is released
0454    when the unicodeobject is deallocated.
0455 */
0456
0457 #if !defined(Py_LIMITED_API) || Py_LIMITED_API+0 >= 0x030A0000
0458 PyAPI_FUNC(const char *) PyUnicode_AsUTF8AndSize(
0459     PyObject *unicode,
0460     Py_ssize_t *size);
0461 #endif
0462
0463 /* --- UTF-32 Codecs ------------------------------------------------------ */
0464
0465 /* Decodes length bytes from a UTF-32 encoded buffer string and returns
0466    the corresponding Unicode object.
0467
0468    errors (if non-NULL) defines the error handling. It defaults
0469    to "strict".
0470
0471    If byteorder is non-NULL, the decoder starts decoding using the
0472    given byte order:
0473
0474     *byteorder == -1: little endian
0475     *byteorder == 0:  native order
0476     *byteorder == 1:  big endian
0477
0478    In native mode, the first four bytes of the stream are checked for a
0479    BOM mark. If found, the BOM mark is analysed, the byte order
0480    adjusted and the BOM skipped.  In the other modes, no BOM mark
0481    interpretation is done. After completion, *byteorder is set to the
0482    current byte order at the end of input data.
0483
0484    If byteorder is NULL, the codec starts in native order mode.
0485
0486 */
0487
0488 PyAPI_FUNC(PyObject*) PyUnicode_DecodeUTF32(
0489     const char *string,         /* UTF-32 encoded string */
0490     Py_ssize_t length,          /* size of string */
0491     const char *errors,         /* error handling */
0492     int *byteorder              /* pointer to byteorder to use
0493                                    0=native;-1=LE,1=BE; updated on
0494                                    exit */
0495     );
0496
0497 PyAPI_FUNC(PyObject*) PyUnicode_DecodeUTF32Stateful(
0498     const char *string,         /* UTF-32 encoded string */
0499     Py_ssize_t length,          /* size of string */
0500     const char *errors,         /* error handling */
0501     int *byteorder,             /* pointer to byteorder to use
0502                                    0=native;-1=LE,1=BE; updated on
0503                                    exit */
0504     Py_ssize_t *consumed        /* bytes consumed */
0505     );
0506
0507 /* Returns a Python bytes object using the UTF-32 encoding in native byte
0508    order. The data always starts with a BOM mark.  */
0509
0510 PyAPI_FUNC(PyObject*) PyUnicode_AsUTF32String(
0511     PyObject *unicode           /* Unicode object */
0512     );
0513
0514 /* Returns a Python string object holding the UTF-32 encoded value of
0515    the Unicode data.
0516
0517    If byteorder is not 0, output is written according to the following
0518    byte order:
0519
0520    byteorder == -1: little endian
0521    byteorder == 0:  native byte order (writes a BOM mark)
0522    byteorder == 1:  big endian
0523
0524    If byteorder is 0, the output string will always start with the
0525    Unicode BOM mark (U+FEFF). In the other two modes, no BOM mark is
0526    prepended.
0527    NOTE(review): no declaration follows this comment in this header.
0528 */
0529
0530 /* --- UTF-16 Codecs ------------------------------------------------------ */
0531
0532 /* Decodes length bytes from a UTF-16 encoded buffer string and returns
0533    the corresponding Unicode object.
0534
0535    errors (if non-NULL) defines the error handling. It defaults
0536    to "strict".
0537
0538    If byteorder is non-NULL, the decoder starts decoding using the
0539    given byte order:
0540
0541     *byteorder == -1: little endian
0542     *byteorder == 0:  native order
0543     *byteorder == 1:  big endian
0544
0545    In native mode, the first two bytes of the stream are checked for a
0546    BOM mark. If found, the BOM mark is analysed, the byte order
0547    adjusted and the BOM skipped.  In the other modes, no BOM mark
0548    interpretation is done. After completion, *byteorder is set to the
0549    current byte order at the end of input data.
0550
0551    If byteorder is NULL, the codec starts in native order mode.
0552
0553 */
0554
0555 PyAPI_FUNC(PyObject*) PyUnicode_DecodeUTF16(
0556     const char *string,         /* UTF-16 encoded string */
0557     Py_ssize_t length,          /* size of string */
0558     const char *errors,         /* error handling */
0559     int *byteorder              /* pointer to byteorder to use
0560                                    0=native;-1=LE,1=BE; updated on
0561                                    exit */
0562     );
0563
0564 PyAPI_FUNC(PyObject*) PyUnicode_DecodeUTF16Stateful(
0565     const char *string,         /* UTF-16 encoded string */
0566     Py_ssize_t length,          /* size of string */
0567     const char *errors,         /* error handling */
0568     int *byteorder,             /* pointer to byteorder to use
0569                                    0=native;-1=LE,1=BE; updated on
0570                                    exit */
0571     Py_ssize_t *consumed        /* bytes consumed */
0572     );
0573
0574 /* Returns a Python bytes object using the UTF-16 encoding in native byte
0575    order. The data always starts with a BOM mark.  */
0576
0577 PyAPI_FUNC(PyObject*) PyUnicode_AsUTF16String(
0578     PyObject *unicode           /* Unicode object */
0579     );
0580
0581 /* --- Unicode-Escape Codecs ---------------------------------------------- */
0582
0583 PyAPI_FUNC(PyObject*) PyUnicode_DecodeUnicodeEscape(
0584     const char *string,         /* Unicode-Escape encoded string */
0585     Py_ssize_t length,          /* size of string */
0586     const char *errors          /* error handling */
0587     );
0588
0589 PyAPI_FUNC(PyObject*) PyUnicode_AsUnicodeEscapeString(
0590     PyObject *unicode           /* Unicode object */
0591     );
0592
0593 /* --- Raw-Unicode-Escape Codecs ------------------------------------------ */
0594
0595 PyAPI_FUNC(PyObject*) PyUnicode_DecodeRawUnicodeEscape(
0596     const char *string,         /* Raw-Unicode-Escape encoded string */
0597     Py_ssize_t length,          /* size of string */
0598     const char *errors          /* error handling */
0599     );
0600
0601 PyAPI_FUNC(PyObject*) PyUnicode_AsRawUnicodeEscapeString(
0602     PyObject *unicode           /* Unicode object */
0603     );
0604
0605 /* --- Latin-1 Codecs -----------------------------------------------------
0606
0607    Note: Latin-1 corresponds to the first 256 Unicode ordinals. */
0608
0609 PyAPI_FUNC(PyObject*) PyUnicode_DecodeLatin1(
0610     const char *string,         /* Latin-1 encoded string */
0611     Py_ssize_t length,          /* size of string */
0612     const char *errors          /* error handling */
0613     );
0614
0615 PyAPI_FUNC(PyObject*) PyUnicode_AsLatin1String(
0616     PyObject *unicode           /* Unicode object */
0617     );
0618
0619 /* --- ASCII Codecs -------------------------------------------------------
0620
0621    Only 7-bit ASCII data is expected. All other codes generate errors.
0622
0623 */
0624
0625 PyAPI_FUNC(PyObject*) PyUnicode_DecodeASCII(
0626     const char *string,         /* ASCII encoded string */
0627     Py_ssize_t length,          /* size of string */
0628     const char *errors          /* error handling */
0629     );
0630
0631 PyAPI_FUNC(PyObject*) PyUnicode_AsASCIIString(
0632     PyObject *unicode           /* Unicode object */
0633     );
0634
0635 /* --- Character Map Codecs -----------------------------------------------
0636
0637    This codec uses mappings to encode and decode characters.
0638
0639    Decoding mappings must map byte ordinals (integers in the range from 0 to
0640    255) to Unicode strings, integers (which are then interpreted as Unicode
0641    ordinals) or None.  Unmapped data bytes (ones which cause a LookupError)
0642    as well as those mapped to None, 0xFFFE or '\ufffe' are treated as
0643    "undefined mapping" and cause an error.
0644
0645    Encoding mappings must map Unicode ordinal integers to bytes objects,
0646    integers in the range from 0 to 255 or None.  Unmapped character
0647    ordinals (ones which cause a LookupError) as well as those mapped to
0648    None are treated as "undefined mapping" and cause an error.
0649
0650 */
0651
0652 PyAPI_FUNC(PyObject*) PyUnicode_DecodeCharmap(
0653     const char *string,         /* Encoded string */
0654     Py_ssize_t length,          /* size of string */
0655     PyObject *mapping,          /* decoding mapping */
0656     const char *errors          /* error handling */
0657     );
0658
0659 PyAPI_FUNC(PyObject*) PyUnicode_AsCharmapString(
0660     PyObject *unicode,          /* Unicode object */
0661     PyObject *mapping           /* encoding mapping */
0662     );
0663
0664 /* --- MBCS codecs for Windows -------------------------------------------- */
0665
0666 #ifdef MS_WINDOWS
0667 PyAPI_FUNC(PyObject*) PyUnicode_DecodeMBCS(
0668     const char *string,         /* MBCS encoded string */
0669     Py_ssize_t length,          /* size of string */
0670     const char *errors          /* error handling */
0671     );
0672
0673 PyAPI_FUNC(PyObject*) PyUnicode_DecodeMBCSStateful(
0674     const char *string,         /* MBCS encoded string */
0675     Py_ssize_t length,          /* size of string */
0676     const char *errors,         /* error handling */
0677     Py_ssize_t *consumed        /* bytes consumed */
0678     );
0679
0680 #if !defined(Py_LIMITED_API) || Py_LIMITED_API+0 >= 0x03030000
0681 PyAPI_FUNC(PyObject*) PyUnicode_DecodeCodePageStateful(
0682     int code_page,              /* code page number */
0683     const char *string,         /* encoded string */
0684     Py_ssize_t length,          /* size of string */
0685     const char *errors,         /* error handling */
0686     Py_ssize_t *consumed        /* bytes consumed */
0687     );
0688 #endif
0689
0690 PyAPI_FUNC(PyObject*) PyUnicode_AsMBCSString(
0691     PyObject *unicode           /* Unicode object */
0692     );
0693
0694 #if !defined(Py_LIMITED_API) || Py_LIMITED_API+0 >= 0x03030000
0695 PyAPI_FUNC(PyObject*) PyUnicode_EncodeCodePage(
0696     int code_page,              /* code page number */
0697     PyObject *unicode,          /* Unicode object */
0698     const char *errors          /* error handling */
0699     );
0700 #endif
0701
0702 #endif /* MS_WINDOWS */
0703
0704 /* --- Locale encoding --------------------------------------------------- */
0705
0706 #if !defined(Py_LIMITED_API) || Py_LIMITED_API+0 >= 0x03030000
0707 /* Decode a string from the current locale encoding. The supported error
0708    handlers are "strict" and "surrogateescape" (PEP 383); the decoder is
0709    strict if *errors* is NULL. With "surrogateescape", undecodable bytes are
0710    escaped, and a byte sequence that can be decoded as a surrogate character
0711    is escaped using the 'surrogateescape' error handler
0712    instead of being decoded. *str* must end with a null character but cannot
0713    contain embedded null characters. */
0714
0715 PyAPI_FUNC(PyObject*) PyUnicode_DecodeLocaleAndSize(
0716     const char *str,
0717     Py_ssize_t len,
0718     const char *errors);
0719
0720 /* Similar to PyUnicode_DecodeLocaleAndSize(), but compute the string
0721    length using strlen(). */
0722
0723 PyAPI_FUNC(PyObject*) PyUnicode_DecodeLocale(
0724     const char *str,
0725     const char *errors);
0726
0727 /* Encode a Unicode object to the current locale encoding. The encoder is
0728    strict if *errors* is NULL or "strict"; with "surrogateescape" the
0729    "surrogateescape" error handler is used. Return a bytes object. The string
0730    cannot contain embedded null characters. */
0731
0732 PyAPI_FUNC(PyObject*) PyUnicode_EncodeLocale(
0733     PyObject *unicode,
0734     const char *errors
0735     );
0736 #endif
0737
0738 /* --- File system encoding ---------------------------------------------- */
0739
0740 /* ParseTuple converter: encode str objects to bytes using
0741    PyUnicode_EncodeFSDefault(); bytes objects are output as-is. */
0742
0743 PyAPI_FUNC(int) PyUnicode_FSConverter(PyObject*, void*);
0744
0745 /* ParseTuple converter: decode bytes objects to unicode using
0746    PyUnicode_DecodeFSDefaultAndSize(); str objects are output as-is. */
0747
0748 PyAPI_FUNC(int) PyUnicode_FSDecoder(PyObject*, void*);
0749
0750 /* Decode a null-terminated string from the Python filesystem encoding
0751    and error handler.
0752
0753    If the string length is known, use PyUnicode_DecodeFSDefaultAndSize(). */
0754 PyAPI_FUNC(PyObject*) PyUnicode_DecodeFSDefault(
0755     const char *s               /* encoded string */
0756     );
0757
0758 /* Decode a string from the Python filesystem encoding and error handler. */
0759 PyAPI_FUNC(PyObject*) PyUnicode_DecodeFSDefaultAndSize(
0760     const char *s,               /* encoded string */
0761     Py_ssize_t size              /* size */
0762     );
0763
0764 /* Encode a Unicode object to the Python filesystem encoding and error handler.
0765    Return bytes. */
0766 PyAPI_FUNC(PyObject*) PyUnicode_EncodeFSDefault(
0767     PyObject *unicode
0768     );
0769
0770 /* --- Methods & Slots ----------------------------------------------------
0771
0772    These are capable of handling Unicode objects and strings on input
0773    (we refer to them as strings in the descriptions) and return
0774    Unicode objects or integers as appropriate. */
0775
0776 /* Concat two strings giving a new Unicode string. */
0777
0778 PyAPI_FUNC(PyObject*) PyUnicode_Concat(
0779     PyObject *left,             /* Left string */
0780     PyObject *right             /* Right string */
0781     );
0782
0783 /* Concat two strings and put the result in *pleft
0784    (sets *pleft to NULL on error) */
0785
0786 PyAPI_FUNC(void) PyUnicode_Append(
0787     PyObject **pleft,           /* Pointer to left string */
0788     PyObject *right             /* Right string */
0789     );
0790
0791 /* Concat two strings, put the result in *pleft and drop the right object
0792    (sets *pleft to NULL on error) */
0793
0794 PyAPI_FUNC(void) PyUnicode_AppendAndDel(
0795     PyObject **pleft,           /* Pointer to left string */
0796     PyObject *right             /* Right string */
0797     );
0798
0799 /* Split a string, giving a list of Unicode strings.
0800
0801    If sep is NULL, splitting is done at all whitespace substrings.
0802    Otherwise, splits occur at the given separator.
0803
0804    At most maxsplit splits are done. If maxsplit is negative, no limit is set.
0805
0806    Separators are not included in the resulting list.
0807
0808 */
0809
0810 PyAPI_FUNC(PyObject*) PyUnicode_Split(
0811     PyObject *s,                /* String to split */
0812     PyObject *sep,              /* String separator, or NULL for whitespace */
0813     Py_ssize_t maxsplit         /* Maxsplit count; negative = no limit */
0814     );
0815
0816 /* Split a string at line breaks, giving a list of Unicode strings.
0817
0818    CRLF is considered to be one line break. Line breaks are not
0819    included in the resulting list unless keepends is true. */
0820
0821 PyAPI_FUNC(PyObject*) PyUnicode_Splitlines(
0822     PyObject *s,                /* String to split */
0823     int keepends                /* If true, line end markers are included */
0824     );
0825
0826 /* Partition the string s using the separator sep. */
0827
0828 PyAPI_FUNC(PyObject*) PyUnicode_Partition(
0829     PyObject *s,                /* String to partition */
0830     PyObject *sep               /* String separator */
0831     );
0832
0833 /* Partition the string s using the separator sep, searching from the end
0834    of the string. */
0835
0836 PyAPI_FUNC(PyObject*) PyUnicode_RPartition(
0837     PyObject *s,                /* String to partition */
0838     PyObject *sep               /* String separator */
0839     );
0840
0841 /* Split a string from its end, giving a list of Unicode strings.
0842
0843    If sep is NULL, splitting is done at all whitespace substrings.
0844    Otherwise, splits occur at the given separator.
0845
0846    At most maxsplit splits are done; if maxsplit is negative, no limit
0847    is set. Unlike PyUnicode_Split, PyUnicode_RSplit splits from the end
0848    of the string.
0849
0850    Separators are not included in the resulting list.
0851
0852 */
0853
0854 PyAPI_FUNC(PyObject*) PyUnicode_RSplit(
0855     PyObject *s,                /* String to split */
0856     PyObject *sep,              /* String separator, or NULL for whitespace */
0857     Py_ssize_t maxsplit         /* Maxsplit count; negative = no limit */
0858     );
0859
0860 /* Translate a string by applying a character mapping table to it and
0861    return the resulting Unicode object.
0862
0863    The mapping table must map Unicode ordinal integers to Unicode strings,
0864    Unicode ordinal integers or None (None causes deletion of the character).
0865
0866    Mapping tables may be dictionaries or sequences. Unmapped character
0867    ordinals (ones which cause a LookupError) are left untouched and
0868    are copied as-is.
0869
0870 */
0871
0872 PyAPI_FUNC(PyObject *) PyUnicode_Translate(
0873     PyObject *str,              /* String to translate */
0874     PyObject *table,            /* Mapping table (dictionary or sequence) */
0875     const char *errors          /* error handling */
0876     );
0877
0878 /* Join a sequence of strings using the given separator and return
0879    the resulting Unicode string. */
0880
0881 PyAPI_FUNC(PyObject*) PyUnicode_Join(
0882     PyObject *separator,        /* Separator string */
0883     PyObject *seq               /* Sequence of strings to join */
0884     );
0885
0886 /* Return 1 if substr matches str[start:end] at the given tail end
0887    (direction: -1 for a prefix match, +1 for a suffix match), 0 otherwise. */
0888
0889 PyAPI_FUNC(Py_ssize_t) PyUnicode_Tailmatch(
0890     PyObject *str,              /* String */
0891     PyObject *substr,           /* Prefix or Suffix string */
0892     Py_ssize_t start,           /* Start index */
0893     Py_ssize_t end,             /* Stop index */
0894     int direction               /* Tail end: -1 prefix, +1 suffix */
0895     );
0896
0897 /* Return the first position of substr in str[start:end] using the
0898    given search direction, or -1 if not found. Return -2 if an error
0899    occurred and an exception is set. */
0900
0901 PyAPI_FUNC(Py_ssize_t) PyUnicode_Find(
0902     PyObject *str,              /* String */
0903     PyObject *substr,           /* Substring to find */
0904     Py_ssize_t start,           /* Start index */
0905     Py_ssize_t end,             /* Stop index */
0906     int direction               /* Find direction: +1 forward, -1 backward */
0907     );
0908
0909 #if !defined(Py_LIMITED_API) || Py_LIMITED_API+0 >= 0x03030000
0910 /* Like PyUnicode_Find, but search for a single character only. */
0911 PyAPI_FUNC(Py_ssize_t) PyUnicode_FindChar(
0912     PyObject *str,              /* String */
0913     Py_UCS4 ch,                 /* Character to find */
0914     Py_ssize_t start,           /* Start index */
0915     Py_ssize_t end,             /* Stop index */
0916     int direction               /* Find direction, as for PyUnicode_Find */
0917     );
0918 #endif /* !Py_LIMITED_API || Py_LIMITED_API >= 0x03030000 */
0919
0920 /* Return the number of occurrences of substr in str[start:end]. */
0921
0922 PyAPI_FUNC(Py_ssize_t) PyUnicode_Count(
0923     PyObject *str,              /* String */
0924     PyObject *substr,           /* Substring to count */
0925     Py_ssize_t start,           /* Start index */
0926     Py_ssize_t end              /* Stop index */
0927     );
0928
0929 /* Replace at most maxcount occurrences of substr in str with replstr
0930    and return the resulting Unicode object. */
0931
0932 PyAPI_FUNC(PyObject *) PyUnicode_Replace(
0933     PyObject *str,              /* String */
0934     PyObject *substr,           /* Substring to find */
0935     PyObject *replstr,          /* Replacement string */
0936     Py_ssize_t maxcount         /* Max. number of replacements to apply;
0937                                    -1 = all */
0938     );
0939
0940 /* Compare two strings and return -1, 0, 1 for less than, equal, and
0941    greater than, respectively.
0942    Raise an exception and return -1 on error. */
0943
0944 PyAPI_FUNC(int) PyUnicode_Compare(
0945     PyObject *left,             /* Left string */
0946     PyObject *right             /* Right string */
0947     );
0948
0949 /* Compare a Unicode object with a C string and return -1, 0, 1 for less than,
0950    equal, and greater than, respectively.  It is best to pass only
0951    ASCII-encoded strings, but the function interprets the input string as
0952    ISO-8859-1 if it contains non-ASCII characters.
0953    This function does not raise exceptions. */
0954
0955 PyAPI_FUNC(int) PyUnicode_CompareWithASCIIString(
0956     PyObject *left,             /* Unicode object */
0957     const char *right           /* ASCII-encoded string */
0958     );
0959
0960 #if !defined(Py_LIMITED_API) || Py_LIMITED_API+0 >= 0x030D0000
0961 /* Compare a Unicode object with a UTF-8 encoded C string.
0962    Return 1 if they are equal, or 0 otherwise.
0963    This function does not raise exceptions. */
0964
0965 PyAPI_FUNC(int) PyUnicode_EqualToUTF8(PyObject *, const char *);
0966 PyAPI_FUNC(int) PyUnicode_EqualToUTF8AndSize(PyObject *, const char *, Py_ssize_t);
0967 #endif /* !Py_LIMITED_API || Py_LIMITED_API >= 0x030D0000 */
0968
0969 /* Rich-compare two strings using the operation op and return one of:
0970
0971    - NULL if an exception was raised
0972    - Py_True or Py_False for successful comparisons
0973    - Py_NotImplemented if the type combination is unknown
0974
0975    Possible values for op:
0976
0977      Py_GT, Py_GE, Py_EQ, Py_NE, Py_LT, Py_LE
0978
0979 */
0980
0981 PyAPI_FUNC(PyObject *) PyUnicode_RichCompare(
0982     PyObject *left,             /* Left string */
0983     PyObject *right,            /* Right string */
0984     int op                      /* Py_LT, Py_LE, Py_EQ, Py_NE, Py_GT or Py_GE */
0985     );
0986
0987 /* Apply args (an argument tuple or dictionary) to the format string and
0988    return the resulting Unicode string. */
0989
0990 PyAPI_FUNC(PyObject *) PyUnicode_Format(
0991     PyObject *format,           /* Format string */
0992     PyObject *args              /* Argument tuple or dictionary */
0993     );
0994
0995 /* Check whether element is contained in container and return 1 or 0
0996    accordingly.
0997
0998    element has to coerce to a one-element Unicode string. -1 is
0999    returned in case of an error. */
1000
1001 PyAPI_FUNC(int) PyUnicode_Contains(
1002     PyObject *container,        /* Container string */
1003     PyObject *element           /* Element string */
1004     );
1005
1006 /* Check whether the string s is a valid identifier. */
1007
1008 PyAPI_FUNC(int) PyUnicode_IsIdentifier(PyObject *s);
1009
1010 /* === Characters Type APIs =============================================== */
1011
1012 #ifndef Py_LIMITED_API  /* pulled in only when the limited API is not requested */
1013 #  define Py_CPYTHON_UNICODEOBJECT_H  /* defined only for the include below */
1014 #  include "cpython/unicodeobject.h"
1015 #  undef Py_CPYTHON_UNICODEOBJECT_H
1016 #endif /* !Py_LIMITED_API */
1017
1018 #ifdef __cplusplus
1019 }
1020 #endif
1021 #endif /* !Py_UNICODEOBJECT_H */