|
|
|||
File indexing completed on 2025-11-19 09:50:55
0001 #ifndef Py_UNICODEOBJECT_H 0002 #define Py_UNICODEOBJECT_H 0003 0004 /* 0005 0006 Unicode implementation based on original code by Fredrik Lundh, 0007 modified by Marc-Andre Lemburg (mal@lemburg.com) according to the 0008 Unicode Integration Proposal. (See 0009 http://www.egenix.com/files/python/unicode-proposal.txt). 0010 0011 Copyright (c) Corporation for National Research Initiatives. 0012 0013 0014 Original header: 0015 -------------------------------------------------------------------- 0016 0017 * Yet another Unicode string type for Python. This type supports the 0018 * 16-bit Basic Multilingual Plane (BMP) only. 0019 * 0020 * Written by Fredrik Lundh, January 1999. 0021 * 0022 * Copyright (c) 1999 by Secret Labs AB. 0023 * Copyright (c) 1999 by Fredrik Lundh. 0024 * 0025 * fredrik@pythonware.com 0026 * http://www.pythonware.com 0027 * 0028 * -------------------------------------------------------------------- 0029 * This Unicode String Type is 0030 * 0031 * Copyright (c) 1999 by Secret Labs AB 0032 * Copyright (c) 1999 by Fredrik Lundh 0033 * 0034 * By obtaining, using, and/or copying this software and/or its 0035 * associated documentation, you agree that you have read, understood, 0036 * and will comply with the following terms and conditions: 0037 * 0038 * Permission to use, copy, modify, and distribute this software and its 0039 * associated documentation for any purpose and without fee is hereby 0040 * granted, provided that the above copyright notice appears in all 0041 * copies, and that both that copyright notice and this permission notice 0042 * appear in supporting documentation, and that the name of Secret Labs 0043 * AB or the author not be used in advertising or publicity pertaining to 0044 * distribution of the software without specific, written prior 0045 * permission. 
0046 * 0047 * SECRET LABS AB AND THE AUTHOR DISCLAIMS ALL WARRANTIES WITH REGARD TO 0048 * THIS SOFTWARE, INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY AND 0049 * FITNESS. IN NO EVENT SHALL SECRET LABS AB OR THE AUTHOR BE LIABLE FOR 0050 * ANY SPECIAL, INDIRECT OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES 0051 * WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN 0052 * ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT 0053 * OF OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE. 0054 * -------------------------------------------------------------------- */ 0055 0056 /* === Internal API ======================================================= */ 0057 0058 /* --- Internal Unicode Format -------------------------------------------- */ 0059 0060 /* Python 3.x requires unicode */ 0061 #define Py_USING_UNICODE 0062 0063 #ifndef SIZEOF_WCHAR_T 0064 #error Must define SIZEOF_WCHAR_T 0065 #endif 0066 0067 #define Py_UNICODE_SIZE SIZEOF_WCHAR_T 0068 0069 /* If wchar_t can be used for UCS-4 storage, set Py_UNICODE_WIDE. 0070 Otherwise, Unicode strings are stored as UCS-2 (with limited support 0071 for UTF-16) */ 0072 0073 #if Py_UNICODE_SIZE >= 4 0074 #define Py_UNICODE_WIDE 0075 #endif 0076 0077 /* Set these flags if the platform has "wchar.h" and the 0078 wchar_t type is a 16-bit unsigned type */ 0079 /* #define HAVE_WCHAR_H */ 0080 /* #define HAVE_USABLE_WCHAR_T */ 0081 0082 /* If the compiler provides a wchar_t type we try to support it 0083 through the interface functions PyUnicode_FromWideChar(), 0084 PyUnicode_AsWideChar() and PyUnicode_AsWideCharString(). */ 0085 0086 #ifdef HAVE_USABLE_WCHAR_T 0087 # ifndef HAVE_WCHAR_H 0088 # define HAVE_WCHAR_H 0089 # endif 0090 #endif 0091 0092 /* Py_UCS4 and Py_UCS2 are typedefs for the respective 0093 unicode representations. 
*/ 0094 typedef uint32_t Py_UCS4; 0095 typedef uint16_t Py_UCS2; 0096 typedef uint8_t Py_UCS1; 0097 0098 #ifdef __cplusplus 0099 extern "C" { 0100 #endif 0101 0102 0103 PyAPI_DATA(PyTypeObject) PyUnicode_Type; 0104 PyAPI_DATA(PyTypeObject) PyUnicodeIter_Type; 0105 0106 #define PyUnicode_Check(op) \ 0107 PyType_FastSubclass(Py_TYPE(op), Py_TPFLAGS_UNICODE_SUBCLASS) 0108 #define PyUnicode_CheckExact(op) Py_IS_TYPE((op), &PyUnicode_Type) 0109 0110 /* --- Constants ---------------------------------------------------------- */ 0111 0112 /* This Unicode character will be used as replacement character during 0113 decoding if the errors argument is set to "replace". Note: the 0114 Unicode character U+FFFD is the official REPLACEMENT CHARACTER in 0115 Unicode 3.0. */ 0116 0117 #define Py_UNICODE_REPLACEMENT_CHARACTER ((Py_UCS4) 0xFFFD) 0118 0119 /* === Public API ========================================================= */ 0120 0121 /* Similar to PyUnicode_FromUnicode(), but u points to UTF-8 encoded bytes */ 0122 PyAPI_FUNC(PyObject*) PyUnicode_FromStringAndSize( 0123 const char *u, /* UTF-8 encoded string */ 0124 Py_ssize_t size /* size of buffer */ 0125 ); 0126 0127 /* Similar to PyUnicode_FromUnicode(), but u points to null-terminated 0128 UTF-8 encoded bytes. The size is determined with strlen(). */ 0129 PyAPI_FUNC(PyObject*) PyUnicode_FromString( 0130 const char *u /* UTF-8 encoded string */ 0131 ); 0132 0133 #if !defined(Py_LIMITED_API) || Py_LIMITED_API+0 >= 0x03030000 0134 PyAPI_FUNC(PyObject*) PyUnicode_Substring( 0135 PyObject *str, 0136 Py_ssize_t start, 0137 Py_ssize_t end); 0138 #endif 0139 0140 #if !defined(Py_LIMITED_API) || Py_LIMITED_API+0 >= 0x03030000 0141 /* Copy the string into a UCS4 buffer including the null character if copy_null 0142 is set. Return NULL and raise an exception on error. Raise a SystemError if 0143 the buffer is smaller than the string. Return buffer on success. 0144 0145 buflen is the length of the buffer in (Py_UCS4) characters. 
*/ 0146 PyAPI_FUNC(Py_UCS4*) PyUnicode_AsUCS4( 0147 PyObject *unicode, 0148 Py_UCS4* buffer, 0149 Py_ssize_t buflen, 0150 int copy_null); 0151 0152 /* Copy the string into a UCS4 buffer. A new buffer is allocated using 0153 * PyMem_Malloc; if this fails, NULL is returned with a memory error 0154 exception set. */ 0155 PyAPI_FUNC(Py_UCS4*) PyUnicode_AsUCS4Copy(PyObject *unicode); 0156 #endif 0157 0158 #if !defined(Py_LIMITED_API) || Py_LIMITED_API+0 >= 0x03030000 0159 /* Get the length of the Unicode object. */ 0160 0161 PyAPI_FUNC(Py_ssize_t) PyUnicode_GetLength( 0162 PyObject *unicode 0163 ); 0164 #endif 0165 0166 #if !defined(Py_LIMITED_API) || Py_LIMITED_API+0 >= 0x03030000 0167 /* Read a character from the string. */ 0168 0169 PyAPI_FUNC(Py_UCS4) PyUnicode_ReadChar( 0170 PyObject *unicode, 0171 Py_ssize_t index 0172 ); 0173 0174 /* Write a character to the string. The string must have been created through 0175 PyUnicode_New, must not be shared, and must not have been hashed yet. 0176 0177 Return 0 on success, -1 on error. */ 0178 0179 PyAPI_FUNC(int) PyUnicode_WriteChar( 0180 PyObject *unicode, 0181 Py_ssize_t index, 0182 Py_UCS4 character 0183 ); 0184 #endif 0185 0186 /* Resize a Unicode object. The length is the number of codepoints. 0187 0188 *unicode is modified to point to the new (resized) object and 0 0189 returned on success. 0190 0191 Try to resize the string in place (which is usually faster than allocating 0192 a new string and copy characters), or create a new string. 0193 0194 Error handling is implemented as follows: an exception is set, -1 0195 is returned and *unicode left untouched. 0196 0197 WARNING: The function doesn't check string content, the result may not be a 0198 string in canonical representation. */ 0199 0200 PyAPI_FUNC(int) PyUnicode_Resize( 0201 PyObject **unicode, /* Pointer to the Unicode object */ 0202 Py_ssize_t length /* New length */ 0203 ); 0204 0205 /* Decode obj to a Unicode object. 
0206 0207 bytes, bytearray and other bytes-like objects are decoded according to the 0208 given encoding and error handler. The encoding and error handler can be 0209 NULL to have the interface use UTF-8 and "strict". 0210 0211 All other objects (including Unicode objects) raise an exception. 0212 0213 The API returns NULL in case of an error. The caller is responsible 0214 for decref'ing the returned objects. 0215 0216 */ 0217 0218 PyAPI_FUNC(PyObject*) PyUnicode_FromEncodedObject( 0219 PyObject *obj, /* Object */ 0220 const char *encoding, /* encoding */ 0221 const char *errors /* error handling */ 0222 ); 0223 0224 /* Copy an instance of a Unicode subtype to a new true Unicode object if 0225 necessary. If obj is already a true Unicode object (not a subtype), return 0226 the reference with *incremented* refcount. 0227 0228 The API returns NULL in case of an error. The caller is responsible 0229 for decref'ing the returned objects. 0230 0231 */ 0232 0233 PyAPI_FUNC(PyObject*) PyUnicode_FromObject( 0234 PyObject *obj /* Object */ 0235 ); 0236 0237 PyAPI_FUNC(PyObject *) PyUnicode_FromFormatV( 0238 const char *format, /* ASCII-encoded string */ 0239 va_list vargs 0240 ); 0241 PyAPI_FUNC(PyObject *) PyUnicode_FromFormat( 0242 const char *format, /* ASCII-encoded string */ 0243 ... 0244 ); 0245 0246 PyAPI_FUNC(void) PyUnicode_InternInPlace(PyObject **); 0247 PyAPI_FUNC(PyObject *) PyUnicode_InternFromString( 0248 const char *u /* UTF-8 encoded string */ 0249 ); 0250 0251 /* --- wchar_t support for platforms which support it --------------------- */ 0252 0253 #ifdef HAVE_WCHAR_H 0254 0255 /* Create a Unicode Object from the wchar_t buffer w of the given 0256 size. 0257 0258 The buffer is copied into the new object. */ 0259 0260 PyAPI_FUNC(PyObject*) PyUnicode_FromWideChar( 0261 const wchar_t *w, /* wchar_t buffer */ 0262 Py_ssize_t size /* size of buffer */ 0263 ); 0264 0265 /* Copies the Unicode Object contents into the wchar_t buffer w. 
At 0266 most size wchar_t characters are copied. 0267 0268 Note that the resulting wchar_t string may or may not be 0269 0-terminated. It is the responsibility of the caller to make sure 0270 that the wchar_t string is 0-terminated in case this is required by 0271 the application. 0272 0273 Returns the number of wchar_t characters copied (excluding a 0274 possibly trailing 0-termination character) or -1 in case of an 0275 error. */ 0276 0277 PyAPI_FUNC(Py_ssize_t) PyUnicode_AsWideChar( 0278 PyObject *unicode, /* Unicode object */ 0279 wchar_t *w, /* wchar_t buffer */ 0280 Py_ssize_t size /* size of buffer */ 0281 ); 0282 0283 /* Convert the Unicode object to a wide character string. The output string 0284 always ends with a nul character. If size is not NULL, write the number of 0285 wide characters (excluding the null character) into *size. 0286 0287 Returns a buffer allocated by PyMem_Malloc() (use PyMem_Free() to free it) 0288 on success. On error, returns NULL, *size is undefined and raises a 0289 MemoryError. */ 0290 0291 PyAPI_FUNC(wchar_t*) PyUnicode_AsWideCharString( 0292 PyObject *unicode, /* Unicode object */ 0293 Py_ssize_t *size /* number of characters of the result */ 0294 ); 0295 0296 #endif 0297 0298 /* --- Unicode ordinals --------------------------------------------------- */ 0299 0300 /* Create a Unicode Object from the given Unicode code point ordinal. 0301 0302 The ordinal must be in range(0x110000). A ValueError is 0303 raised in case it is not. 0304 0305 */ 0306 0307 PyAPI_FUNC(PyObject*) PyUnicode_FromOrdinal(int ordinal); 0308 0309 /* === Builtin Codecs ===================================================== 0310 0311 Many of these APIs take two arguments encoding and errors. These 0312 parameters encoding and errors have the same semantics as the ones 0313 of the builtin str() API. 0314 0315 Setting encoding to NULL causes the default encoding (UTF-8) to be used. 
0316 0317 Error handling is set by errors which may also be set to NULL 0318 meaning to use the default handling defined for the codec. Default 0319 error handling for all builtin codecs is "strict" (ValueErrors are 0320 raised). 0321 0322 The codecs all use a similar interface. Only deviations from the 0323 generic ones are documented. 0324 0325 */ 0326 0327 /* --- Manage the default encoding ---------------------------------------- */ 0328 0329 /* Returns "utf-8". */ 0330 PyAPI_FUNC(const char*) PyUnicode_GetDefaultEncoding(void); 0331 0332 /* --- Generic Codecs ----------------------------------------------------- */ 0333 0334 /* Create a Unicode object by decoding the encoded string s of the 0335 given size. */ 0336 0337 PyAPI_FUNC(PyObject*) PyUnicode_Decode( 0338 const char *s, /* encoded string */ 0339 Py_ssize_t size, /* size of buffer */ 0340 const char *encoding, /* encoding */ 0341 const char *errors /* error handling */ 0342 ); 0343 0344 /* Decode a Unicode object unicode and return the result as Python 0345 object. 0346 0347 This API is DEPRECATED. The only supported standard encoding is rot13. 0348 Use PyCodec_Decode() to decode with rot13 and non-standard codecs 0349 that decode from str. */ 0350 0351 Py_DEPRECATED(3.6) PyAPI_FUNC(PyObject*) PyUnicode_AsDecodedObject( 0352 PyObject *unicode, /* Unicode object */ 0353 const char *encoding, /* encoding */ 0354 const char *errors /* error handling */ 0355 ); 0356 0357 /* Decode a Unicode object unicode and return the result as Unicode 0358 object. 0359 0360 This API is DEPRECATED. The only supported standard encoding is rot13. 0361 Use PyCodec_Decode() to decode with rot13 and non-standard codecs 0362 that decode from str to str. 
*/ 0363 0364 Py_DEPRECATED(3.6) PyAPI_FUNC(PyObject*) PyUnicode_AsDecodedUnicode( 0365 PyObject *unicode, /* Unicode object */ 0366 const char *encoding, /* encoding */ 0367 const char *errors /* error handling */ 0368 ); 0369 0370 /* Encodes a Unicode object and returns the result as Python 0371 object. 0372 0373 This API is DEPRECATED. It is superseded by PyUnicode_AsEncodedString() 0374 since all standard encodings (except rot13) encode str to bytes. 0375 Use PyCodec_Encode() for encoding with rot13 and non-standard codecs 0376 that encode from str to non-bytes. */ 0377 0378 Py_DEPRECATED(3.6) PyAPI_FUNC(PyObject*) PyUnicode_AsEncodedObject( 0379 PyObject *unicode, /* Unicode object */ 0380 const char *encoding, /* encoding */ 0381 const char *errors /* error handling */ 0382 ); 0383 0384 /* Encodes a Unicode object and returns the result as Python string 0385 object. */ 0386 0387 PyAPI_FUNC(PyObject*) PyUnicode_AsEncodedString( 0388 PyObject *unicode, /* Unicode object */ 0389 const char *encoding, /* encoding */ 0390 const char *errors /* error handling */ 0391 ); 0392 0393 /* Encodes a Unicode object and returns the result as Unicode 0394 object. 0395 0396 This API is DEPRECATED. The only supported standard encoding is rot13. 0397 Use PyCodec_Encode() to encode with rot13 and non-standard codecs 0398 that encode from str to str. */ 0399 0400 Py_DEPRECATED(3.6) PyAPI_FUNC(PyObject*) PyUnicode_AsEncodedUnicode( 0401 PyObject *unicode, /* Unicode object */ 0402 const char *encoding, /* encoding */ 0403 const char *errors /* error handling */ 0404 ); 0405 0406 /* Build an encoding map. 
*/ 0407 0408 PyAPI_FUNC(PyObject*) PyUnicode_BuildEncodingMap( 0409 PyObject* string /* 256 character map */ 0410 ); 0411 0412 /* --- UTF-7 Codecs ------------------------------------------------------- */ 0413 0414 PyAPI_FUNC(PyObject*) PyUnicode_DecodeUTF7( 0415 const char *string, /* UTF-7 encoded string */ 0416 Py_ssize_t length, /* size of string */ 0417 const char *errors /* error handling */ 0418 ); 0419 0420 PyAPI_FUNC(PyObject*) PyUnicode_DecodeUTF7Stateful( 0421 const char *string, /* UTF-7 encoded string */ 0422 Py_ssize_t length, /* size of string */ 0423 const char *errors, /* error handling */ 0424 Py_ssize_t *consumed /* bytes consumed */ 0425 ); 0426 0427 /* --- UTF-8 Codecs ------------------------------------------------------- */ 0428 0429 PyAPI_FUNC(PyObject*) PyUnicode_DecodeUTF8( 0430 const char *string, /* UTF-8 encoded string */ 0431 Py_ssize_t length, /* size of string */ 0432 const char *errors /* error handling */ 0433 ); 0434 0435 PyAPI_FUNC(PyObject*) PyUnicode_DecodeUTF8Stateful( 0436 const char *string, /* UTF-8 encoded string */ 0437 Py_ssize_t length, /* size of string */ 0438 const char *errors, /* error handling */ 0439 Py_ssize_t *consumed /* bytes consumed */ 0440 ); 0441 0442 PyAPI_FUNC(PyObject*) PyUnicode_AsUTF8String( 0443 PyObject *unicode /* Unicode object */ 0444 ); 0445 0446 /* Returns a pointer to the default encoding (UTF-8) of the 0447 Unicode object unicode and the size of the encoded representation 0448 in bytes stored in *size. 0449 0450 In case of an error, no *size is set. 0451 0452 This function caches the UTF-8 encoded string in the unicodeobject 0453 and subsequent calls will return the same string. The memory is released 0454 when the unicodeobject is deallocated. 
0455 */ 0456 0457 #if !defined(Py_LIMITED_API) || Py_LIMITED_API+0 >= 0x030A0000 0458 PyAPI_FUNC(const char *) PyUnicode_AsUTF8AndSize( 0459 PyObject *unicode, 0460 Py_ssize_t *size); 0461 #endif 0462 0463 /* --- UTF-32 Codecs ------------------------------------------------------ */ 0464 0465 /* Decodes length bytes from a UTF-32 encoded buffer string and returns 0466 the corresponding Unicode object. 0467 0468 errors (if non-NULL) defines the error handling. It defaults 0469 to "strict". 0470 0471 If byteorder is non-NULL, the decoder starts decoding using the 0472 given byte order: 0473 0474 *byteorder == -1: little endian 0475 *byteorder == 0: native order 0476 *byteorder == 1: big endian 0477 0478 In native mode, the first four bytes of the stream are checked for a 0479 BOM mark. If found, the BOM mark is analysed, the byte order 0480 adjusted and the BOM skipped. In the other modes, no BOM mark 0481 interpretation is done. After completion, *byteorder is set to the 0482 current byte order at the end of input data. 0483 0484 If byteorder is NULL, the codec starts in native order mode. 0485 0486 */ 0487 0488 PyAPI_FUNC(PyObject*) PyUnicode_DecodeUTF32( 0489 const char *string, /* UTF-32 encoded string */ 0490 Py_ssize_t length, /* size of string */ 0491 const char *errors, /* error handling */ 0492 int *byteorder /* pointer to byteorder to use 0493 0=native;-1=LE,1=BE; updated on 0494 exit */ 0495 ); 0496 0497 PyAPI_FUNC(PyObject*) PyUnicode_DecodeUTF32Stateful( 0498 const char *string, /* UTF-32 encoded string */ 0499 Py_ssize_t length, /* size of string */ 0500 const char *errors, /* error handling */ 0501 int *byteorder, /* pointer to byteorder to use 0502 0=native;-1=LE,1=BE; updated on 0503 exit */ 0504 Py_ssize_t *consumed /* bytes consumed */ 0505 ); 0506 0507 /* Returns a Python string using the UTF-32 encoding in native byte 0508 order. The string always starts with a BOM mark. 
*/ 0509 0510 PyAPI_FUNC(PyObject*) PyUnicode_AsUTF32String( 0511 PyObject *unicode /* Unicode object */ 0512 ); 0513 0514 /* Returns a Python string object holding the UTF-32 encoded value of 0515 the Unicode data. 0516 0517 If byteorder is not 0, output is written according to the following 0518 byte order: 0519 0520 byteorder == -1: little endian 0521 byteorder == 0: native byte order (writes a BOM mark) 0522 byteorder == 1: big endian 0523 0524 If byteorder is 0, the output string will always start with the 0525 Unicode BOM mark (U+FEFF). In the other two modes, no BOM mark is 0526 prepended. 0527 0528 */ 0529 0530 /* --- UTF-16 Codecs ------------------------------------------------------ */ 0531 0532 /* Decodes length bytes from a UTF-16 encoded buffer string and returns 0533 the corresponding Unicode object. 0534 0535 errors (if non-NULL) defines the error handling. It defaults 0536 to "strict". 0537 0538 If byteorder is non-NULL, the decoder starts decoding using the 0539 given byte order: 0540 0541 *byteorder == -1: little endian 0542 *byteorder == 0: native order 0543 *byteorder == 1: big endian 0544 0545 In native mode, the first two bytes of the stream are checked for a 0546 BOM mark. If found, the BOM mark is analysed, the byte order 0547 adjusted and the BOM skipped. In the other modes, no BOM mark 0548 interpretation is done. After completion, *byteorder is set to the 0549 current byte order at the end of input data. 0550 0551 If byteorder is NULL, the codec starts in native order mode. 
0552 0553 */ 0554 0555 PyAPI_FUNC(PyObject*) PyUnicode_DecodeUTF16( 0556 const char *string, /* UTF-16 encoded string */ 0557 Py_ssize_t length, /* size of string */ 0558 const char *errors, /* error handling */ 0559 int *byteorder /* pointer to byteorder to use 0560 0=native;-1=LE,1=BE; updated on 0561 exit */ 0562 ); 0563 0564 PyAPI_FUNC(PyObject*) PyUnicode_DecodeUTF16Stateful( 0565 const char *string, /* UTF-16 encoded string */ 0566 Py_ssize_t length, /* size of string */ 0567 const char *errors, /* error handling */ 0568 int *byteorder, /* pointer to byteorder to use 0569 0=native;-1=LE,1=BE; updated on 0570 exit */ 0571 Py_ssize_t *consumed /* bytes consumed */ 0572 ); 0573 0574 /* Returns a Python string using the UTF-16 encoding in native byte 0575 order. The string always starts with a BOM mark. */ 0576 0577 PyAPI_FUNC(PyObject*) PyUnicode_AsUTF16String( 0578 PyObject *unicode /* Unicode object */ 0579 ); 0580 0581 /* --- Unicode-Escape Codecs ---------------------------------------------- */ 0582 0583 PyAPI_FUNC(PyObject*) PyUnicode_DecodeUnicodeEscape( 0584 const char *string, /* Unicode-Escape encoded string */ 0585 Py_ssize_t length, /* size of string */ 0586 const char *errors /* error handling */ 0587 ); 0588 0589 PyAPI_FUNC(PyObject*) PyUnicode_AsUnicodeEscapeString( 0590 PyObject *unicode /* Unicode object */ 0591 ); 0592 0593 /* --- Raw-Unicode-Escape Codecs ------------------------------------------ */ 0594 0595 PyAPI_FUNC(PyObject*) PyUnicode_DecodeRawUnicodeEscape( 0596 const char *string, /* Raw-Unicode-Escape encoded string */ 0597 Py_ssize_t length, /* size of string */ 0598 const char *errors /* error handling */ 0599 ); 0600 0601 PyAPI_FUNC(PyObject*) PyUnicode_AsRawUnicodeEscapeString( 0602 PyObject *unicode /* Unicode object */ 0603 ); 0604 0605 /* --- Latin-1 Codecs ----------------------------------------------------- 0606 0607 Note: Latin-1 corresponds to the first 256 Unicode ordinals. 
*/ 0608 0609 PyAPI_FUNC(PyObject*) PyUnicode_DecodeLatin1( 0610 const char *string, /* Latin-1 encoded string */ 0611 Py_ssize_t length, /* size of string */ 0612 const char *errors /* error handling */ 0613 ); 0614 0615 PyAPI_FUNC(PyObject*) PyUnicode_AsLatin1String( 0616 PyObject *unicode /* Unicode object */ 0617 ); 0618 0619 /* --- ASCII Codecs ------------------------------------------------------- 0620 0621 Only 7-bit ASCII data is expected. All other codes generate errors. 0622 0623 */ 0624 0625 PyAPI_FUNC(PyObject*) PyUnicode_DecodeASCII( 0626 const char *string, /* ASCII encoded string */ 0627 Py_ssize_t length, /* size of string */ 0628 const char *errors /* error handling */ 0629 ); 0630 0631 PyAPI_FUNC(PyObject*) PyUnicode_AsASCIIString( 0632 PyObject *unicode /* Unicode object */ 0633 ); 0634 0635 /* --- Character Map Codecs ----------------------------------------------- 0636 0637 This codec uses mappings to encode and decode characters. 0638 0639 Decoding mappings must map byte ordinals (integers in the range from 0 to 0640 255) to Unicode strings, integers (which are then interpreted as Unicode 0641 ordinals) or None. Unmapped data bytes (ones which cause a LookupError) 0642 as well as mapped to None, 0xFFFE or '\ufffe' are treated as "undefined 0643 mapping" and cause an error. 0644 0645 Encoding mappings must map Unicode ordinal integers to bytes objects, 0646 integers in the range from 0 to 255 or None. Unmapped character 0647 ordinals (ones which cause a LookupError) as well as mapped to 0648 None are treated as "undefined mapping" and cause an error. 
0649 0650 */ 0651 0652 PyAPI_FUNC(PyObject*) PyUnicode_DecodeCharmap( 0653 const char *string, /* Encoded string */ 0654 Py_ssize_t length, /* size of string */ 0655 PyObject *mapping, /* decoding mapping */ 0656 const char *errors /* error handling */ 0657 ); 0658 0659 PyAPI_FUNC(PyObject*) PyUnicode_AsCharmapString( 0660 PyObject *unicode, /* Unicode object */ 0661 PyObject *mapping /* encoding mapping */ 0662 ); 0663 0664 /* --- MBCS codecs for Windows -------------------------------------------- */ 0665 0666 #ifdef MS_WINDOWS 0667 PyAPI_FUNC(PyObject*) PyUnicode_DecodeMBCS( 0668 const char *string, /* MBCS encoded string */ 0669 Py_ssize_t length, /* size of string */ 0670 const char *errors /* error handling */ 0671 ); 0672 0673 PyAPI_FUNC(PyObject*) PyUnicode_DecodeMBCSStateful( 0674 const char *string, /* MBCS encoded string */ 0675 Py_ssize_t length, /* size of string */ 0676 const char *errors, /* error handling */ 0677 Py_ssize_t *consumed /* bytes consumed */ 0678 ); 0679 0680 #if !defined(Py_LIMITED_API) || Py_LIMITED_API+0 >= 0x03030000 0681 PyAPI_FUNC(PyObject*) PyUnicode_DecodeCodePageStateful( 0682 int code_page, /* code page number */ 0683 const char *string, /* encoded string */ 0684 Py_ssize_t length, /* size of string */ 0685 const char *errors, /* error handling */ 0686 Py_ssize_t *consumed /* bytes consumed */ 0687 ); 0688 #endif 0689 0690 PyAPI_FUNC(PyObject*) PyUnicode_AsMBCSString( 0691 PyObject *unicode /* Unicode object */ 0692 ); 0693 0694 #if !defined(Py_LIMITED_API) || Py_LIMITED_API+0 >= 0x03030000 0695 PyAPI_FUNC(PyObject*) PyUnicode_EncodeCodePage( 0696 int code_page, /* code page number */ 0697 PyObject *unicode, /* Unicode object */ 0698 const char *errors /* error handling */ 0699 ); 0700 #endif 0701 0702 #endif /* MS_WINDOWS */ 0703 0704 /* --- Locale encoding --------------------------------------------------- */ 0705 0706 #if !defined(Py_LIMITED_API) || Py_LIMITED_API+0 >= 0x03030000 0707 /* Decode a string from the current 
locale encoding. The decoder is strict if 0708 *surrogateescape* is equal to zero, otherwise it uses the 'surrogateescape' 0709 error handler (PEP 383) to escape undecodable bytes. If a byte sequence can 0710 be decoded as a surrogate character and *surrogateescape* is not equal to 0711 zero, the byte sequence is escaped using the 'surrogateescape' error handler 0712 instead of being decoded. *str* must end with a null character but cannot 0713 contain embedded null characters. */ 0714 0715 PyAPI_FUNC(PyObject*) PyUnicode_DecodeLocaleAndSize( 0716 const char *str, 0717 Py_ssize_t len, 0718 const char *errors); 0719 0720 /* Similar to PyUnicode_DecodeLocaleAndSize(), but compute the string 0721 length using strlen(). */ 0722 0723 PyAPI_FUNC(PyObject*) PyUnicode_DecodeLocale( 0724 const char *str, 0725 const char *errors); 0726 0727 /* Encode a Unicode object to the current locale encoding. The encoder is 0728 strict if *surrogateescape* is equal to zero, otherwise the 0729 "surrogateescape" error handler is used. Return a bytes object. The string 0730 cannot contain embedded null characters. */ 0731 0732 PyAPI_FUNC(PyObject*) PyUnicode_EncodeLocale( 0733 PyObject *unicode, 0734 const char *errors 0735 ); 0736 #endif 0737 0738 /* --- File system encoding ---------------------------------------------- */ 0739 0740 /* ParseTuple converter: encode str objects to bytes using 0741 PyUnicode_EncodeFSDefault(); bytes objects are output as-is. */ 0742 0743 PyAPI_FUNC(int) PyUnicode_FSConverter(PyObject*, void*); 0744 0745 /* ParseTuple converter: decode bytes objects to unicode using 0746 PyUnicode_DecodeFSDefaultAndSize(); str objects are output as-is. */ 0747 0748 PyAPI_FUNC(int) PyUnicode_FSDecoder(PyObject*, void*); 0749 0750 /* Decode a null-terminated string from the Python filesystem encoding 0751 and error handler. 0752 0753 If the string length is known, use PyUnicode_DecodeFSDefaultAndSize(). 
*/ 0754 PyAPI_FUNC(PyObject*) PyUnicode_DecodeFSDefault( 0755 const char *s /* encoded string */ 0756 ); 0757 0758 /* Decode a string from the Python filesystem encoding and error handler. */ 0759 PyAPI_FUNC(PyObject*) PyUnicode_DecodeFSDefaultAndSize( 0760 const char *s, /* encoded string */ 0761 Py_ssize_t size /* size */ 0762 ); 0763 0764 /* Encode a Unicode object to the Python filesystem encoding and error handler. 0765 Return bytes. */ 0766 PyAPI_FUNC(PyObject*) PyUnicode_EncodeFSDefault( 0767 PyObject *unicode 0768 ); 0769 0770 /* --- Methods & Slots ---------------------------------------------------- 0771 0772 These are capable of handling Unicode objects and strings on input 0773 (we refer to them as strings in the descriptions) and return 0774 Unicode objects or integers as appropriate. */ 0775 0776 /* Concat two strings giving a new Unicode string. */ 0777 0778 PyAPI_FUNC(PyObject*) PyUnicode_Concat( 0779 PyObject *left, /* Left string */ 0780 PyObject *right /* Right string */ 0781 ); 0782 0783 /* Concat two strings and put the result in *pleft 0784 (sets *pleft to NULL on error) */ 0785 0786 PyAPI_FUNC(void) PyUnicode_Append( 0787 PyObject **pleft, /* Pointer to left string */ 0788 PyObject *right /* Right string */ 0789 ); 0790 0791 /* Concat two strings, put the result in *pleft and drop the right object 0792 (sets *pleft to NULL on error) */ 0793 0794 PyAPI_FUNC(void) PyUnicode_AppendAndDel( 0795 PyObject **pleft, /* Pointer to left string */ 0796 PyObject *right /* Right string */ 0797 ); 0798 0799 /* Split a string giving a list of Unicode strings. 0800 0801 If sep is NULL, splitting will be done at all whitespace 0802 substrings. Otherwise, splits occur at the given separator. 0803 0804 At most maxsplit splits will be done. If negative, no limit is set. 0805 0806 Separators are not included in the resulting list. 
0807 0808 */ 0809 0810 PyAPI_FUNC(PyObject*) PyUnicode_Split( 0811 PyObject *s, /* String to split */ 0812 PyObject *sep, /* String separator */ 0813 Py_ssize_t maxsplit /* Maxsplit count */ 0814 ); 0815 0816 /* Ditto, but split at line breaks. 0817 0818 CRLF is considered to be one line break. Line breaks are not 0819 included in the resulting list unless keepends is true. */ 0820 0821 PyAPI_FUNC(PyObject*) PyUnicode_Splitlines( 0822 PyObject *s, /* String to split */ 0823 int keepends /* If true, line end markers are included */ 0824 ); 0825 0826 /* Partition a string using a given separator. */ 0827 0828 PyAPI_FUNC(PyObject*) PyUnicode_Partition( 0829 PyObject *s, /* String to partition */ 0830 PyObject *sep /* String separator */ 0831 ); 0832 0833 /* Partition a string using a given separator, searching from the end of the 0834 string. */ 0835 0836 PyAPI_FUNC(PyObject*) PyUnicode_RPartition( 0837 PyObject *s, /* String to partition */ 0838 PyObject *sep /* String separator */ 0839 ); 0840 0841 /* Split a string giving a list of Unicode strings. 0842 0843 If sep is NULL, splitting will be done at all whitespace 0844 substrings. Otherwise, splits occur at the given separator. 0845 0846 At most maxsplit splits will be done; if maxsplit is negative, no limit 0847 is set. Unlike PyUnicode_Split, PyUnicode_RSplit splits from the end 0848 of the string. 0849 0850 Separators are not included in the resulting list. 0851 0852 */ 0853 0854 PyAPI_FUNC(PyObject*) PyUnicode_RSplit( 0855 PyObject *s, /* String to split */ 0856 PyObject *sep, /* String separator */ 0857 Py_ssize_t maxsplit /* Maxsplit count */ 0858 ); 0859 0860 /* Translate a string by applying a character mapping table to it and 0861 return the resulting Unicode object. 0862 0863 The mapping table must map Unicode ordinal integers to Unicode strings, 0864 Unicode ordinal integers or None (causing deletion of the character). 0865 0866 Mapping tables may be dictionaries or sequences. 
Unmapped character 0867 ordinals (ones which cause a LookupError) are left untouched and 0868 are copied as-is. 0869 0870 */ 0871 0872 PyAPI_FUNC(PyObject *) PyUnicode_Translate( 0873 PyObject *str, /* String */ 0874 PyObject *table, /* Translate table */ 0875 const char *errors /* error handling */ 0876 ); 0877 0878 /* Join a sequence of strings using the given separator and return 0879 the resulting Unicode string. */ 0880 0881 PyAPI_FUNC(PyObject*) PyUnicode_Join( 0882 PyObject *separator, /* Separator string */ 0883 PyObject *seq /* Sequence object */ 0884 ); 0885 0886 /* Return 1 if substr matches str[start:end] at the given tail end, 0 0887 otherwise. */ 0888 0889 PyAPI_FUNC(Py_ssize_t) PyUnicode_Tailmatch( 0890 PyObject *str, /* String */ 0891 PyObject *substr, /* Prefix or Suffix string */ 0892 Py_ssize_t start, /* Start index */ 0893 Py_ssize_t end, /* Stop index */ 0894 int direction /* Tail end: -1 prefix, +1 suffix */ 0895 ); 0896 0897 /* Return the first position of substr in str[start:end] using the 0898 given search direction or -1 if not found. -2 is returned in case 0899 an error occurred and an exception is set. */ 0900 0901 PyAPI_FUNC(Py_ssize_t) PyUnicode_Find( 0902 PyObject *str, /* String */ 0903 PyObject *substr, /* Substring to find */ 0904 Py_ssize_t start, /* Start index */ 0905 Py_ssize_t end, /* Stop index */ 0906 int direction /* Find direction: +1 forward, -1 backward */ 0907 ); 0908 0909 #if !defined(Py_LIMITED_API) || Py_LIMITED_API+0 >= 0x03030000 0910 /* Like PyUnicode_Find, but search for single character only. */ 0911 PyAPI_FUNC(Py_ssize_t) PyUnicode_FindChar( 0912 PyObject *str, 0913 Py_UCS4 ch, 0914 Py_ssize_t start, 0915 Py_ssize_t end, 0916 int direction 0917 ); 0918 #endif 0919 0920 /* Count the number of occurrences of substr in str[start:end]. 
*/ 0921 0922 PyAPI_FUNC(Py_ssize_t) PyUnicode_Count( 0923 PyObject *str, /* String */ 0924 PyObject *substr, /* Substring to count */ 0925 Py_ssize_t start, /* Start index */ 0926 Py_ssize_t end /* Stop index */ 0927 ); 0928 0929 /* Replace at most maxcount occurrences of substr in str with replstr 0930 and return the resulting Unicode object. */ 0931 0932 PyAPI_FUNC(PyObject *) PyUnicode_Replace( 0933 PyObject *str, /* String */ 0934 PyObject *substr, /* Substring to find */ 0935 PyObject *replstr, /* Replacement string */ 0936 Py_ssize_t maxcount /* Max. number of replacements to apply; 0937 -1 = all */ 0938 ); 0939 0940 /* Compare two strings and return -1, 0, 1 for less than, equal, 0941 greater than, respectively. 0942 Raise an exception and return -1 on error. NOTE(review): -1 is also the "less than" result, so callers presumably have to check for a set exception to tell the two apart - confirm against the C API docs. */ 0943 0944 PyAPI_FUNC(int) PyUnicode_Compare( 0945 PyObject *left, /* Left string */ 0946 PyObject *right /* Right string */ 0947 ); 0948 0949 /* Compare a Unicode object with a C string and return -1, 0, 1 for less than, 0950 equal, and greater than, respectively. It is best to pass only 0951 ASCII-encoded strings, but the function interprets the input string as 0952 ISO-8859-1 if it contains non-ASCII characters. 0953 This function does not raise exceptions. */ 0954 0955 PyAPI_FUNC(int) PyUnicode_CompareWithASCIIString( 0956 PyObject *left, /* Unicode object */ 0957 const char *right /* ASCII-encoded string */ 0958 ); 0959 0960 #if !defined(Py_LIMITED_API) || Py_LIMITED_API+0 >= 0x030D0000 0961 /* Compare a Unicode object with UTF-8 encoded C string. 0962 Return 1 if they are equal, or 0 otherwise. 0963 This function does not raise exceptions.
*/ 0964 0965 PyAPI_FUNC(int) PyUnicode_EqualToUTF8(PyObject *, const char *); 0966 PyAPI_FUNC(int) PyUnicode_EqualToUTF8AndSize(PyObject *, const char *, Py_ssize_t); 0967 #endif 0968 0969 /* Rich compare two strings and return one of the following: 0970 0971 - NULL in case an exception was raised 0972 - Py_True or Py_False for successful comparisons 0973 - Py_NotImplemented in case the type combination is unknown 0974 0975 Possible values for op: 0976 0977 Py_GT, Py_GE, Py_EQ, Py_NE, Py_LT, Py_LE 0978 0979 */ 0980 0981 PyAPI_FUNC(PyObject *) PyUnicode_RichCompare( 0982 PyObject *left, /* Left string */ 0983 PyObject *right, /* Right string */ 0984 int op /* Operation: Py_EQ, Py_NE, Py_GT, etc. */ 0985 ); 0986 0987 /* Apply an argument tuple or dictionary to a format string and return 0988 the resulting Unicode string. */ 0989 0990 PyAPI_FUNC(PyObject *) PyUnicode_Format( 0991 PyObject *format, /* Format string */ 0992 PyObject *args /* Argument tuple or dictionary */ 0993 ); 0994 0995 /* Check whether element is contained in container and return 1/0 0996 accordingly. 0997 0998 element has to coerce to a one element Unicode string. -1 is 0999 returned in case of an error. */ 1000 1001 PyAPI_FUNC(int) PyUnicode_Contains( 1002 PyObject *container, /* Container string */ 1003 PyObject *element /* Element string */ 1004 ); 1005 1006 /* Check whether the argument s is a valid identifier. */ 1007 1008 PyAPI_FUNC(int) PyUnicode_IsIdentifier(PyObject *s); 1009 1010 /* === Characters Type APIs =============================================== */ 1011 /* cpython/unicodeobject.h is included only when Py_LIMITED_API is not defined; Py_CPYTHON_UNICODEOBJECT_H is defined just for the duration of that include and undefined again right after it. */ 1012 #ifndef Py_LIMITED_API 1013 # define Py_CPYTHON_UNICODEOBJECT_H 1014 # include "cpython/unicodeobject.h" 1015 # undef Py_CPYTHON_UNICODEOBJECT_H 1016 #endif 1017 1018 #ifdef __cplusplus 1019 } 1020 #endif 1021 #endif /* !Py_UNICODEOBJECT_H */
| [ Source navigation ] | [ Diff markup ] | [ Identifier search ] | [ general search ] |
|
This page was automatically generated by the 2.3.7 LXR engine. The LXR team |
|