/* File indexing completed on 2025-01-18 10:06:51 */
0001 #ifndef Py_UNICODEOBJECT_H 0002 #define Py_UNICODEOBJECT_H 0003 0004 #include <stdarg.h> // va_list 0005 0006 /* 0007 0008 Unicode implementation based on original code by Fredrik Lundh, 0009 modified by Marc-Andre Lemburg (mal@lemburg.com) according to the 0010 Unicode Integration Proposal. (See 0011 http://www.egenix.com/files/python/unicode-proposal.txt). 0012 0013 Copyright (c) Corporation for National Research Initiatives. 0014 0015 0016 Original header: 0017 -------------------------------------------------------------------- 0018 0019 * Yet another Unicode string type for Python. This type supports the 0020 * 16-bit Basic Multilingual Plane (BMP) only. 0021 * 0022 * Written by Fredrik Lundh, January 1999. 0023 * 0024 * Copyright (c) 1999 by Secret Labs AB. 0025 * Copyright (c) 1999 by Fredrik Lundh. 0026 * 0027 * fredrik@pythonware.com 0028 * http://www.pythonware.com 0029 * 0030 * -------------------------------------------------------------------- 0031 * This Unicode String Type is 0032 * 0033 * Copyright (c) 1999 by Secret Labs AB 0034 * Copyright (c) 1999 by Fredrik Lundh 0035 * 0036 * By obtaining, using, and/or copying this software and/or its 0037 * associated documentation, you agree that you have read, understood, 0038 * and will comply with the following terms and conditions: 0039 * 0040 * Permission to use, copy, modify, and distribute this software and its 0041 * associated documentation for any purpose and without fee is hereby 0042 * granted, provided that the above copyright notice appears in all 0043 * copies, and that both that copyright notice and this permission notice 0044 * appear in supporting documentation, and that the name of Secret Labs 0045 * AB or the author not be used in advertising or publicity pertaining to 0046 * distribution of the software without specific, written prior 0047 * permission. 
0048 * 0049 * SECRET LABS AB AND THE AUTHOR DISCLAIMS ALL WARRANTIES WITH REGARD TO 0050 * THIS SOFTWARE, INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY AND 0051 * FITNESS. IN NO EVENT SHALL SECRET LABS AB OR THE AUTHOR BE LIABLE FOR 0052 * ANY SPECIAL, INDIRECT OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES 0053 * WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN 0054 * ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT 0055 * OF OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE. 0056 * -------------------------------------------------------------------- */ 0057 0058 #include <ctype.h> 0059 0060 /* === Internal API ======================================================= */ 0061 0062 /* --- Internal Unicode Format -------------------------------------------- */ 0063 0064 /* Python 3.x requires unicode */ 0065 #define Py_USING_UNICODE 0066 0067 #ifndef SIZEOF_WCHAR_T 0068 #error Must define SIZEOF_WCHAR_T 0069 #endif 0070 0071 #define Py_UNICODE_SIZE SIZEOF_WCHAR_T 0072 0073 /* If wchar_t can be used for UCS-4 storage, set Py_UNICODE_WIDE. 0074 Otherwise, Unicode strings are stored as UCS-2 (with limited support 0075 for UTF-16) */ 0076 0077 #if Py_UNICODE_SIZE >= 4 0078 #define Py_UNICODE_WIDE 0079 #endif 0080 0081 /* Set these flags if the platform has "wchar.h" and the 0082 wchar_t type is a 16-bit unsigned type */ 0083 /* #define HAVE_WCHAR_H */ 0084 /* #define HAVE_USABLE_WCHAR_T */ 0085 0086 /* If the compiler provides a wchar_t type we try to support it 0087 through the interface functions PyUnicode_FromWideChar(), 0088 PyUnicode_AsWideChar() and PyUnicode_AsWideCharString(). */ 0089 0090 #ifdef HAVE_USABLE_WCHAR_T 0091 # ifndef HAVE_WCHAR_H 0092 # define HAVE_WCHAR_H 0093 # endif 0094 #endif 0095 0096 #ifdef HAVE_WCHAR_H 0097 # include <wchar.h> 0098 #endif 0099 0100 /* Py_UCS4 and Py_UCS2 are typedefs for the respective 0101 unicode representations. 
*/ 0102 typedef uint32_t Py_UCS4; 0103 typedef uint16_t Py_UCS2; 0104 typedef uint8_t Py_UCS1; 0105 0106 #ifdef __cplusplus 0107 extern "C" { 0108 #endif 0109 0110 0111 PyAPI_DATA(PyTypeObject) PyUnicode_Type; 0112 PyAPI_DATA(PyTypeObject) PyUnicodeIter_Type; 0113 0114 #define PyUnicode_Check(op) \ 0115 PyType_FastSubclass(Py_TYPE(op), Py_TPFLAGS_UNICODE_SUBCLASS) 0116 #define PyUnicode_CheckExact(op) Py_IS_TYPE((op), &PyUnicode_Type) 0117 0118 /* --- Constants ---------------------------------------------------------- */ 0119 0120 /* This Unicode character will be used as replacement character during 0121 decoding if the errors argument is set to "replace". Note: the 0122 Unicode character U+FFFD is the official REPLACEMENT CHARACTER in 0123 Unicode 3.0. */ 0124 0125 #define Py_UNICODE_REPLACEMENT_CHARACTER ((Py_UCS4) 0xFFFD) 0126 0127 /* === Public API ========================================================= */ 0128 0129 /* Similar to PyUnicode_FromUnicode(), but u points to UTF-8 encoded bytes */ 0130 PyAPI_FUNC(PyObject*) PyUnicode_FromStringAndSize( 0131 const char *u, /* UTF-8 encoded string */ 0132 Py_ssize_t size /* size of buffer */ 0133 ); 0134 0135 /* Similar to PyUnicode_FromUnicode(), but u points to null-terminated 0136 UTF-8 encoded bytes. The size is determined with strlen(). */ 0137 PyAPI_FUNC(PyObject*) PyUnicode_FromString( 0138 const char *u /* UTF-8 encoded string */ 0139 ); 0140 0141 #if !defined(Py_LIMITED_API) || Py_LIMITED_API+0 >= 0x03030000 0142 PyAPI_FUNC(PyObject*) PyUnicode_Substring( 0143 PyObject *str, 0144 Py_ssize_t start, 0145 Py_ssize_t end); 0146 #endif 0147 0148 #if !defined(Py_LIMITED_API) || Py_LIMITED_API+0 >= 0x03030000 0149 /* Copy the string into a UCS4 buffer including the null character if copy_null 0150 is set. Return NULL and raise an exception on error. Raise a SystemError if 0151 the buffer is smaller than the string. Return buffer on success. 0152 0153 buflen is the length of the buffer in (Py_UCS4) characters. 
*/ 0154 PyAPI_FUNC(Py_UCS4*) PyUnicode_AsUCS4( 0155 PyObject *unicode, 0156 Py_UCS4* buffer, 0157 Py_ssize_t buflen, 0158 int copy_null); 0159 0160 /* Copy the string into a UCS4 buffer. A new buffer is allocated using 0161 * PyMem_Malloc; if this fails, NULL is returned with a memory error 0162 exception set. */ 0163 PyAPI_FUNC(Py_UCS4*) PyUnicode_AsUCS4Copy(PyObject *unicode); 0164 #endif 0165 0166 #if !defined(Py_LIMITED_API) || Py_LIMITED_API+0 >= 0x03030000 0167 /* Get the length of the Unicode object. */ 0168 0169 PyAPI_FUNC(Py_ssize_t) PyUnicode_GetLength( 0170 PyObject *unicode 0171 ); 0172 #endif 0173 0174 #if !defined(Py_LIMITED_API) || Py_LIMITED_API+0 >= 0x03030000 0175 /* Read a character from the string. */ 0176 0177 PyAPI_FUNC(Py_UCS4) PyUnicode_ReadChar( 0178 PyObject *unicode, 0179 Py_ssize_t index 0180 ); 0181 0182 /* Write a character to the string. The string must have been created through 0183 PyUnicode_New, must not be shared, and must not have been hashed yet. 0184 0185 Return 0 on success, -1 on error. */ 0186 0187 PyAPI_FUNC(int) PyUnicode_WriteChar( 0188 PyObject *unicode, 0189 Py_ssize_t index, 0190 Py_UCS4 character 0191 ); 0192 #endif 0193 0194 /* Resize a Unicode object. The length is the number of codepoints. 0195 0196 *unicode is modified to point to the new (resized) object and 0 0197 returned on success. 0198 0199 Try to resize the string in place (which is usually faster than allocating 0200 a new string and copy characters), or create a new string. 0201 0202 Error handling is implemented as follows: an exception is set, -1 0203 is returned and *unicode left untouched. 0204 0205 WARNING: The function doesn't check string content, the result may not be a 0206 string in canonical representation. */ 0207 0208 PyAPI_FUNC(int) PyUnicode_Resize( 0209 PyObject **unicode, /* Pointer to the Unicode object */ 0210 Py_ssize_t length /* New length */ 0211 ); 0212 0213 /* Decode obj to a Unicode object. 
0214 0215 bytes, bytearray and other bytes-like objects are decoded according to the 0216 given encoding and error handler. The encoding and error handler can be 0217 NULL to have the interface use UTF-8 and "strict". 0218 0219 All other objects (including Unicode objects) raise an exception. 0220 0221 The API returns NULL in case of an error. The caller is responsible 0222 for decref'ing the returned objects. 0223 0224 */ 0225 0226 PyAPI_FUNC(PyObject*) PyUnicode_FromEncodedObject( 0227 PyObject *obj, /* Object */ 0228 const char *encoding, /* encoding */ 0229 const char *errors /* error handling */ 0230 ); 0231 0232 /* Copy an instance of a Unicode subtype to a new true Unicode object if 0233 necessary. If obj is already a true Unicode object (not a subtype), return 0234 the reference with *incremented* refcount. 0235 0236 The API returns NULL in case of an error. The caller is responsible 0237 for decref'ing the returned objects. 0238 0239 */ 0240 0241 PyAPI_FUNC(PyObject*) PyUnicode_FromObject( 0242 PyObject *obj /* Object */ 0243 ); 0244 0245 PyAPI_FUNC(PyObject *) PyUnicode_FromFormatV( 0246 const char *format, /* ASCII-encoded string */ 0247 va_list vargs 0248 ); 0249 PyAPI_FUNC(PyObject *) PyUnicode_FromFormat( 0250 const char *format, /* ASCII-encoded string */ 0251 ... 0252 ); 0253 0254 PyAPI_FUNC(void) PyUnicode_InternInPlace(PyObject **); 0255 PyAPI_FUNC(PyObject *) PyUnicode_InternFromString( 0256 const char *u /* UTF-8 encoded string */ 0257 ); 0258 0259 /* --- wchar_t support for platforms which support it --------------------- */ 0260 0261 #ifdef HAVE_WCHAR_H 0262 0263 /* Create a Unicode Object from the wchar_t buffer w of the given 0264 size. 0265 0266 The buffer is copied into the new object. */ 0267 0268 PyAPI_FUNC(PyObject*) PyUnicode_FromWideChar( 0269 const wchar_t *w, /* wchar_t buffer */ 0270 Py_ssize_t size /* size of buffer */ 0271 ); 0272 0273 /* Copies the Unicode Object contents into the wchar_t buffer w. 
At 0274 most size wchar_t characters are copied. 0275 0276 Note that the resulting wchar_t string may or may not be 0277 0-terminated. It is the responsibility of the caller to make sure 0278 that the wchar_t string is 0-terminated in case this is required by 0279 the application. 0280 0281 Returns the number of wchar_t characters copied (excluding a 0282 possibly trailing 0-termination character) or -1 in case of an 0283 error. */ 0284 0285 PyAPI_FUNC(Py_ssize_t) PyUnicode_AsWideChar( 0286 PyObject *unicode, /* Unicode object */ 0287 wchar_t *w, /* wchar_t buffer */ 0288 Py_ssize_t size /* size of buffer */ 0289 ); 0290 0291 /* Convert the Unicode object to a wide character string. The output string 0292 always ends with a nul character. If size is not NULL, write the number of 0293 wide characters (excluding the null character) into *size. 0294 0295 Returns a buffer allocated by PyMem_Malloc() (use PyMem_Free() to free it) 0296 on success. On error, returns NULL, *size is undefined and raises a 0297 MemoryError. */ 0298 0299 PyAPI_FUNC(wchar_t*) PyUnicode_AsWideCharString( 0300 PyObject *unicode, /* Unicode object */ 0301 Py_ssize_t *size /* number of characters of the result */ 0302 ); 0303 0304 #endif 0305 0306 /* --- Unicode ordinals --------------------------------------------------- */ 0307 0308 /* Create a Unicode Object from the given Unicode code point ordinal. 0309 0310 The ordinal must be in range(0x110000). A ValueError is 0311 raised in case it is not. 0312 0313 */ 0314 0315 PyAPI_FUNC(PyObject*) PyUnicode_FromOrdinal(int ordinal); 0316 0317 /* === Builtin Codecs ===================================================== 0318 0319 Many of these APIs take two arguments encoding and errors. These 0320 parameters encoding and errors have the same semantics as the ones 0321 of the builtin str() API. 0322 0323 Setting encoding to NULL causes the default encoding (UTF-8) to be used. 
0324 0325 Error handling is set by errors which may also be set to NULL 0326 meaning to use the default handling defined for the codec. Default 0327 error handling for all builtin codecs is "strict" (ValueErrors are 0328 raised). 0329 0330 The codecs all use a similar interface. Only deviation from the 0331 generic ones are documented. 0332 0333 */ 0334 0335 /* --- Manage the default encoding ---------------------------------------- */ 0336 0337 /* Returns "utf-8". */ 0338 PyAPI_FUNC(const char*) PyUnicode_GetDefaultEncoding(void); 0339 0340 /* --- Generic Codecs ----------------------------------------------------- */ 0341 0342 /* Create a Unicode object by decoding the encoded string s of the 0343 given size. */ 0344 0345 PyAPI_FUNC(PyObject*) PyUnicode_Decode( 0346 const char *s, /* encoded string */ 0347 Py_ssize_t size, /* size of buffer */ 0348 const char *encoding, /* encoding */ 0349 const char *errors /* error handling */ 0350 ); 0351 0352 /* Decode a Unicode object unicode and return the result as Python 0353 object. 0354 0355 This API is DEPRECATED. The only supported standard encoding is rot13. 0356 Use PyCodec_Decode() to decode with rot13 and non-standard codecs 0357 that decode from str. */ 0358 0359 Py_DEPRECATED(3.6) PyAPI_FUNC(PyObject*) PyUnicode_AsDecodedObject( 0360 PyObject *unicode, /* Unicode object */ 0361 const char *encoding, /* encoding */ 0362 const char *errors /* error handling */ 0363 ); 0364 0365 /* Decode a Unicode object unicode and return the result as Unicode 0366 object. 0367 0368 This API is DEPRECATED. The only supported standard encoding is rot13. 0369 Use PyCodec_Decode() to decode with rot13 and non-standard codecs 0370 that decode from str to str. 
*/ 0371 0372 Py_DEPRECATED(3.6) PyAPI_FUNC(PyObject*) PyUnicode_AsDecodedUnicode( 0373 PyObject *unicode, /* Unicode object */ 0374 const char *encoding, /* encoding */ 0375 const char *errors /* error handling */ 0376 ); 0377 0378 /* Encodes a Unicode object and returns the result as Python 0379 object. 0380 0381 This API is DEPRECATED. It is superseded by PyUnicode_AsEncodedString() 0382 since all standard encodings (except rot13) encode str to bytes. 0383 Use PyCodec_Encode() for encoding with rot13 and non-standard codecs 0384 that encode form str to non-bytes. */ 0385 0386 Py_DEPRECATED(3.6) PyAPI_FUNC(PyObject*) PyUnicode_AsEncodedObject( 0387 PyObject *unicode, /* Unicode object */ 0388 const char *encoding, /* encoding */ 0389 const char *errors /* error handling */ 0390 ); 0391 0392 /* Encodes a Unicode object and returns the result as Python string 0393 object. */ 0394 0395 PyAPI_FUNC(PyObject*) PyUnicode_AsEncodedString( 0396 PyObject *unicode, /* Unicode object */ 0397 const char *encoding, /* encoding */ 0398 const char *errors /* error handling */ 0399 ); 0400 0401 /* Encodes a Unicode object and returns the result as Unicode 0402 object. 0403 0404 This API is DEPRECATED. The only supported standard encodings is rot13. 0405 Use PyCodec_Encode() to encode with rot13 and non-standard codecs 0406 that encode from str to str. */ 0407 0408 Py_DEPRECATED(3.6) PyAPI_FUNC(PyObject*) PyUnicode_AsEncodedUnicode( 0409 PyObject *unicode, /* Unicode object */ 0410 const char *encoding, /* encoding */ 0411 const char *errors /* error handling */ 0412 ); 0413 0414 /* Build an encoding map. 
*/ 0415 0416 PyAPI_FUNC(PyObject*) PyUnicode_BuildEncodingMap( 0417 PyObject* string /* 256 character map */ 0418 ); 0419 0420 /* --- UTF-7 Codecs ------------------------------------------------------- */ 0421 0422 PyAPI_FUNC(PyObject*) PyUnicode_DecodeUTF7( 0423 const char *string, /* UTF-7 encoded string */ 0424 Py_ssize_t length, /* size of string */ 0425 const char *errors /* error handling */ 0426 ); 0427 0428 PyAPI_FUNC(PyObject*) PyUnicode_DecodeUTF7Stateful( 0429 const char *string, /* UTF-7 encoded string */ 0430 Py_ssize_t length, /* size of string */ 0431 const char *errors, /* error handling */ 0432 Py_ssize_t *consumed /* bytes consumed */ 0433 ); 0434 0435 /* --- UTF-8 Codecs ------------------------------------------------------- */ 0436 0437 PyAPI_FUNC(PyObject*) PyUnicode_DecodeUTF8( 0438 const char *string, /* UTF-8 encoded string */ 0439 Py_ssize_t length, /* size of string */ 0440 const char *errors /* error handling */ 0441 ); 0442 0443 PyAPI_FUNC(PyObject*) PyUnicode_DecodeUTF8Stateful( 0444 const char *string, /* UTF-8 encoded string */ 0445 Py_ssize_t length, /* size of string */ 0446 const char *errors, /* error handling */ 0447 Py_ssize_t *consumed /* bytes consumed */ 0448 ); 0449 0450 PyAPI_FUNC(PyObject*) PyUnicode_AsUTF8String( 0451 PyObject *unicode /* Unicode object */ 0452 ); 0453 0454 /* Returns a pointer to the default encoding (UTF-8) of the 0455 Unicode object unicode and the size of the encoded representation 0456 in bytes stored in *size. 0457 0458 In case of an error, no *size is set. 0459 0460 This function caches the UTF-8 encoded string in the unicodeobject 0461 and subsequent calls will return the same string. The memory is released 0462 when the unicodeobject is deallocated. 
0463 */ 0464 0465 #if !defined(Py_LIMITED_API) || Py_LIMITED_API+0 >= 0x030A0000 0466 PyAPI_FUNC(const char *) PyUnicode_AsUTF8AndSize( 0467 PyObject *unicode, 0468 Py_ssize_t *size); 0469 #endif 0470 0471 /* --- UTF-32 Codecs ------------------------------------------------------ */ 0472 0473 /* Decodes length bytes from a UTF-32 encoded buffer string and returns 0474 the corresponding Unicode object. 0475 0476 errors (if non-NULL) defines the error handling. It defaults 0477 to "strict". 0478 0479 If byteorder is non-NULL, the decoder starts decoding using the 0480 given byte order: 0481 0482 *byteorder == -1: little endian 0483 *byteorder == 0: native order 0484 *byteorder == 1: big endian 0485 0486 In native mode, the first four bytes of the stream are checked for a 0487 BOM mark. If found, the BOM mark is analysed, the byte order 0488 adjusted and the BOM skipped. In the other modes, no BOM mark 0489 interpretation is done. After completion, *byteorder is set to the 0490 current byte order at the end of input data. 0491 0492 If byteorder is NULL, the codec starts in native order mode. 0493 0494 */ 0495 0496 PyAPI_FUNC(PyObject*) PyUnicode_DecodeUTF32( 0497 const char *string, /* UTF-32 encoded string */ 0498 Py_ssize_t length, /* size of string */ 0499 const char *errors, /* error handling */ 0500 int *byteorder /* pointer to byteorder to use 0501 0=native;-1=LE,1=BE; updated on 0502 exit */ 0503 ); 0504 0505 PyAPI_FUNC(PyObject*) PyUnicode_DecodeUTF32Stateful( 0506 const char *string, /* UTF-32 encoded string */ 0507 Py_ssize_t length, /* size of string */ 0508 const char *errors, /* error handling */ 0509 int *byteorder, /* pointer to byteorder to use 0510 0=native;-1=LE,1=BE; updated on 0511 exit */ 0512 Py_ssize_t *consumed /* bytes consumed */ 0513 ); 0514 0515 /* Returns a Python string using the UTF-32 encoding in native byte 0516 order. The string always starts with a BOM mark. 
*/ 0517 0518 PyAPI_FUNC(PyObject*) PyUnicode_AsUTF32String( 0519 PyObject *unicode /* Unicode object */ 0520 ); 0521 0522 /* Returns a Python string object holding the UTF-32 encoded value of 0523 the Unicode data. 0524 0525 If byteorder is not 0, output is written according to the following 0526 byte order: 0527 0528 byteorder == -1: little endian 0529 byteorder == 0: native byte order (writes a BOM mark) 0530 byteorder == 1: big endian 0531 0532 If byteorder is 0, the output string will always start with the 0533 Unicode BOM mark (U+FEFF). In the other two modes, no BOM mark is 0534 prepended. 0535 0536 */ 0537 0538 /* --- UTF-16 Codecs ------------------------------------------------------ */ 0539 0540 /* Decodes length bytes from a UTF-16 encoded buffer string and returns 0541 the corresponding Unicode object. 0542 0543 errors (if non-NULL) defines the error handling. It defaults 0544 to "strict". 0545 0546 If byteorder is non-NULL, the decoder starts decoding using the 0547 given byte order: 0548 0549 *byteorder == -1: little endian 0550 *byteorder == 0: native order 0551 *byteorder == 1: big endian 0552 0553 In native mode, the first two bytes of the stream are checked for a 0554 BOM mark. If found, the BOM mark is analysed, the byte order 0555 adjusted and the BOM skipped. In the other modes, no BOM mark 0556 interpretation is done. After completion, *byteorder is set to the 0557 current byte order at the end of input data. 0558 0559 If byteorder is NULL, the codec starts in native order mode. 
0560 0561 */ 0562 0563 PyAPI_FUNC(PyObject*) PyUnicode_DecodeUTF16( 0564 const char *string, /* UTF-16 encoded string */ 0565 Py_ssize_t length, /* size of string */ 0566 const char *errors, /* error handling */ 0567 int *byteorder /* pointer to byteorder to use 0568 0=native;-1=LE,1=BE; updated on 0569 exit */ 0570 ); 0571 0572 PyAPI_FUNC(PyObject*) PyUnicode_DecodeUTF16Stateful( 0573 const char *string, /* UTF-16 encoded string */ 0574 Py_ssize_t length, /* size of string */ 0575 const char *errors, /* error handling */ 0576 int *byteorder, /* pointer to byteorder to use 0577 0=native;-1=LE,1=BE; updated on 0578 exit */ 0579 Py_ssize_t *consumed /* bytes consumed */ 0580 ); 0581 0582 /* Returns a Python string using the UTF-16 encoding in native byte 0583 order. The string always starts with a BOM mark. */ 0584 0585 PyAPI_FUNC(PyObject*) PyUnicode_AsUTF16String( 0586 PyObject *unicode /* Unicode object */ 0587 ); 0588 0589 /* --- Unicode-Escape Codecs ---------------------------------------------- */ 0590 0591 PyAPI_FUNC(PyObject*) PyUnicode_DecodeUnicodeEscape( 0592 const char *string, /* Unicode-Escape encoded string */ 0593 Py_ssize_t length, /* size of string */ 0594 const char *errors /* error handling */ 0595 ); 0596 0597 PyAPI_FUNC(PyObject*) PyUnicode_AsUnicodeEscapeString( 0598 PyObject *unicode /* Unicode object */ 0599 ); 0600 0601 /* --- Raw-Unicode-Escape Codecs ------------------------------------------ */ 0602 0603 PyAPI_FUNC(PyObject*) PyUnicode_DecodeRawUnicodeEscape( 0604 const char *string, /* Raw-Unicode-Escape encoded string */ 0605 Py_ssize_t length, /* size of string */ 0606 const char *errors /* error handling */ 0607 ); 0608 0609 PyAPI_FUNC(PyObject*) PyUnicode_AsRawUnicodeEscapeString( 0610 PyObject *unicode /* Unicode object */ 0611 ); 0612 0613 /* --- Latin-1 Codecs ----------------------------------------------------- 0614 0615 Note: Latin-1 corresponds to the first 256 Unicode ordinals. 
*/ 0616 0617 PyAPI_FUNC(PyObject*) PyUnicode_DecodeLatin1( 0618 const char *string, /* Latin-1 encoded string */ 0619 Py_ssize_t length, /* size of string */ 0620 const char *errors /* error handling */ 0621 ); 0622 0623 PyAPI_FUNC(PyObject*) PyUnicode_AsLatin1String( 0624 PyObject *unicode /* Unicode object */ 0625 ); 0626 0627 /* --- ASCII Codecs ------------------------------------------------------- 0628 0629 Only 7-bit ASCII data is expected. All other codes generate errors. 0630 0631 */ 0632 0633 PyAPI_FUNC(PyObject*) PyUnicode_DecodeASCII( 0634 const char *string, /* ASCII encoded string */ 0635 Py_ssize_t length, /* size of string */ 0636 const char *errors /* error handling */ 0637 ); 0638 0639 PyAPI_FUNC(PyObject*) PyUnicode_AsASCIIString( 0640 PyObject *unicode /* Unicode object */ 0641 ); 0642 0643 /* --- Character Map Codecs ----------------------------------------------- 0644 0645 This codec uses mappings to encode and decode characters. 0646 0647 Decoding mappings must map byte ordinals (integers in the range from 0 to 0648 255) to Unicode strings, integers (which are then interpreted as Unicode 0649 ordinals) or None. Unmapped data bytes (ones which cause a LookupError) 0650 as well as mapped to None, 0xFFFE or '\ufffe' are treated as "undefined 0651 mapping" and cause an error. 0652 0653 Encoding mappings must map Unicode ordinal integers to bytes objects, 0654 integers in the range from 0 to 255 or None. Unmapped character 0655 ordinals (ones which cause a LookupError) as well as mapped to 0656 None are treated as "undefined mapping" and cause an error. 
0657 0658 */ 0659 0660 PyAPI_FUNC(PyObject*) PyUnicode_DecodeCharmap( 0661 const char *string, /* Encoded string */ 0662 Py_ssize_t length, /* size of string */ 0663 PyObject *mapping, /* decoding mapping */ 0664 const char *errors /* error handling */ 0665 ); 0666 0667 PyAPI_FUNC(PyObject*) PyUnicode_AsCharmapString( 0668 PyObject *unicode, /* Unicode object */ 0669 PyObject *mapping /* encoding mapping */ 0670 ); 0671 0672 /* --- MBCS codecs for Windows -------------------------------------------- */ 0673 0674 #ifdef MS_WINDOWS 0675 PyAPI_FUNC(PyObject*) PyUnicode_DecodeMBCS( 0676 const char *string, /* MBCS encoded string */ 0677 Py_ssize_t length, /* size of string */ 0678 const char *errors /* error handling */ 0679 ); 0680 0681 PyAPI_FUNC(PyObject*) PyUnicode_DecodeMBCSStateful( 0682 const char *string, /* MBCS encoded string */ 0683 Py_ssize_t length, /* size of string */ 0684 const char *errors, /* error handling */ 0685 Py_ssize_t *consumed /* bytes consumed */ 0686 ); 0687 0688 #if !defined(Py_LIMITED_API) || Py_LIMITED_API+0 >= 0x03030000 0689 PyAPI_FUNC(PyObject*) PyUnicode_DecodeCodePageStateful( 0690 int code_page, /* code page number */ 0691 const char *string, /* encoded string */ 0692 Py_ssize_t length, /* size of string */ 0693 const char *errors, /* error handling */ 0694 Py_ssize_t *consumed /* bytes consumed */ 0695 ); 0696 #endif 0697 0698 PyAPI_FUNC(PyObject*) PyUnicode_AsMBCSString( 0699 PyObject *unicode /* Unicode object */ 0700 ); 0701 0702 #if !defined(Py_LIMITED_API) || Py_LIMITED_API+0 >= 0x03030000 0703 PyAPI_FUNC(PyObject*) PyUnicode_EncodeCodePage( 0704 int code_page, /* code page number */ 0705 PyObject *unicode, /* Unicode object */ 0706 const char *errors /* error handling */ 0707 ); 0708 #endif 0709 0710 #endif /* MS_WINDOWS */ 0711 0712 /* --- Locale encoding --------------------------------------------------- */ 0713 0714 #if !defined(Py_LIMITED_API) || Py_LIMITED_API+0 >= 0x03030000 0715 /* Decode a string from the current 
locale encoding. The decoder is strict if 0716 *surrogateescape* is equal to zero, otherwise it uses the 'surrogateescape' 0717 error handler (PEP 383) to escape undecodable bytes. If a byte sequence can 0718 be decoded as a surrogate character and *surrogateescape* is not equal to 0719 zero, the byte sequence is escaped using the 'surrogateescape' error handler 0720 instead of being decoded. *str* must end with a null character but cannot 0721 contain embedded null characters. */ 0722 0723 PyAPI_FUNC(PyObject*) PyUnicode_DecodeLocaleAndSize( 0724 const char *str, 0725 Py_ssize_t len, 0726 const char *errors); 0727 0728 /* Similar to PyUnicode_DecodeLocaleAndSize(), but compute the string 0729 length using strlen(). */ 0730 0731 PyAPI_FUNC(PyObject*) PyUnicode_DecodeLocale( 0732 const char *str, 0733 const char *errors); 0734 0735 /* Encode a Unicode object to the current locale encoding. The encoder is 0736 strict is *surrogateescape* is equal to zero, otherwise the 0737 "surrogateescape" error handler is used. Return a bytes object. The string 0738 cannot contain embedded null characters. */ 0739 0740 PyAPI_FUNC(PyObject*) PyUnicode_EncodeLocale( 0741 PyObject *unicode, 0742 const char *errors 0743 ); 0744 #endif 0745 0746 /* --- File system encoding ---------------------------------------------- */ 0747 0748 /* ParseTuple converter: encode str objects to bytes using 0749 PyUnicode_EncodeFSDefault(); bytes objects are output as-is. */ 0750 0751 PyAPI_FUNC(int) PyUnicode_FSConverter(PyObject*, void*); 0752 0753 /* ParseTuple converter: decode bytes objects to unicode using 0754 PyUnicode_DecodeFSDefaultAndSize(); str objects are output as-is. */ 0755 0756 PyAPI_FUNC(int) PyUnicode_FSDecoder(PyObject*, void*); 0757 0758 /* Decode a null-terminated string from the Python filesystem encoding 0759 and error handler. 0760 0761 If the string length is known, use PyUnicode_DecodeFSDefaultAndSize(). 
*/ 0762 PyAPI_FUNC(PyObject*) PyUnicode_DecodeFSDefault( 0763 const char *s /* encoded string */ 0764 ); 0765 0766 /* Decode a string from the Python filesystem encoding and error handler. */ 0767 PyAPI_FUNC(PyObject*) PyUnicode_DecodeFSDefaultAndSize( 0768 const char *s, /* encoded string */ 0769 Py_ssize_t size /* size */ 0770 ); 0771 0772 /* Encode a Unicode object to the Python filesystem encoding and error handler. 0773 Return bytes. */ 0774 PyAPI_FUNC(PyObject*) PyUnicode_EncodeFSDefault( 0775 PyObject *unicode 0776 ); 0777 0778 /* --- Methods & Slots ---------------------------------------------------- 0779 0780 These are capable of handling Unicode objects and strings on input 0781 (we refer to them as strings in the descriptions) and return 0782 Unicode objects or integers as appropriate. */ 0783 0784 /* Concat two strings giving a new Unicode string. */ 0785 0786 PyAPI_FUNC(PyObject*) PyUnicode_Concat( 0787 PyObject *left, /* Left string */ 0788 PyObject *right /* Right string */ 0789 ); 0790 0791 /* Concat two strings and put the result in *pleft 0792 (sets *pleft to NULL on error) */ 0793 0794 PyAPI_FUNC(void) PyUnicode_Append( 0795 PyObject **pleft, /* Pointer to left string */ 0796 PyObject *right /* Right string */ 0797 ); 0798 0799 /* Concat two strings, put the result in *pleft and drop the right object 0800 (sets *pleft to NULL on error) */ 0801 0802 PyAPI_FUNC(void) PyUnicode_AppendAndDel( 0803 PyObject **pleft, /* Pointer to left string */ 0804 PyObject *right /* Right string */ 0805 ); 0806 0807 /* Split a string giving a list of Unicode strings. 0808 0809 If sep is NULL, splitting will be done at all whitespace 0810 substrings. Otherwise, splits occur at the given separator. 0811 0812 At most maxsplit splits will be done. If negative, no limit is set. 0813 0814 Separators are not included in the resulting list. 
0815 0816 */ 0817 0818 PyAPI_FUNC(PyObject*) PyUnicode_Split( 0819 PyObject *s, /* String to split */ 0820 PyObject *sep, /* String separator */ 0821 Py_ssize_t maxsplit /* Maxsplit count */ 0822 ); 0823 0824 /* Dito, but split at line breaks. 0825 0826 CRLF is considered to be one line break. Line breaks are not 0827 included in the resulting list. */ 0828 0829 PyAPI_FUNC(PyObject*) PyUnicode_Splitlines( 0830 PyObject *s, /* String to split */ 0831 int keepends /* If true, line end markers are included */ 0832 ); 0833 0834 /* Partition a string using a given separator. */ 0835 0836 PyAPI_FUNC(PyObject*) PyUnicode_Partition( 0837 PyObject *s, /* String to partition */ 0838 PyObject *sep /* String separator */ 0839 ); 0840 0841 /* Partition a string using a given separator, searching from the end of the 0842 string. */ 0843 0844 PyAPI_FUNC(PyObject*) PyUnicode_RPartition( 0845 PyObject *s, /* String to partition */ 0846 PyObject *sep /* String separator */ 0847 ); 0848 0849 /* Split a string giving a list of Unicode strings. 0850 0851 If sep is NULL, splitting will be done at all whitespace 0852 substrings. Otherwise, splits occur at the given separator. 0853 0854 At most maxsplit splits will be done. But unlike PyUnicode_Split 0855 PyUnicode_RSplit splits from the end of the string. If negative, 0856 no limit is set. 0857 0858 Separators are not included in the resulting list. 0859 0860 */ 0861 0862 PyAPI_FUNC(PyObject*) PyUnicode_RSplit( 0863 PyObject *s, /* String to split */ 0864 PyObject *sep, /* String separator */ 0865 Py_ssize_t maxsplit /* Maxsplit count */ 0866 ); 0867 0868 /* Translate a string by applying a character mapping table to it and 0869 return the resulting Unicode object. 0870 0871 The mapping table must map Unicode ordinal integers to Unicode strings, 0872 Unicode ordinal integers or None (causing deletion of the character). 0873 0874 Mapping tables may be dictionaries or sequences. 
Unmapped character 0875 ordinals (ones which cause a LookupError) are left untouched and 0876 are copied as-is. 0877 0878 */ 0879 0880 PyAPI_FUNC(PyObject *) PyUnicode_Translate( 0881 PyObject *str, /* String */ 0882 PyObject *table, /* Translate table */ 0883 const char *errors /* error handling */ 0884 ); 0885 0886 /* Join a sequence of strings using the given separator and return 0887 the resulting Unicode string. */ 0888 0889 PyAPI_FUNC(PyObject*) PyUnicode_Join( 0890 PyObject *separator, /* Separator string */ 0891 PyObject *seq /* Sequence object */ 0892 ); 0893 0894 /* Return 1 if substr matches str[start:end] at the given tail end, 0 0895 otherwise. */ 0896 0897 PyAPI_FUNC(Py_ssize_t) PyUnicode_Tailmatch( 0898 PyObject *str, /* String */ 0899 PyObject *substr, /* Prefix or Suffix string */ 0900 Py_ssize_t start, /* Start index */ 0901 Py_ssize_t end, /* Stop index */ 0902 int direction /* Tail end: -1 prefix, +1 suffix */ 0903 ); 0904 0905 /* Return the first position of substr in str[start:end] using the 0906 given search direction or -1 if not found. -2 is returned in case 0907 an error occurred and an exception is set. */ 0908 0909 PyAPI_FUNC(Py_ssize_t) PyUnicode_Find( 0910 PyObject *str, /* String */ 0911 PyObject *substr, /* Substring to find */ 0912 Py_ssize_t start, /* Start index */ 0913 Py_ssize_t end, /* Stop index */ 0914 int direction /* Find direction: +1 forward, -1 backward */ 0915 ); 0916 0917 #if !defined(Py_LIMITED_API) || Py_LIMITED_API+0 >= 0x03030000 0918 /* Like PyUnicode_Find, but search for single character only. */ 0919 PyAPI_FUNC(Py_ssize_t) PyUnicode_FindChar( 0920 PyObject *str, 0921 Py_UCS4 ch, 0922 Py_ssize_t start, 0923 Py_ssize_t end, 0924 int direction 0925 ); 0926 #endif 0927 0928 /* Count the number of occurrences of substr in str[start:end]. 
*/ 0929 0930 PyAPI_FUNC(Py_ssize_t) PyUnicode_Count( 0931 PyObject *str, /* String */ 0932 PyObject *substr, /* Substring to count */ 0933 Py_ssize_t start, /* Start index */ 0934 Py_ssize_t end /* Stop index */ 0935 ); 0936 0937 /* Replace at most maxcount occurrences of substr in str with replstr 0938 and return the resulting Unicode object. */ 0939 0940 PyAPI_FUNC(PyObject *) PyUnicode_Replace( 0941 PyObject *str, /* String */ 0942 PyObject *substr, /* Substring to find */ 0943 PyObject *replstr, /* Replacement string */ 0944 Py_ssize_t maxcount /* Max. number of replacements to apply; 0945 -1 = all */ 0946 ); 0947 0948 /* Compare two strings and return -1, 0, 1 for less than, equal, 0949 greater than, respectively. 0950 Raise an exception and return -1 on error. Since -1 is also a valid comparison result, check whether an exception is set to detect an error. */ 0951 0952 PyAPI_FUNC(int) PyUnicode_Compare( 0953 PyObject *left, /* Left string */ 0954 PyObject *right /* Right string */ 0955 ); 0956 0957 /* Compare a Unicode object with a C string and return -1, 0, 1 for less than, 0958 equal, and greater than, respectively. It is best to pass only 0959 ASCII-encoded strings, but the function interprets the input string as 0960 ISO-8859-1 if it contains non-ASCII characters. 0961 This function does not raise exceptions. */ 0962 0963 PyAPI_FUNC(int) PyUnicode_CompareWithASCIIString( 0964 PyObject *left, /* Unicode object */ 0965 const char *right /* ASCII-encoded string */ 0966 ); 0967 0968 /* Rich compare two strings and return one of the following: 0969 0970 - NULL in case an exception was raised 0971 - Py_True or Py_False for successful comparisons 0972 - Py_NotImplemented in case the type combination is unknown 0973 0974 Possible values for op: 0975 0976 Py_GT, Py_GE, Py_EQ, Py_NE, Py_LT, Py_LE 0977 0978 */ 0979 0980 PyAPI_FUNC(PyObject *) PyUnicode_RichCompare( 0981 PyObject *left, /* Left string */ 0982 PyObject *right, /* Right string */ 0983 int op /* Operation: Py_EQ, Py_NE, Py_GT, etc.
*/ 0984 ); 0985 0986 /* Apply an argument tuple or dictionary to a format string and return 0987 the resulting Unicode string. */ 0988 0989 PyAPI_FUNC(PyObject *) PyUnicode_Format( 0990 PyObject *format, /* Format string */ 0991 PyObject *args /* Argument tuple or dictionary */ 0992 ); 0993 0994 /* Check whether element is contained in container and return 1/0 0995 accordingly. 0996 0997 element has to coerce to a one-element Unicode string. -1 is 0998 returned in case of an error. */ 0999 1000 PyAPI_FUNC(int) PyUnicode_Contains( 1001 PyObject *container, /* Container string */ 1002 PyObject *element /* Element string */ 1003 ); 1004 1005 /* Check whether the argument s is a valid identifier. */ 1006 1007 PyAPI_FUNC(int) PyUnicode_IsIdentifier(PyObject *s); 1008 1009 /* === Characters Type APIs =============================================== */ 1010 1011 #ifndef Py_LIMITED_API 1012 # define Py_CPYTHON_UNICODEOBJECT_H 1013 # include "cpython/unicodeobject.h" 1014 # undef Py_CPYTHON_UNICODEOBJECT_H 1015 #endif 1016 1017 #ifdef __cplusplus 1018 } 1019 #endif 1020 #endif /* !Py_UNICODEOBJECT_H */
[ Source navigation ] | [ Diff markup ] | [ Identifier search ] | [ general search ] |
This page was automatically generated by the 2.3.7 LXR engine. The LXR team |