Back to home page

EIC code displayed by LXR

 
 

    


File indexing completed on 2025-11-19 09:50:51

0001 #ifndef Py_INTERNAL_UNICODEOBJECT_H
0002 #define Py_INTERNAL_UNICODEOBJECT_H
0003 #ifdef __cplusplus
0004 extern "C" {
0005 #endif
0006 
0007 #ifndef Py_BUILD_CORE
0008 #  error "this header requires Py_BUILD_CORE define"
0009 #endif
0010 
0011 #include "pycore_lock.h"          // PyMutex
0012 #include "pycore_fileutils.h"     // _Py_error_handler
0013 #include "pycore_identifier.h"    // _Py_Identifier
0014 #include "pycore_ucnhash.h"       // _PyUnicode_Name_CAPI
0015 #include "pycore_global_objects.h"  // _Py_SINGLETON
0016 
0017 /* --- Characters Type APIs ----------------------------------------------- */
0018 
0019 extern int _PyUnicode_IsXidStart(Py_UCS4 ch);
0020 extern int _PyUnicode_IsXidContinue(Py_UCS4 ch);
0021 extern int _PyUnicode_ToLowerFull(Py_UCS4 ch, Py_UCS4 *res);
0022 extern int _PyUnicode_ToTitleFull(Py_UCS4 ch, Py_UCS4 *res);
0023 extern int _PyUnicode_ToUpperFull(Py_UCS4 ch, Py_UCS4 *res);
0024 extern int _PyUnicode_ToFoldedFull(Py_UCS4 ch, Py_UCS4 *res);
0025 extern int _PyUnicode_IsCaseIgnorable(Py_UCS4 ch);
0026 extern int _PyUnicode_IsCased(Py_UCS4 ch);
0027 
0028 /* --- Unicode API -------------------------------------------------------- */
0029 
0030 // Export for '_json' shared extension
0031 PyAPI_FUNC(int) _PyUnicode_CheckConsistency(
0032     PyObject *op,
0033     int check_content);
0034 
0035 PyAPI_FUNC(void) _PyUnicode_ExactDealloc(PyObject *op);
0036 extern Py_ssize_t _PyUnicode_InternedSize(void);
0037 extern Py_ssize_t _PyUnicode_InternedSize_Immortal(void);
0038 
0039 // Get a copy of a Unicode string.
0040 // Export for '_datetime' shared extension.
0041 PyAPI_FUNC(PyObject*) _PyUnicode_Copy(
0042     PyObject *unicode);
0043 
0044 /* Unsafe version of PyUnicode_Fill(): don't check arguments and so may crash
0045    if parameters are invalid (e.g. if length is longer than the string). */
0046 extern void _PyUnicode_FastFill(
0047     PyObject *unicode,
0048     Py_ssize_t start,
0049     Py_ssize_t length,
0050     Py_UCS4 fill_char
0051     );
0052 
0053 /* Unsafe version of PyUnicode_CopyCharacters(): don't check arguments and so
0054    may crash if parameters are invalid (e.g. if the output string
0055    is too short). */
0056 extern void _PyUnicode_FastCopyCharacters(
0057     PyObject *to,
0058     Py_ssize_t to_start,
0059     PyObject *from,
0060     Py_ssize_t from_start,
0061     Py_ssize_t how_many
0062     );
0063 
0064 /* Create a new string from a buffer of ASCII characters.
0065    WARNING: Don't check if the string contains any non-ASCII character. */
0066 extern PyObject* _PyUnicode_FromASCII(
0067     const char *buffer,
0068     Py_ssize_t size);
0069 
0070 /* Compute the maximum character of the substring unicode[start:end].
0071    Return 127 for an empty string. */
0072 extern Py_UCS4 _PyUnicode_FindMaxChar (
0073     PyObject *unicode,
0074     Py_ssize_t start,
0075     Py_ssize_t end);
0076 
0077 /* --- _PyUnicodeWriter API ----------------------------------------------- */
0078 
0079 /* Format the object based on the format_spec, as defined in PEP 3101
0080    (Advanced String Formatting). */
0081 extern int _PyUnicode_FormatAdvancedWriter(
0082     _PyUnicodeWriter *writer,
0083     PyObject *obj,
0084     PyObject *format_spec,
0085     Py_ssize_t start,
0086     Py_ssize_t end);
0087 
0088 /* --- UTF-7 Codecs ------------------------------------------------------- */
0089 
0090 extern PyObject* _PyUnicode_EncodeUTF7(
0091     PyObject *unicode,          /* Unicode object */
0092     int base64SetO,             /* Encode RFC2152 Set O characters in base64 */
0093     int base64WhiteSpace,       /* Encode whitespace (sp, ht, nl, cr) in base64 */
0094     const char *errors);        /* error handling */
0095 
0096 /* --- UTF-8 Codecs ------------------------------------------------------- */
0097 
0098 // Export for '_tkinter' shared extension.
0099 PyAPI_FUNC(PyObject*) _PyUnicode_AsUTF8String(
0100     PyObject *unicode,
0101     const char *errors);
0102 
0103 /* --- UTF-32 Codecs ------------------------------------------------------ */
0104 
0105 // Export for '_tkinter' shared extension
0106 PyAPI_FUNC(PyObject*) _PyUnicode_EncodeUTF32(
0107     PyObject *object,           /* Unicode object */
0108     const char *errors,         /* error handling */
0109     int byteorder);             /* byteorder to use 0=BOM+native;-1=LE,1=BE */
0110 
0111 /* --- UTF-16 Codecs ------------------------------------------------------ */
0112 
0113 // Returns a Python string object holding the UTF-16 encoded value of
0114 // the Unicode data.
0115 //
0116 // If byteorder is not 0, output is written according to the following
0117 // byte order:
0118 //
0119 // byteorder == -1: little endian
0120 // byteorder == 0:  native byte order (writes a BOM mark)
0121 // byteorder == 1:  big endian
0122 //
0123 // If byteorder is 0, the output string will always start with the
0124 // Unicode BOM mark (U+FEFF). In the other two modes, no BOM mark is
0125 // prepended.
0126 //
0127 // Export for '_tkinter' shared extension
0128 PyAPI_FUNC(PyObject*) _PyUnicode_EncodeUTF16(
0129     PyObject* unicode,          /* Unicode object */
0130     const char *errors,         /* error handling */
0131     int byteorder);             /* byteorder to use 0=BOM+native;-1=LE,1=BE */
0132 
0133 /* --- Unicode-Escape Codecs ---------------------------------------------- */
0134 
0135 /* Variant of PyUnicode_DecodeUnicodeEscape that supports partial decoding. */
0136 extern PyObject* _PyUnicode_DecodeUnicodeEscapeStateful(
0137     const char *string,     /* Unicode-Escape encoded string */
0138     Py_ssize_t length,      /* size of string */
0139     const char *errors,     /* error handling */
0140     Py_ssize_t *consumed);  /* bytes consumed */
0141 
0142 // Helper for PyUnicode_DecodeUnicodeEscape that detects invalid escape
0143 // chars.
0144 // Export for test_peg_generator.
0145 PyAPI_FUNC(PyObject*) _PyUnicode_DecodeUnicodeEscapeInternal2(
0146     const char *string,     /* Unicode-Escape encoded string */
0147     Py_ssize_t length,      /* size of string */
0148     const char *errors,     /* error handling */
0149     Py_ssize_t *consumed,   /* bytes consumed */
0150     int *first_invalid_escape_char, /* on return, if not -1, contain the first
0151                                        invalid escaped char (<= 0xff) or invalid
0152                                        octal escape (> 0xff) in string. */
0153     const char **first_invalid_escape_ptr); /* on return, if not NULL, may
0154                                         point to the first invalid escaped
0155                                         char in string.
0156                                         May be NULL if errors is not NULL. */
0157 // Export for binary compatibility.
0158 PyAPI_FUNC(PyObject*) _PyUnicode_DecodeUnicodeEscapeInternal(
0159     const char *string,     /* Unicode-Escape encoded string */
0160     Py_ssize_t length,      /* size of string */
0161     const char *errors,     /* error handling */
0162     Py_ssize_t *consumed,   /* bytes consumed */
0163     const char **first_invalid_escape); /* on return, points to first
0164                                            invalid escaped char in
0165                                            string. */
0166 
0167 /* --- Raw-Unicode-Escape Codecs ---------------------------------------------- */
0168 
0169 /* Variant of PyUnicode_DecodeRawUnicodeEscape that supports partial decoding. */
0170 extern PyObject* _PyUnicode_DecodeRawUnicodeEscapeStateful(
0171     const char *string,     /* Unicode-Escape encoded string */
0172     Py_ssize_t length,      /* size of string */
0173     const char *errors,     /* error handling */
0174     Py_ssize_t *consumed);  /* bytes consumed */
0175 
0176 /* --- Latin-1 Codecs ----------------------------------------------------- */
0177 
0178 extern PyObject* _PyUnicode_AsLatin1String(
0179     PyObject* unicode,
0180     const char* errors);
0181 
0182 /* --- ASCII Codecs ------------------------------------------------------- */
0183 
0184 extern PyObject* _PyUnicode_AsASCIIString(
0185     PyObject* unicode,
0186     const char* errors);
0187 
0188 /* --- Character Map Codecs ----------------------------------------------- */
0189 
0190 /* Translate an Unicode object by applying a character mapping table to
0191    it and return the resulting Unicode object.
0192 
0193    The mapping table must map Unicode ordinal integers to Unicode strings,
0194    Unicode ordinal integers or None (causing deletion of the character).
0195 
0196    Mapping tables may be dictionaries or sequences. Unmapped character
0197    ordinals (ones which cause a LookupError) are left untouched and
0198    are copied as-is.
0199 */
0200 extern PyObject* _PyUnicode_EncodeCharmap(
0201     PyObject *unicode,          /* Unicode object */
0202     PyObject *mapping,          /* encoding mapping */
0203     const char *errors);        /* error handling */
0204 
0205 /* --- Decimal Encoder ---------------------------------------------------- */
0206 
0207 // Coverts a Unicode object holding a decimal value to an ASCII string
0208 // for using in int, float and complex parsers.
0209 // Transforms code points that have decimal digit property to the
0210 // corresponding ASCII digit code points.  Transforms spaces to ASCII.
0211 // Transforms code points starting from the first non-ASCII code point that
0212 // is neither a decimal digit nor a space to the end into '?'.
0213 //
0214 // Export for '_testinternalcapi' shared extension.
0215 PyAPI_FUNC(PyObject*) _PyUnicode_TransformDecimalAndSpaceToASCII(
0216     PyObject *unicode);         /* Unicode object */
0217 
0218 /* --- Methods & Slots ---------------------------------------------------- */
0219 
0220 PyAPI_FUNC(PyObject*) _PyUnicode_JoinArray(
0221     PyObject *separator,
0222     PyObject *const *items,
0223     Py_ssize_t seqlen
0224     );
0225 
0226 /* Test whether a unicode is equal to ASCII identifier.  Return 1 if true,
0227    0 otherwise.  The right argument must be ASCII identifier.
0228    Any error occurs inside will be cleared before return. */
0229 extern int _PyUnicode_EqualToASCIIId(
0230     PyObject *left,             /* Left string */
0231     _Py_Identifier *right       /* Right identifier */
0232     );
0233 
0234 // Test whether a unicode is equal to ASCII string.  Return 1 if true,
0235 // 0 otherwise.  The right argument must be ASCII-encoded string.
0236 // Any error occurs inside will be cleared before return.
0237 // Export for '_ctypes' shared extension
0238 PyAPI_FUNC(int) _PyUnicode_EqualToASCIIString(
0239     PyObject *left,
0240     const char *right           /* ASCII-encoded string */
0241     );
0242 
0243 /* Externally visible for str.strip(unicode) */
0244 extern PyObject* _PyUnicode_XStrip(
0245     PyObject *self,
0246     int striptype,
0247     PyObject *sepobj
0248     );
0249 
0250 
0251 /* Using explicit passed-in values, insert the thousands grouping
0252    into the string pointed to by buffer.  For the argument descriptions,
0253    see Objects/stringlib/localeutil.h */
0254 extern Py_ssize_t _PyUnicode_InsertThousandsGrouping(
0255     _PyUnicodeWriter *writer,
0256     Py_ssize_t n_buffer,
0257     PyObject *digits,
0258     Py_ssize_t d_pos,
0259     Py_ssize_t n_digits,
0260     Py_ssize_t min_width,
0261     const char *grouping,
0262     PyObject *thousands_sep,
0263     Py_UCS4 *maxchar);
0264 
0265 /* --- Misc functions ----------------------------------------------------- */
0266 
0267 extern PyObject* _PyUnicode_FormatLong(PyObject *, int, int, int);
0268 
0269 /* Fast equality check when the inputs are known to be exact unicode types
0270    and where the hash values are equal (i.e. a very probable match) */
0271 extern int _PyUnicode_EQ(PyObject *, PyObject *);
0272 
0273 // Equality check.
0274 // Export for '_pickle' shared extension.
0275 PyAPI_FUNC(int) _PyUnicode_Equal(PyObject *, PyObject *);
0276 
0277 extern int _PyUnicode_WideCharString_Converter(PyObject *, void *);
0278 extern int _PyUnicode_WideCharString_Opt_Converter(PyObject *, void *);
0279 
0280 // Export for test_peg_generator
0281 PyAPI_FUNC(Py_ssize_t) _PyUnicode_ScanIdentifier(PyObject *);
0282 
0283 /* --- Runtime lifecycle -------------------------------------------------- */
0284 
0285 extern void _PyUnicode_InitState(PyInterpreterState *);
0286 extern PyStatus _PyUnicode_InitGlobalObjects(PyInterpreterState *);
0287 extern PyStatus _PyUnicode_InitTypes(PyInterpreterState *);
0288 extern void _PyUnicode_Fini(PyInterpreterState *);
0289 extern void _PyUnicode_FiniTypes(PyInterpreterState *);
0290 
0291 extern PyTypeObject _PyUnicodeASCIIIter_Type;
0292 
0293 /* --- Interning ---------------------------------------------------------- */
0294 
0295 // All these are "ref-neutral", like the public PyUnicode_InternInPlace.
0296 
0297 // Explicit interning routines:
0298 PyAPI_FUNC(void) _PyUnicode_InternMortal(PyInterpreterState *interp, PyObject **);
0299 PyAPI_FUNC(void) _PyUnicode_InternImmortal(PyInterpreterState *interp, PyObject **);
0300 // Left here to help backporting:
0301 PyAPI_FUNC(void) _PyUnicode_InternInPlace(PyInterpreterState *interp, PyObject **p);
0302 // Only for singletons in the _PyRuntime struct:
0303 extern void _PyUnicode_InternStatic(PyInterpreterState *interp, PyObject **);
0304 
0305 /* --- Other API ---------------------------------------------------------- */
0306 
0307 struct _Py_unicode_runtime_ids {
0308     PyMutex mutex;
0309     // next_index value must be preserved when Py_Initialize()/Py_Finalize()
0310     // is called multiple times: see _PyUnicode_FromId() implementation.
0311     Py_ssize_t next_index;
0312 };
0313 
0314 struct _Py_unicode_runtime_state {
0315     struct _Py_unicode_runtime_ids ids;
0316 };
0317 
0318 /* fs_codec.encoding is initialized to NULL.
0319    Later, it is set to a non-NULL string by _PyUnicode_InitEncodings(). */
0320 struct _Py_unicode_fs_codec {
0321     char *encoding;   // Filesystem encoding (encoded to UTF-8)
0322     int utf8;         // encoding=="utf-8"?
0323     char *errors;     // Filesystem errors (encoded to UTF-8)
0324     _Py_error_handler error_handler;
0325 };
0326 
0327 struct _Py_unicode_ids {
0328     Py_ssize_t size;
0329     PyObject **array;
0330 };
0331 
0332 struct _Py_unicode_state {
0333     struct _Py_unicode_fs_codec fs_codec;
0334 
0335     _PyUnicode_Name_CAPI *ucnhash_capi;
0336 
0337     // Unicode identifiers (_Py_Identifier): see _PyUnicode_FromId()
0338     struct _Py_unicode_ids ids;
0339 };
0340 
0341 extern void _PyUnicode_ClearInterned(PyInterpreterState *interp);
0342 
0343 // Like PyUnicode_AsUTF8(), but check for embedded null characters.
0344 // Export for '_sqlite3' shared extension.
0345 PyAPI_FUNC(const char *) _PyUnicode_AsUTF8NoNUL(PyObject *);
0346 
0347 
0348 #ifdef __cplusplus
0349 }
0350 #endif
0351 #endif /* !Py_INTERNAL_UNICODEOBJECT_H */