|
||||
File indexing completed on 2025-01-17 09:56:13
0001 /* DO NOT EDIT! GENERATED AUTOMATICALLY! */ 0002 /* Normalization forms (composition and decomposition) of Unicode strings. 0003 Copyright (C) 2001-2002, 2009-2024 Free Software Foundation, Inc. 0004 Written by Bruno Haible <bruno@clisp.org>, 2009. 0005 0006 This file is free software: you can redistribute it and/or modify 0007 it under the terms of the GNU Lesser General Public License as 0008 published by the Free Software Foundation; either version 2.1 of the 0009 License, or (at your option) any later version. 0010 0011 This file is distributed in the hope that it will be useful, 0012 but WITHOUT ANY WARRANTY; without even the implied warranty of 0013 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the 0014 GNU Lesser General Public License for more details. 0015 0016 You should have received a copy of the GNU Lesser General Public License 0017 along with this program. If not, see <https://www.gnu.org/licenses/>. */ 0018 0019 #ifndef _UNINORM_H 0020 #define _UNINORM_H 0021 0022 /* Get common macros for C. */ 0023 #include <unistring/cdefs.h> 0024 0025 /* Get size_t. */ 0026 #include <stddef.h> 0027 0028 #include "unitypes.h" 0029 0030 #if 1 0031 # include <unistring/woe32dll.h> 0032 #else 0033 # define LIBUNISTRING_DLL_VARIABLE 0034 #endif 0035 0036 0037 #ifdef __cplusplus 0038 extern "C" { 0039 #endif 0040 0041 0042 /* Conventions: 0043 0044 All functions prefixed with u8_ operate on UTF-8 encoded strings. 0045 Their unit is an uint8_t (1 byte). 0046 0047 All functions prefixed with u16_ operate on UTF-16 encoded strings. 0048 Their unit is an uint16_t (a 2-byte word). 0049 0050 All functions prefixed with u32_ operate on UCS-4 encoded strings. 0051 Their unit is an uint32_t (a 4-byte word). 0052 0053 All argument pairs (s, n) denote a Unicode string s[0..n-1] with exactly 0054 n units. 0055 0056 Functions returning a string result take a (resultbuf, lengthp) argument 0057 pair. If resultbuf is not NULL and the result fits into *lengthp units, 0058 it is put in resultbuf, and resultbuf is returned. Otherwise, a freshly 0059 allocated string is returned. In both cases, *lengthp is set to the 0060 length (number of units) of the returned string. In case of error, 0061 NULL is returned and errno is set. */ 0062 0063 0064 enum 0065 { 0066 UC_DECOMP_CANONICAL,/* Canonical decomposition. */ 0067 UC_DECOMP_FONT, /* <font> A font variant (e.g. a blackletter form). */ 0068 UC_DECOMP_NOBREAK, /* <noBreak> A no-break version of a space or hyphen. */ 0069 UC_DECOMP_INITIAL, /* <initial> An initial presentation form (Arabic). */ 0070 UC_DECOMP_MEDIAL, /* <medial> A medial presentation form (Arabic). */ 0071 UC_DECOMP_FINAL, /* <final> A final presentation form (Arabic). */ 0072 UC_DECOMP_ISOLATED,/* <isolated> An isolated presentation form (Arabic). */ 0073 UC_DECOMP_CIRCLE, /* <circle> An encircled form. */ 0074 UC_DECOMP_SUPER, /* <super> A superscript form. */ 0075 UC_DECOMP_SUB, /* <sub> A subscript form. */ 0076 UC_DECOMP_VERTICAL,/* <vertical> A vertical layout presentation form. */ 0077 UC_DECOMP_WIDE, /* <wide> A wide (or zenkaku) compatibility character. */ 0078 UC_DECOMP_NARROW, /* <narrow> A narrow (or hankaku) compatibility character. */ 0079 UC_DECOMP_SMALL, /* <small> A small variant form (CNS compatibility). */ 0080 UC_DECOMP_SQUARE, /* <square> A CJK squared font variant. */ 0081 UC_DECOMP_FRACTION,/* <fraction> A vulgar fraction form. */ 0082 UC_DECOMP_COMPAT /* <compat> Otherwise unspecified compatibility character. */ 0083 }; 0084 0085 /* Maximum size of decomposition of a single Unicode character. */ 0086 #define UC_DECOMPOSITION_MAX_LENGTH 32 0087 0088 /* Return the character decomposition mapping of a Unicode character. 0089 DECOMPOSITION must point to an array of at least UC_DECOMPOSITION_MAX_LENGTH 0090 ucs_t elements. 0091 When a decomposition exists, DECOMPOSITION[0..N-1] and *DECOMP_TAG are 0092 filled and N is returned. Otherwise -1 is returned. */ 0093 extern int 0094 uc_decomposition (ucs4_t uc, int *decomp_tag, ucs4_t *decomposition); 0095 0096 /* Return the canonical character decomposition mapping of a Unicode character. 0097 DECOMPOSITION must point to an array of at least UC_DECOMPOSITION_MAX_LENGTH 0098 ucs_t elements. 0099 When a decomposition exists, DECOMPOSITION[0..N-1] is filled and N is 0100 returned. Otherwise -1 is returned. */ 0101 extern int 0102 uc_canonical_decomposition (ucs4_t uc, ucs4_t *decomposition); 0103 0104 0105 /* Attempt to combine the Unicode characters uc1, uc2. 0106 uc1 is known to have canonical combining class 0. 0107 Return the combination of uc1 and uc2, if it exists. 0108 Return 0 otherwise. 0109 Not all decompositions can be recombined using this function. See the 0110 Unicode file CompositionExclusions.txt for details. */ 0111 extern ucs4_t 0112 uc_composition (ucs4_t uc1, ucs4_t uc2) 0113 _UC_ATTRIBUTE_CONST; 0114 0115 0116 /* An object of type uninorm_t denotes a Unicode normalization form. */ 0117 struct unicode_normalization_form; 0118 typedef const struct unicode_normalization_form *uninorm_t; 0119 0120 /* UNINORM_NFD: Normalization form D: canonical decomposition. */ 0121 extern LIBUNISTRING_DLL_VARIABLE const struct unicode_normalization_form uninorm_nfd; 0122 #define UNINORM_NFD (&uninorm_nfd) 0123 0124 /* UNINORM_NFC: Normalization form C: canonical decomposition, then 0125 canonical composition. */ 0126 extern LIBUNISTRING_DLL_VARIABLE const struct unicode_normalization_form uninorm_nfc; 0127 #define UNINORM_NFC (&uninorm_nfc) 0128 0129 /* UNINORM_NFKD: Normalization form KD: compatibility decomposition. */ 0130 extern LIBUNISTRING_DLL_VARIABLE const struct unicode_normalization_form uninorm_nfkd; 0131 #define UNINORM_NFKD (&uninorm_nfkd) 0132 0133 /* UNINORM_NFKC: Normalization form KC: compatibility decomposition, then 0134 canonical composition. */ 0135 extern LIBUNISTRING_DLL_VARIABLE const struct unicode_normalization_form uninorm_nfkc; 0136 #define UNINORM_NFKC (&uninorm_nfkc) 0137 0138 /* Test whether a normalization form does compatibility decomposition. */ 0139 #define uninorm_is_compat_decomposing(nf) \ 0140 ((* (const unsigned int *) (nf) >> 0) & 1) 0141 0142 /* Test whether a normalization form includes canonical composition. */ 0143 #define uninorm_is_composing(nf) \ 0144 ((* (const unsigned int *) (nf) >> 1) & 1) 0145 0146 /* Return the decomposing variant of a normalization form. 0147 This maps NFC,NFD -> NFD and NFKC,NFKD -> NFKD. */ 0148 extern uninorm_t 0149 uninorm_decomposing_form (uninorm_t nf) 0150 _UC_ATTRIBUTE_PURE; 0151 0152 0153 /* Return the specified normalization form of a string. */ 0154 extern uint8_t * 0155 u8_normalize (uninorm_t nf, const uint8_t *s, size_t n, 0156 uint8_t *_UC_RESTRICT resultbuf, size_t *lengthp); 0157 extern uint16_t * 0158 u16_normalize (uninorm_t nf, const uint16_t *s, size_t n, 0159 uint16_t *_UC_RESTRICT resultbuf, size_t *lengthp); 0160 extern uint32_t * 0161 u32_normalize (uninorm_t nf, const uint32_t *s, size_t n, 0162 uint32_t *_UC_RESTRICT resultbuf, size_t *lengthp); 0163 0164 0165 /* Compare S1 and S2, ignoring differences in normalization. 0166 NF must be either UNINORM_NFD or UNINORM_NFKD. 0167 If successful, set *RESULTP to -1 if S1 < S2, 0 if S1 = S2, 1 if S1 > S2, and 0168 return 0. Upon failure, return -1 with errno set. */ 0169 extern int 0170 u8_normcmp (const uint8_t *s1, size_t n1, const uint8_t *s2, size_t n2, 0171 uninorm_t nf, int *resultp); 0172 extern int 0173 u16_normcmp (const uint16_t *s1, size_t n1, const uint16_t *s2, size_t n2, 0174 uninorm_t nf, int *resultp); 0175 extern int 0176 u32_normcmp (const uint32_t *s1, size_t n1, const uint32_t *s2, size_t n2, 0177 uninorm_t nf, int *resultp); 0178 0179 0180 /* Converts the string S of length N to a NUL-terminated byte sequence, in such 0181 a way that comparing uN_normxfrm (S1) and uN_normxfrm (S2) with uN_cmp2() is 0182 equivalent to comparing S1 and S2 with uN_normcoll(). 0183 NF must be either UNINORM_NFC or UNINORM_NFKC. */ 0184 extern char * 0185 u8_normxfrm (const uint8_t *s, size_t n, uninorm_t nf, 0186 char *resultbuf, size_t *lengthp); 0187 extern char * 0188 u16_normxfrm (const uint16_t *s, size_t n, uninorm_t nf, 0189 char *resultbuf, size_t *lengthp); 0190 extern char * 0191 u32_normxfrm (const uint32_t *s, size_t n, uninorm_t nf, 0192 char *resultbuf, size_t *lengthp); 0193 0194 0195 /* Compare S1 and S2, ignoring differences in normalization, using the 0196 collation rules of the current locale. 0197 NF must be either UNINORM_NFC or UNINORM_NFKC. 0198 If successful, set *RESULTP to -1 if S1 < S2, 0 if S1 = S2, 1 if S1 > S2, and 0199 return 0. Upon failure, return -1 with errno set. */ 0200 extern int 0201 u8_normcoll (const uint8_t *s1, size_t n1, const uint8_t *s2, size_t n2, 0202 uninorm_t nf, int *resultp); 0203 extern int 0204 u16_normcoll (const uint16_t *s1, size_t n1, const uint16_t *s2, size_t n2, 0205 uninorm_t nf, int *resultp); 0206 extern int 0207 u32_normcoll (const uint32_t *s1, size_t n1, const uint32_t *s2, size_t n2, 0208 uninorm_t nf, int *resultp); 0209 0210 0211 /* Normalization of a stream of Unicode characters. 0212 0213 A "stream of Unicode characters" is essentially a function that accepts an 0214 ucs4_t argument repeatedly, optionally combined with a function that 0215 "flushes" the stream. */ 0216 0217 /* Data type of a stream of Unicode characters that normalizes its input 0218 according to a given normalization form and passes the normalized character 0219 sequence to the encapsulated stream of Unicode characters. */ 0220 struct uninorm_filter; 0221 0222 /* Bring data buffered in the filter to its destination, the encapsulated 0223 stream, then close and free the filter. 0224 Return 0 if successful, or -1 with errno set upon failure. */ 0225 extern int 0226 uninorm_filter_free (struct uninorm_filter *filter); 0227 0228 /* Create and return a normalization filter for Unicode characters. 0229 The pair (stream_func, stream_data) is the encapsulated stream. 0230 stream_func (stream_data, uc) receives the Unicode character uc 0231 and returns 0 if successful, or -1 with errno set upon failure. 0232 Return the new filter, or NULL with errno set upon failure. */ 0233 extern struct uninorm_filter * 0234 uninorm_filter_create (uninorm_t nf, 0235 int (*stream_func) (void *stream_data, ucs4_t uc), 0236 void *stream_data) 0237 _GL_ATTRIBUTE_DEALLOC (uninorm_filter_free, 1); 0238 0239 /* Stuff a Unicode character into a normalizing filter. 0240 Return 0 if successful, or -1 with errno set upon failure. */ 0241 extern int 0242 uninorm_filter_write (struct uninorm_filter *filter, ucs4_t uc); 0243 0244 /* Bring data buffered in the filter to its destination, the encapsulated 0245 stream. 0246 Return 0 if successful, or -1 with errno set upon failure. 0247 Note! If after calling this function, additional characters are written 0248 into the filter, the resulting character sequence in the encapsulated stream 0249 will not necessarily be normalized. */ 0250 extern int 0251 uninorm_filter_flush (struct uninorm_filter *filter); 0252 0253 0254 #ifdef __cplusplus 0255 } 0256 #endif 0257 0258 0259 #endif /* _UNINORM_H */
[ Source navigation ] | [ Diff markup ] | [ Identifier search ] | [ general search ] |
This page was automatically generated by the 2.3.7 LXR engine. The LXR team |