Back to home page

EIC code displayed by LXR

 
 

    


File indexing completed on 2024-05-18 08:30:28

0001 /* DO NOT EDIT! GENERATED AUTOMATICALLY! */
0002 /* Normalization forms (composition and decomposition) of Unicode strings.
0003    Copyright (C) 2001-2002, 2009-2022 Free Software Foundation, Inc.
0004    Written by Bruno Haible <bruno@clisp.org>, 2009.
0005 
0006    This file is free software: you can redistribute it and/or modify
0007    it under the terms of the GNU Lesser General Public License as
0008    published by the Free Software Foundation; either version 2.1 of the
0009    License, or (at your option) any later version.
0010 
0011    This file is distributed in the hope that it will be useful,
0012    but WITHOUT ANY WARRANTY; without even the implied warranty of
0013    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
0014    GNU Lesser General Public License for more details.
0015 
0016    You should have received a copy of the GNU Lesser General Public License
0017    along with this program.  If not, see <https://www.gnu.org/licenses/>.  */
0018 
0019 #ifndef _UNINORM_H
0020 #define _UNINORM_H
0021 
0022 /* Get common macros for C.  */
0023 #include <unistring/cdefs.h>
0024 
0025 /* Get LIBUNISTRING_DLL_VARIABLE.  */
0026 #include <unistring/woe32dll.h>
0027 
0028 /* Get size_t.  */
0029 #include <stddef.h>
0030 
0031 #include "unitypes.h"
0032 
0033 
0034 #ifdef __cplusplus
0035 extern "C" {
0036 #endif
0037 
0038 
0039 /* Conventions:
0040 
0041    All functions prefixed with u8_ operate on UTF-8 encoded strings.
0042    Their unit is a uint8_t (1 byte).
0043 
0044    All functions prefixed with u16_ operate on UTF-16 encoded strings.
0045    Their unit is a uint16_t (a 2-byte word).
0046 
0047    All functions prefixed with u32_ operate on UCS-4 encoded strings.
0048    Their unit is a uint32_t (a 4-byte word).
0049 
0050    All argument pairs (s, n) denote a Unicode string s[0..n-1] with exactly
0051    n units.
0052 
0053    Functions returning a string result take a (resultbuf, lengthp) argument
0054    pair.  If resultbuf is not NULL and the result fits into *lengthp units,
0055    it is put in resultbuf, and resultbuf is returned.  Otherwise, a freshly
0056    allocated string is returned.  In both cases, *lengthp is set to the
0057    length (number of units) of the returned string.  In case of error,
0058    NULL is returned and errno is set.  */
0059 
0060 
0061 enum /* Kinds of character decomposition mapping.  Each enumerator's comment
             gives the <...> name of the kind, where one is listed, and its meaning.
             NOTE(review): presumably these are the values uc_decomposition stores
             in *DECOMP_TAG -- confirm against the implementation.  */
0062 {
0063   UC_DECOMP_CANONICAL,/*            Canonical decomposition.                  */
0064   UC_DECOMP_FONT,    /*   <font>    A font variant (e.g. a blackletter form). */
0065   UC_DECOMP_NOBREAK, /* <noBreak>   A no-break version of a space or hyphen.  */
0066   UC_DECOMP_INITIAL, /* <initial>   An initial presentation form (Arabic).    */
0067   UC_DECOMP_MEDIAL,  /*  <medial>   A medial presentation form (Arabic).      */
0068   UC_DECOMP_FINAL,   /*  <final>    A final presentation form (Arabic).       */
0069   UC_DECOMP_ISOLATED,/* <isolated>  An isolated presentation form (Arabic).   */
0070   UC_DECOMP_CIRCLE,  /*  <circle>   An encircled form.                        */
0071   UC_DECOMP_SUPER,   /*  <super>    A superscript form.                       */
0072   UC_DECOMP_SUB,     /*   <sub>     A subscript form.                         */
0073   UC_DECOMP_VERTICAL,/* <vertical>  A vertical layout presentation form.      */
0074   UC_DECOMP_WIDE,    /*   <wide>    A wide (or zenkaku) compatibility character. */
0075   UC_DECOMP_NARROW,  /*  <narrow>   A narrow (or hankaku) compatibility character. */
0076   UC_DECOMP_SMALL,   /*  <small>    A small variant form (CNS compatibility). */
0077   UC_DECOMP_SQUARE,  /*  <square>   A CJK squared font variant.               */
0078   UC_DECOMP_FRACTION,/* <fraction>  A vulgar fraction form.                   */
0079   UC_DECOMP_COMPAT   /*  <compat>   Otherwise unspecified compatibility character. */
0080 };
0081 
0082 /* Maximum size (number of ucs4_t elements) of the decomposition of a single
        Unicode character.  Callers of uc_decomposition and
        uc_canonical_decomposition must provide an array of at least this size.  */
0083 #define UC_DECOMPOSITION_MAX_LENGTH 32
0084 
0085 /* Return the character decomposition mapping of a Unicode character.
0086    DECOMPOSITION must point to an array of at least UC_DECOMPOSITION_MAX_LENGTH
0087    ucs4_t elements.
0088    When a decomposition exists, DECOMPOSITION[0..N-1] and *DECOMP_TAG are
0089    filled and N is returned.  Otherwise -1 is returned.  */
0090 extern int
0091        uc_decomposition (ucs4_t uc, int *decomp_tag, ucs4_t *decomposition);
0092 
0093 /* Return the canonical character decomposition mapping of a Unicode character.
0094    DECOMPOSITION must point to an array of at least UC_DECOMPOSITION_MAX_LENGTH
0095    ucs4_t elements.
0096    When a decomposition exists, DECOMPOSITION[0..N-1] is filled and N is
0097    returned.  Otherwise -1 is returned.  */
0098 extern int
0099        uc_canonical_decomposition (ucs4_t uc, ucs4_t *decomposition);
0100 
0101 
0102 /* Attempt to combine the Unicode characters uc1, uc2.
0103    uc1 is known to have canonical combining class 0.
        NOTE(review): this reads as a precondition on the caller rather than
        something the function verifies -- confirm against the implementation.
0104    Return the combination of uc1 and uc2, if it exists.
0105    Return 0 otherwise.
0106    Not all decompositions can be recombined using this function.  See the
0107    Unicode file CompositionExclusions.txt for details.  */
0108 extern ucs4_t
0109        uc_composition (ucs4_t uc1, ucs4_t uc2)
0110        _UC_ATTRIBUTE_CONST;
0111 
0112 
0113 /* An object of type uninorm_t denotes a Unicode normalization form.
        It is a pointer to a const struct unicode_normalization_form; the struct
        is only forward-declared in this header, not defined.  */
0114 struct unicode_normalization_form;
0115 typedef const struct unicode_normalization_form *uninorm_t;
0116 
0117 /* UNINORM_NFD: Normalization form D: canonical decomposition.
        The macro expands to the address of the extern object uninorm_nfd, which
        is a value of type uninorm_t.  */
0118 extern LIBUNISTRING_DLL_VARIABLE const struct unicode_normalization_form uninorm_nfd;
0119 #define UNINORM_NFD (&uninorm_nfd)
0120 
0121 /* UNINORM_NFC: Normalization form C: canonical decomposition, then
0122    canonical composition.
        The macro expands to the address of the extern object uninorm_nfc, which
        is a value of type uninorm_t.  */
0123 extern LIBUNISTRING_DLL_VARIABLE const struct unicode_normalization_form uninorm_nfc;
0124 #define UNINORM_NFC (&uninorm_nfc)
0125 
0125 /* UNINORM_NFKD: Normalization form KD: compatibility decomposition.
        The macro expands to the address of the extern object uninorm_nfkd, which
        is a value of type uninorm_t.  */
0126 extern LIBUNISTRING_DLL_VARIABLE const struct unicode_normalization_form uninorm_nfkd;
0127 #define UNINORM_NFKD (&uninorm_nfkd)
0129 
0129 /* UNINORM_NFKC: Normalization form KC: compatibility decomposition, then
0130    canonical composition.
        The macro expands to the address of the extern object uninorm_nfkc, which
        is a value of type uninorm_t.  */
0131 extern LIBUNISTRING_DLL_VARIABLE const struct unicode_normalization_form uninorm_nfkc;
0132 #define UNINORM_NFKC (&uninorm_nfkc)
0134 
0135 /* Test whether a normalization form does compatibility decomposition.
        Yields 1 or 0: bit 0 of the unsigned int read through the pointer NF.
        NF is evaluated exactly once.
        NOTE(review): this relies on struct unicode_normalization_form starting
        with an unsigned int that holds the flag bits; the struct definition is
        not visible in this header -- confirm.  */
0136 #define uninorm_is_compat_decomposing(nf) \
0137   ((* (const unsigned int *) (nf) >> 0) & 1)
0138 
0139 /* Test whether a normalization form includes canonical composition.
        Yields 1 or 0: bit 1 of the unsigned int read through the pointer NF.
        NF is evaluated exactly once.
        NOTE(review): this relies on struct unicode_normalization_form starting
        with an unsigned int that holds the flag bits; the struct definition is
        not visible in this header -- confirm.  */
0140 #define uninorm_is_composing(nf) \
0141   ((* (const unsigned int *) (nf) >> 1) & 1)
0142 
0143 /* Return the decomposing variant of a normalization form, i.e. the form
        that performs the same kind of decomposition without the composition step.
0144    This maps NFC,NFD -> NFD and NFKC,NFKD -> NFKD.  */
0145 extern uninorm_t
0146        uninorm_decomposing_form (uninorm_t nf)
0147        _UC_ATTRIBUTE_PURE;
0148 
0149 
0150 /* Return the specified normalization form of a string.
        NF selects the form, (S, N) is the input string, and (RESULTBUF, LENGTHP)
        is the result pair described under "Conventions" at the top of this file:
        the return value is RESULTBUF or a freshly allocated string, with its
        length stored in *LENGTHP, or NULL with errno set in case of error.  */
0151 extern uint8_t *
0152        u8_normalize (uninorm_t nf, const uint8_t *s, size_t n,
0153                      uint8_t *_UC_RESTRICT resultbuf, size_t *lengthp);
0154 extern uint16_t *
0155        u16_normalize (uninorm_t nf, const uint16_t *s, size_t n,
0156                       uint16_t *_UC_RESTRICT resultbuf, size_t *lengthp);
0157 extern uint32_t *
0158        u32_normalize (uninorm_t nf, const uint32_t *s, size_t n,
0159                       uint32_t *_UC_RESTRICT resultbuf, size_t *lengthp);
0160 
0161 
0162 /* Compare S1 and S2, ignoring differences in normalization.
        S1 has N1 units and S2 has N2 units.
0163    NF must be either UNINORM_NFD or UNINORM_NFKD.
0164    If successful, set *RESULTP to -1 if S1 < S2, 0 if S1 = S2, 1 if S1 > S2, and
0165    return 0.  Upon failure, return -1 with errno set.  */
0166 extern int
0167        u8_normcmp (const uint8_t *s1, size_t n1, const uint8_t *s2, size_t n2,
0168                    uninorm_t nf, int *resultp);
0169 extern int
0170        u16_normcmp (const uint16_t *s1, size_t n1, const uint16_t *s2, size_t n2,
0171                     uninorm_t nf, int *resultp);
0172 extern int
0173        u32_normcmp (const uint32_t *s1, size_t n1, const uint32_t *s2, size_t n2,
0174                     uninorm_t nf, int *resultp);
0175 
0176 
0177 /* Convert the string S of length N to a NUL-terminated byte sequence, in such
0178    a way that comparing uN_normxfrm (S1) and uN_normxfrm (S2) with uN_cmp2() is
0179    equivalent to comparing S1 and S2 with uN_normcoll().
0180    NF must be either UNINORM_NFC or UNINORM_NFKC.
        NOTE(review): (RESULTBUF, LENGTHP) presumably follow the result-pair
        conventions described at the top of this file -- confirm.  */
0181 extern char *
0182        u8_normxfrm (const uint8_t *s, size_t n, uninorm_t nf,
0183                     char *resultbuf, size_t *lengthp);
0184 extern char *
0185        u16_normxfrm (const uint16_t *s, size_t n, uninorm_t nf,
0186                      char *resultbuf, size_t *lengthp);
0187 extern char *
0188        u32_normxfrm (const uint32_t *s, size_t n, uninorm_t nf,
0189                      char *resultbuf, size_t *lengthp);
0190 
0191 
0192 /* Compare S1 and S2, ignoring differences in normalization, using the
0193    collation rules of the current locale.
        S1 has N1 units and S2 has N2 units.
0194    NF must be either UNINORM_NFC or UNINORM_NFKC.
0195    If successful, set *RESULTP to -1 if S1 < S2, 0 if S1 = S2, 1 if S1 > S2, and
0196    return 0.  Upon failure, return -1 with errno set.  */
0197 extern int
0198        u8_normcoll (const uint8_t *s1, size_t n1, const uint8_t *s2, size_t n2,
0199                     uninorm_t nf, int *resultp);
0200 extern int
0201        u16_normcoll (const uint16_t *s1, size_t n1, const uint16_t *s2, size_t n2,
0202                      uninorm_t nf, int *resultp);
0203 extern int
0204        u32_normcoll (const uint32_t *s1, size_t n1, const uint32_t *s2, size_t n2,
0205                      uninorm_t nf, int *resultp);
0206 
0207 
0208 /* Normalization of a stream of Unicode characters.
0209 
0210    A "stream of Unicode characters" is essentially a function that accepts a
0211    ucs4_t argument repeatedly, optionally combined with a function that
0212    "flushes" the stream.  */
0213 
0214 /* Data type of a stream of Unicode characters that normalizes its input
0215    according to a given normalization form and passes the normalized character
0216    sequence to the encapsulated stream of Unicode characters.
        The struct is only forward-declared here; the functions below take and
        return pointers to it.  */
0217 struct uninorm_filter;
0218 
0219 /* Bring data buffered in the filter to its destination, the encapsulated
0220    stream, then close and free the filter.
0221    Return 0 if successful, or -1 with errno set upon failure.
        This declaration precedes uninorm_filter_create because that declaration
        names uninorm_filter_free in its _GL_ATTRIBUTE_DEALLOC annotation.  */
0222 extern int
0223        uninorm_filter_free (struct uninorm_filter *filter);
0224 
0225 /* Create and return a normalization filter for Unicode characters.
        NF is the normalization form the filter applies to its input.
0226    The pair (stream_func, stream_data) is the encapsulated stream.
0227    stream_func (stream_data, uc) receives the Unicode character uc
0228    and returns 0 if successful, or -1 with errno set upon failure.
0229    Return the new filter, or NULL with errno set upon failure.
        NOTE(review): the _GL_ATTRIBUTE_DEALLOC annotation suggests the result is
        to be released with uninorm_filter_free.  The macro is not defined in
        this file and does not carry the _UC_ prefix of the other attribute
        macros used here -- confirm which header provides it.  */
0230 extern struct uninorm_filter *
0231        uninorm_filter_create (uninorm_t nf,
0232                               int (*stream_func) (void *stream_data, ucs4_t uc),
0233                               void *stream_data)
0234        _GL_ATTRIBUTE_DEALLOC (uninorm_filter_free, 1);
0235 
0236 /* Stuff a Unicode character into a normalizing filter.
        The filter may hold data in a buffer until uninorm_filter_flush or
        uninorm_filter_free brings it to the encapsulated stream.
0237    Return 0 if successful, or -1 with errno set upon failure.  */
0238 extern int
0239        uninorm_filter_write (struct uninorm_filter *filter, ucs4_t uc);
0240 
0241 /* Bring data buffered in the filter to its destination, the encapsulated
0242    stream.  Unlike uninorm_filter_free, this does not close or free the
        filter.
0243    Return 0 if successful, or -1 with errno set upon failure.
0244    Note! If after calling this function, additional characters are written
0245    into the filter, the resulting character sequence in the encapsulated stream
0246    will not necessarily be normalized.  */
0247 extern int
0248        uninorm_filter_flush (struct uninorm_filter *filter);
0249 
0250 
0251 #ifdef __cplusplus
0252 }
0253 #endif
0254 
0255 
0256 #endif /* _UNINORM_H */