Back to home page

EIC code displayed by LXR

 
 

    


File indexing completed on 2025-01-17 09:56:13

0001 /* DO NOT EDIT! GENERATED AUTOMATICALLY! */
0002 /* Normalization forms (composition and decomposition) of Unicode strings.
0003    Copyright (C) 2001-2002, 2009-2024 Free Software Foundation, Inc.
0004    Written by Bruno Haible <bruno@clisp.org>, 2009.
0005 
0006    This file is free software: you can redistribute it and/or modify
0007    it under the terms of the GNU Lesser General Public License as
0008    published by the Free Software Foundation; either version 2.1 of the
0009    License, or (at your option) any later version.
0010 
0011    This file is distributed in the hope that it will be useful,
0012    but WITHOUT ANY WARRANTY; without even the implied warranty of
0013    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
0014    GNU Lesser General Public License for more details.
0015 
0016    You should have received a copy of the GNU Lesser General Public License
0017    along with this program.  If not, see <https://www.gnu.org/licenses/>.  */
0018 
0019 #ifndef _UNINORM_H
0020 #define _UNINORM_H
0021 
0022 /* Get common macros for C.  */
0023 #include <unistring/cdefs.h>
0024 
0025 /* Get size_t.  */
0026 #include <stddef.h>
0027 
0028 #include "unitypes.h"
0029 
0030 #if 1
0031 # include <unistring/woe32dll.h>
0032 #else
0033 # define LIBUNISTRING_DLL_VARIABLE
0034 #endif
0035 
0036 
0037 #ifdef __cplusplus
0038 extern "C" {
0039 #endif
0040 
0041 
0042 /* Conventions:
0043 
0044    All functions prefixed with u8_ operate on UTF-8 encoded strings.
0045    Their unit is an uint8_t (1 byte).
0046 
0047    All functions prefixed with u16_ operate on UTF-16 encoded strings.
0048    Their unit is an uint16_t (a 2-byte word).
0049 
0050    All functions prefixed with u32_ operate on UCS-4 encoded strings.
0051    Their unit is an uint32_t (a 4-byte word).
0052 
0053    All argument pairs (s, n) denote a Unicode string s[0..n-1] with exactly
0054    n units.
0055 
0056    Functions returning a string result take a (resultbuf, lengthp) argument
0057    pair.  If resultbuf is not NULL and the result fits into *lengthp units,
0058    it is put in resultbuf, and resultbuf is returned.  Otherwise, a freshly
0059    allocated string is returned.  In both cases, *lengthp is set to the
0060    length (number of units) of the returned string.  In case of error,
0061    NULL is returned and errno is set.  */
0062 
0063 /* Decomposition types: the kind of mapping that a character decomposition represents.  */
0064 enum
0065 {
0066   UC_DECOMP_CANONICAL,/*            Canonical decomposition.                  */
0067   UC_DECOMP_FONT,    /*   <font>    A font variant (e.g. a blackletter form). */
0068   UC_DECOMP_NOBREAK, /* <noBreak>   A no-break version of a space or hyphen.  */
0069   UC_DECOMP_INITIAL, /* <initial>   An initial presentation form (Arabic).    */
0070   UC_DECOMP_MEDIAL,  /*  <medial>   A medial presentation form (Arabic).      */
0071   UC_DECOMP_FINAL,   /*  <final>    A final presentation form (Arabic).       */
0072   UC_DECOMP_ISOLATED,/* <isolated>  An isolated presentation form (Arabic).   */
0073   UC_DECOMP_CIRCLE,  /*  <circle>   An encircled form.                        */
0074   UC_DECOMP_SUPER,   /*  <super>    A superscript form.                       */
0075   UC_DECOMP_SUB,     /*   <sub>     A subscript form.                         */
0076   UC_DECOMP_VERTICAL,/* <vertical>  A vertical layout presentation form.      */
0077   UC_DECOMP_WIDE,    /*   <wide>    A wide (or zenkaku) compatibility character. */
0078   UC_DECOMP_NARROW,  /*  <narrow>   A narrow (or hankaku) compatibility character. */
0079   UC_DECOMP_SMALL,   /*  <small>    A small variant form (CNS compatibility). */
0080   UC_DECOMP_SQUARE,  /*  <square>   A CJK squared font variant.               */
0081   UC_DECOMP_FRACTION,/* <fraction>  A vulgar fraction form.                   */
0082   UC_DECOMP_COMPAT   /*  <compat>   Otherwise unspecified compatibility character. */
0083 };
0084 
0085 /* Maximum size, in ucs4_t elements, of the decomposition of a single Unicode character.  */
0086 #define UC_DECOMPOSITION_MAX_LENGTH 32
0087 
0088 /* Return the character decomposition mapping of the Unicode character UC.
0089    DECOMPOSITION must point to an array of at least UC_DECOMPOSITION_MAX_LENGTH
0090    ucs4_t elements.
0091    When a decomposition exists, DECOMPOSITION[0..N-1] and *DECOMP_TAG are
0092    filled and N is returned.  Otherwise -1 is returned.  */
0093 extern int
0094        uc_decomposition (ucs4_t uc, int *decomp_tag, ucs4_t *decomposition);
0095 
0096 /* Return the canonical character decomposition mapping of the Unicode character UC.
0097    DECOMPOSITION must point to an array of at least UC_DECOMPOSITION_MAX_LENGTH
0098    ucs4_t elements.
0099    When a decomposition exists, DECOMPOSITION[0..N-1] is filled and N is
0100    returned.  Otherwise -1 is returned.  */
0101 extern int
0102        uc_canonical_decomposition (ucs4_t uc, ucs4_t *decomposition);
0103 
0104 
0105 /* Attempt to combine the Unicode characters UC1 and UC2.
0106    Precondition: UC1 is known to have canonical combining class 0.
0107    Return the combination of UC1 and UC2, if it exists.
0108    Return 0 otherwise.
0109    Not all decompositions can be recombined using this function.  See the
0110    Unicode file CompositionExclusions.txt for details.  */
0111 extern ucs4_t
0112        uc_composition (ucs4_t uc1, ucs4_t uc2)
0113        _UC_ATTRIBUTE_CONST;
0114 
0115 
0116 /* An object of type uninorm_t denotes a Unicode normalization form; it points to a const struct that is only forward-declared here.  */
0117 struct unicode_normalization_form;
0118 typedef const struct unicode_normalization_form *uninorm_t;
0119 
0120 /* UNINORM_NFD: Normalization form D: canonical decomposition.  Like the other UNINORM_* macros below, it is the address of an extern object, usable as a uninorm_t.  */
0121 extern LIBUNISTRING_DLL_VARIABLE const struct unicode_normalization_form uninorm_nfd;
0122 #define UNINORM_NFD (&uninorm_nfd)
0123 
0124 /* UNINORM_NFC: Normalization form C: canonical decomposition, then
0125    canonical composition.  */
0126 extern LIBUNISTRING_DLL_VARIABLE const struct unicode_normalization_form uninorm_nfc;
0127 #define UNINORM_NFC (&uninorm_nfc)
0128 
0129 /* UNINORM_NFKD: Normalization form KD: compatibility decomposition.  */
0130 extern LIBUNISTRING_DLL_VARIABLE const struct unicode_normalization_form uninorm_nfkd;
0131 #define UNINORM_NFKD (&uninorm_nfkd)
0132 
0133 /* UNINORM_NFKC: Normalization form KC: compatibility decomposition, then
0134    canonical composition.  */
0135 extern LIBUNISTRING_DLL_VARIABLE const struct unicode_normalization_form uninorm_nfkc;
0136 #define UNINORM_NFKC (&uninorm_nfkc)
0137 
0138 /* Test whether the normalization form NF does compatibility decomposition: yields bit 0 of the unsigned int read through NF (presumably a flags word at the start of the struct, which is defined outside this header).  */
0139 #define uninorm_is_compat_decomposing(nf) \
0140   ((* (const unsigned int *) (nf) >> 0) & 1)
0141 
0142 /* Test whether the normalization form NF includes canonical composition: yields bit 1 of the unsigned int read through NF (presumably a flags word at the start of the struct, which is defined outside this header).  */
0143 #define uninorm_is_composing(nf) \
0144   ((* (const unsigned int *) (nf) >> 1) & 1)
0145 
0146 /* Return the decomposing variant of the normalization form NF.
0147    This maps NFC,NFD -> NFD and NFKC,NFKD -> NFKD.  */
0148 extern uninorm_t
0149        uninorm_decomposing_form (uninorm_t nf)
0150        _UC_ATTRIBUTE_PURE;
0151 
0152 /* Return the specified normalization form of a string: the form NF of S[0..N-1].  */
0153 /* RESULTBUF, LENGTHP and the return value follow the conventions stated near the top of this file.  */
0154 extern uint8_t *
0155        u8_normalize (uninorm_t nf, const uint8_t *s, size_t n,
0156                      uint8_t *_UC_RESTRICT resultbuf, size_t *lengthp);
0157 extern uint16_t *
0158        u16_normalize (uninorm_t nf, const uint16_t *s, size_t n,
0159                       uint16_t *_UC_RESTRICT resultbuf, size_t *lengthp);
0160 extern uint32_t *
0161        u32_normalize (uninorm_t nf, const uint32_t *s, size_t n,
0162                       uint32_t *_UC_RESTRICT resultbuf, size_t *lengthp);
0163 
0164 
0165 /* Compare S1[0..N1-1] and S2[0..N2-1], ignoring differences in normalization.
0166    NF must be either UNINORM_NFD or UNINORM_NFKD.
0167    If successful, set *RESULTP to -1 if S1 < S2, 0 if S1 = S2, 1 if S1 > S2, and
0168    return 0.  Upon failure, return -1 with errno set.  */
0169 extern int
0170        u8_normcmp (const uint8_t *s1, size_t n1, const uint8_t *s2, size_t n2,
0171                    uninorm_t nf, int *resultp);
0172 extern int
0173        u16_normcmp (const uint16_t *s1, size_t n1, const uint16_t *s2, size_t n2,
0174                     uninorm_t nf, int *resultp);
0175 extern int
0176        u32_normcmp (const uint32_t *s1, size_t n1, const uint32_t *s2, size_t n2,
0177                     uninorm_t nf, int *resultp);
0178 
0179 
0180 /* Convert the string S of length N to a NUL-terminated byte sequence, in such
0181    a way that comparing uN_normxfrm (S1) and uN_normxfrm (S2) with uN_cmp2() is
0182    equivalent to comparing S1 and S2 with uN_normcoll().
0183    NF must be either UNINORM_NFC or UNINORM_NFKC.  NOTE(review): the result is char-based in all three variants; confirm that uN_cmp2() is the comparison meant.  */
0184 extern char *
0185        u8_normxfrm (const uint8_t *s, size_t n, uninorm_t nf,
0186                     char *resultbuf, size_t *lengthp);
0187 extern char *
0188        u16_normxfrm (const uint16_t *s, size_t n, uninorm_t nf,
0189                      char *resultbuf, size_t *lengthp);
0190 extern char *
0191        u32_normxfrm (const uint32_t *s, size_t n, uninorm_t nf,
0192                      char *resultbuf, size_t *lengthp);
0193 
0194 
0195 /* Compare S1[0..N1-1] and S2[0..N2-1], ignoring differences in normalization,
0196    using the collation rules of the current locale.
0197    NF must be either UNINORM_NFC or UNINORM_NFKC.
0198    If successful, set *RESULTP to -1 if S1 < S2, 0 if S1 = S2, 1 if S1 > S2, and
0199    return 0.  Upon failure, return -1 with errno set.  */
0200 extern int
0201        u8_normcoll (const uint8_t *s1, size_t n1, const uint8_t *s2, size_t n2,
0202                     uninorm_t nf, int *resultp);
0203 extern int
0204        u16_normcoll (const uint16_t *s1, size_t n1, const uint16_t *s2, size_t n2,
0205                      uninorm_t nf, int *resultp);
0206 extern int
0207        u32_normcoll (const uint32_t *s1, size_t n1, const uint32_t *s2, size_t n2,
0208                      uninorm_t nf, int *resultp);
0209 
0210 
0211 /* Normalization of a stream of Unicode characters.
0212 
0213    A "stream of Unicode characters" is essentially a function that accepts a
0214    ucs4_t argument repeatedly, optionally combined with a function that
0215    "flushes" the stream.  */
0216 
0217 /* Data type of a stream of Unicode characters that normalizes its input
0218    according to a given normalization form and passes the normalized character
0219    sequence to the encapsulated stream of Unicode characters.  The type is only forward-declared here and is used through pointers.  */
0220 struct uninorm_filter;
0221 
0222 /* Bring data buffered in FILTER to its destination, the encapsulated
0223    stream, then close and free the filter.
0224    Return 0 if successful, or -1 with errno set upon failure.  Declared ahead of uninorm_filter_create, whose attribute list refers to this function.  */
0225 extern int
0226        uninorm_filter_free (struct uninorm_filter *filter);
0227 
0228 /* Create and return a normalization filter for Unicode characters that normalizes according to NF.
0229    The pair (stream_func, stream_data) is the encapsulated stream.
0230    stream_func (stream_data, uc) receives the Unicode character uc
0231    and returns 0 if successful, or -1 with errno set upon failure.
0232    Return the new filter, or NULL with errno set upon failure.  Dispose of the filter with uninorm_filter_free; the _GL_ATTRIBUTE_DEALLOC annotation (macro defined outside this header) presumably conveys that pairing to the compiler.  */
0233 extern struct uninorm_filter *
0234        uninorm_filter_create (uninorm_t nf,
0235                               int (*stream_func) (void *stream_data, ucs4_t uc),
0236                               void *stream_data)
0237        _GL_ATTRIBUTE_DEALLOC (uninorm_filter_free, 1);
0238 
0239 /* Stuff the Unicode character UC into the normalizing filter FILTER.
0240    Return 0 if successful, or -1 with errno set upon failure.  */
0241 extern int
0242        uninorm_filter_write (struct uninorm_filter *filter, ucs4_t uc);
0243 
0244 /* Bring data buffered in FILTER to its destination, the encapsulated
0245    stream.  The filter stays usable afterwards (contrast uninorm_filter_free).
0246    Return 0 if successful, or -1 with errno set upon failure.
0247    Note! If after calling this function, additional characters are written
0248    into the filter, the resulting character sequence in the encapsulated stream
0249    will not necessarily be normalized.  */
0250 extern int
0251        uninorm_filter_flush (struct uninorm_filter *filter);
0252 
0253 
0254 #ifdef __cplusplus
0255 }
0256 #endif
0257 
0258 
0259 #endif /* _UNINORM_H */