llvm/Support/ConvertUTF.h

0001 /*===--- ConvertUTF.h - Universal Character Names conversions ---------------===
0002  *
0003  * Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
0004  * See https://llvm.org/LICENSE.txt for license information.
0005  * SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
0006  *
0007  *==------------------------------------------------------------------------==*/
0008 /*
0009  * Copyright © 1991-2015 Unicode, Inc. All rights reserved.
0010  * Distributed under the Terms of Use in
0011  * http://www.unicode.org/copyright.html.
0012  *
0013  * Permission is hereby granted, free of charge, to any person obtaining
0014  * a copy of the Unicode data files and any associated documentation
0015  * (the "Data Files") or Unicode software and any associated documentation
0016  * (the "Software") to deal in the Data Files or Software
0017  * without restriction, including without limitation the rights to use,
0018  * copy, modify, merge, publish, distribute, and/or sell copies of
0019  * the Data Files or Software, and to permit persons to whom the Data Files
0020  * or Software are furnished to do so, provided that
0021  * (a) this copyright and permission notice appear with all copies
0022  * of the Data Files or Software,
0023  * (b) this copyright and permission notice appear in associated
0024  * documentation, and
0025  * (c) there is clear notice in each modified Data File or in the Software
0026  * as well as in the documentation associated with the Data File(s) or
0027  * Software that the data or software has been modified.
0028  *
0029  * THE DATA FILES AND SOFTWARE ARE PROVIDED "AS IS", WITHOUT WARRANTY OF
0030  * ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE
0031  * WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
0032  * NONINFRINGEMENT OF THIRD PARTY RIGHTS.
0033  * IN NO EVENT SHALL THE COPYRIGHT HOLDER OR HOLDERS INCLUDED IN THIS
0034  * NOTICE BE LIABLE FOR ANY CLAIM, OR ANY SPECIAL INDIRECT OR CONSEQUENTIAL
0035  * DAMAGES, OR ANY DAMAGES WHATSOEVER RESULTING FROM LOSS OF USE,
0036  * DATA OR PROFITS, WHETHER IN AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER
0037  * TORTIOUS ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE OR
0038  * PERFORMANCE OF THE DATA FILES OR SOFTWARE.
0039  *
0040  * Except as contained in this notice, the name of a copyright holder
0041  * shall not be used in advertising or otherwise to promote the sale,
0042  * use or other dealings in these Data Files or Software without prior
0043  * written authorization of the copyright holder.
0044  */
0045
0046 /* ---------------------------------------------------------------------
0047
0048     Conversions between UTF-32, UTF-16, and UTF-8.  Header file.
0049
0050     Several functions are included here, forming a complete set of
0051     conversions between the three formats.  UTF-7 is not included
0052     here, but is handled in a separate source file.
0053
0054     Each of these routines takes pointers to input buffers and output
0055     buffers.  The input buffers are const.
0056
0057     Each routine converts the text between *sourceStart and sourceEnd,
0058     putting the result into the buffer between *targetStart and
0059     targetEnd. Note: the end pointers are *after* the last item: e.g.
0060     *(sourceEnd - 1) is the last item.
0061
0062     The return result indicates whether the conversion was successful,
0063     and if not, whether the problem was in the source or target buffers.
0064     (Only the first encountered problem is indicated.)
0065
0066     After the conversion, *sourceStart and *targetStart are both
0067     updated to point to the end of last text successfully converted in
0068     the respective buffers.
0069
0070     Input parameters:
0071         sourceStart - pointer to a pointer to the source buffer.
0072                 The contents of this are modified on return so that
0073                 it points at the next thing to be converted.
0074         targetStart - similarly, pointer to pointer to the target buffer.
0075         sourceEnd, targetEnd - respectively pointers to the ends of the
0076                 two buffers, for overflow checking only.
0077
0078     These conversion functions take a ConversionFlags argument. When this
0079     flag is set to strict, both irregular sequences and isolated surrogates
0080     will cause an error.  When the flag is set to lenient, both irregular
0081     sequences and isolated surrogates are converted.
0082
0083     Whether the flag is strict or lenient, all illegal sequences will cause
0084     an error return. This includes sequences such as: <F4 90 80 80>, <C0 80>,
0085     or <A0> in UTF-8, and values above 0x10FFFF in UTF-32. Conformant code
0086     must check for illegal sequences.
0087
0088     When the flag is set to lenient, characters over 0x10FFFF are converted
0089     to the replacement character; otherwise (when the flag is set to strict)
0090     they constitute an error.
0091
0092     Output parameters:
0093         The value "sourceIllegal" is returned from some routines if the input
0094         sequence is malformed.  When "sourceIllegal" is returned, the source
0095         value will point to the illegal value that caused the problem. E.g.,
0096         in UTF-8 when a sequence is malformed, it points to the start of the
0097         malformed sequence.
0098
0099     Author: Mark E. Davis, 1994.
0100     Rev History: Rick McGowan, fixes & updates May 2001.
0101          Fixes & updates, Sept 2001.
0102
0103 ------------------------------------------------------------------------ */
0104
0105 #ifndef LLVM_SUPPORT_CONVERTUTF_H
0106 #define LLVM_SUPPORT_CONVERTUTF_H
0107
0108 #include <cstddef>
0109 #include <string>
0110
0111 #if defined(_WIN32)
0112 #include <system_error>
0113 #endif
0114
0115 // Wrap everything in namespace llvm so that programs can link with llvm and
0116 // their own version of the unicode libraries.
0117
0118 namespace llvm {
0119
0120 /* ---------------------------------------------------------------------
0121     The following 4 definitions are compiler-specific.
0122     The C standard does not guarantee that wchar_t has at least
0123     16 bits, so wchar_t is no less portable than unsigned short!
0124     All should be unsigned values to avoid sign extension during
0125     bit mask & shift operations.
0126 ------------------------------------------------------------------------ */
0127
0128 typedef unsigned int    UTF32;  /* UTF-32 code unit; at least 32 bits */
0129 typedef unsigned short  UTF16;  /* UTF-16 code unit; at least 16 bits */
0130 typedef unsigned char   UTF8;   /* UTF-8 code unit; typically 8 bits */
0131 typedef unsigned char   Boolean; /* 0 or 1; result type of the isLegalUTF8* checks */
0132
0133 /* Some fundamental constants */
0134 #define UNI_REPLACEMENT_CHAR (UTF32)0x0000FFFD /* U+FFFD REPLACEMENT CHARACTER */
0135 #define UNI_MAX_BMP (UTF32)0x0000FFFF /* last code point of the Basic Multilingual Plane */
0136 #define UNI_MAX_UTF16 (UTF32)0x0010FFFF /* same value as UNI_MAX_LEGAL_UTF32 */
0137 #define UNI_MAX_UTF32 (UTF32)0x7FFFFFFF /* largest 31-bit value */
0138 #define UNI_MAX_LEGAL_UTF32 (UTF32)0x0010FFFF /* UTF-32 values above this are illegal */
0139
0140 #define UNI_MAX_UTF8_BYTES_PER_CODE_POINT 4 /* output buffer size required by ConvertCodePointToUTF8 */
0141
0142 #define UNI_UTF16_BYTE_ORDER_MARK_NATIVE  0xFEFF /* BOM read in the host's byte order */
0143 #define UNI_UTF16_BYTE_ORDER_MARK_SWAPPED 0xFFFE /* the same two bytes read in reverse order */
0144
0145 #define UNI_UTF32_BYTE_ORDER_MARK_NATIVE 0x0000FEFF /* BOM read in the host's byte order */
0146 #define UNI_UTF32_BYTE_ORDER_MARK_SWAPPED 0xFFFE0000 /* the same four bytes read in reverse order */
0147
0148 typedef enum {            /* status returned by the ConvertUTF* routines */
0149   conversionOK,           /* conversion successful */
0150   sourceExhausted,        /* partial character in source, but hit end */
0151   targetExhausted,        /* insufficient room in target for conversion */
0152   sourceIllegal           /* source sequence is illegal/malformed */
0153 } ConversionResult;
0154
0155 typedef enum {            /* strictness selector taken by the ConvertUTF* routines */
0156   strictConversion = 0,   /* irregular sequences and isolated surrogates are errors */
0157   lenientConversion       /* irregular sequences and isolated surrogates are converted */
0158 } ConversionFlags;
0159
0160 ConversionResult ConvertUTF8toUTF16 (
0161   const UTF8** sourceStart, const UTF8* sourceEnd,
0162   UTF16** targetStart, UTF16* targetEnd, ConversionFlags flags);
0163
0164 /**
0165  * Convert a partial UTF8 sequence to UTF32.  If the sequence ends in an
0166  * incomplete code unit sequence, returns \c sourceExhausted.
0167  */
0168 ConversionResult ConvertUTF8toUTF32Partial(
0169   const UTF8** sourceStart, const UTF8* sourceEnd,
0170   UTF32** targetStart, UTF32* targetEnd, ConversionFlags flags);
0171
0172 /**
0173  * Convert a UTF8 sequence to UTF32.  If the sequence ends in an
0174  * incomplete code unit sequence, returns \c sourceIllegal.
0175  */
0176 ConversionResult ConvertUTF8toUTF32(
0177   const UTF8** sourceStart, const UTF8* sourceEnd,
0178   UTF32** targetStart, UTF32* targetEnd, ConversionFlags flags);
0179
0180 ConversionResult ConvertUTF16toUTF8 (
0181   const UTF16** sourceStart, const UTF16* sourceEnd,
0182   UTF8** targetStart, UTF8* targetEnd, ConversionFlags flags);
0183
0184 ConversionResult ConvertUTF32toUTF8 (
0185   const UTF32** sourceStart, const UTF32* sourceEnd,
0186   UTF8** targetStart, UTF8* targetEnd, ConversionFlags flags);
0187
0188 ConversionResult ConvertUTF16toUTF32 (
0189   const UTF16** sourceStart, const UTF16* sourceEnd,
0190   UTF32** targetStart, UTF32* targetEnd, ConversionFlags flags);
0191
0192 ConversionResult ConvertUTF32toUTF16 (
0193   const UTF32** sourceStart, const UTF32* sourceEnd,
0194   UTF16** targetStart, UTF16* targetEnd, ConversionFlags flags);
0195
0196 Boolean isLegalUTF8Sequence(const UTF8 *source, const UTF8 *sourceEnd);
0197
0198 Boolean isLegalUTF8String(const UTF8 **source, const UTF8 *sourceEnd);
0199
0200 unsigned getUTF8SequenceSize(const UTF8 *source, const UTF8 *sourceEnd);
0201
0202 unsigned getNumBytesForUTF8(UTF8 firstByte);
0203
0204 /*************************************************************************/
0205 /* Below are LLVM-specific wrappers of the functions above. */
0206
0207 template <typename T> class ArrayRef;
0208 template <typename T> class SmallVectorImpl;
0209 class StringRef;
0210
0211 /**
0212  * Convert a UTF8 StringRef to UTF8, UTF16, or UTF32 depending on
0213  * WideCharWidth. The converted data is written to ResultPtr, which needs to
0214  * point to at least WideCharWidth * (Source.size() + 1) bytes. On success,
0215  * ResultPtr will point one after the end of the copied string. On failure,
0216  * ResultPtr will not be changed, and ErrorPtr will be set to the location of
0217  * the first character which could not be converted.
0218  * \return true on success.
0219  */
0220 bool ConvertUTF8toWide(unsigned WideCharWidth, llvm::StringRef Source,
0221                        char *&ResultPtr, const UTF8 *&ErrorPtr);
0222
0223 /**
0224 * Converts a UTF-8 StringRef to a std::wstring.
0225 * \return true on success.
0226 */
0227 bool ConvertUTF8toWide(llvm::StringRef Source, std::wstring &Result);
0228
0229 /**
0230 * Converts a UTF-8 C-string to a std::wstring.
0231 * \return true on success.
0232 */
0233 bool ConvertUTF8toWide(const char *Source, std::wstring &Result);
0234
0235 /**
0236 * Converts a std::wstring to a UTF-8 encoded std::string.
0237 * \return true on success.
0238 */
0239 bool convertWideToUTF8(const std::wstring &Source, std::string &Result);
0240
0241
0242 /**
0243  * Convert a Unicode code point to a UTF8 sequence.
0244  *
0245  * \param Source a Unicode code point.
0246  * \param [in,out] ResultPtr pointer to the output buffer, needs to be at least
0247  * \c UNI_MAX_UTF8_BYTES_PER_CODE_POINT bytes.  On success \c ResultPtr is
0248  * updated one past end of the converted sequence.
0249  *
0250  * \returns true on success.
0251  */
0252 bool ConvertCodePointToUTF8(unsigned Source, char *&ResultPtr);
0253
0254 /**
0255  * Decode the single UTF8 sequence at the front of a buffer into one UTF32
0256  * code point.
0257  *
0258  * \param [in,out] source Address of the read position. It is handed on to
0259  * ConvertUTF8toUTF32 unchanged, so it is only moved by that routine.
0260  * \param sourceEnd One past the last byte of the source buffer.
0261  * \param [out] target Storage for exactly one converted code point.
0262  * \param flags Strict or lenient conversion; forwarded as given.
0263  *
0264  * \returns sourceExhausted if the buffer is empty or holds fewer bytes than
0265  * getNumBytesForUTF8 reports for the lead byte; otherwise the result of
0266  * ConvertUTF8toUTF32 for that one sequence (conversionOK on success).
0267  * \sa ConvertUTF8toUTF32
0268  */
0269 inline ConversionResult convertUTF8Sequence(const UTF8 **source,
0270     const UTF8 *sourceEnd, UTF32 *target, ConversionFlags flags) {
0271   const UTF8 *const lead = *source;
0272   if (lead == sourceEnd)
0273     return sourceExhausted;
0274   /* Length implied by the lead byte; the buffer may stop short of it. */
0275   const ptrdiff_t seqLen = (ptrdiff_t)getNumBytesForUTF8(*lead);
0276   if (sourceEnd - lead < seqLen)
0277     return sourceExhausted;
0278   return ConvertUTF8toUTF32(source, lead + seqLen, &target, target + 1, flags);
0279 }
0280
0281 /**
0282  * Returns true if a blob of text starts with a UTF-16 big or little endian byte
0283  * order mark.
0284  */
0285 bool hasUTF16ByteOrderMark(ArrayRef<char> SrcBytes);
0286
0287 /**
0288  * Converts a stream of raw bytes assumed to be UTF16 into a UTF8 std::string.
0289  *
0290  * \param [in] SrcBytes A buffer of what is assumed to be UTF-16 encoded text.
0291  * \param [out] Out Converted UTF-8 is stored here on success.
0292  * \returns true on success
0293  */
0294 bool convertUTF16ToUTF8String(ArrayRef<char> SrcBytes, std::string &Out);
0295
0296 /**
0297 * Converts a UTF16 string into a UTF8 std::string.
0298 *
0299 * \param [in] Src A buffer of UTF-16 encoded text.
0300 * \param [out] Out Converted UTF-8 is stored here on success.
0301 * \returns true on success
0302 */
0303 bool convertUTF16ToUTF8String(ArrayRef<UTF16> Src, std::string &Out);
0304
0305 /**
0306  * Converts a stream of raw bytes assumed to be UTF32 into a UTF8 std::string.
0307  *
0308  * \param [in] SrcBytes A buffer of what is assumed to be UTF-32 encoded text.
0309  * \param [out] Out Converted UTF-8 is stored here on success.
0310  * \returns true on success
0311  */
0312 bool convertUTF32ToUTF8String(ArrayRef<char> SrcBytes, std::string &Out);
0313
0314 /**
0315  * Converts a UTF32 string into a UTF8 std::string.
0316  *
0317  * \param [in] Src A buffer of UTF-32 encoded text.
0318  * \param [out] Out Converted UTF-8 is stored here on success.
0319  * \returns true on success
0320  */
0321 bool convertUTF32ToUTF8String(ArrayRef<UTF32> Src, std::string &Out);
0322
0323 /**
0324  * Converts a UTF-8 string into a UTF-16 string with native endianness.
0325  *
0326  * \returns true on success
0327  */
0328 bool convertUTF8ToUTF16String(StringRef SrcUTF8,
0329                               SmallVectorImpl<UTF16> &DstUTF16);
0330
0331 #if defined(_WIN32)
0332 namespace sys {
0333 namespace windows {
0334 std::error_code UTF8ToUTF16(StringRef utf8, SmallVectorImpl<wchar_t> &utf16);
0335 /// Convert to UTF16 from the current code page used in the system
0336 std::error_code CurCPToUTF16(StringRef utf8, SmallVectorImpl<wchar_t> &utf16);
0337 std::error_code UTF16ToUTF8(const wchar_t *utf16, size_t utf16_len,
0338                             SmallVectorImpl<char> &utf8);
0339 /// Convert from UTF16 to the current code page used in the system
0340 std::error_code UTF16ToCurCP(const wchar_t *utf16, size_t utf16_len,
0341                              SmallVectorImpl<char> &utf8);
0342 } // namespace windows
0343 } // namespace sys
0344 #endif
0345
0346 } /* end namespace llvm */
0347
0348 #endif