Back to home page

EIC code displayed by LXR

 
 

    


File indexing completed on 2025-12-10 10:23:54

0001 //========================================================================
0002 //
0003 // UTF.h
0004 //
0005 // This file is licensed under the GPLv2 or later
0006 //
0007 // Copyright (C) 2012, 2017, 2021 Adrian Johnson <ajohnson@redneon.com>
0008 // Copyright (C) 2016 Jason Crain <jason@aquaticape.us>
0009 // Copyright (C) 2018 Klarälvdalens Datakonsult AB, a KDAB Group company, <info@kdab.com>. Work sponsored by the LiMux project of the city of Munich
0010 // Copyright (C) 2018 Nelson Benítez León <nbenitezl@gmail.com>
0011 // Copyright (C) 2019-2022 Albert Astals Cid <aacid@kde.org>
0012 // Copyright (C) 2021 Georgiy Sgibnev <georgiy@sgibnev.com>. Work sponsored by lab50.net.
0013 //
0014 //========================================================================
0015 
0016 #ifndef UTF_H
0017 #define UTF_H
0018 
0019 #include <cstdint>
0020 #include <climits>
0021 #include <memory>
0022 
0023 #include "goo/GooString.h"
0024 #include "CharTypes.h"
0025 #include "poppler_private_export.h"
0026 
0027 // Convert a UTF-16 string to a UCS-4 string.
0028 //   utf16      - UTF-16 input, passed as an array of Unicode values
0029 //   utf16Len   - number of UTF-16 characters in utf16
0030 //   ucs4_out   - if not NULL, receives a newly allocated UCS-4 string. Free with gfree.
0031 //   returns the number of UCS-4 characters
0032 int UTF16toUCS4(const Unicode *utf16, int utf16Len, Unicode **ucs4_out);
0033 
0034 // Convert a PDF Text String to UCS-4.
0035 //   textStr    - PDF text string
0036 //   ucs4       - if the number of UCS-4 characters is > 0, receives a newly
0037 //                allocated UCS-4 string. Free with gfree.
0038 //   returns the number of UCS-4 characters
0039 int POPPLER_PRIVATE_EXPORT TextStringToUCS4(const std::string &textStr, Unicode **ucs4);
0040 
0041 // Check whether a UCS-4 character is valid.
0042 bool UnicodeIsValid(Unicode ucs4);
0043 
0044 // Check whether a UCS-4 character is a Unicode whitespace character.
0045 bool UnicodeIsWhitespace(Unicode ucs4);
0046 
0047 // Count the number of UCS-4 characters required to convert a UTF-8 string
0048 // to UCS-4 (excluding the terminating null).
0049 int POPPLER_PRIVATE_EXPORT utf8CountUCS4(const char *utf8);
0050 
0051 // Convert a UTF-8 string to a UCS-4 string.
0052 //   utf8       - UTF-8 bytes
0053 //   ucs4_out   - if not NULL, receives a newly allocated UCS-4 string. Free with gfree.
0054 //   returns the number of UCS-4 characters
0055 int POPPLER_PRIVATE_EXPORT utf8ToUCS4(const char *utf8, Unicode **ucs4_out);
0056 
0057 // Count the number of UTF-16 code units required to convert a UTF-8 string
0058 // (excluding the terminating null). Each invalid byte is counted as a
0059 // code point since the UTF-8 conversion functions will replace it with
0060 // REPLACEMENT_CHAR.
0061 int POPPLER_PRIVATE_EXPORT utf8CountUtf16CodeUnits(const char *utf8);
0062 
0063 // Convert UTF-8 to UTF-16.
0064 //  utf8 - UTF-8 string to convert. If not null terminated, set maxUtf8 to the
0065 //         number of bytes to convert.
0066 //  utf16 - output buffer to write UTF-16 to. Output will always be null terminated.
0067 //  maxUtf16 - maximum size of output buffer including space for null. Defaults to INT_MAX.
0068 //  maxUtf8 - maximum number of UTF-8 bytes to convert. Conversion stops when
0069 //            either this count is reached or a null is encountered. Defaults to INT_MAX.
0070 // Returns the number of UTF-16 code units written (excluding the null).
0071 int POPPLER_PRIVATE_EXPORT utf8ToUtf16(const char *utf8, uint16_t *utf16, int maxUtf16 = INT_MAX, int maxUtf8 = INT_MAX);
0072 
0073 // Allocate a UTF-16 string and convert utf8 into it. len is optional (defaults to nullptr); presumably it receives the resulting length. NOTE(review): how the caller must free the result is not stated here - confirm against the implementation.
0074 uint16_t POPPLER_PRIVATE_EXPORT *utf8ToUtf16(const char *utf8, int *len = nullptr);
0075 
0076 // Converts a UTF-8 string to a big endian UTF-16 string with BOM.
0077 // The result is handed back as a std::unique_ptr<GooString>, so the caller owns it.
0078 //  utf8 - UTF-8 string to convert. An empty string is acceptable.
0079 // Returns a big endian UTF-16 string with BOM or an empty string without BOM.
0080 std::unique_ptr<GooString> POPPLER_PRIVATE_EXPORT utf8ToUtf16WithBom(const std::string &utf8);
0081 
0082 // Count the number of UTF-8 bytes required to convert a UTF-16 string to
0083 // UTF-8 (excluding the terminating null).
0084 int POPPLER_PRIVATE_EXPORT utf16CountUtf8Bytes(const uint16_t *utf16);
0085 
0086 // Convert UTF-16 to UTF-8.
0087 //  utf16 - UTF-16 string to convert. If not null terminated, set maxUtf16 to the
0088 //          number of code units to convert.
0089 //  utf8 - output buffer to write UTF-8 to. Output will always be null terminated.
0090 //  maxUtf8 - maximum size of output buffer including space for null. Defaults to INT_MAX.
0091 //  maxUtf16 - maximum number of UTF-16 code units to convert. Conversion stops when
0092 //            either this count is reached or a null is encountered. Defaults to INT_MAX.
0093 // Returns the number of UTF-8 bytes written (excluding the null).
0094 int POPPLER_PRIVATE_EXPORT utf16ToUtf8(const uint16_t *utf16, char *utf8, int maxUtf8 = INT_MAX, int maxUtf16 = INT_MAX);
0095 
0096 // Allocate a UTF-8 string and convert utf16 into it. len is optional (defaults to nullptr); presumably it receives the resulting length. NOTE(review): how the caller must free the result is not stated here - confirm against the implementation.
0097 char POPPLER_PRIVATE_EXPORT *utf16ToUtf8(const uint16_t *utf16, int *len = nullptr);
0098 
0099 // Convert a UCS-4 string to pure ASCII (7bit).
0100 //   in       - UCS-4 input string
0101 //   len      - number of UCS-4 characters in @in
0102 //   ucs4_out - if not NULL, receives a newly allocated UCS-4 string. Free with gfree.
0103 //   out_len  - number of UCS-4 characters in @ucs4_out.
0104 //   in_idx   - if not NULL, the int array returned through the fourth (output) parameter of
0105 //              the unicodeNormalizeNFKC() function. Optional; needed only for the @indices out parameter.
0106 //   indices  - if not NULL, @indices is assigned the location of a newly allocated array
0107 //              of length @out_len + 1 that gives, for each character in the ASCII string, the index
0108 //              of the corresponding character in the text of the line (derived from the
0109 //              information passed in the @in_idx parameter).
0110 void POPPLER_PRIVATE_EXPORT unicodeToAscii7(const Unicode *in, int len, Unicode **ucs4_out, int *out_len, const int *in_idx, int **indices);
0111 
0112 #endif