|
|
|||
File indexing completed on 2026-05-10 08:44:35
0001 //===- llvm/Support/Unicode.h - Unicode character properties -*- C++ -*-=====// 0002 // 0003 // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. 0004 // See https://llvm.org/LICENSE.txt for license information. 0005 // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception 0006 // 0007 //===----------------------------------------------------------------------===// 0008 // 0009 // This file defines functions that allow querying certain properties of Unicode 0010 // characters. 0011 // 0012 //===----------------------------------------------------------------------===// 0013 0014 #ifndef LLVM_SUPPORT_UNICODE_H 0015 #define LLVM_SUPPORT_UNICODE_H 0016 0017 #include "llvm/ADT/SmallString.h" 0018 #include <optional> 0019 #include <string> 0020 0021 namespace llvm { 0022 class StringRef; 0023 0024 namespace sys { 0025 namespace unicode { 0026 0027 enum ColumnWidthErrors { 0028 ErrorInvalidUTF8 = -2, 0029 ErrorNonPrintableCharacter = -1 0030 }; 0031 0032 /// Determines if a character is likely to be displayed correctly on the 0033 /// terminal. Exact implementation would have to depend on the specific 0034 /// terminal, so we define the semantic that should be suitable for generic case 0035 /// of a terminal capable to output Unicode characters. 0036 /// 0037 /// Printable codepoints are those in the categories L, M, N, P, S and Zs 0038 /// \return true if the character is considered printable. 0039 bool isPrintable(int UCS); 0040 0041 // Formatting codepoints are codepoints in the Cf category. 0042 bool isFormatting(int UCS); 0043 0044 /// Gets the number of positions the UTF8-encoded \p Text is likely to occupy 0045 /// when output on a terminal ("character width"). This depends on the 0046 /// implementation of the terminal, and there's no standard definition of 0047 /// character width. 0048 /// 0049 /// The implementation defines it in a way that is expected to be compatible 0050 /// with a generic Unicode-capable terminal. 0051 /// 0052 /// \return Character width: 0053 /// * ErrorNonPrintableCharacter (-1) if \p Text contains non-printable 0054 /// characters (as identified by isPrintable); 0055 /// * 0 for each non-spacing and enclosing combining mark; 0056 /// * 2 for each CJK character excluding halfwidth forms; 0057 /// * 1 for each of the remaining characters. 0058 int columnWidthUTF8(StringRef Text); 0059 0060 /// Fold input unicode character according the Simple unicode case folding 0061 /// rules. 0062 int foldCharSimple(int C); 0063 0064 /// Maps the name or the alias of a Unicode character to its associated 0065 /// codepoints. 0066 /// The names and aliases are derived from UnicodeData.txt and NameAliases.txt 0067 /// For compatibility with the semantics of named character escape sequences in 0068 /// C++, this mapping does an exact match sensitive to casing and spacing. 0069 /// \return The codepoint of the corresponding character, if any. 0070 std::optional<char32_t> nameToCodepointStrict(StringRef Name); 0071 0072 struct LooseMatchingResult { 0073 char32_t CodePoint; 0074 SmallString<64> Name; 0075 }; 0076 0077 std::optional<LooseMatchingResult> nameToCodepointLooseMatching(StringRef Name); 0078 0079 struct MatchForCodepointName { 0080 std::string Name; 0081 uint32_t Distance = 0; 0082 char32_t Value = 0; 0083 }; 0084 0085 SmallVector<MatchForCodepointName> 0086 nearestMatchesForCodepointName(StringRef Pattern, std::size_t MaxMatchesCount); 0087 0088 } // namespace unicode 0089 } // namespace sys 0090 } // namespace llvm 0091 0092 #endif
| [ Source navigation ] | [ Diff markup ] | [ Identifier search ] | [ general search ] |
|
This page was automatically generated by the 2.3.7 LXR engine. The LXR team |
|