llvm/Support/Unicode.h

0001 //===- llvm/Support/Unicode.h - Unicode character properties  -*- C++ -*-=====//
0002 //
0003 // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
0004 // See https://llvm.org/LICENSE.txt for license information.
0005 // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
0006 //
0007 //===----------------------------------------------------------------------===//
0008 //
0009 // This file defines functions that allow querying certain properties of Unicode
0010 // characters.
0011 //
0012 //===----------------------------------------------------------------------===//
0013
0014 #ifndef LLVM_SUPPORT_UNICODE_H
0015 #define LLVM_SUPPORT_UNICODE_H
0016
0017 #include "llvm/ADT/SmallString.h"
0018 #include <optional>
0019 #include <string>
0020
0021 namespace llvm {
0022 class StringRef;
0023
0024 namespace sys {
0025 namespace unicode {
0026
/// Negative sentinel values that stand in for a column width when no width can
/// be reported. Both are below zero, so they cannot be mistaken for a width
/// (see the \return description of columnWidthUTF8).
0027 enum ColumnWidthErrors {
0028   ErrorInvalidUTF8 = -2, ///< Presumably: input is not valid UTF-8 (confirm).
0029   ErrorNonPrintableCharacter = -1 ///< Input has a non-printable character.
0030 };
0031
0032 /// Determines if a character is likely to be displayed correctly on the
0033 /// terminal. An exact implementation would have to depend on the specific
0034 /// terminal, so we define semantics that should be suitable for the generic
0035 /// case of a terminal capable of outputting Unicode characters.
0036 ///
0037 /// Printable codepoints are those in the categories L, M, N, P, S and Zs.
0038 /// \return true if the character \p UCS is considered printable.
0039 bool isPrintable(int UCS);
0040
0041 /// Formatting codepoints are codepoints in the Cf category.
/// \return true if the character \p UCS is a formatting codepoint.
0042 bool isFormatting(int UCS);
0043
// NOTE(review): ColumnWidthErrors also declares ErrorInvalidUTF8 (-2), which
// the \return description below does not mention. Presumably it is returned
// when Text is not valid UTF-8 -- confirm against the implementation and
// document it here.
0044 /// Gets the number of positions the UTF8-encoded \p Text is likely to occupy
0045 /// when output on a terminal ("character width"). This depends on the
0046 /// implementation of the terminal, and there's no standard definition of
0047 /// character width.
0048 ///
0049 /// The implementation defines it in a way that is expected to be compatible
0050 /// with a generic Unicode-capable terminal.
0051 ///
0052 /// \return ErrorNonPrintableCharacter (-1) if \p Text contains non-printable
0053 /// characters (as identified by isPrintable); otherwise the character width
0054 /// of \p Text, counting:
0055 ///   * 0 for each non-spacing and enclosing combining mark;
0056 ///   * 2 for each CJK character excluding halfwidth forms;
0057 ///   * 1 for each of the remaining characters.
0058 int columnWidthUTF8(StringRef Text);
0059
0060 /// Folds the input Unicode character \p C according to the Simple Unicode
0061 /// case folding rules.
0062 int foldCharSimple(int C);
0063
0064 /// Maps the name or the alias of a Unicode character to its associated
0065 /// codepoint.
0066 /// The names and aliases are derived from UnicodeData.txt and NameAliases.txt.
0067 /// For compatibility with the semantics of named character escape sequences in
0068 /// C++, this mapping does an exact match sensitive to casing and spacing.
0069 /// \return The codepoint of the corresponding character, if any.
0070 std::optional<char32_t> nameToCodepointStrict(StringRef Name);
0071
// NOTE(review): the meaning of the fields is not spelled out in this header.
// Presumably CodePoint is the codepoint that was matched and Name is the exact
// character name the loosely written input resolved to -- confirm against the
// implementation.
/// Payload of the std::optional returned by nameToCodepointLooseMatching.
0072 struct LooseMatchingResult {
0073   char32_t CodePoint;
0074   SmallString<64> Name;
0075 };
0076
// NOTE(review): the loose matching rules are not visible in this header.
// Presumably this tolerates differences (such as casing and spacing) that
// nameToCodepointStrict rejects -- confirm against the implementation.
/// Variant of nameToCodepointStrict that, as its name indicates, matches
/// \p Name loosely instead of exactly.
/// \return A LooseMatchingResult for the matched character, if any.
0077 std::optional<LooseMatchingResult> nameToCodepointLooseMatching(StringRef Name);
0078
// NOTE(review): field semantics are not documented in this header. Presumably
// Name is a candidate character name, Distance measures how far it is from the
// queried pattern (smaller meaning closer) and Value is the codepoint carrying
// that name -- confirm against the implementation.
/// Element type of the list returned by nearestMatchesForCodepointName.
/// Distance and Value are zero unless explicitly set.
0079 struct MatchForCodepointName {
0080   std::string Name;
0081   uint32_t Distance = 0;
0082   char32_t Value = 0;
0083 };
0084
// NOTE(review): presumably returns at most MaxMatchesCount entries, namely the
// character names closest to Pattern; the distance metric and the ordering of
// the result are not visible in this header -- confirm against the
// implementation.
/// Collects codepoint names that are candidate matches for \p Pattern.
/// \return The candidates, each described by a MatchForCodepointName.
0085 SmallVector<MatchForCodepointName>
0086 nearestMatchesForCodepointName(StringRef Pattern, std::size_t MaxMatchesCount);
0087
0088 } // namespace unicode
0089 } // namespace sys
0090 } // namespace llvm
0091
0092 #endif