text/internal/encode.h

0001 // Protocol Buffers - Google's data interchange format
0002 // Copyright 2023 Google LLC.  All rights reserved.
0003 //
0004 // Use of this source code is governed by a BSD-style
0005 // license that can be found in the LICENSE file or at
0006 // https://developers.google.com/open-source/licenses/bsd
0007
0008 #ifndef UPB_TEXT_ENCODE_INTERNAL_H_
0009 #define UPB_TEXT_ENCODE_INTERNAL_H_
0010
0011 #include <stdarg.h>
0012 #include <string.h>
0013
0014 #include "upb/base/descriptor_constants.h"
0015 #include "upb/base/string_view.h"
0016 #include "upb/message/array.h"
0017 #include "upb/message/internal/map_sorter.h"
0018 #include "upb/port/vsnprintf_compat.h"
0019 #include "upb/text/options.h"
0020 #include "upb/wire/eps_copy_input_stream.h"
0021 #include "utf8_range.h"
0022
0023 // Must be last.
0024 #include "upb/port/def.inc"
0025
0026 typedef struct {
0027   char *buf, *ptr, *end;
0028   size_t overflow;
0029   int indent_depth;
0030   int options;
0031   const struct upb_DefPool* ext_pool;
0032   _upb_mapsorter sorter;
0033 } txtenc;
0034
0035 UPB_INLINE void UPB_PRIVATE(_upb_TextEncode_PutBytes)(txtenc* e,
0036                                                       const void* data,
0037                                                       size_t len) {
0038   size_t have = e->end - e->ptr;
0039   if (UPB_LIKELY(have >= len)) {
0040     memcpy(e->ptr, data, len);
0041     e->ptr += len;
0042   } else {
0043     if (have) {
0044       memcpy(e->ptr, data, have);
0045       e->ptr += have;
0046     }
0047     e->overflow += (len - have);
0048   }
0049 }
0050
0051 UPB_INLINE void UPB_PRIVATE(_upb_TextEncode_PutStr)(txtenc* e,
0052                                                     const char* str) {
0053   UPB_PRIVATE(_upb_TextEncode_PutBytes)(e, str, strlen(str));
0054 }
0055
0056 UPB_INLINE void UPB_PRIVATE(_upb_TextEncode_Printf)(txtenc* e, const char* fmt,
0057                                                     ...) {
0058   size_t n;
0059   size_t have = e->end - e->ptr;
0060   va_list args;
0061
0062   va_start(args, fmt);
0063   n = _upb_vsnprintf(e->ptr, have, fmt, args);
0064   va_end(args);
0065
0066   if (UPB_LIKELY(have > n)) {
0067     e->ptr += n;
0068   } else {
0069     e->ptr = UPB_PTRADD(e->ptr, have);
0070     e->overflow += (n - have);
0071   }
0072 }
0073
0074 UPB_INLINE void UPB_PRIVATE(_upb_TextEncode_Indent)(txtenc* e) {
0075   if ((e->options & UPB_TXTENC_SINGLELINE) == 0) {
0076     int i = e->indent_depth;
0077     while (i-- > 0) {
0078       UPB_PRIVATE(_upb_TextEncode_PutStr)(e, "  ");
0079     }
0080   }
0081 }
0082
0083 UPB_INLINE void UPB_PRIVATE(_upb_TextEncode_EndField)(txtenc* e) {
0084   if (e->options & UPB_TXTENC_SINGLELINE) {
0085     UPB_PRIVATE(_upb_TextEncode_PutStr)(e, " ");
0086   } else {
0087     UPB_PRIVATE(_upb_TextEncode_PutStr)(e, "\n");
0088   }
0089 }
0090
0091 UPB_INLINE void UPB_PRIVATE(_upb_TextEncode_Escaped)(txtenc* e,
0092                                                      unsigned char ch) {
0093   switch (ch) {
0094     case '\n':
0095       UPB_PRIVATE(_upb_TextEncode_PutStr)(e, "\\n");
0096       break;
0097     case '\r':
0098       UPB_PRIVATE(_upb_TextEncode_PutStr)(e, "\\r");
0099       break;
0100     case '\t':
0101       UPB_PRIVATE(_upb_TextEncode_PutStr)(e, "\\t");
0102       break;
0103     case '\"':
0104       UPB_PRIVATE(_upb_TextEncode_PutStr)(e, "\\\"");
0105       break;
0106     case '\'':
0107       UPB_PRIVATE(_upb_TextEncode_PutStr)(e, "\\'");
0108       break;
0109     case '\\':
0110       UPB_PRIVATE(_upb_TextEncode_PutStr)(e, "\\\\");
0111       break;
0112     default:
0113       UPB_PRIVATE(_upb_TextEncode_Printf)(e, "\\%03o", ch);
0114       break;
0115   }
0116 }
0117
0118 // Returns true if `ch` needs to be escaped in TextFormat, independent of any
0119 // UTF-8 validity issues.
0120 UPB_INLINE bool UPB_PRIVATE(_upb_DefinitelyNeedsEscape)(unsigned char ch) {
0121   if (ch < 32) return true;
0122   switch (ch) {
0123     case '\"':
0124     case '\'':
0125     case '\\':
0126     case 127:
0127       return true;
0128   }
0129   return false;
0130 }
0131
0132 UPB_INLINE bool UPB_PRIVATE(_upb_AsciiIsPrint)(unsigned char ch) {
0133   return ch >= 32 && ch < 127;
0134 }
0135
0136 // Returns true if this is a high byte that requires UTF-8 validation.  If the
0137 // UTF-8 validation fails, we must escape the byte.
0138 UPB_INLINE bool UPB_PRIVATE(_upb_NeedsUtf8Validation)(unsigned char ch) {
0139   return ch > 127;
0140 }
0141
0142 // Returns the number of bytes in the prefix of `val` that do not need escaping.
0143 // This is like utf8_range::SpanStructurallyValid(), except that it also
0144 // terminates at any ASCII char that needs to be escaped in TextFormat (any char
0145 // that has `DefinitelyNeedsEscape(ch) == true`).
0146 //
0147 // If we could get a variant of utf8_range::SpanStructurallyValid() that could
0148 // terminate on any of these chars, that might be more efficient, but it would
0149 // be much more complicated to modify that heavily SIMD code.
0150 UPB_INLINE size_t UPB_PRIVATE(_SkipPassthroughBytes)(const char* ptr,
0151                                                      size_t size) {
0152   for (size_t i = 0; i < size; i++) {
0153     unsigned char uc = ptr[i];
0154     if (UPB_PRIVATE(_upb_DefinitelyNeedsEscape)(uc)) return i;
0155     if (UPB_PRIVATE(_upb_NeedsUtf8Validation)(uc)) {
0156       // Find the end of this region of consecutive high bytes, so that we only
0157       // give high bytes to the UTF-8 checker.  This avoids needing to perform
0158       // a second scan of the ASCII characters looking for characters that
0159       // need escaping.
0160       //
0161       // We assume that high bytes are less frequent than plain, printable ASCII
0162       // bytes, so we accept the double-scan of high bytes.
0163       size_t end = i + 1;
0164       for (; end < size; end++) {
0165         if (!UPB_PRIVATE(_upb_NeedsUtf8Validation)(ptr[end])) break;
0166       }
0167       size_t n = end - i;
0168       size_t ok = utf8_range_ValidPrefix(ptr + i, n);
0169       if (ok != n) return i + ok;
0170       i += ok - 1;
0171     }
0172   }
0173   return size;
0174 }
0175
0176 UPB_INLINE void UPB_PRIVATE(_upb_HardenedPrintString)(txtenc* e,
0177                                                       const char* ptr,
0178                                                       size_t len) {
0179   // Print as UTF-8, while guarding against any invalid UTF-8 in the string
0180   // field.
0181   //
0182   // If in the future we have a guaranteed invariant that invalid UTF-8 will
0183   // never be present, we could avoid the UTF-8 check here.
0184   UPB_PRIVATE(_upb_TextEncode_PutStr)(e, "\"");
0185   const char* end = ptr + len;
0186   while (ptr < end) {
0187     size_t n = UPB_PRIVATE(_SkipPassthroughBytes)(ptr, end - ptr);
0188     if (n != 0) {
0189       UPB_PRIVATE(_upb_TextEncode_PutBytes)(e, ptr, n);
0190       ptr += n;
0191       if (ptr == end) break;
0192     }
0193
0194     // If repeated calls to CEscape() and PrintString() are expensive, we could
0195     // consider batching them, at the cost of some complexity.
0196     UPB_PRIVATE(_upb_TextEncode_Escaped)(e, *ptr);
0197     ptr++;
0198   }
0199   UPB_PRIVATE(_upb_TextEncode_PutStr)(e, "\"");
0200 }
0201
0202 UPB_INLINE void UPB_PRIVATE(_upb_TextEncode_Bytes)(txtenc* e,
0203                                                    upb_StringView data) {
0204   const char* ptr = data.data;
0205   const char* end = ptr + data.size;
0206   UPB_PRIVATE(_upb_TextEncode_PutStr)(e, "\"");
0207   for (; ptr < end; ptr++) {
0208     unsigned char uc = *ptr;
0209     if (UPB_PRIVATE(_upb_AsciiIsPrint)(uc)) {
0210       UPB_PRIVATE(_upb_TextEncode_PutBytes)(e, ptr, 1);
0211     } else {
0212       UPB_PRIVATE(_upb_TextEncode_Escaped)(e, uc);
0213     }
0214   }
0215   UPB_PRIVATE(_upb_TextEncode_PutStr)(e, "\"");
0216 }
0217
0218 UPB_INLINE size_t UPB_PRIVATE(_upb_TextEncode_Nullz)(txtenc* e, size_t size) {
0219   size_t ret = e->ptr - e->buf + e->overflow;
0220
0221   if (size > 0) {
0222     if (e->ptr == e->end) e->ptr--;
0223     *e->ptr = '\0';
0224   }
0225
0226   return ret;
0227 }
0228
0229 const char* UPB_PRIVATE(_upb_TextEncode_Unknown)(txtenc* e, const char* ptr,
0230                                                  upb_EpsCopyInputStream* stream,
0231                                                  int groupnum);
0232
0233 // Must not be called for ctype = kUpb_CType_Enum, as they require different
0234 // handling depending on whether or not we're doing reflection-based encoding.
0235 void UPB_PRIVATE(_upb_TextEncode_Scalar)(txtenc* e, upb_MessageValue val,
0236                                          upb_CType ctype);
0237
0238 #include "upb/port/undef.inc"
0239
0240 #endif  // UPB_TEXT_ENCODE_INTERNAL_H_