Add support for escaped UTF-8 strings

brunexgeek · Apr 27, 2024 · cbe054d · cbe054d
1 parent 3292bf8
commit cbe054d
Show file tree

Hide file tree

Showing 4 changed files with 249 additions and 50 deletions.
diff --git a/CMakeLists.txt b/CMakeLists.txt
@@ -10,6 +10,12 @@ set(PROTOGEN_PATCH 0)
 configure_file("compiler/cmake.hh.in" "__include/cmake.hh")
 
 if (UNIX)
+    set(ENABLE_SANITIZER OFF CACHE BOOL "Build with AddressSanitizer (-fsanitize=address)")
+    if (ENABLE_SANITIZER)
+        # Append to CMAKE_CXX_FLAGS: the CMAKE_C_FLAGS_* values belong to C, and CMAKE_CXX_FLAGS_RELEASE is overwritten below.
+        set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -fsanitize=address")
+    endif()
+
 	set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -Wfatal-errors -pedantic -std=c++11 -Wl,--no-undefined -fPIC -Wall -Wextra -Wconversion -Wmaybe-uninitialized -Werror=return-type")
 	set(CMAKE_CXX_FLAGS_RELEASE "-O3")
 endif()
@@ -38,8 +44,6 @@ add_dependencies(libprotogen_static process_template)
 set_target_properties(libprotogen_static PROPERTIES
     OUTPUT_NAME "protogen_static"
     ARCHIVE_OUTPUT_DIRECTORY "${CMAKE_BINARY_DIR}"
-    LIBRARY_OUTPUT_DIRECTORY "${CMAKE_BINARY_DIR}"
-    RUNTIME_OUTPUT_DIRECTORY "${CMAKE_BINARY_DIR}"
     PREFIX "lib" )
 
 add_executable(protogen
@@ -49,15 +53,29 @@ target_include_directories(protogen
 target_link_libraries(protogen libprotogen_static)
 set_target_properties(protogen PROPERTIES
     OUTPUT_NAME "protogen"
-    ARCHIVE_OUTPUT_DIRECTORY "${CMAKE_BINARY_DIR}"
-    LIBRARY_OUTPUT_DIRECTORY "${CMAKE_BINARY_DIR}"
     RUNTIME_OUTPUT_DIRECTORY "${CMAKE_BINARY_DIR}" )
 
-add_custom_target(process_template
+add_custom_command(
+    OUTPUT "${CMAKE_BINARY_DIR}/__include/auto-code.hh"
+    DEPENDS template "${CMAKE_CURRENT_LIST_DIR}/compiler/cpp/code.txt"
     COMMAND "${CMAKE_BINARY_DIR}/template" "${CMAKE_CURRENT_LIST_DIR}/compiler/cpp/code.txt" "${CMAKE_BINARY_DIR}/__include/auto-code.hh"
+)
+
+add_custom_command(
+    OUTPUT "${CMAKE_BINARY_DIR}/__include/auto-protogen.hh"
+    DEPENDS template "${CMAKE_CURRENT_LIST_DIR}/compiler/cpp/protogen.hh"
     COMMAND "${CMAKE_BINARY_DIR}/template" "${CMAKE_CURRENT_LIST_DIR}/compiler/cpp/protogen.hh" "${CMAKE_BINARY_DIR}/__include/auto-protogen.hh"
+)
+
+add_custom_command(
+    OUTPUT "${CMAKE_BINARY_DIR}/__include/auto-json.hh"
+    DEPENDS template "${CMAKE_CURRENT_LIST_DIR}/compiler/cpp/json.hh"
     COMMAND "${CMAKE_BINARY_DIR}/template" "${CMAKE_CURRENT_LIST_DIR}/compiler/cpp/json.hh" "${CMAKE_BINARY_DIR}/__include/auto-json.hh"
-    DEPENDS template)
+)
+
+add_custom_target(process_template
+    DEPENDS "${CMAKE_BINARY_DIR}/__include/auto-code.hh" "${CMAKE_BINARY_DIR}/__include/auto-protogen.hh" "${CMAKE_BINARY_DIR}/__include/auto-json.hh"
+)
 
 add_executable(template "compiler/template.cc")
 target_include_directories(template
@@ -72,19 +90,19 @@ set(PROTOGEN_EXEC "${CMAKE_BINARY_DIR}/protogen")
 
 add_custom_target(generate_test1
     "${PROTOGEN_EXEC}" "${CMAKE_CURRENT_LIST_DIR}/tests/test1.proto" "${CMAKE_BINARY_DIR}/__include/test1.pg.hh"
-    DEPENDS protogen)
+    DEPENDS protogen process_template)
 
 add_custom_target(generate_test2
     "${PROTOGEN_EXEC}" "${CMAKE_CURRENT_LIST_DIR}/tests/test2.proto" "${CMAKE_BINARY_DIR}/__include/test2.pg.hh"
-    DEPENDS protogen)
+    DEPENDS protogen process_template)
 
 add_custom_target(generate_test3
     "${PROTOGEN_EXEC}" "${CMAKE_CURRENT_LIST_DIR}/tests/test3.proto" "${CMAKE_BINARY_DIR}/__include/test3.pg.hh"
-    DEPENDS protogen)
+    DEPENDS protogen process_template)
 
 add_custom_target(generate_test7
     "${PROTOGEN_EXEC}" "${CMAKE_CURRENT_LIST_DIR}/tests/test7.proto" "${CMAKE_BINARY_DIR}/__include/test7.pg.hh"
-    DEPENDS protogen)
+    DEPENDS protogen process_template)
 
 
 add_executable(tests "tests/tests.cc")
@@ -94,8 +112,6 @@ target_include_directories(tests
 add_dependencies(tests generate_test1 generate_test7 generate_test3)
 set_target_properties(tests PROPERTIES
     OUTPUT_NAME "run-tests"
-    ARCHIVE_OUTPUT_DIRECTORY "${CMAKE_BINARY_DIR}"
-    LIBRARY_OUTPUT_DIRECTORY "${CMAKE_BINARY_DIR}"
     RUNTIME_OUTPUT_DIRECTORY "${CMAKE_BINARY_DIR}" )
 
 install(TARGETS protogen DESTINATION bin)
diff --git a/compiler/cpp/json.hh b/compiler/cpp/json.hh
@@ -98,6 +98,7 @@ struct json<T, typename std::enable_if<is_container<T>::value>::type >
 
 // Base64 encoder/decoder based on Joe DF's implementation
 // Original source at <https://github.com/joedf/base64.c> (MIT licensed)
+
 template <>
 struct json< std::vector<uint8_t> >
 {
@@ -217,6 +218,13 @@ struct json<bool, void>
     static void swap( bool &a, bool &b ) { std::swap(a, b); }
 };
 
+static void write_escaped_utf8(internal::ostream *out, uint32_t codepoint) // writes 'codepoint' to 'out' as a "\uXXXX" escape
+{
+    char buffer[7]; // the 6 characters of "\uXXXX" plus the terminating NUL
+    snprintf(buffer, sizeof(buffer), "\\u%04x", codepoint); // lowercase hex; a value above 0xFFFF would be cut to 6 characters
+    (*out) << buffer;
+}
+
 template<>
 struct json<std::string, void>
 {
@@ -232,21 +240,81 @@ struct json<std::string, void>
     static void write( json_context &ctx, const std::string &value )
     {
         (*ctx.os) <<  '"';
-        for (std::string::const_iterator it = value.begin(); it != value.end(); ++it)
+        size_t size = value.size();
+        for (size_t i = 0; i < size;)
         {
-            switch (*it)
+            uint8_t byte1 = value[i];
+            // 1-byte character
+            if (byte1 <= 0x7F)
             {
-                case '"':  (*ctx.os) <<  "\\\""; break;
-                case '\\': (*ctx.os) <<  "\\\\"; break;
-                case '/':  (*ctx.os) <<  "\\/"; break;
-                case '\b': (*ctx.os) <<  "\\b"; break;
-                case '\f': (*ctx.os) <<  "\\f"; break;
-                case '\r': (*ctx.os) <<  "\\r"; break;
-                case '\n': (*ctx.os) <<  "\\n"; break;
-                case '\t': (*ctx.os) <<  "\\t"; break;
-                default:   (*ctx.os) << *it;
+                switch (byte1)
+                {
+                    case '"':  (*ctx.os) <<  "\\\""; break;
+                    case '\\': (*ctx.os) <<  "\\\\"; break;
+                    case '/':  (*ctx.os) <<  "\\/"; break;
+                    case '\b': (*ctx.os) <<  "\\b"; break;
+                    case '\f': (*ctx.os) <<  "\\f"; break;
+                    case '\r': (*ctx.os) <<  "\\r"; break;
+                    case '\n': (*ctx.os) <<  "\\n"; break;
+                    case '\t': (*ctx.os) <<  "\\t"; break;
+                    default:   (*ctx.os) << (char) byte1;
+                }
+                i++;
+            }
+            else
+            {
+                // Multi-byte UTF-8 sequence: every path below either consumes one whole
+                // sequence and continues, or leaves the loop through ESCAPE.
+
+                // 2-byte character (110xxxxx 10xxxxxx)
+                if (i + 1 >= size)
+                    goto ESCAPE; // TODO return error
+
+                uint8_t byte2 = value[i + 1];
+                if (byte1 >= 0xC0 && byte1 <= 0xDF && (byte2 & 0xC0) == 0x80)
+                {
+                    uint32_t codepoint = ((byte1 & 0x1F) << 6) | (byte2 & 0x3F);
+                    write_escaped_utf8(ctx.os, codepoint);
+                    i += 2;
+                    continue;
+                }
+
+                // 3-byte character (1110xxxx 10xxxxxx 10xxxxxx)
+                if (i + 2 >= size)
+                    goto ESCAPE; // TODO return error
+
+                uint8_t byte3 = value[i + 2];
+                if (byte1 >= 0xE0 && byte1 <= 0xEF && (byte2 & 0xC0) == 0x80 && (byte3 & 0xC0) == 0x80)
+                {
+                    uint32_t codepoint = ((byte1 & 0x0F) << 12) | ((byte2 & 0x3F) << 6) | (byte3 & 0x3F);
+                    write_escaped_utf8(ctx.os, codepoint);
+                    i += 3;
+                    continue;
+                }
+
+                // 4-byte character (11110xxx 10xxxxxx 10xxxxxx 10xxxxxx)
+                if (i + 3 >= size)
+                    goto ESCAPE; // TODO return error
+
+                uint8_t byte4 = value[i + 3];
+                if (byte1 >= 0xF0 && byte1 <= 0xF4 && (byte2 & 0xC0) == 0x80 && (byte3 & 0xC0) == 0x80 && (byte4 & 0xC0) == 0x80)
+                {
+                    uint32_t codepoint = ((byte1 & 0x07) << 18) | ((byte2 & 0x3F) << 12) | ((byte3 & 0x3F) << 6) | (byte4 & 0x3F);
+                    // break the codepoint into UTF-16 surrogate pair
+                    static const uint32_t LEAD_OFFSET = 0xD800 - (0x10000 >> 10);
+                    uint32_t lead = LEAD_OFFSET + (codepoint >> 10);
+                    uint32_t trail = 0xDC00 + (codepoint & 0x3FF);
+                    // write the surrogate pair
+                    write_escaped_utf8(ctx.os, lead);
+                    write_escaped_utf8(ctx.os, trail);
+                    i += 4;
+                    continue;
+                }
+                goto ESCAPE; // invalid UTF-8: 'i' was not advanced, so stop here instead of looping forever (TODO return error)
+            }
         }
+
+        ESCAPE:
         (*ctx.os) <<  '"';
     }
     static bool empty( const std::string &value ) { return value.empty(); }

diff --git a/compiler/cpp/protogen.hh b/compiler/cpp/protogen.hh
@@ -18,10 +18,12 @@
 #define PROTOGEN_X_Y_Z
 
 #include <string>
+#include <sstream>
 #include <vector>
 #include <iostream>
 #include <forward_list>
 #include <istream>
+#include <iomanip>
 #include <iterator>
 #include <memory>
 
@@ -308,6 +310,7 @@ class tokenizer
 
         token parse_string()
         {
+            int32_t lead = 0;
             std::string value;
             int line = input_.line();
             int column = input_.column();
@@ -327,25 +330,93 @@ class tokenizer
                     c = input_.peek();
                     switch (c)
                     {
-                        case '"':  c = '"'; break;
-                        case '\\': c = '\\'; break;
-                        case '/':  c = '/'; break;
-                        case 'b':  c = '\b'; break;
-                        case 'f':  c = '\f'; break;
-                        case 'r':  c = '\r'; break;
-                        case 'n':  c = '\n'; break;
-                        case 't':  c = '\t'; break;
-                        // TODO: handle escaped unicode (\uXXXX)
+                        case '"':  value += '"'; break;
+                        case '\\': value += '\\'; break;
+                        case '/':  value += '/'; break;
+                        case 'b':  value += '\b'; break;
+                        case 'f':  value += '\f'; break;
+                        case 'r':  value += '\r'; break;
+                        case 'n':  value += '\n'; break;
+                        case 't':  value += '\t'; break;
+                        case 'u':
+                            if (!parse_escaped_utf8(value, lead))
+                                goto ESCAPE;
+                            break;
                         default: goto ESCAPE;
                     }
                 }
-                if (c == 0) goto ESCAPE;
-                value += (char) c;
+                else
+                {
+                    if (c == 0)
+                        goto ESCAPE;
+                    value += (char) c;
+                }
             }
             ESCAPE:
             return token(token_id::NONE, "", line, column);
         }
 
+        // Reads the four hex digits of a "\uXXXX" escape from input_ and appends the character to 'value' as UTF-8.
+        // A lead surrogate (D800-DBFF) is only stored in 'lead'; the next trail surrogate (DC00-DFFF) completes it.
+        // Returns false for a non-hex digit or a trail surrogate without a pending lead ('lead' == 0).
+        bool parse_escaped_utf8(std::string &value, int32_t &lead)
+        {
+            char temp[5] = {0};
+            for (int i = 0; i < 4; ++i)
+            {
+                input_.next();
+                auto c = input_.peek();
+                if ((c >= '0' && c <= '9') || (c >= 'A' && c <= 'F') || (c >= 'a' && c <= 'f'))
+                    temp[i] = (char) c;
+                else
+                    return false;
+            }
+            int32_t codepoint = (int32_t) strtol(temp, nullptr, 16);
+
+            // first 2-byte UTF-16 surrogate pair
+            if (codepoint >= 0xD800 && codepoint <= 0xDBFF)
+            {
+                lead = codepoint;
+                return true;
+            }
+            // second 2-byte UTF-16 surrogate pair
+            else if (codepoint >= 0xDC00 && codepoint <= 0xDFFF)
+            {
+                // check whether we have a lead (first value in the surrogate pair)
+                if (lead == 0)
+                    return false;
+                // compute the final codepoint
+                static const int32_t SURROGATE_OFFSET = 0x10000 - (0xD800 << 10) - 0xDC00;
+                codepoint = (lead << 10) + codepoint + SURROGATE_OFFSET;
+
+                // 4-byte UTF-8 = 11110xxx 10xxxxxx 10xxxxxx 10xxxxxx
+                value += (char) (0xF0 | ((codepoint >> 18) & 0x07));
+                value += (char) (0x80 | ((codepoint >> 12) & 0x3F));
+                value += (char) (0x80 | ((codepoint >> 6) & 0x3F));
+                value += (char) (0x80 | (codepoint & 0x3F));
+            }
+            else if (codepoint <= 0x7F) // 1-byte UTF-8 (ASCII) = 0xxxxxxx, e.g. "\u0041" or "\u001b"
+                value += (char) codepoint;
+            // 2-byte UTF-8 = 110xxxxx 10xxxxxx
+            else if (codepoint <= 0x7FF)
+            {
+                value += (char) (0xC0 | ((codepoint >> 6) & 0x1F));
+                value += (char) (0x80 | (codepoint & 0x3F));
+            }
+            // 3-byte UTF-8 = 1110xxxx 10xxxxxx 10xxxxxx (four hex digits cannot exceed 0xFFFF)
+            else
+            {
+                value += (char) (0xE0 | ((codepoint >> 12) & 0x0F));
+                value += (char) (0x80 | ((codepoint >> 6) & 0x3F));
+                value += (char) (0x80 | (codepoint & 0x3F));
+            }
+
+            // reset the surrogate pair lead (a pending lead not followed by a trail is dropped)
+            lead = 0;
+
+            return true;
+        }
+
         bool parse_keyword( const std::string &keyword )
         {
             for (auto c : keyword)