Skip to content

Commit

Permalink
Add support for escaped UTF-8 strings
Browse files Browse the repository at this point in the history
  • Loading branch information
brunexgeek committed Apr 27, 2024
1 parent 3292bf8 commit cbe054d
Show file tree
Hide file tree
Showing 4 changed files with 249 additions and 50 deletions.
40 changes: 28 additions & 12 deletions CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -10,6 +10,12 @@ set(PROTOGEN_PATCH 0)
configure_file("compiler/cmake.hh.in" "__include/cmake.hh")

if (UNIX)
    # Opt-in AddressSanitizer instrumentation; disabled by default.
    # Enable at configure time with -DENABLE_SANITIZER=ON.
    set(ENABLE_SANITIZER OFF CACHE BOOL "Build with AddressSanitizer (-fsanitize=address)")

    set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -Wfatal-errors -pedantic -std=c++11 -Wl,--no-undefined -fPIC -Wall -Wextra -Wconversion -Wmaybe-uninitialized -Werror=return-type")
    set(CMAKE_CXX_FLAGS_RELEASE "-O3")

    # The sanitizer flag is appended only after the per-configuration flags
    # have been assigned above; otherwise the release assignment would wipe it.
    # The C++ variables are extended (not the C ones) because every target in
    # this project is compiled as C++.
    if (ENABLE_SANITIZER)
        set(CMAKE_CXX_FLAGS_DEBUG "${CMAKE_CXX_FLAGS_DEBUG} -fsanitize=address")
        set(CMAKE_CXX_FLAGS_RELEASE "${CMAKE_CXX_FLAGS_RELEASE} -fsanitize=address")
    endif()
endif()
Expand Down Expand Up @@ -38,8 +44,6 @@ add_dependencies(libprotogen_static process_template)
set_target_properties(libprotogen_static PROPERTIES
OUTPUT_NAME "protogen_static"
ARCHIVE_OUTPUT_DIRECTORY "${CMAKE_BINARY_DIR}"
LIBRARY_OUTPUT_DIRECTORY "${CMAKE_BINARY_DIR}"
RUNTIME_OUTPUT_DIRECTORY "${CMAKE_BINARY_DIR}"
PREFIX "lib" )

add_executable(protogen
Expand All @@ -49,15 +53,29 @@ target_include_directories(protogen
target_link_libraries(protogen libprotogen_static)
set_target_properties(protogen PROPERTIES
OUTPUT_NAME "protogen"
ARCHIVE_OUTPUT_DIRECTORY "${CMAKE_BINARY_DIR}"
LIBRARY_OUTPUT_DIRECTORY "${CMAKE_BINARY_DIR}"
RUNTIME_OUTPUT_DIRECTORY "${CMAKE_BINARY_DIR}" )

# Each rule below runs the `template` executable with an input file from
# compiler/cpp and the path of the header to generate under __include.
# The executable is referenced through $<TARGET_FILE:template> so the rule
# keeps working regardless of the target's output directory or file suffix.
# VERBATIM makes argument escaping identical on every platform/generator.
add_custom_command(
    OUTPUT "${CMAKE_BINARY_DIR}/__include/auto-code.hh"
    COMMAND "$<TARGET_FILE:template>" "${CMAKE_CURRENT_LIST_DIR}/compiler/cpp/code.txt" "${CMAKE_BINARY_DIR}/__include/auto-code.hh"
    DEPENDS template "${CMAKE_CURRENT_LIST_DIR}/compiler/cpp/code.txt"
    VERBATIM
)

add_custom_command(
    OUTPUT "${CMAKE_BINARY_DIR}/__include/auto-protogen.hh"
    COMMAND "$<TARGET_FILE:template>" "${CMAKE_CURRENT_LIST_DIR}/compiler/cpp/protogen.hh" "${CMAKE_BINARY_DIR}/__include/auto-protogen.hh"
    DEPENDS template "${CMAKE_CURRENT_LIST_DIR}/compiler/cpp/protogen.hh"
    VERBATIM
)

add_custom_command(
    OUTPUT "${CMAKE_BINARY_DIR}/__include/auto-json.hh"
    COMMAND "$<TARGET_FILE:template>" "${CMAKE_CURRENT_LIST_DIR}/compiler/cpp/json.hh" "${CMAKE_BINARY_DIR}/__include/auto-json.hh"
    DEPENDS template "${CMAKE_CURRENT_LIST_DIR}/compiler/cpp/json.hh"
    VERBATIM
)

# Aggregate target: building it brings all three generated headers up to date.
# Because it depends on the OUTPUT files above, the generator only re-runs
# when the `template` tool or one of the input files has changed.
add_custom_target(process_template
    DEPENDS
        "${CMAKE_BINARY_DIR}/__include/auto-code.hh"
        "${CMAKE_BINARY_DIR}/__include/auto-protogen.hh"
        "${CMAKE_BINARY_DIR}/__include/auto-json.hh"
)

add_executable(template "compiler/template.cc")
target_include_directories(template
Expand All @@ -72,19 +90,19 @@ set(PROTOGEN_EXEC "${CMAKE_BINARY_DIR}/protogen")

# Creates the targets generate_test1, generate_test2, generate_test3 and
# generate_test7. Each one runs the protogen executable (PROTOGEN_EXEC) on
# tests/test<N>.proto and writes __include/test<N>.pg.hh in the build tree.
# The targets are ordered after `protogen` and `process_template`.
# VERBATIM keeps argument escaping consistent across platforms.
foreach(test_id 1 2 3 7)
    add_custom_target(generate_test${test_id}
        COMMAND "${PROTOGEN_EXEC}" "${CMAKE_CURRENT_LIST_DIR}/tests/test${test_id}.proto" "${CMAKE_BINARY_DIR}/__include/test${test_id}.pg.hh"
        DEPENDS protogen process_template
        VERBATIM)
endforeach()


add_executable(tests "tests/tests.cc")
Expand All @@ -94,8 +112,6 @@ target_include_directories(tests
add_dependencies(tests generate_test1 generate_test7 generate_test3)
set_target_properties(tests PROPERTIES
OUTPUT_NAME "run-tests"
ARCHIVE_OUTPUT_DIRECTORY "${CMAKE_BINARY_DIR}"
LIBRARY_OUTPUT_DIRECTORY "${CMAKE_BINARY_DIR}"
RUNTIME_OUTPUT_DIRECTORY "${CMAKE_BINARY_DIR}" )

install(TARGETS protogen DESTINATION bin)
90 changes: 79 additions & 11 deletions compiler/cpp/json.hh
Original file line number Diff line number Diff line change
Expand Up @@ -98,6 +98,7 @@ struct json<T, typename std::enable_if<is_container<T>::value>::type >

// Base64 encoder/decoder based on Joe DF's implementation
// Original source at <https://github.com/joedf/base64.c> (MIT licensed)

template <>
struct json< std::vector<uint8_t> >
{
Expand Down Expand Up @@ -217,6 +218,13 @@ struct json<bool, void>
static void swap( bool &a, bool &b ) { std::swap(a, b); }
};

// Emits 'codepoint' to 'out' as a JSON escape of the form "\uXXXX"
// (lowercase hexadecimal, zero-padded to at least four digits).
// The scratch buffer holds exactly one such escape plus the terminator, so
// values above 0xFFFF would be truncated by snprintf.
static void write_escaped_utf8(internal::ostream *out, uint32_t codepoint)
{
    // backslash + 'u' + four hex digits + NUL terminator
    char escaped[sizeof("\\u0000")];
    snprintf(escaped, sizeof(escaped), "\\u%04x", codepoint);
    (*out) << escaped;
}

template<>
struct json<std::string, void>
{
Expand All @@ -232,21 +240,81 @@ struct json<std::string, void>
// Writes 'value' to the context output stream as a double-quoted JSON string.
//
// ASCII bytes are written as-is, except for the characters that have a short
// JSON escape (quote, backslash, slash, \b, \f, \r, \n, \t). Every multi-byte
// UTF-8 sequence is decoded and written as a "\uXXXX" escape; code points
// encoded with four bytes are written as a UTF-16 surrogate pair.
//
// If an invalid or truncated UTF-8 sequence is found, output stops at that
// position and the closing quote is still written (the remainder of 'value'
// is dropped). The function always terminates, whatever the input bytes are.
static void write( json_context &ctx, const std::string &value )
{
    (*ctx.os) << '"';
    const size_t size = value.size();
    size_t i = 0;
    while (i < size)
    {
        const uint8_t byte1 = (uint8_t) value[i];

        // 1-byte character
        if (byte1 <= 0x7F)
        {
            switch (byte1)
            {
                case '"':  (*ctx.os) << "\\\""; break;
                case '\\': (*ctx.os) << "\\\\"; break;
                case '/':  (*ctx.os) << "\\/"; break;
                case '\b': (*ctx.os) << "\\b"; break;
                case '\f': (*ctx.os) << "\\f"; break;
                case '\r': (*ctx.os) << "\\r"; break;
                case '\n': (*ctx.os) << "\\n"; break;
                case '\t': (*ctx.os) << "\\t"; break;
                default:   (*ctx.os) << (char) byte1;
            }
            ++i;
            continue;
        }

        // the lead byte tells how many bytes the sequence must have
        size_t length = 0;
        if (byte1 >= 0xC0 && byte1 <= 0xDF)
            length = 2;
        else
        if (byte1 >= 0xE0 && byte1 <= 0xEF)
            length = 3;
        else
        if (byte1 >= 0xF0 && byte1 <= 0xF4)
            length = 4;
        else
            break; // stray continuation byte or invalid lead byte

        // truncated sequence at the end of the string
        if (size - i < length)
            break;

        // payload bits of the lead byte: 5, 4 or 3 bits for 2, 3 or 4 bytes
        uint32_t codepoint = (uint32_t) (byte1 & (0xFF >> (length + 1)));

        // every following byte must be a continuation byte (10xxxxxx)
        bool valid = true;
        for (size_t j = 1; j < length; ++j)
        {
            const uint8_t next = (uint8_t) value[i + j];
            if ((next & 0xC0) != 0x80)
            {
                valid = false;
                break;
            }
            codepoint = (codepoint << 6) | (uint32_t) (next & 0x3F);
        }
        if (!valid)
            break;

        if (length < 4)
            write_escaped_utf8(ctx.os, codepoint);
        else
        {
            // break the codepoint into UTF-16 surrogate pair
            static const uint32_t LEAD_OFFSET = 0xD800 - (0x10000 >> 10);
            const uint32_t lead = LEAD_OFFSET + (codepoint >> 10);
            const uint32_t trail = 0xDC00 + (codepoint & 0x3FF);
            // write the surrogate pair
            write_escaped_utf8(ctx.os, lead);
            write_escaped_utf8(ctx.os, trail);
        }
        i += length;
    }

    (*ctx.os) << '"';
}
static bool empty( const std::string &value ) { return value.empty(); }
Expand Down
93 changes: 82 additions & 11 deletions compiler/cpp/protogen.hh
Original file line number Diff line number Diff line change
Expand Up @@ -18,10 +18,12 @@
#define PROTOGEN_X_Y_Z

#include <string>
#include <sstream>
#include <vector>
#include <iostream>
#include <forward_list>
#include <istream>
#include <iomanip>
#include <iterator>
#include <memory>

Expand Down Expand Up @@ -308,6 +310,7 @@ class tokenizer

token parse_string()
{
int32_t lead = 0;
std::string value;
int line = input_.line();
int column = input_.column();
Expand All @@ -327,25 +330,93 @@ class tokenizer
c = input_.peek();
switch (c)
{
case '"': c = '"'; break;
case '\\': c = '\\'; break;
case '/': c = '/'; break;
case 'b': c = '\b'; break;
case 'f': c = '\f'; break;
case 'r': c = '\r'; break;
case 'n': c = '\n'; break;
case 't': c = '\t'; break;
// TODO: handle escaped unicode (\uXXXX)
case '"': value += '"'; break;
case '\\': value += '\\'; break;
case '/': value += '/'; break;
case 'b': value += '\b'; break;
case 'f': value += '\f'; break;
case 'r': value += '\r'; break;
case 'n': value += '\n'; break;
case 't': value += '\t'; break;
case 'u':
if (!parse_escaped_utf8(value, lead))
goto ESCAPE;
break;
default: goto ESCAPE;
}
}
if (c == 0) goto ESCAPE;
value += (char) c;
else
{
if (c == 0)
goto ESCAPE;
value += (char) c;
}
}
ESCAPE:
return token(token_id::NONE, "", line, column);
}

// Parses the four hexadecimal digits of a "\uXXXX" escape and appends the
// corresponding character to 'value', encoded as UTF-8.
//
// The function advances the input once before reading each digit, so it is
// expected to be called while the input is positioned at the 'u'.
//
// 'lead' carries state between calls: when the escape is the first half of a
// UTF-16 surrogate pair (0xD800-0xDBFF), nothing is appended and the value is
// stored in 'lead'; the next call must then provide the second half
// (0xDC00-0xDFFF), at which point the combined code point is appended as a
// 4-byte UTF-8 sequence and 'lead' is reset to zero.
//
// Returns false if a digit is not hexadecimal, if a second half appears
// without a pending first half, or if a pending first half is followed by
// anything other than a second half.
bool parse_escaped_utf8(std::string &value, int32_t &lead)
{
    char temp[5] = {0};
    for (int i = 0; i < 4; ++i)
    {
        input_.next();
        auto c = input_.peek();
        if ((c >= '0' && c <= '9') || (c >= 'A' && c <= 'F') || (c >= 'a' && c <= 'f'))
            temp[i] = (char) c;
        else
            return false;
    }
    // four hex digits: the value is always in the range 0x0000-0xFFFF
    int32_t codepoint = (int32_t) strtol(temp, nullptr, 16);

    // first 2-byte UTF-16 surrogate pair
    if (codepoint >= 0xD800 && codepoint <= 0xDBFF)
    {
        // the previous lead never received its trail
        if (lead != 0)
            return false;
        lead = codepoint;
        return true;
    }

    // second 2-byte UTF-16 surrogate pair
    if (codepoint >= 0xDC00 && codepoint <= 0xDFFF)
    {
        // check whether we have a lead (first value in the surrogate pair)
        if (lead == 0)
            return false;
        // compute the final codepoint
        static const int32_t SURROGATE_OFFSET = 0x10000 - (0xD800 << 10) - 0xDC00;
        codepoint = (lead << 10) + codepoint + SURROGATE_OFFSET;

        // 4-byte UTF-8 = 11110xxx 10xxxxxx 10xxxxxx 10xxxxxx
        value += (char) (0xF0 | ((codepoint >> 18) & 0x07));
        value += (char) (0x80 | ((codepoint >> 12) & 0x3F));
        value += (char) (0x80 | ((codepoint >> 6) & 0x3F));
        value += (char) (0x80 | (codepoint & 0x3F));

        // reset the surrogate pair lead
        lead = 0;
        return true;
    }

    // a pending lead must be followed by a trail, not by a regular character
    if (lead != 0)
        return false;

    // 1-byte UTF-8 = 0xxxxxxx
    if (codepoint <= 0x7F)
    {
        value += (char) codepoint;
    }
    else
    // 2-byte UTF-8 = 110xxxxx 10xxxxxx
    if (codepoint <= 0x7FF)
    {
        value += (char) (0xC0 | ((codepoint >> 6) & 0x1F));
        value += (char) (0x80 | (codepoint & 0x3F));
    }
    else
    // 3-byte UTF-8 = 1110xxxx 10xxxxxx 10xxxxxx
    {
        value += (char) (0xE0 | ((codepoint >> 12) & 0x0F));
        value += (char) (0x80 | ((codepoint >> 6) & 0x3F));
        value += (char) (0x80 | (codepoint & 0x3F));
    }

    return true;
}

bool parse_keyword( const std::string &keyword )
{
for (auto c : keyword)
Expand Down
Loading

0 comments on commit cbe054d

Please sign in to comment.