Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Add RegEx support using RE2 #665

Open
wants to merge 1 commit into
base: master
Choose a base branch
from
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
4 changes: 3 additions & 1 deletion .travis.yml
Original file line number Diff line number Diff line change
Expand Up @@ -7,6 +7,7 @@ matrix:
apt:
packages:
- g++-4.9
- libre2-dev
sources: &sources
- llvm-toolchain-precise-3.8
- ubuntu-toolchain-r-test
Expand All @@ -16,6 +17,7 @@ matrix:
apt:
packages:
- clang-3.8
- libre2-dev
sources: *sources
- os: osx
osx_image: xcode8
Expand Down Expand Up @@ -49,4 +51,4 @@ notifications:
channels:
- "chat.freenode.net#jsonnet"
template:
- "%{repository}/%{branch} (%{commit} - %{author}): %{message}"
- "%{repository}/%{branch} (%{commit} - %{author}): %{message}"
46 changes: 44 additions & 2 deletions CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -29,8 +29,50 @@ set(CMAKE_RUNTIME_OUTPUT_DIRECTORY ${GLOBAL_OUTPUT_PATH})
set(CMAKE_LIBRARY_OUTPUT_DIRECTORY ${GLOBAL_OUTPUT_PATH})
set(CMAKE_ARCHIVE_OUTPUT_DIRECTORY ${GLOBAL_OUTPUT_PATH})

# Include external RE2 project. This runs a CMake sub-script
# (RE2CMakeLists.txt.in) that downloads googletest source. It's then built as part
# of the jsonnet project. The conventional way of handling CMake dependencies is
# to use a find_package script, which finds and installs the library from
# known locations on the local machine. Downloading the library ourselves
# allows us to pin to a specific version and makes things easier for users
# who don't have package managers.

# Generate and download RE2 project.
set(RE2_DIR ${GLOBAL_OUTPUT_PATH}/re2-download)
configure_file(RE2CMakeLists.txt.in ${RE2_DIR}/CMakeLists.txt)
execute_process(COMMAND ${CMAKE_COMMAND} -G "${CMAKE_GENERATOR}" .
RESULT_VARIABLE result
WORKING_DIRECTORY ${RE2_DIR}
)
if(result)
message(FATAL_ERROR "RE2 download failed: ${result}")
endif()

# Build RE2.
execute_process(COMMAND ${CMAKE_COMMAND} --build .
RESULT_VARIABLE result
WORKING_DIRECTORY ${RE2_DIR})
if(result)
message(FATAL_ERROR "Build step for re2 failed: ${result}")
endif()

# Add RE2 directly to our build. This defines
# the re2 target.
add_subdirectory(${GLOBAL_OUTPUT_PATH}/re2-src
${GLOBAL_OUTPUT_PATH}/re2-build)

# Include RE2 headers.
include_directories("${RE2_SOURCE_DIR}/include")

# Allow linking into a shared library.
set_property(TARGET re2 PROPERTY POSITION_INDEPENDENT_CODE ON)

# RE2 requires pthreads
set_property(TARGET re2 PROPERTY INTERFACE_COMPILE_OPTIONS $<${UNIX}:-pthread>)
set_property(TARGET re2 PROPERTY INTERFACE_LINK_LIBRARIES $<${UNIX}:-pthread>)

# Include external googletest project. This runs a CMake sub-script
# (CMakeLists.txt.in) that downloads googletest source. It's then built as part
# (GoogleTestCMakeLists.txt.in) that downloads googletest source. It's then built as part
# of the jsonnet project. The conventional way of handling CMake dependencies is
# to use a find_package script, which finds and installs the library from
# known locations on the local machine. Downloading the library ourselves
Expand All @@ -41,7 +83,7 @@ if (BUILD_TESTS AND NOT USE_SYSTEM_GTEST)

# Generate and download googletest project.
set(GOOGLETEST_DIR ${GLOBAL_OUTPUT_PATH}/googletest-download)
configure_file(CMakeLists.txt.in ${GOOGLETEST_DIR}/CMakeLists.txt)
configure_file(GoogleTestCMakeLists.txt.in ${GOOGLETEST_DIR}/CMakeLists.txt)
execute_process(COMMAND ${CMAKE_COMMAND} -G "${CMAKE_GENERATOR}" .
RESULT_VARIABLE result
WORKING_DIRECTORY ${GOOGLETEST_DIR}
Expand Down
File renamed without changes.
6 changes: 3 additions & 3 deletions Makefile
Original file line number Diff line number Diff line change
Expand Up @@ -34,7 +34,7 @@ CFLAGS ?= -g $(OPT) -Wall -Wextra -pedantic -std=c99 -fPIC -Iinclude
MAKEDEPENDFLAGS ?= -Iinclude -Ithird_party/md5 -Ithird_party/json
EMCXXFLAGS = $(CXXFLAGS) -g0 -Os --memory-init-file 0 -s DISABLE_EXCEPTION_CATCHING=0 -s OUTLINING_LIMIT=10000 -s RESERVED_FUNCTION_POINTERS=20 -s ASSERTIONS=1 -s ALLOW_MEMORY_GROWTH=1
EMCFLAGS = $(CFLAGS) --memory-init-file 0 -s DISABLE_EXCEPTION_CATCHING=0 -s ASSERTIONS=1 -s ALLOW_MEMORY_GROWTH=1
LDFLAGS ?=
LDFLAGS ?= -lre2

SHARED_LDFLAGS ?= -shared

Expand Down Expand Up @@ -121,11 +121,11 @@ core/desugarer.cpp: core/std.jsonnet.h

# Commandline executable.
jsonnet: cmd/jsonnet.cpp cmd/utils.cpp $(LIB_OBJ)
$(CXX) $(CXXFLAGS) $(LDFLAGS) $< cmd/utils.cpp $(LIB_SRC:.cpp=.o) -o $@
$(CXX) $(CXXFLAGS) $< cmd/utils.cpp $(LIB_SRC:.cpp=.o) -o $@ $(LDFLAGS)

# Commandline executable (reformatter).
jsonnetfmt: cmd/jsonnetfmt.cpp cmd/utils.cpp $(LIB_OBJ)
$(CXX) $(CXXFLAGS) $(LDFLAGS) $< cmd/utils.cpp $(LIB_SRC:.cpp=.o) -o $@
$(CXX) $(CXXFLAGS) $< cmd/utils.cpp $(LIB_SRC:.cpp=.o) -o $@ $(LDFLAGS)

# C binding.
libjsonnet.so: $(LIB_OBJ)
Expand Down
18 changes: 18 additions & 0 deletions RE2CMakeLists.txt.in
Original file line number Diff line number Diff line change
@@ -0,0 +1,18 @@
# CMake script run a generation-time. This must be separate from the main
# CMakeLists.txt file to allow downloading and building googletest at generation
# time.
cmake_minimum_required(VERSION 2.8.2)

project(re2-download NONE)

include(ExternalProject)
ExternalProject_Add(re2
GIT_REPOSITORY https://github.com/google/re2.git
GIT_TAG 2019-06-01
SOURCE_DIR "${GLOBAL_OUTPUT_PATH}/re2-src"
BINARY_DIR "${GLOBAL_OUTPUT_PATH}/re2-build"
CONFIGURE_COMMAND ""
BUILD_COMMAND ""
INSTALL_COMMAND ""
TEST_COMMAND ""
)
10 changes: 9 additions & 1 deletion WORKSPACE
Original file line number Diff line number Diff line change
Expand Up @@ -12,11 +12,19 @@ git_repository(
git_repository(
name = "com_google_googletest",
remote = "https://github.com/google/googletest.git",
# If updating googletest version, also update CMakeLists.txt.in.
# If updating googletest version, also update GoogleTestCMakeLists.txt.in.
commit = "2fe3bd994b3189899d93f1d5a881e725e046fdc2", # release: release-1.8.1
shallow_since = "1535728917 -0400",
)

git_repository(
name = "com_googlesource_code_re2",
remote = "https://github.com/google/re2.git",
# If updating RE2 version, also update RE2CMakeLists.txt.in.
commit = "0c95bcce2f1f0f071a786ca2c42384b211b8caba", # release: 2019-06-01
shallow_since = "1558525654 +0000",
)

load("//tools/build_defs:python_repo.bzl", "python_interpreter")

python_interpreter(name = "default_python")
1 change: 1 addition & 0 deletions core/BUILD
Original file line number Diff line number Diff line change
Expand Up @@ -36,6 +36,7 @@ cc_library(
"//stdlib:std",
"//third_party/json",
"//third_party/md5:libmd5",
"@com_googlesource_code_re2//:re2",
],
)

Expand Down
8 changes: 4 additions & 4 deletions core/CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -29,8 +29,8 @@ set(LIBJSONNET_SOURCE
vm.cpp)

add_library(libjsonnet SHARED ${LIBJSONNET_HEADERS} ${LIBJSONNET_SOURCE})
add_dependencies(libjsonnet md5 stdlib)
target_link_libraries(libjsonnet md5)
add_dependencies(libjsonnet md5 re2 stdlib)
target_link_libraries(libjsonnet md5 re2)

# CMake prepends CMAKE_SHARED_LIBRARY_PREFIX to shared libraries, so without
# this step the output would be |liblibjsonnet|.
Expand All @@ -45,8 +45,8 @@ install(TARGETS libjsonnet

# Static library for jsonnet command-line tool.
add_library(libjsonnet_static STATIC ${LIBJSONNET_SOURCE})
add_dependencies(libjsonnet_static md5 stdlib)
target_link_libraries(libjsonnet_static md5)
add_dependencies(libjsonnet_static md5 re2 stdlib)
target_link_libraries(libjsonnet_static md5 re2)
set_target_properties(libjsonnet_static PROPERTIES OUTPUT_NAME jsonnet)
install(TARGETS libjsonnet_static DESTINATION "${CMAKE_INSTALL_LIBDIR}")

Expand Down
7 changes: 6 additions & 1 deletion core/desugarer.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -34,7 +34,7 @@ struct BuiltinDecl {
std::vector<UString> params;
};

static unsigned long max_builtin = 37;
static unsigned long max_builtin = 42;
BuiltinDecl jsonnet_builtin_decl(unsigned long builtin)
{
switch (builtin) {
Expand Down Expand Up @@ -76,6 +76,11 @@ BuiltinDecl jsonnet_builtin_decl(unsigned long builtin)
case 35: return {U"parseJson", {U"str"}};
case 36: return {U"encodeUTF8", {U"str"}};
case 37: return {U"decodeUTF8", {U"arr"}};
case 38: return {U"regexFullMatch", {U"pattern", U"str"}};
case 39: return {U"regexPartialMatch", {U"pattern", U"str"}};
case 40: return {U"regexQuoteMeta", {U"str"}};
case 41: return {U"regexReplace", {U"str", U"pattern", U"to"}};
case 42: return {U"regexGlobalReplace", {U"str", U"pattern", U"to"}};
default:
std::cerr << "INTERNAL ERROR: Unrecognized builtin function: " << builtin << std::endl;
std::abort();
Expand Down
133 changes: 133 additions & 0 deletions core/vm.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -26,6 +26,7 @@ limitations under the License.
#include "json.hpp"
#include "md5.h"
#include "parser.h"
#include "re2/re2.h"
#include "state.h"
#include "static_analysis.h"
#include "string_utils.h"
Expand All @@ -35,6 +36,10 @@ using json = nlohmann::json;

namespace {

static const Fodder EF; // Empty fodder.

static const LocationRange E; // Empty.

/** Turn a path e.g. "/a/b/c" into a dir, e.g. "/a/b/". If there is no path returns "".
*/
std::string dir_name(const std::string &path)
Expand Down Expand Up @@ -881,6 +886,11 @@ class Interpreter {
builtins["parseJson"] = &Interpreter::builtinParseJson;
builtins["encodeUTF8"] = &Interpreter::builtinEncodeUTF8;
builtins["decodeUTF8"] = &Interpreter::builtinDecodeUTF8;
builtins["regexFullMatch"] = &Interpreter::builtinRegexFullMatch;
builtins["regexPartialMatch"] = &Interpreter::builtinRegexPartialMatch;
builtins["regexQuoteMeta"] = &Interpreter::builtinRegexQuoteMeta;
builtins["regexReplace"] = &Interpreter::builtinRegexReplace;
builtins["regexGlobalReplace"] = &Interpreter::builtinRegexGlobalReplace;
}

/** Clean up the heap, stack, stash, and builtin function ASTs. */
Expand Down Expand Up @@ -1373,6 +1383,129 @@ class Interpreter {
return decodeUTF8();
}

const AST *regexMatch(const std::string &pattern, const std::string &string, bool full)
{
RE2 re(pattern, RE2::CannedOptions::Quiet);
if (!re.ok()) {
std::stringstream ss;
ss << "Invalid regex '" << re.pattern() << "': " << re.error();
throw makeError(stack.top().location, ss.str());
}

int num_groups = re.NumberOfCapturingGroups();

std::vector<std::string> rcaptures(num_groups);
std::vector<RE2::Arg> rargv(num_groups);
std::vector<const RE2::Arg*> rargs(num_groups);
for (int i = 0; i < num_groups; ++i) {
rargs[i] = &rargv[i];
rargv[i] = &rcaptures[i];
}

if (full ? RE2::FullMatchN(string, re, rargs.data(), num_groups)
: RE2::PartialMatchN(string, re, rargs.data(), num_groups)) {
std::map<const Identifier *, HeapSimpleObject::Field> fields;

const Identifier *fid = alloc->makeIdentifier(U"string");
fields[fid].hide = ObjectField::VISIBLE;
fields[fid].body = alloc->make<LiteralString>(E, EF, decode_utf8(string), LiteralString::DOUBLE, "", "");

fid = alloc->makeIdentifier(U"captures");
fields[fid].hide = ObjectField::VISIBLE;
std::vector<Array::Element> captures;
for (int i = 0; i < num_groups; ++i) {
captures.push_back(Array::Element(
alloc->make<LiteralString>(E, EF, decode_utf8(rcaptures[i]), LiteralString::DOUBLE, "", ""),
EF));
}
fields[fid].body = alloc->make<Array>(E, EF, captures, false, EF);

fid = alloc->makeIdentifier(U"namedCaptures");
fields[fid].hide = ObjectField::VISIBLE;
DesugaredObject::Fields named_captures;
const std::map<std::string, int> &named_groups = re.NamedCapturingGroups();
for (auto it = named_groups.cbegin(); it != named_groups.cend(); ++it) {
named_captures.push_back(DesugaredObject::Field(
ObjectField::VISIBLE,
alloc->make<LiteralString>(E, EF, decode_utf8(it->first), LiteralString::DOUBLE, "", ""),
alloc->make<LiteralString>(E, EF, decode_utf8(rcaptures[it->second-1]), LiteralString::DOUBLE, "", "")));
}
fields[fid].body = alloc->make<DesugaredObject>(E, ASTs{}, named_captures);

scratch = makeObject<HeapSimpleObject>(BindingFrame{}, fields, ASTs{});
} else {
scratch = makeNull();
}
return nullptr;
}

const AST *builtinRegexFullMatch(const LocationRange &loc, const std::vector<Value> &args)
{
validateBuiltinArgs(loc, "regexFullMatch", args, {Value::STRING, Value::STRING});

std::string pattern = encode_utf8(static_cast<HeapString *>(args[0].v.h)->value);
std::string string = encode_utf8(static_cast<HeapString *>(args[1].v.h)->value);

return regexMatch(pattern, string, true);
}

const AST *builtinRegexPartialMatch(const LocationRange &loc, const std::vector<Value> &args)
{
validateBuiltinArgs(loc, "regexPartialMatch", args, {Value::STRING, Value::STRING});

std::string pattern = encode_utf8(static_cast<HeapString *>(args[0].v.h)->value);
std::string string = encode_utf8(static_cast<HeapString *>(args[1].v.h)->value);

return regexMatch(pattern, string, false);
}

const AST *builtinRegexQuoteMeta(const LocationRange &loc, const std::vector<Value> &args)
{
validateBuiltinArgs(loc, "regexQuoteMeta", args, {Value::STRING});
scratch = makeString(decode_utf8(RE2::QuoteMeta(encode_utf8(static_cast<HeapString *>(args[0].v.h)->value))));
return nullptr;
}

const AST *builtinRegexReplace(const LocationRange &loc, const std::vector<Value> &args)
{
validateBuiltinArgs(loc, "regexReplace", args, {Value::STRING, Value::STRING, Value::STRING});

std::string string = encode_utf8(static_cast<HeapString *>(args[0].v.h)->value);
std::string pattern = encode_utf8(static_cast<HeapString *>(args[1].v.h)->value);
std::string replace = encode_utf8(static_cast<HeapString *>(args[2].v.h)->value);

RE2 re(pattern, RE2::CannedOptions::Quiet);
if(!re.ok()) {
std::stringstream ss;
ss << "Invalid regex '" << re.pattern() << "': " << re.error();
throw makeError(stack.top().location, ss.str());
}

RE2::Replace(&string, re, replace);
scratch = makeString(decode_utf8(string));
return nullptr;
}

const AST *builtinRegexGlobalReplace(const LocationRange &loc, const std::vector<Value> &args)
{
validateBuiltinArgs(loc, "regexGlobalReplace", args, {Value::STRING, Value::STRING, Value::STRING});

std::string string = encode_utf8(static_cast<HeapString *>(args[0].v.h)->value);
std::string pattern = encode_utf8(static_cast<HeapString *>(args[1].v.h)->value);
std::string replace = encode_utf8(static_cast<HeapString *>(args[2].v.h)->value);

RE2 re(pattern, RE2::CannedOptions::Quiet);
if(!re.ok()) {
std::stringstream ss;
ss << "Invalid regex '" << re.pattern() << "': " << re.error();
throw makeError(stack.top().location, ss.str());
}

RE2::GlobalReplace(&string, re, replace);
scratch = makeString(decode_utf8(string));
return nullptr;
}

const AST *builtinTrace(const LocationRange &loc, const std::vector<Value> &args)
{
if(args[0].t != Value::STRING) {
Expand Down
Loading