From e3d1449533133ea973610395e34826e912162179 Mon Sep 17 00:00:00 2001 From: David Zero Date: Mon, 11 Sep 2023 20:44:40 -0400 Subject: [PATCH] deps: Remove ICU data dep Builds ICU data into a static library as part of the build instead of fetching a binary data release archive, and links the library into the URL lib. --- .bazelrc | 6 +- WORKSPACE | 16 ++-- third_party/icu.BUILD | 166 ++++++++++++++++++++++++++++++++++++++++-- url/BUILD | 14 +++- url/url.cpp | 35 --------- 5 files changed, 182 insertions(+), 55 deletions(-) diff --git a/.bazelrc b/.bazelrc index 6431de17e..c92d11366 100644 --- a/.bazelrc +++ b/.bazelrc @@ -2,7 +2,8 @@ # ========================================================= build --enable_platform_specific_config -build --test_env=HASTUR_ICU_DATA=external/icu-data/ +build --subcommands +build --verbose_failures coverage --combined_report=lcov test --test_output=errors test --test_summary=terse @@ -30,11 +31,14 @@ build:linux --cxxopt='-fno-rtti' build:linux --copt='-gdwarf-4' build:windows --enable_runfiles +build:windows --action_env=LOCALAPPDATA +build:windows --action_env=ProgramData build:windows --cxxopt='/std:c++latest' build:windows --cxxopt='/GR-' # Disable rtti. build:windows --copt='/permissive-' # Conform to the standard. build:windows --copt='/Zc:__cplusplus' # Report the real supported C++ version, not just C++98. build:windows --copt='-utf-8' # Use UTF-8 as the source and execution character sets. +build:windows --host_copt='-utf-8' # Use UTF-8 as the source and execution character sets. 
# Special build options # ========================================================= diff --git a/WORKSPACE b/WORKSPACE index 755cfca0a..3b89c969e 100644 --- a/WORKSPACE +++ b/WORKSPACE @@ -126,18 +126,12 @@ http_archive( patch_cmds = [ "rm source/common/BUILD.bazel", "rm source/stubdata/BUILD.bazel", + "rm source/tools/toolutil/BUILD.bazel", + "rm source/i18n/BUILD.bazel", ], - sha256 = "818a80712ed3caacd9b652305e01afc7fa167e6f2e94996da44b90c2ab604ce1", - strip_prefix = "icu", - url = "https://github.com/unicode-org/icu/releases/download/release-73-2/icu4c-73_2-src.tgz", -) - -# https://github.com/unicode-org/icu -http_archive( - name = "icu-data", # Unicode-DFS-2016 - build_file_content = """exports_files(["icudt73l.dat"])""", - sha256 = "2657bd18c23b930ddf63f466192832cc083256515e07b5a5e7d79c5c1db058a1", - url = "https://github.com/unicode-org/icu/releases/download/release-73-2/icu4c-73_2-data-bin-l.zip", + sha256 = "4b6c4a79b0648d228d505601e58780a59e9ad4eaad54be75cc637bd635aa46d6", + strip_prefix = "icu-release-73-2/icu4c", + url = "https://github.com/unicode-org/icu/archive/refs/tags/release-73-2.zip", ) # https://github.com/ocornut/imgui diff --git a/third_party/icu.BUILD b/third_party/icu.BUILD index bbec5c462..17cf262bd 100644 --- a/third_party/icu.BUILD +++ b/third_party/icu.BUILD @@ -1,4 +1,9 @@ -load("@rules_cc//cc:defs.bzl", "cc_library") +load("@rules_cc//cc:defs.bzl", "cc_binary", "cc_library") + +config_setting( + name = "is_ubsan_build", + values = {"features": "ubsan"}, +) cc_library( name = "common", @@ -9,6 +14,7 @@ cc_library( "source/stubdata/*.cpp", ]), hdrs = glob([ + "source/common/*.h", "source/common/unicode/*.h", ]), copts = select({ @@ -33,10 +39,6 @@ cc_library( }), defines = [ "U_STATIC_IMPLEMENTATION", - "U_COMMON_IMPLEMENTATION", - "U_CHARSET_IS_UTF8=1", - "U_HIDE_OBSOLETE_UTF_OLD_H=1", - "UCONFIG_NO_CONVERSION=1", ], linkopts = select({ "@platforms//os:windows": [ @@ -45,6 +47,160 @@ cc_library( "//conditions:default": ["-ldl"], }), 
linkstatic = True, + local_defines = [ + "U_COMMON_IMPLEMENTATION", + "U_CHARSET_IS_UTF8=1", + "U_HIDE_OBSOLETE_UTF_OLD_H=1", + ], strip_include_prefix = "source/common/", visibility = ["//visibility:public"], ) + +cc_library( + name = "toolutil", + srcs = glob(["source/tools/toolutil/*.cpp"]), + hdrs = glob(["source/tools/toolutil/*.h"]), + copts = select({ + "@platforms//os:windows": [ + "/GR", + ], + "//conditions:default": [ + "-frtti", + ], + }), + linkstatic = True, + local_defines = ["U_TOOLUTIL_IMPLEMENTATION"] + select({ + "@platforms//os:windows": [], + "//conditions:default": [ + "U_ELF", + ], + }), + strip_include_prefix = "source/tools/toolutil", + visibility = ["//visibility:private"], + deps = [ + ":common", + ":i18n", + ], +) + +cc_library( + name = "i18n", + srcs = glob(["source/i18n/*.cpp"]), + hdrs = glob([ + "source/i18n/*.h", + "source/i18n/unicode/*.h", + ]), + copts = select({ + "@platforms//os:windows": [ + "/GR", + ], + "//conditions:default": [ + "-frtti", + ], + }), + linkstatic = True, + local_defines = [ + "U_I18N_IMPLEMENTATION", + ], + strip_include_prefix = "source/i18n", + visibility = ["//visibility:private"], + deps = [":common"], +) + +cc_binary( + name = "gensprep", + srcs = glob(["source/tools/gensprep/*.c"]) + ["source/tools/gensprep/gensprep.h"], + visibility = ["//visibility:private"], + deps = [ + ":common", + ":i18n", + ":toolutil", + ], +) + +SPREP_DATA = glob(["source/data/sprep/*.txt"]) + +SPREP_DATA_COMPILED = [s.replace("txt", "spp").rpartition("/")[2] for s in SPREP_DATA] + +filegroup( + name = "normalizations", + srcs = ["source/data/unidata/NormalizationCorrections.txt"], +) + +[genrule( + name = "run_sprep_" + input.replace(".txt", "").rpartition("/")[2], + srcs = [input], + outs = [input.replace("txt", "spp").rpartition("/")[2]], + cmd = "./$(location gensprep) -d $(RULEDIR) -b " + input.replace(".txt", "").rpartition("/")[2] + " -m external/icu/source/data/unidata/ -u 3.2.0 $<", + tools = [ + ":gensprep", + 
":normalizations", + ], + visibility = ["//visibility:public"], +) for input in SPREP_DATA] + +genrule( + name = "create_pkgdata_lst", + srcs = SPREP_DATA_COMPILED, + outs = ["pkgdata.lst"], + cmd = "echo -e \"" + "\\n".join(SPREP_DATA_COMPILED) + "\" > $(RULEDIR)/pkgdata.lst && echo uts46.nrm >> $(RULEDIR)/pkgdata.lst", +) + +genrule( + name = "move uts46.nrm", + srcs = ["source/data/in/uts46.nrm"], + outs = ["uts46.nrm"], + cmd = "cp $< $(RULEDIR)", +) + +cc_binary( + name = "icupkg", + srcs = ["source/tools/icupkg/icupkg.cpp"], + visibility = ["//visibility:private"], + deps = [ + ":common", + ":i18n", + ":toolutil", + ], +) + +cc_binary( + name = "pkgdata", + srcs = [ + "source/tools/pkgdata/pkgdata.cpp", + "source/tools/pkgdata/pkgtypes.c", + "source/tools/pkgdata/pkgtypes.h", + ], + copts = select({ + ":is_ubsan_build": [ + "-Wl,--whole-archive", + "-L/usr/lib/clang/15/lib/linux/", + "-lclang_rt.ubsan_standalone_cxx-x86_64", + "-Wl,--no-whole-archive", + ], + "//conditions:default": [], + }), + visibility = ["//visibility:private"], + deps = [ + ":common", + ":i18n", + ":toolutil", + ], +) + +genrule( + name = "run_pkgdata", + srcs = [ + "pkgdata.lst", + "uts46.nrm", + ] + SPREP_DATA_COMPILED, + outs = ["libicudt73l.a"], + #cmd = r"srcs=($(SRCS)); PATH=$$PATH:$(location icupkg):`'C:\Program Files (x86)\Microsoft Visual Studio\Installer\vswhere.exe' -latest -find **\lib.exe | head -n1 | sed 's/\\[^\\]*$$/\\/'` $(location pkgdata) -e icudt73 -s $(RULEDIR) -d $(RULEDIR) -p icudt73l -m static $${srcs[0]}", + #cmd = r"srcs=($(SRCS)); PATH=$$PATH:$(location icupkg):`'C:\Program Files (x86)\Microsoft Visual Studio\Installer\vswhere.exe' -latest -find **\lib.exe | head -n1 | rev | cut -d '\' -f2- | rev` $(location pkgdata) -e icudt73 -s $(RULEDIR) -d $(RULEDIR) -p icudt73l -m static $${srcs[0]}", + cmd = r"""srcs=($(SRCS)); export PATH=$$PATH:$(location icupkg):"/$$('C:\Program Files (x86)\Microsoft Visual Studio\Installer\vswhere.exe' -latest -find '**\lib.exe' | 
head -n1 | awk -F '\' 'BEGIN{OFS=FS} {$$NF=""; print}' | tr -d ':' | tr '\134' '/')"; $(location pkgdata) -e icudt73 -s $(RULEDIR) -d $(RULEDIR) -p icudt73l -m static $${srcs[0]}""", + tools = [ + ":icupkg", + ":pkgdata", + ], + visibility = ["//visibility:public"], +) diff --git a/url/BUILD b/url/BUILD index df4e278e9..b55757ef6 100644 --- a/url/BUILD +++ b/url/BUILD @@ -1,4 +1,4 @@ -load("@rules_cc//cc:defs.bzl", "cc_library", "cc_test") +load("@rules_cc//cc:defs.bzl", "cc_import", "cc_library", "cc_test") load("@rules_fuzzing//fuzzing:cc_defs.bzl", "cc_fuzz_test") load("//bzl:copts.bzl", "HASTUR_COPTS", "HASTUR_FUZZ_PLATFORMS") @@ -13,14 +13,22 @@ cc_library( deps = ["@icu//:common"], ) +cc_import( + name = "icudata", + static_library = "@icu//:libicudt73l.a", + alwayslink = True, +) + cc_library( name = "url", - srcs = ["url.cpp"], + srcs = [ + "url.cpp", + ], hdrs = ["url.h"], copts = HASTUR_COPTS, - data = ["@icu-data//:icudt73l.dat"], visibility = ["//visibility:public"], deps = [ + ":icudata", ":rtti_hack", "//util:base_parser", "//util:string", diff --git a/url/url.cpp b/url/url.cpp index 9d0ea9e4c..599b6ed12 100644 --- a/url/url.cpp +++ b/url/url.cpp @@ -118,39 +118,6 @@ struct PercentEncodeSet { static constexpr bool component(char c) { return userinfo(c) || (c >= '$' && c <= '&') || c == '+' || c == ','; } }; -void icu_init() { - static std::atomic called_once = false; - - if (called_once.exchange(true)) { - return; - } - - // NOLINTNEXTLINE(concurrency-mt-unsafe): This is going away soon. - char *data = std::getenv("HASTUR_ICU_DATA"); - - if (data != nullptr) { - std::filesystem::path env_path{data}; - - if (std::filesystem::is_directory(env_path)) { - u_setDataDirectory(env_path.string().c_str()); - } - } else { - // Use current working directory as a last resort. - // TODO(zero-one): Look at engine config for paths. 
- u_setDataDirectory(std::filesystem::current_path().string().c_str()); - } - - UErrorCode err = U_ZERO_ERROR; - - std::uint32_t opts = - UIDNA_NONTRANSITIONAL_TO_ASCII | UIDNA_CHECK_BIDI | UIDNA_CHECK_CONTEXTJ | UIDNA_USE_STD3_RULES; - - [[maybe_unused]] auto *uts = icu::IDNA::createUTS46Instance(opts, err); - - assert(!U_FAILURE(err)); - - delete uts; -} } // namespace void icu_cleanup() { @@ -1189,8 +1156,6 @@ void UrlParser::state_fragment() { // https://url.spec.whatwg.org/#concept-domain-to-ascii std::optional UrlParser::domain_to_ascii(std::string_view domain, bool be_strict) const { - icu_init(); - std::string ascii_domain; icu::StringByteSink tmp{&ascii_domain};