Skip to content

Commit

Permalink
test-utf8: UTF-8 parser test program
Browse files Browse the repository at this point in the history
  • Loading branch information
imaami committed Aug 16, 2024
1 parent f596f79 commit 542267c
Show file tree
Hide file tree
Showing 3 changed files with 235 additions and 1 deletion.
1 change: 1 addition & 0 deletions .gitignore
Original file line number Diff line number Diff line change
Expand Up @@ -14,3 +14,4 @@
/.vscode
/build
/src/test
/src/test-utf8
5 changes: 4 additions & 1 deletion src/Makefile
Original file line number Diff line number Diff line change
@@ -1,6 +1,6 @@
override THIS_DIR := $(dir $(realpath $(lastword $(MAKEFILE_LIST))))

override TARGETS := test
override TARGETS := test test-utf8

all:| $(TARGETS)
clean:| $(TARGETS:%=clean-%)
Expand All @@ -23,6 +23,9 @@ override DBG_test := dbg.c
override LIBS_test = $(CJSON_LIBS)
override CFLAGS_test.c = $(CJSON_CFLAGS)

override SRC_test-utf8 := letopt.c test-utf8.c utf8.c
override DBG_test-utf8 := dbg.c

$(call pkg_libs,libcjson,CJSON_LIBS,-lcjson)
$(call pkg_cflags,libcjson,CJSON_CFLAGS)
$(call target_rules,$(TARGETS))
Expand Down
230 changes: 230 additions & 0 deletions src/test-utf8.c
Original file line number Diff line number Diff line change
@@ -0,0 +1,230 @@
/* SPDX-License-Identifier: LGPL-3.0-or-later */
/** @file test-utf8.c
*
* @author Juuso Alasuutari
*/
#include <errno.h>
#include <inttypes.h>

#define PROGNAME "test-utf8"
#define SYNOPSIS "[OPTION]... [--] [STRING]..."
#define PURPOSE "Inspect UTF-8 encoded strings"

#define OPTIONS(X) \
X(boolean, help, 'h', "help", \
"print this help text and exit") \
\
X(boolean, bytes, 'b', "bytes", \
"count bytes inside valid UTF-8") \
\
X(boolean, chars, 'c', "chars", \
"count valid unicode characters") \
\
X(boolean, join, 'j', "join", \
"treat arguments as one string") \
\
X(boolean, print, 'p', "print", \
"print strings even when counting") \
\
X(boolean, skip, 's', "skip", \
"skip invalid UTF-8, don't replace") \
\
X(boolean, quiet, 'q', "quiet", \
"report invalid UTF-8 via exit code")

#define DETAILS \
"The default behaviour without option arguments is to\n" \
"print input strings on different lines, substituting\n" \
"the replacement symbol U+FFFD for invalid sequences."

#include "letopt.h"

#undef DETAILS
#undef OPTIONS
#undef PURPOSE
#undef SYNOPSIS
#undef PROGNAME

#include "dbg.h"
#include "utf8.h"

static const_inline size_t
saturated_add_uz (size_t a,
size_t b)
{
if ((a += b) < b)
a = SIZE_MAX;
return a;
}

static struct uz2 {
size_t max;
size_t sum;
} arg_sizes (struct letopt const *opt)
{
struct uz2 ret = {0};
for (int i = 0; i < letopt_nargs(opt); ++i) {
size_t n = strlen(letopt_arg(opt, i));
ret.sum = saturated_add_uz(ret.sum, n);
if (n > ret.max)
ret.max = n;
}
return ret;
}

static int
arg_conflict (struct letopt *opt);

int
main (int c,
char **v)
{
struct letopt opt = letopt_init(c, v);

if (arg_conflict(&opt))
return letopt_fini(&opt);

if (letopt_nargs(&opt) < 1 || opt.m_help)
letopt_helpful_exit(&opt);

struct uz2 sz = arg_sizes(&opt);
size_t buf_sz = opt.m_join ? sz.sum : sz.max;
buf_sz = saturated_add_uz(
saturated_add_uz(buf_sz, 2U),
saturated_add_uz(buf_sz, buf_sz));

if (buf_sz == SIZE_MAX) {
pr_err_("%s", strerror(EOVERFLOW));
(void)letopt_fini(&opt);
return EXIT_FAILURE;
}

char *buf = malloc(buf_sz);
if (!buf) {
pr_errno_(errno, "malloc(%zu)", buf_sz);
(void)letopt_fini(&opt);
return EXIT_FAILURE;
}

const bool count = opt.m_bytes || opt.m_chars;
const bool print = !opt.m_quiet && (opt.m_print || !count);
size_t n_chars = 0, n_bytes = 0;
struct utf8 u8p = utf8();
char *out = buf;

for (int i = 0; i < letopt_nargs(&opt); ++i) {
uint8_t const *p = (uint8_t const *)letopt_arg(&opt, i);
for (uint8_t const *q = p; *q; q = p) {
p = utf8_parse_next_code_point(&u8p, q);
if (u8p.error) {
if (print &&
!opt.m_skip && (*p || !opt.m_join)) {
*out++ = (char)(unsigned char)0xefU;
*out++ = (char)(unsigned char)0xbfU;
*out++ = (char)(unsigned char)0xbdU;
}
if (!*p)
break;
if (p == q && utf8_expects_leading_byte(&u8p))
++p;
utf8_reset(&u8p);
continue;
}

if (opt.m_bytes)
n_bytes += utf8_size(&u8p);
if (opt.m_chars)
n_chars += 1U;

if (print) {
for (char const *s = utf8_result(&u8p); *s;) {
*out++ = *s++;
}
}
}

if (!opt.m_join) {
if (print) {
if (out == buf || *(out - 1) != '\n')
*out++ = '\n';
*out = '\0';
out = buf;

if (!count)
(void)fputs(buf, stdout);
else if (!opt.m_bytes || !opt.m_chars)
(void)printf("%zu\t%s", opt.m_bytes
? n_bytes : n_chars, buf);
else
(void)printf("%zu\t%zu\t%s", n_chars,
n_bytes, buf);

} else if (count) {
if (!opt.m_bytes || !opt.m_chars)
(void)printf("%zu\n", opt.m_bytes
? n_bytes : n_chars);
else
(void)printf("%zu\t%zu\n", n_chars,
n_bytes);
}

n_chars = 0;
n_bytes = 0;
utf8_reset(&u8p);
}
}

if (opt.m_join) {
if (print) {
if (out == buf || *(out - 1) != '\n')
*out++ = '\n';
*out = '\0';

if (!count)
(void)fputs(buf, stdout);
else if (!opt.m_bytes || !opt.m_chars)
(void)printf("%zu\t%s", opt.m_bytes
? n_bytes : n_chars, buf);
else
(void)printf("%zu\t%zu\t%s", n_chars,
n_bytes, buf);

} else if (count) {
if (!opt.m_bytes || !opt.m_chars)
(void)printf("%zu\n", opt.m_bytes
? n_bytes : n_chars);
else
(void)printf("%zu\t%zu\n", n_chars,
n_bytes);
}
}

free(buf);
return letopt_fini(&opt);
}

static int
arg_conflict (struct letopt *opt)
{
bool count = opt->m_bytes || opt->m_chars;
int e = 0;

if (opt->m_quiet) {
if (count) {
pr_err_("can't count quietly");
e = EINVAL;
}

if (opt->m_print) {
pr_err_("can't print quietly");
e = EINVAL;
}
}

if (opt->m_skip &&
(opt->m_quiet || (!opt->m_print && count)))
pr_wrn_("skip option ignored");

opt->p.e = e;
return e;
}

0 comments on commit 542267c

Please sign in to comment.