forked from imaami/libcanth
-
Notifications
You must be signed in to change notification settings - Fork 0
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
test-utf8: UTF-8 parser test program
- Loading branch information
Showing
3 changed files
with
235 additions
and
1 deletion.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -14,3 +14,4 @@ | |
/.vscode | ||
/build | ||
/src/test | ||
/src/test-utf8 |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,230 @@ | ||
/* SPDX-License-Identifier: LGPL-3.0-or-later */ | ||
/** @file test-utf8.c | ||
* | ||
* @author Juuso Alasuutari | ||
*/ | ||
#include <errno.h> | ||
#include <inttypes.h> | ||
|
||
#define PROGNAME "test-utf8" | ||
#define SYNOPSIS "[OPTION]... [--] [STRING]..." | ||
#define PURPOSE "Inspect UTF-8 encoded strings" | ||
|
||
#define OPTIONS(X) \ | ||
X(boolean, help, 'h', "help", \ | ||
"print this help text and exit") \ | ||
\ | ||
X(boolean, bytes, 'b', "bytes", \ | ||
"count bytes inside valid UTF-8") \ | ||
\ | ||
X(boolean, chars, 'c', "chars", \ | ||
"count valid unicode characters") \ | ||
\ | ||
X(boolean, join, 'j', "join", \ | ||
"treat arguments as one string") \ | ||
\ | ||
X(boolean, print, 'p', "print", \ | ||
"print strings even when counting") \ | ||
\ | ||
X(boolean, skip, 's', "skip", \ | ||
"skip invalid UTF-8, don't replace") \ | ||
\ | ||
X(boolean, quiet, 'q', "quiet", \ | ||
"report invalid UTF-8 via exit code") | ||
|
||
#define DETAILS \ | ||
"The default behaviour without option arguments is to\n" \ | ||
"print input strings on different lines, substituting\n" \ | ||
"the replacement symbol U+FFFD for invalid sequences." | ||
|
||
#include "letopt.h" | ||
|
||
#undef DETAILS | ||
#undef OPTIONS | ||
#undef PURPOSE | ||
#undef SYNOPSIS | ||
#undef PROGNAME | ||
|
||
#include "dbg.h" | ||
#include "utf8.h" | ||
|
||
static const_inline size_t | ||
saturated_add_uz (size_t a, | ||
size_t b) | ||
{ | ||
if ((a += b) < b) | ||
a = SIZE_MAX; | ||
return a; | ||
} | ||
|
||
static struct uz2 { | ||
size_t max; | ||
size_t sum; | ||
} arg_sizes (struct letopt const *opt) | ||
{ | ||
struct uz2 ret = {0}; | ||
for (int i = 0; i < letopt_nargs(opt); ++i) { | ||
size_t n = strlen(letopt_arg(opt, i)); | ||
ret.sum = saturated_add_uz(ret.sum, n); | ||
if (n > ret.max) | ||
ret.max = n; | ||
} | ||
return ret; | ||
} | ||
|
||
static int | ||
arg_conflict (struct letopt *opt); | ||
|
||
int | ||
main (int c, | ||
char **v) | ||
{ | ||
struct letopt opt = letopt_init(c, v); | ||
|
||
if (arg_conflict(&opt)) | ||
return letopt_fini(&opt); | ||
|
||
if (letopt_nargs(&opt) < 1 || opt.m_help) | ||
letopt_helpful_exit(&opt); | ||
|
||
struct uz2 sz = arg_sizes(&opt); | ||
size_t buf_sz = opt.m_join ? sz.sum : sz.max; | ||
buf_sz = saturated_add_uz( | ||
saturated_add_uz(buf_sz, 2U), | ||
saturated_add_uz(buf_sz, buf_sz)); | ||
|
||
if (buf_sz == SIZE_MAX) { | ||
pr_err_("%s", strerror(EOVERFLOW)); | ||
(void)letopt_fini(&opt); | ||
return EXIT_FAILURE; | ||
} | ||
|
||
char *buf = malloc(buf_sz); | ||
if (!buf) { | ||
pr_errno_(errno, "malloc(%zu)", buf_sz); | ||
(void)letopt_fini(&opt); | ||
return EXIT_FAILURE; | ||
} | ||
|
||
const bool count = opt.m_bytes || opt.m_chars; | ||
const bool print = !opt.m_quiet && (opt.m_print || !count); | ||
size_t n_chars = 0, n_bytes = 0; | ||
struct utf8 u8p = utf8(); | ||
char *out = buf; | ||
|
||
for (int i = 0; i < letopt_nargs(&opt); ++i) { | ||
uint8_t const *p = (uint8_t const *)letopt_arg(&opt, i); | ||
for (uint8_t const *q = p; *q; q = p) { | ||
p = utf8_parse_next_code_point(&u8p, q); | ||
if (u8p.error) { | ||
if (print && | ||
!opt.m_skip && (*p || !opt.m_join)) { | ||
*out++ = (char)(unsigned char)0xefU; | ||
*out++ = (char)(unsigned char)0xbfU; | ||
*out++ = (char)(unsigned char)0xbdU; | ||
} | ||
if (!*p) | ||
break; | ||
if (p == q && utf8_expects_leading_byte(&u8p)) | ||
++p; | ||
utf8_reset(&u8p); | ||
continue; | ||
} | ||
|
||
if (opt.m_bytes) | ||
n_bytes += utf8_size(&u8p); | ||
if (opt.m_chars) | ||
n_chars += 1U; | ||
|
||
if (print) { | ||
for (char const *s = utf8_result(&u8p); *s;) { | ||
*out++ = *s++; | ||
} | ||
} | ||
} | ||
|
||
if (!opt.m_join) { | ||
if (print) { | ||
if (out == buf || *(out - 1) != '\n') | ||
*out++ = '\n'; | ||
*out = '\0'; | ||
out = buf; | ||
|
||
if (!count) | ||
(void)fputs(buf, stdout); | ||
else if (!opt.m_bytes || !opt.m_chars) | ||
(void)printf("%zu\t%s", opt.m_bytes | ||
? n_bytes : n_chars, buf); | ||
else | ||
(void)printf("%zu\t%zu\t%s", n_chars, | ||
n_bytes, buf); | ||
|
||
} else if (count) { | ||
if (!opt.m_bytes || !opt.m_chars) | ||
(void)printf("%zu\n", opt.m_bytes | ||
? n_bytes : n_chars); | ||
else | ||
(void)printf("%zu\t%zu\n", n_chars, | ||
n_bytes); | ||
} | ||
|
||
n_chars = 0; | ||
n_bytes = 0; | ||
utf8_reset(&u8p); | ||
} | ||
} | ||
|
||
if (opt.m_join) { | ||
if (print) { | ||
if (out == buf || *(out - 1) != '\n') | ||
*out++ = '\n'; | ||
*out = '\0'; | ||
|
||
if (!count) | ||
(void)fputs(buf, stdout); | ||
else if (!opt.m_bytes || !opt.m_chars) | ||
(void)printf("%zu\t%s", opt.m_bytes | ||
? n_bytes : n_chars, buf); | ||
else | ||
(void)printf("%zu\t%zu\t%s", n_chars, | ||
n_bytes, buf); | ||
|
||
} else if (count) { | ||
if (!opt.m_bytes || !opt.m_chars) | ||
(void)printf("%zu\n", opt.m_bytes | ||
? n_bytes : n_chars); | ||
else | ||
(void)printf("%zu\t%zu\n", n_chars, | ||
n_bytes); | ||
} | ||
} | ||
|
||
free(buf); | ||
return letopt_fini(&opt); | ||
} | ||
|
||
static int | ||
arg_conflict (struct letopt *opt) | ||
{ | ||
bool count = opt->m_bytes || opt->m_chars; | ||
int e = 0; | ||
|
||
if (opt->m_quiet) { | ||
if (count) { | ||
pr_err_("can't count quietly"); | ||
e = EINVAL; | ||
} | ||
|
||
if (opt->m_print) { | ||
pr_err_("can't print quietly"); | ||
e = EINVAL; | ||
} | ||
} | ||
|
||
if (opt->m_skip && | ||
(opt->m_quiet || (!opt->m_print && count))) | ||
pr_wrn_("skip option ignored"); | ||
|
||
opt->p.e = e; | ||
return e; | ||
} |