From 2b6359734a9a91fed7eb2f9b22d5013d9b463d66 Mon Sep 17 00:00:00 2001 From: Dave Thaler Date: Thu, 22 Feb 2024 12:17:43 -0800 Subject: [PATCH] Add ABNF snippets for language tag and media type (#437) Extracted/derived from the RFCs referenced in the GEDCOM spec Signed-off-by: Dave Thaler Co-authored-by: Dave Thaler --- build/extract-grammars.py | 6 +++ extracted-files/core.abnf | 7 ++++ extracted-files/languagetag.abnf | 72 ++++++++++++++++++++++++++++++++ extracted-files/mediatype.abnf | 40 ++++++++++++++++++ 4 files changed, 125 insertions(+) create mode 100644 extracted-files/core.abnf create mode 100644 extracted-files/languagetag.abnf create mode 100644 extracted-files/mediatype.abnf diff --git a/build/extract-grammars.py b/build/extract-grammars.py index 523f8fee..7522017c 100644 --- a/build/extract-grammars.py +++ b/build/extract-grammars.py @@ -42,6 +42,12 @@ def get_paths(): header = line if '{' in header: header = header[:header.find('{')] header = header.strip('# \n\r\t') + with open('languagetag.abnf') as f: + abnf.append(f.read()) + with open('mediatype.abnf') as f: + abnf.append(f.read()) + with open('core.abnf') as f: + abnf.append(f.read()) with open(join(dst,'grammar.abnf'), 'w') as f: f.write('''; This document is in ABNF, see ; This document uses RFC 7405 to add case-sensitive literals to ABNF. 
diff --git a/extracted-files/core.abnf b/extracted-files/core.abnf
new file mode 100644
index 00000000..2de2347b
--- /dev/null
+++ b/extracted-files/core.abnf
@@ -0,0 +1,7 @@
+; Core Rules extracted from RFC 5234 Appendix B.1
+ALPHA = %x41-5A / %x61-7A ; A-Z / a-z
+;DIGIT = %x30-39 ; 0-9 (disabled; NOTE(review): the language-tag and media-type snippets use "digit", presumably defined in the main grammar - confirm)
+SP = %x20
+HTAB = %x09 ; horizontal tab
+DQUOTE = %x22 ; " (Double Quote)
+VCHAR = %x21-7E ; visible (printing) characters
diff --git a/extracted-files/languagetag.abnf b/extracted-files/languagetag.abnf
new file mode 100644
index 00000000..f9414116
--- /dev/null
+++ b/extracted-files/languagetag.abnf
@@ -0,0 +1,72 @@
+; ABNF derived from RFC 5646 section 2.1
+Language-Tag  = langtag             ; normal language tags
+              / privateuse          ; private use tag
+              / grandfathered       ; grandfathered tags
+langtag       = language
+                ["-" script]
+                ["-" region]
+                *("-" variant)
+                *("-" extension)
+                ["-" privateuse]
+
+language      = 2*3ALPHA            ; shortest ISO 639 code
+                ["-" extlang]       ; sometimes followed by
+                                    ; extended language subtags
+              / 4ALPHA              ; or reserved for future use
+              / 5*8ALPHA            ; or registered language subtag
+
+extlang       = 3ALPHA              ; selected ISO 639 codes
+                *2("-" 3ALPHA)      ; permanently reserved
+
+script        = 4ALPHA              ; ISO 15924 code
+
+region        = 2ALPHA              ; ISO 3166-1 code
+              / 3digit              ; UN M.49 code
+
+variant       = 5*8alphanum         ; registered variants
+              / (digit 3alphanum)
+
+extension     = singleton 1*("-" (2*8alphanum))
+
+                                    ; Single alphanumerics
+                                    ; "x" reserved for private use
+singleton     = digit               ; 0 - 9
+              / %x41-57             ; A - W
+              / %x59-5A             ; Y - Z
+              / %x61-77             ; a - w
+              / %x79-7A             ; y - z
+
+privateuse    = "x" 1*("-" (1*8alphanum))
+
+grandfathered = irregular           ; non-redundant tags registered
+              / regular             ; during the RFC 3066 era
+
+irregular     = "en-GB-oed"         ; irregular tags do not match
+              / "i-ami"             ; the 'langtag' production and
+              / "i-bnn"             ; would not otherwise be
+              / "i-default"         ; considered 'well-formed'
+              / "i-enochian"        ; These tags are all valid,
+              / "i-hak"             ; but most are deprecated
+              / "i-klingon"         ; in favor of more modern
+              / "i-lux"             ; subtags or subtag
+              / "i-mingo"           ; combination
+              / "i-navajo"
+              / "i-pwn"
+              / "i-tao"
+              / "i-tay"
+              / "i-tsu"
+              / "sgn-BE-FR"
+              / "sgn-BE-NL"
+              / "sgn-CH-DE"
+
+regular       = "art-lojban"        ; these tags match the 'langtag'
+              / "cel-gaulish"       ; production, but their subtags
+              / "no-bok"            ; are not extended language
+              / "no-nyn"            ; or variant subtags: their meaning
+              / "zh-guoyu"          ; is defined by their registration
+              / "zh-hakka"          ; and all of these are deprecated
+              / "zh-min"            ; in favor of a more modern
+              / "zh-min-nan"        ; subtag or sequence of subtags
+              / "zh-xiang"
+
+alphanum      = (ALPHA / digit)     ; letters and numbers
diff --git a/extracted-files/mediatype.abnf b/extracted-files/mediatype.abnf
new file mode 100644
index 00000000..d518e0da
--- /dev/null
+++ b/extracted-files/mediatype.abnf
@@ -0,0 +1,40 @@
+; ABNF derived from RFC 2045 section 5.1
+type = discrete-type / composite-type
+discrete-type = "text" / "image" / "audio" / "video" /
+                "application" / extension-token
+composite-type = "message" / "multipart" / extension-token
+extension-token = ietf-token / x-token
+ietf-token = type-name
+x-token = "x-" token
+subtype = extension-token / iana-token
+iana-token = subtype-name
+
+; ABNF derived from RFC 6838 section 4.2
+type-name = restricted-name
+subtype-name = restricted-name
+
+restricted-name = restricted-name-first *126restricted-name-chars ; 1 + up to 126 = at most 127 characters
+restricted-name-first = ALPHA / digit
+restricted-name-chars = ALPHA / digit / "!" / "#" /
+                        "$" / "&" / "-" / "^" / "_"
+restricted-name-chars =/ "." ; Characters before first dot always
+                             ; specify a facet name
+restricted-name-chars =/ "+" ; Characters after last plus always
+                             ; specify a structured syntax suffix
+
+; ABNF derived from RFC 9110 section 5.6
+parameters = *( OWS ";" OWS [ parameter ] )
+parameter = parameter-name "=" parameter-value
+parameter-name = token
+parameter-value = ( token / quoted-string )
+token = 1*tchar
+tchar = "!" / "#" / "$" / "%" / "&" / "'" / "*"
+      / "+" / "-" / "." / "^" / "_" / "`" / "|" / "~"
+      / digit / ALPHA
+      ; any VCHAR, except delimiters
+OWS = *( SP / HTAB )
+      ; optional whitespace
+quoted-string = DQUOTE *( qdtext / quoted-pair ) DQUOTE
+qdtext = HTAB / SP / %x21 / %x23-5B / %x5D-7E / obs-text
+obs-text = %x80-FF
+quoted-pair = "\" ( HTAB / SP / VCHAR / obs-text )