From 1750ccafc0f43e354cbe3eff6ef5487e92584119 Mon Sep 17 00:00:00 2001 From: Mukti Date: Sun, 30 Jun 2024 12:32:12 +0700 Subject: [PATCH] fix: drop incorrect namespace handling (#15) --- README.md | 2 +- docs/USAGE.md | 4 +- internal/gpx/schema/extensions.go | 2 +- internal/gpx/schema/gpx.go | 2 +- internal/gpx/schema/metadata.go | 6 +-- internal/gpx/schema/track.go | 6 +-- internal/main.go | 4 +- internal/xlsx/schema/sheet.go | 6 +-- token.go | 43 +++++++---------- token_test.go | 20 ++++---- tokenizer.go | 21 ++++---- tokenizer_test.go | 79 +++++++++++++++++++++---------- 12 files changed, 112 insertions(+), 83 deletions(-) diff --git a/README.md b/README.md index 05414fa..7c12bcc 100644 --- a/README.md +++ b/README.md @@ -5,7 +5,7 @@ [![CodeCov](https://codecov.io/gh/muktihari/xmltokenizer/branch/master/graph/badge.svg)](https://codecov.io/gh/muktihari/xmltokenizer) [![Go Report Card](https://goreportcard.com/badge/github.com/muktihari/xmltokenizer)](https://goreportcard.com/report/github.com/muktihari/xmltokenizer) -XML Tokenizer is a low-memory high performance library for parsing simple XML 1.0. This is an alternative option to the standard library's xml when speed is your main concern. This may not cover all XML files, but it can cover typical XML files. +XML Tokenizer is a low-memory, high-performance library for parsing simple XML 1.0 without namespace handling. This is an alternative option to the standard library's xml when speed is your main concern and you are willing to sacrifice certain features, such as namespace handling, in favor of speed ([discussion](https://www.reddit.com/r/golang/comments/1drdji3/xml_tokenizer_thats_4x_faster_than_stdlibs_xml/?utm_source=share&utm_medium=web3x&utm_name=web3xcss&utm_term=1&utm_content=share_button)). This may not cover all XML files, but it can cover typical XML files. 
# Motivation diff --git a/docs/USAGE.md b/docs/USAGE.md index fb9f7cf..424f88d 100644 --- a/docs/USAGE.md +++ b/docs/USAGE.md @@ -99,7 +99,7 @@ func (r *Row) UnmarshalToken(tok *xmltokenizer.Tokenizer, se *xmltokenizer.Token if token.IsEndElementOf(se) { // Reach desired EndElement return nil } - if token.IsEndElement() { // Ignore child's EndElements + if token.IsEndElement { // Ignore child's EndElements continue } switch string(token.Name.Local) { @@ -145,7 +145,7 @@ func (c *Cell) UnmarshalToken(tok *xmltokenizer.Tokenizer, se *xmltokenizer.Toke if token.IsEndElementOf(se) { // Reach desired EndElement return nil } - if token.IsEndElement() { // Ignore child's EndElements + if token.IsEndElement { // Ignore child's EndElements continue } switch string(token.Name.Local) { diff --git a/internal/gpx/schema/extensions.go b/internal/gpx/schema/extensions.go index c3e070a..f679038 100644 --- a/internal/gpx/schema/extensions.go +++ b/internal/gpx/schema/extensions.go @@ -38,7 +38,7 @@ func (t *TrackpointExtension) UnmarshalToken(tok *xmltokenizer.Tokenizer, se *xm if token.IsEndElementOf(se) { return nil } - if token.IsEndElement() { + if token.IsEndElement { continue } diff --git a/internal/gpx/schema/gpx.go b/internal/gpx/schema/gpx.go index f67fe8a..fcd1590 100644 --- a/internal/gpx/schema/gpx.go +++ b/internal/gpx/schema/gpx.go @@ -35,7 +35,7 @@ func (g *GPX) UnmarshalToken(tok *xmltokenizer.Tokenizer, se *xmltokenizer.Token if token.IsEndElementOf(se) { return nil } - if token.IsEndElement() { + if token.IsEndElement { continue } diff --git a/internal/gpx/schema/metadata.go b/internal/gpx/schema/metadata.go index d91af34..aa061a6 100644 --- a/internal/gpx/schema/metadata.go +++ b/internal/gpx/schema/metadata.go @@ -27,7 +27,7 @@ func (m *Metadata) UnmarshalToken(tok *xmltokenizer.Tokenizer, se *xmltokenizer. 
if token.IsEndElementOf(se) { return nil } - if token.IsEndElement() { + if token.IsEndElement { continue } @@ -123,7 +123,7 @@ func (a *Author) UnmarshalToken(tok *xmltokenizer.Tokenizer, se *xmltokenizer.To if token.IsEndElementOf(se) { return nil } - if token.IsEndElement() { + if token.IsEndElement { continue } @@ -199,7 +199,7 @@ func (a *Link) UnmarshalToken(tok *xmltokenizer.Tokenizer, se *xmltokenizer.Toke if token.IsEndElementOf(se) { return nil } - if token.IsEndElement() { + if token.IsEndElement { continue } diff --git a/internal/gpx/schema/track.go b/internal/gpx/schema/track.go index 0d0fdce..1f82109 100644 --- a/internal/gpx/schema/track.go +++ b/internal/gpx/schema/track.go @@ -26,7 +26,7 @@ func (t *Track) UnmarshalToken(tok *xmltokenizer.Tokenizer, se *xmltokenizer.Tok if token.IsEndElementOf(se) { return nil } - if token.IsEndElement() { + if token.IsEndElement { continue } @@ -98,7 +98,7 @@ func (t *TrackSegment) UnmarshalToken(tok *xmltokenizer.Tokenizer, se *xmltokeni if token.IsEndElementOf(se) { return nil } - if token.IsEndElement() { + if token.IsEndElement { continue } @@ -186,7 +186,7 @@ func (w *Waypoint) UnmarshalToken(tok *xmltokenizer.Tokenizer, se *xmltokenizer. 
if token.IsEndElementOf(se) { return nil } - if token.IsEndElement() { + if token.IsEndElement { continue } diff --git a/internal/main.go b/internal/main.go index d18a256..06897bb 100644 --- a/internal/main.go +++ b/internal/main.go @@ -77,7 +77,7 @@ func (r *Row) UnmarshalToken(tok *xmltokenizer.Tokenizer, se *xmltokenizer.Token if token.IsEndElementOf(se) { // Reach desired EndElement return nil } - if token.IsEndElement() { // Ignore child's EndElements + if token.IsEndElement { // Ignore child's EndElements continue } switch string(token.Name.Local) { @@ -123,7 +123,7 @@ func (c *Cell) UnmarshalToken(tok *xmltokenizer.Tokenizer, se *xmltokenizer.Toke if token.IsEndElementOf(se) { // Reach desired EndElement return nil } - if token.IsEndElement() { // Ignore child's EndElements + if token.IsEndElement { // Ignore child's EndElements continue } switch string(token.Name.Local) { diff --git a/internal/xlsx/schema/sheet.go b/internal/xlsx/schema/sheet.go index 0ebf88a..d9da549 100644 --- a/internal/xlsx/schema/sheet.go +++ b/internal/xlsx/schema/sheet.go @@ -21,7 +21,7 @@ func (s *SheetData) UnmarshalToken(tok *xmltokenizer.Tokenizer, se *xmltokenizer if token.IsEndElementOf(se) { break } - if token.IsEndElement() { + if token.IsEndElement { continue } @@ -67,7 +67,7 @@ func (r *Row) UnmarshalToken(tok *xmltokenizer.Tokenizer, se *xmltokenizer.Token if token.IsEndElementOf(se) { break } - if token.IsEndElement() { + if token.IsEndElement { continue } @@ -127,7 +127,7 @@ func (c *Cell) UnmarshalToken(tok *xmltokenizer.Tokenizer, se *xmltokenizer.Toke if token.IsEndElementOf(se) { break } - if token.IsEndElement() { + if token.IsEndElement { continue } diff --git a/token.go b/token.go index a72b4d8..caa6508 100644 --- a/token.go +++ b/token.go @@ -22,29 +22,21 @@ func PutToken(t *Token) { pool.Put(t) } // // // ]> +// +// Token includes CharData or CDATA in Data field when it appears right after the start element. 
type Token struct { - Name Name // Name can be a StartElement: "name", a EndElement: "/name" or empty when a tag starts with " 0. - Data []byte // Data could be a CharData or a CDATA, or maybe a RawToken if a tag starts with "" e.g. . Also true when a tag starts with " -func (t *Token) IsEndElement() bool { - if len(t.Name.Full) > 0 && t.Name.Full[0] == '/' { - return true - } - return false + Name Name // Name is an XML name, empty when a tag starts with " 0. + Data []byte // Data could be a CharData or a CDATA, or maybe a RawToken if a tag starts with "" e.g. . Also true when a tag starts with " or . } // IsEndElementOf checks whether the given token represent a -// n end element (closing tag) of given startElement. -func (t *Token) IsEndElementOf(t2 *Token) bool { - if !t.IsEndElement() { - return false - } - if string(t.Name.Full[1:]) == string(t2.Name.Full) { +// n end element (closing tag) of given StartElement. +func (t *Token) IsEndElementOf(se *Token) bool { + if t.IsEndElement && + string(t.Name.Full) == string(se.Name.Full) { return true } return false @@ -53,12 +45,13 @@ func (t *Token) IsEndElementOf(t2 *Token) bool { // Copy copies src Token into t, returning t. Attrs should be // consumed immediately since it's only being shallow copied. func (t *Token) Copy(src Token) *Token { - t.Name.Space = append(t.Name.Space[:0], src.Name.Space...) + t.Name.Prefix = append(t.Name.Prefix[:0], src.Name.Prefix...) t.Name.Local = append(t.Name.Local[:0], src.Name.Local...) t.Name.Full = append(t.Name.Full[:0], src.Name.Full...) t.Attrs = append(t.Attrs[:0], src.Attrs...) // shallow copy t.Data = append(t.Data[:0], src.Data...) t.SelfClosing = src.SelfClosing + t.IsEndElement = src.IsEndElement return t } @@ -68,10 +61,10 @@ type Attr struct { Value []byte } -// Name represents an XML name (Local) annotated -// with a name space identifier (Space). +// Name represents an XML name , +// we don't manage the bookkeeping of namespaces. 
type Name struct { - Space []byte - Local []byte - Full []byte // Full is combination of "space:local" + Prefix []byte + Local []byte + Full []byte // Full is combination of "prefix:local" } diff --git a/token_test.go b/token_test.go index 782f539..82d9ec7 100644 --- a/token_test.go +++ b/token_test.go @@ -27,8 +27,9 @@ func TestIsEndElement(t *testing.T) { name: "an end element", token: xmltokenizer.Token{ Name: xmltokenizer.Name{ - Full: []byte("/worksheet"), + Full: []byte("worksheet"), }, + IsEndElement: true, }, expected: true, }, @@ -54,7 +55,7 @@ func TestIsEndElement(t *testing.T) { for _, tc := range tt { t.Run(tc.name, func(t *testing.T) { - if r := tc.token.IsEndElement(); r != tc.expected { + if r := tc.token.IsEndElement; r != tc.expected { t.Fatalf("expected: %t, got: %t", tc.expected, r) } }) @@ -71,8 +72,9 @@ func TestIsEndElementOf(t *testing.T) { name: "correct end element", t1: xmltokenizer.Token{ Name: xmltokenizer.Name{ - Full: []byte("/worksheet"), + Full: []byte("worksheet"), }, + IsEndElement: true, }, t2: xmltokenizer.Token{ Name: xmltokenizer.Name{ @@ -123,15 +125,15 @@ func TestIsEndElementOf(t *testing.T) { func TestCopy(t *testing.T) { t1 := xmltokenizer.Token{ Name: xmltokenizer.Name{ - Space: []byte("gpxtpx"), - Local: []byte("hr"), - Full: []byte("gpxtpx:hr"), + Prefix: []byte("gpxtpx"), + Local: []byte("hr"), + Full: []byte("gpxtpx:hr"), }, Attrs: []xmltokenizer.Attr{{ Name: xmltokenizer.Name{ - Space: nil, - Local: []byte("units"), - Full: []byte("units"), + Prefix: nil, + Local: []byte("units"), + Full: []byte("units"), }, Value: []byte("bpm"), }}, diff --git a/tokenizer.go b/tokenizer.go index 1f9c715..913da13 100644 --- a/tokenizer.go +++ b/tokenizer.go @@ -17,7 +17,7 @@ const ( const ( defaultReadBufferSize = 4 << 10 autoGrowBufferMaxLimitSize = 1000 << 10 - defaultAttrsBufferSize = 8 + defaultAttrsBufferSize = 16 ) // Tokenizer is a XML tokenizer. 
@@ -283,12 +283,13 @@ func (t *Tokenizer) manageBuffer() error { } func (t *Tokenizer) clearToken() { - t.token.Name.Space = nil + t.token.Name.Prefix = nil t.token.Name.Local = nil t.token.Name.Full = nil t.token.Attrs = t.token.Attrs[:0] t.token.Data = nil t.token.SelfClosing = false + t.token.IsEndElement = false } // consumeNonTagIdentifier consumes identifier starts with "', ' ': // e.g. , if b[i] == '>' && b[i-1] == '/' { // In case we encounter @@ -324,14 +329,14 @@ func (t *Tokenizer) consumeTagName(b []byte) []byte { } func (t *Tokenizer) consumeAttrs(b []byte) []byte { - var space, local, full []byte + var prefix, local, full []byte var pos, fullpos int var inquote bool for i := range b { switch b[i] { case ':': if !inquote { - space = trim(b[pos:i]) + prefix = trim(b[pos:i]) pos = i + 1 } case '=': @@ -345,10 +350,10 @@ func (t *Tokenizer) consumeAttrs(b []byte) []byte { continue } t.token.Attrs = append(t.token.Attrs, Attr{ - Name: Name{Space: space, Local: local, Full: full}, + Name: Name{Prefix: prefix, Local: local, Full: full}, Value: trim(b[pos+1 : i]), }) - space, local, full = nil, nil, nil + prefix, local, full = nil, nil, nil pos = i + 1 fullpos = i + 1 } diff --git a/tokenizer_test.go b/tokenizer_test.go index d5f5253..a74b939 100644 --- a/tokenizer_test.go +++ b/tokenizer_test.go @@ -59,9 +59,9 @@ func TestTokenWithInmemXML(t *testing.T) { { Name: xmltokenizer.Name{Local: []byte("body"), Full: []byte("body")}, Attrs: []xmltokenizer.Attr{ - {Name: xmltokenizer.Name{Space: []byte("xmlns"), Local: []byte("foo"), Full: []byte("xmlns:foo")}, Value: []byte("ns1")}, + {Name: xmltokenizer.Name{Prefix: []byte("xmlns"), Local: []byte("foo"), Full: []byte("xmlns:foo")}, Value: []byte("ns1")}, {Name: xmltokenizer.Name{Local: []byte("xmlns"), Full: []byte("xmlns")}, Value: []byte("ns2")}, - {Name: xmltokenizer.Name{Space: []byte("xmlns"), Local: []byte("tag"), Full: []byte("xmlns:tag")}, Value: []byte("ns3")}, + {Name: xmltokenizer.Name{Prefix: 
[]byte("xmlns"), Local: []byte("tag"), Full: []byte("xmlns:tag")}, Value: []byte("ns3")}, }, }, { @@ -72,14 +72,16 @@ func TestTokenWithInmemXML(t *testing.T) { Data: []byte("World <>'" 白鵬翔"), }, { - Name: xmltokenizer.Name{Local: []byte("/hello"), Full: []byte("/hello")}, + Name: xmltokenizer.Name{Local: []byte("hello"), Full: []byte("hello")}, + IsEndElement: true, }, { Name: xmltokenizer.Name{Local: []byte("query"), Full: []byte("query")}, Data: []byte("&何; &is-it;"), }, { - Name: xmltokenizer.Name{Local: []byte("/query"), Full: []byte("/query")}, + Name: xmltokenizer.Name{Local: []byte("query"), Full: []byte("query")}, + IsEndElement: true, }, { Name: xmltokenizer.Name{Local: []byte("goodbye"), Full: []byte("goodbye")}, @@ -88,8 +90,8 @@ func TestTokenWithInmemXML(t *testing.T) { { Name: xmltokenizer.Name{Local: []byte("outer"), Full: []byte("outer")}, Attrs: []xmltokenizer.Attr{ - {Name: xmltokenizer.Name{Space: []byte("foo"), Local: []byte("attr"), Full: []byte("foo:attr")}, Value: []byte("value")}, - {Name: xmltokenizer.Name{Space: []byte("xmlns"), Local: []byte("tag"), Full: []byte("xmlns:tag")}, Value: []byte("ns4")}, + {Name: xmltokenizer.Name{Prefix: []byte("foo"), Local: []byte("attr"), Full: []byte("foo:attr")}, Value: []byte("value")}, + {Name: xmltokenizer.Name{Prefix: []byte("xmlns"), Local: []byte("tag"), Full: []byte("xmlns:tag")}, Value: []byte("ns4")}, }, }, { @@ -97,17 +99,20 @@ func TestTokenWithInmemXML(t *testing.T) { SelfClosing: true, }, { - Name: xmltokenizer.Name{Local: []byte("/outer"), Full: []byte("/outer")}, + Name: xmltokenizer.Name{Local: []byte("outer"), Full: []byte("outer")}, + IsEndElement: true, }, { - Name: xmltokenizer.Name{Space: []byte("tag"), Local: []byte("name"), Full: []byte("tag:name")}, + Name: xmltokenizer.Name{Prefix: []byte("tag"), Local: []byte("name"), Full: []byte("tag:name")}, Data: []byte("Some text here."), }, { - Name: xmltokenizer.Name{Space: []byte("/tag"), Local: []byte("name"), Full: 
[]byte("/tag:name")}, + Name: xmltokenizer.Name{Prefix: []byte("tag"), Local: []byte("name"), Full: []byte("tag:name")}, + IsEndElement: true, }, { - Name: xmltokenizer.Name{Local: []byte("/body"), Full: []byte("/body")}, + Name: xmltokenizer.Name{Local: []byte("body"), Full: []byte("body")}, + IsEndElement: true, }, { Data: []byte(""), @@ -135,7 +140,7 @@ func TestTokenWithInmemXML(t *testing.T) { SelfClosing: true, }, {Name: xmltokenizer.Name{Local: []byte("a"), Full: []byte("a")}}, - {Name: xmltokenizer.Name{Local: []byte("/a"), Full: []byte("/a")}}, + {Name: xmltokenizer.Name{Local: []byte("a"), Full: []byte("a")}, IsEndElement: true}, }, }, } @@ -179,18 +184,30 @@ func TestTokenWithSmallXMLFiles(t *testing.T) { Name: xmltokenizer.Name{Local: []byte("data"), Full: []byte("data")}, Data: []byte("text"), }, - {Name: xmltokenizer.Name{Local: []byte("/data"), Full: []byte("/data")}}, + { + Name: xmltokenizer.Name{Local: []byte("data"), Full: []byte("data")}, + IsEndElement: true, + }, { Name: xmltokenizer.Name{Local: []byte("data"), Full: []byte("data")}, Data: []byte("text"), }, - {Name: xmltokenizer.Name{Local: []byte("/data"), Full: []byte("/data")}}, + { + Name: xmltokenizer.Name{Local: []byte("data"), Full: []byte("data")}, + IsEndElement: true, + }, { Name: xmltokenizer.Name{Local: []byte("data"), Full: []byte("data")}, Data: []byte("text"), }, - {Name: xmltokenizer.Name{Local: []byte("/data"), Full: []byte("/data")}}, - {Name: xmltokenizer.Name{Local: []byte("/content"), Full: []byte("/content")}}, + { + Name: xmltokenizer.Name{Local: []byte("data"), Full: []byte("data")}, + IsEndElement: true, + }, + { + Name: xmltokenizer.Name{Local: []byte("content"), Full: []byte("content")}, + IsEndElement: true, + }, }}, {filename: "cdata_clrf.xml", expecteds: []xmltokenizer.Token{ tokenHeader, @@ -199,18 +216,30 @@ func TestTokenWithSmallXMLFiles(t *testing.T) { Name: xmltokenizer.Name{Local: []byte("data"), Full: []byte("data")}, Data: []byte("text"), }, - {Name: 
xmltokenizer.Name{Local: []byte("/data"), Full: []byte("/data")}}, + { + Name: xmltokenizer.Name{Local: []byte("data"), Full: []byte("data")}, + IsEndElement: true, + }, { Name: xmltokenizer.Name{Local: []byte("data"), Full: []byte("data")}, Data: []byte("text"), }, - {Name: xmltokenizer.Name{Local: []byte("/data"), Full: []byte("/data")}}, + { + Name: xmltokenizer.Name{Local: []byte("data"), Full: []byte("data")}, + IsEndElement: true, + }, { Name: xmltokenizer.Name{Local: []byte("data"), Full: []byte("data")}, Data: []byte("text"), }, - {Name: xmltokenizer.Name{Local: []byte("/data"), Full: []byte("/data")}}, - {Name: xmltokenizer.Name{Local: []byte("/content"), Full: []byte("/content")}}, + { + Name: xmltokenizer.Name{Local: []byte("data"), Full: []byte("data")}, + IsEndElement: true, + }, + { + Name: xmltokenizer.Name{Local: []byte("content"), Full: []byte("content")}, + IsEndElement: true, + }, }}, {filename: filepath.Join("corrupted", "cdata_truncated.xml"), expecteds: []xmltokenizer.Token{ tokenHeader, @@ -242,16 +271,16 @@ func TestTokenWithSmallXMLFiles(t *testing.T) { }, {Name: xmltokenizer.Name{Local: []byte("note"), Full: []byte("note")}}, {Name: xmltokenizer.Name{Local: []byte("to"), Full: []byte("to")}, Data: []byte("Tove")}, - {Name: xmltokenizer.Name{Local: []byte("/to"), Full: []byte("/to")}}, + {Name: xmltokenizer.Name{Local: []byte("to"), Full: []byte("to")}, IsEndElement: true}, {Name: xmltokenizer.Name{Local: []byte("from"), Full: []byte("from")}, Data: []byte("Jani")}, - {Name: xmltokenizer.Name{Local: []byte("/from"), Full: []byte("/from")}}, + {Name: xmltokenizer.Name{Local: []byte("from"), Full: []byte("from")}, IsEndElement: true}, {Name: xmltokenizer.Name{Local: []byte("heading"), Full: []byte("heading")}, Data: []byte("Reminder")}, - {Name: xmltokenizer.Name{Local: []byte("/heading"), Full: []byte("/heading")}}, + {Name: xmltokenizer.Name{Local: []byte("heading"), Full: []byte("heading")}, IsEndElement: true}, {Name: 
xmltokenizer.Name{Local: []byte("body"), Full: []byte("body")}, Data: []byte("Don't forget me this weekend!")}, - {Name: xmltokenizer.Name{Local: []byte("/body"), Full: []byte("/body")}}, + {Name: xmltokenizer.Name{Local: []byte("body"), Full: []byte("body")}, IsEndElement: true}, {Name: xmltokenizer.Name{Local: []byte("footer"), Full: []byte("footer")}, Data: []byte("&writer; ©right;")}, - {Name: xmltokenizer.Name{Local: []byte("/footer"), Full: []byte("/footer")}}, - {Name: xmltokenizer.Name{Local: []byte("/note"), Full: []byte("/note")}}, + {Name: xmltokenizer.Name{Local: []byte("footer"), Full: []byte("footer")}, IsEndElement: true}, + {Name: xmltokenizer.Name{Local: []byte("note"), Full: []byte("note")}, IsEndElement: true}, }}, }