From 7abc00c539e06379148814ecba94d6defa0f958f Mon Sep 17 00:00:00 2001 From: Carlana Johnson Date: Fri, 17 May 2024 15:19:20 -0400 Subject: [PATCH] xhtml: Use rangefuncs --- internal/blocko/clean.go | 22 ++++++------ internal/blocko/html.go | 14 ++++---- internal/blocko/minify.go | 2 +- internal/iterx/iterx.go | 32 +++++++++++++++++ internal/mailchimp/import.go | 4 +-- internal/xhtml/children.go | 28 +++++++-------- internal/xhtml/children_test.go | 13 ++++--- internal/xhtml/iter.go | 64 ++++++++++++++++----------------- internal/xhtml/new_test.go | 16 ++++----- internal/xhtml/string.go | 5 +-- internal/xhtml/string_test.go | 2 +- internal/xhtml/table.go | 6 ++-- pkg/almanack/service-gdocs.go | 30 ++++++++-------- run.sh | 2 ++ 14 files changed, 137 insertions(+), 103 deletions(-) create mode 100644 internal/iterx/iterx.go diff --git a/internal/blocko/clean.go b/internal/blocko/clean.go index 2b8251ccd..91d73975c 100644 --- a/internal/blocko/clean.go +++ b/internal/blocko/clean.go @@ -19,7 +19,7 @@ func Clean(root *html.Node) { func MergeSiblings(root *html.Node) { // find all matches first - inlineSiblings := xhtml.FindAll(root, func(n *html.Node) bool { + inlineSiblings := xhtml.SelectSlice(root, func(n *html.Node) bool { brother := n.NextSibling return brother != nil && inlineElements[n.DataAtom] && @@ -38,7 +38,7 @@ func MergeSiblings(root *html.Node) { } func RemoveEmptyP(root *html.Node) { - emptyP := xhtml.FindAll(root, func(n *html.Node) bool { + emptyP := xhtml.SelectSlice(root, func(n *html.Node) bool { return n.DataAtom == atom.P && isEmpty(n) }) for _, n := range emptyP { @@ -47,7 +47,7 @@ func RemoveEmptyP(root *html.Node) { } func RemoveMarks(root *html.Node) { - marks := xhtml.FindAll(root, xhtml.WithAtom(atom.Mark)) + marks := xhtml.SelectSlice(root, xhtml.WithAtom(atom.Mark)) for _, mark := range marks { xhtml.UnnestChildren(mark) } @@ -62,9 +62,9 @@ var whitespaceReplacer = strings.NewReplacer( ) func replaceWhitespace(root *html.Node) { - 
xhtml.VisitAll(root, func(n *html.Node) { + for n := range xhtml.All(root) { if n.Type != html.TextNode { - return + continue } // Ignore children of pre/code codeblock := xhtml.Closest(n, func(n *html.Node) bool { @@ -75,7 +75,7 @@ func replaceWhitespace(root *html.Node) { if codeblock == nil { n.Data = whitespaceReplacer.Replace(n.Data) } - }) + } } var specialReplacer = strings.NewReplacer( @@ -92,21 +92,21 @@ var specialReplacer = strings.NewReplacer( ) func replaceSpecials(root *html.Node) { - xhtml.VisitAll(root, func(n *html.Node) { + for n := range xhtml.All(root) { if n.Type != html.TextNode { - return + continue } // Ignore children not of p codeblock := xhtml.Closest(n, xhtml.WithAtom(atom.P)) if codeblock == nil { - return + continue } n.Data = specialReplacer.Replace(n.Data) - }) + } } func fixBareLI(root *html.Node) { - bareLIs := xhtml.FindAll(root, func(n *html.Node) bool { + bareLIs := xhtml.SelectSlice(root, func(n *html.Node) bool { child := n.FirstChild return n.DataAtom == atom.Li && child != nil && (child.Type == html.TextNode || diff --git a/internal/blocko/html.go b/internal/blocko/html.go index 9de0936d9..a2af0a455 100644 --- a/internal/blocko/html.go +++ b/internal/blocko/html.go @@ -57,23 +57,23 @@ var inlineElements = map[atom.Atom]bool{ func isEmpty(n *html.Node) bool { root := n - n = xhtml.Find(n, func(n *html.Node) bool { + for n := range xhtml.All(n) { if n == root { - return false + continue } switch n.Type { case html.TextNode: s := strings.ReplaceAll(n.Data, "\n", " ") s = strings.TrimSpace(s) if s == "" { - return false + continue } case html.ElementNode: if inlineElements[n.DataAtom] { - return false + continue } } - return true - }) - return n == nil + return false + } + return true } diff --git a/internal/blocko/minify.go b/internal/blocko/minify.go index e45cecdd1..5883482e4 100644 --- a/internal/blocko/minify.go +++ b/internal/blocko/minify.go @@ -25,7 +25,7 @@ func Minify(r io.Reader) (*nethtml.Node, error) { if err != nil 
{ return nil, err } - body := xhtml.Find(doc, xhtml.WithBody) + body := xhtml.Select(doc, xhtml.WithBody) if body == nil { return nil, fmt.Errorf("could not find body") } diff --git a/internal/iterx/iterx.go b/internal/iterx/iterx.go new file mode 100644 index 000000000..13769bd66 --- /dev/null +++ b/internal/iterx/iterx.go @@ -0,0 +1,32 @@ +// Package iterx has iteration utilities. +package iterx + +import "iter" + +// Filter returns a sequence of matching items. +func Filter[T any](seq iter.Seq[T], match func(T) bool) iter.Seq[T] { + return func(yield func(T) bool) { + for v := range seq { + if match(v) && !yield(v) { + return + } + } + } +} + +// Collect returns a slice collected from a sequence. +func Collect[T any](seq iter.Seq[T]) []T { + var s []T + for v := range seq { + s = append(s, v) + } + return s +} + +// First returns the first item in a sequence or the zero value. +func First[T any](seq iter.Seq[T]) (v T) { + for v := range seq { + return v + } + return +} diff --git a/internal/mailchimp/import.go b/internal/mailchimp/import.go index bdfca3032..2370de971 100644 --- a/internal/mailchimp/import.go +++ b/internal/mailchimp/import.go @@ -29,13 +29,13 @@ func ImportPage(ctx context.Context, cl *http.Client, page string) (body string, } func PageContent(doc *html.Node) (body string, err error) { - bNode := xhtml.Find(doc, xhtml.WithBody) + bNode := xhtml.Select(doc, xhtml.WithBody) if bNode == nil { err = fmt.Errorf("could not find body element") return } - remove := xhtml.FindAll(bNode, func(n *html.Node) bool { + remove := xhtml.SelectSlice(bNode, func(n *html.Node) bool { return n.Type == html.CommentNode || n.DataAtom == atom.Style || n.DataAtom == atom.Script || diff --git a/internal/xhtml/children.go b/internal/xhtml/children.go index 1e8066f0b..f77269615 100644 --- a/internal/xhtml/children.go +++ b/internal/xhtml/children.go @@ -1,25 +1,25 @@ package xhtml import ( + "iter" "strings" + "github.com/spotlightpa/almanack/internal/iterx" 
"golang.org/x/net/html" ) -// Children returns a slice containing the children of n. -func Children(n *html.Node) []*html.Node { - if n == nil { - return nil - } - count := 0 - for c := n.FirstChild; c != nil; c = c.NextSibling { - count++ - } - s := make([]*html.Node, 0, count) - for c := n.FirstChild; c != nil; c = c.NextSibling { - s = append(s, c) +// Children returns a seq of the children of n. +func Children(n *html.Node) iter.Seq[*html.Node] { + return func(yield func(*html.Node) bool) { + if n == nil { + return + } + for c := n.FirstChild; c != nil; c = c.NextSibling { + if !yield(c) { + return + } + } } - return s } func ReplaceWith(old, new *html.Node) { @@ -79,7 +79,7 @@ func UnnestChildren(n *html.Node) { if n.Parent == nil { return } - children := Children(n) + children := iterx.Collect(Children(n)) RemoveAll(children) for _, c := range children { n.Parent.InsertBefore(c, n) diff --git a/internal/xhtml/children_test.go b/internal/xhtml/children_test.go index 421b759aa..490d1376f 100644 --- a/internal/xhtml/children_test.go +++ b/internal/xhtml/children_test.go @@ -32,37 +32,36 @@ func TestUnnestChildren(t *testing.T) { { clone := xhtml.Clone(n) - i := xhtml.Find(clone, xhtml.WithAtom(atom.I)) + i := xhtml.Select(clone, xhtml.WithAtom(atom.I)) xhtml.UnnestChildren(i) be.Equal(t, `test one two `, xhtml.InnerHTML(clone)) } { clone := xhtml.Clone(n) - em := xhtml.Find(clone, xhtml.WithAtom(atom.Em)) + em := xhtml.Select(clone, xhtml.WithAtom(atom.Em)) xhtml.UnnestChildren(em) be.Equal(t, `test one two `, xhtml.InnerHTML(clone)) } { clone := xhtml.Clone(n) - a := xhtml.Find(clone, xhtml.WithAtom(atom.A)) + a := xhtml.Select(clone, xhtml.WithAtom(atom.A)) xhtml.UnnestChildren(a) be.Equal(t, `test one two `, xhtml.InnerHTML(clone)) } { clone := xhtml.Clone(n) - b := xhtml.Find(clone, xhtml.WithAtom(atom.B)) + b := xhtml.Select(clone, xhtml.WithAtom(atom.B)) xhtml.UnnestChildren(b) be.Equal(t, `test one two `, xhtml.InnerHTML(clone)) } { clone := 
xhtml.Clone(n) - is := xhtml.FindAll(clone, xhtml.WithAtom(atom.I)) - for _, n := range is { - xhtml.UnnestChildren(n) + for _, c := range xhtml.SelectSlice(clone, xhtml.WithAtom(atom.I)) { + xhtml.UnnestChildren(c) } be.Equal(t, `test one two `, xhtml.InnerHTML(clone)) diff --git a/internal/xhtml/iter.go b/internal/xhtml/iter.go index 3a64b3ac0..f5a382183 100644 --- a/internal/xhtml/iter.go +++ b/internal/xhtml/iter.go @@ -1,7 +1,12 @@ +//go:build goexperiment.rangefunc + // Package xhtml makes x/net/html easier package xhtml import ( + "iter" + + "github.com/spotlightpa/almanack/internal/iterx" "golang.org/x/net/html" "golang.org/x/net/html/atom" ) @@ -27,47 +32,42 @@ func all(n *html.Node, yield func(*html.Node) bool) bool { return _continue } -// Find returns the first matching child node or nil. -func Find(n *html.Node, match func(*html.Node) bool) *html.Node { - var found *html.Node - all(n, func(n *html.Node) bool { - if match(n) { - found = n - return _break - } - return _continue - }) - return found +// All visits all child nodes in depth-first pre-order. +func All(n *html.Node) iter.Seq[*html.Node] { + return func(yield func(*html.Node) bool) { + all(n, yield) + } } -// VisitAll vists child nodes in depth-first pre-order. -func VisitAll(n *html.Node, callback func(*html.Node)) { - all(n, func(n *html.Node) bool { - callback(n) - return _continue - }) +// SelectAll returns an iterator yielding matching nodes. +func SelectAll(n *html.Node, match func(*html.Node) bool) iter.Seq[*html.Node] { + return iterx.Filter(All(n), match) } -// FindAll returns a slice of matching nodes. -func FindAll(root *html.Node, match func(*html.Node) bool) []*html.Node { - var found []*html.Node - VisitAll(root, func(n *html.Node) { - if match(n) { - found = append(found, n) +// SelectSlice returns a slice of child nodes matched by the selector.
+func SelectSlice(n *html.Node, match func(*html.Node) bool) []*html.Node { + return iterx.Collect(SelectAll(n, match)) +} + +// Select returns the first child node matched by the selector or nil. +func Select(n *html.Node, match func(*html.Node) bool) *html.Node { + return iterx.First(SelectAll(n, match)) +} + +// Parents returns an iterator traversing the node and its parents. +func Parents(n *html.Node) iter.Seq[*html.Node] { + return func(yield func(*html.Node) bool) { + for p := n; p != nil; p = p.Parent { + if !yield(p) { + return + } } - }) - return found + } } // Closest traverses the node and its parents until it finds a node that matches. func Closest(n *html.Node, match func(*html.Node) bool) *html.Node { - for n != nil { - if match(n) { - return n - } - n = n.Parent - } - return nil + return iterx.First(iterx.Filter(Parents(n), match)) } func WithAtom(a atom.Atom) func(n *html.Node) bool { diff --git a/internal/xhtml/new_test.go b/internal/xhtml/new_test.go index 7704f5024..beaf81890 100644 --- a/internal/xhtml/new_test.go +++ b/internal/xhtml/new_test.go @@ -20,28 +20,28 @@ func TestClone(t *testing.T) { n, err := html.Parse(strings.NewReader(tc)) be.NilErr(t, err) body := n.FirstChild.FirstChild.NextSibling - be.Equal(t, xhtml.Find(n, xhtml.WithAtom(atom.Body)), body) + be.Equal(t, xhtml.Select(n, xhtml.WithAtom(atom.Body)), body) s := xhtml.InnerHTML(body) be.Equal(be.Relaxed(t), tc, s) n2 := xhtml.Clone(n) body2 := n2.FirstChild.FirstChild.NextSibling - be.Equal(t, xhtml.Find(n2, xhtml.WithAtom(atom.Body)), body2) + be.Equal(t, xhtml.Select(n2, xhtml.WithAtom(atom.Body)), body2) be.Unequal(t, body, body2) s = xhtml.InnerHTML(body2) be.Equal(be.Relaxed(t), tc, s) m := map[*html.Node]bool{} - xhtml.VisitAll(n, func(n *html.Node) { - m[n] = true - }) + for c := range xhtml.All(n) { + m[c] = true + } - xhtml.VisitAll(n2, func(n *html.Node) { - if m[n] { + for c := range xhtml.All(n2) { + if m[c] { t.Error("duplicate node:", n) } - }) + } } } diff
--git a/internal/xhtml/string.go b/internal/xhtml/string.go index 1d183cb43..6baf2fdb9 100644 --- a/internal/xhtml/string.go +++ b/internal/xhtml/string.go @@ -60,10 +60,11 @@ func InnerText(n *html.Node) string { var buf strings.Builder buf.Grow(256) - VisitAll(n, func(n *html.Node) { + for n := range All(n) { if n.Type == html.TextNode { buf.WriteString(n.Data) } - }) + } + return strings.TrimSpace(buf.String()) } diff --git a/internal/xhtml/string_test.go b/internal/xhtml/string_test.go index 5518c57aa..afd8a3da0 100644 --- a/internal/xhtml/string_test.go +++ b/internal/xhtml/string_test.go @@ -22,7 +22,7 @@ func TestInnerText(t *testing.T) { } { doc, err := html.Parse(strings.NewReader(tc.input)) be.NilErr(t, err) - p := xhtml.Find(doc, xhtml.WithAtom(atom.P)) + p := xhtml.Select(doc, xhtml.WithAtom(atom.P)) got := xhtml.InnerText(p) be.Equal(t, tc.want, got) } diff --git a/internal/xhtml/table.go b/internal/xhtml/table.go index a2c94cc21..acc3cb613 100644 --- a/internal/xhtml/table.go +++ b/internal/xhtml/table.go @@ -9,12 +9,12 @@ import ( ) func Tables(root *html.Node, f func(tbl *html.Node, rows TableNodes)) { - tables := FindAll(root, WithAtom(atom.Table)) + tables := SelectSlice(root, WithAtom(atom.Table)) for _, tblNode := range tables { var tbl TableNodes - rows := FindAll(tblNode, WithAtom(atom.Tr)) + rows := SelectSlice(tblNode, WithAtom(atom.Tr)) for _, row := range rows { - tds := FindAll(row, func(n *html.Node) bool { + tds := SelectSlice(row, func(n *html.Node) bool { return n.DataAtom == atom.Td || n.DataAtom == atom.Th }) tbl = append(tbl, tds) diff --git a/pkg/almanack/service-gdocs.go b/pkg/almanack/service-gdocs.go index f95c5a33f..f7dc925d6 100644 --- a/pkg/almanack/service-gdocs.go +++ b/pkg/almanack/service-gdocs.go @@ -104,7 +104,7 @@ func (svc Services) ProcessGDocsDoc(ctx context.Context, dbDoc db.GDocsDoc) (err } docHTML := gdocs.Convert(&dbDoc.Document) - if n := xhtml.Find(docHTML, xhtml.WithAtom(atom.Data)); n != nil { + if n := 
xhtml.Select(docHTML, xhtml.WithAtom(atom.Data)); n != nil { return fmt.Errorf( "document unexpectedly contains element: %q", xhtml.OuterHTML(n), @@ -202,10 +202,10 @@ func (svc Services) ProcessGDocsDoc(ctx context.Context, dbDoc db.GDocsDoc) (err blocko.RemoveMarks(docHTML) // Warn about fake headings - xhtml.VisitAll(docHTML, func(n *html.Node) { + for n := range xhtml.All(docHTML) { //

with only b/i/strong/em for a child if n.DataAtom != atom.P { - return + continue } if n.FirstChild != nil && n.FirstChild == n.LastChild && @@ -221,10 +221,10 @@ func (svc Services) ProcessGDocsDoc(ctx context.Context, dbDoc db.GDocsDoc) (err "Paragraph beginning %q looks like a header, but does not use H-tag.", text) warnings = append(warnings, warning) } - }) + } // Warn about
- if n := xhtml.Find(docHTML, xhtml.WithAtom(atom.Br)); n != nil { + if n := xhtml.Select(docHTML, xhtml.WithAtom(atom.Br)); n != nil { warnings = append(warnings, "Document contains
line breaks. Are you sure you want to use a line break? In Google Docs, select View > Show non-printing characters to see them.") } @@ -305,7 +305,7 @@ func (svc Services) replaceImageEmbed( return imageEmbed, "" } - linkTag := xhtml.Find(tbl, xhtml.WithAtom(atom.A)) + linkTag := xhtml.Select(tbl, xhtml.WithAtom(atom.A)) if href := xhtml.Attr(linkTag, "href"); href != "" { path, err := svc.ReplaceAndUploadImageURL(ctx, href, imageEmbed.Description, imageEmbed.Credit) switch { @@ -323,7 +323,7 @@ func (svc Services) replaceImageEmbed( } } - image := xhtml.Find(tbl, xhtml.WithAtom(atom.Img)) + image := xhtml.Select(tbl, xhtml.WithAtom(atom.Img)) if image == nil { return nil, fmt.Sprintf( "Table %d missing image", n, @@ -453,7 +453,7 @@ func (svc Services) replaceMetadata( return "" } - linkTag := xhtml.Find(cell, xhtml.WithAtom(atom.A)) + linkTag := xhtml.Select(cell, xhtml.WithAtom(atom.A)) if href := xhtml.Attr(linkTag, "href"); href != "" { path, err := svc.ReplaceAndUploadImageURL(ctx, href, metadata.LedeImageDescription, metadata.LedeImageCredit) switch { @@ -470,7 +470,7 @@ func (svc Services) replaceMetadata( } } - image := xhtml.Find(tbl, xhtml.WithAtom(atom.Img)) + image := xhtml.Select(tbl, xhtml.WithAtom(atom.Img)) if image == nil { return "" } @@ -501,7 +501,7 @@ func (svc Services) replaceMetadata( } func fixRichTextPlaceholders(richText *html.Node) { - embeds := xhtml.FindAll(richText, xhtml.WithAtom(atom.Data)) + embeds := xhtml.SelectSlice(richText, xhtml.WithAtom(atom.Data)) for _, dataEl := range embeds { embed := extractEmbed(dataEl) if embed.Type == db.SpotlightRawEmbedTag { @@ -521,7 +521,7 @@ func extractEmbed(n *html.Node) db.Embed { } func fixRawHTMLPlaceholders(rawHTML *html.Node) { - embeds := xhtml.FindAll(rawHTML, xhtml.WithAtom(atom.Data)) + embeds := xhtml.SelectSlice(rawHTML, xhtml.WithAtom(atom.Data)) for _, dataEl := range embeds { embed := extractEmbed(dataEl) switch embed.Type { @@ -541,7 +541,7 @@ func 
fixRawHTMLPlaceholders(rawHTML *html.Node) { } func fixMarkdownPlaceholders(rawHTML *html.Node) { - embeds := xhtml.FindAll(rawHTML, xhtml.WithAtom(atom.Data)) + embeds := xhtml.SelectSlice(rawHTML, xhtml.WithAtom(atom.Data)) for _, dataEl := range embeds { embed := extractEmbed(dataEl) switch embed.Type { @@ -650,17 +650,17 @@ func processToc(doc *html.Node, rows xhtml.TableNodes) string { depth int } var headers []header - xhtml.VisitAll(doc, func(n *html.Node) { + for n := range xhtml.All(doc) { switch n.DataAtom { case atom.H1, atom.H2, atom.H3, atom.H4, atom.H5, atom.H6: default: - return + continue } id := fmt.Sprintf("spl-heading-%d", len(headers)+1) xhtml.SetAttr(n, "id", id) depth := int(n.Data[1] - '0') headers = append(headers, header{xhtml.InnerText(n), id, depth}) - }) + } container := xhtml.New("div") h3 := xhtml.New("h3") xhtml.AppendText(h3, cmp.Or( diff --git a/run.sh b/run.sh index 06db81841..39adc7d18 100755 --- a/run.sh +++ b/run.sh @@ -7,6 +7,8 @@ THIS_DIR=$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd) cd "$THIS_DIR" +export GOEXPERIMENT=rangefunc + function _default() { # shellcheck disable=SC2119 api