Skip to content

Commit

Permalink
fix(html): update how to truncate new line
Browse files Browse the repository at this point in the history
  • Loading branch information
huantt committed Aug 7, 2023
1 parent 3c8395e commit 4980a81
Show file tree
Hide file tree
Showing 2 changed files with 13 additions and 10 deletions.
14 changes: 7 additions & 7 deletions html/extractor.go
Original file line number Diff line number Diff line change
Expand Up @@ -2,10 +2,11 @@ package html

import (
"golang.org/x/net/html"
"regexp"
"strings"
)

// HTMLExtractor represents an HTML-specific plain text extractor.
// Extractor represents an HTML-specific plain text extractor.
type Extractor struct {
// blockTags is looked up by a node's tag name (DataAtom.String()); when the
// entry is true, extractText writes a "\n" after visiting that node's children.
// NOTE(review): how this map is populated is not visible here — confirm
// against the constructor.
blockTags map[string]bool
}
Expand Down Expand Up @@ -34,7 +35,7 @@ func (e *Extractor) PlainText(input string) (*string, error) {
e.extractText(&plainText, doc)

output := plainText.String()
output = strings.ReplaceAll(output, "\n ", "\n")
output = string(regexp.MustCompile("\n+\\s+").ReplaceAll([]byte(output), []byte("\n")))
return &output, nil
}

Expand All @@ -45,11 +46,7 @@ func (e *Extractor) extractText(plainText *strings.Builder, node *html.Node) {
text := strings.TrimSpace(node.Data)
if text != "" {
if plainText.Len() > 0 {
if found := e.blockTags[node.Parent.DataAtom.String()]; found {
plainText.WriteString("\n")
} else {
plainText.WriteString(" ")
}
plainText.WriteString(" ")
}
plainText.WriteString(text)
}
Expand All @@ -62,4 +59,7 @@ func (e *Extractor) extractText(plainText *strings.Builder, node *html.Node) {
for child := node.FirstChild; child != nil; child = child.NextSibling {
e.extractText(plainText, child)
}
if found := e.blockTags[node.DataAtom.String()]; found {
plainText.WriteString("\n")
}
}
9 changes: 6 additions & 3 deletions html/extractor_test.go
Original file line number Diff line number Diff line change
@@ -1,6 +1,7 @@
package html

import (
_ "embed"
"github.com/stretchr/testify/assert"
"testing"
)
Expand All @@ -12,10 +13,12 @@ func TestExtract(t *testing.T) {
expected string
}{
{`a<br>b`, "a\nb"},
{`a<br><h1>b</h1>`, "a\n\nb"},
{`a<br><h1>b</h1>`, "a\nb\n"},
{`<a href="https://example.com">link</a>`, "link"},
{`<div>This is a <a href="https://example.com">link</a></div>`, "This is a link"},
{"<div><h1>Heading 1</h1><h2>Heading 2</h2><ul><li>Item 1</li><li>Item 2</li></ul></div>", "Heading 1\nHeading 2\nItem 1\nItem 2"},
{`<div>This is a <a href="https://example.com">link</a></div>`, "This is a link\n"},
{"<div><h1>Heading 1</h1><h2>Heading 2</h2><ul><li>Item 1</li><li>Item 2</li></ul></div>", "Heading 1\nHeading 2\nItem 1\nItem 2\n"},
{"<p><span>a</span><span>b</span></p> c", "a b\nc"},
{"a\n \nb", "a\nb"},
}
for _, test := range tests {
output, err := extractor.PlainText(test.input)
Expand Down

0 comments on commit 4980a81

Please sign in to comment.