fix(html): update how newlines are collapsed

huantt · Aug 7, 2023 · 4980a81 · 4980a81
1 parent 3c8395e
commit 4980a81
Show file tree

Hide file tree

Showing 2 changed files with 13 additions and 10 deletions.
diff --git a/html/extractor.go b/html/extractor.go
@@ -2,10 +2,11 @@ package html
 
 import (
 	"golang.org/x/net/html"
+	"regexp"
 	"strings"
 )
 
-// HTMLExtractor represents an HTML-specific plain text extractor.
+// Extractor represents an HTML-specific plain text extractor.
 type Extractor struct {
 	blockTags map[string]bool
 }
@@ -34,7 +35,7 @@ func (e *Extractor) PlainText(input string) (*string, error) {
 	e.extractText(&plainText, doc)
 
 	output := plainText.String()
-	output = strings.ReplaceAll(output, "\n ", "\n")
+	output = string(regexp.MustCompile("\n+\\s+").ReplaceAll([]byte(output), []byte("\n")))
 	return &output, nil
 }
 
@@ -45,11 +46,7 @@ func (e *Extractor) extractText(plainText *strings.Builder, node *html.Node) {
 		text := strings.TrimSpace(node.Data)
 		if text != "" {
 			if plainText.Len() > 0 {
-				if found := e.blockTags[node.Parent.DataAtom.String()]; found {
-					plainText.WriteString("\n")
-				} else {
-					plainText.WriteString(" ")
-				}
+				plainText.WriteString(" ")
 			}
 			plainText.WriteString(text)
 		}
@@ -62,4 +59,7 @@ func (e *Extractor) extractText(plainText *strings.Builder, node *html.Node) {
 	for child := node.FirstChild; child != nil; child = child.NextSibling {
 		e.extractText(plainText, child)
 	}
+	if found := e.blockTags[node.DataAtom.String()]; found {
+		plainText.WriteString("\n")
+	}
 }
diff --git a/html/extractor_test.go b/html/extractor_test.go
@@ -1,6 +1,7 @@
 package html
 
 import (
+	_ "embed"
 	"github.com/stretchr/testify/assert"
 	"testing"
 )
@@ -12,10 +13,12 @@ func TestExtract(t *testing.T) {
 		expected string
 	}{
 		{`a<br>b`, "a\nb"},
-		{`a<br><h1>b</h1>`, "a\n\nb"},
+		{`a<br><h1>b</h1>`, "a\nb\n"},
 		{`<a href="https://example.com">link</a>`, "link"},
-		{`<div>This is a <a href="https://example.com">link</a></div>`, "This is a link"},
-		{"<div><h1>Heading 1</h1><h2>Heading 2</h2><ul><li>Item 1</li><li>Item 2</li></ul></div>", "Heading 1\nHeading 2\nItem 1\nItem 2"},
+		{`<div>This is a <a href="https://example.com">link</a></div>`, "This is a link\n"},
+		{"<div><h1>Heading 1</h1><h2>Heading 2</h2><ul><li>Item 1</li><li>Item 2</li></ul></div>", "Heading 1\nHeading 2\nItem 1\nItem 2\n"},
+		{"<p><span>a</span><span>b</span></p> c", "a b\nc"},
+		{"a\n \nb", "a\nb"},
 	}
 	for _, test := range tests {
 		output, err := extractor.PlainText(test.input)