Fix HTML parsing / rendering of <del>

2022-02-20 17:29:10 +01:00 · 2022-02-20 17:29:10 +01:00 · 83b63ecac0
parent 391ef064b9
commit 83b63ecac0
2 changed files with 39 additions and 28 deletions
--- a/server.go
+++ b/server.go
@@ -339,7 +339,7 @@ func getStringValue(pageName string, v []interface{}, trim ...bool) (result []st
 	if len(trim) == 0 || trim[0] == true {
 		var newResults []string
 		for _, e := range result {
-			e = normalizeStringCharacters(e)
+			e = strings.TrimSpace(e)
 			if len(e) > 0 {
 				newResults = append(newResults, e)
 			}
@@ -360,7 +360,7 @@ func processIndexDirectory(filePath, indexPath, kind string, wg *sync.WaitGroup)
 		if path.Ext(e.Name()) == ".json" {
 			for _, v := range parseCategoryPageIndex(path.Join(indexPath, e.Name())) {
 				wg.Add(1)
-				go func(entry *albumEntry) {
+				func(entry *albumEntry) {
 					defer wg.Done()

 					contents, err := ioutil.ReadFile(path.Join(filePath, "pages", fmt.Sprintf("%d.wiki", entry.Id)))
--- a/wikiparser/parser.go
+++ b/wikiparser/parser.go
@@ -3,6 +3,7 @@ package wikiparser
 import (
 	"golang.org/x/text/unicode/norm"
 	"strings"
+	"unicode"
 )

 func NormalizeWikiTitle(title string) string {
@@ -21,31 +22,34 @@ type HTML struct {

 type HTMLTag struct {
 	Parent     *HTMLTag
-	Name       string
-	Parameters string
+	Name       []byte
+	Parameters []byte
 	Content    []*HTMLTag
 }

 func (t *HTMLTag) String() (r string) {
-	if t.Name == "#text" {
-		return t.Parameters
-	}
-	for _, c := range t.Content {
-		r += c.String()
+	if string(t.Name) == "#text" {
+		r = string(t.Parameters)
+	} else {
+		for _, c := range t.Content {
+			r += c.String()
+		}
 	}

-	if t.Name == "del" { //add strikethrough
-		var runes []rune
-		for _, r := range []rune(norm.NFD.String(r)) {
-			runes = append(runes, '\u0336') //combining long stroke overlay
-			runes = append(runes, r)
+	if string(t.Name) == "del" { //add strikethrough
+		var runeList []rune
+		for _, runeEntry := range []rune(norm.NFD.String(r)) {
+			if runeEntry <= unicode.MaxASCII {
+				runeList = append(runeList, '\u0336') //combining long stroke overlay
+			}
+			runeList = append(runeList, runeEntry)
 		}
-		r = norm.NFC.String(string(runes))
-	} else if t.Name == "ref" { //remove references
+		r = norm.NFC.String(string(runeList))
+	} else if string(t.Name) == "ref" { //remove references
 		return ""
-	} else if t.Name == "br" { //new line
+	} else if string(t.Name) == "br" { //new line
 		return "\n"
-	} else if t.Name == "script" {
+	} else if string(t.Name) == "script" {
 		return ""
 	}
 	return
@@ -106,7 +110,11 @@ func ParseHTML(text string, index int, depth int) (i int, html *HTML) {
 	for i = index; i < len(text); i++ {
 		c = text[i]

-		if c == '<' {
+		if c == '<' && i < len(text)-1 && text[i+1] == '/' {
+			isTerminating = true
+			readingTag = true
+			readingParameters = false
+		} else if c == '<' {
 			newTag := &HTMLTag{
 				Parent: tag,
 			}
@@ -128,30 +136,33 @@ func ParseHTML(text string, index int, depth int) (i int, html *HTML) {

 			if isTerminating {
 				tagDepth--
-				tag = tag.Parent
+				if tag != nil {
+					tag = tag.Parent
+				}
+				isTerminating = false
 			}
-			if tagDepth == 0 || tag.Parent == nil {
+			if tagDepth == 0 {
 				return i + 1, html
 			}
 		} else if readingTag && c == '/' {
 			isTerminating = true
-		} else if readingTag {
+		} else if !isTerminating && readingTag {
 			if c == ' ' {
 				readingParameters = true
 			}
 			if readingParameters {
-				tag.Parameters += string(c)
+				tag.Parameters = append(tag.Parameters, c)
 			} else {
-				tag.Name += string(c)
+				tag.Name = append(tag.Name, c)
 			}
-		} else if tagDepth > 0 {
-			if len(tag.Content) == 0 || tag.Content[len(tag.Content)-1].Name != "#text" {
+		} else if !isTerminating && tagDepth > 0 {
+			if len(tag.Content) == 0 || string(tag.Content[len(tag.Content)-1].Name) != "#text" {
 				tag.Content = append(tag.Content, &HTMLTag{
 					Parent: tag,
-					Name:   "#text",
+					Name:   []byte("#text"),
 				})
 			}
-			tag.Content[len(tag.Content)-1].Parameters += string(c)
+			tag.Content[len(tag.Content)-1].Parameters = append(tag.Content[len(tag.Content)-1].Parameters, c)
 		}
 	}