Fix HTML parsing / rendering of <del>

This commit is contained in:
DataHoarder 2022-02-20 17:29:10 +01:00
parent 391ef064b9
commit 83b63ecac0
2 changed files with 39 additions and 28 deletions

View file

@ -339,7 +339,7 @@ func getStringValue(pageName string, v []interface{}, trim ...bool) (result []st
if len(trim) == 0 || trim[0] == true {
var newResults []string
for _, e := range result {
e = normalizeStringCharacters(e)
e = strings.TrimSpace(e)
if len(e) > 0 {
newResults = append(newResults, e)
}
@ -360,7 +360,7 @@ func processIndexDirectory(filePath, indexPath, kind string, wg *sync.WaitGroup)
if path.Ext(e.Name()) == ".json" {
for _, v := range parseCategoryPageIndex(path.Join(indexPath, e.Name())) {
wg.Add(1)
go func(entry *albumEntry) {
func(entry *albumEntry) {
defer wg.Done()
contents, err := ioutil.ReadFile(path.Join(filePath, "pages", fmt.Sprintf("%d.wiki", entry.Id)))

View file

@ -3,6 +3,7 @@ package wikiparser
import (
"golang.org/x/text/unicode/norm"
"strings"
"unicode"
)
func NormalizeWikiTitle(title string) string {
@ -21,31 +22,34 @@ type HTML struct {
type HTMLTag struct {
Parent *HTMLTag
Name string
Parameters string
Name []byte
Parameters []byte
Content []*HTMLTag
}
func (t *HTMLTag) String() (r string) {
if t.Name == "#text" {
return t.Parameters
}
for _, c := range t.Content {
r += c.String()
if string(t.Name) == "#text" {
r = string(t.Parameters)
} else {
for _, c := range t.Content {
r += c.String()
}
}
if t.Name == "del" { //add strikethrough
var runes []rune
for _, r := range []rune(norm.NFD.String(r)) {
runes = append(runes, '\u0336') //combining long stroke overlay
runes = append(runes, r)
if string(t.Name) == "del" { //add strikethrough
var runeList []rune
for _, runeEntry := range []rune(norm.NFD.String(r)) {
if runeEntry <= unicode.MaxASCII {
runeList = append(runeList, '\u0336') //combining long stroke overlay
}
runeList = append(runeList, runeEntry)
}
r = norm.NFC.String(string(runes))
} else if t.Name == "ref" { //remove references
r = norm.NFC.String(string(runeList))
} else if string(t.Name) == "ref" { //remove references
return ""
} else if t.Name == "br" { //new line
} else if string(t.Name) == "br" { //new line
return "\n"
} else if t.Name == "script" {
} else if string(t.Name) == "script" {
return ""
}
return
@ -106,7 +110,11 @@ func ParseHTML(text string, index int, depth int) (i int, html *HTML) {
for i = index; i < len(text); i++ {
c = text[i]
if c == '<' {
if c == '<' && i < len(text)-1 && text[i+1] == '/' {
isTerminating = true
readingTag = true
readingParameters = false
} else if c == '<' {
newTag := &HTMLTag{
Parent: tag,
}
@ -128,30 +136,33 @@ func ParseHTML(text string, index int, depth int) (i int, html *HTML) {
if isTerminating {
tagDepth--
tag = tag.Parent
if tag != nil {
tag = tag.Parent
}
isTerminating = false
}
if tagDepth == 0 || tag.Parent == nil {
if tagDepth == 0 {
return i + 1, html
}
} else if readingTag && c == '/' {
isTerminating = true
} else if readingTag {
} else if !isTerminating && readingTag {
if c == ' ' {
readingParameters = true
}
if readingParameters {
tag.Parameters += string(c)
tag.Parameters = append(tag.Parameters, c)
} else {
tag.Name += string(c)
tag.Name = append(tag.Name, c)
}
} else if tagDepth > 0 {
if len(tag.Content) == 0 || tag.Content[len(tag.Content)-1].Name != "#text" {
} else if !isTerminating && tagDepth > 0 {
if len(tag.Content) == 0 || string(tag.Content[len(tag.Content)-1].Name) != "#text" {
tag.Content = append(tag.Content, &HTMLTag{
Parent: tag,
Name: "#text",
Name: []byte("#text"),
})
}
tag.Content[len(tag.Content)-1].Parameters += string(c)
tag.Content[len(tag.Content)-1].Parameters = append(tag.Content[len(tag.Content)-1].Parameters, c)
}
}