Fix HTML parsing and rendering of <del> tags
This commit is contained in:
parent
391ef064b9
commit
83b63ecac0
|
@ -339,7 +339,7 @@ func getStringValue(pageName string, v []interface{}, trim ...bool) (result []st
|
|||
if len(trim) == 0 || trim[0] == true {
|
||||
var newResults []string
|
||||
for _, e := range result {
|
||||
e = normalizeStringCharacters(e)
|
||||
e = strings.TrimSpace(e)
|
||||
if len(e) > 0 {
|
||||
newResults = append(newResults, e)
|
||||
}
|
||||
|
@ -360,7 +360,7 @@ func processIndexDirectory(filePath, indexPath, kind string, wg *sync.WaitGroup)
|
|||
if path.Ext(e.Name()) == ".json" {
|
||||
for _, v := range parseCategoryPageIndex(path.Join(indexPath, e.Name())) {
|
||||
wg.Add(1)
|
||||
go func(entry *albumEntry) {
|
||||
func(entry *albumEntry) {
|
||||
defer wg.Done()
|
||||
|
||||
contents, err := ioutil.ReadFile(path.Join(filePath, "pages", fmt.Sprintf("%d.wiki", entry.Id)))
|
||||
|
|
|
@ -3,6 +3,7 @@ package wikiparser
|
|||
import (
|
||||
"golang.org/x/text/unicode/norm"
|
||||
"strings"
|
||||
"unicode"
|
||||
)
|
||||
|
||||
func NormalizeWikiTitle(title string) string {
|
||||
|
@ -21,31 +22,34 @@ type HTML struct {
|
|||
|
||||
type HTMLTag struct {
|
||||
Parent *HTMLTag
|
||||
Name string
|
||||
Parameters string
|
||||
Name []byte
|
||||
Parameters []byte
|
||||
Content []*HTMLTag
|
||||
}
|
||||
|
||||
func (t *HTMLTag) String() (r string) {
|
||||
if t.Name == "#text" {
|
||||
return t.Parameters
|
||||
}
|
||||
for _, c := range t.Content {
|
||||
r += c.String()
|
||||
if string(t.Name) == "#text" {
|
||||
r = string(t.Parameters)
|
||||
} else {
|
||||
for _, c := range t.Content {
|
||||
r += c.String()
|
||||
}
|
||||
}
|
||||
|
||||
if t.Name == "del" { //add strikethrough
|
||||
var runes []rune
|
||||
for _, r := range []rune(norm.NFD.String(r)) {
|
||||
runes = append(runes, '\u0336') //combining long stroke overlay
|
||||
runes = append(runes, r)
|
||||
if string(t.Name) == "del" { //add strikethrough
|
||||
var runeList []rune
|
||||
for _, runeEntry := range []rune(norm.NFD.String(r)) {
|
||||
if runeEntry <= unicode.MaxASCII {
|
||||
runeList = append(runeList, '\u0336') //combining long stroke overlay
|
||||
}
|
||||
runeList = append(runeList, runeEntry)
|
||||
}
|
||||
r = norm.NFC.String(string(runes))
|
||||
} else if t.Name == "ref" { //remove references
|
||||
r = norm.NFC.String(string(runeList))
|
||||
} else if string(t.Name) == "ref" { //remove references
|
||||
return ""
|
||||
} else if t.Name == "br" { //new line
|
||||
} else if string(t.Name) == "br" { //new line
|
||||
return "\n"
|
||||
} else if t.Name == "script" {
|
||||
} else if string(t.Name) == "script" {
|
||||
return ""
|
||||
}
|
||||
return
|
||||
|
@ -106,7 +110,11 @@ func ParseHTML(text string, index int, depth int) (i int, html *HTML) {
|
|||
for i = index; i < len(text); i++ {
|
||||
c = text[i]
|
||||
|
||||
if c == '<' {
|
||||
if c == '<' && i < len(text)-1 && text[i+1] == '/' {
|
||||
isTerminating = true
|
||||
readingTag = true
|
||||
readingParameters = false
|
||||
} else if c == '<' {
|
||||
newTag := &HTMLTag{
|
||||
Parent: tag,
|
||||
}
|
||||
|
@ -128,30 +136,33 @@ func ParseHTML(text string, index int, depth int) (i int, html *HTML) {
|
|||
|
||||
if isTerminating {
|
||||
tagDepth--
|
||||
tag = tag.Parent
|
||||
if tag != nil {
|
||||
tag = tag.Parent
|
||||
}
|
||||
isTerminating = false
|
||||
}
|
||||
if tagDepth == 0 || tag.Parent == nil {
|
||||
if tagDepth == 0 {
|
||||
return i + 1, html
|
||||
}
|
||||
} else if readingTag && c == '/' {
|
||||
isTerminating = true
|
||||
} else if readingTag {
|
||||
} else if !isTerminating && readingTag {
|
||||
if c == ' ' {
|
||||
readingParameters = true
|
||||
}
|
||||
if readingParameters {
|
||||
tag.Parameters += string(c)
|
||||
tag.Parameters = append(tag.Parameters, c)
|
||||
} else {
|
||||
tag.Name += string(c)
|
||||
tag.Name = append(tag.Name, c)
|
||||
}
|
||||
} else if tagDepth > 0 {
|
||||
if len(tag.Content) == 0 || tag.Content[len(tag.Content)-1].Name != "#text" {
|
||||
} else if !isTerminating && tagDepth > 0 {
|
||||
if len(tag.Content) == 0 || string(tag.Content[len(tag.Content)-1].Name) != "#text" {
|
||||
tag.Content = append(tag.Content, &HTMLTag{
|
||||
Parent: tag,
|
||||
Name: "#text",
|
||||
Name: []byte("#text"),
|
||||
})
|
||||
}
|
||||
tag.Content[len(tag.Content)-1].Parameters += string(c)
|
||||
tag.Content[len(tag.Content)-1].Parameters = append(tag.Content[len(tag.Content)-1].Parameters, c)
|
||||
}
|
||||
}
|
||||
|
||||
|
|
Loading…
Reference in a new issue