wikitext-parser/html.go

119 lines
2.5 KiB
Go

package wikitext_parser
import (
"golang.org/x/text/unicode/norm"
"unicode"
)
// HTML is the result produced by ParseHTML: a wrapper around the root tag of
// one parsed HTML-like element.
type HTML struct {
// Tag is the outermost tag that was parsed. ParseHTML leaves it nil when
// no opening tag was recorded.
Tag *HTMLTag
}
// HTMLTag is one node of the tree built by ParseHTML. A node is either an
// element (Name holds the tag name) or a run of plain text (Name is "#text").
type HTMLTag struct {
// Parent is the enclosing element, or nil for the outermost tag.
Parent *HTMLTag
// Name is the tag name as written in the source, or "#text" for a text node.
Name []byte
// Parameters holds, for an element, the raw bytes that follow the tag name,
// starting with the separating space. For a "#text" node it holds the text
// itself.
Parameters []byte
// Content lists the child nodes (elements and "#text" nodes) in source order.
Content []*HTMLTag
}
// String renders the tag and its descendants as plain text.
//
// Rendering depends on the tag name:
//   - "#text" yields the raw text stored in Parameters;
//   - "ref" and "script" yield the empty string, dropping their content;
//   - "br" yields a single newline;
//   - "del" yields its content with a strikethrough applied through the
//     combining long stroke overlay (U+0336);
//   - every other tag yields the concatenation of its children.
//
// Tag names are compared case-sensitively.
func (t *HTMLTag) String() (r string) {
	switch string(t.Name) {
	case "ref", "script":
		// References and scripts contribute no visible text, so their
		// children are not rendered at all.
		return ""
	case "br":
		return "\n"
	case "#text":
		r = string(t.Parameters)
	default:
		// Collect the children in a byte buffer rather than with repeated
		// string concatenation, which copies the whole result on every step.
		var buf []byte
		for _, child := range t.Content {
			buf = append(buf, child.String()...)
		}
		r = string(buf)
	}
	if string(t.Name) != "del" {
		return r
	}

	// Decompose first so that accented Latin letters expose their ASCII base
	// letter; only ASCII runes receive the overlay, so non-ASCII base
	// characters are left unstruck.
	decomposed := []rune(norm.NFD.String(r))
	struck := make([]rune, 0, 2*len(decomposed))
	for _, ch := range decomposed {
		struck = append(struck, ch)
		// A combining mark attaches to the character that precedes it, so the
		// overlay has to follow its base rune. Control characters such as a
		// newline cannot carry a mark and are skipped.
		if ch <= unicode.MaxASCII && !unicode.IsControl(ch) {
			struck = append(struck, '\u0336') // combining long stroke overlay
		}
	}
	return norm.NFC.String(string(struck))
}
// ParseHTML parses a single HTML-like element, including any nested elements,
// from text. Scanning starts at byte offset index, which is expected to point
// at the element's opening '<'.
//
// It returns the offset just past the '>' that brings the nesting depth back
// to zero, together with the parsed tree. If the input ends before that
// happens, the returned offset is len(text) (or index itself when index lies
// beyond the end) and the tree holds whatever was parsed so far. The returned
// html is never nil, but html.Tag stays nil when no opening tag was recorded.
//
// Text between tags is stored in child nodes named "#text" whose Parameters
// hold the raw bytes. Closing tags are not matched by name: any "</...>"
// closes the innermost open element. A '/' inside an opening tag marks the
// element as self-closing only when nothing but whitespace separates it from
// the closing '>'; any other '/' (for example inside an attribute value such
// as href="/wiki/Foo") is kept as part of the tag's name or parameters.
//
// The depth parameter is not used by this function.
func ParseHTML(text string, index int, depth int) (i int, html *HTML) {
	html = &HTML{}
	var (
		tag               *HTMLTag // innermost element that is still open
		tagDepth          int      // number of currently open elements
		readingTag        bool     // between a '<' and its matching '>'
		readingParameters bool     // past the first space of an opening tag
		isTerminating     bool     // the tag being read closes an element
	)
	for i = index; i < len(text); i++ {
		c := text[i]
		switch {
		case c == '<' && i < len(text)-1 && text[i+1] == '/':
			// Start of a closing tag; its name is skipped, not recorded.
			isTerminating = true
			readingTag = true
			readingParameters = false
		case c == '<':
			// Start of an opening tag: create the node and descend into it.
			child := &HTMLTag{Parent: tag}
			if tag != nil {
				tag.Content = append(tag.Content, child)
			}
			tag = child
			readingTag = true
			readingParameters = false
			isTerminating = false
			if tagDepth == 0 && html.Tag == nil {
				html.Tag = tag
			}
			tagDepth++
		case readingTag && c == '>':
			readingTag = false
			readingParameters = false
			if isTerminating {
				// A closing or self-closing tag ends the innermost element.
				tagDepth--
				if tag != nil {
					tag = tag.Parent
				}
				isTerminating = false
			}
			if tagDepth == 0 {
				return i + 1, html
			}
		case readingTag && c == '/' && selfClosingSlashAt(text, i):
			isTerminating = true
		case !isTerminating && readingTag:
			// Inside an opening tag: the first space switches from collecting
			// the name to collecting the raw parameters (space included).
			if c == ' ' {
				readingParameters = true
			}
			if readingParameters {
				tag.Parameters = append(tag.Parameters, c)
			} else {
				tag.Name = append(tag.Name, c)
			}
		case !isTerminating && tagDepth > 0:
			// Plain text inside an element: extend the trailing "#text" child,
			// creating one if the last child is an element or there is none.
			if n := len(tag.Content); n == 0 || string(tag.Content[n-1].Name) != "#text" {
				tag.Content = append(tag.Content, &HTMLTag{
					Parent: tag,
					Name:   []byte("#text"),
				})
			}
			last := tag.Content[len(tag.Content)-1]
			last.Parameters = append(last.Parameters, c)
		}
	}
	return i, html
}

// selfClosingSlashAt reports whether the '/' at text[pos] ends a tag, meaning
// that only whitespace separates it from the closing '>' or from the end of
// the input.
func selfClosingSlashAt(text string, pos int) bool {
	for j := pos + 1; j < len(text); j++ {
		switch text[j] {
		case ' ', '\t', '\n', '\r':
			continue
		case '>':
			return true
		default:
			return false
		}
	}
	return true
}