119 lines
2.5 KiB
Go
119 lines
2.5 KiB
Go
|
package wikitext_parser
|
||
|
|
||
|
import (
|
||
|
"golang.org/x/text/unicode/norm"
|
||
|
"unicode"
|
||
|
)
|
||
|
|
||
|
type HTML struct {
|
||
|
Tag *HTMLTag
|
||
|
}
|
||
|
|
||
|
// HTMLTag is one node of a parsed HTML tree: either an element or a run of
// plain text.
//
// Text runs are stored as nodes whose Name is "#text"; their raw bytes live
// in Parameters and they have no children.
type HTMLTag struct {
	// Parent is the enclosing element, or nil for the root node.
	Parent *HTMLTag
	// Name is the tag name as written in the source, or "#text" for a
	// text node.
	Name []byte
	// Parameters holds the raw attribute text that follows the tag name
	// (starting at the first space). For a "#text" node it holds the text
	// itself.
	Parameters []byte
	// Content lists the child nodes in the order they appear in the source.
	Content []*HTMLTag
}
|
||
|
|
||
|
func (t *HTMLTag) String() (r string) {
|
||
|
if string(t.Name) == "#text" {
|
||
|
r = string(t.Parameters)
|
||
|
} else {
|
||
|
for _, c := range t.Content {
|
||
|
r += c.String()
|
||
|
}
|
||
|
}
|
||
|
|
||
|
if string(t.Name) == "del" { //add strikethrough
|
||
|
var runeList []rune
|
||
|
for _, runeEntry := range []rune(norm.NFD.String(r)) {
|
||
|
if runeEntry <= unicode.MaxASCII {
|
||
|
runeList = append(runeList, '\u0336') //combining long stroke overlay
|
||
|
}
|
||
|
runeList = append(runeList, runeEntry)
|
||
|
}
|
||
|
r = norm.NFC.String(string(runeList))
|
||
|
} else if string(t.Name) == "ref" { //remove references
|
||
|
return ""
|
||
|
} else if string(t.Name) == "br" { //new line
|
||
|
return "\n"
|
||
|
} else if string(t.Name) == "script" {
|
||
|
return ""
|
||
|
}
|
||
|
return
|
||
|
}
|
||
|
|
||
|
func ParseHTML(text string, index int, depth int) (i int, html *HTML) {
|
||
|
var c byte
|
||
|
|
||
|
html = &HTML{}
|
||
|
|
||
|
readingTag := false
|
||
|
readingParameters := false
|
||
|
isTerminating := false
|
||
|
var tag *HTMLTag
|
||
|
tagDepth := 0
|
||
|
|
||
|
for i = index; i < len(text); i++ {
|
||
|
c = text[i]
|
||
|
|
||
|
if c == '<' && i < len(text)-1 && text[i+1] == '/' {
|
||
|
isTerminating = true
|
||
|
readingTag = true
|
||
|
readingParameters = false
|
||
|
} else if c == '<' {
|
||
|
newTag := &HTMLTag{
|
||
|
Parent: tag,
|
||
|
}
|
||
|
|
||
|
if tag != nil {
|
||
|
tag.Content = append(tag.Content, newTag)
|
||
|
}
|
||
|
tag = newTag
|
||
|
readingTag = true
|
||
|
readingParameters = false
|
||
|
isTerminating = false
|
||
|
if tagDepth == 0 && html.Tag == nil {
|
||
|
html.Tag = tag
|
||
|
}
|
||
|
tagDepth++
|
||
|
} else if readingTag && c == '>' {
|
||
|
readingTag = false
|
||
|
readingParameters = false
|
||
|
|
||
|
if isTerminating {
|
||
|
tagDepth--
|
||
|
if tag != nil {
|
||
|
tag = tag.Parent
|
||
|
}
|
||
|
isTerminating = false
|
||
|
}
|
||
|
if tagDepth == 0 {
|
||
|
return i + 1, html
|
||
|
}
|
||
|
} else if readingTag && c == '/' {
|
||
|
isTerminating = true
|
||
|
} else if !isTerminating && readingTag {
|
||
|
if c == ' ' {
|
||
|
readingParameters = true
|
||
|
}
|
||
|
if readingParameters {
|
||
|
tag.Parameters = append(tag.Parameters, c)
|
||
|
} else {
|
||
|
tag.Name = append(tag.Name, c)
|
||
|
}
|
||
|
} else if !isTerminating && tagDepth > 0 {
|
||
|
if len(tag.Content) == 0 || string(tag.Content[len(tag.Content)-1].Name) != "#text" {
|
||
|
tag.Content = append(tag.Content, &HTMLTag{
|
||
|
Parent: tag,
|
||
|
Name: []byte("#text"),
|
||
|
})
|
||
|
}
|
||
|
tag.Content[len(tag.Content)-1].Parameters = append(tag.Content[len(tag.Content)-1].Parameters, c)
|
||
|
}
|
||
|
}
|
||
|
|
||
|
return
|
||
|
}
|