582 lines
13 KiB
Go
582 lines
13 KiB
Go
package wikiparser
|
|
|
|
import (
|
|
"golang.org/x/text/unicode/norm"
|
|
"strings"
|
|
)
|
|
|
|
// NormalizeWikiTitle returns title with every space character replaced by an
// underscore. No other characters are changed.
func NormalizeWikiTitle(title string) string {
	return strings.ReplaceAll(title, " ", "_")
}
|
|
|
|
// Link is a single-bracket (or single-brace) link found in WikiText, as
// produced by ParseLink.
type Link struct {
	// URL is the first whitespace-delimited token inside the link.
	URL string
	// IsExternal is true when the link was opened with '['.
	IsExternal bool
	// Name holds everything that follows the URL: plain-text tokens as
	// strings and nested templates as *Template values.
	Name []interface{}
}
|
|
|
|
type HTML struct {
|
|
Tag *HTMLTag
|
|
}
|
|
|
|
// HTMLTag is one node of the tree built by ParseHTML.
type HTMLTag struct {
	// Parent is the tag that was open when this one started; nil for the
	// outermost tag.
	Parent *HTMLTag
	// Name is the tag name. The special name "#text" marks a text node.
	Name string
	// Parameters holds the raw attribute text of a tag (starting at the first
	// space after the name). For a "#text" node it holds the text itself.
	Parameters string
	// Content lists the child nodes in source order.
	Content []*HTMLTag
}
|
|
|
|
func (t *HTMLTag) String() (r string) {
|
|
if t.Name == "#text" {
|
|
return t.Parameters
|
|
}
|
|
for _, c := range t.Content {
|
|
r += c.String()
|
|
}
|
|
|
|
if t.Name == "del" { //add strikethrough
|
|
var runes []rune
|
|
for _, r := range []rune(norm.NFD.String(r)) {
|
|
runes = append(runes, '\u0336') //combining long stroke overlay
|
|
runes = append(runes, r)
|
|
}
|
|
r = norm.NFC.String(string(runes))
|
|
} else if t.Name == "ref" { //remove references
|
|
return ""
|
|
} else if t.Name == "br" { //new line
|
|
return "\n"
|
|
} else if t.Name == "script" {
|
|
return ""
|
|
}
|
|
return
|
|
}
|
|
|
|
// NewLineToken is the marker value ParseTemplate stores as a template
// parameter for each line break it encounters inside a template.
type NewLineToken struct{}
|
|
|
|
//ParseWikiText small WikiText parser that extracts text, Templates, and its arguments/parameters
|
|
func ParseWikiText(text string) (result []interface{}) {
|
|
index := 0
|
|
|
|
for index < len(text) {
|
|
templateIndex := strings.Index(text[index:], "{{")
|
|
linkIndex := strings.Index(text[index:], "[[")
|
|
if templateIndex == -1 && linkIndex == -1 {
|
|
t := strings.TrimSpace(text[index:])
|
|
if len(t) > 0 {
|
|
result = append(result, text[index:])
|
|
}
|
|
break
|
|
} else {
|
|
bestIndex := templateIndex
|
|
if templateIndex == -1 {
|
|
bestIndex = linkIndex
|
|
} else {
|
|
if linkIndex != -1 && linkIndex < bestIndex {
|
|
bestIndex = linkIndex
|
|
}
|
|
}
|
|
|
|
t := strings.TrimSpace(text[index : index+bestIndex])
|
|
if len(t) > 0 {
|
|
result = append(result, text[index:index+bestIndex])
|
|
}
|
|
var tpl *Template
|
|
index, tpl = ParseTemplate(text, index+bestIndex+2, 0, text[index+bestIndex])
|
|
if tpl != nil {
|
|
result = append(result, tpl)
|
|
}
|
|
}
|
|
}
|
|
|
|
return
|
|
}
|
|
|
|
func ParseHTML(text string, index int, depth int) (i int, html *HTML) {
|
|
var c byte
|
|
|
|
html = &HTML{}
|
|
|
|
readingTag := false
|
|
readingParameters := false
|
|
isTerminating := false
|
|
var tag *HTMLTag
|
|
tagDepth := 0
|
|
|
|
for i = index; i < len(text); i++ {
|
|
c = text[i]
|
|
|
|
if c == '<' {
|
|
newTag := &HTMLTag{
|
|
Parent: tag,
|
|
}
|
|
|
|
if tag != nil {
|
|
tag.Content = append(tag.Content, newTag)
|
|
}
|
|
tag = newTag
|
|
readingTag = true
|
|
readingParameters = false
|
|
isTerminating = false
|
|
if tagDepth == 0 && html.Tag == nil {
|
|
html.Tag = tag
|
|
}
|
|
tagDepth++
|
|
} else if readingTag && c == '>' {
|
|
readingTag = false
|
|
readingParameters = false
|
|
|
|
if isTerminating {
|
|
tagDepth--
|
|
tag = tag.Parent
|
|
}
|
|
if tagDepth == 0 || tag.Parent == nil {
|
|
return i + 1, html
|
|
}
|
|
} else if readingTag && c == '/' {
|
|
isTerminating = true
|
|
} else if readingTag {
|
|
if c == ' ' {
|
|
readingParameters = true
|
|
}
|
|
if readingParameters {
|
|
tag.Parameters += string(c)
|
|
} else {
|
|
tag.Name += string(c)
|
|
}
|
|
} else if tagDepth > 0 {
|
|
if len(tag.Content) == 0 || tag.Content[len(tag.Content)-1].Name != "#text" {
|
|
tag.Content = append(tag.Content, &HTMLTag{
|
|
Parent: tag,
|
|
Name: "#text",
|
|
})
|
|
}
|
|
tag.Content[len(tag.Content)-1].Parameters += string(c)
|
|
}
|
|
}
|
|
|
|
return
|
|
}
|
|
|
|
func ParseLink(text string, index int, depth int, startCharacter byte) (i int, link *Link) {
|
|
|
|
var c byte
|
|
lastToken := index
|
|
|
|
addValue := func() int {
|
|
if lastToken < len(text) && i-lastToken > 0 {
|
|
t := strings.TrimSpace(text[lastToken:i])
|
|
if len(t) > 0 {
|
|
if link == nil {
|
|
link = &Link{URL: t, IsExternal: startCharacter == '['}
|
|
} else {
|
|
link.Name = append(link.Name, t)
|
|
}
|
|
}
|
|
|
|
return len(t)
|
|
}
|
|
|
|
return 0
|
|
}
|
|
|
|
for i = index; i < len(text); i++ {
|
|
c = text[i]
|
|
|
|
if c == ' ' || c == '\t' && link == nil {
|
|
addValue()
|
|
lastToken = i + 1
|
|
} else if startCharacter == '{' && c == '}' {
|
|
addValue()
|
|
i += 1
|
|
break
|
|
} else if startCharacter == '[' && c == ']' { //end of link
|
|
addValue()
|
|
i += 1
|
|
break
|
|
//template or light might have parameters
|
|
} else if (c == '{' && i < len(text)-1 && text[i+1] == '{') || (c == '[' && i < len(text)-1 && text[i+1] == '[') {
|
|
addValue()
|
|
var tpl *Template
|
|
var scanIndex int
|
|
scanIndex, tpl = ParseTemplate(text, i+2, depth+1, c)
|
|
if tpl != nil {
|
|
if link == nil {
|
|
link = &Link{}
|
|
}
|
|
|
|
link.Name = append(link.Name, tpl)
|
|
}
|
|
lastToken = scanIndex
|
|
i = scanIndex - 1
|
|
}
|
|
}
|
|
|
|
return
|
|
}
|
|
|
|
func ParseTemplate(text string, index int, depth int, startCharacter byte) (i int, template *Template) {
|
|
|
|
var c byte
|
|
lastToken := index
|
|
|
|
var key string
|
|
|
|
addValue := func() int {
|
|
if lastToken < len(text) && i-lastToken > 0 {
|
|
t := strings.TrimSpace(text[lastToken:i])
|
|
if len(t) > 0 {
|
|
if template == nil {
|
|
template = NewTemplate(t, startCharacter == '[')
|
|
} else {
|
|
if key == "" {
|
|
template.AddParameterUnkeyed(text[lastToken:i])
|
|
} else {
|
|
template.AddParameter(key, text[lastToken:i])
|
|
}
|
|
}
|
|
}
|
|
|
|
return len(t)
|
|
}
|
|
|
|
return 0
|
|
}
|
|
addKey := func() {
|
|
if lastToken < len(text) && i-lastToken > 0 {
|
|
t := strings.TrimSpace(text[lastToken:i])
|
|
if len(t) > 0 {
|
|
key = t
|
|
}
|
|
}
|
|
}
|
|
|
|
afterNewLine := false
|
|
|
|
for i = index; i < len(text); i++ {
|
|
c = text[i]
|
|
|
|
if startCharacter == '{' && c == '}' && i < len(text)-1 && text[i+1] == '}' { //end of template
|
|
addValue()
|
|
i += 2
|
|
break
|
|
} else if startCharacter == '[' && c == ']' && i < len(text)-1 && text[i+1] == ']' { //end of link
|
|
addValue()
|
|
i += 2
|
|
break
|
|
//template or light might have parameters
|
|
} else if (c == '{' && i < len(text)-1 && text[i+1] == '{') || (c == '[' && i < len(text)-1 && text[i+1] == '[') {
|
|
addValue()
|
|
var tpl *Template
|
|
var scanIndex int
|
|
scanIndex, tpl = ParseTemplate(text, i+2, depth+1, c)
|
|
if tpl != nil {
|
|
if key == "" {
|
|
template.AddParameterUnkeyed(tpl)
|
|
} else {
|
|
template.AddParameter(key, tpl)
|
|
}
|
|
}
|
|
lastToken = scanIndex
|
|
i = scanIndex - 1
|
|
} else if (c == '{' && i < len(text)-1 && text[i+1] != '{' && text[i+1] != '[') || (c == '[' && i < len(text)-1 && text[i+1] != '[' && text[i+1] != '{') {
|
|
addValue()
|
|
var link *Link
|
|
var scanIndex int
|
|
scanIndex, link = ParseLink(text, i+1, depth+1, c)
|
|
if link != nil && template != nil {
|
|
if key == "" {
|
|
template.AddParameterUnkeyed(link)
|
|
} else {
|
|
template.AddParameter(key, link)
|
|
}
|
|
}
|
|
lastToken = scanIndex
|
|
i = scanIndex - 1
|
|
} else if c == '<' { //html trigger
|
|
addValue()
|
|
var html *HTML
|
|
var scanIndex int
|
|
scanIndex, html = ParseHTML(text, i, depth+1)
|
|
if html != nil && template != nil {
|
|
if key == "" {
|
|
template.AddParameterUnkeyed(html)
|
|
} else {
|
|
template.AddParameter(key, html)
|
|
}
|
|
}
|
|
lastToken = scanIndex
|
|
i = scanIndex - 1
|
|
} else if c == '|' {
|
|
hasTemplate := template != nil
|
|
addValue()
|
|
lastToken = i + 1
|
|
if hasTemplate {
|
|
template.UnkeyedIndex++
|
|
}
|
|
key = ""
|
|
} else if c == '\n' {
|
|
addValue()
|
|
lastToken = i + 1
|
|
afterNewLine = true
|
|
|
|
if template != nil {
|
|
if key == "" {
|
|
template.AddParameterUnkeyed(NewLineToken{})
|
|
} else {
|
|
template.AddParameter(key, NewLineToken{})
|
|
}
|
|
}
|
|
} else if afterNewLine && (c == '*' || c == '#') {
|
|
addValue()
|
|
var list *UnorderedList
|
|
var scanIndex int
|
|
scanIndex, list = ParseUnorderedList(text, i, depth+1, 1, c)
|
|
if list != nil {
|
|
if key == "" {
|
|
template.AddParameterUnkeyed(list)
|
|
} else {
|
|
template.AddParameter(key, list)
|
|
}
|
|
}
|
|
lastToken = scanIndex
|
|
i = scanIndex - 1
|
|
} else if c == ';' {
|
|
addValue()
|
|
var list *DescriptionList
|
|
var scanIndex int
|
|
scanIndex, list = ParseDescriptionList(text, i+1, depth+1)
|
|
if list != nil {
|
|
if key == "" {
|
|
template.AddParameterUnkeyed(list)
|
|
} else {
|
|
template.AddParameter(key, list)
|
|
}
|
|
}
|
|
lastToken = scanIndex
|
|
i = scanIndex - 1
|
|
} else if afterNewLine && c == ':' {
|
|
addValue()
|
|
lastToken = i + 1
|
|
} else if c == '=' {
|
|
if key == "" {
|
|
addKey()
|
|
lastToken = i + 1
|
|
}
|
|
}
|
|
|
|
if afterNewLine && c != '\n' && c != ' ' && c != '\t' {
|
|
afterNewLine = false
|
|
}
|
|
}
|
|
|
|
return
|
|
}
|
|
|
|
func ParseUnorderedList(text string, index int, depth int, indent int, startCharacter byte) (i int, list *UnorderedList) {
|
|
|
|
list = &UnorderedList{}
|
|
var c byte
|
|
lastToken := index
|
|
|
|
var currentValue []interface{}
|
|
|
|
addValue := func() int {
|
|
if lastToken < len(text) && i-lastToken > 0 {
|
|
t := strings.TrimSpace(text[lastToken:i])
|
|
if len(t) > 0 {
|
|
currentValue = append(currentValue, text[lastToken:i])
|
|
}
|
|
|
|
return len(t)
|
|
}
|
|
|
|
return 0
|
|
}
|
|
|
|
afterNewLine := true
|
|
processIndent := true
|
|
|
|
indentation := 0
|
|
|
|
for i = index; i < len(text); i++ {
|
|
c = text[i]
|
|
|
|
if c == ' ' || c == '\t' {
|
|
//keep the check for new line
|
|
if !afterNewLine {
|
|
processIndent = false
|
|
}
|
|
} else if processIndent && c == startCharacter {
|
|
indentation++
|
|
lastToken = i + 1
|
|
afterNewLine = false
|
|
} else if afterNewLine { //no new list values
|
|
if len(currentValue) > 0 {
|
|
list.Entries = append(list.Entries, currentValue)
|
|
currentValue = []interface{}{}
|
|
}
|
|
return lastToken, list
|
|
} else if indentation > indent {
|
|
if len(currentValue) > 0 {
|
|
list.Entries = append(list.Entries, currentValue)
|
|
currentValue = []interface{}{}
|
|
}
|
|
var level *UnorderedList
|
|
var scanIndex int
|
|
scanIndex, level = ParseUnorderedList(text, lastToken-indentation, depth+1, indentation, startCharacter)
|
|
if level != nil {
|
|
list.Entries = append(list.Entries, level)
|
|
}
|
|
lastToken = scanIndex
|
|
i = scanIndex - 1
|
|
indentation = 0
|
|
afterNewLine = true
|
|
processIndent = true
|
|
} else if indentation < indent {
|
|
if len(currentValue) > 0 {
|
|
list.Entries = append(list.Entries, currentValue)
|
|
currentValue = []interface{}{}
|
|
}
|
|
return lastToken - indentation, list
|
|
} else if c == '\n' {
|
|
addValue()
|
|
if len(currentValue) > 0 {
|
|
list.Entries = append(list.Entries, currentValue)
|
|
currentValue = []interface{}{}
|
|
}
|
|
indentation = 0
|
|
lastToken = i + 1
|
|
afterNewLine = true
|
|
processIndent = true
|
|
} else if (c == '{' && i < len(text)-1 && text[i+1] == '{') || (c == '[' && i < len(text)-1 && text[i+1] == '[') {
|
|
addValue()
|
|
var tpl *Template
|
|
var scanIndex int
|
|
scanIndex, tpl = ParseTemplate(text, i+2, depth+1, c)
|
|
if tpl != nil {
|
|
currentValue = append(currentValue, tpl)
|
|
}
|
|
lastToken = scanIndex
|
|
i = scanIndex - 1
|
|
} else if (c == '{' && i < len(text)-1 && text[i+1] != '{' && text[i+1] != '[') || (c == '[' && i < len(text)-1 && text[i+1] != '[' && text[i+1] != '{') {
|
|
addValue()
|
|
var link *Link
|
|
var scanIndex int
|
|
scanIndex, link = ParseLink(text, i+1, depth+1, c)
|
|
if link != nil {
|
|
currentValue = append(currentValue, link)
|
|
}
|
|
lastToken = scanIndex
|
|
i = scanIndex - 1
|
|
} else if c == '<' { //html trigger
|
|
addValue()
|
|
var html *HTML
|
|
var scanIndex int
|
|
scanIndex, html = ParseHTML(text, i, depth+1)
|
|
if html != nil {
|
|
currentValue = append(currentValue, html)
|
|
}
|
|
lastToken = scanIndex
|
|
i = scanIndex - 1
|
|
} else {
|
|
processIndent = false
|
|
}
|
|
}
|
|
|
|
return
|
|
}
|
|
|
|
func ParseDescriptionList(text string, index int, depth int) (i int, list *DescriptionList) {
|
|
|
|
var c byte
|
|
lastToken := index
|
|
|
|
list = &DescriptionList{}
|
|
|
|
hasKey := false
|
|
|
|
addValue := func() int {
|
|
if lastToken < len(text) && i-lastToken > 0 {
|
|
t := strings.TrimSpace(text[lastToken:i])
|
|
if len(t) > 0 {
|
|
if !hasKey {
|
|
list.Name = append(list.Name, text[lastToken:i])
|
|
} else {
|
|
list.Entries = append(list.Entries, text[lastToken:i])
|
|
}
|
|
}
|
|
|
|
return len(t)
|
|
}
|
|
|
|
return 0
|
|
}
|
|
|
|
afterNewLine := false
|
|
|
|
for i = index; i < len(text); i++ {
|
|
c = text[i]
|
|
|
|
if c == ' ' || c == '\t' {
|
|
//keep the check for new line
|
|
} else if c == ':' {
|
|
addValue()
|
|
lastToken = i + 1
|
|
afterNewLine = false
|
|
hasKey = true
|
|
} else if afterNewLine { //no new list values
|
|
return lastToken, list
|
|
} else if c == '\n' {
|
|
addValue()
|
|
lastToken = i + 1
|
|
afterNewLine = true
|
|
hasKey = true
|
|
} else if (c == '{' && i < len(text)-1 && text[i+1] == '{') || (c == '[' && i < len(text)-1 && text[i+1] == '[') {
|
|
addValue()
|
|
var tpl *Template
|
|
var scanIndex int
|
|
scanIndex, tpl = ParseTemplate(text, i+2, depth+1, c)
|
|
if tpl != nil {
|
|
if !hasKey {
|
|
list.Name = append(list.Name, tpl)
|
|
} else {
|
|
list.Entries = append(list.Entries, tpl)
|
|
}
|
|
}
|
|
lastToken = scanIndex
|
|
i = scanIndex - 1
|
|
} else if (c == '{' && i < len(text)-1 && text[i+1] != '{' && text[i+1] != '[') || (c == '[' && i < len(text)-1 && text[i+1] != '[' && text[i+1] != '{') {
|
|
addValue()
|
|
var link *Link
|
|
var scanIndex int
|
|
scanIndex, link = ParseLink(text, i+1, depth+1, c)
|
|
if link != nil {
|
|
if !hasKey {
|
|
list.Name = append(list.Name, link)
|
|
} else {
|
|
list.Entries = append(list.Entries, link)
|
|
}
|
|
}
|
|
lastToken = scanIndex
|
|
i = scanIndex - 1
|
|
} else if c == '<' { //html trigger
|
|
addValue()
|
|
var html *HTML
|
|
var scanIndex int
|
|
scanIndex, html = ParseHTML(text, i, depth+1)
|
|
if html != nil {
|
|
if !hasKey {
|
|
list.Name = append(list.Name, html)
|
|
} else {
|
|
list.Entries = append(list.Entries, html)
|
|
}
|
|
}
|
|
lastToken = scanIndex
|
|
i = scanIndex - 1
|
|
}
|
|
}
|
|
|
|
return
|
|
}
|