touhouwiki-mirror/wikiparser/parser.go
2022-02-20 16:20:11 +01:00

582 lines
13 KiB
Go

package wikiparser
import (
"golang.org/x/text/unicode/norm"
"strings"
)
// NormalizeWikiTitle returns title with every space character replaced by an
// underscore. No other character (including tabs) is altered.
func NormalizeWikiTitle(title string) string {
	return strings.ReplaceAll(title, " ", "_")
}
// Link is the result of ParseLink: a target followed by optional extra parts.
type Link struct {
// URL is the first text token read inside the link. It stays empty when the
// link was created from a nested template before any text was seen.
URL string
// IsExternal is set by ParseLink to whether the opening character was '['.
IsExternal bool
// Name collects what follows the URL: text pieces (string) and nested
// templates (*Template), in the order they were read.
Name []interface{}
}
// HTML is the value returned by ParseHTML.
type HTML struct {
// Tag is the first tag opened during the parse; nil if no '<' was read.
Tag *HTMLTag
}
// HTMLTag is one node of the tree built by ParseHTML. A node is either a tag
// or, when Name is "#text", a run of plain text.
type HTMLTag struct {
// Parent is the tag that was current when this node was created; nil for
// the first tag of a parse.
Parent *HTMLTag
// Name is the tag name (the characters read before the first space), or
// "#text" for a text node.
Name string
// Parameters holds the raw characters from the first space up to the end of
// the tag. For a "#text" node it holds the text itself.
Parameters string
// Content lists the child nodes in the order they were read.
Content []*HTMLTag
}
// String renders the node and its children as plain text.
//
//   - A "#text" node yields its text, which is stored in Parameters.
//   - "ref" and "script" render as the empty string.
//   - "br" renders as a single newline.
//   - "del" renders its children struck through: the text is decomposed (NFD),
//     every rune is followed by U+0336 COMBINING LONG STROKE OVERLAY, and the
//     result is recomposed (NFC).
//   - Any other tag renders as the concatenation of its children.
func (t *HTMLTag) String() (r string) {
	switch t.Name {
	case "#text":
		return t.Parameters
	case "ref", "script": // references and scripts contribute no display text
		return ""
	case "br": // line break
		return "\n"
	}

	var sb strings.Builder
	for _, child := range t.Content {
		sb.WriteString(child.String())
	}
	r = sb.String()

	if t.Name == "del" {
		decomposed := []rune(norm.NFD.String(r))
		struck := make([]rune, 0, 2*len(decomposed))
		for _, ch := range decomposed {
			// A combining mark attaches to the character before it, so the
			// overlay must follow the rune it strikes. Emitting it first
			// leaves an orphan mark at the start and the last rune unstruck.
			struck = append(struck, ch, '\u0336')
		}
		r = norm.NFC.String(string(struck))
	}
	return r
}
// NewLineToken is an empty marker value. ParseTemplate adds one as a template
// parameter for every '\n' it reads once the template has been created.
type NewLineToken struct {
}
// ParseWikiText is a small WikiText parser that extracts text, templates and
// their arguments/parameters.
//
// The input is scanned left to right for the next "{{" or "[[". Text found
// before such an opener is appended to result as a string (untrimmed) unless
// it consists only of whitespace. The construct itself is handed to
// ParseTemplate and, when that yields a non-nil template, appended as a
// *Template. Scanning resumes at the index ParseTemplate returns.
func ParseWikiText(text string) (result []interface{}) {
	index := 0
	for index < len(text) {
		rest := text[index:]
		templateIndex := strings.Index(rest, "{{")
		linkIndex := strings.Index(rest, "[[")

		// Nothing left to open: keep the tail if it carries any content.
		if templateIndex == -1 && linkIndex == -1 {
			if strings.TrimSpace(rest) != "" {
				result = append(result, rest)
			}
			break
		}

		// Take whichever opener comes first; -1 means "not present".
		bestIndex := templateIndex
		if bestIndex == -1 || (linkIndex != -1 && linkIndex < bestIndex) {
			bestIndex = linkIndex
		}
		start := index + bestIndex

		if leading := text[index:start]; strings.TrimSpace(leading) != "" {
			result = append(result, leading)
		}

		// text[start] is '{' or '[' and tells ParseTemplate which closer to expect.
		var tpl *Template
		index, tpl = ParseTemplate(text, start+2, 0, text[start])
		if tpl != nil {
			result = append(result, tpl)
		}
	}
	return result
}
// ParseHTML reads HTML-like markup from text starting at index and builds a
// tree of HTMLTag nodes.
//
// It returns the index just past the '>' at which parsing stopped together
// with the parsed HTML, whose Tag is the first tag opened. Parsing stops at a
// '>' once no tag is open any more or the current tag has no parent; a call
// that starts on '<' therefore returns after the first '>' unless another '<'
// appears inside that tag. If the text runs out first, len(text) is returned
// along with whatever was built. The returned *HTML is never nil.
//
// Inside a tag, characters before the first space form Name and the space and
// everything after it form Parameters. Any '/' inside a tag marks the tag as
// terminating and is not stored, including one inside an attribute value.
// depth is not used by this function.
func ParseHTML(text string, index int, depth int) (i int, html *HTML) {
	html = &HTML{}

	var (
		current  *HTMLTag // tag currently being read or filled
		inTag    bool     // between '<' and '>'
		inParams bool     // past the first space inside the tag
		closing  bool     // a '/' was seen inside the tag
		open     int      // number of tags opened and not yet terminated
	)

	for i = index; i < len(text); i++ {
		ch := text[i]
		switch {
		case ch == '<':
			child := &HTMLTag{Parent: current}
			if current != nil {
				current.Content = append(current.Content, child)
			}
			current = child
			inTag, inParams, closing = true, false, false
			if open == 0 && html.Tag == nil {
				html.Tag = current
			}
			open++

		case inTag && ch == '>':
			inTag, inParams = false, false
			if closing {
				open--
				current = current.Parent
			}
			// open == 0 is tested first: current may be nil in that case.
			if open == 0 || current.Parent == nil {
				return i + 1, html
			}

		case inTag && ch == '/':
			closing = true

		case inTag:
			if ch == ' ' {
				inParams = true
			}
			if inParams {
				current.Parameters += string(ch)
			} else {
				current.Name += string(ch)
			}

		case open > 0:
			// Plain text between tags: extend the trailing "#text" child,
			// creating it first when the last child is not a text node.
			kids := current.Content
			if len(kids) == 0 || kids[len(kids)-1].Name != "#text" {
				current.Content = append(current.Content, &HTMLTag{
					Parent: current,
					Name:   "#text",
				})
			}
			last := current.Content[len(current.Content)-1]
			last.Parameters += string(ch)
		}
	}
	return i, html
}
// ParseLink parses a single-delimiter construct such as "[url display text]".
//
// text is the full input and index the position just after the opening
// character. startCharacter is that opening character: '[' expects a closing
// ']' and '{' a closing '}'. depth is only passed on, incremented, to nested
// ParseTemplate calls.
//
// The first whitespace-delimited token becomes Link.URL. Everything after it
// is collected in Link.Name: runs of text as trimmed strings, and nested
// "{{...}}" / "[[...]]" constructs as *Template values. Link.IsExternal is set
// to startCharacter == '['.
//
// It returns the index just past the closing character (or len(text) if none
// was found) and the link, which is nil when nothing was read.
func ParseLink(text string, index int, depth int, startCharacter byte) (i int, link *Link) {
	lastToken := index

	// addValue stores the trimmed text between lastToken and i: as the URL of
	// a new link if none exists yet, otherwise as another Name part.
	addValue := func() {
		if lastToken >= len(text) || i <= lastToken {
			return
		}
		t := strings.TrimSpace(text[lastToken:i])
		if t == "" {
			return
		}
		if link == nil {
			link = &Link{URL: t, IsExternal: startCharacter == '['}
		} else {
			link.Name = append(link.Name, t)
		}
	}

	for i = index; i < len(text); i++ {
		c := text[i]
		hasNext := i+1 < len(text)

		switch {
		// Whitespace only delimits the URL. The parentheses matter: without
		// them '&&' binds tighter than '||', so every space split the display
		// text into separate words while a tab did not.
		case (c == ' ' || c == '\t') && link == nil:
			addValue()
			lastToken = i + 1

		// End of the link.
		case startCharacter == '{' && c == '}',
			startCharacter == '[' && c == ']':
			addValue()
			return i + 1, link

		// A nested template or link; it might have parameters of its own.
		case hasNext && ((c == '{' && text[i+1] == '{') || (c == '[' && text[i+1] == '[')):
			addValue()
			scanIndex, tpl := ParseTemplate(text, i+2, depth+1, c)
			if tpl != nil {
				if link == nil {
					link = &Link{}
				}
				link.Name = append(link.Name, tpl)
			}
			lastToken = scanIndex
			i = scanIndex - 1 // the loop increment moves on to scanIndex
		}
	}
	return i, link
}
// ParseTemplate parses the body of a double-delimiter construct: "{{...}}"
// when startCharacter is '{' or "[[...]]" when it is '['.
//
// text is the full input and index the position just after the two opening
// characters. depth is only passed on, incremented, to nested parser calls.
//
// The first non-blank run of text becomes the template via
// NewTemplate(name, startCharacter == '['). After that:
//
//   - '|' ends the current parameter, clears the pending key and increments
//     template.UnkeyedIndex when the template already existed.
//   - '=' turns the text read so far into the key of the current parameter,
//     unless a key is already pending.
//   - '\n' ends the current text run and adds a NewLineToken parameter.
//   - Nested "{{"/"[[" constructs, single '{'/'[' links, '<' HTML, '*'/'#'
//     lists directly after a newline, and ';' description lists are parsed by
//     the matching Parse* function and stored as parameters.
//   - ':' directly after a newline only ends the current text run.
//
// Note that ';' starts a description list wherever it appears, not only
// directly after a newline.
//
// Values are stored with AddParameter when a key is pending and with
// AddParameterUnkeyed otherwise. Values met before the template name has been
// read are dropped.
//
// It returns the index just past the closing "}}" / "]]" (or len(text) when
// the construct is not closed) and the template, which is nil when no name
// was read.
func ParseTemplate(text string, index int, depth int, startCharacter byte) (i int, template *Template) {
	lastToken := index
	var key string

	// addParameter stores value under the pending key, or positionally when
	// there is none. The nil check covers every caller, so a nested template,
	// list or description list that precedes the template name can no longer
	// reach a method call on a nil *Template.
	addParameter := func(value interface{}) {
		if template == nil {
			return
		}
		if key == "" {
			template.AddParameterUnkeyed(value)
		} else {
			template.AddParameter(key, value)
		}
	}

	// addValue consumes the text between lastToken and i. Blank text is
	// ignored; the first non-blank text names the template (trimmed), later
	// text is stored untrimmed as a parameter.
	addValue := func() {
		if lastToken >= len(text) || i <= lastToken {
			return
		}
		raw := text[lastToken:i]
		t := strings.TrimSpace(raw)
		if t == "" {
			return
		}
		if template == nil {
			template = NewTemplate(t, startCharacter == '[')
		} else {
			addParameter(raw)
		}
	}

	// addKey makes the trimmed text between lastToken and i the pending key,
	// leaving the key untouched when that text is blank.
	addKey := func() {
		if lastToken >= len(text) || i <= lastToken {
			return
		}
		if t := strings.TrimSpace(text[lastToken:i]); t != "" {
			key = t
		}
	}

	afterNewLine := false
	for i = index; i < len(text); i++ {
		c := text[i]

		// next is the byte after c; only meaningful when hasNext is true.
		var next byte
		hasNext := i+1 < len(text)
		if hasNext {
			next = text[i+1]
		}

		switch {
		// End of template ("}}") or end of link ("]]").
		case startCharacter == '{' && c == '}' && hasNext && next == '}',
			startCharacter == '[' && c == ']' && hasNext && next == ']':
			addValue()
			return i + 2, template

		// A nested template or link; it might have parameters of its own.
		case hasNext && ((c == '{' && next == '{') || (c == '[' && next == '[')):
			addValue()
			scanIndex, tpl := ParseTemplate(text, i+2, depth+1, c)
			if tpl != nil {
				addParameter(tpl)
			}
			lastToken = scanIndex
			i = scanIndex - 1

		// A single '{' or '[' not followed by another opener.
		case hasNext && (c == '{' || c == '[') && next != '{' && next != '[':
			addValue()
			scanIndex, link := ParseLink(text, i+1, depth+1, c)
			if link != nil {
				addParameter(link)
			}
			lastToken = scanIndex
			i = scanIndex - 1

		case c == '<': // HTML trigger
			addValue()
			scanIndex, html := ParseHTML(text, i, depth+1)
			if html != nil {
				addParameter(html)
			}
			lastToken = scanIndex
			i = scanIndex - 1

		case c == '|':
			// Captured before addValue, which may create the template: the
			// separator after the name must not advance the unkeyed index.
			hasTemplate := template != nil
			addValue()
			lastToken = i + 1
			if hasTemplate {
				template.UnkeyedIndex++
			}
			key = ""

		case c == '\n':
			addValue()
			lastToken = i + 1
			afterNewLine = true
			addParameter(NewLineToken{})

		case afterNewLine && (c == '*' || c == '#'):
			addValue()
			scanIndex, list := ParseUnorderedList(text, i, depth+1, 1, c)
			if list != nil {
				addParameter(list)
			}
			lastToken = scanIndex
			i = scanIndex - 1

		case c == ';':
			addValue()
			scanIndex, list := ParseDescriptionList(text, i+1, depth+1)
			if list != nil {
				addParameter(list)
			}
			lastToken = scanIndex
			i = scanIndex - 1

		case afterNewLine && c == ':':
			addValue()
			lastToken = i + 1

		case c == '=':
			if key == "" {
				addKey()
				lastToken = i + 1
			}
		}

		// Only spaces and tabs keep the "directly after a newline" state alive.
		if afterNewLine && c != '\n' && c != ' ' && c != '\t' {
			afterNewLine = false
		}
	}
	return i, template
}
// ParseUnorderedList parses consecutive list lines that start with
// startCharacter ('*' or '#' when called from ParseTemplate).
//
// text is the full input and index the position of the first marker. indent
// is the number of markers that identifies this list's own level. depth is
// only passed on, incremented, to nested parser calls.
//
// Each line at this level becomes one element of list.Entries: a
// []interface{} holding the line's text runs (string, untrimmed), nested
// templates (*Template), links (*Link) and HTML (*HTML) in order. A line with
// more markers than indent is parsed recursively and appended to Entries as a
// *UnorderedList.
//
// Parsing stops at the first line that does not begin with a marker, or that
// has fewer markers than indent; the returned index is the start of that
// line. At the end of text the item in progress is still added before
// len(text) is returned. The returned list is never nil.
func ParseUnorderedList(text string, index int, depth int, indent int, startCharacter byte) (i int, list *UnorderedList) {
	list = &UnorderedList{}
	lastToken := index
	var currentValue []interface{}

	// addValue appends the text between lastToken and i to the current item,
	// untrimmed, unless it is blank.
	addValue := func() {
		if lastToken >= len(text) || i <= lastToken {
			return
		}
		if raw := text[lastToken:i]; strings.TrimSpace(raw) != "" {
			currentValue = append(currentValue, raw)
		}
	}

	// flushEntry moves the current item, if any, into list.Entries and starts
	// a fresh one.
	flushEntry := func() {
		if len(currentValue) > 0 {
			list.Entries = append(list.Entries, currentValue)
			currentValue = nil
		}
	}

	afterNewLine := true  // at the start of a line, no marker seen yet
	processIndent := true // still counting leading markers
	indentation := 0      // markers counted on the current line

	for i = index; i < len(text); i++ {
		c := text[i]
		hasNext := i+1 < len(text)

		switch {
		case c == ' ' || c == '\t':
			// Whitespace after the markers ends marker counting; whitespace
			// at the very start of a line keeps the new-line state.
			if !afterNewLine {
				processIndent = false
			}

		case processIndent && c == startCharacter:
			indentation++
			lastToken = i + 1
			afterNewLine = false

		case afterNewLine: // the line has no marker: the list is over
			flushEntry()
			return lastToken, list

		case indentation > indent: // deeper level: parse it as a sub-list
			flushEntry()
			// Restart at the first marker of this line.
			scanIndex, level := ParseUnorderedList(text, lastToken-indentation, depth+1, indentation, startCharacter)
			if level != nil {
				list.Entries = append(list.Entries, level)
			}
			lastToken = scanIndex
			i = scanIndex - 1
			indentation = 0
			afterNewLine = true
			processIndent = true

		case indentation < indent: // shallower level: hand back to the caller
			flushEntry()
			return lastToken - indentation, list

		case c == '\n':
			addValue()
			flushEntry()
			indentation = 0
			lastToken = i + 1
			afterNewLine = true
			processIndent = true

		case hasNext && ((c == '{' && text[i+1] == '{') || (c == '[' && text[i+1] == '[')):
			addValue()
			scanIndex, tpl := ParseTemplate(text, i+2, depth+1, c)
			if tpl != nil {
				currentValue = append(currentValue, tpl)
			}
			lastToken = scanIndex
			i = scanIndex - 1

		case hasNext && (c == '{' || c == '[') && text[i+1] != '{' && text[i+1] != '[':
			addValue()
			scanIndex, link := ParseLink(text, i+1, depth+1, c)
			if link != nil {
				currentValue = append(currentValue, link)
			}
			lastToken = scanIndex
			i = scanIndex - 1

		case c == '<': // HTML trigger
			addValue()
			scanIndex, html := ParseHTML(text, i, depth+1)
			if html != nil {
				currentValue = append(currentValue, html)
			}
			lastToken = scanIndex
			i = scanIndex - 1

		default:
			processIndent = false
		}
	}

	// The text ended without a trailing newline: keep the item in progress
	// instead of silently dropping the last entry.
	addValue()
	flushEntry()
	return i, list
}
// ParseDescriptionList parses a description list of the form
// ";name:entry" followed by optional ":entry" lines.
//
// text is the full input and index the position just after the leading ';'.
// depth is only passed on, incremented, to nested parser calls.
//
// Everything read before the first ':' or newline goes to list.Name, and
// everything after it to list.Entries. Both receive text runs (string,
// untrimmed), nested templates (*Template), links (*Link) and HTML (*HTML) in
// order.
//
// Parsing stops at the first line that starts with anything other than
// spaces, tabs or ':'; the returned index is the start of that line. At the
// end of text the pending text run is still stored before len(text) is
// returned. The returned list is never nil.
func ParseDescriptionList(text string, index int, depth int) (i int, list *DescriptionList) {
	lastToken := index
	list = &DescriptionList{}
	hasKey := false // true once the name part is complete

	// add files value under Name while the name is still being read and
	// under Entries afterwards.
	add := func(value interface{}) {
		if hasKey {
			list.Entries = append(list.Entries, value)
		} else {
			list.Name = append(list.Name, value)
		}
	}

	// addValue stores the text between lastToken and i, untrimmed, unless it
	// is blank.
	addValue := func() {
		if lastToken >= len(text) || i <= lastToken {
			return
		}
		if raw := text[lastToken:i]; strings.TrimSpace(raw) != "" {
			add(raw)
		}
	}

	afterNewLine := false
	for i = index; i < len(text); i++ {
		c := text[i]
		hasNext := i+1 < len(text)

		switch {
		case c == ' ' || c == '\t':
			// Whitespace neither starts a value nor clears the new-line state.

		case c == ':':
			addValue()
			lastToken = i + 1
			afterNewLine = false
			hasKey = true

		case afterNewLine: // the line does not start with ':': the list is over
			return lastToken, list

		case c == '\n':
			addValue()
			lastToken = i + 1
			afterNewLine = true
			hasKey = true

		case hasNext && ((c == '{' && text[i+1] == '{') || (c == '[' && text[i+1] == '[')):
			addValue()
			scanIndex, tpl := ParseTemplate(text, i+2, depth+1, c)
			if tpl != nil {
				add(tpl)
			}
			lastToken = scanIndex
			i = scanIndex - 1

		case hasNext && (c == '{' || c == '[') && text[i+1] != '{' && text[i+1] != '[':
			addValue()
			scanIndex, link := ParseLink(text, i+1, depth+1, c)
			if link != nil {
				add(link)
			}
			lastToken = scanIndex
			i = scanIndex - 1

		case c == '<': // HTML trigger
			addValue()
			scanIndex, html := ParseHTML(text, i, depth+1)
			if html != nil {
				add(html)
			}
			lastToken = scanIndex
			i = scanIndex - 1
		}
	}

	// The text ended without a trailing newline: store the pending text run
	// instead of silently dropping it.
	addValue()
	return i, list
}