Initial version

This commit is contained in:
DataHoarder 2022-02-20 20:19:16 +01:00
commit a0cd37f626
10 changed files with 773 additions and 0 deletions

1
.gitignore vendored Normal file
View file

@ -0,0 +1 @@
/.idea

9
LICENSE Normal file
View file

@ -0,0 +1,9 @@
Copyright (c) 2022 wikitext-parser Contributors All rights reserved.
Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are met:
1. Redistributions of source code must retain the above copyright notice, this list of conditions and the following disclaimer.
2. Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following disclaimer in the documentation and/or other materials provided with the distribution.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.

7
README.md Normal file
View file

@ -0,0 +1,7 @@
# wikitext-parser
Utilities to parse and handle wikitext template-based pages.
Might not be useful for recursive matching.
See other projects using this for usage examples.

5
go.mod Normal file
View file

@ -0,0 +1,5 @@
module git.gammaspectra.live/S.O.N.G/wikitext-parser
go 1.18
require golang.org/x/text v0.3.7

2
go.sum Normal file
View file

@ -0,0 +1,2 @@
golang.org/x/text v0.3.7 h1:olpwvP2KacW1ZWvsR7uQhoyTYvKAupfQrRGBFM352Gk=
golang.org/x/text v0.3.7/go.mod h1:u+2+/6zg+i71rQMx5EYifcz6MCKuco9NR6JIITiCfzQ=

118
html.go Normal file
View file

@ -0,0 +1,118 @@
package wikitext_parser
import (
"golang.org/x/text/unicode/norm"
"unicode"
)
// HTML is the result returned by ParseHTML: a thin wrapper around the
// outermost tag of the parsed fragment.
type HTML struct {
// Tag is the first tag ParseHTML opened while no other tag was open.
// It stays nil when ParseHTML never recorded such a tag.
Tag *HTMLTag
}
// HTMLTag is one node of the tree built by ParseHTML.
// Text found between tags is stored as a child node whose Name is "#text"
// and whose Parameters hold the raw text bytes.
type HTMLTag struct {
// Parent is the tag that was open when this one was created (nil if none).
Parent *HTMLTag
// Name is the tag name: the bytes read after '<' up to the first space,
// or "#text" for text nodes.
Name []byte
// Parameters holds the raw bytes following the name, starting at the first
// space; for "#text" nodes it holds the text itself.
Parameters []byte
// Content lists child tags and text nodes in the order they were read.
Content []*HTMLTag
}
// String renders the tag and its descendants as plain text.
//
//   - "#text" nodes yield their raw text (stored in Parameters).
//   - "ref" and "script" yield the empty string.
//   - "br" yields a single newline.
//   - "del" yields its children's text with U+0336 (combining long stroke
//     overlay) after every ASCII code point, giving a plain-text strikethrough.
//   - any other tag yields the concatenation of its children's text.
func (t *HTMLTag) String() (r string) {
	name := string(t.Name)

	// These tags have a fixed rendering, so their children need not be walked.
	switch name {
	case "#text":
		return string(t.Parameters)
	case "ref", "script":
		return ""
	case "br":
		return "\n"
	}

	for _, child := range t.Content {
		r += child.String()
	}
	if name != "del" {
		return r
	}

	// NFD splits precomposed letters into base + marks, so an accented Latin
	// letter still receives an overlay on its ASCII base; NFC recomposes what
	// it can afterwards.
	decomposed := []rune(norm.NFD.String(r))
	struck := make([]rune, 0, 2*len(decomposed))
	for _, ch := range decomposed {
		struck = append(struck, ch)
		if ch <= unicode.MaxASCII {
			// A combining mark applies to the character that precedes it, so
			// the overlay must follow its base rather than lead it.
			struck = append(struck, '\u0336')
		}
	}
	return norm.NFC.String(string(struck))
}
// ParseHTML reads an HTML-like fragment from text, starting at index (expected
// to be the position of a '<'), and builds a tree of HTMLTag nodes.
//
// The scan keeps a count of open tags. It stops right after the '>' that
// brings that count to zero and returns the offset following it. If that never
// happens, it returns len(text). The returned html is never nil, but html.Tag
// may be.
//
// The parser is deliberately simple: a '/' anywhere inside a tag marks the tag
// as closing, and the name of a closing tag is not compared with the tag it
// closes. depth is accepted for symmetry with the other parsers and is not
// used.
func ParseHTML(text string, index int, depth int) (i int, html *HTML) {
	html = &HTML{}

	var (
		current  *HTMLTag // innermost tag still open
		inTag    bool     // between '<' and '>'
		inParams bool     // past the first space of the current tag
		closing  bool     // current tag is "</x>" or self-closing
		open     int      // number of tags currently open
	)

	for i = index; i < len(text); i++ {
		c := text[i]
		switch {
		case c == '<' && i < len(text)-1 && text[i+1] == '/':
			// Start of a closing tag; its name is skipped below.
			closing = true
			inTag = true
			inParams = false

		case c == '<':
			child := &HTMLTag{Parent: current}
			if current != nil {
				current.Content = append(current.Content, child)
			}
			current = child
			inTag = true
			inParams = false
			closing = false
			if open == 0 && html.Tag == nil {
				html.Tag = current
			}
			open++

		case inTag && c == '>':
			inTag = false
			inParams = false
			if closing {
				open--
				if current != nil {
					current = current.Parent
				}
				closing = false
			}
			if open == 0 {
				return i + 1, html
			}

		case inTag && c == '/':
			closing = true

		case !closing && inTag:
			// The first space switches from the tag name to its parameters;
			// that space itself is stored with the parameters.
			if c == ' ' {
				inParams = true
			}
			if inParams {
				current.Parameters = append(current.Parameters, c)
			} else {
				current.Name = append(current.Name, c)
			}

		case !closing && open > 0:
			// Plain text: extend the trailing "#text" child, creating it first
			// if the last child is not a text node.
			last := len(current.Content) - 1
			if last < 0 || string(current.Content[last].Name) != "#text" {
				current.Content = append(current.Content, &HTMLTag{
					Parent: current,
					Name:   []byte("#text"),
				})
				last = len(current.Content) - 1
			}
			node := current.Content[last]
			node.Parameters = append(node.Parameters, c)
		}
	}
	return i, html
}

66
link.go Normal file
View file

@ -0,0 +1,66 @@
package wikitext_parser
import "strings"
// Link is a single-bracket (or single-brace) link as produced by ParseLink.
type Link struct {
// URL is the first text token found inside the link. It is empty when the
// link content starts with a nested template instead of text.
URL string
// IsExternal is true when ParseLink created the link from text and the
// opening character was '['.
IsExternal bool
// Name holds what follows the URL: trimmed text (string) and nested
// *Template values, in source order.
Name []interface{}
}
// ParseLink parses a single-bracket link such as "[http://example.org label]"
// (or its single-brace counterpart), starting just after the opening character.
//
// text is the full input, index the offset of the first byte after the opening
// character, depth the nesting level handed on (incremented) to nested parsers,
// and startCharacter the opening byte ('[' or '{'), which selects the matching
// closing byte (']' or '}').
//
// The first whitespace-delimited token becomes Link.URL. Everything after it is
// collected in Link.Name: runs of text as trimmed strings, and nested "{{...}}"
// or "[[...]]" constructs as *Template values.
//
// It returns the offset just past the closing character, or len(text) when no
// closing character is found; in that case text still pending is discarded.
// link is nil if nothing was captured.
func ParseLink(text string, index int, depth int, startCharacter byte) (i int, link *Link) {
	var c byte
	lastToken := index

	// addValue stores the pending text [lastToken, i): as the URL if the link
	// does not exist yet, otherwise as part of its name.
	addValue := func() int {
		if lastToken < len(text) && i-lastToken > 0 {
			t := strings.TrimSpace(text[lastToken:i])
			if len(t) > 0 {
				if link == nil {
					link = &Link{URL: t, IsExternal: startCharacter == '['}
				} else {
					link.Name = append(link.Name, t)
				}
			}
			return len(t)
		}
		return 0
	}

	for i = index; i < len(text); i++ {
		c = text[i]
		if (c == ' ' || c == '\t') && link == nil {
			// Whitespace only separates the URL from the label. The guard is
			// parenthesised because && binds tighter than ||: without the
			// parentheses every space split the label into separate words
			// while tabs did not.
			addValue()
			lastToken = i + 1
		} else if startCharacter == '{' && c == '}' {
			addValue()
			i += 1
			break
		} else if startCharacter == '[' && c == ']' { // end of link
			addValue()
			i += 1
			break
		} else if (c == '{' && i < len(text)-1 && text[i+1] == '{') || (c == '[' && i < len(text)-1 && text[i+1] == '[') {
			// A nested template or link may carry parameters of its own.
			addValue()
			scanIndex, tpl := ParseTemplate(text, i+2, depth+1, c)
			if tpl != nil {
				if link == nil {
					link = &Link{}
				}
				link.Name = append(link.Name, tpl)
			}
			lastToken = scanIndex
			i = scanIndex - 1
		}
	}
	return
}

219
list.go Normal file
View file

@ -0,0 +1,219 @@
package wikitext_parser
import "strings"
// DescriptionList is the result of ParseDescriptionList.
// Elements of both slices are raw text (string), *Template, *Link or *HTML.
type DescriptionList struct {
// Name holds everything read before the first ':' or line break.
Name []interface{}
// Entries holds everything read after that point.
Entries []interface{}
}
// UnorderedList is the result of ParseUnorderedList. ParseTemplate uses it for
// lists introduced by either '*' or '#'.
type UnorderedList struct {
// Entries holds one element per list item, in source order. An item is a
// []interface{} of raw text (string), *Template, *Link and *HTML values;
// a more deeply indented run of items is stored as a nested *UnorderedList.
Entries []interface{}
}
// ParseUnorderedList parses consecutive list lines whose marker is
// startCharacter, starting at index (the position of the first marker).
//
// indent is the number of markers expected on each line of this list. A line
// with more markers is handed to a recursive call and stored as a nested
// *UnorderedList. A line with fewer markers, or a line that does not start with
// a marker at all, ends the list.
//
// The returned offset is where the caller should resume: the start of the line
// that ended the list. If the input runs out first, len(text) is returned and
// the item still being read is not added to the list. The returned list is
// never nil. depth is only passed on (incremented) to nested parsers.
func ParseUnorderedList(text string, index int, depth int, indent int, startCharacter byte) (i int, list *UnorderedList) {
	list = &UnorderedList{}
	tokenStart := index
	var item []interface{}

	// appendText adds the pending raw text [tokenStart, i) to the current item
	// unless it is empty or only whitespace.
	appendText := func() {
		if tokenStart >= len(text) || i <= tokenStart {
			return
		}
		if raw := text[tokenStart:i]; len(strings.TrimSpace(raw)) > 0 {
			item = append(item, raw)
		}
	}
	// flushItem moves the current item, if it has any content, into the list.
	flushItem := func() {
		if len(item) > 0 {
			list.Entries = append(list.Entries, item)
			item = nil
		}
	}

	atLineStart := true     // nothing but whitespace seen on this line yet
	countingMarkers := true // still inside the run of leading markers
	markers := 0            // markers counted on the current line

	for i = index; i < len(text); i++ {
		c := text[i]
		switch {
		case c == ' ' || c == '\t':
			// Whitespace before the first marker is ignored; after a marker
			// it ends the marker run.
			if !atLineStart {
				countingMarkers = false
			}

		case countingMarkers && c == startCharacter:
			markers++
			tokenStart = i + 1
			atLineStart = false

		case atLineStart:
			// The line does not start with a marker: the list is over.
			flushItem()
			return tokenStart, list

		case markers > indent:
			// Deeper level: re-parse this line, markers included, as a
			// nested list.
			flushItem()
			next, nested := ParseUnorderedList(text, tokenStart-markers, depth+1, markers, startCharacter)
			if nested != nil {
				list.Entries = append(list.Entries, nested)
			}
			tokenStart = next
			i = next - 1
			markers = 0
			atLineStart = true
			countingMarkers = true

		case markers < indent:
			// Shallower level: give the line, markers included, back to
			// the caller.
			flushItem()
			return tokenStart - markers, list

		case c == '\n':
			appendText()
			flushItem()
			markers = 0
			tokenStart = i + 1
			atLineStart = true
			countingMarkers = true

		case (c == '{' || c == '[') && i < len(text)-1 && text[i+1] == c:
			// "{{" or "[[": nested template or wiki link.
			appendText()
			next, tpl := ParseTemplate(text, i+2, depth+1, c)
			if tpl != nil {
				item = append(item, tpl)
			}
			tokenStart = next
			i = next - 1

		case (c == '{' || c == '[') && i < len(text)-1 && text[i+1] != '{' && text[i+1] != '[':
			// Single '{' or '[': plain link.
			appendText()
			next, link := ParseLink(text, i+1, depth+1, c)
			if link != nil {
				item = append(item, link)
			}
			tokenStart = next
			i = next - 1

		case c == '<':
			// HTML fragment.
			appendText()
			next, html := ParseHTML(text, i, depth+1)
			if html != nil {
				item = append(item, html)
			}
			tokenStart = next
			i = next - 1

		default:
			countingMarkers = false
		}
	}
	return i, list
}
// ParseDescriptionList parses a description list, starting at index (the
// caller passes the position right after the introducing ';').
//
// Content read before the first ':' or line break goes to list.Name; content
// read afterwards goes to list.Entries. Text is stored untrimmed, but
// whitespace-only runs are skipped. Nested "{{...}}" / "[[...]]" constructs,
// single-bracket links and HTML fragments are parsed and stored as *Template,
// *Link and *HTML values.
//
// After a line break, the first character that is neither whitespace nor ':'
// ends the list; the returned offset is the start of that line. If the input
// runs out first, len(text) is returned and text still pending is not stored.
// The returned list is never nil. depth is only passed on (incremented) to
// nested parsers.
func ParseDescriptionList(text string, index int, depth int) (i int, list *DescriptionList) {
	list = &DescriptionList{}
	tokenStart := index
	seenSeparator := false // a ':' or line break has been read
	lineBroken := false    // the previous significant character was '\n'

	// store files v under Name or Entries depending on the current state.
	store := func(v interface{}) {
		if seenSeparator {
			list.Entries = append(list.Entries, v)
		} else {
			list.Name = append(list.Name, v)
		}
	}
	// storeText files the pending raw text [tokenStart, i) unless it is empty
	// or only whitespace.
	storeText := func() {
		if tokenStart >= len(text) || i <= tokenStart {
			return
		}
		if raw := text[tokenStart:i]; len(strings.TrimSpace(raw)) > 0 {
			store(raw)
		}
	}

	for i = index; i < len(text); i++ {
		c := text[i]
		switch {
		case c == ' ' || c == '\t':
			// Whitespace never changes state, so a ':' may be indented.

		case c == ':':
			storeText()
			tokenStart = i + 1
			lineBroken = false
			seenSeparator = true

		case lineBroken:
			// The new line does not continue the list.
			return tokenStart, list

		case c == '\n':
			storeText()
			tokenStart = i + 1
			lineBroken = true
			seenSeparator = true

		case (c == '{' || c == '[') && i < len(text)-1 && text[i+1] == c:
			// "{{" or "[[": nested template or wiki link.
			storeText()
			next, tpl := ParseTemplate(text, i+2, depth+1, c)
			if tpl != nil {
				store(tpl)
			}
			tokenStart = next
			i = next - 1

		case (c == '{' || c == '[') && i < len(text)-1 && text[i+1] != '{' && text[i+1] != '[':
			// Single '{' or '[': plain link.
			storeText()
			next, link := ParseLink(text, i+1, depth+1, c)
			if link != nil {
				store(link)
			}
			tokenStart = next
			i = next - 1

		case c == '<':
			// HTML fragment.
			storeText()
			next, html := ParseHTML(text, i, depth+1)
			if html != nil {
				store(html)
			}
			tokenStart = next
			i = next - 1
		}
	}
	return i, list
}

190
template.go Normal file
View file

@ -0,0 +1,190 @@
package wikitext_parser
import (
"fmt"
"strings"
)
// Template is a parsed "{{...}}" template or "[[...]]" wiki link together with
// its parameters.
type Template struct {
	// Name is the trimmed text found before the first parameter.
	Name string
	// IsLink is true when the construct was opened with "[[" rather than "{{".
	IsLink bool
	// Parameters maps a parameter key to the values collected for it. Values
	// added without an explicit key are stored under the decimal string of
	// UnkeyedIndex.
	Parameters map[string][]interface{}
	// UnkeyedIndex is the positional index used by AddParameterUnkeyed.
	UnkeyedIndex int
}

// NewTemplate returns a Template with the given name and kind and an empty,
// ready-to-use parameter map.
func NewTemplate(name string, isLink bool) *Template {
	return &Template{
		Name:       name,
		IsLink:     isLink,
		Parameters: make(map[string][]interface{}),
	}
}

// AddParameterUnkeyed appends value to the positional parameter identified by
// the current UnkeyedIndex.
func (t *Template) AddParameterUnkeyed(value interface{}) {
	t.AddParameter(fmt.Sprintf("%d", t.UnkeyedIndex), value)
}

// AddParameter appends value to the list of values stored under key.
//
// The parameter map is created on first use, so a Template built as a literal
// or zero value (rather than through NewTemplate) can be filled without
// panicking on a nil map.
func (t *Template) AddParameter(key string, value interface{}) {
	if t.Parameters == nil {
		t.Parameters = make(map[string][]interface{})
	}
	// append copes with the missing-key case: it starts from a nil slice.
	t.Parameters[key] = append(t.Parameters[key], value)
}
// ParseTemplate parses the inside of a "{{...}}" template or "[[...]]" wiki
// link, starting just after the two opening characters.
//
// text is the full input, index the offset of the first byte after the opener,
// depth the nesting level handed on (incremented) to nested parsers, and
// startCharacter the opening byte ('{' or '['), which selects the closer that
// ends the scan ("}}" or "]]").
//
// The first non-blank run of text becomes the template name. After that, '|'
// separates parameters and the first '=' inside a parameter separates its key
// from its value. Values are stored in source order and may be raw text
// (string), NewLineToken, *Template, *Link, *HTML, *UnorderedList or
// *DescriptionList. Structured values met before the name has been read are
// dropped.
//
// It returns the offset just past the closer, or len(text) when no closer is
// found; in that case text still pending is discarded. template is nil if no
// name was found.
func ParseTemplate(text string, index int, depth int, startCharacter byte) (i int, template *Template) {
	var c byte
	lastToken := index
	var key string

	// addParameter stores value under the current key, or positionally when no
	// key is active. It is a no-op while the template has no name yet: every
	// branch below goes through it, so none can dereference a nil template.
	addParameter := func(value interface{}) {
		if template == nil {
			return
		}
		if key == "" {
			template.AddParameterUnkeyed(value)
		} else {
			template.AddParameter(key, value)
		}
	}

	// addValue consumes the pending text [lastToken, i): as the template name
	// if there is none yet, otherwise as a (raw, untrimmed) parameter value.
	addValue := func() int {
		if lastToken < len(text) && i-lastToken > 0 {
			t := strings.TrimSpace(text[lastToken:i])
			if len(t) > 0 {
				if template == nil {
					template = NewTemplate(t, startCharacter == '[')
				} else {
					addParameter(text[lastToken:i])
				}
			}
			return len(t)
		}
		return 0
	}

	// addKey turns the pending text into the active parameter key.
	addKey := func() {
		if lastToken < len(text) && i-lastToken > 0 {
			t := strings.TrimSpace(text[lastToken:i])
			if len(t) > 0 {
				key = t
			}
		}
	}

	afterNewLine := false
	for i = index; i < len(text); i++ {
		c = text[i]
		if startCharacter == '{' && c == '}' && i < len(text)-1 && text[i+1] == '}' { // end of template
			addValue()
			i += 2
			break
		} else if startCharacter == '[' && c == ']' && i < len(text)-1 && text[i+1] == ']' { // end of link
			addValue()
			i += 2
			break
		} else if (c == '{' && i < len(text)-1 && text[i+1] == '{') || (c == '[' && i < len(text)-1 && text[i+1] == '[') {
			// A nested template or link may carry parameters of its own.
			addValue()
			scanIndex, tpl := ParseTemplate(text, i+2, depth+1, c)
			if tpl != nil {
				addParameter(tpl)
			}
			lastToken = scanIndex
			i = scanIndex - 1
		} else if (c == '{' && i < len(text)-1 && text[i+1] != '{' && text[i+1] != '[') || (c == '[' && i < len(text)-1 && text[i+1] != '[' && text[i+1] != '{') {
			// Single '{' or '[': plain link.
			addValue()
			scanIndex, link := ParseLink(text, i+1, depth+1, c)
			if link != nil {
				addParameter(link)
			}
			lastToken = scanIndex
			i = scanIndex - 1
		} else if c == '<' { // html trigger
			addValue()
			scanIndex, html := ParseHTML(text, i, depth+1)
			if html != nil {
				addParameter(html)
			}
			lastToken = scanIndex
			i = scanIndex - 1
		} else if c == '|' {
			// The separator that ends the name does not advance the
			// positional index; every later one does.
			hasTemplate := template != nil
			addValue()
			lastToken = i + 1
			if hasTemplate {
				template.UnkeyedIndex++
			}
			key = ""
		} else if c == '\n' {
			addValue()
			lastToken = i + 1
			afterNewLine = true
			addParameter(NewLineToken{})
		} else if afterNewLine && (c == '*' || c == '#') {
			// A list marker at the start of a line.
			addValue()
			scanIndex, list := ParseUnorderedList(text, i, depth+1, 1, c)
			if list != nil {
				addParameter(list)
			}
			lastToken = scanIndex
			i = scanIndex - 1
		} else if c == ';' {
			addValue()
			scanIndex, list := ParseDescriptionList(text, i+1, depth+1)
			if list != nil {
				addParameter(list)
			}
			lastToken = scanIndex
			i = scanIndex - 1
		} else if afterNewLine && c == ':' {
			addValue()
			lastToken = i + 1
		} else if c == '=' {
			// Only the first '=' of a parameter separates key from value.
			if key == "" {
				addKey()
				lastToken = i + 1
			}
		}
		// Any character other than a line break, space or tab means we are no
		// longer at the start of a line.
		if afterNewLine && c != '\n' && c != ' ' && c != '\t' {
			afterNewLine = false
		}
	}
	return
}

156
wikitext.go Normal file
View file

@ -0,0 +1,156 @@
package wikitext_parser
import (
"strings"
)
// NormalizeWikiTitle returns title with every space replaced by an underscore.
func NormalizeWikiTitle(title string) string {
	return strings.ReplaceAll(title, " ", "_")
}
// NewLineToken is a marker value that ParseTemplate stores among a template's
// parameter values wherever the source text contained a line break.
type NewLineToken struct{}
// GetInterfaceSliceStringValueOptions controls how GetWikiStringValue turns
// parsed values into strings. Each handler receives the value to render and
// the options in use, and returns zero or more strings. GetWikiStringValue
// calls the handlers without nil checks; Default installs an implementation
// for every one of them.
type GetInterfaceSliceStringValueOptions struct {
// PageName is what the default TemplateHandler returns for the PAGENAME and
// SUBPAGENAME templates.
PageName string
// Trim makes GetWikiStringValue strip surrounding whitespace from every
// returned string. Empty strings are dropped either way.
Trim bool
// StringHandler renders raw text.
StringHandler func(value string, opt *GetInterfaceSliceStringValueOptions) []string
// HTMLHandler renders an HTML fragment; it is only called when value.Tag is
// not nil.
HTMLHandler func(value *HTML, opt *GetInterfaceSliceStringValueOptions) []string
// LinkHandler renders a single-bracket link.
LinkHandler func(value *Link, opt *GetInterfaceSliceStringValueOptions) []string
// TemplateLinkHandler renders a *Template whose IsLink is true.
TemplateLinkHandler func(value *Template, opt *GetInterfaceSliceStringValueOptions) []string
// TemplateHandler renders a *Template whose IsLink is false.
TemplateHandler func(value *Template, opt *GetInterfaceSliceStringValueOptions) []string
// UnorderedListHandler renders a list.
UnorderedListHandler func(value *UnorderedList, opt *GetInterfaceSliceStringValueOptions) []string
// DescriptionListHandler renders a description list.
DescriptionListHandler func(value *DescriptionList, opt *GetInterfaceSliceStringValueOptions) []string
// NewLineHandler renders a NewLineToken.
NewLineHandler func(opt *GetInterfaceSliceStringValueOptions) []string
}
// Default enables trimming and installs a plain-text implementation for every
// handler. PageName is left untouched.
//
//   - Text is returned as is; a line break becomes "\n".
//   - HTML is rendered with HTMLTag.String.
//   - A link is rendered as its name, or as its URL when it has no name.
//   - A wiki link ("[[...]]") is rendered as its non-blank parameter values,
//     or as its target when it has none.
//   - A template is rendered as its name, except PAGENAME and SUBPAGENAME
//     (case-insensitive), which are rendered as opt.PageName.
//   - A list is rendered as the rendering of its entries; a description list
//     as "name: entries", each part joined with ", ".
func (o *GetInterfaceSliceStringValueOptions) Default() {
	o.Trim = true

	o.StringHandler = func(value string, opt *GetInterfaceSliceStringValueOptions) []string {
		return []string{value}
	}

	o.HTMLHandler = func(value *HTML, opt *GetInterfaceSliceStringValueOptions) []string {
		return []string{value.Tag.String()}
	}

	o.NewLineHandler = func(opt *GetInterfaceSliceStringValueOptions) []string {
		return []string{"\n"}
	}

	o.LinkHandler = func(value *Link, opt *GetInterfaceSliceStringValueOptions) []string {
		// The name is rendered exactly once; the URL is only a fallback for
		// links that have no name.
		if len(value.Name) > 0 {
			return GetWikiStringValue(value.Name, opt)
		}
		return []string{value.URL}
	}

	o.TemplateLinkHandler = func(value *Template, opt *GetInterfaceSliceStringValueOptions) (result []string) {
		// NOTE(review): Parameters is a map, so when a link has more than one
		// parameter the order of the returned strings is unspecified.
		for _, values := range value.Parameters {
			for _, s := range GetWikiStringValue(values, opt) {
				if s = strings.TrimSpace(s); len(s) > 0 {
					result = append(result, s)
				}
			}
		}
		if len(result) == 0 {
			result = append(result, value.Name)
		}
		return result
	}

	o.TemplateHandler = func(value *Template, opt *GetInterfaceSliceStringValueOptions) []string {
		switch strings.ToUpper(value.Name) {
		case "PAGENAME", "SUBPAGENAME":
			return []string{opt.PageName}
		default:
			return []string{value.Name}
		}
	}

	o.UnorderedListHandler = func(value *UnorderedList, opt *GetInterfaceSliceStringValueOptions) []string {
		return GetWikiStringValue(value.Entries, opt)
	}

	o.DescriptionListHandler = func(value *DescriptionList, opt *GetInterfaceSliceStringValueOptions) []string {
		return []string{strings.Join(GetWikiStringValue(value.Name, opt), ", ") + ": " + strings.Join(GetWikiStringValue(value.Entries, opt), ", ")}
	}
}
// GetWikiStringValue renders a slice of parsed values into strings by
// dispatching each element to the matching handler in opts.
//
// Recognised element types are string, *Template (sent to TemplateLinkHandler
// or TemplateHandler depending on IsLink), *HTML (only when its Tag is set),
// NewLineToken, *Link, *UnorderedList and *DescriptionList. Any other element
// is ignored.
//
// When opts.Trim is set, every produced string is stripped of surrounding
// whitespace. Empty strings are always left out of the result, which is
// non-nil even when empty.
func GetWikiStringValue(v []interface{}, opts *GetInterfaceSliceStringValueOptions) (r []string) {
	var rendered []string
	for _, item := range v {
		switch val := item.(type) {
		case string:
			rendered = append(rendered, opts.StringHandler(val, opts)...)
		case *Template:
			if val.IsLink {
				rendered = append(rendered, opts.TemplateLinkHandler(val, opts)...)
			} else {
				rendered = append(rendered, opts.TemplateHandler(val, opts)...)
			}
		case *HTML:
			// A fragment without a root tag has nothing to render.
			if val.Tag != nil {
				rendered = append(rendered, opts.HTMLHandler(val, opts)...)
			}
		case NewLineToken:
			rendered = append(rendered, opts.NewLineHandler(opts)...)
		case *Link:
			rendered = append(rendered, opts.LinkHandler(val, opts)...)
		case *UnorderedList:
			rendered = append(rendered, opts.UnorderedListHandler(val, opts)...)
		case *DescriptionList:
			rendered = append(rendered, opts.DescriptionListHandler(val, opts)...)
		}
	}

	r = make([]string, 0, len(rendered))
	for _, s := range rendered {
		if opts.Trim {
			s = strings.TrimSpace(s)
		}
		if s != "" {
			r = append(r, s)
		}
	}
	return r
}
// ParseWikiText is a small wikitext parser that splits text into plain text
// and top-level templates ("{{...}}") or wiki links ("[[...]]").
//
// The result holds, in source order, the raw text found between constructs
// (string, untrimmed, with whitespace-only runs skipped) and one *Template per
// construct that ParseTemplate could parse. It is nil when nothing was found.
func ParseWikiText(text string) (result []interface{}) {
	for pos := 0; pos < len(text); {
		rest := text[pos:]

		// Offset within rest of the earliest "{{" or "[[", or -1 if neither
		// occurs.
		open := strings.Index(rest, "{{")
		if link := strings.Index(rest, "[["); link != -1 && (open == -1 || link < open) {
			open = link
		}

		if open == -1 {
			// No construct left: whatever remains is trailing text.
			if strings.TrimSpace(rest) != "" {
				result = append(result, rest)
			}
			break
		}

		if leading := rest[:open]; strings.TrimSpace(leading) != "" {
			result = append(result, leading)
		}

		// Parse from just after the two-character opener; its first byte
		// tells ParseTemplate which closer to look for.
		var tpl *Template
		pos, tpl = ParseTemplate(text, pos+open+2, 0, rest[open])
		if tpl != nil {
			result = append(result, tpl)
		}
	}
	return result
}