commit a0cd37f62668c8518f9713e0887e41e2e90e7be8 Author: WeebDataHoarder <57538841+WeebDataHoarder@users.noreply.github.com> Date: Sun Feb 20 20:19:16 2022 +0100 Initial version diff --git a/.gitignore b/.gitignore new file mode 100644 index 0000000..757fee3 --- /dev/null +++ b/.gitignore @@ -0,0 +1 @@ +/.idea \ No newline at end of file diff --git a/LICENSE b/LICENSE new file mode 100644 index 0000000..6f14385 --- /dev/null +++ b/LICENSE @@ -0,0 +1,9 @@ +Copyright (c) 2022 wikitext-parser Contributors All rights reserved. + +Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are met: + +1. Redistributions of source code must retain the above copyright notice, this list of conditions and the following disclaimer. + +2. Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following disclaimer in the documentation and/or other materials provided with the distribution. + +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. diff --git a/README.md b/README.md new file mode 100644 index 0000000..a32fb7e --- /dev/null +++ b/README.md @@ -0,0 +1,7 @@ +# wikitext-parser + +Utilities to parse and handle wikitext template-based pages. + +Might not be useful for recursive matching. + +See other projects using this for usage examples. \ No newline at end of file diff --git a/go.mod b/go.mod new file mode 100644 index 0000000..5ca6e64 --- /dev/null +++ b/go.mod @@ -0,0 +1,5 @@ +module git.gammaspectra.live/S.O.N.G/wikitext-parser + +go 1.18 + +require golang.org/x/text v0.3.7 diff --git a/go.sum b/go.sum new file mode 100644 index 0000000..1f78e03 --- /dev/null +++ b/go.sum @@ -0,0 +1,2 @@ +golang.org/x/text v0.3.7 h1:olpwvP2KacW1ZWvsR7uQhoyTYvKAupfQrRGBFM352Gk= +golang.org/x/text v0.3.7/go.mod h1:u+2+/6zg+i71rQMx5EYifcz6MCKuco9NR6JIITiCfzQ= diff --git a/html.go b/html.go new file mode 100644 index 0000000..4c8bee1 --- /dev/null +++ b/html.go @@ -0,0 +1,118 @@ +package wikitext_parser + +import ( + "golang.org/x/text/unicode/norm" + "unicode" +) + +type HTML struct { + Tag *HTMLTag +} + +type HTMLTag struct { + Parent *HTMLTag + Name []byte + Parameters []byte + Content []*HTMLTag +} + +func (t *HTMLTag) String() (r string) { + if string(t.Name) == "#text" { + r = string(t.Parameters) + } else { + for _, c := range t.Content { + r += c.String() + } + } + + if string(t.Name) == "del" { //add strikethrough + var runeList []rune + for _, runeEntry := range []rune(norm.NFD.String(r)) { + if runeEntry <= unicode.MaxASCII { + runeList = append(runeList, '\u0336') //combining long stroke overlay + } + runeList = append(runeList, runeEntry) + } + r = norm.NFC.String(string(runeList)) + } else if string(t.Name) == "ref" { //remove references + return "" + } else if string(t.Name) == "br" { //new line + return "\n" + } else if string(t.Name) == "script" { + return "" + } + return +} + +func ParseHTML(text string, index int, depth int) (i int, html *HTML) { + var c byte + + html = &HTML{} + + readingTag := false + readingParameters := false + isTerminating := false + var tag *HTMLTag + tagDepth := 0 + + for i = index; i < len(text); i++ { + c = text[i] + + if c == '<' && i < len(text)-1 && text[i+1] == '/' { + isTerminating = true + readingTag = true + readingParameters = false + } else if c == '<' { + newTag := &HTMLTag{ + Parent: tag, + } + + if tag != nil { + tag.Content = append(tag.Content, newTag) + } + tag = newTag + readingTag = true + readingParameters = false + isTerminating = false + if tagDepth == 0 && html.Tag == nil { + html.Tag = tag + } + tagDepth++ + } else if readingTag && c == '>' { + readingTag = false + readingParameters = false + + if isTerminating { + tagDepth-- + if tag != nil { + tag = tag.Parent + } + isTerminating = false + } + if tagDepth == 0 { + return i + 1, html + } + } else if readingTag && c == '/' { + isTerminating = true + } else if !isTerminating && readingTag { + if c == ' ' { + readingParameters = true + } + if readingParameters { + tag.Parameters = append(tag.Parameters, c) + } else { + tag.Name = append(tag.Name, c) + } + } else if !isTerminating && tagDepth > 0 { + if len(tag.Content) == 0 || string(tag.Content[len(tag.Content)-1].Name) != "#text" { + tag.Content = append(tag.Content, &HTMLTag{ + Parent: tag, + Name: []byte("#text"), + }) + } + tag.Content[len(tag.Content)-1].Parameters = append(tag.Content[len(tag.Content)-1].Parameters, c) + } + } + + return +} diff --git a/link.go b/link.go new file mode 100644 index 0000000..fd3be92 --- /dev/null +++ b/link.go @@ -0,0 +1,66 @@ +package wikitext_parser + +import "strings" + +type Link struct { + URL string + IsExternal bool + Name []interface{} +} + +func ParseLink(text string, index int, depth int, startCharacter byte) (i int, link *Link) { + + var c byte + lastToken := index + + addValue := func() int { + if lastToken < len(text) && i-lastToken > 0 { + t := strings.TrimSpace(text[lastToken:i]) + if len(t) > 0 { + if link == nil { + link = &Link{URL: t, IsExternal: startCharacter == '['} + } else { + link.Name = append(link.Name, t) + } + } + + return len(t) + } + + return 0 + } + + for i = index; i < len(text); i++ { + c = text[i] + + if c == ' ' || c == '\t' && link == nil { + addValue() + lastToken = i + 1 + } else if startCharacter == '{' && c == '}' { + addValue() + i += 1 + break + } else if startCharacter == '[' && c == ']' { //end of link + addValue() + i += 1 + break + //template or light might have parameters + } else if (c == '{' && i < len(text)-1 && text[i+1] == '{') || (c == '[' && i < len(text)-1 && text[i+1] == '[') { + addValue() + var tpl *Template + var scanIndex int + scanIndex, tpl = ParseTemplate(text, i+2, depth+1, c) + if tpl != nil { + if link == nil { + link = &Link{} + } + + link.Name = append(link.Name, tpl) + } + lastToken = scanIndex + i = scanIndex - 1 + } + } + + return +} diff --git a/list.go b/list.go new file mode 100644 index 0000000..21449c4 --- /dev/null +++ b/list.go @@ -0,0 +1,219 @@ +package wikitext_parser + +import "strings" + +type DescriptionList struct { + Name []interface{} + Entries []interface{} +} + +type UnorderedList struct { + Entries []interface{} +} + +func ParseUnorderedList(text string, index int, depth int, indent int, startCharacter byte) (i int, list *UnorderedList) { + + list = &UnorderedList{} + var c byte + lastToken := index + + var currentValue []interface{} + + addValue := func() int { + if lastToken < len(text) && i-lastToken > 0 { + t := strings.TrimSpace(text[lastToken:i]) + if len(t) > 0 { + currentValue = append(currentValue, text[lastToken:i]) + } + + return len(t) + } + + return 0 + } + + afterNewLine := true + processIndent := true + + indentation := 0 + + for i = index; i < len(text); i++ { + c = text[i] + + if c == ' ' || c == '\t' { + //keep the check for new line + if !afterNewLine { + processIndent = false + } + } else if processIndent && c == startCharacter { + indentation++ + lastToken = i + 1 + afterNewLine = false + } else if afterNewLine { //no new list values + if len(currentValue) > 0 { + list.Entries = append(list.Entries, currentValue) + currentValue = []interface{}{} + } + return lastToken, list + } else if indentation > indent { + if len(currentValue) > 0 { + list.Entries = append(list.Entries, currentValue) + currentValue = []interface{}{} + } + var level *UnorderedList + var scanIndex int + scanIndex, level = ParseUnorderedList(text, lastToken-indentation, depth+1, indentation, startCharacter) + if level != nil { + list.Entries = append(list.Entries, level) + } + lastToken = scanIndex + i = scanIndex - 1 + indentation = 0 + afterNewLine = true + processIndent = true + } else if indentation < indent { + if len(currentValue) > 0 { + list.Entries = append(list.Entries, currentValue) + currentValue = []interface{}{} + } + return lastToken - indentation, list + } else if c == '\n' { + addValue() + if len(currentValue) > 0 { + list.Entries = append(list.Entries, currentValue) + currentValue = []interface{}{} + } + indentation = 0 + lastToken = i + 1 + afterNewLine = true + processIndent = true + } else if (c == '{' && i < len(text)-1 && text[i+1] == '{') || (c == '[' && i < len(text)-1 && text[i+1] == '[') { + addValue() + var tpl *Template + var scanIndex int + scanIndex, tpl = ParseTemplate(text, i+2, depth+1, c) + if tpl != nil { + currentValue = append(currentValue, tpl) + } + lastToken = scanIndex + i = scanIndex - 1 + } else if (c == '{' && i < len(text)-1 && text[i+1] != '{' && text[i+1] != '[') || (c == '[' && i < len(text)-1 && text[i+1] != '[' && text[i+1] != '{') { + addValue() + var link *Link + var scanIndex int + scanIndex, link = ParseLink(text, i+1, depth+1, c) + if link != nil { + currentValue = append(currentValue, link) + } + lastToken = scanIndex + i = scanIndex - 1 + } else if c == '<' { //html trigger + addValue() + var html *HTML + var scanIndex int + scanIndex, html = ParseHTML(text, i, depth+1) + if html != nil { + currentValue = append(currentValue, html) + } + lastToken = scanIndex + i = scanIndex - 1 + } else { + processIndent = false + } + } + + return +} + +func ParseDescriptionList(text string, index int, depth int) (i int, list *DescriptionList) { + + var c byte + lastToken := index + + list = &DescriptionList{} + + hasKey := false + + addValue := func() int { + if lastToken < len(text) && i-lastToken > 0 { + t := strings.TrimSpace(text[lastToken:i]) + if len(t) > 0 { + if !hasKey { + list.Name = append(list.Name, text[lastToken:i]) + } else { + list.Entries = append(list.Entries, text[lastToken:i]) + } + } + + return len(t) + } + + return 0 + } + + afterNewLine := false + + for i = index; i < len(text); i++ { + c = text[i] + + if c == ' ' || c == '\t' { + //keep the check for new line + } else if c == ':' { + addValue() + lastToken = i + 1 + afterNewLine = false + hasKey = true + } else if afterNewLine { //no new list values + return lastToken, list + } else if c == '\n' { + addValue() + lastToken = i + 1 + afterNewLine = true + hasKey = true + } else if (c == '{' && i < len(text)-1 && text[i+1] == '{') || (c == '[' && i < len(text)-1 && text[i+1] == '[') { + addValue() + var tpl *Template + var scanIndex int + scanIndex, tpl = ParseTemplate(text, i+2, depth+1, c) + if tpl != nil { + if !hasKey { + list.Name = append(list.Name, tpl) + } else { + list.Entries = append(list.Entries, tpl) + } + } + lastToken = scanIndex + i = scanIndex - 1 + } else if (c == '{' && i < len(text)-1 && text[i+1] != '{' && text[i+1] != '[') || (c == '[' && i < len(text)-1 && text[i+1] != '[' && text[i+1] != '{') { + addValue() + var link *Link + var scanIndex int + scanIndex, link = ParseLink(text, i+1, depth+1, c) + if link != nil { + if !hasKey { + list.Name = append(list.Name, link) + } else { + list.Entries = append(list.Entries, link) + } + } + lastToken = scanIndex + i = scanIndex - 1 + } else if c == '<' { //html trigger + addValue() + var html *HTML + var scanIndex int + scanIndex, html = ParseHTML(text, i, depth+1) + if html != nil { + if !hasKey { + list.Name = append(list.Name, html) + } else { + list.Entries = append(list.Entries, html) + } + } + lastToken = scanIndex + i = scanIndex - 1 + } + } + + return +} diff --git a/template.go b/template.go new file mode 100644 index 0000000..5773359 --- /dev/null +++ b/template.go @@ -0,0 +1,190 @@ +package wikitext_parser + +import ( + "fmt" + "strings" +) + +type Template struct { + Name string + IsLink bool + Parameters map[string][]interface{} + UnkeyedIndex int +} + +func NewTemplate(name string, isLink bool) *Template { + return &Template{ + Name: name, + IsLink: isLink, + Parameters: make(map[string][]interface{}), + } +} + +func (t *Template) AddParameterUnkeyed(value interface{}) { + t.AddParameter(fmt.Sprintf("%d", t.UnkeyedIndex), value) +} + +func (t *Template) AddParameter(key string, value interface{}) { + if _, ok := t.Parameters[key]; !ok { + t.Parameters[key] = make([]interface{}, 0, 1) + } + t.Parameters[key] = append(t.Parameters[key], value) +} + +func ParseTemplate(text string, index int, depth int, startCharacter byte) (i int, template *Template) { + + var c byte + lastToken := index + + var key string + + addValue := func() int { + if lastToken < len(text) && i-lastToken > 0 { + t := strings.TrimSpace(text[lastToken:i]) + if len(t) > 0 { + if template == nil { + template = NewTemplate(t, startCharacter == '[') + } else { + if key == "" { + template.AddParameterUnkeyed(text[lastToken:i]) + } else { + template.AddParameter(key, text[lastToken:i]) + } + } + } + + return len(t) + } + + return 0 + } + addKey := func() { + if lastToken < len(text) && i-lastToken > 0 { + t := strings.TrimSpace(text[lastToken:i]) + if len(t) > 0 { + key = t + } + } + } + + afterNewLine := false + + for i = index; i < len(text); i++ { + c = text[i] + + if startCharacter == '{' && c == '}' && i < len(text)-1 && text[i+1] == '}' { //end of template + addValue() + i += 2 + break + } else if startCharacter == '[' && c == ']' && i < len(text)-1 && text[i+1] == ']' { //end of link + addValue() + i += 2 + break + //template or light might have parameters + } else if (c == '{' && i < len(text)-1 && text[i+1] == '{') || (c == '[' && i < len(text)-1 && text[i+1] == '[') { + addValue() + var tpl *Template + var scanIndex int + scanIndex, tpl = ParseTemplate(text, i+2, depth+1, c) + if tpl != nil { + if key == "" { + template.AddParameterUnkeyed(tpl) + } else { + template.AddParameter(key, tpl) + } + } + lastToken = scanIndex + i = scanIndex - 1 + } else if (c == '{' && i < len(text)-1 && text[i+1] != '{' && text[i+1] != '[') || (c == '[' && i < len(text)-1 && text[i+1] != '[' && text[i+1] != '{') { + addValue() + var link *Link + var scanIndex int + scanIndex, link = ParseLink(text, i+1, depth+1, c) + if link != nil && template != nil { + if key == "" { + template.AddParameterUnkeyed(link) + } else { + template.AddParameter(key, link) + } + } + lastToken = scanIndex + i = scanIndex - 1 + } else if c == '<' { //html trigger + addValue() + var html *HTML + var scanIndex int + scanIndex, html = ParseHTML(text, i, depth+1) + if html != nil && template != nil { + if key == "" { + template.AddParameterUnkeyed(html) + } else { + template.AddParameter(key, html) + } + } + lastToken = scanIndex + i = scanIndex - 1 + } else if c == '|' { + hasTemplate := template != nil + addValue() + lastToken = i + 1 + if hasTemplate { + template.UnkeyedIndex++ + } + key = "" + } else if c == '\n' { + addValue() + lastToken = i + 1 + afterNewLine = true + + if template != nil { + if key == "" { + template.AddParameterUnkeyed(NewLineToken{}) + } else { + template.AddParameter(key, NewLineToken{}) + } + } + } else if afterNewLine && (c == '*' || c == '#') { + addValue() + var list *UnorderedList + var scanIndex int + scanIndex, list = ParseUnorderedList(text, i, depth+1, 1, c) + if list != nil { + if key == "" { + template.AddParameterUnkeyed(list) + } else { + template.AddParameter(key, list) + } + } + lastToken = scanIndex + i = scanIndex - 1 + } else if c == ';' { + addValue() + var list *DescriptionList + var scanIndex int + scanIndex, list = ParseDescriptionList(text, i+1, depth+1) + if list != nil { + if key == "" { + template.AddParameterUnkeyed(list) + } else { + template.AddParameter(key, list) + } + } + lastToken = scanIndex + i = scanIndex - 1 + } else if afterNewLine && c == ':' { + addValue() + lastToken = i + 1 + } else if c == '=' { + if key == "" { + addKey() + lastToken = i + 1 + } + } + + if afterNewLine && c != '\n' && c != ' ' && c != '\t' { + afterNewLine = false + } + } + + return +} diff --git a/wikitext.go b/wikitext.go new file mode 100644 index 0000000..94e396c --- /dev/null +++ b/wikitext.go @@ -0,0 +1,156 @@ +package wikitext_parser + +import ( + "strings" +) + +func NormalizeWikiTitle(title string) string { + return strings.Replace(title, " ", "_", -1) +} + +type NewLineToken struct { +} + +type GetInterfaceSliceStringValueOptions struct { + PageName string + Trim bool + StringHandler func(value string, opt *GetInterfaceSliceStringValueOptions) []string + HTMLHandler func(value *HTML, opt *GetInterfaceSliceStringValueOptions) []string + LinkHandler func(value *Link, opt *GetInterfaceSliceStringValueOptions) []string + TemplateLinkHandler func(value *Template, opt *GetInterfaceSliceStringValueOptions) []string + TemplateHandler func(value *Template, opt *GetInterfaceSliceStringValueOptions) []string + UnorderedListHandler func(value *UnorderedList, opt *GetInterfaceSliceStringValueOptions) []string + DescriptionListHandler func(value *DescriptionList, opt *GetInterfaceSliceStringValueOptions) []string + NewLineHandler func(opt *GetInterfaceSliceStringValueOptions) []string +} + +func (o *GetInterfaceSliceStringValueOptions) Default() { + o.Trim = true + o.StringHandler = func(value string, opt *GetInterfaceSliceStringValueOptions) []string { + return []string{value} + } + o.HTMLHandler = func(value *HTML, opt *GetInterfaceSliceStringValueOptions) []string { + return []string{value.Tag.String()} + } + o.NewLineHandler = func(opt *GetInterfaceSliceStringValueOptions) []string { + return []string{"\n"} + } + o.LinkHandler = func(value *Link, opt *GetInterfaceSliceStringValueOptions) (result []string) { + if len(value.Name) > 0 { + result = append(result, GetWikiStringValue(value.Name, opt)...) + } else { + result = append(result, value.URL) + } + result = append(result, GetWikiStringValue(value.Name, opt)...) + return + } + o.TemplateLinkHandler = func(value *Template, opt *GetInterfaceSliceStringValueOptions) (result []string) { + output := 0 + for _, vv := range value.Parameters { + for _, vvv := range GetWikiStringValue(vv, opt) { + vvv = strings.TrimSpace(vvv) + if len(vvv) > 0 { + output++ + result = append(result, vvv) + } + } + } + if output == 0 { + result = append(result, value.Name) + } + + return + } + o.TemplateHandler = func(value *Template, opt *GetInterfaceSliceStringValueOptions) (result []string) { + switch strings.ToUpper(value.Name) { + case "PAGENAME", "SUBPAGENAME": + result = append(result, opt.PageName) + default: + result = append(result, value.Name) + } + + return + } + o.UnorderedListHandler = func(value *UnorderedList, opt *GetInterfaceSliceStringValueOptions) []string { + return GetWikiStringValue(value.Entries, opt) + } + o.DescriptionListHandler = func(value *DescriptionList, opt *GetInterfaceSliceStringValueOptions) []string { + return []string{strings.Join(GetWikiStringValue(value.Name, opt), ", ") + ": " + strings.Join(GetWikiStringValue(value.Entries, opt), ", ")} + } +} + +func GetWikiStringValue(v []interface{}, opts *GetInterfaceSliceStringValueOptions) (r []string) { + var result []string + for _, value := range v { + + if text, ok := value.(string); ok { + result = append(result, opts.StringHandler(text, opts)...) + } else if template, ok := value.(*Template); ok { + if template.IsLink { + result = append(result, opts.TemplateLinkHandler(template, opts)...) + } else { + result = append(result, opts.TemplateHandler(template, opts)...) + } + } else if html, ok := value.(*HTML); ok && html.Tag != nil { + result = append(result, opts.HTMLHandler(html, opts)...) + } else if _, ok := value.(NewLineToken); ok { + result = append(result, opts.NewLineHandler(opts)...) + } else if link, ok := value.(*Link); ok { + result = append(result, opts.LinkHandler(link, opts)...) + } else if unorderedList, ok := value.(*UnorderedList); ok { + result = append(result, opts.UnorderedListHandler(unorderedList, opts)...) + } else if descriptionList, ok := value.(*DescriptionList); ok { + result = append(result, opts.DescriptionListHandler(descriptionList, opts)...) + } + } + + r = make([]string, 0, len(result)) + + for _, e := range result { + if opts.Trim { + e = strings.TrimSpace(e) + } + if len(e) > 0 { + r = append(r, e) + } + } + return +} + +//ParseWikiText small WikiText parser that extracts text, Templates, and its arguments/parameters +func ParseWikiText(text string) (result []interface{}) { + index := 0 + + for index < len(text) { + templateIndex := strings.Index(text[index:], "{{") + linkIndex := strings.Index(text[index:], "[[") + if templateIndex == -1 && linkIndex == -1 { + t := strings.TrimSpace(text[index:]) + if len(t) > 0 { + result = append(result, text[index:]) + } + break + } else { + bestIndex := templateIndex + if templateIndex == -1 { + bestIndex = linkIndex + } else { + if linkIndex != -1 && linkIndex < bestIndex { + bestIndex = linkIndex + } + } + + t := strings.TrimSpace(text[index : index+bestIndex]) + if len(t) > 0 { + result = append(result, text[index:index+bestIndex]) + } + var tpl *Template + index, tpl = ParseTemplate(text, index+bestIndex+2, 0, text[index+bestIndex]) + if tpl != nil { + result = append(result, tpl) + } + } + } + + return +}