This repository has been archived on 2022-02-16. You can view files and clone it, but cannot push or open issues or pull requests.
gowiki/tokenize.go
2021-12-16 23:39:56 +01:00

811 lines
18 KiB
Go

/*
Copyright (C) IBM Corporation 2015, Michele Franceschini <franceschini@us.ibm.com>
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
*/
package gowiki
import (
"errors"
"fmt"
"regexp"
"sort"
"strings"
"unicode"
"unicode/utf8"
)
// Token is a single lexical unit produced by the tokenizer. Which fields are
// meaningful depends on TType (e.g. "text", "link", "html", "space", "quote",
// "colon", "magic", "special", "blank", "newline", "h1".."h6", list markers).
type Token struct {
	TText   string   `json:"tText,omitempty"` // literal text, tag name, URL, or 8-byte special marker
	TType   string   `json:"tType,omitempty"` // token kind (see above)
	TAttr   string   `json:"tAttr,omitempty"` // raw attribute text for "html" tokens / magic word text
	TLink   WikiLink `json:"tLink,omitempty"` // link target for "link" and "filelink" tokens
	TClosed bool     `json:"tClosed,omitempty"` // for "html": tag ended in '/' (self-closing)
	TPipes  []string `json:"tPipes,omitempty"` // for "filelink": pipe-separated options before the caption
}
// Prefixes is a list of string prefixes matched case-insensitively
// (see matchPrefixes).
type Prefixes []string

// ExtLinkPrefixes are the URL schemes recognized as external links.
var ExtLinkPrefixes Prefixes = []string{"https://", "http://", "ftp://", "//"}

// FileLinkPrefixes mark internal links that point at files/media rather
// than ordinary pages.
var FileLinkPrefixes Prefixes = []string{"[[image:", "[[media:", "[[file:"}
// parseRedirectLine tokenizes a "#REDIRECT ..." line: a "redirect" marker
// token followed by the inline tokens of the text after the 9-byte directive.
func (a *Article) parseRedirectLine(l string) ([]*Token, error) {
	// 9 == len("#redirect"); lineType guarantees the prefix is present.
	inline, err := a.parseInlineText(l, 9, len(l))
	if err != nil {
		return nil, err
	}
	return append([]*Token{{TType: "redirect"}}, inline...), nil
}
// parseWikiPreLine tokenizes a space-indented preformatted line: a "wikipre"
// marker token followed by the inline tokens of everything after the leading
// space character.
func (a *Article) parseWikiPreLine(l string) ([]*Token, error) {
	inline, err := a.parseInlineText(l, 1, len(l))
	if err != nil {
		return nil, err
	}
	return append([]*Token{{TType: "wikipre"}}, inline...), nil
}
// parseHRuler tokenizes a "----" horizontal-rule line: an "hrule" marker
// token, plus the inline tokens of any text following the dash run.
func (a *Article) parseHRuler(l string) ([]*Token, error) {
	// Locate the first rune that is not a dash; 0 means the whole line
	// is dashes (the line starts with "----", so index 0 is never text).
	textStart := 0
	for i, r := range l {
		if r != '-' {
			textStart = i
			break
		}
	}
	tokens := []*Token{{TType: "hrule"}}
	if textStart != 0 {
		trailing, err := a.parseInlineText(l, textStart, len(l))
		if err != nil {
			return nil, err
		}
		tokens = append(tokens, trailing...)
	}
	return tokens, nil
}
// parseHeadingLine tokenizes a "=...=" heading line. The number of '='
// characters matched symmetrically at both ends (capped at 6) selects the
// heading level; the text between them is parsed as inline wikitext.
func (a *Article) parseHeadingLine(l string) ([]*Token, error) {
	pf := 0 // front cursor; ends up just before the first content byte
	pl := 0 // byte index of the last '=' in the line
	for i, rv := range l {
		if rv == '=' {
			pl = i
		}
	}
	// Consume '=' alternately from both ends until one side hits a
	// non-'=' byte or the cursors meet.
	for {
		pf++
		if pf == pl || l[pf] != '=' {
			pf--
			break
		}
		pl--
		if pf == pl || l[pl] != '=' {
			pl++
			pf--
			break
		}
	}
	pf++ // pf is now the count of leading '=' consumed == heading level
	if pf > 6 {
		// HTML only supports h1..h6; surplus '=' become heading text.
		diff := pf - 6
		pf -= diff
		pl += diff
	}
	nt := make([]*Token, 0, 2)
	nt = append(nt, &Token{TType: fmt.Sprintf("h%d", pf)})
	// The heading text spans l[pf:pl] (between the '=' runs).
	nnt, err := a.parseInlineText(l, pf, pl)
	if err != nil {
		return nil, err
	}
	nt = append(nt, nnt...)
	return nt, nil
}
// parseListLine tokenizes a list/definition line: one marker token per
// leading ';', ':', '*' or '#', then the inline tokens of the remainder.
func (a *Article) parseListLine(l string) ([]*Token, error) {
	tokens := make([]*Token, 0, 2)
	depth := 0
markerLoop:
	for depth < len(l) {
		switch l[depth] {
		case ';', ':', '*', '#':
			// Each marker becomes its own token whose type is the marker.
			tokens = append(tokens, &Token{TType: string(l[depth])})
			depth++
		default:
			break markerLoop
		}
	}
	if depth < len(l) {
		inline, err := a.parseInlineText(l, depth, len(l))
		if err != nil {
			return nil, err
		}
		tokens = append(tokens, inline...)
	}
	return tokens, nil
}
// parseTableLine is a placeholder for table markup: table lines are
// currently dropped entirely (an empty, non-nil token slice is returned,
// so Tokenize still appends its "newline" token for the line).
func (a *Article) parseTableLine(l string) ([]*Token, error) {
	nt := make([]*Token, 0, 0)
	return nt, nil
}
// isValidHTMLtag reports whether tag is an accepted HTML tag name.
// Currently a stub that accepts everything. Note: parseInlineText keeps the
// raw tag text as plain text when this returns false, so tightening this
// check changes tokenization output.
func isValidHTMLtag(tag string) bool {
	return true
}
// decodeHTMLtag decodes an HTML-like tag at the start of l (the caller
// invokes it with l beginning at '<'). It returns the number of bytes
// consumed (through the closing '>'), the tag name (with a leading '/' for
// closing tags, e.g. "/ref"), the raw attribute text, whether the tag is
// self-closing (last non-space rune before '>' is '/'), and success.
func (a *Article) decodeHTMLtag(l string) (int, string, string, bool, bool) {
	matchingpos := 0       // index of the unquoted '>' ending the tag (0 = none)
	inquote := false       // currently inside a quoted attribute value
	lastbackslash := false // previous rune was '\' (escapes a quote char)
	quote := '#'           // active quote delimiter (' or ") when inquote
	closefound := false    // a '/' was the last non-space rune seen so far
	tagend := 0            // index one past the tag name (0 = not found yet)
	tagstart := 0          // index of the tag-name start; the 0 sentinel works
	// because index 0 holds the '<' itself, leaving tagstart unset until the
	// first non-space rune after it.
dhtLoop:
	for idx, rv := range l {
		// fmt.Println(string(rv), inquote, string(quote), idx, matchingpos)
		switch rv {
		case '>':
			if !inquote {
				matchingpos = idx
				break dhtLoop
			}
		case '\'', '"':
			switch {
			case inquote && quote == rv && !lastbackslash:
				inquote = false
			case !inquote:
				inquote = true
				quote = rv
			}
		case ' ', '\t', '\r':
		case '/':
			closefound = true
		}
		lastbackslash = (rv == '\\')
		// First non-space rune after '<' starts the tag name.
		if !unicode.IsSpace(rv) && tagstart == 0 {
			tagstart = idx
		}
		// Any non-space rune other than '/' cancels a pending self-close,
		// so only a trailing '/' (possibly followed by spaces) counts.
		if rv != '/' && !unicode.IsSpace(rv) {
			closefound = false
		}
		// First space after the tag name ends it; the rest is attributes.
		if unicode.IsSpace(rv) && tagstart != 0 && tagend == 0 {
			tagend = idx
		}
	}
	if matchingpos == 0 || tagstart == 0 {
		// No closing '>' found, or nothing between '<' and '>'.
		return 0, "", "", false, false
	}
	var tag string
	var attr string
	if tagend == 0 {
		// No space inside the tag: everything up to '>' is the name.
		tag = l[tagstart:matchingpos]
		attr = ""
	} else {
		tag = l[tagstart:tagend]
		attr = l[tagend:matchingpos]
	}
	return matchingpos + 1, tag, attr, closefound, true
}
// matchPrefixes reports whether s starts with any of the given prefixes,
// compared case-insensitively.
func matchPrefixes(s string, prefixes []string) bool {
	for _, p := range prefixes {
		if len(s) >= len(p) && strings.EqualFold(s[:len(p)], p) {
			return true
		}
	}
	return false
}
// isExtLink reports whether l starts with a recognized external-link URL
// scheme (ExtLinkPrefixes), matched case-insensitively.
func isExtLink(l string) bool {
	return matchPrefixes(l, ExtLinkPrefixes)
}
// possibleFileLink reports whether l starts with a file/image/media link
// opener (FileLinkPrefixes), matched case-insensitively.
func possibleFileLink(l string) bool {
	return matchPrefixes(l, FileLinkPrefixes)
}
// parseLink dispatches a '['-initiated construct to the right parser:
// "[[file:..." to parseFileLink, other "[[..." to parseInternalLink, and a
// single '[' to parseExternalLink. Returns bytes consumed, tokens, success.
func (a *Article) parseLink(l string) (int, []*Token, bool) {
	switch {
	case len(l) < 5:
		// Shortest possible link ("[[x]]") needs at least 5 bytes.
		return 0, nil, false
	case l[1] != '[':
		return a.parseExternalLink(l)
	case possibleFileLink(l):
		return a.parseFileLink(l)
	default:
		return a.parseInternalLink(l)
	}
}
// parseInternalLink scans l (starting with "[[") for an internal link of the
// form "[[target|label]]trail". It returns the number of bytes consumed,
// the tokens ("link", optional label tokens, "closelink"), and success.
func (a *Article) parseInternalLink(l string) (int, []*Token, bool) {
	// possible internal link
	pipepos := 0     // byte index of the first '|' (0 = no pipe seen)
	closed := false  // true once the second ']' of "]]" has been stepped past
	matchingpos := 0 // byte index of the first ']' of the closing "]]"
	linktrail := 0   // index of the last letter directly after "]]" (0 = none)
	//plLoop:
	for idx, rv := range l {
		if idx < 2 {
			continue // skip the leading "[["
		}
		if matchingpos == 0 {
			// Still inside the brackets.
			switch rv {
			case '\x07': //prevent special tags in internal link
				if pipepos == 0 { //only in the link portion
					return 0, nil, false
				}
			case '[':
				// A nested "[[" (or a '[' immediately after the opener)
				// invalidates the whole link.
				if idx == 2 || len(l) > idx+1 && l[idx+1] == '[' {
					return 0, nil, false
				}
			case ']':
				if len(l) > idx+1 && l[idx+1] == ']' {
					matchingpos = idx
				}
			case '|':
				if pipepos == 0 {
					pipepos = idx
				}
			default:
			}
			continue
		}
		// First iteration after matchingpos: this rune is the second ']'.
		if !closed {
			closed = true
			continue
		}
		// MediaWiki "linktrail": letters directly following "]]" are
		// appended to the link label ("[[car]]s" displays as "cars").
		if unicode.IsLetter(rv) {
			linktrail = idx
			continue
		}
		break
	}
	if !closed {
		return 0, nil, false
	}
	var link WikiLink
	var nt []*Token = nil
	var err error = nil
	if pipepos == 0 {
		// No pipe: the target text doubles as the label (plus any trail).
		innerstring := l[2:matchingpos]
		if linktrail != 0 {
			innerstring += l[matchingpos+2 : linktrail+1]
		}
		link = WikiCanonicalForm(l[2:matchingpos])
		nt = []*Token{&Token{TText: innerstring, TType: "text"}}
	} else {
		// Piped link: label is everything after the first pipe (plus trail),
		// parsed as inline wikitext.
		innerstring := l[pipepos+1 : matchingpos]
		if linktrail != 0 {
			innerstring += l[matchingpos+2 : linktrail+1]
		}
		link = WikiCanonicalForm(l[2:pipepos])
		if pipepos+1 < matchingpos {
			nt, err = a.parseInlineText(innerstring, 0, len(innerstring))
			if err != nil {
				return 0, nil, false
			}
		}
	}
	tokens := make([]*Token, 0, 2)
	tokens = append(tokens, &Token{TLink: link, TType: "link"})
	if nt != nil {
		tokens = append(tokens, nt...)
	}
	tokens = append(tokens, &Token{TType: "closelink"})
	if linktrail != 0 {
		return linktrail + 1, tokens, true
	}
	return matchingpos + 2, tokens, true
}
// parseExternalLink scans l (starting with a single '[') for an external
// link "[url label]". The first space separates the URL from the label; a
// nested "[[...]]" inside the label is tolerated. Returns bytes consumed,
// the tokens ("extlink", optional label tokens, "closeextlink"), and success.
func (a *Article) parseExternalLink(l string) (int, []*Token, bool) {
	// possible external link
	spacepos := 0        // index of the first space (URL/label separator)
	matchingpos := 0     // index where the link terminates
	endpos := 0          // bytes consumed on success
	intLinkOpen := false // inside a nested "[[...]]" within the label
	skipNext := false    // skip the second rune of a "]]" pair
plLoop2:
	for idx, rv := range l {
		if idx < 1 {
			continue // skip the leading '['
		}
		if skipNext {
			skipNext = false
			continue
		}
		switch rv {
		case '\x07':
			// Special markers are not allowed in the URL portion.
			if spacepos == 0 {
				return 0, nil, false
			}
		case '[':
			if len(l) > idx+1 && l[idx+1] == '[' {
				intLinkOpen = true
			}
		case ' ':
			if spacepos == 0 {
				spacepos = idx
			}
		case '<':
			// A closing </ref> inside the label terminates the link
			// without consuming the tag (endpos == idx leaves it in l).
			if spacepos > 0 {
				_, tag, _, _, ok := a.decodeHTMLtag(l[idx:len(l)])
				if ok && tag == "/ref" {
					matchingpos = idx
					endpos = idx
					break plLoop2
				}
			}
		case ']':
			// "]]" while a nested internal link is open closes that
			// link, not this one.
			if intLinkOpen && len(l) > idx+1 && l[idx+1] == ']' {
				intLinkOpen = false
				skipNext = true
				continue
			}
			matchingpos = idx
			endpos = idx + 1 // consume the closing ']'
			break plLoop2
		}
	}
	if matchingpos == 0 {
		return 0, nil, false
	}
	var link string
	var nt []*Token = nil
	var err error = nil
	if spacepos == 0 {
		// Bare "[url]": no label tokens.
		link = l[1:matchingpos]
		if !isExtLink(link) {
			return 0, nil, false
		}
	} else {
		link = l[1:spacepos]
		if !isExtLink(link) {
			return 0, nil, false
		}
		if spacepos+1 < matchingpos {
			nt, err = a.parseInlineText(l, spacepos+1, matchingpos)
			if err != nil {
				return 0, nil, false
			}
		}
	}
	tokens := make([]*Token, 0, 2)
	tokens = append(tokens, &Token{TText: link, TType: "extlink"})
	if nt != nil {
		tokens = append(tokens, nt...)
	}
	tokens = append(tokens, &Token{TType: "closeextlink"})
	return endpos, tokens, true
}
// parseFileLink scans l (starting with "[[image:"/"[[media:"/"[[file:") for
// a file link "[[file:Target|opt|...|caption]]". Pipes inside a nested
// "[[...]]" (e.g. links in the caption) are ignored. All pipe-separated
// segments except the last become TPipes options; the last is parsed as the
// inline caption. Returns bytes consumed, the tokens ("filelink", optional
// caption tokens, "closefilelink"), and success.
func (a *Article) parseFileLink(l string) (int, []*Token, bool) {
	// possible internal link
	pipepos := make([]int, 0, 0) // indices of top-level '|' separators
	closed := false
	matchingpos := 0     // index of the first ']' of the closing "]]"
	intLinkOpen := false // inside a nested "[[...]]"
	skipNext := false    // skip the second rune of a "[["/"]]" pair
plLoop:
	for idx, rv := range l {
		if idx < 2 {
			continue // skip the leading "[["
		}
		if skipNext {
			skipNext = false
			continue
		}
		switch rv {
		case '\x07': //prevent special tags in internal link
			if len(pipepos) == 0 { //only in the link portion
				return 0, nil, false
			}
		case '[':
			if len(l) > idx+1 && l[idx+1] == '[' {
				intLinkOpen = true
				skipNext = true
				continue
			}
		case ']':
			if len(l) > idx+1 && l[idx+1] == ']' {
				if intLinkOpen {
					// Closes the nested link, not the file link.
					intLinkOpen = false
					skipNext = true
					continue
				}
				matchingpos = idx
				closed = true
				break plLoop
			}
		case '|':
			if !intLinkOpen {
				pipepos = append(pipepos, idx)
			}
		default:
		}
	}
	if !closed {
		return 0, nil, false
	}
	var link WikiLink
	var pipes = make([]string, 0, 0)
	var nt []*Token = nil
	var err error = nil
	if len(pipepos) == 0 {
		// No options/caption: target text doubles as the display text.
		link = WikiCanonicalForm(l[2:matchingpos])
		nt = []*Token{&Token{TText: l[2:matchingpos], TType: "text"}}
	} else {
		link = WikiCanonicalForm(l[2:pipepos[0]])
		// All segments but the last are options; the last is the caption.
		for i := 0; i < len(pipepos)-1; i++ {
			pipes = append(pipes, l[pipepos[i]+1:pipepos[i+1]])
		}
		if pipepos[len(pipepos)-1]+1 < matchingpos {
			nt, err = a.parseInlineText(l, pipepos[len(pipepos)-1]+1, matchingpos)
			if err != nil {
				return 0, nil, false
			}
		}
	}
	tokens := make([]*Token, 0, 2)
	tokens = append(tokens, &Token{TLink: link, TType: "filelink", TPipes: pipes})
	if nt != nil {
		tokens = append(tokens, nt...)
	}
	tokens = append(tokens, &Token{TType: "closefilelink"})
	return matchingpos + 2, tokens, true
}
// min returns the smaller of a and b.
func min(a, b int) int {
	if b < a {
		return b
	}
	return a
}
// behavswitchre matches a MediaWiki behavior switch ("magic word") such as
// __NOTOC__ or __TOC__ at the start of the input.
var behavswitchre = regexp.MustCompile(`^__[A-Z]+__`)

// decodeBehavSwitch reports whether l begins with a behavior switch and, if
// so, returns its length in bytes. (Rewritten as a guard clause: the
// original used an `else` after a terminating return, which go idiom and
// golint discourage.)
func (a *Article) decodeBehavSwitch(l string) (int, bool) {
	match := behavswitchre.FindString(l)
	if match == "" {
		return 0, false
	}
	return len(match), true
}
// parseInlineText tokenizes the inline wikitext l[start:end] into "text",
// "space", "quote", "colon", "html", link, "magic" and "special" tokens.
// Runs of ordinary characters are accumulated in [tStart,tEnd) and flushed
// as a single "text" token whenever a structural token is emitted.
func (a *Article) parseInlineText(l string, start, end int) ([]*Token, error) {
	nt := make([]*Token, 0)
	tStart, tEnd := start, start // pending, not-yet-emitted plain-text run
	for pos := start; pos < end; {
		rv, rune_len := utf8.DecodeRuneInString(l[pos:end])
		switch rv {
		case '<':
			// Possible HTML tag.
			e, tag, attr, closed, ok := a.decodeHTMLtag(l[pos:end])
			if ok {
				pos += e
				if isValidHTMLtag(tag) {
					if tEnd > tStart {
						nt = append(nt, &Token{TText: l[tStart:tEnd], TType: "text"})
					}
					nt = append(nt, &Token{TType: "html", TText: tag, TAttr: attr, TClosed: closed})
					tStart = pos
				}
				// For an invalid tag, tStart is left alone so the raw tag
				// text stays part of the pending text run.
				tEnd = pos
				continue
			}
		case '[':
			// Possible internal/external/file link.
			e, lt, ok := a.parseLink(l[pos:end])
			if ok {
				if tEnd > tStart {
					nt = append(nt, &Token{TText: l[tStart:tEnd], TType: "text"})
				}
				nt = append(nt, lt...)
				pos += e
				tStart, tEnd = pos, pos
			continue
			}
		case '_':
			// Possible behavior switch, e.g. __NOTOC__.
			e, ok := a.decodeBehavSwitch(l[pos:end])
			if ok {
				if tEnd > tStart {
					nt = append(nt, &Token{TText: l[tStart:tEnd], TType: "text"})
				}
				nt = append(nt, &Token{TType: "magic", TAttr: l[pos : pos+e]})
				pos += e
				tStart, tEnd = pos, pos
				continue
			}
		case ' ', '\t', '\r':
			// Whitespace becomes an explicit "space" token.
			if tEnd > tStart {
				nt = append(nt, &Token{TText: l[tStart:tEnd], TType: "text"})
			}
			nt = append(nt, &Token{TType: "space"})
			tStart = pos + rune_len
		case '\'':
			// Quote tokens are later combined into bold/italic markup.
			if tEnd > tStart {
				nt = append(nt, &Token{TText: l[tStart:tEnd], TType: "text"})
			}
			nt = append(nt, &Token{TType: "quote"})
			tStart = pos + rune_len
		case ':':
			if tEnd > tStart {
				nt = append(nt, &Token{TText: l[tStart:tEnd], TType: "text"})
			}
			nt = append(nt, &Token{TType: "colon"})
			tStart = pos + rune_len
		case '\x07':
			// case '@':
			// 8-byte "\x07%07d" marker inserted by the strip/template
			// passes (see stripNowikiPreMath); resolved later in Tokenize.
			if tEnd > tStart {
				nt = append(nt, &Token{TText: l[tStart:tEnd], TType: "text"})
			}
			nt = append(nt, &Token{TType: "special", TText: l[pos : pos+8]})
			pos += 8
			tStart, tEnd = pos, pos
			continue
		}
		pos += rune_len
		tEnd = pos
	}
	// Flush any trailing plain text.
	if tEnd > tStart {
		nt = append(nt, &Token{TText: l[tStart:tEnd], TType: "text"})
	}
	return nt, nil
}
// isHeading reports whether l looks like a heading line: it starts with '='
// and, ignoring trailing whitespace, its last rune from position three
// onward is also '=' (so a bare "==" does not qualify).
func (a *Article) isHeading(l string) bool {
	if l[0] != '=' {
		return false
	}
	runeCount := 0
	endsInEqual := false
	for _, r := range l {
		runeCount++
		if runeCount <= 2 {
			continue // the first two runes never count as the closing '='
		}
		if unicode.IsSpace(r) {
			continue // trailing spaces don't reset the flag
		}
		endsInEqual = r == '='
	}
	return endsInEqual
}
// isTable reports whether l is table markup: a line opening with "{|", "|}",
// "|+", "|-", or simply starting with '|' or '!'.
func (a *Article) isTable(l string) bool {
	for _, marker := range []string{"{|", "|}", "|+", "|-"} {
		if strings.HasPrefix(l, marker) {
			return true
		}
	}
	return strings.HasPrefix(l, "|") || strings.HasPrefix(l, "!")
}
// lineType classifies a single line of preprocessed wikitext. The checks are
// ordered by priority: blank, redirect, horizontal rule, heading, list,
// table, preformatted, and finally plain ("normal") text.
func (a *Article) lineType(l string) string {
	if len(l) == 0 {
		return "blank"
	}
	switch {
	case len(l) >= 9 && strings.EqualFold(l[:9], "#redirect"):
		return "redirect"
	case strings.HasPrefix(l, "----"):
		return "hr"
	case a.isHeading(l):
		return "heading"
	case strings.IndexByte(";:*#", l[0]) >= 0:
		return "list"
	case a.isTable(l):
		return "table"
	case l[0] == ' ':
		return "wikipre"
	default:
		return "normal"
	}
}
// Tokenize runs the full lexing pipeline on raw wikitext mw and returns the
// flat token stream. g is used to fetch transcluded templates.
func (a *Article) Tokenize(mw string, g PageGetter) ([]*Token, error) {
	// Preprocessing passes: drop comments and <ref> bodies, replace
	// nowiki/pre/math regions and templates with 8-byte "\x07%07d" markers,
	// and flatten multi-line piped links onto single lines.
	mwnc := a.stripComments(mw)
	mwnr := a.stripRefs(mwnc)
	mw_stripped, nowikipremathmap := a.stripNowikiPreMath(mwnr)
	mw_tmpl, templatemap := a.processTemplates(mw_stripped, nowikipremathmap, g)
	mw_links := a.preprocessLinks(mw_tmpl)
	lines := strings.Split(mw_links, "\n")
	tokens := make([]*Token, 0, 16)
	for _, l := range lines {
		var nt []*Token
		var err error = nil
		// Classify the line, then tokenize it with the matching parser.
		lt := a.lineType(l)
		switch lt {
		case "normal":
			nt, err = a.parseInlineText(l, 0, len(l))
		case "redirect":
			nt, err = a.parseRedirectLine(l)
		case "hr":
			nt, err = a.parseHRuler(l)
		case "heading":
			nt, err = a.parseHeadingLine(l)
		case "list":
			nt, err = a.parseListLine(l)
		case "table":
			nt, err = a.parseTableLine(l)
		case "wikipre":
			nt, err = a.parseWikiPreLine(l)
		case "blank":
			nt = []*Token{&Token{TType: "blank"}}
		}
		if err != nil {
			return nil, err
		}
		// Every line contributes a trailing "newline" token.
		nt = append(nt, &Token{TType: "newline"})
		tokens = append(tokens, nt...)
	}
	// Second pass: resolve "special" marker tokens to the real tokens
	// recorded in templatemap. NOTE(review): this presumes processTemplates
	// merges the nowiki/pre/math map into templatemap — confirm there.
	specialcount := 0
	for i := range tokens {
		if tokens[i].TType == "special" {
			specialcount++
			t, ok := templatemap[tokens[i].TText]
			if !ok {
				return nil, errors.New("special not in map")
			}
			tokens[i] = t
		}
	}
	if specialcount != len(templatemap) {
		if DebugLevel > 0 {
			fmt.Println("[Tokenize] Warning: number of specials in map differs from number found")
		}
	}
	return tokens, nil
}
// commentsRe matches an HTML comment "<!-- ... -->", including an
// unterminated comment running to end of input (\z). Case-insensitive,
// non-greedy, with '.' matching newlines.
var commentsRe = regexp.MustCompile(`(?isU)<!--.*(?:-->|\z)`)

// stripComments removes all HTML comments from the wikitext.
func (a *Article) stripComments(mw string) string {
	return commentsRe.ReplaceAllLiteralString(mw, "")
}
// refRe matches a complete "<ref ...>...</ref>" region (non-greedy, across
// newlines). Self-closing "<ref .../>" or unterminated refs are not matched
// and therefore survive this pass.
var refRe = regexp.MustCompile(`(?msU)<ref.*</ref>`)

// stripRefs removes reference bodies from the wikitext before tokenizing.
func (a *Article) stripRefs(mw string) string {
	return refRe.ReplaceAllLiteralString(mw, "")
}
// Open/close tag matchers for the three verbatim regions whose content must
// be hidden from the tokenizer: <nowiki>, <pre> and <math>. Group 1 captures
// the tag name; any attribute text before '>' is tolerated. The nowiki
// patterns exclude '/' from the attribute part, so a self-closing
// "<nowiki/>" is not treated as an opener.
var nowikiOpenRe = regexp.MustCompile(`(?i)<\s*(nowiki)\s*[^>/]*>`)
var nowikiCloseRe = regexp.MustCompile(`(?i)<(/nowiki)\s*[^>/]*>`)
var preOpenRe = regexp.MustCompile(`(?i)<\s*(pre)\s*[^>]*>`)
var preCloseRe = regexp.MustCompile(`(?i)<(/pre)\s*[^>]*>`)
var mathOpenRe = regexp.MustCompile(`(?i)<\s*(math)\s*[^>]*>`)
var mathCloseRe = regexp.MustCompile(`(?i)<(/math)\s*[^>]*>`)

// ssInt orders regexp match-index records by start offset (sort.Interface).
type ssInt [][]int

func (a ssInt) Len() int           { return len(a) }
func (a ssInt) Swap(i, j int)      { a[i], a[j] = a[j], a[i] }
func (a ssInt) Less(i, j int) bool { return a[i][0] < a[j][0] }
// stripNowikiPreMath replaces every <nowiki>, <pre> and <math> region with an
// 8-byte marker "\x07%07d" and returns the rewritten text plus a map from
// marker to a Token holding the region's tag name (lowercased, in TType),
// attribute text (TAttr) and verbatim content (TText).
func (a *Article) stripNowikiPreMath(mw string) (string, map[string]*Token) {
	// Each submatch-index record is [start, end, groupStart, groupEnd].
	nwoc := nowikiOpenRe.FindAllStringSubmatchIndex(mw, -1)
	nwcc := nowikiCloseRe.FindAllStringSubmatchIndex(mw, -1)
	poc := preOpenRe.FindAllStringSubmatchIndex(mw, -1)
	pcc := preCloseRe.FindAllStringSubmatchIndex(mw, -1)
	moc := mathOpenRe.FindAllStringSubmatchIndex(mw, -1)
	mcc := mathCloseRe.FindAllStringSubmatchIndex(mw, -1)
	// Append a 5th element as a type code: even = opening tag, odd = the
	// matching closing tag (close code == open code + 1).
	for i := range nwoc {
		nwoc[i] = append(nwoc[i], 0)
	}
	for i := range nwcc {
		nwcc[i] = append(nwcc[i], 1)
	}
	for i := range poc {
		poc[i] = append(poc[i], 2)
	}
	for i := range pcc {
		pcc[i] = append(pcc[i], 3)
	}
	for i := range moc {
		moc[i] = append(moc[i], 4)
	}
	for i := range mcc {
		mcc[i] = append(mcc[i], 5)
	}
	// Merge all matches and order them by start offset.
	am := make([][]int, 0, len(nwoc)+len(nwcc)+len(poc)+len(pcc)+len(moc)+len(mcc))
	am = append(am, nwoc...)
	am = append(am, nwcc...)
	am = append(am, poc...)
	am = append(am, pcc...)
	am = append(am, moc...)
	am = append(am, mcc...)
	sort.Sort(ssInt(am))
	tokens := make(map[string]*Token, len(am))
	if len(am) == 0 {
		return mw, tokens
	}
	ctype := -1    // type code of the currently open region, -1 if none
	out := ""
	lastclose := 0 // end offset of the last consumed region/plain text
	openidx := 0   // index in am of the currently open tag's record
	count := 0     // marker sequence number
	for i := range am {
		if (ctype != -1) && (am[i][4] == ctype+1) && (am[openidx][1] <= am[i][0]) {
			// Matching close for the open region: emit a marker and record
			// a token with the tag name, attributes (text between the tag
			// name and the '>' of the opener) and the verbatim content.
			special := fmt.Sprintf("\x07%07d", count)
			tokens[special] = &Token{
				TText: mw[am[openidx][1]:am[i][0]],
				TType: strings.ToLower(mw[am[openidx][2]:am[openidx][3]]),
				TAttr: mw[am[openidx][3] : am[openidx][1]-1],
			}
			out += special
			ctype = -1
			lastclose = am[i][1]
			count++
		} else if (ctype == -1) && (am[i][4]&1 == 0) && (lastclose <= am[i][0]) {
			// Opening tag: flush the plain text before it, then mark open.
			out += mw[lastclose:am[i][0]]
			ctype = am[i][4]
			openidx = i
		}
	}
	if ctype != -1 {
		//it's open: close it
		// Unterminated region: everything to end-of-input is its content.
		special := fmt.Sprintf("\x07%07d", count)
		tokens[special] = &Token{
			TText: mw[am[openidx][1]:len(mw)],
			TType: strings.ToLower(mw[am[openidx][2]:am[openidx][3]]),
			TAttr: mw[am[openidx][3] : am[openidx][1]-1],
		}
		out += special
		ctype = -1
		count++
	} else {
		// Flush the plain text after the last region.
		out += mw[lastclose:]
	}
	return out, tokens
}
// multiLineLinksRe matches a piped link "[[target|...]]" whose label may
// span multiple lines (the target part itself must stay on one line).
var multiLineLinksRe = regexp.MustCompile(`(?sm)\[\[[^\n|]*\|.*?\]\]`)

/* TODO: add preprocessing as in Parser.php:pstPass2() to enable pipe tricks
 */
// preprocessLinks flattens multi-line piped links onto a single line by
// replacing each newline inside them with a space, so the line-oriented
// tokenizer sees every link whole.
func (a *Article) preprocessLinks(s string) string {
	buf := []byte(s)
	for _, span := range multiLineLinksRe.FindAllSubmatchIndex(buf, -1) {
		// Walk the matched span rune by rune, not byte by byte.
		pos := span[0]
		for pos < span[1] {
			r, size := utf8.DecodeRune(buf[pos:])
			if r == '\n' {
				buf[pos] = ' ' // '\n' is a single byte, safe to overwrite
			}
			pos += size
		}
	}
	return string(buf)
}