811 lines
18 KiB
Go
811 lines
18 KiB
Go
/*
|
|
Copyright (C) IBM Corporation 2015, Michele Franceschini <franceschini@us.ibm.com>
|
|
|
|
Licensed under the Apache License, Version 2.0 (the "License");
|
|
you may not use this file except in compliance with the License.
|
|
You may obtain a copy of the License at
|
|
|
|
http://www.apache.org/licenses/LICENSE-2.0
|
|
|
|
Unless required by applicable law or agreed to in writing, software
|
|
distributed under the License is distributed on an "AS IS" BASIS,
|
|
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
|
See the License for the specific language governing permissions and
|
|
limitations under the License.
|
|
*/
|
|
|
|
package gowiki
|
|
|
|
import (
|
|
"errors"
|
|
"fmt"
|
|
"regexp"
|
|
"sort"
|
|
"strings"
|
|
"unicode"
|
|
"unicode/utf8"
|
|
)
|
|
|
|
// Token is a single lexical unit produced by the tokenizer.  Which fields
// are meaningful depends on TType (e.g. "text", "space", "quote", "colon",
// "link", "extlink", "filelink", "html", "magic", "special", "newline").
type Token struct {
	// TText holds the literal payload: plain text for "text" tokens, the URL
	// for "extlink", the tag name for "html", the placeholder for "special".
	TText string `json:"tText,omitempty"`
	// TType is the token kind and drives interpretation of the other fields.
	TType string `json:"tType,omitempty"`
	// TAttr carries raw attribute text (HTML tags) or the magic word itself.
	TAttr string `json:"tAttr,omitempty"`
	// TLink is the target page for "link"/"filelink" tokens.
	TLink WikiLink `json:"tLink,omitempty"`
	// TClosed is true for an "html" token whose tag text ends with '/'
	// (self-closing form).
	TClosed bool `json:"tClosed,omitempty"`
	// TPipes lists the pipe-separated options of a file link.
	TPipes []string `json:"tPipes,omitempty"`
}
|
|
|
|
// Prefixes is a list of string prefixes matched case-insensitively
// (see matchPrefixes).
type Prefixes []string

// ExtLinkPrefixes lists the URL schemes recognized as external link targets.
var ExtLinkPrefixes Prefixes = []string{"https://", "http://", "ftp://", "//"}

// FileLinkPrefixes lists the openings of [[...]] links that denote file or
// media inclusions rather than ordinary page links.
var FileLinkPrefixes Prefixes = []string{"[[image:", "[[media:", "[[file:"}
|
|
|
|
func (a *Article) parseRedirectLine(l string) ([]*Token, error) {
|
|
nt := make([]*Token, 0, 2)
|
|
nt = append(nt, &Token{TType: "redirect"})
|
|
nnt, err := a.parseInlineText(l, 9, len(l))
|
|
if err != nil {
|
|
return nil, err
|
|
}
|
|
nt = append(nt, nnt...)
|
|
return nt, nil
|
|
}
|
|
|
|
func (a *Article) parseWikiPreLine(l string) ([]*Token, error) {
|
|
nt := make([]*Token, 0, 2)
|
|
nt = append(nt, &Token{TType: "wikipre"})
|
|
nnt, err := a.parseInlineText(l, 1, len(l))
|
|
if err != nil {
|
|
return nil, err
|
|
}
|
|
nt = append(nt, nnt...)
|
|
return nt, nil
|
|
}
|
|
|
|
func (a *Article) parseHRuler(l string) ([]*Token, error) {
|
|
pos := 0
|
|
for i, rv := range l {
|
|
if rv != '-' {
|
|
pos = i
|
|
break
|
|
}
|
|
}
|
|
nt := make([]*Token, 0, 2)
|
|
nt = append(nt, &Token{TType: "hrule"})
|
|
if pos != 0 {
|
|
nnt, err := a.parseInlineText(l, pos, len(l))
|
|
if err != nil {
|
|
return nil, err
|
|
}
|
|
nt = append(nt, nnt...)
|
|
}
|
|
return nt, nil
|
|
}
|
|
|
|
// parseHeadingLine tokenizes a heading line such as "== Title ==".  It
// strips matching runs of '=' from both ends to determine the heading
// level (capped at 6), emits an "h<level>" token, and tokenizes the text
// between the '=' runs as inline text.
func (a *Article) parseHeadingLine(l string) ([]*Token, error) {
	// pf walks forward from the start of the line; pl starts at the index
	// of the last '=' in the line and walks backward.
	pf := 0
	pl := 0
	for i, rv := range l {
		if rv == '=' {
			pl = i // after the loop: index of the final '=' in the line
		}
	}
	// Consume '='s pairwise from both ends until either side hits a
	// non-'=' rune or the two cursors meet.  Afterwards l[pf:pl] (with the
	// final pf++ below) is the title text.
	for {
		pf++
		if pf == pl || l[pf] != '=' {
			pf--
			break
		}
		pl--
		if pf == pl || l[pl] != '=' {
			pl++
			pf--
			break
		}
	}
	pf++ // pf is now the number of leading '='s matched: the heading level
	if pf > 6 {
		// Headings go at most to h6; surplus '='s are pushed back into the
		// title text on both sides.
		diff := pf - 6
		pf -= diff
		pl += diff
	}
	nt := make([]*Token, 0, 2)
	nt = append(nt, &Token{TType: fmt.Sprintf("h%d", pf)})
	nnt, err := a.parseInlineText(l, pf, pl)
	if err != nil {
		return nil, err
	}
	nt = append(nt, nnt...)
	return nt, nil
}
|
|
|
|
func (a *Article) parseListLine(l string) ([]*Token, error) {
|
|
nt := make([]*Token, 0, 2)
|
|
pos := 0
|
|
for ; pos < len(l); pos++ {
|
|
switch l[pos] {
|
|
case ';', ':', '*', '#':
|
|
nt = append(nt, &Token{TType: l[pos : pos+1]})
|
|
continue
|
|
}
|
|
break
|
|
}
|
|
if pos < len(l) {
|
|
nnt, err := a.parseInlineText(l, pos, len(l))
|
|
if err != nil {
|
|
return nil, err
|
|
}
|
|
nt = append(nt, nnt...)
|
|
}
|
|
return nt, nil
|
|
}
|
|
|
|
func (a *Article) parseTableLine(l string) ([]*Token, error) {
|
|
nt := make([]*Token, 0, 0)
|
|
return nt, nil
|
|
}
|
|
|
|
// isValidHTMLtag reports whether tag is an HTML tag the tokenizer should
// emit as an "html" token.
// NOTE(review): currently a stub that accepts every tag; no validation is
// performed.
func isValidHTMLtag(tag string) bool {
	return true
}
|
|
|
|
// decodeHTMLtag decodes an HTML-like tag at the start of l (l begins with
// '<').  It scans for the first '>' that is not inside a quoted attribute
// value and splits the preceding text into tag name and attribute string.
// It returns: the number of bytes consumed (one past the '>'), the tag
// name (including a leading '/' for closing tags, e.g. "/ref"), the raw
// attribute text, whether the tag text ends in '/' (self-closing form),
// and whether a tag was successfully decoded.
func (a *Article) decodeHTMLtag(l string) (int, string, string, bool, bool) {
	matchingpos := 0 // index of the terminating '>' (0 = not found)
	inquote := false
	lastbackslash := false
	quote := '#' // current quote delimiter; '#' is a harmless placeholder
	closefound := false
	tagend := 0   // index one past the tag name (0 = not seen yet)
	tagstart := 0 // index of first non-space rune after '<' (0 = not seen)
dhtLoop:
	for idx, rv := range l {
		// fmt.Println(string(rv), inquote, string(quote), idx, matchingpos)
		switch rv {
		case '>':
			// Only an unquoted '>' terminates the tag.
			if !inquote {
				matchingpos = idx
				break dhtLoop
			}
		case '\'', '"':
			switch {
			case inquote && quote == rv && !lastbackslash:
				// Closing quote (a backslash-escaped quote does not close).
				inquote = false
			case !inquote:
				inquote = true
				quote = rv
			}
		case ' ', '\t', '\r':
			// Whitespace is handled by the IsSpace checks below.
		case '/':
			closefound = true
		}
		lastbackslash = (rv == '\\')
		// First non-space rune marks the start of the tag name.  Note that
		// idx 0 is the '<' itself and leaves tagstart at 0 (a no-op), so a
		// tagstart of 0 after the loop means "no name found".
		if !unicode.IsSpace(rv) && tagstart == 0 {
			tagstart = idx
		}
		// Any non-space rune other than '/' cancels a pending self-close
		// flag, so closefound survives only when the last non-space rune
		// before '>' is '/'.
		if rv != '/' && !unicode.IsSpace(rv) {
			closefound = false
		}
		// The first whitespace after the name terminates it; everything
		// from there to '>' is the attribute string.
		if unicode.IsSpace(rv) && tagstart != 0 && tagend == 0 {
			tagend = idx
		}
	}
	if matchingpos == 0 || tagstart == 0 {
		return 0, "", "", false, false
	}
	var tag string
	var attr string

	if tagend == 0 {
		// No whitespace inside the tag: everything up to '>' is the name.
		tag = l[tagstart:matchingpos]
		attr = ""
	} else {
		tag = l[tagstart:tagend]
		attr = l[tagend:matchingpos]
	}
	return matchingpos + 1, tag, attr, closefound, true
}
|
|
|
|
// matchPrefixes reports whether s starts with any of the given prefixes,
// compared case-insensitively.
func matchPrefixes(s string, prefixes []string) bool {
	for _, p := range prefixes {
		if len(s) >= len(p) && strings.EqualFold(s[:len(p)], p) {
			return true
		}
	}
	return false
}
|
|
|
|
// isExtLink reports whether l starts with one of the recognized external
// URL schemes (case-insensitive, see ExtLinkPrefixes).
func isExtLink(l string) bool {
	return matchPrefixes(l, ExtLinkPrefixes)
}
|
|
|
|
// possibleFileLink reports whether l starts like a file/media link
// ("[[File:", "[[Image:", "[[Media:", case-insensitive).
func possibleFileLink(l string) bool {
	return matchPrefixes(l, FileLinkPrefixes)
}
|
|
|
|
func (a *Article) parseLink(l string) (int, []*Token, bool) {
|
|
if len(l) < 5 {
|
|
return 0, nil, false
|
|
}
|
|
if l[1] == '[' {
|
|
if possibleFileLink(l) {
|
|
return a.parseFileLink(l)
|
|
}
|
|
return a.parseInternalLink(l)
|
|
}
|
|
return a.parseExternalLink(l)
|
|
}
|
|
|
|
// parseInternalLink parses an internal [[target]] or [[target|text]] link
// at the start of l.  It returns the bytes consumed, the tokens (a "link"
// token carrying the canonical target, optional display-text tokens, then
// "closelink"), and whether a link was recognized.  Letters immediately
// following the closing "]]" (the "link trail") are appended to the
// displayed text, MediaWiki-style ([[bus]]es renders as "buses").
func (a *Article) parseInternalLink(l string) (int, []*Token, bool) {

	// possible internal link
	pipepos := 0     // index of the first '|' (0 = none seen yet)
	closed := false  // set once the second ']' of the closing "]]" is consumed
	matchingpos := 0 // index of the first ']' of the closing "]]"
	linktrail := 0   // index of the last trailing letter after "]]" (0 = none)
	//plLoop:
	for idx, rv := range l {
		if idx < 2 {
			continue // skip the opening "[["
		}
		if matchingpos == 0 {
			// Still inside the [[...]] body.
			switch rv {
			case '\x07': //prevent special tags in internal link
				if pipepos == 0 { //only in the link portion
					return 0, nil, false
				}
			case '[':
				// A '[' right after "[[" or the start of a nested "[[" makes
				// this not a simple internal link: give up.
				if idx == 2 || len(l) > idx+1 && l[idx+1] == '[' {
					return 0, nil, false
				}

			case ']':
				if len(l) > idx+1 && l[idx+1] == ']' {
					matchingpos = idx
				}
			case '|':
				// Only the first pipe separates target from display text.
				if pipepos == 0 {
					pipepos = idx
				}
			default:
			}
			continue
		}
		// One iteration past matchingpos: consume the second ']'.
		if !closed {
			closed = true
			continue
		}
		// Collect link-trail letters; the first non-letter ends the link.
		if unicode.IsLetter(rv) {
			linktrail = idx
			continue
		}
		break
	}
	if !closed {
		return 0, nil, false
	}
	var link WikiLink
	var nt []*Token = nil
	var err error = nil
	if pipepos == 0 {
		// No pipe: the target text doubles as display text (plus any trail).
		innerstring := l[2:matchingpos]
		if linktrail != 0 {
			innerstring += l[matchingpos+2 : linktrail+1]
		}
		link = WikiCanonicalForm(l[2:matchingpos])
		nt = []*Token{&Token{TText: innerstring, TType: "text"}}

	} else {
		// Piped link: display text follows the pipe (plus any trail) and is
		// itself parsed as inline wikitext.
		innerstring := l[pipepos+1 : matchingpos]
		if linktrail != 0 {
			innerstring += l[matchingpos+2 : linktrail+1]
		}
		link = WikiCanonicalForm(l[2:pipepos])
		if pipepos+1 < matchingpos {
			nt, err = a.parseInlineText(innerstring, 0, len(innerstring))
			if err != nil {
				return 0, nil, false
			}
		}
	}
	tokens := make([]*Token, 0, 2)
	tokens = append(tokens, &Token{TLink: link, TType: "link"})
	if nt != nil {
		tokens = append(tokens, nt...)
	}
	tokens = append(tokens, &Token{TType: "closelink"})
	if linktrail != 0 {
		// Consumed through the end of the link trail.
		return linktrail + 1, tokens, true
	}
	return matchingpos + 2, tokens, true
}
|
|
|
|
// parseExternalLink parses an external [url] or [url display text] link at
// the start of l (l begins with a single '[').  It returns bytes consumed,
// the tokens (an "extlink" token carrying the URL, optional display-text
// tokens, then "closeextlink"), and whether a valid external link — one
// whose URL starts with a recognized scheme — was found.  When display
// text is present, a following </ref> tag also terminates the link.
func (a *Article) parseExternalLink(l string) (int, []*Token, bool) {
	// possible external link
	spacepos := 0        // index of first space: separates URL from display text
	matchingpos := 0     // index where the link body ends
	endpos := 0          // total bytes consumed (past ']', or at '<' of </ref>)
	intLinkOpen := false // inside a nested [[...]] within the display text
	skipNext := false    // skip the rune after a consumed "]]"
plLoop2:
	for idx, rv := range l {
		if idx < 1 {
			continue // skip the opening '['
		}
		if skipNext {
			skipNext = false
			continue
		}
		switch rv {
		case '\x07':
			// Placeholder specials are not allowed in the URL portion.
			if spacepos == 0 {
				return 0, nil, false
			}
		case '[':
			if len(l) > idx+1 && l[idx+1] == '[' {
				intLinkOpen = true
			}
		case ' ':
			// Only the first space splits URL from display text.
			if spacepos == 0 {
				spacepos = idx
			}
		case '<':
			if spacepos > 0 {
				// A closing </ref> tag ends the link without consuming it.
				_, tag, _, _, ok := a.decodeHTMLtag(l[idx:len(l)])
				if ok && tag == "/ref" {
					matchingpos = idx
					endpos = idx
					break plLoop2
				}
			}
		case ']':
			// A "]]" closing a nested internal link is skipped whole.
			if intLinkOpen && len(l) > idx+1 && l[idx+1] == ']' {
				intLinkOpen = false
				skipNext = true
				continue
			}
			matchingpos = idx
			endpos = idx + 1
			break plLoop2
		}
	}
	if matchingpos == 0 {
		return 0, nil, false
	}
	var link string
	var nt []*Token = nil
	var err error = nil
	if spacepos == 0 {
		// Bare [url] with no display text.
		link = l[1:matchingpos]
		if !isExtLink(link) {
			return 0, nil, false
		}
	} else {
		link = l[1:spacepos]
		if !isExtLink(link) {
			return 0, nil, false
		}
		// Display text (if non-empty) is parsed as inline wikitext.
		if spacepos+1 < matchingpos {
			nt, err = a.parseInlineText(l, spacepos+1, matchingpos)
			if err != nil {
				return 0, nil, false
			}
		}
	}
	tokens := make([]*Token, 0, 2)
	tokens = append(tokens, &Token{TText: link, TType: "extlink"})
	if nt != nil {
		tokens = append(tokens, nt...)
	}
	tokens = append(tokens, &Token{TType: "closeextlink"})
	return endpos, tokens, true
}
|
|
|
|
// parseFileLink parses a file/media link ([[File:target|opt|...|caption]])
// at the start of l.  It returns bytes consumed, the tokens (a "filelink"
// token carrying the canonical target and the pipe-separated options,
// optional caption tokens, then "closefilelink"), and whether a link was
// recognized.  Nested [[...]] links (e.g. inside the caption) are skipped
// when looking for pipes and the closing "]]".
func (a *Article) parseFileLink(l string) (int, []*Token, bool) {
	// possible internal link
	pipepos := make([]int, 0, 0) // indices of every top-level '|'
	closed := false
	matchingpos := 0     // index of the first ']' of the closing "]]"
	intLinkOpen := false // inside a nested [[...]]
	skipNext := false    // skip the rune after a consumed "[[" / "]]"
plLoop:
	for idx, rv := range l {
		if idx < 2 {
			continue // skip the opening "[["
		}
		if skipNext {
			skipNext = false
			continue
		}
		switch rv {
		case '\x07': //prevent special tags in internal link
			if len(pipepos) == 0 { //only in the link portion
				return 0, nil, false
			}
		case '[':
			if len(l) > idx+1 && l[idx+1] == '[' {
				intLinkOpen = true
				skipNext = true
				continue
			}

		case ']':
			if len(l) > idx+1 && l[idx+1] == ']' {
				// "]]" either closes a nested link or this file link.
				if intLinkOpen {
					intLinkOpen = false
					skipNext = true
					continue
				}
				matchingpos = idx
				closed = true
				break plLoop
			}
		case '|':
			// Pipes inside nested links belong to those links, not ours.
			if !intLinkOpen {
				pipepos = append(pipepos, idx)
			}
		default:
		}
	}
	if !closed {
		return 0, nil, false
	}
	var link WikiLink
	var pipes = make([]string, 0, 0)
	var nt []*Token = nil
	var err error = nil
	if len(pipepos) == 0 {
		// No options: the whole body is the target, echoed as plain text.
		link = WikiCanonicalForm(l[2:matchingpos])
		nt = []*Token{&Token{TText: l[2:matchingpos], TType: "text"}}
	} else {
		link = WikiCanonicalForm(l[2:pipepos[0]])
		// All segments between pipes except the last are options; the text
		// after the final pipe is the caption, parsed as inline wikitext.
		for i := 0; i < len(pipepos)-1; i++ {
			pipes = append(pipes, l[pipepos[i]+1:pipepos[i+1]])
		}
		if pipepos[len(pipepos)-1]+1 < matchingpos {
			nt, err = a.parseInlineText(l, pipepos[len(pipepos)-1]+1, matchingpos)
			if err != nil {
				return 0, nil, false
			}
		}
	}
	tokens := make([]*Token, 0, 2)
	tokens = append(tokens, &Token{TLink: link, TType: "filelink", TPipes: pipes})
	if nt != nil {
		tokens = append(tokens, nt...)
	}
	tokens = append(tokens, &Token{TType: "closefilelink"})
	return matchingpos + 2, tokens, true
}
|
|
|
|
// min returns the smaller of a and b.
func min(a, b int) int {
	if b < a {
		return b
	}
	return a
}
|
|
|
|
var behavswitchre = regexp.MustCompile(`^__[A-Z]+__`)
|
|
|
|
func (a *Article) decodeBehavSwitch(l string) (int, bool) {
|
|
match := behavswitchre.FindString(l)
|
|
if len(match) == 0 {
|
|
return 0, false
|
|
} else {
|
|
return len(match), true
|
|
}
|
|
}
|
|
|
|
// parseInlineText tokenizes l[start:end] as inline wikitext.  Runs of
// ordinary runes accumulate into "text" tokens (tracked by tStart/tEnd and
// flushed whenever a structural token is emitted); special runes produce
// structural tokens: HTML tags, links, magic words, "space", "quote",
// "colon", and \x07 placeholder specials.
func (a *Article) parseInlineText(l string, start, end int) ([]*Token, error) {
	nt := make([]*Token, 0)

	// tStart/tEnd delimit the pending plain-text run awaiting flush.
	tStart, tEnd := start, start

	for pos := start; pos < end; {
		rv, rune_len := utf8.DecodeRuneInString(l[pos:end])
		switch rv {
		case '<':
			e, tag, attr, closed, ok := a.decodeHTMLtag(l[pos:end])
			if ok {
				pos += e
				if isValidHTMLtag(tag) {
					// Flush pending text, then emit the tag token.
					if tEnd > tStart {
						nt = append(nt, &Token{TText: l[tStart:tEnd], TType: "text"})
					}
					nt = append(nt, &Token{TType: "html", TText: tag, TAttr: attr, TClosed: closed})
					tStart = pos
				}
				// If the tag is filtered out, tStart is left alone so the raw
				// tag text stays part of the surrounding text run.
				tEnd = pos
				continue
			}
		case '[':
			e, lt, ok := a.parseLink(l[pos:end])
			if ok {
				if tEnd > tStart {
					nt = append(nt, &Token{TText: l[tStart:tEnd], TType: "text"})
				}
				nt = append(nt, lt...)
				pos += e
				tStart, tEnd = pos, pos
				continue
			}
		case '_':
			// Behavior switch such as __NOTOC__ becomes a "magic" token.
			e, ok := a.decodeBehavSwitch(l[pos:end])
			if ok {
				if tEnd > tStart {
					nt = append(nt, &Token{TText: l[tStart:tEnd], TType: "text"})
				}
				nt = append(nt, &Token{TType: "magic", TAttr: l[pos : pos+e]})
				pos += e
				tStart, tEnd = pos, pos
				continue
			}
		case ' ', '\t', '\r':
			if tEnd > tStart {
				nt = append(nt, &Token{TText: l[tStart:tEnd], TType: "text"})
			}
			nt = append(nt, &Token{TType: "space"})
			tStart = pos + rune_len
		case '\'':
			// Quotes are emitted individually; bold/italic grouping is left
			// to a later stage.
			if tEnd > tStart {
				nt = append(nt, &Token{TText: l[tStart:tEnd], TType: "text"})
			}
			nt = append(nt, &Token{TType: "quote"})
			tStart = pos + rune_len
		case ':':
			if tEnd > tStart {
				nt = append(nt, &Token{TText: l[tStart:tEnd], TType: "text"})
			}
			nt = append(nt, &Token{TType: "colon"})
			tStart = pos + rune_len
		case '\x07':
			// case '@':
			// Placeholder specials are 8 bytes: '\x07' + 7 digits (see
			// stripNowikiPreMath / the "\x07%07d" format).
			if tEnd > tStart {
				nt = append(nt, &Token{TText: l[tStart:tEnd], TType: "text"})
			}
			nt = append(nt, &Token{TType: "special", TText: l[pos : pos+8]})
			pos += 8
			tStart, tEnd = pos, pos
			continue
		}
		pos += rune_len
		tEnd = pos
	}
	// Flush any trailing text run.
	if tEnd > tStart {
		nt = append(nt, &Token{TText: l[tStart:tEnd], TType: "text"})
	}
	return nt, nil
}
|
|
|
|
func (a *Article) isHeading(l string) bool {
|
|
if l[0] != '=' {
|
|
return false
|
|
}
|
|
done := 0
|
|
lastEqual := false
|
|
for _, rv := range l {
|
|
done++
|
|
if done > 2 {
|
|
if unicode.IsSpace(rv) {
|
|
continue
|
|
}
|
|
if rv == '=' {
|
|
lastEqual = true
|
|
continue
|
|
}
|
|
lastEqual = false
|
|
}
|
|
|
|
}
|
|
return lastEqual
|
|
}
|
|
|
|
func (a *Article) isTable(l string) bool {
|
|
return (len(l) > 1 && (l[0:2] == "{|" || l[0:2] == "|}" || l[0:2] == "|+" || l[0:2] == "|-")) || (len(l) > 0 && (l[0:1] == "|" || l[0:1] == "!"))
|
|
}
|
|
|
|
func (a *Article) lineType(l string) string {
|
|
switch {
|
|
case len(l) == 0:
|
|
return "blank"
|
|
case len(l) > 8 && strings.ToLower(l[0:9]) == "#redirect":
|
|
return "redirect"
|
|
case len(l) > 3 && l[0:4] == "----":
|
|
return "hr"
|
|
case a.isHeading(l):
|
|
return "heading"
|
|
case l[0] == ';' || l[0] == ':' || l[0] == '*' || l[0] == '#':
|
|
return "list"
|
|
case a.isTable(l):
|
|
return "table"
|
|
case l[0] == ' ':
|
|
return "wikipre"
|
|
}
|
|
return "normal"
|
|
}
|
|
|
|
// Tokenize converts raw MediaWiki markup into a flat token stream.  The
// text is first preprocessed: HTML comments and <ref> bodies are removed,
// nowiki/pre/math regions and templates are replaced by 8-byte \x07
// placeholders, and multi-line piped links are joined onto one line.  Each
// line is then tokenized according to its lineType, with a "newline" token
// appended per line.  Finally every "special" placeholder token is
// substituted with its entry from the template map.  g is used to fetch
// template pages during template processing.
func (a *Article) Tokenize(mw string, g PageGetter) ([]*Token, error) {
	mwnc := a.stripComments(mw)
	mwnr := a.stripRefs(mwnc)
	mw_stripped, nowikipremathmap := a.stripNowikiPreMath(mwnr)
	mw_tmpl, templatemap := a.processTemplates(mw_stripped, nowikipremathmap, g)
	mw_links := a.preprocessLinks(mw_tmpl)

	lines := strings.Split(mw_links, "\n")
	tokens := make([]*Token, 0, 16)
	for _, l := range lines {
		var nt []*Token
		var err error = nil
		lt := a.lineType(l)
		switch lt {
		case "normal":
			nt, err = a.parseInlineText(l, 0, len(l))
		case "redirect":
			nt, err = a.parseRedirectLine(l)
		case "hr":
			nt, err = a.parseHRuler(l)
		case "heading":
			nt, err = a.parseHeadingLine(l)
		case "list":
			nt, err = a.parseListLine(l)
		case "table":
			nt, err = a.parseTableLine(l)
		case "wikipre":
			nt, err = a.parseWikiPreLine(l)
		case "blank":
			nt = []*Token{&Token{TType: "blank"}}
		}
		if err != nil {
			return nil, err
		}
		nt = append(nt, &Token{TType: "newline"})
		tokens = append(tokens, nt...)
	}
	// Replace placeholder tokens with the real template/nowiki tokens.
	specialcount := 0
	for i := range tokens {
		if tokens[i].TType == "special" {
			specialcount++
			t, ok := templatemap[tokens[i].TText]
			if !ok {
				return nil, errors.New("special not in map")
			}
			tokens[i] = t
		}
	}

	// Sanity check: every map entry should have been consumed exactly once.
	if specialcount != len(templatemap) {
		if DebugLevel > 0 {
			fmt.Println("[Tokenize] Warning: number of specials in map differs from number found")
		}
	}
	return tokens, nil
}
|
|
|
|
// commentsRe matches HTML comments (<!-- ... -->), including an unclosed
// comment running to end of input.  Flags: case-insensitive, '.' spans
// newlines, ungreedy.
var commentsRe = regexp.MustCompile(`(?isU)<!--.*(?:-->|\z)`)

// stripComments removes all HTML comments from mw.
func (a *Article) stripComments(mw string) string {
	return commentsRe.ReplaceAllLiteralString(mw, "")
}
|
|
|
|
// refRe matches a complete <ref ...>...</ref> element (multi-line,
// ungreedy).  Self-closing <ref/> tags are not matched by this pattern.
var refRe = regexp.MustCompile(`(?msU)<ref.*</ref>`)

// stripRefs removes all <ref>...</ref> elements (footnotes) from mw.
func (a *Article) stripRefs(mw string) string {
	return refRe.ReplaceAllLiteralString(mw, "")
}
|
|
|
|
// Open/close tag matchers for <nowiki>, <pre>, and <math> regions, whose
// contents must be protected from wiki parsing (see stripNowikiPreMath).
// Each open pattern captures the tag name; each close pattern captures the
// name with its leading '/'.
var nowikiOpenRe = regexp.MustCompile(`(?i)<\s*(nowiki)\s*[^>/]*>`)

var nowikiCloseRe = regexp.MustCompile(`(?i)<(/nowiki)\s*[^>/]*>`)

var preOpenRe = regexp.MustCompile(`(?i)<\s*(pre)\s*[^>]*>`)

var preCloseRe = regexp.MustCompile(`(?i)<(/pre)\s*[^>]*>`)

var mathOpenRe = regexp.MustCompile(`(?i)<\s*(math)\s*[^>]*>`)

var mathCloseRe = regexp.MustCompile(`(?i)<(/math)\s*[^>]*>`)
|
|
|
|
// ssInt implements sort.Interface over regexp match-index slices, ordering
// them by start offset (element [0]).
type ssInt [][]int

func (a ssInt) Len() int { return len(a) }

func (a ssInt) Swap(i, j int) { a[i], a[j] = a[j], a[i] }

func (a ssInt) Less(i, j int) bool { return a[i][0] < a[j][0] }
|
|
|
|
// stripNowikiPreMath replaces every <nowiki>, <pre>, and <math> region in
// mw with an 8-byte placeholder ("\x07" followed by a 7-digit counter) so
// their contents are protected from wiki parsing.  It returns the rewritten
// text plus a map from placeholder to a Token holding the region's raw
// contents (TText), lowercased tag name (TType), and attribute text
// (TAttr).  A region left open at end of input is closed implicitly.
func (a *Article) stripNowikiPreMath(mw string) (string, map[string]*Token) {
	// Collect every open/close match of the three tag kinds.
	nwoc := nowikiOpenRe.FindAllStringSubmatchIndex(mw, -1)
	nwcc := nowikiCloseRe.FindAllStringSubmatchIndex(mw, -1)
	poc := preOpenRe.FindAllStringSubmatchIndex(mw, -1)
	pcc := preCloseRe.FindAllStringSubmatchIndex(mw, -1)
	moc := mathOpenRe.FindAllStringSubmatchIndex(mw, -1)
	mcc := mathCloseRe.FindAllStringSubmatchIndex(mw, -1)

	// Tag each match with a type id appended at index 4 (the submatch index
	// slices have 4 ints: whole match + one capture group).  Even ids are
	// opening tags, odd ids are the corresponding closing tags.
	for i := range nwoc {
		nwoc[i] = append(nwoc[i], 0)
	}
	for i := range nwcc {
		nwcc[i] = append(nwcc[i], 1)
	}
	for i := range poc {
		poc[i] = append(poc[i], 2)
	}
	for i := range pcc {
		pcc[i] = append(pcc[i], 3)
	}
	for i := range moc {
		moc[i] = append(moc[i], 4)
	}
	for i := range mcc {
		mcc[i] = append(mcc[i], 5)
	}
	// Merge all matches into one list sorted by start offset.
	am := make([][]int, 0, len(nwoc)+len(nwcc)+len(poc)+len(pcc)+len(moc)+len(mcc))
	am = append(am, nwoc...)
	am = append(am, nwcc...)
	am = append(am, poc...)
	am = append(am, pcc...)
	am = append(am, moc...)
	am = append(am, mcc...)
	sort.Sort(ssInt(am))

	tokens := make(map[string]*Token, len(am))
	if len(am) == 0 {
		return mw, tokens
	}

	// Walk the sorted matches pairing each opening tag with the next
	// non-overlapping closing tag of the same kind.
	ctype := -1    // type id of the currently open region (-1 = none)
	out := ""      // rewritten output accumulated so far
	lastclose := 0 // end offset of the last closed region
	openidx := 0   // index in am of the currently open tag
	count := 0     // placeholder counter
	for i := range am {
		if (ctype != -1) && (am[i][4] == ctype+1) && (am[openidx][1] <= am[i][0]) {
			// closing an open one
			special := fmt.Sprintf("\x07%07d", count)

			tokens[special] = &Token{
				TText: mw[am[openidx][1]:am[i][0]],
				TType: strings.ToLower(mw[am[openidx][2]:am[openidx][3]]),
				// Attribute text: between the captured name and the '>'.
				TAttr: mw[am[openidx][3] : am[openidx][1]-1],
			}
			out += special
			ctype = -1
			lastclose = am[i][1]
			count++
		} else if (ctype == -1) && (am[i][4]&1 == 0) && (lastclose <= am[i][0]) {
			// open a new one
			out += mw[lastclose:am[i][0]]
			ctype = am[i][4]
			openidx = i
		}
	}
	if ctype != -1 {
		//it's open: close it
		special := fmt.Sprintf("\x07%07d", count)

		tokens[special] = &Token{
			TText: mw[am[openidx][1]:len(mw)],
			TType: strings.ToLower(mw[am[openidx][2]:am[openidx][3]]),
			TAttr: mw[am[openidx][3] : am[openidx][1]-1],
		}
		out += special
		ctype = -1
		count++
	} else {
		// No region open: copy the tail of the input verbatim.
		out += mw[lastclose:]
	}
	return out, tokens
}
|
|
|
|
var multiLineLinksRe = regexp.MustCompile(`(?sm)\[\[[^\n|]*\|.*?\]\]`)
|
|
|
|
/* TODO: add preprocessing as in Parser.php:pstPass2() to enable pipe tricks
|
|
*/
|
|
func (a *Article) preprocessLinks(s string) string {
|
|
mw := []byte(s)
|
|
mll := multiLineLinksRe.FindAllSubmatchIndex(mw, -1)
|
|
for _, pair := range mll {
|
|
for i := pair[0]; i < pair[1]; {
|
|
// we have to walk this string carefully, by rune, not by i
|
|
rv, rlen := utf8.DecodeRune(mw[i:])
|
|
if rv == '\n' {
|
|
mw[i] = ' '
|
|
}
|
|
i += rlen
|
|
}
|
|
}
|
|
return string(mw)
|
|
}
|