Adding library code.

This commit is contained in:
Michele M. Franceschini 2018-01-03 16:21:59 -05:00
parent d5cca56718
commit 93aa7513fb
11 changed files with 2921 additions and 1 deletions

202
LICENSE Normal file
View file

@ -0,0 +1,202 @@
Apache License
Version 2.0, January 2004
http://www.apache.org/licenses/
TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION
1. Definitions.
"License" shall mean the terms and conditions for use, reproduction,
and distribution as defined by Sections 1 through 9 of this document.
"Licensor" shall mean the copyright owner or entity authorized by
the copyright owner that is granting the License.
"Legal Entity" shall mean the union of the acting entity and all
other entities that control, are controlled by, or are under common
control with that entity. For the purposes of this definition,
"control" means (i) the power, direct or indirect, to cause the
direction or management of such entity, whether by contract or
otherwise, or (ii) ownership of fifty percent (50%) or more of the
outstanding shares, or (iii) beneficial ownership of such entity.
"You" (or "Your") shall mean an individual or Legal Entity
exercising permissions granted by this License.
"Source" form shall mean the preferred form for making modifications,
including but not limited to software source code, documentation
source, and configuration files.
"Object" form shall mean any form resulting from mechanical
transformation or translation of a Source form, including but
not limited to compiled object code, generated documentation,
and conversions to other media types.
"Work" shall mean the work of authorship, whether in Source or
Object form, made available under the License, as indicated by a
copyright notice that is included in or attached to the work
(an example is provided in the Appendix below).
"Derivative Works" shall mean any work, whether in Source or Object
form, that is based on (or derived from) the Work and for which the
editorial revisions, annotations, elaborations, or other modifications
represent, as a whole, an original work of authorship. For the purposes
of this License, Derivative Works shall not include works that remain
separable from, or merely link (or bind by name) to the interfaces of,
the Work and Derivative Works thereof.
"Contribution" shall mean any work of authorship, including
the original version of the Work and any modifications or additions
to that Work or Derivative Works thereof, that is intentionally
submitted to Licensor for inclusion in the Work by the copyright owner
or by an individual or Legal Entity authorized to submit on behalf of
the copyright owner. For the purposes of this definition, "submitted"
means any form of electronic, verbal, or written communication sent
to the Licensor or its representatives, including but not limited to
communication on electronic mailing lists, source code control systems,
and issue tracking systems that are managed by, or on behalf of, the
Licensor for the purpose of discussing and improving the Work, but
excluding communication that is conspicuously marked or otherwise
designated in writing by the copyright owner as "Not a Contribution."
"Contributor" shall mean Licensor and any individual or Legal Entity
on behalf of whom a Contribution has been received by Licensor and
subsequently incorporated within the Work.
2. Grant of Copyright License. Subject to the terms and conditions of
this License, each Contributor hereby grants to You a perpetual,
worldwide, non-exclusive, no-charge, royalty-free, irrevocable
copyright license to reproduce, prepare Derivative Works of,
publicly display, publicly perform, sublicense, and distribute the
Work and such Derivative Works in Source or Object form.
3. Grant of Patent License. Subject to the terms and conditions of
this License, each Contributor hereby grants to You a perpetual,
worldwide, non-exclusive, no-charge, royalty-free, irrevocable
(except as stated in this section) patent license to make, have made,
use, offer to sell, sell, import, and otherwise transfer the Work,
where such license applies only to those patent claims licensable
by such Contributor that are necessarily infringed by their
Contribution(s) alone or by combination of their Contribution(s)
with the Work to which such Contribution(s) was submitted. If You
institute patent litigation against any entity (including a
cross-claim or counterclaim in a lawsuit) alleging that the Work
or a Contribution incorporated within the Work constitutes direct
or contributory patent infringement, then any patent licenses
granted to You under this License for that Work shall terminate
as of the date such litigation is filed.
4. Redistribution. You may reproduce and distribute copies of the
Work or Derivative Works thereof in any medium, with or without
modifications, and in Source or Object form, provided that You
meet the following conditions:
(a) You must give any other recipients of the Work or
Derivative Works a copy of this License; and
(b) You must cause any modified files to carry prominent notices
stating that You changed the files; and
(c) You must retain, in the Source form of any Derivative Works
that You distribute, all copyright, patent, trademark, and
attribution notices from the Source form of the Work,
excluding those notices that do not pertain to any part of
the Derivative Works; and
(d) If the Work includes a "NOTICE" text file as part of its
distribution, then any Derivative Works that You distribute must
include a readable copy of the attribution notices contained
within such NOTICE file, excluding those notices that do not
pertain to any part of the Derivative Works, in at least one
of the following places: within a NOTICE text file distributed
as part of the Derivative Works; within the Source form or
documentation, if provided along with the Derivative Works; or,
within a display generated by the Derivative Works, if and
wherever such third-party notices normally appear. The contents
of the NOTICE file are for informational purposes only and
do not modify the License. You may add Your own attribution
notices within Derivative Works that You distribute, alongside
or as an addendum to the NOTICE text from the Work, provided
that such additional attribution notices cannot be construed
as modifying the License.
You may add Your own copyright statement to Your modifications and
may provide additional or different license terms and conditions
for use, reproduction, or distribution of Your modifications, or
for any such Derivative Works as a whole, provided Your use,
reproduction, and distribution of the Work otherwise complies with
the conditions stated in this License.
5. Submission of Contributions. Unless You explicitly state otherwise,
any Contribution intentionally submitted for inclusion in the Work
by You to the Licensor shall be under the terms and conditions of
this License, without any additional terms or conditions.
Notwithstanding the above, nothing herein shall supersede or modify
the terms of any separate license agreement you may have executed
with Licensor regarding such Contributions.
6. Trademarks. This License does not grant permission to use the trade
names, trademarks, service marks, or product names of the Licensor,
except as required for reasonable and customary use in describing the
origin of the Work and reproducing the content of the NOTICE file.
7. Disclaimer of Warranty. Unless required by applicable law or
agreed to in writing, Licensor provides the Work (and each
Contributor provides its Contributions) on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
implied, including, without limitation, any warranties or conditions
of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A
PARTICULAR PURPOSE. You are solely responsible for determining the
appropriateness of using or redistributing the Work and assume any
risks associated with Your exercise of permissions under this License.
8. Limitation of Liability. In no event and under no legal theory,
whether in tort (including negligence), contract, or otherwise,
unless required by applicable law (such as deliberate and grossly
negligent acts) or agreed to in writing, shall any Contributor be
liable to You for damages, including any direct, indirect, special,
incidental, or consequential damages of any character arising as a
result of this License or out of the use or inability to use the
Work (including but not limited to damages for loss of goodwill,
work stoppage, computer failure or malfunction, or any and all
other commercial damages or losses), even if such Contributor
has been advised of the possibility of such damages.
9. Accepting Warranty or Additional Liability. While redistributing
the Work or Derivative Works thereof, You may choose to offer,
and charge a fee for, acceptance of support, warranty, indemnity,
or other liability obligations and/or rights consistent with this
License. However, in accepting such obligations, You may act only
on Your own behalf and on Your sole responsibility, not on behalf
of any other Contributor, and only if You agree to indemnify,
defend, and hold each Contributor harmless for any liability
incurred by, or claims asserted against, such Contributor by reason
of your accepting any such warranty or additional liability.
END OF TERMS AND CONDITIONS
APPENDIX: How to apply the Apache License to your work.
To apply the Apache License to your work, attach the following
boilerplate notice, with the fields enclosed by brackets "[]"
replaced with your own identifying information. (Don't include
the brackets!) The text should be enclosed in the appropriate
comment syntax for the file format. We also recommend that a
file or class name and description of purpose be included on the
same "printed page" as the copyright notice for easier
identification within third-party archives.
Copyright [yyyy] [name of copyright owner]
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.

View file

@ -1,3 +1,3 @@
# gowiki
Gowiki is a golang library to parse mediawiki markup as found in Wikipedia pages
Gowiki is a golang library to parse mediawiki markup as found in Wikipedia pages.

233
gowiki.go Normal file
View file

@ -0,0 +1,233 @@
/*
Copyright (C) IBM Corporation 2015, Michele Franceschini <franceschini@us.ibm.com>
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
*/
package gowiki
import (
	"bytes"
	// "errors"
	// "fmt"
	"html"
	"regexp"
	"strings"
	"unicode"
	"unicode/utf8"
)
// var Debug bool = false
var DebugLevel int = 0
type Article struct {
MediaWiki string
Title string
Links []WikiLink
ExtLinks []string
Type string
AbstractText string
Media []WikiLink
Tokens []*Token
// OldTokens []*Token
Root *ParseNode
Parsed bool
Text string
TextLinks []FullWikiLink
Templates []*Template
// unexported fields
gt bool
text *bytes.Buffer
nchar int
innerParseErrorCount int
}
type WikiLink struct {
Namespace string
PageName string
Anchor string
}
type FullWikiLink struct {
Link WikiLink
Text string
Start int // rune offset of beginning
End int // rune offset of end (index of the char after the last)
}
type PageGetter interface {
Get(page WikiLink) (string, error)
}
func NewArticle(title, text string) (*Article, error) {
a := new(Article)
a.Title = title
a.MediaWiki = text
a.Links = make([]WikiLink, 0, 16)
a.Media = make([]WikiLink, 0, 16)
a.TextLinks = make([]FullWikiLink, 0, 16)
a.ExtLinks = make([]string, 0, 16)
return a, nil
}
func (a *Article) GetText() string {
if !a.gt {
a.genText()
}
return a.Text
}
func (a *Article) GetAbstract() string {
if !a.gt {
a.genText()
}
return a.AbstractText
}
func (a *Article) GetLinks() []WikiLink {
return a.Links
}
func (a *Article) GetExternalLinks() []string {
return a.ExtLinks
}
func (a *Article) GetMedia() []WikiLink {
return a.Media
}
func (a *Article) GetTextLinks() []FullWikiLink {
if !a.gt {
a.genText()
}
return a.TextLinks
}
var canoReSpaces = regexp.MustCompile(`[ _]+`)
func WikiCanonicalFormEsc(l string, unescape bool) WikiLink {
return StandardNamespaces.WikiCanonicalFormNamespaceEsc(l, "", unescape)
}
func WikiCanonicalForm(l string) WikiLink {
return StandardNamespaces.WikiCanonicalFormNamespaceEsc(l, "", true)
}
func WikiCanonicalFormNamespace(l string, defaultNamespace string) WikiLink {
return StandardNamespaces.WikiCanonicalFormNamespaceEsc(l, defaultNamespace, true)
}
func (namespaces Namespaces) WikiCanonicalFormNamespaceEsc(l string, defaultNamespace string, unescape bool) WikiLink {
hpos := strings.IndexRune(l, '#')
anchor := ""
if hpos >= 0 {
anchor = l[hpos+1:]
l = l[0:hpos]
}
i := strings.Index(l, ":")
namespace := defaultNamespace
if i >= 0 {
cns := strings.TrimSpace(canoReSpaces.ReplaceAllString(l[:i], " "))
if unescape {
cns = html.UnescapeString(cns)
}
ns, ok := namespaces[strings.ToLower(cns)]
switch {
case ok && len(cns) > 0:
namespace = ns //strings.ToUpper(cns[0:1]) + strings.ToLower(cns[1:])
case ok:
namespace = ""
default:
i = -1
}
}
article := strings.TrimSpace(canoReSpaces.ReplaceAllString(l[i+1:], " "))
anchor = canoReSpaces.ReplaceAllString(anchor, " ")
if unescape {
article = html.UnescapeString(article)
anchor = html.UnescapeString(anchor)
}
if len(article) > 0 {
article = strings.ToUpper(article[0:1]) + article[1:]
}
return WikiLink{Namespace: namespace, PageName: article, Anchor: anchor}
}
func (wl *WikiLink) FullPagename() string {
if len(wl.Namespace) == 0 {
return wl.PageName
}
return wl.Namespace + ":" + wl.PageName
}
func (wl *WikiLink) FullPagenameAnchor() string {
ns := ""
if len(wl.Namespace) != 0 {
ns = wl.Namespace + ":"
}
an := ""
if len(wl.Anchor) != 0 {
an = "#" + wl.Anchor
}
return ns + wl.PageName + an
}
func (wl *WikiLink) IsImplicitSelfLink() bool {
return len(wl.PageName) == 0
}
func (wl *WikiLink) HasAnchor() bool {
return len(wl.Anchor) != 0
}
func (wl *WikiLink) GetAnchor() string {
return wl.Anchor
}
type Namespaces map[string]string
var StandardNamespaces Namespaces = map[string]string{
"media": "Media",
"special": "Special",
"talk": "Talk",
"user": "User",
"user talk": "User talk",
"wikipedia": "Wikipedia",
"wikipedia talk": "Wikipedia talk",
"file": "File",
"file talk": "File talk",
"mediawiki": "MediaWiki",
"mediawiki talk": "MediaWiki talk",
"template": "Template",
"template talk": "Template talk",
"help": "Help",
"help talk": "Help talk",
"category": "Category",
"category talk": "Category talk",
"portal": "Portal",
"portal talk": "Portal talk",
"book": "Book",
"book talk": "Book talk",
"draft": "Draft",
"draft talk": "Draft talk",
"education program": "Education Program",
"education program talk": "Education Program talk",
"timedtext": "TimedText",
"timedtext talk": "TimedText talk",
"module": "Module",
"module talk": "Module talk",
"topic": "Topic",
}
type DummyPageGetter struct{}
func (g *DummyPageGetter) Get(wl WikiLink) (string, error) {
return "", nil
}

46
gowiki_test.go Normal file
View file

@ -0,0 +1,46 @@
/*
Copyright (C) IBM Corporation 2015, Michele Franceschini <franceschini@us.ibm.com>
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
*/
package gowiki
import (
"encoding/json"
// "os"
// "strings"
"testing"
)
func TestParseArticle(t *testing.T) {
mw := "* ''[[The Album (ABBA album)|''The Album'']]'' (1977)"
t.Log(mw)
a, err := ParseArticle("Test", mw, &DummyPageGetter{})
if err != nil {
t.Error("Error:", err)
}
b, err := json.MarshalIndent(a.Tokens, "", "\t")
if err != nil {
t.Error("Error:", err)
}
t.Log("Tokens\n")
t.Log(string(b))
}
func TestWikiCanonicalFormNamespaceEsc(t *testing.T) {
wl := StandardNamespaces.WikiCanonicalFormNamespaceEsc("WiKIpEdia:pagename#section", "", true)
if wl.Namespace != "Wikipedia" || wl.PageName != "Pagename" || wl.Anchor != "section" {
t.Error("Error: wikilink not parsed correctly", wl)
}
}

636
parse.go Normal file
View file

@ -0,0 +1,636 @@
/*
Copyright (C) IBM Corporation 2015, Michele Franceschini <franceschini@us.ibm.com>
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
*/
package gowiki
import (
"errors"
"fmt"
"html"
"log"
"strconv"
"strings"
)
const maxInnerParseErrorCount = 100
type ParseNode struct {
NType string
NSubType string
Link WikiLink
Contents string
Flags int
Nodes []*ParseNode
}
func (a *Article) PrintParseTree() {
a.printParseTree(a.Root, 0)
}
func (a *Article) printParseTree(root *ParseNode, depth int) {
if depth > 20 {
return
}
spaces := "......................................"
min := len(spaces)
if depth < len(spaces) {
min = depth
}
if depth < 0 {
min = 0
}
prefix := spaces[0:min]
for _, n := range root.Nodes {
fmt.Printf("%s NType: %10s NSubType: %10s Contents: %16s Flags: %d\n", prefix, n.NType, n.NSubType, n.Contents, n.Flags)
if len(n.Nodes) > 0 {
a.printParseTree(n, depth+1)
}
}
}
const (
TClosed int = 1 << iota
)
const (
QS_none int = iota
QS_i
QS_b
QS_ib
QS_bi
)
func ParseArticle(title, text string, g PageGetter) (*Article, error) {
a, err := NewArticle(title, text)
if err != nil {
return nil, err
}
a.Tokens, err = a.Tokenize(a.MediaWiki, g)
if err != nil {
return a, err
}
err = a.parse()
if err != nil {
return a, err
}
a.gt = false
return a, nil
}
func (a *Article) doQuotes() {
log.SetFlags(log.Lshortfile) // | log.Ldate | log.Ltime)
state := QS_none
save := QS_none
l := 0
ni := 0
tn := make([]*Token, 0, len(a.Tokens))
t := a.Tokens
for ; ni < len(t); ni++ {
// log.Println(*t[ni])
if t[ni].TType == "quote" {
l++
// log.Println(l)
}
if t[ni].TType != "quote" || ni == len(t)-1 {
switch {
case l == 0:
// log.Println(l)
case l == 1:
// log.Println(l)
tn = append(tn, &Token{TText: "'", TType: "text"})
case l == 2:
// log.Println(l)
switch state {
case QS_b:
tn = append(tn, &Token{TType: "html", TText: "i"})
state = QS_bi
case QS_i:
tn = append(tn, &Token{TType: "html", TText: "/i"})
state = QS_none
case QS_bi:
tn = append(tn, &Token{TType: "html", TText: "/i"})
state = QS_b
case QS_ib:
tn = append(tn, &Token{TType: "html", TText: "/b"})
tn = append(tn, &Token{TType: "html", TText: "/i"})
tn = append(tn, &Token{TType: "html", TText: "b"})
state = QS_b
case QS_none:
tn = append(tn, &Token{TType: "html", TText: "i"})
state = QS_i
}
case l == 3, l == 4:
// log.Println(l)
if l == 4 {
tn = append(tn, &Token{TText: "'", TType: "text"})
}
switch state {
case QS_b:
tn = append(tn, &Token{TType: "html", TText: "/b"})
state = QS_none
case QS_i:
tn = append(tn, &Token{TType: "html", TText: "b"})
state = QS_ib
case QS_ib:
tn = append(tn, &Token{TType: "html", TText: "/b"})
state = QS_i
case QS_bi:
tn = append(tn, &Token{TType: "html", TText: "/i"})
tn = append(tn, &Token{TType: "html", TText: "/b"})
tn = append(tn, &Token{TType: "html", TText: "i"})
state = QS_i
case QS_none:
tn = append(tn, &Token{TType: "html", TText: "b"})
state = QS_b
}
case l >= 5:
// log.Println(l)
s := ""
for i := 5; i < l; i++ {
s += "'"
}
if len(s) > 0 {
tn = append(tn, &Token{TText: s, TType: "text"})
}
switch state {
case QS_b:
tn = append(tn, &Token{TType: "html", TText: "/b"})
tn = append(tn, &Token{TType: "html", TText: "i"})
state = QS_i
case QS_i:
tn = append(tn, &Token{TType: "html", TText: "/i"})
tn = append(tn, &Token{TType: "html", TText: "b"})
state = QS_b
case QS_ib:
tn = append(tn, &Token{TType: "html", TText: "/b"})
tn = append(tn, &Token{TType: "html", TText: "/i"})
state = QS_none
case QS_bi:
tn = append(tn, &Token{TType: "html", TText: "/i"})
tn = append(tn, &Token{TType: "html", TText: "/b"})
state = QS_none
case QS_none:
tn = append(tn, &Token{TType: "html", TText: "b"})
tn = append(tn, &Token{TType: "html", TText: "i"})
state = QS_bi
}
}
l = 0
}
if t[ni].TType == "link" || t[ni].TType == "extlink" || t[ni].TType == "filelink" {
// log.Println(l)
save = state
switch state {
case QS_b:
tn = append(tn, &Token{TType: "html", TText: "/b"})
case QS_i:
tn = append(tn, &Token{TType: "html", TText: "/i"})
case QS_ib:
tn = append(tn, &Token{TType: "html", TText: "/b"})
tn = append(tn, &Token{TType: "html", TText: "/i"})
case QS_bi:
tn = append(tn, &Token{TType: "html", TText: "/i"})
tn = append(tn, &Token{TType: "html", TText: "/b"})
}
state = QS_none
l = 0
}
if t[ni].TType == "closelink" || t[ni].TType == "closeextlink" || t[ni].TType == "closefilelink" {
// log.Println(l)
switch state {
case QS_b:
tn = append(tn, &Token{TType: "html", TText: "/b"})
case QS_i:
tn = append(tn, &Token{TType: "html", TText: "/i"})
case QS_ib:
tn = append(tn, &Token{TType: "html", TText: "/b"})
tn = append(tn, &Token{TType: "html", TText: "/i"})
case QS_bi:
tn = append(tn, &Token{TType: "html", TText: "/i"})
tn = append(tn, &Token{TType: "html", TText: "/b"})
}
state = save
save = QS_none
l = 0
}
if t[ni].TType != "quote" && t[ni].TType != "newline" {
// log.Println(l)
tn = append(tn, t[ni])
}
if t[ni].TType == "newline" || ni == len(t)-1 {
// log.Println(l)
switch state {
case QS_b:
tn = append(tn, &Token{TType: "html", TText: "/b"})
case QS_i:
tn = append(tn, &Token{TType: "html", TText: "/i"})
case QS_ib:
tn = append(tn, &Token{TType: "html", TText: "/b"})
tn = append(tn, &Token{TType: "html", TText: "/i"})
case QS_bi:
tn = append(tn, &Token{TType: "html", TText: "/i"})
tn = append(tn, &Token{TType: "html", TText: "/b"})
}
state = QS_none
l = 0
save = QS_none
}
if t[ni].TType == "newline" {
// log.Println(l)
tn = append(tn, t[ni])
}
}
a.Tokens = tn
// a.OldTokens = t
}
//nowiki, wikipre, pre, math, quote, colon, magic, h?, *, #, ;, :, html,
func (a *Article) parse() error {
a.doQuotes()
nodes, err := a.internalParse(a.Tokens)
if err != nil {
return err
}
root := &ParseNode{NType: "root", Nodes: nodes}
a.Root = root
a.Parsed = true
return nil
}
func isImage(t *Token) bool {
return strings.ToLower(t.TLink.Namespace) == "file"
}
func (a *Article) internalParse(t []*Token) ([]*ParseNode, error) {
ti := 0
nl := make([]*ParseNode, 0, 0)
lastti := -1
for ti < len(t) {
if ti == lastti {
// fmt.Println(len(t), ti, *t[ti], *t[ti-1], *t[ti+1])
return nil, errors.New("parsing issue")
}
lastti = ti
switch t[ti].TType {
case "nowiki":
n := &ParseNode{NType: "text", NSubType: "nowiki", Contents: html.UnescapeString(t[ti].TText)}
nl = append(nl, n)
ti++
/* case "curlyblock":
n := &ParseNode{NType: "curly", Contents: t[ti].TText}
nl = append(nl, n)
ti++ */
case "text":
n := &ParseNode{NType: "text", Contents: html.UnescapeString(t[ti].TText)}
nl = append(nl, n)
ti++
case "math":
n := &ParseNode{NType: "math", Contents: t[ti].TText}
nl = append(nl, n)
ti++
case "pre":
n2 := &ParseNode{NType: "text", NSubType: "pre", Contents: html.UnescapeString(t[ti].TText)}
n1 := &ParseNode{NType: "html", NSubType: "pre", Contents: t[ti].TAttr, Nodes: []*ParseNode{n2}}
nl = append(nl, n1)
ti++
case "nop":
ti++
case "wikipre":
closebefore := len(t)
ni := ti + 1
for ; ni < len(t)-1; ni++ {
if t[ni].TType == "newline" {
if t[ni+1].TType == "wikipre" {
t[ni+1].TType = "nop"
} else {
closebefore = ni
break
}
}
}
if closebefore <= ni+1 {
n := &ParseNode{NType: "html", NSubType: "pre"}
nl = append(nl, n)
ti++
} else {
nodes, err := a.internalParse(t[ti+1 : closebefore])
if err != nil {
return nil, err
}
n := &ParseNode{NType: "html", NSubType: "pre", Nodes: nodes}
nl = append(nl, n)
ti = closebefore
}
case "extlink":
ni := ti + 1
for ; ni < len(t); ni++ {
if t[ni].TType == "closeextlink" {
break
}
}
if ni == len(t) {
return nil, errors.New("Unmatched external link token for link: " + t[ti].TText)
}
n := &ParseNode{NType: "extlink", NSubType: "", Contents: t[ti].TText}
a.ExtLinks = append(a.ExtLinks, t[ti].TText)
if ni > ti+1 {
nodes, err := a.internalParse(t[ti+1 : ni])
if err != nil {
return nil, err
}
n.Nodes = nodes
}
nl = append(nl, n)
ti = ni + 1
case "closeextlink":
return nil, errors.New("Unmatched close external link token")
case "hrule":
n := &ParseNode{NType: "html", NSubType: "hr"}
nl = append(nl, n)
ti++
case "magic":
n := &ParseNode{NType: "magic", Contents: t[ti].TText}
nl = append(nl, n)
ti++
case "colon":
n := &ParseNode{NType: "text", Contents: ":"}
nl = append(nl, n)
ti++
case "space":
n := &ParseNode{NType: "space", Contents: " "}
nl = append(nl, n)
ti++
case "blank":
n := &ParseNode{NType: "break"}
nl = append(nl, n)
ti++
case "redirect":
ni := ti + 1
for ; ni < len(t); ni++ {
if t[ni].TType == "newline" {
break
}
if t[ni].TType == "link" {
break
}
}
if ni == len(t) || t[ni].TType == "newline" {
n := &ParseNode{NType: "text", Contents: html.UnescapeString(t[ti].TText)}
nl = append(nl, n)
ti++
} else {
n := &ParseNode{NType: "redirect", Link: t[ni].TLink, NSubType: t[ni].TAttr}
nl = append(nl, n)
ti++
}
case "link":
ni := ti + 1
nopen := 1
for ; ni < len(t); ni++ {
switch t[ni].TType {
case "link":
nopen++
case "closelink":
nopen--
}
if nopen == 0 {
break
}
}
if ni == len(t) {
return nil, errors.New("Unmatched link token for link: " + t[ti].TLink.PageName + " namespace: " + t[ti].TLink.Namespace)
}
var n *ParseNode
n = &ParseNode{NType: "link", Link: t[ti].TLink}
a.Links = append(a.Links, t[ti].TLink)
if ni > ti+1 {
nodes, err := a.internalParse(t[ti+1 : ni])
if err != nil {
return nil, err
}
n.Nodes = nodes
}
nl = append(nl, n)
ti = ni + 1
case "filelink":
ni := ti + 1
nopen := 1
for ; ni < len(t); ni++ {
switch t[ni].TType {
case "filelink":
nopen++
case "closefilelink":
nopen--
}
if nopen == 0 {
break
}
}
if ni == len(t) {
return nil, errors.New("Unmatched filelink token for filelink: " + t[ti].TLink.PageName + " namespace: " + t[ti].TLink.Namespace)
}
var n *ParseNode
n = &ParseNode{NType: "image", Link: t[ti].TLink}
a.Media = append(a.Media, t[ti].TLink)
if ni > ti+1 {
nodes, err := a.internalParse(t[ti+1 : ni])
if err != nil {
return nil, err
}
n.Nodes = nodes
}
nl = append(nl, n)
ti = ni + 1
case "closelink":
return nil, errors.New("Unmatched close link token")
case "closefilelink":
return nil, errors.New("Unmatched close file link token")
case "html":
tag := strings.ToLower(t[ti].TText)
if tag[0] == '/' {
ti++
continue
}
n := &ParseNode{NType: "html", NSubType: tag, Contents: t[ti].TAttr}
if t[ti].TClosed == true {
flags := TClosed
n.Flags = flags
nl = append(nl, n)
ti++
continue
}
ni := ti + 1
nopen := 1
for ; ni < len(t); ni++ {
if t[ni].TType == "html" {
ntag := strings.ToLower(t[ni].TText)
switch ntag {
case tag:
nopen++
case "/" + tag:
nopen--
}
if nopen == 0 {
break
}
}
}
if ni > ti+1 {
nodes, err := a.internalParse(t[ti+1 : ni])
if err != nil {
a.innerParseErrorCount++
if a.innerParseErrorCount >= maxInnerParseErrorCount {
return nil, err
}
ti++
continue
}
n.Nodes = nodes
}
nl = append(nl, n)
ti = ni + 1
if ti > len(t) {
ti = len(t)
}
case "*", "#", ";", ":":
ti += 1
/* stack := ""
si := 0
ni := ti
ln := &ParseNode{NType: "root", Nodes: make([]*ParseNode, 0, 4)}
for {
this := ""
islist := false
for ; ni < len(t); ni++ {
switch t[ni].TType {
case "*", "#", ";", ":":
islist = true
}
if islist {
this += t[ni].TType
} else {
break
}
}
same := 0
for i := 0; i < len(this) && i < len(stack); i++ {
if this[i] == stack[i] ||
(this[i] == ';' && stack[i] == ':') ||
(this[i] == ':' && stack[i] == ';') {
same++
} else {
break
}
}
n := ln
for i := 0; i < same; i++ {
n = n.Nodes[len(n.Nodes)-1]
n = n.Nodes[len(n.Nodes)-1]
}
for i := same; i < len(this); i++ { //open
var nn *ParseNode
switch this[i] {
case '*':
nn = &ParseNode{NType: "html", NSubType: "ul"}
case '#':
nn = &ParseNode{NType: "html", NSubType: "ol"}
case ';':
nn = &ParseNode{NType: "html", NSubType: "dl"}
case ':':
nn = &ParseNode{NType: "html", NSubType: "dl"}
}
nn.Nodes = make([]*ParseNode, 0, 1)
n.Nodes = append(n.Nodes, nn)
n = nn
if i < len(this)-1 {
var elem *ParseNode
switch this[len] {
case '*', '#':
elem = &ParseNode{NType: "html", NSubType: "li"}
case ';':
elem = &ParseNode{NType: "html", NSubType: "dt"}
case ':':
elem = &ParseNode{NType: "html", NSubType: "dd"}
}
elem.Nodes = make([]*ParseNode, 0, 1)
n.Nodes = append(n.Nodes, elem)
n = elem
}
}
var nitem *ParseNode
switch this[len] {
case '*', '#':
nitem = &ParseNode{NType: "html", NSubType: "li"}
case ';':
nitem = &ParseNode{NType: "html", NSubType: "dt"}
case ':':
nitem = &ParseNode{NType: "html", NSubType: "dd"}
}
n := &ParseNode{NType: "html", NSubType: st}
nl = append(nl, n)
} */
case "newline":
n := &ParseNode{NType: "text", Contents: "\n"}
nl = append(nl, n)
ti++
case "h1", "h2", "h3", "h4", "h5", "h6":
ni := ti + 1
for ; ni < len(t); ni++ {
if t[ni].TType == "newline" {
break
}
}
if ni == len(t) {
return nil, errors.New("No newline after heading")
}
n := &ParseNode{NType: "html", NSubType: t[ti].TType}
if ni > ti+1 {
nodes, err := a.internalParse(t[ti+1 : ni])
if err != nil {
return nil, err
}
n.Nodes = nodes
}
nl = append(nl, n)
ti = ni + 1
case "tb", "te":
templateIndex, err := strconv.Atoi(t[ti].TText)
if err != nil {
return nil, errors.New("Malformed tb token")
}
if templateIndex >= len(a.Templates) {
return nil, errors.New("Template index out of range")
//fmt.Println("Template index out of range", t[ti])
} else {
n := &ParseNode{NType: t[ti].TType, Contents: a.Templates[templateIndex].Name}
nl = append(nl, n)
}
ti++
default:
return nil, errors.New("Unrecognized token type: " + t[ti].TType)
}
}
return nl, nil
}

39
redirect.go Normal file
View file

@ -0,0 +1,39 @@
/*
Copyright (C) IBM Corporation 2015, Michele Franceschini <franceschini@us.ibm.com>
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
*/
package gowiki
import "strings"
func (a *Article) checkRedirect(mw string) (bool, *WikiLink) {
if len(mw) < 9 || strings.ToLower(mw[0:9]) != "#redirect" {
return false, nil
}
idx := strings.Index(mw, "\n")
if idx < 0 {
idx = len(mw)
}
nnt, err := a.parseInlineText(mw, 9, idx)
if err != nil {
return false, nil
}
for _, t := range nnt {
if t.TType == "link" {
return true, &t.TLink
}
}
return false, nil
}

27
simple.go Normal file
View file

@ -0,0 +1,27 @@
/*
Copyright (C) IBM Corporation 2015, Michele Franceschini <franceschini@us.ibm.com>
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
*/
package gowiki
// "bytes"
// "errors"
// "fmt"
// ParseSimple marks the article as parsed, copying the raw MediaWiki
// markup into Text without any processing. It always returns nil.
func (a *Article) ParseSimple() error {
	a.Text, a.Parsed = a.MediaWiki, true
	return nil
}

660
template.go Normal file
View file

@ -0,0 +1,660 @@
/*
Copyright (C) IBM Corporation 2015, Michele Franceschini <franceschini@us.ibm.com>
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
*/
package gowiki
import (
"fmt"
"os"
"regexp"
"sort"
"strings"
)
// Template records one template invocation found in the article's source.
type Template struct {
	Typ        string            `json:"type"` //magic,normal,ext,param
	Name       string            `json:"name"`
	Attr       string            `json:"attr"` //text after the ':' in magic templates
	Parameters map[string]string `json:"parameters"`
}
// parseTemplateEtc is a stub; template handling is actually performed by
// processTemplates/findTemplates. It always returns nil.
func (a *Article) parseTemplateEtc(l string) []Template {
	return nil
}
// streak describes one run of consecutive '{' or '}' characters found by
// findCurlyStreaks; the matcher in findTemplates consumes braces from it.
type streak struct {
	opening bool // true for a '{' run, false for a '}' run
	length  int  // number of braces still unconsumed in the run
	b       int  // start byte offset in the source
	e       int  // end byte offset (exclusive)
}
// template is the parse-time representation of one {{...}} or {{{...}}}
// span, with any templates nested directly inside it as children.
type template struct {
	b        int         // byte offset of the opening braces
	e        int         // byte offset just past the closing braces
	isparam  bool        // true for a triple-brace {{{param}}} substitution
	children []*template // directly nested templates
	rt       string      // rendered replacement text, set by renderInnerTemplates
	rendered bool        // whether rt has been computed
}
// byStart implements sort.Interface, ordering templates by start offset.
type byStart []*template

func (a byStart) Len() int           { return len(a) }
func (a byStart) Swap(i, j int)      { a[i], a[j] = a[j], a[i] }
func (a byStart) Less(i, j int) bool { return a[i].b < a[j].b }

// templateStreaksRe is an unused regexp-based alternative to
// findCurlyStreaks, kept for reference (see the commented call in
// findTemplates).
var templateStreaksRe = regexp.MustCompile(`(?:\{\{+)|(?:\}\}+)`)
// findCurlyStreaks scans mw and returns the half-open byte ranges [b, e)
// of every run of two or more consecutive '{' characters and of two or
// more consecutive '}' characters. Single braces and runs of any other
// character are ignored.
func findCurlyStreaks(mw string) [][]int {
	streaks := [][]int{}
	runStart := 0
	var runChar rune = '.' // sentinel: no run in progress
	// record appends the current run when it is a brace run of length >= 2.
	record := func(end int) {
		if end-runStart > 1 && (runChar == '{' || runChar == '}') {
			streaks = append(streaks, []int{runStart, end})
		}
	}
	for i, r := range mw {
		if r == runChar {
			continue
		}
		record(i)
		runStart, runChar = i, r
	}
	// Close out a trailing run of braces.
	record(len(mw))
	return streaks
}
// findTemplates locates every balanced {{...}}/{{{...}}} span in mw and
// returns the top-level ones, with nested templates attached as children.
// Brace runs are paired innermost-first: each closing streak is matched
// against the nearest unconsumed opening streak to its left.
func findTemplates(mw string) []*template {
	// tsl := templateStreaksRe.FindAllStringSubmatchIndex(mw, -1)
	tsl := findCurlyStreaks(mw)
	// fmt.Println(tsl)
	streaks := make([]streak, 0, len(tsl))
	for _, pair := range tsl {
		streaks = append(streaks, streak{
			opening: (mw[pair[0]] == '{'),
			length:  pair[1] - pair[0],
			b:       pair[0],
			e:       pair[1],
		})
	}
	// fmt.Println(streaks)
	tl := make([]*template, 0, 8)
	i := 0
	for i < len(streaks) {
		if !streaks[i].opening && streaks[i].length > 1 { // found a closing set: search for the opening
			found := false
			for j := i - 1; j >= 0; j-- {
				if streaks[j].opening && streaks[j].length > 1 {
					found = true
					// Three or more braces on both sides form a {{{param}}}
					// substitution (width 3); otherwise a template (width 2).
					n := 2
					isparam := false
					if streaks[i].length > 2 && streaks[j].length > 2 {
						n = 3
						isparam = true
					}
					tl = append(tl, &template{
						isparam: isparam,
						b:       streaks[j].e - n,
						e:       streaks[i].b + n,
					})
					// Consume the matched braces from both streaks so the
					// remainder can still pair with outer templates.
					streaks[i].length -= n
					streaks[i].b += n
					streaks[j].length -= n
					streaks[j].e -= n
					break
				}
			}
			if found {
				// The same closing streak may also close an enclosing
				// template; retry it before advancing.
				continue
			}
		}
		i++
	}
	sort.Sort(byStart(tl))
	/* fmt.Println("Templates found:")
	for i := range tl {
		fmt.Println(tl[i])
	} */
	// Partition: top-level templates go to out; every other template is
	// attached to the nearest template that encloses it.
	out := make([]*template, 0, 4)
	cur_end := 0
	for i := range tl {
		tl[i].children = []*template{}
		if tl[i].b >= cur_end {
			cur_end = tl[i].e
			out = append(out, tl[i])
		} else {
			for j := i - 1; j >= 0; j-- {
				if tl[j].e > tl[i].e {
					tl[j].children = append(tl[j].children, tl[i])
					break
				}
			}
		}
	}
	/* fmt.Println("Templates out:")
	for i := range out {
		fmt.Println(out[i])
	}*/
	/* fmt.Println("Templates found:")
	for i := range tl {
		fmt.Println(mw[tl[i].b:tl[i].e])
	}
	*/
	return out
}
// findTemplateParamPos scans the template span t of mw and returns, for
// each top-level parameter, the byte position of its leading '|' and, for
// named parameters, additionally the position of the first '='. Pipes that
// fall inside nested child templates or inside [[...]] links are ignored.
func findTemplateParamPos(mw string, t *template) [][]int { //first is position of pipe, second is position of first equal
	out := make([][]int, 0, 1)
	inChildTemplate := false
	inlink := false
	lastopen := false
	lastclosed := false
	for i, rv := range mw[t.b:t.e] {
		inChildTemplate = false
		open := false
		closed := false
		// Skip characters that fall inside a nested template span.
		for _, ct := range t.children {
			if i+t.b >= ct.b && i+t.b < ct.e {
				inChildTemplate = true
				break
			}
		}
		if !inChildTemplate {
			switch {
			case rv == '[':
				// Two consecutive '[' open a wiki link.
				if lastopen {
					inlink = true
				}
				open = true
			case rv == ']':
				// Two consecutive ']' close it.
				if lastclosed {
					inlink = false
				}
				closed = true
			case rv == '|' && !inlink:
				out = append(out, []int{i + t.b})
			case rv == '=' && len(out) > 0 && len(out[len(out)-1]) == 1 && !inlink:
				// First '=' after a pipe marks that parameter as named.
				out[len(out)-1] = append(out[len(out)-1], i+t.b)
			}
		}
		lastopen = open
		lastclosed = closed
	}
	return out
}
/*func (a *Article) processTemplates(mw string, tokens map[string]*Token) (string, map[string]*Token) {
mlt := findTemplates(mw)
last := 0
out := make([]byte, 0, len(mw))
// tokens := make(map[string]*Token, len(mlt))
for i, t := range mlt {
sb := fmt.Sprintf("\x07tb%05d", i)
se := fmt.Sprintf("\x07te%05d", i)
out = append(out, []byte(mw[last:t.b])...)
out = append(out, []byte(sb+a.renderTemplate(mw, t)+se)...)
last = t.e
tokens[sb] = &Token{
TText: fmt.Sprintf("%d", i),
TType: "tb",
}
tokens[se] = &Token{
TText: fmt.Sprintf("%d", i),
TType: "te",
}
}
out = append(out, []byte(mw[last:])...)
return string(out), tokens
} */
// processTemplates replaces every top-level template in mws with its
// rendered text bracketed by \x07tbNNNNN / \x07teNNNNN marker strings,
// records each template on the article, and registers the markers in the
// token map so the tokenizer can later substitute real tb/te tokens.
func (a *Article) processTemplates(mws string, tokens map[string]*Token, g PageGetter) (string, map[string]*Token) {
	//strip nowiki noinclude etc here
	// mws := a.stripComments(mw)
	// mws = a.stripNoinclude(mws)
	// fmt.Println(mws)
	mlt := findTemplates(mws)
	last := 0
	out := make([]byte, 0, len(mws))
	for i, t := range mlt {
		// fmt.Println("Process templates:", *t)
		sb := fmt.Sprintf("\x07tb%05d", i)
		se := fmt.Sprintf("\x07te%05d", i)
		// Render the template (and its children); t.rt holds the result.
		tn, pm := a.renderInnerTemplates(mws, t, nil, g, 0)
		a.addTemplate(tn, pm)
		out = append(out, []byte(mws[last:t.b])...)
		out = append(out, []byte(sb+t.rt+se)...)
		last = t.e
		tokens[sb] = &Token{
			TText: fmt.Sprintf("%d", i),
			TType: "tb",
		}
		tokens[se] = &Token{
			TText: fmt.Sprintf("%d", i),
			TType: "te",
		}
	}
	out = append(out, []byte(mws[last:])...)
	//unstrip here
	return string(out), tokens
}
// addTemplate records a template invocation named tn with parameter map pm
// on the article, classifying it via detectTemplateType.
func (a *Article) addTemplate(tn string, pm map[string]string) {
	base, attr, typ, _ := detectTemplateType(tn)
	a.Templates = append(a.Templates, &Template{
		Typ:        typ,
		Name:       base,
		Attr:       attr,
		Parameters: pm,
	})
}
// renderTemplate parses the template span t of mw into a name and a
// parameter map (positional parameters are keyed "1", "2", ...), records a
// Template on the article, and returns the replacement text (empty for
// parameter substitutions; also empty for the current magic/normal stubs).
func (a *Article) renderTemplate(mw string, t *template) string {
	pp := findTemplateParamPos(mw, t)
	// n is the brace width: 2 for {{...}}, 3 for {{{...}}}.
	n := 2
	if t.isparam {
		n = 3
	}
	var tn string
	if len(pp) > 0 {
		tn = fmt.Sprint(strings.TrimSpace(mw[t.b+n : pp[0][0]]))
	} else {
		tn = fmt.Sprint(strings.TrimSpace(mw[t.b+n : t.e-n]))
	}
	pm := make(map[string]string, len(pp))
	// Sentinel entry so the loop can slice up to the closing braces.
	pp = append(pp, []int{t.e - n})
	for i := 0; i < len(pp)-1; i++ {
		var name string
		var param string
		if len(pp[i]) > 1 { //named param
			name = fmt.Sprint(strings.TrimSpace(mw[pp[i][0]+1 : pp[i][1]]))
			param = fmt.Sprint(strings.TrimSpace(mw[pp[i][1]+1 : pp[i+1][0]]))
		} else {
			// Positional parameter: 1-based numeric key.
			name = fmt.Sprint(i + 1)
			param = fmt.Sprint(strings.TrimSpace(mw[pp[i][0]+1 : pp[i+1][0]]))
		}
		pm[name] = param
	}
	outT := Template{Parameters: pm}
	base, attr, typ, text := detectTemplateType(tn)
	switch {
	case t.isparam:
		outT.Typ = "param"
		outT.Name = tn
		text = ""
	default:
		outT.Typ = typ
		outT.Name = base
		outT.Attr = attr
	}
	a.Templates = append(a.Templates, &outT)
	return text
}
// detectTemplateType splits a raw template name of the form "base:attr"
// and classifies it. It returns (base, attr, "magic", "") when base is a
// key of MagicMap, and (tn, "", "normal", "") otherwise; the fourth result
// is the replacement text, currently always empty.
func detectTemplateType(tn string) (string, string, string, string) {
	base, attr := tn, ""
	// A colon at position 0 is not treated as a separator.
	if sep := strings.Index(tn, ":"); sep > 0 {
		base = strings.TrimSpace(tn[:sep])
		attr = strings.TrimSpace(tn[sep+1:])
	}
	if _, isMagic := MagicMap[base]; isMagic {
		return base, attr, "magic", ""
	}
	return tn, "", "normal", ""
}
// TemplateRenderer renders a magic template given its name, its source
// text, and its parameter map.
type TemplateRenderer func(name, mw string, params map[string]string) string

// MagicMap lists the magic words recognized by detectTemplateType; the
// renderer values are currently unused (nil).
var MagicMap map[string]TemplateRenderer = map[string]TemplateRenderer{
	"DISPLAYTITLE": nil,
}
// noHashFunctionsMap lists MediaWiki parser functions that are invoked
// without a leading '#'; templateType classifies these names as "magic".
var noHashFunctionsMap map[string]bool = map[string]bool{
	"displaytitle":     true,
	"formatdate":       true,
	"int":              true,
	"namespace":        true,
	"pagesinnamespace": true,
	"speciale":         true,
	"special":          true,
	"tag":              true,
	"anchorencode": true, "basepagenamee": true, "basepagename": true, "canonicalurle": true,
	"canonicalurl": true, "cascadingsources": true, "defaultsort": true, "filepath": true,
	"formatnum": true, "fullpagenamee": true, "fullpagename": true, "fullurle": true,
	"fullurl": true, "gender": true, "grammar": true, "language": true,
	"lcfirst": true, "lc": true, "localurle": true, "localurl": true,
	"namespacee": true, "namespacenumber": true, "nse": true, "ns": true,
	"numberingroup": true, "numberofactiveusers": true, "numberofadmins": true, "numberofarticles": true,
	"numberofedits": true, "numberoffiles": true, "numberofpages": true, "numberofusers": true,
	"numberofviews": true, "padleft": true, "padright": true, "pageid": true,
	"pagenamee": true, "pagename": true, "pagesincategory": true, "pagesize": true,
	"plural": true, "protectionlevel": true, "revisionday2": true, "revisionday": true,
	"revisionid": true, "revisionmonth1": true, "revisionmonth": true, "revisiontimestamp": true,
	"revisionuser": true, "revisionyear": true, "rootpagenamee": true, "rootpagename": true,
	"subjectpagenamee": true, "subjectpagename": true, "subjectspacee": true, "subjectspace": true,
	"subpagenamee": true, "subpagename": true, "talkpagenamee": true, "talkpagename": true,
	"talkspacee": true, "talkspace": true, "ucfirst": true, "uc": true,
	"urlencode": true,
}
// variablesMap lists MediaWiki built-in variables ({{CURRENTDAY}} etc.);
// templateType classifies these names as "magic".
var variablesMap map[string]bool = map[string]bool{
	"articlepath":         true,
	"basepagenamee":       true,
	"basepagename":        true,
	"cascadingsources":    true,
	"contentlanguage":     true,
	"currentday2":         true,
	"currentdayname":      true,
	"currentday":          true,
	"currentdow":          true,
	"currenthour":         true,
	"currentmonth1":       true,
	"currentmonthabbrev":  true,
	"currentmonthnamegen": true,
	"currentmonthname":    true,
	"currentmonth":        true,
	"currenttimestamp":    true,
	"currenttime":         true,
	"currentversion":      true,
	"currentweek":         true,
	"currentyear":         true,
	"directionmark":       true,
	"fullpagenamee":       true,
	"fullpagename":        true,
	"localday2":           true,
	"localdayname":        true,
	"localday":            true,
	"localdow":            true,
	"localhour":           true,
	"localmonth1":         true,
	"localmonthabbrev":    true,
	"localmonthnamegen":   true,
	"localmonthname":      true,
	"localmonth":          true,
	"localtimestamp":      true,
	"localtime":           true,
	"localweek":           true,
	"localyear":           true,
	"namespacee":          true,
	"namespacenumber":     true,
	"namespace":           true,
	"numberofactiveusers": true,
	"numberofadmins":      true,
	"numberofarticles":    true,
	"numberofedits":       true,
	"numberoffiles":       true,
	"numberofpages":       true,
	"numberofusers":       true,
	"numberofviews":       true,
	"pageid":              true,
	"pagenamee":           true,
	"pagename":            true,
	"revisionday2":        true,
	"revisionday":         true,
	"revisionid":          true,
	"revisionmonth1":      true,
	"revisionmonth":       true,
	"revisionsize":        true,
	"revisiontimestamp":   true,
	"revisionuser":        true,
	"revisionyear":        true,
	"rootpagenamee":       true,
	"rootpagename":        true,
	"scriptpath":          true,
	"servername":          true,
	"server":              true,
	"sitename":            true,
	"stylepath":           true,
	"subjectpagenamee":    true,
	"subjectpagename":     true,
	"subjectspacee":       true,
	"subjectspace":        true,
	"subpagenamee":        true,
	"subpagename":         true,
	"talkpagenamee":       true,
	"talkpagename":        true,
	"talkspacee":          true,
	"talkspace":           true,
}
// renderTemplateMagic is a stub: magic words currently render to "".
func (a *Article) renderTemplateMagic(name string, params map[string]string) string {
	return ""
}
// renderTemplateExt is a stub: "#"-prefixed parser-extension templates
// currently render to "".
func (a *Article) renderTemplateExt(name string, params map[string]string) string {
	return ""
}
// renderTemplateRecursive expands the template called name with the given
// (already-expanded) parameters. Magic and ext templates are dispatched to
// their dedicated (stub) renderers; normal templates are fetched through g
// from the Template namespace and transcluded. Expansion stops with ""
// beyond recursion depth 4 or when the template page cannot be retrieved.
func (a *Article) renderTemplateRecursive(name string, params map[string]string, g PageGetter, depth int) string {
	if depth > 4 {
		return ""
	}
	// Name and parameters have already been substituted, so they are
	// guaranteed not to contain any template.
	// Establish the type of template.
	switch templateType(name) {
	case "magic":
		return a.renderTemplateMagic(name, params)
	case "ext":
		return a.renderTemplateExt(name, params)
	}
	// case "normal": fetch the template page and transclude it, expanding
	// any templates it contains in turn.
	mw, err := g.Get(WikiCanonicalFormNamespace(name, "Template"))
	if err != nil {
		fmt.Fprintln(os.Stderr, "Title:", a.Title, " Error retrieving:", name, " ->", err)
		return ""
	}
	return a.TranscludeTemplatesRecursive(mw, params, g, depth)
}
// TranscludeTemplatesRecursive expands the template page source mw with
// the given parameter map: it follows up to 4 redirects to reach the real
// template page, strips comment/noinclude sections, renders every template
// found in the remaining text, and returns the expanded wikitext ("" on
// failure). depth is forwarded to renderTemplateRecursive's recursion cap.
func (a *Article) TranscludeTemplatesRecursive(mw string, params map[string]string, g PageGetter, depth int) string {
	var mws string
	followed := 0
	for {
		if followed > 4 {
			return "" // too many redirect hops; give up
		}
		//strip nowiki noinclude etc here
		// BUG FIX: assign the outer mws here. The previous `mws := ...`
		// declared a loop-local variable that shadowed the outer one, so
		// the code after the loop always operated on an empty string.
		mws = a.stripComments(mw)
		isRedirect, redirect := a.checkRedirect(mws)
		if !isRedirect {
			break
		}
		var err error
		mw, err = g.Get(*redirect)
		if err != nil {
			return ""
		}
		followed++
	}
	mws = a.stripNoinclude(mws)
	// fmt.Println(ds[depth], "TranscludeTemplatesRecursive", mws)
	mlt := findTemplates(mws)
	last := 0
	out := make([]byte, 0, len(mws))
	for _, t := range mlt {
		// Render each top-level template in place; t.rt holds the result.
		a.renderInnerTemplates(mws, t, params, g, depth)
		out = append(out, []byte(mws[last:t.b])...)
		out = append(out, []byte(t.rt)...)
		last = t.e
	}
	out = append(out, []byte(mws[last:])...)
	//unstrip here
	return string(out)
}
var ds []string = []string{" ", " ", " ", " ", " ", " "}
// renderInnerTemplates renders template t into t.rt, recursively rendering
// its children first. It returns the template's name and parameter map,
// or ("", nil) for a {{{param}}} substitution. params supplies values for
// parameter substitutions; g fetches template pages; depth bounds the
// recursion in renderTemplateRecursive.
func (a *Article) renderInnerTemplates(mws string, t *template, params map[string]string, g PageGetter, depth int) (string, map[string]string) {
	// render inner templates first
	// fmt.Println(ds[depth], *t, "\n", ds[depth], "Template:\n", ds[depth], mws[t.b:t.e])
	for _, it := range t.children {
		if !it.rendered {
			a.renderInnerTemplates(mws, it, params, g, depth)
		}
	}
	// fmt.Println(ds[depth], "Working on", mws[t.b:t.e])
	pp := findTemplateParamPos(mws, t) //position of the pipes for this template
	// fmt.Println(ds[depth], "pp:", pp)
	// n is the brace width: 2 for {{...}}, 3 for {{{...}}}.
	n := 2
	if t.isparam {
		n = 3
	}
	// Sentinel entry pointing at the closing braces.
	pp = append(pp, []int{t.e - n})
	var mw string
	var tb int
	// var te int
	if len(t.children) == 0 {
		// No nested templates: work directly on the original source.
		// fmt.Println(ds[depth], "No nested templates in", mws[t.b:t.e])
		mw = mws
		tb = t.b
		// te = t.e
	} else {
		// Build mw as a copy of this template's source with each child
		// replaced by its rendered text (child.rt), shifting the recorded
		// pipe/equal positions in pp to match the rewritten string.
		// fmt.Println(ds[depth], "Nested templates: fixing pp")
		//substitute the strings and update pp
		tci := 0
		ioff := t.children[tci].b
		tb = 0
		mw = mws[t.b:ioff]
		// fmt.Println(*t)
		ooff := -t.b // running offset between mws positions and mw positions
		ppi0 := 0
		ppi1 := 0
		for ppi0 < len(pp) {
			// fmt.Println(mws)
			// fmt.Println(len(mws), tci, ioff, ooff, ppi0, ppi1, pp)
			if pp[ppi0][ppi1] <= ioff {
				// Position precedes the next child: just translate it.
				pp[ppi0][ppi1] += ooff
				ppi1++
				if ppi1 >= len(pp[ppi0]) {
					ppi0++
					ppi1 = 0
				}
			} else {
				// Splice in the child's rendered text and advance past it.
				mw += t.children[tci].rt
				ooff += len(t.children[tci].rt) - (t.children[tci].e - t.children[tci].b)
				teoff := t.children[tci].e
				tci++
				if tci >= len(t.children) {
					ioff = t.e
				} else {
					ioff = t.children[tci].b
				}
				// fmt.Println(ds[depth], tci, teoff, ioff)
				mw += mws[teoff:ioff]
			}
		}
		// te = len(mw)
	}
	// fmt.Println("len(mw):", len(mw), "mw:", mw, "\npp:", pp)
	var tn string
	if len(pp) > 1 {
		tn = fmt.Sprint(strings.TrimSpace(mw[tb+n : pp[0][0]]))
	} else {
		tn = fmt.Sprint(strings.TrimSpace(mw[tb+n : pp[len(pp)-1][0]]))
	}
	t.rendered = true
	if t.isparam { //it's a parameter substitution
		text, ok := params[tn]
		if ok {
			t.rt = text
			return "", nil
		}
		if len(pp) == 1 { //no default
			// Unknown parameter without default: leave the literal form.
			t.rt = "{{{" + tn + "}}}"
			return "", nil
		}
		// Use the default: the text between the first pipe and the close.
		t.rt = mw[pp[0][0]+1 : pp[len(pp)-1][0]]
		return "", nil
	}
	pm := make(map[string]string, len(pp))
	for i := 0; i < len(pp)-1; i++ {
		var name string
		var param string
		if len(pp[i]) > 1 { //named param
			name = fmt.Sprint(strings.TrimSpace(mw[pp[i][0]+1 : pp[i][1]]))
			param = fmt.Sprint(strings.TrimSpace(mw[pp[i][1]+1 : pp[i+1][0]]))
		} else {
			// Positional parameter: 1-based numeric key.
			name = fmt.Sprint(i + 1)
			param = fmt.Sprint(strings.TrimSpace(mw[pp[i][0]+1 : pp[i+1][0]]))
		}
		pm[name] = param
	}
	t.rt = a.renderTemplateRecursive(tn, pm, g, depth+1)
	return tn, pm
}
// templateType classifies a template name as "magic" (a known parser
// function or built-in variable), "ext" (a '#'-prefixed parser extension),
// or "normal" (an ordinary transcluded template).
func templateType(tn string) string {
	trimmed := strings.TrimSpace(tn)
	base := trimmed
	// The part before the first colon (a colon at index 0 of the raw name
	// does not count) is the candidate magic-word name.
	if sep := strings.Index(tn, ":"); sep > 0 {
		base = strings.TrimSpace(tn[:sep])
	}
	base = strings.ToLower(base)
	if _, ok := noHashFunctionsMap[base]; ok {
		return "magic"
	}
	if _, ok := variablesMap[base]; ok {
		return "magic"
	}
	if strings.HasPrefix(trimmed, "#") {
		return "ext"
	}
	return "normal"
}
// noincludeRe matches <noinclude> sections (tolerating a missing close
// tag at end of input); includeonlyRe captures <includeonly> contents.
var noincludeRe = regexp.MustCompile(`(?isU)<noinclude>.*(?:</noinclude>|\z)`)
var includeonlyRe = regexp.MustCompile(`(?isU)<includeonly>(.*)(?:</includeonly>|\z)`)
// stripNoinclude prepares template source for transclusion: it deletes
// <noinclude> sections entirely and, when <includeonly> sections exist,
// keeps only their concatenated contents.
func (a *Article) stripNoinclude(mw string) string {
	withoutNoinclude := noincludeRe.ReplaceAllLiteralString(mw, "")
	matches := includeonlyRe.FindAllStringSubmatch(withoutNoinclude, -1)
	if len(matches) == 0 {
		return withoutNoinclude
	}
	parts := make([]string, 0, len(matches))
	for _, m := range matches {
		parts = append(parts, m[1])
	}
	return strings.Join(parts, "")
}

102
text.go Normal file
View file

@ -0,0 +1,102 @@
/*
Copyright (C) IBM Corporation 2015, Michele Franceschini <franceschini@us.ibm.com>
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
*/
package gowiki
import (
"bytes"
"unicode/utf8"
)
// appendText appends t to the plain-text buffer while keeping the rune
// count a.nchar in sync (link offsets are measured in runes, not bytes).
func (a *Article) appendText(t string) {
	a.nchar += utf8.RuneCountInString(t)
	a.text.WriteString(t)
}
// genTextInternal walks the parse tree depth-first, appending each node's
// plain-text rendition to a.text. It collapses runs of space nodes into a
// single space, records the rune span and display text of every link in
// a.TextLinks, and snapshots a.AbstractText the first time a heading is
// reached. The indent parameter is currently unused.
func (a *Article) genTextInternal(root *ParseNode, indent int) {
	lastwasspace := false
	for _, n := range root.Nodes {
		var linkStart int
		var fl FullWikiLink
		isLink := false
		tappend := "" // text appended after this node's children
		switch n.NType {
		case "break":
			a.appendText("\n")
		case "space":
			// Collapse consecutive space nodes into one output space.
			if !lastwasspace {
				a.appendText(" ")
			}
		case "text":
			a.appendText(n.Contents)
		case "image":
			a.appendText("\n")
			tappend = "\n"
		case "link":
			isLink = true
			linkStart = len(a.text.Bytes())
			fl = FullWikiLink{Link: n.Link, Start: a.nchar}
		case "html":
			switch n.NSubType {
			case "h1", "h2", "h3", "h4", "h5", "h6":
				a.appendText("\n")
				tappend = "\n"
				// The abstract is everything before the first heading.
				if len(a.AbstractText) == 0 {
					a.AbstractText = a.text.String()
				}
			case "br":
				a.appendText("\n")
			case "ref":
				a.appendText(" ")
			}
		}
		if len(n.Nodes) > 0 {
			a.genTextInternal(n, 0)
		}
		if isLink {
			// The link's display text is whatever was appended since linkStart.
			ttmp := a.text.Bytes()
			fl.End = a.nchar
			fl.Text = string(ttmp[linkStart:])
			a.TextLinks = append(a.TextLinks, fl)
		}
		lastwasspace = false
		if n.NType == "space" {
			lastwasspace = true
		}
		// a.Text += tappend
		a.appendText(tappend)
	}
	return
}
// genText renders the parse tree rooted at a.Root into plain text, filling
// a.Text, a.AbstractText (the text before the first heading, or the whole
// text when there is none) and a.TextLinks, then marks generation done.
// It always returns nil.
func (a *Article) genText() error {
	// Start from an empty buffer with a large capacity. The previous code
	// allocated a zero-filled 1 MiB slice and immediately Truncate(0)'d
	// it, paying for the zeroing without ever using the contents.
	a.text = bytes.NewBuffer(make([]byte, 0, 1024*1024))
	a.nchar = 0
	a.AbstractText = ""
	a.genTextInternal(a.Root, 0)
	a.Text = a.text.String()
	if len(a.AbstractText) == 0 {
		// No heading was seen: the abstract is the whole text.
		a.AbstractText = a.Text
	}
	a.gt = true
	return nil
}
// GenText generates the plain-text rendition of the parsed article; it is
// the exported wrapper around genText.
func (a *Article) GenText() error {
	return a.genText()
}

916
tokenize.go Normal file
View file

@ -0,0 +1,916 @@
/*
Copyright (C) IBM Corporation 2015, Michele Franceschini <franceschini@us.ibm.com>
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
*/
package gowiki
import (
// "bytes"
"errors"
"fmt"
// "html"
"regexp"
"sort"
"strings"
"unicode"
"unicode/utf8"
)
// Token is one lexical unit produced by Tokenize: a run of text, a markup
// marker (space, quote, colon, newline, heading, list item, ...), an HTML
// tag, a link, or a template begin/end marker.
type Token struct {
	TText   string   `json:"tText,omitempty"`
	TType   string   `json:"tType,omitempty"`
	TAttr   string   `json:"tAttr,omitempty"`
	TLink   WikiLink `json:"tLink,omitempty"`
	TClosed bool     `json:"tClosed,omitempty"`
	TPipes  []string `json:"tPipes,omitempty"`
}
// parseRedirectLine tokenizes a "#redirect ..." line: a leading redirect
// token followed by the inline tokens of everything after the keyword
// (the first 9 bytes).
func (a *Article) parseRedirectLine(l string) ([]*Token, error) {
	rest, err := a.parseInlineText(l, 9, len(l))
	if err != nil {
		return nil, err
	}
	return append([]*Token{{TType: "redirect"}}, rest...), nil
}
// parseWikiPreLine tokenizes a leading-space preformatted line: a wikipre
// token followed by the inline tokens of the text after the first byte.
func (a *Article) parseWikiPreLine(l string) ([]*Token, error) {
	rest, err := a.parseInlineText(l, 1, len(l))
	if err != nil {
		return nil, err
	}
	return append([]*Token{{TType: "wikipre"}}, rest...), nil
}
// parseHRuler tokenizes a "----" horizontal-rule line. Any text after the
// run of dashes is tokenized as inline content.
func (a *Article) parseHRuler(l string) ([]*Token, error) {
	textStart := 0
	for i, r := range l {
		if r == '-' {
			continue
		}
		textStart = i
		break
	}
	tokens := []*Token{{TType: "hrule"}}
	// textStart stays 0 when the whole line is dashes.
	if textStart == 0 {
		return tokens, nil
	}
	rest, err := a.parseInlineText(l, textStart, len(l))
	if err != nil {
		return nil, err
	}
	return append(tokens, rest...), nil
}
// parseHeadingLine tokenizes a "== heading ==" line. Matching '=' pairs
// are stripped from both ends to determine the heading level (capped at
// 6); it emits an hN token followed by the inline tokens of the enclosed
// text.
func (a *Article) parseHeadingLine(l string) ([]*Token, error) {
	pf := 0 // will become the index past the leading '=' run (the level)
	pl := 0 // index of the first trailing '='
	for i, rv := range l {
		if rv == '=' {
			pl = i
		}
	}
	// Consume '=' characters pairwise from both ends while they match.
	for {
		pf++
		if pf == pl || l[pf] != '=' {
			pf--
			break
		}
		pl--
		if pf == pl || l[pl] != '=' {
			pl++
			pf--
			break
		}
	}
	pf++
	// Beyond h6, the surplus '=' stay part of the heading text.
	if pf > 6 {
		diff := pf - 6
		pf -= diff
		pl += diff
	}
	nt := make([]*Token, 0, 2)
	nt = append(nt, &Token{TType: fmt.Sprintf("h%d", pf)})
	nnt, err := a.parseInlineText(l, pf, pl)
	if err != nil {
		return nil, err
	}
	nt = append(nt, nnt...)
	return nt, nil
}
// parseListLine tokenizes a list line: one token per leading list marker
// (';', ':', '*', '#'), followed by the inline tokens of the remaining
// text.
func (a *Article) parseListLine(l string) ([]*Token, error) {
	tokens := make([]*Token, 0, 2)
	pos := 0
markers:
	for pos < len(l) {
		switch l[pos] {
		case ';', ':', '*', '#':
			tokens = append(tokens, &Token{TType: l[pos : pos+1]})
			pos++
		default:
			break markers
		}
	}
	if pos == len(l) {
		return tokens, nil
	}
	rest, err := a.parseInlineText(l, pos, len(l))
	if err != nil {
		return nil, err
	}
	return append(tokens, rest...), nil
}
// parseTableLine is a stub: table markup is recognized by lineType but its
// contents are currently dropped (no tokens are produced).
func (a *Article) parseTableLine(l string) ([]*Token, error) {
	nt := make([]*Token, 0, 0)
	return nt, nil
}
// isValidHTMLtag is a stub that accepts every tag name; it is the hook
// for future whitelisting of HTML tags.
func isValidHTMLtag(tag string) bool {
	return true
}
// decodeHTMLtag tries to read an HTML tag at the start of l (l begins at
// the '<'). It returns the number of bytes consumed through the closing
// '>', the tag name, the raw attribute text, whether a '/' immediately
// precedes the '>' (self-closing), and whether a tag was recognized at
// all. A '>' inside a quoted attribute value does not terminate the tag.
func (a *Article) decodeHTMLtag(l string) (int, string, string, bool, bool) {
	matchingpos := 0
	inquote := false
	lastbackslash := false
	quote := '#' // current quote delimiter when inquote
	closefound := false
	tagend := 0
	tagstart := 0
	//taking care of comments at preprocessing time
	/* if strings.HasPrefix(l, "<!--") {
		i := strings.Index(l[4:], "-->")
		if i == -1 {
			return len(l), "!--", l[4:], true, true
		}
		return 4 + i + 3, "!--", l[4 : 4+i], true, true
	} */
dhtLoop:
	for idx, rv := range l {
		// fmt.Println(string(rv), inquote, string(quote), idx, matchingpos)
		switch rv {
		case '>':
			if !inquote {
				matchingpos = idx
				break dhtLoop
			}
		case '\'', '"':
			switch {
			case inquote && quote == rv && !lastbackslash:
				inquote = false
			case !inquote:
				inquote = true
				quote = rv
			}
		case ' ', '\t', '\r':
		case '/':
			closefound = true
		}
		lastbackslash = (rv == '\\')
		// First non-space rune starts the tag name.
		if !unicode.IsSpace(rv) && tagstart == 0 {
			tagstart = idx
		}
		// closefound only survives if nothing but '/' and spaces follow.
		if rv != '/' && !unicode.IsSpace(rv) {
			closefound = false
		}
		// First space after the name ends it; the rest is attribute text.
		if unicode.IsSpace(rv) && tagstart != 0 && tagend == 0 {
			tagend = idx
		}
	}
	if matchingpos == 0 || tagstart == 0 {
		return 0, "", "", false, false
	}
	var tag string
	var attr string
	if tagend == 0 {
		tag = l[tagstart:matchingpos]
		attr = ""
	} else {
		tag = l[tagstart:tagend]
		attr = l[tagend:matchingpos]
	}
	return matchingpos + 1, tag, attr, closefound, true
	// e, tag, attr, closed, ok := decodeHTMLtag(l[pos:end])
}
// matchPrefixes reports whether s starts with any of the given prefixes,
// compared case-insensitively.
func matchPrefixes(s string, prefixes []string) bool {
	for _, p := range prefixes {
		if len(s) < len(p) {
			continue
		}
		if strings.EqualFold(s[:len(p)], p) {
			return true
		}
	}
	return false
}
// extlinkre is an earlier regexp-based external-link test, kept for
// reference; the prefix match below is used instead.
var extlinkre = regexp.MustCompile(`^(http:)|(ftp:)|()//[^\s]+`)

// isExtLink reports whether l looks like an external link target
// (http://, ftp://, or protocol-relative //), case-insensitively.
func isExtLink(l string) bool {
	// return extlinkre.MatchString(l)
	return matchPrefixes(l, []string{"http://", "ftp://", "//"})
}
// filelinkre is an earlier regexp-based test, kept for reference; the
// prefix match below is used instead.
var filelinkre = regexp.MustCompile(`(?i)^\[\[(?:image:)|(?:media:)|(?:file:)`)

// possibleFileLink reports whether l opens an image/media/file link,
// compared case-insensitively.
func possibleFileLink(l string) bool {
	// return filelinkre.MatchString(l)
	return matchPrefixes(l, []string{"[[image:", "[[media:", "[[file:"})
}
// parseLink dispatches a '['-initiated construct to the file, internal,
// or external link parser. It returns the number of bytes consumed, the
// resulting tokens, and whether a link was recognized. Inputs shorter
// than 5 bytes cannot form a link and are rejected outright.
func (a *Article) parseLink(l string) (int, []*Token, bool) {
	switch {
	case len(l) < 5:
		return 0, nil, false
	case l[1] != '[':
		return a.parseExternalLink(l)
	case possibleFileLink(l):
		return a.parseFileLink(l)
	default:
		return a.parseInternalLink(l)
	}
}
// parseInternalLink tries to read a [[target]] or [[target|text]] link at
// the start of l. It also captures a "link trail" of letters immediately
// after the ]] (MediaWiki folds these into the display text). It returns
// the bytes consumed, the tokens (link, optional display tokens,
// closelink), and whether a link was recognized.
func (a *Article) parseInternalLink(l string) (int, []*Token, bool) {
	// possible internal link
	pipepos := 0
	closed := false
	matchingpos := 0
	linktrail := 0 // index of the last trailing letter after ]]
	//plLoop:
	for idx, rv := range l {
		if idx < 2 {
			continue
		}
		if matchingpos == 0 {
			switch rv {
			case '\x07': //prevent special tags in internal link
				if pipepos == 0 { //only in the link portion
					return 0, nil, false
				}
			case '[':
				// A nested "[[" (or a third leading '[') aborts the match.
				if idx == 2 || len(l) > idx+1 && l[idx+1] == '[' {
					return 0, nil, false
				}
			case ']':
				if len(l) > idx+1 && l[idx+1] == ']' {
					matchingpos = idx
				}
			case '|':
				if pipepos == 0 {
					pipepos = idx
				}
			default:
			}
			continue
		}
		// After matchingpos is set: skip the second ']', then gather the
		// letter trail.
		if !closed {
			closed = true
			continue
		}
		if unicode.IsLetter(rv) {
			linktrail = idx
			continue
		}
		break
	}
	if !closed {
		return 0, nil, false
	}
	var link WikiLink
	var nt []*Token = nil
	var err error = nil
	if pipepos == 0 {
		// No pipe: the target doubles as the display text.
		innerstring := l[2:matchingpos]
		if linktrail != 0 {
			innerstring += l[matchingpos+2 : linktrail+1]
		}
		link = WikiCanonicalForm(l[2:matchingpos])
		nt = []*Token{&Token{TText: innerstring, TType: "text"}}
	} else {
		innerstring := l[pipepos+1 : matchingpos]
		if linktrail != 0 {
			innerstring += l[matchingpos+2 : linktrail+1]
		}
		link = WikiCanonicalForm(l[2:pipepos])
		if pipepos+1 < matchingpos {
			nt, err = a.parseInlineText(innerstring, 0, len(innerstring))
			if err != nil {
				return 0, nil, false
			}
		}
	}
	tokens := make([]*Token, 0, 2)
	tokens = append(tokens, &Token{TLink: link, TType: "link"})
	if nt != nil {
		tokens = append(tokens, nt...)
	}
	tokens = append(tokens, &Token{TType: "closelink"})
	if linktrail != 0 {
		return linktrail + 1, tokens, true
	}
	return matchingpos + 2, tokens, true
}
// parseExternalLink tries to read a [url] or [url text] external link at
// the start of l. The link may be closed by ']' or, when a label is
// present, implicitly by a </ref> tag. [[...]] spans inside the label are
// skipped. It returns the bytes consumed, the tokens (extlink, optional
// label tokens, closeextlink), and whether a link was recognized.
func (a *Article) parseExternalLink(l string) (int, []*Token, bool) {
	// possible external link
	spacepos := 0    // position of the space separating url from label
	matchingpos := 0 // position of the terminator
	endpos := 0      // bytes consumed (terminator may be excluded for </ref>)
	intLinkOpen := false
	skipNext := false
plLoop2:
	for idx, rv := range l {
		if idx < 1 {
			continue
		}
		if skipNext {
			skipNext = false
			continue
		}
		switch rv {
		case '\x07':
			// Special markers are only allowed in the label portion.
			if spacepos == 0 {
				return 0, nil, false
			}
		case '[':
			if len(l) > idx+1 && l[idx+1] == '[' {
				intLinkOpen = true
			}
		case ' ':
			if spacepos == 0 {
				spacepos = idx
			}
		case '<':
			if spacepos > 0 {
				// e, tag, attr, closed, ok := a.decodeHTMLtag(l[idx:len(l)])
				_, tag, _, _, ok := a.decodeHTMLtag(l[idx:len(l)])
				// fmt.Println("html tag in ext link. Line:", l, "\n\n", tag, ok)
				if ok && tag == "/ref" {
					// fmt.Println("closing link...")
					matchingpos = idx
					endpos = idx
					break plLoop2
				}
			}
		case ']':
			if intLinkOpen && len(l) > idx+1 && l[idx+1] == ']' {
				intLinkOpen = false
				skipNext = true
				continue
			}
			matchingpos = idx
			endpos = idx + 1
			break plLoop2
		}
	}
	if matchingpos == 0 {
		return 0, nil, false
	}
	var link string
	var nt []*Token = nil
	var err error = nil
	if spacepos == 0 {
		link = l[1:matchingpos]
		if !isExtLink(link) {
			return 0, nil, false
		}
	} else {
		link = l[1:spacepos]
		if !isExtLink(link) {
			return 0, nil, false
		}
		if spacepos+1 < matchingpos {
			nt, err = a.parseInlineText(l, spacepos+1, matchingpos)
			if err != nil {
				return 0, nil, false
			}
		}
	}
	tokens := make([]*Token, 0, 2)
	tokens = append(tokens, &Token{TText: link, TType: "extlink"})
	if nt != nil {
		tokens = append(tokens, nt...)
	}
	tokens = append(tokens, &Token{TType: "closeextlink"})
	return endpos, tokens, true
}
// parseFileLink tries to read an [[image:...]]/[[media:...]]/[[file:...]]
// link at the start of l. Intermediate pipe-separated options are
// collected into TPipes; the segment after the last pipe is tokenized as
// the caption. Nested [[...]] spans are skipped while scanning. It
// returns the bytes consumed, the tokens (filelink, optional caption
// tokens, closefilelink), and whether a link was recognized.
func (a *Article) parseFileLink(l string) (int, []*Token, bool) {
	// possible internal link
	pipepos := make([]int, 0, 0)
	closed := false
	matchingpos := 0
	intLinkOpen := false
	skipNext := false
plLoop:
	for idx, rv := range l {
		if idx < 2 {
			continue
		}
		if skipNext {
			skipNext = false
			continue
		}
		switch rv {
		case '\x07': //prevent special tags in internal link
			if len(pipepos) == 0 { //only in the link portion
				return 0, nil, false
			}
		case '[':
			if len(l) > idx+1 && l[idx+1] == '[' {
				intLinkOpen = true
				skipNext = true
				continue
			}
		case ']':
			if len(l) > idx+1 && l[idx+1] == ']' {
				if intLinkOpen {
					intLinkOpen = false
					skipNext = true
					continue
				}
				matchingpos = idx
				closed = true
				break plLoop
			}
		case '|':
			// Pipes inside a nested link belong to that link, not ours.
			if !intLinkOpen {
				pipepos = append(pipepos, idx)
			}
		default:
		}
	}
	if !closed {
		return 0, nil, false
	}
	var link WikiLink
	var pipes = make([]string, 0, 0)
	var nt []*Token = nil
	var err error = nil
	if len(pipepos) == 0 {
		link = WikiCanonicalForm(l[2:matchingpos])
		nt = []*Token{&Token{TText: l[2:matchingpos], TType: "text"}}
	} else {
		link = WikiCanonicalForm(l[2:pipepos[0]])
		for i := 0; i < len(pipepos)-1; i++ {
			pipes = append(pipes, l[pipepos[i]+1:pipepos[i+1]])
		}
		if pipepos[len(pipepos)-1]+1 < matchingpos {
			nt, err = a.parseInlineText(l, pipepos[len(pipepos)-1]+1, matchingpos)
			if err != nil {
				return 0, nil, false
			}
		}
	}
	tokens := make([]*Token, 0, 2)
	tokens = append(tokens, &Token{TLink: link, TType: "filelink", TPipes: pipes})
	if nt != nil {
		tokens = append(tokens, nt...)
	}
	tokens = append(tokens, &Token{TType: "closefilelink"})
	return matchingpos + 2, tokens, true
}
// min returns the smaller of two ints.
func min(a, b int) int {
	if b < a {
		return b
	}
	return a
}
var behavswitchre = regexp.MustCompile(`^__[A-Z]+__`)
// decodeBehavSwitch matches a behavior switch (e.g. __NOTOC__) at the
// start of l, returning the matched length and whether one was found.
func (a *Article) decodeBehavSwitch(l string) (int, bool) {
	if m := behavswitchre.FindString(l); len(m) > 0 {
		return len(m), true
	}
	return 0, false
	// e, ok := decodeMagic(l[pos:end])
}
// parseInlineText tokenizes l[start:end] into text runs interleaved with
// html, link, magic, space, quote, colon and special (\x07-marker)
// tokens. Plain text is accumulated between tStart and tEnd and flushed
// as a single text token whenever a non-text construct is recognized.
func (a *Article) parseInlineText(l string, start, end int) ([]*Token, error) {
	nt := make([]*Token, 0)
	// fmt.Println("in parseInlineText")
	tStart, tEnd := start, start
	for pos := start; pos < end; {
		rv, rune_len := utf8.DecodeRuneInString(l[pos:end])
		switch rv {
		case '<':
			e, tag, attr, closed, ok := a.decodeHTMLtag(l[pos:end])
			if ok {
				pos += e
				if isValidHTMLtag(tag) {
					// Flush pending text, then emit the tag token.
					if tEnd > tStart {
						nt = append(nt, &Token{TText: l[tStart:tEnd], TType: "text"})
					}
					nt = append(nt, &Token{TType: "html", TText: tag, TAttr: attr, TClosed: closed})
					tStart = pos
				}
				tEnd = pos
				continue
			}
		case '[':
			e, lt, ok := a.parseLink(l[pos:end])
			if ok {
				if tEnd > tStart {
					nt = append(nt, &Token{TText: l[tStart:tEnd], TType: "text"})
				}
				nt = append(nt, lt...)
				pos += e
				tStart, tEnd = pos, pos
				continue
			}
		/* case '{':
		e, tt, ok := a.parseTemplateEtc(l[pos:end])
		fmt.Println("template:", e, tt, ok)
		if ok {
			if len(cs) > 0 {
				nt = append(nt, &Token{TText: cs, TType: "text"})
			}
			nt = append(nt, tt...)
			pos += e
			cs = ""
			continue
		}
		cs += string(rv) */
		case '_':
			e, ok := a.decodeBehavSwitch(l[pos:end])
			if ok {
				if tEnd > tStart {
					nt = append(nt, &Token{TText: l[tStart:tEnd], TType: "text"})
				}
				nt = append(nt, &Token{TType: "magic", TAttr: l[pos : pos+e]})
				pos += e
				tStart, tEnd = pos, pos
				continue
			}
		case ' ', '\t', '\r':
			if tEnd > tStart {
				nt = append(nt, &Token{TText: l[tStart:tEnd], TType: "text"})
			}
			nt = append(nt, &Token{TType: "space"})
			tStart = pos + rune_len
		case '\'':
			if tEnd > tStart {
				nt = append(nt, &Token{TText: l[tStart:tEnd], TType: "text"})
			}
			nt = append(nt, &Token{TType: "quote"})
			tStart = pos + rune_len
		case ':':
			if tEnd > tStart {
				nt = append(nt, &Token{TText: l[tStart:tEnd], TType: "text"})
			}
			nt = append(nt, &Token{TType: "colon"})
			tStart = pos + rune_len
		case '\x07':
			// case '@':
			// \x07 markers are fixed-width 8-byte tags (e.g. "\x07tb00042").
			if tEnd > tStart {
				nt = append(nt, &Token{TText: l[tStart:tEnd], TType: "text"})
			}
			nt = append(nt, &Token{TType: "special", TText: l[pos : pos+8]})
			pos += 8
			tStart, tEnd = pos, pos
			continue
		}
		pos += rune_len
		tEnd = pos
	}
	// Flush any trailing text.
	if tEnd > tStart {
		nt = append(nt, &Token{TText: l[tStart:tEnd], TType: "text"})
	}
	return nt, nil
}
// isHeading reports whether l is a heading line: it must start with '='
// and, ignoring trailing whitespace, end with '='. The first two runes
// are exempt from the trailing check, so "=" and "==" alone do not
// qualify. Assumes l is non-empty (guaranteed by lineType's blank check).
func (a *Article) isHeading(l string) bool {
	if l[0] != '=' {
		return false
	}
	done := 0
	lastEqual := false
	for _, rv := range l {
		done++
		if done > 2 {
			if unicode.IsSpace(rv) {
				// Trailing spaces after the final '=' are tolerated.
				continue
			}
			if rv == '=' {
				lastEqual = true
				continue
			}
			lastEqual = false
		}
	}
	return lastEqual
}
// isTable reports whether l starts a wiki-table construct: "{|", "|}",
// "|+", "|-", or any non-empty line beginning with '|' or '!'.
func (a *Article) isTable(l string) bool {
	if len(l) > 1 {
		switch l[0:2] {
		case "{|", "|}", "|+", "|-":
			return true
		}
	}
	if len(l) > 0 {
		return l[0] == '|' || l[0] == '!'
	}
	return false
}
// lineType classifies a line of wikitext. The checks are ordered so that
// more specific prefixes (redirect, hr) win over generic ones (list,
// table, wikipre); anything unmatched is "normal".
func (a *Article) lineType(l string) string {
	if len(l) == 0 {
		return "blank"
	}
	if len(l) > 8 && strings.ToLower(l[0:9]) == "#redirect" {
		return "redirect"
	}
	if len(l) > 3 && l[0:4] == "----" {
		return "hr"
	}
	if a.isHeading(l) {
		return "heading"
	}
	switch l[0] {
	case ';', ':', '*', '#':
		return "list"
	}
	if a.isTable(l) {
		return "table"
	}
	if l[0] == ' ' {
		return "wikipre"
	}
	return "normal"
}
// Tokenize converts raw wikitext mw into a flat token stream.
//
// Preprocessing pipeline (each stage consumes the previous stage's
// output): stripComments removes HTML comments; stripNowikiPreMath
// replaces <nowiki>/<pre>/<math> regions with "\x07%07d" placeholders
// and a side map; processTemplates expands templates, fetching pages
// through g, and returns its own placeholder map; preprocessLinks
// folds multi-line piped links onto single lines. The result is split
// into lines, each line classified by lineType and handed to the
// matching parser, and finally every "special" placeholder token is
// swapped for the real token stored in templatemap.
func (a *Article) Tokenize(mw string, g PageGetter) ([]*Token, error) {
	mwnc := a.stripComments(mw)
	mw_stripped, nowikipremathmap := a.stripNowikiPreMath(mwnc)
	mw_tmpl, templatemap := a.processTemplates(mw_stripped, nowikipremathmap, g)
	mw_links := a.preprocessLinks(mw_tmpl)
	lines := strings.Split(mw_links, "\n")
	tokens := make([]*Token, 0, 16)
	for _, l := range lines {
		var nt []*Token
		var err error = nil
		// Dispatch the line to the parser matching its classification.
		lt := a.lineType(l)
		switch lt {
		case "normal":
			nt, err = a.parseInlineText(l, 0, len(l))
		case "redirect":
			nt, err = a.parseRedirectLine(l)
		case "hr":
			nt, err = a.parseHRuler(l)
		case "heading":
			nt, err = a.parseHeadingLine(l)
		case "list":
			nt, err = a.parseListLine(l)
		case "table":
			nt, err = a.parseTableLine(l)
		case "wikipre":
			nt, err = a.parseWikiPreLine(l)
		case "blank":
			nt = []*Token{&Token{TType: "blank"}}
		}
		if err != nil {
			return nil, err
		}
		// Every source line contributes a trailing newline token.
		nt = append(nt, &Token{TType: "newline"})
		tokens = append(tokens, nt...)
	}
	// Resolve placeholder tokens. The lookup goes through templatemap
	// only; NOTE(review): processTemplates receives nowikipremathmap,
	// so it presumably folds those entries into templatemap — confirm
	// (see the commented-out nowikipremathmap lookup below).
	specialcount := 0
	for i := range tokens {
		if tokens[i].TType == "special" {
			specialcount++
			t, ok := templatemap[tokens[i].TText] //nowikipremathmap[tokens[i].TText]
			if !ok {
				return nil, errors.New("special not in map")
			}
			tokens[i] = t
		}
	}
	// fmt.Println(specialcount, len(nowikipremathmap))
	// if specialcount != len(nowikipremathmap) {
	if specialcount != len(templatemap) {
		// A count mismatch only produces a warning; the hard error
		// below was deliberately disabled.
		if DebugLevel > 0 {
			fmt.Println("[Tokenize] Warning: number of specials in map differs from number found")
		}
		// return nil, errors.New("number of specials in map differs from number found")
	}
	return tokens, nil
}
// commentsRe matches an HTML comment "<!-- ... -->", including one
// left unterminated at end of input (the \z alternative). Flags:
// (i) case-insensitive, (s) '.' matches newline, (U) ungreedy.
var commentsRe = regexp.MustCompile(`(?isU)<!--.*(?:-->|\z)`)

// stripComments deletes every HTML comment from the wikitext.
func (a *Article) stripComments(mw string) string {
	return commentsRe.ReplaceAllLiteralString(mw, "")
}
// Open/close tag patterns for the three verbatim region kinds handled
// by stripNowikiPreMath; the single capture group in each pattern is
// the tag name (used as the resulting token's TType).
// NOTE(review): the nowiki patterns exclude '/' from the attribute
// part ([^>/]*), so a self-closing <nowiki/> matches neither pattern,
// while the pre/math patterns allow '/' — confirm the asymmetry is
// intentional.
var nowikiOpenRe = regexp.MustCompile(`(?i)<\s*(nowiki)\s*[^>/]*>`)
var nowikiCloseRe = regexp.MustCompile(`(?i)<(/nowiki)\s*[^>/]*>`)
var preOpenRe = regexp.MustCompile(`(?i)<\s*(pre)\s*[^>]*>`)
var preCloseRe = regexp.MustCompile(`(?i)<(/pre)\s*[^>]*>`)
var mathOpenRe = regexp.MustCompile(`(?i)<\s*(math)\s*[^>]*>`)
var mathCloseRe = regexp.MustCompile(`(?i)<(/math)\s*[^>]*>`)
// ssInt implements sort.Interface for [][]int, ordering the slices by
// their first element. stripNowikiPreMath uses it to sort regexp
// match-index records by match start offset.
type ssInt [][]int

func (a ssInt) Len() int { return len(a) }
func (a ssInt) Swap(i, j int) { a[i], a[j] = a[j], a[i] }
func (a ssInt) Less(i, j int) bool { return a[i][0] < a[j][0] }
// stripNowikiPreMath cuts every <nowiki>...</nowiki>, <pre>...</pre>
// and <math>...</math> region out of mw. Each region is replaced in
// the returned text by an 8-byte placeholder ("\x07" plus a 7-digit
// counter) and recorded in the returned map as a Token whose TType is
// the lowercased tag name, TAttr the opening tag's attribute text and
// TText the verbatim content between the tags. A tag left open at end
// of input is implicitly closed at EOF.
func (a *Article) stripNowikiPreMath(mw string) (string, map[string]*Token) {
	// Each submatch-index entry is [matchStart, matchEnd,
	// tagNameStart, tagNameEnd] (one capture group: the tag name).
	nwoc := nowikiOpenRe.FindAllStringSubmatchIndex(mw, -1)
	nwcc := nowikiCloseRe.FindAllStringSubmatchIndex(mw, -1)
	poc := preOpenRe.FindAllStringSubmatchIndex(mw, -1)
	pcc := preCloseRe.FindAllStringSubmatchIndex(mw, -1)
	moc := mathOpenRe.FindAllStringSubmatchIndex(mw, -1)
	mcc := mathCloseRe.FindAllStringSubmatchIndex(mw, -1)
	/*
		nwoc = append(nwoc, []int{len(mw) + 1, len(mw) + 1})
		nwcc = append(nwcc, []int{len(mw) + 1, len(mw) + 1})
		poc = append(poc, []int{len(mw) + 1, len(mw) + 1})
		pcc = append(pcc, []int{len(mw) + 1, len(mw) + 1})
		moc = append(moc, []int{len(mw) + 1, len(mw) + 1})
		mcc = append(mcc, []int{len(mw) + 1, len(mw) + 1})
	*/
	// Tag every match with a type code as an extra 5th element: even
	// codes are openers, odd codes closers, and a closer's code is
	// always its opener's code + 1 (the pairing rule checked below).
	for i := range nwoc {
		nwoc[i] = append(nwoc[i], 0)
	}
	for i := range nwcc {
		nwcc[i] = append(nwcc[i], 1)
	}
	for i := range poc {
		poc[i] = append(poc[i], 2)
	}
	for i := range pcc {
		pcc[i] = append(pcc[i], 3)
	}
	for i := range moc {
		moc[i] = append(moc[i], 4)
	}
	for i := range mcc {
		mcc[i] = append(mcc[i], 5)
	}
	// Merge all matches into one list ordered by start offset.
	am := make([][]int, 0, len(nwoc)+len(nwcc)+len(poc)+len(pcc)+len(moc)+len(mcc))
	am = append(am, nwoc...)
	am = append(am, nwcc...)
	am = append(am, poc...)
	am = append(am, pcc...)
	am = append(am, moc...)
	am = append(am, mcc...)
	sort.Sort(ssInt(am))
	// fmt.Println(am)
	tokens := make(map[string]*Token, len(am))
	if len(am) == 0 {
		return mw, tokens
	}
	// Single pass pairing openers with closers.
	//   ctype:     type code of the currently open tag (-1 when none)
	//   openidx:   index in am of that opener
	//   lastclose: offset just past the last consumed tag
	//   count:     number of placeholders emitted so far
	ctype := -1
	out := ""
	lastclose := 0
	openidx := 0
	count := 0
	for i := range am {
		// fmt.Println("ctype", ctype, "lastclose", lastclose, "count", count, "openidx", openidx, "am[i]", am[i])
		if (ctype != -1) && (am[i][4] == ctype+1) && (am[openidx][1] <= am[i][0]) {
			// closing an open one: emit a placeholder and map it to a
			// token built from the opener/closer pair.
			special := fmt.Sprintf("\x07%07d", count)
			// special := fmt.Sprintf("@%07d", count)
			tokens[special] = &Token{
				TText: mw[am[openidx][1]:am[i][0]],                        // content between the tags
				TType: strings.ToLower(mw[am[openidx][2]:am[openidx][3]]), // tag name
				TAttr: mw[am[openidx][3] : am[openidx][1]-1],              // attributes, trailing '>' dropped
			}
			out += special
			ctype = -1
			lastclose = am[i][1]
			count++
		} else if (ctype == -1) && (am[i][4]&1 == 0) && (lastclose <= am[i][0]) {
			// open a new one: flush the plain text seen since the last
			// consumed tag, then remember this opener.
			out += mw[lastclose:am[i][0]]
			ctype = am[i][4]
			openidx = i
		}
	}
	if ctype != -1 {
		//it's open: close it — the unterminated region runs to EOF.
		special := fmt.Sprintf("\x07%07d", count)
		// special := fmt.Sprintf("@%07d", count)
		tokens[special] = &Token{
			TText: mw[am[openidx][1]:len(mw)],
			TType: strings.ToLower(mw[am[openidx][2]:am[openidx][3]]),
			TAttr: mw[am[openidx][3] : am[openidx][1]-1],
		}
		out += special
		ctype = -1
		count++
	} else {
		// Nothing left open: emit the remaining plain text.
		out += mw[lastclose:]
	}
	return out, tokens
}
// multiLineLinksRe matches a piped wiki link "[[target|label]]" whose
// label part may span several lines (the target part may not).
var multiLineLinksRe = regexp.MustCompile(`(?sm)\[\[[^\n|]*\|.*?\]\]`)

/* TODO: add preprocessing as in Parser.php:pstPass2() to enable pipe tricks
 */

// preprocessLinks folds every multi-line piped link onto one line by
// turning each newline inside the link into a space, so that the
// line-oriented tokenizer sees the whole link at once.
func (a *Article) preprocessLinks(s string) string {
	buf := []byte(s)
	for _, span := range multiLineLinksRe.FindAllIndex(buf, -1) {
		// A '\n' byte never occurs inside a multi-byte UTF-8 rune, so
		// a plain byte scan over the matched span is safe.
		for i := span[0]; i < span[1]; i++ {
			if buf[i] == '\n' {
				buf[i] = ' '
			}
		}
	}
	return string(buf)
}
//var nowikiOpenRe = regexp.MustCompile(`(?i)<\s*nowiki\s*[^>/]*>`)
//var nowikiCloseRe = regexp.MustCompile(`(?i)</nowiki\s*[^>/]*>`)
//var nowikiOpenCloseRe = regexp.MustCompile(`(?i)<nowiki\s*[^>]*/>`)
/*
type WikiParser struct {
mw string
}
func NewWikiParser(mw string) *WikiParser {
return &WikiParser{mw: mw}
}
func (wp *WikiParser) doNowiki() {
openCandidates := nowikiOpenRe.FindAllStringIndex(wp.mw, -1)
closeCandidates := nowikiCloseRe.FindAllStringIndex(wp.mw, -1)
openCloseCandidates := nowikiOpenCloseRe.FindAllStringIndex(wp.mw, -1)
tail := []int{len(wp.mw) + 1, len(wp.mw) + 1}
openCandidates = append(openCandidates, tail)
closeCandidates = append(closeCandidates, tail)
openCloseCandidates = append(openCloseCandidates, tail)
oi := 0
ci := 0
oci := 0
inNowiki := false
ol = make([][]int, 0, len(openCandidates))
cl = make([][]int, 0, len(closeCandidates))
ocl = make([][]int, 0, len(openCloseCandidates))
for {
if oi == len(openCandidates)-1 &&
ci == len(closeCandidates)-1 &&
oci == len(openCloseCandidates)-1 {
break
}
switch {
case openCandidates[oi][0] <= closeCandidates[oi][0] &&
openCandidates[oi][0] <= openCloseloseCandidates[oi][0]:
if !inNowiki {
ol = append(ol.openCandidates[oi])
inNowiki = true
}
oi += 1
case closeCandidates[oi][0] <= openCandidates[oi][0] &&
closeCandidates[oi][0] <= openCloseloseCandidates[oi][0]:
default:
}
}
}
func (wp *WikiParser) Parse() {
doSGML()
doNowiki()
doMath()
doPre()
doBlanks()
doHTMLvalidation()
doReplaceVariables()
doHR()
doAllQuotes()
doHeadings()
doLists()
doDates()
doExternalLinks()
doInternalLinks()
doISBN()
doRecombine()
}
*/

59
utils.go Normal file
View file

@ -0,0 +1,59 @@
/*
Copyright (C) IBM Corporation 2015, Michele Franceschini <franceschini@us.ibm.com>
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
*/
package gowiki
import (
// "fmt"
"strings"
)
// CheckRedirect reports whether the article is a redirect page. It
// scans only the leading tokens for a "redirect" marker followed by a
// "link" token; on success it returns true and the redirect target,
// otherwise (false, nil).
func (a *Article) CheckRedirect() (bool, *WikiLink) {
	sawRedirect := false
	for i, tok := range a.Tokens {
		// A redirect directive must sit at the very top of the
		// article, so give up after the first few tokens.
		if i > 10 {
			break
		}
		if tok.TType == "redirect" {
			sawRedirect = true
		} else if tok.TType == "link" && sawRedirect {
			return true, &tok.TLink
		}
	}
	return false, nil
}
// CheckDisambiguation reports whether the article is a disambiguation
// page, detected by the presence of a known disambiguation template
// (any name containing "disambig", or one of a short fixed list).
func (a *Article) CheckDisambiguation() bool {
	for _, t := range a.Templates {
		if t.Typ != "normal" {
			continue
		}
		name := strings.ToLower(t.Name)
		if strings.Contains(name, "disambig") {
			return true
		}
		switch name {
		case "dab", "geodis", "hndis", "hndis-cleanup", "numberdis":
			return true
		}
	}
	return false
}