Adding library code.
This commit is contained in:
parent
d5cca56718
commit
93aa7513fb
202
LICENSE
Normal file
202
LICENSE
Normal file
|
@ -0,0 +1,202 @@
|
|||
|
||||
Apache License
|
||||
Version 2.0, January 2004
|
||||
http://www.apache.org/licenses/
|
||||
|
||||
TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION
|
||||
|
||||
1. Definitions.
|
||||
|
||||
"License" shall mean the terms and conditions for use, reproduction,
|
||||
and distribution as defined by Sections 1 through 9 of this document.
|
||||
|
||||
"Licensor" shall mean the copyright owner or entity authorized by
|
||||
the copyright owner that is granting the License.
|
||||
|
||||
"Legal Entity" shall mean the union of the acting entity and all
|
||||
other entities that control, are controlled by, or are under common
|
||||
control with that entity. For the purposes of this definition,
|
||||
"control" means (i) the power, direct or indirect, to cause the
|
||||
direction or management of such entity, whether by contract or
|
||||
otherwise, or (ii) ownership of fifty percent (50%) or more of the
|
||||
outstanding shares, or (iii) beneficial ownership of such entity.
|
||||
|
||||
"You" (or "Your") shall mean an individual or Legal Entity
|
||||
exercising permissions granted by this License.
|
||||
|
||||
"Source" form shall mean the preferred form for making modifications,
|
||||
including but not limited to software source code, documentation
|
||||
source, and configuration files.
|
||||
|
||||
"Object" form shall mean any form resulting from mechanical
|
||||
transformation or translation of a Source form, including but
|
||||
not limited to compiled object code, generated documentation,
|
||||
and conversions to other media types.
|
||||
|
||||
"Work" shall mean the work of authorship, whether in Source or
|
||||
Object form, made available under the License, as indicated by a
|
||||
copyright notice that is included in or attached to the work
|
||||
(an example is provided in the Appendix below).
|
||||
|
||||
"Derivative Works" shall mean any work, whether in Source or Object
|
||||
form, that is based on (or derived from) the Work and for which the
|
||||
editorial revisions, annotations, elaborations, or other modifications
|
||||
represent, as a whole, an original work of authorship. For the purposes
|
||||
of this License, Derivative Works shall not include works that remain
|
||||
separable from, or merely link (or bind by name) to the interfaces of,
|
||||
the Work and Derivative Works thereof.
|
||||
|
||||
"Contribution" shall mean any work of authorship, including
|
||||
the original version of the Work and any modifications or additions
|
||||
to that Work or Derivative Works thereof, that is intentionally
|
||||
submitted to Licensor for inclusion in the Work by the copyright owner
|
||||
or by an individual or Legal Entity authorized to submit on behalf of
|
||||
the copyright owner. For the purposes of this definition, "submitted"
|
||||
means any form of electronic, verbal, or written communication sent
|
||||
to the Licensor or its representatives, including but not limited to
|
||||
communication on electronic mailing lists, source code control systems,
|
||||
and issue tracking systems that are managed by, or on behalf of, the
|
||||
Licensor for the purpose of discussing and improving the Work, but
|
||||
excluding communication that is conspicuously marked or otherwise
|
||||
designated in writing by the copyright owner as "Not a Contribution."
|
||||
|
||||
"Contributor" shall mean Licensor and any individual or Legal Entity
|
||||
on behalf of whom a Contribution has been received by Licensor and
|
||||
subsequently incorporated within the Work.
|
||||
|
||||
2. Grant of Copyright License. Subject to the terms and conditions of
|
||||
this License, each Contributor hereby grants to You a perpetual,
|
||||
worldwide, non-exclusive, no-charge, royalty-free, irrevocable
|
||||
copyright license to reproduce, prepare Derivative Works of,
|
||||
publicly display, publicly perform, sublicense, and distribute the
|
||||
Work and such Derivative Works in Source or Object form.
|
||||
|
||||
3. Grant of Patent License. Subject to the terms and conditions of
|
||||
this License, each Contributor hereby grants to You a perpetual,
|
||||
worldwide, non-exclusive, no-charge, royalty-free, irrevocable
|
||||
(except as stated in this section) patent license to make, have made,
|
||||
use, offer to sell, sell, import, and otherwise transfer the Work,
|
||||
where such license applies only to those patent claims licensable
|
||||
by such Contributor that are necessarily infringed by their
|
||||
Contribution(s) alone or by combination of their Contribution(s)
|
||||
with the Work to which such Contribution(s) was submitted. If You
|
||||
institute patent litigation against any entity (including a
|
||||
cross-claim or counterclaim in a lawsuit) alleging that the Work
|
||||
or a Contribution incorporated within the Work constitutes direct
|
||||
or contributory patent infringement, then any patent licenses
|
||||
granted to You under this License for that Work shall terminate
|
||||
as of the date such litigation is filed.
|
||||
|
||||
4. Redistribution. You may reproduce and distribute copies of the
|
||||
Work or Derivative Works thereof in any medium, with or without
|
||||
modifications, and in Source or Object form, provided that You
|
||||
meet the following conditions:
|
||||
|
||||
(a) You must give any other recipients of the Work or
|
||||
Derivative Works a copy of this License; and
|
||||
|
||||
(b) You must cause any modified files to carry prominent notices
|
||||
stating that You changed the files; and
|
||||
|
||||
(c) You must retain, in the Source form of any Derivative Works
|
||||
that You distribute, all copyright, patent, trademark, and
|
||||
attribution notices from the Source form of the Work,
|
||||
excluding those notices that do not pertain to any part of
|
||||
the Derivative Works; and
|
||||
|
||||
(d) If the Work includes a "NOTICE" text file as part of its
|
||||
distribution, then any Derivative Works that You distribute must
|
||||
include a readable copy of the attribution notices contained
|
||||
within such NOTICE file, excluding those notices that do not
|
||||
pertain to any part of the Derivative Works, in at least one
|
||||
of the following places: within a NOTICE text file distributed
|
||||
as part of the Derivative Works; within the Source form or
|
||||
documentation, if provided along with the Derivative Works; or,
|
||||
within a display generated by the Derivative Works, if and
|
||||
wherever such third-party notices normally appear. The contents
|
||||
of the NOTICE file are for informational purposes only and
|
||||
do not modify the License. You may add Your own attribution
|
||||
notices within Derivative Works that You distribute, alongside
|
||||
or as an addendum to the NOTICE text from the Work, provided
|
||||
that such additional attribution notices cannot be construed
|
||||
as modifying the License.
|
||||
|
||||
You may add Your own copyright statement to Your modifications and
|
||||
may provide additional or different license terms and conditions
|
||||
for use, reproduction, or distribution of Your modifications, or
|
||||
for any such Derivative Works as a whole, provided Your use,
|
||||
reproduction, and distribution of the Work otherwise complies with
|
||||
the conditions stated in this License.
|
||||
|
||||
5. Submission of Contributions. Unless You explicitly state otherwise,
|
||||
any Contribution intentionally submitted for inclusion in the Work
|
||||
by You to the Licensor shall be under the terms and conditions of
|
||||
this License, without any additional terms or conditions.
|
||||
Notwithstanding the above, nothing herein shall supersede or modify
|
||||
the terms of any separate license agreement you may have executed
|
||||
with Licensor regarding such Contributions.
|
||||
|
||||
6. Trademarks. This License does not grant permission to use the trade
|
||||
names, trademarks, service marks, or product names of the Licensor,
|
||||
except as required for reasonable and customary use in describing the
|
||||
origin of the Work and reproducing the content of the NOTICE file.
|
||||
|
||||
7. Disclaimer of Warranty. Unless required by applicable law or
|
||||
agreed to in writing, Licensor provides the Work (and each
|
||||
Contributor provides its Contributions) on an "AS IS" BASIS,
|
||||
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
|
||||
implied, including, without limitation, any warranties or conditions
|
||||
of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A
|
||||
PARTICULAR PURPOSE. You are solely responsible for determining the
|
||||
appropriateness of using or redistributing the Work and assume any
|
||||
risks associated with Your exercise of permissions under this License.
|
||||
|
||||
8. Limitation of Liability. In no event and under no legal theory,
|
||||
whether in tort (including negligence), contract, or otherwise,
|
||||
unless required by applicable law (such as deliberate and grossly
|
||||
negligent acts) or agreed to in writing, shall any Contributor be
|
||||
liable to You for damages, including any direct, indirect, special,
|
||||
incidental, or consequential damages of any character arising as a
|
||||
result of this License or out of the use or inability to use the
|
||||
Work (including but not limited to damages for loss of goodwill,
|
||||
work stoppage, computer failure or malfunction, or any and all
|
||||
other commercial damages or losses), even if such Contributor
|
||||
has been advised of the possibility of such damages.
|
||||
|
||||
9. Accepting Warranty or Additional Liability. While redistributing
|
||||
the Work or Derivative Works thereof, You may choose to offer,
|
||||
and charge a fee for, acceptance of support, warranty, indemnity,
|
||||
or other liability obligations and/or rights consistent with this
|
||||
License. However, in accepting such obligations, You may act only
|
||||
on Your own behalf and on Your sole responsibility, not on behalf
|
||||
of any other Contributor, and only if You agree to indemnify,
|
||||
defend, and hold each Contributor harmless for any liability
|
||||
incurred by, or claims asserted against, such Contributor by reason
|
||||
of your accepting any such warranty or additional liability.
|
||||
|
||||
END OF TERMS AND CONDITIONS
|
||||
|
||||
APPENDIX: How to apply the Apache License to your work.
|
||||
|
||||
To apply the Apache License to your work, attach the following
|
||||
boilerplate notice, with the fields enclosed by brackets "[]"
|
||||
replaced with your own identifying information. (Don't include
|
||||
the brackets!) The text should be enclosed in the appropriate
|
||||
comment syntax for the file format. We also recommend that a
|
||||
file or class name and description of purpose be included on the
|
||||
same "printed page" as the copyright notice for easier
|
||||
identification within third-party archives.
|
||||
|
||||
Copyright [yyyy] [name of copyright owner]
|
||||
|
||||
Licensed under the Apache License, Version 2.0 (the "License");
|
||||
you may not use this file except in compliance with the License.
|
||||
You may obtain a copy of the License at
|
||||
|
||||
http://www.apache.org/licenses/LICENSE-2.0
|
||||
|
||||
Unless required by applicable law or agreed to in writing, software
|
||||
distributed under the License is distributed on an "AS IS" BASIS,
|
||||
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
See the License for the specific language governing permissions and
|
||||
limitations under the License.
|
|
@ -1,3 +1,3 @@
|
|||
# gowiki
|
||||
Gowiki is a golang library to parse mediawiki markup as found in Wikipedia pages
|
||||
Gowiki is a golang library to parse mediawiki markup as found in Wikipedia pages.
|
||||
|
||||
|
|
233
gowiki.go
Normal file
233
gowiki.go
Normal file
|
@ -0,0 +1,233 @@
|
|||
/*
|
||||
Copyright (C) IBM Corporation 2015, Michele Franceschini <franceschini@us.ibm.com>
|
||||
|
||||
Licensed under the Apache License, Version 2.0 (the "License");
|
||||
you may not use this file except in compliance with the License.
|
||||
You may obtain a copy of the License at
|
||||
|
||||
http://www.apache.org/licenses/LICENSE-2.0
|
||||
|
||||
Unless required by applicable law or agreed to in writing, software
|
||||
distributed under the License is distributed on an "AS IS" BASIS,
|
||||
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
See the License for the specific language governing permissions and
|
||||
limitations under the License.
|
||||
*/
|
||||
|
||||
package gowiki
|
||||
|
||||
import (
|
||||
"bytes"
|
||||
// "errors"
|
||||
// "fmt"
|
||||
"html"
|
||||
"regexp"
|
||||
"strings"
|
||||
)
|
||||
|
||||
// var Debug bool = false

// DebugLevel controls the verbosity of debug output emitted by the
// parser (0 = silent; presumably higher values print more — confirm
// against the tokenizer, which is not visible in this file).
var DebugLevel int = 0
|
||||
|
||||
// Article holds a single wiki page: the raw MediaWiki markup, the token
// stream and parse tree produced from it, and the links/media/text
// extracted while parsing and rendering.
type Article struct {
	MediaWiki    string     // raw MediaWiki markup, as supplied to NewArticle
	Title        string     // page title
	Links        []WikiLink // internal wiki links collected during parsing
	ExtLinks     []string   // external link URLs collected during parsing
	Type         string     // page type; semantics not visible in this file — TODO confirm
	AbstractText string     // plain-text abstract; populated by genText (not shown here)
	Media        []WikiLink // file/image links collected during parsing
	Tokens       []*Token   // token stream produced by Tokenize and rewritten by doQuotes
	// OldTokens []*Token
	Root      *ParseNode     // root of the parse tree, set by parse
	Parsed    bool           // true once parse has completed successfully
	Text      string         // cached plain-text rendering; valid only when gt is true
	TextLinks []FullWikiLink // links with rune offsets into Text; valid only when gt is true
	Templates []*Template

	// unexported fields
	gt                   bool          // true when Text/AbstractText/TextLinks are up to date
	text                 *bytes.Buffer // scratch buffer, presumably used by genText — confirm
	nchar                int           // rune counter, presumably maintained by genText — confirm
	innerParseErrorCount int           // tolerated inner parse errors so far (capped by maxInnerParseErrorCount)
}
|
||||
// WikiLink identifies a wiki page in canonical parts, corresponding to
// the form "Namespace:PageName#Anchor".
type WikiLink struct {
	Namespace string // canonical namespace; "" for the main namespace
	PageName  string // canonical page name (first letter upper-cased)
	Anchor    string // fragment after '#'; "" when absent
}
|
||||
// FullWikiLink is a WikiLink together with its rendered text and its
// position (in runes) within the article's generated plain text.
type FullWikiLink struct {
	Link  WikiLink // the link target
	Text  string   // text the link renders as
	Start int      // rune offset of beginning
	End   int      // rune offset of end (index of the char after the last)
}
|
||||
|
||||
// PageGetter resolves a wiki link to the raw markup of the page it
// points at (e.g. for template expansion — confirm against Tokenize,
// which receives the getter).
type PageGetter interface {
	Get(page WikiLink) (string, error)
}
|
||||
|
||||
func NewArticle(title, text string) (*Article, error) {
|
||||
a := new(Article)
|
||||
a.Title = title
|
||||
a.MediaWiki = text
|
||||
a.Links = make([]WikiLink, 0, 16)
|
||||
a.Media = make([]WikiLink, 0, 16)
|
||||
a.TextLinks = make([]FullWikiLink, 0, 16)
|
||||
a.ExtLinks = make([]string, 0, 16)
|
||||
return a, nil
|
||||
}
|
||||
|
||||
func (a *Article) GetText() string {
|
||||
if !a.gt {
|
||||
a.genText()
|
||||
}
|
||||
return a.Text
|
||||
}
|
||||
|
||||
func (a *Article) GetAbstract() string {
|
||||
if !a.gt {
|
||||
a.genText()
|
||||
}
|
||||
return a.AbstractText
|
||||
}
|
||||
|
||||
// GetLinks returns the internal wiki links collected while parsing.
func (a *Article) GetLinks() []WikiLink {
	return a.Links
}
|
||||
|
||||
// GetExternalLinks returns the external link URLs collected while parsing.
func (a *Article) GetExternalLinks() []string {
	return a.ExtLinks
}
|
||||
|
||||
// GetMedia returns the file/image links collected while parsing.
func (a *Article) GetMedia() []WikiLink {
	return a.Media
}
|
||||
|
||||
func (a *Article) GetTextLinks() []FullWikiLink {
|
||||
if !a.gt {
|
||||
a.genText()
|
||||
}
|
||||
return a.TextLinks
|
||||
}
|
||||
|
||||
// canoReSpaces matches runs of spaces and underscores, which are
// equivalent separators in MediaWiki titles and are collapsed to a
// single space during canonicalization.
var canoReSpaces = regexp.MustCompile(`[ _]+`)
|
||||
|
||||
// WikiCanonicalFormEsc canonicalizes a raw wiki link using the standard
// namespace table, with no default namespace; unescape controls whether
// HTML entities in the parts are decoded.
func WikiCanonicalFormEsc(l string, unescape bool) WikiLink {
	return StandardNamespaces.WikiCanonicalFormNamespaceEsc(l, "", unescape)
}
|
||||
|
||||
// WikiCanonicalForm canonicalizes a raw wiki link using the standard
// namespace table, no default namespace, and HTML-entity unescaping.
func WikiCanonicalForm(l string) WikiLink {
	return StandardNamespaces.WikiCanonicalFormNamespaceEsc(l, "", true)
}
|
||||
|
||||
// WikiCanonicalFormNamespace canonicalizes a raw wiki link with the
// given fallback namespace (used when the link names none), using the
// standard namespace table and HTML-entity unescaping.
func WikiCanonicalFormNamespace(l string, defaultNamespace string) WikiLink {
	return StandardNamespaces.WikiCanonicalFormNamespaceEsc(l, defaultNamespace, true)
}
|
||||
|
||||
// WikiCanonicalFormNamespaceEsc splits a raw wiki link "ns:page#anchor"
// into its canonical WikiLink parts. Runs of spaces/underscores collapse
// to single spaces; the namespace is looked up case-insensitively in the
// receiver table (falling back to defaultNamespace when absent or not
// recognized); the page name gets an upper-cased first letter; and when
// unescape is true, HTML entities in every part are decoded.
func (namespaces Namespaces) WikiCanonicalFormNamespaceEsc(l string, defaultNamespace string, unescape bool) WikiLink {
	// Split off the anchor first: everything after the first '#'.
	hpos := strings.IndexRune(l, '#')
	anchor := ""
	if hpos >= 0 {
		anchor = l[hpos+1:]
		l = l[0:hpos]
	}
	// A ':' may separate an explicit namespace from the page name.
	i := strings.Index(l, ":")
	namespace := defaultNamespace
	if i >= 0 {
		cns := strings.TrimSpace(canoReSpaces.ReplaceAllString(l[:i], " "))
		if unescape {
			cns = html.UnescapeString(cns)
		}
		ns, ok := namespaces[strings.ToLower(cns)]
		switch {
		case ok && len(cns) > 0:
			// Recognized namespace: use its canonical spelling.
			namespace = ns //strings.ToUpper(cns[0:1]) + strings.ToLower(cns[1:])
		case ok:
			// Leading ':' with empty namespace: force the main namespace.
			namespace = ""
		default:
			// Not a namespace; treat the whole string as the page name.
			i = -1
		}
	}
	article := strings.TrimSpace(canoReSpaces.ReplaceAllString(l[i+1:], " "))
	anchor = canoReSpaces.ReplaceAllString(anchor, " ")
	if unescape {
		article = html.UnescapeString(article)
		anchor = html.UnescapeString(anchor)
	}
	// MediaWiki page names are case-insensitive in their first letter;
	// canonicalize by upper-casing it.
	if len(article) > 0 {
		article = strings.ToUpper(article[0:1]) + article[1:]
	}
	return WikiLink{Namespace: namespace, PageName: article, Anchor: anchor}
}
|
||||
|
||||
func (wl *WikiLink) FullPagename() string {
|
||||
if len(wl.Namespace) == 0 {
|
||||
return wl.PageName
|
||||
}
|
||||
return wl.Namespace + ":" + wl.PageName
|
||||
}
|
||||
|
||||
func (wl *WikiLink) FullPagenameAnchor() string {
|
||||
ns := ""
|
||||
if len(wl.Namespace) != 0 {
|
||||
ns = wl.Namespace + ":"
|
||||
}
|
||||
an := ""
|
||||
if len(wl.Anchor) != 0 {
|
||||
an = "#" + wl.Anchor
|
||||
}
|
||||
return ns + wl.PageName + an
|
||||
}
|
||||
|
||||
// IsImplicitSelfLink reports whether the link has no page name, i.e. it
// is a bare "#anchor" link to the current page.
func (wl *WikiLink) IsImplicitSelfLink() bool {
	return len(wl.PageName) == 0
}
|
||||
|
||||
// HasAnchor reports whether the link carries a "#anchor" fragment.
func (wl *WikiLink) HasAnchor() bool {
	return len(wl.Anchor) != 0
}
|
||||
|
||||
// GetAnchor returns the link's anchor fragment ("" when absent).
func (wl *WikiLink) GetAnchor() string {
	return wl.Anchor
}
|
||||
|
||||
// Namespaces maps a lower-cased namespace alias to its canonical form.
type Namespaces map[string]string

// StandardNamespaces is the default namespace table (the standard
// English Wikipedia namespaces), keyed by lower-case name.
var StandardNamespaces Namespaces = map[string]string{
	"media":                  "Media",
	"special":                "Special",
	"talk":                   "Talk",
	"user":                   "User",
	"user talk":              "User talk",
	"wikipedia":              "Wikipedia",
	"wikipedia talk":         "Wikipedia talk",
	"file":                   "File",
	"file talk":              "File talk",
	"mediawiki":              "MediaWiki",
	"mediawiki talk":         "MediaWiki talk",
	"template":               "Template",
	"template talk":          "Template talk",
	"help":                   "Help",
	"help talk":              "Help talk",
	"category":               "Category",
	"category talk":          "Category talk",
	"portal":                 "Portal",
	"portal talk":            "Portal talk",
	"book":                   "Book",
	"book talk":              "Book talk",
	"draft":                  "Draft",
	"draft talk":             "Draft talk",
	"education program":      "Education Program",
	"education program talk": "Education Program talk",
	"timedtext":              "TimedText",
	"timedtext talk":         "TimedText talk",
	"module":                 "Module",
	"module talk":            "Module talk",
	"topic":                  "Topic",
}
|
||||
|
||||
// DummyPageGetter is a PageGetter that resolves every page to the empty
// string; useful when transcluded content is not needed (e.g. in tests).
type DummyPageGetter struct{}

// Get implements PageGetter; it ignores the link and returns "".
func (g *DummyPageGetter) Get(wl WikiLink) (string, error) {
	return "", nil
}
|
46
gowiki_test.go
Normal file
46
gowiki_test.go
Normal file
|
@ -0,0 +1,46 @@
|
|||
/*
|
||||
Copyright (C) IBM Corporation 2015, Michele Franceschini <franceschini@us.ibm.com>
|
||||
|
||||
Licensed under the Apache License, Version 2.0 (the "License");
|
||||
you may not use this file except in compliance with the License.
|
||||
You may obtain a copy of the License at
|
||||
|
||||
http://www.apache.org/licenses/LICENSE-2.0
|
||||
|
||||
Unless required by applicable law or agreed to in writing, software
|
||||
distributed under the License is distributed on an "AS IS" BASIS,
|
||||
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
See the License for the specific language governing permissions and
|
||||
limitations under the License.
|
||||
*/
|
||||
|
||||
package gowiki
|
||||
|
||||
import (
|
||||
"encoding/json"
|
||||
// "os"
|
||||
// "strings"
|
||||
"testing"
|
||||
)
|
||||
|
||||
// TestParseArticle checks that a small snippet with nested italic/bold
// quoting inside a link parses without error and that the resulting
// token stream is JSON-serializable (dumped via t.Log for inspection).
func TestParseArticle(t *testing.T) {
	mw := "* ''[[The Album (ABBA album)|''The Album'']]'' (1977)"
	t.Log(mw)
	a, err := ParseArticle("Test", mw, &DummyPageGetter{})
	if err != nil {
		t.Error("Error:", err)
	}
	b, err := json.MarshalIndent(a.Tokens, "", "\t")
	if err != nil {
		t.Error("Error:", err)
	}
	t.Log("Tokens\n")
	t.Log(string(b))
}
|
||||
|
||||
// TestWikiCanonicalFormNamespaceEsc checks namespace canonicalization:
// a mixed-case namespace alias maps to its canonical spelling, the page
// name gets an upper-cased first letter, and the anchor is preserved.
func TestWikiCanonicalFormNamespaceEsc(t *testing.T) {
	wl := StandardNamespaces.WikiCanonicalFormNamespaceEsc("WiKIpEdia:pagename#section", "", true)
	if wl.Namespace != "Wikipedia" || wl.PageName != "Pagename" || wl.Anchor != "section" {
		t.Error("Error: wikilink not parsed correctly", wl)
	}
}
|
636
parse.go
Normal file
636
parse.go
Normal file
|
@ -0,0 +1,636 @@
|
|||
/*
|
||||
Copyright (C) IBM Corporation 2015, Michele Franceschini <franceschini@us.ibm.com>
|
||||
|
||||
Licensed under the Apache License, Version 2.0 (the "License");
|
||||
you may not use this file except in compliance with the License.
|
||||
You may obtain a copy of the License at
|
||||
|
||||
http://www.apache.org/licenses/LICENSE-2.0
|
||||
|
||||
Unless required by applicable law or agreed to in writing, software
|
||||
distributed under the License is distributed on an "AS IS" BASIS,
|
||||
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
See the License for the specific language governing permissions and
|
||||
limitations under the License.
|
||||
*/
|
||||
|
||||
package gowiki
|
||||
|
||||
import (
|
||||
"errors"
|
||||
"fmt"
|
||||
"html"
|
||||
"log"
|
||||
"strconv"
|
||||
"strings"
|
||||
)
|
||||
|
||||
// maxInnerParseErrorCount caps Article.innerParseErrorCount — presumably
// the number of recoverable parse errors tolerated before giving up; the
// enforcing code is not visible in this chunk — TODO confirm.
const maxInnerParseErrorCount = 100
|
||||
|
||||
// ParseNode is a node of the article parse tree built by internalParse.
type ParseNode struct {
	NType    string       // node kind: "root", "text", "link", "image", "html", "extlink", "magic", "math", "redirect", "space", "break", ...
	NSubType string       // refinement of NType, e.g. the html tag name, or "pre"/"nowiki" for text nodes
	Link     WikiLink     // target page for link/image/redirect nodes
	Contents string       // literal text, html attribute string, or raw payload, depending on NType
	Flags    int          // bit flags (see TClosed)
	Nodes    []*ParseNode // child nodes
}
|
||||
|
||||
// PrintParseTree dumps the article's parse tree to stdout, one node per
// line indented by depth. Intended for debugging.
func (a *Article) PrintParseTree() {
	a.printParseTree(a.Root, 0)
}
|
||||
|
||||
func (a *Article) printParseTree(root *ParseNode, depth int) {
|
||||
if depth > 20 {
|
||||
return
|
||||
}
|
||||
spaces := "......................................"
|
||||
min := len(spaces)
|
||||
if depth < len(spaces) {
|
||||
min = depth
|
||||
}
|
||||
if depth < 0 {
|
||||
min = 0
|
||||
}
|
||||
prefix := spaces[0:min]
|
||||
for _, n := range root.Nodes {
|
||||
fmt.Printf("%s NType: %10s NSubType: %10s Contents: %16s Flags: %d\n", prefix, n.NType, n.NSubType, n.Contents, n.Flags)
|
||||
if len(n.Nodes) > 0 {
|
||||
a.printParseTree(n, depth+1)
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
// Flags for ParseNode.Flags.
const (
	// TClosed marks an html node whose source tag was already closed
	// (set when the token's TClosed field is true, e.g. a self-closing tag).
	TClosed int = 1 << iota
)
|
||||
|
||||
// Quote-parsing states for doQuotes: which of italic/bold are currently
// open, and in which nesting order.
const (
	QS_none int = iota // no formatting open
	QS_i               // <i> open
	QS_b               // <b> open
	QS_ib              // <i> opened first, then <b>
	QS_bi              // <b> opened first, then <i>
)
|
||||
|
||||
// ParseArticle tokenizes and parses the given MediaWiki markup for the
// titled page, using g to resolve referenced pages. On success the
// returned Article has Tokens and the parse tree (Root) populated; the
// plain-text rendering is generated lazily by GetText and friends.
// On a tokenize/parse error the partially-built Article is returned
// alongside the error.
func ParseArticle(title, text string, g PageGetter) (*Article, error) {
	a, err := NewArticle(title, text)
	if err != nil {
		return nil, err
	}
	a.Tokens, err = a.Tokenize(a.MediaWiki, g)
	if err != nil {
		return a, err
	}
	err = a.parse()
	if err != nil {
		return a, err
	}
	// Mark the cached text as stale so the getters regenerate it.
	a.gt = false
	return a, nil
}
|
||||
|
||||
// doQuotes rewrites the tokenizer's "quote" tokens (each representing a
// single apostrophe) into explicit html open/close tokens ("i", "/i",
// "b", "/b"), following MediaWiki's stateful quote rules: '' toggles
// italics, ''' toggles bold, ''''' toggles both, a lone apostrophe is
// literal text, and a fourth apostrophe in a run is literal. Formatting
// never crosses a newline or a link boundary: open tags are closed at
// newlines and at link openings (the state is saved and restored at the
// matching link close). a.Tokens is replaced with the rewritten stream.
func (a *Article) doQuotes() {
	log.SetFlags(log.Lshortfile) // | log.Ldate | log.Ltime)
	state := QS_none // which of i/b are open, and in what nesting order
	save := QS_none  // state stashed while inside a link span
	l := 0           // length of the current run of consecutive quote tokens
	ni := 0
	tn := make([]*Token, 0, len(a.Tokens)) // rewritten token stream
	t := a.Tokens
	for ; ni < len(t); ni++ {
		// log.Println(*t[ni])

		// Accumulate consecutive quote tokens into a run of length l.
		if t[ni].TType == "quote" {
			l++
			// log.Println(l)
		}
		// Interpret the pending run once a non-quote token (or the end
		// of input) is reached.
		if t[ni].TType != "quote" || ni == len(t)-1 {
			switch {
			case l == 0:
				// no pending quotes
				// log.Println(l)
			case l == 1:
				// a lone apostrophe is literal text
				// log.Println(l)
				tn = append(tn, &Token{TText: "'", TType: "text"})
			case l == 2:
				// '' toggles italics
				// log.Println(l)
				switch state {
				case QS_b:
					tn = append(tn, &Token{TType: "html", TText: "i"})
					state = QS_bi
				case QS_i:
					tn = append(tn, &Token{TType: "html", TText: "/i"})
					state = QS_none
				case QS_bi:
					tn = append(tn, &Token{TType: "html", TText: "/i"})
					state = QS_b
				case QS_ib:
					// italics is the outer tag: close bold, close
					// italics, then reopen bold to keep tags nested.
					tn = append(tn, &Token{TType: "html", TText: "/b"})
					tn = append(tn, &Token{TType: "html", TText: "/i"})
					tn = append(tn, &Token{TType: "html", TText: "b"})
					state = QS_b
				case QS_none:
					tn = append(tn, &Token{TType: "html", TText: "i"})
					state = QS_i
				}
			case l == 3, l == 4:
				// ''' toggles bold; a fourth apostrophe is literal.
				// log.Println(l)
				if l == 4 {
					tn = append(tn, &Token{TText: "'", TType: "text"})
				}
				switch state {
				case QS_b:
					tn = append(tn, &Token{TType: "html", TText: "/b"})
					state = QS_none
				case QS_i:
					tn = append(tn, &Token{TType: "html", TText: "b"})
					state = QS_ib
				case QS_ib:
					tn = append(tn, &Token{TType: "html", TText: "/b"})
					state = QS_i
				case QS_bi:
					// bold is the outer tag: close italics, close bold,
					// then reopen italics to keep tags nested.
					tn = append(tn, &Token{TType: "html", TText: "/i"})
					tn = append(tn, &Token{TType: "html", TText: "/b"})
					tn = append(tn, &Token{TType: "html", TText: "i"})
					state = QS_i
				case QS_none:
					tn = append(tn, &Token{TType: "html", TText: "b"})
					state = QS_b
				}
			case l >= 5:
				// ''''' toggles both; apostrophes beyond five are literal.
				// log.Println(l)
				s := ""
				for i := 5; i < l; i++ {
					s += "'"
				}
				if len(s) > 0 {
					tn = append(tn, &Token{TText: s, TType: "text"})
				}
				switch state {
				case QS_b:
					tn = append(tn, &Token{TType: "html", TText: "/b"})
					tn = append(tn, &Token{TType: "html", TText: "i"})
					state = QS_i
				case QS_i:
					tn = append(tn, &Token{TType: "html", TText: "/i"})
					tn = append(tn, &Token{TType: "html", TText: "b"})
					state = QS_b
				case QS_ib:
					tn = append(tn, &Token{TType: "html", TText: "/b"})
					tn = append(tn, &Token{TType: "html", TText: "/i"})
					state = QS_none
				case QS_bi:
					tn = append(tn, &Token{TType: "html", TText: "/i"})
					tn = append(tn, &Token{TType: "html", TText: "/b"})
					state = QS_none
				case QS_none:
					tn = append(tn, &Token{TType: "html", TText: "b"})
					tn = append(tn, &Token{TType: "html", TText: "i"})
					state = QS_bi
				}
			}
			l = 0
		}

		// Entering a link: close any open formatting and stash the state
		// so it can be restored when the link closes.
		if t[ni].TType == "link" || t[ni].TType == "extlink" || t[ni].TType == "filelink" {
			// log.Println(l)
			save = state
			switch state {
			case QS_b:
				tn = append(tn, &Token{TType: "html", TText: "/b"})
			case QS_i:
				tn = append(tn, &Token{TType: "html", TText: "/i"})
			case QS_ib:
				tn = append(tn, &Token{TType: "html", TText: "/b"})
				tn = append(tn, &Token{TType: "html", TText: "/i"})
			case QS_bi:
				tn = append(tn, &Token{TType: "html", TText: "/i"})
				tn = append(tn, &Token{TType: "html", TText: "/b"})
			}
			state = QS_none
			l = 0
		}
		// Leaving a link: close formatting opened inside it and restore
		// the state saved at the link opening.
		if t[ni].TType == "closelink" || t[ni].TType == "closeextlink" || t[ni].TType == "closefilelink" {
			// log.Println(l)
			switch state {
			case QS_b:
				tn = append(tn, &Token{TType: "html", TText: "/b"})
			case QS_i:
				tn = append(tn, &Token{TType: "html", TText: "/i"})
			case QS_ib:
				tn = append(tn, &Token{TType: "html", TText: "/b"})
				tn = append(tn, &Token{TType: "html", TText: "/i"})
			case QS_bi:
				tn = append(tn, &Token{TType: "html", TText: "/i"})
				tn = append(tn, &Token{TType: "html", TText: "/b"})
			}
			state = save
			save = QS_none
			l = 0
		}

		// Non-quote, non-newline tokens pass through unchanged.
		if t[ni].TType != "quote" && t[ni].TType != "newline" {
			// log.Println(l)
			tn = append(tn, t[ni])
		}
		// Formatting never spans a line break: close anything still open
		// at a newline or at the end of the input.
		if t[ni].TType == "newline" || ni == len(t)-1 {
			// log.Println(l)
			switch state {
			case QS_b:
				tn = append(tn, &Token{TType: "html", TText: "/b"})
			case QS_i:
				tn = append(tn, &Token{TType: "html", TText: "/i"})
			case QS_ib:
				tn = append(tn, &Token{TType: "html", TText: "/b"})
				tn = append(tn, &Token{TType: "html", TText: "/i"})
			case QS_bi:
				tn = append(tn, &Token{TType: "html", TText: "/i"})
				tn = append(tn, &Token{TType: "html", TText: "/b"})
			}
			state = QS_none
			l = 0
			save = QS_none
		}
		// The newline itself is kept after the closing tags.
		if t[ni].TType == "newline" {
			// log.Println(l)
			tn = append(tn, t[ni])
		}

	}
	a.Tokens = tn
	// a.OldTokens = t
}
|
||||
|
||||
//nowiki, wikipre, pre, math, quote, colon, magic, h?, *, #, ;, :, html,

// parse normalizes quote tokens into html i/b tokens (doQuotes) and then
// builds the parse tree from the token stream, storing its root in
// a.Root and marking the article as parsed.
func (a *Article) parse() error {
	a.doQuotes()
	nodes, err := a.internalParse(a.Tokens)
	if err != nil {
		return err
	}
	root := &ParseNode{NType: "root", Nodes: nodes}
	a.Root = root
	a.Parsed = true
	return nil
}
|
||||
func isImage(t *Token) bool {
|
||||
return strings.ToLower(t.TLink.Namespace) == "file"
|
||||
}
|
||||
|
||||
func (a *Article) internalParse(t []*Token) ([]*ParseNode, error) {
|
||||
ti := 0
|
||||
nl := make([]*ParseNode, 0, 0)
|
||||
lastti := -1
|
||||
for ti < len(t) {
|
||||
if ti == lastti {
|
||||
// fmt.Println(len(t), ti, *t[ti], *t[ti-1], *t[ti+1])
|
||||
return nil, errors.New("parsing issue")
|
||||
}
|
||||
lastti = ti
|
||||
switch t[ti].TType {
|
||||
case "nowiki":
|
||||
n := &ParseNode{NType: "text", NSubType: "nowiki", Contents: html.UnescapeString(t[ti].TText)}
|
||||
nl = append(nl, n)
|
||||
ti++
|
||||
/* case "curlyblock":
|
||||
n := &ParseNode{NType: "curly", Contents: t[ti].TText}
|
||||
nl = append(nl, n)
|
||||
ti++ */
|
||||
case "text":
|
||||
n := &ParseNode{NType: "text", Contents: html.UnescapeString(t[ti].TText)}
|
||||
nl = append(nl, n)
|
||||
ti++
|
||||
case "math":
|
||||
n := &ParseNode{NType: "math", Contents: t[ti].TText}
|
||||
nl = append(nl, n)
|
||||
ti++
|
||||
case "pre":
|
||||
n2 := &ParseNode{NType: "text", NSubType: "pre", Contents: html.UnescapeString(t[ti].TText)}
|
||||
n1 := &ParseNode{NType: "html", NSubType: "pre", Contents: t[ti].TAttr, Nodes: []*ParseNode{n2}}
|
||||
nl = append(nl, n1)
|
||||
ti++
|
||||
case "nop":
|
||||
ti++
|
||||
case "wikipre":
|
||||
closebefore := len(t)
|
||||
ni := ti + 1
|
||||
for ; ni < len(t)-1; ni++ {
|
||||
if t[ni].TType == "newline" {
|
||||
if t[ni+1].TType == "wikipre" {
|
||||
t[ni+1].TType = "nop"
|
||||
} else {
|
||||
closebefore = ni
|
||||
break
|
||||
}
|
||||
}
|
||||
}
|
||||
if closebefore <= ni+1 {
|
||||
n := &ParseNode{NType: "html", NSubType: "pre"}
|
||||
nl = append(nl, n)
|
||||
ti++
|
||||
} else {
|
||||
nodes, err := a.internalParse(t[ti+1 : closebefore])
|
||||
if err != nil {
|
||||
return nil, err
|
||||
}
|
||||
n := &ParseNode{NType: "html", NSubType: "pre", Nodes: nodes}
|
||||
nl = append(nl, n)
|
||||
ti = closebefore
|
||||
}
|
||||
case "extlink":
|
||||
ni := ti + 1
|
||||
for ; ni < len(t); ni++ {
|
||||
if t[ni].TType == "closeextlink" {
|
||||
break
|
||||
}
|
||||
}
|
||||
if ni == len(t) {
|
||||
return nil, errors.New("Unmatched external link token for link: " + t[ti].TText)
|
||||
}
|
||||
n := &ParseNode{NType: "extlink", NSubType: "", Contents: t[ti].TText}
|
||||
a.ExtLinks = append(a.ExtLinks, t[ti].TText)
|
||||
if ni > ti+1 {
|
||||
nodes, err := a.internalParse(t[ti+1 : ni])
|
||||
if err != nil {
|
||||
return nil, err
|
||||
}
|
||||
n.Nodes = nodes
|
||||
}
|
||||
nl = append(nl, n)
|
||||
ti = ni + 1
|
||||
|
||||
case "closeextlink":
|
||||
return nil, errors.New("Unmatched close external link token")
|
||||
case "hrule":
|
||||
n := &ParseNode{NType: "html", NSubType: "hr"}
|
||||
nl = append(nl, n)
|
||||
ti++
|
||||
case "magic":
|
||||
n := &ParseNode{NType: "magic", Contents: t[ti].TText}
|
||||
nl = append(nl, n)
|
||||
ti++
|
||||
case "colon":
|
||||
n := &ParseNode{NType: "text", Contents: ":"}
|
||||
nl = append(nl, n)
|
||||
ti++
|
||||
case "space":
|
||||
n := &ParseNode{NType: "space", Contents: " "}
|
||||
nl = append(nl, n)
|
||||
ti++
|
||||
case "blank":
|
||||
n := &ParseNode{NType: "break"}
|
||||
nl = append(nl, n)
|
||||
ti++
|
||||
case "redirect":
|
||||
ni := ti + 1
|
||||
for ; ni < len(t); ni++ {
|
||||
if t[ni].TType == "newline" {
|
||||
break
|
||||
}
|
||||
if t[ni].TType == "link" {
|
||||
break
|
||||
}
|
||||
}
|
||||
if ni == len(t) || t[ni].TType == "newline" {
|
||||
n := &ParseNode{NType: "text", Contents: html.UnescapeString(t[ti].TText)}
|
||||
nl = append(nl, n)
|
||||
ti++
|
||||
} else {
|
||||
n := &ParseNode{NType: "redirect", Link: t[ni].TLink, NSubType: t[ni].TAttr}
|
||||
nl = append(nl, n)
|
||||
ti++
|
||||
}
|
||||
case "link":
|
||||
ni := ti + 1
|
||||
nopen := 1
|
||||
for ; ni < len(t); ni++ {
|
||||
switch t[ni].TType {
|
||||
case "link":
|
||||
nopen++
|
||||
case "closelink":
|
||||
nopen--
|
||||
}
|
||||
if nopen == 0 {
|
||||
break
|
||||
}
|
||||
}
|
||||
if ni == len(t) {
|
||||
return nil, errors.New("Unmatched link token for link: " + t[ti].TLink.PageName + " namespace: " + t[ti].TLink.Namespace)
|
||||
}
|
||||
var n *ParseNode
|
||||
n = &ParseNode{NType: "link", Link: t[ti].TLink}
|
||||
a.Links = append(a.Links, t[ti].TLink)
|
||||
if ni > ti+1 {
|
||||
nodes, err := a.internalParse(t[ti+1 : ni])
|
||||
if err != nil {
|
||||
return nil, err
|
||||
}
|
||||
n.Nodes = nodes
|
||||
}
|
||||
nl = append(nl, n)
|
||||
ti = ni + 1
|
||||
case "filelink":
|
||||
ni := ti + 1
|
||||
nopen := 1
|
||||
for ; ni < len(t); ni++ {
|
||||
switch t[ni].TType {
|
||||
case "filelink":
|
||||
nopen++
|
||||
case "closefilelink":
|
||||
nopen--
|
||||
}
|
||||
if nopen == 0 {
|
||||
break
|
||||
}
|
||||
}
|
||||
if ni == len(t) {
|
||||
return nil, errors.New("Unmatched filelink token for filelink: " + t[ti].TLink.PageName + " namespace: " + t[ti].TLink.Namespace)
|
||||
}
|
||||
var n *ParseNode
|
||||
n = &ParseNode{NType: "image", Link: t[ti].TLink}
|
||||
a.Media = append(a.Media, t[ti].TLink)
|
||||
if ni > ti+1 {
|
||||
nodes, err := a.internalParse(t[ti+1 : ni])
|
||||
if err != nil {
|
||||
return nil, err
|
||||
}
|
||||
n.Nodes = nodes
|
||||
}
|
||||
nl = append(nl, n)
|
||||
ti = ni + 1
|
||||
|
||||
case "closelink":
|
||||
return nil, errors.New("Unmatched close link token")
|
||||
case "closefilelink":
|
||||
return nil, errors.New("Unmatched close file link token")
|
||||
case "html":
|
||||
tag := strings.ToLower(t[ti].TText)
|
||||
if tag[0] == '/' {
|
||||
ti++
|
||||
continue
|
||||
}
|
||||
n := &ParseNode{NType: "html", NSubType: tag, Contents: t[ti].TAttr}
|
||||
if t[ti].TClosed == true {
|
||||
flags := TClosed
|
||||
n.Flags = flags
|
||||
nl = append(nl, n)
|
||||
ti++
|
||||
continue
|
||||
}
|
||||
ni := ti + 1
|
||||
nopen := 1
|
||||
for ; ni < len(t); ni++ {
|
||||
if t[ni].TType == "html" {
|
||||
ntag := strings.ToLower(t[ni].TText)
|
||||
switch ntag {
|
||||
case tag:
|
||||
nopen++
|
||||
case "/" + tag:
|
||||
nopen--
|
||||
}
|
||||
if nopen == 0 {
|
||||
break
|
||||
}
|
||||
}
|
||||
}
|
||||
if ni > ti+1 {
|
||||
nodes, err := a.internalParse(t[ti+1 : ni])
|
||||
if err != nil {
|
||||
a.innerParseErrorCount++
|
||||
if a.innerParseErrorCount >= maxInnerParseErrorCount {
|
||||
return nil, err
|
||||
}
|
||||
ti++
|
||||
continue
|
||||
}
|
||||
n.Nodes = nodes
|
||||
}
|
||||
nl = append(nl, n)
|
||||
ti = ni + 1
|
||||
if ti > len(t) {
|
||||
ti = len(t)
|
||||
}
|
||||
case "*", "#", ";", ":":
|
||||
ti += 1
|
||||
/* stack := ""
|
||||
si := 0
|
||||
ni := ti
|
||||
ln := &ParseNode{NType: "root", Nodes: make([]*ParseNode, 0, 4)}
|
||||
for {
|
||||
|
||||
this := ""
|
||||
islist := false
|
||||
for ; ni < len(t); ni++ {
|
||||
switch t[ni].TType {
|
||||
case "*", "#", ";", ":":
|
||||
islist = true
|
||||
}
|
||||
if islist {
|
||||
this += t[ni].TType
|
||||
} else {
|
||||
break
|
||||
}
|
||||
}
|
||||
same := 0
|
||||
for i := 0; i < len(this) && i < len(stack); i++ {
|
||||
if this[i] == stack[i] ||
|
||||
(this[i] == ';' && stack[i] == ':') ||
|
||||
(this[i] == ':' && stack[i] == ';') {
|
||||
same++
|
||||
} else {
|
||||
break
|
||||
}
|
||||
}
|
||||
n := ln
|
||||
for i := 0; i < same; i++ {
|
||||
n = n.Nodes[len(n.Nodes)-1]
|
||||
n = n.Nodes[len(n.Nodes)-1]
|
||||
}
|
||||
|
||||
for i := same; i < len(this); i++ { //open
|
||||
var nn *ParseNode
|
||||
switch this[i] {
|
||||
case '*':
|
||||
nn = &ParseNode{NType: "html", NSubType: "ul"}
|
||||
case '#':
|
||||
nn = &ParseNode{NType: "html", NSubType: "ol"}
|
||||
case ';':
|
||||
nn = &ParseNode{NType: "html", NSubType: "dl"}
|
||||
case ':':
|
||||
nn = &ParseNode{NType: "html", NSubType: "dl"}
|
||||
}
|
||||
nn.Nodes = make([]*ParseNode, 0, 1)
|
||||
n.Nodes = append(n.Nodes, nn)
|
||||
n = nn
|
||||
if i < len(this)-1 {
|
||||
var elem *ParseNode
|
||||
switch this[len] {
|
||||
case '*', '#':
|
||||
elem = &ParseNode{NType: "html", NSubType: "li"}
|
||||
case ';':
|
||||
elem = &ParseNode{NType: "html", NSubType: "dt"}
|
||||
case ':':
|
||||
elem = &ParseNode{NType: "html", NSubType: "dd"}
|
||||
}
|
||||
elem.Nodes = make([]*ParseNode, 0, 1)
|
||||
n.Nodes = append(n.Nodes, elem)
|
||||
n = elem
|
||||
}
|
||||
}
|
||||
var nitem *ParseNode
|
||||
switch this[len] {
|
||||
case '*', '#':
|
||||
nitem = &ParseNode{NType: "html", NSubType: "li"}
|
||||
case ';':
|
||||
nitem = &ParseNode{NType: "html", NSubType: "dt"}
|
||||
case ':':
|
||||
nitem = &ParseNode{NType: "html", NSubType: "dd"}
|
||||
}
|
||||
n := &ParseNode{NType: "html", NSubType: st}
|
||||
nl = append(nl, n)
|
||||
|
||||
} */
|
||||
case "newline":
|
||||
n := &ParseNode{NType: "text", Contents: "\n"}
|
||||
nl = append(nl, n)
|
||||
ti++
|
||||
case "h1", "h2", "h3", "h4", "h5", "h6":
|
||||
ni := ti + 1
|
||||
for ; ni < len(t); ni++ {
|
||||
if t[ni].TType == "newline" {
|
||||
break
|
||||
}
|
||||
}
|
||||
if ni == len(t) {
|
||||
return nil, errors.New("No newline after heading")
|
||||
}
|
||||
n := &ParseNode{NType: "html", NSubType: t[ti].TType}
|
||||
if ni > ti+1 {
|
||||
nodes, err := a.internalParse(t[ti+1 : ni])
|
||||
if err != nil {
|
||||
return nil, err
|
||||
}
|
||||
n.Nodes = nodes
|
||||
}
|
||||
nl = append(nl, n)
|
||||
ti = ni + 1
|
||||
case "tb", "te":
|
||||
templateIndex, err := strconv.Atoi(t[ti].TText)
|
||||
if err != nil {
|
||||
return nil, errors.New("Malformed tb token")
|
||||
}
|
||||
if templateIndex >= len(a.Templates) {
|
||||
return nil, errors.New("Template index out of range")
|
||||
//fmt.Println("Template index out of range", t[ti])
|
||||
} else {
|
||||
n := &ParseNode{NType: t[ti].TType, Contents: a.Templates[templateIndex].Name}
|
||||
nl = append(nl, n)
|
||||
}
|
||||
ti++
|
||||
|
||||
default:
|
||||
return nil, errors.New("Unrecognized token type: " + t[ti].TType)
|
||||
}
|
||||
}
|
||||
return nl, nil
|
||||
}
|
39
redirect.go
Normal file
39
redirect.go
Normal file
|
@ -0,0 +1,39 @@
|
|||
/*
|
||||
Copyright (C) IBM Corporation 2015, Michele Franceschini <franceschini@us.ibm.com>
|
||||
|
||||
Licensed under the Apache License, Version 2.0 (the "License");
|
||||
you may not use this file except in compliance with the License.
|
||||
You may obtain a copy of the License at
|
||||
|
||||
http://www.apache.org/licenses/LICENSE-2.0
|
||||
|
||||
Unless required by applicable law or agreed to in writing, software
|
||||
distributed under the License is distributed on an "AS IS" BASIS,
|
||||
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
See the License for the specific language governing permissions and
|
||||
limitations under the License.
|
||||
*/
|
||||
|
||||
package gowiki
|
||||
|
||||
import "strings"
|
||||
|
||||
func (a *Article) checkRedirect(mw string) (bool, *WikiLink) {
|
||||
if len(mw) < 9 || strings.ToLower(mw[0:9]) != "#redirect" {
|
||||
return false, nil
|
||||
}
|
||||
idx := strings.Index(mw, "\n")
|
||||
if idx < 0 {
|
||||
idx = len(mw)
|
||||
}
|
||||
nnt, err := a.parseInlineText(mw, 9, idx)
|
||||
if err != nil {
|
||||
return false, nil
|
||||
}
|
||||
for _, t := range nnt {
|
||||
if t.TType == "link" {
|
||||
return true, &t.TLink
|
||||
}
|
||||
}
|
||||
return false, nil
|
||||
}
|
27
simple.go
Normal file
27
simple.go
Normal file
|
@ -0,0 +1,27 @@
|
|||
/*
|
||||
Copyright (C) IBM Corporation 2015, Michele Franceschini <franceschini@us.ibm.com>
|
||||
|
||||
Licensed under the Apache License, Version 2.0 (the "License");
|
||||
you may not use this file except in compliance with the License.
|
||||
You may obtain a copy of the License at
|
||||
|
||||
http://www.apache.org/licenses/LICENSE-2.0
|
||||
|
||||
Unless required by applicable law or agreed to in writing, software
|
||||
distributed under the License is distributed on an "AS IS" BASIS,
|
||||
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
See the License for the specific language governing permissions and
|
||||
limitations under the License.
|
||||
*/
|
||||
|
||||
package gowiki
|
||||
|
||||
// "bytes"
|
||||
// "errors"
|
||||
// "fmt"
|
||||
|
||||
func (a *Article) ParseSimple() error {
|
||||
a.Text = a.MediaWiki
|
||||
a.Parsed = true
|
||||
return nil
|
||||
}
|
660
template.go
Normal file
660
template.go
Normal file
|
@ -0,0 +1,660 @@
|
|||
/*
|
||||
Copyright (C) IBM Corporation 2015, Michele Franceschini <franceschini@us.ibm.com>
|
||||
|
||||
Licensed under the Apache License, Version 2.0 (the "License");
|
||||
you may not use this file except in compliance with the License.
|
||||
You may obtain a copy of the License at
|
||||
|
||||
http://www.apache.org/licenses/LICENSE-2.0
|
||||
|
||||
Unless required by applicable law or agreed to in writing, software
|
||||
distributed under the License is distributed on an "AS IS" BASIS,
|
||||
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
See the License for the specific language governing permissions and
|
||||
limitations under the License.
|
||||
*/
|
||||
|
||||
package gowiki
|
||||
|
||||
import (
|
||||
"fmt"
|
||||
"os"
|
||||
"regexp"
|
||||
"sort"
|
||||
"strings"
|
||||
)
|
||||
|
||||
// Template records one template invocation found in an article.
type Template struct {
	// Typ classifies the template: "magic" (built-in magic word),
	// "normal" (page transclusion), "ext" (parser function), or
	// "param" ({{{...}}} parameter substitution).
	Typ string `json:"type"` //magic,normal,ext,param
	// Name is the template's base name (for magic templates, the part
	// before the ':').
	Name string `json:"name"`
	Attr string `json:"attr"` //text after the ':' in magic templates
	// Parameters maps parameter names — or "1", "2", ... for positional
	// parameters — to their raw text values.
	Parameters map[string]string `json:"parameters"`
}
|
||||
|
||||
// parseTemplateEtc is a placeholder for line-level template parsing.
// It is not implemented: it always returns nil.
func (a *Article) parseTemplateEtc(l string) []Template {
	return nil
}
|
||||
|
||||
// streak describes one run of consecutive '{' or '}' characters in the
// source, as located by findCurlyStreaks. The length/b/e fields are
// mutated by findTemplates as braces are consumed during matching.
type streak struct {
	opening bool // true for a '{' run, false for a '}' run
	length  int  // number of braces still unconsumed in this run
	b       int  // byte offset of the first unconsumed brace
	e       int  // byte offset one past the last unconsumed brace
}

// template is the internal representation of one {{...}} or {{{...}}}
// span while templates are being matched and rendered.
type template struct {
	b        int         // byte offset of the span start (at the opening braces)
	e        int         // byte offset one past the span end (after the closing braces)
	isparam  bool        // true for a {{{param}}} (triple-brace) span
	children []*template // templates nested inside this span (see findTemplates)
	rt       string      // rendered replacement text, filled by renderInnerTemplates
	rendered bool        // true once rt is valid
}

// byStart orders templates by their starting byte offset (sort.Interface).
type byStart []*template

func (a byStart) Len() int           { return len(a) }
func (a byStart) Swap(i, j int)      { a[i], a[j] = a[j], a[i] }
func (a byStart) Less(i, j int) bool { return a[i].b < a[j].b }

// templateStreaksRe matches runs of opening or closing braces.
// NOTE(review): appears unused — the scan is done by findCurlyStreaks
// instead (see the commented-out call in findTemplates).
var templateStreaksRe = regexp.MustCompile(`(?:\{\{+)|(?:\}\}+)`)
|
||||
|
||||
// findCurlyStreaks scans mw and returns the byte ranges [start, end) of
// every run of two or more consecutive '{' characters or two or more
// consecutive '}' characters. Single braces are ignored.
func findCurlyStreaks(mw string) [][]int {
	ranges := [][]int{}
	cur := '.' // rune of the run in progress; '.' is a harmless sentinel
	start := 0 // byte offset where the current run began
	for i, r := range mw {
		if r == cur {
			continue // still inside the same run
		}
		// Run ended at i: record it if it was a brace run of length >= 2.
		if (cur == '{' || cur == '}') && i-start > 1 {
			ranges = append(ranges, []int{start, i})
		}
		start, cur = i, r
	}
	// Flush the final run, which extends to the end of the string.
	if (cur == '{' || cur == '}') && start < len(mw)-1 {
		ranges = append(ranges, []int{start, len(mw)})
	}
	return ranges
}
|
||||
|
||||
// findTemplates locates every template span ({{...}}) and parameter span
// ({{{...}}}) in mw. Each run of closing braces is matched against the
// nearest preceding run of opening braces; three braces are consumed when
// both runs have at least three (a {{{param}}}), otherwise two. The
// returned slice contains only the outermost spans, sorted by start
// offset; nested spans are attached to their enclosing span's children.
func findTemplates(mw string) []*template {
	// tsl := templateStreaksRe.FindAllStringSubmatchIndex(mw, -1)
	tsl := findCurlyStreaks(mw)
	// fmt.Println(tsl)
	streaks := make([]streak, 0, len(tsl))
	for _, pair := range tsl {
		streaks = append(streaks, streak{
			opening: (mw[pair[0]] == '{'),
			length:  pair[1] - pair[0],
			b:       pair[0],
			e:       pair[1],
		})
	}
	// fmt.Println(streaks)
	tl := make([]*template, 0, 8)
	i := 0
	for i < len(streaks) {
		if !streaks[i].opening && streaks[i].length > 1 { // found a closing set: search for the opening
			found := false
			for j := i - 1; j >= 0; j-- {
				if streaks[j].opening && streaks[j].length > 1 {
					found = true
					n := 2
					isparam := false
					if streaks[i].length > 2 && streaks[j].length > 2 {
						n = 3
						isparam = true
					}
					// The span runs from the last n braces of the opening
					// run to the first n braces of the closing run.
					tl = append(tl, &template{
						isparam: isparam,
						b:       streaks[j].e - n,
						e:       streaks[i].b + n,
					})
					// Consume the matched braces from both runs; leftover
					// braces may participate in further matches.
					streaks[i].length -= n
					streaks[i].b += n
					streaks[j].length -= n
					streaks[j].e -= n
					break
				}
			}
			if found {
				// Retry the same closing run: it may still have braces
				// left to match an earlier opening run.
				continue
			}
		}
		i++
	}
	sort.Sort(byStart(tl))
	/* fmt.Println("Templates found:")
	for i := range tl {
		fmt.Println(tl[i])
	} */
	// Containment pass: keep outermost spans in out, and attach each
	// nested span to the closest preceding span that encloses it.
	out := make([]*template, 0, 4)
	cur_end := 0
	for i := range tl {
		tl[i].children = []*template{}
		if tl[i].b >= cur_end {
			cur_end = tl[i].e
			out = append(out, tl[i])
		} else {
			for j := i - 1; j >= 0; j-- {
				if tl[j].e > tl[i].e {
					tl[j].children = append(tl[j].children, tl[i])
					break
				}
			}
		}
	}
	/* fmt.Println("Templates out:")
	for i := range out {
		fmt.Println(out[i])
	}*/
	/* fmt.Println("Templates found:")
	for i := range tl {
		fmt.Println(mw[tl[i].b:tl[i].e])
	}
	*/
	return out
}
|
||||
|
||||
// findTemplateParamPos scans the span of template t in mw and returns one
// entry per top-level '|' separator: the entry's first element is the
// byte offset of the pipe, and its optional second element is the offset
// of the first '=' after that pipe (marking a named parameter). Pipes
// inside nested child templates are skipped, as are pipes inside [[...]]
// links — link state is tracked by watching for two consecutive '[' or
// ']' characters (nested/unbalanced brackets are not handled precisely).
func findTemplateParamPos(mw string, t *template) [][]int { //first is position of pipe, second is position of first equal
	out := make([][]int, 0, 1)
	inChildTemplate := false
	inlink := false
	lastopen := false   // previous rune was '['
	lastclosed := false // previous rune was ']'
	for i, rv := range mw[t.b:t.e] {
		inChildTemplate = false
		open := false
		closed := false
		// Positions inside a nested template belong to that template's
		// own parameter list, not ours.
		for _, ct := range t.children {
			if i+t.b >= ct.b && i+t.b < ct.e {
				inChildTemplate = true
				break
			}
		}
		if !inChildTemplate {
			switch {
			case rv == '[':
				if lastopen { // second consecutive '[': entering a link
					inlink = true
				}
				open = true
			case rv == ']':
				if lastclosed { // second consecutive ']': leaving the link
					inlink = false
				}
				closed = true
			case rv == '|' && !inlink:
				out = append(out, []int{i + t.b})
			case rv == '=' && len(out) > 0 && len(out[len(out)-1]) == 1 && !inlink:
				// Only the first '=' after the most recent pipe counts;
				// an '=' before any pipe is ignored.
				out[len(out)-1] = append(out[len(out)-1], i+t.b)
			}
		}
		lastopen = open
		lastclosed = closed
	}
	return out
}
|
||||
|
||||
/*func (a *Article) processTemplates(mw string, tokens map[string]*Token) (string, map[string]*Token) {
|
||||
mlt := findTemplates(mw)
|
||||
last := 0
|
||||
out := make([]byte, 0, len(mw))
|
||||
// tokens := make(map[string]*Token, len(mlt))
|
||||
for i, t := range mlt {
|
||||
sb := fmt.Sprintf("\x07tb%05d", i)
|
||||
se := fmt.Sprintf("\x07te%05d", i)
|
||||
out = append(out, []byte(mw[last:t.b])...)
|
||||
out = append(out, []byte(sb+a.renderTemplate(mw, t)+se)...)
|
||||
last = t.e
|
||||
tokens[sb] = &Token{
|
||||
TText: fmt.Sprintf("%d", i),
|
||||
TType: "tb",
|
||||
}
|
||||
tokens[se] = &Token{
|
||||
TText: fmt.Sprintf("%d", i),
|
||||
TType: "te",
|
||||
}
|
||||
|
||||
}
|
||||
out = append(out, []byte(mw[last:])...)
|
||||
return string(out), tokens
|
||||
} */
|
||||
|
||||
// processTemplates replaces every top-level template in mws with its
// rendered text bracketed by sentinel markers ("\x07tbNNNNN" and
// "\x07teNNNNN"), registers those sentinels in tokens as "tb"/"te"
// tokens carrying the template index, and records each template
// (name + parameters) on the article via addTemplate. Returns the
// rewritten text and the same tokens map.
func (a *Article) processTemplates(mws string, tokens map[string]*Token, g PageGetter) (string, map[string]*Token) {
	//strip nowiki noinclude etc here
	// mws := a.stripComments(mw)
	// mws = a.stripNoinclude(mws)

	// fmt.Println(mws)
	mlt := findTemplates(mws)

	last := 0
	out := make([]byte, 0, len(mws))
	for i, t := range mlt {
		// fmt.Println("Process templates:", *t)
		sb := fmt.Sprintf("\x07tb%05d", i)
		se := fmt.Sprintf("\x07te%05d", i)
		// Render nested templates bottom-up; this fills t.rt and yields
		// the template's name and parameter map.
		tn, pm := a.renderInnerTemplates(mws, t, nil, g, 0)
		a.addTemplate(tn, pm)
		out = append(out, []byte(mws[last:t.b])...)
		out = append(out, []byte(sb+t.rt+se)...)
		last = t.e
		tokens[sb] = &Token{
			TText: fmt.Sprintf("%d", i),
			TType: "tb",
		}
		tokens[se] = &Token{
			TText: fmt.Sprintf("%d", i),
			TType: "te",
		}
	}
	out = append(out, []byte(mws[last:])...)

	//unstrip here

	return string(out), tokens
}
|
||||
|
||||
func (a *Article) addTemplate(tn string, pm map[string]string) {
|
||||
outT := Template{Parameters: pm}
|
||||
base, attr, typ, _ := detectTemplateType(tn)
|
||||
outT.Typ = typ
|
||||
outT.Name = base
|
||||
outT.Attr = attr
|
||||
a.Templates = append(a.Templates, &outT)
|
||||
return
|
||||
}
|
||||
|
||||
func (a *Article) renderTemplate(mw string, t *template) string {
|
||||
pp := findTemplateParamPos(mw, t)
|
||||
n := 2
|
||||
if t.isparam {
|
||||
n = 3
|
||||
}
|
||||
var tn string
|
||||
if len(pp) > 0 {
|
||||
tn = fmt.Sprint(strings.TrimSpace(mw[t.b+n : pp[0][0]]))
|
||||
} else {
|
||||
tn = fmt.Sprint(strings.TrimSpace(mw[t.b+n : t.e-n]))
|
||||
}
|
||||
pm := make(map[string]string, len(pp))
|
||||
pp = append(pp, []int{t.e - n})
|
||||
for i := 0; i < len(pp)-1; i++ {
|
||||
var name string
|
||||
var param string
|
||||
if len(pp[i]) > 1 { //named param
|
||||
name = fmt.Sprint(strings.TrimSpace(mw[pp[i][0]+1 : pp[i][1]]))
|
||||
param = fmt.Sprint(strings.TrimSpace(mw[pp[i][1]+1 : pp[i+1][0]]))
|
||||
} else {
|
||||
name = fmt.Sprint(i + 1)
|
||||
param = fmt.Sprint(strings.TrimSpace(mw[pp[i][0]+1 : pp[i+1][0]]))
|
||||
}
|
||||
pm[name] = param
|
||||
}
|
||||
|
||||
outT := Template{Parameters: pm}
|
||||
base, attr, typ, text := detectTemplateType(tn)
|
||||
switch {
|
||||
case t.isparam:
|
||||
outT.Typ = "param"
|
||||
outT.Name = tn
|
||||
text = ""
|
||||
default:
|
||||
outT.Typ = typ
|
||||
outT.Name = base
|
||||
outT.Attr = attr
|
||||
}
|
||||
a.Templates = append(a.Templates, &outT)
|
||||
return text
|
||||
}
|
||||
|
||||
// detectTemplateType splits a template name of the form "base:attr" and
// classifies it. It returns (base, attr, type, replacementText): a name
// whose base appears in MagicMap is "magic"; everything else is "normal".
// Note that in the "normal" case the full unsplit name tn is returned as
// the base. The replacement text is "" on every path.
func detectTemplateType(tn string) (string, string, string, string) {
	index := strings.Index(tn, ":")
	var base string
	var attr string
	if index > 0 {
		base = strings.TrimSpace(tn[:index])
		attr = strings.TrimSpace(tn[index+1:])
	} else {
		// NOTE(review): base is not TrimSpace'd on this path, unlike the
		// colon branch — confirm callers always pass trimmed names.
		base = tn
	}
	// Magic templates are looked up by their (case-sensitive) base name.
	_, ok := MagicMap[base]
	if ok {
		return base, attr, "magic", ""
	}

	return tn, "", "normal", ""
}
|
||||
|
||||
// TemplateRenderer renders a template invocation (name, source text,
// parameter map) to its replacement text.
type TemplateRenderer func(name, mw string, params map[string]string) string

// MagicMap lists "magic" template names recognized by detectTemplateType.
// The renderer values are currently unused (nil).
var MagicMap map[string]TemplateRenderer = map[string]TemplateRenderer{
	"DISPLAYTITLE": nil,
}

// noHashFunctionsMap lists MediaWiki parser functions that are invoked
// without a leading '#'; templateType classifies these as "magic".
// Keys are lowercase.
var noHashFunctionsMap map[string]bool = map[string]bool{
	"displaytitle":     true,
	"formatdate":       true,
	"int":              true,
	"namespace":        true,
	"pagesinnamespace": true,
	"speciale":         true,
	"special":          true,
	"tag":              true,
	"anchorencode": true, "basepagenamee": true, "basepagename": true, "canonicalurle": true,
	"canonicalurl": true, "cascadingsources": true, "defaultsort": true, "filepath": true,
	"formatnum": true, "fullpagenamee": true, "fullpagename": true, "fullurle": true,
	"fullurl": true, "gender": true, "grammar": true, "language": true,
	"lcfirst": true, "lc": true, "localurle": true, "localurl": true,
	"namespacee": true, "namespacenumber": true, "nse": true, "ns": true,
	"numberingroup": true, "numberofactiveusers": true, "numberofadmins": true, "numberofarticles": true,
	"numberofedits": true, "numberoffiles": true, "numberofpages": true, "numberofusers": true,
	"numberofviews": true, "padleft": true, "padright": true, "pageid": true,
	"pagenamee": true, "pagename": true, "pagesincategory": true, "pagesize": true,
	"plural": true, "protectionlevel": true, "revisionday2": true, "revisionday": true,
	"revisionid": true, "revisionmonth1": true, "revisionmonth": true, "revisiontimestamp": true,
	"revisionuser": true, "revisionyear": true, "rootpagenamee": true, "rootpagename": true,
	"subjectpagenamee": true, "subjectpagename": true, "subjectspacee": true, "subjectspace": true,
	"subpagenamee": true, "subpagename": true, "talkpagenamee": true, "talkpagename": true,
	"talkspacee": true, "talkspace": true, "ucfirst": true, "uc": true,
	"urlencode": true,
}

// variablesMap lists MediaWiki built-in variables; templateType also
// classifies these as "magic". Keys are lowercase.
var variablesMap map[string]bool = map[string]bool{
	"articlepath":         true,
	"basepagenamee":       true,
	"basepagename":        true,
	"cascadingsources":    true,
	"contentlanguage":     true,
	"currentday2":         true,
	"currentdayname":      true,
	"currentday":          true,
	"currentdow":          true,
	"currenthour":         true,
	"currentmonth1":       true,
	"currentmonthabbrev":  true,
	"currentmonthnamegen": true,
	"currentmonthname":    true,
	"currentmonth":        true,
	"currenttimestamp":    true,
	"currenttime":         true,
	"currentversion":      true,
	"currentweek":         true,
	"currentyear":         true,
	"directionmark":       true,
	"fullpagenamee":       true,
	"fullpagename":        true,
	"localday2":           true,
	"localdayname":        true,
	"localday":            true,
	"localdow":            true,
	"localhour":           true,
	"localmonth1":         true,
	"localmonthabbrev":    true,
	"localmonthnamegen":   true,
	"localmonthname":      true,
	"localmonth":          true,
	"localtimestamp":      true,
	"localtime":           true,
	"localweek":           true,
	"localyear":           true,
	"namespacee":          true,
	"namespacenumber":     true,
	"namespace":           true,
	"numberofactiveusers": true,
	"numberofadmins":      true,
	"numberofarticles":    true,
	"numberofedits":       true,
	"numberoffiles":       true,
	"numberofpages":       true,
	"numberofusers":       true,
	"numberofviews":       true,
	"pageid":              true,
	"pagenamee":           true,
	"pagename":            true,
	"revisionday2":        true,
	"revisionday":         true,
	"revisionid":          true,
	"revisionmonth1":      true,
	"revisionmonth":       true,
	"revisionsize":        true,
	"revisiontimestamp":   true,
	"revisionuser":        true,
	"revisionyear":        true,
	"rootpagenamee":       true,
	"rootpagename":        true,
	"scriptpath":          true,
	"servername":          true,
	"server":              true,
	"sitename":            true,
	"stylepath":           true,
	"subjectpagenamee":    true,
	"subjectpagename":     true,
	"subjectspacee":       true,
	"subjectspace":        true,
	"subpagenamee":        true,
	"subpagename":         true,
	"talkpagenamee":       true,
	"talkpagename":        true,
	"talkspacee":          true,
	"talkspace":           true,
}
|
||||
|
||||
// renderTemplateMagic renders a magic word / no-hash parser-function
// template. Not implemented: always returns the empty string.
func (a *Article) renderTemplateMagic(name string, params map[string]string) string {
	return ""
}

// renderTemplateExt renders a '#'-prefixed parser-function template.
// Not implemented: always returns the empty string.
func (a *Article) renderTemplateExt(name string, params map[string]string) string {
	return ""
}
|
||||
|
||||
// renderTemplateRecursive renders the template named name with the given
// parameters, recursing into transcluded pages via
// TranscludeTemplatesRecursive. Rendering gives up (returns "") beyond
// depth 4 or when the template page cannot be retrieved.
func (a *Article) renderTemplateRecursive(name string, params map[string]string, g PageGetter, depth int) string {
	if depth > 4 {
		return ""
	}
	// Name and parameters have already been substituted, so they are
	// guaranteed not to contain any template.

	// Establish the type of template.
	switch templateType(name) {
	case "magic":
		return a.renderTemplateMagic(name, params)
	case "ext":
		return a.renderTemplateExt(name, params)
	}
	// case "normal": fetch the template page and transclude it,
	// substituting the parameters into its text.
	mw, err := g.Get(WikiCanonicalFormNamespace(name, "Template"))
	if err != nil {
		// Best effort: log to stderr and render nothing.
		fmt.Fprintln(os.Stderr, "Title:", a.Title, " Error retrieving:", name, " ->", err)
		return ""
	}
	return a.TranscludeTemplatesRecursive(mw, params, g, depth)
}
|
||||
|
||||
func (a *Article) TranscludeTemplatesRecursive(mw string, params map[string]string, g PageGetter, depth int) string {
|
||||
var mws string
|
||||
followed := 0
|
||||
for {
|
||||
if followed > 4 {
|
||||
return ""
|
||||
}
|
||||
//strip nowiki noinclude etc here
|
||||
mws := a.stripComments(mw)
|
||||
isRedirect, redirect := a.checkRedirect(mws)
|
||||
if !isRedirect {
|
||||
break
|
||||
}
|
||||
var err error
|
||||
mw, err = g.Get(*redirect)
|
||||
if err != nil {
|
||||
return ""
|
||||
}
|
||||
followed++
|
||||
}
|
||||
mws = a.stripNoinclude(mws)
|
||||
|
||||
// fmt.Println(ds[depth], "TranscludeTemplatesRecursive", mws)
|
||||
mlt := findTemplates(mws)
|
||||
|
||||
last := 0
|
||||
out := make([]byte, 0, len(mws))
|
||||
for _, t := range mlt {
|
||||
a.renderInnerTemplates(mws, t, params, g, depth)
|
||||
out = append(out, []byte(mws[last:t.b])...)
|
||||
out = append(out, []byte(t.rt)...)
|
||||
last = t.e
|
||||
}
|
||||
out = append(out, []byte(mws[last:])...)
|
||||
|
||||
//unstrip here
|
||||
|
||||
return string(out)
|
||||
}
|
||||
|
||||
// ds holds per-depth indentation prefixes used by the commented-out
// debug prints in the template-rendering functions.
var ds []string = []string{" ", " ", " ", " ", " ", " "}
|
||||
|
||||
// renderInnerTemplates renders template t (and, first, all templates
// nested inside it), storing the result in t.rt. For a {{{param}}} span
// it substitutes from params (or the span's default, or re-emits the
// literal "{{{name}}}") and returns ("", nil). For a normal template it
// renders via renderTemplateRecursive and returns the template's name
// and parameter map so the caller can record them.
//
// The offset bookkeeping below rebuilds the template's text with each
// child's rendered text spliced in, while translating the pipe/equal
// positions from findTemplateParamPos into the rebuilt string.
func (a *Article) renderInnerTemplates(mws string, t *template, params map[string]string, g PageGetter, depth int) (string, map[string]string) {
	// render inner templates first
	// fmt.Println(ds[depth], *t, "\n", ds[depth], "Template:\n", ds[depth], mws[t.b:t.e])
	for _, it := range t.children {
		if !it.rendered {
			a.renderInnerTemplates(mws, it, params, g, depth)
		}
	}
	// fmt.Println(ds[depth], "Working on", mws[t.b:t.e])
	pp := findTemplateParamPos(mws, t) //position of the pipes for this template
	// fmt.Println(ds[depth], "pp:", pp)

	// n is the number of delimiting braces: 3 for {{{...}}}, 2 for {{...}}.
	n := 2
	if t.isparam {
		n = 3
	}
	// Sentinel entry marking the end of the last parameter.
	pp = append(pp, []int{t.e - n})

	// mw/tb: the (possibly rebuilt) text of this template and the offset
	// of its start within that text.
	var mw string
	var tb int
	// var te int
	if len(t.children) == 0 {
		// No nested templates: work directly on the original text.
		mw = mws
		tb = t.b
		// te = t.e
	} else {
		// Substitute each child's rendered text and shift the recorded
		// positions in pp by the accumulated size difference (ooff).
		tci := 0                    // index of the next child to splice
		ioff := t.children[tci].b   // input offset where that child starts
		tb = 0
		mw = mws[t.b:ioff]
		// fmt.Println(*t)
		ooff := -t.b // input-to-output offset correction
		ppi0 := 0    // indices walking pp entry by entry, element by element
		ppi1 := 0
		for ppi0 < len(pp) {
			// fmt.Println(mws)
			// fmt.Println(len(mws), tci, ioff, ooff, ppi0, ppi1, pp)
			if pp[ppi0][ppi1] <= ioff {
				// Position precedes the next child: just translate it.
				pp[ppi0][ppi1] += ooff
				ppi1++
				if ppi1 >= len(pp[ppi0]) {
					ppi0++
					ppi1 = 0
				}
			} else {
				// Splice the child's rendered text and advance to the
				// text between this child and the next one.
				mw += t.children[tci].rt
				ooff += len(t.children[tci].rt) - (t.children[tci].e - t.children[tci].b)
				teoff := t.children[tci].e
				tci++
				if tci >= len(t.children) {
					ioff = t.e
				} else {
					ioff = t.children[tci].b
				}
				// fmt.Println(ds[depth], tci, teoff, ioff)
				mw += mws[teoff:ioff]
			}
		}
		// te = len(mw)
	}
	// fmt.Println("len(mw):", len(mw), "mw:", mw, "\npp:", pp)
	// Template name: between the opening braces and the first pipe (or
	// the closing braces when pp holds only the end sentinel).
	var tn string
	if len(pp) > 1 {
		tn = fmt.Sprint(strings.TrimSpace(mw[tb+n : pp[0][0]]))
	} else {
		tn = fmt.Sprint(strings.TrimSpace(mw[tb+n : pp[len(pp)-1][0]]))
	}

	t.rendered = true
	if t.isparam { //it's a parameter substitution
		text, ok := params[tn]
		if ok {
			t.rt = text
			return "", nil
		}
		if len(pp) == 1 { //no default
			t.rt = "{{{" + tn + "}}}"
			return "", nil
		}
		// Use the default value: everything after the first pipe.
		t.rt = mw[pp[0][0]+1 : pp[len(pp)-1][0]]
		return "", nil
	}
	// Collect the parameter map; positional parameters get keys "1", "2", ...
	pm := make(map[string]string, len(pp))
	for i := 0; i < len(pp)-1; i++ {
		var name string
		var param string
		if len(pp[i]) > 1 { //named param
			name = fmt.Sprint(strings.TrimSpace(mw[pp[i][0]+1 : pp[i][1]]))
			param = fmt.Sprint(strings.TrimSpace(mw[pp[i][1]+1 : pp[i+1][0]]))
		} else {
			name = fmt.Sprint(i + 1)
			param = fmt.Sprint(strings.TrimSpace(mw[pp[i][0]+1 : pp[i+1][0]]))
		}
		pm[name] = param
	}
	t.rt = a.renderTemplateRecursive(tn, pm, g, depth+1)
	return tn, pm
}
|
||||
|
||||
// templateType classifies a template name: "magic" for known no-hash
// parser functions and built-in variables (case-insensitive base lookup),
// "ext" for '#'-prefixed parser functions, and "normal" for ordinary
// page transclusions.
func templateType(tn string) string {
	index := strings.Index(tn, ":")
	tns := strings.TrimSpace(tn)
	var base string
	// var attr string
	if index > 0 {
		// Base is the part before the colon. NOTE(review): a name whose
		// first character is ':' (index == 0) falls through to the whole
		// trimmed name — confirm that is intended.
		base = strings.TrimSpace(tn[:index])
		// attr = strings.TrimSpace(tn[index+1:])
	} else {
		base = tns
	}
	base = strings.ToLower(base)
	_, ok1 := noHashFunctionsMap[base]
	_, ok2 := variablesMap[base]
	if ok1 || ok2 {
		return "magic"
	}
	if strings.HasPrefix(tns, "#") {
		return "ext"
	}
	return "normal"
}
|
||||
|
||||
// noincludeRe matches <noinclude>...</noinclude> blocks (or an unclosed
// <noinclude> running to end of input); they are removed on transclusion.
var noincludeRe = regexp.MustCompile(`(?isU)<noinclude>.*(?:</noinclude>|\z)`)

// includeonlyRe captures the body of <includeonly>...</includeonly>
// blocks (or an unclosed <includeonly> running to end of input).
var includeonlyRe = regexp.MustCompile(`(?isU)<includeonly>(.*)(?:</includeonly>|\z)`)
|
||||
|
||||
func (a *Article) stripNoinclude(mw string) string {
|
||||
mwni := noincludeRe.ReplaceAllLiteralString(mw, "")
|
||||
ssl := includeonlyRe.FindAllStringSubmatch(mwni, -1)
|
||||
if len(ssl) == 0 {
|
||||
return mwni
|
||||
}
|
||||
sl := make([]string, 0, len(ssl))
|
||||
for _, s := range ssl {
|
||||
sl = append(sl, s[1])
|
||||
}
|
||||
return strings.Join(sl, "")
|
||||
}
|
102
text.go
Normal file
102
text.go
Normal file
|
@ -0,0 +1,102 @@
|
|||
/*
|
||||
Copyright (C) IBM Corporation 2015, Michele Franceschini <franceschini@us.ibm.com>
|
||||
|
||||
Licensed under the Apache License, Version 2.0 (the "License");
|
||||
you may not use this file except in compliance with the License.
|
||||
You may obtain a copy of the License at
|
||||
|
||||
http://www.apache.org/licenses/LICENSE-2.0
|
||||
|
||||
Unless required by applicable law or agreed to in writing, software
|
||||
distributed under the License is distributed on an "AS IS" BASIS,
|
||||
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
See the License for the specific language governing permissions and
|
||||
limitations under the License.
|
||||
*/
|
||||
|
||||
package gowiki
|
||||
|
||||
import (
|
||||
"bytes"
|
||||
"unicode/utf8"
|
||||
)
|
||||
|
||||
// appendText appends t to the article's text buffer, keeping a.nchar
// (the rune count of the accumulated text) in sync.
func (a *Article) appendText(t string) {
	a.nchar += utf8.RuneCountInString(t)
	a.text.WriteString(t)
}
|
||||
|
||||
// genTextInternal walks the parse tree under root, appending the
// plain-text rendering of each node to a.text. As side effects it sets
// a.AbstractText (text before the first heading) and records the text
// span of every wiki link in a.TextLinks. The indent parameter is
// currently unused — every recursive call passes 0.
func (a *Article) genTextInternal(root *ParseNode, indent int) {
	lastwasspace := false
	for _, n := range root.Nodes {
		var linkStart int
		var fl FullWikiLink
		isLink := false
		// tappend is emitted after this node's children are rendered.
		tappend := ""
		switch n.NType {
		case "break":
			a.appendText("\n")
		case "space":
			// Collapse consecutive space nodes into a single space.
			if !lastwasspace {
				a.appendText(" ")
			}
		case "text":
			a.appendText(n.Contents)
		case "image":
			// Images render as blank lines around their caption text.
			a.appendText("\n")
			tappend = "\n"
		case "link":
			// Remember where the link text starts (byte offset for the
			// buffer, rune offset for the FullWikiLink record).
			isLink = true
			linkStart = len(a.text.Bytes())
			fl = FullWikiLink{Link: n.Link, Start: a.nchar}
		case "html":
			switch n.NSubType {
			case "h1", "h2", "h3", "h4", "h5", "h6":
				a.appendText("\n")
				tappend = "\n"
				// The abstract is everything before the first heading.
				if len(a.AbstractText) == 0 {
					a.AbstractText = a.text.String()
				}
			case "br":
				a.appendText("\n")
			case "ref":
				a.appendText(" ")
			}
		}
		if len(n.Nodes) > 0 {
			a.genTextInternal(n, 0)
		}
		if isLink {
			// The link's text is whatever was emitted since linkStart.
			ttmp := a.text.Bytes()
			fl.End = a.nchar
			fl.Text = string(ttmp[linkStart:])
			a.TextLinks = append(a.TextLinks, fl)
		}
		lastwasspace = false
		if n.NType == "space" {
			lastwasspace = true
		}
		// a.Text += tappend
		a.appendText(tappend)
	}

	return
}
|
||||
|
||||
func (a *Article) genText() error {
|
||||
a.text = bytes.NewBuffer(make([]byte, 1024*1024, 1024*1024))
|
||||
a.text.Truncate(0)
|
||||
a.nchar = 0
|
||||
a.AbstractText = ""
|
||||
a.genTextInternal(a.Root, 0)
|
||||
a.Text = string(a.text.Bytes())
|
||||
if len(a.AbstractText) == 0 {
|
||||
a.AbstractText = a.Text
|
||||
}
|
||||
a.gt = true
|
||||
return nil
|
||||
}
|
||||
|
||||
// GenText is the exported entry point for generating the article's plain
// text from its parse tree; see genText.
func (a *Article) GenText() error {
	return a.genText()
}
|
916
tokenize.go
Normal file
916
tokenize.go
Normal file
|
@ -0,0 +1,916 @@
|
|||
/*
|
||||
Copyright (C) IBM Corporation 2015, Michele Franceschini <franceschini@us.ibm.com>
|
||||
|
||||
Licensed under the Apache License, Version 2.0 (the "License");
|
||||
you may not use this file except in compliance with the License.
|
||||
You may obtain a copy of the License at
|
||||
|
||||
http://www.apache.org/licenses/LICENSE-2.0
|
||||
|
||||
Unless required by applicable law or agreed to in writing, software
|
||||
distributed under the License is distributed on an "AS IS" BASIS,
|
||||
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
See the License for the specific language governing permissions and
|
||||
limitations under the License.
|
||||
*/
|
||||
|
||||
package gowiki
|
||||
|
||||
import (
|
||||
// "bytes"
|
||||
"errors"
|
||||
"fmt"
|
||||
// "html"
|
||||
"regexp"
|
||||
"sort"
|
||||
"strings"
|
||||
"unicode"
|
||||
"unicode/utf8"
|
||||
)
|
||||
|
||||
// Token is a single lexical unit produced by the tokenizer.
type Token struct {
	TText string `json:"tText,omitempty"` // literal text, or payload for html/special/extlink tokens
	TType string `json:"tType,omitempty"` // token kind: "text", "link", "html", "space", "newline", ...
	TAttr string `json:"tAttr,omitempty"` // raw attribute string for html/magic tokens
	// NOTE(review): omitempty has no effect on struct-typed fields such
	// as TLink — confirm the JSON output size is acceptable.
	TLink   WikiLink `json:"tLink,omitempty"`   // target of a "link"/"filelink" token
	TClosed bool     `json:"tClosed,omitempty"` // true when an html tag is close-/self-marked
	TPipes  []string `json:"tPipes,omitempty"`  // pipe-separated options of a file link
}
|
||||
|
||||
func (a *Article) parseRedirectLine(l string) ([]*Token, error) {
|
||||
nt := make([]*Token, 0, 2)
|
||||
nt = append(nt, &Token{TType: "redirect"})
|
||||
nnt, err := a.parseInlineText(l, 9, len(l))
|
||||
if err != nil {
|
||||
return nil, err
|
||||
}
|
||||
nt = append(nt, nnt...)
|
||||
return nt, nil
|
||||
}
|
||||
|
||||
func (a *Article) parseWikiPreLine(l string) ([]*Token, error) {
|
||||
nt := make([]*Token, 0, 2)
|
||||
nt = append(nt, &Token{TType: "wikipre"})
|
||||
nnt, err := a.parseInlineText(l, 1, len(l))
|
||||
if err != nil {
|
||||
return nil, err
|
||||
}
|
||||
nt = append(nt, nnt...)
|
||||
return nt, nil
|
||||
}
|
||||
|
||||
func (a *Article) parseHRuler(l string) ([]*Token, error) {
|
||||
pos := 0
|
||||
for i, rv := range l {
|
||||
if rv != '-' {
|
||||
pos = i
|
||||
break
|
||||
}
|
||||
}
|
||||
nt := make([]*Token, 0, 2)
|
||||
nt = append(nt, &Token{TType: "hrule"})
|
||||
if pos != 0 {
|
||||
nnt, err := a.parseInlineText(l, pos, len(l))
|
||||
if err != nil {
|
||||
return nil, err
|
||||
}
|
||||
nt = append(nt, nnt...)
|
||||
}
|
||||
return nt, nil
|
||||
}
|
||||
|
||||
// parseHeadingLine tokenizes a heading line of the form "==title==".
// It strips matching '=' runs symmetrically from both ends, emits an
// "hN" token whose level is the number of '=' consumed on each side,
// then inline-parses the remaining title text.
func (a *Article) parseHeadingLine(l string) ([]*Token, error) {
	pf := 0 // walks forward over the leading '=' run
	pl := 0 // index of the last '=' in the line (end of the trailing run)
	for i, rv := range l {
		if rv == '=' {
			pl = i
		}
	}
	// Advance pf and retreat pl in lock step so the same number of '='
	// is trimmed from each side; stop as soon as either side runs out,
	// restoring the last valid position before breaking.
	for {
		pf++
		if pf == pl || l[pf] != '=' {
			pf--
			break
		}
		pl--
		if pf == pl || l[pl] != '=' {
			pl++
			pf--
			break
		}
	}
	pf++
	// MediaWiki caps heading depth at 6; surplus '=' pairs are returned
	// to the title text by widening the [pf, pl) slice.
	if pf > 6 {
		diff := pf - 6
		pf -= diff
		pl += diff
	}
	nt := make([]*Token, 0, 2)
	nt = append(nt, &Token{TType: fmt.Sprintf("h%d", pf)})
	nnt, err := a.parseInlineText(l, pf, pl)
	if err != nil {
		return nil, err
	}
	nt = append(nt, nnt...)
	return nt, nil
}
|
||||
|
||||
func (a *Article) parseListLine(l string) ([]*Token, error) {
|
||||
nt := make([]*Token, 0, 2)
|
||||
pos := 0
|
||||
for ; pos < len(l); pos++ {
|
||||
switch l[pos] {
|
||||
case ';', ':', '*', '#':
|
||||
nt = append(nt, &Token{TType: l[pos : pos+1]})
|
||||
continue
|
||||
}
|
||||
break
|
||||
}
|
||||
if pos < len(l) {
|
||||
nnt, err := a.parseInlineText(l, pos, len(l))
|
||||
if err != nil {
|
||||
return nil, err
|
||||
}
|
||||
nt = append(nt, nnt...)
|
||||
}
|
||||
return nt, nil
|
||||
}
|
||||
|
||||
// parseTableLine is a placeholder: wiki table markup is currently
// dropped, producing no tokens. The returned slice is non-nil on
// purpose, matching the other parse* functions. TODO: implement table
// parsing.
func (a *Article) parseTableLine(l string) ([]*Token, error) {
	nt := make([]*Token, 0, 0)
	return nt, nil
}
|
||||
|
||||
// isValidHTMLtag reports whether tag may appear in the token stream.
// Currently a stub that accepts every tag. TODO: restrict to the HTML
// subset MediaWiki actually allows.
func isValidHTMLtag(tag string) bool {
	return true
}
|
||||
|
||||
// decodeHTMLtag scans l, which is expected to start at a '<', for a
// complete HTML tag. It returns the number of bytes consumed
// (position just past the closing '>'), the tag name, the raw
// attribute string, whether the tag carries a close marker ('/'
// adjacent to the '>'), and whether a tag was decoded at all.
func (a *Article) decodeHTMLtag(l string) (int, string, string, bool, bool) {
	matchingpos := 0       // index of the closing '>' (0 = not found)
	inquote := false       // currently inside a quoted attribute value
	lastbackslash := false // previous rune was '\', escaping a quote
	quote := '#'           // which quote rune opened the current string
	closefound := false    // saw '/' with nothing but whitespace after it
	tagend := 0            // end of the tag name (0 = name runs to '>')
	tagstart := 0          // start of the tag name
	//taking care of comments at preprocessing time
	/* if strings.HasPrefix(l, "<!--") {
		i := strings.Index(l[4:], "-->")
		if i == -1 {
			return len(l), "!--", l[4:], true, true
		}
		return 4 + i + 3, "!--", l[4 : 4+i], true, true
	} */
dhtLoop:
	for idx, rv := range l {
		// fmt.Println(string(rv), inquote, string(quote), idx, matchingpos)
		switch rv {
		case '>':
			// A '>' inside a quoted attribute value does not end the tag.
			if !inquote {
				matchingpos = idx
				break dhtLoop
			}
		case '\'', '"':
			switch {
			case inquote && quote == rv && !lastbackslash:
				inquote = false
			case !inquote:
				inquote = true
				quote = rv
			}
		case ' ', '\t', '\r':
		case '/':
			closefound = true
		}
		lastbackslash = (rv == '\\')
		// First non-space rune marks the start of the tag name.
		if !unicode.IsSpace(rv) && tagstart == 0 {
			tagstart = idx
		}
		// Any non-space rune other than '/' cancels a pending close marker,
		// so only a trailing '/' (as in <br/>) counts.
		if rv != '/' && !unicode.IsSpace(rv) {
			closefound = false
		}
		// First space after the name separates name from attributes.
		if unicode.IsSpace(rv) && tagstart != 0 && tagend == 0 {
			tagend = idx
		}
	}
	// No closing '>' found, or nothing but whitespace before it.
	if matchingpos == 0 || tagstart == 0 {
		return 0, "", "", false, false
	}
	var tag string
	var attr string

	if tagend == 0 {
		tag = l[tagstart:matchingpos]
		attr = ""
	} else {
		tag = l[tagstart:tagend]
		attr = l[tagend:matchingpos]
	}
	return matchingpos + 1, tag, attr, closefound, true
	// e, tag, attr, closed, ok := decodeHTMLtag(l[pos:end])
}
|
||||
|
||||
// matchPrefixes reports whether s starts with any of the given
// prefixes, comparing case-insensitively.
func matchPrefixes(s string, prefixes []string) bool {
	for _, p := range prefixes {
		if len(s) >= len(p) && strings.EqualFold(s[:len(p)], p) {
			return true
		}
	}
	return false
}
|
||||
|
||||
var extlinkre = regexp.MustCompile(`^(http:)|(ftp:)|()//[^\s]+`)
|
||||
|
||||
func isExtLink(l string) bool {
|
||||
// return extlinkre.MatchString(l)
|
||||
return matchPrefixes(l, []string{"http://", "ftp://", "//"})
|
||||
}
|
||||
|
||||
var filelinkre = regexp.MustCompile(`(?i)^\[\[(?:image:)|(?:media:)|(?:file:)`)
|
||||
|
||||
func possibleFileLink(l string) bool {
|
||||
// return filelinkre.MatchString(l)
|
||||
return matchPrefixes(l, []string{"[[image:", "[[media:", "[[file:"})
|
||||
}
|
||||
|
||||
func (a *Article) parseLink(l string) (int, []*Token, bool) {
|
||||
if len(l) < 5 {
|
||||
return 0, nil, false
|
||||
}
|
||||
if l[1] == '[' {
|
||||
if possibleFileLink(l) {
|
||||
return a.parseFileLink(l)
|
||||
}
|
||||
return a.parseInternalLink(l)
|
||||
}
|
||||
return a.parseExternalLink(l)
|
||||
}
|
||||
|
||||
// parseInternalLink parses an internal [[target|label]] link starting
// at l[0]. It returns the number of bytes consumed, the token stream
// ("link", optional label tokens, "closelink"), and whether a valid
// link was found. A "link trail" of letters immediately after ]]
// (e.g. [[bus]]es) is folded into the label, per MediaWiki rules.
func (a *Article) parseInternalLink(l string) (int, []*Token, bool) {

	// possible internal link
	pipepos := 0        // index of the first '|' (0 = no pipe)
	closed := false     // saw the second ']' of the closing ]]
	matchingpos := 0    // index of the first ']' of the closing ]]
	linktrail := 0      // index of the last trailing letter after ]]
	//plLoop:
	for idx, rv := range l {
		// Skip the opening "[[".
		if idx < 2 {
			continue
		}
		if matchingpos == 0 {
			switch rv {
			case '\x07': //prevent special tags in internal link
				if pipepos == 0 { //only in the link portion
					return 0, nil, false
				}
			case '[':
				// A nested "[[" or a '[' right after the opener aborts.
				if idx == 2 || len(l) > idx+1 && l[idx+1] == '[' {
					return 0, nil, false
				}

			case ']':
				if len(l) > idx+1 && l[idx+1] == ']' {
					matchingpos = idx
				}
			case '|':
				if pipepos == 0 {
					pipepos = idx
				}
			default:
			}
			continue
		}
		// One iteration past matchingpos consumes the second ']'.
		if !closed {
			closed = true
			continue
		}
		// Letters immediately after ]] extend the displayed label.
		if unicode.IsLetter(rv) {
			linktrail = idx
			continue
		}
		break
	}
	if !closed {
		return 0, nil, false
	}
	var link WikiLink
	var nt []*Token = nil
	var err error = nil
	if pipepos == 0 {
		// No pipe: the target doubles as the label (plus any trail).
		innerstring := l[2:matchingpos]
		if linktrail != 0 {
			innerstring += l[matchingpos+2 : linktrail+1]
		}
		link = WikiCanonicalForm(l[2:matchingpos])
		nt = []*Token{&Token{TText: innerstring, TType: "text"}}

	} else {
		// Piped: label text after '|' is inline-parsed (plus any trail).
		innerstring := l[pipepos+1 : matchingpos]
		if linktrail != 0 {
			innerstring += l[matchingpos+2 : linktrail+1]
		}
		link = WikiCanonicalForm(l[2:pipepos])
		if pipepos+1 < matchingpos {
			nt, err = a.parseInlineText(innerstring, 0, len(innerstring))
			if err != nil {
				return 0, nil, false
			}
		}
	}
	tokens := make([]*Token, 0, 2)
	tokens = append(tokens, &Token{TLink: link, TType: "link"})
	if nt != nil {
		tokens = append(tokens, nt...)
	}
	tokens = append(tokens, &Token{TType: "closelink"})
	if linktrail != 0 {
		return linktrail + 1, tokens, true
	}
	return matchingpos + 2, tokens, true
}
|
||||
|
||||
// parseExternalLink parses a single-bracket external link
// [url optional label] starting at l[0]. It returns bytes consumed,
// the token stream ("extlink", optional label tokens, "closeextlink"),
// and whether a valid external link was found. An unclosed link is
// also terminated by a </ref> tag, since references often wrap bare
// links.
func (a *Article) parseExternalLink(l string) (int, []*Token, bool) {
	// possible external link
	spacepos := 0        // first space: separates url from label (0 = none)
	matchingpos := 0     // index where the link content ends
	endpos := 0          // bytes consumed from l
	intLinkOpen := false // inside a nested [[...]] in the label
	skipNext := false    // skip the second rune of a matched "]]"
plLoop2:
	for idx, rv := range l {
		// Skip the opening '['.
		if idx < 1 {
			continue
		}
		if skipNext {
			skipNext = false
			continue
		}
		switch rv {
		case '\x07':
			// A special placeholder is only allowed in the label part.
			if spacepos == 0 {
				return 0, nil, false
			}
		case '[':
			if len(l) > idx+1 && l[idx+1] == '[' {
				intLinkOpen = true
			}
		case ' ':
			if spacepos == 0 {
				spacepos = idx
			}
		case '<':
			if spacepos > 0 {
				// e, tag, attr, closed, ok := a.decodeHTMLtag(l[idx:len(l)])
				_, tag, _, _, ok := a.decodeHTMLtag(l[idx:len(l)])
				// fmt.Println("html tag in ext link. Line:", l, "\n\n", tag, ok)
				if ok && tag == "/ref" {
					// fmt.Println("closing link...")
					// </ref> implicitly closes the link; the tag itself
					// is not consumed (endpos == idx).
					matchingpos = idx
					endpos = idx
					break plLoop2
				}

			}
		case ']':
			// A ']' closing a nested [[...]] does not end the ext link.
			if intLinkOpen && len(l) > idx+1 && l[idx+1] == ']' {
				intLinkOpen = false
				skipNext = true
				continue
			}
			matchingpos = idx
			endpos = idx + 1
			break plLoop2
		}
	}
	if matchingpos == 0 {
		return 0, nil, false
	}
	var link string
	var nt []*Token = nil
	var err error = nil
	if spacepos == 0 {
		// Bare [url]: the whole content must look like a URL.
		link = l[1:matchingpos]
		if !isExtLink(link) {
			return 0, nil, false
		}
	} else {
		// [url label]: validate the URL, inline-parse the label.
		link = l[1:spacepos]
		if !isExtLink(link) {
			return 0, nil, false
		}
		if spacepos+1 < matchingpos {
			nt, err = a.parseInlineText(l, spacepos+1, matchingpos)
			if err != nil {
				return 0, nil, false
			}
		}
	}
	tokens := make([]*Token, 0, 2)
	tokens = append(tokens, &Token{TText: link, TType: "extlink"})
	if nt != nil {
		tokens = append(tokens, nt...)
	}
	tokens = append(tokens, &Token{TType: "closeextlink"})
	return endpos, tokens, true
}
|
||||
|
||||
// parseFileLink parses an [[image:/media:/file: ...]] link starting at
// l[0]. Pipe-separated options before the final segment become TPipes;
// the final segment (the caption) is inline-parsed. It returns bytes
// consumed, the token stream ("filelink", optional caption tokens,
// "closefilelink"), and whether a valid file link was found.
func (a *Article) parseFileLink(l string) (int, []*Token, bool) {
	// possible internal link
	pipepos := make([]int, 0, 0) // indices of top-level '|' separators
	closed := false              // saw the closing ]]
	matchingpos := 0             // index of the first ']' of the closing ]]
	intLinkOpen := false         // inside a nested [[...]] in the caption
	skipNext := false            // skip the second rune of a matched [[ or ]]
plLoop:
	for idx, rv := range l {
		// Skip the opening "[[".
		if idx < 2 {
			continue
		}
		if skipNext {
			skipNext = false
			continue
		}
		switch rv {
		case '\x07': //prevent special tags in internal link
			if len(pipepos) == 0 { //only in the link portion
				return 0, nil, false
			}
		case '[':
			if len(l) > idx+1 && l[idx+1] == '[' {
				intLinkOpen = true
				skipNext = true
				continue
			}

		case ']':
			if len(l) > idx+1 && l[idx+1] == ']' {
				// A ]] closing a nested link does not end the file link.
				if intLinkOpen {
					intLinkOpen = false
					skipNext = true
					continue
				}
				matchingpos = idx
				closed = true
				break plLoop
			}
		case '|':
			// Pipes inside a nested [[...]] belong to that link.
			if !intLinkOpen {
				pipepos = append(pipepos, idx)
			}
		default:
		}
	}
	if !closed {
		return 0, nil, false
	}
	var link WikiLink
	var pipes = make([]string, 0, 0)
	var nt []*Token = nil
	var err error = nil
	if len(pipepos) == 0 {
		// No options: the target doubles as the caption text.
		link = WikiCanonicalForm(l[2:matchingpos])
		nt = []*Token{&Token{TText: l[2:matchingpos], TType: "text"}}

	} else {
		link = WikiCanonicalForm(l[2:pipepos[0]])
		// All segments but the last are options; the last is the caption.
		for i := 0; i < len(pipepos)-1; i++ {
			pipes = append(pipes, l[pipepos[i]+1:pipepos[i+1]])
		}
		if pipepos[len(pipepos)-1]+1 < matchingpos {
			nt, err = a.parseInlineText(l, pipepos[len(pipepos)-1]+1, matchingpos)
			if err != nil {
				return 0, nil, false
			}
		}
	}
	tokens := make([]*Token, 0, 2)
	tokens = append(tokens, &Token{TLink: link, TType: "filelink", TPipes: pipes})
	if nt != nil {
		tokens = append(tokens, nt...)
	}
	tokens = append(tokens, &Token{TType: "closefilelink"})
	return matchingpos + 2, tokens, true
}
|
||||
|
||||
// min returns the smaller of a and b.
func min(a, b int) int {
	if b < a {
		return b
	}
	return a
}
|
||||
|
||||
var behavswitchre = regexp.MustCompile(`^__[A-Z]+__`)
|
||||
|
||||
func (a *Article) decodeBehavSwitch(l string) (int, bool) {
|
||||
match := behavswitchre.FindString(l)
|
||||
if len(match) == 0 {
|
||||
return 0, false
|
||||
} else {
|
||||
return len(match), true
|
||||
}
|
||||
// e, ok := decodeMagic(l[pos:end])
|
||||
}
|
||||
|
||||
// parseInlineText tokenizes the byte range l[start:end] into inline
// tokens: plain text runs, html tags, links, behavior switches,
// spaces, quotes, colons, and 8-byte \x07-prefixed special
// placeholders. tStart/tEnd delimit the pending plain-text run, which
// is flushed as a "text" token before any structured token is emitted.
func (a *Article) parseInlineText(l string, start, end int) ([]*Token, error) {
	nt := make([]*Token, 0)
	// fmt.Println("in parseInlineText")

	tStart, tEnd := start, start

	for pos := start; pos < end; {
		rv, rune_len := utf8.DecodeRuneInString(l[pos:end])
		switch rv {
		case '<':
			e, tag, attr, closed, ok := a.decodeHTMLtag(l[pos:end])
			if ok {
				pos += e
				if isValidHTMLtag(tag) {
					if tEnd > tStart {
						nt = append(nt, &Token{TText: l[tStart:tEnd], TType: "text"})
					}
					nt = append(nt, &Token{TType: "html", TText: tag, TAttr: attr, TClosed: closed})
					tStart = pos
				}
				tEnd = pos
				continue
			}
		case '[':
			e, lt, ok := a.parseLink(l[pos:end])
			if ok {
				if tEnd > tStart {
					nt = append(nt, &Token{TText: l[tStart:tEnd], TType: "text"})
				}
				nt = append(nt, lt...)
				pos += e
				tStart, tEnd = pos, pos
				continue
			}
		/* case '{':
		e, tt, ok := a.parseTemplateEtc(l[pos:end])
		fmt.Println("template:", e, tt, ok)
		if ok {
			if len(cs) > 0 {
				nt = append(nt, &Token{TText: cs, TType: "text"})
			}
			nt = append(nt, tt...)
			pos += e
			cs = ""
			continue
		}
		cs += string(rv) */
		case '_':
			e, ok := a.decodeBehavSwitch(l[pos:end])
			if ok {
				if tEnd > tStart {
					nt = append(nt, &Token{TText: l[tStart:tEnd], TType: "text"})
				}
				nt = append(nt, &Token{TType: "magic", TAttr: l[pos : pos+e]})
				pos += e
				tStart, tEnd = pos, pos
				continue
			}
		case ' ', '\t', '\r':
			if tEnd > tStart {
				nt = append(nt, &Token{TText: l[tStart:tEnd], TType: "text"})
			}
			nt = append(nt, &Token{TType: "space"})
			tStart = pos + rune_len
		case '\'':
			if tEnd > tStart {
				nt = append(nt, &Token{TText: l[tStart:tEnd], TType: "text"})
			}
			nt = append(nt, &Token{TType: "quote"})
			tStart = pos + rune_len
		case ':':
			if tEnd > tStart {
				nt = append(nt, &Token{TText: l[tStart:tEnd], TType: "text"})
			}
			nt = append(nt, &Token{TType: "colon"})
			tStart = pos + rune_len
		case '\x07':
			// case '@':
			// Special placeholders are exactly 8 bytes: \x07 + 7 digits
			// (see stripNowikiPreMath).
			if tEnd > tStart {
				nt = append(nt, &Token{TText: l[tStart:tEnd], TType: "text"})
			}
			nt = append(nt, &Token{TType: "special", TText: l[pos : pos+8]})
			pos += 8
			tStart, tEnd = pos, pos
			continue
		}
		pos += rune_len
		tEnd = pos
	}
	// Flush the trailing plain-text run, if any.
	if tEnd > tStart {
		nt = append(nt, &Token{TText: l[tStart:tEnd], TType: "text"})
	}
	return nt, nil
}
|
||||
|
||||
// isHeading reports whether l is a heading line: it must start with
// '=' and, ignoring trailing whitespace, end with '='. The first two
// runes are exempt from the end-check, so a bare "=" or "==" does not
// count as a heading. NOTE: l must be non-empty (l[0] would panic
// otherwise); callers guarantee this via lineType.
func (a *Article) isHeading(l string) bool {
	if l[0] != '=' {
		return false
	}
	done := 0
	lastEqual := false // true while the most recent non-space rune was '='
	for _, rv := range l {
		done++
		if done > 2 {
			if unicode.IsSpace(rv) {
				// Trailing whitespace after the final '=' is allowed.
				continue
			}
			if rv == '=' {
				lastEqual = true
				continue
			}
			lastEqual = false
		}

	}
	return lastEqual
}
|
||||
|
||||
func (a *Article) isTable(l string) bool {
|
||||
return (len(l) > 1 && (l[0:2] == "{|" || l[0:2] == "|}" || l[0:2] == "|+" || l[0:2] == "|-")) || (len(l) > 0 && (l[0:1] == "|" || l[0:1] == "!"))
|
||||
}
|
||||
|
||||
func (a *Article) lineType(l string) string {
|
||||
switch {
|
||||
case len(l) == 0:
|
||||
return "blank"
|
||||
case len(l) > 8 && strings.ToLower(l[0:9]) == "#redirect":
|
||||
return "redirect"
|
||||
case len(l) > 3 && l[0:4] == "----":
|
||||
return "hr"
|
||||
case a.isHeading(l):
|
||||
return "heading"
|
||||
case l[0] == ';' || l[0] == ':' || l[0] == '*' || l[0] == '#':
|
||||
return "list"
|
||||
case a.isTable(l):
|
||||
return "table"
|
||||
case l[0] == ' ':
|
||||
return "wikipre"
|
||||
}
|
||||
return "normal"
|
||||
}
|
||||
|
||||
// Tokenize converts raw wiki markup into a flat token stream. It runs
// four preprocessing passes (strip comments, hide nowiki/pre/math
// regions behind \x07 placeholders, expand templates, join multi-line
// links), tokenizes line by line according to lineType, then resolves
// every "special" placeholder token back to its stored Token from the
// template map.
func (a *Article) Tokenize(mw string, g PageGetter) ([]*Token, error) {
	mwnc := a.stripComments(mw)
	mw_stripped, nowikipremathmap := a.stripNowikiPreMath(mwnc)
	mw_tmpl, templatemap := a.processTemplates(mw_stripped, nowikipremathmap, g)
	mw_links := a.preprocessLinks(mw_tmpl)

	lines := strings.Split(mw_links, "\n")
	tokens := make([]*Token, 0, 16)
	for _, l := range lines {
		var nt []*Token
		var err error = nil
		lt := a.lineType(l)
		switch lt {
		case "normal":
			nt, err = a.parseInlineText(l, 0, len(l))
		case "redirect":
			nt, err = a.parseRedirectLine(l)
		case "hr":
			nt, err = a.parseHRuler(l)
		case "heading":
			nt, err = a.parseHeadingLine(l)
		case "list":
			nt, err = a.parseListLine(l)
		case "table":
			nt, err = a.parseTableLine(l)
		case "wikipre":
			nt, err = a.parseWikiPreLine(l)
		case "blank":
			nt = []*Token{&Token{TType: "blank"}}
		}
		if err != nil {
			return nil, err
		}
		// Every line, whatever its type, ends in a "newline" token.
		nt = append(nt, &Token{TType: "newline"})
		tokens = append(tokens, nt...)
	}
	// Replace placeholder tokens with the real tokens captured during
	// preprocessing; a placeholder with no map entry is a hard error.
	specialcount := 0
	for i := range tokens {
		if tokens[i].TType == "special" {
			specialcount++
			t, ok := templatemap[tokens[i].TText] //nowikipremathmap[tokens[i].TText]
			if !ok {
				return nil, errors.New("special not in map")
			}
			tokens[i] = t
		}
	}
	// fmt.Println(specialcount, len(nowikipremathmap))
	// if specialcount != len(nowikipremathmap) {
	// A count mismatch (placeholder swallowed by other markup) is only
	// warned about, not fatal.
	if specialcount != len(templatemap) {
		if DebugLevel > 0 {
			fmt.Println("[Tokenize] Warning: number of specials in map differs from number found")
		}
		// return nil, errors.New("number of specials in map differs from number found")
	}
	return tokens, nil
}
|
||||
|
||||
// commentsRe matches an HTML comment, including one left unterminated
// at end of input; (?U) makes .* lazy so adjacent comments don't merge.
var commentsRe = regexp.MustCompile(`(?isU)<!--.*(?:-->|\z)`)

// stripComments removes all HTML comments from the markup.
func (a *Article) stripComments(mw string) string {
	return commentsRe.ReplaceAllLiteralString(mw, "")
}
|
||||
|
||||
// Open/close matchers for the three verbatim regions whose contents
// must be hidden from the wiki parser. The capture group holds the tag
// name used as the resulting token's TType. Note the nowiki patterns
// exclude '/' in the attribute part so self-closing <nowiki/> tags do
// not match as region openers.
var nowikiOpenRe = regexp.MustCompile(`(?i)<\s*(nowiki)\s*[^>/]*>`)
var nowikiCloseRe = regexp.MustCompile(`(?i)<(/nowiki)\s*[^>/]*>`)
var preOpenRe = regexp.MustCompile(`(?i)<\s*(pre)\s*[^>]*>`)
var preCloseRe = regexp.MustCompile(`(?i)<(/pre)\s*[^>]*>`)
var mathOpenRe = regexp.MustCompile(`(?i)<\s*(math)\s*[^>]*>`)
var mathCloseRe = regexp.MustCompile(`(?i)<(/math)\s*[^>]*>`)

// ssInt sorts regexp match-index records by their start offset; used
// by stripNowikiPreMath to merge the six match lists into one ordered
// stream.
type ssInt [][]int

func (a ssInt) Len() int           { return len(a) }
func (a ssInt) Swap(i, j int)      { a[i], a[j] = a[j], a[i] }
func (a ssInt) Less(i, j int) bool { return a[i][0] < a[j][0] }
|
||||
|
||||
func (a *Article) stripNowikiPreMath(mw string) (string, map[string]*Token) {
|
||||
nwoc := nowikiOpenRe.FindAllStringSubmatchIndex(mw, -1)
|
||||
nwcc := nowikiCloseRe.FindAllStringSubmatchIndex(mw, -1)
|
||||
poc := preOpenRe.FindAllStringSubmatchIndex(mw, -1)
|
||||
pcc := preCloseRe.FindAllStringSubmatchIndex(mw, -1)
|
||||
moc := mathOpenRe.FindAllStringSubmatchIndex(mw, -1)
|
||||
mcc := mathCloseRe.FindAllStringSubmatchIndex(mw, -1)
|
||||
|
||||
/*
|
||||
nwoc = append(nwoc, []int{len(mw) + 1, len(mw) + 1})
|
||||
nwcc = append(nwcc, []int{len(mw) + 1, len(mw) + 1})
|
||||
poc = append(poc, []int{len(mw) + 1, len(mw) + 1})
|
||||
pcc = append(pcc, []int{len(mw) + 1, len(mw) + 1})
|
||||
moc = append(moc, []int{len(mw) + 1, len(mw) + 1})
|
||||
mcc = append(mcc, []int{len(mw) + 1, len(mw) + 1})
|
||||
*/
|
||||
for i := range nwoc {
|
||||
nwoc[i] = append(nwoc[i], 0)
|
||||
}
|
||||
for i := range nwcc {
|
||||
nwcc[i] = append(nwcc[i], 1)
|
||||
}
|
||||
for i := range poc {
|
||||
poc[i] = append(poc[i], 2)
|
||||
}
|
||||
for i := range pcc {
|
||||
pcc[i] = append(pcc[i], 3)
|
||||
}
|
||||
for i := range moc {
|
||||
moc[i] = append(moc[i], 4)
|
||||
}
|
||||
for i := range mcc {
|
||||
mcc[i] = append(mcc[i], 5)
|
||||
}
|
||||
am := make([][]int, 0, len(nwoc)+len(nwcc)+len(poc)+len(pcc)+len(moc)+len(mcc))
|
||||
am = append(am, nwoc...)
|
||||
am = append(am, nwcc...)
|
||||
am = append(am, poc...)
|
||||
am = append(am, pcc...)
|
||||
am = append(am, moc...)
|
||||
am = append(am, mcc...)
|
||||
sort.Sort(ssInt(am))
|
||||
// fmt.Println(am)
|
||||
tokens := make(map[string]*Token, len(am))
|
||||
if len(am) == 0 {
|
||||
return mw, tokens
|
||||
}
|
||||
|
||||
ctype := -1
|
||||
out := ""
|
||||
lastclose := 0
|
||||
openidx := 0
|
||||
count := 0
|
||||
for i := range am {
|
||||
// fmt.Println("ctype", ctype, "lastclose", lastclose, "count", count, "openidx", openidx, "am[i]", am[i])
|
||||
if (ctype != -1) && (am[i][4] == ctype+1) && (am[openidx][1] <= am[i][0]) {
|
||||
// closing an open one
|
||||
special := fmt.Sprintf("\x07%07d", count)
|
||||
// special := fmt.Sprintf("@%07d", count)
|
||||
tokens[special] = &Token{
|
||||
TText: mw[am[openidx][1]:am[i][0]],
|
||||
TType: strings.ToLower(mw[am[openidx][2]:am[openidx][3]]),
|
||||
TAttr: mw[am[openidx][3] : am[openidx][1]-1],
|
||||
}
|
||||
out += special
|
||||
ctype = -1
|
||||
lastclose = am[i][1]
|
||||
count++
|
||||
} else if (ctype == -1) && (am[i][4]&1 == 0) && (lastclose <= am[i][0]) {
|
||||
// open a new one
|
||||
out += mw[lastclose:am[i][0]]
|
||||
ctype = am[i][4]
|
||||
openidx = i
|
||||
}
|
||||
}
|
||||
if ctype != -1 {
|
||||
//it's open: close it
|
||||
special := fmt.Sprintf("\x07%07d", count)
|
||||
// special := fmt.Sprintf("@%07d", count)
|
||||
tokens[special] = &Token{
|
||||
TText: mw[am[openidx][1]:len(mw)],
|
||||
TType: strings.ToLower(mw[am[openidx][2]:am[openidx][3]]),
|
||||
TAttr: mw[am[openidx][3] : am[openidx][1]-1],
|
||||
}
|
||||
out += special
|
||||
ctype = -1
|
||||
count++
|
||||
} else {
|
||||
out += mw[lastclose:]
|
||||
}
|
||||
return out, tokens
|
||||
}
|
||||
|
||||
var multiLineLinksRe = regexp.MustCompile(`(?sm)\[\[[^\n|]*\|.*?\]\]`)
|
||||
|
||||
/* TODO: add preprocessing as in Parser.php:pstPass2() to enable pipe tricks
|
||||
*/
|
||||
func (a *Article) preprocessLinks(s string) string {
|
||||
mw := []byte(s)
|
||||
mll := multiLineLinksRe.FindAllSubmatchIndex(mw, -1)
|
||||
for _, pair := range mll {
|
||||
for i := pair[0]; i < pair[1]; {
|
||||
// we have to walk this string carefully, by rune, not by i
|
||||
rv, rlen := utf8.DecodeRune(mw[i:])
|
||||
if rv == '\n' {
|
||||
mw[i] = ' '
|
||||
}
|
||||
i += rlen
|
||||
}
|
||||
}
|
||||
return string(mw)
|
||||
}
|
||||
|
||||
//var nowikiOpenRe = regexp.MustCompile(`(?i)<\s*nowiki\s*[^>/]*>`)
|
||||
//var nowikiCloseRe = regexp.MustCompile(`(?i)</nowiki\s*[^>/]*>`)
|
||||
//var nowikiOpenCloseRe = regexp.MustCompile(`(?i)<nowiki\s*[^>]*/>`)
|
||||
/*
|
||||
type WikiParser struct {
|
||||
mw string
|
||||
}
|
||||
|
||||
func NewWikiParser(mw string) *WikiParser {
|
||||
return &WikiParser{mw: mw}
|
||||
}
|
||||
|
||||
func (wp *WikiParser) doNowiki() {
|
||||
openCandidates := nowikiOpenRe.FindAllStringIndex(wp.mw, -1)
|
||||
closeCandidates := nowikiCloseRe.FindAllStringIndex(wp.mw, -1)
|
||||
openCloseCandidates := nowikiOpenCloseRe.FindAllStringIndex(wp.mw, -1)
|
||||
tail := []int{len(wp.mw) + 1, len(wp.mw) + 1}
|
||||
openCandidates = append(openCandidates, tail)
|
||||
closeCandidates = append(closeCandidates, tail)
|
||||
openCloseCandidates = append(openCloseCandidates, tail)
|
||||
oi := 0
|
||||
ci := 0
|
||||
oci := 0
|
||||
inNowiki := false
|
||||
ol = make([][]int, 0, len(openCandidates))
|
||||
cl = make([][]int, 0, len(closeCandidates))
|
||||
ocl = make([][]int, 0, len(openCloseCandidates))
|
||||
for {
|
||||
if oi == len(openCandidates)-1 &&
|
||||
ci == len(closeCandidates)-1 &&
|
||||
oci == len(openCloseCandidates)-1 {
|
||||
break
|
||||
}
|
||||
switch {
|
||||
case openCandidates[oi][0] <= closeCandidates[oi][0] &&
|
||||
openCandidates[oi][0] <= openCloseloseCandidates[oi][0]:
|
||||
if !inNowiki {
|
||||
ol = append(ol.openCandidates[oi])
|
||||
inNowiki = true
|
||||
}
|
||||
oi += 1
|
||||
|
||||
case closeCandidates[oi][0] <= openCandidates[oi][0] &&
|
||||
closeCandidates[oi][0] <= openCloseloseCandidates[oi][0]:
|
||||
|
||||
default:
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
func (wp *WikiParser) Parse() {
|
||||
doSGML()
|
||||
doNowiki()
|
||||
doMath()
|
||||
doPre()
|
||||
doBlanks()
|
||||
doHTMLvalidation()
|
||||
doReplaceVariables()
|
||||
doHR()
|
||||
doAllQuotes()
|
||||
doHeadings()
|
||||
doLists()
|
||||
doDates()
|
||||
doExternalLinks()
|
||||
doInternalLinks()
|
||||
doISBN()
|
||||
doRecombine()
|
||||
}
|
||||
*/
|
59
utils.go
Normal file
59
utils.go
Normal file
|
@ -0,0 +1,59 @@
|
|||
/*
|
||||
Copyright (C) IBM Corporation 2015, Michele Franceschini <franceschini@us.ibm.com>
|
||||
|
||||
Licensed under the Apache License, Version 2.0 (the "License");
|
||||
you may not use this file except in compliance with the License.
|
||||
You may obtain a copy of the License at
|
||||
|
||||
http://www.apache.org/licenses/LICENSE-2.0
|
||||
|
||||
Unless required by applicable law or agreed to in writing, software
|
||||
distributed under the License is distributed on an "AS IS" BASIS,
|
||||
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
See the License for the specific language governing permissions and
|
||||
limitations under the License.
|
||||
*/
|
||||
|
||||
package gowiki
|
||||
|
||||
import (
|
||||
// "fmt"
|
||||
"strings"
|
||||
)
|
||||
|
||||
func (a *Article) CheckRedirect() (bool, *WikiLink) {
|
||||
|
||||
rf := false
|
||||
for i, t := range a.Tokens {
|
||||
if i > 10 {
|
||||
break
|
||||
}
|
||||
switch t.TType {
|
||||
case "redirect":
|
||||
rf = true
|
||||
case "link":
|
||||
if rf {
|
||||
return true, &t.TLink
|
||||
}
|
||||
}
|
||||
}
|
||||
return false, nil
|
||||
}
|
||||
|
||||
func (a *Article) CheckDisambiguation() bool {
|
||||
for _, t := range a.Templates {
|
||||
if t.Typ != "normal" {
|
||||
continue
|
||||
}
|
||||
ln := strings.ToLower(t.Name)
|
||||
if strings.Contains(ln, "disambig") ||
|
||||
ln == "dab" ||
|
||||
ln == "geodis" ||
|
||||
ln == "hndis" ||
|
||||
ln == "hndis-cleanup" ||
|
||||
ln == "numberdis" {
|
||||
return true
|
||||
}
|
||||
}
|
||||
return false
|
||||
}
|
Reference in a new issue