From 93aa7513fb32ddce2947a22dbc3f7b2706df04e5 Mon Sep 17 00:00:00 2001 From: "Michele M. Franceschini" Date: Wed, 3 Jan 2018 16:21:59 -0500 Subject: [PATCH] Adding library code. --- LICENSE | 202 +++++++++++ README.md | 2 +- gowiki.go | 233 +++++++++++++ gowiki_test.go | 46 +++ parse.go | 636 ++++++++++++++++++++++++++++++++++ redirect.go | 39 +++ simple.go | 27 ++ template.go | 660 +++++++++++++++++++++++++++++++++++ text.go | 102 ++++++ tokenize.go | 916 +++++++++++++++++++++++++++++++++++++++++++++++++ utils.go | 59 ++++ 11 files changed, 2921 insertions(+), 1 deletion(-) create mode 100644 LICENSE create mode 100644 gowiki.go create mode 100644 gowiki_test.go create mode 100644 parse.go create mode 100644 redirect.go create mode 100644 simple.go create mode 100644 template.go create mode 100644 text.go create mode 100644 tokenize.go create mode 100644 utils.go diff --git a/LICENSE b/LICENSE new file mode 100644 index 0000000..d645695 --- /dev/null +++ b/LICENSE @@ -0,0 +1,202 @@ + + Apache License + Version 2.0, January 2004 + http://www.apache.org/licenses/ + + TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION + + 1. Definitions. + + "License" shall mean the terms and conditions for use, reproduction, + and distribution as defined by Sections 1 through 9 of this document. + + "Licensor" shall mean the copyright owner or entity authorized by + the copyright owner that is granting the License. + + "Legal Entity" shall mean the union of the acting entity and all + other entities that control, are controlled by, or are under common + control with that entity. For the purposes of this definition, + "control" means (i) the power, direct or indirect, to cause the + direction or management of such entity, whether by contract or + otherwise, or (ii) ownership of fifty percent (50%) or more of the + outstanding shares, or (iii) beneficial ownership of such entity. 
+ + "You" (or "Your") shall mean an individual or Legal Entity + exercising permissions granted by this License. + + "Source" form shall mean the preferred form for making modifications, + including but not limited to software source code, documentation + source, and configuration files. + + "Object" form shall mean any form resulting from mechanical + transformation or translation of a Source form, including but + not limited to compiled object code, generated documentation, + and conversions to other media types. + + "Work" shall mean the work of authorship, whether in Source or + Object form, made available under the License, as indicated by a + copyright notice that is included in or attached to the work + (an example is provided in the Appendix below). + + "Derivative Works" shall mean any work, whether in Source or Object + form, that is based on (or derived from) the Work and for which the + editorial revisions, annotations, elaborations, or other modifications + represent, as a whole, an original work of authorship. For the purposes + of this License, Derivative Works shall not include works that remain + separable from, or merely link (or bind by name) to the interfaces of, + the Work and Derivative Works thereof. + + "Contribution" shall mean any work of authorship, including + the original version of the Work and any modifications or additions + to that Work or Derivative Works thereof, that is intentionally + submitted to Licensor for inclusion in the Work by the copyright owner + or by an individual or Legal Entity authorized to submit on behalf of + the copyright owner. 
For the purposes of this definition, "submitted" + means any form of electronic, verbal, or written communication sent + to the Licensor or its representatives, including but not limited to + communication on electronic mailing lists, source code control systems, + and issue tracking systems that are managed by, or on behalf of, the + Licensor for the purpose of discussing and improving the Work, but + excluding communication that is conspicuously marked or otherwise + designated in writing by the copyright owner as "Not a Contribution." + + "Contributor" shall mean Licensor and any individual or Legal Entity + on behalf of whom a Contribution has been received by Licensor and + subsequently incorporated within the Work. + + 2. Grant of Copyright License. Subject to the terms and conditions of + this License, each Contributor hereby grants to You a perpetual, + worldwide, non-exclusive, no-charge, royalty-free, irrevocable + copyright license to reproduce, prepare Derivative Works of, + publicly display, publicly perform, sublicense, and distribute the + Work and such Derivative Works in Source or Object form. + + 3. Grant of Patent License. Subject to the terms and conditions of + this License, each Contributor hereby grants to You a perpetual, + worldwide, non-exclusive, no-charge, royalty-free, irrevocable + (except as stated in this section) patent license to make, have made, + use, offer to sell, sell, import, and otherwise transfer the Work, + where such license applies only to those patent claims licensable + by such Contributor that are necessarily infringed by their + Contribution(s) alone or by combination of their Contribution(s) + with the Work to which such Contribution(s) was submitted. 
If You + institute patent litigation against any entity (including a + cross-claim or counterclaim in a lawsuit) alleging that the Work + or a Contribution incorporated within the Work constitutes direct + or contributory patent infringement, then any patent licenses + granted to You under this License for that Work shall terminate + as of the date such litigation is filed. + + 4. Redistribution. You may reproduce and distribute copies of the + Work or Derivative Works thereof in any medium, with or without + modifications, and in Source or Object form, provided that You + meet the following conditions: + + (a) You must give any other recipients of the Work or + Derivative Works a copy of this License; and + + (b) You must cause any modified files to carry prominent notices + stating that You changed the files; and + + (c) You must retain, in the Source form of any Derivative Works + that You distribute, all copyright, patent, trademark, and + attribution notices from the Source form of the Work, + excluding those notices that do not pertain to any part of + the Derivative Works; and + + (d) If the Work includes a "NOTICE" text file as part of its + distribution, then any Derivative Works that You distribute must + include a readable copy of the attribution notices contained + within such NOTICE file, excluding those notices that do not + pertain to any part of the Derivative Works, in at least one + of the following places: within a NOTICE text file distributed + as part of the Derivative Works; within the Source form or + documentation, if provided along with the Derivative Works; or, + within a display generated by the Derivative Works, if and + wherever such third-party notices normally appear. The contents + of the NOTICE file are for informational purposes only and + do not modify the License. 
You may add Your own attribution + notices within Derivative Works that You distribute, alongside + or as an addendum to the NOTICE text from the Work, provided + that such additional attribution notices cannot be construed + as modifying the License. + + You may add Your own copyright statement to Your modifications and + may provide additional or different license terms and conditions + for use, reproduction, or distribution of Your modifications, or + for any such Derivative Works as a whole, provided Your use, + reproduction, and distribution of the Work otherwise complies with + the conditions stated in this License. + + 5. Submission of Contributions. Unless You explicitly state otherwise, + any Contribution intentionally submitted for inclusion in the Work + by You to the Licensor shall be under the terms and conditions of + this License, without any additional terms or conditions. + Notwithstanding the above, nothing herein shall supersede or modify + the terms of any separate license agreement you may have executed + with Licensor regarding such Contributions. + + 6. Trademarks. This License does not grant permission to use the trade + names, trademarks, service marks, or product names of the Licensor, + except as required for reasonable and customary use in describing the + origin of the Work and reproducing the content of the NOTICE file. + + 7. Disclaimer of Warranty. Unless required by applicable law or + agreed to in writing, Licensor provides the Work (and each + Contributor provides its Contributions) on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or + implied, including, without limitation, any warranties or conditions + of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A + PARTICULAR PURPOSE. You are solely responsible for determining the + appropriateness of using or redistributing the Work and assume any + risks associated with Your exercise of permissions under this License. + + 8. 
Limitation of Liability. In no event and under no legal theory, + whether in tort (including negligence), contract, or otherwise, + unless required by applicable law (such as deliberate and grossly + negligent acts) or agreed to in writing, shall any Contributor be + liable to You for damages, including any direct, indirect, special, + incidental, or consequential damages of any character arising as a + result of this License or out of the use or inability to use the + Work (including but not limited to damages for loss of goodwill, + work stoppage, computer failure or malfunction, or any and all + other commercial damages or losses), even if such Contributor + has been advised of the possibility of such damages. + + 9. Accepting Warranty or Additional Liability. While redistributing + the Work or Derivative Works thereof, You may choose to offer, + and charge a fee for, acceptance of support, warranty, indemnity, + or other liability obligations and/or rights consistent with this + License. However, in accepting such obligations, You may act only + on Your own behalf and on Your sole responsibility, not on behalf + of any other Contributor, and only if You agree to indemnify, + defend, and hold each Contributor harmless for any liability + incurred by, or claims asserted against, such Contributor by reason + of your accepting any such warranty or additional liability. + + END OF TERMS AND CONDITIONS + + APPENDIX: How to apply the Apache License to your work. + + To apply the Apache License to your work, attach the following + boilerplate notice, with the fields enclosed by brackets "[]" + replaced with your own identifying information. (Don't include + the brackets!) The text should be enclosed in the appropriate + comment syntax for the file format. We also recommend that a + file or class name and description of purpose be included on the + same "printed page" as the copyright notice for easier + identification within third-party archives. 
+ + Copyright [yyyy] [name of copyright owner] + + Licensed under the Apache License, Version 2.0 (the "License"); + you may not use this file except in compliance with the License. + You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + + Unless required by applicable law or agreed to in writing, software + distributed under the License is distributed on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + See the License for the specific language governing permissions and + limitations under the License. diff --git a/README.md b/README.md index efe1897..9a95fa5 100644 --- a/README.md +++ b/README.md @@ -1,3 +1,3 @@ # gowiki -Gowiki is a golang library to parse mediawiki markup as found in Wikipedia pages +Gowiki is a golang library to parse mediawiki markup as found in Wikipedia pages. diff --git a/gowiki.go b/gowiki.go new file mode 100644 index 0000000..654f056 --- /dev/null +++ b/gowiki.go @@ -0,0 +1,233 @@ +/* +Copyright (C) IBM Corporation 2015, Michele Franceschini + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
+*/ + +package gowiki + +import ( + "bytes" + // "errors" + // "fmt" + "html" + "regexp" + "strings" +) + +// var Debug bool = false +var DebugLevel int = 0 + +type Article struct { + MediaWiki string + Title string + Links []WikiLink + ExtLinks []string + Type string + AbstractText string + Media []WikiLink + Tokens []*Token + // OldTokens []*Token + Root *ParseNode + Parsed bool + Text string + TextLinks []FullWikiLink + Templates []*Template + + // unexported fields + gt bool + text *bytes.Buffer + nchar int + innerParseErrorCount int +} +type WikiLink struct { + Namespace string + PageName string + Anchor string +} +type FullWikiLink struct { + Link WikiLink + Text string + Start int // rune offset of beginning + End int // rune offset of end (index of the char after the last) +} + +type PageGetter interface { + Get(page WikiLink) (string, error) +} + +func NewArticle(title, text string) (*Article, error) { + a := new(Article) + a.Title = title + a.MediaWiki = text + a.Links = make([]WikiLink, 0, 16) + a.Media = make([]WikiLink, 0, 16) + a.TextLinks = make([]FullWikiLink, 0, 16) + a.ExtLinks = make([]string, 0, 16) + return a, nil +} + +func (a *Article) GetText() string { + if !a.gt { + a.genText() + } + return a.Text +} + +func (a *Article) GetAbstract() string { + if !a.gt { + a.genText() + } + return a.AbstractText +} + +func (a *Article) GetLinks() []WikiLink { + return a.Links +} + +func (a *Article) GetExternalLinks() []string { + return a.ExtLinks +} + +func (a *Article) GetMedia() []WikiLink { + return a.Media +} + +func (a *Article) GetTextLinks() []FullWikiLink { + if !a.gt { + a.genText() + } + return a.TextLinks +} + +var canoReSpaces = regexp.MustCompile(`[ _]+`) + +func WikiCanonicalFormEsc(l string, unescape bool) WikiLink { + return StandardNamespaces.WikiCanonicalFormNamespaceEsc(l, "", unescape) +} + +func WikiCanonicalForm(l string) WikiLink { + return StandardNamespaces.WikiCanonicalFormNamespaceEsc(l, "", true) +} + +func 
WikiCanonicalFormNamespace(l string, defaultNamespace string) WikiLink { + return StandardNamespaces.WikiCanonicalFormNamespaceEsc(l, defaultNamespace, true) +} + +func (namespaces Namespaces) WikiCanonicalFormNamespaceEsc(l string, defaultNamespace string, unescape bool) WikiLink { + hpos := strings.IndexRune(l, '#') + anchor := "" + if hpos >= 0 { + anchor = l[hpos+1:] + l = l[0:hpos] + } + i := strings.Index(l, ":") + namespace := defaultNamespace + if i >= 0 { + cns := strings.TrimSpace(canoReSpaces.ReplaceAllString(l[:i], " ")) + if unescape { + cns = html.UnescapeString(cns) + } + ns, ok := namespaces[strings.ToLower(cns)] + switch { + case ok && len(cns) > 0: + namespace = ns //strings.ToUpper(cns[0:1]) + strings.ToLower(cns[1:]) + case ok: + namespace = "" + default: + i = -1 + } + } + article := strings.TrimSpace(canoReSpaces.ReplaceAllString(l[i+1:], " ")) + anchor = canoReSpaces.ReplaceAllString(anchor, " ") + if unescape { + article = html.UnescapeString(article) + anchor = html.UnescapeString(anchor) + } + if len(article) > 0 { + article = strings.ToUpper(article[0:1]) + article[1:] + } + return WikiLink{Namespace: namespace, PageName: article, Anchor: anchor} +} + +func (wl *WikiLink) FullPagename() string { + if len(wl.Namespace) == 0 { + return wl.PageName + } + return wl.Namespace + ":" + wl.PageName +} + +func (wl *WikiLink) FullPagenameAnchor() string { + ns := "" + if len(wl.Namespace) != 0 { + ns = wl.Namespace + ":" + } + an := "" + if len(wl.Anchor) != 0 { + an = "#" + wl.Anchor + } + return ns + wl.PageName + an +} + +func (wl *WikiLink) IsImplicitSelfLink() bool { + return len(wl.PageName) == 0 +} + +func (wl *WikiLink) HasAnchor() bool { + return len(wl.Anchor) != 0 +} + +func (wl *WikiLink) GetAnchor() string { + return wl.Anchor +} + +type Namespaces map[string]string + +var StandardNamespaces Namespaces = map[string]string{ + "media": "Media", + "special": "Special", + "talk": "Talk", + "user": "User", + "user talk": "User talk", + 
"wikipedia": "Wikipedia", + "wikipedia talk": "Wikipedia talk", + "file": "File", + "file talk": "File talk", + "mediawiki": "MediaWiki", + "mediawiki talk": "MediaWiki talk", + "template": "Template", + "template talk": "Template talk", + "help": "Help", + "help talk": "Help talk", + "category": "Category", + "category talk": "Category talk", + "portal": "Portal", + "portal talk": "Portal talk", + "book": "Book", + "book talk": "Book talk", + "draft": "Draft", + "draft talk": "Draft talk", + "education program": "Education Program", + "education program talk": "Education Program talk", + "timedtext": "TimedText", + "timedtext talk": "TimedText talk", + "module": "Module", + "module talk": "Module talk", + "topic": "Topic", +} + +type DummyPageGetter struct{} + +func (g *DummyPageGetter) Get(wl WikiLink) (string, error) { + return "", nil +} diff --git a/gowiki_test.go b/gowiki_test.go new file mode 100644 index 0000000..69ae2e5 --- /dev/null +++ b/gowiki_test.go @@ -0,0 +1,46 @@ +/* +Copyright (C) IBM Corporation 2015, Michele Franceschini + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
+*/ + +package gowiki + +import ( + "encoding/json" + // "os" + // "strings" + "testing" +) + +func TestParseArticle(t *testing.T) { + mw := "* ''[[The Album (ABBA album)|''The Album'']]'' (1977)" + t.Log(mw) + a, err := ParseArticle("Test", mw, &DummyPageGetter{}) + if err != nil { + t.Error("Error:", err) + } + b, err := json.MarshalIndent(a.Tokens, "", "\t") + if err != nil { + t.Error("Error:", err) + } + t.Log("Tokens\n") + t.Log(string(b)) +} + +func TestWikiCanonicalFormNamespaceEsc(t *testing.T) { + wl := StandardNamespaces.WikiCanonicalFormNamespaceEsc("WiKIpEdia:pagename#section", "", true) + if wl.Namespace != "Wikipedia" || wl.PageName != "Pagename" || wl.Anchor != "section" { + t.Error("Error: wikilink not parsed correctly", wl) + } +} diff --git a/parse.go b/parse.go new file mode 100644 index 0000000..e3602a1 --- /dev/null +++ b/parse.go @@ -0,0 +1,636 @@ +/* +Copyright (C) IBM Corporation 2015, Michele Franceschini + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +*/ + +package gowiki + +import ( + "errors" + "fmt" + "html" + "log" + "strconv" + "strings" +) + +const maxInnerParseErrorCount = 100 + +type ParseNode struct { + NType string + NSubType string + Link WikiLink + Contents string + Flags int + Nodes []*ParseNode +} + +func (a *Article) PrintParseTree() { + a.printParseTree(a.Root, 0) +} + +func (a *Article) printParseTree(root *ParseNode, depth int) { + if depth > 20 { + return + } + spaces := "......................................" 
+ min := len(spaces) + if depth < len(spaces) { + min = depth + } + if depth < 0 { + min = 0 + } + prefix := spaces[0:min] + for _, n := range root.Nodes { + fmt.Printf("%s NType: %10s NSubType: %10s Contents: %16s Flags: %d\n", prefix, n.NType, n.NSubType, n.Contents, n.Flags) + if len(n.Nodes) > 0 { + a.printParseTree(n, depth+1) + } + } +} + +const ( + TClosed int = 1 << iota +) + +const ( + QS_none int = iota + QS_i + QS_b + QS_ib + QS_bi +) + +func ParseArticle(title, text string, g PageGetter) (*Article, error) { + a, err := NewArticle(title, text) + if err != nil { + return nil, err + } + a.Tokens, err = a.Tokenize(a.MediaWiki, g) + if err != nil { + return a, err + } + err = a.parse() + if err != nil { + return a, err + } + a.gt = false + return a, nil +} + +func (a *Article) doQuotes() { + log.SetFlags(log.Lshortfile) // | log.Ldate | log.Ltime) + state := QS_none + save := QS_none + l := 0 + ni := 0 + tn := make([]*Token, 0, len(a.Tokens)) + t := a.Tokens + for ; ni < len(t); ni++ { + // log.Println(*t[ni]) + + if t[ni].TType == "quote" { + l++ + // log.Println(l) + } + if t[ni].TType != "quote" || ni == len(t)-1 { + switch { + case l == 0: + // log.Println(l) + case l == 1: + // log.Println(l) + tn = append(tn, &Token{TText: "'", TType: "text"}) + case l == 2: + // log.Println(l) + switch state { + case QS_b: + tn = append(tn, &Token{TType: "html", TText: "i"}) + state = QS_bi + case QS_i: + tn = append(tn, &Token{TType: "html", TText: "/i"}) + state = QS_none + case QS_bi: + tn = append(tn, &Token{TType: "html", TText: "/i"}) + state = QS_b + case QS_ib: + tn = append(tn, &Token{TType: "html", TText: "/b"}) + tn = append(tn, &Token{TType: "html", TText: "/i"}) + tn = append(tn, &Token{TType: "html", TText: "b"}) + state = QS_b + case QS_none: + tn = append(tn, &Token{TType: "html", TText: "i"}) + state = QS_i + } + case l == 3, l == 4: + // log.Println(l) + if l == 4 { + tn = append(tn, &Token{TText: "'", TType: "text"}) + } + switch state { + case 
QS_b: + tn = append(tn, &Token{TType: "html", TText: "/b"}) + state = QS_none + case QS_i: + tn = append(tn, &Token{TType: "html", TText: "b"}) + state = QS_ib + case QS_ib: + tn = append(tn, &Token{TType: "html", TText: "/b"}) + state = QS_i + case QS_bi: + tn = append(tn, &Token{TType: "html", TText: "/i"}) + tn = append(tn, &Token{TType: "html", TText: "/b"}) + tn = append(tn, &Token{TType: "html", TText: "i"}) + state = QS_i + case QS_none: + tn = append(tn, &Token{TType: "html", TText: "b"}) + state = QS_b + } + case l >= 5: + // log.Println(l) + s := "" + for i := 5; i < l; i++ { + s += "'" + } + if len(s) > 0 { + tn = append(tn, &Token{TText: s, TType: "text"}) + } + switch state { + case QS_b: + tn = append(tn, &Token{TType: "html", TText: "/b"}) + tn = append(tn, &Token{TType: "html", TText: "i"}) + state = QS_i + case QS_i: + tn = append(tn, &Token{TType: "html", TText: "/i"}) + tn = append(tn, &Token{TType: "html", TText: "b"}) + state = QS_b + case QS_ib: + tn = append(tn, &Token{TType: "html", TText: "/b"}) + tn = append(tn, &Token{TType: "html", TText: "/i"}) + state = QS_none + case QS_bi: + tn = append(tn, &Token{TType: "html", TText: "/i"}) + tn = append(tn, &Token{TType: "html", TText: "/b"}) + state = QS_none + case QS_none: + tn = append(tn, &Token{TType: "html", TText: "b"}) + tn = append(tn, &Token{TType: "html", TText: "i"}) + state = QS_bi + } + } + l = 0 + } + + if t[ni].TType == "link" || t[ni].TType == "extlink" || t[ni].TType == "filelink" { + // log.Println(l) + save = state + switch state { + case QS_b: + tn = append(tn, &Token{TType: "html", TText: "/b"}) + case QS_i: + tn = append(tn, &Token{TType: "html", TText: "/i"}) + case QS_ib: + tn = append(tn, &Token{TType: "html", TText: "/b"}) + tn = append(tn, &Token{TType: "html", TText: "/i"}) + case QS_bi: + tn = append(tn, &Token{TType: "html", TText: "/i"}) + tn = append(tn, &Token{TType: "html", TText: "/b"}) + } + state = QS_none + l = 0 + } + if t[ni].TType == "closelink" || 
t[ni].TType == "closeextlink" || t[ni].TType == "closefilelink" { + // log.Println(l) + switch state { + case QS_b: + tn = append(tn, &Token{TType: "html", TText: "/b"}) + case QS_i: + tn = append(tn, &Token{TType: "html", TText: "/i"}) + case QS_ib: + tn = append(tn, &Token{TType: "html", TText: "/b"}) + tn = append(tn, &Token{TType: "html", TText: "/i"}) + case QS_bi: + tn = append(tn, &Token{TType: "html", TText: "/i"}) + tn = append(tn, &Token{TType: "html", TText: "/b"}) + } + state = save + save = QS_none + l = 0 + } + + if t[ni].TType != "quote" && t[ni].TType != "newline" { + // log.Println(l) + tn = append(tn, t[ni]) + } + if t[ni].TType == "newline" || ni == len(t)-1 { + // log.Println(l) + switch state { + case QS_b: + tn = append(tn, &Token{TType: "html", TText: "/b"}) + case QS_i: + tn = append(tn, &Token{TType: "html", TText: "/i"}) + case QS_ib: + tn = append(tn, &Token{TType: "html", TText: "/b"}) + tn = append(tn, &Token{TType: "html", TText: "/i"}) + case QS_bi: + tn = append(tn, &Token{TType: "html", TText: "/i"}) + tn = append(tn, &Token{TType: "html", TText: "/b"}) + } + state = QS_none + l = 0 + save = QS_none + } + if t[ni].TType == "newline" { + // log.Println(l) + tn = append(tn, t[ni]) + } + + } + a.Tokens = tn + // a.OldTokens = t +} + +//nowiki, wikipre, pre, math, quote, colon, magic, h?, *, #, ;, :, html, +func (a *Article) parse() error { + a.doQuotes() + nodes, err := a.internalParse(a.Tokens) + if err != nil { + return err + } + root := &ParseNode{NType: "root", Nodes: nodes} + a.Root = root + a.Parsed = true + return nil +} +func isImage(t *Token) bool { + return strings.ToLower(t.TLink.Namespace) == "file" +} + +func (a *Article) internalParse(t []*Token) ([]*ParseNode, error) { + ti := 0 + nl := make([]*ParseNode, 0, 0) + lastti := -1 + for ti < len(t) { + if ti == lastti { + // fmt.Println(len(t), ti, *t[ti], *t[ti-1], *t[ti+1]) + return nil, errors.New("parsing issue") + } + lastti = ti + switch t[ti].TType { + case "nowiki": + 
n := &ParseNode{NType: "text", NSubType: "nowiki", Contents: html.UnescapeString(t[ti].TText)} + nl = append(nl, n) + ti++ + /* case "curlyblock": + n := &ParseNode{NType: "curly", Contents: t[ti].TText} + nl = append(nl, n) + ti++ */ + case "text": + n := &ParseNode{NType: "text", Contents: html.UnescapeString(t[ti].TText)} + nl = append(nl, n) + ti++ + case "math": + n := &ParseNode{NType: "math", Contents: t[ti].TText} + nl = append(nl, n) + ti++ + case "pre": + n2 := &ParseNode{NType: "text", NSubType: "pre", Contents: html.UnescapeString(t[ti].TText)} + n1 := &ParseNode{NType: "html", NSubType: "pre", Contents: t[ti].TAttr, Nodes: []*ParseNode{n2}} + nl = append(nl, n1) + ti++ + case "nop": + ti++ + case "wikipre": + closebefore := len(t) + ni := ti + 1 + for ; ni < len(t)-1; ni++ { + if t[ni].TType == "newline" { + if t[ni+1].TType == "wikipre" { + t[ni+1].TType = "nop" + } else { + closebefore = ni + break + } + } + } + if closebefore <= ni+1 { + n := &ParseNode{NType: "html", NSubType: "pre"} + nl = append(nl, n) + ti++ + } else { + nodes, err := a.internalParse(t[ti+1 : closebefore]) + if err != nil { + return nil, err + } + n := &ParseNode{NType: "html", NSubType: "pre", Nodes: nodes} + nl = append(nl, n) + ti = closebefore + } + case "extlink": + ni := ti + 1 + for ; ni < len(t); ni++ { + if t[ni].TType == "closeextlink" { + break + } + } + if ni == len(t) { + return nil, errors.New("Unmatched external link token for link: " + t[ti].TText) + } + n := &ParseNode{NType: "extlink", NSubType: "", Contents: t[ti].TText} + a.ExtLinks = append(a.ExtLinks, t[ti].TText) + if ni > ti+1 { + nodes, err := a.internalParse(t[ti+1 : ni]) + if err != nil { + return nil, err + } + n.Nodes = nodes + } + nl = append(nl, n) + ti = ni + 1 + + case "closeextlink": + return nil, errors.New("Unmatched close external link token") + case "hrule": + n := &ParseNode{NType: "html", NSubType: "hr"} + nl = append(nl, n) + ti++ + case "magic": + n := &ParseNode{NType: "magic", 
Contents: t[ti].TText} + nl = append(nl, n) + ti++ + case "colon": + n := &ParseNode{NType: "text", Contents: ":"} + nl = append(nl, n) + ti++ + case "space": + n := &ParseNode{NType: "space", Contents: " "} + nl = append(nl, n) + ti++ + case "blank": + n := &ParseNode{NType: "break"} + nl = append(nl, n) + ti++ + case "redirect": + ni := ti + 1 + for ; ni < len(t); ni++ { + if t[ni].TType == "newline" { + break + } + if t[ni].TType == "link" { + break + } + } + if ni == len(t) || t[ni].TType == "newline" { + n := &ParseNode{NType: "text", Contents: html.UnescapeString(t[ti].TText)} + nl = append(nl, n) + ti++ + } else { + n := &ParseNode{NType: "redirect", Link: t[ni].TLink, NSubType: t[ni].TAttr} + nl = append(nl, n) + ti++ + } + case "link": + ni := ti + 1 + nopen := 1 + for ; ni < len(t); ni++ { + switch t[ni].TType { + case "link": + nopen++ + case "closelink": + nopen-- + } + if nopen == 0 { + break + } + } + if ni == len(t) { + return nil, errors.New("Unmatched link token for link: " + t[ti].TLink.PageName + " namespace: " + t[ti].TLink.Namespace) + } + var n *ParseNode + n = &ParseNode{NType: "link", Link: t[ti].TLink} + a.Links = append(a.Links, t[ti].TLink) + if ni > ti+1 { + nodes, err := a.internalParse(t[ti+1 : ni]) + if err != nil { + return nil, err + } + n.Nodes = nodes + } + nl = append(nl, n) + ti = ni + 1 + case "filelink": + ni := ti + 1 + nopen := 1 + for ; ni < len(t); ni++ { + switch t[ni].TType { + case "filelink": + nopen++ + case "closefilelink": + nopen-- + } + if nopen == 0 { + break + } + } + if ni == len(t) { + return nil, errors.New("Unmatched filelink token for filelink: " + t[ti].TLink.PageName + " namespace: " + t[ti].TLink.Namespace) + } + var n *ParseNode + n = &ParseNode{NType: "image", Link: t[ti].TLink} + a.Media = append(a.Media, t[ti].TLink) + if ni > ti+1 { + nodes, err := a.internalParse(t[ti+1 : ni]) + if err != nil { + return nil, err + } + n.Nodes = nodes + } + nl = append(nl, n) + ti = ni + 1 + + case "closelink": + 
return nil, errors.New("Unmatched close link token") + case "closefilelink": + return nil, errors.New("Unmatched close file link token") + case "html": + tag := strings.ToLower(t[ti].TText) + if tag[0] == '/' { + ti++ + continue + } + n := &ParseNode{NType: "html", NSubType: tag, Contents: t[ti].TAttr} + if t[ti].TClosed == true { + flags := TClosed + n.Flags = flags + nl = append(nl, n) + ti++ + continue + } + ni := ti + 1 + nopen := 1 + for ; ni < len(t); ni++ { + if t[ni].TType == "html" { + ntag := strings.ToLower(t[ni].TText) + switch ntag { + case tag: + nopen++ + case "/" + tag: + nopen-- + } + if nopen == 0 { + break + } + } + } + if ni > ti+1 { + nodes, err := a.internalParse(t[ti+1 : ni]) + if err != nil { + a.innerParseErrorCount++ + if a.innerParseErrorCount >= maxInnerParseErrorCount { + return nil, err + } + ti++ + continue + } + n.Nodes = nodes + } + nl = append(nl, n) + ti = ni + 1 + if ti > len(t) { + ti = len(t) + } + case "*", "#", ";", ":": + ti += 1 + /* stack := "" + si := 0 + ni := ti + ln := &ParseNode{NType: "root", Nodes: make([]*ParseNode, 0, 4)} + for { + + this := "" + islist := false + for ; ni < len(t); ni++ { + switch t[ni].TType { + case "*", "#", ";", ":": + islist = true + } + if islist { + this += t[ni].TType + } else { + break + } + } + same := 0 + for i := 0; i < len(this) && i < len(stack); i++ { + if this[i] == stack[i] || + (this[i] == ';' && stack[i] == ':') || + (this[i] == ':' && stack[i] == ';') { + same++ + } else { + break + } + } + n := ln + for i := 0; i < same; i++ { + n = n.Nodes[len(n.Nodes)-1] + n = n.Nodes[len(n.Nodes)-1] + } + + for i := same; i < len(this); i++ { //open + var nn *ParseNode + switch this[i] { + case '*': + nn = &ParseNode{NType: "html", NSubType: "ul"} + case '#': + nn = &ParseNode{NType: "html", NSubType: "ol"} + case ';': + nn = &ParseNode{NType: "html", NSubType: "dl"} + case ':': + nn = &ParseNode{NType: "html", NSubType: "dl"} + } + nn.Nodes = make([]*ParseNode, 0, 1) + n.Nodes = 
append(n.Nodes, nn) + n = nn + if i < len(this)-1 { + var elem *ParseNode + switch this[len] { + case '*', '#': + elem = &ParseNode{NType: "html", NSubType: "li"} + case ';': + elem = &ParseNode{NType: "html", NSubType: "dt"} + case ':': + elem = &ParseNode{NType: "html", NSubType: "dd"} + } + elem.Nodes = make([]*ParseNode, 0, 1) + n.Nodes = append(n.Nodes, elem) + n = elem + } + } + var nitem *ParseNode + switch this[len] { + case '*', '#': + nitem = &ParseNode{NType: "html", NSubType: "li"} + case ';': + nitem = &ParseNode{NType: "html", NSubType: "dt"} + case ':': + nitem = &ParseNode{NType: "html", NSubType: "dd"} + } + n := &ParseNode{NType: "html", NSubType: st} + nl = append(nl, n) + + } */ + case "newline": + n := &ParseNode{NType: "text", Contents: "\n"} + nl = append(nl, n) + ti++ + case "h1", "h2", "h3", "h4", "h5", "h6": + ni := ti + 1 + for ; ni < len(t); ni++ { + if t[ni].TType == "newline" { + break + } + } + if ni == len(t) { + return nil, errors.New("No newline after heading") + } + n := &ParseNode{NType: "html", NSubType: t[ti].TType} + if ni > ti+1 { + nodes, err := a.internalParse(t[ti+1 : ni]) + if err != nil { + return nil, err + } + n.Nodes = nodes + } + nl = append(nl, n) + ti = ni + 1 + case "tb", "te": + templateIndex, err := strconv.Atoi(t[ti].TText) + if err != nil { + return nil, errors.New("Malformed tb token") + } + if templateIndex >= len(a.Templates) { + return nil, errors.New("Template index out of range") + //fmt.Println("Template index out of range", t[ti]) + } else { + n := &ParseNode{NType: t[ti].TType, Contents: a.Templates[templateIndex].Name} + nl = append(nl, n) + } + ti++ + + default: + return nil, errors.New("Unrecognized token type: " + t[ti].TType) + } + } + return nl, nil +} diff --git a/redirect.go b/redirect.go new file mode 100644 index 0000000..5bf9d9a --- /dev/null +++ b/redirect.go @@ -0,0 +1,39 @@ +/* +Copyright (C) IBM Corporation 2015, Michele Franceschini + +Licensed under the Apache License, Version 2.0 
(the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +*/ + +package gowiki + +import "strings" + +func (a *Article) checkRedirect(mw string) (bool, *WikiLink) { + if len(mw) < 9 || strings.ToLower(mw[0:9]) != "#redirect" { + return false, nil + } + idx := strings.Index(mw, "\n") + if idx < 0 { + idx = len(mw) + } + nnt, err := a.parseInlineText(mw, 9, idx) + if err != nil { + return false, nil + } + for _, t := range nnt { + if t.TType == "link" { + return true, &t.TLink + } + } + return false, nil +} diff --git a/simple.go b/simple.go new file mode 100644 index 0000000..c35a641 --- /dev/null +++ b/simple.go @@ -0,0 +1,27 @@ +/* +Copyright (C) IBM Corporation 2015, Michele Franceschini + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
// findCurlyStreaks scans mw for maximal runs of two or more identical
// '{' or '}' bytes and returns their [begin, end) byte offsets in order
// of appearance. Runs of length one and runs of any other character are
// ignored. The result is always non-nil.
func findCurlyStreaks(mw string) [][]int {
	streaks := [][]int{}
	cur := '.'    // character of the run currently being tracked ('.' = none yet)
	runStart := 0 // byte offset where that run began
	for i, r := range mw {
		if r == cur {
			continue
		}
		// The run ended at i: record it if it was a brace run of length >= 2.
		if i-runStart > 1 && (cur == '{' || cur == '}') {
			streaks = append(streaks, []int{runStart, i})
		}
		runStart = i
		cur = r
	}
	// Flush a trailing brace run, if any.
	if runStart < len(mw)-1 && (cur == '{' || cur == '}') {
		streaks = append(streaks, []int{runStart, len(mw)})
	}
	return streaks
}
// findTemplateParamPos locates the parameter separators of template t
// inside the full source string mw. It returns one entry per top-level
// '|' pipe: the first int is the pipe's byte offset, and a second int,
// when present, is the offset of the first '=' after it (a named
// parameter). Pipes and equals inside child templates or inside
// [[...]] wiki links are skipped.
func findTemplateParamPos(mw string, t *template) [][]int { //first is position of pipe, second is position of first equal
	out := make([][]int, 0, 1)
	inChildTemplate := false
	inlink := false
	// lastopen/lastclosed remember whether the previous rune was '['/']'
	// so that "[[" and "]]" can be recognized one rune at a time.
	lastopen := false
	lastclosed := false
	for i, rv := range mw[t.b:t.e] {
		inChildTemplate = false
		open := false
		closed := false
		// Ignore anything inside a nested template's span.
		for _, ct := range t.children {
			if i+t.b >= ct.b && i+t.b < ct.e {
				inChildTemplate = true
				break
			}
		}
		if !inChildTemplate {
			switch {
			case rv == '[':
				// Second '[' in a row opens a wiki link.
				if lastopen {
					inlink = true
				}
				open = true
			case rv == ']':
				// Second ']' in a row closes the wiki link.
				if lastclosed {
					inlink = false
				}
				closed = true
			case rv == '|' && !inlink:
				// Top-level pipe: start a new parameter entry.
				out = append(out, []int{i + t.b})
			case rv == '=' && len(out) > 0 && len(out[len(out)-1]) == 1 && !inlink:
				// First '=' after the most recent pipe marks a named parameter.
				out[len(out)-1] = append(out[len(out)-1], i+t.b)
			}
		}
		lastopen = open
		lastclosed = closed
	}
	return out
}
+ return string(out), tokens +} */ + +func (a *Article) processTemplates(mws string, tokens map[string]*Token, g PageGetter) (string, map[string]*Token) { + //strip nowiki noinclude etc here + // mws := a.stripComments(mw) + // mws = a.stripNoinclude(mws) + + // fmt.Println(mws) + mlt := findTemplates(mws) + + last := 0 + out := make([]byte, 0, len(mws)) + for i, t := range mlt { + // fmt.Println("Process templates:", *t) + sb := fmt.Sprintf("\x07tb%05d", i) + se := fmt.Sprintf("\x07te%05d", i) + tn, pm := a.renderInnerTemplates(mws, t, nil, g, 0) + a.addTemplate(tn, pm) + out = append(out, []byte(mws[last:t.b])...) + out = append(out, []byte(sb+t.rt+se)...) + last = t.e + tokens[sb] = &Token{ + TText: fmt.Sprintf("%d", i), + TType: "tb", + } + tokens[se] = &Token{ + TText: fmt.Sprintf("%d", i), + TType: "te", + } + } + out = append(out, []byte(mws[last:])...) + + //unstrip here + + return string(out), tokens +} + +func (a *Article) addTemplate(tn string, pm map[string]string) { + outT := Template{Parameters: pm} + base, attr, typ, _ := detectTemplateType(tn) + outT.Typ = typ + outT.Name = base + outT.Attr = attr + a.Templates = append(a.Templates, &outT) + return +} + +func (a *Article) renderTemplate(mw string, t *template) string { + pp := findTemplateParamPos(mw, t) + n := 2 + if t.isparam { + n = 3 + } + var tn string + if len(pp) > 0 { + tn = fmt.Sprint(strings.TrimSpace(mw[t.b+n : pp[0][0]])) + } else { + tn = fmt.Sprint(strings.TrimSpace(mw[t.b+n : t.e-n])) + } + pm := make(map[string]string, len(pp)) + pp = append(pp, []int{t.e - n}) + for i := 0; i < len(pp)-1; i++ { + var name string + var param string + if len(pp[i]) > 1 { //named param + name = fmt.Sprint(strings.TrimSpace(mw[pp[i][0]+1 : pp[i][1]])) + param = fmt.Sprint(strings.TrimSpace(mw[pp[i][1]+1 : pp[i+1][0]])) + } else { + name = fmt.Sprint(i + 1) + param = fmt.Sprint(strings.TrimSpace(mw[pp[i][0]+1 : pp[i+1][0]])) + } + pm[name] = param + } + + outT := Template{Parameters: pm} + base, attr, 
typ, text := detectTemplateType(tn) + switch { + case t.isparam: + outT.Typ = "param" + outT.Name = tn + text = "" + default: + outT.Typ = typ + outT.Name = base + outT.Attr = attr + } + a.Templates = append(a.Templates, &outT) + return text +} + +func detectTemplateType(tn string) (string, string, string, string) { + index := strings.Index(tn, ":") + var base string + var attr string + if index > 0 { + base = strings.TrimSpace(tn[:index]) + attr = strings.TrimSpace(tn[index+1:]) + } else { + base = tn + } + _, ok := MagicMap[base] + if ok { + return base, attr, "magic", "" + } + + return tn, "", "normal", "" +} + +type TemplateRenderer func(name, mw string, params map[string]string) string + +var MagicMap map[string]TemplateRenderer = map[string]TemplateRenderer{ + "DISPLAYTITLE": nil, +} + +var noHashFunctionsMap map[string]bool = map[string]bool{ + "displaytitle": true, + "formatdate": true, + "int": true, + "namespace": true, + "pagesinnamespace": true, + "speciale": true, + "special": true, + "tag": true, + "anchorencode": true, "basepagenamee": true, "basepagename": true, "canonicalurle": true, + "canonicalurl": true, "cascadingsources": true, "defaultsort": true, "filepath": true, + "formatnum": true, "fullpagenamee": true, "fullpagename": true, "fullurle": true, + "fullurl": true, "gender": true, "grammar": true, "language": true, + "lcfirst": true, "lc": true, "localurle": true, "localurl": true, + "namespacee": true, "namespacenumber": true, "nse": true, "ns": true, + "numberingroup": true, "numberofactiveusers": true, "numberofadmins": true, "numberofarticles": true, + "numberofedits": true, "numberoffiles": true, "numberofpages": true, "numberofusers": true, + "numberofviews": true, "padleft": true, "padright": true, "pageid": true, + "pagenamee": true, "pagename": true, "pagesincategory": true, "pagesize": true, + "plural": true, "protectionlevel": true, "revisionday2": true, "revisionday": true, + "revisionid": true, "revisionmonth1": true, 
"revisionmonth": true, "revisiontimestamp": true, + "revisionuser": true, "revisionyear": true, "rootpagenamee": true, "rootpagename": true, + "subjectpagenamee": true, "subjectpagename": true, "subjectspacee": true, "subjectspace": true, + "subpagenamee": true, "subpagename": true, "talkpagenamee": true, "talkpagename": true, + "talkspacee": true, "talkspace": true, "ucfirst": true, "uc": true, + "urlencode": true, +} +var variablesMap map[string]bool = map[string]bool{ + "articlepath": true, + "basepagenamee": true, + "basepagename": true, + "cascadingsources": true, + "contentlanguage": true, + "currentday2": true, + "currentdayname": true, + "currentday": true, + "currentdow": true, + "currenthour": true, + "currentmonth1": true, + "currentmonthabbrev": true, + "currentmonthnamegen": true, + "currentmonthname": true, + "currentmonth": true, + "currenttimestamp": true, + "currenttime": true, + "currentversion": true, + "currentweek": true, + "currentyear": true, + "directionmark": true, + "fullpagenamee": true, + "fullpagename": true, + "localday2": true, + "localdayname": true, + "localday": true, + "localdow": true, + "localhour": true, + "localmonth1": true, + "localmonthabbrev": true, + "localmonthnamegen": true, + "localmonthname": true, + "localmonth": true, + "localtimestamp": true, + "localtime": true, + "localweek": true, + "localyear": true, + "namespacee": true, + "namespacenumber": true, + "namespace": true, + "numberofactiveusers": true, + "numberofadmins": true, + "numberofarticles": true, + "numberofedits": true, + "numberoffiles": true, + "numberofpages": true, + "numberofusers": true, + "numberofviews": true, + "pageid": true, + "pagenamee": true, + "pagename": true, + "revisionday2": true, + "revisionday": true, + "revisionid": true, + "revisionmonth1": true, + "revisionmonth": true, + "revisionsize": true, + "revisiontimestamp": true, + "revisionuser": true, + "revisionyear": true, + "rootpagenamee": true, + "rootpagename": true, + 
"scriptpath": true, + "servername": true, + "server": true, + "sitename": true, + "stylepath": true, + "subjectpagenamee": true, + "subjectpagename": true, + "subjectspacee": true, + "subjectspace": true, + "subpagenamee": true, + "subpagename": true, + "talkpagenamee": true, + "talkpagename": true, + "talkspacee": true, + "talkspace": true, +} + +func (a *Article) renderTemplateMagic(name string, params map[string]string) string { + return "" +} + +func (a *Article) renderTemplateExt(name string, params map[string]string) string { + return "" +} + +func (a *Article) renderTemplateRecursive(name string, params map[string]string, g PageGetter, depth int) string { + if depth > 4 { + return "" + } + //name and parameters have already been substituted so they are guarranteed not to contain any template + + //establish the type of template + switch templateType(name) { + case "magic": + return a.renderTemplateMagic(name, params) + case "ext": + return a.renderTemplateExt(name, params) + } + //case "normal" + //based on the type of template + //for the name and each parameter, find templates and substite them in the proper order + mw, err := g.Get(WikiCanonicalFormNamespace(name, "Template")) + if err != nil { + fmt.Fprintln(os.Stderr, "Title:", a.Title, " Error retrieving:", name, " ->", err) + return "" + } + return a.TranscludeTemplatesRecursive(mw, params, g, depth) +} + +func (a *Article) TranscludeTemplatesRecursive(mw string, params map[string]string, g PageGetter, depth int) string { + var mws string + followed := 0 + for { + if followed > 4 { + return "" + } + //strip nowiki noinclude etc here + mws := a.stripComments(mw) + isRedirect, redirect := a.checkRedirect(mws) + if !isRedirect { + break + } + var err error + mw, err = g.Get(*redirect) + if err != nil { + return "" + } + followed++ + } + mws = a.stripNoinclude(mws) + + // fmt.Println(ds[depth], "TranscludeTemplatesRecursive", mws) + mlt := findTemplates(mws) + + last := 0 + out := make([]byte, 0, 
len(mws)) + for _, t := range mlt { + a.renderInnerTemplates(mws, t, params, g, depth) + out = append(out, []byte(mws[last:t.b])...) + out = append(out, []byte(t.rt)...) + last = t.e + } + out = append(out, []byte(mws[last:])...) + + //unstrip here + + return string(out) +} + +var ds []string = []string{" ", " ", " ", " ", " ", " "} + +func (a *Article) renderInnerTemplates(mws string, t *template, params map[string]string, g PageGetter, depth int) (string, map[string]string) { + // render inner templates first + // fmt.Println(ds[depth], *t, "\n", ds[depth], "Template:\n", ds[depth], mws[t.b:t.e]) + for _, it := range t.children { + if !it.rendered { + a.renderInnerTemplates(mws, it, params, g, depth) + } + } + // fmt.Println(ds[depth], "Working on", mws[t.b:t.e]) + pp := findTemplateParamPos(mws, t) //position of the pipes for this template + // fmt.Println(ds[depth], "pp:", pp) + + n := 2 + if t.isparam { + n = 3 + } + pp = append(pp, []int{t.e - n}) + + var mw string + var tb int + // var te int + if len(t.children) == 0 { + // fmt.Println(ds[depth], "No nested templates in", mws[t.b:t.e]) + mw = mws + tb = t.b + // te = t.e + } else { + // fmt.Println(ds[depth], "Nested templates: fixing pp") + //substitute the strings and update pp + tci := 0 + ioff := t.children[tci].b + tb = 0 + mw = mws[t.b:ioff] + // fmt.Println(*t) + ooff := -t.b + ppi0 := 0 + ppi1 := 0 + for ppi0 < len(pp) { + // fmt.Println(mws) + // fmt.Println(len(mws), tci, ioff, ooff, ppi0, ppi1, pp) + if pp[ppi0][ppi1] <= ioff { + pp[ppi0][ppi1] += ooff + ppi1++ + if ppi1 >= len(pp[ppi0]) { + ppi0++ + ppi1 = 0 + } + } else { + mw += t.children[tci].rt + ooff += len(t.children[tci].rt) - (t.children[tci].e - t.children[tci].b) + teoff := t.children[tci].e + tci++ + if tci >= len(t.children) { + ioff = t.e + } else { + ioff = t.children[tci].b + } + // fmt.Println(ds[depth], tci, teoff, ioff) + mw += mws[teoff:ioff] + } + } + // te = len(mw) + } + // fmt.Println("len(mw):", len(mw), "mw:", mw, 
"\npp:", pp) + var tn string + if len(pp) > 1 { + tn = fmt.Sprint(strings.TrimSpace(mw[tb+n : pp[0][0]])) + } else { + tn = fmt.Sprint(strings.TrimSpace(mw[tb+n : pp[len(pp)-1][0]])) + } + + t.rendered = true + if t.isparam { //it's a parameter substitution + text, ok := params[tn] + if ok { + t.rt = text + return "", nil + } + if len(pp) == 1 { //no default + t.rt = "{{{" + tn + "}}}" + return "", nil + } + t.rt = mw[pp[0][0]+1 : pp[len(pp)-1][0]] + return "", nil + } + pm := make(map[string]string, len(pp)) + for i := 0; i < len(pp)-1; i++ { + var name string + var param string + if len(pp[i]) > 1 { //named param + name = fmt.Sprint(strings.TrimSpace(mw[pp[i][0]+1 : pp[i][1]])) + param = fmt.Sprint(strings.TrimSpace(mw[pp[i][1]+1 : pp[i+1][0]])) + } else { + name = fmt.Sprint(i + 1) + param = fmt.Sprint(strings.TrimSpace(mw[pp[i][0]+1 : pp[i+1][0]])) + } + pm[name] = param + } + t.rt = a.renderTemplateRecursive(tn, pm, g, depth+1) + return tn, pm +} + +func templateType(tn string) string { + index := strings.Index(tn, ":") + tns := strings.TrimSpace(tn) + var base string + // var attr string + if index > 0 { + base = strings.TrimSpace(tn[:index]) + // attr = strings.TrimSpace(tn[index+1:]) + } else { + base = tns + } + base = strings.ToLower(base) + _, ok1 := noHashFunctionsMap[base] + _, ok2 := variablesMap[base] + if ok1 || ok2 { + return "magic" + } + if strings.HasPrefix(tns, "#") { + return "ext" + } + return "normal" +} + +var noincludeRe = regexp.MustCompile(`(?isU).*(?:|\z)`) +var includeonlyRe = regexp.MustCompile(`(?isU)(.*)(?:|\z)`) + +func (a *Article) stripNoinclude(mw string) string { + mwni := noincludeRe.ReplaceAllLiteralString(mw, "") + ssl := includeonlyRe.FindAllStringSubmatch(mwni, -1) + if len(ssl) == 0 { + return mwni + } + sl := make([]string, 0, len(ssl)) + for _, s := range ssl { + sl = append(sl, s[1]) + } + return strings.Join(sl, "") +} diff --git a/text.go b/text.go new file mode 100644 index 0000000..7f16961 --- /dev/null +++ 
// genTextInternal walks the parse tree rooted at root and appends plain
// text to the Article's text buffer (via appendText, which also tracks
// the rune count in a.nchar). It records the character span of every
// link node in a.TextLinks and captures a.AbstractText at the first
// heading. The indent parameter is currently unused (callers pass 0).
func (a *Article) genTextInternal(root *ParseNode, indent int) {
	lastwasspace := false
	for _, n := range root.Nodes {
		var linkStart int
		var fl FullWikiLink
		isLink := false
		// tappend is emitted after the node's children, e.g. the newline
		// that closes a heading or image caption.
		tappend := ""
		switch n.NType {
		case "break":
			a.appendText("\n")
		case "space":
			// Collapse runs of consecutive space nodes into one space.
			if !lastwasspace {
				a.appendText(" ")
			}
		case "text":
			a.appendText(n.Contents)
		case "image":
			// Isolate image captions on their own line.
			a.appendText("\n")
			tappend = "\n"
		case "link":
			// Remember where the link text begins (byte offset in the
			// buffer and rune offset) so the span can be recorded below.
			isLink = true
			linkStart = len(a.text.Bytes())
			fl = FullWikiLink{Link: n.Link, Start: a.nchar}
		case "html":
			switch n.NSubType {
			case "h1", "h2", "h3", "h4", "h5", "h6":
				a.appendText("\n")
				tappend = "\n"
				// Everything before the first heading is the abstract.
				if len(a.AbstractText) == 0 {
					a.AbstractText = a.text.String()
				}
			case "br":
				a.appendText("\n")
			case "ref":
				a.appendText(" ")
			}
		}
		// Recurse into child nodes (link text, heading contents, ...).
		if len(n.Nodes) > 0 {
			a.genTextInternal(n, 0)
		}
		if isLink {
			// The link's display text is whatever the recursion appended.
			ttmp := a.text.Bytes()
			fl.End = a.nchar
			fl.Text = string(ttmp[linkStart:])
			a.TextLinks = append(a.TextLinks, fl)
		}
		lastwasspace = false
		if n.NType == "space" {
			lastwasspace = true
		}
		// a.Text += tappend
		a.appendText(tappend)
	}

	return
}
// Token is one lexical unit produced by the tokenizer. Which fields are
// populated depends on TType (e.g. "text", "link", "filelink", "html",
// "redirect", "h1".."h6", "tb"/"te" template markers).
type Token struct {
	// TText is the token's literal text (tag name for "html" tokens,
	// template index for "tb"/"te" tokens).
	TText string `json:"tText,omitempty"`
	// TType identifies the kind of token.
	TType string `json:"tType,omitempty"`
	// TAttr holds auxiliary text, e.g. the attribute string of an HTML tag.
	TAttr string `json:"tAttr,omitempty"`
	// TLink is the target for "link"/"filelink"/"extlink" tokens.
	TLink WikiLink `json:"tLink,omitempty"`
	// TClosed marks a self-closing HTML tag.
	TClosed bool `json:"tClosed,omitempty"`
	// TPipes carries the intermediate pipe-separated options of a file link.
	TPipes []string `json:"tPipes,omitempty"`
}
// parseHeadingLine tokenizes a wikitext heading line ("== Title =="). It
// symmetrically trims matching runs of '=' from both ends to determine the
// heading level (capped at 6), emits an "hN" token, and then tokenizes the
// inner text between the trimmed delimiters.
func (a *Article) parseHeadingLine(l string) ([]*Token, error) {
	pf := 0
	pl := 0
	// pl starts as the byte index of the LAST '=' on the line.
	for i, rv := range l {
		if rv == '=' {
			pl = i
		}
	}
	// Consume '=' pairwise from the front (pf) and back (pl) while both
	// sides still have one; stops when the runs meet or a side runs out.
	for {
		pf++
		if pf == pl || l[pf] != '=' {
			pf--
			break
		}
		pl--
		if pf == pl || l[pl] != '=' {
			pl++
			pf--
			break
		}
	}
	pf++
	// More than 6 '=' on each side: excess is treated as literal text.
	if pf > 6 {
		diff := pf - 6
		pf -= diff
		pl += diff
	}
	nt := make([]*Token, 0, 2)
	nt = append(nt, &Token{TType: fmt.Sprintf("h%d", pf)})
	// Tokenize the heading's inner text l[pf:pl].
	nnt, err := a.parseInlineText(l, pf, pl)
	if err != nil {
		return nil, err
	}
	nt = append(nt, nnt...)
	return nt, nil
}
// matchPrefixes reports whether s starts with any of the given prefixes,
// compared case-insensitively (Unicode simple case folding).
func matchPrefixes(s string, prefixes []string) bool {
	for _, prefix := range prefixes {
		n := len(prefix)
		if n > len(s) {
			continue
		}
		if strings.EqualFold(s[:n], prefix) {
			return true
		}
	}
	return false
}
// parseInternalLink attempts to parse an internal wiki link "[[Target]]"
// or "[[Target|text]]" at the start of l, including a MediaWiki "link
// trail" (letters immediately after the closing "]]" that extend the
// display text). On success it returns the number of bytes consumed and
// the token sequence link / optional-text / closelink; ok is false when l
// does not start a well-formed internal link.
func (a *Article) parseInternalLink(l string) (int, []*Token, bool) {

	// possible internal link
	pipepos := 0     // byte index of the first '|', 0 if none
	closed := false  // true once "]]" has been confirmed
	matchingpos := 0 // byte index of the first ']' of the closing "]]"
	linktrail := 0   // byte index of the last trail letter after "]]"
	//plLoop:
	for idx, rv := range l {
		// Skip the opening "[[".
		if idx < 2 {
			continue
		}
		if matchingpos == 0 {
			switch rv {
			case '\x07': //prevent special tags in internal link
				if pipepos == 0 { //only in the link portion
					return 0, nil, false
				}
			case '[':
				// A nested "[[" (or a '[' right after the opener) is invalid.
				if idx == 2 || len(l) > idx+1 && l[idx+1] == '[' {
					return 0, nil, false
				}

			case ']':
				if len(l) > idx+1 && l[idx+1] == ']' {
					matchingpos = idx
				}
			case '|':
				if pipepos == 0 {
					pipepos = idx
				}
			default:
			}
			continue
		}
		// One extra iteration consumes the second ']' of the closer.
		if !closed {
			closed = true
			continue
		}
		// Letters directly after "]]" form the link trail.
		if unicode.IsLetter(rv) {
			linktrail = idx
			continue
		}
		break
	}
	if !closed {
		return 0, nil, false
	}
	var link WikiLink
	var nt []*Token = nil
	var err error = nil
	if pipepos == 0 {
		// No pipe: display text is the target itself (plus any trail).
		innerstring := l[2:matchingpos]
		if linktrail != 0 {
			innerstring += l[matchingpos+2 : linktrail+1]
		}
		link = WikiCanonicalForm(l[2:matchingpos])
		nt = []*Token{&Token{TText: innerstring, TType: "text"}}

	} else {
		// Piped link: target before the pipe, display text after it.
		innerstring := l[pipepos+1 : matchingpos]
		if linktrail != 0 {
			innerstring += l[matchingpos+2 : linktrail+1]
		}
		link = WikiCanonicalForm(l[2:pipepos])
		if pipepos+1 < matchingpos {
			nt, err = a.parseInlineText(innerstring, 0, len(innerstring))
			if err != nil {
				return 0, nil, false
			}
		}
	}
	tokens := make([]*Token, 0, 2)
	tokens = append(tokens, &Token{TLink: link, TType: "link"})
	if nt != nil {
		tokens = append(tokens, nt...)
	}
	tokens = append(tokens, &Token{TType: "closelink"})
	if linktrail != 0 {
		return linktrail + 1, tokens, true
	}
	return matchingpos + 2, tokens, true
}
+ } + tokens = append(tokens, &Token{TType: "closeextlink"}) + return endpos, tokens, true +} + +func (a *Article) parseFileLink(l string) (int, []*Token, bool) { + // possible internal link + pipepos := make([]int, 0, 0) + closed := false + matchingpos := 0 + intLinkOpen := false + skipNext := false +plLoop: + for idx, rv := range l { + if idx < 2 { + continue + } + if skipNext { + skipNext = false + continue + } + switch rv { + case '\x07': //prevent special tags in internal link + if len(pipepos) == 0 { //only in the link portion + return 0, nil, false + } + case '[': + if len(l) > idx+1 && l[idx+1] == '[' { + intLinkOpen = true + skipNext = true + continue + } + + case ']': + if len(l) > idx+1 && l[idx+1] == ']' { + if intLinkOpen { + intLinkOpen = false + skipNext = true + continue + } + matchingpos = idx + closed = true + break plLoop + } + case '|': + if !intLinkOpen { + pipepos = append(pipepos, idx) + } + default: + } + } + if !closed { + return 0, nil, false + } + var link WikiLink + var pipes = make([]string, 0, 0) + var nt []*Token = nil + var err error = nil + if len(pipepos) == 0 { + link = WikiCanonicalForm(l[2:matchingpos]) + nt = []*Token{&Token{TText: l[2:matchingpos], TType: "text"}} + + } else { + link = WikiCanonicalForm(l[2:pipepos[0]]) + for i := 0; i < len(pipepos)-1; i++ { + pipes = append(pipes, l[pipepos[i]+1:pipepos[i+1]]) + } + if pipepos[len(pipepos)-1]+1 < matchingpos { + nt, err = a.parseInlineText(l, pipepos[len(pipepos)-1]+1, matchingpos) + if err != nil { + return 0, nil, false + } + } + } + tokens := make([]*Token, 0, 2) + tokens = append(tokens, &Token{TLink: link, TType: "filelink", TPipes: pipes}) + if nt != nil { + tokens = append(tokens, nt...) 
+ } + tokens = append(tokens, &Token{TType: "closefilelink"}) + return matchingpos + 2, tokens, true +} + +func min(a, b int) int { + if a <= b { + return a + } + return b +} + +var behavswitchre = regexp.MustCompile(`^__[A-Z]+__`) + +func (a *Article) decodeBehavSwitch(l string) (int, bool) { + match := behavswitchre.FindString(l) + if len(match) == 0 { + return 0, false + } else { + return len(match), true + } + // e, ok := decodeMagic(l[pos:end]) +} + +func (a *Article) parseInlineText(l string, start, end int) ([]*Token, error) { + nt := make([]*Token, 0) + // fmt.Println("in parseInlineText") + + tStart, tEnd := start, start + + for pos := start; pos < end; { + rv, rune_len := utf8.DecodeRuneInString(l[pos:end]) + switch rv { + case '<': + e, tag, attr, closed, ok := a.decodeHTMLtag(l[pos:end]) + if ok { + pos += e + if isValidHTMLtag(tag) { + if tEnd > tStart { + nt = append(nt, &Token{TText: l[tStart:tEnd], TType: "text"}) + } + nt = append(nt, &Token{TType: "html", TText: tag, TAttr: attr, TClosed: closed}) + tStart = pos + } + tEnd = pos + continue + } + case '[': + e, lt, ok := a.parseLink(l[pos:end]) + if ok { + if tEnd > tStart { + nt = append(nt, &Token{TText: l[tStart:tEnd], TType: "text"}) + } + nt = append(nt, lt...) + pos += e + tStart, tEnd = pos, pos + continue + } + /* case '{': + e, tt, ok := a.parseTemplateEtc(l[pos:end]) + fmt.Println("template:", e, tt, ok) + if ok { + if len(cs) > 0 { + nt = append(nt, &Token{TText: cs, TType: "text"}) + } + nt = append(nt, tt...) 
+ pos += e + cs = "" + continue + } + cs += string(rv) */ + case '_': + e, ok := a.decodeBehavSwitch(l[pos:end]) + if ok { + if tEnd > tStart { + nt = append(nt, &Token{TText: l[tStart:tEnd], TType: "text"}) + } + nt = append(nt, &Token{TType: "magic", TAttr: l[pos : pos+e]}) + pos += e + tStart, tEnd = pos, pos + continue + } + case ' ', '\t', '\r': + if tEnd > tStart { + nt = append(nt, &Token{TText: l[tStart:tEnd], TType: "text"}) + } + nt = append(nt, &Token{TType: "space"}) + tStart = pos + rune_len + case '\'': + if tEnd > tStart { + nt = append(nt, &Token{TText: l[tStart:tEnd], TType: "text"}) + } + nt = append(nt, &Token{TType: "quote"}) + tStart = pos + rune_len + case ':': + if tEnd > tStart { + nt = append(nt, &Token{TText: l[tStart:tEnd], TType: "text"}) + } + nt = append(nt, &Token{TType: "colon"}) + tStart = pos + rune_len + case '\x07': + // case '@': + if tEnd > tStart { + nt = append(nt, &Token{TText: l[tStart:tEnd], TType: "text"}) + } + nt = append(nt, &Token{TType: "special", TText: l[pos : pos+8]}) + pos += 8 + tStart, tEnd = pos, pos + continue + } + pos += rune_len + tEnd = pos + } + if tEnd > tStart { + nt = append(nt, &Token{TText: l[tStart:tEnd], TType: "text"}) + } + return nt, nil +} + +func (a *Article) isHeading(l string) bool { + if l[0] != '=' { + return false + } + done := 0 + lastEqual := false + for _, rv := range l { + done++ + if done > 2 { + if unicode.IsSpace(rv) { + continue + } + if rv == '=' { + lastEqual = true + continue + } + lastEqual = false + } + + } + return lastEqual +} + +func (a *Article) isTable(l string) bool { + return (len(l) > 1 && (l[0:2] == "{|" || l[0:2] == "|}" || l[0:2] == "|+" || l[0:2] == "|-")) || (len(l) > 0 && (l[0:1] == "|" || l[0:1] == "!")) +} + +func (a *Article) lineType(l string) string { + switch { + case len(l) == 0: + return "blank" + case len(l) > 8 && strings.ToLower(l[0:9]) == "#redirect": + return "redirect" + case len(l) > 3 && l[0:4] == "----": + return "hr" + case a.isHeading(l): + 
return "heading" + case l[0] == ';' || l[0] == ':' || l[0] == '*' || l[0] == '#': + return "list" + case a.isTable(l): + return "table" + case l[0] == ' ': + return "wikipre" + } + return "normal" +} + +func (a *Article) Tokenize(mw string, g PageGetter) ([]*Token, error) { + mwnc := a.stripComments(mw) + mw_stripped, nowikipremathmap := a.stripNowikiPreMath(mwnc) + mw_tmpl, templatemap := a.processTemplates(mw_stripped, nowikipremathmap, g) + mw_links := a.preprocessLinks(mw_tmpl) + + lines := strings.Split(mw_links, "\n") + tokens := make([]*Token, 0, 16) + for _, l := range lines { + var nt []*Token + var err error = nil + lt := a.lineType(l) + switch lt { + case "normal": + nt, err = a.parseInlineText(l, 0, len(l)) + case "redirect": + nt, err = a.parseRedirectLine(l) + case "hr": + nt, err = a.parseHRuler(l) + case "heading": + nt, err = a.parseHeadingLine(l) + case "list": + nt, err = a.parseListLine(l) + case "table": + nt, err = a.parseTableLine(l) + case "wikipre": + nt, err = a.parseWikiPreLine(l) + case "blank": + nt = []*Token{&Token{TType: "blank"}} + } + if err != nil { + return nil, err + } + nt = append(nt, &Token{TType: "newline"}) + tokens = append(tokens, nt...) 
+ }
+ specialcount := 0
+ for i := range tokens {
+ if tokens[i].TType == "special" {
+ specialcount++
+ t, ok := templatemap[tokens[i].TText] //nowikipremathmap[tokens[i].TText]
+ if !ok {
+ return nil, errors.New("special not in map")
+ }
+ tokens[i] = t
+ }
+ }
+ // fmt.Println(specialcount, len(nowikipremathmap))
+ // if specialcount != len(nowikipremathmap) {
+ if specialcount != len(templatemap) {
+ if DebugLevel > 0 {
+ fmt.Println("[Tokenize] Warning: number of specials in map differs from number found")
+ }
+ // return nil, errors.New("number of specials in map differs from number found")
+ }
+ return tokens, nil
+}
+
+var commentsRe = regexp.MustCompile(`(?isU)<!--.*(-->|\z)`)
+
+func (a *Article) stripComments(mw string) string {
+ return commentsRe.ReplaceAllLiteralString(mw, "")
+}
+
+var nowikiOpenRe = regexp.MustCompile(`(?i)<\s*(nowiki)\s*[^>/]*>`)
+var nowikiCloseRe = regexp.MustCompile(`(?i)<(/nowiki)\s*[^>/]*>`)
+var preOpenRe = regexp.MustCompile(`(?i)<\s*(pre)\s*[^>]*>`)
+var preCloseRe = regexp.MustCompile(`(?i)<(/pre)\s*[^>]*>`)
+var mathOpenRe = regexp.MustCompile(`(?i)<\s*(math)\s*[^>]*>`)
+var mathCloseRe = regexp.MustCompile(`(?i)<(/math)\s*[^>]*>`)
+
+type ssInt [][]int
+
+func (a ssInt) Len() int { return len(a) }
+func (a ssInt) Swap(i, j int) { a[i], a[j] = a[j], a[i] }
+func (a ssInt) Less(i, j int) bool { return a[i][0] < a[j][0] }
+
+func (a *Article) stripNowikiPreMath(mw string) (string, map[string]*Token) {
+ nwoc := nowikiOpenRe.FindAllStringSubmatchIndex(mw, -1)
+ nwcc := nowikiCloseRe.FindAllStringSubmatchIndex(mw, -1)
+ poc := preOpenRe.FindAllStringSubmatchIndex(mw, -1)
+ pcc := preCloseRe.FindAllStringSubmatchIndex(mw, -1)
+ moc := mathOpenRe.FindAllStringSubmatchIndex(mw, -1)
+ mcc := mathCloseRe.FindAllStringSubmatchIndex(mw, -1)
+
+ /*
+ nwoc = append(nwoc, []int{len(mw) + 1, len(mw) + 1})
+ nwcc = append(nwcc, []int{len(mw) + 1, len(mw) + 1})
+ poc = append(poc, []int{len(mw) + 1, len(mw) + 1})
+ pcc = append(pcc, []int{len(mw) 
+ 1, len(mw) + 1}) + moc = append(moc, []int{len(mw) + 1, len(mw) + 1}) + mcc = append(mcc, []int{len(mw) + 1, len(mw) + 1}) + */ + for i := range nwoc { + nwoc[i] = append(nwoc[i], 0) + } + for i := range nwcc { + nwcc[i] = append(nwcc[i], 1) + } + for i := range poc { + poc[i] = append(poc[i], 2) + } + for i := range pcc { + pcc[i] = append(pcc[i], 3) + } + for i := range moc { + moc[i] = append(moc[i], 4) + } + for i := range mcc { + mcc[i] = append(mcc[i], 5) + } + am := make([][]int, 0, len(nwoc)+len(nwcc)+len(poc)+len(pcc)+len(moc)+len(mcc)) + am = append(am, nwoc...) + am = append(am, nwcc...) + am = append(am, poc...) + am = append(am, pcc...) + am = append(am, moc...) + am = append(am, mcc...) + sort.Sort(ssInt(am)) + // fmt.Println(am) + tokens := make(map[string]*Token, len(am)) + if len(am) == 0 { + return mw, tokens + } + + ctype := -1 + out := "" + lastclose := 0 + openidx := 0 + count := 0 + for i := range am { + // fmt.Println("ctype", ctype, "lastclose", lastclose, "count", count, "openidx", openidx, "am[i]", am[i]) + if (ctype != -1) && (am[i][4] == ctype+1) && (am[openidx][1] <= am[i][0]) { + // closing an open one + special := fmt.Sprintf("\x07%07d", count) + // special := fmt.Sprintf("@%07d", count) + tokens[special] = &Token{ + TText: mw[am[openidx][1]:am[i][0]], + TType: strings.ToLower(mw[am[openidx][2]:am[openidx][3]]), + TAttr: mw[am[openidx][3] : am[openidx][1]-1], + } + out += special + ctype = -1 + lastclose = am[i][1] + count++ + } else if (ctype == -1) && (am[i][4]&1 == 0) && (lastclose <= am[i][0]) { + // open a new one + out += mw[lastclose:am[i][0]] + ctype = am[i][4] + openidx = i + } + } + if ctype != -1 { + //it's open: close it + special := fmt.Sprintf("\x07%07d", count) + // special := fmt.Sprintf("@%07d", count) + tokens[special] = &Token{ + TText: mw[am[openidx][1]:len(mw)], + TType: strings.ToLower(mw[am[openidx][2]:am[openidx][3]]), + TAttr: mw[am[openidx][3] : am[openidx][1]-1], + } + out += special + ctype = -1 + count++ 
+ } else {
+ out += mw[lastclose:]
+ }
+ return out, tokens
+}
+
+var multiLineLinksRe = regexp.MustCompile(`(?sm)\[\[[^\n|]*\|.*?\]\]`)
+
+/* TODO: add preprocessing as in Parser.php:pstPass2() to enable pipe tricks
+ */
+func (a *Article) preprocessLinks(s string) string {
+ mw := []byte(s)
+ mll := multiLineLinksRe.FindAllSubmatchIndex(mw, -1)
+ for _, pair := range mll {
+ for i := pair[0]; i < pair[1]; {
+ // we have to walk this string carefully, by rune, not by i
+ rv, rlen := utf8.DecodeRune(mw[i:])
+ if rv == '\n' {
+ mw[i] = ' '
+ }
+ i += rlen
+ }
+ }
+ return string(mw)
+}
+
+//var nowikiOpenRe = regexp.MustCompile(`(?i)<\s*nowiki\s*[^>/]*>`)
+//var nowikiCloseRe = regexp.MustCompile(`(?i)<\s*/nowiki\s*[^>/]*>`)
+//var nowikiOpenCloseRe = regexp.MustCompile(`(?i)<\s*nowiki\s*[^>]*/>`)
+/*
+type WikiParser struct {
+ mw string
+}
+
+func NewWikiParser(mw string) *WikiParser {
+ return &WikiParser{mw: mw}
+}
+
+func (wp *WikiParser) doNowiki() {
+ openCandidates := nowikiOpenRe.FindAllStringIndex(wp.mw, -1)
+ closeCandidates := nowikiCloseRe.FindAllStringIndex(wp.mw, -1)
+ openCloseCandidates := nowikiOpenCloseRe.FindAllStringIndex(wp.mw, -1)
+ tail := []int{len(wp.mw) + 1, len(wp.mw) + 1}
+ openCandidates = append(openCandidates, tail)
+ closeCandidates = append(closeCandidates, tail)
+ openCloseCandidates = append(openCloseCandidates, tail)
+ oi := 0
+ ci := 0
+ oci := 0
+ inNowiki := false
+ ol = make([][]int, 0, len(openCandidates))
+ cl = make([][]int, 0, len(closeCandidates))
+ ocl = make([][]int, 0, len(openCloseCandidates))
+ for {
+ if oi == len(openCandidates)-1 &&
+ ci == len(closeCandidates)-1 &&
+ oci == len(openCloseCandidates)-1 {
+ break
+ }
+ switch {
+ case openCandidates[oi][0] <= closeCandidates[oi][0] &&
+ openCandidates[oi][0] <= openCloseloseCandidates[oi][0]:
+ if !inNowiki {
+ ol = append(ol.openCandidates[oi])
+ inNowiki = true
+ }
+ oi += 1
+
+ case closeCandidates[oi][0] <= openCandidates[oi][0] &&
+ closeCandidates[oi][0] <= openCloseloseCandidates[oi][0]:
+ 
+ default: + } + } +} + +func (wp *WikiParser) Parse() { + doSGML() + doNowiki() + doMath() + doPre() + doBlanks() + doHTMLvalidation() + doReplaceVariables() + doHR() + doAllQuotes() + doHeadings() + doLists() + doDates() + doExternalLinks() + doInternalLinks() + doISBN() + doRecombine() +} +*/ diff --git a/utils.go b/utils.go new file mode 100644 index 0000000..2d677ef --- /dev/null +++ b/utils.go @@ -0,0 +1,59 @@ +/* +Copyright (C) IBM Corporation 2015, Michele Franceschini + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +*/ + +package gowiki + +import ( + // "fmt" + "strings" +) + +func (a *Article) CheckRedirect() (bool, *WikiLink) { + + rf := false + for i, t := range a.Tokens { + if i > 10 { + break + } + switch t.TType { + case "redirect": + rf = true + case "link": + if rf { + return true, &t.TLink + } + } + } + return false, nil +} + +func (a *Article) CheckDisambiguation() bool { + for _, t := range a.Templates { + if t.Typ != "normal" { + continue + } + ln := strings.ToLower(t.Name) + if strings.Contains(ln, "disambig") || + ln == "dab" || + ln == "geodis" || + ln == "hndis" || + ln == "hndis-cleanup" || + ln == "numberdis" { + return true + } + } + return false +}