package wikitext_parser import ( "golang.org/x/text/unicode/norm" "unicode" ) type HTML struct { Tag *HTMLTag } type HTMLTag struct { Parent *HTMLTag Name []byte Parameters []byte Content []*HTMLTag } func (t *HTMLTag) String() (r string) { if string(t.Name) == "#text" { r = string(t.Parameters) } else { for _, c := range t.Content { r += c.String() } } if string(t.Name) == "del" { //add strikethrough var runeList []rune for _, runeEntry := range []rune(norm.NFD.String(r)) { if runeEntry <= unicode.MaxASCII { runeList = append(runeList, '\u0336') //combining long stroke overlay } runeList = append(runeList, runeEntry) } r = norm.NFC.String(string(runeList)) } else if string(t.Name) == "ref" { //remove references return "" } else if string(t.Name) == "br" { //new line return "\n" } else if string(t.Name) == "script" { return "" } return } func ParseHTML(text string, index int, depth int) (i int, html *HTML) { var c byte html = &HTML{} readingTag := false readingParameters := false isTerminating := false var tag *HTMLTag tagDepth := 0 for i = index; i < len(text); i++ { c = text[i] if c == '<' && i < len(text)-1 && text[i+1] == '/' { isTerminating = true readingTag = true readingParameters = false } else if c == '<' { newTag := &HTMLTag{ Parent: tag, } if tag != nil { tag.Content = append(tag.Content, newTag) } tag = newTag readingTag = true readingParameters = false isTerminating = false if tagDepth == 0 && html.Tag == nil { html.Tag = tag } tagDepth++ } else if readingTag && c == '>' { readingTag = false readingParameters = false if isTerminating { tagDepth-- if tag != nil { tag = tag.Parent } isTerminating = false } if tagDepth == 0 { return i + 1, html } } else if readingTag && c == '/' { isTerminating = true } else if !isTerminating && readingTag { if c == ' ' { readingParameters = true } if readingParameters { tag.Parameters = append(tag.Parameters, c) } else { tag.Name = append(tag.Name, c) } } else if !isTerminating && tagDepth > 0 { if len(tag.Content) == 0 || string(tag.Content[len(tag.Content)-1].Name) != "#text" { tag.Content = append(tag.Content, &HTMLTag{ Parent: tag, Name: []byte("#text"), }) } tag.Content[len(tag.Content)-1].Parameters = append(tag.Content[len(tag.Content)-1].Parameters, c) } } return }