diff options
Diffstat (limited to 'src/cmd/vendor/rsc.io/markdown/link.go')
-rw-r--r-- | src/cmd/vendor/rsc.io/markdown/link.go | 861 |
1 files changed, 861 insertions, 0 deletions
diff --git a/src/cmd/vendor/rsc.io/markdown/link.go b/src/cmd/vendor/rsc.io/markdown/link.go new file mode 100644 index 0000000000..94fa798f0a --- /dev/null +++ b/src/cmd/vendor/rsc.io/markdown/link.go @@ -0,0 +1,861 @@ +// Copyright 2021 The Go Authors. All rights reserved. +// Use of this source code is governed by a BSD-style +// license that can be found in the LICENSE file. + +package markdown + +import ( + "bytes" + "fmt" + "strings" + "unicode/utf8" + + "golang.org/x/text/cases" +) + +func parseLinkRefDef(p buildState, s string) (int, bool) { + // “A link reference definition consists of a link label, + // optionally preceded by up to three spaces of indentation, + // followed by a colon (:), + // optional spaces or tabs (including up to one line ending), + // a link destination, + // optional spaces or tabs (including up to one line ending), + // and an optional link title, + // which if it is present must be separated from the link destination + // by spaces or tabs. No further character may occur.” + i := skipSpace(s, 0) + label, i, ok := parseLinkLabel(p.(*parseState), s, i) + if !ok || i >= len(s) || s[i] != ':' { + return 0, false + } + i = skipSpace(s, i+1) + suf := s[i:] + dest, i, ok := parseLinkDest(s, i) + if !ok { + if suf != "" && suf[0] == '<' { + // Goldmark treats <<> as a link definition. + p.(*parseState).corner = true + } + return 0, false + } + moved := false + for i < len(s) && (s[i] == ' ' || s[i] == '\t') { + moved = true + i++ + } + + // Take title if present and doesn't break parse. + j := i + if j >= len(s) || s[j] == '\n' { + moved = true + if j < len(s) { + j++ + } + } + + var title string + var titleChar byte + var corner bool + if moved { + for j < len(s) && (s[j] == ' ' || s[j] == '\t') { + j++ + } + if t, c, j, ok := parseLinkTitle(s, j); ok { + for j < len(s) && (s[j] == ' ' || s[j] == '\t') { + j++ + } + if j >= len(s) || s[j] == '\n' { + i = j + if t == "" { + // Goldmark adds title="" in this case. + // We do not, nor does the Dingus. + corner = true + } + title = t + titleChar = c + } + } + } + + // Must end line. Already trimmed spaces. + if i < len(s) && s[i] != '\n' { + return 0, false + } + if i < len(s) { + i++ + } + + label = normalizeLabel(label) + if p.link(label) == nil { + p.defineLink(label, &Link{URL: dest, Title: title, TitleChar: titleChar, corner: corner}) + } + return i, true +} + +func parseLinkTitle(s string, i int) (title string, char byte, next int, found bool) { + if i < len(s) && (s[i] == '"' || s[i] == '\'' || s[i] == '(') { + want := s[i] + if want == '(' { + want = ')' + } + j := i + 1 + for ; j < len(s); j++ { + if s[j] == want { + title := s[i+1 : j] + // TODO: Validate title? + return mdUnescaper.Replace(title), want, j + 1, true + } + if s[j] == '(' && want == ')' { + break + } + if s[j] == '\\' && j+1 < len(s) { + j++ + } + } + } + return "", 0, 0, false +} + +func parseLinkLabel(p *parseState, s string, i int) (string, int, bool) { + // “A link label begins with a left bracket ([) and ends with + // the first right bracket (]) that is not backslash-escaped. + // Between these brackets there must be at least one character + // that is not a space, tab, or line ending. + // Unescaped square bracket characters are not allowed + // inside the opening and closing square brackets of link labels. + // A link label can have at most 999 characters inside the square brackets.” + if i >= len(s) || s[i] != '[' { + return "", 0, false + } + j := i + 1 + for ; j < len(s); j++ { + if s[j] == ']' { + if j-(i+1) > 999 { + // Goldmark does not apply 999 limit. + p.corner = true + break + } + if label := trimSpaceTabNewline(s[i+1 : j]); label != "" { + // Note: CommonMark Dingus does not escape. + return label, j + 1, true + } + break + } + if s[j] == '[' { + break + } + if s[j] == '\\' && j+1 < len(s) { + j++ + } + } + return "", 0, false +} + +func normalizeLabel(s string) string { + if strings.Contains(s, "[") || strings.Contains(s, "]") { + // Labels cannot have [ ] so avoid the work of translating. + // This is especially important for pathlogical cases like + // [[[[[[[[[[a]]]]]]]]]] which would otherwise generate quadratic + // amounts of garbage. + return "" + } + + // “To normalize a label, strip off the opening and closing brackets, + // perform the Unicode case fold, strip leading and trailing spaces, tabs, and line endings, + // and collapse consecutive internal spaces, tabs, and line endings to a single space.” + s = trimSpaceTabNewline(s) + var b strings.Builder + space := false + hi := false + for i := 0; i < len(s); i++ { + c := s[i] + switch c { + case ' ', '\t', '\n': + space = true + continue + default: + if space { + b.WriteByte(' ') + space = false + } + if 'A' <= c && c <= 'Z' { + c += 'a' - 'A' + } + if c >= 0x80 { + hi = true + } + b.WriteByte(c) + } + } + s = b.String() + if hi { + s = cases.Fold().String(s) + } + return s +} + +func parseLinkDest(s string, i int) (string, int, bool) { + if i >= len(s) { + return "", 0, false + } + + // “A sequence of zero or more characters between an opening < and a closing > + // that contains no line endings or unescaped < or > characters,” + if s[i] == '<' { + for j := i + 1; ; j++ { + if j >= len(s) || s[j] == '\n' || s[j] == '<' { + return "", 0, false + } + if s[j] == '>' { + // TODO unescape? + return mdUnescape(s[i+1 : j]), j + 1, true + } + if s[j] == '\\' { + j++ + } + } + } + + // “or a nonempty sequence of characters that does not start with <, + // does not include ASCII control characters or space character, + // and includes parentheses only if (a) they are backslash-escaped + // or (b) they are part of a balanced pair of unescaped parentheses. + depth := 0 + j := i +Loop: + for ; j < len(s); j++ { + switch s[j] { + case '(': + depth++ + if depth > 32 { + // Avoid quadratic inputs by stopping if too deep. + // This is the same depth that cmark-gfm uses. + return "", 0, false + } + case ')': + if depth == 0 { + break Loop + } + depth-- + case '\\': + if j+1 < len(s) { + if s[j+1] == ' ' || s[j+1] == '\t' { + return "", 0, false + } + j++ + } + case ' ', '\t', '\n': + break Loop + } + } + + dest := s[i:j] + // TODO: Validate dest? + // TODO: Unescape? + // NOTE: CommonMark Dingus does not reject control characters. + return mdUnescape(dest), j, true +} + +func parseAutoLinkURI(s string, i int) (Inline, int, bool) { + // CommonMark 0.30: + // + // For purposes of this spec, a scheme is any sequence of 2–32 characters + // beginning with an ASCII letter and followed by any combination of + // ASCII letters, digits, or the symbols plus (”+”), period (”.”), or + // hyphen (”-”). + // + // An absolute URI, for these purposes, consists of a scheme followed by + // a colon (:) followed by zero or more characters other ASCII control + // characters, space, <, and >. If the URI includes these characters, + // they must be percent-encoded (e.g. %20 for a space). + + j := i + if j+1 >= len(s) || s[j] != '<' || !isLetter(s[j+1]) { + return nil, 0, false + } + j++ + for j < len(s) && isScheme(s[j]) && j-(i+1) <= 32 { + j++ + } + if j-(i+1) < 2 || j-(i+1) > 32 || j >= len(s) || s[j] != ':' { + return nil, 0, false + } + j++ + for j < len(s) && isURL(s[j]) { + j++ + } + if j >= len(s) || s[j] != '>' { + return nil, 0, false + } + link := s[i+1 : j] + // link = mdUnescaper.Replace(link) + return &AutoLink{link, link}, j + 1, true +} + +func parseAutoLinkEmail(s string, i int) (Inline, int, bool) { + // CommonMark 0.30: + // + // An email address, for these purposes, is anything that matches + // the non-normative regex from the HTML5 spec: + // + // /^[a-zA-Z0-9.!#$%&'*+/=?^_`{|}~-]+@[a-zA-Z0-9](?:[a-zA-Z0-9-]{0,61}[a-zA-Z0-9])?(?:\.[a-zA-Z0-9](?:[a-zA-Z0-9-]{0,61}[a-zA-Z0-9])?)*$/ + + j := i + if j+1 >= len(s) || s[j] != '<' || !isUser(s[j+1]) { + return nil, 0, false + } + j++ + for j < len(s) && isUser(s[j]) { + j++ + } + if j >= len(s) || s[j] != '@' { + return nil, 0, false + } + for { + j++ + n, ok := skipDomainElem(s[j:]) + if !ok { + return nil, 0, false + } + j += n + if j >= len(s) || s[j] != '.' && s[j] != '>' { + return nil, 0, false + } + if s[j] == '>' { + break + } + } + email := s[i+1 : j] + return &AutoLink{email, "mailto:" + email}, j + 1, true +} + +func isUser(c byte) bool { + if isLetterDigit(c) { + return true + } + s := ".!#$%&'*+/=?^_`{|}~-" + for i := 0; i < len(s); i++ { + if c == s[i] { + return true + } + } + return false +} + +func isHexDigit(c byte) bool { + return 'A' <= c && c <= 'F' || 'a' <= c && c <= 'f' || '0' <= c && c <= '9' +} + +func isDigit(c byte) bool { + return '0' <= c && c <= '9' +} + +func skipDomainElem(s string) (int, bool) { + // String of LDH, up to 63 in length, with LetterDigit + // at both ends (1-letter/digit names are OK). + // Aka /[a-zA-Z0-9](?:[a-zA-Z0-9-]{0,61}[a-zA-Z0-9])?/. + if len(s) < 1 || !isLetterDigit(s[0]) { + return 0, false + } + i := 1 + for i < len(s) && isLDH(s[i]) && i <= 63 { + i++ + } + if i > 63 || !isLetterDigit(s[i-1]) { + return 0, false + } + return i, true +} + +func isScheme(c byte) bool { + return isLetterDigit(c) || c == '+' || c == '.' || c == '-' +} + +func isURL(c byte) bool { + return c > ' ' && c != '<' && c != '>' +} + +type AutoLink struct { + Text string + URL string +} + +func (*AutoLink) Inline() {} + +func (x *AutoLink) PrintHTML(buf *bytes.Buffer) { + fmt.Fprintf(buf, "<a href=\"%s\">%s</a>", htmlLinkEscaper.Replace(x.URL), htmlEscaper.Replace(x.Text)) +} + +func (x *AutoLink) printMarkdown(buf *bytes.Buffer) { + fmt.Fprintf(buf, "<%s>", x.Text) +} + +func (x *AutoLink) PrintText(buf *bytes.Buffer) { + fmt.Fprintf(buf, "%s", htmlEscaper.Replace(x.Text)) +} + +type Link struct { + Inner []Inline + URL string + Title string + TitleChar byte // ', " or ) + corner bool +} + +func (*Link) Inline() {} + +func (x *Link) PrintHTML(buf *bytes.Buffer) { + fmt.Fprintf(buf, "<a href=\"%s\"", htmlLinkEscaper.Replace(x.URL)) + if x.Title != "" { + fmt.Fprintf(buf, " title=\"%s\"", htmlQuoteEscaper.Replace(x.Title)) + } + buf.WriteString(">") + for _, c := range x.Inner { + c.PrintHTML(buf) + } + buf.WriteString("</a>") +} + +func (x *Link) printMarkdown(buf *bytes.Buffer) { + buf.WriteByte('[') + x.printRemainingMarkdown(buf) +} + +func (x *Link) printRemainingMarkdown(buf *bytes.Buffer) { + for _, c := range x.Inner { + c.printMarkdown(buf) + } + buf.WriteString("](") + buf.WriteString(x.URL) + printLinkTitleMarkdown(buf, x.Title, x.TitleChar) + buf.WriteByte(')') +} + +func printLinkTitleMarkdown(buf *bytes.Buffer, title string, titleChar byte) { + if title == "" { + return + } + closeChar := titleChar + openChar := closeChar + if openChar == ')' { + openChar = '(' + } + fmt.Fprintf(buf, " %c%s%c", openChar, title /*TODO(jba): escape*/, closeChar) +} + +func (x *Link) PrintText(buf *bytes.Buffer) { + for _, c := range x.Inner { + c.PrintText(buf) + } +} + +type Image struct { + Inner []Inline + URL string + Title string + TitleChar byte + corner bool +} + +func (*Image) Inline() {} + +func (x *Image) PrintHTML(buf *bytes.Buffer) { + fmt.Fprintf(buf, "<img src=\"%s\"", htmlLinkEscaper.Replace(x.URL)) + fmt.Fprintf(buf, " alt=\"") + i := buf.Len() + for _, c := range x.Inner { + c.PrintText(buf) + } + // GitHub and Goldmark both rewrite \n to space + // but the Dingus does not. + // The spec says title can be split across lines but not + // what happens at that point. + out := buf.Bytes() + for ; i < len(out); i++ { + if out[i] == '\n' { + out[i] = ' ' + } + } + fmt.Fprintf(buf, "\"") + if x.Title != "" { + fmt.Fprintf(buf, " title=\"%s\"", htmlQuoteEscaper.Replace(x.Title)) + } + buf.WriteString(" />") +} + +func (x *Image) printMarkdown(buf *bytes.Buffer) { + buf.WriteString("![") + (*Link)(x).printRemainingMarkdown(buf) +} + +func (x *Image) PrintText(buf *bytes.Buffer) { + for _, c := range x.Inner { + c.PrintText(buf) + } +} + +// GitHub Flavored Markdown autolinks extension +// https://github.github.com/gfm/#autolinks-extension- + +// autoLinkMore rewrites any extended autolinks in the body +// and returns the result. +// +// body is a list of Plain, Emph, Strong, and Del nodes. +// Two Plains only appear consecutively when one is a +// potential emphasis marker that ended up being plain after all, like "_" or "**". +// There are no Link nodes. +// +// The GitHub “spec” declares that “autolinks can only come at the +// beginning of a line, after whitespace, or any of the delimiting +// characters *, _, ~, and (”. However, the GitHub web site does not +// enforce this rule: text like "$abc@def.ghi is my email" links the +// text following the $ as an email address. It appears the actual rule +// is that autolinks cannot come after ASCII letters, although they can +// come after numbers or Unicode letters. +// Since the only point of implementing GitHub Flavored Markdown +// is to match GitHub's behavior, we do what they do, not what they say, +// at least for now. +func (p *parseState) autoLinkText(list []Inline) []Inline { + if !p.AutoLinkText { + return list + } + + var out []Inline // allocated lazily when we first change list + for i, x := range list { + switch x := x.(type) { + case *Plain: + if rewrite := p.autoLinkPlain(x.Text); rewrite != nil { + if out == nil { + out = append(out, list[:i]...) + } + out = append(out, rewrite...) + continue + } + case *Strong: + x.Inner = p.autoLinkText(x.Inner) + case *Del: + x.Inner = p.autoLinkText(x.Inner) + case *Emph: + x.Inner = p.autoLinkText(x.Inner) + } + if out != nil { + out = append(out, x) + } + } + if out == nil { + return list + } + return out +} + +func (p *parseState) autoLinkPlain(s string) []Inline { + vd := &validDomainChecker{s: s} + var out []Inline +Restart: + for i := 0; i < len(s); i++ { + c := s[i] + if c == '@' { + if before, link, after, ok := p.parseAutoEmail(s, i); ok { + if before != "" { + out = append(out, &Plain{Text: before}) + } + out = append(out, link) + vd.skip(len(s) - len(after)) + s = after + goto Restart + } + } + + if (c == 'h' || c == 'm' || c == 'x' || c == 'w') && (i == 0 || !isLetter(s[i-1])) { + if link, after, ok := p.parseAutoProto(s, i, vd); ok { + if i > 0 { + out = append(out, &Plain{Text: s[:i]}) + } + out = append(out, link) + vd.skip(len(s) - len(after)) + s = after + goto Restart + } + } + } + if out == nil { + return nil + } + out = append(out, &Plain{Text: s}) + return out +} + +func (p *parseState) parseAutoProto(s string, i int, vd *validDomainChecker) (link *Link, after string, found bool) { + if s == "" { + return + } + switch s[i] { + case 'h': + var n int + if strings.HasPrefix(s[i:], "https://") { + n = len("https://") + } else if strings.HasPrefix(s[i:], "http://") { + n = len("http://") + } else { + return + } + return p.parseAutoHTTP(s[i:i+n], s, i, i+n, i+n+1, vd) + case 'w': + if !strings.HasPrefix(s[i:], "www.") { + return + } + // GitHub Flavored Markdown says to use http://, + // but it's not 1985 anymore. We live in the https:// future + // (unless the parser is explicitly configured otherwise). + // People who really care in their docs can write http:// themselves. + scheme := "https://" + if p.AutoLinkAssumeHTTP { + scheme = "http://" + } + return p.parseAutoHTTP(scheme, s, i, i, i+3, vd) + case 'm': + if !strings.HasPrefix(s[i:], "mailto:") { + return + } + return p.parseAutoMailto(s, i) + case 'x': + if !strings.HasPrefix(s[i:], "xmpp:") { + return + } + return p.parseAutoXmpp(s, i) + } + return +} + +// parseAutoWWW parses an extended www autolink. +// https://github.github.com/gfm/#extended-www-autolink +func (p *parseState) parseAutoHTTP(scheme, s string, textstart, start, min int, vd *validDomainChecker) (link *Link, after string, found bool) { + n, ok := vd.parseValidDomain(start) + if !ok { + return + } + i := start + n + domEnd := i + + // “After a valid domain, zero or more non-space non-< characters may follow.” + paren := 0 + for i < len(s) { + r, n := utf8.DecodeRuneInString(s[i:]) + if isUnicodeSpace(r) || r == '<' { + break + } + if r == '(' { + paren++ + } + if r == ')' { + paren-- + } + i += n + } + + // https://github.github.com/gfm/#extended-autolink-path-validation +Trim: + for i > min { + switch s[i-1] { + case '?', '!', '.', ',', ':', '@', '_', '~': + // Trim certain trailing punctuation. + i-- + continue Trim + + case ')': + // Trim trailing unmatched (by count only) parens. + if paren < 0 { + for s[i-1] == ')' && paren < 0 { + paren++ + i-- + } + continue Trim + } + + case ';': + // Trim entity reference. + // After doing the work of the scan, we either cut that part off the string + // or we stop the trimming entirely, so there's no chance of repeating + // the scan on a future iteration and going accidentally quadratic. + // Even though the Markdown spec already requires having a complete + // list of all the HTML entities, the GitHub definition here just requires + // "looks like" an entity, meaning its an ampersand, letters/digits, and semicolon. + for j := i - 2; j > start; j-- { + if j < i-2 && s[j] == '&' { + i = j + continue Trim + } + if !isLetterDigit(s[j]) { + break Trim + } + } + } + break Trim + } + + // According to the literal text of the GitHub Flavored Markdown spec + // and the actual behavior on GitHub, + // www.example.com$foo turns into <a href="https://www.example.com$foo">, + // but that makes the character restrictions in the valid-domain check + // almost meaningless. So we insist that when all is said and done, + // if the domain is followed by anything, that thing must be a slash, + // even though GitHub is not that picky. + // People might complain about www.example.com:1234 not working, + // but if you want to get fancy with that kind of thing, just write http:// in front. + if textstart == start && i > domEnd && s[domEnd] != '/' { + i = domEnd + } + + if i < min { + return + } + + link = &Link{ + Inner: []Inline{&Plain{Text: s[textstart:i]}}, + URL: scheme + s[start:i], + } + return link, s[i:], true +} + +type validDomainChecker struct { + s string + cut int // before this index, no valid domains +} + +func (v *validDomainChecker) skip(i int) { + v.s = v.s[i:] + v.cut -= i +} + +// parseValidDomain parses a valid domain. +// https://github.github.com/gfm/#valid-domain +// +// If s starts with a valid domain, parseValidDomain returns +// the length of that domain and true. If s does not start with +// a valid domain, parseValidDomain returns n, false, +// where n is the length of a prefix guaranteed not to be acceptable +// to any future call to parseValidDomain. +// +// “A valid domain consists of segments of alphanumeric characters, +// underscores (_) and hyphens (-) separated by periods (.). +// There must be at least one period, and no underscores may be +// present in the last two segments of the domain.” +// +// The spec does not spell out whether segments can be empty. +// Empirically, in GitHub's implementation they can. +func (v *validDomainChecker) parseValidDomain(start int) (n int, found bool) { + if start < v.cut { + return 0, false + } + i := start + dots := 0 + for ; i < len(v.s); i++ { + c := v.s[i] + if c == '_' { + dots = -2 + continue + } + if c == '.' { + dots++ + continue + } + if !isLDH(c) { + break + } + } + if dots >= 0 && i > start { + return i - start, true + } + v.cut = i + return 0, false +} + +func (p *parseState) parseAutoEmail(s string, i int) (before string, link *Link, after string, ok bool) { + if s[i] != '@' { + return + } + + // “One ore more characters which are alphanumeric, or ., -, _, or +.” + j := i + for j > 0 && (isLDH(s[j-1]) || s[j-1] == '_' || s[j-1] == '+' || s[j-1] == '.') { + j-- + } + if i-j < 1 { + return + } + + // “One or more characters which are alphanumeric, or - or _, separated by periods (.). + // There must be at least one period. The last character must not be one of - or _.” + dots := 0 + k := i + 1 + for k < len(s) && (isLDH(s[k]) || s[k] == '_' || s[k] == '.') { + if s[k] == '.' { + if s[k-1] == '.' { + // Empirically, .. stops the scan but foo@.bar is fine. + break + } + dots++ + } + k++ + } + + // “., -, and _ can occur on both sides of the @, but only . may occur at the end + // of the email address, in which case it will not be considered part of the address” + if s[k-1] == '.' { + dots-- + k-- + } + if s[k-1] == '-' || s[k-1] == '_' { + return + } + if k-(i+1)-dots < 2 || dots < 1 { + return + } + + link = &Link{ + Inner: []Inline{&Plain{Text: s[j:k]}}, + URL: "mailto:" + s[j:k], + } + return s[:j], link, s[k:], true +} + +func (p *parseState) parseAutoMailto(s string, i int) (link *Link, after string, ok bool) { + j := i + len("mailto:") + for j < len(s) && (isLDH(s[j]) || s[j] == '_' || s[j] == '+' || s[j] == '.') { + j++ + } + if j >= len(s) || s[j] != '@' { + return + } + before, link, after, ok := p.parseAutoEmail(s[i:], j-i) + if before != "mailto:" || !ok { + return nil, "", false + } + link.Inner[0] = &Plain{Text: s[i : len(s)-len(after)]} + return link, after, true +} + +func (p *parseState) parseAutoXmpp(s string, i int) (link *Link, after string, ok bool) { + j := i + len("xmpp:") + for j < len(s) && (isLDH(s[j]) || s[j] == '_' || s[j] == '+' || s[j] == '.') { + j++ + } + if j >= len(s) || s[j] != '@' { + return + } + before, link, after, ok := p.parseAutoEmail(s[i:], j-i) + if before != "xmpp:" || !ok { + return nil, "", false + } + if after != "" && after[0] == '/' { + k := 1 + for k < len(after) && (isLetterDigit(after[k]) || after[k] == '@' || after[k] == '.') { + k++ + } + after = after[k:] + } + url := s[i : len(s)-len(after)] + link.Inner[0] = &Plain{Text: url} + link.URL = url + return link, after, true +} |