Overhaul links (#202)
Overhaul link parsing, dramatically improving CommonMark compliance.
diff --git a/lib/src/ast.dart b/lib/src/ast.dart
index 9e4869e..4d7af15 100644
--- a/lib/src/ast.dart
+++ b/lib/src/ast.dart
@@ -2,7 +2,7 @@
// for details. All rights reserved. Use of this source code is governed by a
// BSD-style license that can be found in the LICENSE file.
-typedef Node Resolver(String name);
+typedef Node Resolver(String name, [String title]);
/// Base class for any AST item.
///
diff --git a/lib/src/block_parser.dart b/lib/src/block_parser.dart
index e41770a..e4239df 100644
--- a/lib/src/block_parser.dart
+++ b/lib/src/block_parser.dart
@@ -32,6 +32,9 @@
/// SETEXT should win.
final _hrPattern = new RegExp(r'^ {0,3}([-*_])[ \t]*\1[ \t]*\1(?:\1|[ \t])*$');
+/// One or more whitespace, for compressing.
+final _oneOrMoreWhitespacePattern = new RegExp('[ \n\r\t]+');
+
/// A line starting with one of these markers: `-`, `*`, `+`. May have up to
/// three leading spaces before the marker and any number of spaces or tabs
/// after.
@@ -1021,8 +1024,9 @@
title = title.substring(1, title.length - 1);
}
- // References are case-insensitive.
- label = label.toLowerCase().trim();
+ // References are case-insensitive, and internal whitespace is compressed.
+ label =
+ label.toLowerCase().trim().replaceAll(_oneOrMoreWhitespacePattern, ' ');
parser.document.linkReferences
.putIfAbsent(label, () => new LinkReference(label, destination, title));
diff --git a/lib/src/extension_set.dart b/lib/src/extension_set.dart
index d6bec74..12be92b 100644
--- a/lib/src/extension_set.dart
+++ b/lib/src/extension_set.dart
@@ -7,7 +7,8 @@
/// For example, the [gitHub] set of syntax extensions allows users to output
/// HTML from their Markdown in a similar fashion to GitHub's parsing.
class ExtensionSet {
- /// The [none] extension set renders Markdown similar to [Markdown.pl].
+ /// The [ExtensionSet.none] extension set renders Markdown similar to
+ /// [Markdown.pl].
///
/// However, this set does not render _exactly_ the same as Markdown.pl;
/// rather it is more-or-less the CommonMark standard of Markdown, without
diff --git a/lib/src/inline_parser.dart b/lib/src/inline_parser.dart
index 12d8fee..4242d0b 100644
--- a/lib/src/inline_parser.dart
+++ b/lib/src/inline_parser.dart
@@ -2,6 +2,8 @@
// for details. All rights reserved. Use of this source code is governed by a
// BSD-style license that can be found in the LICENSE file.
+import 'package:charcode/charcode.dart';
+
import 'ast.dart';
import 'document.dart';
import 'emojis.dart';
@@ -87,29 +89,14 @@
_stack.add(new TagState(0, 0, null, null));
while (!isDone) {
- var matched = false;
-
- // See if any of the current tags on the stack match. We don't allow tags
- // of the same kind to nest, so this takes priority over other possible
- // matches.
- for (var i = _stack.length - 1; i > 0; i--) {
- if (_stack[i].tryMatch(this)) {
- matched = true;
- break;
- }
- }
-
- if (matched) continue;
+ // See if any of the current tags on the stack match. This takes
+ // priority over other possible matches.
+ if (_stack.reversed
+ .any((state) => state.syntax != null && state.tryMatch(this)))
+ continue;
// See if the current text matches any defined markdown syntax.
- for (var syntax in syntaxes) {
- if (syntax.tryMatch(this)) {
- matched = true;
- break;
- }
- }
-
- if (matched) continue;
+ if (syntaxes.any((syntax) => syntax.tryMatch(this))) continue;
// If we got here, it's just text.
advanceBy(1);
@@ -119,6 +106,8 @@
return _stack[0].close(this, null);
}
+ int charAt(int index) => source.codeUnitAt(index);
+
void writeText() {
writeTextRange(start, pos);
start = pos;
@@ -139,10 +128,14 @@
}
}
+ /// Add [node] to the last [TagState] on the stack.
void addNode(Node node) {
_stack.last.children.add(node);
}
+ /// Push [state] onto the stack of [TagState]s.
+ void openTag(TagState state) => _stack.add(state);
+
bool get isDone => pos == source.length;
void advanceBy(int length) {
@@ -229,7 +222,7 @@
}
/// Leave inline HTML tags alone, from
-/// [CommonMark 0.22](http://spec.commonmark.org/0.22/#raw-html).
+/// [CommonMark 0.28](http://spec.commonmark.org/0.28/#raw-html).
///
/// This is not actually a good definition (nor CommonMark's) of an HTML tag,
/// but it is fast. It will leave text like `<a href='hi">` alone, which is
@@ -238,7 +231,7 @@
/// TODO(srawlins): improve accuracy while ensuring performance, once
/// Markdown benchmarking is more mature.
class InlineHtmlSyntax extends TextSyntax {
- InlineHtmlSyntax() : super(r'<[/!?]?[A-Za-z][A-Za-z0-9-]*(?: [^>]*)?>');
+ InlineHtmlSyntax() : super(r'<[/!?]?[A-Za-z][A-Za-z0-9-]*(?:\s[^>]*)?>');
}
/// Matches autolinks like `<foo@bar.example.com>`.
@@ -402,7 +395,7 @@
// TODO(srawlins): Unicode whitespace
static final String whitespace = ' \t\r\n';
- final String char;
+ final int char;
final int length;
final bool isLeftFlanking;
final bool isRightFlanking;
@@ -463,7 +456,7 @@
}
return new _DelimiterRun._(
- char: parser.source.substring(runStart, runStart + 1),
+ char: parser.charAt(runStart),
length: runEnd - runStart + 1,
isLeftFlanking: leftFlanking,
isRightFlanking: rightFlanking,
@@ -478,18 +471,23 @@
// Whether a delimiter in this run can open emphasis or strong emphasis.
bool get canOpen =>
isLeftFlanking &&
- (char == '*' || !isRightFlanking || isPrecededByPunctuation);
+ (char == $asterisk || !isRightFlanking || isPrecededByPunctuation);
// Whether a delimiter in this run can close emphasis or strong emphasis.
bool get canClose =>
isRightFlanking &&
- (char == '*' || !isLeftFlanking || isFollowedByPunctuation);
+ (char == $asterisk || !isLeftFlanking || isFollowedByPunctuation);
}
/// Matches syntax that has a pair of tags and becomes an element, like `*` for
/// `<em>`. Allows nested tags.
class TagSyntax extends InlineSyntax {
final RegExp endPattern;
+
+ /// Whether this is parsed according to the same nesting rules as [emphasis
+ /// delimiters][].
+ ///
+ /// [emphasis delimiters]: http://spec.commonmark.org/0.28/#can-open-emphasis
final bool requiresDelimiterRun;
TagSyntax(String pattern, {String end, this.requiresDelimiterRun: false})
@@ -500,10 +498,15 @@
var runLength = match.group(0).length;
var matchStart = parser.pos;
var matchEnd = parser.pos + runLength - 1;
+ if (!requiresDelimiterRun) {
+ parser.openTag(new TagState(parser.pos, matchEnd + 1, this, null));
+ return true;
+ }
+
var delimiterRun = _DelimiterRun.tryParse(parser, matchStart, matchEnd);
if (delimiterRun != null && delimiterRun.canOpen) {
- parser._stack
- .add(new TagState(parser.pos, matchEnd + 1, this, delimiterRun));
+ parser
+ .openTag(new TagState(parser.pos, matchEnd + 1, this, delimiterRun));
return true;
} else {
parser.advanceBy(runLength);
@@ -517,9 +520,6 @@
var matchEnd = parser.pos + runLength - 1;
var openingRunLength = state.endPos - state.startPos;
var delimiterRun = _DelimiterRun.tryParse(parser, matchStart, matchEnd);
- if (!delimiterRun.isRightFlanking) {
- return false;
- }
if (openingRunLength == 1 && runLength == 1) {
parser.addNode(new Element('em', state.children));
@@ -528,7 +528,7 @@
parser.pos = parser.pos - (runLength - 1);
parser.start = parser.pos;
} else if (openingRunLength > 1 && runLength == 1) {
- parser._stack.add(
+ parser.openTag(
new TagState(state.startPos, state.endPos - 1, this, delimiterRun));
parser.addNode(new Element('em', state.children));
} else if (openingRunLength == 2 && runLength == 2) {
@@ -538,11 +538,11 @@
parser.pos = parser.pos - (runLength - 2);
parser.start = parser.pos;
} else if (openingRunLength > 2 && runLength == 2) {
- parser._stack.add(
+ parser.openTag(
new TagState(state.startPos, state.endPos - 2, this, delimiterRun));
parser.addNode(new Element('strong', state.children));
} else if (openingRunLength > 2 && runLength > 2) {
- parser._stack.add(
+ parser.openTag(
new TagState(state.startPos, state.endPos - 2, this, delimiterRun));
parser.addNode(new Element('strong', state.children));
parser.pos = parser.pos - (runLength - 2);
@@ -572,146 +572,454 @@
}
}
-/// Matches inline links like `[blah][id]` and `[blah](url)`.
+/// Matches links like `[blah][label]` and `[blah](url)`.
class LinkSyntax extends TagSyntax {
+ static final _entirelyWhitespacePattern = new RegExp(r'^\s*$');
+
final Resolver linkResolver;
- /// The regex for the end of a link.
- ///
- /// This handles both reference-style and inline-style links as well as
- /// optional titles for inline links. To make that a bit more palatable, this
- /// breaks it into pieces.
- static String get _linkPattern {
- var refLink = r'\[([^\]]*)\]'; // `[id]` reflink id.
- var title = r'(?:\s*"([^"]+?)"\s*|)'; // Optional title in quotes.
- var inlineLink = '\\((\\S*?)$title\\)'; // `(url "title")` link.
- return '\](?:($refLink|$inlineLink)|)';
+ LinkSyntax({Resolver linkResolver, String pattern: r'\['})
+ : this.linkResolver = (linkResolver ?? (String _, [String __]) => null),
+ super(pattern, end: r'\]');
- // The groups matched by this are:
- // 1: Will be non-empty if it's either a ref or inline link. Will be empty
- // if it's just a bare pair of square brackets with nothing after them.
- // 2: Contains the id inside [] for a reference-style link.
- // 3: Contains the URL for an inline link.
- // 4: Contains the title, if present, for an inline link.
- }
+ // The pending [TagState]s, all together, are "active" or "inactive" based on
+ // whether a link element has just been parsed.
+ //
+ // Links cannot be nested, so we must "deactivate" any pending ones. For
+ // example, take the following text:
+ //
+ // Text [link and [more](links)](links).
+ //
+ // Once we have parsed `Text [`, there is one (pending) link in the state
+ // stack. It is, by default, active. Once we parse the next possible link,
+ // `[more](links)`, as a real link, we must deactivate the pending links (just
+ // the one, in this case).
+ var _pendingStatesAreActive = true;
- LinkSyntax({this.linkResolver, String pattern: r'\['})
- : super(pattern, end: _linkPattern);
+ bool onMatch(InlineParser parser, Match match) {
+ var matched = super.onMatch(parser, match);
+ if (!matched) return false;
- Node createNode(InlineParser parser, Match match, TagState state) {
- if (match[1] == null) {
- // Try for a shortcut reference link, like `[foo]`.
- var element = _createElement(parser, match, state);
- if (element != null) return element;
+ _pendingStatesAreActive = true;
- // If we didn't match refLink or inlineLink, and it's not a _shortcut_
- // reflink, then it means it isn't a normal Markdown link at all. Instead,
- // we allow users of the library to specify a special resolver function
- // ([linkResolver]) that may choose to handle this. Otherwise, it's just
- // treated as plain text.
- if (linkResolver == null) return null;
-
- // Treat the contents as unparsed text even if they happen to match. This
- // way, we can handle things like [LINK_WITH_UNDERSCORES] as a link and
- // not get confused by the emphasis.
- var textToResolve = parser.source.substring(state.endPos, parser.pos);
-
- // See if we have a resolver that will generate a link for us.
- return linkResolver(textToResolve);
- } else {
- return _createElement(parser, match, state);
- }
- }
-
- /// Given that [match] has matched both a title and URL, creates an `<a>`
- /// [Element] for it.
- Element _createElement(InlineParser parser, Match match, TagState state) {
- var link = getLink(parser, match, state);
- if (link == null) return null;
-
- var element = new Element('a', state.children);
-
- element.attributes["href"] = escapeHtml(link.destination);
- if (link.title != null) {
- element.attributes['title'] = escapeHtml(link.title);
- }
-
- return element;
- }
-
- /// Get the Link represented by [match].
- ///
- /// This method can return null, if the link is a reference link, and has no
- /// accompanying link reference definition.
- ///
- /// Temporarily, this is returning [LinkReference]s, for convenience, which
- /// is an improper use of [LinkReference]s. This should change before this
- /// package is released.
- LinkReference getLink(InlineParser parser, Match match, TagState state) {
- if (match[3] != null) {
- // Inline link like [foo](url).
- var url = match[3];
- var title = match[4];
-
- // For whatever reason, Markdown allows angle-bracketed URLs here.
- if (url.startsWith('<') && url.endsWith('>')) {
- url = url.substring(1, url.length - 1);
- }
-
- return new LinkReference(null, url, title);
- } else {
- String label;
- String _contents() {
- var offset = pattern.pattern.length - 1;
- return parser.source.substring(state.startPos + offset, parser.pos);
- }
-
- // Reference link like [foo][bar].
- if (match[1] == null) {
- // There are no reference brackets ("shortcut reference link"), so infer
- // the label from the contents.
- label = _contents();
- } else if (match[2] == '') {
- // The label is empty ("[]") so infer it from the contents.
- label = _contents();
- } else {
- label = match[2];
- }
-
- // References are case-insensitive.
- label = label.toLowerCase();
- return parser.document.linkReferences[label];
- }
+ return true;
}
bool onMatchEnd(InlineParser parser, Match match, TagState state) {
- var node = createNode(parser, match, state);
- if (node == null) return false;
+ if (!_pendingStatesAreActive) return false;
- parser.addNode(node);
+ var text = parser.source.substring(state.endPos, parser.pos);
+ // The current character is the `]` that closed the link text. Examine the
+ // next character, to determine what type of link we might have (a '('
+ // means a possible inline link; otherwise a possible reference link).
+ if (parser.pos + 1 >= parser.source.length) {
+ // In this case, the Markdown document may have ended with a shortcut
+ // reference link.
+
+ return _tryAddReferenceLink(parser, state, text);
+ }
+ // Peek at the next character; don't advance, so as to avoid later stepping
+ // backward.
+ var char = parser.charAt(parser.pos + 1);
+
+ if (char == $lparen) {
+ // Maybe an inline link, like `[text](destination)`.
+ parser.advanceBy(1);
+ var leftParenIndex = parser.pos;
+ var inlineLink = _parseInlineLink(parser);
+ if (inlineLink != null)
+ return _tryAddInlineLink(parser, state, inlineLink);
+
+ // Reset the parser position.
+ parser.pos = leftParenIndex;
+
+ // At this point, we've matched `[...](`, but that `(` did not pan out to
+ // be an inline link. We must now check if `[...]` is simply a shortcut
+ // reference link.
+ parser.advanceBy(-1);
+ return _tryAddReferenceLink(parser, state, text);
+ }
+
+ if (char == $lbracket) {
+ parser.advanceBy(1);
+ // At this point, we've matched `[...][`. Maybe a *full* reference link,
+ // like `[foo][bar]` or a *collapsed* reference link, like `[foo][]`.
+ if (parser.pos + 1 < parser.source.length &&
+ parser.charAt(parser.pos + 1) == $rbracket) {
+ // That opening `[` is not actually part of the link. Maybe a
+ // *shortcut* reference link (followed by a `[`).
+ parser.advanceBy(1);
+ return _tryAddReferenceLink(parser, state, text);
+ }
+ var label = _parseReferenceLinkLabel(parser);
+ if (label != null) return _tryAddReferenceLink(parser, state, label);
+ return false;
+ }
+
+ // The link text (inside `[...]`) was not followed with an opening `(` nor
+ // an opening `[`. Perhaps just a simple shortcut reference link (`[...]`).
+
+ return _tryAddReferenceLink(parser, state, text);
+ }
+
+  /// Matches a run of whitespace inside a link label, so that it can be
+  /// collapsed to a single space during normalization.
+  static final _labelWhitespacePattern = new RegExp('[ \n\r\t]+');
+
+  /// Resolve a possible reference link.
+  ///
+  /// Uses [linkReferences], [linkResolver], and [_createNode] to try to
+  /// resolve [label] and [state] into a [Node]. If [label] is defined in
+  /// [linkReferences] or can be resolved by [linkResolver], returns a [Node]
+  /// that links to the resolved URL.
+  ///
+  /// Otherwise, returns `null`.
+  ///
+  /// [label] does not need to be normalized: it is lower-cased, trimmed, and
+  /// has internal whitespace collapsed before being looked up in
+  /// [linkReferences]. The custom [linkResolver] receives the un-normalized
+  /// label.
+  Node _resolveReferenceLink(
+      String label, TagState state, Map<String, LinkReference> linkReferences) {
+    // Link reference definitions are stored under a lower-cased, trimmed,
+    // whitespace-collapsed key; the lookup key must be built the same way or
+    // labels such as `[Foo   bar]` never match `[foo bar]: /url`.
+    var normalizedLabel =
+        label.toLowerCase().trim().replaceAll(_labelWhitespacePattern, ' ');
+    var linkReference = linkReferences[normalizedLabel];
+    if (linkReference != null) {
+      return _createNode(state, linkReference.destination, linkReference.title);
+    } else {
+      // This link has no reference definition. But we allow users of the
+      // library to specify a custom resolver function ([linkResolver]) that
+      // may choose to handle this. Otherwise, it's just treated as plain
+      // text.
+
+      // Normally, label text does not get parsed as inline Markdown. However,
+      // for the benefit of the link resolver, we need to at least escape
+      // brackets, so that, e.g. a link resolver can receive `[\[\]]` as `[]`.
+      return linkResolver(label
+          .replaceAll(r'\\', r'\')
+          .replaceAll(r'\[', '[')
+          .replaceAll(r'\]', ']'));
+    }
+  }
+
+  /// Builds the `<a>` element for a link.
+  ///
+  /// The children collected in [state] become the link text. [destination] is
+  /// written to `href`; [title] is written to `title` only when it is neither
+  /// `null` nor empty. Both values go through [escapeAttribute].
+  Node _createNode(TagState state, String destination, String title) {
+    var anchor = new Element('a', state.children)
+      ..attributes['href'] = escapeAttribute(destination);
+    var hasTitle = title != null && title.isNotEmpty;
+    if (hasTitle) anchor.attributes['title'] = escapeAttribute(title);
+    return anchor;
+  }
+
+ // Add a reference link node to [parser]'s AST.
+ //
+ // Returns whether the link was added successfully.
+ bool _tryAddReferenceLink(InlineParser parser, TagState state, String label) {
+ var element =
+ _resolveReferenceLink(label, state, parser.document.linkReferences);
+ if (element == null) {
+ return false;
+ }
+ parser.addNode(element);
+ parser.start = parser.pos;
+ _pendingStatesAreActive = false;
return true;
}
+
+  // Appends the node for an already-parsed inline [link] to [parser]'s AST.
+  //
+  // On success the parser's text start is moved to the current position and
+  // the pending link states are deactivated. Returns whether a node was added.
+  bool _tryAddInlineLink(InlineParser parser, TagState state, InlineLink link) {
+    var node = _createNode(state, link.destination, link.title);
+    if (node != null) {
+      parser.addNode(node);
+      parser.start = parser.pos;
+      _pendingStatesAreActive = false;
+      return true;
+    }
+    return false;
+  }
+
+  /// Parse a reference link label at the current position.
+  ///
+  /// Specifically, [parser.pos] is expected to be pointing at the `[` which
+  /// opens the link label.
+  ///
+  /// A backslash before `\` or `]` is dropped from the label; a backslash
+  /// before any other character is kept.
+  ///
+  /// Returns the label if it could be parsed, or `null` if not (the input
+  /// ended before the closing `]`, or the label is entirely whitespace). On
+  /// success the parser is left pointing at the closing `]`.
+  String _parseReferenceLinkLabel(InlineParser parser) {
+    // Walk past the opening `[`.
+    parser.advanceBy(1);
+    if (parser.isDone) return null;
+
+    var buffer = new StringBuffer();
+    while (true) {
+      var char = parser.charAt(parser.pos);
+      if (char == $backslash) {
+        parser.advanceBy(1);
+        // A trailing backslash at the end of the input cannot be part of a
+        // closed label; bail out rather than reading past the end.
+        if (parser.isDone) return null;
+        var next = parser.charAt(parser.pos);
+        if (next != $backslash && next != $rbracket) {
+          buffer.writeCharCode(char);
+        }
+        buffer.writeCharCode(next);
+      } else if (char == $rbracket) {
+        break;
+      } else {
+        buffer.writeCharCode(char);
+      }
+      parser.advanceBy(1);
+      if (parser.isDone) return null;
+      // TODO(srawlins): only check 999 characters, for performance reasons?
+    }
+
+    var label = buffer.toString();
+
+    // A link label must contain at least one non-whitespace character.
+    if (_entirelyWhitespacePattern.hasMatch(label)) return null;
+
+    return label;
+  }
+
+  /// Parse an inline [InlineLink] at the current position.
+  ///
+  /// At this point, we have parsed a link's (or image's) opening `[`, and then
+  /// a matching closing `]`, and [parser.pos] is pointing at an opening `(`.
+  /// This method will then attempt to parse a link destination wrapped in `<>`,
+  /// such as `(<http://url>)`, or a bare link destination, such as
+  /// `(http://url)`, or a link destination with a title, such as
+  /// `(http://url "title")`.
+  ///
+  /// Returns the [InlineLink] if one was parsed, or `null` if not.
+  InlineLink _parseInlineLink(InlineParser parser) {
+    // Start walking to the character just after the opening `(`.
+    parser.advanceBy(1);
+    // The input may end right after the `(`; nothing more can be read then.
+    if (parser.isDone) return null; // EOF. Not a link.
+
+    _moveThroughWhitespace(parser);
+    if (parser.isDone) return null; // EOF. Not a link.
+
+    if (parser.charAt(parser.pos) == $lt) {
+      // Maybe a `<...>`-enclosed link destination.
+      return _parseInlineBracketedLink(parser);
+    } else {
+      return _parseInlineBareDestinationLink(parser);
+    }
+  }
+
+  /// Parse an inline link with a bracketed destination (a destination wrapped
+  /// in `<...>`). The current position of the parser must be the first
+  /// character of the destination (the opening `<`).
+  ///
+  /// Returns the [InlineLink], or `null` if the text is not a valid inline
+  /// link (whitespace inside `<...>`, the input ends early, or the destination
+  /// is followed by something other than a title or the closing `)`). On
+  /// success the parser is left pointing at the closing `)`.
+  InlineLink _parseInlineBracketedLink(InlineParser parser) {
+    // Space, line feed, carriage return and form feed may not appear inside
+    // `<...>`.
+    bool isForbiddenWhitespace(int c) =>
+        c == $space || c == $lf || c == $cr || c == $ff;
+
+    // Walk past the opening `<`.
+    parser.advanceBy(1);
+    if (parser.isDone) return null; // EOF. Not a link.
+
+    var buffer = new StringBuffer();
+    while (true) {
+      var char = parser.charAt(parser.pos);
+      if (char == $backslash) {
+        parser.advanceBy(1);
+        if (parser.isDone) return null; // EOF. Not a link.
+        var next = parser.charAt(parser.pos);
+        // The character *after* the backslash is the one to examine here;
+        // `char` is always the backslash itself.
+        if (isForbiddenWhitespace(next)) {
+          // Not a link (no whitespace allowed within `<...>`).
+          return null;
+        }
+        // TODO: Follow the backslash spec better here.
+        // http://spec.commonmark.org/0.28/#backslash-escapes
+        if (next != $backslash && next != $gt) {
+          buffer.writeCharCode(char);
+        }
+        buffer.writeCharCode(next);
+      } else if (isForbiddenWhitespace(char)) {
+        // Not a link (no whitespace allowed within `<...>`).
+        return null;
+      } else if (char == $gt) {
+        break;
+      } else {
+        buffer.writeCharCode(char);
+      }
+      parser.advanceBy(1);
+      if (parser.isDone) return null;
+    }
+    var destination = buffer.toString();
+
+    // Walk past the closing `>`.
+    parser.advanceBy(1);
+    if (parser.isDone) return null; // EOF before the closing `)`.
+    var char = parser.charAt(parser.pos);
+    if (isForbiddenWhitespace(char)) {
+      var title = _parseTitle(parser);
+      // [_parseTitle] may have consumed the rest of the input while skipping
+      // whitespace, so check for EOF before peeking at the current character.
+      if (title == null &&
+          (parser.isDone || parser.charAt(parser.pos) != $rparen)) {
+        // This looked like an inline link, until we found this $space
+        // followed by mystery characters; no longer a link.
+        return null;
+      }
+      return new InlineLink(destination, title: title);
+    } else if (char == $rparen) {
+      return new InlineLink(destination);
+    } else {
+      // We parsed something like `[foo](<url>X`. Not a link.
+      return null;
+    }
+  }
+
+  /// Parse an inline link with a "bare" destination (a destination _not_
+  /// wrapped in `<...>`). The current position of the parser must be the first
+  /// character of the destination.
+  ///
+  /// Returns the [InlineLink], or `null` if the input ends before the link is
+  /// closed or the destination is followed by something other than a title
+  /// or the closing `)`.
+  InlineLink _parseInlineBareDestinationLink(InlineParser parser) {
+    // According to
+    // [CommonMark](http://spec.commonmark.org/0.28/#link-destination):
+    //
+    // > A link destination consists of [...] a nonempty sequence of
+    // > characters [...], and includes parentheses only if (a) they are
+    // > backslash-escaped or (b) they are part of a balanced pair of
+    // > unescaped parentheses.
+    //
+    // We need to count the open parens. We start with 1 for the paren that
+    // opened the destination.
+    var parenCount = 1;
+    var buffer = new StringBuffer();
+
+    while (true) {
+      var char = parser.charAt(parser.pos);
+      switch (char) {
+        case $backslash:
+          parser.advanceBy(1);
+          if (parser.isDone) return null; // EOF. Not a link.
+          var next = parser.charAt(parser.pos);
+          // Parentheses may be escaped.
+          //
+          // http://spec.commonmark.org/0.28/#example-467
+          if (next != $backslash && next != $lparen && next != $rparen) {
+            buffer.writeCharCode(char);
+          }
+          buffer.writeCharCode(next);
+          break;
+
+        case $space:
+        case $lf:
+        case $cr:
+        case $ff:
+          var destination = buffer.toString();
+          var title = _parseTitle(parser);
+          // [_parseTitle] may have run off the end of the input while
+          // skipping whitespace, so check for EOF before peeking.
+          if (title == null &&
+              (parser.isDone || parser.charAt(parser.pos) != $rparen)) {
+            // This looked like an inline link, until we found this $space
+            // followed by mystery characters; no longer a link.
+            return null;
+          }
+          // [_parseTitle] made sure the title was followed by a closing `)`
+          // (but it's up to the code here to examine the balance of
+          // parentheses).
+          parenCount--;
+          if (parenCount == 0) {
+            return new InlineLink(destination, title: title);
+          }
+          break;
+
+        case $lparen:
+          parenCount++;
+          buffer.writeCharCode(char);
+          break;
+
+        case $rparen:
+          parenCount--;
+          if (parenCount == 0) {
+            var destination = buffer.toString();
+            return new InlineLink(destination);
+          }
+          buffer.writeCharCode(char);
+          break;
+
+        default:
+          buffer.writeCharCode(char);
+      }
+      parser.advanceBy(1);
+      if (parser.isDone) return null; // EOF. Not a link.
+    }
+  }
+
+  // Walk the parser forward through any whitespace (space, tab, line feed,
+  // vertical tab, carriage return, form feed).
+  //
+  // Stops at the first non-whitespace character or at the end of the input.
+  // Safe to call when the parser is already at the end of the input.
+  void _moveThroughWhitespace(InlineParser parser) {
+    // Check for EOF *before* reading, so a call at the end of the input is a
+    // no-op instead of an out-of-range read.
+    while (!parser.isDone) {
+      var char = parser.charAt(parser.pos);
+      if (char != $space &&
+          char != $tab &&
+          char != $lf &&
+          char != $vt &&
+          char != $cr &&
+          char != $ff) {
+        return;
+      }
+      parser.advanceBy(1);
+    }
+  }
+
+  // Parse a link title in [parser] at its current position. The parser's
+  // current position should be a whitespace character that followed a link
+  // destination.
+  //
+  // A title is delimited by `'...'`, `"..."` or `(...)`. A backslash before
+  // `\` or the closing delimiter is dropped; other backslashes are kept.
+  //
+  // Returns the title, or `null` if there is no title, the input ends early,
+  // or the title is not followed (after optional whitespace) by `)`. On
+  // success the parser is left pointing at that closing `)`.
+  String _parseTitle(InlineParser parser) {
+    _moveThroughWhitespace(parser);
+    if (parser.isDone) return null;
+
+    // The whitespace should be followed by a title delimiter.
+    var delimiter = parser.charAt(parser.pos);
+    if (delimiter != $apostrophe &&
+        delimiter != $quote &&
+        delimiter != $lparen) {
+      return null;
+    }
+
+    var closeDelimiter = delimiter == $lparen ? $rparen : delimiter;
+    parser.advanceBy(1);
+    // The input may end right after the opening delimiter.
+    if (parser.isDone) return null;
+
+    // Now we look for an un-escaped closing delimiter.
+    var buffer = new StringBuffer();
+    while (true) {
+      var char = parser.charAt(parser.pos);
+      if (char == $backslash) {
+        parser.advanceBy(1);
+        // A trailing backslash at EOF means the title is never closed.
+        if (parser.isDone) return null;
+        var next = parser.charAt(parser.pos);
+        if (next != $backslash && next != closeDelimiter) {
+          buffer.writeCharCode(char);
+        }
+        buffer.writeCharCode(next);
+      } else if (char == closeDelimiter) {
+        break;
+      } else {
+        buffer.writeCharCode(char);
+      }
+      parser.advanceBy(1);
+      if (parser.isDone) return null;
+    }
+    var title = buffer.toString();
+
+    // Advance past the closing delimiter.
+    parser.advanceBy(1);
+    if (parser.isDone) return null;
+    _moveThroughWhitespace(parser);
+    if (parser.isDone) return null;
+    if (parser.charAt(parser.pos) != $rparen) return null;
+    return title;
+  }
}
/// Matches images like `![alternate text](url "optional title")` and
-/// `![alternate text][url reference]`.
+/// `![alternate text][label]`.
class ImageSyntax extends LinkSyntax {
ImageSyntax({Resolver linkResolver})
: super(linkResolver: linkResolver, pattern: r'!\[');
- /// Creates an <img> element from the given complete [match].
- Element _createElement(InlineParser parser, Match match, TagState state) {
- var link = getLink(parser, match, state);
- if (link == null) return null;
- var image = new Element.empty("img");
- image.attributes["src"] = escapeHtml(link.destination);
- image.attributes["alt"] = state?.textContent ?? '';
-
- if (link.title != null) {
- image.attributes["title"] = escapeHtml(link.title);
+ Node _createNode(TagState state, String destination, String title) {
+ var element = new Element.empty('img');
+ element.attributes['src'] = escapeHtml(destination);
+ element.attributes['alt'] = state?.textContent ?? '';
+ if (title != null && title.isNotEmpty) {
+ element.attributes['title'] = escapeAttribute(title);
}
+ return element;
+ }
- return image;
+ // Add an image node to [parser]'s AST.
+ //
+ // If [label] is present, the potential image is treated as a reference image.
+ // Otherwise, it is treated as an inline image.
+ //
+ // Returns whether the image was added successfully.
+ bool _tryAddReferenceLink(InlineParser parser, TagState state, String label) {
+ var element =
+ _resolveReferenceLink(label, state, parser.document.linkReferences);
+ if (element == null) {
+ return false;
+ }
+ parser.addNode(element);
+ parser.start = parser.pos;
+ return true;
}
}
@@ -732,7 +1040,7 @@
CodeSyntax() : super(_pattern);
bool tryMatch(InlineParser parser, [int startMatchPos]) {
- if (parser.pos > 0 && parser.source[parser.pos - 1] == '`') {
+ if (parser.pos > 0 && parser.charAt(parser.pos - 1) == $backquote) {
// Not really a match! We can't just sneak past one backtick to try the
// next character. An example of this situation would be:
//
@@ -814,6 +1122,7 @@
return true;
}
+ // TODO: Move this logic into TagSyntax.
var runLength = endMatch.group(0).length;
var openingRunLength = endPos - startPos;
var closingMatchStart = parser.pos;
@@ -884,3 +1193,10 @@
String get textContent =>
children.map((Node child) => child.textContent).join('');
}
+
+/// The destination and optional title parsed from an inline link or image,
+/// such as `(http://url "title")`.
+class InlineLink {
+ /// The link destination, as parsed from between the parentheses.
+ final String destination;
+
+ /// The link title; `null` when the link was constructed without one.
+ final String title;
+
+ InlineLink(this.destination, {this.title});
+}
diff --git a/lib/src/util.dart b/lib/src/util.dart
index f670389..7e0df9c 100644
--- a/lib/src/util.dart
+++ b/lib/src/util.dart
@@ -1,4 +1,71 @@
import 'dart:convert';
+import 'package:charcode/charcode.dart';
+
String escapeHtml(String html) =>
const HtmlEscape(HtmlEscapeMode.ELEMENT).convert(html);
+
+/// Escapes the contents of [value], so that it may be used as an HTML
+/// attribute.
+///
+/// Based on http://spec.commonmark.org/0.28/#backslash-escapes:
+///
+/// * A backslash followed by an ASCII punctuation character is dropped and
+///   the punctuation character is written as-is, except for `"`, which is
+///   written as `&quot;` so that it cannot terminate a double-quoted
+///   attribute value.
+/// * A backslash followed by any other character is written as `%5C`
+///   followed by that character.
+/// * A trailing backslash is written as-is.
+/// * An unescaped `"` is written as `%22`.
+String escapeAttribute(String value) {
+  var result = new StringBuffer();
+  int ch;
+  for (var i = 0; i < value.length; i++) {
+    ch = value.codeUnitAt(i);
+    if (ch == $backslash) {
+      i++;
+      if (i == value.length) {
+        // Nothing follows the backslash; keep it literally.
+        result.writeCharCode(ch);
+        break;
+      }
+      ch = value.codeUnitAt(i);
+      switch (ch) {
+        case $quote:
+          // A raw `"` would close the surrounding attribute value.
+          result.write('&quot;');
+          break;
+        case $exclamation:
+        case $hash:
+        case $dollar:
+        case $percent:
+        case $ampersand:
+        case $apostrophe:
+        case $lparen:
+        case $rparen:
+        case $asterisk:
+        case $plus:
+        case $comma:
+        case $dash:
+        case $dot:
+        case $slash:
+        case $colon:
+        case $semicolon:
+        case $lt:
+        case $equal:
+        case $gt:
+        case $question:
+        case $at:
+        case $lbracket:
+        case $backslash:
+        case $rbracket:
+        case $caret:
+        case $underscore:
+        case $backquote:
+        case $lbrace:
+        case $bar:
+        case $rbrace:
+        case $tilde:
+          result.writeCharCode(ch);
+          break;
+        default:
+          // Not an escapable character: keep the backslash, percent-encoded.
+          result.write('%5C');
+          result.writeCharCode(ch);
+      }
+    } else if (ch == $quote) {
+      result.write('%22');
+    } else {
+      result.writeCharCode(ch);
+    }
+  }
+  return result.toString();
+}
diff --git a/pubspec.yaml b/pubspec.yaml
index 2bcabaf..05da83a 100644
--- a/pubspec.yaml
+++ b/pubspec.yaml
@@ -12,6 +12,7 @@
dependencies:
args: '^1.0.0'
+ charcode: '^1.1.0'
dev_dependencies:
collection: '^1.2.0'
diff --git a/test/markdown_test.dart b/test/markdown_test.dart
index 9860d8e..bab44d2 100644
--- a/test/markdown_test.dart
+++ b/test/markdown_test.dart
@@ -25,7 +25,7 @@
inlineSyntaxes: [new InlineHtmlSyntax()]);
group('Resolver', () {
- Node nyanResolver(String text) => new Text('~=[,,_${text}_,,]:3');
+ Node nyanResolver(String text, [_]) => new Text('~=[,,_${text}_,,]:3');
validateCore(
'simple link resolver',
'''
@@ -55,6 +55,26 @@
<p>resolve ~=[,,_*star* _underline__,,]:3 thing</p>
''',
linkResolver: nyanResolver);
+
+ validateCore(
+ 'link resolver uses un-normalized link label',
+ '''
+resolve [TH IS] thing
+''',
+ '''
+<p>resolve ~=[,,_TH IS_,,]:3 thing</p>
+''',
+ linkResolver: nyanResolver);
+
+ validateCore(
+ 'can resolve brackets',
+ r'''
+resolve [\[\]] thing
+''',
+ '''
+<p>resolve ~=[,,_[]_,,]:3 thing</p>
+''',
+ linkResolver: nyanResolver);
});
group('Custom inline syntax', () {
@@ -69,7 +89,7 @@
validateCore('dart custom links', 'links [are<foo>] awesome',
'<p>links <a>are<foo></a> awesome</p>\n',
- linkResolver: (text) =>
+ linkResolver: (String text, [_]) =>
new Element.text('a', text.replaceAll('<', '<')));
// TODO(amouravski): need more tests here for custom syntaxes, as some
diff --git a/test/original/inline_links.unit b/test/original/inline_links.unit
index dd0a65d..e62cddd 100644
--- a/test/original/inline_links.unit
+++ b/test/original/inline_links.unit
@@ -38,6 +38,11 @@
<<<
<p>links <a href="http://example.com">are</a> awesome</p>
+>>> URL wrapped in angle brackets with a title; https://github.com/commonmark/CommonMark/issues/521
+links [are](<http://example.com> "title") awesome
+
+<<<
+<p>links <a href="http://example.com" title="title">are</a> awesome</p>
>>> multi-line link
links [are
awesome](<http://example.com>).
@@ -61,3 +66,13 @@
<<<
<p>links [are <em>awesome</em>]</p>
+>>> links with escaped parens
+[a](\(yes-a-link)
+[a](\(yes-a-link\))
+[a](\\(not-a-link\))
+[a](\\(yes-a-link\)))
+<<<
+<p><a href="(yes-a-link">a</a>
+<a href="(yes-a-link)">a</a>
+[a](\(not-a-link))
+<a href="(yes-a-link))">a</a></p>
diff --git a/tool/common_mark_stats.json b/tool/common_mark_stats.json
index 1a78497..4916da7 100644
--- a/tool/common_mark_stats.json
+++ b/tool/common_mark_stats.json
@@ -51,8 +51,8 @@
"296": "strict",
"297": "strict",
"298": "strict",
- "299": "fail",
- "300": "fail",
+ "299": "strict",
+ "300": "strict",
"301": "fail"
},
"Blank lines": {
@@ -414,22 +414,22 @@
"463": "strict",
"464": "loose",
"465": "strict",
- "466": "fail",
- "467": "fail",
- "468": "fail",
- "469": "fail",
- "470": "fail",
- "471": "fail",
+ "466": "strict",
+ "467": "strict",
+ "468": "strict",
+ "469": "strict",
+ "470": "strict",
+ "471": "strict",
"472": "strict",
- "473": "fail",
+ "473": "strict",
"474": "fail",
- "475": "fail",
- "476": "fail",
+ "475": "strict",
+ "476": "strict",
"477": "fail",
"478": "fail",
"479": "loose",
"480": "fail",
- "481": "fail",
+ "481": "strict",
"482": "strict",
"483": "fail",
"484": "strict",
@@ -437,8 +437,8 @@
"486": "strict",
"487": "strict",
"488": "loose",
- "489": "fail",
- "490": "fail",
+ "489": "strict",
+ "490": "strict",
"491": "fail",
"492": "fail",
"493": "strict",
@@ -451,8 +451,8 @@
"500": "strict",
"501": "strict",
"502": "loose",
- "503": "loose",
- "504": "fail",
+ "503": "strict",
+ "504": "strict",
"505": "fail",
"506": "strict",
"507": "strict",
@@ -460,7 +460,7 @@
"509": "strict",
"510": "strict",
"511": "strict",
- "512": "fail",
+ "512": "strict",
"513": "strict",
"514": "strict",
"515": "strict",
@@ -488,9 +488,9 @@
"537": "strict",
"538": "strict",
"539": "strict",
- "540": "loose",
+ "540": "strict",
"541": "strict",
- "542": "loose"
+ "542": "strict"
},
"List items": {
"216": "loose",
@@ -584,7 +584,7 @@
"Raw HTML": {
"584": "strict",
"585": "fail",
- "586": "fail",
+ "586": "strict",
"587": "strict",
"588": "strict",
"589": "loose",
diff --git a/tool/common_mark_stats.txt b/tool/common_mark_stats.txt
index 9d03b4e..3692c47 100644
--- a/tool/common_mark_stats.txt
+++ b/tool/common_mark_stats.txt
@@ -1,6 +1,6 @@
17 of 18 – 94.4% ATX headings
19 of 19 – 100.0% Autolinks
- 9 of 13 – 69.2% Backslash escapes
+ 11 of 13 – 84.6% Backslash escapes
1 of 1 – 100.0% Blank lines
22 of 25 – 88.0% Block quotes
16 of 17 – 94.1% Code spans
@@ -13,15 +13,15 @@
11 of 12 – 91.7% Indented code blocks
1 of 1 – 100.0% Inlines
21 of 23 – 91.3% Link reference definitions
- 60 of 84 – 71.4% Links
+ 75 of 84 – 89.3% Links
44 of 48 – 91.7% List items
18 of 24 – 75.0% Lists
8 of 8 – 100.0% Paragraphs
1 of 1 – 100.0% Precedence
- 15 of 21 – 71.4% Raw HTML
+ 16 of 21 – 76.2% Raw HTML
25 of 26 – 96.2% Setext headings
2 of 2 – 100.0% Soft line breaks
11 of 11 – 100.0% Tabs
3 of 3 – 100.0% Textual content
19 of 19 – 100.0% Thematic breaks
- 561 of 624 – 89.9% TOTAL
+ 579 of 624 – 92.8% TOTAL
diff --git a/tool/gfm_stats.json b/tool/gfm_stats.json
index 88d8819..2fae2fb 100644
--- a/tool/gfm_stats.json
+++ b/tool/gfm_stats.json
@@ -64,8 +64,8 @@
"306": "strict",
"307": "strict",
"308": "strict",
- "309": "fail",
- "310": "fail",
+ "309": "strict",
+ "310": "strict",
"311": "fail"
},
"Blank lines": {
@@ -430,22 +430,22 @@
"476": "strict",
"477": "loose",
"478": "strict",
- "479": "fail",
- "480": "fail",
- "481": "fail",
- "482": "fail",
- "483": "fail",
- "484": "fail",
+ "479": "strict",
+ "480": "strict",
+ "481": "strict",
+ "482": "strict",
+ "483": "strict",
+ "484": "strict",
"485": "strict",
- "486": "fail",
+ "486": "strict",
"487": "fail",
- "488": "fail",
- "489": "fail",
+ "488": "strict",
+ "489": "strict",
"490": "fail",
"491": "fail",
"492": "loose",
"493": "fail",
- "494": "fail",
+ "494": "strict",
"495": "strict",
"496": "fail",
"497": "strict",
@@ -453,8 +453,8 @@
"499": "strict",
"500": "strict",
"501": "loose",
- "502": "fail",
- "503": "fail",
+ "502": "strict",
+ "503": "strict",
"504": "fail",
"505": "fail",
"506": "strict",
@@ -467,8 +467,8 @@
"513": "strict",
"514": "strict",
"515": "loose",
- "516": "loose",
- "517": "fail",
+ "516": "strict",
+ "517": "strict",
"518": "fail",
"519": "strict",
"520": "strict",
@@ -476,7 +476,7 @@
"522": "strict",
"523": "strict",
"524": "strict",
- "525": "fail",
+ "525": "strict",
"526": "strict",
"527": "strict",
"528": "strict",
@@ -504,9 +504,9 @@
"550": "strict",
"551": "strict",
"552": "strict",
- "553": "loose",
+ "553": "strict",
"554": "strict",
- "555": "loose"
+ "555": "strict"
},
"List items": {
"224": "loose",
@@ -600,7 +600,7 @@
"Raw HTML": {
"608": "strict",
"609": "fail",
- "610": "fail",
+ "610": "strict",
"611": "strict",
"612": "strict",
"613": "loose",
diff --git a/tool/gfm_stats.txt b/tool/gfm_stats.txt
index 11b8b83..848e96f 100644
--- a/tool/gfm_stats.txt
+++ b/tool/gfm_stats.txt
@@ -1,7 +1,7 @@
17 of 18 – 94.4% ATX headings
17 of 19 – 89.5% Autolinks
8 of 11 – 72.7% Autolinks (extension)
- 9 of 13 – 69.2% Backslash escapes
+ 11 of 13 – 84.6% Backslash escapes
1 of 1 – 100.0% Blank lines
22 of 25 – 88.0% Block quotes
16 of 17 – 94.1% Code spans
@@ -15,12 +15,12 @@
11 of 12 – 91.7% Indented code blocks
1 of 1 – 100.0% Inlines
21 of 23 – 91.3% Link reference definitions
- 60 of 84 – 71.4% Links
+ 75 of 84 – 89.3% Links
44 of 48 – 91.7% List items
18 of 24 – 75.0% Lists
8 of 8 – 100.0% Paragraphs
1 of 1 – 100.0% Precedence
- 15 of 21 – 71.4% Raw HTML
+ 16 of 21 – 76.2% Raw HTML
25 of 26 – 96.2% Setext headings
2 of 2 – 100.0% Soft line breaks
3 of 3 – 100.0% Strikethrough (extension)
@@ -28,4 +28,4 @@
11 of 11 – 100.0% Tabs
3 of 3 – 100.0% Textual content
19 of 19 – 100.0% Thematic breaks
- 577 of 647 – 89.2% TOTAL
+ 595 of 647 – 92.0% TOTAL