GFM autolink extension (#203) Add support for GFM autolink extension

commit: c0c436106ed71c5ff254e9f27feb54f4e9bb025e [log] [tgz]
author: Louis Orleans <louis@orleans.io> Mon Jan 29 08:20:44 2018 -0800
committer: Sam Rawlins <sam.rawlins@gmail.com> Mon Jan 29 08:20:44 2018 -0800
tree: 31ea85043345081afefd81ee7d5183b053de403e
parent: 827256db7ad34b420b8053351eab09977e6dc1a3 [diff]
diff --git a/lib/src/extension_set.dart b/lib/src/extension_set.dart
index bab57e5..d6bec74 100644
--- a/lib/src/extension_set.dart
+++ b/lib/src/extension_set.dart

@@ -40,6 +40,7 @@
     new InlineHtmlSyntax(),
     new StrikethroughSyntax(),
     new EmojiSyntax(),
+    new AutolinkExtensionSyntax(),
   ]);
 
   /// The [gitHubFlavored] extension set is close to compliance with the [GitHub
@@ -52,6 +53,7 @@
   ], [
     new InlineHtmlSyntax(),
     new StrikethroughSyntax(),
+    new AutolinkExtensionSyntax(),
   ]);
 
   /// The deprecated name for the [gitHubFlavored] extension set.

diff --git a/lib/src/inline_parser.dart b/lib/src/inline_parser.dart
index 777ba21..12d8fee 100644
--- a/lib/src/inline_parser.dart
+++ b/lib/src/inline_parser.dart

@@ -68,9 +68,9 @@
     // character position.
     if (documentHasCustomInlineSyntaxes) {
       // We should be less aggressive in blowing past "words".
-      syntaxes.add(new TextSyntax(r'[A-Za-z0-9]+\b'));
+      syntaxes.add(new TextSyntax(r'[A-Za-z0-9]+(?=\s)'));
     } else {
-      syntaxes.add(new TextSyntax(r'[ \tA-Za-z0-9]*[A-Za-z0-9]'));
+      syntaxes.add(new TextSyntax(r'[ \tA-Za-z0-9]*[A-Za-z0-9](?=\s)'));
     }
 
     syntaxes.addAll(_defaultSyntaxes);
@@ -163,18 +163,19 @@
 
   /// Tries to match at the parser's current position.
   ///
+  /// The parser's position can be overridden with [startMatchPos].
   /// Returns whether or not the pattern successfully matched.
-  bool tryMatch(InlineParser parser) {
-    var startMatch = pattern.matchAsPrefix(parser.source, parser.pos);
-    if (startMatch != null) {
-      // Write any existing plain text up to this point.
-      parser.writeText();
+  bool tryMatch(InlineParser parser, [int startMatchPos]) {
+    if (startMatchPos == null) startMatchPos = parser.pos;
 
-      if (onMatch(parser, startMatch)) parser.consume(startMatch[0].length);
-      return true;
-    }
+    final startMatch = pattern.matchAsPrefix(parser.source, startMatchPos);
+    if (startMatch == null) return false;
 
-    return false;
+    // Write any existing plain text up to this point.
+    parser.writeText();
+
+    if (onMatch(parser, startMatch)) parser.consume(startMatch[0].length);
+    return true;
   }
 
   /// Processes [match], adding nodes to [parser] and possibly advancing
@@ -274,6 +275,128 @@
   }
 }
 
+/// Matches GFM extended autolinks like `http://foo.com` or `www.foo.com`.
+class AutolinkExtensionSyntax extends InlineSyntax {
+  // The autolink regex, broken into parts for reuse and readability.
+
+  // Autolinks can only come at the beginning of a line, after whitespace, or
+  // after one of the delimiting characters *, _, ~, (, and >.
+  static const start = r'(?:^|[\s*_~(>])';
+  // An extended URL autolink is recognized when one of the schemes http://,
+  // https://, or ftp:// (or a bare www. prefix) is followed by a valid domain.
+  static const scheme = r'(?:(?:https?|ftp):\/\/|www\.)';
+  // Per GFM, a valid domain consists of alphanumeric characters, underscores
+  // (_), hyphens (-) and periods (.). NOTE(review): the pattern below does not
+  // enforce the spec's "at least one period"/"no trailing underscores" rules.
+  static const domainPart = r'\w\-';
+  static const domain = '[$domainPart][$domainPart.]+';
+  // The path is every following character up to the next whitespace or <
+  // (it may be empty).
+  static const path = r'[^\s<]*';
+  // Trailing punctuation (specifically, ?, !, ., ,, :, *, _, and ~) will not
+  // be considered part of the autolink
+  static const truncatingPunctuationPositive = r'[?!.,:*_~]';
+
+  static final regExpTrailingPunc =
+      new RegExp('$truncatingPunctuationPositive*' + r'$');
+  static final regExpEndsWithColon = new RegExp(r'\&[a-zA-Z0-9]+;$');
+  static final regExpWhiteSpace = new RegExp(r'\s');
+
+  AutolinkExtensionSyntax() : super('$start(($scheme)($domain)($path))');
+  /// Ignores [startMatchPos]; matches from one character before `parser.pos`.
+  @override
+  bool tryMatch(InlineParser parser, [int startMatchPos]) {
+    return super.tryMatch(parser, parser.pos > 0 ? parser.pos - 1 : 0);
+  }
+
+  @override
+  bool onMatch(InlineParser parser, Match match) {
+    var url = match[1];
+    var href = url;
+    var matchLength = url.length;
+    // NOTE(review): match[1] excludes `start`, so this looks unreachable.
+    if (url[0] == '>' || url.startsWith(regExpWhiteSpace)) {
+      url = url.substring(1, url.length - 1);
+      href = href.substring(1, href.length - 1);
+      parser.pos++;
+      matchLength--;
+    }
+
+    // Prevent accidental standard autolink matches
+    if (url.endsWith('>') && parser.source[parser.pos - 1] == '<') {
+      return false;
+    }
+
+    // When an autolink ends in ), we scan the entire autolink for the total
+    // number of parentheses. If there is a greater number of closing
+    // parentheses than opening ones, we don’t consider the last character
+    // part of the autolink, in order to facilitate including an autolink
+    // inside a parenthesis:
+    // https://github.github.com/gfm/#example-600
+    if (url.endsWith(')')) {
+      final opening = _countChars(url, '(');
+      final closing = _countChars(url, ')');
+
+      if (closing > opening) {
+        url = url.substring(0, url.length - 1);
+        href = href.substring(0, href.length - 1);
+        matchLength--;
+      }
+    }
+
+    // Trailing punctuation (specifically, ?, !, ., ,, :, *, _, and ~) will
+    // not be considered part of the autolink, though they may be included
+    // in the interior of the link:
+    // https://github.github.com/gfm/#example-599
+    final trailingPunc = regExpTrailingPunc.firstMatch(url);
+    if (trailingPunc != null) {
+      url = url.substring(0, url.length - trailingPunc[0].length);
+      href = href.substring(0, href.length - trailingPunc[0].length);
+      matchLength -= trailingPunc[0].length;
+    }
+
+    // If an autolink ends in a semicolon (;), we check to see if it appears
+    // to resemble an
+    // [entity reference](https://github.github.com/gfm/#entity-references);
+    // if the preceding text is & followed by one or more alphanumeric
+    // characters. If so, it is excluded from the autolink:
+    // https://github.github.com/gfm/#example-602
+    if (url.endsWith(';')) {
+      final entityRef = regExpEndsWithColon.firstMatch(url);
+      if (entityRef != null) {
+        // Strip out HTML entity reference
+        url = url.substring(0, url.length - entityRef[0].length);
+        href = href.substring(0, href.length - entityRef[0].length);
+        matchLength -= entityRef[0].length;
+      }
+    }
+
+    // Links without a scheme (i.e. www. links) get http:// prepended.
+    if (!href.startsWith('http://') &&
+        !href.startsWith('https://') &&
+        !href.startsWith('ftp://')) {
+      href = 'http://$href';
+    }
+
+    final anchor = new Element.text('a', escapeHtml(url));
+    anchor.attributes['href'] = Uri.encodeFull(href);
+    parser.addNode(anchor);
+    // Consume manually and return false so the caller does not consume again.
+    parser.consume(matchLength);
+    return false;
+  }
+
+  int _countChars(String input, String char) {
+    var count = 0;
+
+    for (var i = 0; i < input.length; i++) {
+      if (input[i] == char) count++;
+    }
+
+    return count;
+  }
+}
+
 class _DelimiterRun {
   static final String punctuation = r'''!"#$%&'()*+,-./:;<=>?@[\]^_`{|}~''';
   // TODO(srawlins): Unicode whitespace
@@ -608,7 +731,7 @@
 
   CodeSyntax() : super(_pattern);
 
-  bool tryMatch(InlineParser parser) {
+  bool tryMatch(InlineParser parser, [int startMatchPos]) {
     if (parser.pos > 0 && parser.source[parser.pos - 1] == '`') {
       // Not really a match! We can't just sneak past one backtick to try the
       // next character. An example of this situation would be:

diff --git a/tool/gfm_stats.json b/tool/gfm_stats.json
index e1f9f43..88d8819 100644
--- a/tool/gfm_stats.json
+++ b/tool/gfm_stats.json

@@ -34,21 +34,21 @@
   "589": "strict",
   "590": "loose",
   "591": "loose",
-  "592": "loose",
+  "592": "fail",
   "593": "loose",
   "594": "loose",
-  "595": "strict",
+  "595": "fail",
   "596": "strict"
  },
  "Autolinks (extension)": {
-  "597": "fail",
-  "598": "fail",
-  "599": "fail",
-  "600": "fail",
-  "601": "fail",
-  "602": "fail",
-  "603": "fail",
-  "604": "fail",
+  "597": "strict",
+  "598": "strict",
+  "599": "strict",
+  "600": "loose",
+  "601": "strict",
+  "602": "loose",
+  "603": "strict",
+  "604": "strict",
   "605": "fail",
   "606": "fail",
   "607": "fail"

diff --git a/tool/gfm_stats.txt b/tool/gfm_stats.txt
index 68fd77e..11b8b83 100644
--- a/tool/gfm_stats.txt
+++ b/tool/gfm_stats.txt

@@ -1,6 +1,6 @@
   17 of   18 –  94.4%  ATX headings
-  19 of   19 – 100.0%  Autolinks
-   0 of   11 –   0.0%  Autolinks (extension)
+  17 of   19 –  89.5%  Autolinks
+   8 of   11 –  72.7%  Autolinks (extension)
    9 of   13 –  69.2%  Backslash escapes
    1 of    1 – 100.0%  Blank lines
   22 of   25 –  88.0%  Block quotes
@@ -28,4 +28,4 @@
   11 of   11 – 100.0%  Tabs
    3 of    3 – 100.0%  Textual content
   19 of   19 – 100.0%  Thematic breaks
- 571 of  647 –  88.3%  TOTAL
+ 577 of  647 –  89.2%  TOTAL
commit	c0c436106ed71c5ff254e9f27feb54f4e9bb025e	[log] [tgz]
author	Louis Orleans <louis@orleans.io>	Mon Jan 29 08:20:44 2018 -0800
committer	Sam Rawlins <sam.rawlins@gmail.com>	Mon Jan 29 08:20:44 2018 -0800
tree	31ea85043345081afefd81ee7d5183b053de403e
parent	827256db7ad34b420b8053351eab09977e6dc1a3 [diff]