Add startCharacter, reducing parsing time by another 17%; bump to 2.1.2 (#276)

commit: 494861631afcabefce43329b248df1d6b2c0e816 [log] [tgz]
author: Sam Rawlins <srawlins@google.com> Fri Nov 15 14:33:15 2019 -0800
committer: GitHub <noreply@github.com> Fri Nov 15 14:33:15 2019 -0800
tree: 61a8dab5e497a0e495d86eb695c826f250ccdaa7
parent: c32c3fc30c237e76926f594091f2cb2ca74ac1f9 [diff]
diff --git a/CHANGELOG.md b/CHANGELOG.md
index 7c4ab46..97740d3 100644
--- a/CHANGELOG.md
+++ b/CHANGELOG.md

@@ -1,10 +1,13 @@
-## 2.1.2-dev
+## 2.1.2
 
 * Drop support for Dart 2.0.0 through 2.1.0.
 * Recognize Unicode ellipsis (…) and other Unicode punctuation as punctuation
   when parsing potential emphasis.
 * Reduce time to parse a large HTML-block-free Markdown document (such as that
-  in #271) by about half.
+  in #271) by more than half.
+* Add a new optional parameter to the `InlineSyntax` constructor,
+  `startCharacter`, with which a subclass can specify a single character to
+  check for before matching with more expensive regular expressions.
 
 ## 2.1.1
 

diff --git a/lib/src/inline_parser.dart b/lib/src/inline_parser.dart
index 0b1a25e..ed4513f 100644
--- a/lib/src/inline_parser.dart
+++ b/lib/src/inline_parser.dart

@@ -22,9 +22,9 @@
     // Allow any punctuation to be escaped.
     EscapeSyntax(),
     // "*" surrounded by spaces is left alone.
-    TextSyntax(r' \* '),
+    TextSyntax(r' \* ', startCharacter: $space),
     // "_" surrounded by spaces is left alone.
-    TextSyntax(r' _ '),
+    TextSyntax(r' _ ', startCharacter: $space),
     // Parse "**strong**" and "*emphasis*" tags.
     TagSyntax(r'\*+', requiresDelimiterRun: true),
     // Parse "__strong__" and "_emphasis_" tags.
@@ -37,13 +37,13 @@
       List<InlineSyntax>.unmodifiable(<InlineSyntax>[
     // Leave already-encoded HTML entities alone. Ensures we don't turn
     // "&amp;" into "&amp;amp;"
-    TextSyntax(r'&[#a-zA-Z0-9]*;'),
+    TextSyntax(r'&[#a-zA-Z0-9]*;', startCharacter: $ampersand),
     // Encode "&".
-    TextSyntax(r'&', sub: '&amp;'),
+    TextSyntax(r'&', sub: '&amp;', startCharacter: $ampersand),
     // Encode "<".
-    TextSyntax(r'<', sub: '&lt;'),
+    TextSyntax(r'<', sub: '&lt;', startCharacter: $lt),
     // Encode ">".
-    TextSyntax(r'>', sub: '&gt;'),
+    TextSyntax(r'>', sub: '&gt;', startCharacter: $gt),
     // We will add the LinkSyntax once we know about the specific link resolver.
   ]);
 
@@ -164,7 +164,17 @@
 abstract class InlineSyntax {
   final RegExp pattern;
 
-  InlineSyntax(String pattern) : pattern = RegExp(pattern, multiLine: true);
+  /// The code unit of the first character of [pattern], used as a cheap first
+  /// check that this syntax can match at the current parser position.
+  final int _startCharacter;
+
+  /// Create a new [InlineSyntax] which matches text on [pattern].
+  ///
+  /// If [startCharacter] is passed, it is used as a pre-matching check which
+  /// is faster than matching against [pattern].
+  InlineSyntax(String pattern, {int startCharacter})
+      : pattern = RegExp(pattern, multiLine: true),
+        _startCharacter = startCharacter;
 
   /// Tries to match at the parser's current position.
   ///
@@ -173,6 +183,14 @@
   bool tryMatch(InlineParser parser, [int startMatchPos]) {
     if (startMatchPos == null) startMatchPos = parser.pos;
 
+    // Before matching with the regular expression [pattern], which can be
+    // expensive on some platforms, check if even the first character matches
+    // this syntax.
+    if (_startCharacter != null &&
+        parser.source.codeUnitAt(startMatchPos) != _startCharacter) {
+      return false;
+    }
+
     final startMatch = pattern.matchAsPrefix(parser.source, startMatchPos);
     if (startMatch == null) return false;
 
@@ -205,9 +223,14 @@
 class TextSyntax extends InlineSyntax {
   final String substitute;
 
-  TextSyntax(String pattern, {String sub})
+  /// Create a new [TextSyntax] which matches text on [pattern].
+  ///
+  /// If [sub] is passed, it is used as a simple replacement for [pattern]. If
+  /// [startCharacter] is passed, it is used as a pre-matching check which is
+  /// faster than matching against [pattern].
+  TextSyntax(String pattern, {String sub, int startCharacter})
       : substitute = sub,
-        super(pattern);
+        super(pattern, startCharacter: startCharacter);
 
   /// Adds a [Text] node to [parser] and returns `true` if there is a
   /// [substitute], as long as the preceding character (if any) is not a `/`.
@@ -262,7 +285,9 @@
 /// TODO(srawlins): improve accuracy while ensuring performance, once
 /// Markdown benchmarking is more mature.
 class InlineHtmlSyntax extends TextSyntax {
-  InlineHtmlSyntax() : super(r'<[/!?]?[A-Za-z][A-Za-z0-9-]*(?:\s[^>]*)?>');
+  InlineHtmlSyntax()
+      : super(r'<[/!?]?[A-Za-z][A-Za-z0-9-]*(?:\s[^>]*)?>',
+            startCharacter: $lt);
 }
 
 /// Matches autolinks like `<foo@bar.example.com>`.
@@ -273,7 +298,7 @@
       r'''[a-zA-Z0-9.!#$%&'*+/=?^_`{|}~-]+@[a-zA-Z0-9](?:[a-zA-Z0-9-]{0,61}'''
       r'''[a-zA-Z0-9])?(?:\.[a-zA-Z0-9](?:[a-zA-Z0-9-]{0,61}[a-zA-Z0-9])?)*''';
 
-  EmailAutolinkSyntax() : super('<($_email)>');
+  EmailAutolinkSyntax() : super('<($_email)>', startCharacter: $lt);
 
   bool onMatch(InlineParser parser, Match match) {
     var url = match[1];
@@ -551,9 +576,17 @@
   /// [emphasis delimiters]: http://spec.commonmark.org/0.28/#can-open-emphasis
   final bool requiresDelimiterRun;
 
-  TagSyntax(String pattern, {String end, this.requiresDelimiterRun = false})
+  /// Create a new [TagSyntax] which matches text on [pattern].
+  ///
+  /// If [end] is passed, it is used as the pattern which denotes the end of
+  /// matching text. Otherwise, [pattern] is used. If [requiresDelimiterRun] is
+  /// `true`, this syntax parses according to the same nesting rules as
+  /// emphasis delimiters. If [startCharacter] is passed, it is used as a
+  /// pre-matching check which is faster than matching against [pattern].
+  TagSyntax(String pattern,
+      {String end, this.requiresDelimiterRun = false, int startCharacter})
       : endPattern = RegExp((end != null) ? end : pattern, multiLine: true),
-        super(pattern);
+        super(pattern, startCharacter: startCharacter);
 
   bool onMatch(InlineParser parser, Match match) {
     var runLength = match.group(0).length;
@@ -638,9 +671,12 @@
 
   final Resolver linkResolver;
 
-  LinkSyntax({Resolver linkResolver, String pattern = r'\['})
+  LinkSyntax(
+      {Resolver linkResolver,
+      String pattern = r'\[',
+      int startCharacter = $lbracket})
       : this.linkResolver = (linkResolver ?? (String _, [String __]) => null),
-        super(pattern, end: r'\]');
+        super(pattern, end: r'\]', startCharacter: startCharacter);
 
   // The pending [TagState]s, all together, are "active" or "inactive" based on
   // whether a link element has just been parsed.
@@ -1053,7 +1089,10 @@
 /// `![alternate text][label]`.
 class ImageSyntax extends LinkSyntax {
   ImageSyntax({Resolver linkResolver})
-      : super(linkResolver: linkResolver, pattern: r'!\[');
+      : super(
+            linkResolver: linkResolver,
+            pattern: r'!\[',
+            startCharacter: $exclamation);
 
   Node _createNode(TagState state, String destination, String title) {
     var element = Element.empty('img');

diff --git a/lib/src/version.dart b/lib/src/version.dart
index 7a59bfe..5e5a029 100644
--- a/lib/src/version.dart
+++ b/lib/src/version.dart

@@ -1,2 +1,2 @@
 // Generated code. Do not modify.
-const packageVersion = '2.1.2-dev';
+const packageVersion = '2.1.2';

diff --git a/pubspec.yaml b/pubspec.yaml
index 396387a..39bef73 100644
--- a/pubspec.yaml
+++ b/pubspec.yaml

@@ -1,5 +1,5 @@
 name: markdown
-version: 2.1.2-dev
+version: 2.1.2
 
 description: A library for converting markdown to HTML.
 author: Dart Team <misc@dartlang.org>
commit	494861631afcabefce43329b248df1d6b2c0e816	[log] [tgz]
author	Sam Rawlins <srawlins@google.com>	Fri Nov 15 14:33:15 2019 -0800
committer	GitHub <noreply@github.com>	Fri Nov 15 14:33:15 2019 -0800
tree	61a8dab5e497a0e495d86eb695c826f250ccdaa7
parent	c32c3fc30c237e76926f594091f2cb2ca74ac1f9 [diff]