Reduce time to parse a large Markdown document by about half (#274)

commit: c32c3fc30c237e76926f594091f2cb2ca74ac1f9 [log] [tgz]
author: Sam Rawlins <srawlins@google.com> Thu Nov 14 19:15:58 2019 -0800
committer: GitHub <noreply@github.com> Thu Nov 14 19:15:58 2019 -0800
tree: ebca9734c71c68438372b65d20437dd99ec731bd
parent: c3c15a015f2d4ee75e6b07c430f3733cf6decacb [diff]
diff --git a/CHANGELOG.md b/CHANGELOG.md
index 096d301..7c4ab46 100644
--- a/CHANGELOG.md
+++ b/CHANGELOG.md

@@ -3,6 +3,8 @@
 * Drop support for Dart 2.0.0 through 2.1.0.
 * Recognize Unicode ellipsis (…) and other Unicode punctuation as punctuation
   when parsing potential emphasis.
+* Reduce time to parse a large HTML-block-free Markdown document (such as that
+  in #271) by about half.
 
 ## 2.1.1
 

diff --git a/lib/src/block_parser.dart b/lib/src/block_parser.dart
index 73af3b2..eb9980b 100644
--- a/lib/src/block_parser.dart
+++ b/lib/src/block_parser.dart

@@ -136,13 +136,13 @@
   /// Gets whether or not the current line matches the given pattern.
   bool matches(RegExp regex) {
     if (isDone) return false;
-    return regex.firstMatch(current) != null;
+    return regex.hasMatch(current);
   }
 
   /// Gets whether or not the next line matches the given pattern.
   bool matchesNext(RegExp regex) {
     if (next == null) return false;
-    return regex.firstMatch(next) != null;
+    return regex.hasMatch(next);
   }
 
   List<Node> parseLines() {
@@ -170,7 +170,7 @@
   bool get canEndBlock => true;
 
   bool canParse(BlockParser parser) {
-    return pattern.firstMatch(parser.current) != null;
+    return pattern.hasMatch(parser.current);
   }
 
   Node parse(BlockParser parser);
@@ -519,10 +519,22 @@
       'title|tr|track|ul)'
       r'(?:\s|>|/>|$)');
 
+  /// The [_pattern] regular expression above is very expensive, even on
+  /// paragraphs of Markdown with no HTML. This regular expression can be used
+  /// first as a basic check that the input might possibly be an HTML block
+  /// tag, which occurs very rarely in typical Markdown.
+  static final _openBracketPattern = RegExp(r'^ {0,3}<');
+
   RegExp get pattern => _pattern;
 
   const BlockTagBlockHtmlSyntax();
 
+  @override
+  bool canParse(BlockParser parser) {
+    if (!_openBracketPattern.hasMatch(parser.current)) return false;
+    return super.canParse(parser);
+  }
+
   Node parse(BlockParser parser) {
     var childLines = <String>[];
 
@@ -633,7 +645,7 @@
       var leadingSpace = _whitespaceRe.matchAsPrefix(parser.current).group(0);
       var leadingExpandedTabLength = _expandedTabLength(leadingSpace);
       if (tryMatch(_emptyPattern)) {
-        if (_emptyPattern.firstMatch(parser.next ?? '') != null) {
+        if (_emptyPattern.hasMatch(parser.next ?? '')) {
           // Two blank lines ends a list.
           break;
         }
commit	c32c3fc30c237e76926f594091f2cb2ca74ac1f9	[log] [tgz]
author	Sam Rawlins <srawlins@google.com>	Thu Nov 14 19:15:58 2019 -0800
committer	GitHub <noreply@github.com>	Thu Nov 14 19:15:58 2019 -0800
tree	ebca9734c71c68438372b65d20437dd99ec731bd
parent	c3c15a015f2d4ee75e6b07c430f3733cf6decacb [diff]