Reduce time to parse a large Markdown document by about half (#274)
diff --git a/CHANGELOG.md b/CHANGELOG.md
index 096d301..7c4ab46 100644
--- a/CHANGELOG.md
+++ b/CHANGELOG.md
@@ -3,6 +3,8 @@
* Drop support for Dart 2.0.0 through 2.1.0.
* Recognize Unicode ellipsis (…) and other Unicode punctuation as punctuation
when parsing potential emphasis.
+* Reduce time to parse a large HTML-block-free Markdown document (such as that
+ in #271) by about half.
## 2.1.1
diff --git a/lib/src/block_parser.dart b/lib/src/block_parser.dart
index 73af3b2..eb9980b 100644
--- a/lib/src/block_parser.dart
+++ b/lib/src/block_parser.dart
@@ -136,13 +136,13 @@
/// Gets whether or not the current line matches the given pattern.
bool matches(RegExp regex) {
if (isDone) return false;
- return regex.firstMatch(current) != null;
+ return regex.hasMatch(current);
}
/// Gets whether or not the next line matches the given pattern.
bool matchesNext(RegExp regex) {
if (next == null) return false;
- return regex.firstMatch(next) != null;
+ return regex.hasMatch(next);
}
List<Node> parseLines() {
@@ -170,7 +170,7 @@
bool get canEndBlock => true;
bool canParse(BlockParser parser) {
- return pattern.firstMatch(parser.current) != null;
+ return pattern.hasMatch(parser.current);
}
Node parse(BlockParser parser);
@@ -519,10 +519,22 @@
'title|tr|track|ul)'
r'(?:\s|>|/>|$)');
+ /// The [_pattern] regular expression above is very expensive, even on
+ /// paragraphs of Markdown with no HTML. This regular expression can be used
+ /// first as a basic check that the input might be an HTML block tag; such
+ /// tags occur very rarely in typical Markdown.
+ static final _openBracketPattern = RegExp(r'^ {0,3}<');
+
RegExp get pattern => _pattern;
const BlockTagBlockHtmlSyntax();
+ @override
+ bool canParse(BlockParser parser) {
+ if (!_openBracketPattern.hasMatch(parser.current)) return false;
+ return super.canParse(parser);
+ }
+
Node parse(BlockParser parser) {
var childLines = <String>[];
@@ -633,7 +645,7 @@
var leadingSpace = _whitespaceRe.matchAsPrefix(parser.current).group(0);
var leadingExpandedTabLength = _expandedTabLength(leadingSpace);
if (tryMatch(_emptyPattern)) {
- if (_emptyPattern.firstMatch(parser.next ?? '') != null) {
+ if (_emptyPattern.hasMatch(parser.next ?? '')) {
// Two blank lines ends a list.
break;
}