Handle Unicode whitespace more gracefully. * Don't crash when trailing non-ASCII whitespace is trimmed. * Add tests for trailing whitespace trimming. * Add tests for other Unicode characters. Fix #901.

commit: 2b41b3f431fc07296790aa0fe22bb61d03dd8f0d [log] [tgz]
author: Robert Nystrom <rnystrom@google.com> Fri Aug 14 18:38:32 2020 -0700
committer: Robert Nystrom <rnystrom@google.com> Fri Aug 14 18:38:32 2020 -0700
tree: e85c2a9f2c3016851f883c114d161c90d0b43f43
parent: b7f9ab5d158c131c5b8c71817df13d93b1bee761 [diff]
diff --git a/lib/src/string_compare.dart b/lib/src/string_compare.dart
index 4306cb8..30af46e 100644
--- a/lib/src/string_compare.dart
+++ b/lib/src/string_compare.dart

@@ -1,8 +1,24 @@
-library dart_style.src.string_compare;
-
 /// Returns `true` if [c] represents a whitespace code unit allowed in Dart
 /// source code.
-bool _isWhitespace(int c) => (c <= 0x000D && c >= 0x0009) || c == 0x0020;
+///
+/// This follows the same rules as `String.trim()` because that's what dartfmt
+/// uses to trim trailing whitespace.
+bool _isWhitespace(int c) {
+  // Not using a set or something more elegant because this code is on the hot
+  // path and this large expression is significantly faster than a set lookup.
+  return c >= 0x0009 && c <= 0x000d || // Control characters TAB..CR.
+      c == 0x0020 || // SPACE.
+      c == 0x0085 || // NEXT LINE (NEL) control character.
+      c == 0x00a0 || // NO-BREAK SPACE.
+      c == 0x1680 || // OGHAM SPACE MARK.
+      c >= 0x2000 && c <= 0x200a || // EN QUAD..HAIR SPACE.
+      c == 0x2028 || // LINE SEPARATOR.
+      c == 0x2029 || // PARAGRAPH SEPARATOR.
+      c == 0x202f || // NARROW NO-BREAK SPACE.
+      c == 0x205f || // MEDIUM MATHEMATICAL SPACE.
+      c == 0x3000 || // IDEOGRAPHIC SPACE.
+      c == 0xfeff; // ZERO WIDTH NO-BREAK SPACE.
+}
 
 /// Returns the index of the next non-whitespace character.
 ///

diff --git a/test/command_line_test.dart b/test/command_line_test.dart
index 7284e50..d328374 100644
--- a/test/command_line_test.dart
+++ b/test/command_line_test.dart

@@ -122,7 +122,8 @@
           emits('Usage:   dartfmt [options...] [files or directories...]'));
       await expectLater(process.stdout, emitsThrough(contains('--overwrite')));
       await expectLater(process.stdout, emitsThrough(contains('--fix')));
-      await expectLater(process.stdout, neverEmits(contains('--set-exit-if-changed')));
+      await expectLater(
+          process.stdout, neverEmits(contains('--set-exit-if-changed')));
       await process.shouldExit(0);
     });
 
@@ -135,7 +136,8 @@
           emits('Usage:   dartfmt [options...] [files or directories...]'));
       await expectLater(process.stdout, emitsThrough(contains('--overwrite')));
       await expectLater(process.stdout, emitsThrough(contains('--fix')));
-      await expectLater(process.stdout, emitsThrough(contains('--set-exit-if-changed')));
+      await expectLater(
+          process.stdout, emitsThrough(contains('--set-exit-if-changed')));
       await process.shouldExit(0);
     });
   });

diff --git a/test/string_compare_test.dart b/test/string_compare_test.dart
index 226b421..3adaa71 100644
--- a/test/string_compare_test.dart
+++ b/test/string_compare_test.dart

@@ -39,26 +39,26 @@
   });
 
   test('test unicode whitespace characters', () {
-    // Dart sources only allow ascii whitespace code points so we
-    // should not consider the following strings equal.
+    // The formatter strips all Unicode whitespace characters from the end of
+    // comment lines, so treat those as whitespace too.
     var whitespaceRunes = [
-      0x00A0,
-      0x1680,
-      0x180E,
+      0x0020,
+      0x0085,
+      0x00a0,
       0x2000,
-      0x200A,
+      0x200a,
       0x2028,
       0x2029,
-      0x202F,
-      0x205F,
+      0x202f,
+      0x205f,
       0x3000,
-      0xFEFF
+      0xfeff
     ];
     for (var rune in whitespaceRunes) {
       expect(
           equalIgnoringWhitespace(
               'foo${String.fromCharCode(rune)}bar', 'foo    bar'),
-          isFalse);
+          isTrue);
     }
   });
 

diff --git a/test/utils.dart b/test/utils.dart
index e8437ac..2d1ab06 100644
--- a/test/utils.dart
+++ b/test/utils.dart

@@ -24,6 +24,7 @@
 
 final _indentPattern = RegExp(r'\(indent (\d+)\)');
 final _fixPattern = RegExp(r'\(fix ([a-x-]+)\)');
+final _unicodePattern = RegExp(r'×([0-9a-fA-F]{2,4})');
 
 /// If tool/command_shell.dart has been compiled to a snapshot, this is the path
 /// to it.
@@ -226,6 +227,10 @@
         expectedOutput += lines[i] + '\n';
       }
 
+      // Unescape special Unicode escape markers.
+      input = _unescapeUnicode(input);
+      expectedOutput = _unescapeUnicode(expectedOutput);
+
       // TODO(rnystrom): Stop skipping these tests when possible.
       if (description.contains('(skip:')) {
         print('skipping $description');
@@ -240,6 +245,7 @@
 
         var expected = _extractSelection(expectedOutput,
             isCompilationUnit: isCompilationUnit);
+        var expectedText = expected.text;
 
         var formatter = DartFormatter(
             pageWidth: pageWidth, indent: leadingIndent, fixes: fixes);
@@ -254,9 +260,9 @@
 
         // Fail with an explicit message because it's easier to read than
         // the matcher output.
-        if (actualText != expected.text) {
+        if (actualText != expectedText) {
           fail('Formatting did not match expectation. Expected:\n'
-              '${expected.text}\nActual:\n$actualText');
+              '$expectedText\nActual:\n$actualText');
         }
 
         expect(actual.selectionStart, equals(expected.selectionStart));
@@ -281,3 +287,15 @@
       selectionStart: start == -1 ? null : start,
       selectionLength: end == -1 ? null : end - start);
 }
+
+/// Turns the `×` escape markers (`×` followed by two to four hex digits) used
+/// in the tests into the real Unicode characters they name.
+///
+/// This does not use Dart's own string escape sequences so that we don't
+/// accidentally modify the Dart code being formatted.
+String _unescapeUnicode(String input) {
+  return input.replaceAllMapped(_unicodePattern, (match) {
+    var codePoint = int.parse(match[1], radix: 16);
+    return String.fromCharCode(codePoint);
+  });
+}

diff --git a/test/whitespace/trailing.unit b/test/whitespace/trailing.unit
new file mode 100644
index 0000000..79dbeff
--- /dev/null
+++ b/test/whitespace/trailing.unit

@@ -0,0 +1,32 @@
+40 columns                              |
+>>> remove after line comment
+// trailing spaces after here:×20×20×20×20
+<<<
+// trailing spaces after here:
+>>> remove from empty line comment
+//×20×20×20
+<<<
+//
+>>> keep inside block comment lines
+/* one×20×20
+   two×20
+×20×20×20
+   three×20×20×20×20
+*/×20×20
+<<<
+/* one×20×20
+   two×20
+×20×20×20
+   three×20×20×20×20
+*/
+>>> after code
+main() {×20×20
+×20
+  veryLongExpression    +×20×20×20
+  veryLongStatement;×20×20
+}×20×20×20×20
+<<<
+main() {
+  veryLongExpression +
+      veryLongStatement;
+}
\ No newline at end of file

diff --git a/test/whitespace/unicode.unit b/test/whitespace/unicode.unit
new file mode 100644
index 0000000..b40018e
--- /dev/null
+++ b/test/whitespace/unicode.unit

@@ -0,0 +1,65 @@
+40 columns                              |
+>>> preserve unicode whitespace inside comments from trim from the end
+// control middle: ×09 end: ×09 ×09
+// control middle: ×0b end: ×0b ×0b
+// space middle: ×20 end: ×20 ×20
+// control middle: ×85 end: ×85 ×85
+<<<
+// control middle: ×09 end:
+// control middle: ×0b end:
+// space middle: ×20 end:
+// control middle: ×85 end:
+>>> preserve unicode whitespace inside comments from trim from the end
+// no-break space middle: ×a0 end: ×a0 ×a0
+// ogham space mark middle: ×1680 end: ×1680 ×1680
+// en quad middle: ×2000 end: ×2000 ×2000
+// em quad middle: ×2001 end: ×2001 ×2001
+// en space middle: ×2002 end: ×2002 ×2002
+// em space middle: ×2003 end: ×2003 ×2003
+<<<
+// no-break space middle: ×a0 end:
+// ogham space mark middle: ×1680 end:
+// en quad middle: ×2000 end:
+// em quad middle: ×2001 end:
+// en space middle: ×2002 end:
+// em space middle: ×2003 end:
+>>>
+// three-per-em space middle: ×2004 end: ×2004 ×2004
+// four-per-em space middle: ×2005 end: ×2005 ×2005
+// six-per-em space middle: ×2006 end: ×2006 ×2006
+// figure space middle: ×2007 end: ×2007 ×2007
+// punctuation space middle: ×2008 end: ×2008 ×2008
+// thin space middle: ×2009 end: ×2009 ×2009
+// hair space middle: ×200a end: ×200a ×200a
+<<<
+// three-per-em space middle: ×2004 end:
+// four-per-em space middle: ×2005 end:
+// six-per-em space middle: ×2006 end:
+// figure space middle: ×2007 end:
+// punctuation space middle: ×2008 end:
+// thin space middle: ×2009 end:
+// hair space middle: ×200a end:
+>>>
+// line separator middle: ×2028 end: ×2028 ×2028
+// paragraph separator middle: ×2029 end: ×2029 ×2029
+// narrow no-break space middle: ×202f end: ×202f ×202f
+// medium mathematical space middle: ×205f end: ×205f ×205f
+// ideographic space middle: ×3000 end: ×3000 ×3000
+// zero width no-break space middle: ×feff end: ×feff ×feff
+<<<
+// line separator middle: ×2028 end:
+// paragraph separator middle: ×2029 end:
+// narrow no-break space middle: ×202f end:
+// medium mathematical space middle: ×205f end:
+// ideographic space middle: ×3000 end:
+// zero width no-break space middle: ×feff end:
+>>> unicode line endings
+// line feed middle: ×0a // end: ×0a ×0a
+// form feed middle: ×0c // end: ×0c ×0c
+// carriage return middle: ×0d // end: ×0d ×0d
+<<<
+// line feed middle:
+// end:
+
+// form feed middle: ×0c // end:
+// carriage return middle: // end:
commit	2b41b3f431fc07296790aa0fe22bb61d03dd8f0d	[log] [tgz]
author	Robert Nystrom <rnystrom@google.com>	Fri Aug 14 18:38:32 2020 -0700
committer	Robert Nystrom <rnystrom@google.com>	Fri Aug 14 18:38:32 2020 -0700
tree	e85c2a9f2c3016851f883c114d161c90d0b43f43
parent	b7f9ab5d158c131c5b8c71817df13d93b1bee761 [diff]