Add string comparators that can compare ASCII strings ignoring case.

Also add comparators that compare digit sequences lexicographically
(which is the same as comparing numerically if there are no leading
zeros).
The comparison functions define total orderings, breaking ties for
strings that only differ by case by their first case-different letter,
upper-case being less than lower-case.

R=floitsch@google.com, rnystrom@google.com, sra@google.com

Review URL: https://codereview.chromium.org//1412293008.
diff --git a/CHANGELOG.md b/CHANGELOG.md
index ad50fa1..a39a2b5 100644
--- a/CHANGELOG.md
+++ b/CHANGELOG.md
@@ -1,3 +1,7 @@
+## 1.2.0
+
+* Add string comparators that ignore ASCII case and sort numbers numerically.
+
 ## 1.1.3
 
 * Fix type inconsistencies with `Map` and `Set`.
diff --git a/lib/collection.dart b/lib/collection.dart
index 45d3867..6d451b8 100644
--- a/lib/collection.dart
+++ b/lib/collection.dart
@@ -22,5 +22,6 @@
 export "iterable_zip.dart";
 export "priority_queue.dart";
 export "src/canonicalized_map.dart";
+export "src/comparators.dart";
 export "src/queue_list.dart";
 export "wrappers.dart";
diff --git a/lib/src/comparators.dart b/lib/src/comparators.dart
new file mode 100644
index 0000000..19b70e9
--- /dev/null
+++ b/lib/src/comparators.dart
@@ -0,0 +1,399 @@
+// Copyright (c) 2015, the Dart project authors.  Please see the AUTHORS file

+// for details. All rights reserved. Use of this source code is governed by a

+// BSD-style license that can be found in the LICENSE file.

+

+library dart.pkg.collection.comparators;

+

+// Character constants.

+const int _zero         = 0x30;

+const int _upperCaseA   = 0x41;

+const int _upperCaseZ   = 0x5b;

+const int _lowerCaseA   = 0x61;

+const int _lowerCaseZ   = 0x7b;

+const int _asciiCaseBit = 0x20;

+

+/// Checks if strings [a] and [b] differ only on the case of ASCII letters.

+///

+/// Strings are equal if they have the same length, and the characters at

+/// each index are the same, or they are ASCII letters where one is upper-case

+/// and the other is the lower-case version of the same letter.

+///

+/// The comparison does not ignore the case of non-ASCII letters, so

+/// an upper-case ae-ligature (Æ) is different from

+/// a lower case ae-ligature (æ).

+///

+/// Ignoring non-ASCII letters is not generally a good idea, but it makes sense

+/// for situations where the strings are known to be ASCII. Examples could

+/// be Dart identifiers, base-64 or hex encoded strings, GUIDs or similar

+/// strings with a known structure.

+bool equalsIgnoreAsciiCase(String a, String b) {

+  if (a.length != b.length) return false;

+  for (int i = 0; i < a.length; i++) {

+    var aChar = a.codeUnitAt(i);

+    var bChar = b.codeUnitAt(i);

+    if (aChar == bChar) continue;

+    // Quick-check for whether this may be different cases of the same letter.

+    if (aChar ^ bChar != _asciiCaseBit) return false;

+    // If it's possible, then check if either character is actually an ASCII

+    // letter.

+    int aCharUpperCase = aChar | _asciiCaseBit;

+    if (_upperCaseA <= aCharUpperCase && aCharUpperCase <= _upperCaseZ) {

+      continue;

+    }

+    return false;

+  }

+  return true;

+}

+

+

+/// Hash code for a string which is compatible with [equalsIgnoreAsciiCase].

+///

+/// The hash code is unaffected by changing the case of ASCII letters, but

+/// the case of non-ASCII letters do affect the result.

+int hashIgnoreAsciiCase(String string) {

+  // Jenkins hash code ( http://en.wikipedia.org/wiki/Jenkins_hash_function).

+  // adapted to smi values.

+  // Same hash used by dart2js for strings, modified to ignore ASCII letter

+  // case.

+  int hash = 0;

+  for (int i = 0; i < string.length; i++) {

+    int char = string.codeUnitAt(i);

+    // Convert lower-case ASCII letters to upper case.upper

+    // This ensures that strings that differ only in case will have the

+    // same hash code.

+    if (_lowerCaseA <= char && char <= _lowerCaseZ) char -= _asciiCaseBit;

+    hash = 0x1fffffff & (hash + char);

+    hash = 0x1fffffff & (hash + ((0x0007ffff & hash) << 10));

+    hash >>= 6;

+  }

+  hash = 0x1fffffff & (hash + ((0x03ffffff & hash) << 3));

+  hash >>= 11;

+  return 0x1fffffff & (hash + ((0x00003fff & hash) << 15));

+}

+

+

+/// Compares [a] and [b] lexically, converting ASCII letters to upper case.

+///

+/// Comparison treats all lower-case ASCII letters as upper-case letters,

+/// but does no case conversion for non-ASCII letters.

+///

+/// If two strings differ only on the case of ASCII letters, the one with the

+/// capital letter at the first difference will compare as less than the other

+/// string. This tie-breaking ensures that the comparison is a total ordering

+/// on strings and is compatible with equality.

+///

+/// Ignoring non-ASCII letters is not generally a good idea, but it makes sense

+/// for situations where the strings are known to be ASCII. Examples could

+/// be Dart identifiers, base-64 or hex encoded strings, GUIDs or similar

+/// strings with a known structure.

+int compareAsciiUpperCase(String a, String b) {

+  int defaultResult = 0; // Returned if no difference found.

+  for (int i = 0; i < a.length; i++) {

+    if (i >= b.length) return 1;

+    var aChar = a.codeUnitAt(i);

+    var bChar = b.codeUnitAt(i);

+    if (aChar == bChar) continue;

+    // Upper-case if letters.

+    int aUpperCase = aChar;

+    int bUpperCase = bChar;

+    if (_lowerCaseA <= aChar && aChar <= _lowerCaseZ) {

+      aUpperCase -= _asciiCaseBit;

+    }

+    if (_lowerCaseA <= bChar && bChar <= _lowerCaseZ) {

+      bUpperCase -= _asciiCaseBit;

+    }

+    if (aUpperCase != bUpperCase) return (aUpperCase - bUpperCase).sign;

+    if (defaultResult == 0) defaultResult = (aChar - bChar);

+  }

+  if (b.length > a.length) return -1;

+  return defaultResult.sign;

+}

+

+

+/// Compares [a] and [b] lexically, converting ASCII letters to lower case.

+///

+/// Comparison treats all upper-case ASCII letters as lower-case letters,

+/// but does no case conversion for non-ASCII letters.

+///

+/// If two strings differ only on the case of ASCII letters, the one with the

+/// capital letter at the first difference will compare as less than the other

+/// string. This tie-breaking ensures that the comparison is a total ordering

+/// on strings.

+///

+/// Ignoring non-ASCII letters is not generally a good idea, but it makes sense

+/// for situations where the strings are known to be ASCII. Examples could

+/// be Dart identifiers, base-64 or hex encoded strings, GUIDs or similar

+/// strings with a known structure.

+int compareAsciiLowerCase(String a, String b) {

+  int defaultResult = 0;

+  for (int i = 0; i < a.length; i++) {

+    if (i >= b.length) return 1;

+    var aChar = a.codeUnitAt(i);

+    var bChar = b.codeUnitAt(i);

+    if (aChar == bChar) continue;

+    int aLowerCase = aChar;

+    int bLowerCase = bChar;

+    // Upper case if ASCII letters.

+    if (_upperCaseA <= bChar && bChar <= _upperCaseZ) {

+      bLowerCase += _asciiCaseBit;

+    }

+    if (_upperCaseA <= aChar && aChar <= _upperCaseZ) {

+      aLowerCase += _asciiCaseBit;

+    }

+    if (aLowerCase != bLowerCase) return (aLowerCase - bLowerCase).sign;

+    if (defaultResult == 0) defaultResult = aChar - bChar;

+  }

+  if (b.length > a.length) return -1;

+  return defaultResult.sign;

+}

+

+/// Compares strings [a] and [b] according to [natural sort ordering].

+///

+/// A natural sort ordering is a lexical ordering where embedded

+/// numerals (digit sequences) are treated as a single unit and ordered by

+/// numerical value.

+/// This means that `"a10b"` will be ordered after `"a7b"` in natural

+/// ordering, where lexical ordering would put the `1` before the `7`, ignoring

+/// that the `1` is part of a larger number.

+///

+/// Example:

+/// The following strings are in the order they would be sorted by using this

+/// comparison function:

+///

+///     "a", "a0", "a0b", "a1", "a01", "a9", "a10", "a100", "a100b", "aa"

+///

+/// [natural sort ordering]: https://en.wikipedia.org/wiki/Natural_sort_order

+int compareNatural(String a, String b) {

+  for (int i = 0; i < a.length; i++) {

+    if (i >= b.length) return 1;

+    var aChar = a.codeUnitAt(i);

+    var bChar = b.codeUnitAt(i);

+    if (aChar != bChar) {

+      return _compareNaturally(a, b, i, aChar, bChar);

+    }

+  }

+  if (b.length > a.length) return -1;

+  return 0;

+}

+

+/// Compares strings [a] and [b] according to lower-case

+/// [natural sort ordering].

+///

+/// ASCII letters are converted to lower case before being compared, like

+/// for [compareAsciiLowerCase], then the result is compared like for

+/// [compareNatural].

+///

+/// If two strings differ only on the case of ASCII letters, the one with the

+/// capital letter at the first difference will compare as less than the other

+/// string. This tie-breaking ensures that the comparison is a total ordering

+/// on strings.

+///

+/// [natural sort ordering]: https://en.wikipedia.org/wiki/Natural_sort_order

+int compareAsciiLowerCaseNatural(String a, String b) {

+  int defaultResult = 0; // Returned if no difference found.

+  for (int i = 0; i < a.length; i++) {

+    if (i >= b.length) return 1;

+    var aChar = a.codeUnitAt(i);

+    var bChar = b.codeUnitAt(i);

+    if (aChar == bChar) continue;

+    int aLowerCase = aChar;

+    int bLowerCase = bChar;

+    if (_upperCaseA <= aChar && aChar <= _upperCaseZ) {

+      aLowerCase += _asciiCaseBit;

+    }

+    if (_upperCaseA <= bChar && bChar <= _upperCaseZ) {

+      bLowerCase += _asciiCaseBit;

+    }

+    if (aLowerCase != bLowerCase) {

+      return _compareNaturally(a, b, i, aLowerCase, bLowerCase);

+    }

+    if (defaultResult == 0) defaultResult = aChar - bChar;

+  }

+  if (b.length > a.length) return -1;

+  return defaultResult.sign;

+}

+

+/// Compares strings [a] and [b] according to upper-case

+/// [natural sort ordering].

+///

+/// ASCII letters are converted to upper case before being compared, like

+/// for [compareAsciiUpperCase], then the result is compared like for

+/// [compareNatural].

+///

+/// If two strings differ only on the case of ASCII letters, the one with the

+/// capital letter at the first difference will compare as less than the other

+/// string. This tie-breaking ensures that the comparison is a total ordering

+/// on strings

+///

+/// [natural sort ordering]: https://en.wikipedia.org/wiki/Natural_sort_order

+int compareAsciiUpperCaseNatural(String a, String b) {

+  int defaultResult = 0;

+  for (int i = 0; i < a.length; i++) {

+    if (i >= b.length) return 1;

+    var aChar = a.codeUnitAt(i);

+    var bChar = b.codeUnitAt(i);

+    if (aChar == bChar) continue;

+    int aUpperCase = aChar;

+    int bUpperCase = bChar;

+    if (_lowerCaseA <= aChar && aChar <= _lowerCaseZ) {

+      aUpperCase -= _asciiCaseBit;

+    }

+    if (_lowerCaseA <= bChar && bChar <= _lowerCaseZ) {

+      bUpperCase -= _asciiCaseBit;

+    }

+    if (aUpperCase != bUpperCase) {

+      return _compareNaturally(a, b, i, aUpperCase, bUpperCase);

+    }

+    if (defaultResult == 0) defaultResult = aChar - bChar;

+  }

+  if (b.length > a.length) return -1;

+  return defaultResult.sign;

+}

+

+/// Check for numbers overlapping the current mismatched characters.

+///

+/// If both [aChar] and [bChar] are digits, use numerical comparison.

+/// Check if the previous characters is a non-zero number, and if not,

+/// skip - but count - leading zeros before comparing numbers.

+///

+/// If one is a digit and the other isn't, check if the previous character

+/// is a digit, and if so, the the one with the digit is the greater number.

+///

+/// Otherwise just returns the difference between [aChar] and [bChar].

+int _compareNaturally(

+    String a, String b, int index, int aChar, int bChar) {

+  assert(aChar != bChar);

+  var aIsDigit = _isDigit(aChar);

+  var bIsDigit = _isDigit(bChar);

+  if (aIsDigit) {

+    if (bIsDigit) {

+      return _compareNumerically(a, b, aChar, bChar, index);

+    } else if (index > 0 && _isDigit(a.codeUnitAt(index - 1))) {

+      // aChar is the continuation of a longer number.

+      return 1;

+    }

+  } else if (bIsDigit && index > 0 && _isDigit(b.codeUnitAt(index - 1))) {

+    // bChar is the continuation of a longer number.

+    return -1;

+  }

+  // Characters are both non-digits, or not continuation of earlier number.

+  return (aChar - bChar).sign;

+}

+

+/// Compare numbers overlapping [aChar] and [bChar] numerically.

+///

+/// If the numbers have the same numerical value, but one has more leading

+/// zeros, the longer number is considered greater than the shorter one.

+///

+/// This ensures a total ordering on strings compatible with equality.

+int _compareNumerically(String a, String b, int aChar, int bChar, int index) {

+  // Both are digits. Find the first significant different digit, then find

+  // the length of the numbers.

+  if (_isNonZeroNumberSuffix(a, index)) {

+    // Part of a longer number, differs at this index, just count the length.

+    int result = _compareDigitCount(a, b, index, index);

+    if (result != 0) return result;

+    // If same length, the current character is the most significant differing

+    // digit.

+    return (aChar - bChar).sign;

+  }

+  // Not part of larger (non-zero) number, so skip leading zeros before

+  // comparing numbers.

+  int aIndex = index;

+  int bIndex = index;

+  if (aChar == _zero) {

+    do {

+      aIndex++;

+      if (aIndex == a.length) return -1;  // number in a is zero, b is not.

+      aChar = a.codeUnitAt(aIndex);

+    } while (aChar == _zero);

+    if (!_isDigit(aChar)) return -1;

+  } else if (bChar == _zero) {

+    do {

+      bIndex++;

+      if (bIndex == b.length) return 1;  // number in b is zero, a is not.

+      bChar = b.codeUnitAt(bIndex);

+    } while (bChar == _zero);

+    if (!_isDigit(bChar)) return 1;

+  }

+  if (aChar != bChar) {

+    int result = _compareDigitCount(a, b, aIndex, bIndex);

+    if (result != 0) return result;

+    return (aChar - bChar).sign;

+  }

+  // Same leading digit, one had more leading zeros.

+  // Compare digits until reaching a difference.

+  while (true) {

+    var aIsDigit = false;

+    var bIsDigit = false;

+    aChar = 0;

+    bChar = 0;

+    if (++aIndex < a.length) {

+      aChar = a.codeUnitAt(aIndex);

+      aIsDigit = _isDigit(aChar);

+    }

+    if (++bIndex < b.length) {

+      bChar = b.codeUnitAt(bIndex);

+      bIsDigit = _isDigit(bChar);

+    }

+    if (aIsDigit) {

+      if (bIsDigit) {

+        if (aChar == bChar) continue;

+        // First different digit found.

+        break;

+      }

+      // bChar is non-digit, so a has longer number.

+      return 1;

+    } else if (bIsDigit) {

+      return -1;  // b has longer number.

+    } else {

+      // Neither is digit, so numbers had same numerical value.

+      // Fall back on number of leading zeros

+      // (reflected by difference in indices).

+      return (aIndex - bIndex).sign;

+    }

+  }

+  // At first differing digits.

+  int result = _compareDigitCount(a, b, aIndex, bIndex);

+  if (result != 0) return result;

+  return (aChar - bChar).sign;

+}

+

+/// Checks which of [a] and [b] has the longest sequence of digits.

+///

+/// Starts counting from `i + 1` and `j + 1` (assumes that `a[i]` and `b[j]` are

+/// both already known to be digits).

+int _compareDigitCount(String a, String b, int i, int j) {

+  while (++i < a.length) {

+    bool aIsDigit = _isDigit(a.codeUnitAt(i));

+    if (++j == b.length) return aIsDigit ? 1 : 0;

+    bool bIsDigit = _isDigit(b.codeUnitAt(j));

+    if (aIsDigit) {

+      if (bIsDigit) continue;

+      return 1;

+    } else if (bIsDigit) {

+      return -1;

+    } else {

+      return 0;

+    }

+  }

+  if (++j < b.length && _isDigit(b.codeUnitAt(j))) {

+    return -1;

+  }

+  return 0;

+}

+

+bool _isDigit(int charCode) => (charCode ^ _zero) <= 9;

+

+/// Check if the digit at [index] is continuing a non-zero number.

+///

+/// If there is no non-zero digits before, then leading zeros at [index]

+/// are also ignored when comparing numerically. If there is a non-zero digit

+/// before, then zeros at [index] are significant.

+bool _isNonZeroNumberSuffix(String string, int index) {

+  while (--index >= 0) {

+    int char = string.codeUnitAt(index);

+    if (char != _zero) return _isDigit(char);

+  }

+  return false;

+}

diff --git a/pubspec.yaml b/pubspec.yaml
index 514ec4b..3642321 100644
--- a/pubspec.yaml
+++ b/pubspec.yaml
@@ -1,5 +1,5 @@
 name: collection
-version: 1.1.3
+version: 1.2.0
 author: Dart Team <misc@dartlang.org>
 description: Collections and utilities functions and classes related to collections.
 homepage: https://www.github.com/dart-lang/collection
diff --git a/test/comparators_test.dart b/test/comparators_test.dart
new file mode 100644
index 0000000..4acdc2c
--- /dev/null
+++ b/test/comparators_test.dart
@@ -0,0 +1,119 @@
+// Copyright (c) 2015, the Dart project authors.  Please see the AUTHORS file
+// for details. All rights reserved. Use of this source code is governed by a
+// BSD-style license that can be found in the LICENSE file.
+
+import "package:collection/collection.dart";
+import "package:test/test.dart";
+
+void main() {
+  List<String> strings = [
+    "",
+    "\x00",
+    " ",
+    "+",
+    "/",
+    "0",
+    "00",
+    "000",
+    "001",
+    "01",
+    "011",
+    "1",
+    "100",
+    "11",
+    "110",
+    "9",
+    ":",
+    "=",
+    "@",
+    "A",
+    "A0",
+    "A000A",
+    "A001A",
+    "A00A",
+    "A01A",
+    "A0A",
+    "A1A",
+    "AA",
+    "AAB",
+    "AB",
+    "Z",
+    "[",
+    "_",
+    "`",
+    "a",
+    "a0",
+    "a000a",
+    "a001a",
+    "a00a",
+    "a01a",
+    "a0a",
+    "a1a",
+    "aa",
+    "aab",
+    "ab",
+    "z",
+    "{",
+    "~"
+  ];
+
+  sortedBy(compare) => strings.toList()..shuffle()..sort(compare);
+
+  test("String.compareTo", () {
+    expect(sortedBy(null), strings);
+  });
+  test("compareAsciiLowerCase", () {
+    expect(sortedBy(compareAsciiLowerCase),
+           sortedBy((a, b) {
+             int delta = a.toLowerCase().compareTo(b.toLowerCase());
+             if (delta != 0) return delta;
+             if (a == b) return 0;
+             return a.compareTo(b);
+           }));
+  });
+  test("compareAsciiUpperCase", () {
+    expect(sortedBy(compareAsciiUpperCase),
+           sortedBy((a, b) {
+             int delta = a.toUpperCase().compareTo(b.toUpperCase());
+             if (delta != 0) return delta;
+             if (a == b) return 0;
+             return a.compareTo(b);
+           }));
+  });
+
+  // Replace any digit sequence by ("0", value, length) as char codes.
+  // This will sort alphabetically (by charcode) the way digits sort
+  // numerically, and the leading 0 means it sorts like a digit
+  // compared to non-digits.
+  replaceNumbers(string) => string.replaceAllMapped(new RegExp(r"\d+"), (m) {
+    var digits = m[0];
+    return new String.fromCharCodes([0x30, int.parse(digits), digits.length]);
+  });
+
+  test("compareNatural", () {
+    expect(sortedBy(compareNatural),
+           sortedBy((a, b) => replaceNumbers(a).compareTo(replaceNumbers(b))));
+  });
+
+  test("compareAsciiLowerCaseNatural", () {
+    expect(sortedBy(compareAsciiLowerCaseNatural),
+           sortedBy((a, b) {
+             int delta = replaceNumbers(a.toLowerCase()).compareTo(
+                             replaceNumbers(b.toLowerCase()));
+             if (delta != 0) return delta;
+             if (a == b) return 0;
+             return a.compareTo(b);
+           }));
+  });
+
+  test("compareAsciiUpperCaseNatural", () {
+    expect(sortedBy(compareAsciiUpperCaseNatural),
+           sortedBy((a, b) {
+             int delta = replaceNumbers(a.toUpperCase()).compareTo(
+                             replaceNumbers(b.toUpperCase()));
+             if (delta != 0) return delta;
+             if (a == b) return 0;
+             return a.compareTo(b);
+           }));
+  });
+}