blob: b50f58521ec22e1d692ef89b397a3e22984d1f0a [file] [log] [blame]
// Copyright (c) 2017, the Dart project authors. Please see the AUTHORS file
// for details. All rights reserved. Use of this source code is governed by a
// BSD-style license that can be found in the LICENSE file.
library fasta.scanner.string_canonicalizer;
import 'dart:convert';
class Node {
Object /* String | List<int> */ data;
int start;
int end;
String payload;
Node next;
Node(this.data, this.start, this.end, this.payload, this.next);
}
/// A hash table for triples:
/// (list of bytes, start, end) --> canonicalized string
/// Using triples avoids allocating string slices before checking if they
/// are canonical.
///
/// Gives about 3% speedup on dart2js.
class StringCanonicalizer {
/// Mask away top bits to keep hash calculation within 32-bit SMI range.
static const int MASK = 16 * 1024 * 1024 - 1;
static const int INITIAL_SIZE = 8 * 1024;
/// Linear size of a hash table.
int _size = INITIAL_SIZE;
/// Items in a hash table.
int _count = 0;
/// The table itself.
List<Node> _nodes = new List<Node>(INITIAL_SIZE);
static String decode(List<int> data, int start, int end, bool asciiOnly) {
String s;
if (asciiOnly) {
s = new String.fromCharCodes(data, start, end);
} else {
s = new Utf8Decoder(allowMalformed: true).convert(data, start, end);
}
return s;
}
static int hashBytes(List<int> data, int start, int end) {
int h = 5381;
for (int i = start; i < end; i++) {
h = ((h << 5) + h + data[i]) & MASK;
}
return h;
}
static int hashString(String data, int start, int end) {
int h = 5381;
for (int i = start; i < end; i++) {
h = ((h << 5) + h + data.codeUnitAt(i)) & MASK;
}
return h;
}
rehash() {
var newSize = _size * 2;
var newNodes = new List<Node>(newSize);
for (int i = 0; i < _size; i++) {
Node t = _nodes[i];
while (t != null) {
Node n = t.next;
int newIndex = t.data is String
? hashString(t.data, t.start, t.end) & (newSize - 1)
: hashBytes(t.data, t.start, t.end) & (newSize - 1);
Node s = newNodes[newIndex];
t.next = s;
newNodes[newIndex] = t;
t = n;
}
}
_size = newSize;
_nodes = newNodes;
}
String _canonicalizeString(String data, int start, int end, bool asciiOnly) {
if (_count > _size) rehash();
int index = hashString(data as String, start, end);
index = index & (_size - 1);
Node s = _nodes[index];
Node t = s;
int len = end - start;
while (t != null) {
if (t.end - t.start == len) {
int i = start, j = t.start;
while (i < end && data[i] == (t.data as String)[j]) {
i++;
j++;
}
if (i == end) {
return t.payload;
}
}
t = t.next;
}
String payload = data.substring(start, end);
_nodes[index] = new Node(data, start, end, payload, s);
_count++;
return payload;
}
String _canonicalizeBytes(List<int> data, int start, int end, bool asciiOnly) {
if (_count > _size) rehash();
int index = hashBytes(data as List<int>, start, end);
index = index & (_size - 1);
Node s = _nodes[index];
Node t = s;
int len = end - start;
while (t != null) {
if (t.end - t.start == len) {
int i = start, j = t.start;
while (i < end && data[i] == (t.data as List<int>)[j]) {
i++;
j++;
}
if (i == end) {
return t.payload;
}
}
t = t.next;
}
String payload = decode(data, start, end, asciiOnly);
_nodes[index] = new Node(data, start, end, payload, s);
_count++;
return payload;
}
String canonicalize(Object data, int start, int end, bool asciiOnly) {
if (data is String) {
return _canonicalizeString(data, start, end, asciiOnly);
} else {
assert(data is List<int>);
return _canonicalizeBytes(data, start, end, asciiOnly);
}
}
clear() {
_nodes = new List<Node>(_size);
}
}