| // Copyright (c) 2023, the Dart project authors. Please see the AUTHORS file |
| // for details. All rights reserved. Use of this source code is governed by a |
| // BSD-style license that can be found in the LICENSE file. |
| |
| // Micro-benchmarks for copying typed data lists. |
| |
| import 'dart:ffi'; |
| import 'dart:math'; |
| import 'dart:typed_data'; |
| |
| import 'package:args/args.dart'; |
| import 'package:ffi/ffi.dart'; |
| |
| const maxSizeInBytes = 10 * 1024 * 1024; |
| |
| final argParser = ArgParser() |
| ..addMultiOption('length', |
| abbr: 'l', |
| help: 'Byte length to benchmark', |
| valueHelp: 'INT', |
| defaultsTo: const []) |
| ..addFlag('mebibytes-per-second', |
| abbr: 'm', help: 'Show MiB/s', defaultsTo: false) |
| ..addFlag('nanoseconds-per-byte', |
| abbr: 'n', help: 'Show ns/byte', defaultsTo: false) |
| ..addFlag('bytes-per-second', |
| abbr: 'b', help: 'Show byte/s', defaultsTo: true) |
| ..addFlag('verbose', abbr: 'v', help: 'Verbose output', defaultsTo: false) |
| ..addFlag('aligned', |
| abbr: 'a', help: 'Align results on initial numbers', defaultsTo: false); |
| |
| class Emitter { |
| final bool bytesPerSecond; |
| final bool nanosecondsPerByte; |
| final bool mebibytesPerSecond; |
| final bool _alignedOutput; |
| |
| Emitter(ArgResults results) |
| : bytesPerSecond = results['bytes-per-second'] || results['verbose'], |
| nanosecondsPerByte = |
| results['nanoseconds-per-byte'] || results['verbose'], |
| mebibytesPerSecond = |
| results['mebibytes-per-second'] || results['verbose'], |
| _alignedOutput = results['aligned']; |
| |
| static final kValueRegexp = RegExp(r'^([0-9]+)'); |
| static final kMaxLabelLength = |
| 'MemoryCopy.1048576.setRange.TypedData.Double(NanosecondsPerChar)'.length; |
| // Maximum expected number of digits on either side of the decimal point. |
| static final kMaxDigits = 16; |
| |
| void printLabeledValue(String label, double value) { |
| final valueString = value.toString(); |
| final buffer = StringBuffer(); |
| buffer |
| ..write(label) |
| ..write(': '); |
| if (_alignedOutput) { |
| final matches = kValueRegexp.firstMatch(valueString)!; |
| final valuePadding = (kMaxLabelLength - label.length) + |
| max<int>(kMaxDigits - matches[1]!.length, 0); |
| buffer..write(' ' * valuePadding); |
| } |
| buffer.write(valueString); |
| print(buffer.toString()); |
| } |
| } |
| |
| // A modified version of BenchmarkBase from package:benchmark_harness where |
| // - the run() method takes a number of rounds, so that there is only one run() |
| // call per measurement and thus the overhead of calling the run() method is |
| // the same across subclass results. |
| // - the measureFor() method returns the number of bytes transfered per second, |
| // not the number of microseconds per iteration (round). |
| abstract class MemoryCopyBenchmark { |
| final String name; |
| final int bytes; |
| |
| MemoryCopyBenchmark(String name, this.bytes) : name = 'MemoryCopy.$name'; |
| |
| static const targetBatchSizeInBytes = 32 * 1024; |
| |
| // Returns the number of bytes copied per second. |
| double measureFor(Duration minDuration) { |
| // The logic below is based off of BenchmarkBase._measureForImpl. |
| // We can't use BenchmarkBase.measureFor directly, because |
| // * it calls the function in a loop instead of passing the number of |
| // desired iterations to the function being called. Here, method |
| // invocation would dominate the actual body for small byte counts. |
| // * it doesn't provide the caller with the number of iterations performed, |
| // which we need to calculate the number of bytes transferred. |
| |
| // Start off with enough rounds to ensure a minimum number of bytes copied |
| // per run() invocation. |
| int rounds = max(targetBatchSizeInBytes ~/ bytes, 1); |
| |
| // If running a long measurement permit some amount of measurement jitter |
| // to avoid discarding results that almost, but not quite, reach the minimum |
| // duration requested. |
| final allowedJitter = Duration( |
| microseconds: minDuration.inSeconds > 0 |
| ? (minDuration.inMicroseconds * 0.1).floor() |
| : 0); |
| |
| final watch = Stopwatch()..start(); |
| while (true) { |
| // Try running for the current number of rounds and see if that reaches |
| // the minimum duration requested, so we only get the elapsed time from |
| // the StopWatch once for the final results used. |
| watch.reset(); |
| run(rounds); |
| final elapsed = watch.elapsed; |
| final numberOfBytesCopied = rounds * bytes; |
| if (elapsed >= (minDuration - allowedJitter)) { |
| return (numberOfBytesCopied / elapsed.inMicroseconds) * |
| Duration.microsecondsPerSecond; |
| } |
| // If not, then adjust our estimate of how many iterations are needed to |
| // reach the minimum and try again. |
| if (elapsed.inMilliseconds == 0) { |
| rounds *= 1000; |
| } else { |
| rounds *= (minDuration.inMicroseconds / elapsed.inMicroseconds).ceil(); |
| } |
| } |
| } |
| |
| double measure() { |
| setup(); |
| |
| // Warmup for 100 ms. |
| measureFor(const Duration(milliseconds: 100)); |
| |
| // Run benchmark for 1 second. |
| final double result = measureFor(const Duration(seconds: 1)); |
| |
| teardown(); |
| return result; |
| } |
| |
| void report(Emitter emitter) { |
| final bytesPerSecond = measure(); |
| |
| if (emitter.bytesPerSecond) { |
| emitter.printLabeledValue('$name(BytesPerSecond)', bytesPerSecond); |
| } |
| if (emitter.nanosecondsPerByte) { |
| const nanoSecondsPerSecond = 1000 * 1000 * 1000; |
| final nanosecondsPerByte = nanoSecondsPerSecond / bytesPerSecond; |
| emitter.printLabeledValue( |
| '$name(NanosecondsPerChar)', nanosecondsPerByte); |
| } |
| if (emitter.mebibytesPerSecond) { |
| const bytesPerMebibyte = 1024 * 1024; |
| final mibPerSecond = bytesPerSecond / bytesPerMebibyte; |
| emitter.printLabeledValue('$name(MebibytesPerSecond)', mibPerSecond); |
| } |
| } |
| |
| void setup(); |
| void teardown(); |
| void run(int rounds); |
| } |
| |
| abstract class Uint8ListCopyBenchmark extends MemoryCopyBenchmark { |
| final int count; |
| late Uint8List input; |
| late Uint8List result; |
| |
| Uint8ListCopyBenchmark(String method, int bytes) |
| : count = bytes, |
| super('$bytes.$method.TypedData.Uint8', bytes); |
| |
| @override |
| void setup() { |
| input = Uint8List(count); |
| for (int i = 0; i < count; ++i) { |
| input[i] = (i + 3) & 0xff; |
| } |
| result = Uint8List(maxSizeInBytes); |
| } |
| |
| @override |
| void teardown() { |
| for (int i = 0; i < count; ++i) { |
| final expected = (i + 3) & 0xff; |
| if (result[i] != expected) { |
| throw 'Expected result[$i] = $expected, got ${result[i]}'; |
| } |
| } |
| final expected = 0; |
| for (int i = count; i < maxSizeInBytes; ++i) { |
| if (result[i] != expected) { |
| throw 'Expected result[$i] = $expected, got ${result[i]}'; |
| } |
| } |
| } |
| } |
| |
| class Uint8ListCopyViaLoopBenchmark extends Uint8ListCopyBenchmark { |
| Uint8ListCopyViaLoopBenchmark(int bytes) : super('loop', bytes); |
| |
| @override |
| void run(int rounds) { |
| final count = this.count; |
| final input = this.input; |
| final result = this.result; |
| for (int r = 0; r < rounds; r++) { |
| for (int i = 0; i < count; i++) { |
| result[i] = input[i]; |
| } |
| } |
| } |
| } |
| |
| class Uint8ListCopyViaSetRangeBenchmark extends Uint8ListCopyBenchmark { |
| Uint8ListCopyViaSetRangeBenchmark(int bytes) : super('setRange', bytes); |
| |
| @override |
| void run(int rounds) { |
| for (int r = 0; r < rounds; r++) { |
| result.setRange(0, count, input); |
| } |
| } |
| } |
| |
| abstract class Float64ListCopyBenchmark extends MemoryCopyBenchmark { |
| final int count; |
| late Float64List input; |
| late Float64List result; |
| |
| Float64ListCopyBenchmark(String method, int bytes) |
| : count = bytes ~/ 8, |
| super('$bytes.$method.TypedData.Double', bytes); |
| |
| static const maxSizeInElements = maxSizeInBytes ~/ 8; |
| |
| @override |
| void setup() { |
| input = Float64List(count); |
| for (int i = 0; i < count; ++i) { |
| input[i] = (i - 7).toDouble(); |
| } |
| result = Float64List(maxSizeInElements); |
| } |
| |
| @override |
| void teardown() { |
| for (int i = 0; i < count; ++i) { |
| final expected = (i - 7).toDouble(); |
| if (result[i] != expected) { |
| throw 'Expected result[$i] = $expected, got ${result[i]}'; |
| } |
| } |
| final expected = 0.0; |
| for (int i = count; i < maxSizeInElements; ++i) { |
| if (result[i] != expected) { |
| throw 'Expected result[$i] = $expected, got ${result[i]}'; |
| } |
| } |
| } |
| } |
| |
| class Float64ListCopyViaLoopBenchmark extends Float64ListCopyBenchmark { |
| Float64ListCopyViaLoopBenchmark(int bytes) : super('loop', bytes); |
| |
| @override |
| void run(int rounds) { |
| final count = this.count; |
| final input = this.input; |
| final result = this.result; |
| for (int r = 0; r < rounds; r++) { |
| for (int i = 0; i < count; i++) { |
| result[i] = input[i]; |
| } |
| } |
| } |
| } |
| |
| class Float64ListCopyViaSetRangeBenchmark extends Float64ListCopyBenchmark { |
| Float64ListCopyViaSetRangeBenchmark(int bytes) : super('setRange', bytes); |
| |
| @override |
| void run(int rounds) { |
| for (int r = 0; r < rounds; r++) { |
| result.setRange(0, count, input); |
| } |
| } |
| } |
| |
| abstract class PointerUint8CopyBenchmark extends MemoryCopyBenchmark { |
| final int count; |
| late Pointer<Uint8> input; |
| late Pointer<Uint8> result; |
| |
| PointerUint8CopyBenchmark(String method, int bytes) |
| : count = bytes, |
| super('$bytes.$method.Pointer.Uint8', bytes); |
| |
| @override |
| void setup() { |
| input = malloc<Uint8>(count); |
| for (var i = 0; i < count; ++i) { |
| input[i] = (i + 3) & 0xff; |
| } |
| result = calloc<Uint8>(maxSizeInBytes); |
| } |
| |
| @override |
| void teardown() { |
| malloc.free(input); |
| for (var i = 0; i < count; ++i) { |
| final expected = (i + 3) & 0xff; |
| if (result[i] != expected) { |
| throw 'Expected result[$i] = $expected, got ${result[i]}'; |
| } |
| } |
| final expected = 0; |
| for (var i = count; i < maxSizeInBytes; ++i) { |
| if (result[i] != expected) { |
| throw 'Expected result[$i] = $expected, got ${result[i]}'; |
| } |
| } |
| calloc.free(result); |
| } |
| } |
| |
| class PointerUint8CopyViaLoopBenchmark extends PointerUint8CopyBenchmark { |
| PointerUint8CopyViaLoopBenchmark(int bytes) : super('loop', bytes); |
| |
| @override |
| void run(int rounds) { |
| // Compare the setRange version to looping using Pointer.[]/Pointer.[]=. |
| final count = this.count; |
| final input = this.input; |
| final result = this.result; |
| for (int r = 0; r < rounds; r++) { |
| for (int i = 0; i < count; i++) { |
| result[i] = input[i]; |
| } |
| } |
| } |
| } |
| |
| class PointerUint8CopyViaSetRangeBenchmark extends PointerUint8CopyBenchmark { |
| PointerUint8CopyViaSetRangeBenchmark(int bytes) : super('setRange', bytes); |
| |
| @override |
| void run(int rounds) { |
| for (int r = 0; r < rounds; r++) { |
| result |
| .asTypedList(maxSizeInBytes) |
| .setRange(0, count, input.asTypedList(count)); |
| } |
| } |
| } |
| |
| @Native<Void Function(Pointer<Void>, Pointer<Void>, Size)>(isLeaf: true) |
| external void memmove(Pointer<Void> to, Pointer<Void> from, int size); |
| |
| class PointerUint8CopyViaMemmoveBenchmark extends PointerUint8CopyBenchmark { |
| // This particular benchmark was originally written using memcpy, but a |
| // better comparison is against memmove. While our benchmarks don't use |
| // to and from memory that overlaps, in general this case must be handled. |
| // |
| // In order to not have to change the benchmark suite in golem, we keep the |
| // old name for this result. |
| PointerUint8CopyViaMemmoveBenchmark(int bytes) : super('memcpy', bytes); |
| |
| @override |
| void run(int rounds) { |
| for (int r = 0; r < rounds; r++) { |
| memmove(result.cast(), input.cast(), count); |
| } |
| } |
| } |
| |
| abstract class PointerDoubleCopyBenchmark extends MemoryCopyBenchmark { |
| final int count; |
| late Pointer<Double> input; |
| late Pointer<Double> result; |
| |
| PointerDoubleCopyBenchmark(String method, int bytes) |
| : count = bytes ~/ 8, |
| super('$bytes.$method.Pointer.Double', bytes); |
| |
| static const maxSizeInElements = maxSizeInBytes ~/ 8; |
| |
| @override |
| void setup() { |
| input = malloc<Double>(count); |
| for (var i = 0; i < count; ++i) { |
| input[i] = (i - 7).toDouble(); |
| } |
| result = calloc<Double>(maxSizeInElements); |
| } |
| |
| @override |
| void teardown() { |
| malloc.free(input); |
| for (var i = 0; i < count; ++i) { |
| final expected = (i - 7).toDouble(); |
| if (result[i] != expected) { |
| throw 'Expected result[$i] = $expected, got ${result[i]}'; |
| } |
| } |
| final expected = 0.0; |
| for (var i = count; i < maxSizeInElements; ++i) { |
| if (result[i] != expected) { |
| throw 'Expected result[$i] = $expected, got ${result[i]}'; |
| } |
| } |
| calloc.free(result); |
| } |
| } |
| |
| class PointerDoubleCopyViaLoopBenchmark extends PointerDoubleCopyBenchmark { |
| PointerDoubleCopyViaLoopBenchmark(int bytes) : super('loop', bytes); |
| |
| @override |
| void run(int rounds) { |
| // Compare the setRange version to looping using Pointer.[]/Pointer.[]=. |
| final count = this.count; |
| final input = this.input; |
| final result = this.result; |
| for (int r = 0; r < rounds; r++) { |
| for (int i = 0; i < count; i++) { |
| result[i] = input[i]; |
| } |
| } |
| } |
| } |
| |
| class PointerDoubleCopyViaSetRangeBenchmark extends PointerDoubleCopyBenchmark { |
| PointerDoubleCopyViaSetRangeBenchmark(int bytes) : super('setRange', bytes); |
| |
| @override |
| void run(int rounds) { |
| for (int r = 0; r < rounds; r++) { |
| result |
| .asTypedList(PointerDoubleCopyBenchmark.maxSizeInElements) |
| .setRange(0, count, input.asTypedList(count)); |
| } |
| } |
| } |
| |
| final defaultLengthsInBytes = [8, 64, 512, 4 * 1024, 1024 * 1024]; |
| |
| void main(List<String> args) { |
| final results = argParser.parse(args); |
| List<int> lengthsInBytes = defaultLengthsInBytes; |
| final emitter = Emitter(results); |
| if (results['length'].isNotEmpty) { |
| lengthsInBytes = (results['length'] as List<String>) |
| .map(int.parse) |
| .where((i) => i <= maxSizeInBytes) |
| .toList(); |
| } |
| final filter = results.rest.firstOrNull; |
| final benchmarks = [ |
| for (int bytes in lengthsInBytes) ...[ |
| PointerUint8CopyViaMemmoveBenchmark(bytes), |
| PointerUint8CopyViaLoopBenchmark(bytes), |
| PointerDoubleCopyViaLoopBenchmark(bytes), |
| Uint8ListCopyViaLoopBenchmark(bytes), |
| Float64ListCopyViaLoopBenchmark(bytes), |
| PointerUint8CopyViaSetRangeBenchmark(bytes), |
| PointerDoubleCopyViaSetRangeBenchmark(bytes), |
| Uint8ListCopyViaSetRangeBenchmark(bytes), |
| Float64ListCopyViaSetRangeBenchmark(bytes), |
| ], |
| ]; |
| for (var bench in benchmarks) { |
| if (filter == null || bench.name.contains(filter)) { |
| bench.report(emitter); |
| } |
| } |
| } |