[vm/compiler] Improve AOT block scheduler

Try to keep loop blocks together, previously AOT compiler would produce
bad block order for loops like these:

    while (true) {
      if (smth) {
        // (*)
        return;
      }
    }

And put `(*)` blocks in between loop header and other loop blocks.

TEST=ci

Cq-Include-Trybots: luci.dart.try:vm-aot-linux-debug-x64-try,vm-aot-mac-release-arm64-try,vm-aot-linux-release-x64-try
Change-Id: I52304810b61ff9288fe9df648b21c0258299d61e
Reviewed-on: https://dart-review.googlesource.com/c/sdk/+/358447
Reviewed-by: Alexander Markov <alexmarkov@google.com>
Commit-Queue: Slava Egorov <vegorov@google.com>
diff --git a/pkg/vm/bin/compare_il.dart b/pkg/vm/bin/compare_il.dart
index 28a0f3d..f32d01e 100644
--- a/pkg/vm/bin/compare_il.dart
+++ b/pkg/vm/bin/compare_il.dart
@@ -195,8 +195,13 @@
 
   for (var graph in File(ilFile).readAsLinesSync()) {
     final m = jsonDecode(graph) as Map<String, dynamic>;
-    graphs.putIfAbsent(m['f'], () => {})[m['p']] =
-        FlowGraph(m['b'], m['desc'], m['flags'], rename: rename);
+    graphs.putIfAbsent(m['f'], () => {})[m['p']] = FlowGraph(
+      m['b'],
+      m['desc'],
+      m['flags'],
+      rename: rename,
+      codegenBlockOrder: m['cbo'],
+    );
   }
 
   return graphs;
diff --git a/pkg/vm/lib/testing/il_matchers.dart b/pkg/vm/lib/testing/il_matchers.dart
index 9cb5c00..3b316df 100644
--- a/pkg/vm/lib/testing/il_matchers.dart
+++ b/pkg/vm/lib/testing/il_matchers.dart
@@ -13,22 +13,29 @@
 
 /// Flow graph parsed from --print-flow-graph-as-json output.
 class FlowGraph {
-  final List<dynamic> blocks;
+  final List<dynamic> _blocks;
   final Map<String, InstructionDescriptor> descriptors;
   final Map<String, dynamic> flags;
   final Renamer rename;
+  final List<dynamic>? codegenBlockOrder;
 
-  FlowGraph(this.blocks, Map<String, dynamic> desc, this.flags,
-      {required this.rename})
+  FlowGraph(this._blocks, Map<String, dynamic> desc, this.flags,
+      {required this.rename, List<dynamic>? codegenBlockOrder})
       : descriptors = {
           for (var e in desc.entries)
             e.key: InstructionDescriptor.fromJson(e.value)
-        };
+        },
+        codegenBlockOrder =
+            codegenBlockOrder?.map((idx) => _blocks[idx as int]).toList();
 
   bool get soundNullSafety => flags['nnbd'];
 
   PrettyPrinter get printer => PrettyPrinter(descriptors);
 
+  List<dynamic> blocks({bool inCodegenBlockOrder = false}) {
+    return inCodegenBlockOrder ? codegenBlockOrder! : _blocks;
+  }
+
   /// Match the sequence of blocks in this flow graph against the given
   /// sequence of matchers: `expected[i]` is expected to match `blocks[i]`,
   /// but there can be more blocks in the graph than matchers (the suffix is
@@ -38,14 +45,19 @@
   /// a fresh instance of [Env] will be created and used.
   ///
   /// This function returns the populated matching environment.
-  Env match(List<Matcher> expected, {Env? env}) {
+  Env match(
+    List<Matcher> expected, {
+    Env? env,
+    bool inCodegenBlockOrder = false,
+  }) {
     env ??= Env(rename: rename, descriptors: descriptors);
 
+    final got = blocks(inCodegenBlockOrder: inCodegenBlockOrder);
     for (var i = 0; i < expected.length; i++) {
-      final result = expected[i].match(env, blocks[i]);
+      final result = expected[i].match(env, got[i]);
       if (result.isFail) {
         print('Failed to match: ${result.message}');
-        dump();
+        dump(inCodegenBlockOrder: inCodegenBlockOrder);
         throw 'Failed to match';
       }
     }
@@ -63,11 +75,11 @@
     return {for (final e in attrs.entries) e.key: instr['d'][e.value]};
   }
 
-  void dump() {
+  void dump({bool inCodegenBlockOrder = false}) {
     final printer = PrettyPrinter(descriptors);
 
     final buffer = StringBuffer();
-    for (var block in blocks) {
+    for (var block in blocks(inCodegenBlockOrder: inCodegenBlockOrder)) {
       printer._formatBlock(buffer, block);
     }
     print(buffer);
@@ -676,7 +688,7 @@
       );
 
   // ignore: non_constant_identifier_names
-  InstructionMatcher Branch(InstructionMatcher compare,
+  InstructionMatcher Branch(Matcher compare,
           {String? ifTrue, String? ifFalse, bool skipUntilMatched = true}) =>
       InstructionMatcher._(
         op: 'Branch',
diff --git a/runtime/tests/vm/dart/block_ordering_il_test.dart b/runtime/tests/vm/dart/block_ordering_il_test.dart
new file mode 100644
index 0000000..6a89638
--- /dev/null
+++ b/runtime/tests/vm/dart/block_ordering_il_test.dart
@@ -0,0 +1,184 @@
+// Copyright (c) 2024, the Dart project authors.  Please see the AUTHORS file
+// for details. All rights reserved. Use of this source code is governed by a
+// BSD-style license that can be found in the LICENSE file.
+import 'dart:typed_data';
+
+// This test checks that AOT compiler tries to keep loop blocks together and
+// sinks cold code to the end of the function.
+
+import 'package:expect/expect.dart';
+import 'package:vm/testing/il_matchers.dart';
+
+@pragma('vm:never-inline')
+@pragma('vm:testing:print-flow-graph', 'ReorderBlocks')
+int loop(Uint8List list) {
+  for (var i = 0; i < list.length; i++) {
+    if (list[i] == 1) {
+      return i;
+    }
+  }
+  return -1;
+}
+
+@pragma('vm:never-inline')
+@pragma('vm:testing:print-flow-graph', 'ReorderBlocks')
+int loop2(Uint8List list) {
+  int sum = 0;
+  for (var i = 0; i < list.length; i++) {
+    for (var j = 0; j < list.length; j++) {
+      if (list[j] == 1) {
+        break;
+      }
+      sum += list[j];
+    }
+  }
+  return sum;
+}
+
+@pragma('vm:never-inline')
+@pragma('vm:testing:print-flow-graph', 'ReorderBlocks')
+int bodyAlwaysThrows(Uint8List list) {
+  try {
+    throw 'Yay';
+  } catch (e) {
+    return list.length;
+  }
+}
+
+@pragma('vm:never-inline')
+@pragma('vm:testing:print-flow-graph', 'ReorderBlocks')
+int throwInALoop(Uint8List list) {
+  for (var i = 0; i < list.length; i++) {
+    if (list[i] == 1) {
+      return i;
+    }
+    if (list[i] == 2) {
+      throw 'Unexpected value';
+    }
+  }
+  return -1;
+}
+
+void main(List<String> args) {
+  final input = args.contains('foo')
+      ? (Uint8List(10)..[5] = 1)
+      : (Uint8List(100)..[50] = 1);
+
+  Expect.equals(50, loop(input));
+  Expect.equals(0, loop2(input));
+  Expect.equals(100, bodyAlwaysThrows(input));
+  Expect.equals(50, throwInALoop(input));
+}
+
+void matchIL$loop(FlowGraph graph) {
+  graph.match(inCodegenBlockOrder: true, [
+    match.block('Graph'),
+    match.block('Function', [
+      match.Goto('loop_header'),
+    ]),
+    'loop_header' <<
+        match.block('Join', [
+          match.Branch(match.any, ifTrue: 'loop_body'),
+        ]),
+    'loop_body' <<
+        match.block('Target', [
+          match.Branch(match.any, ifFalse: 'loop_inc'),
+        ]),
+    'loop_inc' <<
+        match.block('Target', [
+          match.Goto('loop_header'),
+        ]),
+  ]);
+}
+
+void matchIL$loop2(FlowGraph graph) {
+  graph.match(inCodegenBlockOrder: true, [
+    match.block('Graph'),
+    match.block('Function', [
+      match.Goto('loop_header_1'),
+    ]),
+    'loop_header_1' <<
+        match.block('Join', [
+          match.Branch(match.any, ifTrue: 'loop_body_1'),
+        ]),
+    'loop_body_1' <<
+        match.block('Target', [
+          match.Goto('loop_header_2'),
+        ]),
+    'loop_header_2' <<
+        match.block('Join', [
+          match.Branch(match.any,
+              ifTrue: 'loop_body_2', ifFalse: 'loop_body_2_exit_1'),
+        ]),
+    'loop_body_2' <<
+        match.block('Target', [
+          match.Branch(match.any,
+              ifTrue: 'loop_body_2_exit_2', ifFalse: 'loop_inc_2'),
+        ]),
+    'loop_inc_2' <<
+        match.block('Target', [
+          match.Goto('loop_header_2'),
+        ]),
+    'loop_body_2_exit_2' <<
+        match.block('Target', [
+          match.Goto('loop_inc_1'),
+        ]),
+    'loop_body_2_exit_1' <<
+        match.block('Target', [
+          match.Goto('loop_inc_1'),
+        ]),
+    'loop_inc_1' <<
+        match.block('Join', [
+          match.Goto('loop_header_1'),
+        ]),
+  ]);
+}
+
+void matchIL$bodyAlwaysThrows(FlowGraph graph) {
+  graph.match([
+    match.block('Graph'),
+    match.block('Function'),
+    match.block('CatchBlock'),
+    match.block('Join'),
+  ], inCodegenBlockOrder: true);
+}
+
+void matchIL$throwInALoop(FlowGraph graph) {
+  graph.match(inCodegenBlockOrder: true, [
+    match.block('Graph'),
+    match.block('Function', [
+      match.Goto('loop_header'),
+    ]),
+    'loop_header' <<
+        match.block('Join', [
+          'i' << match.Phi(match.any, 'inc_i'),
+          match.Branch(match.any, ifTrue: 'loop_body', ifFalse: 'return_fail'),
+        ]),
+    'loop_body' <<
+        match.block('Target', [
+          match.Branch(match.any,
+              ifTrue: 'return_found', ifFalse: 'loop_body_cont'),
+        ]),
+    'loop_body_cont' <<
+        match.block('Target', [
+          match.Branch(match.any, ifTrue: 'throw', ifFalse: 'loop_inc'),
+        ]),
+    'loop_inc' <<
+        match.block('Target', [
+          'inc_i' << match.BinaryInt64Op('i', match.any),
+          match.Goto('loop_header'),
+        ]),
+    'return_found' <<
+        match.block('Target', [
+          match.Return('i'),
+        ]),
+    'return_fail' <<
+        match.block('Target', [
+          match.Return(match.any),
+        ]),
+    'throw' <<
+        match.block('Target', [
+          match.Throw(match.any),
+        ]),
+  ]);
+}
diff --git a/runtime/tests/vm/dart/inline_TypedList_getUint32_il_test.dart b/runtime/tests/vm/dart/inline_TypedList_getUint32_il_test.dart
index 053b6ca..f1a647a 100644
--- a/runtime/tests/vm/dart/inline_TypedList_getUint32_il_test.dart
+++ b/runtime/tests/vm/dart/inline_TypedList_getUint32_il_test.dart
@@ -43,7 +43,7 @@
 void matchIL$calculateRetainers(FlowGraph graph) {
   graph.dump();
   final descriptors = graph.descriptors;
-  for (var block in graph.blocks) {
+  for (var block in graph.blocks()) {
     for (var instr in [...?block['is']]) {
       if (instr['o'] != 'StaticCall') continue;
       final function = graph.attributesFor(instr)?['function'];
diff --git a/runtime/vm/compiler/backend/block_scheduler.cc b/runtime/vm/compiler/backend/block_scheduler.cc
index 2065f58..cb29964 100644
--- a/runtime/vm/compiler/backend/block_scheduler.cc
+++ b/runtime/vm/compiler/backend/block_scheduler.cc
@@ -238,62 +238,153 @@
   }
 }
 
-// Moves blocks ending in a throw/rethrow, as well as any block post-dominated
-// by such a throwing block, to the end.
-void BlockScheduler::ReorderBlocksAOT(FlowGraph* flow_graph) {
-  auto& reverse_postorder = flow_graph->reverse_postorder();
-  const intptr_t block_count = reverse_postorder.length();
-  GrowableArray<bool> is_terminating(block_count);
-  is_terminating.FillWith(false, 0, block_count);
+// AOT block order is based on reverse post order but with two changes:
+//
+// - Blocks which always throw and their direct predecessors are considered
+// *cold* and moved to the end of the order.
+// - Blocks which belong to the same loop are kept together (where possible)
+// and not interspersed with other blocks.
+//
+namespace {
+class AOTBlockScheduler {
+ public:
+  explicit AOTBlockScheduler(FlowGraph* flow_graph)
+      : flow_graph_(flow_graph),
+        block_count_(flow_graph->reverse_postorder().length()),
+        marks_(block_count_),
+        postorder_(block_count_),
+        cold_postorder_(10) {
+    marks_.FillWith(0, 0, block_count_);
+  }
 
-  // Any block in the worklist is marked and any of its unconditional
-  // predecessors need to be marked as well.
-  GrowableArray<BlockEntryInstr*> worklist;
+  void ComputeOrder() {
+    ComputeOrderImpl();
 
-  // Add all throwing blocks to the worklist.
-  for (intptr_t i = 0; i < block_count; ++i) {
-    auto block = reverse_postorder[i];
-    auto last = block->last_instruction();
-    if (last->IsThrow() || last->IsReThrow()) {
-      const intptr_t preorder_nr = block->preorder_number();
-      is_terminating[preorder_nr] = true;
-      worklist.Add(block);
+    const auto codegen_order = flow_graph_->CodegenBlockOrder();
+    for (intptr_t i = postorder_.length() - 1; i >= 0; --i) {
+      codegen_order->Add(postorder_[i]);
+    }
+    for (intptr_t i = cold_postorder_.length() - 1; i >= 0; --i) {
+      codegen_order->Add(cold_postorder_[i]);
     }
   }
 
-  // Follow all indirect predecessors which unconditionally will end up in a
-  // throwing block.
-  while (worklist.length() > 0) {
-    auto block = worklist.RemoveLast();
-    for (intptr_t i = 0; i < block->PredecessorCount(); ++i) {
-      auto predecessor = block->PredecessorAt(i);
-      if (predecessor->last_instruction()->IsGoto()) {
-        const intptr_t preorder_nr = predecessor->preorder_number();
-        if (!is_terminating[preorder_nr]) {
-          is_terminating[preorder_nr] = true;
-          worklist.Add(predecessor);
+ private:
+  // The algorithm below is almost identical to |FlowGraph::DiscoverBlocks|, but
+  // with few tweaks which guarantee improved scheduling for cold code and
+  // loops.
+  void ComputeOrderImpl() {
+    PushBlock(flow_graph_->graph_entry());
+    while (!block_stack_.is_empty()) {
+      BlockEntryInstr* block = block_stack_.Last();
+      auto& marks = MarksOf(block);
+      auto last = block->last_instruction();
+      const auto successor_count = last->SuccessorCount();
+
+      if ((marks & kVisitedMark) == 0) {
+        marks |= kVisitedMark;
+
+        if (last->IsThrow() || last->IsReThrow()) {
+          marks |= kColdMark;
+        } else {
+          // When visiting a block inside a loop with two successors
+          // push the successor with lesser nesting *last*, so that it is
+          // visited first. This helps to keep blocks which belong to the
+          // same loop together.
+          //
+          // This is the main difference from |DiscoverBlocks| which always
+          // visits successors in reverse order.
+          if (successor_count == 2 && block->loop_info() != nullptr) {
+            auto succ0 = last->SuccessorAt(0);
+            auto succ1 = last->SuccessorAt(1);
+
+            if (succ0->NestingDepth() < succ1->NestingDepth()) {
+              PushBlock(succ1);
+              PushBlock(succ0);
+            } else {
+              PushBlock(succ0);
+              PushBlock(succ1);
+            }
+          } else {
+            for (intptr_t i = 0; i < successor_count; i++) {
+              PushBlock(last->SuccessorAt(i));
+            }
+          }
+
+          // We have pushed some successors to the stack. Process them first.
+          if (block_stack_.Last() != block) {
+            continue;
+          }
+
+          // No successors added, fall through.
         }
       }
+
+      // All successors of this block were visited, which means we are
+      // done with this block.
+      block_stack_.RemoveLast();
+
+      // Propagate cold mark from the successors: if all successors are
+      // cold then this block is cold as well.
+      if (successor_count > 0) {
+        uint8_t cold_mark = kColdMark;
+        for (intptr_t i = 0; i < successor_count; i++) {
+          cold_mark &= MarksOf(last->SuccessorAt(i));
+        }
+        marks |= cold_mark;
+      }
+
+      if ((marks & (kColdMark | kPinnedMark)) == kColdMark) {
+        // This block is cold and not pinned: move it to cold section at
+        // the end.
+        cold_postorder_.Add(block);
+      } else {
+        postorder_.Add(block);
+      }
     }
   }
 
-  // Emit code in reverse postorder but move any throwing blocks (except the
-  // function entry, which needs to come first) to the very end.
-  auto codegen_order = flow_graph->CodegenBlockOrder();
-  for (intptr_t i = 0; i < block_count; ++i) {
-    auto block = reverse_postorder[i];
-    const intptr_t preorder_nr = block->preorder_number();
-    if (!is_terminating[preorder_nr] || block->IsFunctionEntry()) {
-      codegen_order->Add(block);
+  // The block was added to the stack.
+  static constexpr uint8_t kSeenMark = 1 << 0;
+  // The block was visited and all of its successors were added to the stack.
+  static constexpr uint8_t kVisitedMark = 1 << 1;
+  // The block terminates with unconditional throw or rethrow.
+  static constexpr uint8_t kColdMark = 1 << 2;
+  // The block should not move to cold section.
+  static constexpr uint8_t kPinnedMark = 1 << 3;
+
+  uint8_t& MarksOf(BlockEntryInstr* block) {
+    return marks_[block->preorder_number()];
+  }
+
+  void PushBlock(BlockEntryInstr* block) {
+    auto& marks = MarksOf(block);
+    if ((marks & kSeenMark) == 0) {
+      marks |= kSeenMark;
+      block_stack_.Add(block);
+
+      if (block->IsFunctionEntry() || block->IsGraphEntry()) {
+        marks |= kPinnedMark;
+      }
     }
   }
-  for (intptr_t i = 0; i < block_count; ++i) {
-    auto block = reverse_postorder[i];
-    const intptr_t preorder_nr = block->preorder_number();
-    if (is_terminating[preorder_nr] && !block->IsFunctionEntry()) {
-      codegen_order->Add(block);
-    }
-  }
+
+  FlowGraph* const flow_graph_;
+  const intptr_t block_count_;
+
+  // Block marks for each block indexed by block preorder number.
+  GrowableArray<uint8_t> marks_;
+
+  // Stack of blocks to process.
+  GrowableArray<BlockEntryInstr*> block_stack_;
+
+  GrowableArray<BlockEntryInstr*> postorder_;
+  GrowableArray<BlockEntryInstr*> cold_postorder_;
+};
+}  // namespace
+
+void BlockScheduler::ReorderBlocksAOT(FlowGraph* flow_graph) {
+  AOTBlockScheduler(flow_graph).ComputeOrder();
 }
 
 }  // namespace dart
diff --git a/runtime/vm/compiler/backend/il_printer.cc b/runtime/vm/compiler/backend/il_printer.cc
index 36f0b83..8d17729 100644
--- a/runtime/vm/compiler/backend/il_printer.cc
+++ b/runtime/vm/compiler/backend/il_printer.cc
@@ -57,6 +57,16 @@
       PrintBlock(&writer, block);
     }
     writer.CloseArray();
+    const auto& codegen_order = *flow_graph->CodegenBlockOrder();
+    if (!codegen_order.is_empty() &&
+        (&codegen_order != &flow_graph->reverse_postorder())) {
+      writer.OpenArray("cbo");
+      const auto block_count = flow_graph->reverse_postorder().length();
+      for (auto block : codegen_order) {
+        writer.PrintValue64((block_count - 1) - block->postorder_number());
+      }
+      writer.CloseArray();
+    }
     writer.OpenObject("desc");
     AttributesSerializer(&writer).WriteDescriptors();
     writer.CloseObject();