[Impeller] Fix GLES gaussian implementation. (#55329)

Fixes https://github.com/flutter/flutter/issues/142355

Problems: no uniform structs, no int uniforms, and buffer bindings don't work when the struct type doesn't match the uniform name 😓
diff --git a/impeller/entity/contents/filters/gaussian_blur_filter_contents.cc b/impeller/entity/contents/filters/gaussian_blur_filter_contents.cc
index 1b595ca..4c7af0b 100644
--- a/impeller/entity/contents/filters/gaussian_blur_filter_contents.cc
+++ b/impeller/entity/contents/filters/gaussian_blur_filter_contents.cc
@@ -12,6 +12,7 @@
 #include "impeller/entity/texture_downsample.frag.h"
 #include "impeller/entity/texture_fill.frag.h"
 #include "impeller/entity/texture_fill.vert.h"
+#include "impeller/geometry/color.h"
 #include "impeller/renderer/render_pass.h"
 #include "impeller/renderer/vertex_buffer_builder.h"
 
@@ -316,7 +317,7 @@
 fml::StatusOr<RenderTarget> MakeDownsampleSubpass(
     const ContentContext& renderer,
     const std::shared_ptr<CommandBuffer>& command_buffer,
-    std::shared_ptr<Texture> input_texture,
+    const std::shared_ptr<Texture>& input_texture,
     const SamplerDescriptor& sampler_descriptor,
     const DownsamplePassArgs& pass_args,
     Entity::TileMode tile_mode) {
@@ -338,7 +339,8 @@
 
           TextureFillVertexShader::FrameInfo frame_info;
           frame_info.mvp = Matrix::MakeOrthographic(ISize(1, 1));
-          frame_info.texture_sampler_y_coord_scale = 1.0;
+          frame_info.texture_sampler_y_coord_scale =
+              input_texture->GetYCoordScale();
 
           TextureFillFragmentShader::FragInfo frag_info;
           frag_info.alpha = 1.0;
@@ -391,7 +393,8 @@
 
           TextureFillVertexShader::FrameInfo frame_info;
           frame_info.mvp = Matrix::MakeOrthographic(ISize(1, 1));
-          frame_info.texture_sampler_y_coord_scale = 1.0;
+          frame_info.texture_sampler_y_coord_scale =
+              input_texture->GetYCoordScale();
 
           TextureDownsampleFragmentShader::FragInfo frag_info;
           frag_info.edge = edge;
@@ -442,16 +445,18 @@
     return input_pass;
   }
 
-  std::shared_ptr<Texture> input_texture = input_pass.GetRenderTargetTexture();
+  const std::shared_ptr<Texture>& input_texture =
+      input_pass.GetRenderTargetTexture();
 
   // TODO(gaaclarke): This blurs the whole image, but because we know the clip
   //                  region we could focus on just blurring that.
   ISize subpass_size = input_texture->GetSize();
   ContentContext::SubpassCallback subpass_callback =
       [&](const ContentContext& renderer, RenderPass& pass) {
-        GaussianBlurVertexShader::FrameInfo frame_info{
-            .mvp = Matrix::MakeOrthographic(ISize(1, 1)),
-            .texture_sampler_y_coord_scale = 1.0};
+        GaussianBlurVertexShader::FrameInfo frame_info;
+        frame_info.mvp = Matrix::MakeOrthographic(ISize(1, 1));
+        frame_info.texture_sampler_y_coord_scale =
+            input_texture->GetYCoordScale();
 
         HostBuffer& host_buffer = renderer.GetTransientsBuffer();
 
@@ -476,11 +481,9 @@
                 linear_sampler_descriptor));
         GaussianBlurVertexShader::BindFrameInfo(
             pass, host_buffer.EmplaceUniform(frame_info));
-        GaussianBlurPipeline::FragmentShader::KernelSamples kernel_samples =
-            LerpHackKernelSamples(GenerateBlurInfo(blur_info));
-        FML_CHECK(kernel_samples.sample_count <= kGaussianBlurMaxKernelSize);
         GaussianBlurFragmentShader::BindKernelSamples(
-            pass, host_buffer.EmplaceUniform(kernel_samples));
+            pass, host_buffer.EmplaceUniform(
+                      LerpHackKernelSamples(GenerateBlurInfo(blur_info))));
         return pass.Draw().ok();
       };
   if (destination_target.has_value()) {
@@ -893,7 +896,7 @@
   Scalar tally = 0.0f;
   for (int i = 0; i < result.sample_count; ++i) {
     int x = x_offset + (i * parameters.step_size) - parameters.blur_radius;
-    result.samples[i] = GaussianBlurPipeline::FragmentShader::KernelSample{
+    result.samples[i] = KernelSample{
         .uv_offset = parameters.blur_uv_offset * x,
         .coefficient = expf(-0.5f * (x * x) /
                             (parameters.blur_sigma * parameters.blur_sigma)) /
@@ -914,25 +917,31 @@
 // between the samples.
 GaussianBlurPipeline::FragmentShader::KernelSamples LerpHackKernelSamples(
     KernelSamples parameters) {
-  GaussianBlurPipeline::FragmentShader::KernelSamples result;
+  GaussianBlurPipeline::FragmentShader::KernelSamples result = {};
   result.sample_count = ((parameters.sample_count - 1) / 2) + 1;
   int32_t middle = result.sample_count / 2;
   int32_t j = 0;
   FML_DCHECK(result.sample_count <= kGaussianBlurMaxKernelSize);
+  static_assert(sizeof(result.sample_data) ==
+                sizeof(std::array<Vector4, kGaussianBlurMaxKernelSize>));
+
   for (int i = 0; i < result.sample_count; i++) {
     if (i == middle) {
-      result.samples[i] = parameters.samples[j++];
+      result.sample_data[i].x = parameters.samples[j].uv_offset.x;
+      result.sample_data[i].y = parameters.samples[j].uv_offset.y;
+      result.sample_data[i].z = parameters.samples[j].coefficient;
+      j++;
     } else {
-      GaussianBlurPipeline::FragmentShader::KernelSample left =
-          parameters.samples[j];
-      GaussianBlurPipeline::FragmentShader::KernelSample right =
-          parameters.samples[j + 1];
-      result.samples[i] = GaussianBlurPipeline::FragmentShader::KernelSample{
-          .uv_offset = (left.uv_offset * left.coefficient +
-                        right.uv_offset * right.coefficient) /
-                       (left.coefficient + right.coefficient),
-          .coefficient = left.coefficient + right.coefficient,
-      };
+      KernelSample left = parameters.samples[j];
+      KernelSample right = parameters.samples[j + 1];
+
+      result.sample_data[i].z = left.coefficient + right.coefficient;
+
+      Point uv = (left.uv_offset * left.coefficient +
+                  right.uv_offset * right.coefficient) /
+                 (left.coefficient + right.coefficient);
+      result.sample_data[i].x = uv.x;
+      result.sample_data[i].y = uv.y;
       j += 2;
     }
   }
diff --git a/impeller/entity/contents/filters/gaussian_blur_filter_contents.h b/impeller/entity/contents/filters/gaussian_blur_filter_contents.h
index 4408858..b24c87f 100644
--- a/impeller/entity/contents/filters/gaussian_blur_filter_contents.h
+++ b/impeller/entity/contents/filters/gaussian_blur_filter_contents.h
@@ -9,12 +9,16 @@
 #include "impeller/entity/contents/content_context.h"
 #include "impeller/entity/contents/filters/filter_contents.h"
 #include "impeller/entity/geometry/geometry.h"
+#include "impeller/geometry/color.h"
 
 namespace impeller {
 
 // Comes from gaussian.frag.
 static constexpr int32_t kGaussianBlurMaxKernelSize = 50;
 
+static_assert(sizeof(GaussianBlurPipeline::FragmentShader::KernelSamples) ==
+              sizeof(Vector4) * kGaussianBlurMaxKernelSize + sizeof(Vector4));
+
 struct BlurParameters {
   Point blur_uv_offset;
   Scalar blur_sigma;
@@ -22,6 +26,11 @@
   int step_size;
 };
 
+struct KernelSample {
+  Vector2 uv_offset;
+  float coefficient;
+};
+
 /// A larger mirror of GaussianBlurPipeline::FragmentShader::KernelSamples.
 ///
 /// This is a mirror of GaussianBlurPipeline::FragmentShader::KernelSamples that
@@ -30,7 +39,7 @@
 struct KernelSamples {
   static constexpr int kMaxKernelSize = kGaussianBlurMaxKernelSize * 2;
   int sample_count;
-  GaussianBlurPipeline::FragmentShader::KernelSample samples[kMaxKernelSize];
+  KernelSample samples[kMaxKernelSize];
 };
 
 KernelSamples GenerateBlurInfo(BlurParameters parameters);
diff --git a/impeller/entity/contents/filters/gaussian_blur_filter_contents_unittests.cc b/impeller/entity/contents/filters/gaussian_blur_filter_contents_unittests.cc
index 59618db..26449a2 100644
--- a/impeller/entity/contents/filters/gaussian_blur_filter_contents_unittests.cc
+++ b/impeller/entity/contents/filters/gaussian_blur_filter_contents_unittests.cc
@@ -9,6 +9,7 @@
 #include "impeller/entity/contents/filters/gaussian_blur_filter_contents.h"
 #include "impeller/entity/contents/texture_contents.h"
 #include "impeller/entity/entity_playground.h"
+#include "impeller/geometry/color.h"
 #include "impeller/geometry/geometry_asserts.h"
 #include "impeller/renderer/testing/mocks.h"
 
@@ -51,6 +52,14 @@
   return x;
 }
 
+Scalar GetCoefficient(const Vector4& vec) {
+  return vec.z;
+}
+
+Vector2 GetUVOffset(const Vector4& vec) {
+  return vec.xy();
+}
+
 fml::StatusOr<Scalar> CalculateSigmaForBlurRadius(
     Scalar radius,
     const Matrix& effect_transform) {
@@ -508,27 +517,24 @@
           },
   };
 
-  GaussianBlurPipeline::FragmentShader::KernelSamples fast_kernel_samples =
+  GaussianBlurPipeline::FragmentShader::KernelSamples blur_info =
       LerpHackKernelSamples(kernel_samples);
-  EXPECT_EQ(fast_kernel_samples.sample_count, 3);
+  EXPECT_EQ(blur_info.sample_count, 3);
 
-  GaussianBlurPipeline::FragmentShader::KernelSample* samples =
-      kernel_samples.samples;
-  GaussianBlurPipeline::FragmentShader::KernelSample* fast_samples =
-      fast_kernel_samples.samples;
+  KernelSample* samples = kernel_samples.samples;
 
   //////////////////////////////////////////////////////////////////////////////
   // Check output kernel.
 
-  EXPECT_FLOAT_EQ(fast_samples[0].uv_offset.x, -1.3333333);
-  EXPECT_FLOAT_EQ(fast_samples[0].uv_offset.y, 0);
-  EXPECT_FLOAT_EQ(fast_samples[0].coefficient, 0.3);
-  EXPECT_FLOAT_EQ(fast_samples[1].uv_offset.x, 0);
-  EXPECT_FLOAT_EQ(fast_samples[1].uv_offset.y, 0);
-  EXPECT_FLOAT_EQ(fast_samples[1].coefficient, 0.4);
-  EXPECT_FLOAT_EQ(fast_samples[2].uv_offset.x, 1.3333333);
-  EXPECT_FLOAT_EQ(fast_samples[2].uv_offset.y, 0);
-  EXPECT_FLOAT_EQ(fast_samples[2].coefficient, 0.3);
+  EXPECT_POINT_NEAR(GetUVOffset(blur_info.sample_data[0]),
+                    Point(-1.3333333, 0));
+  EXPECT_FLOAT_EQ(GetCoefficient(blur_info.sample_data[0]), 0.3);
+
+  EXPECT_POINT_NEAR(GetUVOffset(blur_info.sample_data[1]), Point(0, 0));
+  EXPECT_FLOAT_EQ(GetCoefficient(blur_info.sample_data[1]), 0.4);
+
+  EXPECT_POINT_NEAR(GetUVOffset(blur_info.sample_data[2]), Point(1.333333, 0));
+  EXPECT_FLOAT_EQ(GetCoefficient(blur_info.sample_data[2]), 0.3);
 
   //////////////////////////////////////////////////////////////////////////////
   // Check output of fast kernel versus original kernel.
@@ -549,11 +555,11 @@
     }
   };
   Scalar fast_output =
-      /*1st*/ lerp(fast_samples[0].uv_offset, data[0], data[1]) *
-          fast_samples[0].coefficient +
-      /*2nd*/ data[2] * fast_samples[1].coefficient +
-      /*3rd*/ lerp(fast_samples[2].uv_offset, data[3], data[4]) *
-          fast_samples[2].coefficient;
+      /*1st*/ lerp(GetUVOffset(blur_info.sample_data[0]), data[0], data[1]) *
+          GetCoefficient(blur_info.sample_data[0]) +
+      /*2nd*/ data[2] * GetCoefficient(blur_info.sample_data[1]) +
+      /*3rd*/ lerp(GetUVOffset(blur_info.sample_data[2]), data[3], data[4]) *
+          GetCoefficient(blur_info.sample_data[2]);
 
   EXPECT_NEAR(original_output, fast_output, 0.01);
 }
@@ -604,9 +610,9 @@
   }
 
   Scalar fast_output = 0.0;
-  for (int i = 0; i < fast_kernel_samples.sample_count; ++i) {
-    auto sample = fast_kernel_samples.samples[i];
-    fast_output += sample.coefficient * sampler(sample.uv_offset);
+  for (int i = 0; i < fast_kernel_samples.sample_count; i++) {
+    fast_output += GetCoefficient(fast_kernel_samples.sample_data[i]) *
+                   sampler(GetUVOffset(fast_kernel_samples.sample_data[i]));
   }
 
   EXPECT_NEAR(output, fast_output, 0.1);
diff --git a/impeller/entity/shaders/filters/gaussian.frag b/impeller/entity/shaders/filters/gaussian.frag
index f83a599..a6d58f8 100644
--- a/impeller/entity/shaders/filters/gaussian.frag
+++ b/impeller/entity/shaders/filters/gaussian.frag
@@ -11,16 +11,13 @@
 
 layout(constant_id = 0) const float supports_decal = 1.0;
 
-struct KernelSample {
-  vec2 uv_offset;
-  float coefficient;
-};
-
 uniform KernelSamples {
-  int sample_count;
-  KernelSample samples[50];
+  float sample_count;
+
+  // X, Y are uv offset and Z is Coefficient. W is padding.
+  vec4 sample_data[50];
 }
-blur_info;
+kernel_samples;
 
 f16vec4 Sample(f16sampler2D tex, vec2 coords) {
   if (supports_decal == 1.0) {
@@ -36,11 +33,11 @@
 void main() {
   f16vec4 total_color = f16vec4(0.0hf);
 
-  for (int i = 0; i < blur_info.sample_count; ++i) {
-    float16_t coefficient = float16_t(blur_info.samples[i].coefficient);
-    total_color +=
-        coefficient * Sample(texture_sampler,
-                             v_texture_coords + blur_info.samples[i].uv_offset);
+  for (int i = 0; i < int(kernel_samples.sample_count); i++) {
+    float16_t coefficient = float16_t(kernel_samples.sample_data[i].z);
+    total_color += coefficient *
+                   Sample(texture_sampler,
+                          v_texture_coords + kernel_samples.sample_data[i].xy);
   }
 
   frag_color = total_color;
diff --git a/impeller/geometry/vector.h b/impeller/geometry/vector.h
index 31e894c..d1358bf 100644
--- a/impeller/geometry/vector.h
+++ b/impeller/geometry/vector.h
@@ -310,6 +310,8 @@
     return *this + (v - *this) * t;
   }
 
+  constexpr Vector2 xy() const { return Vector2(x, y); }
+
   std::string ToString() const;
 };
 
diff --git a/impeller/renderer/backend/gles/buffer_bindings_gles.cc b/impeller/renderer/backend/gles/buffer_bindings_gles.cc
index 4067498..71d186f 100644
--- a/impeller/renderer/backend/gles/buffer_bindings_gles.cc
+++ b/impeller/renderer/backend/gles/buffer_bindings_gles.cc
@@ -279,20 +279,20 @@
     auto* buffer_data =
         reinterpret_cast<const GLfloat*>(buffer_ptr + member.offset);
 
-    std::vector<uint8_t> array_element_buffer;
-    if (element_count > 1) {
-      // When binding uniform arrays, the elements must be contiguous. Copy
-      // the uniforms to a temp buffer to eliminate any padding needed by the
-      // other backends.
-      array_element_buffer.resize(member.size * element_count);
+    // When binding uniform arrays, the elements must be contiguous. Copy
+    // the uniforms to a temp buffer to eliminate any padding needed by the
+    // other backends if the array elements have padding.
+    std::vector<uint8_t> array_element_buffer_;
+    if (element_count > 1 && element_stride != member.size) {
+      array_element_buffer_.resize(member.size * element_count);
       for (size_t element_i = 0; element_i < element_count; element_i++) {
-        std::memcpy(array_element_buffer.data() + element_i * member.size,
+        std::memcpy(array_element_buffer_.data() + element_i * member.size,
                     reinterpret_cast<const char*>(buffer_data) +
                         element_i * element_stride,
                     member.size);
       }
       buffer_data =
-          reinterpret_cast<const GLfloat*>(array_element_buffer.data());
+          reinterpret_cast<const GLfloat*>(array_element_buffer_.data());
     }
 
     switch (member.type) {
diff --git a/impeller/tools/malioc.json b/impeller/tools/malioc.json
index c6e8eb4..65d3b54 100644
--- a/impeller/tools/malioc.json
+++ b/impeller/tools/malioc.json
@@ -2581,9 +2581,9 @@
               "arith_cvt"
             ],
             "shortest_path_cycles": [
-              0.109375,
+              0.09375,
               0.0,
-              0.109375,
+              0.09375,
               0.0,
               0.0,
               0.0,
@@ -2593,11 +2593,11 @@
               "load_store"
             ],
             "total_cycles": [
-              0.3125,
+              0.265625,
               0.09375,
-              0.3125,
+              0.265625,
               0.0,
-              2.0,
+              1.0,
               0.25,
               0.25
             ]
@@ -2641,10 +2641,11 @@
               0.0
             ],
             "total_bound_pipelines": [
+              "arithmetic",
               "load_store"
             ],
             "total_cycles": [
-              1.6666666269302368,
+              2.0,
               2.0,
               1.0
             ]