From a42a4888b65340597d4b77f90bfeb12d230ad765 Mon Sep 17 00:00:00 2001
From: Norman Feske <norman.feske@genode-labs.com>
Date: Sun, 26 Jan 2025 11:06:57 +0100
Subject: [PATCH] blit: Neon specialization of Blend::xrgb_a

Issue #5428
---
 repos/os/include/blit/internal/neon.h    | 90 ++++++++++++++++++++++
 repos/os/include/spec/arm_64/blit/blit.h |  2 +-
 repos/os/src/test/blit/main.cc           | 96 +++++++++++++++++++++++-
 3 files changed, 186 insertions(+), 2 deletions(-)
diff --git a/repos/os/include/blit/internal/neon.h b/repos/os/include/blit/internal/neon.h
index 0a66c8ceae3..7dfaf18a1ab 100644
--- a/repos/os/include/blit/internal/neon.h
+++ b/repos/os/include/blit/internal/neon.h
@@ -31,6 +31,16 @@ namespace Blit { struct Neon; }
 
 struct Blit::Neon
 {
+	/**
+	 * Helper for printing the raw lower 64 bits of a vector via Genode::Output
+	 */
+	template <typename T> union Printable
+	{
+		Genode::uint64_t u64; T vec;     /* both members share the same storage */
+		Printable(T vec) : vec(vec) { }  /* initializes 'vec' only */
+		void print(Output &out) const { Genode::print(out, Hex(u64)); }
+	};
+
 	static inline uint32x4_t _reversed(uint32x4_t const v)
 	{
 		return vrev64q_u32(vcombine_u32(vget_high_u32(v), vget_low_u32(v)));
@@ -163,6 +173,7 @@ struct Blit::Neon
 
 	struct B2f;
 	struct B2f_flip;
+	struct Blend;
 };
 
 
@@ -291,4 +302,83 @@ void Blit::Neon::B2f_flip::r270(uint32_t       *dst, unsigned const dst_w,
 	_rotate(src_ptr4, dst_ptr4, steps, w, h);
 }
 
+
+struct Blit::Neon::Blend
+{
+	/* blend a run of pixels using per-pixel alpha values, defined out of line */
+	static inline void xrgb_a(uint32_t *, unsigned, uint32_t const *, uint8_t const *);
+
+	/* mix one 'fg' pixel into 'bg' weighted by 'alpha', alpha 0 returns 'bg' as is */
+	__attribute__((optimize("-O3")))
+	static inline uint32_t _mix(uint32_t bg, uint32_t fg, uint8_t alpha)
+	{
+		if (__builtin_expect(alpha == 0, false))
+			return bg;
+
+		/* compute r, g, b in the lower 3 16-bit lanes, upper 5 lanes are unused */
+		uint16x8_t const
+			a   = vmovl_u8(vdup_n_u8(alpha)),
+			s   = vmovl_u8(vcreate_u8(fg)),
+			d   = vmovl_u8(vcreate_u8(bg)),
+			ar  = vaddq_u16(vdupq_n_u16(1),   a),  /* for rounding up */
+			nar = vsubq_u16(vdupq_n_u16(256), a),  /* 1.0 - alpha */
+			res = vaddq_u16(vmulq_u16(s, ar), vmulq_u16(d, nar));
+
+		return uint32_t(::uint64_t(vshrn_n_u16(res, 8)));
+	}
+
+	/* mix 8 'fg' pixels into the 8 pixels at 'bg', each with its own alpha value */
+	__attribute__((optimize("-O3")))
+	static inline void _mix_8(uint32_t *bg, uint32_t const *fg, uint8_t const *alpha)
+	{
+		/* fetch 8 alpha values via load intrinsic, 'alpha' may be unaligned */
+		uint16x8_t const a = vmovl_u8(vld1_u8(alpha));
+
+		/* skip block if entirely transparent */
+		if (__builtin_expect(vmaxvq_u16(a) == 0, false))
+			return;
+
+		/* load 8 source and destination pixels */
+		uint8x8x4_t const s = vld4_u8((uint8_t const *)fg);
+		uint8x8x4_t       d = vld4_u8((uint8_t const *)bg);
+
+		/* extend r, g, b components from uint8_t to uint16_t */
+		uint16x8x4_t const
+			s_rgb { vmovl_u8(s.val[0]), vmovl_u8(s.val[1]), vmovl_u8(s.val[2]) },
+			d_rgb { vmovl_u8(d.val[0]), vmovl_u8(d.val[1]), vmovl_u8(d.val[2]) };
+
+		/* derive the factors for source and destination from the alpha values */
+		uint16x8_t const
+			sa = vaddq_u16(vdupq_n_u16(1),   a),
+			da = vsubq_u16(vdupq_n_u16(256), a);  /* 1.0 - alpha */
+
+		/* mix components, keeping only their upper 8 bits */
+		for (unsigned i = 0; i < 3; i++)
+			d.val[i] = vshrn_n_u16(vaddq_u16(vmulq_u16(d_rgb.val[i], da),
+			                                 vmulq_u16(s_rgb.val[i], sa)), 8);
+		/* write 8 pixels, the 4th component of each 'bg' pixel stays as loaded */
+		vst4_u8((uint8_t *)bg, d);
+	}
+};
+
+
+__attribute__((optimize("-O3")))
+void Blit::Neon::Blend::xrgb_a(uint32_t *dst, unsigned n,
+                               uint32_t const *pixel, uint8_t const *alpha)
+{
+	unsigned const prefetch_distance = 16;  /* cache line / 32-bit pixel size */
+	for (; n > prefetch_distance; n -= 8, dst += 8, pixel += 8, alpha += 8) {
+		__builtin_prefetch(dst   + prefetch_distance);
+		__builtin_prefetch(pixel + prefetch_distance);
+		__builtin_prefetch(alpha + prefetch_distance);
+		_mix_8(dst, pixel, alpha);
+	}
+	/* remaining blocks of 8 pixels, processed without prefetching */
+	for (; n > 7; n -= 8, dst += 8, pixel += 8, alpha += 8)
+		_mix_8(dst, pixel, alpha);
+	/* trailing 0..7 pixels, mixed one at a time */
+	for (; n--; dst++, pixel++, alpha++)
+		*dst = _mix(*dst, *pixel, *alpha);
+}
+
 #endif /* _INCLUDE__BLIT__INTERNAL__NEON_H_ */
diff --git a/repos/os/include/spec/arm_64/blit/blit.h b/repos/os/include/spec/arm_64/blit/blit.h
index 74a79e62248..d6023995cda 100644
--- a/repos/os/include/spec/arm_64/blit/blit.h
+++ b/repos/os/include/spec/arm_64/blit/blit.h
@@ -21,7 +21,7 @@
 namespace Blit {
 
 	static inline void back2front  (auto &&... args) { _b2f<Neon>(args...); }
-	static inline void blend_xrgb_a(auto &&... args) { Slow::Blend::xrgb_a(args...); }
+	static inline void blend_xrgb_a(auto &&... args) { Neon::Blend::xrgb_a(args...); }
 }
 
 #endif /* _INCLUDE__SPEC__ARM_64__BLIT_H_ */
diff --git a/repos/os/src/test/blit/main.cc b/repos/os/src/test/blit/main.cc
index 287d62d3fa4..716b0117f2a 100644
--- a/repos/os/src/test/blit/main.cc
+++ b/repos/os/src/test/blit/main.cc
@@ -197,7 +197,7 @@ namespace Blit {
 }
 
 
-static void test_b2f_dispatch()
+static inline void test_b2f_dispatch()
 {
 	Texture<Pixel_rgb888> texture_landscape { nullptr, nullptr, { 640, 480 } };
 	Texture<Pixel_rgb888> texture_portrait  { nullptr, nullptr, { 480, 640 } };
@@ -274,11 +274,105 @@ static void test_b2f_dispatch()
 }
 
 
+/* Check _mix and _mix_8 of SIMD::Blend against reference results, throw on mismatch */
+template <typename SIMD>
+static inline void test_simd_blend_mix()
+{
+	struct Rgb : Genode::Hex
+	{
+		explicit Rgb(uint32_t v) : Hex(v, OMIT_PREFIX, PAD) { }
+	};
+
+	struct Mix_test
+	{
+		uint32_t bg, fg; uint8_t a; uint32_t expected;
+
+		void print(Output &out) const
+		{
+			Genode::print(out, "bg=", Rgb(bg), " fg=", Rgb(fg), " a=", a);
+		}
+	};
+
+	Mix_test mix_test[] {  /* alpha 0 keeps 'bg', alpha 255 yields 'fg' */
+		{ .bg = 0x000000, .fg = 0x000000, .a = 0,   .expected = 0x000000 },
+		{ .bg = 0x000000, .fg = 0xffffff, .a = 0,   .expected = 0x000000 },
+		{ .bg = 0xffffff, .fg = 0x000000, .a = 0,   .expected = 0xffffff },
+		{ .bg = 0xffffff, .fg = 0xffffff, .a = 0,   .expected = 0xffffff },
+
+		{ .bg = 0x000000, .fg = 0x000000, .a = 255, .expected = 0x000000 },
+		{ .bg = 0x000000, .fg = 0xffffff, .a = 255, .expected = 0xffffff },
+		{ .bg = 0xffffff, .fg = 0x000000, .a = 255, .expected = 0x000000 },
+		{ .bg = 0xffffff, .fg = 0xffffff, .a = 255, .expected = 0xffffff },
+	};
+
+	for (Mix_test const &test : mix_test) {
+		uint32_t slow = Slow::Blend::_mix(test.bg, test.fg, test.a);
+		uint32_t simd = SIMD::Blend::_mix(test.bg, test.fg, test.a);
+		if (slow == test.expected && slow == simd) {
+			log("mix ", test, " -> slow=", Rgb(slow), " simd=", Rgb(simd));
+		} else {
+			error("mix ", test, " -> slow=", Rgb(slow), " simd=", Rgb(simd),
+			      " expected=", Rgb(test.expected));
+			throw 1;
+		}
+	}
+
+	struct Xrgb_8x
+	{
+		uint32_t values[8];
+
+		void print(Output &out) const
+		{
+			for (unsigned i = 0; i < 8; i++)
+				Genode::print(out, (i == 0) ? "" : ".", Rgb(values[i]));
+		}
+
+		bool operator != (Xrgb_8x const &other) const
+		{
+			for (unsigned i = 0; i < 8; i++)
+				if (values[i] != other.values[i])
+					return true;
+			return false;
+		}
+	};
+
+	uint32_t const ca = 0xaaaaaaaa, cb = 0xbbbbbbbb, cc = 0xcccccccc,
+	               cd = 0xdddddddd, white = 0xffffff;
+	Xrgb_8x black_bg { };
+	Xrgb_8x white_bg { { white, white, white, white, white, white, white, white } };
+
+	Xrgb_8x fg       { { 0x001020, 0x405060, 0x8090a0, 0xc0d0e0, ca, cb, cc, cd } };
+	uint8_t alpha[8]   { 63,       127,      191,      255     , 64, 64, 64, 64 };
+
+	auto test_mix_8 = [&] (auto msg, Xrgb_8x &bg, Xrgb_8x const &fg,
+	                       uint8_t const *alpha, Xrgb_8x const &expected)
+	{
+		log("fg        : ", fg);
+		log("bg        : ", bg);
+		SIMD::Blend::_mix_8(bg.values, fg.values, alpha);  /* blends into 'bg' */
+		log(msg, " : ", bg);
+		if (expected != bg) {
+			error("expected ", expected);
+			throw 1;
+		}
+	};
+
+	test_mix_8("blackened", black_bg, fg, alpha, { {
+		0x00000408, 0x00202830, 0x00606c78, 0x00c0d0e0,
+		0x002b2b2b, 0x002f2f2f, 0x00333333, 0x00383838 } });
+
+	test_mix_8("whitened ", white_bg, fg, alpha, { {
+		0x00c0c4c8, 0x00a0a8b0, 0x00a0acb8, 0x00c0d0e0,
+		0x00eaeaea, 0x00eeeeee, 0x00f3f3f3, 0x00f7f7f7 } });
+}
+
+
 void Component::construct(Genode::Env &)
 {
 #ifdef _INCLUDE__BLIT__INTERNAL__NEON_H_
 	log("-- ARM Neon --");
 	test_simd_b2f<Neon>();
+	test_simd_blend_mix<Neon>();
 #endif
 #ifdef _INCLUDE__BLIT__INTERNAL__SSE3_H_
 	log("-- SSE3 --");