Skip to content

Commit

Permalink
blit: Blit::blend_xrgb_a
Browse files Browse the repository at this point in the history
This commit adds support for SIMD-based alpha blending, which speeds up
the alpha-compositing of the nitpicker GUI server by circa 300% on ARM
Neon and x86_64 using SSE4.1

Issue genodelabs#5428
  • Loading branch information
nfeske committed Jan 30, 2025
1 parent bdcee98 commit afde65b
Show file tree
Hide file tree
Showing 7 changed files with 347 additions and 4 deletions.
9 changes: 9 additions & 0 deletions repos/os/include/blit/blit.h
Original file line number Diff line number Diff line change
Expand Up @@ -57,6 +57,15 @@ namespace Blit {
{
_b2f<Slow>(surface, texture, rect, rotate, flip);
}

/**
 * Blend a sequence of pixels to 'dst' according to discrete alpha values
 *
 * \param dst    destination pixel buffer, updated in place
 * \param n      number of pixels to process
 * \param pixel  source pixels (XRGB), one per destination pixel
 * \param alpha  one 8-bit opacity value per pixel, 0 leaves 'dst' untouched
 *
 * Generic fallback that dispatches to the portable 'Slow' back end.
 */
static inline void blend_xrgb_a(uint32_t *dst, unsigned n,
                                uint32_t const *pixel, uint8_t const *alpha)
{
	Slow::Blend::xrgb_a(dst, n, pixel, alpha);
}
}

#endif /* _INCLUDE__BLIT_H_ */
90 changes: 90 additions & 0 deletions repos/os/include/blit/internal/neon.h
Original file line number Diff line number Diff line change
Expand Up @@ -31,6 +31,16 @@ namespace Blit { struct Neon; }

struct Blit::Neon
{
/**
 * Helper for printing the raw lower 64 bits of a vector via Genode::Output
 *
 * The union aliases the vector type 'T' with a 'uint64_t' that is printed
 * as a hexadecimal number. Intended as a debugging aid.
 */
template <typename T> union Printable
{
	Genode::uint64_t u64; T vec;
	Printable(T vec) : vec(vec) { }
	void print(Output &out) const { Genode::print(out, Hex(u64)); }
};

static inline uint32x4_t _reversed(uint32x4_t const v)
{
return vrev64q_u32(vcombine_u32(vget_high_u32(v), vget_low_u32(v)));
Expand Down Expand Up @@ -163,6 +173,7 @@ struct Blit::Neon

struct B2f;
struct B2f_flip;
struct Blend;
};


Expand Down Expand Up @@ -291,4 +302,83 @@ void Blit::Neon::B2f_flip::r270(uint32_t *dst, unsigned const dst_w,
_rotate(src_ptr4, dst_ptr4, steps, w, h);
}


struct Blit::Neon::Blend
{
	static inline void xrgb_a(uint32_t *, unsigned, uint32_t const *, uint8_t const *);

	/**
	 * Blend a single pixel
	 *
	 * \param bg     background (destination) pixel
	 * \param fg     foreground (source) pixel
	 * \param alpha  foreground opacity, 0 returns 'bg' unchanged
	 *
	 * Each component is mixed as (fg*(alpha + 1) + bg*(256 - alpha)) >> 8,
	 * computed in 16-bit lanes to retain the full product precision.
	 */
	__attribute__((optimize("-O3")))
	static inline uint32_t _mix(uint32_t bg, uint32_t fg, uint8_t alpha)
	{
		if (__builtin_expect(alpha == 0, false))
			return bg;

		/*
		 * Compute r, g, b in the lower 3 16-bit lanes.
		 * The upper 5 lanes are unused.
		 */
		uint16x8_t const
			a = vmovl_u8(vdup_n_u8(alpha)),
			s = vmovl_u8(vcreate_u8(fg)),          /* source bytes, zero-extended */
			d = vmovl_u8(vcreate_u8(bg)),          /* destination bytes, zero-extended */
			ar = vaddq_u16(vdupq_n_u16(1), a),     /* for rounding up */
			nar = vsubq_u16(vdupq_n_u16(256), a),  /* 1.0 - alpha */
			res = vaddq_u16(vmulq_u16(s, ar), vmulq_u16(d, nar));

		/* narrow each 16-bit lane to its upper 8 bits, keep the lowest 4 bytes */
		return uint32_t(::uint64_t(vshrn_n_u16(res, 8)));
	}

	/**
	 * Blend 8 consecutive pixels
	 *
	 * \param bg     pointer to 8 destination pixels, updated in place
	 * \param fg     pointer to 8 source pixels
	 * \param alpha  pointer to 8 alpha values
	 *
	 * The fourth byte lane of each destination pixel (the 'x' of xrgb) is
	 * written back unmodified.
	 */
	__attribute__((optimize("-O3")))
	static inline void _mix_8(uint32_t *bg, uint32_t const *fg, uint8_t const *alpha)
	{
		/* fetch 8 alpha values */
		uint16x8_t const a = vmovl_u8(*(uint8x8_t *)alpha);

		/* skip block if entirely transparent */
		if (__builtin_expect(vmaxvq_u16(a) == 0, false))
			return;

		/* load 8 source and destination pixels, de-interleaved by byte lane */
		uint8x8x4_t const s = vld4_u8((uint8_t const *)fg);
		uint8x8x4_t d = vld4_u8((uint8_t const *)bg);

		/* extend r, g, b components from uint8_t to uint16_t */
		uint16x8x4_t const
			s_rgb { vmovl_u8(s.val[0]), vmovl_u8(s.val[1]), vmovl_u8(s.val[2]) },
			d_rgb { vmovl_u8(d.val[0]), vmovl_u8(d.val[1]), vmovl_u8(d.val[2]) };

		/* prepare alpha as factors for source (alpha + 1) and destination */
		uint16x8_t const
			sa = vaddq_u16(vdupq_n_u16(1), a),
			da = vsubq_u16(vdupq_n_u16(256), a); /* 1.0 - alpha */

		/* mix components, keeping only their upper 8 bits */
		for (unsigned i = 0; i < 3; i++)
			d.val[i] = vshrn_n_u16(vaddq_u16(vmulq_u16(d_rgb.val[i], da),
			                                 vmulq_u16(s_rgb.val[i], sa)), 8);
		/* write 8 pixels */
		vst4_u8((uint8_t *)bg, d);
	}
};


/**
 * Blend a sequence of 'n' pixels over 'dst' according to 'alpha' values
 *
 * The bulk of the sequence is processed in blocks of 8 pixels while
 * software-prefetching the data ahead, the remainder pixel by pixel.
 */
__attribute__((optimize("-O3")))
void Blit::Neon::Blend::xrgb_a(uint32_t *dst, unsigned n,
                               uint32_t const *pixel, uint8_t const *alpha)
{
	int const prefetch_distance = 16; /* cache line / 32-bit pixel size */

	/* blocks of 8 pixels, prefetching 16 elements ahead of each stream */
	for (; n > prefetch_distance; n -= 8, dst += 8, pixel += 8, alpha += 8) {
		__builtin_prefetch(dst + prefetch_distance);
		__builtin_prefetch(pixel + prefetch_distance);
		__builtin_prefetch(alpha + prefetch_distance);
		_mix_8(dst, pixel, alpha);
	}

	/* remaining full blocks of 8, too close to the end for prefetching */
	for (; n > 7; n -= 8, dst += 8, pixel += 8, alpha += 8)
		_mix_8(dst, pixel, alpha);

	/* up to 7 trailing pixels */
	for (; n--; dst++, pixel++, alpha++)
		*dst = _mix(*dst, *pixel, *alpha);
}

#endif /* _INCLUDE__BLIT__INTERNAL__NEON_H_ */
30 changes: 30 additions & 0 deletions repos/os/include/blit/internal/slow.h
Original file line number Diff line number Diff line change
Expand Up @@ -44,6 +44,7 @@ struct Blit::Slow
{
struct B2f;
struct B2f_flip;
struct Blend;
};


Expand Down Expand Up @@ -128,4 +129,33 @@ void Blit::Slow::B2f_flip::r270(uint32_t *dst, unsigned dst_w,
_write_lines(src, src_w, dst, w, h, -8*dst_w, -1);
}


struct Blit::Slow::Blend
{
	static inline void xrgb_a(uint32_t *, unsigned, uint32_t const *, uint8_t const *);

	/**
	 * Scale the r, g, b components of 'xrgb' by 'alpha' (0..256)
	 *
	 * The green channel is handled apart from red/blue so that each
	 * multiplication operates on a sparse 32-bit word without the
	 * products bleeding into neighboring components.
	 */
	__attribute__((optimize("-O3")))
	static inline uint32_t _blend(uint32_t xrgb, unsigned alpha)
	{
		uint32_t const green    = (xrgb >> 8) & 0xff;
		uint32_t const red_blue =  xrgb & 0xff00ff;

		return ((alpha*green) & 0xff00) | (((alpha*red_blue) >> 8) & 0xff00ff);
	}

	/**
	 * Blend a single pixel as bg*(256 - alpha) + fg*(alpha + 1)
	 *
	 * An 'alpha' of 0 returns the background unchanged.
	 */
	__attribute__((optimize("-O3")))
	static inline uint32_t _mix(uint32_t bg, uint32_t fg, unsigned alpha)
	{
		if (__builtin_expect(alpha == 0, false))
			return bg;

		return _blend(bg, 256 - alpha) + _blend(fg, alpha + 1);
	}
};


/**
 * Blend a sequence of 'n' pixels over 'dst' according to 'alpha' values
 */
__attribute__((optimize("-O3")))
void Blit::Slow::Blend::xrgb_a(uint32_t *dst, unsigned n,
                               uint32_t const *pixel, uint8_t const *alpha)
{
	for (unsigned i = 0; i < n; i++)
		dst[i] = _mix(dst[i], pixel[i], alpha[i]);
}

#endif /* _INCLUDE__BLIT__INTERNAL__SLOW_H_ */
117 changes: 116 additions & 1 deletion repos/os/include/blit/internal/sse4.h
Original file line number Diff line number Diff line change
Expand Up @@ -36,6 +36,31 @@ namespace Blit { struct Sse4; };

struct Blit::Sse4
{
/**
 * Padded hex output utility
 *
 * Prints a value as fixed-width hexadecimal without the "0x" prefix,
 * used by 'Vec_as' to format vector lanes.
 */
template <typename T>
struct Phex : Hex { explicit Phex(T v) : Hex(v, OMIT_PREFIX, PAD) { } };

/**
 * Vector output utility
 *
 * Aliases a 128-bit SSE vector with an array of N lane values of type 'T'
 * and prints the lanes as dot-separated padded hex numbers. Intended as a
 * debugging aid.
 */
template <typename T>
union Vec_as
{
	__m128i v;
	static constexpr unsigned N = 128/(8*sizeof(T));  /* number of lanes */
	T u[N];

	Vec_as(__m128i v) : v(v) { }

	void print(Output &out) const
	{
		for (unsigned i = 0; i < N; i++)
			Genode::print(out, Phex(u[i]), i < (N-1) ? "." : "");
	}
};

union Tile_4x4 { __m128i pi[4]; __m128 ps[4]; };

struct Src_ptr4
Expand Down Expand Up @@ -132,6 +157,7 @@ struct Blit::Sse4

struct B2f;
struct B2f_flip;
struct Blend;
};


Expand Down Expand Up @@ -260,4 +286,93 @@ void Blit::Sse4::B2f_flip::r270(uint32_t *dst, unsigned dst_w,
_rotate(src_ptr4, dst_ptr4, steps, w, h);
}

#endif /* _INCLUDE__BLIT__INTERNAL__SSE4_H_ */

struct Blit::Sse4::Blend
{
	static inline void xrgb_a(uint32_t *, unsigned, uint32_t const *, uint8_t const *);

	/**
	 * Scale the r, g, b components of 'xrgb' by 'alpha' (0..256)
	 *
	 * Green is handled apart from red/blue so that each multiplication
	 * operates on a sparse 32-bit word without the products bleeding
	 * into neighboring components.
	 */
	__attribute__((optimize("-O3")))
	static inline uint32_t _blend(uint32_t xrgb, unsigned alpha)
	{
		uint32_t const green    = (xrgb >> 8) & 0xff;
		uint32_t const red_blue =  xrgb & 0xff00ff;

		return ((alpha*green) & 0xff00) | (((alpha*red_blue) >> 8) & 0xff00ff);
	}

	/**
	 * Blend a single pixel as bg*(256 - alpha) + fg*(alpha + 1)
	 *
	 * Scalar fallback for the pixels not covered by '_mix_4'. An 'alpha'
	 * of 0 returns the background unchanged.
	 */
	__attribute__((optimize("-O3")))
	static inline uint32_t _mix(uint32_t bg, uint32_t fg, unsigned alpha)
	{
		if (__builtin_expect(alpha == 0, false))
			return bg;

		return _blend(bg, 256 - alpha) + _blend(fg, alpha + 1);
	}

	struct Mix_masks
	{
		/* masks for distributing alpha values to 16-bit r, g, b lanes */
		__m128i const a01 = _mm_set_epi32(0x03020302, 0x03020302, 0x01000100, 0x01000100);
		__m128i const a23 = _mm_set_epi32(0x07060706, 0x07060706, 0x05040504, 0x05040504);
	};

	__attribute__((optimize("-O3")))
	static inline void _mix_4(uint32_t *, uint32_t const *, uint8_t const *, Mix_masks const);
};


/**
 * Blend 4 consecutive pixels
 *
 * \param bg     pointer to 4 destination pixels, updated in place
 * \param fg     pointer to 4 source pixels
 * \param alpha  pointer to 4 alpha values
 * \param masks  shuffle masks distributing alpha to the 16-bit color lanes
 */
__attribute__((optimize("-O3")))
void Blit::Sse4::Blend::_mix_4(uint32_t *bg, uint32_t const *fg, uint8_t const *alpha, Mix_masks const masks)
{
	/* fetch the 4 alpha values as one (possibly unaligned) 32-bit word */
	uint32_t const a_u8_x4 = *(uint32_t const *)alpha;

	/* skip block if entirely transparent */
	if (__builtin_expect(a_u8_x4 == 0, false))
		return;

	/* move the two upper 32-bit pixels into the lower half of the vector */
	auto upper_half = [&] (__m128i const v) { return _mm_shuffle_epi32(v, 2 + (3<<2)); };

	__m128i const
		/* load four foreground pixel, background pixel, and alpha values */
		fg_u8_4x4 = _mm_loadu_si128((__m128i const *)fg),
		bg_u8_4x4 = _mm_loadu_si128((__m128i const *)bg),

		/* extract pixel pairs, zero-extending each byte to a 16-bit lane */
		fg01_u16_4x2 = _mm_cvtepu8_epi16(fg_u8_4x4),
		fg23_u16_4x2 = _mm_cvtepu8_epi16(upper_half(fg_u8_4x4)),
		bg01_u16_4x2 = _mm_cvtepu8_epi16(bg_u8_4x4),
		bg23_u16_4x2 = _mm_cvtepu8_epi16(upper_half(bg_u8_4x4)),

		/* prepare 4 destination (256 - alpha) and source (alpha + 1) factors */
		a_u16_x4 = _mm_cvtepu8_epi16(_mm_set1_epi32(a_u8_x4)),
		da_u16_x4 = _mm_sub_epi16(_mm_set1_epi16(256), a_u16_x4),
		sa_u16_x4 = _mm_add_epi16(a_u16_x4, _mm_set1_epi16(1)),

		/* mix first pixel pair */
		da01_u16_4x2 = _mm_shuffle_epi8(da_u16_x4, masks.a01),
		sa01_u16_4x2 = _mm_shuffle_epi8(sa_u16_x4, masks.a01),
		mixed01 = _mm_add_epi16(_mm_mullo_epi16(fg01_u16_4x2, sa01_u16_4x2),
		                        _mm_mullo_epi16(bg01_u16_4x2, da01_u16_4x2)),

		/* mix second pixel pair */
		da23_u16_4x2 = _mm_shuffle_epi8(da_u16_x4, masks.a23),
		sa23_u16_4x2 = _mm_shuffle_epi8(sa_u16_x4, masks.a23),
		mixed23 = _mm_add_epi16(_mm_mullo_epi16(fg23_u16_4x2, sa23_u16_4x2),
		                        _mm_mullo_epi16(bg23_u16_4x2, da23_u16_4x2)),

		/* keep the upper 8 bits of each product, re-pack into 16 bytes */
		result_4x4 = _mm_packus_epi16(_mm_srli_epi16(mixed01, 8),
		                              _mm_srli_epi16(mixed23, 8));

	_mm_storeu_si128((__m128i *)bg, result_4x4);
}


/**
 * Blend a sequence of 'n' pixels over 'dst' according to 'alpha' values
 *
 * Pixels are processed in SIMD groups of four, the remainder is handled
 * by the scalar '_mix'.
 */
__attribute__((optimize("-O3")))
void Blit::Sse4::Blend::xrgb_a(uint32_t *dst, unsigned n,
                               uint32_t const *pixel, uint8_t const *alpha)
{
	Mix_masks const mix_masks { };

	while (n >= 4) {
		_mix_4(dst, pixel, alpha, mix_masks);
		n -= 4; dst += 4; pixel += 4; alpha += 4;
	}

	while (n) {
		*dst = _mix(*dst, *pixel, *alpha);
		n--; dst++; pixel++; alpha++;
	}
}

#endif /* _INCLUDE__BLIT__INTERNAL__SSE3_H_ */
4 changes: 3 additions & 1 deletion repos/os/include/spec/arm_64/blit/blit.h
Original file line number Diff line number Diff line change
Expand Up @@ -16,10 +16,12 @@

#include <blit/types.h>
#include <blit/internal/neon.h>
#include <blit/internal/slow.h>

namespace Blit {

static inline void back2front(auto &&... args) { _b2f<Neon>(args...); }
/* dispatch to the ARM-Neon-accelerated back end */
static inline void back2front (auto &&... args) { _b2f<Neon>(args...); }
static inline void blend_xrgb_a(auto &&... args) { Neon::Blend::xrgb_a(args...); }
}

#endif /* _INCLUDE__SPEC__ARM_64__BLIT_H_ */
4 changes: 3 additions & 1 deletion repos/os/include/spec/x86_64/blit/blit.h
Original file line number Diff line number Diff line change
Expand Up @@ -16,10 +16,12 @@

#include <blit/types.h>
#include <blit/internal/sse4.h>
#include <blit/internal/slow.h>

namespace Blit {

static inline void back2front(auto &&... args) { _b2f<Sse4>(args...); }
/* dispatch to the SSE4-accelerated back end */
static inline void back2front (auto &&... args) { _b2f<Sse4>(args...); }
static inline void blend_xrgb_a(auto &&... args) { Sse4::Blend::xrgb_a(args...); }
}

#endif /* _INCLUDE__SPEC__X86_64__BLIT_H_ */
Loading

0 comments on commit afde65b

Please sign in to comment.