From a42a4888b65340597d4b77f90bfeb12d230ad765 Mon Sep 17 00:00:00 2001
From: Norman Feske
Date: Sun, 26 Jan 2025 11:06:57 +0100
Subject: [PATCH] blit: Neon specialization of Blend::xrgb_a

Issue #5428
---
 repos/os/include/blit/internal/neon.h    | 113 ++++++++++++++++++++++
 repos/os/include/spec/arm_64/blit/blit.h |   2 +-
 repos/os/src/test/blit/main.cc           |  99 +++++++++++++++++++++++-
 3 files changed, 212 insertions(+), 2 deletions(-)

diff --git a/repos/os/include/blit/internal/neon.h b/repos/os/include/blit/internal/neon.h
index 0a66c8ceae3..7dfaf18a1ab 100644
--- a/repos/os/include/blit/internal/neon.h
+++ b/repos/os/include/blit/internal/neon.h
@@ -31,6 +31,16 @@ namespace Blit { struct Neon; }
 
 struct Blit::Neon
 {
+	/**
+	 * Helper for printing the raw lower 64 bits of a vector via Genode::Output
+	 */
+	template <typename T> union Printable
+	{
+		Genode::uint64_t u64; T vec;
+		Printable(T vec) : vec(vec) { }
+		void print(Output &out) const { Genode::print(out, Hex(u64)); }
+	};
+
 	static inline uint32x4_t _reversed(uint32x4_t const v)
 	{
 		return vrev64q_u32(vcombine_u32(vget_high_u32(v), vget_low_u32(v)));
@@ -163,6 +173,7 @@ struct Blit::Neon
 
 	struct B2f;
 	struct B2f_flip;
+	struct Blend;
 };
 
 
@@ -291,4 +302,106 @@ void Blit::Neon::B2f_flip::r270(uint32_t *dst, unsigned const dst_w,
 	_rotate(src_ptr4, dst_ptr4, steps, w, h);
 }
 
+
+/**
+ * Alpha blending of XRGB pixels using Neon intrinsics
+ */
+struct Blit::Neon::Blend
+{
+	static inline void xrgb_a(uint32_t *, unsigned, uint32_t const *, uint8_t const *);
+
+	/**
+	 * Blend a single foreground pixel over a background pixel
+	 *
+	 * Each 8-bit lane is computed as (fg*(alpha + 1) + bg*(256 - alpha)) >> 8.
+	 * An alpha value of 0 returns 'bg' unmodified.
+	 */
+	__attribute__((optimize("-O3")))
+	static inline uint32_t _mix(uint32_t bg, uint32_t fg, uint8_t alpha)
+	{
+		if (__builtin_expect(alpha == 0, false))
+			return bg;
+
+		/*
+		 * Compute r, g, b in the lower 3 16-bit lanes.
+		 * The upper 5 lanes are unused.
+		 */
+		uint16x8_t const
+			a   = vmovl_u8(vdup_n_u8(alpha)),
+			s   = vmovl_u8(vcreate_u8(fg)),
+			d   = vmovl_u8(vcreate_u8(bg)),
+			ar  = vaddq_u16(vdupq_n_u16(1),   a),  /* for rounding up */
+			nar = vsubq_u16(vdupq_n_u16(256), a),  /* 1.0 - alpha */
+			res = vaddq_u16(vmulq_u16(s, ar), vmulq_u16(d, nar));
+
+		return uint32_t(::uint64_t(vshrn_n_u16(res, 8)));
+	}
+
+	/**
+	 * Blend 8 foreground pixels over 8 background pixels in place
+	 *
+	 * \param bg     8 background pixels, overwritten with the result
+	 * \param fg     8 foreground pixels
+	 * \param alpha  8 alpha values, one per pixel
+	 *
+	 * The fourth byte of each background pixel is written back unchanged.
+	 */
+	__attribute__((optimize("-O3")))
+	static inline void _mix_8(uint32_t *bg, uint32_t const *fg, uint8_t const *alpha)
+	{
+		/* fetch 8 alpha values, 'vld1_u8' imposes no vector alignment on 'alpha' */
+		uint16x8_t const a = vmovl_u8(vld1_u8(alpha));
+
+		/* skip block if entirely transparent */
+		if (__builtin_expect(vmaxvq_u16(a) == 0, false))
+			return;
+
+		/* load 8 source and destination pixels */
+		uint8x8x4_t const s = vld4_u8((uint8_t const *)fg);
+		uint8x8x4_t       d = vld4_u8((uint8_t const *)bg);
+
+		/* extend r, g, b components from uint8_t to uint16_t */
+		uint16x8x4_t const
+			s_rgb { vmovl_u8(s.val[0]), vmovl_u8(s.val[1]), vmovl_u8(s.val[2]) },
+			d_rgb { vmovl_u8(d.val[0]), vmovl_u8(d.val[1]), vmovl_u8(d.val[2]) };
+
+		/* derive the factors for source and destination from the 8 alpha values */
+		uint16x8_t const
+			sa = vaddq_u16(vdupq_n_u16(1),   a),
+			da = vsubq_u16(vdupq_n_u16(256), a);  /* 1.0 - alpha */
+
+		/* mix components, keeping only their upper 8 bits */
+		for (unsigned i = 0; i < 3; i++)
+			d.val[i] = vshrn_n_u16(vaddq_u16(vmulq_u16(d_rgb.val[i], da),
+			                                 vmulq_u16(s_rgb.val[i], sa)), 8);
+		/* write 8 pixels */
+		vst4_u8((uint8_t *)bg, d);
+	}
+};
+
+
+/**
+ * Blend 'n' pixels of 'pixel' over 'dst', weighted by 'alpha'
+ *
+ * Blocks of 8 pixels are handled by '_mix_8', the remainder by '_mix'.
+ */
+__attribute__((optimize("-O3")))
+void Blit::Neon::Blend::xrgb_a(uint32_t *dst, unsigned n,
+                               uint32_t const *pixel, uint8_t const *alpha)
+{
+	unsigned const prefetch_distance = 16;  /* cache line / 32-bit pixel size */
+	for (; n > prefetch_distance; n -= 8, dst += 8, pixel += 8, alpha += 8) {
+		__builtin_prefetch(dst   + prefetch_distance);
+		__builtin_prefetch(pixel + prefetch_distance);
+		__builtin_prefetch(alpha + prefetch_distance);
+		_mix_8(dst, pixel, alpha);
+	}
+
+	for (; n > 7; n -= 8, dst += 8, pixel += 8, alpha += 8)
+		_mix_8(dst, pixel, alpha);
+
+	for (; n--; dst++, pixel++, alpha++)
+		*dst = _mix(*dst, *pixel, *alpha);
+}
+
 #endif /* _INCLUDE__BLIT__INTERNAL__NEON_H_ */
diff --git a/repos/os/include/spec/arm_64/blit/blit.h b/repos/os/include/spec/arm_64/blit/blit.h
index 74a79e62248..d6023995cda 100644
--- a/repos/os/include/spec/arm_64/blit/blit.h
+++ b/repos/os/include/spec/arm_64/blit/blit.h
@@ -21,7 +21,7 @@ namespace Blit {
 
 
 	static inline void back2front (auto &&... args) { _b2f(args...); }
-	static inline void blend_xrgb_a(auto &&... args) { Slow::Blend::xrgb_a(args...); }
+	static inline void blend_xrgb_a(auto &&... args) { Neon::Blend::xrgb_a(args...); }
 }
 
 #endif /* _INCLUDE__SPEC__ARM_64__BLIT_H_ */
diff --git a/repos/os/src/test/blit/main.cc b/repos/os/src/test/blit/main.cc
index 287d62d3fa4..716b0117f2a 100644
--- a/repos/os/src/test/blit/main.cc
+++ b/repos/os/src/test/blit/main.cc
@@ -197,7 +197,7 @@ namespace Blit {
 }
 
 
-static void test_b2f_dispatch()
+static inline void test_b2f_dispatch()
 {
 	Texture texture_landscape { nullptr, nullptr, { 640, 480 } };
 	Texture texture_portrait { nullptr, nullptr, { 480, 640 } };
@@ -274,11 +274,108 @@ static void test_b2f_dispatch()
 }
 
 
+/**
+ * Check 'SIMD::Blend' against 'Slow::Blend' and precomputed results
+ */
+template <typename SIMD>
+static inline void test_simd_blend_mix()
+{
+	struct Rgb : Genode::Hex
+	{
+		explicit Rgb(uint32_t v) : Hex(v, OMIT_PREFIX, PAD) { }
+	};
+
+	struct Mix_test
+	{
+		uint32_t bg, fg; uint8_t a; uint32_t expected;
+
+		void print(Output &out) const
+		{
+			Genode::print(out, "bg=", Rgb(bg), " fg=", Rgb(fg), " a=", a);
+		}
+	};
+
+	Mix_test mix_test[] {
+		{ .bg = 0x000000, .fg = 0x000000, .a = 0, .expected = 0x000000 },
+		{ .bg = 0x000000, .fg = 0xffffff, .a = 0, .expected = 0x000000 },
+		{ .bg = 0xffffff, .fg = 0x000000, .a = 0, .expected = 0xffffff },
+		{ .bg = 0xffffff, .fg = 0xffffff, .a = 0, .expected = 0xffffff },
+
+		{ .bg = 0x000000, .fg = 0x000000, .a = 255, .expected = 0x000000 },
+		{ .bg = 0x000000, .fg = 0xffffff, .a = 255, .expected = 0xffffff },
+		{ .bg = 0xffffff, .fg = 0x000000, .a = 255, .expected = 0x000000 },
+		{ .bg = 0xffffff, .fg = 0xffffff, .a = 255, .expected = 0xffffff },
+	};
+
+	for (Mix_test const &test : mix_test) {
+		uint32_t slow = Slow::Blend::_mix(test.bg, test.fg, test.a);
+		uint32_t simd = SIMD::Blend::_mix(test.bg, test.fg, test.a);
+		if (slow == test.expected && slow == simd) {
+			log("mix ", test, " -> slow=", Rgb(slow), " simd=", Rgb(simd));
+		} else {
+			error("mix ", test, " -> slow=", Rgb(slow), " simd=", Rgb(simd),
+			      " expected=", Rgb(test.expected));
+			throw 1;
+		}
+	}
+
+	struct Xrgb_8x
+	{
+		uint32_t values[8];
+
+		void print(Output &out) const
+		{
+			for (unsigned i = 0; i < 8; i++)
+				Genode::print(out, (i == 0) ? "" : ".", Rgb(values[i]));
+		}
+
+		bool operator != (Xrgb_8x const &other) const
+		{
+			for (unsigned i = 0; i < 8; i++)
+				if (values[i] != other.values[i])
+					return true;
+			return false;
+		}
+	};
+
+	uint32_t const ca = 0xaaaaaaaa, cb = 0xbbbbbbbb, cc = 0xcccccccc,
+	               cd = 0xdddddddd, white = 0xffffff;
+
+	Xrgb_8x black_bg { };
+	Xrgb_8x white_bg { { white, white, white, white, white, white, white, white } };
+
+	Xrgb_8x fg { { 0x001020, 0x405060, 0x8090a0, 0xc0d0e0, ca, cb, cc, cd } };
+	uint8_t alpha[8] { 63, 127, 191, 255 , 64, 64, 64, 64 };
+
+	auto test_mix_8 = [&] (auto msg, Xrgb_8x &bg, Xrgb_8x const &fg,
+	                       uint8_t const *alpha, Xrgb_8x const &expected)
+	{
+		log("fg : ", fg);
+		log("bg : ", bg);
+		SIMD::Blend::_mix_8(bg.values, fg.values, alpha);
+		log(msg, " : ", bg);
+		if (expected != bg) {
+			error("expected ", expected);
+			throw 1;
+		}
+	};
+
+	test_mix_8("blackened", black_bg, fg, alpha, { {
+		0x00000408, 0x00202830, 0x00606c78, 0x00c0d0e0,
+		0x002b2b2b, 0x002f2f2f, 0x00333333, 0x00383838 } });
+
+	test_mix_8("whitened ", white_bg, fg, alpha, { {
+		0x00c0c4c8, 0x00a0a8b0, 0x00a0acb8, 0x00c0d0e0,
+		0x00eaeaea, 0x00eeeeee, 0x00f3f3f3, 0x00f7f7f7 } });
+}
+
+
 void Component::construct(Genode::Env &)
 {
 #ifdef _INCLUDE__BLIT__INTERNAL__NEON_H_
 	log("-- ARM Neon --");
 	test_simd_b2f();
+	test_simd_blend_mix<Neon>();
 #endif
 #ifdef _INCLUDE__BLIT__INTERNAL__SSE3_H_
 	log("-- SSE3 --");