diff --git a/repos/os/include/blit/blit.h b/repos/os/include/blit/blit.h
index 2b04cbe282a..30108da2467 100644
--- a/repos/os/include/blit/blit.h
+++ b/repos/os/include/blit/blit.h
@@ -57,6 +57,15 @@ namespace Blit {
 	{ _b2f(surface, texture, rect, rotate, flip); }
+
+	/**
+	 * Blend a sequence of pixels to 'dst' according to discrete alpha values
+	 */
+	static inline void blend_xrgb_a(uint32_t *dst, unsigned n,
+	                                uint32_t const *pixel, uint8_t const *alpha)
+	{
+		Slow::Blend::xrgb_a(dst, n, pixel, alpha);
+	}
 }
 
 #endif /* _INCLUDE__BLIT_H_ */
diff --git a/repos/os/include/blit/internal/neon.h b/repos/os/include/blit/internal/neon.h
index 0a66c8ceae3..7dfaf18a1ab 100644
--- a/repos/os/include/blit/internal/neon.h
+++ b/repos/os/include/blit/internal/neon.h
@@ -31,6 +31,16 @@ namespace Blit { struct Neon; }
 
 struct Blit::Neon
 {
+	/**
+	 * Helper for printing the raw lower 64 bits of a vector via Genode::Output
+	 */
+	template <typename T> union Printable
+	{
+		Genode::uint64_t u64; T vec;
+		Printable(T vec) : vec(vec) { }
+		void print(Output &out) const { Genode::print(out, Hex(u64)); }
+	};
+
 	static inline uint32x4_t _reversed(uint32x4_t const v)
 	{
 		return vrev64q_u32(vcombine_u32(vget_high_u32(v), vget_low_u32(v)));
@@ -163,6 +173,7 @@
 
 	struct B2f;
 	struct B2f_flip;
+	struct Blend;
 };
 
@@ -291,4 +302,83 @@ void Blit::Neon::B2f_flip::r270(uint32_t *dst, unsigned const dst_w,
 	_rotate(src_ptr4, dst_ptr4, steps, w, h);
 }
 
+
+struct Blit::Neon::Blend
+{
+	static inline void xrgb_a(uint32_t *, unsigned, uint32_t const *, uint8_t const *);
+
+	__attribute__((optimize("-O3")))
+	static inline uint32_t _mix(uint32_t bg, uint32_t fg, uint8_t alpha)
+	{
+		if (__builtin_expect(alpha == 0, false))
+			return bg;
+
+		/*
+		 * Compute r, g, b in the lower 3 16-bit lanes.
+		 * The upper 5 lanes are unused.
+		 */
+		uint16x8_t const
+			a   = vmovl_u8(vdup_n_u8(alpha)),
+			s   = vmovl_u8(vcreate_u8(fg)),
+			d   = vmovl_u8(vcreate_u8(bg)),
+			ar  = vaddq_u16(vdupq_n_u16(1),   a),   /* for rounding up */
+			nar = vsubq_u16(vdupq_n_u16(256), a),   /* 1.0 - alpha */
+			res = vaddq_u16(vmulq_u16(s, ar), vmulq_u16(d, nar));
+
+		return uint32_t(::uint64_t(vshrn_n_u16(res, 8)));
+	}
+
+	__attribute__((optimize("-O3")))
+	static inline void _mix_8(uint32_t *bg, uint32_t const *fg, uint8_t const *alpha)
+	{
+		/* fetch 8 alpha values */
+		uint16x8_t const a = vmovl_u8(*(uint8x8_t *)alpha);
+
+		/* skip block if entirely transparent */
+		if (__builtin_expect(vmaxvq_u16(a) == 0, false))
+			return;
+
+		/* load 8 source and destination pixels */
+		uint8x8x4_t const s = vld4_u8((uint8_t const *)fg);
+		uint8x8x4_t       d = vld4_u8((uint8_t const *)bg);
+
+		/* extend r, g, b components from uint8_t to uint16_t */
+		uint16x8x4_t const
+			s_rgb { vmovl_u8(s.val[0]), vmovl_u8(s.val[1]), vmovl_u8(s.val[2]) },
+			d_rgb { vmovl_u8(d.val[0]), vmovl_u8(d.val[1]), vmovl_u8(d.val[2]) };
+
+		/* prepare the 8 alpha values as factors for source and destination */
+		uint16x8_t const
+			sa = vaddq_u16(vdupq_n_u16(1),   a),
+			da = vsubq_u16(vdupq_n_u16(256), a);   /* 1.0 - alpha */
+
+		/* mix components, keeping only their upper 8 bits */
+		for (unsigned i = 0; i < 3; i++)
+			d.val[i] = vshrn_n_u16(vaddq_u16(vmulq_u16(d_rgb.val[i], da),
+			                                 vmulq_u16(s_rgb.val[i], sa)), 8);
+
+		/* write 8 pixels */
+		vst4_u8((uint8_t *)bg, d);
+	}
+};
+
+
+__attribute__((optimize("-O3")))
+void Blit::Neon::Blend::xrgb_a(uint32_t *dst, unsigned n,
+                               uint32_t const *pixel, uint8_t const *alpha)
+{
+	int const prefetch_distance = 16;   /* cache line / 32-bit pixel size */
+
+	for (; n > prefetch_distance; n -= 8, dst += 8, pixel += 8, alpha += 8) {
+		__builtin_prefetch(dst   + prefetch_distance);
+		__builtin_prefetch(pixel + prefetch_distance);
+		__builtin_prefetch(alpha + prefetch_distance);
+		_mix_8(dst, pixel, alpha);
+	}
+
+	for (; n > 7; n -= 8, dst += 8, pixel += 8, alpha += 8)
+		_mix_8(dst, pixel, alpha);
+
+	for (; n--; dst++, pixel++, alpha++)
+		*dst = _mix(*dst, *pixel, *alpha);
+}
+
 #endif /* _INCLUDE__BLIT__INTERNAL__NEON_H_ */
diff --git a/repos/os/include/blit/internal/slow.h b/repos/os/include/blit/internal/slow.h
index c4e0954c66f..39a1e6a0383 100644
--- a/repos/os/include/blit/internal/slow.h
+++ b/repos/os/include/blit/internal/slow.h
@@ -44,6 +44,7 @@ struct Blit::Slow
 {
 	struct B2f;
 	struct B2f_flip;
+	struct Blend;
 };
 
@@ -128,4 +129,33 @@ void Blit::Slow::B2f_flip::r270(uint32_t *dst, unsigned dst_w,
 	_write_lines(src, src_w, dst, w, h, -8*dst_w, -1);
 }
 
+
+struct Blit::Slow::Blend
+{
+	static inline void xrgb_a(uint32_t *, unsigned, uint32_t const *, uint8_t const *);
+
+	__attribute__((optimize("-O3")))
+	static inline uint32_t _blend(uint32_t xrgb, unsigned alpha)
+	{
+		return (alpha * ((xrgb & 0xff00) >> 8) & 0xff00)
+		     | (((alpha * (xrgb & 0xff00ff)) >> 8) & 0xff00ff);
+	}
+
+	__attribute__((optimize("-O3")))
+	static inline uint32_t _mix(uint32_t bg, uint32_t fg, unsigned alpha)
+	{
+		return (__builtin_expect(alpha == 0, false))
+			? bg : _blend(bg, 256 - alpha) + _blend(fg, alpha + 1);
+	}
+};
+
+
+__attribute__((optimize("-O3")))
+void Blit::Slow::Blend::xrgb_a(uint32_t *dst, unsigned n,
+                               uint32_t const *pixel, uint8_t const *alpha)
+{
+	for (; n--; dst++, pixel++, alpha++)
+		*dst = _mix(*dst, *pixel, *alpha);
+}
+
 #endif /* _INCLUDE__BLIT__INTERNAL__SLOW_H_ */
diff --git a/repos/os/include/blit/internal/sse4.h b/repos/os/include/blit/internal/sse4.h
index 959f24337a0..fd78224d8bc 100644
--- a/repos/os/include/blit/internal/sse4.h
+++ b/repos/os/include/blit/internal/sse4.h
@@ -36,6 +36,31 @@ namespace Blit { struct Sse4; };
 
 struct Blit::Sse4
 {
+	/**
+	 * Padded hex output utility
+	 */
+	template <typename T>
+	struct Phex : Hex { explicit Phex(T v) : Hex(v, OMIT_PREFIX, PAD) { } };
+
+	/**
+	 * Vector output utility
+	 */
+	template <typename T>
+	union Vec_as
+	{
+		__m128i v;
+		static constexpr unsigned N = 128/(8*sizeof(T));
+		T u[N];
+
+		Vec_as(__m128i v) : v(v) { }
+
+		void print(Output &out) const
+		{
+			for (unsigned i = 0; i < N; i++)
+				Genode::print(out, Phex(u[i]), i < (N-1) ? "." : "");
+		}
+	};
+
 	union Tile_4x4 { __m128i pi[4]; __m128 ps[4]; };
 
 	struct Src_ptr4
 	{
@@ -132,6 +157,7 @@
 
 	struct B2f;
 	struct B2f_flip;
+	struct Blend;
 };
 
@@ -260,4 +286,91 @@ void Blit::Sse4::B2f_flip::r270(uint32_t *dst, unsigned dst_w,
 	_rotate(src_ptr4, dst_ptr4, steps, w, h);
 }
 
-#endif /* _INCLUDE__BLIT__INTERNAL__SSE4_H_ */
+
+struct Blit::Sse4::Blend
+{
+	static inline void xrgb_a(uint32_t *, unsigned, uint32_t const *, uint8_t const *);
+
+	__attribute__((optimize("-O3")))
+	static inline uint32_t _blend(uint32_t xrgb, unsigned alpha)
+	{
+		return (alpha * ((xrgb & 0xff00) >> 8) & 0xff00)
+		     | (((alpha * (xrgb & 0xff00ff)) >> 8) & 0xff00ff);
+	}
+
+	__attribute__((optimize("-O3")))
+	static inline uint32_t _mix(uint32_t bg, uint32_t fg, unsigned alpha)
+	{
+		return (__builtin_expect(alpha == 0, false))
+			? bg : _blend(bg, 256 - alpha) + _blend(fg, alpha + 1);
+	}
+
+	struct Mix_masks
+	{
+		/* masks for distributing alpha values to 16-bit r, g, b lanes */
+		__m128i const a01 = _mm_set_epi32(0x03020302, 0x03020302, 0x01000100, 0x01000100);
+		__m128i const a23 = _mm_set_epi32(0x07060706, 0x07060706, 0x05040504, 0x05040504);
+	};
+
+	__attribute__((optimize("-O3")))
+	static inline void _mix_4(uint32_t *, uint32_t const *, uint8_t const *, Mix_masks const);
+};
+
+
+__attribute__((optimize("-O3")))
+void Blit::Sse4::Blend::_mix_4(uint32_t *bg, uint32_t const *fg, uint8_t const *alpha, Mix_masks const masks)
+{
+	uint32_t const a_u8_x4 = *(uint32_t const *)alpha;
+
+	if (__builtin_expect(a_u8_x4 == 0, false))
+		return;
+
+	/* load four foreground pixels, background pixels, and alpha values */
+	__m128i const fg_u8_4x4 = _mm_loadu_si128((__m128i const *)fg);
+	__m128i const bg_u8_4x4 = _mm_loadu_si128((__m128i const *)bg);
+
+	auto upper_half = [&] (__m128i const v) { return _mm_shuffle_epi32(v, 2 + (3<<2)); };
+
+	/* extract first and second pair of pixel values */
+	__m128i const fg01_u16_4x2 = _mm_cvtepu8_epi16(fg_u8_4x4);
+	__m128i const fg23_u16_4x2 = _mm_cvtepu8_epi16(upper_half(fg_u8_4x4));
+	__m128i const bg01_u16_4x2 = _mm_cvtepu8_epi16(bg_u8_4x4);
+	__m128i const bg23_u16_4x2 = _mm_cvtepu8_epi16(upper_half(bg_u8_4x4));
+
+	/* prepare 4 destination and source alpha values */
+	__m128i const a_u16_x4  = _mm_cvtepu8_epi16(_mm_set1_epi32(a_u8_x4));
+	__m128i const da_u16_x4 = _mm_sub_epi16(_mm_set1_epi16(256), a_u16_x4);
+	__m128i const sa_u16_x4 = _mm_add_epi16(a_u16_x4, _mm_set1_epi16(1));
+
+	/* mix first pixel pair */
+	__m128i const da01_u16_4x2 = _mm_shuffle_epi8(da_u16_x4, masks.a01);
+	__m128i const sa01_u16_4x2 = _mm_shuffle_epi8(sa_u16_x4, masks.a01);
+	__m128i const mixed01 = _mm_add_epi16(_mm_mullo_epi16(fg01_u16_4x2, sa01_u16_4x2),
+	                                      _mm_mullo_epi16(bg01_u16_4x2, da01_u16_4x2));
+
+	/* mix second pixel pair */
+	__m128i const da23_u16_4x2 = _mm_shuffle_epi8(da_u16_x4, masks.a23);
+	__m128i const sa23_u16_4x2 = _mm_shuffle_epi8(sa_u16_x4, masks.a23);
+	__m128i const mixed23 = _mm_add_epi16(_mm_mullo_epi16(fg23_u16_4x2, sa23_u16_4x2),
+	                                      _mm_mullo_epi16(bg23_u16_4x2, da23_u16_4x2));
+
+	__m128i const res_4x4 = _mm_packus_epi16(_mm_srli_epi16(mixed01, 8),
+	                                         _mm_srli_epi16(mixed23, 8));
+	_mm_storeu_si128((__m128i *)bg, res_4x4);
+}
+
+
+__attribute__((optimize("-O3")))
+void Blit::Sse4::Blend::xrgb_a(uint32_t *dst, unsigned n,
+                               uint32_t const *pixel, uint8_t const *alpha)
+{
+	Mix_masks const mix_masks { };
+
+	for (; n > 3; n -= 4, dst += 4, pixel += 4, alpha += 4)
+		_mix_4(dst, pixel, alpha, mix_masks);
+
+	for (; n--; dst++, pixel++, alpha++)
+		*dst = _mix(*dst, *pixel, *alpha);
+}
+
+#endif /* _INCLUDE__BLIT__INTERNAL__SSE4_H_ */
diff --git a/repos/os/include/spec/arm_64/blit/blit.h b/repos/os/include/spec/arm_64/blit/blit.h
index 8a62d8ca8a2..d6023995cda 100644
--- a/repos/os/include/spec/arm_64/blit/blit.h
+++ b/repos/os/include/spec/arm_64/blit/blit.h
@@ -16,10 +16,12 @@
 #include 
 #include 
+#include 
 
 namespace Blit {
 
-	static inline void back2front(auto &&... args) { _b2f(args...); }
+	static inline void back2front  (auto &&... args) { _b2f(args...); }
+	static inline void blend_xrgb_a(auto &&... args) { Neon::Blend::xrgb_a(args...); }
 }
 
 #endif /* _INCLUDE__SPEC__ARM_64__BLIT_H_ */
diff --git a/repos/os/include/spec/x86_64/blit/blit.h b/repos/os/include/spec/x86_64/blit/blit.h
index ef9291a474b..67fbb60fc9d 100644
--- a/repos/os/include/spec/x86_64/blit/blit.h
+++ b/repos/os/include/spec/x86_64/blit/blit.h
@@ -16,10 +16,12 @@
 #include 
 #include 
+#include 
 
 namespace Blit {
 
-	static inline void back2front(auto &&... args) { _b2f(args...); }
+	static inline void back2front  (auto &&... args) { _b2f(args...); }
+	static inline void blend_xrgb_a(auto &&... args) { Sse4::Blend::xrgb_a(args...); }
 }
 
 #endif /* _INCLUDE__SPEC__X86_64__BLIT_H_ */
diff --git a/repos/os/src/test/blit/main.cc b/repos/os/src/test/blit/main.cc
index b4c961e86b3..790740ee262 100644
--- a/repos/os/src/test/blit/main.cc
+++ b/repos/os/src/test/blit/main.cc
@@ -197,7 +197,7 @@ namespace Blit {
 }
 
-static void test_b2f_dispatch()
+static inline void test_b2f_dispatch()
 {
 	Texture texture_landscape { nullptr, nullptr, { 640, 480 } };
 	Texture texture_portrait  { nullptr, nullptr, { 480, 640 } };
@@ -274,15 +274,110 @@
 }
 
+
+template <typename SIMD>
+static inline void test_simd_blend_mix()
+{
+	struct Rgb : Genode::Hex
+	{
+		explicit Rgb(uint32_t v) : Hex(v, OMIT_PREFIX, PAD) { }
+	};
+
+	struct Mix_test
+	{
+		uint32_t bg, fg; uint8_t a; uint32_t expected;
+
+		void print(Output &out) const
+		{
+			Genode::print(out, "bg=", Rgb(bg), " fg=", Rgb(fg), " a=", a);
+		}
+	};
+
+	Mix_test mix_test[] {
+		{ .bg = 0x000000, .fg = 0x000000, .a = 0,   .expected = 0x000000 },
+		{ .bg = 0x000000, .fg = 0xffffff, .a = 0,   .expected = 0x000000 },
+		{ .bg = 0xffffff, .fg = 0x000000, .a = 0,   .expected = 0xffffff },
+		{ .bg = 0xffffff, .fg = 0xffffff, .a = 0,   .expected = 0xffffff },
+
+		{ .bg = 0x000000, .fg = 0x000000, .a = 255, .expected = 0x000000 },
+		{ .bg = 0x000000, .fg = 0xffffff, .a = 255, .expected = 0xffffff },
+		{ .bg = 0xffffff, .fg = 0x000000, .a = 255, .expected = 0x000000 },
+		{ .bg = 0xffffff, .fg = 0xffffff, .a = 255, .expected = 0xffffff },
+	};
+
+	for (Mix_test const &test : mix_test) {
+		uint32_t slow = Slow::Blend::_mix(test.bg, test.fg, test.a);
+		uint32_t simd = SIMD::Blend::_mix(test.bg, test.fg, test.a);
+		if (slow == test.expected && slow == simd) {
+			log("mix ", test, " -> slow=", Rgb(slow), " simd=", Rgb(simd));
+		} else {
+			error("mix ", test, " -> slow=", Rgb(slow), " simd=", Rgb(simd),
+			      " expected=", Rgb(test.expected));
+			throw 1;
+		}
+	}
+
+	struct Xrgb_8x
+	{
+		uint32_t values[8];
+
+		void print(Output &out) const
+		{
+			for (unsigned i = 0; i < 8; i++)
+				Genode::print(out, (i == 0) ? "" : ".", Rgb(values[i]));
+		}
+
+		bool operator != (Xrgb_8x const &other) const
+		{
+			for (unsigned i = 0; i < 8; i++)
+				if (values[i] != other.values[i])
+					return true;
+			return false;
+		}
+	};
+
+	uint32_t const ca = 0xaaaaaa, cb = 0xbbbbbb, cc = 0xcccccc, cd = 0xdddddd,
+	               white = 0xffffff;
+
+	Xrgb_8x black_bg { };
+	Xrgb_8x white_bg { { white, white, white, white, white, white, white, white } };
+
+	Xrgb_8x fg { { 0x001020, 0x405060, 0x8090a0, 0xc0d0e0, ca, cb, cc, cd } };
+
+	uint8_t alpha[8] { 63, 127, 191, 255, 64, 64, 64, 64 };
+
+	auto test_mix_8 = [&] (auto msg, Xrgb_8x &bg, Xrgb_8x const &fg,
+	                       uint8_t const *alpha, Xrgb_8x const &expected)
+	{
+		log("fg : ", fg);
+		log("bg : ", bg);
+		SIMD::Blend::xrgb_a(bg.values, 8, fg.values, alpha);
+		log(msg, " : ", bg);
+		if (expected != bg) {
+			error("expected ", expected);
+			throw 1;
+		}
+	};
+
+	test_mix_8("blackened", black_bg, fg, alpha, { {
+		0x00000408, 0x00202830, 0x00606c78, 0x00c0d0e0,
+		0x002b2b2b, 0x002f2f2f, 0x00333333, 0x00383838 } });
+
+	test_mix_8("whitened ", white_bg, fg, alpha, { {
+		0x00c0c4c8, 0x00a0a8b0, 0x00a0acb8, 0x00c0d0e0,
+		0x00eaeaea, 0x00eeeeee, 0x00f3f3f3, 0x00f7f7f7 } });
+}
+
+
 void Component::construct(Genode::Env &)
 {
 #ifdef _INCLUDE__BLIT__INTERNAL__NEON_H_
 	log("-- ARM Neon --");
 	test_simd_b2f<Neon>();
+	test_simd_blend_mix<Neon>();
 #endif
 #ifdef _INCLUDE__BLIT__INTERNAL__SSE4_H_
 	log("-- SSE4 --");
 	test_simd_b2f<Sse4>();
+	test_simd_blend_mix<Sse4>();
 #endif
 	test_b2f_dispatch();