From 482f51a6e63c11b4ecd9fbe03b291537f9f10932 Mon Sep 17 00:00:00 2001
From: Norman Feske
Date: Thu, 23 Jan 2025 12:09:40 +0100
Subject: [PATCH] fixup "blit: SIMD-based back2front copy" (Neon cache
 locality and prefetching)

Issue #5428
---
 repos/os/include/blit/internal/neon.h | 181 ++++++++++++++++++--------
 1 file changed, 128 insertions(+), 53 deletions(-)

diff --git a/repos/os/include/blit/internal/neon.h b/repos/os/include/blit/internal/neon.h
index fc06975e734..52a35af0a87 100644
--- a/repos/os/include/blit/internal/neon.h
+++ b/repos/os/include/blit/internal/neon.h
@@ -31,8 +31,6 @@ namespace Blit { struct Neon; }
 
 struct Blit::Neon
 {
-	template <typename PTR> struct Ptr4;
-
 	static inline uint32x4_t _reversed(uint32x4_t const v)
 	{
 		return vrev64q_u32(vcombine_u32(vget_high_u32(v), vget_low_u32(v)));
@@ -59,44 +57,129 @@ struct Blit::Neon
 			*d++ = *s++;
 	};
 
-	static inline void _rotate_4_lines(auto src_ptr, auto dst_ptr,
-	                                   unsigned len, auto const src_step)
+	struct Src_ptr4
 	{
-		union Tile { uint32x4x4_t all; uint32x4_t row[4]; };
-		Tile t;
-		while (len--) {
-			t.all = vld4q_lane_u32(src_ptr.p0, t.all, 3);
-			t.all = vld4q_lane_u32(src_ptr.p1, t.all, 2);
-			t.all = vld4q_lane_u32(src_ptr.p2, t.all, 1);
-			t.all = vld4q_lane_u32(src_ptr.p3, t.all, 0);
+		uint32x4_t const *p0, *p1, *p2, *p3;
 
-			dst_ptr.append(t.row[0], t.row[1], t.row[2], t.row[3]);
-			src_ptr.incr(src_step);
-		};
+		inline Src_ptr4(uint32x4_t const *p, int const step)
+		:
+			p0(p), p1(p0 + step), p2(p1 + step), p3(p2 + step)
+		{ }
+
+		void incr_4x(int v) { p0 += v, p1 += v, p2 += v, p3 += v; }
+		void incr_y (int v) { p0 += v, p1 += v, p2 += v, p3 += v; }
+
+		void prefetch() const
+		{
+			__builtin_prefetch(p0); __builtin_prefetch(p1);
+			__builtin_prefetch(p2); __builtin_prefetch(p3);
+		}
+
+		void load(uint32x4x4_t &tile) const { tile = { *p0, *p1, *p2, *p3 }; }
 	};
 
-	template <typename PTR>
-	struct Ptr4
+	struct Dst_ptr4
 	{
-		PTR *p0, *p1, *p2, *p3;
+		uint32_t *p0, *p1, *p2, *p3;
 
-		Ptr4(PTR *p, int w) : p0(p), p1(p + w), p2(p + 2*w), p3(p + 3*w) { }
+		Dst_ptr4(uint32_t *p, int const step)
+		:
+			p0(p), p1(p0 + step), p2(p1 + step), p3(p2 + step)
+		{ }
 
-		void incr(int v) { p0 += v, p1 += v, p2 += v, p3 += v; }
+		void incr_x(int v) { p0 += v, p1 += v, p2 += v, p3 += v; }
+		void incr_y(int v) { p0 += v, p1 += v, p2 += v, p3 += v; }
 
-		void append(auto v0, auto v1, auto v2, auto v3)
+		void store(uint32x4x4_t const &tile)
 		{
-			*p0++ = v0; *p1++ = v1; *p2++ = v2; *p3++ = v3;
+			vst4q_lane_u32(p0, tile, 0);
+			vst4q_lane_u32(p1, tile, 1);
+			vst4q_lane_u32(p2, tile, 2);
+			vst4q_lane_u32(p3, tile, 3);
+		}
+	};
+
+	struct Steps
+	{
+		int const src_4x, src_y, dst_x, dst_y;
+
+		void incr_x_4 (Src_ptr4 &p) const { p.incr_4x(src_4x     ); };
+		void incr_x_16(Src_ptr4 &p) const { p.incr_4x(src_4x << 2); };
+		void incr_y_4 (Src_ptr4 &p) const { p.incr_y (src_y  << 2); };
+		void incr_y_16(Src_ptr4 &p) const { p.incr_y (src_y  << 4); };
+		void incr_x_4 (Dst_ptr4 &p) const { p.incr_x (dst_x  << 2); };
+		void incr_x_16(Dst_ptr4 &p) const { p.incr_x (dst_x  << 4); };
+		void incr_y_4 (Dst_ptr4 &p) const { p.incr_y (dst_y  << 2); };
+		void incr_y_16(Dst_ptr4 &p) const { p.incr_y (dst_y  << 4); };
+	};
+
+	__attribute__((optimize("-O3")))
+	static inline void _load_prefetch_store(Src_ptr4 &src, Dst_ptr4 &dst, Steps const steps)
+	{
+		uint32x4x4_t tile;
+		src.load(tile);
+		steps.incr_y_4(src);
+		src.prefetch();
+		dst.store(tile);
+		steps.incr_x_4(dst);
+	}
+
+	__attribute__((optimize("-O3")))
+	static inline void _rotate_16x4(Src_ptr4 src, Dst_ptr4 dst, Steps const steps)
+	{
+		for (unsigned i = 0; i < 4; i++)
+			_load_prefetch_store(src, dst, steps);
+	}
+
+	__attribute__((optimize("-O3")))
+	static inline void _rotate_16x4_last(Src_ptr4 src, Dst_ptr4 dst, Steps const steps)
+	{
+		for (unsigned i = 0; i < 3; i++)
+			_load_prefetch_store(src, dst, steps);
+
+		uint32x4x4_t tile;
+		src.load(tile);
+		dst.store(tile);
+	}
+
+	__attribute__((optimize("-O3")))
+	static inline void _rotate_16x16(Src_ptr4 src, Dst_ptr4 dst, Steps const steps)
+	{
+		for (unsigned i = 0; i < 3; i++) {
+			_rotate_16x4(src, dst, steps);
+			steps.incr_y_4(dst);
+			steps.incr_x_4(src);
 		}
+		_rotate_16x4_last(src, dst, steps);
+	}
+
+	static inline void _rotate_16_lines(Src_ptr4 src, Dst_ptr4 dst,
+	                                    Steps const steps, unsigned n)
+	{
+		for (; n > 1; n--) {
+			_rotate_16x16(src, dst, steps);
+			steps.incr_x_16(dst);
+			steps.incr_y_16(src);
+		};
+		_rotate_16x16(src, dst, steps);
 	};
 
+	static inline void _rotate(Src_ptr4 src, Dst_ptr4 dst,
+	                           Steps const steps, unsigned w, unsigned h)
+	{
+		for (unsigned i = w; i; i--) {
+			_rotate_16_lines(src, dst, steps, h);
+			steps.incr_x_16(src);
+			steps.incr_y_16(dst);
+		}
+	}
 
 	struct B2f
 	{
-		static inline void r0   (uint32_t *, unsigned, uint32_t const *, unsigned, unsigned);
-		static inline void r90  (uint32_t *, unsigned, uint32_t const *, unsigned, unsigned, unsigned);
-		static inline void r180 (uint32_t *, unsigned, uint32_t const *, unsigned, unsigned);
-		static inline void r270 (uint32_t *, unsigned, uint32_t const *, unsigned, unsigned, unsigned);
+		static inline void r0  (uint32_t *, unsigned, uint32_t const *, unsigned, unsigned);
+		static inline void r90 (uint32_t *, unsigned, uint32_t const *, unsigned, unsigned, unsigned);
+		static inline void r180(uint32_t *, unsigned, uint32_t const *, unsigned, unsigned);
+		static inline void r270(uint32_t *, unsigned, uint32_t const *, unsigned, unsigned, unsigned);
 	};
 
 	struct B2f_flip
@@ -127,14 +210,12 @@ void Blit::Neon::B2f::r90(uint32_t *dst, unsigned const dst_w,
                           uint32_t const *src, unsigned const src_w,
                           unsigned const w, unsigned const h)
 {
-	Ptr4 src_ptr4 (src + 16*src_w*(16*h - 4), 16*src_w);
-	Ptr4 dst_ptr4 ((uint32x4_t *)dst, 4*dst_w);
+	Steps const steps { 1, -4*int(src_w), 1, 16*int(dst_w) };
 
-	for (unsigned i = 4*w; i; i--) {
-		_rotate_4_lines(src_ptr4, dst_ptr4, 4*h, -4*16*src_w);
-		src_ptr4.incr(4);
-		dst_ptr4.incr(4*4*dst_w);
-	}
+	Src_ptr4 src_ptr4 ((uint32x4_t *)src + 4*src_w*(16*h - 1), steps.src_y);
+	Dst_ptr4 dst_ptr4 (dst, steps.dst_y);
+
+	_rotate(src_ptr4, dst_ptr4, steps, w, h);
 }
 
 
@@ -156,14 +237,12 @@ void Blit::Neon::B2f::r270(uint32_t *dst, unsigned const dst_w,
                            uint32_t const *src, unsigned const src_w,
                            unsigned const w, const unsigned h)
 {
-	Ptr4 src_ptr4 (src + 3*16*src_w + 16*w - 4, -16*src_w);
-	Ptr4 dst_ptr4 ((uint32x4_t *)dst + 3*4*dst_w, -4*dst_w);
+	Steps const steps { 1, 4*int(src_w), 1, -16*int(dst_w) };
 
-	for (unsigned i = 4*w; i; i--) {
-		_rotate_4_lines(src_ptr4, dst_ptr4, 4*h, 4*16*src_w);
-		src_ptr4.incr(-4);
-		dst_ptr4.incr(4*4*dst_w);
-	}
+	Src_ptr4 src_ptr4 ((uint32x4_t *)src, steps.src_y);
+	Dst_ptr4 dst_ptr4 (dst + 16*int(dst_w)*(w*16 - 1), steps.dst_y);
+
+	_rotate(src_ptr4, dst_ptr4, steps, w, h);
 }
 
 
@@ -185,14 +264,12 @@ void Blit::Neon::B2f_flip::r90(uint32_t *dst, unsigned const dst_w,
                                uint32_t const *src, unsigned const src_w,
                                unsigned const w, unsigned const h)
 {
-	Ptr4 src_ptr4 (src + 3*16*src_w, -16*src_w);
-	Ptr4 dst_ptr4 ((uint32x4_t *)dst, 4*dst_w);
+	Steps const steps { 1, 4*int(src_w), 1, 16*int(dst_w) };
 
-	for (unsigned i = 4*w; i; i--) {
-		_rotate_4_lines(src_ptr4, dst_ptr4, 4*h, 4*16*src_w);
-		src_ptr4.incr(4);
-		dst_ptr4.incr(4*4*dst_w);
-	}
+	Src_ptr4 src_ptr4 ((uint32x4_t *)src, steps.src_y);
+	Dst_ptr4 dst_ptr4 (dst, steps.dst_y);
+
+	_rotate(src_ptr4, dst_ptr4, steps, w, h);
 }
 
 
@@ -214,14 +291,12 @@ void Blit::Neon::B2f_flip::r270(uint32_t *dst, unsigned const dst_w,
                                 uint32_t const *src, unsigned const src_w,
                                 unsigned const w, const unsigned h)
 {
-	Ptr4 src_ptr4 (src + (16*h - 4)*16*src_w + 16*w, 16*src_w);
-	Ptr4 dst_ptr4 ((uint32x4_t *)dst + 3*4*dst_w, -4*dst_w);
+	Steps const steps { 1, -4*int(src_w), 1, -16*int(dst_w) };
 
-	for (unsigned i = 4*w; i; i--) {
-		src_ptr4.incr(-4);
-		_rotate_4_lines(src_ptr4, dst_ptr4, 4*h, -4*16*src_w);
-		dst_ptr4.incr(4*4*dst_w);
-	}
+	Src_ptr4 src_ptr4 ((uint32x4_t *)src + 4*src_w*(16*h - 1), steps.src_y);
+	Dst_ptr4 dst_ptr4 (dst + 16*int(dst_w)*(w*16 - 1), steps.dst_y);
+
+	_rotate(src_ptr4, dst_ptr4, steps, w, h);
 }
 
 #endif /* _INCLUDE__BLIT__INTERNAL__NEON_H_ */
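
Note (not part of the patch): the load/store pair at the heart of the new code
is a 4x4 transpose. Src_ptr4::load gathers one uint32x4_t from each of four
lines, and vst4q_lane_u32 writes lane i of all four vectors to consecutive
addresses, so destination row i receives source column i. The direction of the
rotation then comes solely from the signs of the Steps strides. A
self-contained sketch of just this mechanism, with a hypothetical main() for
illustration (assumes an AArch64 or NEON-enabled ARM toolchain):

  #include <arm_neon.h>
  #include <stdio.h>

  int main()
  {
      uint32_t src[4][4], dst[4][4];

      for (unsigned r = 0; r < 4; r++)
          for (unsigned c = 0; c < 4; c++)
              src[r][c] = 10*r + c;

      /* gather four source rows, as Src_ptr4::load does */
      uint32x4x4_t tile = { vld1q_u32(src[0]), vld1q_u32(src[1]),
                            vld1q_u32(src[2]), vld1q_u32(src[3]) };

      /* the lane index must be a compile-time constant, hence the
         unrolled calls, exactly as in Dst_ptr4::store */
      vst4q_lane_u32(dst[0], tile, 0);
      vst4q_lane_u32(dst[1], tile, 1);
      vst4q_lane_u32(dst[2], tile, 2);
      vst4q_lane_u32(dst[3], tile, 3);

      for (unsigned r = 0; r < 4; r++)
          for (unsigned c = 0; c < 4; c++)
              if (dst[r][c] != src[c][r]) {
                  printf("mismatch at %u,%u\n", r, c);
                  return 1;
              }

      printf("dst is the transpose of src\n");
      return 0;
  }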
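A second note on the prefetch placement: _load_prefetch_store advances the four
source pointers by four lines (steps.incr_y_4) before issuing
__builtin_prefetch, so the next 4x4 block of the 16x16 tile is requested from
memory while the current block is still being stored; _rotate_16x4_last omits
the prefetch for the final block so the walk does not touch memory beyond the
tile. Meanwhile the destination is written in runs of consecutive pixels on
adjacent lines, which is presumably the cache-locality gain named in the
subject. The same load/prefetch/store ordering in a minimal scalar analogue
(hypothetical helper, not from the patch):

  #include <stdint.h>
  #include <stddef.h>

  /* copy n rows of len words each, with row pitch 'pitch' (in words),
     prefetching one row ahead of the row currently being copied */
  static void copy_rows_prefetched(uint32_t *dst, uint32_t const *src,
                                   size_t len, size_t n, size_t pitch)
  {
      for (size_t r = 0; r < n; r++) {
          uint32_t const * const s = src + r*pitch;
          uint32_t       * const d = dst + r*pitch;

          /* request the next source row now, so the memory system
             fetches it while the current row is copied out */
          if (r + 1 < n)
              __builtin_prefetch(s + pitch);

          for (size_t i = 0; i < len; i++)
              d[i] = s[i];
      }
  }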