From 0a756e44abf8b5e5a0a561700c88fde4fc7f98fe Mon Sep 17 00:00:00 2001 From: Norman Feske Date: Thu, 16 Jan 2025 19:35:01 +0100 Subject: [PATCH] blit: SIMD-based back2front copy Issue #5428 --- repos/os/include/blit/blit.h | 72 ++++-- repos/os/include/blit/internal/neon.h | 294 ++++++++++++++++++++++ repos/os/include/blit/internal/slow.h | 131 ++++++++++ repos/os/include/blit/internal/sse4.h | 263 +++++++++++++++++++ repos/os/include/blit/types.h | 161 ++++++++++++ repos/os/include/spec/arm_64/blit/blit.h | 25 ++ repos/os/include/spec/x86_64/blit/blit.h | 25 ++ repos/os/recipes/api/blit/content.mk | 2 + repos/os/recipes/pkg/test-blit/README | 1 + repos/os/recipes/pkg/test-blit/archives | 1 + repos/os/recipes/pkg/test-blit/runtime | 13 + repos/os/recipes/src/test-blit/content.mk | 2 + repos/os/recipes/src/test-blit/hash | 1 + repos/os/recipes/src/test-blit/used_apis | 2 + repos/os/src/test/blit/main.cc | 291 +++++++++++++++++++++ repos/os/src/test/blit/target.mk | 3 + 16 files changed, 1265 insertions(+), 22 deletions(-) create mode 100644 repos/os/include/blit/internal/neon.h create mode 100644 repos/os/include/blit/internal/slow.h create mode 100644 repos/os/include/blit/internal/sse4.h create mode 100644 repos/os/include/blit/types.h create mode 100644 repos/os/include/spec/arm_64/blit/blit.h create mode 100644 repos/os/include/spec/x86_64/blit/blit.h create mode 100644 repos/os/recipes/pkg/test-blit/README create mode 100644 repos/os/recipes/pkg/test-blit/archives create mode 100644 repos/os/recipes/pkg/test-blit/runtime create mode 100644 repos/os/recipes/src/test-blit/content.mk create mode 100644 repos/os/recipes/src/test-blit/hash create mode 100644 repos/os/recipes/src/test-blit/used_apis create mode 100644 repos/os/src/test/blit/main.cc create mode 100644 repos/os/src/test/blit/target.mk diff --git a/repos/os/include/blit/blit.h b/repos/os/include/blit/blit.h index c8a55331c41..2b04cbe282a 100644 --- a/repos/os/include/blit/blit.h +++ b/repos/os/include/blit/blit.h @@ -1,34 +1,62 @@ /* - * \brief Interface of 2D-copy library + * \brief Blit API * \author Norman Feske - * \date 2007-10-10 + * \date 2025-01-16 */ /* - * Copyright (C) 2007-2017 Genode Labs GmbH + * Copyright (C) 2025 Genode Labs GmbH * * This file is part of the Genode OS framework, which is distributed * under the terms of the GNU Affero General Public License version 3. */ -#ifndef _INCLUDE__BLIT__BLIT_H_ -#define _INCLUDE__BLIT__BLIT_H_ +#ifndef _INCLUDE__BLIT_H_ +#define _INCLUDE__BLIT_H_ -/** - * Blit memory from source buffer to destination buffer - * - * \param src address of source buffer - * \param src_w line length of source buffer in bytes - * \param dst address of destination buffer - * \param dst_w line length of destination buffer in bytes - * \param w number of bytes per line to copy - * \param h number of lines to copy - * - * This function works at a granularity of 16bit. - * If the source and destination overlap, the result - * of the copy operation is not defined. - */ -extern "C" void blit(void const *src, unsigned src_w, - void *dst, unsigned dst_w, int w, int h); +#include +#include + +namespace Blit { + + /** + * Back-to-front copy + * + * Copy a rectangular part of a texture to a surface while optionally + * applying rotation and flipping. The width and height of the texture + * must be divisible by 8. Surface and texture must perfectly line up. + * E.g., when rotating by 90 degrees, the texture width must equal the + * surface height and vice versa. The clipping area of the surface is + * ignored. + * + * The combination of rotate and flip arguments works as follows: + * + * normal flipped + * + * rotated 0 0 1 2 3 3 2 1 0 + * 4 5 6 7 7 6 5 4 + * 8 9 10 11 11 10 9 8 + * + * rotated 90 8 4 0 0 4 8 + * 9 5 1 1 5 9 + * 10 6 2 2 6 10 + * 11 7 3 3 7 11 + * + * rotated 180 11 10 9 8 8 9 10 11 + * 7 6 5 4 4 5 6 7 + * 3 2 1 0 0 1 2 3 + * + * rotated 270 3 7 11 11 7 3 + * 2 6 10 10 6 2 + * 1 5 9 9 5 1 + * 0 4 8 8 4 0 + */ + static inline void back2front(Surface &surface, + Texture const &texture, + Rect rect, Rotate rotate, Flip flip) + { + _b2f(surface, texture, rect, rotate, flip); + } +} -#endif /* _INCLUDE__BLIT__BLIT_H_ */ +#endif /* _INCLUDE__BLIT_H_ */ diff --git a/repos/os/include/blit/internal/neon.h b/repos/os/include/blit/internal/neon.h new file mode 100644 index 00000000000..0a66c8ceae3 --- /dev/null +++ b/repos/os/include/blit/internal/neon.h @@ -0,0 +1,294 @@ +/* + * \brief 2D memory copy using ARM NEON + * \author Norman Feske + * \date 2025-01-16 + */ + +/* + * Copyright (C) 2025 Genode Labs GmbH + * + * This file is part of the Genode OS framework, which is distributed + * under the terms of the GNU Affero General Public License version 3. + */ + +#ifndef _INCLUDE__BLIT__INTERNAL__NEON_H_ +#define _INCLUDE__BLIT__INTERNAL__NEON_H_ + +#include + +/* compiler intrinsics */ +#pragma GCC diagnostic push +#pragma GCC diagnostic ignored "-Wnarrowing" +#pragma GCC diagnostic ignored "-Wconversion" +#pragma GCC diagnostic ignored "-Wfloat-conversion" +#pragma GCC diagnostic ignored "-Wunused-parameter" +#include +#pragma GCC diagnostic pop + + +namespace Blit { struct Neon; } + + +struct Blit::Neon +{ + static inline uint32x4_t _reversed(uint32x4_t const v) + { + return vrev64q_u32(vcombine_u32(vget_high_u32(v), vget_low_u32(v))); + } + + static inline void _reverse_line(uint32x4_t const *src, uint32x4_t *dst, unsigned len) + { + src += len; /* move 'src' from end of line towards begin */ + while (len--) + *dst++ = _reversed(*--src); + }; + + static inline void _copy_line(uint32x4_t const *s, uint32x4_t *d, unsigned len) + { + while (len--) + *d++ = *s++; + }; + + struct Src_ptr4 + { + uint32x4_t const *p0, *p1, *p2, *p3; + + inline Src_ptr4(uint32x4_t const *p, int const step) + : + p0(p), p1(p0 + step), p2(p1 + step), p3(p2 + step) + { } + + void incr_4(int const v) { p0 += v, p1 += v, p2 += v, p3 += v; } + + void prefetch() const + { + __builtin_prefetch(p0); __builtin_prefetch(p1); + __builtin_prefetch(p2); __builtin_prefetch(p3); + } + + void load(uint32x4x4_t &tile) const { tile = { *p0, *p1, *p2, *p3 }; } + }; + + struct Dst_ptr4 + { + uint32_t *p0, *p1, *p2, *p3; + + Dst_ptr4(uint32_t *p, int const step) + : + p0(p), p1(p0 + step), p2(p1 + step), p3(p2 + step) + { } + + void incr(int const v) { p0 += v, p1 += v, p2 += v, p3 += v; } + + void store(uint32x4x4_t const &tile) + { + vst4q_lane_u32(p0, tile, 0); + vst4q_lane_u32(p1, tile, 1); + vst4q_lane_u32(p2, tile, 2); + vst4q_lane_u32(p3, tile, 3); + } + }; + + struct Steps + { + int const src_y, dst_y; + + void incr_x_4(Src_ptr4 &p) const { p.incr_4(1); }; + void incr_x_8(Src_ptr4 &p) const { p.incr_4(2); }; + void incr_y_4(Src_ptr4 &p) const { p.incr_4(src_y << 2); }; + void incr_y_8(Src_ptr4 &p) const { p.incr_4(src_y << 3); }; + + void incr_x_4(Dst_ptr4 &p) const { p.incr(4); }; + void incr_x_8(Dst_ptr4 &p) const { p.incr(8); }; + void incr_y_4(Dst_ptr4 &p) const { p.incr(dst_y << 2); }; + void incr_y_8(Dst_ptr4 &p) const { p.incr(dst_y << 3); }; + }; + + __attribute__((optimize("-O3"))) + static inline void _load_prefetch_store(Src_ptr4 &src, Dst_ptr4 &dst, Steps const steps) + { + uint32x4x4_t tile; + src.load(tile); + steps.incr_y_4(src); + src.prefetch(); + dst.store(tile); + steps.incr_x_4(dst); + } + + __attribute__((optimize("-O3"))) + static inline void _rotate_8x4(Src_ptr4 src, Dst_ptr4 dst, Steps const steps) + { + for (unsigned i = 0; i < 2; i++) + _load_prefetch_store(src, dst, steps); + } + + __attribute__((optimize("-O3"))) + static inline void _rotate_8x4_last(Src_ptr4 src, Dst_ptr4 dst, Steps const steps) + { + _load_prefetch_store(src, dst, steps); + uint32x4x4_t tile; + src.load(tile); + dst.store(tile); + } + + __attribute__((optimize("-O3"))) + static inline void _rotate_8x8(Src_ptr4 src, Dst_ptr4 dst, Steps const steps) + { + _rotate_8x4(src, dst, steps); + steps.incr_y_4(dst); + steps.incr_x_4(src); + _rotate_8x4_last(src, dst, steps); + } + + __attribute__((optimize("-O3"))) + static inline void _rotate_8_lines(Src_ptr4 src, Dst_ptr4 dst, + Steps const steps, unsigned n) + { + for (; n; n--) { + _rotate_8x8(src, dst, steps); + steps.incr_y_8(dst); + steps.incr_x_8(src); + } + }; + + static inline void _rotate(Src_ptr4 src, Dst_ptr4 dst, + Steps const steps, unsigned w, unsigned h) + { + for (unsigned i = h; i; i--) { + _rotate_8_lines(src, dst, steps, w); + steps.incr_y_8(src); + steps.incr_x_8(dst); + } + } + + struct B2f; + struct B2f_flip; +}; + + +struct Blit::Neon::B2f +{ + static inline void r0 (uint32_t *, unsigned, uint32_t const *, unsigned, unsigned); + static inline void r90 (uint32_t *, unsigned, uint32_t const *, unsigned, unsigned, unsigned); + static inline void r180(uint32_t *, unsigned, uint32_t const *, unsigned, unsigned); + static inline void r270(uint32_t *, unsigned, uint32_t const *, unsigned, unsigned, unsigned); +}; + + +void Blit::Neon::B2f::r0(uint32_t *dst, unsigned const line_w, + uint32_t const *src, unsigned const w, unsigned const h) +{ + uint32x4_t const *s = (uint32x4_t const *)src; + uint32x4_t *d = (uint32x4_t *)dst; + + for (unsigned lines = h*8; lines; lines--) { + _copy_line(s, d, 2*w); + s += 2*line_w; + d += 2*line_w; + } +} + + +void Blit::Neon::B2f::r90(uint32_t *dst, unsigned const dst_w, + uint32_t const *src, unsigned const src_w, + unsigned const w, unsigned const h) +{ + Steps const steps { -2*int(src_w), 8*int(dst_w) }; + + Src_ptr4 src_ptr4 ((uint32x4_t *)src + 2*src_w*(8*h - 1), steps.src_y); + Dst_ptr4 dst_ptr4 (dst, steps.dst_y); + + _rotate(src_ptr4, dst_ptr4, steps, w, h); +} + + +void Blit::Neon::B2f::r180(uint32_t *dst, unsigned const line_w, + uint32_t const *src, unsigned const w, unsigned const h) +{ + uint32x4_t *d = (uint32x4_t *)dst; + uint32x4_t const *s = (uint32x4_t const *)src + 2*line_w*8*h; + + for (unsigned i = h*8; i; i--) { + s -= 2*line_w; + _reverse_line(s, d, 2*w); + d += 2*line_w; + } +} + + +void Blit::Neon::B2f::r270(uint32_t *dst, unsigned const dst_w, + uint32_t const *src, unsigned const src_w, + unsigned const w, const unsigned h) +{ + Steps const steps { 2*int(src_w), -8*int(dst_w) }; + + Src_ptr4 src_ptr4 ((uint32x4_t *)src, steps.src_y); + Dst_ptr4 dst_ptr4 (dst + 8*int(dst_w)*(w*8 - 1), steps.dst_y); + + _rotate(src_ptr4, dst_ptr4, steps, w, h); +} + + +struct Blit::Neon::B2f_flip +{ + static inline void r0 (uint32_t *, unsigned, uint32_t const *, unsigned, unsigned); + static inline void r90 (uint32_t *, unsigned, uint32_t const *, unsigned, unsigned, unsigned); + static inline void r180(uint32_t *, unsigned, uint32_t const *, unsigned, unsigned); + static inline void r270(uint32_t *, unsigned, uint32_t const *, unsigned, unsigned, unsigned); +}; + + +void Blit::Neon::B2f_flip::r0(uint32_t *dst, unsigned const line_w, + uint32_t const *src, unsigned const w, unsigned const h) +{ + uint32x4_t const *s = (uint32x4_t const *)src; + uint32x4_t *d = (uint32x4_t *)dst; + + for (unsigned lines = h*8; lines; lines--) { + _reverse_line(s, d, 2*w); + s += 2*line_w; + d += 2*line_w; + } +} + + +void Blit::Neon::B2f_flip::r90(uint32_t *dst, unsigned const dst_w, + uint32_t const *src, unsigned const src_w, + unsigned const w, unsigned const h) +{ + Steps const steps { 2*int(src_w), 8*int(dst_w) }; + + Src_ptr4 src_ptr4 ((uint32x4_t *)src, steps.src_y); + Dst_ptr4 dst_ptr4 (dst, steps.dst_y); + + _rotate(src_ptr4, dst_ptr4, steps, w, h); +} + + +void Blit::Neon::B2f_flip::r180(uint32_t *dst, unsigned const line_w, + uint32_t const *src, unsigned const w, unsigned const h) +{ + uint32x4_t const *s = (uint32x4_t const *)src + 2*line_w*8*h; + uint32x4_t *d = (uint32x4_t *)dst; + + for (unsigned lines = h*8; lines; lines--) { + s -= 2*line_w; + _copy_line(s, d, 2*w); + d += 2*line_w; + } +} + + +void Blit::Neon::B2f_flip::r270(uint32_t *dst, unsigned const dst_w, + uint32_t const *src, unsigned const src_w, + unsigned const w, const unsigned h) +{ + Steps const steps { -2*int(src_w), -8*int(dst_w) }; + + Src_ptr4 src_ptr4 ((uint32x4_t *)src + 2*src_w*(8*h - 1), steps.src_y); + Dst_ptr4 dst_ptr4 (dst + 8*int(dst_w)*(w*8 - 1), steps.dst_y); + + _rotate(src_ptr4, dst_ptr4, steps, w, h); +} + +#endif /* _INCLUDE__BLIT__INTERNAL__NEON_H_ */ diff --git a/repos/os/include/blit/internal/slow.h b/repos/os/include/blit/internal/slow.h new file mode 100644 index 00000000000..c4e0954c66f --- /dev/null +++ b/repos/os/include/blit/internal/slow.h @@ -0,0 +1,131 @@ +/* + * \brief Fallback 2D memory copy + * \author Norman Feske + * \date 2025-01-16 + */ + +/* + * Copyright (C) 2025 Genode Labs GmbH + * + * This file is part of the Genode OS framework, which is distributed + * under the terms of the GNU Affero General Public License version 3. + */ + +#ifndef _INCLUDE__BLIT__INTERNAL__SLOW_H_ +#define _INCLUDE__BLIT__INTERNAL__SLOW_H_ + +#include + +namespace Blit { + + struct Slow; + + static inline void _write_line(uint32_t const *src, uint32_t *dst, + unsigned len, int dst_step) + { + for (; len--; dst += dst_step) + *dst = *src++; + } + + static inline void _write_lines(uint32_t const *src, unsigned src_w, + uint32_t *dst, + unsigned w, unsigned h, int dx, int dy) + { + for (unsigned lines = h*8; lines; lines--) { + _write_line(src, dst, 8*w, dx); + src += 8*src_w; + dst += dy; + } + }; +} + + +struct Blit::Slow +{ + struct B2f; + struct B2f_flip; +}; + + +struct Blit::Slow::B2f +{ + static inline void r0 (uint32_t *, unsigned, uint32_t const *, unsigned, unsigned); + static inline void r90 (uint32_t *, unsigned, uint32_t const *, unsigned, unsigned, unsigned); + static inline void r180(uint32_t *, unsigned, uint32_t const *, unsigned, unsigned); + static inline void r270(uint32_t *, unsigned, uint32_t const *, unsigned, unsigned, unsigned); +}; + + +void Blit::Slow::B2f::r0(uint32_t *dst, unsigned line_w, + uint32_t const *src, unsigned w, unsigned h) +{ + _write_lines(src, line_w, dst, w, h, 1, 8*line_w); +} + + +void Blit::Slow::B2f::r90(uint32_t *dst, unsigned dst_w, + uint32_t const *src, unsigned src_w, + unsigned w, unsigned h) +{ + _write_lines(src, src_w, dst + 8*h - 1, w, h, 8*dst_w, -1); +} + + +void Blit::Slow::B2f::r180(uint32_t *dst, unsigned line_w, + uint32_t const *src, unsigned w, unsigned h) +{ + dst += 8*w - 1 + (8*h - 1)*8*line_w; + _write_lines(src, line_w, dst, w, h, -1, -8*line_w); +} + + +void Blit::Slow::B2f::r270(uint32_t *dst, unsigned dst_w, + uint32_t const *src, unsigned src_w, + unsigned w, unsigned h) +{ + dst += 8*dst_w*(8*w - 1); + _write_lines(src, src_w, dst, w, h, -8*dst_w, 1); +} + + +struct Blit::Slow::B2f_flip +{ + static inline void r0 (uint32_t *, unsigned, uint32_t const *, unsigned, unsigned); + static inline void r90 (uint32_t *, unsigned, uint32_t const *, unsigned, unsigned, unsigned); + static inline void r180(uint32_t *, unsigned, uint32_t const *, unsigned, unsigned); + static inline void r270(uint32_t *, unsigned, uint32_t const *, unsigned, unsigned, unsigned); +}; + + +void Blit::Slow::B2f_flip::r0(uint32_t *dst, unsigned line_w, + uint32_t const *src, unsigned w, unsigned h) +{ + _write_lines(src, line_w, dst + 8*w - 1, w, h, -1, 8*line_w); +} + + +void Blit::Slow::B2f_flip::r90(uint32_t *dst, unsigned dst_w, + uint32_t const *src, unsigned src_w, + unsigned w, unsigned h) +{ + _write_lines(src, src_w, dst, w, h, 8*dst_w, 1); +} + + +void Blit::Slow::B2f_flip::r180(uint32_t *dst, unsigned line_w, + uint32_t const *src, unsigned w, unsigned h) +{ + dst += (8*h - 1)*8*line_w; + _write_lines(src, line_w, dst, w, h, 1, -8*line_w); +} + + +void Blit::Slow::B2f_flip::r270(uint32_t *dst, unsigned dst_w, + uint32_t const *src, unsigned src_w, + unsigned w, unsigned h) +{ + dst += 8*h - 1 + 8*dst_w*(8*w - 1); + _write_lines(src, src_w, dst, w, h, -8*dst_w, -1); +} + +#endif /* _INCLUDE__BLIT__INTERNAL__SLOW_H_ */ diff --git a/repos/os/include/blit/internal/sse4.h b/repos/os/include/blit/internal/sse4.h new file mode 100644 index 00000000000..959f24337a0 --- /dev/null +++ b/repos/os/include/blit/internal/sse4.h @@ -0,0 +1,263 @@ +/* + * \brief 2D memory copy using SSE4 + * \author Norman Feske + * \date 2025-01-21 + */ + +/* + * Copyright (C) 2025 Genode Labs GmbH + * + * This file is part of the Genode OS framework, which is distributed + * under the terms of the GNU Affero General Public License version 3. + */ + +#ifndef _INCLUDE__BLIT__INTERNAL__SSE4_H_ +#define _INCLUDE__BLIT__INTERNAL__SSE4_H_ + +#include + +/* compiler intrinsics */ +#ifndef _MM_MALLOC_H_INCLUDED /* discharge dependency from stdlib.h */ +#define _MM_MALLOC_H_INCLUDED +#define _MM_MALLOC_H_INCLUDED_PREVENTED +#endif +#pragma GCC diagnostic push +#pragma GCC diagnostic ignored "-Wconversion" +#include +#pragma GCC diagnostic pop +#ifdef _MM_MALLOC_H_INCLUDED_PREVENTED +#undef _MM_MALLOC_H_INCLUDED +#undef _MM_MALLOC_H_INCLUDED_PREVENTED +#endif + + +namespace Blit { struct Sse4; }; + + +struct Blit::Sse4 +{ + union Tile_4x4 { __m128i pi[4]; __m128 ps[4]; }; + + struct Src_ptr4 + { + __m128i const *p0, *p1, *p2, *p3; + + inline Src_ptr4(__m128i const *p, int const step) + : + p0(p), p1(p0 + step), p2(p1 + step), p3(p2 + step) + { } + + void incr_4(int v) { p0 += v, p1 += v, p2 += v, p3 += v; } + + void prefetch() const + { + _mm_prefetch(p0, _MM_HINT_T0); _mm_prefetch(p1, _MM_HINT_T0); + _mm_prefetch(p2, _MM_HINT_T0); _mm_prefetch(p3, _MM_HINT_T0); + } + + void load(Tile_4x4 &tile) const + { + tile.pi[0] = _mm_load_si128(p0); tile.pi[1] = _mm_load_si128(p1); + tile.pi[2] = _mm_load_si128(p2); tile.pi[3] = _mm_load_si128(p3); + } + }; + + struct Dst_ptr4 + { + __m128i *p0, *p1, *p2, *p3; + + Dst_ptr4(__m128i *p, int const step_4) + : + p0(p), p1(p0 + step_4), p2(p1 + step_4), p3(p2 + step_4) + { } + + void incr_4(int v) { p0 += v, p1 += v, p2 += v, p3 += v; } + + void store(Tile_4x4 const &tile) const + { + _mm_stream_si128(p0, tile.pi[0]); _mm_stream_si128(p1, tile.pi[1]); + _mm_stream_si128(p2, tile.pi[2]); _mm_stream_si128(p3, tile.pi[3]); + } + }; + + struct Steps { int src_y_4, dst_y_4; }; + + static inline void _reverse_line(__m128i const *s, __m128i *d, unsigned len_8) + { + static constexpr int reversed = (0 << 6) | (1 << 4) | (2 << 2) | 3; + + d += 2*len_8; /* move 'dst' from end towards begin */ + + while (len_8--) { + __m128i const v0 = _mm_load_si128(s++); + __m128i const v1 = _mm_load_si128(s++); + _mm_stream_si128(--d, _mm_shuffle_epi32(v0, reversed)); + _mm_stream_si128(--d, _mm_shuffle_epi32(v1, reversed)); + } + }; + + static inline void _copy_line(__m128i const *s, __m128i *d, unsigned len_8) + { + while (len_8--) { + __m128i const v0 = _mm_load_si128(s++); + __m128i const v1 = _mm_load_si128(s++); + _mm_stream_si128(d++, v0); + _mm_stream_si128(d++, v1); + } + }; + + static inline void _rotate_4_lines(Src_ptr4 src, Dst_ptr4 dst, + unsigned len_4, auto const dst_4_step) + { + Tile_4x4 t; + while (len_4--) { + src.load(t); + src.incr_4(1); + src.prefetch(); + _MM_TRANSPOSE4_PS(t.ps[0], t.ps[1], t.ps[2], t.ps[3]); + dst.store(t); + dst.incr_4(dst_4_step); + }; + }; + + static inline void _rotate(Src_ptr4 src, Dst_ptr4 dst, + Steps const steps, unsigned w, unsigned h) + { + for (unsigned i = 2*h; i; i--) { + _rotate_4_lines(src, dst, 2*w, 4*steps.dst_y_4); + src.incr_4(4*steps.src_y_4); + dst.incr_4(1); + } + } + + struct B2f; + struct B2f_flip; +}; + + +struct Blit::Sse4::B2f +{ + static inline void r0 (uint32_t *, unsigned, uint32_t const *, unsigned, unsigned); + static inline void r90 (uint32_t *, unsigned, uint32_t const *, unsigned, unsigned, unsigned); + static inline void r180(uint32_t *, unsigned, uint32_t const *, unsigned, unsigned); + static inline void r270(uint32_t *, unsigned, uint32_t const *, unsigned, unsigned, unsigned); +}; + + +void Blit::Sse4::B2f::r0(uint32_t *dst, unsigned line_w, + uint32_t const *src, unsigned w, unsigned h) +{ + __m128i const *s = (__m128i const *)src; + __m128i *d = (__m128i *)dst; + + for (unsigned lines = h*8; lines; lines--) { + _copy_line(s, d, w); + s += 2*line_w; + d += 2*line_w; + } +} + + +void Blit::Sse4::B2f::r90(uint32_t *dst, unsigned dst_w, + uint32_t const *src, unsigned src_w, + unsigned w, unsigned h) +{ + Steps const steps { -2*int(src_w), 2*int(dst_w) }; + + Src_ptr4 src_ptr4 ((__m128i *)src + 2*src_w*(8*h - 1), steps.src_y_4); + Dst_ptr4 dst_ptr4 ((__m128i *)dst, steps.dst_y_4); + + _rotate(src_ptr4, dst_ptr4, steps, w, h); +} + + +void Blit::Sse4::B2f::r180(uint32_t *dst, unsigned line_w, + uint32_t const *src, unsigned w, unsigned h) +{ + __m128i *d = (__m128i *)dst; + __m128i const *s = (__m128i const *)src + 2*line_w*8*h; + + for (unsigned i = h*8; i; i--) { + s -= 2*line_w; + _reverse_line(s, d, w); + d += 2*line_w; + } +} + + +void Blit::Sse4::B2f::r270(uint32_t *dst, unsigned dst_w, + uint32_t const *src, unsigned src_w, + unsigned w, unsigned h) +{ + Steps const steps { 2*int(src_w), -2*int(dst_w) }; + + Src_ptr4 src_ptr4 ((__m128i *)src, steps.src_y_4); + Dst_ptr4 dst_ptr4 ((__m128i *)dst + 2*int(dst_w)*(8*w - 1), steps.dst_y_4); + + _rotate(src_ptr4, dst_ptr4, steps, w, h); +} + + +struct Blit::Sse4::B2f_flip +{ + static inline void r0 (uint32_t *, unsigned, uint32_t const *, unsigned, unsigned); + static inline void r90 (uint32_t *, unsigned, uint32_t const *, unsigned, unsigned, unsigned); + static inline void r180(uint32_t *, unsigned, uint32_t const *, unsigned, unsigned); + static inline void r270(uint32_t *, unsigned, uint32_t const *, unsigned, unsigned, unsigned); +}; + + +void Blit::Sse4::B2f_flip::r0(uint32_t *dst, unsigned line_w, + uint32_t const *src, unsigned w, unsigned h) +{ + __m128i const *s = (__m128i const *)src; + __m128i *d = (__m128i *)dst; + + for (unsigned lines = h*8; lines; lines--) { + _reverse_line(s, d, w); + s += 2*line_w; + d += 2*line_w; + } +} + + +void Blit::Sse4::B2f_flip::r90(uint32_t *dst, unsigned dst_w, + uint32_t const *src, unsigned src_w, + unsigned w, unsigned h) +{ + Steps const steps { 2*int(src_w), 2*int(dst_w) }; + + Src_ptr4 src_ptr4 ((__m128i *)src, steps.src_y_4); + Dst_ptr4 dst_ptr4 ((__m128i *)dst, steps.dst_y_4); + + _rotate(src_ptr4, dst_ptr4, steps, w, h); +} + + +void Blit::Sse4::B2f_flip::r180(uint32_t *dst, unsigned line_w, + uint32_t const *src, unsigned w, unsigned h) +{ + __m128i const *s = (__m128i const *)src + 2*line_w*8*h; + __m128i *d = (__m128i *)dst; + + for (unsigned lines = h*8; lines; lines--) { + s -= 2*line_w; + _copy_line(s, d, w); + d += 2*line_w; + } +} + + +void Blit::Sse4::B2f_flip::r270(uint32_t *dst, unsigned dst_w, + uint32_t const *src, unsigned src_w, + unsigned w, unsigned h) +{ + Steps const steps { -2*int(src_w), -2*int(dst_w) }; + + Src_ptr4 src_ptr4 ((__m128i *)src + 2*int(src_w)*(h*8 - 1), steps.src_y_4); + Dst_ptr4 dst_ptr4 ((__m128i *)dst + 2*int(dst_w)*(w*8 - 1), steps.dst_y_4); + + _rotate(src_ptr4, dst_ptr4, steps, w, h); +} + +#endif /* _INCLUDE__BLIT__INTERNAL__SSE4_H_ */ diff --git a/repos/os/include/blit/types.h b/repos/os/include/blit/types.h new file mode 100644 index 00000000000..ea3afe48b4c --- /dev/null +++ b/repos/os/include/blit/types.h @@ -0,0 +1,161 @@ +/* + * \brief Types and utilities used for 2D memory copy + * \author Norman Feske + * \date 2025-01-16 + */ + +/* + * Copyright (C) 2025 Genode Labs GmbH + * + * This file is part of the Genode OS framework, which is distributed + * under the terms of the GNU Affero General Public License version 3. + */ + +#ifndef _INCLUDE__BLIT__TYPES_H_ +#define _INCLUDE__BLIT__TYPES_H_ + +/* Genode includes */ +#include +#include +#include + +namespace Blit { + + using namespace Genode; + + using Rect = Surface_base::Rect; + using Area = Surface_base::Area; + using Point = Surface_base::Point; + + enum class Rotate { R0, R90, R180, R270 }; + + struct Flip { bool enabled; }; + + static bool swap_w_h(Rotate r) { return r == Rotate::R90 || r == Rotate::R270; } + + static Area transformed(Area a, Rotate rotate) + { + return swap_w_h(rotate) ? Area { a.h, a.w } : a; + } + + static Point transformed(Point p, Area area, Rotate rotate, Flip flip) + { + int const w = area.w, h = area.h; + + switch (rotate) { + case Rotate::R0: break; + case Rotate::R90: p = { .x = h - p.y - 1, .y = p.x }; break; + case Rotate::R180: p = { .x = w - p.x - 1, .y = h - p.y - 1 }; break; + case Rotate::R270: p = { .x = p.y, .y = w - p.x - 1 }; break; + } + + if (flip.enabled) + p = { int(transformed(area, rotate).w) - p.x - 1, p.y }; + + return p; + } + + static Rect transformed(Rect r, Area area, Rotate rotate, Flip flip) + { + auto rect_from_points = [&] (Point p1, Point p2) + { + return Rect::compound(Point { min(p1.x, p2.x), min(p1.y, p2.y) }, + Point { max(p1.x, p2.x), max(p1.y, p2.y) }); + }; + return rect_from_points(transformed(r.p1(), area, rotate, flip), + transformed(r.p2(), area, rotate, flip)); + } + + static Rect snapped_to_8x8_grid(Rect r) + { + return Rect::compound(Point { .x = r.x1() & ~0x7, + .y = r.y1() & ~0x7 }, + Point { .x = ((r.x2() + 8) & ~0x7) - 1, + .y = ((r.y2() + 8) & ~0x7) - 1 }); + } + + template + static inline void _b2f(uint32_t *dst, unsigned dst_w, + uint32_t const *src, unsigned src_w, + unsigned w, unsigned h, Rotate rotate) + { + switch (rotate) { + case Rotate::R0: B2F::r0 (dst, dst_w, src, w, h); break; + case Rotate::R90: B2F::r90 (dst, dst_w, src, src_w, w, h); break; + case Rotate::R180: B2F::r180(dst, dst_w, src, w, h); break; + case Rotate::R270: B2F::r270(dst, dst_w, src, src_w, w, h); break; + } + } + + template + static inline void _b2f(Surface &surface, + Texture const &texture, + Rect rect, Rotate rotate, Flip flip) + { + /* surface size must be divisible by 8 */ + if (!aligned(surface.size().w, 2) || !aligned(surface.size().h, 2)) { + warning("surface size ", surface.size(), " not divisible by 8"); + return; + } + + /* check compatibility of surface with texture */ + if (transformed(surface.size(), rotate) != texture.size()) { + warning("surface ", surface.size(), " mismatches texture ", texture.size()); + return; + } + + /* restrict rect to texture size */ + rect = Rect::intersect(rect, Rect { { }, texture.size() }); + + /* compute base addresses of affected pixel window */ + Rect const src_rect = snapped_to_8x8_grid(rect); + Rect const dst_rect = transformed(src_rect, texture.size(), rotate, flip); + + uint32_t const * const src = (uint32_t const *)texture.pixel() + + src_rect.y1()*texture.size().w + + src_rect.x1(); + + uint32_t * const dst = (uint32_t *)surface.addr() + + dst_rect.y1()*surface.size().w + + dst_rect.x1(); + + /* coordinates converted to 8x8 units */ + unsigned const src_w = texture.size().w >> 3, + dst_w = surface.size().w >> 3, + w = src_rect.area.w >> 3, + h = src_rect.area.h >> 3; + + if (w && h) { + if (flip.enabled) + _b2f(dst, dst_w, src, src_w, w, h, rotate); + else + _b2f (dst, dst_w, src, src_w, w, h, rotate); + } + + surface.flush_pixels(dst_rect); + } +} + + +/**************** + ** Legacy API ** + ****************/ + +/** + * Blit memory from source buffer to destination buffer + * + * \param src address of source buffer + * \param src_w line length of source buffer in bytes + * \param dst address of destination buffer + * \param dst_w line length of destination buffer in bytes + * \param w number of bytes per line to copy + * \param h number of lines to copy + * + * This function works at a granularity of 16bit. + * If the source and destination overlap, the result + * of the copy operation is not defined. + */ +extern "C" void blit(void const *src, unsigned src_w, + void *dst, unsigned dst_w, int w, int h); + +#endif /* _INCLUDE__BLIT__TYPES_H_ */ diff --git a/repos/os/include/spec/arm_64/blit/blit.h b/repos/os/include/spec/arm_64/blit/blit.h new file mode 100644 index 00000000000..8a62d8ca8a2 --- /dev/null +++ b/repos/os/include/spec/arm_64/blit/blit.h @@ -0,0 +1,25 @@ +/* + * \brief Blit API + * \author Norman Feske + * \date 2025-01-16 + */ + +/* + * Copyright (C) 2025 Genode Labs GmbH + * + * This file is part of the Genode OS framework, which is distributed + * under the terms of the GNU Affero General Public License version 3. + */ + +#ifndef _INCLUDE__SPEC__ARM_64__BLIT_H_ +#define _INCLUDE__SPEC__ARM_64__BLIT_H_ + +#include +#include + +namespace Blit { + + static inline void back2front(auto &&... args) { _b2f(args...); } +} + +#endif /* _INCLUDE__SPEC__ARM_64__BLIT_H_ */ diff --git a/repos/os/include/spec/x86_64/blit/blit.h b/repos/os/include/spec/x86_64/blit/blit.h new file mode 100644 index 00000000000..ef9291a474b --- /dev/null +++ b/repos/os/include/spec/x86_64/blit/blit.h @@ -0,0 +1,25 @@ +/* + * \brief Blit API + * \author Norman Feske + * \date 2025-01-16 + */ + +/* + * Copyright (C) 2025 Genode Labs GmbH + * + * This file is part of the Genode OS framework, which is distributed + * under the terms of the GNU Affero General Public License version 3. + */ + +#ifndef _INCLUDE__SPEC__X86_64__BLIT_H_ +#define _INCLUDE__SPEC__X86_64__BLIT_H_ + +#include +#include + +namespace Blit { + + static inline void back2front(auto &&... args) { _b2f(args...); } +} + +#endif /* _INCLUDE__SPEC__X86_64__BLIT_H_ */ diff --git a/repos/os/recipes/api/blit/content.mk b/repos/os/recipes/api/blit/content.mk index 72d2ed4aad9..e5dabaf1416 100644 --- a/repos/os/recipes/api/blit/content.mk +++ b/repos/os/recipes/api/blit/content.mk @@ -1,4 +1,6 @@ MIRROR_FROM_REP_DIR := include/blit \ + include/spec/x86_64/blit \ + include/spec/arm_64/blit \ src/lib/blit \ lib/mk/blit.mk \ lib/mk/spec/arm_64/blit.mk \ diff --git a/repos/os/recipes/pkg/test-blit/README b/repos/os/recipes/pkg/test-blit/README new file mode 100644 index 00000000000..641e69c4db2 --- /dev/null +++ b/repos/os/recipes/pkg/test-blit/README @@ -0,0 +1 @@ +Scenario for testing 2D blitting operations diff --git a/repos/os/recipes/pkg/test-blit/archives b/repos/os/recipes/pkg/test-blit/archives new file mode 100644 index 00000000000..b0a06a1e854 --- /dev/null +++ b/repos/os/recipes/pkg/test-blit/archives @@ -0,0 +1 @@ +_/src/test-blit diff --git a/repos/os/recipes/pkg/test-blit/runtime b/repos/os/recipes/pkg/test-blit/runtime new file mode 100644 index 00000000000..bff0d79325f --- /dev/null +++ b/repos/os/recipes/pkg/test-blit/runtime @@ -0,0 +1,13 @@ + + + + [init] --- blit test finished --- + + + + + + + + + diff --git a/repos/os/recipes/src/test-blit/content.mk b/repos/os/recipes/src/test-blit/content.mk new file mode 100644 index 00000000000..c1f82975735 --- /dev/null +++ b/repos/os/recipes/src/test-blit/content.mk @@ -0,0 +1,2 @@ +SRC_DIR = src/test/blit +include $(GENODE_DIR)/repos/base/recipes/src/content.inc diff --git a/repos/os/recipes/src/test-blit/hash b/repos/os/recipes/src/test-blit/hash new file mode 100644 index 00000000000..69a82a36d44 --- /dev/null +++ b/repos/os/recipes/src/test-blit/hash @@ -0,0 +1 @@ +2024-12-10 67b1a1ad0dddcdc22dd6e266309f8221ff30173f diff --git a/repos/os/recipes/src/test-blit/used_apis b/repos/os/recipes/src/test-blit/used_apis new file mode 100644 index 00000000000..ec3bf565df2 --- /dev/null +++ b/repos/os/recipes/src/test-blit/used_apis @@ -0,0 +1,2 @@ +base +os diff --git a/repos/os/src/test/blit/main.cc b/repos/os/src/test/blit/main.cc new file mode 100644 index 00000000000..b4c961e86b3 --- /dev/null +++ b/repos/os/src/test/blit/main.cc @@ -0,0 +1,291 @@ +/* + * \brief Blitting test + * \author Norman Feske + * \date 2025-01-16 + */ + +/* + * Copyright (C) 2025 Genode Labs GmbH + * + * This file is part of the Genode OS framework, which is distributed + * under the terms of the GNU Affero General Public License version 3. + */ + +#include +#include +#include +#include + +using namespace Blit; + + +/******************************* + ** Low-level SIMD operations ** + *******************************/ + +template +struct Image +{ + static constexpr unsigned w = W, h = H; + + uint32_t pixels[W*H]; + + void print(Output &out) const + { + using Genode::print; + for (unsigned y = 0; y < H; y++) { + for (unsigned x = 0; x < min(25u, W); x++) { + uint32_t v = pixels[y*W+x]; + if (v) + print(out, " ", Char('A' + (v&63)), Char(char('A' + ((v>>16)&63)))); + else + print(out, " ."); + } + if (y < H-1) print(out, "\n"); + } + } + + bool operator != (Image const &other) + { + for (unsigned i = 0; i < W*H; i++) + if (other.pixels[i] != pixels[i]) + return true; + return false; + } + + static Image pattern() + { + Image image { }; + for (unsigned y = 0; y < H; y++) + for (unsigned x = 0; x < W; x++) + image.pixels[y*W + x] = (y << 16) | x; + return image; + } +}; + + +#define TEST_LANDSCAPE(SIMD, FN, DST_W, DST_H, W, H) \ +{ \ + Image dst { }, ref { }; \ + Slow:: FN(ref.pixels, ref.w/8, src.pixels, W, H); \ + log(#FN, " ref:\n", ref); \ + SIMD:: FN(dst.pixels, dst.w/8, src.pixels, W, H); \ + log(#FN, " got:\n", dst); \ + if (dst != ref) { \ + error("", #FN, " failed"); \ + throw 1; \ + } \ +} + + +#define TEST_PORTRAIT(SIMD, FN, DST_W, DST_H, W, H) \ +{ \ + Image dst { }, ref { }; \ + Slow:: FN(ref.pixels, ref.w/8, src.pixels, src.w/8, W, H); \ + log(#FN, " ref:\n", ref); \ + SIMD:: FN(dst.pixels, dst.w/8, src.pixels, src.w/8, W, H); \ + log(#FN, " got:\n", dst); \ + if (dst != ref) { \ + error("", #FN, " failed"); \ + throw 1; \ + } \ +} + + +template +static void test_simd_b2f() +{ + static Image<48,32> const src = Image<48,32>::pattern(); + + log("source image:\n", src); + + TEST_LANDSCAPE ( SIMD, B2f ::r0, 48, 32, 2, 4 ); + TEST_LANDSCAPE ( SIMD, B2f_flip ::r0, 48, 32, 2, 4 ); + TEST_PORTRAIT ( SIMD, B2f ::r90, 32, 48, 4, 2 ); + TEST_PORTRAIT ( SIMD, B2f_flip ::r90, 32, 48, 4, 2 ); + TEST_LANDSCAPE ( SIMD, B2f ::r180, 48, 32, 2, 4 ); + TEST_LANDSCAPE ( SIMD, B2f_flip::r180, 48, 32, 2, 4 ); + TEST_PORTRAIT ( SIMD, B2f ::r270, 32, 48, 4, 2 ); + TEST_PORTRAIT ( SIMD, B2f_flip::r270, 32, 48, 4, 2 ); +} + + +/**************************************** + ** Back-to-front argument dispatching ** + ****************************************/ + +struct Recorded +{ + struct Args + { + uint32_t *dst; + unsigned dst_w; + uint32_t const *src; + unsigned src_w; + unsigned w, h; + + bool operator != (Args const &other) const + { + return dst != other.dst + || dst_w != other.dst_w + || src != other.src + || src_w != other.src_w + || w != other.w + || h != other.h; + } + + void print(Output &out) const + { + bool const valid = (*this != Args { }); + if (!valid) { + Genode::print(out, "invalid"); + return; + } + + /* print src and dst pointer values in units of uint32_t words */ + Genode::print(out, "dst=", Hex(addr_t(dst)/4), " dst_w=", dst_w, + " src=", Hex(addr_t(src)/4), " src_w=", src_w, " w=", w, " h=", h); + } + }; + + static Args recorded; + + static void _record(uint32_t *dst, unsigned line_w, + uint32_t const *src, unsigned w, unsigned h) + { + recorded = { dst, line_w, src, line_w, w, h }; + } + + static void _record(uint32_t *dst, unsigned dst_w, + uint32_t const *src, unsigned src_w, unsigned w, unsigned h) + { + recorded = { dst, dst_w, src, src_w, w, h }; + } + + struct B2f + { + static inline void r0 (auto &&... args) { _record(args...); } + static inline void r90 (auto &&... args) { _record(args...); } + static inline void r180 (auto &&... args) { _record(args...); } + static inline void r270 (auto &&... args) { _record(args...); } + }; + + struct B2f_flip + { + static inline void r0 (auto &&... args) { _record(args...); } + static inline void r90 (auto &&... args) { _record(args...); } + static inline void r180(auto &&... args) { _record(args...); } + static inline void r270(auto &&... args) { _record(args...); } + }; +}; + + +Recorded::Args Recorded::recorded { }; + +namespace Blit { + + static inline const char *name(Rotate r) + { + switch (r) { + case Rotate::R0: return "R0"; + case Rotate::R90: return "R90"; + case Rotate::R180: return "R180"; + case Rotate::R270: return "R270"; + } + return "invalid"; + } +} + + +static void test_b2f_dispatch() +{ + Texture texture_landscape { nullptr, nullptr, { 640, 480 } }; + Texture texture_portrait { nullptr, nullptr, { 480, 640 } }; + Surface surface { nullptr, { 640, 480 } }; + + struct Expected : Recorded::Args { }; + + auto expected = [&] (addr_t dst, unsigned dst_w, addr_t src, unsigned src_w, + unsigned w, unsigned h) + { + return Expected { (uint32_t *)(4*dst), dst_w, + (uint32_t *)(4*src), src_w, w, h }; + }; + + using Rect = Blit::Rect; + + auto test = [&] (Texture const &texture, + Rect rect, Rotate rotate, Flip flip, + Expected const &expected) + { + Recorded::recorded = { }; + _b2f(surface, texture, rect, rotate, flip); + log("b2f: ", rect, " ", name(rotate), flip.enabled ? " flip" : "", + " -> ", Recorded::recorded); + if (Recorded::recorded != expected) { + error("test_b2f_dispatch failed, expected: ", expected); + throw 1; + } + }; + + log("offset calculation of destination window"); + { + unsigned const x = 32, y = 16, w = 64, h = 48; + + addr_t const src_landscape_ptr = y*640 + x, + src_portrait_ptr = y*480 + x; + + Rect const rect { { x, y }, { w, h } }; + + test(texture_landscape, rect, Rotate::R0, Flip { }, + expected(y*640 + x, 80, src_landscape_ptr, 80, 8, 6)); + test(texture_landscape, rect, Rotate::R0, Flip { true }, + expected(y*640 + 640 - w - x, 80, src_landscape_ptr, 80, 8, 6)); + test(texture_portrait, rect, Rotate::R90, Flip { }, + expected(x*640 + 640 - h - y, 80, src_portrait_ptr, 60, 8, 6)); + test(texture_portrait, rect, Rotate::R90, Flip { true }, + expected(x*640 + y, 80, src_portrait_ptr, 60, 8, 6)); + test(texture_landscape, rect, Rotate::R180, Flip { }, + expected((480 - y - h)*640 + 640 - x - w, 80, src_landscape_ptr, 80, 8, 6)); + test(texture_landscape, rect, Rotate::R180, Flip { true }, + expected((480 - y - h)*640 + x, 80, src_landscape_ptr, 80, 8, 6)); + test(texture_portrait, rect, Rotate::R270, Flip { }, + expected((480 - x - w)*640 + y, 80, src_portrait_ptr, 60, 8, 6)); + test(texture_portrait, rect, Rotate::R270, Flip { true }, + expected((480 - x - w)*640 + 640 - y - h, 80, src_portrait_ptr, 60, 8, 6)); + } + + log("check for compatibility of surface and texture"); + test(texture_portrait, { { }, { 16, 16 } }, Rotate::R0, Flip { }, + expected(0, 0, 0, 0, 0, 0)); + + log("clamp rect to texture size"); + test(texture_landscape, { { -99, -99 }, { 999, 999 } }, Rotate::R0, Flip { }, + expected(0, 80, 0, 80, 80, 60)); + + log("ignore out-of-bounds rect"); + test(texture_landscape, { { 1000, 0 }, { 16, 16 } }, Rotate::R0, Flip { }, + expected(0, 0, 0, 0, 0, 0)); + + /* snap to grid */ + log("snap rect argument to 8x8 grid"); + test(texture_landscape, { { 31, 63 }, { 2, 2 } }, Rotate::R0, Flip { }, + expected(56*640 + 24, 80, 56*640 + 24, 80, 2, 2)); +} + + +void Component::construct(Genode::Env &) +{ +#ifdef _INCLUDE__BLIT__INTERNAL__NEON_H_ + log("-- ARM Neon --"); + test_simd_b2f(); +#endif +#ifdef _INCLUDE__BLIT__INTERNAL__SSE4_H_ + log("-- SSE4 --"); + test_simd_b2f(); +#endif + + test_b2f_dispatch(); + + log("--- blit test finished ---"); +} diff --git a/repos/os/src/test/blit/target.mk b/repos/os/src/test/blit/target.mk new file mode 100644 index 00000000000..5f1a2b62282 --- /dev/null +++ b/repos/os/src/test/blit/target.mk @@ -0,0 +1,3 @@ +TARGET = test-blit +SRC_CC = main.cc +LIBS = base