From 3bf26a6d7bced04f3b6b056e310f5d21ef2c242a Mon Sep 17 00:00:00 2001 From: Norman Feske Date: Thu, 16 Jan 2025 19:35:01 +0100 Subject: [PATCH] blit: SIMD-based back2front copy Issue #5428 --- repos/os/include/blit/blit.h | 35 +-- repos/os/include/blit/internal/neon.h | 229 +++++++++++++++++ repos/os/include/blit/internal/slow.h | 192 ++++++++++++++ repos/os/include/blit/internal/sse3.h | 241 ++++++++++++++++++ repos/os/include/blit/types.h | 157 ++++++++++++ repos/os/include/spec/arm_64/blit/blit.h | 25 ++ repos/os/include/spec/x86_64/blit/blit.h | 25 ++ repos/os/recipes/pkg/test-blit/README | 1 + repos/os/recipes/pkg/test-blit/archives | 1 + repos/os/recipes/pkg/test-blit/runtime | 13 + repos/os/recipes/src/test-blit/content.mk | 2 + repos/os/recipes/src/test-blit/hash | 1 + repos/os/recipes/src/test-blit/used_apis | 2 + repos/os/src/test/blit/main.cc | 289 ++++++++++++++++++++++ repos/os/src/test/blit/target.mk | 3 + 15 files changed, 1194 insertions(+), 22 deletions(-) create mode 100644 repos/os/include/blit/internal/neon.h create mode 100644 repos/os/include/blit/internal/slow.h create mode 100644 repos/os/include/blit/internal/sse3.h create mode 100644 repos/os/include/blit/types.h create mode 100644 repos/os/include/spec/arm_64/blit/blit.h create mode 100644 repos/os/include/spec/x86_64/blit/blit.h create mode 100644 repos/os/recipes/pkg/test-blit/README create mode 100644 repos/os/recipes/pkg/test-blit/archives create mode 100644 repos/os/recipes/pkg/test-blit/runtime create mode 100644 repos/os/recipes/src/test-blit/content.mk create mode 100644 repos/os/recipes/src/test-blit/hash create mode 100644 repos/os/recipes/src/test-blit/used_apis create mode 100644 repos/os/src/test/blit/main.cc create mode 100644 repos/os/src/test/blit/target.mk diff --git a/repos/os/include/blit/blit.h b/repos/os/include/blit/blit.h index c8a55331c41..f6dfc1a46e4 100644 --- a/repos/os/include/blit/blit.h +++ b/repos/os/include/blit/blit.h @@ -1,34 +1,25 @@ /* - * \brief 
Interface of 2D-copy library + * \brief Blit API * \author Norman Feske - * \date 2007-10-10 + * \date 2025-01-16 */ /* - * Copyright (C) 2007-2017 Genode Labs GmbH + * Copyright (C) 2025 Genode Labs GmbH * * This file is part of the Genode OS framework, which is distributed * under the terms of the GNU Affero General Public License version 3. */ -#ifndef _INCLUDE__BLIT__BLIT_H_ -#define _INCLUDE__BLIT__BLIT_H_ +#ifndef _INCLUDE__BLIT_H_ +#define _INCLUDE__BLIT_H_ -/** - * Blit memory from source buffer to destination buffer - * - * \param src address of source buffer - * \param src_w line length of source buffer in bytes - * \param dst address of destination buffer - * \param dst_w line length of destination buffer in bytes - * \param w number of bytes per line to copy - * \param h number of lines to copy - * - * This function works at a granularity of 16bit. - * If the source and destination overlap, the result - * of the copy operation is not defined. - */ -extern "C" void blit(void const *src, unsigned src_w, - void *dst, unsigned dst_w, int w, int h); +#include +#include + +namespace Blit { + + static inline void back2front(auto &&... args) { _b2f(args...); } +} -#endif /* _INCLUDE__BLIT__BLIT_H_ */ +#endif /* _INCLUDE__BLIT_H_ */ diff --git a/repos/os/include/blit/internal/neon.h b/repos/os/include/blit/internal/neon.h new file mode 100644 index 00000000000..1284c5a533a --- /dev/null +++ b/repos/os/include/blit/internal/neon.h @@ -0,0 +1,229 @@ +/* + * \brief 2D memory copy using ARM NEON + * \author Norman Feske + * \date 2025-01-16 + */ + +/* + * Copyright (C) 2025 Genode Labs GmbH + * + * This file is part of the Genode OS framework, which is distributed + * under the terms of the GNU Affero General Public License version 3. 
+ */ + +#ifndef _INCLUDE__BLIT__INTERNAL__NEON_H_ +#define _INCLUDE__BLIT__INTERNAL__NEON_H_ + +#include + +/* compiler intrinsics */ +#pragma GCC diagnostic push +#pragma GCC diagnostic ignored "-Wnarrowing" +#pragma GCC diagnostic ignored "-Wconversion" +#pragma GCC diagnostic ignored "-Wfloat-conversion" +#pragma GCC diagnostic ignored "-Wunused-parameter" +#include +#pragma GCC diagnostic pop + +namespace Blit { + + struct Neon; + + template struct Ptr4; + + static inline uint32x4_t _reversed(uint32x4_t const v) + { + return vrev64q_u32(vcombine_u32(vget_high_u32(v), vget_low_u32(v))); + } + + static inline void _reverse_line(uint32x4x4_t const *src, uint32x4x4_t *dst, unsigned len) + { + src += len; /* move 'src' from end of line towards begin */ + union Batch { uint32x4x4_t all; uint32x4_t v[4]; }; + Batch b; + uint32x4_t *d = (uint32x4_t *)dst; + while (len--) { + b.all = *--src; + *d++ = _reversed(b.v[3]); + *d++ = _reversed(b.v[2]); + *d++ = _reversed(b.v[1]); + *d++ = _reversed(b.v[0]); + } + }; + + static inline void _copy_line(uint32x4x4_t const *s, uint32x4x4_t *d, unsigned len) + { + while (len--) + *d++ = *s++; + }; + + static inline void _rotate_4_lines(auto src_ptr, auto dst_ptr, + unsigned len, auto const src_step) + { + union Tile { uint32x4x4_t all; uint32x4_t row[4]; }; + Tile t; + while (len--) { + t.all = vld4q_lane_u32(src_ptr.p0, t.all, 3); + t.all = vld4q_lane_u32(src_ptr.p1, t.all, 2); + t.all = vld4q_lane_u32(src_ptr.p2, t.all, 1); + t.all = vld4q_lane_u32(src_ptr.p3, t.all, 0); + + dst_ptr.append(t.row[0], t.row[1], t.row[2], t.row[3]); + src_ptr.incr(src_step); + }; + }; +} + + +template +struct Blit::Ptr4 +{ + PTR *p0, *p1, *p2, *p3; + + Ptr4(PTR *p, int w) : p0(p), p1(p + w), p2(p + 2*w), p3(p + 3*w) { } + + void incr(int v) { p0 += v, p1 += v, p2 += v, p3 += v; } + + void append(auto v0, auto v1, auto v2, auto v3) + { + *p0++ = v0; *p1++ = v1; *p2++ = v2; *p3++ = v3; + } +}; + + +struct Blit::Neon +{ + struct B2f + { + static inline 
void r0 (uint32_t *, unsigned, uint32_t const *, unsigned, unsigned); + static inline void r90 (uint32_t *, unsigned, uint32_t const *, unsigned, unsigned, unsigned); + static inline void r180 (uint32_t *, unsigned, uint32_t const *, unsigned, unsigned); + static inline void r270 (uint32_t *, unsigned, uint32_t const *, unsigned, unsigned, unsigned); + }; + + struct B2f_flip + { + static inline void r0 (uint32_t *, unsigned, uint32_t const *, unsigned, unsigned); + static inline void r90 (uint32_t *, unsigned, uint32_t const *, unsigned, unsigned, unsigned); + static inline void r180(uint32_t *, unsigned, uint32_t const *, unsigned, unsigned); + static inline void r270(uint32_t *, unsigned, uint32_t const *, unsigned, unsigned, unsigned); + }; +}; + + +void Blit::Neon::B2f::r0(uint32_t *dst, unsigned const line_w, + uint32_t const *src, unsigned const w, unsigned const h) +{ + uint32x4x4_t const *s = (uint32x4x4_t const *)src; + uint32x4x4_t *d = (uint32x4x4_t *)dst; + + for (unsigned lines = h*16; lines; lines--) { + _copy_line(s, d, w); + s += line_w; + d += line_w; + } +} + + +void Blit::Neon::B2f::r90(uint32_t *dst, unsigned const dst_w, + uint32_t const *src, unsigned const src_w, + unsigned const w, unsigned const h) +{ + Ptr4 src_ptr4 (src + 16*src_w*(16*h - 4), 16*src_w); + Ptr4 dst_ptr4 ((uint32x4_t *)dst, 4*dst_w); + + for (unsigned i = 4*w; i; i--) { + _rotate_4_lines(src_ptr4, dst_ptr4, 4*h, -4*16*src_w); + src_ptr4.incr(4); + dst_ptr4.incr(4*4*dst_w); + } +} + + +void Blit::Neon::B2f::r180(uint32_t *dst, unsigned const line_w, + uint32_t const *src, unsigned const w, unsigned const h) +{ + uint32x4x4_t *d = (uint32x4x4_t *)dst; + uint32x4x4_t const *s = (uint32x4x4_t const *)src + 16*line_w*h; + + for (unsigned i = h*16; i; i--) { + s -= line_w; + _reverse_line(s, d, w); + d += line_w; + } +} + + +void Blit::Neon::B2f::r270(uint32_t *dst, unsigned const dst_w, + uint32_t const *src, unsigned const src_w, + unsigned const w, const unsigned h) +{ + Ptr4 
src_ptr4 (src + 3*16*src_w + 16*w - 4, -16*src_w); + Ptr4 dst_ptr4 ((uint32x4_t *)dst + 3*4*dst_w, -4*dst_w); + + for (unsigned i = 4*w; i; i--) { + _rotate_4_lines(src_ptr4, dst_ptr4, 4*h, 4*16*src_w); + src_ptr4.incr(-4); + dst_ptr4.incr(4*4*dst_w); + } +} + + +void Blit::Neon::B2f_flip::r0(uint32_t *dst, unsigned const line_w, + uint32_t const *src, unsigned const w, unsigned const h) +{ + uint32x4x4_t const *s = (uint32x4x4_t const *)src; + uint32x4x4_t *d = (uint32x4x4_t *)dst; + + for (unsigned lines = h*16; lines; lines--) { + _reverse_line(s, d, w); + s += line_w; + d += line_w; + } +} + + +void Blit::Neon::B2f_flip::r90(uint32_t *dst, unsigned const dst_w, + uint32_t const *src, unsigned const src_w, + unsigned const w, unsigned const h) +{ + Ptr4 src_ptr4 (src + 3*16*src_w, -16*src_w); + Ptr4 dst_ptr4 ((uint32x4_t *)dst, 4*dst_w); + + for (unsigned i = 4*w; i; i--) { + _rotate_4_lines(src_ptr4, dst_ptr4, 4*h, 4*16*src_w); + src_ptr4.incr(4); + dst_ptr4.incr(4*4*dst_w); + } +} + + +void Blit::Neon::B2f_flip::r180(uint32_t *dst, unsigned const line_w, + uint32_t const *src, unsigned const w, unsigned const h) +{ + uint32x4x4_t const *s = (uint32x4x4_t const *)src + 16*line_w*h; + uint32x4x4_t *d = (uint32x4x4_t *)dst; + + for (unsigned lines = h*16; lines; lines--) { + s -= line_w; + _copy_line(s, d, w); + d += line_w; + } +} + + +void Blit::Neon::B2f_flip::r270(uint32_t *dst, unsigned const dst_w, + uint32_t const *src, unsigned const src_w, + unsigned const w, const unsigned h) +{ + Ptr4 src_ptr4 (src + (16*h - 4)*16*src_w + 16*w, 16*src_w); + Ptr4 dst_ptr4 ((uint32x4_t *)dst + 3*4*dst_w, -4*dst_w); + + for (unsigned i = 4*w; i; i--) { + src_ptr4.incr(-4); + _rotate_4_lines(src_ptr4, dst_ptr4, 4*h, -4*16*src_w); + dst_ptr4.incr(4*4*dst_w); + } +} + +#endif /* _INCLUDE__BLIT__INTERNAL__NEON_H_ */ diff --git a/repos/os/include/blit/internal/slow.h b/repos/os/include/blit/internal/slow.h new file mode 100644 index 00000000000..27c34c7cd7a --- /dev/null +++ 
b/repos/os/include/blit/internal/slow.h @@ -0,0 +1,192 @@ +/* + * \brief Fallback 2D memory copy + * \author Norman Feske + * \date 2025-01-16 + */ + +/* + * Copyright (C) 2025 Genode Labs GmbH + * + * This file is part of the Genode OS framework, which is distributed + * under the terms of the GNU Affero General Public License version 3. + */ + +#ifndef _INCLUDE__BLIT__INTERNAL__SLOW_H_ +#define _INCLUDE__BLIT__INTERNAL__SLOW_H_ + +#include + +namespace Blit { + + /* + * The back-to-front copy variants work as follows: + * + * normal flipped + * + * rotated 0 0 1 2 3 3 2 1 0 + * 4 5 6 7 7 6 5 4 + * 8 9 10 11 11 10 9 8 + * 12 13 14 15 15 14 13 12 + * + * rotated 90 12 8 4 0 0 4 8 12 + * 13 9 5 1 1 5 9 13 + * 14 10 6 2 2 6 10 14 + * 15 11 7 3 3 7 11 15 + * + * rotated 180 15 14 13 12 12 13 14 15 + * 11 10 9 8 8 9 10 11 + * 7 6 5 4 4 5 6 7 + * 3 2 1 0 0 1 2 3 + * + * rotated 270 3 7 11 15 15 11 7 3 + * 2 6 10 14 14 10 6 2 + * 1 5 9 13 13 9 5 1 + * 0 4 8 12 12 8 4 0 + * + * - coordinates are given in units of 16 pixels + * - one pixel is 32 bit + * - w >= 1 + * - h >= 1 + * - w <= line_w,dst_w,src_w + */ + + struct Slow; + + static inline void _sample_line(uint32_t const *src, uint32_t *dst, + unsigned len, int const step) + { + for (; len--; src += step) + *dst++ = *src; + } + + static inline void _copy_line(uint32_t const *src, uint32_t *dst, unsigned len) + { + _sample_line(src, dst, len, 1); + } +} + + +struct Blit::Slow +{ + struct B2f + { + static inline void r0 (uint32_t *, unsigned, uint32_t const *, unsigned, unsigned); + static inline void r90 (uint32_t *, unsigned, uint32_t const *, unsigned, unsigned, unsigned); + static inline void r180 (uint32_t *, unsigned, uint32_t const *, unsigned, unsigned); + static inline void r270 (uint32_t *, unsigned, uint32_t const *, unsigned, unsigned, unsigned); + }; + + struct B2f_flip + { + static inline void r0 (uint32_t *, unsigned, uint32_t const *, unsigned, unsigned); + static inline void r90 (uint32_t *, unsigned, 
uint32_t const *, unsigned, unsigned, unsigned); + static inline void r180(uint32_t *, unsigned, uint32_t const *, unsigned, unsigned); + static inline void r270(uint32_t *, unsigned, uint32_t const *, unsigned, unsigned, unsigned); + }; +}; + + +void Blit::Slow::B2f::r0(uint32_t *dst, unsigned const line_w, + uint32_t const *src, unsigned const w, unsigned const h) +{ + for (unsigned lines = h*16; lines; lines--) { + _copy_line(src, dst, 16*w); + src += 16*line_w; + dst += 16*line_w; + } +} + + +void Blit::Slow::B2f::r90(uint32_t *dst, unsigned const dst_w, + uint32_t const *src, unsigned const src_w, + unsigned const w, unsigned const h) +{ + src += (16*h - 1)*16*src_w; + + for (unsigned i = 16*w; i; i--) { + _sample_line(src, dst, 16*h, -16*src_w); + src++; + dst += 16*dst_w; + } +} + + +void Blit::Slow::B2f::r180(uint32_t *dst, unsigned const line_w, + uint32_t const *src, unsigned const w, unsigned const h) +{ + src += 16*h*16*line_w + 16*w - 1; + + for (unsigned i = h*16; i; i--) { + src -= 16*line_w; + _sample_line(src, dst, 16*w, -1); + dst += 16*line_w; + } +} + + +void Blit::Slow::B2f::r270(uint32_t *dst, unsigned const dst_w, + uint32_t const *src, unsigned const src_w, + unsigned const w, const unsigned h) +{ + src += 16*w; + + for (unsigned i = 16*w; i; i--) { + src--; + _sample_line(src, dst, 16*h, 16*src_w); + dst += 16*dst_w; + } +} + + +void Blit::Slow::B2f_flip::r0(uint32_t *dst, unsigned const line_w, + uint32_t const *src, unsigned const w, unsigned const h) +{ + src += 16*w - 1; + + for (unsigned lines = h*16; lines; lines--) { + _sample_line(src, dst, 16*w, -1); + src += 16*line_w; + dst += 16*line_w; + } +} + + +void Blit::Slow::B2f_flip::r90(uint32_t *dst, unsigned const dst_w, + uint32_t const *src, unsigned const src_w, + unsigned const w, unsigned const h) +{ + for (unsigned i = 16*w; i; i--) { + _sample_line(src, dst, 16*h, 16*src_w); + src++; + dst += 16*dst_w; + } +} + + +void Blit::Slow::B2f_flip::r180(uint32_t *dst, unsigned const 
line_w, + uint32_t const *src, unsigned const w, unsigned const h) +{ + src += 16*h*16*line_w; + + for (unsigned lines = h*16; lines; lines--) { + src -= 16*line_w; + _copy_line(src, dst, 16*w); + dst += 16*line_w; + } +} + + +void Blit::Slow::B2f_flip::r270(uint32_t *dst, unsigned const dst_w, + uint32_t const *src, unsigned const src_w, + unsigned const w, const unsigned h) +{ + src += (16*h - 1)*16*src_w + 16*w; + + for (unsigned i = 16*w; i; i--) { + src--; + _sample_line(src, dst, 16*h, -16*src_w); + dst += 16*dst_w; + } +} + +#endif /* _INCLUDE__BLIT__INTERNAL__SLOW_H_ */ diff --git a/repos/os/include/blit/internal/sse3.h b/repos/os/include/blit/internal/sse3.h new file mode 100644 index 00000000000..eeeb9988335 --- /dev/null +++ b/repos/os/include/blit/internal/sse3.h @@ -0,0 +1,241 @@ +/* + * \brief 2D memory copy using SSE3 + * \author Norman Feske + * \date 2025-01-21 + */ + +/* + * Copyright (C) 2025 Genode Labs GmbH + * + * This file is part of the Genode OS framework, which is distributed + * under the terms of the GNU Affero General Public License version 3. 
+ */ + +#ifndef _INCLUDE__BLIT__INTERNAL__SSE3_H_ +#define _INCLUDE__BLIT__INTERNAL__SSE3_H_ + +#include + +/* compiler intrinsics */ +#ifndef _MM_MALLOC_H_INCLUDED /* discharge dependency from stdlib.h */ +#define _MM_MALLOC_H_INCLUDED +#define _MM_MALLOC_H_INCLUDED_PREVENTED +#endif +#pragma GCC diagnostic push +#pragma GCC diagnostic ignored "-Wconversion" +#include +#pragma GCC diagnostic pop +#ifdef _MM_MALLOC_H_INCLUDED_PREVENTED +#undef _MM_MALLOC_H_INCLUDED +#undef _MM_MALLOC_H_INCLUDED_PREVENTED +#endif + + +namespace Blit { struct Sse3; }; + + +struct Blit::Sse3 +{ + struct Ptr4 + { + __m128i *p0, *p1, *p2, *p3; + + Ptr4(__m128i *p, int w) : p0(p), p1(p + w), p2(p + 2*w), p3(p + 3*w) { } + + void incr(int v) { p0 += v, p1 += v, p2 += v, p3 += v; } + }; + + struct Ptr4_const + { + __m128i const *p0, *p1, *p2, *p3; + + Ptr4_const(__m128i const *p, int w) : p0(p), p1(p + w), p2(p + 2*w), p3(p + 3*w) { } + + void incr(int v) { p0 += v, p1 += v, p2 += v, p3 += v; } + }; + + static inline void _reverse_line(__m128i const *s, __m128i *d, unsigned len) + { + static constexpr int reversed = (0 << 6) | (1 << 4) | (2 << 2) | 3; + + d += 4*len; /* move 'dst' from end towards begin */ + + while (len--) { + __m128i const v0 = _mm_load_si128(s++); + __m128i const v1 = _mm_load_si128(s++); + __m128i const v2 = _mm_load_si128(s++); + __m128i const v3 = _mm_load_si128(s++); + _mm_stream_si128(--d, _mm_shuffle_epi32(v0, reversed)); + _mm_stream_si128(--d, _mm_shuffle_epi32(v1, reversed)); + _mm_stream_si128(--d, _mm_shuffle_epi32(v2, reversed)); + _mm_stream_si128(--d, _mm_shuffle_epi32(v3, reversed)); + } + }; + + static inline void _copy_line(__m128i const *s, __m128i *d, unsigned len) + { + while (len--) { + __m128i const v0 = _mm_load_si128(s++); + __m128i const v1 = _mm_load_si128(s++); + __m128i const v2 = _mm_load_si128(s++); + __m128i const v3 = _mm_load_si128(s++); + _mm_stream_si128(d++, v0); /* bypass cache */ + _mm_stream_si128(d++, v1); + _mm_stream_si128(d++, 
v2); + _mm_stream_si128(d++, v3); + } + }; + + static inline void _rotate_4_lines(auto src_ptr, auto dst_ptr, + unsigned len, auto const src_step) + { + union Tile { __m128i pi[4]; __m128 ps[4]; } t; + + while (len--) { + t.pi[0] = _mm_load_si128(src_ptr.p3); + t.pi[1] = _mm_load_si128(src_ptr.p2); + t.pi[2] = _mm_load_si128(src_ptr.p1); + t.pi[3] = _mm_load_si128(src_ptr.p0); + _MM_TRANSPOSE4_PS(t.ps[0], t.ps[1], t.ps[2], t.ps[3]); + _mm_stream_si128(dst_ptr.p0++, t.pi[0]); + _mm_stream_si128(dst_ptr.p1++, t.pi[1]); + _mm_stream_si128(dst_ptr.p2++, t.pi[2]); + _mm_stream_si128(dst_ptr.p3++, t.pi[3]); + src_ptr.incr(src_step); + }; + }; + + struct B2f + { + static inline void r0 (uint32_t *, unsigned, uint32_t const *, unsigned, unsigned); + static inline void r90 (uint32_t *, unsigned, uint32_t const *, unsigned, unsigned, unsigned); + static inline void r180 (uint32_t *, unsigned, uint32_t const *, unsigned, unsigned); + static inline void r270 (uint32_t *, unsigned, uint32_t const *, unsigned, unsigned, unsigned); + }; + + struct B2f_flip + { + static inline void r0 (uint32_t *, unsigned, uint32_t const *, unsigned, unsigned); + static inline void r90 (uint32_t *, unsigned, uint32_t const *, unsigned, unsigned, unsigned); + static inline void r180(uint32_t *, unsigned, uint32_t const *, unsigned, unsigned); + static inline void r270(uint32_t *, unsigned, uint32_t const *, unsigned, unsigned, unsigned); + }; +}; + + +void Blit::Sse3::B2f::r0(uint32_t *dst, unsigned const line_w, + uint32_t const *src, unsigned const w, unsigned const h) +{ + __m128i const *s = (__m128i const *)src; + __m128i *d = (__m128i *)dst; + + for (unsigned lines = h*16; lines; lines--) { + _copy_line(s, d, w); + s += 4*line_w; + d += 4*line_w; + } +} + + +void Blit::Sse3::B2f::r90(uint32_t *dst, unsigned const dst_w, + uint32_t const *src, unsigned const src_w, + unsigned const w, unsigned const h) +{ + Ptr4_const src_ptr4 ((__m128i const *)(src + 16*src_w*(16*h - 4)), 4*src_w); + Ptr4 
dst_ptr4 ((__m128i *)dst, 4*dst_w); + + for (unsigned i = 4*w; i; i--) { + _rotate_4_lines(src_ptr4, dst_ptr4, 4*h, -4*4*src_w); + src_ptr4.incr(1); + dst_ptr4.incr(4*4*dst_w); + } +} + + +void Blit::Sse3::B2f::r180(uint32_t *dst, unsigned const line_w, + uint32_t const *src, unsigned const w, unsigned const h) +{ + __m128i *d = (__m128i *)dst; + __m128i const *s = (__m128i const *)src + 4*16*line_w*h; + + for (unsigned i = h*16; i; i--) { + s -= 4*line_w; + _reverse_line(s, d, w); + d += 4*line_w; + } +} + + +void Blit::Sse3::B2f::r270(uint32_t *dst, unsigned const dst_w, + uint32_t const *src, unsigned const src_w, + unsigned const w, const unsigned h) +{ + Ptr4_const src_ptr4 ((__m128i const *)(src + 3*16*src_w + 16*w - 4), -4*src_w); + Ptr4 dst_ptr4 ((__m128i *)dst + 3*4*dst_w, -4*dst_w); + + for (unsigned i = 4*w; i; i--) { + _rotate_4_lines(src_ptr4, dst_ptr4, 4*h, 4*4*src_w); + src_ptr4.incr(-1); + dst_ptr4.incr(4*4*dst_w); + } +} + + +void Blit::Sse3::B2f_flip::r0(uint32_t *dst, unsigned const line_w, + uint32_t const *src, unsigned const w, unsigned const h) +{ + __m128i const *s = (__m128i const *)src; + __m128i *d = (__m128i *)dst; + + for (unsigned lines = h*16; lines; lines--) { + _reverse_line(s, d, w); + s += 4*line_w; + d += 4*line_w; + } +} + + +void Blit::Sse3::B2f_flip::r90(uint32_t *dst, unsigned const dst_w, + uint32_t const *src, unsigned const src_w, + unsigned const w, unsigned const h) +{ + Ptr4_const src_ptr4 ((__m128i const *)(src + 3*16*src_w), -4*src_w); + Ptr4 dst_ptr4 ((__m128i *)dst, 4*dst_w); + + for (unsigned i = 4*w; i; i--) { + _rotate_4_lines(src_ptr4, dst_ptr4, 4*h, 4*4*src_w); + src_ptr4.incr(1); + dst_ptr4.incr(4*4*dst_w); + } +} + + +void Blit::Sse3::B2f_flip::r180(uint32_t *dst, unsigned const line_w, + uint32_t const *src, unsigned const w, unsigned const h) +{ + __m128i const *s = (__m128i const *)src + 4*16*line_w*h; + __m128i *d = (__m128i *)dst; + + for (unsigned lines = h*16; lines; lines--) { + s -= 4*line_w; + 
_copy_line(s, d, w); + d += 4*line_w; + } +} + + +void Blit::Sse3::B2f_flip::r270(uint32_t *dst, unsigned const dst_w, + uint32_t const *src, unsigned const src_w, + unsigned const w, const unsigned h) +{ + Ptr4_const src_ptr4 ((__m128i const *)(src + (16*h - 4)*16*src_w + 16*w), 4*src_w); + Ptr4 dst_ptr4 ((__m128i *)dst + 3*4*dst_w, -4*dst_w); + + for (unsigned i = 4*w; i; i--) { + src_ptr4.incr(-1); + _rotate_4_lines(src_ptr4, dst_ptr4, 4*h, -4*4*src_w); + dst_ptr4.incr(4*4*dst_w); + } +} + +#endif /* _INCLUDE__BLIT__INTERNAL__SSE3_H_ */ diff --git a/repos/os/include/blit/types.h b/repos/os/include/blit/types.h new file mode 100644 index 00000000000..bf14aabdb28 --- /dev/null +++ b/repos/os/include/blit/types.h @@ -0,0 +1,157 @@ +/* + * \brief Types and utilities used for 2D memory copy + * \author Norman Feske + * \date 2025-01-16 + */ + +/* + * Copyright (C) 2025 Genode Labs GmbH + * + * This file is part of the Genode OS framework, which is distributed + * under the terms of the GNU Affero General Public License version 3. + */ + +#ifndef _INCLUDE__BLIT__TYPES_H_ +#define _INCLUDE__BLIT__TYPES_H_ + +/* Genode includes */ +#include +#include +#include + +namespace Blit { + + using namespace Genode; + + using Rect = Surface_base::Rect; + using Area = Surface_base::Area; + using Point = Surface_base::Point; + + enum class Rotate { R0, R90, R180, R270 }; + + struct Flip { bool enabled; }; + + static bool swap_w_h(Rotate r) { return r == Rotate::R90 || r == Rotate::R270; } + + static Area transformed(Area a, Rotate rotate) + { + return swap_w_h(rotate) ? 
Area { a.h, a.w } : a; + } + + static Point transformed(Point p, Area area, Rotate rotate, Flip flip) + { + int const w = area.w, h = area.h; + + switch (rotate) { + case Rotate::R0: break; + case Rotate::R90: p = { .x = h - p.y - 1, .y = p.x }; break; + case Rotate::R180: p = { .x = w - p.x - 1, .y = h - p.y - 1 }; break; + case Rotate::R270: p = { .x = p.y, .y = w - p.x - 1 }; break; + } + + if (flip.enabled) + p = { int(transformed(area, rotate).w) - p.x - 1, p.y }; + + return p; + } + + static Rect transformed(Rect r, Area area, Rotate rotate, Flip flip) + { + auto rect_from_points = [&] (Point p1, Point p2) + { + return Rect::compound(Point { min(p1.x, p2.x), min(p1.y, p2.y) }, + Point { max(p1.x, p2.x), max(p1.y, p2.y) }); + }; + return rect_from_points(transformed(r.p1(), area, rotate, flip), + transformed(r.p2(), area, rotate, flip)); + } + + static Rect snapped_to_16x16_grid(Rect r) + { + return Rect::compound(Point { .x = r.x1() & ~0xf, + .y = r.y1() & ~0xf }, + Point { .x = ((r.x2() + 16) & ~0xf) - 1, + .y = ((r.y2() + 16) & ~0xf) - 1 }); + } + + template + static inline void _b2f(uint32_t *dst, unsigned dst_w, + uint32_t const *src, unsigned src_w, + unsigned w, unsigned h, Rotate rotate) + { + switch (rotate) { + case Rotate::R0: B2F::r0 (dst, dst_w, src, w, h); break; + case Rotate::R90: B2F::r90 (dst, dst_w, src, src_w, w, h); break; + case Rotate::R180: B2F::r180(dst, dst_w, src, w, h); break; + case Rotate::R270: B2F::r270(dst, dst_w, src, src_w, w, h); break; + } + } + + template + static inline void _b2f(Surface &surface, + Texture const &texture, + Rect rect, Rotate rotate, Flip flip) + { + /* surface size must be divisible by 16 */ + if (!aligned(surface.size().w, 4) || !aligned(surface.size().h, 4)) + return; + + /* check compatibility of surface with texture */ + if (transformed(surface.size(), rotate) != texture.size()) + return; + + /* restrict rect to texture size */ + rect = Rect::intersect(rect, Rect { { }, texture.size() }); + + /* 
compute base addresses of affected pixel window */ + Rect const src_rect = snapped_to_16x16_grid(rect); + Rect const dst_rect = transformed(src_rect, texture.size(), rotate, flip); + + uint32_t const * const src = (uint32_t const *)texture.pixel() + + src_rect.y1()*texture.size().w + + src_rect.x1(); + + uint32_t * const dst = (uint32_t *)surface.addr() + + dst_rect.y1()*surface.size().w + + dst_rect.x1(); + + /* coordinates converted to 16x16 units */ + unsigned const src_w = texture.size().w >> 4, + dst_w = surface.size().w >> 4, + w = src_rect.area.w >> 4, + h = src_rect.area.h >> 4; + + if (w && h) { + if (flip.enabled) + _b2f(dst, dst_w, src, src_w, w, h, rotate); + else + _b2f (dst, dst_w, src, src_w, w, h, rotate); + } + + surface.flush_pixels(dst_rect); + } +} + + +/**************** + ** Legacy API ** + ****************/ + +/** + * Blit memory from source buffer to destination buffer + * + * \param src address of source buffer + * \param src_w line length of source buffer in bytes + * \param dst address of destination buffer + * \param dst_w line length of destination buffer in bytes + * \param w number of bytes per line to copy + * \param h number of lines to copy + * + * This function works at a granularity of 16bit. + * If the source and destination overlap, the result + * of the copy operation is not defined. + */ +extern "C" void blit(void const *src, unsigned src_w, + void *dst, unsigned dst_w, int w, int h); + +#endif /* _INCLUDE__BLIT__TYPES_H_ */ diff --git a/repos/os/include/spec/arm_64/blit/blit.h b/repos/os/include/spec/arm_64/blit/blit.h new file mode 100644 index 00000000000..8a62d8ca8a2 --- /dev/null +++ b/repos/os/include/spec/arm_64/blit/blit.h @@ -0,0 +1,25 @@ +/* + * \brief Blit API + * \author Norman Feske + * \date 2025-01-16 + */ + +/* + * Copyright (C) 2025 Genode Labs GmbH + * + * This file is part of the Genode OS framework, which is distributed + * under the terms of the GNU Affero General Public License version 3. 
+ */ + +#ifndef _INCLUDE__SPEC__ARM_64__BLIT_H_ +#define _INCLUDE__SPEC__ARM_64__BLIT_H_ + +#include +#include + +namespace Blit { + + static inline void back2front(auto &&... args) { _b2f(args...); } +} + +#endif /* _INCLUDE__SPEC__ARM_64__BLIT_H_ */ diff --git a/repos/os/include/spec/x86_64/blit/blit.h b/repos/os/include/spec/x86_64/blit/blit.h new file mode 100644 index 00000000000..ae76e192711 --- /dev/null +++ b/repos/os/include/spec/x86_64/blit/blit.h @@ -0,0 +1,25 @@ +/* + * \brief Blit API + * \author Norman Feske + * \date 2025-01-16 + */ + +/* + * Copyright (C) 2025 Genode Labs GmbH + * + * This file is part of the Genode OS framework, which is distributed + * under the terms of the GNU Affero General Public License version 3. + */ + +#ifndef _INCLUDE__SPEC__X86_64__BLIT_H_ +#define _INCLUDE__SPEC__X86_64__BLIT_H_ + +#include +#include + +namespace Blit { + + static inline void back2front(auto &&... args) { _b2f(args...); } +} + +#endif /* _INCLUDE__SPEC__X86_64__BLIT_H_ */ diff --git a/repos/os/recipes/pkg/test-blit/README b/repos/os/recipes/pkg/test-blit/README new file mode 100644 index 00000000000..641e69c4db2 --- /dev/null +++ b/repos/os/recipes/pkg/test-blit/README @@ -0,0 +1 @@ +Scenario for testing 2D blitting operations diff --git a/repos/os/recipes/pkg/test-blit/archives b/repos/os/recipes/pkg/test-blit/archives new file mode 100644 index 00000000000..b0a06a1e854 --- /dev/null +++ b/repos/os/recipes/pkg/test-blit/archives @@ -0,0 +1 @@ +_/src/test-blit diff --git a/repos/os/recipes/pkg/test-blit/runtime b/repos/os/recipes/pkg/test-blit/runtime new file mode 100644 index 00000000000..bff0d79325f --- /dev/null +++ b/repos/os/recipes/pkg/test-blit/runtime @@ -0,0 +1,13 @@ + + + + [init] --- blit test finished --- + + + + + + + + + diff --git a/repos/os/recipes/src/test-blit/content.mk b/repos/os/recipes/src/test-blit/content.mk new file mode 100644 index 00000000000..c1f82975735 --- /dev/null +++ b/repos/os/recipes/src/test-blit/content.mk @@ -0,0 
+1,2 @@ +SRC_DIR = src/test/blit +include $(GENODE_DIR)/repos/base/recipes/src/content.inc diff --git a/repos/os/recipes/src/test-blit/hash b/repos/os/recipes/src/test-blit/hash new file mode 100644 index 00000000000..69a82a36d44 --- /dev/null +++ b/repos/os/recipes/src/test-blit/hash @@ -0,0 +1 @@ +2024-12-10 67b1a1ad0dddcdc22dd6e266309f8221ff30173f diff --git a/repos/os/recipes/src/test-blit/used_apis b/repos/os/recipes/src/test-blit/used_apis new file mode 100644 index 00000000000..ec3bf565df2 --- /dev/null +++ b/repos/os/recipes/src/test-blit/used_apis @@ -0,0 +1,2 @@ +base +os diff --git a/repos/os/src/test/blit/main.cc b/repos/os/src/test/blit/main.cc new file mode 100644 index 00000000000..9d4336e61d5 --- /dev/null +++ b/repos/os/src/test/blit/main.cc @@ -0,0 +1,289 @@ +/* + * \brief Blitting test + * \author Norman Feske + * \date 2025-01-16 + */ + +/* + * Copyright (C) 2025 Genode Labs GmbH + * + * This file is part of the Genode OS framework, which is distributed + * under the terms of the GNU Affero General Public License version 3. 
+ */
+/*
+ * NOTE(review): the header names of the four include directives below are
+ * missing, presumably lost during text extraction - restore them from the
+ * original commit.
+ */
+#include
+#include
+#include
+#include
+
+using namespace Blit;
+
+
+/*******************************
+ ** Low-level SIMD operations **
+ *******************************/
+/**
+ * W x H image of 32-bit pixel values, used by the tests below as source,
+ * SIMD result, and reference result
+ *
+ * NOTE(review): the parameter list after 'template' is missing, presumably
+ * 'unsigned W, unsigned H' judging from the uses of W and H - confirm.
+ */
+template
+struct Image
+{
+	static constexpr unsigned w = W, h = H;
+
+	uint32_t pixels[W*H];   /* pixel (x,y) is stored at index y*W + x */
+	/**
+	 * Print the image as text, one image row per text line, limited to
+	 * the first 25 columns
+	 *
+	 * A zero pixel is printed as " .". Any other pixel is printed as a
+	 * space followed by two characters: 'A' plus the low 6 bits of the
+	 * value, and 'A' plus bits 16..21 of the value.
+	 */
+	void print(Output &out) const
+	{
+		using Genode::print;
+		for (unsigned y = 0; y < H; y++) {
+			for (unsigned x = 0; x < min(25u, W); x++) {
+				uint32_t v = pixels[y*W+x];
+				if (v)
+					print(out, " ", Char('A' + (v&63)), Char(char('A' + ((v>>16)&63))));
+				else
+					print(out, " .");
+			}
+			if (y < H-1) print(out, "\n");   /* no newline after the last row */
+		}
+	}
+	/**
+	 * Return true if at least one pixel differs from 'other'
+	 */
+	bool operator != (Image const &other)
+	{
+		for (unsigned i = 0; i < W*H; i++)
+			if (other.pixels[i] != pixels[i])
+				return true;
+		return false;
+	}
+	/**
+	 * Return image in which each pixel holds its own position, the
+	 * y coordinate shifted left by 16 bits or'ed with the x coordinate
+	 */
+	static Image pattern()
+	{
+		Image image { };
+		for (unsigned y = 0; y < H; y++)
+			for (unsigned x = 0; x < W; x++)
+				image.pixels[y*W + x] = (y << 16) | x;
+		return image;
+	}
+};
+
+/**
+ * Execute FN of both the SIMD implementation and 'Slow', log the SIMD
+ * result, and throw 1 if the two results differ
+ *
+ * The macro refers to an image named 'src' that must exist at the place of
+ * expansion. The destination width is passed divided by 16, W and H are
+ * passed as they are.
+ *
+ * NOTE(review): 'Image dst { }, ref { }' names no template arguments,
+ * presumably DST_W and DST_H were meant to be used here as both macro
+ * parameters are otherwise unused - confirm.
+ */
+#define TEST_LANDSCAPE(SIMD, FN, DST_W, DST_H, W, H) \
+{ \
+	Image dst { }, ref { }; \
+	SIMD:: FN(dst.pixels, dst.w/16, src.pixels, W, H); \
+	Slow:: FN(ref.pixels, ref.w/16, src.pixels, W, H); \
+	log(#FN, ":\n", dst); \
+	if (dst != ref) { \
+		error("", #FN, " failed"); \
+		throw 1; \
+	} \
+}
+
+/**
+ * Same check as TEST_LANDSCAPE for functions that take a separate source
+ * line width, which is passed as src.w/16 in addition to dst.w/16
+ *
+ * NOTE(review): as above, the 'Image' declaration names no template
+ * arguments - confirm against the original commit.
+ */
+#define TEST_PORTRAIT(SIMD, FN, DST_W, DST_H, W, H) \
+{ \
+	Image dst { }, ref { }; \
+	SIMD:: FN(dst.pixels, dst.w/16, src.pixels, src.w/16, W, H); \
+	Slow:: FN(ref.pixels, ref.w/16, src.pixels, src.w/16, W, H); \
+	log(#FN, ":\n", dst); \
+	if (dst != ref) { \
+		error("", #FN, " failed"); \
+		throw 1; \
+	} \
+}
+
+/**
+ * Compare the r0, r90, r180, and r270 functions of SIMD::B2f and
+ * SIMD::B2f_flip against their 'Slow' counterparts, using a 48x32 pattern
+ * image as source
+ *
+ * NOTE(review): the parameter list after 'template' is missing, presumably
+ * a single type parameter named SIMD - confirm.
+ */
+template
+static void test_simd_b2f()
+{
+	static Image<48,32> const src = Image<48,32>::pattern();
+
+	log("source image:\n", src);
+
+	TEST_LANDSCAPE ( SIMD, B2f     ::r0,   48, 32, 1, 2 );
+	TEST_LANDSCAPE ( SIMD, B2f_flip ::r0,  48, 32, 1, 2 );
+	TEST_PORTRAIT  ( SIMD, B2f     ::r90,  32, 48, 2, 1 );
+	TEST_PORTRAIT  ( SIMD, B2f_flip ::r90, 32, 48, 2, 1 );
+	TEST_LANDSCAPE ( SIMD, B2f     ::r180, 48, 32, 1, 2 );
+	TEST_LANDSCAPE ( SIMD, B2f_flip::r180, 48, 32, 1, 2 );
+	TEST_PORTRAIT  ( SIMD, B2f     ::r270, 32, 48, 2, 1 );
+	TEST_PORTRAIT  ( SIMD, B2f_flip::r270, 32, 48, 2, 1 );
+}
+
+
+/****************************************
+ ** Back-to-front argument dispatching **
+ ****************************************/
+/**
+ * Substitute for a blit implementation that stores the arguments of the
+ * most recent call in 'recorded' instead of accessing any pixels
+ */
+struct Recorded
+{
+	struct Args
+	{
+		uint32_t       *dst;
+		unsigned        dst_w;
+		uint32_t const *src;
+		unsigned        src_w;
+		unsigned        w, h;
+
+		bool operator != (Args const &other) const
+		{
+			return dst   != other.dst
+			    || dst_w != other.dst_w
+			    || src   != other.src
+			    || src_w != other.src_w
+			    || w     != other.w
+			    || h     != other.h;
+		}
+		/**
+		 * Print "invalid" if all members equal those of 'Args { }',
+		 * otherwise print the recorded values
+		 */
+		void print(Output &out) const
+		{
+			bool const valid = (*this != Args { });
+			if (!valid) {
+				Genode::print(out, "invalid");
+				return;
+			}
+
+			/* print src and dst pointer values in units of uint32_t words */
+			Genode::print(out, "dst=", Hex(addr_t(dst)/4), " dst_w=", dst_w,
+			              " src=", Hex(addr_t(src)/4), " src_w=", src_w, " w=", w, " h=", h);
+		}
+	};
+
+	static Args recorded;
+	/**
+	 * Record a call that passes a single line width, which is stored as
+	 * both 'dst_w' and 'src_w'
+	 */
+	static void _record(uint32_t *dst, unsigned line_w,
+	                    uint32_t const *src, unsigned w, unsigned h)
+	{
+		recorded = { dst, line_w, src, line_w, w, h };
+	}
+	/**
+	 * Record a call that passes distinct destination and source line widths
+	 */
+	static void _record(uint32_t *dst, unsigned dst_w,
+	                    uint32_t const *src, unsigned src_w, unsigned w, unsigned h)
+	{
+		recorded = { dst, dst_w, src, src_w, w, h };
+	}
+	/* all rotation functions of 'B2f' merely record their arguments */
+	struct B2f
+	{
+		static inline void r0   (auto &&... args) { _record(args...); }
+		static inline void r90  (auto &&... args) { _record(args...); }
+		static inline void r180 (auto &&... args) { _record(args...); }
+		static inline void r270 (auto &&... args) { _record(args...); }
+	};
+	/* all rotation functions of 'B2f_flip' merely record their arguments */
+	struct B2f_flip
+	{
+		static inline void r0  (auto &&... args) { _record(args...); }
+		static inline void r90 (auto &&... args) { _record(args...); }
+		static inline void r180(auto &&... args) { _record(args...); }
+		static inline void r270(auto &&... args) { _record(args...); }
+	};
+};
+
+
+Recorded::Args Recorded::recorded { };
+
+namespace Blit {
+	/**
+	 * Return printable name of rotation 'r', "invalid" if 'r' matches
+	 * none of the four handled values
+	 */
+	static inline const char *name(Rotate r)
+	{
+		switch (r) {
+		case Rotate::R0:   return "R0";
+		case Rotate::R90:  return "R90";
+		case Rotate::R180: return "R180";
+		case Rotate::R270: return "R270";
+		}
+		return "invalid";
+	}
+}
+
+/**
+ * Check that '_b2f' turns surface, texture, rect, rotation, and flip
+ * arguments into the expected low-level call arguments as captured in
+ * 'Recorded::recorded', throwing 1 on the first mismatch
+ *
+ * Textures and surface are constructed with null pointers, presumably so
+ * that recorded addresses can be compared as plain offsets - confirm.
+ *
+ * NOTE(review): 'Texture', 'Surface', and the '_b2f' call carry no template
+ * arguments here, presumably lost during text extraction - restore them
+ * from the original commit.
+ */
+static void test_b2f_dispatch()
+{
+	Texture texture_landscape { nullptr, nullptr, { 640, 480 } };
+	Texture texture_portrait  { nullptr, nullptr, { 480, 640 } };
+	Surface surface           { nullptr,          { 640, 480 } };
+
+	struct Expected : Recorded::Args { };
+	/* build expected arguments, 'dst' and 'src' are given in units of uint32_t words */
+	auto expected = [&] (addr_t dst, unsigned dst_w, addr_t src, unsigned src_w,
+	                     unsigned w, unsigned h)
+	{
+		return Expected { (uint32_t *)(4*dst), dst_w,
+		                  (uint32_t *)(4*src), src_w, w, h };
+	};
+
+	using Rect = Blit::Rect;
+	/* reset the record, call '_b2f', log and compare the recorded arguments */
+	auto test = [&] (Texture const &texture,
+	                 Rect rect, Rotate rotate, Flip flip,
+	                 Expected const &expected)
+	{
+		Recorded::recorded = { };
+		_b2f(surface, texture, rect, rotate, flip);
+		log("b2f: ", rect, " ", name(rotate), flip.enabled ? " flip" : "",
+		    " -> ", Recorded::recorded);
+		if (Recorded::recorded != expected) {
+			error("test_b2f_dispatch failed, expected: ", expected);
+			throw 1;
+		}
+	};
+
+	log("offset calculation of destination window");
+	{
+		unsigned const x = 32, y = 16, w = 64, h = 48;
+
+		addr_t const src_landscape_ptr = y*640 + x,
+		             src_portrait_ptr  = y*480 + x;
+
+		Rect const rect { { x, y }, { w, h } };
+
+		test(texture_landscape, rect, Rotate::R0, Flip { },
+		     expected(y*640 + x, 40, src_landscape_ptr, 40, 4, 3));
+		test(texture_landscape, rect, Rotate::R0, Flip { true },
+		     expected(y*640 + 640 - w - x, 40, src_landscape_ptr, 40, 4, 3));
+		test(texture_portrait, rect, Rotate::R90, Flip { },
+		     expected(x*640 + 640 - h - y, 40, src_portrait_ptr, 30, 4, 3));
+		test(texture_portrait, rect, Rotate::R90, Flip { true },
+		     expected(x*640 + y, 40, src_portrait_ptr, 30, 4, 3));
+		test(texture_landscape, rect, Rotate::R180, Flip { },
+		     expected((480 - y - h)*640 + 640 - x - w, 40, src_landscape_ptr, 40, 4, 3));
+		test(texture_landscape, rect, Rotate::R180, Flip { true },
+		     expected((480 - y - h)*640 + x, 40, src_landscape_ptr, 40, 4, 3));
+		test(texture_portrait, rect, Rotate::R270, Flip { },
+		     expected((480 - x - w)*640 + y, 40, src_portrait_ptr, 30, 4, 3));
+		test(texture_portrait, rect, Rotate::R270, Flip { true },
+		     expected((480 - x - w)*640 + 640 - y - h, 40, src_portrait_ptr, 30, 4, 3));
+	}
+
+	log("check for compatibility of surface and texture");
+	test(texture_portrait, { { }, { 16, 16 } }, Rotate::R0, Flip { },
+	     expected(0, 0, 0, 0, 0, 0));
+
+	log("clamp rect to texture size");
+	test(texture_landscape, { { -99, -99 }, { 999, 999 } }, Rotate::R0, Flip { },
+	     expected(0, 40, 0, 40, 40, 30));
+
+	log("ignore out-of-bounds rect");
+	test(texture_landscape, { { 1000, 0 }, { 16, 16 } }, Rotate::R0, Flip { },
+	     expected(0, 0, 0, 0, 0, 0));
+
+	/* snap to grid */
+	log("snap rect argument to 16x16 grid");
+	test(texture_landscape, { { 31, 63 }, { 2, 2 } }, Rotate::R0, Flip { },
+	     expected(48*640 + 16, 40, 48*640 + 16, 40, 2, 2));
+}
+
+/**
+ * Run the SIMD comparison once per SIMD variant whose macro is defined
+ * (the macro names suggest the include guards of the NEON and SSE3
+ * headers - confirm), then run the dispatch test and log completion
+ *
+ * NOTE(review): both 'test_simd_b2f()' calls lack a template argument,
+ * presumably lost during text extraction - restore them from the original
+ * commit.
+ */
+void Component::construct(Genode::Env &)
+{
+#ifdef _INCLUDE__BLIT__INTERNAL__NEON_H_
+	log("-- ARM Neon --");
+	test_simd_b2f();
+#endif
+#ifdef _INCLUDE__BLIT__INTERNAL__SSE3_H_
+	log("-- SSE3 --");
+	test_simd_b2f();
+#endif
+
+	test_b2f_dispatch();
+
+	log("--- blit test finished ---");
+}
diff --git a/repos/os/src/test/blit/target.mk b/repos/os/src/test/blit/target.mk
new file mode 100644
index 00000000000..5f1a2b62282
--- /dev/null
+++ b/repos/os/src/test/blit/target.mk
@@ -0,0 +1,3 @@
+TARGET = test-blit
+SRC_CC = main.cc
+LIBS = base