Skip to content

Commit

Permalink
blit: SIMD-based back2front copy
Browse files Browse the repository at this point in the history
Issue #5428
  • Loading branch information
nfeske committed Jan 30, 2025
1 parent 10a45b7 commit 0a756e4
Show file tree
Hide file tree
Showing 16 changed files with 1,265 additions and 22 deletions.
72 changes: 50 additions & 22 deletions repos/os/include/blit/blit.h
Original file line number Diff line number Diff line change
@@ -1,34 +1,62 @@
/*
* \brief Interface of 2D-copy library
* \brief Blit API
* \author Norman Feske
* \date 2007-10-10
* \date 2025-01-16
*/

/*
* Copyright (C) 2007-2017 Genode Labs GmbH
* Copyright (C) 2025 Genode Labs GmbH
*
* This file is part of the Genode OS framework, which is distributed
* under the terms of the GNU Affero General Public License version 3.
*/

#ifndef _INCLUDE__BLIT__BLIT_H_
#define _INCLUDE__BLIT__BLIT_H_
#ifndef _INCLUDE__BLIT_H_
#define _INCLUDE__BLIT_H_

/**
* Blit memory from source buffer to destination buffer
*
* \param src address of source buffer
* \param src_w line length of source buffer in bytes
* \param dst address of destination buffer
* \param dst_w line length of destination buffer in bytes
* \param w number of bytes per line to copy
* \param h number of lines to copy
*
* This function works at a granularity of 16bit.
* If the source and destination overlap, the result
* of the copy operation is not defined.
*/
extern "C" void blit(void const *src, unsigned src_w,
void *dst, unsigned dst_w, int w, int h);
#include <blit/types.h>
#include <blit/internal/slow.h>

namespace Blit {

/**
* Back-to-front copy
*
* Copy a rectangular part of a texture to a surface while optionally
* applying rotation and flipping. The width and height of the texture
* must be divisible by 8. Surface and texture must perfectly line up.
* E.g., when rotating by 90 degrees, the texture width must equal the
* surface height and vice versa. The clipping area of the surface is
* ignored.
*
* The combination of rotate and flip arguments works as follows:
*
* normal flipped
*
* rotated 0 0 1 2 3 3 2 1 0
* 4 5 6 7 7 6 5 4
* 8 9 10 11 11 10 9 8
*
* rotated 90 8 4 0 0 4 8
* 9 5 1 1 5 9
* 10 6 2 2 6 10
* 11 7 3 3 7 11
*
* rotated 180 11 10 9 8 8 9 10 11
* 7 6 5 4 4 5 6 7
* 3 2 1 0 0 1 2 3
*
* rotated 270 3 7 11 11 7 3
* 2 6 10 10 6 2
* 1 5 9 9 5 1
* 0 4 8 8 4 0
*/
static inline void back2front(Surface<Pixel_rgb888> &surface,
Texture<Pixel_rgb888> const &texture,
Rect rect, Rotate rotate, Flip flip)
{
_b2f<Slow>(surface, texture, rect, rotate, flip);
}
}

#endif /* _INCLUDE__BLIT__BLIT_H_ */
#endif /* _INCLUDE__BLIT_H_ */
294 changes: 294 additions & 0 deletions repos/os/include/blit/internal/neon.h
Original file line number Diff line number Diff line change
@@ -0,0 +1,294 @@
/*
* \brief 2D memory copy using ARM NEON
* \author Norman Feske
* \date 2025-01-16
*/

/*
* Copyright (C) 2025 Genode Labs GmbH
*
* This file is part of the Genode OS framework, which is distributed
* under the terms of the GNU Affero General Public License version 3.
*/

#ifndef _INCLUDE__BLIT__INTERNAL__NEON_H_
#define _INCLUDE__BLIT__INTERNAL__NEON_H_

#include <blit/types.h>

/* compiler intrinsics */
#pragma GCC diagnostic push
#pragma GCC diagnostic ignored "-Wnarrowing"
#pragma GCC diagnostic ignored "-Wconversion"
#pragma GCC diagnostic ignored "-Wfloat-conversion"
#pragma GCC diagnostic ignored "-Wunused-parameter"
#include <arm_neon.h>
#pragma GCC diagnostic pop


namespace Blit { struct Neon; }


struct Blit::Neon
{
static inline uint32x4_t _reversed(uint32x4_t const v)
{
return vrev64q_u32(vcombine_u32(vget_high_u32(v), vget_low_u32(v)));
}

static inline void _reverse_line(uint32x4_t const *src, uint32x4_t *dst, unsigned len)
{
src += len; /* move 'src' from end of line towards begin */
while (len--)
*dst++ = _reversed(*--src);
};

static inline void _copy_line(uint32x4_t const *s, uint32x4_t *d, unsigned len)
{
while (len--)
*d++ = *s++;
};

struct Src_ptr4
{
uint32x4_t const *p0, *p1, *p2, *p3;

inline Src_ptr4(uint32x4_t const *p, int const step)
:
p0(p), p1(p0 + step), p2(p1 + step), p3(p2 + step)
{ }

void incr_4(int const v) { p0 += v, p1 += v, p2 += v, p3 += v; }

void prefetch() const
{
__builtin_prefetch(p0); __builtin_prefetch(p1);
__builtin_prefetch(p2); __builtin_prefetch(p3);
}

void load(uint32x4x4_t &tile) const { tile = { *p0, *p1, *p2, *p3 }; }
};

struct Dst_ptr4
{
uint32_t *p0, *p1, *p2, *p3;

Dst_ptr4(uint32_t *p, int const step)
:
p0(p), p1(p0 + step), p2(p1 + step), p3(p2 + step)
{ }

void incr(int const v) { p0 += v, p1 += v, p2 += v, p3 += v; }

void store(uint32x4x4_t const &tile)
{
vst4q_lane_u32(p0, tile, 0);
vst4q_lane_u32(p1, tile, 1);
vst4q_lane_u32(p2, tile, 2);
vst4q_lane_u32(p3, tile, 3);
}
};

struct Steps
{
int const src_y, dst_y;

void incr_x_4(Src_ptr4 &p) const { p.incr_4(1); };
void incr_x_8(Src_ptr4 &p) const { p.incr_4(2); };
void incr_y_4(Src_ptr4 &p) const { p.incr_4(src_y << 2); };
void incr_y_8(Src_ptr4 &p) const { p.incr_4(src_y << 3); };

void incr_x_4(Dst_ptr4 &p) const { p.incr(4); };
void incr_x_8(Dst_ptr4 &p) const { p.incr(8); };
void incr_y_4(Dst_ptr4 &p) const { p.incr(dst_y << 2); };
void incr_y_8(Dst_ptr4 &p) const { p.incr(dst_y << 3); };
};

__attribute__((optimize("-O3")))
static inline void _load_prefetch_store(Src_ptr4 &src, Dst_ptr4 &dst, Steps const steps)
{
uint32x4x4_t tile;
src.load(tile);
steps.incr_y_4(src);
src.prefetch();
dst.store(tile);
steps.incr_x_4(dst);
}

__attribute__((optimize("-O3")))
static inline void _rotate_8x4(Src_ptr4 src, Dst_ptr4 dst, Steps const steps)
{
for (unsigned i = 0; i < 2; i++)
_load_prefetch_store(src, dst, steps);
}

__attribute__((optimize("-O3")))
static inline void _rotate_8x4_last(Src_ptr4 src, Dst_ptr4 dst, Steps const steps)
{
_load_prefetch_store(src, dst, steps);
uint32x4x4_t tile;
src.load(tile);
dst.store(tile);
}

__attribute__((optimize("-O3")))
static inline void _rotate_8x8(Src_ptr4 src, Dst_ptr4 dst, Steps const steps)
{
_rotate_8x4(src, dst, steps);
steps.incr_y_4(dst);
steps.incr_x_4(src);
_rotate_8x4_last(src, dst, steps);
}

__attribute__((optimize("-O3")))
static inline void _rotate_8_lines(Src_ptr4 src, Dst_ptr4 dst,
Steps const steps, unsigned n)
{
for (; n; n--) {
_rotate_8x8(src, dst, steps);
steps.incr_y_8(dst);
steps.incr_x_8(src);
}
};

static inline void _rotate(Src_ptr4 src, Dst_ptr4 dst,
Steps const steps, unsigned w, unsigned h)
{
for (unsigned i = h; i; i--) {
_rotate_8_lines(src, dst, steps, w);
steps.incr_y_8(src);
steps.incr_x_8(dst);
}
}

struct B2f;
struct B2f_flip;
};


struct Blit::Neon::B2f
{
static inline void r0 (uint32_t *, unsigned, uint32_t const *, unsigned, unsigned);
static inline void r90 (uint32_t *, unsigned, uint32_t const *, unsigned, unsigned, unsigned);
static inline void r180(uint32_t *, unsigned, uint32_t const *, unsigned, unsigned);
static inline void r270(uint32_t *, unsigned, uint32_t const *, unsigned, unsigned, unsigned);
};


void Blit::Neon::B2f::r0(uint32_t *dst, unsigned const line_w,
uint32_t const *src, unsigned const w, unsigned const h)
{
uint32x4_t const *s = (uint32x4_t const *)src;
uint32x4_t *d = (uint32x4_t *)dst;

for (unsigned lines = h*8; lines; lines--) {
_copy_line(s, d, 2*w);
s += 2*line_w;
d += 2*line_w;
}
}


void Blit::Neon::B2f::r90(uint32_t *dst, unsigned const dst_w,
uint32_t const *src, unsigned const src_w,
unsigned const w, unsigned const h)
{
Steps const steps { -2*int(src_w), 8*int(dst_w) };

Src_ptr4 src_ptr4 ((uint32x4_t *)src + 2*src_w*(8*h - 1), steps.src_y);
Dst_ptr4 dst_ptr4 (dst, steps.dst_y);

_rotate(src_ptr4, dst_ptr4, steps, w, h);
}


void Blit::Neon::B2f::r180(uint32_t *dst, unsigned const line_w,
uint32_t const *src, unsigned const w, unsigned const h)
{
uint32x4_t *d = (uint32x4_t *)dst;
uint32x4_t const *s = (uint32x4_t const *)src + 2*line_w*8*h;

for (unsigned i = h*8; i; i--) {
s -= 2*line_w;
_reverse_line(s, d, 2*w);
d += 2*line_w;
}
}


void Blit::Neon::B2f::r270(uint32_t *dst, unsigned const dst_w,
uint32_t const *src, unsigned const src_w,
unsigned const w, const unsigned h)
{
Steps const steps { 2*int(src_w), -8*int(dst_w) };

Src_ptr4 src_ptr4 ((uint32x4_t *)src, steps.src_y);
Dst_ptr4 dst_ptr4 (dst + 8*int(dst_w)*(w*8 - 1), steps.dst_y);

_rotate(src_ptr4, dst_ptr4, steps, w, h);
}


struct Blit::Neon::B2f_flip
{
static inline void r0 (uint32_t *, unsigned, uint32_t const *, unsigned, unsigned);
static inline void r90 (uint32_t *, unsigned, uint32_t const *, unsigned, unsigned, unsigned);
static inline void r180(uint32_t *, unsigned, uint32_t const *, unsigned, unsigned);
static inline void r270(uint32_t *, unsigned, uint32_t const *, unsigned, unsigned, unsigned);
};


void Blit::Neon::B2f_flip::r0(uint32_t *dst, unsigned const line_w,
uint32_t const *src, unsigned const w, unsigned const h)
{
uint32x4_t const *s = (uint32x4_t const *)src;
uint32x4_t *d = (uint32x4_t *)dst;

for (unsigned lines = h*8; lines; lines--) {
_reverse_line(s, d, 2*w);
s += 2*line_w;
d += 2*line_w;
}
}


void Blit::Neon::B2f_flip::r90(uint32_t *dst, unsigned const dst_w,
uint32_t const *src, unsigned const src_w,
unsigned const w, unsigned const h)
{
Steps const steps { 2*int(src_w), 8*int(dst_w) };

Src_ptr4 src_ptr4 ((uint32x4_t *)src, steps.src_y);
Dst_ptr4 dst_ptr4 (dst, steps.dst_y);

_rotate(src_ptr4, dst_ptr4, steps, w, h);
}


void Blit::Neon::B2f_flip::r180(uint32_t *dst, unsigned const line_w,
uint32_t const *src, unsigned const w, unsigned const h)
{
uint32x4_t const *s = (uint32x4_t const *)src + 2*line_w*8*h;
uint32x4_t *d = (uint32x4_t *)dst;

for (unsigned lines = h*8; lines; lines--) {
s -= 2*line_w;
_copy_line(s, d, 2*w);
d += 2*line_w;
}
}


void Blit::Neon::B2f_flip::r270(uint32_t *dst, unsigned const dst_w,
uint32_t const *src, unsigned const src_w,
unsigned const w, const unsigned h)
{
Steps const steps { -2*int(src_w), -8*int(dst_w) };

Src_ptr4 src_ptr4 ((uint32x4_t *)src + 2*src_w*(8*h - 1), steps.src_y);
Dst_ptr4 dst_ptr4 (dst + 8*int(dst_w)*(w*8 - 1), steps.dst_y);

_rotate(src_ptr4, dst_ptr4, steps, w, h);
}

#endif /* _INCLUDE__BLIT__INTERNAL__NEON_H_ */
Loading

0 comments on commit 0a756e4

Please sign in to comment.