diff options
-rw-r--r-- | src/sna/blt.c | 324 | ||||
-rw-r--r-- | src/sna/kgem.c | 48 | ||||
-rw-r--r-- | src/sna/kgem.h | 24 | ||||
-rw-r--r-- | src/sna/sna.h | 7 | ||||
-rw-r--r-- | src/sna/sna_accel.c | 16 | ||||
-rw-r--r-- | src/sna/sna_io.c | 15 |
6 files changed, 312 insertions, 122 deletions
diff --git a/src/sna/blt.c b/src/sna/blt.c index af876672..4dbd9e86 100644 --- a/src/sna/blt.c +++ b/src/sna/blt.c @@ -213,12 +213,12 @@ memcpy_blt(const void *src, void *dst, int bpp, } } -fast_memcpy void -memcpy_to_tiled_x(const void *src, void *dst, int bpp, int swizzling, - int32_t src_stride, int32_t dst_stride, - int16_t src_x, int16_t src_y, - int16_t dst_x, int16_t dst_y, - uint16_t width, uint16_t height) +static fast_memcpy void +memcpy_to_tiled_x__swizzle_0(const void *src, void *dst, int bpp, + int32_t src_stride, int32_t dst_stride, + int16_t src_x, int16_t src_y, + int16_t dst_x, int16_t dst_y, + uint16_t width, uint16_t height) { const unsigned tile_width = 512; const unsigned tile_height = 8; @@ -226,14 +226,14 @@ memcpy_to_tiled_x(const void *src, void *dst, int bpp, int swizzling, const unsigned cpp = bpp / 8; const unsigned stride_tiles = dst_stride / tile_width; - const unsigned swizzle_pixels = (swizzling ? 64 : tile_width) / cpp; + const unsigned swizzle_pixels = tile_width / cpp; const unsigned tile_pixels = ffs(tile_width / cpp) - 1; const unsigned tile_mask = (1 << tile_pixels) - 1; unsigned x, y; - DBG(("%s(bpp=%d, swizzling=%d): src=(%d, %d), dst=(%d, %d), size=%dx%d, pitch=%d/%d\n", - __FUNCTION__, bpp, swizzling, src_x, src_y, dst_x, dst_y, width, height, src_stride, dst_stride)); + DBG(("%s(bpp=%d): src=(%d, %d), dst=(%d, %d), size=%dx%d, pitch=%d/%d\n", + __FUNCTION__, bpp, src_x, src_y, dst_x, dst_y, width, height, src_stride, dst_stride)); src = (const uint8_t *)src + src_y * src_stride + src_x * cpp; @@ -252,19 +252,71 @@ memcpy_to_tiled_x(const void *src, void *dst, int bpp, int swizzling, offset = tile_row + (dx >> tile_pixels) * tile_size + (dx & tile_mask) * cpp; - switch (swizzling) { - case I915_BIT_6_SWIZZLE_NONE: - break; - case I915_BIT_6_SWIZZLE_9: - offset ^= (offset >> 3) & 64; - break; - case I915_BIT_6_SWIZZLE_9_10: - offset ^= ((offset ^ (offset >> 1)) >> 3) & 64; - break; - case I915_BIT_6_SWIZZLE_9_11: - offset ^= ((offset ^ (offset >> 2)) >> 3) & 64; - break; - } + memcpy((char *)dst + offset, src_row, length * cpp); + + src_row += length * cpp; + x -= length * cpp; + dx += length; + } + while (x >= 512) { + assert((dx & tile_mask) == 0); + offset = tile_row + (dx >> tile_pixels) * tile_size; + + memcpy((char *)dst + offset, src_row, 512); + + src_row += 512; + x -= 512; + dx += swizzle_pixels; + } + if (x) { + offset = tile_row + + (dx >> tile_pixels) * tile_size + + (dx & tile_mask) * cpp; + memcpy((char *)dst + offset, src_row, x); + } + } +} + +fast_memcpy static void +memcpy_to_tiled_x__swizzle_9(const void *src, void *dst, int bpp, + int32_t src_stride, int32_t dst_stride, + int16_t src_x, int16_t src_y, + int16_t dst_x, int16_t dst_y, + uint16_t width, uint16_t height) +{ + const unsigned tile_width = 512; + const unsigned tile_height = 8; + const unsigned tile_size = 4096; + + const unsigned cpp = bpp / 8; + const unsigned stride_tiles = dst_stride / tile_width; + const unsigned swizzle_pixels = 64 / cpp; + const unsigned tile_pixels = ffs(tile_width / cpp) - 1; + const unsigned tile_mask = (1 << tile_pixels) - 1; + + unsigned x, y; + + DBG(("%s(bpp=%d): src=(%d, %d), dst=(%d, %d), size=%dx%d, pitch=%d/%d\n", + __FUNCTION__, bpp, src_x, src_y, dst_x, dst_y, width, height, src_stride, dst_stride)); + + src = (const uint8_t *)src + src_y * src_stride + src_x * cpp; + + for (y = 0; y < height; ++y) { + const uint32_t dy = y + dst_y; + const uint32_t tile_row = + (dy / tile_height * stride_tiles * tile_size + + (dy & (tile_height-1)) * tile_width); + const uint8_t *src_row = (const uint8_t *)src + src_stride * y; + uint32_t dx = dst_x, offset; + + x = width * cpp; + if (dx & (swizzle_pixels - 1)) { + const uint32_t swizzle_bound_pixels = ALIGN(dx + 1, swizzle_pixels); + const uint32_t length = min(dst_x + width, swizzle_bound_pixels) - dx; + offset = tile_row + + (dx >> tile_pixels) * tile_size + + (dx & tile_mask) * cpp; + offset ^= (offset >> 3) & 64; memcpy((char *)dst + offset, src_row, length * cpp); @@ -272,64 +324,184 @@ memcpy_to_tiled_x(const void *src, void *dst, int bpp, int swizzling, x -= length * cpp; dx += length; } - if (swizzling) { - while (x >= 64) { - offset = tile_row + - (dx >> tile_pixels) * tile_size + - (dx & tile_mask) * cpp; - switch (swizzling) { - case I915_BIT_6_SWIZZLE_9: - offset ^= (offset >> 3) & 64; - break; - case I915_BIT_6_SWIZZLE_9_10: - offset ^= ((offset ^ (offset >> 1)) >> 3) & 64; - break; - case I915_BIT_6_SWIZZLE_9_11: - offset ^= ((offset ^ (offset >> 2)) >> 3) & 64; - break; - } - - memcpy((char *)dst + offset, src_row, 64); - - src_row += 64; - x -= 64; - dx += swizzle_pixels; - } - } else { - while (x >= 512) { - assert((dx & tile_mask) == 0); - offset = tile_row + (dx >> tile_pixels) * tile_size; + while (x >= 64) { + offset = tile_row + + (dx >> tile_pixels) * tile_size + + (dx & tile_mask) * cpp; + offset ^= (offset >> 3) & 64; - memcpy((char *)dst + offset, src_row, 512); + memcpy((char *)dst + offset, src_row, 64); - src_row += 512; - x -= 512; - dx += swizzle_pixels; - } + src_row += 64; + x -= 64; + dx += swizzle_pixels; } if (x) { offset = tile_row + (dx >> tile_pixels) * tile_size + (dx & tile_mask) * cpp; - switch (swizzling) { - case I915_BIT_6_SWIZZLE_NONE: - break; - case I915_BIT_6_SWIZZLE_9: - offset ^= (offset >> 3) & 64; - break; - case I915_BIT_6_SWIZZLE_9_10: - offset ^= ((offset ^ (offset >> 1)) >> 3) & 64; - break; - case I915_BIT_6_SWIZZLE_9_11: - offset ^= ((offset ^ (offset >> 2)) >> 3) & 64; - break; - } + offset ^= (offset >> 3) & 64; + memcpy((char *)dst + offset, src_row, x); + } + } +} + +fast_memcpy static void +memcpy_to_tiled_x__swizzle_9_10(const void *src, void *dst, int bpp, + int32_t src_stride, int32_t dst_stride, + int16_t src_x, int16_t src_y, + int16_t dst_x, int16_t dst_y, + uint16_t width, uint16_t height) +{ + const unsigned tile_width = 512; + const unsigned tile_height = 8; + const unsigned tile_size = 4096; + + const unsigned cpp = bpp / 8; + const unsigned stride_tiles = dst_stride / tile_width; + const unsigned swizzle_pixels = 64 / cpp; + const unsigned tile_pixels = ffs(tile_width / cpp) - 1; + const unsigned tile_mask = (1 << tile_pixels) - 1; + + unsigned x, y; + + DBG(("%s(bpp=%d): src=(%d, %d), dst=(%d, %d), size=%dx%d, pitch=%d/%d\n", + __FUNCTION__, bpp, src_x, src_y, dst_x, dst_y, width, height, src_stride, dst_stride)); + + src = (const uint8_t *)src + src_y * src_stride + src_x * cpp; + for (y = 0; y < height; ++y) { + const uint32_t dy = y + dst_y; + const uint32_t tile_row = + (dy / tile_height * stride_tiles * tile_size + + (dy & (tile_height-1)) * tile_width); + const uint8_t *src_row = (const uint8_t *)src + src_stride * y; + uint32_t dx = dst_x, offset; + + x = width * cpp; + if (dx & (swizzle_pixels - 1)) { + const uint32_t swizzle_bound_pixels = ALIGN(dx + 1, swizzle_pixels); + const uint32_t length = min(dst_x + width, swizzle_bound_pixels) - dx; + offset = tile_row + + (dx >> tile_pixels) * tile_size + + (dx & tile_mask) * cpp; + offset ^= ((offset ^ (offset >> 1)) >> 3) & 64; + + memcpy((char *)dst + offset, src_row, length * cpp); + + src_row += length * cpp; + x -= length * cpp; + dx += length; + } + while (x >= 64) { + offset = tile_row + + (dx >> tile_pixels) * tile_size + + (dx & tile_mask) * cpp; + offset ^= ((offset ^ (offset >> 1)) >> 3) & 64; + + memcpy((char *)dst + offset, src_row, 64); + + src_row += 64; + x -= 64; + dx += swizzle_pixels; + } + if (x) { + offset = tile_row + + (dx >> tile_pixels) * tile_size + + (dx & tile_mask) * cpp; + offset ^= ((offset ^ (offset >> 1)) >> 3) & 64; memcpy((char *)dst + offset, src_row, x); } } } +fast_memcpy static void +memcpy_to_tiled_x__swizzle_9_11(const void *src, void *dst, int bpp, + int32_t src_stride, int32_t dst_stride, + int16_t src_x, int16_t src_y, + int16_t dst_x, int16_t dst_y, + uint16_t width, uint16_t height) +{ + const unsigned tile_width = 512; + const unsigned tile_height = 8; + const unsigned tile_size = 4096; + + const unsigned cpp = bpp / 8; + const unsigned stride_tiles = dst_stride / tile_width; + const unsigned swizzle_pixels = 64 / cpp; + const unsigned tile_pixels = ffs(tile_width / cpp) - 1; + const unsigned tile_mask = (1 << tile_pixels) - 1; + + unsigned x, y; + + DBG(("%s(bpp=%d): src=(%d, %d), dst=(%d, %d), size=%dx%d, pitch=%d/%d\n", + __FUNCTION__, bpp, src_x, src_y, dst_x, dst_y, width, height, src_stride, dst_stride)); + + src = (const uint8_t *)src + src_y * src_stride + src_x * cpp; + + for (y = 0; y < height; ++y) { + const uint32_t dy = y + dst_y; + const uint32_t tile_row = + (dy / tile_height * stride_tiles * tile_size + + (dy & (tile_height-1)) * tile_width); + const uint8_t *src_row = (const uint8_t *)src + src_stride * y; + uint32_t dx = dst_x, offset; + + x = width * cpp; + if (dx & (swizzle_pixels - 1)) { + const uint32_t swizzle_bound_pixels = ALIGN(dx + 1, swizzle_pixels); + const uint32_t length = min(dst_x + width, swizzle_bound_pixels) - dx; + offset = tile_row + + (dx >> tile_pixels) * tile_size + + (dx & tile_mask) * cpp; + offset ^= ((offset ^ (offset >> 2)) >> 3) & 64; + memcpy((char *)dst + offset, src_row, length * cpp); + + src_row += length * cpp; + x -= length * cpp; + dx += length; + } + while (x >= 64) { + offset = tile_row + + (dx >> tile_pixels) * tile_size + + (dx & tile_mask) * cpp; + offset ^= ((offset ^ (offset >> 2)) >> 3) & 64; + + memcpy((char *)dst + offset, src_row, 64); + + src_row += 64; + x -= 64; + dx += swizzle_pixels; + } + if (x) { + offset = tile_row + + (dx >> tile_pixels) * tile_size + + (dx & tile_mask) * cpp; + offset ^= ((offset ^ (offset >> 2)) >> 3) & 64; + memcpy((char *)dst + offset, src_row, x); + } + } +} + +void choose_memcpy_to_tiled_x(struct kgem *kgem, int swizzling) +{ + switch (swizzling) { + default: + case I915_BIT_6_SWIZZLE_NONE: + kgem->memcpy_to_tiled_x = memcpy_to_tiled_x__swizzle_0; + break; + case I915_BIT_6_SWIZZLE_9: + kgem->memcpy_to_tiled_x = memcpy_to_tiled_x__swizzle_9; + break; + case I915_BIT_6_SWIZZLE_9_10: + kgem->memcpy_to_tiled_x = memcpy_to_tiled_x__swizzle_9_10; + break; + case I915_BIT_6_SWIZZLE_9_11: + kgem->memcpy_to_tiled_x = memcpy_to_tiled_x__swizzle_9_11; + break; + } +} + void memmove_box(const void *src, void *dst, int bpp, int32_t stride, @@ -561,10 +733,10 @@ memcpy_xor(const void *src, void *dst, int bpp, while (i >= 16) { __m128i xmm1, xmm2, xmm3, xmm4; - xmm1 = xmm_load_128u((__m128i*)s + 0); - xmm2 = xmm_load_128u((__m128i*)s + 1); - xmm3 = xmm_load_128u((__m128i*)s + 2); - xmm4 = xmm_load_128u((__m128i*)s + 3); + xmm1 = xmm_load_128u((const __m128i*)s + 0); + xmm2 = xmm_load_128u((const __m128i*)s + 1); + xmm3 = xmm_load_128u((const __m128i*)s + 2); + xmm4 = xmm_load_128u((const __m128i*)s + 3); xmm_save_128((__m128i*)d + 0, _mm_or_si128(xmm1, mask)); @@ -583,8 +755,8 @@ memcpy_xor(const void *src, void *dst, int bpp, if (i & 8) { __m128i xmm1, xmm2; - xmm1 = xmm_load_128u((__m128i*)s + 0); - xmm2 = xmm_load_128u((__m128i*)s + 1); + xmm1 = xmm_load_128u((const __m128i*)s + 0); + xmm2 = xmm_load_128u((const __m128i*)s + 1); xmm_save_128((__m128i*)d + 0, _mm_or_si128(xmm1, mask)); @@ -597,7 +769,7 @@ memcpy_xor(const void *src, void *dst, int bpp, if (i & 4) { xmm_save_128((__m128i*)d, - _mm_or_si128(xmm_load_128u((__m128i*)s), + _mm_or_si128(xmm_load_128u((const __m128i*)s), mask)); d += 4; @@ -643,7 +815,7 @@ memcpy_xor(const void *src, void *dst, int bpp, case 2: do { uint16_t *d = (uint16_t *)dst_bytes; - uint16_t *s = (uint16_t *)src_bytes; + const uint16_t *s = (const uint16_t *)src_bytes; for (i = 0; i < width; i++) d[i] = (s[i] & and) | or; @@ -656,7 +828,7 @@ memcpy_xor(const void *src, void *dst, int bpp, case 4: do { uint32_t *d = (uint32_t *)dst_bytes; - uint32_t *s = (uint32_t *)src_bytes; + const uint32_t *s = (const uint32_t *)src_bytes; for (i = 0; i < width; i++) d[i] = (s[i] & and) | or; diff --git a/src/sna/kgem.c b/src/sna/kgem.c index 66dce479..b32ceee8 100644 --- a/src/sna/kgem.c +++ b/src/sna/kgem.c @@ -964,6 +964,39 @@ err: return false; } +static void kgem_init_swizzling(struct kgem *kgem) +{ + struct drm_i915_gem_get_tiling tiling; + +#ifndef __x86_64__ + /* Between a register starved compiler emitting attrocious code + * and the extra overhead in the kernel for managing the tight + * 32-bit address space, unless we have a 64-bit system, + * using memcpy_to_tiled_x() is extremely slow. + */ + return; +#endif + + if (kgem->gen < 050) /* bit17 swizzling :( */ + return; + + VG_CLEAR(tiling); + tiling.handle = gem_create(kgem->fd, 1); + if (!tiling.handle) + return; + + if (!gem_set_tiling(kgem->fd, tiling.handle, I915_TILING_X, 512)) + goto out; + + if (drmIoctl(kgem->fd, DRM_IOCTL_I915_GEM_GET_TILING, &tiling)) + goto out; + + choose_memcpy_to_tiled_x(kgem, tiling.swizzle_mode); +out: + gem_close(kgem->fd, tiling.handle); +} + + void kgem_init(struct kgem *kgem, int fd, struct pci_device *dev, unsigned gen) { struct drm_i915_gem_get_aperture aperture; @@ -1212,6 +1245,8 @@ void kgem_init(struct kgem *kgem, int fd, struct pci_device *dev, unsigned gen) kgem->batch_flags_base |= LOCAL_I915_EXEC_HANDLE_LUT; if (kgem->has_pinned_batches) kgem->batch_flags_base |= LOCAL_I915_EXEC_IS_PINNED; + + kgem_init_swizzling(kgem); } /* XXX hopefully a good approximation */ @@ -5797,19 +5832,6 @@ void kgem_bo_set_binding(struct kgem_bo *bo, uint32_t format, uint16_t offset) } } -int kgem_bo_get_swizzling(struct kgem *kgem, struct kgem_bo *bo) -{ - struct drm_i915_gem_get_tiling tiling; - - VG_CLEAR(tiling); - tiling.handle = bo->handle; - if (drmIoctl(kgem->fd, DRM_IOCTL_I915_GEM_GET_TILING, &tiling)) - return 0; - - assert(bo->tiling == tiling.tiling_mode); - return tiling.swizzle_mode; -} - struct kgem_bo * kgem_replace_bo(struct kgem *kgem, struct kgem_bo *src, diff --git a/src/sna/kgem.h b/src/sna/kgem.h index 33a4db08..91a38f75 100644 --- a/src/sna/kgem.h +++ b/src/sna/kgem.h @@ -196,6 +196,12 @@ struct kgem { void (*retire)(struct kgem *kgem); void (*expire)(struct kgem *kgem); + void (*memcpy_to_tiled_x)(const void *src, void *dst, int bpp, + int32_t src_stride, int32_t dst_stride, + int16_t src_x, int16_t src_y, + int16_t dst_x, int16_t dst_y, + uint16_t width, uint16_t height); + uint16_t reloc__self[256]; uint32_t batch[64*1024-8] page_aligned; struct drm_i915_gem_exec_object2 exec[256] page_aligned; @@ -286,7 +292,6 @@ struct kgem_bo *kgem_create_cpu_2d(struct kgem *kgem, uint32_t kgem_bo_get_binding(struct kgem_bo *bo, uint32_t format); void kgem_bo_set_binding(struct kgem_bo *bo, uint32_t format, uint16_t offset); -int kgem_bo_get_swizzling(struct kgem *kgem, struct kgem_bo *bo); bool kgem_retire(struct kgem *kgem); @@ -693,4 +698,21 @@ static inline void __kgem_batch_debug(struct kgem *kgem, uint32_t nbatch) } #endif +static inline void +memcpy_to_tiled_x(struct kgem *kgem, + const void *src, void *dst, int bpp, + int32_t src_stride, int32_t dst_stride, + int16_t src_x, int16_t src_y, + int16_t dst_x, int16_t dst_y, + uint16_t width, uint16_t height) +{ + return kgem->memcpy_to_tiled_x(src, dst, bpp, + src_stride, dst_stride, + src_x, src_y, + dst_x, dst_y, + width, height); +} + +void choose_memcpy_to_tiled_x(struct kgem *kgem, int swizzling); + #endif /* KGEM_H */ diff --git a/src/sna/sna.h b/src/sna/sna.h index da5d8af2..f720c64f 100644 --- a/src/sna/sna.h +++ b/src/sna/sna.h @@ -848,12 +848,7 @@ memcpy_blt(const void *src, void *dst, int bpp, int16_t src_x, int16_t src_y, int16_t dst_x, int16_t dst_y, uint16_t width, uint16_t height); -void -memcpy_to_tiled_x(const void *src, void *dst, int bpp, int swizzling, - int32_t src_stride, int32_t dst_stride, - int16_t src_x, int16_t src_y, - int16_t dst_x, int16_t dst_y, - uint16_t width, uint16_t height); + void memmove_box(const void *src, void *dst, int bpp, int32_t stride, diff --git a/src/sna/sna_accel.c b/src/sna/sna_accel.c index 599cfc11..44b87cde 100644 --- a/src/sna/sna_accel.c +++ b/src/sna/sna_accel.c @@ -3868,15 +3868,7 @@ static inline void box32_add_rect(Box32Rec *box, const xRectangle *r) static bool can_upload_tiled_x(struct kgem *kgem, struct kgem_bo *bo) { -#ifndef __x86_64__ - /* Between a register starved compiler emitting attrocious code - * and the extra overhead in the kernel for managing the tight - * 32-bit address space, unless we have a 64-bit system, - * using memcpy_to_tiled_x() is extremely slow. - */ - return false; -#endif - if (kgem->gen < 050) /* bit17 swizzling :( */ + if (!kgem->memcpy_to_tiled_x) return false; if (bo->tiling != I915_TILING_X) @@ -3896,7 +3888,6 @@ try_upload_tiled_x(PixmapPtr pixmap, RegionRec *region, struct sna_pixmap *priv = sna_pixmap(pixmap); BoxRec *box; uint8_t *dst; - int swizzle; int n; DBG(("%s: bo? %d, can tile? %d\n", __FUNCTION__, @@ -3919,10 +3910,9 @@ try_upload_tiled_x(PixmapPtr pixmap, RegionRec *region, DBG(("%s: upload(%d, %d, %d, %d) x %d\n", __FUNCTION__, x, y, w, h, n)); kgem_bo_sync__cpu(&sna->kgem, priv->gpu_bo); - swizzle = kgem_bo_get_swizzling(&sna->kgem, priv->gpu_bo); do { - memcpy_to_tiled_x(bits, dst, - pixmap->drawable.bitsPerPixel, swizzle, + memcpy_to_tiled_x(&sna->kgem, bits, dst, + pixmap->drawable.bitsPerPixel, stride, priv->gpu_bo->pitch, box->x1 - x, box->y1 - y, box->x1, box->y1, diff --git a/src/sna/sna_io.c b/src/sna/sna_io.c index 1ec1a60f..e51c0335 100644 --- a/src/sna/sna_io.c +++ b/src/sna/sna_io.c @@ -477,16 +477,7 @@ fallback: static bool upload_inplace__tiled(struct kgem *kgem, struct kgem_bo *bo) { -#ifndef __x86_64__ - /* Between a register starved compiler emitting attrocious code - * and the extra overhead in the kernel for managing the tight - * 32-bit address space, unless we have a 64-bit system, - * using memcpy_to_tiled_x() is extremely slow. - */ - return false; -#endif - - if (kgem->gen < 050) /* bit17 swizzling :( */ + if (!kgem->memcpy_to_tiled_x) return false; if (bo->tiling != I915_TILING_X) @@ -505,7 +496,6 @@ write_boxes_inplace__tiled(struct kgem *kgem, const BoxRec *box, int n) { uint8_t *dst; - int swizzle; assert(bo->tiling == I915_TILING_X); @@ -514,9 +504,8 @@ write_boxes_inplace__tiled(struct kgem *kgem, return false; kgem_bo_sync__cpu(kgem, bo); - swizzle = kgem_bo_get_swizzling(kgem, bo); do { - memcpy_to_tiled_x(src, dst, bpp, swizzle, stride, bo->pitch, + memcpy_to_tiled_x(kgem, src, dst, bpp, stride, bo->pitch, box->x1 + src_dx, box->y1 + src_dy, box->x1 + dst_dx, box->y1 + dst_dy, box->x2 - box->x1, box->y2 - box->y1); |