author     Chris Wilson <chris@chris-wilson.co.uk>   2013-02-26 00:02:16 +0000
committer  Chris Wilson <chris@chris-wilson.co.uk>   2013-02-26 00:03:28 +0000
commit     94b95cc2fc63457903a7b7b6eaa09bc27f25750c (patch)
tree       a8bdd1dbb1cb5f1381df4658c4c599fee2a0c5d3
parent     f095678125b25aeae80d838729a7f89d09007e10 (diff)
sna/gen4+: Begin specialising vertex programs for ISA
Allow use of advanced ISA when available by detecting support at
runtime. This initial work just uses GCC to emit varying ISA; future
work could use hand-written code for these hot spots.
Signed-off-by: Chris Wilson <chris@chris-wilson.co.uk>
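
The patch relies on GCC compiling several copies of the same C source for different ISAs via function target attributes, then picking a variant at runtime. Below is a minimal, self-contained sketch of that pattern, not the driver's code: the function names (fill__sse2, fill__avx2, select_isa) are illustrative, and it assumes a GCC new enough for target attributes and __builtin_cpu_supports (4.8+ for the latter).

```c
#include <stdio.h>

/* Same attribute macros as the compiler.h hunk below. */
#if defined(__GNUC__) && (__GNUC__ >= 4)
#define sse2 __attribute__((target("sse2")))
#define avx2 __attribute__((target("avx2,sse4.2,sse2")))
#else
#define sse2
#define avx2
#endif

/* GCC compiles each copy with the ISA named in its target attribute,
 * so the same loop body may be auto-vectorised differently in each. */
sse2 static void fill__sse2(float *v, int n)
{
	for (int i = 0; i < n; i++)
		v[i] = 2.0f * i;
}

avx2 static void fill__avx2(float *v, int n)
{
	for (int i = 0; i < n; i++)
		v[i] = 2.0f * i;
}

/* Runtime selection, done once, mirroring how the emitter tables in the
 * patch are filled from a detected feature mask. */
static void (*fill)(float *, int);

static void select_isa(void)
{
	if (__builtin_cpu_supports("avx2"))
		fill = fill__avx2;
	else
		fill = fill__sse2;
}

int main(void)
{
	float v[8];

	select_isa();
	fill(v, 8);
	printf("%f\n", v[7]);
	return 0;
}
```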
-rw-r--r--  src/sna/compiler.h     |   10
-rw-r--r--  src/sna/gen4_render.c  |    4
-rw-r--r--  src/sna/gen4_vertex.c  | 1129
-rw-r--r--  src/sna/gen4_vertex.h  |    4
-rw-r--r--  src/sna/gen5_render.c  |    4
-rw-r--r--  src/sna/gen6_render.c  |    4
-rw-r--r--  src/sna/gen7_render.c  |    4
7 files changed, 1123 insertions(+), 36 deletions(-)
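
The emitter selection in the diff below branches on bits in sna->cpu_features (AVX2, SSE4_2) that the commit message says are detected at runtime; that detection code is not part of this patch. The following is a hedged sketch, with made-up flag values matching the names used in the diff, of how such a mask could be built with GCC's <cpuid.h>.

```c
#include <cpuid.h>

/* Illustrative flag values; the driver's real definitions may differ. */
#define SSE2   (1u << 0)
#define SSE4_2 (1u << 1)
#define AVX2   (1u << 2)

static unsigned cpu_features_detect(void)
{
	unsigned features = 0;
	unsigned eax, ebx, ecx, edx;

	/* Leaf 1: EDX bit 26 = SSE2, ECX bit 20 = SSE4.2. */
	if (__get_cpuid(1, &eax, &ebx, &ecx, &edx)) {
		if (edx & (1u << 26))
			features |= SSE2;
		if (ecx & (1u << 20))
			features |= SSE4_2;
	}

	/* Leaf 7, subleaf 0: EBX bit 5 = AVX2.  A complete check would
	 * also confirm OS support for the AVX state via XGETBV. */
	if (__get_cpuid_max(0, NULL) >= 7) {
		__cpuid_count(7, 0, eax, ebx, ecx, edx);
		if (ebx & (1u << 5))
			features |= AVX2;
	}

	return features;
}
```

A driver would compute this mask once at initialisation and stash it in its device structure, so the per-operation choosers (as in gen4_choose_composite_emitter below) only test a couple of bits.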
diff --git a/src/sna/compiler.h b/src/sna/compiler.h index 23ec31c3..fe2e3214 100644 --- a/src/sna/compiler.h +++ b/src/sna/compiler.h @@ -52,6 +52,16 @@ #define flatten #endif +#if defined(__GNUC__) && (__GNUC__ >= 4) /* 4.4 */ +#define sse2 __attribute__((target("sse2"))) +#define sse4_2 __attribute__((target("sse4.2,sse2"))) +#define avx2 __attribute__((target("avx2,sse4.2,sse2"))) +#else +#define sse2 +#define sse4_2 +#define avx2 +#endif + #ifdef HAVE_VALGRIND #define VG(x) x #else diff --git a/src/sna/gen4_render.c b/src/sna/gen4_render.c index d08d762b..92802469 100644 --- a/src/sna/gen4_render.c +++ b/src/sna/gen4_render.c @@ -1945,7 +1945,7 @@ gen4_render_composite(struct sna *sna, tmp->mask.bo != NULL, tmp->has_component_alpha, tmp->is_affine); - tmp->u.gen4.ve_id = gen4_choose_composite_emitter(tmp); + tmp->u.gen4.ve_id = gen4_choose_composite_emitter(sna, tmp); tmp->blt = gen4_render_composite_blt; tmp->box = gen4_render_composite_box; @@ -2186,7 +2186,7 @@ gen4_render_composite_spans(struct sna *sna, tmp->base.has_component_alpha = false; tmp->base.need_magic_ca_pass = false; - tmp->base.u.gen4.ve_id = gen4_choose_spans_emitter(tmp); + tmp->base.u.gen4.ve_id = gen4_choose_spans_emitter(sna, tmp); tmp->base.u.gen4.wm_kernel = WM_KERNEL_OPACITY | !tmp->base.is_affine; tmp->box = gen4_render_composite_spans_box; diff --git a/src/sna/gen4_vertex.c b/src/sna/gen4_vertex.c index 5062ebdf..20f85b37 100644 --- a/src/sna/gen4_vertex.c +++ b/src/sna/gen4_vertex.c @@ -272,10 +272,10 @@ emit_texcoord(struct sna *sna, inline static void emit_vertex(struct sna *sna, - const struct sna_composite_op *op, - int16_t srcX, int16_t srcY, - int16_t mskX, int16_t mskY, - int16_t dstX, int16_t dstY) + const struct sna_composite_op *op, + int16_t srcX, int16_t srcY, + int16_t mskX, int16_t mskY, + int16_t dstX, int16_t dstY) { OUT_VERTEX(dstX, dstY); emit_texcoord(sna, &op->src, srcX, srcY); @@ -414,6 +414,66 @@ emit_primitive_linear(struct sna *sna, v[5] = compute_linear(&op->src, r->src.x, r->src.y); } +sse4_2 fastcall static void +emit_primitive_linear__sse4_2(struct sna *sna, + const struct sna_composite_op *op, + const struct sna_composite_rectangles *r) +{ + float *v; + union { + struct sna_coordinate p; + float f; + } dst; + + assert(op->floats_per_rect == 6); + assert((sna->render.vertex_used % 2) == 0); + v = sna->render.vertices + sna->render.vertex_used; + sna->render.vertex_used += 6; + assert(sna->render.vertex_used <= sna->render.vertex_size); + + dst.p.x = r->dst.x + r->width; + dst.p.y = r->dst.y + r->height; + v[0] = dst.f; + dst.p.x = r->dst.x; + v[2] = dst.f; + dst.p.y = r->dst.y; + v[4] = dst.f; + + v[1] = compute_linear(&op->src, r->src.x+r->width, r->src.y+r->height); + v[3] = compute_linear(&op->src, r->src.x, r->src.y+r->height); + v[5] = compute_linear(&op->src, r->src.x, r->src.y); +} + +avx2 fastcall static void +emit_primitive_linear__avx2(struct sna *sna, + const struct sna_composite_op *op, + const struct sna_composite_rectangles *r) +{ + float *v; + union { + struct sna_coordinate p; + float f; + } dst; + + assert(op->floats_per_rect == 6); + assert((sna->render.vertex_used % 2) == 0); + v = sna->render.vertices + sna->render.vertex_used; + sna->render.vertex_used += 6; + assert(sna->render.vertex_used <= sna->render.vertex_size); + + dst.p.x = r->dst.x + r->width; + dst.p.y = r->dst.y + r->height; + v[0] = dst.f; + dst.p.x = r->dst.x; + v[2] = dst.f; + dst.p.y = r->dst.y; + v[4] = dst.f; + + v[1] = compute_linear(&op->src, r->src.x+r->width, r->src.y+r->height); + v[3] 
= compute_linear(&op->src, r->src.x, r->src.y+r->height); + v[5] = compute_linear(&op->src, r->src.x, r->src.y); +} + fastcall static void emit_boxes_linear(const struct sna_composite_op *op, const BoxRec *box, int nbox, @@ -442,6 +502,62 @@ emit_boxes_linear(const struct sna_composite_op *op, } while (--nbox); } +sse4_2 fastcall static void +emit_boxes_linear__sse4_2(const struct sna_composite_op *op, + const BoxRec *box, int nbox, + float *v) +{ + union { + struct sna_coordinate p; + float f; + } dst; + + do { + dst.p.x = box->x2; + dst.p.y = box->y2; + v[0] = dst.f; + dst.p.x = box->x1; + v[2] = dst.f; + dst.p.y = box->y1; + v[4] = dst.f; + + v[1] = compute_linear(&op->src, box->x2, box->y2); + v[3] = compute_linear(&op->src, box->x1, box->y2); + v[5] = compute_linear(&op->src, box->x1, box->y1); + + v += 6; + box++; + } while (--nbox); +} + +avx2 fastcall static void +emit_boxes_linear__avx2(const struct sna_composite_op *op, + const BoxRec *box, int nbox, + float *v) +{ + union { + struct sna_coordinate p; + float f; + } dst; + + do { + dst.p.x = box->x2; + dst.p.y = box->y2; + v[0] = dst.f; + dst.p.x = box->x1; + v[2] = dst.f; + dst.p.y = box->y1; + v[4] = dst.f; + + v[1] = compute_linear(&op->src, box->x2, box->y2); + v[3] = compute_linear(&op->src, box->x1, box->y2); + v[5] = compute_linear(&op->src, box->x1, box->y1); + + v += 6; + box++; + } while (--nbox); +} + fastcall static void emit_primitive_identity_source(struct sna *sna, const struct sna_composite_op *op, @@ -473,6 +589,68 @@ emit_primitive_identity_source(struct sna *sna, v[5] = v[2] = v[8] + r->height * op->src.scale[1]; } +sse4_2 fastcall static void +emit_primitive_identity_source__sse4_2(struct sna *sna, + const struct sna_composite_op *op, + const struct sna_composite_rectangles *r) +{ + union { + struct sna_coordinate p; + float f; + } dst; + float *v; + + assert(op->floats_per_rect == 9); + assert((sna->render.vertex_used % 3) == 0); + v = sna->render.vertices + sna->render.vertex_used; + sna->render.vertex_used += 9; + + dst.p.x = r->dst.x + r->width; + dst.p.y = r->dst.y + r->height; + v[0] = dst.f; + dst.p.x = r->dst.x; + v[3] = dst.f; + dst.p.y = r->dst.y; + v[6] = dst.f; + + v[7] = v[4] = (r->src.x + op->src.offset[0]) * op->src.scale[0]; + v[1] = v[4] + r->width * op->src.scale[0]; + + v[8] = (r->src.y + op->src.offset[1]) * op->src.scale[1]; + v[5] = v[2] = v[8] + r->height * op->src.scale[1]; +} + +avx2 fastcall static void +emit_primitive_identity_source__avx2(struct sna *sna, + const struct sna_composite_op *op, + const struct sna_composite_rectangles *r) +{ + union { + struct sna_coordinate p; + float f; + } dst; + float *v; + + assert(op->floats_per_rect == 9); + assert((sna->render.vertex_used % 3) == 0); + v = sna->render.vertices + sna->render.vertex_used; + sna->render.vertex_used += 9; + + dst.p.x = r->dst.x + r->width; + dst.p.y = r->dst.y + r->height; + v[0] = dst.f; + dst.p.x = r->dst.x; + v[3] = dst.f; + dst.p.y = r->dst.y; + v[6] = dst.f; + + v[7] = v[4] = (r->src.x + op->src.offset[0]) * op->src.scale[0]; + v[1] = v[4] + r->width * op->src.scale[0]; + + v[8] = (r->src.y + op->src.offset[1]) * op->src.scale[1]; + v[5] = v[2] = v[8] + r->height * op->src.scale[1]; +} + fastcall static void emit_boxes_identity_source(const struct sna_composite_op *op, const BoxRec *box, int nbox, @@ -503,6 +681,66 @@ emit_boxes_identity_source(const struct sna_composite_op *op, } while (--nbox); } +sse4_2 fastcall static void +emit_boxes_identity_source__sse4_2(const struct sna_composite_op *op, + const BoxRec 
*box, int nbox, + float *v) +{ + do { + union { + struct sna_coordinate p; + float f; + } dst; + + dst.p.x = box->x2; + dst.p.y = box->y2; + v[0] = dst.f; + dst.p.x = box->x1; + v[3] = dst.f; + dst.p.y = box->y1; + v[6] = dst.f; + + v[7] = v[4] = (box->x1 + op->src.offset[0]) * op->src.scale[0]; + v[1] = (box->x2 + op->src.offset[0]) * op->src.scale[0]; + + v[8] = (box->y1 + op->src.offset[1]) * op->src.scale[1]; + v[2] = v[5] = (box->y2 + op->src.offset[1]) * op->src.scale[1]; + + v += 9; + box++; + } while (--nbox); +} + +avx2 fastcall static void +emit_boxes_identity_source__avx2(const struct sna_composite_op *op, + const BoxRec *box, int nbox, + float *v) +{ + do { + union { + struct sna_coordinate p; + float f; + } dst; + + dst.p.x = box->x2; + dst.p.y = box->y2; + v[0] = dst.f; + dst.p.x = box->x1; + v[3] = dst.f; + dst.p.y = box->y1; + v[6] = dst.f; + + v[7] = v[4] = (box->x1 + op->src.offset[0]) * op->src.scale[0]; + v[1] = (box->x2 + op->src.offset[0]) * op->src.scale[0]; + + v[8] = (box->y1 + op->src.offset[1]) * op->src.scale[1]; + v[2] = v[5] = (box->y2 + op->src.offset[1]) * op->src.scale[1]; + + v += 9; + box++; + } while (--nbox); +} + fastcall static void emit_primitive_simple_source(struct sna *sna, const struct sna_composite_op *op, @@ -543,6 +781,86 @@ emit_primitive_simple_source(struct sna *sna, v[8] = ((r->src.y + ty) * yy + y0) * sy; } +sse4_2 fastcall static void +emit_primitive_simple_source__sse4_2(struct sna *sna, + const struct sna_composite_op *op, + const struct sna_composite_rectangles *r) +{ + float *v; + union { + struct sna_coordinate p; + float f; + } dst; + + float xx = op->src.transform->matrix[0][0]; + float x0 = op->src.transform->matrix[0][2]; + float yy = op->src.transform->matrix[1][1]; + float y0 = op->src.transform->matrix[1][2]; + float sx = op->src.scale[0]; + float sy = op->src.scale[1]; + int16_t tx = op->src.offset[0]; + int16_t ty = op->src.offset[1]; + + assert(op->floats_per_rect == 9); + assert((sna->render.vertex_used % 3) == 0); + v = sna->render.vertices + sna->render.vertex_used; + sna->render.vertex_used += 3*3; + + dst.p.x = r->dst.x + r->width; + dst.p.y = r->dst.y + r->height; + v[0] = dst.f; + v[1] = ((r->src.x + r->width + tx) * xx + x0) * sx; + v[5] = v[2] = ((r->src.y + r->height + ty) * yy + y0) * sy; + + dst.p.x = r->dst.x; + v[3] = dst.f; + v[7] = v[4] = ((r->src.x + tx) * xx + x0) * sx; + + dst.p.y = r->dst.y; + v[6] = dst.f; + v[8] = ((r->src.y + ty) * yy + y0) * sy; +} + +avx2 fastcall static void +emit_primitive_simple_source__avx2(struct sna *sna, + const struct sna_composite_op *op, + const struct sna_composite_rectangles *r) +{ + float *v; + union { + struct sna_coordinate p; + float f; + } dst; + + float xx = op->src.transform->matrix[0][0]; + float x0 = op->src.transform->matrix[0][2]; + float yy = op->src.transform->matrix[1][1]; + float y0 = op->src.transform->matrix[1][2]; + float sx = op->src.scale[0]; + float sy = op->src.scale[1]; + int16_t tx = op->src.offset[0]; + int16_t ty = op->src.offset[1]; + + assert(op->floats_per_rect == 9); + assert((sna->render.vertex_used % 3) == 0); + v = sna->render.vertices + sna->render.vertex_used; + sna->render.vertex_used += 3*3; + + dst.p.x = r->dst.x + r->width; + dst.p.y = r->dst.y + r->height; + v[0] = dst.f; + v[1] = ((r->src.x + r->width + tx) * xx + x0) * sx; + v[5] = v[2] = ((r->src.y + r->height + ty) * yy + y0) * sy; + + dst.p.x = r->dst.x; + v[3] = dst.f; + v[7] = v[4] = ((r->src.x + tx) * xx + x0) * sx; + + dst.p.y = r->dst.y; + v[6] = dst.f; + v[8] = 
((r->src.y + ty) * yy + y0) * sy; +} + fastcall static void emit_boxes_simple_source(const struct sna_composite_op *op, const BoxRec *box, int nbox, @@ -582,6 +900,84 @@ emit_boxes_simple_source(const struct sna_composite_op *op, } while (--nbox); } +sse4_2 fastcall static void +emit_boxes_simple_source__sse4_2(const struct sna_composite_op *op, + const BoxRec *box, int nbox, + float *v) +{ + float xx = op->src.transform->matrix[0][0]; + float x0 = op->src.transform->matrix[0][2]; + float yy = op->src.transform->matrix[1][1]; + float y0 = op->src.transform->matrix[1][2]; + float sx = op->src.scale[0]; + float sy = op->src.scale[1]; + int16_t tx = op->src.offset[0]; + int16_t ty = op->src.offset[1]; + + do { + union { + struct sna_coordinate p; + float f; + } dst; + + dst.p.x = box->x2; + dst.p.y = box->y2; + v[0] = dst.f; + v[1] = ((box->x2 + tx) * xx + x0) * sx; + v[5] = v[2] = ((box->y2 + ty) * yy + y0) * sy; + + dst.p.x = box->x1; + v[3] = dst.f; + v[7] = v[4] = ((box->x1 + tx) * xx + x0) * sx; + + dst.p.y = box->y1; + v[6] = dst.f; + v[8] = ((box->y1 + ty) * yy + y0) * sy; + + v += 9; + box++; + } while (--nbox); +} + +avx2 fastcall static void +emit_boxes_simple_source__avx2(const struct sna_composite_op *op, + const BoxRec *box, int nbox, + float *v) +{ + float xx = op->src.transform->matrix[0][0]; + float x0 = op->src.transform->matrix[0][2]; + float yy = op->src.transform->matrix[1][1]; + float y0 = op->src.transform->matrix[1][2]; + float sx = op->src.scale[0]; + float sy = op->src.scale[1]; + int16_t tx = op->src.offset[0]; + int16_t ty = op->src.offset[1]; + + do { + union { + struct sna_coordinate p; + float f; + } dst; + + dst.p.x = box->x2; + dst.p.y = box->y2; + v[0] = dst.f; + v[1] = ((box->x2 + tx) * xx + x0) * sx; + v[5] = v[2] = ((box->y2 + ty) * yy + y0) * sy; + + dst.p.x = box->x1; + v[3] = dst.f; + v[7] = v[4] = ((box->x1 + tx) * xx + x0) * sx; + + dst.p.y = box->y1; + v[6] = dst.f; + v[8] = ((box->y1 + ty) * yy + y0) * sy; + + v += 9; + box++; + } while (--nbox); +} + fastcall static void emit_primitive_affine_source(struct sna *sna, const struct sna_composite_op *op, @@ -981,7 +1377,7 @@ emit_composite_texcoord_affine(struct sna *sna, } -unsigned gen4_choose_composite_emitter(struct sna_composite_op *tmp) +unsigned gen4_choose_composite_emitter(struct sna *sna, struct sna_composite_op *tmp) { unsigned vb; @@ -1060,14 +1456,30 @@ unsigned gen4_choose_composite_emitter(struct sna_composite_op *tmp) vb = 1; } else if (tmp->src.is_linear) { DBG(("%s: linear, no mask\n", __FUNCTION__)); - tmp->prim_emit = emit_primitive_linear; - tmp->emit_boxes = emit_boxes_linear; + if (sna->cpu_features & AVX2) { + tmp->prim_emit = emit_primitive_linear__avx2; + tmp->emit_boxes = emit_boxes_linear__avx2; + } else if (sna->cpu_features & SSE4_2) { + tmp->prim_emit = emit_primitive_linear__sse4_2; + tmp->emit_boxes = emit_boxes_linear__sse4_2; + } else { + tmp->prim_emit = emit_primitive_linear; + tmp->emit_boxes = emit_boxes_linear; + } tmp->floats_per_vertex = 2; vb = 1; } else if (tmp->src.transform == NULL) { DBG(("%s: identity src, no mask\n", __FUNCTION__)); - tmp->prim_emit = emit_primitive_identity_source; - tmp->emit_boxes = emit_boxes_identity_source; + if (sna->cpu_features & AVX2) { + tmp->prim_emit = emit_primitive_identity_source__avx2; + tmp->emit_boxes = emit_boxes_identity_source__avx2; + } else if (sna->cpu_features & SSE4_2) { + tmp->prim_emit = emit_primitive_identity_source__sse4_2; + tmp->emit_boxes = emit_boxes_identity_source__sse4_2; + } else { + tmp->prim_emit 
= emit_primitive_identity_source; + tmp->emit_boxes = emit_boxes_identity_source; + } tmp->floats_per_vertex = 3; vb = 2; } else if (tmp->src.is_affine) { @@ -1075,8 +1487,16 @@ unsigned gen4_choose_composite_emitter(struct sna_composite_op *tmp) tmp->src.scale[1] /= tmp->src.transform->matrix[2][2]; if (!sna_affine_transform_is_rotation(tmp->src.transform)) { DBG(("%s: simple src, no mask\n", __FUNCTION__)); - tmp->prim_emit = emit_primitive_simple_source; - tmp->emit_boxes = emit_boxes_simple_source; + if (sna->cpu_features & AVX2) { + tmp->prim_emit = emit_primitive_simple_source__avx2; + tmp->emit_boxes = emit_boxes_simple_source__avx2; + } else if (sna->cpu_features & SSE4_2) { + tmp->prim_emit = emit_primitive_simple_source__sse4_2; + tmp->emit_boxes = emit_boxes_simple_source__sse4_2; + } else { + tmp->prim_emit = emit_primitive_simple_source; + tmp->emit_boxes = emit_boxes_simple_source; + } } else { DBG(("%s: affine src, no mask\n", __FUNCTION__)); tmp->prim_emit = emit_primitive_affine_source; @@ -1222,6 +1642,86 @@ emit_span_identity(struct sna *sna, v[11] = v[7] = v[3] = opacity; } +sse4_2 fastcall static void +emit_span_identity__sse4_2(struct sna *sna, + const struct sna_composite_spans_op *op, + const BoxRec *box, + float opacity) +{ + float *v; + union { + struct sna_coordinate p; + float f; + } dst; + + float sx = op->base.src.scale[0]; + float sy = op->base.src.scale[1]; + int16_t tx = op->base.src.offset[0]; + int16_t ty = op->base.src.offset[1]; + + assert(op->base.floats_per_rect == 12); + assert((sna->render.vertex_used % 4) == 0); + v = sna->render.vertices + sna->render.vertex_used; + sna->render.vertex_used += 3*4; + assert(sna->render.vertex_used <= sna->render.vertex_size); + + dst.p.x = box->x2; + dst.p.y = box->y2; + v[0] = dst.f; + v[1] = (box->x2 + tx) * sx; + v[6] = v[2] = (box->y2 + ty) * sy; + + dst.p.x = box->x1; + v[4] = dst.f; + v[9] = v[5] = (box->x1 + tx) * sx; + + dst.p.y = box->y1; + v[8] = dst.f; + v[10] = (box->y1 + ty) * sy; + + v[11] = v[7] = v[3] = opacity; +} + +avx2 fastcall static void +emit_span_identity__avx2(struct sna *sna, + const struct sna_composite_spans_op *op, + const BoxRec *box, + float opacity) +{ + float *v; + union { + struct sna_coordinate p; + float f; + } dst; + + float sx = op->base.src.scale[0]; + float sy = op->base.src.scale[1]; + int16_t tx = op->base.src.offset[0]; + int16_t ty = op->base.src.offset[1]; + + assert(op->base.floats_per_rect == 12); + assert((sna->render.vertex_used % 4) == 0); + v = sna->render.vertices + sna->render.vertex_used; + sna->render.vertex_used += 3*4; + assert(sna->render.vertex_used <= sna->render.vertex_size); + + dst.p.x = box->x2; + dst.p.y = box->y2; + v[0] = dst.f; + v[1] = (box->x2 + tx) * sx; + v[6] = v[2] = (box->y2 + ty) * sy; + + dst.p.x = box->x1; + v[4] = dst.f; + v[9] = v[5] = (box->x1 + tx) * sx; + + dst.p.y = box->y1; + v[8] = dst.f; + v[10] = (box->y1 + ty) * sy; + + v[11] = v[7] = v[3] = opacity; +} + fastcall static void emit_span_boxes_identity(const struct sna_composite_spans_op *op, const struct sna_opacity_box *b, int nbox, @@ -1259,11 +1759,173 @@ emit_span_boxes_identity(const struct sna_composite_spans_op *op, } while (--nbox); } +sse4_2 fastcall static void +emit_span_boxes_identity__sse4_2(const struct sna_composite_spans_op *op, + const struct sna_opacity_box *b, int nbox, + float *v) +{ + do { + union { + struct sna_coordinate p; + float f; + } dst; + + float sx = op->base.src.scale[0]; + float sy = op->base.src.scale[1]; + int16_t tx = op->base.src.offset[0]; + 
int16_t ty = op->base.src.offset[1]; + + dst.p.x = b->box.x2; + dst.p.y = b->box.y2; + v[0] = dst.f; + v[1] = (b->box.x2 + tx) * sx; + v[6] = v[2] = (b->box.y2 + ty) * sy; + + dst.p.x = b->box.x1; + v[4] = dst.f; + v[9] = v[5] = (b->box.x1 + tx) * sx; + + dst.p.y = b->box.y1; + v[8] = dst.f; + v[10] = (b->box.y1 + ty) * sy; + + v[11] = v[7] = v[3] = b->alpha; + + v += 12; + b++; + } while (--nbox); +} + +avx2 fastcall static void +emit_span_boxes_identity__avx2(const struct sna_composite_spans_op *op, + const struct sna_opacity_box *b, int nbox, + float *v) +{ + do { + union { + struct sna_coordinate p; + float f; + } dst; + + float sx = op->base.src.scale[0]; + float sy = op->base.src.scale[1]; + int16_t tx = op->base.src.offset[0]; + int16_t ty = op->base.src.offset[1]; + + dst.p.x = b->box.x2; + dst.p.y = b->box.y2; + v[0] = dst.f; + v[1] = (b->box.x2 + tx) * sx; + v[6] = v[2] = (b->box.y2 + ty) * sy; + + dst.p.x = b->box.x1; + v[4] = dst.f; + v[9] = v[5] = (b->box.x1 + tx) * sx; + + dst.p.y = b->box.y1; + v[8] = dst.f; + v[10] = (b->box.y1 + ty) * sy; + + v[11] = v[7] = v[3] = b->alpha; + + v += 12; + b++; + } while (--nbox); +} + fastcall static void emit_span_simple(struct sna *sna, - const struct sna_composite_spans_op *op, - const BoxRec *box, - float opacity) + const struct sna_composite_spans_op *op, + const BoxRec *box, + float opacity) +{ + float *v; + union { + struct sna_coordinate p; + float f; + } dst; + + float xx = op->base.src.transform->matrix[0][0]; + float x0 = op->base.src.transform->matrix[0][2]; + float yy = op->base.src.transform->matrix[1][1]; + float y0 = op->base.src.transform->matrix[1][2]; + float sx = op->base.src.scale[0]; + float sy = op->base.src.scale[1]; + int16_t tx = op->base.src.offset[0]; + int16_t ty = op->base.src.offset[1]; + + assert(op->base.floats_per_rect == 12); + assert((sna->render.vertex_used % 4) == 0); + v = sna->render.vertices + sna->render.vertex_used; + sna->render.vertex_used += 3*4; + assert(sna->render.vertex_used <= sna->render.vertex_size); + + dst.p.x = box->x2; + dst.p.y = box->y2; + v[0] = dst.f; + v[1] = ((box->x2 + tx) * xx + x0) * sx; + v[6] = v[2] = ((box->y2 + ty) * yy + y0) * sy; + + dst.p.x = box->x1; + v[4] = dst.f; + v[9] = v[5] = ((box->x1 + tx) * xx + x0) * sx; + + dst.p.y = box->y1; + v[8] = dst.f; + v[10] = ((box->y1 + ty) * yy + y0) * sy; + + v[11] = v[7] = v[3] = opacity; +} + +sse4_2 fastcall static void +emit_span_simple__sse4_2(struct sna *sna, + const struct sna_composite_spans_op *op, + const BoxRec *box, + float opacity) +{ + float *v; + union { + struct sna_coordinate p; + float f; + } dst; + + float xx = op->base.src.transform->matrix[0][0]; + float x0 = op->base.src.transform->matrix[0][2]; + float yy = op->base.src.transform->matrix[1][1]; + float y0 = op->base.src.transform->matrix[1][2]; + float sx = op->base.src.scale[0]; + float sy = op->base.src.scale[1]; + int16_t tx = op->base.src.offset[0]; + int16_t ty = op->base.src.offset[1]; + + assert(op->base.floats_per_rect == 12); + assert((sna->render.vertex_used % 4) == 0); + v = sna->render.vertices + sna->render.vertex_used; + sna->render.vertex_used += 3*4; + assert(sna->render.vertex_used <= sna->render.vertex_size); + + dst.p.x = box->x2; + dst.p.y = box->y2; + v[0] = dst.f; + v[1] = ((box->x2 + tx) * xx + x0) * sx; + v[6] = v[2] = ((box->y2 + ty) * yy + y0) * sy; + + dst.p.x = box->x1; + v[4] = dst.f; + v[9] = v[5] = ((box->x1 + tx) * xx + x0) * sx; + + dst.p.y = box->y1; + v[8] = dst.f; + v[10] = ((box->y1 + ty) * yy + y0) * sy; + + v[11] = 
v[7] = v[3] = opacity; +} + +avx2 fastcall static void +emit_span_simple__avx2(struct sna *sna, + const struct sna_composite_spans_op *op, + const BoxRec *box, + float opacity) { float *v; union { @@ -1344,6 +2006,88 @@ emit_span_boxes_simple(const struct sna_composite_spans_op *op, } while (--nbox); } +sse4_2 fastcall static void +emit_span_boxes_simple__sse4_2(const struct sna_composite_spans_op *op, + const struct sna_opacity_box *b, int nbox, + float *v) +{ + float xx = op->base.src.transform->matrix[0][0]; + float x0 = op->base.src.transform->matrix[0][2]; + float yy = op->base.src.transform->matrix[1][1]; + float y0 = op->base.src.transform->matrix[1][2]; + float sx = op->base.src.scale[0]; + float sy = op->base.src.scale[1]; + int16_t tx = op->base.src.offset[0]; + int16_t ty = op->base.src.offset[1]; + + do { + union { + struct sna_coordinate p; + float f; + } dst; + + dst.p.x = b->box.x2; + dst.p.y = b->box.y2; + v[0] = dst.f; + v[1] = ((b->box.x2 + tx) * xx + x0) * sx; + v[6] = v[2] = ((b->box.y2 + ty) * yy + y0) * sy; + + dst.p.x = b->box.x1; + v[4] = dst.f; + v[9] = v[5] = ((b->box.x1 + tx) * xx + x0) * sx; + + dst.p.y = b->box.y1; + v[8] = dst.f; + v[10] = ((b->box.y1 + ty) * yy + y0) * sy; + + v[11] = v[7] = v[3] = b->alpha; + + v += 12; + b++; + } while (--nbox); +} + +avx2 fastcall static void +emit_span_boxes_simple__avx2(const struct sna_composite_spans_op *op, + const struct sna_opacity_box *b, int nbox, + float *v) +{ + float xx = op->base.src.transform->matrix[0][0]; + float x0 = op->base.src.transform->matrix[0][2]; + float yy = op->base.src.transform->matrix[1][1]; + float y0 = op->base.src.transform->matrix[1][2]; + float sx = op->base.src.scale[0]; + float sy = op->base.src.scale[1]; + int16_t tx = op->base.src.offset[0]; + int16_t ty = op->base.src.offset[1]; + + do { + union { + struct sna_coordinate p; + float f; + } dst; + + dst.p.x = b->box.x2; + dst.p.y = b->box.y2; + v[0] = dst.f; + v[1] = ((b->box.x2 + tx) * xx + x0) * sx; + v[6] = v[2] = ((b->box.y2 + ty) * yy + y0) * sy; + + dst.p.x = b->box.x1; + v[4] = dst.f; + v[9] = v[5] = ((b->box.x1 + tx) * xx + x0) * sx; + + dst.p.y = b->box.y1; + v[8] = dst.f; + v[10] = ((b->box.y1 + ty) * yy + y0) * sy; + + v[11] = v[7] = v[3] = b->alpha; + + v += 12; + b++; + } while (--nbox); +} + fastcall static void emit_span_affine(struct sna *sna, const struct sna_composite_spans_op *op, @@ -1389,6 +2133,96 @@ emit_span_affine(struct sna *sna, v[11] = v[7] = v[3] = opacity; } +sse4_2 fastcall static void +emit_span_affine__sse4_2(struct sna *sna, + const struct sna_composite_spans_op *op, + const BoxRec *box, + float opacity) +{ + union { + struct sna_coordinate p; + float f; + } dst; + float *v; + + assert(op->base.floats_per_rect == 12); + assert((sna->render.vertex_used % 4) == 0); + v = sna->render.vertices + sna->render.vertex_used; + sna->render.vertex_used += 12; + + dst.p.x = box->x2; + dst.p.y = box->y2; + v[0] = dst.f; + _sna_get_transformed_scaled(op->base.src.offset[0] + box->x2, + op->base.src.offset[1] + box->y2, + op->base.src.transform, + op->base.src.scale, + &v[1], &v[2]); + + dst.p.x = box->x1; + v[4] = dst.f; + _sna_get_transformed_scaled(op->base.src.offset[0] + box->x1, + op->base.src.offset[1] + box->y2, + op->base.src.transform, + op->base.src.scale, + &v[5], &v[6]); + + dst.p.y = box->y1; + v[8] = dst.f; + _sna_get_transformed_scaled(op->base.src.offset[0] + box->x1, + op->base.src.offset[1] + box->y1, + op->base.src.transform, + op->base.src.scale, + &v[9], &v[10]); + + v[11] = v[7] = v[3] = 
opacity; +} + +avx2 fastcall static void +emit_span_affine__avx2(struct sna *sna, + const struct sna_composite_spans_op *op, + const BoxRec *box, + float opacity) +{ + union { + struct sna_coordinate p; + float f; + } dst; + float *v; + + assert(op->base.floats_per_rect == 12); + assert((sna->render.vertex_used % 4) == 0); + v = sna->render.vertices + sna->render.vertex_used; + sna->render.vertex_used += 12; + + dst.p.x = box->x2; + dst.p.y = box->y2; + v[0] = dst.f; + _sna_get_transformed_scaled(op->base.src.offset[0] + box->x2, + op->base.src.offset[1] + box->y2, + op->base.src.transform, + op->base.src.scale, + &v[1], &v[2]); + + dst.p.x = box->x1; + v[4] = dst.f; + _sna_get_transformed_scaled(op->base.src.offset[0] + box->x1, + op->base.src.offset[1] + box->y2, + op->base.src.transform, + op->base.src.scale, + &v[5], &v[6]); + + dst.p.y = box->y1; + v[8] = dst.f; + _sna_get_transformed_scaled(op->base.src.offset[0] + box->x1, + op->base.src.offset[1] + box->y1, + op->base.src.transform, + op->base.src.scale, + &v[9], &v[10]); + + v[11] = v[7] = v[3] = opacity; +} + fastcall static void emit_span_boxes_affine(const struct sna_composite_spans_op *op, const struct sna_opacity_box *b, int nbox, @@ -1432,11 +2266,161 @@ emit_span_boxes_affine(const struct sna_composite_spans_op *op, } while (--nbox); } +sse4_2 fastcall static void +emit_span_boxes_affine__sse4_2(const struct sna_composite_spans_op *op, + const struct sna_opacity_box *b, int nbox, + float *v) +{ + do { + union { + struct sna_coordinate p; + float f; + } dst; + + dst.p.x = b->box.x2; + dst.p.y = b->box.y2; + v[0] = dst.f; + _sna_get_transformed_scaled(op->base.src.offset[0] + b->box.x2, + op->base.src.offset[1] + b->box.y2, + op->base.src.transform, + op->base.src.scale, + &v[1], &v[2]); + + dst.p.x = b->box.x1; + v[4] = dst.f; + _sna_get_transformed_scaled(op->base.src.offset[0] + b->box.x1, + op->base.src.offset[1] + b->box.y2, + op->base.src.transform, + op->base.src.scale, + &v[5], &v[6]); + + dst.p.y = b->box.y1; + v[8] = dst.f; + _sna_get_transformed_scaled(op->base.src.offset[0] + b->box.x1, + op->base.src.offset[1] + b->box.y1, + op->base.src.transform, + op->base.src.scale, + &v[9], &v[10]); + + v[11] = v[7] = v[3] = b->alpha; + + v += 12; + b++; + } while (--nbox); +} + +avx2 fastcall static void +emit_span_boxes_affine__avx2(const struct sna_composite_spans_op *op, + const struct sna_opacity_box *b, int nbox, + float *v) +{ + do { + union { + struct sna_coordinate p; + float f; + } dst; + + dst.p.x = b->box.x2; + dst.p.y = b->box.y2; + v[0] = dst.f; + _sna_get_transformed_scaled(op->base.src.offset[0] + b->box.x2, + op->base.src.offset[1] + b->box.y2, + op->base.src.transform, + op->base.src.scale, + &v[1], &v[2]); + + dst.p.x = b->box.x1; + v[4] = dst.f; + _sna_get_transformed_scaled(op->base.src.offset[0] + b->box.x1, + op->base.src.offset[1] + b->box.y2, + op->base.src.transform, + op->base.src.scale, + &v[5], &v[6]); + + dst.p.y = b->box.y1; + v[8] = dst.f; + _sna_get_transformed_scaled(op->base.src.offset[0] + b->box.x1, + op->base.src.offset[1] + b->box.y1, + op->base.src.transform, + op->base.src.scale, + &v[9], &v[10]); + + v[11] = v[7] = v[3] = b->alpha; + + v += 12; + b++; + } while (--nbox); +} + fastcall static void emit_span_linear(struct sna *sna, - const struct sna_composite_spans_op *op, - const BoxRec *box, - float opacity) + const struct sna_composite_spans_op *op, + const BoxRec *box, + float opacity) +{ + union { + struct sna_coordinate p; + float f; + } dst; + float *v; + + 
assert(op->base.floats_per_rect == 9); + assert((sna->render.vertex_used % 3) == 0); + v = sna->render.vertices + sna->render.vertex_used; + sna->render.vertex_used += 9; + + dst.p.x = box->x2; + dst.p.y = box->y2; + v[0] = dst.f; + dst.p.x = box->x1; + v[3] = dst.f; + dst.p.y = box->y1; + v[6] = dst.f; + + v[1] = compute_linear(&op->base.src, box->x2, box->y2); + v[4] = compute_linear(&op->base.src, box->x1, box->y2); + v[7] = compute_linear(&op->base.src, box->x1, box->y1); + + v[8] = v[5] = v[2] = opacity; +} + +sse4_2 fastcall static void +emit_span_linear__sse4_2(struct sna *sna, + const struct sna_composite_spans_op *op, + const BoxRec *box, + float opacity) +{ + union { + struct sna_coordinate p; + float f; + } dst; + float *v; + + assert(op->base.floats_per_rect == 9); + assert((sna->render.vertex_used % 3) == 0); + v = sna->render.vertices + sna->render.vertex_used; + sna->render.vertex_used += 9; + + dst.p.x = box->x2; + dst.p.y = box->y2; + v[0] = dst.f; + dst.p.x = box->x1; + v[3] = dst.f; + dst.p.y = box->y1; + v[6] = dst.f; + + v[1] = compute_linear(&op->base.src, box->x2, box->y2); + v[4] = compute_linear(&op->base.src, box->x1, box->y2); + v[7] = compute_linear(&op->base.src, box->x1, box->y1); + + v[8] = v[5] = v[2] = opacity; +} + +avx2 fastcall static void +emit_span_linear__avx2(struct sna *sna, + const struct sna_composite_spans_op *op, + const BoxRec *box, + float opacity) { union { struct sna_coordinate p; @@ -1494,6 +2478,66 @@ emit_span_boxes_linear(const struct sna_composite_spans_op *op, } while (--nbox); } +sse4_2 fastcall static void +emit_span_boxes_linear__sse4_2(const struct sna_composite_spans_op *op, + const struct sna_opacity_box *b, int nbox, + float *v) +{ + do { + union { + struct sna_coordinate p; + float f; + } dst; + + dst.p.x = b->box.x2; + dst.p.y = b->box.y2; + v[0] = dst.f; + dst.p.x = b->box.x1; + v[3] = dst.f; + dst.p.y = b->box.y1; + v[6] = dst.f; + + v[1] = compute_linear(&op->base.src, b->box.x2, b->box.y2); + v[4] = compute_linear(&op->base.src, b->box.x1, b->box.y2); + v[7] = compute_linear(&op->base.src, b->box.x1, b->box.y1); + + v[8] = v[5] = v[2] = b->alpha; + + v += 9; + b++; + } while (--nbox); +} + +avx2 fastcall static void +emit_span_boxes_linear__avx2(const struct sna_composite_spans_op *op, + const struct sna_opacity_box *b, int nbox, + float *v) +{ + do { + union { + struct sna_coordinate p; + float f; + } dst; + + dst.p.x = b->box.x2; + dst.p.y = b->box.y2; + v[0] = dst.f; + dst.p.x = b->box.x1; + v[3] = dst.f; + dst.p.y = b->box.y1; + v[6] = dst.f; + + v[1] = compute_linear(&op->base.src, b->box.x2, b->box.y2); + v[4] = compute_linear(&op->base.src, b->box.x1, b->box.y2); + v[7] = compute_linear(&op->base.src, b->box.x1, b->box.y1); + + v[8] = v[5] = v[2] = b->alpha; + + v += 9; + b++; + } while (--nbox); +} + inline inline static uint32_t gen4_choose_spans_vertex_buffer(const struct sna_composite_op *op) { @@ -1502,7 +2546,8 @@ gen4_choose_spans_vertex_buffer(const struct sna_composite_op *op) return 1 << 2 | id; } -unsigned gen4_choose_spans_emitter(struct sna_composite_spans_op *tmp) +unsigned gen4_choose_spans_emitter(struct sna *sna, + struct sna_composite_spans_op *tmp) { unsigned vb; @@ -1512,24 +2557,56 @@ unsigned gen4_choose_spans_emitter(struct sna_composite_spans_op *tmp) tmp->base.floats_per_vertex = 3; vb = 1 << 2 | 1; } else if (tmp->base.src.is_linear) { - tmp->prim_emit = emit_span_linear; - tmp->emit_boxes = emit_span_boxes_linear; + if (sna->cpu_features & AVX2) { + tmp->prim_emit = emit_span_linear__avx2; 
+ tmp->emit_boxes = emit_span_boxes_linear__avx2; + } else if (sna->cpu_features & SSE4_2) { + tmp->prim_emit = emit_span_linear__sse4_2; + tmp->emit_boxes = emit_span_boxes_linear__sse4_2; + } else { + tmp->prim_emit = emit_span_linear; + tmp->emit_boxes = emit_span_boxes_linear; + } tmp->base.floats_per_vertex = 3; vb = 1 << 2 | 1; } else if (tmp->base.src.transform == NULL) { - tmp->prim_emit = emit_span_identity; - tmp->emit_boxes = emit_span_boxes_identity; + if (sna->cpu_features & AVX2) { + tmp->prim_emit = emit_span_identity__avx2; + tmp->emit_boxes = emit_span_boxes_identity__avx2; + } else if (sna->cpu_features & SSE4_2) { + tmp->prim_emit = emit_span_identity__sse4_2; + tmp->emit_boxes = emit_span_boxes_identity__sse4_2; + } else { + tmp->prim_emit = emit_span_identity; + tmp->emit_boxes = emit_span_boxes_identity; + } tmp->base.floats_per_vertex = 4; vb = 1 << 2 | 2; } else if (tmp->base.is_affine) { tmp->base.src.scale[0] /= tmp->base.src.transform->matrix[2][2]; tmp->base.src.scale[1] /= tmp->base.src.transform->matrix[2][2]; if (!sna_affine_transform_is_rotation(tmp->base.src.transform)) { - tmp->prim_emit = emit_span_simple; - tmp->emit_boxes = emit_span_boxes_simple; + if (sna->cpu_features & AVX2) { + tmp->prim_emit = emit_span_simple__avx2; + tmp->emit_boxes = emit_span_boxes_simple__avx2; + } else if (sna->cpu_features & SSE4_2) { + tmp->prim_emit = emit_span_simple__sse4_2; + tmp->emit_boxes = emit_span_boxes_simple__sse4_2; + } else { + tmp->prim_emit = emit_span_simple; + tmp->emit_boxes = emit_span_boxes_simple; + } } else { - tmp->prim_emit = emit_span_affine; - tmp->emit_boxes = emit_span_boxes_affine; + if (sna->cpu_features & AVX2) { + tmp->prim_emit = emit_span_affine__avx2; + tmp->emit_boxes = emit_span_boxes_affine__avx2; + } else if (sna->cpu_features & SSE4_2) { + tmp->prim_emit = emit_span_affine__sse4_2; + tmp->emit_boxes = emit_span_boxes_affine__sse4_2; + } else { + tmp->prim_emit = emit_span_affine; + tmp->emit_boxes = emit_span_boxes_affine; + } } tmp->base.floats_per_vertex = 4; vb = 1 << 2 | 2; diff --git a/src/sna/gen4_vertex.h b/src/sna/gen4_vertex.h index 431b545e..1494ba14 100644 --- a/src/sna/gen4_vertex.h +++ b/src/sna/gen4_vertex.h @@ -10,7 +10,7 @@ void gen4_vertex_flush(struct sna *sna); int gen4_vertex_finish(struct sna *sna); void gen4_vertex_close(struct sna *sna); -unsigned gen4_choose_composite_emitter(struct sna_composite_op *tmp); -unsigned gen4_choose_spans_emitter(struct sna_composite_spans_op *tmp); +unsigned gen4_choose_composite_emitter(struct sna *sna, struct sna_composite_op *tmp); +unsigned gen4_choose_spans_emitter(struct sna *sna, struct sna_composite_spans_op *tmp); #endif /* GEN4_VERTEX_H */ diff --git a/src/sna/gen5_render.c b/src/sna/gen5_render.c index f2368773..8b9eaac1 100644 --- a/src/sna/gen5_render.c +++ b/src/sna/gen5_render.c @@ -1924,7 +1924,7 @@ gen5_render_composite(struct sna *sna, tmp->mask.bo != NULL, tmp->has_component_alpha, tmp->is_affine); - tmp->u.gen5.ve_id = gen4_choose_composite_emitter(tmp); + tmp->u.gen5.ve_id = gen4_choose_composite_emitter(sna, tmp); tmp->blt = gen5_render_composite_blt; tmp->box = gen5_render_composite_box; @@ -2152,7 +2152,7 @@ gen5_render_composite_spans(struct sna *sna, tmp->base.has_component_alpha = false; tmp->base.need_magic_ca_pass = false; - tmp->base.u.gen5.ve_id = gen4_choose_spans_emitter(tmp); + tmp->base.u.gen5.ve_id = gen4_choose_spans_emitter(sna, tmp); tmp->base.u.gen5.wm_kernel = WM_KERNEL_OPACITY | !tmp->base.is_affine; tmp->box = 
gen5_render_composite_spans_box; diff --git a/src/sna/gen6_render.c b/src/sna/gen6_render.c index fa4c47b1..d4105141 100644 --- a/src/sna/gen6_render.c +++ b/src/sna/gen6_render.c @@ -2272,7 +2272,7 @@ gen6_render_composite(struct sna *sna, tmp->mask.bo != NULL, tmp->has_component_alpha, tmp->is_affine), - gen4_choose_composite_emitter(tmp)); + gen4_choose_composite_emitter(sna, tmp)); tmp->blt = gen6_render_composite_blt; tmp->box = gen6_render_composite_box; @@ -2508,7 +2508,7 @@ gen6_render_composite_spans(struct sna *sna, SAMPLER_EXTEND_PAD), gen6_get_blend(tmp->base.op, false, tmp->base.dst.format), GEN6_WM_KERNEL_OPACITY | !tmp->base.is_affine, - gen4_choose_spans_emitter(tmp)); + gen4_choose_spans_emitter(sna, tmp)); tmp->box = gen6_render_composite_spans_box; tmp->boxes = gen6_render_composite_spans_boxes; diff --git a/src/sna/gen7_render.c b/src/sna/gen7_render.c index bd14d90f..7984cf19 100644 --- a/src/sna/gen7_render.c +++ b/src/sna/gen7_render.c @@ -2412,7 +2412,7 @@ gen7_render_composite(struct sna *sna, tmp->mask.bo != NULL, tmp->has_component_alpha, tmp->is_affine), - gen4_choose_composite_emitter(tmp)); + gen4_choose_composite_emitter(sna, tmp)); tmp->blt = gen7_render_composite_blt; tmp->box = gen7_render_composite_box; @@ -2628,7 +2628,7 @@ gen7_render_composite_spans(struct sna *sna, SAMPLER_EXTEND_PAD), gen7_get_blend(tmp->base.op, false, tmp->base.dst.format), GEN7_WM_KERNEL_OPACITY | !tmp->base.is_affine, - gen4_choose_spans_emitter(tmp)); + gen4_choose_spans_emitter(sna, tmp)); tmp->box = gen7_render_composite_spans_box; tmp->boxes = gen7_render_composite_spans_boxes; |