summaryrefslogtreecommitdiff
diff options
context:
space:
mode:
author Keith Packard <keithp@keithp.com> 2008-03-30 00:54:51 -0700
committer Keith Packard <keithp@keithp.com> 2008-03-30 18:05:32 -0700
commit 6304b38423f99190a5e54f1a7dcaa75adfad4f2a (patch)
tree 1291c442de6aabe6b2124f502392f768f257fc54
parent 771a56b1ed0df69345c723cb62a73b6842cd8227 (diff)
Reimplement wm program for nomask case to handle affine transforms
This involves correctly computing u/v locations based on x/y vectors and line constants computed in the new sf program. It also uses fewer instructions, which makes this go a bit faster (2X for a 500x500 composite).
-rw-r--r-- src/exa_wm_nomask.g4a 94
-rw-r--r-- src/exa_wm_nomask_prog.h 60
-rw-r--r-- src/i965_render.c 41
3 files changed, 48 insertions, 147 deletions
diff --git a/src/exa_wm_nomask.g4a b/src/exa_wm_nomask.g4a
index f92dc1a6..97426ec1 100644
--- a/src/exa_wm_nomask.g4a
+++ b/src/exa_wm_nomask.g4a
@@ -40,75 +40,49 @@
* Y1_R is g7
*/
- /* Set up ss0.x coordinates*/
-mov (1) g4<1>F g1.8<0,1,0>UW { align1 };
-add (1) g4.4<1>F g1.8<0,1,0>UW 1UB { align1 };
-mov (1) g4.8<1>F g1.8<0,1,0>UW { align1 };
-add (1) g4.12<1>F g1.8<0,1,0>UW 1UB { align1 };
- /* Set up ss0.y coordinates */
-mov (1) g6<1>F g1.10<0,1,0>UW { align1 };
-mov (1) g6.4<1>F g1.10<0,1,0>UW { align1 };
-add (1) g6.8<1>F g1.10<0,1,0>UW 1UB { align1 };
-add (1) g6.12<1>F g1.10<0,1,0>UW 1UB { align1 };
- /* set up ss1.x coordinates */
-mov (1) g4.16<1>F g1.12<0,1,0>UW { align1 };
-add (1) g4.20<1>F g1.12<0,1,0>UW 1UB { align1 };
-mov (1) g4.24<1>F g1.12<0,1,0>UW { align1 };
-add (1) g4.28<1>F g1.12<0,1,0>UW 1UB { align1 };
- /* set up ss1.y coordinates */
-mov (1) g6.16<1>F g1.14<0,1,0>UW { align1 };
-mov (1) g6.20<1>F g1.14<0,1,0>UW { align1 };
-add (1) g6.24<1>F g1.14<0,1,0>UW 1UB { align1 };
-add (1) g6.28<1>F g1.14<0,1,0>UW 1UB { align1 };
- /* Set up ss2.x coordinates */
-mov (1) g5<1>F g1.16<0,1,0>UW { align1 };
-add (1) g5.4<1>F g1.16<0,1,0>UW 1UB { align1 };
-mov (1) g5.8<1>F g1.16<0,1,0>UW { align1 };
-add (1) g5.12<1>F g1.16<0,1,0>UW 1UB { align1 };
- /* Set up ss2.y coordinates */
-mov (1) g7<1>F g1.18<0,1,0>UW { align1 };
-mov (1) g7.4<1>F g1.18<0,1,0>UW { align1 };
-add (1) g7.8<1>F g1.18<0,1,0>UW 1UB { align1 };
-add (1) g7.12<1>F g1.18<0,1,0>UW 1UB { align1 };
- /* Set up ss3.x coordinates */
-mov (1) g5.16<1>F g1.20<0,1,0>UW { align1 };
-add (1) g5.20<1>F g1.20<0,1,0>UW 1UB { align1 };
-mov (1) g5.24<1>F g1.20<0,1,0>UW { align1 };
-add (1) g5.28<1>F g1.20<0,1,0>UW 1UB { align1 };
- /* Set up ss3.y coordinates */
-mov (1) g7.16<1>F g1.22<0,1,0>UW { align1 };
-mov (1) g7.20<1>F g1.22<0,1,0>UW { align1 };
-add (1) g7.24<1>F g1.22<0,1,0>UW 1UB { align1 };
-add (1) g7.28<1>F g1.22<0,1,0>UW 1UB { align1 };
+
+/* Load X and Y coordinates and compute per-pixel coordinates */
+add (16) g4<1>UW g1.8<2,4,0>UW 0x10101010V { align1 };
+add (16) g6<1>UW g1.10<2,4,0>UW 0x11001100V { align1 };
/* Now, map these screen space coordinates into texture coordinates. */
+
/* subtract screen-space X origin of vertex 0. */
-add (8) g4<1>F g4<8,8,1>F -g1<0,1,0>F { align1 };
-add (8) g5<1>F g5<8,8,1>F -g1<0,1,0>F { align1 };
- /* scale by texture X increment */
-mul (8) g4<1>F g4<8,8,1>F g3<0,1,0>F { align1 };
-mul (8) g5<1>F g5<8,8,1>F g3<0,1,0>F { align1 };
- /* add in texture X offset */
-add (8) g4<1>F g4<8,8,1>F g3.12<0,1,0>F { align1 };
-add (8) g5<1>F g5<8,8,1>F g3.12<0,1,0>F { align1 };
+add (16) g12<1>F g4<8,8,1>UW -g1.0<0,1,0>F { compr align1 };
+
/* subtract screen-space Y origin of vertex 0. */
-add (8) g6<1>F g6<8,8,1>F -g1.4<0,1,0>F { align1 };
-add (8) g7<1>F g7<8,8,1>F -g1.4<0,1,0>F { align1 };
- /* scale by texture Y increment */
-mul (8) g6<1>F g6<8,8,1>F g3.20<0,1,0>F { align1 };
-mul (8) g7<1>F g7<8,8,1>F g3.20<0,1,0>F { align1 };
- /* add in texture Y offset */
-add (8) g6<1>F g6<8,8,1>F g3.28<0,1,0>F { align1 };
-add (8) g7<1>F g7<8,8,1>F g3.28<0,1,0>F { align1 };
+add (16) g16<1>F g6<8,8,1>UW -g1.4<0,1,0>F { compr align1 };
+
+ /* g8/g9 = X * du/dx */
+mul (16) g8<1>F g12<8,8,1>F g3.0<0,1,0>F { compr align1 };
+
+ /* g10/g11 = Y * du/dy */
+mul (16) g10<1>F g16<8,8,1>F g3.4<0,1,0>F { compr align1 };
+
+ /* g8/g9 = X du/dx + Y du/dy */
+add (16) g8<1>F g8<8,8,1>F g10<8,8,1>F { compr align1 };
+
+ /* m1/m2 = g8/g9 + uo */
+add (16) m1<1>F g8<8,8,1>F g3.12<0,1,0>F { compr align1 };
+
+
+ /* g8/g9 = X * dv/dx */
+mul (16) g8<1>F g12<8,8,1>F g3.16<0,1,0>F { compr align1 };
+
+ /* g10/g11 = Y * dv/dy */
+mul (16) g10<1>F g16<8,8,1>F g3.20<0,1,0>F { compr align1 };
+
+ /* g8/g9 = X dv/dx + Y dv/dy */
+add (16) g8<1>F g8<8,8,1>F g10<8,8,1>F { compr align1 };
+
+ /* m3/m4 = g8/g9 + vo */
+add (16) m3<1>F g8<8,8,1>F g3.28<0,1,0>F { compr align1 };
+
/* prepare sampler read back gX register, which would be written back to output */
/* use simd16 sampler, param 0 is u, param 1 is v. */
/* 'payload' loading, assuming tex coord start from g4 */
-mov (8) m1<1>F g4<8,8,1>F { align1 };
-mov (8) m2<1>F g5<8,8,1>F { align1 }; /* param 0 u in m1, m2 */
-mov (8) m3<1>F g6<8,8,1>F { align1 };
-mov (8) m4<1>F g7<8,8,1>F { align1 }; /* param 1 v in m3, m4 */
/* m0 will be copied with g0, as it contains send desc */
/* emit sampler 'send' cmd */
diff --git a/src/exa_wm_nomask_prog.h b/src/exa_wm_nomask_prog.h
index 7870b3b7..c73bdbcd 100644
--- a/src/exa_wm_nomask_prog.h
+++ b/src/exa_wm_nomask_prog.h
@@ -1,51 +1,15 @@
- { 0x00000001, 0x2080013d, 0x00000028, 0x00000000 },
- { 0x00000040, 0x20840d3d, 0x00000028, 0x00000001 },
- { 0x00000001, 0x2088013d, 0x00000028, 0x00000000 },
- { 0x00000040, 0x208c0d3d, 0x00000028, 0x00000001 },
- { 0x00000001, 0x20c0013d, 0x0000002a, 0x00000000 },
- { 0x00000001, 0x20c4013d, 0x0000002a, 0x00000000 },
- { 0x00000040, 0x20c80d3d, 0x0000002a, 0x00000001 },
- { 0x00000040, 0x20cc0d3d, 0x0000002a, 0x00000001 },
- { 0x00000001, 0x2090013d, 0x0000002c, 0x00000000 },
- { 0x00000040, 0x20940d3d, 0x0000002c, 0x00000001 },
- { 0x00000001, 0x2098013d, 0x0000002c, 0x00000000 },
- { 0x00000040, 0x209c0d3d, 0x0000002c, 0x00000001 },
- { 0x00000001, 0x20d0013d, 0x0000002e, 0x00000000 },
- { 0x00000001, 0x20d4013d, 0x0000002e, 0x00000000 },
- { 0x00000040, 0x20d80d3d, 0x0000002e, 0x00000001 },
- { 0x00000040, 0x20dc0d3d, 0x0000002e, 0x00000001 },
- { 0x00000001, 0x20a0013d, 0x00000030, 0x00000000 },
- { 0x00000040, 0x20a40d3d, 0x00000030, 0x00000001 },
- { 0x00000001, 0x20a8013d, 0x00000030, 0x00000000 },
- { 0x00000040, 0x20ac0d3d, 0x00000030, 0x00000001 },
- { 0x00000001, 0x20e0013d, 0x00000032, 0x00000000 },
- { 0x00000001, 0x20e4013d, 0x00000032, 0x00000000 },
- { 0x00000040, 0x20e80d3d, 0x00000032, 0x00000001 },
- { 0x00000040, 0x20ec0d3d, 0x00000032, 0x00000001 },
- { 0x00000001, 0x20b0013d, 0x00000034, 0x00000000 },
- { 0x00000040, 0x20b40d3d, 0x00000034, 0x00000001 },
- { 0x00000001, 0x20b8013d, 0x00000034, 0x00000000 },
- { 0x00000040, 0x20bc0d3d, 0x00000034, 0x00000001 },
- { 0x00000001, 0x20f0013d, 0x00000036, 0x00000000 },
- { 0x00000001, 0x20f4013d, 0x00000036, 0x00000000 },
- { 0x00000040, 0x20f80d3d, 0x00000036, 0x00000001 },
- { 0x00000040, 0x20fc0d3d, 0x00000036, 0x00000001 },
- { 0x00600040, 0x208077bd, 0x008d0080, 0x00004020 },
- { 0x00600040, 0x20a077bd, 0x008d00a0, 0x00004020 },
- { 0x00600041, 0x208077bd, 0x008d0080, 0x00000060 },
- { 0x00600041, 0x20a077bd, 0x008d00a0, 0x00000060 },
- { 0x00600040, 0x208077bd, 0x008d0080, 0x0000006c },
- { 0x00600040, 0x20a077bd, 0x008d00a0, 0x0000006c },
- { 0x00600040, 0x20c077bd, 0x008d00c0, 0x00004024 },
- { 0x00600040, 0x20e077bd, 0x008d00e0, 0x00004024 },
- { 0x00600041, 0x20c077bd, 0x008d00c0, 0x00000074 },
- { 0x00600041, 0x20e077bd, 0x008d00e0, 0x00000074 },
- { 0x00600040, 0x20c077bd, 0x008d00c0, 0x0000007c },
- { 0x00600040, 0x20e077bd, 0x008d00e0, 0x0000007c },
- { 0x00600001, 0x202003be, 0x008d0080, 0x00000000 },
- { 0x00600001, 0x204003be, 0x008d00a0, 0x00000000 },
- { 0x00600001, 0x206003be, 0x008d00c0, 0x00000000 },
- { 0x00600001, 0x208003be, 0x008d00e0, 0x00000000 },
+ { 0x00800040, 0x20806d29, 0x00480028, 0x10101010 },
+ { 0x00800040, 0x20c06d29, 0x0048002a, 0x11001100 },
+ { 0x00802040, 0x2180753d, 0x008d0080, 0x00004020 },
+ { 0x00802040, 0x2200753d, 0x008d00c0, 0x00004024 },
+ { 0x00802041, 0x210077bd, 0x008d0180, 0x00000060 },
+ { 0x00802041, 0x214077bd, 0x008d0200, 0x00000064 },
+ { 0x00802040, 0x210077bd, 0x008d0100, 0x008d0140 },
+ { 0x00802040, 0x202077be, 0x008d0100, 0x0000006c },
+ { 0x00802041, 0x210077bd, 0x008d0180, 0x00000070 },
+ { 0x00802041, 0x214077bd, 0x008d0200, 0x00000074 },
+ { 0x00802040, 0x210077bd, 0x008d0100, 0x008d0140 },
+ { 0x00802040, 0x206077be, 0x008d0100, 0x0000007c },
{ 0x00800031, 0x21801d29, 0x008d0000, 0x02580001 },
{ 0x00600001, 0x22600021, 0x008d0260, 0x00000000 },
{ 0x00600001, 0x202003be, 0x008d0020, 0x00000000 },
diff --git a/src/i965_render.c b/src/i965_render.c
index 93583b0a..26c06aa4 100644
--- a/src/i965_render.c
+++ b/src/i965_render.c
@@ -318,7 +318,7 @@ static const uint32_t sip_kernel_static[][4] = {
*/
#define SF_KERNEL_NUM_GRF 16
-#define SF_MAX_THREADS 1
+#define SF_MAX_THREADS 2
static const uint32_t sf_kernel_static[][4] = {
#include "exa_sf_prog.h"
@@ -328,10 +328,6 @@ static const uint32_t sf_kernel_static_mask[][4] = {
#include "exa_sf_mask_prog.h"
};
-static const uint32_t sf_kernel_static_rotation[][4] = {
-#include "exa_sf_rotation_prog.h"
-};
-
/* ps kernels */
#define PS_KERNEL_NUM_GRF 32
#define PS_MAX_THREADS 32
@@ -352,10 +348,6 @@ static const uint32_t ps_kernel_static_masknoca [][4] = {
#include "exa_wm_masknoca_prog.h"
};
-static const uint32_t ps_kernel_static_rotation [][4] = {
-#include "exa_wm_rotation_prog.h"
-};
-
static uint32_t
i965_get_card_format(PicturePtr pPict)
{
@@ -370,21 +362,6 @@ i965_get_card_format(PicturePtr pPict)
return i965_tex_formats[i].card_fmt;
}
-static Bool
-i965_check_rotation_transform(PictTransformPtr t)
-{
- /* XXX this is arbitrary */
- int a, b;
- a = xFixedToInt(t->matrix[0][1]);
- b = xFixedToInt(t->matrix[1][0]);
- if (a == -1 && b == 1)
- return TRUE;
- else if (a == 1 && b == -1)
- return TRUE;
- else
- return FALSE;
-}
-
Bool
i965_prepare_composite(int op, PicturePtr pSrcPicture,
PicturePtr pMaskPicture, PicturePtr pDstPicture,
@@ -397,7 +374,6 @@ i965_prepare_composite(int op, PicturePtr pSrcPicture,
mask_tiled = 0;
uint32_t dst_format, dst_offset, dst_pitch, dst_tile_format = 0,
dst_tiled = 0;
- Bool rotation_program = FALSE;
IntelEmitInvarientState(pScrn);
*pI830->last_3d = LAST_3D_RENDER;
@@ -431,9 +407,6 @@ i965_prepare_composite(int op, PicturePtr pSrcPicture,
pI830->transform[1] = NULL;
pI830->scale_units[1][0] = -1;
pI830->scale_units[1][1] = -1;
- if (pI830->transform[0] &&
- i965_check_rotation_transform(pI830->transform[0]))
- rotation_program = TRUE;
} else {
pI830->transform[1] = pMaskPicture->transform;
if (pI830->transform[1])
@@ -469,8 +442,6 @@ i965_prepare_composite(int op, PicturePtr pSrcPicture,
sf_kernel_offset = ALIGN(next_offset, 64);
if (pMask)
next_offset = sf_kernel_offset + sizeof (sf_kernel_static_mask);
- else if (rotation_program)
- next_offset = sf_kernel_offset + sizeof (sf_kernel_static_rotation);
else
next_offset = sf_kernel_offset + sizeof (sf_kernel_static);
@@ -488,8 +459,6 @@ i965_prepare_composite(int op, PicturePtr pSrcPicture,
} else
next_offset = ps_kernel_offset +
sizeof(ps_kernel_static_masknoca);
- } else if (rotation_program) {
- next_offset = ps_kernel_offset + sizeof (ps_kernel_static_rotation);
} else {
next_offset = ps_kernel_offset + sizeof (ps_kernel_static_nomask);
}
@@ -816,9 +785,6 @@ i965_prepare_composite(int op, PicturePtr pSrcPicture,
if (pMask)
memcpy(sf_kernel, sf_kernel_static_mask,
sizeof (sf_kernel_static_mask));
- else if (rotation_program)
- memcpy(sf_kernel, sf_kernel_static_rotation,
- sizeof (sf_kernel_static_rotation));
else
memcpy(sf_kernel, sf_kernel_static, sizeof (sf_kernel_static));
@@ -870,9 +836,6 @@ i965_prepare_composite(int op, PicturePtr pSrcPicture,
} else
memcpy(ps_kernel, ps_kernel_static_masknoca,
sizeof (ps_kernel_static_masknoca));
- } else if (rotation_program) {
- memcpy(ps_kernel, ps_kernel_static_rotation,
- sizeof (ps_kernel_static_rotation));
} else {
memcpy(ps_kernel, ps_kernel_static_nomask,
sizeof (ps_kernel_static_nomask));
@@ -883,7 +846,7 @@ i965_prepare_composite(int op, PicturePtr pSrcPicture,
wm_state->thread0.kernel_start_pointer =
(state_base_offset + ps_kernel_offset) >> 6;
wm_state->thread0.grf_reg_count = BRW_GRF_BLOCKS(PS_KERNEL_NUM_GRF);
- wm_state->thread1.single_program_flow = 1;
+ wm_state->thread1.single_program_flow = 0;
if (!pMask)
wm_state->thread1.binding_table_entry_count = 2; /* 1 tex and fb */
else