diff options
Diffstat (limited to 'src')
-rw-r--r-- | src/evergreen_exa.c | 2 | ||||
-rw-r--r-- | src/evergreen_shader.c | 622 |
2 files changed, 477 insertions, 147 deletions
diff --git a/src/evergreen_exa.c b/src/evergreen_exa.c index 1c027525..89afaff5 100644 --- a/src/evergreen_exa.c +++ b/src/evergreen_exa.c @@ -1318,7 +1318,7 @@ static Bool EVERGREENPrepareComposite(int op, PicturePtr pSrcPicture, /* Shader */ vs_conf.shader_addr = accel_state->vs_mc_addr; vs_conf.shader_size = accel_state->vs_size; - vs_conf.num_gprs = 3; + vs_conf.num_gprs = 5; vs_conf.stack_size = 1; vs_conf.bo = accel_state->shaders_bo; evergreen_vs_setup(pScrn, &vs_conf, RADEON_GEM_DOMAIN_VRAM); diff --git a/src/evergreen_shader.c b/src/evergreen_shader.c index 42cea7ab..ef56d2d4 100644 --- a/src/evergreen_shader.c +++ b/src/evergreen_shader.c @@ -1410,7 +1410,7 @@ int evergreen_comp_vs(RADEONChipFamily ChipSet, uint32_t* shader) WHOLE_QUAD_MODE(0), BARRIER(1)); /* 3 - mask sub */ - shader[i++] = CF_DWORD0(ADDR(32), + shader[i++] = CF_DWORD0(ADDR(44), JUMPTABLE_SEL(SQ_CF_JUMPTABLE_SEL_CONST_A)); shader[i++] = CF_DWORD1(POP_COUNT(0), CF_CONST(0), @@ -1430,7 +1430,7 @@ int evergreen_comp_vs(RADEONChipFamily ChipSet, uint32_t* shader) shader[i++] = CF_ALU_DWORD1(KCACHE_MODE1(SQ_CF_KCACHE_NOP), KCACHE_ADDR0(0), KCACHE_ADDR1(0), - I_COUNT(12), + I_COUNT(20), ALT_CONST(0), CF_INST(SQ_CF_INST_ALU), WHOLE_QUAD_MODE(0), @@ -1500,7 +1500,7 @@ int evergreen_comp_vs(RADEONChipFamily ChipSet, uint32_t* shader) WHOLE_QUAD_MODE(0), BARRIER(1)); /* 9 - non-mask sub */ - shader[i++] = CF_DWORD0(ADDR(38), + shader[i++] = CF_DWORD0(ADDR(50), JUMPTABLE_SEL(SQ_CF_JUMPTABLE_SEL_CONST_A)); shader[i++] = CF_DWORD1(POP_COUNT(0), CF_CONST(0), @@ -1513,14 +1513,14 @@ int evergreen_comp_vs(RADEONChipFamily ChipSet, uint32_t* shader) BARRIER(1)); /* 10 - ALU */ - shader[i++] = CF_ALU_DWORD0(ADDR(26), + shader[i++] = CF_ALU_DWORD0(ADDR(34), KCACHE_BANK0(0), KCACHE_BANK1(0), KCACHE_MODE0(SQ_CF_KCACHE_LOCK_1)); shader[i++] = CF_ALU_DWORD1(KCACHE_MODE1(SQ_CF_KCACHE_NOP), KCACHE_ADDR0(0), KCACHE_ADDR1(0), - I_COUNT(6), + I_COUNT(10), ALT_CONST(0), CF_INST(SQ_CF_INST_ALU), WHOLE_QUAD_MODE(0), @@ -1573,189 +1573,408 @@ int evergreen_comp_vs(RADEONChipFamily ChipSet, uint32_t* shader) WHOLE_QUAD_MODE(0), BARRIER(1)); - /* mask alu - 14 srcX MAD */ - shader[i++] = ALU_DWORD0(SRC0_SEL(ALU_SRC_KCACHE0_BASE + 0), + /* 14 srcX.x DOT4 - mask */ + shader[i++] = ALU_DWORD0(SRC0_SEL(ALU_SRC_GPR_BASE + 1), + SRC0_REL(ABSOLUTE), + SRC0_ELEM(ELEM_X), + SRC0_NEG(0), + SRC1_SEL(ALU_SRC_KCACHE0_BASE + 0), + SRC1_REL(ABSOLUTE), + SRC1_ELEM(ELEM_X), + SRC1_NEG(0), + INDEX_MODE(SQ_INDEX_LOOP), + PRED_SEL(SQ_PRED_SEL_OFF), + LAST(0)); + shader[i++] = ALU_DWORD1_OP2(SRC0_ABS(0), + SRC1_ABS(0), + UPDATE_EXECUTE_MASK(0), + UPDATE_PRED(0), + WRITE_MASK(1), + OMOD(SQ_ALU_OMOD_OFF), + ALU_INST(SQ_OP2_INST_DOT4), + BANK_SWIZZLE(SQ_ALU_VEC_012), + DST_GPR(3), + DST_REL(ABSOLUTE), + DST_ELEM(ELEM_X), + CLAMP(0)); + + /* 15 srcX.y DOT4 - mask */ + shader[i++] = ALU_DWORD0(SRC0_SEL(ALU_SRC_GPR_BASE + 1), SRC0_REL(ABSOLUTE), SRC0_ELEM(ELEM_Y), SRC0_NEG(0), - SRC1_SEL(ALU_SRC_GPR_BASE + 1), + SRC1_SEL(ALU_SRC_KCACHE0_BASE + 0), SRC1_REL(ABSOLUTE), SRC1_ELEM(ELEM_Y), SRC1_NEG(0), INDEX_MODE(SQ_INDEX_LOOP), PRED_SEL(SQ_PRED_SEL_OFF), - LAST(1)); - shader[i++] = ALU_DWORD1_OP3(SRC2_SEL(ALU_SRC_KCACHE0_BASE + 0), - SRC2_REL(ABSOLUTE), - SRC2_ELEM(ELEM_Z), - SRC2_NEG(0), - ALU_INST(SQ_OP3_INST_MULADD), + LAST(0)); + shader[i++] = ALU_DWORD1_OP2(SRC0_ABS(0), + SRC1_ABS(0), + UPDATE_EXECUTE_MASK(0), + UPDATE_PRED(0), + WRITE_MASK(0), + OMOD(SQ_ALU_OMOD_OFF), + ALU_INST(SQ_OP2_INST_DOT4), BANK_SWIZZLE(SQ_ALU_VEC_012), - DST_GPR(1), + DST_GPR(3), + DST_REL(ABSOLUTE), + DST_ELEM(ELEM_Y), + CLAMP(0)); + + /* 16 srcX.z DOT4 - mask */ + shader[i++] = ALU_DWORD0(SRC0_SEL(ALU_SRC_GPR_BASE + 1), + SRC0_REL(ABSOLUTE), + SRC0_ELEM(ELEM_Z), + SRC0_NEG(0), + SRC1_SEL(ALU_SRC_KCACHE0_BASE + 0), + SRC1_REL(ABSOLUTE), + SRC1_ELEM(ELEM_Z), + SRC1_NEG(0), + INDEX_MODE(SQ_INDEX_LOOP), + PRED_SEL(SQ_PRED_SEL_OFF), + LAST(0)); + shader[i++] = ALU_DWORD1_OP2(SRC0_ABS(0), + SRC1_ABS(0), + UPDATE_EXECUTE_MASK(0), + UPDATE_PRED(0), + WRITE_MASK(0), + OMOD(SQ_ALU_OMOD_OFF), + ALU_INST(SQ_OP2_INST_DOT4), + BANK_SWIZZLE(SQ_ALU_VEC_012), + DST_GPR(3), DST_REL(ABSOLUTE), DST_ELEM(ELEM_Z), CLAMP(0)); - /* 15 srcY MAD */ - shader[i++] = ALU_DWORD0(SRC0_SEL(ALU_SRC_KCACHE0_BASE + 1), + + /* 17 srcX.w DOT4 - mask */ + shader[i++] = ALU_DWORD0(SRC0_SEL(ALU_SRC_GPR_BASE + 1), SRC0_REL(ABSOLUTE), - SRC0_ELEM(ELEM_Y), + SRC0_ELEM(ELEM_W), SRC0_NEG(0), - SRC1_SEL(ALU_SRC_GPR_BASE + 1), + SRC1_SEL(ALU_SRC_KCACHE0_BASE + 0), SRC1_REL(ABSOLUTE), - SRC1_ELEM(ELEM_Y), + SRC1_ELEM(ELEM_W), SRC1_NEG(0), INDEX_MODE(SQ_INDEX_LOOP), PRED_SEL(SQ_PRED_SEL_OFF), LAST(1)); - shader[i++] = ALU_DWORD1_OP3(SRC2_SEL(ALU_SRC_KCACHE0_BASE + 1), - SRC2_REL(ABSOLUTE), - SRC2_ELEM(ELEM_Z), - SRC2_NEG(0), - ALU_INST(SQ_OP3_INST_MULADD), + shader[i++] = ALU_DWORD1_OP2(SRC0_ABS(0), + SRC1_ABS(0), + UPDATE_EXECUTE_MASK(0), + UPDATE_PRED(0), + WRITE_MASK(0), + OMOD(SQ_ALU_OMOD_OFF), + ALU_INST(SQ_OP2_INST_DOT4), BANK_SWIZZLE(SQ_ALU_VEC_012), - DST_GPR(1), + DST_GPR(3), DST_REL(ABSOLUTE), DST_ELEM(ELEM_W), CLAMP(0)); - /* 16 srcX MAD */ - shader[i++] = ALU_DWORD0(SRC0_SEL(ALU_SRC_KCACHE0_BASE + 0), + /* 18 srcY.x DOT4 - mask */ + shader[i++] = ALU_DWORD0(SRC0_SEL(ALU_SRC_GPR_BASE + 1), SRC0_REL(ABSOLUTE), SRC0_ELEM(ELEM_X), SRC0_NEG(0), - SRC1_SEL(ALU_SRC_GPR_BASE + 1), + SRC1_SEL(ALU_SRC_KCACHE0_BASE + 1), SRC1_REL(ABSOLUTE), SRC1_ELEM(ELEM_X), SRC1_NEG(0), INDEX_MODE(SQ_INDEX_LOOP), PRED_SEL(SQ_PRED_SEL_OFF), LAST(0)); - shader[i++] = ALU_DWORD1_OP3(SRC2_SEL(ALU_SRC_GPR_BASE + 1), - SRC2_REL(ABSOLUTE), - SRC2_ELEM(ELEM_Z), - SRC2_NEG(0), - ALU_INST(SQ_OP3_INST_MULADD), + shader[i++] = ALU_DWORD1_OP2(SRC0_ABS(0), + SRC1_ABS(0), + UPDATE_EXECUTE_MASK(0), + UPDATE_PRED(0), + WRITE_MASK(0), + OMOD(SQ_ALU_OMOD_OFF), + ALU_INST(SQ_OP2_INST_DOT4), BANK_SWIZZLE(SQ_ALU_VEC_012), - DST_GPR(1), + DST_GPR(3), DST_REL(ABSOLUTE), DST_ELEM(ELEM_X), CLAMP(0)); - /* 17 srcY MAD */ - shader[i++] = ALU_DWORD0(SRC0_SEL(ALU_SRC_KCACHE0_BASE + 1), + + /* 19 srcY.y DOT4 - mask */ + shader[i++] = ALU_DWORD0(SRC0_SEL(ALU_SRC_GPR_BASE + 1), + SRC0_REL(ABSOLUTE), + SRC0_ELEM(ELEM_Y), + SRC0_NEG(0), + SRC1_SEL(ALU_SRC_KCACHE0_BASE + 1), + SRC1_REL(ABSOLUTE), + SRC1_ELEM(ELEM_Y), + SRC1_NEG(0), + INDEX_MODE(SQ_INDEX_LOOP), + PRED_SEL(SQ_PRED_SEL_OFF), + LAST(0)); + shader[i++] = ALU_DWORD1_OP2(SRC0_ABS(0), + SRC1_ABS(0), + UPDATE_EXECUTE_MASK(0), + UPDATE_PRED(0), + WRITE_MASK(1), + OMOD(SQ_ALU_OMOD_OFF), + ALU_INST(SQ_OP2_INST_DOT4), + BANK_SWIZZLE(SQ_ALU_VEC_012), + DST_GPR(3), + DST_REL(ABSOLUTE), + DST_ELEM(ELEM_Y), + CLAMP(0)); + + /* 20 srcY.z DOT4 - mask */ + shader[i++] = ALU_DWORD0(SRC0_SEL(ALU_SRC_GPR_BASE + 1), + SRC0_REL(ABSOLUTE), + SRC0_ELEM(ELEM_Z), + SRC0_NEG(0), + SRC1_SEL(ALU_SRC_KCACHE0_BASE + 1), + SRC1_REL(ABSOLUTE), + SRC1_ELEM(ELEM_Z), + SRC1_NEG(0), + INDEX_MODE(SQ_INDEX_LOOP), + PRED_SEL(SQ_PRED_SEL_OFF), + LAST(0)); + shader[i++] = ALU_DWORD1_OP2(SRC0_ABS(0), + SRC1_ABS(0), + UPDATE_EXECUTE_MASK(0), + UPDATE_PRED(0), + WRITE_MASK(0), + OMOD(SQ_ALU_OMOD_OFF), + ALU_INST(SQ_OP2_INST_DOT4), + BANK_SWIZZLE(SQ_ALU_VEC_012), + DST_GPR(3), + DST_REL(ABSOLUTE), + DST_ELEM(ELEM_Z), + CLAMP(0)); + + /* 21 srcY.w DOT4 - mask */ + shader[i++] = ALU_DWORD0(SRC0_SEL(ALU_SRC_GPR_BASE + 1), + SRC0_REL(ABSOLUTE), + SRC0_ELEM(ELEM_W), + SRC0_NEG(0), + SRC1_SEL(ALU_SRC_KCACHE0_BASE + 1), + SRC1_REL(ABSOLUTE), + SRC1_ELEM(ELEM_W), + SRC1_NEG(0), + INDEX_MODE(SQ_INDEX_LOOP), + PRED_SEL(SQ_PRED_SEL_OFF), + LAST(1)); + shader[i++] = ALU_DWORD1_OP2(SRC0_ABS(0), + SRC1_ABS(0), + UPDATE_EXECUTE_MASK(0), + UPDATE_PRED(0), + WRITE_MASK(0), + OMOD(SQ_ALU_OMOD_OFF), + ALU_INST(SQ_OP2_INST_DOT4), + BANK_SWIZZLE(SQ_ALU_VEC_012), + DST_GPR(3), + DST_REL(ABSOLUTE), + DST_ELEM(ELEM_W), + CLAMP(0)); + + /* 22 maskX.x DOT4 - mask */ + shader[i++] = ALU_DWORD0(SRC0_SEL(ALU_SRC_GPR_BASE + 0), SRC0_REL(ABSOLUTE), SRC0_ELEM(ELEM_X), SRC0_NEG(0), - SRC1_SEL(ALU_SRC_GPR_BASE + 1), + SRC1_SEL(ALU_SRC_KCACHE0_BASE + 2), SRC1_REL(ABSOLUTE), SRC1_ELEM(ELEM_X), SRC1_NEG(0), INDEX_MODE(SQ_INDEX_LOOP), PRED_SEL(SQ_PRED_SEL_OFF), - LAST(1)); - shader[i++] = ALU_DWORD1_OP3(SRC2_SEL(ALU_SRC_GPR_BASE + 1), - SRC2_REL(ABSOLUTE), - SRC2_ELEM(ELEM_W), - SRC2_NEG(0), - ALU_INST(SQ_OP3_INST_MULADD), + LAST(0)); + shader[i++] = ALU_DWORD1_OP2(SRC0_ABS(0), + SRC1_ABS(0), + UPDATE_EXECUTE_MASK(0), + UPDATE_PRED(0), + WRITE_MASK(1), + OMOD(SQ_ALU_OMOD_OFF), + ALU_INST(SQ_OP2_INST_DOT4), BANK_SWIZZLE(SQ_ALU_VEC_012), - DST_GPR(1), + DST_GPR(4), DST_REL(ABSOLUTE), - DST_ELEM(ELEM_Y), + DST_ELEM(ELEM_X), CLAMP(0)); - /* 18 maskX MAD */ - shader[i++] = ALU_DWORD0(SRC0_SEL(ALU_SRC_KCACHE0_BASE + 2), + /* 23 maskX.y DOT4 - mask */ + shader[i++] = ALU_DWORD0(SRC0_SEL(ALU_SRC_GPR_BASE + 0), SRC0_REL(ABSOLUTE), SRC0_ELEM(ELEM_Y), SRC0_NEG(0), - SRC1_SEL(ALU_SRC_GPR_BASE + 0), + SRC1_SEL(ALU_SRC_KCACHE0_BASE + 2), SRC1_REL(ABSOLUTE), SRC1_ELEM(ELEM_Y), SRC1_NEG(0), INDEX_MODE(SQ_INDEX_LOOP), PRED_SEL(SQ_PRED_SEL_OFF), - LAST(1)); - shader[i++] = ALU_DWORD1_OP3(SRC2_SEL(ALU_SRC_KCACHE0_BASE + 2), - SRC2_REL(ABSOLUTE), - SRC2_ELEM(ELEM_Z), - SRC2_NEG(0), - ALU_INST(SQ_OP3_INST_MULADD), + LAST(0)); + shader[i++] = ALU_DWORD1_OP2(SRC0_ABS(0), + SRC1_ABS(0), + UPDATE_EXECUTE_MASK(0), + UPDATE_PRED(0), + WRITE_MASK(0), + OMOD(SQ_ALU_OMOD_OFF), + ALU_INST(SQ_OP2_INST_DOT4), BANK_SWIZZLE(SQ_ALU_VEC_012), - DST_GPR(0), + DST_GPR(4), + DST_REL(ABSOLUTE), + DST_ELEM(ELEM_Y), + CLAMP(0)); + + /* 24 maskX.z DOT4 - mask */ + shader[i++] = ALU_DWORD0(SRC0_SEL(ALU_SRC_GPR_BASE + 0), + SRC0_REL(ABSOLUTE), + SRC0_ELEM(ELEM_Z), + SRC0_NEG(0), + SRC1_SEL(ALU_SRC_KCACHE0_BASE + 2), + SRC1_REL(ABSOLUTE), + SRC1_ELEM(ELEM_Z), + SRC1_NEG(0), + INDEX_MODE(SQ_INDEX_LOOP), + PRED_SEL(SQ_PRED_SEL_OFF), + LAST(0)); + shader[i++] = ALU_DWORD1_OP2(SRC0_ABS(0), + SRC1_ABS(0), + UPDATE_EXECUTE_MASK(0), + UPDATE_PRED(0), + WRITE_MASK(0), + OMOD(SQ_ALU_OMOD_OFF), + ALU_INST(SQ_OP2_INST_DOT4), + BANK_SWIZZLE(SQ_ALU_VEC_012), + DST_GPR(4), DST_REL(ABSOLUTE), DST_ELEM(ELEM_Z), CLAMP(0)); - /* 19 maskY MAD */ - shader[i++] = ALU_DWORD0(SRC0_SEL(ALU_SRC_KCACHE0_BASE + 3), + /* 25 maskX.w DOT4 - mask */ + shader[i++] = ALU_DWORD0(SRC0_SEL(ALU_SRC_GPR_BASE + 0), SRC0_REL(ABSOLUTE), - SRC0_ELEM(ELEM_Y), + SRC0_ELEM(ELEM_W), SRC0_NEG(0), - SRC1_SEL(ALU_SRC_GPR_BASE + 0), + SRC1_SEL(ALU_SRC_KCACHE0_BASE + 2), SRC1_REL(ABSOLUTE), - SRC1_ELEM(ELEM_Y), + SRC1_ELEM(ELEM_W), SRC1_NEG(0), INDEX_MODE(SQ_INDEX_LOOP), PRED_SEL(SQ_PRED_SEL_OFF), LAST(1)); - shader[i++] = ALU_DWORD1_OP3(SRC2_SEL(ALU_SRC_KCACHE0_BASE + 3), - SRC2_REL(ABSOLUTE), - SRC2_ELEM(ELEM_Z), - SRC2_NEG(0), - ALU_INST(SQ_OP3_INST_MULADD), + shader[i++] = ALU_DWORD1_OP2(SRC0_ABS(0), + SRC1_ABS(0), + UPDATE_EXECUTE_MASK(0), + UPDATE_PRED(0), + WRITE_MASK(0), + OMOD(SQ_ALU_OMOD_OFF), + ALU_INST(SQ_OP2_INST_DOT4), BANK_SWIZZLE(SQ_ALU_VEC_012), - DST_GPR(0), + DST_GPR(4), DST_REL(ABSOLUTE), DST_ELEM(ELEM_W), CLAMP(0)); - /* 20 srcX MAD */ - shader[i++] = ALU_DWORD0(SRC0_SEL(ALU_SRC_KCACHE0_BASE + 2), + /* 26 maskY.x DOT4 - mask */ + shader[i++] = ALU_DWORD0(SRC0_SEL(ALU_SRC_GPR_BASE + 0), SRC0_REL(ABSOLUTE), SRC0_ELEM(ELEM_X), SRC0_NEG(0), - SRC1_SEL(ALU_SRC_GPR_BASE + 0), + SRC1_SEL(ALU_SRC_KCACHE0_BASE + 3), SRC1_REL(ABSOLUTE), SRC1_ELEM(ELEM_X), SRC1_NEG(0), INDEX_MODE(SQ_INDEX_LOOP), PRED_SEL(SQ_PRED_SEL_OFF), LAST(0)); - shader[i++] = ALU_DWORD1_OP3(SRC2_SEL(ALU_SRC_GPR_BASE + 0), - SRC2_REL(ABSOLUTE), - SRC2_ELEM(ELEM_Z), - SRC2_NEG(0), - ALU_INST(SQ_OP3_INST_MULADD), + shader[i++] = ALU_DWORD1_OP2(SRC0_ABS(0), + SRC1_ABS(0), + UPDATE_EXECUTE_MASK(0), + UPDATE_PRED(0), + WRITE_MASK(0), + OMOD(SQ_ALU_OMOD_OFF), + ALU_INST(SQ_OP2_INST_DOT4), BANK_SWIZZLE(SQ_ALU_VEC_012), - DST_GPR(0), + DST_GPR(4), DST_REL(ABSOLUTE), DST_ELEM(ELEM_X), CLAMP(0)); - /* 21 srcY MAD */ - shader[i++] = ALU_DWORD0(SRC0_SEL(ALU_SRC_KCACHE0_BASE + 3), + + /* 27 maskY.y DOT4 - mask */ + shader[i++] = ALU_DWORD0(SRC0_SEL(ALU_SRC_GPR_BASE + 0), SRC0_REL(ABSOLUTE), - SRC0_ELEM(ELEM_X), + SRC0_ELEM(ELEM_Y), SRC0_NEG(0), - SRC1_SEL(ALU_SRC_GPR_BASE + 0), + SRC1_SEL(ALU_SRC_KCACHE0_BASE + 3), SRC1_REL(ABSOLUTE), - SRC1_ELEM(ELEM_X), + SRC1_ELEM(ELEM_Y), SRC1_NEG(0), INDEX_MODE(SQ_INDEX_LOOP), PRED_SEL(SQ_PRED_SEL_OFF), - LAST(1)); - shader[i++] = ALU_DWORD1_OP3(SRC2_SEL(ALU_SRC_GPR_BASE + 0), - SRC2_REL(ABSOLUTE), - SRC2_ELEM(ELEM_W), - SRC2_NEG(0), - ALU_INST(SQ_OP3_INST_MULADD), + LAST(0)); + shader[i++] = ALU_DWORD1_OP2(SRC0_ABS(0), + SRC1_ABS(0), + UPDATE_EXECUTE_MASK(0), + UPDATE_PRED(0), + WRITE_MASK(1), + OMOD(SQ_ALU_OMOD_OFF), + ALU_INST(SQ_OP2_INST_DOT4), BANK_SWIZZLE(SQ_ALU_VEC_012), - DST_GPR(0), + DST_GPR(4), DST_REL(ABSOLUTE), DST_ELEM(ELEM_Y), CLAMP(0)); - /* 22 srcX / w */ - shader[i++] = ALU_DWORD0(SRC0_SEL(ALU_SRC_GPR_BASE + 1), + /* 28 maskY.z DOT4 - mask */ + shader[i++] = ALU_DWORD0(SRC0_SEL(ALU_SRC_GPR_BASE + 0), + SRC0_REL(ABSOLUTE), + SRC0_ELEM(ELEM_Z), + SRC0_NEG(0), + SRC1_SEL(ALU_SRC_KCACHE0_BASE + 3), + SRC1_REL(ABSOLUTE), + SRC1_ELEM(ELEM_Z), + SRC1_NEG(0), + INDEX_MODE(SQ_INDEX_LOOP), + PRED_SEL(SQ_PRED_SEL_OFF), + LAST(0)); + shader[i++] = ALU_DWORD1_OP2(SRC0_ABS(0), + SRC1_ABS(0), + UPDATE_EXECUTE_MASK(0), + UPDATE_PRED(0), + WRITE_MASK(0), + OMOD(SQ_ALU_OMOD_OFF), + ALU_INST(SQ_OP2_INST_DOT4), + BANK_SWIZZLE(SQ_ALU_VEC_012), + DST_GPR(4), + DST_REL(ABSOLUTE), + DST_ELEM(ELEM_Z), + CLAMP(0)); + + /* 29 maskY.w DOT4 - mask */ + shader[i++] = ALU_DWORD0(SRC0_SEL(ALU_SRC_GPR_BASE + 0), + SRC0_REL(ABSOLUTE), + SRC0_ELEM(ELEM_W), + SRC0_NEG(0), + SRC1_SEL(ALU_SRC_KCACHE0_BASE + 3), + SRC1_REL(ABSOLUTE), + SRC1_ELEM(ELEM_W), + SRC1_NEG(0), + INDEX_MODE(SQ_INDEX_LOOP), + PRED_SEL(SQ_PRED_SEL_OFF), + LAST(1)); + shader[i++] = ALU_DWORD1_OP2(SRC0_ABS(0), + SRC1_ABS(0), + UPDATE_EXECUTE_MASK(0), + UPDATE_PRED(0), + WRITE_MASK(0), + OMOD(SQ_ALU_OMOD_OFF), + ALU_INST(SQ_OP2_INST_DOT4), + BANK_SWIZZLE(SQ_ALU_VEC_012), + DST_GPR(4), + DST_REL(ABSOLUTE), + DST_ELEM(ELEM_W), + CLAMP(0)); + + /* 30 srcX / w */ + shader[i++] = ALU_DWORD0(SRC0_SEL(ALU_SRC_GPR_BASE + 3), SRC0_REL(ABSOLUTE), SRC0_ELEM(ELEM_X), SRC0_NEG(0), @@ -1779,8 +1998,8 @@ int evergreen_comp_vs(RADEONChipFamily ChipSet, uint32_t* shader) DST_ELEM(ELEM_X), CLAMP(0)); - /* 23 srcY / h */ - shader[i++] = ALU_DWORD0(SRC0_SEL(ALU_SRC_GPR_BASE + 1), + /* 31 srcY / h */ + shader[i++] = ALU_DWORD0(SRC0_SEL(ALU_SRC_GPR_BASE + 3), SRC0_REL(ABSOLUTE), SRC0_ELEM(ELEM_Y), SRC0_NEG(0), @@ -1804,8 +2023,8 @@ int evergreen_comp_vs(RADEONChipFamily ChipSet, uint32_t* shader) DST_ELEM(ELEM_Y), CLAMP(0)); - /* 24 maskX / w */ - shader[i++] = ALU_DWORD0(SRC0_SEL(ALU_SRC_GPR_BASE + 0), + /* 32 maskX / w */ + shader[i++] = ALU_DWORD0(SRC0_SEL(ALU_SRC_GPR_BASE + 4), SRC0_REL(ABSOLUTE), SRC0_ELEM(ELEM_X), SRC0_NEG(0), @@ -1829,8 +2048,8 @@ int evergreen_comp_vs(RADEONChipFamily ChipSet, uint32_t* shader) DST_ELEM(ELEM_X), CLAMP(0)); - /* 25 maskY / h */ - shader[i++] = ALU_DWORD0(SRC0_SEL(ALU_SRC_GPR_BASE + 0), + /* 33 maskY / h */ + shader[i++] = ALU_DWORD0(SRC0_SEL(ALU_SRC_GPR_BASE + 4), SRC0_REL(ABSOLUTE), SRC0_ELEM(ELEM_Y), SRC0_NEG(0), @@ -1854,98 +2073,209 @@ int evergreen_comp_vs(RADEONChipFamily ChipSet, uint32_t* shader) DST_ELEM(ELEM_Y), CLAMP(0)); - /* no mask alu - 26 srcX MAD */ - shader[i++] = ALU_DWORD0(SRC0_SEL(ALU_SRC_KCACHE0_BASE + 0), + /* 34 srcX.x DOT4 - non-mask */ + shader[i++] = ALU_DWORD0(SRC0_SEL(ALU_SRC_GPR_BASE + 0), + SRC0_REL(ABSOLUTE), + SRC0_ELEM(ELEM_X), + SRC0_NEG(0), + SRC1_SEL(ALU_SRC_KCACHE0_BASE + 0), + SRC1_REL(ABSOLUTE), + SRC1_ELEM(ELEM_X), + SRC1_NEG(0), + INDEX_MODE(SQ_INDEX_LOOP), + PRED_SEL(SQ_PRED_SEL_OFF), + LAST(0)); + shader[i++] = ALU_DWORD1_OP2(SRC0_ABS(0), + SRC1_ABS(0), + UPDATE_EXECUTE_MASK(0), + UPDATE_PRED(0), + WRITE_MASK(1), + OMOD(SQ_ALU_OMOD_OFF), + ALU_INST(SQ_OP2_INST_DOT4), + BANK_SWIZZLE(SQ_ALU_VEC_012), + DST_GPR(2), + DST_REL(ABSOLUTE), + DST_ELEM(ELEM_X), + CLAMP(0)); + + /* 35 srcX.y DOT4 - non-mask */ + shader[i++] = ALU_DWORD0(SRC0_SEL(ALU_SRC_GPR_BASE + 0), SRC0_REL(ABSOLUTE), SRC0_ELEM(ELEM_Y), SRC0_NEG(0), - SRC1_SEL(ALU_SRC_GPR_BASE + 0), + SRC1_SEL(ALU_SRC_KCACHE0_BASE + 0), SRC1_REL(ABSOLUTE), SRC1_ELEM(ELEM_Y), SRC1_NEG(0), INDEX_MODE(SQ_INDEX_LOOP), PRED_SEL(SQ_PRED_SEL_OFF), - LAST(1)); - shader[i++] = ALU_DWORD1_OP3(SRC2_SEL(ALU_SRC_KCACHE0_BASE + 0), - SRC2_REL(ABSOLUTE), - SRC2_ELEM(ELEM_Z), - SRC2_NEG(0), - ALU_INST(SQ_OP3_INST_MULADD), + LAST(0)); + shader[i++] = ALU_DWORD1_OP2(SRC0_ABS(0), + SRC1_ABS(0), + UPDATE_EXECUTE_MASK(0), + UPDATE_PRED(0), + WRITE_MASK(0), + OMOD(SQ_ALU_OMOD_OFF), + ALU_INST(SQ_OP2_INST_DOT4), BANK_SWIZZLE(SQ_ALU_VEC_012), - DST_GPR(0), + DST_GPR(2), + DST_REL(ABSOLUTE), + DST_ELEM(ELEM_Y), + CLAMP(0)); + + /* 36 srcX.z DOT4 - non-mask */ + shader[i++] = ALU_DWORD0(SRC0_SEL(ALU_SRC_GPR_BASE + 0), + SRC0_REL(ABSOLUTE), + SRC0_ELEM(ELEM_Z), + SRC0_NEG(0), + SRC1_SEL(ALU_SRC_KCACHE0_BASE + 0), + SRC1_REL(ABSOLUTE), + SRC1_ELEM(ELEM_Z), + SRC1_NEG(0), + INDEX_MODE(SQ_INDEX_LOOP), + PRED_SEL(SQ_PRED_SEL_OFF), + LAST(0)); + shader[i++] = ALU_DWORD1_OP2(SRC0_ABS(0), + SRC1_ABS(0), + UPDATE_EXECUTE_MASK(0), + UPDATE_PRED(0), + WRITE_MASK(0), + OMOD(SQ_ALU_OMOD_OFF), + ALU_INST(SQ_OP2_INST_DOT4), + BANK_SWIZZLE(SQ_ALU_VEC_012), + DST_GPR(2), DST_REL(ABSOLUTE), DST_ELEM(ELEM_Z), CLAMP(0)); - /* 27 srcY MAD */ - shader[i++] = ALU_DWORD0(SRC0_SEL(ALU_SRC_KCACHE0_BASE + 1), + + /* 37 srcX.w DOT4 - non-mask */ + shader[i++] = ALU_DWORD0(SRC0_SEL(ALU_SRC_GPR_BASE + 0), SRC0_REL(ABSOLUTE), - SRC0_ELEM(ELEM_Y), + SRC0_ELEM(ELEM_W), SRC0_NEG(0), - SRC1_SEL(ALU_SRC_GPR_BASE + 0), + SRC1_SEL(ALU_SRC_KCACHE0_BASE + 0), SRC1_REL(ABSOLUTE), - SRC1_ELEM(ELEM_Y), + SRC1_ELEM(ELEM_W), SRC1_NEG(0), INDEX_MODE(SQ_INDEX_LOOP), PRED_SEL(SQ_PRED_SEL_OFF), LAST(1)); - shader[i++] = ALU_DWORD1_OP3(SRC2_SEL(ALU_SRC_KCACHE0_BASE + 1), - SRC2_REL(ABSOLUTE), - SRC2_ELEM(ELEM_Z), - SRC2_NEG(0), - ALU_INST(SQ_OP3_INST_MULADD), + shader[i++] = ALU_DWORD1_OP2(SRC0_ABS(0), + SRC1_ABS(0), + UPDATE_EXECUTE_MASK(0), + UPDATE_PRED(0), + WRITE_MASK(0), + OMOD(SQ_ALU_OMOD_OFF), + ALU_INST(SQ_OP2_INST_DOT4), BANK_SWIZZLE(SQ_ALU_VEC_012), - DST_GPR(0), + DST_GPR(2), DST_REL(ABSOLUTE), DST_ELEM(ELEM_W), CLAMP(0)); - /* 28 srcX MAD */ - shader[i++] = ALU_DWORD0(SRC0_SEL(ALU_SRC_KCACHE0_BASE + 0), + /* 38 srcY.x DOT4 - non-mask */ + shader[i++] = ALU_DWORD0(SRC0_SEL(ALU_SRC_GPR_BASE + 0), SRC0_REL(ABSOLUTE), SRC0_ELEM(ELEM_X), SRC0_NEG(0), - SRC1_SEL(ALU_SRC_GPR_BASE + 0), + SRC1_SEL(ALU_SRC_KCACHE0_BASE + 1), SRC1_REL(ABSOLUTE), SRC1_ELEM(ELEM_X), SRC1_NEG(0), INDEX_MODE(SQ_INDEX_LOOP), PRED_SEL(SQ_PRED_SEL_OFF), LAST(0)); - shader[i++] = ALU_DWORD1_OP3(SRC2_SEL(ALU_SRC_GPR_BASE + 0), - SRC2_REL(ABSOLUTE), - SRC2_ELEM(ELEM_Z), - SRC2_NEG(0), - ALU_INST(SQ_OP3_INST_MULADD), + shader[i++] = ALU_DWORD1_OP2(SRC0_ABS(0), + SRC1_ABS(0), + UPDATE_EXECUTE_MASK(0), + UPDATE_PRED(0), + WRITE_MASK(0), + OMOD(SQ_ALU_OMOD_OFF), + ALU_INST(SQ_OP2_INST_DOT4), BANK_SWIZZLE(SQ_ALU_VEC_012), - DST_GPR(0), + DST_GPR(2), DST_REL(ABSOLUTE), DST_ELEM(ELEM_X), CLAMP(0)); - /* 29 srcY MAD */ - shader[i++] = ALU_DWORD0(SRC0_SEL(ALU_SRC_KCACHE0_BASE + 1), + + /* 39 srcY.y DOT4 - non-mask */ + shader[i++] = ALU_DWORD0(SRC0_SEL(ALU_SRC_GPR_BASE + 0), SRC0_REL(ABSOLUTE), - SRC0_ELEM(ELEM_X), + SRC0_ELEM(ELEM_Y), SRC0_NEG(0), - SRC1_SEL(ALU_SRC_GPR_BASE + 0), + SRC1_SEL(ALU_SRC_KCACHE0_BASE + 1), SRC1_REL(ABSOLUTE), - SRC1_ELEM(ELEM_X), + SRC1_ELEM(ELEM_Y), SRC1_NEG(0), INDEX_MODE(SQ_INDEX_LOOP), PRED_SEL(SQ_PRED_SEL_OFF), - LAST(1)); - shader[i++] = ALU_DWORD1_OP3(SRC2_SEL(ALU_SRC_GPR_BASE + 0), - SRC2_REL(ABSOLUTE), - SRC2_ELEM(ELEM_W), - SRC2_NEG(0), - ALU_INST(SQ_OP3_INST_MULADD), + LAST(0)); + shader[i++] = ALU_DWORD1_OP2(SRC0_ABS(0), + SRC1_ABS(0), + UPDATE_EXECUTE_MASK(0), + UPDATE_PRED(0), + WRITE_MASK(1), + OMOD(SQ_ALU_OMOD_OFF), + ALU_INST(SQ_OP2_INST_DOT4), BANK_SWIZZLE(SQ_ALU_VEC_012), - DST_GPR(0), + DST_GPR(2), DST_REL(ABSOLUTE), DST_ELEM(ELEM_Y), CLAMP(0)); - /* 30 srcX / w */ + + /* 40 srcY.z DOT4 - non-mask */ shader[i++] = ALU_DWORD0(SRC0_SEL(ALU_SRC_GPR_BASE + 0), SRC0_REL(ABSOLUTE), + SRC0_ELEM(ELEM_Z), + SRC0_NEG(0), + SRC1_SEL(ALU_SRC_KCACHE0_BASE + 1), + SRC1_REL(ABSOLUTE), + SRC1_ELEM(ELEM_Z), + SRC1_NEG(0), + INDEX_MODE(SQ_INDEX_LOOP), + PRED_SEL(SQ_PRED_SEL_OFF), + LAST(0)); + shader[i++] = ALU_DWORD1_OP2(SRC0_ABS(0), + SRC1_ABS(0), + UPDATE_EXECUTE_MASK(0), + UPDATE_PRED(0), + WRITE_MASK(0), + OMOD(SQ_ALU_OMOD_OFF), + ALU_INST(SQ_OP2_INST_DOT4), + BANK_SWIZZLE(SQ_ALU_VEC_012), + DST_GPR(2), + DST_REL(ABSOLUTE), + DST_ELEM(ELEM_Z), + CLAMP(0)); + + /* 41 srcY.w DOT4 - non-mask */ + shader[i++] = ALU_DWORD0(SRC0_SEL(ALU_SRC_GPR_BASE + 0), + SRC0_REL(ABSOLUTE), + SRC0_ELEM(ELEM_W), + SRC0_NEG(0), + SRC1_SEL(ALU_SRC_KCACHE0_BASE + 1), + SRC1_REL(ABSOLUTE), + SRC1_ELEM(ELEM_W), + SRC1_NEG(0), + INDEX_MODE(SQ_INDEX_LOOP), + PRED_SEL(SQ_PRED_SEL_OFF), + LAST(1)); + shader[i++] = ALU_DWORD1_OP2(SRC0_ABS(0), + SRC1_ABS(0), + UPDATE_EXECUTE_MASK(0), + UPDATE_PRED(0), + WRITE_MASK(0), + OMOD(SQ_ALU_OMOD_OFF), + ALU_INST(SQ_OP2_INST_DOT4), + BANK_SWIZZLE(SQ_ALU_VEC_012), + DST_GPR(2), + DST_REL(ABSOLUTE), + DST_ELEM(ELEM_W), + CLAMP(0)); + + /* 42 srcX / w */ + shader[i++] = ALU_DWORD0(SRC0_SEL(ALU_SRC_GPR_BASE + 2), + SRC0_REL(ABSOLUTE), SRC0_ELEM(ELEM_X), SRC0_NEG(0), SRC1_SEL(ALU_SRC_KCACHE0_BASE + 0), @@ -1968,8 +2298,8 @@ int evergreen_comp_vs(RADEONChipFamily ChipSet, uint32_t* shader) DST_ELEM(ELEM_X), CLAMP(0)); - /* 31 srcY / h */ - shader[i++] = ALU_DWORD0(SRC0_SEL(ALU_SRC_GPR_BASE + 0), + /* 43 srcY / h */ + shader[i++] = ALU_DWORD0(SRC0_SEL(ALU_SRC_GPR_BASE + 2), SRC0_REL(ABSOLUTE), SRC0_ELEM(ELEM_Y), SRC0_NEG(0), @@ -1993,7 +2323,7 @@ int evergreen_comp_vs(RADEONChipFamily ChipSet, uint32_t* shader) DST_ELEM(ELEM_Y), CLAMP(0)); - /* mask vfetch - 32/33 - dst */ + /* mask vfetch - 44/45 - dst */ shader[i++] = VTX_DWORD0(VTX_INST(SQ_VTX_INST_FETCH), FETCH_TYPE(SQ_VTX_FETCH_VERTEX_DATA), FETCH_WHOLE_QUAD(0), @@ -2020,7 +2350,7 @@ int evergreen_comp_vs(RADEONChipFamily ChipSet, uint32_t* shader) ALT_CONST(0), BUFFER_INDEX_MODE(SQ_CF_INDEX_NONE)); shader[i++] = VTX_DWORD_PAD; - /* 34/35 - src */ + /* 46/47 - src */ shader[i++] = VTX_DWORD0(VTX_INST(SQ_VTX_INST_FETCH), FETCH_TYPE(SQ_VTX_FETCH_VERTEX_DATA), FETCH_WHOLE_QUAD(0), @@ -2047,7 +2377,7 @@ int evergreen_comp_vs(RADEONChipFamily ChipSet, uint32_t* shader) ALT_CONST(0), BUFFER_INDEX_MODE(SQ_CF_INDEX_NONE)); shader[i++] = VTX_DWORD_PAD; - /* 36/37 - mask */ + /* 48/49 - mask */ shader[i++] = VTX_DWORD0(VTX_INST(SQ_VTX_INST_FETCH), FETCH_TYPE(SQ_VTX_FETCH_VERTEX_DATA), FETCH_WHOLE_QUAD(0), @@ -2075,7 +2405,7 @@ int evergreen_comp_vs(RADEONChipFamily ChipSet, uint32_t* shader) BUFFER_INDEX_MODE(SQ_CF_INDEX_NONE)); shader[i++] = VTX_DWORD_PAD; - /* no mask vfetch - 38/39 - dst */ + /* no mask vfetch - 50/51 - dst */ shader[i++] = VTX_DWORD0(VTX_INST(SQ_VTX_INST_FETCH), FETCH_TYPE(SQ_VTX_FETCH_VERTEX_DATA), FETCH_WHOLE_QUAD(0), @@ -2102,7 +2432,7 @@ int evergreen_comp_vs(RADEONChipFamily ChipSet, uint32_t* shader) ALT_CONST(0), BUFFER_INDEX_MODE(SQ_CF_INDEX_NONE)); shader[i++] = VTX_DWORD_PAD; - /* 40/41 - src */ + /* 52/53 - src */ shader[i++] = VTX_DWORD0(VTX_INST(SQ_VTX_INST_FETCH), FETCH_TYPE(SQ_VTX_FETCH_VERTEX_DATA), FETCH_WHOLE_QUAD(0), |