From 231aee18a73805be2f6c962e94a8345dd89fd0df Mon Sep 17 00:00:00 2001 From: Alex Deucher Date: Mon, 9 Feb 2009 13:02:27 -0500 Subject: R6xx/R7xx Xv: implement native shader for planar formats --- src/r600_exa.c | 574 ++++++++++++++++++++++++++++++++++++++++- src/r600_textured_videofuncs.c | 262 +++++++++++++------ src/radeon.h | 3 +- src/radeon_textured_video.c | 67 +++-- 4 files changed, 803 insertions(+), 103 deletions(-) diff --git a/src/r600_exa.c b/src/r600_exa.c index a38469af..950e6ac8 100644 --- a/src/r600_exa.c +++ b/src/r600_exa.c @@ -2263,7 +2263,7 @@ R600LoadShaders(ScrnInfoPtr pScrn, ScreenPtr pScreen) uint32_t *vs; uint32_t *ps; // 512 bytes per shader for now - int size = 512 * 10; + int size = 512 * 11; int i; accel_state->shaders = NULL; @@ -2285,7 +2285,8 @@ R600LoadShaders(ScrnInfoPtr pScrn, ScreenPtr pScreen) accel_state->comp_mask_vs_offset = 3072; accel_state->comp_mask_ps_offset = 3584; accel_state->xv_vs_offset = 4096; - accel_state->xv_ps_offset = 4608; + accel_state->xv_ps_offset_nv12 = 4608; + accel_state->xv_ps_offset_planar = 5120; // solid vs --------------------------------------- i = accel_state->solid_vs_offset / 4; @@ -2774,8 +2775,8 @@ R600LoadShaders(ScrnInfoPtr pScrn, ScreenPtr pScreen) MEGA_FETCH(0)); vs[i++] = VTX_DWORD_PAD; - // xv ps --------------------------------------- - i = accel_state->xv_ps_offset / 4; + // xv ps nv12 ---------------------------------- + i = accel_state->xv_ps_offset_nv12 / 4; // 0 ps[i++] = CF_DWORD0(ADDR(20)); ps[i++] = CF_DWORD1(POP_COUNT(0), @@ -3311,6 +3312,571 @@ R600LoadShaders(ScrnInfoPtr pScrn, ScreenPtr pScreen) SRC_SEL_W(SQ_SEL_1)); ps[i++] = TEX_DWORD_PAD; + // xv ps planar ---------------------------------- + i = accel_state->xv_ps_offset_planar / 4; + // 0 + ps[i++] = CF_DWORD0(ADDR(20)); + ps[i++] = CF_DWORD1(POP_COUNT(0), + CF_CONST(0), + COND(SQ_CF_COND_ACTIVE), + I_COUNT(3), + CALL_COUNT(0), + END_OF_PROGRAM(0), + VALID_PIXEL_MODE(0), + CF_INST(SQ_CF_INST_TEX), + WHOLE_QUAD_MODE(0), + BARRIER(0)); + // 1 + ps[i++] = CF_ALU_DWORD0(ADDR(3), + KCACHE_BANK0(0), + KCACHE_BANK1(0), + KCACHE_MODE0(SQ_CF_KCACHE_NOP)); + ps[i++] = CF_ALU_DWORD1(KCACHE_MODE1(SQ_CF_KCACHE_NOP), + KCACHE_ADDR0(0), + KCACHE_ADDR1(0), + I_COUNT(16), + USES_WATERFALL(0), + CF_INST(SQ_CF_INST_ALU), + WHOLE_QUAD_MODE(0), + BARRIER(1)); + // 2 + ps[i++] = CF_ALLOC_IMP_EXP_DWORD0(ARRAY_BASE(CF_PIXEL_MRT0), + TYPE(SQ_EXPORT_PIXEL), + RW_GPR(3), + RW_REL(ABSOLUTE), + INDEX_GPR(0), + ELEM_SIZE(3)); + ps[i++] = CF_ALLOC_IMP_EXP_DWORD1_SWIZ(SRC_SEL_X(SQ_SEL_X), + SRC_SEL_Y(SQ_SEL_Y), + SRC_SEL_Z(SQ_SEL_Z), + SRC_SEL_W(SQ_SEL_W), + R6xx_ELEM_LOOP(0), + BURST_COUNT(1), + END_OF_PROGRAM(1), + VALID_PIXEL_MODE(0), + CF_INST(SQ_CF_INST_EXPORT_DONE), + WHOLE_QUAD_MODE(0), + BARRIER(1)); + // 3 - alu 0 + // DP4 gpr[2].x gpr[1].x c[0].x + ps[i++] = ALU_DWORD0(SRC0_SEL(1), + SRC0_REL(ABSOLUTE), + SRC0_ELEM(ELEM_X), + SRC0_NEG(0), + SRC1_SEL(256), + SRC1_REL(ABSOLUTE), + SRC1_ELEM(ELEM_X), + SRC1_NEG(0), + INDEX_MODE(SQ_INDEX_LOOP), + PRED_SEL(SQ_PRED_SEL_OFF), + LAST(0)); + ps[i++] = ALU_DWORD1_OP2(info->ChipFamily, + SRC0_ABS(0), + SRC1_ABS(0), + UPDATE_EXECUTE_MASK(0), + UPDATE_PRED(0), + WRITE_MASK(1), + FOG_MERGE(0), + OMOD(SQ_ALU_OMOD_OFF), + ALU_INST(SQ_OP2_INST_DOT4), + BANK_SWIZZLE(SQ_ALU_VEC_102), + DST_GPR(2), + DST_REL(ABSOLUTE), + DST_ELEM(ELEM_X), + CLAMP(1)); + // 4 - alu 1 + // DP4 gpr[2].y gpr[1].y c[0].y + ps[i++] = ALU_DWORD0(SRC0_SEL(1), + SRC0_REL(ABSOLUTE), + SRC0_ELEM(ELEM_Y), + SRC0_NEG(0), + SRC1_SEL(256), + SRC1_REL(ABSOLUTE), + SRC1_ELEM(ELEM_Y), + SRC1_NEG(0), + INDEX_MODE(SQ_INDEX_LOOP), + PRED_SEL(SQ_PRED_SEL_OFF), + LAST(0)); + ps[i++] = ALU_DWORD1_OP2(info->ChipFamily, + SRC0_ABS(0), + SRC1_ABS(0), + UPDATE_EXECUTE_MASK(0), + UPDATE_PRED(0), + WRITE_MASK(0), + FOG_MERGE(0), + OMOD(SQ_ALU_OMOD_OFF), + ALU_INST(SQ_OP2_INST_DOT4), + BANK_SWIZZLE(SQ_ALU_VEC_102), + DST_GPR(2), + DST_REL(ABSOLUTE), + DST_ELEM(ELEM_Y), + CLAMP(1)); + // 5 - alu 2 + // DP4 gpr[2].z gpr[1].z c[0].z + ps[i++] = ALU_DWORD0(SRC0_SEL(1), + SRC0_REL(ABSOLUTE), + SRC0_ELEM(ELEM_Z), + SRC0_NEG(0), + SRC1_SEL(256), + SRC1_REL(ABSOLUTE), + SRC1_ELEM(ELEM_Z), + SRC1_NEG(0), + INDEX_MODE(SQ_INDEX_LOOP), + PRED_SEL(SQ_PRED_SEL_OFF), + LAST(0)); + ps[i++] = ALU_DWORD1_OP2(info->ChipFamily, + SRC0_ABS(0), + SRC1_ABS(0), + UPDATE_EXECUTE_MASK(0), + UPDATE_PRED(0), + WRITE_MASK(0), + FOG_MERGE(0), + OMOD(SQ_ALU_OMOD_OFF), + ALU_INST(SQ_OP2_INST_DOT4), + BANK_SWIZZLE(SQ_ALU_VEC_102), + DST_GPR(2), + DST_REL(ABSOLUTE), + DST_ELEM(ELEM_Z), + CLAMP(1)); + // 6 - alu 3 + // DP4 gpr[2].w gpr[1].w c[0].w + ps[i++] = ALU_DWORD0(SRC0_SEL(1), + SRC0_REL(ABSOLUTE), + SRC0_ELEM(ELEM_W), + SRC0_NEG(0), + SRC1_SEL(256), + SRC1_REL(ABSOLUTE), + SRC1_ELEM(ELEM_W), + SRC1_NEG(0), + INDEX_MODE(SQ_INDEX_LOOP), + PRED_SEL(SQ_PRED_SEL_OFF), + LAST(1)); + ps[i++] = ALU_DWORD1_OP2(info->ChipFamily, + SRC0_ABS(0), + SRC1_ABS(0), + UPDATE_EXECUTE_MASK(0), + UPDATE_PRED(0), + WRITE_MASK(0), + FOG_MERGE(0), + OMOD(SQ_ALU_OMOD_OFF), + ALU_INST(SQ_OP2_INST_DOT4), + BANK_SWIZZLE(SQ_ALU_VEC_021), + DST_GPR(2), + DST_REL(ABSOLUTE), + DST_ELEM(ELEM_W), + CLAMP(1)); + // 7 - alu 4 + // DP4 gpr[2].x gpr[1].x c[1].x + ps[i++] = ALU_DWORD0(SRC0_SEL(1), + SRC0_REL(ABSOLUTE), + SRC0_ELEM(ELEM_X), + SRC0_NEG(0), + SRC1_SEL(257), + SRC1_REL(ABSOLUTE), + SRC1_ELEM(ELEM_X), + SRC1_NEG(0), + INDEX_MODE(SQ_INDEX_LOOP), + PRED_SEL(SQ_PRED_SEL_OFF), + LAST(0)); + ps[i++] = ALU_DWORD1_OP2(info->ChipFamily, + SRC0_ABS(0), + SRC1_ABS(0), + UPDATE_EXECUTE_MASK(0), + UPDATE_PRED(0), + WRITE_MASK(0), + FOG_MERGE(0), + OMOD(SQ_ALU_OMOD_OFF), + ALU_INST(SQ_OP2_INST_DOT4), + BANK_SWIZZLE(SQ_ALU_VEC_102), + DST_GPR(2), + DST_REL(ABSOLUTE), + DST_ELEM(ELEM_X), + CLAMP(1)); + // 8 - alu 5 + // DP4 gpr[2].y gpr[1].y c[1].y + ps[i++] = ALU_DWORD0(SRC0_SEL(1), + SRC0_REL(ABSOLUTE), + SRC0_ELEM(ELEM_Y), + SRC0_NEG(0), + SRC1_SEL(257), + SRC1_REL(ABSOLUTE), + SRC1_ELEM(ELEM_Y), + SRC1_NEG(0), + INDEX_MODE(SQ_INDEX_LOOP), + PRED_SEL(SQ_PRED_SEL_OFF), + LAST(0)); + ps[i++] = ALU_DWORD1_OP2(info->ChipFamily, + SRC0_ABS(0), + SRC1_ABS(0), + UPDATE_EXECUTE_MASK(0), + UPDATE_PRED(0), + WRITE_MASK(1), + FOG_MERGE(0), + OMOD(SQ_ALU_OMOD_OFF), + ALU_INST(SQ_OP2_INST_DOT4), + BANK_SWIZZLE(SQ_ALU_VEC_102), + DST_GPR(2), + DST_REL(ABSOLUTE), + DST_ELEM(ELEM_Y), + CLAMP(1)); + // 9 - alu 6 + // DP4 gpr[2].z gpr[1].z c[1].z + ps[i++] = ALU_DWORD0(SRC0_SEL(1), + SRC0_REL(ABSOLUTE), + SRC0_ELEM(ELEM_Z), + SRC0_NEG(0), + SRC1_SEL(257), + SRC1_REL(ABSOLUTE), + SRC1_ELEM(ELEM_Z), + SRC1_NEG(0), + INDEX_MODE(SQ_INDEX_LOOP), + PRED_SEL(SQ_PRED_SEL_OFF), + LAST(0)); + ps[i++] = ALU_DWORD1_OP2(info->ChipFamily, + SRC0_ABS(0), + SRC1_ABS(0), + UPDATE_EXECUTE_MASK(0), + UPDATE_PRED(0), + WRITE_MASK(0), + FOG_MERGE(0), + OMOD(SQ_ALU_OMOD_OFF), + ALU_INST(SQ_OP2_INST_DOT4), + BANK_SWIZZLE(SQ_ALU_VEC_102), + DST_GPR(2), + DST_REL(ABSOLUTE), + DST_ELEM(ELEM_Z), + CLAMP(1)); + // 10 - alu 7 + // DP4 gpr[2].w gpr[1].w c[1].w + ps[i++] = ALU_DWORD0(SRC0_SEL(1), + SRC0_REL(ABSOLUTE), + SRC0_ELEM(ELEM_W), + SRC0_NEG(0), + SRC1_SEL(257), + SRC1_REL(ABSOLUTE), + SRC1_ELEM(ELEM_W), + SRC1_NEG(0), + INDEX_MODE(SQ_INDEX_LOOP), + PRED_SEL(SQ_PRED_SEL_OFF), + LAST(1)); + ps[i++] = ALU_DWORD1_OP2(info->ChipFamily, + SRC0_ABS(0), + SRC1_ABS(0), + UPDATE_EXECUTE_MASK(0), + UPDATE_PRED(0), + WRITE_MASK(0), + FOG_MERGE(0), + OMOD(SQ_ALU_OMOD_OFF), + ALU_INST(SQ_OP2_INST_DOT4), + BANK_SWIZZLE(SQ_ALU_VEC_021), + DST_GPR(2), + DST_REL(ABSOLUTE), + DST_ELEM(ELEM_W), + CLAMP(1)); + // 11 - alu 8 + // DP4 gpr[2].x gpr[1].x c[2].x + ps[i++] = ALU_DWORD0(SRC0_SEL(1), + SRC0_REL(ABSOLUTE), + SRC0_ELEM(ELEM_X), + SRC0_NEG(0), + SRC1_SEL(258), + SRC1_REL(ABSOLUTE), + SRC1_ELEM(ELEM_X), + SRC1_NEG(0), + INDEX_MODE(SQ_INDEX_LOOP), + PRED_SEL(SQ_PRED_SEL_OFF), + LAST(0)); + ps[i++] = ALU_DWORD1_OP2(info->ChipFamily, + SRC0_ABS(0), + SRC1_ABS(0), + UPDATE_EXECUTE_MASK(0), + UPDATE_PRED(0), + WRITE_MASK(0), + FOG_MERGE(0), + OMOD(SQ_ALU_OMOD_OFF), + ALU_INST(SQ_OP2_INST_DOT4), + BANK_SWIZZLE(SQ_ALU_VEC_102), + DST_GPR(2), + DST_REL(ABSOLUTE), + DST_ELEM(ELEM_X), + CLAMP(1)); + // 12 - alu 9 + // DP4 gpr[2].y gpr[1].y c[2].y + ps[i++] = ALU_DWORD0(SRC0_SEL(1), + SRC0_REL(ABSOLUTE), + SRC0_ELEM(ELEM_Y), + SRC0_NEG(0), + SRC1_SEL(258), + SRC1_REL(ABSOLUTE), + SRC1_ELEM(ELEM_Y), + SRC1_NEG(0), + INDEX_MODE(SQ_INDEX_LOOP), + PRED_SEL(SQ_PRED_SEL_OFF), + LAST(0)); + ps[i++] = ALU_DWORD1_OP2(info->ChipFamily, + SRC0_ABS(0), + SRC1_ABS(0), + UPDATE_EXECUTE_MASK(0), + UPDATE_PRED(0), + WRITE_MASK(0), + FOG_MERGE(0), + OMOD(SQ_ALU_OMOD_OFF), + ALU_INST(SQ_OP2_INST_DOT4), + BANK_SWIZZLE(SQ_ALU_VEC_102), + DST_GPR(2), + DST_REL(ABSOLUTE), + DST_ELEM(ELEM_Y), + CLAMP(1)); + // 13 - alu 10 + // DP4 gpr[2].z gpr[1].z c[2].z + ps[i++] = ALU_DWORD0(SRC0_SEL(1), + SRC0_REL(ABSOLUTE), + SRC0_ELEM(ELEM_Z), + SRC0_NEG(0), + SRC1_SEL(258), + SRC1_REL(ABSOLUTE), + SRC1_ELEM(ELEM_Z), + SRC1_NEG(0), + INDEX_MODE(SQ_INDEX_LOOP), + PRED_SEL(SQ_PRED_SEL_OFF), + LAST(0)); + ps[i++] = ALU_DWORD1_OP2(info->ChipFamily, + SRC0_ABS(0), + SRC1_ABS(0), + UPDATE_EXECUTE_MASK(0), + UPDATE_PRED(0), + WRITE_MASK(1), + FOG_MERGE(0), + OMOD(SQ_ALU_OMOD_OFF), + ALU_INST(SQ_OP2_INST_DOT4), + BANK_SWIZZLE(SQ_ALU_VEC_102), + DST_GPR(2), + DST_REL(ABSOLUTE), + DST_ELEM(ELEM_Z), + CLAMP(1)); + // 14 - alu 11 + // DP4 gpr[2].w gpr[1].w c[2].w + ps[i++] = ALU_DWORD0(SRC0_SEL(1), + SRC0_REL(ABSOLUTE), + SRC0_ELEM(ELEM_W), + SRC0_NEG(0), + SRC1_SEL(258), + SRC1_REL(ABSOLUTE), + SRC1_ELEM(ELEM_W), + SRC1_NEG(0), + INDEX_MODE(SQ_INDEX_LOOP), + PRED_SEL(SQ_PRED_SEL_OFF), + LAST(1)); + ps[i++] = ALU_DWORD1_OP2(info->ChipFamily, + SRC0_ABS(0), + SRC1_ABS(0), + UPDATE_EXECUTE_MASK(0), + UPDATE_PRED(0), + WRITE_MASK(0), + FOG_MERGE(0), + OMOD(SQ_ALU_OMOD_OFF), + ALU_INST(SQ_OP2_INST_DOT4), + BANK_SWIZZLE(SQ_ALU_VEC_021), + DST_GPR(2), + DST_REL(ABSOLUTE), + DST_ELEM(ELEM_W), + CLAMP(1)); + // 15 - alu 12 + // MOV gpr[3].x gpr[2].x + ps[i++] = ALU_DWORD0(SRC0_SEL(2), + SRC0_REL(ABSOLUTE), + SRC0_ELEM(ELEM_X), + SRC0_NEG(0), + SRC1_SEL(0), + SRC1_REL(ABSOLUTE), + SRC1_ELEM(ELEM_X), + SRC1_NEG(0), + INDEX_MODE(SQ_INDEX_LOOP), + PRED_SEL(SQ_PRED_SEL_OFF), + LAST(0)); + ps[i++] = ALU_DWORD1_OP2(info->ChipFamily, + SRC0_ABS(0), + SRC1_ABS(0), + UPDATE_EXECUTE_MASK(0), + UPDATE_PRED(0), + WRITE_MASK(1), + FOG_MERGE(0), + OMOD(SQ_ALU_OMOD_OFF), + ALU_INST(SQ_OP2_INST_MOV), + BANK_SWIZZLE(SQ_ALU_VEC_210), + DST_GPR(3), + DST_REL(ABSOLUTE), + DST_ELEM(ELEM_X), + CLAMP(0)); + // 16 - alu 13 + // MOV gpr[3].y gpr[2].y + ps[i++] = ALU_DWORD0(SRC0_SEL(2), + SRC0_REL(ABSOLUTE), + SRC0_ELEM(ELEM_Y), + SRC0_NEG(0), + SRC1_SEL(0), + SRC1_REL(ABSOLUTE), + SRC1_ELEM(ELEM_X), + SRC1_NEG(0), + INDEX_MODE(SQ_INDEX_LOOP), + PRED_SEL(SQ_PRED_SEL_OFF), + LAST(0)); + ps[i++] = ALU_DWORD1_OP2(info->ChipFamily, + SRC0_ABS(0), + SRC1_ABS(0), + UPDATE_EXECUTE_MASK(0), + UPDATE_PRED(0), + WRITE_MASK(1), + FOG_MERGE(0), + OMOD(SQ_ALU_OMOD_OFF), + ALU_INST(SQ_OP2_INST_MOV), + BANK_SWIZZLE(SQ_ALU_VEC_210), + DST_GPR(3), + DST_REL(ABSOLUTE), + DST_ELEM(ELEM_Y), + CLAMP(0)); + // 17 - alu 14 + // MOV gpr[3].z gpr[2].z + ps[i++] = ALU_DWORD0(SRC0_SEL(2), + SRC0_REL(ABSOLUTE), + SRC0_ELEM(ELEM_Z), + SRC0_NEG(0), + SRC1_SEL(0), + SRC1_REL(ABSOLUTE), + SRC1_ELEM(ELEM_X), + SRC1_NEG(0), + INDEX_MODE(SQ_INDEX_LOOP), + PRED_SEL(SQ_PRED_SEL_OFF), + LAST(0)); + ps[i++] = ALU_DWORD1_OP2(info->ChipFamily, + SRC0_ABS(0), + SRC1_ABS(0), + UPDATE_EXECUTE_MASK(0), + UPDATE_PRED(0), + WRITE_MASK(1), + FOG_MERGE(0), + OMOD(SQ_ALU_OMOD_OFF), + ALU_INST(SQ_OP2_INST_MOV), + BANK_SWIZZLE(SQ_ALU_VEC_210), + DST_GPR(3), + DST_REL(ABSOLUTE), + DST_ELEM(ELEM_Z), + CLAMP(0)); + // 18 - alu 15 + // MOV gpr[3].w gpr[2].w + ps[i++] = ALU_DWORD0(SRC0_SEL(2), + SRC0_REL(ABSOLUTE), + SRC0_ELEM(ELEM_W), + SRC0_NEG(0), + SRC1_SEL(0), + SRC1_REL(ABSOLUTE), + SRC1_ELEM(ELEM_X), + SRC1_NEG(0), + INDEX_MODE(SQ_INDEX_LOOP), + PRED_SEL(SQ_PRED_SEL_OFF), + LAST(1)); + ps[i++] = ALU_DWORD1_OP2(info->ChipFamily, + SRC0_ABS(0), + SRC1_ABS(0), + UPDATE_EXECUTE_MASK(0), + UPDATE_PRED(0), + WRITE_MASK(1), + FOG_MERGE(0), + OMOD(SQ_ALU_OMOD_OFF), + ALU_INST(SQ_OP2_INST_MOV), + BANK_SWIZZLE(SQ_ALU_VEC_012), + DST_GPR(3), + DST_REL(ABSOLUTE), + DST_ELEM(ELEM_W), + CLAMP(0)); + // 19 - alignment + ps[i++] = 0x00000000; + ps[i++] = 0x00000000; + // 20/21 - tex 0 + ps[i++] = TEX_DWORD0(TEX_INST(SQ_TEX_INST_SAMPLE), + BC_FRAC_MODE(0), + FETCH_WHOLE_QUAD(0), + RESOURCE_ID(0), + SRC_GPR(0), + SRC_REL(ABSOLUTE), + R7xx_ALT_CONST(0)); + ps[i++] = TEX_DWORD1(DST_GPR(1), + DST_REL(ABSOLUTE), + DST_SEL_X(SQ_SEL_X), //R + DST_SEL_Y(SQ_SEL_MASK), //G + DST_SEL_Z(SQ_SEL_MASK), //B + DST_SEL_W(SQ_SEL_1), //A + LOD_BIAS(0), + COORD_TYPE_X(TEX_NORMALIZED), + COORD_TYPE_Y(TEX_NORMALIZED), + COORD_TYPE_Z(TEX_NORMALIZED), + COORD_TYPE_W(TEX_NORMALIZED)); + ps[i++] = TEX_DWORD2(OFFSET_X(0), + OFFSET_Y(0), + OFFSET_Z(0), + SAMPLER_ID(0), + SRC_SEL_X(SQ_SEL_X), + SRC_SEL_Y(SQ_SEL_Y), + SRC_SEL_Z(SQ_SEL_0), + SRC_SEL_W(SQ_SEL_1)); + ps[i++] = TEX_DWORD_PAD; + // 22/23 - tex 1 + ps[i++] = TEX_DWORD0(TEX_INST(SQ_TEX_INST_SAMPLE), + BC_FRAC_MODE(0), + FETCH_WHOLE_QUAD(0), + RESOURCE_ID(1), + SRC_GPR(0), + SRC_REL(ABSOLUTE), + R7xx_ALT_CONST(0)); + ps[i++] = TEX_DWORD1(DST_GPR(1), + DST_REL(ABSOLUTE), + DST_SEL_X(SQ_SEL_MASK), //R + DST_SEL_Y(SQ_SEL_MASK), //G + DST_SEL_Z(SQ_SEL_X), //B + DST_SEL_W(SQ_SEL_MASK), //A + LOD_BIAS(0), + COORD_TYPE_X(TEX_NORMALIZED), + COORD_TYPE_Y(TEX_NORMALIZED), + COORD_TYPE_Z(TEX_NORMALIZED), + COORD_TYPE_W(TEX_NORMALIZED)); + ps[i++] = TEX_DWORD2(OFFSET_X(0), + OFFSET_Y(0), + OFFSET_Z(0), + SAMPLER_ID(1), + SRC_SEL_X(SQ_SEL_X), + SRC_SEL_Y(SQ_SEL_Y), + SRC_SEL_Z(SQ_SEL_0), + SRC_SEL_W(SQ_SEL_1)); + ps[i++] = TEX_DWORD_PAD; + // 24/25 - tex 2 + ps[i++] = TEX_DWORD0(TEX_INST(SQ_TEX_INST_SAMPLE), + BC_FRAC_MODE(0), + FETCH_WHOLE_QUAD(0), + RESOURCE_ID(2), + SRC_GPR(0), + SRC_REL(ABSOLUTE), + R7xx_ALT_CONST(0)); + ps[i++] = TEX_DWORD1(DST_GPR(1), + DST_REL(ABSOLUTE), + DST_SEL_X(SQ_SEL_MASK), //R + DST_SEL_Y(SQ_SEL_X), //G + DST_SEL_Z(SQ_SEL_MASK), //B + DST_SEL_W(SQ_SEL_MASK), //A + LOD_BIAS(0), + COORD_TYPE_X(TEX_NORMALIZED), + COORD_TYPE_Y(TEX_NORMALIZED), + COORD_TYPE_Z(TEX_NORMALIZED), + COORD_TYPE_W(TEX_NORMALIZED)); + ps[i++] = TEX_DWORD2(OFFSET_X(0), + OFFSET_Y(0), + OFFSET_Z(0), + SAMPLER_ID(2), + SRC_SEL_X(SQ_SEL_X), + SRC_SEL_Y(SQ_SEL_Y), + SRC_SEL_Z(SQ_SEL_0), + SRC_SEL_W(SQ_SEL_1)); + ps[i++] = TEX_DWORD_PAD; + // comp mask vs --------------------------------------- i = accel_state->comp_mask_vs_offset / 4; //0 diff --git a/src/r600_textured_videofuncs.c b/src/r600_textured_videofuncs.c index 222740e5..42a5d68d 100644 --- a/src/r600_textured_videofuncs.c +++ b/src/r600_textured_videofuncs.c @@ -108,8 +108,20 @@ R600DisplayTexturedVideo(ScrnInfoPtr pScrn, RADEONPortPrivPtr pPriv) accel_state->vs_mc_addr = info->fbLocation + pScrn->fbOffset + accel_state->shaders->offset + accel_state->xv_vs_offset; - accel_state->ps_mc_addr = info->fbLocation + pScrn->fbOffset + accel_state->shaders->offset + - accel_state->xv_ps_offset; + + switch(pPriv->id) { + case FOURCC_YV12: + case FOURCC_I420: + accel_state->ps_mc_addr = info->fbLocation + pScrn->fbOffset + accel_state->shaders->offset + + accel_state->xv_ps_offset_planar; + break; + case FOURCC_UYVY: + case FOURCC_YUY2: + default: + accel_state->ps_mc_addr = info->fbLocation + pScrn->fbOffset + accel_state->shaders->offset + + accel_state->xv_ps_offset_nv12; + break; + } accel_state->vs_size = 512; accel_state->ps_size = 512; @@ -141,76 +153,182 @@ R600DisplayTexturedVideo(ScrnInfoPtr pScrn, RADEONPortPrivPtr pPriv) set_alu_consts(pScrn, accel_state->ib, 0, sizeof(ps_alu_consts) / SQ_ALU_CONSTANT_offset, ps_alu_consts); /* Texture */ - accel_state->src_mc_addr[0] = pPriv->src_offset; - accel_state->src_size[0] = exaGetPixmapPitch(pPixmap) * pPriv->w; - - /* flush texture cache */ - cp_set_surface_sync(pScrn, accel_state->ib, TC_ACTION_ENA_bit, accel_state->src_size[0], - accel_state->src_mc_addr[0]); - - // Y texture - tex_res.id = 0; - tex_res.w = pPriv->w; - tex_res.h = pPriv->h; - tex_res.pitch = accel_state->src_pitch[0]; - tex_res.depth = 0; - tex_res.dim = SQ_TEX_DIM_2D; - tex_res.base = accel_state->src_mc_addr[0]; - tex_res.mip_base = accel_state->src_mc_addr[0]; - - tex_res.format = FMT_8; - tex_res.dst_sel_x = SQ_SEL_X; //Y - tex_res.dst_sel_y = SQ_SEL_1; - tex_res.dst_sel_z = SQ_SEL_1; - tex_res.dst_sel_w = SQ_SEL_1; - - tex_res.request_size = 1; - tex_res.base_level = 0; - tex_res.last_level = 0; - tex_res.perf_modulation = 0; - tex_res.interlaced = 0; - set_tex_resource (pScrn, accel_state->ib, &tex_res); - - // UV texture - uv_offset = accel_state->src_pitch[0] * pPriv->h; - uv_offset = (uv_offset + 255) & ~255; - - cp_set_surface_sync(pScrn, accel_state->ib, TC_ACTION_ENA_bit, - accel_state->src_size[0] / 2, - accel_state->src_mc_addr[0] + uv_offset); - - tex_res.id = 1; - tex_res.format = FMT_8_8; - tex_res.w = pPriv->w >> 1; - tex_res.h = pPriv->h >> 1; - tex_res.pitch = accel_state->src_pitch[0] >> 1; - tex_res.dst_sel_x = SQ_SEL_Y; //V - tex_res.dst_sel_y = SQ_SEL_X; //U - tex_res.dst_sel_z = SQ_SEL_1; - tex_res.dst_sel_w = SQ_SEL_1; - tex_res.interlaced = 0; - // XXX tex bases need to be 256B aligned - tex_res.base = accel_state->src_mc_addr[0] + uv_offset; - tex_res.mip_base = accel_state->src_mc_addr[0] + uv_offset; - set_tex_resource (pScrn, accel_state->ib, &tex_res); - - // Y sampler - tex_samp.id = 0; - tex_samp.clamp_x = SQ_TEX_CLAMP_LAST_TEXEL; - tex_samp.clamp_y = SQ_TEX_CLAMP_LAST_TEXEL; - tex_samp.clamp_z = SQ_TEX_WRAP; - - // xxx: switch to bicubic - tex_samp.xy_mag_filter = SQ_TEX_XY_FILTER_BILINEAR; - tex_samp.xy_min_filter = SQ_TEX_XY_FILTER_BILINEAR; - - tex_samp.z_filter = SQ_TEX_Z_FILTER_NONE; - tex_samp.mip_filter = 0; /* no mipmap */ - set_tex_sampler (pScrn, accel_state->ib, &tex_samp); - - // UV sampler - tex_samp.id = 1; - set_tex_sampler (pScrn, accel_state->ib, &tex_samp); + switch(pPriv->id) { + case FOURCC_YV12: + case FOURCC_I420: + accel_state->src_mc_addr[0] = pPriv->src_offset; + accel_state->src_size[0] = accel_state->src_pitch[0] * pPriv->h; + + /* flush texture cache */ + cp_set_surface_sync(pScrn, accel_state->ib, TC_ACTION_ENA_bit, accel_state->src_size[0], + accel_state->src_mc_addr[0]); + + // Y texture + tex_res.id = 0; + tex_res.w = pPriv->w; + tex_res.h = pPriv->h; + tex_res.pitch = accel_state->src_pitch[0]; + tex_res.depth = 0; + tex_res.dim = SQ_TEX_DIM_2D; + tex_res.base = accel_state->src_mc_addr[0]; + tex_res.mip_base = accel_state->src_mc_addr[0]; + + tex_res.format = FMT_8; + tex_res.dst_sel_x = SQ_SEL_X; //Y + tex_res.dst_sel_y = SQ_SEL_1; + tex_res.dst_sel_z = SQ_SEL_1; + tex_res.dst_sel_w = SQ_SEL_1; + + tex_res.request_size = 1; + tex_res.base_level = 0; + tex_res.last_level = 0; + tex_res.perf_modulation = 0; + tex_res.interlaced = 0; + set_tex_resource (pScrn, accel_state->ib, &tex_res); + + // Y sampler + tex_samp.id = 0; + tex_samp.clamp_x = SQ_TEX_CLAMP_LAST_TEXEL; + tex_samp.clamp_y = SQ_TEX_CLAMP_LAST_TEXEL; + tex_samp.clamp_z = SQ_TEX_WRAP; + + // xxx: switch to bicubic + tex_samp.xy_mag_filter = SQ_TEX_XY_FILTER_BILINEAR; + tex_samp.xy_min_filter = SQ_TEX_XY_FILTER_BILINEAR; + + tex_samp.z_filter = SQ_TEX_Z_FILTER_NONE; + tex_samp.mip_filter = 0; /* no mipmap */ + set_tex_sampler (pScrn, accel_state->ib, &tex_samp); + + // U or V texture + uv_offset = accel_state->src_pitch[0] * pPriv->h; + uv_offset = (uv_offset + 255) & ~255; + + cp_set_surface_sync(pScrn, accel_state->ib, TC_ACTION_ENA_bit, + accel_state->src_size[0] / 2, + accel_state->src_mc_addr[0] + uv_offset); + + tex_res.id = 1; + tex_res.format = FMT_8; + tex_res.w = pPriv->w >> 1; + tex_res.h = pPriv->h >> 1; + tex_res.pitch = accel_state->src_pitch[0] >> 1; + tex_res.dst_sel_x = SQ_SEL_X; //V or U + tex_res.dst_sel_y = SQ_SEL_1; + tex_res.dst_sel_z = SQ_SEL_1; + tex_res.dst_sel_w = SQ_SEL_1; + tex_res.interlaced = 0; + // XXX tex bases need to be 256B aligned + tex_res.base = accel_state->src_mc_addr[0] + uv_offset; + tex_res.mip_base = accel_state->src_mc_addr[0] + uv_offset; + set_tex_resource (pScrn, accel_state->ib, &tex_res); + + // U or V sampler + tex_samp.id = 1; + set_tex_sampler (pScrn, accel_state->ib, &tex_samp); + + // U or V texture + uv_offset += ((accel_state->src_pitch[0] >> 1) * (pPriv->h >> 1)); + uv_offset = (uv_offset + 255) & ~255; + + cp_set_surface_sync(pScrn, accel_state->ib, TC_ACTION_ENA_bit, + accel_state->src_size[0] / 2, + accel_state->src_mc_addr[0] + uv_offset); + + tex_res.id = 2; + tex_res.format = FMT_8; + tex_res.w = pPriv->w >> 1; + tex_res.h = pPriv->h >> 1; + tex_res.pitch = accel_state->src_pitch[0] >> 1; + tex_res.dst_sel_x = SQ_SEL_X; //V or U + tex_res.dst_sel_y = SQ_SEL_1; + tex_res.dst_sel_z = SQ_SEL_1; + tex_res.dst_sel_w = SQ_SEL_1; + tex_res.interlaced = 0; + // XXX tex bases need to be 256B aligned + tex_res.base = accel_state->src_mc_addr[0] + uv_offset; + tex_res.mip_base = accel_state->src_mc_addr[0] + uv_offset; + set_tex_resource (pScrn, accel_state->ib, &tex_res); + + // UV sampler + tex_samp.id = 2; + set_tex_sampler (pScrn, accel_state->ib, &tex_samp); + break; + case FOURCC_UYVY: + case FOURCC_YUY2: + default: + accel_state->src_mc_addr[0] = pPriv->src_offset; + accel_state->src_size[0] = accel_state->src_pitch[0] * pPriv->h; + + /* flush texture cache */ + cp_set_surface_sync(pScrn, accel_state->ib, TC_ACTION_ENA_bit, accel_state->src_size[0], + accel_state->src_mc_addr[0]); + + // Y texture + tex_res.id = 0; + tex_res.w = pPriv->w; + tex_res.h = pPriv->h; + tex_res.pitch = accel_state->src_pitch[0]; + tex_res.depth = 0; + tex_res.dim = SQ_TEX_DIM_2D; + tex_res.base = accel_state->src_mc_addr[0]; + tex_res.mip_base = accel_state->src_mc_addr[0]; + + tex_res.format = FMT_8; + tex_res.dst_sel_x = SQ_SEL_X; //Y + tex_res.dst_sel_y = SQ_SEL_1; + tex_res.dst_sel_z = SQ_SEL_1; + tex_res.dst_sel_w = SQ_SEL_1; + + tex_res.request_size = 1; + tex_res.base_level = 0; + tex_res.last_level = 0; + tex_res.perf_modulation = 0; + tex_res.interlaced = 0; + set_tex_resource (pScrn, accel_state->ib, &tex_res); + + // Y sampler + tex_samp.id = 0; + tex_samp.clamp_x = SQ_TEX_CLAMP_LAST_TEXEL; + tex_samp.clamp_y = SQ_TEX_CLAMP_LAST_TEXEL; + tex_samp.clamp_z = SQ_TEX_WRAP; + + // xxx: switch to bicubic + tex_samp.xy_mag_filter = SQ_TEX_XY_FILTER_BILINEAR; + tex_samp.xy_min_filter = SQ_TEX_XY_FILTER_BILINEAR; + + tex_samp.z_filter = SQ_TEX_Z_FILTER_NONE; + tex_samp.mip_filter = 0; /* no mipmap */ + set_tex_sampler (pScrn, accel_state->ib, &tex_samp); + + // UV texture + uv_offset = accel_state->src_pitch[0] * pPriv->h; + uv_offset = (uv_offset + 255) & ~255; + + cp_set_surface_sync(pScrn, accel_state->ib, TC_ACTION_ENA_bit, + accel_state->src_size[0] / 2, + accel_state->src_mc_addr[0] + uv_offset); + + tex_res.id = 1; + tex_res.format = FMT_8_8; + tex_res.w = pPriv->w >> 1; + tex_res.h = pPriv->h >> 1; + tex_res.pitch = accel_state->src_pitch[0] >> 1; + tex_res.dst_sel_x = SQ_SEL_Y; //V + tex_res.dst_sel_y = SQ_SEL_X; //U + tex_res.dst_sel_z = SQ_SEL_1; + tex_res.dst_sel_w = SQ_SEL_1; + tex_res.interlaced = 0; + // XXX tex bases need to be 256B aligned + tex_res.base = accel_state->src_mc_addr[0] + uv_offset; + tex_res.mip_base = accel_state->src_mc_addr[0] + uv_offset; + set_tex_resource (pScrn, accel_state->ib, &tex_res); + + // UV sampler + tex_samp.id = 1; + set_tex_sampler (pScrn, accel_state->ib, &tex_samp); + break; + } /* Render setup */ ereg (accel_state->ib, CB_SHADER_MASK, (0x0f << OUTPUT0_ENABLE_shift)); diff --git a/src/radeon.h b/src/radeon.h index 629e1ffa..2974cdf4 100644 --- a/src/radeon.h +++ b/src/radeon.h @@ -658,7 +658,8 @@ struct radeon_accel_state { uint32_t comp_mask_vs_offset; uint32_t comp_mask_ps_offset; uint32_t xv_vs_offset; - uint32_t xv_ps_offset; + uint32_t xv_ps_offset_nv12; + uint32_t xv_ps_offset_planar; //size/addr stuff uint32_t src_size[2]; diff --git a/src/radeon_textured_video.c b/src/radeon_textured_video.c index 16b2c829..22e7d174 100644 --- a/src/radeon_textured_video.c +++ b/src/radeon_textured_video.c @@ -154,12 +154,14 @@ static __inline__ uint32_t F_TO_24(float val) #endif /* XF86DRI */ static void -CopyPlanartoNV12(unsigned char *y_src, unsigned char *u_src, unsigned char *v_src, - unsigned char *dst, - int srcPitch, int srcPitch2, int dstPitch, - int w, int h) +R600CopyPlanar(unsigned char *y_src, unsigned char *u_src, unsigned char *v_src, + unsigned char *dst, + int srcPitch, int srcPitch2, int dstPitch, + int w, int h) { - int i, j; + int i; + int dstPitch2 = dstPitch >> 1; + int h2 = h >> 1; /* Y */ if (srcPitch == dstPitch) { @@ -177,21 +179,34 @@ CopyPlanartoNV12(unsigned char *y_src, unsigned char *u_src, unsigned char *v_sr if (h & 1) dst += dstPitch; - /* UV */ - for (i = 0; i < (h >> 1); i++) { - unsigned char *u = u_src; - unsigned char *v = v_src; - unsigned char *uv = dst; + /* V */ + if (srcPitch2 == dstPitch2) { + memcpy(dst, v_src, srcPitch2 * h2); + dst += (dstPitch2 * h2); + } else { + for (i = 0; i < h2; i++) { + memcpy(dst, v_src, srcPitch2); + v_src += srcPitch2; + dst += dstPitch2; + } + } - for (j = 0; j < w; j++) { - uv[0] = v[j]; - uv[1] = u[j]; - uv += 2; - } - dst += dstPitch; - u_src += srcPitch2; - v_src += srcPitch2; + /* tex base need 256B alignment */ + if (h2 & 1) + dst += dstPitch2; + + /* U */ + if (srcPitch2 == dstPitch2) { + memcpy(dst, u_src, srcPitch2 * h2); + dst += (dstPitch2 * h2); + } else { + for (i = 0; i < h2; i++) { + memcpy(dst, u_src, srcPitch2); + u_src += srcPitch2; + dst += dstPitch2; + } } + } static void @@ -392,15 +407,15 @@ RADEONPutImageTextured(ScrnInfoPtr pScrn, s2offset = srcPitch * height; s3offset = (srcPitch2 * (height >> 1)) + s2offset; if (id == FOURCC_YV12) - CopyPlanartoNV12(buf, buf + s3offset, buf + s2offset, - pPriv->src_addr, - srcPitch, srcPitch2, pPriv->src_pitch, - width, height); + R600CopyPlanar(buf, buf + s3offset, buf + s2offset, + pPriv->src_addr, + srcPitch, srcPitch2, pPriv->src_pitch, + width, height); else - CopyPlanartoNV12(buf, buf + s2offset, buf + s3offset, - pPriv->src_addr, - srcPitch, srcPitch2, pPriv->src_pitch, - width, height); + R600CopyPlanar(buf, buf + s2offset, buf + s3offset, + pPriv->src_addr, + srcPitch, srcPitch2, pPriv->src_pitch, + width, height); } else { top &= ~1; -- cgit v1.2.3