diff options
-rw-r--r-- | src/sna/gen4_render.c | 116 |
1 files changed, 66 insertions, 50 deletions
diff --git a/src/sna/gen4_render.c b/src/sna/gen4_render.c index bc376151..d62d744b 100644 --- a/src/sna/gen4_render.c +++ b/src/sna/gen4_render.c @@ -80,7 +80,7 @@ #define URB_CS_ENTRIES 0 #define URB_VS_ENTRY_SIZE 1 // each 512-bit row -#define URB_VS_ENTRIES 8 // we needs at least 8 entries +#define URB_VS_ENTRIES 32 // we needs at least 8 entries #define URB_GS_ENTRY_SIZE 0 #define URB_GS_ENTRIES 0 @@ -89,7 +89,7 @@ #define URB_CLIP_ENTRIES 0 #define URB_SF_ENTRY_SIZE 2 -#define URB_SF_ENTRIES 1 +#define URB_SF_ENTRIES 8 /* * this program computes dA/dx and dA/dy for the texture coordinates along @@ -97,10 +97,18 @@ */ #define SF_KERNEL_NUM_GRF 16 -#define SF_MAX_THREADS 2 #define PS_KERNEL_NUM_GRF 32 -#define PS_MAX_THREADS 48 + +static const struct gt_info { + uint32_t max_sf_threads; + uint32_t max_wm_threads; + uint32_t urb_size; +} gen4_gt_info = { + 16, 32, 256, +}, g4x_gt_info = { + 32, 50, 384, +}; static const uint32_t sf_kernel[][4] = { #include "exa_sf.g4b" @@ -3569,34 +3577,35 @@ static uint32_t gen4_create_vs_unit_state(struct sna_static_stream *stream) } static uint32_t gen4_create_sf_state(struct sna_static_stream *stream, + const struct gt_info *info, uint32_t kernel) { - struct gen4_sf_unit_state *sf_state; + struct gen4_sf_unit_state *sf; - sf_state = sna_static_stream_map(stream, sizeof(*sf_state), 32); + sf = sna_static_stream_map(stream, sizeof(*sf), 32); - sf_state->thread0.grf_reg_count = GEN4_GRF_BLOCKS(SF_KERNEL_NUM_GRF); - sf_state->thread0.kernel_start_pointer = kernel >> 6; - sf_state->sf1.single_program_flow = 1; + sf->thread0.grf_reg_count = GEN4_GRF_BLOCKS(SF_KERNEL_NUM_GRF); + sf->thread0.kernel_start_pointer = kernel >> 6; + sf->sf1.single_program_flow = 1; /* scratch space is not used in our kernel */ - sf_state->thread2.scratch_space_base_pointer = 0; - sf_state->thread3.const_urb_entry_read_length = 0; /* no const URBs */ - sf_state->thread3.const_urb_entry_read_offset = 0; /* no const URBs */ - sf_state->thread3.urb_entry_read_length = 1; /* 1 URB per vertex */ + sf->thread2.scratch_space_base_pointer = 0; + sf->thread3.const_urb_entry_read_length = 0; /* no const URBs */ + sf->thread3.const_urb_entry_read_offset = 0; /* no const URBs */ + sf->thread3.urb_entry_read_length = 1; /* 1 URB per vertex */ /* don't smash vertex header, read start from dw8 */ - sf_state->thread3.urb_entry_read_offset = 1; - sf_state->thread3.dispatch_grf_start_reg = 3; - sf_state->thread4.max_threads = SF_MAX_THREADS - 1; - sf_state->thread4.urb_entry_allocation_size = URB_SF_ENTRY_SIZE - 1; - sf_state->thread4.nr_urb_entries = URB_SF_ENTRIES; - sf_state->sf5.viewport_transform = false; /* skip viewport */ - sf_state->sf6.cull_mode = GEN4_CULLMODE_NONE; - sf_state->sf6.scissor = 0; - sf_state->sf7.trifan_pv = 2; - sf_state->sf6.dest_org_vbias = 0x8; - sf_state->sf6.dest_org_hbias = 0x8; + sf->thread3.urb_entry_read_offset = 1; + sf->thread3.dispatch_grf_start_reg = 3; + sf->thread4.max_threads = info->max_sf_threads - 1; + sf->thread4.urb_entry_allocation_size = URB_SF_ENTRY_SIZE - 1; + sf->thread4.nr_urb_entries = URB_SF_ENTRIES; + sf->sf5.viewport_transform = false; /* skip viewport */ + sf->sf6.cull_mode = GEN4_CULLMODE_NONE; + sf->sf6.scissor = 0; + sf->sf7.trifan_pv = 2; + sf->sf6.dest_org_vbias = 0x8; + sf->sf6.dest_org_hbias = 0x8; - return sna_static_stream_offsetof(stream, sf_state); + return sna_static_stream_offsetof(stream, sf); } static uint32_t gen4_create_sampler_state(struct sna_static_stream *stream, @@ -3616,47 +3625,48 @@ static uint32_t gen4_create_sampler_state(struct sna_static_stream *stream, return sna_static_stream_offsetof(stream, sampler_state); } -static void gen4_init_wm_state(struct gen4_wm_unit_state *state, +static void gen4_init_wm_state(struct gen4_wm_unit_state *wm, + const struct gt_info *info, bool has_mask, uint32_t kernel, uint32_t sampler) { - state->thread0.grf_reg_count = GEN4_GRF_BLOCKS(PS_KERNEL_NUM_GRF); - state->thread0.kernel_start_pointer = kernel >> 6; + wm->thread0.grf_reg_count = GEN4_GRF_BLOCKS(PS_KERNEL_NUM_GRF); + wm->thread0.kernel_start_pointer = kernel >> 6; - state->thread1.single_program_flow = 0; + wm->thread1.single_program_flow = 0; /* scratch space is not used in our kernel */ - state->thread2.scratch_space_base_pointer = 0; - state->thread2.per_thread_scratch_space = 0; + wm->thread2.scratch_space_base_pointer = 0; + wm->thread2.per_thread_scratch_space = 0; - state->thread3.const_urb_entry_read_length = 0; - state->thread3.const_urb_entry_read_offset = 0; + wm->thread3.const_urb_entry_read_length = 0; + wm->thread3.const_urb_entry_read_offset = 0; - state->thread3.urb_entry_read_offset = 0; + wm->thread3.urb_entry_read_offset = 0; /* wm kernel use urb from 3, see wm_program in compiler module */ - state->thread3.dispatch_grf_start_reg = 3; /* must match kernel */ + wm->thread3.dispatch_grf_start_reg = 3; /* must match kernel */ - state->wm4.sampler_count = 1; /* 1-4 samplers */ + wm->wm4.sampler_count = 1; /* 1-4 samplers */ - state->wm4.sampler_state_pointer = sampler >> 5; - state->wm5.max_threads = PS_MAX_THREADS - 1; - state->wm5.transposed_urb_read = 0; - state->wm5.thread_dispatch_enable = 1; + wm->wm4.sampler_state_pointer = sampler >> 5; + wm->wm5.max_threads = info->max_wm_threads - 1; + wm->wm5.transposed_urb_read = 0; + wm->wm5.thread_dispatch_enable = 1; /* just use 16-pixel dispatch (4 subspans), don't need to change kernel * start point */ - state->wm5.enable_16_pix = 1; - state->wm5.enable_8_pix = 0; - state->wm5.early_depth_test = 1; + wm->wm5.enable_16_pix = 1; + wm->wm5.enable_8_pix = 0; + wm->wm5.early_depth_test = 1; /* Each pair of attributes (src/mask coords) is two URB entries */ if (has_mask) { - state->thread1.binding_table_entry_count = 3; /* 2 tex and fb */ - state->thread3.urb_entry_read_length = 4; + wm->thread1.binding_table_entry_count = 3; /* 2 tex and fb */ + wm->thread3.urb_entry_read_length = 4; } else { - state->thread1.binding_table_entry_count = 2; /* 1 tex and fb */ - state->thread3.urb_entry_read_length = 2; + wm->thread1.binding_table_entry_count = 2; /* 1 tex and fb */ + wm->thread3.urb_entry_read_length = 2; } } @@ -3716,9 +3726,15 @@ static bool gen4_render_setup(struct sna *sna) struct gen4_render_state *state = &sna->render_state.gen4; struct sna_static_stream general; struct gen4_wm_unit_state_padded *wm_state; + const struct gt_info *info; uint32_t sf[2], wm[KERNEL_COUNT]; int i, j, k, l, m; + if (sna->kgem.gen == 45) + info = &g4x_gt_info; + else + info = &gen4_gt_info; + sna_static_stream_init(&general); /* Zero pad the start. If you see an offset of 0x0 in the batchbuffer @@ -3744,8 +3760,8 @@ static bool gen4_render_setup(struct sna *sna) state->vs = gen4_create_vs_unit_state(&general); - state->sf[0] = gen4_create_sf_state(&general, sf[0]); - state->sf[1] = gen4_create_sf_state(&general, sf[1]); + state->sf[0] = gen4_create_sf_state(&general, info, sf[0]); + state->sf[1] = gen4_create_sf_state(&general, info, sf[1]); /* Set up the WM states: each filter/extend type for source and mask, per @@ -3769,7 +3785,7 @@ static bool gen4_render_setup(struct sna *sna) k, l); for (m = 0; m < KERNEL_COUNT; m++) { - gen4_init_wm_state(&wm_state->state, + gen4_init_wm_state(&wm_state->state, info, wm_kernels[m].has_mask, wm[m], sampler_state); |