author     Jonathan Gray <jsg@cvs.openbsd.org>    2016-12-11 08:40:05 +0000
committer  Jonathan Gray <jsg@cvs.openbsd.org>    2016-12-11 08:40:05 +0000
commit     21ab4c9f31674b113c24177398ed39f29b7cd8e6 (patch)
tree       8be392d7a792d9663c2586396be77bfd506f5164 /lib/mesa/src/gallium/drivers/nouveau/nvc0
parent     a8f0a7916e26e550dd2a26e7188835c481978004 (diff)
Import Mesa 13.0.2
Diffstat (limited to 'lib/mesa/src/gallium/drivers/nouveau/nvc0')
31 files changed, 3844 insertions(+), 1843 deletions(-)
diff --git a/lib/mesa/src/gallium/drivers/nouveau/nvc0/mme/com9097.mme b/lib/mesa/src/gallium/drivers/nouveau/nvc0/mme/com9097.mme index 7f76ec66e..11c20564c 100644 --- a/lib/mesa/src/gallium/drivers/nouveau/nvc0/mme/com9097.mme +++ b/lib/mesa/src/gallium/drivers/nouveau/nvc0/mme/com9097.mme @@ -255,7 +255,7 @@ dei_draw_again: parm $r4 maddr 0x5f7 /* INDEX_BATCH_FIRST, start */ parm $r4 send $r4 /* index_bias, send start */ maddr 0x18e3 /* CB_POS */ - send 0x180 /* 256 + 128 */ + send 0x1a0 /* 256 + 160 */ braz $r2 #dei_end parm $r5 send $r4 /* start_instance, send index_bias */ send $r5 /* send start_instance */ @@ -311,7 +311,7 @@ dai_draw_again: braz $r3 #dai_end parm $r4 send $r4 /* start_instance */ maddr 0x18e3 /* CB_POS */ - send 0x180 /* 256 + 128 */ + send 0x1a0 /* 256 + 160 */ send 0x0 /* send 0 as base_vertex */ send $r4 /* send start_instance */ send $r6 /* draw id */ @@ -374,7 +374,7 @@ deic_draw_again: parm $r4 maddr 0x5f7 /* INDEX_BATCH_FIRST, start */ parm $r4 send $r4 /* index_bias, send start */ maddr 0x18e3 /* CB_POS */ - send 0x180 /* 256 + 128 */ + send 0x1a0 /* 256 + 160 */ braz $r2 #deic_end parm $r5 send $r4 /* start_instance, send index_bias */ send $r5 /* send start_instance */ @@ -455,7 +455,7 @@ daic_draw_again: braz $r3 #daic_end parm $r4 send $r4 /* start_instance */ maddr 0x18e3 /* CB_POS */ - send 0x180 /* 256 + 128 */ + send 0x1a0 /* 256 + 160 */ send 0x0 /* send 0 as base_vertex */ send $r4 /* send start_instance */ send $r6 /* draw id */ diff --git a/lib/mesa/src/gallium/drivers/nouveau/nvc0/mme/com9097.mme.h b/lib/mesa/src/gallium/drivers/nouveau/nvc0/mme/com9097.mme.h index ecadf7e4d..1c8f4bbf2 100644 --- a/lib/mesa/src/gallium/drivers/nouveau/nvc0/mme/com9097.mme.h +++ b/lib/mesa/src/gallium/drivers/nouveau/nvc0/mme/com9097.mme.h @@ -140,7 +140,7 @@ uint32_t mme9097_draw_elts_indirect[] = { 0x017dc451, 0x00002431, 0x0638c021, - 0x00600041, + 0x00680041, 0x0004d007, 0x00002531, 0x00002841, @@ -185,7 +185,7 @@ uint32_t mme9097_draw_arrays_indirect[] = { 0x0004d807, 0x00002431, 0x0638c021, - 0x00600041, + 0x00680041, 0x00000041, 0x00002041, 0x00003041, @@ -233,7 +233,7 @@ uint32_t mme9097_draw_elts_indirect_count[] = { 0x017dc451, 0x00002431, 0x0638c021, - 0x00600041, + 0x00680041, 0x0004d007, 0x00002531, 0x00002841, @@ -300,7 +300,7 @@ uint32_t mme9097_draw_arrays_indirect_count[] = { 0x0004d807, 0x00002431, 0x0638c021, - 0x00600041, + 0x00680041, 0x00000041, 0x00002041, 0x00003041, diff --git a/lib/mesa/src/gallium/drivers/nouveau/nvc0/mme/com90c0.mme b/lib/mesa/src/gallium/drivers/nouveau/nvc0/mme/com90c0.mme index a3f1bdeeb..a9233ad80 100644 --- a/lib/mesa/src/gallium/drivers/nouveau/nvc0/mme/com90c0.mme +++ b/lib/mesa/src/gallium/drivers/nouveau/nvc0/mme/com90c0.mme @@ -1,4 +1,4 @@ -/* NVC0_COMPUTE_MACRO_LAUNCH_GRID_INDIRECT +/* NVC0_CP_MACRO_LAUNCH_GRID_INDIRECT * * arg = num_groups_x * parm[0] = num_groups_y diff --git a/lib/mesa/src/gallium/drivers/nouveau/nvc0/nvc0_compute.c b/lib/mesa/src/gallium/drivers/nouveau/nvc0/nvc0_compute.c index c07f186af..11635c946 100644 --- a/lib/mesa/src/gallium/drivers/nouveau/nvc0/nvc0_compute.c +++ b/lib/mesa/src/gallium/drivers/nouveau/nvc0/nvc0_compute.c @@ -23,7 +23,8 @@ */ #include "nvc0/nvc0_context.h" -#include "nvc0/nvc0_compute.h" + +#include "nvc0/nvc0_compute.xml.h" int nvc0_screen_compute_setup(struct nvc0_screen *screen, @@ -54,98 +55,89 @@ nvc0_screen_compute_setup(struct nvc0_screen *screen, return ret; } - ret = nouveau_bo_new(dev, NV_VRAM_DOMAIN(&screen->base), 0, 1 << 12, NULL, - &screen->parm); 
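[Aside: the com9097.mme change above moves the CB_POS offset written by the draw macros from 0x180 (256 + 128) to 0x1a0 (256 + 160), and the precompiled words in com9097.mme.h change from 0x00600041 to 0x00680041 accordingly. A minimal sketch of that correspondence, assuming the MME "send <imm>" opcode keeps its immediate in the bits above bit 14 -- an inference from the delta between the two words, not a documented encoding:]

```c
#include <stdio.h>

int main(void)
{
   const unsigned send_op = 0x41;  /* low opcode bits of "send <imm>" */

   /* old offset: 256 + 128 */
   printf("0x%08x\n", (0x180 << 14) | send_op);  /* prints 0x00600041 */
   /* new offset: 256 + 160, matching NVC0_CB_AUX_DRAW_INFO below */
   printf("0x%08x\n", (0x1a0 << 14) | send_op);  /* prints 0x00680041 */
   return 0;
}
```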
- if (ret) - return ret; - - BEGIN_NVC0(push, SUBC_COMPUTE(NV01_SUBCHAN_OBJECT), 1); + BEGIN_NVC0(push, SUBC_CP(NV01_SUBCHAN_OBJECT), 1); PUSH_DATA (push, screen->compute->oclass); /* hardware limit */ - BEGIN_NVC0(push, NVC0_COMPUTE(MP_LIMIT), 1); + BEGIN_NVC0(push, NVC0_CP(MP_LIMIT), 1); PUSH_DATA (push, screen->mp_count); - BEGIN_NVC0(push, NVC0_COMPUTE(CALL_LIMIT_LOG), 1); + BEGIN_NVC0(push, NVC0_CP(CALL_LIMIT_LOG), 1); PUSH_DATA (push, 0xf); - BEGIN_NVC0(push, SUBC_COMPUTE(0x02a0), 1); + BEGIN_NVC0(push, SUBC_CP(0x02a0), 1); PUSH_DATA (push, 0x8000); /* global memory setup */ - BEGIN_NVC0(push, SUBC_COMPUTE(0x02c4), 1); + BEGIN_NVC0(push, SUBC_CP(0x02c4), 1); PUSH_DATA (push, 0); - BEGIN_NIC0(push, NVC0_COMPUTE(GLOBAL_BASE), 0x100); + BEGIN_NIC0(push, NVC0_CP(GLOBAL_BASE), 0x100); for (i = 0; i <= 0xff; i++) PUSH_DATA (push, (0xc << 28) | (i << 16) | i); - BEGIN_NVC0(push, SUBC_COMPUTE(0x02c4), 1); + BEGIN_NVC0(push, SUBC_CP(0x02c4), 1); PUSH_DATA (push, 1); /* local memory and cstack setup */ - BEGIN_NVC0(push, NVC0_COMPUTE(TEMP_ADDRESS_HIGH), 2); + BEGIN_NVC0(push, NVC0_CP(TEMP_ADDRESS_HIGH), 2); PUSH_DATAh(push, screen->tls->offset); PUSH_DATA (push, screen->tls->offset); - BEGIN_NVC0(push, NVC0_COMPUTE(TEMP_SIZE_HIGH), 2); + BEGIN_NVC0(push, NVC0_CP(TEMP_SIZE_HIGH), 2); PUSH_DATAh(push, screen->tls->size); PUSH_DATA (push, screen->tls->size); - BEGIN_NVC0(push, NVC0_COMPUTE(WARP_TEMP_ALLOC), 1); + BEGIN_NVC0(push, NVC0_CP(WARP_TEMP_ALLOC), 1); PUSH_DATA (push, 0); - BEGIN_NVC0(push, NVC0_COMPUTE(LOCAL_BASE), 1); + BEGIN_NVC0(push, NVC0_CP(LOCAL_BASE), 1); PUSH_DATA (push, 0xff << 24); /* shared memory setup */ - BEGIN_NVC0(push, NVC0_COMPUTE(CACHE_SPLIT), 1); + BEGIN_NVC0(push, NVC0_CP(CACHE_SPLIT), 1); PUSH_DATA (push, NVC0_COMPUTE_CACHE_SPLIT_48K_SHARED_16K_L1); - BEGIN_NVC0(push, NVC0_COMPUTE(SHARED_BASE), 1); + BEGIN_NVC0(push, NVC0_CP(SHARED_BASE), 1); PUSH_DATA (push, 0xfe << 24); - BEGIN_NVC0(push, NVC0_COMPUTE(SHARED_SIZE), 1); + BEGIN_NVC0(push, NVC0_CP(SHARED_SIZE), 1); PUSH_DATA (push, 0); /* code segment setup */ - BEGIN_NVC0(push, NVC0_COMPUTE(CODE_ADDRESS_HIGH), 2); + BEGIN_NVC0(push, NVC0_CP(CODE_ADDRESS_HIGH), 2); PUSH_DATAh(push, screen->text->offset); PUSH_DATA (push, screen->text->offset); /* textures */ - BEGIN_NVC0(push, NVC0_COMPUTE(TIC_ADDRESS_HIGH), 3); + BEGIN_NVC0(push, NVC0_CP(TIC_ADDRESS_HIGH), 3); PUSH_DATAh(push, screen->txc->offset); PUSH_DATA (push, screen->txc->offset); PUSH_DATA (push, NVC0_TIC_MAX_ENTRIES - 1); /* samplers */ - BEGIN_NVC0(push, NVC0_COMPUTE(TSC_ADDRESS_HIGH), 3); + BEGIN_NVC0(push, NVC0_CP(TSC_ADDRESS_HIGH), 3); PUSH_DATAh(push, screen->txc->offset + 65536); PUSH_DATA (push, screen->txc->offset + 65536); PUSH_DATA (push, NVC0_TSC_MAX_ENTRIES - 1); - return 0; -} - -bool -nvc0_compute_validate_program(struct nvc0_context *nvc0) -{ - struct nvc0_program *prog = nvc0->compprog; - - if (prog->mem) - return true; + /* MS sample coordinate offsets */ + BEGIN_NVC0(push, NVC0_CP(CB_SIZE), 3); + PUSH_DATA (push, NVC0_CB_AUX_SIZE); + PUSH_DATAh(push, screen->uniform_bo->offset + NVC0_CB_AUX_INFO(5)); + PUSH_DATA (push, screen->uniform_bo->offset + NVC0_CB_AUX_INFO(5)); + BEGIN_1IC0(push, NVC0_CP(CB_POS), 1 + 2 * 8); + PUSH_DATA (push, NVC0_CB_AUX_MS_INFO); + PUSH_DATA (push, 0); /* 0 */ + PUSH_DATA (push, 0); + PUSH_DATA (push, 1); /* 1 */ + PUSH_DATA (push, 0); + PUSH_DATA (push, 0); /* 2 */ + PUSH_DATA (push, 1); + PUSH_DATA (push, 1); /* 3 */ + PUSH_DATA (push, 1); + PUSH_DATA (push, 2); /* 4 */ + PUSH_DATA (push, 0); + PUSH_DATA (push, 
3); /* 5 */ + PUSH_DATA (push, 0); + PUSH_DATA (push, 2); /* 6 */ + PUSH_DATA (push, 1); + PUSH_DATA (push, 3); /* 7 */ + PUSH_DATA (push, 1); - if (!prog->translated) { - prog->translated = nvc0_program_translate( - prog, nvc0->screen->base.device->chipset, &nvc0->base.debug); - if (!prog->translated) - return false; - } - if (unlikely(!prog->code_size)) - return false; - - if (likely(prog->code_size)) { - if (nvc0_program_upload_code(nvc0, prog)) { - struct nouveau_pushbuf *push = nvc0->base.pushbuf; - BEGIN_NVC0(push, NVC0_COMPUTE(FLUSH), 1); - PUSH_DATA (push, NVC0_COMPUTE_FLUSH_CODE); - return true; - } - } - return false; + return 0; } static void @@ -153,9 +145,14 @@ nvc0_compute_validate_samplers(struct nvc0_context *nvc0) { bool need_flush = nvc0_validate_tsc(nvc0, 5); if (need_flush) { - BEGIN_NVC0(nvc0->base.pushbuf, NVC0_COMPUTE(TSC_FLUSH), 1); + BEGIN_NVC0(nvc0->base.pushbuf, NVC0_CP(TSC_FLUSH), 1); PUSH_DATA (nvc0->base.pushbuf, 0); } + + /* Invalidate all 3D samplers because they are aliased. */ + for (int s = 0; s < 5; s++) + nvc0->samplers_dirty[s] = ~0; + nvc0->dirty_3d |= NVC0_NEW_3D_SAMPLERS; } static void @@ -163,9 +160,30 @@ nvc0_compute_validate_textures(struct nvc0_context *nvc0) { bool need_flush = nvc0_validate_tic(nvc0, 5); if (need_flush) { - BEGIN_NVC0(nvc0->base.pushbuf, NVC0_COMPUTE(TIC_FLUSH), 1); + BEGIN_NVC0(nvc0->base.pushbuf, NVC0_CP(TIC_FLUSH), 1); PUSH_DATA (nvc0->base.pushbuf, 0); } + + /* Invalidate all 3D textures because they are aliased. */ + for (int s = 0; s < 5; s++) { + for (int i = 0; i < nvc0->num_textures[s]; i++) + nouveau_bufctx_reset(nvc0->bufctx_3d, NVC0_BIND_3D_TEX(s, i)); + nvc0->textures_dirty[s] = ~0; + } + nvc0->dirty_3d |= NVC0_NEW_3D_TEXTURES; +} + +static inline void +nvc0_compute_invalidate_constbufs(struct nvc0_context *nvc0) +{ + int s; + + /* Invalidate all 3D constbufs because they are aliased with COMPUTE. 
*/ + for (s = 0; s < 5; s++) { + nvc0->constbuf_dirty[s] |= nvc0->constbuf_valid[s]; + nvc0->state.uniform_buffer_bound[s] = 0; + } + nvc0->dirty_3d |= NVC0_NEW_3D_CONSTBUF; } static void @@ -180,7 +198,7 @@ nvc0_compute_validate_constbufs(struct nvc0_context *nvc0) if (nvc0->constbuf[s][i].user) { struct nouveau_bo *bo = nvc0->screen->uniform_bo; - const unsigned base = s << 16; + const unsigned base = NVC0_CB_USR_INFO(s); const unsigned size = nvc0->constbuf[s][0].size; assert(i == 0); /* we really only want OpenGL uniforms here */ assert(nvc0->constbuf[s][0].u.data); @@ -188,11 +206,11 @@ nvc0_compute_validate_constbufs(struct nvc0_context *nvc0) if (nvc0->state.uniform_buffer_bound[s] < size) { nvc0->state.uniform_buffer_bound[s] = align(size, 0x100); - BEGIN_NVC0(push, NVC0_COMPUTE(CB_SIZE), 3); + BEGIN_NVC0(push, NVC0_CP(CB_SIZE), 3); PUSH_DATA (push, nvc0->state.uniform_buffer_bound[s]); PUSH_DATAh(push, bo->offset + base); PUSH_DATA (push, bo->offset + base); - BEGIN_NVC0(push, NVC0_COMPUTE(CB_BIND), 1); + BEGIN_NVC0(push, NVC0_CP(CB_BIND), 1); PUSH_DATA (push, (0 << 8) | 1); } nvc0_cb_bo_push(&nvc0->base, bo, NV_VRAM_DOMAIN(&nvc0->screen->base), @@ -203,18 +221,18 @@ nvc0_compute_validate_constbufs(struct nvc0_context *nvc0) struct nv04_resource *res = nv04_resource(nvc0->constbuf[s][i].u.buf); if (res) { - BEGIN_NVC0(push, NVC0_COMPUTE(CB_SIZE), 3); + BEGIN_NVC0(push, NVC0_CP(CB_SIZE), 3); PUSH_DATA (push, nvc0->constbuf[s][i].size); PUSH_DATAh(push, res->address + nvc0->constbuf[s][i].offset); PUSH_DATA (push, res->address + nvc0->constbuf[s][i].offset); - BEGIN_NVC0(push, NVC0_COMPUTE(CB_BIND), 1); + BEGIN_NVC0(push, NVC0_CP(CB_BIND), 1); PUSH_DATA (push, (i << 8) | 1); BCTX_REFN(nvc0->bufctx_cp, CP_CB(i), res, RD); res->cb_bindings[s] |= 1 << i; } else { - BEGIN_NVC0(push, NVC0_COMPUTE(CB_BIND), 1); + BEGIN_NVC0(push, NVC0_CP(CB_BIND), 1); PUSH_DATA (push, (i << 8) | 0); } if (i == 0) @@ -222,7 +240,9 @@ nvc0_compute_validate_constbufs(struct nvc0_context *nvc0) } } - BEGIN_NVC0(push, NVC0_COMPUTE(FLUSH), 1); + nvc0_compute_invalidate_constbufs(nvc0); + + BEGIN_NVC0(push, NVC0_CP(FLUSH), 1); PUSH_DATA (push, NVC0_COMPUTE_FLUSH_CB); } @@ -232,29 +252,30 @@ nvc0_compute_validate_driverconst(struct nvc0_context *nvc0) struct nouveau_pushbuf *push = nvc0->base.pushbuf; struct nvc0_screen *screen = nvc0->screen; - BEGIN_NVC0(push, NVC0_COMPUTE(CB_SIZE), 3); - PUSH_DATA (push, 1024); - PUSH_DATAh(push, screen->uniform_bo->offset + (6 << 16) + (5 << 10)); - PUSH_DATA (push, screen->uniform_bo->offset + (6 << 16) + (5 << 10)); - BEGIN_NVC0(push, NVC0_COMPUTE(CB_BIND), 1); + BEGIN_NVC0(push, NVC0_CP(CB_SIZE), 3); + PUSH_DATA (push, NVC0_CB_AUX_SIZE); + PUSH_DATAh(push, screen->uniform_bo->offset + NVC0_CB_AUX_INFO(5)); + PUSH_DATA (push, screen->uniform_bo->offset + NVC0_CB_AUX_INFO(5)); + BEGIN_NVC0(push, NVC0_CP(CB_BIND), 1); PUSH_DATA (push, (15 << 8) | 1); - nvc0->dirty |= NVC0_NEW_DRIVERCONST; + nvc0->dirty_3d |= NVC0_NEW_3D_DRIVERCONST; } static void nvc0_compute_validate_buffers(struct nvc0_context *nvc0) { struct nouveau_pushbuf *push = nvc0->base.pushbuf; + struct nvc0_screen *screen = nvc0->screen; const int s = 5; int i; - BEGIN_NVC0(push, NVC0_COMPUTE(CB_SIZE), 3); - PUSH_DATA (push, 1024); - PUSH_DATAh(push, nvc0->screen->uniform_bo->offset + (6 << 16) + (s << 10)); - PUSH_DATA (push, nvc0->screen->uniform_bo->offset + (6 << 16) + (s << 10)); - BEGIN_1IC0(push, NVC0_COMPUTE(CB_POS), 1 + 4 * NVC0_MAX_BUFFERS); - PUSH_DATA (push, 512); + BEGIN_NVC0(push, NVC0_CP(CB_SIZE), 
3); + PUSH_DATA (push, NVC0_CB_AUX_SIZE); + PUSH_DATAh(push, screen->uniform_bo->offset + NVC0_CB_AUX_INFO(s)); + PUSH_DATA (push, screen->uniform_bo->offset + NVC0_CB_AUX_INFO(s)); + BEGIN_1IC0(push, NVC0_CP(CB_POS), 1 + 4 * NVC0_MAX_BUFFERS); + PUSH_DATA (push, NVC0_CB_AUX_BUF_INFO(0)); for (i = 0; i < NVC0_MAX_BUFFERS; i++) { if (nvc0->buffers[s][i].buffer) { @@ -265,6 +286,10 @@ nvc0_compute_validate_buffers(struct nvc0_context *nvc0) PUSH_DATA (push, nvc0->buffers[s][i].buffer_size); PUSH_DATA (push, 0); BCTX_REFN(nvc0->bufctx_cp, CP_BUF, res, RDWR); + util_range_add(&res->valid_buffer_range, + nvc0->buffers[s][i].buffer_offset, + nvc0->buffers[s][i].buffer_offset + + nvc0->buffers[s][i].buffer_size); } else { PUSH_DATA (push, 0); PUSH_DATA (push, 0); @@ -274,58 +299,125 @@ nvc0_compute_validate_buffers(struct nvc0_context *nvc0) } } +void +nvc0_compute_validate_globals(struct nvc0_context *nvc0) +{ + unsigned i; + + for (i = 0; i < nvc0->global_residents.size / sizeof(struct pipe_resource *); + ++i) { + struct pipe_resource *res = *util_dynarray_element( + &nvc0->global_residents, struct pipe_resource *, i); + if (res) + nvc0_add_resident(nvc0->bufctx_cp, NVC0_BIND_CP_GLOBAL, + nv04_resource(res), NOUVEAU_BO_RDWR); + } +} + +static inline void +nvc0_compute_invalidate_surfaces(struct nvc0_context *nvc0, const int s) +{ + struct nouveau_pushbuf *push = nvc0->base.pushbuf; + int i; + + for (i = 0; i < NVC0_MAX_IMAGES; ++i) { + if (s == 5) + BEGIN_NVC0(push, NVC0_CP(IMAGE(i)), 6); + else + BEGIN_NVC0(push, NVC0_3D(IMAGE(i)), 6); + PUSH_DATA(push, 0); + PUSH_DATA(push, 0); + PUSH_DATA(push, 0); + PUSH_DATA(push, 0); + PUSH_DATA(push, 0x14000); + PUSH_DATA(push, 0); + } +} + +static void +nvc0_compute_validate_surfaces(struct nvc0_context *nvc0) +{ + /* TODO: Invalidating both 3D and CP surfaces before validating surfaces for + * compute is probably not really necessary, but we didn't find any better + * solutions for now. This fixes some invalidation issues when compute and + * fragment shaders are used inside the same context. Anyway, we definitely + * have invalidation issues between 3D and CP for other resources like SSBO + * and atomic counters. */ + nvc0_compute_invalidate_surfaces(nvc0, 4); + nvc0_compute_invalidate_surfaces(nvc0, 5); + + nvc0_validate_suf(nvc0, 5); + + /* Invalidate all FRAGMENT images because they are aliased with COMPUTE. 
*/ + nouveau_bufctx_reset(nvc0->bufctx_3d, NVC0_BIND_3D_SUF); + nvc0->dirty_3d |= NVC0_NEW_3D_SURFACES; + nvc0->images_dirty[4] |= nvc0->images_valid[4]; +} + +static struct nvc0_state_validate +validate_list_cp[] = { + { nvc0_compprog_validate, NVC0_NEW_CP_PROGRAM }, + { nvc0_compute_validate_constbufs, NVC0_NEW_CP_CONSTBUF }, + { nvc0_compute_validate_driverconst, NVC0_NEW_CP_DRIVERCONST }, + { nvc0_compute_validate_buffers, NVC0_NEW_CP_BUFFERS }, + { nvc0_compute_validate_textures, NVC0_NEW_CP_TEXTURES }, + { nvc0_compute_validate_samplers, NVC0_NEW_CP_SAMPLERS }, + { nvc0_compute_validate_globals, NVC0_NEW_CP_GLOBALS }, + { nvc0_compute_validate_surfaces, NVC0_NEW_CP_SURFACES }, +}; + static bool -nvc0_compute_state_validate(struct nvc0_context *nvc0) +nvc0_state_validate_cp(struct nvc0_context *nvc0, uint32_t mask) { - if (!nvc0_compute_validate_program(nvc0)) - return false; - if (nvc0->dirty_cp & NVC0_NEW_CP_CONSTBUF) - nvc0_compute_validate_constbufs(nvc0); - if (nvc0->dirty_cp & NVC0_NEW_CP_DRIVERCONST) - nvc0_compute_validate_driverconst(nvc0); - if (nvc0->dirty_cp & NVC0_NEW_CP_BUFFERS) - nvc0_compute_validate_buffers(nvc0); - if (nvc0->dirty_cp & NVC0_NEW_CP_TEXTURES) - nvc0_compute_validate_textures(nvc0); - if (nvc0->dirty_cp & NVC0_NEW_CP_SAMPLERS) - nvc0_compute_validate_samplers(nvc0); - - /* TODO: surfaces, global memory buffers */ - - nvc0_bufctx_fence(nvc0, nvc0->bufctx_cp, false); - - nouveau_pushbuf_bufctx(nvc0->base.pushbuf, nvc0->bufctx_cp); - if (unlikely(nouveau_pushbuf_validate(nvc0->base.pushbuf))) - return false; - if (unlikely(nvc0->state.flushed)) - nvc0_bufctx_fence(nvc0, nvc0->bufctx_cp, true); + bool ret; - return true; + ret = nvc0_state_validate(nvc0, mask, validate_list_cp, + ARRAY_SIZE(validate_list_cp), &nvc0->dirty_cp, + nvc0->bufctx_cp); + if (unlikely(nvc0->state.flushed)) + nvc0_bufctx_fence(nvc0, nvc0->bufctx_cp, true); + return ret; } static void -nvc0_compute_upload_input(struct nvc0_context *nvc0, const void *input) +nvc0_compute_upload_input(struct nvc0_context *nvc0, + const struct pipe_grid_info *info) { struct nouveau_pushbuf *push = nvc0->base.pushbuf; struct nvc0_screen *screen = nvc0->screen; struct nvc0_program *cp = nvc0->compprog; if (cp->parm_size) { - BEGIN_NVC0(push, NVC0_COMPUTE(CB_SIZE), 3); + struct nouveau_bo *bo = screen->uniform_bo; + const unsigned base = NVC0_CB_USR_INFO(5); + + BEGIN_NVC0(push, NVC0_CP(CB_SIZE), 3); PUSH_DATA (push, align(cp->parm_size, 0x100)); - PUSH_DATAh(push, screen->parm->offset); - PUSH_DATA (push, screen->parm->offset); - BEGIN_NVC0(push, NVC0_COMPUTE(CB_BIND), 1); + PUSH_DATAh(push, bo->offset + base); + PUSH_DATA (push, bo->offset + base); + BEGIN_NVC0(push, NVC0_CP(CB_BIND), 1); PUSH_DATA (push, (0 << 8) | 1); /* NOTE: size is limited to 4 KiB, which is < NV04_PFIFO_MAX_PACKET_LEN */ - BEGIN_1IC0(push, NVC0_COMPUTE(CB_POS), 1 + cp->parm_size / 4); + BEGIN_1IC0(push, NVC0_CP(CB_POS), 1 + cp->parm_size / 4); PUSH_DATA (push, 0); - PUSH_DATAp(push, input, cp->parm_size / 4); + PUSH_DATAp(push, info->input, cp->parm_size / 4); - BEGIN_NVC0(push, NVC0_COMPUTE(FLUSH), 1); - PUSH_DATA (push, NVC0_COMPUTE_FLUSH_CB); + nvc0_compute_invalidate_constbufs(nvc0); } + + BEGIN_NVC0(push, NVC0_CP(CB_SIZE), 3); + PUSH_DATA (push, NVC0_CB_AUX_SIZE); + PUSH_DATAh(push, screen->uniform_bo->offset + NVC0_CB_AUX_INFO(5)); + PUSH_DATA (push, screen->uniform_bo->offset + NVC0_CB_AUX_INFO(5)); + + BEGIN_1IC0(push, NVC0_CP(CB_POS), 1 + 1); + /* (7) as we only upload work_dim on nvc0, the rest uses special regs */ + 
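[Aside: the validate_list_cp table above replaces the hand-rolled if-chain in nvc0_compute_state_validate with the same table-driven pattern the 3D path uses. A minimal sketch of how such a list is typically walked, assuming the shared nvc0_state_validate helper simply masks each entry's state bits against the dirty flags; the real helper in nvc0_state_validate.c also handles bufctx fencing and pushbuf validation:]

```c
#include <stdint.h>

struct nvc0_context;

struct nvc0_state_validate {
   void (*func)(struct nvc0_context *);
   uint32_t states;
};

/* Hypothetical core of the shared validator: call every handler whose
 * state bits intersect the dirty mask, then clear the processed bits. */
static void
validate_list_sketch(struct nvc0_context *nvc0, uint32_t mask,
                     struct nvc0_state_validate *list, int size,
                     uint32_t *dirty)
{
   const uint32_t state = *dirty & mask;

   for (int i = 0; i < size; ++i) {
      if (list[i].states & state)
         list[i].func(nvc0);  /* e.g. nvc0_compute_validate_textures */
   }
   *dirty &= ~state;
}
```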
PUSH_DATA (push, NVC0_CB_AUX_GRID_INFO(7)); + PUSH_DATA (push, info->work_dim); + + BEGIN_NVC0(push, NVC0_CP(FLUSH), 1); + PUSH_DATA (push, NVC0_COMPUTE_FLUSH_CB); } void @@ -334,49 +426,48 @@ nvc0_launch_grid(struct pipe_context *pipe, const struct pipe_grid_info *info) struct nvc0_context *nvc0 = nvc0_context(pipe); struct nouveau_pushbuf *push = nvc0->base.pushbuf; struct nvc0_program *cp = nvc0->compprog; - unsigned s; int ret; - ret = !nvc0_compute_state_validate(nvc0); + ret = !nvc0_state_validate_cp(nvc0, ~0); if (ret) { NOUVEAU_ERR("Failed to launch grid !\n"); return; } - nvc0_compute_upload_input(nvc0, info->input); + nvc0_compute_upload_input(nvc0, info); - BEGIN_NVC0(push, NVC0_COMPUTE(CP_START_ID), 1); + BEGIN_NVC0(push, NVC0_CP(CP_START_ID), 1); PUSH_DATA (push, nvc0_program_symbol_offset(cp, info->pc)); - BEGIN_NVC0(push, NVC0_COMPUTE(LOCAL_POS_ALLOC), 3); - PUSH_DATA (push, align(cp->cp.lmem_size, 0x10)); + BEGIN_NVC0(push, NVC0_CP(LOCAL_POS_ALLOC), 3); + PUSH_DATA (push, (cp->hdr[1] & 0xfffff0) + align(cp->cp.lmem_size, 0x10)); PUSH_DATA (push, 0); PUSH_DATA (push, 0x800); /* WARP_CSTACK_SIZE */ - BEGIN_NVC0(push, NVC0_COMPUTE(SHARED_SIZE), 3); + BEGIN_NVC0(push, NVC0_CP(SHARED_SIZE), 3); PUSH_DATA (push, align(cp->cp.smem_size, 0x100)); PUSH_DATA (push, info->block[0] * info->block[1] * info->block[2]); PUSH_DATA (push, cp->num_barriers); - BEGIN_NVC0(push, NVC0_COMPUTE(CP_GPR_ALLOC), 1); + BEGIN_NVC0(push, NVC0_CP(CP_GPR_ALLOC), 1); PUSH_DATA (push, cp->num_gprs); /* launch preliminary setup */ - BEGIN_NVC0(push, NVC0_COMPUTE(GRIDID), 1); + BEGIN_NVC0(push, NVC0_CP(GRIDID), 1); PUSH_DATA (push, 0x1); - BEGIN_NVC0(push, SUBC_COMPUTE(0x036c), 1); + BEGIN_NVC0(push, SUBC_CP(0x036c), 1); PUSH_DATA (push, 0); - BEGIN_NVC0(push, NVC0_COMPUTE(FLUSH), 1); + BEGIN_NVC0(push, NVC0_CP(FLUSH), 1); PUSH_DATA (push, NVC0_COMPUTE_FLUSH_GLOBAL | NVC0_COMPUTE_FLUSH_UNK8); /* block setup */ - BEGIN_NVC0(push, NVC0_COMPUTE(BLOCKDIM_YX), 2); + BEGIN_NVC0(push, NVC0_CP(BLOCKDIM_YX), 2); PUSH_DATA (push, (info->block[1] << 16) | info->block[0]); PUSH_DATA (push, info->block[2]); if (unlikely(info->indirect)) { struct nv04_resource *res = nv04_resource(info->indirect); uint32_t offset = res->offset + info->indirect_offset; - unsigned macro = NVC0_COMPUTE_MACRO_LAUNCH_GRID_INDIRECT; + unsigned macro = NVC0_CP_MACRO_LAUNCH_GRID_INDIRECT; nouveau_pushbuf_space(push, 16, 0, 1); PUSH_REFN(push, res->bo, NOUVEAU_BO_RD | res->domain); @@ -385,27 +476,26 @@ nvc0_launch_grid(struct pipe_context *pipe, const struct pipe_grid_info *info) NVC0_IB_ENTRY_1_NO_PREFETCH | 3 * 4); } else { /* grid setup */ - BEGIN_NVC0(push, NVC0_COMPUTE(GRIDDIM_YX), 2); + BEGIN_NVC0(push, NVC0_CP(GRIDDIM_YX), 2); PUSH_DATA (push, (info->grid[1] << 16) | info->grid[0]); PUSH_DATA (push, info->grid[2]); /* kernel launching */ - BEGIN_NVC0(push, NVC0_COMPUTE(COMPUTE_BEGIN), 1); + BEGIN_NVC0(push, NVC0_CP(COMPUTE_BEGIN), 1); PUSH_DATA (push, 0); - BEGIN_NVC0(push, SUBC_COMPUTE(0x0a08), 1); + BEGIN_NVC0(push, SUBC_CP(0x0a08), 1); PUSH_DATA (push, 0); - BEGIN_NVC0(push, NVC0_COMPUTE(LAUNCH), 1); + BEGIN_NVC0(push, NVC0_CP(LAUNCH), 1); PUSH_DATA (push, 0x1000); - BEGIN_NVC0(push, NVC0_COMPUTE(COMPUTE_END), 1); + BEGIN_NVC0(push, NVC0_CP(COMPUTE_END), 1); PUSH_DATA (push, 0); - BEGIN_NVC0(push, SUBC_COMPUTE(0x0360), 1); + BEGIN_NVC0(push, SUBC_CP(0x0360), 1); PUSH_DATA (push, 0x1); } - /* Invalidate all 3D constbufs because they are aliased with COMPUTE. 
*/ - nvc0->dirty |= NVC0_NEW_CONSTBUF; - for (s = 0; s < 5; s++) { - nvc0->constbuf_dirty[s] |= nvc0->constbuf_valid[s]; - nvc0->state.uniform_buffer_bound[s] = 0; - } + /* TODO: Not sure if this is really necessary. */ + nvc0_compute_invalidate_surfaces(nvc0, 5); + nouveau_bufctx_reset(nvc0->bufctx_cp, NVC0_BIND_CP_SUF); + nvc0->dirty_cp |= NVC0_NEW_CP_SURFACES; + nvc0->images_dirty[5] |= nvc0->images_valid[5]; } diff --git a/lib/mesa/src/gallium/drivers/nouveau/nvc0/nvc0_context.c b/lib/mesa/src/gallium/drivers/nouveau/nvc0/nvc0_context.c index 66e7f95c2..c711cb07d 100644 --- a/lib/mesa/src/gallium/drivers/nouveau/nvc0/nvc0_context.c +++ b/lib/mesa/src/gallium/drivers/nouveau/nvc0/nvc0_context.c @@ -90,9 +90,47 @@ nvc0_memory_barrier(struct pipe_context *pipe, unsigned flags) nvc0->cb_dirty = true; } } + } else { + /* Pretty much any writing by shaders needs a serialize after + * it. Especially when moving between 3d and compute pipelines, but even + * without that. + */ + IMMED_NVC0(push, NVC0_3D(SERIALIZE), 0); } - if (flags & PIPE_BARRIER_SHADER_BUFFER) { - IMMED_NVC0(push, NVC0_3D(MEM_BARRIER), 0x1011); + + /* If we're going to texture from a buffer/image written by a shader, we + * must flush the texture cache. + */ + if (flags & PIPE_BARRIER_TEXTURE) + IMMED_NVC0(push, NVC0_3D(TEX_CACHE_CTL), 0); + + if (flags & PIPE_BARRIER_CONSTANT_BUFFER) + nvc0->cb_dirty = true; + if (flags & (PIPE_BARRIER_VERTEX_BUFFER | PIPE_BARRIER_INDEX_BUFFER)) + nvc0->base.vbo_dirty = true; +} + +static void +nvc0_emit_string_marker(struct pipe_context *pipe, const char *str, int len) +{ + struct nouveau_pushbuf *push = nvc0_context(pipe)->base.pushbuf; + int string_words = len / 4; + int data_words; + + if (len <= 0) + return; + string_words = MIN2(string_words, NV04_PFIFO_MAX_PACKET_LEN); + if (string_words == NV04_PFIFO_MAX_PACKET_LEN) + data_words = string_words; + else + data_words = string_words + !!(len & 3); + BEGIN_NIC0(push, SUBC_3D(NV04_GRAPH_NOP), data_words); + if (string_words) + PUSH_DATAp(push, str, string_words); + if (string_words != data_words) { + int data = 0; + memcpy(&data, &str[string_words * 4], len & 3); + PUSH_DATA (push, data); } } @@ -119,6 +157,15 @@ nvc0_context_unreference_resources(struct nvc0_context *nvc0) for (i = 0; i < NVC0_MAX_PIPE_CONSTBUFS; ++i) if (!nvc0->constbuf[s][i].user) pipe_resource_reference(&nvc0->constbuf[s][i].u.buf, NULL); + + for (i = 0; i < NVC0_MAX_BUFFERS; ++i) + pipe_resource_reference(&nvc0->buffers[s][i].buffer, NULL); + + for (i = 0; i < NVC0_MAX_IMAGES; ++i) { + pipe_resource_reference(&nvc0->images[s][i].resource, NULL); + if (nvc0->screen->base.class_3d >= GM107_3D_CLASS) + pipe_sampler_view_reference(&nvc0->images_tic[s][i], NULL); + } } for (s = 0; s < 2; ++s) { @@ -126,10 +173,6 @@ nvc0_context_unreference_resources(struct nvc0_context *nvc0) pipe_surface_reference(&nvc0->surfaces[s][i], NULL); } - for (s = 0; s < 6; ++s) - for (i = 0; i < NVC0_MAX_BUFFERS; ++i) - pipe_resource_reference(&nvc0->buffers[s][i].buffer, NULL); - for (i = 0; i < nvc0->num_tfbbufs; ++i) pipe_so_target_reference(&nvc0->tfbbuf[i], NULL); @@ -194,8 +237,8 @@ nvc0_invalidate_resource_storage(struct nouveau_context *ctx, for (i = 0; i < nvc0->framebuffer.nr_cbufs; ++i) { if (nvc0->framebuffer.cbufs[i] && nvc0->framebuffer.cbufs[i]->texture == res) { - nvc0->dirty |= NVC0_NEW_FRAMEBUFFER; - nouveau_bufctx_reset(nvc0->bufctx_3d, NVC0_BIND_FB); + nvc0->dirty_3d |= NVC0_NEW_3D_FRAMEBUFFER; + nouveau_bufctx_reset(nvc0->bufctx_3d, NVC0_BIND_3D_FB); if (!--ref) return ref; 
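[Aside: a quick worked example for the nvc0_emit_string_marker packing added above. The marker string is pushed as NV04_GRAPH_NOP data words, so a length that is not a multiple of four needs one extra, zero-padded word; for a 10-byte string, len / 4 yields two full words and the trailing two bytes are memcpy'd into a third:]

```c
#include <stdio.h>

int main(void)
{
   int len = 10;
   int string_words = len / 4;                   /* 2 full words      */
   int data_words = string_words + !!(len & 3);  /* plus 1 padded     */
   printf("%d data words\n", data_words);        /* prints 3          */
   return 0;
}
```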
} @@ -204,8 +247,8 @@ nvc0_invalidate_resource_storage(struct nouveau_context *ctx, if (res->bind & PIPE_BIND_DEPTH_STENCIL) { if (nvc0->framebuffer.zsbuf && nvc0->framebuffer.zsbuf->texture == res) { - nvc0->dirty |= NVC0_NEW_FRAMEBUFFER; - nouveau_bufctx_reset(nvc0->bufctx_3d, NVC0_BIND_FB); + nvc0->dirty_3d |= NVC0_NEW_3D_FRAMEBUFFER; + nouveau_bufctx_reset(nvc0->bufctx_3d, NVC0_BIND_3D_FB); if (!--ref) return ref; } @@ -214,69 +257,91 @@ nvc0_invalidate_resource_storage(struct nouveau_context *ctx, if (res->target == PIPE_BUFFER) { for (i = 0; i < nvc0->num_vtxbufs; ++i) { if (nvc0->vtxbuf[i].buffer == res) { - nvc0->dirty |= NVC0_NEW_ARRAYS; - nouveau_bufctx_reset(nvc0->bufctx_3d, NVC0_BIND_VTX); + nvc0->dirty_3d |= NVC0_NEW_3D_ARRAYS; + nouveau_bufctx_reset(nvc0->bufctx_3d, NVC0_BIND_3D_VTX); if (!--ref) return ref; } } if (nvc0->idxbuf.buffer == res) { - nvc0->dirty |= NVC0_NEW_IDXBUF; - nouveau_bufctx_reset(nvc0->bufctx_3d, NVC0_BIND_IDX); + nvc0->dirty_3d |= NVC0_NEW_3D_IDXBUF; + nouveau_bufctx_reset(nvc0->bufctx_3d, NVC0_BIND_3D_IDX); if (!--ref) return ref; } - for (s = 0; s < 5; ++s) { - for (i = 0; i < nvc0->num_textures[s]; ++i) { - if (nvc0->textures[s][i] && - nvc0->textures[s][i]->texture == res) { - nvc0->textures_dirty[s] |= 1 << i; - nvc0->dirty |= NVC0_NEW_TEXTURES; - nouveau_bufctx_reset(nvc0->bufctx_3d, NVC0_BIND_TEX(s, i)); - if (!--ref) - return ref; + for (s = 0; s < 6; ++s) { + for (i = 0; i < nvc0->num_textures[s]; ++i) { + if (nvc0->textures[s][i] && + nvc0->textures[s][i]->texture == res) { + nvc0->textures_dirty[s] |= 1 << i; + if (unlikely(s == 5)) { + nvc0->dirty_cp |= NVC0_NEW_CP_TEXTURES; + nouveau_bufctx_reset(nvc0->bufctx_cp, NVC0_BIND_CP_TEX(i)); + } else { + nvc0->dirty_3d |= NVC0_NEW_3D_TEXTURES; + nouveau_bufctx_reset(nvc0->bufctx_3d, NVC0_BIND_3D_TEX(s, i)); + } + if (!--ref) + return ref; + } } } - } for (s = 0; s < 6; ++s) { - for (i = 0; i < NVC0_MAX_PIPE_CONSTBUFS; ++i) { - if (!(nvc0->constbuf_valid[s] & (1 << i))) - continue; - if (!nvc0->constbuf[s][i].user && - nvc0->constbuf[s][i].u.buf == res) { - nvc0->constbuf_dirty[s] |= 1 << i; - if (unlikely(s == 5)) { - nvc0->dirty_cp |= NVC0_NEW_CP_CONSTBUF; - nouveau_bufctx_reset(nvc0->bufctx_cp, NVC0_BIND_CP_CB(i)); - } else { - nvc0->dirty |= NVC0_NEW_CONSTBUF; - nouveau_bufctx_reset(nvc0->bufctx_3d, NVC0_BIND_CB(s, i)); + for (i = 0; i < NVC0_MAX_PIPE_CONSTBUFS; ++i) { + if (!(nvc0->constbuf_valid[s] & (1 << i))) + continue; + if (!nvc0->constbuf[s][i].user && + nvc0->constbuf[s][i].u.buf == res) { + nvc0->constbuf_dirty[s] |= 1 << i; + if (unlikely(s == 5)) { + nvc0->dirty_cp |= NVC0_NEW_CP_CONSTBUF; + nouveau_bufctx_reset(nvc0->bufctx_cp, NVC0_BIND_CP_CB(i)); + } else { + nvc0->dirty_3d |= NVC0_NEW_3D_CONSTBUF; + nouveau_bufctx_reset(nvc0->bufctx_3d, NVC0_BIND_3D_CB(s, i)); + } + if (!--ref) + return ref; } - if (!--ref) - return ref; } } + + for (s = 0; s < 6; ++s) { + for (i = 0; i < NVC0_MAX_BUFFERS; ++i) { + if (nvc0->buffers[s][i].buffer == res) { + nvc0->buffers_dirty[s] |= 1 << i; + if (unlikely(s == 5)) { + nvc0->dirty_cp |= NVC0_NEW_CP_BUFFERS; + nouveau_bufctx_reset(nvc0->bufctx_cp, NVC0_BIND_CP_BUF); + } else { + nvc0->dirty_3d |= NVC0_NEW_3D_BUFFERS; + nouveau_bufctx_reset(nvc0->bufctx_3d, NVC0_BIND_3D_BUF); + } + if (!--ref) + return ref; + } + } } for (s = 0; s < 6; ++s) { - for (i = 0; i < NVC0_MAX_BUFFERS; ++i) { - if (nvc0->buffers[s][i].buffer == res) { - nvc0->buffers_dirty[s] |= 1 << i; - if (unlikely(s == 5)) { - nvc0->dirty_cp |= NVC0_NEW_CP_BUFFERS; - 
nouveau_bufctx_reset(nvc0->bufctx_cp, NVC0_BIND_CP_BUF); - } else { - nvc0->dirty |= NVC0_NEW_BUFFERS; - nouveau_bufctx_reset(nvc0->bufctx_3d, NVC0_BIND_BUF); + for (i = 0; i < NVC0_MAX_IMAGES; ++i) { + if (nvc0->images[s][i].resource == res) { + nvc0->images_dirty[s] |= 1 << i; + if (unlikely(s == 5)) { + nvc0->dirty_cp |= NVC0_NEW_CP_SURFACES; + nouveau_bufctx_reset(nvc0->bufctx_cp, NVC0_BIND_CP_SUF); + } else { + nvc0->dirty_3d |= NVC0_NEW_3D_SURFACES; + nouveau_bufctx_reset(nvc0->bufctx_3d, NVC0_BIND_3D_SUF); + } } if (!--ref) return ref; } } - } } return ref; @@ -333,6 +398,7 @@ nvc0_create(struct pipe_screen *pscreen, void *priv, unsigned ctxflags) pipe->texture_barrier = nvc0_texture_barrier; pipe->memory_barrier = nvc0_memory_barrier; pipe->get_sample_position = nvc0_context_get_sample_position; + pipe->emit_string_marker = nvc0_emit_string_marker; nouveau_context_init(&nvc0->base); nvc0_init_query_functions(nvc0); @@ -352,7 +418,7 @@ nvc0_create(struct pipe_screen *pscreen, void *priv, unsigned ctxflags) if (!nvc0->tcp_empty) goto out_err; /* set the empty tctl prog on next draw in case one is never set */ - nvc0->dirty |= NVC0_NEW_TCTLPROG; + nvc0->dirty_3d |= NVC0_NEW_3D_TCTLPROG; /* Do not bind the COMPUTE driver constbuf at screen initialization because * CBs are aliased between 3D and COMPUTE, but make sure it will be bound if @@ -373,26 +439,25 @@ nvc0_create(struct pipe_screen *pscreen, void *priv, unsigned ctxflags) flags = NV_VRAM_DOMAIN(&screen->base) | NOUVEAU_BO_RD; - BCTX_REFN_bo(nvc0->bufctx_3d, SCREEN, flags, screen->text); - BCTX_REFN_bo(nvc0->bufctx_3d, SCREEN, flags, screen->uniform_bo); - BCTX_REFN_bo(nvc0->bufctx_3d, SCREEN, flags, screen->txc); + BCTX_REFN_bo(nvc0->bufctx_3d, 3D_TEXT, flags, screen->text); + BCTX_REFN_bo(nvc0->bufctx_3d, 3D_SCREEN, flags, screen->uniform_bo); + BCTX_REFN_bo(nvc0->bufctx_3d, 3D_SCREEN, flags, screen->txc); if (screen->compute) { - BCTX_REFN_bo(nvc0->bufctx_cp, CP_SCREEN, flags, screen->text); + BCTX_REFN_bo(nvc0->bufctx_cp, CP_TEXT, flags, screen->text); BCTX_REFN_bo(nvc0->bufctx_cp, CP_SCREEN, flags, screen->uniform_bo); BCTX_REFN_bo(nvc0->bufctx_cp, CP_SCREEN, flags, screen->txc); - BCTX_REFN_bo(nvc0->bufctx_cp, CP_SCREEN, flags, screen->parm); } flags = NV_VRAM_DOMAIN(&screen->base) | NOUVEAU_BO_RDWR; if (screen->poly_cache) - BCTX_REFN_bo(nvc0->bufctx_3d, SCREEN, flags, screen->poly_cache); + BCTX_REFN_bo(nvc0->bufctx_3d, 3D_SCREEN, flags, screen->poly_cache); if (screen->compute) BCTX_REFN_bo(nvc0->bufctx_cp, CP_SCREEN, flags, screen->tls); flags = NOUVEAU_BO_GART | NOUVEAU_BO_WR; - BCTX_REFN_bo(nvc0->bufctx_3d, SCREEN, flags, screen->fence.bo); + BCTX_REFN_bo(nvc0->bufctx_3d, 3D_SCREEN, flags, screen->fence.bo); BCTX_REFN_bo(nvc0->bufctx, FENCE, flags, screen->fence.bo); if (screen->compute) BCTX_REFN_bo(nvc0->bufctx_cp, CP_SCREEN, flags, screen->fence.bo); @@ -437,10 +502,8 @@ nvc0_bufctx_fence(struct nvc0_context *nvc0, struct nouveau_bufctx *bufctx, NOUVEAU_DRV_STAT(&nvc0->screen->base, resource_validate_count, count); } -static void -nvc0_context_get_sample_position(struct pipe_context *pipe, - unsigned sample_count, unsigned sample_index, - float *xy) +const void * +nvc0_get_sample_locations(unsigned sample_count) { static const uint8_t ms1[1][2] = { { 0x8, 0x8 } }; static const uint8_t ms2[2][2] = { @@ -472,8 +535,22 @@ nvc0_context_get_sample_position(struct pipe_context *pipe, case 8: ptr = ms8; break; default: assert(0); - return; /* bad sample count -> undefined locations */ + return NULL; /* bad sample count -> 
undefined locations */ } + return ptr; +} + +static void +nvc0_context_get_sample_position(struct pipe_context *pipe, + unsigned sample_count, unsigned sample_index, + float *xy) +{ + const uint8_t (*ptr)[2]; + + ptr = nvc0_get_sample_locations(sample_count); + if (!ptr) + return; + xy[0] = ptr[sample_index][0] * 0.0625f; xy[1] = ptr[sample_index][1] * 0.0625f; } diff --git a/lib/mesa/src/gallium/drivers/nouveau/nvc0/nvc0_context.h b/lib/mesa/src/gallium/drivers/nouveau/nvc0/nvc0_context.h index 7e046c10b..37aecae90 100644 --- a/lib/mesa/src/gallium/drivers/nouveau/nvc0/nvc0_context.h +++ b/lib/mesa/src/gallium/drivers/nouveau/nvc0/nvc0_context.h @@ -27,37 +27,39 @@ #include "nv50/nv50_2d.xml.h" #include "nvc0/nvc0_m2mf.xml.h" #include "nvc0/nve4_p2mf.xml.h" +#include "nvc0/nvc0_compute.xml.h" #include "nvc0/nvc0_macros.h" -/* NOTE: must keep NVC0_NEW_...PROG in consecutive bits in this order */ -#define NVC0_NEW_BLEND (1 << 0) -#define NVC0_NEW_RASTERIZER (1 << 1) -#define NVC0_NEW_ZSA (1 << 2) -#define NVC0_NEW_VERTPROG (1 << 3) -#define NVC0_NEW_TCTLPROG (1 << 4) -#define NVC0_NEW_TEVLPROG (1 << 5) -#define NVC0_NEW_GMTYPROG (1 << 6) -#define NVC0_NEW_FRAGPROG (1 << 7) -#define NVC0_NEW_BLEND_COLOUR (1 << 8) -#define NVC0_NEW_STENCIL_REF (1 << 9) -#define NVC0_NEW_CLIP (1 << 10) -#define NVC0_NEW_SAMPLE_MASK (1 << 11) -#define NVC0_NEW_FRAMEBUFFER (1 << 12) -#define NVC0_NEW_STIPPLE (1 << 13) -#define NVC0_NEW_SCISSOR (1 << 14) -#define NVC0_NEW_VIEWPORT (1 << 15) -#define NVC0_NEW_ARRAYS (1 << 16) -#define NVC0_NEW_VERTEX (1 << 17) -#define NVC0_NEW_CONSTBUF (1 << 18) -#define NVC0_NEW_TEXTURES (1 << 19) -#define NVC0_NEW_SAMPLERS (1 << 20) -#define NVC0_NEW_TFB_TARGETS (1 << 21) -#define NVC0_NEW_IDXBUF (1 << 22) -#define NVC0_NEW_SURFACES (1 << 23) -#define NVC0_NEW_MIN_SAMPLES (1 << 24) -#define NVC0_NEW_TESSFACTOR (1 << 25) -#define NVC0_NEW_BUFFERS (1 << 26) -#define NVC0_NEW_DRIVERCONST (1 << 27) +/* NOTE: must keep NVC0_NEW_3D_...PROG in consecutive bits in this order */ +#define NVC0_NEW_3D_BLEND (1 << 0) +#define NVC0_NEW_3D_RASTERIZER (1 << 1) +#define NVC0_NEW_3D_ZSA (1 << 2) +#define NVC0_NEW_3D_VERTPROG (1 << 3) +#define NVC0_NEW_3D_TCTLPROG (1 << 4) +#define NVC0_NEW_3D_TEVLPROG (1 << 5) +#define NVC0_NEW_3D_GMTYPROG (1 << 6) +#define NVC0_NEW_3D_FRAGPROG (1 << 7) +#define NVC0_NEW_3D_BLEND_COLOUR (1 << 8) +#define NVC0_NEW_3D_STENCIL_REF (1 << 9) +#define NVC0_NEW_3D_CLIP (1 << 10) +#define NVC0_NEW_3D_SAMPLE_MASK (1 << 11) +#define NVC0_NEW_3D_FRAMEBUFFER (1 << 12) +#define NVC0_NEW_3D_STIPPLE (1 << 13) +#define NVC0_NEW_3D_SCISSOR (1 << 14) +#define NVC0_NEW_3D_VIEWPORT (1 << 15) +#define NVC0_NEW_3D_ARRAYS (1 << 16) +#define NVC0_NEW_3D_VERTEX (1 << 17) +#define NVC0_NEW_3D_CONSTBUF (1 << 18) +#define NVC0_NEW_3D_TEXTURES (1 << 19) +#define NVC0_NEW_3D_SAMPLERS (1 << 20) +#define NVC0_NEW_3D_TFB_TARGETS (1 << 21) +#define NVC0_NEW_3D_IDXBUF (1 << 22) +#define NVC0_NEW_3D_SURFACES (1 << 23) +#define NVC0_NEW_3D_MIN_SAMPLES (1 << 24) +#define NVC0_NEW_3D_TESSFACTOR (1 << 25) +#define NVC0_NEW_3D_BUFFERS (1 << 26) +#define NVC0_NEW_3D_DRIVERCONST (1 << 27) +#define NVC0_NEW_3D_WINDOW_RECTS (1 << 28) #define NVC0_NEW_CP_PROGRAM (1 << 0) #define NVC0_NEW_CP_SURFACES (1 << 1) @@ -69,18 +71,19 @@ #define NVC0_NEW_CP_BUFFERS (1 << 7) /* 3d bufctx (during draw_vbo, blit_3d) */ -#define NVC0_BIND_FB 0 -#define NVC0_BIND_VTX 1 -#define NVC0_BIND_VTX_TMP 2 -#define NVC0_BIND_IDX 3 -#define NVC0_BIND_TEX(s, i) ( 4 + 32 * (s) + (i)) -#define NVC0_BIND_CB(s, i) (164 + 16 * (s) + 
(i)) -#define NVC0_BIND_TFB 244 -#define NVC0_BIND_SUF 245 -#define NVC0_BIND_BUF 246 -#define NVC0_BIND_SCREEN 247 -#define NVC0_BIND_TLS 249 -#define NVC0_BIND_3D_COUNT 250 +#define NVC0_BIND_3D_FB 0 +#define NVC0_BIND_3D_VTX 1 +#define NVC0_BIND_3D_VTX_TMP 2 +#define NVC0_BIND_3D_IDX 3 +#define NVC0_BIND_3D_TEX(s, i) ( 4 + 32 * (s) + (i)) +#define NVC0_BIND_3D_CB(s, i) (164 + 16 * (s) + (i)) +#define NVC0_BIND_3D_TFB 244 +#define NVC0_BIND_3D_SUF 245 +#define NVC0_BIND_3D_BUF 246 +#define NVC0_BIND_3D_SCREEN 247 +#define NVC0_BIND_3D_TLS 249 +#define NVC0_BIND_3D_TEXT 250 +#define NVC0_BIND_3D_COUNT 251 /* compute bufctx (during launch_grid) */ #define NVC0_BIND_CP_CB(i) ( 0 + (i)) @@ -91,13 +94,54 @@ #define NVC0_BIND_CP_SCREEN 51 #define NVC0_BIND_CP_QUERY 52 #define NVC0_BIND_CP_BUF 53 -#define NVC0_BIND_CP_COUNT 54 +#define NVC0_BIND_CP_TEXT 54 +#define NVC0_BIND_CP_COUNT 55 /* bufctx for other operations */ #define NVC0_BIND_2D 0 #define NVC0_BIND_M2MF 0 #define NVC0_BIND_FENCE 1 +/* 6 user uniform buffers, at 64K each */ +#define NVC0_CB_USR_INFO(s) (s << 16) +#define NVC0_CB_USR_SIZE (6 << 16) +/* 6 driver constbuts, at 2K each */ +#define NVC0_CB_AUX_INFO(s) NVC0_CB_USR_SIZE + (s << 11) +#define NVC0_CB_AUX_SIZE (1 << 11) +/* XXX: Figure out what this UNK data is. */ +#define NVC0_CB_AUX_UNK_INFO 0x000 +#define NVC0_CB_AUX_UNK_SIZE (8 * 4) +/* 40 textures handles (8 for GM107+ images only), at 1 32-bits integer each */ +#define NVC0_CB_AUX_TEX_INFO(i) 0x020 + (i) * 4 +#define NVC0_CB_AUX_TEX_SIZE (40 * 4) +/* 8 sets of 32-bits coordinate offsets */ +#define NVC0_CB_AUX_MS_INFO 0x0c0 +#define NVC0_CB_AUX_MS_SIZE (8 * 2 * 4) +/* block/grid size, at 3 32-bits integers each, gridid and work_dim */ +#define NVC0_CB_AUX_GRID_INFO(i) 0x100 + (i) * 4 /* CP */ +#define NVC0_CB_AUX_GRID_SIZE (8 * 4) +/* 8 user clip planes, at 4 32-bits floats each */ +#define NVC0_CB_AUX_UCP_INFO 0x120 +#define NVC0_CB_AUX_UCP_SIZE (PIPE_MAX_CLIP_PLANES * 4 * 4) +/* 13 ubos, at 4 32-bits integer each */ +#define NVC0_CB_AUX_UBO_INFO(i) 0x120 + (i) * 4 * 4 /* CP */ +#define NVC0_CB_AUX_UBO_SIZE ((NVC0_MAX_PIPE_CONSTBUFS - 1) * 4 * 4) +/* 8 sets of 32-bits integer pairs sample offsets */ +#define NVC0_CB_AUX_SAMPLE_INFO 0x1a0 /* FP */ +#define NVC0_CB_AUX_SAMPLE_SIZE (8 * 4 * 2) +/* draw parameters (index bais, base instance, drawid) */ +#define NVC0_CB_AUX_DRAW_INFO 0x1a0 /* VP */ +/* 32 user buffers, at 4 32-bits integers each */ +#define NVC0_CB_AUX_BUF_INFO(i) 0x220 + (i) * 4 * 4 +#define NVC0_CB_AUX_BUF_SIZE (NVC0_MAX_BUFFERS * 4 * 4) +/* 8 surfaces, at 16 32-bits integers each */ +#define NVC0_CB_AUX_SU_INFO(i) 0x420 + (i) * 16 * 4 +#define NVC0_CB_AUX_SU_SIZE (NVC0_MAX_IMAGES * 16 * 4) +/* 1 64-bits address and 1 32-bits sequence */ +#define NVC0_CB_AUX_MP_INFO 0x620 +#define NVC0_CB_AUX_MP_SIZE 3 * 4 +/* 4 32-bits floats for the vertex runout, put at the end */ +#define NVC0_CB_AUX_RUNOUT_INFO NVC0_CB_USR_SIZE + (NVC0_CB_AUX_SIZE * 6) struct nvc0_blitctx; @@ -118,7 +162,7 @@ struct nvc0_context { const struct nv50_m2mf_rect *src, uint32_t nblocksx, uint32_t nblocksy); - uint32_t dirty; + uint32_t dirty_3d; /* dirty flags for 3d state */ uint32_t dirty_cp; /* dirty flags for compute state */ struct nvc0_graph_state state; @@ -160,7 +204,7 @@ struct nvc0_context { uint32_t textures_coherent[6]; struct nv50_tsc_entry *samplers[6][PIPE_MAX_SAMPLERS]; unsigned num_samplers[6]; - uint16_t samplers_dirty[6]; + uint32_t samplers_dirty[6]; bool seamless_cube_map; uint32_t tex_handles[6][PIPE_MAX_SAMPLERS]; 
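[Aside: the NVC0_CB_* defines above describe the single uniform_bo layout that replaces the old per-screen parm buffer: six 64 KiB user-uniform ranges followed by six 2 KiB driver ("aux") ranges. A small worked example of the resulting offsets; the macros are re-parenthesized here for standalone use, and the values match the defines above:]

```c
#include <stdio.h>

#define NVC0_CB_USR_INFO(s)   ((s) << 16)
#define NVC0_CB_USR_SIZE      (6 << 16)
#define NVC0_CB_AUX_INFO(s)   (NVC0_CB_USR_SIZE + ((s) << 11))
#define NVC0_CB_AUX_DRAW_INFO 0x1a0

int main(void)
{
   /* compute stage (s == 5): user uniforms at 0x50000, aux at 0x62800 */
   printf("usr(5) = 0x%x\n", NVC0_CB_USR_INFO(5));  /* 0x50000 */
   printf("aux(5) = 0x%x\n", NVC0_CB_AUX_INFO(5));  /* 0x62800 */
   /* draw parameters sit at aux + 0x1a0 -- the same 0x1a0 the com9097
    * draw macros now send as CB_POS instead of 0x180 */
   printf("draw info at aux + 0x%x\n", NVC0_CB_AUX_DRAW_INFO);
   return 0;
}
```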
/* for nve4 */ @@ -174,6 +218,7 @@ struct nvc0_context { struct pipe_viewport_state viewports[NVC0_MAX_VIEWPORTS]; unsigned viewports_dirty; struct pipe_clip_state clip; + struct nvc0_window_rect_stateobj window_rect; unsigned sample_mask; unsigned min_samples; @@ -203,6 +248,11 @@ struct nvc0_context { uint32_t buffers_dirty[6]; uint32_t buffers_valid[6]; + struct pipe_image_view images[6][NVC0_MAX_IMAGES]; + struct pipe_sampler_view *images_tic[6][NVC0_MAX_IMAGES]; /* GM107+ */ + uint16_t images_dirty[6]; + uint16_t images_valid[6]; + struct util_dynarray global_residents; }; @@ -234,6 +284,7 @@ struct pipe_context *nvc0_create(struct pipe_screen *, void *, unsigned flags); void nvc0_bufctx_fence(struct nvc0_context *, struct nouveau_bufctx *, bool on_flush); void nvc0_default_kick_notify(struct nouveau_pushbuf *); +const void *nvc0_get_sample_locations(unsigned); /* nvc0_draw.c */ extern struct draw_stage *nvc0_draw_render_stage(struct nvc0_context *); @@ -241,7 +292,7 @@ extern struct draw_stage *nvc0_draw_render_stage(struct nvc0_context *); /* nvc0_program.c */ bool nvc0_program_translate(struct nvc0_program *, uint16_t chipset, struct pipe_debug_callback *); -bool nvc0_program_upload_code(struct nvc0_context *, struct nvc0_program *); +bool nvc0_program_upload(struct nvc0_context *, struct nvc0_program *); void nvc0_program_destroy(struct nvc0_context *, struct nvc0_program *); void nvc0_program_library_upload(struct nvc0_context *); uint32_t nvc0_program_symbol_offset(const struct nvc0_program *, @@ -254,6 +305,7 @@ void nvc0_tctlprog_validate(struct nvc0_context *); void nvc0_tevlprog_validate(struct nvc0_context *); void nvc0_gmtyprog_validate(struct nvc0_context *); void nvc0_fragprog_validate(struct nvc0_context *); +void nvc0_compprog_validate(struct nvc0_context *); void nvc0_tfb_validate(struct nvc0_context *); @@ -261,9 +313,15 @@ void nvc0_tfb_validate(struct nvc0_context *); extern void nvc0_init_state_functions(struct nvc0_context *); /* nvc0_state_validate.c */ -void nvc0_validate_global_residents(struct nvc0_context *, - struct nouveau_bufctx *, int bin); -bool nvc0_state_validate(struct nvc0_context *, uint32_t state_mask); +struct nvc0_state_validate { + void (*func)(struct nvc0_context *); + uint32_t states; +}; + +bool nvc0_state_validate(struct nvc0_context *, uint32_t, + struct nvc0_state_validate *, int, uint32_t *, + struct nouveau_bufctx *); +bool nvc0_state_validate_3d(struct nvc0_context *, uint32_t); /* nvc0_surface.c */ extern void nvc0_clear(struct pipe_context *, unsigned buffers, @@ -275,12 +333,16 @@ extern void nvc0_init_surface_functions(struct nvc0_context *); bool nvc0_validate_tic(struct nvc0_context *nvc0, int s); bool nvc0_validate_tsc(struct nvc0_context *nvc0, int s); bool nve4_validate_tsc(struct nvc0_context *nvc0, int s); +void nvc0_validate_suf(struct nvc0_context *nvc0, int s); void nvc0_validate_textures(struct nvc0_context *); void nvc0_validate_samplers(struct nvc0_context *); void nve4_set_tex_handles(struct nvc0_context *); void nvc0_validate_surfaces(struct nvc0_context *); -void nve4_set_surface_info(struct nouveau_pushbuf *, struct pipe_surface *, - struct nvc0_screen *); +void nve4_set_surface_info(struct nouveau_pushbuf *, struct pipe_image_view *, + struct nvc0_context *); +void nvc0_mark_image_range_valid(const struct pipe_image_view *); +void nvc0_update_tic(struct nvc0_context *, struct nv50_tic_entry *, + struct nv04_resource *); struct pipe_sampler_view * nvc0_create_texture_view(struct pipe_context *, @@ -292,6 +354,9 @@ 
struct pipe_sampler_view * nvc0_create_sampler_view(struct pipe_context *, struct pipe_resource *, const struct pipe_sampler_view *); +struct pipe_sampler_view * +gm107_create_texture_view_from_image(struct pipe_context *, + const struct pipe_image_view *); /* nvc0_transfer.c */ void @@ -342,5 +407,6 @@ void nve4_launch_grid(struct pipe_context *, const struct pipe_grid_info *); /* nvc0_compute.c */ void nvc0_launch_grid(struct pipe_context *, const struct pipe_grid_info *); +void nvc0_compute_validate_globals(struct nvc0_context *); #endif diff --git a/lib/mesa/src/gallium/drivers/nouveau/nvc0/nvc0_macros.h b/lib/mesa/src/gallium/drivers/nouveau/nvc0/nvc0_macros.h index 57262fe0e..eeacc714f 100644 --- a/lib/mesa/src/gallium/drivers/nouveau/nvc0/nvc0_macros.h +++ b/lib/mesa/src/gallium/drivers/nouveau/nvc0/nvc0_macros.h @@ -35,6 +35,6 @@ #define NVC0_3D_MACRO_QUERY_BUFFER_WRITE 0x00003858 -#define NVC0_COMPUTE_MACRO_LAUNCH_GRID_INDIRECT 0x00003860 +#define NVC0_CP_MACRO_LAUNCH_GRID_INDIRECT 0x00003860 #endif /* __NVC0_MACROS_H__ */ diff --git a/lib/mesa/src/gallium/drivers/nouveau/nvc0/nvc0_miptree.c b/lib/mesa/src/gallium/drivers/nouveau/nvc0/nvc0_miptree.c index ed1ac4831..27674f72a 100644 --- a/lib/mesa/src/gallium/drivers/nouveau/nvc0/nvc0_miptree.c +++ b/lib/mesa/src/gallium/drivers/nouveau/nvc0/nvc0_miptree.c @@ -240,7 +240,6 @@ const struct u_resource_vtbl nvc0_miptree_vtbl = nvc0_miptree_transfer_map, /* transfer_map */ u_default_transfer_flush_region, /* transfer_flush_region */ nvc0_miptree_transfer_unmap, /* transfer_unmap */ - u_default_transfer_inline_write /* transfer_inline_write */ }; struct pipe_resource * diff --git a/lib/mesa/src/gallium/drivers/nouveau/nvc0/nvc0_program.c b/lib/mesa/src/gallium/drivers/nouveau/nvc0/nvc0_program.c index bc884d6c0..a4a164f15 100644 --- a/lib/mesa/src/gallium/drivers/nouveau/nvc0/nvc0_program.c +++ b/lib/mesa/src/gallium/drivers/nouveau/nvc0/nvc0_program.c @@ -80,6 +80,7 @@ nvc0_shader_output_address(unsigned sn, unsigned si) case TGSI_SEMANTIC_CLIPDIST: return 0x2c0 + si * 0x10; case TGSI_SEMANTIC_CLIPVERTEX: return 0x270; case TGSI_SEMANTIC_TEXCOORD: return 0x300 + si * 0x10; + /* case TGSI_SEMANTIC_VIEWPORT_MASK: return 0x3a0; */ case TGSI_SEMANTIC_EDGEFLAG: return ~0; default: assert(!"invalid TGSI output semantic"); @@ -251,8 +252,9 @@ nvc0_vtgp_gen_header(struct nvc0_program *vp, struct nv50_ir_prog_info *info) } } - vp->vp.clip_enable = - (1 << (info->io.clipDistances + info->io.cullDistances)) - 1; + vp->vp.clip_enable = (1 << info->io.clipDistances) - 1; + vp->vp.cull_enable = + ((1 << info->io.cullDistances) - 1) << info->io.clipDistances; for (i = 0; i < info->io.cullDistances; ++i) vp->vp.clip_mode |= 1 << ((info->io.clipDistances + i) * 4); @@ -293,11 +295,21 @@ nvc0_tp_get_tess_mode(struct nvc0_program *tp, struct nv50_ir_prog_info *info) return; } - if (info->prop.tp.winding > 0) - tp->tp.tess_mode |= NVC0_3D_TESS_MODE_CW; + /* It seems like lines want the "CW" bit to indicate they're connected, and + * spit out errors in dmesg when the "CONNECTED" bit is set. + */ + if (info->prop.tp.outputPrim != PIPE_PRIM_POINTS) { + if (info->prop.tp.domain == PIPE_PRIM_LINES) + tp->tp.tess_mode |= NVC0_3D_TESS_MODE_CW; + else + tp->tp.tess_mode |= NVC0_3D_TESS_MODE_CONNECTED; + } - if (info->prop.tp.outputPrim != PIPE_PRIM_POINTS) - tp->tp.tess_mode |= NVC0_3D_TESS_MODE_CONNECTED; + /* Winding only matters for triangles/quads, not lines. 
*/ + if (info->prop.tp.domain != PIPE_PRIM_LINES && + info->prop.tp.outputPrim != PIPE_PRIM_POINTS && + info->prop.tp.winding > 0) + tp->tp.tess_mode |= NVC0_3D_TESS_MODE_CW; switch (info->prop.tp.partitioning) { case PIPE_TESS_SPACING_EQUAL: @@ -334,6 +346,15 @@ nvc0_tcp_gen_header(struct nvc0_program *tcp, struct nv50_ir_prog_info *info) nvc0_vtgp_gen_header(tcp, info); + if (info->target >= NVISA_GM107_CHIPSET) { + /* On GM107+, the number of output patch components has moved in the TCP + * header, but it seems like blob still also uses the old position. + * Also, the high 8-bits are located inbetween the min/max parallel + * field and has to be set after updating the outputs. */ + tcp->hdr[3] = (opcs & 0x0f) << 28; + tcp->hdr[4] |= (opcs & 0xf0) << 16; + } + nvc0_tp_get_tess_mode(tcp, info); return 0; @@ -381,7 +402,7 @@ nvc0_gp_gen_header(struct nvc0_program *gp, struct nv50_ir_prog_info *info) break; } - gp->hdr[4] = MIN2(info->prop.gp.maxVertices, 1024); + gp->hdr[4] = CLAMP(info->prop.gp.maxVertices, 1, 1024); return nvc0_vtgp_gen_header(gp, info); } @@ -456,7 +477,15 @@ nvc0_fp_gen_header(struct nvc0_program *fp, struct nv50_ir_prog_info *info) fp->hdr[18] |= 0xf << info->out[i].slot[0]; } + /* There are no "regular" attachments, but the shader still needs to be + * executed. It seems like it wants to think that it has some color + * outputs in order to actually run. + */ + if (info->prop.fp.numColourResults == 0 && !info->prop.fp.writesDepth) + fp->hdr[18] |= 0xf; + fp->fp.early_z = info->prop.fp.earlyFragTests; + fp->fp.sample_mask_in = info->prop.fp.usesSampleMaskIn; return 0; } @@ -480,11 +509,14 @@ nvc0_program_create_tfb_state(const struct nv50_ir_prog_info *info, for (i = 0; i < pso->num_outputs; ++i) { unsigned s = pso->output[i].start_component; unsigned p = pso->output[i].dst_offset; + const unsigned r = pso->output[i].register_index; b = pso->output[i].output_buffer; + if (r >= info->numOutputs) + continue; + for (c = 0; c < pso->output[i].num_components; ++c) - tfb->varying_index[b][p++] = - info->out[pso->output[i].register_index].slot[s + c]; + tfb->varying_index[b][p++] = info->out[r].slot[s + c]; tfb->varying_count[b] = MAX2(tfb->varying_count[b], p); tfb->stream[b] = pso->output[i].stream; @@ -533,44 +565,39 @@ nvc0_program_translate(struct nvc0_program *prog, uint16_t chipset, info->bin.sourceRep = NV50_PROGRAM_IR_TGSI; info->bin.source = (void *)prog->pipe.tokens; +#ifdef DEBUG + info->target = debug_get_num_option("NV50_PROG_CHIPSET", chipset); + info->optLevel = debug_get_num_option("NV50_PROG_OPTIMIZE", 3); + info->dbgFlags = debug_get_num_option("NV50_PROG_DEBUG", 0); +#else + info->optLevel = 3; +#endif + info->io.genUserClip = prog->vp.num_ucps; info->io.auxCBSlot = 15; - info->io.ucpBase = 256; - info->io.drawInfoBase = 256 + 128; + info->io.msInfoCBSlot = 15; + info->io.ucpBase = NVC0_CB_AUX_UCP_INFO; + info->io.drawInfoBase = NVC0_CB_AUX_DRAW_INFO; + info->io.msInfoBase = NVC0_CB_AUX_MS_INFO; + info->io.bufInfoBase = NVC0_CB_AUX_BUF_INFO(0); + info->io.suInfoBase = NVC0_CB_AUX_SU_INFO(0); + if (info->target >= NVISA_GK104_CHIPSET) { + info->io.texBindBase = NVC0_CB_AUX_TEX_INFO(0); + } if (prog->type == PIPE_SHADER_COMPUTE) { - if (chipset >= NVISA_GK104_CHIPSET) { - info->io.resInfoCBSlot = 0; - info->io.texBindBase = NVE4_CP_INPUT_TEX(0); - info->io.suInfoBase = NVE4_CP_INPUT_SUF(0); - info->prop.cp.gridInfoBase = NVE4_CP_INPUT_GRID_INFO(0); - } else { - info->io.resInfoCBSlot = 15; - info->io.suInfoBase = 512; + if (info->target >= 
NVISA_GK104_CHIPSET) { + info->io.auxCBSlot = 7; + info->io.msInfoCBSlot = 7; + info->io.uboInfoBase = NVC0_CB_AUX_UBO_INFO(0); } - info->io.msInfoCBSlot = 0; - info->io.msInfoBase = NVE4_CP_INPUT_MS_OFFSETS; + info->prop.cp.gridInfoBase = NVC0_CB_AUX_GRID_INFO(0); } else { - if (chipset >= NVISA_GK104_CHIPSET) { - info->io.texBindBase = 0x20; - info->io.suInfoBase = 0; /* TODO */ - } - info->io.resInfoCBSlot = 15; - info->io.sampleInfoBase = 256 + 128; - info->io.suInfoBase = 512; - info->io.msInfoCBSlot = 15; - info->io.msInfoBase = 0; /* TODO */ + info->io.sampleInfoBase = NVC0_CB_AUX_SAMPLE_INFO; } info->assignSlots = nvc0_program_assign_varying_slots; -#ifdef DEBUG - info->optLevel = debug_get_num_option("NV50_PROG_OPTIMIZE", 3); - info->dbgFlags = debug_get_num_option("NV50_PROG_DEBUG", 0); -#else - info->optLevel = 3; -#endif - ret = nv50_ir_generate_code(info); if (ret) { NOUVEAU_ERR("shader translation failed: %i\n", ret); @@ -581,10 +608,8 @@ nvc0_program_translate(struct nvc0_program *prog, uint16_t chipset, prog->code = info->bin.code; prog->code_size = info->bin.codeSize; - prog->immd_data = info->immd.buf; - prog->immd_size = info->immd.bufSize; prog->relocs = info->bin.relocData; - prog->interps = info->bin.interpData; + prog->fixups = info->bin.fixupData; prog->num_gprs = MAX2(4, (info->bin.maxGPR + 1)); prog->num_barriers = info->numBarriers; @@ -654,28 +679,24 @@ nvc0_program_translate(struct nvc0_program *prog, uint16_t chipset, prog->type, info->bin.tlsSpace, prog->num_gprs, info->bin.instructions, info->bin.codeSize); +#ifdef DEBUG + if (debug_get_option("NV50_PROG_CHIPSET", NULL) && info->dbgFlags) + nvc0_program_dump(prog); +#endif + out: FREE(info); return !ret; } -bool -nvc0_program_upload_code(struct nvc0_context *nvc0, struct nvc0_program *prog) +static inline int +nvc0_program_alloc_code(struct nvc0_context *nvc0, struct nvc0_program *prog) { struct nvc0_screen *screen = nvc0->screen; const bool is_cp = prog->type == PIPE_SHADER_COMPUTE; int ret; uint32_t size = prog->code_size + (is_cp ? 0 : NVC0_SHADER_HEADER_SIZE); - uint32_t lib_pos = screen->lib_code->start; - uint32_t code_pos; - - /* c[] bindings need to be aligned to 0x100, but we could use relocations - * to save space. */ - if (prog->immd_size) { - prog->immd_base = size; - size = align(size, 0x40); - size += prog->immd_size + 0xc0; /* add 0xc0 for align 0x40 -> 0x100 */ - } + /* On Fermi, SP_START_ID must be aligned to 0x40. * On Kepler, the first instruction must be aligned to 0x80 because * latency information is expected only at certain positions. @@ -685,27 +706,9 @@ nvc0_program_upload_code(struct nvc0_context *nvc0, struct nvc0_program *prog) size = align(size, 0x40); ret = nouveau_heap_alloc(screen->text_heap, size, prog, &prog->mem); - if (ret) { - struct nouveau_heap *heap = screen->text_heap; - /* Note that the code library, which is allocated before anything else, - * does not have a priv pointer. We can stop once we hit it. 
- */ - while (heap->next && heap->next->priv) { - struct nvc0_program *evict = heap->next->priv; - nouveau_heap_free(&evict->mem); - } - debug_printf("WARNING: out of code space, evicting all shaders.\n"); - ret = nouveau_heap_alloc(heap, size, prog, &prog->mem); - if (ret) { - NOUVEAU_ERR("shader too large (0x%x) to fit in code space ?\n", size); - return false; - } - IMMED_NVC0(nvc0->base.pushbuf, NVC0_3D(SERIALIZE), 0); - } + if (ret) + return ret; prog->code_base = prog->mem->start; - prog->immd_base = align(prog->mem->start + prog->immd_base, 0x100); - assert((prog->immd_size == 0) || (prog->immd_base + prog->immd_size <= - prog->mem->start + prog->mem->size)); if (!is_cp) { if (screen->base.class_3d >= NVE4_3D_CLASS) { @@ -719,22 +722,32 @@ nvc0_program_upload_code(struct nvc0_context *nvc0, struct nvc0_program *prog) break; } } - code_pos = prog->code_base + NVC0_SHADER_HEADER_SIZE; } else { if (screen->base.class_3d >= NVE4_3D_CLASS) { if (prog->mem->start & 0x40) prog->code_base += 0x40; assert((prog->code_base & 0x7f) == 0x00); } - code_pos = prog->code_base; } + return 0; +} + +static inline void +nvc0_program_upload_code(struct nvc0_context *nvc0, struct nvc0_program *prog) +{ + struct nvc0_screen *screen = nvc0->screen; + const bool is_cp = prog->type == PIPE_SHADER_COMPUTE; + uint32_t code_pos = prog->code_base + (is_cp ? 0 : NVC0_SHADER_HEADER_SIZE); + if (prog->relocs) - nv50_ir_relocate_code(prog->relocs, prog->code, code_pos, lib_pos, 0); - if (prog->interps) { - nv50_ir_change_interp(prog->interps, prog->code, - prog->fp.force_persample_interp, - prog->fp.flatshade); + nv50_ir_relocate_code(prog->relocs, prog->code, code_pos, + screen->lib_code->start, 0); + if (prog->fixups) { + nv50_ir_apply_fixups(prog->fixups, prog->code, + prog->fp.force_persample_interp, + prog->fp.flatshade, + 0 /* alphatest */); for (int i = 0; i < 2; i++) { unsigned mask = prog->fp.color_interp[i] >> 4; unsigned interp = prog->fp.color_interp[i] & 3; @@ -749,21 +762,102 @@ nvc0_program_upload_code(struct nvc0_context *nvc0, struct nvc0_program *prog) } } + if (!is_cp) + nvc0->base.push_data(&nvc0->base, screen->text, prog->code_base, + NV_VRAM_DOMAIN(&screen->base), + NVC0_SHADER_HEADER_SIZE, prog->hdr); + + nvc0->base.push_data(&nvc0->base, screen->text, code_pos, + NV_VRAM_DOMAIN(&screen->base), prog->code_size, + prog->code); +} + +bool +nvc0_program_upload(struct nvc0_context *nvc0, struct nvc0_program *prog) +{ + struct nvc0_screen *screen = nvc0->screen; + const bool is_cp = prog->type == PIPE_SHADER_COMPUTE; + int ret; + uint32_t size = prog->code_size + (is_cp ? 0 : NVC0_SHADER_HEADER_SIZE); + + ret = nvc0_program_alloc_code(nvc0, prog); + if (ret) { + struct nouveau_heap *heap = screen->text_heap; + struct nvc0_program *progs[] = { /* Sorted accordingly to SP_START_ID */ + nvc0->compprog, nvc0->vertprog, nvc0->tctlprog, + nvc0->tevlprog, nvc0->gmtyprog, nvc0->fragprog + }; + + /* Note that the code library, which is allocated before anything else, + * does not have a priv pointer. We can stop once we hit it. + */ + while (heap->next && heap->next->priv) { + struct nvc0_program *evict = heap->next->priv; + nouveau_heap_free(&evict->mem); + } + debug_printf("WARNING: out of code space, evicting all shaders.\n"); + + /* Make sure to synchronize before deleting the code segment. 
*/ + IMMED_NVC0(nvc0->base.pushbuf, NVC0_3D(SERIALIZE), 0); + + if ((screen->text->size << 1) <= (1 << 23)) { + ret = nvc0_screen_resize_text_area(screen, screen->text->size << 1); + if (ret) { + NOUVEAU_ERR("Error allocating TEXT area: %d\n", ret); + return false; + } + nouveau_bufctx_reset(nvc0->bufctx_3d, NVC0_BIND_3D_TEXT); + BCTX_REFN_bo(nvc0->bufctx_3d, 3D_TEXT, + NV_VRAM_DOMAIN(&screen->base) | NOUVEAU_BO_RD, + screen->text); + if (screen->compute) { + nouveau_bufctx_reset(nvc0->bufctx_cp, NVC0_BIND_CP_TEXT); + BCTX_REFN_bo(nvc0->bufctx_cp, CP_TEXT, + NV_VRAM_DOMAIN(&screen->base) | NOUVEAU_BO_RD, + screen->text); + } + + /* Re-upload the builtin function into the new code segment. */ + nvc0_program_library_upload(nvc0); + } + + ret = nvc0_program_alloc_code(nvc0, prog); + if (ret) { + NOUVEAU_ERR("shader too large (0x%x) to fit in code space ?\n", size); + return false; + } + + /* All currently bound shaders have to be reuploaded. */ + for (int i = 0; i < ARRAY_SIZE(progs); i++) { + if (!progs[i] || progs[i] == prog) + continue; + + ret = nvc0_program_alloc_code(nvc0, progs[i]); + if (ret) { + NOUVEAU_ERR("failed to re-upload a shader after code eviction.\n"); + return false; + } + nvc0_program_upload_code(nvc0, progs[i]); + + if (progs[i]->type == PIPE_SHADER_COMPUTE) { + /* Caches have to be invalidated but the CP_START_ID will be + * updated in the launch_grid functions. */ + BEGIN_NVC0(nvc0->base.pushbuf, NVC0_CP(FLUSH), 1); + PUSH_DATA (nvc0->base.pushbuf, NVC0_COMPUTE_FLUSH_CODE); + } else { + BEGIN_NVC0(nvc0->base.pushbuf, NVC0_3D(SP_START_ID(i)), 1); + PUSH_DATA (nvc0->base.pushbuf, progs[i]->code_base); + } + } + } + + nvc0_program_upload_code(nvc0, prog); + #ifdef DEBUG if (debug_get_bool_option("NV50_PROG_DEBUG", false)) nvc0_program_dump(prog); #endif - if (!is_cp) - nvc0->base.push_data(&nvc0->base, screen->text, prog->code_base, - NV_VRAM_DOMAIN(&screen->base), NVC0_SHADER_HEADER_SIZE, prog->hdr); - nvc0->base.push_data(&nvc0->base, screen->text, code_pos, - NV_VRAM_DOMAIN(&screen->base), prog->code_size, prog->code); - if (prog->immd_size) - nvc0->base.push_data(&nvc0->base, - screen->text, prog->immd_base, NV_VRAM_DOMAIN(&screen->base), - prog->immd_size, prog->immd_data); - BEGIN_NVC0(nvc0->base.pushbuf, NVC0_3D(MEM_BARRIER), 1); PUSH_DATA (nvc0->base.pushbuf, 0x1011); @@ -806,9 +900,8 @@ nvc0_program_destroy(struct nvc0_context *nvc0, struct nvc0_program *prog) if (prog->mem) nouveau_heap_free(&prog->mem); FREE(prog->code); /* may be 0 for hardcoded shaders */ - FREE(prog->immd_data); FREE(prog->relocs); - FREE(prog->interps); + FREE(prog->fixups); if (prog->type == PIPE_SHADER_COMPUTE && prog->cp.syms) FREE(prog->cp.syms); if (prog->tfb) { @@ -843,7 +936,7 @@ nvc0_program_init_tcp_empty(struct nvc0_context *nvc0) { struct ureg_program *ureg; - ureg = ureg_create(TGSI_PROCESSOR_TESS_CTRL); + ureg = ureg_create(PIPE_SHADER_TESS_CTRL); if (!ureg) return; diff --git a/lib/mesa/src/gallium/drivers/nouveau/nvc0/nvc0_program.h b/lib/mesa/src/gallium/drivers/nouveau/nvc0/nvc0_program.h index 8b8d221ed..d33aa04e3 100644 --- a/lib/mesa/src/gallium/drivers/nouveau/nvc0/nvc0_program.h +++ b/lib/mesa/src/gallium/drivers/nouveau/nvc0/nvc0_program.h @@ -26,11 +26,8 @@ struct nvc0_program { uint8_t num_gprs; uint32_t *code; - uint32_t *immd_data; unsigned code_base; unsigned code_size; - unsigned immd_base; - unsigned immd_size; /* size of immediate array data */ unsigned parm_size; /* size of non-bindable uniforms (c0[]) */ uint32_t hdr[20]; @@ -39,6 +36,7 @@ struct 
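
The resize path above grows the TEXT segment by doubling until an 8 MiB cap (1 << 23) and only then falls back to evicting and re-uploading everything. A minimal sketch of that growth policy, assuming a hypothetical grow_text() wrapper with resize() standing in for nvc0_screen_resize_text_area():

#include <stdint.h>

#define TEXT_SIZE_MAX (1u << 23) /* 8 MiB cap, as in the hunk above */

/* Hypothetical wrapper: double the code segment until the cap is reached.
 * Returns 0 on success, -1 if the caller must evict or fail instead. */
static int grow_text(uint32_t *size, int (*resize)(uint32_t new_size))
{
   if ((*size << 1) > TEXT_SIZE_MAX)
      return -1;
   if (resize(*size << 1))
      return -1;
   *size <<= 1;
   return 0;
}
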
nvc0_program { struct { uint32_t clip_mode; /* clip/cull selection */ uint8_t clip_enable; /* mask of defined clip planes */ + uint8_t cull_enable; /* mask of defined cull distances */ uint8_t num_ucps; /* also set to max if ClipDistance is used */ uint8_t edgeflag; /* attribute index of edgeflag input */ bool need_vertex_id; @@ -48,6 +46,7 @@ struct nvc0_program { uint8_t early_z; uint8_t colors; uint8_t color_interp[2]; + bool sample_mask_in; bool force_persample_interp; bool flatshade; } fp; @@ -64,7 +63,7 @@ struct nvc0_program { uint8_t num_barriers; void *relocs; - void *interps; + void *fixups; struct nvc0_transform_feedback_state *tfb; diff --git a/lib/mesa/src/gallium/drivers/nouveau/nvc0/nvc0_query.c b/lib/mesa/src/gallium/drivers/nouveau/nvc0/nvc0_query.c index d2acce7d5..91fb72f90 100644 --- a/lib/mesa/src/gallium/drivers/nouveau/nvc0/nvc0_query.c +++ b/lib/mesa/src/gallium/drivers/nouveau/nvc0/nvc0_query.c @@ -58,11 +58,12 @@ nvc0_begin_query(struct pipe_context *pipe, struct pipe_query *pq) return q->funcs->begin_query(nvc0_context(pipe), q); } -static void +static bool nvc0_end_query(struct pipe_context *pipe, struct pipe_query *pq) { struct nvc0_query *q = nvc0_query(pq); q->funcs->end_query(nvc0_context(pipe), q); + return true; } static boolean @@ -204,10 +205,7 @@ nvc0_screen_get_driver_query_group_info(struct pipe_screen *pscreen, if (screen->base.drm->version >= 0x01000101) { if (screen->compute) { - if (screen->base.class_3d == NVE4_3D_CLASS) { - count += 2; - } else - if (screen->base.class_3d < NVE4_3D_CLASS) { + if (screen->base.class_3d <= NVF0_3D_CLASS) { count += 2; } } @@ -227,29 +225,16 @@ nvc0_screen_get_driver_query_group_info(struct pipe_screen *pscreen, * currently only used by AMD_performance_monitor. */ info->max_active_queries = 1; - - if (screen->base.class_3d == NVE4_3D_CLASS) { - info->num_queries = NVE4_HW_SM_QUERY_COUNT; - return 1; - } else - if (screen->base.class_3d < NVE4_3D_CLASS) { - info->num_queries = NVC0_HW_SM_QUERY_COUNT; - return 1; - } + info->num_queries = nvc0_hw_sm_get_num_queries(screen); + return 1; } } else if (id == NVC0_HW_METRIC_QUERY_GROUP) { if (screen->compute) { - if (screen->base.class_3d == NVE4_3D_CLASS) { + if (screen->base.class_3d <= NVF0_3D_CLASS) { info->name = "Performance metrics"; info->max_active_queries = 1; - info->num_queries = NVE4_HW_METRIC_QUERY_COUNT; - return 1; - } else - if (screen->base.class_3d < NVE4_3D_CLASS) { - info->name = "Performance metrics"; - info->max_active_queries = 1; - info->num_queries = NVC0_HW_METRIC_QUERY_COUNT; + info->num_queries = nvc0_hw_metric_get_num_queries(screen); return 1; } } @@ -270,6 +255,11 @@ nvc0_screen_get_driver_query_group_info(struct pipe_screen *pscreen, return 0; } +static void +nvc0_set_active_query_state(struct pipe_context *pipe, boolean enable) +{ +} + void nvc0_init_query_functions(struct nvc0_context *nvc0) { @@ -281,6 +271,7 @@ nvc0_init_query_functions(struct nvc0_context *nvc0) pipe->end_query = nvc0_end_query; pipe->get_query_result = nvc0_get_query_result; pipe->get_query_result_resource = nvc0_get_query_result_resource; + pipe->set_active_query_state = nvc0_set_active_query_state; pipe->render_condition = nvc0_render_condition; nvc0->cond_condmode = NVC0_3D_COND_MODE_ALWAYS; } diff --git a/lib/mesa/src/gallium/drivers/nouveau/nvc0/nvc0_query_hw_metric.c b/lib/mesa/src/gallium/drivers/nouveau/nvc0/nvc0_query_hw_metric.c index 7a64b69b1..2f85c320f 100644 --- a/lib/mesa/src/gallium/drivers/nouveau/nvc0/nvc0_query_hw_metric.c +++ 
b/lib/mesa/src/gallium/drivers/nouveau/nvc0/nvc0_query_hw_metric.c @@ -24,32 +24,95 @@ #include "nvc0/nvc0_query_hw_metric.h" #include "nvc0/nvc0_query_hw_sm.h" -/* === PERFORMANCE MONITORING METRICS for NVC0:NVE4 === */ -static const char *nvc0_hw_metric_names[] = -{ - "metric-achieved_occupancy", - "metric-branch_efficiency", - "metric-inst_issued", - "metric-inst_per_wrap", - "metric-inst_replay_overhead", - "metric-issued_ipc", - "metric-issue_slots", - "metric-issue_slot_utilization", - "metric-ipc", +#define _Q(i,n,t,d) { NVC0_HW_METRIC_QUERY_##i, n, PIPE_DRIVER_QUERY_TYPE_##t, d } +static const struct nvc0_hw_metric_cfg { + unsigned id; + const char *name; + enum pipe_driver_query_type type; + const char *desc; +} nvc0_hw_metric_queries[] = { + _Q(ACHIEVED_OCCUPANCY, + "metric-achieved_occupancy", + PERCENTAGE, + "Ratio of the average active warps per active cycle to the maximum number " + "of warps supported on a multiprocessor"), + + _Q(BRANCH_EFFICIENCY, + "metric-branch_efficiency", + PERCENTAGE, + "Ratio of non-divergent branches to total branches"), + + _Q(INST_ISSUED, + "metric-inst_issued", + UINT64, + "The number of instructions issued"), + + _Q(INST_PER_WRAP, + "metric-inst_per_wrap", + UINT64, + "Average number of instructions executed by each warp"), + + _Q(INST_REPLAY_OVERHEAD, + "metric-inst_replay_overhead", + UINT64, + "Average number of replays for each instruction executed"), + + _Q(ISSUED_IPC, + "metric-issued_ipc", + UINT64, + "Instructions issued per cycle"), + + _Q(ISSUE_SLOTS, + "metric-issue_slots", + UINT64, + "The number of issue slots used"), + + _Q(ISSUE_SLOT_UTILIZATION, + "metric-issue_slot_utilization", + PERCENTAGE, + "Percentage of issue slots that issued at least one instruction, averaged " + "across all cycles"), + + _Q(IPC, + "metric-ipc", + UINT64, + "Instructions executed per cycle"), + + _Q(SHARED_REPLAY_OVERHEAD, + "metric-shared_replay_overhead", + UINT64, + "Average number of replays due to shared memory conflicts for each " + "instruction executed"), }; +#undef _Q + +static inline const struct nvc0_hw_metric_cfg * +nvc0_hw_metric_get_cfg(unsigned metric_id) +{ + unsigned i; + + for (i = 0; i < ARRAY_SIZE(nvc0_hw_metric_queries); i++) { + if (nvc0_hw_metric_queries[i].id == metric_id) + return &nvc0_hw_metric_queries[i]; + } + assert(0); + return NULL; +} + struct nvc0_hw_metric_query_cfg { + unsigned type; uint32_t queries[8]; uint32_t num_queries; }; #define _SM(n) NVC0_HW_SM_QUERY(NVC0_HW_SM_QUERY_ ##n) -#define _M(n, c) [NVC0_HW_METRIC_QUERY_##n] = c /* ==== Compute capability 2.0 (GF100/GF110) ==== */ static const struct nvc0_hw_metric_query_cfg sm20_achieved_occupancy = { + .type = NVC0_HW_METRIC_QUERY_ACHIEVED_OCCUPANCY, .queries[0] = _SM(ACTIVE_WARPS), .queries[1] = _SM(ACTIVE_CYCLES), .num_queries = 2, @@ -58,6 +121,7 @@ sm20_achieved_occupancy = static const struct nvc0_hw_metric_query_cfg sm20_branch_efficiency = { + .type = NVC0_HW_METRIC_QUERY_BRANCH_EFFICIENCY, .queries[0] = _SM(BRANCH), .queries[1] = _SM(DIVERGENT_BRANCH), .num_queries = 2, @@ -66,6 +130,7 @@ sm20_branch_efficiency = static const struct nvc0_hw_metric_query_cfg sm20_inst_per_wrap = { + .type = NVC0_HW_METRIC_QUERY_INST_PER_WRAP, .queries[0] = _SM(INST_EXECUTED), .queries[1] = _SM(WARPS_LAUNCHED), .num_queries = 2, @@ -74,6 +139,7 @@ sm20_inst_per_wrap = static const struct nvc0_hw_metric_query_cfg sm20_inst_replay_overhead = { + .type = NVC0_HW_METRIC_QUERY_INST_REPLAY_OVERHEAD, .queries[0] = _SM(INST_ISSUED), .queries[1] = _SM(INST_EXECUTED), .num_queries = 
2, @@ -82,6 +148,16 @@ sm20_inst_replay_overhead = static const struct nvc0_hw_metric_query_cfg sm20_issued_ipc = { + .type = NVC0_HW_METRIC_QUERY_ISSUED_IPC, + .queries[0] = _SM(INST_ISSUED), + .queries[1] = _SM(ACTIVE_CYCLES), + .num_queries = 2, +}; + +static const struct nvc0_hw_metric_query_cfg +sm20_issue_slot_utilization = +{ + .type = NVC0_HW_METRIC_QUERY_ISSUE_SLOT_UTILIZATION, .queries[0] = _SM(INST_ISSUED), .queries[1] = _SM(ACTIVE_CYCLES), .num_queries = 2, @@ -90,6 +166,7 @@ sm20_issued_ipc = static const struct nvc0_hw_metric_query_cfg sm20_ipc = { + .type = NVC0_HW_METRIC_QUERY_IPC, .queries[0] = _SM(INST_EXECUTED), .queries[1] = _SM(ACTIVE_CYCLES), .num_queries = 2, @@ -97,21 +174,20 @@ sm20_ipc = static const struct nvc0_hw_metric_query_cfg *sm20_hw_metric_queries[] = { - _M(ACHIEVED_OCCUPANCY, &sm20_achieved_occupancy), - _M(BRANCH_EFFICIENCY, &sm20_branch_efficiency), - _M(INST_ISSUED, NULL), - _M(INST_PER_WRAP, &sm20_inst_per_wrap), - _M(INST_REPLAY_OVERHEAD, &sm20_inst_replay_overhead), - _M(ISSUED_IPC, &sm20_issued_ipc), - _M(ISSUE_SLOTS, NULL), - _M(ISSUE_SLOT_UTILIZATION, &sm20_issued_ipc), - _M(IPC, &sm20_ipc), + &sm20_achieved_occupancy, + &sm20_branch_efficiency, + &sm20_inst_per_wrap, + &sm20_inst_replay_overhead, + &sm20_issued_ipc, + &sm20_issue_slot_utilization, + &sm20_ipc, }; /* ==== Compute capability 2.1 (GF108+ except GF110) ==== */ static const struct nvc0_hw_metric_query_cfg sm21_inst_issued = { + .type = NVC0_HW_METRIC_QUERY_INST_ISSUED, .queries[0] = _SM(INST_ISSUED1_0), .queries[1] = _SM(INST_ISSUED1_1), .queries[2] = _SM(INST_ISSUED2_0), @@ -122,6 +198,7 @@ sm21_inst_issued = static const struct nvc0_hw_metric_query_cfg sm21_inst_replay_overhead = { + .type = NVC0_HW_METRIC_QUERY_INST_REPLAY_OVERHEAD, .queries[0] = _SM(INST_ISSUED1_0), .queries[1] = _SM(INST_ISSUED1_1), .queries[2] = _SM(INST_ISSUED2_0), @@ -133,6 +210,7 @@ sm21_inst_replay_overhead = static const struct nvc0_hw_metric_query_cfg sm21_issued_ipc = { + .type = NVC0_HW_METRIC_QUERY_ISSUED_IPC, .queries[0] = _SM(INST_ISSUED1_0), .queries[1] = _SM(INST_ISSUED1_1), .queries[2] = _SM(INST_ISSUED2_0), @@ -141,44 +219,47 @@ sm21_issued_ipc = .num_queries = 5, }; -static const struct nvc0_hw_metric_query_cfg *sm21_hw_metric_queries[] = +static const struct nvc0_hw_metric_query_cfg +sm21_issue_slots = { - _M(ACHIEVED_OCCUPANCY, &sm20_achieved_occupancy), - _M(BRANCH_EFFICIENCY, &sm20_branch_efficiency), - _M(INST_ISSUED, &sm21_inst_issued), - _M(INST_PER_WRAP, &sm20_inst_per_wrap), - _M(INST_REPLAY_OVERHEAD, &sm21_inst_replay_overhead), - _M(ISSUED_IPC, &sm21_issued_ipc), - _M(ISSUE_SLOTS, &sm21_inst_issued), - _M(ISSUE_SLOT_UTILIZATION, &sm21_issued_ipc), - _M(IPC, &sm20_ipc), + .type = NVC0_HW_METRIC_QUERY_ISSUE_SLOTS, + .queries[0] = _SM(INST_ISSUED1_0), + .queries[1] = _SM(INST_ISSUED1_1), + .queries[2] = _SM(INST_ISSUED2_0), + .queries[3] = _SM(INST_ISSUED2_1), + .num_queries = 4, }; -#undef _SM -#undef _M - -/* === PERFORMANCE MONITORING METRICS for NVE4+ === */ -static const char *nve4_hw_metric_names[] = -{ - "metric-achieved_occupancy", - "metric-branch_efficiency", - "metric-inst_issued", - "metric-inst_per_wrap", - "metric-inst_replay_overhead", - "metric-issued_ipc", - "metric-issue_slots", - "metric-issue_slot_utilization", - "metric-ipc", - "metric-shared_replay_overhead", +static const struct nvc0_hw_metric_query_cfg +sm21_issue_slot_utilization = +{ + .type = NVC0_HW_METRIC_QUERY_ISSUE_SLOT_UTILIZATION, + .queries[0] = _SM(INST_ISSUED1_0), + .queries[1] = _SM(INST_ISSUED1_1), 
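
Note that sm20_issued_ipc and the new sm20_issue_slot_utilization above read exactly the same two counters; only the .type tag differs, so the result path can apply a different formula to identical raw data. For the IPC case that formula is presumably inst_issued / active_cycles, as the SM30 switch later in this diff spells out — a sketch:

#include <stdint.h>

/* Issued IPC from its two source counters (order follows the query
 * config above: [0] = INST_ISSUED, [1] = ACTIVE_CYCLES). */
static double issued_ipc(const uint64_t res64[2])
{
   return res64[1] ? res64[0] / (double)res64[1] : 0.0;
}
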
+ .queries[2] = _SM(INST_ISSUED2_0), + .queries[3] = _SM(INST_ISSUED2_1), + .queries[4] = _SM(ACTIVE_CYCLES), + .num_queries = 5, }; -#define _SM(n) NVE4_HW_SM_QUERY(NVE4_HW_SM_QUERY_ ##n) -#define _M(n, c) [NVE4_HW_METRIC_QUERY_##n] = c +static const struct nvc0_hw_metric_query_cfg *sm21_hw_metric_queries[] = +{ + &sm20_achieved_occupancy, + &sm20_branch_efficiency, + &sm21_inst_issued, + &sm20_inst_per_wrap, + &sm21_inst_replay_overhead, + &sm21_issued_ipc, + &sm21_issue_slots, + &sm21_issue_slot_utilization, + &sm20_ipc, +}; /* ==== Compute capability 3.0 (GK104/GK106/GK107) ==== */ static const struct nvc0_hw_metric_query_cfg sm30_achieved_occupancy = { + .type = NVC0_HW_METRIC_QUERY_ACHIEVED_OCCUPANCY, .queries[0] = _SM(ACTIVE_WARPS), .queries[1] = _SM(ACTIVE_CYCLES), .num_queries = 2, @@ -187,6 +268,7 @@ sm30_achieved_occupancy = static const struct nvc0_hw_metric_query_cfg sm30_branch_efficiency = { + .type = NVC0_HW_METRIC_QUERY_BRANCH_EFFICIENCY, .queries[0] = _SM(BRANCH), .queries[1] = _SM(DIVERGENT_BRANCH), .num_queries = 2, @@ -195,6 +277,7 @@ sm30_branch_efficiency = static const struct nvc0_hw_metric_query_cfg sm30_inst_issued = { + .type = NVC0_HW_METRIC_QUERY_INST_ISSUED, .queries[0] = _SM(INST_ISSUED1), .queries[1] = _SM(INST_ISSUED2), .num_queries = 2, @@ -203,6 +286,7 @@ sm30_inst_issued = static const struct nvc0_hw_metric_query_cfg sm30_inst_per_wrap = { + .type = NVC0_HW_METRIC_QUERY_INST_PER_WRAP, .queries[0] = _SM(INST_EXECUTED), .queries[1] = _SM(WARPS_LAUNCHED), .num_queries = 2, @@ -211,6 +295,7 @@ sm30_inst_per_wrap = static const struct nvc0_hw_metric_query_cfg sm30_inst_replay_overhead = { + .type = NVC0_HW_METRIC_QUERY_INST_REPLAY_OVERHEAD, .queries[0] = _SM(INST_ISSUED1), .queries[1] = _SM(INST_ISSUED2), .queries[2] = _SM(INST_EXECUTED), @@ -220,6 +305,26 @@ sm30_inst_replay_overhead = static const struct nvc0_hw_metric_query_cfg sm30_issued_ipc = { + .type = NVC0_HW_METRIC_QUERY_ISSUED_IPC, + .queries[0] = _SM(INST_ISSUED1), + .queries[1] = _SM(INST_ISSUED2), + .queries[2] = _SM(ACTIVE_CYCLES), + .num_queries = 3, +}; + +static const struct nvc0_hw_metric_query_cfg +sm30_issue_slots = +{ + .type = NVC0_HW_METRIC_QUERY_ISSUE_SLOTS, + .queries[0] = _SM(INST_ISSUED1), + .queries[1] = _SM(INST_ISSUED2), + .num_queries = 2, +}; + +static const struct nvc0_hw_metric_query_cfg +sm30_issue_slot_utilization = +{ + .type = NVC0_HW_METRIC_QUERY_ISSUE_SLOT_UTILIZATION, .queries[0] = _SM(INST_ISSUED1), .queries[1] = _SM(INST_ISSUED2), .queries[2] = _SM(ACTIVE_CYCLES), @@ -229,6 +334,7 @@ sm30_issued_ipc = static const struct nvc0_hw_metric_query_cfg sm30_ipc = { + .type = NVC0_HW_METRIC_QUERY_IPC, .queries[0] = _SM(INST_EXECUTED), .queries[1] = _SM(ACTIVE_CYCLES), .num_queries = 2, @@ -237,6 +343,7 @@ sm30_ipc = static const struct nvc0_hw_metric_query_cfg sm30_shared_replay_overhead = { + .type = NVC0_HW_METRIC_QUERY_SHARED_REPLAY_OVERHEAD, .queries[0] = _SM(SHARED_LD_REPLAY), .queries[1] = _SM(SHARED_ST_REPLAY), .queries[2] = _SM(INST_EXECUTED), @@ -245,44 +352,89 @@ sm30_shared_replay_overhead = static const struct nvc0_hw_metric_query_cfg *sm30_hw_metric_queries[] = { - _M(ACHIEVED_OCCUPANCY, &sm30_achieved_occupancy), - _M(BRANCH_EFFICIENCY, &sm30_branch_efficiency), - _M(INST_ISSUED, &sm30_inst_issued), - _M(INST_PER_WRAP, &sm30_inst_per_wrap), - _M(INST_REPLAY_OVERHEAD, &sm30_inst_replay_overhead), - _M(ISSUED_IPC, &sm30_issued_ipc), - _M(ISSUE_SLOTS, &sm30_inst_issued), - _M(ISSUE_SLOT_UTILIZATION, &sm30_issued_ipc), - _M(IPC, &sm30_ipc), - 
_M(SHARED_REPLAY_OVERHEAD, &sm30_shared_replay_overhead), + &sm30_achieved_occupancy, + &sm30_branch_efficiency, + &sm30_inst_issued, + &sm30_inst_per_wrap, + &sm30_inst_replay_overhead, + &sm30_issued_ipc, + &sm30_issue_slots, + &sm30_issue_slot_utilization, + &sm30_ipc, + &sm30_shared_replay_overhead, +}; + +/* ==== Compute capability 3.5 (GK110) ==== */ +static const struct nvc0_hw_metric_query_cfg *sm35_hw_metric_queries[] = +{ + &sm30_achieved_occupancy, + &sm30_inst_issued, + &sm30_inst_per_wrap, + &sm30_inst_replay_overhead, + &sm30_issued_ipc, + &sm30_inst_issued, + &sm30_issue_slot_utilization, + &sm30_ipc, + &sm30_shared_replay_overhead, }; #undef _SM -#undef _M static inline const struct nvc0_hw_metric_query_cfg ** nvc0_hw_metric_get_queries(struct nvc0_screen *screen) { struct nouveau_device *dev = screen->base.device; - if (dev->chipset == 0xc0 || dev->chipset == 0xc8) - return sm20_hw_metric_queries; - return sm21_hw_metric_queries; + switch (screen->base.class_3d) { + case NVF0_3D_CLASS: + return sm35_hw_metric_queries; + case NVE4_3D_CLASS: + return sm30_hw_metric_queries; + default: + if (dev->chipset == 0xc0 || dev->chipset == 0xc8) + return sm20_hw_metric_queries; + return sm21_hw_metric_queries; + } + assert(0); + return NULL; +} + +unsigned +nvc0_hw_metric_get_num_queries(struct nvc0_screen *screen) +{ + struct nouveau_device *dev = screen->base.device; + + switch (screen->base.class_3d) { + case NVF0_3D_CLASS: + return ARRAY_SIZE(sm35_hw_metric_queries); + case NVE4_3D_CLASS: + return ARRAY_SIZE(sm30_hw_metric_queries); + default: + if (dev->chipset == 0xc0 || dev->chipset == 0xc8) + return ARRAY_SIZE(sm20_hw_metric_queries); + return ARRAY_SIZE(sm21_hw_metric_queries); + } + return 0; } static const struct nvc0_hw_metric_query_cfg * -nvc0_hw_metric_query_get_cfg(struct nvc0_context *nvc0, - struct nvc0_hw_query *hq) +nvc0_hw_metric_query_get_cfg(struct nvc0_context *nvc0, struct nvc0_hw_query *hq) { const struct nvc0_hw_metric_query_cfg **queries; struct nvc0_screen *screen = nvc0->screen; struct nvc0_query *q = &hq->base; + unsigned num_queries; + unsigned i; - if (screen->base.class_3d >= NVE4_3D_CLASS) - return sm30_hw_metric_queries[q->type - NVE4_HW_METRIC_QUERY(0)]; - + num_queries = nvc0_hw_metric_get_num_queries(screen); queries = nvc0_hw_metric_get_queries(screen); - return queries[q->type - NVC0_HW_METRIC_QUERY(0)]; + + for (i = 0; i < num_queries; i++) { + if (NVC0_HW_METRIC_QUERY(queries[i]->type) == q->type) + return queries[i]; + } + assert(0); + return NULL; } static void @@ -328,9 +480,9 @@ sm20_hw_metric_calc_result(struct nvc0_hw_query *hq, uint64_t res64[8]) { switch (hq->base.type - NVC0_HW_METRIC_QUERY(0)) { case NVC0_HW_METRIC_QUERY_ACHIEVED_OCCUPANCY: - /* (active_warps / active_cycles) / max. number of warps on a MP */ + /* ((active_warps / active_cycles) / max. number of warps on a MP) * 100 */ if (res64[1]) - return (res64[0] / (double)res64[1]) / 48; + return ((res64[0] / (double)res64[1]) / 48) * 100; break; case NVC0_HW_METRIC_QUERY_BRANCH_EFFICIENCY: /* (branch / (branch + divergent_branch)) * 100 */ @@ -419,47 +571,47 @@ sm21_hw_metric_calc_result(struct nvc0_hw_query *hq, uint64_t res64[8]) static uint64_t sm30_hw_metric_calc_result(struct nvc0_hw_query *hq, uint64_t res64[8]) { - switch (hq->base.type - NVE4_HW_METRIC_QUERY(0)) { - case NVE4_HW_METRIC_QUERY_ACHIEVED_OCCUPANCY: - /* (active_warps / active_cycles) / max. 
number of warps on a MP */ + switch (hq->base.type - NVC0_HW_METRIC_QUERY(0)) { + case NVC0_HW_METRIC_QUERY_ACHIEVED_OCCUPANCY: + /* ((active_warps / active_cycles) / max. number of warps on a MP) * 100 */ if (res64[1]) - return (res64[0] / (double)res64[1]) / 64; + return ((res64[0] / (double)res64[1]) / 64) * 100; break; - case NVE4_HW_METRIC_QUERY_BRANCH_EFFICIENCY: + case NVC0_HW_METRIC_QUERY_BRANCH_EFFICIENCY: return sm20_hw_metric_calc_result(hq, res64); - case NVE4_HW_METRIC_QUERY_INST_ISSUED: + case NVC0_HW_METRIC_QUERY_INST_ISSUED: /* inst_issued1 + inst_issued2 * 2 */ return res64[0] + res64[1] * 2; - case NVE4_HW_METRIC_QUERY_INST_PER_WRAP: + case NVC0_HW_METRIC_QUERY_INST_PER_WRAP: return sm20_hw_metric_calc_result(hq, res64); - case NVE4_HW_METRIC_QUERY_INST_REPLAY_OVERHEAD: + case NVC0_HW_METRIC_QUERY_INST_REPLAY_OVERHEAD: /* (metric-inst_issued - inst_executed) / inst_executed */ if (res64[2]) return (((res64[0] + res64[1] * 2) - res64[2]) / (double)res64[2]); break; - case NVE4_HW_METRIC_QUERY_ISSUED_IPC: + case NVC0_HW_METRIC_QUERY_ISSUED_IPC: /* metric-inst_issued / active_cycles */ if (res64[2]) return (res64[0] + res64[1] * 2) / (double)res64[2]; break; - case NVE4_HW_METRIC_QUERY_ISSUE_SLOTS: + case NVC0_HW_METRIC_QUERY_ISSUE_SLOTS: /* inst_issued1 + inst_issued2 */ return res64[0] + res64[1]; - case NVE4_HW_METRIC_QUERY_ISSUE_SLOT_UTILIZATION: + case NVC0_HW_METRIC_QUERY_ISSUE_SLOT_UTILIZATION: /* ((metric-issue_slots / 2) / active_cycles) * 100 */ if (res64[2]) return (((res64[0] + res64[1]) / 2) / (double)res64[2]) * 100; break; - case NVE4_HW_METRIC_QUERY_IPC: + case NVC0_HW_METRIC_QUERY_IPC: return sm20_hw_metric_calc_result(hq, res64); - case NVE4_HW_METRIC_QUERY_SHARED_REPLAY_OVERHEAD: + case NVC0_HW_METRIC_QUERY_SHARED_REPLAY_OVERHEAD: /* (shared_load_replay + shared_store_replay) / inst_executed */ if (res64[2]) return (res64[0] + res64[1]) / (double)res64[2]; break; default: debug_printf("invalid metric type: %d\n", - hq->base.type - NVE4_HW_METRIC_QUERY(0)); + hq->base.type - NVC0_HW_METRIC_QUERY(0)); break; } return 0; @@ -487,13 +639,17 @@ nvc0_hw_metric_get_query_result(struct nvc0_context *nvc0, res64[i] = *(uint64_t *)&results[i]; } - if (screen->base.class_3d >= NVE4_3D_CLASS) { + switch (screen->base.class_3d) { + case NVF0_3D_CLASS: + case NVE4_3D_CLASS: value = sm30_hw_metric_calc_result(hq, res64); - } else { + break; + default: if (dev->chipset == 0xc0 || dev->chipset == 0xc8) value = sm20_hw_metric_calc_result(hq, res64); else value = sm21_hw_metric_calc_result(hq, res64); + break; } *(uint64_t *)result = value; @@ -515,8 +671,7 @@ nvc0_hw_metric_create_query(struct nvc0_context *nvc0, unsigned type) struct nvc0_hw_query *hq; unsigned i; - if ((type < NVE4_HW_METRIC_QUERY(0) || type > NVE4_HW_METRIC_QUERY_LAST) && - (type < NVC0_HW_METRIC_QUERY(0) || type > NVC0_HW_METRIC_QUERY_LAST)) + if (type < NVC0_HW_METRIC_QUERY(0) || type > NVC0_HW_METRIC_QUERY_LAST) return NULL; hmq = CALLOC_STRUCT(nvc0_hw_metric_query); @@ -541,46 +696,15 @@ nvc0_hw_metric_create_query(struct nvc0_context *nvc0, unsigned type) return hq; } -static int -nvc0_hw_metric_get_next_query_id(const struct nvc0_hw_metric_query_cfg **queries, - unsigned id) -{ - unsigned i, next = 0; - - for (i = 0; i < NVC0_HW_METRIC_QUERY_COUNT; i++) { - if (!queries[i]) { - next++; - } else - if (i >= id && queries[id + next]) { - break; - } - } - return id + next; -} - int nvc0_hw_metric_get_driver_query_info(struct nvc0_screen *screen, unsigned id, struct pipe_driver_query_info *info) { - 
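
For reference, the SM30 formulas from the switch above, collected into small helpers. This is an illustrative transcription only; the input arrays follow the counter order of the matching query configs, and the zero-denominator guards mirror the driver's.

#include <stdint.h>

/* inst_issued = inst_issued1 + inst_issued2 * 2 */
static uint64_t sm30_inst_issued_total(const uint64_t r[2])
{
   return r[0] + r[1] * 2;
}

/* issue_slots = inst_issued1 + inst_issued2 */
static uint64_t sm30_issue_slots_total(const uint64_t r[2])
{
   return r[0] + r[1];
}

/* achieved_occupancy = ((active_warps / active_cycles) / 64) * 100,
 * with 64 being the maximum number of warps per MP on SM30. */
static double sm30_occupancy_pct(const uint64_t r[2])
{
   return r[1] ? ((r[0] / (double)r[1]) / 64) * 100 : 0.0;
}
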
uint16_t class_3d = screen->base.class_3d; int count = 0; if (screen->base.drm->version >= 0x01000101) { - if (screen->compute) { - if (screen->base.class_3d == NVE4_3D_CLASS) { - count += NVE4_HW_METRIC_QUERY_COUNT; - } else - if (class_3d < NVE4_3D_CLASS) { - const struct nvc0_hw_metric_query_cfg **queries = - nvc0_hw_metric_get_queries(screen); - unsigned i; - - for (i = 0; i < NVC0_HW_METRIC_QUERY_COUNT; i++) { - if (queries[i]) - count++; - } - } - } + if (screen->compute) + count = nvc0_hw_metric_get_num_queries(screen); } if (!info) @@ -588,19 +712,15 @@ nvc0_hw_metric_get_driver_query_info(struct nvc0_screen *screen, unsigned id, if (id < count) { if (screen->compute) { - if (screen->base.class_3d == NVE4_3D_CLASS) { - info->name = nve4_hw_metric_names[id]; - info->query_type = NVE4_HW_METRIC_QUERY(id); - info->group_id = NVC0_HW_METRIC_QUERY_GROUP; - return 1; - } else - if (class_3d < NVE4_3D_CLASS) { - const struct nvc0_hw_metric_query_cfg **queries = + if (screen->base.class_3d <= NVF0_3D_CLASS) { + const struct nvc0_hw_metric_query_cfg **queries = nvc0_hw_metric_get_queries(screen); + const struct nvc0_hw_metric_cfg *cfg = + nvc0_hw_metric_get_cfg(queries[id]->type); - id = nvc0_hw_metric_get_next_query_id(queries, id); - info->name = nvc0_hw_metric_names[id]; - info->query_type = NVC0_HW_METRIC_QUERY(id); + info->name = cfg->name; + info->query_type = NVC0_HW_METRIC_QUERY(queries[id]->type); + info->type = cfg->type; info->group_id = NVC0_HW_METRIC_QUERY_GROUP; return 1; } diff --git a/lib/mesa/src/gallium/drivers/nouveau/nvc0/nvc0_query_hw_metric.h b/lib/mesa/src/gallium/drivers/nouveau/nvc0/nvc0_query_hw_metric.h index 06cb355db..3203a8ca2 100644 --- a/lib/mesa/src/gallium/drivers/nouveau/nvc0/nvc0_query_hw_metric.h +++ b/lib/mesa/src/gallium/drivers/nouveau/nvc0/nvc0_query_hw_metric.h @@ -18,24 +18,7 @@ nvc0_hw_metric_query(struct nvc0_hw_query *hq) /* * Driver metrics queries: */ -#define NVE4_HW_METRIC_QUERY(i) (PIPE_QUERY_DRIVER_SPECIFIC + 3072 + (i)) -#define NVE4_HW_METRIC_QUERY_LAST NVE4_HW_METRIC_QUERY(NVE4_HW_METRIC_QUERY_COUNT - 1) -enum nve4_hw_metric_queries -{ - NVE4_HW_METRIC_QUERY_ACHIEVED_OCCUPANCY = 0, - NVE4_HW_METRIC_QUERY_BRANCH_EFFICIENCY, - NVE4_HW_METRIC_QUERY_INST_ISSUED, - NVE4_HW_METRIC_QUERY_INST_PER_WRAP, - NVE4_HW_METRIC_QUERY_INST_REPLAY_OVERHEAD, - NVE4_HW_METRIC_QUERY_ISSUED_IPC, - NVE4_HW_METRIC_QUERY_ISSUE_SLOTS, - NVE4_HW_METRIC_QUERY_ISSUE_SLOT_UTILIZATION, - NVE4_HW_METRIC_QUERY_IPC, - NVE4_HW_METRIC_QUERY_SHARED_REPLAY_OVERHEAD, - NVE4_HW_METRIC_QUERY_COUNT -}; - -#define NVC0_HW_METRIC_QUERY(i) (PIPE_QUERY_DRIVER_SPECIFIC + 3072 + (i)) +#define NVC0_HW_METRIC_QUERY(i) (PIPE_QUERY_DRIVER_SPECIFIC + 2048 + (i)) #define NVC0_HW_METRIC_QUERY_LAST NVC0_HW_METRIC_QUERY(NVC0_HW_METRIC_QUERY_COUNT - 1) enum nvc0_hw_metric_queries { @@ -48,6 +31,7 @@ enum nvc0_hw_metric_queries NVC0_HW_METRIC_QUERY_ISSUE_SLOTS, NVC0_HW_METRIC_QUERY_ISSUE_SLOT_UTILIZATION, NVC0_HW_METRIC_QUERY_IPC, + NVC0_HW_METRIC_QUERY_SHARED_REPLAY_OVERHEAD, NVC0_HW_METRIC_QUERY_COUNT }; @@ -56,4 +40,7 @@ nvc0_hw_metric_create_query(struct nvc0_context *, unsigned); int nvc0_hw_metric_get_driver_query_info(struct nvc0_screen *, unsigned, struct pipe_driver_query_info *); +unsigned +nvc0_hw_metric_get_num_queries(struct nvc0_screen *); + #endif diff --git a/lib/mesa/src/gallium/drivers/nouveau/nvc0/nvc0_query_hw_sm.c b/lib/mesa/src/gallium/drivers/nouveau/nvc0/nvc0_query_hw_sm.c index 68c8ff53a..d4721201b 100644 --- 
a/lib/mesa/src/gallium/drivers/nouveau/nvc0/nvc0_query_hw_sm.c +++ b/lib/mesa/src/gallium/drivers/nouveau/nvc0/nvc0_query_hw_sm.c @@ -30,59 +30,327 @@ #include "nvc0/nve4_compute.xml.h" #include "nvc0/nvc0_compute.xml.h" -/* === PERFORMANCE MONITORING COUNTERS for NVE4+ === */ - /* NOTE: intentionally using the same names as NV */ -static const char *nve4_hw_sm_query_names[] = -{ - /* MP counters */ - "active_cycles", - "active_warps", - "atom_cas_count", - "atom_count", - "branch", - "divergent_branch", - "gld_request", - "global_ld_mem_divergence_replays", - "global_store_transaction", - "global_st_mem_divergence_replays", - "gred_count", - "gst_request", - "inst_executed", - "inst_issued1", - "inst_issued2", - "l1_global_load_hit", - "l1_global_load_miss", - "__l1_global_load_transactions", - "__l1_global_store_transactions", - "l1_local_load_hit", - "l1_local_load_miss", - "l1_local_store_hit", - "l1_local_store_miss", - "l1_shared_load_transactions", - "l1_shared_store_transactions", - "local_load", - "local_load_transactions", - "local_store", - "local_store_transactions", - "prof_trigger_00", - "prof_trigger_01", - "prof_trigger_02", - "prof_trigger_03", - "prof_trigger_04", - "prof_trigger_05", - "prof_trigger_06", - "prof_trigger_07", - "shared_load", - "shared_load_replay", - "shared_store", - "shared_store_replay", - "sm_cta_launched", - "threads_launched", - "uncached_global_load_transaction", - "warps_launched", +#define _Q(t, n, d) { NVC0_HW_SM_QUERY_##t, n, d } +static const struct { + unsigned type; + const char *name; + const char *desc; +} nvc0_hw_sm_queries[] = { + _Q(ACTIVE_CYCLES, + "active_cycles", + "Number of cycles a multiprocessor has at least one active warp"), + + _Q(ACTIVE_WARPS, + "active_warps", + "Accumulated number of active warps per cycle. For every cycle it " + "increments by the number of active warps in the cycle which can be in " + "the range 0 to 64"), + + _Q(ATOM_CAS_COUNT, + "atom_cas_count", + "Number of warps executing atomic compare and swap operations. Increments " + "by one if at least one thread in a warp executes the instruction."), + + _Q(ATOM_COUNT, + "atom_count", + "Number of warps executing atomic reduction operations. Increments by one " + "if at least one thread in a warp executes the instruction"), + + _Q(BRANCH, + "branch", + "Number of branch instructions executed per warp on a multiprocessor"), + + _Q(DIVERGENT_BRANCH, + "divergent_branch", + "Number of divergent branches within a warp. This counter will be " + "incremented by one if at least one thread in a warp diverges (that is, " + "follows a different execution path) via a conditional branch"), + + _Q(GLD_REQUEST, + "gld_request", + "Number of executed load instructions where the state space is not " + "specified and hence generic addressing is used, increments per warp on a " + "multiprocessor. It can include the load operations from global,local and " + "shared state space"), + + _Q(GLD_MEM_DIV_REPLAY, + "global_ld_mem_divergence_replays", + "Number of instruction replays for global memory loads. Instruction is " + "replayed if the instruction is accessing more than one cache line of " + "128 bytes. For each extra cache line access the counter is incremented " + "by 1"), + + _Q(GST_TRANSACTIONS, + "global_store_transaction", + "Number of global store transactions. Increments by 1 per transaction. " + "Transaction can be 32/64/96/128B"), + + _Q(GST_MEM_DIV_REPLAY, + "global_st_mem_divergence_replays", + "Number of instruction replays for global memory stores. 
Instruction is " + "replayed if the instruction is accessing more than one cache line of " + "128 bytes. For each extra cache line access the counter is incremented " + "by 1"), + + _Q(GRED_COUNT, + "gred_count", + "Number of warps executing reduction operations on global memory. " + "Increments by one if at least one thread in a warp executes the " + "instruction"), + + _Q(GST_REQUEST, + "gst_request", + "Number of executed store instructions where the state space is not " + "specified and hence generic addressing is used, increments per warp on a " + "multiprocessor. It can include the store operations to global,local and " + "shared state space"), + + _Q(INST_EXECUTED, + "inst_executed", + "Number of instructions executed, do not include replays"), + + _Q(INST_ISSUED, + "inst_issued", + "Number of instructions issued including replays"), + + _Q(INST_ISSUED1, + "inst_issued1", + "Number of single instruction issued per cycle"), + + _Q(INST_ISSUED2, + "inst_issued2", + "Number of dual instructions issued per cycle"), + + _Q(INST_ISSUED1_0, + "inst_issued1_0", + "Number of single instruction issued per cycle in pipeline 0"), + + _Q(INST_ISSUED1_1, + "inst_issued1_1", + "Number of single instruction issued per cycle in pipeline 1"), + + _Q(INST_ISSUED2_0, + "inst_issued2_0", + "Number of dual instructions issued per cycle in pipeline 0"), + + _Q(INST_ISSUED2_1, + "inst_issued2_1", + "Number of dual instructions issued per cycle in pipeline 1"), + + _Q(L1_GLD_HIT, + "l1_global_load_hit", + "Number of cache lines that hit in L1 cache for global memory load " + "accesses. In case of perfect coalescing this increments by 1,2, and 4 for " + "32, 64 and 128 bit accesses by a warp respectively"), + + _Q(L1_GLD_MISS, + "l1_global_load_miss", + "Number of cache lines that miss in L1 cache for global memory load " + "accesses. In case of perfect coalescing this increments by 1,2, and 4 for " + "32, 64 and 128 bit accesses by a warp respectively"), + + _Q(L1_GLD_TRANSACTIONS, + "__l1_global_load_transactions", + "Number of global load transactions from L1 cache. Increments by 1 per " + "transaction. Transaction can be 32/64/96/128B"), + + _Q(L1_GST_TRANSACTIONS, + "__l1_global_store_transactions", + "Number of global store transactions from L1 cache. Increments by 1 per " + "transaction. Transaction can be 32/64/96/128B"), + + _Q(L1_LOCAL_LD_HIT, + "l1_local_load_hit", + "Number of cache lines that hit in L1 cache for local memory load " + "accesses. In case of perfect coalescing this increments by 1,2, and 4 for " + "32, 64 and 128 bit accesses by a warp respectively"), + + _Q(L1_LOCAL_LD_MISS, + "l1_local_load_miss", + "Number of cache lines that miss in L1 cache for local memory load " + "accesses. In case of perfect coalescing this increments by 1,2, and 4 for " + "32, 64 and 128 bit accesses by a warp respectively"), + + _Q(L1_LOCAL_ST_HIT, + "l1_local_store_hit", + "Number of cache lines that hit in L1 cache for local memory store " + "accesses. In case of perfect coalescing this increments by 1,2, and 4 for " + "32, 64 and 128 bit accesses by a warp respectively"), + + _Q(L1_LOCAL_ST_MISS, + "l1_local_store_miss", + "Number of cache lines that miss in L1 cache for local memory store " + "accesses. In case of perfect coalescing this increments by 1,2, and 4 for " + "32,64 and 128 bit accesses by a warp respectively"), + + _Q(L1_SHARED_LD_TRANSACTIONS, + "l1_shared_load_transactions", + "Number of shared load transactions. Increments by 1 per transaction. 
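
These description strings make the counter table self-documenting; a natural companion to the name lookup defined further down is a lookup by name. The helper below is hypothetical (not part of this patch) and uses a local struct with the same shape as nvc0_hw_sm_queries[]:

#include <stddef.h>
#include <string.h>

/* Same shape as the anonymous struct behind nvc0_hw_sm_queries[]. */
struct sm_query_desc {
   unsigned type;
   const char *name;
   const char *desc;
};

/* Hypothetical: return a counter's description given its CUpti-style name. */
static const char *
sm_query_desc_by_name(const struct sm_query_desc *q, size_t n,
                      const char *name)
{
   for (size_t i = 0; i < n; i++)
      if (strcmp(q[i].name, name) == 0)
         return q[i].desc;
   return NULL;
}
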
" + "Transaction can be 32/64/96/128B"), + + _Q(L1_SHARED_ST_TRANSACTIONS, + "l1_shared_store_transactions", + "Number of shared store transactions. Increments by 1 per transaction. " + "Transaction can be 32/64/96/128B"), + + _Q(LOCAL_LD, + "local_load", + "Number of executed load instructions where state space is specified as " + "local, increments per warp on a multiprocessor"), + + _Q(LOCAL_LD_TRANSACTIONS, + "local_load_transactions", + "Number of local load transactions from L1 cache. Increments by 1 per " + "transaction. Transaction can be 32/64/96/128B"), + + _Q(LOCAL_ST, + "local_store", + "Number of executed store instructions where state space is specified as " + "local, increments per warp on a multiprocessor"), + + _Q(LOCAL_ST_TRANSACTIONS, + "local_store_transactions", + "Number of local store transactions to L1 cache. Increments by 1 per " + "transaction. Transaction can be 32/64/96/128B."), + + _Q(NOT_PRED_OFF_INST_EXECUTED, + "not_predicated_off_thread_inst_executed", + "Number of not predicated off instructions executed by all threads, does " + "not include replays. For each instruction it increments by the number of " + "threads that execute this instruction"), + + _Q(PROF_TRIGGER_0, + "prof_trigger_00", + "User profiled generic trigger that can be inserted in any place of the " + "code to collect the related information. Increments per warp."), + + _Q(PROF_TRIGGER_1, + "prof_trigger_01", + "User profiled generic trigger that can be inserted in any place of the " + "code to collect the related information. Increments per warp."), + + _Q(PROF_TRIGGER_2, + "prof_trigger_02", + "User profiled generic trigger that can be inserted in any place of the " + "code to collect the related information. Increments per warp."), + + _Q(PROF_TRIGGER_3, + "prof_trigger_03", + "User profiled generic trigger that can be inserted in any place of the " + "code to collect the related information. Increments per warp."), + + _Q(PROF_TRIGGER_4, + "prof_trigger_04", + "User profiled generic trigger that can be inserted in any place of the " + "code to collect the related information. Increments per warp."), + + _Q(PROF_TRIGGER_5, + "prof_trigger_05", + "User profiled generic trigger that can be inserted in any place of the " + "code to collect the related information. Increments per warp."), + + _Q(PROF_TRIGGER_6, + "prof_trigger_06", + "User profiled generic trigger that can be inserted in any place of the " + "code to collect the related information. Increments per warp."), + + _Q(PROF_TRIGGER_7, + "prof_trigger_07", + "User profiled generic trigger that can be inserted in any place of the " + "code to collect the related information. 
Increments per warp."), + + _Q(SHARED_LD, + "shared_load", + "Number of executed load instructions where state space is specified as " + "shared, increments per warp on a multiprocessor"), + + _Q(SHARED_LD_REPLAY, + "shared_load_replay", + "Replays caused due to shared load bank conflict (when the addresses for " + "two or more shared memory load requests fall in the same memory bank) or " + "when there is no conflict but the total number of words accessed by all " + "threads in the warp executing that instruction exceed the number of words " + "that can be loaded in one cycle (256 bytes)"), + + _Q(SHARED_ST, + "shared_store", + "Number of executed store instructions where state space is specified as " + "shared, increments per warp on a multiprocessor"), + + _Q(SHARED_ST_REPLAY, + "shared_store_replay", + "Replays caused due to shared store bank conflict (when the addresses for " + "two or more shared memory store requests fall in the same memory bank) or " + "when there is no conflict but the total number of words accessed by all " + "threads in the warp executing that instruction exceed the number of words " + "that can be stored in one cycle"), + + _Q(SM_CTA_LAUNCHED, + "sm_cta_launched", + "Number of thread blocks launched on a multiprocessor"), + + _Q(THREADS_LAUNCHED, + "threads_launched", + "Number of threads launched on a multiprocessor"), + + _Q(TH_INST_EXECUTED, + "thread_inst_executed", + "Number of instructions executed by all threads, does not include " + "replays. For each instruction it increments by the number of threads in " + "the warp that execute the instruction"), + + _Q(TH_INST_EXECUTED_0, + "thread_inst_executed_0", + "Number of instructions executed by all threads, does not include " + "replays. For each instruction it increments by the number of threads in " + "the warp that execute the instruction in pipeline 0"), + + _Q(TH_INST_EXECUTED_1, + "thread_inst_executed_1", + "Number of instructions executed by all threads, does not include " + "replays. For each instruction it increments by the number of threads in " + "the warp that execute the instruction in pipeline 1"), + + _Q(TH_INST_EXECUTED_2, + "thread_inst_executed_2", + "Number of instructions executed by all threads, does not include " + "replays. For each instruction it increments by the number of threads in " + "the warp that execute the instruction in pipeline 2"), + + _Q(TH_INST_EXECUTED_3, + "thread_inst_executed_3", + "Number of instructions executed by all threads, does not include " + "replays. For each instruction it increments by the number of threads in " + "the warp that execute the instruction in pipeline 3"), + + _Q(UNCACHED_GLD_TRANSACTIONS, + "uncached_global_load_transaction", + "Number of uncached global load transactions. Increments by 1 per " + "transaction. Transaction can be 32/64/96/128B."), + + _Q(WARPS_LAUNCHED, + "warps_launched", + "Number of warps launched on a multiprocessor"), }; +#undef _Q + +static inline const char * +nvc0_hw_sm_query_get_name(unsigned query_type) +{ + unsigned i; + + for (i = 0; i < ARRAY_SIZE(nvc0_hw_sm_queries); i++) { + if (nvc0_hw_sm_queries[i].type == query_type) + return nvc0_hw_sm_queries[i].name; + } + assert(0); + return NULL; +} + +/* === PERFORMANCE MONITORING COUNTERS for NVE4+ === */ + /* Code to read out MP counters: They are accessible via mmio, too, but let's * just avoid mapping registers in userspace. We'd have to know which MPs are * enabled/present, too, and that information is not presently exposed. 
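
Two things are worth calling out before the kernel listings that follow. First, the readout parameters move out of a dedicated c0[] buffer into the driver's auxiliary constant buffer: the disassembly comments change from c0[0x0]/c0[0x4]/c0[0x8] to c7[0x620..0x628] on NVE4 and c15[0x620..0x628] on NVC0, consistent with the auxCBSlot setup earlier in this patch. Second, the NVC0 kernel stores one 48-byte record per multiprocessor (st b128 at +0x00 and +0x10, st b32 at +0x20, with the stride coming from "mul $r8 u32 $r8 u32 48"). A sketch of that layout follows; the field names are illustrative, and treating the third store as a driver-supplied sequence word is an assumption:

#include <stdint.h>

/* Illustrative layout of one per-MP record written by the NVC0 readout
 * kernel below; only the stores at +0x00, +0x10 and +0x20 are real. */
struct nvc0_mp_pm_record {
   uint32_t pm[8];  /* $pm0..$pm7, stored as two b128 writes */
   uint32_t seq;    /* word loaded from c15[0x628] (assumed sequence/cookie) */
   uint32_t pad[3]; /* record stride is 48 bytes */
};

The NVE4 variant stores at different offsets (+0x40/+0x50) with per-lane predication, so its record shape differs; the struct above covers only the NVC0 kernel.
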
@@ -104,9 +372,9 @@ static const uint64_t nve4_read_hw_sm_counters_code[] = * mov b32 $r6 $pm6 * mov b32 $r7 $pm7 * set $p0 0x1 eq u32 $r8 0x0 - * mov b32 $r10 c0[0x0] + * mov b32 $r10 c7[0x620] * ext u32 $r8 $r12 0x414 - * mov b32 $r11 c0[0x4] + * mov b32 $r11 c7[0x624] * sched 0x04 0x2e 0x04 0x20 0x20 0x28 0x04 * ext u32 $r9 $r12 0x208 * (not $p0) exit @@ -124,7 +392,7 @@ static const uint64_t nve4_read_hw_sm_counters_code[] = * add b32 $r12 $c $r12 $r9 * st b128 wt g[$r10d] $r0q * sched 0x4 0x2c 0x20 0x04 0x2e 0x00 0x00 - * mov b32 $r0 c0[0x8] + * mov b32 $r0 c7[0x628] * add b32 $r13 $r13 0x0 $c * $p1 st b128 wt g[$r12d+0x40] $r4q * st b32 wt g[$r12d+0x50] $r0 @@ -142,9 +410,9 @@ static const uint64_t nve4_read_hw_sm_counters_code[] = 0x2c00000028019c04ULL, 0x2c0000002c01dc04ULL, 0x190e0000fc81dc03ULL, - 0x2800400000029de4ULL, + 0x28005c1880029de4ULL, 0x7000c01050c21c03ULL, - 0x280040001002dde4ULL, + 0x28005c189002dde4ULL, 0x204282020042e047ULL, 0x7000c00820c25c03ULL, 0x80000000000021e7ULL, @@ -162,13 +430,56 @@ static const uint64_t nve4_read_hw_sm_counters_code[] = 0x4801000024c31c03ULL, 0x9400000000a01fc5ULL, 0x200002e04202c047ULL, - 0x2800400020001de4ULL, + 0x28005c18a0001de4ULL, 0x0800000000d35c42ULL, 0x9400000100c107c5ULL, 0x9400000140c01f85ULL, 0x8000000000001de7ULL }; +static const uint64_t nvf0_read_hw_sm_counters_code[] = +{ + /* Same kernel as GK104 */ + 0x0880808080808080ULL, + 0x86400000109c0022ULL, + 0x86400000019c0032ULL, + 0x86400000021c0002ULL, + 0x86400000029c0006ULL, + 0x86400000031c000aULL, + 0x86400000039c000eULL, + 0x86400000041c0012ULL, + 0x08ac1080108c8080ULL, + 0x86400000049c0016ULL, + 0x86400000051c001aULL, + 0x86400000059c001eULL, + 0xdb201c007f9c201eULL, + 0x64c03ce0c41c002aULL, + 0xc00000020a1c3021ULL, + 0x64c03ce0c49c002eULL, + 0x0810a0808010b810ULL, + 0xc0000001041c3025ULL, + 0x180000000020003cULL, + 0xdb201c007f9c243eULL, + 0xc1c00000301c2021ULL, + 0xc1c00000081c2431ULL, + 0xc1c00000021c2435ULL, + 0xe0800000069c2026ULL, + 0x08b010b010b010a0ULL, + 0xe0800000061c2022ULL, + 0xe4c03c00051c0032ULL, + 0xe0840000041c282aULL, + 0xe4c03c00059c0036ULL, + 0xe08040007f9c2c2eULL, + 0xe0840000049c3032ULL, + 0xfe800000001c2800ULL, + 0x080000b81080b010ULL, + 0x64c03ce0c51c0002ULL, + 0xe08040007f9c3436ULL, + 0xfe80000020043010ULL, + 0xfc800000281c3000ULL, + 0x18000000001c003cULL, +}; + /* For simplicity, we will allocate as many group slots as we allocate counter * slots. This means that a single counter which wants to source from 2 groups * will have to be declared as using 2 counter slots. 
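
The query config struct that begins just below pairs each counter set with a norm[] = { num, denom } pair. The raw count is presumably scaled as count * num / denom before being reported — for example sm30_active_warps uses { 2, 1 } while nearly everything else uses { 1, 1 }. The actual scaling site is outside this hunk, so treat this as a sketch:

#include <stdint.h>

/* Assumed normalization step: scale a raw counter by norm[0]/norm[1]. */
static uint64_t apply_norm(uint64_t count, const uint8_t norm[2])
{
   return count * norm[0] / norm[1];
}
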
This shouldn't really be @@ -187,69 +498,593 @@ struct nvc0_hw_sm_counter_cfg struct nvc0_hw_sm_query_cfg { + unsigned type; struct nvc0_hw_sm_counter_cfg ctr[8]; uint8_t num_counters; uint8_t norm[2]; /* normalization num,denom */ }; -#define _Q1A(n, f, m, g, s, nu, dn) [NVE4_HW_SM_QUERY_##n] = { { { f, NVE4_COMPUTE_MP_PM_FUNC_MODE_##m, 0, NVE4_COMPUTE_MP_PM_A_SIGSEL_##g, 0, s }, {}, {}, {} }, 1, { nu, dn } } -#define _Q1B(n, f, m, g, s, nu, dn) [NVE4_HW_SM_QUERY_##n] = { { { f, NVE4_COMPUTE_MP_PM_FUNC_MODE_##m, 1, NVE4_COMPUTE_MP_PM_B_SIGSEL_##g, 0, s }, {}, {}, {} }, 1, { nu, dn } } +#define _CA(f, m, g, s) { f, NVE4_COMPUTE_MP_PM_FUNC_MODE_##m, 0, NVE4_COMPUTE_MP_PM_A_SIGSEL_##g, 0, s } +#define _CB(f, m, g, s) { f, NVE4_COMPUTE_MP_PM_FUNC_MODE_##m, 1, NVE4_COMPUTE_MP_PM_B_SIGSEL_##g, 0, s } +#define _Q(n, c) [NVE4_HW_SM_QUERY_##n] = c + +/* ==== Compute capability 3.0 (GK104:GK110) ==== */ +static const struct nvc0_hw_sm_query_cfg +sm30_active_cycles = +{ + .type = NVC0_HW_SM_QUERY_ACTIVE_CYCLES, + .ctr[0] = _CB(0x0001, B6, WARP, 0x00000000), + .num_counters = 1, + .norm = { 1, 1 }, +}; + +static const struct nvc0_hw_sm_query_cfg +sm30_active_warps = +{ + .type = NVC0_HW_SM_QUERY_ACTIVE_WARPS, + .ctr[0] = _CB(0x003f, B6, WARP, 0x31483104), + .num_counters = 1, + .norm = { 2, 1 }, +}; + +static const struct nvc0_hw_sm_query_cfg +sm30_atom_cas_count = +{ + .type = NVC0_HW_SM_QUERY_ATOM_CAS_COUNT, + .ctr[0] = _CA(0x0001, B6, BRANCH, 0x000000004), + .num_counters = 1, + .norm = { 1, 1 }, +}; + +static const struct nvc0_hw_sm_query_cfg +sm30_atom_count = +{ + .type = NVC0_HW_SM_QUERY_ATOM_COUNT, + .ctr[0] = _CA(0x0001, B6, BRANCH, 0x00000000), + .num_counters = 1, + .norm = { 1, 1 }, +}; + +static const struct nvc0_hw_sm_query_cfg +sm30_branch = +{ + .type = NVC0_HW_SM_QUERY_BRANCH, + .ctr[0] = _CA(0x0001, B6, BRANCH, 0x0000000c), + .num_counters = 1, + .norm = { 1, 1 }, +}; + +static const struct nvc0_hw_sm_query_cfg +sm30_divergent_branch = +{ + .type = NVC0_HW_SM_QUERY_DIVERGENT_BRANCH, + .ctr[0] = _CA(0x0001, B6, BRANCH, 0x00000010), + .num_counters = 1, + .norm = { 1, 1 }, +}; + +static const struct nvc0_hw_sm_query_cfg +sm30_gld_request = +{ + .type = NVC0_HW_SM_QUERY_GLD_REQUEST, + .ctr[0] = _CA(0x0001, B6, LDST, 0x00000010), + .num_counters = 1, + .norm = { 1, 1 }, +}; + +static const struct nvc0_hw_sm_query_cfg +sm30_gld_mem_div_replay = +{ + .type = NVC0_HW_SM_QUERY_GLD_MEM_DIV_REPLAY, + .ctr[0] = _CB(0x0001, B6, REPLAY, 0x00000010), + .num_counters = 1, + .norm = { 1, 1 }, +}; + +static const struct nvc0_hw_sm_query_cfg +sm30_gst_transactions = +{ + .type = NVC0_HW_SM_QUERY_GST_TRANSACTIONS, + .ctr[0] = _CB(0x0001, B6, MEM, 0x00000004), + .num_counters = 1, + .norm = { 1, 1 }, +}; + +static const struct nvc0_hw_sm_query_cfg +sm30_gst_mem_div_replay = +{ + .type = NVC0_HW_SM_QUERY_GST_MEM_DIV_REPLAY, + .ctr[0] = _CB(0x0001, B6, REPLAY, 0x00000014), + .num_counters = 1, + .norm = { 1, 1 }, +}; + +static const struct nvc0_hw_sm_query_cfg +sm30_gred_count = +{ + .type = NVC0_HW_SM_QUERY_GRED_COUNT, + .ctr[0] = _CA(0x0001, B6, BRANCH, 0x00000008), + .num_counters = 1, + .norm = { 1, 1 }, +}; + +static const struct nvc0_hw_sm_query_cfg +sm30_gst_request = +{ + .type = NVC0_HW_SM_QUERY_GST_REQUEST, + .ctr[0] = _CA(0x0001, B6, LDST, 0x00000014), + .num_counters = 1, + .norm = { 1, 1 }, +}; + +static const struct nvc0_hw_sm_query_cfg +sm30_inst_executed = +{ + .type = NVC0_HW_SM_QUERY_INST_EXECUTED, + .ctr[0] = _CA(0x0003, B6, EXEC, 0x00000398), + .num_counters = 1, + .norm = { 1, 1 }, 
+}; + +static const struct nvc0_hw_sm_query_cfg +sm30_inst_issued1 = +{ + .type = NVC0_HW_SM_QUERY_INST_ISSUED1, + .ctr[0] = _CA(0x0001, B6, ISSUE, 0x00000004), + .num_counters = 1, + .norm = { 1, 1 }, +}; + +static const struct nvc0_hw_sm_query_cfg +sm30_inst_issued2 = +{ + .type = NVC0_HW_SM_QUERY_INST_ISSUED2, + .ctr[0] = _CA(0x0001, B6, ISSUE, 0x00000008), + .num_counters = 1, + .norm = { 1, 1 }, +}; + +static const struct nvc0_hw_sm_query_cfg +sm30_l1_gld_hit = +{ + .type = NVC0_HW_SM_QUERY_L1_GLD_HIT, + .ctr[0] = _CB(0x0001, B6, L1, 0x00000010), + .num_counters = 1, + .norm = { 1, 1 }, +}; + +static const struct nvc0_hw_sm_query_cfg +sm30_l1_gld_miss = +{ + .type = NVC0_HW_SM_QUERY_L1_GLD_MISS, + .ctr[0] = _CB(0x0001, B6, L1, 0x00000014), + .num_counters = 1, + .norm = { 1, 1 }, +}; + +static const struct nvc0_hw_sm_query_cfg +sm30_l1_gld_transactions = +{ + .type = NVC0_HW_SM_QUERY_L1_GLD_TRANSACTIONS, + .ctr[0] = _CB(0x0001, B6, UNK0F, 0x00000000), + .num_counters = 1, + .norm = { 1, 1 }, +}; + +static const struct nvc0_hw_sm_query_cfg +sm30_l1_gst_transactions = +{ + .type = NVC0_HW_SM_QUERY_L1_GST_TRANSACTIONS, + .ctr[0] = _CB(0x0001, B6, UNK0F, 0x00000004), + .num_counters = 1, + .norm = { 1, 1 }, +}; + +static const struct nvc0_hw_sm_query_cfg +sm30_l1_local_ld_hit = +{ + .type = NVC0_HW_SM_QUERY_L1_LOCAL_LD_HIT, + .ctr[0] = _CB(0x0001, B6, L1, 0x00000000), + .num_counters = 1, + .norm = { 1, 1 }, +}; + +static const struct nvc0_hw_sm_query_cfg +sm30_l1_local_ld_miss = +{ + .type = NVC0_HW_SM_QUERY_L1_LOCAL_LD_MISS, + .ctr[0] = _CB(0x0001, B6, L1, 0x00000004), + .num_counters = 1, + .norm = { 1, 1 }, +}; + +static const struct nvc0_hw_sm_query_cfg +sm30_l1_local_st_hit = +{ + .type = NVC0_HW_SM_QUERY_L1_LOCAL_ST_HIT, + .ctr[0] = _CB(0x0001, B6, L1, 0x00000008), + .num_counters = 1, + .norm = { 1, 1 }, +}; + +static const struct nvc0_hw_sm_query_cfg +sm30_l1_local_st_miss = +{ + .type = NVC0_HW_SM_QUERY_L1_LOCAL_ST_MISS, + .ctr[0] = _CB(0x0001, B6, L1, 0x0000000c), + .num_counters = 1, + .norm = { 1, 1 }, +}; + +static const struct nvc0_hw_sm_query_cfg +sm30_l1_shared_ld_transactions = +{ + .type = NVC0_HW_SM_QUERY_L1_SHARED_LD_TRANSACTIONS, + .ctr[0] = _CB(0x0001, B6, TRANSACTION, 0x00000008), + .num_counters = 1, + .norm = { 1, 1 }, +}; + +static const struct nvc0_hw_sm_query_cfg +sm30_l1_shared_st_transactions = +{ + .type = NVC0_HW_SM_QUERY_L1_SHARED_ST_TRANSACTIONS, + .ctr[0] = _CB(0x0001, B6, TRANSACTION, 0x0000000c), + .num_counters = 1, + .norm = { 1, 1 }, +}; + +static const struct nvc0_hw_sm_query_cfg +sm30_local_ld = +{ + .type = NVC0_HW_SM_QUERY_LOCAL_LD, + .ctr[0] = _CA(0x0001, B6, LDST, 0x00000008), + .num_counters = 1, + .norm = { 1, 1 }, +}; + +static const struct nvc0_hw_sm_query_cfg +sm30_local_ld_transactions = +{ + .type = NVC0_HW_SM_QUERY_LOCAL_LD_TRANSACTIONS, + .ctr[0] = _CB(0x0001, B6, TRANSACTION, 0x00000000), + .num_counters = 1, + .norm = { 1, 1 }, +}; + +static const struct nvc0_hw_sm_query_cfg +sm30_local_st = +{ + .type = NVC0_HW_SM_QUERY_LOCAL_ST, + .ctr[0] = _CA(0x0001, B6, LDST, 0x0000000c), + .num_counters = 1, + .norm = { 1, 1 }, +}; + +static const struct nvc0_hw_sm_query_cfg +sm30_local_st_transactions = +{ + .type = NVC0_HW_SM_QUERY_LOCAL_ST_TRANSACTIONS, + .ctr[0] = _CB(0x0001, B6, TRANSACTION, 0x00000004), + .num_counters = 1, + .norm = { 1, 1 }, +}; + +static const struct nvc0_hw_sm_query_cfg +sm30_prof_trigger_0 = +{ + .type = NVC0_HW_SM_QUERY_PROF_TRIGGER_0, + .ctr[0] = _CA(0x0001, B6, USER, 0x00000000), + .num_counters = 1, + .norm = { 
1, 1 }, +}; + +static const struct nvc0_hw_sm_query_cfg +sm30_prof_trigger_1 = +{ + .type = NVC0_HW_SM_QUERY_PROF_TRIGGER_1, + .ctr[0] = _CA(0x0001, B6, USER, 0x00000004), + .num_counters = 1, + .norm = { 1, 1 }, +}; + +static const struct nvc0_hw_sm_query_cfg +sm30_prof_trigger_2 = +{ + .type = NVC0_HW_SM_QUERY_PROF_TRIGGER_2, + .ctr[0] = _CA(0x0001, B6, USER, 0x00000008), + .num_counters = 1, + .norm = { 1, 1 }, +}; + +static const struct nvc0_hw_sm_query_cfg +sm30_prof_trigger_3 = +{ + .type = NVC0_HW_SM_QUERY_PROF_TRIGGER_3, + .ctr[0] = _CA(0x0001, B6, USER, 0x0000000c), + .num_counters = 1, + .norm = { 1, 1 }, +}; + +static const struct nvc0_hw_sm_query_cfg +sm30_prof_trigger_4 = +{ + .type = NVC0_HW_SM_QUERY_PROF_TRIGGER_4, + .ctr[0] = _CA(0x0001, B6, USER, 0x00000010), + .num_counters = 1, + .norm = { 1, 1 }, +}; + +static const struct nvc0_hw_sm_query_cfg +sm30_prof_trigger_5 = +{ + .type = NVC0_HW_SM_QUERY_PROF_TRIGGER_5, + .ctr[0] = _CA(0x0001, B6, USER, 0x00000014), + .num_counters = 1, + .norm = { 1, 1 }, +}; + +static const struct nvc0_hw_sm_query_cfg +sm30_prof_trigger_6 = +{ + .type = NVC0_HW_SM_QUERY_PROF_TRIGGER_6, + .ctr[0] = _CA(0x0001, B6, USER, 0x00000018), + .num_counters = 1, + .norm = { 1, 1 }, +}; + +static const struct nvc0_hw_sm_query_cfg +sm30_prof_trigger_7 = +{ + .type = NVC0_HW_SM_QUERY_PROF_TRIGGER_7, + .ctr[0] = _CA(0x0001, B6, USER, 0x0000001c), + .num_counters = 1, + .norm = { 1, 1 }, +}; + +static const struct nvc0_hw_sm_query_cfg +sm30_shared_ld = +{ + .type = NVC0_HW_SM_QUERY_SHARED_LD, + .ctr[0] = _CA(0x0001, B6, LDST, 0x00000000), + .num_counters = 1, + .norm = { 1, 1 }, +}; + +static const struct nvc0_hw_sm_query_cfg +sm30_shared_ld_replay = +{ + .type = NVC0_HW_SM_QUERY_SHARED_LD_REPLAY, + .ctr[0] = _CB(0x0001, B6, REPLAY, 0x00000008), + .num_counters = 1, + .norm = { 1, 1 }, +}; + +static const struct nvc0_hw_sm_query_cfg +sm30_shared_st = +{ + .type = NVC0_HW_SM_QUERY_SHARED_ST, + .ctr[0] = _CA(0x0001, B6, LDST, 0x00000004), + .num_counters = 1, + .norm = { 1, 1 }, +}; + +static const struct nvc0_hw_sm_query_cfg +sm30_shared_st_replay = +{ + .type = NVC0_HW_SM_QUERY_SHARED_ST_REPLAY, + .ctr[0] = _CB(0x0001, B6, REPLAY, 0x0000000c), + .num_counters = 1, + .norm = { 1, 1 }, +}; + +static const struct nvc0_hw_sm_query_cfg +sm30_sm_cta_launched = +{ + .type = NVC0_HW_SM_QUERY_SM_CTA_LAUNCHED, + .ctr[0] = _CB(0x0001, B6, WARP, 0x0000001c), + .num_counters = 1, + .norm = { 1, 1 }, +}; + +static const struct nvc0_hw_sm_query_cfg +sm30_threads_launched = +{ + .type = NVC0_HW_SM_QUERY_THREADS_LAUNCHED, + .ctr[0] = _CA(0x003f, B6, LAUNCH, 0x398a4188), + .num_counters = 1, + .norm = { 1, 1 }, +}; + +static const struct nvc0_hw_sm_query_cfg +sm30_uncached_gld_transactions = +{ + .type = NVC0_HW_SM_QUERY_UNCACHED_GLD_TRANSACTIONS, + .ctr[0] = _CB(0x0001, B6, MEM, 0x00000000), + .num_counters = 1, + .norm = { 1, 1 }, +}; + +static const struct nvc0_hw_sm_query_cfg +sm30_warps_launched = +{ + .type = NVC0_HW_SM_QUERY_WARPS_LAUNCHED, + .ctr[0] = _CA(0x0001, B6, LAUNCH, 0x00000004), + .num_counters = 1, + .norm = { 1, 1 }, +}; /* NOTES: * active_warps: bit 0 alternates btw 0 and 1 for odd nr of warps * inst_executed etc.: we only count a single warp scheduler */ -static const struct nvc0_hw_sm_query_cfg nve4_hw_sm_queries[] = -{ - _Q1B(ACTIVE_CYCLES, 0x0001, B6, WARP, 0x00000000, 1, 1), - _Q1B(ACTIVE_WARPS, 0x003f, B6, WARP, 0x31483104, 2, 1), - _Q1A(ATOM_CAS_COUNT, 0x0001, B6, BRANCH, 0x000000004, 1, 1), - _Q1A(ATOM_COUNT, 0x0001, B6, BRANCH, 0x00000000, 1, 1), 
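
With the designated-index _Q1A/_Q1B macros gone, configs are no longer addressable by enum value; each one carries a .type tag instead and has to be found by a scan, in the same style as nvc0_hw_metric_query_get_cfg() earlier in this diff. The SM-query lookup itself is outside this hunk, so the sketch below is only a guess at its shape:

#include <stddef.h>

struct hw_sm_cfg { unsigned type; /* counters, norm, ... */ };

/* Linear scan over a NULL-free array of config pointers. */
static const struct hw_sm_cfg *
find_sm_cfg(const struct hw_sm_cfg *const *q, size_t n, unsigned type)
{
   for (size_t i = 0; i < n; i++)
      if (q[i]->type == type)
         return q[i];
   return NULL;
}
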
- _Q1A(BRANCH, 0x0001, B6, BRANCH, 0x0000000c, 1, 1), - _Q1A(DIVERGENT_BRANCH, 0x0001, B6, BRANCH, 0x00000010, 1, 1), - _Q1A(GLD_REQUEST, 0x0001, B6, LDST, 0x00000010, 1, 1), - _Q1B(GLD_MEM_DIV_REPLAY, 0x0001, B6, REPLAY, 0x00000010, 1, 1), - _Q1B(GST_TRANSACTIONS, 0x0001, B6, MEM, 0x00000004, 1, 1), - _Q1B(GST_MEM_DIV_REPLAY, 0x0001, B6, REPLAY, 0x00000014, 1, 1), - _Q1A(GRED_COUNT, 0x0001, B6, BRANCH, 0x00000008, 1, 1), - _Q1A(GST_REQUEST, 0x0001, B6, LDST, 0x00000014, 1, 1), - _Q1A(INST_EXECUTED, 0x0003, B6, EXEC, 0x00000398, 1, 1), - _Q1A(INST_ISSUED1, 0x0001, B6, ISSUE, 0x00000004, 1, 1), - _Q1A(INST_ISSUED2, 0x0001, B6, ISSUE, 0x00000008, 1, 1), - _Q1B(L1_GLD_HIT, 0x0001, B6, L1, 0x00000010, 1, 1), - _Q1B(L1_GLD_MISS, 0x0001, B6, L1, 0x00000014, 1, 1), - _Q1B(L1_GLD_TRANSACTIONS, 0x0001, B6, UNK0F, 0x00000000, 1, 1), - _Q1B(L1_GST_TRANSACTIONS, 0x0001, B6, UNK0F, 0x00000004, 1, 1), - _Q1B(L1_LOCAL_LD_HIT, 0x0001, B6, L1, 0x00000000, 1, 1), - _Q1B(L1_LOCAL_LD_MISS, 0x0001, B6, L1, 0x00000004, 1, 1), - _Q1B(L1_LOCAL_ST_HIT, 0x0001, B6, L1, 0x00000008, 1, 1), - _Q1B(L1_LOCAL_ST_MISS, 0x0001, B6, L1, 0x0000000c, 1, 1), - _Q1B(L1_SHARED_LD_TRANSACTIONS, 0x0001, B6, TRANSACTION, 0x00000008, 1, 1), - _Q1B(L1_SHARED_ST_TRANSACTIONS, 0x0001, B6, TRANSACTION, 0x0000000c, 1, 1), - _Q1A(LOCAL_LD, 0x0001, B6, LDST, 0x00000008, 1, 1), - _Q1B(LOCAL_LD_TRANSACTIONS, 0x0001, B6, TRANSACTION, 0x00000000, 1, 1), - _Q1A(LOCAL_ST, 0x0001, B6, LDST, 0x0000000c, 1, 1), - _Q1B(LOCAL_ST_TRANSACTIONS, 0x0001, B6, TRANSACTION, 0x00000004, 1, 1), - _Q1A(PROF_TRIGGER_0, 0x0001, B6, USER, 0x00000000, 1, 1), - _Q1A(PROF_TRIGGER_1, 0x0001, B6, USER, 0x00000004, 1, 1), - _Q1A(PROF_TRIGGER_2, 0x0001, B6, USER, 0x00000008, 1, 1), - _Q1A(PROF_TRIGGER_3, 0x0001, B6, USER, 0x0000000c, 1, 1), - _Q1A(PROF_TRIGGER_4, 0x0001, B6, USER, 0x00000010, 1, 1), - _Q1A(PROF_TRIGGER_5, 0x0001, B6, USER, 0x00000014, 1, 1), - _Q1A(PROF_TRIGGER_6, 0x0001, B6, USER, 0x00000018, 1, 1), - _Q1A(PROF_TRIGGER_7, 0x0001, B6, USER, 0x0000001c, 1, 1), - _Q1A(SHARED_LD, 0x0001, B6, LDST, 0x00000000, 1, 1), - _Q1B(SHARED_LD_REPLAY, 0x0001, B6, REPLAY, 0x00000008, 1, 1), - _Q1A(SHARED_ST, 0x0001, B6, LDST, 0x00000004, 1, 1), - _Q1B(SHARED_ST_REPLAY, 0x0001, B6, REPLAY, 0x0000000c, 1, 1), - _Q1B(SM_CTA_LAUNCHED, 0x0001, B6, WARP, 0x0000001c, 1, 1), - _Q1A(THREADS_LAUNCHED, 0x003f, B6, LAUNCH, 0x398a4188, 1, 1), - _Q1B(UNCACHED_GLD_TRANSACTIONS, 0x0001, B6, MEM, 0x00000000, 1, 1), - _Q1A(WARPS_LAUNCHED, 0x0001, B6, LAUNCH, 0x00000004, 1, 1), -}; - -#undef _Q1A -#undef _Q1B +static const struct nvc0_hw_sm_query_cfg *sm30_hw_sm_queries[] = +{ + &sm30_active_cycles, + &sm30_active_warps, + &sm30_atom_cas_count, + &sm30_atom_count, + &sm30_branch, + &sm30_divergent_branch, + &sm30_gld_request, + &sm30_gld_mem_div_replay, + &sm30_gst_transactions, + &sm30_gst_mem_div_replay, + &sm30_gred_count, + &sm30_gst_request, + &sm30_inst_executed, + &sm30_inst_issued1, + &sm30_inst_issued2, + &sm30_l1_gld_hit, + &sm30_l1_gld_miss, + &sm30_l1_gld_transactions, + &sm30_l1_gst_transactions, + &sm30_l1_local_ld_hit, + &sm30_l1_local_ld_miss, + &sm30_l1_local_st_hit, + &sm30_l1_local_st_miss, + &sm30_l1_shared_ld_transactions, + &sm30_l1_shared_st_transactions, + &sm30_local_ld, + &sm30_local_ld_transactions, + &sm30_local_st, + &sm30_local_st_transactions, + &sm30_prof_trigger_0, + &sm30_prof_trigger_1, + &sm30_prof_trigger_2, + &sm30_prof_trigger_3, + &sm30_prof_trigger_4, + &sm30_prof_trigger_5, + &sm30_prof_trigger_6, + &sm30_prof_trigger_7, + &sm30_shared_ld, + 
&sm30_shared_ld_replay, + &sm30_shared_st, + &sm30_shared_st_replay, + &sm30_sm_cta_launched, + &sm30_threads_launched, + &sm30_uncached_gld_transactions, + &sm30_warps_launched, +}; + +/* ==== Compute capability 3.5 (GK110/GK208) ==== */ +static const struct nvc0_hw_sm_query_cfg +sm35_atom_cas_count = +{ + .type = NVC0_HW_SM_QUERY_ATOM_CAS_COUNT, + .ctr[0] = _CA(0x0001, B6, UNK1A, 0x00000014), + .num_counters = 1, + .norm = { 1, 1 }, +}; + +static const struct nvc0_hw_sm_query_cfg +sm35_atom_count = +{ + .type = NVC0_HW_SM_QUERY_ATOM_COUNT, + .ctr[0] = _CA(0x0001, B6, UNK1A, 0x00000010), + .num_counters = 1, + .norm = { 1, 1 }, +}; + +static const struct nvc0_hw_sm_query_cfg +sm35_gred_count = +{ + .type = NVC0_HW_SM_QUERY_GRED_COUNT, + .ctr[0] = _CA(0x0001, B6, UNK1A, 0x00000018), + .num_counters = 1, + .norm = { 1, 1 }, +}; + +static const struct nvc0_hw_sm_query_cfg +sm35_not_pred_off_inst_executed = +{ + .type = NVC0_HW_SM_QUERY_NOT_PRED_OFF_INST_EXECUTED, + .ctr[0] = _CA(0x003f, B6, UNK14, 0x29062080), + .num_counters = 1, + .norm = { 1, 1 }, +}; + +static const struct nvc0_hw_sm_query_cfg +sm35_shared_ld_replay = +{ + .type = NVC0_HW_SM_QUERY_SHARED_LD_REPLAY, + .ctr[0] = _CB(0xaaaa, LOGOP, UNK13, 0x00000018), + .ctr[1] = _CB(0x8888, LOGOP, REPLAY, 0x00000151), + .num_counters = 2, + .norm = { 1, 1 }, +}; + +static const struct nvc0_hw_sm_query_cfg +sm35_shared_st_replay = +{ + .type = NVC0_HW_SM_QUERY_SHARED_ST_REPLAY, + .ctr[0] = _CB(0xaaaa, LOGOP, UNK13, 0x00000018), + .ctr[1] = _CB(0x8888, LOGOP, REPLAY, 0x000001d1), + .num_counters = 2, + .norm = { 1, 1 }, +}; + +static const struct nvc0_hw_sm_query_cfg +sm35_th_inst_executed = +{ + .type = NVC0_HW_SM_QUERY_TH_INST_EXECUTED, + .ctr[0] = _CA(0x003f, B6, UNK11, 0x29062080), + .num_counters = 1, + .norm = { 1, 1 }, +}; + +static const struct nvc0_hw_sm_query_cfg *sm35_hw_sm_queries[] = +{ + &sm30_active_cycles, + &sm30_active_warps, + &sm35_atom_cas_count, + &sm35_atom_count, + &sm30_gld_request, + &sm30_gld_mem_div_replay, + &sm30_gst_transactions, + &sm30_gst_mem_div_replay, + &sm35_gred_count, + &sm30_gst_request, + &sm30_inst_executed, + &sm30_inst_issued1, + &sm30_inst_issued2, + &sm30_l1_gld_hit, + &sm30_l1_gld_miss, + &sm30_l1_gld_transactions, + &sm30_l1_gst_transactions, + &sm30_l1_local_ld_hit, + &sm30_l1_local_ld_miss, + &sm30_l1_local_st_hit, + &sm30_l1_local_st_miss, + &sm30_l1_shared_ld_transactions, + &sm30_l1_shared_st_transactions, + &sm30_local_ld, + &sm30_local_ld_transactions, + &sm30_local_st, + &sm30_local_st_transactions, + &sm35_not_pred_off_inst_executed, + &sm30_prof_trigger_0, + &sm30_prof_trigger_1, + &sm30_prof_trigger_2, + &sm30_prof_trigger_3, + &sm30_prof_trigger_4, + &sm30_prof_trigger_5, + &sm30_prof_trigger_6, + &sm30_prof_trigger_7, + &sm30_shared_ld, + &sm35_shared_ld_replay, + &sm30_shared_st, + &sm35_shared_st_replay, + &sm30_sm_cta_launched, + &sm35_th_inst_executed, + &sm30_threads_launched, + &sm30_uncached_gld_transactions, + &sm30_warps_launched, +}; + +#undef _Q +#undef _CA +#undef _CB /* === PERFORMANCE MONITORING COUNTERS for NVC0:NVE4 === */ /* NOTES: @@ -257,43 +1092,6 @@ static const struct nvc0_hw_sm_query_cfg nve4_hw_sm_queries[] = * because there is a context-switch problem that we need to fix. * Results might be wrong sometimes, be careful! 
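Several of the sm35 entries above pair two LOGOP counters (sm35_shared_ld_replay, sm35_shared_st_replay). How the driver folds multiple counters into one result is not shown in this hunk; a hedged sketch using the stand-in struct from the earlier sketch, assuming the per-counter totals are simply summed and scaled by norm[0]/norm[1] (every norm above is {1, 1}):

#include <stdint.h>

/* Assumption: counter_sums[] holds each counter's total, already summed
 * over all MPs; the result is their sum scaled by the norm fraction. */
static uint64_t
fold_hw_sm_result(const struct hw_sm_query_cfg *cfg,
                  const uint64_t *counter_sums)
{
   uint64_t value = 0;
   for (unsigned i = 0; i < cfg->num_counters; ++i)
      value += counter_sums[i];
   return value * cfg->norm[0] / cfg->norm[1];
}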
*/ -static const char *nvc0_hw_sm_query_names[] = -{ - /* MP counters */ - "active_cycles", - "active_warps", - "atom_count", - "branch", - "divergent_branch", - "gld_request", - "gred_count", - "gst_request", - "inst_executed", - "inst_issued", - "inst_issued1_0", - "inst_issued1_1", - "inst_issued2_0", - "inst_issued2_1", - "local_load", - "local_store", - "prof_trigger_00", - "prof_trigger_01", - "prof_trigger_02", - "prof_trigger_03", - "prof_trigger_04", - "prof_trigger_05", - "prof_trigger_06", - "prof_trigger_07", - "shared_load", - "shared_store", - "threads_launched", - "thread_inst_executed_0", - "thread_inst_executed_1", - "thread_inst_executed_2", - "thread_inst_executed_3", - "warps_launched", -}; - static const uint64_t nvc0_read_hw_sm_counters_code[] = { /* mov b32 $r8 $tidx @@ -307,14 +1105,14 @@ static const uint64_t nvc0_read_hw_sm_counters_code[] = * mov b32 $r6 $pm6 * mov b32 $r7 $pm7 * set $p0 0x1 eq u32 $r8 0x0 - * mov b32 $r10 c0[0x0] - * mov b32 $r11 c0[0x4] + * mov b32 $r10 c15[0x620] + * mov b32 $r11 c15[0x624] * ext u32 $r8 $r9 0x414 * (not $p0) exit * mul $r8 u32 $r8 u32 48 * add b32 $r10 $c $r10 $r8 * add b32 $r11 $r11 0x0 $c - * mov b32 $r8 c0[0x8] + * mov b32 $r8 c15[0x628] * st b128 wt g[$r10d+0x00] $r0q * st b128 wt g[$r10d+0x10] $r4q * st b32 wt g[$r10d+0x20] $r8 @@ -330,14 +1128,14 @@ static const uint64_t nvc0_read_hw_sm_counters_code[] = 0x2c00000028019c04ULL, 0x2c0000002c01dc04ULL, 0x190e0000fc81dc03ULL, - 0x2800400000029de4ULL, - 0x280040001002dde4ULL, + 0x28007c1880029de4ULL, + 0x28007c189002dde4ULL, 0x7000c01050921c03ULL, 0x80000000000021e7ULL, 0x10000000c0821c02ULL, 0x4801000020a29c03ULL, 0x0800000000b2dc42ULL, - 0x2800400020021de4ULL, + 0x28007c18a0021de4ULL, 0x9400000000a01fc5ULL, 0x9400000040a11fc5ULL, 0x9400000080a21f85ULL, @@ -345,12 +1143,12 @@ static const uint64_t nvc0_read_hw_sm_counters_code[] = }; #define _C(f, o, g, m, s) { f, NVC0_COMPUTE_MP_PM_OP_MODE_##o, 0, g, m, s } -#define _Q(n, c) [NVC0_HW_SM_QUERY_##n] = c /* ==== Compute capability 2.0 (GF100/GF110) ==== */ static const struct nvc0_hw_sm_query_cfg sm20_active_cycles = { + .type = NVC0_HW_SM_QUERY_ACTIVE_CYCLES, .ctr[0] = _C(0xaaaa, LOGOP, 0x11, 0x000000ff, 0x00000000), .num_counters = 1, .norm = { 1, 1 }, @@ -359,6 +1157,7 @@ sm20_active_cycles = static const struct nvc0_hw_sm_query_cfg sm20_active_warps = { + .type = NVC0_HW_SM_QUERY_ACTIVE_WARPS, .ctr[0] = _C(0xaaaa, LOGOP, 0x24, 0x000000ff, 0x00000010), .ctr[1] = _C(0xaaaa, LOGOP, 0x24, 0x000000ff, 0x00000020), .ctr[2] = _C(0xaaaa, LOGOP, 0x24, 0x000000ff, 0x00000030), @@ -372,6 +1171,7 @@ sm20_active_warps = static const struct nvc0_hw_sm_query_cfg sm20_atom_count = { + .type = NVC0_HW_SM_QUERY_ATOM_COUNT, .ctr[0] = _C(0xaaaa, LOGOP, 0x63, 0x000000ff, 0x00000030), .num_counters = 1, .norm = { 1, 1 }, @@ -380,6 +1180,7 @@ sm20_atom_count = static const struct nvc0_hw_sm_query_cfg sm20_branch = { + .type = NVC0_HW_SM_QUERY_BRANCH, .ctr[0] = _C(0xaaaa, LOGOP, 0x1a, 0x000000ff, 0x00000000), .ctr[1] = _C(0xaaaa, LOGOP, 0x1a, 0x000000ff, 0x00000010), .num_counters = 2, @@ -389,6 +1190,7 @@ sm20_branch = static const struct nvc0_hw_sm_query_cfg sm20_divergent_branch = { + .type = NVC0_HW_SM_QUERY_DIVERGENT_BRANCH, .ctr[0] = _C(0xaaaa, LOGOP, 0x19, 0x000000ff, 0x00000020), .ctr[1] = _C(0xaaaa, LOGOP, 0x19, 0x000000ff, 0x00000030), .num_counters = 2, @@ -398,6 +1200,7 @@ sm20_divergent_branch = static const struct nvc0_hw_sm_query_cfg sm20_gld_request = { + .type = NVC0_HW_SM_QUERY_GLD_REQUEST, .ctr[0] = _C(0xaaaa, LOGOP, 0x64, 
0x000000ff, 0x00000030), .num_counters = 1, .norm = { 1, 1 }, @@ -406,6 +1209,7 @@ sm20_gld_request = static const struct nvc0_hw_sm_query_cfg sm20_gred_count = { + .type = NVC0_HW_SM_QUERY_GRED_COUNT, .ctr[0] = _C(0xaaaa, LOGOP, 0x63, 0x000000ff, 0x00000040), .num_counters = 1, .norm = { 1, 1 }, @@ -414,6 +1218,7 @@ sm20_gred_count = static const struct nvc0_hw_sm_query_cfg sm20_gst_request = { + .type = NVC0_HW_SM_QUERY_GST_REQUEST, .ctr[0] = _C(0xaaaa, LOGOP, 0x64, 0x000000ff, 0x00000060), .num_counters = 1, .norm = { 1, 1 }, @@ -422,6 +1227,7 @@ sm20_gst_request = static const struct nvc0_hw_sm_query_cfg sm20_inst_executed = { + .type = NVC0_HW_SM_QUERY_INST_EXECUTED, .ctr[0] = _C(0xaaaa, LOGOP, 0x2d, 0x0000ffff, 0x00001000), .ctr[1] = _C(0xaaaa, LOGOP, 0x2d, 0x0000ffff, 0x00001010), .num_counters = 2, @@ -431,6 +1237,7 @@ sm20_inst_executed = static const struct nvc0_hw_sm_query_cfg sm20_inst_issued = { + .type = NVC0_HW_SM_QUERY_INST_ISSUED, .ctr[0] = _C(0xaaaa, LOGOP, 0x27, 0x0000ffff, 0x00007060), .ctr[1] = _C(0xaaaa, LOGOP, 0x27, 0x0000ffff, 0x00007070), .num_counters = 2, @@ -440,6 +1247,7 @@ sm20_inst_issued = static const struct nvc0_hw_sm_query_cfg sm20_local_ld = { + .type = NVC0_HW_SM_QUERY_LOCAL_LD, .ctr[0] = _C(0xaaaa, LOGOP, 0x64, 0x000000ff, 0x00000020), .num_counters = 1, .norm = { 1, 1 }, @@ -448,6 +1256,7 @@ sm20_local_ld = static const struct nvc0_hw_sm_query_cfg sm20_local_st = { + .type = NVC0_HW_SM_QUERY_LOCAL_ST, .ctr[0] = _C(0xaaaa, LOGOP, 0x64, 0x000000ff, 0x00000050), .num_counters = 1, .norm = { 1, 1 }, @@ -456,6 +1265,7 @@ sm20_local_st = static const struct nvc0_hw_sm_query_cfg sm20_prof_trigger_0 = { + .type = NVC0_HW_SM_QUERY_PROF_TRIGGER_0, .ctr[0] = _C(0xaaaa, LOGOP, 0x01, 0x000000ff, 0x00000000), .num_counters = 1, .norm = { 1, 1 }, @@ -464,6 +1274,7 @@ sm20_prof_trigger_0 = static const struct nvc0_hw_sm_query_cfg sm20_prof_trigger_1 = { + .type = NVC0_HW_SM_QUERY_PROF_TRIGGER_1, .ctr[0] = _C(0xaaaa, LOGOP, 0x01, 0x000000ff, 0x00000010), .num_counters = 1, .norm = { 1, 1 }, @@ -472,6 +1283,7 @@ sm20_prof_trigger_1 = static const struct nvc0_hw_sm_query_cfg sm20_prof_trigger_2 = { + .type = NVC0_HW_SM_QUERY_PROF_TRIGGER_2, .ctr[0] = _C(0xaaaa, LOGOP, 0x01, 0x000000ff, 0x00000020), .num_counters = 1, .norm = { 1, 1 }, @@ -480,6 +1292,7 @@ sm20_prof_trigger_2 = static const struct nvc0_hw_sm_query_cfg sm20_prof_trigger_3 = { + .type = NVC0_HW_SM_QUERY_PROF_TRIGGER_3, .ctr[0] = _C(0xaaaa, LOGOP, 0x01, 0x000000ff, 0x00000030), .num_counters = 1, .norm = { 1, 1 }, @@ -488,6 +1301,7 @@ sm20_prof_trigger_3 = static const struct nvc0_hw_sm_query_cfg sm20_prof_trigger_4 = { + .type = NVC0_HW_SM_QUERY_PROF_TRIGGER_4, .ctr[0] = _C(0xaaaa, LOGOP, 0x01, 0x000000ff, 0x00000040), .num_counters = 1, .norm = { 1, 1 }, @@ -496,6 +1310,7 @@ sm20_prof_trigger_4 = static const struct nvc0_hw_sm_query_cfg sm20_prof_trigger_5 = { + .type = NVC0_HW_SM_QUERY_PROF_TRIGGER_5, .ctr[0] = _C(0xaaaa, LOGOP, 0x01, 0x000000ff, 0x00000050), .num_counters = 1, .norm = { 1, 1 }, @@ -504,6 +1319,7 @@ sm20_prof_trigger_5 = static const struct nvc0_hw_sm_query_cfg sm20_prof_trigger_6 = { + .type = NVC0_HW_SM_QUERY_PROF_TRIGGER_6, .ctr[0] = _C(0xaaaa, LOGOP, 0x01, 0x000000ff, 0x00000060), .num_counters = 1, .norm = { 1, 1 }, @@ -512,6 +1328,7 @@ sm20_prof_trigger_6 = static const struct nvc0_hw_sm_query_cfg sm20_prof_trigger_7 = { + .type = NVC0_HW_SM_QUERY_PROF_TRIGGER_7, .ctr[0] = _C(0xaaaa, LOGOP, 0x01, 0x000000ff, 0x00000070), .num_counters = 1, .norm = { 1, 1 }, @@ -520,6 +1337,7 @@ 
sm20_prof_trigger_7 = static const struct nvc0_hw_sm_query_cfg sm20_shared_ld = { + .type = NVC0_HW_SM_QUERY_SHARED_LD, .ctr[0] = _C(0xaaaa, LOGOP, 0x64, 0x000000ff, 0x00000010), .num_counters = 1, .norm = { 1, 1 }, @@ -528,6 +1346,7 @@ sm20_shared_ld = static const struct nvc0_hw_sm_query_cfg sm20_shared_st = { + .type = NVC0_HW_SM_QUERY_SHARED_ST, .ctr[0] = _C(0xaaaa, LOGOP, 0x64, 0x000000ff, 0x00000040), .num_counters = 1, .norm = { 1, 1 }, @@ -536,6 +1355,7 @@ sm20_shared_st = static const struct nvc0_hw_sm_query_cfg sm20_threads_launched = { + .type = NVC0_HW_SM_QUERY_THREADS_LAUNCHED, .ctr[0] = _C(0xaaaa, LOGOP, 0x26, 0x000000ff, 0x00000010), .ctr[1] = _C(0xaaaa, LOGOP, 0x26, 0x000000ff, 0x00000020), .ctr[2] = _C(0xaaaa, LOGOP, 0x26, 0x000000ff, 0x00000030), @@ -549,6 +1369,7 @@ sm20_threads_launched = static const struct nvc0_hw_sm_query_cfg sm20_th_inst_executed_0 = { + .type = NVC0_HW_SM_QUERY_TH_INST_EXECUTED_0, .ctr[0] = _C(0xaaaa, LOGOP, 0x2f, 0x000000ff, 0x00000000), .ctr[1] = _C(0xaaaa, LOGOP, 0x2f, 0x000000ff, 0x00000010), .ctr[2] = _C(0xaaaa, LOGOP, 0x2f, 0x000000ff, 0x00000020), @@ -562,6 +1383,7 @@ sm20_th_inst_executed_0 = static const struct nvc0_hw_sm_query_cfg sm20_th_inst_executed_1 = { + .type = NVC0_HW_SM_QUERY_TH_INST_EXECUTED_1, .ctr[0] = _C(0xaaaa, LOGOP, 0x30, 0x000000ff, 0x00000000), .ctr[1] = _C(0xaaaa, LOGOP, 0x30, 0x000000ff, 0x00000010), .ctr[2] = _C(0xaaaa, LOGOP, 0x30, 0x000000ff, 0x00000020), @@ -575,6 +1397,7 @@ sm20_th_inst_executed_1 = static const struct nvc0_hw_sm_query_cfg sm20_warps_launched = { + .type = NVC0_HW_SM_QUERY_WARPS_LAUNCHED, .ctr[0] = _C(0xaaaa, LOGOP, 0x26, 0x000000ff, 0x00000000), .num_counters = 1, .norm = { 1, 1 }, @@ -582,44 +1405,39 @@ sm20_warps_launched = static const struct nvc0_hw_sm_query_cfg *sm20_hw_sm_queries[] = { - _Q(ACTIVE_CYCLES, &sm20_active_cycles), - _Q(ACTIVE_WARPS, &sm20_active_warps), - _Q(ATOM_COUNT, &sm20_atom_count), - _Q(BRANCH, &sm20_branch), - _Q(DIVERGENT_BRANCH, &sm20_divergent_branch), - _Q(GLD_REQUEST, &sm20_gld_request), - _Q(GRED_COUNT, &sm20_gred_count), - _Q(GST_REQUEST, &sm20_gst_request), - _Q(INST_EXECUTED, &sm20_inst_executed), - _Q(INST_ISSUED, &sm20_inst_issued), - _Q(INST_ISSUED1_0, NULL), - _Q(INST_ISSUED1_1, NULL), - _Q(INST_ISSUED2_0, NULL), - _Q(INST_ISSUED2_1, NULL), - _Q(LOCAL_LD, &sm20_local_ld), - _Q(LOCAL_ST, &sm20_local_st), - _Q(PROF_TRIGGER_0, &sm20_prof_trigger_0), - _Q(PROF_TRIGGER_1, &sm20_prof_trigger_1), - _Q(PROF_TRIGGER_2, &sm20_prof_trigger_2), - _Q(PROF_TRIGGER_3, &sm20_prof_trigger_3), - _Q(PROF_TRIGGER_4, &sm20_prof_trigger_4), - _Q(PROF_TRIGGER_5, &sm20_prof_trigger_5), - _Q(PROF_TRIGGER_6, &sm20_prof_trigger_6), - _Q(PROF_TRIGGER_7, &sm20_prof_trigger_7), - _Q(SHARED_LD, &sm20_shared_ld), - _Q(SHARED_ST, &sm20_shared_st), - _Q(THREADS_LAUNCHED, &sm20_threads_launched), - _Q(TH_INST_EXECUTED_0, &sm20_th_inst_executed_0), - _Q(TH_INST_EXECUTED_1, &sm20_th_inst_executed_1), - _Q(TH_INST_EXECUTED_2, NULL), - _Q(TH_INST_EXECUTED_3, NULL), - _Q(WARPS_LAUNCHED, &sm20_warps_launched), + &sm20_active_cycles, + &sm20_active_warps, + &sm20_atom_count, + &sm20_branch, + &sm20_divergent_branch, + &sm20_gld_request, + &sm20_gred_count, + &sm20_gst_request, + &sm20_inst_executed, + &sm20_inst_issued, + &sm20_local_ld, + &sm20_local_st, + &sm20_prof_trigger_0, + &sm20_prof_trigger_1, + &sm20_prof_trigger_2, + &sm20_prof_trigger_3, + &sm20_prof_trigger_4, + &sm20_prof_trigger_5, + &sm20_prof_trigger_6, + &sm20_prof_trigger_7, + &sm20_shared_ld, + &sm20_shared_st, + 
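The sm20/sm21 tables are being restructured the same way: the old arrays were indexed by query enum through _Q(), with NULL entries for queries a chip lacks (INST_ISSUED1_0 on GF100, for instance), while the new arrays are dense lists of supported configs only. A sketch of the two lookup styles; lookup_sparse, lookup_dense and QUERY_BASE are hypothetical names, with QUERY_BASE standing in for PIPE_QUERY_DRIVER_SPECIFIC:

enum { QUERY_BASE = 0x10000 };  /* placeholder for PIPE_QUERY_DRIVER_SPECIFIC */

/* Old layout: indexed by enum value, NULL means "not on this chip", so
 * enumeration had to skip holes (see nvc0_hw_sm_get_next_query_id being
 * deleted later in this diff). */
static const struct hw_sm_query_cfg *
lookup_sparse(const struct hw_sm_query_cfg **table, unsigned type)
{
   return table[type - QUERY_BASE]; /* may be NULL */
}

/* New layout: dense list of supported queries; match on ->type. */
static const struct hw_sm_query_cfg *
lookup_dense(const struct hw_sm_query_cfg **table, unsigned n, unsigned type)
{
   for (unsigned i = 0; i < n; ++i)
      if (QUERY_BASE + table[i]->type == type)
         return table[i];
   return NULL;
}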
&sm20_threads_launched, + &sm20_th_inst_executed_0, + &sm20_th_inst_executed_1, + &sm20_warps_launched, }; /* ==== Compute capability 2.1 (GF108+ except GF110) ==== */ static const struct nvc0_hw_sm_query_cfg sm21_inst_executed = { + .type = NVC0_HW_SM_QUERY_INST_EXECUTED, .ctr[0] = _C(0xaaaa, LOGOP, 0x2d, 0x000000ff, 0x00000000), .ctr[1] = _C(0xaaaa, LOGOP, 0x2d, 0x000000ff, 0x00000010), .ctr[2] = _C(0xaaaa, LOGOP, 0x2d, 0x000000ff, 0x00000020), @@ -630,6 +1448,7 @@ sm21_inst_executed = static const struct nvc0_hw_sm_query_cfg sm21_inst_issued1_0 = { + .type = NVC0_HW_SM_QUERY_INST_ISSUED1_0, .ctr[0] = _C(0xaaaa, LOGOP, 0x7e, 0x000000ff, 0x00000010), .num_counters = 1, .norm = { 1, 1 }, @@ -638,6 +1457,7 @@ sm21_inst_issued1_0 = static const struct nvc0_hw_sm_query_cfg sm21_inst_issued1_1 = { + .type = NVC0_HW_SM_QUERY_INST_ISSUED1_1, .ctr[0] = _C(0xaaaa, LOGOP, 0x7e, 0x000000ff, 0x00000040), .num_counters = 1, .norm = { 1, 1 }, @@ -646,6 +1466,7 @@ sm21_inst_issued1_1 = static const struct nvc0_hw_sm_query_cfg sm21_inst_issued2_0 = { + .type = NVC0_HW_SM_QUERY_INST_ISSUED2_0, .ctr[0] = _C(0xaaaa, LOGOP, 0x7e, 0x000000ff, 0x00000020), .num_counters = 1, .norm = { 1, 1 }, @@ -654,6 +1475,7 @@ sm21_inst_issued2_0 = static const struct nvc0_hw_sm_query_cfg sm21_inst_issued2_1 = { + .type = NVC0_HW_SM_QUERY_INST_ISSUED2_1, .ctr[0] = _C(0xaaaa, LOGOP, 0x7e, 0x000000ff, 0x00000050), .num_counters = 1, .norm = { 1, 1 }, @@ -662,6 +1484,7 @@ sm21_inst_issued2_1 = static const struct nvc0_hw_sm_query_cfg sm21_th_inst_executed_0 = { + .type = NVC0_HW_SM_QUERY_TH_INST_EXECUTED_0, .ctr[0] = _C(0xaaaa, LOGOP, 0xa3, 0x000000ff, 0x00000000), .ctr[1] = _C(0xaaaa, LOGOP, 0xa3, 0x000000ff, 0x00000010), .ctr[2] = _C(0xaaaa, LOGOP, 0xa3, 0x000000ff, 0x00000020), @@ -675,6 +1498,7 @@ sm21_th_inst_executed_0 = static const struct nvc0_hw_sm_query_cfg sm21_th_inst_executed_1 = { + .type = NVC0_HW_SM_QUERY_TH_INST_EXECUTED_1, .ctr[0] = _C(0xaaaa, LOGOP, 0xa5, 0x000000ff, 0x00000000), .ctr[1] = _C(0xaaaa, LOGOP, 0xa5, 0x000000ff, 0x00000010), .ctr[2] = _C(0xaaaa, LOGOP, 0xa5, 0x000000ff, 0x00000020), @@ -688,6 +1512,7 @@ sm21_th_inst_executed_1 = static const struct nvc0_hw_sm_query_cfg sm21_th_inst_executed_2 = { + .type = NVC0_HW_SM_QUERY_TH_INST_EXECUTED_2, .ctr[0] = _C(0xaaaa, LOGOP, 0xa4, 0x000000ff, 0x00000000), .ctr[1] = _C(0xaaaa, LOGOP, 0xa4, 0x000000ff, 0x00000010), .ctr[2] = _C(0xaaaa, LOGOP, 0xa4, 0x000000ff, 0x00000020), @@ -701,6 +1526,7 @@ sm21_th_inst_executed_2 = static const struct nvc0_hw_sm_query_cfg sm21_th_inst_executed_3 = { + .type = NVC0_HW_SM_QUERY_TH_INST_EXECUTED_3, .ctr[0] = _C(0xaaaa, LOGOP, 0xa6, 0x000000ff, 0x00000000), .ctr[1] = _C(0xaaaa, LOGOP, 0xa6, 0x000000ff, 0x00000010), .ctr[2] = _C(0xaaaa, LOGOP, 0xa6, 0x000000ff, 0x00000020), @@ -713,41 +1539,39 @@ sm21_th_inst_executed_3 = static const struct nvc0_hw_sm_query_cfg *sm21_hw_sm_queries[] = { - _Q(ACTIVE_CYCLES, &sm20_active_cycles), - _Q(ACTIVE_WARPS, &sm20_active_warps), - _Q(ATOM_COUNT, &sm20_atom_count), - _Q(BRANCH, &sm20_branch), - _Q(DIVERGENT_BRANCH, &sm20_divergent_branch), - _Q(GLD_REQUEST, &sm20_gld_request), - _Q(GRED_COUNT, &sm20_gred_count), - _Q(GST_REQUEST, &sm20_gst_request), - _Q(INST_EXECUTED, &sm21_inst_executed), - _Q(INST_ISSUED, NULL), - _Q(INST_ISSUED1_0, &sm21_inst_issued1_0), - _Q(INST_ISSUED1_1, &sm21_inst_issued1_1), - _Q(INST_ISSUED2_0, &sm21_inst_issued2_0), - _Q(INST_ISSUED2_1, &sm21_inst_issued2_1), - _Q(LOCAL_LD, &sm20_local_ld), - _Q(LOCAL_ST, &sm20_local_st), - _Q(PROF_TRIGGER_0, 
&sm20_prof_trigger_0), - _Q(PROF_TRIGGER_1, &sm20_prof_trigger_1), - _Q(PROF_TRIGGER_2, &sm20_prof_trigger_2), - _Q(PROF_TRIGGER_3, &sm20_prof_trigger_3), - _Q(PROF_TRIGGER_4, &sm20_prof_trigger_4), - _Q(PROF_TRIGGER_5, &sm20_prof_trigger_5), - _Q(PROF_TRIGGER_6, &sm20_prof_trigger_6), - _Q(PROF_TRIGGER_7, &sm20_prof_trigger_7), - _Q(SHARED_LD, &sm20_shared_ld), - _Q(SHARED_ST, &sm20_shared_st), - _Q(THREADS_LAUNCHED, &sm20_threads_launched), - _Q(TH_INST_EXECUTED_0, &sm21_th_inst_executed_0), - _Q(TH_INST_EXECUTED_1, &sm21_th_inst_executed_1), - _Q(TH_INST_EXECUTED_2, &sm21_th_inst_executed_2), - _Q(TH_INST_EXECUTED_3, &sm21_th_inst_executed_3), - _Q(WARPS_LAUNCHED, &sm20_warps_launched), + &sm20_active_cycles, + &sm20_active_warps, + &sm20_atom_count, + &sm20_branch, + &sm20_divergent_branch, + &sm20_gld_request, + &sm20_gred_count, + &sm20_gst_request, + &sm21_inst_executed, + &sm21_inst_issued1_0, + &sm21_inst_issued1_1, + &sm21_inst_issued2_0, + &sm21_inst_issued2_1, + &sm20_local_ld, + &sm20_local_st, + &sm20_prof_trigger_0, + &sm20_prof_trigger_1, + &sm20_prof_trigger_2, + &sm20_prof_trigger_3, + &sm20_prof_trigger_4, + &sm20_prof_trigger_5, + &sm20_prof_trigger_6, + &sm20_prof_trigger_7, + &sm20_shared_ld, + &sm20_shared_st, + &sm20_threads_launched, + &sm21_th_inst_executed_0, + &sm21_th_inst_executed_1, + &sm21_th_inst_executed_2, + &sm21_th_inst_executed_3, + &sm20_warps_launched, }; -#undef _Q #undef _C static inline const struct nvc0_hw_sm_query_cfg ** @@ -755,26 +1579,55 @@ nvc0_hw_sm_get_queries(struct nvc0_screen *screen) { struct nouveau_device *dev = screen->base.device; - if (dev->chipset == 0xc0 || dev->chipset == 0xc8) - return sm20_hw_sm_queries; - return sm21_hw_sm_queries; + switch (screen->base.class_3d) { + case NVF0_3D_CLASS: + return sm35_hw_sm_queries; + case NVE4_3D_CLASS: + return sm30_hw_sm_queries; + default: + if (dev->chipset == 0xc0 || dev->chipset == 0xc8) + return sm20_hw_sm_queries; + return sm21_hw_sm_queries; + } + assert(0); + return NULL; +} + +unsigned +nvc0_hw_sm_get_num_queries(struct nvc0_screen *screen) +{ + struct nouveau_device *dev = screen->base.device; + + switch (screen->base.class_3d) { + case NVF0_3D_CLASS: + return ARRAY_SIZE(sm35_hw_sm_queries); + case NVE4_3D_CLASS: + return ARRAY_SIZE(sm30_hw_sm_queries); + default: + if (dev->chipset == 0xc0 || dev->chipset == 0xc8) + return ARRAY_SIZE(sm20_hw_sm_queries); + return ARRAY_SIZE(sm21_hw_sm_queries); + } + return 0; } static const struct nvc0_hw_sm_query_cfg * nvc0_hw_sm_query_get_cfg(struct nvc0_context *nvc0, struct nvc0_hw_query *hq) { + const struct nvc0_hw_sm_query_cfg **queries; struct nvc0_screen *screen = nvc0->screen; struct nvc0_query *q = &hq->base; + unsigned num_queries; + unsigned i; - if (screen->base.class_3d >= NVE4_3D_CLASS) - return &nve4_hw_sm_queries[q->type - PIPE_QUERY_DRIVER_SPECIFIC]; + num_queries = nvc0_hw_sm_get_num_queries(screen); + queries = nvc0_hw_sm_get_queries(screen); - if (q->type >= NVC0_HW_SM_QUERY(0) && q->type <= NVC0_HW_SM_QUERY_LAST) { - const struct nvc0_hw_sm_query_cfg **queries = - nvc0_hw_sm_get_queries(screen); - return queries[q->type - NVC0_HW_SM_QUERY(0)]; + for (i = 0; i < num_queries; i++) { + if (NVC0_HW_SM_QUERY(queries[i]->type) == q->type) + return queries[i]; } - debug_printf("invalid query type: %d\n", q->type); + assert(0); return NULL; } @@ -846,15 +1699,15 @@ nve4_hw_sm_begin_query(struct nvc0_context *nvc0, struct nvc0_hw_query *hq) /* configure and reset the counter(s) */ if (d == 0) - BEGIN_NVC0(push, 
NVE4_COMPUTE(MP_PM_A_SIGSEL(c & 3)), 1); + BEGIN_NVC0(push, NVE4_CP(MP_PM_A_SIGSEL(c & 3)), 1); else - BEGIN_NVC0(push, NVE4_COMPUTE(MP_PM_B_SIGSEL(c & 3)), 1); + BEGIN_NVC0(push, NVE4_CP(MP_PM_B_SIGSEL(c & 3)), 1); PUSH_DATA (push, cfg->ctr[i].sig_sel); - BEGIN_NVC0(push, NVE4_COMPUTE(MP_PM_SRCSEL(c)), 1); + BEGIN_NVC0(push, NVE4_CP(MP_PM_SRCSEL(c)), 1); PUSH_DATA (push, cfg->ctr[i].src_sel + 0x2108421 * (c & 3)); - BEGIN_NVC0(push, NVE4_COMPUTE(MP_PM_FUNC(c)), 1); + BEGIN_NVC0(push, NVE4_CP(MP_PM_FUNC(c)), 1); PUSH_DATA (push, (cfg->ctr[i].func << 4) | cfg->ctr[i].mode); - BEGIN_NVC0(push, NVE4_COMPUTE(MP_PM_SET(c)), 1); + BEGIN_NVC0(push, NVE4_CP(MP_PM_SET(c)), 1); PUSH_DATA (push, 0); } return true; @@ -917,18 +1770,83 @@ nvc0_hw_sm_begin_query(struct nvc0_context *nvc0, struct nvc0_hw_query *hq) mask_sel &= cfg->ctr[i].src_mask; /* configure and reset the counter(s) */ - BEGIN_NVC0(push, NVC0_COMPUTE(MP_PM_SIGSEL(c)), 1); + BEGIN_NVC0(push, NVC0_CP(MP_PM_SIGSEL(c)), 1); PUSH_DATA (push, cfg->ctr[i].sig_sel); - BEGIN_NVC0(push, NVC0_COMPUTE(MP_PM_SRCSEL(c)), 1); + BEGIN_NVC0(push, NVC0_CP(MP_PM_SRCSEL(c)), 1); PUSH_DATA (push, cfg->ctr[i].src_sel | mask_sel); - BEGIN_NVC0(push, NVC0_COMPUTE(MP_PM_OP(c)), 1); + BEGIN_NVC0(push, NVC0_CP(MP_PM_OP(c)), 1); PUSH_DATA (push, (cfg->ctr[i].func << 4) | cfg->ctr[i].mode); - BEGIN_NVC0(push, NVC0_COMPUTE(MP_PM_SET(c)), 1); + BEGIN_NVC0(push, NVC0_CP(MP_PM_SET(c)), 1); PUSH_DATA (push, 0); } return true; } +static inline struct nvc0_program * +nvc0_hw_sm_get_program(struct nvc0_screen *screen) +{ + struct nvc0_program *prog; + + prog = CALLOC_STRUCT(nvc0_program); + if (!prog) + return NULL; + + prog->type = PIPE_SHADER_COMPUTE; + prog->translated = true; + prog->parm_size = 12; + + if (screen->base.class_3d == NVE4_3D_CLASS || + screen->base.class_3d == NVF0_3D_CLASS) { + if (screen->base.class_3d == NVE4_3D_CLASS) { + prog->code = (uint32_t *)nve4_read_hw_sm_counters_code; + prog->code_size = sizeof(nve4_read_hw_sm_counters_code); + } else { + prog->code = (uint32_t *)nvf0_read_hw_sm_counters_code; + prog->code_size = sizeof(nvf0_read_hw_sm_counters_code); + } + prog->num_gprs = 14; + } else { + prog->code = (uint32_t *)nvc0_read_hw_sm_counters_code; + prog->code_size = sizeof(nvc0_read_hw_sm_counters_code); + prog->num_gprs = 12; + } + return prog; +} + +static inline void +nvc0_hw_sm_upload_input(struct nvc0_context *nvc0, struct nvc0_hw_query *hq) +{ + struct nouveau_pushbuf *push = nvc0->base.pushbuf; + struct nvc0_screen *screen = nvc0->screen; + uint64_t address; + const int s = 5; + + address = screen->uniform_bo->offset + NVC0_CB_AUX_INFO(s); + + PUSH_SPACE(push, 11); + + if (screen->base.class_3d >= NVE4_3D_CLASS) { + BEGIN_NVC0(push, NVE4_CP(UPLOAD_DST_ADDRESS_HIGH), 2); + PUSH_DATAh(push, address + NVC0_CB_AUX_MP_INFO); + PUSH_DATA (push, address + NVC0_CB_AUX_MP_INFO); + BEGIN_NVC0(push, NVE4_CP(UPLOAD_LINE_LENGTH_IN), 2); + PUSH_DATA (push, 3 * 4); + PUSH_DATA (push, 0x1); + BEGIN_1IC0(push, NVE4_CP(UPLOAD_EXEC), 1 + 3); + PUSH_DATA (push, NVE4_COMPUTE_UPLOAD_EXEC_LINEAR | (0x20 << 1)); + } else { + BEGIN_NVC0(push, NVC0_CP(CB_SIZE), 3); + PUSH_DATA (push, NVC0_CB_AUX_SIZE); + PUSH_DATAh(push, address); + PUSH_DATA (push, address); + BEGIN_1IC0(push, NVC0_CP(CB_POS), 1 + 3); + PUSH_DATA (push, NVC0_CB_AUX_MP_INFO); + } + PUSH_DATA (push, (hq->bo->offset + hq->base_offset)); + PUSH_DATAh(push, (hq->bo->offset + hq->base_offset)); + PUSH_DATA (push, hq->sequence); +} + static void nvc0_hw_sm_end_query(struct nvc0_context *nvc0, 
struct nvc0_hw_query *hq) { @@ -937,6 +1855,7 @@ nvc0_hw_sm_end_query(struct nvc0_context *nvc0, struct nvc0_hw_query *hq) struct nouveau_pushbuf *push = nvc0->base.pushbuf; const bool is_nve4 = screen->base.class_3d >= NVE4_3D_CLASS; struct nvc0_hw_sm_query *hsq = nvc0_hw_sm_query(hq); + struct nvc0_program *old = nvc0->compprog; struct pipe_grid_info info = {}; uint32_t mask; uint32_t input[3]; @@ -944,31 +1863,17 @@ nvc0_hw_sm_end_query(struct nvc0_context *nvc0, struct nvc0_hw_query *hq) const uint grid[3] = { screen->mp_count, screen->gpc_count, 1 }; unsigned c, i; - if (unlikely(!screen->pm.prog)) { - struct nvc0_program *prog = CALLOC_STRUCT(nvc0_program); - prog->type = PIPE_SHADER_COMPUTE; - prog->translated = true; - prog->parm_size = 12; - if (is_nve4) { - prog->code = (uint32_t *)nve4_read_hw_sm_counters_code; - prog->code_size = sizeof(nve4_read_hw_sm_counters_code); - prog->num_gprs = 14; - } else { - prog->code = (uint32_t *)nvc0_read_hw_sm_counters_code; - prog->code_size = sizeof(nvc0_read_hw_sm_counters_code); - prog->num_gprs = 12; - } - screen->pm.prog = prog; - } + if (unlikely(!screen->pm.prog)) + screen->pm.prog = nvc0_hw_sm_get_program(screen); /* disable all counting */ PUSH_SPACE(push, 8); for (c = 0; c < 8; ++c) if (screen->pm.mp_counter[c]) { if (is_nve4) { - IMMED_NVC0(push, NVE4_COMPUTE(MP_PM_FUNC(c)), 0); + IMMED_NVC0(push, NVE4_CP(MP_PM_FUNC(c)), 0); } else { - IMMED_NVC0(push, NVC0_COMPUTE(MP_PM_OP(c)), 0); + IMMED_NVC0(push, NVC0_CP(MP_PM_OP(c)), 0); } } /* release counters for this query */ @@ -984,13 +1889,12 @@ nvc0_hw_sm_end_query(struct nvc0_context *nvc0, struct nvc0_hw_query *hq) hq->bo); PUSH_SPACE(push, 1); - IMMED_NVC0(push, SUBC_COMPUTE(NV50_GRAPH_SERIALIZE), 0); + IMMED_NVC0(push, SUBC_CP(NV50_GRAPH_SERIALIZE), 0); - pipe->bind_compute_state(pipe, screen->pm.prog); - input[0] = (hq->bo->offset + hq->base_offset); - input[1] = (hq->bo->offset + hq->base_offset) >> 32; - input[2] = hq->sequence; + /* upload input data for the compute shader which reads MP counters */ + nvc0_hw_sm_upload_input(nvc0, hq); + pipe->bind_compute_state(pipe, screen->pm.prog); for (i = 0; i < 3; i++) { info.block[i] = block[i]; info.grid[i] = grid[i]; @@ -998,6 +1902,7 @@ nvc0_hw_sm_end_query(struct nvc0_context *nvc0, struct nvc0_hw_query *hq) info.pc = 0; info.input = input; pipe->launch_grid(pipe, &info); + pipe->bind_compute_state(pipe, old); nouveau_bufctx_reset(nvc0->bufctx_cp, NVC0_BIND_CP_QUERY); @@ -1018,9 +1923,9 @@ nvc0_hw_sm_end_query(struct nvc0_context *nvc0, struct nvc0_hw_query *hq) break; mask |= 1 << hsq->ctr[i]; if (is_nve4) { - BEGIN_NVC0(push, NVE4_COMPUTE(MP_PM_FUNC(hsq->ctr[i])), 1); + BEGIN_NVC0(push, NVE4_CP(MP_PM_FUNC(hsq->ctr[i])), 1); } else { - BEGIN_NVC0(push, NVC0_COMPUTE(MP_PM_OP(hsq->ctr[i])), 1); + BEGIN_NVC0(push, NVC0_CP(MP_PM_OP(hsq->ctr[i])), 1); } PUSH_DATA (push, (cfg->ctr[i].func << 4) | cfg->ctr[i].mode); } @@ -1132,8 +2037,7 @@ nvc0_hw_sm_create_query(struct nvc0_context *nvc0, unsigned type) if (nvc0->screen->base.drm->version < 0x01000101) return NULL; - if ((type < NVE4_HW_SM_QUERY(0) || type > NVE4_HW_SM_QUERY_LAST) && - (type < NVC0_HW_SM_QUERY(0) || type > NVC0_HW_SM_QUERY_LAST)) + if (type < NVC0_HW_SM_QUERY(0) || type > NVC0_HW_SM_QUERY_LAST) return NULL; hsq = CALLOC_STRUCT(nvc0_hw_sm_query); @@ -1201,23 +2105,6 @@ nvc0_hw_sm_create_query(struct nvc0_context *nvc0, unsigned type) return hq; } -static int -nvc0_hw_sm_get_next_query_id(const struct nvc0_hw_sm_query_cfg **queries, - unsigned id) -{ - unsigned i, next = 0; 
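Pulling the pieces together: the rewritten nvc0_hw_sm_end_query now (a) writes the readout kernel's three input words through the aux constant buffer instead of the user-parameter path, matching the c15[0x620..0x628] reads in the updated shader code, and (b) saves and restores the application's compute program around the internal launch. A condensed view, with names taken from this diff and error handling omitted:

/* Input words written by nvc0_hw_sm_upload_input, in upload order:
 *   word 0: (hq->bo->offset + hq->base_offset), low 32 bits
 *   word 1: same address, high 32 bits
 *   word 2: hq->sequence                                          */
struct nvc0_program *old = nvc0->compprog;

nvc0_hw_sm_upload_input(nvc0, hq);
pipe->bind_compute_state(pipe, screen->pm.prog);
pipe->launch_grid(pipe, &info);      /* grid = { mp_count, gpc_count, 1 } */
pipe->bind_compute_state(pipe, old); /* leave the user's compute state intact */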
- - for (i = 0; i < NVC0_HW_SM_QUERY_COUNT; i++) { - if (!queries[i]) { - next++; - } else - if (i >= id && queries[id + next]) { - break; - } - } - return id + next; -} - int nvc0_hw_sm_get_driver_query_info(struct nvc0_screen *screen, unsigned id, struct pipe_driver_query_info *info) @@ -1225,21 +2112,8 @@ nvc0_hw_sm_get_driver_query_info(struct nvc0_screen *screen, unsigned id, int count = 0; if (screen->base.drm->version >= 0x01000101) { - if (screen->compute) { - if (screen->base.class_3d == NVE4_3D_CLASS) { - count += NVE4_HW_SM_QUERY_COUNT; - } else - if (screen->base.class_3d < NVE4_3D_CLASS) { - const struct nvc0_hw_sm_query_cfg **queries = - nvc0_hw_sm_get_queries(screen); - unsigned i; - - for (i = 0; i < NVC0_HW_SM_QUERY_COUNT; i++) { - if (queries[i]) - count++; - } - } - } + if (screen->compute) + count = nvc0_hw_sm_get_num_queries(screen); } if (!info) @@ -1247,19 +2121,12 @@ nvc0_hw_sm_get_driver_query_info(struct nvc0_screen *screen, unsigned id, if (id < count) { if (screen->compute) { - if (screen->base.class_3d == NVE4_3D_CLASS) { - info->name = nve4_hw_sm_query_names[id]; - info->query_type = NVE4_HW_SM_QUERY(id); - info->group_id = NVC0_HW_SM_QUERY_GROUP; - return 1; - } else - if (screen->base.class_3d < NVE4_3D_CLASS) { + if (screen->base.class_3d <= NVF0_3D_CLASS) { const struct nvc0_hw_sm_query_cfg **queries = nvc0_hw_sm_get_queries(screen); - id = nvc0_hw_sm_get_next_query_id(queries, id); - info->name = nvc0_hw_sm_query_names[id]; - info->query_type = NVC0_HW_SM_QUERY(id); + info->name = nvc0_hw_sm_query_get_name(queries[id]->type); + info->query_type = NVC0_HW_SM_QUERY(queries[id]->type); info->group_id = NVC0_HW_SM_QUERY_GROUP; return 1; } diff --git a/lib/mesa/src/gallium/drivers/nouveau/nvc0/nvc0_query_hw_sm.h b/lib/mesa/src/gallium/drivers/nouveau/nvc0/nvc0_query_hw_sm.h index 94d55a04f..65d6c8b31 100644 --- a/lib/mesa/src/gallium/drivers/nouveau/nvc0/nvc0_query_hw_sm.h +++ b/lib/mesa/src/gallium/drivers/nouveau/nvc0/nvc0_query_hw_sm.h @@ -17,78 +17,45 @@ nvc0_hw_sm_query(struct nvc0_hw_query *hq) /* * Performance counter queries: */ -#define NVE4_HW_SM_QUERY(i) (PIPE_QUERY_DRIVER_SPECIFIC + (i)) -#define NVE4_HW_SM_QUERY_LAST NVE4_HW_SM_QUERY(NVE4_HW_SM_QUERY_COUNT - 1) -enum nve4_hw_sm_queries -{ - NVE4_HW_SM_QUERY_ACTIVE_CYCLES = 0, - NVE4_HW_SM_QUERY_ACTIVE_WARPS, - NVE4_HW_SM_QUERY_ATOM_CAS_COUNT, - NVE4_HW_SM_QUERY_ATOM_COUNT, - NVE4_HW_SM_QUERY_BRANCH, - NVE4_HW_SM_QUERY_DIVERGENT_BRANCH, - NVE4_HW_SM_QUERY_GLD_REQUEST, - NVE4_HW_SM_QUERY_GLD_MEM_DIV_REPLAY, - NVE4_HW_SM_QUERY_GST_TRANSACTIONS, - NVE4_HW_SM_QUERY_GST_MEM_DIV_REPLAY, - NVE4_HW_SM_QUERY_GRED_COUNT, - NVE4_HW_SM_QUERY_GST_REQUEST, - NVE4_HW_SM_QUERY_INST_EXECUTED, - NVE4_HW_SM_QUERY_INST_ISSUED1, - NVE4_HW_SM_QUERY_INST_ISSUED2, - NVE4_HW_SM_QUERY_L1_GLD_HIT, - NVE4_HW_SM_QUERY_L1_GLD_MISS, - NVE4_HW_SM_QUERY_L1_GLD_TRANSACTIONS, - NVE4_HW_SM_QUERY_L1_GST_TRANSACTIONS, - NVE4_HW_SM_QUERY_L1_LOCAL_LD_HIT, - NVE4_HW_SM_QUERY_L1_LOCAL_LD_MISS, - NVE4_HW_SM_QUERY_L1_LOCAL_ST_HIT, - NVE4_HW_SM_QUERY_L1_LOCAL_ST_MISS, - NVE4_HW_SM_QUERY_L1_SHARED_LD_TRANSACTIONS, - NVE4_HW_SM_QUERY_L1_SHARED_ST_TRANSACTIONS, - NVE4_HW_SM_QUERY_LOCAL_LD, - NVE4_HW_SM_QUERY_LOCAL_LD_TRANSACTIONS, - NVE4_HW_SM_QUERY_LOCAL_ST, - NVE4_HW_SM_QUERY_LOCAL_ST_TRANSACTIONS, - NVE4_HW_SM_QUERY_PROF_TRIGGER_0, - NVE4_HW_SM_QUERY_PROF_TRIGGER_1, - NVE4_HW_SM_QUERY_PROF_TRIGGER_2, - NVE4_HW_SM_QUERY_PROF_TRIGGER_3, - NVE4_HW_SM_QUERY_PROF_TRIGGER_4, - NVE4_HW_SM_QUERY_PROF_TRIGGER_5, - NVE4_HW_SM_QUERY_PROF_TRIGGER_6, - 
NVE4_HW_SM_QUERY_PROF_TRIGGER_7, - NVE4_HW_SM_QUERY_SHARED_LD, - NVE4_HW_SM_QUERY_SHARED_LD_REPLAY, - NVE4_HW_SM_QUERY_SHARED_ST, - NVE4_HW_SM_QUERY_SHARED_ST_REPLAY, - NVE4_HW_SM_QUERY_SM_CTA_LAUNCHED, - NVE4_HW_SM_QUERY_THREADS_LAUNCHED, - NVE4_HW_SM_QUERY_UNCACHED_GLD_TRANSACTIONS, - NVE4_HW_SM_QUERY_WARPS_LAUNCHED, - NVE4_HW_SM_QUERY_COUNT -}; - -#define NVC0_HW_SM_QUERY(i) (PIPE_QUERY_DRIVER_SPECIFIC + 2048 + (i)) +#define NVC0_HW_SM_QUERY(i) (PIPE_QUERY_DRIVER_SPECIFIC + (i)) #define NVC0_HW_SM_QUERY_LAST NVC0_HW_SM_QUERY(NVC0_HW_SM_QUERY_COUNT - 1) enum nvc0_hw_sm_queries { NVC0_HW_SM_QUERY_ACTIVE_CYCLES = 0, NVC0_HW_SM_QUERY_ACTIVE_WARPS, + NVC0_HW_SM_QUERY_ATOM_CAS_COUNT, NVC0_HW_SM_QUERY_ATOM_COUNT, NVC0_HW_SM_QUERY_BRANCH, NVC0_HW_SM_QUERY_DIVERGENT_BRANCH, NVC0_HW_SM_QUERY_GLD_REQUEST, + NVC0_HW_SM_QUERY_GLD_MEM_DIV_REPLAY, + NVC0_HW_SM_QUERY_GST_TRANSACTIONS, + NVC0_HW_SM_QUERY_GST_MEM_DIV_REPLAY, NVC0_HW_SM_QUERY_GRED_COUNT, NVC0_HW_SM_QUERY_GST_REQUEST, NVC0_HW_SM_QUERY_INST_EXECUTED, NVC0_HW_SM_QUERY_INST_ISSUED, + NVC0_HW_SM_QUERY_INST_ISSUED1, + NVC0_HW_SM_QUERY_INST_ISSUED2, NVC0_HW_SM_QUERY_INST_ISSUED1_0, NVC0_HW_SM_QUERY_INST_ISSUED1_1, NVC0_HW_SM_QUERY_INST_ISSUED2_0, NVC0_HW_SM_QUERY_INST_ISSUED2_1, + NVC0_HW_SM_QUERY_L1_GLD_HIT, + NVC0_HW_SM_QUERY_L1_GLD_MISS, + NVC0_HW_SM_QUERY_L1_GLD_TRANSACTIONS, + NVC0_HW_SM_QUERY_L1_GST_TRANSACTIONS, + NVC0_HW_SM_QUERY_L1_LOCAL_LD_HIT, + NVC0_HW_SM_QUERY_L1_LOCAL_LD_MISS, + NVC0_HW_SM_QUERY_L1_LOCAL_ST_HIT, + NVC0_HW_SM_QUERY_L1_LOCAL_ST_MISS, + NVC0_HW_SM_QUERY_L1_SHARED_LD_TRANSACTIONS, + NVC0_HW_SM_QUERY_L1_SHARED_ST_TRANSACTIONS, NVC0_HW_SM_QUERY_LOCAL_LD, + NVC0_HW_SM_QUERY_LOCAL_LD_TRANSACTIONS, NVC0_HW_SM_QUERY_LOCAL_ST, + NVC0_HW_SM_QUERY_LOCAL_ST_TRANSACTIONS, + NVC0_HW_SM_QUERY_NOT_PRED_OFF_INST_EXECUTED, NVC0_HW_SM_QUERY_PROF_TRIGGER_0, NVC0_HW_SM_QUERY_PROF_TRIGGER_1, NVC0_HW_SM_QUERY_PROF_TRIGGER_2, @@ -98,12 +65,17 @@ enum nvc0_hw_sm_queries NVC0_HW_SM_QUERY_PROF_TRIGGER_6, NVC0_HW_SM_QUERY_PROF_TRIGGER_7, NVC0_HW_SM_QUERY_SHARED_LD, + NVC0_HW_SM_QUERY_SHARED_LD_REPLAY, NVC0_HW_SM_QUERY_SHARED_ST, + NVC0_HW_SM_QUERY_SHARED_ST_REPLAY, + NVC0_HW_SM_QUERY_SM_CTA_LAUNCHED, NVC0_HW_SM_QUERY_THREADS_LAUNCHED, + NVC0_HW_SM_QUERY_TH_INST_EXECUTED, NVC0_HW_SM_QUERY_TH_INST_EXECUTED_0, NVC0_HW_SM_QUERY_TH_INST_EXECUTED_1, NVC0_HW_SM_QUERY_TH_INST_EXECUTED_2, NVC0_HW_SM_QUERY_TH_INST_EXECUTED_3, + NVC0_HW_SM_QUERY_UNCACHED_GLD_TRANSACTIONS, NVC0_HW_SM_QUERY_WARPS_LAUNCHED, NVC0_HW_SM_QUERY_COUNT }; @@ -113,4 +85,7 @@ nvc0_hw_sm_create_query(struct nvc0_context *, unsigned); int nvc0_hw_sm_get_driver_query_info(struct nvc0_screen *, unsigned, struct pipe_driver_query_info *); +unsigned +nvc0_hw_sm_get_num_queries(struct nvc0_screen *); + #endif diff --git a/lib/mesa/src/gallium/drivers/nouveau/nvc0/nvc0_resource.c b/lib/mesa/src/gallium/drivers/nouveau/nvc0/nvc0_resource.c index 7fbc6e1fd..9bafe3d83 100644 --- a/lib/mesa/src/gallium/drivers/nouveau/nvc0/nvc0_resource.c +++ b/lib/mesa/src/gallium/drivers/nouveau/nvc0/nvc0_resource.c @@ -19,7 +19,8 @@ nvc0_resource_create(struct pipe_screen *screen, static struct pipe_resource * nvc0_resource_from_handle(struct pipe_screen * screen, const struct pipe_resource *templ, - struct winsys_handle *whandle) + struct winsys_handle *whandle, + unsigned usage) { if (templ->target == PIPE_BUFFER) { return NULL; @@ -48,9 +49,11 @@ nvc0_init_resource_functions(struct pipe_context *pcontext) pcontext->transfer_map = u_transfer_map_vtbl; pcontext->transfer_flush_region = 
u_transfer_flush_region_vtbl; pcontext->transfer_unmap = u_transfer_unmap_vtbl; - pcontext->transfer_inline_write = u_transfer_inline_write_vtbl; + pcontext->buffer_subdata = u_default_buffer_subdata; + pcontext->texture_subdata = u_default_texture_subdata; pcontext->create_surface = nvc0_surface_create; pcontext->surface_destroy = nv50_surface_destroy; + pcontext->invalidate_resource = nv50_invalidate_resource; } void diff --git a/lib/mesa/src/gallium/drivers/nouveau/nvc0/nvc0_screen.c b/lib/mesa/src/gallium/drivers/nouveau/nvc0/nvc0_screen.c index b919438e7..2cac3c79e 100644 --- a/lib/mesa/src/gallium/drivers/nouveau/nvc0/nvc0_screen.c +++ b/lib/mesa/src/gallium/drivers/nouveau/nvc0/nvc0_screen.c @@ -45,11 +45,19 @@ nvc0_screen_is_format_supported(struct pipe_screen *pscreen, unsigned sample_count, unsigned bindings) { + const struct util_format_description *desc = util_format_description(format); + if (sample_count > 8) return false; if (!(0x117 & (1 << sample_count))) /* 0, 1, 2, 4 or 8 */ return false; + /* Short-circuit the rest of the logic -- this is used by the state tracker + * to determine valid MS levels in a no-attachments scenario. + */ + if (format == PIPE_FORMAT_NONE && bindings & PIPE_BIND_RENDER_TARGET) + return true; + if (!util_format_is_supported(format, bindings)) return false; @@ -57,11 +65,36 @@ nvc0_screen_is_format_supported(struct pipe_screen *pscreen, if (util_format_get_blocksizebits(format) == 3 * 32) return false; - /* transfers & shared are always supported */ - bindings &= ~(PIPE_BIND_TRANSFER_READ | - PIPE_BIND_TRANSFER_WRITE | + if (bindings & PIPE_BIND_LINEAR) + if (util_format_is_depth_or_stencil(format) || + (target != PIPE_TEXTURE_1D && + target != PIPE_TEXTURE_2D && + target != PIPE_TEXTURE_RECT) || + sample_count > 1) + return false; + + /* Restrict ETC2 and ASTC formats here. These are only supported on GK20A. + */ + if ((desc->layout == UTIL_FORMAT_LAYOUT_ETC || + desc->layout == UTIL_FORMAT_LAYOUT_ASTC) && + /* The claim is that this should work on GM107 but it doesn't. Need to + * test further and figure out if it's a nouveau issue or a HW one. + nouveau_screen(pscreen)->class_3d < GM107_3D_CLASS && + */ + nouveau_screen(pscreen)->class_3d != NVEA_3D_CLASS) + return false; + + /* shared is always supported */ + bindings &= ~(PIPE_BIND_LINEAR | PIPE_BIND_SHARED); + if (bindings & PIPE_BIND_SHADER_IMAGE && sample_count > 1 && + nouveau_screen(pscreen)->class_3d >= GM107_3D_CLASS) { + /* MS images are currently unsupported on Maxwell because they have to + * be handled explicitly. 
*/ + return false; + } + return (( nvc0_format_table[format].usage | nvc0_vertex_format[format].usage) & bindings) == bindings; } @@ -92,6 +125,8 @@ nvc0_screen_get_param(struct pipe_screen *pscreen, enum pipe_cap param) case PIPE_CAP_MAX_TEXTURE_BUFFER_SIZE: return 128 * 1024 * 1024; case PIPE_CAP_GLSL_FEATURE_LEVEL: + if (class_3d <= NVF0_3D_CLASS) + return 430; return 410; case PIPE_CAP_MAX_RENDER_TARGETS: return 8; @@ -127,6 +162,8 @@ nvc0_screen_get_param(struct pipe_screen *pscreen, enum pipe_cap param) return PIPE_ENDIAN_LITTLE; case PIPE_CAP_MAX_SHADER_PATCH_VARYINGS: return 30; + case PIPE_CAP_MAX_WINDOW_RECTANGLES: + return NVC0_MAX_WINDOW_RECTANGLES; /* supported caps */ case PIPE_CAP_TEXTURE_MIRROR_CLAMP: @@ -134,6 +171,7 @@ nvc0_screen_get_param(struct pipe_screen *pscreen, enum pipe_cap param) case PIPE_CAP_TEXTURE_SHADOW_MAP: case PIPE_CAP_NPOT_TEXTURES: case PIPE_CAP_MIXED_FRAMEBUFFER_SIZES: + case PIPE_CAP_MIXED_COLOR_DEPTH_BITS: case PIPE_CAP_ANISOTROPIC_FILTER: case PIPE_CAP_SEAMLESS_CUBE_MAP: case PIPE_CAP_CUBE_MAP_ARRAY: @@ -194,13 +232,20 @@ nvc0_screen_get_param(struct pipe_screen *pscreen, enum pipe_cap param) case PIPE_CAP_MULTI_DRAW_INDIRECT_PARAMS: case PIPE_CAP_TGSI_FS_FACE_IS_INTEGER_SYSVAL: case PIPE_CAP_QUERY_BUFFER_OBJECT: + case PIPE_CAP_INVALIDATE_BUFFER: + case PIPE_CAP_STRING_MARKER: + case PIPE_CAP_FRAMEBUFFER_NO_ATTACHMENT: + case PIPE_CAP_CULL_DISTANCE: + case PIPE_CAP_PRIMITIVE_RESTART_FOR_PATCHES: + case PIPE_CAP_ROBUST_BUFFER_ACCESS_BEHAVIOR: + case PIPE_CAP_TGSI_VOTE: + case PIPE_CAP_POLYGON_OFFSET_UNITS_UNSCALED: + case PIPE_CAP_TGSI_ARRAY_COMPONENTS: return 1; + case PIPE_CAP_COMPUTE: + return (class_3d < GP100_3D_CLASS); case PIPE_CAP_SEAMLESS_CUBE_MAP_PER_TEXTURE: return (class_3d >= NVE4_3D_CLASS) ? 1 : 0; - case PIPE_CAP_COMPUTE: - if (debug_get_bool_option("NVF0_COMPUTE", false)) - return 1; - return (class_3d <= NVE4_3D_CLASS) ? 1 : 0; case PIPE_CAP_PREFER_BLIT_BASED_TEXTURE_TRANSFER: return nouveau_screen(pscreen)->vram_domain & NOUVEAU_BO_VRAM ? 
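One decoding note on the sample-count test near the top of nvc0_screen_is_format_supported: the mask 0x117 is just the set {0, 1, 2, 4, 8} written as a bitmask (bits 0-2 are 0x7, bit 4 is 0x10, bit 8 is 0x100). A self-checking sketch:

#include <assert.h>

/* Bit n of 0x117 is set iff n samples are a valid MSAA count. */
static int
sample_count_ok(unsigned n)
{
   return n <= 8 && (0x117 & (1u << n));
}

int main(void)
{
   assert(sample_count_ok(0) && sample_count_ok(1) && sample_count_ok(2));
   assert(sample_count_ok(4) && sample_count_ok(8));
   assert(!sample_count_ok(3) && !sample_count_ok(6));
   return 0;
}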
1 : 0; @@ -219,12 +264,15 @@ nvc0_screen_get_param(struct pipe_screen *pscreen, enum pipe_cap param) case PIPE_CAP_RESOURCE_FROM_USER_MEMORY: case PIPE_CAP_DEVICE_RESET_STATUS_QUERY: case PIPE_CAP_TGSI_FS_POSITION_IS_SYSVAL: - case PIPE_CAP_INVALIDATE_BUFFER: case PIPE_CAP_GENERATE_MIPMAP: - case PIPE_CAP_STRING_MARKER: case PIPE_CAP_BUFFER_SAMPLER_VIEW_RGBA_ONLY: case PIPE_CAP_SURFACE_REINTERPRET_BLOCKS: case PIPE_CAP_QUERY_MEMORY_INFO: + case PIPE_CAP_PCI_GROUP: + case PIPE_CAP_PCI_BUS: + case PIPE_CAP_PCI_DEVICE: + case PIPE_CAP_PCI_FUNCTION: + case PIPE_CAP_VIEWPORT_SUBPIXEL_BITS: return 0; case PIPE_CAP_VENDOR_ID: @@ -259,16 +307,9 @@ nvc0_screen_get_shader_param(struct pipe_screen *pscreen, unsigned shader, case PIPE_SHADER_VERTEX: case PIPE_SHADER_GEOMETRY: case PIPE_SHADER_FRAGMENT: - break; + case PIPE_SHADER_COMPUTE: case PIPE_SHADER_TESS_CTRL: case PIPE_SHADER_TESS_EVAL: - if (class_3d >= GM107_3D_CLASS) - return 0; - break; - case PIPE_SHADER_COMPUTE: - if (!debug_get_bool_option("NVF0_COMPUTE", false)) - if (class_3d > NVE4_3D_CLASS) - return 0; break; default: return 0; @@ -278,8 +319,6 @@ nvc0_screen_get_shader_param(struct pipe_screen *pscreen, unsigned shader, case PIPE_SHADER_CAP_PREFERRED_IR: return PIPE_SHADER_IR_TGSI; case PIPE_SHADER_CAP_SUPPORTED_IRS: - if (class_3d >= NVE4_3D_CLASS) - return 0; return 1 << PIPE_SHADER_IR_TGSI; case PIPE_SHADER_CAP_MAX_INSTRUCTIONS: case PIPE_SHADER_CAP_MAX_ALU_INSTRUCTIONS: @@ -307,13 +346,10 @@ nvc0_screen_get_shader_param(struct pipe_screen *pscreen, unsigned shader, case PIPE_SHADER_CAP_MAX_CONST_BUFFER_SIZE: return 65536; case PIPE_SHADER_CAP_MAX_CONST_BUFFERS: - if (shader == PIPE_SHADER_COMPUTE && class_3d >= NVE4_3D_CLASS) - return NVE4_MAX_PIPE_CONSTBUFS_COMPUTE; return NVC0_MAX_PIPE_CONSTBUFS; case PIPE_SHADER_CAP_INDIRECT_OUTPUT_ADDR: return shader != PIPE_SHADER_FRAGMENT; case PIPE_SHADER_CAP_INDIRECT_INPUT_ADDR: - return shader != PIPE_SHADER_FRAGMENT || class_3d < GM107_3D_CLASS; case PIPE_SHADER_CAP_INDIRECT_TEMP_ADDR: case PIPE_SHADER_CAP_INDIRECT_CONST_ADDR: return 1; @@ -324,7 +360,7 @@ nvc0_screen_get_shader_param(struct pipe_screen *pscreen, unsigned shader, case PIPE_SHADER_CAP_TGSI_CONT_SUPPORTED: return 1; case PIPE_SHADER_CAP_TGSI_SQRT_SUPPORTED: - return 0; + return 1; case PIPE_SHADER_CAP_SUBROUTINES: return 1; case PIPE_SHADER_CAP_INTEGERS: @@ -333,19 +369,25 @@ nvc0_screen_get_shader_param(struct pipe_screen *pscreen, unsigned shader, return 1; case PIPE_SHADER_CAP_TGSI_DROUND_SUPPORTED: return 1; - case PIPE_SHADER_CAP_TGSI_DFRACEXP_DLDEXP_SUPPORTED: case PIPE_SHADER_CAP_TGSI_FMA_SUPPORTED: + return 1; + case PIPE_SHADER_CAP_TGSI_DFRACEXP_DLDEXP_SUPPORTED: case PIPE_SHADER_CAP_TGSI_ANY_INOUT_DECL_RANGE: return 0; case PIPE_SHADER_CAP_MAX_SHADER_BUFFERS: return NVC0_MAX_BUFFERS; case PIPE_SHADER_CAP_MAX_TEXTURE_SAMPLERS: - return 16; /* would be 32 in linked (OpenGL-style) mode */ + return (class_3d >= NVE4_3D_CLASS) ? 32 : 16; case PIPE_SHADER_CAP_MAX_SAMPLER_VIEWS: - return 16; /* XXX not sure if more are really safe */ + return (class_3d >= NVE4_3D_CLASS) ? 
32 : 16; case PIPE_SHADER_CAP_MAX_UNROLL_ITERATIONS_HINT: return 32; case PIPE_SHADER_CAP_MAX_SHADER_IMAGES: + if (class_3d == NVE4_3D_CLASS || class_3d == NVF0_3D_CLASS) + return NVC0_MAX_IMAGES; + if (class_3d < NVE4_3D_CLASS) + if (shader == PIPE_SHADER_FRAGMENT || shader == PIPE_SHADER_COMPUTE) + return NVC0_MAX_IMAGES; return 0; default: NOUVEAU_ERR("unknown PIPE_SHADER_CAP %d\n", param); @@ -382,6 +424,7 @@ nvc0_screen_get_paramf(struct pipe_screen *pscreen, enum pipe_capf param) static int nvc0_screen_get_compute_param(struct pipe_screen *pscreen, + enum pipe_shader_ir ir_type, enum pipe_compute_cap param, void *data) { struct nvc0_screen *screen = nvc0_screen(pscreen); @@ -406,10 +449,26 @@ nvc0_screen_get_compute_param(struct pipe_screen *pscreen, RET(((uint64_t []) { 1024, 1024, 64 })); case PIPE_COMPUTE_CAP_MAX_THREADS_PER_BLOCK: RET((uint64_t []) { 1024 }); + case PIPE_COMPUTE_CAP_MAX_VARIABLE_THREADS_PER_BLOCK: + if (obj_class >= NVE4_COMPUTE_CLASS) { + RET((uint64_t []) { 1024 }); + } else { + RET((uint64_t []) { 512 }); + } case PIPE_COMPUTE_CAP_MAX_GLOBAL_SIZE: /* g[] */ RET((uint64_t []) { 1ULL << 40 }); case PIPE_COMPUTE_CAP_MAX_LOCAL_SIZE: /* s[] */ - RET((uint64_t []) { 48 << 10 }); + switch (obj_class) { + case GM200_COMPUTE_CLASS: + RET((uint64_t []) { 96 << 10 }); + break; + case GM107_COMPUTE_CLASS: + RET((uint64_t []) { 64 << 10 }); + break; + default: + RET((uint64_t []) { 48 << 10 }); + break; + } case PIPE_COMPUTE_CAP_MAX_PRIVATE_SIZE: /* l[] */ RET((uint64_t []) { 512 << 10 }); case PIPE_COMPUTE_CAP_MAX_INPUT_SIZE: /* c[], arbitrary limit */ @@ -424,6 +483,8 @@ nvc0_screen_get_compute_param(struct pipe_screen *pscreen, RET((uint32_t []) { screen->mp_count_compute }); case PIPE_COMPUTE_CAP_MAX_CLOCK_FREQUENCY: RET((uint32_t []) { 512 }); /* FIXME: arbitrary limit */ + case PIPE_COMPUTE_CAP_ADDRESS_BITS: + RET((uint32_t []) { 64 }); default: return 0; } @@ -467,7 +528,6 @@ nvc0_screen_destroy(struct pipe_screen *pscreen) nouveau_bo_ref(NULL, &screen->txc); nouveau_bo_ref(NULL, &screen->fence.bo); nouveau_bo_ref(NULL, &screen->poly_cache); - nouveau_bo_ref(NULL, &screen->parm); nouveau_heap_destroy(&screen->lib_code); nouveau_heap_destroy(&screen->text_heap); @@ -605,20 +665,19 @@ nvc0_screen_init_compute(struct nvc0_screen *screen) case 0xd0: return nvc0_screen_compute_setup(screen, screen->base.pushbuf); case 0xe0: - return nve4_screen_compute_setup(screen, screen->base.pushbuf); case 0xf0: case 0x100: case 0x110: - if (debug_get_bool_option("NVF0_COMPUTE", false)) - return nve4_screen_compute_setup(screen, screen->base.pushbuf); case 0x120: + return nve4_screen_compute_setup(screen, screen->base.pushbuf); + case 0x130: return 0; default: return -1; } } -bool +static int nvc0_screen_resize_tls_area(struct nvc0_screen *screen, uint32_t lpos, uint32_t lneg, uint32_t cstack) { @@ -628,7 +687,7 @@ nvc0_screen_resize_tls_area(struct nvc0_screen *screen, if (size >= (1 << 20)) { NOUVEAU_ERR("requested TLS size too large: 0x%"PRIx64"\n", size); - return false; + return -1; } size *= (screen->base.device->chipset >= 0xe0) ? 
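The new PIPE_COMPUTE_CAP_MAX_LOCAL_SIZE switch above maps shared-memory ("s[]") capacity to the compute object class. The same mapping in table form; the numeric class values here are illustrative fallbacks, the real macros come from the nouveau class headers:

#include <stdint.h>

#ifndef GM107_COMPUTE_CLASS          /* normally from the class headers */
#define GM107_COMPUTE_CLASS 0xb0c0   /* illustrative values only */
#define GM200_COMPUTE_CLASS 0xb1c0
#endif

/* Values as returned by nvc0_screen_get_compute_param in this diff. */
static uint64_t
max_shared_mem_size(uint32_t obj_class)
{
   switch (obj_class) {
   case GM200_COMPUTE_CLASS: return 96 << 10; /* 96 KiB on GM200 */
   case GM107_COMPUTE_CLASS: return 64 << 10; /* 64 KiB on GM107 */
   default:                  return 48 << 10; /* 48 KiB on Fermi/Kepler */
   }
}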
64 : 48; /* max warps */ @@ -639,13 +698,47 @@ nvc0_screen_resize_tls_area(struct nvc0_screen *screen, ret = nouveau_bo_new(screen->base.device, NV_VRAM_DOMAIN(&screen->base), 1 << 17, size, NULL, &bo); - if (ret) { - NOUVEAU_ERR("failed to allocate TLS area, size: 0x%"PRIx64"\n", size); - return false; - } + if (ret) + return ret; nouveau_bo_ref(NULL, &screen->tls); screen->tls = bo; - return true; + return 0; +} + +int +nvc0_screen_resize_text_area(struct nvc0_screen *screen, uint64_t size) +{ + struct nouveau_pushbuf *push = screen->base.pushbuf; + struct nouveau_bo *bo; + int ret; + + ret = nouveau_bo_new(screen->base.device, NV_VRAM_DOMAIN(&screen->base), + 1 << 17, size, NULL, &bo); + if (ret) + return ret; + + nouveau_bo_ref(NULL, &screen->text); + screen->text = bo; + + nouveau_heap_destroy(&screen->lib_code); + nouveau_heap_destroy(&screen->text_heap); + + /* XXX: getting a page fault at the end of the code buffer every few + * launches, don't use the last 256 bytes to work around them - prefetch ? + */ + nouveau_heap_init(&screen->text_heap, 0, size - 0x100); + + /* update the code segment setup */ + BEGIN_NVC0(push, NVC0_3D(CODE_ADDRESS_HIGH), 2); + PUSH_DATAh(push, screen->text->offset); + PUSH_DATA (push, screen->text->offset); + if (screen->compute) { + BEGIN_NVC0(push, NVC0_CP(CODE_ADDRESS_HIGH), 2); + PUSH_DATAh(push, screen->text->offset); + PUSH_DATA (push, screen->text->offset); + } + + return 0; } #define FAIL_SCREEN_INIT(str, err) \ @@ -675,6 +768,7 @@ nvc0_screen_create(struct nouveau_device *dev) case 0x100: case 0x110: case 0x120: + case 0x130: break; default: return NULL; @@ -687,10 +781,8 @@ nvc0_screen_create(struct nouveau_device *dev) pscreen->destroy = nvc0_screen_destroy; ret = nouveau_screen_init(&screen->base, dev); - if (ret) { - nvc0_screen_destroy(pscreen); - return NULL; - } + if (ret) + FAIL_SCREEN_INIT("Base screen init failed: %d\n", ret); chan = screen->base.channel; push = screen->base.pushbuf; push->user_priv = screen; @@ -727,7 +819,7 @@ nvc0_screen_create(struct nouveau_device *dev) ret = nouveau_bo_new(dev, flags, 0, 4096, NULL, &screen->fence.bo); if (ret) - goto fail; + FAIL_SCREEN_INIT("Error allocating fence BO: %d\n", ret); nouveau_bo_map(screen->fence.bo, 0, NULL); screen->fence.map = screen->fence.bo->map; screen->base.fence.emit = nvc0_screen_fence_emit; @@ -743,6 +835,7 @@ nvc0_screen_create(struct nouveau_device *dev) PUSH_DATA (push, screen->nvsw->handle); switch (dev->chipset & ~0xf) { + case 0x130: case 0x120: case 0x110: case 0x100: @@ -795,6 +888,9 @@ nvc0_screen_create(struct nouveau_device *dev) PUSH_DATA (push, screen->fence.bo->offset + 16); switch (dev->chipset & ~0xf) { + case 0x130: + obj_class = GP100_3D_CLASS; + break; case 0x120: obj_class = GM200_3D_CLASS; break; @@ -855,7 +951,7 @@ nvc0_screen_create(struct nouveau_device *dev) screen->base.drm->version >= 0x01000101); BEGIN_NVC0(push, NVC0_3D(RT_COMP_ENABLE(0)), 8); for (i = 0; i < 8; ++i) - PUSH_DATA(push, screen->base.drm->version >= 0x01000101); + PUSH_DATA(push, screen->base.drm->version >= 0x01000101); BEGIN_NVC0(push, NVC0_3D(RT_CONTROL), 1); PUSH_DATA (push, 1); @@ -895,51 +991,22 @@ nvc0_screen_create(struct nouveau_device *dev) nvc0_magic_3d_init(push, screen->eng3d->oclass); - ret = nouveau_bo_new(dev, NV_VRAM_DOMAIN(&screen->base), 1 << 17, 1 << 20, NULL, - &screen->text); + ret = nvc0_screen_resize_text_area(screen, 1 << 19); if (ret) - goto fail; - - /* XXX: getting a page fault at the end of the code buffer every few - * launches, don't use the last 256 
bytes to work around them - prefetch ? - */ - nouveau_heap_init(&screen->text_heap, 0, (1 << 20) - 0x100); + FAIL_SCREEN_INIT("Error allocating TEXT area: %d\n", ret); ret = nouveau_bo_new(dev, NV_VRAM_DOMAIN(&screen->base), 1 << 12, 7 << 16, NULL, &screen->uniform_bo); if (ret) - goto fail; + FAIL_SCREEN_INIT("Error allocating uniform BO: %d\n", ret); PUSH_REFN (push, screen->uniform_bo, NV_VRAM_DOMAIN(&screen->base) | NOUVEAU_BO_WR); - for (i = 0; i < 5; ++i) { - /* TIC and TSC entries for each unit (nve4+ only) */ - /* auxiliary constants (6 user clip planes, base instance id) */ - BEGIN_NVC0(push, NVC0_3D(CB_SIZE), 3); - PUSH_DATA (push, 1024); - PUSH_DATAh(push, screen->uniform_bo->offset + (6 << 16) + (i << 10)); - PUSH_DATA (push, screen->uniform_bo->offset + (6 << 16) + (i << 10)); - BEGIN_NVC0(push, NVC0_3D(CB_BIND(i)), 1); - PUSH_DATA (push, (15 << 4) | 1); - if (screen->eng3d->oclass >= NVE4_3D_CLASS) { - unsigned j; - BEGIN_1IC0(push, NVC0_3D(CB_POS), 9); - PUSH_DATA (push, 0); - for (j = 0; j < 8; ++j) - PUSH_DATA(push, j); - } else { - BEGIN_NVC0(push, NVC0_3D(TEX_LIMITS(i)), 1); - PUSH_DATA (push, 0x54); - } - } - BEGIN_NVC0(push, NVC0_3D(LINKED_TSC), 1); - PUSH_DATA (push, 0); - /* return { 0.0, 0.0, 0.0, 0.0 } for out-of-bounds vtxbuf access */ BEGIN_NVC0(push, NVC0_3D(CB_SIZE), 3); PUSH_DATA (push, 256); - PUSH_DATAh(push, screen->uniform_bo->offset + (6 << 16) + (6 << 10)); - PUSH_DATA (push, screen->uniform_bo->offset + (6 << 16) + (6 << 10)); + PUSH_DATAh(push, screen->uniform_bo->offset + NVC0_CB_AUX_RUNOUT_INFO); + PUSH_DATA (push, screen->uniform_bo->offset + NVC0_CB_AUX_RUNOUT_INFO); BEGIN_1IC0(push, NVC0_3D(CB_POS), 5); PUSH_DATA (push, 0); PUSH_DATAf(push, 0.0f); @@ -947,15 +1014,13 @@ nvc0_screen_create(struct nouveau_device *dev) PUSH_DATAf(push, 0.0f); PUSH_DATAf(push, 0.0f); BEGIN_NVC0(push, NVC0_3D(VERTEX_RUNOUT_ADDRESS_HIGH), 2); - PUSH_DATAh(push, screen->uniform_bo->offset + (6 << 16) + (6 << 10)); - PUSH_DATA (push, screen->uniform_bo->offset + (6 << 16) + (6 << 10)); + PUSH_DATAh(push, screen->uniform_bo->offset + NVC0_CB_AUX_RUNOUT_INFO); + PUSH_DATA (push, screen->uniform_bo->offset + NVC0_CB_AUX_RUNOUT_INFO); if (screen->base.drm->version >= 0x01000101) { ret = nouveau_getparam(dev, NOUVEAU_GETPARAM_GRAPH_UNITS, &value); - if (ret) { - NOUVEAU_ERR("NOUVEAU_GETPARAM_GRAPH_UNITS failed.\n"); - goto fail; - } + if (ret) + FAIL_SCREEN_INIT("NOUVEAU_GETPARAM_GRAPH_UNITS failed: %d\n", ret); } else { if (dev->chipset >= 0xe0 && dev->chipset < 0xf0) value = (8 << 8) | 4; @@ -966,11 +1031,10 @@ nvc0_screen_create(struct nouveau_device *dev) screen->mp_count = value >> 8; screen->mp_count_compute = screen->mp_count; - nvc0_screen_resize_tls_area(screen, 128 * 16, 0, 0x200); + ret = nvc0_screen_resize_tls_area(screen, 128 * 16, 0, 0x200); + if (ret) + FAIL_SCREEN_INIT("Error allocating TLS area: %d\n", ret); - BEGIN_NVC0(push, NVC0_3D(CODE_ADDRESS_HIGH), 2); - PUSH_DATAh(push, screen->text->offset); - PUSH_DATA (push, screen->text->offset); BEGIN_NVC0(push, NVC0_3D(TEMP_ADDRESS_HIGH), 4); PUSH_DATAh(push, screen->tls->offset); PUSH_DATA (push, screen->tls->offset); @@ -989,7 +1053,7 @@ nvc0_screen_create(struct nouveau_device *dev) ret = nouveau_bo_new(dev, NV_VRAM_DOMAIN(&screen->base), 1 << 17, 1 << 20, NULL, &screen->poly_cache); if (ret) - goto fail; + FAIL_SCREEN_INIT("Error allocating poly cache BO: %d\n", ret); BEGIN_NVC0(push, NVC0_3D(VERTEX_QUARANTINE_ADDRESS_HIGH), 3); PUSH_DATAh(push, screen->poly_cache->offset); @@ -1000,7 +1064,7 @@ 
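nvc0_screen_resize_text_area above reserves the last 0x100 bytes of the code heap (the page-fault workaround carried over from the old fixed 1 MiB buffer) and re-points both the 3D and compute CODE_ADDRESS at the new BO, which is what makes the code segment growable. The grow-and-retry caller is not part of this hunk; a sketch of how the helper would plausibly be used, assuming nouveau_heap_alloc returns non-zero on failure:

/* Hypothetical caller: retry a code-heap allocation after doubling the
 * text area (this diff only shows the initial 1 << 19 allocation). */
if (nouveau_heap_alloc(screen->text_heap, size, prog, &prog->mem)) {
   if (nvc0_screen_resize_text_area(screen, screen->text->size * 2))
      return false;
   if (nouveau_heap_alloc(screen->text_heap, size, prog, &prog->mem))
      return false;
}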
nvc0_screen_create(struct nouveau_device *dev) ret = nouveau_bo_new(dev, NV_VRAM_DOMAIN(&screen->base), 1 << 17, 1 << 17, NULL, &screen->txc); if (ret) - goto fail; + FAIL_SCREEN_INIT("Error allocating txc BO: %d\n", ret); BEGIN_NVC0(push, NVC0_3D(TIC_ADDRESS_HIGH), 3); PUSH_DATAh(push, screen->txc->offset); @@ -1077,7 +1141,7 @@ nvc0_screen_create(struct nouveau_device *dev) MK_MACRO(NVC0_3D_MACRO_DRAW_ARRAYS_INDIRECT_COUNT, mme9097_draw_arrays_indirect_count); MK_MACRO(NVC0_3D_MACRO_DRAW_ELEMENTS_INDIRECT_COUNT, mme9097_draw_elts_indirect_count); MK_MACRO(NVC0_3D_MACRO_QUERY_BUFFER_WRITE, mme9097_query_buffer_write); - MK_MACRO(NVC0_COMPUTE_MACRO_LAUNCH_GRID_INDIRECT, mme90c0_launch_grid_indirect); + MK_MACRO(NVC0_CP_MACRO_LAUNCH_GRID_INDIRECT, mme90c0_launch_grid_indirect); BEGIN_NVC0(push, NVC0_3D(RASTERIZE_ENABLE), 1); PUSH_DATA (push, 1); @@ -1107,6 +1171,50 @@ nvc0_screen_create(struct nouveau_device *dev) if (nvc0_screen_init_compute(screen)) goto fail; + /* XXX: Compute and 3D are somehow aliased on Fermi. */ + for (i = 0; i < 5; ++i) { + /* TIC and TSC entries for each unit (nve4+ only) */ + /* auxiliary constants (6 user clip planes, base instance id) */ + BEGIN_NVC0(push, NVC0_3D(CB_SIZE), 3); + PUSH_DATA (push, NVC0_CB_AUX_SIZE); + PUSH_DATAh(push, screen->uniform_bo->offset + NVC0_CB_AUX_INFO(i)); + PUSH_DATA (push, screen->uniform_bo->offset + NVC0_CB_AUX_INFO(i)); + BEGIN_NVC0(push, NVC0_3D(CB_BIND(i)), 1); + PUSH_DATA (push, (15 << 4) | 1); + if (screen->eng3d->oclass >= NVE4_3D_CLASS) { + unsigned j; + BEGIN_1IC0(push, NVC0_3D(CB_POS), 9); + PUSH_DATA (push, NVC0_CB_AUX_UNK_INFO); + for (j = 0; j < 8; ++j) + PUSH_DATA(push, j); + } else { + BEGIN_NVC0(push, NVC0_3D(TEX_LIMITS(i)), 1); + PUSH_DATA (push, 0x54); + } + + /* MS sample coordinate offsets: these do not work with _ALT modes ! */ + BEGIN_1IC0(push, NVC0_3D(CB_POS), 1 + 2 * 8); + PUSH_DATA (push, NVC0_CB_AUX_MS_INFO); + PUSH_DATA (push, 0); /* 0 */ + PUSH_DATA (push, 0); + PUSH_DATA (push, 1); /* 1 */ + PUSH_DATA (push, 0); + PUSH_DATA (push, 0); /* 2 */ + PUSH_DATA (push, 1); + PUSH_DATA (push, 1); /* 3 */ + PUSH_DATA (push, 1); + PUSH_DATA (push, 2); /* 4 */ + PUSH_DATA (push, 0); + PUSH_DATA (push, 3); /* 5 */ + PUSH_DATA (push, 0); + PUSH_DATA (push, 2); /* 6 */ + PUSH_DATA (push, 1); + PUSH_DATA (push, 3); /* 7 */ + PUSH_DATA (push, 1); + } + BEGIN_NVC0(push, NVC0_3D(LINKED_TSC), 1); + PUSH_DATA (push, 0); + PUSH_KICK (push); screen->tic.entries = CALLOC(4096, sizeof(void *)); diff --git a/lib/mesa/src/gallium/drivers/nouveau/nvc0/nvc0_screen.h b/lib/mesa/src/gallium/drivers/nouveau/nvc0/nvc0_screen.h index 8487abcf9..aff0308e8 100644 --- a/lib/mesa/src/gallium/drivers/nouveau/nvc0/nvc0_screen.h +++ b/lib/mesa/src/gallium/drivers/nouveau/nvc0/nvc0_screen.h @@ -16,7 +16,6 @@ /* doesn't count reserved slots (for auxiliary constants, immediates, etc.) 
*/ #define NVC0_MAX_PIPE_CONSTBUFS 14 -#define NVE4_MAX_PIPE_CONSTBUFS_COMPUTE 7 #define NVC0_MAX_SURFACE_SLOTS 16 @@ -24,6 +23,9 @@ #define NVC0_MAX_BUFFERS 32 +#define NVC0_MAX_IMAGES 8 + +#define NVC0_MAX_WINDOW_RECTANGLES 8 struct nvc0_context; @@ -65,8 +67,7 @@ struct nvc0_screen { int num_occlusion_queries_active; struct nouveau_bo *text; - struct nouveau_bo *parm; /* for COMPUTE */ - struct nouveau_bo *uniform_bo; /* for 3D */ + struct nouveau_bo *uniform_bo; struct nouveau_bo *tls; struct nouveau_bo *txc; /* TIC (offset 0) and TSC (65536) */ struct nouveau_bo *poly_cache; @@ -135,8 +136,7 @@ int nvc0_screen_tsc_alloc(struct nvc0_screen *, void *); int nve4_screen_compute_setup(struct nvc0_screen *, struct nouveau_pushbuf *); int nvc0_screen_compute_setup(struct nvc0_screen *, struct nouveau_pushbuf *); -bool nvc0_screen_resize_tls_area(struct nvc0_screen *, uint32_t lpos, - uint32_t lneg, uint32_t cstack); +int nvc0_screen_resize_text_area(struct nvc0_screen *, uint64_t); static inline void nvc0_resource_fence(struct nv04_resource *res, uint32_t flags) diff --git a/lib/mesa/src/gallium/drivers/nouveau/nvc0/nvc0_shader_state.c b/lib/mesa/src/gallium/drivers/nouveau/nvc0/nvc0_shader_state.c index c0cd56969..c644fe992 100644 --- a/lib/mesa/src/gallium/drivers/nouveau/nvc0/nvc0_shader_state.c +++ b/lib/mesa/src/gallium/drivers/nouveau/nvc0/nvc0_shader_state.c @@ -28,40 +28,22 @@ #include "nvc0/nvc0_context.h" #include "nvc0/nvc0_query_hw.h" +#include "nvc0/nvc0_compute.xml.h" + static inline void nvc0_program_update_context_state(struct nvc0_context *nvc0, struct nvc0_program *prog, int stage) { - struct nouveau_pushbuf *push = nvc0->base.pushbuf; - if (prog && prog->need_tls) { const uint32_t flags = NV_VRAM_DOMAIN(&nvc0->screen->base) | NOUVEAU_BO_RDWR; if (!nvc0->state.tls_required) - BCTX_REFN_bo(nvc0->bufctx_3d, TLS, flags, nvc0->screen->tls); + BCTX_REFN_bo(nvc0->bufctx_3d, 3D_TLS, flags, nvc0->screen->tls); nvc0->state.tls_required |= 1 << stage; } else { if (nvc0->state.tls_required == (1 << stage)) - nouveau_bufctx_reset(nvc0->bufctx_3d, NVC0_BIND_TLS); + nouveau_bufctx_reset(nvc0->bufctx_3d, NVC0_BIND_3D_TLS); nvc0->state.tls_required &= ~(1 << stage); } - - if (prog && prog->immd_size) { - BEGIN_NVC0(push, NVC0_3D(CB_SIZE), 3); - /* NOTE: may overlap code of a different shader */ - PUSH_DATA (push, align(prog->immd_size, 0x100)); - PUSH_DATAh(push, nvc0->screen->text->offset + prog->immd_base); - PUSH_DATA (push, nvc0->screen->text->offset + prog->immd_base); - BEGIN_NVC0(push, NVC0_3D(CB_BIND(stage)), 1); - PUSH_DATA (push, (14 << 4) | 1); - - nvc0->state.c14_bound |= 1 << stage; - } else - if (nvc0->state.c14_bound & (1 << stage)) { - BEGIN_NVC0(push, NVC0_3D(CB_BIND(stage)), 1); - PUSH_DATA (push, (14 << 4) | 0); - - nvc0->state.c14_bound &= ~(1 << stage); - } } static inline bool @@ -78,7 +60,7 @@ nvc0_program_validate(struct nvc0_context *nvc0, struct nvc0_program *prog) } if (likely(prog->code_size)) - return nvc0_program_upload_code(nvc0, prog); + return nvc0_program_upload(nvc0, prog); return true; /* stream output info only */ } @@ -152,7 +134,7 @@ nvc0_fragprog_validate(struct nvc0_context *nvc0) NVC0_3D_SHADE_MODEL_SMOOTH); } - if (fp->mem && !(nvc0->dirty & NVC0_NEW_FRAGPROG)) { + if (fp->mem && !(nvc0->dirty_3d & NVC0_NEW_3D_FRAGPROG)) { return; } @@ -257,6 +239,19 @@ nvc0_gmtyprog_validate(struct nvc0_context *nvc0) } void +nvc0_compprog_validate(struct nvc0_context *nvc0) +{ + struct nouveau_pushbuf *push = nvc0->base.pushbuf; + struct nvc0_program *cp = 
nvc0->compprog; + + if (cp && !nvc0_program_validate(nvc0, cp)) + return; + + BEGIN_NVC0(push, NVC0_CP(FLUSH), 1); + PUSH_DATA (push, NVC0_COMPUTE_FLUSH_CODE); +} + +void nvc0_tfb_validate(struct nvc0_context *nvc0) { struct nouveau_pushbuf *push = nvc0->base.pushbuf; @@ -292,7 +287,7 @@ nvc0_tfb_validate(struct nvc0_context *nvc0) } nvc0->state.tfb = tfb; - if (!(nvc0->dirty & NVC0_NEW_TFB_TARGETS)) + if (!(nvc0->dirty_3d & NVC0_NEW_3D_TFB_TARGETS)) return; for (b = 0; b < nvc0->num_tfbbufs; ++b) { @@ -309,7 +304,7 @@ nvc0_tfb_validate(struct nvc0_context *nvc0) buf = nv04_resource(targ->pipe.buffer); - BCTX_REFN(nvc0->bufctx_3d, TFB, buf, WR); + BCTX_REFN(nvc0->bufctx_3d, 3D_TFB, buf, WR); if (!(nvc0->tfbbuf_dirty & (1 << b))) continue; diff --git a/lib/mesa/src/gallium/drivers/nouveau/nvc0/nvc0_state.c b/lib/mesa/src/gallium/drivers/nouveau/nvc0/nvc0_state.c index 68116e4da..aac296c2b 100644 --- a/lib/mesa/src/gallium/drivers/nouveau/nvc0/nvc0_state.c +++ b/lib/mesa/src/gallium/drivers/nouveau/nvc0/nvc0_state.c @@ -186,7 +186,7 @@ nvc0_blend_state_bind(struct pipe_context *pipe, void *hwcso) struct nvc0_context *nvc0 = nvc0_context(pipe); nvc0->blend = hwcso; - nvc0->dirty |= NVC0_NEW_BLEND; + nvc0->dirty_3d |= NVC0_NEW_3D_BLEND; } static void @@ -283,8 +283,10 @@ nvc0_rasterizer_state_create(struct pipe_context *pipe, if (cso->offset_point || cso->offset_line || cso->offset_tri) { SB_BEGIN_3D(so, POLYGON_OFFSET_FACTOR, 1); SB_DATA (so, fui(cso->offset_scale)); - SB_BEGIN_3D(so, POLYGON_OFFSET_UNITS, 1); - SB_DATA (so, fui(cso->offset_units * 2.0f)); + if (!cso->offset_units_unscaled) { + SB_BEGIN_3D(so, POLYGON_OFFSET_UNITS, 1); + SB_DATA (so, fui(cso->offset_units * 2.0f)); + } SB_BEGIN_3D(so, POLYGON_OFFSET_CLAMP, 1); SB_DATA (so, fui(cso->offset_clamp)); } @@ -315,7 +317,7 @@ nvc0_rasterizer_state_bind(struct pipe_context *pipe, void *hwcso) struct nvc0_context *nvc0 = nvc0_context(pipe); nvc0->rast = hwcso; - nvc0->dirty |= NVC0_NEW_RASTERIZER; + nvc0->dirty_3d |= NVC0_NEW_3D_RASTERIZER; } static void @@ -393,7 +395,7 @@ nvc0_zsa_state_bind(struct pipe_context *pipe, void *hwcso) struct nvc0_context *nvc0 = nvc0_context(pipe); nvc0->zsa = hwcso; - nvc0->dirty |= NVC0_NEW_ZSA; + nvc0->dirty_3d |= NVC0_NEW_3D_ZSA; } static void @@ -424,7 +426,8 @@ nvc0_sampler_state_delete(struct pipe_context *pipe, void *hwcso) } static inline void -nvc0_stage_sampler_states_bind(struct nvc0_context *nvc0, int s, +nvc0_stage_sampler_states_bind(struct nvc0_context *nvc0, + unsigned s, unsigned nr, void **hwcso) { unsigned i; @@ -448,83 +451,22 @@ nvc0_stage_sampler_states_bind(struct nvc0_context *nvc0, int s, } nvc0->num_samplers[s] = nr; - - nvc0->dirty |= NVC0_NEW_SAMPLERS; } static void -nvc0_stage_sampler_states_bind_range(struct nvc0_context *nvc0, - const unsigned s, - unsigned start, unsigned nr, void **cso) +nvc0_bind_sampler_states(struct pipe_context *pipe, + enum pipe_shader_type shader, + unsigned start, unsigned nr, void **samplers) { - const unsigned end = start + nr; - int last_valid = -1; - unsigned i; - - if (cso) { - for (i = start; i < end; ++i) { - const unsigned p = i - start; - if (cso[p]) - last_valid = i; - if (cso[p] == nvc0->samplers[s][i]) - continue; - nvc0->samplers_dirty[s] |= 1 << i; - - if (nvc0->samplers[s][i]) - nvc0_screen_tsc_unlock(nvc0->screen, nvc0->samplers[s][i]); - nvc0->samplers[s][i] = cso[p]; - } - } else { - for (i = start; i < end; ++i) { - if (nvc0->samplers[s][i]) { - nvc0_screen_tsc_unlock(nvc0->screen, nvc0->samplers[s][i]); - nvc0->samplers[s][i] 
= NULL; - nvc0->samplers_dirty[s] |= 1 << i; - } - } - } + const unsigned s = nvc0_shader_stage(shader); - if (nvc0->num_samplers[s] <= end) { - if (last_valid < 0) { - for (i = start; i && !nvc0->samplers[s][i - 1]; --i); - nvc0->num_samplers[s] = i; - } else { - nvc0->num_samplers[s] = last_valid + 1; - } - } -} + assert(start == 0); + nvc0_stage_sampler_states_bind(nvc0_context(pipe), s, nr, samplers); -static void -nvc0_bind_sampler_states(struct pipe_context *pipe, unsigned shader, - unsigned start, unsigned nr, void **s) -{ - switch (shader) { - case PIPE_SHADER_VERTEX: - assert(start == 0); - nvc0_stage_sampler_states_bind(nvc0_context(pipe), 0, nr, s); - break; - case PIPE_SHADER_TESS_CTRL: - assert(start == 0); - nvc0_stage_sampler_states_bind(nvc0_context(pipe), 1, nr, s); - break; - case PIPE_SHADER_TESS_EVAL: - assert(start == 0); - nvc0_stage_sampler_states_bind(nvc0_context(pipe), 2, nr, s); - break; - case PIPE_SHADER_GEOMETRY: - assert(start == 0); - nvc0_stage_sampler_states_bind(nvc0_context(pipe), 3, nr, s); - break; - case PIPE_SHADER_FRAGMENT: - assert(start == 0); - nvc0_stage_sampler_states_bind(nvc0_context(pipe), 4, nr, s); - break; - case PIPE_SHADER_COMPUTE: - nvc0_stage_sampler_states_bind_range(nvc0_context(pipe), 5, - start, nr, s); + if (s == 5) nvc0_context(pipe)->dirty_cp |= NVC0_NEW_CP_SAMPLERS; - break; - } + else + nvc0_context(pipe)->dirty_3d |= NVC0_NEW_3D_SAMPLERS; } @@ -566,7 +508,10 @@ nvc0_stage_set_sampler_views(struct nvc0_context *nvc0, int s, } if (old) { - nouveau_bufctx_reset(nvc0->bufctx_3d, NVC0_BIND_TEX(s, i)); + if (s == 5) + nouveau_bufctx_reset(nvc0->bufctx_cp, NVC0_BIND_CP_TEX(i)); + else + nouveau_bufctx_reset(nvc0->bufctx_3d, NVC0_BIND_3D_TEX(s, i)); nvc0_screen_tic_unlock(nvc0->screen, old); } @@ -576,111 +521,34 @@ nvc0_stage_set_sampler_views(struct nvc0_context *nvc0, int s, for (i = nr; i < nvc0->num_textures[s]; ++i) { struct nv50_tic_entry *old = nv50_tic_entry(nvc0->textures[s][i]); if (old) { - nouveau_bufctx_reset(nvc0->bufctx_3d, NVC0_BIND_TEX(s, i)); + if (s == 5) + nouveau_bufctx_reset(nvc0->bufctx_cp, NVC0_BIND_CP_TEX(i)); + else + nouveau_bufctx_reset(nvc0->bufctx_3d, NVC0_BIND_3D_TEX(s, i)); nvc0_screen_tic_unlock(nvc0->screen, old); pipe_sampler_view_reference(&nvc0->textures[s][i], NULL); } } nvc0->num_textures[s] = nr; - - nvc0->dirty |= NVC0_NEW_TEXTURES; } static void -nvc0_stage_set_sampler_views_range(struct nvc0_context *nvc0, const unsigned s, - unsigned start, unsigned nr, - struct pipe_sampler_view **views) -{ - struct nouveau_bufctx *bctx = (s == 5) ? nvc0->bufctx_cp : nvc0->bufctx_3d; - const unsigned end = start + nr; - const unsigned bin = (s == 5) ? 
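The deleted switch spelled out the stage numbering that the nvc0_shader_stage() call now encapsulates. A reconstruction of that mapping from the removed cases (the real helper may differ in details such as assertion behavior):

#include "pipe/p_defines.h"

static unsigned
shader_stage(enum pipe_shader_type shader)
{
   switch (shader) {
   case PIPE_SHADER_VERTEX:    return 0;
   case PIPE_SHADER_TESS_CTRL: return 1;
   case PIPE_SHADER_TESS_EVAL: return 2;
   case PIPE_SHADER_GEOMETRY:  return 3;
   case PIPE_SHADER_FRAGMENT:  return 4;
   case PIPE_SHADER_COMPUTE:   return 5;
   default:                    return 0;   /* unreachable for valid input */
   }
}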
NVC0_BIND_CP_TEX(0) : NVC0_BIND_TEX(s, 0); - int last_valid = -1; - unsigned i; - - if (views) { - for (i = start; i < end; ++i) { - const unsigned p = i - start; - if (views[p]) - last_valid = i; - if (views[p] == nvc0->textures[s][i]) - continue; - nvc0->textures_dirty[s] |= 1 << i; - - if (views[p] && views[p]->texture) { - struct pipe_resource *res = views[p]->texture; - if (res->target == PIPE_BUFFER && - (res->flags & PIPE_RESOURCE_FLAG_MAP_COHERENT)) - nvc0->textures_coherent[s] |= 1 << i; - else - nvc0->textures_coherent[s] &= ~(1 << i); - } else { - nvc0->textures_coherent[s] &= ~(1 << i); - } - - if (nvc0->textures[s][i]) { - struct nv50_tic_entry *old = nv50_tic_entry(nvc0->textures[s][i]); - nouveau_bufctx_reset(bctx, bin + i); - nvc0_screen_tic_unlock(nvc0->screen, old); - } - pipe_sampler_view_reference(&nvc0->textures[s][i], views[p]); - } - } else { - for (i = start; i < end; ++i) { - struct nv50_tic_entry *old = nv50_tic_entry(nvc0->textures[s][i]); - if (!old) - continue; - nvc0->textures_dirty[s] |= 1 << i; - - nvc0_screen_tic_unlock(nvc0->screen, old); - pipe_sampler_view_reference(&nvc0->textures[s][i], NULL); - nouveau_bufctx_reset(bctx, bin + i); - } - } - - if (nvc0->num_textures[s] <= end) { - if (last_valid < 0) { - for (i = start; i && !nvc0->textures[s][i - 1]; --i); - nvc0->num_textures[s] = i; - } else { - nvc0->num_textures[s] = last_valid + 1; - } - } -} - -static void -nvc0_set_sampler_views(struct pipe_context *pipe, unsigned shader, +nvc0_set_sampler_views(struct pipe_context *pipe, enum pipe_shader_type shader, unsigned start, unsigned nr, struct pipe_sampler_view **views) { + const unsigned s = nvc0_shader_stage(shader); + assert(start == 0); - switch (shader) { - case PIPE_SHADER_VERTEX: - nvc0_stage_set_sampler_views(nvc0_context(pipe), 0, nr, views); - break; - case PIPE_SHADER_TESS_CTRL: - nvc0_stage_set_sampler_views(nvc0_context(pipe), 1, nr, views); - break; - case PIPE_SHADER_TESS_EVAL: - nvc0_stage_set_sampler_views(nvc0_context(pipe), 2, nr, views); - break; - case PIPE_SHADER_GEOMETRY: - nvc0_stage_set_sampler_views(nvc0_context(pipe), 3, nr, views); - break; - case PIPE_SHADER_FRAGMENT: - nvc0_stage_set_sampler_views(nvc0_context(pipe), 4, nr, views); - break; - case PIPE_SHADER_COMPUTE: - nvc0_stage_set_sampler_views_range(nvc0_context(pipe), 5, - start, nr, views); + nvc0_stage_set_sampler_views(nvc0_context(pipe), s, nr, views); + + if (s == 5) nvc0_context(pipe)->dirty_cp |= NVC0_NEW_CP_TEXTURES; - break; - default: - ; - } + else + nvc0_context(pipe)->dirty_3d |= NVC0_NEW_3D_TEXTURES; } - /* ============================= SHADERS ======================================= */ @@ -733,7 +601,7 @@ nvc0_vp_state_bind(struct pipe_context *pipe, void *hwcso) struct nvc0_context *nvc0 = nvc0_context(pipe); nvc0->vertprog = hwcso; - nvc0->dirty |= NVC0_NEW_VERTPROG; + nvc0->dirty_3d |= NVC0_NEW_3D_VERTPROG; } static void * @@ -749,7 +617,7 @@ nvc0_fp_state_bind(struct pipe_context *pipe, void *hwcso) struct nvc0_context *nvc0 = nvc0_context(pipe); nvc0->fragprog = hwcso; - nvc0->dirty |= NVC0_NEW_FRAGPROG; + nvc0->dirty_3d |= NVC0_NEW_3D_FRAGPROG; } static void * @@ -765,7 +633,7 @@ nvc0_gp_state_bind(struct pipe_context *pipe, void *hwcso) struct nvc0_context *nvc0 = nvc0_context(pipe); nvc0->gmtyprog = hwcso; - nvc0->dirty |= NVC0_NEW_GMTYPROG; + nvc0->dirty_3d |= NVC0_NEW_3D_GMTYPROG; } static void * @@ -781,7 +649,7 @@ nvc0_tcp_state_bind(struct pipe_context *pipe, void *hwcso) struct nvc0_context *nvc0 = nvc0_context(pipe); nvc0->tctlprog = 
hwcso; - nvc0->dirty |= NVC0_NEW_TCTLPROG; + nvc0->dirty_3d |= NVC0_NEW_3D_TCTLPROG; } static void * @@ -797,7 +665,7 @@ nvc0_tep_state_bind(struct pipe_context *pipe, void *hwcso) struct nvc0_context *nvc0 = nvc0_context(pipe); nvc0->tevlprog = hwcso; - nvc0->dirty |= NVC0_NEW_TEVLPROG; + nvc0->dirty_3d |= NVC0_NEW_3D_TEVLPROG; } static void * @@ -831,7 +699,7 @@ nvc0_cp_state_bind(struct pipe_context *pipe, void *hwcso) static void nvc0_set_constant_buffer(struct pipe_context *pipe, uint shader, uint index, - struct pipe_constant_buffer *cb) + const struct pipe_constant_buffer *cb) { struct nvc0_context *nvc0 = nvc0_context(pipe); struct pipe_resource *res = cb ? cb->buffer : NULL; @@ -851,9 +719,9 @@ nvc0_set_constant_buffer(struct pipe_context *pipe, uint shader, uint index, nvc0->constbuf[s][i].u.buf = NULL; else if (nvc0->constbuf[s][i].u.buf) - nouveau_bufctx_reset(nvc0->bufctx_3d, NVC0_BIND_CB(s, i)); + nouveau_bufctx_reset(nvc0->bufctx_3d, NVC0_BIND_3D_CB(s, i)); - nvc0->dirty |= NVC0_NEW_CONSTBUF; + nvc0->dirty_3d |= NVC0_NEW_3D_CONSTBUF; } nvc0->constbuf_dirty[s] |= 1 << i; @@ -893,7 +761,7 @@ nvc0_set_blend_color(struct pipe_context *pipe, struct nvc0_context *nvc0 = nvc0_context(pipe); nvc0->blend_colour = *bcol; - nvc0->dirty |= NVC0_NEW_BLEND_COLOUR; + nvc0->dirty_3d |= NVC0_NEW_3D_BLEND_COLOUR; } static void @@ -903,7 +771,7 @@ nvc0_set_stencil_ref(struct pipe_context *pipe, struct nvc0_context *nvc0 = nvc0_context(pipe); nvc0->stencil_ref = *sr; - nvc0->dirty |= NVC0_NEW_STENCIL_REF; + nvc0->dirty_3d |= NVC0_NEW_3D_STENCIL_REF; } static void @@ -914,7 +782,7 @@ nvc0_set_clip_state(struct pipe_context *pipe, memcpy(nvc0->clip.ucp, clip->ucp, sizeof(clip->ucp)); - nvc0->dirty |= NVC0_NEW_CLIP; + nvc0->dirty_3d |= NVC0_NEW_3D_CLIP; } static void @@ -923,7 +791,7 @@ nvc0_set_sample_mask(struct pipe_context *pipe, unsigned sample_mask) struct nvc0_context *nvc0 = nvc0_context(pipe); nvc0->sample_mask = sample_mask; - nvc0->dirty |= NVC0_NEW_SAMPLE_MASK; + nvc0->dirty_3d |= NVC0_NEW_3D_SAMPLE_MASK; } static void @@ -933,7 +801,7 @@ nvc0_set_min_samples(struct pipe_context *pipe, unsigned min_samples) if (nvc0->min_samples != min_samples) { nvc0->min_samples = min_samples; - nvc0->dirty |= NVC0_NEW_MIN_SAMPLES; + nvc0->dirty_3d |= NVC0_NEW_3D_MIN_SAMPLES; } } @@ -943,11 +811,11 @@ nvc0_set_framebuffer_state(struct pipe_context *pipe, { struct nvc0_context *nvc0 = nvc0_context(pipe); - nouveau_bufctx_reset(nvc0->bufctx_3d, NVC0_BIND_FB); + nouveau_bufctx_reset(nvc0->bufctx_3d, NVC0_BIND_3D_FB); util_copy_framebuffer_state(&nvc0->framebuffer, fb); - nvc0->dirty |= NVC0_NEW_FRAMEBUFFER; + nvc0->dirty_3d |= NVC0_NEW_3D_FRAMEBUFFER; } static void @@ -957,7 +825,7 @@ nvc0_set_polygon_stipple(struct pipe_context *pipe, struct nvc0_context *nvc0 = nvc0_context(pipe); nvc0->stipple = *stipple; - nvc0->dirty |= NVC0_NEW_STIPPLE; + nvc0->dirty_3d |= NVC0_NEW_3D_STIPPLE; } static void @@ -975,7 +843,7 @@ nvc0_set_scissor_states(struct pipe_context *pipe, continue; nvc0->scissors[start_slot + i] = scissor[i]; nvc0->scissors_dirty |= 1 << (start_slot + i); - nvc0->dirty |= NVC0_NEW_SCISSOR; + nvc0->dirty_3d |= NVC0_NEW_3D_SCISSOR; } } @@ -994,12 +862,28 @@ nvc0_set_viewport_states(struct pipe_context *pipe, continue; nvc0->viewports[start_slot + i] = vpt[i]; nvc0->viewports_dirty |= 1 << (start_slot + i); - nvc0->dirty |= NVC0_NEW_VIEWPORT; + nvc0->dirty_3d |= NVC0_NEW_3D_VIEWPORT; } } static void +nvc0_set_window_rectangles(struct pipe_context *pipe, + boolean include, + unsigned 
num_rectangles, + const struct pipe_scissor_state *rectangles) +{ + struct nvc0_context *nvc0 = nvc0_context(pipe); + + nvc0->window_rect.inclusive = include; + nvc0->window_rect.rects = MIN2(num_rectangles, NVC0_MAX_WINDOW_RECTANGLES); + memcpy(nvc0->window_rect.rect, rectangles, + sizeof(struct pipe_scissor_state) * nvc0->window_rect.rects); + + nvc0->dirty_3d |= NVC0_NEW_3D_WINDOW_RECTS; +} + +static void nvc0_set_tess_state(struct pipe_context *pipe, const float default_tess_outer[4], const float default_tess_inner[2]) @@ -1008,7 +892,7 @@ nvc0_set_tess_state(struct pipe_context *pipe, memcpy(nvc0->default_tess_outer, default_tess_outer, 4 * sizeof(float)); memcpy(nvc0->default_tess_inner, default_tess_inner, 2 * sizeof(float)); - nvc0->dirty |= NVC0_NEW_TESSFACTOR; + nvc0->dirty_3d |= NVC0_NEW_3D_TESSFACTOR; } static void @@ -1019,8 +903,8 @@ nvc0_set_vertex_buffers(struct pipe_context *pipe, struct nvc0_context *nvc0 = nvc0_context(pipe); unsigned i; - nouveau_bufctx_reset(nvc0->bufctx_3d, NVC0_BIND_VTX); - nvc0->dirty |= NVC0_NEW_ARRAYS; + nouveau_bufctx_reset(nvc0->bufctx_3d, NVC0_BIND_3D_VTX); + nvc0->dirty_3d |= NVC0_NEW_3D_ARRAYS; util_set_vertex_buffers_count(nvc0->vtxbuf, &nvc0->num_vtxbufs, vb, start_slot, count); @@ -1062,20 +946,20 @@ nvc0_set_index_buffer(struct pipe_context *pipe, struct nvc0_context *nvc0 = nvc0_context(pipe); if (nvc0->idxbuf.buffer) - nouveau_bufctx_reset(nvc0->bufctx_3d, NVC0_BIND_IDX); + nouveau_bufctx_reset(nvc0->bufctx_3d, NVC0_BIND_3D_IDX); if (ib) { pipe_resource_reference(&nvc0->idxbuf.buffer, ib->buffer); nvc0->idxbuf.index_size = ib->index_size; if (ib->buffer) { nvc0->idxbuf.offset = ib->offset; - nvc0->dirty |= NVC0_NEW_IDXBUF; + nvc0->dirty_3d |= NVC0_NEW_3D_IDXBUF; } else { nvc0->idxbuf.user_buffer = ib->user_buffer; - nvc0->dirty &= ~NVC0_NEW_IDXBUF; + nvc0->dirty_3d &= ~NVC0_NEW_3D_IDXBUF; } } else { - nvc0->dirty &= ~NVC0_NEW_IDXBUF; + nvc0->dirty_3d &= ~NVC0_NEW_3D_IDXBUF; pipe_resource_reference(&nvc0->idxbuf.buffer, NULL); } } @@ -1086,7 +970,7 @@ nvc0_vertex_state_bind(struct pipe_context *pipe, void *hwcso) struct nvc0_context *nvc0 = nvc0_context(pipe); nvc0->vertex = hwcso; - nvc0->dirty |= NVC0_NEW_VERTEX; + nvc0->dirty_3d |= NVC0_NEW_3D_VERTEX; } static struct pipe_stream_output_target * @@ -1185,8 +1069,8 @@ nvc0_set_transform_feedback_targets(struct pipe_context *pipe, nvc0->num_tfbbufs = num_targets; if (nvc0->tfbbuf_dirty) { - nouveau_bufctx_reset(nvc0->bufctx_3d, NVC0_BIND_TFB); - nvc0->dirty |= NVC0_NEW_TFB_TARGETS; + nouveau_bufctx_reset(nvc0->bufctx_3d, NVC0_BIND_3D_TFB); + nvc0->dirty_3d |= NVC0_NEW_3D_TFB_TARGETS; } } @@ -1216,7 +1100,7 @@ nvc0_bind_surfaces_range(struct nvc0_context *nvc0, const unsigned t, nvc0->surfaces_dirty[t] |= mask; if (t == 0) - nouveau_bufctx_reset(nvc0->bufctx_3d, NVC0_BIND_SUF); + nouveau_bufctx_reset(nvc0->bufctx_3d, NVC0_BIND_3D_SUF); else nouveau_bufctx_reset(nvc0->bufctx_cp, NVC0_BIND_CP_SUF); } @@ -1231,36 +1115,146 @@ nvc0_set_compute_resources(struct pipe_context *pipe, nvc0_context(pipe)->dirty_cp |= NVC0_NEW_CP_SURFACES; } -static void -nvc0_set_shader_images(struct pipe_context *pipe, unsigned shader, - unsigned start_slot, unsigned count, - struct pipe_image_view *views) +static bool +nvc0_bind_images_range(struct nvc0_context *nvc0, const unsigned s, + unsigned start, unsigned nr, + const struct pipe_image_view *pimages) { + const unsigned end = start + nr; + unsigned mask = 0; + unsigned i; + + assert(s < 6); + + if (pimages) { + for (i = start; i < end; ++i) { + struct 
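Since num_rectangles is clamped with MIN2, anything beyond NVC0_MAX_WINDOW_RECTANGLES (8) is silently dropped. A hedged usage sketch from the state-tracker side, assuming an existing struct pipe_context *pipe:

/* Exclude two rectangular regions from rasterization. */
struct pipe_scissor_state rects[2] = {
   { .minx = 0,  .miny = 0,  .maxx = 64,  .maxy = 64  },
   { .minx = 96, .miny = 96, .maxx = 128, .maxy = 128 },
};
pipe->set_window_rectangles(pipe, false /* exclusive */, 2, rects);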
pipe_image_view *img = &nvc0->images[s][i]; + const unsigned p = i - start; + + if (img->resource == pimages[p].resource && + img->format == pimages[p].format && + img->access == pimages[p].access) { + if (img->resource == NULL) + continue; + if (img->resource->target == PIPE_BUFFER && + img->u.buf.offset == pimages[p].u.buf.offset && + img->u.buf.size == pimages[p].u.buf.size) + continue; + if (img->resource->target != PIPE_BUFFER && + img->u.tex.first_layer == pimages[p].u.tex.first_layer && + img->u.tex.last_layer == pimages[p].u.tex.last_layer && + img->u.tex.level == pimages[p].u.tex.level) + continue; + } + + mask |= (1 << i); + if (pimages[p].resource) + nvc0->images_valid[s] |= (1 << i); + else + nvc0->images_valid[s] &= ~(1 << i); + + img->format = pimages[p].format; + img->access = pimages[p].access; + if (pimages[p].resource && pimages[p].resource->target == PIPE_BUFFER) + img->u.buf = pimages[p].u.buf; + else + img->u.tex = pimages[p].u.tex; + + pipe_resource_reference( + &img->resource, pimages[p].resource); + + if (nvc0->screen->base.class_3d >= GM107_3D_CLASS) { + if (nvc0->images_tic[s][i]) { + struct nv50_tic_entry *old = + nv50_tic_entry(nvc0->images_tic[s][i]); + nvc0_screen_tic_unlock(nvc0->screen, old); + pipe_sampler_view_reference(&nvc0->images_tic[s][i], NULL); + } + + nvc0->images_tic[s][i] = + gm107_create_texture_view_from_image(&nvc0->base.pipe, + &pimages[p]); + } + } + if (!mask) + return false; + } else { + mask = ((1 << nr) - 1) << start; + if (!(nvc0->images_valid[s] & mask)) + return false; + for (i = start; i < end; ++i) { + pipe_resource_reference(&nvc0->images[s][i].resource, NULL); + if (nvc0->screen->base.class_3d >= GM107_3D_CLASS) { + struct nv50_tic_entry *old = nv50_tic_entry(nvc0->images_tic[s][i]); + if (old) { + nvc0_screen_tic_unlock(nvc0->screen, old); + pipe_sampler_view_reference(&nvc0->images_tic[s][i], NULL); + } + } + } + nvc0->images_valid[s] &= ~mask; + } + nvc0->images_dirty[s] |= mask; + + if (s == 5) + nouveau_bufctx_reset(nvc0->bufctx_cp, NVC0_BIND_CP_SUF); + else + nouveau_bufctx_reset(nvc0->bufctx_3d, NVC0_BIND_3D_SUF); + + return true; } static void +nvc0_set_shader_images(struct pipe_context *pipe, + enum pipe_shader_type shader, + unsigned start, unsigned nr, + const struct pipe_image_view *images) +{ + const unsigned s = nvc0_shader_stage(shader); + if (!nvc0_bind_images_range(nvc0_context(pipe), s, start, nr, images)) + return; + + if (s == 5) + nvc0_context(pipe)->dirty_cp |= NVC0_NEW_CP_SURFACES; + else + nvc0_context(pipe)->dirty_3d |= NVC0_NEW_3D_SURFACES; +} + +static bool nvc0_bind_buffers_range(struct nvc0_context *nvc0, const unsigned t, - unsigned start, unsigned nr, - struct pipe_shader_buffer *pbuffers) + unsigned start, unsigned nr, + const struct pipe_shader_buffer *pbuffers) { const unsigned end = start + nr; - const unsigned mask = ((1 << nr) - 1) << start; + unsigned mask = 0; unsigned i; assert(t < 6); if (pbuffers) { for (i = start; i < end; ++i) { + struct pipe_shader_buffer *buf = &nvc0->buffers[t][i]; const unsigned p = i - start; + if (buf->buffer == pbuffers[p].buffer && + buf->buffer_offset == pbuffers[p].buffer_offset && + buf->buffer_size == pbuffers[p].buffer_size) + continue; + + mask |= (1 << i); if (pbuffers[p].buffer) nvc0->buffers_valid[t] |= (1 << i); else nvc0->buffers_valid[t] &= ~(1 << i); - nvc0->buffers[t][i].buffer_offset = pbuffers[p].buffer_offset; - nvc0->buffers[t][i].buffer_size = pbuffers[p].buffer_size; - pipe_resource_reference(&nvc0->buffers[t][i].buffer, pbuffers[p].buffer); + 
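The chain of continue conditions above amounts to an equality test on the binding-relevant fields of a pipe_image_view. Factored out as a predicate for readability (a sketch; the driver keeps it inline, and this assumes gallium's p_state.h):

#include <stdbool.h>
#include "pipe/p_state.h"

static bool
image_binding_unchanged(const struct pipe_image_view *a,
                        const struct pipe_image_view *b)
{
   if (a->resource != b->resource ||
       a->format != b->format ||
       a->access != b->access)
      return false;
   if (!a->resource)
      return true;                      /* both slots empty */
   if (a->resource->target == PIPE_BUFFER)
      return a->u.buf.offset == b->u.buf.offset &&
             a->u.buf.size == b->u.buf.size;
   return a->u.tex.first_layer == b->u.tex.first_layer &&
          a->u.tex.last_layer == b->u.tex.last_layer &&
          a->u.tex.level == b->u.tex.level;
}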
buf->buffer_offset = pbuffers[p].buffer_offset; + buf->buffer_size = pbuffers[p].buffer_size; + pipe_resource_reference(&buf->buffer, pbuffers[p].buffer); } + if (!mask) + return false; } else { + mask = ((1 << nr) - 1) << start; + if (!(nvc0->buffers_valid[t] & mask)) + return false; for (i = start; i < end; ++i) pipe_resource_reference(&nvc0->buffers[t][i].buffer, NULL); nvc0->buffers_valid[t] &= ~mask; @@ -1270,23 +1264,25 @@ nvc0_bind_buffers_range(struct nvc0_context *nvc0, const unsigned t, if (t == 5) nouveau_bufctx_reset(nvc0->bufctx_cp, NVC0_BIND_CP_BUF); else - nouveau_bufctx_reset(nvc0->bufctx_3d, NVC0_BIND_BUF); + nouveau_bufctx_reset(nvc0->bufctx_3d, NVC0_BIND_3D_BUF); + return true; } static void nvc0_set_shader_buffers(struct pipe_context *pipe, - unsigned shader, + enum pipe_shader_type shader, unsigned start, unsigned nr, - struct pipe_shader_buffer *buffers) + const struct pipe_shader_buffer *buffers) { const unsigned s = nvc0_shader_stage(shader); - nvc0_bind_buffers_range(nvc0_context(pipe), s, start, nr, buffers); + if (!nvc0_bind_buffers_range(nvc0_context(pipe), s, start, nr, buffers)) + return; if (s == 5) nvc0_context(pipe)->dirty_cp |= NVC0_NEW_CP_BUFFERS; else - nvc0_context(pipe)->dirty |= NVC0_NEW_BUFFERS; + nvc0_context(pipe)->dirty_3d |= NVC0_NEW_3D_BUFFERS; } static inline void @@ -1342,7 +1338,7 @@ nvc0_set_global_bindings(struct pipe_context *pipe, nouveau_bufctx_reset(nvc0->bufctx_cp, NVC0_BIND_CP_GLOBAL); - nvc0->dirty_cp = NVC0_NEW_CP_GLOBALS; + nvc0->dirty_cp |= NVC0_NEW_CP_GLOBALS; } void @@ -1400,6 +1396,7 @@ nvc0_init_state_functions(struct nvc0_context *nvc0) pipe->set_polygon_stipple = nvc0_set_polygon_stipple; pipe->set_scissor_states = nvc0_set_scissor_states; pipe->set_viewport_states = nvc0_set_viewport_states; + pipe->set_window_rectangles = nvc0_set_window_rectangles; pipe->set_tess_state = nvc0_set_tess_state; pipe->create_vertex_elements_state = nvc0_vertex_state_create; diff --git a/lib/mesa/src/gallium/drivers/nouveau/nvc0/nvc0_state_validate.c b/lib/mesa/src/gallium/drivers/nouveau/nvc0/nvc0_state_validate.c index d97267a58..1d8ebe642 100644 --- a/lib/mesa/src/gallium/drivers/nouveau/nvc0/nvc0_state_validate.c +++ b/lib/mesa/src/gallium/drivers/nouveau/nvc0/nvc0_state_validate.c @@ -1,6 +1,8 @@ #include "util/u_format.h" +#include "util/u_framebuffer.h" #include "util/u_math.h" +#include "util/u_viewport.h" #include "nvc0/nvc0_context.h" @@ -56,148 +58,177 @@ nvc0_validate_zcull(struct nvc0_context *nvc0) #endif static inline void -nvc0_fb_set_null_rt(struct nouveau_pushbuf *push, unsigned i) +nvc0_fb_set_null_rt(struct nouveau_pushbuf *push, unsigned i, unsigned layers) { - BEGIN_NVC0(push, NVC0_3D(RT_ADDRESS_HIGH(i)), 6); - PUSH_DATA (push, 0); - PUSH_DATA (push, 0); - PUSH_DATA (push, 64); - PUSH_DATA (push, 0); + BEGIN_NVC0(push, NVC0_3D(RT_ADDRESS_HIGH(i)), 9); PUSH_DATA (push, 0); PUSH_DATA (push, 0); + PUSH_DATA (push, 64); // width + PUSH_DATA (push, 0); // height + PUSH_DATA (push, 0); // format + PUSH_DATA (push, 0); // tile mode + PUSH_DATA (push, layers); // layers + PUSH_DATA (push, 0); // layer stride + PUSH_DATA (push, 0); // base layer } static void nvc0_validate_fb(struct nvc0_context *nvc0) { - struct nouveau_pushbuf *push = nvc0->base.pushbuf; - struct pipe_framebuffer_state *fb = &nvc0->framebuffer; - unsigned i, ms; - unsigned ms_mode = NVC0_3D_MULTISAMPLE_MODE_MS1; - bool serialize = false; - - nouveau_bufctx_reset(nvc0->bufctx_3d, NVC0_BIND_FB); - - BEGIN_NVC0(push, NVC0_3D(RT_CONTROL), 1); - PUSH_DATA (push, 
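On the unbind path the dirty mask is built arithmetically instead of per slot. A worked example of ((1 << nr) - 1) << start:

/* Unbinding nr = 3 buffers starting at slot 2: */
unsigned start = 2, nr = 3;
unsigned mask = ((1u << nr) - 1) << start;   /* 0b111 << 2 = 0x1c: slots 2,3,4 */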
(076543210 << 4) | fb->nr_cbufs); - BEGIN_NVC0(push, NVC0_3D(SCREEN_SCISSOR_HORIZ), 2); - PUSH_DATA (push, fb->width << 16); - PUSH_DATA (push, fb->height << 16); - - for (i = 0; i < fb->nr_cbufs; ++i) { - struct nv50_surface *sf; - struct nv04_resource *res; - struct nouveau_bo *bo; - - if (!fb->cbufs[i]) { - nvc0_fb_set_null_rt(push, i); - continue; - } - - sf = nv50_surface(fb->cbufs[i]); - res = nv04_resource(sf->base.texture); - bo = res->bo; - - BEGIN_NVC0(push, NVC0_3D(RT_ADDRESS_HIGH(i)), 9); - PUSH_DATAh(push, res->address + sf->offset); - PUSH_DATA (push, res->address + sf->offset); - if (likely(nouveau_bo_memtype(bo))) { - struct nv50_miptree *mt = nv50_miptree(sf->base.texture); - - assert(sf->base.texture->target != PIPE_BUFFER); - - PUSH_DATA(push, sf->width); - PUSH_DATA(push, sf->height); - PUSH_DATA(push, nvc0_format_table[sf->base.format].rt); - PUSH_DATA(push, (mt->layout_3d << 16) | - mt->level[sf->base.u.tex.level].tile_mode); - PUSH_DATA(push, sf->base.u.tex.first_layer + sf->depth); - PUSH_DATA(push, mt->layer_stride >> 2); - PUSH_DATA(push, sf->base.u.tex.first_layer); - - ms_mode = mt->ms_mode; - } else { - if (res->base.target == PIPE_BUFFER) { - PUSH_DATA(push, 262144); - PUSH_DATA(push, 1); - } else { - PUSH_DATA(push, nv50_miptree(sf->base.texture)->level[0].pitch); - PUSH_DATA(push, sf->height); - } - PUSH_DATA(push, nvc0_format_table[sf->base.format].rt); - PUSH_DATA(push, 1 << 12); - PUSH_DATA(push, 1); - PUSH_DATA(push, 0); - PUSH_DATA(push, 0); - - nvc0_resource_fence(res, NOUVEAU_BO_WR); - - assert(!fb->zsbuf); - } - - if (res->status & NOUVEAU_BUFFER_STATUS_GPU_READING) - serialize = true; - res->status |= NOUVEAU_BUFFER_STATUS_GPU_WRITING; - res->status &= ~NOUVEAU_BUFFER_STATUS_GPU_READING; - - /* only register for writing, otherwise we'd always serialize here */ - BCTX_REFN(nvc0->bufctx_3d, FB, res, WR); - } - - if (fb->zsbuf) { - struct nv50_miptree *mt = nv50_miptree(fb->zsbuf->texture); - struct nv50_surface *sf = nv50_surface(fb->zsbuf); - int unk = mt->base.base.target == PIPE_TEXTURE_2D; - - BEGIN_NVC0(push, NVC0_3D(ZETA_ADDRESS_HIGH), 5); - PUSH_DATAh(push, mt->base.address + sf->offset); - PUSH_DATA (push, mt->base.address + sf->offset); - PUSH_DATA (push, nvc0_format_table[fb->zsbuf->format].rt); - PUSH_DATA (push, mt->level[sf->base.u.tex.level].tile_mode); - PUSH_DATA (push, mt->layer_stride >> 2); - BEGIN_NVC0(push, NVC0_3D(ZETA_ENABLE), 1); - PUSH_DATA (push, 1); - BEGIN_NVC0(push, NVC0_3D(ZETA_HORIZ), 3); - PUSH_DATA (push, sf->width); - PUSH_DATA (push, sf->height); - PUSH_DATA (push, (unk << 16) | - (sf->base.u.tex.first_layer + sf->depth)); - BEGIN_NVC0(push, NVC0_3D(ZETA_BASE_LAYER), 1); - PUSH_DATA (push, sf->base.u.tex.first_layer); - - ms_mode = mt->ms_mode; - - if (mt->base.status & NOUVEAU_BUFFER_STATUS_GPU_READING) - serialize = true; - mt->base.status |= NOUVEAU_BUFFER_STATUS_GPU_WRITING; - mt->base.status &= ~NOUVEAU_BUFFER_STATUS_GPU_READING; - - BCTX_REFN(nvc0->bufctx_3d, FB, &mt->base, WR); - } else { - BEGIN_NVC0(push, NVC0_3D(ZETA_ENABLE), 1); - PUSH_DATA (push, 0); - } - - IMMED_NVC0(push, NVC0_3D(MULTISAMPLE_MODE), ms_mode); - - ms = 1 << ms_mode; - BEGIN_NVC0(push, NVC0_3D(CB_SIZE), 3); - PUSH_DATA (push, 1024); - PUSH_DATAh(push, nvc0->screen->uniform_bo->offset + (6 << 16) + (4 << 10)); - PUSH_DATA (push, nvc0->screen->uniform_bo->offset + (6 << 16) + (4 << 10)); - BEGIN_1IC0(push, NVC0_3D(CB_POS), 1 + 2 * ms); - PUSH_DATA (push, 256 + 128); - for (i = 0; i < ms; i++) { - float xy[2]; - 
nvc0->base.pipe.get_sample_position(&nvc0->base.pipe, ms, i, xy); - PUSH_DATAf(push, xy[0]); - PUSH_DATAf(push, xy[1]); - } - - if (serialize) - IMMED_NVC0(push, NVC0_3D(SERIALIZE), 0); - - NOUVEAU_DRV_STAT(&nvc0->screen->base, gpu_serialize_count, serialize); + struct nouveau_pushbuf *push = nvc0->base.pushbuf; + struct pipe_framebuffer_state *fb = &nvc0->framebuffer; + struct nvc0_screen *screen = nvc0->screen; + unsigned i, ms; + unsigned ms_mode = NVC0_3D_MULTISAMPLE_MODE_MS1; + unsigned nr_cbufs = fb->nr_cbufs; + bool serialize = false; + + nouveau_bufctx_reset(nvc0->bufctx_3d, NVC0_BIND_3D_FB); + + BEGIN_NVC0(push, NVC0_3D(SCREEN_SCISSOR_HORIZ), 2); + PUSH_DATA (push, fb->width << 16); + PUSH_DATA (push, fb->height << 16); + + for (i = 0; i < fb->nr_cbufs; ++i) { + struct nv50_surface *sf; + struct nv04_resource *res; + struct nouveau_bo *bo; + + if (!fb->cbufs[i]) { + nvc0_fb_set_null_rt(push, i, 0); + continue; + } + + sf = nv50_surface(fb->cbufs[i]); + res = nv04_resource(sf->base.texture); + bo = res->bo; + + BEGIN_NVC0(push, NVC0_3D(RT_ADDRESS_HIGH(i)), 9); + PUSH_DATAh(push, res->address + sf->offset); + PUSH_DATA (push, res->address + sf->offset); + if (likely(nouveau_bo_memtype(bo))) { + struct nv50_miptree *mt = nv50_miptree(sf->base.texture); + + assert(sf->base.texture->target != PIPE_BUFFER); + + PUSH_DATA(push, sf->width); + PUSH_DATA(push, sf->height); + PUSH_DATA(push, nvc0_format_table[sf->base.format].rt); + PUSH_DATA(push, (mt->layout_3d << 16) | + mt->level[sf->base.u.tex.level].tile_mode); + PUSH_DATA(push, sf->base.u.tex.first_layer + sf->depth); + PUSH_DATA(push, mt->layer_stride >> 2); + PUSH_DATA(push, sf->base.u.tex.first_layer); + + ms_mode = mt->ms_mode; + } else { + if (res->base.target == PIPE_BUFFER) { + PUSH_DATA(push, 262144); + PUSH_DATA(push, 1); + } else { + PUSH_DATA(push, nv50_miptree(sf->base.texture)->level[0].pitch); + PUSH_DATA(push, sf->height); + } + PUSH_DATA(push, nvc0_format_table[sf->base.format].rt); + PUSH_DATA(push, 1 << 12); + PUSH_DATA(push, 1); + PUSH_DATA(push, 0); + PUSH_DATA(push, 0); + + nvc0_resource_fence(res, NOUVEAU_BO_WR); + + assert(!fb->zsbuf); + } + + if (res->status & NOUVEAU_BUFFER_STATUS_GPU_READING) + serialize = true; + res->status |= NOUVEAU_BUFFER_STATUS_GPU_WRITING; + res->status &= ~NOUVEAU_BUFFER_STATUS_GPU_READING; + + /* only register for writing, otherwise we'd always serialize here */ + BCTX_REFN(nvc0->bufctx_3d, 3D_FB, res, WR); + } + + if (fb->zsbuf) { + struct nv50_miptree *mt = nv50_miptree(fb->zsbuf->texture); + struct nv50_surface *sf = nv50_surface(fb->zsbuf); + int unk = mt->base.base.target == PIPE_TEXTURE_2D; + + BEGIN_NVC0(push, NVC0_3D(ZETA_ADDRESS_HIGH), 5); + PUSH_DATAh(push, mt->base.address + sf->offset); + PUSH_DATA (push, mt->base.address + sf->offset); + PUSH_DATA (push, nvc0_format_table[fb->zsbuf->format].rt); + PUSH_DATA (push, mt->level[sf->base.u.tex.level].tile_mode); + PUSH_DATA (push, mt->layer_stride >> 2); + BEGIN_NVC0(push, NVC0_3D(ZETA_ENABLE), 1); + PUSH_DATA (push, 1); + BEGIN_NVC0(push, NVC0_3D(ZETA_HORIZ), 3); + PUSH_DATA (push, sf->width); + PUSH_DATA (push, sf->height); + PUSH_DATA (push, (unk << 16) | + (sf->base.u.tex.first_layer + sf->depth)); + BEGIN_NVC0(push, NVC0_3D(ZETA_BASE_LAYER), 1); + PUSH_DATA (push, sf->base.u.tex.first_layer); + + ms_mode = mt->ms_mode; + + if (mt->base.status & NOUVEAU_BUFFER_STATUS_GPU_READING) + serialize = true; + mt->base.status |= NOUVEAU_BUFFER_STATUS_GPU_WRITING; + mt->base.status &= ~NOUVEAU_BUFFER_STATUS_GPU_READING; + + 
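Worth noting: 076543210 in the RT_CONTROL writes is an octal literal, so each 3-bit digit is a render-target index and the constant encodes the identity map for slots 0..7, with the cbuf count landing in the low nibble after the << 4. A quick standalone decode:

#include <stdio.h>

int main(void)
{
   unsigned map = 076543210;   /* octal: one 3-bit RT index per digit */
   for (int i = 0; i < 8; i++)
      printf("slot %d -> RT %u\n", i, (map >> (3 * i)) & 7);
   return 0;
}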
BCTX_REFN(nvc0->bufctx_3d, 3D_FB, &mt->base, WR); + } else { + BEGIN_NVC0(push, NVC0_3D(ZETA_ENABLE), 1); + PUSH_DATA (push, 0); + } + + if (nr_cbufs == 0 && !fb->zsbuf) { + assert(util_is_power_of_two(fb->samples)); + assert(fb->samples <= 8); + + nvc0_fb_set_null_rt(push, 0, fb->layers); + + if (fb->samples > 1) + ms_mode = ffs(fb->samples) - 1; + nr_cbufs = 1; + } + + BEGIN_NVC0(push, NVC0_3D(RT_CONTROL), 1); + PUSH_DATA (push, (076543210 << 4) | nr_cbufs); + IMMED_NVC0(push, NVC0_3D(MULTISAMPLE_MODE), ms_mode); + + ms = 1 << ms_mode; + BEGIN_NVC0(push, NVC0_3D(CB_SIZE), 3); + PUSH_DATA (push, NVC0_CB_AUX_SIZE); + PUSH_DATAh(push, screen->uniform_bo->offset + NVC0_CB_AUX_INFO(4)); + PUSH_DATA (push, screen->uniform_bo->offset + NVC0_CB_AUX_INFO(4)); + BEGIN_1IC0(push, NVC0_3D(CB_POS), 1 + 2 * ms); + PUSH_DATA (push, NVC0_CB_AUX_SAMPLE_INFO); + for (i = 0; i < ms; i++) { + float xy[2]; + nvc0->base.pipe.get_sample_position(&nvc0->base.pipe, ms, i, xy); + PUSH_DATAf(push, xy[0]); + PUSH_DATAf(push, xy[1]); + } + + if (screen->base.class_3d >= GM200_3D_CLASS) { + const uint8_t (*ptr)[2] = nvc0_get_sample_locations(ms); + uint32_t val[4] = {}; + + for (i = 0; i < 16; i++) { + val[i / 4] |= ptr[i % ms][0] << (((i % 4) * 8) + 0); + val[i / 4] |= ptr[i % ms][1] << (((i % 4) * 8) + 4); + } + + BEGIN_NVC0(push, SUBC_3D(0x11e0), 4); + PUSH_DATAp(push, val, 4); + } + + if (serialize) + IMMED_NVC0(push, NVC0_3D(SERIALIZE), 0); + + NOUVEAU_DRV_STAT(&nvc0->screen->base, gpu_serialize_count, serialize); } static void @@ -239,7 +270,7 @@ nvc0_validate_scissor(struct nvc0_context *nvc0) int i; struct nouveau_pushbuf *push = nvc0->base.pushbuf; - if (!(nvc0->dirty & NVC0_NEW_SCISSOR) && + if (!(nvc0->dirty_3d & NVC0_NEW_3D_SCISSOR) && nvc0->rast->pipe.scissor == nvc0->state.scissor) return; @@ -299,8 +330,12 @@ nvc0_validate_viewport(struct nvc0_context *nvc0) PUSH_DATA (push, (w << 16) | x); PUSH_DATA (push, (h << 16) | y); - zmin = vp->translate[2] - fabsf(vp->scale[2]); - zmax = vp->translate[2] + fabsf(vp->scale[2]); + /* If the halfz setting ever changes, the viewports will also get + * updated. The rast will get updated before the validate function has a + * chance to hit, so we can just use it directly without an atom + * dependency. 
+ */ + util_viewport_zmin_zmax(vp, nvc0->rast->pipe.clip_halfz, &zmin, &zmax); BEGIN_NVC0(push, NVC0_3D(DEPTH_RANGE_NEAR(i)), 2); PUSH_DATAf(push, zmin); @@ -309,18 +344,42 @@ nvc0_validate_viewport(struct nvc0_context *nvc0) nvc0->viewports_dirty = 0; } +static void +nvc0_validate_window_rects(struct nvc0_context *nvc0) +{ + struct nouveau_pushbuf *push = nvc0->base.pushbuf; + bool enable = nvc0->window_rect.rects > 0 || nvc0->window_rect.inclusive; + int i; + + IMMED_NVC0(push, NVC0_3D(CLIP_RECTS_EN), enable); + if (!enable) + return; + + IMMED_NVC0(push, NVC0_3D(CLIP_RECTS_MODE), !nvc0->window_rect.inclusive); + BEGIN_NVC0(push, NVC0_3D(CLIP_RECT_HORIZ(0)), NVC0_MAX_WINDOW_RECTANGLES * 2); + for (i = 0; i < nvc0->window_rect.rects; i++) { + struct pipe_scissor_state *s = &nvc0->window_rect.rect[i]; + PUSH_DATA(push, (s->maxx << 16) | s->minx); + PUSH_DATA(push, (s->maxy << 16) | s->miny); + } + for (; i < NVC0_MAX_WINDOW_RECTANGLES; i++) { + PUSH_DATA(push, 0); + PUSH_DATA(push, 0); + } +} + static inline void nvc0_upload_uclip_planes(struct nvc0_context *nvc0, unsigned s) { struct nouveau_pushbuf *push = nvc0->base.pushbuf; - struct nouveau_bo *bo = nvc0->screen->uniform_bo; + struct nvc0_screen *screen = nvc0->screen; BEGIN_NVC0(push, NVC0_3D(CB_SIZE), 3); - PUSH_DATA (push, 1024); - PUSH_DATAh(push, bo->offset + (6 << 16) + (s << 10)); - PUSH_DATA (push, bo->offset + (6 << 16) + (s << 10)); + PUSH_DATA (push, NVC0_CB_AUX_SIZE); + PUSH_DATAh(push, screen->uniform_bo->offset + NVC0_CB_AUX_INFO(s)); + PUSH_DATA (push, screen->uniform_bo->offset + NVC0_CB_AUX_INFO(s)); BEGIN_1IC0(push, NVC0_3D(CB_POS), PIPE_MAX_CLIP_PLANES * 4 + 1); - PUSH_DATA (push, 256); + PUSH_DATA (push, NVC0_CB_AUX_UCP_INFO); PUSH_DATAp(push, &nvc0->clip.ucp[0][0], PIPE_MAX_CLIP_PLANES * 4); } @@ -367,11 +426,12 @@ nvc0_validate_clip(struct nvc0_context *nvc0) if (clip_enable && vp->vp.num_ucps < PIPE_MAX_CLIP_PLANES) nvc0_check_program_ucps(nvc0, vp, clip_enable); - if (nvc0->dirty & (NVC0_NEW_CLIP | (NVC0_NEW_VERTPROG << stage))) + if (nvc0->dirty_3d & (NVC0_NEW_3D_CLIP | (NVC0_NEW_3D_VERTPROG << stage))) if (vp->vp.num_ucps > 0 && vp->vp.num_ucps <= PIPE_MAX_CLIP_PLANES) nvc0_upload_uclip_planes(nvc0, stage); clip_enable &= vp->vp.clip_enable; + clip_enable |= vp->vp.cull_enable; if (nvc0->state.clip_enable != clip_enable) { nvc0->state.clip_enable = clip_enable; @@ -424,7 +484,7 @@ nvc0_constbufs_validate(struct nvc0_context *nvc0) if (nvc0->constbuf[s][i].user) { struct nouveau_bo *bo = nvc0->screen->uniform_bo; - const unsigned base = s << 16; + const unsigned base = NVC0_CB_USR_INFO(s); const unsigned size = nvc0->constbuf[s][0].size; assert(i == 0); /* we really only want OpenGL uniforms here */ assert(nvc0->constbuf[s][0].u.data); @@ -454,7 +514,7 @@ nvc0_constbufs_validate(struct nvc0_context *nvc0) BEGIN_NVC0(push, NVC0_3D(CB_BIND(s)), 1); PUSH_DATA (push, (i << 4) | 1); - BCTX_REFN(nvc0->bufctx_3d, CB(s, i), res, RD); + BCTX_REFN(nvc0->bufctx_3d, 3D_CB(s, i), res, RD); nvc0->cb_dirty = 1; /* Force cache flush for UBO. */ res->cb_bindings[s] |= 1 << i; @@ -468,25 +528,28 @@ nvc0_constbufs_validate(struct nvc0_context *nvc0) } } - /* Invalidate all COMPUTE constbufs because they are aliased with 3D. */ - nvc0->dirty_cp |= NVC0_NEW_CP_CONSTBUF; - nvc0->constbuf_dirty[5] |= nvc0->constbuf_valid[5]; - nvc0->state.uniform_buffer_bound[5] = 0; + if (nvc0->screen->base.class_3d < NVE4_3D_CLASS) { + /* Invalidate all COMPUTE constbufs because they are aliased with 3D. 
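Each window rectangle above is packed into two words, maximum in the high half-word and minimum in the low. The packing in isolation, as a sketch:

#include <stdint.h>

/* Packing used for CLIP_RECT_HORIZ/VERT: (max << 16) | min.
 * pack_rect_span(96, 128) == 0x00800060. */
static inline uint32_t
pack_rect_span(uint16_t min, uint16_t max)
{
   return ((uint32_t)max << 16) | min;
}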
*/ + nvc0->dirty_cp |= NVC0_NEW_CP_CONSTBUF; + nvc0->constbuf_dirty[5] |= nvc0->constbuf_valid[5]; + nvc0->state.uniform_buffer_bound[5] = 0; + } } static void nvc0_validate_buffers(struct nvc0_context *nvc0) { struct nouveau_pushbuf *push = nvc0->base.pushbuf; + struct nvc0_screen *screen = nvc0->screen; int i, s; for (s = 0; s < 5; s++) { BEGIN_NVC0(push, NVC0_3D(CB_SIZE), 3); - PUSH_DATA (push, 1024); - PUSH_DATAh(push, nvc0->screen->uniform_bo->offset + (6 << 16) + (s << 10)); - PUSH_DATA (push, nvc0->screen->uniform_bo->offset + (6 << 16) + (s << 10)); + PUSH_DATA (push, NVC0_CB_AUX_SIZE); + PUSH_DATAh(push, screen->uniform_bo->offset + NVC0_CB_AUX_INFO(s)); + PUSH_DATA (push, screen->uniform_bo->offset + NVC0_CB_AUX_INFO(s)); BEGIN_1IC0(push, NVC0_3D(CB_POS), 1 + 4 * NVC0_MAX_BUFFERS); - PUSH_DATA (push, 512); + PUSH_DATA (push, NVC0_CB_AUX_BUF_INFO(0)); for (i = 0; i < NVC0_MAX_BUFFERS; i++) { if (nvc0->buffers[s][i].buffer) { struct nv04_resource *res = @@ -495,7 +558,11 @@ nvc0_validate_buffers(struct nvc0_context *nvc0) PUSH_DATAh(push, res->address + nvc0->buffers[s][i].buffer_offset); PUSH_DATA (push, nvc0->buffers[s][i].buffer_size); PUSH_DATA (push, 0); - BCTX_REFN(nvc0->bufctx_3d, BUF, res, RDWR); + BCTX_REFN(nvc0->bufctx_3d, 3D_BUF, res, RDWR); + util_range_add(&res->valid_buffer_range, + nvc0->buffers[s][i].buffer_offset, + nvc0->buffers[s][i].buffer_offset + + nvc0->buffers[s][i].buffer_size); } else { PUSH_DATA (push, 0); PUSH_DATA (push, 0); @@ -534,8 +601,14 @@ nvc0_validate_min_samples(struct nvc0_context *nvc0) int samples; samples = util_next_power_of_two(nvc0->min_samples); - if (samples > 1) + if (samples > 1) { + // If we're using the incoming sample mask and doing sample shading, we + // have to do sample shading "to the max", otherwise there's no way to + // tell which sets of samples are covered by the current invocation. + if (nvc0->fragprog->fp.sample_mask_in) + samples = util_framebuffer_get_num_samples(&nvc0->framebuffer); samples |= NVC0_3D_SAMPLE_SHADING_ENABLE; + } IMMED_NVC0(push, NVC0_3D(SAMPLE_SHADING), samples); } @@ -549,9 +622,9 @@ nvc0_validate_driverconst(struct nvc0_context *nvc0) for (i = 0; i < 5; ++i) { BEGIN_NVC0(push, NVC0_3D(CB_SIZE), 3); - PUSH_DATA (push, 1024); - PUSH_DATAh(push, screen->uniform_bo->offset + (6 << 16) + (i << 10)); - PUSH_DATA (push, screen->uniform_bo->offset + (6 << 16) + (i << 10)); + PUSH_DATA (push, NVC0_CB_AUX_SIZE); + PUSH_DATAh(push, screen->uniform_bo->offset + NVC0_CB_AUX_INFO(i)); + PUSH_DATA (push, screen->uniform_bo->offset + NVC0_CB_AUX_INFO(i)); BEGIN_NVC0(push, NVC0_3D(CB_BIND(i)), 1); PUSH_DATA (push, (15 << 4) | 1); } @@ -559,23 +632,8 @@ nvc0_validate_driverconst(struct nvc0_context *nvc0) nvc0->dirty_cp |= NVC0_NEW_CP_DRIVERCONST; } -void -nvc0_validate_global_residents(struct nvc0_context *nvc0, - struct nouveau_bufctx *bctx, int bin) -{ - unsigned i; - - for (i = 0; i < nvc0->global_residents.size / sizeof(struct pipe_resource *); - ++i) { - struct pipe_resource *res = *util_dynarray_element( - &nvc0->global_residents, struct pipe_resource *, i); - if (res) - nvc0_add_resident(bctx, bin, nv04_resource(res), NOUVEAU_BO_RDWR); - } -} - static void -nvc0_validate_derived_1(struct nvc0_context *nvc0) +nvc0_validate_fp_zsa_rast(struct nvc0_context *nvc0) { struct nouveau_pushbuf *push = nvc0->base.pushbuf; bool rasterizer_discard; @@ -600,20 +658,21 @@ nvc0_validate_derived_1(struct nvc0_context *nvc0) * nvc0_validate_fb, otherwise that will override the RT count setting. 
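Judging purely from the literals they replace in this file (1024, (6 << 16) + (s << 10), 256, 256 + 128, 512), the new NVC0_CB_AUX_* macros appear to name fixed offsets inside uniform_bo. A hedged reconstruction, not the headers' actual definitions:

/* Speculative layout inferred from the replaced constants; the real
 * definitions live elsewhere in the driver. The per-buffer stride of
 * AUX_BUF_INFO is a guess based on the four words pushed per SSBO. */
#define AUX_SIZE          1024                       /* NVC0_CB_AUX_SIZE? */
#define AUX_INFO(s)       ((6 << 16) + ((s) << 10))  /* NVC0_CB_AUX_INFO? */
#define AUX_UCP_INFO      256                        /* user clip planes */
#define AUX_SAMPLE_INFO   (256 + 128)                /* MS sample positions */
#define AUX_BUF_INFO(i)   (512 + (i) * 16)           /* SSBO descriptors */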
*/ static void -nvc0_validate_derived_2(struct nvc0_context *nvc0) +nvc0_validate_zsa_fb(struct nvc0_context *nvc0) { struct nouveau_pushbuf *push = nvc0->base.pushbuf; if (nvc0->zsa && nvc0->zsa->pipe.alpha.enabled && + nvc0->framebuffer.zsbuf && nvc0->framebuffer.nr_cbufs == 0) { - nvc0_fb_set_null_rt(push, 0); + nvc0_fb_set_null_rt(push, 0, 0); BEGIN_NVC0(push, NVC0_3D(RT_CONTROL), 1); PUSH_DATA (push, (076543210 << 4) | 1); } } static void -nvc0_validate_derived_3(struct nvc0_context *nvc0) +nvc0_validate_blend_fb(struct nvc0_context *nvc0) { struct nouveau_pushbuf *push = nvc0->base.pushbuf; struct pipe_framebuffer_state *fb = &nvc0->framebuffer; @@ -632,6 +691,26 @@ nvc0_validate_derived_3(struct nvc0_context *nvc0) } static void +nvc0_validate_rast_fb(struct nvc0_context *nvc0) +{ + struct nouveau_pushbuf *push = nvc0->base.pushbuf; + struct pipe_framebuffer_state *fb = &nvc0->framebuffer; + struct pipe_rasterizer_state *rast = &nvc0->rast->pipe; + + if (!rast) + return; + + if (rast->offset_units_unscaled) { + BEGIN_NVC0(push, NVC0_3D(POLYGON_OFFSET_UNITS), 1); + if (fb->zsbuf && fb->zsbuf->format == PIPE_FORMAT_Z16_UNORM) + PUSH_DATAf(push, rast->offset_units * (1 << 16)); + else + PUSH_DATAf(push, rast->offset_units * (1 << 24)); + } +} + + +static void nvc0_validate_tess_state(struct nvc0_context *nvc0) { struct nouveau_pushbuf *push = nvc0->base.pushbuf; @@ -652,7 +731,7 @@ nvc0_switch_pipe_context(struct nvc0_context *ctx_to) else ctx_to->state = ctx_to->screen->save_state; - ctx_to->dirty = ~0; + ctx_to->dirty_3d = ~0; ctx_to->dirty_cp = ~0; ctx_to->viewports_dirty = ~0; ctx_to->scissors_dirty = ~0; @@ -662,74 +741,79 @@ nvc0_switch_pipe_context(struct nvc0_context *ctx_to) ctx_to->textures_dirty[s] = ~0; ctx_to->constbuf_dirty[s] = (1 << NVC0_MAX_PIPE_CONSTBUFS) - 1; ctx_to->buffers_dirty[s] = ~0; + ctx_to->images_dirty[s] = ~0; } /* Reset tfb as the shader that owns it may have been deleted. 
*/ ctx_to->state.tfb = NULL; if (!ctx_to->vertex) - ctx_to->dirty &= ~(NVC0_NEW_VERTEX | NVC0_NEW_ARRAYS); + ctx_to->dirty_3d &= ~(NVC0_NEW_3D_VERTEX | NVC0_NEW_3D_ARRAYS); if (!ctx_to->idxbuf.buffer) - ctx_to->dirty &= ~NVC0_NEW_IDXBUF; + ctx_to->dirty_3d &= ~NVC0_NEW_3D_IDXBUF; if (!ctx_to->vertprog) - ctx_to->dirty &= ~NVC0_NEW_VERTPROG; + ctx_to->dirty_3d &= ~NVC0_NEW_3D_VERTPROG; if (!ctx_to->fragprog) - ctx_to->dirty &= ~NVC0_NEW_FRAGPROG; + ctx_to->dirty_3d &= ~NVC0_NEW_3D_FRAGPROG; if (!ctx_to->blend) - ctx_to->dirty &= ~NVC0_NEW_BLEND; + ctx_to->dirty_3d &= ~NVC0_NEW_3D_BLEND; if (!ctx_to->rast) - ctx_to->dirty &= ~(NVC0_NEW_RASTERIZER | NVC0_NEW_SCISSOR); + ctx_to->dirty_3d &= ~(NVC0_NEW_3D_RASTERIZER | NVC0_NEW_3D_SCISSOR); if (!ctx_to->zsa) - ctx_to->dirty &= ~NVC0_NEW_ZSA; + ctx_to->dirty_3d &= ~NVC0_NEW_3D_ZSA; ctx_to->screen->cur_ctx = ctx_to; } -static struct state_validate { - void (*func)(struct nvc0_context *); - uint32_t states; -} validate_list[] = { - { nvc0_validate_fb, NVC0_NEW_FRAMEBUFFER }, - { nvc0_validate_blend, NVC0_NEW_BLEND }, - { nvc0_validate_zsa, NVC0_NEW_ZSA }, - { nvc0_validate_sample_mask, NVC0_NEW_SAMPLE_MASK }, - { nvc0_validate_rasterizer, NVC0_NEW_RASTERIZER }, - { nvc0_validate_blend_colour, NVC0_NEW_BLEND_COLOUR }, - { nvc0_validate_stencil_ref, NVC0_NEW_STENCIL_REF }, - { nvc0_validate_stipple, NVC0_NEW_STIPPLE }, - { nvc0_validate_scissor, NVC0_NEW_SCISSOR | NVC0_NEW_RASTERIZER }, - { nvc0_validate_viewport, NVC0_NEW_VIEWPORT }, - { nvc0_vertprog_validate, NVC0_NEW_VERTPROG }, - { nvc0_tctlprog_validate, NVC0_NEW_TCTLPROG }, - { nvc0_tevlprog_validate, NVC0_NEW_TEVLPROG }, - { nvc0_validate_tess_state, NVC0_NEW_TESSFACTOR }, - { nvc0_gmtyprog_validate, NVC0_NEW_GMTYPROG }, - { nvc0_fragprog_validate, NVC0_NEW_FRAGPROG | NVC0_NEW_RASTERIZER }, - { nvc0_validate_derived_1, NVC0_NEW_FRAGPROG | NVC0_NEW_ZSA | - NVC0_NEW_RASTERIZER }, - { nvc0_validate_derived_2, NVC0_NEW_ZSA | NVC0_NEW_FRAMEBUFFER }, - { nvc0_validate_derived_3, NVC0_NEW_BLEND | NVC0_NEW_FRAMEBUFFER }, - { nvc0_validate_clip, NVC0_NEW_CLIP | NVC0_NEW_RASTERIZER | - NVC0_NEW_VERTPROG | - NVC0_NEW_TEVLPROG | - NVC0_NEW_GMTYPROG }, - { nvc0_constbufs_validate, NVC0_NEW_CONSTBUF }, - { nvc0_validate_textures, NVC0_NEW_TEXTURES }, - { nvc0_validate_samplers, NVC0_NEW_SAMPLERS }, - { nve4_set_tex_handles, NVC0_NEW_TEXTURES | NVC0_NEW_SAMPLERS }, - { nvc0_vertex_arrays_validate, NVC0_NEW_VERTEX | NVC0_NEW_ARRAYS }, - { nvc0_validate_surfaces, NVC0_NEW_SURFACES }, - { nvc0_validate_buffers, NVC0_NEW_BUFFERS }, - { nvc0_idxbuf_validate, NVC0_NEW_IDXBUF }, - { nvc0_tfb_validate, NVC0_NEW_TFB_TARGETS | NVC0_NEW_GMTYPROG }, - { nvc0_validate_min_samples, NVC0_NEW_MIN_SAMPLES }, - { nvc0_validate_driverconst, NVC0_NEW_DRIVERCONST }, +static struct nvc0_state_validate +validate_list_3d[] = { + { nvc0_validate_fb, NVC0_NEW_3D_FRAMEBUFFER }, + { nvc0_validate_blend, NVC0_NEW_3D_BLEND }, + { nvc0_validate_zsa, NVC0_NEW_3D_ZSA }, + { nvc0_validate_sample_mask, NVC0_NEW_3D_SAMPLE_MASK }, + { nvc0_validate_rasterizer, NVC0_NEW_3D_RASTERIZER }, + { nvc0_validate_blend_colour, NVC0_NEW_3D_BLEND_COLOUR }, + { nvc0_validate_stencil_ref, NVC0_NEW_3D_STENCIL_REF }, + { nvc0_validate_stipple, NVC0_NEW_3D_STIPPLE }, + { nvc0_validate_scissor, NVC0_NEW_3D_SCISSOR | NVC0_NEW_3D_RASTERIZER }, + { nvc0_validate_viewport, NVC0_NEW_3D_VIEWPORT }, + { nvc0_validate_window_rects, NVC0_NEW_3D_WINDOW_RECTS }, + { nvc0_vertprog_validate, NVC0_NEW_3D_VERTPROG }, + { nvc0_tctlprog_validate, NVC0_NEW_3D_TCTLPROG }, + { 
nvc0_tevlprog_validate, NVC0_NEW_3D_TEVLPROG }, + { nvc0_validate_tess_state, NVC0_NEW_3D_TESSFACTOR }, + { nvc0_gmtyprog_validate, NVC0_NEW_3D_GMTYPROG }, + { nvc0_validate_min_samples, NVC0_NEW_3D_MIN_SAMPLES | + NVC0_NEW_3D_FRAGPROG | + NVC0_NEW_3D_FRAMEBUFFER }, + { nvc0_fragprog_validate, NVC0_NEW_3D_FRAGPROG | NVC0_NEW_3D_RASTERIZER }, + { nvc0_validate_fp_zsa_rast, NVC0_NEW_3D_FRAGPROG | NVC0_NEW_3D_ZSA | + NVC0_NEW_3D_RASTERIZER }, + { nvc0_validate_zsa_fb, NVC0_NEW_3D_ZSA | NVC0_NEW_3D_FRAMEBUFFER }, + { nvc0_validate_blend_fb, NVC0_NEW_3D_BLEND | NVC0_NEW_3D_FRAMEBUFFER }, + { nvc0_validate_rast_fb, NVC0_NEW_3D_RASTERIZER | NVC0_NEW_3D_FRAMEBUFFER }, + { nvc0_validate_clip, NVC0_NEW_3D_CLIP | NVC0_NEW_3D_RASTERIZER | + NVC0_NEW_3D_VERTPROG | + NVC0_NEW_3D_TEVLPROG | + NVC0_NEW_3D_GMTYPROG }, + { nvc0_constbufs_validate, NVC0_NEW_3D_CONSTBUF }, + { nvc0_validate_textures, NVC0_NEW_3D_TEXTURES }, + { nvc0_validate_samplers, NVC0_NEW_3D_SAMPLERS }, + { nve4_set_tex_handles, NVC0_NEW_3D_TEXTURES | NVC0_NEW_3D_SAMPLERS }, + { nvc0_vertex_arrays_validate, NVC0_NEW_3D_VERTEX | NVC0_NEW_3D_ARRAYS }, + { nvc0_validate_surfaces, NVC0_NEW_3D_SURFACES }, + { nvc0_validate_buffers, NVC0_NEW_3D_BUFFERS }, + { nvc0_idxbuf_validate, NVC0_NEW_3D_IDXBUF }, + { nvc0_tfb_validate, NVC0_NEW_3D_TFB_TARGETS | NVC0_NEW_3D_GMTYPROG }, + { nvc0_validate_driverconst, NVC0_NEW_3D_DRIVERCONST }, }; bool -nvc0_state_validate(struct nvc0_context *nvc0, uint32_t mask) +nvc0_state_validate(struct nvc0_context *nvc0, uint32_t mask, + struct nvc0_state_validate *validate_list, int size, + uint32_t *dirty, struct nouveau_bufctx *bufctx) { uint32_t state_mask; int ret; @@ -738,26 +822,38 @@ nvc0_state_validate(struct nvc0_context *nvc0, uint32_t mask) if (nvc0->screen->cur_ctx != nvc0) nvc0_switch_pipe_context(nvc0); - state_mask = nvc0->dirty & mask; + state_mask = *dirty & mask; if (state_mask) { - for (i = 0; i < ARRAY_SIZE(validate_list); ++i) { - struct state_validate *validate = &validate_list[i]; + for (i = 0; i < size; ++i) { + struct nvc0_state_validate *validate = &validate_list[i]; if (state_mask & validate->states) validate->func(nvc0); } - nvc0->dirty &= ~state_mask; + *dirty &= ~state_mask; - nvc0_bufctx_fence(nvc0, nvc0->bufctx_3d, false); + nvc0_bufctx_fence(nvc0, bufctx, false); } - nouveau_pushbuf_bufctx(nvc0->base.pushbuf, nvc0->bufctx_3d); + nouveau_pushbuf_bufctx(nvc0->base.pushbuf, bufctx); ret = nouveau_pushbuf_validate(nvc0->base.pushbuf); + return !ret; +} + +bool +nvc0_state_validate_3d(struct nvc0_context *nvc0, uint32_t mask) +{ + bool ret; + + ret = nvc0_state_validate(nvc0, mask, validate_list_3d, + ARRAY_SIZE(validate_list_3d), &nvc0->dirty_3d, + nvc0->bufctx_3d); + if (unlikely(nvc0->state.flushed)) { nvc0->state.flushed = false; nvc0_bufctx_fence(nvc0, nvc0->bufctx_3d, true); } - return !ret; + return ret; } diff --git a/lib/mesa/src/gallium/drivers/nouveau/nvc0/nvc0_stateobj.h b/lib/mesa/src/gallium/drivers/nouveau/nvc0/nvc0_stateobj.h index f9680f5a9..c900fcadc 100644 --- a/lib/mesa/src/gallium/drivers/nouveau/nvc0/nvc0_stateobj.h +++ b/lib/mesa/src/gallium/drivers/nouveau/nvc0/nvc0_stateobj.h @@ -61,6 +61,12 @@ struct nvc0_vertex_stateobj { struct nvc0_vertex_element element[0]; }; +struct nvc0_window_rect_stateobj { + bool inclusive; + unsigned rects; + struct pipe_scissor_state rect[PIPE_MAX_WINDOW_RECTANGLES]; +}; + struct nvc0_so_target { struct pipe_stream_output_target pipe; struct pipe_query *pq; diff --git a/lib/mesa/src/gallium/drivers/nouveau/nvc0/nvc0_surface.c 
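The validation walk is now generic over the table, the dirty word and the bufctx, which is what lets 3D and compute share it. A stripped-down sketch of the dispatch pattern, with error handling and fencing omitted:

#include <stdint.h>

struct nvc0_context;   /* opaque here */

struct validate_entry {
   void (*func)(struct nvc0_context *);
   uint32_t states;
};

static void
run_validate(struct nvc0_context *ctx, const struct validate_entry *list,
             int size, uint32_t *dirty, uint32_t mask)
{
   uint32_t state_mask = *dirty & mask;

   for (int i = 0; i < size; i++)
      if (state_mask & list[i].states)
         list[i].func(ctx);        /* each entry revalidates its own state */
   *dirty &= ~state_mask;
}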
b/lib/mesa/src/gallium/drivers/nouveau/nvc0/nvc0_surface.c index b33b6c316..0d1405871 100644 --- a/lib/mesa/src/gallium/drivers/nouveau/nvc0/nvc0_surface.c +++ b/lib/mesa/src/gallium/drivers/nouveau/nvc0/nvc0_surface.c @@ -280,7 +280,8 @@ nvc0_clear_render_target(struct pipe_context *pipe, struct pipe_surface *dst, const union pipe_color_union *color, unsigned dstx, unsigned dsty, - unsigned width, unsigned height) + unsigned width, unsigned height, + bool render_condition_enabled) { struct nvc0_context *nvc0 = nvc0_context(pipe); struct nouveau_pushbuf *push = nvc0->base.pushbuf; @@ -343,7 +344,8 @@ nvc0_clear_render_target(struct pipe_context *pipe, nvc0_resource_fence(res, NOUVEAU_BO_WR); } - IMMED_NVC0(push, NVC0_3D(COND_MODE), NVC0_3D_COND_MODE_ALWAYS); + if (!render_condition_enabled) + IMMED_NVC0(push, NVC0_3D(COND_MODE), NVC0_3D_COND_MODE_ALWAYS); BEGIN_NIC0(push, NVC0_3D(CLEAR_BUFFERS), sf->depth); for (z = 0; z < sf->depth; ++z) { @@ -351,9 +353,10 @@ nvc0_clear_render_target(struct pipe_context *pipe, (z << NVC0_3D_CLEAR_BUFFERS_LAYER__SHIFT)); } - IMMED_NVC0(push, NVC0_3D(COND_MODE), nvc0->cond_condmode); + if (!render_condition_enabled) + IMMED_NVC0(push, NVC0_3D(COND_MODE), nvc0->cond_condmode); - nvc0->dirty |= NVC0_NEW_FRAMEBUFFER; + nvc0->dirty_3d |= NVC0_NEW_3D_FRAMEBUFFER; } static void @@ -609,7 +612,7 @@ nvc0_clear_buffer(struct pipe_context *pipe, data, data_size); } - nvc0->dirty |= NVC0_NEW_FRAMEBUFFER; + nvc0->dirty_3d |= NVC0_NEW_3D_FRAMEBUFFER; } static void @@ -619,7 +622,8 @@ nvc0_clear_depth_stencil(struct pipe_context *pipe, double depth, unsigned stencil, unsigned dstx, unsigned dsty, - unsigned width, unsigned height) + unsigned width, unsigned height, + bool render_condition_enabled) { struct nvc0_context *nvc0 = nvc0_context(pipe); struct nouveau_pushbuf *push = nvc0->base.pushbuf; @@ -668,7 +672,8 @@ nvc0_clear_depth_stencil(struct pipe_context *pipe, PUSH_DATA (push, dst->u.tex.first_layer); IMMED_NVC0(push, NVC0_3D(MULTISAMPLE_MODE), mt->ms_mode); - IMMED_NVC0(push, NVC0_3D(COND_MODE), NVC0_3D_COND_MODE_ALWAYS); + if (!render_condition_enabled) + IMMED_NVC0(push, NVC0_3D(COND_MODE), NVC0_3D_COND_MODE_ALWAYS); BEGIN_NIC0(push, NVC0_3D(CLEAR_BUFFERS), sf->depth); for (z = 0; z < sf->depth; ++z) { @@ -676,9 +681,10 @@ nvc0_clear_depth_stencil(struct pipe_context *pipe, (z << NVC0_3D_CLEAR_BUFFERS_LAYER__SHIFT)); } - IMMED_NVC0(push, NVC0_3D(COND_MODE), nvc0->cond_condmode); + if (!render_condition_enabled) + IMMED_NVC0(push, NVC0_3D(COND_MODE), nvc0->cond_condmode); - nvc0->dirty |= NVC0_NEW_FRAMEBUFFER; + nvc0->dirty_3d |= NVC0_NEW_3D_FRAMEBUFFER; } void @@ -693,7 +699,7 @@ nvc0_clear(struct pipe_context *pipe, unsigned buffers, uint32_t mode = 0; /* don't need NEW_BLEND, COLOR_MASK doesn't affect CLEAR_BUFFERS */ - if (!nvc0_state_validate(nvc0, NVC0_NEW_FRAMEBUFFER)) + if (!nvc0_state_validate_3d(nvc0, NVC0_NEW_3D_FRAMEBUFFER)) return; if (buffers & PIPE_CLEAR_COLOR && fb->nr_cbufs) { @@ -782,6 +788,7 @@ struct nvc0_blitctx enum pipe_texture_target target; struct { struct pipe_framebuffer_state fb; + struct nvc0_window_rect_stateobj window_rect; struct nvc0_rasterizer_stateobj *rast; struct nvc0_program *vp; struct nvc0_program *tcp; @@ -793,7 +800,7 @@ struct nvc0_blitctx struct pipe_sampler_view *texture[2]; struct nv50_tsc_entry *sampler[2]; unsigned min_samples; - uint32_t dirty; + uint32_t dirty_3d; } saved; struct nvc0_rasterizer_stateobj rast; }; @@ -829,11 +836,11 @@ nvc0_blitter_make_vp(struct nvc0_blitter *blit) }; static const uint32_t 
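The new render_condition_enabled parameter means the COND_MODE override is only emitted when the caller wants an unconditional clear; a predicated clear now leaves the current condition in place. The guard pattern in isolation, as it recurs in the clears below:

if (!render_condition_enabled)
   IMMED_NVC0(push, NVC0_3D(COND_MODE), NVC0_3D_COND_MODE_ALWAYS);

/* ... emit CLEAR_BUFFERS for each layer ... */

if (!render_condition_enabled)
   IMMED_NVC0(push, NVC0_3D(COND_MODE), nvc0->cond_condmode);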
code_gm107[] = { - 0xfc0007e0, 0x001f8000, /* sched 0x7e0 0x7e0 0x7e0 */ + 0xfc0007e0, 0x001f8000, /* sched (st 0x0) (st 0x0) (st 0x0) */ 0x0807ff04, 0xefd8ff80, /* ld b64 $r4 a[0x80] 0x0 */ 0x0907ff00, 0xefd97f80, /* ld b96 $r0 a[0x90] 0x0 */ 0x0707ff04, 0xeff0ff80, /* st b64 a[0x70] $r4 0x0 */ - 0xfc0007e0, 0x00000000, /* sched 0x7e0 0x7e0 0x0 */ + 0xfc0007e0, 0x00000000, /* sched (st 0x0) (st 0x0) (st 0x0) */ 0x0807ff00, 0xeff17f80, /* st b96 a[0x80] $r0 0x0 */ 0x0007000f, 0xe3000000, /* exit */ }; @@ -959,10 +966,10 @@ nvc0_blit_set_src(struct nvc0_blitctx *ctx, templ.format = format; templ.u.tex.first_layer = templ.u.tex.last_layer = layer; templ.u.tex.first_level = templ.u.tex.last_level = level; - templ.swizzle_r = PIPE_SWIZZLE_RED; - templ.swizzle_g = PIPE_SWIZZLE_GREEN; - templ.swizzle_b = PIPE_SWIZZLE_BLUE; - templ.swizzle_a = PIPE_SWIZZLE_ALPHA; + templ.swizzle_r = PIPE_SWIZZLE_X; + templ.swizzle_g = PIPE_SWIZZLE_Y; + templ.swizzle_b = PIPE_SWIZZLE_Z; + templ.swizzle_a = PIPE_SWIZZLE_W; if (layer == -1) { templ.u.tex.first_layer = 0; @@ -1035,7 +1042,8 @@ nvc0_blitctx_prepare_state(struct nvc0_blitctx *blit) } static void -nvc0_blitctx_pre_blit(struct nvc0_blitctx *ctx) +nvc0_blitctx_pre_blit(struct nvc0_blitctx *ctx, + const struct pipe_blit_info *info) { struct nvc0_context *nvc0 = ctx->nvc0; struct nvc0_blitter *blitter = nvc0->screen->blitter; @@ -1043,6 +1051,8 @@ nvc0_blitctx_pre_blit(struct nvc0_blitctx *ctx) ctx->saved.fb.width = nvc0->framebuffer.width; ctx->saved.fb.height = nvc0->framebuffer.height; + ctx->saved.fb.samples = nvc0->framebuffer.samples; + ctx->saved.fb.layers = nvc0->framebuffer.layers; ctx->saved.fb.nr_cbufs = nvc0->framebuffer.nr_cbufs; ctx->saved.fb.cbufs[0] = nvc0->framebuffer.cbufs[0]; ctx->saved.fb.zsbuf = nvc0->framebuffer.zsbuf; @@ -1056,6 +1066,7 @@ nvc0_blitctx_pre_blit(struct nvc0_blitctx *ctx) ctx->saved.fp = nvc0->fragprog; ctx->saved.min_samples = nvc0->min_samples; + ctx->saved.window_rect = nvc0->window_rect; nvc0->rast = &ctx->rast; @@ -1065,6 +1076,13 @@ nvc0_blitctx_pre_blit(struct nvc0_blitctx *ctx) nvc0->gmtyprog = NULL; nvc0->fragprog = ctx->fp; + nvc0->window_rect.rects = + MIN2(info->num_window_rectangles, NVC0_MAX_WINDOW_RECTANGLES); + nvc0->window_rect.inclusive = info->window_rectangle_include; + if (nvc0->window_rect.rects) + memcpy(nvc0->window_rect.rect, info->window_rectangles, + sizeof(struct pipe_scissor_state) * nvc0->window_rect.rects); + for (s = 0; s <= 4; ++s) { ctx->saved.num_textures[s] = nvc0->num_textures[s]; ctx->saved.num_samplers[s] = nvc0->num_samplers[s]; @@ -1085,19 +1103,19 @@ nvc0_blitctx_pre_blit(struct nvc0_blitctx *ctx) nvc0->min_samples = 1; - ctx->saved.dirty = nvc0->dirty; + ctx->saved.dirty_3d = nvc0->dirty_3d; nvc0->textures_dirty[4] |= 3; nvc0->samplers_dirty[4] |= 3; - nouveau_bufctx_reset(nvc0->bufctx_3d, NVC0_BIND_FB); - nouveau_bufctx_reset(nvc0->bufctx_3d, NVC0_BIND_TEX(4, 0)); - nouveau_bufctx_reset(nvc0->bufctx_3d, NVC0_BIND_TEX(4, 1)); + nouveau_bufctx_reset(nvc0->bufctx_3d, NVC0_BIND_3D_FB); + nouveau_bufctx_reset(nvc0->bufctx_3d, NVC0_BIND_3D_TEX(4, 0)); + nouveau_bufctx_reset(nvc0->bufctx_3d, NVC0_BIND_3D_TEX(4, 1)); - nvc0->dirty = NVC0_NEW_FRAMEBUFFER | NVC0_NEW_MIN_SAMPLES | - NVC0_NEW_VERTPROG | NVC0_NEW_FRAGPROG | - NVC0_NEW_TCTLPROG | NVC0_NEW_TEVLPROG | NVC0_NEW_GMTYPROG | - NVC0_NEW_TEXTURES | NVC0_NEW_SAMPLERS; + nvc0->dirty_3d = NVC0_NEW_3D_FRAMEBUFFER | NVC0_NEW_3D_MIN_SAMPLES | + NVC0_NEW_3D_VERTPROG | NVC0_NEW_3D_FRAGPROG | + NVC0_NEW_3D_TCTLPROG | NVC0_NEW_3D_TEVLPROG | 
NVC0_NEW_3D_GMTYPROG | + NVC0_NEW_3D_TEXTURES | NVC0_NEW_3D_SAMPLERS | NVC0_NEW_3D_WINDOW_RECTS; } static void @@ -1110,6 +1128,8 @@ nvc0_blitctx_post_blit(struct nvc0_blitctx *blit) nvc0->framebuffer.width = blit->saved.fb.width; nvc0->framebuffer.height = blit->saved.fb.height; + nvc0->framebuffer.samples = blit->saved.fb.samples; + nvc0->framebuffer.layers = blit->saved.fb.layers; nvc0->framebuffer.nr_cbufs = blit->saved.fb.nr_cbufs; nvc0->framebuffer.cbufs[0] = blit->saved.fb.cbufs[0]; nvc0->framebuffer.zsbuf = blit->saved.fb.zsbuf; @@ -1123,6 +1143,7 @@ nvc0_blitctx_post_blit(struct nvc0_blitctx *blit) nvc0->fragprog = blit->saved.fp; nvc0->min_samples = blit->saved.min_samples; + nvc0->window_rect = blit->saved.window_rect; pipe_sampler_view_reference(&nvc0->textures[4][0], NULL); pipe_sampler_view_reference(&nvc0->textures[4][1], NULL); @@ -1145,20 +1166,20 @@ nvc0_blitctx_post_blit(struct nvc0_blitctx *blit) nvc0->base.pipe.render_condition(&nvc0->base.pipe, nvc0->cond_query, nvc0->cond_cond, nvc0->cond_mode); - nouveau_bufctx_reset(nvc0->bufctx_3d, NVC0_BIND_VTX_TMP); - nouveau_bufctx_reset(nvc0->bufctx_3d, NVC0_BIND_FB); - nouveau_bufctx_reset(nvc0->bufctx_3d, NVC0_BIND_TEX(4, 0)); - nouveau_bufctx_reset(nvc0->bufctx_3d, NVC0_BIND_TEX(4, 1)); + nouveau_bufctx_reset(nvc0->bufctx_3d, NVC0_BIND_3D_VTX_TMP); + nouveau_bufctx_reset(nvc0->bufctx_3d, NVC0_BIND_3D_FB); + nouveau_bufctx_reset(nvc0->bufctx_3d, NVC0_BIND_3D_TEX(4, 0)); + nouveau_bufctx_reset(nvc0->bufctx_3d, NVC0_BIND_3D_TEX(4, 1)); nouveau_scratch_done(&nvc0->base); - nvc0->dirty = blit->saved.dirty | - (NVC0_NEW_FRAMEBUFFER | NVC0_NEW_SCISSOR | NVC0_NEW_SAMPLE_MASK | - NVC0_NEW_RASTERIZER | NVC0_NEW_ZSA | NVC0_NEW_BLEND | - NVC0_NEW_VIEWPORT | - NVC0_NEW_TEXTURES | NVC0_NEW_SAMPLERS | - NVC0_NEW_VERTPROG | NVC0_NEW_FRAGPROG | - NVC0_NEW_TCTLPROG | NVC0_NEW_TEVLPROG | NVC0_NEW_GMTYPROG | - NVC0_NEW_TFB_TARGETS | NVC0_NEW_VERTEX | NVC0_NEW_ARRAYS); + nvc0->dirty_3d = blit->saved.dirty_3d | + (NVC0_NEW_3D_FRAMEBUFFER | NVC0_NEW_3D_SCISSOR | NVC0_NEW_3D_SAMPLE_MASK | + NVC0_NEW_3D_RASTERIZER | NVC0_NEW_3D_ZSA | NVC0_NEW_3D_BLEND | + NVC0_NEW_3D_VIEWPORT | NVC0_NEW_3D_WINDOW_RECTS | + NVC0_NEW_3D_TEXTURES | NVC0_NEW_3D_SAMPLERS | + NVC0_NEW_3D_VERTPROG | NVC0_NEW_3D_FRAGPROG | + NVC0_NEW_3D_TCTLPROG | NVC0_NEW_3D_TEVLPROG | NVC0_NEW_3D_GMTYPROG | + NVC0_NEW_3D_TFB_TARGETS | NVC0_NEW_3D_VERTEX | NVC0_NEW_3D_ARRAYS); nvc0->scissors_dirty |= 1; nvc0->viewports_dirty |= 1; @@ -1187,7 +1208,7 @@ nvc0_blit_3d(struct nvc0_context *nvc0, const struct pipe_blit_info *info) blit->render_condition_enable = info->render_condition_enable; nvc0_blit_select_fp(blit, info); - nvc0_blitctx_pre_blit(blit); + nvc0_blitctx_pre_blit(blit, info); nvc0_blit_set_dst(blit, dst, info->dst.level, -1, info->dst.format); nvc0_blit_set_src(blit, src, info->src.level, -1, info->src.format, @@ -1195,7 +1216,7 @@ nvc0_blit_3d(struct nvc0_context *nvc0, const struct pipe_blit_info *info) nvc0_blitctx_prepare_state(blit); - nvc0_state_validate(nvc0, ~0); + nvc0_state_validate_3d(nvc0, ~0); x_range = (float)info->src.box.width / (float)info->dst.box.width; y_range = (float)info->src.box.height / (float)info->dst.box.height; @@ -1267,7 +1288,8 @@ nvc0_blit_3d(struct nvc0_context *nvc0, const struct pipe_blit_info *info) return; } - BCTX_REFN_bo(nvc0->bufctx_3d, VTX_TMP, NOUVEAU_BO_GART | NOUVEAU_BO_RD, vtxbuf_bo); + BCTX_REFN_bo(nvc0->bufctx_3d, 3D_VTX_TMP, + NOUVEAU_BO_GART | NOUVEAU_BO_RD, vtxbuf_bo); nouveau_pushbuf_validate(push); BEGIN_NVC0(push, 
NVC0_3D(VERTEX_ARRAY_FETCH(0)), 4); @@ -1294,6 +1316,8 @@ nvc0_blit_3d(struct nvc0_context *nvc0, const struct pipe_blit_info *info) NVC0_3D_VERTEX_ATTRIB_FORMAT_SIZE_32 | NVC0_3D_VERTEX_ATTRIB_FORMAT_CONST); } + for (i = 1; i < n; ++i) + IMMED_NVC0(push, NVC0_3D(VERTEX_ARRAY_FETCH(i)), 0); if (nvc0->state.instance_elts) { nvc0->state.instance_elts = 0; BEGIN_NVC0(push, NVC0_3D(MACRO_VERTEX_ARRAY_PER_INSTANCE), 2); @@ -1599,6 +1623,9 @@ nvc0_blit(struct pipe_context *pipe, const struct pipe_blit_info *info) info->src.box.height != -info->dst.box.height)) eng3d = true; + if (info->num_window_rectangles > 0 || info->window_rectangle_include) + eng3d = true; + if (nvc0->screen->num_occlusion_queries_active) IMMED_NVC0(push, NVC0_3D(SAMPLECNT_ENABLE), 0); @@ -1654,6 +1681,7 @@ nvc0_blitter_destroy(struct nvc0_screen *screen) } } + pipe_mutex_destroy(blitter->mutex); FREE(blitter); } diff --git a/lib/mesa/src/gallium/drivers/nouveau/nvc0/nvc0_tex.c b/lib/mesa/src/gallium/drivers/nouveau/nvc0/nvc0_tex.c index df10a7421..e57391e9a 100644 --- a/lib/mesa/src/gallium/drivers/nouveau/nvc0/nvc0_tex.c +++ b/lib/mesa/src/gallium/drivers/nouveau/nvc0/nvc0_tex.c @@ -36,13 +36,13 @@ static inline uint32_t nv50_tic_swizzle(const struct nvc0_format *fmt, unsigned swz, bool tex_int) { switch (swz) { - case PIPE_SWIZZLE_RED : return fmt->tic.src_x; - case PIPE_SWIZZLE_GREEN: return fmt->tic.src_y; - case PIPE_SWIZZLE_BLUE : return fmt->tic.src_z; - case PIPE_SWIZZLE_ALPHA: return fmt->tic.src_w; - case PIPE_SWIZZLE_ONE: + case PIPE_SWIZZLE_X : return fmt->tic.src_x; + case PIPE_SWIZZLE_Y: return fmt->tic.src_y; + case PIPE_SWIZZLE_Z : return fmt->tic.src_z; + case PIPE_SWIZZLE_W: return fmt->tic.src_w; + case PIPE_SWIZZLE_1: return tex_int ? G80_TIC_SOURCE_ONE_INT : G80_TIC_SOURCE_ONE_FLOAT; - case PIPE_SWIZZLE_ZERO: + case PIPE_SWIZZLE_0: default: return G80_TIC_SOURCE_ZERO; } @@ -132,9 +132,9 @@ gm107_create_texture_view(struct pipe_context *pipe, if (unlikely(!nouveau_bo_memtype(nv04_resource(texture)->bo))) { if (texture->target == PIPE_BUFFER) { assert(!(tic[5] & GM107_TIC2_5_NORMALIZED_COORDS)); - width = view->pipe.u.buf.last_element - view->pipe.u.buf.first_element; + width = view->pipe.u.buf.size / (desc->block.bits / 8) - 1; address += - view->pipe.u.buf.first_element * desc->block.bits / 8; + view->pipe.u.buf.offset; tic[2] = GM107_TIC2_2_HEADER_VERSION_ONE_D_BUFFER; tic[3] |= width >> 16; tic[4] |= GM107_TIC2_4_TEXTURE_TYPE_ONE_D_BUFFER; @@ -236,6 +236,42 @@ gm107_create_texture_view(struct pipe_context *pipe, return &view->pipe; } +struct pipe_sampler_view * +gm107_create_texture_view_from_image(struct pipe_context *pipe, + const struct pipe_image_view *view) +{ + struct nv04_resource *res = nv04_resource(view->resource); + struct pipe_sampler_view templ = {}; + enum pipe_texture_target target; + uint32_t flags = 0; + + if (!res) + return NULL; + target = res->base.target; + + if (target == PIPE_TEXTURE_CUBE || target == PIPE_TEXTURE_CUBE_ARRAY) + target = PIPE_TEXTURE_2D_ARRAY; + + templ.format = view->format; + templ.swizzle_r = PIPE_SWIZZLE_X; + templ.swizzle_g = PIPE_SWIZZLE_Y; + templ.swizzle_b = PIPE_SWIZZLE_Z; + templ.swizzle_a = PIPE_SWIZZLE_W; + + if (target == PIPE_BUFFER) { + templ.u.buf.offset = view->u.buf.offset; + templ.u.buf.size = view->u.buf.size; + } else { + templ.u.tex.first_layer = view->u.tex.first_layer; + templ.u.tex.last_layer = view->u.tex.last_layer; + templ.u.tex.first_level = templ.u.tex.last_level = view->u.tex.level; + } + + flags = NV50_TEXVIEW_SCALED_COORDS; + + 
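/* The image view is re-described here as an ordinary sampler view: identity
 * swizzle, cube targets flattened to 2D arrays, and the buffer offset/size or
 * layer/level range carried over, so the regular GM107 texture-view path can
 * service shader images as well. */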
return nvc0_create_texture_view(pipe, &res->base, &templ, flags, target); +} + static struct pipe_sampler_view * gf100_create_texture_view(struct pipe_context *pipe, struct pipe_resource *texture, @@ -250,6 +286,7 @@ gf100_create_texture_view(struct pipe_context *pipe, uint32_t swz[4]; uint32_t width, height; uint32_t depth; + uint32_t tex_fmt; struct nv50_tic_entry *view; struct nv50_miptree *mt; bool tex_int; @@ -275,12 +312,13 @@ gf100_create_texture_view(struct pipe_context *pipe, fmt = &nvc0_format_table[view->pipe.format]; tex_int = util_format_is_pure_integer(view->pipe.format); + tex_fmt = fmt->tic.format & 0x3f; swz[0] = nv50_tic_swizzle(fmt, view->pipe.swizzle_r, tex_int); swz[1] = nv50_tic_swizzle(fmt, view->pipe.swizzle_g, tex_int); swz[2] = nv50_tic_swizzle(fmt, view->pipe.swizzle_b, tex_int); swz[3] = nv50_tic_swizzle(fmt, view->pipe.swizzle_a, tex_int); - tic[0] = (fmt->tic.format << G80_TIC_0_COMPONENTS_SIZES__SHIFT) | + tic[0] = (tex_fmt << G80_TIC_0_COMPONENTS_SIZES__SHIFT) | (fmt->tic.type_r << G80_TIC_0_R_DATA_TYPE__SHIFT) | (fmt->tic.type_g << G80_TIC_0_G_DATA_TYPE__SHIFT) | (fmt->tic.type_b << G80_TIC_0_B_DATA_TYPE__SHIFT) | @@ -288,7 +326,8 @@ gf100_create_texture_view(struct pipe_context *pipe, (swz[0] << G80_TIC_0_X_SOURCE__SHIFT) | (swz[1] << G80_TIC_0_Y_SOURCE__SHIFT) | (swz[2] << G80_TIC_0_Z_SOURCE__SHIFT) | - (swz[3] << G80_TIC_0_W_SOURCE__SHIFT); + (swz[3] << G80_TIC_0_W_SOURCE__SHIFT) | + ((fmt->tic.format & 0x40) << (GK20A_TIC_0_USE_COMPONENT_SIZES_EXTENDED__SHIFT - 6)); address = mt->base.address; @@ -305,11 +344,11 @@ gf100_create_texture_view(struct pipe_context *pipe, if (texture->target == PIPE_BUFFER) { assert(!(tic[2] & G80_TIC_2_NORMALIZED_COORDS)); address += - view->pipe.u.buf.first_element * desc->block.bits / 8; + view->pipe.u.buf.offset; tic[2] |= G80_TIC_2_LAYOUT_PITCH | G80_TIC_2_TEXTURE_TYPE_ONE_D_BUFFER; tic[3] = 0; tic[4] = /* width */ - view->pipe.u.buf.last_element - view->pipe.u.buf.first_element + 1; + view->pipe.u.buf.size / (desc->block.bits / 8); tic[5] = 0; } else { /* must be 2D texture without mip maps */ @@ -410,15 +449,14 @@ nvc0_create_texture_view(struct pipe_context *pipe, return gf100_create_texture_view(pipe, texture, templ, flags, target); } -static void +void nvc0_update_tic(struct nvc0_context *nvc0, struct nv50_tic_entry *tic, struct nv04_resource *res) { uint64_t address = res->address; if (res->base.target != PIPE_BUFFER) return; - address += tic->pipe.u.buf.first_element * - util_format_get_blocksize(tic->pipe.format); + address += tic->pipe.u.buf.offset; if (tic->tic[1] == (uint32_t)address && (tic->tic[2] & 0xff) == address >> 32) return; @@ -435,7 +473,6 @@ nvc0_validate_tic(struct nvc0_context *nvc0, int s) { uint32_t commands[32]; struct nouveau_pushbuf *push = nvc0->base.pushbuf; - struct nouveau_bo *txc = nvc0->screen->txc; unsigned i; unsigned n = 0; bool need_flush = false; @@ -456,23 +493,14 @@ nvc0_validate_tic(struct nvc0_context *nvc0, int s) if (tic->id < 0) { tic->id = nvc0_screen_tic_alloc(nvc0->screen, tic); - PUSH_SPACE(push, 17); - BEGIN_NVC0(push, NVC0_M2MF(OFFSET_OUT_HIGH), 2); - PUSH_DATAh(push, txc->offset + (tic->id * 32)); - PUSH_DATA (push, txc->offset + (tic->id * 32)); - BEGIN_NVC0(push, NVC0_M2MF(LINE_LENGTH_IN), 2); - PUSH_DATA (push, 32); - PUSH_DATA (push, 1); - BEGIN_NVC0(push, NVC0_M2MF(EXEC), 1); - PUSH_DATA (push, 0x100111); - BEGIN_NIC0(push, NVC0_M2MF(DATA), 8); - PUSH_DATAp(push, &tic->tic[0], 8); - + nvc0_m2mf_push_linear(&nvc0->base, nvc0->screen->txc, tic->id * 32, + 
NV_VRAM_DOMAIN(&nvc0->screen->base), 32, + tic->tic); need_flush = true; } else if (res->status & NOUVEAU_BUFFER_STATUS_GPU_WRITING) { if (unlikely(s == 5)) - BEGIN_NVC0(push, NVC0_COMPUTE(TEX_CACHE_CTL), 1); + BEGIN_NVC0(push, NVC0_CP(TEX_CACHE_CTL), 1); else BEGIN_NVC0(push, NVC0_3D(TEX_CACHE_CTL), 1); PUSH_DATA (push, (tic->id << 4) | 1); @@ -490,7 +518,7 @@ nvc0_validate_tic(struct nvc0_context *nvc0, int s) if (unlikely(s == 5)) BCTX_REFN(nvc0->bufctx_cp, CP_TEX(i), res, RD); else - BCTX_REFN(nvc0->bufctx_3d, TEX(s, i), res, RD); + BCTX_REFN(nvc0->bufctx_3d, 3D_TEX(s, i), res, RD); } for (; i < nvc0->state.num_textures[s]; ++i) commands[n++] = (i << 1) | 0; @@ -499,7 +527,7 @@ nvc0_validate_tic(struct nvc0_context *nvc0, int s) if (n) { if (unlikely(s == 5)) - BEGIN_NIC0(push, NVC0_COMPUTE(BIND_TIC), n); + BEGIN_NIC0(push, NVC0_CP(BIND_TIC), n); else BEGIN_NIC0(push, NVC0_3D(BIND_TIC(s)), n); PUSH_DATAp(push, commands, n); @@ -512,7 +540,6 @@ nvc0_validate_tic(struct nvc0_context *nvc0, int s) static bool nve4_validate_tic(struct nvc0_context *nvc0, unsigned s) { - struct nouveau_bo *txc = nvc0->screen->txc; struct nouveau_pushbuf *push = nvc0->base.pushbuf; unsigned i; bool need_flush = false; @@ -532,17 +559,9 @@ nve4_validate_tic(struct nvc0_context *nvc0, unsigned s) if (tic->id < 0) { tic->id = nvc0_screen_tic_alloc(nvc0->screen, tic); - PUSH_SPACE(push, 16); - BEGIN_NVC0(push, NVE4_P2MF(UPLOAD_DST_ADDRESS_HIGH), 2); - PUSH_DATAh(push, txc->offset + (tic->id * 32)); - PUSH_DATA (push, txc->offset + (tic->id * 32)); - BEGIN_NVC0(push, NVE4_P2MF(UPLOAD_LINE_LENGTH_IN), 2); - PUSH_DATA (push, 32); - PUSH_DATA (push, 1); - BEGIN_1IC0(push, NVE4_P2MF(UPLOAD_EXEC), 9); - PUSH_DATA (push, 0x1001); - PUSH_DATAp(push, &tic->tic[0], 8); - + nve4_p2mf_push_linear(&nvc0->base, nvc0->screen->txc, tic->id * 32, + NV_VRAM_DOMAIN(&nvc0->screen->base), 32, + tic->tic); need_flush = true; } else if (res->status & NOUVEAU_BUFFER_STATUS_GPU_WRITING) { @@ -557,7 +576,7 @@ nve4_validate_tic(struct nvc0_context *nvc0, unsigned s) nvc0->tex_handles[s][i] &= ~NVE4_TIC_ENTRY_INVALID; nvc0->tex_handles[s][i] |= tic->id; if (dirty) - BCTX_REFN(nvc0->bufctx_3d, TEX(s, i), res, RD); + BCTX_REFN(nvc0->bufctx_3d, 3D_TEX(s, i), res, RD); } for (; i < nvc0->state.num_textures[s]; ++i) { nvc0->tex_handles[s][i] |= NVE4_TIC_ENTRY_INVALID; @@ -585,6 +604,12 @@ void nvc0_validate_textures(struct nvc0_context *nvc0) BEGIN_NVC0(nvc0->base.pushbuf, NVC0_3D(TIC_FLUSH), 1); PUSH_DATA (nvc0->base.pushbuf, 0); } + + /* Invalidate all CP textures because they are aliased. 
*/ + for (int i = 0; i < nvc0->num_textures[5]; i++) + nouveau_bufctx_reset(nvc0->bufctx_cp, NVC0_BIND_CP_TEX(i)); + nvc0->textures_dirty[5] = ~0; + nvc0->dirty_cp |= NVC0_NEW_CP_TEXTURES; } bool @@ -625,7 +650,7 @@ nvc0_validate_tsc(struct nvc0_context *nvc0, int s) if (n) { if (unlikely(s == 5)) - BEGIN_NIC0(push, NVC0_COMPUTE(BIND_TSC), n); + BEGIN_NIC0(push, NVC0_CP(BIND_TSC), n); else BEGIN_NIC0(push, NVC0_3D(BIND_TSC(s)), n); PUSH_DATAp(push, commands, n); @@ -638,8 +663,6 @@ nvc0_validate_tsc(struct nvc0_context *nvc0, int s) bool nve4_validate_tsc(struct nvc0_context *nvc0, int s) { - struct nouveau_bo *txc = nvc0->screen->txc; - struct nouveau_pushbuf *push = nvc0->base.pushbuf; unsigned i; bool need_flush = false; @@ -653,17 +676,10 @@ nve4_validate_tsc(struct nvc0_context *nvc0, int s) if (tsc->id < 0) { tsc->id = nvc0_screen_tsc_alloc(nvc0->screen, tsc); - PUSH_SPACE(push, 16); - BEGIN_NVC0(push, NVE4_P2MF(UPLOAD_DST_ADDRESS_HIGH), 2); - PUSH_DATAh(push, txc->offset + 65536 + (tsc->id * 32)); - PUSH_DATA (push, txc->offset + 65536 + (tsc->id * 32)); - BEGIN_NVC0(push, NVE4_P2MF(UPLOAD_LINE_LENGTH_IN), 2); - PUSH_DATA (push, 32); - PUSH_DATA (push, 1); - BEGIN_1IC0(push, NVE4_P2MF(UPLOAD_EXEC), 9); - PUSH_DATA (push, 0x1001); - PUSH_DATAp(push, &tsc->tsc[0], 8); - + nve4_p2mf_push_linear(&nvc0->base, nvc0->screen->txc, + 65536 + tsc->id * 32, + NV_VRAM_DOMAIN(&nvc0->screen->base), + 32, tsc->tsc); need_flush = true; } nvc0->screen->tsc.lock[tsc->id / 32] |= 1 << (tsc->id % 32); @@ -697,6 +713,10 @@ void nvc0_validate_samplers(struct nvc0_context *nvc0) BEGIN_NVC0(nvc0->base.pushbuf, NVC0_3D(TSC_FLUSH), 1); PUSH_DATA (nvc0->base.pushbuf, 0); } + + /* Invalidate all CP samplers because they are aliased. */ + nvc0->samplers_dirty[5] = ~0; + nvc0->dirty_cp |= NVC0_NEW_CP_SAMPLERS; } /* Upload the "diagonal" entries for the possible texture sources ($t == $s). 
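The hunks above replace the open-coded M2MF/P2MF pushes with the nvc0_m2mf_push_linear()/nve4_p2mf_push_linear() helpers. A minimal sketch of the destination addressing they rely on, taken from the calls in this diff; the two inline functions below are illustrative only, not part of the patch:

#define TIC_ENTRY_SIZE   32      /* bytes per texture image control entry */
#define TSC_TABLE_OFFSET 65536   /* TSC entries follow the TIC table in txc */

/* Illustrative: byte offset of TIC entry "id" inside screen->txc,
 * matching "tic->id * 32" above. */
static inline unsigned
tic_upload_offset(int id)
{
   return id * TIC_ENTRY_SIZE;
}

/* Illustrative: byte offset of TSC entry "id", matching
 * "65536 + tsc->id * 32" above. */
static inline unsigned
tsc_upload_offset(int id)
{
   return TSC_TABLE_OFFSET + id * TIC_ENTRY_SIZE;
}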
@@ -707,21 +727,20 @@ void nve4_set_tex_handles(struct nvc0_context *nvc0) { struct nouveau_pushbuf *push = nvc0->base.pushbuf; - uint64_t address; + struct nvc0_screen *screen = nvc0->screen; unsigned s; if (nvc0->screen->base.class_3d < NVE4_3D_CLASS) return; - address = nvc0->screen->uniform_bo->offset + (6 << 16); - for (s = 0; s < 5; ++s, address += (1 << 10)) { + for (s = 0; s < 5; ++s) { uint32_t dirty = nvc0->textures_dirty[s] | nvc0->samplers_dirty[s]; if (!dirty) continue; BEGIN_NVC0(push, NVC0_3D(CB_SIZE), 3); - PUSH_DATA (push, 1024); - PUSH_DATAh(push, address); - PUSH_DATA (push, address); + PUSH_DATA (push, NVC0_CB_AUX_SIZE); + PUSH_DATAh(push, screen->uniform_bo->offset + NVC0_CB_AUX_INFO(s)); + PUSH_DATA (push, screen->uniform_bo->offset + NVC0_CB_AUX_INFO(s)); do { int i = ffs(dirty) - 1; dirty &= ~(1 << i); @@ -741,23 +760,72 @@ static const uint8_t nve4_su_format_map[PIPE_FORMAT_COUNT]; static const uint16_t nve4_su_format_aux_map[PIPE_FORMAT_COUNT]; static const uint16_t nve4_suldp_lib_offset[PIPE_FORMAT_COUNT]; +static void +nvc0_get_surface_dims(struct pipe_image_view *view, int *width, int *height, + int *depth) +{ + struct nv04_resource *res = nv04_resource(view->resource); + int level; + + *width = *height = *depth = 1; + if (res->base.target == PIPE_BUFFER) { + *width = view->u.buf.size / util_format_get_blocksize(view->format); + return; + } + + level = view->u.tex.level; + *width = u_minify(view->resource->width0, level); + *height = u_minify(view->resource->height0, level); + *depth = u_minify(view->resource->depth0, level); + + switch (res->base.target) { + case PIPE_TEXTURE_1D_ARRAY: + case PIPE_TEXTURE_2D_ARRAY: + case PIPE_TEXTURE_CUBE: + case PIPE_TEXTURE_CUBE_ARRAY: + *depth = view->u.tex.last_layer - view->u.tex.first_layer + 1; + break; + case PIPE_TEXTURE_1D: + case PIPE_TEXTURE_2D: + case PIPE_TEXTURE_RECT: + case PIPE_TEXTURE_3D: + break; + default: + assert(!"unexpected texture target"); + break; + } +} + +void +nvc0_mark_image_range_valid(const struct pipe_image_view *view) +{ + struct nv04_resource *res = (struct nv04_resource *)view->resource; + + assert(view->resource->target == PIPE_BUFFER); + + util_range_add(&res->valid_buffer_range, + view->u.buf.offset, + view->u.buf.offset + view->u.buf.size); +} + void nve4_set_surface_info(struct nouveau_pushbuf *push, - struct pipe_surface *psf, - struct nvc0_screen *screen) + struct pipe_image_view *view, + struct nvc0_context *nvc0) { - struct nv50_surface *sf = nv50_surface(psf); + struct nvc0_screen *screen = nvc0->screen; struct nv04_resource *res; uint64_t address; uint32_t *const info = push->cur; + int width, height, depth; uint8_t log2cpp; - if (psf && !nve4_su_format_map[psf->format]) + if (view && !nve4_su_format_map[view->format]) NOUVEAU_ERR("unsupported surface format, try is_format_supported() !\n"); push->cur += 16; - if (!psf || !nve4_su_format_map[psf->format]) { + if (!view || !nve4_su_format_map[view->format]) { memset(info, 0, 16 * sizeof(*info)); info[0] = 0xbadf0000; @@ -766,13 +834,16 @@ nve4_set_surface_info(struct nouveau_pushbuf *push, screen->lib_code->start; return; } - res = nv04_resource(sf->base.texture); + res = nv04_resource(view->resource); - address = res->address + sf->offset; + address = res->address; - info[8] = sf->width; - info[9] = sf->height; - info[10] = sf->depth; + /* get surface dimensions based on the target. 
*/ + nvc0_get_surface_dims(view, &width, &height, &depth); + + info[8] = width; + info[9] = height; + info[10] = depth; switch (res->base.target) { case PIPE_TEXTURE_1D_ARRAY: info[11] = 1; @@ -793,17 +864,19 @@ nve4_set_surface_info(struct nouveau_pushbuf *push, info[11] = 0; break; } - log2cpp = (0xf000 & nve4_su_format_aux_map[sf->base.format]) >> 12; + log2cpp = (0xf000 & nve4_su_format_aux_map[view->format]) >> 12; - info[12] = nve4_suldp_lib_offset[sf->base.format] + screen->lib_code->start; + /* Stick the blockwidth (ie. number of bytes per pixel) to check if the + * format doesn't mismatch. */ + info[12] = util_format_get_blocksize(view->format); /* limit in bytes for raw access */ - info[13] = (0x06 << 22) | ((sf->width << log2cpp) - 1); + info[13] = (0x06 << 22) | ((width << log2cpp) - 1); - info[1] = nve4_su_format_map[sf->base.format]; + info[1] = nve4_su_format_map[view->format]; #if 0 - switch (util_format_get_blocksizebits(sf->base.format)) { + switch (util_format_get_blocksizebits(view->format)) { case 16: info[1] |= 1 << 16; break; case 32: info[1] |= 2 << 16; break; case 64: info[1] |= 3 << 16; break; @@ -814,13 +887,15 @@ nve4_set_surface_info(struct nouveau_pushbuf *push, #else info[1] |= log2cpp << 16; info[1] |= 0x4000; - info[1] |= (0x0f00 & nve4_su_format_aux_map[sf->base.format]); + info[1] |= (0x0f00 & nve4_su_format_aux_map[view->format]); #endif if (res->base.target == PIPE_BUFFER) { + address += view->u.buf.offset; + info[0] = address >> 8; - info[2] = sf->width - 1; - info[2] |= (0xff & nve4_su_format_aux_map[sf->base.format]) << 22; + info[2] = width - 1; + info[2] |= (0xff & nve4_su_format_aux_map[view->format]) << 22; info[3] = 0; info[4] = 0; info[5] = 0; @@ -830,28 +905,34 @@ nve4_set_surface_info(struct nouveau_pushbuf *push, info[15] = 0; } else { struct nv50_miptree *mt = nv50_miptree(&res->base); - struct nv50_miptree_level *lvl = &mt->level[sf->base.u.tex.level]; - const unsigned z = sf->base.u.tex.first_layer; + struct nv50_miptree_level *lvl = &mt->level[view->u.tex.level]; + const unsigned z = view->u.tex.first_layer; if (z) { if (mt->layout_3d) { - address += nvc0_mt_zslice_offset(mt, psf->u.tex.level, z); + address += nvc0_mt_zslice_offset(mt, view->u.tex.level, z); /* doesn't work if z passes z-tile boundary */ - assert(sf->depth == 1); + if (depth > 1) { + pipe_debug_message(&nvc0->base.debug, CONFORMANCE, + "3D images are not really supported!"); + debug_printf("3D images are not really supported!\n"); + } } else { address += mt->layer_stride * z; } } + address += lvl->offset; + info[0] = address >> 8; - info[2] = sf->width - 1; + info[2] = (width << mt->ms_x) - 1; /* NOTE: this is really important: */ - info[2] |= (0xff & nve4_su_format_aux_map[sf->base.format]) << 22; + info[2] |= (0xff & nve4_su_format_aux_map[view->format]) << 22; info[3] = (0x88 << 24) | (lvl->pitch / 64); - info[4] = sf->height - 1; + info[4] = (height << mt->ms_y) - 1; info[4] |= (lvl->tile_mode & 0x0f0) << 25; info[4] |= NVC0_TILE_SHIFT_Y(lvl->tile_mode) << 22; info[5] = mt->layer_stride >> 8; - info[6] = sf->depth - 1; + info[6] = depth - 1; info[6] |= (lvl->tile_mode & 0xf00) << 21; info[6] |= NVC0_TILE_SHIFT_Z(lvl->tile_mode) << 22; info[7] = 0; @@ -861,15 +942,244 @@ nve4_set_surface_info(struct nouveau_pushbuf *push, } static inline void +nvc0_set_surface_info(struct nouveau_pushbuf *push, + struct pipe_image_view *view, uint64_t address, + int width, int height, int depth) +{ + struct nv04_resource *res; + uint32_t *const info = push->cur; + + push->cur += 16; + 
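/* Layout of the 16-dword record filled in below (one per image slot in the
 * driver constant buffer): word 0 holds address >> 8, words 2/4/6 the
 * width/height/depth used for addressing, word 5 the layer stride >> 8,
 * words 8-10 the dimensions reported by imageSize(), word 12 the
 * bytes-per-pixel, and words 14/15 the MS sample shifts. */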
+ /* Make sure to always initialize the surface information area because it's + * used to check if the given image is bound or not. */ + memset(info, 0, 16 * sizeof(*info)); + + if (!view || !view->resource) + return; + res = nv04_resource(view->resource); + + /* Stick the image dimensions for the imageSize() builtin. */ + info[8] = width; + info[9] = height; + info[10] = depth; + + /* Stick the blockwidth (ie. number of bytes per pixel) to calculate pixel + * offset and to check if the format doesn't mismatch. */ + info[12] = util_format_get_blocksize(view->format); + + if (res->base.target == PIPE_BUFFER) { + info[0] = address >> 8; + info[2] = width; + } else { + struct nv50_miptree *mt = nv50_miptree(&res->base); + + info[0] = address >> 8; + info[2] = width; + info[4] = height; + info[5] = mt->layer_stride >> 8; + info[6] = depth; + info[14] = mt->ms_x; + info[15] = mt->ms_y; + } +} + +void +nvc0_validate_suf(struct nvc0_context *nvc0, int s) +{ + struct nouveau_pushbuf *push = nvc0->base.pushbuf; + struct nvc0_screen *screen = nvc0->screen; + + for (int i = 0; i < NVC0_MAX_IMAGES; ++i) { + struct pipe_image_view *view = &nvc0->images[s][i]; + int width, height, depth; + uint64_t address = 0; + + if (s == 5) + BEGIN_NVC0(push, NVC0_CP(IMAGE(i)), 6); + else + BEGIN_NVC0(push, NVC0_3D(IMAGE(i)), 6); + + if (view->resource) { + struct nv04_resource *res = nv04_resource(view->resource); + unsigned rt = nvc0_format_table[view->format].rt; + + if (util_format_is_depth_or_stencil(view->format)) + rt = rt << 12; + else + rt = (rt << 4) | (0x14 << 12); + + /* get surface dimensions based on the target. */ + nvc0_get_surface_dims(view, &width, &height, &depth); + + address = res->address; + if (res->base.target == PIPE_BUFFER) { + unsigned blocksize = util_format_get_blocksize(view->format); + + address += view->u.buf.offset; + assert(!(address & 0xff)); + + if (view->access & PIPE_IMAGE_ACCESS_WRITE) + nvc0_mark_image_range_valid(view); + + PUSH_DATAh(push, address); + PUSH_DATA (push, address); + PUSH_DATA (push, align(width * blocksize, 0x100)); + PUSH_DATA (push, NVC0_3D_IMAGE_HEIGHT_LINEAR | 1); + PUSH_DATA (push, rt); + PUSH_DATA (push, 0); + } else { + struct nv50_miptree *mt = nv50_miptree(view->resource); + struct nv50_miptree_level *lvl = &mt->level[view->u.tex.level]; + const unsigned z = view->u.tex.first_layer; + + if (mt->layout_3d) { + address += nvc0_mt_zslice_offset(mt, view->u.tex.level, z); + if (depth >= 1) { + pipe_debug_message(&nvc0->base.debug, CONFORMANCE, + "3D images are not supported!"); + debug_printf("3D images are not supported!\n"); + } + } else { + address += mt->layer_stride * z; + } + address += lvl->offset; + + PUSH_DATAh(push, address); + PUSH_DATA (push, address); + PUSH_DATA (push, width << mt->ms_x); + PUSH_DATA (push, height << mt->ms_y); + PUSH_DATA (push, rt); + PUSH_DATA (push, lvl->tile_mode & 0xff); /* mask out z-tiling */ + } + + if (s == 5) + BCTX_REFN(nvc0->bufctx_cp, CP_SUF, res, RDWR); + else + BCTX_REFN(nvc0->bufctx_3d, 3D_SUF, res, RDWR); + } else { + PUSH_DATA(push, 0); + PUSH_DATA(push, 0); + PUSH_DATA(push, 0); + PUSH_DATA(push, 0); + PUSH_DATA(push, 0x14000); + PUSH_DATA(push, 0); + } + + /* stick surface information into the driver constant buffer */ + if (s == 5) + BEGIN_NVC0(push, NVC0_CP(CB_SIZE), 3); + else + BEGIN_NVC0(push, NVC0_3D(CB_SIZE), 3); + PUSH_DATA (push, NVC0_CB_AUX_SIZE); + PUSH_DATAh(push, screen->uniform_bo->offset + NVC0_CB_AUX_INFO(s)); + PUSH_DATA (push, screen->uniform_bo->offset + NVC0_CB_AUX_INFO(s)); + if (s == 
5) + BEGIN_1IC0(push, NVC0_CP(CB_POS), 1 + 16); + else + BEGIN_1IC0(push, NVC0_3D(CB_POS), 1 + 16); + PUSH_DATA (push, NVC0_CB_AUX_SU_INFO(i)); + + nvc0_set_surface_info(push, view, address, width, height, depth); + } +} + +static inline void nvc0_update_surface_bindings(struct nvc0_context *nvc0) { - /* TODO */ + nvc0_validate_suf(nvc0, 4); + + /* Invalidate all COMPUTE images because they are aliased with FRAGMENT. */ + nouveau_bufctx_reset(nvc0->bufctx_cp, NVC0_BIND_CP_SUF); + nvc0->dirty_cp |= NVC0_NEW_CP_SURFACES; + nvc0->images_dirty[5] |= nvc0->images_valid[5]; +} + +static void +gm107_validate_surfaces(struct nvc0_context *nvc0, + struct pipe_image_view *view, int stage, int slot) +{ + struct nv04_resource *res = nv04_resource(view->resource); + struct nouveau_pushbuf *push = nvc0->base.pushbuf; + struct nvc0_screen *screen = nvc0->screen; + struct nv50_tic_entry *tic; + + tic = nv50_tic_entry(nvc0->images_tic[stage][slot]); + + res = nv04_resource(tic->pipe.texture); + nvc0_update_tic(nvc0, tic, res); + + if (tic->id < 0) { + tic->id = nvc0_screen_tic_alloc(nvc0->screen, tic); + + /* upload the texture view */ + nve4_p2mf_push_linear(&nvc0->base, nvc0->screen->txc, tic->id * 32, + NV_VRAM_DOMAIN(&nvc0->screen->base), 32, tic->tic); + + BEGIN_NVC0(push, NVC0_3D(TIC_FLUSH), 1); + PUSH_DATA (push, 0); + } else + if (res->status & NOUVEAU_BUFFER_STATUS_GPU_WRITING) { + BEGIN_NVC0(push, NVC0_3D(TEX_CACHE_CTL), 1); + PUSH_DATA (push, (tic->id << 4) | 1); + } + nvc0->screen->tic.lock[tic->id / 32] |= 1 << (tic->id % 32); + + res->status &= ~NOUVEAU_BUFFER_STATUS_GPU_WRITING; + res->status |= NOUVEAU_BUFFER_STATUS_GPU_READING; + + BCTX_REFN(nvc0->bufctx_3d, 3D_SUF, res, RD); + + /* upload the texture handle */ + BEGIN_NVC0(push, NVC0_3D(CB_SIZE), 3); + PUSH_DATA (push, NVC0_CB_AUX_SIZE); + PUSH_DATAh(push, screen->uniform_bo->offset + NVC0_CB_AUX_INFO(stage)); + PUSH_DATA (push, screen->uniform_bo->offset + NVC0_CB_AUX_INFO(stage)); + BEGIN_NVC0(push, NVC0_3D(CB_POS), 2); + PUSH_DATA (push, NVC0_CB_AUX_TEX_INFO(slot + 32)); + PUSH_DATA (push, tic->id); } static inline void nve4_update_surface_bindings(struct nvc0_context *nvc0) { - /* TODO */ + struct nouveau_pushbuf *push = nvc0->base.pushbuf; + struct nvc0_screen *screen = nvc0->screen; + int i, j, s; + + for (s = 0; s < 5; s++) { + if (!nvc0->images_dirty[s]) + continue; + + for (i = 0; i < NVC0_MAX_IMAGES; ++i) { + struct pipe_image_view *view = &nvc0->images[s][i]; + + BEGIN_NVC0(push, NVC0_3D(CB_SIZE), 3); + PUSH_DATA (push, NVC0_CB_AUX_SIZE); + PUSH_DATAh(push, screen->uniform_bo->offset + NVC0_CB_AUX_INFO(s)); + PUSH_DATA (push, screen->uniform_bo->offset + NVC0_CB_AUX_INFO(s)); + BEGIN_1IC0(push, NVC0_3D(CB_POS), 1 + 16); + PUSH_DATA (push, NVC0_CB_AUX_SU_INFO(i)); + + if (view->resource) { + struct nv04_resource *res = nv04_resource(view->resource); + + if (res->base.target == PIPE_BUFFER) { + if (view->access & PIPE_IMAGE_ACCESS_WRITE) + nvc0_mark_image_range_valid(view); + } + + nve4_set_surface_info(push, view, nvc0); + BCTX_REFN(nvc0->bufctx_3d, 3D_SUF, res, RDWR); + + if (nvc0->screen->base.class_3d >= GM107_3D_CLASS) + gm107_validate_surfaces(nvc0, view, s, i); + } else { + for (j = 0; j < 16; j++) + PUSH_DATA(push, 0); + } + } + } } void @@ -893,13 +1203,14 @@ static const uint8_t nve4_su_format_map[PIPE_FORMAT_COUNT] = [PIPE_FORMAT_R16G16B16A16_SNORM] = GK104_IMAGE_FORMAT_RGBA16_SNORM, [PIPE_FORMAT_R16G16B16A16_SINT] = GK104_IMAGE_FORMAT_RGBA16_SINT, [PIPE_FORMAT_R16G16B16A16_UINT] = GK104_IMAGE_FORMAT_RGBA16_UINT, + 
[PIPE_FORMAT_B8G8R8A8_UNORM] = GK104_IMAGE_FORMAT_BGRA8_UNORM, [PIPE_FORMAT_R8G8B8A8_UNORM] = GK104_IMAGE_FORMAT_RGBA8_UNORM, [PIPE_FORMAT_R8G8B8A8_SNORM] = GK104_IMAGE_FORMAT_RGBA8_SNORM, [PIPE_FORMAT_R8G8B8A8_SINT] = GK104_IMAGE_FORMAT_RGBA8_SINT, [PIPE_FORMAT_R8G8B8A8_UINT] = GK104_IMAGE_FORMAT_RGBA8_UINT, [PIPE_FORMAT_R11G11B10_FLOAT] = GK104_IMAGE_FORMAT_R11G11B10_FLOAT, [PIPE_FORMAT_R10G10B10A2_UNORM] = GK104_IMAGE_FORMAT_RGB10_A2_UNORM, -/* [PIPE_FORMAT_R10G10B10A2_UINT] = GK104_IMAGE_FORMAT_RGB10_A2_UINT, */ + [PIPE_FORMAT_R10G10B10A2_UINT] = GK104_IMAGE_FORMAT_RGB10_A2_UINT, [PIPE_FORMAT_R32G32_FLOAT] = GK104_IMAGE_FORMAT_RG32_FLOAT, [PIPE_FORMAT_R32G32_SINT] = GK104_IMAGE_FORMAT_RG32_SINT, [PIPE_FORMAT_R32G32_UINT] = GK104_IMAGE_FORMAT_RG32_UINT, @@ -946,7 +1257,8 @@ static const uint16_t nve4_su_format_aux_map[PIPE_FORMAT_COUNT] = [PIPE_FORMAT_R32G32_UINT] = 0x3433, [PIPE_FORMAT_R10G10B10A2_UNORM] = 0x2a24, -/* [PIPE_FORMAT_R10G10B10A2_UINT] = 0x2a24, */ + [PIPE_FORMAT_R10G10B10A2_UINT] = 0x2a24, + [PIPE_FORMAT_B8G8R8A8_UNORM] = 0x2a24, [PIPE_FORMAT_R8G8B8A8_UNORM] = 0x2a24, [PIPE_FORMAT_R8G8B8A8_SNORM] = 0x2a24, [PIPE_FORMAT_R8G8B8A8_SINT] = 0x2a24, @@ -997,7 +1309,7 @@ static const uint16_t nve4_suldp_lib_offset[PIPE_FORMAT_COUNT] = [PIPE_FORMAT_R32G32_SINT] = 0x468, [PIPE_FORMAT_R32G32_UINT] = 0x468, [PIPE_FORMAT_R10G10B10A2_UNORM] = 0x4a8, -/* [PIPE_FORMAT_R10G10B10A2_UINT] = 0x530, */ + [PIPE_FORMAT_R10G10B10A2_UINT] = 0x530, [PIPE_FORMAT_R8G8B8A8_UNORM] = 0x588, [PIPE_FORMAT_R8G8B8A8_SNORM] = 0x5f8, [PIPE_FORMAT_R8G8B8A8_SINT] = 0x670, diff --git a/lib/mesa/src/gallium/drivers/nouveau/nvc0/nvc0_transfer.c b/lib/mesa/src/gallium/drivers/nouveau/nvc0/nvc0_transfer.c index 24d23d29b..14fb53cb8 100644 --- a/lib/mesa/src/gallium/drivers/nouveau/nvc0/nvc0_transfer.c +++ b/lib/mesa/src/gallium/drivers/nouveau/nvc0/nvc0_transfer.c @@ -390,15 +390,22 @@ nvc0_miptree_transfer_map(struct pipe_context *pctx, } tx->nlayers = box->depth; - tx->base.stride = tx->nblocksx * util_format_get_blocksize(res->format); - tx->base.layer_stride = tx->nblocksy * tx->base.stride; - if (usage & PIPE_TRANSFER_MAP_DIRECTLY) { - tx->base.stride = align(tx->base.stride, 128); + tx->base.stride = mt->level[level].pitch; + tx->base.layer_stride = mt->layer_stride; + uint32_t offset = box->y * tx->base.stride + + util_format_get_stride(res->format, box->x); + if (!mt->layout_3d) + offset += mt->layer_stride * box->z; + else + offset += nvc0_mt_zslice_offset(mt, level, box->z); *ptransfer = &tx->base; - return mt->base.bo->map + mt->base.offset; + return mt->base.bo->map + mt->base.offset + offset; } + tx->base.stride = tx->nblocksx * util_format_get_blocksize(res->format); + tx->base.layer_stride = tx->nblocksy * tx->base.stride; + nv50_m2mf_rect_setup(&tx->rect[0], res, level, box->x, box->y, box->z); size = tx->base.layer_stride; diff --git a/lib/mesa/src/gallium/drivers/nouveau/nvc0/nvc0_vbo.c b/lib/mesa/src/gallium/drivers/nouveau/nvc0/nvc0_vbo.c index bb1cf9a0b..69ca091c4 100644 --- a/lib/mesa/src/gallium/drivers/nouveau/nvc0/nvc0_vbo.c +++ b/lib/mesa/src/gallium/drivers/nouveau/nvc0/nvc0_vbo.c @@ -222,7 +222,7 @@ static inline void nvc0_release_user_vbufs(struct nvc0_context *nvc0) { if (nvc0->vbo_user) { - nouveau_bufctx_reset(nvc0->bufctx_3d, NVC0_BIND_VTX_TMP); + nouveau_bufctx_reset(nvc0->bufctx_3d, NVC0_BIND_3D_VTX_TMP); nouveau_scratch_done(&nvc0->base); } } @@ -257,7 +257,7 @@ nvc0_update_user_vbufs(struct nvc0_context *nvc0) address[b] = nouveau_scratch_data(&nvc0->base, vb->user_buffer, 
base, size, &bo); if (bo) - BCTX_REFN_bo(nvc0->bufctx_3d, VTX_TMP, bo_flags, bo); + BCTX_REFN_bo(nvc0->bufctx_3d, 3D_VTX_TMP, bo_flags, bo); NOUVEAU_DRV_STAT(&nvc0->screen->base, user_buffer_upload_bytes, size); } @@ -292,7 +292,7 @@ nvc0_update_user_vbufs_shared(struct nvc0_context *nvc0) address = nouveau_scratch_data(&nvc0->base, nvc0->vtxbuf[b].user_buffer, base, size, &bo); if (bo) - BCTX_REFN_bo(nvc0->bufctx_3d, VTX_TMP, bo_flags, bo); + BCTX_REFN_bo(nvc0->bufctx_3d, 3D_VTX_TMP, bo_flags, bo); BEGIN_1IC0(push, NVC0_3D(MACRO_VERTEX_ARRAY_SELECT), 5); PUSH_DATA (push, b); @@ -368,7 +368,7 @@ nvc0_validate_vertex_buffers(struct nvc0_context *nvc0) if (!(refd & (1 << b))) { refd |= 1 << b; - BCTX_REFN(nvc0->bufctx_3d, VTX, res, RD); + BCTX_REFN(nvc0->bufctx_3d, 3D_VTX, res, RD); } } if (nvc0->vbo_user) @@ -412,7 +412,7 @@ nvc0_validate_vertex_buffers_shared(struct nvc0_context *nvc0) PUSH_DATAh(push, buf->address + limit); PUSH_DATA (push, buf->address + limit); - BCTX_REFN(nvc0->bufctx_3d, VTX, buf, RD); + BCTX_REFN(nvc0->bufctx_3d, 3D_VTX, buf, RD); } /* If there are more elements than buffers, we might not have unset * fetching on the later elements. @@ -435,18 +435,20 @@ nvc0_vertex_arrays_validate(struct nvc0_context *nvc0) uint8_t vbo_mode; bool update_vertex; - nouveau_bufctx_reset(nvc0->bufctx_3d, NVC0_BIND_VTX); + nouveau_bufctx_reset(nvc0->bufctx_3d, NVC0_BIND_3D_VTX); assert(vertex); if (unlikely(vertex->need_conversion) || unlikely(nvc0->vertprog->vp.edgeflag < PIPE_MAX_ATTRIBS)) { vbo_mode = 3; + } else if (nvc0->vbo_user & ~nvc0->constant_vbos) { + vbo_mode = nvc0->vbo_push_hint ? 1 : 0; } else { - vbo_mode = (nvc0->vbo_user && nvc0->vbo_push_hint) ? 1 : 0; + vbo_mode = 0; } const_vbos = vbo_mode ? 0 : nvc0->constant_vbos; - update_vertex = (nvc0->dirty & NVC0_NEW_VERTEX) || + update_vertex = (nvc0->dirty_3d & NVC0_NEW_3D_VERTEX) || (const_vbos != nvc0->state.constant_vbos) || (vbo_mode != nvc0->state.vbo_mode); @@ -537,7 +539,7 @@ nvc0_idxbuf_validate(struct nvc0_context *nvc0) PUSH_DATA (push, buf->address + buf->base.width0 - 1); PUSH_DATA (push, nvc0->idxbuf.index_size >> 1); - BCTX_REFN(nvc0->bufctx_3d, IDX, buf, RD); + BCTX_REFN(nvc0->bufctx_3d, 3D_IDX, buf, RD); } #define NVC0_PRIM_GL_CASE(n) \ @@ -820,6 +822,7 @@ nvc0_draw_indirect(struct nvc0_context *nvc0, const struct pipe_draw_info *info) struct nv04_resource *buf_count = nv04_resource(info->indirect_params); unsigned size, macro, count = info->indirect_count, drawid = info->drawid; uint32_t offset = buf->offset + info->indirect_offset; + struct nvc0_screen *screen = nvc0->screen; PUSH_SPACE(push, 7); @@ -832,11 +835,11 @@ nvc0_draw_indirect(struct nvc0_context *nvc0, const struct pipe_draw_info *info) /* Queue things up to let the macros write params to the driver constbuf */ BEGIN_NVC0(push, NVC0_3D(CB_SIZE), 3); - PUSH_DATA (push, 512); - PUSH_DATAh(push, nvc0->screen->uniform_bo->offset + (6 << 16) + (0 << 9)); - PUSH_DATA (push, nvc0->screen->uniform_bo->offset + (6 << 16) + (0 << 9)); + PUSH_DATA (push, NVC0_CB_AUX_SIZE); + PUSH_DATAh(push, screen->uniform_bo->offset + NVC0_CB_AUX_INFO(0)); + PUSH_DATA (push, screen->uniform_bo->offset + NVC0_CB_AUX_INFO(0)); BEGIN_NVC0(push, NVC0_3D(CB_POS), 1); - PUSH_DATA (push, 256 + 128); + PUSH_DATA (push, NVC0_CB_AUX_DRAW_INFO); if (info->indexed) { assert(nvc0->idxbuf.buffer); @@ -934,6 +937,7 @@ nvc0_draw_vbo(struct pipe_context *pipe, const struct pipe_draw_info *info) { struct nvc0_context *nvc0 = nvc0_context(pipe); struct nouveau_pushbuf *push = 
nvc0->base.pushbuf; + struct nvc0_screen *screen = nvc0->screen; int s; /* NOTE: caller must ensure that (min_index + index_bias) is >= 0 */ @@ -946,15 +950,16 @@ nvc0_draw_vbo(struct pipe_context *pipe, const struct pipe_draw_info *info) * if index count is larger and we expect repeated vertices, suggest upload. */ nvc0->vbo_push_hint = - info->indexed && (nvc0->vb_elt_limit >= (info->count * 2)); + !info->indirect && info->indexed && + (nvc0->vb_elt_limit >= (info->count * 2)); /* Check whether we want to switch vertex-submission mode. */ - if (nvc0->vbo_user && !(nvc0->dirty & (NVC0_NEW_ARRAYS | NVC0_NEW_VERTEX))) { + if (nvc0->vbo_user && !(nvc0->dirty_3d & (NVC0_NEW_3D_ARRAYS | NVC0_NEW_3D_VERTEX))) { if (nvc0->vbo_push_hint != !!nvc0->state.vbo_mode) if (nvc0->state.vbo_mode != 3) - nvc0->dirty |= NVC0_NEW_ARRAYS; + nvc0->dirty_3d |= NVC0_NEW_3D_ARRAYS; - if (!(nvc0->dirty & NVC0_NEW_ARRAYS) && nvc0->state.vbo_mode == 0) { + if (!(nvc0->dirty_3d & NVC0_NEW_3D_ARRAYS) && nvc0->state.vbo_mode == 0) { if (nvc0->vertex->shared_slots) nvc0_update_user_vbufs_shared(nvc0); else @@ -969,17 +974,17 @@ nvc0_draw_vbo(struct pipe_context *pipe, const struct pipe_draw_info *info) IMMED_NVC0(push, NVC0_3D(PATCH_VERTICES), nvc0->state.patch_vertices); } - nvc0_state_validate(nvc0, ~0); + nvc0_state_validate_3d(nvc0, ~0); if (nvc0->vertprog->vp.need_draw_parameters) { PUSH_SPACE(push, 9); BEGIN_NVC0(push, NVC0_3D(CB_SIZE), 3); - PUSH_DATA (push, 512); - PUSH_DATAh(push, nvc0->screen->uniform_bo->offset + (6 << 16) + (0 << 9)); - PUSH_DATA (push, nvc0->screen->uniform_bo->offset + (6 << 16) + (0 << 9)); + PUSH_DATA (push, NVC0_CB_AUX_SIZE); + PUSH_DATAh(push, screen->uniform_bo->offset + NVC0_CB_AUX_INFO(0)); + PUSH_DATA (push, screen->uniform_bo->offset + NVC0_CB_AUX_INFO(0)); if (!info->indirect) { BEGIN_1IC0(push, NVC0_3D(CB_POS), 1 + 3); - PUSH_DATA (push, 256 + 128); + PUSH_DATA (push, NVC0_CB_AUX_DRAW_INFO); PUSH_DATA (push, info->index_bias); PUSH_DATA (push, info->start_instance); PUSH_DATA (push, info->drawid); diff --git a/lib/mesa/src/gallium/drivers/nouveau/nvc0/nvc0_vbo_translate.c b/lib/mesa/src/gallium/drivers/nouveau/nvc0/nvc0_vbo_translate.c index 9c19ba20a..fd2bcbb96 100644 --- a/lib/mesa/src/gallium/drivers/nouveau/nvc0/nvc0_vbo_translate.c +++ b/lib/mesa/src/gallium/drivers/nouveau/nvc0/nvc0_vbo_translate.c @@ -19,6 +19,7 @@ struct push_context { uint32_t vertex_size; uint32_t restart_index; + uint32_t start_instance; uint32_t instance_id; bool prim_restart; @@ -44,6 +45,7 @@ nvc0_push_context_init(struct nvc0_context *nvc0, struct push_context *ctx) ctx->translate = nvc0->vertex->translate; ctx->vertex_size = nvc0->vertex->size; + ctx->instance_id = 0; ctx->need_vertex_id = nvc0->vertprog->vp.need_vertex_id && (nvc0->vertex->num_elements < 32); @@ -225,7 +227,7 @@ nvc0_push_setup_vertex_array(struct nvc0_context *nvc0, const unsigned count) PUSH_DATAh(push, va + size - 1); PUSH_DATA (push, va + size - 1); - BCTX_REFN_bo(nvc0->bufctx_3d, VTX_TMP, NOUVEAU_BO_GART | NOUVEAU_BO_RD, + BCTX_REFN_bo(nvc0->bufctx_3d, 3D_VTX_TMP, NOUVEAU_BO_GART | NOUVEAU_BO_RD, bo); nouveau_pushbuf_validate(push); @@ -246,7 +248,8 @@ disp_vertices_i08(struct push_context *ctx, unsigned start, unsigned count) if (unlikely(ctx->prim_restart)) nR = prim_restart_search_i08(elts, nR, ctx->restart_index); - translate->run_elts8(translate, elts, nR, 0, ctx->instance_id, ctx->dest); + translate->run_elts8(translate, elts, nR, + ctx->start_instance, ctx->instance_id, ctx->dest); count -= nR; ctx->dest += nR * 
ctx->vertex_size; @@ -302,7 +305,8 @@ disp_vertices_i16(struct push_context *ctx, unsigned start, unsigned count) if (unlikely(ctx->prim_restart)) nR = prim_restart_search_i16(elts, nR, ctx->restart_index); - translate->run_elts16(translate, elts, nR, 0, ctx->instance_id, ctx->dest); + translate->run_elts16(translate, elts, nR, + ctx->start_instance, ctx->instance_id, ctx->dest); count -= nR; ctx->dest += nR * ctx->vertex_size; @@ -358,7 +362,8 @@ disp_vertices_i32(struct push_context *ctx, unsigned start, unsigned count) if (unlikely(ctx->prim_restart)) nR = prim_restart_search_i32(elts, nR, ctx->restart_index); - translate->run_elts(translate, elts, nR, 0, ctx->instance_id, ctx->dest); + translate->run_elts(translate, elts, nR, + ctx->start_instance, ctx->instance_id, ctx->dest); count -= nR; ctx->dest += nR * ctx->vertex_size; @@ -410,7 +415,8 @@ disp_vertices_seq(struct push_context *ctx, unsigned start, unsigned count) /* XXX: This will read the data corresponding to the primitive restart index, * maybe we should avoid that ? */ - translate->run(translate, start, count, 0, ctx->instance_id, ctx->dest); + translate->run(translate, start, count, + ctx->start_instance, ctx->instance_id, ctx->dest); do { unsigned nr = count; @@ -515,7 +521,7 @@ nvc0_push_vbo(struct nvc0_context *nvc0, const struct pipe_draw_info *info) index_size = 0; } - ctx.instance_id = info->start_instance; + ctx.start_instance = info->start_instance; prim = nvc0_prim_gl(info->mode); do { @@ -554,7 +560,7 @@ nvc0_push_vbo(struct nvc0_context *nvc0, const struct pipe_draw_info *info) prim |= NVC0_3D_VERTEX_BEGIN_GL_INSTANCE_NEXT; ++ctx.instance_id; } - nouveau_bufctx_reset(nvc0->bufctx_3d, NVC0_BIND_VTX_TMP); + nouveau_bufctx_reset(nvc0->bufctx_3d, NVC0_BIND_3D_VTX_TMP); nouveau_scratch_done(&nvc0->base); } while (inst_count); @@ -629,7 +635,7 @@ nvc0_push_upload_vertex_ids(struct push_context *ctx, data = (uint32_t *)nouveau_scratch_get(&nvc0->base, info->count * index_size, &va, &bo); - BCTX_REFN_bo(nvc0->bufctx_3d, VTX_TMP, NOUVEAU_BO_GART | NOUVEAU_BO_RD, + BCTX_REFN_bo(nvc0->bufctx_3d, 3D_VTX_TMP, NOUVEAU_BO_GART | NOUVEAU_BO_RD, bo); nouveau_pushbuf_validate(push); diff --git a/lib/mesa/src/gallium/drivers/nouveau/nvc0/nvc0_winsys.h b/lib/mesa/src/gallium/drivers/nouveau/nvc0/nvc0_winsys.h index 79abe78b7..4d07546c3 100644 --- a/lib/mesa/src/gallium/drivers/nouveau/nvc0/nvc0_winsys.h +++ b/lib/mesa/src/gallium/drivers/nouveau/nvc0/nvc0_winsys.h @@ -50,9 +50,9 @@ PUSH_REFN(struct nouveau_pushbuf *push, struct nouveau_bo *bo, uint32_t flags) #define NVC0_3D(n) SUBC_3D(NVC0_3D_##n) #define NVE4_3D(n) SUBC_3D(NVE4_3D_##n) -#define SUBC_COMPUTE(m) 1, (m) -#define NVC0_COMPUTE(n) SUBC_COMPUTE(NVC0_COMPUTE_##n) -#define NVE4_COMPUTE(n) SUBC_COMPUTE(NVE4_COMPUTE_##n) +#define SUBC_CP(m) 1, (m) +#define NVC0_CP(n) SUBC_CP(NVC0_COMPUTE_##n) +#define NVE4_CP(n) SUBC_CP(NVE4_COMPUTE_##n) #define SUBC_M2MF(m) 2, (m) #define SUBC_P2MF(m) 2, (m) diff --git a/lib/mesa/src/gallium/drivers/nouveau/nvc0/nve4_compute.c b/lib/mesa/src/gallium/drivers/nouveau/nvc0/nve4_compute.c index c74161645..d661c000b 100644 --- a/lib/mesa/src/gallium/drivers/nouveau/nvc0/nve4_compute.c +++ b/lib/mesa/src/gallium/drivers/nouveau/nvc0/nve4_compute.c @@ -23,7 +23,6 @@ */ #include "nvc0/nvc0_context.h" -#include "nvc0/nvc0_compute.h" #include "nvc0/nve4_compute.h" #include "codegen/nv50_ir_driver.h" @@ -42,6 +41,7 @@ nve4_screen_compute_setup(struct nvc0_screen *screen, int i; int ret; uint32_t obj_class; + uint64_t address; switch (dev->chipset & 
~0xf) { case 0x100: @@ -54,6 +54,9 @@ nve4_screen_compute_setup(struct nvc0_screen *screen, case 0x110: obj_class = GM107_COMPUTE_CLASS; break; + case 0x120: + obj_class = GM200_COMPUTE_CLASS; + break; default: NOUVEAU_ERR("unsupported chipset: NV%02x\n", dev->chipset); return -1; @@ -66,26 +69,21 @@ nve4_screen_compute_setup(struct nvc0_screen *screen, return ret; } - ret = nouveau_bo_new(dev, NV_VRAM_DOMAIN(&screen->base), 0, NVE4_CP_PARAM_SIZE, NULL, - &screen->parm); - if (ret) - return ret; - - BEGIN_NVC0(push, SUBC_COMPUTE(NV01_SUBCHAN_OBJECT), 1); + BEGIN_NVC0(push, SUBC_CP(NV01_SUBCHAN_OBJECT), 1); PUSH_DATA (push, screen->compute->oclass); - BEGIN_NVC0(push, NVE4_COMPUTE(TEMP_ADDRESS_HIGH), 2); + BEGIN_NVC0(push, NVE4_CP(TEMP_ADDRESS_HIGH), 2); PUSH_DATAh(push, screen->tls->offset); PUSH_DATA (push, screen->tls->offset); /* No idea why there are 2. Divide size by 2 to be safe. * Actually this might be per-MP TEMP size and looks like I'm only using * 2 MPs instead of all 8. */ - BEGIN_NVC0(push, NVE4_COMPUTE(MP_TEMP_SIZE_HIGH(0)), 3); + BEGIN_NVC0(push, NVE4_CP(MP_TEMP_SIZE_HIGH(0)), 3); PUSH_DATAh(push, screen->tls->size / screen->mp_count); PUSH_DATA (push, (screen->tls->size / screen->mp_count) & ~0x7fff); PUSH_DATA (push, 0xff); - BEGIN_NVC0(push, NVE4_COMPUTE(MP_TEMP_SIZE_HIGH(1)), 3); + BEGIN_NVC0(push, NVE4_CP(MP_TEMP_SIZE_HIGH(1)), 3); PUSH_DATAh(push, screen->tls->size / screen->mp_count); PUSH_DATA (push, (screen->tls->size / screen->mp_count) & ~0x7fff); PUSH_DATA (push, 0xff); @@ -95,24 +93,24 @@ nve4_screen_compute_setup(struct nvc0_screen *screen, * FATAL: Buffers with addresses inside [0x1000000, 0x3000000] will NOT be * accessible. We cannot prevent that at the moment, so expect failure. */ - BEGIN_NVC0(push, NVE4_COMPUTE(LOCAL_BASE), 1); - PUSH_DATA (push, 1 << 24); - BEGIN_NVC0(push, NVE4_COMPUTE(SHARED_BASE), 1); - PUSH_DATA (push, 2 << 24); + BEGIN_NVC0(push, NVE4_CP(LOCAL_BASE), 1); + PUSH_DATA (push, 0xff << 24); + BEGIN_NVC0(push, NVE4_CP(SHARED_BASE), 1); + PUSH_DATA (push, 0xfe << 24); - BEGIN_NVC0(push, NVE4_COMPUTE(CODE_ADDRESS_HIGH), 2); + BEGIN_NVC0(push, NVE4_CP(CODE_ADDRESS_HIGH), 2); PUSH_DATAh(push, screen->text->offset); PUSH_DATA (push, screen->text->offset); - BEGIN_NVC0(push, SUBC_COMPUTE(0x0310), 1); + BEGIN_NVC0(push, SUBC_CP(0x0310), 1); PUSH_DATA (push, (obj_class >= NVF0_COMPUTE_CLASS) ? 0x400 : 0x300); /* NOTE: these do not affect the state used by the 3D object */ - BEGIN_NVC0(push, NVE4_COMPUTE(TIC_ADDRESS_HIGH), 3); + BEGIN_NVC0(push, NVE4_CP(TIC_ADDRESS_HIGH), 3); PUSH_DATAh(push, screen->txc->offset); PUSH_DATA (push, screen->txc->offset); PUSH_DATA (push, NVC0_TIC_MAX_ENTRIES - 1); - BEGIN_NVC0(push, NVE4_COMPUTE(TSC_ADDRESS_HIGH), 3); + BEGIN_NVC0(push, NVE4_CP(TSC_ADDRESS_HIGH), 3); PUSH_DATAh(push, screen->txc->offset + 65536); PUSH_DATA (push, screen->txc->offset + 65536); PUSH_DATA (push, NVC0_TSC_MAX_ENTRIES - 1); @@ -122,26 +120,31 @@ nve4_screen_compute_setup(struct nvc0_screen *screen, * passed with GK110_COMPUTE.GRAPH.SCRATCH[0x2]. This is currently * disabled because our firmware doesn't support these commands and the * GPU hangs if they are used. 
*/ - BEGIN_NIC0(push, SUBC_COMPUTE(0x0248), 64); + BEGIN_NIC0(push, SUBC_CP(0x0248), 64); for (i = 63; i >= 0; i--) PUSH_DATA(push, 0x38000 | i); - IMMED_NVC0(push, SUBC_COMPUTE(NV50_GRAPH_SERIALIZE), 0); + IMMED_NVC0(push, SUBC_CP(NV50_GRAPH_SERIALIZE), 0); } - BEGIN_NVC0(push, NVE4_COMPUTE(TEX_CB_INDEX), 1); - PUSH_DATA (push, 0); /* does not interefere with 3D */ + BEGIN_NVC0(push, NVE4_CP(TEX_CB_INDEX), 1); + PUSH_DATA (push, 7); /* does not interfere with 3D */ + /* Disabling this UNK command avoids a read fault when using texelFetch() + * from a compute shader for weird reasons. if (obj_class == NVF0_COMPUTE_CLASS) - IMMED_NVC0(push, SUBC_COMPUTE(0x02c4), 1); + IMMED_NVC0(push, SUBC_CP(0x02c4), 1); + */ + + address = screen->uniform_bo->offset + NVC0_CB_AUX_INFO(5); /* MS sample coordinate offsets: these do not work with _ALT modes ! */ - BEGIN_NVC0(push, NVE4_COMPUTE(UPLOAD_DST_ADDRESS_HIGH), 2); - PUSH_DATAh(push, screen->parm->offset + NVE4_CP_INPUT_MS_OFFSETS); - PUSH_DATA (push, screen->parm->offset + NVE4_CP_INPUT_MS_OFFSETS); - BEGIN_NVC0(push, NVE4_COMPUTE(UPLOAD_LINE_LENGTH_IN), 2); + BEGIN_NVC0(push, NVE4_CP(UPLOAD_DST_ADDRESS_HIGH), 2); + PUSH_DATAh(push, address + NVC0_CB_AUX_MS_INFO); + PUSH_DATA (push, address + NVC0_CB_AUX_MS_INFO); + BEGIN_NVC0(push, NVE4_CP(UPLOAD_LINE_LENGTH_IN), 2); PUSH_DATA (push, 64); PUSH_DATA (push, 1); - BEGIN_1IC0(push, NVE4_COMPUTE(UPLOAD_EXEC), 17); + BEGIN_1IC0(push, NVE4_CP(UPLOAD_EXEC), 17); PUSH_DATA (push, NVE4_COMPUTE_UPLOAD_EXEC_LINEAR | (0x20 << 1)); PUSH_DATA (push, 0); /* 0 */ PUSH_DATA (push, 0); @@ -160,14 +163,14 @@ nve4_screen_compute_setup(struct nvc0_screen *screen, PUSH_DATA (push, 3); /* 7 */ PUSH_DATA (push, 1); -#ifdef DEBUG - BEGIN_NVC0(push, NVE4_COMPUTE(UPLOAD_DST_ADDRESS_HIGH), 2); +#ifdef NOUVEAU_NVE4_MP_TRAP_HANDLER + BEGIN_NVC0(push, NVE4_CP(UPLOAD_DST_ADDRESS_HIGH), 2); PUSH_DATAh(push, screen->parm->offset + NVE4_CP_INPUT_TRAP_INFO_PTR); PUSH_DATA (push, screen->parm->offset + NVE4_CP_INPUT_TRAP_INFO_PTR); - BEGIN_NVC0(push, NVE4_COMPUTE(UPLOAD_LINE_LENGTH_IN), 2); + BEGIN_NVC0(push, NVE4_CP(UPLOAD_LINE_LENGTH_IN), 2); PUSH_DATA (push, 28); PUSH_DATA (push, 1); - BEGIN_1IC0(push, NVE4_COMPUTE(UPLOAD_EXEC), 8); + BEGIN_1IC0(push, NVE4_CP(UPLOAD_EXEC), 8); PUSH_DATA (push, 1); PUSH_DATA (push, screen->parm->offset + NVE4_CP_PARAM_TRAP_INFO); PUSH_DATAh(push, screen->parm->offset + NVE4_CP_PARAM_TRAP_INFO); @@ -178,88 +181,136 @@ nve4_screen_compute_setup(struct nvc0_screen *screen, PUSH_DATA (push, 0); /* warp cfstack size */ #endif - BEGIN_NVC0(push, NVE4_COMPUTE(FLUSH), 1); + BEGIN_NVC0(push, NVE4_CP(FLUSH), 1); PUSH_DATA (push, NVE4_COMPUTE_FLUSH_CB); return 0; } - static void -nve4_compute_validate_surfaces(struct nvc0_context *nvc0) +gm107_compute_validate_surfaces(struct nvc0_context *nvc0, + struct pipe_image_view *view, int slot) { - struct nvc0_screen *screen = nvc0->screen; + struct nv04_resource *res = nv04_resource(view->resource); struct nouveau_pushbuf *push = nvc0->base.pushbuf; - struct nv50_surface *sf; - struct nv04_resource *res; - uint32_t mask; - unsigned i; - const unsigned t = 1; - - mask = nvc0->surfaces_dirty[t]; - while (mask) { - i = ffs(mask) - 1; - mask &= ~(1 << i); - - /* - * NVE4's surface load/store instructions receive all the information - directly instead of via binding points, so we have to supply them.
- */ - BEGIN_NVC0(push, NVE4_COMPUTE(UPLOAD_DST_ADDRESS_HIGH), 2); - PUSH_DATAh(push, screen->parm->offset + NVE4_CP_INPUT_SUF(i)); - PUSH_DATA (push, screen->parm->offset + NVE4_CP_INPUT_SUF(i)); - BEGIN_NVC0(push, NVE4_COMPUTE(UPLOAD_LINE_LENGTH_IN), 2); - PUSH_DATA (push, 64); + struct nvc0_screen *screen = nvc0->screen; + struct nouveau_bo *txc = nvc0->screen->txc; + struct nv50_tic_entry *tic; + uint64_t address; + const int s = 5; + + tic = nv50_tic_entry(nvc0->images_tic[s][slot]); + + res = nv04_resource(tic->pipe.texture); + nvc0_update_tic(nvc0, tic, res); + + if (tic->id < 0) { + tic->id = nvc0_screen_tic_alloc(nvc0->screen, tic); + + /* upload the texture view */ + PUSH_SPACE(push, 16); + BEGIN_NVC0(push, NVE4_CP(UPLOAD_DST_ADDRESS_HIGH), 2); + PUSH_DATAh(push, txc->offset + (tic->id * 32)); + PUSH_DATA (push, txc->offset + (tic->id * 32)); + BEGIN_NVC0(push, NVE4_CP(UPLOAD_LINE_LENGTH_IN), 2); + PUSH_DATA (push, 32); PUSH_DATA (push, 1); - BEGIN_1IC0(push, NVE4_COMPUTE(UPLOAD_EXEC), 17); + BEGIN_1IC0(push, NVE4_CP(UPLOAD_EXEC), 9); PUSH_DATA (push, NVE4_COMPUTE_UPLOAD_EXEC_LINEAR | (0x20 << 1)); + PUSH_DATAp(push, &tic->tic[0], 8); + + BEGIN_NIC0(push, NVE4_CP(TIC_FLUSH), 1); + PUSH_DATA (push, (tic->id << 4) | 1); + } else + if (res->status & NOUVEAU_BUFFER_STATUS_GPU_WRITING) { + BEGIN_NIC0(push, NVE4_CP(TEX_CACHE_CTL), 1); + PUSH_DATA (push, (tic->id << 4) | 1); + } + nvc0->screen->tic.lock[tic->id / 32] |= 1 << (tic->id % 32); - nve4_set_surface_info(push, nvc0->surfaces[t][i], screen); + res->status &= ~NOUVEAU_BUFFER_STATUS_GPU_WRITING; + res->status |= NOUVEAU_BUFFER_STATUS_GPU_READING; - sf = nv50_surface(nvc0->surfaces[t][i]); - if (sf) { - res = nv04_resource(sf->base.texture); + BCTX_REFN(nvc0->bufctx_cp, CP_SUF, res, RD); - if (sf->base.writable) - BCTX_REFN(nvc0->bufctx_cp, CP_SUF, res, RDWR); - else - BCTX_REFN(nvc0->bufctx_cp, CP_SUF, res, RD); - } - } - if (nvc0->surfaces_dirty[t]) { - BEGIN_NVC0(push, NVE4_COMPUTE(FLUSH), 1); - PUSH_DATA (push, NVE4_COMPUTE_FLUSH_CB); - } + address = screen->uniform_bo->offset + NVC0_CB_AUX_INFO(s); - /* re-reference non-dirty surfaces */ - mask = nvc0->surfaces_valid[t] & ~nvc0->surfaces_dirty[t]; - while (mask) { - i = ffs(mask) - 1; - mask &= ~(1 << i); + /* upload the texture handle */ + BEGIN_NVC0(push, NVE4_CP(UPLOAD_DST_ADDRESS_HIGH), 2); + PUSH_DATAh(push, address + NVC0_CB_AUX_TEX_INFO(slot + 32)); + PUSH_DATA (push, address + NVC0_CB_AUX_TEX_INFO(slot + 32)); + BEGIN_NVC0(push, NVE4_CP(UPLOAD_LINE_LENGTH_IN), 2); + PUSH_DATA (push, 4); + PUSH_DATA (push, 0x1); + BEGIN_1IC0(push, NVE4_CP(UPLOAD_EXEC), 2); + PUSH_DATA (push, NVE4_COMPUTE_UPLOAD_EXEC_LINEAR | (0x20 << 1)); + PUSH_DATA (push, tic->id); - sf = nv50_surface(nvc0->surfaces[t][i]); - res = nv04_resource(sf->base.texture); + BEGIN_NVC0(push, NVE4_CP(FLUSH), 1); + PUSH_DATA (push, NVE4_COMPUTE_FLUSH_CB); +} + +static void +nve4_compute_validate_surfaces(struct nvc0_context *nvc0) +{ + struct nouveau_pushbuf *push = nvc0->base.pushbuf; + uint64_t address; + const int s = 5; + int i, j; - if (sf->base.writable) + if (!nvc0->images_dirty[s]) + return; + + address = nvc0->screen->uniform_bo->offset + NVC0_CB_AUX_INFO(s); + + for (i = 0; i < NVC0_MAX_IMAGES; ++i) { + struct pipe_image_view *view = &nvc0->images[s][i]; + + BEGIN_NVC0(push, NVE4_CP(UPLOAD_DST_ADDRESS_HIGH), 2); + PUSH_DATAh(push, address + NVC0_CB_AUX_SU_INFO(i)); + PUSH_DATA (push, address + NVC0_CB_AUX_SU_INFO(i)); + BEGIN_NVC0(push, NVE4_CP(UPLOAD_LINE_LENGTH_IN), 2); + PUSH_DATA (push, 16 * 4); + 
PUSH_DATA (push, 0x1); + BEGIN_1IC0(push, NVE4_CP(UPLOAD_EXEC), 1 + 16); + PUSH_DATA (push, NVE4_COMPUTE_UPLOAD_EXEC_LINEAR | (0x20 << 1)); + + if (view->resource) { + struct nv04_resource *res = nv04_resource(view->resource); + + if (res->base.target == PIPE_BUFFER) { + if (view->access & PIPE_IMAGE_ACCESS_WRITE) + nvc0_mark_image_range_valid(view); + } + + nve4_set_surface_info(push, view, nvc0); BCTX_REFN(nvc0->bufctx_cp, CP_SUF, res, RDWR); - else - BCTX_REFN(nvc0->bufctx_cp, CP_SUF, res, RD); - } - nvc0->surfaces_dirty[t] = 0; + if (nvc0->screen->base.class_3d >= GM107_3D_CLASS) + gm107_compute_validate_surfaces(nvc0, view, i); + } else { + for (j = 0; j < 16; j++) + PUSH_DATA(push, 0); + } + } } - /* Thankfully, textures with samplers follow the normal rules. */ static void nve4_compute_validate_samplers(struct nvc0_context *nvc0) { bool need_flush = nve4_validate_tsc(nvc0, 5); if (need_flush) { - BEGIN_NVC0(nvc0->base.pushbuf, NVE4_COMPUTE(TSC_FLUSH), 1); + BEGIN_NVC0(nvc0->base.pushbuf, NVE4_CP(TSC_FLUSH), 1); PUSH_DATA (nvc0->base.pushbuf, 0); } + + /* Invalidate all 3D samplers because they are aliased. */ + for (int s = 0; s < 5; s++) + nvc0->samplers_dirty[s] = ~0; + nvc0->dirty_3d |= NVC0_NEW_3D_SAMPLERS; } + /* (Code duplicated at bottom for various non-convincing reasons. * E.g. we might want to use the COMPUTE subchannel to upload TIC/TSC * entries to avoid a subchannel switch. @@ -272,6 +323,7 @@ static void nve4_compute_set_tex_handles(struct nvc0_context *nvc0) { struct nouveau_pushbuf *push = nvc0->base.pushbuf; + struct nvc0_screen *screen = nvc0->screen; uint64_t address; const unsigned s = nvc0_shader_stage(PIPE_SHADER_COMPUTE); unsigned i, n; @@ -283,88 +335,204 @@ nve4_compute_set_tex_handles(struct nvc0_context *nvc0) n = util_logbase2(dirty) + 1 - i; assert(n); - address = nvc0->screen->parm->offset + NVE4_CP_INPUT_TEX(i); + address = screen->uniform_bo->offset + NVC0_CB_AUX_INFO(s); - BEGIN_NVC0(push, NVE4_COMPUTE(UPLOAD_DST_ADDRESS_HIGH), 2); - PUSH_DATAh(push, address); - PUSH_DATA (push, address); - BEGIN_NVC0(push, NVE4_COMPUTE(UPLOAD_LINE_LENGTH_IN), 2); + BEGIN_NVC0(push, NVE4_CP(UPLOAD_DST_ADDRESS_HIGH), 2); + PUSH_DATAh(push, address + NVC0_CB_AUX_TEX_INFO(i)); + PUSH_DATA (push, address + NVC0_CB_AUX_TEX_INFO(i)); + BEGIN_NVC0(push, NVE4_CP(UPLOAD_LINE_LENGTH_IN), 2); PUSH_DATA (push, n * 4); PUSH_DATA (push, 0x1); - BEGIN_1IC0(push, NVE4_COMPUTE(UPLOAD_EXEC), 1 + n); + BEGIN_1IC0(push, NVE4_CP(UPLOAD_EXEC), 1 + n); PUSH_DATA (push, NVE4_COMPUTE_UPLOAD_EXEC_LINEAR | (0x20 << 1)); PUSH_DATAp(push, &nvc0->tex_handles[s][i], n); - BEGIN_NVC0(push, NVE4_COMPUTE(FLUSH), 1); + BEGIN_NVC0(push, NVE4_CP(FLUSH), 1); PUSH_DATA (push, NVE4_COMPUTE_FLUSH_CB); nvc0->textures_dirty[s] = 0; nvc0->samplers_dirty[s] = 0; } +static void +nve4_compute_validate_constbufs(struct nvc0_context *nvc0) +{ + struct nouveau_pushbuf *push = nvc0->base.pushbuf; + const int s = 5; + + while (nvc0->constbuf_dirty[s]) { + int i = ffs(nvc0->constbuf_dirty[s]) - 1; + nvc0->constbuf_dirty[s] &= ~(1 << i); + + if (nvc0->constbuf[s][i].user) { + struct nouveau_bo *bo = nvc0->screen->uniform_bo; + const unsigned base = NVC0_CB_USR_INFO(s); + const unsigned size = nvc0->constbuf[s][0].size; + assert(i == 0); /* we really only want OpenGL uniforms here */ + assert(nvc0->constbuf[s][0].u.data); + + BEGIN_NVC0(push, NVE4_CP(UPLOAD_DST_ADDRESS_HIGH), 2); + PUSH_DATAh(push, bo->offset + base); + PUSH_DATA (push, bo->offset + base); + BEGIN_NVC0(push, NVE4_CP(UPLOAD_LINE_LENGTH_IN), 2); + 
PUSH_DATA (push, size); + PUSH_DATA (push, 0x1); + BEGIN_1IC0(push, NVE4_CP(UPLOAD_EXEC), 1 + (size / 4)); + PUSH_DATA (push, NVE4_COMPUTE_UPLOAD_EXEC_LINEAR | (0x20 << 1)); + PUSH_DATAp(push, nvc0->constbuf[s][0].u.data, size / 4); + } + else { + struct nv04_resource *res = + nv04_resource(nvc0->constbuf[s][i].u.buf); + if (res) { + uint64_t address + = nvc0->screen->uniform_bo->offset + NVC0_CB_AUX_INFO(s); + + assert(i > 0); /* we really only want uniform buffer objects */ + + BEGIN_NVC0(push, NVE4_CP(UPLOAD_DST_ADDRESS_HIGH), 2); + PUSH_DATAh(push, address + NVC0_CB_AUX_UBO_INFO(i - 1)); + PUSH_DATA (push, address + NVC0_CB_AUX_UBO_INFO(i - 1)); + BEGIN_NVC0(push, NVE4_CP(UPLOAD_LINE_LENGTH_IN), 2); + PUSH_DATA (push, 4 * 4); + PUSH_DATA (push, 0x1); + BEGIN_1IC0(push, NVE4_CP(UPLOAD_EXEC), 1 + 4); + PUSH_DATA (push, NVE4_COMPUTE_UPLOAD_EXEC_LINEAR | (0x20 << 1)); + + PUSH_DATA (push, res->address + nvc0->constbuf[s][i].offset); + PUSH_DATAh(push, res->address + nvc0->constbuf[s][i].offset); + PUSH_DATA (push, nvc0->constbuf[5][i].size); + PUSH_DATA (push, 0); + BCTX_REFN(nvc0->bufctx_cp, CP_CB(i), res, RD); + + res->cb_bindings[s] |= 1 << i; + } + } + } + + BEGIN_NVC0(push, NVE4_CP(FLUSH), 1); + PUSH_DATA (push, NVE4_COMPUTE_FLUSH_CB); +} + +static void +nve4_compute_validate_buffers(struct nvc0_context *nvc0) +{ + struct nouveau_pushbuf *push = nvc0->base.pushbuf; + uint64_t address; + const int s = 5; + int i; + + address = nvc0->screen->uniform_bo->offset + NVC0_CB_AUX_INFO(s); + + BEGIN_NVC0(push, NVE4_CP(UPLOAD_DST_ADDRESS_HIGH), 2); + PUSH_DATAh(push, address + NVC0_CB_AUX_BUF_INFO(0)); + PUSH_DATA (push, address + NVC0_CB_AUX_BUF_INFO(0)); + BEGIN_NVC0(push, NVE4_CP(UPLOAD_LINE_LENGTH_IN), 2); + PUSH_DATA (push, 4 * NVC0_MAX_BUFFERS * 4); + PUSH_DATA (push, 0x1); + BEGIN_1IC0(push, NVE4_CP(UPLOAD_EXEC), 1 + 4 * NVC0_MAX_BUFFERS); + PUSH_DATA (push, NVE4_COMPUTE_UPLOAD_EXEC_LINEAR | (0x20 << 1)); + + for (i = 0; i < NVC0_MAX_BUFFERS; i++) { + if (nvc0->buffers[s][i].buffer) { + struct nv04_resource *res = + nv04_resource(nvc0->buffers[s][i].buffer); + PUSH_DATA (push, res->address + nvc0->buffers[s][i].buffer_offset); + PUSH_DATAh(push, res->address + nvc0->buffers[s][i].buffer_offset); + PUSH_DATA (push, nvc0->buffers[s][i].buffer_size); + PUSH_DATA (push, 0); + BCTX_REFN(nvc0->bufctx_cp, CP_BUF, res, RDWR); + util_range_add(&res->valid_buffer_range, + nvc0->buffers[s][i].buffer_offset, + nvc0->buffers[s][i].buffer_offset + + nvc0->buffers[s][i].buffer_size); + } else { + PUSH_DATA (push, 0); + PUSH_DATA (push, 0); + PUSH_DATA (push, 0); + PUSH_DATA (push, 0); + } + } +} + +static struct nvc0_state_validate +validate_list_cp[] = { + { nvc0_compprog_validate, NVC0_NEW_CP_PROGRAM }, + { nve4_compute_validate_textures, NVC0_NEW_CP_TEXTURES }, + { nve4_compute_validate_samplers, NVC0_NEW_CP_SAMPLERS }, + { nve4_compute_set_tex_handles, NVC0_NEW_CP_TEXTURES | + NVC0_NEW_CP_SAMPLERS }, + { nve4_compute_validate_surfaces, NVC0_NEW_CP_SURFACES }, + { nvc0_compute_validate_globals, NVC0_NEW_CP_GLOBALS }, + { nve4_compute_validate_buffers, NVC0_NEW_CP_BUFFERS }, + { nve4_compute_validate_constbufs, NVC0_NEW_CP_CONSTBUF }, +}; static bool -nve4_compute_state_validate(struct nvc0_context *nvc0) +nve4_state_validate_cp(struct nvc0_context *nvc0, uint32_t mask) { - if (!nvc0_compute_validate_program(nvc0)) - return false; - if (nvc0->dirty_cp & NVC0_NEW_CP_TEXTURES) - nve4_compute_validate_textures(nvc0); - if (nvc0->dirty_cp & NVC0_NEW_CP_SAMPLERS) - nve4_compute_validate_samplers(nvc0); - 
@@ -381,27 +549,24 @@ nve4_compute_derive_cache_split(struct nvc0_context *nvc0, uint32_t shared_size)
 static void
 nve4_compute_setup_launch_desc(struct nvc0_context *nvc0,
                                struct nve4_cp_launch_desc *desc,
-                               uint32_t label,
-                               const uint *block_layout,
-                               const uint *grid_layout)
+                               const struct pipe_grid_info *info)
 {
    const struct nvc0_screen *screen = nvc0->screen;
    const struct nvc0_program *cp = nvc0->compprog;
-   unsigned i;
 
    nve4_cp_launch_desc_init_default(desc);
 
-   desc->entry = nvc0_program_symbol_offset(cp, label);
+   desc->entry = nvc0_program_symbol_offset(cp, info->pc);
 
-   desc->griddim_x = grid_layout[0];
-   desc->griddim_y = grid_layout[1];
-   desc->griddim_z = grid_layout[2];
-   desc->blockdim_x = block_layout[0];
-   desc->blockdim_y = block_layout[1];
-   desc->blockdim_z = block_layout[2];
+   desc->griddim_x = info->grid[0];
+   desc->griddim_y = info->grid[1];
+   desc->griddim_z = info->grid[2];
+   desc->blockdim_x = info->block[0];
+   desc->blockdim_y = info->block[1];
+   desc->blockdim_z = info->block[2];
 
    desc->shared_size = align(cp->cp.smem_size, 0x100);
-   desc->local_size_p = align(cp->cp.lmem_size, 0x10);
+   desc->local_size_p = (cp->hdr[1] & 0xfffff0) + align(cp->cp.lmem_size, 0x10);
    desc->local_size_n = 0;
    desc->cstack_size = 0x800;
    desc->cache_split = nve4_compute_derive_cache_split(nvc0, cp->cp.smem_size);
@@ -409,12 +574,15 @@ nve4_compute_setup_launch_desc(struct nvc0_context *nvc0,
    desc->gpr_alloc = cp->num_gprs;
    desc->bar_alloc = cp->num_barriers;
 
-   for (i = 0; i < 7; ++i) {
-      const unsigned s = 5;
-      if (nvc0->constbuf[s][i].u.buf)
-         nve4_cp_launch_desc_set_ctx_cb(desc, i + 1, &nvc0->constbuf[s][i]);
+   // Only bind user uniforms and the driver constant buffer through the
+   // launch descriptor, because UBOs are stuck onto the driver cb to work
+   // around the limit of 8 CBs.
+   if (nvc0->constbuf[5][0].user || cp->parm_size) {
+      nve4_cp_launch_desc_set_cb(desc, 0, screen->uniform_bo,
+                                 NVC0_CB_USR_INFO(5), 1 << 16);
    }
-   nve4_cp_launch_desc_set_cb(desc, 0, screen->parm, 0, NVE4_CP_INPUT_SIZE);
+   nve4_cp_launch_desc_set_cb(desc, 7, screen->uniform_bo,
+                              NVC0_CB_AUX_INFO(5), 1 << 11);
 }
 
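The new local_size_p computation deserves a worked example: the 0xfffff0 mask suggests cp->hdr[1] carries, in bits 23:4, the per-thread local memory the shader header already reserves, and the compiler-reported lmem_size is added on top of it. A self-contained sketch with made-up numbers (the align() macro is redefined here only so the snippet compiles on its own):

   #include <stdint.h>

   #define align(x, a) (((x) + (a) - 1) & ~((a) - 1))

   int main(void)
   {
      uint32_t hdr1 = 0x00000200;  /* hypothetical: header reserves 0x200 bytes */
      uint32_t lmem_size = 0x24;   /* hypothetical compiler-reported l[] usage */
      uint32_t local_size_p = (hdr1 & 0xfffff0) + align(lmem_size, 0x10);
      /* local_size_p == 0x200 + 0x30 == 0x230 bytes per thread */
      return local_size_p == 0x230 ? 0 : 1;
   }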
 static inline struct nve4_cp_launch_desc *
@@ -450,38 +618,71 @@ nve4_launch_grid(struct pipe_context *pipe, const struct pipe_grid_info *info)
    BCTX_REFN_bo(nvc0->bufctx_cp, CP_DESC, NOUVEAU_BO_GART | NOUVEAU_BO_RD,
                 desc_bo);
 
-   ret = !nve4_compute_state_validate(nvc0);
+   ret = !nve4_state_validate_cp(nvc0, ~0);
    if (ret)
       goto out;
 
-   nve4_compute_setup_launch_desc(nvc0, desc, info->pc,
-                                  info->block, info->grid);
+   nve4_compute_setup_launch_desc(nvc0, desc, info);
+
+   nve4_compute_upload_input(nvc0, info);
+
 #ifdef DEBUG
    if (debug_get_num_option("NV50_PROG_DEBUG", 0))
       nve4_compute_dump_launch_desc(desc);
 #endif
 
-   nve4_compute_upload_input(nvc0, info->input, info->block, info->grid);
+   if (unlikely(info->indirect)) {
+      struct nv04_resource *res = nv04_resource(info->indirect);
+      uint32_t offset = res->offset + info->indirect_offset;
+
+      /* upload the descriptor */
+      BEGIN_NVC0(push, NVE4_CP(UPLOAD_DST_ADDRESS_HIGH), 2);
+      PUSH_DATAh(push, desc_gpuaddr);
+      PUSH_DATA (push, desc_gpuaddr);
+      BEGIN_NVC0(push, NVE4_CP(UPLOAD_LINE_LENGTH_IN), 2);
+      PUSH_DATA (push, 256);
+      PUSH_DATA (push, 1);
+      BEGIN_1IC0(push, NVE4_CP(UPLOAD_EXEC), 1 + (256 / 4));
+      PUSH_DATA (push, NVE4_COMPUTE_UPLOAD_EXEC_LINEAR | (0x08 << 1));
+      PUSH_DATAp(push, (const uint32_t *)desc, 256 / 4);
+
+      /* overwrite griddim_x and griddim_y as two 32-bit integers even
+       * though griddim_y is only a 16-bit field */
+      BEGIN_NVC0(push, NVE4_CP(UPLOAD_DST_ADDRESS_HIGH), 2);
+      PUSH_DATAh(push, desc_gpuaddr + 48);
+      PUSH_DATA (push, desc_gpuaddr + 48);
+      BEGIN_NVC0(push, NVE4_CP(UPLOAD_LINE_LENGTH_IN), 2);
+      PUSH_DATA (push, 8);
+      PUSH_DATA (push, 1);
+
+      nouveau_pushbuf_space(push, 16, 0, 1);
+      PUSH_REFN(push, res->bo, NOUVEAU_BO_RD | res->domain);
+
+      BEGIN_1IC0(push, NVE4_CP(UPLOAD_EXEC), 1 + (8 / 4));
+      PUSH_DATA (push, NVE4_COMPUTE_UPLOAD_EXEC_LINEAR | (0x08 << 1));
+      nouveau_pushbuf_data(push, res->bo, offset,
+                           NVC0_IB_ENTRY_1_NO_PREFETCH | 2 * 4);
+
+      /* overwrite the 16 high bits of griddim_y with griddim_z because
+       * we need (z << 16) | y */
+      BEGIN_NVC0(push, NVE4_CP(UPLOAD_DST_ADDRESS_HIGH), 2);
+      PUSH_DATAh(push, desc_gpuaddr + 54);
+      PUSH_DATA (push, desc_gpuaddr + 54);
+      BEGIN_NVC0(push, NVE4_CP(UPLOAD_LINE_LENGTH_IN), 2);
+      PUSH_DATA (push, 4);
+      PUSH_DATA (push, 1);
+      BEGIN_1IC0(push, NVE4_CP(UPLOAD_EXEC), 1 + (4 / 4));
+      PUSH_DATA (push, NVE4_COMPUTE_UPLOAD_EXEC_LINEAR | (0x08 << 1));
+      nouveau_pushbuf_data(push, res->bo, offset + 8,
+                           NVC0_IB_ENTRY_1_NO_PREFETCH | 1 * 4);
+   }
 
    /* upload descriptor and flush */
-#if 0
-   BEGIN_NVC0(push, NVE4_COMPUTE(UPLOAD_DST_ADDRESS_HIGH), 2);
-   PUSH_DATAh(push, desc_gpuaddr);
-   PUSH_DATA (push, desc_gpuaddr);
-   BEGIN_NVC0(push, NVE4_COMPUTE(UPLOAD_LINE_LENGTH_IN), 2);
-   PUSH_DATA (push, 256);
-   PUSH_DATA (push, 1);
-   BEGIN_1IC0(push, NVE4_COMPUTE(UPLOAD_EXEC), 1 + (256 / 4));
-   PUSH_DATA (push, NVE4_COMPUTE_UPLOAD_EXEC_LINEAR | (0x08 << 1));
-   PUSH_DATAp(push, (const uint32_t *)desc, 256 / 4);
-   BEGIN_NVC0(push, NVE4_COMPUTE(FLUSH), 1);
-   PUSH_DATA (push, NVE4_COMPUTE_FLUSH_CB | NVE4_COMPUTE_FLUSH_CODE);
-#endif
-   BEGIN_NVC0(push, NVE4_COMPUTE(LAUNCH_DESC_ADDRESS), 1);
+   BEGIN_NVC0(push, NVE4_CP(LAUNCH_DESC_ADDRESS), 1);
    PUSH_DATA (push, desc_gpuaddr >> 8);
-   BEGIN_NVC0(push, NVE4_COMPUTE(LAUNCH), 1);
+   BEGIN_NVC0(push, NVE4_CP(LAUNCH), 1);
    PUSH_DATA (push, 0x3);
-   BEGIN_NVC0(push, SUBC_COMPUTE(NV50_GRAPH_SERIALIZE), 1);
+   BEGIN_NVC0(push, SUBC_CP(NV50_GRAPH_SERIALIZE), 1);
    PUSH_DATA (push, 0);
 
 out:
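The three indirect uploads above splice the dispatch's three 32-bit group counts into the descriptor's packed fields: the byte offsets in the hunk imply griddim_x is a full dword at descriptor offset 48, while the dword at offset 52 holds (griddim_z << 16) | griddim_y as two 16-bit halves. The equivalent CPU-side stores, as an illustrative sketch (the GPU copy at offset 54 actually moves a whole dword and relies on the high 16 bits of z being zero):

   #include <stdint.h>
   #include <string.h>

   /* What the GPU-side copies accomplish, written as CPU stores;
    * indirect[] holds num_groups x/y/z from the indirect buffer. */
   static void
   splice_griddim(uint8_t *desc, const uint32_t indirect[3])
   {
      memcpy(desc + 48, &indirect[0], 4); /* griddim_x: full 32-bit dword */
      memcpy(desc + 52, &indirect[1], 4); /* griddim_y; clobbers the z half */
      memcpy(desc + 54, &indirect[2], 2); /* low 16 bits of z land in the
                                             high half of the dword at 52,
                                             giving (z << 16) | y */
   }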
@@ -501,7 +702,7 @@ nve4_compute_validate_textures(struct nvc0_context *nvc0)
    struct nouveau_pushbuf *push = nvc0->base.pushbuf;
    const unsigned s = 5;
    unsigned i;
-   uint32_t commands[2][NVE4_CP_INPUT_TEX_MAX];
+   uint32_t commands[2][32];
    unsigned n[2] = { 0, 0 };
 
    for (i = 0; i < nvc0->num_textures[s]; ++i) {
@@ -514,18 +715,19 @@ nve4_compute_validate_textures(struct nvc0_context *nvc0)
          continue;
       }
       res = nv04_resource(tic->pipe.texture);
+      nvc0_update_tic(nvc0, tic, res);
 
       if (tic->id < 0) {
         tic->id = nvc0_screen_tic_alloc(nvc0->screen, tic);
 
         PUSH_SPACE(push, 16);
-         BEGIN_NVC0(push, NVE4_COMPUTE(UPLOAD_DST_ADDRESS_HIGH), 2);
+         BEGIN_NVC0(push, NVE4_CP(UPLOAD_DST_ADDRESS_HIGH), 2);
         PUSH_DATAh(push, txc->offset + (tic->id * 32));
         PUSH_DATA (push, txc->offset + (tic->id * 32));
-         BEGIN_NVC0(push, NVE4_COMPUTE(UPLOAD_LINE_LENGTH_IN), 2);
+         BEGIN_NVC0(push, NVE4_CP(UPLOAD_LINE_LENGTH_IN), 2);
         PUSH_DATA (push, 32);
         PUSH_DATA (push, 1);
-         BEGIN_1IC0(push, NVE4_COMPUTE(UPLOAD_EXEC), 9);
+         BEGIN_1IC0(push, NVE4_CP(UPLOAD_EXEC), 9);
         PUSH_DATA (push, NVE4_COMPUTE_UPLOAD_EXEC_LINEAR | (0x20 << 1));
         PUSH_DATAp(push, &tic->tic[0], 8);
 
@@ -544,19 +746,29 @@ nve4_compute_validate_textures(struct nvc0_context *nvc0)
       if (dirty)
          BCTX_REFN(nvc0->bufctx_cp, CP_TEX(i), res, RD);
    }
-   for (; i < nvc0->state.num_textures[s]; ++i)
+   for (; i < nvc0->state.num_textures[s]; ++i) {
       nvc0->tex_handles[s][i] |= NVE4_TIC_ENTRY_INVALID;
+      nvc0->textures_dirty[s] |= 1 << i;
+   }
 
    if (n[0]) {
-      BEGIN_NIC0(push, NVE4_COMPUTE(TIC_FLUSH), n[0]);
+      BEGIN_NIC0(push, NVE4_CP(TIC_FLUSH), n[0]);
       PUSH_DATAp(push, commands[0], n[0]);
    }
    if (n[1]) {
-      BEGIN_NIC0(push, NVE4_COMPUTE(TEX_CACHE_CTL), n[1]);
+      BEGIN_NIC0(push, NVE4_CP(TEX_CACHE_CTL), n[1]);
       PUSH_DATAp(push, commands[1], n[1]);
    }
 
    nvc0->state.num_textures[s] = nvc0->num_textures[s];
+
+   /* Invalidate all 3D textures because they are aliased. */
+   for (int s = 0; s < 5; s++) {
+      for (int i = 0; i < nvc0->num_textures[s]; i++)
+         nouveau_bufctx_reset(nvc0->bufctx_3d, NVC0_BIND_3D_TEX(s, i));
+      nvc0->textures_dirty[s] = ~0;
+   }
+   nvc0->dirty_3d |= NVC0_NEW_3D_TEXTURES;
 }
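One idiom recurs throughout the nve4_compute.c hunks above: point the upload engine at a destination, describe one line of so many bytes, then stream the payload inline through UPLOAD_EXEC with the LINEAR flag. A hypothetical helper, not part of this patch, that captures the sequence using only macros that appear in the diff and assuming a dword-multiple size (as every call site here has):

   /* Hypothetical wrapper around the recurring upload sequence;
    * assumes the surrounding file's headers and macros. */
   static void
   nve4_cp_upload_linear(struct nouveau_pushbuf *push, uint64_t dst,
                         const uint32_t *data, unsigned size)
   {
      BEGIN_NVC0(push, NVE4_CP(UPLOAD_DST_ADDRESS_HIGH), 2);
      PUSH_DATAh(push, dst);            /* address bits 63:32 */
      PUSH_DATA (push, dst);            /* address bits 31:0 */
      BEGIN_NVC0(push, NVE4_CP(UPLOAD_LINE_LENGTH_IN), 2);
      PUSH_DATA (push, size);           /* bytes in the line */
      PUSH_DATA (push, 0x1);            /* a single line */
      BEGIN_1IC0(push, NVE4_CP(UPLOAD_EXEC), 1 + size / 4);
      PUSH_DATA (push, NVE4_COMPUTE_UPLOAD_EXEC_LINEAR | (0x20 << 1));
      PUSH_DATAp(push, data, size / 4); /* inline payload */
   }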
diff --git a/lib/mesa/src/gallium/drivers/nouveau/nvc0/nve4_compute.h b/lib/mesa/src/gallium/drivers/nouveau/nvc0/nve4_compute.h
index 84f8593b9..b98c65d4a 100644
--- a/lib/mesa/src/gallium/drivers/nouveau/nvc0/nve4_compute.h
+++ b/lib/mesa/src/gallium/drivers/nouveau/nvc0/nve4_compute.h
@@ -4,31 +4,6 @@
 
 #include "nvc0/nve4_compute.xml.h"
 
-/* Input space is implemented as c0[], to which we bind the screen->parm bo.
- */
-#define NVE4_CP_INPUT_USER           0x0000
-#define NVE4_CP_INPUT_USER_LIMIT     0x1000
-#define NVE4_CP_INPUT_GRID_INFO(i)  (0x1000 + (i) * 4)
-#define NVE4_CP_INPUT_NTID(i)       (0x1000 + (i) * 4)
-#define NVE4_CP_INPUT_NCTAID(i)     (0x100c + (i) * 4)
-#define NVE4_CP_INPUT_GRIDID         0x1018
-#define NVE4_CP_INPUT_TEX(i)        (0x1040 + (i) * 4)
-#define NVE4_CP_INPUT_TEX_STRIDE     4
-#define NVE4_CP_INPUT_TEX_MAX        32
-#define NVE4_CP_INPUT_MS_OFFSETS     0x10c0
-#define NVE4_CP_INPUT_SUF_STRIDE     64
-#define NVE4_CP_INPUT_SUF(i)        (0x1100 + (i) * NVE4_CP_INPUT_SUF_STRIDE)
-#define NVE4_CP_INPUT_SUF_MAX        32
-#define NVE4_CP_INPUT_TRAP_INFO_PTR  0x1900
-#define NVE4_CP_INPUT_TEMP_PTR       0x1908
-#define NVE4_CP_INPUT_MP_TEMP_SIZE   0x1910
-#define NVE4_CP_INPUT_WARP_TEMP_SIZE 0x1914
-#define NVE4_CP_INPUT_CSTACK_SIZE    0x1918
-#define NVE4_CP_INPUT_SIZE           0x1a00
-#define NVE4_CP_PARAM_TRAP_INFO      0x2000
-#define NVE4_CP_PARAM_TRAP_INFO_SZ  (1 << 16)
-#define NVE4_CP_PARAM_SIZE           (NVE4_CP_PARAM_TRAP_INFO + (1 << 16))
-
 struct nve4_cp_launch_desc
 {
    u32 unk0[8];
@@ -81,7 +56,7 @@ static inline void
 nve4_cp_launch_desc_set_cb(struct nve4_cp_launch_desc *desc,
                            unsigned index,
                            struct nouveau_bo *bo,
-                           uint32_t base, uint16_t size)
+                           uint32_t base, uint32_t size)
 {
    uint64_t address = bo->offset + base;
 
@@ -95,23 +70,6 @@ nve4_cp_launch_desc_set_cb(struct nve4_cp_launch_desc *desc,
    desc->cb_mask |= 1 << index;
 }
 
-static inline void
-nve4_cp_launch_desc_set_ctx_cb(struct nve4_cp_launch_desc *desc,
-                               unsigned index,
-                               const struct nvc0_constbuf *cb)
-{
-   assert(index < 8);
-
-   if (!cb->u.buf) {
-      desc->cb_mask &= ~(1 << index);
-   } else {
-      const struct nv04_resource *buf = nv04_resource(cb->u.buf);
-      assert(!cb->user);
-      nve4_cp_launch_desc_set_cb(desc, index,
-                                 buf->bo, buf->offset + cb->offset, cb->size);
-   }
-}
-
 struct nve4_mp_trap_info {
    u32 lock;
    u32 pc;
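The uint16_t to uint32_t widening in nve4_cp_launch_desc_set_cb() is not cosmetic: the compute code now binds cb0 with size 1 << 16, a value a 16-bit parameter silently truncates. A minimal, self-contained illustration:

   #include <stdint.h>
   #include <stdio.h>

   int main(void)
   {
      uint16_t old_size = 1 << 16;  /* wraps to 0: a zero-sized cb0 */
      uint32_t new_size = 1 << 16;  /* 65536, the intended 64 KiB window */
      printf("%u %u\n", old_size, new_size);  /* prints "0 65536" */
      return 0;
   }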
diff --git a/lib/mesa/src/gallium/drivers/nouveau/nvc0/nve4_compute.xml.h b/lib/mesa/src/gallium/drivers/nouveau/nvc0/nve4_compute.xml.h
index 3fff1122b..320185805 100644
--- a/lib/mesa/src/gallium/drivers/nouveau/nvc0/nve4_compute.xml.h
+++ b/lib/mesa/src/gallium/drivers/nouveau/nvc0/nve4_compute.xml.h
@@ -294,6 +294,9 @@ WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
 #define NVE4_COMPUTE_MP_PM_A_SIGSEL_LAUNCH                     0x00000003
 #define NVE4_COMPUTE_MP_PM_A_SIGSEL_EXEC                       0x00000004
 #define NVE4_COMPUTE_MP_PM_A_SIGSEL_ISSUE                      0x00000005
+#define NVE4_COMPUTE_MP_PM_A_SIGSEL_UNK11                      0x00000011
+#define NVE4_COMPUTE_MP_PM_A_SIGSEL_UNK14                      0x00000014
+#define NVE4_COMPUTE_MP_PM_A_SIGSEL_UNK1A                      0x0000001a
 #define NVE4_COMPUTE_MP_PM_A_SIGSEL_LDST                       0x0000001b
 #define NVE4_COMPUTE_MP_PM_A_SIGSEL_BRANCH                     0x0000001c
@@ -307,6 +310,7 @@ WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
 #define NVE4_COMPUTE_MP_PM_B_SIGSEL_UNK0F                      0x0000000f
 #define NVE4_COMPUTE_MP_PM_B_SIGSEL_L1                         0x00000010
 #define NVE4_COMPUTE_MP_PM_B_SIGSEL_MEM                        0x00000011
+#define NVE4_COMPUTE_MP_PM_B_SIGSEL_UNK13                      0x00000013
 
 #define NVE4_COMPUTE_MP_PM_SRCSEL(i0)                  (0x0000339c + 0x4*(i0))
 #define NVE4_COMPUTE_MP_PM_SRCSEL__ESIZE                       0x00000004