author     Jonathan Gray <jsg@cvs.openbsd.org>       2020-01-22 02:13:05 +0000
committer  Jonathan Gray <jsg@cvs.openbsd.org>       2020-01-22 02:13:05 +0000
commit     01fb7c3270d8d1e1c53129a974587680aa129089 (patch)
tree       68033daecea5da5fcb45de5cbef65b8b3fc92845
parent     53b0736c56ca5142a5722eb827a3675ca08e123d (diff)
Import Mesa 19.2.8
-rw-r--r--  lib/mesa/src/mesa/math/m_vector_asm.h            |    2
-rw-r--r--  lib/mesa/src/panfrost/bifrost/bifrost.h          |  627
-rw-r--r--  lib/mesa/src/panfrost/bifrost/bifrost_compile.c  | 5809
-rw-r--r--  lib/mesa/src/panfrost/bifrost/bifrost_compile.h  |   72
-rw-r--r--  lib/mesa/src/panfrost/bifrost/cmdline.c          |  286
-rw-r--r--  lib/mesa/src/panfrost/bifrost/disassemble.c      | 2292
-rw-r--r--  lib/mesa/src/panfrost/bifrost/disassemble.h      |   20
-rw-r--r--  lib/mesa/src/panfrost/bifrost/meson.build        |  170
8 files changed, 2799 insertions, 6479 deletions
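
Context for the first hunk below: m_vector_asm.h defines byte offsets (MATRIX_M, MATRIX_INV) used when the matrix structure is accessed from assembly paths, and the change replaces the hard-coded "16 * 4" offset of the inverse-matrix pointer with MATH_ASM_PTR_SIZE. The following minimal C sketch shows why the offset must track pointer size rather than the size of 16 floats; the struct layout and the sizeof-based definition of MATH_ASM_PTR_SIZE are illustrative assumptions for the sketch, not Mesa's exact declarations.

#include <assert.h>
#include <stddef.h>

/* Illustrative stand-in for the matrix struct indexed by byte offsets:
 * both the matrix data and its inverse are stored as pointers, so the
 * second field sits one pointer past the first, not 16 * 4 bytes. */
struct matrix_like {
	float *m;	/* 4x4 matrix data */
	float *inv;	/* inverse matrix data */
};

#define MATH_ASM_PTR_SIZE sizeof(void *)	/* illustrative definition */

#define MATRIX_M   0
#define MATRIX_INV (MATRIX_M + MATH_ASM_PTR_SIZE)

int
main(void)
{
	/* On an LP64 build, MATRIX_M + 16 * 4 would be 64 and point past
	 * both pointer fields; the pointer-size offset lands on inv. */
	assert(offsetof(struct matrix_like, inv) == MATRIX_INV);
	return 0;
}
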
diff --git a/lib/mesa/src/mesa/math/m_vector_asm.h b/lib/mesa/src/mesa/math/m_vector_asm.h index 90de44b0a..60cf1ec8f 100644 --- a/lib/mesa/src/mesa/math/m_vector_asm.h +++ b/lib/mesa/src/mesa/math/m_vector_asm.h @@ -52,6 +52,6 @@ * _math_matrix_set_identity(). */ #define MATRIX_M 0 -#define MATRIX_INV (MATRIX_M + 16 * 4) +#define MATRIX_INV (MATRIX_M + MATH_ASM_PTR_SIZE) #endif /* _M_VECTOR_ASM_H */ diff --git a/lib/mesa/src/panfrost/bifrost/bifrost.h b/lib/mesa/src/panfrost/bifrost/bifrost.h index 9d95de562..aa382b43b 100644 --- a/lib/mesa/src/panfrost/bifrost/bifrost.h +++ b/lib/mesa/src/panfrost/bifrost/bifrost.h @@ -28,607 +28,58 @@ #include <stdint.h> #include <stdbool.h> -#include <string.h> -#include <assert.h> - -#ifdef __cplusplus -extern "C" { -#endif - -#define BIFROST_DBG_MSGS 0x0001 -#define BIFROST_DBG_SHADERS 0x0002 -#define BIFROST_DBG_SHADERDB 0x0004 -#define BIFROST_DBG_VERBOSE 0x0008 -#define BIFROST_DBG_INTERNAL 0x0010 -#define BIFROST_DBG_NOSCHED 0x0020 -#define BIFROST_DBG_INORDER 0x0040 -#define BIFROST_DBG_NOVALIDATE 0x0080 -#define BIFROST_DBG_NOOPT 0x0100 -#define BIFROST_DBG_NOIDVS 0x0200 -#define BIFROST_DBG_NOSB 0x0400 -#define BIFROST_DBG_NOPRELOAD 0x0800 -#define BIFROST_DBG_SPILL 0x1000 -#define BIFROST_DBG_NOPSCHED 0x2000 - -extern int bifrost_debug; - -enum bifrost_message_type { - BIFROST_MESSAGE_NONE = 0, - BIFROST_MESSAGE_VARYING = 1, - BIFROST_MESSAGE_ATTRIBUTE = 2, - BIFROST_MESSAGE_TEX = 3, - BIFROST_MESSAGE_VARTEX = 4, - BIFROST_MESSAGE_LOAD = 5, - BIFROST_MESSAGE_STORE = 6, - BIFROST_MESSAGE_ATOMIC = 7, - BIFROST_MESSAGE_BARRIER = 8, - BIFROST_MESSAGE_BLEND = 9, - BIFROST_MESSAGE_TILE = 10, - /* type 11 reserved */ - BIFROST_MESSAGE_Z_STENCIL = 12, - BIFROST_MESSAGE_ATEST = 13, - BIFROST_MESSAGE_JOB = 14, - BIFROST_MESSAGE_64BIT = 15 -}; - -enum bifrost_ftz { - BIFROST_FTZ_DISABLE = 0, - BIFROST_FTZ_DX11 = 1, - BIFROST_FTZ_ALWAYS = 2, - BIFROST_FTZ_ABRUPT = 3 -}; - -enum bifrost_exceptions { - BIFROST_EXCEPTIONS_ENABLED = 0, - BIFROST_EXCEPTIONS_DISABLED = 1, - BIFROST_EXCEPTIONS_PRECISE_DIVISION = 2, - BIFROST_EXCEPTIONS_PRECISE_SQRT = 3, -}; - -/* Describes clause flow control, with respect to control flow and branch - * reconvergence. - * - * Control flow may be considered back-to-back (execute clauses back-to-back), - * non-back-to-back (switch warps after clause before the next clause), write - * elision (back-to-back and elide register slot #3 write from the clause), or - * end of shader. - * - * Branch reconvergence may be disabled, enabled unconditionally, or enabled - * based on the program counter. A clause requires reconvergence if it has a - * successor that can be executed without first executing the clause itself. - * Separate iterations of a loop are treated separately here, so it is also the - * case for a loop exit where the iteration count is not warp-invariant. 
- * - */ - -enum bifrost_flow { - /* End-of-shader */ - BIFROST_FLOW_END = 0, - - /* Non back-to-back, PC-encoded reconvergence */ - BIFROST_FLOW_NBTB_PC = 1, - - /* Non back-to-back, unconditional reconvergence */ - BIFROST_FLOW_NBTB_UNCONDITIONAL = 2, - - /* Non back-to-back, no reconvergence */ - BIFROST_FLOW_NBTB = 3, - - /* Back-to-back, unconditional reconvergence */ - BIFROST_FLOW_BTB_UNCONDITIONAL = 4, - - /* Back-to-back, no reconvergence */ - BIFROST_FLOW_BTB_NONE = 5, - - /* Write elision, unconditional reconvergence */ - BIFROST_FLOW_WE_UNCONDITIONAL = 6, - - /* Write elision, no reconvergence */ - BIFROST_FLOW_WE = 7, -}; - -enum bifrost_slot { - /* 0-5 are general purpose */ - BIFROST_SLOT_ELDEST_DEPTH = 6, - BIFROST_SLOT_ELDEST_COLOUR = 7, -}; struct bifrost_header { - /* Reserved */ - unsigned zero1 : 5; - - /* Flush-to-zero mode, leave zero for GL */ - enum bifrost_ftz flush_to_zero : 2; - - /* Convert any infinite result of any floating-point operation to the - * biggest representable number */ + unsigned unk0 : 7; + // If true, convert any infinite result of any floating-point operation to + // the biggest representable number. unsigned suppress_inf: 1; - - /* Convert NaN to +0.0 */ + // Convert any NaN results to 0. unsigned suppress_nan : 1; - - /* Floating-point excception handling mode */ - enum bifrost_exceptions float_exceptions : 2; - - /* Enum describing the flow control, which matters for handling - * divergence and reconvergence efficiently */ - enum bifrost_flow flow_control : 3; - - /* Reserved */ - unsigned zero2 : 1; - - /* Terminate discarded threads, rather than continuing execution. Set - * for fragment shaders for standard GL behaviour of DISCARD. Also in a - * fragment shader, this disables helper invocations, so cannot be used - * in a shader that requires derivatives or texture LOD computation */ - unsigned terminate_discarded_threads : 1; - - /* If set, the hardware may prefetch the next clause. If false, the - * hardware may not. Clear for unconditional branches. */ - unsigned next_clause_prefetch : 1; - - /* If set, a barrier will be inserted after the clause waiting for all - * message passing instructions to read their staging registers, such - * that it is safe for the next clause to write them. */ - unsigned staging_barrier: 1; - unsigned staging_register : 6; - - /* Slots to wait on and slot to be used for message passing - * instructions respectively */ - unsigned dependency_wait : 8; - unsigned dependency_slot : 3; - - enum bifrost_message_type message_type : 5; - enum bifrost_message_type next_message_type : 5; -} __attribute__((packed)); - -enum bifrost_packed_src { - BIFROST_SRC_PORT0 = 0, - BIFROST_SRC_PORT1 = 1, - BIFROST_SRC_PORT2 = 2, - BIFROST_SRC_STAGE = 3, - BIFROST_SRC_FAU_LO = 4, - BIFROST_SRC_FAU_HI = 5, - BIFROST_SRC_PASS_FMA = 6, - BIFROST_SRC_PASS_ADD = 7, + unsigned unk1 : 2; + // true if the execution mask of the next clause is the same as the mask of + // the current clause. + unsigned back_to_back : 1; + unsigned no_end_of_shader: 1; + unsigned unk2 : 2; + // Set to true for fragment shaders, to implement this bit of spec text + // from section 7.1.5 of the GLSL ES spec: + // + // "Stores to image and buffer variables performed by helper invocations + // have no effect on the underlying image or buffer memory." + // + // Helper invocations are threads (invocations) corresponding to pixels in + // a quad that aren't actually part of the triangle, but are included to + // make derivatives work correctly. 
They're usually turned on, but they + // need to be masked off for GLSL-level stores. This bit seems to be the + // only bit that's actually different between fragment shaders and other + // shaders, so this is probably what it's doing. + unsigned elide_writes : 1; + // If backToBack is off: + // - true for conditional branches and fallthrough + // - false for unconditional branches + // The blob seems to always set it to true if back-to-back is on. + unsigned branch_cond : 1; + // This bit is set when the next clause writes to the data register of some + // previous clause. + unsigned datareg_writebarrier: 1; + unsigned datareg : 6; + unsigned scoreboard_deps: 8; + unsigned scoreboard_index: 3; + unsigned clause_type: 4; + unsigned unk3 : 1; // part of clauseType? + unsigned next_clause_type: 4; + unsigned unk4 : 1; // part of nextClauseType? }; struct bifrost_fma_inst { unsigned src0 : 3; unsigned op : 20; -} __attribute__((packed)); +}; struct bifrost_add_inst { unsigned src0 : 3; unsigned op : 17; -} __attribute__((packed)); - -enum branch_bit_size { - BR_SIZE_32 = 0, - BR_SIZE_16XX = 1, - BR_SIZE_16YY = 2, - // For the above combinations of bitsize and location, an extra bit is - // encoded via comparing the sources. The only possible source of ambiguity - // would be if the sources were the same, but then the branch condition - // would be always true or always false anyways, so we can ignore it. But - // this no longer works when comparing the y component to the x component, - // since it's valid to compare the y component of a source against its own - // x component. Instead, the extra bit is encoded via an extra bitsize. - BR_SIZE_16YX0 = 3, - BR_SIZE_16YX1 = 4, - BR_SIZE_32_AND_16X = 5, - BR_SIZE_32_AND_16Y = 6, - // Used for comparisons with zero and always-true, see below. I think this - // only works for integer comparisons. 
- BR_SIZE_ZERO = 7, -}; - -struct bifrost_regs { - unsigned fau_idx : 8; - unsigned reg3 : 6; - unsigned reg2 : 6; - unsigned reg0 : 5; - unsigned reg1 : 6; - unsigned ctrl : 4; -} __attribute__((packed)); - -#define BIFROST_FMTC_CONSTANTS 0b0011 -#define BIFROST_FMTC_FINAL 0b0111 - -struct bifrost_fmt_constant { - unsigned pos : 4; - unsigned tag : 4; - uint64_t imm_1 : 60; - uint64_t imm_2 : 60; -} __attribute__((packed)); - -/* Clause formats, encoded in a table */ - -enum bi_clause_subword { - /* Literal 3-bit values */ - BI_CLAUSE_SUBWORD_LITERAL_0 = 0, - /* etc */ - BI_CLAUSE_SUBWORD_LITERAL_7 = 7, - - /* The value of the corresponding tuple in the corresponding bits */ - BI_CLAUSE_SUBWORD_TUPLE_0 = 8, - /* etc */ - BI_CLAUSE_SUBWORD_TUPLE_7 = 15, - - /* Clause header */ - BI_CLAUSE_SUBWORD_HEADER = 16, - - /* Leave zero, but semantically distinct from literal 0 */ - BI_CLAUSE_SUBWORD_RESERVED = 17, - - /* Embedded constant 0 */ - BI_CLAUSE_SUBWORD_CONSTANT = 18, - - /* M bits controlling modifier for the constant */ - BI_CLAUSE_SUBWORD_M = 19, - - /* Z bit: 1 to begin encoding constants, 0 to terminate the clause */ - BI_CLAUSE_SUBWORD_Z = 20, - - /* Upper 3-bits of a given tuple and zero extended */ - BI_CLAUSE_SUBWORD_UPPER_0 = 32, - /* etc */ - BI_CLAUSE_SUBWORD_UPPER_7 = BI_CLAUSE_SUBWORD_UPPER_0 + 7, - - /* Upper 3-bits of two tuples, concatenated and zero-extended */ - BI_CLAUSE_SUBWORD_UPPER_23 = BI_CLAUSE_SUBWORD_UPPER_0 + 23, - BI_CLAUSE_SUBWORD_UPPER_56 = BI_CLAUSE_SUBWORD_UPPER_0 + 56, -}; - -#define L(x) ((enum bi_clause_subword)(BI_CLAUSE_SUBWORD_LITERAL_0 + x)) -#define U(x) ((enum bi_clause_subword)(BI_CLAUSE_SUBWORD_UPPER_0 + x)) -#define T(x) ((enum bi_clause_subword)(BI_CLAUSE_SUBWORD_TUPLE_0 + x)) -#define EC BI_CLAUSE_SUBWORD_CONSTANT -#define M BI_CLAUSE_SUBWORD_M -#define Z BI_CLAUSE_SUBWORD_Z -#define H BI_CLAUSE_SUBWORD_HEADER -#define R BI_CLAUSE_SUBWORD_RESERVED - -struct bi_clause_format { - unsigned format; /* format number */ - unsigned pos; /* index in the clause */ - enum bi_clause_subword tag_1; /* 2-bits */ - enum bi_clause_subword tag_2; /* 3-bits */ - enum bi_clause_subword tag_3; /* 3-bits */ - enum bi_clause_subword s0_s3; /* 60 bits */ - enum bi_clause_subword s4; /* 15 bits */ - enum bi_clause_subword s5_s6; /* 30 bits */ - enum bi_clause_subword s7; /* 15 bits */ -}; - -static const struct bi_clause_format bi_clause_formats[] = { - { 0, 0, L(0), L(5), U(0), T(0), T(0), H, H }, - { 0, 0, Z, L(1), U(0), T(0), T(0), H, H }, - { 1, 1, Z, L(0), L(3), T(1), T(1), R, U(1) }, - { 2, 1, L(0), L(4), U(1), T(1), T(1), T(2), T(2) }, - { 3, 2, Z, L(0), L(4), EC, M, T(2), U(2) }, - { 4, 2, L(0), L(0), L(1), T(3), T(3), T(2), U(23) }, - { 4, 2, Z, L(0), L(5), T(3), T(3), T(2), U(23) }, - { 5, 2, L(2), U(3), U(2), T(3), T(3), T(2), EC }, - { 6, 3, Z, L(2), U(4), T(4), T(4), EC, EC }, - { 7, 3, L(1), L(4), U(4), T(4), T(4), T(5), T(5) }, - { 8, 4, Z, L(0), L(6), EC, M, T(5), U(5) }, - { 9, 4, Z, L(0), L(7), T(6), T(6), T(5), U(56) }, - { 10, 4, L(3), U(6), U(5), T(6), T(6), T(5), EC }, - { 11, 5, Z, L(3), U(7), T(7), T(7), EC, EC }, -}; - -#undef L -#undef U -#undef T -#undef EC -#undef M -#undef Z -#undef H -#undef R - -/* 32-bit modes for slots 2/3, as encoded in the register block. Other values - * are reserved. First part specifies behaviour of slot 2 (Idle, Read, Write - * Full, Write Low, Write High), second part behaviour of slot 3, and the last - * part specifies the source for the write (FMA, ADD, or MIX for FMA/ADD). 
- * - * IDLE is a special mode disabling both slots, except for the first - * instruction in the clause which uses IDLE_1 for the same purpose. - * - * All fields 0 used as sentinel for reserved encoding, so IDLE(_1) have FMA - * set (and ignored) as a placeholder to differentiate from reserved. - */ -enum bifrost_reg_mode { - BIFROST_R_WL_FMA = 1, - BIFROST_R_WH_FMA = 2, - BIFROST_R_W_FMA = 3, - BIFROST_R_WL_ADD = 4, - BIFROST_R_WH_ADD = 5, - BIFROST_R_W_ADD = 6, - BIFROST_WL_WL_ADD = 7, - BIFROST_WL_WH_ADD = 8, - BIFROST_WL_W_ADD = 9, - BIFROST_WH_WL_ADD = 10, - BIFROST_WH_WH_ADD = 11, - BIFROST_WH_W_ADD = 12, - BIFROST_W_WL_ADD = 13, - BIFROST_W_WH_ADD = 14, - BIFROST_W_W_ADD = 15, - BIFROST_IDLE_1 = 16, - BIFROST_I_W_FMA = 17, - BIFROST_I_WL_FMA = 18, - BIFROST_I_WH_FMA = 19, - BIFROST_R_I = 20, - BIFROST_I_W_ADD = 21, - BIFROST_I_WL_ADD = 22, - BIFROST_I_WH_ADD = 23, - BIFROST_WL_WH_MIX = 24, - BIFROST_WH_WL_MIX = 26, - BIFROST_IDLE = 27, }; -enum bifrost_reg_op { - BIFROST_OP_IDLE = 0, - BIFROST_OP_READ = 1, - BIFROST_OP_WRITE = 2, - BIFROST_OP_WRITE_LO = 3, - BIFROST_OP_WRITE_HI = 4, -}; - -struct bifrost_reg_ctrl_23 { - enum bifrost_reg_op slot2; - enum bifrost_reg_op slot3; - bool slot3_fma; -}; - -#ifndef __cplusplus -static const struct bifrost_reg_ctrl_23 bifrost_reg_ctrl_lut[32] = { - [BIFROST_R_WL_FMA] = { BIFROST_OP_READ, BIFROST_OP_WRITE_LO, true }, - [BIFROST_R_WH_FMA] = { BIFROST_OP_READ, BIFROST_OP_WRITE_HI, true }, - [BIFROST_R_W_FMA] = { BIFROST_OP_READ, BIFROST_OP_WRITE, true }, - [BIFROST_R_WL_ADD] = { BIFROST_OP_READ, BIFROST_OP_WRITE_LO, false }, - [BIFROST_R_WH_ADD] = { BIFROST_OP_READ, BIFROST_OP_WRITE_HI, false }, - [BIFROST_R_W_ADD] = { BIFROST_OP_READ, BIFROST_OP_WRITE, false }, - [BIFROST_WL_WL_ADD] = { BIFROST_OP_WRITE_LO, BIFROST_OP_WRITE_LO, false }, - [BIFROST_WL_WH_ADD] = { BIFROST_OP_WRITE_LO, BIFROST_OP_WRITE_HI, false }, - [BIFROST_WL_W_ADD] = { BIFROST_OP_WRITE_LO, BIFROST_OP_WRITE, false }, - [BIFROST_WH_WL_ADD] = { BIFROST_OP_WRITE_HI, BIFROST_OP_WRITE_LO, false }, - [BIFROST_WH_WH_ADD] = { BIFROST_OP_WRITE_HI, BIFROST_OP_WRITE_HI, false }, - [BIFROST_WH_W_ADD] = { BIFROST_OP_WRITE_HI, BIFROST_OP_WRITE, false }, - [BIFROST_W_WL_ADD] = { BIFROST_OP_WRITE, BIFROST_OP_WRITE_LO, false }, - [BIFROST_W_WH_ADD] = { BIFROST_OP_WRITE, BIFROST_OP_WRITE_HI, false }, - [BIFROST_W_W_ADD] = { BIFROST_OP_WRITE, BIFROST_OP_WRITE, false }, - [BIFROST_IDLE_1] = { BIFROST_OP_IDLE, BIFROST_OP_IDLE, true }, - [BIFROST_I_W_FMA] = { BIFROST_OP_IDLE, BIFROST_OP_WRITE, true }, - [BIFROST_I_WL_FMA] = { BIFROST_OP_IDLE, BIFROST_OP_WRITE_LO, true }, - [BIFROST_I_WH_FMA] = { BIFROST_OP_IDLE, BIFROST_OP_WRITE_HI, true }, - [BIFROST_R_I] = { BIFROST_OP_READ, BIFROST_OP_IDLE, false }, - [BIFROST_I_W_ADD] = { BIFROST_OP_IDLE, BIFROST_OP_WRITE, false }, - [BIFROST_I_WL_ADD] = { BIFROST_OP_IDLE, BIFROST_OP_WRITE_LO, false }, - [BIFROST_I_WH_ADD] = { BIFROST_OP_IDLE, BIFROST_OP_WRITE_HI, false }, - [BIFROST_WL_WH_MIX] = { BIFROST_OP_WRITE_LO, BIFROST_OP_WRITE_HI, false }, - [BIFROST_WH_WL_MIX] = { BIFROST_OP_WRITE_HI, BIFROST_OP_WRITE_LO, false }, - [BIFROST_IDLE] = { BIFROST_OP_IDLE, BIFROST_OP_IDLE, true }, -}; -#endif - -/* Texture operator descriptors in various states. 
Usually packed in the - * compiler and stored as a constant */ - -enum bifrost_texture_operation_mode { - /* Dual texturing */ - BIFROST_TEXTURE_OPERATION_DUAL = 1, - - /* Single texturing */ - BIFROST_TEXTURE_OPERATION_SINGLE = 3, -}; - -enum bifrost_index { - /* Both texture/sampler index immediate */ - BIFROST_INDEX_IMMEDIATE_SHARED = 0, - - /* Sampler index immediate, texture index from staging */ - BIFROST_INDEX_IMMEDIATE_SAMPLER = 1, - - /* Texture index immediate, sampler index from staging */ - BIFROST_INDEX_IMMEDIATE_TEXTURE = 2, - - /* Both indices from (separate) staging registers */ - BIFROST_INDEX_REGISTER = 3, -}; - -enum bifrost_tex_op { - /* Given explicit derivatives, compute a gradient descriptor */ - BIFROST_TEX_OP_GRDESC_DER = 4, - - /* Given implicit derivatives (texture coordinates in a fragment - * shader), compute a gradient descriptor */ - BIFROST_TEX_OP_GRDESC = 5, - - /* Fetch a texel. Takes a staging register with LOD level / face index - * packed 16:16 */ - BIFROST_TEX_OP_FETCH = 6, - - /* Filtered texture */ - BIFROST_TEX_OP_TEX = 7, -}; - -enum bifrost_lod_mode { - /* Takes two staging registers forming a 64-bit gradient descriptor - * (computed by a previous GRDESC or GRDESC_DER operation) */ - BIFROST_LOD_MODE_GRDESC = 3, - - /* Take a staging register with 8:8 fixed-point in bottom 16-bits - * specifying an explicit LOD */ - BIFROST_LOD_MODE_EXPLICIT = 4, - - /* Takes a staging register with bottom 16-bits as 8:8 fixed-point LOD - * bias and top 16-bit as 8:8 fixed-point lower bound (generally left - * zero), added and clamped to a computed LOD */ - BIFROST_LOD_MODE_BIAS = 5, - - /* Set LOD to zero */ - BIFROST_LOD_MODE_ZERO = 6, - - /* Compute LOD */ - BIFROST_LOD_MODE_COMPUTE = 7, -}; - -enum bifrost_texture_format { - /* 16-bit floating point, with optional clamping */ - BIFROST_TEXTURE_FORMAT_F16 = 0, - BIFROST_TEXTURE_FORMAT_F16_POS = 1, - BIFROST_TEXTURE_FORMAT_F16_PM1 = 2, - BIFROST_TEXTURE_FORMAT_F16_1 = 3, - - /* 32-bit floating point, with optional clamping */ - BIFROST_TEXTURE_FORMAT_F32 = 4, - BIFROST_TEXTURE_FORMAT_F32_POS = 5, - BIFROST_TEXTURE_FORMAT_F32_PM1 = 6, - BIFROST_TEXTURE_FORMAT_F32_1 = 7, -}; - -enum bifrost_texture_format_full { - /* Transclude bifrost_texture_format from above */ - - /* Integers, unclamped */ - BIFROST_TEXTURE_FORMAT_U16 = 12, - BIFROST_TEXTURE_FORMAT_S16 = 13, - BIFROST_TEXTURE_FORMAT_U32 = 14, - BIFROST_TEXTURE_FORMAT_S32 = 15, -}; - -enum bifrost_texture_fetch { - /* Default texelFetch */ - BIFROST_TEXTURE_FETCH_TEXEL = 1, - - /* Deprecated, fetches 4x U32 of a U8 x 4 texture. Do not use. */ - BIFROST_TEXTURE_FETCH_GATHER4_RGBA = 3, - - /* Gathers */ - BIFROST_TEXTURE_FETCH_GATHER4_R = 4, - BIFROST_TEXTURE_FETCH_GATHER4_G = 5, - BIFROST_TEXTURE_FETCH_GATHER4_B = 6, - BIFROST_TEXTURE_FETCH_GATHER4_A = 7 -}; - -struct bifrost_texture_operation { - /* If immediate_indices is set: - * - immediate sampler index - * - index used as texture index - * Otherwise: - * - bifrost_single_index in lower 2 bits - * - 0x3 in upper 2 bits (single-texturing) - */ - unsigned sampler_index_or_mode : 4; - unsigned index : 7; - bool immediate_indices : 1; - enum bifrost_tex_op op : 3; - - /* If set for TEX/FETCH, loads texel offsets and multisample index from - * a staging register containing offset_x:offset_y:offset_z:ms_index - * packed 8:8:8:8. Offsets must be in [-31, +31]. If set for - * GRDESC(_DER), disable LOD bias. 
*/ - bool offset_or_bias_disable : 1; - - /* If set for TEX/FETCH, loads fp32 shadow comparison value from a - * staging register. Implies fetch_component = gather4_r. If set for - * GRDESC(_DER), disables LOD clamping. */ - bool shadow_or_clamp_disable : 1; - - /* If set, loads an uint32 array index from a staging register. */ - bool array : 1; - - /* Texture dimension, or 0 for a cubemap */ - unsigned dimension : 2; - - /* Method to compute LOD value or for a FETCH, the - * bifrost_texture_fetch component specification */ - enum bifrost_lod_mode lod_or_fetch : 3; - - /* Reserved */ - unsigned zero : 1; - - /* Register format for the result */ - enum bifrost_texture_format_full format : 4; - - /* Write mask for the result */ - unsigned mask : 4; -} __attribute__((packed)); - -struct bifrost_dual_texture_operation { - unsigned primary_sampler_index : 2; - unsigned mode : 2; /* 0x1 for dual */ - unsigned primary_texture_index : 2; - unsigned secondary_sampler_index : 2; - unsigned secondary_texture_index : 2; - - /* Leave zero for dual texturing */ - unsigned reserved : 1; - unsigned index_mode_zero : 1; - - /* Base staging register to write the secondary results to */ - unsigned secondary_register : 6; - - /* Format/mask for each texture */ - enum bifrost_texture_format secondary_format : 3; - unsigned secondary_mask : 4; - - enum bifrost_texture_format primary_format : 3; - unsigned primary_mask : 4; -} __attribute__((packed)); - -static inline uint32_t -bi_dual_tex_as_u32(struct bifrost_dual_texture_operation desc) -{ - uint32_t desc_u; - memcpy(&desc_u, &desc, sizeof(desc)); - - return desc_u; -} - -#define BIFROST_MEGA_SAMPLE 128 -#define BIFROST_ALL_SAMPLES 255 -#define BIFROST_CURRENT_PIXEL 255 - -struct bifrost_pixel_indices { - unsigned sample : 8; - unsigned rt : 8; - unsigned x : 8; - unsigned y : 8; -} __attribute__((packed)); - -enum bi_constmod { - BI_CONSTMOD_NONE, - BI_CONSTMOD_PC_LO, - BI_CONSTMOD_PC_HI, - BI_CONSTMOD_PC_LO_HI -}; - -struct bi_constants { - /* Raw constant values */ - uint64_t raw[6]; - - /* Associated modifier derived from M values */ - enum bi_constmod mods[6]; -}; - -/* FAU selectors for constants are out-of-order, construct the top bits - * here given a embedded constant index in a clause */ - -static inline unsigned -bi_constant_field(unsigned idx) -{ - const unsigned values[] = { - 4, 5, 6, 7, 2, 3 - }; - - assert(idx <= 5); - return values[idx] << 4; -} - -#ifdef __cplusplus -} /* extern C */ -#endif - #endif diff --git a/lib/mesa/src/panfrost/bifrost/bifrost_compile.c b/lib/mesa/src/panfrost/bifrost/bifrost_compile.c index f0aab763e..061eab11a 100644 --- a/lib/mesa/src/panfrost/bifrost/bifrost_compile.c +++ b/lib/mesa/src/panfrost/bifrost/bifrost_compile.c @@ -1,6 +1,5 @@ /* - * Copyright (C) 2020 Collabora Ltd. - * Copyright (C) 2022 Alyssa Rosenzweig <alyssa@rosenzweig.io> + * Copyright (C) 2019 Ryan Houdek <Sonicadvance1@gmail.com> * * Permission is hereby granted, free of charge, to any person obtaining a * copy of this software and associated documentation files (the "Software"), @@ -20,4115 +19,842 @@ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE * SOFTWARE. 
- * - * Authors (Collabora): - * Alyssa Rosenzweig <alyssa.rosenzweig@collabora.com> */ -#include "compiler/glsl/glsl_to_nir.h" -#include "compiler/nir_types.h" #include "compiler/nir/nir_builder.h" -#include "compiler/nir/nir_schedule.h" -#include "util/u_debug.h" - -#include "disassemble.h" -#include "valhall/va_compiler.h" -#include "valhall/disassemble.h" #include "bifrost_compile.h" -#include "compiler.h" -#include "valhall/va_compiler.h" -#include "bi_quirks.h" -#include "bi_builder.h" -#include "bifrost_nir.h" - -static const struct debug_named_value bifrost_debug_options[] = { - {"msgs", BIFROST_DBG_MSGS, "Print debug messages"}, - {"shaders", BIFROST_DBG_SHADERS, "Dump shaders in NIR and MIR"}, - {"shaderdb", BIFROST_DBG_SHADERDB, "Print statistics"}, - {"verbose", BIFROST_DBG_VERBOSE, "Disassemble verbosely"}, - {"internal", BIFROST_DBG_INTERNAL, "Dump even internal shaders"}, - {"nosched", BIFROST_DBG_NOSCHED, "Force trivial bundling"}, - {"nopsched", BIFROST_DBG_NOPSCHED, "Disable scheduling for pressure"}, - {"inorder", BIFROST_DBG_INORDER, "Force in-order bundling"}, - {"novalidate",BIFROST_DBG_NOVALIDATE, "Skip IR validation"}, - {"noopt", BIFROST_DBG_NOOPT, "Skip optimization passes"}, - {"noidvs", BIFROST_DBG_NOIDVS, "Disable IDVS"}, - {"nosb", BIFROST_DBG_NOSB, "Disable scoreboarding"}, - {"nopreload", BIFROST_DBG_NOPRELOAD, "Disable message preloading"}, - {"spill", BIFROST_DBG_SPILL, "Test register spilling"}, - DEBUG_NAMED_VALUE_END -}; - -DEBUG_GET_ONCE_FLAGS_OPTION(bifrost_debug, "BIFROST_MESA_DEBUG", bifrost_debug_options, 0) - -/* How many bytes are prefetched by the Bifrost shader core. From the final - * clause of the shader, this range must be valid instructions or zero. */ -#define BIFROST_SHADER_PREFETCH 128 - -int bifrost_debug = 0; - -#define DBG(fmt, ...) \ - do { if (bifrost_debug & BIFROST_DBG_MSGS) \ - fprintf(stderr, "%s:%d: "fmt, \ - __FUNCTION__, __LINE__, ##__VA_ARGS__); } while (0) - -static bi_block *emit_cf_list(bi_context *ctx, struct exec_list *list); - -static bi_index -bi_preload(bi_builder *b, unsigned reg) -{ - if (bi_is_null(b->shader->preloaded[reg])) { - /* Insert at the beginning of the shader */ - bi_builder b_ = *b; - b_.cursor = bi_before_block(bi_start_block(&b->shader->blocks)); - - /* Cache the result */ - b->shader->preloaded[reg] = bi_mov_i32(&b_, bi_register(reg)); - } - - return b->shader->preloaded[reg]; -} - -static bi_index -bi_coverage(bi_builder *b) -{ - if (bi_is_null(b->shader->coverage)) - b->shader->coverage = bi_preload(b, 60); - - return b->shader->coverage; -} - -/* - * Vertex ID and Instance ID are preloaded registers. Where they are preloaded - * changed from Bifrost to Valhall. Provide helpers that smooth over the - * architectural difference. - */ -static inline bi_index -bi_vertex_id(bi_builder *b) -{ - return bi_preload(b, (b->shader->arch >= 9) ? 60 : 61); -} - -static inline bi_index -bi_instance_id(bi_builder *b) -{ - return bi_preload(b, (b->shader->arch >= 9) ? 
61 : 62); -} - -static void -bi_emit_jump(bi_builder *b, nir_jump_instr *instr) -{ - bi_instr *branch = bi_jump(b, bi_zero()); - - switch (instr->type) { - case nir_jump_break: - branch->branch_target = b->shader->break_block; - break; - case nir_jump_continue: - branch->branch_target = b->shader->continue_block; - break; - default: - unreachable("Unhandled jump type"); - } - - bi_block_add_successor(b->shader->current_block, branch->branch_target); - b->shader->current_block->unconditional_jumps = true; -} - -/* Builds a 64-bit hash table key for an index */ -static uint64_t -bi_index_to_key(bi_index idx) -{ - static_assert(sizeof(idx) <= sizeof(uint64_t), "too much padding"); - - uint64_t key = 0; - memcpy(&key, &idx, sizeof(idx)); - return key; -} - -/* - * Extract a single channel out of a vector source. We split vectors with SPLIT - * so we can use the split components directly, without emitting an extract. - * This has advantages of RA, as the split can usually be optimized away. - */ -static bi_index -bi_extract(bi_builder *b, bi_index vec, unsigned channel) -{ - bi_index *components = - _mesa_hash_table_u64_search(b->shader->allocated_vec, - bi_index_to_key(vec)); - - /* No extract needed for scalars. - * - * This is a bit imprecise, but actual bugs (missing splits for vectors) - * should be caught by the following assertion. It is too difficult to - * ensure bi_extract is only called for real vectors. - */ - if (components == NULL && channel == 0) - return vec; - - assert(components != NULL && "missing bi_cache_collect()"); - return components[channel]; -} - -static void -bi_cache_collect(bi_builder *b, bi_index dst, bi_index *s, unsigned n) -{ - /* Lifetime of a hash table entry has to be at least as long as the table */ - bi_index *channels = ralloc_array(b->shader, bi_index, n); - memcpy(channels, s, sizeof(bi_index) * n); - - _mesa_hash_table_u64_insert(b->shader->allocated_vec, - bi_index_to_key(dst), channels); -} - -/* - * Splits an n-component vector (vec) into n scalar destinations (dests) using a - * split pseudo-instruction. - * - * Pre-condition: dests is filled with bi_null(). - */ -static void -bi_emit_split_i32(bi_builder *b, bi_index dests[4], bi_index vec, unsigned n) -{ - /* Setup the destinations */ - for (unsigned i = 0; i < n; ++i) { - dests[i] = bi_temp(b->shader); - } - - /* Emit the split */ - if (n == 1) { - bi_mov_i32_to(b, dests[0], vec); - } else { - bi_instr *I = bi_split_i32_to(b, n, vec); - - bi_foreach_dest(I, j) - I->dest[j] = dests[j]; - } -} - -static void -bi_emit_cached_split_i32(bi_builder *b, bi_index vec, unsigned n) -{ - bi_index dests[4] = { bi_null(), bi_null(), bi_null(), bi_null() }; - bi_emit_split_i32(b, dests, vec, n); - bi_cache_collect(b, vec, dests, n); -} - -/* - * Emit and cache a split for a vector of a given bitsize. The vector may not be - * composed of 32-bit words, but it will be split at 32-bit word boundaries. 
- */ -static void -bi_emit_cached_split(bi_builder *b, bi_index vec, unsigned bits) -{ - bi_emit_cached_split_i32(b, vec, DIV_ROUND_UP(bits, 32)); -} - -static void -bi_split_dest(bi_builder *b, nir_dest dest) -{ - bi_emit_cached_split(b, bi_dest_index(&dest), - nir_dest_bit_size(dest) * - nir_dest_num_components(dest)); -} - -static bi_instr * -bi_emit_collect_to(bi_builder *b, bi_index dst, bi_index *chan, unsigned n) -{ - /* Special case: COLLECT of a single value is a scalar move */ - if (n == 1) - return bi_mov_i32_to(b, dst, chan[0]); - - bi_instr *I = bi_collect_i32_to(b, dst, n); - - bi_foreach_src(I, i) - I->src[i] = chan[i]; - - bi_cache_collect(b, dst, chan, n); - return I; -} - -static bi_instr * -bi_collect_v2i32_to(bi_builder *b, bi_index dst, bi_index s0, bi_index s1) -{ - return bi_emit_collect_to(b, dst, (bi_index[]) { s0, s1 }, 2); -} - -static bi_instr * -bi_collect_v3i32_to(bi_builder *b, bi_index dst, bi_index s0, bi_index s1, bi_index s2) -{ - return bi_emit_collect_to(b, dst, (bi_index[]) { s0, s1, s2 }, 3); -} - -static bi_index -bi_collect_v2i32(bi_builder *b, bi_index s0, bi_index s1) -{ - bi_index dst = bi_temp(b->shader); - bi_collect_v2i32_to(b, dst, s0, s1); - return dst; -} - -static bi_index -bi_varying_src0_for_barycentric(bi_builder *b, nir_intrinsic_instr *intr) -{ - switch (intr->intrinsic) { - case nir_intrinsic_load_barycentric_centroid: - case nir_intrinsic_load_barycentric_sample: - return bi_preload(b, 61); - - /* Need to put the sample ID in the top 16-bits */ - case nir_intrinsic_load_barycentric_at_sample: - return bi_mkvec_v2i16(b, bi_half(bi_dontcare(b), false), - bi_half(bi_src_index(&intr->src[0]), false)); - - /* Interpret as 8:8 signed fixed point positions in pixels along X and - * Y axes respectively, relative to top-left of pixel. In NIR, (0, 0) - * is the center of the pixel so we first fixup and then convert. For - * fp16 input: - * - * f2i16(((x, y) + (0.5, 0.5)) * 2**8) = - * f2i16((256 * (x, y)) + (128, 128)) = - * V2F16_TO_V2S16(FMA.v2f16((x, y), #256, #128)) - * - * For fp32 input, that lacks enough precision for MSAA 16x, but the - * idea is the same. FIXME: still doesn't pass - */ - case nir_intrinsic_load_barycentric_at_offset: { - bi_index offset = bi_src_index(&intr->src[0]); - bi_index f16 = bi_null(); - unsigned sz = nir_src_bit_size(intr->src[0]); - - if (sz == 16) { - f16 = bi_fma_v2f16(b, offset, bi_imm_f16(256.0), - bi_imm_f16(128.0)); - } else { - assert(sz == 32); - bi_index f[2]; - for (unsigned i = 0; i < 2; ++i) { - f[i] = bi_fadd_rscale_f32(b, - bi_extract(b, offset, i), - bi_imm_f32(0.5), bi_imm_u32(8), - BI_SPECIAL_NONE); - } - - f16 = bi_v2f32_to_v2f16(b, f[0], f[1]); - } - - return bi_v2f16_to_v2s16(b, f16); - } - - case nir_intrinsic_load_barycentric_pixel: - default: - return b->shader->arch >= 9 ? 
bi_preload(b, 61) : bi_dontcare(b); - } -} - -static enum bi_sample -bi_interp_for_intrinsic(nir_intrinsic_op op) -{ - switch (op) { - case nir_intrinsic_load_barycentric_centroid: - return BI_SAMPLE_CENTROID; - case nir_intrinsic_load_barycentric_sample: - case nir_intrinsic_load_barycentric_at_sample: - return BI_SAMPLE_SAMPLE; - case nir_intrinsic_load_barycentric_at_offset: - return BI_SAMPLE_EXPLICIT; - case nir_intrinsic_load_barycentric_pixel: - default: - return BI_SAMPLE_CENTER; - } -} - -/* auto, 64-bit omitted */ -static enum bi_register_format -bi_reg_fmt_for_nir(nir_alu_type T) -{ - switch (T) { - case nir_type_float16: return BI_REGISTER_FORMAT_F16; - case nir_type_float32: return BI_REGISTER_FORMAT_F32; - case nir_type_int16: return BI_REGISTER_FORMAT_S16; - case nir_type_uint16: return BI_REGISTER_FORMAT_U16; - case nir_type_int32: return BI_REGISTER_FORMAT_S32; - case nir_type_uint32: return BI_REGISTER_FORMAT_U32; - default: unreachable("Invalid type for register format"); - } -} +#include "bifrost_opts.h" +#include "bifrost_sched.h" +#include "compiler_defines.h" +#include "disassemble.h" +#include "bifrost_print.h" -/* Checks if the _IMM variant of an intrinsic can be used, returning in imm the - * immediate to be used (which applies even if _IMM can't be used) */ +#define BI_DEBUG -static bool -bi_is_intr_immediate(nir_intrinsic_instr *instr, unsigned *immediate, unsigned max) +static int +glsl_type_size(const struct glsl_type *type, bool bindless) { - nir_src *offset = nir_get_io_offset_src(instr); - - if (!nir_src_is_const(*offset)) - return false; - - *immediate = nir_intrinsic_base(instr) + nir_src_as_uint(*offset); - return (*immediate) < max; + return glsl_count_attribute_slots(type, false); } static void -bi_make_vec_to(bi_builder *b, bi_index final_dst, - bi_index *src, - unsigned *channel, - unsigned count, - unsigned bitsize); - -/* Bifrost's load instructions lack a component offset despite operating in - * terms of vec4 slots. Usually I/O vectorization avoids nonzero components, - * but they may be unavoidable with separate shaders in use. To solve this, we - * lower to a larger load and an explicit copy of the desired components. 
*/ - -static void -bi_copy_component(bi_builder *b, nir_intrinsic_instr *instr, bi_index tmp) +optimize_nir(nir_shader *nir) { - unsigned component = nir_intrinsic_component(instr); - unsigned nr = instr->num_components; - unsigned total = nr + component; - unsigned bitsize = nir_dest_bit_size(instr->dest); - - assert(total <= 4 && "should be vec4"); - bi_emit_cached_split(b, tmp, total * bitsize); + bool progress; - if (component == 0) - return; + NIR_PASS_V(nir, nir_lower_io, nir_var_all, glsl_type_size, 0); + NIR_PASS(progress, nir, nir_lower_regs_to_ssa); - bi_index srcs[] = { tmp, tmp, tmp }; - unsigned channels[] = { component, component + 1, component + 2 }; + do { + progress = false; - bi_make_vec_to(b, bi_dest_index(&instr->dest), - srcs, channels, nr, nir_dest_bit_size(instr->dest)); -} + NIR_PASS(progress, nir, nir_lower_io, nir_var_all, glsl_type_size, 0); -static void -bi_emit_load_attr(bi_builder *b, nir_intrinsic_instr *instr) -{ - nir_alu_type T = nir_intrinsic_dest_type(instr); - enum bi_register_format regfmt = bi_reg_fmt_for_nir(T); - nir_src *offset = nir_get_io_offset_src(instr); - unsigned component = nir_intrinsic_component(instr); - enum bi_vecsize vecsize = (instr->num_components + component - 1); - unsigned imm_index = 0; - unsigned base = nir_intrinsic_base(instr); - bool constant = nir_src_is_const(*offset); - bool immediate = bi_is_intr_immediate(instr, &imm_index, 16); - bi_index dest = (component == 0) ? bi_dest_index(&instr->dest) : bi_temp(b->shader); - bi_instr *I; - - if (immediate) { - I = bi_ld_attr_imm_to(b, dest, bi_vertex_id(b), - bi_instance_id(b), regfmt, vecsize, - imm_index); - } else { - bi_index idx = bi_src_index(&instr->src[0]); + NIR_PASS(progress, nir, nir_lower_var_copies); + NIR_PASS(progress, nir, nir_lower_vars_to_ssa); - if (constant) - idx = bi_imm_u32(imm_index); - else if (base != 0) - idx = bi_iadd_u32(b, idx, bi_imm_u32(base), false); + NIR_PASS(progress, nir, nir_copy_prop); + NIR_PASS(progress, nir, nir_opt_constant_folding); - I = bi_ld_attr_to(b, dest, bi_vertex_id(b), bi_instance_id(b), - idx, regfmt, vecsize); - } + NIR_PASS(progress, nir, nir_lower_vars_to_ssa); + NIR_PASS(progress, nir, nir_lower_alu_to_scalar, NULL); + NIR_PASS(progress, nir, nir_opt_if, true); - if (b->shader->arch >= 9) - I->table = PAN_TABLE_ATTRIBUTE; + } while (progress); - bi_copy_component(b, instr, dest); + NIR_PASS(progress, nir, nir_copy_prop); + NIR_PASS(progress, nir, nir_opt_dce); } -/* - * ABI: Special (desktop GL) slots come first, tightly packed. General varyings - * come later, sparsely packed. This handles both linked and separable shaders - * with a common code path, with minimal keying only for desktop GL. Each slot - * consumes 16 bytes (TODO: fp16, partial vectors). - */ static unsigned -bi_varying_base_bytes(bi_context *ctx, nir_intrinsic_instr *intr) +nir_src_index(compiler_context *ctx, nir_src *src) { - nir_io_semantics sem = nir_intrinsic_io_semantics(intr); - uint32_t mask = ctx->inputs->fixed_varying_mask; - - if (sem.location >= VARYING_SLOT_VAR0) { - unsigned nr_special = util_bitcount(mask); - unsigned general_index = (sem.location - VARYING_SLOT_VAR0); - - return 16 * (nr_special + general_index); - } else { - return 16 * (util_bitcount(mask & BITFIELD_MASK(sem.location))); - } + if (src->is_ssa) + return src->ssa->index; + else + return ctx->func->impl->ssa_alloc + src->reg.reg->index; } -/* - * Compute the offset in bytes of a varying with an immediate offset, adding the - * offset to the base computed above. 
Convenience method. - */ static unsigned -bi_varying_offset(bi_context *ctx, nir_intrinsic_instr *intr) +nir_dest_index(compiler_context *ctx, nir_dest *dst) { - nir_src *src = nir_get_io_offset_src(intr); - assert(nir_src_is_const(*src) && "assumes immediate offset"); - - return bi_varying_base_bytes(ctx, intr) + (nir_src_as_uint(*src) * 16); -} - -static void -bi_emit_load_vary(bi_builder *b, nir_intrinsic_instr *instr) -{ - enum bi_sample sample = BI_SAMPLE_CENTER; - enum bi_update update = BI_UPDATE_STORE; - enum bi_register_format regfmt = BI_REGISTER_FORMAT_AUTO; - bool smooth = instr->intrinsic == nir_intrinsic_load_interpolated_input; - bi_index src0 = bi_null(); - - unsigned component = nir_intrinsic_component(instr); - enum bi_vecsize vecsize = (instr->num_components + component - 1); - bi_index dest = (component == 0) ? bi_dest_index(&instr->dest) : bi_temp(b->shader); - - unsigned sz = nir_dest_bit_size(instr->dest); - - if (smooth) { - nir_intrinsic_instr *parent = nir_src_as_intrinsic(instr->src[0]); - assert(parent); - - sample = bi_interp_for_intrinsic(parent->intrinsic); - src0 = bi_varying_src0_for_barycentric(b, parent); - - assert(sz == 16 || sz == 32); - regfmt = (sz == 16) ? BI_REGISTER_FORMAT_F16 - : BI_REGISTER_FORMAT_F32; - } else { - assert(sz == 32); - regfmt = BI_REGISTER_FORMAT_U32; - - /* Valhall can't have bi_null() here, although the source is - * logically unused for flat varyings - */ - if (b->shader->arch >= 9) - src0 = bi_preload(b, 61); - - /* Gather info as we go */ - b->shader->info.bifrost->uses_flat_shading = true; - } - - enum bi_source_format source_format = - smooth ? BI_SOURCE_FORMAT_F32 : BI_SOURCE_FORMAT_FLAT32; - - nir_src *offset = nir_get_io_offset_src(instr); - unsigned imm_index = 0; - bool immediate = bi_is_intr_immediate(instr, &imm_index, 20); - bi_instr *I = NULL; - - if (b->shader->malloc_idvs && immediate) { - /* Immediate index given in bytes. */ - bi_ld_var_buf_imm_to(b, sz, dest, src0, regfmt, - sample, source_format, update, vecsize, - bi_varying_offset(b->shader, instr)); - } else if (immediate && smooth) { - I = bi_ld_var_imm_to(b, dest, src0, regfmt, sample, update, - vecsize, imm_index); - } else if (immediate && !smooth) { - I = bi_ld_var_flat_imm_to(b, dest, BI_FUNCTION_NONE, regfmt, - vecsize, imm_index); - } else { - bi_index idx = bi_src_index(offset); - unsigned base = nir_intrinsic_base(instr); - - if (b->shader->malloc_idvs) { - /* Index needs to be in bytes, but NIR gives the index - * in slots. For now assume 16 bytes per element. - */ - bi_index idx_bytes = bi_lshift_or_i32(b, idx, bi_zero(), bi_imm_u8(4)); - unsigned vbase = bi_varying_base_bytes(b->shader, instr); - - if (vbase != 0) - idx_bytes = bi_iadd_u32(b, idx, bi_imm_u32(vbase), false); - - bi_ld_var_buf_to(b, sz, dest, src0, idx_bytes, regfmt, - sample, source_format, update, - vecsize); - } else if (smooth) { - if (base != 0) - idx = bi_iadd_u32(b, idx, bi_imm_u32(base), false); - - I = bi_ld_var_to(b, dest, src0, idx, regfmt, sample, - update, vecsize); - } else { - if (base != 0) - idx = bi_iadd_u32(b, idx, bi_imm_u32(base), false); - - I = bi_ld_var_flat_to(b, dest, idx, - BI_FUNCTION_NONE, regfmt, - vecsize); - } - } - - /* Valhall usually uses machine-allocated IDVS. If this is disabled, use - * a simple Midgard-style ABI. 
- */ - if (b->shader->arch >= 9 && I != NULL) - I->table = PAN_TABLE_ATTRIBUTE; - - bi_copy_component(b, instr, dest); -} - -static bi_index -bi_make_vec8_helper(bi_builder *b, bi_index *src, unsigned *channel, unsigned count) -{ - assert(1 <= count && count <= 4); - - bi_index bytes[4] = { - bi_imm_u8(0), - bi_imm_u8(0), - bi_imm_u8(0), - bi_imm_u8(0) - }; - - for (unsigned i = 0; i < count; ++i) { - unsigned chan = channel ? channel[i] : 0; - - bytes[i] = bi_byte(bi_extract(b, src[i], chan >> 2), chan & 3); - } - - if (b->shader->arch >= 9) { - bi_index vec = bi_zero(); - - if (count >= 3) - vec = bi_mkvec_v2i8(b, bytes[2], bytes[3], vec); - - return bi_mkvec_v2i8(b, bytes[0], bytes[1], vec); - } else { - return bi_mkvec_v4i8(b, bytes[0], bytes[1], bytes[2], bytes[3]); - } -} - -static bi_index -bi_make_vec16_helper(bi_builder *b, bi_index *src, unsigned *channel, unsigned count) -{ - unsigned chan0 = channel ? channel[0] : 0; - bi_index w0 = bi_extract(b, src[0], chan0 >> 1); - bi_index h0 = bi_half(w0, chan0 & 1); - - /* Zero extend */ - if (count == 1) - return bi_mkvec_v2i16(b, h0, bi_imm_u16(0)); - - /* Else, create a vector */ - assert(count == 2); - - unsigned chan1 = channel ? channel[1] : 0; - bi_index w1 = bi_extract(b, src[1], chan1 >> 1); - bi_index h1 = bi_half(w1, chan1 & 1); - - if (bi_is_word_equiv(w0, w1) && (chan0 & 1) == 0 && ((chan1 & 1) == 1)) - return bi_mov_i32(b, w0); - else if (bi_is_word_equiv(w0, w1)) - return bi_swz_v2i16(b, bi_swz_16(w0, chan0 & 1, chan1 & 1)); + if (dst->is_ssa) + return dst->ssa.index; else - return bi_mkvec_v2i16(b, h0, h1); -} - -static void -bi_make_vec_to(bi_builder *b, bi_index dst, - bi_index *src, - unsigned *channel, - unsigned count, - unsigned bitsize) -{ - assert(bitsize == 8 || bitsize == 16 || bitsize == 32); - unsigned shift = (bitsize == 32) ? 0 : (bitsize == 16) ? 1 : 2; - unsigned chan_per_word = 1 << shift; - - assert(DIV_ROUND_UP(count * bitsize, 32) <= BI_MAX_SRCS && - "unnecessarily large vector should have been lowered"); - - bi_index srcs[BI_MAX_VEC]; - - for (unsigned i = 0; i < count; i += chan_per_word) { - unsigned rem = MIN2(count - i, chan_per_word); - unsigned *channel_offset = channel ? (channel + i) : NULL; - - if (bitsize == 32) - srcs[i] = bi_extract(b, src[i], channel_offset ? *channel_offset : 0); - else if (bitsize == 16) - srcs[i >> 1] = bi_make_vec16_helper(b, src + i, channel_offset, rem); - else - srcs[i >> 2] = bi_make_vec8_helper(b, src + i, channel_offset, rem); - } - - bi_emit_collect_to(b, dst, srcs, DIV_ROUND_UP(count, chan_per_word)); -} - -static inline bi_instr * -bi_load_ubo_to(bi_builder *b, unsigned bitsize, bi_index dest0, bi_index src0, - bi_index src1) -{ - bi_instr *I; - - if (b->shader->arch >= 9) { - I = bi_ld_buffer_to(b, bitsize, dest0, src0, src1); - I->seg = BI_SEG_UBO; - } else { - I = bi_load_to(b, bitsize, dest0, src0, src1, BI_SEG_UBO, 0); - } - - bi_emit_cached_split(b, dest0, bitsize); - return I; -} - -static bi_instr * -bi_load_sysval_to(bi_builder *b, bi_index dest, int sysval, - unsigned nr_components, unsigned offset) -{ - unsigned sysval_ubo = b->shader->inputs->fixed_sysval_ubo >= 0 ? 
- b->shader->inputs->fixed_sysval_ubo : - b->shader->nir->info.num_ubos; - unsigned uniform = - pan_lookup_sysval(b->shader->sysval_to_id, - b->shader->info.sysvals, - sysval); - unsigned idx = (uniform * 16) + offset; - - return bi_load_ubo_to(b, nr_components * 32, dest, - bi_imm_u32(idx), bi_imm_u32(sysval_ubo)); + return ctx->func->impl->ssa_alloc + dst->reg.reg->index; } -static void -bi_load_sysval_nir(bi_builder *b, nir_intrinsic_instr *intr, - unsigned nr_components, unsigned offset) +static unsigned +nir_alu_src_index(compiler_context *ctx, nir_alu_src *src) { - bi_load_sysval_to(b, bi_dest_index(&intr->dest), - panfrost_sysval_for_instr(&intr->instr, NULL), - nr_components, offset); + return nir_src_index(ctx, &src->src); } -static bi_index -bi_load_sysval(bi_builder *b, int sysval, - unsigned nr_components, unsigned offset) +struct bifrost_instruction * +mir_alloc_ins(struct bifrost_instruction instr) { - bi_index tmp = bi_temp(b->shader); - bi_load_sysval_to(b, tmp, sysval, nr_components, offset); - return tmp; + struct bifrost_instruction *heap_ins = malloc(sizeof(instr)); + memcpy(heap_ins, &instr, sizeof(instr)); + return heap_ins; } static void -bi_load_sample_id_to(bi_builder *b, bi_index dst) -{ - /* r61[16:23] contains the sampleID, mask it out. Upper bits - * seem to read garbage (despite being architecturally defined - * as zero), so use a 5-bit mask instead of 8-bits */ - - bi_rshift_and_i32_to(b, dst, bi_preload(b, 61), bi_imm_u32(0x1f), - bi_imm_u8(16), false); -} - -static bi_index -bi_load_sample_id(bi_builder *b) -{ - bi_index sample_id = bi_temp(b->shader); - bi_load_sample_id_to(b, sample_id); - return sample_id; -} - -static bi_index -bi_pixel_indices(bi_builder *b, unsigned rt) +emit_mir_instruction(struct compiler_context *ctx, struct bifrost_instruction instr) { - /* We want to load the current pixel. */ - struct bifrost_pixel_indices pix = { - .y = BIFROST_CURRENT_PIXEL, - .rt = rt - }; - - uint32_t indices_u32 = 0; - memcpy(&indices_u32, &pix, sizeof(indices_u32)); - bi_index indices = bi_imm_u32(indices_u32); - - /* Sample index above is left as zero. For multisampling, we need to - * fill in the actual sample ID in the lower byte */ - - if (b->shader->inputs->blend.nr_samples > 1) - indices = bi_iadd_u32(b, indices, bi_load_sample_id(b), false); - - return indices; + list_addtail(&(mir_alloc_ins(instr))->link, &ctx->current_block->instructions); } -/* Source color is passed through r0-r3, or r4-r7 for the second source when - * dual-source blending. Preload the corresponding vector. - */ static void -bi_emit_load_blend_input(bi_builder *b, nir_intrinsic_instr *instr) +bifrost_block_add_successor(bifrost_block *block, bifrost_block *successor) { - nir_io_semantics sem = nir_intrinsic_io_semantics(instr); - unsigned base = (sem.location == VARYING_SLOT_VAR0) ? 4 : 0; - unsigned size = nir_alu_type_get_type_size(nir_intrinsic_dest_type(instr)); - assert(size == 16 || size == 32); - - bi_index srcs[] = { - bi_preload(b, base + 0), bi_preload(b, base + 1), - bi_preload(b, base + 2), bi_preload(b, base + 3) - }; - - bi_emit_collect_to(b, bi_dest_index(&instr->dest), srcs, size == 32 ? 
4 : 2); + assert(block->num_successors < ARRAY_SIZE(block->successors)); + block->successors[block->num_successors++] = successor; } static void -bi_emit_blend_op(bi_builder *b, bi_index rgba, nir_alu_type T, - bi_index rgba2, nir_alu_type T2, unsigned rt) +emit_load_const(struct compiler_context *ctx, nir_load_const_instr *instr) { - /* Reads 2 or 4 staging registers to cover the input */ - unsigned size = nir_alu_type_get_type_size(T); - unsigned size_2 = nir_alu_type_get_type_size(T2); - unsigned sr_count = (size <= 16) ? 2 : 4; - unsigned sr_count_2 = (size_2 <= 16) ? 2 : 4; - const struct panfrost_compile_inputs *inputs = b->shader->inputs; - uint64_t blend_desc = inputs->blend.bifrost_blend_desc; - enum bi_register_format regfmt = bi_reg_fmt_for_nir(T); - - /* Workaround for NIR-to-TGSI */ - if (b->shader->nir->info.fs.untyped_color_outputs) - regfmt = BI_REGISTER_FORMAT_AUTO; - - if (inputs->is_blend && inputs->blend.nr_samples > 1) { - /* Conversion descriptor comes from the compile inputs, pixel - * indices derived at run time based on sample ID */ - bi_st_tile(b, rgba, bi_pixel_indices(b, rt), bi_coverage(b), - bi_imm_u32(blend_desc >> 32), - regfmt, BI_VECSIZE_V4); - } else if (b->shader->inputs->is_blend) { - uint64_t blend_desc = b->shader->inputs->blend.bifrost_blend_desc; - - /* Blend descriptor comes from the compile inputs */ - /* Put the result in r0 */ - - bi_blend_to(b, bi_temp(b->shader), rgba, bi_coverage(b), - bi_imm_u32(blend_desc), - bi_imm_u32(blend_desc >> 32), - bi_null(), regfmt, sr_count, 0); - } else { - /* Blend descriptor comes from the FAU RAM. By convention, the - * return address on Bifrost is stored in r48 and will be used - * by the blend shader to jump back to the fragment shader */ - - bi_blend_to(b, bi_temp(b->shader), rgba, bi_coverage(b), - bi_fau(BIR_FAU_BLEND_0 + rt, false), - bi_fau(BIR_FAU_BLEND_0 + rt, true), - rgba2, regfmt, sr_count, sr_count_2); - } - - assert(rt < 8); - b->shader->info.bifrost->blend[rt].type = T; + nir_ssa_def def = instr->def; - if (T2) - b->shader->info.bifrost->blend_src1_type = T2; + float *v = ralloc_array(NULL, float, 1); + nir_const_load_to_arr(v, instr, f32); + _mesa_hash_table_u64_insert(ctx->ssa_constants, def.index + 1, v); } -/* Blend shaders do not need to run ATEST since they are dependent on a - * fragment shader that runs it. Blit shaders may not need to run ATEST, since - * ATEST is not needed if early-z is forced, alpha-to-coverage is disabled, and - * there are no writes to the coverage mask. The latter two are satisfied for - * all blit shaders, so we just care about early-z, which blit shaders force - * iff they do not write depth or stencil */ - -static bool -bi_skip_atest(bi_context *ctx, bool emit_zs) -{ - return (ctx->inputs->is_blit && !emit_zs) || ctx->inputs->is_blend; -} - -static void -bi_emit_atest(bi_builder *b, bi_index alpha) +static uint32_t +alloc_mir_temp(struct compiler_context *ctx) { - b->shader->coverage = bi_atest(b, bi_coverage(b), alpha, - bi_fau(BIR_FAU_ATEST_PARAM, false)); - b->shader->emitted_atest = true; + return SSA_TEMP_VALUE(ctx->mir_temp++); } -static void -bi_emit_fragment_out(bi_builder *b, nir_intrinsic_instr *instr) +static uint32_t +emit_ld_vary_addr_constant(struct compiler_context *ctx, uint32_t location) { - bool combined = instr->intrinsic == - nir_intrinsic_store_combined_output_pan; + // LD_VAR_ADDR.f32 {R0, T1}, R61, R62, location:1, R12 + // ... + // ST_VAR.v4 T1, R12, R13, R14, R4 - unsigned writeout = combined ? 
nir_intrinsic_component(instr) : - PAN_WRITEOUT_C; - - bool emit_blend = writeout & (PAN_WRITEOUT_C); - bool emit_zs = writeout & (PAN_WRITEOUT_Z | PAN_WRITEOUT_S); - - unsigned loc = nir_intrinsic_io_semantics(instr).location; - bi_index src0 = bi_src_index(&instr->src[0]); - - /* By ISA convention, the coverage mask is stored in R60. The store - * itself will be handled by a subsequent ATEST instruction */ - if (loc == FRAG_RESULT_SAMPLE_MASK) { - bi_index orig = bi_coverage(b); - bi_index msaa = bi_load_sysval(b, PAN_SYSVAL_MULTISAMPLED, 1, 0); - bi_index new = bi_lshift_and_i32(b, orig, bi_extract(b, src0, 0), bi_imm_u8(0)); - - b->shader->coverage = - bi_mux_i32(b, orig, new, msaa, BI_MUX_INT_ZERO); - return; - } + // R61-R62 is filled with information needed for varying interpolation + // This loads a vec3 with the information that ST_VAR needs to work - /* Emit ATEST if we have to, note ATEST requires a floating-point alpha - * value, but render target #0 might not be floating point. However the - * alpha value is only used for alpha-to-coverage, a stage which is - * skipped for pure integer framebuffers, so the issue is moot. */ - - if (!b->shader->emitted_atest && !bi_skip_atest(b->shader, emit_zs)) { - nir_alu_type T = nir_intrinsic_src_type(instr); - - bi_index rgba = bi_src_index(&instr->src[0]); - bi_index alpha = - (T == nir_type_float16) ? bi_half(bi_extract(b, rgba, 1), true) : - (T == nir_type_float32) ? bi_extract(b, rgba, 3) : - bi_dontcare(b); - - /* Don't read out-of-bounds */ - if (nir_src_num_components(instr->src[0]) < 4) - alpha = bi_imm_f32(1.0); - - bi_emit_atest(b, alpha); - } - - if (emit_zs) { - bi_index z = bi_dontcare(b), s = bi_dontcare(b); - - if (writeout & PAN_WRITEOUT_Z) - z = bi_src_index(&instr->src[2]); - - if (writeout & PAN_WRITEOUT_S) - s = bi_src_index(&instr->src[3]); - - b->shader->coverage = bi_zs_emit(b, z, s, bi_coverage(b), - writeout & PAN_WRITEOUT_S, - writeout & PAN_WRITEOUT_Z); - } - - if (emit_blend) { - unsigned rt = loc ? (loc - FRAG_RESULT_DATA0) : 0; - bool dual = (writeout & PAN_WRITEOUT_2); - bi_index color = bi_src_index(&instr->src[0]); - bi_index color2 = dual ? bi_src_index(&instr->src[4]) : bi_null(); - nir_alu_type T2 = dual ? nir_intrinsic_dest_type(instr) : 0; - - /* Explicit copy since BLEND inputs are precoloured to R0-R3, - * TODO: maybe schedule around this or implement in RA as a - * spill */ - bool has_mrt = (b->shader->nir->info.outputs_written >> FRAG_RESULT_DATA1); - - if (has_mrt) { - bi_index srcs[4] = { color, color, color, color }; - unsigned channels[4] = { 0, 1, 2, 3 }; - color = bi_temp(b->shader); - bi_make_vec_to(b, color, srcs, channels, - nir_src_num_components(instr->src[0]), - nir_alu_type_get_type_size(nir_intrinsic_src_type(instr))); - } - - bi_emit_blend_op(b, color, nir_intrinsic_src_type(instr), - color2, T2, rt); - } - - if (b->shader->inputs->is_blend) { - /* Jump back to the fragment shader, return address is stored - * in r48 (see above). On Valhall, only jump if the address is - * nonzero. The check is free there and it implements the "jump - * to 0 terminates the blend shader" that's automatic on - * Bifrost. - */ - if (b->shader->arch >= 8) - bi_branchzi(b, bi_preload(b, 48), bi_preload(b, 48), BI_CMPF_NE); - else - bi_jump(b, bi_preload(b, 48)); - } -} - -/** - * In a vertex shader, is the specified variable a position output? These kinds - * of outputs are written from position shaders when IDVS is enabled. All other - * outputs are written from the varying shader. 
- */ -static bool -bi_should_remove_store(nir_intrinsic_instr *intr, enum bi_idvs_mode idvs) -{ - nir_io_semantics sem = nir_intrinsic_io_semantics(intr); - - switch (sem.location) { - case VARYING_SLOT_POS: - case VARYING_SLOT_PSIZ: - return idvs == BI_IDVS_VARYING; - default: - return idvs == BI_IDVS_POSITION; - } -} - -static bool -bifrost_nir_specialize_idvs(nir_builder *b, nir_instr *instr, void *data) -{ - enum bi_idvs_mode *idvs = data; - - if (instr->type != nir_instr_type_intrinsic) - return false; - - nir_intrinsic_instr *intr = nir_instr_as_intrinsic(instr); - - if (intr->intrinsic != nir_intrinsic_store_output) - return false; - - if (bi_should_remove_store(intr, *idvs)) { - nir_instr_remove(instr); - return true; - } + uint32_t mir_temp_location = alloc_mir_temp(ctx); + // This instruction loads a vec3 starting from the initial register + struct bifrost_instruction instr = { + .op = op_ld_var_addr, + .dest_components = 3, + .ssa_args = { + .dest = mir_temp_location, + .src0 = SSA_FIXED_REGISTER(61), + .src1 = SSA_FIXED_REGISTER(62), + .src2 = SSA_INVALID_VALUE, + .src3 = SSA_INVALID_VALUE, + }, + .literal_args[0] = location, + }; + emit_mir_instruction(ctx, instr); - return false; + return mir_temp_location; } +// XXX: Doesn't support duplicated values in the components! +// RA WILL fail! static void -bi_emit_store_vary(bi_builder *b, nir_intrinsic_instr *instr) +emit_create_vector(struct compiler_context *ctx, unsigned dest, unsigned num_comps, uint32_t *comps) { - /* In principle we can do better for 16-bit. At the moment we require - * 32-bit to permit the use of .auto, in order to force .u32 for flat - * varyings, to handle internal TGSI shaders that set flat in the VS - * but smooth in the FS */ - - ASSERTED nir_alu_type T = nir_intrinsic_src_type(instr); - ASSERTED unsigned T_size = nir_alu_type_get_type_size(T); - assert(T_size == 32 || (b->shader->arch >= 9 && T_size == 16)); - enum bi_register_format regfmt = BI_REGISTER_FORMAT_AUTO; - - unsigned imm_index = 0; - bool immediate = bi_is_intr_immediate(instr, &imm_index, 16); - - /* Only look at the total components needed. In effect, we fill in all - * the intermediate "holes" in the write mask, since we can't mask off - * stores. Since nir_lower_io_to_temporaries ensures each varying is - * written at most once, anything that's masked out is undefined, so it - * doesn't matter what we write there. So we may as well do the - * simplest thing possible. */ - unsigned nr = util_last_bit(nir_intrinsic_write_mask(instr)); - assert(nr > 0 && nr <= nir_intrinsic_src_components(instr, 0)); - - bi_index data = bi_src_index(&instr->src[0]); - - /* To keep the vector dimensions consistent, we need to drop some - * components. This should be coalesced. - * - * TODO: This is ugly and maybe inefficient. Would we rather - * introduce a TRIM.i32 pseudoinstruction? 
- */ - if (nr < nir_intrinsic_src_components(instr, 0)) { - assert(T_size == 32 && "todo: 16-bit trim"); - - bi_index chans[4] = { bi_null(), bi_null(), bi_null(), bi_null() }; - unsigned src_comps = nir_intrinsic_src_components(instr, 0); - - bi_emit_split_i32(b, chans, data, src_comps); - - bi_index tmp = bi_temp(b->shader); - bi_instr *collect = bi_collect_i32_to(b, tmp, nr); - - bi_foreach_src(collect, w) - collect->src[w] = chans[w]; - - data = tmp; - } - - bool psiz = (nir_intrinsic_io_semantics(instr).location == VARYING_SLOT_PSIZ); - - bi_index a[4] = { bi_null() }; + assert(num_comps <= 4 && "Can't make a vector larger than 4 components"); - if (b->shader->arch <= 8 && b->shader->idvs == BI_IDVS_POSITION) { - /* Bifrost position shaders have a fast path */ - assert(T == nir_type_float16 || T == nir_type_float32); - unsigned regfmt = (T == nir_type_float16) ? 0 : 1; - unsigned identity = (b->shader->arch == 6) ? 0x688 : 0; - unsigned snap4 = 0x5E; - uint32_t format = identity | (snap4 << 12) | (regfmt << 24); - - bi_st_cvt(b, data, bi_preload(b, 58), bi_preload(b, 59), - bi_imm_u32(format), regfmt, nr - 1); - } else if (b->shader->arch >= 9 && b->shader->idvs != BI_IDVS_NONE) { - bi_index index = bi_preload(b, 59); - - if (psiz) { - assert(T_size == 16 && "should've been lowered"); - index = bi_iadd_imm_i32(b, index, 4); + // This instruction loads a vec3 starting from the initial register + struct bifrost_instruction instr = { + .op = op_create_vector, + .dest_components = num_comps, + .ssa_args = { + .dest = dest, } - - bi_index address = bi_lea_buf_imm(b, index); - bi_emit_split_i32(b, a, address, 2); - - bool varying = (b->shader->idvs == BI_IDVS_VARYING); - - bi_store(b, nr * nir_src_bit_size(instr->src[0]), - data, a[0], a[1], - varying ? BI_SEG_VARY : BI_SEG_POS, - varying ? bi_varying_offset(b->shader, instr) : 0); - } else if (immediate) { - bi_index address = bi_lea_attr_imm(b, - bi_vertex_id(b), bi_instance_id(b), - regfmt, imm_index); - bi_emit_split_i32(b, a, address, 3); - - bi_st_cvt(b, data, a[0], a[1], a[2], regfmt, nr - 1); - } else { - bi_index idx = - bi_iadd_u32(b, - bi_src_index(nir_get_io_offset_src(instr)), - bi_imm_u32(nir_intrinsic_base(instr)), - false); - bi_index address = bi_lea_attr(b, - bi_vertex_id(b), bi_instance_id(b), - idx, regfmt); - bi_emit_split_i32(b, a, address, 3); - - bi_st_cvt(b, data, a[0], a[1], a[2], regfmt, nr - 1); - } -} - -static void -bi_emit_load_ubo(bi_builder *b, nir_intrinsic_instr *instr) -{ - nir_src *offset = nir_get_io_offset_src(instr); - - bool offset_is_const = nir_src_is_const(*offset); - bi_index dyn_offset = bi_src_index(offset); - uint32_t const_offset = offset_is_const ? nir_src_as_uint(*offset) : 0; - - bi_load_ubo_to(b, instr->num_components * nir_dest_bit_size(instr->dest), - bi_dest_index(&instr->dest), offset_is_const ? 
- bi_imm_u32(const_offset) : dyn_offset, - bi_src_index(&instr->src[0])); -} - -static void -bi_emit_load_push_constant(bi_builder *b, nir_intrinsic_instr *instr) -{ - assert(b->shader->inputs->no_ubo_to_push && "can't mix push constant forms"); - - nir_src *offset = &instr->src[0]; - assert(nir_src_is_const(*offset) && "no indirect push constants"); - uint32_t base = nir_intrinsic_base(instr) + nir_src_as_uint(*offset); - assert((base & 3) == 0 && "unaligned push constants"); - - unsigned bits = nir_dest_bit_size(instr->dest) * - nir_dest_num_components(instr->dest); - - unsigned n = DIV_ROUND_UP(bits, 32); - assert(n <= 4); - bi_index channels[4] = { bi_null() }; - - for (unsigned i = 0; i < n; ++i) { - unsigned word = (base >> 2) + i; - - channels[i] = bi_fau(BIR_FAU_UNIFORM | (word >> 1), word & 1); - } - - bi_emit_collect_to(b, bi_dest_index(&instr->dest), channels, n); -} - -static bi_index -bi_addr_high(bi_builder *b, nir_src *src) -{ - return (nir_src_bit_size(*src) == 64) ? - bi_extract(b, bi_src_index(src), 1) : bi_zero(); -} - -static void -bi_handle_segment(bi_builder *b, bi_index *addr_lo, bi_index *addr_hi, enum bi_seg seg, int16_t *offset) -{ - /* Not needed on Bifrost or for global accesses */ - if (b->shader->arch < 9 || seg == BI_SEG_NONE) - return; - - /* There is no segment modifier on Valhall. Instead, we need to - * emit the arithmetic ourselves. We do have an offset - * available, which saves an instruction for constant offsets. - */ - bool wls = (seg == BI_SEG_WLS); - assert(wls || (seg == BI_SEG_TL)); - - enum bir_fau fau = wls ? BIR_FAU_WLS_PTR : BIR_FAU_TLS_PTR; - - bi_index base_lo = bi_fau(fau, false); - - if (offset && addr_lo->type == BI_INDEX_CONSTANT && addr_lo->value == (int16_t) addr_lo->value) { - *offset = addr_lo->value; - *addr_lo = base_lo; - } else { - *addr_lo = bi_iadd_u32(b, base_lo, *addr_lo, false); - } - - /* Do not allow overflow for WLS or TLS */ - *addr_hi = bi_fau(fau, true); -} - -static void -bi_emit_load(bi_builder *b, nir_intrinsic_instr *instr, enum bi_seg seg) -{ - int16_t offset = 0; - unsigned bits = instr->num_components * nir_dest_bit_size(instr->dest); - bi_index dest = bi_dest_index(&instr->dest); - bi_index addr_lo = bi_extract(b, bi_src_index(&instr->src[0]), 0); - bi_index addr_hi = bi_addr_high(b, &instr->src[0]); - - bi_handle_segment(b, &addr_lo, &addr_hi, seg, &offset); - - bi_load_to(b, bits, dest, addr_lo, addr_hi, seg, offset); - bi_emit_cached_split(b, dest, bits); -} - -static void -bi_emit_store(bi_builder *b, nir_intrinsic_instr *instr, enum bi_seg seg) -{ - /* Require contiguous masks, gauranteed by nir_lower_wrmasks */ - assert(nir_intrinsic_write_mask(instr) == - BITFIELD_MASK(instr->num_components)); - - int16_t offset = 0; - bi_index addr_lo = bi_extract(b, bi_src_index(&instr->src[1]), 0); - bi_index addr_hi = bi_addr_high(b, &instr->src[1]); - - bi_handle_segment(b, &addr_lo, &addr_hi, seg, &offset); - - bi_store(b, instr->num_components * nir_src_bit_size(instr->src[0]), - bi_src_index(&instr->src[0]), - addr_lo, addr_hi, seg, offset); -} - -/* Exchanges the staging register with memory */ - -static void -bi_emit_axchg_to(bi_builder *b, bi_index dst, bi_index addr, nir_src *arg, enum bi_seg seg) -{ - assert(seg == BI_SEG_NONE || seg == BI_SEG_WLS); - - unsigned sz = nir_src_bit_size(*arg); - assert(sz == 32 || sz == 64); - - bi_index data = bi_src_index(arg); - - bi_index addr_hi = (seg == BI_SEG_WLS) ? 
bi_zero() : bi_extract(b, addr, 1); - - if (b->shader->arch >= 9) - bi_handle_segment(b, &addr, &addr_hi, seg, NULL); - else if (seg == BI_SEG_WLS) - addr_hi = bi_zero(); - - bi_axchg_to(b, sz, dst, data, bi_extract(b, addr, 0), addr_hi, seg); -} - -/* Exchanges the second staging register with memory if comparison with first - * staging register passes */ - -static void -bi_emit_acmpxchg_to(bi_builder *b, bi_index dst, bi_index addr, nir_src *arg_1, nir_src *arg_2, enum bi_seg seg) -{ - assert(seg == BI_SEG_NONE || seg == BI_SEG_WLS); - - /* hardware is swapped from NIR */ - bi_index src0 = bi_src_index(arg_2); - bi_index src1 = bi_src_index(arg_1); - - unsigned sz = nir_src_bit_size(*arg_1); - assert(sz == 32 || sz == 64); - - bi_index data_words[] = { - bi_extract(b, src0, 0), - sz == 32 ? bi_extract(b, src1, 0) : bi_extract(b, src0, 1), - - /* 64-bit */ - bi_extract(b, src1, 0), - sz == 32 ? bi_extract(b, src1, 0) : bi_extract(b, src1, 1), }; - bi_index in = bi_temp(b->shader); - bi_emit_collect_to(b, in, data_words, 2 * (sz / 32)); - bi_index addr_hi = (seg == BI_SEG_WLS) ? bi_zero() : bi_extract(b, addr, 1); - - if (b->shader->arch >= 9) - bi_handle_segment(b, &addr, &addr_hi, seg, NULL); - else if (seg == BI_SEG_WLS) - addr_hi = bi_zero(); - - bi_index out = bi_acmpxchg(b, sz, in, bi_extract(b, addr, 0), addr_hi, seg); - bi_emit_cached_split(b, out, sz); - - bi_index inout_words[] = { - bi_extract(b, out, 0), - sz == 64 ? bi_extract(b, out, 1) : bi_null() + uint32_t *srcs[4] = { + &instr.ssa_args.src0, + &instr.ssa_args.src1, + &instr.ssa_args.src2, + &instr.ssa_args.src3, }; - bi_make_vec_to(b, dst, inout_words, NULL, sz / 32, 32); -} - -/* Extracts an atomic opcode */ - -static enum bi_atom_opc -bi_atom_opc_for_nir(nir_intrinsic_op op) -{ - switch (op) { - case nir_intrinsic_global_atomic_add: - case nir_intrinsic_shared_atomic_add: - case nir_intrinsic_image_atomic_add: - return BI_ATOM_OPC_AADD; - - case nir_intrinsic_global_atomic_imin: - case nir_intrinsic_shared_atomic_imin: - case nir_intrinsic_image_atomic_imin: - return BI_ATOM_OPC_ASMIN; - - case nir_intrinsic_global_atomic_umin: - case nir_intrinsic_shared_atomic_umin: - case nir_intrinsic_image_atomic_umin: - return BI_ATOM_OPC_AUMIN; - - case nir_intrinsic_global_atomic_imax: - case nir_intrinsic_shared_atomic_imax: - case nir_intrinsic_image_atomic_imax: - return BI_ATOM_OPC_ASMAX; - - case nir_intrinsic_global_atomic_umax: - case nir_intrinsic_shared_atomic_umax: - case nir_intrinsic_image_atomic_umax: - return BI_ATOM_OPC_AUMAX; - - case nir_intrinsic_global_atomic_and: - case nir_intrinsic_shared_atomic_and: - case nir_intrinsic_image_atomic_and: - return BI_ATOM_OPC_AAND; - - case nir_intrinsic_global_atomic_or: - case nir_intrinsic_shared_atomic_or: - case nir_intrinsic_image_atomic_or: - return BI_ATOM_OPC_AOR; - - case nir_intrinsic_global_atomic_xor: - case nir_intrinsic_shared_atomic_xor: - case nir_intrinsic_image_atomic_xor: - return BI_ATOM_OPC_AXOR; - - default: - unreachable("Unexpected computational atomic"); - } -} - -/* Optimized unary atomics are available with an implied #1 argument */ - -static bool -bi_promote_atom_c1(enum bi_atom_opc op, bi_index arg, enum bi_atom_opc *out) -{ - /* Check we have a compatible constant */ - if (arg.type != BI_INDEX_CONSTANT) - return false; - - if (!(arg.value == 1 || (arg.value == -1 && op == BI_ATOM_OPC_AADD))) - return false; - - /* Check for a compatible operation */ - switch (op) { - case BI_ATOM_OPC_AADD: - *out = (arg.value == 1) ? 
BI_ATOM_OPC_AINC : BI_ATOM_OPC_ADEC; - return true; - case BI_ATOM_OPC_ASMAX: - *out = BI_ATOM_OPC_ASMAX1; - return true; - case BI_ATOM_OPC_AUMAX: - *out = BI_ATOM_OPC_AUMAX1; - return true; - case BI_ATOM_OPC_AOR: - *out = BI_ATOM_OPC_AOR1; - return true; - default: - return false; - } -} - -/* - * Coordinates are 16-bit integers in Bifrost but 32-bit in NIR. We need to - * translate between these forms (with MKVEC.v2i16). - * - * Aditionally on Valhall, cube maps in the attribute pipe are treated as 2D - * arrays. For uniform handling, we also treat 3D textures like 2D arrays. - * - * Our indexing needs to reflects this. - */ -static bi_index -bi_emit_image_coord(bi_builder *b, bi_index coord, unsigned src_idx, - unsigned coord_comps, bool is_array) -{ - assert(coord_comps > 0 && coord_comps <= 3); - - if (src_idx == 0) { - if (coord_comps == 1 || (coord_comps == 2 && is_array)) - return bi_extract(b, coord, 0); - else - return bi_mkvec_v2i16(b, - bi_half(bi_extract(b, coord, 0), false), - bi_half(bi_extract(b, coord, 1), false)); - } else { - if (coord_comps == 3 && b->shader->arch >= 9) - return bi_mkvec_v2i16(b, bi_imm_u16(0), - bi_half(bi_extract(b, coord, 2), false)); - else if (coord_comps == 2 && is_array && b->shader->arch >= 9) - return bi_mkvec_v2i16(b, bi_imm_u16(0), - bi_half(bi_extract(b, coord, 1), false)); - else if (coord_comps == 3) - return bi_extract(b, coord, 2); - else if (coord_comps == 2 && is_array) - return bi_extract(b, coord, 1); + for (unsigned i = 0; i < 4; ++i) { + if (i < num_comps) + *srcs[i] = comps[i]; else - return bi_zero(); - } -} - -static bi_index -bi_emit_image_index(bi_builder *b, nir_intrinsic_instr *instr) -{ - nir_src src = instr->src[0]; - bi_index index = bi_src_index(&src); - bi_context *ctx = b->shader; - - /* Images come after vertex attributes, so handle an explicit offset */ - unsigned offset = (ctx->stage == MESA_SHADER_VERTEX) ? 
- util_bitcount64(ctx->nir->info.inputs_read) : 0; - - if (offset == 0) - return index; - else if (nir_src_is_const(src)) - return bi_imm_u32(nir_src_as_uint(src) + offset); - else - return bi_iadd_u32(b, index, bi_imm_u32(offset), false); -} - -static void -bi_emit_image_load(bi_builder *b, nir_intrinsic_instr *instr) -{ - enum glsl_sampler_dim dim = nir_intrinsic_image_dim(instr); - unsigned coord_comps = nir_image_intrinsic_coord_components(instr); - bool array = nir_intrinsic_image_array(instr); - ASSERTED unsigned nr_dim = glsl_get_sampler_dim_coordinate_components(dim); - - bi_index coords = bi_src_index(&instr->src[1]); - bi_index xy = bi_emit_image_coord(b, coords, 0, coord_comps, array); - bi_index zw = bi_emit_image_coord(b, coords, 1, coord_comps, array); - bi_index dest = bi_dest_index(&instr->dest); - enum bi_register_format regfmt = bi_reg_fmt_for_nir(nir_intrinsic_dest_type(instr)); - enum bi_vecsize vecsize = instr->num_components - 1; - - /* TODO: MSAA */ - assert(nr_dim != GLSL_SAMPLER_DIM_MS && "MSAA'd images not supported"); - - if (b->shader->arch >= 9 && nir_src_is_const(instr->src[0])) { - bi_instr *I = bi_ld_tex_imm_to(b, dest, xy, zw, regfmt, vecsize, - nir_src_as_uint(instr->src[0])); - - I->table = PAN_TABLE_IMAGE; - } else if (b->shader->arch >= 9) { - unreachable("Indirect images on Valhall not yet supported"); - } else { - bi_ld_attr_tex_to(b, dest, xy, zw, - bi_emit_image_index(b, instr), regfmt, - vecsize); - } - - bi_split_dest(b, instr->dest); -} - -static bi_index -bi_emit_lea_image(bi_builder *b, nir_intrinsic_instr *instr) -{ - enum glsl_sampler_dim dim = nir_intrinsic_image_dim(instr); - bool array = nir_intrinsic_image_array(instr); - ASSERTED unsigned nr_dim = glsl_get_sampler_dim_coordinate_components(dim); - unsigned coord_comps = nir_image_intrinsic_coord_components(instr); - - /* TODO: MSAA */ - assert(nr_dim != GLSL_SAMPLER_DIM_MS && "MSAA'd images not supported"); - - enum bi_register_format type = (instr->intrinsic == nir_intrinsic_image_store) ? - bi_reg_fmt_for_nir(nir_intrinsic_src_type(instr)) : - BI_REGISTER_FORMAT_AUTO; - - bi_index coords = bi_src_index(&instr->src[1]); - bi_index xy = bi_emit_image_coord(b, coords, 0, coord_comps, array); - bi_index zw = bi_emit_image_coord(b, coords, 1, coord_comps, array); - bi_index dest = bi_temp(b->shader); - - if (b->shader->arch >= 9 && nir_src_is_const(instr->src[0])) { - bi_instr *I = bi_lea_tex_imm_to(b, dest, xy, zw, false, - nir_src_as_uint(instr->src[0])); - - I->table = PAN_TABLE_IMAGE; - } else if (b->shader->arch >= 9) { - unreachable("Indirect images on Valhall not yet supported"); - } else { - bi_instr *I = bi_lea_attr_tex_to(b, dest, xy, zw, - bi_emit_image_index(b, instr), type); - - /* LEA_ATTR_TEX defaults to the secondary attribute table, but - * our ABI has all images in the primary attribute table - */ - I->table = BI_TABLE_ATTRIBUTE_1; - } - - bi_emit_cached_split(b, dest, 3 * 32); - return dest; -} - -static void -bi_emit_image_store(bi_builder *b, nir_intrinsic_instr *instr) -{ - bi_index a[4] = { bi_null() }; - bi_emit_split_i32(b, a, bi_emit_lea_image(b, instr), 3); - - /* Due to SPIR-V limitations, the source type is not fully reliable: it - * reports uint32 even for write_imagei. This causes an incorrect - * u32->s32->u32 roundtrip which incurs an unwanted clamping. Use auto32 - * instead, which will match per the OpenCL spec. Of course this does - * not work for 16-bit stores, but those are not available in OpenCL. 
- */ - nir_alu_type T = nir_intrinsic_src_type(instr); - assert(nir_alu_type_get_type_size(T) == 32); - - bi_st_cvt(b, bi_src_index(&instr->src[3]), a[0], a[1], a[2], - BI_REGISTER_FORMAT_AUTO, - instr->num_components - 1); -} - -static void -bi_emit_atomic_i32_to(bi_builder *b, bi_index dst, - bi_index addr, bi_index arg, nir_intrinsic_op intrinsic) -{ - enum bi_atom_opc opc = bi_atom_opc_for_nir(intrinsic); - enum bi_atom_opc post_opc = opc; - bool bifrost = b->shader->arch <= 8; - - /* ATOM_C.i32 takes a vector with {arg, coalesced}, ATOM_C1.i32 doesn't - * take any vector but can still output in RETURN mode */ - bi_index tmp_dest = bifrost ? bi_temp(b->shader) : dst; - unsigned sr_count = bifrost ? 2 : 1; - - /* Generate either ATOM or ATOM1 as required */ - if (bi_promote_atom_c1(opc, arg, &opc)) { - bi_atom1_return_i32_to(b, tmp_dest, bi_extract(b, addr, 0), - bi_extract(b, addr, 1), opc, sr_count); - } else { - bi_atom_return_i32_to(b, tmp_dest, arg, bi_extract(b, addr, 0), - bi_extract(b, addr, 1), opc, sr_count); - } - - if (bifrost) { - /* Post-process it */ - bi_emit_cached_split_i32(b, tmp_dest, 2); - bi_atom_post_i32_to(b, dst, bi_extract(b, tmp_dest, 0), bi_extract(b, tmp_dest, 1), post_opc); - } -} - -/* gl_FragCoord.xy = u16_to_f32(R59.xy) + 0.5 - * gl_FragCoord.z = ld_vary(fragz) - * gl_FragCoord.w = ld_vary(fragw) - */ - -static void -bi_emit_load_frag_coord(bi_builder *b, nir_intrinsic_instr *instr) -{ - bi_index src[4] = {}; - - for (unsigned i = 0; i < 2; ++i) { - src[i] = bi_fadd_f32(b, - bi_u16_to_f32(b, bi_half(bi_preload(b, 59), i)), - bi_imm_f32(0.5f)); - } - - for (unsigned i = 0; i < 2; ++i) { - src[2 + i] = bi_ld_var_special(b, bi_zero(), - BI_REGISTER_FORMAT_F32, BI_SAMPLE_CENTER, - BI_UPDATE_CLOBBER, - (i == 0) ? BI_VARYING_NAME_FRAG_Z : - BI_VARYING_NAME_FRAG_W, - BI_VECSIZE_NONE); - } - - bi_make_vec_to(b, bi_dest_index(&instr->dest), src, NULL, 4, 32); -} - -static void -bi_emit_ld_tile(bi_builder *b, nir_intrinsic_instr *instr) -{ - bi_index dest = bi_dest_index(&instr->dest); - nir_alu_type T = nir_intrinsic_dest_type(instr); - enum bi_register_format regfmt = bi_reg_fmt_for_nir(T); - unsigned rt = b->shader->inputs->blend.rt; - unsigned size = nir_dest_bit_size(instr->dest); - unsigned nr = instr->num_components; - - /* Get the render target */ - if (!b->shader->inputs->is_blend) { - nir_io_semantics sem = nir_intrinsic_io_semantics(instr); - unsigned loc = sem.location; - assert(loc >= FRAG_RESULT_DATA0); - rt = (loc - FRAG_RESULT_DATA0); - } - - bi_index desc = b->shader->inputs->is_blend ? - bi_imm_u32(b->shader->inputs->blend.bifrost_blend_desc >> 32) : - b->shader->inputs->bifrost.static_rt_conv ? - bi_imm_u32(b->shader->inputs->bifrost.rt_conv[rt]) : - bi_load_sysval(b, PAN_SYSVAL(RT_CONVERSION, rt | (size << 4)), 1, 0); - - bi_ld_tile_to(b, dest, bi_pixel_indices(b, rt), bi_coverage(b), desc, - regfmt, nr - 1); - bi_emit_cached_split(b, dest, size * nr); -} - -static void -bi_emit_intrinsic(bi_builder *b, nir_intrinsic_instr *instr) -{ - bi_index dst = nir_intrinsic_infos[instr->intrinsic].has_dest ? 
- bi_dest_index(&instr->dest) : bi_null(); - gl_shader_stage stage = b->shader->stage; - - switch (instr->intrinsic) { - case nir_intrinsic_load_barycentric_pixel: - case nir_intrinsic_load_barycentric_centroid: - case nir_intrinsic_load_barycentric_sample: - case nir_intrinsic_load_barycentric_at_sample: - case nir_intrinsic_load_barycentric_at_offset: - /* handled later via load_vary */ - break; - case nir_intrinsic_load_interpolated_input: - case nir_intrinsic_load_input: - if (b->shader->inputs->is_blend) - bi_emit_load_blend_input(b, instr); - else if (stage == MESA_SHADER_FRAGMENT) - bi_emit_load_vary(b, instr); - else if (stage == MESA_SHADER_VERTEX) - bi_emit_load_attr(b, instr); - else - unreachable("Unsupported shader stage"); - break; - - case nir_intrinsic_store_output: - if (stage == MESA_SHADER_FRAGMENT) - bi_emit_fragment_out(b, instr); - else if (stage == MESA_SHADER_VERTEX) - bi_emit_store_vary(b, instr); - else - unreachable("Unsupported shader stage"); - break; - - case nir_intrinsic_store_combined_output_pan: - assert(stage == MESA_SHADER_FRAGMENT); - bi_emit_fragment_out(b, instr); - break; - - case nir_intrinsic_load_ubo: - bi_emit_load_ubo(b, instr); - break; - - case nir_intrinsic_load_push_constant: - bi_emit_load_push_constant(b, instr); - break; - - case nir_intrinsic_load_global: - case nir_intrinsic_load_global_constant: - bi_emit_load(b, instr, BI_SEG_NONE); - break; - - case nir_intrinsic_store_global: - bi_emit_store(b, instr, BI_SEG_NONE); - break; - - case nir_intrinsic_load_scratch: - bi_emit_load(b, instr, BI_SEG_TL); - break; - - case nir_intrinsic_store_scratch: - bi_emit_store(b, instr, BI_SEG_TL); - break; - - case nir_intrinsic_load_shared: - bi_emit_load(b, instr, BI_SEG_WLS); - break; - - case nir_intrinsic_store_shared: - bi_emit_store(b, instr, BI_SEG_WLS); - break; - - /* Blob doesn't seem to do anything for memory barriers, note +BARRIER - * is illegal in fragment shaders */ - case nir_intrinsic_memory_barrier: - case nir_intrinsic_memory_barrier_buffer: - case nir_intrinsic_memory_barrier_image: - case nir_intrinsic_memory_barrier_shared: - case nir_intrinsic_group_memory_barrier: - break; - - case nir_intrinsic_control_barrier: - assert(b->shader->stage != MESA_SHADER_FRAGMENT); - bi_barrier(b); - break; - - case nir_intrinsic_scoped_barrier: - assert(b->shader->stage != MESA_SHADER_FRAGMENT); - assert(nir_intrinsic_memory_scope(instr) > NIR_SCOPE_SUBGROUP && - "todo: subgroup barriers (different divergence rules)"); - - bi_barrier(b); - break; - - case nir_intrinsic_shared_atomic_add: - case nir_intrinsic_shared_atomic_imin: - case nir_intrinsic_shared_atomic_umin: - case nir_intrinsic_shared_atomic_imax: - case nir_intrinsic_shared_atomic_umax: - case nir_intrinsic_shared_atomic_and: - case nir_intrinsic_shared_atomic_or: - case nir_intrinsic_shared_atomic_xor: { - assert(nir_src_bit_size(instr->src[1]) == 32); - - bi_index addr = bi_src_index(&instr->src[0]); - bi_index addr_hi; - - if (b->shader->arch >= 9) { - bi_handle_segment(b, &addr, &addr_hi, BI_SEG_WLS, NULL); - addr = bi_collect_v2i32(b, addr, addr_hi); - } else { - addr = bi_seg_add_i64(b, addr, bi_zero(), false, BI_SEG_WLS); - bi_emit_cached_split(b, addr, 64); - } - - bi_emit_atomic_i32_to(b, dst, addr, bi_src_index(&instr->src[1]), - instr->intrinsic); - bi_split_dest(b, instr->dest); - break; - } - - case nir_intrinsic_image_atomic_add: - case nir_intrinsic_image_atomic_imin: - case nir_intrinsic_image_atomic_umin: - case nir_intrinsic_image_atomic_imax: - case 
nir_intrinsic_image_atomic_umax: - case nir_intrinsic_image_atomic_and: - case nir_intrinsic_image_atomic_or: - case nir_intrinsic_image_atomic_xor: - assert(nir_src_bit_size(instr->src[3]) == 32); - - bi_emit_atomic_i32_to(b, dst, - bi_emit_lea_image(b, instr), - bi_src_index(&instr->src[3]), - instr->intrinsic); - bi_split_dest(b, instr->dest); - break; - - case nir_intrinsic_global_atomic_add: - case nir_intrinsic_global_atomic_imin: - case nir_intrinsic_global_atomic_umin: - case nir_intrinsic_global_atomic_imax: - case nir_intrinsic_global_atomic_umax: - case nir_intrinsic_global_atomic_and: - case nir_intrinsic_global_atomic_or: - case nir_intrinsic_global_atomic_xor: - assert(nir_src_bit_size(instr->src[1]) == 32); - - bi_emit_atomic_i32_to(b, dst, - bi_src_index(&instr->src[0]), - bi_src_index(&instr->src[1]), - instr->intrinsic); - - bi_split_dest(b, instr->dest); - break; - - case nir_intrinsic_image_load: - bi_emit_image_load(b, instr); - break; - - case nir_intrinsic_image_store: - bi_emit_image_store(b, instr); - break; - - case nir_intrinsic_global_atomic_exchange: - bi_emit_axchg_to(b, dst, bi_src_index(&instr->src[0]), - &instr->src[1], BI_SEG_NONE); - bi_split_dest(b, instr->dest); - break; - - case nir_intrinsic_image_atomic_exchange: - bi_emit_axchg_to(b, dst, bi_emit_lea_image(b, instr), - &instr->src[3], BI_SEG_NONE); - bi_split_dest(b, instr->dest); - break; - - case nir_intrinsic_shared_atomic_exchange: - bi_emit_axchg_to(b, dst, bi_src_index(&instr->src[0]), - &instr->src[1], BI_SEG_WLS); - bi_split_dest(b, instr->dest); - break; - - case nir_intrinsic_global_atomic_comp_swap: - bi_emit_acmpxchg_to(b, dst, bi_src_index(&instr->src[0]), - &instr->src[1], &instr->src[2], BI_SEG_NONE); - bi_split_dest(b, instr->dest); - break; - - case nir_intrinsic_image_atomic_comp_swap: - bi_emit_acmpxchg_to(b, dst, bi_emit_lea_image(b, instr), - &instr->src[3], &instr->src[4], BI_SEG_NONE); - bi_split_dest(b, instr->dest); - break; - - case nir_intrinsic_shared_atomic_comp_swap: - bi_emit_acmpxchg_to(b, dst, bi_src_index(&instr->src[0]), - &instr->src[1], &instr->src[2], BI_SEG_WLS); - bi_split_dest(b, instr->dest); - break; - - case nir_intrinsic_load_frag_coord: - bi_emit_load_frag_coord(b, instr); - break; - - case nir_intrinsic_load_output: - bi_emit_ld_tile(b, instr); - break; - - case nir_intrinsic_discard_if: - bi_discard_b32(b, bi_src_index(&instr->src[0])); - break; - - case nir_intrinsic_discard: - bi_discard_f32(b, bi_zero(), bi_zero(), BI_CMPF_EQ); - break; - - case nir_intrinsic_load_ssbo_address: - case nir_intrinsic_load_xfb_address: - bi_load_sysval_nir(b, instr, 2, 0); - break; - - case nir_intrinsic_load_work_dim: - case nir_intrinsic_load_num_vertices: - case nir_intrinsic_load_first_vertex: - case nir_intrinsic_load_draw_id: - bi_load_sysval_nir(b, instr, 1, 0); - break; - - case nir_intrinsic_load_base_vertex: - bi_load_sysval_nir(b, instr, 1, 4); - break; - - case nir_intrinsic_load_base_instance: - case nir_intrinsic_get_ssbo_size: - bi_load_sysval_nir(b, instr, 1, 8); - break; - - case nir_intrinsic_load_viewport_scale: - case nir_intrinsic_load_viewport_offset: - case nir_intrinsic_load_num_workgroups: - case nir_intrinsic_load_workgroup_size: - bi_load_sysval_nir(b, instr, 3, 0); - break; - - case nir_intrinsic_image_size: - bi_load_sysval_nir(b, instr, - nir_dest_num_components(instr->dest), 0); - break; - - case nir_intrinsic_load_blend_const_color_rgba: - bi_load_sysval_nir(b, instr, - nir_dest_num_components(instr->dest), 0); - break; - - case 
nir_intrinsic_load_sample_positions_pan: - bi_collect_v2i32_to(b, dst, - bi_fau(BIR_FAU_SAMPLE_POS_ARRAY, false), - bi_fau(BIR_FAU_SAMPLE_POS_ARRAY, true)); - break; - - case nir_intrinsic_load_sample_mask_in: - /* r61[0:15] contains the coverage bitmap */ - bi_u16_to_u32_to(b, dst, bi_half(bi_preload(b, 61), false)); - break; - - case nir_intrinsic_load_sample_id: - bi_load_sample_id_to(b, dst); - break; - - case nir_intrinsic_load_front_face: - /* r58 == 0 means primitive is front facing */ - bi_icmp_i32_to(b, dst, bi_preload(b, 58), bi_zero(), BI_CMPF_EQ, - BI_RESULT_TYPE_M1); - break; - - case nir_intrinsic_load_point_coord: - bi_ld_var_special_to(b, dst, bi_zero(), BI_REGISTER_FORMAT_F32, - BI_SAMPLE_CENTER, BI_UPDATE_CLOBBER, - BI_VARYING_NAME_POINT, BI_VECSIZE_V2); - bi_emit_cached_split_i32(b, dst, 2); - break; - - /* It appears vertex_id is zero-based with Bifrost geometry flows, but - * not with Valhall's memory-allocation IDVS geometry flow. Ostensibly - * we support the legacy geometry flow even on Valhall, so - * vertex_id_zero_based isn't a machine property for us. Don't set it, - * and lower here if needed. - */ - case nir_intrinsic_load_vertex_id: - if (b->shader->malloc_idvs) { - bi_mov_i32_to(b, dst, bi_vertex_id(b)); - } else { - bi_index first = bi_load_sysval(b, - PAN_SYSVAL_VERTEX_INSTANCE_OFFSETS, - 1, 0); - - bi_iadd_u32_to(b, dst, bi_vertex_id(b), first, false); - } - - break; - - /* We only use in our transform feedback lowering */ - case nir_intrinsic_load_vertex_id_zero_base: - assert(b->shader->nir->info.has_transform_feedback_varyings); - bi_mov_i32_to(b, dst, bi_vertex_id(b)); - break; - - case nir_intrinsic_load_instance_id: - bi_mov_i32_to(b, dst, bi_instance_id(b)); - break; - - case nir_intrinsic_load_subgroup_invocation: - bi_mov_i32_to(b, dst, bi_fau(BIR_FAU_LANE_ID, false)); - break; - - case nir_intrinsic_load_local_invocation_id: - bi_collect_v3i32_to(b, dst, - bi_u16_to_u32(b, bi_half(bi_preload(b, 55), 0)), - bi_u16_to_u32(b, bi_half(bi_preload(b, 55), 1)), - bi_u16_to_u32(b, bi_half(bi_preload(b, 56), 0))); - break; - - case nir_intrinsic_load_workgroup_id: - bi_collect_v3i32_to(b, dst, bi_preload(b, 57), bi_preload(b, 58), - bi_preload(b, 59)); - break; - - case nir_intrinsic_load_global_invocation_id: - case nir_intrinsic_load_global_invocation_id_zero_base: - bi_collect_v3i32_to(b, dst, bi_preload(b, 60), bi_preload(b, 61), - bi_preload(b, 62)); - break; - - case nir_intrinsic_shader_clock: - bi_ld_gclk_u64_to(b, dst, BI_SOURCE_CYCLE_COUNTER); - bi_split_dest(b, instr->dest); - break; - - default: - fprintf(stderr, "Unhandled intrinsic %s\n", nir_intrinsic_infos[instr->intrinsic].name); - assert(0); - } -} - -static void -bi_emit_load_const(bi_builder *b, nir_load_const_instr *instr) -{ - /* Make sure we've been lowered */ - assert(instr->def.num_components <= (32 / instr->def.bit_size)); - - /* Accumulate all the channels of the constant, as if we did an - * implicit SEL over them */ - uint32_t acc = 0; - - for (unsigned i = 0; i < instr->def.num_components; ++i) { - unsigned v = nir_const_value_as_uint(instr->value[i], instr->def.bit_size); - acc |= (v << (i * instr->def.bit_size)); - } - - bi_mov_i32_to(b, bi_get_index(instr->def.index), bi_imm_u32(acc)); -} - -static bi_index -bi_alu_src_index(bi_builder *b, nir_alu_src src, unsigned comps) -{ - /* we don't lower modifiers until the backend */ - assert(!(src.negate || src.abs)); - - unsigned bitsize = nir_src_bit_size(src.src); - - /* the bi_index carries the 32-bit (word) offset separate 
from the - * subword swizzle, first handle the offset */ - - unsigned offset = 0; - - assert(bitsize == 8 || bitsize == 16 || bitsize == 32); - unsigned subword_shift = (bitsize == 32) ? 0 : (bitsize == 16) ? 1 : 2; - - for (unsigned i = 0; i < comps; ++i) { - unsigned new_offset = (src.swizzle[i] >> subword_shift); - - if (i > 0) - assert(offset == new_offset && "wrong vectorization"); - - offset = new_offset; - } - - bi_index idx = bi_extract(b, bi_src_index(&src.src), offset); - - /* Compose the subword swizzle with existing (identity) swizzle */ - assert(idx.swizzle == BI_SWIZZLE_H01); - - /* Bigger vectors should have been lowered */ - assert(comps <= (1 << subword_shift)); - - if (bitsize == 16) { - unsigned c0 = src.swizzle[0] & 1; - unsigned c1 = (comps > 1) ? src.swizzle[1] & 1 : c0; - idx.swizzle = BI_SWIZZLE_H00 + c1 + (c0 << 1); - } else if (bitsize == 8) { - /* 8-bit vectors not yet supported */ - assert(comps == 1 && "8-bit vectors not supported"); - idx.swizzle = BI_SWIZZLE_B0000 + (src.swizzle[0] & 3); - } - - return idx; -} - -static enum bi_round -bi_nir_round(nir_op op) -{ - switch (op) { - case nir_op_fround_even: return BI_ROUND_NONE; - case nir_op_ftrunc: return BI_ROUND_RTZ; - case nir_op_fceil: return BI_ROUND_RTP; - case nir_op_ffloor: return BI_ROUND_RTN; - default: unreachable("invalid nir round op"); - } -} - -/* Convenience for lowered transcendentals */ - -static bi_index -bi_fmul_f32(bi_builder *b, bi_index s0, bi_index s1) -{ - return bi_fma_f32(b, s0, s1, bi_imm_f32(-0.0f)); -} - -/* Approximate with FRCP_APPROX.f32 and apply a single iteration of - * Newton-Raphson to improve precision */ - -static void -bi_lower_frcp_32(bi_builder *b, bi_index dst, bi_index s0) -{ - bi_index x1 = bi_frcp_approx_f32(b, s0); - bi_index m = bi_frexpm_f32(b, s0, false, false); - bi_index e = bi_frexpe_f32(b, bi_neg(s0), false, false); - bi_index t1 = bi_fma_rscale_f32(b, m, bi_neg(x1), bi_imm_f32(1.0), - bi_zero(), BI_SPECIAL_N); - bi_fma_rscale_f32_to(b, dst, t1, x1, x1, e, BI_SPECIAL_NONE); -} - -static void -bi_lower_frsq_32(bi_builder *b, bi_index dst, bi_index s0) -{ - bi_index x1 = bi_frsq_approx_f32(b, s0); - bi_index m = bi_frexpm_f32(b, s0, false, true); - bi_index e = bi_frexpe_f32(b, bi_neg(s0), false, true); - bi_index t1 = bi_fmul_f32(b, x1, x1); - bi_index t2 = bi_fma_rscale_f32(b, m, bi_neg(t1), bi_imm_f32(1.0), - bi_imm_u32(-1), BI_SPECIAL_N); - bi_fma_rscale_f32_to(b, dst, t2, x1, x1, e, BI_SPECIAL_N); -} - -/* More complex transcendentals, see - * https://gitlab.freedesktop.org/panfrost/mali-isa-docs/-/blob/master/Bifrost.adoc - * for documentation */ - -static void -bi_lower_fexp2_32(bi_builder *b, bi_index dst, bi_index s0) -{ - bi_index t1 = bi_temp(b->shader); - bi_instr *t1_instr = bi_fadd_f32_to(b, t1, s0, bi_imm_u32(0x49400000)); - t1_instr->clamp = BI_CLAMP_CLAMP_0_INF; - - bi_index t2 = bi_fadd_f32(b, t1, bi_imm_u32(0xc9400000)); - - bi_instr *a2 = bi_fadd_f32_to(b, bi_temp(b->shader), s0, bi_neg(t2)); - a2->clamp = BI_CLAMP_CLAMP_M1_1; - - bi_index a1t = bi_fexp_table_u4(b, t1, BI_ADJ_NONE); - bi_index t3 = bi_isub_u32(b, t1, bi_imm_u32(0x49400000), false); - bi_index a1i = bi_arshift_i32(b, t3, bi_null(), bi_imm_u8(4)); - bi_index p1 = bi_fma_f32(b, a2->dest[0], bi_imm_u32(0x3d635635), - bi_imm_u32(0x3e75fffa)); - bi_index p2 = bi_fma_f32(b, p1, a2->dest[0], bi_imm_u32(0x3f317218)); - bi_index p3 = bi_fmul_f32(b, a2->dest[0], p2); - bi_instr *x = bi_fma_rscale_f32_to(b, bi_temp(b->shader), - p3, a1t, a1t, a1i, BI_SPECIAL_NONE); - x->clamp = 
BI_CLAMP_CLAMP_0_INF; - - bi_instr *max = bi_fmax_f32_to(b, dst, x->dest[0], s0); - max->sem = BI_SEM_NAN_PROPAGATE; -} - -static void -bi_fexp_32(bi_builder *b, bi_index dst, bi_index s0, bi_index log2_base) -{ - /* Scale by base, Multiply by 2*24 and convert to integer to get a 8:24 - * fixed-point input */ - bi_index scale = bi_fma_rscale_f32(b, s0, log2_base, bi_negzero(), - bi_imm_u32(24), BI_SPECIAL_NONE); - bi_instr *fixed_pt = bi_f32_to_s32_to(b, bi_temp(b->shader), scale); - fixed_pt->round = BI_ROUND_NONE; // XXX - - /* Compute the result for the fixed-point input, but pass along - * the floating-point scale for correct NaN propagation */ - bi_fexp_f32_to(b, dst, fixed_pt->dest[0], scale); -} - -static void -bi_lower_flog2_32(bi_builder *b, bi_index dst, bi_index s0) -{ - /* s0 = a1 * 2^e, with a1 in [0.75, 1.5) */ - bi_index a1 = bi_frexpm_f32(b, s0, true, false); - bi_index ei = bi_frexpe_f32(b, s0, true, false); - bi_index ef = bi_s32_to_f32(b, ei); - - /* xt estimates -log(r1), a coarse approximation of log(a1) */ - bi_index r1 = bi_flog_table_f32(b, s0, BI_MODE_RED, BI_PRECISION_NONE); - bi_index xt = bi_flog_table_f32(b, s0, BI_MODE_BASE2, BI_PRECISION_NONE); - - /* log(s0) = log(a1 * 2^e) = e + log(a1) = e + log(a1 * r1) - - * log(r1), so let x1 = e - log(r1) ~= e + xt and x2 = log(a1 * r1), - * and then log(s0) = x1 + x2 */ - bi_index x1 = bi_fadd_f32(b, ef, xt); - - /* Since a1 * r1 is close to 1, x2 = log(a1 * r1) may be computed by - * polynomial approximation around 1. The series is expressed around - * 1, so set y = (a1 * r1) - 1.0 */ - bi_index y = bi_fma_f32(b, a1, r1, bi_imm_f32(-1.0)); - - /* x2 = log_2(1 + y) = log_e(1 + y) * (1/log_e(2)), so approximate - * log_e(1 + y) by the Taylor series (lower precision than the blob): - * y - y^2/2 + O(y^3) = y(1 - y/2) + O(y^3) */ - bi_index loge = bi_fmul_f32(b, y, - bi_fma_f32(b, y, bi_imm_f32(-0.5), bi_imm_f32(1.0))); - - bi_index x2 = bi_fmul_f32(b, loge, bi_imm_f32(1.0 / logf(2.0))); - - /* log(s0) = x1 + x2 */ - bi_fadd_f32_to(b, dst, x1, x2); -} + *srcs[i] = SSA_INVALID_VALUE; + } + emit_mir_instruction(ctx, instr); +} + +static uint32_t +emit_extract_vector_element(struct compiler_context *ctx, unsigned ssa_vector, unsigned element) +{ + uint32_t mir_temp_location = alloc_mir_temp(ctx); + // This instruction loads a vec3 starting from the initial register + struct bifrost_instruction instr = { + .op = op_extract_element, + .dest_components = 1, + .ssa_args = { + .dest = mir_temp_location, + .src0 = ssa_vector, + .src1 = SSA_INVALID_VALUE, + .src2 = SSA_INVALID_VALUE, + .src3 = SSA_INVALID_VALUE, + }, + .literal_args[0] = element, + }; + emit_mir_instruction(ctx, instr); + + return mir_temp_location; +} +static uint32_t +emit_movi(struct compiler_context *ctx, uint32_t literal) +{ + uint32_t mir_temp_location = alloc_mir_temp(ctx); + // This instruction loads a vec3 starting from the initial register + struct bifrost_instruction instr = { + .op = op_movi, + .dest_components = 1, + .ssa_args = { + .dest = mir_temp_location, + .src0 = SSA_INVALID_VALUE, + .src1 = SSA_INVALID_VALUE, + .src2 = SSA_INVALID_VALUE, + .src3 = SSA_INVALID_VALUE, + }, + .literal_args[0] = literal, + }; + emit_mir_instruction(ctx, instr); -static void -bi_flog2_32(bi_builder *b, bi_index dst, bi_index s0) -{ - bi_index frexp = bi_frexpe_f32(b, s0, true, false); - bi_index frexpi = bi_s32_to_f32(b, frexp); - bi_index add = bi_fadd_lscale_f32(b, bi_imm_f32(-1.0f), s0); - bi_fma_f32_to(b, dst, bi_flogd_f32(b, s0), add, frexpi); + return 
mir_temp_location; } -static void -bi_lower_fpow_32(bi_builder *b, bi_index dst, bi_index base, bi_index exp) +static unsigned +nir_alu_src_index_scalar(compiler_context *ctx, nir_alu_instr *nir_instr, unsigned src) { - bi_index log2_base = bi_null(); + // NIR uses a combination of single channels plus swizzles to determine which component is pulled out of a source + for (unsigned c = 0; c < NIR_MAX_VEC_COMPONENTS; c++) { + if (!nir_alu_instr_channel_used(nir_instr, src, c)) + continue; + // Pull the swizzle from this element that is active and use it as the source + unsigned element = nir_instr->src[src].swizzle[c]; - if (base.type == BI_INDEX_CONSTANT) { - log2_base = bi_imm_f32(log2f(uif(base.value))); - } else { - log2_base = bi_temp(b->shader); - bi_lower_flog2_32(b, log2_base, base); + // Create an op that extracts an element from a vector + return emit_extract_vector_element(ctx, nir_alu_src_index(ctx, &nir_instr->src[src]), element); } - - return bi_lower_fexp2_32(b, dst, bi_fmul_f32(b, exp, log2_base)); + assert(0); + return 0; } static void -bi_fpow_32(bi_builder *b, bi_index dst, bi_index base, bi_index exp) +emit_intrinsic(struct compiler_context *ctx, nir_intrinsic_instr *nir_instr) { - bi_index log2_base = bi_null(); - - if (base.type == BI_INDEX_CONSTANT) { - log2_base = bi_imm_f32(log2f(uif(base.value))); - } else { - log2_base = bi_temp(b->shader); - bi_flog2_32(b, log2_base, base); - } - - return bi_fexp_32(b, dst, exp, log2_base); -} - -/* Bifrost has extremely coarse tables for approximating sin/cos, accessible as - * FSIN/COS_TABLE.u6, which multiplies the bottom 6-bits by pi/32 and - * calculates the results. We use them to calculate sin/cos via a Taylor - * approximation: - * - * f(x + e) = f(x) + e f'(x) + (e^2)/2 f''(x) - * sin(x + e) = sin(x) + e cos(x) - (e^2)/2 sin(x) - * cos(x + e) = cos(x) - e sin(x) - (e^2)/2 cos(x) - */ - -#define TWO_OVER_PI bi_imm_f32(2.0f / 3.14159f) -#define MPI_OVER_TWO bi_imm_f32(-3.14159f / 2.0) -#define SINCOS_BIAS bi_imm_u32(0x49400000) + nir_const_value *const_offset; + unsigned offset, reg; -static void -bi_lower_fsincos_32(bi_builder *b, bi_index dst, bi_index s0, bool cos) -{ - /* bottom 6-bits of result times pi/32 approximately s0 mod 2pi */ - bi_index x_u6 = bi_fma_f32(b, s0, TWO_OVER_PI, SINCOS_BIAS); - - /* Approximate domain error (small) */ - bi_index e = bi_fma_f32(b, bi_fadd_f32(b, x_u6, bi_neg(SINCOS_BIAS)), - MPI_OVER_TWO, s0); - - /* Lookup sin(x), cos(x) */ - bi_index sinx = bi_fsin_table_u6(b, x_u6, false); - bi_index cosx = bi_fcos_table_u6(b, x_u6, false); - - /* e^2 / 2 */ - bi_index e2_over_2 = bi_fma_rscale_f32(b, e, e, bi_negzero(), - bi_imm_u32(-1), BI_SPECIAL_NONE); - - /* (-e^2)/2 f''(x) */ - bi_index quadratic = bi_fma_f32(b, bi_neg(e2_over_2), - cos ? cosx : sinx, - bi_negzero()); - - /* e f'(x) - (e^2/2) f''(x) */ - bi_instr *I = bi_fma_f32_to(b, bi_temp(b->shader), e, - cos ? bi_neg(sinx) : cosx, - quadratic); - I->clamp = BI_CLAMP_CLAMP_M1_1; - - /* f(x) + e f'(x) - (e^2/2) f''(x) */ - bi_fadd_f32_to(b, dst, I->dest[0], cos ? cosx : sinx); -} + switch (nir_instr->intrinsic) { + case nir_intrinsic_load_ubo: { + nir_const_value *location = nir_src_as_const_value(nir_instr->src[0]); + const_offset = nir_src_as_const_value(nir_instr->src[1]); + assert (location && "no indirect ubo selection"); + assert (const_offset && "no indirect inputs"); -/* - * The XOR lane op is useful for derivative calculations, but not all Bifrost - * implementations have it. 
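(Editor's aside on the FSIN/COS_TABLE lowering above: the fix-up applied after the table lookup is exactly the second-order Taylor expansion quoted in the comment, sin(x + e) = sin(x) + e cos(x) - (e^2/2) sin(x). The following standalone C sketch is not part of the patch; the angle x and residual e are made-up values chosen to stay within one pi/32 table step, and it only illustrates how small the remaining O(e^3) error is.)

        /* Illustration only: compare the Taylor-corrected value against libm. */
        #include <math.h>
        #include <stdio.h>

        int main(void)
        {
                float x = (3.14159f / 32.0f) * 5.0f; /* a coarse table angle */
                float e = 0.02f;                     /* small residual, |e| < pi/64 */

                float approx = sinf(x) + e * cosf(x) - (e * e / 2.0f) * sinf(x);
                float exact  = sinf(x + e);

                /* error is O(e^3), well below fp32 precision for this e */
                printf("approx=%f exact=%f err=%g\n", approx, exact, approx - exact);
                return 0;
        }

(The same correction with cos/sin swapped and the sign flipped gives the cosine path, as in the comment above.)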
Add a safe helper that uses the hardware - * functionality when available and lowers where unavailable. - */ -static bi_index -bi_clper_xor(bi_builder *b, bi_index s0, bi_index s1) -{ - if (!(b->shader->quirks & BIFROST_LIMITED_CLPER)) { - return bi_clper_i32(b, s0, s1, - BI_INACTIVE_RESULT_ZERO, BI_LANE_OP_XOR, - BI_SUBGROUP_SUBGROUP4); - } + enum bifrost_ir_ops op; - bi_index lane_id = bi_fau(BIR_FAU_LANE_ID, false); - bi_index lane = bi_lshift_xor_i32(b, lane_id, s1, bi_imm_u8(0)); - return bi_clper_old_i32(b, s0, lane); -} - -static enum bi_cmpf -bi_translate_cmpf(nir_op op) -{ - switch (op) { - case nir_op_ieq8: - case nir_op_ieq16: - case nir_op_ieq32: - case nir_op_feq16: - case nir_op_feq32: - return BI_CMPF_EQ; - - case nir_op_ine8: - case nir_op_ine16: - case nir_op_ine32: - case nir_op_fneu16: - case nir_op_fneu32: - return BI_CMPF_NE; - - case nir_op_ilt8: - case nir_op_ilt16: - case nir_op_ilt32: - case nir_op_flt16: - case nir_op_flt32: - case nir_op_ult8: - case nir_op_ult16: - case nir_op_ult32: - return BI_CMPF_LT; - - case nir_op_ige8: - case nir_op_ige16: - case nir_op_ige32: - case nir_op_fge16: - case nir_op_fge32: - case nir_op_uge8: - case nir_op_uge16: - case nir_op_uge32: - return BI_CMPF_GE; - - default: - unreachable("invalid comparison"); - } -} - -static bool -bi_nir_is_replicated(nir_alu_src *src) -{ - for (unsigned i = 1; i < nir_src_num_components(src->src); ++i) { - if (src->swizzle[0] == src->swizzle[i]) - return false; - } - - return true; -} - -static void -bi_emit_alu(bi_builder *b, nir_alu_instr *instr) -{ - bi_index dst = bi_dest_index(&instr->dest.dest); - unsigned srcs = nir_op_infos[instr->op].num_inputs; - unsigned sz = nir_dest_bit_size(instr->dest.dest); - unsigned comps = nir_dest_num_components(instr->dest.dest); - unsigned src_sz = srcs > 0 ? nir_src_bit_size(instr->src[0].src) : 0; - - /* Indicate scalarness */ - if (sz == 16 && comps == 1) - dst.swizzle = BI_SWIZZLE_H00; - - /* First, match against the various moves in NIR. These are - * special-cased because they can operate on vectors even after - * lowering ALU to scalar. For Bifrost, bi_alu_src_index assumes the - * instruction is no "bigger" than SIMD-within-a-register. These moves - * are the exceptions that need to handle swizzles specially. 
*/ - - switch (instr->op) { - case nir_op_vec2: - case nir_op_vec3: - case nir_op_vec4: - case nir_op_vec8: - case nir_op_vec16: { - bi_index unoffset_srcs[16] = { bi_null() }; - unsigned channels[16] = { 0 }; - - for (unsigned i = 0; i < srcs; ++i) { - unoffset_srcs[i] = bi_src_index(&instr->src[i].src); - channels[i] = instr->src[i].swizzle[0]; + // load_ubo <UBO binding>, <byte offset> + // ld_ubo <byte offset>, <UBO binding> + switch (nir_dest_num_components(nir_instr->dest)) { + case 1: + op = op_ld_ubo_v1; + break; + case 2: + op = op_ld_ubo_v2; + break; + case 3: + op = op_ld_ubo_v3; + break; + case 4: + op = op_ld_ubo_v4; + break; + default: + assert(0); + break; } - bi_make_vec_to(b, dst, unoffset_srcs, channels, srcs, sz); - return; - } - - case nir_op_unpack_32_2x16: { - /* Should have been scalarized */ - assert(comps == 2 && sz == 16); - - bi_index vec = bi_src_index(&instr->src[0].src); - unsigned chan = instr->src[0].swizzle[0]; - - bi_mov_i32_to(b, dst, bi_extract(b, vec, chan)); - return; - } - - case nir_op_unpack_64_2x32_split_x: - { - unsigned chan = (instr->src[0].swizzle[0] * 2) + 0; - bi_mov_i32_to(b, dst, bi_extract(b, bi_src_index(&instr->src[0].src), chan)); - return; - } - - case nir_op_unpack_64_2x32_split_y: - { - unsigned chan = (instr->src[0].swizzle[0] * 2) + 1; - bi_mov_i32_to(b, dst, bi_extract(b, bi_src_index(&instr->src[0].src), chan)); - return; - } - - case nir_op_pack_64_2x32_split: - bi_collect_v2i32_to(b, dst, - bi_extract(b, bi_src_index(&instr->src[0].src), instr->src[0].swizzle[0]), - bi_extract(b, bi_src_index(&instr->src[1].src), instr->src[1].swizzle[0])); - return; - - case nir_op_pack_64_2x32: - bi_collect_v2i32_to(b, dst, - bi_extract(b, bi_src_index(&instr->src[0].src), 0), - bi_extract(b, bi_src_index(&instr->src[0].src), 1)); - return; - - case nir_op_pack_uvec2_to_uint: { - bi_index src = bi_src_index(&instr->src[0].src); - - assert(sz == 32 && src_sz == 32); - bi_mkvec_v2i16_to(b, dst, bi_half(bi_extract(b, src, 0), false), - bi_half(bi_extract(b, src, 1), false)); - return; - } - - case nir_op_pack_uvec4_to_uint: { - bi_index src = bi_src_index(&instr->src[0].src); - - assert(sz == 32 && src_sz == 32); - bi_mkvec_v4i8_to(b, dst, bi_byte(bi_extract(b, src, 0), 0), - bi_byte(bi_extract(b, src, 1), 0), - bi_byte(bi_extract(b, src, 2), 0), - bi_byte(bi_extract(b, src, 3), 0)); - return; - } - - case nir_op_mov: { - bi_index idx = bi_src_index(&instr->src[0].src); - bi_index unoffset_srcs[4] = { idx, idx, idx, idx }; - - unsigned channels[4] = { - comps > 0 ? instr->src[0].swizzle[0] : 0, - comps > 1 ? instr->src[0].swizzle[1] : 0, - comps > 2 ? instr->src[0].swizzle[2] : 0, - comps > 3 ? 
instr->src[0].swizzle[3] : 0, - }; - - bi_make_vec_to(b, dst, unoffset_srcs, channels, comps, src_sz); - return; - } - - case nir_op_pack_32_2x16: { - assert(comps == 1); - - bi_index idx = bi_src_index(&instr->src[0].src); - bi_index unoffset_srcs[4] = { idx, idx, idx, idx }; - - unsigned channels[2] = { - instr->src[0].swizzle[0], - instr->src[0].swizzle[1] + reg = nir_dest_index(ctx, &nir_instr->dest); + struct bifrost_instruction instr = { + .op = op, + .dest_components = nir_dest_num_components(nir_instr->dest), + .ssa_args = { + .dest = reg, + .src0 = SSA_INVALID_VALUE, + .src1 = SSA_INVALID_VALUE, + .src2 = SSA_INVALID_VALUE, + .src3 = SSA_INVALID_VALUE, + }, + .literal_args[0] = nir_src_as_uint(nir_instr->src[1]), + .literal_args[1] = nir_src_as_uint(nir_instr->src[0]), }; - bi_make_vec_to(b, dst, unoffset_srcs, channels, 2, 16); - return; + emit_mir_instruction(ctx, instr); + break; } + case nir_intrinsic_store_ssbo: { + nir_const_value *location = nir_src_as_const_value(nir_instr->src[1]); + const_offset = nir_src_as_const_value(nir_instr->src[2]); + assert (location && "no indirect ubo selection"); + assert (const_offset && "no indirect inputs"); - case nir_op_f2f16: - case nir_op_f2f16_rtz: - case nir_op_f2f16_rtne: { - assert(src_sz == 32); - bi_index idx = bi_src_index(&instr->src[0].src); - bi_index s0 = bi_extract(b, idx, instr->src[0].swizzle[0]); - bi_index s1 = comps > 1 ? - bi_extract(b, idx, instr->src[0].swizzle[1]) : s0; - - bi_instr *I = bi_v2f32_to_v2f16_to(b, dst, s0, s1); - - /* Override rounding if explicitly requested. Otherwise, the - * default rounding mode is selected by the builder. Depending - * on the float controls required by the shader, the default - * mode may not be nearest-even. - */ - if (instr->op == nir_op_f2f16_rtz) - I->round = BI_ROUND_RTZ; - else if (instr->op == nir_op_f2f16_rtne) - I->round = BI_ROUND_NONE; /* Nearest even */ + // store_ssbo <Value>, <binding>, <offset> + // store_vN <Addr>, <Value> + reg = nir_src_index(ctx, &nir_instr->src[0]); - return; - } - - /* Vectorized downcasts */ - case nir_op_u2u16: - case nir_op_i2i16: { - if (!(src_sz == 32 && comps == 2)) + enum bifrost_ir_ops op; + switch (nir_src_num_components(nir_instr->src[0])) { + case 1: + op = op_store_v1; break; - - bi_index idx = bi_src_index(&instr->src[0].src); - bi_index s0 = bi_extract(b, idx, instr->src[0].swizzle[0]); - bi_index s1 = bi_extract(b, idx, instr->src[0].swizzle[1]); - - bi_mkvec_v2i16_to(b, dst, - bi_half(s0, false), bi_half(s1, false)); - return; - } - - /* While we do not have a direct V2U32_TO_V2F16 instruction, lowering to - * MKVEC.v2i16 + V2U16_TO_V2F16 is more efficient on Bifrost than - * scalarizing due to scheduling (equal cost on Valhall). Additionally - * if the source is replicated the MKVEC.v2i16 can be optimized out. - */ - case nir_op_u2f16: - case nir_op_i2f16: { - if (!(src_sz == 32 && comps == 2)) + case 2: + op = op_store_v2; break; - - nir_alu_src *src = &instr->src[0]; - bi_index idx = bi_src_index(&src->src); - bi_index s0 = bi_extract(b, idx, src->swizzle[0]); - bi_index s1 = bi_extract(b, idx, src->swizzle[1]); - - bi_index t = (src->swizzle[0] == src->swizzle[1]) ? 
- bi_half(s0, false) : - bi_mkvec_v2i16(b, bi_half(s0, false), - bi_half(s1, false)); - - if (instr->op == nir_op_u2f16) - bi_v2u16_to_v2f16_to(b, dst, t); - else - bi_v2s16_to_v2f16_to(b, dst, t); - - return; - } - - case nir_op_i2i8: - case nir_op_u2u8: - { - /* Acts like an 8-bit swizzle */ - bi_index idx = bi_src_index(&instr->src[0].src); - unsigned factor = src_sz / 8; - unsigned chan[4] = { 0 }; - - for (unsigned i = 0; i < comps; ++i) - chan[i] = instr->src[0].swizzle[i] * factor; - - bi_make_vec_to(b, dst, &idx, chan, comps, 8); - return; - } - - case nir_op_b32csel: - { - if (sz != 16) + case 3: + op = op_store_v3; + break; + case 4: + op = op_store_v4; + break; + default: + assert(0); break; - - /* We allow vectorizing b32csel(cond, A, B) which can be - * translated as MUX.v2i16, even though cond is a 32-bit vector. - * - * If the source condition vector is replicated, we can use - * MUX.v2i16 directly, letting each component use the - * corresponding half of the 32-bit source. NIR uses 0/~0 - * booleans so that's guaranteed to work (that is, 32-bit NIR - * booleans are 16-bit replicated). - * - * If we're not replicated, we use the same trick but must - * insert a MKVEC.v2i16 first to convert down to 16-bit. - */ - bi_index idx = bi_src_index(&instr->src[0].src); - bi_index s0 = bi_extract(b, idx, instr->src[0].swizzle[0]); - bi_index s1 = bi_alu_src_index(b, instr->src[1], comps); - bi_index s2 = bi_alu_src_index(b, instr->src[2], comps); - - if (!bi_nir_is_replicated(&instr->src[0])) { - s0 = bi_mkvec_v2i16(b, bi_half(s0, false), - bi_half(bi_extract(b, idx, instr->src[0].swizzle[1]), false)); } - bi_mux_v2i16_to(b, dst, s2, s1, s0, BI_MUX_INT_ZERO); - return; - } - - default: - break; - } - - bi_index s0 = srcs > 0 ? bi_alu_src_index(b, instr->src[0], comps) : bi_null(); - bi_index s1 = srcs > 1 ? bi_alu_src_index(b, instr->src[1], comps) : bi_null(); - bi_index s2 = srcs > 2 ? 
bi_alu_src_index(b, instr->src[2], comps) : bi_null(); - - switch (instr->op) { - case nir_op_ffma: - bi_fma_to(b, sz, dst, s0, s1, s2); - break; - - case nir_op_fmul: - bi_fma_to(b, sz, dst, s0, s1, bi_negzero()); - break; - - case nir_op_fsub: - s1 = bi_neg(s1); - FALLTHROUGH; - case nir_op_fadd: - bi_fadd_to(b, sz, dst, s0, s1); - break; - - case nir_op_fsat: { - bi_instr *I = bi_fclamp_to(b, sz, dst, s0); - I->clamp = BI_CLAMP_CLAMP_0_1; - break; - } - - case nir_op_fsat_signed_mali: { - bi_instr *I = bi_fclamp_to(b, sz, dst, s0); - I->clamp = BI_CLAMP_CLAMP_M1_1; - break; - } - - case nir_op_fclamp_pos_mali: { - bi_instr *I = bi_fclamp_to(b, sz, dst, s0); - I->clamp = BI_CLAMP_CLAMP_0_INF; + struct bifrost_instruction instr = { + .op = op, + .dest_components = 0, + .ssa_args = { + .dest = SSA_INVALID_VALUE, + .src0 = reg, + .src1 = SSA_INVALID_VALUE, + .src2 = SSA_INVALID_VALUE, + .src3 = SSA_INVALID_VALUE, + }, + .literal_args[0] = nir_src_as_uint(nir_instr->src[2]), + }; + emit_mir_instruction(ctx, instr); break; } + case nir_intrinsic_load_uniform: + offset = nir_intrinsic_base(nir_instr); - case nir_op_fneg: - bi_fabsneg_to(b, sz, dst, bi_neg(s0)); - break; - - case nir_op_fabs: - bi_fabsneg_to(b, sz, dst, bi_abs(s0)); - break; - - case nir_op_fsin: - bi_lower_fsincos_32(b, dst, s0, false); - break; - - case nir_op_fcos: - bi_lower_fsincos_32(b, dst, s0, true); - break; - - case nir_op_fexp2: - assert(sz == 32); /* should've been lowered */ - - if (b->shader->quirks & BIFROST_NO_FP32_TRANSCENDENTALS) - bi_lower_fexp2_32(b, dst, s0); - else - bi_fexp_32(b, dst, s0, bi_imm_f32(1.0f)); - - break; - - case nir_op_flog2: - assert(sz == 32); /* should've been lowered */ - - if (b->shader->quirks & BIFROST_NO_FP32_TRANSCENDENTALS) - bi_lower_flog2_32(b, dst, s0); - else - bi_flog2_32(b, dst, s0); - - break; - - case nir_op_fpow: - assert(sz == 32); /* should've been lowered */ - - if (b->shader->quirks & BIFROST_NO_FP32_TRANSCENDENTALS) - bi_lower_fpow_32(b, dst, s0, s1); - else - bi_fpow_32(b, dst, s0, s1); - - break; - - case nir_op_frexp_exp: - bi_frexpe_to(b, sz, dst, s0, false, false); - break; - - case nir_op_frexp_sig: - bi_frexpm_to(b, sz, dst, s0, false, false); - break; - - case nir_op_ldexp: - bi_ldexp_to(b, sz, dst, s0, s1); - break; - - case nir_op_b8csel: - bi_mux_v4i8_to(b, dst, s2, s1, s0, BI_MUX_INT_ZERO); - break; - - case nir_op_b16csel: - bi_mux_v2i16_to(b, dst, s2, s1, s0, BI_MUX_INT_ZERO); - break; - - case nir_op_b32csel: - bi_mux_i32_to(b, dst, s2, s1, s0, BI_MUX_INT_ZERO); - break; - - case nir_op_extract_u8: - case nir_op_extract_i8: { - assert(comps == 1 && "should be scalarized"); - assert((src_sz == 16 || src_sz == 32) && "should be lowered"); - unsigned byte = nir_src_as_uint(instr->src[1].src); - - if (s0.swizzle == BI_SWIZZLE_H11) { - assert(byte < 2); - byte += 2; - } else if (s0.swizzle != BI_SWIZZLE_H01) { - assert(s0.swizzle == BI_SWIZZLE_H00); + if (nir_src_is_const(nir_instr->src[0])) { + offset += nir_src_as_uint(nir_instr->src[0]); + } else { + assert(0 && "Can't handle indirect load_uniform"); } - assert(byte < 4); - - s0.swizzle = BI_SWIZZLE_H01; - - if (instr->op == nir_op_extract_i8) - bi_s8_to_s32_to(b, dst, bi_byte(s0, byte)); - else - bi_u8_to_u32_to(b, dst, bi_byte(s0, byte)); - break; - } - - case nir_op_extract_u16: - case nir_op_extract_i16: { - assert(comps == 1 && "should be scalarized"); - assert(src_sz == 32 && "should be lowered"); - unsigned half = nir_src_as_uint(instr->src[1].src); - assert(half == 0 || half == 1); - - if 
(instr->op == nir_op_extract_i16) - bi_s16_to_s32_to(b, dst, bi_half(s0, half)); - else - bi_u16_to_u32_to(b, dst, bi_half(s0, half)); - break; - } - - case nir_op_insert_u16: { - assert(comps == 1 && "should be scalarized"); - unsigned half = nir_src_as_uint(instr->src[1].src); - assert(half == 0 || half == 1); - - if (half == 0) - bi_u16_to_u32_to(b, dst, bi_half(s0, 0)); - else - bi_mkvec_v2i16_to(b, dst, bi_imm_u16(0), bi_half(s0, 0)); - break; - } - - case nir_op_ishl: - bi_lshift_or_to(b, sz, dst, s0, bi_zero(), bi_byte(s1, 0)); - break; - case nir_op_ushr: - bi_rshift_or_to(b, sz, dst, s0, bi_zero(), bi_byte(s1, 0), false); - break; + reg = nir_dest_index(ctx, &nir_instr->dest); + + unsigned num_components = nir_dest_num_components(nir_instr->dest); + if (num_components == 1) { + struct bifrost_instruction instr = { + .op = op_mov, + .dest_components = 1, + .ssa_args = { + .dest = reg, + .src0 = SSA_FIXED_UREGISTER(offset), + .src1 = SSA_INVALID_VALUE, + .src2 = SSA_INVALID_VALUE, + .src3 = SSA_INVALID_VALUE, + }, + }; + emit_mir_instruction(ctx, instr); + } else { + uint32_t comps[4]; + + for (unsigned i = 0; i < nir_dest_num_components(nir_instr->dest); ++i) { + uint32_t temp_dest = alloc_mir_temp(ctx); + comps[i] = temp_dest; + struct bifrost_instruction instr = { + .op = op_mov, + .dest_components = 1, + .ssa_args = { + .dest = temp_dest, + .src0 = SSA_FIXED_UREGISTER(offset + (i * 4)), + .src1 = SSA_INVALID_VALUE, + .src2 = SSA_INVALID_VALUE, + .src3 = SSA_INVALID_VALUE, + }, + }; + emit_mir_instruction(ctx, instr); + } - case nir_op_ishr: - if (b->shader->arch >= 9) - bi_rshift_or_to(b, sz, dst, s0, bi_zero(), bi_byte(s1, 0), true); - else - bi_arshift_to(b, sz, dst, s0, bi_null(), bi_byte(s1, 0)); + emit_create_vector(ctx, reg, num_components, comps); + } break; - case nir_op_imin: - case nir_op_umin: - bi_csel_to(b, nir_op_infos[instr->op].input_types[0], sz, dst, - s0, s1, s0, s1, BI_CMPF_LT); - break; + case nir_intrinsic_load_input: { + const_offset = nir_src_as_const_value(nir_instr->src[0]); + assert (const_offset && "no indirect inputs"); - case nir_op_imax: - case nir_op_umax: - bi_csel_to(b, nir_op_infos[instr->op].input_types[0], sz, dst, - s0, s1, s0, s1, BI_CMPF_GT); - break; + offset = nir_intrinsic_base(nir_instr) + nir_src_as_uint(nir_instr->src[0]); - case nir_op_fddx_must_abs_mali: - case nir_op_fddy_must_abs_mali: { - bi_index bit = bi_imm_u32(instr->op == nir_op_fddx_must_abs_mali ? 
1 : 2); - bi_index adjacent = bi_clper_xor(b, s0, bit); - bi_fadd_to(b, sz, dst, adjacent, bi_neg(s0)); - break; - } + reg = nir_dest_index(ctx, &nir_instr->dest); - case nir_op_fddx: - case nir_op_fddy: - case nir_op_fddx_coarse: - case nir_op_fddy_coarse: - case nir_op_fddx_fine: - case nir_op_fddy_fine: { - unsigned axis; - switch (instr->op) { - case nir_op_fddx: - case nir_op_fddx_coarse: - case nir_op_fddx_fine: - axis = 1; + enum bifrost_ir_ops op; + switch (nir_dest_num_components(nir_instr->dest)) { + case 1: + op = op_ld_attr_v1; break; - case nir_op_fddy: - case nir_op_fddy_coarse: - case nir_op_fddy_fine: - axis = 2; + case 2: + op = op_ld_attr_v2; break; - default: - unreachable("Invalid derivative op"); - } - - bi_index lane1, lane2; - switch (instr->op) { - case nir_op_fddx: - case nir_op_fddx_fine: - case nir_op_fddy: - case nir_op_fddy_fine: - lane1 = bi_lshift_and_i32(b, - bi_fau(BIR_FAU_LANE_ID, false), - bi_imm_u32(0x3 & ~axis), - bi_imm_u8(0)); - - lane2 = bi_iadd_u32(b, lane1, - bi_imm_u32(axis), - false); + case 3: + op = op_ld_attr_v3; break; - case nir_op_fddx_coarse: - case nir_op_fddy_coarse: - lane1 = bi_imm_u32(0); - lane2 = bi_imm_u32(axis); + case 4: + op = op_ld_attr_v4; break; default: - unreachable("Invalid derivative op"); - } - - bi_index left, right; - - if (b->shader->quirks & BIFROST_LIMITED_CLPER) { - left = bi_clper_old_i32(b, s0, lane1); - right = bi_clper_old_i32(b, s0, lane2); - } else { - left = bi_clper_i32(b, s0, lane1, - BI_INACTIVE_RESULT_ZERO, BI_LANE_OP_NONE, - BI_SUBGROUP_SUBGROUP4); - - right = bi_clper_i32(b, s0, lane2, - BI_INACTIVE_RESULT_ZERO, BI_LANE_OP_NONE, - BI_SUBGROUP_SUBGROUP4); + assert(0); + break; } - bi_fadd_to(b, sz, dst, right, bi_neg(left)); - break; - } - - case nir_op_f2f32: - bi_f16_to_f32_to(b, dst, s0); - break; - - case nir_op_fquantize2f16: - { - bi_instr *f16 = bi_v2f32_to_v2f16_to(b, bi_temp(b->shader), s0, s0); - bi_instr *f32 = bi_f16_to_f32_to(b, dst, bi_half(f16->dest[0], false)); - - f16->ftz = f32->ftz = true; - break; - } - - case nir_op_f2i32: - if (src_sz == 32) - bi_f32_to_s32_to(b, dst, s0); - else - bi_f16_to_s32_to(b, dst, s0); - break; - - /* Note 32-bit sources => no vectorization, so 32-bit works */ - case nir_op_f2u16: - if (src_sz == 32) - bi_f32_to_u32_to(b, dst, s0); - else - bi_v2f16_to_v2u16_to(b, dst, s0); - break; - - case nir_op_f2i16: - if (src_sz == 32) - bi_f32_to_s32_to(b, dst, s0); - else - bi_v2f16_to_v2s16_to(b, dst, s0); - break; - - case nir_op_f2u32: - if (src_sz == 32) - bi_f32_to_u32_to(b, dst, s0); - else - bi_f16_to_u32_to(b, dst, s0); - break; - - case nir_op_u2f16: - if (src_sz == 32) - bi_v2u16_to_v2f16_to(b, dst, bi_half(s0, false)); - else if (src_sz == 16) - bi_v2u16_to_v2f16_to(b, dst, s0); - else if (src_sz == 8) - bi_v2u8_to_v2f16_to(b, dst, s0); - break; - - case nir_op_u2f32: - if (src_sz == 32) - bi_u32_to_f32_to(b, dst, s0); - else if (src_sz == 16) - bi_u16_to_f32_to(b, dst, s0); - else - bi_u8_to_f32_to(b, dst, s0); - break; - - case nir_op_i2f16: - if (src_sz == 32) - bi_v2s16_to_v2f16_to(b, dst, bi_half(s0, false)); - else if (src_sz == 16) - bi_v2s16_to_v2f16_to(b, dst, s0); - else if (src_sz == 8) - bi_v2s8_to_v2f16_to(b, dst, s0); - break; - - case nir_op_i2f32: - assert(src_sz == 32 || src_sz == 16 || src_sz == 8); - - if (src_sz == 32) - bi_s32_to_f32_to(b, dst, s0); - else if (src_sz == 16) - bi_s16_to_f32_to(b, dst, s0); - else if (src_sz == 8) - bi_s8_to_f32_to(b, dst, s0); - break; - - case nir_op_i2i32: - assert(src_sz == 32 || src_sz == 16 || 
src_sz == 8); - - if (src_sz == 32) - bi_mov_i32_to(b, dst, s0); - else if (src_sz == 16) - bi_s16_to_s32_to(b, dst, s0); - else if (src_sz == 8) - bi_s8_to_s32_to(b, dst, s0); - break; - - case nir_op_u2u32: - assert(src_sz == 32 || src_sz == 16 || src_sz == 8); - - if (src_sz == 32) - bi_mov_i32_to(b, dst, s0); - else if (src_sz == 16) - bi_u16_to_u32_to(b, dst, s0); - else if (src_sz == 8) - bi_u8_to_u32_to(b, dst, s0); - - break; - - case nir_op_i2i16: - assert(src_sz == 8 || src_sz == 32); - - if (src_sz == 8) - bi_v2s8_to_v2s16_to(b, dst, s0); - else - bi_mov_i32_to(b, dst, s0); - break; - - case nir_op_u2u16: - assert(src_sz == 8 || src_sz == 32); - - if (src_sz == 8) - bi_v2u8_to_v2u16_to(b, dst, s0); - else - bi_mov_i32_to(b, dst, s0); - break; - - case nir_op_b2i8: - case nir_op_b2i16: - case nir_op_b2i32: - bi_mux_to(b, sz, dst, bi_imm_u8(0), bi_imm_uintN(1, sz), s0, BI_MUX_INT_ZERO); - break; - - case nir_op_f2b16: - bi_mux_v2i16_to(b, dst, bi_imm_u16(0), bi_imm_u16(~0), s0, BI_MUX_FP_ZERO); - break; - case nir_op_f2b32: - bi_mux_i32_to(b, dst, bi_imm_u32(0), bi_imm_u32(~0), s0, BI_MUX_FP_ZERO); - break; - - case nir_op_i2b8: - bi_mux_v4i8_to(b, dst, bi_imm_u8(0), bi_imm_u8(~0), s0, BI_MUX_INT_ZERO); - break; - case nir_op_i2b16: - bi_mux_v2i16_to(b, dst, bi_imm_u16(0), bi_imm_u16(~0), s0, BI_MUX_INT_ZERO); - break; - case nir_op_i2b32: - bi_mux_i32_to(b, dst, bi_imm_u32(0), bi_imm_u32(~0), s0, BI_MUX_INT_ZERO); - break; - - case nir_op_ieq8: - case nir_op_ine8: - case nir_op_ilt8: - case nir_op_ige8: - case nir_op_ieq16: - case nir_op_ine16: - case nir_op_ilt16: - case nir_op_ige16: - case nir_op_ieq32: - case nir_op_ine32: - case nir_op_ilt32: - case nir_op_ige32: - bi_icmp_to(b, nir_type_int, sz, dst, s0, s1, bi_translate_cmpf(instr->op), BI_RESULT_TYPE_M1); - break; - - case nir_op_ult8: - case nir_op_uge8: - case nir_op_ult16: - case nir_op_uge16: - case nir_op_ult32: - case nir_op_uge32: - bi_icmp_to(b, nir_type_uint, sz, dst, s0, s1, bi_translate_cmpf(instr->op), BI_RESULT_TYPE_M1); - break; - - case nir_op_feq32: - case nir_op_feq16: - case nir_op_flt32: - case nir_op_flt16: - case nir_op_fge32: - case nir_op_fge16: - case nir_op_fneu32: - case nir_op_fneu16: - bi_fcmp_to(b, sz, dst, s0, s1, bi_translate_cmpf(instr->op), BI_RESULT_TYPE_M1); - break; - - case nir_op_fround_even: - case nir_op_fceil: - case nir_op_ffloor: - case nir_op_ftrunc: - bi_fround_to(b, sz, dst, s0, bi_nir_round(instr->op)); - break; - - case nir_op_fmin: - bi_fmin_to(b, sz, dst, s0, s1); - break; - - case nir_op_fmax: - bi_fmax_to(b, sz, dst, s0, s1); - break; - - case nir_op_iadd: - bi_iadd_to(b, nir_type_int, sz, dst, s0, s1, false); - break; - - case nir_op_iadd_sat: - bi_iadd_to(b, nir_type_int, sz, dst, s0, s1, true); - break; - - case nir_op_uadd_sat: - bi_iadd_to(b, nir_type_uint, sz, dst, s0, s1, true); - break; - - case nir_op_ihadd: - bi_hadd_to(b, nir_type_int, sz, dst, s0, s1, BI_ROUND_RTN); - break; - - case nir_op_irhadd: - bi_hadd_to(b, nir_type_int, sz, dst, s0, s1, BI_ROUND_RTP); - break; - - case nir_op_uhadd: - bi_hadd_to(b, nir_type_uint, sz, dst, s0, s1, BI_ROUND_RTN); - break; - - case nir_op_urhadd: - bi_hadd_to(b, nir_type_uint, sz, dst, s0, s1, BI_ROUND_RTP); - break; - - case nir_op_ineg: - bi_isub_to(b, nir_type_int, sz, dst, bi_zero(), s0, false); - break; - - case nir_op_isub: - bi_isub_to(b, nir_type_int, sz, dst, s0, s1, false); - break; - - case nir_op_isub_sat: - bi_isub_to(b, nir_type_int, sz, dst, s0, s1, true); - break; - - case nir_op_usub_sat: - bi_isub_to(b, 
nir_type_uint, sz, dst, s0, s1, true); - break; - - case nir_op_imul: - bi_imul_to(b, sz, dst, s0, s1); - break; - - case nir_op_iabs: - bi_iabs_to(b, sz, dst, s0); - break; - - case nir_op_iand: - bi_lshift_and_to(b, sz, dst, s0, s1, bi_imm_u8(0)); - break; - - case nir_op_ior: - bi_lshift_or_to(b, sz, dst, s0, s1, bi_imm_u8(0)); - break; - - case nir_op_ixor: - bi_lshift_xor_to(b, sz, dst, s0, s1, bi_imm_u8(0)); - break; - - case nir_op_inot: - bi_lshift_or_to(b, sz, dst, bi_zero(), bi_not(s0), bi_imm_u8(0)); - break; - - case nir_op_frsq: - if (sz == 32 && b->shader->quirks & BIFROST_NO_FP32_TRANSCENDENTALS) - bi_lower_frsq_32(b, dst, s0); - else - bi_frsq_to(b, sz, dst, s0); - break; - - case nir_op_frcp: - if (sz == 32 && b->shader->quirks & BIFROST_NO_FP32_TRANSCENDENTALS) - bi_lower_frcp_32(b, dst, s0); - else - bi_frcp_to(b, sz, dst, s0); - break; - - case nir_op_uclz: - bi_clz_to(b, sz, dst, s0, false); - break; - - case nir_op_bit_count: - assert(sz == 32 && src_sz == 32 && "should've been lowered"); - bi_popcount_i32_to(b, dst, s0); - break; - - case nir_op_bitfield_reverse: - assert(sz == 32 && src_sz == 32 && "should've been lowered"); - bi_bitrev_i32_to(b, dst, s0); - break; - - case nir_op_ufind_msb: { - bi_index clz = bi_clz(b, src_sz, s0, false); - - if (sz == 8) - clz = bi_byte(clz, 0); - else if (sz == 16) - clz = bi_half(clz, false); - - bi_isub_u32_to(b, dst, bi_imm_u32(src_sz - 1), clz, false); - break; - } - - default: - fprintf(stderr, "Unhandled ALU op %s\n", nir_op_infos[instr->op].name); - unreachable("Unknown ALU op"); - } -} - -/* Returns dimension with 0 special casing cubemaps. Shamelessly copied from Midgard */ -static unsigned -bifrost_tex_format(enum glsl_sampler_dim dim) -{ - switch (dim) { - case GLSL_SAMPLER_DIM_1D: - case GLSL_SAMPLER_DIM_BUF: - return 1; - - case GLSL_SAMPLER_DIM_2D: - case GLSL_SAMPLER_DIM_MS: - case GLSL_SAMPLER_DIM_EXTERNAL: - case GLSL_SAMPLER_DIM_RECT: - return 2; - - case GLSL_SAMPLER_DIM_3D: - return 3; - - case GLSL_SAMPLER_DIM_CUBE: - return 0; - - default: - DBG("Unknown sampler dim type\n"); - assert(0); - return 0; - } -} - -static enum bi_dimension -valhall_tex_dimension(enum glsl_sampler_dim dim) -{ - switch (dim) { - case GLSL_SAMPLER_DIM_1D: - case GLSL_SAMPLER_DIM_BUF: - return BI_DIMENSION_1D; - - case GLSL_SAMPLER_DIM_2D: - case GLSL_SAMPLER_DIM_MS: - case GLSL_SAMPLER_DIM_EXTERNAL: - case GLSL_SAMPLER_DIM_RECT: - return BI_DIMENSION_2D; - - case GLSL_SAMPLER_DIM_3D: - return BI_DIMENSION_3D; - - case GLSL_SAMPLER_DIM_CUBE: - return BI_DIMENSION_CUBE; - - default: - unreachable("Unknown sampler dim type"); - } -} - -static enum bifrost_texture_format_full -bi_texture_format(nir_alu_type T, enum bi_clamp clamp) -{ - switch (T) { - case nir_type_float16: return BIFROST_TEXTURE_FORMAT_F16 + clamp; - case nir_type_float32: return BIFROST_TEXTURE_FORMAT_F32 + clamp; - case nir_type_uint16: return BIFROST_TEXTURE_FORMAT_U16; - case nir_type_int16: return BIFROST_TEXTURE_FORMAT_S16; - case nir_type_uint32: return BIFROST_TEXTURE_FORMAT_U32; - case nir_type_int32: return BIFROST_TEXTURE_FORMAT_S32; - default: unreachable("Invalid type for texturing"); - } -} - -/* Array indices are specified as 32-bit uints, need to convert. 
In .z component from NIR */ -static bi_index -bi_emit_texc_array_index(bi_builder *b, bi_index idx, nir_alu_type T) -{ - /* For (u)int we can just passthrough */ - nir_alu_type base = nir_alu_type_get_base_type(T); - if (base == nir_type_int || base == nir_type_uint) - return idx; - - /* Otherwise we convert */ - assert(T == nir_type_float32); - - /* OpenGL ES 3.2 specification section 8.14.2 ("Coordinate Wrapping and - * Texel Selection") defines the layer to be taken from clamp(RNE(r), - * 0, dt - 1). So we use round RTE, clamping is handled at the data - * structure level */ - - bi_instr *I = bi_f32_to_u32_to(b, bi_temp(b->shader), idx); - I->round = BI_ROUND_NONE; - return I->dest[0]; -} - -/* TEXC's explicit and bias LOD modes requires the LOD to be transformed to a - * 16-bit 8:8 fixed-point format. We lower as: - * - * F32_TO_S32(clamp(x, -16.0, +16.0) * 256.0) & 0xFFFF = - * MKVEC(F32_TO_S32(clamp(x * 1.0/16.0, -1.0, 1.0) * (16.0 * 256.0)), #0) - */ - -static bi_index -bi_emit_texc_lod_88(bi_builder *b, bi_index lod, bool fp16) -{ - /* Precompute for constant LODs to avoid general constant folding */ - if (lod.type == BI_INDEX_CONSTANT) { - uint32_t raw = lod.value; - float x = fp16 ? _mesa_half_to_float(raw) : uif(raw); - int32_t s32 = CLAMP(x, -16.0f, 16.0f) * 256.0f; - return bi_imm_u32(s32 & 0xFFFF); - } - - /* Sort of arbitrary. Must be less than 128.0, greater than or equal to - * the max LOD (16 since we cap at 2^16 texture dimensions), and - * preferably small to minimize precision loss */ - const float max_lod = 16.0; - - bi_instr *fsat = bi_fma_f32_to(b, bi_temp(b->shader), - fp16 ? bi_half(lod, false) : lod, - bi_imm_f32(1.0f / max_lod), bi_negzero()); - - fsat->clamp = BI_CLAMP_CLAMP_M1_1; - - bi_index fmul = bi_fma_f32(b, fsat->dest[0], bi_imm_f32(max_lod * 256.0f), - bi_negzero()); - - return bi_mkvec_v2i16(b, - bi_half(bi_f32_to_s32(b, fmul), false), bi_imm_u16(0)); -} - -/* FETCH takes a 32-bit staging register containing the LOD as an integer in - * the bottom 16-bits and (if present) the cube face index in the top 16-bits. - * TODO: Cube face. - */ - -static bi_index -bi_emit_texc_lod_cube(bi_builder *b, bi_index lod) -{ - return bi_lshift_or_i32(b, lod, bi_zero(), bi_imm_u8(8)); -} - -/* The hardware specifies texel offsets and multisample indices together as a - * u8vec4 <offset, ms index>. By default all are zero, so if have either a - * nonzero texel offset or a nonzero multisample index, we build a u8vec4 with - * the bits we need and return that to be passed as a staging register. Else we - * return 0 to avoid allocating a data register when everything is zero. */ - -static bi_index -bi_emit_texc_offset_ms_index(bi_builder *b, nir_tex_instr *instr) -{ - bi_index dest = bi_zero(); - - int offs_idx = nir_tex_instr_src_index(instr, nir_tex_src_offset); - if (offs_idx >= 0 && - (!nir_src_is_const(instr->src[offs_idx].src) || - nir_src_as_uint(instr->src[offs_idx].src) != 0)) { - unsigned nr = nir_src_num_components(instr->src[offs_idx].src); - bi_index idx = bi_src_index(&instr->src[offs_idx].src); - dest = bi_mkvec_v4i8(b, - (nr > 0) ? bi_byte(bi_extract(b, idx, 0), 0) : bi_imm_u8(0), - (nr > 1) ? bi_byte(bi_extract(b, idx, 1), 0) : bi_imm_u8(0), - (nr > 2) ? 
bi_byte(bi_extract(b, idx, 2), 0) : bi_imm_u8(0), - bi_imm_u8(0)); - } - - int ms_idx = nir_tex_instr_src_index(instr, nir_tex_src_ms_index); - if (ms_idx >= 0 && - (!nir_src_is_const(instr->src[ms_idx].src) || - nir_src_as_uint(instr->src[ms_idx].src) != 0)) { - dest = bi_lshift_or_i32(b, - bi_src_index(&instr->src[ms_idx].src), dest, - bi_imm_u8(24)); - } - - return dest; -} - -/* - * Valhall specifies specifies texel offsets, multisample indices, and (for - * fetches) LOD together as a u8vec4 <offset.xyz, LOD>, where the third - * component is either offset.z or multisample index depending on context. Build - * this register. - */ -static bi_index -bi_emit_valhall_offsets(bi_builder *b, nir_tex_instr *instr) -{ - bi_index dest = bi_zero(); - - int offs_idx = nir_tex_instr_src_index(instr, nir_tex_src_offset); - int ms_idx = nir_tex_instr_src_index(instr, nir_tex_src_ms_index); - int lod_idx = nir_tex_instr_src_index(instr, nir_tex_src_lod); - - /* Components 0-2: offsets */ - if (offs_idx >= 0 && - (!nir_src_is_const(instr->src[offs_idx].src) || - nir_src_as_uint(instr->src[offs_idx].src) != 0)) { - unsigned nr = nir_src_num_components(instr->src[offs_idx].src); - bi_index idx = bi_src_index(&instr->src[offs_idx].src); - - /* No multisample index with 3D */ - assert((nr <= 2) || (ms_idx < 0)); - - /* Zero extend the Z byte so we can use it with MKVEC.v2i8 */ - bi_index z = (nr > 2) ? - bi_mkvec_v2i8(b, bi_byte(bi_extract(b, idx, 2), 0), - bi_imm_u8(0), bi_zero()) : - bi_zero(); - - dest = bi_mkvec_v2i8(b, - (nr > 0) ? bi_byte(bi_extract(b, idx, 0), 0) : bi_imm_u8(0), - (nr > 1) ? bi_byte(bi_extract(b, idx, 1), 0) : bi_imm_u8(0), - z); - } - - /* Component 2: multisample index */ - if (ms_idx >= 0 && - (!nir_src_is_const(instr->src[ms_idx].src) || - nir_src_as_uint(instr->src[ms_idx].src) != 0)) { - dest = bi_mkvec_v2i16(b, dest, - bi_src_index(&instr->src[ms_idx].src)); - } - - /* Component 3: 8-bit LOD */ - if (lod_idx >= 0 && - (!nir_src_is_const(instr->src[lod_idx].src) || - nir_src_as_uint(instr->src[lod_idx].src) != 0) && - nir_tex_instr_src_type(instr, lod_idx) != nir_type_float) { - dest = bi_lshift_or_i32(b, - bi_src_index(&instr->src[lod_idx].src), dest, - bi_imm_u8(24)); - } - - return dest; -} - -static void -bi_emit_cube_coord(bi_builder *b, bi_index coord, - bi_index *face, bi_index *s, bi_index *t) -{ - /* Compute max { |x|, |y|, |z| } */ - bi_index maxxyz = bi_temp(b->shader); - *face = bi_temp(b->shader); - - bi_index cx = bi_extract(b, coord, 0), - cy = bi_extract(b, coord, 1), - cz = bi_extract(b, coord, 2); - - /* Use a pseudo op on Bifrost due to tuple restrictions */ - if (b->shader->arch <= 8) { - bi_cubeface_to(b, maxxyz, *face, cx, cy, cz); - } else { - bi_cubeface1_to(b, maxxyz, cx, cy, cz); - bi_cubeface2_v9_to(b, *face, cx, cy, cz); - } - - /* Select coordinates */ - bi_index ssel = bi_cube_ssel(b, bi_extract(b, coord, 2), bi_extract(b, coord, 0), *face); - bi_index tsel = bi_cube_tsel(b, bi_extract(b, coord, 1), bi_extract(b, coord, 2), - *face); - - /* The OpenGL ES specification requires us to transform an input vector - * (x, y, z) to the coordinate, given the selected S/T: - * - * (1/2 ((s / max{x,y,z}) + 1), 1/2 ((t / max{x, y, z}) + 1)) - * - * We implement (s shown, t similar) in a form friendlier to FMA - * instructions, and clamp coordinates at the end for correct - * NaN/infinity handling: - * - * fsat(s * (0.5 * (1 / max{x, y, z})) + 0.5) - * - * Take the reciprocal of max{x, y, z} - */ - bi_index rcp = bi_frcp_f32(b, maxxyz); - - /* Calculate 0.5 
* (1.0 / max{x, y, z}) */ - bi_index fma1 = bi_fma_f32(b, rcp, bi_imm_f32(0.5f), bi_negzero()); - - /* Transform the coordinates */ - *s = bi_temp(b->shader); - *t = bi_temp(b->shader); - - bi_instr *S = bi_fma_f32_to(b, *s, fma1, ssel, bi_imm_f32(0.5f)); - bi_instr *T = bi_fma_f32_to(b, *t, fma1, tsel, bi_imm_f32(0.5f)); - - S->clamp = BI_CLAMP_CLAMP_0_1; - T->clamp = BI_CLAMP_CLAMP_0_1; -} - -/* Emits a cube map descriptor, returning lower 32-bits and putting upper - * 32-bits in passed pointer t. The packing of the face with the S coordinate - * exploits the redundancy of floating points with the range restriction of - * CUBEFACE output. - * - * struct cube_map_descriptor { - * float s : 29; - * unsigned face : 3; - * float t : 32; - * } - * - * Since the cube face index is preshifted, this is easy to pack with a bitwise - * MUX.i32 and a fixed mask, selecting the lower bits 29 from s and the upper 3 - * bits from face. - */ - -static bi_index -bi_emit_texc_cube_coord(bi_builder *b, bi_index coord, bi_index *t) -{ - bi_index face, s; - bi_emit_cube_coord(b, coord, &face, &s, t); - bi_index mask = bi_imm_u32(BITFIELD_MASK(29)); - return bi_mux_i32(b, s, face, mask, BI_MUX_BIT); -} - -/* Map to the main texture op used. Some of these (txd in particular) will - * lower to multiple texture ops with different opcodes (GRDESC_DER + TEX in - * sequence). We assume that lowering is handled elsewhere. - */ - -static enum bifrost_tex_op -bi_tex_op(nir_texop op) -{ - switch (op) { - case nir_texop_tex: - case nir_texop_txb: - case nir_texop_txl: - case nir_texop_txd: - case nir_texop_tex_prefetch: - return BIFROST_TEX_OP_TEX; - case nir_texop_txf: - case nir_texop_txf_ms: - case nir_texop_txf_ms_fb: - case nir_texop_tg4: - return BIFROST_TEX_OP_FETCH; - case nir_texop_txs: - case nir_texop_lod: - case nir_texop_query_levels: - case nir_texop_texture_samples: - case nir_texop_samples_identical: - unreachable("should've been lowered"); - default: - unreachable("unsupported tex op"); - } -} - -/* Data registers required by texturing in the order they appear. All are - * optional, the texture operation descriptor determines which are present. 
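As a reference for how these optional data registers are consumed, the emit path later compacts the populated entries into consecutive staging registers; a minimal sketch of that packing, with a plain valid[] flag standing in for bi_is_null() (names illustrative only):

#include <stdbool.h>
#include <stdint.h>

/* Keeps only the registers marked valid, preserving the fixed order above,
 * and returns how many consecutive staging registers the message reads. */
static unsigned
compact_tex_dregs(uint32_t regs[], const bool valid[], unsigned count)
{
        unsigned packed = 0;

        for (unsigned i = 0; i < count; ++i) {
                if (valid[i])
                        regs[packed++] = regs[i];
        }

        return packed;
}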
- * Note since 3D arrays are not permitted at an API level, Z_COORD and - * ARRAY/SHADOW are exlusive, so TEXC in practice reads at most 8 registers */ - -enum bifrost_tex_dreg { - BIFROST_TEX_DREG_Z_COORD = 0, - BIFROST_TEX_DREG_Y_DELTAS = 1, - BIFROST_TEX_DREG_LOD = 2, - BIFROST_TEX_DREG_GRDESC_HI = 3, - BIFROST_TEX_DREG_SHADOW = 4, - BIFROST_TEX_DREG_ARRAY = 5, - BIFROST_TEX_DREG_OFFSETMS = 6, - BIFROST_TEX_DREG_SAMPLER = 7, - BIFROST_TEX_DREG_TEXTURE = 8, - BIFROST_TEX_DREG_COUNT, -}; - -static void -bi_emit_texc(bi_builder *b, nir_tex_instr *instr) -{ - struct bifrost_texture_operation desc = { - .op = bi_tex_op(instr->op), - .offset_or_bias_disable = false, /* TODO */ - .shadow_or_clamp_disable = instr->is_shadow, - .array = instr->is_array, - .dimension = bifrost_tex_format(instr->sampler_dim), - .format = bi_texture_format(instr->dest_type | nir_dest_bit_size(instr->dest), BI_CLAMP_NONE), /* TODO */ - .mask = 0xF, - }; + struct bifrost_instruction instr = { + .op = op, + .dest_components = nir_dest_num_components(nir_instr->dest), + .ssa_args = { + .dest = reg, + .src0 = offset, + .src1 = SSA_INVALID_VALUE, + .src2 = SSA_INVALID_VALUE, + .src3 = SSA_INVALID_VALUE, + } + }; - switch (desc.op) { - case BIFROST_TEX_OP_TEX: - desc.lod_or_fetch = BIFROST_LOD_MODE_COMPUTE; - break; - case BIFROST_TEX_OP_FETCH: - desc.lod_or_fetch = (enum bifrost_lod_mode) - (instr->op == nir_texop_tg4 ? - BIFROST_TEXTURE_FETCH_GATHER4_R + instr->component : - BIFROST_TEXTURE_FETCH_TEXEL); + emit_mir_instruction(ctx, instr); break; - default: - unreachable("texture op unsupported"); } + case nir_intrinsic_store_output: { + const_offset = nir_src_as_const_value(nir_instr->src[1]); + assert(const_offset && "no indirect outputs"); - /* 32-bit indices to be allocated as consecutive staging registers */ - bi_index dregs[BIFROST_TEX_DREG_COUNT] = { }; - bi_index cx = bi_null(), cy = bi_null(); - - for (unsigned i = 0; i < instr->num_srcs; ++i) { - bi_index index = bi_src_index(&instr->src[i].src); - unsigned sz = nir_src_bit_size(instr->src[i].src); - unsigned components = nir_src_num_components(instr->src[i].src); - ASSERTED nir_alu_type base = nir_tex_instr_src_type(instr, i); - nir_alu_type T = base | sz; - - switch (instr->src[i].src_type) { - case nir_tex_src_coord: - if (instr->sampler_dim == GLSL_SAMPLER_DIM_CUBE) { - cx = bi_emit_texc_cube_coord(b, index, &cy); - } else { - /* Copy XY (for 2D+) or XX (for 1D) */ - cx = bi_extract(b, index, 0); - cy = bi_extract(b, index, MIN2(1, components - 1)); - - assert(components >= 1 && components <= 3); - - if (components == 3 && !desc.array) { - /* 3D */ - dregs[BIFROST_TEX_DREG_Z_COORD] = - bi_extract(b, index, 2); - } - } + offset = nir_intrinsic_base(nir_instr); + if (ctx->stage == MESA_SHADER_FRAGMENT) { + int comp = nir_intrinsic_component(nir_instr); + offset += comp; + // XXX: Once we support more than colour output then this will need to change + void *entry = _mesa_hash_table_u64_search(ctx->outputs_nir_to_bi, offset + FRAG_RESULT_DATA0 + 1); - if (desc.array) { - dregs[BIFROST_TEX_DREG_ARRAY] = - bi_emit_texc_array_index(b, - bi_extract(b, index, components - 1), T); + if (!entry) { + printf("WARNING: skipping fragment output\n"); + break; } - break; + offset = (uintptr_t) (entry) - 1; + reg = nir_src_index(ctx, &nir_instr->src[0]); - case nir_tex_src_lod: - if (desc.op == BIFROST_TEX_OP_TEX && - nir_src_is_const(instr->src[i].src) && - nir_src_as_uint(instr->src[i].src) == 0) { - desc.lod_or_fetch = BIFROST_LOD_MODE_ZERO; - } else if (desc.op == 
BIFROST_TEX_OP_TEX) { - assert(base == nir_type_float); - - assert(sz == 16 || sz == 32); - dregs[BIFROST_TEX_DREG_LOD] = - bi_emit_texc_lod_88(b, index, sz == 16); - desc.lod_or_fetch = BIFROST_LOD_MODE_EXPLICIT; - } else { - assert(desc.op == BIFROST_TEX_OP_FETCH); - assert(base == nir_type_uint || base == nir_type_int); - assert(sz == 16 || sz == 32); - - dregs[BIFROST_TEX_DREG_LOD] = - bi_emit_texc_lod_cube(b, index); + enum bifrost_ir_ops op; + switch (nir_src_num_components(nir_instr->src[0])) { + case 1: + op = op_store_v1; + break; + case 2: + op = op_store_v2; + break; + case 3: + op = op_store_v3; + break; + case 4: + op = op_store_v4; + break; + default: + assert(0); + break; } - break; - - case nir_tex_src_bias: - /* Upper 16-bits interpreted as a clamp, leave zero */ - assert(desc.op == BIFROST_TEX_OP_TEX); - assert(base == nir_type_float); - assert(sz == 16 || sz == 32); - dregs[BIFROST_TEX_DREG_LOD] = - bi_emit_texc_lod_88(b, index, sz == 16); - desc.lod_or_fetch = BIFROST_LOD_MODE_BIAS; - break; - - case nir_tex_src_ms_index: - case nir_tex_src_offset: - if (desc.offset_or_bias_disable) + // XXX: All offsets aren't vec4 aligned. Will need to adjust this in the future + // XXX: This needs to offset correctly in to memory so the blend step can pick it up + uint32_t movi = emit_movi(ctx, offset * 16); + uint32_t movi2 = emit_movi(ctx, 0); + + uint32_t comps[2] = { + movi, movi2, + }; + uint32_t offset_val = alloc_mir_temp(ctx); + emit_create_vector(ctx, offset_val, 2, comps); + + struct bifrost_instruction instr = { + .op = op, + .dest_components = 0, + .ssa_args = { + .dest = SSA_INVALID_VALUE, + .src0 = offset_val, + .src1 = reg, + .src2 = SSA_INVALID_VALUE, + .src3 = SSA_INVALID_VALUE, + } + }; + emit_mir_instruction(ctx, instr); + } else if (ctx->stage == MESA_SHADER_VERTEX) { + int comp = nir_intrinsic_component(nir_instr); + offset += comp; + void *entry = _mesa_hash_table_u64_search(ctx->varying_nir_to_bi, offset + 2); + + if (!entry) { + printf("WARNING: skipping varying\n"); break; + } - dregs[BIFROST_TEX_DREG_OFFSETMS] = - bi_emit_texc_offset_ms_index(b, instr); - if (!bi_is_equiv(dregs[BIFROST_TEX_DREG_OFFSETMS], bi_zero())) - desc.offset_or_bias_disable = true; - break; - - case nir_tex_src_comparator: - dregs[BIFROST_TEX_DREG_SHADOW] = index; - break; - - case nir_tex_src_texture_offset: - if (instr->texture_index) - index = bi_iadd_u32(b, index, bi_imm_u32(instr->texture_index), false); - - dregs[BIFROST_TEX_DREG_TEXTURE] = index; - - break; - - case nir_tex_src_sampler_offset: - if (instr->sampler_index) - index = bi_iadd_u32(b, index, bi_imm_u32(instr->sampler_index), false); - - dregs[BIFROST_TEX_DREG_SAMPLER] = index; - break; - - default: - unreachable("Unhandled src type in texc emit"); - } - } - - if (desc.op == BIFROST_TEX_OP_FETCH && bi_is_null(dregs[BIFROST_TEX_DREG_LOD])) { - dregs[BIFROST_TEX_DREG_LOD] = - bi_emit_texc_lod_cube(b, bi_zero()); - } - - /* Choose an index mode */ + offset = (uintptr_t) (entry) - 1; - bool direct_tex = bi_is_null(dregs[BIFROST_TEX_DREG_TEXTURE]); - bool direct_samp = bi_is_null(dregs[BIFROST_TEX_DREG_SAMPLER]); - bool direct = direct_tex && direct_samp; + reg = nir_src_index(ctx, &nir_instr->src[0]); + // LD_VAR_ADDR.f32 {R0, T1}, R61, R62, location:1, R12 + // ... 
+ // ST_VAR.v4 T1, R12, R13, R14, R4 - desc.immediate_indices = direct && (instr->sampler_index < 16); + offset = emit_ld_vary_addr_constant(ctx, offset); + enum bifrost_ir_ops op; + switch (nir_src_num_components(nir_instr->src[0])) { + case 1: + op = op_st_vary_v1; + break; + case 2: + op = op_st_vary_v2; + break; + case 3: + op = op_st_vary_v3; + break; + case 4: + op = op_st_vary_v4; + break; + default: + assert(0); + break; + } - if (desc.immediate_indices) { - desc.sampler_index_or_mode = instr->sampler_index; - desc.index = instr->texture_index; - } else { - unsigned mode = 0; - - if (direct && instr->sampler_index == instr->texture_index) { - mode = BIFROST_INDEX_IMMEDIATE_SHARED; - desc.index = instr->texture_index; - } else if (direct) { - mode = BIFROST_INDEX_IMMEDIATE_SAMPLER; - desc.index = instr->sampler_index; - dregs[BIFROST_TEX_DREG_TEXTURE] = bi_mov_i32(b, - bi_imm_u32(instr->texture_index)); - } else if (direct_tex) { - assert(!direct_samp); - mode = BIFROST_INDEX_IMMEDIATE_TEXTURE; - desc.index = instr->texture_index; - } else if (direct_samp) { - assert(!direct_tex); - mode = BIFROST_INDEX_IMMEDIATE_SAMPLER; - desc.index = instr->sampler_index; + struct bifrost_instruction instr = { + .op = op, + .dest_components = 0, + .ssa_args = { + .dest = SSA_INVALID_VALUE, + .src0 = offset, + .src1 = reg, + .src2 = SSA_INVALID_VALUE, + .src3 = SSA_INVALID_VALUE, + } + }; + emit_mir_instruction(ctx, instr); } else { - mode = BIFROST_INDEX_REGISTER; + assert(0 && "Unknown store_output stage"); } - - mode |= (BIFROST_TEXTURE_OPERATION_SINGLE << 2); - desc.sampler_index_or_mode = mode; + break; } - - /* Allocate staging registers contiguously by compacting the array. */ - unsigned sr_count = 0; - - for (unsigned i = 0; i < ARRAY_SIZE(dregs); ++i) { - if (!bi_is_null(dregs[i])) - dregs[sr_count++] = dregs[i]; + default: + printf ("Unhandled intrinsic %s\n", nir_intrinsic_infos[nir_instr->intrinsic].name); + break; } - - unsigned res_size = nir_dest_bit_size(instr->dest) == 16 ? 2 : 4; - - bi_index sr = sr_count ? 
bi_temp(b->shader) : bi_null(); - bi_index dst = bi_temp(b->shader); - - if (sr_count) - bi_emit_collect_to(b, sr, dregs, sr_count); - - uint32_t desc_u = 0; - memcpy(&desc_u, &desc, sizeof(desc_u)); - bi_instr *I = bi_texc_to(b, dst, sr, cx, cy, bi_imm_u32(desc_u), - !nir_tex_instr_has_implicit_derivative(instr), - sr_count, 0); - I->register_format = bi_reg_fmt_for_nir(instr->dest_type); - - bi_index w[4] = { bi_null(), bi_null(), bi_null(), bi_null() }; - bi_emit_split_i32(b, w, dst, res_size); - bi_emit_collect_to(b, bi_dest_index(&instr->dest), w, - DIV_ROUND_UP(nir_dest_num_components(instr->dest) * res_size, 4)); } -/* Staging registers required by texturing in the order they appear (Valhall) */ - -enum valhall_tex_sreg { - VALHALL_TEX_SREG_X_COORD = 0, - VALHALL_TEX_SREG_Y_COORD = 1, - VALHALL_TEX_SREG_Z_COORD = 2, - VALHALL_TEX_SREG_Y_DELTAS = 3, - VALHALL_TEX_SREG_ARRAY = 4, - VALHALL_TEX_SREG_SHADOW = 5, - VALHALL_TEX_SREG_OFFSETMS = 6, - VALHALL_TEX_SREG_LOD = 7, - VALHALL_TEX_SREG_GRDESC = 8, - VALHALL_TEX_SREG_COUNT, -}; +#define ALU_CASE(arguments, nir, name) \ + case nir_op_##nir: \ + argument_count = arguments; \ + op = op_##name; \ + break +#define ALU_CASE_MOD(arguments, nir, name, modifiers) \ + case nir_op_##nir: \ + argument_count = arguments; \ + op = op_##name; \ + src_modifiers = modifiers; \ + break static void -bi_emit_tex_valhall(bi_builder *b, nir_tex_instr *instr) -{ - bool explicit_offset = false; - enum bi_va_lod_mode lod_mode = BI_VA_LOD_MODE_COMPUTED_LOD; - - bool has_lod_mode = - (instr->op == nir_texop_tex) || - (instr->op == nir_texop_txl) || - (instr->op == nir_texop_txb); - - /* 32-bit indices to be allocated as consecutive staging registers */ - bi_index sregs[VALHALL_TEX_SREG_COUNT] = { }; - - - bool has_sampler = nir_tex_instr_need_sampler(instr); - bi_index sampler = bi_imm_u32(has_sampler ? 
instr->sampler_index : 0); - bi_index texture = bi_imm_u32(instr->texture_index); - uint32_t tables = (PAN_TABLE_SAMPLER << 11) | (PAN_TABLE_TEXTURE << 27); - - for (unsigned i = 0; i < instr->num_srcs; ++i) { - bi_index index = bi_src_index(&instr->src[i].src); - unsigned sz = nir_src_bit_size(instr->src[i].src); - unsigned components = nir_src_num_components(instr->src[i].src); - - switch (instr->src[i].src_type) { - case nir_tex_src_coord: - if (instr->sampler_dim == GLSL_SAMPLER_DIM_CUBE) { - sregs[VALHALL_TEX_SREG_X_COORD] = - bi_emit_texc_cube_coord(b, index, - &sregs[VALHALL_TEX_SREG_Y_COORD]); - } else { - assert(components >= 1 && components <= 3); - - /* Copy XY (for 2D+) or XX (for 1D) */ - sregs[VALHALL_TEX_SREG_X_COORD] = index; - - if (components >= 2) - sregs[VALHALL_TEX_SREG_Y_COORD] = bi_extract(b, index, 1); - - if (components == 3 && !instr->is_array) { - sregs[VALHALL_TEX_SREG_Z_COORD] = - bi_extract(b, index, 2); - } - } - - if (instr->is_array) { - sregs[VALHALL_TEX_SREG_ARRAY] = - bi_extract(b, index, components - 1); - } - - break; - - case nir_tex_src_lod: - if (nir_src_is_const(instr->src[i].src) && - nir_src_as_uint(instr->src[i].src) == 0) { - lod_mode = BI_VA_LOD_MODE_ZERO_LOD; - } else if (has_lod_mode) { - lod_mode = BI_VA_LOD_MODE_EXPLICIT; - - assert(sz == 16 || sz == 32); - sregs[VALHALL_TEX_SREG_LOD] = - bi_emit_texc_lod_88(b, index, sz == 16); - } - break; - - case nir_tex_src_bias: - /* Upper 16-bits interpreted as a clamp, leave zero */ - assert(sz == 16 || sz == 32); - sregs[VALHALL_TEX_SREG_LOD] = - bi_emit_texc_lod_88(b, index, sz == 16); - - lod_mode = BI_VA_LOD_MODE_COMPUTED_BIAS; - break; - case nir_tex_src_ms_index: - case nir_tex_src_offset: - /* Handled below */ - break; - - case nir_tex_src_comparator: - sregs[VALHALL_TEX_SREG_SHADOW] = index; - break; - - case nir_tex_src_texture_offset: - assert(instr->texture_index == 0); - texture = index; - break; - - case nir_tex_src_sampler_offset: - assert(instr->sampler_index == 0); - sampler = index; - break; - - default: - unreachable("Unhandled src type in tex emit"); - } - } - - /* Generate packed offset + ms index + LOD register. These default to - * zero so we only need to encode if these features are actually in use. 
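A host-side sketch of the two packed operands used here, assuming the 8:8 LOD encoding described earlier (clamp to +/-16, scale by 256, keep 16 bits) and assuming component 0 of the <offset.xyz, LOD> vector lands in the low byte; helper names are illustrative, not part of the driver:

#include <stdint.h>
#include <math.h>

/* 8:8 fixed-point LOD: clamp(x, -16, +16) * 256, low 16 bits kept. */
static inline uint32_t
ref_lod_8_8(float lod)
{
        float clamped = fminf(fmaxf(lod, -16.0f), 16.0f);
        return (uint32_t)(int32_t)(clamped * 256.0f) & 0xFFFF;
}

/* u8vec4 <offset.x, offset.y, offset.z or ms index, LOD>, zero when unused. */
static inline uint32_t
ref_offsets_ms_lod(int8_t ox, int8_t oy, uint8_t z_or_ms, uint8_t lod)
{
        return ((uint32_t)(uint8_t)ox <<  0) |
               ((uint32_t)(uint8_t)oy <<  8) |
               ((uint32_t)z_or_ms     << 16) |
               ((uint32_t)lod         << 24);
}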
- */ - bi_index offsets = bi_emit_valhall_offsets(b, instr); - - if (!bi_is_equiv(offsets, bi_zero())) { - sregs[VALHALL_TEX_SREG_OFFSETMS] = offsets; - explicit_offset = true; - } +emit_alu(struct compiler_context *ctx, nir_alu_instr *nir_instr) +{ + unsigned dest = nir_dest_index(ctx, &nir_instr->dest.dest); + unsigned op = ~0U, argument_count; + unsigned src_modifiers = 0; + + switch (nir_instr->op) { + ALU_CASE(2, fmul, fmul_f32); + ALU_CASE(2, fadd, fadd_f32); + ALU_CASE_MOD(2, fsub, fadd_f32, SOURCE_MODIFIER(1, SRC_MOD_NEG)); + ALU_CASE(1, ftrunc, trunc); + ALU_CASE(1, fceil, ceil); + ALU_CASE(1, ffloor, floor); + ALU_CASE(1, fround_even, roundeven); + ALU_CASE(1, frcp, frcp_fast_f32); + ALU_CASE(2, fmax, max_f32); + ALU_CASE(2, fmin, min_f32); + ALU_CASE(2, iadd, add_i32); + ALU_CASE(2, isub, sub_i32); + ALU_CASE(2, imul, mul_i32); + ALU_CASE(2, iand, and_i32); + ALU_CASE(2, ior, or_i32); + ALU_CASE(2, ixor, xor_i32); + ALU_CASE(2, ishl, lshift_i32); + ALU_CASE(2, ushr, rshift_i32); + ALU_CASE(2, ishr, arshift_i32); + case nir_op_ineg: { + unsigned src0 = nir_alu_src_index_scalar(ctx, nir_instr, 0); + printf("ineg 0x%08x\n", src0); + struct bifrost_instruction instr = { + .op = op_sub_i32, + .dest_components = 1, + .ssa_args = { + .dest = dest, + .src0 = SSA_FIXED_CONST_0, + .src1 = src0, + .src2 = SSA_INVALID_VALUE, + .src3 = SSA_INVALID_VALUE, + }, + }; - /* Allocate staging registers contiguously by compacting the array. */ - unsigned sr_count = 0; + emit_mir_instruction(ctx, instr); + return; - for (unsigned i = 0; i < ARRAY_SIZE(sregs); ++i) { - if (!bi_is_null(sregs[i])) - sregs[sr_count++] = sregs[i]; } - - bi_index idx = sr_count ? bi_temp(b->shader) : bi_null(); - - if (sr_count) - bi_make_vec_to(b, idx, sregs, NULL, sr_count, 32); - - bi_index image_src = bi_imm_u32(tables); - image_src = bi_lshift_or_i32(b, sampler, image_src, bi_imm_u8(0)); - image_src = bi_lshift_or_i32(b, texture, image_src, bi_imm_u8(16)); - - unsigned mask = BI_WRITE_MASK_RGBA; - unsigned res_size = nir_dest_bit_size(instr->dest) == 16 ? 
2 : 4; - enum bi_register_format regfmt = bi_reg_fmt_for_nir(instr->dest_type); - enum bi_dimension dim = valhall_tex_dimension(instr->sampler_dim); - bi_index dest = bi_temp(b->shader); - - switch (instr->op) { - case nir_texop_tex: - case nir_texop_txl: - case nir_texop_txb: - bi_tex_single_to(b, dest, idx, image_src, bi_zero(), - instr->is_array, dim, regfmt, instr->is_shadow, - explicit_offset, lod_mode, mask, sr_count); - break; - case nir_texop_txf: - case nir_texop_txf_ms: - bi_tex_fetch_to(b, dest, idx, image_src, bi_zero(), - instr->is_array, dim, regfmt, explicit_offset, - mask, sr_count); + case nir_op_vec2: { + uint32_t comps[3] = { + nir_alu_src_index(ctx, &nir_instr->src[0]), + nir_alu_src_index(ctx, &nir_instr->src[1]), + }; + emit_create_vector(ctx, dest, 2, comps); + return; break; - case nir_texop_tg4: - bi_tex_gather_to(b, dest, idx, image_src, bi_zero(), - instr->is_array, dim, instr->component, false, - regfmt, instr->is_shadow, explicit_offset, - mask, sr_count); + } + case nir_op_vec3: { + uint32_t comps[3] = { + nir_alu_src_index(ctx, &nir_instr->src[0]), + nir_alu_src_index(ctx, &nir_instr->src[1]), + nir_alu_src_index(ctx, &nir_instr->src[2]), + }; + emit_create_vector(ctx, dest, 3, comps); + return; break; - default: - unreachable("Unhandled Valhall texture op"); } - - bi_index w[4] = { bi_null(), bi_null(), bi_null(), bi_null() }; - bi_emit_split_i32(b, w, dest, res_size); - bi_emit_collect_to(b, bi_dest_index(&instr->dest), w, - DIV_ROUND_UP(nir_dest_num_components(instr->dest) * res_size, 4)); -} - -/* Simple textures ops correspond to NIR tex or txl with LOD = 0 on 2D/cube - * textures with sufficiently small immediate indices. Anything else - * needs a complete texture op. */ - -static void -bi_emit_texs(bi_builder *b, nir_tex_instr *instr) -{ - int coord_idx = nir_tex_instr_src_index(instr, nir_tex_src_coord); - assert(coord_idx >= 0); - bi_index coords = bi_src_index(&instr->src[coord_idx].src); - - if (instr->sampler_dim == GLSL_SAMPLER_DIM_CUBE) { - bi_index face, s, t; - bi_emit_cube_coord(b, coords, &face, &s, &t); - - bi_texs_cube_to(b, nir_dest_bit_size(instr->dest), - bi_dest_index(&instr->dest), - s, t, face, - instr->sampler_index, instr->texture_index); - } else { - bi_texs_2d_to(b, nir_dest_bit_size(instr->dest), - bi_dest_index(&instr->dest), - bi_extract(b, coords, 0), - bi_extract(b, coords, 1), - instr->op != nir_texop_tex, /* zero LOD */ - instr->sampler_index, instr->texture_index); + case nir_op_vec4: { + uint32_t comps[4] = { + nir_alu_src_index(ctx, &nir_instr->src[0]), + nir_alu_src_index(ctx, &nir_instr->src[1]), + nir_alu_src_index(ctx, &nir_instr->src[2]), + nir_alu_src_index(ctx, &nir_instr->src[3]), + }; + emit_create_vector(ctx, dest, 4, comps); + return; + break; } + case nir_op_fdiv: { + unsigned src0 = nir_alu_src_index_scalar(ctx, nir_instr, 0); + unsigned src1 = nir_alu_src_index_scalar(ctx, nir_instr, 1); + uint32_t mir_temp_location = alloc_mir_temp(ctx); + { + struct bifrost_instruction instr = { + .op = op_frcp_fast_f32, + .dest_components = 1, + .ssa_args = { + .dest = mir_temp_location, + .src0 = src1, + .src1 = SSA_INVALID_VALUE, + .src2 = SSA_INVALID_VALUE, + .src3 = SSA_INVALID_VALUE, + }, + }; + emit_mir_instruction(ctx, instr); + } - bi_split_dest(b, instr->dest); -} - -static bool -bi_is_simple_tex(nir_tex_instr *instr) -{ - if (instr->op != nir_texop_tex && instr->op != nir_texop_txl) - return false; - - if (instr->dest_type != nir_type_float32 && - instr->dest_type != nir_type_float16) - return false; - - if 
(instr->is_shadow || instr->is_array) - return false; - - switch (instr->sampler_dim) { - case GLSL_SAMPLER_DIM_2D: - case GLSL_SAMPLER_DIM_EXTERNAL: - case GLSL_SAMPLER_DIM_RECT: - break; + struct bifrost_instruction instr = { + .op = op_fmul_f32, + .dest_components = 1, + .ssa_args = { + .dest = dest, + .src0 = src0, + .src1 = src1, + .src2 = SSA_INVALID_VALUE, + .src3 = SSA_INVALID_VALUE, + }, + .src_modifiers = src_modifiers, + }; - case GLSL_SAMPLER_DIM_CUBE: - /* LOD can't be specified with TEXS_CUBE */ - if (instr->op == nir_texop_txl) - return false; + emit_mir_instruction(ctx, instr); + return; break; - - default: - return false; } + case nir_op_umin: + case nir_op_imin: + case nir_op_umax: + case nir_op_imax: { + unsigned src0 = nir_alu_src_index_scalar(ctx, nir_instr, 0); + unsigned src1 = nir_alu_src_index_scalar(ctx, nir_instr, 1); + struct bifrost_instruction instr = { + .op = op_csel_i32, + .dest_components = 1, + .ssa_args = { + .dest = dest, + .src0 = src0, + .src1 = src1, + .src2 = src0, + .src3 = src1, + }, + .src_modifiers = src_modifiers, + .literal_args[0] = 0, /* XXX: Comparison operator */ + }; - for (unsigned i = 0; i < instr->num_srcs; ++i) { - if (instr->src[i].src_type != nir_tex_src_lod && - instr->src[i].src_type != nir_tex_src_coord) - return false; + emit_mir_instruction(ctx, instr); + return; + break; } + case nir_op_umin3: + case nir_op_imin3: + case nir_op_umax3: + case nir_op_imax3: { + unsigned src0 = nir_alu_src_index_scalar(ctx, nir_instr, 0); + unsigned src1 = nir_alu_src_index_scalar(ctx, nir_instr, 1); + unsigned src2 = nir_alu_src_index_scalar(ctx, nir_instr, 2); + + unsigned op = 0; + if (nir_instr->op == nir_op_umin3) + op = op_umin3_i32; + else if (nir_instr->op == nir_op_imin3) + op = op_imin3_i32; + else if (nir_instr->op == nir_op_umax3) + op = op_umax3_i32; + else if (nir_instr->op == nir_op_imax3) + op = op_imax3_i32; + struct bifrost_instruction instr = { + .op = op, + .dest_components = 1, + .ssa_args = { + .dest = dest, + .src0 = src0, + .src1 = src1, + .src2 = src2, + .src3 = SSA_INVALID_VALUE, + }, + .src_modifiers = src_modifiers, + }; - /* Indices need to fit in provided bits */ - unsigned idx_bits = instr->sampler_dim == GLSL_SAMPLER_DIM_CUBE ? 
2 : 3; - if (MAX2(instr->sampler_index, instr->texture_index) >= (1 << idx_bits)) - return false; - - int lod_idx = nir_tex_instr_src_index(instr, nir_tex_src_lod); - if (lod_idx < 0) - return true; + emit_mir_instruction(ctx, instr); - nir_src lod = instr->src[lod_idx].src; - return nir_src_is_const(lod) && nir_src_as_uint(lod) == 0; -} - -static void -bi_emit_tex(bi_builder *b, nir_tex_instr *instr) -{ - switch (instr->op) { - case nir_texop_txs: - bi_load_sysval_to(b, bi_dest_index(&instr->dest), - panfrost_sysval_for_instr(&instr->instr, NULL), - nir_dest_num_components(instr->dest), 0); return; - case nir_texop_tex: - case nir_texop_txl: - case nir_texop_txb: - case nir_texop_txf: - case nir_texop_txf_ms: - case nir_texop_tg4: break; - default: - unreachable("Invalid texture operation"); } + case nir_op_ine: { + uint32_t movi = emit_movi(ctx, ~0U); + unsigned src0 = nir_alu_src_index(ctx, &nir_instr->src[0]); + unsigned src1 = nir_alu_src_index(ctx, &nir_instr->src[1]); + struct bifrost_instruction instr = { + .op = op_csel_i32, + .dest_components = 1, + .ssa_args = { + .dest = dest, + .src0 = src0, + .src1 = src1, + .src2 = movi, + .src3 = SSA_FIXED_CONST_0, + }, + .src_modifiers = src_modifiers, + .literal_args[0] = CSEL_IEQ, /* XXX: Comparison operator */ + }; - if (b->shader->arch >= 9) - bi_emit_tex_valhall(b, instr); - else if (bi_is_simple_tex(instr)) - bi_emit_texs(b, instr); - else - bi_emit_texc(b, instr); -} - -static void -bi_emit_phi(bi_builder *b, nir_phi_instr *instr) -{ - unsigned nr_srcs = exec_list_length(&instr->srcs); - bi_instr *I = bi_phi_to(b, bi_dest_index(&instr->dest), nr_srcs); - - /* Deferred */ - I->phi = instr; -} - -/* Look up the AGX block corresponding to a given NIR block. Used when - * translating phi nodes after emitting all blocks. - */ -static bi_block * -bi_from_nir_block(bi_context *ctx, nir_block *block) -{ - return ctx->indexed_nir_blocks[block->index]; -} - -static void -bi_emit_phi_deferred(bi_context *ctx, bi_block *block, bi_instr *I) -{ - nir_phi_instr *phi = I->phi; - - /* Guaranteed by lower_phis_to_scalar */ - assert(phi->dest.ssa.num_components == 1); - - nir_foreach_phi_src(src, phi) { - bi_block *pred = bi_from_nir_block(ctx, src->pred); - unsigned i = bi_predecessor_index(block, pred); - assert(i < I->nr_srcs); - - I->src[i] = bi_src_index(&src->src); + emit_mir_instruction(ctx, instr); + return; + break; + } + default: + printf("Unhandled ALU op %s\n", nir_op_infos[nir_instr->op].name); + return; } - I->phi = NULL; -} + unsigned src0 = nir_alu_src_index_scalar(ctx, nir_instr, 0); + unsigned src1 = argument_count >= 2 ? nir_alu_src_index_scalar(ctx, nir_instr, 1) : SSA_INVALID_VALUE; + unsigned src2 = argument_count >= 3 ? nir_alu_src_index_scalar(ctx, nir_instr, 2) : SSA_INVALID_VALUE; + unsigned src3 = argument_count >= 4 ? 
nir_alu_src_index_scalar(ctx, nir_instr, 3) : SSA_INVALID_VALUE; + + struct bifrost_instruction instr = { + .op = op, + .dest_components = 1, + .ssa_args = { + .dest = dest, + .src0 = src0, + .src1 = src1, + .src2 = src2, + .src3 = src3, + }, + .src_modifiers = src_modifiers, + }; -static void -bi_emit_phis_deferred(bi_context *ctx) -{ - bi_foreach_block(ctx, block) { - bi_foreach_instr_in_block(block, I) { - if (I->op == BI_OPCODE_PHI) - bi_emit_phi_deferred(ctx, block, I); - } - } + emit_mir_instruction(ctx, instr); } static void -bi_emit_instr(bi_builder *b, struct nir_instr *instr) +emit_instr(struct compiler_context *ctx, struct nir_instr *instr) { switch (instr->type) { case nir_instr_type_load_const: - bi_emit_load_const(b, nir_instr_as_load_const(instr)); + emit_load_const(ctx, nir_instr_as_load_const(instr)); break; - case nir_instr_type_intrinsic: - bi_emit_intrinsic(b, nir_instr_as_intrinsic(instr)); + emit_intrinsic(ctx, nir_instr_as_intrinsic(instr)); break; - case nir_instr_type_alu: - bi_emit_alu(b, nir_instr_as_alu(instr)); + emit_alu(ctx, nir_instr_as_alu(instr)); break; - case nir_instr_type_tex: - bi_emit_tex(b, nir_instr_as_tex(instr)); + printf("Unhandled NIR inst tex\n"); break; - case nir_instr_type_jump: - bi_emit_jump(b, nir_instr_as_jump(instr)); + printf("Unhandled NIR inst jump\n"); break; - - case nir_instr_type_phi: - bi_emit_phi(b, nir_instr_as_phi(instr)); + case nir_instr_type_ssa_undef: + printf("Unhandled NIR inst ssa_undef\n"); break; - default: - unreachable("should've been lowered"); + printf("Unhandled instruction type\n"); + break; } -} - -static bi_block * -create_empty_block(bi_context *ctx) -{ - bi_block *blk = rzalloc(ctx, bi_block); - - util_dynarray_init(&blk->predecessors, blk); - return blk; } -static bi_block * -emit_block(bi_context *ctx, nir_block *block) +static bifrost_block * +emit_block(struct compiler_context *ctx, nir_block *block) { - if (ctx->after_block) { - ctx->current_block = ctx->after_block; - ctx->after_block = NULL; - } else { - ctx->current_block = create_empty_block(ctx); - } + bifrost_block *this_block = calloc(sizeof(bifrost_block), 1); + list_addtail(&this_block->link, &ctx->blocks); - list_addtail(&ctx->current_block->link, &ctx->blocks); - list_inithead(&ctx->current_block->instructions); + ++ctx->block_count; - bi_builder _b = bi_init_builder(ctx, bi_after_block(ctx->current_block)); + /* Add this block to be a successor to the previous block */ + if (ctx->current_block) + bifrost_block_add_successor(ctx->current_block, this_block); - ctx->indexed_nir_blocks[block->index] = ctx->current_block; + /* Set up current block */ + list_inithead(&this_block->instructions); + ctx->current_block = this_block; nir_foreach_instr(instr, block) { - bi_emit_instr(&_b, instr); + emit_instr(ctx, instr); + ++ctx->instruction_count; } - return ctx->current_block; +#ifdef BI_DEBUG + print_mir_block(this_block, false); +#endif + return this_block; } -static void -emit_if(bi_context *ctx, nir_if *nif) -{ - bi_block *before_block = ctx->current_block; - - /* Speculatively emit the branch, but we can't fill it in until later */ - bi_builder _b = bi_init_builder(ctx, bi_after_block(ctx->current_block)); - bi_instr *then_branch = bi_branchz_i16(&_b, - bi_half(bi_src_index(&nif->condition), false), - bi_zero(), BI_CMPF_EQ); - - /* Emit the two subblocks. 
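In structured form, the branch wiring built here reads roughly as follows, assuming BRANCHZ with CMPF_EQ takes the branch when the 16-bit condition is zero (sketch, not driver code):

static int
ref_if_shape(int16_t cond, int then_result, int else_result)
{
        if (cond == 0)          /* then_branch: BRANCHZ.eq -> else_block */
                return else_result;
        return then_result;     /* fallthrough to then_block, then jump to after_block */
}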
*/ - bi_block *then_block = emit_cf_list(ctx, &nif->then_list); - bi_block *end_then_block = ctx->current_block; - - /* Emit second block */ - - bi_block *else_block = emit_cf_list(ctx, &nif->else_list); - bi_block *end_else_block = ctx->current_block; - ctx->after_block = create_empty_block(ctx); - - /* Now that we have the subblocks emitted, fix up the branches */ - - assert(then_block); - assert(else_block); - - then_branch->branch_target = else_block; - - /* Emit a jump from the end of the then block to the end of the else */ - _b.cursor = bi_after_block(end_then_block); - bi_instr *then_exit = bi_jump(&_b, bi_zero()); - then_exit->branch_target = ctx->after_block; - - bi_block_add_successor(end_then_block, then_exit->branch_target); - bi_block_add_successor(end_else_block, ctx->after_block); /* fallthrough */ - - bi_block_add_successor(before_block, then_branch->branch_target); /* then_branch */ - bi_block_add_successor(before_block, then_block); /* fallthrough */ -} - -static void -emit_loop(bi_context *ctx, nir_loop *nloop) -{ - /* Remember where we are */ - bi_block *start_block = ctx->current_block; - - bi_block *saved_break = ctx->break_block; - bi_block *saved_continue = ctx->continue_block; - - ctx->continue_block = create_empty_block(ctx); - ctx->break_block = create_empty_block(ctx); - ctx->after_block = ctx->continue_block; - - /* Emit the body itself */ - emit_cf_list(ctx, &nloop->body); - - /* Branch back to loop back */ - bi_builder _b = bi_init_builder(ctx, bi_after_block(ctx->current_block)); - bi_instr *I = bi_jump(&_b, bi_zero()); - I->branch_target = ctx->continue_block; - bi_block_add_successor(start_block, ctx->continue_block); - bi_block_add_successor(ctx->current_block, ctx->continue_block); - - ctx->after_block = ctx->break_block; - - /* Pop off */ - ctx->break_block = saved_break; - ctx->continue_block = saved_continue; - ++ctx->loop_count; -} +void +emit_if(struct compiler_context *ctx, nir_if *nir_inst); -static bi_block * -emit_cf_list(bi_context *ctx, struct exec_list *list) +static struct bifrost_block * +emit_cf_list(struct compiler_context *ctx, struct exec_list *list) { - bi_block *start_block = NULL; - + struct bifrost_block *start_block = NULL; foreach_list_typed(nir_cf_node, node, node, list) { switch (node->type) { case nir_cf_node_block: { - bi_block *block = emit_block(ctx, nir_cf_node_as_block(node)); + bifrost_block *block = emit_block(ctx, nir_cf_node_as_block(node)); if (!start_block) start_block = block; @@ -4140,1183 +866,186 @@ emit_cf_list(bi_context *ctx, struct exec_list *list) emit_if(ctx, nir_cf_node_as_if(node)); break; + default: case nir_cf_node_loop: - emit_loop(ctx, nir_cf_node_as_loop(node)); + case nir_cf_node_function: + assert(0); break; - - default: - unreachable("Unknown control flow"); } } return start_block; } -/* shader-db stuff */ - -struct bi_stats { - unsigned nr_clauses, nr_tuples, nr_ins; - unsigned nr_arith, nr_texture, nr_varying, nr_ldst; -}; - -static void -bi_count_tuple_stats(bi_clause *clause, bi_tuple *tuple, struct bi_stats *stats) -{ - /* Count instructions */ - stats->nr_ins += (tuple->fma ? 1 : 0) + (tuple->add ? 
1 : 0); - - /* Non-message passing tuples are always arithmetic */ - if (tuple->add != clause->message) { - stats->nr_arith++; - return; - } - - /* Message + FMA we'll count as arithmetic _and_ message */ - if (tuple->fma) - stats->nr_arith++; - - switch (clause->message_type) { - case BIFROST_MESSAGE_VARYING: - /* Check components interpolated */ - stats->nr_varying += (clause->message->vecsize + 1) * - (bi_is_regfmt_16(clause->message->register_format) ? 1 : 2); - break; - - case BIFROST_MESSAGE_VARTEX: - /* 2 coordinates, fp32 each */ - stats->nr_varying += (2 * 2); - FALLTHROUGH; - case BIFROST_MESSAGE_TEX: - stats->nr_texture++; - break; - - case BIFROST_MESSAGE_ATTRIBUTE: - case BIFROST_MESSAGE_LOAD: - case BIFROST_MESSAGE_STORE: - case BIFROST_MESSAGE_ATOMIC: - stats->nr_ldst++; - break; - - case BIFROST_MESSAGE_NONE: - case BIFROST_MESSAGE_BARRIER: - case BIFROST_MESSAGE_BLEND: - case BIFROST_MESSAGE_TILE: - case BIFROST_MESSAGE_Z_STENCIL: - case BIFROST_MESSAGE_ATEST: - case BIFROST_MESSAGE_JOB: - case BIFROST_MESSAGE_64BIT: - /* Nothing to do */ - break; - }; - -} - -/* - * v7 allows preloading LD_VAR or VAR_TEX messages that must complete before the - * shader completes. These costs are not accounted for in the general cycle - * counts, so this function calculates the effective cost of these messages, as - * if they were executed by shader code. - */ -static unsigned -bi_count_preload_cost(bi_context *ctx) -{ - /* Units: 1/16 of a normalized cycle, assuming that we may interpolate - * 16 fp16 varying components per cycle or fetch two texels per cycle. - */ - unsigned cost = 0; - - for (unsigned i = 0; i < ARRAY_SIZE(ctx->info.bifrost->messages); ++i) { - struct bifrost_message_preload msg = ctx->info.bifrost->messages[i]; - - if (msg.enabled && msg.texture) { - /* 2 coordinate, 2 half-words each, plus texture */ - cost += 12; - } else if (msg.enabled) { - cost += (msg.num_components * (msg.fp16 ? 1 : 2)); - } - } - - return cost; -} - -static const char * -bi_shader_stage_name(bi_context *ctx) -{ - if (ctx->idvs == BI_IDVS_VARYING) - return "MESA_SHADER_VARYING"; - else if (ctx->idvs == BI_IDVS_POSITION) - return "MESA_SHADER_POSITION"; - else if (ctx->inputs->is_blend) - return "MESA_SHADER_BLEND"; - else - return gl_shader_stage_name(ctx->stage); -} - -static char * -bi_print_stats(bi_context *ctx, unsigned size) +void +emit_if(struct compiler_context *ctx, nir_if *nir_inst) { - struct bi_stats stats = { 0 }; - - /* Count instructions, clauses, and tuples. Also attempt to construct - * normalized execution engine cycle counts, using the following ratio: - * - * 24 arith tuples/cycle - * 2 texture messages/cycle - * 16 x 16-bit varying channels interpolated/cycle - * 1 load store message/cycle - * - * These numbers seem to match Arm Mobile Studio's heuristic. The real - * cycle counts are surely more complicated. 
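A worked example of this model with hypothetical counts for a small fragment shader (48 arith tuples, 3 texture messages, 32 16-bit varying channels, 2 load/store messages):

#include <math.h>
#include <stdio.h>

int
main(void)
{
        float arith   = 48 / 24.0f;                   /* 2.0 */
        float texture =  3 /  2.0f;                   /* 1.5 */
        float varying = 32 / 16.0f;                   /* 2.0 */
        float ldst    =  2 /  1.0f;                   /* 2.0 */
        float message = fmaxf(fmaxf(texture, varying), ldst);
        float bound   = fmaxf(arith, message);        /* 2.0 normalized cycles */

        printf("bound: %.2f cycles\n", bound);
        return 0;
}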
- */ - - bi_foreach_block(ctx, block) { - bi_foreach_clause_in_block(block, clause) { - stats.nr_clauses++; - stats.nr_tuples += clause->tuple_count; - - for (unsigned i = 0; i < clause->tuple_count; ++i) - bi_count_tuple_stats(clause, &clause->tuples[i], &stats); - } - } - - float cycles_arith = ((float) stats.nr_arith) / 24.0; - float cycles_texture = ((float) stats.nr_texture) / 2.0; - float cycles_varying = ((float) stats.nr_varying) / 16.0; - float cycles_ldst = ((float) stats.nr_ldst) / 1.0; - - float cycles_message = MAX3(cycles_texture, cycles_varying, cycles_ldst); - float cycles_bound = MAX2(cycles_arith, cycles_message); - - /* Thread count and register pressure are traded off only on v7 */ - bool full_threads = (ctx->arch == 7 && ctx->info.work_reg_count <= 32); - unsigned nr_threads = full_threads ? 2 : 1; - - /* Dump stats */ - char *str = ralloc_asprintf(NULL, "%s shader: " - "%u inst, %u tuples, %u clauses, " - "%f cycles, %f arith, %f texture, %f vary, %f ldst, " - "%u quadwords, %u threads", - bi_shader_stage_name(ctx), - stats.nr_ins, stats.nr_tuples, stats.nr_clauses, - cycles_bound, cycles_arith, cycles_texture, - cycles_varying, cycles_ldst, - size / 16, nr_threads); - - if (ctx->arch == 7) { - ralloc_asprintf_append(&str, ", %u preloads", bi_count_preload_cost(ctx)); - } - - ralloc_asprintf_append(&str, ", %u loops, %u:%u spills:fills", - ctx->loop_count, ctx->spills, ctx->fills); - return str; -} - -static char * -va_print_stats(bi_context *ctx, unsigned size) -{ - unsigned nr_ins = 0; - struct va_stats stats = { 0 }; + // XXX: Conditional branch instruction can do a variety of comparisons with the sources + // Merge the source instruction `ine` with our conditional branch + { + uint32_t movi = emit_movi(ctx, ~0U); + struct bifrost_instruction instr = { + .op = op_branch, + .dest_components = 0, + .ssa_args = { + .dest = SSA_INVALID_VALUE, + .src0 = nir_src_index(ctx, &nir_inst->condition), + .src1 = movi, + .src2 = SSA_INVALID_VALUE, + .src3 = SSA_INVALID_VALUE, + }, + .src_modifiers = 0, + .literal_args[0] = BR_COND_EQ, /* XXX: Comparison Arg type */ + .literal_args[1] = 0, /* XXX: Branch target */ + }; - /* Count instructions */ - bi_foreach_instr_global(ctx, I) { - nr_ins++; - va_count_instr_stats(I, &stats); + emit_mir_instruction(ctx, instr); } - /* Mali G78 peak performance: - * - * 64 FMA instructions per cycle - * 64 CVT instructions per cycle - * 16 SFU instructions per cycle - * 8 x 32-bit varying channels interpolated per cycle - * 4 texture instructions per cycle - * 1 load/store operation per cycle - */ - - float cycles_fma = ((float) stats.fma) / 64.0; - float cycles_cvt = ((float) stats.cvt) / 64.0; - float cycles_sfu = ((float) stats.sfu) / 16.0; - float cycles_v = ((float) stats.v) / 16.0; - float cycles_t = ((float) stats.t) / 4.0; - float cycles_ls = ((float) stats.ls) / 1.0; - - /* Calculate the bound */ - float cycles = MAX2( - MAX3(cycles_fma, cycles_cvt, cycles_sfu), - MAX3(cycles_v, cycles_t, cycles_ls)); - - - /* Thread count and register pressure are traded off */ - unsigned nr_threads = (ctx->info.work_reg_count <= 32) ? 
2 : 1; - - /* Dump stats */ - return ralloc_asprintf(NULL, "%s shader: " - "%u inst, %f cycles, %f fma, %f cvt, %f sfu, %f v, " - "%f t, %f ls, %u quadwords, %u threads, %u loops, " - "%u:%u spills:fills", - bi_shader_stage_name(ctx), - nr_ins, cycles, cycles_fma, cycles_cvt, cycles_sfu, - cycles_v, cycles_t, cycles_ls, size / 16, nr_threads, - ctx->loop_count, ctx->spills, ctx->fills); -} + bifrost_instruction *true_branch = mir_last_instr_in_block(ctx->current_block); -static int -glsl_type_size(const struct glsl_type *type, bool bindless) -{ - return glsl_count_attribute_slots(type, false); -} + bifrost_block *true_block = emit_cf_list(ctx, &nir_inst->then_list); -/* Split stores to memory. We don't split stores to vertex outputs, since - * nir_lower_io_to_temporaries will ensure there's only a single write. - */ + { + struct bifrost_instruction instr = { + .op = op_branch, + .dest_components = 0, + .ssa_args = { + .dest = SSA_INVALID_VALUE, + .src0 = SSA_INVALID_VALUE, + .src1 = SSA_INVALID_VALUE, + .src2 = SSA_INVALID_VALUE, + .src3 = SSA_INVALID_VALUE, + }, + .src_modifiers = 0, + .literal_args[0] = BR_ALWAYS, /* XXX: ALWAYS */ + .literal_args[1] = 0, /* XXX: Branch target */ + }; -static bool -should_split_wrmask(const nir_instr *instr, UNUSED const void *data) -{ - nir_intrinsic_instr *intr = nir_instr_as_intrinsic(instr); - - switch (intr->intrinsic) { - case nir_intrinsic_store_ssbo: - case nir_intrinsic_store_shared: - case nir_intrinsic_store_global: - case nir_intrinsic_store_scratch: - return true; - default: - return false; + emit_mir_instruction(ctx, instr); } -} + bifrost_instruction *true_exit_branch = mir_last_instr_in_block(ctx->current_block); -/* - * Some operations are only available as 32-bit instructions. 64-bit floats are - * unsupported and ints are lowered with nir_lower_int64. Certain 8-bit and - * 16-bit instructions, however, are lowered here. - */ -static unsigned -bi_lower_bit_size(const nir_instr *instr, UNUSED void *data) -{ - if (instr->type != nir_instr_type_alu) - return 0; - - nir_alu_instr *alu = nir_instr_as_alu(instr); - - switch (alu->op) { - case nir_op_fexp2: - case nir_op_flog2: - case nir_op_fpow: - case nir_op_fsin: - case nir_op_fcos: - case nir_op_bit_count: - case nir_op_bitfield_reverse: - return (nir_src_bit_size(alu->src[0].src) == 32) ? 0 : 32; - default: - return 0; - } -} + unsigned false_idx = ctx->block_count; + unsigned inst_count = ctx->instruction_count; -/* Although Bifrost generally supports packed 16-bit vec2 and 8-bit vec4, - * transcendentals are an exception. Also shifts because of lane size mismatch - * (8-bit in Bifrost, 32-bit in NIR TODO - workaround!). Some conversions need - * to be scalarized due to type size. 
*/ + bifrost_block *false_block = emit_cf_list(ctx, &nir_inst->else_list); -static uint8_t -bi_vectorize_filter(const nir_instr *instr, const void *data) -{ - /* Defaults work for everything else */ - if (instr->type != nir_instr_type_alu) - return 0; - - const nir_alu_instr *alu = nir_instr_as_alu(instr); - - switch (alu->op) { - case nir_op_frcp: - case nir_op_frsq: - case nir_op_ishl: - case nir_op_ishr: - case nir_op_ushr: - case nir_op_f2i16: - case nir_op_f2u16: - case nir_op_extract_u8: - case nir_op_extract_i8: - case nir_op_extract_u16: - case nir_op_extract_i16: - case nir_op_insert_u16: - return 1; - default: - break; - } + unsigned if_footer_idx = ctx->block_count; + assert(true_block); + assert(false_block); - /* Vectorized instructions cannot write more than 32-bit */ - int dst_bit_size = nir_dest_bit_size(alu->dest.dest); - if (dst_bit_size == 16) - return 2; - else - return 1; -} -static bool -bi_scalarize_filter(const nir_instr *instr, const void *data) -{ - if (instr->type != nir_instr_type_alu) - return false; - - const nir_alu_instr *alu = nir_instr_as_alu(instr); - - switch (alu->op) { - case nir_op_pack_uvec2_to_uint: - case nir_op_pack_uvec4_to_uint: - return false; - default: - return true; + if (ctx->instruction_count == inst_count) { + // If the else branch didn't have anything in it then we can remove the dead jump + mir_remove_instr(true_exit_branch); + } else { + true_exit_branch->literal_args[1] = if_footer_idx; } -} - -/* Ensure we write exactly 4 components */ -static nir_ssa_def * -bifrost_nir_valid_channel(nir_builder *b, nir_ssa_def *in, - unsigned channel, unsigned first, unsigned mask) -{ - if (!(mask & BITFIELD_BIT(channel))) - channel = first; - - return nir_channel(b, in, channel); -} - -/* Lower fragment store_output instructions to always write 4 components, - * matching the hardware semantic. This may require additional moves. Skipping - * these moves is possible in theory, but invokes undefined behaviour in the - * compiler. The DDK inserts these moves, so we will as well. 
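A scalar reference for the replication rule this pass implements: every component not covered by the write mask is replaced by the first written component, so the store always covers a full vec4 (sketch with plain C types, illustrative only):

static void
ref_replicate_components(float out[4], const float in[4],
                         unsigned first, unsigned mask)
{
        for (unsigned c = 0; c < 4; ++c)
                out[c] = (mask & (1u << c)) ? in[c] : in[first];
}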
*/ - -static bool -bifrost_nir_lower_blend_components(struct nir_builder *b, - nir_instr *instr, void *data) -{ - if (instr->type != nir_instr_type_intrinsic) - return false; - - nir_intrinsic_instr *intr = nir_instr_as_intrinsic(instr); - - if (intr->intrinsic != nir_intrinsic_store_output) - return false; - - nir_ssa_def *in = intr->src[0].ssa; - unsigned first = nir_intrinsic_component(intr); - unsigned mask = nir_intrinsic_write_mask(intr); - - assert(first == 0 && "shouldn't get nonzero components"); - - /* Nothing to do */ - if (mask == BITFIELD_MASK(4)) - return false; - b->cursor = nir_before_instr(&intr->instr); - - /* Replicate the first valid component instead */ - nir_ssa_def *replicated = - nir_vec4(b, bifrost_nir_valid_channel(b, in, 0, first, mask), - bifrost_nir_valid_channel(b, in, 1, first, mask), - bifrost_nir_valid_channel(b, in, 2, first, mask), - bifrost_nir_valid_channel(b, in, 3, first, mask)); - - /* Rewrite to use our replicated version */ - nir_instr_rewrite_src_ssa(instr, &intr->src[0], replicated); - nir_intrinsic_set_component(intr, 0); - nir_intrinsic_set_write_mask(intr, 0xF); - intr->num_components = 4; - - return true; + true_branch->literal_args[1] = false_idx; } -static void -bi_optimize_nir(nir_shader *nir, unsigned gpu_id, bool is_blend) +int +bifrost_compile_shader_nir(nir_shader *nir, struct bifrost_program *program) { - bool progress; - unsigned lower_flrp = 16 | 32 | 64; - - NIR_PASS(progress, nir, nir_lower_regs_to_ssa); - - nir_lower_tex_options lower_tex_options = { - .lower_txs_lod = true, - .lower_txp = ~0, - .lower_tg4_broadcom_swizzle = true, - .lower_txd = true, - .lower_invalid_implicit_lod = true, + struct compiler_context ictx = { + .nir = nir, + .stage = nir->info.stage, }; - NIR_PASS(progress, nir, pan_nir_lower_64bit_intrin); - NIR_PASS(progress, nir, pan_lower_helper_invocation); - - NIR_PASS(progress, nir, nir_lower_int64); + struct compiler_context *ctx = &ictx; - nir_lower_idiv_options idiv_options = { - .allow_fp16 = true, - }; - NIR_PASS(progress, nir, nir_opt_idiv_const, 8); - NIR_PASS(progress, nir, nir_lower_idiv, &idiv_options); - - NIR_PASS(progress, nir, nir_lower_tex, &lower_tex_options); - NIR_PASS(progress, nir, nir_lower_alu_to_scalar, bi_scalarize_filter, NULL); - NIR_PASS(progress, nir, nir_lower_load_const_to_scalar); - NIR_PASS(progress, nir, nir_lower_phis_to_scalar, true); + ctx->mir_temp = 0; - do { - progress = false; + /* Initialize at a global (not block) level hash tables */ + ctx->ssa_constants = _mesa_hash_table_u64_create(NULL); + ctx->hash_to_temp = _mesa_hash_table_u64_create(NULL); - NIR_PASS(progress, nir, nir_lower_var_copies); - NIR_PASS(progress, nir, nir_lower_vars_to_ssa); - NIR_PASS(progress, nir, nir_lower_wrmasks, should_split_wrmask, NULL); + /* Assign actual uniform location, skipping over samplers */ + ctx->uniform_nir_to_bi = _mesa_hash_table_u64_create(NULL); - NIR_PASS(progress, nir, nir_copy_prop); - NIR_PASS(progress, nir, nir_opt_remove_phis); - NIR_PASS(progress, nir, nir_opt_dce); - NIR_PASS(progress, nir, nir_opt_dead_cf); - NIR_PASS(progress, nir, nir_opt_cse); - NIR_PASS(progress, nir, nir_opt_peephole_select, 64, false, true); - NIR_PASS(progress, nir, nir_opt_algebraic); - NIR_PASS(progress, nir, nir_opt_constant_folding); + nir_foreach_variable(var, &nir->uniforms) { + if (glsl_get_base_type(var->type) == GLSL_TYPE_SAMPLER) continue; - NIR_PASS(progress, nir, nir_lower_alu); - - if (lower_flrp != 0) { - bool lower_flrp_progress = false; - NIR_PASS(lower_flrp_progress, - nir, - 
nir_lower_flrp, - lower_flrp, - false /* always_precise */); - if (lower_flrp_progress) { - NIR_PASS(progress, nir, - nir_opt_constant_folding); - progress = true; - } - - /* Nothing should rematerialize any flrps, so we only - * need to do this lowering once. - */ - lower_flrp = 0; + for (int col = 0; col < glsl_get_matrix_columns(var->type); ++col) { + int id = ctx->uniform_count++; + _mesa_hash_table_u64_insert(ctx->uniform_nir_to_bi, var->data.driver_location + col + 1, (void *) ((uintptr_t) (id + 1))); } - - NIR_PASS(progress, nir, nir_opt_undef); - NIR_PASS(progress, nir, nir_lower_undef_to_zero); - - NIR_PASS(progress, nir, nir_opt_shrink_vectors); - NIR_PASS(progress, nir, nir_opt_loop_unroll); - } while (progress); - - /* TODO: Why is 64-bit getting rematerialized? - * KHR-GLES31.core.shader_image_load_store.basic-allTargets-atomicFS */ - NIR_PASS(progress, nir, nir_lower_int64); - - /* We need to cleanup after each iteration of late algebraic - * optimizations, since otherwise NIR can produce weird edge cases - * (like fneg of a constant) which we don't handle */ - bool late_algebraic = true; - while (late_algebraic) { - late_algebraic = false; - NIR_PASS(late_algebraic, nir, nir_opt_algebraic_late); - NIR_PASS(progress, nir, nir_opt_constant_folding); - NIR_PASS(progress, nir, nir_copy_prop); - NIR_PASS(progress, nir, nir_opt_dce); - NIR_PASS(progress, nir, nir_opt_cse); } - /* This opt currently helps on Bifrost but not Valhall */ - if (gpu_id < 0x9000) - NIR_PASS(progress, nir, bifrost_nir_opt_boolean_bitwise); - - NIR_PASS(progress, nir, nir_lower_alu_to_scalar, bi_scalarize_filter, NULL); - NIR_PASS(progress, nir, nir_opt_vectorize, bi_vectorize_filter, NULL); - NIR_PASS(progress, nir, nir_lower_bool_to_bitsize); - - /* Prepass to simplify instruction selection */ - late_algebraic = false; - NIR_PASS(late_algebraic, nir, bifrost_nir_lower_algebraic_late); + if (ctx->stage == MESA_SHADER_VERTEX) { + ctx->varying_nir_to_bi = _mesa_hash_table_u64_create(NULL); + nir_foreach_variable(var, &nir->outputs) { + if (var->data.location < VARYING_SLOT_VAR0) { + if (var->data.location == VARYING_SLOT_POS) + ctx->varying_count++; + _mesa_hash_table_u64_insert(ctx->varying_nir_to_bi, var->data.driver_location + 1, (void *) ((uintptr_t) (1))); - while (late_algebraic) { - late_algebraic = false; - NIR_PASS(late_algebraic, nir, nir_opt_algebraic_late); - NIR_PASS(progress, nir, nir_opt_constant_folding); - NIR_PASS(progress, nir, nir_copy_prop); - NIR_PASS(progress, nir, nir_opt_dce); - NIR_PASS(progress, nir, nir_opt_cse); - } - - NIR_PASS(progress, nir, nir_lower_load_const_to_scalar); - NIR_PASS(progress, nir, nir_opt_dce); - - if (nir->info.stage == MESA_SHADER_FRAGMENT) { - NIR_PASS_V(nir, nir_shader_instructions_pass, - bifrost_nir_lower_blend_components, - nir_metadata_block_index | nir_metadata_dominance, - NULL); - } - - /* Backend scheduler is purely local, so do some global optimizations - * to reduce register pressure. */ - nir_move_options move_all = - nir_move_const_undef | nir_move_load_ubo | nir_move_load_input | - nir_move_comparisons | nir_move_copies | nir_move_load_ssbo; - - NIR_PASS_V(nir, nir_opt_sink, move_all); - NIR_PASS_V(nir, nir_opt_move, move_all); - - /* We might lower attribute, varying, and image indirects. Use the - * gathered info to skip the extra analysis in the happy path. 
*/ - bool any_indirects = - nir->info.inputs_read_indirectly || - nir->info.outputs_accessed_indirectly || - nir->info.patch_inputs_read_indirectly || - nir->info.patch_outputs_accessed_indirectly || - nir->info.images_used[0]; - - if (any_indirects) { - nir_convert_to_lcssa(nir, true, true); - NIR_PASS_V(nir, nir_divergence_analysis); - NIR_PASS_V(nir, bi_lower_divergent_indirects, - pan_subgroup_size(gpu_id >> 12)); - } -} - -static void -bi_opt_post_ra(bi_context *ctx) -{ - bi_foreach_instr_global_safe(ctx, ins) { - if (ins->op == BI_OPCODE_MOV_I32 && bi_is_equiv(ins->dest[0], ins->src[0])) - bi_remove_instruction(ins); - } -} - -/* Dead code elimination for branches at the end of a block - only one branch - * per block is legal semantically, but unreachable jumps can be generated. - * Likewise on Bifrost we can generate jumps to the terminal block which need - * to be lowered away to a jump to #0x0, which induces successful termination. - * That trick doesn't work on Valhall, which needs a NOP inserted in the - * terminal block instead. - */ -static void -bi_lower_branch(bi_context *ctx, bi_block *block) -{ - bool cull_terminal = (ctx->arch <= 8); - bool branched = false; - - bi_foreach_instr_in_block_safe(block, ins) { - if (!ins->branch_target) continue; - - if (branched) { - bi_remove_instruction(ins); - continue; - } - - branched = true; - - if (!bi_is_terminal_block(ins->branch_target)) - continue; - - if (cull_terminal) - ins->branch_target = NULL; - else if (ins->branch_target) - ins->branch_target->needs_nop = true; - } -} - -static void -bi_pack_clauses(bi_context *ctx, struct util_dynarray *binary, unsigned offset) -{ - unsigned final_clause = bi_pack(ctx, binary); - - /* If we need to wait for ATEST or BLEND in the first clause, pass the - * corresponding bits through to the renderer state descriptor */ - bi_block *first_block = list_first_entry(&ctx->blocks, bi_block, link); - bi_clause *first_clause = bi_next_clause(ctx, first_block, NULL); - - unsigned first_deps = first_clause ? first_clause->dependencies : 0; - ctx->info.bifrost->wait_6 = (first_deps & (1 << 6)); - ctx->info.bifrost->wait_7 = (first_deps & (1 << 7)); - - /* Pad the shader with enough zero bytes to trick the prefetcher, - * unless we're compiling an empty shader (in which case we don't pad - * so the size remains 0) */ - unsigned prefetch_size = BIFROST_SHADER_PREFETCH - final_clause; - - if (binary->size - offset) { - memset(util_dynarray_grow(binary, uint8_t, prefetch_size), - 0, prefetch_size); - } -} - -/* - * Build a bit mask of varyings (by location) that are flatshaded. This - * information is needed by lower_mediump_io, as we don't yet support 16-bit - * flat varyings. - * - * Also varyings that are used as texture coordinates should be kept at fp32 so - * the texture instruction may be promoted to VAR_TEX. In general this is a good - * idea, as fp16 texture coordinates are not supported by the hardware and are - * usually inappropriate. (There are both relevant CTS bugs here, even.) - * - * TODO: If we compacted the varyings with some fixup code in the vertex shader, - * we could implement 16-bit flat varyings. Consider if this case matters. - * - * TODO: The texture coordinate handling could be less heavyhanded. 
- */ -static bool -bi_gather_texcoords(nir_builder *b, nir_instr *instr, void *data) -{ - uint64_t *mask = data; - - if (instr->type != nir_instr_type_tex) - return false; - - nir_tex_instr *tex = nir_instr_as_tex(instr); - - int coord_idx = nir_tex_instr_src_index(tex, nir_tex_src_coord); - if (coord_idx < 0) - return false; - - nir_src src = tex->src[coord_idx].src; - nir_ssa_scalar x = nir_ssa_scalar_resolved(src.ssa, 0); - nir_ssa_scalar y = nir_ssa_scalar_resolved(src.ssa, 1); - - if (x.def != y.def) - return false; - - nir_instr *parent = x.def->parent_instr; - - if (parent->type != nir_instr_type_intrinsic) - return false; - - nir_intrinsic_instr *intr = nir_instr_as_intrinsic(parent); - - if (intr->intrinsic != nir_intrinsic_load_interpolated_input) - return false; - - nir_io_semantics sem = nir_intrinsic_io_semantics(intr); - *mask |= BITFIELD64_BIT(sem.location); - return false; -} - -static uint64_t -bi_fp32_varying_mask(nir_shader *nir) -{ - uint64_t mask = 0; - - assert(nir->info.stage == MESA_SHADER_FRAGMENT); - - nir_foreach_shader_in_variable(var, nir) { - if (var->data.interpolation == INTERP_MODE_FLAT) - mask |= BITFIELD64_BIT(var->data.location); - } - - nir_shader_instructions_pass(nir, bi_gather_texcoords, nir_metadata_all, &mask); - - return mask; -} - -static void -bi_finalize_nir(nir_shader *nir, unsigned gpu_id, bool is_blend) -{ - /* Lower gl_Position pre-optimisation, but after lowering vars to ssa - * (so we don't accidentally duplicate the epilogue since mesa/st has - * messed with our I/O quite a bit already) */ - - NIR_PASS_V(nir, nir_lower_vars_to_ssa); - - if (nir->info.stage == MESA_SHADER_VERTEX) { - NIR_PASS_V(nir, nir_lower_viewport_transform); - NIR_PASS_V(nir, nir_lower_point_size, 1.0, 0.0); - - nir_variable *psiz = nir_find_variable_with_location(nir, - nir_var_shader_out, - VARYING_SLOT_PSIZ); - if (psiz != NULL) - psiz->data.precision = GLSL_PRECISION_MEDIUM; - } + continue; + } - /* Get rid of any global vars before we lower to scratch. */ - NIR_PASS_V(nir, nir_lower_global_vars_to_local); - - /* Valhall introduces packed thread local storage, which improves cache - * locality of TLS access. However, access to packed TLS cannot - * straddle 16-byte boundaries. As such, when packed TLS is in use - * (currently unconditional for Valhall), we force vec4 alignment for - * scratch access. - */ - bool packed_tls = (gpu_id >= 0x9000); - - /* Lower large arrays to scratch and small arrays to bcsel */ - NIR_PASS_V(nir, nir_lower_vars_to_scratch, nir_var_function_temp, 256, - packed_tls ? - glsl_get_vec4_size_align_bytes : - glsl_get_natural_size_align_bytes); - NIR_PASS_V(nir, nir_lower_indirect_derefs, nir_var_function_temp, ~0); - - NIR_PASS_V(nir, nir_split_var_copies); - NIR_PASS_V(nir, nir_lower_var_copies); - NIR_PASS_V(nir, nir_lower_vars_to_ssa); - NIR_PASS_V(nir, nir_lower_io, nir_var_shader_in | nir_var_shader_out, - glsl_type_size, 0); - - /* nir_lower[_explicit]_io is lazy and emits mul+add chains even for - * offsets it could figure out are constant. Do some constant folding - * before bifrost_nir_lower_store_component below. 
- */ - NIR_PASS_V(nir, nir_opt_constant_folding); - - if (nir->info.stage == MESA_SHADER_FRAGMENT) { - NIR_PASS_V(nir, nir_lower_mediump_io, - nir_var_shader_in | nir_var_shader_out, - ~bi_fp32_varying_mask(nir), false); - } else if (nir->info.stage == MESA_SHADER_VERTEX) { - if (gpu_id >= 0x9000) { - NIR_PASS_V(nir, nir_lower_mediump_io, nir_var_shader_out, - BITFIELD64_BIT(VARYING_SLOT_PSIZ), false); + for (int col = 0; col < glsl_get_matrix_columns(var->type); ++col) { + for (int comp = 0; comp < 4; ++comp) { + int id = comp + ctx->varying_count++; + _mesa_hash_table_u64_insert(ctx->varying_nir_to_bi, var->data.driver_location + col + comp + 1, (void *) ((uintptr_t) (id + 1))); + } + } } - NIR_PASS_V(nir, pan_nir_lower_store_component); - } - - NIR_PASS_V(nir, nir_lower_ssbo); - NIR_PASS_V(nir, pan_nir_lower_zs_store); - NIR_PASS_V(nir, pan_lower_sample_pos); - NIR_PASS_V(nir, nir_lower_bit_size, bi_lower_bit_size, NULL); - NIR_PASS_V(nir, nir_lower_64bit_phis); - - if (nir->xfb_info != NULL && nir->info.has_transform_feedback_varyings) { - NIR_PASS_V(nir, nir_io_add_const_offset_to_base, - nir_var_shader_in | nir_var_shader_out); - NIR_PASS_V(nir, nir_io_add_intrinsic_xfb_info); - NIR_PASS_V(nir, pan_lower_xfb); - } - - bi_optimize_nir(nir, gpu_id, is_blend); -} - -static bi_context * -bi_compile_variant_nir(nir_shader *nir, - const struct panfrost_compile_inputs *inputs, - struct util_dynarray *binary, - struct hash_table_u64 *sysval_to_id, - struct bi_shader_info info, - enum bi_idvs_mode idvs) -{ - bi_context *ctx = rzalloc(NULL, bi_context); - - /* There may be another program in the dynarray, start at the end */ - unsigned offset = binary->size; - - ctx->sysval_to_id = sysval_to_id; - ctx->inputs = inputs; - ctx->nir = nir; - ctx->stage = nir->info.stage; - ctx->quirks = bifrost_get_quirks(inputs->gpu_id); - ctx->arch = inputs->gpu_id >> 12; - ctx->info = info; - ctx->idvs = idvs; - ctx->malloc_idvs = (ctx->arch >= 9) && !inputs->no_idvs; - - if (idvs != BI_IDVS_NONE) { - /* Specializing shaders for IDVS is destructive, so we need to - * clone. However, the last (second) IDVS shader does not need - * to be preserved so we can skip cloning that one. 
- */ - if (offset == 0) - ctx->nir = nir = nir_shader_clone(ctx, nir); - - NIR_PASS_V(nir, nir_shader_instructions_pass, - bifrost_nir_specialize_idvs, - nir_metadata_block_index | nir_metadata_dominance, - &idvs); - - /* After specializing, clean up the mess */ - bool progress = true; - - while (progress) { - progress = false; - - NIR_PASS(progress, nir, nir_opt_dce); - NIR_PASS(progress, nir, nir_opt_dead_cf); + } else if (ctx->stage == MESA_SHADER_FRAGMENT) { + ctx->outputs_nir_to_bi = _mesa_hash_table_u64_create(NULL); + nir_foreach_variable(var, &nir->outputs) { + if (var->data.location >= FRAG_RESULT_DATA0 && var->data.location <= FRAG_RESULT_DATA7) { + int id = ctx->outputs_count++; + printf("Driver location: %d with id %d\n", var->data.location + 1, id); + _mesa_hash_table_u64_insert(ctx->outputs_nir_to_bi, var->data.location + 1, (void *) ((uintptr_t) (id + 1))); + } } } - /* If nothing is pushed, all UBOs need to be uploaded */ - ctx->ubo_mask = ~0; + /* Optimisation passes */ + optimize_nir(nir); - list_inithead(&ctx->blocks); - - bool skip_internal = nir->info.internal; - skip_internal &= !(bifrost_debug & BIFROST_DBG_INTERNAL); - - if (bifrost_debug & BIFROST_DBG_SHADERS && !skip_internal) { - nir_print_shader(nir, stdout); - } - - ctx->allocated_vec = _mesa_hash_table_u64_create(ctx); +#ifdef BI_DEBUG + nir_print_shader(nir, stdout); +#endif + /* Generate machine IR for shader */ nir_foreach_function(func, nir) { - if (!func->impl) - continue; - - nir_index_blocks(func->impl); + nir_builder _b; + ctx->b = &_b; + nir_builder_init(ctx->b, func->impl); - ctx->indexed_nir_blocks = - rzalloc_array(ctx, bi_block *, func->impl->num_blocks); - - ctx->ssa_alloc += func->impl->ssa_alloc; - ctx->reg_alloc += func->impl->reg_alloc; + list_inithead(&ctx->blocks); + ctx->block_count = 0; + ctx->func = func; emit_cf_list(ctx, &func->impl->body); - bi_emit_phis_deferred(ctx); - break; /* TODO: Multi-function shaders */ - } - /* Index blocks now that we're done emitting */ - bi_foreach_block(ctx, block) { - block->index = ctx->num_blocks++; + break; // XXX: Once we support multi function shaders then implement } - bi_validate(ctx, "NIR -> BIR"); - - /* If the shader doesn't write any colour or depth outputs, it may - * still need an ATEST at the very end! */ - bool need_dummy_atest = - (ctx->stage == MESA_SHADER_FRAGMENT) && - !ctx->emitted_atest && - !bi_skip_atest(ctx, false); + util_dynarray_init(&program->compiled, NULL); - if (need_dummy_atest) { - bi_block *end = list_last_entry(&ctx->blocks, bi_block, link); - bi_builder b = bi_init_builder(ctx, bi_after_block(end)); - bi_emit_atest(&b, bi_zero()); - } - - bool optimize = !(bifrost_debug & BIFROST_DBG_NOOPT); - - /* Runs before constant folding */ - bi_lower_swizzle(ctx); - bi_validate(ctx, "Early lowering"); - - /* Runs before copy prop */ - if (optimize && !ctx->inputs->no_ubo_to_push) { - bi_opt_push_ubo(ctx); - } + // MIR pre-RA optimizations - if (likely(optimize)) { - bi_opt_copy_prop(ctx); + bool progress = false; - while (bi_opt_constant_fold(ctx)) - bi_opt_copy_prop(ctx); - - bi_opt_mod_prop_forward(ctx); - bi_opt_mod_prop_backward(ctx); - - /* Push LD_VAR_IMM/VAR_TEX instructions. 
Must run after - * mod_prop_backward to fuse VAR_TEX */ - if (ctx->arch == 7 && ctx->stage == MESA_SHADER_FRAGMENT && - !(bifrost_debug & BIFROST_DBG_NOPRELOAD)) { - bi_opt_dead_code_eliminate(ctx); - bi_opt_message_preload(ctx); - bi_opt_copy_prop(ctx); - } - - bi_opt_dead_code_eliminate(ctx); - bi_opt_cse(ctx); - bi_opt_dead_code_eliminate(ctx); - if (!ctx->inputs->no_ubo_to_push) - bi_opt_reorder_push(ctx); - bi_validate(ctx, "Optimization passes"); - } - - bi_lower_opt_instructions(ctx); - - if (ctx->arch >= 9) { - va_optimize(ctx); - va_lower_isel(ctx); - - bi_foreach_instr_global_safe(ctx, I) { - /* Phis become single moves so shouldn't be affected */ - if (I->op == BI_OPCODE_PHI) - continue; - - va_lower_constants(ctx, I); - - bi_builder b = bi_init_builder(ctx, bi_before_instr(I)); - va_repair_fau(&b, I); - } - - /* We need to clean up after constant lowering */ - if (likely(optimize)) { - bi_opt_cse(ctx); - bi_opt_dead_code_eliminate(ctx); - } - - bi_validate(ctx, "Valhall passes"); - } - - bi_foreach_block(ctx, block) { - bi_lower_branch(ctx, block); - } - - if (bifrost_debug & BIFROST_DBG_SHADERS && !skip_internal) - bi_print_shader(ctx, stdout); - - /* Analyze before register allocation to avoid false dependencies. The - * skip bit is a function of only the data flow graph and is invariant - * under valid scheduling. Helpers are only defined for fragment - * shaders, so this analysis is only required in fragment shaders. - */ - if (ctx->stage == MESA_SHADER_FRAGMENT) - bi_analyze_helper_requirements(ctx); - - /* Fuse TEXC after analyzing helper requirements so the analysis - * doesn't have to know about dual textures */ - if (likely(optimize)) { - bi_opt_fuse_dual_texture(ctx); - } - - /* Lower FAU after fusing dual texture, because fusing dual texture - * creates new immediates that themselves may need lowering. - */ - if (ctx->arch <= 8) { - bi_lower_fau(ctx); - } - - /* Lowering FAU can create redundant moves. Run CSE+DCE to clean up. */ - if (likely(optimize)) { - bi_opt_cse(ctx); - bi_opt_dead_code_eliminate(ctx); - } - - bi_validate(ctx, "Late lowering"); - - if (likely(!(bifrost_debug & BIFROST_DBG_NOPSCHED))) { - bi_pressure_schedule(ctx); - bi_validate(ctx, "Pre-RA scheduling"); - } - - bi_register_allocate(ctx); - - if (likely(optimize)) - bi_opt_post_ra(ctx); - - if (bifrost_debug & BIFROST_DBG_SHADERS && !skip_internal) - bi_print_shader(ctx, stdout); - - if (ctx->arch >= 9) { - va_assign_slots(ctx); - va_insert_flow_control_nops(ctx); - va_merge_flow(ctx); - va_mark_last(ctx); - } else { - bi_schedule(ctx); - bi_assign_scoreboard(ctx); - - /* Analyze after scheduling since we depend on instruction - * order. Valhall calls as part of va_insert_flow_control_nops, - * as the handling for clauses differs from instructions. 
- */ - bi_analyze_helper_terminate(ctx); - bi_mark_clauses_td(ctx); - } - - if (bifrost_debug & BIFROST_DBG_SHADERS && !skip_internal) - bi_print_shader(ctx, stdout); - - if (ctx->arch <= 8) { - bi_pack_clauses(ctx, binary, offset); - } else { - bi_pack_valhall(ctx, binary); - } - - if (bifrost_debug & BIFROST_DBG_SHADERS && !skip_internal) { - if (ctx->arch <= 8) { - disassemble_bifrost(stdout, binary->data + offset, - binary->size - offset, - bifrost_debug & BIFROST_DBG_VERBOSE); - } else { - disassemble_valhall(stdout, binary->data + offset, - binary->size - offset, - bifrost_debug & BIFROST_DBG_VERBOSE); - } - - fflush(stdout); - } - - if (!skip_internal && - ((bifrost_debug & BIFROST_DBG_SHADERDB) || inputs->debug)) { - char *shaderdb; - - if (ctx->arch >= 9) { - shaderdb = va_print_stats(ctx, binary->size - offset); - } else { - shaderdb = bi_print_stats(ctx, binary->size - offset); - } - - if (bifrost_debug & BIFROST_DBG_SHADERDB) - fprintf(stderr, "SHADER-DB: %s\n", shaderdb); - - if (inputs->debug) - util_debug_message(inputs->debug, SHADER_INFO, "%s", shaderdb); - - ralloc_free(shaderdb); - } - - return ctx; -} - -static void -bi_compile_variant(nir_shader *nir, - const struct panfrost_compile_inputs *inputs, - struct util_dynarray *binary, - struct hash_table_u64 *sysval_to_id, - struct pan_shader_info *info, - enum bi_idvs_mode idvs) -{ - struct bi_shader_info local_info = { - .push = &info->push, - .bifrost = &info->bifrost, - .tls_size = info->tls_size, - .sysvals = &info->sysvals, - .push_offset = info->push.count - }; - - unsigned offset = binary->size; - - /* If there is no position shader (gl_Position is not written), then - * there is no need to build a varying shader either. This case is hit - * for transform feedback only vertex shaders which only make sense with - * rasterizer discard. - */ - if ((offset == 0) && (idvs == BI_IDVS_VARYING)) - return; - - /* Software invariant: Only a secondary shader can appear at a nonzero - * offset, to keep the ABI simple. */ - assert((offset == 0) ^ (idvs == BI_IDVS_VARYING)); - - bi_context *ctx = bi_compile_variant_nir(nir, inputs, binary, sysval_to_id, local_info, idvs); - - /* A register is preloaded <==> it is live before the first block */ - bi_block *first_block = list_first_entry(&ctx->blocks, bi_block, link); - uint64_t preload = first_block->reg_live_in; - - /* If multisampling is used with a blend shader, the blend shader needs - * to access the sample coverage mask in r60 and the sample ID in r61. - * Blend shaders run in the same context as fragment shaders, so if a - * blend shader could run, we need to preload these registers - * conservatively. There is believed to be little cost to doing so, so - * do so always to avoid variants of the preload descriptor. - * - * We only do this on Valhall, as Bifrost has to update the RSD for - * multisampling w/ blend shader anyway, so this is handled in the - * driver. We could unify the paths if the cost is acceptable. 
- */ - if (nir->info.stage == MESA_SHADER_FRAGMENT && ctx->arch >= 9) - preload |= BITFIELD64_BIT(60) | BITFIELD64_BIT(61); - - info->ubo_mask |= ctx->ubo_mask; - info->tls_size = MAX2(info->tls_size, ctx->info.tls_size); - - if (idvs == BI_IDVS_VARYING) { - info->vs.secondary_enable = (binary->size > offset); - info->vs.secondary_offset = offset; - info->vs.secondary_preload = preload; - info->vs.secondary_work_reg_count = ctx->info.work_reg_count; - } else { - info->preload = preload; - info->work_reg_count = ctx->info.work_reg_count; - } - - if (idvs == BI_IDVS_POSITION && - !nir->info.internal && - nir->info.outputs_written & BITFIELD_BIT(VARYING_SLOT_PSIZ)) { - /* Find the psiz write */ - bi_instr *write = NULL; - - bi_foreach_instr_global(ctx, I) { - if (I->op == BI_OPCODE_STORE_I16 && I->seg == BI_SEG_POS) { - write = I; - break; - } - } - - assert(write != NULL); - - /* NOP it out, preserving its flow control. TODO: maybe DCE */ - if (write->flow) { - bi_builder b = bi_init_builder(ctx, bi_before_instr(write)); - bi_instr *nop = bi_nop(&b); - nop->flow = write->flow; + do { + progress = false; + mir_foreach_block(ctx, block) { + // XXX: Not yet working +// progress |= bifrost_opt_branch_fusion(ctx, block); } + } while (progress); - bi_remove_instruction(write); - - info->vs.no_psiz_offset = binary->size; - bi_pack_valhall(ctx, binary); - } - - ralloc_free(ctx); -} - -/* Decide if Index-Driven Vertex Shading should be used for a given shader */ -static bool -bi_should_idvs(nir_shader *nir, const struct panfrost_compile_inputs *inputs) -{ - /* Opt-out */ - if (inputs->no_idvs || bifrost_debug & BIFROST_DBG_NOIDVS) - return false; - - /* IDVS splits up vertex shaders, not defined on other shader stages */ - if (nir->info.stage != MESA_SHADER_VERTEX) - return false; - - /* Bifrost cannot write gl_PointSize during IDVS */ - if ((inputs->gpu_id < 0x9000) && - nir->info.outputs_written & BITFIELD_BIT(VARYING_SLOT_PSIZ)) - return false; - - /* Otherwise, IDVS is usually better */ - return true; -} - -void -bifrost_compile_shader_nir(nir_shader *nir, - const struct panfrost_compile_inputs *inputs, - struct util_dynarray *binary, - struct pan_shader_info *info) -{ - bifrost_debug = debug_get_option_bifrost_debug(); - - bi_finalize_nir(nir, inputs->gpu_id, inputs->is_blend); - struct hash_table_u64 *sysval_to_id = - panfrost_init_sysvals(&info->sysvals, - inputs->fixed_sysval_layout, - NULL); - - info->tls_size = nir->scratch_size; - info->vs.idvs = bi_should_idvs(nir, inputs); - - pan_nir_collect_varyings(nir, info); - - if (info->vs.idvs) { - bi_compile_variant(nir, inputs, binary, sysval_to_id, info, BI_IDVS_POSITION); - bi_compile_variant(nir, inputs, binary, sysval_to_id, info, BI_IDVS_VARYING); - } else { - bi_compile_variant(nir, inputs, binary, sysval_to_id, info, BI_IDVS_NONE); - } - - if (gl_shader_stage_is_compute(nir->info.stage)) { - /* Workgroups may be merged if the structure of the workgroup is - * not software visible. This is true if neither shared memory - * nor barriers are used. The hardware may be able to optimize - * compute shaders that set this flag. 
- */ - info->cs.allow_merging_workgroups = - (nir->info.shared_size == 0) && - !nir->info.uses_control_barrier && - !nir->info.uses_memory_barrier; - } - - info->ubo_mask &= (1 << nir->info.num_ubos) - 1; + schedule_program(ctx); - _mesa_hash_table_u64_destroy(sysval_to_id); +#ifdef BI_DEBUG + nir_print_shader(nir, stdout); + disassemble_bifrost(program->compiled.data, program->compiled.size, false); +#endif + return 0; } diff --git a/lib/mesa/src/panfrost/bifrost/bifrost_compile.h b/lib/mesa/src/panfrost/bifrost/bifrost_compile.h index c23b51afe..e687f64f7 100644 --- a/lib/mesa/src/panfrost/bifrost/bifrost_compile.h +++ b/lib/mesa/src/panfrost/bifrost/bifrost_compile.h @@ -1,5 +1,5 @@ /* - * Copyright (C) 2018-2019 Alyssa Rosenzweig <alyssa@rosenzweig.io> + * Copyright (C) 2018 Ryan Houdek <Sonicadvance1@gmail.com> * * Permission is hereby granted, free of charge, to any person obtaining a * copy of this software and associated documentation files (the "Software"), @@ -21,40 +21,34 @@ * SOFTWARE. */ -#ifndef __BIFROST_PUBLIC_H_ -#define __BIFROST_PUBLIC_H_ +#ifndef __bifrost_compile_h__ +#define __bifrost_compile_h__ #include "compiler/nir/nir.h" #include "util/u_dynarray.h" -#include "panfrost/util/pan_ir.h" -void -bifrost_compile_shader_nir(nir_shader *nir, - const struct panfrost_compile_inputs *inputs, - struct util_dynarray *binary, - struct pan_shader_info *info); +struct bifrost_program { + struct util_dynarray compiled; +}; + +int +bifrost_compile_shader_nir(nir_shader *nir, struct bifrost_program *program); static const nir_shader_compiler_options bifrost_nir_options = { - .lower_scmp = true, + .fuse_ffma = true, .lower_flrp16 = true, .lower_flrp32 = true, .lower_flrp64 = true, - .lower_ffract = true, .lower_fmod = true, - .lower_fdiv = true, + .lower_bitfield_extract = true, + .lower_bitfield_extract_to_shifts = true, + .lower_bitfield_insert = true, + .lower_bitfield_insert_to_shifts = true, + .lower_bitfield_reverse = true, + .lower_idiv = true, .lower_isign = true, - .lower_find_lsb = true, - .lower_ifind_msb = true, - .lower_fdph = true, - .lower_fsqrt = true, - .lower_fsign = true, - - .lower_bitfield_insert_to_shifts = true, - .lower_bitfield_extract_to_shifts = true, - .lower_insert_byte = true, - .lower_rotate = true, - + .lower_ffract = true, .lower_pack_half_2x16 = true, .lower_pack_unorm_2x16 = true, .lower_pack_snorm_2x16 = true, @@ -65,33 +59,11 @@ static const nir_shader_compiler_options bifrost_nir_options = { .lower_unpack_snorm_2x16 = true, .lower_unpack_unorm_4x8 = true, .lower_unpack_snorm_4x8 = true, - .lower_pack_split = true, - - .lower_doubles_options = nir_lower_dmod, - /* TODO: Don't lower supported 64-bit operations */ - .lower_int64_options = ~0, - /* TODO: Use IMULD on v7 */ - .lower_mul_high = true, - .lower_fisnormal = true, - .lower_uadd_carry = true, - .lower_usub_borrow = true, - - .has_fsub = true, - .has_isub = true, - .vectorize_io = true, - .vectorize_vec2_16bit = true, - .fuse_ffma16 = true, - .fuse_ffma32 = true, - .fuse_ffma64 = true, - .use_interpolated_input_intrinsics = true, - - .lower_uniforms_to_ubo = true, - - .has_cs_global_id = true, - .lower_cs_local_index_to_id = true, - .max_unroll_iterations = 32, - .force_indirect_unrolling = (nir_var_shader_in | nir_var_shader_out | nir_var_function_temp), - .force_indirect_unrolling_sampler = true, + .lower_extract_byte = true, + .lower_extract_word = true, + .lower_all_io_to_temps = true, + .lower_all_io_to_elements = true, + .vertex_id_zero_based = true, }; #endif diff --git 
a/lib/mesa/src/panfrost/bifrost/cmdline.c b/lib/mesa/src/panfrost/bifrost/cmdline.c index 2a11486cb..16415bbd7 100644 --- a/lib/mesa/src/panfrost/bifrost/cmdline.c +++ b/lib/mesa/src/panfrost/bifrost/cmdline.c @@ -1,8 +1,5 @@ /* - * Copyright (C) 2021 Collabora, Ltd. * Copyright (C) 2019 Ryan Houdek <Sonicadvance1@gmail.com> - * Copyright (C) 2014 Rob Clark <robclark@freedesktop.org> - * Copyright © 2015 Red Hat * * Permission is hereby granted, free of charge, to any person obtaining a * copy of this software and associated documentation files (the "Software"), @@ -24,11 +21,7 @@ * SOFTWARE. */ -#include <getopt.h> -#include <string.h> #include "disassemble.h" -#include "valhall/disassemble.h" -#include "compiler.h" #include "main/mtypes.h" #include "compiler/glsl/standalone.h" @@ -36,186 +29,47 @@ #include "compiler/glsl/gl_nir.h" #include "compiler/nir_types.h" #include "util/u_dynarray.h" -#include "bifrost_compile.h" - -unsigned gpu_id = 0x7212; -int verbose = 0; - -static gl_shader_stage -filename_to_stage(const char *stage) -{ - const char *ext = strrchr(stage, '.'); - - if (ext == NULL) { - fprintf(stderr, "No extension found in %s\n", stage); - exit(1); - } - - if (!strcmp(ext, ".cs") || !strcmp(ext, ".comp")) - return MESA_SHADER_COMPUTE; - else if (!strcmp(ext, ".vs") || !strcmp(ext, ".vert")) - return MESA_SHADER_VERTEX; - else if (!strcmp(ext, ".fs") || !strcmp(ext, ".frag")) - return MESA_SHADER_FRAGMENT; - else { - fprintf(stderr, "Invalid extension %s\n", ext); - exit(1); - } - unreachable("Should've returned or bailed"); -} - -static int -st_packed_uniforms_type_size(const struct glsl_type *type, bool bindless) -{ - return glsl_count_dword_slots(type, bindless); -} - -static int -glsl_type_size(const struct glsl_type *type, bool bindless) -{ - return glsl_count_attribute_slots(type, false); -} - -static void -insert_sorted(struct exec_list *var_list, nir_variable *new_var) -{ - nir_foreach_variable_in_list (var, var_list) { - if (var->data.location > new_var->data.location) { - exec_node_insert_node_before(&var->node, &new_var->node); - return; - } - } - exec_list_push_tail(var_list, &new_var->node); -} - -static void -sort_varyings(nir_shader *nir, nir_variable_mode mode) -{ - struct exec_list new_list; - exec_list_make_empty(&new_list); - nir_foreach_variable_with_modes_safe (var, nir, mode) { - exec_node_remove(&var->node); - insert_sorted(&new_list, var); - } - exec_list_append(&nir->variables, &new_list); -} - -static void -fixup_varying_slots(nir_shader *nir, nir_variable_mode mode) -{ - nir_foreach_variable_with_modes (var, nir, mode) { - if (var->data.location >= VARYING_SLOT_VAR0) { - var->data.location += 9; - } else if ((var->data.location >= VARYING_SLOT_TEX0) && - (var->data.location <= VARYING_SLOT_TEX7)) { - var->data.location += VARYING_SLOT_VAR0 - VARYING_SLOT_TEX0; - } - } -} +#include "bifrost_compile.h" static void -compile_shader(int stages, char **files) +compile_shader(char **argv) { struct gl_shader_program *prog; - nir_shader *nir[MESA_SHADER_COMPUTE + 1]; - unsigned shader_types[MESA_SHADER_COMPUTE + 1]; - - if (stages > MESA_SHADER_COMPUTE) { - fprintf(stderr, "Too many stages"); - exit(1); - } - - for (unsigned i = 0; i < stages; ++i) - shader_types[i] = filename_to_stage(files[i]); + nir_shader *nir[2]; + unsigned shader_types[2] = { + MESA_SHADER_VERTEX, + MESA_SHADER_FRAGMENT, + }; struct standalone_options options = { - .glsl_version = 300, /* ES - needed for precision */ + .glsl_version = 430, .do_link = true, - .lower_precision = true }; 
static struct gl_context local_ctx; - prog = standalone_compile_shader(&options, stages, files, &local_ctx); - - for (unsigned i = 0; i < stages; ++i) { - gl_shader_stage stage = shader_types[i]; - prog->_LinkedShaders[stage]->Program->info.stage = stage; - } - - struct util_dynarray binary; - - util_dynarray_init(&binary, NULL); - - for (unsigned i = 0; i < stages; ++i) { - nir[i] = glsl_to_nir(&local_ctx.Const, prog, shader_types[i], &bifrost_nir_options); - - if (shader_types[i] == MESA_SHADER_VERTEX) { - nir_assign_var_locations(nir[i], nir_var_shader_in, &nir[i]->num_inputs, - glsl_type_size); - sort_varyings(nir[i], nir_var_shader_out); - nir_assign_var_locations(nir[i], nir_var_shader_out, &nir[i]->num_outputs, - glsl_type_size); - fixup_varying_slots(nir[i], nir_var_shader_out); - } else if (shader_types[i] == MESA_SHADER_FRAGMENT) { - sort_varyings(nir[i], nir_var_shader_in); - nir_assign_var_locations(nir[i], nir_var_shader_in, &nir[i]->num_inputs, - glsl_type_size); - fixup_varying_slots(nir[i], nir_var_shader_in); - nir_assign_var_locations(nir[i], nir_var_shader_out, &nir[i]->num_outputs, - glsl_type_size); - } - - nir_assign_var_locations(nir[i], nir_var_uniform, &nir[i]->num_uniforms, - glsl_type_size); + prog = standalone_compile_shader(&options, 2, argv, &local_ctx); + prog->_LinkedShaders[MESA_SHADER_FRAGMENT]->Program->info.stage = MESA_SHADER_FRAGMENT; + struct bifrost_program compiled; + for (unsigned i = 0; i < 2; ++i) { + nir[i] = glsl_to_nir(&local_ctx, prog, shader_types[i], &bifrost_nir_options); NIR_PASS_V(nir[i], nir_lower_global_vars_to_local); - NIR_PASS_V(nir[i], nir_lower_io_to_temporaries, nir_shader_get_entrypoint(nir[i]), true, i == 0); - NIR_PASS_V(nir[i], nir_opt_copy_prop_vars); - NIR_PASS_V(nir[i], nir_opt_combine_stores, nir_var_all); - - NIR_PASS_V(nir[i], nir_lower_system_values); - NIR_PASS_V(nir[i], gl_nir_lower_samplers, prog); NIR_PASS_V(nir[i], nir_split_var_copies); NIR_PASS_V(nir[i], nir_lower_var_copies); - NIR_PASS_V(nir[i], nir_lower_io, nir_var_uniform, - st_packed_uniforms_type_size, - (nir_lower_io_options)0); - NIR_PASS_V(nir[i], nir_lower_uniforms_to_ubo, true, false); + NIR_PASS_V(nir[i], nir_lower_alu_to_scalar, NULL); /* before buffers and vars_to_ssa */ - NIR_PASS_V(nir[i], gl_nir_lower_images, true); + NIR_PASS_V(nir[i], gl_nir_lower_bindless_images); NIR_PASS_V(nir[i], gl_nir_lower_buffers, prog); NIR_PASS_V(nir[i], nir_opt_constant_folding); - - struct panfrost_compile_inputs inputs = { - .gpu_id = gpu_id, - .fixed_sysval_ubo = -1, - }; - struct pan_shader_info info = { 0 }; - - util_dynarray_clear(&binary); - bifrost_compile_shader_nir(nir[i], &inputs, &binary, &info); - - char *fn = NULL; - asprintf(&fn, "shader_%u.bin", i); - assert(fn != NULL); - FILE *fp = fopen(fn, "wb"); - fwrite(binary.data, 1, binary.size, fp); - fclose(fp); - free(fn); + bifrost_compile_shader_nir(nir[i], &compiled); } - - util_dynarray_fini(&binary); } -#define BI_FOURCC(ch0, ch1, ch2, ch3) ( \ - (uint32_t)(ch0) | (uint32_t)(ch1) << 8 | \ - (uint32_t)(ch2) << 16 | (uint32_t)(ch3) << 24) - static void disassemble(const char *filename) { @@ -223,122 +77,34 @@ disassemble(const char *filename) assert(fp); fseek(fp, 0, SEEK_END); - unsigned filesize = ftell(fp); + int filesize = ftell(fp); rewind(fp); - uint32_t *code = malloc(filesize); - unsigned res = fread(code, 1, filesize, fp); + unsigned char *code = malloc(filesize); + int res = fread(code, 1, filesize, fp); if (res != filesize) { printf("Couldn't read full file\n"); } - fclose(fp); - void 
*entrypoint = code; - - if (filesize && code[0] == BI_FOURCC('M', 'B', 'S', '2')) { - for (int i = 0; i < filesize / 4; ++i) { - if (code[i] != BI_FOURCC('O', 'B', 'J', 'C')) - continue; - - unsigned size = code[i + 1]; - unsigned offset = i + 2; - - entrypoint = code + offset; - filesize = size; - } - } - - if ((gpu_id >> 12) >= 9) - disassemble_valhall(stdout, entrypoint, filesize, verbose); - else - disassemble_bifrost(stdout, entrypoint, filesize, verbose); - + disassemble_bifrost(code, filesize, false); free(code); } int main(int argc, char **argv) { - int c; - if (argc < 2) { printf("Pass a command\n"); exit(1); } - static struct option longopts[] = { - { "id", optional_argument, NULL, 'i' }, - { "gpu", optional_argument, NULL, 'g' }, - { "verbose", no_argument, &verbose, 'v' }, - { NULL, 0, NULL, 0 } - }; - - static struct { - const char *name; - unsigned major, minor; - } gpus[] = { - { "G71", 6, 0 }, - { "G72", 6, 2 }, - { "G51", 7, 0 }, - { "G76", 7, 1 }, - { "G52", 7, 2 }, - { "G31", 7, 3 }, - { "G77", 9, 0 }, - { "G57", 9, 1 }, - { "G78", 9, 2 }, - { "G57", 9, 3 }, - { "G68", 9, 4 }, - { "G78AE", 9, 5 }, - }; - - while ((c = getopt_long(argc, argv, "v:", longopts, NULL)) != -1) { - - switch (c) { - case 'i': - gpu_id = atoi(optarg); - - if (!gpu_id) { - fprintf(stderr, "Expected GPU ID, got %s\n", optarg); - return 1; - } - - break; - case 'g': - gpu_id = 0; - - /* Compatibility with the Arm compiler */ - if (strncmp(optarg, "Mali-", 5) == 0) optarg += 5; - - for (unsigned i = 0; i < ARRAY_SIZE(gpus); ++i) { - if (strcmp(gpus[i].name, optarg)) continue; - - unsigned major = gpus[i].major; - unsigned minor = gpus[i].minor; - - gpu_id = (major << 12) | (minor << 8); - break; - } - - if (!gpu_id) { - fprintf(stderr, "Unknown GPU %s\n", optarg); - return 1; - } - - break; - default: - break; - } - } - - if (strcmp(argv[optind], "compile") == 0) - compile_shader(argc - optind - 1, &argv[optind + 1]); - else if (strcmp(argv[optind], "disasm") == 0) - disassemble(argv[optind + 1]); - else { - fprintf(stderr, "Unknown command. Valid: compile/disasm\n"); - return 1; - } + if (strcmp(argv[1], "compile") == 0) + compile_shader(&argv[2]); + else if (strcmp(argv[1], "disasm") == 0) + disassemble(argv[2]); + else + unreachable("Unknown command. Valid: compile/disasm"); return 0; } diff --git a/lib/mesa/src/panfrost/bifrost/disassemble.c b/lib/mesa/src/panfrost/bifrost/disassemble.c index 1bc98e405..c7e131d5d 100644 --- a/lib/mesa/src/panfrost/bifrost/disassemble.c +++ b/lib/mesa/src/panfrost/bifrost/disassemble.c @@ -31,9 +31,8 @@ #include <string.h> #include "bifrost.h" +#include "bifrost_ops.h" #include "disassemble.h" -#include "bi_print_common.h" -#include "util/compiler.h" #include "util/macros.h" // return bits (high, lo] @@ -53,6 +52,15 @@ struct bifrost_alu_inst { uint64_t reg_bits; }; +struct bifrost_regs { + unsigned uniform_const : 8; + unsigned reg2 : 6; + unsigned reg3 : 6; + unsigned reg0 : 5; + unsigned reg1 : 6; + unsigned ctrl : 4; +}; + static unsigned get_reg0(struct bifrost_regs regs) { if (regs.ctrl == 0) @@ -66,74 +74,187 @@ static unsigned get_reg1(struct bifrost_regs regs) return regs.reg0 <= regs.reg1 ? regs.reg1 : 63 - regs.reg1; } +enum bifrost_reg_write_unit { + REG_WRITE_NONE = 0, // don't write + REG_WRITE_TWO, // write using reg2 + REG_WRITE_THREE, // write using reg3 +}; + // this represents the decoded version of the ctrl register field. 
struct bifrost_reg_ctrl { bool read_reg0; bool read_reg1; - struct bifrost_reg_ctrl_23 slot23; + bool read_reg3; + enum bifrost_reg_write_unit fma_write_unit; + enum bifrost_reg_write_unit add_write_unit; + bool clause_start; }; -static void dump_header(FILE *fp, struct bifrost_header header, bool verbose) -{ - fprintf(fp, "ds(%u) ", header.dependency_slot); +enum fma_src_type { + FMA_ONE_SRC, + FMA_TWO_SRC, + FMA_FADD, + FMA_FMINMAX, + FMA_FADD16, + FMA_FMINMAX16, + FMA_FCMP, + FMA_FCMP16, + FMA_THREE_SRC, + FMA_FMA, + FMA_FMA16, + FMA_FOUR_SRC, + FMA_FMA_MSCALE, + FMA_SHIFT_ADD64, +}; - if (header.staging_barrier) - fprintf(fp, "osrb "); +struct fma_op_info { + unsigned op; + char name[30]; + enum fma_src_type src_type; +}; - fprintf(fp, "%s ", bi_flow_control_name(header.flow_control)); +enum add_src_type { + ADD_ONE_SRC, + ADD_TWO_SRC, + ADD_FADD, + ADD_FMINMAX, + ADD_FADD16, + ADD_FMINMAX16, + ADD_THREE_SRC, + ADD_FADDMscale, + ADD_FCMP, + ADD_FCMP16, + ADD_TEX_COMPACT, // texture instruction with embedded sampler + ADD_TEX, // texture instruction with sampler/etc. in uniform port + ADD_VARYING_INTERP, + ADD_BLENDING, + ADD_LOAD_ATTR, + ADD_VARYING_ADDRESS, + ADD_BRANCH, +}; - if (header.suppress_inf) - fprintf(fp, "inf_suppress "); - if (header.suppress_nan) - fprintf(fp, "nan_suppress "); - - if (header.flush_to_zero == BIFROST_FTZ_DX11) - fprintf(fp, "ftz_dx11 "); - else if (header.flush_to_zero == BIFROST_FTZ_ALWAYS) - fprintf(fp, "ftz_hsa "); - if (header.flush_to_zero == BIFROST_FTZ_ABRUPT) - fprintf(fp, "ftz_au "); - - assert(!header.zero1); - assert(!header.zero2); - - if (header.float_exceptions == BIFROST_EXCEPTIONS_DISABLED) - fprintf(fp, "fpe_ts "); - else if (header.float_exceptions == BIFROST_EXCEPTIONS_PRECISE_DIVISION) - fprintf(fp, "fpe_pd "); - else if (header.float_exceptions == BIFROST_EXCEPTIONS_PRECISE_SQRT) - fprintf(fp, "fpe_psqr "); - - if (header.message_type) - fprintf(fp, "%s ", bi_message_type_name(header.message_type)); - - if (header.terminate_discarded_threads) - fprintf(fp, "td "); - - if (header.next_clause_prefetch) - fprintf(fp, "ncph "); - - if (header.next_message_type) - fprintf(fp, "next_%s ", bi_message_type_name(header.next_message_type)); - if (header.dependency_wait != 0) { - fprintf(fp, "dwb("); +struct add_op_info { + unsigned op; + char name[30]; + enum add_src_type src_type; + bool has_data_reg; +}; + +struct bifrost_tex_ctrl { + unsigned sampler_index : 4; // also used to signal indirects + unsigned tex_index : 7; + bool no_merge_index : 1; // whether to merge (direct) sampler & texture indices + bool filter : 1; // use the usual filtering pipeline (0 for texelFetch & textureGather) + unsigned unk0 : 2; + bool texel_offset : 1; // *Offset() + bool is_shadow : 1; + bool is_array : 1; + unsigned tex_type : 2; // 2D, 3D, Cube, Buffer + bool compute_lod : 1; // 0 for *Lod() + bool not_supply_lod : 1; // 0 for *Lod() or when a bias is applied + bool calc_gradients : 1; // 0 for *Grad() + unsigned unk1 : 1; + unsigned result_type : 4; // integer, unsigned, float TODO: why is this 4 bits? + unsigned unk2 : 4; +}; + +struct bifrost_dual_tex_ctrl { + unsigned sampler_index0 : 2; + unsigned unk0 : 2; + unsigned tex_index0 : 2; + unsigned sampler_index1 : 2; + unsigned tex_index1 : 2; + unsigned unk1 : 22; +}; + +enum branch_bit_size { + BR_SIZE_32 = 0, + BR_SIZE_16XX = 1, + BR_SIZE_16YY = 2, + // For the above combinations of bitsize and location, an extra bit is + // encoded via comparing the sources. 
The only possible source of ambiguity + // would be if the sources were the same, but then the branch condition + // would be always true or always false anyways, so we can ignore it. But + // this no longer works when comparing the y component to the x component, + // since it's valid to compare the y component of a source against its own + // x component. Instead, the extra bit is encoded via an extra bitsize. + BR_SIZE_16YX0 = 3, + BR_SIZE_16YX1 = 4, + BR_SIZE_32_AND_16X = 5, + BR_SIZE_32_AND_16Y = 6, + // Used for comparisons with zero and always-true, see below. I think this + // only works for integer comparisons. + BR_SIZE_ZERO = 7, +}; + +void dump_header(struct bifrost_header header, bool verbose); +void dump_instr(const struct bifrost_alu_inst *instr, struct bifrost_regs next_regs, uint64_t *consts, + unsigned data_reg, unsigned offset, bool verbose); +bool dump_clause(uint32_t *words, unsigned *size, unsigned offset, bool verbose); + +void dump_header(struct bifrost_header header, bool verbose) +{ + if (header.clause_type != 0) { + printf("id(%du) ", header.scoreboard_index); + } + + if (header.scoreboard_deps != 0) { + printf("next-wait("); bool first = true; for (unsigned i = 0; i < 8; i++) { - if (header.dependency_wait & (1 << i)) { + if (header.scoreboard_deps & (1 << i)) { if (!first) { - fprintf(fp, ", "); + printf(", "); } - fprintf(fp, "%u", i); + printf("%d", i); first = false; } } - fprintf(fp, ") "); + printf(") "); } - fprintf(fp, "\n"); + if (header.datareg_writebarrier) + printf("data-reg-barrier "); + + if (!header.no_end_of_shader) + printf("eos "); + + if (!header.back_to_back) { + printf("nbb "); + if (header.branch_cond) + printf("branch-cond "); + else + printf("branch-uncond "); + } + + if (header.elide_writes) + printf("we "); + + if (header.suppress_inf) + printf("suppress-inf "); + if (header.suppress_nan) + printf("suppress-nan "); + + if (header.unk0) + printf("unk0 "); + if (header.unk1) + printf("unk1 "); + if (header.unk2) + printf("unk2 "); + if (header.unk3) + printf("unk3 "); + if (header.unk4) + printf("unk4 "); + + printf("\n"); + + if (verbose) { + printf("# clause type %d, next clause type %d\n", + header.clause_type, header.next_clause_type); + } } -static struct bifrost_reg_ctrl DecodeRegCtrl(FILE *fp, struct bifrost_regs regs, bool first) +static struct bifrost_reg_ctrl DecodeRegCtrl(struct bifrost_regs regs) { struct bifrost_reg_ctrl decoded = {}; unsigned ctrl; @@ -145,199 +266,160 @@ static struct bifrost_reg_ctrl DecodeRegCtrl(FILE *fp, struct bifrost_regs regs, ctrl = regs.ctrl; decoded.read_reg0 = decoded.read_reg1 = true; } + switch (ctrl) { + case 1: + decoded.fma_write_unit = REG_WRITE_TWO; + break; + case 2: + case 3: + decoded.fma_write_unit = REG_WRITE_TWO; + decoded.read_reg3 = true; + break; + case 4: + decoded.read_reg3 = true; + break; + case 5: + decoded.add_write_unit = REG_WRITE_TWO; + break; + case 6: + decoded.add_write_unit = REG_WRITE_TWO; + decoded.read_reg3 = true; + break; + case 8: + decoded.clause_start = true; + break; + case 9: + decoded.fma_write_unit = REG_WRITE_TWO; + decoded.clause_start = true; + break; + case 11: + break; + case 12: + decoded.read_reg3 = true; + decoded.clause_start = true; + break; + case 13: + decoded.add_write_unit = REG_WRITE_TWO; + decoded.clause_start = true; + break; - /* Modify control based on state */ - if (first) - ctrl = (ctrl & 0x7) | ((ctrl & 0x8) << 1); - else if (regs.reg2 == regs.reg3) - ctrl += 16; - - decoded.slot23 = bifrost_reg_ctrl_lut[ctrl]; - ASSERTED struct 
bifrost_reg_ctrl_23 reserved = { 0 }; - assert(memcmp(&decoded.slot23, &reserved, sizeof(reserved))); + case 7: + case 15: + decoded.fma_write_unit = REG_WRITE_THREE; + decoded.add_write_unit = REG_WRITE_TWO; + break; + default: + printf("# unknown reg ctrl %d\n", ctrl); + } return decoded; } -static void dump_regs(FILE *fp, struct bifrost_regs srcs, bool first) -{ - struct bifrost_reg_ctrl ctrl = DecodeRegCtrl(fp, srcs, first); - fprintf(fp, " # "); - if (ctrl.read_reg0) - fprintf(fp, "slot 0: r%u ", get_reg0(srcs)); - if (ctrl.read_reg1) - fprintf(fp, "slot 1: r%u ", get_reg1(srcs)); - - const char *slot3_fma = ctrl.slot23.slot3_fma ? "FMA" : "ADD"; - - if (ctrl.slot23.slot2 == BIFROST_OP_WRITE) - fprintf(fp, "slot 2: r%u (write FMA) ", srcs.reg2); - else if (ctrl.slot23.slot2 == BIFROST_OP_WRITE_LO) - fprintf(fp, "slot 2: r%u (write lo FMA) ", srcs.reg2); - else if (ctrl.slot23.slot2 == BIFROST_OP_WRITE_HI) - fprintf(fp, "slot 2: r%u (write hi FMA) ", srcs.reg2); - else if (ctrl.slot23.slot2 == BIFROST_OP_READ) - fprintf(fp, "slot 2: r%u (read) ", srcs.reg2); - - if (ctrl.slot23.slot3 == BIFROST_OP_WRITE) - fprintf(fp, "slot 3: r%u (write %s) ", srcs.reg3, slot3_fma); - else if (ctrl.slot23.slot3 == BIFROST_OP_WRITE_LO) - fprintf(fp, "slot 3: r%u (write lo %s) ", srcs.reg3, slot3_fma); - else if (ctrl.slot23.slot3 == BIFROST_OP_WRITE_HI) - fprintf(fp, "slot 3: r%u (write hi %s) ", srcs.reg3, slot3_fma); - - if (srcs.fau_idx) - fprintf(fp, "fau %X ", srcs.fau_idx); - - fprintf(fp, "\n"); -} - -static void -bi_disasm_dest_mask(FILE *fp, enum bifrost_reg_op op) +// Pass in the add_write_unit or fma_write_unit, and this returns which register +// the ADD/FMA units are writing to +static unsigned GetRegToWrite(enum bifrost_reg_write_unit unit, struct bifrost_regs regs) { - if (op == BIFROST_OP_WRITE_LO) - fprintf(fp, ".h0"); - else if (op == BIFROST_OP_WRITE_HI) - fprintf(fp, ".h1"); + switch (unit) { + case REG_WRITE_TWO: + return regs.reg2; + case REG_WRITE_THREE: + return regs.reg3; + default: /* REG_WRITE_NONE */ + assert(0); + return 0; + } } -void -bi_disasm_dest_fma(FILE *fp, struct bifrost_regs *next_regs, bool last) +static void dump_regs(struct bifrost_regs srcs) { - /* If this is the last instruction, next_regs points to the first reg entry. 
*/ - struct bifrost_reg_ctrl ctrl = DecodeRegCtrl(fp, *next_regs, last); - if (ctrl.slot23.slot2 >= BIFROST_OP_WRITE) { - fprintf(fp, "r%u:t0", next_regs->reg2); - bi_disasm_dest_mask(fp, ctrl.slot23.slot2); - } else if (ctrl.slot23.slot3 >= BIFROST_OP_WRITE && ctrl.slot23.slot3_fma) { - fprintf(fp, "r%u:t0", next_regs->reg3); - bi_disasm_dest_mask(fp, ctrl.slot23.slot3); - } else - fprintf(fp, "t0"); -} + struct bifrost_reg_ctrl ctrl = DecodeRegCtrl(srcs); + printf("# "); + if (ctrl.read_reg0) + printf("port 0: R%d ", get_reg0(srcs)); + if (ctrl.read_reg1) + printf("port 1: R%d ", get_reg1(srcs)); + + if (ctrl.fma_write_unit == REG_WRITE_TWO) + printf("port 2: R%d (write FMA) ", srcs.reg2); + else if (ctrl.add_write_unit == REG_WRITE_TWO) + printf("port 2: R%d (write ADD) ", srcs.reg2); + + if (ctrl.fma_write_unit == REG_WRITE_THREE) + printf("port 3: R%d (write FMA) ", srcs.reg3); + else if (ctrl.add_write_unit == REG_WRITE_THREE) + printf("port 3: R%d (write ADD) ", srcs.reg3); + else if (ctrl.read_reg3) + printf("port 3: R%d (read) ", srcs.reg3); + + if (srcs.uniform_const) { + if (srcs.uniform_const & 0x80) { + printf("uniform: U%d", (srcs.uniform_const & 0x7f) * 2); + } + } -void -bi_disasm_dest_add(FILE *fp, struct bifrost_regs *next_regs, bool last) -{ - /* If this is the last instruction, next_regs points to the first reg entry. */ - struct bifrost_reg_ctrl ctrl = DecodeRegCtrl(fp, *next_regs, last); - - if (ctrl.slot23.slot3 >= BIFROST_OP_WRITE && !ctrl.slot23.slot3_fma) { - fprintf(fp, "r%u:t1", next_regs->reg3); - bi_disasm_dest_mask(fp, ctrl.slot23.slot3); - } else - fprintf(fp, "t1"); + printf("\n"); } - -static void dump_const_imm(FILE *fp, uint32_t imm) +static void dump_const_imm(uint32_t imm) { union { float f; uint32_t i; } fi; fi.i = imm; - fprintf(fp, "0x%08x /* %f */", imm, fi.f); + printf("0x%08x /* %f */", imm, fi.f); } -static void -dump_pc_imm(FILE *fp, uint64_t imm, unsigned branch_offset, enum bi_constmod mod, bool high32) +static uint64_t get_const(uint64_t *consts, struct bifrost_regs srcs) { - if (mod == BI_CONSTMOD_PC_HI && !high32) { - dump_const_imm(fp, imm); - return; - } - - /* 60-bit sign-extend */ - uint64_t zx64 = (imm << 4); - int64_t sx64 = zx64; - sx64 >>= 4; - - /* 28-bit sign extend x 2 */ - uint32_t imm32[2] = { (uint32_t) imm, (uint32_t) (imm >> 32) }; - uint32_t zx32[2] = { imm32[0] << 4, imm32[1] << 4 }; - int32_t sx32[2] = { zx32[0], zx32[1] }; - sx32[0] >>= 4; - sx32[1] >>= 4; - - int64_t offs = 0; - - switch (mod) { - case BI_CONSTMOD_PC_LO: - offs = sx64; + unsigned low_bits = srcs.uniform_const & 0xf; + uint64_t imm; + switch (srcs.uniform_const >> 4) { + case 4: + imm = consts[0]; + break; + case 5: + imm = consts[1]; + break; + case 6: + imm = consts[2]; break; - case BI_CONSTMOD_PC_HI: - offs = sx32[1]; + case 7: + imm = consts[3]; break; - case BI_CONSTMOD_PC_LO_HI: - offs = sx32[high32]; + case 2: + imm = consts[4]; + break; + case 3: + imm = consts[5]; break; default: - unreachable("Invalid PC modifier"); + assert(0); + break; } - - assert((offs & 15) == 0); - fprintf(fp, "clause_%" PRId64, branch_offset + (offs / 16)); - - if (mod == BI_CONSTMOD_PC_LO && high32) - fprintf(fp, " >> 32"); - - /* While technically in spec, referencing the current clause as (pc + - * 0) likely indicates an unintended infinite loop */ - if (offs == 0) - fprintf(fp, " /* XXX: likely an infinite loop */"); -} - -/* Convert an index to an embedded constant in FAU-RAM to the index of the - * embedded constant. No, it's not in order. Yes, really. 
*/ - -static unsigned -const_fau_to_idx(unsigned fau_value) -{ - unsigned map[8] = { - ~0, ~0, 4, 5, 0, 1, 2, 3 - }; - - assert(map[fau_value] < 6); - return map[fau_value]; + return imm | low_bits; } -static void dump_fau_src(FILE *fp, struct bifrost_regs srcs, unsigned branch_offset, struct bi_constants *consts, bool high32) +static void dump_uniform_const_src(struct bifrost_regs srcs, uint64_t *consts, bool high32) { - if (srcs.fau_idx & 0x80) { - unsigned uniform = (srcs.fau_idx & 0x7f); - fprintf(fp, "u%u.w%u", uniform, high32); - } else if (srcs.fau_idx >= 0x20) { - unsigned idx = const_fau_to_idx(srcs.fau_idx >> 4); - uint64_t imm = consts->raw[idx]; - imm |= (srcs.fau_idx & 0xf); - if (consts->mods[idx] != BI_CONSTMOD_NONE) - dump_pc_imm(fp, imm, branch_offset, consts->mods[idx], high32); - else if (high32) - dump_const_imm(fp, imm >> 32); + if (srcs.uniform_const & 0x80) { + unsigned uniform = (srcs.uniform_const & 0x7f) * 2; + printf("U%d", uniform + (high32 ? 1 : 0)); + } else if (srcs.uniform_const >= 0x20) { + uint64_t imm = get_const(consts, srcs); + if (high32) + dump_const_imm(imm >> 32); else - dump_const_imm(fp, imm); + dump_const_imm(imm); } else { - switch (srcs.fau_idx) { + switch (srcs.uniform_const) { case 0: - fprintf(fp, "#0"); - break; - case 1: - fprintf(fp, "lane_id"); - break; - case 2: - fprintf(fp, "warp_id"); - break; - case 3: - fprintf(fp, "core_id"); - break; - case 4: - fprintf(fp, "framebuffer_size"); + printf("0"); break; case 5: - fprintf(fp, "atest_datum"); + printf("atest-data"); break; case 6: - fprintf(fp, "sample"); + printf("sample-ptr"); break; case 8: case 9: @@ -347,113 +429,1640 @@ static void dump_fau_src(FILE *fp, struct bifrost_regs srcs, unsigned branch_off case 13: case 14: case 15: - fprintf(fp, "blend_descriptor_%u", (unsigned) srcs.fau_idx - 8); + printf("blend-descriptor%u", (unsigned) srcs.uniform_const - 8); break; default: - fprintf(fp, "XXX - reserved%u", (unsigned) srcs.fau_idx); + printf("unkConst%u", (unsigned) srcs.uniform_const); break; } if (high32) - fprintf(fp, ".y"); + printf(".y"); else - fprintf(fp, ".x"); + printf(".x"); } } -void -dump_src(FILE *fp, unsigned src, struct bifrost_regs srcs, unsigned branch_offset, struct bi_constants *consts, bool isFMA) +static void dump_src(unsigned src, struct bifrost_regs srcs, uint64_t *consts, bool isFMA) { switch (src) { case 0: - fprintf(fp, "r%u", get_reg0(srcs)); + printf("R%d", get_reg0(srcs)); break; case 1: - fprintf(fp, "r%u", get_reg1(srcs)); + printf("R%d", get_reg1(srcs)); break; case 2: - fprintf(fp, "r%u", srcs.reg2); + printf("R%d", srcs.reg3); break; case 3: if (isFMA) - fprintf(fp, "#0"); + printf("0"); else - fprintf(fp, "t"); // i.e. the output of FMA this cycle + printf("T"); // i.e. the output of FMA this cycle break; case 4: - dump_fau_src(fp, srcs, branch_offset, consts, false); + dump_uniform_const_src(srcs, consts, false); break; case 5: - dump_fau_src(fp, srcs, branch_offset, consts, true); + dump_uniform_const_src(srcs, consts, true); break; case 6: - fprintf(fp, "t0"); + printf("T0"); break; case 7: - fprintf(fp, "t1"); + printf("T1"); break; } } -/* Tables for decoding M0, or if M0 == 7, M1 respectively. - * - * XXX: It's not clear if the third entry of M1_table corresponding to (7, 2) - * should have PC_LO_HI in the EC1 slot, or it's a weird hybrid mode? I would - * say this needs testing but no code should ever actually use this mode. 
- */ +static void dump_output_mod(unsigned mod) +{ + switch (mod) { + case 0: + break; + case 1: + printf(".clamp_0_inf"); + break; // max(out, 0) + case 2: + printf(".clamp_m1_1"); + break; // clamp(out, -1, 1) + case 3: + printf(".clamp_0_1"); + break; // clamp(out, 0, 1) + default: + break; + } +} -static const enum bi_constmod M1_table[7][2] = { - { BI_CONSTMOD_NONE, BI_CONSTMOD_NONE }, - { BI_CONSTMOD_PC_LO, BI_CONSTMOD_NONE }, - { BI_CONSTMOD_PC_LO, BI_CONSTMOD_PC_LO }, - { ~0, ~0 }, - { BI_CONSTMOD_PC_HI, BI_CONSTMOD_NONE }, - { BI_CONSTMOD_PC_HI, BI_CONSTMOD_PC_HI }, - { BI_CONSTMOD_PC_LO, BI_CONSTMOD_NONE }, -}; +static void dump_minmax_mode(unsigned mod) +{ + switch (mod) { + case 0: + /* Same as fmax() and fmin() -- return the other number if any + * number is NaN. Also always return +0 if one argument is +0 and + * the other is -0. + */ + break; + case 1: + /* Instead of never returning a NaN, always return one. The + * "greater"/"lesser" NaN is always returned, first by checking the + * sign and then the mantissa bits. + */ + printf(".nan_wins"); + break; + case 2: + /* For max, implement src0 > src1 ? src0 : src1 + * For min, implement src0 < src1 ? src0 : src1 + * + * This includes handling NaN's and signedness of 0 differently + * from above, since +0 and -0 compare equal and comparisons always + * return false for NaN's. As a result, this mode is *not* + * commutative. + */ + printf(".src1_wins"); + break; + case 3: + /* For max, implement src0 < src1 ? src1 : src0 + * For min, implement src0 > src1 ? src1 : src0 + */ + printf(".src0_wins"); + break; + default: + break; + } +} + +static void dump_round_mode(unsigned mod) +{ + switch (mod) { + case 0: + /* roundTiesToEven, the IEEE default. */ + break; + case 1: + /* roundTowardPositive in the IEEE spec. */ + printf(".round_pos"); + break; + case 2: + /* roundTowardNegative in the IEEE spec. */ + printf(".round_neg"); + break; + case 3: + /* roundTowardZero in the IEEE spec. */ + printf(".round_zero"); + break; + default: + break; + } +} -static const enum bi_constmod M2_table[4][2] = { - { BI_CONSTMOD_PC_LO_HI, BI_CONSTMOD_NONE }, - { BI_CONSTMOD_PC_LO_HI, BI_CONSTMOD_PC_HI }, - { BI_CONSTMOD_PC_LO_HI, BI_CONSTMOD_PC_LO_HI }, - { BI_CONSTMOD_PC_LO_HI, BI_CONSTMOD_PC_HI }, +static const struct fma_op_info FMAOpInfos[] = { + { 0x00000, "FMA.f32", FMA_FMA }, + { 0x40000, "MAX.f32", FMA_FMINMAX }, + { 0x44000, "MIN.f32", FMA_FMINMAX }, + { 0x48000, "FCMP.GL", FMA_FCMP }, + { 0x4c000, "FCMP.D3D", FMA_FCMP }, + { 0x4ff98, "ADD.i32", FMA_TWO_SRC }, + { 0x4ffd8, "SUB.i32", FMA_TWO_SRC }, + { 0x4fff0, "SUBB.i32", FMA_TWO_SRC }, + { 0x50000, "FMA_MSCALE", FMA_FMA_MSCALE }, + { 0x58000, "ADD.f32", FMA_FADD }, + { 0x5c000, "CSEL.FEQ.f32", FMA_FOUR_SRC }, + { 0x5c200, "CSEL.FGT.f32", FMA_FOUR_SRC }, + { 0x5c400, "CSEL.FGE.f32", FMA_FOUR_SRC }, + { 0x5c600, "CSEL.IEQ.f32", FMA_FOUR_SRC }, + { 0x5c800, "CSEL.IGT.i32", FMA_FOUR_SRC }, + { 0x5ca00, "CSEL.IGE.i32", FMA_FOUR_SRC }, + { 0x5cc00, "CSEL.UGT.i32", FMA_FOUR_SRC }, + { 0x5ce00, "CSEL.UGE.i32", FMA_FOUR_SRC }, + { 0x5d8d0, "ICMP.D3D.GT.v2i16", FMA_TWO_SRC }, + { 0x5d9d0, "UCMP.D3D.GT.v2i16", FMA_TWO_SRC }, + { 0x5dad0, "ICMP.D3D.GE.v2i16", FMA_TWO_SRC }, + { 0x5dbd0, "UCMP.D3D.GE.v2i16", FMA_TWO_SRC }, + { 0x5dcd0, "ICMP.D3D.EQ.v2i16", FMA_TWO_SRC }, + { 0x5de40, "ICMP.GL.GT.i32", FMA_TWO_SRC }, // src0 > src1 ? 
1 : 0 + { 0x5de48, "ICMP.GL.GE.i32", FMA_TWO_SRC }, + { 0x5de50, "UCMP.GL.GT.i32", FMA_TWO_SRC }, + { 0x5de58, "UCMP.GL.GE.i32", FMA_TWO_SRC }, + { 0x5de60, "ICMP.GL.EQ.i32", FMA_TWO_SRC }, + { 0x5dec0, "ICMP.D3D.GT.i32", FMA_TWO_SRC }, // src0 > src1 ? ~0 : 0 + { 0x5dec8, "ICMP.D3D.GE.i32", FMA_TWO_SRC }, + { 0x5ded0, "UCMP.D3D.GT.i32", FMA_TWO_SRC }, + { 0x5ded8, "UCMP.D3D.GE.i32", FMA_TWO_SRC }, + { 0x5dee0, "ICMP.D3D.EQ.i32", FMA_TWO_SRC }, + { 0x60200, "RSHIFT_NAND.i32", FMA_THREE_SRC }, + { 0x603c0, "RSHIFT_NAND.v2i16", FMA_THREE_SRC }, + { 0x60e00, "RSHIFT_OR.i32", FMA_THREE_SRC }, + { 0x60fc0, "RSHIFT_OR.v2i16", FMA_THREE_SRC }, + { 0x61200, "RSHIFT_AND.i32", FMA_THREE_SRC }, + { 0x613c0, "RSHIFT_AND.v2i16", FMA_THREE_SRC }, + { 0x61e00, "RSHIFT_NOR.i32", FMA_THREE_SRC }, // ~((src0 << src2) | src1) + { 0x61fc0, "RSHIFT_NOR.v2i16", FMA_THREE_SRC }, // ~((src0 << src2) | src1) + { 0x62200, "LSHIFT_NAND.i32", FMA_THREE_SRC }, + { 0x623c0, "LSHIFT_NAND.v2i16", FMA_THREE_SRC }, + { 0x62e00, "LSHIFT_OR.i32", FMA_THREE_SRC }, // (src0 << src2) | src1 + { 0x62fc0, "LSHIFT_OR.v2i16", FMA_THREE_SRC }, // (src0 << src2) | src1 + { 0x63200, "LSHIFT_AND.i32", FMA_THREE_SRC }, // (src0 << src2) & src1 + { 0x633c0, "LSHIFT_AND.v2i16", FMA_THREE_SRC }, + { 0x63e00, "LSHIFT_NOR.i32", FMA_THREE_SRC }, + { 0x63fc0, "LSHIFT_NOR.v2i16", FMA_THREE_SRC }, + { 0x64200, "RSHIFT_XOR.i32", FMA_THREE_SRC }, + { 0x643c0, "RSHIFT_XOR.v2i16", FMA_THREE_SRC }, + { 0x64600, "RSHIFT_XNOR.i32", FMA_THREE_SRC }, // ~((src0 >> src2) ^ src1) + { 0x647c0, "RSHIFT_XNOR.v2i16", FMA_THREE_SRC }, // ~((src0 >> src2) ^ src1) + { 0x64a00, "LSHIFT_XOR.i32", FMA_THREE_SRC }, + { 0x64bc0, "LSHIFT_XOR.v2i16", FMA_THREE_SRC }, + { 0x64e00, "LSHIFT_XNOR.i32", FMA_THREE_SRC }, // ~((src0 >> src2) ^ src1) + { 0x64fc0, "LSHIFT_XNOR.v2i16", FMA_THREE_SRC }, // ~((src0 >> src2) ^ src1) + { 0x65200, "LSHIFT_ADD.i32", FMA_THREE_SRC }, + { 0x65600, "LSHIFT_SUB.i32", FMA_THREE_SRC }, // (src0 << src2) - src1 + { 0x65a00, "LSHIFT_RSUB.i32", FMA_THREE_SRC }, // src1 - (src0 << src2) + { 0x65e00, "RSHIFT_ADD.i32", FMA_THREE_SRC }, + { 0x66200, "RSHIFT_SUB.i32", FMA_THREE_SRC }, + { 0x66600, "RSHIFT_RSUB.i32", FMA_THREE_SRC }, + { 0x66a00, "ARSHIFT_ADD.i32", FMA_THREE_SRC }, + { 0x66e00, "ARSHIFT_SUB.i32", FMA_THREE_SRC }, + { 0x67200, "ARSHIFT_RSUB.i32", FMA_THREE_SRC }, + { 0x80000, "FMA.v2f16", FMA_FMA16 }, + { 0xc0000, "MAX.v2f16", FMA_FMINMAX16 }, + { 0xc4000, "MIN.v2f16", FMA_FMINMAX16 }, + { 0xc8000, "FCMP.GL", FMA_FCMP16 }, + { 0xcc000, "FCMP.D3D", FMA_FCMP16 }, + { 0xcf900, "ADD.v2i16", FMA_TWO_SRC }, + { 0xcfc10, "ADDC.i32", FMA_TWO_SRC }, + { 0xcfd80, "ADD.i32.i16.X", FMA_TWO_SRC }, + { 0xcfd90, "ADD.i32.u16.X", FMA_TWO_SRC }, + { 0xcfdc0, "ADD.i32.i16.Y", FMA_TWO_SRC }, + { 0xcfdd0, "ADD.i32.u16.Y", FMA_TWO_SRC }, + { 0xd8000, "ADD.v2f16", FMA_FADD16 }, + { 0xdc000, "CSEL.FEQ.v2f16", FMA_FOUR_SRC }, + { 0xdc200, "CSEL.FGT.v2f16", FMA_FOUR_SRC }, + { 0xdc400, "CSEL.FGE.v2f16", FMA_FOUR_SRC }, + { 0xdc600, "CSEL.IEQ.v2f16", FMA_FOUR_SRC }, + { 0xdc800, "CSEL.IGT.v2i16", FMA_FOUR_SRC }, + { 0xdca00, "CSEL.IGE.v2i16", FMA_FOUR_SRC }, + { 0xdcc00, "CSEL.UGT.v2i16", FMA_FOUR_SRC }, + { 0xdce00, "CSEL.UGE.v2i16", FMA_FOUR_SRC }, + { 0xdd000, "F32_TO_F16", FMA_TWO_SRC }, + { 0xe0046, "F16_TO_I16.XX", FMA_ONE_SRC }, + { 0xe0047, "F16_TO_U16.XX", FMA_ONE_SRC }, + { 0xe004e, "F16_TO_I16.YX", FMA_ONE_SRC }, + { 0xe004f, "F16_TO_U16.YX", FMA_ONE_SRC }, + { 0xe0056, "F16_TO_I16.XY", FMA_ONE_SRC }, + { 0xe0057, "F16_TO_U16.XY", FMA_ONE_SRC }, + { 
0xe005e, "F16_TO_I16.YY", FMA_ONE_SRC }, + { 0xe005f, "F16_TO_U16.YY", FMA_ONE_SRC }, + { 0xe00c0, "I16_TO_F16.XX", FMA_ONE_SRC }, + { 0xe00c1, "U16_TO_F16.XX", FMA_ONE_SRC }, + { 0xe00c8, "I16_TO_F16.YX", FMA_ONE_SRC }, + { 0xe00c9, "U16_TO_F16.YX", FMA_ONE_SRC }, + { 0xe00d0, "I16_TO_F16.XY", FMA_ONE_SRC }, + { 0xe00d1, "U16_TO_F16.XY", FMA_ONE_SRC }, + { 0xe00d8, "I16_TO_F16.YY", FMA_ONE_SRC }, + { 0xe00d9, "U16_TO_F16.YY", FMA_ONE_SRC }, + { 0xe0136, "F32_TO_I32", FMA_ONE_SRC }, + { 0xe0137, "F32_TO_U32", FMA_ONE_SRC }, + { 0xe0178, "I32_TO_F32", FMA_ONE_SRC }, + { 0xe0179, "U32_TO_F32", FMA_ONE_SRC }, + { 0xe0198, "I16_TO_I32.X", FMA_ONE_SRC }, + { 0xe0199, "U16_TO_U32.X", FMA_ONE_SRC }, + { 0xe019a, "I16_TO_I32.Y", FMA_ONE_SRC }, + { 0xe019b, "U16_TO_U32.Y", FMA_ONE_SRC }, + { 0xe019c, "I16_TO_F32.X", FMA_ONE_SRC }, + { 0xe019d, "U16_TO_F32.X", FMA_ONE_SRC }, + { 0xe019e, "I16_TO_F32.Y", FMA_ONE_SRC }, + { 0xe019f, "U16_TO_F32.Y", FMA_ONE_SRC }, + { 0xe01a2, "F16_TO_F32.X", FMA_ONE_SRC }, + { 0xe01a3, "F16_TO_F32.Y", FMA_ONE_SRC }, + { 0xe032c, "NOP", FMA_ONE_SRC }, + { 0xe032d, "MOV", FMA_ONE_SRC }, + { 0xe032f, "SWZ.YY.v2i16", FMA_ONE_SRC }, + // From the ARM patent US20160364209A1: + // "Decompose v (the input) into numbers x1 and s such that v = x1 * 2^s, + // and x1 is a floating point value in a predetermined range where the + // value 1 is within the range and not at one extremity of the range (e.g. + // choose a range where 1 is towards middle of range)." + // + // This computes x1. + { 0xe0345, "LOG_FREXPM", FMA_ONE_SRC }, + // Given a floating point number m * 2^e, returns m * 2^{-1}. This is + // exactly the same as the mantissa part of frexp(). + { 0xe0365, "FRCP_FREXPM", FMA_ONE_SRC }, + // Given a floating point number m * 2^e, returns m * 2^{-2} if e is even, + // and m * 2^{-1} if e is odd. In other words, scales by powers of 4 until + // within the range [0.25, 1). Used for square-root and reciprocal + // square-root. + { 0xe0375, "FSQRT_FREXPM", FMA_ONE_SRC }, + // Given a floating point number m * 2^e, computes -e - 1 as an integer. + // Zero and infinity/NaN return 0. + { 0xe038d, "FRCP_FREXPE", FMA_ONE_SRC }, + // Computes floor(e/2) + 1. + { 0xe03a5, "FSQRT_FREXPE", FMA_ONE_SRC }, + // Given a floating point number m * 2^e, computes -floor(e/2) - 1 as an + // integer. + { 0xe03ad, "FRSQ_FREXPE", FMA_ONE_SRC }, + { 0xe03c5, "LOG_FREXPE", FMA_ONE_SRC }, + { 0xe03fa, "CLZ", FMA_ONE_SRC }, + { 0xe0b80, "IMAX3", FMA_THREE_SRC }, + { 0xe0bc0, "UMAX3", FMA_THREE_SRC }, + { 0xe0c00, "IMIN3", FMA_THREE_SRC }, + { 0xe0c40, "UMIN3", FMA_THREE_SRC }, + { 0xe0ec5, "ROUND", FMA_ONE_SRC }, + { 0xe0f40, "CSEL", FMA_THREE_SRC }, // src2 != 0 ? src1 : src0 + { 0xe0fc0, "MUX.i32", FMA_THREE_SRC }, // see ADD comment + { 0xe1805, "ROUNDEVEN", FMA_ONE_SRC }, + { 0xe1845, "CEIL", FMA_ONE_SRC }, + { 0xe1885, "FLOOR", FMA_ONE_SRC }, + { 0xe18c5, "TRUNC", FMA_ONE_SRC }, + { 0xe19b0, "ATAN_LDEXP.Y.f32", FMA_TWO_SRC }, + { 0xe19b8, "ATAN_LDEXP.X.f32", FMA_TWO_SRC }, + // These instructions in the FMA slot, together with LSHIFT_ADD_HIGH32.i32 + // in the ADD slot, allow one to do a 64-bit addition with an extra small + // shift on one of the sources. There are three possible scenarios: + // + // 1) Full 64-bit addition. Do: + // out.x = LSHIFT_ADD_LOW32.i64 src1.x, src2.x, shift + // out.y = LSHIFT_ADD_HIGH32.i32 src1.y, src2.y + // + // The shift amount is applied to src2 before adding. 
The shift amount, and + // any extra bits from src2 plus the overflow bit, are sent directly from + // FMA to ADD instead of being passed explicitly. Hence, these two must be + // bundled together into the same instruction. + // + // 2) Add a 64-bit value src1 to a zero-extended 32-bit value src2. Do: + // out.x = LSHIFT_ADD_LOW32.u32 src1.x, src2, shift + // out.y = LSHIFT_ADD_HIGH32.i32 src1.x, 0 + // + // Note that in this case, the second argument to LSHIFT_ADD_HIGH32 is + // ignored, so it can actually be anything. As before, the shift is applied + // to src2 before adding. + // + // 3) Add a 64-bit value to a sign-extended 32-bit value src2. Do: + // out.x = LSHIFT_ADD_LOW32.i32 src1.x, src2, shift + // out.y = LSHIFT_ADD_HIGH32.i32 src1.x, 0 + // + // The only difference is the .i32 instead of .u32. Otherwise, this is + // exactly the same as before. + // + // In all these instructions, the shift amount is stored where the third + // source would be, so the shift has to be a small immediate from 0 to 7. + // This is fine for the expected use-case of these instructions, which is + // manipulating 64-bit pointers. + // + // These instructions can also be combined with various load/store + // instructions which normally take a 64-bit pointer in order to add a + // 32-bit or 64-bit offset to the pointer before doing the operation, + // optionally shifting the offset. The load/store op implicity does + // LSHIFT_ADD_HIGH32.i32 internally. Letting ptr be the pointer, and offset + // the desired offset, the cases go as follows: + // + // 1) Add a 64-bit offset: + // LSHIFT_ADD_LOW32.i64 ptr.x, offset.x, shift + // ld_st_op ptr.y, offset.y, ... + // + // Note that the output of LSHIFT_ADD_LOW32.i64 is not used, instead being + // implicitly sent to the load/store op to serve as the low 32 bits of the + // pointer. + // + // 2) Add a 32-bit unsigned offset: + // temp = LSHIFT_ADD_LOW32.u32 ptr.x, offset, shift + // ld_st_op temp, ptr.y, ... + // + // Now, the low 32 bits of offset << shift + ptr are passed explicitly to + // the ld_st_op, to match the case where there is no offset and ld_st_op is + // called directly. + // + // 3) Add a 32-bit signed offset: + // temp = LSHIFT_ADD_LOW32.i32 ptr.x, offset, shift + // ld_st_op temp, ptr.y, ... + // + // Again, the same as the unsigned case except for the offset. 
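All three scenarios described above reduce to "add src1 and a shifted src2 across 64 bits", split between the FMA and ADD slots. A minimal host-side sketch of that arithmetic, assuming the implicitly forwarded state behaves like an ordinary unsigned carry; the function name and operand layout are illustrative and not taken from the imported source:

#include <stdint.h>

/* Models the LSHIFT_ADD_LOW32 / LSHIFT_ADD_HIGH32 pair (entries below) as a
 * single 64-bit operation: result = src1 + (src2 << shift).  The carry out
 * of the low half stands in for what the FMA slot forwards to the ADD slot. */
static uint64_t
model_lshift_add64(uint64_t src1, uint64_t src2, unsigned shift /* 0..7 */)
{
        uint64_t shifted = src2 << shift;                      /* shift applied to src2 first  */
        uint32_t lo = (uint32_t) src1 + (uint32_t) shifted;    /* LSHIFT_ADD_LOW32             */
        uint32_t carry = lo < (uint32_t) src1;                 /* forwarded implicitly on HW   */
        uint32_t hi = (uint32_t) (src1 >> 32)
                    + (uint32_t) (shifted >> 32) + carry;      /* LSHIFT_ADD_HIGH32            */
        return ((uint64_t) hi << 32) | lo;
}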
+ { 0xe1c80, "LSHIFT_ADD_LOW32.u32", FMA_SHIFT_ADD64 }, + { 0xe1cc0, "LSHIFT_ADD_LOW32.i64", FMA_SHIFT_ADD64 }, + { 0xe1d80, "LSHIFT_ADD_LOW32.i32", FMA_SHIFT_ADD64 }, + { 0xe1e00, "SEL.XX.i16", FMA_TWO_SRC }, + { 0xe1e08, "SEL.YX.i16", FMA_TWO_SRC }, + { 0xe1e10, "SEL.XY.i16", FMA_TWO_SRC }, + { 0xe1e18, "SEL.YY.i16", FMA_TWO_SRC }, + { 0xe7800, "IMAD", FMA_THREE_SRC }, + { 0xe78db, "POPCNT", FMA_ONE_SRC }, }; -static void -decode_M(enum bi_constmod *mod, unsigned M1, unsigned M2, bool single) +static struct fma_op_info find_fma_op_info(unsigned op) { - if (M1 >= 8) { - mod[0] = BI_CONSTMOD_NONE; + for (unsigned i = 0; i < ARRAY_SIZE(FMAOpInfos); i++) { + unsigned opCmp = ~0; + switch (FMAOpInfos[i].src_type) { + case FMA_ONE_SRC: + opCmp = op; + break; + case FMA_TWO_SRC: + opCmp = op & ~0x7; + break; + case FMA_FCMP: + case FMA_FCMP16: + opCmp = op & ~0x1fff; + break; + case FMA_THREE_SRC: + case FMA_SHIFT_ADD64: + opCmp = op & ~0x3f; + break; + case FMA_FADD: + case FMA_FMINMAX: + case FMA_FADD16: + case FMA_FMINMAX16: + opCmp = op & ~0x3fff; + break; + case FMA_FMA: + case FMA_FMA16: + opCmp = op & ~0x3ffff; + break; + case FMA_FOUR_SRC: + opCmp = op & ~0x1ff; + break; + case FMA_FMA_MSCALE: + opCmp = op & ~0x7fff; + break; + default: + opCmp = ~0; + break; + } + if (FMAOpInfos[i].op == opCmp) + return FMAOpInfos[i]; + } - if (!single) - mod[1] = BI_CONSTMOD_NONE; + struct fma_op_info info; + snprintf(info.name, sizeof(info.name), "op%04x", op); + info.op = op; + info.src_type = FMA_THREE_SRC; + return info; +} +static void dump_fcmp(unsigned op) +{ + switch (op) { + case 0: + printf(".OEQ"); + break; + case 1: + printf(".OGT"); + break; + case 2: + printf(".OGE"); + break; + case 3: + printf(".UNE"); + break; + case 4: + printf(".OLT"); + break; + case 5: + printf(".OLE"); + break; + default: + printf(".unk%d", op); + break; + } +} + +static void dump_16swizzle(unsigned swiz) +{ + if (swiz == 2) return; - } else if (M1 == 7) { - assert(M2 < 4); - memcpy(mod, M2_table[M2], sizeof(*mod) * (single ? 
1 : 2)); + printf(".%c%c", "xy"[swiz & 1], "xy"[(swiz >> 1) & 1]); +} + +static void dump_fma_expand_src0(unsigned ctrl) +{ + switch (ctrl) { + case 3: + case 4: + case 6: + printf(".x"); + break; + case 5: + case 7: + printf(".y"); + break; + case 0: + case 1: + case 2: + break; + default: + printf(".unk"); + break; + } +} + +static void dump_fma_expand_src1(unsigned ctrl) +{ + switch (ctrl) { + case 1: + case 3: + printf(".x"); + break; + case 2: + case 4: + case 5: + printf(".y"); + break; + case 0: + case 6: + case 7: + break; + default: + printf(".unk"); + break; + } +} + +static void dump_fma(uint64_t word, struct bifrost_regs regs, struct bifrost_regs next_regs, uint64_t *consts, bool verbose) +{ + if (verbose) { + printf("# FMA: %016" PRIx64 "\n", word); + } + struct bifrost_fma_inst FMA; + memcpy((char *) &FMA, (char *) &word, sizeof(struct bifrost_fma_inst)); + struct fma_op_info info = find_fma_op_info(FMA.op); + + printf("%s", info.name); + if (info.src_type == FMA_FADD || + info.src_type == FMA_FMINMAX || + info.src_type == FMA_FMA || + info.src_type == FMA_FADD16 || + info.src_type == FMA_FMINMAX16 || + info.src_type == FMA_FMA16) { + dump_output_mod(bits(FMA.op, 12, 14)); + switch (info.src_type) { + case FMA_FADD: + case FMA_FMA: + case FMA_FADD16: + case FMA_FMA16: + dump_round_mode(bits(FMA.op, 10, 12)); + break; + case FMA_FMINMAX: + case FMA_FMINMAX16: + dump_minmax_mode(bits(FMA.op, 10, 12)); + break; + default: + assert(0); + } + } else if (info.src_type == FMA_FCMP || info.src_type == FMA_FCMP16) { + dump_fcmp(bits(FMA.op, 10, 13)); + if (info.src_type == FMA_FCMP) + printf(".f32"); + else + printf(".v2f16"); + } else if (info.src_type == FMA_FMA_MSCALE) { + if (FMA.op & (1 << 11)) { + switch ((FMA.op >> 9) & 0x3) { + case 0: + /* This mode seems to do a few things: + * - Makes 0 * infinity (and incidentally 0 * nan) return 0, + * since generating a nan would poison the result of + * 1/infinity and 1/0. + * - Fiddles with which nan is returned in nan * nan, + * presumably to make sure that the same exact nan is + * returned for 1/nan. + */ + printf(".rcp_mode"); + break; + case 3: + /* Similar to the above, but src0 always wins when multiplying + * 0 by infinity. + */ + printf(".sqrt_mode"); + break; + default: + printf(".unk%d_mode", (int) (FMA.op >> 9) & 0x3); + } + } else { + dump_output_mod(bits(FMA.op, 9, 11)); + } + } + + printf(" "); + + struct bifrost_reg_ctrl next_ctrl = DecodeRegCtrl(next_regs); + if (next_ctrl.fma_write_unit != REG_WRITE_NONE) { + printf("{R%d, T0}, ", GetRegToWrite(next_ctrl.fma_write_unit, next_regs)); } else { - assert(M1 != 3); - memcpy(mod, M1_table[M1], sizeof(*mod) * (single ? 
1 : 2)); + printf("T0, "); } + + switch (info.src_type) { + case FMA_ONE_SRC: + dump_src(FMA.src0, regs, consts, true); + break; + case FMA_TWO_SRC: + dump_src(FMA.src0, regs, consts, true); + printf(", "); + dump_src(FMA.op & 0x7, regs, consts, true); + break; + case FMA_FADD: + case FMA_FMINMAX: + if (FMA.op & 0x10) + printf("-"); + if (FMA.op & 0x200) + printf("abs("); + dump_src(FMA.src0, regs, consts, true); + dump_fma_expand_src0((FMA.op >> 6) & 0x7); + if (FMA.op & 0x200) + printf(")"); + printf(", "); + if (FMA.op & 0x20) + printf("-"); + if (FMA.op & 0x8) + printf("abs("); + dump_src(FMA.op & 0x7, regs, consts, true); + dump_fma_expand_src1((FMA.op >> 6) & 0x7); + if (FMA.op & 0x8) + printf(")"); + break; + case FMA_FADD16: + case FMA_FMINMAX16: { + bool abs1 = FMA.op & 0x8; + bool abs2 = (FMA.op & 0x7) < FMA.src0; + if (FMA.op & 0x10) + printf("-"); + if (abs1 || abs2) + printf("abs("); + dump_src(FMA.src0, regs, consts, true); + dump_16swizzle((FMA.op >> 6) & 0x3); + if (abs1 || abs2) + printf(")"); + printf(", "); + if (FMA.op & 0x20) + printf("-"); + if (abs1 && abs2) + printf("abs("); + dump_src(FMA.op & 0x7, regs, consts, true); + dump_16swizzle((FMA.op >> 8) & 0x3); + if (abs1 && abs2) + printf(")"); + break; + } + case FMA_FCMP: + if (FMA.op & 0x200) + printf("abs("); + dump_src(FMA.src0, regs, consts, true); + dump_fma_expand_src0((FMA.op >> 6) & 0x7); + if (FMA.op & 0x200) + printf(")"); + printf(", "); + if (FMA.op & 0x20) + printf("-"); + if (FMA.op & 0x8) + printf("abs("); + dump_src(FMA.op & 0x7, regs, consts, true); + dump_fma_expand_src1((FMA.op >> 6) & 0x7); + if (FMA.op & 0x8) + printf(")"); + break; + case FMA_FCMP16: + dump_src(FMA.src0, regs, consts, true); + // Note: this is kinda a guess, I haven't seen the blob set this to + // anything other than the identity, but it matches FMA_TWO_SRCFmod16 + dump_16swizzle((FMA.op >> 6) & 0x3); + printf(", "); + dump_src(FMA.op & 0x7, regs, consts, true); + dump_16swizzle((FMA.op >> 8) & 0x3); + break; + case FMA_SHIFT_ADD64: + dump_src(FMA.src0, regs, consts, true); + printf(", "); + dump_src(FMA.op & 0x7, regs, consts, true); + printf(", "); + printf("shift:%u", (FMA.op >> 3) & 0x7); + break; + case FMA_THREE_SRC: + dump_src(FMA.src0, regs, consts, true); + printf(", "); + dump_src(FMA.op & 0x7, regs, consts, true); + printf(", "); + dump_src((FMA.op >> 3) & 0x7, regs, consts, true); + break; + case FMA_FMA: + if (FMA.op & (1 << 14)) + printf("-"); + if (FMA.op & (1 << 9)) + printf("abs("); + dump_src(FMA.src0, regs, consts, true); + dump_fma_expand_src0((FMA.op >> 6) & 0x7); + if (FMA.op & (1 << 9)) + printf(")"); + printf(", "); + if (FMA.op & (1 << 16)) + printf("abs("); + dump_src(FMA.op & 0x7, regs, consts, true); + dump_fma_expand_src1((FMA.op >> 6) & 0x7); + if (FMA.op & (1 << 16)) + printf(")"); + printf(", "); + if (FMA.op & (1 << 15)) + printf("-"); + if (FMA.op & (1 << 17)) + printf("abs("); + dump_src((FMA.op >> 3) & 0x7, regs, consts, true); + if (FMA.op & (1 << 17)) + printf(")"); + break; + case FMA_FMA16: + if (FMA.op & (1 << 14)) + printf("-"); + dump_src(FMA.src0, regs, consts, true); + dump_16swizzle((FMA.op >> 6) & 0x3); + printf(", "); + dump_src(FMA.op & 0x7, regs, consts, true); + dump_16swizzle((FMA.op >> 8) & 0x3); + printf(", "); + if (FMA.op & (1 << 15)) + printf("-"); + dump_src((FMA.op >> 3) & 0x7, regs, consts, true); + dump_16swizzle((FMA.op >> 16) & 0x3); + break; + case FMA_FOUR_SRC: + dump_src(FMA.src0, regs, consts, true); + printf(", "); + dump_src(FMA.op & 0x7, regs, consts, true); 
+ printf(", "); + dump_src((FMA.op >> 3) & 0x7, regs, consts, true); + printf(", "); + dump_src((FMA.op >> 6) & 0x7, regs, consts, true); + break; + case FMA_FMA_MSCALE: + if (FMA.op & (1 << 12)) + printf("abs("); + dump_src(FMA.src0, regs, consts, true); + if (FMA.op & (1 << 12)) + printf(")"); + printf(", "); + if (FMA.op & (1 << 13)) + printf("-"); + dump_src(FMA.op & 0x7, regs, consts, true); + printf(", "); + if (FMA.op & (1 << 14)) + printf("-"); + dump_src((FMA.op >> 3) & 0x7, regs, consts, true); + printf(", "); + dump_src((FMA.op >> 6) & 0x7, regs, consts, true); + break; + } + printf("\n"); +} + +static const struct add_op_info add_op_infos[] = { + { 0x00000, "MAX.f32", ADD_FMINMAX }, + { 0x02000, "MIN.f32", ADD_FMINMAX }, + { 0x04000, "ADD.f32", ADD_FADD }, + { 0x06000, "FCMP.GL", ADD_FCMP }, + { 0x07000, "FCMP.D3D", ADD_FCMP }, + { 0x07856, "F16_TO_I16", ADD_ONE_SRC }, + { 0x07857, "F16_TO_U16", ADD_ONE_SRC }, + { 0x078c0, "I16_TO_F16.XX", ADD_ONE_SRC }, + { 0x078c1, "U16_TO_F16.XX", ADD_ONE_SRC }, + { 0x078c8, "I16_TO_F16.YX", ADD_ONE_SRC }, + { 0x078c9, "U16_TO_F16.YX", ADD_ONE_SRC }, + { 0x078d0, "I16_TO_F16.XY", ADD_ONE_SRC }, + { 0x078d1, "U16_TO_F16.XY", ADD_ONE_SRC }, + { 0x078d8, "I16_TO_F16.YY", ADD_ONE_SRC }, + { 0x078d9, "U16_TO_F16.YY", ADD_ONE_SRC }, + { 0x07936, "F32_TO_I32", ADD_ONE_SRC }, + { 0x07937, "F32_TO_U32", ADD_ONE_SRC }, + { 0x07978, "I32_TO_F32", ADD_ONE_SRC }, + { 0x07979, "U32_TO_F32", ADD_ONE_SRC }, + { 0x07998, "I16_TO_I32.X", ADD_ONE_SRC }, + { 0x07999, "U16_TO_U32.X", ADD_ONE_SRC }, + { 0x0799a, "I16_TO_I32.Y", ADD_ONE_SRC }, + { 0x0799b, "U16_TO_U32.Y", ADD_ONE_SRC }, + { 0x0799c, "I16_TO_F32.X", ADD_ONE_SRC }, + { 0x0799d, "U16_TO_F32.X", ADD_ONE_SRC }, + { 0x0799e, "I16_TO_F32.Y", ADD_ONE_SRC }, + { 0x0799f, "U16_TO_F32.Y", ADD_ONE_SRC }, + // take the low 16 bits, and expand it to a 32-bit float + { 0x079a2, "F16_TO_F32.X", ADD_ONE_SRC }, + // take the high 16 bits, ... + { 0x079a3, "F16_TO_F32.Y", ADD_ONE_SRC }, + { 0x07b2b, "SWZ.YX.v2i16", ADD_ONE_SRC }, + { 0x07b2c, "NOP", ADD_ONE_SRC }, + { 0x07b29, "SWZ.XX.v2i16", ADD_ONE_SRC }, + // Logically, this should be SWZ.XY, but that's equivalent to a move, and + // this seems to be the canonical way the blob generates a MOV. + { 0x07b2d, "MOV", ADD_ONE_SRC }, + { 0x07b2f, "SWZ.YY.v2i16", ADD_ONE_SRC }, + // Given a floating point number m * 2^e, returns m ^ 2^{-1}. + { 0x07b65, "FRCP_FREXPM", ADD_ONE_SRC }, + { 0x07b75, "FSQRT_FREXPM", ADD_ONE_SRC }, + { 0x07b8d, "FRCP_FREXPE", ADD_ONE_SRC }, + { 0x07ba5, "FSQRT_FREXPE", ADD_ONE_SRC }, + { 0x07bad, "FRSQ_FREXPE", ADD_ONE_SRC }, + // From the ARM patent US20160364209A1: + // "Decompose v (the input) into numbers x1 and s such that v = x1 * 2^s, + // and x1 is a floating point value in a predetermined range where the + // value 1 is within the range and not at one extremity of the range (e.g. + // choose a range where 1 is towards middle of range)." + // + // This computes s. 
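To make the patent wording concrete: with v = x1 * 2^s, log2(v) = log2(x1) + s, so the s produced by the FLOG_FREXPE entry just below recombines with the x1 produced by LOG_FREXPM (FMA slot) into a full logarithm. A host-side sketch using frexpf(); the exact reduced range the hardware picks is not documented here, so the 1/sqrt(2) cut point is only a plausible guess:

#include <math.h>

static float
model_log2(float v)
{
        int s;
        float x1 = frexpf(v, &s);        /* v = x1 * 2^s, x1 in [0.5, 1) */

        /* Re-centre so 1.0 lies inside the reduced range rather than at an
         * endpoint, as the patent text suggests (cut point is a guess). */
        if (x1 < 0.70710678f) {
                x1 *= 2.0f;
                s -= 1;
        }

        return log2f(x1) + (float) s;    /* LOG_FREXPM part + FLOG_FREXPE part */
}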
+ { 0x07bc5, "FLOG_FREXPE", ADD_ONE_SRC }, + { 0x07d45, "CEIL", ADD_ONE_SRC }, + { 0x07d85, "FLOOR", ADD_ONE_SRC }, + { 0x07dc5, "TRUNC", ADD_ONE_SRC }, + { 0x07f18, "LSHIFT_ADD_HIGH32.i32", ADD_TWO_SRC }, + { 0x08000, "LD_ATTR.f16", ADD_LOAD_ATTR, true }, + { 0x08100, "LD_ATTR.v2f16", ADD_LOAD_ATTR, true }, + { 0x08200, "LD_ATTR.v3f16", ADD_LOAD_ATTR, true }, + { 0x08300, "LD_ATTR.v4f16", ADD_LOAD_ATTR, true }, + { 0x08400, "LD_ATTR.f32", ADD_LOAD_ATTR, true }, + { 0x08500, "LD_ATTR.v3f32", ADD_LOAD_ATTR, true }, + { 0x08600, "LD_ATTR.v3f32", ADD_LOAD_ATTR, true }, + { 0x08700, "LD_ATTR.v4f32", ADD_LOAD_ATTR, true }, + { 0x08800, "LD_ATTR.i32", ADD_LOAD_ATTR, true }, + { 0x08900, "LD_ATTR.v3i32", ADD_LOAD_ATTR, true }, + { 0x08a00, "LD_ATTR.v3i32", ADD_LOAD_ATTR, true }, + { 0x08b00, "LD_ATTR.v4i32", ADD_LOAD_ATTR, true }, + { 0x08c00, "LD_ATTR.u32", ADD_LOAD_ATTR, true }, + { 0x08d00, "LD_ATTR.v3u32", ADD_LOAD_ATTR, true }, + { 0x08e00, "LD_ATTR.v3u32", ADD_LOAD_ATTR, true }, + { 0x08f00, "LD_ATTR.v4u32", ADD_LOAD_ATTR, true }, + { 0x0a000, "LD_VAR.32", ADD_VARYING_INTERP, true }, + { 0x0b000, "TEX", ADD_TEX_COMPACT, true }, + { 0x0c188, "LOAD.i32", ADD_TWO_SRC, true }, + { 0x0c1a0, "LD_UBO.i32", ADD_TWO_SRC, true }, + { 0x0c1b8, "LD_SCRATCH.v2i32", ADD_TWO_SRC, true }, + { 0x0c1c8, "LOAD.v2i32", ADD_TWO_SRC, true }, + { 0x0c1e0, "LD_UBO.v2i32", ADD_TWO_SRC, true }, + { 0x0c1f8, "LD_SCRATCH.v2i32", ADD_TWO_SRC, true }, + { 0x0c208, "LOAD.v4i32", ADD_TWO_SRC, true }, + // src0 = offset, src1 = binding + { 0x0c220, "LD_UBO.v4i32", ADD_TWO_SRC, true }, + { 0x0c238, "LD_SCRATCH.v4i32", ADD_TWO_SRC, true }, + { 0x0c248, "STORE.v4i32", ADD_TWO_SRC, true }, + { 0x0c278, "ST_SCRATCH.v4i32", ADD_TWO_SRC, true }, + { 0x0c588, "STORE.i32", ADD_TWO_SRC, true }, + { 0x0c5b8, "ST_SCRATCH.i32", ADD_TWO_SRC, true }, + { 0x0c5c8, "STORE.v2i32", ADD_TWO_SRC, true }, + { 0x0c5f8, "ST_SCRATCH.v2i32", ADD_TWO_SRC, true }, + { 0x0c648, "LOAD.u16", ADD_TWO_SRC, true }, // zero-extends + { 0x0ca88, "LOAD.v3i32", ADD_TWO_SRC, true }, + { 0x0caa0, "LD_UBO.v3i32", ADD_TWO_SRC, true }, + { 0x0cab8, "LD_SCRATCH.v3i32", ADD_TWO_SRC, true }, + { 0x0cb88, "STORE.v3i32", ADD_TWO_SRC, true }, + { 0x0cbb8, "ST_SCRATCH.v3i32", ADD_TWO_SRC, true }, + // *_FAST does not exist on G71 (added to G51, G72, and everything after) + { 0x0cc00, "FRCP_FAST.f32", ADD_ONE_SRC }, + { 0x0cc20, "FRSQ_FAST.f32", ADD_ONE_SRC }, + // Given a floating point number m * 2^e, produces a table-based + // approximation of 2/m using the top 17 bits. Includes special cases for + // infinity, NaN, and zero, and copies the sign bit. + { 0x0ce00, "FRCP_TABLE", ADD_ONE_SRC }, + // Exists on G71 + { 0x0ce10, "FRCP_FAST.f16.X", ADD_ONE_SRC }, + // A similar table for inverse square root, using the high 17 bits of the + // mantissa as well as the low bit of the exponent. + { 0x0ce20, "FRSQ_TABLE", ADD_ONE_SRC }, + { 0x0ce30, "FRCP_FAST.f16.Y", ADD_ONE_SRC }, + { 0x0ce50, "FRSQ_FAST.f16.X", ADD_ONE_SRC }, + // Used in the argument reduction for log. Given a floating-point number + // m * 2^e, uses the top 4 bits of m to produce an approximation to 1/m + // with the exponent forced to 0 and only the top 5 bits are nonzero. 0, + // infinity, and NaN all return 1.0. + // See the ARM patent for more information. 
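The FRCP_* helpers listed here fit the usual pattern for a fast reciprocal: reduce the argument with a frexp-style split, look up a coarse approximation of the reciprocal of the mantissa, then refine. A sketch of that pattern for a positive, normal input, with a classic polynomial seed standing in for the hardware table (FRCP_TABLE / FRCP_APPROX) and the zero/infinity/NaN and sign handling mentioned above omitted:

#include <math.h>

static float
model_frcp(float v)
{
        int e;
        float m = frexpf(v, &e);                        /* v = m * 2^e, m in [0.5, 1)        */
        float x = 48.0f / 17.0f - (32.0f / 17.0f) * m;  /* coarse seed, stands in for table  */
        x = x * (2.0f - m * x);                         /* Newton-Raphson refinement          */
        x = x * (2.0f - m * x);
        return ldexpf(x, -e);                           /* 1/v = (1/m) * 2^-e                 */
}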
+ { 0x0ce60, "FRCP_APPROX", ADD_ONE_SRC }, + { 0x0ce70, "FRSQ_FAST.f16.Y", ADD_ONE_SRC }, + { 0x0cf40, "ATAN_ASSIST", ADD_TWO_SRC }, + { 0x0cf48, "ATAN_TABLE", ADD_TWO_SRC }, + { 0x0cf50, "SIN_TABLE", ADD_ONE_SRC }, + { 0x0cf51, "COS_TABLE", ADD_ONE_SRC }, + { 0x0cf58, "EXP_TABLE", ADD_ONE_SRC }, + { 0x0cf60, "FLOG2_TABLE", ADD_ONE_SRC }, + { 0x0cf64, "FLOGE_TABLE", ADD_ONE_SRC }, + { 0x0d000, "BRANCH", ADD_BRANCH }, + // For each bit i, return src2[i] ? src0[i] : src1[i]. In other words, this + // is the same as (src2 & src0) | (~src2 & src1). + { 0x0e8c0, "MUX", ADD_THREE_SRC }, + { 0x0e9b0, "ATAN_LDEXP.Y.f32", ADD_TWO_SRC }, + { 0x0e9b8, "ATAN_LDEXP.X.f32", ADD_TWO_SRC }, + { 0x0ea60, "SEL.XX.i16", ADD_TWO_SRC }, + { 0x0ea70, "SEL.XY.i16", ADD_TWO_SRC }, + { 0x0ea68, "SEL.YX.i16", ADD_TWO_SRC }, + { 0x0ea78, "SEL.YY.i16", ADD_TWO_SRC }, + { 0x0ec00, "F32_TO_F16", ADD_TWO_SRC }, + { 0x0f640, "ICMP.GL.GT", ADD_TWO_SRC }, // src0 > src1 ? 1 : 0 + { 0x0f648, "ICMP.GL.GE", ADD_TWO_SRC }, + { 0x0f650, "UCMP.GL.GT", ADD_TWO_SRC }, + { 0x0f658, "UCMP.GL.GE", ADD_TWO_SRC }, + { 0x0f660, "ICMP.GL.EQ", ADD_TWO_SRC }, + { 0x0f6c0, "ICMP.D3D.GT", ADD_TWO_SRC }, // src0 > src1 ? ~0 : 0 + { 0x0f6c8, "ICMP.D3D.GE", ADD_TWO_SRC }, + { 0x0f6d0, "UCMP.D3D.GT", ADD_TWO_SRC }, + { 0x0f6d8, "UCMP.D3D.GE", ADD_TWO_SRC }, + { 0x0f6e0, "ICMP.D3D.EQ", ADD_TWO_SRC }, + { 0x10000, "MAX.v2f16", ADD_FMINMAX16 }, + { 0x11000, "ADD_MSCALE.f32", ADD_FADDMscale }, + { 0x12000, "MIN.v2f16", ADD_FMINMAX16 }, + { 0x14000, "ADD.v2f16", ADD_FADD16 }, + { 0x17000, "FCMP.D3D", ADD_FCMP16 }, + { 0x178c0, "ADD.i32", ADD_TWO_SRC }, + { 0x17900, "ADD.v2i16", ADD_TWO_SRC }, + { 0x17ac0, "SUB.i32", ADD_TWO_SRC }, + { 0x17c10, "ADDC.i32", ADD_TWO_SRC }, // adds src0 to the bottom bit of src1 + { 0x17d80, "ADD.i32.i16.X", ADD_TWO_SRC }, + { 0x17d90, "ADD.i32.u16.X", ADD_TWO_SRC }, + { 0x17dc0, "ADD.i32.i16.Y", ADD_TWO_SRC }, + { 0x17dd0, "ADD.i32.u16.Y", ADD_TWO_SRC }, + // Compute varying address and datatype (for storing in the vertex shader), + // and store the vec3 result in the data register. The result is passed as + // the 3 normal arguments to ST_VAR. + { 0x18000, "LD_VAR_ADDR.f16", ADD_VARYING_ADDRESS, true }, + { 0x18100, "LD_VAR_ADDR.f32", ADD_VARYING_ADDRESS, true }, + { 0x18200, "LD_VAR_ADDR.i32", ADD_VARYING_ADDRESS, true }, + { 0x18300, "LD_VAR_ADDR.u32", ADD_VARYING_ADDRESS, true }, + // Implements alpha-to-coverage, as well as possibly the late depth and + // stencil tests. The first source is the existing sample mask in R60 + // (possibly modified by gl_SampleMask), and the second source is the alpha + // value. The sample mask is written right away based on the + // alpha-to-coverage result using the normal register write mechanism, + // since that doesn't need to read from any memory, and then written again + // later based on the result of the stencil and depth tests using the + // special register. + { 0x191e8, "ATEST.f32", ADD_TWO_SRC, true }, + { 0x191f0, "ATEST.X.f16", ADD_TWO_SRC, true }, + { 0x191f8, "ATEST.Y.f16", ADD_TWO_SRC, true }, + // store a varying given the address and datatype from LD_VAR_ADDR + { 0x19300, "ST_VAR.v1", ADD_THREE_SRC, true }, + { 0x19340, "ST_VAR.v2", ADD_THREE_SRC, true }, + { 0x19380, "ST_VAR.v3", ADD_THREE_SRC, true }, + { 0x193c0, "ST_VAR.v4", ADD_THREE_SRC, true }, + // This takes the sample coverage mask (computed by ATEST above) as a + // regular argument, in addition to the vec4 color in the special register. 
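As a rough model of the alpha-to-coverage half of ATEST described above: clamp the alpha, convert it into a count of covered samples, and AND the resulting mask with the incoming sample mask, which the BLEND entry below then consumes. The real coverage/dither pattern is hardware-specific, so this only shows the basic idea:

#include <math.h>
#include <stdint.h>

static uint32_t
model_atest_coverage(uint32_t sample_mask, float alpha, unsigned num_samples)
{
        float a = fminf(fmaxf(alpha, 0.0f), 1.0f);              /* clamp alpha to [0, 1]   */
        unsigned covered = (unsigned) lrintf(a * (float) num_samples);
        uint32_t alpha_mask = covered >= 32 ? ~0u : (1u << covered) - 1;
        return sample_mask & alpha_mask;                        /* combine with R60 mask   */
}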
+ { 0x1952c, "BLEND", ADD_BLENDING, true }, + { 0x1a000, "LD_VAR.16", ADD_VARYING_INTERP, true }, + { 0x1ae60, "TEX", ADD_TEX, true }, + { 0x1c000, "RSHIFT_NAND.i32", ADD_THREE_SRC }, + { 0x1c300, "RSHIFT_OR.i32", ADD_THREE_SRC }, + { 0x1c400, "RSHIFT_AND.i32", ADD_THREE_SRC }, + { 0x1c700, "RSHIFT_NOR.i32", ADD_THREE_SRC }, + { 0x1c800, "LSHIFT_NAND.i32", ADD_THREE_SRC }, + { 0x1cb00, "LSHIFT_OR.i32", ADD_THREE_SRC }, + { 0x1cc00, "LSHIFT_AND.i32", ADD_THREE_SRC }, + { 0x1cf00, "LSHIFT_NOR.i32", ADD_THREE_SRC }, + { 0x1d000, "RSHIFT_XOR.i32", ADD_THREE_SRC }, + { 0x1d100, "RSHIFT_XNOR.i32", ADD_THREE_SRC }, + { 0x1d200, "LSHIFT_XOR.i32", ADD_THREE_SRC }, + { 0x1d300, "LSHIFT_XNOR.i32", ADD_THREE_SRC }, + { 0x1d400, "LSHIFT_ADD.i32", ADD_THREE_SRC }, + { 0x1d500, "LSHIFT_SUB.i32", ADD_THREE_SRC }, + { 0x1d500, "LSHIFT_RSUB.i32", ADD_THREE_SRC }, + { 0x1d700, "RSHIFT_ADD.i32", ADD_THREE_SRC }, + { 0x1d800, "RSHIFT_SUB.i32", ADD_THREE_SRC }, + { 0x1d900, "RSHIFT_RSUB.i32", ADD_THREE_SRC }, + { 0x1da00, "ARSHIFT_ADD.i32", ADD_THREE_SRC }, + { 0x1db00, "ARSHIFT_SUB.i32", ADD_THREE_SRC }, + { 0x1dc00, "ARSHIFT_RSUB.i32", ADD_THREE_SRC }, + { 0x1dd18, "OR.i32", ADD_TWO_SRC }, + { 0x1dd20, "AND.i32", ADD_TWO_SRC }, + { 0x1dd60, "LSHIFT.i32", ADD_TWO_SRC }, + { 0x1dd50, "XOR.i32", ADD_TWO_SRC }, + { 0x1dd80, "RSHIFT.i32", ADD_TWO_SRC }, + { 0x1dda0, "ARSHIFT.i32", ADD_TWO_SRC }, +}; + +static struct add_op_info find_add_op_info(unsigned op) +{ + for (unsigned i = 0; i < ARRAY_SIZE(add_op_infos); i++) { + unsigned opCmp = ~0; + switch (add_op_infos[i].src_type) { + case ADD_ONE_SRC: + case ADD_BLENDING: + opCmp = op; + break; + case ADD_TWO_SRC: + opCmp = op & ~0x7; + break; + case ADD_THREE_SRC: + opCmp = op & ~0x3f; + break; + case ADD_TEX: + opCmp = op & ~0xf; + break; + case ADD_FADD: + case ADD_FMINMAX: + case ADD_FADD16: + opCmp = op & ~0x1fff; + break; + case ADD_FMINMAX16: + case ADD_FADDMscale: + opCmp = op & ~0xfff; + break; + case ADD_FCMP: + case ADD_FCMP16: + opCmp = op & ~0x7ff; + break; + case ADD_TEX_COMPACT: + opCmp = op & ~0x3ff; + break; + case ADD_VARYING_INTERP: + opCmp = op & ~0x7ff; + break; + case ADD_VARYING_ADDRESS: + opCmp = op & ~0xff; + break; + case ADD_LOAD_ATTR: + opCmp = op & ~0x7f; + break; + case ADD_BRANCH: + opCmp = op & ~0xfff; + break; + default: + opCmp = ~0; + break; + } + if (add_op_infos[i].op == opCmp) + return add_op_infos[i]; + } + + struct add_op_info info; + snprintf(info.name, sizeof(info.name), "op%04x", op); + info.op = op; + info.src_type = ADD_TWO_SRC; + info.has_data_reg = true; + return info; +} + +static void dump_add(uint64_t word, struct bifrost_regs regs, struct bifrost_regs next_regs, uint64_t *consts, + unsigned data_reg, unsigned offset, bool verbose) +{ + if (verbose) { + printf("# ADD: %016" PRIx64 "\n", word); + } + struct bifrost_add_inst ADD; + memcpy((char *) &ADD, (char *) &word, sizeof(ADD)); + struct add_op_info info = find_add_op_info(ADD.op); + + printf("%s", info.name); + + // float16 seems like it doesn't support output modifiers + if (info.src_type == ADD_FADD || info.src_type == ADD_FMINMAX) { + // output modifiers + dump_output_mod(bits(ADD.op, 8, 10)); + if (info.src_type == ADD_FADD) + dump_round_mode(bits(ADD.op, 10, 12)); + else + dump_minmax_mode(bits(ADD.op, 10, 12)); + } else if (info.src_type == ADD_FCMP || info.src_type == ADD_FCMP16) { + dump_fcmp(bits(ADD.op, 3, 6)); + if (info.src_type == ADD_FCMP) + printf(".f32"); + else + printf(".v2f16"); + } else if (info.src_type == ADD_FADDMscale) { + switch ((ADD.op >> 
6) & 0x7) { + case 0: + break; + // causes GPU hangs on G71 + case 1: + printf(".invalid"); + break; + // Same as usual outmod value. + case 2: + printf(".clamp_0_1"); + break; + // If src0 is infinite or NaN, flush it to zero so that the other + // source is passed through unmodified. + case 3: + printf(".flush_src0_inf_nan"); + break; + // Vice versa. + case 4: + printf(".flush_src1_inf_nan"); + break; + // Every other case seems to behave the same as the above? + default: + printf(".unk%d", (ADD.op >> 6) & 0x7); + break; + } + } else if (info.src_type == ADD_VARYING_INTERP) { + if (ADD.op & 0x200) + printf(".reuse"); + if (ADD.op & 0x400) + printf(".flat"); + switch ((ADD.op >> 7) & 0x3) { + case 0: + printf(".per_frag"); + break; + case 1: + printf(".centroid"); + break; + case 2: + break; + case 3: + printf(".explicit"); + break; + } + printf(".v%d", ((ADD.op >> 5) & 0x3) + 1); + } else if (info.src_type == ADD_BRANCH) { + enum branch_code branchCode = (enum branch_code) ((ADD.op >> 6) & 0x3f); + if (branchCode == BR_ALWAYS) { + // unconditional branch + } else { + enum branch_cond cond = (enum branch_cond) ((ADD.op >> 6) & 0x7); + enum branch_bit_size size = (enum branch_bit_size) ((ADD.op >> 9) & 0x7); + bool portSwapped = (ADD.op & 0x7) < ADD.src0; + // See the comment in branch_bit_size + if (size == BR_SIZE_16YX0) + portSwapped = true; + if (size == BR_SIZE_16YX1) + portSwapped = false; + // These sizes are only for floating point comparisons, so the + // non-floating-point comparisons are reused to encode the flipped + // versions. + if (size == BR_SIZE_32_AND_16X || size == BR_SIZE_32_AND_16Y) + portSwapped = false; + // There's only one argument, so we reuse the extra argument to + // encode this. + if (size == BR_SIZE_ZERO) + portSwapped = !(ADD.op & 1); + + switch (cond) { + case BR_COND_LT: + if (portSwapped) + printf(".LT.u"); + else + printf(".LT.i"); + break; + case BR_COND_LE: + if (size == BR_SIZE_32_AND_16X || size == BR_SIZE_32_AND_16Y) { + printf(".UNE.f"); + } else { + if (portSwapped) + printf(".LE.u"); + else + printf(".LE.i"); + } + break; + case BR_COND_GT: + if (portSwapped) + printf(".GT.u"); + else + printf(".GT.i"); + break; + case BR_COND_GE: + if (portSwapped) + printf(".GE.u"); + else + printf(".GE.i"); + break; + case BR_COND_EQ: + if (portSwapped) + printf(".NE.i"); + else + printf(".EQ.i"); + break; + case BR_COND_OEQ: + if (portSwapped) + printf(".UNE.f"); + else + printf(".OEQ.f"); + break; + case BR_COND_OGT: + if (portSwapped) + printf(".OGT.unk.f"); + else + printf(".OGT.f"); + break; + case BR_COND_OLT: + if (portSwapped) + printf(".OLT.unk.f"); + else + printf(".OLT.f"); + break; + } + switch (size) { + case BR_SIZE_32: + case BR_SIZE_32_AND_16X: + case BR_SIZE_32_AND_16Y: + printf("32"); + break; + case BR_SIZE_16XX: + case BR_SIZE_16YY: + case BR_SIZE_16YX0: + case BR_SIZE_16YX1: + printf("16"); + break; + case BR_SIZE_ZERO: { + unsigned ctrl = (ADD.op >> 1) & 0x3; + if (ctrl == 0) + printf("32.Z"); + else + printf("16.Z"); + break; + } + } + } + } + printf(" "); + + struct bifrost_reg_ctrl next_ctrl = DecodeRegCtrl(next_regs); + if (next_ctrl.add_write_unit != REG_WRITE_NONE) { + printf("{R%d, T1}, ", GetRegToWrite(next_ctrl.add_write_unit, next_regs)); + } else { + printf("T1, "); + } + + switch (info.src_type) { + case ADD_BLENDING: + // Note: in this case, regs.uniform_const == location | 0x8 + // This probably means we can't load uniforms or immediates in the + // same instruction. 
This re-uses the encoding that normally means + // "disabled", where the low 4 bits are ignored. Perhaps the extra + // 0x8 or'd in indicates this is happening. + printf("location:%d, ", regs.uniform_const & 0x7); + // fallthrough + case ADD_ONE_SRC: + dump_src(ADD.src0, regs, consts, false); + break; + case ADD_TEX: + case ADD_TEX_COMPACT: { + int tex_index; + int sampler_index; + bool dualTex = false; + if (info.src_type == ADD_TEX_COMPACT) { + tex_index = (ADD.op >> 3) & 0x7; + sampler_index = (ADD.op >> 7) & 0x7; + bool unknown = (ADD.op & 0x40); + // TODO: figure out if the unknown bit is ever 0 + if (!unknown) + printf("unknown "); + } else { + uint64_t constVal = get_const(consts, regs); + uint32_t controlBits = (ADD.op & 0x8) ? (constVal >> 32) : constVal; + struct bifrost_tex_ctrl ctrl; + memcpy((char *) &ctrl, (char *) &controlBits, sizeof(ctrl)); + + // TODO: figure out what actually triggers dual-tex + if (ctrl.result_type == 9) { + struct bifrost_dual_tex_ctrl dualCtrl; + memcpy((char *) &dualCtrl, (char *) &controlBits, sizeof(ctrl)); + printf("(dualtex) tex0:%d samp0:%d tex1:%d samp1:%d ", + dualCtrl.tex_index0, dualCtrl.sampler_index0, + dualCtrl.tex_index1, dualCtrl.sampler_index1); + if (dualCtrl.unk0 != 3) + printf("unk:%d ", dualCtrl.unk0); + dualTex = true; + } else { + if (ctrl.no_merge_index) { + tex_index = ctrl.tex_index; + sampler_index = ctrl.sampler_index; + } else { + tex_index = sampler_index = ctrl.tex_index; + unsigned unk = ctrl.sampler_index >> 2; + if (unk != 3) + printf("unk:%d ", unk); + if (ctrl.sampler_index & 1) + tex_index = -1; + if (ctrl.sampler_index & 2) + sampler_index = -1; + } + + if (ctrl.unk0 != 3) + printf("unk0:%d ", ctrl.unk0); + if (ctrl.unk1) + printf("unk1 "); + if (ctrl.unk2 != 0xf) + printf("unk2:%x ", ctrl.unk2); + + switch (ctrl.result_type) { + case 0x4: + printf("f32 "); + break; + case 0xe: + printf("i32 "); + break; + case 0xf: + printf("u32 "); + break; + default: + printf("unktype(%x) ", ctrl.result_type); + } + + switch (ctrl.tex_type) { + case 0: + printf("cube "); + break; + case 1: + printf("buffer "); + break; + case 2: + printf("2D "); + break; + case 3: + printf("3D "); + break; + } + + if (ctrl.is_shadow) + printf("shadow "); + if (ctrl.is_array) + printf("array "); + + if (!ctrl.filter) { + if (ctrl.calc_gradients) { + int comp = (controlBits >> 20) & 0x3; + printf("txg comp:%d ", comp); + } else { + printf("txf "); + } + } else { + if (!ctrl.not_supply_lod) { + if (ctrl.compute_lod) + printf("lod_bias "); + else + printf("lod "); + } + + if (!ctrl.calc_gradients) + printf("grad "); + } + + if (ctrl.texel_offset) + printf("offset "); + } + } + + if (!dualTex) { + if (tex_index == -1) + printf("tex:indirect "); + else + printf("tex:%d ", tex_index); + + if (sampler_index == -1) + printf("samp:indirect "); + else + printf("samp:%d ", sampler_index); + } + break; + } + case ADD_VARYING_INTERP: { + unsigned addr = ADD.op & 0x1f; + if (addr < 0b10100) { + // direct addr + printf("%d", addr); + } else if (addr < 0b11000) { + if (addr == 22) + printf("fragw"); + else if (addr == 23) + printf("fragz"); + else + printf("unk%d", addr); + } else { + dump_src(ADD.op & 0x7, regs, consts, false); + } + printf(", "); + dump_src(ADD.src0, regs, consts, false); + break; + } + case ADD_VARYING_ADDRESS: { + dump_src(ADD.src0, regs, consts, false); + printf(", "); + dump_src(ADD.op & 0x7, regs, consts, false); + printf(", "); + unsigned location = (ADD.op >> 3) & 0x1f; + if (location < 16) { + printf("location:%d", location); + } else 
if (location == 20) { + printf("location:%u", (uint32_t) get_const(consts, regs)); + } else if (location == 21) { + printf("location:%u", (uint32_t) (get_const(consts, regs) >> 32)); + } else { + printf("location:%d(unk)", location); + } + break; + } + case ADD_LOAD_ATTR: + printf("location:%d, ", (ADD.op >> 3) & 0xf); + case ADD_TWO_SRC: + dump_src(ADD.src0, regs, consts, false); + printf(", "); + dump_src(ADD.op & 0x7, regs, consts, false); + break; + case ADD_THREE_SRC: + dump_src(ADD.src0, regs, consts, false); + printf(", "); + dump_src(ADD.op & 0x7, regs, consts, false); + printf(", "); + dump_src((ADD.op >> 3) & 0x7, regs, consts, false); + break; + case ADD_FADD: + case ADD_FMINMAX: + if (ADD.op & 0x10) + printf("-"); + if (ADD.op & 0x1000) + printf("abs("); + dump_src(ADD.src0, regs, consts, false); + switch ((ADD.op >> 6) & 0x3) { + case 3: + printf(".x"); + break; + default: + break; + } + if (ADD.op & 0x1000) + printf(")"); + printf(", "); + if (ADD.op & 0x20) + printf("-"); + if (ADD.op & 0x8) + printf("abs("); + dump_src(ADD.op & 0x7, regs, consts, false); + switch ((ADD.op >> 6) & 0x3) { + case 1: + case 3: + printf(".x"); + break; + case 2: + printf(".y"); + break; + case 0: + break; + default: + printf(".unk"); + break; + } + if (ADD.op & 0x8) + printf(")"); + break; + case ADD_FADD16: + if (ADD.op & 0x10) + printf("-"); + if (ADD.op & 0x1000) + printf("abs("); + dump_src(ADD.src0, regs, consts, false); + if (ADD.op & 0x1000) + printf(")"); + dump_16swizzle((ADD.op >> 6) & 0x3); + printf(", "); + if (ADD.op & 0x20) + printf("-"); + if (ADD.op & 0x8) + printf("abs("); + dump_src(ADD.op & 0x7, regs, consts, false); + dump_16swizzle((ADD.op >> 8) & 0x3); + if (ADD.op & 0x8) + printf(")"); + break; + case ADD_FMINMAX16: { + bool abs1 = ADD.op & 0x8; + bool abs2 = (ADD.op & 0x7) < ADD.src0; + if (ADD.op & 0x10) + printf("-"); + if (abs1 || abs2) + printf("abs("); + dump_src(ADD.src0, regs, consts, false); + dump_16swizzle((ADD.op >> 6) & 0x3); + if (abs1 || abs2) + printf(")"); + printf(", "); + if (ADD.op & 0x20) + printf("-"); + if (abs1 && abs2) + printf("abs("); + dump_src(ADD.op & 0x7, regs, consts, false); + dump_16swizzle((ADD.op >> 8) & 0x3); + if (abs1 && abs2) + printf(")"); + break; + } + case ADD_FADDMscale: { + if (ADD.op & 0x400) + printf("-"); + if (ADD.op & 0x200) + printf("abs("); + dump_src(ADD.src0, regs, consts, false); + if (ADD.op & 0x200) + printf(")"); + + printf(", "); + + if (ADD.op & 0x800) + printf("-"); + dump_src(ADD.op & 0x7, regs, consts, false); + + printf(", "); + + dump_src((ADD.op >> 3) & 0x7, regs, consts, false); + break; + } + case ADD_FCMP: + if (ADD.op & 0x400) { + printf("-"); + } + if (ADD.op & 0x100) { + printf("abs("); + } + dump_src(ADD.src0, regs, consts, false); + switch ((ADD.op >> 6) & 0x3) { + case 3: + printf(".x"); + break; + default: + break; + } + if (ADD.op & 0x100) { + printf(")"); + } + printf(", "); + if (ADD.op & 0x200) { + printf("abs("); + } + dump_src(ADD.op & 0x7, regs, consts, false); + switch ((ADD.op >> 6) & 0x3) { + case 1: + case 3: + printf(".x"); + break; + case 2: + printf(".y"); + break; + case 0: + break; + default: + printf(".unk"); + break; + } + if (ADD.op & 0x200) { + printf(")"); + } + break; + case ADD_FCMP16: + dump_src(ADD.src0, regs, consts, false); + dump_16swizzle((ADD.op >> 6) & 0x3); + printf(", "); + dump_src(ADD.op & 0x7, regs, consts, false); + dump_16swizzle((ADD.op >> 8) & 0x3); + break; + case ADD_BRANCH: { + enum branch_code code = (enum branch_code) ((ADD.op >> 6) & 0x3f); + enum 
branch_bit_size size = (enum branch_bit_size) ((ADD.op >> 9) & 0x7); + if (code != BR_ALWAYS) { + dump_src(ADD.src0, regs, consts, false); + switch (size) { + case BR_SIZE_16XX: + printf(".x"); + break; + case BR_SIZE_16YY: + case BR_SIZE_16YX0: + case BR_SIZE_16YX1: + printf(".y"); + break; + case BR_SIZE_ZERO: { + unsigned ctrl = (ADD.op >> 1) & 0x3; + switch (ctrl) { + case 1: + printf(".y"); + break; + case 2: + printf(".x"); + break; + default: + break; + } + } + default: + break; + } + printf(", "); + } + if (code != BR_ALWAYS && size != BR_SIZE_ZERO) { + dump_src(ADD.op & 0x7, regs, consts, false); + switch (size) { + case BR_SIZE_16XX: + case BR_SIZE_16YX0: + case BR_SIZE_16YX1: + case BR_SIZE_32_AND_16X: + printf(".x"); + break; + case BR_SIZE_16YY: + case BR_SIZE_32_AND_16Y: + printf(".y"); + break; + default: + break; + } + printf(", "); + } + // I haven't had the chance to test if this actually specifies the + // branch offset, since I couldn't get it to produce values other + // than 5 (uniform/const high), but these three bits are always + // consistent across branch instructions, so it makes sense... + int offsetSrc = (ADD.op >> 3) & 0x7; + if (offsetSrc == 4 || offsetSrc == 5) { + // If the offset is known/constant, we can decode it + uint32_t raw_offset; + if (offsetSrc == 4) + raw_offset = get_const(consts, regs); + else + raw_offset = get_const(consts, regs) >> 32; + // The high 4 bits are flags, while the rest is the + // twos-complement offset in bytes (here we convert to + // clauses). + int32_t branch_offset = ((int32_t) raw_offset << 4) >> 8; + + // If high4 is the high 4 bits of the last 64-bit constant, + // this is calculated as (high4 + 4) & 0xf, or 0 if the branch + // offset itself is the last constant. Not sure if this is + // actually used, or just garbage in unused bits, but in any + // case, we can just ignore it here since it's redundant. Note + // that if there is any padding, this will be 4 since the + // padding counts as the last constant. + unsigned flags = raw_offset >> 28; + (void) flags; + + // Note: the offset is in bytes, relative to the beginning of the + // current clause, so a zero offset would be a loop back to the + // same clause (annoyingly different from Midgard). 
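// Illustrative worked example of the decode above (editorial, not part of
// the imported file): with raw_offset = 0x0ffffff0, the flag nibble is 0x0
// and the remaining 28 bits encode -16 bytes in two's complement.
// ((int32_t) raw_offset << 4) gives 0xffffff00 = -256, and the arithmetic
// >> 8 both sign-extends and divides by 16 (one 16-byte instruction word),
// so branch_offset = -1, i.e. a branch back to the previous word.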
+ printf("clause_%d", offset + branch_offset); + } else { + dump_src(offsetSrc, regs, consts, false); + } + } + } + if (info.has_data_reg) { + printf(", R%d", data_reg); + } + printf("\n"); } -static void dump_clause(FILE *fp, uint32_t *words, unsigned *size, unsigned offset, bool verbose) +void dump_instr(const struct bifrost_alu_inst *instr, struct bifrost_regs next_regs, uint64_t *consts, + unsigned data_reg, unsigned offset, bool verbose) +{ + struct bifrost_regs regs; + memcpy((char *) ®s, (char *) &instr->reg_bits, sizeof(regs)); + + if (verbose) { + printf("# regs: %016" PRIx64 "\n", instr->reg_bits); + dump_regs(regs); + } + dump_fma(instr->fma_bits, regs, next_regs, consts, verbose); + dump_add(instr->add_bits, regs, next_regs, consts, data_reg, offset, verbose); +} + +bool dump_clause(uint32_t *words, unsigned *size, unsigned offset, bool verbose) { // State for a decoded clause struct bifrost_alu_inst instrs[8] = {}; - struct bi_constants consts = {}; + uint64_t consts[6] = {}; unsigned num_instrs = 0; unsigned num_consts = 0; uint64_t header_bits = 0; + bool stopbit = false; unsigned i; for (i = 0; ; i++, words += 4) { if (verbose) { - fprintf(fp, "# "); + printf("# "); for (int j = 0; j < 4; j++) - fprintf(fp, "%08x ", words[3 - j]); // low bit on the right - fprintf(fp, "\n"); + printf("%08x ", words[3 - j]); // low bit on the right + printf("\n"); } unsigned tag = bits(words[0], 0, 8); @@ -469,45 +2078,39 @@ static void dump_clause(FILE *fp, uint32_t *words, unsigned *size, unsigned offs uint64_t const0 = bits(words[0], 8, 32) << 4 | (uint64_t) words[1] << 28 | bits(words[2], 0, 4) << 60; uint64_t const1 = bits(words[2], 4, 32) << 4 | (uint64_t) words[3] << 32; - /* Z-bit */ bool stop = tag & 0x40; if (verbose) { - fprintf(fp, "# tag: 0x%02x\n", tag); + printf("# tag: 0x%02x\n", tag); } if (tag & 0x80) { - /* Format 5 or 10 */ unsigned idx = stop ? 
5 : 2; main_instr.add_bits |= ((tag >> 3) & 0x7) << 17; instrs[idx + 1] = main_instr; instrs[idx].add_bits = bits(words[3], 0, 17) | ((tag & 0x7) << 17); instrs[idx].fma_bits |= bits(words[2], 19, 32) << 10; - consts.raw[0] = bits(words[3], 17, 32) << 4; + consts[0] = bits(words[3], 17, 32) << 4; } else { bool done = false; switch ((tag >> 3) & 0x7) { case 0x0: switch (tag & 0x7) { case 0x3: - /* Format 1 */ main_instr.add_bits |= bits(words[3], 29, 32) << 17; instrs[1] = main_instr; num_instrs = 2; done = stop; break; case 0x4: - /* Format 3 */ instrs[2].add_bits = bits(words[3], 0, 17) | bits(words[3], 29, 32) << 17; instrs[2].fma_bits |= bits(words[2], 19, 32) << 10; - consts.raw[0] = const0; - decode_M(&consts.mods[0], bits(words[2], 4, 8), bits(words[2], 8, 12), true); + consts[0] = const0; num_instrs = 3; num_consts = 1; done = stop; break; case 0x1: case 0x5: - /* Format 4 */ instrs[2].add_bits = bits(words[3], 0, 17) | bits(words[3], 29, 32) << 17; instrs[2].fma_bits |= bits(words[2], 19, 32) << 10; main_instr.add_bits |= bits(words[3], 26, 29) << 17; @@ -518,17 +2121,14 @@ static void dump_clause(FILE *fp, uint32_t *words, unsigned *size, unsigned offs } break; case 0x6: - /* Format 8 */ instrs[5].add_bits = bits(words[3], 0, 17) | bits(words[3], 29, 32) << 17; instrs[5].fma_bits |= bits(words[2], 19, 32) << 10; - consts.raw[0] = const0; - decode_M(&consts.mods[0], bits(words[2], 4, 8), bits(words[2], 8, 12), true); + consts[0] = const0; num_instrs = 6; num_consts = 1; done = stop; break; case 0x7: - /* Format 9 */ instrs[5].add_bits = bits(words[3], 0, 17) | bits(words[3], 29, 32) << 17; instrs[5].fma_bits |= bits(words[2], 19, 32) << 10; main_instr.add_bits |= bits(words[3], 26, 29) << 17; @@ -537,23 +2137,21 @@ static void dump_clause(FILE *fp, uint32_t *words, unsigned *size, unsigned offs done = stop; break; default: - unreachable("[INSTR_INVALID_ENC] Invalid tag bits"); + printf("unknown tag bits 0x%02x\n", tag); } break; case 0x2: case 0x3: { - /* Format 6 or 11 */ unsigned idx = ((tag >> 3) & 0x7) == 2 ? 4 : 7; main_instr.add_bits |= (tag & 0x7) << 17; instrs[idx] = main_instr; - consts.raw[0] |= (bits(words[2], 19, 32) | ((uint64_t) words[3] << 13)) << 19; + consts[0] |= (bits(words[2], 19, 32) | ((uint64_t) words[3] << 13)) << 19; num_consts = 1; num_instrs = idx + 1; done = stop; break; } case 0x4: { - /* Format 2 */ unsigned idx = stop ? 
4 : 1; main_instr.add_bits |= (tag & 0x7) << 17; instrs[idx] = main_instr; @@ -562,69 +2160,56 @@ static void dump_clause(FILE *fp, uint32_t *words, unsigned *size, unsigned offs break; } case 0x1: - /* Format 0 - followed by constants */ + // only constants can come after this num_instrs = 1; done = stop; - FALLTHROUGH; case 0x5: - /* Format 0 - followed by instructions */ header_bits = bits(words[2], 19, 32) | ((uint64_t) words[3] << (32 - 19)); main_instr.add_bits |= (tag & 0x7) << 17; instrs[0] = main_instr; break; case 0x6: case 0x7: { - /* Format 12 */ unsigned pos = tag & 0xf; - - struct { - unsigned const_idx; - unsigned nr_tuples; - } pos_table[0x10] = { - { 0, 1 }, - { 0, 2 }, - { 0, 4 }, - { 1, 3 }, - { 1, 5 }, - { 2, 4 }, - { 0, 7 }, - { 1, 6 }, - { 3, 5 }, - { 1, 8 }, - { 2, 7 }, - { 3, 6 }, - { 3, 8 }, - { 4, 7 }, - { 5, 6 }, - { ~0, ~0 } - }; - - ASSERTED bool valid_count = pos_table[pos].nr_tuples == num_instrs; - assert(valid_count && "INSTR_INVALID_ENC"); - - unsigned const_idx = pos_table[pos].const_idx; - + // note that `pos' encodes both the total number of + // instructions and the position in the constant stream, + // presumably because decoded constants and instructions + // share a buffer in the decoder, but we only care about + // the position in the constant stream; the total number of + // instructions is redundant. + unsigned const_idx = 7; + switch (pos) { + case 0: + case 1: + case 2: + case 6: + const_idx = 0; + break; + case 3: + case 4: + case 7: + case 9: + const_idx = 1; + break; + case 5: + case 0xa: + const_idx = 2; + break; + case 8: + case 0xb: + case 0xc: + const_idx = 3; + break; + case 0xd: + const_idx = 4; + break; + default: + printf("# unknown pos 0x%x\n", pos); + } if (num_consts < const_idx + 2) num_consts = const_idx + 2; - - consts.raw[const_idx] = const0; - consts.raw[const_idx + 1] = const1; - - /* Calculate M values from A, B and 4-bit - * unsigned arithmetic. 
Mathematically it - * should be (A - B) % 16 but we use this - * alternate form to avoid sign issues */ - - unsigned A1 = bits(words[2], 0, 4); - unsigned B1 = bits(words[3], 28, 32); - unsigned A2 = bits(words[1], 0, 4); - unsigned B2 = bits(words[2], 28, 32); - - unsigned M1 = (16 + A1 - B1) & 0xF; - unsigned M2 = (16 + A2 - B2) & 0xF; - - decode_M(&consts.mods[const_idx], M1, M2, false); - + consts[const_idx] = const0; + consts[const_idx + 1] = const1; done = stop; break; } @@ -640,16 +2225,18 @@ static void dump_clause(FILE *fp, uint32_t *words, unsigned *size, unsigned offs *size = i + 1; if (verbose) { - fprintf(fp, "# header: %012" PRIx64 "\n", header_bits); + printf("# header: %012" PRIx64 "\n", header_bits); } struct bifrost_header header; memcpy((char *) &header, (char *) &header_bits, sizeof(struct bifrost_header)); - dump_header(fp, header, verbose); + dump_header(header, verbose); + if (!header.no_end_of_shader) + stopbit = true; - fprintf(fp, "{\n"); + printf("{\n"); for (i = 0; i < num_instrs; i++) { - struct bifrost_regs regs, next_regs; + struct bifrost_regs next_regs; if (i + 1 == num_instrs) { memcpy((char *) &next_regs, (char *) &instrs[0].reg_bits, sizeof(next_regs)); @@ -658,51 +2245,36 @@ static void dump_clause(FILE *fp, uint32_t *words, unsigned *size, unsigned offs sizeof(next_regs)); } - memcpy((char *) ®s, (char *) &instrs[i].reg_bits, sizeof(regs)); - - if (verbose) { - fprintf(fp, " # regs: %016" PRIx64 "\n", instrs[i].reg_bits); - dump_regs(fp, regs, i == 0); - } - - bi_disasm_fma(fp, instrs[i].fma_bits, ®s, &next_regs, - header.staging_register, offset, &consts, - i + 1 == num_instrs); - - bi_disasm_add(fp, instrs[i].add_bits, ®s, &next_regs, - header.staging_register, offset, &consts, - i + 1 == num_instrs); + dump_instr(&instrs[i], next_regs, consts, header.datareg, offset, verbose); } - fprintf(fp, "}\n"); + printf("}\n"); if (verbose) { for (unsigned i = 0; i < num_consts; i++) { - fprintf(fp, "# const%d: %08" PRIx64 "\n", 2 * i, consts.raw[i] & 0xffffffff); - fprintf(fp, "# const%d: %08" PRIx64 "\n", 2 * i + 1, consts.raw[i] >> 32); + printf("# const%d: %08" PRIx64 "\n", 2 * i, consts[i] & 0xffffffff); + printf("# const%d: %08" PRIx64 "\n", 2 * i + 1, consts[i] >> 32); } } - - fprintf(fp, "\n"); - return; + return stopbit; } -void disassemble_bifrost(FILE *fp, uint8_t *code, size_t size, bool verbose) +void disassemble_bifrost(uint8_t *code, size_t size, bool verbose) { uint32_t *words = (uint32_t *) code; uint32_t *words_end = words + (size / 4); // used for displaying branch targets unsigned offset = 0; while (words != words_end) { - /* Shaders have zero bytes at the end for padding; stop - * disassembling when we hit them. */ - if (*words == 0) + // we don't know what the program-end bit is quite yet, so for now just + // assume that an all-0 quadword is padding + uint32_t zero[4] = {}; + if (memcmp(words, zero, 4 * sizeof(uint32_t)) == 0) break; - - fprintf(fp, "clause_%u:\n", offset); - + printf("clause_%d:\n", offset); unsigned size; - dump_clause(fp, words, &size, offset, verbose); - + if (dump_clause(words, &size, offset, verbose) == true) { + break; + } words += size * 4; offset += size; } diff --git a/lib/mesa/src/panfrost/bifrost/disassemble.h b/lib/mesa/src/panfrost/bifrost/disassemble.h index 1e39c20d6..f5bce2e30 100644 --- a/lib/mesa/src/panfrost/bifrost/disassemble.h +++ b/lib/mesa/src/panfrost/bifrost/disassemble.h @@ -23,25 +23,7 @@ * SOFTWARE. 
*/ -#ifndef __BI_DISASM_H -#define __BI_DISASM_H - #include <stdbool.h> #include <stddef.h> #include <stdint.h> -#include <stdio.h> -#include "bifrost.h" - -void disassemble_bifrost(FILE *fp, uint8_t *code, size_t size, bool verbose); - -void -bi_disasm_fma(FILE *fp, unsigned bits, struct bifrost_regs *srcs, struct bifrost_regs *next_regs, unsigned staging_register, unsigned branch_offset, struct bi_constants *consts, bool first); - -void bi_disasm_add(FILE *fp, unsigned bits, struct bifrost_regs *srcs, struct bifrost_regs *next_regs, unsigned staging_register, unsigned branch_offset, struct bi_constants *consts, bool first); - -void bi_disasm_dest_fma(FILE *fp, struct bifrost_regs *next_regs, bool first); -void bi_disasm_dest_add(FILE *fp, struct bifrost_regs *next_regs, bool first); - -void dump_src(FILE *fp, unsigned src, struct bifrost_regs srcs, unsigned branch_offset, struct bi_constants *consts, bool isFMA); - -#endif +void disassemble_bifrost(uint8_t *code, size_t size, bool verbose); diff --git a/lib/mesa/src/panfrost/bifrost/meson.build b/lib/mesa/src/panfrost/bifrost/meson.build index 63d1560cc..b49170a35 100644 --- a/lib/mesa/src/panfrost/bifrost/meson.build +++ b/lib/mesa/src/panfrost/bifrost/meson.build @@ -19,172 +19,20 @@ # OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE # SOFTWARE. -subdir('valhall') - -inc_valhall = include_directories(['.', 'valhall']) - libpanfrost_bifrost_files = files( - 'bi_helper_invocations.c', - 'bi_layout.c', - 'bi_liveness.c', - 'bi_lower_divergent_indirects.c', - 'bi_lower_swizzle.c', - 'bi_print.c', - 'bi_opt_constant_fold.c', - 'bi_opt_copy_prop.c', - 'bi_opt_dce.c', - 'bi_opt_cse.c', - 'bi_opt_push_ubo.c', - 'bi_opt_message_preload.c', - 'bi_opt_mod_props.c', - 'bi_opt_dual_tex.c', - 'bi_pressure_schedule.c', - 'bi_pack.c', - 'bi_ra.c', - 'bi_schedule.c', - 'bi_scoreboard.c', - 'bi_validate.c', - 'bir.c', 'bifrost_compile.c', - 'valhall/va_insert_flow.c', - 'valhall/va_lower_constants.c', - 'valhall/va_lower_isel.c', - 'valhall/va_lower_split_64bit.c', - 'valhall/va_optimize.c', - 'valhall/va_mark_last.c', - 'valhall/va_merge_flow.c', - 'valhall/va_pack.c', - 'valhall/va_perf.c', - 'valhall/va_validate.c', -) - -bifrost_gen_disasm_c = custom_target( - 'bifrost_gen_disasm.c', - input : ['gen_disasm.py', 'ISA.xml'], - output : 'bifrost_gen_disasm.c', - command : [prog_python, '@INPUT@'], - capture : true, - depend_files : files('bifrost_isa.py'), -) - -bi_opcodes_c = custom_target( - 'bi_opcodes.c', - input : ['bi_opcodes.c.py', 'ISA.xml'], - output : 'bi_opcodes.c', - command : [prog_python, '@INPUT@'], - capture : true, - depend_files : files('bifrost_isa.py'), -) - -bi_printer_c = custom_target( - 'bi_printer.c', - input : ['bi_printer.c.py', 'ISA.xml'], - output : 'bi_printer.c', - command : [prog_python, '@INPUT@'], - capture : true, - depend_files : files('bifrost_isa.py'), -) - -bi_packer_c = custom_target( - 'bi_packer.c', - input : ['bi_packer.c.py', 'ISA.xml'], - output : 'bi_packer.c', - command : [prog_python, '@INPUT@'], - capture : true, - depend_files : files('bifrost_isa.py'), -) - -bi_opcodes_h = custom_target( - 'bi_opcodes.h', - input : ['bi_opcodes.h.py', 'ISA.xml'], - output : 'bi_opcodes.h', - command : [prog_python, '@INPUT@'], - capture : true, - depend_files : files('bifrost_isa.py'), -) - -idep_bi_opcodes_h = declare_dependency( - sources : [bi_opcodes_h], - include_directories : include_directories('.'), -) - -bi_builder_h = custom_target( - 'bi_builder.h', - input : ['bi_builder.h.py', 
'ISA.xml'], - output : 'bi_builder.h', - command : [prog_python, '@INPUT@'], - capture : true, - depend_files : files('bifrost_isa.py'), -) - -idep_bi_builder_h = declare_dependency( - sources : [bi_builder_h], - include_directories : include_directories('.'), -) - -bifrost_nir_algebraic_c = custom_target( - 'bifrost_nir_algebraic.c', - input : 'bifrost_nir_algebraic.py', - output : 'bifrost_nir_algebraic.c', - command : [ - prog_python, '@INPUT@', '-p', dir_compiler_nir, - ], - capture : true, - depend_files : nir_algebraic_depends, -) - -libpanfrost_bifrost_disasm = static_library( - 'panfrost_bifrost_disasm', - ['disassemble.c', 'bi_print_common.c', bifrost_gen_disasm_c], - include_directories : [inc_include, inc_src, inc_mapi, inc_mesa, inc_gallium, inc_gallium_aux, inc_panfrost_hw], - dependencies: [idep_nir], - link_with: [libpanfrost_util], - c_args : [no_override_init_args], - gnu_symbol_visibility : 'hidden', - build_by_default : false, + 'bifrost_opts.c', + 'bifrost_sched.c', + 'bifrost_print.c', + 'disassemble.c', ) libpanfrost_bifrost = static_library( 'panfrost_bifrost', - [libpanfrost_bifrost_files, bi_opcodes_c, bi_printer_c, bi_packer_c, bifrost_nir_algebraic_c, valhall_c], - include_directories : [inc_include, inc_src, inc_mapi, inc_mesa, inc_gallium, inc_gallium_aux, inc_panfrost_hw, inc_valhall], - dependencies: [idep_nir, idep_bi_opcodes_h, idep_bi_builder_h, idep_valhall_enums_h], - link_with: [libpanfrost_util, libpanfrost_bifrost_disasm, libpanfrost_valhall_disasm], - c_args : [no_override_init_args], - gnu_symbol_visibility : 'hidden', + [libpanfrost_bifrost_files], + include_directories : [inc_common, inc_include, inc_src], + dependencies: [idep_nir], + c_args : [c_vis_args, no_override_init_args], + cpp_args : [cpp_vis_args], build_by_default : false, ) - -if with_tests - test( - 'bifrost_tests', - executable( - 'bifrost_tests', - files( - 'test/test-constant-fold.cpp', - 'test/test-dual-texture.cpp', - 'test/test-lower-swizzle.cpp', - 'test/test-message-preload.cpp', - 'test/test-optimizer.cpp', - 'test/test-pack-formats.cpp', - 'test/test-packing.cpp', - 'test/test-scheduler-predicates.cpp', - 'valhall/test/test-add-imm.cpp', - 'valhall/test/test-validate-fau.cpp', - 'valhall/test/test-insert-flow.cpp', - 'valhall/test/test-lower-isel.cpp', - 'valhall/test/test-lower-constants.cpp', - 'valhall/test/test-mark-last.cpp', - 'valhall/test/test-merge-flow.cpp', - 'valhall/test/test-packing.cpp', - ), - c_args : [c_msvc_compat_args, no_override_init_args], - gnu_symbol_visibility : 'hidden', - include_directories : [inc_include, inc_src, inc_mesa, inc_valhall], - dependencies: [idep_gtest, idep_nir, idep_bi_opcodes_h, idep_bi_builder_h], - link_with : [libpanfrost_bifrost], - ), - suite : ['panfrost'], - protocol : gtest_test_protocol, - ) -endif |
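The simplified entry point exported by the new disassemble.h, void disassemble_bifrost(uint8_t *code, size_t size, bool verbose), only needs a raw code buffer. A minimal stand-alone caller as a sketch; the shader.bin argument and the file-reading boilerplate are illustrative and not part of this import:

#include <stdbool.h>
#include <stdint.h>
#include <stdio.h>
#include <stdlib.h>

void disassemble_bifrost(uint8_t *code, size_t size, bool verbose);

int
main(int argc, char **argv)
{
        if (argc < 2) {
                fprintf(stderr, "usage: %s shader.bin\n", argv[0]);
                return 1;
        }

        FILE *f = fopen(argv[1], "rb");
        if (!f) {
                perror(argv[1]);
                return 1;
        }

        fseek(f, 0, SEEK_END);
        long len = ftell(f);
        fseek(f, 0, SEEK_SET);

        uint8_t *code = malloc(len);
        if (!code || fread(code, 1, len, f) != (size_t) len) {
                fclose(f);
                free(code);
                return 1;
        }
        fclose(f);

        disassemble_bifrost(code, len, /* verbose */ false);
        free(code);
        return 0;
}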