author    Jonathan Gray <jsg@cvs.openbsd.org>    2020-01-22 02:13:05 +0000
committer Jonathan Gray <jsg@cvs.openbsd.org>    2020-01-22 02:13:05 +0000
commit    01fb7c3270d8d1e1c53129a974587680aa129089 (patch)
tree      68033daecea5da5fcb45de5cbef65b8b3fc92845
parent    53b0736c56ca5142a5722eb827a3675ca08e123d (diff)
Import Mesa 19.2.8
-rw-r--r--  lib/mesa/src/mesa/math/m_vector_asm.h            |     2
-rw-r--r--  lib/mesa/src/panfrost/bifrost/bifrost.h          |   627
-rw-r--r--  lib/mesa/src/panfrost/bifrost/bifrost_compile.c  |  5809
-rw-r--r--  lib/mesa/src/panfrost/bifrost/bifrost_compile.h  |    72
-rw-r--r--  lib/mesa/src/panfrost/bifrost/cmdline.c          |   286
-rw-r--r--  lib/mesa/src/panfrost/bifrost/disassemble.c      |  2292
-rw-r--r--  lib/mesa/src/panfrost/bifrost/disassemble.h      |    20
-rw-r--r--  lib/mesa/src/panfrost/bifrost/meson.build        |   170
8 files changed, 2799 insertions, 6479 deletions
diff --git a/lib/mesa/src/mesa/math/m_vector_asm.h b/lib/mesa/src/mesa/math/m_vector_asm.h
index 90de44b0a..60cf1ec8f 100644
--- a/lib/mesa/src/mesa/math/m_vector_asm.h
+++ b/lib/mesa/src/mesa/math/m_vector_asm.h
@@ -52,6 +52,6 @@
* _math_matrix_set_identity().
*/
#define MATRIX_M 0
-#define MATRIX_INV (MATRIX_M + 16 * 4)
+#define MATRIX_INV (MATRIX_M + MATH_ASM_PTR_SIZE)
#endif /* _M_VECTOR_ASM_H */
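A minimal sketch of the layout this hunk assumes (illustrative names; the offsets appear to correspond to Mesa's GLmatrix in math/m_matrix.h, whose first two fields are pointers to the matrix data and its inverse, so the offset of the inverse pointer is one pointer, not 16 floats):

    /* Illustrative only: assumed layout behind MATRIX_M / MATRIX_INV. */
    struct illustrative_matrix {
            float *m;    /* MATRIX_M   == 0                */
            float *inv;  /* MATRIX_INV == sizeof(float *)  */
    };
    /* MATH_ASM_PTR_SIZE is assumed to be the pointer size: 4 or 8. */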
diff --git a/lib/mesa/src/panfrost/bifrost/bifrost.h b/lib/mesa/src/panfrost/bifrost/bifrost.h
index 9d95de562..aa382b43b 100644
--- a/lib/mesa/src/panfrost/bifrost/bifrost.h
+++ b/lib/mesa/src/panfrost/bifrost/bifrost.h
@@ -28,607 +28,58 @@
#include <stdint.h>
#include <stdbool.h>
-#include <string.h>
-#include <assert.h>
-
-#ifdef __cplusplus
-extern "C" {
-#endif
-
-#define BIFROST_DBG_MSGS 0x0001
-#define BIFROST_DBG_SHADERS 0x0002
-#define BIFROST_DBG_SHADERDB 0x0004
-#define BIFROST_DBG_VERBOSE 0x0008
-#define BIFROST_DBG_INTERNAL 0x0010
-#define BIFROST_DBG_NOSCHED 0x0020
-#define BIFROST_DBG_INORDER 0x0040
-#define BIFROST_DBG_NOVALIDATE 0x0080
-#define BIFROST_DBG_NOOPT 0x0100
-#define BIFROST_DBG_NOIDVS 0x0200
-#define BIFROST_DBG_NOSB 0x0400
-#define BIFROST_DBG_NOPRELOAD 0x0800
-#define BIFROST_DBG_SPILL 0x1000
-#define BIFROST_DBG_NOPSCHED 0x2000
-
-extern int bifrost_debug;
-
-enum bifrost_message_type {
- BIFROST_MESSAGE_NONE = 0,
- BIFROST_MESSAGE_VARYING = 1,
- BIFROST_MESSAGE_ATTRIBUTE = 2,
- BIFROST_MESSAGE_TEX = 3,
- BIFROST_MESSAGE_VARTEX = 4,
- BIFROST_MESSAGE_LOAD = 5,
- BIFROST_MESSAGE_STORE = 6,
- BIFROST_MESSAGE_ATOMIC = 7,
- BIFROST_MESSAGE_BARRIER = 8,
- BIFROST_MESSAGE_BLEND = 9,
- BIFROST_MESSAGE_TILE = 10,
- /* type 11 reserved */
- BIFROST_MESSAGE_Z_STENCIL = 12,
- BIFROST_MESSAGE_ATEST = 13,
- BIFROST_MESSAGE_JOB = 14,
- BIFROST_MESSAGE_64BIT = 15
-};
-
-enum bifrost_ftz {
- BIFROST_FTZ_DISABLE = 0,
- BIFROST_FTZ_DX11 = 1,
- BIFROST_FTZ_ALWAYS = 2,
- BIFROST_FTZ_ABRUPT = 3
-};
-
-enum bifrost_exceptions {
- BIFROST_EXCEPTIONS_ENABLED = 0,
- BIFROST_EXCEPTIONS_DISABLED = 1,
- BIFROST_EXCEPTIONS_PRECISE_DIVISION = 2,
- BIFROST_EXCEPTIONS_PRECISE_SQRT = 3,
-};
-
-/* Describes clause flow control, with respect to control flow and branch
- * reconvergence.
- *
- * Control flow may be considered back-to-back (execute clauses back-to-back),
- * non-back-to-back (switch warps after clause before the next clause), write
- * elision (back-to-back and elide register slot #3 write from the clause), or
- * end of shader.
- *
- * Branch reconvergence may be disabled, enabled unconditionally, or enabled
- * based on the program counter. A clause requires reconvergence if it has a
- * successor that can be executed without first executing the clause itself.
- * Separate iterations of a loop are treated separately here, so it is also the
- * case for a loop exit where the iteration count is not warp-invariant.
- *
- */
-
-enum bifrost_flow {
- /* End-of-shader */
- BIFROST_FLOW_END = 0,
-
- /* Non back-to-back, PC-encoded reconvergence */
- BIFROST_FLOW_NBTB_PC = 1,
-
- /* Non back-to-back, unconditional reconvergence */
- BIFROST_FLOW_NBTB_UNCONDITIONAL = 2,
-
- /* Non back-to-back, no reconvergence */
- BIFROST_FLOW_NBTB = 3,
-
- /* Back-to-back, unconditional reconvergence */
- BIFROST_FLOW_BTB_UNCONDITIONAL = 4,
-
- /* Back-to-back, no reconvergence */
- BIFROST_FLOW_BTB_NONE = 5,
-
- /* Write elision, unconditional reconvergence */
- BIFROST_FLOW_WE_UNCONDITIONAL = 6,
-
- /* Write elision, no reconvergence */
- BIFROST_FLOW_WE = 7,
-};
-
-enum bifrost_slot {
- /* 0-5 are general purpose */
- BIFROST_SLOT_ELDEST_DEPTH = 6,
- BIFROST_SLOT_ELDEST_COLOUR = 7,
-};
struct bifrost_header {
- /* Reserved */
- unsigned zero1 : 5;
-
- /* Flush-to-zero mode, leave zero for GL */
- enum bifrost_ftz flush_to_zero : 2;
-
- /* Convert any infinite result of any floating-point operation to the
- * biggest representable number */
+ unsigned unk0 : 7;
+ // If true, convert any infinite result of any floating-point operation to
+ // the biggest representable number.
unsigned suppress_inf: 1;
-
- /* Convert NaN to +0.0 */
+ // Convert any NaN results to 0.
unsigned suppress_nan : 1;
-
- /* Floating-point exception handling mode */
- enum bifrost_exceptions float_exceptions : 2;
-
- /* Enum describing the flow control, which matters for handling
- * divergence and reconvergence efficiently */
- enum bifrost_flow flow_control : 3;
-
- /* Reserved */
- unsigned zero2 : 1;
-
- /* Terminate discarded threads, rather than continuing execution. Set
- * for fragment shaders for standard GL behaviour of DISCARD. Also in a
- * fragment shader, this disables helper invocations, so cannot be used
- * in a shader that requires derivatives or texture LOD computation */
- unsigned terminate_discarded_threads : 1;
-
- /* If set, the hardware may prefetch the next clause. If false, the
- * hardware may not. Clear for unconditional branches. */
- unsigned next_clause_prefetch : 1;
-
- /* If set, a barrier will be inserted after the clause waiting for all
- * message passing instructions to read their staging registers, such
- * that it is safe for the next clause to write them. */
- unsigned staging_barrier: 1;
- unsigned staging_register : 6;
-
- /* Slots to wait on and slot to be used for message passing
- * instructions respectively */
- unsigned dependency_wait : 8;
- unsigned dependency_slot : 3;
-
- enum bifrost_message_type message_type : 5;
- enum bifrost_message_type next_message_type : 5;
-} __attribute__((packed));
-
-enum bifrost_packed_src {
- BIFROST_SRC_PORT0 = 0,
- BIFROST_SRC_PORT1 = 1,
- BIFROST_SRC_PORT2 = 2,
- BIFROST_SRC_STAGE = 3,
- BIFROST_SRC_FAU_LO = 4,
- BIFROST_SRC_FAU_HI = 5,
- BIFROST_SRC_PASS_FMA = 6,
- BIFROST_SRC_PASS_ADD = 7,
+ unsigned unk1 : 2;
+ // true if the execution mask of the next clause is the same as the mask of
+ // the current clause.
+ unsigned back_to_back : 1;
+ unsigned no_end_of_shader: 1;
+ unsigned unk2 : 2;
+ // Set to true for fragment shaders, to implement this bit of spec text
+ // from section 7.1.5 of the GLSL ES spec:
+ //
+ // "Stores to image and buffer variables performed by helper invocations
+ // have no effect on the underlying image or buffer memory."
+ //
+ // Helper invocations are threads (invocations) corresponding to pixels in
+ // a quad that aren't actually part of the triangle, but are included to
+ // make derivatives work correctly. They're usually turned on, but they
+ // need to be masked off for GLSL-level stores. This bit seems to be the
+ // only bit that's actually different between fragment shaders and other
+ // shaders, so this is probably what it's doing.
+ unsigned elide_writes : 1;
+ // If backToBack is off:
+ // - true for conditional branches and fallthrough
+ // - false for unconditional branches
+ // The blob seems to always set it to true if back-to-back is on.
+ unsigned branch_cond : 1;
+ // This bit is set when the next clause writes to the data register of some
+ // previous clause.
+ unsigned datareg_writebarrier: 1;
+ unsigned datareg : 6;
+ unsigned scoreboard_deps: 8;
+ unsigned scoreboard_index: 3;
+ unsigned clause_type: 4;
+ unsigned unk3 : 1; // part of clauseType?
+ unsigned next_clause_type: 4;
+ unsigned unk4 : 1; // part of nextClauseType?
};
struct bifrost_fma_inst {
unsigned src0 : 3;
unsigned op : 20;
-} __attribute__((packed));
+};
struct bifrost_add_inst {
unsigned src0 : 3;
unsigned op : 17;
-} __attribute__((packed));
-
-enum branch_bit_size {
- BR_SIZE_32 = 0,
- BR_SIZE_16XX = 1,
- BR_SIZE_16YY = 2,
- // For the above combinations of bitsize and location, an extra bit is
- // encoded via comparing the sources. The only possible source of ambiguity
- // would be if the sources were the same, but then the branch condition
- // would be always true or always false anyways, so we can ignore it. But
- // this no longer works when comparing the y component to the x component,
- // since it's valid to compare the y component of a source against its own
- // x component. Instead, the extra bit is encoded via an extra bitsize.
- BR_SIZE_16YX0 = 3,
- BR_SIZE_16YX1 = 4,
- BR_SIZE_32_AND_16X = 5,
- BR_SIZE_32_AND_16Y = 6,
- // Used for comparisons with zero and always-true, see below. I think this
- // only works for integer comparisons.
- BR_SIZE_ZERO = 7,
-};
-
-struct bifrost_regs {
- unsigned fau_idx : 8;
- unsigned reg3 : 6;
- unsigned reg2 : 6;
- unsigned reg0 : 5;
- unsigned reg1 : 6;
- unsigned ctrl : 4;
-} __attribute__((packed));
-
-#define BIFROST_FMTC_CONSTANTS 0b0011
-#define BIFROST_FMTC_FINAL 0b0111
-
-struct bifrost_fmt_constant {
- unsigned pos : 4;
- unsigned tag : 4;
- uint64_t imm_1 : 60;
- uint64_t imm_2 : 60;
-} __attribute__((packed));
-
-/* Clause formats, encoded in a table */
-
-enum bi_clause_subword {
- /* Literal 3-bit values */
- BI_CLAUSE_SUBWORD_LITERAL_0 = 0,
- /* etc */
- BI_CLAUSE_SUBWORD_LITERAL_7 = 7,
-
- /* The value of the corresponding tuple in the corresponding bits */
- BI_CLAUSE_SUBWORD_TUPLE_0 = 8,
- /* etc */
- BI_CLAUSE_SUBWORD_TUPLE_7 = 15,
-
- /* Clause header */
- BI_CLAUSE_SUBWORD_HEADER = 16,
-
- /* Leave zero, but semantically distinct from literal 0 */
- BI_CLAUSE_SUBWORD_RESERVED = 17,
-
- /* Embedded constant 0 */
- BI_CLAUSE_SUBWORD_CONSTANT = 18,
-
- /* M bits controlling modifier for the constant */
- BI_CLAUSE_SUBWORD_M = 19,
-
- /* Z bit: 1 to begin encoding constants, 0 to terminate the clause */
- BI_CLAUSE_SUBWORD_Z = 20,
-
- /* Upper 3-bits of a given tuple and zero extended */
- BI_CLAUSE_SUBWORD_UPPER_0 = 32,
- /* etc */
- BI_CLAUSE_SUBWORD_UPPER_7 = BI_CLAUSE_SUBWORD_UPPER_0 + 7,
-
- /* Upper 3-bits of two tuples, concatenated and zero-extended */
- BI_CLAUSE_SUBWORD_UPPER_23 = BI_CLAUSE_SUBWORD_UPPER_0 + 23,
- BI_CLAUSE_SUBWORD_UPPER_56 = BI_CLAUSE_SUBWORD_UPPER_0 + 56,
-};
-
-#define L(x) ((enum bi_clause_subword)(BI_CLAUSE_SUBWORD_LITERAL_0 + x))
-#define U(x) ((enum bi_clause_subword)(BI_CLAUSE_SUBWORD_UPPER_0 + x))
-#define T(x) ((enum bi_clause_subword)(BI_CLAUSE_SUBWORD_TUPLE_0 + x))
-#define EC BI_CLAUSE_SUBWORD_CONSTANT
-#define M BI_CLAUSE_SUBWORD_M
-#define Z BI_CLAUSE_SUBWORD_Z
-#define H BI_CLAUSE_SUBWORD_HEADER
-#define R BI_CLAUSE_SUBWORD_RESERVED
-
-struct bi_clause_format {
- unsigned format; /* format number */
- unsigned pos; /* index in the clause */
- enum bi_clause_subword tag_1; /* 2-bits */
- enum bi_clause_subword tag_2; /* 3-bits */
- enum bi_clause_subword tag_3; /* 3-bits */
- enum bi_clause_subword s0_s3; /* 60 bits */
- enum bi_clause_subword s4; /* 15 bits */
- enum bi_clause_subword s5_s6; /* 30 bits */
- enum bi_clause_subword s7; /* 15 bits */
-};
-
-static const struct bi_clause_format bi_clause_formats[] = {
- { 0, 0, L(0), L(5), U(0), T(0), T(0), H, H },
- { 0, 0, Z, L(1), U(0), T(0), T(0), H, H },
- { 1, 1, Z, L(0), L(3), T(1), T(1), R, U(1) },
- { 2, 1, L(0), L(4), U(1), T(1), T(1), T(2), T(2) },
- { 3, 2, Z, L(0), L(4), EC, M, T(2), U(2) },
- { 4, 2, L(0), L(0), L(1), T(3), T(3), T(2), U(23) },
- { 4, 2, Z, L(0), L(5), T(3), T(3), T(2), U(23) },
- { 5, 2, L(2), U(3), U(2), T(3), T(3), T(2), EC },
- { 6, 3, Z, L(2), U(4), T(4), T(4), EC, EC },
- { 7, 3, L(1), L(4), U(4), T(4), T(4), T(5), T(5) },
- { 8, 4, Z, L(0), L(6), EC, M, T(5), U(5) },
- { 9, 4, Z, L(0), L(7), T(6), T(6), T(5), U(56) },
- { 10, 4, L(3), U(6), U(5), T(6), T(6), T(5), EC },
- { 11, 5, Z, L(3), U(7), T(7), T(7), EC, EC },
-};
-
-#undef L
-#undef U
-#undef T
-#undef EC
-#undef M
-#undef Z
-#undef H
-#undef R
-
-/* 32-bit modes for slots 2/3, as encoded in the register block. Other values
- * are reserved. First part specifies behaviour of slot 2 (Idle, Read, Write
- * Full, Write Low, Write High), second part behaviour of slot 3, and the last
- * part specifies the source for the write (FMA, ADD, or MIX for FMA/ADD).
- *
- * IDLE is a special mode disabling both slots, except for the first
- * instruction in the clause which uses IDLE_1 for the same purpose.
- *
- * All fields 0 used as sentinel for reserved encoding, so IDLE(_1) have FMA
- * set (and ignored) as a placeholder to differentiate from reserved.
- */
-enum bifrost_reg_mode {
- BIFROST_R_WL_FMA = 1,
- BIFROST_R_WH_FMA = 2,
- BIFROST_R_W_FMA = 3,
- BIFROST_R_WL_ADD = 4,
- BIFROST_R_WH_ADD = 5,
- BIFROST_R_W_ADD = 6,
- BIFROST_WL_WL_ADD = 7,
- BIFROST_WL_WH_ADD = 8,
- BIFROST_WL_W_ADD = 9,
- BIFROST_WH_WL_ADD = 10,
- BIFROST_WH_WH_ADD = 11,
- BIFROST_WH_W_ADD = 12,
- BIFROST_W_WL_ADD = 13,
- BIFROST_W_WH_ADD = 14,
- BIFROST_W_W_ADD = 15,
- BIFROST_IDLE_1 = 16,
- BIFROST_I_W_FMA = 17,
- BIFROST_I_WL_FMA = 18,
- BIFROST_I_WH_FMA = 19,
- BIFROST_R_I = 20,
- BIFROST_I_W_ADD = 21,
- BIFROST_I_WL_ADD = 22,
- BIFROST_I_WH_ADD = 23,
- BIFROST_WL_WH_MIX = 24,
- BIFROST_WH_WL_MIX = 26,
- BIFROST_IDLE = 27,
};
-enum bifrost_reg_op {
- BIFROST_OP_IDLE = 0,
- BIFROST_OP_READ = 1,
- BIFROST_OP_WRITE = 2,
- BIFROST_OP_WRITE_LO = 3,
- BIFROST_OP_WRITE_HI = 4,
-};
-
-struct bifrost_reg_ctrl_23 {
- enum bifrost_reg_op slot2;
- enum bifrost_reg_op slot3;
- bool slot3_fma;
-};
-
-#ifndef __cplusplus
-static const struct bifrost_reg_ctrl_23 bifrost_reg_ctrl_lut[32] = {
- [BIFROST_R_WL_FMA] = { BIFROST_OP_READ, BIFROST_OP_WRITE_LO, true },
- [BIFROST_R_WH_FMA] = { BIFROST_OP_READ, BIFROST_OP_WRITE_HI, true },
- [BIFROST_R_W_FMA] = { BIFROST_OP_READ, BIFROST_OP_WRITE, true },
- [BIFROST_R_WL_ADD] = { BIFROST_OP_READ, BIFROST_OP_WRITE_LO, false },
- [BIFROST_R_WH_ADD] = { BIFROST_OP_READ, BIFROST_OP_WRITE_HI, false },
- [BIFROST_R_W_ADD] = { BIFROST_OP_READ, BIFROST_OP_WRITE, false },
- [BIFROST_WL_WL_ADD] = { BIFROST_OP_WRITE_LO, BIFROST_OP_WRITE_LO, false },
- [BIFROST_WL_WH_ADD] = { BIFROST_OP_WRITE_LO, BIFROST_OP_WRITE_HI, false },
- [BIFROST_WL_W_ADD] = { BIFROST_OP_WRITE_LO, BIFROST_OP_WRITE, false },
- [BIFROST_WH_WL_ADD] = { BIFROST_OP_WRITE_HI, BIFROST_OP_WRITE_LO, false },
- [BIFROST_WH_WH_ADD] = { BIFROST_OP_WRITE_HI, BIFROST_OP_WRITE_HI, false },
- [BIFROST_WH_W_ADD] = { BIFROST_OP_WRITE_HI, BIFROST_OP_WRITE, false },
- [BIFROST_W_WL_ADD] = { BIFROST_OP_WRITE, BIFROST_OP_WRITE_LO, false },
- [BIFROST_W_WH_ADD] = { BIFROST_OP_WRITE, BIFROST_OP_WRITE_HI, false },
- [BIFROST_W_W_ADD] = { BIFROST_OP_WRITE, BIFROST_OP_WRITE, false },
- [BIFROST_IDLE_1] = { BIFROST_OP_IDLE, BIFROST_OP_IDLE, true },
- [BIFROST_I_W_FMA] = { BIFROST_OP_IDLE, BIFROST_OP_WRITE, true },
- [BIFROST_I_WL_FMA] = { BIFROST_OP_IDLE, BIFROST_OP_WRITE_LO, true },
- [BIFROST_I_WH_FMA] = { BIFROST_OP_IDLE, BIFROST_OP_WRITE_HI, true },
- [BIFROST_R_I] = { BIFROST_OP_READ, BIFROST_OP_IDLE, false },
- [BIFROST_I_W_ADD] = { BIFROST_OP_IDLE, BIFROST_OP_WRITE, false },
- [BIFROST_I_WL_ADD] = { BIFROST_OP_IDLE, BIFROST_OP_WRITE_LO, false },
- [BIFROST_I_WH_ADD] = { BIFROST_OP_IDLE, BIFROST_OP_WRITE_HI, false },
- [BIFROST_WL_WH_MIX] = { BIFROST_OP_WRITE_LO, BIFROST_OP_WRITE_HI, false },
- [BIFROST_WH_WL_MIX] = { BIFROST_OP_WRITE_HI, BIFROST_OP_WRITE_LO, false },
- [BIFROST_IDLE] = { BIFROST_OP_IDLE, BIFROST_OP_IDLE, true },
-};
-#endif
-
-/* Texture operator descriptors in various states. Usually packed in the
- * compiler and stored as a constant */
-
-enum bifrost_texture_operation_mode {
- /* Dual texturing */
- BIFROST_TEXTURE_OPERATION_DUAL = 1,
-
- /* Single texturing */
- BIFROST_TEXTURE_OPERATION_SINGLE = 3,
-};
-
-enum bifrost_index {
- /* Both texture/sampler index immediate */
- BIFROST_INDEX_IMMEDIATE_SHARED = 0,
-
- /* Sampler index immediate, texture index from staging */
- BIFROST_INDEX_IMMEDIATE_SAMPLER = 1,
-
- /* Texture index immediate, sampler index from staging */
- BIFROST_INDEX_IMMEDIATE_TEXTURE = 2,
-
- /* Both indices from (separate) staging registers */
- BIFROST_INDEX_REGISTER = 3,
-};
-
-enum bifrost_tex_op {
- /* Given explicit derivatives, compute a gradient descriptor */
- BIFROST_TEX_OP_GRDESC_DER = 4,
-
- /* Given implicit derivatives (texture coordinates in a fragment
- * shader), compute a gradient descriptor */
- BIFROST_TEX_OP_GRDESC = 5,
-
- /* Fetch a texel. Takes a staging register with LOD level / face index
- * packed 16:16 */
- BIFROST_TEX_OP_FETCH = 6,
-
- /* Filtered texture */
- BIFROST_TEX_OP_TEX = 7,
-};
-
-enum bifrost_lod_mode {
- /* Takes two staging registers forming a 64-bit gradient descriptor
- * (computed by a previous GRDESC or GRDESC_DER operation) */
- BIFROST_LOD_MODE_GRDESC = 3,
-
- /* Take a staging register with 8:8 fixed-point in bottom 16-bits
- * specifying an explicit LOD */
- BIFROST_LOD_MODE_EXPLICIT = 4,
-
- /* Takes a staging register with bottom 16-bits as 8:8 fixed-point LOD
- * bias and top 16-bit as 8:8 fixed-point lower bound (generally left
- * zero), added and clamped to a computed LOD */
- BIFROST_LOD_MODE_BIAS = 5,
-
- /* Set LOD to zero */
- BIFROST_LOD_MODE_ZERO = 6,
-
- /* Compute LOD */
- BIFROST_LOD_MODE_COMPUTE = 7,
-};
-
-enum bifrost_texture_format {
- /* 16-bit floating point, with optional clamping */
- BIFROST_TEXTURE_FORMAT_F16 = 0,
- BIFROST_TEXTURE_FORMAT_F16_POS = 1,
- BIFROST_TEXTURE_FORMAT_F16_PM1 = 2,
- BIFROST_TEXTURE_FORMAT_F16_1 = 3,
-
- /* 32-bit floating point, with optional clamping */
- BIFROST_TEXTURE_FORMAT_F32 = 4,
- BIFROST_TEXTURE_FORMAT_F32_POS = 5,
- BIFROST_TEXTURE_FORMAT_F32_PM1 = 6,
- BIFROST_TEXTURE_FORMAT_F32_1 = 7,
-};
-
-enum bifrost_texture_format_full {
- /* Transclude bifrost_texture_format from above */
-
- /* Integers, unclamped */
- BIFROST_TEXTURE_FORMAT_U16 = 12,
- BIFROST_TEXTURE_FORMAT_S16 = 13,
- BIFROST_TEXTURE_FORMAT_U32 = 14,
- BIFROST_TEXTURE_FORMAT_S32 = 15,
-};
-
-enum bifrost_texture_fetch {
- /* Default texelFetch */
- BIFROST_TEXTURE_FETCH_TEXEL = 1,
-
- /* Deprecated, fetches 4x U32 of a U8 x 4 texture. Do not use. */
- BIFROST_TEXTURE_FETCH_GATHER4_RGBA = 3,
-
- /* Gathers */
- BIFROST_TEXTURE_FETCH_GATHER4_R = 4,
- BIFROST_TEXTURE_FETCH_GATHER4_G = 5,
- BIFROST_TEXTURE_FETCH_GATHER4_B = 6,
- BIFROST_TEXTURE_FETCH_GATHER4_A = 7
-};
-
-struct bifrost_texture_operation {
- /* If immediate_indices is set:
- * - immediate sampler index
- * - index used as texture index
- * Otherwise:
- * - bifrost_single_index in lower 2 bits
- * - 0x3 in upper 2 bits (single-texturing)
- */
- unsigned sampler_index_or_mode : 4;
- unsigned index : 7;
- bool immediate_indices : 1;
- enum bifrost_tex_op op : 3;
-
- /* If set for TEX/FETCH, loads texel offsets and multisample index from
- * a staging register containing offset_x:offset_y:offset_z:ms_index
- * packed 8:8:8:8. Offsets must be in [-31, +31]. If set for
- * GRDESC(_DER), disable LOD bias. */
- bool offset_or_bias_disable : 1;
-
- /* If set for TEX/FETCH, loads fp32 shadow comparison value from a
- * staging register. Implies fetch_component = gather4_r. If set for
- * GRDESC(_DER), disables LOD clamping. */
- bool shadow_or_clamp_disable : 1;
-
- /* If set, loads an uint32 array index from a staging register. */
- bool array : 1;
-
- /* Texture dimension, or 0 for a cubemap */
- unsigned dimension : 2;
-
- /* Method to compute LOD value or for a FETCH, the
- * bifrost_texture_fetch component specification */
- enum bifrost_lod_mode lod_or_fetch : 3;
-
- /* Reserved */
- unsigned zero : 1;
-
- /* Register format for the result */
- enum bifrost_texture_format_full format : 4;
-
- /* Write mask for the result */
- unsigned mask : 4;
-} __attribute__((packed));
-
-struct bifrost_dual_texture_operation {
- unsigned primary_sampler_index : 2;
- unsigned mode : 2; /* 0x1 for dual */
- unsigned primary_texture_index : 2;
- unsigned secondary_sampler_index : 2;
- unsigned secondary_texture_index : 2;
-
- /* Leave zero for dual texturing */
- unsigned reserved : 1;
- unsigned index_mode_zero : 1;
-
- /* Base staging register to write the secondary results to */
- unsigned secondary_register : 6;
-
- /* Format/mask for each texture */
- enum bifrost_texture_format secondary_format : 3;
- unsigned secondary_mask : 4;
-
- enum bifrost_texture_format primary_format : 3;
- unsigned primary_mask : 4;
-} __attribute__((packed));
-
-static inline uint32_t
-bi_dual_tex_as_u32(struct bifrost_dual_texture_operation desc)
-{
- uint32_t desc_u;
- memcpy(&desc_u, &desc, sizeof(desc));
-
- return desc_u;
-}
-
-#define BIFROST_MEGA_SAMPLE 128
-#define BIFROST_ALL_SAMPLES 255
-#define BIFROST_CURRENT_PIXEL 255
-
-struct bifrost_pixel_indices {
- unsigned sample : 8;
- unsigned rt : 8;
- unsigned x : 8;
- unsigned y : 8;
-} __attribute__((packed));
-
-enum bi_constmod {
- BI_CONSTMOD_NONE,
- BI_CONSTMOD_PC_LO,
- BI_CONSTMOD_PC_HI,
- BI_CONSTMOD_PC_LO_HI
-};
-
-struct bi_constants {
- /* Raw constant values */
- uint64_t raw[6];
-
- /* Associated modifier derived from M values */
- enum bi_constmod mods[6];
-};
-
-/* FAU selectors for constants are out-of-order, construct the top bits
- * here given an embedded constant index in a clause */
-
-static inline unsigned
-bi_constant_field(unsigned idx)
-{
- const unsigned values[] = {
- 4, 5, 6, 7, 2, 3
- };
-
- assert(idx <= 5);
- return values[idx] << 4;
-}
-
-#ifdef __cplusplus
-} /* extern C */
-#endif
-
#endif
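For orientation, the re-added bifrost_header above is read straight off the raw clause bits by the disassembler in this directory. A minimal sketch of that pattern, assuming a little-endian host and LSB-first bitfield allocation; the function name is illustrative:

    #include <stdint.h>
    #include <string.h>

    /* Illustrative only: reinterpret the low bits of a 64-bit clause
     * header word as the bitfield struct above. */
    static inline struct bifrost_header
    decode_bifrost_header(uint64_t bits)
    {
            struct bifrost_header header;
            memcpy(&header, &bits, sizeof(header));
            return header;
    }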
diff --git a/lib/mesa/src/panfrost/bifrost/bifrost_compile.c b/lib/mesa/src/panfrost/bifrost/bifrost_compile.c
index f0aab763e..061eab11a 100644
--- a/lib/mesa/src/panfrost/bifrost/bifrost_compile.c
+++ b/lib/mesa/src/panfrost/bifrost/bifrost_compile.c
@@ -1,6 +1,5 @@
/*
- * Copyright (C) 2020 Collabora Ltd.
- * Copyright (C) 2022 Alyssa Rosenzweig <alyssa@rosenzweig.io>
+ * Copyright (C) 2019 Ryan Houdek <Sonicadvance1@gmail.com>
*
* Permission is hereby granted, free of charge, to any person obtaining a
* copy of this software and associated documentation files (the "Software"),
@@ -20,4115 +19,842 @@
* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
* OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
* SOFTWARE.
- *
- * Authors (Collabora):
- * Alyssa Rosenzweig <alyssa.rosenzweig@collabora.com>
*/
-#include "compiler/glsl/glsl_to_nir.h"
-#include "compiler/nir_types.h"
#include "compiler/nir/nir_builder.h"
-#include "compiler/nir/nir_schedule.h"
-#include "util/u_debug.h"
-
-#include "disassemble.h"
-#include "valhall/va_compiler.h"
-#include "valhall/disassemble.h"
#include "bifrost_compile.h"
-#include "compiler.h"
-#include "valhall/va_compiler.h"
-#include "bi_quirks.h"
-#include "bi_builder.h"
-#include "bifrost_nir.h"
-
-static const struct debug_named_value bifrost_debug_options[] = {
- {"msgs", BIFROST_DBG_MSGS, "Print debug messages"},
- {"shaders", BIFROST_DBG_SHADERS, "Dump shaders in NIR and MIR"},
- {"shaderdb", BIFROST_DBG_SHADERDB, "Print statistics"},
- {"verbose", BIFROST_DBG_VERBOSE, "Disassemble verbosely"},
- {"internal", BIFROST_DBG_INTERNAL, "Dump even internal shaders"},
- {"nosched", BIFROST_DBG_NOSCHED, "Force trivial bundling"},
- {"nopsched", BIFROST_DBG_NOPSCHED, "Disable scheduling for pressure"},
- {"inorder", BIFROST_DBG_INORDER, "Force in-order bundling"},
- {"novalidate",BIFROST_DBG_NOVALIDATE, "Skip IR validation"},
- {"noopt", BIFROST_DBG_NOOPT, "Skip optimization passes"},
- {"noidvs", BIFROST_DBG_NOIDVS, "Disable IDVS"},
- {"nosb", BIFROST_DBG_NOSB, "Disable scoreboarding"},
- {"nopreload", BIFROST_DBG_NOPRELOAD, "Disable message preloading"},
- {"spill", BIFROST_DBG_SPILL, "Test register spilling"},
- DEBUG_NAMED_VALUE_END
-};
-
-DEBUG_GET_ONCE_FLAGS_OPTION(bifrost_debug, "BIFROST_MESA_DEBUG", bifrost_debug_options, 0)
-
-/* How many bytes are prefetched by the Bifrost shader core. From the final
- * clause of the shader, this range must be valid instructions or zero. */
-#define BIFROST_SHADER_PREFETCH 128
-
-int bifrost_debug = 0;
-
-#define DBG(fmt, ...) \
- do { if (bifrost_debug & BIFROST_DBG_MSGS) \
- fprintf(stderr, "%s:%d: "fmt, \
- __FUNCTION__, __LINE__, ##__VA_ARGS__); } while (0)
-
-static bi_block *emit_cf_list(bi_context *ctx, struct exec_list *list);
-
-static bi_index
-bi_preload(bi_builder *b, unsigned reg)
-{
- if (bi_is_null(b->shader->preloaded[reg])) {
- /* Insert at the beginning of the shader */
- bi_builder b_ = *b;
- b_.cursor = bi_before_block(bi_start_block(&b->shader->blocks));
-
- /* Cache the result */
- b->shader->preloaded[reg] = bi_mov_i32(&b_, bi_register(reg));
- }
-
- return b->shader->preloaded[reg];
-}
-
-static bi_index
-bi_coverage(bi_builder *b)
-{
- if (bi_is_null(b->shader->coverage))
- b->shader->coverage = bi_preload(b, 60);
-
- return b->shader->coverage;
-}
-
-/*
- * Vertex ID and Instance ID are preloaded registers. Where they are preloaded
- * changed from Bifrost to Valhall. Provide helpers that smooth over the
- * architectural difference.
- */
-static inline bi_index
-bi_vertex_id(bi_builder *b)
-{
- return bi_preload(b, (b->shader->arch >= 9) ? 60 : 61);
-}
-
-static inline bi_index
-bi_instance_id(bi_builder *b)
-{
- return bi_preload(b, (b->shader->arch >= 9) ? 61 : 62);
-}
-
-static void
-bi_emit_jump(bi_builder *b, nir_jump_instr *instr)
-{
- bi_instr *branch = bi_jump(b, bi_zero());
-
- switch (instr->type) {
- case nir_jump_break:
- branch->branch_target = b->shader->break_block;
- break;
- case nir_jump_continue:
- branch->branch_target = b->shader->continue_block;
- break;
- default:
- unreachable("Unhandled jump type");
- }
-
- bi_block_add_successor(b->shader->current_block, branch->branch_target);
- b->shader->current_block->unconditional_jumps = true;
-}
-
-/* Builds a 64-bit hash table key for an index */
-static uint64_t
-bi_index_to_key(bi_index idx)
-{
- static_assert(sizeof(idx) <= sizeof(uint64_t), "too much padding");
-
- uint64_t key = 0;
- memcpy(&key, &idx, sizeof(idx));
- return key;
-}
-
-/*
- * Extract a single channel out of a vector source. We split vectors with SPLIT
- * so we can use the split components directly, without emitting an extract.
- * This has advantages for RA, as the split can usually be optimized away.
- */
-static bi_index
-bi_extract(bi_builder *b, bi_index vec, unsigned channel)
-{
- bi_index *components =
- _mesa_hash_table_u64_search(b->shader->allocated_vec,
- bi_index_to_key(vec));
-
- /* No extract needed for scalars.
- *
- * This is a bit imprecise, but actual bugs (missing splits for vectors)
- * should be caught by the following assertion. It is too difficult to
- * ensure bi_extract is only called for real vectors.
- */
- if (components == NULL && channel == 0)
- return vec;
-
- assert(components != NULL && "missing bi_cache_collect()");
- return components[channel];
-}
-
-static void
-bi_cache_collect(bi_builder *b, bi_index dst, bi_index *s, unsigned n)
-{
- /* Lifetime of a hash table entry has to be at least as long as the table */
- bi_index *channels = ralloc_array(b->shader, bi_index, n);
- memcpy(channels, s, sizeof(bi_index) * n);
-
- _mesa_hash_table_u64_insert(b->shader->allocated_vec,
- bi_index_to_key(dst), channels);
-}
-
-/*
- * Splits an n-component vector (vec) into n scalar destinations (dests) using a
- * split pseudo-instruction.
- *
- * Pre-condition: dests is filled with bi_null().
- */
-static void
-bi_emit_split_i32(bi_builder *b, bi_index dests[4], bi_index vec, unsigned n)
-{
- /* Setup the destinations */
- for (unsigned i = 0; i < n; ++i) {
- dests[i] = bi_temp(b->shader);
- }
-
- /* Emit the split */
- if (n == 1) {
- bi_mov_i32_to(b, dests[0], vec);
- } else {
- bi_instr *I = bi_split_i32_to(b, n, vec);
-
- bi_foreach_dest(I, j)
- I->dest[j] = dests[j];
- }
-}
-
-static void
-bi_emit_cached_split_i32(bi_builder *b, bi_index vec, unsigned n)
-{
- bi_index dests[4] = { bi_null(), bi_null(), bi_null(), bi_null() };
- bi_emit_split_i32(b, dests, vec, n);
- bi_cache_collect(b, vec, dests, n);
-}
-
-/*
- * Emit and cache a split for a vector of a given bitsize. The vector may not be
- * composed of 32-bit words, but it will be split at 32-bit word boundaries.
- */
-static void
-bi_emit_cached_split(bi_builder *b, bi_index vec, unsigned bits)
-{
- bi_emit_cached_split_i32(b, vec, DIV_ROUND_UP(bits, 32));
-}
-
-static void
-bi_split_dest(bi_builder *b, nir_dest dest)
-{
- bi_emit_cached_split(b, bi_dest_index(&dest),
- nir_dest_bit_size(dest) *
- nir_dest_num_components(dest));
-}
-
-static bi_instr *
-bi_emit_collect_to(bi_builder *b, bi_index dst, bi_index *chan, unsigned n)
-{
- /* Special case: COLLECT of a single value is a scalar move */
- if (n == 1)
- return bi_mov_i32_to(b, dst, chan[0]);
-
- bi_instr *I = bi_collect_i32_to(b, dst, n);
-
- bi_foreach_src(I, i)
- I->src[i] = chan[i];
-
- bi_cache_collect(b, dst, chan, n);
- return I;
-}
-
-static bi_instr *
-bi_collect_v2i32_to(bi_builder *b, bi_index dst, bi_index s0, bi_index s1)
-{
- return bi_emit_collect_to(b, dst, (bi_index[]) { s0, s1 }, 2);
-}
-
-static bi_instr *
-bi_collect_v3i32_to(bi_builder *b, bi_index dst, bi_index s0, bi_index s1, bi_index s2)
-{
- return bi_emit_collect_to(b, dst, (bi_index[]) { s0, s1, s2 }, 3);
-}
-
-static bi_index
-bi_collect_v2i32(bi_builder *b, bi_index s0, bi_index s1)
-{
- bi_index dst = bi_temp(b->shader);
- bi_collect_v2i32_to(b, dst, s0, s1);
- return dst;
-}
-
-static bi_index
-bi_varying_src0_for_barycentric(bi_builder *b, nir_intrinsic_instr *intr)
-{
- switch (intr->intrinsic) {
- case nir_intrinsic_load_barycentric_centroid:
- case nir_intrinsic_load_barycentric_sample:
- return bi_preload(b, 61);
-
- /* Need to put the sample ID in the top 16-bits */
- case nir_intrinsic_load_barycentric_at_sample:
- return bi_mkvec_v2i16(b, bi_half(bi_dontcare(b), false),
- bi_half(bi_src_index(&intr->src[0]), false));
-
- /* Interpret as 8:8 signed fixed point positions in pixels along X and
- * Y axes respectively, relative to top-left of pixel. In NIR, (0, 0)
- * is the center of the pixel so we first fixup and then convert. For
- * fp16 input:
- *
- * f2i16(((x, y) + (0.5, 0.5)) * 2**8) =
- * f2i16((256 * (x, y)) + (128, 128)) =
- * V2F16_TO_V2S16(FMA.v2f16((x, y), #256, #128))
- *
- * For fp32 input, that lacks enough precision for MSAA 16x, but the
- * idea is the same. FIXME: still doesn't pass
- */
- case nir_intrinsic_load_barycentric_at_offset: {
- bi_index offset = bi_src_index(&intr->src[0]);
- bi_index f16 = bi_null();
- unsigned sz = nir_src_bit_size(intr->src[0]);
-
- if (sz == 16) {
- f16 = bi_fma_v2f16(b, offset, bi_imm_f16(256.0),
- bi_imm_f16(128.0));
- } else {
- assert(sz == 32);
- bi_index f[2];
- for (unsigned i = 0; i < 2; ++i) {
- f[i] = bi_fadd_rscale_f32(b,
- bi_extract(b, offset, i),
- bi_imm_f32(0.5), bi_imm_u32(8),
- BI_SPECIAL_NONE);
- }
-
- f16 = bi_v2f32_to_v2f16(b, f[0], f[1]);
- }
-
- return bi_v2f16_to_v2s16(b, f16);
- }
-
- case nir_intrinsic_load_barycentric_pixel:
- default:
- return b->shader->arch >= 9 ? bi_preload(b, 61) : bi_dontcare(b);
- }
-}
-
-static enum bi_sample
-bi_interp_for_intrinsic(nir_intrinsic_op op)
-{
- switch (op) {
- case nir_intrinsic_load_barycentric_centroid:
- return BI_SAMPLE_CENTROID;
- case nir_intrinsic_load_barycentric_sample:
- case nir_intrinsic_load_barycentric_at_sample:
- return BI_SAMPLE_SAMPLE;
- case nir_intrinsic_load_barycentric_at_offset:
- return BI_SAMPLE_EXPLICIT;
- case nir_intrinsic_load_barycentric_pixel:
- default:
- return BI_SAMPLE_CENTER;
- }
-}
-
-/* auto, 64-bit omitted */
-static enum bi_register_format
-bi_reg_fmt_for_nir(nir_alu_type T)
-{
- switch (T) {
- case nir_type_float16: return BI_REGISTER_FORMAT_F16;
- case nir_type_float32: return BI_REGISTER_FORMAT_F32;
- case nir_type_int16: return BI_REGISTER_FORMAT_S16;
- case nir_type_uint16: return BI_REGISTER_FORMAT_U16;
- case nir_type_int32: return BI_REGISTER_FORMAT_S32;
- case nir_type_uint32: return BI_REGISTER_FORMAT_U32;
- default: unreachable("Invalid type for register format");
- }
-}
+#include "bifrost_opts.h"
+#include "bifrost_sched.h"
+#include "compiler_defines.h"
+#include "disassemble.h"
+#include "bifrost_print.h"
-/* Checks if the _IMM variant of an intrinsic can be used, returning in imm the
- * immediate to be used (which applies even if _IMM can't be used) */
+#define BI_DEBUG
-static bool
-bi_is_intr_immediate(nir_intrinsic_instr *instr, unsigned *immediate, unsigned max)
+static int
+glsl_type_size(const struct glsl_type *type, bool bindless)
{
- nir_src *offset = nir_get_io_offset_src(instr);
-
- if (!nir_src_is_const(*offset))
- return false;
-
- *immediate = nir_intrinsic_base(instr) + nir_src_as_uint(*offset);
- return (*immediate) < max;
+ return glsl_count_attribute_slots(type, false);
}
static void
-bi_make_vec_to(bi_builder *b, bi_index final_dst,
- bi_index *src,
- unsigned *channel,
- unsigned count,
- unsigned bitsize);
-
-/* Bifrost's load instructions lack a component offset despite operating in
- * terms of vec4 slots. Usually I/O vectorization avoids nonzero components,
- * but they may be unavoidable with separate shaders in use. To solve this, we
- * lower to a larger load and an explicit copy of the desired components. */
-
-static void
-bi_copy_component(bi_builder *b, nir_intrinsic_instr *instr, bi_index tmp)
+optimize_nir(nir_shader *nir)
{
- unsigned component = nir_intrinsic_component(instr);
- unsigned nr = instr->num_components;
- unsigned total = nr + component;
- unsigned bitsize = nir_dest_bit_size(instr->dest);
-
- assert(total <= 4 && "should be vec4");
- bi_emit_cached_split(b, tmp, total * bitsize);
+ bool progress;
- if (component == 0)
- return;
+ NIR_PASS_V(nir, nir_lower_io, nir_var_all, glsl_type_size, 0);
+ NIR_PASS(progress, nir, nir_lower_regs_to_ssa);
- bi_index srcs[] = { tmp, tmp, tmp };
- unsigned channels[] = { component, component + 1, component + 2 };
+ do {
+ progress = false;
- bi_make_vec_to(b, bi_dest_index(&instr->dest),
- srcs, channels, nr, nir_dest_bit_size(instr->dest));
-}
+ NIR_PASS(progress, nir, nir_lower_io, nir_var_all, glsl_type_size, 0);
-static void
-bi_emit_load_attr(bi_builder *b, nir_intrinsic_instr *instr)
-{
- nir_alu_type T = nir_intrinsic_dest_type(instr);
- enum bi_register_format regfmt = bi_reg_fmt_for_nir(T);
- nir_src *offset = nir_get_io_offset_src(instr);
- unsigned component = nir_intrinsic_component(instr);
- enum bi_vecsize vecsize = (instr->num_components + component - 1);
- unsigned imm_index = 0;
- unsigned base = nir_intrinsic_base(instr);
- bool constant = nir_src_is_const(*offset);
- bool immediate = bi_is_intr_immediate(instr, &imm_index, 16);
- bi_index dest = (component == 0) ? bi_dest_index(&instr->dest) : bi_temp(b->shader);
- bi_instr *I;
-
- if (immediate) {
- I = bi_ld_attr_imm_to(b, dest, bi_vertex_id(b),
- bi_instance_id(b), regfmt, vecsize,
- imm_index);
- } else {
- bi_index idx = bi_src_index(&instr->src[0]);
+ NIR_PASS(progress, nir, nir_lower_var_copies);
+ NIR_PASS(progress, nir, nir_lower_vars_to_ssa);
- if (constant)
- idx = bi_imm_u32(imm_index);
- else if (base != 0)
- idx = bi_iadd_u32(b, idx, bi_imm_u32(base), false);
+ NIR_PASS(progress, nir, nir_copy_prop);
+ NIR_PASS(progress, nir, nir_opt_constant_folding);
- I = bi_ld_attr_to(b, dest, bi_vertex_id(b), bi_instance_id(b),
- idx, regfmt, vecsize);
- }
+ NIR_PASS(progress, nir, nir_lower_vars_to_ssa);
+ NIR_PASS(progress, nir, nir_lower_alu_to_scalar, NULL);
+ NIR_PASS(progress, nir, nir_opt_if, true);
- if (b->shader->arch >= 9)
- I->table = PAN_TABLE_ATTRIBUTE;
+ } while (progress);
- bi_copy_component(b, instr, dest);
+ NIR_PASS(progress, nir, nir_copy_prop);
+ NIR_PASS(progress, nir, nir_opt_dce);
}
-/*
- * ABI: Special (desktop GL) slots come first, tightly packed. General varyings
- * come later, sparsely packed. This handles both linked and separable shaders
- * with a common code path, with minimal keying only for desktop GL. Each slot
- * consumes 16 bytes (TODO: fp16, partial vectors).
- */
static unsigned
-bi_varying_base_bytes(bi_context *ctx, nir_intrinsic_instr *intr)
+nir_src_index(compiler_context *ctx, nir_src *src)
{
- nir_io_semantics sem = nir_intrinsic_io_semantics(intr);
- uint32_t mask = ctx->inputs->fixed_varying_mask;
-
- if (sem.location >= VARYING_SLOT_VAR0) {
- unsigned nr_special = util_bitcount(mask);
- unsigned general_index = (sem.location - VARYING_SLOT_VAR0);
-
- return 16 * (nr_special + general_index);
- } else {
- return 16 * (util_bitcount(mask & BITFIELD_MASK(sem.location)));
- }
+ if (src->is_ssa)
+ return src->ssa->index;
+ else
+ return ctx->func->impl->ssa_alloc + src->reg.reg->index;
}
-/*
- * Compute the offset in bytes of a varying with an immediate offset, adding the
- * offset to the base computed above. Convenience method.
- */
static unsigned
-bi_varying_offset(bi_context *ctx, nir_intrinsic_instr *intr)
+nir_dest_index(compiler_context *ctx, nir_dest *dst)
{
- nir_src *src = nir_get_io_offset_src(intr);
- assert(nir_src_is_const(*src) && "assumes immediate offset");
-
- return bi_varying_base_bytes(ctx, intr) + (nir_src_as_uint(*src) * 16);
-}
-
-static void
-bi_emit_load_vary(bi_builder *b, nir_intrinsic_instr *instr)
-{
- enum bi_sample sample = BI_SAMPLE_CENTER;
- enum bi_update update = BI_UPDATE_STORE;
- enum bi_register_format regfmt = BI_REGISTER_FORMAT_AUTO;
- bool smooth = instr->intrinsic == nir_intrinsic_load_interpolated_input;
- bi_index src0 = bi_null();
-
- unsigned component = nir_intrinsic_component(instr);
- enum bi_vecsize vecsize = (instr->num_components + component - 1);
- bi_index dest = (component == 0) ? bi_dest_index(&instr->dest) : bi_temp(b->shader);
-
- unsigned sz = nir_dest_bit_size(instr->dest);
-
- if (smooth) {
- nir_intrinsic_instr *parent = nir_src_as_intrinsic(instr->src[0]);
- assert(parent);
-
- sample = bi_interp_for_intrinsic(parent->intrinsic);
- src0 = bi_varying_src0_for_barycentric(b, parent);
-
- assert(sz == 16 || sz == 32);
- regfmt = (sz == 16) ? BI_REGISTER_FORMAT_F16
- : BI_REGISTER_FORMAT_F32;
- } else {
- assert(sz == 32);
- regfmt = BI_REGISTER_FORMAT_U32;
-
- /* Valhall can't have bi_null() here, although the source is
- * logically unused for flat varyings
- */
- if (b->shader->arch >= 9)
- src0 = bi_preload(b, 61);
-
- /* Gather info as we go */
- b->shader->info.bifrost->uses_flat_shading = true;
- }
-
- enum bi_source_format source_format =
- smooth ? BI_SOURCE_FORMAT_F32 : BI_SOURCE_FORMAT_FLAT32;
-
- nir_src *offset = nir_get_io_offset_src(instr);
- unsigned imm_index = 0;
- bool immediate = bi_is_intr_immediate(instr, &imm_index, 20);
- bi_instr *I = NULL;
-
- if (b->shader->malloc_idvs && immediate) {
- /* Immediate index given in bytes. */
- bi_ld_var_buf_imm_to(b, sz, dest, src0, regfmt,
- sample, source_format, update, vecsize,
- bi_varying_offset(b->shader, instr));
- } else if (immediate && smooth) {
- I = bi_ld_var_imm_to(b, dest, src0, regfmt, sample, update,
- vecsize, imm_index);
- } else if (immediate && !smooth) {
- I = bi_ld_var_flat_imm_to(b, dest, BI_FUNCTION_NONE, regfmt,
- vecsize, imm_index);
- } else {
- bi_index idx = bi_src_index(offset);
- unsigned base = nir_intrinsic_base(instr);
-
- if (b->shader->malloc_idvs) {
- /* Index needs to be in bytes, but NIR gives the index
- * in slots. For now assume 16 bytes per element.
- */
- bi_index idx_bytes = bi_lshift_or_i32(b, idx, bi_zero(), bi_imm_u8(4));
- unsigned vbase = bi_varying_base_bytes(b->shader, instr);
-
- if (vbase != 0)
- idx_bytes = bi_iadd_u32(b, idx, bi_imm_u32(vbase), false);
-
- bi_ld_var_buf_to(b, sz, dest, src0, idx_bytes, regfmt,
- sample, source_format, update,
- vecsize);
- } else if (smooth) {
- if (base != 0)
- idx = bi_iadd_u32(b, idx, bi_imm_u32(base), false);
-
- I = bi_ld_var_to(b, dest, src0, idx, regfmt, sample,
- update, vecsize);
- } else {
- if (base != 0)
- idx = bi_iadd_u32(b, idx, bi_imm_u32(base), false);
-
- I = bi_ld_var_flat_to(b, dest, idx,
- BI_FUNCTION_NONE, regfmt,
- vecsize);
- }
- }
-
- /* Valhall usually uses machine-allocated IDVS. If this is disabled, use
- * a simple Midgard-style ABI.
- */
- if (b->shader->arch >= 9 && I != NULL)
- I->table = PAN_TABLE_ATTRIBUTE;
-
- bi_copy_component(b, instr, dest);
-}
-
-static bi_index
-bi_make_vec8_helper(bi_builder *b, bi_index *src, unsigned *channel, unsigned count)
-{
- assert(1 <= count && count <= 4);
-
- bi_index bytes[4] = {
- bi_imm_u8(0),
- bi_imm_u8(0),
- bi_imm_u8(0),
- bi_imm_u8(0)
- };
-
- for (unsigned i = 0; i < count; ++i) {
- unsigned chan = channel ? channel[i] : 0;
-
- bytes[i] = bi_byte(bi_extract(b, src[i], chan >> 2), chan & 3);
- }
-
- if (b->shader->arch >= 9) {
- bi_index vec = bi_zero();
-
- if (count >= 3)
- vec = bi_mkvec_v2i8(b, bytes[2], bytes[3], vec);
-
- return bi_mkvec_v2i8(b, bytes[0], bytes[1], vec);
- } else {
- return bi_mkvec_v4i8(b, bytes[0], bytes[1], bytes[2], bytes[3]);
- }
-}
-
-static bi_index
-bi_make_vec16_helper(bi_builder *b, bi_index *src, unsigned *channel, unsigned count)
-{
- unsigned chan0 = channel ? channel[0] : 0;
- bi_index w0 = bi_extract(b, src[0], chan0 >> 1);
- bi_index h0 = bi_half(w0, chan0 & 1);
-
- /* Zero extend */
- if (count == 1)
- return bi_mkvec_v2i16(b, h0, bi_imm_u16(0));
-
- /* Else, create a vector */
- assert(count == 2);
-
- unsigned chan1 = channel ? channel[1] : 0;
- bi_index w1 = bi_extract(b, src[1], chan1 >> 1);
- bi_index h1 = bi_half(w1, chan1 & 1);
-
- if (bi_is_word_equiv(w0, w1) && (chan0 & 1) == 0 && ((chan1 & 1) == 1))
- return bi_mov_i32(b, w0);
- else if (bi_is_word_equiv(w0, w1))
- return bi_swz_v2i16(b, bi_swz_16(w0, chan0 & 1, chan1 & 1));
+ if (dst->is_ssa)
+ return dst->ssa.index;
else
- return bi_mkvec_v2i16(b, h0, h1);
-}
-
-static void
-bi_make_vec_to(bi_builder *b, bi_index dst,
- bi_index *src,
- unsigned *channel,
- unsigned count,
- unsigned bitsize)
-{
- assert(bitsize == 8 || bitsize == 16 || bitsize == 32);
- unsigned shift = (bitsize == 32) ? 0 : (bitsize == 16) ? 1 : 2;
- unsigned chan_per_word = 1 << shift;
-
- assert(DIV_ROUND_UP(count * bitsize, 32) <= BI_MAX_SRCS &&
- "unnecessarily large vector should have been lowered");
-
- bi_index srcs[BI_MAX_VEC];
-
- for (unsigned i = 0; i < count; i += chan_per_word) {
- unsigned rem = MIN2(count - i, chan_per_word);
- unsigned *channel_offset = channel ? (channel + i) : NULL;
-
- if (bitsize == 32)
- srcs[i] = bi_extract(b, src[i], channel_offset ? *channel_offset : 0);
- else if (bitsize == 16)
- srcs[i >> 1] = bi_make_vec16_helper(b, src + i, channel_offset, rem);
- else
- srcs[i >> 2] = bi_make_vec8_helper(b, src + i, channel_offset, rem);
- }
-
- bi_emit_collect_to(b, dst, srcs, DIV_ROUND_UP(count, chan_per_word));
-}
-
-static inline bi_instr *
-bi_load_ubo_to(bi_builder *b, unsigned bitsize, bi_index dest0, bi_index src0,
- bi_index src1)
-{
- bi_instr *I;
-
- if (b->shader->arch >= 9) {
- I = bi_ld_buffer_to(b, bitsize, dest0, src0, src1);
- I->seg = BI_SEG_UBO;
- } else {
- I = bi_load_to(b, bitsize, dest0, src0, src1, BI_SEG_UBO, 0);
- }
-
- bi_emit_cached_split(b, dest0, bitsize);
- return I;
-}
-
-static bi_instr *
-bi_load_sysval_to(bi_builder *b, bi_index dest, int sysval,
- unsigned nr_components, unsigned offset)
-{
- unsigned sysval_ubo = b->shader->inputs->fixed_sysval_ubo >= 0 ?
- b->shader->inputs->fixed_sysval_ubo :
- b->shader->nir->info.num_ubos;
- unsigned uniform =
- pan_lookup_sysval(b->shader->sysval_to_id,
- b->shader->info.sysvals,
- sysval);
- unsigned idx = (uniform * 16) + offset;
-
- return bi_load_ubo_to(b, nr_components * 32, dest,
- bi_imm_u32(idx), bi_imm_u32(sysval_ubo));
+ return ctx->func->impl->ssa_alloc + dst->reg.reg->index;
}
-static void
-bi_load_sysval_nir(bi_builder *b, nir_intrinsic_instr *intr,
- unsigned nr_components, unsigned offset)
+static unsigned
+nir_alu_src_index(compiler_context *ctx, nir_alu_src *src)
{
- bi_load_sysval_to(b, bi_dest_index(&intr->dest),
- panfrost_sysval_for_instr(&intr->instr, NULL),
- nr_components, offset);
+ return nir_src_index(ctx, &src->src);
}
-static bi_index
-bi_load_sysval(bi_builder *b, int sysval,
- unsigned nr_components, unsigned offset)
+struct bifrost_instruction *
+mir_alloc_ins(struct bifrost_instruction instr)
{
- bi_index tmp = bi_temp(b->shader);
- bi_load_sysval_to(b, tmp, sysval, nr_components, offset);
- return tmp;
+ struct bifrost_instruction *heap_ins = malloc(sizeof(instr));
+ memcpy(heap_ins, &instr, sizeof(instr));
+ return heap_ins;
}
static void
-bi_load_sample_id_to(bi_builder *b, bi_index dst)
-{
- /* r61[16:23] contains the sampleID, mask it out. Upper bits
- * seem to read garbage (despite being architecturally defined
- * as zero), so use a 5-bit mask instead of 8-bits */
-
- bi_rshift_and_i32_to(b, dst, bi_preload(b, 61), bi_imm_u32(0x1f),
- bi_imm_u8(16), false);
-}
-
-static bi_index
-bi_load_sample_id(bi_builder *b)
-{
- bi_index sample_id = bi_temp(b->shader);
- bi_load_sample_id_to(b, sample_id);
- return sample_id;
-}
-
-static bi_index
-bi_pixel_indices(bi_builder *b, unsigned rt)
+emit_mir_instruction(struct compiler_context *ctx, struct bifrost_instruction instr)
{
- /* We want to load the current pixel. */
- struct bifrost_pixel_indices pix = {
- .y = BIFROST_CURRENT_PIXEL,
- .rt = rt
- };
-
- uint32_t indices_u32 = 0;
- memcpy(&indices_u32, &pix, sizeof(indices_u32));
- bi_index indices = bi_imm_u32(indices_u32);
-
- /* Sample index above is left as zero. For multisampling, we need to
- * fill in the actual sample ID in the lower byte */
-
- if (b->shader->inputs->blend.nr_samples > 1)
- indices = bi_iadd_u32(b, indices, bi_load_sample_id(b), false);
-
- return indices;
+ list_addtail(&(mir_alloc_ins(instr))->link, &ctx->current_block->instructions);
}
-/* Source color is passed through r0-r3, or r4-r7 for the second source when
- * dual-source blending. Preload the corresponding vector.
- */
static void
-bi_emit_load_blend_input(bi_builder *b, nir_intrinsic_instr *instr)
+bifrost_block_add_successor(bifrost_block *block, bifrost_block *successor)
{
- nir_io_semantics sem = nir_intrinsic_io_semantics(instr);
- unsigned base = (sem.location == VARYING_SLOT_VAR0) ? 4 : 0;
- unsigned size = nir_alu_type_get_type_size(nir_intrinsic_dest_type(instr));
- assert(size == 16 || size == 32);
-
- bi_index srcs[] = {
- bi_preload(b, base + 0), bi_preload(b, base + 1),
- bi_preload(b, base + 2), bi_preload(b, base + 3)
- };
-
- bi_emit_collect_to(b, bi_dest_index(&instr->dest), srcs, size == 32 ? 4 : 2);
+ assert(block->num_successors < ARRAY_SIZE(block->successors));
+ block->successors[block->num_successors++] = successor;
}
static void
-bi_emit_blend_op(bi_builder *b, bi_index rgba, nir_alu_type T,
- bi_index rgba2, nir_alu_type T2, unsigned rt)
+emit_load_const(struct compiler_context *ctx, nir_load_const_instr *instr)
{
- /* Reads 2 or 4 staging registers to cover the input */
- unsigned size = nir_alu_type_get_type_size(T);
- unsigned size_2 = nir_alu_type_get_type_size(T2);
- unsigned sr_count = (size <= 16) ? 2 : 4;
- unsigned sr_count_2 = (size_2 <= 16) ? 2 : 4;
- const struct panfrost_compile_inputs *inputs = b->shader->inputs;
- uint64_t blend_desc = inputs->blend.bifrost_blend_desc;
- enum bi_register_format regfmt = bi_reg_fmt_for_nir(T);
-
- /* Workaround for NIR-to-TGSI */
- if (b->shader->nir->info.fs.untyped_color_outputs)
- regfmt = BI_REGISTER_FORMAT_AUTO;
-
- if (inputs->is_blend && inputs->blend.nr_samples > 1) {
- /* Conversion descriptor comes from the compile inputs, pixel
- * indices derived at run time based on sample ID */
- bi_st_tile(b, rgba, bi_pixel_indices(b, rt), bi_coverage(b),
- bi_imm_u32(blend_desc >> 32),
- regfmt, BI_VECSIZE_V4);
- } else if (b->shader->inputs->is_blend) {
- uint64_t blend_desc = b->shader->inputs->blend.bifrost_blend_desc;
-
- /* Blend descriptor comes from the compile inputs */
- /* Put the result in r0 */
-
- bi_blend_to(b, bi_temp(b->shader), rgba, bi_coverage(b),
- bi_imm_u32(blend_desc),
- bi_imm_u32(blend_desc >> 32),
- bi_null(), regfmt, sr_count, 0);
- } else {
- /* Blend descriptor comes from the FAU RAM. By convention, the
- * return address on Bifrost is stored in r48 and will be used
- * by the blend shader to jump back to the fragment shader */
-
- bi_blend_to(b, bi_temp(b->shader), rgba, bi_coverage(b),
- bi_fau(BIR_FAU_BLEND_0 + rt, false),
- bi_fau(BIR_FAU_BLEND_0 + rt, true),
- rgba2, regfmt, sr_count, sr_count_2);
- }
-
- assert(rt < 8);
- b->shader->info.bifrost->blend[rt].type = T;
+ nir_ssa_def def = instr->def;
- if (T2)
- b->shader->info.bifrost->blend_src1_type = T2;
+ float *v = ralloc_array(NULL, float, 1);
+ nir_const_load_to_arr(v, instr, f32);
+ _mesa_hash_table_u64_insert(ctx->ssa_constants, def.index + 1, v);
}
-/* Blend shaders do not need to run ATEST since they are dependent on a
- * fragment shader that runs it. Blit shaders may not need to run ATEST, since
- * ATEST is not needed if early-z is forced, alpha-to-coverage is disabled, and
- * there are no writes to the coverage mask. The latter two are satisfied for
- * all blit shaders, so we just care about early-z, which blit shaders force
- * iff they do not write depth or stencil */
-
-static bool
-bi_skip_atest(bi_context *ctx, bool emit_zs)
-{
- return (ctx->inputs->is_blit && !emit_zs) || ctx->inputs->is_blend;
-}
-
-static void
-bi_emit_atest(bi_builder *b, bi_index alpha)
+static uint32_t
+alloc_mir_temp(struct compiler_context *ctx)
{
- b->shader->coverage = bi_atest(b, bi_coverage(b), alpha,
- bi_fau(BIR_FAU_ATEST_PARAM, false));
- b->shader->emitted_atest = true;
+ return SSA_TEMP_VALUE(ctx->mir_temp++);
}
-static void
-bi_emit_fragment_out(bi_builder *b, nir_intrinsic_instr *instr)
+static uint32_t
+emit_ld_vary_addr_constant(struct compiler_context *ctx, uint32_t location)
{
- bool combined = instr->intrinsic ==
- nir_intrinsic_store_combined_output_pan;
+ // LD_VAR_ADDR.f32 {R0, T1}, R61, R62, location:1, R12
+ // ...
+ // ST_VAR.v4 T1, R12, R13, R14, R4
- unsigned writeout = combined ? nir_intrinsic_component(instr) :
- PAN_WRITEOUT_C;
-
- bool emit_blend = writeout & (PAN_WRITEOUT_C);
- bool emit_zs = writeout & (PAN_WRITEOUT_Z | PAN_WRITEOUT_S);
-
- unsigned loc = nir_intrinsic_io_semantics(instr).location;
- bi_index src0 = bi_src_index(&instr->src[0]);
-
- /* By ISA convention, the coverage mask is stored in R60. The store
- * itself will be handled by a subsequent ATEST instruction */
- if (loc == FRAG_RESULT_SAMPLE_MASK) {
- bi_index orig = bi_coverage(b);
- bi_index msaa = bi_load_sysval(b, PAN_SYSVAL_MULTISAMPLED, 1, 0);
- bi_index new = bi_lshift_and_i32(b, orig, bi_extract(b, src0, 0), bi_imm_u8(0));
-
- b->shader->coverage =
- bi_mux_i32(b, orig, new, msaa, BI_MUX_INT_ZERO);
- return;
- }
+ // R61-R62 is filled with information needed for varying interpolation
+ // This loads a vec3 with the information that ST_VAR needs to work
- /* Emit ATEST if we have to, note ATEST requires a floating-point alpha
- * value, but render target #0 might not be floating point. However the
- * alpha value is only used for alpha-to-coverage, a stage which is
- * skipped for pure integer framebuffers, so the issue is moot. */
-
- if (!b->shader->emitted_atest && !bi_skip_atest(b->shader, emit_zs)) {
- nir_alu_type T = nir_intrinsic_src_type(instr);
-
- bi_index rgba = bi_src_index(&instr->src[0]);
- bi_index alpha =
- (T == nir_type_float16) ? bi_half(bi_extract(b, rgba, 1), true) :
- (T == nir_type_float32) ? bi_extract(b, rgba, 3) :
- bi_dontcare(b);
-
- /* Don't read out-of-bounds */
- if (nir_src_num_components(instr->src[0]) < 4)
- alpha = bi_imm_f32(1.0);
-
- bi_emit_atest(b, alpha);
- }
-
- if (emit_zs) {
- bi_index z = bi_dontcare(b), s = bi_dontcare(b);
-
- if (writeout & PAN_WRITEOUT_Z)
- z = bi_src_index(&instr->src[2]);
-
- if (writeout & PAN_WRITEOUT_S)
- s = bi_src_index(&instr->src[3]);
-
- b->shader->coverage = bi_zs_emit(b, z, s, bi_coverage(b),
- writeout & PAN_WRITEOUT_S,
- writeout & PAN_WRITEOUT_Z);
- }
-
- if (emit_blend) {
- unsigned rt = loc ? (loc - FRAG_RESULT_DATA0) : 0;
- bool dual = (writeout & PAN_WRITEOUT_2);
- bi_index color = bi_src_index(&instr->src[0]);
- bi_index color2 = dual ? bi_src_index(&instr->src[4]) : bi_null();
- nir_alu_type T2 = dual ? nir_intrinsic_dest_type(instr) : 0;
-
- /* Explicit copy since BLEND inputs are precoloured to R0-R3,
- * TODO: maybe schedule around this or implement in RA as a
- * spill */
- bool has_mrt = (b->shader->nir->info.outputs_written >> FRAG_RESULT_DATA1);
-
- if (has_mrt) {
- bi_index srcs[4] = { color, color, color, color };
- unsigned channels[4] = { 0, 1, 2, 3 };
- color = bi_temp(b->shader);
- bi_make_vec_to(b, color, srcs, channels,
- nir_src_num_components(instr->src[0]),
- nir_alu_type_get_type_size(nir_intrinsic_src_type(instr)));
- }
-
- bi_emit_blend_op(b, color, nir_intrinsic_src_type(instr),
- color2, T2, rt);
- }
-
- if (b->shader->inputs->is_blend) {
- /* Jump back to the fragment shader, return address is stored
- * in r48 (see above). On Valhall, only jump if the address is
- * nonzero. The check is free there and it implements the "jump
- * to 0 terminates the blend shader" that's automatic on
- * Bifrost.
- */
- if (b->shader->arch >= 8)
- bi_branchzi(b, bi_preload(b, 48), bi_preload(b, 48), BI_CMPF_NE);
- else
- bi_jump(b, bi_preload(b, 48));
- }
-}
-
-/**
- * In a vertex shader, is the specified variable a position output? These kinds
- * of outputs are written from position shaders when IDVS is enabled. All other
- * outputs are written from the varying shader.
- */
-static bool
-bi_should_remove_store(nir_intrinsic_instr *intr, enum bi_idvs_mode idvs)
-{
- nir_io_semantics sem = nir_intrinsic_io_semantics(intr);
-
- switch (sem.location) {
- case VARYING_SLOT_POS:
- case VARYING_SLOT_PSIZ:
- return idvs == BI_IDVS_VARYING;
- default:
- return idvs == BI_IDVS_POSITION;
- }
-}
-
-static bool
-bifrost_nir_specialize_idvs(nir_builder *b, nir_instr *instr, void *data)
-{
- enum bi_idvs_mode *idvs = data;
-
- if (instr->type != nir_instr_type_intrinsic)
- return false;
-
- nir_intrinsic_instr *intr = nir_instr_as_intrinsic(instr);
-
- if (intr->intrinsic != nir_intrinsic_store_output)
- return false;
-
- if (bi_should_remove_store(intr, *idvs)) {
- nir_instr_remove(instr);
- return true;
- }
+ uint32_t mir_temp_location = alloc_mir_temp(ctx);
+ // This instruction loads a vec3 starting from the initial register
+ struct bifrost_instruction instr = {
+ .op = op_ld_var_addr,
+ .dest_components = 3,
+ .ssa_args = {
+ .dest = mir_temp_location,
+ .src0 = SSA_FIXED_REGISTER(61),
+ .src1 = SSA_FIXED_REGISTER(62),
+ .src2 = SSA_INVALID_VALUE,
+ .src3 = SSA_INVALID_VALUE,
+ },
+ .literal_args[0] = location,
+ };
+ emit_mir_instruction(ctx, instr);
- return false;
+ return mir_temp_location;
}
+// XXX: Doesn't support duplicated values in the components!
+// RA WILL fail!
static void
-bi_emit_store_vary(bi_builder *b, nir_intrinsic_instr *instr)
+emit_create_vector(struct compiler_context *ctx, unsigned dest, unsigned num_comps, uint32_t *comps)
{
- /* In principle we can do better for 16-bit. At the moment we require
- * 32-bit to permit the use of .auto, in order to force .u32 for flat
- * varyings, to handle internal TGSI shaders that set flat in the VS
- * but smooth in the FS */
-
- ASSERTED nir_alu_type T = nir_intrinsic_src_type(instr);
- ASSERTED unsigned T_size = nir_alu_type_get_type_size(T);
- assert(T_size == 32 || (b->shader->arch >= 9 && T_size == 16));
- enum bi_register_format regfmt = BI_REGISTER_FORMAT_AUTO;
-
- unsigned imm_index = 0;
- bool immediate = bi_is_intr_immediate(instr, &imm_index, 16);
-
- /* Only look at the total components needed. In effect, we fill in all
- * the intermediate "holes" in the write mask, since we can't mask off
- * stores. Since nir_lower_io_to_temporaries ensures each varying is
- * written at most once, anything that's masked out is undefined, so it
- * doesn't matter what we write there. So we may as well do the
- * simplest thing possible. */
- unsigned nr = util_last_bit(nir_intrinsic_write_mask(instr));
- assert(nr > 0 && nr <= nir_intrinsic_src_components(instr, 0));
-
- bi_index data = bi_src_index(&instr->src[0]);
-
- /* To keep the vector dimensions consistent, we need to drop some
- * components. This should be coalesced.
- *
- * TODO: This is ugly and maybe inefficient. Would we rather
- * introduce a TRIM.i32 pseudoinstruction?
- */
- if (nr < nir_intrinsic_src_components(instr, 0)) {
- assert(T_size == 32 && "todo: 16-bit trim");
-
- bi_index chans[4] = { bi_null(), bi_null(), bi_null(), bi_null() };
- unsigned src_comps = nir_intrinsic_src_components(instr, 0);
-
- bi_emit_split_i32(b, chans, data, src_comps);
-
- bi_index tmp = bi_temp(b->shader);
- bi_instr *collect = bi_collect_i32_to(b, tmp, nr);
-
- bi_foreach_src(collect, w)
- collect->src[w] = chans[w];
-
- data = tmp;
- }
-
- bool psiz = (nir_intrinsic_io_semantics(instr).location == VARYING_SLOT_PSIZ);
-
- bi_index a[4] = { bi_null() };
+ assert(num_comps <= 4 && "Can't make a vector larger than 4 components");
- if (b->shader->arch <= 8 && b->shader->idvs == BI_IDVS_POSITION) {
- /* Bifrost position shaders have a fast path */
- assert(T == nir_type_float16 || T == nir_type_float32);
- unsigned regfmt = (T == nir_type_float16) ? 0 : 1;
- unsigned identity = (b->shader->arch == 6) ? 0x688 : 0;
- unsigned snap4 = 0x5E;
- uint32_t format = identity | (snap4 << 12) | (regfmt << 24);
-
- bi_st_cvt(b, data, bi_preload(b, 58), bi_preload(b, 59),
- bi_imm_u32(format), regfmt, nr - 1);
- } else if (b->shader->arch >= 9 && b->shader->idvs != BI_IDVS_NONE) {
- bi_index index = bi_preload(b, 59);
-
- if (psiz) {
- assert(T_size == 16 && "should've been lowered");
- index = bi_iadd_imm_i32(b, index, 4);
+ // This instruction assembles a vector from up to four component temporaries
+ struct bifrost_instruction instr = {
+ .op = op_create_vector,
+ .dest_components = num_comps,
+ .ssa_args = {
+ .dest = dest,
}
-
- bi_index address = bi_lea_buf_imm(b, index);
- bi_emit_split_i32(b, a, address, 2);
-
- bool varying = (b->shader->idvs == BI_IDVS_VARYING);
-
- bi_store(b, nr * nir_src_bit_size(instr->src[0]),
- data, a[0], a[1],
- varying ? BI_SEG_VARY : BI_SEG_POS,
- varying ? bi_varying_offset(b->shader, instr) : 0);
- } else if (immediate) {
- bi_index address = bi_lea_attr_imm(b,
- bi_vertex_id(b), bi_instance_id(b),
- regfmt, imm_index);
- bi_emit_split_i32(b, a, address, 3);
-
- bi_st_cvt(b, data, a[0], a[1], a[2], regfmt, nr - 1);
- } else {
- bi_index idx =
- bi_iadd_u32(b,
- bi_src_index(nir_get_io_offset_src(instr)),
- bi_imm_u32(nir_intrinsic_base(instr)),
- false);
- bi_index address = bi_lea_attr(b,
- bi_vertex_id(b), bi_instance_id(b),
- idx, regfmt);
- bi_emit_split_i32(b, a, address, 3);
-
- bi_st_cvt(b, data, a[0], a[1], a[2], regfmt, nr - 1);
- }
-}
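/*
 * Editor's note: a minimal plain-C sketch of how the Bifrost position-shader
 * fast path above packs its ST_CVT format word.  The field values are copied
 * from the removed code; the helper name is hypothetical and the meaning of
 * the individual fields is not spelled out here.
 */
static inline uint32_t
sketch_position_format(unsigned arch, bool fp16)
{
        unsigned regfmt   = fp16 ? 0 : 1;            /* 0 = f16, 1 = f32 */
        unsigned identity = (arch == 6) ? 0x688 : 0; /* only arch 6 needs it */
        unsigned snap4    = 0x5E;                    /* constant in the code above */

        /* e.g. arch 6 with f32 data: 0x688 | 0x5E000 | 0x1000000 = 0x0105E688 */
        return identity | (snap4 << 12) | (regfmt << 24);
}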
-
-static void
-bi_emit_load_ubo(bi_builder *b, nir_intrinsic_instr *instr)
-{
- nir_src *offset = nir_get_io_offset_src(instr);
-
- bool offset_is_const = nir_src_is_const(*offset);
- bi_index dyn_offset = bi_src_index(offset);
- uint32_t const_offset = offset_is_const ? nir_src_as_uint(*offset) : 0;
-
- bi_load_ubo_to(b, instr->num_components * nir_dest_bit_size(instr->dest),
- bi_dest_index(&instr->dest), offset_is_const ?
- bi_imm_u32(const_offset) : dyn_offset,
- bi_src_index(&instr->src[0]));
-}
-
-static void
-bi_emit_load_push_constant(bi_builder *b, nir_intrinsic_instr *instr)
-{
- assert(b->shader->inputs->no_ubo_to_push && "can't mix push constant forms");
-
- nir_src *offset = &instr->src[0];
- assert(nir_src_is_const(*offset) && "no indirect push constants");
- uint32_t base = nir_intrinsic_base(instr) + nir_src_as_uint(*offset);
- assert((base & 3) == 0 && "unaligned push constants");
-
- unsigned bits = nir_dest_bit_size(instr->dest) *
- nir_dest_num_components(instr->dest);
-
- unsigned n = DIV_ROUND_UP(bits, 32);
- assert(n <= 4);
- bi_index channels[4] = { bi_null() };
-
- for (unsigned i = 0; i < n; ++i) {
- unsigned word = (base >> 2) + i;
-
- channels[i] = bi_fau(BIR_FAU_UNIFORM | (word >> 1), word & 1);
- }
-
- bi_emit_collect_to(b, bi_dest_index(&instr->dest), channels, n);
-}
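/*
 * Editor's note: a small sketch, in plain C, of the push-constant addressing
 * used above.  Each FAU uniform slot holds two 32-bit words, so a word index
 * splits into a slot number and a half selector; the struct and function
 * names are hypothetical.
 */
struct sketch_fau_addr {
        unsigned slot;  /* goes into BIR_FAU_UNIFORM | slot */
        bool hi;        /* which 32-bit half of the 64-bit slot */
};

static inline struct sketch_fau_addr
sketch_push_constant_word(uint32_t base_bytes, unsigned i)
{
        unsigned word = (base_bytes >> 2) + i;  /* 32-bit word index */

        return (struct sketch_fau_addr) {
                .slot = word >> 1,
                .hi = (word & 1) != 0,
        };
}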
-
-static bi_index
-bi_addr_high(bi_builder *b, nir_src *src)
-{
- return (nir_src_bit_size(*src) == 64) ?
- bi_extract(b, bi_src_index(src), 1) : bi_zero();
-}
-
-static void
-bi_handle_segment(bi_builder *b, bi_index *addr_lo, bi_index *addr_hi, enum bi_seg seg, int16_t *offset)
-{
- /* Not needed on Bifrost or for global accesses */
- if (b->shader->arch < 9 || seg == BI_SEG_NONE)
- return;
-
- /* There is no segment modifier on Valhall. Instead, we need to
- * emit the arithmetic ourselves. We do have an offset
- * available, which saves an instruction for constant offsets.
- */
- bool wls = (seg == BI_SEG_WLS);
- assert(wls || (seg == BI_SEG_TL));
-
- enum bir_fau fau = wls ? BIR_FAU_WLS_PTR : BIR_FAU_TLS_PTR;
-
- bi_index base_lo = bi_fau(fau, false);
-
- if (offset && addr_lo->type == BI_INDEX_CONSTANT && addr_lo->value == (int16_t) addr_lo->value) {
- *offset = addr_lo->value;
- *addr_lo = base_lo;
- } else {
- *addr_lo = bi_iadd_u32(b, base_lo, *addr_lo, false);
- }
-
- /* Do not allow overflow for WLS or TLS */
- *addr_hi = bi_fau(fau, true);
-}
-
-static void
-bi_emit_load(bi_builder *b, nir_intrinsic_instr *instr, enum bi_seg seg)
-{
- int16_t offset = 0;
- unsigned bits = instr->num_components * nir_dest_bit_size(instr->dest);
- bi_index dest = bi_dest_index(&instr->dest);
- bi_index addr_lo = bi_extract(b, bi_src_index(&instr->src[0]), 0);
- bi_index addr_hi = bi_addr_high(b, &instr->src[0]);
-
- bi_handle_segment(b, &addr_lo, &addr_hi, seg, &offset);
-
- bi_load_to(b, bits, dest, addr_lo, addr_hi, seg, offset);
- bi_emit_cached_split(b, dest, bits);
-}
-
-static void
-bi_emit_store(bi_builder *b, nir_intrinsic_instr *instr, enum bi_seg seg)
-{
- /* Require contiguous masks, guaranteed by nir_lower_wrmasks */
- assert(nir_intrinsic_write_mask(instr) ==
- BITFIELD_MASK(instr->num_components));
-
- int16_t offset = 0;
- bi_index addr_lo = bi_extract(b, bi_src_index(&instr->src[1]), 0);
- bi_index addr_hi = bi_addr_high(b, &instr->src[1]);
-
- bi_handle_segment(b, &addr_lo, &addr_hi, seg, &offset);
-
- bi_store(b, instr->num_components * nir_src_bit_size(instr->src[0]),
- bi_src_index(&instr->src[0]),
- addr_lo, addr_hi, seg, offset);
-}
-
-/* Exchanges the staging register with memory */
-
-static void
-bi_emit_axchg_to(bi_builder *b, bi_index dst, bi_index addr, nir_src *arg, enum bi_seg seg)
-{
- assert(seg == BI_SEG_NONE || seg == BI_SEG_WLS);
-
- unsigned sz = nir_src_bit_size(*arg);
- assert(sz == 32 || sz == 64);
-
- bi_index data = bi_src_index(arg);
-
- bi_index addr_hi = (seg == BI_SEG_WLS) ? bi_zero() : bi_extract(b, addr, 1);
-
- if (b->shader->arch >= 9)
- bi_handle_segment(b, &addr, &addr_hi, seg, NULL);
- else if (seg == BI_SEG_WLS)
- addr_hi = bi_zero();
-
- bi_axchg_to(b, sz, dst, data, bi_extract(b, addr, 0), addr_hi, seg);
-}
-
-/* Exchanges the second staging register with memory if comparison with first
- * staging register passes */
-
-static void
-bi_emit_acmpxchg_to(bi_builder *b, bi_index dst, bi_index addr, nir_src *arg_1, nir_src *arg_2, enum bi_seg seg)
-{
- assert(seg == BI_SEG_NONE || seg == BI_SEG_WLS);
-
- /* hardware is swapped from NIR */
- bi_index src0 = bi_src_index(arg_2);
- bi_index src1 = bi_src_index(arg_1);
-
- unsigned sz = nir_src_bit_size(*arg_1);
- assert(sz == 32 || sz == 64);
-
- bi_index data_words[] = {
- bi_extract(b, src0, 0),
- sz == 32 ? bi_extract(b, src1, 0) : bi_extract(b, src0, 1),
-
- /* 64-bit */
- bi_extract(b, src1, 0),
- sz == 32 ? bi_extract(b, src1, 0) : bi_extract(b, src1, 1),
};
- bi_index in = bi_temp(b->shader);
- bi_emit_collect_to(b, in, data_words, 2 * (sz / 32));
- bi_index addr_hi = (seg == BI_SEG_WLS) ? bi_zero() : bi_extract(b, addr, 1);
-
- if (b->shader->arch >= 9)
- bi_handle_segment(b, &addr, &addr_hi, seg, NULL);
- else if (seg == BI_SEG_WLS)
- addr_hi = bi_zero();
-
- bi_index out = bi_acmpxchg(b, sz, in, bi_extract(b, addr, 0), addr_hi, seg);
- bi_emit_cached_split(b, out, sz);
-
- bi_index inout_words[] = {
- bi_extract(b, out, 0),
- sz == 64 ? bi_extract(b, out, 1) : bi_null()
+ uint32_t *srcs[4] = {
+ &instr.ssa_args.src0,
+ &instr.ssa_args.src1,
+ &instr.ssa_args.src2,
+ &instr.ssa_args.src3,
};
- bi_make_vec_to(b, dst, inout_words, NULL, sz / 32, 32);
-}
-
-/* Extracts an atomic opcode */
-
-static enum bi_atom_opc
-bi_atom_opc_for_nir(nir_intrinsic_op op)
-{
- switch (op) {
- case nir_intrinsic_global_atomic_add:
- case nir_intrinsic_shared_atomic_add:
- case nir_intrinsic_image_atomic_add:
- return BI_ATOM_OPC_AADD;
-
- case nir_intrinsic_global_atomic_imin:
- case nir_intrinsic_shared_atomic_imin:
- case nir_intrinsic_image_atomic_imin:
- return BI_ATOM_OPC_ASMIN;
-
- case nir_intrinsic_global_atomic_umin:
- case nir_intrinsic_shared_atomic_umin:
- case nir_intrinsic_image_atomic_umin:
- return BI_ATOM_OPC_AUMIN;
-
- case nir_intrinsic_global_atomic_imax:
- case nir_intrinsic_shared_atomic_imax:
- case nir_intrinsic_image_atomic_imax:
- return BI_ATOM_OPC_ASMAX;
-
- case nir_intrinsic_global_atomic_umax:
- case nir_intrinsic_shared_atomic_umax:
- case nir_intrinsic_image_atomic_umax:
- return BI_ATOM_OPC_AUMAX;
-
- case nir_intrinsic_global_atomic_and:
- case nir_intrinsic_shared_atomic_and:
- case nir_intrinsic_image_atomic_and:
- return BI_ATOM_OPC_AAND;
-
- case nir_intrinsic_global_atomic_or:
- case nir_intrinsic_shared_atomic_or:
- case nir_intrinsic_image_atomic_or:
- return BI_ATOM_OPC_AOR;
-
- case nir_intrinsic_global_atomic_xor:
- case nir_intrinsic_shared_atomic_xor:
- case nir_intrinsic_image_atomic_xor:
- return BI_ATOM_OPC_AXOR;
-
- default:
- unreachable("Unexpected computational atomic");
- }
-}
-
-/* Optimized unary atomics are available with an implied #1 argument */
-
-static bool
-bi_promote_atom_c1(enum bi_atom_opc op, bi_index arg, enum bi_atom_opc *out)
-{
- /* Check we have a compatible constant */
- if (arg.type != BI_INDEX_CONSTANT)
- return false;
-
- if (!(arg.value == 1 || (arg.value == -1 && op == BI_ATOM_OPC_AADD)))
- return false;
-
- /* Check for a compatible operation */
- switch (op) {
- case BI_ATOM_OPC_AADD:
- *out = (arg.value == 1) ? BI_ATOM_OPC_AINC : BI_ATOM_OPC_ADEC;
- return true;
- case BI_ATOM_OPC_ASMAX:
- *out = BI_ATOM_OPC_ASMAX1;
- return true;
- case BI_ATOM_OPC_AUMAX:
- *out = BI_ATOM_OPC_AUMAX1;
- return true;
- case BI_ATOM_OPC_AOR:
- *out = BI_ATOM_OPC_AOR1;
- return true;
- default:
- return false;
- }
-}
-
-/*
- * Coordinates are 16-bit integers in Bifrost but 32-bit in NIR. We need to
- * translate between these forms (with MKVEC.v2i16).
- *
- * Additionally on Valhall, cube maps in the attribute pipe are treated as 2D
- * arrays. For uniform handling, we also treat 3D textures like 2D arrays.
- *
- * Our indexing needs to reflect this.
- */
-static bi_index
-bi_emit_image_coord(bi_builder *b, bi_index coord, unsigned src_idx,
- unsigned coord_comps, bool is_array)
-{
- assert(coord_comps > 0 && coord_comps <= 3);
-
- if (src_idx == 0) {
- if (coord_comps == 1 || (coord_comps == 2 && is_array))
- return bi_extract(b, coord, 0);
- else
- return bi_mkvec_v2i16(b,
- bi_half(bi_extract(b, coord, 0), false),
- bi_half(bi_extract(b, coord, 1), false));
- } else {
- if (coord_comps == 3 && b->shader->arch >= 9)
- return bi_mkvec_v2i16(b, bi_imm_u16(0),
- bi_half(bi_extract(b, coord, 2), false));
- else if (coord_comps == 2 && is_array && b->shader->arch >= 9)
- return bi_mkvec_v2i16(b, bi_imm_u16(0),
- bi_half(bi_extract(b, coord, 1), false));
- else if (coord_comps == 3)
- return bi_extract(b, coord, 2);
- else if (coord_comps == 2 && is_array)
- return bi_extract(b, coord, 1);
+ for (unsigned i = 0; i < 4; ++i) {
+ if (i < num_comps)
+ *srcs[i] = comps[i];
else
- return bi_zero();
- }
-}
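/*
 * Editor's note: an illustrative plain-C sketch of the coordinate split
 * performed above for the common pre-Valhall, non-array 2D/3D case: x and y
 * are packed into one register as 16-bit halves (the MKVEC.v2i16 above),
 * while z travels in a second operand.  The exact half ordering and the
 * Valhall/array variants are handled by the real code; names are hypothetical.
 */
static inline void
sketch_pack_image_coord(const uint32_t coord[3], unsigned comps,
                        uint32_t *xy, uint32_t *zw)
{
        uint16_t x = coord[0] & 0xffff;
        uint16_t y = (comps > 1) ? (coord[1] & 0xffff) : 0;

        *xy = (uint32_t)x | ((uint32_t)y << 16);
        *zw = (comps > 2) ? coord[2] : 0;
}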
-
-static bi_index
-bi_emit_image_index(bi_builder *b, nir_intrinsic_instr *instr)
-{
- nir_src src = instr->src[0];
- bi_index index = bi_src_index(&src);
- bi_context *ctx = b->shader;
-
- /* Images come after vertex attributes, so handle an explicit offset */
- unsigned offset = (ctx->stage == MESA_SHADER_VERTEX) ?
- util_bitcount64(ctx->nir->info.inputs_read) : 0;
-
- if (offset == 0)
- return index;
- else if (nir_src_is_const(src))
- return bi_imm_u32(nir_src_as_uint(src) + offset);
- else
- return bi_iadd_u32(b, index, bi_imm_u32(offset), false);
-}
-
-static void
-bi_emit_image_load(bi_builder *b, nir_intrinsic_instr *instr)
-{
- enum glsl_sampler_dim dim = nir_intrinsic_image_dim(instr);
- unsigned coord_comps = nir_image_intrinsic_coord_components(instr);
- bool array = nir_intrinsic_image_array(instr);
- ASSERTED unsigned nr_dim = glsl_get_sampler_dim_coordinate_components(dim);
-
- bi_index coords = bi_src_index(&instr->src[1]);
- bi_index xy = bi_emit_image_coord(b, coords, 0, coord_comps, array);
- bi_index zw = bi_emit_image_coord(b, coords, 1, coord_comps, array);
- bi_index dest = bi_dest_index(&instr->dest);
- enum bi_register_format regfmt = bi_reg_fmt_for_nir(nir_intrinsic_dest_type(instr));
- enum bi_vecsize vecsize = instr->num_components - 1;
-
- /* TODO: MSAA */
- assert(nr_dim != GLSL_SAMPLER_DIM_MS && "MSAA'd images not supported");
-
- if (b->shader->arch >= 9 && nir_src_is_const(instr->src[0])) {
- bi_instr *I = bi_ld_tex_imm_to(b, dest, xy, zw, regfmt, vecsize,
- nir_src_as_uint(instr->src[0]));
-
- I->table = PAN_TABLE_IMAGE;
- } else if (b->shader->arch >= 9) {
- unreachable("Indirect images on Valhall not yet supported");
- } else {
- bi_ld_attr_tex_to(b, dest, xy, zw,
- bi_emit_image_index(b, instr), regfmt,
- vecsize);
- }
-
- bi_split_dest(b, instr->dest);
-}
-
-static bi_index
-bi_emit_lea_image(bi_builder *b, nir_intrinsic_instr *instr)
-{
- enum glsl_sampler_dim dim = nir_intrinsic_image_dim(instr);
- bool array = nir_intrinsic_image_array(instr);
- ASSERTED unsigned nr_dim = glsl_get_sampler_dim_coordinate_components(dim);
- unsigned coord_comps = nir_image_intrinsic_coord_components(instr);
-
- /* TODO: MSAA */
- assert(nr_dim != GLSL_SAMPLER_DIM_MS && "MSAA'd images not supported");
-
- enum bi_register_format type = (instr->intrinsic == nir_intrinsic_image_store) ?
- bi_reg_fmt_for_nir(nir_intrinsic_src_type(instr)) :
- BI_REGISTER_FORMAT_AUTO;
-
- bi_index coords = bi_src_index(&instr->src[1]);
- bi_index xy = bi_emit_image_coord(b, coords, 0, coord_comps, array);
- bi_index zw = bi_emit_image_coord(b, coords, 1, coord_comps, array);
- bi_index dest = bi_temp(b->shader);
-
- if (b->shader->arch >= 9 && nir_src_is_const(instr->src[0])) {
- bi_instr *I = bi_lea_tex_imm_to(b, dest, xy, zw, false,
- nir_src_as_uint(instr->src[0]));
-
- I->table = PAN_TABLE_IMAGE;
- } else if (b->shader->arch >= 9) {
- unreachable("Indirect images on Valhall not yet supported");
- } else {
- bi_instr *I = bi_lea_attr_tex_to(b, dest, xy, zw,
- bi_emit_image_index(b, instr), type);
-
- /* LEA_ATTR_TEX defaults to the secondary attribute table, but
- * our ABI has all images in the primary attribute table
- */
- I->table = BI_TABLE_ATTRIBUTE_1;
- }
-
- bi_emit_cached_split(b, dest, 3 * 32);
- return dest;
-}
-
-static void
-bi_emit_image_store(bi_builder *b, nir_intrinsic_instr *instr)
-{
- bi_index a[4] = { bi_null() };
- bi_emit_split_i32(b, a, bi_emit_lea_image(b, instr), 3);
-
- /* Due to SPIR-V limitations, the source type is not fully reliable: it
- * reports uint32 even for write_imagei. This causes an incorrect
- * u32->s32->u32 roundtrip which incurs an unwanted clamping. Use auto32
- * instead, which will match per the OpenCL spec. Of course this does
- * not work for 16-bit stores, but those are not available in OpenCL.
- */
- nir_alu_type T = nir_intrinsic_src_type(instr);
- assert(nir_alu_type_get_type_size(T) == 32);
-
- bi_st_cvt(b, bi_src_index(&instr->src[3]), a[0], a[1], a[2],
- BI_REGISTER_FORMAT_AUTO,
- instr->num_components - 1);
-}
-
-static void
-bi_emit_atomic_i32_to(bi_builder *b, bi_index dst,
- bi_index addr, bi_index arg, nir_intrinsic_op intrinsic)
-{
- enum bi_atom_opc opc = bi_atom_opc_for_nir(intrinsic);
- enum bi_atom_opc post_opc = opc;
- bool bifrost = b->shader->arch <= 8;
-
- /* ATOM_C.i32 takes a vector with {arg, coalesced}, ATOM_C1.i32 doesn't
- * take any vector but can still output in RETURN mode */
- bi_index tmp_dest = bifrost ? bi_temp(b->shader) : dst;
- unsigned sr_count = bifrost ? 2 : 1;
-
- /* Generate either ATOM or ATOM1 as required */
- if (bi_promote_atom_c1(opc, arg, &opc)) {
- bi_atom1_return_i32_to(b, tmp_dest, bi_extract(b, addr, 0),
- bi_extract(b, addr, 1), opc, sr_count);
- } else {
- bi_atom_return_i32_to(b, tmp_dest, arg, bi_extract(b, addr, 0),
- bi_extract(b, addr, 1), opc, sr_count);
- }
-
- if (bifrost) {
- /* Post-process it */
- bi_emit_cached_split_i32(b, tmp_dest, 2);
- bi_atom_post_i32_to(b, dst, bi_extract(b, tmp_dest, 0), bi_extract(b, tmp_dest, 1), post_opc);
- }
-}
-
-/* gl_FragCoord.xy = u16_to_f32(R59.xy) + 0.5
- * gl_FragCoord.z = ld_vary(fragz)
- * gl_FragCoord.w = ld_vary(fragw)
- */
-
-static void
-bi_emit_load_frag_coord(bi_builder *b, nir_intrinsic_instr *instr)
-{
- bi_index src[4] = {};
-
- for (unsigned i = 0; i < 2; ++i) {
- src[i] = bi_fadd_f32(b,
- bi_u16_to_f32(b, bi_half(bi_preload(b, 59), i)),
- bi_imm_f32(0.5f));
- }
-
- for (unsigned i = 0; i < 2; ++i) {
- src[2 + i] = bi_ld_var_special(b, bi_zero(),
- BI_REGISTER_FORMAT_F32, BI_SAMPLE_CENTER,
- BI_UPDATE_CLOBBER,
- (i == 0) ? BI_VARYING_NAME_FRAG_Z :
- BI_VARYING_NAME_FRAG_W,
- BI_VECSIZE_NONE);
- }
-
- bi_make_vec_to(b, bi_dest_index(&instr->dest), src, NULL, 4, 32);
-}
-
-static void
-bi_emit_ld_tile(bi_builder *b, nir_intrinsic_instr *instr)
-{
- bi_index dest = bi_dest_index(&instr->dest);
- nir_alu_type T = nir_intrinsic_dest_type(instr);
- enum bi_register_format regfmt = bi_reg_fmt_for_nir(T);
- unsigned rt = b->shader->inputs->blend.rt;
- unsigned size = nir_dest_bit_size(instr->dest);
- unsigned nr = instr->num_components;
-
- /* Get the render target */
- if (!b->shader->inputs->is_blend) {
- nir_io_semantics sem = nir_intrinsic_io_semantics(instr);
- unsigned loc = sem.location;
- assert(loc >= FRAG_RESULT_DATA0);
- rt = (loc - FRAG_RESULT_DATA0);
- }
-
- bi_index desc = b->shader->inputs->is_blend ?
- bi_imm_u32(b->shader->inputs->blend.bifrost_blend_desc >> 32) :
- b->shader->inputs->bifrost.static_rt_conv ?
- bi_imm_u32(b->shader->inputs->bifrost.rt_conv[rt]) :
- bi_load_sysval(b, PAN_SYSVAL(RT_CONVERSION, rt | (size << 4)), 1, 0);
-
- bi_ld_tile_to(b, dest, bi_pixel_indices(b, rt), bi_coverage(b), desc,
- regfmt, nr - 1);
- bi_emit_cached_split(b, dest, size * nr);
-}
-
-static void
-bi_emit_intrinsic(bi_builder *b, nir_intrinsic_instr *instr)
-{
- bi_index dst = nir_intrinsic_infos[instr->intrinsic].has_dest ?
- bi_dest_index(&instr->dest) : bi_null();
- gl_shader_stage stage = b->shader->stage;
-
- switch (instr->intrinsic) {
- case nir_intrinsic_load_barycentric_pixel:
- case nir_intrinsic_load_barycentric_centroid:
- case nir_intrinsic_load_barycentric_sample:
- case nir_intrinsic_load_barycentric_at_sample:
- case nir_intrinsic_load_barycentric_at_offset:
- /* handled later via load_vary */
- break;
- case nir_intrinsic_load_interpolated_input:
- case nir_intrinsic_load_input:
- if (b->shader->inputs->is_blend)
- bi_emit_load_blend_input(b, instr);
- else if (stage == MESA_SHADER_FRAGMENT)
- bi_emit_load_vary(b, instr);
- else if (stage == MESA_SHADER_VERTEX)
- bi_emit_load_attr(b, instr);
- else
- unreachable("Unsupported shader stage");
- break;
-
- case nir_intrinsic_store_output:
- if (stage == MESA_SHADER_FRAGMENT)
- bi_emit_fragment_out(b, instr);
- else if (stage == MESA_SHADER_VERTEX)
- bi_emit_store_vary(b, instr);
- else
- unreachable("Unsupported shader stage");
- break;
-
- case nir_intrinsic_store_combined_output_pan:
- assert(stage == MESA_SHADER_FRAGMENT);
- bi_emit_fragment_out(b, instr);
- break;
-
- case nir_intrinsic_load_ubo:
- bi_emit_load_ubo(b, instr);
- break;
-
- case nir_intrinsic_load_push_constant:
- bi_emit_load_push_constant(b, instr);
- break;
-
- case nir_intrinsic_load_global:
- case nir_intrinsic_load_global_constant:
- bi_emit_load(b, instr, BI_SEG_NONE);
- break;
-
- case nir_intrinsic_store_global:
- bi_emit_store(b, instr, BI_SEG_NONE);
- break;
-
- case nir_intrinsic_load_scratch:
- bi_emit_load(b, instr, BI_SEG_TL);
- break;
-
- case nir_intrinsic_store_scratch:
- bi_emit_store(b, instr, BI_SEG_TL);
- break;
-
- case nir_intrinsic_load_shared:
- bi_emit_load(b, instr, BI_SEG_WLS);
- break;
-
- case nir_intrinsic_store_shared:
- bi_emit_store(b, instr, BI_SEG_WLS);
- break;
-
- /* The blob doesn't seem to do anything for memory barriers; note +BARRIER
- * is illegal in fragment shaders */
- case nir_intrinsic_memory_barrier:
- case nir_intrinsic_memory_barrier_buffer:
- case nir_intrinsic_memory_barrier_image:
- case nir_intrinsic_memory_barrier_shared:
- case nir_intrinsic_group_memory_barrier:
- break;
-
- case nir_intrinsic_control_barrier:
- assert(b->shader->stage != MESA_SHADER_FRAGMENT);
- bi_barrier(b);
- break;
-
- case nir_intrinsic_scoped_barrier:
- assert(b->shader->stage != MESA_SHADER_FRAGMENT);
- assert(nir_intrinsic_memory_scope(instr) > NIR_SCOPE_SUBGROUP &&
- "todo: subgroup barriers (different divergence rules)");
-
- bi_barrier(b);
- break;
-
- case nir_intrinsic_shared_atomic_add:
- case nir_intrinsic_shared_atomic_imin:
- case nir_intrinsic_shared_atomic_umin:
- case nir_intrinsic_shared_atomic_imax:
- case nir_intrinsic_shared_atomic_umax:
- case nir_intrinsic_shared_atomic_and:
- case nir_intrinsic_shared_atomic_or:
- case nir_intrinsic_shared_atomic_xor: {
- assert(nir_src_bit_size(instr->src[1]) == 32);
-
- bi_index addr = bi_src_index(&instr->src[0]);
- bi_index addr_hi;
-
- if (b->shader->arch >= 9) {
- bi_handle_segment(b, &addr, &addr_hi, BI_SEG_WLS, NULL);
- addr = bi_collect_v2i32(b, addr, addr_hi);
- } else {
- addr = bi_seg_add_i64(b, addr, bi_zero(), false, BI_SEG_WLS);
- bi_emit_cached_split(b, addr, 64);
- }
-
- bi_emit_atomic_i32_to(b, dst, addr, bi_src_index(&instr->src[1]),
- instr->intrinsic);
- bi_split_dest(b, instr->dest);
- break;
- }
-
- case nir_intrinsic_image_atomic_add:
- case nir_intrinsic_image_atomic_imin:
- case nir_intrinsic_image_atomic_umin:
- case nir_intrinsic_image_atomic_imax:
- case nir_intrinsic_image_atomic_umax:
- case nir_intrinsic_image_atomic_and:
- case nir_intrinsic_image_atomic_or:
- case nir_intrinsic_image_atomic_xor:
- assert(nir_src_bit_size(instr->src[3]) == 32);
-
- bi_emit_atomic_i32_to(b, dst,
- bi_emit_lea_image(b, instr),
- bi_src_index(&instr->src[3]),
- instr->intrinsic);
- bi_split_dest(b, instr->dest);
- break;
-
- case nir_intrinsic_global_atomic_add:
- case nir_intrinsic_global_atomic_imin:
- case nir_intrinsic_global_atomic_umin:
- case nir_intrinsic_global_atomic_imax:
- case nir_intrinsic_global_atomic_umax:
- case nir_intrinsic_global_atomic_and:
- case nir_intrinsic_global_atomic_or:
- case nir_intrinsic_global_atomic_xor:
- assert(nir_src_bit_size(instr->src[1]) == 32);
-
- bi_emit_atomic_i32_to(b, dst,
- bi_src_index(&instr->src[0]),
- bi_src_index(&instr->src[1]),
- instr->intrinsic);
-
- bi_split_dest(b, instr->dest);
- break;
-
- case nir_intrinsic_image_load:
- bi_emit_image_load(b, instr);
- break;
-
- case nir_intrinsic_image_store:
- bi_emit_image_store(b, instr);
- break;
-
- case nir_intrinsic_global_atomic_exchange:
- bi_emit_axchg_to(b, dst, bi_src_index(&instr->src[0]),
- &instr->src[1], BI_SEG_NONE);
- bi_split_dest(b, instr->dest);
- break;
-
- case nir_intrinsic_image_atomic_exchange:
- bi_emit_axchg_to(b, dst, bi_emit_lea_image(b, instr),
- &instr->src[3], BI_SEG_NONE);
- bi_split_dest(b, instr->dest);
- break;
-
- case nir_intrinsic_shared_atomic_exchange:
- bi_emit_axchg_to(b, dst, bi_src_index(&instr->src[0]),
- &instr->src[1], BI_SEG_WLS);
- bi_split_dest(b, instr->dest);
- break;
-
- case nir_intrinsic_global_atomic_comp_swap:
- bi_emit_acmpxchg_to(b, dst, bi_src_index(&instr->src[0]),
- &instr->src[1], &instr->src[2], BI_SEG_NONE);
- bi_split_dest(b, instr->dest);
- break;
-
- case nir_intrinsic_image_atomic_comp_swap:
- bi_emit_acmpxchg_to(b, dst, bi_emit_lea_image(b, instr),
- &instr->src[3], &instr->src[4], BI_SEG_NONE);
- bi_split_dest(b, instr->dest);
- break;
-
- case nir_intrinsic_shared_atomic_comp_swap:
- bi_emit_acmpxchg_to(b, dst, bi_src_index(&instr->src[0]),
- &instr->src[1], &instr->src[2], BI_SEG_WLS);
- bi_split_dest(b, instr->dest);
- break;
-
- case nir_intrinsic_load_frag_coord:
- bi_emit_load_frag_coord(b, instr);
- break;
-
- case nir_intrinsic_load_output:
- bi_emit_ld_tile(b, instr);
- break;
-
- case nir_intrinsic_discard_if:
- bi_discard_b32(b, bi_src_index(&instr->src[0]));
- break;
-
- case nir_intrinsic_discard:
- bi_discard_f32(b, bi_zero(), bi_zero(), BI_CMPF_EQ);
- break;
-
- case nir_intrinsic_load_ssbo_address:
- case nir_intrinsic_load_xfb_address:
- bi_load_sysval_nir(b, instr, 2, 0);
- break;
-
- case nir_intrinsic_load_work_dim:
- case nir_intrinsic_load_num_vertices:
- case nir_intrinsic_load_first_vertex:
- case nir_intrinsic_load_draw_id:
- bi_load_sysval_nir(b, instr, 1, 0);
- break;
-
- case nir_intrinsic_load_base_vertex:
- bi_load_sysval_nir(b, instr, 1, 4);
- break;
-
- case nir_intrinsic_load_base_instance:
- case nir_intrinsic_get_ssbo_size:
- bi_load_sysval_nir(b, instr, 1, 8);
- break;
-
- case nir_intrinsic_load_viewport_scale:
- case nir_intrinsic_load_viewport_offset:
- case nir_intrinsic_load_num_workgroups:
- case nir_intrinsic_load_workgroup_size:
- bi_load_sysval_nir(b, instr, 3, 0);
- break;
-
- case nir_intrinsic_image_size:
- bi_load_sysval_nir(b, instr,
- nir_dest_num_components(instr->dest), 0);
- break;
-
- case nir_intrinsic_load_blend_const_color_rgba:
- bi_load_sysval_nir(b, instr,
- nir_dest_num_components(instr->dest), 0);
- break;
-
- case nir_intrinsic_load_sample_positions_pan:
- bi_collect_v2i32_to(b, dst,
- bi_fau(BIR_FAU_SAMPLE_POS_ARRAY, false),
- bi_fau(BIR_FAU_SAMPLE_POS_ARRAY, true));
- break;
-
- case nir_intrinsic_load_sample_mask_in:
- /* r61[0:15] contains the coverage bitmap */
- bi_u16_to_u32_to(b, dst, bi_half(bi_preload(b, 61), false));
- break;
-
- case nir_intrinsic_load_sample_id:
- bi_load_sample_id_to(b, dst);
- break;
-
- case nir_intrinsic_load_front_face:
- /* r58 == 0 means primitive is front facing */
- bi_icmp_i32_to(b, dst, bi_preload(b, 58), bi_zero(), BI_CMPF_EQ,
- BI_RESULT_TYPE_M1);
- break;
-
- case nir_intrinsic_load_point_coord:
- bi_ld_var_special_to(b, dst, bi_zero(), BI_REGISTER_FORMAT_F32,
- BI_SAMPLE_CENTER, BI_UPDATE_CLOBBER,
- BI_VARYING_NAME_POINT, BI_VECSIZE_V2);
- bi_emit_cached_split_i32(b, dst, 2);
- break;
-
- /* It appears vertex_id is zero-based with Bifrost geometry flows, but
- * not with Valhall's memory-allocation IDVS geometry flow. Ostensibly
- * we support the legacy geometry flow even on Valhall, so
- * vertex_id_zero_based isn't a machine property for us. Don't set it,
- * and lower here if needed.
- */
- case nir_intrinsic_load_vertex_id:
- if (b->shader->malloc_idvs) {
- bi_mov_i32_to(b, dst, bi_vertex_id(b));
- } else {
- bi_index first = bi_load_sysval(b,
- PAN_SYSVAL_VERTEX_INSTANCE_OFFSETS,
- 1, 0);
-
- bi_iadd_u32_to(b, dst, bi_vertex_id(b), first, false);
- }
-
- break;
-
- /* We only use this in our transform feedback lowering */
- case nir_intrinsic_load_vertex_id_zero_base:
- assert(b->shader->nir->info.has_transform_feedback_varyings);
- bi_mov_i32_to(b, dst, bi_vertex_id(b));
- break;
-
- case nir_intrinsic_load_instance_id:
- bi_mov_i32_to(b, dst, bi_instance_id(b));
- break;
-
- case nir_intrinsic_load_subgroup_invocation:
- bi_mov_i32_to(b, dst, bi_fau(BIR_FAU_LANE_ID, false));
- break;
-
- case nir_intrinsic_load_local_invocation_id:
- bi_collect_v3i32_to(b, dst,
- bi_u16_to_u32(b, bi_half(bi_preload(b, 55), 0)),
- bi_u16_to_u32(b, bi_half(bi_preload(b, 55), 1)),
- bi_u16_to_u32(b, bi_half(bi_preload(b, 56), 0)));
- break;
-
- case nir_intrinsic_load_workgroup_id:
- bi_collect_v3i32_to(b, dst, bi_preload(b, 57), bi_preload(b, 58),
- bi_preload(b, 59));
- break;
-
- case nir_intrinsic_load_global_invocation_id:
- case nir_intrinsic_load_global_invocation_id_zero_base:
- bi_collect_v3i32_to(b, dst, bi_preload(b, 60), bi_preload(b, 61),
- bi_preload(b, 62));
- break;
-
- case nir_intrinsic_shader_clock:
- bi_ld_gclk_u64_to(b, dst, BI_SOURCE_CYCLE_COUNTER);
- bi_split_dest(b, instr->dest);
- break;
-
- default:
- fprintf(stderr, "Unhandled intrinsic %s\n", nir_intrinsic_infos[instr->intrinsic].name);
- assert(0);
- }
-}
-
-static void
-bi_emit_load_const(bi_builder *b, nir_load_const_instr *instr)
-{
- /* Make sure we've been lowered */
- assert(instr->def.num_components <= (32 / instr->def.bit_size));
-
- /* Accumulate all the channels of the constant, as if we did an
- * implicit SEL over them */
- uint32_t acc = 0;
-
- for (unsigned i = 0; i < instr->def.num_components; ++i) {
- unsigned v = nir_const_value_as_uint(instr->value[i], instr->def.bit_size);
- acc |= (v << (i * instr->def.bit_size));
- }
-
- bi_mov_i32_to(b, bi_get_index(instr->def.index), bi_imm_u32(acc));
-}
-
-static bi_index
-bi_alu_src_index(bi_builder *b, nir_alu_src src, unsigned comps)
-{
- /* we don't lower modifiers until the backend */
- assert(!(src.negate || src.abs));
-
- unsigned bitsize = nir_src_bit_size(src.src);
-
- /* the bi_index carries the 32-bit (word) offset separately from the
- * subword swizzle; first handle the offset */
-
- unsigned offset = 0;
-
- assert(bitsize == 8 || bitsize == 16 || bitsize == 32);
- unsigned subword_shift = (bitsize == 32) ? 0 : (bitsize == 16) ? 1 : 2;
-
- for (unsigned i = 0; i < comps; ++i) {
- unsigned new_offset = (src.swizzle[i] >> subword_shift);
-
- if (i > 0)
- assert(offset == new_offset && "wrong vectorization");
-
- offset = new_offset;
- }
-
- bi_index idx = bi_extract(b, bi_src_index(&src.src), offset);
-
- /* Compose the subword swizzle with existing (identity) swizzle */
- assert(idx.swizzle == BI_SWIZZLE_H01);
-
- /* Bigger vectors should have been lowered */
- assert(comps <= (1 << subword_shift));
-
- if (bitsize == 16) {
- unsigned c0 = src.swizzle[0] & 1;
- unsigned c1 = (comps > 1) ? src.swizzle[1] & 1 : c0;
- idx.swizzle = BI_SWIZZLE_H00 + c1 + (c0 << 1);
- } else if (bitsize == 8) {
- /* 8-bit vectors not yet supported */
- assert(comps == 1 && "8-bit vectors not supported");
- idx.swizzle = BI_SWIZZLE_B0000 + (src.swizzle[0] & 3);
- }
-
- return idx;
-}
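/*
 * Editor's note: a tiny sketch of the offset/swizzle split above for 16-bit
 * sources, in plain C.  The NIR component index divides into a 32-bit word
 * offset and a half-word selector; the function name is hypothetical.
 */
static inline void
sketch_split_swizzle16(unsigned nir_component, unsigned *word, unsigned *half)
{
        *word = nir_component >> 1;  /* which 32-bit word of the source */
        *half = nir_component & 1;   /* which 16-bit half inside that word */
}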
-
-static enum bi_round
-bi_nir_round(nir_op op)
-{
- switch (op) {
- case nir_op_fround_even: return BI_ROUND_NONE;
- case nir_op_ftrunc: return BI_ROUND_RTZ;
- case nir_op_fceil: return BI_ROUND_RTP;
- case nir_op_ffloor: return BI_ROUND_RTN;
- default: unreachable("invalid nir round op");
- }
-}
-
-/* Convenience for lowered transcendentals */
-
-static bi_index
-bi_fmul_f32(bi_builder *b, bi_index s0, bi_index s1)
-{
- return bi_fma_f32(b, s0, s1, bi_imm_f32(-0.0f));
-}
-
-/* Approximate with FRCP_APPROX.f32 and apply a single iteration of
- * Newton-Raphson to improve precision */
-
-static void
-bi_lower_frcp_32(bi_builder *b, bi_index dst, bi_index s0)
-{
- bi_index x1 = bi_frcp_approx_f32(b, s0);
- bi_index m = bi_frexpm_f32(b, s0, false, false);
- bi_index e = bi_frexpe_f32(b, bi_neg(s0), false, false);
- bi_index t1 = bi_fma_rscale_f32(b, m, bi_neg(x1), bi_imm_f32(1.0),
- bi_zero(), BI_SPECIAL_N);
- bi_fma_rscale_f32_to(b, dst, t1, x1, x1, e, BI_SPECIAL_NONE);
-}
-
-static void
-bi_lower_frsq_32(bi_builder *b, bi_index dst, bi_index s0)
-{
- bi_index x1 = bi_frsq_approx_f32(b, s0);
- bi_index m = bi_frexpm_f32(b, s0, false, true);
- bi_index e = bi_frexpe_f32(b, bi_neg(s0), false, true);
- bi_index t1 = bi_fmul_f32(b, x1, x1);
- bi_index t2 = bi_fma_rscale_f32(b, m, bi_neg(t1), bi_imm_f32(1.0),
- bi_imm_u32(-1), BI_SPECIAL_N);
- bi_fma_rscale_f32_to(b, dst, t2, x1, x1, e, BI_SPECIAL_N);
-}
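/*
 * Editor's note: the two helpers above implement one Newton-Raphson step on
 * top of the coarse FRCP_APPROX/FRSQ_APPROX results.  A plain scalar C sketch
 * of that refinement, without the FREXP-based rescaling the hardware sequence
 * uses; x1 stands for the coarse approximation.
 */
static inline float
sketch_refine_rcp(float a, float x1)
{
        return x1 + x1 * (1.0f - a * x1);             /* x1 * (2 - a*x1) */
}

static inline float
sketch_refine_rsq(float a, float x1)
{
        return x1 + 0.5f * x1 * (1.0f - a * x1 * x1); /* x1 * (3 - a*x1^2) / 2 */
}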
-
-/* More complex transcendentals; see
- * https://gitlab.freedesktop.org/panfrost/mali-isa-docs/-/blob/master/Bifrost.adoc
- * for documentation */
-
-static void
-bi_lower_fexp2_32(bi_builder *b, bi_index dst, bi_index s0)
-{
- bi_index t1 = bi_temp(b->shader);
- bi_instr *t1_instr = bi_fadd_f32_to(b, t1, s0, bi_imm_u32(0x49400000));
- t1_instr->clamp = BI_CLAMP_CLAMP_0_INF;
-
- bi_index t2 = bi_fadd_f32(b, t1, bi_imm_u32(0xc9400000));
-
- bi_instr *a2 = bi_fadd_f32_to(b, bi_temp(b->shader), s0, bi_neg(t2));
- a2->clamp = BI_CLAMP_CLAMP_M1_1;
-
- bi_index a1t = bi_fexp_table_u4(b, t1, BI_ADJ_NONE);
- bi_index t3 = bi_isub_u32(b, t1, bi_imm_u32(0x49400000), false);
- bi_index a1i = bi_arshift_i32(b, t3, bi_null(), bi_imm_u8(4));
- bi_index p1 = bi_fma_f32(b, a2->dest[0], bi_imm_u32(0x3d635635),
- bi_imm_u32(0x3e75fffa));
- bi_index p2 = bi_fma_f32(b, p1, a2->dest[0], bi_imm_u32(0x3f317218));
- bi_index p3 = bi_fmul_f32(b, a2->dest[0], p2);
- bi_instr *x = bi_fma_rscale_f32_to(b, bi_temp(b->shader),
- p3, a1t, a1t, a1i, BI_SPECIAL_NONE);
- x->clamp = BI_CLAMP_CLAMP_0_INF;
-
- bi_instr *max = bi_fmax_f32_to(b, dst, x->dest[0], s0);
- max->sem = BI_SEM_NAN_PROPAGATE;
-}
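#include <math.h>

/*
 * Editor's note: a plain scalar C sketch of the general technique above,
 * i.e. split the input into integer and fractional parts, approximate 2^frac
 * with a short series, then rescale.  This is not the hardware sequence: the
 * real lowering uses the biased-add trick and FEXP_TABLE.u4 rather than
 * floorf()/ldexpf(), and a tuned polynomial rather than this coarse Taylor
 * series.
 */
static float
sketch_exp2(float x)
{
        float i = floorf(x);       /* integer part, handled by rescaling */
        float f = x - i;           /* fractional part in [0, 1) */
        float t = f * 0.6931472f;  /* f * ln(2) */

        /* 2^f = e^(f ln 2) ~= 1 + t + t^2/2 + t^3/6 */
        float p = 1.0f + t * (1.0f + t * (0.5f + t * (1.0f / 6.0f)));

        return ldexpf(p, (int)i);
}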
-
-static void
-bi_fexp_32(bi_builder *b, bi_index dst, bi_index s0, bi_index log2_base)
-{
- /* Scale by the base, multiply by 2^24, and convert to integer to get an
- * 8:24 fixed-point input */
- bi_index scale = bi_fma_rscale_f32(b, s0, log2_base, bi_negzero(),
- bi_imm_u32(24), BI_SPECIAL_NONE);
- bi_instr *fixed_pt = bi_f32_to_s32_to(b, bi_temp(b->shader), scale);
- fixed_pt->round = BI_ROUND_NONE; // XXX
-
- /* Compute the result for the fixed-point input, but pass along
- * the floating-point scale for correct NaN propagation */
- bi_fexp_f32_to(b, dst, fixed_pt->dest[0], scale);
-}
-
-static void
-bi_lower_flog2_32(bi_builder *b, bi_index dst, bi_index s0)
-{
- /* s0 = a1 * 2^e, with a1 in [0.75, 1.5) */
- bi_index a1 = bi_frexpm_f32(b, s0, true, false);
- bi_index ei = bi_frexpe_f32(b, s0, true, false);
- bi_index ef = bi_s32_to_f32(b, ei);
-
- /* xt estimates -log(r1), a coarse approximation of log(a1) */
- bi_index r1 = bi_flog_table_f32(b, s0, BI_MODE_RED, BI_PRECISION_NONE);
- bi_index xt = bi_flog_table_f32(b, s0, BI_MODE_BASE2, BI_PRECISION_NONE);
-
- /* log(s0) = log(a1 * 2^e) = e + log(a1) = e + log(a1 * r1) -
- * log(r1), so let x1 = e - log(r1) ~= e + xt and x2 = log(a1 * r1),
- * and then log(s0) = x1 + x2 */
- bi_index x1 = bi_fadd_f32(b, ef, xt);
-
- /* Since a1 * r1 is close to 1, x2 = log(a1 * r1) may be computed by
- * polynomial approximation around 1. The series is expressed around
- * 1, so set y = (a1 * r1) - 1.0 */
- bi_index y = bi_fma_f32(b, a1, r1, bi_imm_f32(-1.0));
-
- /* x2 = log_2(1 + y) = log_e(1 + y) * (1/log_e(2)), so approximate
- * log_e(1 + y) by the Taylor series (lower precision than the blob):
- * y - y^2/2 + O(y^3) = y(1 - y/2) + O(y^3) */
- bi_index loge = bi_fmul_f32(b, y,
- bi_fma_f32(b, y, bi_imm_f32(-0.5), bi_imm_f32(1.0)));
-
- bi_index x2 = bi_fmul_f32(b, loge, bi_imm_f32(1.0 / logf(2.0)));
-
- /* log(s0) = x1 + x2 */
- bi_fadd_f32_to(b, dst, x1, x2);
-}
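#include <math.h>

/*
 * Editor's note: a plain scalar C sketch of the series described in the
 * comments above.  The real lowering additionally multiplies the mantissa by
 * a FLOG_TABLE reduction value r1 so that a1*r1 is very close to 1; this
 * sketch skips that step, so it is only a rough illustration of the math.
 */
static float
sketch_log2(float x)
{
        int e;
        float m = frexpf(x, &e);            /* x = m * 2^e, m in [0.5, 1) */
        float y = m - 1.0f;                 /* expand around 1 */

        /* log_e(1 + y) ~= y - y^2/2 = y * (1 - y/2), per the comment above */
        float loge = y * (1.0f - 0.5f * y);

        return (float)e + loge * (1.0f / 0.6931472f);
}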
+ *srcs[i] = SSA_INVALID_VALUE;
+ }
+ emit_mir_instruction(ctx, instr);
+}
+
+static uint32_t
+emit_extract_vector_element(struct compiler_context *ctx, unsigned ssa_vector, unsigned element)
+{
+ uint32_t mir_temp_location = alloc_mir_temp(ctx);
+ // This instruction loads a vec3 starting from the initial register
+ struct bifrost_instruction instr = {
+ .op = op_extract_element,
+ .dest_components = 1,
+ .ssa_args = {
+ .dest = mir_temp_location,
+ .src0 = ssa_vector,
+ .src1 = SSA_INVALID_VALUE,
+ .src2 = SSA_INVALID_VALUE,
+ .src3 = SSA_INVALID_VALUE,
+ },
+ .literal_args[0] = element,
+ };
+ emit_mir_instruction(ctx, instr);
+
+ return mir_temp_location;
+}
+static uint32_t
+emit_movi(struct compiler_context *ctx, uint32_t literal)
+{
+ uint32_t mir_temp_location = alloc_mir_temp(ctx);
+ // This instruction loads a vec3 starting from the initial register
+ struct bifrost_instruction instr = {
+ .op = op_movi,
+ .dest_components = 1,
+ .ssa_args = {
+ .dest = mir_temp_location,
+ .src0 = SSA_INVALID_VALUE,
+ .src1 = SSA_INVALID_VALUE,
+ .src2 = SSA_INVALID_VALUE,
+ .src3 = SSA_INVALID_VALUE,
+ },
+ .literal_args[0] = literal,
+ };
+ emit_mir_instruction(ctx, instr);
-static void
-bi_flog2_32(bi_builder *b, bi_index dst, bi_index s0)
-{
- bi_index frexp = bi_frexpe_f32(b, s0, true, false);
- bi_index frexpi = bi_s32_to_f32(b, frexp);
- bi_index add = bi_fadd_lscale_f32(b, bi_imm_f32(-1.0f), s0);
- bi_fma_f32_to(b, dst, bi_flogd_f32(b, s0), add, frexpi);
+ return mir_temp_location;
}
-static void
-bi_lower_fpow_32(bi_builder *b, bi_index dst, bi_index base, bi_index exp)
+static unsigned
+nir_alu_src_index_scalar(compiler_context *ctx, nir_alu_instr *nir_instr, unsigned src)
{
- bi_index log2_base = bi_null();
+ // NIR uses a combination of single channels plus swizzles to determine which component is pulled out of a source
+ for (unsigned c = 0; c < NIR_MAX_VEC_COMPONENTS; c++) {
+ if (!nir_alu_instr_channel_used(nir_instr, src, c))
+ continue;
+ // Pull the swizzle from the first active channel and use it as the source element
+ unsigned element = nir_instr->src[src].swizzle[c];
- if (base.type == BI_INDEX_CONSTANT) {
- log2_base = bi_imm_f32(log2f(uif(base.value)));
- } else {
- log2_base = bi_temp(b->shader);
- bi_lower_flog2_32(b, log2_base, base);
+ // Create an op that extracts an element from a vector
+ return emit_extract_vector_element(ctx, nir_alu_src_index(ctx, &nir_instr->src[src]), element);
}
-
- return bi_lower_fexp2_32(b, dst, bi_fmul_f32(b, exp, log2_base));
+ assert(0);
+ return 0;
}
static void
-bi_fpow_32(bi_builder *b, bi_index dst, bi_index base, bi_index exp)
+emit_intrinsic(struct compiler_context *ctx, nir_intrinsic_instr *nir_instr)
{
- bi_index log2_base = bi_null();
-
- if (base.type == BI_INDEX_CONSTANT) {
- log2_base = bi_imm_f32(log2f(uif(base.value)));
- } else {
- log2_base = bi_temp(b->shader);
- bi_flog2_32(b, log2_base, base);
- }
-
- return bi_fexp_32(b, dst, exp, log2_base);
-}
-
-/* Bifrost has extremely coarse tables for approximating sin/cos, accessible as
- * FSIN/COS_TABLE.u6, which multiply the bottom 6 bits by pi/32 and
- * calculate the results. We use them to calculate sin/cos via a Taylor
- * approximation:
- *
- * f(x + e) = f(x) + e f'(x) + (e^2)/2 f''(x)
- * sin(x + e) = sin(x) + e cos(x) - (e^2)/2 sin(x)
- * cos(x + e) = cos(x) - e sin(x) - (e^2)/2 cos(x)
- */
-
-#define TWO_OVER_PI bi_imm_f32(2.0f / 3.14159f)
-#define MPI_OVER_TWO bi_imm_f32(-3.14159f / 2.0)
-#define SINCOS_BIAS bi_imm_u32(0x49400000)
+ nir_const_value *const_offset;
+ unsigned offset, reg;
-static void
-bi_lower_fsincos_32(bi_builder *b, bi_index dst, bi_index s0, bool cos)
-{
- /* the bottom 6 bits of the result, times pi/32, approximate s0 mod 2pi */
- bi_index x_u6 = bi_fma_f32(b, s0, TWO_OVER_PI, SINCOS_BIAS);
-
- /* Approximate domain error (small) */
- bi_index e = bi_fma_f32(b, bi_fadd_f32(b, x_u6, bi_neg(SINCOS_BIAS)),
- MPI_OVER_TWO, s0);
-
- /* Lookup sin(x), cos(x) */
- bi_index sinx = bi_fsin_table_u6(b, x_u6, false);
- bi_index cosx = bi_fcos_table_u6(b, x_u6, false);
-
- /* e^2 / 2 */
- bi_index e2_over_2 = bi_fma_rscale_f32(b, e, e, bi_negzero(),
- bi_imm_u32(-1), BI_SPECIAL_NONE);
-
- /* (-e^2)/2 f''(x) */
- bi_index quadratic = bi_fma_f32(b, bi_neg(e2_over_2),
- cos ? cosx : sinx,
- bi_negzero());
-
- /* e f'(x) - (e^2/2) f''(x) */
- bi_instr *I = bi_fma_f32_to(b, bi_temp(b->shader), e,
- cos ? bi_neg(sinx) : cosx,
- quadratic);
- I->clamp = BI_CLAMP_CLAMP_M1_1;
-
- /* f(x) + e f'(x) - (e^2/2) f''(x) */
- bi_fadd_f32_to(b, dst, I->dest[0], cos ? cosx : sinx);
-}
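/*
 * Editor's note: the quadratic Taylor correction from the comment above,
 * transcribed into plain scalar C.  sinx/cosx stand for the FSIN/FCOS table
 * lookups at the reduced angle and e is the small reduction error; the
 * function names are hypothetical.
 */
static inline float
sketch_sin_corrected(float sinx, float cosx, float e)
{
        return sinx + e * cosx - 0.5f * e * e * sinx;
}

static inline float
sketch_cos_corrected(float sinx, float cosx, float e)
{
        return cosx - e * sinx - 0.5f * e * e * cosx;
}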
+ switch (nir_instr->intrinsic) {
+ case nir_intrinsic_load_ubo: {
+ nir_const_value *location = nir_src_as_const_value(nir_instr->src[0]);
+ const_offset = nir_src_as_const_value(nir_instr->src[1]);
+ assert (location && "no indirect ubo selection");
+ assert (const_offset && "no indirect inputs");
-/*
- * The XOR lane op is useful for derivative calculations, but not all Bifrost
- * implementations have it. Add a safe helper that uses the hardware
- * functionality when available and lowers where unavailable.
- */
-static bi_index
-bi_clper_xor(bi_builder *b, bi_index s0, bi_index s1)
-{
- if (!(b->shader->quirks & BIFROST_LIMITED_CLPER)) {
- return bi_clper_i32(b, s0, s1,
- BI_INACTIVE_RESULT_ZERO, BI_LANE_OP_XOR,
- BI_SUBGROUP_SUBGROUP4);
- }
+ enum bifrost_ir_ops op;
- bi_index lane_id = bi_fau(BIR_FAU_LANE_ID, false);
- bi_index lane = bi_lshift_xor_i32(b, lane_id, s1, bi_imm_u8(0));
- return bi_clper_old_i32(b, s0, lane);
-}
-
-static enum bi_cmpf
-bi_translate_cmpf(nir_op op)
-{
- switch (op) {
- case nir_op_ieq8:
- case nir_op_ieq16:
- case nir_op_ieq32:
- case nir_op_feq16:
- case nir_op_feq32:
- return BI_CMPF_EQ;
-
- case nir_op_ine8:
- case nir_op_ine16:
- case nir_op_ine32:
- case nir_op_fneu16:
- case nir_op_fneu32:
- return BI_CMPF_NE;
-
- case nir_op_ilt8:
- case nir_op_ilt16:
- case nir_op_ilt32:
- case nir_op_flt16:
- case nir_op_flt32:
- case nir_op_ult8:
- case nir_op_ult16:
- case nir_op_ult32:
- return BI_CMPF_LT;
-
- case nir_op_ige8:
- case nir_op_ige16:
- case nir_op_ige32:
- case nir_op_fge16:
- case nir_op_fge32:
- case nir_op_uge8:
- case nir_op_uge16:
- case nir_op_uge32:
- return BI_CMPF_GE;
-
- default:
- unreachable("invalid comparison");
- }
-}
-
-static bool
-bi_nir_is_replicated(nir_alu_src *src)
-{
- for (unsigned i = 1; i < nir_src_num_components(src->src); ++i) {
- if (src->swizzle[0] == src->swizzle[i])
- return false;
- }
-
- return true;
-}
-
-static void
-bi_emit_alu(bi_builder *b, nir_alu_instr *instr)
-{
- bi_index dst = bi_dest_index(&instr->dest.dest);
- unsigned srcs = nir_op_infos[instr->op].num_inputs;
- unsigned sz = nir_dest_bit_size(instr->dest.dest);
- unsigned comps = nir_dest_num_components(instr->dest.dest);
- unsigned src_sz = srcs > 0 ? nir_src_bit_size(instr->src[0].src) : 0;
-
- /* Indicate scalarness */
- if (sz == 16 && comps == 1)
- dst.swizzle = BI_SWIZZLE_H00;
-
- /* First, match against the various moves in NIR. These are
- * special-cased because they can operate on vectors even after
- * lowering ALU to scalar. For Bifrost, bi_alu_src_index assumes the
- * instruction is no "bigger" than SIMD-within-a-register. These moves
- * are the exceptions that need to handle swizzles specially. */
-
- switch (instr->op) {
- case nir_op_vec2:
- case nir_op_vec3:
- case nir_op_vec4:
- case nir_op_vec8:
- case nir_op_vec16: {
- bi_index unoffset_srcs[16] = { bi_null() };
- unsigned channels[16] = { 0 };
-
- for (unsigned i = 0; i < srcs; ++i) {
- unoffset_srcs[i] = bi_src_index(&instr->src[i].src);
- channels[i] = instr->src[i].swizzle[0];
+ // load_ubo <UBO binding>, <byte offset>
+ // ld_ubo <byte offset>, <UBO binding>
+ switch (nir_dest_num_components(nir_instr->dest)) {
+ case 1:
+ op = op_ld_ubo_v1;
+ break;
+ case 2:
+ op = op_ld_ubo_v2;
+ break;
+ case 3:
+ op = op_ld_ubo_v3;
+ break;
+ case 4:
+ op = op_ld_ubo_v4;
+ break;
+ default:
+ assert(0);
+ break;
}
- bi_make_vec_to(b, dst, unoffset_srcs, channels, srcs, sz);
- return;
- }
-
- case nir_op_unpack_32_2x16: {
- /* Should have been scalarized */
- assert(comps == 2 && sz == 16);
-
- bi_index vec = bi_src_index(&instr->src[0].src);
- unsigned chan = instr->src[0].swizzle[0];
-
- bi_mov_i32_to(b, dst, bi_extract(b, vec, chan));
- return;
- }
-
- case nir_op_unpack_64_2x32_split_x:
- {
- unsigned chan = (instr->src[0].swizzle[0] * 2) + 0;
- bi_mov_i32_to(b, dst, bi_extract(b, bi_src_index(&instr->src[0].src), chan));
- return;
- }
-
- case nir_op_unpack_64_2x32_split_y:
- {
- unsigned chan = (instr->src[0].swizzle[0] * 2) + 1;
- bi_mov_i32_to(b, dst, bi_extract(b, bi_src_index(&instr->src[0].src), chan));
- return;
- }
-
- case nir_op_pack_64_2x32_split:
- bi_collect_v2i32_to(b, dst,
- bi_extract(b, bi_src_index(&instr->src[0].src), instr->src[0].swizzle[0]),
- bi_extract(b, bi_src_index(&instr->src[1].src), instr->src[1].swizzle[0]));
- return;
-
- case nir_op_pack_64_2x32:
- bi_collect_v2i32_to(b, dst,
- bi_extract(b, bi_src_index(&instr->src[0].src), 0),
- bi_extract(b, bi_src_index(&instr->src[0].src), 1));
- return;
-
- case nir_op_pack_uvec2_to_uint: {
- bi_index src = bi_src_index(&instr->src[0].src);
-
- assert(sz == 32 && src_sz == 32);
- bi_mkvec_v2i16_to(b, dst, bi_half(bi_extract(b, src, 0), false),
- bi_half(bi_extract(b, src, 1), false));
- return;
- }
-
- case nir_op_pack_uvec4_to_uint: {
- bi_index src = bi_src_index(&instr->src[0].src);
-
- assert(sz == 32 && src_sz == 32);
- bi_mkvec_v4i8_to(b, dst, bi_byte(bi_extract(b, src, 0), 0),
- bi_byte(bi_extract(b, src, 1), 0),
- bi_byte(bi_extract(b, src, 2), 0),
- bi_byte(bi_extract(b, src, 3), 0));
- return;
- }
-
- case nir_op_mov: {
- bi_index idx = bi_src_index(&instr->src[0].src);
- bi_index unoffset_srcs[4] = { idx, idx, idx, idx };
-
- unsigned channels[4] = {
- comps > 0 ? instr->src[0].swizzle[0] : 0,
- comps > 1 ? instr->src[0].swizzle[1] : 0,
- comps > 2 ? instr->src[0].swizzle[2] : 0,
- comps > 3 ? instr->src[0].swizzle[3] : 0,
- };
-
- bi_make_vec_to(b, dst, unoffset_srcs, channels, comps, src_sz);
- return;
- }
-
- case nir_op_pack_32_2x16: {
- assert(comps == 1);
-
- bi_index idx = bi_src_index(&instr->src[0].src);
- bi_index unoffset_srcs[4] = { idx, idx, idx, idx };
-
- unsigned channels[2] = {
- instr->src[0].swizzle[0],
- instr->src[0].swizzle[1]
+ reg = nir_dest_index(ctx, &nir_instr->dest);
+ struct bifrost_instruction instr = {
+ .op = op,
+ .dest_components = nir_dest_num_components(nir_instr->dest),
+ .ssa_args = {
+ .dest = reg,
+ .src0 = SSA_INVALID_VALUE,
+ .src1 = SSA_INVALID_VALUE,
+ .src2 = SSA_INVALID_VALUE,
+ .src3 = SSA_INVALID_VALUE,
+ },
+ .literal_args[0] = nir_src_as_uint(nir_instr->src[1]),
+ .literal_args[1] = nir_src_as_uint(nir_instr->src[0]),
};
- bi_make_vec_to(b, dst, unoffset_srcs, channels, 2, 16);
- return;
+ emit_mir_instruction(ctx, instr);
+ break;
}
+ case nir_intrinsic_store_ssbo: {
+ nir_const_value *location = nir_src_as_const_value(nir_instr->src[1]);
+ const_offset = nir_src_as_const_value(nir_instr->src[2]);
+ assert (location && "no indirect ubo selection");
+ assert (const_offset && "no indirect inputs");
- case nir_op_f2f16:
- case nir_op_f2f16_rtz:
- case nir_op_f2f16_rtne: {
- assert(src_sz == 32);
- bi_index idx = bi_src_index(&instr->src[0].src);
- bi_index s0 = bi_extract(b, idx, instr->src[0].swizzle[0]);
- bi_index s1 = comps > 1 ?
- bi_extract(b, idx, instr->src[0].swizzle[1]) : s0;
-
- bi_instr *I = bi_v2f32_to_v2f16_to(b, dst, s0, s1);
-
- /* Override rounding if explicitly requested. Otherwise, the
- * default rounding mode is selected by the builder. Depending
- * on the float controls required by the shader, the default
- * mode may not be nearest-even.
- */
- if (instr->op == nir_op_f2f16_rtz)
- I->round = BI_ROUND_RTZ;
- else if (instr->op == nir_op_f2f16_rtne)
- I->round = BI_ROUND_NONE; /* Nearest even */
+ // store_ssbo <Value>, <binding>, <offset>
+ // store_vN <Addr>, <Value>
+ reg = nir_src_index(ctx, &nir_instr->src[0]);
- return;
- }
-
- /* Vectorized downcasts */
- case nir_op_u2u16:
- case nir_op_i2i16: {
- if (!(src_sz == 32 && comps == 2))
+ enum bifrost_ir_ops op;
+ switch (nir_src_num_components(nir_instr->src[0])) {
+ case 1:
+ op = op_store_v1;
break;
-
- bi_index idx = bi_src_index(&instr->src[0].src);
- bi_index s0 = bi_extract(b, idx, instr->src[0].swizzle[0]);
- bi_index s1 = bi_extract(b, idx, instr->src[0].swizzle[1]);
-
- bi_mkvec_v2i16_to(b, dst,
- bi_half(s0, false), bi_half(s1, false));
- return;
- }
-
- /* While we do not have a direct V2U32_TO_V2F16 instruction, lowering to
- * MKVEC.v2i16 + V2U16_TO_V2F16 is more efficient on Bifrost than
- * scalarizing due to scheduling (equal cost on Valhall). Additionally
- * if the source is replicated the MKVEC.v2i16 can be optimized out.
- */
- case nir_op_u2f16:
- case nir_op_i2f16: {
- if (!(src_sz == 32 && comps == 2))
+ case 2:
+ op = op_store_v2;
break;
-
- nir_alu_src *src = &instr->src[0];
- bi_index idx = bi_src_index(&src->src);
- bi_index s0 = bi_extract(b, idx, src->swizzle[0]);
- bi_index s1 = bi_extract(b, idx, src->swizzle[1]);
-
- bi_index t = (src->swizzle[0] == src->swizzle[1]) ?
- bi_half(s0, false) :
- bi_mkvec_v2i16(b, bi_half(s0, false),
- bi_half(s1, false));
-
- if (instr->op == nir_op_u2f16)
- bi_v2u16_to_v2f16_to(b, dst, t);
- else
- bi_v2s16_to_v2f16_to(b, dst, t);
-
- return;
- }
-
- case nir_op_i2i8:
- case nir_op_u2u8:
- {
- /* Acts like an 8-bit swizzle */
- bi_index idx = bi_src_index(&instr->src[0].src);
- unsigned factor = src_sz / 8;
- unsigned chan[4] = { 0 };
-
- for (unsigned i = 0; i < comps; ++i)
- chan[i] = instr->src[0].swizzle[i] * factor;
-
- bi_make_vec_to(b, dst, &idx, chan, comps, 8);
- return;
- }
-
- case nir_op_b32csel:
- {
- if (sz != 16)
+ case 3:
+ op = op_store_v3;
+ break;
+ case 4:
+ op = op_store_v4;
+ break;
+ default:
+ assert(0);
break;
-
- /* We allow vectorizing b32csel(cond, A, B) which can be
- * translated as MUX.v2i16, even though cond is a 32-bit vector.
- *
- * If the source condition vector is replicated, we can use
- * MUX.v2i16 directly, letting each component use the
- * corresponding half of the 32-bit source. NIR uses 0/~0
- * booleans so that's guaranteed to work (that is, 32-bit NIR
- * booleans are 16-bit replicated).
- *
- * If we're not replicated, we use the same trick but must
- * insert a MKVEC.v2i16 first to convert down to 16-bit.
- */
- bi_index idx = bi_src_index(&instr->src[0].src);
- bi_index s0 = bi_extract(b, idx, instr->src[0].swizzle[0]);
- bi_index s1 = bi_alu_src_index(b, instr->src[1], comps);
- bi_index s2 = bi_alu_src_index(b, instr->src[2], comps);
-
- if (!bi_nir_is_replicated(&instr->src[0])) {
- s0 = bi_mkvec_v2i16(b, bi_half(s0, false),
- bi_half(bi_extract(b, idx, instr->src[0].swizzle[1]), false));
}
- bi_mux_v2i16_to(b, dst, s2, s1, s0, BI_MUX_INT_ZERO);
- return;
- }
-
- default:
- break;
- }
-
- bi_index s0 = srcs > 0 ? bi_alu_src_index(b, instr->src[0], comps) : bi_null();
- bi_index s1 = srcs > 1 ? bi_alu_src_index(b, instr->src[1], comps) : bi_null();
- bi_index s2 = srcs > 2 ? bi_alu_src_index(b, instr->src[2], comps) : bi_null();
-
- switch (instr->op) {
- case nir_op_ffma:
- bi_fma_to(b, sz, dst, s0, s1, s2);
- break;
-
- case nir_op_fmul:
- bi_fma_to(b, sz, dst, s0, s1, bi_negzero());
- break;
-
- case nir_op_fsub:
- s1 = bi_neg(s1);
- FALLTHROUGH;
- case nir_op_fadd:
- bi_fadd_to(b, sz, dst, s0, s1);
- break;
-
- case nir_op_fsat: {
- bi_instr *I = bi_fclamp_to(b, sz, dst, s0);
- I->clamp = BI_CLAMP_CLAMP_0_1;
- break;
- }
-
- case nir_op_fsat_signed_mali: {
- bi_instr *I = bi_fclamp_to(b, sz, dst, s0);
- I->clamp = BI_CLAMP_CLAMP_M1_1;
- break;
- }
-
- case nir_op_fclamp_pos_mali: {
- bi_instr *I = bi_fclamp_to(b, sz, dst, s0);
- I->clamp = BI_CLAMP_CLAMP_0_INF;
+ struct bifrost_instruction instr = {
+ .op = op,
+ .dest_components = 0,
+ .ssa_args = {
+ .dest = SSA_INVALID_VALUE,
+ .src0 = reg,
+ .src1 = SSA_INVALID_VALUE,
+ .src2 = SSA_INVALID_VALUE,
+ .src3 = SSA_INVALID_VALUE,
+ },
+ .literal_args[0] = nir_src_as_uint(nir_instr->src[2]),
+ };
+ emit_mir_instruction(ctx, instr);
break;
}
+ case nir_intrinsic_load_uniform:
+ offset = nir_intrinsic_base(nir_instr);
- case nir_op_fneg:
- bi_fabsneg_to(b, sz, dst, bi_neg(s0));
- break;
-
- case nir_op_fabs:
- bi_fabsneg_to(b, sz, dst, bi_abs(s0));
- break;
-
- case nir_op_fsin:
- bi_lower_fsincos_32(b, dst, s0, false);
- break;
-
- case nir_op_fcos:
- bi_lower_fsincos_32(b, dst, s0, true);
- break;
-
- case nir_op_fexp2:
- assert(sz == 32); /* should've been lowered */
-
- if (b->shader->quirks & BIFROST_NO_FP32_TRANSCENDENTALS)
- bi_lower_fexp2_32(b, dst, s0);
- else
- bi_fexp_32(b, dst, s0, bi_imm_f32(1.0f));
-
- break;
-
- case nir_op_flog2:
- assert(sz == 32); /* should've been lowered */
-
- if (b->shader->quirks & BIFROST_NO_FP32_TRANSCENDENTALS)
- bi_lower_flog2_32(b, dst, s0);
- else
- bi_flog2_32(b, dst, s0);
-
- break;
-
- case nir_op_fpow:
- assert(sz == 32); /* should've been lowered */
-
- if (b->shader->quirks & BIFROST_NO_FP32_TRANSCENDENTALS)
- bi_lower_fpow_32(b, dst, s0, s1);
- else
- bi_fpow_32(b, dst, s0, s1);
-
- break;
-
- case nir_op_frexp_exp:
- bi_frexpe_to(b, sz, dst, s0, false, false);
- break;
-
- case nir_op_frexp_sig:
- bi_frexpm_to(b, sz, dst, s0, false, false);
- break;
-
- case nir_op_ldexp:
- bi_ldexp_to(b, sz, dst, s0, s1);
- break;
-
- case nir_op_b8csel:
- bi_mux_v4i8_to(b, dst, s2, s1, s0, BI_MUX_INT_ZERO);
- break;
-
- case nir_op_b16csel:
- bi_mux_v2i16_to(b, dst, s2, s1, s0, BI_MUX_INT_ZERO);
- break;
-
- case nir_op_b32csel:
- bi_mux_i32_to(b, dst, s2, s1, s0, BI_MUX_INT_ZERO);
- break;
-
- case nir_op_extract_u8:
- case nir_op_extract_i8: {
- assert(comps == 1 && "should be scalarized");
- assert((src_sz == 16 || src_sz == 32) && "should be lowered");
- unsigned byte = nir_src_as_uint(instr->src[1].src);
-
- if (s0.swizzle == BI_SWIZZLE_H11) {
- assert(byte < 2);
- byte += 2;
- } else if (s0.swizzle != BI_SWIZZLE_H01) {
- assert(s0.swizzle == BI_SWIZZLE_H00);
+ if (nir_src_is_const(nir_instr->src[0])) {
+ offset += nir_src_as_uint(nir_instr->src[0]);
+ } else {
+ assert(0 && "Can't handle indirect load_uniform");
}
- assert(byte < 4);
-
- s0.swizzle = BI_SWIZZLE_H01;
-
- if (instr->op == nir_op_extract_i8)
- bi_s8_to_s32_to(b, dst, bi_byte(s0, byte));
- else
- bi_u8_to_u32_to(b, dst, bi_byte(s0, byte));
- break;
- }
-
- case nir_op_extract_u16:
- case nir_op_extract_i16: {
- assert(comps == 1 && "should be scalarized");
- assert(src_sz == 32 && "should be lowered");
- unsigned half = nir_src_as_uint(instr->src[1].src);
- assert(half == 0 || half == 1);
-
- if (instr->op == nir_op_extract_i16)
- bi_s16_to_s32_to(b, dst, bi_half(s0, half));
- else
- bi_u16_to_u32_to(b, dst, bi_half(s0, half));
- break;
- }
-
- case nir_op_insert_u16: {
- assert(comps == 1 && "should be scalarized");
- unsigned half = nir_src_as_uint(instr->src[1].src);
- assert(half == 0 || half == 1);
-
- if (half == 0)
- bi_u16_to_u32_to(b, dst, bi_half(s0, 0));
- else
- bi_mkvec_v2i16_to(b, dst, bi_imm_u16(0), bi_half(s0, 0));
- break;
- }
-
- case nir_op_ishl:
- bi_lshift_or_to(b, sz, dst, s0, bi_zero(), bi_byte(s1, 0));
- break;
- case nir_op_ushr:
- bi_rshift_or_to(b, sz, dst, s0, bi_zero(), bi_byte(s1, 0), false);
- break;
+ reg = nir_dest_index(ctx, &nir_instr->dest);
+
+ unsigned num_components = nir_dest_num_components(nir_instr->dest);
+ if (num_components == 1) {
+ struct bifrost_instruction instr = {
+ .op = op_mov,
+ .dest_components = 1,
+ .ssa_args = {
+ .dest = reg,
+ .src0 = SSA_FIXED_UREGISTER(offset),
+ .src1 = SSA_INVALID_VALUE,
+ .src2 = SSA_INVALID_VALUE,
+ .src3 = SSA_INVALID_VALUE,
+ },
+ };
+ emit_mir_instruction(ctx, instr);
+ } else {
+ uint32_t comps[4];
+
+ for (unsigned i = 0; i < nir_dest_num_components(nir_instr->dest); ++i) {
+ uint32_t temp_dest = alloc_mir_temp(ctx);
+ comps[i] = temp_dest;
+ struct bifrost_instruction instr = {
+ .op = op_mov,
+ .dest_components = 1,
+ .ssa_args = {
+ .dest = temp_dest,
+ .src0 = SSA_FIXED_UREGISTER(offset + (i * 4)),
+ .src1 = SSA_INVALID_VALUE,
+ .src2 = SSA_INVALID_VALUE,
+ .src3 = SSA_INVALID_VALUE,
+ },
+ };
+ emit_mir_instruction(ctx, instr);
+ }
- case nir_op_ishr:
- if (b->shader->arch >= 9)
- bi_rshift_or_to(b, sz, dst, s0, bi_zero(), bi_byte(s1, 0), true);
- else
- bi_arshift_to(b, sz, dst, s0, bi_null(), bi_byte(s1, 0));
+ emit_create_vector(ctx, reg, num_components, comps);
+ }
break;
- case nir_op_imin:
- case nir_op_umin:
- bi_csel_to(b, nir_op_infos[instr->op].input_types[0], sz, dst,
- s0, s1, s0, s1, BI_CMPF_LT);
- break;
+ case nir_intrinsic_load_input: {
+ const_offset = nir_src_as_const_value(nir_instr->src[0]);
+ assert (const_offset && "no indirect inputs");
- case nir_op_imax:
- case nir_op_umax:
- bi_csel_to(b, nir_op_infos[instr->op].input_types[0], sz, dst,
- s0, s1, s0, s1, BI_CMPF_GT);
- break;
+ offset = nir_intrinsic_base(nir_instr) + nir_src_as_uint(nir_instr->src[0]);
- case nir_op_fddx_must_abs_mali:
- case nir_op_fddy_must_abs_mali: {
- bi_index bit = bi_imm_u32(instr->op == nir_op_fddx_must_abs_mali ? 1 : 2);
- bi_index adjacent = bi_clper_xor(b, s0, bit);
- bi_fadd_to(b, sz, dst, adjacent, bi_neg(s0));
- break;
- }
+ reg = nir_dest_index(ctx, &nir_instr->dest);
- case nir_op_fddx:
- case nir_op_fddy:
- case nir_op_fddx_coarse:
- case nir_op_fddy_coarse:
- case nir_op_fddx_fine:
- case nir_op_fddy_fine: {
- unsigned axis;
- switch (instr->op) {
- case nir_op_fddx:
- case nir_op_fddx_coarse:
- case nir_op_fddx_fine:
- axis = 1;
+ enum bifrost_ir_ops op;
+ switch (nir_dest_num_components(nir_instr->dest)) {
+ case 1:
+ op = op_ld_attr_v1;
break;
- case nir_op_fddy:
- case nir_op_fddy_coarse:
- case nir_op_fddy_fine:
- axis = 2;
+ case 2:
+ op = op_ld_attr_v2;
break;
- default:
- unreachable("Invalid derivative op");
- }
-
- bi_index lane1, lane2;
- switch (instr->op) {
- case nir_op_fddx:
- case nir_op_fddx_fine:
- case nir_op_fddy:
- case nir_op_fddy_fine:
- lane1 = bi_lshift_and_i32(b,
- bi_fau(BIR_FAU_LANE_ID, false),
- bi_imm_u32(0x3 & ~axis),
- bi_imm_u8(0));
-
- lane2 = bi_iadd_u32(b, lane1,
- bi_imm_u32(axis),
- false);
+ case 3:
+ op = op_ld_attr_v3;
break;
- case nir_op_fddx_coarse:
- case nir_op_fddy_coarse:
- lane1 = bi_imm_u32(0);
- lane2 = bi_imm_u32(axis);
+ case 4:
+ op = op_ld_attr_v4;
break;
default:
- unreachable("Invalid derivative op");
- }
-
- bi_index left, right;
-
- if (b->shader->quirks & BIFROST_LIMITED_CLPER) {
- left = bi_clper_old_i32(b, s0, lane1);
- right = bi_clper_old_i32(b, s0, lane2);
- } else {
- left = bi_clper_i32(b, s0, lane1,
- BI_INACTIVE_RESULT_ZERO, BI_LANE_OP_NONE,
- BI_SUBGROUP_SUBGROUP4);
-
- right = bi_clper_i32(b, s0, lane2,
- BI_INACTIVE_RESULT_ZERO, BI_LANE_OP_NONE,
- BI_SUBGROUP_SUBGROUP4);
+ assert(0);
+ break;
}
- bi_fadd_to(b, sz, dst, right, bi_neg(left));
- break;
- }
-
- case nir_op_f2f32:
- bi_f16_to_f32_to(b, dst, s0);
- break;
-
- case nir_op_fquantize2f16:
- {
- bi_instr *f16 = bi_v2f32_to_v2f16_to(b, bi_temp(b->shader), s0, s0);
- bi_instr *f32 = bi_f16_to_f32_to(b, dst, bi_half(f16->dest[0], false));
-
- f16->ftz = f32->ftz = true;
- break;
- }
-
- case nir_op_f2i32:
- if (src_sz == 32)
- bi_f32_to_s32_to(b, dst, s0);
- else
- bi_f16_to_s32_to(b, dst, s0);
- break;
-
- /* Note 32-bit sources => no vectorization, so 32-bit works */
- case nir_op_f2u16:
- if (src_sz == 32)
- bi_f32_to_u32_to(b, dst, s0);
- else
- bi_v2f16_to_v2u16_to(b, dst, s0);
- break;
-
- case nir_op_f2i16:
- if (src_sz == 32)
- bi_f32_to_s32_to(b, dst, s0);
- else
- bi_v2f16_to_v2s16_to(b, dst, s0);
- break;
-
- case nir_op_f2u32:
- if (src_sz == 32)
- bi_f32_to_u32_to(b, dst, s0);
- else
- bi_f16_to_u32_to(b, dst, s0);
- break;
-
- case nir_op_u2f16:
- if (src_sz == 32)
- bi_v2u16_to_v2f16_to(b, dst, bi_half(s0, false));
- else if (src_sz == 16)
- bi_v2u16_to_v2f16_to(b, dst, s0);
- else if (src_sz == 8)
- bi_v2u8_to_v2f16_to(b, dst, s0);
- break;
-
- case nir_op_u2f32:
- if (src_sz == 32)
- bi_u32_to_f32_to(b, dst, s0);
- else if (src_sz == 16)
- bi_u16_to_f32_to(b, dst, s0);
- else
- bi_u8_to_f32_to(b, dst, s0);
- break;
-
- case nir_op_i2f16:
- if (src_sz == 32)
- bi_v2s16_to_v2f16_to(b, dst, bi_half(s0, false));
- else if (src_sz == 16)
- bi_v2s16_to_v2f16_to(b, dst, s0);
- else if (src_sz == 8)
- bi_v2s8_to_v2f16_to(b, dst, s0);
- break;
-
- case nir_op_i2f32:
- assert(src_sz == 32 || src_sz == 16 || src_sz == 8);
-
- if (src_sz == 32)
- bi_s32_to_f32_to(b, dst, s0);
- else if (src_sz == 16)
- bi_s16_to_f32_to(b, dst, s0);
- else if (src_sz == 8)
- bi_s8_to_f32_to(b, dst, s0);
- break;
-
- case nir_op_i2i32:
- assert(src_sz == 32 || src_sz == 16 || src_sz == 8);
-
- if (src_sz == 32)
- bi_mov_i32_to(b, dst, s0);
- else if (src_sz == 16)
- bi_s16_to_s32_to(b, dst, s0);
- else if (src_sz == 8)
- bi_s8_to_s32_to(b, dst, s0);
- break;
-
- case nir_op_u2u32:
- assert(src_sz == 32 || src_sz == 16 || src_sz == 8);
-
- if (src_sz == 32)
- bi_mov_i32_to(b, dst, s0);
- else if (src_sz == 16)
- bi_u16_to_u32_to(b, dst, s0);
- else if (src_sz == 8)
- bi_u8_to_u32_to(b, dst, s0);
-
- break;
-
- case nir_op_i2i16:
- assert(src_sz == 8 || src_sz == 32);
-
- if (src_sz == 8)
- bi_v2s8_to_v2s16_to(b, dst, s0);
- else
- bi_mov_i32_to(b, dst, s0);
- break;
-
- case nir_op_u2u16:
- assert(src_sz == 8 || src_sz == 32);
-
- if (src_sz == 8)
- bi_v2u8_to_v2u16_to(b, dst, s0);
- else
- bi_mov_i32_to(b, dst, s0);
- break;
-
- case nir_op_b2i8:
- case nir_op_b2i16:
- case nir_op_b2i32:
- bi_mux_to(b, sz, dst, bi_imm_u8(0), bi_imm_uintN(1, sz), s0, BI_MUX_INT_ZERO);
- break;
-
- case nir_op_f2b16:
- bi_mux_v2i16_to(b, dst, bi_imm_u16(0), bi_imm_u16(~0), s0, BI_MUX_FP_ZERO);
- break;
- case nir_op_f2b32:
- bi_mux_i32_to(b, dst, bi_imm_u32(0), bi_imm_u32(~0), s0, BI_MUX_FP_ZERO);
- break;
-
- case nir_op_i2b8:
- bi_mux_v4i8_to(b, dst, bi_imm_u8(0), bi_imm_u8(~0), s0, BI_MUX_INT_ZERO);
- break;
- case nir_op_i2b16:
- bi_mux_v2i16_to(b, dst, bi_imm_u16(0), bi_imm_u16(~0), s0, BI_MUX_INT_ZERO);
- break;
- case nir_op_i2b32:
- bi_mux_i32_to(b, dst, bi_imm_u32(0), bi_imm_u32(~0), s0, BI_MUX_INT_ZERO);
- break;
-
- case nir_op_ieq8:
- case nir_op_ine8:
- case nir_op_ilt8:
- case nir_op_ige8:
- case nir_op_ieq16:
- case nir_op_ine16:
- case nir_op_ilt16:
- case nir_op_ige16:
- case nir_op_ieq32:
- case nir_op_ine32:
- case nir_op_ilt32:
- case nir_op_ige32:
- bi_icmp_to(b, nir_type_int, sz, dst, s0, s1, bi_translate_cmpf(instr->op), BI_RESULT_TYPE_M1);
- break;
-
- case nir_op_ult8:
- case nir_op_uge8:
- case nir_op_ult16:
- case nir_op_uge16:
- case nir_op_ult32:
- case nir_op_uge32:
- bi_icmp_to(b, nir_type_uint, sz, dst, s0, s1, bi_translate_cmpf(instr->op), BI_RESULT_TYPE_M1);
- break;
-
- case nir_op_feq32:
- case nir_op_feq16:
- case nir_op_flt32:
- case nir_op_flt16:
- case nir_op_fge32:
- case nir_op_fge16:
- case nir_op_fneu32:
- case nir_op_fneu16:
- bi_fcmp_to(b, sz, dst, s0, s1, bi_translate_cmpf(instr->op), BI_RESULT_TYPE_M1);
- break;
-
- case nir_op_fround_even:
- case nir_op_fceil:
- case nir_op_ffloor:
- case nir_op_ftrunc:
- bi_fround_to(b, sz, dst, s0, bi_nir_round(instr->op));
- break;
-
- case nir_op_fmin:
- bi_fmin_to(b, sz, dst, s0, s1);
- break;
-
- case nir_op_fmax:
- bi_fmax_to(b, sz, dst, s0, s1);
- break;
-
- case nir_op_iadd:
- bi_iadd_to(b, nir_type_int, sz, dst, s0, s1, false);
- break;
-
- case nir_op_iadd_sat:
- bi_iadd_to(b, nir_type_int, sz, dst, s0, s1, true);
- break;
-
- case nir_op_uadd_sat:
- bi_iadd_to(b, nir_type_uint, sz, dst, s0, s1, true);
- break;
-
- case nir_op_ihadd:
- bi_hadd_to(b, nir_type_int, sz, dst, s0, s1, BI_ROUND_RTN);
- break;
-
- case nir_op_irhadd:
- bi_hadd_to(b, nir_type_int, sz, dst, s0, s1, BI_ROUND_RTP);
- break;
-
- case nir_op_uhadd:
- bi_hadd_to(b, nir_type_uint, sz, dst, s0, s1, BI_ROUND_RTN);
- break;
-
- case nir_op_urhadd:
- bi_hadd_to(b, nir_type_uint, sz, dst, s0, s1, BI_ROUND_RTP);
- break;
-
- case nir_op_ineg:
- bi_isub_to(b, nir_type_int, sz, dst, bi_zero(), s0, false);
- break;
-
- case nir_op_isub:
- bi_isub_to(b, nir_type_int, sz, dst, s0, s1, false);
- break;
-
- case nir_op_isub_sat:
- bi_isub_to(b, nir_type_int, sz, dst, s0, s1, true);
- break;
-
- case nir_op_usub_sat:
- bi_isub_to(b, nir_type_uint, sz, dst, s0, s1, true);
- break;
-
- case nir_op_imul:
- bi_imul_to(b, sz, dst, s0, s1);
- break;
-
- case nir_op_iabs:
- bi_iabs_to(b, sz, dst, s0);
- break;
-
- case nir_op_iand:
- bi_lshift_and_to(b, sz, dst, s0, s1, bi_imm_u8(0));
- break;
-
- case nir_op_ior:
- bi_lshift_or_to(b, sz, dst, s0, s1, bi_imm_u8(0));
- break;
-
- case nir_op_ixor:
- bi_lshift_xor_to(b, sz, dst, s0, s1, bi_imm_u8(0));
- break;
-
- case nir_op_inot:
- bi_lshift_or_to(b, sz, dst, bi_zero(), bi_not(s0), bi_imm_u8(0));
- break;
-
- case nir_op_frsq:
- if (sz == 32 && b->shader->quirks & BIFROST_NO_FP32_TRANSCENDENTALS)
- bi_lower_frsq_32(b, dst, s0);
- else
- bi_frsq_to(b, sz, dst, s0);
- break;
-
- case nir_op_frcp:
- if (sz == 32 && b->shader->quirks & BIFROST_NO_FP32_TRANSCENDENTALS)
- bi_lower_frcp_32(b, dst, s0);
- else
- bi_frcp_to(b, sz, dst, s0);
- break;
-
- case nir_op_uclz:
- bi_clz_to(b, sz, dst, s0, false);
- break;
-
- case nir_op_bit_count:
- assert(sz == 32 && src_sz == 32 && "should've been lowered");
- bi_popcount_i32_to(b, dst, s0);
- break;
-
- case nir_op_bitfield_reverse:
- assert(sz == 32 && src_sz == 32 && "should've been lowered");
- bi_bitrev_i32_to(b, dst, s0);
- break;
-
- case nir_op_ufind_msb: {
- bi_index clz = bi_clz(b, src_sz, s0, false);
-
- if (sz == 8)
- clz = bi_byte(clz, 0);
- else if (sz == 16)
- clz = bi_half(clz, false);
-
- bi_isub_u32_to(b, dst, bi_imm_u32(src_sz - 1), clz, false);
- break;
- }
-
- default:
- fprintf(stderr, "Unhandled ALU op %s\n", nir_op_infos[instr->op].name);
- unreachable("Unknown ALU op");
- }
-}
-
-/* Returns dimension with 0 special casing cubemaps. Shamelessly copied from Midgard */
-static unsigned
-bifrost_tex_format(enum glsl_sampler_dim dim)
-{
- switch (dim) {
- case GLSL_SAMPLER_DIM_1D:
- case GLSL_SAMPLER_DIM_BUF:
- return 1;
-
- case GLSL_SAMPLER_DIM_2D:
- case GLSL_SAMPLER_DIM_MS:
- case GLSL_SAMPLER_DIM_EXTERNAL:
- case GLSL_SAMPLER_DIM_RECT:
- return 2;
-
- case GLSL_SAMPLER_DIM_3D:
- return 3;
-
- case GLSL_SAMPLER_DIM_CUBE:
- return 0;
-
- default:
- DBG("Unknown sampler dim type\n");
- assert(0);
- return 0;
- }
-}
-
-static enum bi_dimension
-valhall_tex_dimension(enum glsl_sampler_dim dim)
-{
- switch (dim) {
- case GLSL_SAMPLER_DIM_1D:
- case GLSL_SAMPLER_DIM_BUF:
- return BI_DIMENSION_1D;
-
- case GLSL_SAMPLER_DIM_2D:
- case GLSL_SAMPLER_DIM_MS:
- case GLSL_SAMPLER_DIM_EXTERNAL:
- case GLSL_SAMPLER_DIM_RECT:
- return BI_DIMENSION_2D;
-
- case GLSL_SAMPLER_DIM_3D:
- return BI_DIMENSION_3D;
-
- case GLSL_SAMPLER_DIM_CUBE:
- return BI_DIMENSION_CUBE;
-
- default:
- unreachable("Unknown sampler dim type");
- }
-}
-
-static enum bifrost_texture_format_full
-bi_texture_format(nir_alu_type T, enum bi_clamp clamp)
-{
- switch (T) {
- case nir_type_float16: return BIFROST_TEXTURE_FORMAT_F16 + clamp;
- case nir_type_float32: return BIFROST_TEXTURE_FORMAT_F32 + clamp;
- case nir_type_uint16: return BIFROST_TEXTURE_FORMAT_U16;
- case nir_type_int16: return BIFROST_TEXTURE_FORMAT_S16;
- case nir_type_uint32: return BIFROST_TEXTURE_FORMAT_U32;
- case nir_type_int32: return BIFROST_TEXTURE_FORMAT_S32;
- default: unreachable("Invalid type for texturing");
- }
-}
-
-/* Array indices are specified as 32-bit uints and need converting; NIR passes them in the .z component */
-static bi_index
-bi_emit_texc_array_index(bi_builder *b, bi_index idx, nir_alu_type T)
-{
- /* For (u)int we can just passthrough */
- nir_alu_type base = nir_alu_type_get_base_type(T);
- if (base == nir_type_int || base == nir_type_uint)
- return idx;
-
- /* Otherwise we convert */
- assert(T == nir_type_float32);
-
- /* OpenGL ES 3.2 specification section 8.14.2 ("Coordinate Wrapping and
- * Texel Selection") defines the layer to be taken from clamp(RNE(r),
- * 0, dt - 1). So we round to nearest even (RTE); clamping is handled at
- * the data structure level */
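- /* E.g. a layer coordinate of 2.5 selects layer 2 while 3.5 selects layer
- * 4 under round-to-nearest-even, before the clamp to [0, dt - 1]. */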
-
- bi_instr *I = bi_f32_to_u32_to(b, bi_temp(b->shader), idx);
- I->round = BI_ROUND_NONE;
- return I->dest[0];
-}
-
-/* TEXC's explicit and bias LOD modes require the LOD to be transformed to a
- * 16-bit 8:8 fixed-point format. We lower as:
- *
- * F32_TO_S32(clamp(x, -16.0, +16.0) * 256.0) & 0xFFFF =
- * MKVEC(F32_TO_S32(clamp(x * 1.0/16.0, -1.0, 1.0) * (16.0 * 256.0)), #0)
- */
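-
-/* As a worked example of the encoding above, a constant LOD of 2.5 becomes
- * F32_TO_S32(2.5 * 256.0) & 0xFFFF = 0x0280 (integer part in the upper byte,
- * fraction in the lower byte), while a LOD of -1.0 becomes 0xFF00. */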
-
-static bi_index
-bi_emit_texc_lod_88(bi_builder *b, bi_index lod, bool fp16)
-{
- /* Precompute for constant LODs to avoid general constant folding */
- if (lod.type == BI_INDEX_CONSTANT) {
- uint32_t raw = lod.value;
- float x = fp16 ? _mesa_half_to_float(raw) : uif(raw);
- int32_t s32 = CLAMP(x, -16.0f, 16.0f) * 256.0f;
- return bi_imm_u32(s32 & 0xFFFF);
- }
-
- /* Sort of arbitrary. Must be less than 128.0, greater than or equal to
- * the max LOD (16 since we cap at 2^16 texture dimensions), and
- * preferably small to minimize precision loss */
- const float max_lod = 16.0;
-
- bi_instr *fsat = bi_fma_f32_to(b, bi_temp(b->shader),
- fp16 ? bi_half(lod, false) : lod,
- bi_imm_f32(1.0f / max_lod), bi_negzero());
-
- fsat->clamp = BI_CLAMP_CLAMP_M1_1;
-
- bi_index fmul = bi_fma_f32(b, fsat->dest[0], bi_imm_f32(max_lod * 256.0f),
- bi_negzero());
-
- return bi_mkvec_v2i16(b,
- bi_half(bi_f32_to_s32(b, fmul), false), bi_imm_u16(0));
-}
-
-/* FETCH takes a 32-bit staging register containing the LOD as an integer in
- * the bottom 16-bits and (if present) the cube face index in the top 16-bits.
- * TODO: Cube face.
- */
-
-static bi_index
-bi_emit_texc_lod_cube(bi_builder *b, bi_index lod)
-{
- return bi_lshift_or_i32(b, lod, bi_zero(), bi_imm_u8(8));
-}
-
-/* The hardware specifies texel offsets and multisample indices together as a
- * u8vec4 <offset, ms index>. By default all are zero, so if we have either a
- * nonzero texel offset or a nonzero multisample index, we build a u8vec4 with
- * the bits we need and return that to be passed as a staging register. Else we
- * return 0 to avoid allocating a data register when everything is zero. */
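-
-/* For instance, a 2D texel offset of <1, -2> together with sample index 3
- * packs as the bytes <0x01, 0xFE, 0x00, 0x03>, i.e. the 32-bit word
- * 0x0300FE01 when read little-endian. */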
-
-static bi_index
-bi_emit_texc_offset_ms_index(bi_builder *b, nir_tex_instr *instr)
-{
- bi_index dest = bi_zero();
-
- int offs_idx = nir_tex_instr_src_index(instr, nir_tex_src_offset);
- if (offs_idx >= 0 &&
- (!nir_src_is_const(instr->src[offs_idx].src) ||
- nir_src_as_uint(instr->src[offs_idx].src) != 0)) {
- unsigned nr = nir_src_num_components(instr->src[offs_idx].src);
- bi_index idx = bi_src_index(&instr->src[offs_idx].src);
- dest = bi_mkvec_v4i8(b,
- (nr > 0) ? bi_byte(bi_extract(b, idx, 0), 0) : bi_imm_u8(0),
- (nr > 1) ? bi_byte(bi_extract(b, idx, 1), 0) : bi_imm_u8(0),
- (nr > 2) ? bi_byte(bi_extract(b, idx, 2), 0) : bi_imm_u8(0),
- bi_imm_u8(0));
- }
-
- int ms_idx = nir_tex_instr_src_index(instr, nir_tex_src_ms_index);
- if (ms_idx >= 0 &&
- (!nir_src_is_const(instr->src[ms_idx].src) ||
- nir_src_as_uint(instr->src[ms_idx].src) != 0)) {
- dest = bi_lshift_or_i32(b,
- bi_src_index(&instr->src[ms_idx].src), dest,
- bi_imm_u8(24));
- }
-
- return dest;
-}
-
-/*
- * Valhall specifies texel offsets, multisample indices, and (for
- * fetches) LOD together as a u8vec4 <offset.xyz, LOD>, where the third
- * component is either offset.z or multisample index depending on context. Build
- * this register.
- */
-static bi_index
-bi_emit_valhall_offsets(bi_builder *b, nir_tex_instr *instr)
-{
- bi_index dest = bi_zero();
-
- int offs_idx = nir_tex_instr_src_index(instr, nir_tex_src_offset);
- int ms_idx = nir_tex_instr_src_index(instr, nir_tex_src_ms_index);
- int lod_idx = nir_tex_instr_src_index(instr, nir_tex_src_lod);
-
- /* Components 0-2: offsets */
- if (offs_idx >= 0 &&
- (!nir_src_is_const(instr->src[offs_idx].src) ||
- nir_src_as_uint(instr->src[offs_idx].src) != 0)) {
- unsigned nr = nir_src_num_components(instr->src[offs_idx].src);
- bi_index idx = bi_src_index(&instr->src[offs_idx].src);
-
- /* No multisample index with 3D */
- assert((nr <= 2) || (ms_idx < 0));
-
- /* Zero extend the Z byte so we can use it with MKVEC.v2i8 */
- bi_index z = (nr > 2) ?
- bi_mkvec_v2i8(b, bi_byte(bi_extract(b, idx, 2), 0),
- bi_imm_u8(0), bi_zero()) :
- bi_zero();
-
- dest = bi_mkvec_v2i8(b,
- (nr > 0) ? bi_byte(bi_extract(b, idx, 0), 0) : bi_imm_u8(0),
- (nr > 1) ? bi_byte(bi_extract(b, idx, 1), 0) : bi_imm_u8(0),
- z);
- }
-
- /* Component 2: multisample index */
- if (ms_idx >= 0 &&
- (!nir_src_is_const(instr->src[ms_idx].src) ||
- nir_src_as_uint(instr->src[ms_idx].src) != 0)) {
- dest = bi_mkvec_v2i16(b, dest,
- bi_src_index(&instr->src[ms_idx].src));
- }
-
- /* Component 3: 8-bit LOD */
- if (lod_idx >= 0 &&
- (!nir_src_is_const(instr->src[lod_idx].src) ||
- nir_src_as_uint(instr->src[lod_idx].src) != 0) &&
- nir_tex_instr_src_type(instr, lod_idx) != nir_type_float) {
- dest = bi_lshift_or_i32(b,
- bi_src_index(&instr->src[lod_idx].src), dest,
- bi_imm_u8(24));
- }
-
- return dest;
-}
-
-static void
-bi_emit_cube_coord(bi_builder *b, bi_index coord,
- bi_index *face, bi_index *s, bi_index *t)
-{
- /* Compute max { |x|, |y|, |z| } */
- bi_index maxxyz = bi_temp(b->shader);
- *face = bi_temp(b->shader);
-
- bi_index cx = bi_extract(b, coord, 0),
- cy = bi_extract(b, coord, 1),
- cz = bi_extract(b, coord, 2);
-
- /* Use a pseudo op on Bifrost due to tuple restrictions */
- if (b->shader->arch <= 8) {
- bi_cubeface_to(b, maxxyz, *face, cx, cy, cz);
- } else {
- bi_cubeface1_to(b, maxxyz, cx, cy, cz);
- bi_cubeface2_v9_to(b, *face, cx, cy, cz);
- }
-
- /* Select coordinates */
- bi_index ssel = bi_cube_ssel(b, bi_extract(b, coord, 2), bi_extract(b, coord, 0), *face);
- bi_index tsel = bi_cube_tsel(b, bi_extract(b, coord, 1), bi_extract(b, coord, 2),
- *face);
-
- /* The OpenGL ES specification requires us to transform an input vector
- * (x, y, z) to the coordinate, given the selected S/T:
- *
- * (1/2 ((s / max{x,y,z}) + 1), 1/2 ((t / max{x, y, z}) + 1))
- *
- * We implement (s shown, t similar) in a form friendlier to FMA
- * instructions, and clamp coordinates at the end for correct
- * NaN/infinity handling:
- *
- * fsat(s * (0.5 * (1 / max{x, y, z})) + 0.5)
- *
- * Take the reciprocal of max{x, y, z}
- */
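- /* Writing m for max{x, y, z}, the rewrite checks out:
- * fsat(s * (0.5 * (1/m)) + 0.5) = fsat(0.5 * (s/m + 1)),
- * so s = -m maps to 0.0 and s = +m maps to 1.0. */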
- bi_index rcp = bi_frcp_f32(b, maxxyz);
-
- /* Calculate 0.5 * (1.0 / max{x, y, z}) */
- bi_index fma1 = bi_fma_f32(b, rcp, bi_imm_f32(0.5f), bi_negzero());
-
- /* Transform the coordinates */
- *s = bi_temp(b->shader);
- *t = bi_temp(b->shader);
-
- bi_instr *S = bi_fma_f32_to(b, *s, fma1, ssel, bi_imm_f32(0.5f));
- bi_instr *T = bi_fma_f32_to(b, *t, fma1, tsel, bi_imm_f32(0.5f));
-
- S->clamp = BI_CLAMP_CLAMP_0_1;
- T->clamp = BI_CLAMP_CLAMP_0_1;
-}
-
-/* Emits a cube map descriptor, returning lower 32-bits and putting upper
- * 32-bits in passed pointer t. The packing of the face with the S coordinate
- * exploits the redundancy of floating points with the range restriction of
- * CUBEFACE output.
- *
- * struct cube_map_descriptor {
- * float s : 29;
- * unsigned face : 3;
- * float t : 32;
- * }
- *
- * Since the cube face index is preshifted, this is easy to pack with a bitwise
- * MUX.i32 and a fixed mask, selecting the lower 29 bits from s and the upper 3
- * bits from face.
- */
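-
-/* For example, with s = 0.5f (bit pattern 0x3F000000) and the preshifted face
- * value for face 5 (5 << 29 = 0xA0000000), the packed word is
- * (0x3F000000 & 0x1FFFFFFF) | 0xA0000000 = 0xBF000000: the top 3 bits carry
- * the face, and the remaining 29 bits still hold s precisely enough for the
- * restricted [0, 1] range. */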
-
-static bi_index
-bi_emit_texc_cube_coord(bi_builder *b, bi_index coord, bi_index *t)
-{
- bi_index face, s;
- bi_emit_cube_coord(b, coord, &face, &s, t);
- bi_index mask = bi_imm_u32(BITFIELD_MASK(29));
- return bi_mux_i32(b, s, face, mask, BI_MUX_BIT);
-}
-
-/* Map to the main texture op used. Some of these (txd in particular) will
- * lower to multiple texture ops with different opcodes (GRDESC_DER + TEX in
- * sequence). We assume that lowering is handled elsewhere.
- */
-
-static enum bifrost_tex_op
-bi_tex_op(nir_texop op)
-{
- switch (op) {
- case nir_texop_tex:
- case nir_texop_txb:
- case nir_texop_txl:
- case nir_texop_txd:
- case nir_texop_tex_prefetch:
- return BIFROST_TEX_OP_TEX;
- case nir_texop_txf:
- case nir_texop_txf_ms:
- case nir_texop_txf_ms_fb:
- case nir_texop_tg4:
- return BIFROST_TEX_OP_FETCH;
- case nir_texop_txs:
- case nir_texop_lod:
- case nir_texop_query_levels:
- case nir_texop_texture_samples:
- case nir_texop_samples_identical:
- unreachable("should've been lowered");
- default:
- unreachable("unsupported tex op");
- }
-}
-
-/* Data registers required by texturing in the order they appear. All are
- * optional, the texture operation descriptor determines which are present.
- * Note since 3D arrays are not permitted at an API level, Z_COORD and
- * ARRAY/SHADOW are exclusive, so TEXC in practice reads at most 8 registers */
-
-enum bifrost_tex_dreg {
- BIFROST_TEX_DREG_Z_COORD = 0,
- BIFROST_TEX_DREG_Y_DELTAS = 1,
- BIFROST_TEX_DREG_LOD = 2,
- BIFROST_TEX_DREG_GRDESC_HI = 3,
- BIFROST_TEX_DREG_SHADOW = 4,
- BIFROST_TEX_DREG_ARRAY = 5,
- BIFROST_TEX_DREG_OFFSETMS = 6,
- BIFROST_TEX_DREG_SAMPLER = 7,
- BIFROST_TEX_DREG_TEXTURE = 8,
- BIFROST_TEX_DREG_COUNT,
-};
-
-static void
-bi_emit_texc(bi_builder *b, nir_tex_instr *instr)
-{
- struct bifrost_texture_operation desc = {
- .op = bi_tex_op(instr->op),
- .offset_or_bias_disable = false, /* TODO */
- .shadow_or_clamp_disable = instr->is_shadow,
- .array = instr->is_array,
- .dimension = bifrost_tex_format(instr->sampler_dim),
- .format = bi_texture_format(instr->dest_type | nir_dest_bit_size(instr->dest), BI_CLAMP_NONE), /* TODO */
- .mask = 0xF,
- };
+ struct bifrost_instruction instr = {
+ .op = op,
+ .dest_components = nir_dest_num_components(nir_instr->dest),
+ .ssa_args = {
+ .dest = reg,
+ .src0 = offset,
+ .src1 = SSA_INVALID_VALUE,
+ .src2 = SSA_INVALID_VALUE,
+ .src3 = SSA_INVALID_VALUE,
+ }
+ };
- switch (desc.op) {
- case BIFROST_TEX_OP_TEX:
- desc.lod_or_fetch = BIFROST_LOD_MODE_COMPUTE;
- break;
- case BIFROST_TEX_OP_FETCH:
- desc.lod_or_fetch = (enum bifrost_lod_mode)
- (instr->op == nir_texop_tg4 ?
- BIFROST_TEXTURE_FETCH_GATHER4_R + instr->component :
- BIFROST_TEXTURE_FETCH_TEXEL);
+ emit_mir_instruction(ctx, instr);
break;
- default:
- unreachable("texture op unsupported");
}
+ case nir_intrinsic_store_output: {
+ const_offset = nir_src_as_const_value(nir_instr->src[1]);
+ assert(const_offset && "no indirect outputs");
- /* 32-bit indices to be allocated as consecutive staging registers */
- bi_index dregs[BIFROST_TEX_DREG_COUNT] = { };
- bi_index cx = bi_null(), cy = bi_null();
-
- for (unsigned i = 0; i < instr->num_srcs; ++i) {
- bi_index index = bi_src_index(&instr->src[i].src);
- unsigned sz = nir_src_bit_size(instr->src[i].src);
- unsigned components = nir_src_num_components(instr->src[i].src);
- ASSERTED nir_alu_type base = nir_tex_instr_src_type(instr, i);
- nir_alu_type T = base | sz;
-
- switch (instr->src[i].src_type) {
- case nir_tex_src_coord:
- if (instr->sampler_dim == GLSL_SAMPLER_DIM_CUBE) {
- cx = bi_emit_texc_cube_coord(b, index, &cy);
- } else {
- /* Copy XY (for 2D+) or XX (for 1D) */
- cx = bi_extract(b, index, 0);
- cy = bi_extract(b, index, MIN2(1, components - 1));
-
- assert(components >= 1 && components <= 3);
-
- if (components == 3 && !desc.array) {
- /* 3D */
- dregs[BIFROST_TEX_DREG_Z_COORD] =
- bi_extract(b, index, 2);
- }
- }
+ offset = nir_intrinsic_base(nir_instr);
+ if (ctx->stage == MESA_SHADER_FRAGMENT) {
+ int comp = nir_intrinsic_component(nir_instr);
+ offset += comp;
+ // XXX: Once we support more than colour outputs, this will need to change
+ void *entry = _mesa_hash_table_u64_search(ctx->outputs_nir_to_bi, offset + FRAG_RESULT_DATA0 + 1);
- if (desc.array) {
- dregs[BIFROST_TEX_DREG_ARRAY] =
- bi_emit_texc_array_index(b,
- bi_extract(b, index, components - 1), T);
+ if (!entry) {
+ printf("WARNING: skipping fragment output\n");
+ break;
}
- break;
+ offset = (uintptr_t) (entry) - 1;
+ reg = nir_src_index(ctx, &nir_instr->src[0]);
- case nir_tex_src_lod:
- if (desc.op == BIFROST_TEX_OP_TEX &&
- nir_src_is_const(instr->src[i].src) &&
- nir_src_as_uint(instr->src[i].src) == 0) {
- desc.lod_or_fetch = BIFROST_LOD_MODE_ZERO;
- } else if (desc.op == BIFROST_TEX_OP_TEX) {
- assert(base == nir_type_float);
-
- assert(sz == 16 || sz == 32);
- dregs[BIFROST_TEX_DREG_LOD] =
- bi_emit_texc_lod_88(b, index, sz == 16);
- desc.lod_or_fetch = BIFROST_LOD_MODE_EXPLICIT;
- } else {
- assert(desc.op == BIFROST_TEX_OP_FETCH);
- assert(base == nir_type_uint || base == nir_type_int);
- assert(sz == 16 || sz == 32);
-
- dregs[BIFROST_TEX_DREG_LOD] =
- bi_emit_texc_lod_cube(b, index);
+ enum bifrost_ir_ops op;
+ switch (nir_src_num_components(nir_instr->src[0])) {
+ case 1:
+ op = op_store_v1;
+ break;
+ case 2:
+ op = op_store_v2;
+ break;
+ case 3:
+ op = op_store_v3;
+ break;
+ case 4:
+ op = op_store_v4;
+ break;
+ default:
+ assert(0);
+ break;
}
- break;
-
- case nir_tex_src_bias:
- /* Upper 16-bits interpreted as a clamp, leave zero */
- assert(desc.op == BIFROST_TEX_OP_TEX);
- assert(base == nir_type_float);
- assert(sz == 16 || sz == 32);
- dregs[BIFROST_TEX_DREG_LOD] =
- bi_emit_texc_lod_88(b, index, sz == 16);
- desc.lod_or_fetch = BIFROST_LOD_MODE_BIAS;
- break;
-
- case nir_tex_src_ms_index:
- case nir_tex_src_offset:
- if (desc.offset_or_bias_disable)
+ // XXX: Not all offsets are vec4 aligned. Will need to adjust this in the future
+ // XXX: This needs to offset correctly in to memory so the blend step can pick it up
+ uint32_t movi = emit_movi(ctx, offset * 16);
+ uint32_t movi2 = emit_movi(ctx, 0);
+
+ uint32_t comps[2] = {
+ movi, movi2,
+ };
+ uint32_t offset_val = alloc_mir_temp(ctx);
+ emit_create_vector(ctx, offset_val, 2, comps);
+
+ struct bifrost_instruction instr = {
+ .op = op,
+ .dest_components = 0,
+ .ssa_args = {
+ .dest = SSA_INVALID_VALUE,
+ .src0 = offset_val,
+ .src1 = reg,
+ .src2 = SSA_INVALID_VALUE,
+ .src3 = SSA_INVALID_VALUE,
+ }
+ };
+ emit_mir_instruction(ctx, instr);
+ } else if (ctx->stage == MESA_SHADER_VERTEX) {
+ int comp = nir_intrinsic_component(nir_instr);
+ offset += comp;
+ void *entry = _mesa_hash_table_u64_search(ctx->varying_nir_to_bi, offset + 2);
+
+ if (!entry) {
+ printf("WARNING: skipping varying\n");
break;
+ }
- dregs[BIFROST_TEX_DREG_OFFSETMS] =
- bi_emit_texc_offset_ms_index(b, instr);
- if (!bi_is_equiv(dregs[BIFROST_TEX_DREG_OFFSETMS], bi_zero()))
- desc.offset_or_bias_disable = true;
- break;
-
- case nir_tex_src_comparator:
- dregs[BIFROST_TEX_DREG_SHADOW] = index;
- break;
-
- case nir_tex_src_texture_offset:
- if (instr->texture_index)
- index = bi_iadd_u32(b, index, bi_imm_u32(instr->texture_index), false);
-
- dregs[BIFROST_TEX_DREG_TEXTURE] = index;
-
- break;
-
- case nir_tex_src_sampler_offset:
- if (instr->sampler_index)
- index = bi_iadd_u32(b, index, bi_imm_u32(instr->sampler_index), false);
-
- dregs[BIFROST_TEX_DREG_SAMPLER] = index;
- break;
-
- default:
- unreachable("Unhandled src type in texc emit");
- }
- }
-
- if (desc.op == BIFROST_TEX_OP_FETCH && bi_is_null(dregs[BIFROST_TEX_DREG_LOD])) {
- dregs[BIFROST_TEX_DREG_LOD] =
- bi_emit_texc_lod_cube(b, bi_zero());
- }
-
- /* Choose an index mode */
+ offset = (uintptr_t) (entry) - 1;
- bool direct_tex = bi_is_null(dregs[BIFROST_TEX_DREG_TEXTURE]);
- bool direct_samp = bi_is_null(dregs[BIFROST_TEX_DREG_SAMPLER]);
- bool direct = direct_tex && direct_samp;
+ reg = nir_src_index(ctx, &nir_instr->src[0]);
+ // LD_VAR_ADDR.f32 {R0, T1}, R61, R62, location:1, R12
+ // ...
+ // ST_VAR.v4 T1, R12, R13, R14, R4
- desc.immediate_indices = direct && (instr->sampler_index < 16);
+ offset = emit_ld_vary_addr_constant(ctx, offset);
+ enum bifrost_ir_ops op;
+ switch (nir_src_num_components(nir_instr->src[0])) {
+ case 1:
+ op = op_st_vary_v1;
+ break;
+ case 2:
+ op = op_st_vary_v2;
+ break;
+ case 3:
+ op = op_st_vary_v3;
+ break;
+ case 4:
+ op = op_st_vary_v4;
+ break;
+ default:
+ assert(0);
+ break;
+ }
- if (desc.immediate_indices) {
- desc.sampler_index_or_mode = instr->sampler_index;
- desc.index = instr->texture_index;
- } else {
- unsigned mode = 0;
-
- if (direct && instr->sampler_index == instr->texture_index) {
- mode = BIFROST_INDEX_IMMEDIATE_SHARED;
- desc.index = instr->texture_index;
- } else if (direct) {
- mode = BIFROST_INDEX_IMMEDIATE_SAMPLER;
- desc.index = instr->sampler_index;
- dregs[BIFROST_TEX_DREG_TEXTURE] = bi_mov_i32(b,
- bi_imm_u32(instr->texture_index));
- } else if (direct_tex) {
- assert(!direct_samp);
- mode = BIFROST_INDEX_IMMEDIATE_TEXTURE;
- desc.index = instr->texture_index;
- } else if (direct_samp) {
- assert(!direct_tex);
- mode = BIFROST_INDEX_IMMEDIATE_SAMPLER;
- desc.index = instr->sampler_index;
+ struct bifrost_instruction instr = {
+ .op = op,
+ .dest_components = 0,
+ .ssa_args = {
+ .dest = SSA_INVALID_VALUE,
+ .src0 = offset,
+ .src1 = reg,
+ .src2 = SSA_INVALID_VALUE,
+ .src3 = SSA_INVALID_VALUE,
+ }
+ };
+ emit_mir_instruction(ctx, instr);
} else {
- mode = BIFROST_INDEX_REGISTER;
+ assert(0 && "Unknown store_output stage");
}
-
- mode |= (BIFROST_TEXTURE_OPERATION_SINGLE << 2);
- desc.sampler_index_or_mode = mode;
+ break;
}
-
- /* Allocate staging registers contiguously by compacting the array. */
- unsigned sr_count = 0;
-
- for (unsigned i = 0; i < ARRAY_SIZE(dregs); ++i) {
- if (!bi_is_null(dregs[i]))
- dregs[sr_count++] = dregs[i];
+ default:
+ printf ("Unhandled intrinsic %s\n", nir_intrinsic_infos[nir_instr->intrinsic].name);
+ break;
}
-
- unsigned res_size = nir_dest_bit_size(instr->dest) == 16 ? 2 : 4;
-
- bi_index sr = sr_count ? bi_temp(b->shader) : bi_null();
- bi_index dst = bi_temp(b->shader);
-
- if (sr_count)
- bi_emit_collect_to(b, sr, dregs, sr_count);
-
- uint32_t desc_u = 0;
- memcpy(&desc_u, &desc, sizeof(desc_u));
- bi_instr *I = bi_texc_to(b, dst, sr, cx, cy, bi_imm_u32(desc_u),
- !nir_tex_instr_has_implicit_derivative(instr),
- sr_count, 0);
- I->register_format = bi_reg_fmt_for_nir(instr->dest_type);
-
- bi_index w[4] = { bi_null(), bi_null(), bi_null(), bi_null() };
- bi_emit_split_i32(b, w, dst, res_size);
- bi_emit_collect_to(b, bi_dest_index(&instr->dest), w,
- DIV_ROUND_UP(nir_dest_num_components(instr->dest) * res_size, 4));
}
-/* Staging registers required by texturing in the order they appear (Valhall) */
-
-enum valhall_tex_sreg {
- VALHALL_TEX_SREG_X_COORD = 0,
- VALHALL_TEX_SREG_Y_COORD = 1,
- VALHALL_TEX_SREG_Z_COORD = 2,
- VALHALL_TEX_SREG_Y_DELTAS = 3,
- VALHALL_TEX_SREG_ARRAY = 4,
- VALHALL_TEX_SREG_SHADOW = 5,
- VALHALL_TEX_SREG_OFFSETMS = 6,
- VALHALL_TEX_SREG_LOD = 7,
- VALHALL_TEX_SREG_GRDESC = 8,
- VALHALL_TEX_SREG_COUNT,
-};
+#define ALU_CASE(arguments, nir, name) \
+ case nir_op_##nir: \
+ argument_count = arguments; \
+ op = op_##name; \
+ break
+#define ALU_CASE_MOD(arguments, nir, name, modifiers) \
+ case nir_op_##nir: \
+ argument_count = arguments; \
+ op = op_##name; \
+ src_modifiers = modifiers; \
+ break
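+
+/* For example, ALU_CASE(2, fmul, fmul_f32) expands to:
+ * case nir_op_fmul: argument_count = 2; op = op_fmul_f32; break;
+ */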
static void
-bi_emit_tex_valhall(bi_builder *b, nir_tex_instr *instr)
-{
- bool explicit_offset = false;
- enum bi_va_lod_mode lod_mode = BI_VA_LOD_MODE_COMPUTED_LOD;
-
- bool has_lod_mode =
- (instr->op == nir_texop_tex) ||
- (instr->op == nir_texop_txl) ||
- (instr->op == nir_texop_txb);
-
- /* 32-bit indices to be allocated as consecutive staging registers */
- bi_index sregs[VALHALL_TEX_SREG_COUNT] = { };
-
-
- bool has_sampler = nir_tex_instr_need_sampler(instr);
- bi_index sampler = bi_imm_u32(has_sampler ? instr->sampler_index : 0);
- bi_index texture = bi_imm_u32(instr->texture_index);
- uint32_t tables = (PAN_TABLE_SAMPLER << 11) | (PAN_TABLE_TEXTURE << 27);
-
- for (unsigned i = 0; i < instr->num_srcs; ++i) {
- bi_index index = bi_src_index(&instr->src[i].src);
- unsigned sz = nir_src_bit_size(instr->src[i].src);
- unsigned components = nir_src_num_components(instr->src[i].src);
-
- switch (instr->src[i].src_type) {
- case nir_tex_src_coord:
- if (instr->sampler_dim == GLSL_SAMPLER_DIM_CUBE) {
- sregs[VALHALL_TEX_SREG_X_COORD] =
- bi_emit_texc_cube_coord(b, index,
- &sregs[VALHALL_TEX_SREG_Y_COORD]);
- } else {
- assert(components >= 1 && components <= 3);
-
- /* Copy XY (for 2D+) or XX (for 1D) */
- sregs[VALHALL_TEX_SREG_X_COORD] = index;
-
- if (components >= 2)
- sregs[VALHALL_TEX_SREG_Y_COORD] = bi_extract(b, index, 1);
-
- if (components == 3 && !instr->is_array) {
- sregs[VALHALL_TEX_SREG_Z_COORD] =
- bi_extract(b, index, 2);
- }
- }
-
- if (instr->is_array) {
- sregs[VALHALL_TEX_SREG_ARRAY] =
- bi_extract(b, index, components - 1);
- }
-
- break;
-
- case nir_tex_src_lod:
- if (nir_src_is_const(instr->src[i].src) &&
- nir_src_as_uint(instr->src[i].src) == 0) {
- lod_mode = BI_VA_LOD_MODE_ZERO_LOD;
- } else if (has_lod_mode) {
- lod_mode = BI_VA_LOD_MODE_EXPLICIT;
-
- assert(sz == 16 || sz == 32);
- sregs[VALHALL_TEX_SREG_LOD] =
- bi_emit_texc_lod_88(b, index, sz == 16);
- }
- break;
-
- case nir_tex_src_bias:
- /* Upper 16-bits interpreted as a clamp, leave zero */
- assert(sz == 16 || sz == 32);
- sregs[VALHALL_TEX_SREG_LOD] =
- bi_emit_texc_lod_88(b, index, sz == 16);
-
- lod_mode = BI_VA_LOD_MODE_COMPUTED_BIAS;
- break;
- case nir_tex_src_ms_index:
- case nir_tex_src_offset:
- /* Handled below */
- break;
-
- case nir_tex_src_comparator:
- sregs[VALHALL_TEX_SREG_SHADOW] = index;
- break;
-
- case nir_tex_src_texture_offset:
- assert(instr->texture_index == 0);
- texture = index;
- break;
-
- case nir_tex_src_sampler_offset:
- assert(instr->sampler_index == 0);
- sampler = index;
- break;
-
- default:
- unreachable("Unhandled src type in tex emit");
- }
- }
-
- /* Generate packed offset + ms index + LOD register. These default to
- * zero so we only need to encode if these features are actually in use.
- */
- bi_index offsets = bi_emit_valhall_offsets(b, instr);
-
- if (!bi_is_equiv(offsets, bi_zero())) {
- sregs[VALHALL_TEX_SREG_OFFSETMS] = offsets;
- explicit_offset = true;
- }
+emit_alu(struct compiler_context *ctx, nir_alu_instr *nir_instr)
+{
+ unsigned dest = nir_dest_index(ctx, &nir_instr->dest.dest);
+ unsigned op = ~0U, argument_count;
+ unsigned src_modifiers = 0;
+
+ switch (nir_instr->op) {
+ ALU_CASE(2, fmul, fmul_f32);
+ ALU_CASE(2, fadd, fadd_f32);
+ ALU_CASE_MOD(2, fsub, fadd_f32, SOURCE_MODIFIER(1, SRC_MOD_NEG));
+ ALU_CASE(1, ftrunc, trunc);
+ ALU_CASE(1, fceil, ceil);
+ ALU_CASE(1, ffloor, floor);
+ ALU_CASE(1, fround_even, roundeven);
+ ALU_CASE(1, frcp, frcp_fast_f32);
+ ALU_CASE(2, fmax, max_f32);
+ ALU_CASE(2, fmin, min_f32);
+ ALU_CASE(2, iadd, add_i32);
+ ALU_CASE(2, isub, sub_i32);
+ ALU_CASE(2, imul, mul_i32);
+ ALU_CASE(2, iand, and_i32);
+ ALU_CASE(2, ior, or_i32);
+ ALU_CASE(2, ixor, xor_i32);
+ ALU_CASE(2, ishl, lshift_i32);
+ ALU_CASE(2, ushr, rshift_i32);
+ ALU_CASE(2, ishr, arshift_i32);
+ case nir_op_ineg: {
+ unsigned src0 = nir_alu_src_index_scalar(ctx, nir_instr, 0);
+ printf("ineg 0x%08x\n", src0);
+ struct bifrost_instruction instr = {
+ .op = op_sub_i32,
+ .dest_components = 1,
+ .ssa_args = {
+ .dest = dest,
+ .src0 = SSA_FIXED_CONST_0,
+ .src1 = src0,
+ .src2 = SSA_INVALID_VALUE,
+ .src3 = SSA_INVALID_VALUE,
+ },
+ };
- /* Allocate staging registers contiguously by compacting the array. */
- unsigned sr_count = 0;
+ emit_mir_instruction(ctx, instr);
+ return;
- for (unsigned i = 0; i < ARRAY_SIZE(sregs); ++i) {
- if (!bi_is_null(sregs[i]))
- sregs[sr_count++] = sregs[i];
}
-
- bi_index idx = sr_count ? bi_temp(b->shader) : bi_null();
-
- if (sr_count)
- bi_make_vec_to(b, idx, sregs, NULL, sr_count, 32);
-
- bi_index image_src = bi_imm_u32(tables);
- image_src = bi_lshift_or_i32(b, sampler, image_src, bi_imm_u8(0));
- image_src = bi_lshift_or_i32(b, texture, image_src, bi_imm_u8(16));
-
- unsigned mask = BI_WRITE_MASK_RGBA;
- unsigned res_size = nir_dest_bit_size(instr->dest) == 16 ? 2 : 4;
- enum bi_register_format regfmt = bi_reg_fmt_for_nir(instr->dest_type);
- enum bi_dimension dim = valhall_tex_dimension(instr->sampler_dim);
- bi_index dest = bi_temp(b->shader);
-
- switch (instr->op) {
- case nir_texop_tex:
- case nir_texop_txl:
- case nir_texop_txb:
- bi_tex_single_to(b, dest, idx, image_src, bi_zero(),
- instr->is_array, dim, regfmt, instr->is_shadow,
- explicit_offset, lod_mode, mask, sr_count);
- break;
- case nir_texop_txf:
- case nir_texop_txf_ms:
- bi_tex_fetch_to(b, dest, idx, image_src, bi_zero(),
- instr->is_array, dim, regfmt, explicit_offset,
- mask, sr_count);
+ case nir_op_vec2: {
+ uint32_t comps[2] = {
+ nir_alu_src_index(ctx, &nir_instr->src[0]),
+ nir_alu_src_index(ctx, &nir_instr->src[1]),
+ };
+ emit_create_vector(ctx, dest, 2, comps);
+ return;
break;
- case nir_texop_tg4:
- bi_tex_gather_to(b, dest, idx, image_src, bi_zero(),
- instr->is_array, dim, instr->component, false,
- regfmt, instr->is_shadow, explicit_offset,
- mask, sr_count);
+ }
+ case nir_op_vec3: {
+ uint32_t comps[3] = {
+ nir_alu_src_index(ctx, &nir_instr->src[0]),
+ nir_alu_src_index(ctx, &nir_instr->src[1]),
+ nir_alu_src_index(ctx, &nir_instr->src[2]),
+ };
+ emit_create_vector(ctx, dest, 3, comps);
+ return;
break;
- default:
- unreachable("Unhandled Valhall texture op");
}
-
- bi_index w[4] = { bi_null(), bi_null(), bi_null(), bi_null() };
- bi_emit_split_i32(b, w, dest, res_size);
- bi_emit_collect_to(b, bi_dest_index(&instr->dest), w,
- DIV_ROUND_UP(nir_dest_num_components(instr->dest) * res_size, 4));
-}
-
-/* Simple texture ops correspond to NIR tex or txl with LOD = 0 on 2D/cube
- * textures with sufficiently small immediate indices. Anything else
- * needs a complete texture op. */
-
-static void
-bi_emit_texs(bi_builder *b, nir_tex_instr *instr)
-{
- int coord_idx = nir_tex_instr_src_index(instr, nir_tex_src_coord);
- assert(coord_idx >= 0);
- bi_index coords = bi_src_index(&instr->src[coord_idx].src);
-
- if (instr->sampler_dim == GLSL_SAMPLER_DIM_CUBE) {
- bi_index face, s, t;
- bi_emit_cube_coord(b, coords, &face, &s, &t);
-
- bi_texs_cube_to(b, nir_dest_bit_size(instr->dest),
- bi_dest_index(&instr->dest),
- s, t, face,
- instr->sampler_index, instr->texture_index);
- } else {
- bi_texs_2d_to(b, nir_dest_bit_size(instr->dest),
- bi_dest_index(&instr->dest),
- bi_extract(b, coords, 0),
- bi_extract(b, coords, 1),
- instr->op != nir_texop_tex, /* zero LOD */
- instr->sampler_index, instr->texture_index);
+ case nir_op_vec4: {
+ uint32_t comps[4] = {
+ nir_alu_src_index(ctx, &nir_instr->src[0]),
+ nir_alu_src_index(ctx, &nir_instr->src[1]),
+ nir_alu_src_index(ctx, &nir_instr->src[2]),
+ nir_alu_src_index(ctx, &nir_instr->src[3]),
+ };
+ emit_create_vector(ctx, dest, 4, comps);
+ return;
+ break;
}
+ case nir_op_fdiv: {
+ unsigned src0 = nir_alu_src_index_scalar(ctx, nir_instr, 0);
+ unsigned src1 = nir_alu_src_index_scalar(ctx, nir_instr, 1);
+ uint32_t mir_temp_location = alloc_mir_temp(ctx);
+ {
+ struct bifrost_instruction instr = {
+ .op = op_frcp_fast_f32,
+ .dest_components = 1,
+ .ssa_args = {
+ .dest = mir_temp_location,
+ .src0 = src1,
+ .src1 = SSA_INVALID_VALUE,
+ .src2 = SSA_INVALID_VALUE,
+ .src3 = SSA_INVALID_VALUE,
+ },
+ };
+ emit_mir_instruction(ctx, instr);
+ }
- bi_split_dest(b, instr->dest);
-}
-
-static bool
-bi_is_simple_tex(nir_tex_instr *instr)
-{
- if (instr->op != nir_texop_tex && instr->op != nir_texop_txl)
- return false;
-
- if (instr->dest_type != nir_type_float32 &&
- instr->dest_type != nir_type_float16)
- return false;
-
- if (instr->is_shadow || instr->is_array)
- return false;
-
- switch (instr->sampler_dim) {
- case GLSL_SAMPLER_DIM_2D:
- case GLSL_SAMPLER_DIM_EXTERNAL:
- case GLSL_SAMPLER_DIM_RECT:
- break;
+ struct bifrost_instruction instr = {
+ .op = op_fmul_f32,
+ .dest_components = 1,
+ .ssa_args = {
+ .dest = dest,
+ .src0 = src0,
+ .src1 = mir_temp_location, /* a * (1/b), using the reciprocal computed above */
+ .src2 = SSA_INVALID_VALUE,
+ .src3 = SSA_INVALID_VALUE,
+ },
+ .src_modifiers = src_modifiers,
+ };
- case GLSL_SAMPLER_DIM_CUBE:
- /* LOD can't be specified with TEXS_CUBE */
- if (instr->op == nir_texop_txl)
- return false;
+ emit_mir_instruction(ctx, instr);
+ return;
break;
-
- default:
- return false;
}
+ case nir_op_umin:
+ case nir_op_imin:
+ case nir_op_umax:
+ case nir_op_imax: {
+ unsigned src0 = nir_alu_src_index_scalar(ctx, nir_instr, 0);
+ unsigned src1 = nir_alu_src_index_scalar(ctx, nir_instr, 1);
+ struct bifrost_instruction instr = {
+ .op = op_csel_i32,
+ .dest_components = 1,
+ .ssa_args = {
+ .dest = dest,
+ .src0 = src0,
+ .src1 = src1,
+ .src2 = src0,
+ .src3 = src1,
+ },
+ .src_modifiers = src_modifiers,
+ .literal_args[0] = 0, /* XXX: Comparison operator */
+ };
- for (unsigned i = 0; i < instr->num_srcs; ++i) {
- if (instr->src[i].src_type != nir_tex_src_lod &&
- instr->src[i].src_type != nir_tex_src_coord)
- return false;
+ emit_mir_instruction(ctx, instr);
+ return;
+ break;
}
+ case nir_op_umin3:
+ case nir_op_imin3:
+ case nir_op_umax3:
+ case nir_op_imax3: {
+ unsigned src0 = nir_alu_src_index_scalar(ctx, nir_instr, 0);
+ unsigned src1 = nir_alu_src_index_scalar(ctx, nir_instr, 1);
+ unsigned src2 = nir_alu_src_index_scalar(ctx, nir_instr, 2);
+
+ unsigned op = 0;
+ if (nir_instr->op == nir_op_umin3)
+ op = op_umin3_i32;
+ else if (nir_instr->op == nir_op_imin3)
+ op = op_imin3_i32;
+ else if (nir_instr->op == nir_op_umax3)
+ op = op_umax3_i32;
+ else if (nir_instr->op == nir_op_imax3)
+ op = op_imax3_i32;
+ struct bifrost_instruction instr = {
+ .op = op,
+ .dest_components = 1,
+ .ssa_args = {
+ .dest = dest,
+ .src0 = src0,
+ .src1 = src1,
+ .src2 = src2,
+ .src3 = SSA_INVALID_VALUE,
+ },
+ .src_modifiers = src_modifiers,
+ };
- /* Indices need to fit in provided bits */
- unsigned idx_bits = instr->sampler_dim == GLSL_SAMPLER_DIM_CUBE ? 2 : 3;
- if (MAX2(instr->sampler_index, instr->texture_index) >= (1 << idx_bits))
- return false;
-
- int lod_idx = nir_tex_instr_src_index(instr, nir_tex_src_lod);
- if (lod_idx < 0)
- return true;
+ emit_mir_instruction(ctx, instr);
- nir_src lod = instr->src[lod_idx].src;
- return nir_src_is_const(lod) && nir_src_as_uint(lod) == 0;
-}
-
-static void
-bi_emit_tex(bi_builder *b, nir_tex_instr *instr)
-{
- switch (instr->op) {
- case nir_texop_txs:
- bi_load_sysval_to(b, bi_dest_index(&instr->dest),
- panfrost_sysval_for_instr(&instr->instr, NULL),
- nir_dest_num_components(instr->dest), 0);
return;
- case nir_texop_tex:
- case nir_texop_txl:
- case nir_texop_txb:
- case nir_texop_txf:
- case nir_texop_txf_ms:
- case nir_texop_tg4:
break;
- default:
- unreachable("Invalid texture operation");
}
+ case nir_op_ine: {
+ uint32_t movi = emit_movi(ctx, ~0U);
+ unsigned src0 = nir_alu_src_index(ctx, &nir_instr->src[0]);
+ unsigned src1 = nir_alu_src_index(ctx, &nir_instr->src[1]);
+ struct bifrost_instruction instr = {
+ .op = op_csel_i32,
+ .dest_components = 1,
+ .ssa_args = {
+ .dest = dest,
+ .src0 = src0,
+ .src1 = src1,
+ .src2 = movi,
+ .src3 = SSA_FIXED_CONST_0,
+ },
+ .src_modifiers = src_modifiers,
+ .literal_args[0] = CSEL_IEQ, /* XXX: Comparison operator */
+ };
- if (b->shader->arch >= 9)
- bi_emit_tex_valhall(b, instr);
- else if (bi_is_simple_tex(instr))
- bi_emit_texs(b, instr);
- else
- bi_emit_texc(b, instr);
-}
-
-static void
-bi_emit_phi(bi_builder *b, nir_phi_instr *instr)
-{
- unsigned nr_srcs = exec_list_length(&instr->srcs);
- bi_instr *I = bi_phi_to(b, bi_dest_index(&instr->dest), nr_srcs);
-
- /* Deferred */
- I->phi = instr;
-}
-
-/* Look up the Bifrost block corresponding to a given NIR block. Used when
- * translating phi nodes after emitting all blocks.
- */
-static bi_block *
-bi_from_nir_block(bi_context *ctx, nir_block *block)
-{
- return ctx->indexed_nir_blocks[block->index];
-}
-
-static void
-bi_emit_phi_deferred(bi_context *ctx, bi_block *block, bi_instr *I)
-{
- nir_phi_instr *phi = I->phi;
-
- /* Guaranteed by lower_phis_to_scalar */
- assert(phi->dest.ssa.num_components == 1);
-
- nir_foreach_phi_src(src, phi) {
- bi_block *pred = bi_from_nir_block(ctx, src->pred);
- unsigned i = bi_predecessor_index(block, pred);
- assert(i < I->nr_srcs);
-
- I->src[i] = bi_src_index(&src->src);
+ emit_mir_instruction(ctx, instr);
+ return;
+ break;
+ }
+ default:
+ printf("Unhandled ALU op %s\n", nir_op_infos[nir_instr->op].name);
+ return;
}
- I->phi = NULL;
-}
+ unsigned src0 = nir_alu_src_index_scalar(ctx, nir_instr, 0);
+ unsigned src1 = argument_count >= 2 ? nir_alu_src_index_scalar(ctx, nir_instr, 1) : SSA_INVALID_VALUE;
+ unsigned src2 = argument_count >= 3 ? nir_alu_src_index_scalar(ctx, nir_instr, 2) : SSA_INVALID_VALUE;
+ unsigned src3 = argument_count >= 4 ? nir_alu_src_index_scalar(ctx, nir_instr, 3) : SSA_INVALID_VALUE;
+
+ struct bifrost_instruction instr = {
+ .op = op,
+ .dest_components = 1,
+ .ssa_args = {
+ .dest = dest,
+ .src0 = src0,
+ .src1 = src1,
+ .src2 = src2,
+ .src3 = src3,
+ },
+ .src_modifiers = src_modifiers,
+ };
-static void
-bi_emit_phis_deferred(bi_context *ctx)
-{
- bi_foreach_block(ctx, block) {
- bi_foreach_instr_in_block(block, I) {
- if (I->op == BI_OPCODE_PHI)
- bi_emit_phi_deferred(ctx, block, I);
- }
- }
+ emit_mir_instruction(ctx, instr);
}
static void
-bi_emit_instr(bi_builder *b, struct nir_instr *instr)
+emit_instr(struct compiler_context *ctx, struct nir_instr *instr)
{
switch (instr->type) {
case nir_instr_type_load_const:
- bi_emit_load_const(b, nir_instr_as_load_const(instr));
+ emit_load_const(ctx, nir_instr_as_load_const(instr));
break;
-
case nir_instr_type_intrinsic:
- bi_emit_intrinsic(b, nir_instr_as_intrinsic(instr));
+ emit_intrinsic(ctx, nir_instr_as_intrinsic(instr));
break;
-
case nir_instr_type_alu:
- bi_emit_alu(b, nir_instr_as_alu(instr));
+ emit_alu(ctx, nir_instr_as_alu(instr));
break;
-
case nir_instr_type_tex:
- bi_emit_tex(b, nir_instr_as_tex(instr));
+ printf("Unhandled NIR inst tex\n");
break;
-
case nir_instr_type_jump:
- bi_emit_jump(b, nir_instr_as_jump(instr));
+ printf("Unhandled NIR inst jump\n");
break;
-
- case nir_instr_type_phi:
- bi_emit_phi(b, nir_instr_as_phi(instr));
+ case nir_instr_type_ssa_undef:
+ printf("Unhandled NIR inst ssa_undef\n");
break;
-
default:
- unreachable("should've been lowered");
+ printf("Unhandled instruction type\n");
+ break;
}
-}
-
-static bi_block *
-create_empty_block(bi_context *ctx)
-{
- bi_block *blk = rzalloc(ctx, bi_block);
-
- util_dynarray_init(&blk->predecessors, blk);
- return blk;
}
-static bi_block *
-emit_block(bi_context *ctx, nir_block *block)
+static bifrost_block *
+emit_block(struct compiler_context *ctx, nir_block *block)
{
- if (ctx->after_block) {
- ctx->current_block = ctx->after_block;
- ctx->after_block = NULL;
- } else {
- ctx->current_block = create_empty_block(ctx);
- }
+ bifrost_block *this_block = calloc(sizeof(bifrost_block), 1);
+ list_addtail(&this_block->link, &ctx->blocks);
- list_addtail(&ctx->current_block->link, &ctx->blocks);
- list_inithead(&ctx->current_block->instructions);
+ ++ctx->block_count;
- bi_builder _b = bi_init_builder(ctx, bi_after_block(ctx->current_block));
+ /* Add this block to be a successor to the previous block */
+ if (ctx->current_block)
+ bifrost_block_add_successor(ctx->current_block, this_block);
- ctx->indexed_nir_blocks[block->index] = ctx->current_block;
+ /* Set up current block */
+ list_inithead(&this_block->instructions);
+ ctx->current_block = this_block;
nir_foreach_instr(instr, block) {
- bi_emit_instr(&_b, instr);
+ emit_instr(ctx, instr);
+ ++ctx->instruction_count;
}
- return ctx->current_block;
+#ifdef BI_DEBUG
+ print_mir_block(this_block, false);
+#endif
+ return this_block;
}
-static void
-emit_if(bi_context *ctx, nir_if *nif)
-{
- bi_block *before_block = ctx->current_block;
-
- /* Speculatively emit the branch, but we can't fill it in until later */
- bi_builder _b = bi_init_builder(ctx, bi_after_block(ctx->current_block));
- bi_instr *then_branch = bi_branchz_i16(&_b,
- bi_half(bi_src_index(&nif->condition), false),
- bi_zero(), BI_CMPF_EQ);
-
- /* Emit the two subblocks. */
- bi_block *then_block = emit_cf_list(ctx, &nif->then_list);
- bi_block *end_then_block = ctx->current_block;
-
- /* Emit second block */
-
- bi_block *else_block = emit_cf_list(ctx, &nif->else_list);
- bi_block *end_else_block = ctx->current_block;
- ctx->after_block = create_empty_block(ctx);
-
- /* Now that we have the subblocks emitted, fix up the branches */
-
- assert(then_block);
- assert(else_block);
-
- then_branch->branch_target = else_block;
-
- /* Emit a jump from the end of the then block to the end of the else */
- _b.cursor = bi_after_block(end_then_block);
- bi_instr *then_exit = bi_jump(&_b, bi_zero());
- then_exit->branch_target = ctx->after_block;
-
- bi_block_add_successor(end_then_block, then_exit->branch_target);
- bi_block_add_successor(end_else_block, ctx->after_block); /* fallthrough */
-
- bi_block_add_successor(before_block, then_branch->branch_target); /* then_branch */
- bi_block_add_successor(before_block, then_block); /* fallthrough */
-}
-
-static void
-emit_loop(bi_context *ctx, nir_loop *nloop)
-{
- /* Remember where we are */
- bi_block *start_block = ctx->current_block;
-
- bi_block *saved_break = ctx->break_block;
- bi_block *saved_continue = ctx->continue_block;
-
- ctx->continue_block = create_empty_block(ctx);
- ctx->break_block = create_empty_block(ctx);
- ctx->after_block = ctx->continue_block;
-
- /* Emit the body itself */
- emit_cf_list(ctx, &nloop->body);
-
- /* Branch back to loop back */
- bi_builder _b = bi_init_builder(ctx, bi_after_block(ctx->current_block));
- bi_instr *I = bi_jump(&_b, bi_zero());
- I->branch_target = ctx->continue_block;
- bi_block_add_successor(start_block, ctx->continue_block);
- bi_block_add_successor(ctx->current_block, ctx->continue_block);
-
- ctx->after_block = ctx->break_block;
-
- /* Pop off */
- ctx->break_block = saved_break;
- ctx->continue_block = saved_continue;
- ++ctx->loop_count;
-}
+void
+emit_if(struct compiler_context *ctx, nir_if *nir_inst);
-static bi_block *
-emit_cf_list(bi_context *ctx, struct exec_list *list)
+static struct bifrost_block *
+emit_cf_list(struct compiler_context *ctx, struct exec_list *list)
{
- bi_block *start_block = NULL;
-
+ struct bifrost_block *start_block = NULL;
foreach_list_typed(nir_cf_node, node, node, list) {
switch (node->type) {
case nir_cf_node_block: {
- bi_block *block = emit_block(ctx, nir_cf_node_as_block(node));
+ bifrost_block *block = emit_block(ctx, nir_cf_node_as_block(node));
if (!start_block)
start_block = block;
@@ -4140,1183 +866,186 @@ emit_cf_list(bi_context *ctx, struct exec_list *list)
emit_if(ctx, nir_cf_node_as_if(node));
break;
+ default:
case nir_cf_node_loop:
- emit_loop(ctx, nir_cf_node_as_loop(node));
+ case nir_cf_node_function:
+ assert(0);
break;
-
- default:
- unreachable("Unknown control flow");
}
}
return start_block;
}
-/* shader-db stuff */
-
-struct bi_stats {
- unsigned nr_clauses, nr_tuples, nr_ins;
- unsigned nr_arith, nr_texture, nr_varying, nr_ldst;
-};
-
-static void
-bi_count_tuple_stats(bi_clause *clause, bi_tuple *tuple, struct bi_stats *stats)
-{
- /* Count instructions */
- stats->nr_ins += (tuple->fma ? 1 : 0) + (tuple->add ? 1 : 0);
-
- /* Non-message passing tuples are always arithmetic */
- if (tuple->add != clause->message) {
- stats->nr_arith++;
- return;
- }
-
- /* Message + FMA we'll count as arithmetic _and_ message */
- if (tuple->fma)
- stats->nr_arith++;
-
- switch (clause->message_type) {
- case BIFROST_MESSAGE_VARYING:
- /* Check components interpolated */
- stats->nr_varying += (clause->message->vecsize + 1) *
- (bi_is_regfmt_16(clause->message->register_format) ? 1 : 2);
- break;
-
- case BIFROST_MESSAGE_VARTEX:
- /* 2 coordinates, fp32 each */
- stats->nr_varying += (2 * 2);
- FALLTHROUGH;
- case BIFROST_MESSAGE_TEX:
- stats->nr_texture++;
- break;
-
- case BIFROST_MESSAGE_ATTRIBUTE:
- case BIFROST_MESSAGE_LOAD:
- case BIFROST_MESSAGE_STORE:
- case BIFROST_MESSAGE_ATOMIC:
- stats->nr_ldst++;
- break;
-
- case BIFROST_MESSAGE_NONE:
- case BIFROST_MESSAGE_BARRIER:
- case BIFROST_MESSAGE_BLEND:
- case BIFROST_MESSAGE_TILE:
- case BIFROST_MESSAGE_Z_STENCIL:
- case BIFROST_MESSAGE_ATEST:
- case BIFROST_MESSAGE_JOB:
- case BIFROST_MESSAGE_64BIT:
- /* Nothing to do */
- break;
- };
-
-}
-
-/*
- * v7 allows preloading LD_VAR or VAR_TEX messages that must complete before the
- * shader completes. These costs are not accounted for in the general cycle
- * counts, so this function calculates the effective cost of these messages, as
- * if they were executed by shader code.
- */
-static unsigned
-bi_count_preload_cost(bi_context *ctx)
-{
- /* Units: 1/16 of a normalized cycle, assuming that we may interpolate
- * 16 fp16 varying components per cycle or fetch two texels per cycle.
- */
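- /* E.g. a preloaded fp32 vec4 varying message costs 4 * 2 = 8 units (half
- * a normalized cycle), while a preloaded texture message costs 12 units. */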
- unsigned cost = 0;
-
- for (unsigned i = 0; i < ARRAY_SIZE(ctx->info.bifrost->messages); ++i) {
- struct bifrost_message_preload msg = ctx->info.bifrost->messages[i];
-
- if (msg.enabled && msg.texture) {
- /* 2 coordinates, 2 half-words each, plus texture */
- cost += 12;
- } else if (msg.enabled) {
- cost += (msg.num_components * (msg.fp16 ? 1 : 2));
- }
- }
-
- return cost;
-}
-
-static const char *
-bi_shader_stage_name(bi_context *ctx)
-{
- if (ctx->idvs == BI_IDVS_VARYING)
- return "MESA_SHADER_VARYING";
- else if (ctx->idvs == BI_IDVS_POSITION)
- return "MESA_SHADER_POSITION";
- else if (ctx->inputs->is_blend)
- return "MESA_SHADER_BLEND";
- else
- return gl_shader_stage_name(ctx->stage);
-}
-
-static char *
-bi_print_stats(bi_context *ctx, unsigned size)
+void
+emit_if(struct compiler_context *ctx, nir_if *nir_inst)
{
- struct bi_stats stats = { 0 };
-
- /* Count instructions, clauses, and tuples. Also attempt to construct
- * normalized execution engine cycle counts, using the following ratio:
- *
- * 24 arith tuples/cycle
- * 2 texture messages/cycle
- * 16 x 16-bit varying channels interpolated/cycle
- * 1 load store message/cycle
- *
- * These numbers seem to match Arm Mobile Studio's heuristic. The real
- * cycle counts are surely more complicated.
- */
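- /* As an illustration: 48 arithmetic tuples, 3 texture messages and 4
- * load/store messages would be bound by max(48/24, 3/2, 4/1) = 4
- * normalized cycles, i.e. load/store bound. */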
-
- bi_foreach_block(ctx, block) {
- bi_foreach_clause_in_block(block, clause) {
- stats.nr_clauses++;
- stats.nr_tuples += clause->tuple_count;
-
- for (unsigned i = 0; i < clause->tuple_count; ++i)
- bi_count_tuple_stats(clause, &clause->tuples[i], &stats);
- }
- }
-
- float cycles_arith = ((float) stats.nr_arith) / 24.0;
- float cycles_texture = ((float) stats.nr_texture) / 2.0;
- float cycles_varying = ((float) stats.nr_varying) / 16.0;
- float cycles_ldst = ((float) stats.nr_ldst) / 1.0;
-
- float cycles_message = MAX3(cycles_texture, cycles_varying, cycles_ldst);
- float cycles_bound = MAX2(cycles_arith, cycles_message);
-
- /* Thread count and register pressure are traded off only on v7 */
- bool full_threads = (ctx->arch == 7 && ctx->info.work_reg_count <= 32);
- unsigned nr_threads = full_threads ? 2 : 1;
-
- /* Dump stats */
- char *str = ralloc_asprintf(NULL, "%s shader: "
- "%u inst, %u tuples, %u clauses, "
- "%f cycles, %f arith, %f texture, %f vary, %f ldst, "
- "%u quadwords, %u threads",
- bi_shader_stage_name(ctx),
- stats.nr_ins, stats.nr_tuples, stats.nr_clauses,
- cycles_bound, cycles_arith, cycles_texture,
- cycles_varying, cycles_ldst,
- size / 16, nr_threads);
-
- if (ctx->arch == 7) {
- ralloc_asprintf_append(&str, ", %u preloads", bi_count_preload_cost(ctx));
- }
-
- ralloc_asprintf_append(&str, ", %u loops, %u:%u spills:fills",
- ctx->loop_count, ctx->spills, ctx->fills);
- return str;
-}
-
-static char *
-va_print_stats(bi_context *ctx, unsigned size)
-{
- unsigned nr_ins = 0;
- struct va_stats stats = { 0 };
+ // XXX: Conditional branch instruction can do a variety of comparisons with the sources
+ // Merge the source instruction `ine` with our conditional branch
+ {
+ uint32_t movi = emit_movi(ctx, ~0U);
+ struct bifrost_instruction instr = {
+ .op = op_branch,
+ .dest_components = 0,
+ .ssa_args = {
+ .dest = SSA_INVALID_VALUE,
+ .src0 = nir_src_index(ctx, &nir_inst->condition),
+ .src1 = movi,
+ .src2 = SSA_INVALID_VALUE,
+ .src3 = SSA_INVALID_VALUE,
+ },
+ .src_modifiers = 0,
+ .literal_args[0] = BR_COND_EQ, /* XXX: Comparison Arg type */
+ .literal_args[1] = 0, /* XXX: Branch target */
+ };
- /* Count instructions */
- bi_foreach_instr_global(ctx, I) {
- nr_ins++;
- va_count_instr_stats(I, &stats);
+ emit_mir_instruction(ctx, instr);
}
- /* Mali G78 peak performance:
- *
- * 64 FMA instructions per cycle
- * 64 CVT instructions per cycle
- * 16 SFU instructions per cycle
- * 8 x 32-bit varying channels interpolated per cycle
- * 4 texture instructions per cycle
- * 1 load/store operation per cycle
- */
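- /* For example, under these ratios 128 FMA, 32 CVT and 8 texture
- * instructions normalize to max(128/64, 32/64, 8/4) = 2 cycles. */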
-
- float cycles_fma = ((float) stats.fma) / 64.0;
- float cycles_cvt = ((float) stats.cvt) / 64.0;
- float cycles_sfu = ((float) stats.sfu) / 16.0;
- float cycles_v = ((float) stats.v) / 16.0;
- float cycles_t = ((float) stats.t) / 4.0;
- float cycles_ls = ((float) stats.ls) / 1.0;
-
- /* Calculate the bound */
- float cycles = MAX2(
- MAX3(cycles_fma, cycles_cvt, cycles_sfu),
- MAX3(cycles_v, cycles_t, cycles_ls));
-
-
- /* Thread count and register pressure are traded off */
- unsigned nr_threads = (ctx->info.work_reg_count <= 32) ? 2 : 1;
-
- /* Dump stats */
- return ralloc_asprintf(NULL, "%s shader: "
- "%u inst, %f cycles, %f fma, %f cvt, %f sfu, %f v, "
- "%f t, %f ls, %u quadwords, %u threads, %u loops, "
- "%u:%u spills:fills",
- bi_shader_stage_name(ctx),
- nr_ins, cycles, cycles_fma, cycles_cvt, cycles_sfu,
- cycles_v, cycles_t, cycles_ls, size / 16, nr_threads,
- ctx->loop_count, ctx->spills, ctx->fills);
-}
+ bifrost_instruction *true_branch = mir_last_instr_in_block(ctx->current_block);
-static int
-glsl_type_size(const struct glsl_type *type, bool bindless)
-{
- return glsl_count_attribute_slots(type, false);
-}
+ bifrost_block *true_block = emit_cf_list(ctx, &nir_inst->then_list);
-/* Split stores to memory. We don't split stores to vertex outputs, since
- * nir_lower_io_to_temporaries will ensure there's only a single write.
- */
+ {
+ struct bifrost_instruction instr = {
+ .op = op_branch,
+ .dest_components = 0,
+ .ssa_args = {
+ .dest = SSA_INVALID_VALUE,
+ .src0 = SSA_INVALID_VALUE,
+ .src1 = SSA_INVALID_VALUE,
+ .src2 = SSA_INVALID_VALUE,
+ .src3 = SSA_INVALID_VALUE,
+ },
+ .src_modifiers = 0,
+ .literal_args[0] = BR_ALWAYS, /* XXX: ALWAYS */
+ .literal_args[1] = 0, /* XXX: Branch target */
+ };
-static bool
-should_split_wrmask(const nir_instr *instr, UNUSED const void *data)
-{
- nir_intrinsic_instr *intr = nir_instr_as_intrinsic(instr);
-
- switch (intr->intrinsic) {
- case nir_intrinsic_store_ssbo:
- case nir_intrinsic_store_shared:
- case nir_intrinsic_store_global:
- case nir_intrinsic_store_scratch:
- return true;
- default:
- return false;
+ emit_mir_instruction(ctx, instr);
}
-}
+ bifrost_instruction *true_exit_branch = mir_last_instr_in_block(ctx->current_block);
-/*
- * Some operations are only available as 32-bit instructions. 64-bit floats are
- * unsupported and ints are lowered with nir_lower_int64. Certain 8-bit and
- * 16-bit instructions, however, are lowered here.
- */
-static unsigned
-bi_lower_bit_size(const nir_instr *instr, UNUSED void *data)
-{
- if (instr->type != nir_instr_type_alu)
- return 0;
-
- nir_alu_instr *alu = nir_instr_as_alu(instr);
-
- switch (alu->op) {
- case nir_op_fexp2:
- case nir_op_flog2:
- case nir_op_fpow:
- case nir_op_fsin:
- case nir_op_fcos:
- case nir_op_bit_count:
- case nir_op_bitfield_reverse:
- return (nir_src_bit_size(alu->src[0].src) == 32) ? 0 : 32;
- default:
- return 0;
- }
-}
+ unsigned false_idx = ctx->block_count;
+ unsigned inst_count = ctx->instruction_count;
-/* Although Bifrost generally supports packed 16-bit vec2 and 8-bit vec4,
- * transcendentals are an exception, as are shifts, because of the lane size
- * mismatch (8-bit in Bifrost, 32-bit in NIR; TODO: workaround). Some conversions need
- * to be scalarized due to type size. */
+ bifrost_block *false_block = emit_cf_list(ctx, &nir_inst->else_list);
-static uint8_t
-bi_vectorize_filter(const nir_instr *instr, const void *data)
-{
- /* Defaults work for everything else */
- if (instr->type != nir_instr_type_alu)
- return 0;
-
- const nir_alu_instr *alu = nir_instr_as_alu(instr);
-
- switch (alu->op) {
- case nir_op_frcp:
- case nir_op_frsq:
- case nir_op_ishl:
- case nir_op_ishr:
- case nir_op_ushr:
- case nir_op_f2i16:
- case nir_op_f2u16:
- case nir_op_extract_u8:
- case nir_op_extract_i8:
- case nir_op_extract_u16:
- case nir_op_extract_i16:
- case nir_op_insert_u16:
- return 1;
- default:
- break;
- }
+ unsigned if_footer_idx = ctx->block_count;
+ assert(true_block);
+ assert(false_block);
- /* Vectorized instructions cannot write more than 32-bit */
- int dst_bit_size = nir_dest_bit_size(alu->dest.dest);
- if (dst_bit_size == 16)
- return 2;
- else
- return 1;
-}
-static bool
-bi_scalarize_filter(const nir_instr *instr, const void *data)
-{
- if (instr->type != nir_instr_type_alu)
- return false;
-
- const nir_alu_instr *alu = nir_instr_as_alu(instr);
-
- switch (alu->op) {
- case nir_op_pack_uvec2_to_uint:
- case nir_op_pack_uvec4_to_uint:
- return false;
- default:
- return true;
+ if (ctx->instruction_count == inst_count) {
+                        // If the else branch is empty we can remove the dead jump
+ mir_remove_instr(true_exit_branch);
+ } else {
+ true_exit_branch->literal_args[1] = if_footer_idx;
}
-}
-
-/* Ensure we write exactly 4 components */
-static nir_ssa_def *
-bifrost_nir_valid_channel(nir_builder *b, nir_ssa_def *in,
- unsigned channel, unsigned first, unsigned mask)
-{
- if (!(mask & BITFIELD_BIT(channel)))
- channel = first;
-
- return nir_channel(b, in, channel);
-}
-
-/* Lower fragment store_output instructions to always write 4 components,
- * matching the hardware semantic. This may require additional moves. Skipping
- * these moves is possible in theory, but invokes undefined behaviour in the
- * compiler. The DDK inserts these moves, so we will as well. */
-
-static bool
-bifrost_nir_lower_blend_components(struct nir_builder *b,
- nir_instr *instr, void *data)
-{
- if (instr->type != nir_instr_type_intrinsic)
- return false;
-
- nir_intrinsic_instr *intr = nir_instr_as_intrinsic(instr);
-
- if (intr->intrinsic != nir_intrinsic_store_output)
- return false;
-
- nir_ssa_def *in = intr->src[0].ssa;
- unsigned first = nir_intrinsic_component(intr);
- unsigned mask = nir_intrinsic_write_mask(intr);
-
- assert(first == 0 && "shouldn't get nonzero components");
-
- /* Nothing to do */
- if (mask == BITFIELD_MASK(4))
- return false;
- b->cursor = nir_before_instr(&intr->instr);
-
- /* Replicate the first valid component instead */
- nir_ssa_def *replicated =
- nir_vec4(b, bifrost_nir_valid_channel(b, in, 0, first, mask),
- bifrost_nir_valid_channel(b, in, 1, first, mask),
- bifrost_nir_valid_channel(b, in, 2, first, mask),
- bifrost_nir_valid_channel(b, in, 3, first, mask));
-
- /* Rewrite to use our replicated version */
- nir_instr_rewrite_src_ssa(instr, &intr->src[0], replicated);
- nir_intrinsic_set_component(intr, 0);
- nir_intrinsic_set_write_mask(intr, 0xF);
- intr->num_components = 4;
-
- return true;
+ true_branch->literal_args[1] = false_idx;
}
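The if/else emission above relies on back-patching: each branch is emitted with a placeholder target (literal_args[1] = 0) and fixed up once the index of its destination block is known. A toy illustration of that pattern, using made-up types rather than the compiler's own MIR structures:

	#include <stdio.h>

	/* Stand-in for a branch instruction; illustration only. */
	struct toy_branch { unsigned target_block; };

	int main(void)
	{
	        struct toy_branch br_to_else = { 0 };    /* placeholder target */
	        unsigned block_count = 1;

	        /* ... emit the 'then' blocks here, bumping block_count ... */
	        block_count += 2;

	        br_to_else.target_block = block_count;   /* back-patch: else starts here */
	        printf("conditional branch -> block %u\n", br_to_else.target_block);
	        return 0;
	}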
-static void
-bi_optimize_nir(nir_shader *nir, unsigned gpu_id, bool is_blend)
+int
+bifrost_compile_shader_nir(nir_shader *nir, struct bifrost_program *program)
{
- bool progress;
- unsigned lower_flrp = 16 | 32 | 64;
-
- NIR_PASS(progress, nir, nir_lower_regs_to_ssa);
-
- nir_lower_tex_options lower_tex_options = {
- .lower_txs_lod = true,
- .lower_txp = ~0,
- .lower_tg4_broadcom_swizzle = true,
- .lower_txd = true,
- .lower_invalid_implicit_lod = true,
+ struct compiler_context ictx = {
+ .nir = nir,
+ .stage = nir->info.stage,
};
- NIR_PASS(progress, nir, pan_nir_lower_64bit_intrin);
- NIR_PASS(progress, nir, pan_lower_helper_invocation);
-
- NIR_PASS(progress, nir, nir_lower_int64);
+ struct compiler_context *ctx = &ictx;
- nir_lower_idiv_options idiv_options = {
- .allow_fp16 = true,
- };
- NIR_PASS(progress, nir, nir_opt_idiv_const, 8);
- NIR_PASS(progress, nir, nir_lower_idiv, &idiv_options);
-
- NIR_PASS(progress, nir, nir_lower_tex, &lower_tex_options);
- NIR_PASS(progress, nir, nir_lower_alu_to_scalar, bi_scalarize_filter, NULL);
- NIR_PASS(progress, nir, nir_lower_load_const_to_scalar);
- NIR_PASS(progress, nir, nir_lower_phis_to_scalar, true);
+ ctx->mir_temp = 0;
- do {
- progress = false;
+        /* Initialize hash tables at a global (not per-block) level */
+ ctx->ssa_constants = _mesa_hash_table_u64_create(NULL);
+ ctx->hash_to_temp = _mesa_hash_table_u64_create(NULL);
- NIR_PASS(progress, nir, nir_lower_var_copies);
- NIR_PASS(progress, nir, nir_lower_vars_to_ssa);
- NIR_PASS(progress, nir, nir_lower_wrmasks, should_split_wrmask, NULL);
+        /* Assign actual uniform locations, skipping over samplers */
+ ctx->uniform_nir_to_bi = _mesa_hash_table_u64_create(NULL);
- NIR_PASS(progress, nir, nir_copy_prop);
- NIR_PASS(progress, nir, nir_opt_remove_phis);
- NIR_PASS(progress, nir, nir_opt_dce);
- NIR_PASS(progress, nir, nir_opt_dead_cf);
- NIR_PASS(progress, nir, nir_opt_cse);
- NIR_PASS(progress, nir, nir_opt_peephole_select, 64, false, true);
- NIR_PASS(progress, nir, nir_opt_algebraic);
- NIR_PASS(progress, nir, nir_opt_constant_folding);
+ nir_foreach_variable(var, &nir->uniforms) {
+ if (glsl_get_base_type(var->type) == GLSL_TYPE_SAMPLER) continue;
- NIR_PASS(progress, nir, nir_lower_alu);
-
- if (lower_flrp != 0) {
- bool lower_flrp_progress = false;
- NIR_PASS(lower_flrp_progress,
- nir,
- nir_lower_flrp,
- lower_flrp,
- false /* always_precise */);
- if (lower_flrp_progress) {
- NIR_PASS(progress, nir,
- nir_opt_constant_folding);
- progress = true;
- }
-
- /* Nothing should rematerialize any flrps, so we only
- * need to do this lowering once.
- */
- lower_flrp = 0;
+ for (int col = 0; col < glsl_get_matrix_columns(var->type); ++col) {
+ int id = ctx->uniform_count++;
+ _mesa_hash_table_u64_insert(ctx->uniform_nir_to_bi, var->data.driver_location + col + 1, (void *) ((uintptr_t) (id + 1)));
}
-
- NIR_PASS(progress, nir, nir_opt_undef);
- NIR_PASS(progress, nir, nir_lower_undef_to_zero);
-
- NIR_PASS(progress, nir, nir_opt_shrink_vectors);
- NIR_PASS(progress, nir, nir_opt_loop_unroll);
- } while (progress);
-
- /* TODO: Why is 64-bit getting rematerialized?
- * KHR-GLES31.core.shader_image_load_store.basic-allTargets-atomicFS */
- NIR_PASS(progress, nir, nir_lower_int64);
-
- /* We need to cleanup after each iteration of late algebraic
- * optimizations, since otherwise NIR can produce weird edge cases
- * (like fneg of a constant) which we don't handle */
- bool late_algebraic = true;
- while (late_algebraic) {
- late_algebraic = false;
- NIR_PASS(late_algebraic, nir, nir_opt_algebraic_late);
- NIR_PASS(progress, nir, nir_opt_constant_folding);
- NIR_PASS(progress, nir, nir_copy_prop);
- NIR_PASS(progress, nir, nir_opt_dce);
- NIR_PASS(progress, nir, nir_opt_cse);
}
- /* This opt currently helps on Bifrost but not Valhall */
- if (gpu_id < 0x9000)
- NIR_PASS(progress, nir, bifrost_nir_opt_boolean_bitwise);
-
- NIR_PASS(progress, nir, nir_lower_alu_to_scalar, bi_scalarize_filter, NULL);
- NIR_PASS(progress, nir, nir_opt_vectorize, bi_vectorize_filter, NULL);
- NIR_PASS(progress, nir, nir_lower_bool_to_bitsize);
-
- /* Prepass to simplify instruction selection */
- late_algebraic = false;
- NIR_PASS(late_algebraic, nir, bifrost_nir_lower_algebraic_late);
+ if (ctx->stage == MESA_SHADER_VERTEX) {
+ ctx->varying_nir_to_bi = _mesa_hash_table_u64_create(NULL);
+ nir_foreach_variable(var, &nir->outputs) {
+ if (var->data.location < VARYING_SLOT_VAR0) {
+ if (var->data.location == VARYING_SLOT_POS)
+ ctx->varying_count++;
+ _mesa_hash_table_u64_insert(ctx->varying_nir_to_bi, var->data.driver_location + 1, (void *) ((uintptr_t) (1)));
- while (late_algebraic) {
- late_algebraic = false;
- NIR_PASS(late_algebraic, nir, nir_opt_algebraic_late);
- NIR_PASS(progress, nir, nir_opt_constant_folding);
- NIR_PASS(progress, nir, nir_copy_prop);
- NIR_PASS(progress, nir, nir_opt_dce);
- NIR_PASS(progress, nir, nir_opt_cse);
- }
-
- NIR_PASS(progress, nir, nir_lower_load_const_to_scalar);
- NIR_PASS(progress, nir, nir_opt_dce);
-
- if (nir->info.stage == MESA_SHADER_FRAGMENT) {
- NIR_PASS_V(nir, nir_shader_instructions_pass,
- bifrost_nir_lower_blend_components,
- nir_metadata_block_index | nir_metadata_dominance,
- NULL);
- }
-
- /* Backend scheduler is purely local, so do some global optimizations
- * to reduce register pressure. */
- nir_move_options move_all =
- nir_move_const_undef | nir_move_load_ubo | nir_move_load_input |
- nir_move_comparisons | nir_move_copies | nir_move_load_ssbo;
-
- NIR_PASS_V(nir, nir_opt_sink, move_all);
- NIR_PASS_V(nir, nir_opt_move, move_all);
-
- /* We might lower attribute, varying, and image indirects. Use the
- * gathered info to skip the extra analysis in the happy path. */
- bool any_indirects =
- nir->info.inputs_read_indirectly ||
- nir->info.outputs_accessed_indirectly ||
- nir->info.patch_inputs_read_indirectly ||
- nir->info.patch_outputs_accessed_indirectly ||
- nir->info.images_used[0];
-
- if (any_indirects) {
- nir_convert_to_lcssa(nir, true, true);
- NIR_PASS_V(nir, nir_divergence_analysis);
- NIR_PASS_V(nir, bi_lower_divergent_indirects,
- pan_subgroup_size(gpu_id >> 12));
- }
-}
-
-static void
-bi_opt_post_ra(bi_context *ctx)
-{
- bi_foreach_instr_global_safe(ctx, ins) {
- if (ins->op == BI_OPCODE_MOV_I32 && bi_is_equiv(ins->dest[0], ins->src[0]))
- bi_remove_instruction(ins);
- }
-}
-
-/* Dead code elimination for branches at the end of a block - only one branch
- * per block is legal semantically, but unreachable jumps can be generated.
- * Likewise on Bifrost we can generate jumps to the terminal block which need
- * to be lowered away to a jump to #0x0, which induces successful termination.
- * That trick doesn't work on Valhall, which needs a NOP inserted in the
- * terminal block instead.
- */
-static void
-bi_lower_branch(bi_context *ctx, bi_block *block)
-{
- bool cull_terminal = (ctx->arch <= 8);
- bool branched = false;
-
- bi_foreach_instr_in_block_safe(block, ins) {
- if (!ins->branch_target) continue;
-
- if (branched) {
- bi_remove_instruction(ins);
- continue;
- }
-
- branched = true;
-
- if (!bi_is_terminal_block(ins->branch_target))
- continue;
-
- if (cull_terminal)
- ins->branch_target = NULL;
- else if (ins->branch_target)
- ins->branch_target->needs_nop = true;
- }
-}
-
-static void
-bi_pack_clauses(bi_context *ctx, struct util_dynarray *binary, unsigned offset)
-{
- unsigned final_clause = bi_pack(ctx, binary);
-
- /* If we need to wait for ATEST or BLEND in the first clause, pass the
- * corresponding bits through to the renderer state descriptor */
- bi_block *first_block = list_first_entry(&ctx->blocks, bi_block, link);
- bi_clause *first_clause = bi_next_clause(ctx, first_block, NULL);
-
- unsigned first_deps = first_clause ? first_clause->dependencies : 0;
- ctx->info.bifrost->wait_6 = (first_deps & (1 << 6));
- ctx->info.bifrost->wait_7 = (first_deps & (1 << 7));
-
- /* Pad the shader with enough zero bytes to trick the prefetcher,
- * unless we're compiling an empty shader (in which case we don't pad
- * so the size remains 0) */
- unsigned prefetch_size = BIFROST_SHADER_PREFETCH - final_clause;
-
- if (binary->size - offset) {
- memset(util_dynarray_grow(binary, uint8_t, prefetch_size),
- 0, prefetch_size);
- }
-}
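The zero padding removed above exists only so the clause prefetcher can read past the final clause without faulting or decoding garbage. A minimal sketch of the same idea on a plain malloc'd buffer (the prefetch size here is illustrative, not the value of BIFROST_SHADER_PREFETCH):

	#include <stdio.h>
	#include <stdlib.h>
	#include <string.h>

	#define SHADER_PREFETCH_BYTES 128        /* illustrative, not the real value */

	/* Append zero bytes so an instruction prefetcher can safely over-read. */
	static unsigned char *pad_for_prefetch(unsigned char *code, size_t *size,
	                                       size_t last_clause_bytes)
	{
	        size_t pad = last_clause_bytes < SHADER_PREFETCH_BYTES ?
	                     SHADER_PREFETCH_BYTES - last_clause_bytes : 0;
	        unsigned char *grown = realloc(code, *size + pad);
	        if (!grown)
	                return code;
	        memset(grown + *size, 0, pad);
	        *size += pad;
	        return grown;
	}

	int main(void)
	{
	        size_t size = 64;
	        unsigned char *code = calloc(1, size);
	        code = pad_for_prefetch(code, &size, 16);
	        printf("padded size: %zu bytes\n", size);  /* 64 + (128 - 16) = 176 */
	        free(code);
	        return 0;
	}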
-
-/*
- * Build a bit mask of varyings (by location) that are flatshaded. This
- * information is needed by lower_mediump_io, as we don't yet support 16-bit
- * flat varyings.
- *
- * Also varyings that are used as texture coordinates should be kept at fp32 so
- * the texture instruction may be promoted to VAR_TEX. In general this is a good
- * idea, as fp16 texture coordinates are not supported by the hardware and are
- * usually inappropriate. (There are both relevant CTS bugs here, even.)
- *
- * TODO: If we compacted the varyings with some fixup code in the vertex shader,
- * we could implement 16-bit flat varyings. Consider if this case matters.
- *
- * TODO: The texture coordinate handling could be less heavyhanded.
- */
-static bool
-bi_gather_texcoords(nir_builder *b, nir_instr *instr, void *data)
-{
- uint64_t *mask = data;
-
- if (instr->type != nir_instr_type_tex)
- return false;
-
- nir_tex_instr *tex = nir_instr_as_tex(instr);
-
- int coord_idx = nir_tex_instr_src_index(tex, nir_tex_src_coord);
- if (coord_idx < 0)
- return false;
-
- nir_src src = tex->src[coord_idx].src;
- nir_ssa_scalar x = nir_ssa_scalar_resolved(src.ssa, 0);
- nir_ssa_scalar y = nir_ssa_scalar_resolved(src.ssa, 1);
-
- if (x.def != y.def)
- return false;
-
- nir_instr *parent = x.def->parent_instr;
-
- if (parent->type != nir_instr_type_intrinsic)
- return false;
-
- nir_intrinsic_instr *intr = nir_instr_as_intrinsic(parent);
-
- if (intr->intrinsic != nir_intrinsic_load_interpolated_input)
- return false;
-
- nir_io_semantics sem = nir_intrinsic_io_semantics(intr);
- *mask |= BITFIELD64_BIT(sem.location);
- return false;
-}
-
-static uint64_t
-bi_fp32_varying_mask(nir_shader *nir)
-{
- uint64_t mask = 0;
-
- assert(nir->info.stage == MESA_SHADER_FRAGMENT);
-
- nir_foreach_shader_in_variable(var, nir) {
- if (var->data.interpolation == INTERP_MODE_FLAT)
- mask |= BITFIELD64_BIT(var->data.location);
- }
-
- nir_shader_instructions_pass(nir, bi_gather_texcoords, nir_metadata_all, &mask);
-
- return mask;
-}
-
-static void
-bi_finalize_nir(nir_shader *nir, unsigned gpu_id, bool is_blend)
-{
- /* Lower gl_Position pre-optimisation, but after lowering vars to ssa
- * (so we don't accidentally duplicate the epilogue since mesa/st has
- * messed with our I/O quite a bit already) */
-
- NIR_PASS_V(nir, nir_lower_vars_to_ssa);
-
- if (nir->info.stage == MESA_SHADER_VERTEX) {
- NIR_PASS_V(nir, nir_lower_viewport_transform);
- NIR_PASS_V(nir, nir_lower_point_size, 1.0, 0.0);
-
- nir_variable *psiz = nir_find_variable_with_location(nir,
- nir_var_shader_out,
- VARYING_SLOT_PSIZ);
- if (psiz != NULL)
- psiz->data.precision = GLSL_PRECISION_MEDIUM;
- }
+ continue;
+ }
- /* Get rid of any global vars before we lower to scratch. */
- NIR_PASS_V(nir, nir_lower_global_vars_to_local);
-
- /* Valhall introduces packed thread local storage, which improves cache
- * locality of TLS access. However, access to packed TLS cannot
- * straddle 16-byte boundaries. As such, when packed TLS is in use
- * (currently unconditional for Valhall), we force vec4 alignment for
- * scratch access.
- */
- bool packed_tls = (gpu_id >= 0x9000);
-
- /* Lower large arrays to scratch and small arrays to bcsel */
- NIR_PASS_V(nir, nir_lower_vars_to_scratch, nir_var_function_temp, 256,
- packed_tls ?
- glsl_get_vec4_size_align_bytes :
- glsl_get_natural_size_align_bytes);
- NIR_PASS_V(nir, nir_lower_indirect_derefs, nir_var_function_temp, ~0);
-
- NIR_PASS_V(nir, nir_split_var_copies);
- NIR_PASS_V(nir, nir_lower_var_copies);
- NIR_PASS_V(nir, nir_lower_vars_to_ssa);
- NIR_PASS_V(nir, nir_lower_io, nir_var_shader_in | nir_var_shader_out,
- glsl_type_size, 0);
-
- /* nir_lower[_explicit]_io is lazy and emits mul+add chains even for
- * offsets it could figure out are constant. Do some constant folding
- * before bifrost_nir_lower_store_component below.
- */
- NIR_PASS_V(nir, nir_opt_constant_folding);
-
- if (nir->info.stage == MESA_SHADER_FRAGMENT) {
- NIR_PASS_V(nir, nir_lower_mediump_io,
- nir_var_shader_in | nir_var_shader_out,
- ~bi_fp32_varying_mask(nir), false);
- } else if (nir->info.stage == MESA_SHADER_VERTEX) {
- if (gpu_id >= 0x9000) {
- NIR_PASS_V(nir, nir_lower_mediump_io, nir_var_shader_out,
- BITFIELD64_BIT(VARYING_SLOT_PSIZ), false);
+ for (int col = 0; col < glsl_get_matrix_columns(var->type); ++col) {
+ for (int comp = 0; comp < 4; ++comp) {
+ int id = comp + ctx->varying_count++;
+ _mesa_hash_table_u64_insert(ctx->varying_nir_to_bi, var->data.driver_location + col + comp + 1, (void *) ((uintptr_t) (id + 1)));
+ }
+ }
}
- NIR_PASS_V(nir, pan_nir_lower_store_component);
- }
-
- NIR_PASS_V(nir, nir_lower_ssbo);
- NIR_PASS_V(nir, pan_nir_lower_zs_store);
- NIR_PASS_V(nir, pan_lower_sample_pos);
- NIR_PASS_V(nir, nir_lower_bit_size, bi_lower_bit_size, NULL);
- NIR_PASS_V(nir, nir_lower_64bit_phis);
-
- if (nir->xfb_info != NULL && nir->info.has_transform_feedback_varyings) {
- NIR_PASS_V(nir, nir_io_add_const_offset_to_base,
- nir_var_shader_in | nir_var_shader_out);
- NIR_PASS_V(nir, nir_io_add_intrinsic_xfb_info);
- NIR_PASS_V(nir, pan_lower_xfb);
- }
-
- bi_optimize_nir(nir, gpu_id, is_blend);
-}
-
-static bi_context *
-bi_compile_variant_nir(nir_shader *nir,
- const struct panfrost_compile_inputs *inputs,
- struct util_dynarray *binary,
- struct hash_table_u64 *sysval_to_id,
- struct bi_shader_info info,
- enum bi_idvs_mode idvs)
-{
- bi_context *ctx = rzalloc(NULL, bi_context);
-
- /* There may be another program in the dynarray, start at the end */
- unsigned offset = binary->size;
-
- ctx->sysval_to_id = sysval_to_id;
- ctx->inputs = inputs;
- ctx->nir = nir;
- ctx->stage = nir->info.stage;
- ctx->quirks = bifrost_get_quirks(inputs->gpu_id);
- ctx->arch = inputs->gpu_id >> 12;
- ctx->info = info;
- ctx->idvs = idvs;
- ctx->malloc_idvs = (ctx->arch >= 9) && !inputs->no_idvs;
-
- if (idvs != BI_IDVS_NONE) {
- /* Specializing shaders for IDVS is destructive, so we need to
- * clone. However, the last (second) IDVS shader does not need
- * to be preserved so we can skip cloning that one.
- */
- if (offset == 0)
- ctx->nir = nir = nir_shader_clone(ctx, nir);
-
- NIR_PASS_V(nir, nir_shader_instructions_pass,
- bifrost_nir_specialize_idvs,
- nir_metadata_block_index | nir_metadata_dominance,
- &idvs);
-
- /* After specializing, clean up the mess */
- bool progress = true;
-
- while (progress) {
- progress = false;
-
- NIR_PASS(progress, nir, nir_opt_dce);
- NIR_PASS(progress, nir, nir_opt_dead_cf);
+ } else if (ctx->stage == MESA_SHADER_FRAGMENT) {
+ ctx->outputs_nir_to_bi = _mesa_hash_table_u64_create(NULL);
+ nir_foreach_variable(var, &nir->outputs) {
+ if (var->data.location >= FRAG_RESULT_DATA0 && var->data.location <= FRAG_RESULT_DATA7) {
+ int id = ctx->outputs_count++;
+ printf("Driver location: %d with id %d\n", var->data.location + 1, id);
+ _mesa_hash_table_u64_insert(ctx->outputs_nir_to_bi, var->data.location + 1, (void *) ((uintptr_t) (id + 1)));
+ }
}
}
- /* If nothing is pushed, all UBOs need to be uploaded */
- ctx->ubo_mask = ~0;
+ /* Optimisation passes */
+ optimize_nir(nir);
- list_inithead(&ctx->blocks);
-
- bool skip_internal = nir->info.internal;
- skip_internal &= !(bifrost_debug & BIFROST_DBG_INTERNAL);
-
- if (bifrost_debug & BIFROST_DBG_SHADERS && !skip_internal) {
- nir_print_shader(nir, stdout);
- }
-
- ctx->allocated_vec = _mesa_hash_table_u64_create(ctx);
+#ifdef BI_DEBUG
+ nir_print_shader(nir, stdout);
+#endif
+ /* Generate machine IR for shader */
nir_foreach_function(func, nir) {
- if (!func->impl)
- continue;
-
- nir_index_blocks(func->impl);
+ nir_builder _b;
+ ctx->b = &_b;
+ nir_builder_init(ctx->b, func->impl);
- ctx->indexed_nir_blocks =
- rzalloc_array(ctx, bi_block *, func->impl->num_blocks);
-
- ctx->ssa_alloc += func->impl->ssa_alloc;
- ctx->reg_alloc += func->impl->reg_alloc;
+ list_inithead(&ctx->blocks);
+ ctx->block_count = 0;
+ ctx->func = func;
emit_cf_list(ctx, &func->impl->body);
- bi_emit_phis_deferred(ctx);
- break; /* TODO: Multi-function shaders */
- }
- /* Index blocks now that we're done emitting */
- bi_foreach_block(ctx, block) {
- block->index = ctx->num_blocks++;
+                break; // XXX: Implement once multi-function shaders are supported
}
- bi_validate(ctx, "NIR -> BIR");
-
- /* If the shader doesn't write any colour or depth outputs, it may
- * still need an ATEST at the very end! */
- bool need_dummy_atest =
- (ctx->stage == MESA_SHADER_FRAGMENT) &&
- !ctx->emitted_atest &&
- !bi_skip_atest(ctx, false);
+ util_dynarray_init(&program->compiled, NULL);
- if (need_dummy_atest) {
- bi_block *end = list_last_entry(&ctx->blocks, bi_block, link);
- bi_builder b = bi_init_builder(ctx, bi_after_block(end));
- bi_emit_atest(&b, bi_zero());
- }
-
- bool optimize = !(bifrost_debug & BIFROST_DBG_NOOPT);
-
- /* Runs before constant folding */
- bi_lower_swizzle(ctx);
- bi_validate(ctx, "Early lowering");
-
- /* Runs before copy prop */
- if (optimize && !ctx->inputs->no_ubo_to_push) {
- bi_opt_push_ubo(ctx);
- }
+ // MIR pre-RA optimizations
- if (likely(optimize)) {
- bi_opt_copy_prop(ctx);
+ bool progress = false;
- while (bi_opt_constant_fold(ctx))
- bi_opt_copy_prop(ctx);
-
- bi_opt_mod_prop_forward(ctx);
- bi_opt_mod_prop_backward(ctx);
-
- /* Push LD_VAR_IMM/VAR_TEX instructions. Must run after
- * mod_prop_backward to fuse VAR_TEX */
- if (ctx->arch == 7 && ctx->stage == MESA_SHADER_FRAGMENT &&
- !(bifrost_debug & BIFROST_DBG_NOPRELOAD)) {
- bi_opt_dead_code_eliminate(ctx);
- bi_opt_message_preload(ctx);
- bi_opt_copy_prop(ctx);
- }
-
- bi_opt_dead_code_eliminate(ctx);
- bi_opt_cse(ctx);
- bi_opt_dead_code_eliminate(ctx);
- if (!ctx->inputs->no_ubo_to_push)
- bi_opt_reorder_push(ctx);
- bi_validate(ctx, "Optimization passes");
- }
-
- bi_lower_opt_instructions(ctx);
-
- if (ctx->arch >= 9) {
- va_optimize(ctx);
- va_lower_isel(ctx);
-
- bi_foreach_instr_global_safe(ctx, I) {
- /* Phis become single moves so shouldn't be affected */
- if (I->op == BI_OPCODE_PHI)
- continue;
-
- va_lower_constants(ctx, I);
-
- bi_builder b = bi_init_builder(ctx, bi_before_instr(I));
- va_repair_fau(&b, I);
- }
-
- /* We need to clean up after constant lowering */
- if (likely(optimize)) {
- bi_opt_cse(ctx);
- bi_opt_dead_code_eliminate(ctx);
- }
-
- bi_validate(ctx, "Valhall passes");
- }
-
- bi_foreach_block(ctx, block) {
- bi_lower_branch(ctx, block);
- }
-
- if (bifrost_debug & BIFROST_DBG_SHADERS && !skip_internal)
- bi_print_shader(ctx, stdout);
-
- /* Analyze before register allocation to avoid false dependencies. The
- * skip bit is a function of only the data flow graph and is invariant
- * under valid scheduling. Helpers are only defined for fragment
- * shaders, so this analysis is only required in fragment shaders.
- */
- if (ctx->stage == MESA_SHADER_FRAGMENT)
- bi_analyze_helper_requirements(ctx);
-
- /* Fuse TEXC after analyzing helper requirements so the analysis
- * doesn't have to know about dual textures */
- if (likely(optimize)) {
- bi_opt_fuse_dual_texture(ctx);
- }
-
- /* Lower FAU after fusing dual texture, because fusing dual texture
- * creates new immediates that themselves may need lowering.
- */
- if (ctx->arch <= 8) {
- bi_lower_fau(ctx);
- }
-
- /* Lowering FAU can create redundant moves. Run CSE+DCE to clean up. */
- if (likely(optimize)) {
- bi_opt_cse(ctx);
- bi_opt_dead_code_eliminate(ctx);
- }
-
- bi_validate(ctx, "Late lowering");
-
- if (likely(!(bifrost_debug & BIFROST_DBG_NOPSCHED))) {
- bi_pressure_schedule(ctx);
- bi_validate(ctx, "Pre-RA scheduling");
- }
-
- bi_register_allocate(ctx);
-
- if (likely(optimize))
- bi_opt_post_ra(ctx);
-
- if (bifrost_debug & BIFROST_DBG_SHADERS && !skip_internal)
- bi_print_shader(ctx, stdout);
-
- if (ctx->arch >= 9) {
- va_assign_slots(ctx);
- va_insert_flow_control_nops(ctx);
- va_merge_flow(ctx);
- va_mark_last(ctx);
- } else {
- bi_schedule(ctx);
- bi_assign_scoreboard(ctx);
-
- /* Analyze after scheduling since we depend on instruction
- * order. Valhall calls as part of va_insert_flow_control_nops,
- * as the handling for clauses differs from instructions.
- */
- bi_analyze_helper_terminate(ctx);
- bi_mark_clauses_td(ctx);
- }
-
- if (bifrost_debug & BIFROST_DBG_SHADERS && !skip_internal)
- bi_print_shader(ctx, stdout);
-
- if (ctx->arch <= 8) {
- bi_pack_clauses(ctx, binary, offset);
- } else {
- bi_pack_valhall(ctx, binary);
- }
-
- if (bifrost_debug & BIFROST_DBG_SHADERS && !skip_internal) {
- if (ctx->arch <= 8) {
- disassemble_bifrost(stdout, binary->data + offset,
- binary->size - offset,
- bifrost_debug & BIFROST_DBG_VERBOSE);
- } else {
- disassemble_valhall(stdout, binary->data + offset,
- binary->size - offset,
- bifrost_debug & BIFROST_DBG_VERBOSE);
- }
-
- fflush(stdout);
- }
-
- if (!skip_internal &&
- ((bifrost_debug & BIFROST_DBG_SHADERDB) || inputs->debug)) {
- char *shaderdb;
-
- if (ctx->arch >= 9) {
- shaderdb = va_print_stats(ctx, binary->size - offset);
- } else {
- shaderdb = bi_print_stats(ctx, binary->size - offset);
- }
-
- if (bifrost_debug & BIFROST_DBG_SHADERDB)
- fprintf(stderr, "SHADER-DB: %s\n", shaderdb);
-
- if (inputs->debug)
- util_debug_message(inputs->debug, SHADER_INFO, "%s", shaderdb);
-
- ralloc_free(shaderdb);
- }
-
- return ctx;
-}
-
-static void
-bi_compile_variant(nir_shader *nir,
- const struct panfrost_compile_inputs *inputs,
- struct util_dynarray *binary,
- struct hash_table_u64 *sysval_to_id,
- struct pan_shader_info *info,
- enum bi_idvs_mode idvs)
-{
- struct bi_shader_info local_info = {
- .push = &info->push,
- .bifrost = &info->bifrost,
- .tls_size = info->tls_size,
- .sysvals = &info->sysvals,
- .push_offset = info->push.count
- };
-
- unsigned offset = binary->size;
-
- /* If there is no position shader (gl_Position is not written), then
- * there is no need to build a varying shader either. This case is hit
- * for transform feedback only vertex shaders which only make sense with
- * rasterizer discard.
- */
- if ((offset == 0) && (idvs == BI_IDVS_VARYING))
- return;
-
- /* Software invariant: Only a secondary shader can appear at a nonzero
- * offset, to keep the ABI simple. */
- assert((offset == 0) ^ (idvs == BI_IDVS_VARYING));
-
- bi_context *ctx = bi_compile_variant_nir(nir, inputs, binary, sysval_to_id, local_info, idvs);
-
- /* A register is preloaded <==> it is live before the first block */
- bi_block *first_block = list_first_entry(&ctx->blocks, bi_block, link);
- uint64_t preload = first_block->reg_live_in;
-
- /* If multisampling is used with a blend shader, the blend shader needs
- * to access the sample coverage mask in r60 and the sample ID in r61.
- * Blend shaders run in the same context as fragment shaders, so if a
- * blend shader could run, we need to preload these registers
- * conservatively. There is believed to be little cost to doing so, so
- * do so always to avoid variants of the preload descriptor.
- *
- * We only do this on Valhall, as Bifrost has to update the RSD for
- * multisampling w/ blend shader anyway, so this is handled in the
- * driver. We could unify the paths if the cost is acceptable.
- */
- if (nir->info.stage == MESA_SHADER_FRAGMENT && ctx->arch >= 9)
- preload |= BITFIELD64_BIT(60) | BITFIELD64_BIT(61);
-
- info->ubo_mask |= ctx->ubo_mask;
- info->tls_size = MAX2(info->tls_size, ctx->info.tls_size);
-
- if (idvs == BI_IDVS_VARYING) {
- info->vs.secondary_enable = (binary->size > offset);
- info->vs.secondary_offset = offset;
- info->vs.secondary_preload = preload;
- info->vs.secondary_work_reg_count = ctx->info.work_reg_count;
- } else {
- info->preload = preload;
- info->work_reg_count = ctx->info.work_reg_count;
- }
-
- if (idvs == BI_IDVS_POSITION &&
- !nir->info.internal &&
- nir->info.outputs_written & BITFIELD_BIT(VARYING_SLOT_PSIZ)) {
- /* Find the psiz write */
- bi_instr *write = NULL;
-
- bi_foreach_instr_global(ctx, I) {
- if (I->op == BI_OPCODE_STORE_I16 && I->seg == BI_SEG_POS) {
- write = I;
- break;
- }
- }
-
- assert(write != NULL);
-
- /* NOP it out, preserving its flow control. TODO: maybe DCE */
- if (write->flow) {
- bi_builder b = bi_init_builder(ctx, bi_before_instr(write));
- bi_instr *nop = bi_nop(&b);
- nop->flow = write->flow;
+ do {
+ progress = false;
+ mir_foreach_block(ctx, block) {
+ // XXX: Not yet working
+// progress |= bifrost_opt_branch_fusion(ctx, block);
}
+ } while (progress);
- bi_remove_instruction(write);
-
- info->vs.no_psiz_offset = binary->size;
- bi_pack_valhall(ctx, binary);
- }
-
- ralloc_free(ctx);
-}
-
-/* Decide if Index-Driven Vertex Shading should be used for a given shader */
-static bool
-bi_should_idvs(nir_shader *nir, const struct panfrost_compile_inputs *inputs)
-{
- /* Opt-out */
- if (inputs->no_idvs || bifrost_debug & BIFROST_DBG_NOIDVS)
- return false;
-
- /* IDVS splits up vertex shaders, not defined on other shader stages */
- if (nir->info.stage != MESA_SHADER_VERTEX)
- return false;
-
- /* Bifrost cannot write gl_PointSize during IDVS */
- if ((inputs->gpu_id < 0x9000) &&
- nir->info.outputs_written & BITFIELD_BIT(VARYING_SLOT_PSIZ))
- return false;
-
- /* Otherwise, IDVS is usually better */
- return true;
-}
-
-void
-bifrost_compile_shader_nir(nir_shader *nir,
- const struct panfrost_compile_inputs *inputs,
- struct util_dynarray *binary,
- struct pan_shader_info *info)
-{
- bifrost_debug = debug_get_option_bifrost_debug();
-
- bi_finalize_nir(nir, inputs->gpu_id, inputs->is_blend);
- struct hash_table_u64 *sysval_to_id =
- panfrost_init_sysvals(&info->sysvals,
- inputs->fixed_sysval_layout,
- NULL);
-
- info->tls_size = nir->scratch_size;
- info->vs.idvs = bi_should_idvs(nir, inputs);
-
- pan_nir_collect_varyings(nir, info);
-
- if (info->vs.idvs) {
- bi_compile_variant(nir, inputs, binary, sysval_to_id, info, BI_IDVS_POSITION);
- bi_compile_variant(nir, inputs, binary, sysval_to_id, info, BI_IDVS_VARYING);
- } else {
- bi_compile_variant(nir, inputs, binary, sysval_to_id, info, BI_IDVS_NONE);
- }
-
- if (gl_shader_stage_is_compute(nir->info.stage)) {
- /* Workgroups may be merged if the structure of the workgroup is
- * not software visible. This is true if neither shared memory
- * nor barriers are used. The hardware may be able to optimize
- * compute shaders that set this flag.
- */
- info->cs.allow_merging_workgroups =
- (nir->info.shared_size == 0) &&
- !nir->info.uses_control_barrier &&
- !nir->info.uses_memory_barrier;
- }
-
- info->ubo_mask &= (1 << nir->info.num_ubos) - 1;
+ schedule_program(ctx);
- _mesa_hash_table_u64_destroy(sysval_to_id);
+#ifdef BI_DEBUG
+ nir_print_shader(nir, stdout);
+ disassemble_bifrost(program->compiled.data, program->compiled.size, false);
+#endif
+ return 0;
}
diff --git a/lib/mesa/src/panfrost/bifrost/bifrost_compile.h b/lib/mesa/src/panfrost/bifrost/bifrost_compile.h
index c23b51afe..e687f64f7 100644
--- a/lib/mesa/src/panfrost/bifrost/bifrost_compile.h
+++ b/lib/mesa/src/panfrost/bifrost/bifrost_compile.h
@@ -1,5 +1,5 @@
/*
- * Copyright (C) 2018-2019 Alyssa Rosenzweig <alyssa@rosenzweig.io>
+ * Copyright (C) 2018 Ryan Houdek <Sonicadvance1@gmail.com>
*
* Permission is hereby granted, free of charge, to any person obtaining a
* copy of this software and associated documentation files (the "Software"),
@@ -21,40 +21,34 @@
* SOFTWARE.
*/
-#ifndef __BIFROST_PUBLIC_H_
-#define __BIFROST_PUBLIC_H_
+#ifndef __bifrost_compile_h__
+#define __bifrost_compile_h__
#include "compiler/nir/nir.h"
#include "util/u_dynarray.h"
-#include "panfrost/util/pan_ir.h"
-void
-bifrost_compile_shader_nir(nir_shader *nir,
- const struct panfrost_compile_inputs *inputs,
- struct util_dynarray *binary,
- struct pan_shader_info *info);
+struct bifrost_program {
+ struct util_dynarray compiled;
+};
+
+int
+bifrost_compile_shader_nir(nir_shader *nir, struct bifrost_program *program);
static const nir_shader_compiler_options bifrost_nir_options = {
- .lower_scmp = true,
+ .fuse_ffma = true,
.lower_flrp16 = true,
.lower_flrp32 = true,
.lower_flrp64 = true,
- .lower_ffract = true,
.lower_fmod = true,
- .lower_fdiv = true,
+ .lower_bitfield_extract = true,
+ .lower_bitfield_extract_to_shifts = true,
+ .lower_bitfield_insert = true,
+ .lower_bitfield_insert_to_shifts = true,
+ .lower_bitfield_reverse = true,
+ .lower_idiv = true,
.lower_isign = true,
- .lower_find_lsb = true,
- .lower_ifind_msb = true,
- .lower_fdph = true,
- .lower_fsqrt = true,
-
.lower_fsign = true,
-
- .lower_bitfield_insert_to_shifts = true,
- .lower_bitfield_extract_to_shifts = true,
- .lower_insert_byte = true,
- .lower_rotate = true,
-
+ .lower_ffract = true,
.lower_pack_half_2x16 = true,
.lower_pack_unorm_2x16 = true,
.lower_pack_snorm_2x16 = true,
@@ -65,33 +59,11 @@ static const nir_shader_compiler_options bifrost_nir_options = {
.lower_unpack_snorm_2x16 = true,
.lower_unpack_unorm_4x8 = true,
.lower_unpack_snorm_4x8 = true,
- .lower_pack_split = true,
-
- .lower_doubles_options = nir_lower_dmod,
- /* TODO: Don't lower supported 64-bit operations */
- .lower_int64_options = ~0,
- /* TODO: Use IMULD on v7 */
- .lower_mul_high = true,
- .lower_fisnormal = true,
- .lower_uadd_carry = true,
- .lower_usub_borrow = true,
-
- .has_fsub = true,
- .has_isub = true,
- .vectorize_io = true,
- .vectorize_vec2_16bit = true,
- .fuse_ffma16 = true,
- .fuse_ffma32 = true,
- .fuse_ffma64 = true,
- .use_interpolated_input_intrinsics = true,
-
- .lower_uniforms_to_ubo = true,
-
- .has_cs_global_id = true,
- .lower_cs_local_index_to_id = true,
- .max_unroll_iterations = 32,
- .force_indirect_unrolling = (nir_var_shader_in | nir_var_shader_out | nir_var_function_temp),
- .force_indirect_unrolling_sampler = true,
+ .lower_extract_byte = true,
+ .lower_extract_word = true,
+ .lower_all_io_to_temps = true,
+ .lower_all_io_to_elements = true,
+ .vertex_id_zero_based = true,
};
#endif
diff --git a/lib/mesa/src/panfrost/bifrost/cmdline.c b/lib/mesa/src/panfrost/bifrost/cmdline.c
index 2a11486cb..16415bbd7 100644
--- a/lib/mesa/src/panfrost/bifrost/cmdline.c
+++ b/lib/mesa/src/panfrost/bifrost/cmdline.c
@@ -1,8 +1,5 @@
/*
- * Copyright (C) 2021 Collabora, Ltd.
* Copyright (C) 2019 Ryan Houdek <Sonicadvance1@gmail.com>
- * Copyright (C) 2014 Rob Clark <robclark@freedesktop.org>
- * Copyright © 2015 Red Hat
*
* Permission is hereby granted, free of charge, to any person obtaining a
* copy of this software and associated documentation files (the "Software"),
@@ -24,11 +21,7 @@
* SOFTWARE.
*/
-#include <getopt.h>
-#include <string.h>
#include "disassemble.h"
-#include "valhall/disassemble.h"
-#include "compiler.h"
#include "main/mtypes.h"
#include "compiler/glsl/standalone.h"
@@ -36,186 +29,47 @@
#include "compiler/glsl/gl_nir.h"
#include "compiler/nir_types.h"
#include "util/u_dynarray.h"
-#include "bifrost_compile.h"
-
-unsigned gpu_id = 0x7212;
-int verbose = 0;
-
-static gl_shader_stage
-filename_to_stage(const char *stage)
-{
- const char *ext = strrchr(stage, '.');
-
- if (ext == NULL) {
- fprintf(stderr, "No extension found in %s\n", stage);
- exit(1);
- }
-
- if (!strcmp(ext, ".cs") || !strcmp(ext, ".comp"))
- return MESA_SHADER_COMPUTE;
- else if (!strcmp(ext, ".vs") || !strcmp(ext, ".vert"))
- return MESA_SHADER_VERTEX;
- else if (!strcmp(ext, ".fs") || !strcmp(ext, ".frag"))
- return MESA_SHADER_FRAGMENT;
- else {
- fprintf(stderr, "Invalid extension %s\n", ext);
- exit(1);
- }
- unreachable("Should've returned or bailed");
-}
-
-static int
-st_packed_uniforms_type_size(const struct glsl_type *type, bool bindless)
-{
- return glsl_count_dword_slots(type, bindless);
-}
-
-static int
-glsl_type_size(const struct glsl_type *type, bool bindless)
-{
- return glsl_count_attribute_slots(type, false);
-}
-
-static void
-insert_sorted(struct exec_list *var_list, nir_variable *new_var)
-{
- nir_foreach_variable_in_list (var, var_list) {
- if (var->data.location > new_var->data.location) {
- exec_node_insert_node_before(&var->node, &new_var->node);
- return;
- }
- }
- exec_list_push_tail(var_list, &new_var->node);
-}
-
-static void
-sort_varyings(nir_shader *nir, nir_variable_mode mode)
-{
- struct exec_list new_list;
- exec_list_make_empty(&new_list);
- nir_foreach_variable_with_modes_safe (var, nir, mode) {
- exec_node_remove(&var->node);
- insert_sorted(&new_list, var);
- }
- exec_list_append(&nir->variables, &new_list);
-}
-
-static void
-fixup_varying_slots(nir_shader *nir, nir_variable_mode mode)
-{
- nir_foreach_variable_with_modes (var, nir, mode) {
- if (var->data.location >= VARYING_SLOT_VAR0) {
- var->data.location += 9;
- } else if ((var->data.location >= VARYING_SLOT_TEX0) &&
- (var->data.location <= VARYING_SLOT_TEX7)) {
- var->data.location += VARYING_SLOT_VAR0 - VARYING_SLOT_TEX0;
- }
- }
-}
+#include "bifrost_compile.h"
static void
-compile_shader(int stages, char **files)
+compile_shader(char **argv)
{
struct gl_shader_program *prog;
- nir_shader *nir[MESA_SHADER_COMPUTE + 1];
- unsigned shader_types[MESA_SHADER_COMPUTE + 1];
-
- if (stages > MESA_SHADER_COMPUTE) {
- fprintf(stderr, "Too many stages");
- exit(1);
- }
-
- for (unsigned i = 0; i < stages; ++i)
- shader_types[i] = filename_to_stage(files[i]);
+ nir_shader *nir[2];
+ unsigned shader_types[2] = {
+ MESA_SHADER_VERTEX,
+ MESA_SHADER_FRAGMENT,
+ };
struct standalone_options options = {
- .glsl_version = 300, /* ES - needed for precision */
+ .glsl_version = 430,
.do_link = true,
- .lower_precision = true
};
static struct gl_context local_ctx;
- prog = standalone_compile_shader(&options, stages, files, &local_ctx);
-
- for (unsigned i = 0; i < stages; ++i) {
- gl_shader_stage stage = shader_types[i];
- prog->_LinkedShaders[stage]->Program->info.stage = stage;
- }
-
- struct util_dynarray binary;
-
- util_dynarray_init(&binary, NULL);
-
- for (unsigned i = 0; i < stages; ++i) {
- nir[i] = glsl_to_nir(&local_ctx.Const, prog, shader_types[i], &bifrost_nir_options);
-
- if (shader_types[i] == MESA_SHADER_VERTEX) {
- nir_assign_var_locations(nir[i], nir_var_shader_in, &nir[i]->num_inputs,
- glsl_type_size);
- sort_varyings(nir[i], nir_var_shader_out);
- nir_assign_var_locations(nir[i], nir_var_shader_out, &nir[i]->num_outputs,
- glsl_type_size);
- fixup_varying_slots(nir[i], nir_var_shader_out);
- } else if (shader_types[i] == MESA_SHADER_FRAGMENT) {
- sort_varyings(nir[i], nir_var_shader_in);
- nir_assign_var_locations(nir[i], nir_var_shader_in, &nir[i]->num_inputs,
- glsl_type_size);
- fixup_varying_slots(nir[i], nir_var_shader_in);
- nir_assign_var_locations(nir[i], nir_var_shader_out, &nir[i]->num_outputs,
- glsl_type_size);
- }
-
- nir_assign_var_locations(nir[i], nir_var_uniform, &nir[i]->num_uniforms,
- glsl_type_size);
+ prog = standalone_compile_shader(&options, 2, argv, &local_ctx);
+ prog->_LinkedShaders[MESA_SHADER_FRAGMENT]->Program->info.stage = MESA_SHADER_FRAGMENT;
+ struct bifrost_program compiled;
+ for (unsigned i = 0; i < 2; ++i) {
+ nir[i] = glsl_to_nir(&local_ctx, prog, shader_types[i], &bifrost_nir_options);
NIR_PASS_V(nir[i], nir_lower_global_vars_to_local);
- NIR_PASS_V(nir[i], nir_lower_io_to_temporaries, nir_shader_get_entrypoint(nir[i]), true, i == 0);
- NIR_PASS_V(nir[i], nir_opt_copy_prop_vars);
- NIR_PASS_V(nir[i], nir_opt_combine_stores, nir_var_all);
-
- NIR_PASS_V(nir[i], nir_lower_system_values);
- NIR_PASS_V(nir[i], gl_nir_lower_samplers, prog);
NIR_PASS_V(nir[i], nir_split_var_copies);
NIR_PASS_V(nir[i], nir_lower_var_copies);
- NIR_PASS_V(nir[i], nir_lower_io, nir_var_uniform,
- st_packed_uniforms_type_size,
- (nir_lower_io_options)0);
- NIR_PASS_V(nir[i], nir_lower_uniforms_to_ubo, true, false);
+ NIR_PASS_V(nir[i], nir_lower_alu_to_scalar, NULL);
/* before buffers and vars_to_ssa */
- NIR_PASS_V(nir[i], gl_nir_lower_images, true);
+ NIR_PASS_V(nir[i], gl_nir_lower_bindless_images);
NIR_PASS_V(nir[i], gl_nir_lower_buffers, prog);
NIR_PASS_V(nir[i], nir_opt_constant_folding);
-
- struct panfrost_compile_inputs inputs = {
- .gpu_id = gpu_id,
- .fixed_sysval_ubo = -1,
- };
- struct pan_shader_info info = { 0 };
-
- util_dynarray_clear(&binary);
- bifrost_compile_shader_nir(nir[i], &inputs, &binary, &info);
-
- char *fn = NULL;
- asprintf(&fn, "shader_%u.bin", i);
- assert(fn != NULL);
- FILE *fp = fopen(fn, "wb");
- fwrite(binary.data, 1, binary.size, fp);
- fclose(fp);
- free(fn);
+ bifrost_compile_shader_nir(nir[i], &compiled);
}
-
- util_dynarray_fini(&binary);
}
-#define BI_FOURCC(ch0, ch1, ch2, ch3) ( \
- (uint32_t)(ch0) | (uint32_t)(ch1) << 8 | \
- (uint32_t)(ch2) << 16 | (uint32_t)(ch3) << 24)
-
static void
disassemble(const char *filename)
{
@@ -223,122 +77,34 @@ disassemble(const char *filename)
assert(fp);
fseek(fp, 0, SEEK_END);
- unsigned filesize = ftell(fp);
+ int filesize = ftell(fp);
rewind(fp);
- uint32_t *code = malloc(filesize);
- unsigned res = fread(code, 1, filesize, fp);
+ unsigned char *code = malloc(filesize);
+ int res = fread(code, 1, filesize, fp);
if (res != filesize) {
printf("Couldn't read full file\n");
}
-
fclose(fp);
- void *entrypoint = code;
-
- if (filesize && code[0] == BI_FOURCC('M', 'B', 'S', '2')) {
- for (int i = 0; i < filesize / 4; ++i) {
- if (code[i] != BI_FOURCC('O', 'B', 'J', 'C'))
- continue;
-
- unsigned size = code[i + 1];
- unsigned offset = i + 2;
-
- entrypoint = code + offset;
- filesize = size;
- }
- }
-
- if ((gpu_id >> 12) >= 9)
- disassemble_valhall(stdout, entrypoint, filesize, verbose);
- else
- disassemble_bifrost(stdout, entrypoint, filesize, verbose);
-
+ disassemble_bifrost(code, filesize, false);
free(code);
}
int
main(int argc, char **argv)
{
- int c;
-
if (argc < 2) {
printf("Pass a command\n");
exit(1);
}
- static struct option longopts[] = {
- { "id", optional_argument, NULL, 'i' },
- { "gpu", optional_argument, NULL, 'g' },
- { "verbose", no_argument, &verbose, 'v' },
- { NULL, 0, NULL, 0 }
- };
-
- static struct {
- const char *name;
- unsigned major, minor;
- } gpus[] = {
- { "G71", 6, 0 },
- { "G72", 6, 2 },
- { "G51", 7, 0 },
- { "G76", 7, 1 },
- { "G52", 7, 2 },
- { "G31", 7, 3 },
- { "G77", 9, 0 },
- { "G57", 9, 1 },
- { "G78", 9, 2 },
- { "G57", 9, 3 },
- { "G68", 9, 4 },
- { "G78AE", 9, 5 },
- };
-
- while ((c = getopt_long(argc, argv, "v:", longopts, NULL)) != -1) {
-
- switch (c) {
- case 'i':
- gpu_id = atoi(optarg);
-
- if (!gpu_id) {
- fprintf(stderr, "Expected GPU ID, got %s\n", optarg);
- return 1;
- }
-
- break;
- case 'g':
- gpu_id = 0;
-
- /* Compatibility with the Arm compiler */
- if (strncmp(optarg, "Mali-", 5) == 0) optarg += 5;
-
- for (unsigned i = 0; i < ARRAY_SIZE(gpus); ++i) {
- if (strcmp(gpus[i].name, optarg)) continue;
-
- unsigned major = gpus[i].major;
- unsigned minor = gpus[i].minor;
-
- gpu_id = (major << 12) | (minor << 8);
- break;
- }
-
- if (!gpu_id) {
- fprintf(stderr, "Unknown GPU %s\n", optarg);
- return 1;
- }
-
- break;
- default:
- break;
- }
- }
-
- if (strcmp(argv[optind], "compile") == 0)
- compile_shader(argc - optind - 1, &argv[optind + 1]);
- else if (strcmp(argv[optind], "disasm") == 0)
- disassemble(argv[optind + 1]);
- else {
- fprintf(stderr, "Unknown command. Valid: compile/disasm\n");
- return 1;
- }
+ if (strcmp(argv[1], "compile") == 0)
+ compile_shader(&argv[2]);
+ else if (strcmp(argv[1], "disasm") == 0)
+ disassemble(argv[2]);
+ else
+ unreachable("Unknown command. Valid: compile/disasm");
return 0;
}
diff --git a/lib/mesa/src/panfrost/bifrost/disassemble.c b/lib/mesa/src/panfrost/bifrost/disassemble.c
index 1bc98e405..c7e131d5d 100644
--- a/lib/mesa/src/panfrost/bifrost/disassemble.c
+++ b/lib/mesa/src/panfrost/bifrost/disassemble.c
@@ -31,9 +31,8 @@
#include <string.h>
#include "bifrost.h"
+#include "bifrost_ops.h"
#include "disassemble.h"
-#include "bi_print_common.h"
-#include "util/compiler.h"
#include "util/macros.h"
// return bits (high, lo]
@@ -53,6 +52,15 @@ struct bifrost_alu_inst {
uint64_t reg_bits;
};
+struct bifrost_regs {
+ unsigned uniform_const : 8;
+ unsigned reg2 : 6;
+ unsigned reg3 : 6;
+ unsigned reg0 : 5;
+ unsigned reg1 : 6;
+ unsigned ctrl : 4;
+};
+
static unsigned get_reg0(struct bifrost_regs regs)
{
if (regs.ctrl == 0)
@@ -66,74 +74,187 @@ static unsigned get_reg1(struct bifrost_regs regs)
return regs.reg0 <= regs.reg1 ? regs.reg1 : 63 - regs.reg1;
}
+enum bifrost_reg_write_unit {
+ REG_WRITE_NONE = 0, // don't write
+ REG_WRITE_TWO, // write using reg2
+ REG_WRITE_THREE, // write using reg3
+};
+
// this represents the decoded version of the ctrl register field.
struct bifrost_reg_ctrl {
bool read_reg0;
bool read_reg1;
- struct bifrost_reg_ctrl_23 slot23;
+ bool read_reg3;
+ enum bifrost_reg_write_unit fma_write_unit;
+ enum bifrost_reg_write_unit add_write_unit;
+ bool clause_start;
};
-static void dump_header(FILE *fp, struct bifrost_header header, bool verbose)
-{
- fprintf(fp, "ds(%u) ", header.dependency_slot);
+enum fma_src_type {
+ FMA_ONE_SRC,
+ FMA_TWO_SRC,
+ FMA_FADD,
+ FMA_FMINMAX,
+ FMA_FADD16,
+ FMA_FMINMAX16,
+ FMA_FCMP,
+ FMA_FCMP16,
+ FMA_THREE_SRC,
+ FMA_FMA,
+ FMA_FMA16,
+ FMA_FOUR_SRC,
+ FMA_FMA_MSCALE,
+ FMA_SHIFT_ADD64,
+};
- if (header.staging_barrier)
- fprintf(fp, "osrb ");
+struct fma_op_info {
+ unsigned op;
+ char name[30];
+ enum fma_src_type src_type;
+};
- fprintf(fp, "%s ", bi_flow_control_name(header.flow_control));
+enum add_src_type {
+ ADD_ONE_SRC,
+ ADD_TWO_SRC,
+ ADD_FADD,
+ ADD_FMINMAX,
+ ADD_FADD16,
+ ADD_FMINMAX16,
+ ADD_THREE_SRC,
+ ADD_FADDMscale,
+ ADD_FCMP,
+ ADD_FCMP16,
+ ADD_TEX_COMPACT, // texture instruction with embedded sampler
+ ADD_TEX, // texture instruction with sampler/etc. in uniform port
+ ADD_VARYING_INTERP,
+ ADD_BLENDING,
+ ADD_LOAD_ATTR,
+ ADD_VARYING_ADDRESS,
+ ADD_BRANCH,
+};
- if (header.suppress_inf)
- fprintf(fp, "inf_suppress ");
- if (header.suppress_nan)
- fprintf(fp, "nan_suppress ");
-
- if (header.flush_to_zero == BIFROST_FTZ_DX11)
- fprintf(fp, "ftz_dx11 ");
- else if (header.flush_to_zero == BIFROST_FTZ_ALWAYS)
- fprintf(fp, "ftz_hsa ");
- if (header.flush_to_zero == BIFROST_FTZ_ABRUPT)
- fprintf(fp, "ftz_au ");
-
- assert(!header.zero1);
- assert(!header.zero2);
-
- if (header.float_exceptions == BIFROST_EXCEPTIONS_DISABLED)
- fprintf(fp, "fpe_ts ");
- else if (header.float_exceptions == BIFROST_EXCEPTIONS_PRECISE_DIVISION)
- fprintf(fp, "fpe_pd ");
- else if (header.float_exceptions == BIFROST_EXCEPTIONS_PRECISE_SQRT)
- fprintf(fp, "fpe_psqr ");
-
- if (header.message_type)
- fprintf(fp, "%s ", bi_message_type_name(header.message_type));
-
- if (header.terminate_discarded_threads)
- fprintf(fp, "td ");
-
- if (header.next_clause_prefetch)
- fprintf(fp, "ncph ");
-
- if (header.next_message_type)
- fprintf(fp, "next_%s ", bi_message_type_name(header.next_message_type));
- if (header.dependency_wait != 0) {
- fprintf(fp, "dwb(");
+struct add_op_info {
+ unsigned op;
+ char name[30];
+ enum add_src_type src_type;
+ bool has_data_reg;
+};
+
+struct bifrost_tex_ctrl {
+ unsigned sampler_index : 4; // also used to signal indirects
+ unsigned tex_index : 7;
+ bool no_merge_index : 1; // whether to merge (direct) sampler & texture indices
+ bool filter : 1; // use the usual filtering pipeline (0 for texelFetch & textureGather)
+ unsigned unk0 : 2;
+ bool texel_offset : 1; // *Offset()
+ bool is_shadow : 1;
+ bool is_array : 1;
+ unsigned tex_type : 2; // 2D, 3D, Cube, Buffer
+ bool compute_lod : 1; // 0 for *Lod()
+ bool not_supply_lod : 1; // 0 for *Lod() or when a bias is applied
+ bool calc_gradients : 1; // 0 for *Grad()
+ unsigned unk1 : 1;
+ unsigned result_type : 4; // integer, unsigned, float TODO: why is this 4 bits?
+ unsigned unk2 : 4;
+};
+
+struct bifrost_dual_tex_ctrl {
+ unsigned sampler_index0 : 2;
+ unsigned unk0 : 2;
+ unsigned tex_index0 : 2;
+ unsigned sampler_index1 : 2;
+ unsigned tex_index1 : 2;
+ unsigned unk1 : 22;
+};
+
+enum branch_bit_size {
+ BR_SIZE_32 = 0,
+ BR_SIZE_16XX = 1,
+ BR_SIZE_16YY = 2,
+ // For the above combinations of bitsize and location, an extra bit is
+ // encoded via comparing the sources. The only possible source of ambiguity
+ // would be if the sources were the same, but then the branch condition
+ // would be always true or always false anyways, so we can ignore it. But
+ // this no longer works when comparing the y component to the x component,
+ // since it's valid to compare the y component of a source against its own
+ // x component. Instead, the extra bit is encoded via an extra bitsize.
+ BR_SIZE_16YX0 = 3,
+ BR_SIZE_16YX1 = 4,
+ BR_SIZE_32_AND_16X = 5,
+ BR_SIZE_32_AND_16Y = 6,
+ // Used for comparisons with zero and always-true, see below. I think this
+ // only works for integer comparisons.
+ BR_SIZE_ZERO = 7,
+};
+
+void dump_header(struct bifrost_header header, bool verbose);
+void dump_instr(const struct bifrost_alu_inst *instr, struct bifrost_regs next_regs, uint64_t *consts,
+ unsigned data_reg, unsigned offset, bool verbose);
+bool dump_clause(uint32_t *words, unsigned *size, unsigned offset, bool verbose);
+
+void dump_header(struct bifrost_header header, bool verbose)
+{
+ if (header.clause_type != 0) {
+ printf("id(%du) ", header.scoreboard_index);
+ }
+
+ if (header.scoreboard_deps != 0) {
+ printf("next-wait(");
bool first = true;
for (unsigned i = 0; i < 8; i++) {
- if (header.dependency_wait & (1 << i)) {
+ if (header.scoreboard_deps & (1 << i)) {
if (!first) {
- fprintf(fp, ", ");
+ printf(", ");
}
- fprintf(fp, "%u", i);
+ printf("%d", i);
first = false;
}
}
- fprintf(fp, ") ");
+ printf(") ");
}
- fprintf(fp, "\n");
+ if (header.datareg_writebarrier)
+ printf("data-reg-barrier ");
+
+ if (!header.no_end_of_shader)
+ printf("eos ");
+
+ if (!header.back_to_back) {
+ printf("nbb ");
+ if (header.branch_cond)
+ printf("branch-cond ");
+ else
+ printf("branch-uncond ");
+ }
+
+ if (header.elide_writes)
+ printf("we ");
+
+ if (header.suppress_inf)
+ printf("suppress-inf ");
+ if (header.suppress_nan)
+ printf("suppress-nan ");
+
+ if (header.unk0)
+ printf("unk0 ");
+ if (header.unk1)
+ printf("unk1 ");
+ if (header.unk2)
+ printf("unk2 ");
+ if (header.unk3)
+ printf("unk3 ");
+ if (header.unk4)
+ printf("unk4 ");
+
+ printf("\n");
+
+ if (verbose) {
+ printf("# clause type %d, next clause type %d\n",
+ header.clause_type, header.next_clause_type);
+ }
}
-static struct bifrost_reg_ctrl DecodeRegCtrl(FILE *fp, struct bifrost_regs regs, bool first)
+static struct bifrost_reg_ctrl DecodeRegCtrl(struct bifrost_regs regs)
{
struct bifrost_reg_ctrl decoded = {};
unsigned ctrl;
@@ -145,199 +266,160 @@ static struct bifrost_reg_ctrl DecodeRegCtrl(FILE *fp, struct bifrost_regs regs,
ctrl = regs.ctrl;
decoded.read_reg0 = decoded.read_reg1 = true;
}
+ switch (ctrl) {
+ case 1:
+ decoded.fma_write_unit = REG_WRITE_TWO;
+ break;
+ case 2:
+ case 3:
+ decoded.fma_write_unit = REG_WRITE_TWO;
+ decoded.read_reg3 = true;
+ break;
+ case 4:
+ decoded.read_reg3 = true;
+ break;
+ case 5:
+ decoded.add_write_unit = REG_WRITE_TWO;
+ break;
+ case 6:
+ decoded.add_write_unit = REG_WRITE_TWO;
+ decoded.read_reg3 = true;
+ break;
+ case 8:
+ decoded.clause_start = true;
+ break;
+ case 9:
+ decoded.fma_write_unit = REG_WRITE_TWO;
+ decoded.clause_start = true;
+ break;
+ case 11:
+ break;
+ case 12:
+ decoded.read_reg3 = true;
+ decoded.clause_start = true;
+ break;
+ case 13:
+ decoded.add_write_unit = REG_WRITE_TWO;
+ decoded.clause_start = true;
+ break;
- /* Modify control based on state */
- if (first)
- ctrl = (ctrl & 0x7) | ((ctrl & 0x8) << 1);
- else if (regs.reg2 == regs.reg3)
- ctrl += 16;
-
- decoded.slot23 = bifrost_reg_ctrl_lut[ctrl];
- ASSERTED struct bifrost_reg_ctrl_23 reserved = { 0 };
- assert(memcmp(&decoded.slot23, &reserved, sizeof(reserved)));
+ case 7:
+ case 15:
+ decoded.fma_write_unit = REG_WRITE_THREE;
+ decoded.add_write_unit = REG_WRITE_TWO;
+ break;
+ default:
+ printf("# unknown reg ctrl %d\n", ctrl);
+ }
return decoded;
}
-static void dump_regs(FILE *fp, struct bifrost_regs srcs, bool first)
-{
- struct bifrost_reg_ctrl ctrl = DecodeRegCtrl(fp, srcs, first);
- fprintf(fp, " # ");
- if (ctrl.read_reg0)
- fprintf(fp, "slot 0: r%u ", get_reg0(srcs));
- if (ctrl.read_reg1)
- fprintf(fp, "slot 1: r%u ", get_reg1(srcs));
-
- const char *slot3_fma = ctrl.slot23.slot3_fma ? "FMA" : "ADD";
-
- if (ctrl.slot23.slot2 == BIFROST_OP_WRITE)
- fprintf(fp, "slot 2: r%u (write FMA) ", srcs.reg2);
- else if (ctrl.slot23.slot2 == BIFROST_OP_WRITE_LO)
- fprintf(fp, "slot 2: r%u (write lo FMA) ", srcs.reg2);
- else if (ctrl.slot23.slot2 == BIFROST_OP_WRITE_HI)
- fprintf(fp, "slot 2: r%u (write hi FMA) ", srcs.reg2);
- else if (ctrl.slot23.slot2 == BIFROST_OP_READ)
- fprintf(fp, "slot 2: r%u (read) ", srcs.reg2);
-
- if (ctrl.slot23.slot3 == BIFROST_OP_WRITE)
- fprintf(fp, "slot 3: r%u (write %s) ", srcs.reg3, slot3_fma);
- else if (ctrl.slot23.slot3 == BIFROST_OP_WRITE_LO)
- fprintf(fp, "slot 3: r%u (write lo %s) ", srcs.reg3, slot3_fma);
- else if (ctrl.slot23.slot3 == BIFROST_OP_WRITE_HI)
- fprintf(fp, "slot 3: r%u (write hi %s) ", srcs.reg3, slot3_fma);
-
- if (srcs.fau_idx)
- fprintf(fp, "fau %X ", srcs.fau_idx);
-
- fprintf(fp, "\n");
-}
-
-static void
-bi_disasm_dest_mask(FILE *fp, enum bifrost_reg_op op)
+// Pass in the add_write_unit or fma_write_unit, and this returns which register
+// the ADD/FMA units are writing to
+static unsigned GetRegToWrite(enum bifrost_reg_write_unit unit, struct bifrost_regs regs)
{
- if (op == BIFROST_OP_WRITE_LO)
- fprintf(fp, ".h0");
- else if (op == BIFROST_OP_WRITE_HI)
- fprintf(fp, ".h1");
+ switch (unit) {
+ case REG_WRITE_TWO:
+ return regs.reg2;
+ case REG_WRITE_THREE:
+ return regs.reg3;
+ default: /* REG_WRITE_NONE */
+ assert(0);
+ return 0;
+ }
}
-void
-bi_disasm_dest_fma(FILE *fp, struct bifrost_regs *next_regs, bool last)
+static void dump_regs(struct bifrost_regs srcs)
{
- /* If this is the last instruction, next_regs points to the first reg entry. */
- struct bifrost_reg_ctrl ctrl = DecodeRegCtrl(fp, *next_regs, last);
- if (ctrl.slot23.slot2 >= BIFROST_OP_WRITE) {
- fprintf(fp, "r%u:t0", next_regs->reg2);
- bi_disasm_dest_mask(fp, ctrl.slot23.slot2);
- } else if (ctrl.slot23.slot3 >= BIFROST_OP_WRITE && ctrl.slot23.slot3_fma) {
- fprintf(fp, "r%u:t0", next_regs->reg3);
- bi_disasm_dest_mask(fp, ctrl.slot23.slot3);
- } else
- fprintf(fp, "t0");
-}
+ struct bifrost_reg_ctrl ctrl = DecodeRegCtrl(srcs);
+ printf("# ");
+ if (ctrl.read_reg0)
+ printf("port 0: R%d ", get_reg0(srcs));
+ if (ctrl.read_reg1)
+ printf("port 1: R%d ", get_reg1(srcs));
+
+ if (ctrl.fma_write_unit == REG_WRITE_TWO)
+ printf("port 2: R%d (write FMA) ", srcs.reg2);
+ else if (ctrl.add_write_unit == REG_WRITE_TWO)
+ printf("port 2: R%d (write ADD) ", srcs.reg2);
+
+ if (ctrl.fma_write_unit == REG_WRITE_THREE)
+ printf("port 3: R%d (write FMA) ", srcs.reg3);
+ else if (ctrl.add_write_unit == REG_WRITE_THREE)
+ printf("port 3: R%d (write ADD) ", srcs.reg3);
+ else if (ctrl.read_reg3)
+ printf("port 3: R%d (read) ", srcs.reg3);
+
+ if (srcs.uniform_const) {
+ if (srcs.uniform_const & 0x80) {
+ printf("uniform: U%d", (srcs.uniform_const & 0x7f) * 2);
+ }
+ }
-void
-bi_disasm_dest_add(FILE *fp, struct bifrost_regs *next_regs, bool last)
-{
- /* If this is the last instruction, next_regs points to the first reg entry. */
- struct bifrost_reg_ctrl ctrl = DecodeRegCtrl(fp, *next_regs, last);
-
- if (ctrl.slot23.slot3 >= BIFROST_OP_WRITE && !ctrl.slot23.slot3_fma) {
- fprintf(fp, "r%u:t1", next_regs->reg3);
- bi_disasm_dest_mask(fp, ctrl.slot23.slot3);
- } else
- fprintf(fp, "t1");
+ printf("\n");
}
-
-static void dump_const_imm(FILE *fp, uint32_t imm)
+static void dump_const_imm(uint32_t imm)
{
union {
float f;
uint32_t i;
} fi;
fi.i = imm;
- fprintf(fp, "0x%08x /* %f */", imm, fi.f);
+ printf("0x%08x /* %f */", imm, fi.f);
}
-static void
-dump_pc_imm(FILE *fp, uint64_t imm, unsigned branch_offset, enum bi_constmod mod, bool high32)
+static uint64_t get_const(uint64_t *consts, struct bifrost_regs srcs)
{
- if (mod == BI_CONSTMOD_PC_HI && !high32) {
- dump_const_imm(fp, imm);
- return;
- }
-
- /* 60-bit sign-extend */
- uint64_t zx64 = (imm << 4);
- int64_t sx64 = zx64;
- sx64 >>= 4;
-
- /* 28-bit sign extend x 2 */
- uint32_t imm32[2] = { (uint32_t) imm, (uint32_t) (imm >> 32) };
- uint32_t zx32[2] = { imm32[0] << 4, imm32[1] << 4 };
- int32_t sx32[2] = { zx32[0], zx32[1] };
- sx32[0] >>= 4;
- sx32[1] >>= 4;
-
- int64_t offs = 0;
-
- switch (mod) {
- case BI_CONSTMOD_PC_LO:
- offs = sx64;
+ unsigned low_bits = srcs.uniform_const & 0xf;
+ uint64_t imm;
+ switch (srcs.uniform_const >> 4) {
+ case 4:
+ imm = consts[0];
+ break;
+ case 5:
+ imm = consts[1];
+ break;
+ case 6:
+ imm = consts[2];
break;
- case BI_CONSTMOD_PC_HI:
- offs = sx32[1];
+ case 7:
+ imm = consts[3];
break;
- case BI_CONSTMOD_PC_LO_HI:
- offs = sx32[high32];
+ case 2:
+ imm = consts[4];
+ break;
+ case 3:
+ imm = consts[5];
break;
default:
- unreachable("Invalid PC modifier");
+ assert(0);
+ break;
}
-
- assert((offs & 15) == 0);
- fprintf(fp, "clause_%" PRId64, branch_offset + (offs / 16));
-
- if (mod == BI_CONSTMOD_PC_LO && high32)
- fprintf(fp, " >> 32");
-
- /* While technically in spec, referencing the current clause as (pc +
- * 0) likely indicates an unintended infinite loop */
- if (offs == 0)
- fprintf(fp, " /* XXX: likely an infinite loop */");
-}
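The 60-bit and 28-bit sign extensions removed above use the standard shift-up/arithmetic-shift-down idiom. A self-contained sketch of that idiom (the helper name is ours, not the disassembler's); like the original, it relies on the implementation-defined but ubiquitous arithmetic right shift of signed integers:

	#include <assert.h>
	#include <stdint.h>

	/* Sign-extend the low `bits` bits of `v` by shifting them to the top of a
	 * signed 64-bit value and arithmetic-shifting back down. */
	static int64_t sign_extend(uint64_t v, unsigned bits)
	{
	        unsigned shift = 64 - bits;
	        return (int64_t)(v << shift) >> shift;
	}

	int main(void)
	{
	        assert(sign_extend(0xF, 4) == -1);
	        assert(sign_extend(0x7, 4) == 7);
	        assert(sign_extend(0x0FFFFFFFFFFFFFFFull, 60) == -1);
	        return 0;
	}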
-
-/* Convert an index to an embedded constant in FAU-RAM to the index of the
- * embedded constant. No, it's not in order. Yes, really. */
-
-static unsigned
-const_fau_to_idx(unsigned fau_value)
-{
- unsigned map[8] = {
- ~0, ~0, 4, 5, 0, 1, 2, 3
- };
-
- assert(map[fau_value] < 6);
- return map[fau_value];
+ return imm | low_bits;
}
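A concrete reading of the encoding handled by get_const() above, with purely illustrative values: uniform_const = 0x53 has top nibble 5, which selects consts[1], and its low nibble 3 is OR'd into the low bits of that constant.

	#include <assert.h>
	#include <stdint.h>

	int main(void)
	{
	        /* Made-up embedded constants, as the clause decoder would collect them. */
	        uint64_t consts[6] = { 0x1111, 0xAAAAAAAA00000000ull, 0x3333,
	                               0x4444, 0x5555, 0x6666 };

	        /* uniform_const = 0x53: top nibble 5 -> consts[1] in the switch above. */
	        uint8_t uniform_const = 0x53;
	        uint64_t imm = consts[1];
	        imm |= (uniform_const & 0xf);            /* OR in the low nibble */

	        assert(imm == 0xAAAAAAAA00000003ull);
	        return 0;
	}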
-static void dump_fau_src(FILE *fp, struct bifrost_regs srcs, unsigned branch_offset, struct bi_constants *consts, bool high32)
+static void dump_uniform_const_src(struct bifrost_regs srcs, uint64_t *consts, bool high32)
{
- if (srcs.fau_idx & 0x80) {
- unsigned uniform = (srcs.fau_idx & 0x7f);
- fprintf(fp, "u%u.w%u", uniform, high32);
- } else if (srcs.fau_idx >= 0x20) {
- unsigned idx = const_fau_to_idx(srcs.fau_idx >> 4);
- uint64_t imm = consts->raw[idx];
- imm |= (srcs.fau_idx & 0xf);
- if (consts->mods[idx] != BI_CONSTMOD_NONE)
- dump_pc_imm(fp, imm, branch_offset, consts->mods[idx], high32);
- else if (high32)
- dump_const_imm(fp, imm >> 32);
+ if (srcs.uniform_const & 0x80) {
+ unsigned uniform = (srcs.uniform_const & 0x7f) * 2;
+ printf("U%d", uniform + (high32 ? 1 : 0));
+ } else if (srcs.uniform_const >= 0x20) {
+ uint64_t imm = get_const(consts, srcs);
+ if (high32)
+ dump_const_imm(imm >> 32);
else
- dump_const_imm(fp, imm);
+ dump_const_imm(imm);
} else {
- switch (srcs.fau_idx) {
+ switch (srcs.uniform_const) {
case 0:
- fprintf(fp, "#0");
- break;
- case 1:
- fprintf(fp, "lane_id");
- break;
- case 2:
- fprintf(fp, "warp_id");
- break;
- case 3:
- fprintf(fp, "core_id");
- break;
- case 4:
- fprintf(fp, "framebuffer_size");
+ printf("0");
break;
case 5:
- fprintf(fp, "atest_datum");
+ printf("atest-data");
break;
case 6:
- fprintf(fp, "sample");
+ printf("sample-ptr");
break;
case 8:
case 9:
@@ -347,113 +429,1640 @@ static void dump_fau_src(FILE *fp, struct bifrost_regs srcs, unsigned branch_off
case 13:
case 14:
case 15:
- fprintf(fp, "blend_descriptor_%u", (unsigned) srcs.fau_idx - 8);
+ printf("blend-descriptor%u", (unsigned) srcs.uniform_const - 8);
break;
default:
- fprintf(fp, "XXX - reserved%u", (unsigned) srcs.fau_idx);
+ printf("unkConst%u", (unsigned) srcs.uniform_const);
break;
}
if (high32)
- fprintf(fp, ".y");
+ printf(".y");
else
- fprintf(fp, ".x");
+ printf(".x");
}
}
-void
-dump_src(FILE *fp, unsigned src, struct bifrost_regs srcs, unsigned branch_offset, struct bi_constants *consts, bool isFMA)
+static void dump_src(unsigned src, struct bifrost_regs srcs, uint64_t *consts, bool isFMA)
{
switch (src) {
case 0:
- fprintf(fp, "r%u", get_reg0(srcs));
+ printf("R%d", get_reg0(srcs));
break;
case 1:
- fprintf(fp, "r%u", get_reg1(srcs));
+ printf("R%d", get_reg1(srcs));
break;
case 2:
- fprintf(fp, "r%u", srcs.reg2);
+ printf("R%d", srcs.reg3);
break;
case 3:
if (isFMA)
- fprintf(fp, "#0");
+ printf("0");
else
- fprintf(fp, "t"); // i.e. the output of FMA this cycle
+ printf("T"); // i.e. the output of FMA this cycle
break;
case 4:
- dump_fau_src(fp, srcs, branch_offset, consts, false);
+ dump_uniform_const_src(srcs, consts, false);
break;
case 5:
- dump_fau_src(fp, srcs, branch_offset, consts, true);
+ dump_uniform_const_src(srcs, consts, true);
break;
case 6:
- fprintf(fp, "t0");
+ printf("T0");
break;
case 7:
- fprintf(fp, "t1");
+ printf("T1");
break;
}
}
-/* Tables for decoding M0, or if M0 == 7, M1 respectively.
- *
- * XXX: It's not clear if the third entry of M1_table corresponding to (7, 2)
- * should have PC_LO_HI in the EC1 slot, or it's a weird hybrid mode? I would
- * say this needs testing but no code should ever actually use this mode.
- */
+static void dump_output_mod(unsigned mod)
+{
+ switch (mod) {
+ case 0:
+ break;
+ case 1:
+ printf(".clamp_0_inf");
+ break; // max(out, 0)
+ case 2:
+ printf(".clamp_m1_1");
+ break; // clamp(out, -1, 1)
+ case 3:
+ printf(".clamp_0_1");
+ break; // clamp(out, 0, 1)
+ default:
+ break;
+ }
+}
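A minimal host-side sketch (editorial addition, not part of the Mesa file) of what the three output modifiers decoded above do to a result; fmaxf/fminf stand in for the hardware clamps:

#include <math.h>

/* Sketch: host equivalents of the modifiers printed by dump_output_mod().
 * Illustrative only; NaN handling may differ from the hardware. */
static float apply_outmod(float out, unsigned mod)
{
        switch (mod) {
        case 1: return fmaxf(out, 0.0f);                /* .clamp_0_inf */
        case 2: return fminf(fmaxf(out, -1.0f), 1.0f);  /* .clamp_m1_1  */
        case 3: return fminf(fmaxf(out, 0.0f), 1.0f);   /* .clamp_0_1   */
        default: return out;                            /* no modifier  */
        }
}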
-static const enum bi_constmod M1_table[7][2] = {
- { BI_CONSTMOD_NONE, BI_CONSTMOD_NONE },
- { BI_CONSTMOD_PC_LO, BI_CONSTMOD_NONE },
- { BI_CONSTMOD_PC_LO, BI_CONSTMOD_PC_LO },
- { ~0, ~0 },
- { BI_CONSTMOD_PC_HI, BI_CONSTMOD_NONE },
- { BI_CONSTMOD_PC_HI, BI_CONSTMOD_PC_HI },
- { BI_CONSTMOD_PC_LO, BI_CONSTMOD_NONE },
-};
+static void dump_minmax_mode(unsigned mod)
+{
+ switch (mod) {
+ case 0:
+ /* Same as fmax() and fmin() -- return the other number if any
+ * number is NaN. Also always return +0 if one argument is +0 and
+ * the other is -0.
+ */
+ break;
+ case 1:
+ /* Instead of never returning a NaN, always return one. The
+ * "greater"/"lesser" NaN is always returned, first by checking the
+ * sign and then the mantissa bits.
+ */
+ printf(".nan_wins");
+ break;
+ case 2:
+ /* For max, implement src0 > src1 ? src0 : src1
+ * For min, implement src0 < src1 ? src0 : src1
+ *
+ * This includes handling NaN's and signedness of 0 differently
+ * from above, since +0 and -0 compare equal and comparisons always
+ * return false for NaN's. As a result, this mode is *not*
+ * commutative.
+ */
+ printf(".src1_wins");
+ break;
+ case 3:
+ /* For max, implement src0 < src1 ? src1 : src0
+ * For min, implement src0 > src1 ? src1 : src0
+ */
+ printf(".src0_wins");
+ break;
+ default:
+ break;
+ }
+}
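As a hedged illustration (editorial, not from the file) of how the ".src1_wins" mode described above differs from the default IEEE-like behaviour, a tiny host-side comparison; the helper names are invented:

#include <math.h>
#include <stdio.h>

/* Default mode: like fmaxf(), the non-NaN operand wins. */
static float max_default(float a, float b)
{
        return fmaxf(a, b);
}

/* ".src1_wins" mode: a plain (src0 > src1 ? src0 : src1), so any
 * comparison involving NaN is false and src1 is returned; hence the
 * mode is not commutative. */
static float max_src1_wins(float src0, float src1)
{
        return src0 > src1 ? src0 : src1;
}

int main(void)
{
        float q = nanf("");
        printf("%g %g\n", max_default(1.0f, q), max_src1_wins(1.0f, q));
        /* fmaxf() still yields 1, but src1_wins yields NaN */
        return 0;
}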
+
+static void dump_round_mode(unsigned mod)
+{
+ switch (mod) {
+ case 0:
+ /* roundTiesToEven, the IEEE default. */
+ break;
+ case 1:
+ /* roundTowardPositive in the IEEE spec. */
+ printf(".round_pos");
+ break;
+ case 2:
+ /* roundTowardNegative in the IEEE spec. */
+ printf(".round_neg");
+ break;
+ case 3:
+ /* roundTowardZero in the IEEE spec. */
+ printf(".round_zero");
+ break;
+ default:
+ break;
+ }
+}
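For reference, a hedged sketch (not part of the source) mapping the 2-bit round-mode field above onto the standard C99 fenv rounding modes, assuming the IEEE names in the comments are exact:

#include <fenv.h>

/* Hypothetical helper: translate the round-mode field decoded by
 * dump_round_mode() into a C99 fenv constant.  Purely illustrative;
 * the GPU obviously does not go through fenv. */
static int bifrost_round_to_fenv(unsigned mod)
{
        switch (mod) {
        case 0: return FE_TONEAREST;   /* roundTiesToEven */
        case 1: return FE_UPWARD;      /* .round_pos      */
        case 2: return FE_DOWNWARD;    /* .round_neg      */
        case 3: return FE_TOWARDZERO;  /* .round_zero     */
        default: return FE_TONEAREST;
        }
}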
-static const enum bi_constmod M2_table[4][2] = {
- { BI_CONSTMOD_PC_LO_HI, BI_CONSTMOD_NONE },
- { BI_CONSTMOD_PC_LO_HI, BI_CONSTMOD_PC_HI },
- { BI_CONSTMOD_PC_LO_HI, BI_CONSTMOD_PC_LO_HI },
- { BI_CONSTMOD_PC_LO_HI, BI_CONSTMOD_PC_HI },
+static const struct fma_op_info FMAOpInfos[] = {
+ { 0x00000, "FMA.f32", FMA_FMA },
+ { 0x40000, "MAX.f32", FMA_FMINMAX },
+ { 0x44000, "MIN.f32", FMA_FMINMAX },
+ { 0x48000, "FCMP.GL", FMA_FCMP },
+ { 0x4c000, "FCMP.D3D", FMA_FCMP },
+ { 0x4ff98, "ADD.i32", FMA_TWO_SRC },
+ { 0x4ffd8, "SUB.i32", FMA_TWO_SRC },
+ { 0x4fff0, "SUBB.i32", FMA_TWO_SRC },
+ { 0x50000, "FMA_MSCALE", FMA_FMA_MSCALE },
+ { 0x58000, "ADD.f32", FMA_FADD },
+ { 0x5c000, "CSEL.FEQ.f32", FMA_FOUR_SRC },
+ { 0x5c200, "CSEL.FGT.f32", FMA_FOUR_SRC },
+ { 0x5c400, "CSEL.FGE.f32", FMA_FOUR_SRC },
+ { 0x5c600, "CSEL.IEQ.f32", FMA_FOUR_SRC },
+ { 0x5c800, "CSEL.IGT.i32", FMA_FOUR_SRC },
+ { 0x5ca00, "CSEL.IGE.i32", FMA_FOUR_SRC },
+ { 0x5cc00, "CSEL.UGT.i32", FMA_FOUR_SRC },
+ { 0x5ce00, "CSEL.UGE.i32", FMA_FOUR_SRC },
+ { 0x5d8d0, "ICMP.D3D.GT.v2i16", FMA_TWO_SRC },
+ { 0x5d9d0, "UCMP.D3D.GT.v2i16", FMA_TWO_SRC },
+ { 0x5dad0, "ICMP.D3D.GE.v2i16", FMA_TWO_SRC },
+ { 0x5dbd0, "UCMP.D3D.GE.v2i16", FMA_TWO_SRC },
+ { 0x5dcd0, "ICMP.D3D.EQ.v2i16", FMA_TWO_SRC },
+ { 0x5de40, "ICMP.GL.GT.i32", FMA_TWO_SRC }, // src0 > src1 ? 1 : 0
+ { 0x5de48, "ICMP.GL.GE.i32", FMA_TWO_SRC },
+ { 0x5de50, "UCMP.GL.GT.i32", FMA_TWO_SRC },
+ { 0x5de58, "UCMP.GL.GE.i32", FMA_TWO_SRC },
+ { 0x5de60, "ICMP.GL.EQ.i32", FMA_TWO_SRC },
+ { 0x5dec0, "ICMP.D3D.GT.i32", FMA_TWO_SRC }, // src0 > src1 ? ~0 : 0
+ { 0x5dec8, "ICMP.D3D.GE.i32", FMA_TWO_SRC },
+ { 0x5ded0, "UCMP.D3D.GT.i32", FMA_TWO_SRC },
+ { 0x5ded8, "UCMP.D3D.GE.i32", FMA_TWO_SRC },
+ { 0x5dee0, "ICMP.D3D.EQ.i32", FMA_TWO_SRC },
+ { 0x60200, "RSHIFT_NAND.i32", FMA_THREE_SRC },
+ { 0x603c0, "RSHIFT_NAND.v2i16", FMA_THREE_SRC },
+ { 0x60e00, "RSHIFT_OR.i32", FMA_THREE_SRC },
+ { 0x60fc0, "RSHIFT_OR.v2i16", FMA_THREE_SRC },
+ { 0x61200, "RSHIFT_AND.i32", FMA_THREE_SRC },
+ { 0x613c0, "RSHIFT_AND.v2i16", FMA_THREE_SRC },
+ { 0x61e00, "RSHIFT_NOR.i32", FMA_THREE_SRC }, // ~((src0 << src2) | src1)
+ { 0x61fc0, "RSHIFT_NOR.v2i16", FMA_THREE_SRC }, // ~((src0 << src2) | src1)
+ { 0x62200, "LSHIFT_NAND.i32", FMA_THREE_SRC },
+ { 0x623c0, "LSHIFT_NAND.v2i16", FMA_THREE_SRC },
+ { 0x62e00, "LSHIFT_OR.i32", FMA_THREE_SRC }, // (src0 << src2) | src1
+ { 0x62fc0, "LSHIFT_OR.v2i16", FMA_THREE_SRC }, // (src0 << src2) | src1
+ { 0x63200, "LSHIFT_AND.i32", FMA_THREE_SRC }, // (src0 << src2) & src1
+ { 0x633c0, "LSHIFT_AND.v2i16", FMA_THREE_SRC },
+ { 0x63e00, "LSHIFT_NOR.i32", FMA_THREE_SRC },
+ { 0x63fc0, "LSHIFT_NOR.v2i16", FMA_THREE_SRC },
+ { 0x64200, "RSHIFT_XOR.i32", FMA_THREE_SRC },
+ { 0x643c0, "RSHIFT_XOR.v2i16", FMA_THREE_SRC },
+ { 0x64600, "RSHIFT_XNOR.i32", FMA_THREE_SRC }, // ~((src0 >> src2) ^ src1)
+ { 0x647c0, "RSHIFT_XNOR.v2i16", FMA_THREE_SRC }, // ~((src0 >> src2) ^ src1)
+ { 0x64a00, "LSHIFT_XOR.i32", FMA_THREE_SRC },
+ { 0x64bc0, "LSHIFT_XOR.v2i16", FMA_THREE_SRC },
+ { 0x64e00, "LSHIFT_XNOR.i32", FMA_THREE_SRC }, // ~((src0 >> src2) ^ src1)
+ { 0x64fc0, "LSHIFT_XNOR.v2i16", FMA_THREE_SRC }, // ~((src0 >> src2) ^ src1)
+ { 0x65200, "LSHIFT_ADD.i32", FMA_THREE_SRC },
+ { 0x65600, "LSHIFT_SUB.i32", FMA_THREE_SRC }, // (src0 << src2) - src1
+ { 0x65a00, "LSHIFT_RSUB.i32", FMA_THREE_SRC }, // src1 - (src0 << src2)
+ { 0x65e00, "RSHIFT_ADD.i32", FMA_THREE_SRC },
+ { 0x66200, "RSHIFT_SUB.i32", FMA_THREE_SRC },
+ { 0x66600, "RSHIFT_RSUB.i32", FMA_THREE_SRC },
+ { 0x66a00, "ARSHIFT_ADD.i32", FMA_THREE_SRC },
+ { 0x66e00, "ARSHIFT_SUB.i32", FMA_THREE_SRC },
+ { 0x67200, "ARSHIFT_RSUB.i32", FMA_THREE_SRC },
+ { 0x80000, "FMA.v2f16", FMA_FMA16 },
+ { 0xc0000, "MAX.v2f16", FMA_FMINMAX16 },
+ { 0xc4000, "MIN.v2f16", FMA_FMINMAX16 },
+ { 0xc8000, "FCMP.GL", FMA_FCMP16 },
+ { 0xcc000, "FCMP.D3D", FMA_FCMP16 },
+ { 0xcf900, "ADD.v2i16", FMA_TWO_SRC },
+ { 0xcfc10, "ADDC.i32", FMA_TWO_SRC },
+ { 0xcfd80, "ADD.i32.i16.X", FMA_TWO_SRC },
+ { 0xcfd90, "ADD.i32.u16.X", FMA_TWO_SRC },
+ { 0xcfdc0, "ADD.i32.i16.Y", FMA_TWO_SRC },
+ { 0xcfdd0, "ADD.i32.u16.Y", FMA_TWO_SRC },
+ { 0xd8000, "ADD.v2f16", FMA_FADD16 },
+ { 0xdc000, "CSEL.FEQ.v2f16", FMA_FOUR_SRC },
+ { 0xdc200, "CSEL.FGT.v2f16", FMA_FOUR_SRC },
+ { 0xdc400, "CSEL.FGE.v2f16", FMA_FOUR_SRC },
+ { 0xdc600, "CSEL.IEQ.v2f16", FMA_FOUR_SRC },
+ { 0xdc800, "CSEL.IGT.v2i16", FMA_FOUR_SRC },
+ { 0xdca00, "CSEL.IGE.v2i16", FMA_FOUR_SRC },
+ { 0xdcc00, "CSEL.UGT.v2i16", FMA_FOUR_SRC },
+ { 0xdce00, "CSEL.UGE.v2i16", FMA_FOUR_SRC },
+ { 0xdd000, "F32_TO_F16", FMA_TWO_SRC },
+ { 0xe0046, "F16_TO_I16.XX", FMA_ONE_SRC },
+ { 0xe0047, "F16_TO_U16.XX", FMA_ONE_SRC },
+ { 0xe004e, "F16_TO_I16.YX", FMA_ONE_SRC },
+ { 0xe004f, "F16_TO_U16.YX", FMA_ONE_SRC },
+ { 0xe0056, "F16_TO_I16.XY", FMA_ONE_SRC },
+ { 0xe0057, "F16_TO_U16.XY", FMA_ONE_SRC },
+ { 0xe005e, "F16_TO_I16.YY", FMA_ONE_SRC },
+ { 0xe005f, "F16_TO_U16.YY", FMA_ONE_SRC },
+ { 0xe00c0, "I16_TO_F16.XX", FMA_ONE_SRC },
+ { 0xe00c1, "U16_TO_F16.XX", FMA_ONE_SRC },
+ { 0xe00c8, "I16_TO_F16.YX", FMA_ONE_SRC },
+ { 0xe00c9, "U16_TO_F16.YX", FMA_ONE_SRC },
+ { 0xe00d0, "I16_TO_F16.XY", FMA_ONE_SRC },
+ { 0xe00d1, "U16_TO_F16.XY", FMA_ONE_SRC },
+ { 0xe00d8, "I16_TO_F16.YY", FMA_ONE_SRC },
+ { 0xe00d9, "U16_TO_F16.YY", FMA_ONE_SRC },
+ { 0xe0136, "F32_TO_I32", FMA_ONE_SRC },
+ { 0xe0137, "F32_TO_U32", FMA_ONE_SRC },
+ { 0xe0178, "I32_TO_F32", FMA_ONE_SRC },
+ { 0xe0179, "U32_TO_F32", FMA_ONE_SRC },
+ { 0xe0198, "I16_TO_I32.X", FMA_ONE_SRC },
+ { 0xe0199, "U16_TO_U32.X", FMA_ONE_SRC },
+ { 0xe019a, "I16_TO_I32.Y", FMA_ONE_SRC },
+ { 0xe019b, "U16_TO_U32.Y", FMA_ONE_SRC },
+ { 0xe019c, "I16_TO_F32.X", FMA_ONE_SRC },
+ { 0xe019d, "U16_TO_F32.X", FMA_ONE_SRC },
+ { 0xe019e, "I16_TO_F32.Y", FMA_ONE_SRC },
+ { 0xe019f, "U16_TO_F32.Y", FMA_ONE_SRC },
+ { 0xe01a2, "F16_TO_F32.X", FMA_ONE_SRC },
+ { 0xe01a3, "F16_TO_F32.Y", FMA_ONE_SRC },
+ { 0xe032c, "NOP", FMA_ONE_SRC },
+ { 0xe032d, "MOV", FMA_ONE_SRC },
+ { 0xe032f, "SWZ.YY.v2i16", FMA_ONE_SRC },
+ // From the ARM patent US20160364209A1:
+ // "Decompose v (the input) into numbers x1 and s such that v = x1 * 2^s,
+ // and x1 is a floating point value in a predetermined range where the
+ // value 1 is within the range and not at one extremity of the range (e.g.
+ // choose a range where 1 is towards middle of range)."
+ //
+ // This computes x1.
+ { 0xe0345, "LOG_FREXPM", FMA_ONE_SRC },
+ // Given a floating point number m * 2^e, returns m * 2^{-1}. This is
+ // exactly the same as the mantissa part of frexp().
+ { 0xe0365, "FRCP_FREXPM", FMA_ONE_SRC },
+ // Given a floating point number m * 2^e, returns m * 2^{-2} if e is even,
+ // and m * 2^{-1} if e is odd. In other words, scales by powers of 4 until
+ // within the range [0.25, 1). Used for square-root and reciprocal
+ // square-root.
+ { 0xe0375, "FSQRT_FREXPM", FMA_ONE_SRC },
+ // Given a floating point number m * 2^e, computes -e - 1 as an integer.
+ // Zero and infinity/NaN return 0.
+ { 0xe038d, "FRCP_FREXPE", FMA_ONE_SRC },
+ // Computes floor(e/2) + 1.
+ { 0xe03a5, "FSQRT_FREXPE", FMA_ONE_SRC },
+ // Given a floating point number m * 2^e, computes -floor(e/2) - 1 as an
+ // integer.
+ { 0xe03ad, "FRSQ_FREXPE", FMA_ONE_SRC },
+ { 0xe03c5, "LOG_FREXPE", FMA_ONE_SRC },
+ { 0xe03fa, "CLZ", FMA_ONE_SRC },
+ { 0xe0b80, "IMAX3", FMA_THREE_SRC },
+ { 0xe0bc0, "UMAX3", FMA_THREE_SRC },
+ { 0xe0c00, "IMIN3", FMA_THREE_SRC },
+ { 0xe0c40, "UMIN3", FMA_THREE_SRC },
+ { 0xe0ec5, "ROUND", FMA_ONE_SRC },
+ { 0xe0f40, "CSEL", FMA_THREE_SRC }, // src2 != 0 ? src1 : src0
+ { 0xe0fc0, "MUX.i32", FMA_THREE_SRC }, // see ADD comment
+ { 0xe1805, "ROUNDEVEN", FMA_ONE_SRC },
+ { 0xe1845, "CEIL", FMA_ONE_SRC },
+ { 0xe1885, "FLOOR", FMA_ONE_SRC },
+ { 0xe18c5, "TRUNC", FMA_ONE_SRC },
+ { 0xe19b0, "ATAN_LDEXP.Y.f32", FMA_TWO_SRC },
+ { 0xe19b8, "ATAN_LDEXP.X.f32", FMA_TWO_SRC },
+ // These instructions in the FMA slot, together with LSHIFT_ADD_HIGH32.i32
+ // in the ADD slot, allow one to do a 64-bit addition with an extra small
+ // shift on one of the sources. There are three possible scenarios:
+ //
+ // 1) Full 64-bit addition. Do:
+ // out.x = LSHIFT_ADD_LOW32.i64 src1.x, src2.x, shift
+ // out.y = LSHIFT_ADD_HIGH32.i32 src1.y, src2.y
+ //
+ // The shift amount is applied to src2 before adding. The shift amount, and
+ // any extra bits from src2 plus the overflow bit, are sent directly from
+ // FMA to ADD instead of being passed explicitly. Hence, these two must be
+ // bundled together into the same instruction.
+ //
+ // 2) Add a 64-bit value src1 to a zero-extended 32-bit value src2. Do:
+ // out.x = LSHIFT_ADD_LOW32.u32 src1.x, src2, shift
+ // out.y = LSHIFT_ADD_HIGH32.i32 src1.x, 0
+ //
+ // Note that in this case, the second argument to LSHIFT_ADD_HIGH32 is
+ // ignored, so it can actually be anything. As before, the shift is applied
+ // to src2 before adding.
+ //
+ // 3) Add a 64-bit value to a sign-extended 32-bit value src2. Do:
+ // out.x = LSHIFT_ADD_LOW32.i32 src1.x, src2, shift
+ // out.y = LSHIFT_ADD_HIGH32.i32 src1.x, 0
+ //
+ // The only difference is the .i32 instead of .u32. Otherwise, this is
+ // exactly the same as before.
+ //
+ // In all these instructions, the shift amount is stored where the third
+ // source would be, so the shift has to be a small immediate from 0 to 7.
+ // This is fine for the expected use-case of these instructions, which is
+ // manipulating 64-bit pointers.
+ //
+ // These instructions can also be combined with various load/store
+ // instructions which normally take a 64-bit pointer in order to add a
+ // 32-bit or 64-bit offset to the pointer before doing the operation,
+ // optionally shifting the offset. The load/store op implicitly does
+ // LSHIFT_ADD_HIGH32.i32 internally. Letting ptr be the pointer, and offset
+ // the desired offset, the cases go as follows:
+ //
+ // 1) Add a 64-bit offset:
+ // LSHIFT_ADD_LOW32.i64 ptr.x, offset.x, shift
+ // ld_st_op ptr.y, offset.y, ...
+ //
+ // Note that the output of LSHIFT_ADD_LOW32.i64 is not used, instead being
+ // implicitly sent to the load/store op to serve as the low 32 bits of the
+ // pointer.
+ //
+ // 2) Add a 32-bit unsigned offset:
+ // temp = LSHIFT_ADD_LOW32.u32 ptr.x, offset, shift
+ // ld_st_op temp, ptr.y, ...
+ //
+ // Now, the low 32 bits of offset << shift + ptr are passed explicitly to
+ // the ld_st_op, to match the case where there is no offset and ld_st_op is
+ // called directly.
+ //
+ // 3) Add a 32-bit signed offset:
+ // temp = LSHIFT_ADD_LOW32.i32 ptr.x, offset, shift
+ // ld_st_op temp, ptr.y, ...
+ //
+ // Again, the same as the unsigned case except for the offset.
+ { 0xe1c80, "LSHIFT_ADD_LOW32.u32", FMA_SHIFT_ADD64 },
+ { 0xe1cc0, "LSHIFT_ADD_LOW32.i64", FMA_SHIFT_ADD64 },
+ { 0xe1d80, "LSHIFT_ADD_LOW32.i32", FMA_SHIFT_ADD64 },
+ { 0xe1e00, "SEL.XX.i16", FMA_TWO_SRC },
+ { 0xe1e08, "SEL.YX.i16", FMA_TWO_SRC },
+ { 0xe1e10, "SEL.XY.i16", FMA_TWO_SRC },
+ { 0xe1e18, "SEL.YY.i16", FMA_TWO_SRC },
+ { 0xe7800, "IMAD", FMA_THREE_SRC },
+ { 0xe78db, "POPCNT", FMA_ONE_SRC },
};
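A rough host-side model (editorial, not from the file) of scenario 1 in the LSHIFT_ADD comment block above: a full 64-bit add with a small pre-shift on the second source, with the carry that the hardware forwards from the FMA slot to the ADD slot made explicit:

#include <stdint.h>

/* Sketch of the LSHIFT_ADD_LOW32.i64 / LSHIFT_ADD_HIGH32.i32 pairing:
 * shift src2 by 0-7 bits, then do a 64-bit add split into two 32-bit
 * halves joined by a carry. */
static uint64_t lshift_add64(uint64_t src1, uint64_t src2, unsigned shift)
{
        uint64_t shifted = src2 << shift;

        /* LSHIFT_ADD_LOW32.i64: adds the low words; the carry travels to
         * the ADD slot implicitly on the real hardware. */
        uint32_t lo = (uint32_t) src1 + (uint32_t) shifted;
        uint32_t carry = lo < (uint32_t) src1;

        /* LSHIFT_ADD_HIGH32.i32: adds the high words plus that carry. */
        uint32_t hi = (uint32_t) (src1 >> 32) + (uint32_t) (shifted >> 32) + carry;

        return ((uint64_t) hi << 32) | lo;
}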
-static void
-decode_M(enum bi_constmod *mod, unsigned M1, unsigned M2, bool single)
+static struct fma_op_info find_fma_op_info(unsigned op)
{
- if (M1 >= 8) {
- mod[0] = BI_CONSTMOD_NONE;
+ for (unsigned i = 0; i < ARRAY_SIZE(FMAOpInfos); i++) {
+ unsigned opCmp = ~0;
+ switch (FMAOpInfos[i].src_type) {
+ case FMA_ONE_SRC:
+ opCmp = op;
+ break;
+ case FMA_TWO_SRC:
+ opCmp = op & ~0x7;
+ break;
+ case FMA_FCMP:
+ case FMA_FCMP16:
+ opCmp = op & ~0x1fff;
+ break;
+ case FMA_THREE_SRC:
+ case FMA_SHIFT_ADD64:
+ opCmp = op & ~0x3f;
+ break;
+ case FMA_FADD:
+ case FMA_FMINMAX:
+ case FMA_FADD16:
+ case FMA_FMINMAX16:
+ opCmp = op & ~0x3fff;
+ break;
+ case FMA_FMA:
+ case FMA_FMA16:
+ opCmp = op & ~0x3ffff;
+ break;
+ case FMA_FOUR_SRC:
+ opCmp = op & ~0x1ff;
+ break;
+ case FMA_FMA_MSCALE:
+ opCmp = op & ~0x7fff;
+ break;
+ default:
+ opCmp = ~0;
+ break;
+ }
+ if (FMAOpInfos[i].op == opCmp)
+ return FMAOpInfos[i];
+ }
- if (!single)
- mod[1] = BI_CONSTMOD_NONE;
+ struct fma_op_info info;
+ snprintf(info.name, sizeof(info.name), "op%04x", op);
+ info.op = op;
+ info.src_type = FMA_THREE_SRC;
+ return info;
+}
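To make the masking above concrete, a tiny worked example; the opcode value is invented for illustration:

#include <assert.h>

/* Worked example of find_fma_op_info(): for an FMA_TWO_SRC entry the
 * low 3 bits of the opcode select the second source, so they are
 * masked off before comparing against the table. */
static void fma_lookup_example(void)
{
        unsigned op = 0x4ffd9;            /* hypothetical raw FMA opcode */
        unsigned table_key = op & ~0x7u;  /* FMA_TWO_SRC mask            */
        unsigned src1_sel  = op & 0x7u;   /* decoded separately as src1  */

        assert(table_key == 0x4ffd8);     /* matches the "SUB.i32" entry */
        assert(src1_sel == 1);            /* port 1, per dump_src()      */
}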
+static void dump_fcmp(unsigned op)
+{
+ switch (op) {
+ case 0:
+ printf(".OEQ");
+ break;
+ case 1:
+ printf(".OGT");
+ break;
+ case 2:
+ printf(".OGE");
+ break;
+ case 3:
+ printf(".UNE");
+ break;
+ case 4:
+ printf(".OLT");
+ break;
+ case 5:
+ printf(".OLE");
+ break;
+ default:
+ printf(".unk%d", op);
+ break;
+ }
+}
+
+static void dump_16swizzle(unsigned swiz)
+{
+ if (swiz == 2)
return;
- } else if (M1 == 7) {
- assert(M2 < 4);
- memcpy(mod, M2_table[M2], sizeof(*mod) * (single ? 1 : 2));
+ printf(".%c%c", "xy"[swiz & 1], "xy"[(swiz >> 1) & 1]);
+}
+
+static void dump_fma_expand_src0(unsigned ctrl)
+{
+ switch (ctrl) {
+ case 3:
+ case 4:
+ case 6:
+ printf(".x");
+ break;
+ case 5:
+ case 7:
+ printf(".y");
+ break;
+ case 0:
+ case 1:
+ case 2:
+ break;
+ default:
+ printf(".unk");
+ break;
+ }
+}
+
+static void dump_fma_expand_src1(unsigned ctrl)
+{
+ switch (ctrl) {
+ case 1:
+ case 3:
+ printf(".x");
+ break;
+ case 2:
+ case 4:
+ case 5:
+ printf(".y");
+ break;
+ case 0:
+ case 6:
+ case 7:
+ break;
+ default:
+ printf(".unk");
+ break;
+ }
+}
+
+static void dump_fma(uint64_t word, struct bifrost_regs regs, struct bifrost_regs next_regs, uint64_t *consts, bool verbose)
+{
+ if (verbose) {
+ printf("# FMA: %016" PRIx64 "\n", word);
+ }
+ struct bifrost_fma_inst FMA;
+ memcpy((char *) &FMA, (char *) &word, sizeof(struct bifrost_fma_inst));
+ struct fma_op_info info = find_fma_op_info(FMA.op);
+
+ printf("%s", info.name);
+ if (info.src_type == FMA_FADD ||
+ info.src_type == FMA_FMINMAX ||
+ info.src_type == FMA_FMA ||
+ info.src_type == FMA_FADD16 ||
+ info.src_type == FMA_FMINMAX16 ||
+ info.src_type == FMA_FMA16) {
+ dump_output_mod(bits(FMA.op, 12, 14));
+ switch (info.src_type) {
+ case FMA_FADD:
+ case FMA_FMA:
+ case FMA_FADD16:
+ case FMA_FMA16:
+ dump_round_mode(bits(FMA.op, 10, 12));
+ break;
+ case FMA_FMINMAX:
+ case FMA_FMINMAX16:
+ dump_minmax_mode(bits(FMA.op, 10, 12));
+ break;
+ default:
+ assert(0);
+ }
+ } else if (info.src_type == FMA_FCMP || info.src_type == FMA_FCMP16) {
+ dump_fcmp(bits(FMA.op, 10, 13));
+ if (info.src_type == FMA_FCMP)
+ printf(".f32");
+ else
+ printf(".v2f16");
+ } else if (info.src_type == FMA_FMA_MSCALE) {
+ if (FMA.op & (1 << 11)) {
+ switch ((FMA.op >> 9) & 0x3) {
+ case 0:
+ /* This mode seems to do a few things:
+ * - Makes 0 * infinity (and incidentally 0 * nan) return 0,
+ * since generating a nan would poison the result of
+ * 1/infinity and 1/0.
+ * - Fiddles with which nan is returned in nan * nan,
+ * presumably to make sure that the same exact nan is
+ * returned for 1/nan.
+ */
+ printf(".rcp_mode");
+ break;
+ case 3:
+ /* Similar to the above, but src0 always wins when multiplying
+ * 0 by infinity.
+ */
+ printf(".sqrt_mode");
+ break;
+ default:
+ printf(".unk%d_mode", (int) (FMA.op >> 9) & 0x3);
+ }
+ } else {
+ dump_output_mod(bits(FMA.op, 9, 11));
+ }
+ }
+
+ printf(" ");
+
+ struct bifrost_reg_ctrl next_ctrl = DecodeRegCtrl(next_regs);
+ if (next_ctrl.fma_write_unit != REG_WRITE_NONE) {
+ printf("{R%d, T0}, ", GetRegToWrite(next_ctrl.fma_write_unit, next_regs));
} else {
- assert(M1 != 3);
- memcpy(mod, M1_table[M1], sizeof(*mod) * (single ? 1 : 2));
+ printf("T0, ");
}
+
+ switch (info.src_type) {
+ case FMA_ONE_SRC:
+ dump_src(FMA.src0, regs, consts, true);
+ break;
+ case FMA_TWO_SRC:
+ dump_src(FMA.src0, regs, consts, true);
+ printf(", ");
+ dump_src(FMA.op & 0x7, regs, consts, true);
+ break;
+ case FMA_FADD:
+ case FMA_FMINMAX:
+ if (FMA.op & 0x10)
+ printf("-");
+ if (FMA.op & 0x200)
+ printf("abs(");
+ dump_src(FMA.src0, regs, consts, true);
+ dump_fma_expand_src0((FMA.op >> 6) & 0x7);
+ if (FMA.op & 0x200)
+ printf(")");
+ printf(", ");
+ if (FMA.op & 0x20)
+ printf("-");
+ if (FMA.op & 0x8)
+ printf("abs(");
+ dump_src(FMA.op & 0x7, regs, consts, true);
+ dump_fma_expand_src1((FMA.op >> 6) & 0x7);
+ if (FMA.op & 0x8)
+ printf(")");
+ break;
+ case FMA_FADD16:
+ case FMA_FMINMAX16: {
+ bool abs1 = FMA.op & 0x8;
+ bool abs2 = (FMA.op & 0x7) < FMA.src0;
+ if (FMA.op & 0x10)
+ printf("-");
+ if (abs1 || abs2)
+ printf("abs(");
+ dump_src(FMA.src0, regs, consts, true);
+ dump_16swizzle((FMA.op >> 6) & 0x3);
+ if (abs1 || abs2)
+ printf(")");
+ printf(", ");
+ if (FMA.op & 0x20)
+ printf("-");
+ if (abs1 && abs2)
+ printf("abs(");
+ dump_src(FMA.op & 0x7, regs, consts, true);
+ dump_16swizzle((FMA.op >> 8) & 0x3);
+ if (abs1 && abs2)
+ printf(")");
+ break;
+ }
+ case FMA_FCMP:
+ if (FMA.op & 0x200)
+ printf("abs(");
+ dump_src(FMA.src0, regs, consts, true);
+ dump_fma_expand_src0((FMA.op >> 6) & 0x7);
+ if (FMA.op & 0x200)
+ printf(")");
+ printf(", ");
+ if (FMA.op & 0x20)
+ printf("-");
+ if (FMA.op & 0x8)
+ printf("abs(");
+ dump_src(FMA.op & 0x7, regs, consts, true);
+ dump_fma_expand_src1((FMA.op >> 6) & 0x7);
+ if (FMA.op & 0x8)
+ printf(")");
+ break;
+ case FMA_FCMP16:
+ dump_src(FMA.src0, regs, consts, true);
+ // Note: this is kinda a guess, I haven't seen the blob set this to
+ // anything other than the identity, but it matches FMA_TWO_SRCFmod16
+ dump_16swizzle((FMA.op >> 6) & 0x3);
+ printf(", ");
+ dump_src(FMA.op & 0x7, regs, consts, true);
+ dump_16swizzle((FMA.op >> 8) & 0x3);
+ break;
+ case FMA_SHIFT_ADD64:
+ dump_src(FMA.src0, regs, consts, true);
+ printf(", ");
+ dump_src(FMA.op & 0x7, regs, consts, true);
+ printf(", ");
+ printf("shift:%u", (FMA.op >> 3) & 0x7);
+ break;
+ case FMA_THREE_SRC:
+ dump_src(FMA.src0, regs, consts, true);
+ printf(", ");
+ dump_src(FMA.op & 0x7, regs, consts, true);
+ printf(", ");
+ dump_src((FMA.op >> 3) & 0x7, regs, consts, true);
+ break;
+ case FMA_FMA:
+ if (FMA.op & (1 << 14))
+ printf("-");
+ if (FMA.op & (1 << 9))
+ printf("abs(");
+ dump_src(FMA.src0, regs, consts, true);
+ dump_fma_expand_src0((FMA.op >> 6) & 0x7);
+ if (FMA.op & (1 << 9))
+ printf(")");
+ printf(", ");
+ if (FMA.op & (1 << 16))
+ printf("abs(");
+ dump_src(FMA.op & 0x7, regs, consts, true);
+ dump_fma_expand_src1((FMA.op >> 6) & 0x7);
+ if (FMA.op & (1 << 16))
+ printf(")");
+ printf(", ");
+ if (FMA.op & (1 << 15))
+ printf("-");
+ if (FMA.op & (1 << 17))
+ printf("abs(");
+ dump_src((FMA.op >> 3) & 0x7, regs, consts, true);
+ if (FMA.op & (1 << 17))
+ printf(")");
+ break;
+ case FMA_FMA16:
+ if (FMA.op & (1 << 14))
+ printf("-");
+ dump_src(FMA.src0, regs, consts, true);
+ dump_16swizzle((FMA.op >> 6) & 0x3);
+ printf(", ");
+ dump_src(FMA.op & 0x7, regs, consts, true);
+ dump_16swizzle((FMA.op >> 8) & 0x3);
+ printf(", ");
+ if (FMA.op & (1 << 15))
+ printf("-");
+ dump_src((FMA.op >> 3) & 0x7, regs, consts, true);
+ dump_16swizzle((FMA.op >> 16) & 0x3);
+ break;
+ case FMA_FOUR_SRC:
+ dump_src(FMA.src0, regs, consts, true);
+ printf(", ");
+ dump_src(FMA.op & 0x7, regs, consts, true);
+ printf(", ");
+ dump_src((FMA.op >> 3) & 0x7, regs, consts, true);
+ printf(", ");
+ dump_src((FMA.op >> 6) & 0x7, regs, consts, true);
+ break;
+ case FMA_FMA_MSCALE:
+ if (FMA.op & (1 << 12))
+ printf("abs(");
+ dump_src(FMA.src0, regs, consts, true);
+ if (FMA.op & (1 << 12))
+ printf(")");
+ printf(", ");
+ if (FMA.op & (1 << 13))
+ printf("-");
+ dump_src(FMA.op & 0x7, regs, consts, true);
+ printf(", ");
+ if (FMA.op & (1 << 14))
+ printf("-");
+ dump_src((FMA.op >> 3) & 0x7, regs, consts, true);
+ printf(", ");
+ dump_src((FMA.op >> 6) & 0x7, regs, consts, true);
+ break;
+ }
+ printf("\n");
+}
+
+static const struct add_op_info add_op_infos[] = {
+ { 0x00000, "MAX.f32", ADD_FMINMAX },
+ { 0x02000, "MIN.f32", ADD_FMINMAX },
+ { 0x04000, "ADD.f32", ADD_FADD },
+ { 0x06000, "FCMP.GL", ADD_FCMP },
+ { 0x07000, "FCMP.D3D", ADD_FCMP },
+ { 0x07856, "F16_TO_I16", ADD_ONE_SRC },
+ { 0x07857, "F16_TO_U16", ADD_ONE_SRC },
+ { 0x078c0, "I16_TO_F16.XX", ADD_ONE_SRC },
+ { 0x078c1, "U16_TO_F16.XX", ADD_ONE_SRC },
+ { 0x078c8, "I16_TO_F16.YX", ADD_ONE_SRC },
+ { 0x078c9, "U16_TO_F16.YX", ADD_ONE_SRC },
+ { 0x078d0, "I16_TO_F16.XY", ADD_ONE_SRC },
+ { 0x078d1, "U16_TO_F16.XY", ADD_ONE_SRC },
+ { 0x078d8, "I16_TO_F16.YY", ADD_ONE_SRC },
+ { 0x078d9, "U16_TO_F16.YY", ADD_ONE_SRC },
+ { 0x07936, "F32_TO_I32", ADD_ONE_SRC },
+ { 0x07937, "F32_TO_U32", ADD_ONE_SRC },
+ { 0x07978, "I32_TO_F32", ADD_ONE_SRC },
+ { 0x07979, "U32_TO_F32", ADD_ONE_SRC },
+ { 0x07998, "I16_TO_I32.X", ADD_ONE_SRC },
+ { 0x07999, "U16_TO_U32.X", ADD_ONE_SRC },
+ { 0x0799a, "I16_TO_I32.Y", ADD_ONE_SRC },
+ { 0x0799b, "U16_TO_U32.Y", ADD_ONE_SRC },
+ { 0x0799c, "I16_TO_F32.X", ADD_ONE_SRC },
+ { 0x0799d, "U16_TO_F32.X", ADD_ONE_SRC },
+ { 0x0799e, "I16_TO_F32.Y", ADD_ONE_SRC },
+ { 0x0799f, "U16_TO_F32.Y", ADD_ONE_SRC },
+ // take the low 16 bits, and expand it to a 32-bit float
+ { 0x079a2, "F16_TO_F32.X", ADD_ONE_SRC },
+ // take the high 16 bits, ...
+ { 0x079a3, "F16_TO_F32.Y", ADD_ONE_SRC },
+ { 0x07b2b, "SWZ.YX.v2i16", ADD_ONE_SRC },
+ { 0x07b2c, "NOP", ADD_ONE_SRC },
+ { 0x07b29, "SWZ.XX.v2i16", ADD_ONE_SRC },
+ // Logically, this should be SWZ.XY, but that's equivalent to a move, and
+ // this seems to be the canonical way the blob generates a MOV.
+ { 0x07b2d, "MOV", ADD_ONE_SRC },
+ { 0x07b2f, "SWZ.YY.v2i16", ADD_ONE_SRC },
+ // Given a floating point number m * 2^e, returns m * 2^{-1}.
+ { 0x07b65, "FRCP_FREXPM", ADD_ONE_SRC },
+ { 0x07b75, "FSQRT_FREXPM", ADD_ONE_SRC },
+ { 0x07b8d, "FRCP_FREXPE", ADD_ONE_SRC },
+ { 0x07ba5, "FSQRT_FREXPE", ADD_ONE_SRC },
+ { 0x07bad, "FRSQ_FREXPE", ADD_ONE_SRC },
+ // From the ARM patent US20160364209A1:
+ // "Decompose v (the input) into numbers x1 and s such that v = x1 * 2^s,
+ // and x1 is a floating point value in a predetermined range where the
+ // value 1 is within the range and not at one extremity of the range (e.g.
+ // choose a range where 1 is towards middle of range)."
+ //
+ // This computes s.
+ { 0x07bc5, "FLOG_FREXPE", ADD_ONE_SRC },
+ { 0x07d45, "CEIL", ADD_ONE_SRC },
+ { 0x07d85, "FLOOR", ADD_ONE_SRC },
+ { 0x07dc5, "TRUNC", ADD_ONE_SRC },
+ { 0x07f18, "LSHIFT_ADD_HIGH32.i32", ADD_TWO_SRC },
+ { 0x08000, "LD_ATTR.f16", ADD_LOAD_ATTR, true },
+ { 0x08100, "LD_ATTR.v2f16", ADD_LOAD_ATTR, true },
+ { 0x08200, "LD_ATTR.v3f16", ADD_LOAD_ATTR, true },
+ { 0x08300, "LD_ATTR.v4f16", ADD_LOAD_ATTR, true },
+ { 0x08400, "LD_ATTR.f32", ADD_LOAD_ATTR, true },
+ { 0x08500, "LD_ATTR.v3f32", ADD_LOAD_ATTR, true },
+ { 0x08600, "LD_ATTR.v3f32", ADD_LOAD_ATTR, true },
+ { 0x08700, "LD_ATTR.v4f32", ADD_LOAD_ATTR, true },
+ { 0x08800, "LD_ATTR.i32", ADD_LOAD_ATTR, true },
+ { 0x08900, "LD_ATTR.v3i32", ADD_LOAD_ATTR, true },
+ { 0x08a00, "LD_ATTR.v3i32", ADD_LOAD_ATTR, true },
+ { 0x08b00, "LD_ATTR.v4i32", ADD_LOAD_ATTR, true },
+ { 0x08c00, "LD_ATTR.u32", ADD_LOAD_ATTR, true },
+ { 0x08d00, "LD_ATTR.v3u32", ADD_LOAD_ATTR, true },
+ { 0x08e00, "LD_ATTR.v3u32", ADD_LOAD_ATTR, true },
+ { 0x08f00, "LD_ATTR.v4u32", ADD_LOAD_ATTR, true },
+ { 0x0a000, "LD_VAR.32", ADD_VARYING_INTERP, true },
+ { 0x0b000, "TEX", ADD_TEX_COMPACT, true },
+ { 0x0c188, "LOAD.i32", ADD_TWO_SRC, true },
+ { 0x0c1a0, "LD_UBO.i32", ADD_TWO_SRC, true },
+ { 0x0c1b8, "LD_SCRATCH.v2i32", ADD_TWO_SRC, true },
+ { 0x0c1c8, "LOAD.v2i32", ADD_TWO_SRC, true },
+ { 0x0c1e0, "LD_UBO.v2i32", ADD_TWO_SRC, true },
+ { 0x0c1f8, "LD_SCRATCH.v2i32", ADD_TWO_SRC, true },
+ { 0x0c208, "LOAD.v4i32", ADD_TWO_SRC, true },
+ // src0 = offset, src1 = binding
+ { 0x0c220, "LD_UBO.v4i32", ADD_TWO_SRC, true },
+ { 0x0c238, "LD_SCRATCH.v4i32", ADD_TWO_SRC, true },
+ { 0x0c248, "STORE.v4i32", ADD_TWO_SRC, true },
+ { 0x0c278, "ST_SCRATCH.v4i32", ADD_TWO_SRC, true },
+ { 0x0c588, "STORE.i32", ADD_TWO_SRC, true },
+ { 0x0c5b8, "ST_SCRATCH.i32", ADD_TWO_SRC, true },
+ { 0x0c5c8, "STORE.v2i32", ADD_TWO_SRC, true },
+ { 0x0c5f8, "ST_SCRATCH.v2i32", ADD_TWO_SRC, true },
+ { 0x0c648, "LOAD.u16", ADD_TWO_SRC, true }, // zero-extends
+ { 0x0ca88, "LOAD.v3i32", ADD_TWO_SRC, true },
+ { 0x0caa0, "LD_UBO.v3i32", ADD_TWO_SRC, true },
+ { 0x0cab8, "LD_SCRATCH.v3i32", ADD_TWO_SRC, true },
+ { 0x0cb88, "STORE.v3i32", ADD_TWO_SRC, true },
+ { 0x0cbb8, "ST_SCRATCH.v3i32", ADD_TWO_SRC, true },
+ // *_FAST does not exist on G71 (added to G51, G72, and everything after)
+ { 0x0cc00, "FRCP_FAST.f32", ADD_ONE_SRC },
+ { 0x0cc20, "FRSQ_FAST.f32", ADD_ONE_SRC },
+ // Given a floating point number m * 2^e, produces a table-based
+ // approximation of 2/m using the top 17 bits. Includes special cases for
+ // infinity, NaN, and zero, and copies the sign bit.
+ { 0x0ce00, "FRCP_TABLE", ADD_ONE_SRC },
+ // Exists on G71
+ { 0x0ce10, "FRCP_FAST.f16.X", ADD_ONE_SRC },
+ // A similar table for inverse square root, using the high 17 bits of the
+ // mantissa as well as the low bit of the exponent.
+ { 0x0ce20, "FRSQ_TABLE", ADD_ONE_SRC },
+ { 0x0ce30, "FRCP_FAST.f16.Y", ADD_ONE_SRC },
+ { 0x0ce50, "FRSQ_FAST.f16.X", ADD_ONE_SRC },
+ // Used in the argument reduction for log. Given a floating-point number
+ // m * 2^e, uses the top 4 bits of m to produce an approximation to 1/m
+ // with the exponent forced to 0 and only the top 5 bits are nonzero. 0,
+ // infinity, and NaN all return 1.0.
+ // See the ARM patent for more information.
+ { 0x0ce60, "FRCP_APPROX", ADD_ONE_SRC },
+ { 0x0ce70, "FRSQ_FAST.f16.Y", ADD_ONE_SRC },
+ { 0x0cf40, "ATAN_ASSIST", ADD_TWO_SRC },
+ { 0x0cf48, "ATAN_TABLE", ADD_TWO_SRC },
+ { 0x0cf50, "SIN_TABLE", ADD_ONE_SRC },
+ { 0x0cf51, "COS_TABLE", ADD_ONE_SRC },
+ { 0x0cf58, "EXP_TABLE", ADD_ONE_SRC },
+ { 0x0cf60, "FLOG2_TABLE", ADD_ONE_SRC },
+ { 0x0cf64, "FLOGE_TABLE", ADD_ONE_SRC },
+ { 0x0d000, "BRANCH", ADD_BRANCH },
+ // For each bit i, return src2[i] ? src0[i] : src1[i]. In other words, this
+ // is the same as (src2 & src0) | (~src2 & src1).
+ { 0x0e8c0, "MUX", ADD_THREE_SRC },
+ { 0x0e9b0, "ATAN_LDEXP.Y.f32", ADD_TWO_SRC },
+ { 0x0e9b8, "ATAN_LDEXP.X.f32", ADD_TWO_SRC },
+ { 0x0ea60, "SEL.XX.i16", ADD_TWO_SRC },
+ { 0x0ea70, "SEL.XY.i16", ADD_TWO_SRC },
+ { 0x0ea68, "SEL.YX.i16", ADD_TWO_SRC },
+ { 0x0ea78, "SEL.YY.i16", ADD_TWO_SRC },
+ { 0x0ec00, "F32_TO_F16", ADD_TWO_SRC },
+ { 0x0f640, "ICMP.GL.GT", ADD_TWO_SRC }, // src0 > src1 ? 1 : 0
+ { 0x0f648, "ICMP.GL.GE", ADD_TWO_SRC },
+ { 0x0f650, "UCMP.GL.GT", ADD_TWO_SRC },
+ { 0x0f658, "UCMP.GL.GE", ADD_TWO_SRC },
+ { 0x0f660, "ICMP.GL.EQ", ADD_TWO_SRC },
+ { 0x0f6c0, "ICMP.D3D.GT", ADD_TWO_SRC }, // src0 > src1 ? ~0 : 0
+ { 0x0f6c8, "ICMP.D3D.GE", ADD_TWO_SRC },
+ { 0x0f6d0, "UCMP.D3D.GT", ADD_TWO_SRC },
+ { 0x0f6d8, "UCMP.D3D.GE", ADD_TWO_SRC },
+ { 0x0f6e0, "ICMP.D3D.EQ", ADD_TWO_SRC },
+ { 0x10000, "MAX.v2f16", ADD_FMINMAX16 },
+ { 0x11000, "ADD_MSCALE.f32", ADD_FADDMscale },
+ { 0x12000, "MIN.v2f16", ADD_FMINMAX16 },
+ { 0x14000, "ADD.v2f16", ADD_FADD16 },
+ { 0x17000, "FCMP.D3D", ADD_FCMP16 },
+ { 0x178c0, "ADD.i32", ADD_TWO_SRC },
+ { 0x17900, "ADD.v2i16", ADD_TWO_SRC },
+ { 0x17ac0, "SUB.i32", ADD_TWO_SRC },
+ { 0x17c10, "ADDC.i32", ADD_TWO_SRC }, // adds src0 to the bottom bit of src1
+ { 0x17d80, "ADD.i32.i16.X", ADD_TWO_SRC },
+ { 0x17d90, "ADD.i32.u16.X", ADD_TWO_SRC },
+ { 0x17dc0, "ADD.i32.i16.Y", ADD_TWO_SRC },
+ { 0x17dd0, "ADD.i32.u16.Y", ADD_TWO_SRC },
+ // Compute varying address and datatype (for storing in the vertex shader),
+ // and store the vec3 result in the data register. The result is passed as
+ // the 3 normal arguments to ST_VAR.
+ { 0x18000, "LD_VAR_ADDR.f16", ADD_VARYING_ADDRESS, true },
+ { 0x18100, "LD_VAR_ADDR.f32", ADD_VARYING_ADDRESS, true },
+ { 0x18200, "LD_VAR_ADDR.i32", ADD_VARYING_ADDRESS, true },
+ { 0x18300, "LD_VAR_ADDR.u32", ADD_VARYING_ADDRESS, true },
+ // Implements alpha-to-coverage, as well as possibly the late depth and
+ // stencil tests. The first source is the existing sample mask in R60
+ // (possibly modified by gl_SampleMask), and the second source is the alpha
+ // value. The sample mask is written right away based on the
+ // alpha-to-coverage result using the normal register write mechanism,
+ // since that doesn't need to read from any memory, and then written again
+ // later based on the result of the stencil and depth tests using the
+ // special register.
+ { 0x191e8, "ATEST.f32", ADD_TWO_SRC, true },
+ { 0x191f0, "ATEST.X.f16", ADD_TWO_SRC, true },
+ { 0x191f8, "ATEST.Y.f16", ADD_TWO_SRC, true },
+ // store a varying given the address and datatype from LD_VAR_ADDR
+ { 0x19300, "ST_VAR.v1", ADD_THREE_SRC, true },
+ { 0x19340, "ST_VAR.v2", ADD_THREE_SRC, true },
+ { 0x19380, "ST_VAR.v3", ADD_THREE_SRC, true },
+ { 0x193c0, "ST_VAR.v4", ADD_THREE_SRC, true },
+ // This takes the sample coverage mask (computed by ATEST above) as a
+ // regular argument, in addition to the vec4 color in the special register.
+ { 0x1952c, "BLEND", ADD_BLENDING, true },
+ { 0x1a000, "LD_VAR.16", ADD_VARYING_INTERP, true },
+ { 0x1ae60, "TEX", ADD_TEX, true },
+ { 0x1c000, "RSHIFT_NAND.i32", ADD_THREE_SRC },
+ { 0x1c300, "RSHIFT_OR.i32", ADD_THREE_SRC },
+ { 0x1c400, "RSHIFT_AND.i32", ADD_THREE_SRC },
+ { 0x1c700, "RSHIFT_NOR.i32", ADD_THREE_SRC },
+ { 0x1c800, "LSHIFT_NAND.i32", ADD_THREE_SRC },
+ { 0x1cb00, "LSHIFT_OR.i32", ADD_THREE_SRC },
+ { 0x1cc00, "LSHIFT_AND.i32", ADD_THREE_SRC },
+ { 0x1cf00, "LSHIFT_NOR.i32", ADD_THREE_SRC },
+ { 0x1d000, "RSHIFT_XOR.i32", ADD_THREE_SRC },
+ { 0x1d100, "RSHIFT_XNOR.i32", ADD_THREE_SRC },
+ { 0x1d200, "LSHIFT_XOR.i32", ADD_THREE_SRC },
+ { 0x1d300, "LSHIFT_XNOR.i32", ADD_THREE_SRC },
+ { 0x1d400, "LSHIFT_ADD.i32", ADD_THREE_SRC },
+ { 0x1d500, "LSHIFT_SUB.i32", ADD_THREE_SRC },
+ { 0x1d500, "LSHIFT_RSUB.i32", ADD_THREE_SRC },
+ { 0x1d700, "RSHIFT_ADD.i32", ADD_THREE_SRC },
+ { 0x1d800, "RSHIFT_SUB.i32", ADD_THREE_SRC },
+ { 0x1d900, "RSHIFT_RSUB.i32", ADD_THREE_SRC },
+ { 0x1da00, "ARSHIFT_ADD.i32", ADD_THREE_SRC },
+ { 0x1db00, "ARSHIFT_SUB.i32", ADD_THREE_SRC },
+ { 0x1dc00, "ARSHIFT_RSUB.i32", ADD_THREE_SRC },
+ { 0x1dd18, "OR.i32", ADD_TWO_SRC },
+ { 0x1dd20, "AND.i32", ADD_TWO_SRC },
+ { 0x1dd60, "LSHIFT.i32", ADD_TWO_SRC },
+ { 0x1dd50, "XOR.i32", ADD_TWO_SRC },
+ { 0x1dd80, "RSHIFT.i32", ADD_TWO_SRC },
+ { 0x1dda0, "ARSHIFT.i32", ADD_TWO_SRC },
+};
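A hedged host-side model of the FRCP_FREXPM / FRCP_FREXPE argument reduction described in the comments above; the table-based approximation itself is replaced by a plain division, so this only sketches the reduction, not the hardware:

#include <math.h>

/* Split v so that 1/v = (1/mantissa) * 2^scale, where the mantissa
 * lies in [0.5, 1).  frexpf() computes the same mantissa that
 * FRCP_FREXPM produces, and -k matches FRCP_FREXPE. */
static float frcp_via_frexp(float v)
{
        int k;
        float m = frexpf(v, &k);  /* FRCP_FREXPM: v = m * 2^k        */
        int scale = -k;           /* FRCP_FREXPE: exponent of 1/v    */

        /* The hardware would approximate 1/m with FRCP_TABLE /
         * FRCP_APPROX plus refinement; an exact division stands in. */
        return ldexpf(1.0f / m, scale);
}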
+
+static struct add_op_info find_add_op_info(unsigned op)
+{
+ for (unsigned i = 0; i < ARRAY_SIZE(add_op_infos); i++) {
+ unsigned opCmp = ~0;
+ switch (add_op_infos[i].src_type) {
+ case ADD_ONE_SRC:
+ case ADD_BLENDING:
+ opCmp = op;
+ break;
+ case ADD_TWO_SRC:
+ opCmp = op & ~0x7;
+ break;
+ case ADD_THREE_SRC:
+ opCmp = op & ~0x3f;
+ break;
+ case ADD_TEX:
+ opCmp = op & ~0xf;
+ break;
+ case ADD_FADD:
+ case ADD_FMINMAX:
+ case ADD_FADD16:
+ opCmp = op & ~0x1fff;
+ break;
+ case ADD_FMINMAX16:
+ case ADD_FADDMscale:
+ opCmp = op & ~0xfff;
+ break;
+ case ADD_FCMP:
+ case ADD_FCMP16:
+ opCmp = op & ~0x7ff;
+ break;
+ case ADD_TEX_COMPACT:
+ opCmp = op & ~0x3ff;
+ break;
+ case ADD_VARYING_INTERP:
+ opCmp = op & ~0x7ff;
+ break;
+ case ADD_VARYING_ADDRESS:
+ opCmp = op & ~0xff;
+ break;
+ case ADD_LOAD_ATTR:
+ opCmp = op & ~0x7f;
+ break;
+ case ADD_BRANCH:
+ opCmp = op & ~0xfff;
+ break;
+ default:
+ opCmp = ~0;
+ break;
+ }
+ if (add_op_infos[i].op == opCmp)
+ return add_op_infos[i];
+ }
+
+ struct add_op_info info;
+ snprintf(info.name, sizeof(info.name), "op%04x", op);
+ info.op = op;
+ info.src_type = ADD_TWO_SRC;
+ info.has_data_reg = true;
+ return info;
+}
+
+static void dump_add(uint64_t word, struct bifrost_regs regs, struct bifrost_regs next_regs, uint64_t *consts,
+ unsigned data_reg, unsigned offset, bool verbose)
+{
+ if (verbose) {
+ printf("# ADD: %016" PRIx64 "\n", word);
+ }
+ struct bifrost_add_inst ADD;
+ memcpy((char *) &ADD, (char *) &word, sizeof(ADD));
+ struct add_op_info info = find_add_op_info(ADD.op);
+
+ printf("%s", info.name);
+
+ // float16 seems like it doesn't support output modifiers
+ if (info.src_type == ADD_FADD || info.src_type == ADD_FMINMAX) {
+ // output modifiers
+ dump_output_mod(bits(ADD.op, 8, 10));
+ if (info.src_type == ADD_FADD)
+ dump_round_mode(bits(ADD.op, 10, 12));
+ else
+ dump_minmax_mode(bits(ADD.op, 10, 12));
+ } else if (info.src_type == ADD_FCMP || info.src_type == ADD_FCMP16) {
+ dump_fcmp(bits(ADD.op, 3, 6));
+ if (info.src_type == ADD_FCMP)
+ printf(".f32");
+ else
+ printf(".v2f16");
+ } else if (info.src_type == ADD_FADDMscale) {
+ switch ((ADD.op >> 6) & 0x7) {
+ case 0:
+ break;
+ // causes GPU hangs on G71
+ case 1:
+ printf(".invalid");
+ break;
+ // Same as usual outmod value.
+ case 2:
+ printf(".clamp_0_1");
+ break;
+ // If src0 is infinite or NaN, flush it to zero so that the other
+ // source is passed through unmodified.
+ case 3:
+ printf(".flush_src0_inf_nan");
+ break;
+ // Vice versa.
+ case 4:
+ printf(".flush_src1_inf_nan");
+ break;
+ // Every other case seems to behave the same as the above?
+ default:
+ printf(".unk%d", (ADD.op >> 6) & 0x7);
+ break;
+ }
+ } else if (info.src_type == ADD_VARYING_INTERP) {
+ if (ADD.op & 0x200)
+ printf(".reuse");
+ if (ADD.op & 0x400)
+ printf(".flat");
+ switch ((ADD.op >> 7) & 0x3) {
+ case 0:
+ printf(".per_frag");
+ break;
+ case 1:
+ printf(".centroid");
+ break;
+ case 2:
+ break;
+ case 3:
+ printf(".explicit");
+ break;
+ }
+ printf(".v%d", ((ADD.op >> 5) & 0x3) + 1);
+ } else if (info.src_type == ADD_BRANCH) {
+ enum branch_code branchCode = (enum branch_code) ((ADD.op >> 6) & 0x3f);
+ if (branchCode == BR_ALWAYS) {
+ // unconditional branch
+ } else {
+ enum branch_cond cond = (enum branch_cond) ((ADD.op >> 6) & 0x7);
+ enum branch_bit_size size = (enum branch_bit_size) ((ADD.op >> 9) & 0x7);
+ bool portSwapped = (ADD.op & 0x7) < ADD.src0;
+ // See the comment in branch_bit_size
+ if (size == BR_SIZE_16YX0)
+ portSwapped = true;
+ if (size == BR_SIZE_16YX1)
+ portSwapped = false;
+ // These sizes are only for floating point comparisons, so the
+ // non-floating-point comparisons are reused to encode the flipped
+ // versions.
+ if (size == BR_SIZE_32_AND_16X || size == BR_SIZE_32_AND_16Y)
+ portSwapped = false;
+ // There's only one argument, so we reuse the extra argument to
+ // encode this.
+ if (size == BR_SIZE_ZERO)
+ portSwapped = !(ADD.op & 1);
+
+ switch (cond) {
+ case BR_COND_LT:
+ if (portSwapped)
+ printf(".LT.u");
+ else
+ printf(".LT.i");
+ break;
+ case BR_COND_LE:
+ if (size == BR_SIZE_32_AND_16X || size == BR_SIZE_32_AND_16Y) {
+ printf(".UNE.f");
+ } else {
+ if (portSwapped)
+ printf(".LE.u");
+ else
+ printf(".LE.i");
+ }
+ break;
+ case BR_COND_GT:
+ if (portSwapped)
+ printf(".GT.u");
+ else
+ printf(".GT.i");
+ break;
+ case BR_COND_GE:
+ if (portSwapped)
+ printf(".GE.u");
+ else
+ printf(".GE.i");
+ break;
+ case BR_COND_EQ:
+ if (portSwapped)
+ printf(".NE.i");
+ else
+ printf(".EQ.i");
+ break;
+ case BR_COND_OEQ:
+ if (portSwapped)
+ printf(".UNE.f");
+ else
+ printf(".OEQ.f");
+ break;
+ case BR_COND_OGT:
+ if (portSwapped)
+ printf(".OGT.unk.f");
+ else
+ printf(".OGT.f");
+ break;
+ case BR_COND_OLT:
+ if (portSwapped)
+ printf(".OLT.unk.f");
+ else
+ printf(".OLT.f");
+ break;
+ }
+ switch (size) {
+ case BR_SIZE_32:
+ case BR_SIZE_32_AND_16X:
+ case BR_SIZE_32_AND_16Y:
+ printf("32");
+ break;
+ case BR_SIZE_16XX:
+ case BR_SIZE_16YY:
+ case BR_SIZE_16YX0:
+ case BR_SIZE_16YX1:
+ printf("16");
+ break;
+ case BR_SIZE_ZERO: {
+ unsigned ctrl = (ADD.op >> 1) & 0x3;
+ if (ctrl == 0)
+ printf("32.Z");
+ else
+ printf("16.Z");
+ break;
+ }
+ }
+ }
+ }
+ printf(" ");
+
+ struct bifrost_reg_ctrl next_ctrl = DecodeRegCtrl(next_regs);
+ if (next_ctrl.add_write_unit != REG_WRITE_NONE) {
+ printf("{R%d, T1}, ", GetRegToWrite(next_ctrl.add_write_unit, next_regs));
+ } else {
+ printf("T1, ");
+ }
+
+ switch (info.src_type) {
+ case ADD_BLENDING:
+ // Note: in this case, regs.uniform_const == location | 0x8
+ // This probably means we can't load uniforms or immediates in the
+ // same instruction. This re-uses the encoding that normally means
+ // "disabled", where the low 4 bits are ignored. Perhaps the extra
+ // 0x8 or'd in indicates this is happening.
+ printf("location:%d, ", regs.uniform_const & 0x7);
+ // fallthrough
+ case ADD_ONE_SRC:
+ dump_src(ADD.src0, regs, consts, false);
+ break;
+ case ADD_TEX:
+ case ADD_TEX_COMPACT: {
+ int tex_index;
+ int sampler_index;
+ bool dualTex = false;
+ if (info.src_type == ADD_TEX_COMPACT) {
+ tex_index = (ADD.op >> 3) & 0x7;
+ sampler_index = (ADD.op >> 7) & 0x7;
+ bool unknown = (ADD.op & 0x40);
+ // TODO: figure out if the unknown bit is ever 0
+ if (!unknown)
+ printf("unknown ");
+ } else {
+ uint64_t constVal = get_const(consts, regs);
+ uint32_t controlBits = (ADD.op & 0x8) ? (constVal >> 32) : constVal;
+ struct bifrost_tex_ctrl ctrl;
+ memcpy((char *) &ctrl, (char *) &controlBits, sizeof(ctrl));
+
+ // TODO: figure out what actually triggers dual-tex
+ if (ctrl.result_type == 9) {
+ struct bifrost_dual_tex_ctrl dualCtrl;
+ memcpy((char *) &dualCtrl, (char *) &controlBits, sizeof(ctrl));
+ printf("(dualtex) tex0:%d samp0:%d tex1:%d samp1:%d ",
+ dualCtrl.tex_index0, dualCtrl.sampler_index0,
+ dualCtrl.tex_index1, dualCtrl.sampler_index1);
+ if (dualCtrl.unk0 != 3)
+ printf("unk:%d ", dualCtrl.unk0);
+ dualTex = true;
+ } else {
+ if (ctrl.no_merge_index) {
+ tex_index = ctrl.tex_index;
+ sampler_index = ctrl.sampler_index;
+ } else {
+ tex_index = sampler_index = ctrl.tex_index;
+ unsigned unk = ctrl.sampler_index >> 2;
+ if (unk != 3)
+ printf("unk:%d ", unk);
+ if (ctrl.sampler_index & 1)
+ tex_index = -1;
+ if (ctrl.sampler_index & 2)
+ sampler_index = -1;
+ }
+
+ if (ctrl.unk0 != 3)
+ printf("unk0:%d ", ctrl.unk0);
+ if (ctrl.unk1)
+ printf("unk1 ");
+ if (ctrl.unk2 != 0xf)
+ printf("unk2:%x ", ctrl.unk2);
+
+ switch (ctrl.result_type) {
+ case 0x4:
+ printf("f32 ");
+ break;
+ case 0xe:
+ printf("i32 ");
+ break;
+ case 0xf:
+ printf("u32 ");
+ break;
+ default:
+ printf("unktype(%x) ", ctrl.result_type);
+ }
+
+ switch (ctrl.tex_type) {
+ case 0:
+ printf("cube ");
+ break;
+ case 1:
+ printf("buffer ");
+ break;
+ case 2:
+ printf("2D ");
+ break;
+ case 3:
+ printf("3D ");
+ break;
+ }
+
+ if (ctrl.is_shadow)
+ printf("shadow ");
+ if (ctrl.is_array)
+ printf("array ");
+
+ if (!ctrl.filter) {
+ if (ctrl.calc_gradients) {
+ int comp = (controlBits >> 20) & 0x3;
+ printf("txg comp:%d ", comp);
+ } else {
+ printf("txf ");
+ }
+ } else {
+ if (!ctrl.not_supply_lod) {
+ if (ctrl.compute_lod)
+ printf("lod_bias ");
+ else
+ printf("lod ");
+ }
+
+ if (!ctrl.calc_gradients)
+ printf("grad ");
+ }
+
+ if (ctrl.texel_offset)
+ printf("offset ");
+ }
+ }
+
+ if (!dualTex) {
+ if (tex_index == -1)
+ printf("tex:indirect ");
+ else
+ printf("tex:%d ", tex_index);
+
+ if (sampler_index == -1)
+ printf("samp:indirect ");
+ else
+ printf("samp:%d ", sampler_index);
+ }
+ break;
+ }
+ case ADD_VARYING_INTERP: {
+ unsigned addr = ADD.op & 0x1f;
+ if (addr < 0b10100) {
+ // direct addr
+ printf("%d", addr);
+ } else if (addr < 0b11000) {
+ if (addr == 22)
+ printf("fragw");
+ else if (addr == 23)
+ printf("fragz");
+ else
+ printf("unk%d", addr);
+ } else {
+ dump_src(ADD.op & 0x7, regs, consts, false);
+ }
+ printf(", ");
+ dump_src(ADD.src0, regs, consts, false);
+ break;
+ }
+ case ADD_VARYING_ADDRESS: {
+ dump_src(ADD.src0, regs, consts, false);
+ printf(", ");
+ dump_src(ADD.op & 0x7, regs, consts, false);
+ printf(", ");
+ unsigned location = (ADD.op >> 3) & 0x1f;
+ if (location < 16) {
+ printf("location:%d", location);
+ } else if (location == 20) {
+ printf("location:%u", (uint32_t) get_const(consts, regs));
+ } else if (location == 21) {
+ printf("location:%u", (uint32_t) (get_const(consts, regs) >> 32));
+ } else {
+ printf("location:%d(unk)", location);
+ }
+ break;
+ }
+ case ADD_LOAD_ATTR:
+ printf("location:%d, ", (ADD.op >> 3) & 0xf);
+ case ADD_TWO_SRC:
+ dump_src(ADD.src0, regs, consts, false);
+ printf(", ");
+ dump_src(ADD.op & 0x7, regs, consts, false);
+ break;
+ case ADD_THREE_SRC:
+ dump_src(ADD.src0, regs, consts, false);
+ printf(", ");
+ dump_src(ADD.op & 0x7, regs, consts, false);
+ printf(", ");
+ dump_src((ADD.op >> 3) & 0x7, regs, consts, false);
+ break;
+ case ADD_FADD:
+ case ADD_FMINMAX:
+ if (ADD.op & 0x10)
+ printf("-");
+ if (ADD.op & 0x1000)
+ printf("abs(");
+ dump_src(ADD.src0, regs, consts, false);
+ switch ((ADD.op >> 6) & 0x3) {
+ case 3:
+ printf(".x");
+ break;
+ default:
+ break;
+ }
+ if (ADD.op & 0x1000)
+ printf(")");
+ printf(", ");
+ if (ADD.op & 0x20)
+ printf("-");
+ if (ADD.op & 0x8)
+ printf("abs(");
+ dump_src(ADD.op & 0x7, regs, consts, false);
+ switch ((ADD.op >> 6) & 0x3) {
+ case 1:
+ case 3:
+ printf(".x");
+ break;
+ case 2:
+ printf(".y");
+ break;
+ case 0:
+ break;
+ default:
+ printf(".unk");
+ break;
+ }
+ if (ADD.op & 0x8)
+ printf(")");
+ break;
+ case ADD_FADD16:
+ if (ADD.op & 0x10)
+ printf("-");
+ if (ADD.op & 0x1000)
+ printf("abs(");
+ dump_src(ADD.src0, regs, consts, false);
+ if (ADD.op & 0x1000)
+ printf(")");
+ dump_16swizzle((ADD.op >> 6) & 0x3);
+ printf(", ");
+ if (ADD.op & 0x20)
+ printf("-");
+ if (ADD.op & 0x8)
+ printf("abs(");
+ dump_src(ADD.op & 0x7, regs, consts, false);
+ dump_16swizzle((ADD.op >> 8) & 0x3);
+ if (ADD.op & 0x8)
+ printf(")");
+ break;
+ case ADD_FMINMAX16: {
+ bool abs1 = ADD.op & 0x8;
+ bool abs2 = (ADD.op & 0x7) < ADD.src0;
+ if (ADD.op & 0x10)
+ printf("-");
+ if (abs1 || abs2)
+ printf("abs(");
+ dump_src(ADD.src0, regs, consts, false);
+ dump_16swizzle((ADD.op >> 6) & 0x3);
+ if (abs1 || abs2)
+ printf(")");
+ printf(", ");
+ if (ADD.op & 0x20)
+ printf("-");
+ if (abs1 && abs2)
+ printf("abs(");
+ dump_src(ADD.op & 0x7, regs, consts, false);
+ dump_16swizzle((ADD.op >> 8) & 0x3);
+ if (abs1 && abs2)
+ printf(")");
+ break;
+ }
+ case ADD_FADDMscale: {
+ if (ADD.op & 0x400)
+ printf("-");
+ if (ADD.op & 0x200)
+ printf("abs(");
+ dump_src(ADD.src0, regs, consts, false);
+ if (ADD.op & 0x200)
+ printf(")");
+
+ printf(", ");
+
+ if (ADD.op & 0x800)
+ printf("-");
+ dump_src(ADD.op & 0x7, regs, consts, false);
+
+ printf(", ");
+
+ dump_src((ADD.op >> 3) & 0x7, regs, consts, false);
+ break;
+ }
+ case ADD_FCMP:
+ if (ADD.op & 0x400) {
+ printf("-");
+ }
+ if (ADD.op & 0x100) {
+ printf("abs(");
+ }
+ dump_src(ADD.src0, regs, consts, false);
+ switch ((ADD.op >> 6) & 0x3) {
+ case 3:
+ printf(".x");
+ break;
+ default:
+ break;
+ }
+ if (ADD.op & 0x100) {
+ printf(")");
+ }
+ printf(", ");
+ if (ADD.op & 0x200) {
+ printf("abs(");
+ }
+ dump_src(ADD.op & 0x7, regs, consts, false);
+ switch ((ADD.op >> 6) & 0x3) {
+ case 1:
+ case 3:
+ printf(".x");
+ break;
+ case 2:
+ printf(".y");
+ break;
+ case 0:
+ break;
+ default:
+ printf(".unk");
+ break;
+ }
+ if (ADD.op & 0x200) {
+ printf(")");
+ }
+ break;
+ case ADD_FCMP16:
+ dump_src(ADD.src0, regs, consts, false);
+ dump_16swizzle((ADD.op >> 6) & 0x3);
+ printf(", ");
+ dump_src(ADD.op & 0x7, regs, consts, false);
+ dump_16swizzle((ADD.op >> 8) & 0x3);
+ break;
+ case ADD_BRANCH: {
+ enum branch_code code = (enum branch_code) ((ADD.op >> 6) & 0x3f);
+ enum branch_bit_size size = (enum branch_bit_size) ((ADD.op >> 9) & 0x7);
+ if (code != BR_ALWAYS) {
+ dump_src(ADD.src0, regs, consts, false);
+ switch (size) {
+ case BR_SIZE_16XX:
+ printf(".x");
+ break;
+ case BR_SIZE_16YY:
+ case BR_SIZE_16YX0:
+ case BR_SIZE_16YX1:
+ printf(".y");
+ break;
+ case BR_SIZE_ZERO: {
+ unsigned ctrl = (ADD.op >> 1) & 0x3;
+ switch (ctrl) {
+ case 1:
+ printf(".y");
+ break;
+ case 2:
+ printf(".x");
+ break;
+ default:
+ break;
+ }
+ }
+ default:
+ break;
+ }
+ printf(", ");
+ }
+ if (code != BR_ALWAYS && size != BR_SIZE_ZERO) {
+ dump_src(ADD.op & 0x7, regs, consts, false);
+ switch (size) {
+ case BR_SIZE_16XX:
+ case BR_SIZE_16YX0:
+ case BR_SIZE_16YX1:
+ case BR_SIZE_32_AND_16X:
+ printf(".x");
+ break;
+ case BR_SIZE_16YY:
+ case BR_SIZE_32_AND_16Y:
+ printf(".y");
+ break;
+ default:
+ break;
+ }
+ printf(", ");
+ }
+ // I haven't had the chance to test if this actually specifies the
+ // branch offset, since I couldn't get it to produce values other
+ // than 5 (uniform/const high), but these three bits are always
+ // consistent across branch instructions, so it makes sense...
+ int offsetSrc = (ADD.op >> 3) & 0x7;
+ if (offsetSrc == 4 || offsetSrc == 5) {
+ // If the offset is known/constant, we can decode it
+ uint32_t raw_offset;
+ if (offsetSrc == 4)
+ raw_offset = get_const(consts, regs);
+ else
+ raw_offset = get_const(consts, regs) >> 32;
+ // The high 4 bits are flags, while the rest is the
+ // twos-complement offset in bytes (here we convert to
+ // clauses).
+ int32_t branch_offset = ((int32_t) raw_offset << 4) >> 8;
+
+ // If high4 is the high 4 bits of the last 64-bit constant,
+ // this is calculated as (high4 + 4) & 0xf, or 0 if the branch
+ // offset itself is the last constant. Not sure if this is
+ // actually used, or just garbage in unused bits, but in any
+ // case, we can just ignore it here since it's redundant. Note
+ // that if there is any padding, this will be 4 since the
+ // padding counts as the last constant.
+ unsigned flags = raw_offset >> 28;
+ (void) flags;
+
+ // Note: the offset is in bytes, relative to the beginning of the
+ // current clause, so a zero offset would be a loop back to the
+ // same clause (annoyingly different from Midgard).
+ printf("clause_%d", offset + branch_offset);
+ } else {
+ dump_src(offsetSrc, regs, consts, false);
+ }
+ }
+ }
+ if (info.has_data_reg) {
+ printf(", R%d", data_reg);
+ }
+ printf("\n");
}
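For reference, a small worked example (editorial, with a made-up constant) of the branch-offset decode performed in the ADD_BRANCH case above:

#include <stdint.h>
#include <assert.h>

/* Sketch of the ADD_BRANCH constant decode: the top 4 bits of the
 * 32-bit constant are flags, the low 28 bits are a two's-complement
 * byte offset, and clauses are addressed in 16-byte units. */
static void branch_offset_example(void)
{
        uint32_t raw_offset = 0x4ffffff0u;  /* invented: flags = 4, offset = -16 bytes */

        unsigned flags = raw_offset >> 28;
        /* Same arithmetic as the file (drop the flags, sign-extend,
         * divide by 16); shifting the unsigned value first just avoids
         * signed-overflow UB on the host. */
        int32_t branch_offset = (int32_t) (raw_offset << 4) >> 8;

        assert(flags == 4);
        assert(branch_offset == -1);        /* one clause backwards */
}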
-static void dump_clause(FILE *fp, uint32_t *words, unsigned *size, unsigned offset, bool verbose)
+void dump_instr(const struct bifrost_alu_inst *instr, struct bifrost_regs next_regs, uint64_t *consts,
+ unsigned data_reg, unsigned offset, bool verbose)
+{
+ struct bifrost_regs regs;
+ memcpy((char *) &regs, (char *) &instr->reg_bits, sizeof(regs));
+
+ if (verbose) {
+ printf("# regs: %016" PRIx64 "\n", instr->reg_bits);
+ dump_regs(regs);
+ }
+ dump_fma(instr->fma_bits, regs, next_regs, consts, verbose);
+ dump_add(instr->add_bits, regs, next_regs, consts, data_reg, offset, verbose);
+}
+
+bool dump_clause(uint32_t *words, unsigned *size, unsigned offset, bool verbose)
{
// State for a decoded clause
struct bifrost_alu_inst instrs[8] = {};
- struct bi_constants consts = {};
+ uint64_t consts[6] = {};
unsigned num_instrs = 0;
unsigned num_consts = 0;
uint64_t header_bits = 0;
+ bool stopbit = false;
unsigned i;
for (i = 0; ; i++, words += 4) {
if (verbose) {
- fprintf(fp, "# ");
+ printf("# ");
for (int j = 0; j < 4; j++)
- fprintf(fp, "%08x ", words[3 - j]); // low bit on the right
- fprintf(fp, "\n");
+ printf("%08x ", words[3 - j]); // low bit on the right
+ printf("\n");
}
unsigned tag = bits(words[0], 0, 8);
@@ -469,45 +2078,39 @@ static void dump_clause(FILE *fp, uint32_t *words, unsigned *size, unsigned offs
uint64_t const0 = bits(words[0], 8, 32) << 4 | (uint64_t) words[1] << 28 | bits(words[2], 0, 4) << 60;
uint64_t const1 = bits(words[2], 4, 32) << 4 | (uint64_t) words[3] << 32;
- /* Z-bit */
bool stop = tag & 0x40;
if (verbose) {
- fprintf(fp, "# tag: 0x%02x\n", tag);
+ printf("# tag: 0x%02x\n", tag);
}
if (tag & 0x80) {
- /* Format 5 or 10 */
unsigned idx = stop ? 5 : 2;
main_instr.add_bits |= ((tag >> 3) & 0x7) << 17;
instrs[idx + 1] = main_instr;
instrs[idx].add_bits = bits(words[3], 0, 17) | ((tag & 0x7) << 17);
instrs[idx].fma_bits |= bits(words[2], 19, 32) << 10;
- consts.raw[0] = bits(words[3], 17, 32) << 4;
+ consts[0] = bits(words[3], 17, 32) << 4;
} else {
bool done = false;
switch ((tag >> 3) & 0x7) {
case 0x0:
switch (tag & 0x7) {
case 0x3:
- /* Format 1 */
main_instr.add_bits |= bits(words[3], 29, 32) << 17;
instrs[1] = main_instr;
num_instrs = 2;
done = stop;
break;
case 0x4:
- /* Format 3 */
instrs[2].add_bits = bits(words[3], 0, 17) | bits(words[3], 29, 32) << 17;
instrs[2].fma_bits |= bits(words[2], 19, 32) << 10;
- consts.raw[0] = const0;
- decode_M(&consts.mods[0], bits(words[2], 4, 8), bits(words[2], 8, 12), true);
+ consts[0] = const0;
num_instrs = 3;
num_consts = 1;
done = stop;
break;
case 0x1:
case 0x5:
- /* Format 4 */
instrs[2].add_bits = bits(words[3], 0, 17) | bits(words[3], 29, 32) << 17;
instrs[2].fma_bits |= bits(words[2], 19, 32) << 10;
main_instr.add_bits |= bits(words[3], 26, 29) << 17;
@@ -518,17 +2121,14 @@ static void dump_clause(FILE *fp, uint32_t *words, unsigned *size, unsigned offs
}
break;
case 0x6:
- /* Format 8 */
instrs[5].add_bits = bits(words[3], 0, 17) | bits(words[3], 29, 32) << 17;
instrs[5].fma_bits |= bits(words[2], 19, 32) << 10;
- consts.raw[0] = const0;
- decode_M(&consts.mods[0], bits(words[2], 4, 8), bits(words[2], 8, 12), true);
+ consts[0] = const0;
num_instrs = 6;
num_consts = 1;
done = stop;
break;
case 0x7:
- /* Format 9 */
instrs[5].add_bits = bits(words[3], 0, 17) | bits(words[3], 29, 32) << 17;
instrs[5].fma_bits |= bits(words[2], 19, 32) << 10;
main_instr.add_bits |= bits(words[3], 26, 29) << 17;
@@ -537,23 +2137,21 @@ static void dump_clause(FILE *fp, uint32_t *words, unsigned *size, unsigned offs
done = stop;
break;
default:
- unreachable("[INSTR_INVALID_ENC] Invalid tag bits");
+ printf("unknown tag bits 0x%02x\n", tag);
}
break;
case 0x2:
case 0x3: {
- /* Format 6 or 11 */
unsigned idx = ((tag >> 3) & 0x7) == 2 ? 4 : 7;
main_instr.add_bits |= (tag & 0x7) << 17;
instrs[idx] = main_instr;
- consts.raw[0] |= (bits(words[2], 19, 32) | ((uint64_t) words[3] << 13)) << 19;
+ consts[0] |= (bits(words[2], 19, 32) | ((uint64_t) words[3] << 13)) << 19;
num_consts = 1;
num_instrs = idx + 1;
done = stop;
break;
}
case 0x4: {
- /* Format 2 */
unsigned idx = stop ? 4 : 1;
main_instr.add_bits |= (tag & 0x7) << 17;
instrs[idx] = main_instr;
@@ -562,69 +2160,56 @@ static void dump_clause(FILE *fp, uint32_t *words, unsigned *size, unsigned offs
break;
}
case 0x1:
- /* Format 0 - followed by constants */
+ // only constants can come after this
num_instrs = 1;
done = stop;
- FALLTHROUGH;
case 0x5:
- /* Format 0 - followed by instructions */
header_bits = bits(words[2], 19, 32) | ((uint64_t) words[3] << (32 - 19));
main_instr.add_bits |= (tag & 0x7) << 17;
instrs[0] = main_instr;
break;
case 0x6:
case 0x7: {
- /* Format 12 */
unsigned pos = tag & 0xf;
-
- struct {
- unsigned const_idx;
- unsigned nr_tuples;
- } pos_table[0x10] = {
- { 0, 1 },
- { 0, 2 },
- { 0, 4 },
- { 1, 3 },
- { 1, 5 },
- { 2, 4 },
- { 0, 7 },
- { 1, 6 },
- { 3, 5 },
- { 1, 8 },
- { 2, 7 },
- { 3, 6 },
- { 3, 8 },
- { 4, 7 },
- { 5, 6 },
- { ~0, ~0 }
- };
-
- ASSERTED bool valid_count = pos_table[pos].nr_tuples == num_instrs;
- assert(valid_count && "INSTR_INVALID_ENC");
-
- unsigned const_idx = pos_table[pos].const_idx;
-
+ // note that `pos' encodes both the total number of
+ // instructions and the position in the constant stream,
+ // presumably because decoded constants and instructions
+ // share a buffer in the decoder, but we only care about
+ // the position in the constant stream; the total number of
+ // instructions is redundant.
+ unsigned const_idx = 7;
+ switch (pos) {
+ case 0:
+ case 1:
+ case 2:
+ case 6:
+ const_idx = 0;
+ break;
+ case 3:
+ case 4:
+ case 7:
+ case 9:
+ const_idx = 1;
+ break;
+ case 5:
+ case 0xa:
+ const_idx = 2;
+ break;
+ case 8:
+ case 0xb:
+ case 0xc:
+ const_idx = 3;
+ break;
+ case 0xd:
+ const_idx = 4;
+ break;
+ default:
+ printf("# unknown pos 0x%x\n", pos);
+ }
if (num_consts < const_idx + 2)
num_consts = const_idx + 2;
-
- consts.raw[const_idx] = const0;
- consts.raw[const_idx + 1] = const1;
-
- /* Calculate M values from A, B and 4-bit
- * unsigned arithmetic. Mathematically it
- * should be (A - B) % 16 but we use this
- * alternate form to avoid sign issues */
-
- unsigned A1 = bits(words[2], 0, 4);
- unsigned B1 = bits(words[3], 28, 32);
- unsigned A2 = bits(words[1], 0, 4);
- unsigned B2 = bits(words[2], 28, 32);
-
- unsigned M1 = (16 + A1 - B1) & 0xF;
- unsigned M2 = (16 + A2 - B2) & 0xF;
-
- decode_M(&consts.mods[const_idx], M1, M2, false);
-
+ consts[const_idx] = const0;
+ consts[const_idx + 1] = const1;
done = stop;
break;
}
@@ -640,16 +2225,18 @@ static void dump_clause(FILE *fp, uint32_t *words, unsigned *size, unsigned offs
*size = i + 1;
if (verbose) {
- fprintf(fp, "# header: %012" PRIx64 "\n", header_bits);
+ printf("# header: %012" PRIx64 "\n", header_bits);
}
struct bifrost_header header;
memcpy((char *) &header, (char *) &header_bits, sizeof(struct bifrost_header));
- dump_header(fp, header, verbose);
+ dump_header(header, verbose);
+ if (!header.no_end_of_shader)
+ stopbit = true;
- fprintf(fp, "{\n");
+ printf("{\n");
for (i = 0; i < num_instrs; i++) {
- struct bifrost_regs regs, next_regs;
+ struct bifrost_regs next_regs;
if (i + 1 == num_instrs) {
memcpy((char *) &next_regs, (char *) &instrs[0].reg_bits,
sizeof(next_regs));
@@ -658,51 +2245,36 @@ static void dump_clause(FILE *fp, uint32_t *words, unsigned *size, unsigned offs
sizeof(next_regs));
}
- memcpy((char *) &regs, (char *) &instrs[i].reg_bits, sizeof(regs));
-
- if (verbose) {
- fprintf(fp, " # regs: %016" PRIx64 "\n", instrs[i].reg_bits);
- dump_regs(fp, regs, i == 0);
- }
-
- bi_disasm_fma(fp, instrs[i].fma_bits, &regs, &next_regs,
- header.staging_register, offset, &consts,
- i + 1 == num_instrs);
-
- bi_disasm_add(fp, instrs[i].add_bits, &regs, &next_regs,
- header.staging_register, offset, &consts,
- i + 1 == num_instrs);
+ dump_instr(&instrs[i], next_regs, consts, header.datareg, offset, verbose);
}
- fprintf(fp, "}\n");
+ printf("}\n");
if (verbose) {
for (unsigned i = 0; i < num_consts; i++) {
- fprintf(fp, "# const%d: %08" PRIx64 "\n", 2 * i, consts.raw[i] & 0xffffffff);
- fprintf(fp, "# const%d: %08" PRIx64 "\n", 2 * i + 1, consts.raw[i] >> 32);
+ printf("# const%d: %08" PRIx64 "\n", 2 * i, consts[i] & 0xffffffff);
+ printf("# const%d: %08" PRIx64 "\n", 2 * i + 1, consts[i] >> 32);
}
}
-
- fprintf(fp, "\n");
- return;
+ return stopbit;
}
-void disassemble_bifrost(FILE *fp, uint8_t *code, size_t size, bool verbose)
+void disassemble_bifrost(uint8_t *code, size_t size, bool verbose)
{
uint32_t *words = (uint32_t *) code;
uint32_t *words_end = words + (size / 4);
// used for displaying branch targets
unsigned offset = 0;
while (words != words_end) {
- /* Shaders have zero bytes at the end for padding; stop
- * disassembling when we hit them. */
- if (*words == 0)
+ // we don't know what the program-end bit is quite yet, so for now just
+ // assume that an all-0 quadword is padding
+ uint32_t zero[4] = {};
+ if (memcmp(words, zero, 4 * sizeof(uint32_t)) == 0)
break;
-
- fprintf(fp, "clause_%u:\n", offset);
-
+ printf("clause_%d:\n", offset);
unsigned size;
- dump_clause(fp, words, &size, offset, verbose);
-
+ if (dump_clause(words, &size, offset, verbose) == true) {
+ break;
+ }
words += size * 4;
offset += size;
}
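
The tag 0x6/0x7 path above replaces the upstream pos_table with a switch, and its comment notes that `pos' encodes both the tuple count and the position in the constant stream, only the latter of which matters to the disassembler. A standalone sketch of that mapping, written as a lookup table, is shown below; the function name pos_to_const_idx and the demonstration loop are illustrative only and are not part of the imported source.

#include <stdio.h>

/* Position in the constant stream for each `pos' value, mirroring the
 * switch in the imported dump_clause(); values >= 0xe are reported as
 * unknown there, so they return -1 here. */
static int pos_to_const_idx(unsigned pos)
{
        static const int table[14] = {
                0, 0, 0, 1, 1, 2, 0, 1, 3, 1, 2, 3, 3, 4
        };
        return pos < 14 ? table[pos] : -1;
}

int main(void)
{
        for (unsigned pos = 0; pos < 16; pos++) {
                int idx = pos_to_const_idx(pos);
                if (idx < 0)
                        printf("pos 0x%x: unknown\n", pos);
                else
                        printf("pos 0x%x: const_idx %d\n", pos, idx);
        }
        return 0;
}

As in dump_clause(), const0 and const1 would then land at consts[const_idx] and consts[const_idx + 1], and num_consts is raised to at least const_idx + 2 so both 64-bit constants are counted. The decode_M() path dropped by the import computed its modifiers as (A - B) mod 16 via (16 + A - B) & 0xF to stay within unsigned arithmetic; for A = 3, B = 9 that gives 10, matching -6 mod 16.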
diff --git a/lib/mesa/src/panfrost/bifrost/disassemble.h b/lib/mesa/src/panfrost/bifrost/disassemble.h
index 1e39c20d6..f5bce2e30 100644
--- a/lib/mesa/src/panfrost/bifrost/disassemble.h
+++ b/lib/mesa/src/panfrost/bifrost/disassemble.h
@@ -23,25 +23,7 @@
* SOFTWARE.
*/
-#ifndef __BI_DISASM_H
-#define __BI_DISASM_H
-
#include <stdbool.h>
#include <stddef.h>
#include <stdint.h>
-#include <stdio.h>
-#include "bifrost.h"
-
-void disassemble_bifrost(FILE *fp, uint8_t *code, size_t size, bool verbose);
-
-void
-bi_disasm_fma(FILE *fp, unsigned bits, struct bifrost_regs *srcs, struct bifrost_regs *next_regs, unsigned staging_register, unsigned branch_offset, struct bi_constants *consts, bool first);
-
-void bi_disasm_add(FILE *fp, unsigned bits, struct bifrost_regs *srcs, struct bifrost_regs *next_regs, unsigned staging_register, unsigned branch_offset, struct bi_constants *consts, bool first);
-
-void bi_disasm_dest_fma(FILE *fp, struct bifrost_regs *next_regs, bool first);
-void bi_disasm_dest_add(FILE *fp, struct bifrost_regs *next_regs, bool first);
-
-void dump_src(FILE *fp, unsigned src, struct bifrost_regs srcs, unsigned branch_offset, struct bi_constants *consts, bool isFMA);
-
-#endif
+void disassemble_bifrost(uint8_t *code, size_t size, bool verbose);
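
After the import, the public disassembler interface is the single stdout-based entry point declared above; the FILE * parameter and the per-unit bi_disasm_* helpers of the newer header are gone. A minimal caller under those assumptions could look like the following sketch; the zero-filled buffer is a placeholder rather than real Bifrost machine code.

#include <stdint.h>
#include "disassemble.h"

int main(void)
{
        /* Placeholder shader binary: a real caller passes the packed clause
         * words emitted by the compiler.  An all-zero quadword is treated as
         * padding by disassemble_bifrost(), so this buffer prints nothing. */
        uint8_t code[64] = { 0 };

        disassemble_bifrost(code, sizeof(code), true /* verbose */);
        return 0;
}

The printf-based version always writes to stdout, whereas the removed prototype threaded a FILE * through so callers could redirect the disassembly to an arbitrary stream.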
diff --git a/lib/mesa/src/panfrost/bifrost/meson.build b/lib/mesa/src/panfrost/bifrost/meson.build
index 63d1560cc..b49170a35 100644
--- a/lib/mesa/src/panfrost/bifrost/meson.build
+++ b/lib/mesa/src/panfrost/bifrost/meson.build
@@ -19,172 +19,20 @@
# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
# SOFTWARE.
-subdir('valhall')
-
-inc_valhall = include_directories(['.', 'valhall'])
-
libpanfrost_bifrost_files = files(
- 'bi_helper_invocations.c',
- 'bi_layout.c',
- 'bi_liveness.c',
- 'bi_lower_divergent_indirects.c',
- 'bi_lower_swizzle.c',
- 'bi_print.c',
- 'bi_opt_constant_fold.c',
- 'bi_opt_copy_prop.c',
- 'bi_opt_dce.c',
- 'bi_opt_cse.c',
- 'bi_opt_push_ubo.c',
- 'bi_opt_message_preload.c',
- 'bi_opt_mod_props.c',
- 'bi_opt_dual_tex.c',
- 'bi_pressure_schedule.c',
- 'bi_pack.c',
- 'bi_ra.c',
- 'bi_schedule.c',
- 'bi_scoreboard.c',
- 'bi_validate.c',
- 'bir.c',
'bifrost_compile.c',
- 'valhall/va_insert_flow.c',
- 'valhall/va_lower_constants.c',
- 'valhall/va_lower_isel.c',
- 'valhall/va_lower_split_64bit.c',
- 'valhall/va_optimize.c',
- 'valhall/va_mark_last.c',
- 'valhall/va_merge_flow.c',
- 'valhall/va_pack.c',
- 'valhall/va_perf.c',
- 'valhall/va_validate.c',
-)
-
-bifrost_gen_disasm_c = custom_target(
- 'bifrost_gen_disasm.c',
- input : ['gen_disasm.py', 'ISA.xml'],
- output : 'bifrost_gen_disasm.c',
- command : [prog_python, '@INPUT@'],
- capture : true,
- depend_files : files('bifrost_isa.py'),
-)
-
-bi_opcodes_c = custom_target(
- 'bi_opcodes.c',
- input : ['bi_opcodes.c.py', 'ISA.xml'],
- output : 'bi_opcodes.c',
- command : [prog_python, '@INPUT@'],
- capture : true,
- depend_files : files('bifrost_isa.py'),
-)
-
-bi_printer_c = custom_target(
- 'bi_printer.c',
- input : ['bi_printer.c.py', 'ISA.xml'],
- output : 'bi_printer.c',
- command : [prog_python, '@INPUT@'],
- capture : true,
- depend_files : files('bifrost_isa.py'),
-)
-
-bi_packer_c = custom_target(
- 'bi_packer.c',
- input : ['bi_packer.c.py', 'ISA.xml'],
- output : 'bi_packer.c',
- command : [prog_python, '@INPUT@'],
- capture : true,
- depend_files : files('bifrost_isa.py'),
-)
-
-bi_opcodes_h = custom_target(
- 'bi_opcodes.h',
- input : ['bi_opcodes.h.py', 'ISA.xml'],
- output : 'bi_opcodes.h',
- command : [prog_python, '@INPUT@'],
- capture : true,
- depend_files : files('bifrost_isa.py'),
-)
-
-idep_bi_opcodes_h = declare_dependency(
- sources : [bi_opcodes_h],
- include_directories : include_directories('.'),
-)
-
-bi_builder_h = custom_target(
- 'bi_builder.h',
- input : ['bi_builder.h.py', 'ISA.xml'],
- output : 'bi_builder.h',
- command : [prog_python, '@INPUT@'],
- capture : true,
- depend_files : files('bifrost_isa.py'),
-)
-
-idep_bi_builder_h = declare_dependency(
- sources : [bi_builder_h],
- include_directories : include_directories('.'),
-)
-
-bifrost_nir_algebraic_c = custom_target(
- 'bifrost_nir_algebraic.c',
- input : 'bifrost_nir_algebraic.py',
- output : 'bifrost_nir_algebraic.c',
- command : [
- prog_python, '@INPUT@', '-p', dir_compiler_nir,
- ],
- capture : true,
- depend_files : nir_algebraic_depends,
-)
-
-libpanfrost_bifrost_disasm = static_library(
- 'panfrost_bifrost_disasm',
- ['disassemble.c', 'bi_print_common.c', bifrost_gen_disasm_c],
- include_directories : [inc_include, inc_src, inc_mapi, inc_mesa, inc_gallium, inc_gallium_aux, inc_panfrost_hw],
- dependencies: [idep_nir],
- link_with: [libpanfrost_util],
- c_args : [no_override_init_args],
- gnu_symbol_visibility : 'hidden',
- build_by_default : false,
+ 'bifrost_opts.c',
+ 'bifrost_sched.c',
+ 'bifrost_print.c',
+ 'disassemble.c',
)
libpanfrost_bifrost = static_library(
'panfrost_bifrost',
- [libpanfrost_bifrost_files, bi_opcodes_c, bi_printer_c, bi_packer_c, bifrost_nir_algebraic_c, valhall_c],
- include_directories : [inc_include, inc_src, inc_mapi, inc_mesa, inc_gallium, inc_gallium_aux, inc_panfrost_hw, inc_valhall],
- dependencies: [idep_nir, idep_bi_opcodes_h, idep_bi_builder_h, idep_valhall_enums_h],
- link_with: [libpanfrost_util, libpanfrost_bifrost_disasm, libpanfrost_valhall_disasm],
- c_args : [no_override_init_args],
- gnu_symbol_visibility : 'hidden',
+ [libpanfrost_bifrost_files],
+ include_directories : [inc_common, inc_include, inc_src],
+ dependencies: [idep_nir],
+ c_args : [c_vis_args, no_override_init_args],
+ cpp_args : [cpp_vis_args],
build_by_default : false,
)
-
-if with_tests
- test(
- 'bifrost_tests',
- executable(
- 'bifrost_tests',
- files(
- 'test/test-constant-fold.cpp',
- 'test/test-dual-texture.cpp',
- 'test/test-lower-swizzle.cpp',
- 'test/test-message-preload.cpp',
- 'test/test-optimizer.cpp',
- 'test/test-pack-formats.cpp',
- 'test/test-packing.cpp',
- 'test/test-scheduler-predicates.cpp',
- 'valhall/test/test-add-imm.cpp',
- 'valhall/test/test-validate-fau.cpp',
- 'valhall/test/test-insert-flow.cpp',
- 'valhall/test/test-lower-isel.cpp',
- 'valhall/test/test-lower-constants.cpp',
- 'valhall/test/test-mark-last.cpp',
- 'valhall/test/test-merge-flow.cpp',
- 'valhall/test/test-packing.cpp',
- ),
- c_args : [c_msvc_compat_args, no_override_init_args],
- gnu_symbol_visibility : 'hidden',
- include_directories : [inc_include, inc_src, inc_mesa, inc_valhall],
- dependencies: [idep_gtest, idep_nir, idep_bi_opcodes_h, idep_bi_builder_h],
- link_with : [libpanfrost_bifrost],
- ),
- suite : ['panfrost'],
- protocol : gtest_test_protocol,
- )
-endif