import Mesa 11.0.6

author: Jonathan Gray <jsg@cvs.openbsd.org> 2015-11-22 02:45:45 +0000
committer: Jonathan Gray <jsg@cvs.openbsd.org> 2015-11-22 02:45:45 +0000
commit: b7ab2ee0fa1e6e04a545a9bd2088ac621c810081 (patch)
tree: db90836dcf322d66f4369cb79b21ec5e68986925 /lib/mesa/src/gallium/drivers
parent: f00235c070468f96521cd88ebc8919fa0cb89a25 (diff)
19 files changed, 192 insertions, 2119 deletions
diff --git a/lib/mesa/src/gallium/drivers/nouveau/codegen/lib/Makefile b/lib/mesa/src/gallium/drivers/nouveau/codegen/lib/Makefile
index 115f6d0c0..06d1979d8 100644
--- a/lib/mesa/src/gallium/drivers/nouveau/codegen/lib/Makefile
+++ b/lib/mesa/src/gallium/drivers/nouveau/codegen/lib/Makefile
@@ -3,9 +3,9 @@ ENVYAS ?= envyas
 all: gf100.asm.h gk104.asm.h gk110.asm.h gm107.asm.h
 
 gf100.asm.h: %.asm.h: %.asm
-	$(ENVYAS) -a -W -mgf100 -Vgf100 $< -o $@
+	$(ENVYAS) -a -W -mnvc0 -Vnvc0 $< -o $@
 gk104.asm.h: %.asm.h: %.asm
-	$(ENVYAS) -a -W -mgf100 -Vgk104 $< -o $@
+	$(ENVYAS) -a -W -mnvc0 -Vnve4 $< -o $@
 gk110.asm.h: %.asm.h: %.asm
 	$(ENVYAS) -a -W -mgk110 $< -o $@
 gm107.asm.h: %.asm.h: %.asm
diff --git a/lib/mesa/src/gallium/drivers/nouveau/codegen/lib/gk104.asm b/lib/mesa/src/gallium/drivers/nouveau/codegen/lib/gk104.asm
index 21a6b4de6..cd65b5472 100644
--- a/lib/mesa/src/gallium/drivers/nouveau/codegen/lib/gk104.asm
+++ b/lib/mesa/src/gallium/drivers/nouveau/codegen/lib/gk104.asm
@@ -543,8 +543,6 @@ $p2 suldgb b32 $r3 cg zero u8 g[$r4d] $r2 $p0
 $p1 suldgb b32 $r3 cv zero u8 g[$r4d] $r2 $p0
 long mov b32 $r3 0x3f800000
 long nop
-sched 0x00 0x00 0x00 0x00 0x00 0x00 0x00
-long nop
 long ret
 
 
@@ -556,144 +554,7 @@ long ret
 // SIZE:    9 * 8 bytes
 //
 gk104_rcp_f64:
-   // Step 1: classify input according to exponent and value, and calculate
-   // result for 0/inf/nan. $r2 holds the exponent value, which starts at
-   // bit 52 (bit 20 of the upper half) and is 11 bits in length
-   ext u32 $r2 $r1 0xb14
-   add b32 $r3 $r2 0xffffffff
-   joinat #rcp_rejoin
-   // We want to check whether the exponent is 0 or 0x7ff (i.e. NaN, inf,
-   // denorm, or 0). Do this by subtracting 1 from the exponent, which will
-   // mean that it's > 0x7fd in those cases when doing unsigned comparison
-   set $p0 0x1 gt u32 $r3 0x7fd
-   // $r3: 0 for norms, 0x36 for denorms, -1 for others
-   long mov b32 $r3 0x0
-   sched 0x2f 0x04 0x2d 0x2b 0x2f 0x28 0x28
-   join (not $p0) nop
-   // Process all special values: NaN, inf, denorm, 0
-   mov b32 $r3 0xffffffff
-   // A number is NaN if its abs value is greater than or unordered with inf
-   set $p0 0x1 gtu f64 abs $r0d 0x7ff0000000000000
-   (not $p0) bra #rcp_inf_or_denorm_or_zero
-   // NaN -> NaN, the next line sets the "quiet" bit of the result. This
-   // behavior is both seen on the CPU and the blob
-   join or b32 $r1 $r1 0x80000
-rcp_inf_or_denorm_or_zero:
-   and b32 $r4 $r1 0x7ff00000
-   // Other values with nonzero in exponent field should be inf
-   set $p0 0x1 eq s32 $r4 0x0
-   sched 0x2b 0x04 0x2f 0x2d 0x2b 0x2f 0x20
-   $p0 bra #rcp_denorm_or_zero
-   // +/-Inf -> +/-0
-   xor b32 $r1 $r1 0x7ff00000
-   join mov b32 $r0 0x0
-rcp_denorm_or_zero:
-   set $p0 0x1 gtu f64 abs $r0d 0x0
-   $p0 bra #rcp_denorm
-   // +/-0 -> +/-Inf
-   join or b32 $r1 $r1 0x7ff00000
-rcp_denorm:
-   // non-0 denorms: multiply with 2^54 (the 0x36 in $r3), join with norms
-   mul rn f64 $r0d $r0d 0x4350000000000000
-   sched 0x2f 0x28 0x2b 0x28 0x28 0x04 0x28
-   join mov b32 $r3 0x36
-rcp_rejoin:
-   // All numbers with -1 in $r3 have their result ready in $r0d, return them
-   // others need further calculation
-   set $p0 0x1 lt s32 $r3 0x0
-   $p0 bra #rcp_end
-   // Step 2: Before the real calculation goes on, renormalize the values to
-   // range [1, 2) by setting exponent field to 0x3ff (the exponent of 1)
-   // result in $r6d. The exponent will be recovered later.
-   ext u32 $r2 $r1 0xb14
-   and b32 $r7 $r1 0x800fffff
-   add b32 $r7 $r7 0x3ff00000
-   long mov b32 $r6 $r0
-   sched 0x2b 0x04 0x28 0x28 0x2a 0x2b 0x2e
-   // Step 3: Convert new value to float (no overflow will occur due to step
-   // 2), calculate rcp and do newton-raphson step once
-   cvt rz f32 $r5 f64 $r6d
-   long rcp f32 $r4 $r5
-   mov b32 $r0 0xbf800000
-   fma rn f32 $r5 $r4 $r5 $r0
-   fma rn f32 $r0 neg $r4 $r5 $r4
-   // Step 4: convert result $r0 back to double, do newton-raphson steps
-   cvt f64 $r0d f32 $r0
-   cvt f64 $r6d neg f64 $r6d
-   sched 0x2e 0x29 0x29 0x29 0x29 0x29 0x29
-   cvt f64 $r8d f32 0x3f800000
-   // 4 Newton-Raphson Steps, tmp in $r4d, result in $r0d
-   // The formula used here (and above) is:
-   //     RCP_{n + 1} = 2 * RCP_{n} - x * RCP_{n} * RCP_{n}
-   // The following code uses 2 FMAs for each step, and it will basically
-   // looks like:
-   //     tmp = -src * RCP_{n} + 1
-   //     RCP_{n + 1} = RCP_{n} * tmp + RCP_{n}
-   fma rn f64 $r4d $r6d $r0d $r8d
-   fma rn f64 $r0d $r0d $r4d $r0d
-   fma rn f64 $r4d $r6d $r0d $r8d
-   fma rn f64 $r0d $r0d $r4d $r0d
-   fma rn f64 $r4d $r6d $r0d $r8d
-   fma rn f64 $r0d $r0d $r4d $r0d
-   sched 0x29 0x20 0x28 0x28 0x28 0x28 0x28
-   fma rn f64 $r4d $r6d $r0d $r8d
-   fma rn f64 $r0d $r0d $r4d $r0d
-   // Step 5: Exponent recovery and final processing
-   // The exponent is recovered by adding what we added to the exponent.
-   // Suppose we want to calculate rcp(x), but we have rcp(cx), then
-   //     rcp(x) = c * rcp(cx)
-   // The delta in exponent comes from two sources:
-   //   1) The renormalization in step 2. The delta is:
-   //      0x3ff - $r2
-   //   2) (For the denorm input) The 2^54 we multiplied at rcp_denorm, stored
-   //      in $r3
-   // These 2 sources are calculated in the first two lines below, and then
-   // added to the exponent extracted from the result above.
-   // Note that after processing, the new exponent may >= 0x7ff (inf)
-   // or <= 0 (denorm). Those cases will be handled respectively below
-   subr b32 $r2 $r2 0x3ff
-   long add b32 $r4 $r2 $r3
-   ext u32 $r3 $r1 0xb14
-   // New exponent in $r3
-   long add b32 $r3 $r3 $r4
-   add b32 $r2 $r3 0xffffffff
-   sched 0x28 0x2b 0x28 0x2b 0x28 0x28 0x2b
-   // (exponent-1) < 0x7fe (unsigned) means the result is in norm range
-   // (same logic as in step 1)
-   set $p0 0x1 lt u32 $r2 0x7fe
-   (not $p0) bra #rcp_result_inf_or_denorm
-   // Norms: convert exponents back and return
-   shl b32 $r4 $r4 clamp 0x14
-   long add b32 $r1 $r4 $r1
-   bra #rcp_end
-rcp_result_inf_or_denorm:
-   // New exponent >= 0x7ff means that result is inf
-   set $p0 0x1 ge s32 $r3 0x7ff
-   (not $p0) bra #rcp_result_denorm
-   sched 0x20 0x25 0x28 0x2b 0x23 0x25 0x2f
-   // Infinity
-   and b32 $r1 $r1 0x80000000
-   long mov b32 $r0 0x0
-   add b32 $r1 $r1 0x7ff00000
-   bra #rcp_end
-rcp_result_denorm:
-   // Denorm result comes from huge input. The greatest possible fp64, i.e.
-   // 0x7fefffffffffffff's rcp is 0x0004000000000000, 1/4 of the smallest
-   // normal value. Other rcp result should be greater than that. If we
-   // set the exponent field to 1, we can recover the result by multiplying
-   // it with 1/2 or 1/4. 1/2 is used if the "exponent" $r3 is 0, otherwise
-   // 1/4 ($r3 should be -1 then). This is quite tricky but greatly simplifies
-   // the logic here.
-   set $p0 0x1 ne u32 $r3 0x0
-   and b32 $r1 $r1 0x800fffff
-   // 0x3e800000: 1/4
-   $p0 cvt f64 $r6d f32 0x3e800000
-   sched 0x2f 0x28 0x2c 0x2e 0x2a 0x20 0x27
-   // 0x3f000000: 1/2
-   (not $p0) cvt f64 $r6d f32 0x3f000000
-   add b32 $r1 $r1 0x00100000
-   mul rn f64 $r0d $r0d $r6d
-rcp_end:
+   long nop
    long ret
 
 // RSQ F64: Newton Raphson rsqrt(x): r_{i+1} = r_i * (1.5 - 0.5 * x * r_i * r_i)
@@ -704,73 +565,13 @@ rcp_end:
 // SIZE:    14 * 8 bytes
 //
 gk104_rsq_f64:
-   // Before getting initial result rsqrt64h, two special cases should be
-   // handled first.
-   // 1. NaN: set the highest bit in mantissa so it'll be surely recognized
-   //    as NaN in rsqrt64h
-   set $p0 0x1 gtu f64 abs $r0d 0x7ff0000000000000
-   $p0 or b32 $r1 $r1 0x00080000
-   and b32 $r2 $r1 0x7fffffff
-   sched 0x27 0x20 0x28 0x2c 0x25 0x28 0x28
-   // 2. denorms and small normal values: using their original value will
-   //    lose precision either at rsqrt64h or the first step in newton-raphson
-   //    steps below. Take 2 as a threshold in exponent field, and multiply
-   //    with 2^54 if the exponent is smaller or equal. (will multiply 2^27
-   //    to recover in the end)
-   ext u32 $r3 $r1 0xb14
-   set $p1 0x1 le u32 $r3 0x2
-   long or b32 $r2 $r0 $r2
-   $p1 mul rn f64 $r0d $r0d 0x4350000000000000
-   rsqrt64h $r5 $r1
-   // rsqrt64h will give correct result for 0/inf/nan, the following logic
-   // checks whether the input is one of those (exponent is 0x7ff or all 0
-   // except for the sign bit)
-   set b32 $r6 ne u32 $r3 0x7ff
-   long and b32 $r2 $r2 $r6
-   sched 0x28 0x2b 0x20 0x27 0x28 0x2e 0x28
-   set $p0 0x1 ne u32 $r2 0x0
-   $p0 bra #rsq_norm
-   // For 0/inf/nan, make sure the sign bit agrees with input and return
-   and b32 $r1 $r1 0x80000000
-   long mov b32 $r0 0x0
-   long or b32 $r1 $r1 $r5
-   long ret
-rsq_norm:
-   // For others, do 4 Newton-Raphson steps with the formula:
-   //     RSQ_{n + 1} = RSQ_{n} * (1.5 - 0.5 * x * RSQ_{n} * RSQ_{n})
-   // In the code below, each step is written as:
-   //     tmp1 = 0.5 * x * RSQ_{n}
-   //     tmp2 = -RSQ_{n} * tmp1 + 0.5
-   //     RSQ_{n + 1} = RSQ_{n} * tmp2 + RSQ_{n}
-   long mov b32 $r4 0x0
-   sched 0x2f 0x29 0x29 0x29 0x29 0x29 0x29
-   // 0x3f000000: 1/2
-   cvt f64 $r8d f32 0x3f000000
-   mul rn f64 $r2d $r0d $r8d
-   mul rn f64 $r0d $r2d $r4d
-   fma rn f64 $r6d neg $r4d $r0d $r8d
-   fma rn f64 $r4d $r4d $r6d $r4d
-   mul rn f64 $r0d $r2d $r4d
-   fma rn f64 $r6d neg $r4d $r0d $r8d
-   sched 0x29 0x29 0x29 0x29 0x29 0x29 0x29
-   fma rn f64 $r4d $r4d $r6d $r4d
-   mul rn f64 $r0d $r2d $r4d
-   fma rn f64 $r6d neg $r4d $r0d $r8d
-   fma rn f64 $r4d $r4d $r6d $r4d
-   mul rn f64 $r0d $r2d $r4d
-   fma rn f64 $r6d neg $r4d $r0d $r8d
-   fma rn f64 $r4d $r4d $r6d $r4d
-   sched 0x29 0x20 0x28 0x2e 0x00 0x00 0x00
-   // Multiply 2^27 to result for small inputs to recover
-   $p1 mul rn f64 $r4d $r4d 0x41a0000000000000
-   long mov b32 $r1 $r5
-   long mov b32 $r0 $r4
+   long nop
    long ret
 
 //
 // Trap handler.
 // Requires at least 4 GPRs and 32 bytes of l[] memory to temporarily save GPRs.
-// Low 32 bytes of l[] memory shouldn't be used if resumability is required.
+// Low 32 bytes of l[] memory shouldn't be used if resumability is required.
 //
 // Trap info:
 // 0x000: mutex
diff --git a/lib/mesa/src/gallium/drivers/nouveau/codegen/lib/gk104.asm.h b/lib/mesa/src/gallium/drivers/nouveau/codegen/lib/gk104.asm.h
index ed948dee4..37998768e 100644
--- a/lib/mesa/src/gallium/drivers/nouveau/codegen/lib/gk104.asm.h
+++ b/lib/mesa/src/gallium/drivers/nouveau/codegen/lib/gk104.asm.h
@@ -481,132 +481,12 @@ uint64_t gk104_builtin_code[] = {
 	0xd40040000840c785,
 	0x18fe00000000dde2,
 	0x4000000000001de4,
-	0x2000000000000007,
+	0x9000000000001de7,
+/* 0x0f08: gk104_rcp_f64 */
 	0x4000000000001de4,
 	0x9000000000001de7,
-/* 0x0f18: gk104_rcp_f64 */
-	0x7000c02c50109c03,
-	0x0bfffffffc20dc02,
-	0x6000000280000007,
-	0x1a0ec01ff431dc03,
-	0x180000000000dde2,
-	0x228282f2b2d042f7,
-	0x40000000000021f4,
-	0x1bfffffffc00dde2,
-	0x1e0edffc0001dc81,
-	0x40000000200021e7,
-	0x3800200000105c52,
-/* 0x0f70: rcp_inf_or_denorm_or_zero */
-	0x39ffc00000111c02,
-	0x190e0000fc41dc23,
-	0x2202f2b2d2f042b7,
-	0x40000000400001e7,
-	0x39ffc00000105c82,
-	0x1800000000001df2,
-/* 0x0fa0: rcp_denorm_or_zero */
-	0x1e0ec0000001dc81,
-	0x40000000200001e7,
-	0x39ffc00000105c52,
-/* 0x0fb8: rcp_denorm */
-	0x5000d0d400001c01,
-	0x2280428282b282f7,
-	0x18000000d800ddf2,
-/* 0x0fd0: rcp_rejoin */
-	0x188e0000fc31dc23,
-	0x40000006000001e7,
-	0x7000c02c50109c03,
-	0x3a003ffffc11dc02,
-	0x08ffc0000071dc02,
-	0x2800000000019de4,
-	0x22e2b2a2828042b7,
-	0x1006000019a15c04,
-	0xc800000010511c00,
-	0x1afe000000001de2,
-	0x3000000014415c00,
-	0x3008000014401e00,
-	0x1000000001301c04,
-	0x1000000019b19d04,
-	0x22929292929292e7,
-	0x1000cfe001321c04,
-	0x2010000000611c01,
-	0x2000000010001c01,
-	0x2010000000611c01,
-	0x2000000010001c01,
-	0x2010000000611c01,
-	0x2000000010001c01,
-	0x2282828282820297,
-	0x2010000000611c01,
-	0x2000000010001c01,
-	0x0800000ffc209e02,
-	0x480000000c211c03,
-	0x7000c02c5010dc03,
-	0x480000001030dc03,
-	0x0bfffffffc309c02,
-	0x22b28282b282b287,
-	0x188ec01ff821dc03,
-	0x40000000600021e7,
-	0x6000c00050411c03,
-	0x4800000004405c03,
-	0x40000001c0001de7,
-/* 0x10f0: rcp_result_inf_or_denorm */
-	0x1b0ec01ffc31dc23,
-	0x40000000a00021e7,
-	0x22f25232b2825207,
-	0x3a00000000105c02,
-	0x1800000000001de2,
-	0x09ffc00000105c02,
-	0x40000000e0001de7,
-/* 0x1128: rcp_result_denorm */
-	0x1a8e0000fc31dc03,
-	0x3a003ffffc105c02,
-	0x1000cfa001318004,
-	0x227202a2e2c282f7,
-	0x1000cfc00131a004,
-	0x0800400000105c02,
-	0x5000000018001c01,
-/* 0x1160: rcp_end */
-	0x9000000000001de7,
-/* 0x1168: gk104_rsq_f64 */
-	0x1e0edffc0001dc81,
-	0x3800200000104042,
-	0x39fffffffc109c02,
-	0x22828252c2820277,
-	0x7000c02c5010dc03,
-	0x198ec0000833dc03,
-	0x6800000008009c43,
-	0x5000d0d400000401,
-	0xc80000001c115c00,
-	0x128ec01ffc319c03,
-	0x6800000018209c03,
-	0x2282e2827202b287,
-	0x1a8e0000fc21dc03,
-	0x40000000800001e7,
-	0x3a00000000105c02,
-	0x1800000000001de2,
-	0x6800000014105c43,
-	0x9000000000001de7,
-/* 0x11f8: rsq_norm */
-	0x1800000000011de2,
-	0x22929292929292f7,
-	0x1000cfc001321c04,
-	0x5000000020009c01,
-	0x5000000010201c01,
-	0x2010000000419e01,
-	0x2008000018411c01,
-	0x5000000010201c01,
-	0x2010000000419e01,
-	0x2292929292929297,
-	0x2008000018411c01,
-	0x5000000010201c01,
-	0x2010000000419e01,
-	0x2008000018411c01,
-	0x5000000010201c01,
-	0x2010000000419e01,
-	0x2008000018411c01,
-	0x20000002e2820297,
-	0x5000d06800410401,
-	0x2800000014005de4,
-	0x2800000010001de4,
+/* 0x0f18: gk104_rsq_f64 */
+	0x4000000000001de4,
 	0x9000000000001de7,
 	0xc800000003f01cc5,
 	0x2c00000100005c04,
@@ -615,7 +495,7 @@ uint64_t gk104_builtin_code[] = {
 	0x680100000c1fdc03,
 	0x4000000a60001c47,
 	0x180000004000dde2,
-/* 0x12e0: spill_cfstack */
+/* 0x0f60: spill_cfstack */
 	0x78000009c0000007,
 	0x0c0000000430dd02,
 	0x4003ffffa0001ca7,
@@ -663,14 +543,14 @@ uint64_t gk104_builtin_code[] = {
 	0x4000000100001ea7,
 	0x480100000c001c03,
 	0x0800000000105c42,
-/* 0x1458: shared_loop */
+/* 0x10d8: shared_loop */
 	0xc100000000309c85,
 	0x9400000500009c85,
 	0x0c00000010001d02,
 	0x0800000000105d42,
 	0x0c0000001030dd02,
 	0x4003ffff40001ca7,
-/* 0x1488: shared_done */
+/* 0x1108: shared_done */
 	0x2800406420001de4,
 	0x2800406430005de4,
 	0xe000000000001c45,
@@ -684,7 +564,7 @@ uint64_t gk104_builtin_code[] = {
 	0x480000000c209c03,
 	0x4801000008001c03,
 	0x0800000000105c42,
-/* 0x14f0: search_cstack */
+/* 0x1170: search_cstack */
 	0x280040646000dde4,
 	0x8400000020009f05,
 	0x190ec0002821dc03,
@@ -693,17 +573,17 @@ uint64_t gk104_builtin_code[] = {
 	0x0800000000105c42,
 	0x0c0000004030dd02,
 	0x00029dff0ffc5cbf,
-/* 0x1530: entry_found */
+/* 0x11b0: entry_found */
 	0x8400000000009f85,
 	0x2800406400001de4,
 	0x2800406410005de4,
 	0x9400000010009c85,
 	0x4000000000001df4,
-/* 0x1558: end_exit */
+/* 0x11d8: end_exit */
 	0x9800000003ffdcc5,
 	0xd000000000008007,
 	0xa000000000004007,
-/* 0x1570: end_cont */
+/* 0x11f0: end_cont */
 	0xd000000000008007,
 	0x3400c3fffc201c04,
 	0xc000000003f01ec5,
@@ -713,6 +593,6 @@ uint64_t gk104_builtin_code[] = {
 uint64_t gk104_builtin_offsets[] = {
 	0x0000000000000000,
 	0x00000000000000f0,
+	0x0000000000000f08,
 	0x0000000000000f18,
-	0x0000000000001168,
 };
diff --git a/lib/mesa/src/gallium/drivers/nouveau/codegen/lib/gk110.asm b/lib/mesa/src/gallium/drivers/nouveau/codegen/lib/gk110.asm
index 66626b471..b9c05a04b 100644
--- a/lib/mesa/src/gallium/drivers/nouveau/codegen/lib/gk110.asm
+++ b/lib/mesa/src/gallium/drivers/nouveau/codegen/lib/gk110.asm
@@ -83,229 +83,12 @@ gk110_div_s32:
    $p0 sub b32 $r1 $r1 $r2
    $p0 add b32 $r0 $r0 0x1
    $p3 cvt s32 $r0 neg s32 $r0
-   sched 0x04 0x2e 0x28 0x04 0x28 0x28 0x28
+   sched 0x04 0x2e 0x04 0x28 0x04 0x20 0x2c
    $p2 cvt s32 $r1 neg s32 $r1
    ret
 
-// RCP F64
-//
-// INPUT:   $r0d
-// OUTPUT:  $r0d
-// CLOBBER: $r2 - $r9, $p0
-//
-// The core of RCP and RSQ implementation is Newton-Raphson step, which is
-// used to find successively better approximation from an imprecise initial
-// value (single precision rcp in RCP and rsqrt64h in RSQ).
-//
 gk110_rcp_f64:
-   // Step 1: classify input according to exponent and value, and calculate
-   // result for 0/inf/nan. $r2 holds the exponent value, which starts at
-   // bit 52 (bit 20 of the upper half) and is 11 bits in length
-   ext u32 $r2 $r1 0xb14
-   add b32 $r3 $r2 0xffffffff
-   joinat #rcp_rejoin
-   // We want to check whether the exponent is 0 or 0x7ff (i.e. NaN, inf,
-   // denorm, or 0). Do this by subtracting 1 from the exponent, which will
-   // mean that it's > 0x7fd in those cases when doing unsigned comparison
-   set b32 $p0 0x1 gt u32 $r3 0x7fd
-   // $r3: 0 for norms, 0x36 for denorms, -1 for others
-   mov b32 $r3 0x0
-   sched 0x2f 0x04 0x2d 0x2b 0x2f 0x28 0x28
-   join (not $p0) nop
-   // Process all special values: NaN, inf, denorm, 0
-   mov b32 $r3 0xffffffff
-   // A number is NaN if its abs value is greater than or unordered with inf
-   set $p0 0x1 gtu f64 abs $r0d 0x7ff0000000000000
-   (not $p0) bra #rcp_inf_or_denorm_or_zero
-   // NaN -> NaN, the next line sets the "quiet" bit of the result. This
-   // behavior is both seen on the CPU and the blob
-   join or b32 $r1 $r1 0x80000
-rcp_inf_or_denorm_or_zero:
-   and b32 $r4 $r1 0x7ff00000
-   // Other values with nonzero in exponent field should be inf
-   set b32 $p0 0x1 eq s32 $r4 0x0
-   sched 0x2b 0x04 0x2f 0x2d 0x2b 0x2f 0x20
-   $p0 bra #rcp_denorm_or_zero
-   // +/-Inf -> +/-0
-   xor b32 $r1 $r1 0x7ff00000
-   join mov b32 $r0 0x0
-rcp_denorm_or_zero:
-   set $p0 0x1 gtu f64 abs $r0d 0x0
-   $p0 bra #rcp_denorm
-   // +/-0 -> +/-Inf
-   join or b32 $r1 $r1 0x7ff00000
-rcp_denorm:
-   // non-0 denorms: multiply with 2^54 (the 0x36 in $r3), join with norms
-   mul rn f64 $r0d $r0d 0x4350000000000000
-   sched 0x2f 0x28 0x2b 0x28 0x28 0x04 0x28
-   join mov b32 $r3 0x36
-rcp_rejoin:
-   // All numbers with -1 in $r3 have their result ready in $r0d, return them
-   // others need further calculation
-   set b32 $p0 0x1 lt s32 $r3 0x0
-   $p0 bra #rcp_end
-   // Step 2: Before the real calculation goes on, renormalize the values to
-   // range [1, 2) by setting exponent field to 0x3ff (the exponent of 1)
-   // result in $r6d. The exponent will be recovered later.
-   ext u32 $r2 $r1 0xb14
-   and b32 $r7 $r1 0x800fffff
-   add b32 $r7 $r7 0x3ff00000
-   mov b32 $r6 $r0
-   sched 0x2b 0x04 0x28 0x28 0x2a 0x2b 0x2e
-   // Step 3: Convert new value to float (no overflow will occur due to step
-   // 2), calculate rcp and do newton-raphson step once
-   cvt rz f32 $r5 f64 $r6d
-   rcp f32 $r4 $r5
-   mov b32 $r0 0xbf800000
-   fma rn f32 $r5 $r4 $r5 $r0
-   fma rn f32 $r0 neg $r4 $r5 $r4
-   // Step 4: convert result $r0 back to double, do newton-raphson steps
-   cvt f64 $r0d f32 $r0
-   cvt f64 $r6d f64 neg $r6d
-   sched 0x2e 0x29 0x29 0x29 0x29 0x29 0x29
-   cvt f64 $r8d f32 0x3f800000
-   // 4 Newton-Raphson Steps, tmp in $r4d, result in $r0d
-   // The formula used here (and above) is:
-   //     RCP_{n + 1} = 2 * RCP_{n} - x * RCP_{n} * RCP_{n}
-   // The following code uses 2 FMAs for each step, and it will basically
-   // looks like:
-   //     tmp = -src * RCP_{n} + 1
-   //     RCP_{n + 1} = RCP_{n} * tmp + RCP_{n}
-   fma rn f64 $r4d $r6d $r0d $r8d
-   fma rn f64 $r0d $r0d $r4d $r0d
-   fma rn f64 $r4d $r6d $r0d $r8d
-   fma rn f64 $r0d $r0d $r4d $r0d
-   fma rn f64 $r4d $r6d $r0d $r8d
-   fma rn f64 $r0d $r0d $r4d $r0d
-   sched 0x29 0x20 0x28 0x28 0x28 0x28 0x28
-   fma rn f64 $r4d $r6d $r0d $r8d
-   fma rn f64 $r0d $r0d $r4d $r0d
-   // Step 5: Exponent recovery and final processing
-   // The exponent is recovered by adding what we added to the exponent.
-   // Suppose we want to calculate rcp(x), but we have rcp(cx), then
-   //     rcp(x) = c * rcp(cx)
-   // The delta in exponent comes from two sources:
-   //   1) The renormalization in step 2. The delta is:
-   //      0x3ff - $r2
-   //   2) (For the denorm input) The 2^54 we multiplied at rcp_denorm, stored
-   //      in $r3
-   // These 2 sources are calculated in the first two lines below, and then
-   // added to the exponent extracted from the result above.
-   // Note that after processing, the new exponent may >= 0x7ff (inf)
-   // or <= 0 (denorm). Those cases will be handled respectively below
-   subr b32 $r2 $r2 0x3ff
-   add b32 $r4 $r2 $r3
-   ext u32 $r3 $r1 0xb14
-   // New exponent in $r3
-   add b32 $r3 $r3 $r4
-   add b32 $r2 $r3 0xffffffff
-   sched 0x28 0x2b 0x28 0x2b 0x28 0x28 0x2b
-   // (exponent-1) < 0x7fe (unsigned) means the result is in norm range
-   // (same logic as in step 1)
-   set b32 $p0 0x1 lt u32 $r2 0x7fe
-   (not $p0) bra #rcp_result_inf_or_denorm
-   // Norms: convert exponents back and return
-   shl b32 $r4 $r4 clamp 0x14
-   add b32 $r1 $r4 $r1
-   bra #rcp_end
-rcp_result_inf_or_denorm:
-   // New exponent >= 0x7ff means that result is inf
-   set b32 $p0 0x1 ge s32 $r3 0x7ff
-   (not $p0) bra #rcp_result_denorm
-   sched 0x20 0x25 0x28 0x2b 0x23 0x25 0x2f
-   // Infinity
-   and b32 $r1 $r1 0x80000000
-   mov b32 $r0 0x0
-   add b32 $r1 $r1 0x7ff00000
-   bra #rcp_end
-rcp_result_denorm:
-   // Denorm result comes from huge input. The greatest possible fp64, i.e.
-   // 0x7fefffffffffffff's rcp is 0x0004000000000000, 1/4 of the smallest
-   // normal value. Other rcp result should be greater than that. If we
-   // set the exponent field to 1, we can recover the result by multiplying
-   // it with 1/2 or 1/4. 1/2 is used if the "exponent" $r3 is 0, otherwise
-   // 1/4 ($r3 should be -1 then). This is quite tricky but greatly simplifies
-   // the logic here.
-   set b32 $p0 0x1 ne u32 $r3 0x0
-   and b32 $r1 $r1 0x800fffff
-   // 0x3e800000: 1/4
-   $p0 cvt f64 $r6d f32 0x3e800000
-   sched 0x2f 0x28 0x2c 0x2e 0x2a 0x20 0x27
-   // 0x3f000000: 1/2
-   (not $p0) cvt f64 $r6d f32 0x3f000000
-   add b32 $r1 $r1 0x00100000
-   mul rn f64 $r0d $r0d $r6d
-rcp_end:
-   ret
-
-// RSQ F64
-//
-// INPUT:   $r0d
-// OUTPUT:  $r0d
-// CLOBBER: $r2 - $r9, $p0 - $p1
-//
 gk110_rsq_f64:
-   // Before getting initial result rsqrt64h, two special cases should be
-   // handled first.
-   // 1. NaN: set the highest bit in mantissa so it'll be surely recognized
-   //    as NaN in rsqrt64h
-   set $p0 0x1 gtu f64 abs $r0d 0x7ff0000000000000
-   $p0 or b32 $r1 $r1 0x00080000
-   and b32 $r2 $r1 0x7fffffff
-   sched 0x27 0x20 0x28 0x2c 0x25 0x28 0x28
-   // 2. denorms and small normal values: using their original value will
-   //    lose precision either at rsqrt64h or the first step in newton-raphson
-   //    steps below. Take 2 as a threshold in exponent field, and multiply
-   //    with 2^54 if the exponent is smaller or equal. (will multiply 2^27
-   //    to recover in the end)
-   ext u32 $r3 $r1 0xb14
-   set b32 $p1 0x1 le u32 $r3 0x2
-   or b32 $r2 $r0 $r2
-   $p1 mul rn f64 $r0d $r0d 0x4350000000000000
-   rsqrt64h f32 $r5 $r1
-   // rsqrt64h will give correct result for 0/inf/nan, the following logic
-   // checks whether the input is one of those (exponent is 0x7ff or all 0
-   // except for the sign bit)
-   set b32 $r6 ne u32 $r3 0x7ff
-   and b32 $r2 $r2 $r6
-   sched 0x28 0x2b 0x20 0x27 0x28 0x2e 0x28
-   set b32 $p0 0x1 ne u32 $r2 0x0
-   $p0 bra #rsq_norm
-   // For 0/inf/nan, make sure the sign bit agrees with input and return
-   and b32 $r1 $r1 0x80000000
-   mov b32 $r0 0x0
-   or b32 $r1 $r1 $r5
-   ret
-rsq_norm:
-   // For others, do 4 Newton-Raphson steps with the formula:
-   //     RSQ_{n + 1} = RSQ_{n} * (1.5 - 0.5 * x * RSQ_{n} * RSQ_{n})
-   // In the code below, each step is written as:
-   //     tmp1 = 0.5 * x * RSQ_{n}
-   //     tmp2 = -RSQ_{n} * tmp1 + 0.5
-   //     RSQ_{n + 1} = RSQ_{n} * tmp2 + RSQ_{n}
-   mov b32 $r4 0x0
-   sched 0x2f 0x29 0x29 0x29 0x29 0x29 0x29
-   // 0x3f000000: 1/2
-   cvt f64 $r8d f32 0x3f000000
-   mul rn f64 $r2d $r0d $r8d
-   mul rn f64 $r0d $r2d $r4d
-   fma rn f64 $r6d neg $r4d $r0d $r8d
-   fma rn f64 $r4d $r4d $r6d $r4d
-   mul rn f64 $r0d $r2d $r4d
-   fma rn f64 $r6d neg $r4d $r0d $r8d
-   sched 0x29 0x29 0x29 0x29 0x29 0x29 0x29
-   fma rn f64 $r4d $r4d $r6d $r4d
-   mul rn f64 $r0d $r2d $r4d
-   fma rn f64 $r6d neg $r4d $r0d $r8d
-   fma rn f64 $r4d $r4d $r6d $r4d
-   mul rn f64 $r0d $r2d $r4d
-   fma rn f64 $r6d neg $r4d $r0d $r8d
-   fma rn f64 $r4d $r4d $r6d $r4d
-   sched 0x29 0x20 0x28 0x2e 0x00 0x00 0x00
-   // Multiply 2^27 to result for small inputs to recover
-   $p1 mul rn f64 $r4d $r4d 0x41a0000000000000
-   mov b32 $r1 $r5
-   mov b32 $r0 $r4
    ret
 
 .section #gk110_builtin_offsets
diff --git a/lib/mesa/src/gallium/drivers/nouveau/codegen/lib/gk110.asm.h b/lib/mesa/src/gallium/drivers/nouveau/codegen/lib/gk110.asm.h
index 3d1523f2f..8d00e2a22 100644
--- a/lib/mesa/src/gallium/drivers/nouveau/codegen/lib/gk110.asm.h
+++ b/lib/mesa/src/gallium/drivers/nouveau/codegen/lib/gk110.asm.h
@@ -65,132 +65,11 @@ uint64_t gk110_builtin_code[] = {
 	0xe088000001000406,
 	0x4000000000800001,
 	0xe6010000000ce802,
-	0x08a0a0a010a0b810,
+	0x08b08010a010b810,
 	0xe60100000088e806,
 	0x19000000001c003c,
 /* 0x0218: gk110_rcp_f64 */
-	0xc00000058a1c0409,
-	0x407fffffff9c080d,
-	0x1480000050000000,
-	0xb3401c03fe9c0c1d,
-	0xe4c03c007f9c000e,
-	0x08a0a0bcacb410bc,
-	0x8580000000603c02,
-	0x747fffffff9fc00e,
-	0xb4601fff801c021d,
-	0x120000000420003c,
-	0x21000400005c0404,
-/* 0x0270: rcp_inf_or_denorm_or_zero */
-	0x203ff800001c0410,
-	0xb3281c00001c101d,
-	0x0880bcacb4bc10ac,
-	0x120000000800003c,
-	0x223ff800001c0404,
-	0xe4c03c007fdc0002,
-/* 0x02a0: rcp_denorm_or_zero */
-	0xb4601c00001c021d,
-	0x120000000400003c,
-	0x213ff800005c0404,
-/* 0x02b8: rcp_denorm */
-	0xc400021a801c0001,
-	0x08a010a0a0aca0bc,
-	0x740000001b5fc00e,
-/* 0x02d0: rcp_rejoin */
-	0xb3181c00001c0c1d,
-	0x12000000c000003c,
-	0xc00000058a1c0409,
-	0x204007ffff9c041c,
-	0x401ff800001c1c1d,
-	0xe4c03c00001c001a,
-	0x08b8aca8a0a010ac,
-	0xe5400c00031c3816,
-	0x84000000021c1412,
-	0x745fc000001fc002,
-	0xcc000000029c1016,
-	0xcc081000029c1002,
-	0xe5400000001c2c02,
-	0xe5410000031c3c1a,
-	0x08a4a4a4a4a4a4b8,
-	0xc54001fc001c2c21,
-	0xdb802000001c1812,
-	0xdb800000021c0002,
-	0xdb802000001c1812,
-	0xdb800000021c0002,
-	0xdb802000001c1812,
-	0xdb800000021c0002,
-	0x08a0a0a0a0a080a4,
-	0xdb802000001c1812,
-	0xdb800000021c0002,
-	0x48000001ff9c0809,
-	0xe0800000019c0812,
-	0xc00000058a1c040d,
-	0xe0800000021c0c0e,
-	0x407fffffff9c0c09,
-	0x08aca0a0aca0aca0,
-	0xb3101c03ff1c081d,
-	0x120000000c20003c,
-	0xc24000000a1c1011,
-	0xe0800000009c1006,
-	0x12000000381c003c,
-/* 0x03f0: rcp_result_inf_or_denorm */
-	0xb3681c03ff9c0c1d,
-	0x120000001420003c,
-	0x08bc948caca09480,
-	0x20400000001c0404,
-	0xe4c03c007f9c0002,
-	0x403ff800001c0405,
-	0x120000001c1c003c,
-/* 0x0428: rcp_result_denorm */
-	0xb3501c00001c0c1d,
-	0x204007ffff9c0404,
-	0xc54001f400002c19,
-	0x089c80a8b8b0a0bc,
-	0xc54001f800202c19,
-	0x40000800001c0405,
-	0xe4000000031c0002,
-/* 0x0460: rcp_end */
-	0x19000000001c003c,
-/* 0x0468: gk110_rsq_f64 */
-	0xb4601fff801c021d,
-	0x2100040000000404,
-	0x203fffffff9c0408,
-	0x08a0a094b0a0809c,
-	0xc00000058a1c040d,
-	0xb3301c00011c0c3d,
-	0xe2001000011c000a,
-	0xc400021a80040001,
-	0x84000000039c0416,
-	0xb2d01c03ff9c0c19,
-	0xe2000000031c080a,
-	0x08a0b8a09c80aca0,
-	0xb3501c00001c081d,
-	0x120000001000003c,
-	0x20400000001c0404,
-	0xe4c03c007f9c0002,
-	0xe2001000029c0406,
-	0x19000000001c003c,
-/* 0x04f8: rsq_norm */
-	0xe4c03c007f9c0012,
-	0x08a4a4a4a4a4a4bc,
-	0xc54001f8001c2c21,
-	0xe4000000041c000a,
-	0xe4000000021c0802,
-	0xdb882000001c101a,
-	0xdb801000031c1012,
-	0xe4000000021c0802,
-	0xdb882000001c101a,
-	0x08a4a4a4a4a4a4a4,
-	0xdb801000031c1012,
-	0xe4000000021c0802,
-	0xdb882000001c101a,
-	0xdb801000031c1012,
-	0xe4000000021c0802,
-	0xdb882000001c101a,
-	0xdb801000031c1012,
-	0x08000000b8a080a4,
-	0xc400020d00041011,
-	0xe4c03c00029c0006,
-	0xe4c03c00021c0002,
+/* 0x0218: gk110_rsq_f64 */
 	0x19000000001c003c,
 };
 
@@ -198,5 +77,5 @@ uint64_t gk110_builtin_offsets[] = {
 	0x0000000000000000,
 	0x00000000000000f0,
 	0x0000000000000218,
-	0x0000000000000468,
+	0x0000000000000218,
 };
diff --git a/lib/mesa/src/gallium/drivers/nouveau/codegen/nv50_ir_bb.cpp b/lib/mesa/src/gallium/drivers/nouveau/codegen/nv50_ir_bb.cpp
index 76fee8c79..fa8ee072a 100644
--- a/lib/mesa/src/gallium/drivers/nouveau/codegen/nv50_ir_bb.cpp
+++ b/lib/mesa/src/gallium/drivers/nouveau/codegen/nv50_ir_bb.cpp
@@ -291,7 +291,7 @@ void BasicBlock::permuteAdjacent(Instruction *a, Instruction *b)
 
    if (b->prev)
       b->prev->next = b;
-   if (a->next)
+   if (a->prev)
       a->next->prev = a;
 }
 
@@ -536,6 +536,9 @@ Function::printCFGraph(const char *filePath)
          case Graph::Edge::BACK:
             fprintf(out, "\t%i -> %i;\n", idA, idB);
             break;
+         case Graph::Edge::DUMMY:
+            fprintf(out, "\t%i -> %i [style=dotted];\n", idA, idB);
+            break;
          default:
             assert(0);
             break;
diff --git a/lib/mesa/src/gallium/drivers/nouveau/codegen/nv50_ir_build_util.cpp b/lib/mesa/src/gallium/drivers/nouveau/codegen/nv50_ir_build_util.cpp
index 20ed5cd52..19418c0e0 100644
--- a/lib/mesa/src/gallium/drivers/nouveau/codegen/nv50_ir_build_util.cpp
+++ b/lib/mesa/src/gallium/drivers/nouveau/codegen/nv50_ir_build_util.cpp
@@ -44,8 +44,6 @@ BuildUtil::init(Program *prog)
    bb = NULL;
    pos = NULL;
 
-   tail = false;
-
    memset(imms, 0, sizeof(imms));
    immCount = 0;
 }
@@ -340,7 +338,7 @@ BuildUtil::mkClobber(DataFile f, uint32_t rMask, int unit)
       int base2 = (baseSize2[mask] >>  8) & 0xf;
       int size2 = (baseSize2[mask] >> 12) & 0xf;
       Instruction *insn = mkOp(OP_NOP, TYPE_NONE, NULL);
-      if (true) { // size1 can't be 0
+      if (1) { // size1 can't be 0
          LValue *reg = new_LValue(func, f);
          reg->reg.size = size1 << unit;
          reg->reg.data.id = base + base1;
@@ -356,18 +354,6 @@ BuildUtil::mkClobber(DataFile f, uint32_t rMask, int unit)
 }
 
 ImmediateValue *
-BuildUtil::mkImm(uint16_t u)
-{
-   ImmediateValue *imm = new_ImmediateValue(prog, (uint32_t)0);
-
-   imm->reg.size = 2;
-   imm->reg.type = TYPE_U16;
-   imm->reg.data.u32 = u;
-
-   return imm;
-}
-
-ImmediateValue *
 BuildUtil::mkImm(uint32_t u)
 {
    unsigned int pos = u32Hash(u);
@@ -406,12 +392,6 @@ BuildUtil::mkImm(float f)
    return mkImm(u.u32);
 }
 
-ImmediateValue *
-BuildUtil::mkImm(double d)
-{
-   return new_ImmediateValue(prog, d);
-}
-
 Value *
 BuildUtil::loadImm(Value *dst, float f)
 {
@@ -419,18 +399,6 @@ BuildUtil::loadImm(Value *dst, float f)
 }
 
 Value *
-BuildUtil::loadImm(Value *dst, double d)
-{
-   return mkOp1v(OP_MOV, TYPE_F64, dst ? dst : getScratch(8), mkImm(d));
-}
-
-Value *
-BuildUtil::loadImm(Value *dst, uint16_t u)
-{
-   return mkOp1v(OP_MOV, TYPE_U16, dst ? dst : getScratch(2), mkImm(u));
-}
-
-Value *
 BuildUtil::loadImm(Value *dst, uint32_t u)
 {
    return mkOp1v(OP_MOV, TYPE_U32, dst ? dst : getScratch(), mkImm(u));
@@ -486,16 +454,6 @@ BuildUtil::mkSysVal(SVSemantic svName, uint32_t svIndex)
    return sym;
 }
 
-Symbol *
-BuildUtil::mkTSVal(TSSemantic tsName)
-{
-   Symbol *sym = new_Symbol(prog, FILE_THREAD_STATE, 0);
-   sym->reg.type = TYPE_U32;
-   sym->reg.size = typeSizeof(sym->reg.type);
-   sym->reg.data.ts = tsName;
-   return sym;
-}
-
 void
 BuildUtil::DataArray::setup(unsigned array, unsigned arrayIdx,
                             uint32_t base, int len, int vecDim, int eltSize,
@@ -529,7 +487,7 @@ BuildUtil::DataArray::acquire(ValueMap &m, int i, int c)
 
       return v;
    } else {
-      return up->getScratch(eltSize);
+      return up->getScratch();
    }
 }
 
@@ -597,12 +555,6 @@ BuildUtil::split64BitOpPostRA(Function *fn, Instruction *i,
    switch (i->dType) {
    case TYPE_U64: hTy = TYPE_U32; break;
    case TYPE_S64: hTy = TYPE_S32; break;
-   case TYPE_F64:
-      if (i->op == OP_MOV) {
-         hTy = TYPE_U32;
-         break;
-      }
-      FALLTHROUGH;
    default:
       return NULL;
    }
@@ -615,7 +567,6 @@ BuildUtil::split64BitOpPostRA(Function *fn, Instruction *i,
          return NULL;
       srcNr = 2;
       break;
-   case OP_SELP: srcNr = 3; break;
    default:
       // TODO when needed
       return NULL;
@@ -632,10 +583,7 @@ BuildUtil::split64BitOpPostRA(Function *fn, Instruction *i,
 
    for (int s = 0; s < srcNr; ++s) {
       if (lo->getSrc(s)->reg.size < 8) {
-         if (s == 2)
-            hi->setSrc(s, lo->getSrc(s));
-         else
-            hi->setSrc(s, zero);
+         hi->setSrc(s, zero);
       } else {
          if (lo->getSrc(s)->refCount() > 1)
             lo->setSrc(s, cloneShallow(fn, lo->getSrc(s)));
@@ -649,7 +597,6 @@ BuildUtil::split64BitOpPostRA(Function *fn, Instruction *i,
          case FILE_MEMORY_CONST:
          case FILE_MEMORY_SHARED:
          case FILE_SHADER_INPUT:
-         case FILE_SHADER_OUTPUT:
             hi->getSrc(s)->reg.data.offset += 4;
             break;
          default:
@@ -660,7 +607,7 @@ BuildUtil::split64BitOpPostRA(Function *fn, Instruction *i,
       }
    }
    if (srcNr == 2) {
-      lo->setFlagsDef(1, carry);
+      lo->setDef(1, carry);
       hi->setFlagsSrc(hi->srcCount(), carry);
    }
    return hi;
diff --git a/lib/mesa/src/gallium/drivers/nouveau/codegen/nv50_ir_build_util.h b/lib/mesa/src/gallium/drivers/nouveau/codegen/nv50_ir_build_util.h
index 5c3a01df9..a610c773f 100644
--- a/lib/mesa/src/gallium/drivers/nouveau/codegen/nv50_ir_build_util.h
+++ b/lib/mesa/src/gallium/drivers/nouveau/codegen/nv50_ir_build_util.h
@@ -66,7 +66,6 @@ public:
    Instruction *mkMov(Value *, Value *, DataType = TYPE_U32);
    Instruction *mkMovToReg(int id, Value *);
    Instruction *mkMovFromReg(Value *, int id);
-   inline Instruction *mkBMov(Value *, Value *);
 
    Instruction *mkInterp(unsigned mode, Value *, int32_t offset, Value *rel);
    Instruction *mkFetch(Value *, DataType, DataFile, int32_t offset,
@@ -74,8 +73,8 @@ public:
 
    Instruction *mkCvt(operation, DataType, Value *, DataType, Value *);
    CmpInstruction *mkCmp(operation, CondCode, DataType,
-                         Value *,
-                         DataType, Value *, Value *, Value * = NULL);
+			 Value *,
+			 DataType, Value *, Value *, Value * = NULL);
    TexInstruction *mkTex(operation, TexTarget,
                          uint16_t tic, uint16_t tsc,
                          const std::vector<Value *> &def,
@@ -91,16 +90,12 @@ public:
    void mkClobber(DataFile file, uint32_t regMask, int regUnitLog2);
 
    ImmediateValue *mkImm(float);
-   ImmediateValue *mkImm(double);
-   ImmediateValue *mkImm(uint16_t);
    ImmediateValue *mkImm(uint32_t);
    ImmediateValue *mkImm(uint64_t);
 
    ImmediateValue *mkImm(int i) { return mkImm((uint32_t)i); }
 
    Value *loadImm(Value *dst, float);
-   Value *loadImm(Value *dst, double);
-   Value *loadImm(Value *dst, uint16_t);
    Value *loadImm(Value *dst, uint32_t);
    Value *loadImm(Value *dst, uint64_t);
 
@@ -140,9 +135,7 @@ public:
    class DataArray
    {
    public:
-      DataArray(BuildUtil *bld) : up(bld), array(0), arrayIdx(0), baseAddr(0),
-         arrayLen(0), baseSym(NULL), vecDim(0), eltSize(0), file(FILE_NULL),
-         regOnly(false) { }
+      DataArray(BuildUtil *bld) : up(bld) { }
 
       void setup(unsigned array, unsigned arrayIdx,
                  uint32_t base, int len, int vecDim, int eltSize,
@@ -179,7 +172,6 @@ public:
                     DataType ty, uint32_t baseAddress);
 
    Symbol *mkSysVal(SVSemantic svName, uint32_t svIndex);
-   Symbol *mkTSVal(TSSemantic tsName);
 
 private:
    void init(Program *);
@@ -301,17 +293,11 @@ BuildUtil::mkOp3v(operation op, DataType ty, Value *dst,
 inline LValue *
 BuildUtil::mkLoadv(DataType ty, Symbol *mem, Value *ptr)
 {
-   LValue *dst = getScratch(typeSizeof(ty));
+   LValue *dst = getScratch();
    mkLoad(ty, dst, mem, ptr);
    return dst;
 }
 
-inline Instruction *
-BuildUtil::mkBMov(Value *dst, Value *src)
-{
-   return mkCvt(OP_CVT, TYPE_U32, dst, TYPE_U32, src);
-}
-
 bool
 BuildUtil::DataArray::exists(ValueMap &m, unsigned int i, unsigned int c)
 {
diff --git a/lib/mesa/src/gallium/drivers/nouveau/codegen/nv50_ir_emit_nv50.cpp b/lib/mesa/src/gallium/drivers/nouveau/codegen/nv50_ir_emit_nv50.cpp
index 1a0c63b70..90147668c 100644
--- a/lib/mesa/src/gallium/drivers/nouveau/codegen/nv50_ir_emit_nv50.cpp
+++ b/lib/mesa/src/gallium/drivers/nouveau/codegen/nv50_ir_emit_nv50.cpp
@@ -33,12 +33,14 @@ namespace nv50_ir {
 class CodeEmitterNV50 : public CodeEmitter
 {
 public:
-   CodeEmitterNV50(Program::Type, const TargetNV50 *);
+   CodeEmitterNV50(const TargetNV50 *);
 
    virtual bool emitInstruction(Instruction *);
 
    virtual uint32_t getMinEncodingSize(const Instruction *) const;
 
+   inline void setProgramType(Program::Type pType) { progType = pType; }
+
    virtual void prepareEmission(Function *);
 
 private:
@@ -94,12 +96,9 @@ private:
    void emitUADD(const Instruction *);
    void emitAADD(const Instruction *);
    void emitFADD(const Instruction *);
-   void emitDADD(const Instruction *);
    void emitIMUL(const Instruction *);
    void emitFMUL(const Instruction *);
-   void emitDMUL(const Instruction *);
    void emitFMAD(const Instruction *);
-   void emitDMAD(const Instruction *);
    void emitIMAD(const Instruction *);
    void emitISAD(const Instruction *);
 
@@ -270,7 +269,7 @@ CodeEmitterNV50::emitFlagsWr(const Instruction *i)
       for (int d = 0; i->defExists(d); ++d)
          if (i->def(d).getFile() == FILE_FLAGS)
             flagsDef = d;
-      if (flagsDef >= 0 && false) // TODO: enforce use of flagsDef at some point
+      if (flagsDef >= 0 && 0) // TODO: enforce use of flagsDef at some point
          WARN("Instruction::flagsDef was not set properly\n");
    }
    if (flagsDef == 0 && i->defExists(1))
@@ -373,7 +372,7 @@ CodeEmitterNV50::setSrcFileBits(const Instruction *i, int enc)
          mode |= 3 << (s * 2);
          break;
       default:
-         ERROR("invalid file on source %i: %u\n", s, i->src(s).getFile());
+	      ERROR("invalid file on source %i: %u\n", s, i->src(s).getFile());
          assert(0);
          break;
       }
@@ -439,9 +438,9 @@ CodeEmitterNV50::setSrcFileBits(const Instruction *i, int enc)
       return;
 
    if ((mode & 3) == 1) {
-      const int pos = ((mode >> 2) & 3) == 3 ? 13 : 14;
+      const int pos = i->src(1).getFile() == FILE_IMMEDIATE ? 13 : 14;
 
-      switch (i->sType) {
+      switch (i->getSrc(0)->reg.type) {
       case TYPE_U8:
          break;
       case TYPE_U16:
@@ -525,8 +524,7 @@ CodeEmitterNV50::emitForm_ADD(const Instruction *i)
 
    setSrcFileBits(i, NV50_OP_ENC_LONG_ALT);
    setSrc(i, 0, 0);
-   if (i->predSrc != 1)
-      setSrc(i, 1, 2);
+   setSrc(i, 1, 2);
 
    if (i->getIndirect(0, 0)) {
       assert(!i->getIndirect(1, 0));
@@ -619,7 +617,7 @@ void
 CodeEmitterNV50::emitLOAD(const Instruction *i)
 {
    DataFile sf = i->src(0).getFile();
-   ASSERTED int32_t offset = i->getSrc(0)->reg.data.offset;
+   int32_t offset = i->getSrc(0)->reg.data.offset;
 
    switch (sf) {
    case FILE_SHADER_INPUT:
@@ -642,9 +640,6 @@ CodeEmitterNV50::emitLOAD(const Instruction *i)
             code[1] |= 0x04000000;
 
          emitLoadStoreSizeCS(i->sType);
-
-         if (i->subOp == NV50_IR_SUBOP_LOAD_LOCKED)
-            code[1] |= 0x00800000;
       } else {
          assert(offset <= (int32_t)(0x1f * typeSizeof(i->sType)));
          code[0] = 0x10000001;
@@ -715,8 +710,6 @@ CodeEmitterNV50::emitSTORE(const Instruction *i)
    case FILE_MEMORY_SHARED:
       code[0] = 0x00000001;
       code[1] = 0xe0000000;
-      if (i->subOp == NV50_IR_SUBOP_STORE_UNLOCKED)
-         code[1] |= 0x00800000;
       switch (typeSizeof(i->dType)) {
       case 1:
          code[0] |= offset << 9;
@@ -760,10 +753,10 @@ CodeEmitterNV50::emitMOV(const Instruction *i)
    assert(sf == FILE_GPR || df == FILE_GPR);
 
    if (sf == FILE_FLAGS) {
-      assert(i->flagsSrc >= 0);
       code[0] = 0x00000001;
       code[1] = 0x20000000;
       defId(i->def(0), 2);
+      srcId(i->src(0), 12);
       emitFlagsRd(i);
    } else
    if (sf == FILE_ADDRESS) {
@@ -774,31 +767,26 @@ CodeEmitterNV50::emitMOV(const Instruction *i)
       emitFlagsRd(i);
    } else
    if (df == FILE_FLAGS) {
-      assert(i->flagsDef >= 0);
       code[0] = 0x00000001;
       code[1] = 0xa0000000;
+      defId(i->def(0), 4);
       srcId(i->src(0), 9);
       emitFlagsRd(i);
-      emitFlagsWr(i);
    } else
    if (sf == FILE_IMMEDIATE) {
-      code[0] = 0x10000001;
+      code[0] = 0x10008001;
       code[1] = 0x00000003;
       emitForm_IMM(i);
-
-      code[0] |= (typeSizeof(i->dType) == 2) ? 0 : 0x00008000;
    } else {
       if (i->encSize == 4) {
-         code[0] = 0x10000000;
-         code[0] |= (typeSizeof(i->dType) == 2) ? 0 : 0x00008000;
-         defId(i->def(0), 2);
+         code[0] = 0x10008000;
       } else {
          code[0] = 0x10000001;
          code[1] = (typeSizeof(i->dType) == 2) ? 0 : 0x04000000;
          code[1] |= (i->lanes << 14);
-         setDst(i, 0);
          emitFlagsRd(i);
       }
+      defId(i->def(0), 2);
       srcId(i->src(0), 9);
    }
    if (df == FILE_SHADER_OUTPUT) {
@@ -848,7 +836,7 @@ CodeEmitterNV50::emitQUADOP(const Instruction *i, uint8_t lane, uint8_t quOp)
 
    emitForm_ADD(i);
 
-   if (!i->srcExists(1) || i->predSrc == 1)
+   if (!i->srcExists(1))
       srcId(i->src(0), 32 + 14);
 }
 
@@ -889,36 +877,12 @@ CodeEmitterNV50::emitPFETCH(const Instruction *i)
 }
 
 void
-nv50_interpApply(const FixupEntry *entry, uint32_t *code, const FixupData& data)
-{
-   int ipa = entry->ipa;
-   int encSize = entry->reg;
-   int loc = entry->loc;
-
-   if ((ipa & NV50_IR_INTERP_SAMPLE_MASK) == NV50_IR_INTERP_DEFAULT &&
-       (ipa & NV50_IR_INTERP_MODE_MASK) != NV50_IR_INTERP_FLAT) {
-      if (data.force_persample_interp) {
-         if (encSize == 8)
-            code[loc + 1] |= 1 << 16;
-         else
-            code[loc + 0] |= 1 << 24;
-      } else {
-         if (encSize == 8)
-            code[loc + 1] &= ~(1 << 16);
-         else
-            code[loc + 0] &= ~(1 << 24);
-      }
-   }
-}
-
-void
 CodeEmitterNV50::emitINTERP(const Instruction *i)
 {
    code[0] = 0x80000000;
 
    defId(i->def(0), 2);
    srcAddr8(i->src(0), 16);
-   setAReg16(i, 0);
 
    if (i->encSize != 8 && i->getInterpMode() == NV50_IR_INTERP_FLAT) {
       code[0] |= 1 << 8;
@@ -940,8 +904,6 @@ CodeEmitterNV50::emitINTERP(const Instruction *i)
       code[0] |= 1;
       emitFlagsRd(i);
    }
-
-   addInterp(i->ipa, i->encSize, nv50_interpApply);
 }
 
 void
@@ -966,13 +928,11 @@ CodeEmitterNV50::emitMINMAX(const Instruction *i)
          assert(0);
          break;
       }
+      code[1] |= i->src(0).mod.abs() << 20;
+      code[1] |= i->src(0).mod.neg() << 26;
+      code[1] |= i->src(1).mod.abs() << 19;
+      code[1] |= i->src(1).mod.neg() << 27;
    }
-
-   code[1] |= i->src(0).mod.abs() << 20;
-   code[1] |= i->src(0).mod.neg() << 26;
-   code[1] |= i->src(1).mod.abs() << 19;
-   code[1] |= i->src(1).mod.neg() << 27;
-
    emitForm_MAD(i);
 }
 
@@ -1008,26 +968,6 @@ CodeEmitterNV50::emitFMAD(const Instruction *i)
 }
 
 void
-CodeEmitterNV50::emitDMAD(const Instruction *i)
-{
-   const int neg_mul = i->src(0).mod.neg() ^ i->src(1).mod.neg();
-   const int neg_add = i->src(2).mod.neg();
-
-   assert(i->encSize == 8);
-   assert(!i->saturate);
-
-   code[1] = 0x40000000;
-   code[0] = 0xe0000000;
-
-   code[1] |= neg_mul << 26;
-   code[1] |= neg_add << 27;
-
-   roundMode_MAD(i);
-
-   emitForm_MAD(i);
-}
-
-void
 CodeEmitterNV50::emitFADD(const Instruction *i)
 {
    const int neg0 = i->src(0).mod.neg();
@@ -1062,42 +1002,22 @@ CodeEmitterNV50::emitFADD(const Instruction *i)
 }
 
 void
-CodeEmitterNV50::emitDADD(const Instruction *i)
-{
-   const int neg0 = i->src(0).mod.neg();
-   const int neg1 = i->src(1).mod.neg() ^ ((i->op == OP_SUB) ? 1 : 0);
-
-   assert(!(i->src(0).mod | i->src(1).mod).abs());
-   assert(!i->saturate);
-   assert(i->encSize == 8);
-
-   code[1] = 0x60000000;
-   code[0] = 0xe0000000;
-
-   emitForm_ADD(i);
-
-   code[1] |= neg0 << 26;
-   code[1] |= neg1 << 27;
-}
-
-void
 CodeEmitterNV50::emitUADD(const Instruction *i)
 {
    const int neg0 = i->src(0).mod.neg();
    const int neg1 = i->src(1).mod.neg() ^ ((i->op == OP_SUB) ? 1 : 0);
 
-   code[0] = 0x20000000;
+   code[0] = 0x20008000;
 
    if (i->src(1).getFile() == FILE_IMMEDIATE) {
-      code[0] |= (typeSizeof(i->dType) == 2) ? 0 : 0x00008000;
       code[1] = 0;
       emitForm_IMM(i);
    } else
    if (i->encSize == 8) {
+      code[0] = 0x20000000;
       code[1] = (typeSizeof(i->dType) == 2) ? 0 : 0x04000000;
       emitForm_ADD(i);
    } else {
-      code[0] |= (typeSizeof(i->dType) == 2) ? 0 : 0x00008000;
       emitForm_MUL(i);
    }
    assert(!(neg0 && neg1));
@@ -1133,12 +1053,6 @@ CodeEmitterNV50::emitIMUL(const Instruction *i)
 {
    code[0] = 0x40000000;
 
-   if (i->src(1).getFile() == FILE_IMMEDIATE) {
-      if (i->sType == TYPE_S16)
-         code[0] |= 0x8100;
-      code[1] = 0;
-      emitForm_IMM(i);
-   } else
    if (i->encSize == 8) {
       code[1] = (i->sType == TYPE_S16) ? (0x8000 | 0x4000) : 0x0000;
       emitForm_MAD(i);
@@ -1181,66 +1095,28 @@ CodeEmitterNV50::emitFMUL(const Instruction *i)
 }
 
 void
-CodeEmitterNV50::emitDMUL(const Instruction *i)
-{
-   const int neg = (i->src(0).mod ^ i->src(1).mod).neg();
-
-   assert(!i->saturate);
-   assert(i->encSize == 8);
-
-   code[1] = 0x80000000;
-   code[0] = 0xe0000000;
-
-   if (neg)
-      code[1] |= 0x08000000;
-
-   roundMode_CVT(i->rnd);
-
-   emitForm_MAD(i);
-}
-
-void
 CodeEmitterNV50::emitIMAD(const Instruction *i)
 {
-   int mode;
    code[0] = 0x60000000;
-
-   assert(!i->src(0).mod && !i->src(1).mod && !i->src(2).mod);
-   if (!isSignedType(i->sType))
-      mode = 0;
-   else if (i->saturate)
-      mode = 2;
+   if (isSignedType(i->sType))
+      code[1] = i->saturate ? 0x40000000 : 0x20000000;
    else
-      mode = 1;
+      code[1] = 0x00000000;
 
-   if (i->src(1).getFile() == FILE_IMMEDIATE) {
-      code[1] = 0;
-      emitForm_IMM(i);
-      code[0] |= (mode & 1) << 8 | (mode & 2) << 14;
-      if (i->flagsSrc >= 0) {
-         assert(!(code[0] & 0x10400000));
-         assert(SDATA(i->src(i->flagsSrc)).id == 0);
-         code[0] |= 0x10400000;
-      }
-   } else
-   if (i->encSize == 4) {
-      emitForm_MUL(i);
-      code[0] |= (mode & 1) << 8 | (mode & 2) << 14;
-      if (i->flagsSrc >= 0) {
-         assert(!(code[0] & 0x10400000));
-         assert(SDATA(i->src(i->flagsSrc)).id == 0);
-         code[0] |= 0x10400000;
-      }
-   } else {
-      code[1] = mode << 29;
-      emitForm_MAD(i);
+   int neg1 = i->src(0).mod.neg() ^ i->src(1).mod.neg();
+   int neg2 = i->src(2).mod.neg();
 
-      if (i->flagsSrc >= 0) {
-         // add with carry from $cX
-         assert(!(code[1] & 0x0c000000) && !i->getPredicate());
-         code[1] |= 0xc << 24;
-         srcId(i->src(i->flagsSrc), 32 + 12);
-      }
+   assert(!(neg1 & neg2));
+   code[1] |= neg1 << 27;
+   code[1] |= neg2 << 26;
+
+   emitForm_MAD(i);
+
+   if (i->flagsSrc >= 0) {
+      // add with carry from $cX
+      assert(!(code[1] & 0x0c000000) && !i->getPredicate());
+      code[1] |= 0xc << 24;
+      srcId(i->src(i->flagsSrc), 32 + 12);
    }
 }
 
@@ -1273,39 +1149,15 @@ CodeEmitterNV50::emitISAD(const Instruction *i)
    }
 }
 
-static void
-alphatestSet(const FixupEntry *entry, uint32_t *code, const FixupData& data)
-{
-   int loc = entry->loc;
-   int enc;
-
-   switch (data.alphatest) {
-   case PIPE_FUNC_NEVER: enc = 0x0; break;
-   case PIPE_FUNC_LESS: enc = 0x1; break;
-   case PIPE_FUNC_EQUAL: enc = 0x2; break;
-   case PIPE_FUNC_LEQUAL: enc = 0x3; break;
-   case PIPE_FUNC_GREATER: enc = 0x4; break;
-   case PIPE_FUNC_NOTEQUAL: enc = 0x5; break;
-   case PIPE_FUNC_GEQUAL: enc = 0x6; break;
-   default:
-   case PIPE_FUNC_ALWAYS: enc = 0xf; break;
-   }
-
-   code[loc + 1] &= ~(0x1f << 14);
-   code[loc + 1] |= enc << 14;
-}
-
 void
 CodeEmitterNV50::emitSET(const Instruction *i)
 {
    code[0] = 0x30000000;
    code[1] = 0x60000000;
 
+   emitCondCode(i->asCmp()->setCond, i->sType, 32 + 14);
+
    switch (i->sType) {
-   case TYPE_F64:
-      code[0] = 0xe0000000;
-      code[1] = 0xe0000000;
-      break;
    case TYPE_F32: code[0] |= 0x80000000; break;
    case TYPE_S32: code[1] |= 0x0c000000; break;
    case TYPE_U32: code[1] |= 0x04000000; break;
@@ -1315,19 +1167,12 @@ CodeEmitterNV50::emitSET(const Instruction *i)
       assert(0);
       break;
    }
-
-   emitCondCode(i->asCmp()->setCond, i->sType, 32 + 14);
-
    if (i->src(0).mod.neg()) code[1] |= 0x04000000;
    if (i->src(1).mod.neg()) code[1] |= 0x08000000;
    if (i->src(0).mod.abs()) code[1] |= 0x00100000;
    if (i->src(1).mod.abs()) code[1] |= 0x00080000;
 
    emitForm_MAD(i);
-
-   if (i->subOp == 1) {
-      addInterp(0, 0, alphatestSet);
-   }
 }
 
 void
@@ -1412,9 +1257,6 @@ CodeEmitterNV50::emitCVT(const Instruction *i)
       case TYPE_U32: code[1] = 0x44004000; break;
       case TYPE_F16: code[1] = 0xc4000000; break;
       case TYPE_U16: code[1] = 0x44000000; break;
-      case TYPE_S16: code[1] = 0x44010000; break;
-      case TYPE_S8:  code[1] = 0x44018000; break;
-      case TYPE_U8:  code[1] = 0x44008000; break;
       default:
          assert(0);
          break;
@@ -1452,73 +1294,10 @@ CodeEmitterNV50::emitCVT(const Instruction *i)
          break;
       }
       break;
-   case TYPE_F16:
-      switch (i->sType) {
-      case TYPE_F16: code[1] = 0xc0000000; break;
-      case TYPE_F32: code[1] = 0xc0004000; break;
-      default:
-         assert(0);
-         break;
-      }
-      break;
    case TYPE_S16:
-      switch (i->sType) {
-      case TYPE_F32: code[1] = 0x88004000; break;
-      case TYPE_S32: code[1] = 0x08014000; break;
-      case TYPE_U32: code[1] = 0x08004000; break;
-      case TYPE_F16: code[1] = 0x88000000; break;
-      case TYPE_S16: code[1] = 0x08010000; break;
-      case TYPE_U16: code[1] = 0x08000000; break;
-      case TYPE_S8:  code[1] = 0x08018000; break;
-      case TYPE_U8:  code[1] = 0x08008000; break;
-      default:
-         assert(0);
-         break;
-      }
-      break;
    case TYPE_U16:
-      switch (i->sType) {
-      case TYPE_F32: code[1] = 0x80004000; break;
-      case TYPE_S32: code[1] = 0x00014000; break;
-      case TYPE_U32: code[1] = 0x00004000; break;
-      case TYPE_F16: code[1] = 0x80000000; break;
-      case TYPE_S16: code[1] = 0x00010000; break;
-      case TYPE_U16: code[1] = 0x00000000; break;
-      case TYPE_S8:  code[1] = 0x00018000; break;
-      case TYPE_U8:  code[1] = 0x00008000; break;
-      default:
-         assert(0);
-         break;
-      }
-      break;
    case TYPE_S8:
-      switch (i->sType) {
-      case TYPE_S32: code[1] = 0x08094000; break;
-      case TYPE_U32: code[1] = 0x08084000; break;
-      case TYPE_F16: code[1] = 0x88080000; break;
-      case TYPE_S16: code[1] = 0x08090000; break;
-      case TYPE_U16: code[1] = 0x08080000; break;
-      case TYPE_S8:  code[1] = 0x08098000; break;
-      case TYPE_U8:  code[1] = 0x08088000; break;
-      default:
-         assert(0);
-         break;
-      }
-      break;
    case TYPE_U8:
-      switch (i->sType) {
-      case TYPE_S32: code[1] = 0x00094000; break;
-      case TYPE_U32: code[1] = 0x00084000; break;
-      case TYPE_F16: code[1] = 0x80080000; break;
-      case TYPE_S16: code[1] = 0x00090000; break;
-      case TYPE_U16: code[1] = 0x00080000; break;
-      case TYPE_S8:  code[1] = 0x00098000; break;
-      case TYPE_U8:  code[1] = 0x00088000; break;
-      default:
-         assert(0);
-         break;
-      }
-      break;
    default:
       assert(0);
       break;
@@ -1564,7 +1343,6 @@ CodeEmitterNV50::emitSFnOp(const Instruction *i, uint8_t subOp)
 
    if (i->encSize == 4) {
       assert(i->op == OP_RCP);
-      assert(!i->saturate);
       code[0] |= i->src(0).mod.abs() << 15;
       code[0] |= i->src(0).mod.neg() << 22;
       emitForm_MUL(i);
@@ -1572,10 +1350,6 @@ CodeEmitterNV50::emitSFnOp(const Instruction *i, uint8_t subOp)
       code[1] = subOp << 29;
       code[1] |= i->src(0).mod.abs() << 20;
       code[1] |= i->src(0).mod.neg() << 26;
-      if (i->saturate) {
-         assert(subOp == 6 && i->op == OP_EX2);
-         code[1] |= 1 << 27;
-      }
       emitForm_MAD(i);
    }
 }
@@ -1618,15 +1392,13 @@ CodeEmitterNV50::emitLogicOp(const Instruction *i)
       emitForm_IMM(i);
    } else {
       switch (i->op) {
-      case OP_AND: code[1] = 0x00000000; break;
-      case OP_OR:  code[1] = 0x00004000; break;
-      case OP_XOR: code[1] = 0x00008000; break;
+      case OP_AND: code[1] = 0x04000000; break;
+      case OP_OR:  code[1] = 0x04004000; break;
+      case OP_XOR: code[1] = 0x04008000; break;
       default:
          assert(0);
          break;
       }
-      if (typeSizeof(i->dType) == 4)
-         code[1] |= 0x04000000;
       if (i->src(0).mod & Modifier(NV50_IR_MOD_NOT))
          code[1] |= 1 << 16;
       if (i->src(1).mod & Modifier(NV50_IR_MOD_NOT))
@@ -1657,9 +1429,7 @@ CodeEmitterNV50::emitShift(const Instruction *i)
       emitARL(i, i->getSrc(1)->reg.data.u32 & 0x3f);
    } else {
       code[0] = 0x30000001;
-      code[1] = (i->op == OP_SHR) ? 0xe0000000 : 0xc0000000;
-      if (typeSizeof(i->dType) == 4)
-         code[1] |= 0x04000000;
+      code[1] = (i->op == OP_SHR) ? 0xe4000000 : 0xc4000000;
       if (i->op == OP_SHR && isSignedType(i->sType))
           code[1] |= 1 << 27;
 
@@ -1738,9 +1508,7 @@ CodeEmitterNV50::emitTEX(const TexInstruction *i)
    code[1] |= (i->tex.mask & 0xc) << 12;
 
    if (i->tex.liveOnly)
-      code[1] |= 1 << 2;
-   if (i->tex.derivAll)
-      code[1] |= 1 << 3;
+      code[1] |= 4;
 
    defId(i->def(0), 2);
 
@@ -1901,28 +1669,19 @@ CodeEmitterNV50::emitATOM(const Instruction *i)
       return;
    }
    code[0] = 0xd0000001;
-   code[1] = 0xc0c00000 | (subOp << 2);
+   code[1] = 0xe0c00000 | (subOp << 2);
    if (isSignedType(i->dType))
       code[1] |= 1 << 21;
 
    // args
    emitFlagsRd(i);
-   if (i->subOp == NV50_IR_SUBOP_ATOM_EXCH ||
-       i->subOp == NV50_IR_SUBOP_ATOM_CAS ||
-       i->defExists(0)) {
-      code[1] |= 0x20000000;
-      setDst(i, 0);
-      setSrc(i, 1, 1);
-      // g[] pointer
-      code[0] |= i->getSrc(0)->reg.fileIndex << 23;
-   } else {
-      srcId(i->src(1), 2);
-      // g[] pointer
-      code[0] |= i->getSrc(0)->reg.fileIndex << 16;
-   }
+   setDst(i, 0);
+   setSrc(i, 1, 1);
    if (i->subOp == NV50_IR_SUBOP_ATOM_CAS)
       setSrc(i, 2, 2);
 
+   // g[] pointer
+   code[0] |= i->getSrc(0)->reg.fileIndex << 23;
    srcId(i->getIndirect(0, 0), 9);
 }
 
@@ -1971,9 +1730,7 @@ CodeEmitterNV50::emitInstruction(Instruction *insn)
       break;
    case OP_ADD:
    case OP_SUB:
-      if (insn->dType == TYPE_F64)
-         emitDADD(insn);
-      else if (isFloatType(insn->dType))
+      if (isFloatType(insn->dType))
          emitFADD(insn);
       else if (insn->getDef(0)->reg.file == FILE_ADDRESS)
          emitAADD(insn);
@@ -1981,18 +1738,14 @@ CodeEmitterNV50::emitInstruction(Instruction *insn)
          emitUADD(insn);
       break;
    case OP_MUL:
-      if (insn->dType == TYPE_F64)
-         emitDMUL(insn);
-      else if (isFloatType(insn->dType))
+      if (isFloatType(insn->dType))
          emitFMUL(insn);
       else
          emitIMUL(insn);
       break;
    case OP_MAD:
    case OP_FMA:
-      if (insn->dType == TYPE_F64)
-         emitDMAD(insn);
-      else if (isFloatType(insn->dType))
+      if (isFloatType(insn->dType))
          emitFMAD(insn);
       else
          emitIMAD(insn);
@@ -2164,7 +1917,7 @@ CodeEmitterNV50::getMinEncodingSize(const Instruction *i) const
 {
    const Target::OpInfo &info = targ->getOpInfo(i);
 
-   if (info.minEncSize > 4 || i->dType == TYPE_F64)
+   if (info.minEncSize > 4)
       return 8;
 
    // check constraints on dst and src operands
@@ -2194,9 +1947,8 @@ CodeEmitterNV50::getMinEncodingSize(const Instruction *i) const
 
    // check constraints on short MAD
    if (info.srcNr >= 2 && i->srcExists(2)) {
-      if (!i->defExists(0) ||
-          (i->flagsSrc >= 0 && SDATA(i->src(i->flagsSrc)).id > 0) ||
-          DDATA(i->def(0)).id != SDATA(i->src(2)).id)
+      if (!i->defExists(0) || !isFloatType(i->dType) ||
+          i->def(0).rep()->reg.data.id != i->src(2).rep()->reg.data.id)
          return 8;
    }
 
@@ -2226,7 +1978,7 @@ makeInstructionLong(Instruction *insn)
    insn->encSize = 8;
 
    for (int i = fn->bbCount - 1; i >= 0 && fn->bbArray[i] != insn->bb; --i) {
-      fn->bbArray[i]->binPos += adj;
+      fn->bbArray[i]->binPos += 4;
    }
    fn->binSize += adj;
    insn->bb->binSize += adj;
@@ -2278,16 +2030,9 @@ replaceExitWithModifier(Function *func)
             return;
       }
    }
-
-   int adj = epilogue->getExit()->encSize;
-   epilogue->binSize -= adj;
-   func->binSize -= adj;
+   epilogue->binSize -= 8;
+   func->binSize -= 8;
    delete_Instruction(func->getProgram(), epilogue->getExit());
-
-   // There may be BB's that are laid out after the exit block
-   for (int i = func->bbCount - 1; i >= 0 && func->bbArray[i] != epilogue; --i) {
-      func->bbArray[i]->binPos -= adj;
-   }
 }
 
 void
@@ -2298,8 +2043,8 @@ CodeEmitterNV50::prepareEmission(Function *func)
    replaceExitWithModifier(func);
 }
 
-CodeEmitterNV50::CodeEmitterNV50(Program::Type type, const TargetNV50 *target) :
-   CodeEmitter(target), progType(type), targNV50(target)
+CodeEmitterNV50::CodeEmitterNV50(const TargetNV50 *target) :
+   CodeEmitter(target), targNV50(target)
 {
    targ = target; // specialized
    code = NULL;
@@ -2310,7 +2055,8 @@ CodeEmitterNV50::CodeEmitterNV50(Program::Type type, const TargetNV50 *target) :
 CodeEmitter *
 TargetNV50::getCodeEmitter(Program::Type type)
 {
-   CodeEmitterNV50 *emit = new CodeEmitterNV50(type, this);
+   CodeEmitterNV50 *emit = new CodeEmitterNV50(this);
+   emit->setProgramType(type);
    return emit;
 }
 
diff --git a/lib/mesa/src/gallium/drivers/nouveau/codegen/nv50_ir_graph.cpp b/lib/mesa/src/gallium/drivers/nouveau/codegen/nv50_ir_graph.cpp
index 3f9967a7b..23414d54a 100644
--- a/lib/mesa/src/gallium/drivers/nouveau/codegen/nv50_ir_graph.cpp
+++ b/lib/mesa/src/gallium/drivers/nouveau/codegen/nv50_ir_graph.cpp
@@ -77,6 +77,7 @@ const char *Graph::Edge::typeStr() const
    case FORWARD: return "forward";
    case BACK:    return "back";
    case CROSS:   return "cross";
+   case DUMMY:   return "dummy";
    case UNKNOWN:
    default:
       return "unk";
@@ -86,8 +87,7 @@ const char *Graph::Edge::typeStr() const
 Graph::Node::Node(void *priv) : data(priv),
                                 in(0), out(0), graph(0),
                                 visited(0),
-                                inCount(0), outCount(0),
-                                tag(0)
+                                inCount(0), outCount(0)
 {
    // nothing to do
 }
@@ -184,7 +184,7 @@ Graph::Node::reachableBy(const Node *node, const Node *term) const
          continue;
 
       for (EdgeIterator ei = pos->outgoing(); !ei.end(); ei.next()) {
-         if (ei.getType() == Edge::BACK)
+         if (ei.getType() == Edge::BACK || ei.getType() == Edge::DUMMY)
             continue;
          if (ei.getNode()->visit(seq))
             stack.push(ei.getNode());
@@ -287,10 +287,7 @@ private:
 
       bb.push(node);
 
-      while (bb.getSize() || cross.getSize()) {
-         if (bb.getSize() == 0)
-            cross.moveTo(bb);
-
+      while (bb.getSize()) {
          node = reinterpret_cast<Graph::Node *>(bb.pop().u.p);
          assert(node);
          if (!node->visit(sequence))
@@ -301,6 +298,7 @@ private:
             switch (ei.getType()) {
             case Graph::Edge::TREE:
             case Graph::Edge::FORWARD:
+            case Graph::Edge::DUMMY:
                if (++(ei.getNode()->tag) == ei.getNode()->incidentCountFwd())
                   bb.push(ei.getNode());
                break;
@@ -316,6 +314,9 @@ private:
             }
          }
          nodes[count++] = node;
+
+         if (bb.getSize() == 0)
+            cross.moveTo(bb);
       }
    }
 
@@ -370,6 +371,8 @@ void Graph::classifyDFS(Node *curr, int& seq)
 
    for (edge = curr->out; edge; edge = edge->next[0]) {
       node = edge->target;
+      if (edge->type == Edge::DUMMY)
+         continue;
 
       if (node->getSequence() == 0) {
          edge->type = Edge::TREE;
@@ -384,6 +387,8 @@ void Graph::classifyDFS(Node *curr, int& seq)
 
    for (edge = curr->in; edge; edge = edge->next[1]) {
       node = edge->origin;
+      if (edge->type == Edge::DUMMY)
+         continue;
 
       if (node->getSequence() == 0) {
          edge->type = Edge::TREE;
diff --git a/lib/mesa/src/gallium/drivers/nouveau/codegen/nv50_ir_graph.h b/lib/mesa/src/gallium/drivers/nouveau/codegen/nv50_ir_graph.h
index fc85e78a5..b0981ff69 100644
--- a/lib/mesa/src/gallium/drivers/nouveau/codegen/nv50_ir_graph.h
+++ b/lib/mesa/src/gallium/drivers/nouveau/codegen/nv50_ir_graph.h
@@ -47,6 +47,7 @@ public:
          FORWARD,
          BACK,
          CROSS, // e.g. loop break
+         DUMMY
       };
 
       Edge(Node *dst, Node *src, Type kind);
@@ -146,7 +147,7 @@ public:
 
 public:
    Graph();
-   virtual ~Graph(); // does *not* free the nodes (make it an option ?)
+   ~Graph(); // does *not* free the nodes (make it an option ?)
 
    inline Node *getRoot() const { return root; }
 
diff --git a/lib/mesa/src/gallium/drivers/nouveau/codegen/nv50_ir_inlines.h b/lib/mesa/src/gallium/drivers/nouveau/codegen/nv50_ir_inlines.h
index 749e6b40b..e465f2484 100644
--- a/lib/mesa/src/gallium/drivers/nouveau/codegen/nv50_ir_inlines.h
+++ b/lib/mesa/src/gallium/drivers/nouveau/codegen/nv50_ir_inlines.h
@@ -48,7 +48,7 @@ static inline bool isTextureOp(operation op)
 
 static inline bool isSurfaceOp(operation op)
 {
-   return (op >= OP_SULDB && op <= OP_SULEA) || (op == OP_SUQ);
+   return (op >= OP_SULDB && op <= OP_SULEA);
 }
 
 static inline unsigned int typeSizeof(DataType ty)
@@ -126,7 +126,7 @@ static inline bool isFloatType(DataType ty)
 
 static inline bool isSignedIntType(DataType ty)
 {
-   return (ty == TYPE_S8 || ty == TYPE_S16 || ty == TYPE_S32 || ty == TYPE_S64);
+   return (ty == TYPE_S8 || ty == TYPE_S16 || ty == TYPE_S32);
 }
 
 static inline bool isSignedType(DataType ty)
@@ -136,7 +136,6 @@ static inline bool isSignedType(DataType ty)
    case TYPE_U8:
    case TYPE_U16:
    case TYPE_U32:
-   case TYPE_U64:
    case TYPE_B96:
    case TYPE_B128:
       return false;
@@ -148,7 +147,6 @@ static inline bool isSignedType(DataType ty)
 static inline DataType intTypeToSigned(DataType ty)
 {
    switch (ty) {
-   case TYPE_U64: return TYPE_S64;
    case TYPE_U32: return TYPE_S32;
    case TYPE_U16: return TYPE_S16;
    case TYPE_U8: return TYPE_S8;
@@ -222,7 +220,7 @@ Instruction *Value::getUniqueInsn() const
             return (*it)->getInsn();
       // should be unreachable and trigger assertion at the end
    }
-#ifndef NDEBUG
+#ifdef DEBUG
    if (reg.data.id < 0) {
       int n = 0;
       for (DefCIterator it = defs.begin(); n < 2 && it != defs.end(); ++it)
@@ -311,14 +309,14 @@ const FlowInstruction *Instruction::asFlow() const
 
 TexInstruction *Instruction::asTex()
 {
-   if ((op >= OP_TEX && op <= OP_SULEA) || op == OP_SUQ)
+   if (op >= OP_TEX && op <= OP_SULEA)
       return static_cast<TexInstruction *>(this);
    return NULL;
 }
 
 const TexInstruction *Instruction::asTex() const
 {
-   if ((op >= OP_TEX && op <= OP_SULEA) || op == OP_SUQ)
+   if (op >= OP_TEX && op <= OP_SULEA)
       return static_cast<const TexInstruction *>(this);
    return NULL;
 }
@@ -336,7 +334,7 @@ static inline Instruction *cloneForward(Function *ctx, Instruction *obj)
 // XXX: use a virtual function so we're really really safe ?
 LValue *Value::asLValue()
 {
-   if (reg.file >= FILE_GPR && reg.file <= LAST_REGISTER_FILE)
+   if (reg.file >= FILE_GPR && reg.file <= FILE_ADDRESS)
       return static_cast<LValue *>(this);
    return NULL;
 }
diff --git a/lib/mesa/src/gallium/drivers/nouveau/codegen/nv50_ir_lowering_nv50.cpp b/lib/mesa/src/gallium/drivers/nouveau/codegen/nv50_ir_lowering_nv50.cpp
index 2b09855b1..d87cdfff8 100644
--- a/lib/mesa/src/gallium/drivers/nouveau/codegen/nv50_ir_lowering_nv50.cpp
+++ b/lib/mesa/src/gallium/drivers/nouveau/codegen/nv50_ir_lowering_nv50.cpp
@@ -25,24 +25,6 @@
 
 #include "codegen/nv50_ir_target_nv50.h"
 
-#define NV50_SU_INFO_SIZE_X   0x00
-#define NV50_SU_INFO_SIZE_Y   0x04
-#define NV50_SU_INFO_SIZE_Z   0x08
-#define NV50_SU_INFO_BSIZE    0x0c
-#define NV50_SU_INFO_STRIDE_Y 0x10
-#define NV50_SU_INFO_MS_X     0x18
-#define NV50_SU_INFO_MS_Y     0x1c
-#define NV50_SU_INFO_TILE_SHIFT_X 0x20
-#define NV50_SU_INFO_TILE_SHIFT_Y 0x24
-#define NV50_SU_INFO_TILE_SHIFT_Z 0x28
-#define NV50_SU_INFO_OFFSET_Z 0x2c
-
-#define NV50_SU_INFO__STRIDE 0x30
-
-#define NV50_SU_INFO_SIZE(i) (0x00 + (i) * 4)
-#define NV50_SU_INFO_MS(i)   (0x18 + (i) * 4)
-#define NV50_SU_INFO_TILE_SHIFT(i) (0x20 + (i) * 4)
-
 namespace nv50_ir {
 
 // nv50 doesn't support 32 bit integer multiplication
@@ -62,8 +44,6 @@ static bool
 expandIntegerMUL(BuildUtil *bld, Instruction *mul)
 {
    const bool highResult = mul->subOp == NV50_IR_SUBOP_MUL_HIGH;
-   ImmediateValue src1;
-   bool src1imm = mul->src(1).getImmediate(src1);
 
    DataType fTy; // full type
    switch (mul->sType) {
@@ -92,41 +72,24 @@ expandIntegerMUL(BuildUtil *bld, Instruction *mul)
    for (int j = 0; j < 4; ++j)
       t[j] = bld->getSSA(fullSize);
 
-   if (isSignedType(mul->sType) && highResult) {
+   s[0] = mul->getSrc(0);
+   s[1] = mul->getSrc(1);
+
+   if (isSignedType(mul->sType)) {
       s[0] = bld->getSSA(fullSize);
       s[1] = bld->getSSA(fullSize);
       bld->mkOp1(OP_ABS, mul->sType, s[0], mul->getSrc(0));
       bld->mkOp1(OP_ABS, mul->sType, s[1], mul->getSrc(1));
-      src1.reg.data.s32 = abs(src1.reg.data.s32);
-   } else {
-      s[0] = mul->getSrc(0);
-      s[1] = mul->getSrc(1);
    }
 
    // split sources into halves
    i[0] = bld->mkSplit(a, halfSize, s[0]);
    i[1] = bld->mkSplit(b, halfSize, s[1]);
 
-   if (src1imm && (src1.reg.data.u32 & 0xffff0000) == 0) {
-      i[2] = i[3] = bld->mkOp2(OP_MUL, fTy, t[1], a[1],
-                               bld->mkImm(src1.reg.data.u32 & 0xffff));
-   } else {
-      i[2] = bld->mkOp2(OP_MUL, fTy, t[0], a[0],
-                        src1imm ? bld->mkImm(src1.reg.data.u32 >> 16) : b[1]);
-      if (src1imm && (src1.reg.data.u32 & 0x0000ffff) == 0) {
-         i[3] = i[2];
-         t[1] = t[0];
-      } else {
-         i[3] = bld->mkOp3(OP_MAD, fTy, t[1], a[1], b[0], t[0]);
-      }
-   }
+   i[2] = bld->mkOp2(OP_MUL, fTy, t[0], a[0], b[1]);
+   i[3] = bld->mkOp3(OP_MAD, fTy, t[1], a[1], b[0], t[0]);
    i[7] = bld->mkOp2(OP_SHL, fTy, t[2], t[1], bld->mkImm(halfSize * 8));
-   if (src1imm && (src1.reg.data.u32 & 0x0000ffff) == 0) {
-      i[4] = i[3];
-      t[3] = t[2];
-   } else {
-      i[4] = bld->mkOp3(OP_MAD, fTy, t[3], a[0], b[0], t[2]);
-   }
+   i[4] = bld->mkOp3(OP_MAD, fTy, t[3], a[0], b[0], t[2]);
 
    if (highResult) {
       Value *c[2];
@@ -223,9 +186,6 @@ expandIntegerMUL(BuildUtil *bld, Instruction *mul)
 
 class NV50LegalizePostRA : public Pass
 {
-public:
-   NV50LegalizePostRA() : r63(NULL) { }
-
 private:
    virtual bool visit(Function *);
    virtual bool visit(BasicBlock *);
@@ -233,8 +193,6 @@ private:
    void handlePRERET(FlowInstruction *);
    void replaceZero(Instruction *);
 
-   BuildUtil bld;
-
    LValue *r63;
 };
 
@@ -244,8 +202,7 @@ NV50LegalizePostRA::visit(Function *fn)
    Program *prog = fn->getProgram();
 
    r63 = new_LValue(fn, FILE_GPR);
-   // GPR units on nv50 are in half-regs
-   if (prog->maxGPR < 126)
+   if (prog->maxGPR < 63)
       r63->reg.data.id = 63;
    else
       r63->reg.data.id = 127;
@@ -336,7 +293,8 @@ NV50LegalizePostRA::visit(BasicBlock *bb)
                next = hi;
          }
 
-         if (i->op != OP_PFETCH && i->op != OP_BAR &&
+         if (i->op != OP_MOV && i->op != OP_PFETCH &&
+             i->op != OP_BAR &&
              (!i->defExists(0) || i->def(0).getFile() != FILE_ADDRESS))
             replaceZero(i);
       }
@@ -395,8 +353,7 @@ NV50LegalizeSSA::propagateWriteToOutput(Instruction *st)
       return;
 
    for (int s = 0; di->srcExists(s); ++s)
-      if (di->src(s).getFile() == FILE_IMMEDIATE ||
-          di->src(s).getFile() == FILE_MEMORY_LOCAL)
+      if (di->src(s).getFile() == FILE_IMMEDIATE)
          return;
 
    if (prog->getType() == Program::TYPE_GEOMETRY) {
@@ -646,13 +603,6 @@ private:
    bool handlePFETCH(Instruction *);
    bool handleEXPORT(Instruction *);
    bool handleLOAD(Instruction *);
-   bool handleLDST(Instruction *);
-   bool handleMEMBAR(Instruction *);
-   bool handleSharedATOM(Instruction *);
-   bool handleSULDP(TexInstruction *);
-   bool handleSUREDP(TexInstruction *);
-   bool handleSUSTP(TexInstruction *);
-   Value *processSurfaceCoords(TexInstruction *);
 
    bool handleDIV(Instruction *);
    bool handleSQRT(Instruction *);
@@ -667,9 +617,6 @@ private:
    bool handleTXL(TexInstruction *); // hate
    bool handleTXD(TexInstruction *); // these 3
    bool handleTXLQ(TexInstruction *);
-   bool handleTXQ(TexInstruction *);
-   bool handleSUQ(TexInstruction *);
-   bool handleBUFQ(Instruction *);
 
    bool handleCALL(Instruction *);
    bool handlePRECONT(Instruction *);
@@ -678,8 +625,6 @@ private:
    void checkPredicate(Instruction *);
    void loadTexMsInfo(uint32_t off, Value **ms, Value **ms_x, Value **ms_y);
    void loadMsInfo(Value *ms, Value *s, Value **dx, Value **dy);
-   Value *loadSuInfo(int slot, uint32_t off);
-   Value *loadSuInfo16(int slot, uint32_t off);
 
 private:
    const Target *const targ;
@@ -717,14 +662,12 @@ void NV50LoweringPreSSA::loadTexMsInfo(uint32_t off, Value **ms,
                                        Value **ms_x, Value **ms_y) {
    // This loads the texture-indexed ms setting from the constant buffer
    Value *tmp = new_LValue(func, FILE_GPR);
-   uint8_t b = prog->driver->io.auxCBSlot;
+   uint8_t b = prog->driver->io.resInfoCBSlot;
    off += prog->driver->io.suInfoBase;
    if (prog->getType() > Program::TYPE_VERTEX)
       off += 16 * 2 * 4;
    if (prog->getType() > Program::TYPE_GEOMETRY)
       off += 16 * 2 * 4;
-   if (prog->getType() > Program::TYPE_FRAGMENT)
-      off += 16 * 2 * 4;
    *ms_x = bld.mkLoadv(TYPE_U32, bld.mkSymbol(
                              FILE_MEMORY_CONST, b, TYPE_U32, off + 0), NULL);
    *ms_y = bld.mkLoadv(TYPE_U32, bld.mkSymbol(
@@ -754,24 +697,6 @@ void NV50LoweringPreSSA::loadMsInfo(Value *ms, Value *s, Value **dx, Value **dy)
                            prog->driver->io.msInfoBase + 4), off);
 }
 
-Value *
-NV50LoweringPreSSA::loadSuInfo(int slot, uint32_t off)
-{
-   uint8_t b = prog->driver->io.auxCBSlot;
-   off += prog->driver->io.bufInfoBase + slot * NV50_SU_INFO__STRIDE;
-   return bld.mkLoadv(TYPE_U32, bld.mkSymbol(
-                            FILE_MEMORY_CONST, b, TYPE_U32, off), NULL);
-}
-
-Value *
-NV50LoweringPreSSA::loadSuInfo16(int slot, uint32_t off)
-{
-   uint8_t b = prog->driver->io.auxCBSlot;
-   off += prog->driver->io.bufInfoBase + slot * NV50_SU_INFO__STRIDE;
-   return bld.mkLoadv(TYPE_U16, bld.mkSymbol(
-                            FILE_MEMORY_CONST, b, TYPE_U16, off), NULL);
-}
-
 bool
 NV50LoweringPreSSA::handleTEX(TexInstruction *i)
 {
@@ -779,23 +704,6 @@ NV50LoweringPreSSA::handleTEX(TexInstruction *i)
    const int dref = arg;
    const int lod = i->tex.target.isShadow() ? (arg + 1) : arg;
 
-   /* Only normalize in the non-explicit derivatives case.
-    */
-   if (i->tex.target.isCube() && i->op != OP_TXD) {
-      Value *src[3], *val;
-      int c;
-      for (c = 0; c < 3; ++c)
-         src[c] = bld.mkOp1v(OP_ABS, TYPE_F32, bld.getSSA(), i->getSrc(c));
-      val = bld.getScratch();
-      bld.mkOp2(OP_MAX, TYPE_F32, val, src[0], src[1]);
-      bld.mkOp2(OP_MAX, TYPE_F32, val, src[2], val);
-      bld.mkOp1(OP_RCP, TYPE_F32, val, val);
-      for (c = 0; c < 3; ++c) {
-         i->setSrc(c, bld.mkOp2v(OP_MUL, TYPE_F32, bld.getSSA(),
-                                 i->getSrc(c), val));
-      }
-   }
-
    // handle MS, which means looking up the MS params for this texture, and
    // adjusting the input coordinates to point at the right sample.
    if (i->tex.target.isMS()) {
@@ -923,7 +831,7 @@ NV50LoweringPreSSA::handleTXB(TexInstruction *i)
    }
    Value *flags = bld.getScratch(1, FILE_FLAGS);
    bld.setPosition(cond, true);
-   bld.mkCvt(OP_CVT, TYPE_U8, flags, TYPE_U32, cond->getDef(0))->flagsDef = 0;
+   bld.mkCvt(OP_CVT, TYPE_U8, flags, TYPE_U32, cond->getDef(0));
 
    Instruction *tex[4];
    for (l = 0; l < 4; ++l) {
@@ -1002,18 +910,16 @@ NV50LoweringPreSSA::handleTXD(TexInstruction *i)
    Instruction *tex;
    Value *zero = bld.loadImm(bld.getSSA(), 0);
    int l, c;
-   const int dim = i->tex.target.getDim() + i->tex.target.isCube();
+   const int dim = i->tex.target.getDim();
 
    handleTEX(i);
    i->op = OP_TEX; // no need to clone dPdx/dPdy later
-   i->tex.derivAll = true;
 
    for (c = 0; c < dim; ++c)
       crd[c] = bld.getScratch();
 
    bld.mkOp(OP_QUADON, TYPE_NONE, NULL);
    for (l = 0; l < 4; ++l) {
-      Value *src[3], *val;
       // mov coordinates from lane l to all lanes
       for (c = 0; c < dim; ++c)
          bld.mkQuadop(0x00, crd[c], l, i->getSrc(c), zero);
@@ -1023,24 +929,10 @@ NV50LoweringPreSSA::handleTXD(TexInstruction *i)
       // add dPdy from lane l to lanes dy
       for (c = 0; c < dim; ++c)
          bld.mkQuadop(qOps[l][1], crd[c], l, i->dPdy[c].get(), crd[c]);
-      // normalize cube coordinates if necessary
-      if (i->tex.target.isCube()) {
-         for (c = 0; c < 3; ++c)
-            src[c] = bld.mkOp1v(OP_ABS, TYPE_F32, bld.getSSA(), crd[c]);
-         val = bld.getScratch();
-         bld.mkOp2(OP_MAX, TYPE_F32, val, src[0], src[1]);
-         bld.mkOp2(OP_MAX, TYPE_F32, val, src[2], val);
-         bld.mkOp1(OP_RCP, TYPE_F32, val, val);
-         for (c = 0; c < 3; ++c)
-            src[c] = bld.mkOp2v(OP_MUL, TYPE_F32, bld.getSSA(), crd[c], val);
-      } else {
-         for (c = 0; c < dim; ++c)
-            src[c] = crd[c];
-      }
       // texture
       bld.insert(tex = cloneForward(func, i));
       for (c = 0; c < dim; ++c)
-         tex->setSrc(c, src[c]);
+         tex->setSrc(c, crd[c]);
       // save results
       for (c = 0; i->defExists(c); ++c) {
          Instruction *mov;
@@ -1083,87 +975,6 @@ NV50LoweringPreSSA::handleTXLQ(TexInstruction *i)
 }
 
 bool
-NV50LoweringPreSSA::handleTXQ(TexInstruction *i)
-{
-   Value *ms, *ms_x, *ms_y;
-   if (i->tex.query == TXQ_DIMS) {
-      if (i->tex.target.isMS()) {
-         bld.setPosition(i, true);
-         loadTexMsInfo(i->tex.r * 4 * 2, &ms, &ms_x, &ms_y);
-         int d = 0;
-         if (i->tex.mask & 1) {
-            bld.mkOp2(OP_SHR, TYPE_U32, i->getDef(d), i->getDef(d), ms_x);
-            d++;
-         }
-         if (i->tex.mask & 2) {
-            bld.mkOp2(OP_SHR, TYPE_U32, i->getDef(d), i->getDef(d), ms_y);
-            d++;
-         }
-      }
-      return true;
-   }
-   assert(i->tex.query == TXQ_TYPE);
-   assert(i->tex.mask == 4);
-
-   loadTexMsInfo(i->tex.r * 4 * 2, &ms, &ms_x, &ms_y);
-   bld.mkOp2(OP_SHL, TYPE_U32, i->getDef(0), bld.loadImm(NULL, 1), ms);
-   i->bb->remove(i);
-
-   return true;
-}
-
-bool
-NV50LoweringPreSSA::handleSUQ(TexInstruction *suq)
-{
-   const int dim = suq->tex.target.getDim();
-   const int arg = dim + (suq->tex.target.isArray() || suq->tex.target.isCube());
-   int mask = suq->tex.mask;
-   int slot = suq->tex.r;
-   int c, d;
-
-   for (c = 0, d = 0; c < 3; ++c, mask >>= 1) {
-      if (c >= arg || !(mask & 1))
-         continue;
-
-      int offset;
-
-      if (c == 1 && suq->tex.target == TEX_TARGET_1D_ARRAY) {
-         offset = NV50_SU_INFO_SIZE(2);
-      } else {
-         offset = NV50_SU_INFO_SIZE(c);
-      }
-      bld.mkMov(suq->getDef(d++), loadSuInfo(slot, offset));
-      if (c == 2 && suq->tex.target.isCube())
-         bld.mkOp2(OP_DIV, TYPE_U32, suq->getDef(d - 1), suq->getDef(d - 1),
-                   bld.loadImm(NULL, 6));
-   }
-
-   if (mask & 1) {
-      if (suq->tex.target.isMS()) {
-         Value *ms_x = loadSuInfo(slot, NV50_SU_INFO_MS(0));
-         Value *ms_y = loadSuInfo(slot, NV50_SU_INFO_MS(1));
-         Value *ms = bld.mkOp2v(OP_ADD, TYPE_U32, bld.getScratch(), ms_x, ms_y);
-         bld.mkOp2(OP_SHL, TYPE_U32, suq->getDef(d++), bld.loadImm(NULL, 1), ms);
-      } else {
-         bld.mkMov(suq->getDef(d++), bld.loadImm(NULL, 1));
-      }
-   }
-
-   bld.remove(suq);
-   return true;
-}
-
-bool
-NV50LoweringPreSSA::handleBUFQ(Instruction *bufq)
-{
-   bufq->op = OP_MOV;
-   bufq->setSrc(0, loadSuInfo(bufq->getSrc(0)->reg.fileIndex, NV50_SU_INFO_SIZE_X));
-   bufq->setIndirect(0, 0, NULL);
-   bufq->setIndirect(0, 1, NULL);
-   return true;
-}
-
-bool
 NV50LoweringPreSSA::handleSET(Instruction *i)
 {
    if (i->dType == TYPE_F32) {
@@ -1294,13 +1105,19 @@ NV50LoweringPreSSA::handleRDSV(Instruction *i)
       break;
    case SV_NCTAID:
    case SV_CTAID:
-   case SV_NTID: {
-      Value *x = bld.getSSA(2);
-      bld.mkOp1(OP_LOAD, TYPE_U16, x,
-                bld.mkSymbol(FILE_MEMORY_SHARED, 0, TYPE_U16, addr));
-      bld.mkCvt(OP_CVT, TYPE_U32, def, TYPE_U16, x);
+   case SV_NTID:
+      if ((sv == SV_NCTAID && idx >= 2) ||
+          (sv == SV_NTID && idx >= 3)) {
+         bld.mkMov(def, bld.mkImm(1));
+      } else if (sv == SV_CTAID && idx >= 2) {
+         bld.mkMov(def, bld.mkImm(0));
+      } else {
+         Value *x = bld.getSSA(2);
+         bld.mkOp1(OP_LOAD, TYPE_U16, x,
+                   bld.mkSymbol(FILE_MEMORY_SHARED, 0, TYPE_U16, addr));
+         bld.mkCvt(OP_CVT, TYPE_U32, def, TYPE_U16, x);
+      }
       break;
-   }
    case SV_TID:
       if (idx == 0) {
          bld.mkOp2(OP_AND, TYPE_U32, def, tid, bld.mkImm(0x0000ffff));
@@ -1313,9 +1130,6 @@ NV50LoweringPreSSA::handleRDSV(Instruction *i)
          bld.mkMov(def, bld.mkImm(0));
       }
       break;
-   case SV_COMBINED_TID:
-      bld.mkMov(def, tid);
-      break;
    case SV_SAMPLE_POS: {
       Value *off = new_LValue(func, FILE_ADDRESS);
       bld.mkOp1(OP_RDSV, TYPE_U32, def, bld.mkSysVal(SV_SAMPLE_INDEX, 0));
@@ -1323,16 +1137,11 @@ NV50LoweringPreSSA::handleRDSV(Instruction *i)
       bld.mkLoad(TYPE_F32,
                  def,
                  bld.mkSymbol(
-                       FILE_MEMORY_CONST, prog->driver->io.auxCBSlot,
+                       FILE_MEMORY_CONST, prog->driver->io.resInfoCBSlot,
                        TYPE_U32, prog->driver->io.sampleInfoBase + 4 * idx),
                  off);
       break;
    }
-   case SV_THREAD_KILL:
-      // Not actually supported. But it's implementation-dependent, so we can
-      // always just say it's not a helper.
-      bld.mkMov(def, bld.loadImm(NULL, 0));
-      break;
    default:
       bld.mkFetch(i->getDef(0), i->dType,
                   FILE_SHADER_INPUT, addr, i->getIndirect(0, 0), NULL);
@@ -1357,9 +1166,10 @@ NV50LoweringPreSSA::handleDIV(Instruction *i)
 bool
 NV50LoweringPreSSA::handleSQRT(Instruction *i)
 {
-   bld.setPosition(i, true);
-   i->op = OP_RSQ;
-   bld.mkOp1(OP_RCP, i->dType, i->getDef(0), i->getDef(0));
+   Instruction *rsq = bld.mkOp1(OP_RSQ, TYPE_F32,
+                                bld.getSSA(), i->getSrc(0));
+   i->op = OP_MUL;
+   i->setSrc(1, rsq->getDef(0));
 
    return true;
 }
@@ -1397,7 +1207,7 @@ NV50LoweringPreSSA::handleEXPORT(Instruction *i)
          i->setDef(0, new_LValue(func, FILE_GPR));
          i->getDef(0)->reg.data.id = id;
 
-         prog->maxGPR = MAX2(prog->maxGPR, id * 2);
+         prog->maxGPR = MAX2(prog->maxGPR, id);
       }
    }
    return true;
@@ -1412,15 +1222,6 @@ bool
 NV50LoweringPreSSA::handleLOAD(Instruction *i)
 {
    ValueRef src = i->src(0);
-   Symbol *sym = i->getSrc(0)->asSym();
-
-   if (prog->getType() == Program::TYPE_COMPUTE) {
-      if (sym->inFile(FILE_MEMORY_SHARED) ||
-          sym->inFile(FILE_MEMORY_BUFFER) ||
-          sym->inFile(FILE_MEMORY_GLOBAL)) {
-         return handleLDST(i);
-      }
-   }
 
    if (src.isIndirect(1)) {
       assert(prog->getType() == Program::TYPE_GEOMETRY);
@@ -1458,677 +1259,6 @@ NV50LoweringPreSSA::handleLOAD(Instruction *i)
 }
 
 bool
-NV50LoweringPreSSA::handleSharedATOM(Instruction *atom)
-{
-   assert(atom->src(0).getFile() == FILE_MEMORY_SHARED);
-
-   BasicBlock *currBB = atom->bb;
-   BasicBlock *tryLockBB = atom->bb->splitBefore(atom, false);
-   BasicBlock *joinBB = atom->bb->splitAfter(atom);
-   BasicBlock *setAndUnlockBB = new BasicBlock(func);
-   BasicBlock *failLockBB = new BasicBlock(func);
-
-   bld.setPosition(currBB, true);
-   assert(!currBB->joinAt);
-   currBB->joinAt = bld.mkFlow(OP_JOINAT, joinBB, CC_ALWAYS, NULL);
-
-   bld.mkFlow(OP_BRA, tryLockBB, CC_ALWAYS, NULL);
-   currBB->cfg.attach(&tryLockBB->cfg, Graph::Edge::TREE);
-
-   bld.setPosition(tryLockBB, true);
-
-   Instruction *ld =
-      bld.mkLoad(TYPE_U32, atom->getDef(0), atom->getSrc(0)->asSym(),
-                 atom->getIndirect(0, 0));
-   Value *locked = bld.getSSA(1, FILE_FLAGS);
-   if (prog->getTarget()->getChipset() >= 0xa0) {
-      ld->setFlagsDef(1, locked);
-      ld->subOp = NV50_IR_SUBOP_LOAD_LOCKED;
-   } else {
-      bld.mkMov(locked, bld.loadImm(NULL, 2))
-         ->flagsDef = 0;
-   }
-
-   bld.mkFlow(OP_BRA, setAndUnlockBB, CC_LT, locked);
-   bld.mkFlow(OP_BRA, failLockBB, CC_ALWAYS, NULL);
-   tryLockBB->cfg.attach(&failLockBB->cfg, Graph::Edge::CROSS);
-   tryLockBB->cfg.attach(&setAndUnlockBB->cfg, Graph::Edge::TREE);
-
-   tryLockBB->cfg.detach(&joinBB->cfg);
-   bld.remove(atom);
-
-   bld.setPosition(setAndUnlockBB, true);
-   Value *stVal;
-   if (atom->subOp == NV50_IR_SUBOP_ATOM_EXCH) {
-      // Read the old value, and write the new one.
-      stVal = atom->getSrc(1);
-   } else if (atom->subOp == NV50_IR_SUBOP_ATOM_CAS) {
-      CmpInstruction *set =
-         bld.mkCmp(OP_SET, CC_EQ, TYPE_U32, bld.getSSA(1, FILE_FLAGS),
-                   TYPE_U32, ld->getDef(0), atom->getSrc(1));
-
-      Instruction *selp =
-         bld.mkOp3(OP_SELP, TYPE_U32, bld.getSSA(), atom->getSrc(2),
-                   ld->getDef(0), set->getDef(0));
-      stVal = selp->getDef(0);
-
-      handleSELP(selp);
-   } else {
-      operation op;
-
-      switch (atom->subOp) {
-      case NV50_IR_SUBOP_ATOM_ADD:
-         op = OP_ADD;
-         break;
-      case NV50_IR_SUBOP_ATOM_AND:
-         op = OP_AND;
-         break;
-      case NV50_IR_SUBOP_ATOM_OR:
-         op = OP_OR;
-         break;
-      case NV50_IR_SUBOP_ATOM_XOR:
-         op = OP_XOR;
-         break;
-      case NV50_IR_SUBOP_ATOM_MIN:
-         op = OP_MIN;
-         break;
-      case NV50_IR_SUBOP_ATOM_MAX:
-         op = OP_MAX;
-         break;
-      default:
-         assert(0);
-         return false;
-      }
-
-      Instruction *i =
-         bld.mkOp2(op, atom->dType, bld.getSSA(), ld->getDef(0),
-                   atom->getSrc(1));
-
-      stVal = i->getDef(0);
-   }
-
-   Instruction *store = bld.mkStore(OP_STORE, TYPE_U32, atom->getSrc(0)->asSym(),
-               atom->getIndirect(0, 0), stVal);
-   if (prog->getTarget()->getChipset() >= 0xa0) {
-      store->subOp = NV50_IR_SUBOP_STORE_UNLOCKED;
-   }
-
-   bld.mkFlow(OP_BRA, failLockBB, CC_ALWAYS, NULL);
-   setAndUnlockBB->cfg.attach(&failLockBB->cfg, Graph::Edge::TREE);
-
-   // Loop until the lock is acquired.
-   bld.setPosition(failLockBB, true);
-   bld.mkFlow(OP_BRA, tryLockBB, CC_GEU, locked);
-   bld.mkFlow(OP_BRA, joinBB, CC_ALWAYS, NULL);
-   failLockBB->cfg.attach(&tryLockBB->cfg, Graph::Edge::BACK);
-   failLockBB->cfg.attach(&joinBB->cfg, Graph::Edge::TREE);
-
-   bld.setPosition(joinBB, false);
-   bld.mkFlow(OP_JOIN, NULL, CC_ALWAYS, NULL)->fixed = 1;
-
-   return true;
-}
-
-bool
-NV50LoweringPreSSA::handleLDST(Instruction *i)
-{
-   ValueRef src = i->src(0);
-   Symbol *sym = i->getSrc(0)->asSym();
-
-   if (prog->getType() != Program::TYPE_COMPUTE) {
-      return true;
-   }
-
-   // Buffers just map directly to the different global memory spaces
-   if (sym->inFile(FILE_MEMORY_BUFFER)) {
-      sym->reg.file = FILE_MEMORY_GLOBAL;
-   }
-
-   if (sym->inFile(FILE_MEMORY_SHARED)) {
-
-      if (src.isIndirect(0)) {
-         Value *addr = i->getIndirect(0, 0);
-
-         if (!addr->inFile(FILE_ADDRESS)) {
-            // Move address from GPR into an address register
-            Value *new_addr = bld.getSSA(2, FILE_ADDRESS);
-            bld.mkMov(new_addr, addr);
-
-            i->setIndirect(0, 0, new_addr);
-         }
-      }
-
-      if (i->op == OP_ATOM)
-         handleSharedATOM(i);
-   } else if (sym->inFile(FILE_MEMORY_GLOBAL)) {
-      // All global access must be indirect. There are no instruction forms
-      // with direct access.
-      Value *addr = i->getIndirect(0, 0);
-
-      Value *offset = bld.loadImm(bld.getSSA(), sym->reg.data.offset);
-      Value *sum;
-      if (addr != NULL)
-         sum = bld.mkOp2v(OP_ADD, TYPE_U32, bld.getSSA(), addr,
-                          offset);
-      else
-         sum = offset;
-
-      i->setIndirect(0, 0, sum);
-      sym->reg.data.offset = 0;
-   }
-
-   return true;
-}
-
-bool
-NV50LoweringPreSSA::handleMEMBAR(Instruction *i)
-{
-   // For global memory, apparently doing a bunch of reads at different
-   // addresses forces things to get sufficiently flushed.
-   if (i->subOp & NV50_IR_SUBOP_MEMBAR_GL) {
-      uint8_t b = prog->driver->io.auxCBSlot;
-      Value *base =
-         bld.mkLoadv(TYPE_U32, bld.mkSymbol(FILE_MEMORY_CONST, b, TYPE_U32,
-                                            prog->driver->io.membarOffset), NULL);
-      Value *physid = bld.mkOp1v(OP_RDSV, TYPE_U32, bld.getSSA(), bld.mkSysVal(SV_PHYSID, 0));
-      Value *off = bld.mkOp2v(OP_SHL, TYPE_U32, bld.getSSA(),
-                              bld.mkOp2v(OP_AND, TYPE_U32, bld.getSSA(),
-                                         physid, bld.loadImm(NULL, 0x1f)),
-                              bld.loadImm(NULL, 2));
-      base = bld.mkOp2v(OP_ADD, TYPE_U32, bld.getSSA(), base, off);
-      Symbol *gmemMembar = bld.mkSymbol(FILE_MEMORY_GLOBAL, prog->driver->io.gmemMembar, TYPE_U32, 0);
-      for (int i = 0; i < 8; i++) {
-         if (i != 0) {
-            base = bld.mkOp2v(OP_ADD, TYPE_U32, bld.getSSA(), base, bld.loadImm(NULL, 0x100));
-         }
-         bld.mkLoad(TYPE_U32, bld.getSSA(), gmemMembar, base)
-            ->fixed = 1;
-      }
-   }
-
-   // Both global and shared memory barriers also need a regular control bar
-   // TODO: double-check this is the case
-   i->op = OP_BAR;
-   i->subOp = NV50_IR_SUBOP_BAR_SYNC;
-   i->setSrc(0, bld.mkImm(0u));
-   i->setSrc(1, bld.mkImm(0u));
-
-   return true;
-}
-
-// The type that bests represents how each component can be stored when packed.
-static DataType
-getPackedType(const TexInstruction::ImgFormatDesc *t, int c)
-{
-   switch (t->type) {
-   case FLOAT: return t->bits[c] == 16 ? TYPE_F16 : TYPE_F32;
-   case UNORM: return t->bits[c] == 8 ? TYPE_U8 : TYPE_U16;
-   case SNORM: return t->bits[c] == 8 ? TYPE_S8 : TYPE_S16;
-   case UINT:
-      return (t->bits[c] == 8 ? TYPE_U8 :
-              (t->bits[c] <= 16 ? TYPE_U16 : TYPE_U32));
-   case SINT:
-      return (t->bits[c] == 8 ? TYPE_S8 :
-              (t->bits[c] <= 16 ? TYPE_S16 : TYPE_S32));
-   }
-   return TYPE_NONE;
-}
-
-// The type that the rest of the shader expects to process this image type in.
-static DataType
-getShaderType(const ImgType type) {
-   switch (type) {
-   case FLOAT:
-   case UNORM:
-   case SNORM:
-      return TYPE_F32;
-   case UINT:
-      return TYPE_U32;
-   case SINT:
-      return TYPE_S32;
-   default:
-      assert(!"Impossible type");
-      return TYPE_NONE;
-   }
-}
-
-// Reads the raw coordinates out of the input instruction, and returns a
-// single-value coordinate which is what the hardware expects to receive in a
-// ld/st op.
-Value *
-NV50LoweringPreSSA::processSurfaceCoords(TexInstruction *su)
-{
-   const int slot = su->tex.r;
-   const int dim = su->tex.target.getDim();
-   const int arg = dim + (su->tex.target.isArray() || su->tex.target.isCube());
-
-   const TexInstruction::ImgFormatDesc *format = su->tex.format;
-   const uint16_t bytes = (format->bits[0] + format->bits[1] +
-                           format->bits[2] + format->bits[3]) / 8;
-   uint16_t shift = ffs(bytes) - 1;
-
-   // Buffer sizes don't necessarily fit in 16-bit values
-   if (su->tex.target == TEX_TARGET_BUFFER) {
-      return bld.mkOp2v(OP_SHL, TYPE_U32, bld.getSSA(),
-                        su->getSrc(0), bld.loadImm(NULL, (uint32_t)shift));
-   }
-
-   // For buffers, we just need the byte offset. And for 2d buffers we want
-   // the x coordinate in bytes as well.
-   Value *coords[3] = {};
-   for (int i = 0; i < arg; i++) {
-      Value *src[2];
-      bld.mkSplit(src, 2, su->getSrc(i));
-      coords[i] = src[0];
-      // For 1d-images, we want the y coord to be 0, which it will be here.
-      if (i == 0)
-         coords[1] = src[1];
-   }
-
-   coords[0] = bld.mkOp2v(OP_SHL, TYPE_U16, bld.getSSA(2),
-                          coords[0], bld.loadImm(NULL, shift));
-
-   if (su->tex.target.isMS()) {
-      Value *ms_x = loadSuInfo16(slot, NV50_SU_INFO_MS(0));
-      Value *ms_y = loadSuInfo16(slot, NV50_SU_INFO_MS(1));
-      coords[0] = bld.mkOp2v(OP_SHL, TYPE_U16, bld.getSSA(2), coords[0], ms_x);
-      coords[1] = bld.mkOp2v(OP_SHL, TYPE_U16, bld.getSSA(2), coords[1], ms_y);
-   }
-
-   // If there are more dimensions, we just want the y-offset. But that needs
-   // to be adjusted up by the y-stride for array images.
-   if (su->tex.target.isArray() || su->tex.target.isCube()) {
-      Value *index = coords[dim];
-      Value *height = loadSuInfo16(slot, NV50_SU_INFO_STRIDE_Y);
-      Instruction *mul = bld.mkOp2(OP_MUL, TYPE_U32, bld.getSSA(4), index, height);
-      mul->sType = TYPE_U16;
-      Value *muls[2];
-      bld.mkSplit(muls, 2, mul->getDef(0));
-      if (dim > 1)
-         coords[1] = bld.mkOp2v(OP_ADD, TYPE_U16, bld.getSSA(2), coords[1], muls[0]);
-      else
-         coords[1] = muls[0];
-   }
-
-   // 3d is special-cased. Note that a single "slice" of a 3d image may
-   // also be attached as 2d, so we have to do the same 3d processing for
-   // 2d as well, just in case. In order to remap a 3d image onto a 2d
-   // image, we have to retile it "by hand".
-   if (su->tex.target == TEX_TARGET_3D || su->tex.target == TEX_TARGET_2D) {
-      Value *z = loadSuInfo16(slot, NV50_SU_INFO_OFFSET_Z);
-      Value *y_size_aligned = loadSuInfo16(slot, NV50_SU_INFO_STRIDE_Y);
-      // Add the z coordinate for actual 3d-images
-      if (dim > 2)
-         coords[2] = bld.mkOp2v(OP_ADD, TYPE_U16, bld.getSSA(2), z, coords[2]);
-      else
-         coords[2] = z;
-
-      // Compute the surface parameters from tile shifts
-      Value *tile_shift[3];
-      Value *tile_size[3];
-      Value *tile_mask[3];
-      // We only ever use one kind of X-tiling.
-      tile_shift[0] = bld.loadImm(NULL, (uint16_t)6);
-      tile_size[0] = bld.loadImm(NULL, (uint16_t)64);
-      tile_mask[0] = bld.loadImm(NULL, (uint16_t)63);
-      // Fetch the "real" tiling parameters of the underlying surface
-      for (int i = 1; i < 3; i++) {
-         tile_shift[i] = loadSuInfo16(slot, NV50_SU_INFO_TILE_SHIFT(i));
-         tile_size[i] = bld.mkOp2v(OP_SHL, TYPE_U16, bld.getSSA(2), bld.loadImm(NULL, (uint16_t)1), tile_shift[i]);
-         tile_mask[i] = bld.mkOp2v(OP_ADD, TYPE_U16, bld.getSSA(2), tile_size[i], bld.loadImm(NULL, (uint16_t)-1));
-      }
-
-      // Compute the location of given coordinate, both inside the tile as
-      // well as which (linearly-laid out) tile it's in.
-      Value *coord_in_tile[3];
-      Value *tile[3];
-      for (int i = 0; i < 3; i++) {
-         coord_in_tile[i] = bld.mkOp2v(OP_AND, TYPE_U16, bld.getSSA(2), coords[i], tile_mask[i]);
-         tile[i] = bld.mkOp2v(OP_SHR, TYPE_U16, bld.getSSA(2), coords[i], tile_shift[i]);
-      }
-
-      // Based on the "real" tiling parameters, compute x/y coordinates in the
-      // larger surface with 2d tiling that was supplied to the hardware. This
-      // was determined and verified with the help of the tiling pseudocode in
-      // the envytools docs.
-      //
-      // adj_x = x_coord_in_tile + x_tile * x_tile_size * z_tile_size +
-      //         z_coord_in_tile * x_tile_size
-      // adj_y = y_coord_in_tile + y_tile * y_tile_size +
-      //         z_tile * y_tile_size * y_tiles
-      //
-      // Note: STRIDE_Y = y_tile_size * y_tiles
-
-      coords[0] = bld.mkOp2v(
-            OP_ADD, TYPE_U16, bld.getSSA(2),
-            bld.mkOp2v(OP_ADD, TYPE_U16, bld.getSSA(2),
-                       coord_in_tile[0],
-                       bld.mkOp2v(OP_SHL, TYPE_U16, bld.getSSA(2),
-                                  tile[0],
-                                  bld.mkOp2v(OP_ADD, TYPE_U16, bld.getSSA(2),
-                                             tile_shift[2], tile_shift[0]))),
-            bld.mkOp2v(OP_SHL, TYPE_U16, bld.getSSA(2),
-                       coord_in_tile[2], tile_shift[0]));
-
-      Instruction *mul = bld.mkOp2(OP_MUL, TYPE_U32, bld.getSSA(4),
-                                   tile[2], y_size_aligned);
-      mul->sType = TYPE_U16;
-      Value *muls[2];
-      bld.mkSplit(muls, 2, mul->getDef(0));
-
-      coords[1] = bld.mkOp2v(
-            OP_ADD, TYPE_U16, bld.getSSA(2),
-            muls[0],
-            bld.mkOp2v(OP_ADD, TYPE_U16, bld.getSSA(2),
-                       coord_in_tile[1],
-                       bld.mkOp2v(OP_SHL, TYPE_U16, bld.getSSA(2),
-                                  tile[1], tile_shift[1])));
-   }
-
-   return bld.mkOp2v(OP_MERGE, TYPE_U32, bld.getSSA(), coords[0], coords[1]);
-}
-
-// This is largely a copy of NVC0LoweringPass::convertSurfaceFormat, but
-// adjusted to make use of 16-bit math where possible.
-bool
-NV50LoweringPreSSA::handleSULDP(TexInstruction *su)
-{
-   const int slot = su->tex.r;
-   assert(!su->getIndirectR());
-
-   bld.setPosition(su, false);
-
-   const TexInstruction::ImgFormatDesc *format = su->tex.format;
-   const int bytes = (su->tex.format->bits[0] +
-                      su->tex.format->bits[1] +
-                      su->tex.format->bits[2] +
-                      su->tex.format->bits[3]) / 8;
-   DataType ty = typeOfSize(bytes);
-
-   Value *coord = processSurfaceCoords(su);
-
-   Value *untypedDst[4] = {};
-   Value *typedDst[4] = {};
-   int i;
-   for (i = 0; i < bytes / 4; i++)
-      untypedDst[i] = bld.getSSA();
-   if (bytes < 4)
-      untypedDst[0] = bld.getSSA();
-
-   for (i = 0; i < 4; i++)
-      typedDst[i] = su->getDef(i);
-
-   Instruction *load = bld.mkLoad(ty, NULL, bld.mkSymbol(FILE_MEMORY_GLOBAL, slot, ty, 0), coord);
-   for (i = 0; i < 4 && untypedDst[i]; i++)
-      load->setDef(i, untypedDst[i]);
-
-   // Unpack each component into the typed dsts
-   int bits = 0;
-   for (int i = 0; i < 4; bits += format->bits[i], i++) {
-      if (!typedDst[i])
-         continue;
-
-      if (i >= format->components) {
-         if (format->type == FLOAT ||
-             format->type == UNORM ||
-             format->type == SNORM)
-            bld.loadImm(typedDst[i], i == 3 ? 1.0f : 0.0f);
-         else
-            bld.loadImm(typedDst[i], i == 3 ? 1 : 0);
-         continue;
-      }
-
-      // Get just that component's data into the relevant place
-      if (format->bits[i] == 32)
-         bld.mkMov(typedDst[i], untypedDst[i]);
-      else if (format->bits[i] == 16) {
-         // We can always convert directly from the appropriate half of the
-         // loaded value into the typed result.
-         Value *src[2];
-         bld.mkSplit(src, 2, untypedDst[i / 2]);
-         bld.mkCvt(OP_CVT, getShaderType(format->type), typedDst[i],
-                   getPackedType(format, i), src[i & 1]);
-      }
-      else if (format->bits[i] == 8) {
-         // Same approach as for 16 bits, but we have to massage the value a
-         // bit more, since we have to get the appropriate 8 bits from the
-         // half-register. In all cases, we can CVT from a 8-bit source, so we
-         // only have to shift when we want the upper 8 bits.
-         Value *src[2], *shifted;
-         bld.mkSplit(src, 2, untypedDst[0]);
-         DataType packedType = getPackedType(format, i);
-         if (i & 1)
-            shifted = bld.mkOp2v(OP_SHR, TYPE_U16, bld.getSSA(2), src[!!(i & 2)], bld.loadImm(NULL, (uint16_t)8));
-         else
-            shifted = src[!!(i & 2)];
-
-         bld.mkCvt(OP_CVT, getShaderType(format->type), typedDst[i],
-                   packedType, shifted);
-      }
-      else {
-         // The options are 10, 11, and 2. Get it into a 32-bit reg, then
-         // shift/mask. That's where it'll have to end up anyways. For signed,
-         // we have to make sure to get sign-extension, so we actually have to
-         // shift *up* first, and then shift down. There's no advantage to
-         // AND'ing, so we don't.
-         DataType ty = TYPE_U32;
-         if (format->type == SNORM || format->type == SINT) {
-            ty = TYPE_S32;
-         }
-
-         // Poor man's EXTBF
-         bld.mkOp2(
-               OP_SHR, ty, typedDst[i],
-               bld.mkOp2v(OP_SHL, TYPE_U32, bld.getSSA(), untypedDst[0], bld.loadImm(NULL, 32 - bits - format->bits[i])),
-               bld.loadImm(NULL, 32 - format->bits[i]));
-
-         // If the stored data is already in the appropriate type, we don't
-         // have to do anything. Convert to float for the *NORM formats.
-         if (format->type == UNORM || format->type == SNORM)
-            bld.mkCvt(OP_CVT, TYPE_F32, typedDst[i], TYPE_U32, typedDst[i]);
-      }
-
-      // Normalize / convert as necessary
-      if (format->type == UNORM)
-         bld.mkOp2(OP_MUL, TYPE_F32, typedDst[i], typedDst[i], bld.loadImm(NULL, 1.0f / ((1 << format->bits[i]) - 1)));
-      else if (format->type == SNORM)
-         bld.mkOp2(OP_MUL, TYPE_F32, typedDst[i], typedDst[i], bld.loadImm(NULL, 1.0f / ((1 << (format->bits[i] - 1)) - 1)));
-      else if (format->type == FLOAT && format->bits[i] < 16) {
-         // We expect the value to be in the low bits of the register, so we
-         // have to shift back up.
-         bld.mkOp2(OP_SHL, TYPE_U32, typedDst[i], typedDst[i], bld.loadImm(NULL, 15 - format->bits[i]));
-         Value *src[2];
-         bld.mkSplit(src, 2, typedDst[i]);
-         bld.mkCvt(OP_CVT, TYPE_F32, typedDst[i], TYPE_F16, src[0]);
-      }
-   }
-
-   if (format->bgra) {
-      std::swap(typedDst[0], typedDst[2]);
-   }
-
-   bld.getBB()->remove(su);
-   return true;
-}
-
-bool
-NV50LoweringPreSSA::handleSUREDP(TexInstruction *su)
-{
-   const int slot = su->tex.r;
-   const int dim = su->tex.target.getDim();
-   const int arg = dim + (su->tex.target.isArray() || su->tex.target.isCube());
-   assert(!su->getIndirectR());
-
-   bld.setPosition(su, false);
-
-   Value *coord = processSurfaceCoords(su);
-
-   // This is guaranteed to be a 32-bit format. So there's nothing to
-   // pack/unpack.
-   Instruction *atom = bld.mkOp2(
-         OP_ATOM, su->dType, su->getDef(0),
-         bld.mkSymbol(FILE_MEMORY_GLOBAL, slot, TYPE_U32, 0), su->getSrc(arg));
-   if (su->subOp == NV50_IR_SUBOP_ATOM_CAS)
-      atom->setSrc(2, su->getSrc(arg + 1));
-   atom->setIndirect(0, 0, coord);
-   atom->subOp = su->subOp;
-
-   bld.getBB()->remove(su);
-   return true;
-}
-
-bool
-NV50LoweringPreSSA::handleSUSTP(TexInstruction *su)
-{
-   const int slot = su->tex.r;
-   const int dim = su->tex.target.getDim();
-   const int arg = dim + (su->tex.target.isArray() || su->tex.target.isCube());
-   assert(!su->getIndirectR());
-
-   bld.setPosition(su, false);
-
-   const TexInstruction::ImgFormatDesc *format = su->tex.format;
-   const int bytes = (su->tex.format->bits[0] +
-                      su->tex.format->bits[1] +
-                      su->tex.format->bits[2] +
-                      su->tex.format->bits[3]) / 8;
-   DataType ty = typeOfSize(bytes);
-
-   Value *coord = processSurfaceCoords(su);
-
-   // The packed values we will eventually store into memory
-   Value *untypedDst[4] = {};
-   // Each component's packed representation, in 16-bit registers (only used
-   // where appropriate)
-   Value *untypedDst16[4] = {};
-   // The original values that are being packed
-   Value *typedDst[4] = {};
-   int i;
-
-   for (i = 0; i < bytes / 4; i++)
-      untypedDst[i] = bld.getSSA();
-   for (i = 0; i < format->components; i++)
-      untypedDst16[i] = bld.getSSA(2);
-   // Make sure we get at least one of each value allocated for the
-   // super-narrow formats.
-   if (bytes < 4)
-      untypedDst[0] = bld.getSSA();
-   if (bytes < 2)
-      untypedDst16[0] = bld.getSSA(2);
-
-   for (i = 0; i < 4; i++) {
-      typedDst[i] = bld.getSSA();
-      bld.mkMov(typedDst[i], su->getSrc(arg + i));
-   }
-
-   if (format->bgra) {
-      std::swap(typedDst[0], typedDst[2]);
-   }
-
-   // Pack each component into the untyped dsts.
-   int bits = 0;
-   for (int i = 0; i < format->components; bits += format->bits[i], i++) {
-      // Un-normalize / convert as necessary
-      if (format->type == UNORM)
-         bld.mkOp2(OP_MUL, TYPE_F32, typedDst[i], typedDst[i], bld.loadImm(NULL, 1.0f * ((1 << format->bits[i]) - 1)));
-      else if (format->type == SNORM)
-         bld.mkOp2(OP_MUL, TYPE_F32, typedDst[i], typedDst[i], bld.loadImm(NULL, 1.0f * ((1 << (format->bits[i] - 1)) - 1)));
-
-      // There is nothing to convert/pack for 32-bit values
-      if (format->bits[i] == 32) {
-         bld.mkMov(untypedDst[i], typedDst[i]);
-         continue;
-      }
-
-      // The remainder of the cases will naturally want to deal in 16-bit
-      // registers. We will put these into untypedDst16 and then merge them
-      // together later.
-      if (format->type == FLOAT && format->bits[i] < 16) {
-         bld.mkCvt(OP_CVT, TYPE_F16, untypedDst16[i], TYPE_F32, typedDst[i]);
-         bld.mkOp2(OP_SHR, TYPE_U16, untypedDst16[i], untypedDst16[i], bld.loadImm(NULL, (uint16_t)(15 - format->bits[i])));
-
-         // For odd bit sizes, it's easier to pack it into the final
-         // destination directly.
-         Value *tmp = bld.getSSA();
-         bld.mkCvt(OP_CVT, TYPE_U32, tmp, TYPE_U16, untypedDst16[i]);
-         if (i == 0) {
-            untypedDst[0] = tmp;
-         } else {
-            bld.mkOp2(OP_SHL, TYPE_U32, tmp, tmp, bld.loadImm(NULL, bits));
-            bld.mkOp2(OP_OR, TYPE_U32, untypedDst[0], untypedDst[0], tmp);
-         }
-      } else if (format->bits[i] == 16) {
-         // We can always convert the shader value into the packed value
-         // directly here
-         bld.mkCvt(OP_CVT, getPackedType(format, i), untypedDst16[i],
-                   getShaderType(format->type), typedDst[i]);
-      } else if (format->bits[i] < 16) {
-         DataType packedType = getPackedType(format, i);
-         DataType shaderType = getShaderType(format->type);
-         // We can't convert F32 to U8/S8 directly, so go to U16/S16 first.
-         if (shaderType == TYPE_F32 && typeSizeof(packedType) == 1) {
-            packedType = format->type == SNORM ? TYPE_S16 : TYPE_U16;
-         }
-         bld.mkCvt(OP_CVT, packedType, untypedDst16[i], shaderType, typedDst[i]);
-         // TODO: clamp for 10- and 2-bit sizes. Also, due to the oddness of
-         // the size, it's easier to dump them into a 32-bit value and OR
-         // everything later.
-         if (format->bits[i] != 8) {
-            // Restrict value to the appropriate bits (although maybe supposed
-            // to clamp instead?)
-            bld.mkOp2(OP_AND, TYPE_U16, untypedDst16[i], untypedDst16[i], bld.loadImm(NULL, (uint16_t)((1 << format->bits[i]) - 1)));
-            // And merge into final packed value
-            Value *tmp = bld.getSSA();
-            bld.mkCvt(OP_CVT, TYPE_U32, tmp, TYPE_U16, untypedDst16[i]);
-            if (i == 0) {
-               untypedDst[0] = tmp;
-            } else {
-               bld.mkOp2(OP_SHL, TYPE_U32, tmp, tmp, bld.loadImm(NULL, bits));
-               bld.mkOp2(OP_OR, TYPE_U32, untypedDst[0], untypedDst[0], tmp);
-            }
-         } else if (i & 1) {
-            // Shift the 8-bit value up (so that it can be OR'd later)
-            bld.mkOp2(OP_SHL, TYPE_U16, untypedDst16[i], untypedDst16[i], bld.loadImm(NULL, (uint16_t)(bits % 16)));
-         } else if (packedType != TYPE_U8) {
-            // S8 (or the *16 if converted from float) will all have high bits
-            // set, so AND them out.
-            bld.mkOp2(OP_AND, TYPE_U16, untypedDst16[i], untypedDst16[i], bld.loadImm(NULL, (uint16_t)0xff));
-         }
-      }
-   }
-
-   // OR pairs of 8-bit values together (into the even value)
-   if (format->bits[0] == 8) {
-      for (i = 0; i < 2 && untypedDst16[2 * i] && untypedDst16[2 * i + 1]; i++)
-         bld.mkOp2(OP_OR, TYPE_U16, untypedDst16[2 * i], untypedDst16[2 * i], untypedDst16[2 * i + 1]);
-   }
-
-   // We'll always want to have at least a 32-bit source register for the store
-   Instruction *merge = bld.mkOp(OP_MERGE, bytes < 4 ? TYPE_U32 : ty, bld.getSSA(bytes < 4 ? 4 : bytes));
-   if (format->bits[0] == 32) {
-      for (i = 0; i < 4 && untypedDst[i]; i++)
-         merge->setSrc(i, untypedDst[i]);
-   } else if (format->bits[0] == 16) {
-      for (i = 0; i < 4 && untypedDst16[i]; i++)
-         merge->setSrc(i, untypedDst16[i]);
-      if (i == 1)
-         merge->setSrc(i, bld.getSSA(2));
-   } else if (format->bits[0] == 8) {
-      for (i = 0; i < 2 && untypedDst16[2 * i]; i++)
-         merge->setSrc(i, untypedDst16[2 * i]);
-      if (i == 1)
-         merge->setSrc(i, bld.getSSA(2));
-   } else {
-      merge->setSrc(0, untypedDst[0]);
-   }
-
-   bld.mkStore(OP_STORE, ty, bld.mkSymbol(FILE_MEMORY_GLOBAL, slot, TYPE_U32, 0), coord, merge->getDef(0));
-
-   bld.getBB()->remove(su);
-   return true;
-}
-
-bool
 NV50LoweringPreSSA::handlePFETCH(Instruction *i)
 {
    assert(prog->getType() == Program::TYPE_GEOMETRY);
@@ -2203,8 +1333,6 @@ NV50LoweringPreSSA::visit(Instruction *i)
       return handleTXD(i->asTex());
    case OP_TXLQ:
       return handleTXLQ(i->asTex());
-   case OP_TXQ:
-      return handleTXQ(i->asTex());
    case OP_EX2:
       bld.mkOp1(OP_PREEX2, TYPE_F32, i->getDef(0), i->getSrc(0));
       i->setSrc(0, i->getDef(0));
@@ -2225,21 +1353,6 @@ NV50LoweringPreSSA::visit(Instruction *i)
       return handleEXPORT(i);
    case OP_LOAD:
       return handleLOAD(i);
-   case OP_MEMBAR:
-      return handleMEMBAR(i);
-   case OP_ATOM:
-   case OP_STORE:
-      return handleLDST(i);
-   case OP_SULDP:
-      return handleSULDP(i->asTex());
-   case OP_SUSTP:
-      return handleSUSTP(i->asTex());
-   case OP_SUREDP:
-      return handleSUREDP(i->asTex());
-   case OP_SUQ:
-      return handleSUQ(i->asTex());
-   case OP_BUFQ:
-      return handleBUFQ(i);
    case OP_RDSV:
       return handleRDSV(i);
    case OP_WRSV:
diff --git a/lib/mesa/src/gallium/drivers/nouveau/codegen/nv50_ir_ssa.cpp b/lib/mesa/src/gallium/drivers/nouveau/codegen/nv50_ir_ssa.cpp
index 3d25ad928..2e432349f 100644
--- a/lib/mesa/src/gallium/drivers/nouveau/codegen/nv50_ir_ssa.cpp
+++ b/lib/mesa/src/gallium/drivers/nouveau/codegen/nv50_ir_ssa.cpp
@@ -87,7 +87,6 @@ DominatorTree::DominatorTree(Graph *cfgraph) : cfg(cfgraph),
       LABEL(i) = i;
       SEMI(i) = ANCESTOR(i) = -1;
    }
-   assert(i == count);
 
    build();
 
@@ -169,7 +168,7 @@ void DominatorTree::build()
    do {
       p = 0;
       for (v = 1; v < count; ++v) {
-         nw = &BasicBlock::get(vert[DOM(v)])->dom;
+         nw = &BasicBlock::get(vert[DOM(v)])->dom;;
          nv = &BasicBlock::get(vert[v])->dom;
          if (nw->getGraph() && !nv->getGraph()) {
             ++p;
diff --git a/lib/mesa/src/gallium/drivers/nouveau/codegen/nv50_ir_target_nv50.cpp b/lib/mesa/src/gallium/drivers/nouveau/codegen/nv50_ir_target_nv50.cpp
index b9c3746ad..f3ddcaa51 100644
--- a/lib/mesa/src/gallium/drivers/nouveau/codegen/nv50_ir_target_nv50.cpp
+++ b/lib/mesa/src/gallium/drivers/nouveau/codegen/nv50_ir_target_nv50.cpp
@@ -66,7 +66,7 @@ TargetNV50::getBuiltinOffset(int builtin) const
    return 0;
 }
 
-struct nv50_opProperties
+struct opProperties
 {
    operation op;
    unsigned int mNeg    : 4;
@@ -79,7 +79,7 @@ struct nv50_opProperties
    unsigned int fImm    : 3;
 };
 
-static const struct nv50_opProperties _initProps[] =
+static const struct opProperties _initProps[] =
 {
    //           neg  abs  not  sat  c[]  s[], a[], imm
    { OP_ADD,    0x3, 0x0, 0x0, 0x8, 0x2, 0x1, 0x1, 0x2 },
@@ -99,7 +99,6 @@ static const struct nv50_opProperties _initProps[] =
    { OP_SET,    0x3, 0x3, 0x0, 0x0, 0x2, 0x1, 0x1, 0x0 },
    { OP_PREEX2, 0x1, 0x1, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0 },
    { OP_PRESIN, 0x1, 0x1, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0 },
-   { OP_EX2,    0x0, 0x0, 0x0, 0x8, 0x0, 0x0, 0x0, 0x0 },
    { OP_LG2,    0x1, 0x1, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0 },
    { OP_RCP,    0x1, 0x1, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0 },
    { OP_RSQ,    0x1, 0x1, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0 },
@@ -111,15 +110,15 @@ void TargetNV50::initOpInfo()
 {
    unsigned int i, j;
 
-   static const operation commutativeList[] =
+   static const uint32_t commutative[(OP_LAST + 31) / 32] =
    {
-      OP_ADD, OP_MUL, OP_MAD, OP_FMA, OP_AND, OP_OR, OP_XOR, OP_MAX, OP_MIN,
-      OP_SET_AND, OP_SET_OR, OP_SET_XOR, OP_SET, OP_SELP, OP_SLCT
+      // ADD,MAD,MUL,AND,OR,XOR,MAX,MIN
+      0x0670ca00, 0x0000003f, 0x00000000, 0x00000000
    };
-   static const operation shortFormList[] =
+   static const uint32_t shortForm[(OP_LAST + 31) / 32] =
    {
-      OP_MOV, OP_ADD, OP_SUB, OP_MUL, OP_MAD, OP_SAD, OP_RCP, OP_LINTERP,
-      OP_PINTERP, OP_TEX, OP_TXF
+      // MOV,ADD,SUB,MUL,MAD,SAD,L/PINTERP,RCP,TEX,TXF
+      0x00014e40, 0x00000040, 0x00000930, 0x00000000
    };
    static const operation noDestList[] =
    {
@@ -156,23 +155,19 @@ void TargetNV50::initOpInfo()
 
       opInfo[i].hasDest = 1;
       opInfo[i].vector = (i >= OP_TEX && i <= OP_TEXCSAA);
-      opInfo[i].commutative = false; /* set below */
+      opInfo[i].commutative = (commutative[i / 32] >> (i % 32)) & 1;
       opInfo[i].pseudo = (i < OP_MOV);
       opInfo[i].predicate = !opInfo[i].pseudo;
       opInfo[i].flow = (i >= OP_BRA && i <= OP_JOIN);
-      opInfo[i].minEncSize = 8; /* set below */
+      opInfo[i].minEncSize = (shortForm[i / 32] & (1 << (i % 32))) ? 4 : 8;
    }
-   for (i = 0; i < ARRAY_SIZE(commutativeList); ++i)
-      opInfo[commutativeList[i]].commutative = true;
-   for (i = 0; i < ARRAY_SIZE(shortFormList); ++i)
-      opInfo[shortFormList[i]].minEncSize = 4;
-   for (i = 0; i < ARRAY_SIZE(noDestList); ++i)
+   for (i = 0; i < sizeof(noDestList) / sizeof(noDestList[0]); ++i)
       opInfo[noDestList[i]].hasDest = 0;
-   for (i = 0; i < ARRAY_SIZE(noPredList); ++i)
+   for (i = 0; i < sizeof(noPredList) / sizeof(noPredList[0]); ++i)
       opInfo[noPredList[i]].predicate = 0;
 
-   for (i = 0; i < ARRAY_SIZE(_initProps); ++i) {
-      const struct nv50_opProperties *prop = &_initProps[i];
+   for (i = 0; i < sizeof(_initProps) / sizeof(_initProps[0]); ++i) {
+      const struct opProperties *prop = &_initProps[i];
 
       for (int s = 0; s < 3; ++s) {
          if (prop->mNeg & (1 << s))
@@ -203,16 +198,14 @@ TargetNV50::getFileSize(DataFile file) const
 {
    switch (file) {
    case FILE_NULL:          return 0;
-   case FILE_GPR:           return 254; // in 16-bit units **
+   case FILE_GPR:           return 256; // in 16-bit units **
    case FILE_PREDICATE:     return 0;
    case FILE_FLAGS:         return 4;
    case FILE_ADDRESS:       return 4;
-   case FILE_BARRIER:       return 0;
    case FILE_IMMEDIATE:     return 0;
    case FILE_MEMORY_CONST:  return 65536;
    case FILE_SHADER_INPUT:  return 0x200;
    case FILE_SHADER_OUTPUT: return 0x200;
-   case FILE_MEMORY_BUFFER: return 0xffffffff;
    case FILE_MEMORY_GLOBAL: return 0xffffffff;
    case FILE_MEMORY_SHARED: return 16 << 10;
    case FILE_MEMORY_LOCAL:  return 48 << 10;
@@ -252,18 +245,15 @@ TargetNV50::getSVAddress(DataFile shaderFile, const Symbol *sym) const
       return shaderFile == FILE_SHADER_INPUT ? 0x18 :
          sysvalLocation[sym->reg.data.sv.sv];
    case SV_NCTAID:
-      return sym->reg.data.sv.index >= 2 ? 0x10 : 0x8 + 2 * sym->reg.data.sv.index;
+      return 0x8 + 2 * sym->reg.data.sv.index;
    case SV_CTAID:
-      return sym->reg.data.sv.index >= 2 ? 0x12 : 0xc + 2 * sym->reg.data.sv.index;
+      return 0xc + 2 * sym->reg.data.sv.index;
    case SV_NTID:
       return 0x2 + 2 * sym->reg.data.sv.index;
    case SV_TID:
-   case SV_COMBINED_TID:
       return 0;
    case SV_SAMPLE_POS:
       return 0; /* sample position is handled differently */
-   case SV_THREAD_KILL:
-      return 0;
    default:
       return sysvalLocation[sym->reg.data.sv.sv];
    }
@@ -278,16 +268,6 @@ TargetNV50::insnCanLoad(const Instruction *i, int s,
 {
    DataFile sf = ld->src(0).getFile();
 
-   // immediate 0 can be represented by GPR $r63/$r127
-   // this does not work with global memory ld/st/atom
-   if (sf == FILE_IMMEDIATE && ld->getSrc(0)->reg.data.u64 == 0)
-      return (!i->isPseudo() &&
-              !i->asTex() &&
-              i->op != OP_EXPORT &&
-              i->op != OP_STORE &&
-              ((i->op != OP_ATOM && i->op != OP_LOAD) ||
-               i->src(0).getFile() != FILE_MEMORY_GLOBAL));
-
    if (sf == FILE_IMMEDIATE && (i->predSrc >= 0 || i->flagsDef >= 0))
       return false;
    if (s >= opInfo[i->op].srcNr)
@@ -362,11 +342,8 @@ TargetNV50::insnCanLoad(const Instruction *i, int s,
       ldSize = typeSizeof(ld->dType);
    }
 
-   if (sf == FILE_IMMEDIATE) {
-      if (ldSize == 2 && (i->op == OP_AND || i->op == OP_OR || i->op == OP_XOR))
-         return false;
-      return ldSize <= 4;
-   }
+   if (sf == FILE_IMMEDIATE)
+      return true;
 
 
    // Check if memory access is encodable:
@@ -402,29 +379,12 @@ TargetNV50::insnCanLoad(const Instruction *i, int s,
 }
 
 bool
-TargetNV50::insnCanLoadOffset(const Instruction *i, int s, int offset) const
-{
-   if (!i->src(s).isIndirect(0))
-      return true;
-   offset += i->src(s).get()->reg.data.offset;
-   if (i->op == OP_LOAD || i->op == OP_STORE || i->op == OP_ATOM) {
-      // There are some restrictions in theory, but in practice they're never
-      // going to be hit. However offsets on global/shared memory are just
-      // plain not supported.
-      return i->src(s).getFile() != FILE_MEMORY_GLOBAL &&
-         i->src(s).getFile() != FILE_MEMORY_SHARED;
-   }
-   return offset >= 0 && offset <= (int32_t)(127 * i->src(s).get()->reg.size);
-}
-
-bool
 TargetNV50::isAccessSupported(DataFile file, DataType ty) const
 {
    if (ty == TYPE_B96 || ty == TYPE_NONE)
       return false;
    if (typeSizeof(ty) > 4)
-      return (file == FILE_MEMORY_LOCAL) || (file == FILE_MEMORY_GLOBAL) ||
-             (file == FILE_MEMORY_BUFFER);
+      return (file == FILE_MEMORY_LOCAL) || (file == FILE_MEMORY_GLOBAL);
    return true;
 }
 
@@ -453,8 +413,6 @@ TargetNV50::isOpSupported(operation op, DataType ty) const
    case OP_EXTBF:
    case OP_EXIT: // want exit modifier instead (on NOP if required)
    case OP_MEMBAR:
-   case OP_SHLADD:
-   case OP_XMAD:
       return false;
    case OP_SAD:
       return ty == TYPE_S32;
@@ -496,7 +454,7 @@ TargetNV50::isModSupported(const Instruction *insn, int s, Modifier mod) const
          return false;
       }
    }
-   if (s >= opInfo[insn->op].srcNr || s >= 3)
+   if (s >= 3)
       return false;
    return (mod & Modifier(opInfo[insn->op].srcMods[s])) == mod;
 }
@@ -529,7 +487,6 @@ int TargetNV50::getLatency(const Instruction *i) const
       switch (i->src(0).getFile()) {
       case FILE_MEMORY_LOCAL:
       case FILE_MEMORY_GLOBAL:
-      case FILE_MEMORY_BUFFER:
          return 100; // really 400 to 800
       default:
          return 22;
@@ -595,24 +552,21 @@ recordLocation(uint16_t *locs, uint8_t *masks,
 }
 
 void
-TargetNV50::parseDriverInfo(const struct nv50_ir_prog_info *info,
-                            const struct nv50_ir_prog_info_out *info_out)
+TargetNV50::parseDriverInfo(const struct nv50_ir_prog_info *info)
 {
    unsigned int i;
-   for (i = 0; i < info_out->numOutputs; ++i)
-      recordLocation(sysvalLocation, NULL, &info_out->out[i]);
-   for (i = 0; i < info_out->numInputs; ++i)
-      recordLocation(sysvalLocation, &wposMask, &info_out->in[i]);
-   for (i = 0; i < info_out->numSysVals; ++i)
-      recordLocation(sysvalLocation, NULL, &info_out->sv[i]);
+   for (i = 0; i < info->numOutputs; ++i)
+      recordLocation(sysvalLocation, NULL, &info->out[i]);
+   for (i = 0; i < info->numInputs; ++i)
+      recordLocation(sysvalLocation, &wposMask, &info->in[i]);
+   for (i = 0; i < info->numSysVals; ++i)
+      recordLocation(sysvalLocation, NULL, &info->sv[i]);
 
    if (sysvalLocation[SV_POSITION] >= 0x200) {
       // not assigned by driver, but we need it internally
       wposMask = 0x8;
       sysvalLocation[SV_POSITION] = 0;
    }
-
-   Target::parseDriverInfo(info, info_out);
 }
 
 } // namespace nv50_ir
diff --git a/lib/mesa/src/gallium/drivers/nouveau/codegen/nv50_ir_target_nv50.h b/lib/mesa/src/gallium/drivers/nouveau/codegen/nv50_ir_target_nv50.h
index caf66b269..0cbf180d0 100644
--- a/lib/mesa/src/gallium/drivers/nouveau/codegen/nv50_ir_target_nv50.h
+++ b/lib/mesa/src/gallium/drivers/nouveau/codegen/nv50_ir_target_nv50.h
@@ -42,13 +42,10 @@ public:
 
    virtual void getBuiltinCode(const uint32_t **code, uint32_t *size) const;
 
-   virtual void parseDriverInfo(const struct nv50_ir_prog_info *,
-                                const struct nv50_ir_prog_info_out *);
+   virtual void parseDriverInfo(const struct nv50_ir_prog_info *);
 
    virtual bool insnCanLoad(const Instruction *insn, int s,
                             const Instruction *ld) const;
-   virtual bool insnCanLoadOffset(const Instruction *insn, int s,
-                                  int offset) const;
    virtual bool isOpSupported(operation, DataType) const;
    virtual bool isAccessSupported(DataFile, DataType) const;
    virtual bool isModSupported(const Instruction *, int s, Modifier) const;
diff --git a/lib/mesa/src/gallium/drivers/nouveau/codegen/nv50_ir_target_nvc0.h b/lib/mesa/src/gallium/drivers/nouveau/codegen/nv50_ir_target_nvc0.h
index 7808164f4..3c5c74804 100644
--- a/lib/mesa/src/gallium/drivers/nouveau/codegen/nv50_ir_target_nvc0.h
+++ b/lib/mesa/src/gallium/drivers/nouveau/codegen/nv50_ir_target_nvc0.h
@@ -31,15 +31,11 @@ namespace nv50_ir {
 
 #define NVC0_BUILTIN_COUNT 4
 
-struct nvc0_opProperties;
-
 class TargetNVC0 : public Target
 {
 public:
    TargetNVC0(unsigned int chipset);
 
-   void initProps(const struct nvc0_opProperties *props, int size);
-
    virtual CodeEmitter *getCodeEmitter(Program::Type);
 
    CodeEmitter *createCodeEmitterNVC0(Program::Type);
@@ -52,8 +48,6 @@ public:
 
    virtual bool insnCanLoad(const Instruction *insn, int s,
                             const Instruction *ld) const;
-   virtual bool insnCanLoadOffset(const Instruction *insn, int s,
-                                  int offset) const;
    virtual bool isOpSupported(operation, DataType) const;
    virtual bool isAccessSupported(DataFile, DataType) const;
    virtual bool isModSupported(const Instruction *, int s, Modifier) const;
diff --git a/lib/mesa/src/gallium/drivers/nouveau/codegen/nv50_ir_util.cpp b/lib/mesa/src/gallium/drivers/nouveau/codegen/nv50_ir_util.cpp
index dc4ebd51a..d26acb304 100644
--- a/lib/mesa/src/gallium/drivers/nouveau/codegen/nv50_ir_util.cpp
+++ b/lib/mesa/src/gallium/drivers/nouveau/codegen/nv50_ir_util.cpp
@@ -297,8 +297,8 @@ void BitSet::fill(uint32_t val)
    unsigned int i;
    for (i = 0; i < (size + 31) / 32; ++i)
       data[i] = val;
-   if (val && i)
-      data[i - 1] &= (1 << (size % 32)) - 1;
+   if (val)
+      data[i] &= ~(0xffffffff << (size % 32)); // BE ?
 }
 
 void BitSet::setOr(BitSet *pA, BitSet *pB)
@@ -311,12 +311,12 @@ void BitSet::setOr(BitSet *pA, BitSet *pB)
    }
 }
 
-int BitSet::findFreeRange(unsigned int count, unsigned int max) const
+int BitSet::findFreeRange(unsigned int count) const
 {
    const uint32_t m = (1 << count) - 1;
-   int pos = max;
+   int pos = size;
    unsigned int i;
-   const unsigned int end = (max + 31) / 32;
+   const unsigned int end = (size + 31) / 32;
 
    if (count == 1) {
       for (i = 0; i < end; ++i) {
@@ -365,15 +365,9 @@ int BitSet::findFreeRange(unsigned int count, unsigned int max) const
          }
       }
    }
-
-   // If we couldn't find a position, we can have a left-over -1 in pos. Make
-   // sure to abort in such a case.
-   if (pos < 0)
-      return -1;
-
    pos += i * 32;
 
-   return ((pos + count) <= max) ? pos : -1;
+   return ((pos + count) <= size) ? pos : -1;
 }
 
 void BitSet::print() const
diff --git a/lib/mesa/src/gallium/drivers/nouveau/codegen/nv50_ir_util.h b/lib/mesa/src/gallium/drivers/nouveau/codegen/nv50_ir_util.h
index b1766f482..fa2c4804a 100644
--- a/lib/mesa/src/gallium/drivers/nouveau/codegen/nv50_ir_util.h
+++ b/lib/mesa/src/gallium/drivers/nouveau/codegen/nv50_ir_util.h
@@ -36,14 +36,14 @@
 #include "util/u_inlines.h"
 #include "util/u_memory.h"
 
-#define ERROR(args...) _debug_printf("ERROR: " args)
-#define WARN(args...) _debug_printf("WARNING: " args)
-#define INFO(args...) _debug_printf(args)
+#define ERROR(args...) debug_printf("ERROR: " args)
+#define WARN(args...) debug_printf("WARNING: " args)
+#define INFO(args...) debug_printf(args)
 
 #define INFO_DBG(m, f, args...)          \
    do {                                  \
       if (m & NV50_IR_DEBUG_##f)         \
-         _debug_printf(args);             \
+         debug_printf(args);             \
    } while(0)
 
 #define FATAL(args...)          \
@@ -94,11 +94,7 @@ public:
    virtual void reset() { assert(0); } // only for graph iterators
 };
 
-#if __cplusplus >= 201103L
-typedef std::unique_ptr<Iterator> IteratorRef;
-#else
 typedef std::auto_ptr<Iterator> IteratorRef;
-#endif
 
 class ManipIterator : public Iterator
 {
@@ -145,7 +141,7 @@ public:
 #define DLLIST_EMPTY(__list) ((__list)->next == (__list))
 
 #define DLLIST_FOR_EACH(list, it) \
-   for (DLList::Iterator it = (list)->iterator(); !(it).end(); (it).next())
+   for (DLList::Iterator (it) = (list)->iterator(); !(it).end(); (it).next())
 
 class DLList
 {
@@ -203,7 +199,7 @@ public:
       virtual void erase();
       virtual bool insert(void *data);
 
-      // move item to another list, no consistency with its iterators though
+      // move item to a another list, no consistency with its iterators though
       void moveToList(DLList&);
 
    private:
@@ -539,11 +535,8 @@ public:
       return data[i / 32] & (((1 << n) - 1) << (i % 32));
    }
 
-   // Find a range of count (<= 32) clear bits aligned to roundup_pow2(count).
-   int findFreeRange(unsigned int count, unsigned int max) const;
-   inline int findFreeRange(unsigned int count) const {
-      return findFreeRange(count, size);
-   }
+   // Find a range of size (<= 32) clear bits aligned to roundup_pow2(size).
+   int findFreeRange(unsigned int size) const;
 
    BitSet& operator|=(const BitSet&);
author	Jonathan Gray <jsg@cvs.openbsd.org>	2015-11-22 02:45:45 +0000
committer	Jonathan Gray <jsg@cvs.openbsd.org>	2015-11-22 02:45:45 +0000
commit	b7ab2ee0fa1e6e04a545a9bd2088ac621c810081 (patch)
tree	db90836dcf322d66f4369cb79b21ec5e68986925 /lib/mesa/src/gallium/drivers
parent	f00235c070468f96521cd88ebc8919fa0cb89a25 (diff)