summary refs log tree commit diff
path: root/lib/mesa/src/gallium/drivers
diff options
context:
space:
mode:
author Jonathan Gray <jsg@cvs.openbsd.org> 2015-11-22 02:45:45 +0000
committer Jonathan Gray <jsg@cvs.openbsd.org> 2015-11-22 02:45:45 +0000
commit b7ab2ee0fa1e6e04a545a9bd2088ac621c810081 (patch)
tree db90836dcf322d66f4369cb79b21ec5e68986925 /lib/mesa/src/gallium/drivers
parent f00235c070468f96521cd88ebc8919fa0cb89a25 (diff)
import Mesa 11.0.6
Diffstat (limited to 'lib/mesa/src/gallium/drivers')
-rw-r--r-- lib/mesa/src/gallium/drivers/nouveau/codegen/lib/Makefile | 4
-rw-r--r-- lib/mesa/src/gallium/drivers/nouveau/codegen/lib/gk104.asm | 205
-rw-r--r-- lib/mesa/src/gallium/drivers/nouveau/codegen/lib/gk104.asm.h | 144
-rw-r--r-- lib/mesa/src/gallium/drivers/nouveau/codegen/lib/gk110.asm | 219
-rw-r--r-- lib/mesa/src/gallium/drivers/nouveau/codegen/lib/gk110.asm.h | 127
-rw-r--r-- lib/mesa/src/gallium/drivers/nouveau/codegen/nv50_ir_bb.cpp | 5
-rw-r--r-- lib/mesa/src/gallium/drivers/nouveau/codegen/nv50_ir_build_util.cpp | 61
-rw-r--r-- lib/mesa/src/gallium/drivers/nouveau/codegen/nv50_ir_build_util.h | 22
-rw-r--r-- lib/mesa/src/gallium/drivers/nouveau/codegen/nv50_ir_emit_nv50.cpp | 378
-rw-r--r-- lib/mesa/src/gallium/drivers/nouveau/codegen/nv50_ir_graph.cpp | 19
-rw-r--r-- lib/mesa/src/gallium/drivers/nouveau/codegen/nv50_ir_graph.h | 3
-rw-r--r-- lib/mesa/src/gallium/drivers/nouveau/codegen/nv50_ir_inlines.h | 14
-rw-r--r-- lib/mesa/src/gallium/drivers/nouveau/codegen/nv50_ir_lowering_nv50.cpp | 953
-rw-r--r-- lib/mesa/src/gallium/drivers/nouveau/codegen/nv50_ir_ssa.cpp | 3
-rw-r--r-- lib/mesa/src/gallium/drivers/nouveau/codegen/nv50_ir_target_nv50.cpp | 102
-rw-r--r-- lib/mesa/src/gallium/drivers/nouveau/codegen/nv50_ir_target_nv50.h | 5
-rw-r--r-- lib/mesa/src/gallium/drivers/nouveau/codegen/nv50_ir_target_nvc0.h | 6
-rw-r--r-- lib/mesa/src/gallium/drivers/nouveau/codegen/nv50_ir_util.cpp | 18
-rw-r--r-- lib/mesa/src/gallium/drivers/nouveau/codegen/nv50_ir_util.h | 23
19 files changed, 192 insertions, 2119 deletions
diff --git a/lib/mesa/src/gallium/drivers/nouveau/codegen/lib/Makefile b/lib/mesa/src/gallium/drivers/nouveau/codegen/lib/Makefile
index 115f6d0c0..06d1979d8 100644
--- a/lib/mesa/src/gallium/drivers/nouveau/codegen/lib/Makefile
+++ b/lib/mesa/src/gallium/drivers/nouveau/codegen/lib/Makefile
@@ -3,9 +3,9 @@ ENVYAS ?= envyas
all: gf100.asm.h gk104.asm.h gk110.asm.h gm107.asm.h
gf100.asm.h: %.asm.h: %.asm
- $(ENVYAS) -a -W -mgf100 -Vgf100 $< -o $@
+ $(ENVYAS) -a -W -mnvc0 -Vnvc0 $< -o $@
gk104.asm.h: %.asm.h: %.asm
- $(ENVYAS) -a -W -mgf100 -Vgk104 $< -o $@
+ $(ENVYAS) -a -W -mnvc0 -Vnve4 $< -o $@
gk110.asm.h: %.asm.h: %.asm
$(ENVYAS) -a -W -mgk110 $< -o $@
gm107.asm.h: %.asm.h: %.asm
diff --git a/lib/mesa/src/gallium/drivers/nouveau/codegen/lib/gk104.asm b/lib/mesa/src/gallium/drivers/nouveau/codegen/lib/gk104.asm
index 21a6b4de6..cd65b5472 100644
--- a/lib/mesa/src/gallium/drivers/nouveau/codegen/lib/gk104.asm
+++ b/lib/mesa/src/gallium/drivers/nouveau/codegen/lib/gk104.asm
@@ -543,8 +543,6 @@ $p2 suldgb b32 $r3 cg zero u8 g[$r4d] $r2 $p0
$p1 suldgb b32 $r3 cv zero u8 g[$r4d] $r2 $p0
long mov b32 $r3 0x3f800000
long nop
-sched 0x00 0x00 0x00 0x00 0x00 0x00 0x00
-long nop
long ret
@@ -556,144 +554,7 @@ long ret
// SIZE: 9 * 8 bytes
//
gk104_rcp_f64:
- // Step 1: classify input according to exponent and value, and calculate
- // result for 0/inf/nan. $r2 holds the exponent value, which starts at
- // bit 52 (bit 20 of the upper half) and is 11 bits in length
- ext u32 $r2 $r1 0xb14
- add b32 $r3 $r2 0xffffffff
- joinat #rcp_rejoin
- // We want to check whether the exponent is 0 or 0x7ff (i.e. NaN, inf,
- // denorm, or 0). Do this by subtracting 1 from the exponent, which will
- // mean that it's > 0x7fd in those cases when doing unsigned comparison
- set $p0 0x1 gt u32 $r3 0x7fd
- // $r3: 0 for norms, 0x36 for denorms, -1 for others
- long mov b32 $r3 0x0
- sched 0x2f 0x04 0x2d 0x2b 0x2f 0x28 0x28
- join (not $p0) nop
- // Process all special values: NaN, inf, denorm, 0
- mov b32 $r3 0xffffffff
- // A number is NaN if its abs value is greater than or unordered with inf
- set $p0 0x1 gtu f64 abs $r0d 0x7ff0000000000000
- (not $p0) bra #rcp_inf_or_denorm_or_zero
- // NaN -> NaN, the next line sets the "quiet" bit of the result. This
- // behavior is both seen on the CPU and the blob
- join or b32 $r1 $r1 0x80000
-rcp_inf_or_denorm_or_zero:
- and b32 $r4 $r1 0x7ff00000
- // Other values with nonzero in exponent field should be inf
- set $p0 0x1 eq s32 $r4 0x0
- sched 0x2b 0x04 0x2f 0x2d 0x2b 0x2f 0x20
- $p0 bra #rcp_denorm_or_zero
- // +/-Inf -> +/-0
- xor b32 $r1 $r1 0x7ff00000
- join mov b32 $r0 0x0
-rcp_denorm_or_zero:
- set $p0 0x1 gtu f64 abs $r0d 0x0
- $p0 bra #rcp_denorm
- // +/-0 -> +/-Inf
- join or b32 $r1 $r1 0x7ff00000
-rcp_denorm:
- // non-0 denorms: multiply with 2^54 (the 0x36 in $r3), join with norms
- mul rn f64 $r0d $r0d 0x4350000000000000
- sched 0x2f 0x28 0x2b 0x28 0x28 0x04 0x28
- join mov b32 $r3 0x36
-rcp_rejoin:
- // All numbers with -1 in $r3 have their result ready in $r0d, return them
- // others need further calculation
- set $p0 0x1 lt s32 $r3 0x0
- $p0 bra #rcp_end
- // Step 2: Before the real calculation goes on, renormalize the values to
- // range [1, 2) by setting exponent field to 0x3ff (the exponent of 1)
- // result in $r6d. The exponent will be recovered later.
- ext u32 $r2 $r1 0xb14
- and b32 $r7 $r1 0x800fffff
- add b32 $r7 $r7 0x3ff00000
- long mov b32 $r6 $r0
- sched 0x2b 0x04 0x28 0x28 0x2a 0x2b 0x2e
- // Step 3: Convert new value to float (no overflow will occur due to step
- // 2), calculate rcp and do newton-raphson step once
- cvt rz f32 $r5 f64 $r6d
- long rcp f32 $r4 $r5
- mov b32 $r0 0xbf800000
- fma rn f32 $r5 $r4 $r5 $r0
- fma rn f32 $r0 neg $r4 $r5 $r4
- // Step 4: convert result $r0 back to double, do newton-raphson steps
- cvt f64 $r0d f32 $r0
- cvt f64 $r6d neg f64 $r6d
- sched 0x2e 0x29 0x29 0x29 0x29 0x29 0x29
- cvt f64 $r8d f32 0x3f800000
- // 4 Newton-Raphson Steps, tmp in $r4d, result in $r0d
- // The formula used here (and above) is:
- // RCP_{n + 1} = 2 * RCP_{n} - x * RCP_{n} * RCP_{n}
- // The following code uses 2 FMAs for each step, and it will basically
- // looks like:
- // tmp = -src * RCP_{n} + 1
- // RCP_{n + 1} = RCP_{n} * tmp + RCP_{n}
- fma rn f64 $r4d $r6d $r0d $r8d
- fma rn f64 $r0d $r0d $r4d $r0d
- fma rn f64 $r4d $r6d $r0d $r8d
- fma rn f64 $r0d $r0d $r4d $r0d
- fma rn f64 $r4d $r6d $r0d $r8d
- fma rn f64 $r0d $r0d $r4d $r0d
- sched 0x29 0x20 0x28 0x28 0x28 0x28 0x28
- fma rn f64 $r4d $r6d $r0d $r8d
- fma rn f64 $r0d $r0d $r4d $r0d
- // Step 5: Exponent recovery and final processing
- // The exponent is recovered by adding what we added to the exponent.
- // Suppose we want to calculate rcp(x), but we have rcp(cx), then
- // rcp(x) = c * rcp(cx)
- // The delta in exponent comes from two sources:
- // 1) The renormalization in step 2. The delta is:
- // 0x3ff - $r2
- // 2) (For the denorm input) The 2^54 we multiplied at rcp_denorm, stored
- // in $r3
- // These 2 sources are calculated in the first two lines below, and then
- // added to the exponent extracted from the result above.
- // Note that after processing, the new exponent may >= 0x7ff (inf)
- // or <= 0 (denorm). Those cases will be handled respectively below
- subr b32 $r2 $r2 0x3ff
- long add b32 $r4 $r2 $r3
- ext u32 $r3 $r1 0xb14
- // New exponent in $r3
- long add b32 $r3 $r3 $r4
- add b32 $r2 $r3 0xffffffff
- sched 0x28 0x2b 0x28 0x2b 0x28 0x28 0x2b
- // (exponent-1) < 0x7fe (unsigned) means the result is in norm range
- // (same logic as in step 1)
- set $p0 0x1 lt u32 $r2 0x7fe
- (not $p0) bra #rcp_result_inf_or_denorm
- // Norms: convert exponents back and return
- shl b32 $r4 $r4 clamp 0x14
- long add b32 $r1 $r4 $r1
- bra #rcp_end
-rcp_result_inf_or_denorm:
- // New exponent >= 0x7ff means that result is inf
- set $p0 0x1 ge s32 $r3 0x7ff
- (not $p0) bra #rcp_result_denorm
- sched 0x20 0x25 0x28 0x2b 0x23 0x25 0x2f
- // Infinity
- and b32 $r1 $r1 0x80000000
- long mov b32 $r0 0x0
- add b32 $r1 $r1 0x7ff00000
- bra #rcp_end
-rcp_result_denorm:
- // Denorm result comes from huge input. The greatest possible fp64, i.e.
- // 0x7fefffffffffffff's rcp is 0x0004000000000000, 1/4 of the smallest
- // normal value. Other rcp result should be greater than that. If we
- // set the exponent field to 1, we can recover the result by multiplying
- // it with 1/2 or 1/4. 1/2 is used if the "exponent" $r3 is 0, otherwise
- // 1/4 ($r3 should be -1 then). This is quite tricky but greatly simplifies
- // the logic here.
- set $p0 0x1 ne u32 $r3 0x0
- and b32 $r1 $r1 0x800fffff
- // 0x3e800000: 1/4
- $p0 cvt f64 $r6d f32 0x3e800000
- sched 0x2f 0x28 0x2c 0x2e 0x2a 0x20 0x27
- // 0x3f000000: 1/2
- (not $p0) cvt f64 $r6d f32 0x3f000000
- add b32 $r1 $r1 0x00100000
- mul rn f64 $r0d $r0d $r6d
-rcp_end:
+ long nop
long ret
// RSQ F64: Newton Raphson rsqrt(x): r_{i+1} = r_i * (1.5 - 0.5 * x * r_i * r_i)
@@ -704,73 +565,13 @@ rcp_end:
// SIZE: 14 * 8 bytes
//
gk104_rsq_f64:
- // Before getting initial result rsqrt64h, two special cases should be
- // handled first.
- // 1. NaN: set the highest bit in mantissa so it'll be surely recognized
- // as NaN in rsqrt64h
- set $p0 0x1 gtu f64 abs $r0d 0x7ff0000000000000
- $p0 or b32 $r1 $r1 0x00080000
- and b32 $r2 $r1 0x7fffffff
- sched 0x27 0x20 0x28 0x2c 0x25 0x28 0x28
- // 2. denorms and small normal values: using their original value will
- // lose precision either at rsqrt64h or the first step in newton-raphson
- // steps below. Take 2 as a threshold in exponent field, and multiply
- // with 2^54 if the exponent is smaller or equal. (will multiply 2^27
- // to recover in the end)
- ext u32 $r3 $r1 0xb14
- set $p1 0x1 le u32 $r3 0x2
- long or b32 $r2 $r0 $r2
- $p1 mul rn f64 $r0d $r0d 0x4350000000000000
- rsqrt64h $r5 $r1
- // rsqrt64h will give correct result for 0/inf/nan, the following logic
- // checks whether the input is one of those (exponent is 0x7ff or all 0
- // except for the sign bit)
- set b32 $r6 ne u32 $r3 0x7ff
- long and b32 $r2 $r2 $r6
- sched 0x28 0x2b 0x20 0x27 0x28 0x2e 0x28
- set $p0 0x1 ne u32 $r2 0x0
- $p0 bra #rsq_norm
- // For 0/inf/nan, make sure the sign bit agrees with input and return
- and b32 $r1 $r1 0x80000000
- long mov b32 $r0 0x0
- long or b32 $r1 $r1 $r5
- long ret
-rsq_norm:
- // For others, do 4 Newton-Raphson steps with the formula:
- // RSQ_{n + 1} = RSQ_{n} * (1.5 - 0.5 * x * RSQ_{n} * RSQ_{n})
- // In the code below, each step is written as:
- // tmp1 = 0.5 * x * RSQ_{n}
- // tmp2 = -RSQ_{n} * tmp1 + 0.5
- // RSQ_{n + 1} = RSQ_{n} * tmp2 + RSQ_{n}
- long mov b32 $r4 0x0
- sched 0x2f 0x29 0x29 0x29 0x29 0x29 0x29
- // 0x3f000000: 1/2
- cvt f64 $r8d f32 0x3f000000
- mul rn f64 $r2d $r0d $r8d
- mul rn f64 $r0d $r2d $r4d
- fma rn f64 $r6d neg $r4d $r0d $r8d
- fma rn f64 $r4d $r4d $r6d $r4d
- mul rn f64 $r0d $r2d $r4d
- fma rn f64 $r6d neg $r4d $r0d $r8d
- sched 0x29 0x29 0x29 0x29 0x29 0x29 0x29
- fma rn f64 $r4d $r4d $r6d $r4d
- mul rn f64 $r0d $r2d $r4d
- fma rn f64 $r6d neg $r4d $r0d $r8d
- fma rn f64 $r4d $r4d $r6d $r4d
- mul rn f64 $r0d $r2d $r4d
- fma rn f64 $r6d neg $r4d $r0d $r8d
- fma rn f64 $r4d $r4d $r6d $r4d
- sched 0x29 0x20 0x28 0x2e 0x00 0x00 0x00
- // Multiply 2^27 to result for small inputs to recover
- $p1 mul rn f64 $r4d $r4d 0x41a0000000000000
- long mov b32 $r1 $r5
- long mov b32 $r0 $r4
+ long nop
long ret
//
// Trap handler.
// Requires at least 4 GPRs and 32 bytes of l[] memory to temporarily save GPRs.
-// Low 32 bytes of l[] memory shouldn't be used if resumability is required.
+// Low 32 bytes of l[] memory shouldn't be used if resumeability is required.
//
// Trap info:
// 0x000: mutex
diff --git a/lib/mesa/src/gallium/drivers/nouveau/codegen/lib/gk104.asm.h b/lib/mesa/src/gallium/drivers/nouveau/codegen/lib/gk104.asm.h
index ed948dee4..37998768e 100644
--- a/lib/mesa/src/gallium/drivers/nouveau/codegen/lib/gk104.asm.h
+++ b/lib/mesa/src/gallium/drivers/nouveau/codegen/lib/gk104.asm.h
@@ -481,132 +481,12 @@ uint64_t gk104_builtin_code[] = {
0xd40040000840c785,
0x18fe00000000dde2,
0x4000000000001de4,
- 0x2000000000000007,
+ 0x9000000000001de7,
+/* 0x0f08: gk104_rcp_f64 */
0x4000000000001de4,
0x9000000000001de7,
-/* 0x0f18: gk104_rcp_f64 */
- 0x7000c02c50109c03,
- 0x0bfffffffc20dc02,
- 0x6000000280000007,
- 0x1a0ec01ff431dc03,
- 0x180000000000dde2,
- 0x228282f2b2d042f7,
- 0x40000000000021f4,
- 0x1bfffffffc00dde2,
- 0x1e0edffc0001dc81,
- 0x40000000200021e7,
- 0x3800200000105c52,
-/* 0x0f70: rcp_inf_or_denorm_or_zero */
- 0x39ffc00000111c02,
- 0x190e0000fc41dc23,
- 0x2202f2b2d2f042b7,
- 0x40000000400001e7,
- 0x39ffc00000105c82,
- 0x1800000000001df2,
-/* 0x0fa0: rcp_denorm_or_zero */
- 0x1e0ec0000001dc81,
- 0x40000000200001e7,
- 0x39ffc00000105c52,
-/* 0x0fb8: rcp_denorm */
- 0x5000d0d400001c01,
- 0x2280428282b282f7,
- 0x18000000d800ddf2,
-/* 0x0fd0: rcp_rejoin */
- 0x188e0000fc31dc23,
- 0x40000006000001e7,
- 0x7000c02c50109c03,
- 0x3a003ffffc11dc02,
- 0x08ffc0000071dc02,
- 0x2800000000019de4,
- 0x22e2b2a2828042b7,
- 0x1006000019a15c04,
- 0xc800000010511c00,
- 0x1afe000000001de2,
- 0x3000000014415c00,
- 0x3008000014401e00,
- 0x1000000001301c04,
- 0x1000000019b19d04,
- 0x22929292929292e7,
- 0x1000cfe001321c04,
- 0x2010000000611c01,
- 0x2000000010001c01,
- 0x2010000000611c01,
- 0x2000000010001c01,
- 0x2010000000611c01,
- 0x2000000010001c01,
- 0x2282828282820297,
- 0x2010000000611c01,
- 0x2000000010001c01,
- 0x0800000ffc209e02,
- 0x480000000c211c03,
- 0x7000c02c5010dc03,
- 0x480000001030dc03,
- 0x0bfffffffc309c02,
- 0x22b28282b282b287,
- 0x188ec01ff821dc03,
- 0x40000000600021e7,
- 0x6000c00050411c03,
- 0x4800000004405c03,
- 0x40000001c0001de7,
-/* 0x10f0: rcp_result_inf_or_denorm */
- 0x1b0ec01ffc31dc23,
- 0x40000000a00021e7,
- 0x22f25232b2825207,
- 0x3a00000000105c02,
- 0x1800000000001de2,
- 0x09ffc00000105c02,
- 0x40000000e0001de7,
-/* 0x1128: rcp_result_denorm */
- 0x1a8e0000fc31dc03,
- 0x3a003ffffc105c02,
- 0x1000cfa001318004,
- 0x227202a2e2c282f7,
- 0x1000cfc00131a004,
- 0x0800400000105c02,
- 0x5000000018001c01,
-/* 0x1160: rcp_end */
- 0x9000000000001de7,
-/* 0x1168: gk104_rsq_f64 */
- 0x1e0edffc0001dc81,
- 0x3800200000104042,
- 0x39fffffffc109c02,
- 0x22828252c2820277,
- 0x7000c02c5010dc03,
- 0x198ec0000833dc03,
- 0x6800000008009c43,
- 0x5000d0d400000401,
- 0xc80000001c115c00,
- 0x128ec01ffc319c03,
- 0x6800000018209c03,
- 0x2282e2827202b287,
- 0x1a8e0000fc21dc03,
- 0x40000000800001e7,
- 0x3a00000000105c02,
- 0x1800000000001de2,
- 0x6800000014105c43,
- 0x9000000000001de7,
-/* 0x11f8: rsq_norm */
- 0x1800000000011de2,
- 0x22929292929292f7,
- 0x1000cfc001321c04,
- 0x5000000020009c01,
- 0x5000000010201c01,
- 0x2010000000419e01,
- 0x2008000018411c01,
- 0x5000000010201c01,
- 0x2010000000419e01,
- 0x2292929292929297,
- 0x2008000018411c01,
- 0x5000000010201c01,
- 0x2010000000419e01,
- 0x2008000018411c01,
- 0x5000000010201c01,
- 0x2010000000419e01,
- 0x2008000018411c01,
- 0x20000002e2820297,
- 0x5000d06800410401,
- 0x2800000014005de4,
- 0x2800000010001de4,
+/* 0x0f18: gk104_rsq_f64 */
+ 0x4000000000001de4,
0x9000000000001de7,
0xc800000003f01cc5,
0x2c00000100005c04,
@@ -615,7 +495,7 @@ uint64_t gk104_builtin_code[] = {
0x680100000c1fdc03,
0x4000000a60001c47,
0x180000004000dde2,
-/* 0x12e0: spill_cfstack */
+/* 0x0f60: spill_cfstack */
0x78000009c0000007,
0x0c0000000430dd02,
0x4003ffffa0001ca7,
@@ -663,14 +543,14 @@ uint64_t gk104_builtin_code[] = {
0x4000000100001ea7,
0x480100000c001c03,
0x0800000000105c42,
-/* 0x1458: shared_loop */
+/* 0x10d8: shared_loop */
0xc100000000309c85,
0x9400000500009c85,
0x0c00000010001d02,
0x0800000000105d42,
0x0c0000001030dd02,
0x4003ffff40001ca7,
-/* 0x1488: shared_done */
+/* 0x1108: shared_done */
0x2800406420001de4,
0x2800406430005de4,
0xe000000000001c45,
@@ -684,7 +564,7 @@ uint64_t gk104_builtin_code[] = {
0x480000000c209c03,
0x4801000008001c03,
0x0800000000105c42,
-/* 0x14f0: search_cstack */
+/* 0x1170: search_cstack */
0x280040646000dde4,
0x8400000020009f05,
0x190ec0002821dc03,
@@ -693,17 +573,17 @@ uint64_t gk104_builtin_code[] = {
0x0800000000105c42,
0x0c0000004030dd02,
0x00029dff0ffc5cbf,
-/* 0x1530: entry_found */
+/* 0x11b0: entry_found */
0x8400000000009f85,
0x2800406400001de4,
0x2800406410005de4,
0x9400000010009c85,
0x4000000000001df4,
-/* 0x1558: end_exit */
+/* 0x11d8: end_exit */
0x9800000003ffdcc5,
0xd000000000008007,
0xa000000000004007,
-/* 0x1570: end_cont */
+/* 0x11f0: end_cont */
0xd000000000008007,
0x3400c3fffc201c04,
0xc000000003f01ec5,
@@ -713,6 +593,6 @@ uint64_t gk104_builtin_code[] = {
uint64_t gk104_builtin_offsets[] = {
0x0000000000000000,
0x00000000000000f0,
+ 0x0000000000000f08,
0x0000000000000f18,
- 0x0000000000001168,
};
diff --git a/lib/mesa/src/gallium/drivers/nouveau/codegen/lib/gk110.asm b/lib/mesa/src/gallium/drivers/nouveau/codegen/lib/gk110.asm
index 66626b471..b9c05a04b 100644
--- a/lib/mesa/src/gallium/drivers/nouveau/codegen/lib/gk110.asm
+++ b/lib/mesa/src/gallium/drivers/nouveau/codegen/lib/gk110.asm
@@ -83,229 +83,12 @@ gk110_div_s32:
$p0 sub b32 $r1 $r1 $r2
$p0 add b32 $r0 $r0 0x1
$p3 cvt s32 $r0 neg s32 $r0
- sched 0x04 0x2e 0x28 0x04 0x28 0x28 0x28
+ sched 0x04 0x2e 0x04 0x28 0x04 0x20 0x2c
$p2 cvt s32 $r1 neg s32 $r1
ret
-// RCP F64
-//
-// INPUT: $r0d
-// OUTPUT: $r0d
-// CLOBBER: $r2 - $r9, $p0
-//
-// The core of RCP and RSQ implementation is Newton-Raphson step, which is
-// used to find successively better approximation from an imprecise initial
-// value (single precision rcp in RCP and rsqrt64h in RSQ).
-//
gk110_rcp_f64:
- // Step 1: classify input according to exponent and value, and calculate
- // result for 0/inf/nan. $r2 holds the exponent value, which starts at
- // bit 52 (bit 20 of the upper half) and is 11 bits in length
- ext u32 $r2 $r1 0xb14
- add b32 $r3 $r2 0xffffffff
- joinat #rcp_rejoin
- // We want to check whether the exponent is 0 or 0x7ff (i.e. NaN, inf,
- // denorm, or 0). Do this by subtracting 1 from the exponent, which will
- // mean that it's > 0x7fd in those cases when doing unsigned comparison
- set b32 $p0 0x1 gt u32 $r3 0x7fd
- // $r3: 0 for norms, 0x36 for denorms, -1 for others
- mov b32 $r3 0x0
- sched 0x2f 0x04 0x2d 0x2b 0x2f 0x28 0x28
- join (not $p0) nop
- // Process all special values: NaN, inf, denorm, 0
- mov b32 $r3 0xffffffff
- // A number is NaN if its abs value is greater than or unordered with inf
- set $p0 0x1 gtu f64 abs $r0d 0x7ff0000000000000
- (not $p0) bra #rcp_inf_or_denorm_or_zero
- // NaN -> NaN, the next line sets the "quiet" bit of the result. This
- // behavior is both seen on the CPU and the blob
- join or b32 $r1 $r1 0x80000
-rcp_inf_or_denorm_or_zero:
- and b32 $r4 $r1 0x7ff00000
- // Other values with nonzero in exponent field should be inf
- set b32 $p0 0x1 eq s32 $r4 0x0
- sched 0x2b 0x04 0x2f 0x2d 0x2b 0x2f 0x20
- $p0 bra #rcp_denorm_or_zero
- // +/-Inf -> +/-0
- xor b32 $r1 $r1 0x7ff00000
- join mov b32 $r0 0x0
-rcp_denorm_or_zero:
- set $p0 0x1 gtu f64 abs $r0d 0x0
- $p0 bra #rcp_denorm
- // +/-0 -> +/-Inf
- join or b32 $r1 $r1 0x7ff00000
-rcp_denorm:
- // non-0 denorms: multiply with 2^54 (the 0x36 in $r3), join with norms
- mul rn f64 $r0d $r0d 0x4350000000000000
- sched 0x2f 0x28 0x2b 0x28 0x28 0x04 0x28
- join mov b32 $r3 0x36
-rcp_rejoin:
- // All numbers with -1 in $r3 have their result ready in $r0d, return them
- // others need further calculation
- set b32 $p0 0x1 lt s32 $r3 0x0
- $p0 bra #rcp_end
- // Step 2: Before the real calculation goes on, renormalize the values to
- // range [1, 2) by setting exponent field to 0x3ff (the exponent of 1)
- // result in $r6d. The exponent will be recovered later.
- ext u32 $r2 $r1 0xb14
- and b32 $r7 $r1 0x800fffff
- add b32 $r7 $r7 0x3ff00000
- mov b32 $r6 $r0
- sched 0x2b 0x04 0x28 0x28 0x2a 0x2b 0x2e
- // Step 3: Convert new value to float (no overflow will occur due to step
- // 2), calculate rcp and do newton-raphson step once
- cvt rz f32 $r5 f64 $r6d
- rcp f32 $r4 $r5
- mov b32 $r0 0xbf800000
- fma rn f32 $r5 $r4 $r5 $r0
- fma rn f32 $r0 neg $r4 $r5 $r4
- // Step 4: convert result $r0 back to double, do newton-raphson steps
- cvt f64 $r0d f32 $r0
- cvt f64 $r6d f64 neg $r6d
- sched 0x2e 0x29 0x29 0x29 0x29 0x29 0x29
- cvt f64 $r8d f32 0x3f800000
- // 4 Newton-Raphson Steps, tmp in $r4d, result in $r0d
- // The formula used here (and above) is:
- // RCP_{n + 1} = 2 * RCP_{n} - x * RCP_{n} * RCP_{n}
- // The following code uses 2 FMAs for each step, and it will basically
- // looks like:
- // tmp = -src * RCP_{n} + 1
- // RCP_{n + 1} = RCP_{n} * tmp + RCP_{n}
- fma rn f64 $r4d $r6d $r0d $r8d
- fma rn f64 $r0d $r0d $r4d $r0d
- fma rn f64 $r4d $r6d $r0d $r8d
- fma rn f64 $r0d $r0d $r4d $r0d
- fma rn f64 $r4d $r6d $r0d $r8d
- fma rn f64 $r0d $r0d $r4d $r0d
- sched 0x29 0x20 0x28 0x28 0x28 0x28 0x28
- fma rn f64 $r4d $r6d $r0d $r8d
- fma rn f64 $r0d $r0d $r4d $r0d
- // Step 5: Exponent recovery and final processing
- // The exponent is recovered by adding what we added to the exponent.
- // Suppose we want to calculate rcp(x), but we have rcp(cx), then
- // rcp(x) = c * rcp(cx)
- // The delta in exponent comes from two sources:
- // 1) The renormalization in step 2. The delta is:
- // 0x3ff - $r2
- // 2) (For the denorm input) The 2^54 we multiplied at rcp_denorm, stored
- // in $r3
- // These 2 sources are calculated in the first two lines below, and then
- // added to the exponent extracted from the result above.
- // Note that after processing, the new exponent may >= 0x7ff (inf)
- // or <= 0 (denorm). Those cases will be handled respectively below
- subr b32 $r2 $r2 0x3ff
- add b32 $r4 $r2 $r3
- ext u32 $r3 $r1 0xb14
- // New exponent in $r3
- add b32 $r3 $r3 $r4
- add b32 $r2 $r3 0xffffffff
- sched 0x28 0x2b 0x28 0x2b 0x28 0x28 0x2b
- // (exponent-1) < 0x7fe (unsigned) means the result is in norm range
- // (same logic as in step 1)
- set b32 $p0 0x1 lt u32 $r2 0x7fe
- (not $p0) bra #rcp_result_inf_or_denorm
- // Norms: convert exponents back and return
- shl b32 $r4 $r4 clamp 0x14
- add b32 $r1 $r4 $r1
- bra #rcp_end
-rcp_result_inf_or_denorm:
- // New exponent >= 0x7ff means that result is inf
- set b32 $p0 0x1 ge s32 $r3 0x7ff
- (not $p0) bra #rcp_result_denorm
- sched 0x20 0x25 0x28 0x2b 0x23 0x25 0x2f
- // Infinity
- and b32 $r1 $r1 0x80000000
- mov b32 $r0 0x0
- add b32 $r1 $r1 0x7ff00000
- bra #rcp_end
-rcp_result_denorm:
- // Denorm result comes from huge input. The greatest possible fp64, i.e.
- // 0x7fefffffffffffff's rcp is 0x0004000000000000, 1/4 of the smallest
- // normal value. Other rcp result should be greater than that. If we
- // set the exponent field to 1, we can recover the result by multiplying
- // it with 1/2 or 1/4. 1/2 is used if the "exponent" $r3 is 0, otherwise
- // 1/4 ($r3 should be -1 then). This is quite tricky but greatly simplifies
- // the logic here.
- set b32 $p0 0x1 ne u32 $r3 0x0
- and b32 $r1 $r1 0x800fffff
- // 0x3e800000: 1/4
- $p0 cvt f64 $r6d f32 0x3e800000
- sched 0x2f 0x28 0x2c 0x2e 0x2a 0x20 0x27
- // 0x3f000000: 1/2
- (not $p0) cvt f64 $r6d f32 0x3f000000
- add b32 $r1 $r1 0x00100000
- mul rn f64 $r0d $r0d $r6d
-rcp_end:
- ret
-
-// RSQ F64
-//
-// INPUT: $r0d
-// OUTPUT: $r0d
-// CLOBBER: $r2 - $r9, $p0 - $p1
-//
gk110_rsq_f64:
- // Before getting initial result rsqrt64h, two special cases should be
- // handled first.
- // 1. NaN: set the highest bit in mantissa so it'll be surely recognized
- // as NaN in rsqrt64h
- set $p0 0x1 gtu f64 abs $r0d 0x7ff0000000000000
- $p0 or b32 $r1 $r1 0x00080000
- and b32 $r2 $r1 0x7fffffff
- sched 0x27 0x20 0x28 0x2c 0x25 0x28 0x28
- // 2. denorms and small normal values: using their original value will
- // lose precision either at rsqrt64h or the first step in newton-raphson
- // steps below. Take 2 as a threshold in exponent field, and multiply
- // with 2^54 if the exponent is smaller or equal. (will multiply 2^27
- // to recover in the end)
- ext u32 $r3 $r1 0xb14
- set b32 $p1 0x1 le u32 $r3 0x2
- or b32 $r2 $r0 $r2
- $p1 mul rn f64 $r0d $r0d 0x4350000000000000
- rsqrt64h f32 $r5 $r1
- // rsqrt64h will give correct result for 0/inf/nan, the following logic
- // checks whether the input is one of those (exponent is 0x7ff or all 0
- // except for the sign bit)
- set b32 $r6 ne u32 $r3 0x7ff
- and b32 $r2 $r2 $r6
- sched 0x28 0x2b 0x20 0x27 0x28 0x2e 0x28
- set b32 $p0 0x1 ne u32 $r2 0x0
- $p0 bra #rsq_norm
- // For 0/inf/nan, make sure the sign bit agrees with input and return
- and b32 $r1 $r1 0x80000000
- mov b32 $r0 0x0
- or b32 $r1 $r1 $r5
- ret
-rsq_norm:
- // For others, do 4 Newton-Raphson steps with the formula:
- // RSQ_{n + 1} = RSQ_{n} * (1.5 - 0.5 * x * RSQ_{n} * RSQ_{n})
- // In the code below, each step is written as:
- // tmp1 = 0.5 * x * RSQ_{n}
- // tmp2 = -RSQ_{n} * tmp1 + 0.5
- // RSQ_{n + 1} = RSQ_{n} * tmp2 + RSQ_{n}
- mov b32 $r4 0x0
- sched 0x2f 0x29 0x29 0x29 0x29 0x29 0x29
- // 0x3f000000: 1/2
- cvt f64 $r8d f32 0x3f000000
- mul rn f64 $r2d $r0d $r8d
- mul rn f64 $r0d $r2d $r4d
- fma rn f64 $r6d neg $r4d $r0d $r8d
- fma rn f64 $r4d $r4d $r6d $r4d
- mul rn f64 $r0d $r2d $r4d
- fma rn f64 $r6d neg $r4d $r0d $r8d
- sched 0x29 0x29 0x29 0x29 0x29 0x29 0x29
- fma rn f64 $r4d $r4d $r6d $r4d
- mul rn f64 $r0d $r2d $r4d
- fma rn f64 $r6d neg $r4d $r0d $r8d
- fma rn f64 $r4d $r4d $r6d $r4d
- mul rn f64 $r0d $r2d $r4d
- fma rn f64 $r6d neg $r4d $r0d $r8d
- fma rn f64 $r4d $r4d $r6d $r4d
- sched 0x29 0x20 0x28 0x2e 0x00 0x00 0x00
- // Multiply 2^27 to result for small inputs to recover
- $p1 mul rn f64 $r4d $r4d 0x41a0000000000000
- mov b32 $r1 $r5
- mov b32 $r0 $r4
ret
.section #gk110_builtin_offsets
diff --git a/lib/mesa/src/gallium/drivers/nouveau/codegen/lib/gk110.asm.h b/lib/mesa/src/gallium/drivers/nouveau/codegen/lib/gk110.asm.h
index 3d1523f2f..8d00e2a22 100644
--- a/lib/mesa/src/gallium/drivers/nouveau/codegen/lib/gk110.asm.h
+++ b/lib/mesa/src/gallium/drivers/nouveau/codegen/lib/gk110.asm.h
@@ -65,132 +65,11 @@ uint64_t gk110_builtin_code[] = {
0xe088000001000406,
0x4000000000800001,
0xe6010000000ce802,
- 0x08a0a0a010a0b810,
+ 0x08b08010a010b810,
0xe60100000088e806,
0x19000000001c003c,
/* 0x0218: gk110_rcp_f64 */
- 0xc00000058a1c0409,
- 0x407fffffff9c080d,
- 0x1480000050000000,
- 0xb3401c03fe9c0c1d,
- 0xe4c03c007f9c000e,
- 0x08a0a0bcacb410bc,
- 0x8580000000603c02,
- 0x747fffffff9fc00e,
- 0xb4601fff801c021d,
- 0x120000000420003c,
- 0x21000400005c0404,
-/* 0x0270: rcp_inf_or_denorm_or_zero */
- 0x203ff800001c0410,
- 0xb3281c00001c101d,
- 0x0880bcacb4bc10ac,
- 0x120000000800003c,
- 0x223ff800001c0404,
- 0xe4c03c007fdc0002,
-/* 0x02a0: rcp_denorm_or_zero */
- 0xb4601c00001c021d,
- 0x120000000400003c,
- 0x213ff800005c0404,
-/* 0x02b8: rcp_denorm */
- 0xc400021a801c0001,
- 0x08a010a0a0aca0bc,
- 0x740000001b5fc00e,
-/* 0x02d0: rcp_rejoin */
- 0xb3181c00001c0c1d,
- 0x12000000c000003c,
- 0xc00000058a1c0409,
- 0x204007ffff9c041c,
- 0x401ff800001c1c1d,
- 0xe4c03c00001c001a,
- 0x08b8aca8a0a010ac,
- 0xe5400c00031c3816,
- 0x84000000021c1412,
- 0x745fc000001fc002,
- 0xcc000000029c1016,
- 0xcc081000029c1002,
- 0xe5400000001c2c02,
- 0xe5410000031c3c1a,
- 0x08a4a4a4a4a4a4b8,
- 0xc54001fc001c2c21,
- 0xdb802000001c1812,
- 0xdb800000021c0002,
- 0xdb802000001c1812,
- 0xdb800000021c0002,
- 0xdb802000001c1812,
- 0xdb800000021c0002,
- 0x08a0a0a0a0a080a4,
- 0xdb802000001c1812,
- 0xdb800000021c0002,
- 0x48000001ff9c0809,
- 0xe0800000019c0812,
- 0xc00000058a1c040d,
- 0xe0800000021c0c0e,
- 0x407fffffff9c0c09,
- 0x08aca0a0aca0aca0,
- 0xb3101c03ff1c081d,
- 0x120000000c20003c,
- 0xc24000000a1c1011,
- 0xe0800000009c1006,
- 0x12000000381c003c,
-/* 0x03f0: rcp_result_inf_or_denorm */
- 0xb3681c03ff9c0c1d,
- 0x120000001420003c,
- 0x08bc948caca09480,
- 0x20400000001c0404,
- 0xe4c03c007f9c0002,
- 0x403ff800001c0405,
- 0x120000001c1c003c,
-/* 0x0428: rcp_result_denorm */
- 0xb3501c00001c0c1d,
- 0x204007ffff9c0404,
- 0xc54001f400002c19,
- 0x089c80a8b8b0a0bc,
- 0xc54001f800202c19,
- 0x40000800001c0405,
- 0xe4000000031c0002,
-/* 0x0460: rcp_end */
- 0x19000000001c003c,
-/* 0x0468: gk110_rsq_f64 */
- 0xb4601fff801c021d,
- 0x2100040000000404,
- 0x203fffffff9c0408,
- 0x08a0a094b0a0809c,
- 0xc00000058a1c040d,
- 0xb3301c00011c0c3d,
- 0xe2001000011c000a,
- 0xc400021a80040001,
- 0x84000000039c0416,
- 0xb2d01c03ff9c0c19,
- 0xe2000000031c080a,
- 0x08a0b8a09c80aca0,
- 0xb3501c00001c081d,
- 0x120000001000003c,
- 0x20400000001c0404,
- 0xe4c03c007f9c0002,
- 0xe2001000029c0406,
- 0x19000000001c003c,
-/* 0x04f8: rsq_norm */
- 0xe4c03c007f9c0012,
- 0x08a4a4a4a4a4a4bc,
- 0xc54001f8001c2c21,
- 0xe4000000041c000a,
- 0xe4000000021c0802,
- 0xdb882000001c101a,
- 0xdb801000031c1012,
- 0xe4000000021c0802,
- 0xdb882000001c101a,
- 0x08a4a4a4a4a4a4a4,
- 0xdb801000031c1012,
- 0xe4000000021c0802,
- 0xdb882000001c101a,
- 0xdb801000031c1012,
- 0xe4000000021c0802,
- 0xdb882000001c101a,
- 0xdb801000031c1012,
- 0x08000000b8a080a4,
- 0xc400020d00041011,
- 0xe4c03c00029c0006,
- 0xe4c03c00021c0002,
+/* 0x0218: gk110_rsq_f64 */
0x19000000001c003c,
};
@@ -198,5 +77,5 @@ uint64_t gk110_builtin_offsets[] = {
0x0000000000000000,
0x00000000000000f0,
0x0000000000000218,
- 0x0000000000000468,
+ 0x0000000000000218,
};
diff --git a/lib/mesa/src/gallium/drivers/nouveau/codegen/nv50_ir_bb.cpp b/lib/mesa/src/gallium/drivers/nouveau/codegen/nv50_ir_bb.cpp
index 76fee8c79..fa8ee072a 100644
--- a/lib/mesa/src/gallium/drivers/nouveau/codegen/nv50_ir_bb.cpp
+++ b/lib/mesa/src/gallium/drivers/nouveau/codegen/nv50_ir_bb.cpp
@@ -291,7 +291,7 @@ void BasicBlock::permuteAdjacent(Instruction *a, Instruction *b)
if (b->prev)
b->prev->next = b;
- if (a->next)
+ if (a->prev)
a->next->prev = a;
}
@@ -536,6 +536,9 @@ Function::printCFGraph(const char *filePath)
case Graph::Edge::BACK:
fprintf(out, "\t%i -> %i;\n", idA, idB);
break;
+ case Graph::Edge::DUMMY:
+ fprintf(out, "\t%i -> %i [style=dotted];\n", idA, idB);
+ break;
default:
assert(0);
break;
diff --git a/lib/mesa/src/gallium/drivers/nouveau/codegen/nv50_ir_build_util.cpp b/lib/mesa/src/gallium/drivers/nouveau/codegen/nv50_ir_build_util.cpp
index 20ed5cd52..19418c0e0 100644
--- a/lib/mesa/src/gallium/drivers/nouveau/codegen/nv50_ir_build_util.cpp
+++ b/lib/mesa/src/gallium/drivers/nouveau/codegen/nv50_ir_build_util.cpp
@@ -44,8 +44,6 @@ BuildUtil::init(Program *prog)
bb = NULL;
pos = NULL;
- tail = false;
-
memset(imms, 0, sizeof(imms));
immCount = 0;
}
@@ -340,7 +338,7 @@ BuildUtil::mkClobber(DataFile f, uint32_t rMask, int unit)
int base2 = (baseSize2[mask] >> 8) & 0xf;
int size2 = (baseSize2[mask] >> 12) & 0xf;
Instruction *insn = mkOp(OP_NOP, TYPE_NONE, NULL);
- if (true) { // size1 can't be 0
+ if (1) { // size1 can't be 0
LValue *reg = new_LValue(func, f);
reg->reg.size = size1 << unit;
reg->reg.data.id = base + base1;
@@ -356,18 +354,6 @@ BuildUtil::mkClobber(DataFile f, uint32_t rMask, int unit)
}
ImmediateValue *
-BuildUtil::mkImm(uint16_t u)
-{
- ImmediateValue *imm = new_ImmediateValue(prog, (uint32_t)0);
-
- imm->reg.size = 2;
- imm->reg.type = TYPE_U16;
- imm->reg.data.u32 = u;
-
- return imm;
-}
-
-ImmediateValue *
BuildUtil::mkImm(uint32_t u)
{
unsigned int pos = u32Hash(u);
@@ -406,12 +392,6 @@ BuildUtil::mkImm(float f)
return mkImm(u.u32);
}
-ImmediateValue *
-BuildUtil::mkImm(double d)
-{
- return new_ImmediateValue(prog, d);
-}
-
Value *
BuildUtil::loadImm(Value *dst, float f)
{
@@ -419,18 +399,6 @@ BuildUtil::loadImm(Value *dst, float f)
}
Value *
-BuildUtil::loadImm(Value *dst, double d)
-{
- return mkOp1v(OP_MOV, TYPE_F64, dst ? dst : getScratch(8), mkImm(d));
-}
-
-Value *
-BuildUtil::loadImm(Value *dst, uint16_t u)
-{
- return mkOp1v(OP_MOV, TYPE_U16, dst ? dst : getScratch(2), mkImm(u));
-}
-
-Value *
BuildUtil::loadImm(Value *dst, uint32_t u)
{
return mkOp1v(OP_MOV, TYPE_U32, dst ? dst : getScratch(), mkImm(u));
@@ -486,16 +454,6 @@ BuildUtil::mkSysVal(SVSemantic svName, uint32_t svIndex)
return sym;
}
-Symbol *
-BuildUtil::mkTSVal(TSSemantic tsName)
-{
- Symbol *sym = new_Symbol(prog, FILE_THREAD_STATE, 0);
- sym->reg.type = TYPE_U32;
- sym->reg.size = typeSizeof(sym->reg.type);
- sym->reg.data.ts = tsName;
- return sym;
-}
-
void
BuildUtil::DataArray::setup(unsigned array, unsigned arrayIdx,
uint32_t base, int len, int vecDim, int eltSize,
@@ -529,7 +487,7 @@ BuildUtil::DataArray::acquire(ValueMap &m, int i, int c)
return v;
} else {
- return up->getScratch(eltSize);
+ return up->getScratch();
}
}
@@ -597,12 +555,6 @@ BuildUtil::split64BitOpPostRA(Function *fn, Instruction *i,
switch (i->dType) {
case TYPE_U64: hTy = TYPE_U32; break;
case TYPE_S64: hTy = TYPE_S32; break;
- case TYPE_F64:
- if (i->op == OP_MOV) {
- hTy = TYPE_U32;
- break;
- }
- FALLTHROUGH;
default:
return NULL;
}
@@ -615,7 +567,6 @@ BuildUtil::split64BitOpPostRA(Function *fn, Instruction *i,
return NULL;
srcNr = 2;
break;
- case OP_SELP: srcNr = 3; break;
default:
// TODO when needed
return NULL;
@@ -632,10 +583,7 @@ BuildUtil::split64BitOpPostRA(Function *fn, Instruction *i,
for (int s = 0; s < srcNr; ++s) {
if (lo->getSrc(s)->reg.size < 8) {
- if (s == 2)
- hi->setSrc(s, lo->getSrc(s));
- else
- hi->setSrc(s, zero);
+ hi->setSrc(s, zero);
} else {
if (lo->getSrc(s)->refCount() > 1)
lo->setSrc(s, cloneShallow(fn, lo->getSrc(s)));
@@ -649,7 +597,6 @@ BuildUtil::split64BitOpPostRA(Function *fn, Instruction *i,
case FILE_MEMORY_CONST:
case FILE_MEMORY_SHARED:
case FILE_SHADER_INPUT:
- case FILE_SHADER_OUTPUT:
hi->getSrc(s)->reg.data.offset += 4;
break;
default:
@@ -660,7 +607,7 @@ BuildUtil::split64BitOpPostRA(Function *fn, Instruction *i,
}
}
if (srcNr == 2) {
- lo->setFlagsDef(1, carry);
+ lo->setDef(1, carry);
hi->setFlagsSrc(hi->srcCount(), carry);
}
return hi;
diff --git a/lib/mesa/src/gallium/drivers/nouveau/codegen/nv50_ir_build_util.h b/lib/mesa/src/gallium/drivers/nouveau/codegen/nv50_ir_build_util.h
index 5c3a01df9..a610c773f 100644
--- a/lib/mesa/src/gallium/drivers/nouveau/codegen/nv50_ir_build_util.h
+++ b/lib/mesa/src/gallium/drivers/nouveau/codegen/nv50_ir_build_util.h
@@ -66,7 +66,6 @@ public:
Instruction *mkMov(Value *, Value *, DataType = TYPE_U32);
Instruction *mkMovToReg(int id, Value *);
Instruction *mkMovFromReg(Value *, int id);
- inline Instruction *mkBMov(Value *, Value *);
Instruction *mkInterp(unsigned mode, Value *, int32_t offset, Value *rel);
Instruction *mkFetch(Value *, DataType, DataFile, int32_t offset,
@@ -74,8 +73,8 @@ public:
Instruction *mkCvt(operation, DataType, Value *, DataType, Value *);
CmpInstruction *mkCmp(operation, CondCode, DataType,
- Value *,
- DataType, Value *, Value *, Value * = NULL);
+ Value *,
+ DataType, Value *, Value *, Value * = NULL);
TexInstruction *mkTex(operation, TexTarget,
uint16_t tic, uint16_t tsc,
const std::vector<Value *> &def,
@@ -91,16 +90,12 @@ public:
void mkClobber(DataFile file, uint32_t regMask, int regUnitLog2);
ImmediateValue *mkImm(float);
- ImmediateValue *mkImm(double);
- ImmediateValue *mkImm(uint16_t);
ImmediateValue *mkImm(uint32_t);
ImmediateValue *mkImm(uint64_t);
ImmediateValue *mkImm(int i) { return mkImm((uint32_t)i); }
Value *loadImm(Value *dst, float);
- Value *loadImm(Value *dst, double);
- Value *loadImm(Value *dst, uint16_t);
Value *loadImm(Value *dst, uint32_t);
Value *loadImm(Value *dst, uint64_t);
@@ -140,9 +135,7 @@ public:
class DataArray
{
public:
- DataArray(BuildUtil *bld) : up(bld), array(0), arrayIdx(0), baseAddr(0),
- arrayLen(0), baseSym(NULL), vecDim(0), eltSize(0), file(FILE_NULL),
- regOnly(false) { }
+ DataArray(BuildUtil *bld) : up(bld) { }
void setup(unsigned array, unsigned arrayIdx,
uint32_t base, int len, int vecDim, int eltSize,
@@ -179,7 +172,6 @@ public:
DataType ty, uint32_t baseAddress);
Symbol *mkSysVal(SVSemantic svName, uint32_t svIndex);
- Symbol *mkTSVal(TSSemantic tsName);
private:
void init(Program *);
@@ -301,17 +293,11 @@ BuildUtil::mkOp3v(operation op, DataType ty, Value *dst,
inline LValue *
BuildUtil::mkLoadv(DataType ty, Symbol *mem, Value *ptr)
{
- LValue *dst = getScratch(typeSizeof(ty));
+ LValue *dst = getScratch();
mkLoad(ty, dst, mem, ptr);
return dst;
}
-inline Instruction *
-BuildUtil::mkBMov(Value *dst, Value *src)
-{
- return mkCvt(OP_CVT, TYPE_U32, dst, TYPE_U32, src);
-}
-
bool
BuildUtil::DataArray::exists(ValueMap &m, unsigned int i, unsigned int c)
{
diff --git a/lib/mesa/src/gallium/drivers/nouveau/codegen/nv50_ir_emit_nv50.cpp b/lib/mesa/src/gallium/drivers/nouveau/codegen/nv50_ir_emit_nv50.cpp
index 1a0c63b70..90147668c 100644
--- a/lib/mesa/src/gallium/drivers/nouveau/codegen/nv50_ir_emit_nv50.cpp
+++ b/lib/mesa/src/gallium/drivers/nouveau/codegen/nv50_ir_emit_nv50.cpp
@@ -33,12 +33,14 @@ namespace nv50_ir {
class CodeEmitterNV50 : public CodeEmitter
{
public:
- CodeEmitterNV50(Program::Type, const TargetNV50 *);
+ CodeEmitterNV50(const TargetNV50 *);
virtual bool emitInstruction(Instruction *);
virtual uint32_t getMinEncodingSize(const Instruction *) const;
+ inline void setProgramType(Program::Type pType) { progType = pType; }
+
virtual void prepareEmission(Function *);
private:
@@ -94,12 +96,9 @@ private:
void emitUADD(const Instruction *);
void emitAADD(const Instruction *);
void emitFADD(const Instruction *);
- void emitDADD(const Instruction *);
void emitIMUL(const Instruction *);
void emitFMUL(const Instruction *);
- void emitDMUL(const Instruction *);
void emitFMAD(const Instruction *);
- void emitDMAD(const Instruction *);
void emitIMAD(const Instruction *);
void emitISAD(const Instruction *);
@@ -270,7 +269,7 @@ CodeEmitterNV50::emitFlagsWr(const Instruction *i)
for (int d = 0; i->defExists(d); ++d)
if (i->def(d).getFile() == FILE_FLAGS)
flagsDef = d;
- if (flagsDef >= 0 && false) // TODO: enforce use of flagsDef at some point
+ if (flagsDef >= 0 && 0) // TODO: enforce use of flagsDef at some point
WARN("Instruction::flagsDef was not set properly\n");
}
if (flagsDef == 0 && i->defExists(1))
@@ -373,7 +372,7 @@ CodeEmitterNV50::setSrcFileBits(const Instruction *i, int enc)
mode |= 3 << (s * 2);
break;
default:
- ERROR("invalid file on source %i: %u\n", s, i->src(s).getFile());
+ ERROR("invalid file on source %i: %u\n", s, i->src(s).getFile());
assert(0);
break;
}
@@ -439,9 +438,9 @@ CodeEmitterNV50::setSrcFileBits(const Instruction *i, int enc)
return;
if ((mode & 3) == 1) {
- const int pos = ((mode >> 2) & 3) == 3 ? 13 : 14;
+ const int pos = i->src(1).getFile() == FILE_IMMEDIATE ? 13 : 14;
- switch (i->sType) {
+ switch (i->getSrc(0)->reg.type) {
case TYPE_U8:
break;
case TYPE_U16:
@@ -525,8 +524,7 @@ CodeEmitterNV50::emitForm_ADD(const Instruction *i)
setSrcFileBits(i, NV50_OP_ENC_LONG_ALT);
setSrc(i, 0, 0);
- if (i->predSrc != 1)
- setSrc(i, 1, 2);
+ setSrc(i, 1, 2);
if (i->getIndirect(0, 0)) {
assert(!i->getIndirect(1, 0));
@@ -619,7 +617,7 @@ void
CodeEmitterNV50::emitLOAD(const Instruction *i)
{
DataFile sf = i->src(0).getFile();
- ASSERTED int32_t offset = i->getSrc(0)->reg.data.offset;
+ int32_t offset = i->getSrc(0)->reg.data.offset;
switch (sf) {
case FILE_SHADER_INPUT:
@@ -642,9 +640,6 @@ CodeEmitterNV50::emitLOAD(const Instruction *i)
code[1] |= 0x04000000;
emitLoadStoreSizeCS(i->sType);
-
- if (i->subOp == NV50_IR_SUBOP_LOAD_LOCKED)
- code[1] |= 0x00800000;
} else {
assert(offset <= (int32_t)(0x1f * typeSizeof(i->sType)));
code[0] = 0x10000001;
@@ -715,8 +710,6 @@ CodeEmitterNV50::emitSTORE(const Instruction *i)
case FILE_MEMORY_SHARED:
code[0] = 0x00000001;
code[1] = 0xe0000000;
- if (i->subOp == NV50_IR_SUBOP_STORE_UNLOCKED)
- code[1] |= 0x00800000;
switch (typeSizeof(i->dType)) {
case 1:
code[0] |= offset << 9;
@@ -760,10 +753,10 @@ CodeEmitterNV50::emitMOV(const Instruction *i)
assert(sf == FILE_GPR || df == FILE_GPR);
if (sf == FILE_FLAGS) {
- assert(i->flagsSrc >= 0);
code[0] = 0x00000001;
code[1] = 0x20000000;
defId(i->def(0), 2);
+ srcId(i->src(0), 12);
emitFlagsRd(i);
} else
if (sf == FILE_ADDRESS) {
@@ -774,31 +767,26 @@ CodeEmitterNV50::emitMOV(const Instruction *i)
emitFlagsRd(i);
} else
if (df == FILE_FLAGS) {
- assert(i->flagsDef >= 0);
code[0] = 0x00000001;
code[1] = 0xa0000000;
+ defId(i->def(0), 4);
srcId(i->src(0), 9);
emitFlagsRd(i);
- emitFlagsWr(i);
} else
if (sf == FILE_IMMEDIATE) {
- code[0] = 0x10000001;
+ code[0] = 0x10008001;
code[1] = 0x00000003;
emitForm_IMM(i);
-
- code[0] |= (typeSizeof(i->dType) == 2) ? 0 : 0x00008000;
} else {
if (i->encSize == 4) {
- code[0] = 0x10000000;
- code[0] |= (typeSizeof(i->dType) == 2) ? 0 : 0x00008000;
- defId(i->def(0), 2);
+ code[0] = 0x10008000;
} else {
code[0] = 0x10000001;
code[1] = (typeSizeof(i->dType) == 2) ? 0 : 0x04000000;
code[1] |= (i->lanes << 14);
- setDst(i, 0);
emitFlagsRd(i);
}
+ defId(i->def(0), 2);
srcId(i->src(0), 9);
}
if (df == FILE_SHADER_OUTPUT) {
@@ -848,7 +836,7 @@ CodeEmitterNV50::emitQUADOP(const Instruction *i, uint8_t lane, uint8_t quOp)
emitForm_ADD(i);
- if (!i->srcExists(1) || i->predSrc == 1)
+ if (!i->srcExists(1))
srcId(i->src(0), 32 + 14);
}
@@ -889,36 +877,12 @@ CodeEmitterNV50::emitPFETCH(const Instruction *i)
}
void
-nv50_interpApply(const FixupEntry *entry, uint32_t *code, const FixupData& data)
-{
- int ipa = entry->ipa;
- int encSize = entry->reg;
- int loc = entry->loc;
-
- if ((ipa & NV50_IR_INTERP_SAMPLE_MASK) == NV50_IR_INTERP_DEFAULT &&
- (ipa & NV50_IR_INTERP_MODE_MASK) != NV50_IR_INTERP_FLAT) {
- if (data.force_persample_interp) {
- if (encSize == 8)
- code[loc + 1] |= 1 << 16;
- else
- code[loc + 0] |= 1 << 24;
- } else {
- if (encSize == 8)
- code[loc + 1] &= ~(1 << 16);
- else
- code[loc + 0] &= ~(1 << 24);
- }
- }
-}
-
-void
CodeEmitterNV50::emitINTERP(const Instruction *i)
{
code[0] = 0x80000000;
defId(i->def(0), 2);
srcAddr8(i->src(0), 16);
- setAReg16(i, 0);
if (i->encSize != 8 && i->getInterpMode() == NV50_IR_INTERP_FLAT) {
code[0] |= 1 << 8;
@@ -940,8 +904,6 @@ CodeEmitterNV50::emitINTERP(const Instruction *i)
code[0] |= 1;
emitFlagsRd(i);
}
-
- addInterp(i->ipa, i->encSize, nv50_interpApply);
}
void
@@ -966,13 +928,11 @@ CodeEmitterNV50::emitMINMAX(const Instruction *i)
assert(0);
break;
}
+ code[1] |= i->src(0).mod.abs() << 20;
+ code[1] |= i->src(0).mod.neg() << 26;
+ code[1] |= i->src(1).mod.abs() << 19;
+ code[1] |= i->src(1).mod.neg() << 27;
}
-
- code[1] |= i->src(0).mod.abs() << 20;
- code[1] |= i->src(0).mod.neg() << 26;
- code[1] |= i->src(1).mod.abs() << 19;
- code[1] |= i->src(1).mod.neg() << 27;
-
emitForm_MAD(i);
}
@@ -1008,26 +968,6 @@ CodeEmitterNV50::emitFMAD(const Instruction *i)
}
void
-CodeEmitterNV50::emitDMAD(const Instruction *i)
-{
- const int neg_mul = i->src(0).mod.neg() ^ i->src(1).mod.neg();
- const int neg_add = i->src(2).mod.neg();
-
- assert(i->encSize == 8);
- assert(!i->saturate);
-
- code[1] = 0x40000000;
- code[0] = 0xe0000000;
-
- code[1] |= neg_mul << 26;
- code[1] |= neg_add << 27;
-
- roundMode_MAD(i);
-
- emitForm_MAD(i);
-}
-
-void
CodeEmitterNV50::emitFADD(const Instruction *i)
{
const int neg0 = i->src(0).mod.neg();
@@ -1062,42 +1002,22 @@ CodeEmitterNV50::emitFADD(const Instruction *i)
}
void
-CodeEmitterNV50::emitDADD(const Instruction *i)
-{
- const int neg0 = i->src(0).mod.neg();
- const int neg1 = i->src(1).mod.neg() ^ ((i->op == OP_SUB) ? 1 : 0);
-
- assert(!(i->src(0).mod | i->src(1).mod).abs());
- assert(!i->saturate);
- assert(i->encSize == 8);
-
- code[1] = 0x60000000;
- code[0] = 0xe0000000;
-
- emitForm_ADD(i);
-
- code[1] |= neg0 << 26;
- code[1] |= neg1 << 27;
-}
-
-void
CodeEmitterNV50::emitUADD(const Instruction *i)
{
const int neg0 = i->src(0).mod.neg();
const int neg1 = i->src(1).mod.neg() ^ ((i->op == OP_SUB) ? 1 : 0);
- code[0] = 0x20000000;
+ code[0] = 0x20008000;
if (i->src(1).getFile() == FILE_IMMEDIATE) {
- code[0] |= (typeSizeof(i->dType) == 2) ? 0 : 0x00008000;
code[1] = 0;
emitForm_IMM(i);
} else
if (i->encSize == 8) {
+ code[0] = 0x20000000;
code[1] = (typeSizeof(i->dType) == 2) ? 0 : 0x04000000;
emitForm_ADD(i);
} else {
- code[0] |= (typeSizeof(i->dType) == 2) ? 0 : 0x00008000;
emitForm_MUL(i);
}
assert(!(neg0 && neg1));
@@ -1133,12 +1053,6 @@ CodeEmitterNV50::emitIMUL(const Instruction *i)
{
code[0] = 0x40000000;
- if (i->src(1).getFile() == FILE_IMMEDIATE) {
- if (i->sType == TYPE_S16)
- code[0] |= 0x8100;
- code[1] = 0;
- emitForm_IMM(i);
- } else
if (i->encSize == 8) {
code[1] = (i->sType == TYPE_S16) ? (0x8000 | 0x4000) : 0x0000;
emitForm_MAD(i);
@@ -1181,66 +1095,28 @@ CodeEmitterNV50::emitFMUL(const Instruction *i)
}
void
-CodeEmitterNV50::emitDMUL(const Instruction *i)
-{
- const int neg = (i->src(0).mod ^ i->src(1).mod).neg();
-
- assert(!i->saturate);
- assert(i->encSize == 8);
-
- code[1] = 0x80000000;
- code[0] = 0xe0000000;
-
- if (neg)
- code[1] |= 0x08000000;
-
- roundMode_CVT(i->rnd);
-
- emitForm_MAD(i);
-}
-
-void
CodeEmitterNV50::emitIMAD(const Instruction *i)
{
- int mode;
code[0] = 0x60000000;
-
- assert(!i->src(0).mod && !i->src(1).mod && !i->src(2).mod);
- if (!isSignedType(i->sType))
- mode = 0;
- else if (i->saturate)
- mode = 2;
+ if (isSignedType(i->sType))
+ code[1] = i->saturate ? 0x40000000 : 0x20000000;
else
- mode = 1;
+ code[1] = 0x00000000;
- if (i->src(1).getFile() == FILE_IMMEDIATE) {
- code[1] = 0;
- emitForm_IMM(i);
- code[0] |= (mode & 1) << 8 | (mode & 2) << 14;
- if (i->flagsSrc >= 0) {
- assert(!(code[0] & 0x10400000));
- assert(SDATA(i->src(i->flagsSrc)).id == 0);
- code[0] |= 0x10400000;
- }
- } else
- if (i->encSize == 4) {
- emitForm_MUL(i);
- code[0] |= (mode & 1) << 8 | (mode & 2) << 14;
- if (i->flagsSrc >= 0) {
- assert(!(code[0] & 0x10400000));
- assert(SDATA(i->src(i->flagsSrc)).id == 0);
- code[0] |= 0x10400000;
- }
- } else {
- code[1] = mode << 29;
- emitForm_MAD(i);
+ int neg1 = i->src(0).mod.neg() ^ i->src(1).mod.neg();
+ int neg2 = i->src(2).mod.neg();
- if (i->flagsSrc >= 0) {
- // add with carry from $cX
- assert(!(code[1] & 0x0c000000) && !i->getPredicate());
- code[1] |= 0xc << 24;
- srcId(i->src(i->flagsSrc), 32 + 12);
- }
+ assert(!(neg1 & neg2));
+ code[1] |= neg1 << 27;
+ code[1] |= neg2 << 26;
+
+ emitForm_MAD(i);
+
+ if (i->flagsSrc >= 0) {
+ // add with carry from $cX
+ assert(!(code[1] & 0x0c000000) && !i->getPredicate());
+ code[1] |= 0xc << 24;
+ srcId(i->src(i->flagsSrc), 32 + 12);
}
}
@@ -1273,39 +1149,15 @@ CodeEmitterNV50::emitISAD(const Instruction *i)
}
}
-static void
-alphatestSet(const FixupEntry *entry, uint32_t *code, const FixupData& data)
-{
- int loc = entry->loc;
- int enc;
-
- switch (data.alphatest) {
- case PIPE_FUNC_NEVER: enc = 0x0; break;
- case PIPE_FUNC_LESS: enc = 0x1; break;
- case PIPE_FUNC_EQUAL: enc = 0x2; break;
- case PIPE_FUNC_LEQUAL: enc = 0x3; break;
- case PIPE_FUNC_GREATER: enc = 0x4; break;
- case PIPE_FUNC_NOTEQUAL: enc = 0x5; break;
- case PIPE_FUNC_GEQUAL: enc = 0x6; break;
- default:
- case PIPE_FUNC_ALWAYS: enc = 0xf; break;
- }
-
- code[loc + 1] &= ~(0x1f << 14);
- code[loc + 1] |= enc << 14;
-}
-
void
CodeEmitterNV50::emitSET(const Instruction *i)
{
code[0] = 0x30000000;
code[1] = 0x60000000;
+ emitCondCode(i->asCmp()->setCond, i->sType, 32 + 14);
+
switch (i->sType) {
- case TYPE_F64:
- code[0] = 0xe0000000;
- code[1] = 0xe0000000;
- break;
case TYPE_F32: code[0] |= 0x80000000; break;
case TYPE_S32: code[1] |= 0x0c000000; break;
case TYPE_U32: code[1] |= 0x04000000; break;
@@ -1315,19 +1167,12 @@ CodeEmitterNV50::emitSET(const Instruction *i)
assert(0);
break;
}
-
- emitCondCode(i->asCmp()->setCond, i->sType, 32 + 14);
-
if (i->src(0).mod.neg()) code[1] |= 0x04000000;
if (i->src(1).mod.neg()) code[1] |= 0x08000000;
if (i->src(0).mod.abs()) code[1] |= 0x00100000;
if (i->src(1).mod.abs()) code[1] |= 0x00080000;
emitForm_MAD(i);
-
- if (i->subOp == 1) {
- addInterp(0, 0, alphatestSet);
- }
}
void
@@ -1412,9 +1257,6 @@ CodeEmitterNV50::emitCVT(const Instruction *i)
case TYPE_U32: code[1] = 0x44004000; break;
case TYPE_F16: code[1] = 0xc4000000; break;
case TYPE_U16: code[1] = 0x44000000; break;
- case TYPE_S16: code[1] = 0x44010000; break;
- case TYPE_S8: code[1] = 0x44018000; break;
- case TYPE_U8: code[1] = 0x44008000; break;
default:
assert(0);
break;
@@ -1452,73 +1294,10 @@ CodeEmitterNV50::emitCVT(const Instruction *i)
break;
}
break;
- case TYPE_F16:
- switch (i->sType) {
- case TYPE_F16: code[1] = 0xc0000000; break;
- case TYPE_F32: code[1] = 0xc0004000; break;
- default:
- assert(0);
- break;
- }
- break;
case TYPE_S16:
- switch (i->sType) {
- case TYPE_F32: code[1] = 0x88004000; break;
- case TYPE_S32: code[1] = 0x08014000; break;
- case TYPE_U32: code[1] = 0x08004000; break;
- case TYPE_F16: code[1] = 0x88000000; break;
- case TYPE_S16: code[1] = 0x08010000; break;
- case TYPE_U16: code[1] = 0x08000000; break;
- case TYPE_S8: code[1] = 0x08018000; break;
- case TYPE_U8: code[1] = 0x08008000; break;
- default:
- assert(0);
- break;
- }
- break;
case TYPE_U16:
- switch (i->sType) {
- case TYPE_F32: code[1] = 0x80004000; break;
- case TYPE_S32: code[1] = 0x00014000; break;
- case TYPE_U32: code[1] = 0x00004000; break;
- case TYPE_F16: code[1] = 0x80000000; break;
- case TYPE_S16: code[1] = 0x00010000; break;
- case TYPE_U16: code[1] = 0x00000000; break;
- case TYPE_S8: code[1] = 0x00018000; break;
- case TYPE_U8: code[1] = 0x00008000; break;
- default:
- assert(0);
- break;
- }
- break;
case TYPE_S8:
- switch (i->sType) {
- case TYPE_S32: code[1] = 0x08094000; break;
- case TYPE_U32: code[1] = 0x08084000; break;
- case TYPE_F16: code[1] = 0x88080000; break;
- case TYPE_S16: code[1] = 0x08090000; break;
- case TYPE_U16: code[1] = 0x08080000; break;
- case TYPE_S8: code[1] = 0x08098000; break;
- case TYPE_U8: code[1] = 0x08088000; break;
- default:
- assert(0);
- break;
- }
- break;
case TYPE_U8:
- switch (i->sType) {
- case TYPE_S32: code[1] = 0x00094000; break;
- case TYPE_U32: code[1] = 0x00084000; break;
- case TYPE_F16: code[1] = 0x80080000; break;
- case TYPE_S16: code[1] = 0x00090000; break;
- case TYPE_U16: code[1] = 0x00080000; break;
- case TYPE_S8: code[1] = 0x00098000; break;
- case TYPE_U8: code[1] = 0x00088000; break;
- default:
- assert(0);
- break;
- }
- break;
default:
assert(0);
break;
@@ -1564,7 +1343,6 @@ CodeEmitterNV50::emitSFnOp(const Instruction *i, uint8_t subOp)
if (i->encSize == 4) {
assert(i->op == OP_RCP);
- assert(!i->saturate);
code[0] |= i->src(0).mod.abs() << 15;
code[0] |= i->src(0).mod.neg() << 22;
emitForm_MUL(i);
@@ -1572,10 +1350,6 @@ CodeEmitterNV50::emitSFnOp(const Instruction *i, uint8_t subOp)
code[1] = subOp << 29;
code[1] |= i->src(0).mod.abs() << 20;
code[1] |= i->src(0).mod.neg() << 26;
- if (i->saturate) {
- assert(subOp == 6 && i->op == OP_EX2);
- code[1] |= 1 << 27;
- }
emitForm_MAD(i);
}
}
@@ -1618,15 +1392,13 @@ CodeEmitterNV50::emitLogicOp(const Instruction *i)
emitForm_IMM(i);
} else {
switch (i->op) {
- case OP_AND: code[1] = 0x00000000; break;
- case OP_OR: code[1] = 0x00004000; break;
- case OP_XOR: code[1] = 0x00008000; break;
+ case OP_AND: code[1] = 0x04000000; break;
+ case OP_OR: code[1] = 0x04004000; break;
+ case OP_XOR: code[1] = 0x04008000; break;
default:
assert(0);
break;
}
- if (typeSizeof(i->dType) == 4)
- code[1] |= 0x04000000;
if (i->src(0).mod & Modifier(NV50_IR_MOD_NOT))
code[1] |= 1 << 16;
if (i->src(1).mod & Modifier(NV50_IR_MOD_NOT))
@@ -1657,9 +1429,7 @@ CodeEmitterNV50::emitShift(const Instruction *i)
emitARL(i, i->getSrc(1)->reg.data.u32 & 0x3f);
} else {
code[0] = 0x30000001;
- code[1] = (i->op == OP_SHR) ? 0xe0000000 : 0xc0000000;
- if (typeSizeof(i->dType) == 4)
- code[1] |= 0x04000000;
+ code[1] = (i->op == OP_SHR) ? 0xe4000000 : 0xc4000000;
if (i->op == OP_SHR && isSignedType(i->sType))
code[1] |= 1 << 27;
@@ -1738,9 +1508,7 @@ CodeEmitterNV50::emitTEX(const TexInstruction *i)
code[1] |= (i->tex.mask & 0xc) << 12;
if (i->tex.liveOnly)
- code[1] |= 1 << 2;
- if (i->tex.derivAll)
- code[1] |= 1 << 3;
+ code[1] |= 4;
defId(i->def(0), 2);
@@ -1901,28 +1669,19 @@ CodeEmitterNV50::emitATOM(const Instruction *i)
return;
}
code[0] = 0xd0000001;
- code[1] = 0xc0c00000 | (subOp << 2);
+ code[1] = 0xe0c00000 | (subOp << 2);
if (isSignedType(i->dType))
code[1] |= 1 << 21;
// args
emitFlagsRd(i);
- if (i->subOp == NV50_IR_SUBOP_ATOM_EXCH ||
- i->subOp == NV50_IR_SUBOP_ATOM_CAS ||
- i->defExists(0)) {
- code[1] |= 0x20000000;
- setDst(i, 0);
- setSrc(i, 1, 1);
- // g[] pointer
- code[0] |= i->getSrc(0)->reg.fileIndex << 23;
- } else {
- srcId(i->src(1), 2);
- // g[] pointer
- code[0] |= i->getSrc(0)->reg.fileIndex << 16;
- }
+ setDst(i, 0);
+ setSrc(i, 1, 1);
if (i->subOp == NV50_IR_SUBOP_ATOM_CAS)
setSrc(i, 2, 2);
+ // g[] pointer
+ code[0] |= i->getSrc(0)->reg.fileIndex << 23;
srcId(i->getIndirect(0, 0), 9);
}
@@ -1971,9 +1730,7 @@ CodeEmitterNV50::emitInstruction(Instruction *insn)
break;
case OP_ADD:
case OP_SUB:
- if (insn->dType == TYPE_F64)
- emitDADD(insn);
- else if (isFloatType(insn->dType))
+ if (isFloatType(insn->dType))
emitFADD(insn);
else if (insn->getDef(0)->reg.file == FILE_ADDRESS)
emitAADD(insn);
@@ -1981,18 +1738,14 @@ CodeEmitterNV50::emitInstruction(Instruction *insn)
emitUADD(insn);
break;
case OP_MUL:
- if (insn->dType == TYPE_F64)
- emitDMUL(insn);
- else if (isFloatType(insn->dType))
+ if (isFloatType(insn->dType))
emitFMUL(insn);
else
emitIMUL(insn);
break;
case OP_MAD:
case OP_FMA:
- if (insn->dType == TYPE_F64)
- emitDMAD(insn);
- else if (isFloatType(insn->dType))
+ if (isFloatType(insn->dType))
emitFMAD(insn);
else
emitIMAD(insn);
@@ -2164,7 +1917,7 @@ CodeEmitterNV50::getMinEncodingSize(const Instruction *i) const
{
const Target::OpInfo &info = targ->getOpInfo(i);
- if (info.minEncSize > 4 || i->dType == TYPE_F64)
+ if (info.minEncSize > 4)
return 8;
// check constraints on dst and src operands
@@ -2194,9 +1947,8 @@ CodeEmitterNV50::getMinEncodingSize(const Instruction *i) const
// check constraints on short MAD
if (info.srcNr >= 2 && i->srcExists(2)) {
- if (!i->defExists(0) ||
- (i->flagsSrc >= 0 && SDATA(i->src(i->flagsSrc)).id > 0) ||
- DDATA(i->def(0)).id != SDATA(i->src(2)).id)
+ if (!i->defExists(0) || !isFloatType(i->dType) ||
+ i->def(0).rep()->reg.data.id != i->src(2).rep()->reg.data.id)
return 8;
}
@@ -2226,7 +1978,7 @@ makeInstructionLong(Instruction *insn)
insn->encSize = 8;
for (int i = fn->bbCount - 1; i >= 0 && fn->bbArray[i] != insn->bb; --i) {
- fn->bbArray[i]->binPos += adj;
+ fn->bbArray[i]->binPos += 4;
}
fn->binSize += adj;
insn->bb->binSize += adj;
@@ -2278,16 +2030,9 @@ replaceExitWithModifier(Function *func)
return;
}
}
-
- int adj = epilogue->getExit()->encSize;
- epilogue->binSize -= adj;
- func->binSize -= adj;
+ epilogue->binSize -= 8;
+ func->binSize -= 8;
delete_Instruction(func->getProgram(), epilogue->getExit());
-
- // There may be BB's that are laid out after the exit block
- for (int i = func->bbCount - 1; i >= 0 && func->bbArray[i] != epilogue; --i) {
- func->bbArray[i]->binPos -= adj;
- }
}
void
@@ -2298,8 +2043,8 @@ CodeEmitterNV50::prepareEmission(Function *func)
replaceExitWithModifier(func);
}
-CodeEmitterNV50::CodeEmitterNV50(Program::Type type, const TargetNV50 *target) :
- CodeEmitter(target), progType(type), targNV50(target)
+CodeEmitterNV50::CodeEmitterNV50(const TargetNV50 *target) :
+ CodeEmitter(target), targNV50(target)
{
targ = target; // specialized
code = NULL;
@@ -2310,7 +2055,8 @@ CodeEmitterNV50::CodeEmitterNV50(Program::Type type, const TargetNV50 *target) :
CodeEmitter *
TargetNV50::getCodeEmitter(Program::Type type)
{
- CodeEmitterNV50 *emit = new CodeEmitterNV50(type, this);
+ CodeEmitterNV50 *emit = new CodeEmitterNV50(this);
+ emit->setProgramType(type);
return emit;
}
diff --git a/lib/mesa/src/gallium/drivers/nouveau/codegen/nv50_ir_graph.cpp b/lib/mesa/src/gallium/drivers/nouveau/codegen/nv50_ir_graph.cpp
index 3f9967a7b..23414d54a 100644
--- a/lib/mesa/src/gallium/drivers/nouveau/codegen/nv50_ir_graph.cpp
+++ b/lib/mesa/src/gallium/drivers/nouveau/codegen/nv50_ir_graph.cpp
@@ -77,6 +77,7 @@ const char *Graph::Edge::typeStr() const
case FORWARD: return "forward";
case BACK: return "back";
case CROSS: return "cross";
+ case DUMMY: return "dummy";
case UNKNOWN:
default:
return "unk";
@@ -86,8 +87,7 @@ const char *Graph::Edge::typeStr() const
Graph::Node::Node(void *priv) : data(priv),
in(0), out(0), graph(0),
visited(0),
- inCount(0), outCount(0),
- tag(0)
+ inCount(0), outCount(0)
{
// nothing to do
}
@@ -184,7 +184,7 @@ Graph::Node::reachableBy(const Node *node, const Node *term) const
continue;
for (EdgeIterator ei = pos->outgoing(); !ei.end(); ei.next()) {
- if (ei.getType() == Edge::BACK)
+ if (ei.getType() == Edge::BACK || ei.getType() == Edge::DUMMY)
continue;
if (ei.getNode()->visit(seq))
stack.push(ei.getNode());
@@ -287,10 +287,7 @@ private:
bb.push(node);
- while (bb.getSize() || cross.getSize()) {
- if (bb.getSize() == 0)
- cross.moveTo(bb);
-
+ while (bb.getSize()) {
node = reinterpret_cast<Graph::Node *>(bb.pop().u.p);
assert(node);
if (!node->visit(sequence))
@@ -301,6 +298,7 @@ private:
switch (ei.getType()) {
case Graph::Edge::TREE:
case Graph::Edge::FORWARD:
+ case Graph::Edge::DUMMY:
if (++(ei.getNode()->tag) == ei.getNode()->incidentCountFwd())
bb.push(ei.getNode());
break;
@@ -316,6 +314,9 @@ private:
}
}
nodes[count++] = node;
+
+ if (bb.getSize() == 0)
+ cross.moveTo(bb);
}
}
@@ -370,6 +371,8 @@ void Graph::classifyDFS(Node *curr, int& seq)
for (edge = curr->out; edge; edge = edge->next[0]) {
node = edge->target;
+ if (edge->type == Edge::DUMMY)
+ continue;
if (node->getSequence() == 0) {
edge->type = Edge::TREE;
@@ -384,6 +387,8 @@ void Graph::classifyDFS(Node *curr, int& seq)
for (edge = curr->in; edge; edge = edge->next[1]) {
node = edge->origin;
+ if (edge->type == Edge::DUMMY)
+ continue;
if (node->getSequence() == 0) {
edge->type = Edge::TREE;
diff --git a/lib/mesa/src/gallium/drivers/nouveau/codegen/nv50_ir_graph.h b/lib/mesa/src/gallium/drivers/nouveau/codegen/nv50_ir_graph.h
index fc85e78a5..b0981ff69 100644
--- a/lib/mesa/src/gallium/drivers/nouveau/codegen/nv50_ir_graph.h
+++ b/lib/mesa/src/gallium/drivers/nouveau/codegen/nv50_ir_graph.h
@@ -47,6 +47,7 @@ public:
FORWARD,
BACK,
CROSS, // e.g. loop break
+ DUMMY
};
Edge(Node *dst, Node *src, Type kind);
@@ -146,7 +147,7 @@ public:
public:
Graph();
- virtual ~Graph(); // does *not* free the nodes (make it an option ?)
+ ~Graph(); // does *not* free the nodes (make it an option ?)
inline Node *getRoot() const { return root; }
diff --git a/lib/mesa/src/gallium/drivers/nouveau/codegen/nv50_ir_inlines.h b/lib/mesa/src/gallium/drivers/nouveau/codegen/nv50_ir_inlines.h
index 749e6b40b..e465f2484 100644
--- a/lib/mesa/src/gallium/drivers/nouveau/codegen/nv50_ir_inlines.h
+++ b/lib/mesa/src/gallium/drivers/nouveau/codegen/nv50_ir_inlines.h
@@ -48,7 +48,7 @@ static inline bool isTextureOp(operation op)
static inline bool isSurfaceOp(operation op)
{
- return (op >= OP_SULDB && op <= OP_SULEA) || (op == OP_SUQ);
+ return (op >= OP_SULDB && op <= OP_SULEA);
}
static inline unsigned int typeSizeof(DataType ty)
@@ -126,7 +126,7 @@ static inline bool isFloatType(DataType ty)
static inline bool isSignedIntType(DataType ty)
{
- return (ty == TYPE_S8 || ty == TYPE_S16 || ty == TYPE_S32 || ty == TYPE_S64);
+ return (ty == TYPE_S8 || ty == TYPE_S16 || ty == TYPE_S32);
}
static inline bool isSignedType(DataType ty)
@@ -136,7 +136,6 @@ static inline bool isSignedType(DataType ty)
case TYPE_U8:
case TYPE_U16:
case TYPE_U32:
- case TYPE_U64:
case TYPE_B96:
case TYPE_B128:
return false;
@@ -148,7 +147,6 @@ static inline bool isSignedType(DataType ty)
static inline DataType intTypeToSigned(DataType ty)
{
switch (ty) {
- case TYPE_U64: return TYPE_S64;
case TYPE_U32: return TYPE_S32;
case TYPE_U16: return TYPE_S16;
case TYPE_U8: return TYPE_S8;
@@ -222,7 +220,7 @@ Instruction *Value::getUniqueInsn() const
return (*it)->getInsn();
// should be unreachable and trigger assertion at the end
}
-#ifndef NDEBUG
+#ifdef DEBUG
if (reg.data.id < 0) {
int n = 0;
for (DefCIterator it = defs.begin(); n < 2 && it != defs.end(); ++it)
@@ -311,14 +309,14 @@ const FlowInstruction *Instruction::asFlow() const
TexInstruction *Instruction::asTex()
{
- if ((op >= OP_TEX && op <= OP_SULEA) || op == OP_SUQ)
+ if (op >= OP_TEX && op <= OP_SULEA)
return static_cast<TexInstruction *>(this);
return NULL;
}
const TexInstruction *Instruction::asTex() const
{
- if ((op >= OP_TEX && op <= OP_SULEA) || op == OP_SUQ)
+ if (op >= OP_TEX && op <= OP_SULEA)
return static_cast<const TexInstruction *>(this);
return NULL;
}
@@ -336,7 +334,7 @@ static inline Instruction *cloneForward(Function *ctx, Instruction *obj)
// XXX: use a virtual function so we're really really safe ?
LValue *Value::asLValue()
{
- if (reg.file >= FILE_GPR && reg.file <= LAST_REGISTER_FILE)
+ if (reg.file >= FILE_GPR && reg.file <= FILE_ADDRESS)
return static_cast<LValue *>(this);
return NULL;
}
diff --git a/lib/mesa/src/gallium/drivers/nouveau/codegen/nv50_ir_lowering_nv50.cpp b/lib/mesa/src/gallium/drivers/nouveau/codegen/nv50_ir_lowering_nv50.cpp
index 2b09855b1..d87cdfff8 100644
--- a/lib/mesa/src/gallium/drivers/nouveau/codegen/nv50_ir_lowering_nv50.cpp
+++ b/lib/mesa/src/gallium/drivers/nouveau/codegen/nv50_ir_lowering_nv50.cpp
@@ -25,24 +25,6 @@
#include "codegen/nv50_ir_target_nv50.h"
-#define NV50_SU_INFO_SIZE_X 0x00
-#define NV50_SU_INFO_SIZE_Y 0x04
-#define NV50_SU_INFO_SIZE_Z 0x08
-#define NV50_SU_INFO_BSIZE 0x0c
-#define NV50_SU_INFO_STRIDE_Y 0x10
-#define NV50_SU_INFO_MS_X 0x18
-#define NV50_SU_INFO_MS_Y 0x1c
-#define NV50_SU_INFO_TILE_SHIFT_X 0x20
-#define NV50_SU_INFO_TILE_SHIFT_Y 0x24
-#define NV50_SU_INFO_TILE_SHIFT_Z 0x28
-#define NV50_SU_INFO_OFFSET_Z 0x2c
-
-#define NV50_SU_INFO__STRIDE 0x30
-
-#define NV50_SU_INFO_SIZE(i) (0x00 + (i) * 4)
-#define NV50_SU_INFO_MS(i) (0x18 + (i) * 4)
-#define NV50_SU_INFO_TILE_SHIFT(i) (0x20 + (i) * 4)
-
namespace nv50_ir {
// nv50 doesn't support 32 bit integer multiplication
@@ -62,8 +44,6 @@ static bool
expandIntegerMUL(BuildUtil *bld, Instruction *mul)
{
const bool highResult = mul->subOp == NV50_IR_SUBOP_MUL_HIGH;
- ImmediateValue src1;
- bool src1imm = mul->src(1).getImmediate(src1);
DataType fTy; // full type
switch (mul->sType) {
@@ -92,41 +72,24 @@ expandIntegerMUL(BuildUtil *bld, Instruction *mul)
for (int j = 0; j < 4; ++j)
t[j] = bld->getSSA(fullSize);
- if (isSignedType(mul->sType) && highResult) {
+ s[0] = mul->getSrc(0);
+ s[1] = mul->getSrc(1);
+
+ if (isSignedType(mul->sType)) {
s[0] = bld->getSSA(fullSize);
s[1] = bld->getSSA(fullSize);
bld->mkOp1(OP_ABS, mul->sType, s[0], mul->getSrc(0));
bld->mkOp1(OP_ABS, mul->sType, s[1], mul->getSrc(1));
- src1.reg.data.s32 = abs(src1.reg.data.s32);
- } else {
- s[0] = mul->getSrc(0);
- s[1] = mul->getSrc(1);
}
// split sources into halves
i[0] = bld->mkSplit(a, halfSize, s[0]);
i[1] = bld->mkSplit(b, halfSize, s[1]);
- if (src1imm && (src1.reg.data.u32 & 0xffff0000) == 0) {
- i[2] = i[3] = bld->mkOp2(OP_MUL, fTy, t[1], a[1],
- bld->mkImm(src1.reg.data.u32 & 0xffff));
- } else {
- i[2] = bld->mkOp2(OP_MUL, fTy, t[0], a[0],
- src1imm ? bld->mkImm(src1.reg.data.u32 >> 16) : b[1]);
- if (src1imm && (src1.reg.data.u32 & 0x0000ffff) == 0) {
- i[3] = i[2];
- t[1] = t[0];
- } else {
- i[3] = bld->mkOp3(OP_MAD, fTy, t[1], a[1], b[0], t[0]);
- }
- }
+ i[2] = bld->mkOp2(OP_MUL, fTy, t[0], a[0], b[1]);
+ i[3] = bld->mkOp3(OP_MAD, fTy, t[1], a[1], b[0], t[0]);
i[7] = bld->mkOp2(OP_SHL, fTy, t[2], t[1], bld->mkImm(halfSize * 8));
- if (src1imm && (src1.reg.data.u32 & 0x0000ffff) == 0) {
- i[4] = i[3];
- t[3] = t[2];
- } else {
- i[4] = bld->mkOp3(OP_MAD, fTy, t[3], a[0], b[0], t[2]);
- }
+ i[4] = bld->mkOp3(OP_MAD, fTy, t[3], a[0], b[0], t[2]);
if (highResult) {
Value *c[2];
@@ -223,9 +186,6 @@ expandIntegerMUL(BuildUtil *bld, Instruction *mul)
class NV50LegalizePostRA : public Pass
{
-public:
- NV50LegalizePostRA() : r63(NULL) { }
-
private:
virtual bool visit(Function *);
virtual bool visit(BasicBlock *);
@@ -233,8 +193,6 @@ private:
void handlePRERET(FlowInstruction *);
void replaceZero(Instruction *);
- BuildUtil bld;
-
LValue *r63;
};
@@ -244,8 +202,7 @@ NV50LegalizePostRA::visit(Function *fn)
Program *prog = fn->getProgram();
r63 = new_LValue(fn, FILE_GPR);
- // GPR units on nv50 are in half-regs
- if (prog->maxGPR < 126)
+ if (prog->maxGPR < 63)
r63->reg.data.id = 63;
else
r63->reg.data.id = 127;
@@ -336,7 +293,8 @@ NV50LegalizePostRA::visit(BasicBlock *bb)
next = hi;
}
- if (i->op != OP_PFETCH && i->op != OP_BAR &&
+ if (i->op != OP_MOV && i->op != OP_PFETCH &&
+ i->op != OP_BAR &&
(!i->defExists(0) || i->def(0).getFile() != FILE_ADDRESS))
replaceZero(i);
}
@@ -395,8 +353,7 @@ NV50LegalizeSSA::propagateWriteToOutput(Instruction *st)
return;
for (int s = 0; di->srcExists(s); ++s)
- if (di->src(s).getFile() == FILE_IMMEDIATE ||
- di->src(s).getFile() == FILE_MEMORY_LOCAL)
+ if (di->src(s).getFile() == FILE_IMMEDIATE)
return;
if (prog->getType() == Program::TYPE_GEOMETRY) {
@@ -646,13 +603,6 @@ private:
bool handlePFETCH(Instruction *);
bool handleEXPORT(Instruction *);
bool handleLOAD(Instruction *);
- bool handleLDST(Instruction *);
- bool handleMEMBAR(Instruction *);
- bool handleSharedATOM(Instruction *);
- bool handleSULDP(TexInstruction *);
- bool handleSUREDP(TexInstruction *);
- bool handleSUSTP(TexInstruction *);
- Value *processSurfaceCoords(TexInstruction *);
bool handleDIV(Instruction *);
bool handleSQRT(Instruction *);
@@ -667,9 +617,6 @@ private:
bool handleTXL(TexInstruction *); // hate
bool handleTXD(TexInstruction *); // these 3
bool handleTXLQ(TexInstruction *);
- bool handleTXQ(TexInstruction *);
- bool handleSUQ(TexInstruction *);
- bool handleBUFQ(Instruction *);
bool handleCALL(Instruction *);
bool handlePRECONT(Instruction *);
@@ -678,8 +625,6 @@ private:
void checkPredicate(Instruction *);
void loadTexMsInfo(uint32_t off, Value **ms, Value **ms_x, Value **ms_y);
void loadMsInfo(Value *ms, Value *s, Value **dx, Value **dy);
- Value *loadSuInfo(int slot, uint32_t off);
- Value *loadSuInfo16(int slot, uint32_t off);
private:
const Target *const targ;
@@ -717,14 +662,12 @@ void NV50LoweringPreSSA::loadTexMsInfo(uint32_t off, Value **ms,
Value **ms_x, Value **ms_y) {
// This loads the texture-indexed ms setting from the constant buffer
Value *tmp = new_LValue(func, FILE_GPR);
- uint8_t b = prog->driver->io.auxCBSlot;
+ uint8_t b = prog->driver->io.resInfoCBSlot;
off += prog->driver->io.suInfoBase;
if (prog->getType() > Program::TYPE_VERTEX)
off += 16 * 2 * 4;
if (prog->getType() > Program::TYPE_GEOMETRY)
off += 16 * 2 * 4;
- if (prog->getType() > Program::TYPE_FRAGMENT)
- off += 16 * 2 * 4;
*ms_x = bld.mkLoadv(TYPE_U32, bld.mkSymbol(
FILE_MEMORY_CONST, b, TYPE_U32, off + 0), NULL);
*ms_y = bld.mkLoadv(TYPE_U32, bld.mkSymbol(
@@ -754,24 +697,6 @@ void NV50LoweringPreSSA::loadMsInfo(Value *ms, Value *s, Value **dx, Value **dy)
prog->driver->io.msInfoBase + 4), off);
}
-Value *
-NV50LoweringPreSSA::loadSuInfo(int slot, uint32_t off)
-{
- uint8_t b = prog->driver->io.auxCBSlot;
- off += prog->driver->io.bufInfoBase + slot * NV50_SU_INFO__STRIDE;
- return bld.mkLoadv(TYPE_U32, bld.mkSymbol(
- FILE_MEMORY_CONST, b, TYPE_U32, off), NULL);
-}
-
-Value *
-NV50LoweringPreSSA::loadSuInfo16(int slot, uint32_t off)
-{
- uint8_t b = prog->driver->io.auxCBSlot;
- off += prog->driver->io.bufInfoBase + slot * NV50_SU_INFO__STRIDE;
- return bld.mkLoadv(TYPE_U16, bld.mkSymbol(
- FILE_MEMORY_CONST, b, TYPE_U16, off), NULL);
-}
-
bool
NV50LoweringPreSSA::handleTEX(TexInstruction *i)
{
@@ -779,23 +704,6 @@ NV50LoweringPreSSA::handleTEX(TexInstruction *i)
const int dref = arg;
const int lod = i->tex.target.isShadow() ? (arg + 1) : arg;
- /* Only normalize in the non-explicit derivatives case.
- */
- if (i->tex.target.isCube() && i->op != OP_TXD) {
- Value *src[3], *val;
- int c;
- for (c = 0; c < 3; ++c)
- src[c] = bld.mkOp1v(OP_ABS, TYPE_F32, bld.getSSA(), i->getSrc(c));
- val = bld.getScratch();
- bld.mkOp2(OP_MAX, TYPE_F32, val, src[0], src[1]);
- bld.mkOp2(OP_MAX, TYPE_F32, val, src[2], val);
- bld.mkOp1(OP_RCP, TYPE_F32, val, val);
- for (c = 0; c < 3; ++c) {
- i->setSrc(c, bld.mkOp2v(OP_MUL, TYPE_F32, bld.getSSA(),
- i->getSrc(c), val));
- }
- }
-
// handle MS, which means looking up the MS params for this texture, and
// adjusting the input coordinates to point at the right sample.
if (i->tex.target.isMS()) {
@@ -923,7 +831,7 @@ NV50LoweringPreSSA::handleTXB(TexInstruction *i)
}
Value *flags = bld.getScratch(1, FILE_FLAGS);
bld.setPosition(cond, true);
- bld.mkCvt(OP_CVT, TYPE_U8, flags, TYPE_U32, cond->getDef(0))->flagsDef = 0;
+ bld.mkCvt(OP_CVT, TYPE_U8, flags, TYPE_U32, cond->getDef(0));
Instruction *tex[4];
for (l = 0; l < 4; ++l) {
@@ -1002,18 +910,16 @@ NV50LoweringPreSSA::handleTXD(TexInstruction *i)
Instruction *tex;
Value *zero = bld.loadImm(bld.getSSA(), 0);
int l, c;
- const int dim = i->tex.target.getDim() + i->tex.target.isCube();
+ const int dim = i->tex.target.getDim();
handleTEX(i);
i->op = OP_TEX; // no need to clone dPdx/dPdy later
- i->tex.derivAll = true;
for (c = 0; c < dim; ++c)
crd[c] = bld.getScratch();
bld.mkOp(OP_QUADON, TYPE_NONE, NULL);
for (l = 0; l < 4; ++l) {
- Value *src[3], *val;
// mov coordinates from lane l to all lanes
for (c = 0; c < dim; ++c)
bld.mkQuadop(0x00, crd[c], l, i->getSrc(c), zero);
@@ -1023,24 +929,10 @@ NV50LoweringPreSSA::handleTXD(TexInstruction *i)
// add dPdy from lane l to lanes dy
for (c = 0; c < dim; ++c)
bld.mkQuadop(qOps[l][1], crd[c], l, i->dPdy[c].get(), crd[c]);
- // normalize cube coordinates if necessary
- if (i->tex.target.isCube()) {
- for (c = 0; c < 3; ++c)
- src[c] = bld.mkOp1v(OP_ABS, TYPE_F32, bld.getSSA(), crd[c]);
- val = bld.getScratch();
- bld.mkOp2(OP_MAX, TYPE_F32, val, src[0], src[1]);
- bld.mkOp2(OP_MAX, TYPE_F32, val, src[2], val);
- bld.mkOp1(OP_RCP, TYPE_F32, val, val);
- for (c = 0; c < 3; ++c)
- src[c] = bld.mkOp2v(OP_MUL, TYPE_F32, bld.getSSA(), crd[c], val);
- } else {
- for (c = 0; c < dim; ++c)
- src[c] = crd[c];
- }
// texture
bld.insert(tex = cloneForward(func, i));
for (c = 0; c < dim; ++c)
- tex->setSrc(c, src[c]);
+ tex->setSrc(c, crd[c]);
// save results
for (c = 0; i->defExists(c); ++c) {
Instruction *mov;
@@ -1083,87 +975,6 @@ NV50LoweringPreSSA::handleTXLQ(TexInstruction *i)
}
bool
-NV50LoweringPreSSA::handleTXQ(TexInstruction *i)
-{
- Value *ms, *ms_x, *ms_y;
- if (i->tex.query == TXQ_DIMS) {
- if (i->tex.target.isMS()) {
- bld.setPosition(i, true);
- loadTexMsInfo(i->tex.r * 4 * 2, &ms, &ms_x, &ms_y);
- int d = 0;
- if (i->tex.mask & 1) {
- bld.mkOp2(OP_SHR, TYPE_U32, i->getDef(d), i->getDef(d), ms_x);
- d++;
- }
- if (i->tex.mask & 2) {
- bld.mkOp2(OP_SHR, TYPE_U32, i->getDef(d), i->getDef(d), ms_y);
- d++;
- }
- }
- return true;
- }
- assert(i->tex.query == TXQ_TYPE);
- assert(i->tex.mask == 4);
-
- loadTexMsInfo(i->tex.r * 4 * 2, &ms, &ms_x, &ms_y);
- bld.mkOp2(OP_SHL, TYPE_U32, i->getDef(0), bld.loadImm(NULL, 1), ms);
- i->bb->remove(i);
-
- return true;
-}
-
-bool
-NV50LoweringPreSSA::handleSUQ(TexInstruction *suq)
-{
- const int dim = suq->tex.target.getDim();
- const int arg = dim + (suq->tex.target.isArray() || suq->tex.target.isCube());
- int mask = suq->tex.mask;
- int slot = suq->tex.r;
- int c, d;
-
- for (c = 0, d = 0; c < 3; ++c, mask >>= 1) {
- if (c >= arg || !(mask & 1))
- continue;
-
- int offset;
-
- if (c == 1 && suq->tex.target == TEX_TARGET_1D_ARRAY) {
- offset = NV50_SU_INFO_SIZE(2);
- } else {
- offset = NV50_SU_INFO_SIZE(c);
- }
- bld.mkMov(suq->getDef(d++), loadSuInfo(slot, offset));
- if (c == 2 && suq->tex.target.isCube())
- bld.mkOp2(OP_DIV, TYPE_U32, suq->getDef(d - 1), suq->getDef(d - 1),
- bld.loadImm(NULL, 6));
- }
-
- if (mask & 1) {
- if (suq->tex.target.isMS()) {
- Value *ms_x = loadSuInfo(slot, NV50_SU_INFO_MS(0));
- Value *ms_y = loadSuInfo(slot, NV50_SU_INFO_MS(1));
- Value *ms = bld.mkOp2v(OP_ADD, TYPE_U32, bld.getScratch(), ms_x, ms_y);
- bld.mkOp2(OP_SHL, TYPE_U32, suq->getDef(d++), bld.loadImm(NULL, 1), ms);
- } else {
- bld.mkMov(suq->getDef(d++), bld.loadImm(NULL, 1));
- }
- }
-
- bld.remove(suq);
- return true;
-}
-
-bool
-NV50LoweringPreSSA::handleBUFQ(Instruction *bufq)
-{
- bufq->op = OP_MOV;
- bufq->setSrc(0, loadSuInfo(bufq->getSrc(0)->reg.fileIndex, NV50_SU_INFO_SIZE_X));
- bufq->setIndirect(0, 0, NULL);
- bufq->setIndirect(0, 1, NULL);
- return true;
-}
-
-bool
NV50LoweringPreSSA::handleSET(Instruction *i)
{
if (i->dType == TYPE_F32) {
@@ -1294,13 +1105,19 @@ NV50LoweringPreSSA::handleRDSV(Instruction *i)
break;
case SV_NCTAID:
case SV_CTAID:
- case SV_NTID: {
- Value *x = bld.getSSA(2);
- bld.mkOp1(OP_LOAD, TYPE_U16, x,
- bld.mkSymbol(FILE_MEMORY_SHARED, 0, TYPE_U16, addr));
- bld.mkCvt(OP_CVT, TYPE_U32, def, TYPE_U16, x);
+ case SV_NTID:
+ if ((sv == SV_NCTAID && idx >= 2) ||
+ (sv == SV_NTID && idx >= 3)) {
+ bld.mkMov(def, bld.mkImm(1));
+ } else if (sv == SV_CTAID && idx >= 2) {
+ bld.mkMov(def, bld.mkImm(0));
+ } else {
+ Value *x = bld.getSSA(2);
+ bld.mkOp1(OP_LOAD, TYPE_U16, x,
+ bld.mkSymbol(FILE_MEMORY_SHARED, 0, TYPE_U16, addr));
+ bld.mkCvt(OP_CVT, TYPE_U32, def, TYPE_U16, x);
+ }
break;
- }
case SV_TID:
if (idx == 0) {
bld.mkOp2(OP_AND, TYPE_U32, def, tid, bld.mkImm(0x0000ffff));
@@ -1313,9 +1130,6 @@ NV50LoweringPreSSA::handleRDSV(Instruction *i)
bld.mkMov(def, bld.mkImm(0));
}
break;
- case SV_COMBINED_TID:
- bld.mkMov(def, tid);
- break;
case SV_SAMPLE_POS: {
Value *off = new_LValue(func, FILE_ADDRESS);
bld.mkOp1(OP_RDSV, TYPE_U32, def, bld.mkSysVal(SV_SAMPLE_INDEX, 0));
@@ -1323,16 +1137,11 @@ NV50LoweringPreSSA::handleRDSV(Instruction *i)
bld.mkLoad(TYPE_F32,
def,
bld.mkSymbol(
- FILE_MEMORY_CONST, prog->driver->io.auxCBSlot,
+ FILE_MEMORY_CONST, prog->driver->io.resInfoCBSlot,
TYPE_U32, prog->driver->io.sampleInfoBase + 4 * idx),
off);
break;
}
- case SV_THREAD_KILL:
- // Not actually supported. But it's implementation-dependent, so we can
- // always just say it's not a helper.
- bld.mkMov(def, bld.loadImm(NULL, 0));
- break;
default:
bld.mkFetch(i->getDef(0), i->dType,
FILE_SHADER_INPUT, addr, i->getIndirect(0, 0), NULL);
@@ -1357,9 +1166,10 @@ NV50LoweringPreSSA::handleDIV(Instruction *i)
bool
NV50LoweringPreSSA::handleSQRT(Instruction *i)
{
- bld.setPosition(i, true);
- i->op = OP_RSQ;
- bld.mkOp1(OP_RCP, i->dType, i->getDef(0), i->getDef(0));
+ Instruction *rsq = bld.mkOp1(OP_RSQ, TYPE_F32,
+ bld.getSSA(), i->getSrc(0));
+ i->op = OP_MUL;
+ i->setSrc(1, rsq->getDef(0));
return true;
}
@@ -1397,7 +1207,7 @@ NV50LoweringPreSSA::handleEXPORT(Instruction *i)
i->setDef(0, new_LValue(func, FILE_GPR));
i->getDef(0)->reg.data.id = id;
- prog->maxGPR = MAX2(prog->maxGPR, id * 2);
+ prog->maxGPR = MAX2(prog->maxGPR, id);
}
}
return true;
@@ -1412,15 +1222,6 @@ bool
NV50LoweringPreSSA::handleLOAD(Instruction *i)
{
ValueRef src = i->src(0);
- Symbol *sym = i->getSrc(0)->asSym();
-
- if (prog->getType() == Program::TYPE_COMPUTE) {
- if (sym->inFile(FILE_MEMORY_SHARED) ||
- sym->inFile(FILE_MEMORY_BUFFER) ||
- sym->inFile(FILE_MEMORY_GLOBAL)) {
- return handleLDST(i);
- }
- }
if (src.isIndirect(1)) {
assert(prog->getType() == Program::TYPE_GEOMETRY);
@@ -1458,677 +1259,6 @@ NV50LoweringPreSSA::handleLOAD(Instruction *i)
}
bool
-NV50LoweringPreSSA::handleSharedATOM(Instruction *atom)
-{
- assert(atom->src(0).getFile() == FILE_MEMORY_SHARED);
-
- BasicBlock *currBB = atom->bb;
- BasicBlock *tryLockBB = atom->bb->splitBefore(atom, false);
- BasicBlock *joinBB = atom->bb->splitAfter(atom);
- BasicBlock *setAndUnlockBB = new BasicBlock(func);
- BasicBlock *failLockBB = new BasicBlock(func);
-
- bld.setPosition(currBB, true);
- assert(!currBB->joinAt);
- currBB->joinAt = bld.mkFlow(OP_JOINAT, joinBB, CC_ALWAYS, NULL);
-
- bld.mkFlow(OP_BRA, tryLockBB, CC_ALWAYS, NULL);
- currBB->cfg.attach(&tryLockBB->cfg, Graph::Edge::TREE);
-
- bld.setPosition(tryLockBB, true);
-
- Instruction *ld =
- bld.mkLoad(TYPE_U32, atom->getDef(0), atom->getSrc(0)->asSym(),
- atom->getIndirect(0, 0));
- Value *locked = bld.getSSA(1, FILE_FLAGS);
- if (prog->getTarget()->getChipset() >= 0xa0) {
- ld->setFlagsDef(1, locked);
- ld->subOp = NV50_IR_SUBOP_LOAD_LOCKED;
- } else {
- bld.mkMov(locked, bld.loadImm(NULL, 2))
- ->flagsDef = 0;
- }
-
- bld.mkFlow(OP_BRA, setAndUnlockBB, CC_LT, locked);
- bld.mkFlow(OP_BRA, failLockBB, CC_ALWAYS, NULL);
- tryLockBB->cfg.attach(&failLockBB->cfg, Graph::Edge::CROSS);
- tryLockBB->cfg.attach(&setAndUnlockBB->cfg, Graph::Edge::TREE);
-
- tryLockBB->cfg.detach(&joinBB->cfg);
- bld.remove(atom);
-
- bld.setPosition(setAndUnlockBB, true);
- Value *stVal;
- if (atom->subOp == NV50_IR_SUBOP_ATOM_EXCH) {
- // Read the old value, and write the new one.
- stVal = atom->getSrc(1);
- } else if (atom->subOp == NV50_IR_SUBOP_ATOM_CAS) {
- CmpInstruction *set =
- bld.mkCmp(OP_SET, CC_EQ, TYPE_U32, bld.getSSA(1, FILE_FLAGS),
- TYPE_U32, ld->getDef(0), atom->getSrc(1));
-
- Instruction *selp =
- bld.mkOp3(OP_SELP, TYPE_U32, bld.getSSA(), atom->getSrc(2),
- ld->getDef(0), set->getDef(0));
- stVal = selp->getDef(0);
-
- handleSELP(selp);
- } else {
- operation op;
-
- switch (atom->subOp) {
- case NV50_IR_SUBOP_ATOM_ADD:
- op = OP_ADD;
- break;
- case NV50_IR_SUBOP_ATOM_AND:
- op = OP_AND;
- break;
- case NV50_IR_SUBOP_ATOM_OR:
- op = OP_OR;
- break;
- case NV50_IR_SUBOP_ATOM_XOR:
- op = OP_XOR;
- break;
- case NV50_IR_SUBOP_ATOM_MIN:
- op = OP_MIN;
- break;
- case NV50_IR_SUBOP_ATOM_MAX:
- op = OP_MAX;
- break;
- default:
- assert(0);
- return false;
- }
-
- Instruction *i =
- bld.mkOp2(op, atom->dType, bld.getSSA(), ld->getDef(0),
- atom->getSrc(1));
-
- stVal = i->getDef(0);
- }
-
- Instruction *store = bld.mkStore(OP_STORE, TYPE_U32, atom->getSrc(0)->asSym(),
- atom->getIndirect(0, 0), stVal);
- if (prog->getTarget()->getChipset() >= 0xa0) {
- store->subOp = NV50_IR_SUBOP_STORE_UNLOCKED;
- }
-
- bld.mkFlow(OP_BRA, failLockBB, CC_ALWAYS, NULL);
- setAndUnlockBB->cfg.attach(&failLockBB->cfg, Graph::Edge::TREE);
-
- // Loop until the lock is acquired.
- bld.setPosition(failLockBB, true);
- bld.mkFlow(OP_BRA, tryLockBB, CC_GEU, locked);
- bld.mkFlow(OP_BRA, joinBB, CC_ALWAYS, NULL);
- failLockBB->cfg.attach(&tryLockBB->cfg, Graph::Edge::BACK);
- failLockBB->cfg.attach(&joinBB->cfg, Graph::Edge::TREE);
-
- bld.setPosition(joinBB, false);
- bld.mkFlow(OP_JOIN, NULL, CC_ALWAYS, NULL)->fixed = 1;
-
- return true;
-}
-
-bool
-NV50LoweringPreSSA::handleLDST(Instruction *i)
-{
- ValueRef src = i->src(0);
- Symbol *sym = i->getSrc(0)->asSym();
-
- if (prog->getType() != Program::TYPE_COMPUTE) {
- return true;
- }
-
- // Buffers just map directly to the different global memory spaces
- if (sym->inFile(FILE_MEMORY_BUFFER)) {
- sym->reg.file = FILE_MEMORY_GLOBAL;
- }
-
- if (sym->inFile(FILE_MEMORY_SHARED)) {
-
- if (src.isIndirect(0)) {
- Value *addr = i->getIndirect(0, 0);
-
- if (!addr->inFile(FILE_ADDRESS)) {
- // Move address from GPR into an address register
- Value *new_addr = bld.getSSA(2, FILE_ADDRESS);
- bld.mkMov(new_addr, addr);
-
- i->setIndirect(0, 0, new_addr);
- }
- }
-
- if (i->op == OP_ATOM)
- handleSharedATOM(i);
- } else if (sym->inFile(FILE_MEMORY_GLOBAL)) {
- // All global access must be indirect. There are no instruction forms
- // with direct access.
- Value *addr = i->getIndirect(0, 0);
-
- Value *offset = bld.loadImm(bld.getSSA(), sym->reg.data.offset);
- Value *sum;
- if (addr != NULL)
- sum = bld.mkOp2v(OP_ADD, TYPE_U32, bld.getSSA(), addr,
- offset);
- else
- sum = offset;
-
- i->setIndirect(0, 0, sum);
- sym->reg.data.offset = 0;
- }
-
- return true;
-}
-
-bool
-NV50LoweringPreSSA::handleMEMBAR(Instruction *i)
-{
- // For global memory, apparently doing a bunch of reads at different
- // addresses forces things to get sufficiently flushed.
- if (i->subOp & NV50_IR_SUBOP_MEMBAR_GL) {
- uint8_t b = prog->driver->io.auxCBSlot;
- Value *base =
- bld.mkLoadv(TYPE_U32, bld.mkSymbol(FILE_MEMORY_CONST, b, TYPE_U32,
- prog->driver->io.membarOffset), NULL);
- Value *physid = bld.mkOp1v(OP_RDSV, TYPE_U32, bld.getSSA(), bld.mkSysVal(SV_PHYSID, 0));
- Value *off = bld.mkOp2v(OP_SHL, TYPE_U32, bld.getSSA(),
- bld.mkOp2v(OP_AND, TYPE_U32, bld.getSSA(),
- physid, bld.loadImm(NULL, 0x1f)),
- bld.loadImm(NULL, 2));
- base = bld.mkOp2v(OP_ADD, TYPE_U32, bld.getSSA(), base, off);
- Symbol *gmemMembar = bld.mkSymbol(FILE_MEMORY_GLOBAL, prog->driver->io.gmemMembar, TYPE_U32, 0);
- for (int i = 0; i < 8; i++) {
- if (i != 0) {
- base = bld.mkOp2v(OP_ADD, TYPE_U32, bld.getSSA(), base, bld.loadImm(NULL, 0x100));
- }
- bld.mkLoad(TYPE_U32, bld.getSSA(), gmemMembar, base)
- ->fixed = 1;
- }
- }
-
- // Both global and shared memory barriers also need a regular control bar
- // TODO: double-check this is the case
- i->op = OP_BAR;
- i->subOp = NV50_IR_SUBOP_BAR_SYNC;
- i->setSrc(0, bld.mkImm(0u));
- i->setSrc(1, bld.mkImm(0u));
-
- return true;
-}
-
-// The type that bests represents how each component can be stored when packed.
-static DataType
-getPackedType(const TexInstruction::ImgFormatDesc *t, int c)
-{
- switch (t->type) {
- case FLOAT: return t->bits[c] == 16 ? TYPE_F16 : TYPE_F32;
- case UNORM: return t->bits[c] == 8 ? TYPE_U8 : TYPE_U16;
- case SNORM: return t->bits[c] == 8 ? TYPE_S8 : TYPE_S16;
- case UINT:
- return (t->bits[c] == 8 ? TYPE_U8 :
- (t->bits[c] <= 16 ? TYPE_U16 : TYPE_U32));
- case SINT:
- return (t->bits[c] == 8 ? TYPE_S8 :
- (t->bits[c] <= 16 ? TYPE_S16 : TYPE_S32));
- }
- return TYPE_NONE;
-}
-
-// The type that the rest of the shader expects to process this image type in.
-static DataType
-getShaderType(const ImgType type) {
- switch (type) {
- case FLOAT:
- case UNORM:
- case SNORM:
- return TYPE_F32;
- case UINT:
- return TYPE_U32;
- case SINT:
- return TYPE_S32;
- default:
- assert(!"Impossible type");
- return TYPE_NONE;
- }
-}
-
-// Reads the raw coordinates out of the input instruction, and returns a
-// single-value coordinate which is what the hardware expects to receive in a
-// ld/st op.
-Value *
-NV50LoweringPreSSA::processSurfaceCoords(TexInstruction *su)
-{
- const int slot = su->tex.r;
- const int dim = su->tex.target.getDim();
- const int arg = dim + (su->tex.target.isArray() || su->tex.target.isCube());
-
- const TexInstruction::ImgFormatDesc *format = su->tex.format;
- const uint16_t bytes = (format->bits[0] + format->bits[1] +
- format->bits[2] + format->bits[3]) / 8;
- uint16_t shift = ffs(bytes) - 1;
-
- // Buffer sizes don't necessarily fit in 16-bit values
- if (su->tex.target == TEX_TARGET_BUFFER) {
- return bld.mkOp2v(OP_SHL, TYPE_U32, bld.getSSA(),
- su->getSrc(0), bld.loadImm(NULL, (uint32_t)shift));
- }
-
- // For buffers, we just need the byte offset. And for 2d buffers we want
- // the x coordinate in bytes as well.
- Value *coords[3] = {};
- for (int i = 0; i < arg; i++) {
- Value *src[2];
- bld.mkSplit(src, 2, su->getSrc(i));
- coords[i] = src[0];
- // For 1d-images, we want the y coord to be 0, which it will be here.
- if (i == 0)
- coords[1] = src[1];
- }
-
- coords[0] = bld.mkOp2v(OP_SHL, TYPE_U16, bld.getSSA(2),
- coords[0], bld.loadImm(NULL, shift));
-
- if (su->tex.target.isMS()) {
- Value *ms_x = loadSuInfo16(slot, NV50_SU_INFO_MS(0));
- Value *ms_y = loadSuInfo16(slot, NV50_SU_INFO_MS(1));
- coords[0] = bld.mkOp2v(OP_SHL, TYPE_U16, bld.getSSA(2), coords[0], ms_x);
- coords[1] = bld.mkOp2v(OP_SHL, TYPE_U16, bld.getSSA(2), coords[1], ms_y);
- }
-
- // If there are more dimensions, we just want the y-offset. But that needs
- // to be adjusted up by the y-stride for array images.
- if (su->tex.target.isArray() || su->tex.target.isCube()) {
- Value *index = coords[dim];
- Value *height = loadSuInfo16(slot, NV50_SU_INFO_STRIDE_Y);
- Instruction *mul = bld.mkOp2(OP_MUL, TYPE_U32, bld.getSSA(4), index, height);
- mul->sType = TYPE_U16;
- Value *muls[2];
- bld.mkSplit(muls, 2, mul->getDef(0));
- if (dim > 1)
- coords[1] = bld.mkOp2v(OP_ADD, TYPE_U16, bld.getSSA(2), coords[1], muls[0]);
- else
- coords[1] = muls[0];
- }
-
- // 3d is special-cased. Note that a single "slice" of a 3d image may
- // also be attached as 2d, so we have to do the same 3d processing for
- // 2d as well, just in case. In order to remap a 3d image onto a 2d
- // image, we have to retile it "by hand".
- if (su->tex.target == TEX_TARGET_3D || su->tex.target == TEX_TARGET_2D) {
- Value *z = loadSuInfo16(slot, NV50_SU_INFO_OFFSET_Z);
- Value *y_size_aligned = loadSuInfo16(slot, NV50_SU_INFO_STRIDE_Y);
- // Add the z coordinate for actual 3d-images
- if (dim > 2)
- coords[2] = bld.mkOp2v(OP_ADD, TYPE_U16, bld.getSSA(2), z, coords[2]);
- else
- coords[2] = z;
-
- // Compute the surface parameters from tile shifts
- Value *tile_shift[3];
- Value *tile_size[3];
- Value *tile_mask[3];
- // We only ever use one kind of X-tiling.
- tile_shift[0] = bld.loadImm(NULL, (uint16_t)6);
- tile_size[0] = bld.loadImm(NULL, (uint16_t)64);
- tile_mask[0] = bld.loadImm(NULL, (uint16_t)63);
- // Fetch the "real" tiling parameters of the underlying surface
- for (int i = 1; i < 3; i++) {
- tile_shift[i] = loadSuInfo16(slot, NV50_SU_INFO_TILE_SHIFT(i));
- tile_size[i] = bld.mkOp2v(OP_SHL, TYPE_U16, bld.getSSA(2), bld.loadImm(NULL, (uint16_t)1), tile_shift[i]);
- tile_mask[i] = bld.mkOp2v(OP_ADD, TYPE_U16, bld.getSSA(2), tile_size[i], bld.loadImm(NULL, (uint16_t)-1));
- }
-
- // Compute the location of given coordinate, both inside the tile as
- // well as which (linearly-laid out) tile it's in.
- Value *coord_in_tile[3];
- Value *tile[3];
- for (int i = 0; i < 3; i++) {
- coord_in_tile[i] = bld.mkOp2v(OP_AND, TYPE_U16, bld.getSSA(2), coords[i], tile_mask[i]);
- tile[i] = bld.mkOp2v(OP_SHR, TYPE_U16, bld.getSSA(2), coords[i], tile_shift[i]);
- }
-
- // Based on the "real" tiling parameters, compute x/y coordinates in the
- // larger surface with 2d tiling that was supplied to the hardware. This
- // was determined and verified with the help of the tiling pseudocode in
- // the envytools docs.
- //
- // adj_x = x_coord_in_tile + x_tile * x_tile_size * z_tile_size +
- // z_coord_in_tile * x_tile_size
- // adj_y = y_coord_in_tile + y_tile * y_tile_size +
- // z_tile * y_tile_size * y_tiles
- //
- // Note: STRIDE_Y = y_tile_size * y_tiles
-
- coords[0] = bld.mkOp2v(
- OP_ADD, TYPE_U16, bld.getSSA(2),
- bld.mkOp2v(OP_ADD, TYPE_U16, bld.getSSA(2),
- coord_in_tile[0],
- bld.mkOp2v(OP_SHL, TYPE_U16, bld.getSSA(2),
- tile[0],
- bld.mkOp2v(OP_ADD, TYPE_U16, bld.getSSA(2),
- tile_shift[2], tile_shift[0]))),
- bld.mkOp2v(OP_SHL, TYPE_U16, bld.getSSA(2),
- coord_in_tile[2], tile_shift[0]));
-
- Instruction *mul = bld.mkOp2(OP_MUL, TYPE_U32, bld.getSSA(4),
- tile[2], y_size_aligned);
- mul->sType = TYPE_U16;
- Value *muls[2];
- bld.mkSplit(muls, 2, mul->getDef(0));
-
- coords[1] = bld.mkOp2v(
- OP_ADD, TYPE_U16, bld.getSSA(2),
- muls[0],
- bld.mkOp2v(OP_ADD, TYPE_U16, bld.getSSA(2),
- coord_in_tile[1],
- bld.mkOp2v(OP_SHL, TYPE_U16, bld.getSSA(2),
- tile[1], tile_shift[1])));
- }
-
- return bld.mkOp2v(OP_MERGE, TYPE_U32, bld.getSSA(), coords[0], coords[1]);
-}
-
-// This is largely a copy of NVC0LoweringPass::convertSurfaceFormat, but
-// adjusted to make use of 16-bit math where possible.
-bool
-NV50LoweringPreSSA::handleSULDP(TexInstruction *su)
-{
- const int slot = su->tex.r;
- assert(!su->getIndirectR());
-
- bld.setPosition(su, false);
-
- const TexInstruction::ImgFormatDesc *format = su->tex.format;
- const int bytes = (su->tex.format->bits[0] +
- su->tex.format->bits[1] +
- su->tex.format->bits[2] +
- su->tex.format->bits[3]) / 8;
- DataType ty = typeOfSize(bytes);
-
- Value *coord = processSurfaceCoords(su);
-
- Value *untypedDst[4] = {};
- Value *typedDst[4] = {};
- int i;
- for (i = 0; i < bytes / 4; i++)
- untypedDst[i] = bld.getSSA();
- if (bytes < 4)
- untypedDst[0] = bld.getSSA();
-
- for (i = 0; i < 4; i++)
- typedDst[i] = su->getDef(i);
-
- Instruction *load = bld.mkLoad(ty, NULL, bld.mkSymbol(FILE_MEMORY_GLOBAL, slot, ty, 0), coord);
- for (i = 0; i < 4 && untypedDst[i]; i++)
- load->setDef(i, untypedDst[i]);
-
- // Unpack each component into the typed dsts
- int bits = 0;
- for (int i = 0; i < 4; bits += format->bits[i], i++) {
- if (!typedDst[i])
- continue;
-
- if (i >= format->components) {
- if (format->type == FLOAT ||
- format->type == UNORM ||
- format->type == SNORM)
- bld.loadImm(typedDst[i], i == 3 ? 1.0f : 0.0f);
- else
- bld.loadImm(typedDst[i], i == 3 ? 1 : 0);
- continue;
- }
-
- // Get just that component's data into the relevant place
- if (format->bits[i] == 32)
- bld.mkMov(typedDst[i], untypedDst[i]);
- else if (format->bits[i] == 16) {
- // We can always convert directly from the appropriate half of the
- // loaded value into the typed result.
- Value *src[2];
- bld.mkSplit(src, 2, untypedDst[i / 2]);
- bld.mkCvt(OP_CVT, getShaderType(format->type), typedDst[i],
- getPackedType(format, i), src[i & 1]);
- }
- else if (format->bits[i] == 8) {
- // Same approach as for 16 bits, but we have to massage the value a
- // bit more, since we have to get the appropriate 8 bits from the
- // half-register. In all cases, we can CVT from a 8-bit source, so we
- // only have to shift when we want the upper 8 bits.
- Value *src[2], *shifted;
- bld.mkSplit(src, 2, untypedDst[0]);
- DataType packedType = getPackedType(format, i);
- if (i & 1)
- shifted = bld.mkOp2v(OP_SHR, TYPE_U16, bld.getSSA(2), src[!!(i & 2)], bld.loadImm(NULL, (uint16_t)8));
- else
- shifted = src[!!(i & 2)];
-
- bld.mkCvt(OP_CVT, getShaderType(format->type), typedDst[i],
- packedType, shifted);
- }
- else {
- // The options are 10, 11, and 2. Get it into a 32-bit reg, then
- // shift/mask. That's where it'll have to end up anyways. For signed,
- // we have to make sure to get sign-extension, so we actually have to
- // shift *up* first, and then shift down. There's no advantage to
- // AND'ing, so we don't.
- DataType ty = TYPE_U32;
- if (format->type == SNORM || format->type == SINT) {
- ty = TYPE_S32;
- }
-
- // Poor man's EXTBF
- bld.mkOp2(
- OP_SHR, ty, typedDst[i],
- bld.mkOp2v(OP_SHL, TYPE_U32, bld.getSSA(), untypedDst[0], bld.loadImm(NULL, 32 - bits - format->bits[i])),
- bld.loadImm(NULL, 32 - format->bits[i]));
-
- // If the stored data is already in the appropriate type, we don't
- // have to do anything. Convert to float for the *NORM formats.
- if (format->type == UNORM || format->type == SNORM)
- bld.mkCvt(OP_CVT, TYPE_F32, typedDst[i], TYPE_U32, typedDst[i]);
- }
-
- // Normalize / convert as necessary
- if (format->type == UNORM)
- bld.mkOp2(OP_MUL, TYPE_F32, typedDst[i], typedDst[i], bld.loadImm(NULL, 1.0f / ((1 << format->bits[i]) - 1)));
- else if (format->type == SNORM)
- bld.mkOp2(OP_MUL, TYPE_F32, typedDst[i], typedDst[i], bld.loadImm(NULL, 1.0f / ((1 << (format->bits[i] - 1)) - 1)));
- else if (format->type == FLOAT && format->bits[i] < 16) {
- // We expect the value to be in the low bits of the register, so we
- // have to shift back up.
- bld.mkOp2(OP_SHL, TYPE_U32, typedDst[i], typedDst[i], bld.loadImm(NULL, 15 - format->bits[i]));
- Value *src[2];
- bld.mkSplit(src, 2, typedDst[i]);
- bld.mkCvt(OP_CVT, TYPE_F32, typedDst[i], TYPE_F16, src[0]);
- }
- }
-
- if (format->bgra) {
- std::swap(typedDst[0], typedDst[2]);
- }
-
- bld.getBB()->remove(su);
- return true;
-}
-
-bool
-NV50LoweringPreSSA::handleSUREDP(TexInstruction *su)
-{
- const int slot = su->tex.r;
- const int dim = su->tex.target.getDim();
- const int arg = dim + (su->tex.target.isArray() || su->tex.target.isCube());
- assert(!su->getIndirectR());
-
- bld.setPosition(su, false);
-
- Value *coord = processSurfaceCoords(su);
-
- // This is guaranteed to be a 32-bit format. So there's nothing to
- // pack/unpack.
- Instruction *atom = bld.mkOp2(
- OP_ATOM, su->dType, su->getDef(0),
- bld.mkSymbol(FILE_MEMORY_GLOBAL, slot, TYPE_U32, 0), su->getSrc(arg));
- if (su->subOp == NV50_IR_SUBOP_ATOM_CAS)
- atom->setSrc(2, su->getSrc(arg + 1));
- atom->setIndirect(0, 0, coord);
- atom->subOp = su->subOp;
-
- bld.getBB()->remove(su);
- return true;
-}
-
-bool
-NV50LoweringPreSSA::handleSUSTP(TexInstruction *su)
-{
- const int slot = su->tex.r;
- const int dim = su->tex.target.getDim();
- const int arg = dim + (su->tex.target.isArray() || su->tex.target.isCube());
- assert(!su->getIndirectR());
-
- bld.setPosition(su, false);
-
- const TexInstruction::ImgFormatDesc *format = su->tex.format;
- const int bytes = (su->tex.format->bits[0] +
- su->tex.format->bits[1] +
- su->tex.format->bits[2] +
- su->tex.format->bits[3]) / 8;
- DataType ty = typeOfSize(bytes);
-
- Value *coord = processSurfaceCoords(su);
-
- // The packed values we will eventually store into memory
- Value *untypedDst[4] = {};
- // Each component's packed representation, in 16-bit registers (only used
- // where appropriate)
- Value *untypedDst16[4] = {};
- // The original values that are being packed
- Value *typedDst[4] = {};
- int i;
-
- for (i = 0; i < bytes / 4; i++)
- untypedDst[i] = bld.getSSA();
- for (i = 0; i < format->components; i++)
- untypedDst16[i] = bld.getSSA(2);
- // Make sure we get at least one of each value allocated for the
- // super-narrow formats.
- if (bytes < 4)
- untypedDst[0] = bld.getSSA();
- if (bytes < 2)
- untypedDst16[0] = bld.getSSA(2);
-
- for (i = 0; i < 4; i++) {
- typedDst[i] = bld.getSSA();
- bld.mkMov(typedDst[i], su->getSrc(arg + i));
- }
-
- if (format->bgra) {
- std::swap(typedDst[0], typedDst[2]);
- }
-
- // Pack each component into the untyped dsts.
- int bits = 0;
- for (int i = 0; i < format->components; bits += format->bits[i], i++) {
- // Un-normalize / convert as necessary
- if (format->type == UNORM)
- bld.mkOp2(OP_MUL, TYPE_F32, typedDst[i], typedDst[i], bld.loadImm(NULL, 1.0f * ((1 << format->bits[i]) - 1)));
- else if (format->type == SNORM)
- bld.mkOp2(OP_MUL, TYPE_F32, typedDst[i], typedDst[i], bld.loadImm(NULL, 1.0f * ((1 << (format->bits[i] - 1)) - 1)));
-
- // There is nothing to convert/pack for 32-bit values
- if (format->bits[i] == 32) {
- bld.mkMov(untypedDst[i], typedDst[i]);
- continue;
- }
-
- // The remainder of the cases will naturally want to deal in 16-bit
- // registers. We will put these into untypedDst16 and then merge them
- // together later.
- if (format->type == FLOAT && format->bits[i] < 16) {
- bld.mkCvt(OP_CVT, TYPE_F16, untypedDst16[i], TYPE_F32, typedDst[i]);
- bld.mkOp2(OP_SHR, TYPE_U16, untypedDst16[i], untypedDst16[i], bld.loadImm(NULL, (uint16_t)(15 - format->bits[i])));
-
- // For odd bit sizes, it's easier to pack it into the final
- // destination directly.
- Value *tmp = bld.getSSA();
- bld.mkCvt(OP_CVT, TYPE_U32, tmp, TYPE_U16, untypedDst16[i]);
- if (i == 0) {
- untypedDst[0] = tmp;
- } else {
- bld.mkOp2(OP_SHL, TYPE_U32, tmp, tmp, bld.loadImm(NULL, bits));
- bld.mkOp2(OP_OR, TYPE_U32, untypedDst[0], untypedDst[0], tmp);
- }
- } else if (format->bits[i] == 16) {
- // We can always convert the shader value into the packed value
- // directly here
- bld.mkCvt(OP_CVT, getPackedType(format, i), untypedDst16[i],
- getShaderType(format->type), typedDst[i]);
- } else if (format->bits[i] < 16) {
- DataType packedType = getPackedType(format, i);
- DataType shaderType = getShaderType(format->type);
- // We can't convert F32 to U8/S8 directly, so go to U16/S16 first.
- if (shaderType == TYPE_F32 && typeSizeof(packedType) == 1) {
- packedType = format->type == SNORM ? TYPE_S16 : TYPE_U16;
- }
- bld.mkCvt(OP_CVT, packedType, untypedDst16[i], shaderType, typedDst[i]);
- // TODO: clamp for 10- and 2-bit sizes. Also, due to the oddness of
- // the size, it's easier to dump them into a 32-bit value and OR
- // everything later.
- if (format->bits[i] != 8) {
- // Restrict value to the appropriate bits (although maybe supposed
- // to clamp instead?)
- bld.mkOp2(OP_AND, TYPE_U16, untypedDst16[i], untypedDst16[i], bld.loadImm(NULL, (uint16_t)((1 << format->bits[i]) - 1)));
- // And merge into final packed value
- Value *tmp = bld.getSSA();
- bld.mkCvt(OP_CVT, TYPE_U32, tmp, TYPE_U16, untypedDst16[i]);
- if (i == 0) {
- untypedDst[0] = tmp;
- } else {
- bld.mkOp2(OP_SHL, TYPE_U32, tmp, tmp, bld.loadImm(NULL, bits));
- bld.mkOp2(OP_OR, TYPE_U32, untypedDst[0], untypedDst[0], tmp);
- }
- } else if (i & 1) {
- // Shift the 8-bit value up (so that it can be OR'd later)
- bld.mkOp2(OP_SHL, TYPE_U16, untypedDst16[i], untypedDst16[i], bld.loadImm(NULL, (uint16_t)(bits % 16)));
- } else if (packedType != TYPE_U8) {
- // S8 (or the *16 if converted from float) will all have high bits
- // set, so AND them out.
- bld.mkOp2(OP_AND, TYPE_U16, untypedDst16[i], untypedDst16[i], bld.loadImm(NULL, (uint16_t)0xff));
- }
- }
- }
-
- // OR pairs of 8-bit values together (into the even value)
- if (format->bits[0] == 8) {
- for (i = 0; i < 2 && untypedDst16[2 * i] && untypedDst16[2 * i + 1]; i++)
- bld.mkOp2(OP_OR, TYPE_U16, untypedDst16[2 * i], untypedDst16[2 * i], untypedDst16[2 * i + 1]);
- }
-
- // We'll always want to have at least a 32-bit source register for the store
- Instruction *merge = bld.mkOp(OP_MERGE, bytes < 4 ? TYPE_U32 : ty, bld.getSSA(bytes < 4 ? 4 : bytes));
- if (format->bits[0] == 32) {
- for (i = 0; i < 4 && untypedDst[i]; i++)
- merge->setSrc(i, untypedDst[i]);
- } else if (format->bits[0] == 16) {
- for (i = 0; i < 4 && untypedDst16[i]; i++)
- merge->setSrc(i, untypedDst16[i]);
- if (i == 1)
- merge->setSrc(i, bld.getSSA(2));
- } else if (format->bits[0] == 8) {
- for (i = 0; i < 2 && untypedDst16[2 * i]; i++)
- merge->setSrc(i, untypedDst16[2 * i]);
- if (i == 1)
- merge->setSrc(i, bld.getSSA(2));
- } else {
- merge->setSrc(0, untypedDst[0]);
- }
-
- bld.mkStore(OP_STORE, ty, bld.mkSymbol(FILE_MEMORY_GLOBAL, slot, TYPE_U32, 0), coord, merge->getDef(0));
-
- bld.getBB()->remove(su);
- return true;
-}
-
-bool
NV50LoweringPreSSA::handlePFETCH(Instruction *i)
{
assert(prog->getType() == Program::TYPE_GEOMETRY);
@@ -2203,8 +1333,6 @@ NV50LoweringPreSSA::visit(Instruction *i)
return handleTXD(i->asTex());
case OP_TXLQ:
return handleTXLQ(i->asTex());
- case OP_TXQ:
- return handleTXQ(i->asTex());
case OP_EX2:
bld.mkOp1(OP_PREEX2, TYPE_F32, i->getDef(0), i->getSrc(0));
i->setSrc(0, i->getDef(0));
@@ -2225,21 +1353,6 @@ NV50LoweringPreSSA::visit(Instruction *i)
return handleEXPORT(i);
case OP_LOAD:
return handleLOAD(i);
- case OP_MEMBAR:
- return handleMEMBAR(i);
- case OP_ATOM:
- case OP_STORE:
- return handleLDST(i);
- case OP_SULDP:
- return handleSULDP(i->asTex());
- case OP_SUSTP:
- return handleSUSTP(i->asTex());
- case OP_SUREDP:
- return handleSUREDP(i->asTex());
- case OP_SUQ:
- return handleSUQ(i->asTex());
- case OP_BUFQ:
- return handleBUFQ(i);
case OP_RDSV:
return handleRDSV(i);
case OP_WRSV:
diff --git a/lib/mesa/src/gallium/drivers/nouveau/codegen/nv50_ir_ssa.cpp b/lib/mesa/src/gallium/drivers/nouveau/codegen/nv50_ir_ssa.cpp
index 3d25ad928..2e432349f 100644
--- a/lib/mesa/src/gallium/drivers/nouveau/codegen/nv50_ir_ssa.cpp
+++ b/lib/mesa/src/gallium/drivers/nouveau/codegen/nv50_ir_ssa.cpp
@@ -87,7 +87,6 @@ DominatorTree::DominatorTree(Graph *cfgraph) : cfg(cfgraph),
LABEL(i) = i;
SEMI(i) = ANCESTOR(i) = -1;
}
- assert(i == count);
build();
@@ -169,7 +168,7 @@ void DominatorTree::build()
do {
p = 0;
for (v = 1; v < count; ++v) {
- nw = &BasicBlock::get(vert[DOM(v)])->dom;
+ nw = &BasicBlock::get(vert[DOM(v)])->dom;;
nv = &BasicBlock::get(vert[v])->dom;
if (nw->getGraph() && !nv->getGraph()) {
++p;
diff --git a/lib/mesa/src/gallium/drivers/nouveau/codegen/nv50_ir_target_nv50.cpp b/lib/mesa/src/gallium/drivers/nouveau/codegen/nv50_ir_target_nv50.cpp
index b9c3746ad..f3ddcaa51 100644
--- a/lib/mesa/src/gallium/drivers/nouveau/codegen/nv50_ir_target_nv50.cpp
+++ b/lib/mesa/src/gallium/drivers/nouveau/codegen/nv50_ir_target_nv50.cpp
@@ -66,7 +66,7 @@ TargetNV50::getBuiltinOffset(int builtin) const
return 0;
}
-struct nv50_opProperties
+struct opProperties
{
operation op;
unsigned int mNeg : 4;
@@ -79,7 +79,7 @@ struct nv50_opProperties
unsigned int fImm : 3;
};
-static const struct nv50_opProperties _initProps[] =
+static const struct opProperties _initProps[] =
{
// neg abs not sat c[] s[], a[], imm
{ OP_ADD, 0x3, 0x0, 0x0, 0x8, 0x2, 0x1, 0x1, 0x2 },
@@ -99,7 +99,6 @@ static const struct nv50_opProperties _initProps[] =
{ OP_SET, 0x3, 0x3, 0x0, 0x0, 0x2, 0x1, 0x1, 0x0 },
{ OP_PREEX2, 0x1, 0x1, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0 },
{ OP_PRESIN, 0x1, 0x1, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0 },
- { OP_EX2, 0x0, 0x0, 0x0, 0x8, 0x0, 0x0, 0x0, 0x0 },
{ OP_LG2, 0x1, 0x1, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0 },
{ OP_RCP, 0x1, 0x1, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0 },
{ OP_RSQ, 0x1, 0x1, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0 },
@@ -111,15 +110,15 @@ void TargetNV50::initOpInfo()
{
unsigned int i, j;
- static const operation commutativeList[] =
+ static const uint32_t commutative[(OP_LAST + 31) / 32] =
{
- OP_ADD, OP_MUL, OP_MAD, OP_FMA, OP_AND, OP_OR, OP_XOR, OP_MAX, OP_MIN,
- OP_SET_AND, OP_SET_OR, OP_SET_XOR, OP_SET, OP_SELP, OP_SLCT
+ // ADD,MAD,MUL,AND,OR,XOR,MAX,MIN
+ 0x0670ca00, 0x0000003f, 0x00000000, 0x00000000
};
- static const operation shortFormList[] =
+ static const uint32_t shortForm[(OP_LAST + 31) / 32] =
{
- OP_MOV, OP_ADD, OP_SUB, OP_MUL, OP_MAD, OP_SAD, OP_RCP, OP_LINTERP,
- OP_PINTERP, OP_TEX, OP_TXF
+ // MOV,ADD,SUB,MUL,MAD,SAD,L/PINTERP,RCP,TEX,TXF
+ 0x00014e40, 0x00000040, 0x00000930, 0x00000000
};
static const operation noDestList[] =
{
@@ -156,23 +155,19 @@ void TargetNV50::initOpInfo()
opInfo[i].hasDest = 1;
opInfo[i].vector = (i >= OP_TEX && i <= OP_TEXCSAA);
- opInfo[i].commutative = false; /* set below */
+ opInfo[i].commutative = (commutative[i / 32] >> (i % 32)) & 1;
opInfo[i].pseudo = (i < OP_MOV);
opInfo[i].predicate = !opInfo[i].pseudo;
opInfo[i].flow = (i >= OP_BRA && i <= OP_JOIN);
- opInfo[i].minEncSize = 8; /* set below */
+ opInfo[i].minEncSize = (shortForm[i / 32] & (1 << (i % 32))) ? 4 : 8;
}
- for (i = 0; i < ARRAY_SIZE(commutativeList); ++i)
- opInfo[commutativeList[i]].commutative = true;
- for (i = 0; i < ARRAY_SIZE(shortFormList); ++i)
- opInfo[shortFormList[i]].minEncSize = 4;
- for (i = 0; i < ARRAY_SIZE(noDestList); ++i)
+ for (i = 0; i < sizeof(noDestList) / sizeof(noDestList[0]); ++i)
opInfo[noDestList[i]].hasDest = 0;
- for (i = 0; i < ARRAY_SIZE(noPredList); ++i)
+ for (i = 0; i < sizeof(noPredList) / sizeof(noPredList[0]); ++i)
opInfo[noPredList[i]].predicate = 0;
- for (i = 0; i < ARRAY_SIZE(_initProps); ++i) {
- const struct nv50_opProperties *prop = &_initProps[i];
+ for (i = 0; i < sizeof(_initProps) / sizeof(_initProps[0]); ++i) {
+ const struct opProperties *prop = &_initProps[i];
for (int s = 0; s < 3; ++s) {
if (prop->mNeg & (1 << s))
@@ -203,16 +198,14 @@ TargetNV50::getFileSize(DataFile file) const
{
switch (file) {
case FILE_NULL: return 0;
- case FILE_GPR: return 254; // in 16-bit units **
+ case FILE_GPR: return 256; // in 16-bit units **
case FILE_PREDICATE: return 0;
case FILE_FLAGS: return 4;
case FILE_ADDRESS: return 4;
- case FILE_BARRIER: return 0;
case FILE_IMMEDIATE: return 0;
case FILE_MEMORY_CONST: return 65536;
case FILE_SHADER_INPUT: return 0x200;
case FILE_SHADER_OUTPUT: return 0x200;
- case FILE_MEMORY_BUFFER: return 0xffffffff;
case FILE_MEMORY_GLOBAL: return 0xffffffff;
case FILE_MEMORY_SHARED: return 16 << 10;
case FILE_MEMORY_LOCAL: return 48 << 10;
@@ -252,18 +245,15 @@ TargetNV50::getSVAddress(DataFile shaderFile, const Symbol *sym) const
return shaderFile == FILE_SHADER_INPUT ? 0x18 :
sysvalLocation[sym->reg.data.sv.sv];
case SV_NCTAID:
- return sym->reg.data.sv.index >= 2 ? 0x10 : 0x8 + 2 * sym->reg.data.sv.index;
+ return 0x8 + 2 * sym->reg.data.sv.index;
case SV_CTAID:
- return sym->reg.data.sv.index >= 2 ? 0x12 : 0xc + 2 * sym->reg.data.sv.index;
+ return 0xc + 2 * sym->reg.data.sv.index;
case SV_NTID:
return 0x2 + 2 * sym->reg.data.sv.index;
case SV_TID:
- case SV_COMBINED_TID:
return 0;
case SV_SAMPLE_POS:
return 0; /* sample position is handled differently */
- case SV_THREAD_KILL:
- return 0;
default:
return sysvalLocation[sym->reg.data.sv.sv];
}
@@ -278,16 +268,6 @@ TargetNV50::insnCanLoad(const Instruction *i, int s,
{
DataFile sf = ld->src(0).getFile();
- // immediate 0 can be represented by GPR $r63/$r127
- // this does not work with global memory ld/st/atom
- if (sf == FILE_IMMEDIATE && ld->getSrc(0)->reg.data.u64 == 0)
- return (!i->isPseudo() &&
- !i->asTex() &&
- i->op != OP_EXPORT &&
- i->op != OP_STORE &&
- ((i->op != OP_ATOM && i->op != OP_LOAD) ||
- i->src(0).getFile() != FILE_MEMORY_GLOBAL));
-
if (sf == FILE_IMMEDIATE && (i->predSrc >= 0 || i->flagsDef >= 0))
return false;
if (s >= opInfo[i->op].srcNr)
@@ -362,11 +342,8 @@ TargetNV50::insnCanLoad(const Instruction *i, int s,
ldSize = typeSizeof(ld->dType);
}
- if (sf == FILE_IMMEDIATE) {
- if (ldSize == 2 && (i->op == OP_AND || i->op == OP_OR || i->op == OP_XOR))
- return false;
- return ldSize <= 4;
- }
+ if (sf == FILE_IMMEDIATE)
+ return true;
// Check if memory access is encodable:
@@ -402,29 +379,12 @@ TargetNV50::insnCanLoad(const Instruction *i, int s,
}
bool
-TargetNV50::insnCanLoadOffset(const Instruction *i, int s, int offset) const
-{
- if (!i->src(s).isIndirect(0))
- return true;
- offset += i->src(s).get()->reg.data.offset;
- if (i->op == OP_LOAD || i->op == OP_STORE || i->op == OP_ATOM) {
- // There are some restrictions in theory, but in practice they're never
- // going to be hit. However offsets on global/shared memory are just
- // plain not supported.
- return i->src(s).getFile() != FILE_MEMORY_GLOBAL &&
- i->src(s).getFile() != FILE_MEMORY_SHARED;
- }
- return offset >= 0 && offset <= (int32_t)(127 * i->src(s).get()->reg.size);
-}
-
-bool
TargetNV50::isAccessSupported(DataFile file, DataType ty) const
{
if (ty == TYPE_B96 || ty == TYPE_NONE)
return false;
if (typeSizeof(ty) > 4)
- return (file == FILE_MEMORY_LOCAL) || (file == FILE_MEMORY_GLOBAL) ||
- (file == FILE_MEMORY_BUFFER);
+ return (file == FILE_MEMORY_LOCAL) || (file == FILE_MEMORY_GLOBAL);
return true;
}
@@ -453,8 +413,6 @@ TargetNV50::isOpSupported(operation op, DataType ty) const
case OP_EXTBF:
case OP_EXIT: // want exit modifier instead (on NOP if required)
case OP_MEMBAR:
- case OP_SHLADD:
- case OP_XMAD:
return false;
case OP_SAD:
return ty == TYPE_S32;
@@ -496,7 +454,7 @@ TargetNV50::isModSupported(const Instruction *insn, int s, Modifier mod) const
return false;
}
}
- if (s >= opInfo[insn->op].srcNr || s >= 3)
+ if (s >= 3)
return false;
return (mod & Modifier(opInfo[insn->op].srcMods[s])) == mod;
}
@@ -529,7 +487,6 @@ int TargetNV50::getLatency(const Instruction *i) const
switch (i->src(0).getFile()) {
case FILE_MEMORY_LOCAL:
case FILE_MEMORY_GLOBAL:
- case FILE_MEMORY_BUFFER:
return 100; // really 400 to 800
default:
return 22;
@@ -595,24 +552,21 @@ recordLocation(uint16_t *locs, uint8_t *masks,
}
void
-TargetNV50::parseDriverInfo(const struct nv50_ir_prog_info *info,
- const struct nv50_ir_prog_info_out *info_out)
+TargetNV50::parseDriverInfo(const struct nv50_ir_prog_info *info)
{
unsigned int i;
- for (i = 0; i < info_out->numOutputs; ++i)
- recordLocation(sysvalLocation, NULL, &info_out->out[i]);
- for (i = 0; i < info_out->numInputs; ++i)
- recordLocation(sysvalLocation, &wposMask, &info_out->in[i]);
- for (i = 0; i < info_out->numSysVals; ++i)
- recordLocation(sysvalLocation, NULL, &info_out->sv[i]);
+ for (i = 0; i < info->numOutputs; ++i)
+ recordLocation(sysvalLocation, NULL, &info->out[i]);
+ for (i = 0; i < info->numInputs; ++i)
+ recordLocation(sysvalLocation, &wposMask, &info->in[i]);
+ for (i = 0; i < info->numSysVals; ++i)
+ recordLocation(sysvalLocation, NULL, &info->sv[i]);
if (sysvalLocation[SV_POSITION] >= 0x200) {
// not assigned by driver, but we need it internally
wposMask = 0x8;
sysvalLocation[SV_POSITION] = 0;
}
-
- Target::parseDriverInfo(info, info_out);
}
} // namespace nv50_ir
diff --git a/lib/mesa/src/gallium/drivers/nouveau/codegen/nv50_ir_target_nv50.h b/lib/mesa/src/gallium/drivers/nouveau/codegen/nv50_ir_target_nv50.h
index caf66b269..0cbf180d0 100644
--- a/lib/mesa/src/gallium/drivers/nouveau/codegen/nv50_ir_target_nv50.h
+++ b/lib/mesa/src/gallium/drivers/nouveau/codegen/nv50_ir_target_nv50.h
@@ -42,13 +42,10 @@ public:
virtual void getBuiltinCode(const uint32_t **code, uint32_t *size) const;
- virtual void parseDriverInfo(const struct nv50_ir_prog_info *,
- const struct nv50_ir_prog_info_out *);
+ virtual void parseDriverInfo(const struct nv50_ir_prog_info *);
virtual bool insnCanLoad(const Instruction *insn, int s,
const Instruction *ld) const;
- virtual bool insnCanLoadOffset(const Instruction *insn, int s,
- int offset) const;
virtual bool isOpSupported(operation, DataType) const;
virtual bool isAccessSupported(DataFile, DataType) const;
virtual bool isModSupported(const Instruction *, int s, Modifier) const;
diff --git a/lib/mesa/src/gallium/drivers/nouveau/codegen/nv50_ir_target_nvc0.h b/lib/mesa/src/gallium/drivers/nouveau/codegen/nv50_ir_target_nvc0.h
index 7808164f4..3c5c74804 100644
--- a/lib/mesa/src/gallium/drivers/nouveau/codegen/nv50_ir_target_nvc0.h
+++ b/lib/mesa/src/gallium/drivers/nouveau/codegen/nv50_ir_target_nvc0.h
@@ -31,15 +31,11 @@ namespace nv50_ir {
#define NVC0_BUILTIN_COUNT 4
-struct nvc0_opProperties;
-
class TargetNVC0 : public Target
{
public:
TargetNVC0(unsigned int chipset);
- void initProps(const struct nvc0_opProperties *props, int size);
-
virtual CodeEmitter *getCodeEmitter(Program::Type);
CodeEmitter *createCodeEmitterNVC0(Program::Type);
@@ -52,8 +48,6 @@ public:
virtual bool insnCanLoad(const Instruction *insn, int s,
const Instruction *ld) const;
- virtual bool insnCanLoadOffset(const Instruction *insn, int s,
- int offset) const;
virtual bool isOpSupported(operation, DataType) const;
virtual bool isAccessSupported(DataFile, DataType) const;
virtual bool isModSupported(const Instruction *, int s, Modifier) const;
diff --git a/lib/mesa/src/gallium/drivers/nouveau/codegen/nv50_ir_util.cpp b/lib/mesa/src/gallium/drivers/nouveau/codegen/nv50_ir_util.cpp
index dc4ebd51a..d26acb304 100644
--- a/lib/mesa/src/gallium/drivers/nouveau/codegen/nv50_ir_util.cpp
+++ b/lib/mesa/src/gallium/drivers/nouveau/codegen/nv50_ir_util.cpp
@@ -297,8 +297,8 @@ void BitSet::fill(uint32_t val)
unsigned int i;
for (i = 0; i < (size + 31) / 32; ++i)
data[i] = val;
- if (val && i)
- data[i - 1] &= (1 << (size % 32)) - 1;
+ if (val)
+ data[i] &= ~(0xffffffff << (size % 32)); // BE ?
}
void BitSet::setOr(BitSet *pA, BitSet *pB)
@@ -311,12 +311,12 @@ void BitSet::setOr(BitSet *pA, BitSet *pB)
}
}
-int BitSet::findFreeRange(unsigned int count, unsigned int max) const
+int BitSet::findFreeRange(unsigned int count) const
{
const uint32_t m = (1 << count) - 1;
- int pos = max;
+ int pos = size;
unsigned int i;
- const unsigned int end = (max + 31) / 32;
+ const unsigned int end = (size + 31) / 32;
if (count == 1) {
for (i = 0; i < end; ++i) {
@@ -365,15 +365,9 @@ int BitSet::findFreeRange(unsigned int count, unsigned int max) const
}
}
}
-
- // If we couldn't find a position, we can have a left-over -1 in pos. Make
- // sure to abort in such a case.
- if (pos < 0)
- return -1;
-
pos += i * 32;
- return ((pos + count) <= max) ? pos : -1;
+ return ((pos + count) <= size) ? pos : -1;
}
void BitSet::print() const
diff --git a/lib/mesa/src/gallium/drivers/nouveau/codegen/nv50_ir_util.h b/lib/mesa/src/gallium/drivers/nouveau/codegen/nv50_ir_util.h
index b1766f482..fa2c4804a 100644
--- a/lib/mesa/src/gallium/drivers/nouveau/codegen/nv50_ir_util.h
+++ b/lib/mesa/src/gallium/drivers/nouveau/codegen/nv50_ir_util.h
@@ -36,14 +36,14 @@
#include "util/u_inlines.h"
#include "util/u_memory.h"
-#define ERROR(args...) _debug_printf("ERROR: " args)
-#define WARN(args...) _debug_printf("WARNING: " args)
-#define INFO(args...) _debug_printf(args)
+#define ERROR(args...) debug_printf("ERROR: " args)
+#define WARN(args...) debug_printf("WARNING: " args)
+#define INFO(args...) debug_printf(args)
#define INFO_DBG(m, f, args...) \
do { \
if (m & NV50_IR_DEBUG_##f) \
- _debug_printf(args); \
+ debug_printf(args); \
} while(0)
#define FATAL(args...) \
@@ -94,11 +94,7 @@ public:
virtual void reset() { assert(0); } // only for graph iterators
};
-#if __cplusplus >= 201103L
-typedef std::unique_ptr<Iterator> IteratorRef;
-#else
typedef std::auto_ptr<Iterator> IteratorRef;
-#endif
class ManipIterator : public Iterator
{
@@ -145,7 +141,7 @@ public:
#define DLLIST_EMPTY(__list) ((__list)->next == (__list))
#define DLLIST_FOR_EACH(list, it) \
- for (DLList::Iterator it = (list)->iterator(); !(it).end(); (it).next())
+ for (DLList::Iterator (it) = (list)->iterator(); !(it).end(); (it).next())
class DLList
{
@@ -203,7 +199,7 @@ public:
virtual void erase();
virtual bool insert(void *data);
- // move item to another list, no consistency with its iterators though
+ // move item to a another list, no consistency with its iterators though
void moveToList(DLList&);
private:
@@ -539,11 +535,8 @@ public:
return data[i / 32] & (((1 << n) - 1) << (i % 32));
}
- // Find a range of count (<= 32) clear bits aligned to roundup_pow2(count).
- int findFreeRange(unsigned int count, unsigned int max) const;
- inline int findFreeRange(unsigned int count) const {
- return findFreeRange(count, size);
- }
+ // Find a range of size (<= 32) clear bits aligned to roundup_pow2(size).
+ int findFreeRange(unsigned int size) const;
BitSet& operator|=(const BitSet&);