diff options
author | Jonathan Gray <jsg@cvs.openbsd.org> | 2015-11-22 02:45:45 +0000 |
---|---|---|
committer | Jonathan Gray <jsg@cvs.openbsd.org> | 2015-11-22 02:45:45 +0000 |
commit | b7ab2ee0fa1e6e04a545a9bd2088ac621c810081 (patch) | |
tree | db90836dcf322d66f4369cb79b21ec5e68986925 /lib/mesa/src/gallium/drivers | |
parent | f00235c070468f96521cd88ebc8919fa0cb89a25 (diff) |
import Mesa 11.0.6
Diffstat (limited to 'lib/mesa/src/gallium/drivers')
19 files changed, 192 insertions, 2119 deletions
diff --git a/lib/mesa/src/gallium/drivers/nouveau/codegen/lib/Makefile b/lib/mesa/src/gallium/drivers/nouveau/codegen/lib/Makefile index 115f6d0c0..06d1979d8 100644 --- a/lib/mesa/src/gallium/drivers/nouveau/codegen/lib/Makefile +++ b/lib/mesa/src/gallium/drivers/nouveau/codegen/lib/Makefile @@ -3,9 +3,9 @@ ENVYAS ?= envyas all: gf100.asm.h gk104.asm.h gk110.asm.h gm107.asm.h gf100.asm.h: %.asm.h: %.asm - $(ENVYAS) -a -W -mgf100 -Vgf100 $< -o $@ + $(ENVYAS) -a -W -mnvc0 -Vnvc0 $< -o $@ gk104.asm.h: %.asm.h: %.asm - $(ENVYAS) -a -W -mgf100 -Vgk104 $< -o $@ + $(ENVYAS) -a -W -mnvc0 -Vnve4 $< -o $@ gk110.asm.h: %.asm.h: %.asm $(ENVYAS) -a -W -mgk110 $< -o $@ gm107.asm.h: %.asm.h: %.asm diff --git a/lib/mesa/src/gallium/drivers/nouveau/codegen/lib/gk104.asm b/lib/mesa/src/gallium/drivers/nouveau/codegen/lib/gk104.asm index 21a6b4de6..cd65b5472 100644 --- a/lib/mesa/src/gallium/drivers/nouveau/codegen/lib/gk104.asm +++ b/lib/mesa/src/gallium/drivers/nouveau/codegen/lib/gk104.asm @@ -543,8 +543,6 @@ $p2 suldgb b32 $r3 cg zero u8 g[$r4d] $r2 $p0 $p1 suldgb b32 $r3 cv zero u8 g[$r4d] $r2 $p0 long mov b32 $r3 0x3f800000 long nop -sched 0x00 0x00 0x00 0x00 0x00 0x00 0x00 -long nop long ret @@ -556,144 +554,7 @@ long ret // SIZE: 9 * 8 bytes // gk104_rcp_f64: - // Step 1: classify input according to exponent and value, and calculate - // result for 0/inf/nan. $r2 holds the exponent value, which starts at - // bit 52 (bit 20 of the upper half) and is 11 bits in length - ext u32 $r2 $r1 0xb14 - add b32 $r3 $r2 0xffffffff - joinat #rcp_rejoin - // We want to check whether the exponent is 0 or 0x7ff (i.e. NaN, inf, - // denorm, or 0). Do this by subtracting 1 from the exponent, which will - // mean that it's > 0x7fd in those cases when doing unsigned comparison - set $p0 0x1 gt u32 $r3 0x7fd - // $r3: 0 for norms, 0x36 for denorms, -1 for others - long mov b32 $r3 0x0 - sched 0x2f 0x04 0x2d 0x2b 0x2f 0x28 0x28 - join (not $p0) nop - // Process all special values: NaN, inf, denorm, 0 - mov b32 $r3 0xffffffff - // A number is NaN if its abs value is greater than or unordered with inf - set $p0 0x1 gtu f64 abs $r0d 0x7ff0000000000000 - (not $p0) bra #rcp_inf_or_denorm_or_zero - // NaN -> NaN, the next line sets the "quiet" bit of the result. This - // behavior is both seen on the CPU and the blob - join or b32 $r1 $r1 0x80000 -rcp_inf_or_denorm_or_zero: - and b32 $r4 $r1 0x7ff00000 - // Other values with nonzero in exponent field should be inf - set $p0 0x1 eq s32 $r4 0x0 - sched 0x2b 0x04 0x2f 0x2d 0x2b 0x2f 0x20 - $p0 bra #rcp_denorm_or_zero - // +/-Inf -> +/-0 - xor b32 $r1 $r1 0x7ff00000 - join mov b32 $r0 0x0 -rcp_denorm_or_zero: - set $p0 0x1 gtu f64 abs $r0d 0x0 - $p0 bra #rcp_denorm - // +/-0 -> +/-Inf - join or b32 $r1 $r1 0x7ff00000 -rcp_denorm: - // non-0 denorms: multiply with 2^54 (the 0x36 in $r3), join with norms - mul rn f64 $r0d $r0d 0x4350000000000000 - sched 0x2f 0x28 0x2b 0x28 0x28 0x04 0x28 - join mov b32 $r3 0x36 -rcp_rejoin: - // All numbers with -1 in $r3 have their result ready in $r0d, return them - // others need further calculation - set $p0 0x1 lt s32 $r3 0x0 - $p0 bra #rcp_end - // Step 2: Before the real calculation goes on, renormalize the values to - // range [1, 2) by setting exponent field to 0x3ff (the exponent of 1) - // result in $r6d. The exponent will be recovered later. - ext u32 $r2 $r1 0xb14 - and b32 $r7 $r1 0x800fffff - add b32 $r7 $r7 0x3ff00000 - long mov b32 $r6 $r0 - sched 0x2b 0x04 0x28 0x28 0x2a 0x2b 0x2e - // Step 3: Convert new value to float (no overflow will occur due to step - // 2), calculate rcp and do newton-raphson step once - cvt rz f32 $r5 f64 $r6d - long rcp f32 $r4 $r5 - mov b32 $r0 0xbf800000 - fma rn f32 $r5 $r4 $r5 $r0 - fma rn f32 $r0 neg $r4 $r5 $r4 - // Step 4: convert result $r0 back to double, do newton-raphson steps - cvt f64 $r0d f32 $r0 - cvt f64 $r6d neg f64 $r6d - sched 0x2e 0x29 0x29 0x29 0x29 0x29 0x29 - cvt f64 $r8d f32 0x3f800000 - // 4 Newton-Raphson Steps, tmp in $r4d, result in $r0d - // The formula used here (and above) is: - // RCP_{n + 1} = 2 * RCP_{n} - x * RCP_{n} * RCP_{n} - // The following code uses 2 FMAs for each step, and it will basically - // looks like: - // tmp = -src * RCP_{n} + 1 - // RCP_{n + 1} = RCP_{n} * tmp + RCP_{n} - fma rn f64 $r4d $r6d $r0d $r8d - fma rn f64 $r0d $r0d $r4d $r0d - fma rn f64 $r4d $r6d $r0d $r8d - fma rn f64 $r0d $r0d $r4d $r0d - fma rn f64 $r4d $r6d $r0d $r8d - fma rn f64 $r0d $r0d $r4d $r0d - sched 0x29 0x20 0x28 0x28 0x28 0x28 0x28 - fma rn f64 $r4d $r6d $r0d $r8d - fma rn f64 $r0d $r0d $r4d $r0d - // Step 5: Exponent recovery and final processing - // The exponent is recovered by adding what we added to the exponent. - // Suppose we want to calculate rcp(x), but we have rcp(cx), then - // rcp(x) = c * rcp(cx) - // The delta in exponent comes from two sources: - // 1) The renormalization in step 2. The delta is: - // 0x3ff - $r2 - // 2) (For the denorm input) The 2^54 we multiplied at rcp_denorm, stored - // in $r3 - // These 2 sources are calculated in the first two lines below, and then - // added to the exponent extracted from the result above. - // Note that after processing, the new exponent may >= 0x7ff (inf) - // or <= 0 (denorm). Those cases will be handled respectively below - subr b32 $r2 $r2 0x3ff - long add b32 $r4 $r2 $r3 - ext u32 $r3 $r1 0xb14 - // New exponent in $r3 - long add b32 $r3 $r3 $r4 - add b32 $r2 $r3 0xffffffff - sched 0x28 0x2b 0x28 0x2b 0x28 0x28 0x2b - // (exponent-1) < 0x7fe (unsigned) means the result is in norm range - // (same logic as in step 1) - set $p0 0x1 lt u32 $r2 0x7fe - (not $p0) bra #rcp_result_inf_or_denorm - // Norms: convert exponents back and return - shl b32 $r4 $r4 clamp 0x14 - long add b32 $r1 $r4 $r1 - bra #rcp_end -rcp_result_inf_or_denorm: - // New exponent >= 0x7ff means that result is inf - set $p0 0x1 ge s32 $r3 0x7ff - (not $p0) bra #rcp_result_denorm - sched 0x20 0x25 0x28 0x2b 0x23 0x25 0x2f - // Infinity - and b32 $r1 $r1 0x80000000 - long mov b32 $r0 0x0 - add b32 $r1 $r1 0x7ff00000 - bra #rcp_end -rcp_result_denorm: - // Denorm result comes from huge input. The greatest possible fp64, i.e. - // 0x7fefffffffffffff's rcp is 0x0004000000000000, 1/4 of the smallest - // normal value. Other rcp result should be greater than that. If we - // set the exponent field to 1, we can recover the result by multiplying - // it with 1/2 or 1/4. 1/2 is used if the "exponent" $r3 is 0, otherwise - // 1/4 ($r3 should be -1 then). This is quite tricky but greatly simplifies - // the logic here. - set $p0 0x1 ne u32 $r3 0x0 - and b32 $r1 $r1 0x800fffff - // 0x3e800000: 1/4 - $p0 cvt f64 $r6d f32 0x3e800000 - sched 0x2f 0x28 0x2c 0x2e 0x2a 0x20 0x27 - // 0x3f000000: 1/2 - (not $p0) cvt f64 $r6d f32 0x3f000000 - add b32 $r1 $r1 0x00100000 - mul rn f64 $r0d $r0d $r6d -rcp_end: + long nop long ret // RSQ F64: Newton Raphson rsqrt(x): r_{i+1} = r_i * (1.5 - 0.5 * x * r_i * r_i) @@ -704,73 +565,13 @@ rcp_end: // SIZE: 14 * 8 bytes // gk104_rsq_f64: - // Before getting initial result rsqrt64h, two special cases should be - // handled first. - // 1. NaN: set the highest bit in mantissa so it'll be surely recognized - // as NaN in rsqrt64h - set $p0 0x1 gtu f64 abs $r0d 0x7ff0000000000000 - $p0 or b32 $r1 $r1 0x00080000 - and b32 $r2 $r1 0x7fffffff - sched 0x27 0x20 0x28 0x2c 0x25 0x28 0x28 - // 2. denorms and small normal values: using their original value will - // lose precision either at rsqrt64h or the first step in newton-raphson - // steps below. Take 2 as a threshold in exponent field, and multiply - // with 2^54 if the exponent is smaller or equal. (will multiply 2^27 - // to recover in the end) - ext u32 $r3 $r1 0xb14 - set $p1 0x1 le u32 $r3 0x2 - long or b32 $r2 $r0 $r2 - $p1 mul rn f64 $r0d $r0d 0x4350000000000000 - rsqrt64h $r5 $r1 - // rsqrt64h will give correct result for 0/inf/nan, the following logic - // checks whether the input is one of those (exponent is 0x7ff or all 0 - // except for the sign bit) - set b32 $r6 ne u32 $r3 0x7ff - long and b32 $r2 $r2 $r6 - sched 0x28 0x2b 0x20 0x27 0x28 0x2e 0x28 - set $p0 0x1 ne u32 $r2 0x0 - $p0 bra #rsq_norm - // For 0/inf/nan, make sure the sign bit agrees with input and return - and b32 $r1 $r1 0x80000000 - long mov b32 $r0 0x0 - long or b32 $r1 $r1 $r5 - long ret -rsq_norm: - // For others, do 4 Newton-Raphson steps with the formula: - // RSQ_{n + 1} = RSQ_{n} * (1.5 - 0.5 * x * RSQ_{n} * RSQ_{n}) - // In the code below, each step is written as: - // tmp1 = 0.5 * x * RSQ_{n} - // tmp2 = -RSQ_{n} * tmp1 + 0.5 - // RSQ_{n + 1} = RSQ_{n} * tmp2 + RSQ_{n} - long mov b32 $r4 0x0 - sched 0x2f 0x29 0x29 0x29 0x29 0x29 0x29 - // 0x3f000000: 1/2 - cvt f64 $r8d f32 0x3f000000 - mul rn f64 $r2d $r0d $r8d - mul rn f64 $r0d $r2d $r4d - fma rn f64 $r6d neg $r4d $r0d $r8d - fma rn f64 $r4d $r4d $r6d $r4d - mul rn f64 $r0d $r2d $r4d - fma rn f64 $r6d neg $r4d $r0d $r8d - sched 0x29 0x29 0x29 0x29 0x29 0x29 0x29 - fma rn f64 $r4d $r4d $r6d $r4d - mul rn f64 $r0d $r2d $r4d - fma rn f64 $r6d neg $r4d $r0d $r8d - fma rn f64 $r4d $r4d $r6d $r4d - mul rn f64 $r0d $r2d $r4d - fma rn f64 $r6d neg $r4d $r0d $r8d - fma rn f64 $r4d $r4d $r6d $r4d - sched 0x29 0x20 0x28 0x2e 0x00 0x00 0x00 - // Multiply 2^27 to result for small inputs to recover - $p1 mul rn f64 $r4d $r4d 0x41a0000000000000 - long mov b32 $r1 $r5 - long mov b32 $r0 $r4 + long nop long ret // // Trap handler. // Requires at least 4 GPRs and 32 bytes of l[] memory to temporarily save GPRs. -// Low 32 bytes of l[] memory shouldn't be used if resumability is required. +// Low 32 bytes of l[] memory shouldn't be used if resumeability is required. // // Trap info: // 0x000: mutex diff --git a/lib/mesa/src/gallium/drivers/nouveau/codegen/lib/gk104.asm.h b/lib/mesa/src/gallium/drivers/nouveau/codegen/lib/gk104.asm.h index ed948dee4..37998768e 100644 --- a/lib/mesa/src/gallium/drivers/nouveau/codegen/lib/gk104.asm.h +++ b/lib/mesa/src/gallium/drivers/nouveau/codegen/lib/gk104.asm.h @@ -481,132 +481,12 @@ uint64_t gk104_builtin_code[] = { 0xd40040000840c785, 0x18fe00000000dde2, 0x4000000000001de4, - 0x2000000000000007, + 0x9000000000001de7, +/* 0x0f08: gk104_rcp_f64 */ 0x4000000000001de4, 0x9000000000001de7, -/* 0x0f18: gk104_rcp_f64 */ - 0x7000c02c50109c03, - 0x0bfffffffc20dc02, - 0x6000000280000007, - 0x1a0ec01ff431dc03, - 0x180000000000dde2, - 0x228282f2b2d042f7, - 0x40000000000021f4, - 0x1bfffffffc00dde2, - 0x1e0edffc0001dc81, - 0x40000000200021e7, - 0x3800200000105c52, -/* 0x0f70: rcp_inf_or_denorm_or_zero */ - 0x39ffc00000111c02, - 0x190e0000fc41dc23, - 0x2202f2b2d2f042b7, - 0x40000000400001e7, - 0x39ffc00000105c82, - 0x1800000000001df2, -/* 0x0fa0: rcp_denorm_or_zero */ - 0x1e0ec0000001dc81, - 0x40000000200001e7, - 0x39ffc00000105c52, -/* 0x0fb8: rcp_denorm */ - 0x5000d0d400001c01, - 0x2280428282b282f7, - 0x18000000d800ddf2, -/* 0x0fd0: rcp_rejoin */ - 0x188e0000fc31dc23, - 0x40000006000001e7, - 0x7000c02c50109c03, - 0x3a003ffffc11dc02, - 0x08ffc0000071dc02, - 0x2800000000019de4, - 0x22e2b2a2828042b7, - 0x1006000019a15c04, - 0xc800000010511c00, - 0x1afe000000001de2, - 0x3000000014415c00, - 0x3008000014401e00, - 0x1000000001301c04, - 0x1000000019b19d04, - 0x22929292929292e7, - 0x1000cfe001321c04, - 0x2010000000611c01, - 0x2000000010001c01, - 0x2010000000611c01, - 0x2000000010001c01, - 0x2010000000611c01, - 0x2000000010001c01, - 0x2282828282820297, - 0x2010000000611c01, - 0x2000000010001c01, - 0x0800000ffc209e02, - 0x480000000c211c03, - 0x7000c02c5010dc03, - 0x480000001030dc03, - 0x0bfffffffc309c02, - 0x22b28282b282b287, - 0x188ec01ff821dc03, - 0x40000000600021e7, - 0x6000c00050411c03, - 0x4800000004405c03, - 0x40000001c0001de7, -/* 0x10f0: rcp_result_inf_or_denorm */ - 0x1b0ec01ffc31dc23, - 0x40000000a00021e7, - 0x22f25232b2825207, - 0x3a00000000105c02, - 0x1800000000001de2, - 0x09ffc00000105c02, - 0x40000000e0001de7, -/* 0x1128: rcp_result_denorm */ - 0x1a8e0000fc31dc03, - 0x3a003ffffc105c02, - 0x1000cfa001318004, - 0x227202a2e2c282f7, - 0x1000cfc00131a004, - 0x0800400000105c02, - 0x5000000018001c01, -/* 0x1160: rcp_end */ - 0x9000000000001de7, -/* 0x1168: gk104_rsq_f64 */ - 0x1e0edffc0001dc81, - 0x3800200000104042, - 0x39fffffffc109c02, - 0x22828252c2820277, - 0x7000c02c5010dc03, - 0x198ec0000833dc03, - 0x6800000008009c43, - 0x5000d0d400000401, - 0xc80000001c115c00, - 0x128ec01ffc319c03, - 0x6800000018209c03, - 0x2282e2827202b287, - 0x1a8e0000fc21dc03, - 0x40000000800001e7, - 0x3a00000000105c02, - 0x1800000000001de2, - 0x6800000014105c43, - 0x9000000000001de7, -/* 0x11f8: rsq_norm */ - 0x1800000000011de2, - 0x22929292929292f7, - 0x1000cfc001321c04, - 0x5000000020009c01, - 0x5000000010201c01, - 0x2010000000419e01, - 0x2008000018411c01, - 0x5000000010201c01, - 0x2010000000419e01, - 0x2292929292929297, - 0x2008000018411c01, - 0x5000000010201c01, - 0x2010000000419e01, - 0x2008000018411c01, - 0x5000000010201c01, - 0x2010000000419e01, - 0x2008000018411c01, - 0x20000002e2820297, - 0x5000d06800410401, - 0x2800000014005de4, - 0x2800000010001de4, +/* 0x0f18: gk104_rsq_f64 */ + 0x4000000000001de4, 0x9000000000001de7, 0xc800000003f01cc5, 0x2c00000100005c04, @@ -615,7 +495,7 @@ uint64_t gk104_builtin_code[] = { 0x680100000c1fdc03, 0x4000000a60001c47, 0x180000004000dde2, -/* 0x12e0: spill_cfstack */ +/* 0x0f60: spill_cfstack */ 0x78000009c0000007, 0x0c0000000430dd02, 0x4003ffffa0001ca7, @@ -663,14 +543,14 @@ uint64_t gk104_builtin_code[] = { 0x4000000100001ea7, 0x480100000c001c03, 0x0800000000105c42, -/* 0x1458: shared_loop */ +/* 0x10d8: shared_loop */ 0xc100000000309c85, 0x9400000500009c85, 0x0c00000010001d02, 0x0800000000105d42, 0x0c0000001030dd02, 0x4003ffff40001ca7, -/* 0x1488: shared_done */ +/* 0x1108: shared_done */ 0x2800406420001de4, 0x2800406430005de4, 0xe000000000001c45, @@ -684,7 +564,7 @@ uint64_t gk104_builtin_code[] = { 0x480000000c209c03, 0x4801000008001c03, 0x0800000000105c42, -/* 0x14f0: search_cstack */ +/* 0x1170: search_cstack */ 0x280040646000dde4, 0x8400000020009f05, 0x190ec0002821dc03, @@ -693,17 +573,17 @@ uint64_t gk104_builtin_code[] = { 0x0800000000105c42, 0x0c0000004030dd02, 0x00029dff0ffc5cbf, -/* 0x1530: entry_found */ +/* 0x11b0: entry_found */ 0x8400000000009f85, 0x2800406400001de4, 0x2800406410005de4, 0x9400000010009c85, 0x4000000000001df4, -/* 0x1558: end_exit */ +/* 0x11d8: end_exit */ 0x9800000003ffdcc5, 0xd000000000008007, 0xa000000000004007, -/* 0x1570: end_cont */ +/* 0x11f0: end_cont */ 0xd000000000008007, 0x3400c3fffc201c04, 0xc000000003f01ec5, @@ -713,6 +593,6 @@ uint64_t gk104_builtin_code[] = { uint64_t gk104_builtin_offsets[] = { 0x0000000000000000, 0x00000000000000f0, + 0x0000000000000f08, 0x0000000000000f18, - 0x0000000000001168, }; diff --git a/lib/mesa/src/gallium/drivers/nouveau/codegen/lib/gk110.asm b/lib/mesa/src/gallium/drivers/nouveau/codegen/lib/gk110.asm index 66626b471..b9c05a04b 100644 --- a/lib/mesa/src/gallium/drivers/nouveau/codegen/lib/gk110.asm +++ b/lib/mesa/src/gallium/drivers/nouveau/codegen/lib/gk110.asm @@ -83,229 +83,12 @@ gk110_div_s32: $p0 sub b32 $r1 $r1 $r2 $p0 add b32 $r0 $r0 0x1 $p3 cvt s32 $r0 neg s32 $r0 - sched 0x04 0x2e 0x28 0x04 0x28 0x28 0x28 + sched 0x04 0x2e 0x04 0x28 0x04 0x20 0x2c $p2 cvt s32 $r1 neg s32 $r1 ret -// RCP F64 -// -// INPUT: $r0d -// OUTPUT: $r0d -// CLOBBER: $r2 - $r9, $p0 -// -// The core of RCP and RSQ implementation is Newton-Raphson step, which is -// used to find successively better approximation from an imprecise initial -// value (single precision rcp in RCP and rsqrt64h in RSQ). -// gk110_rcp_f64: - // Step 1: classify input according to exponent and value, and calculate - // result for 0/inf/nan. $r2 holds the exponent value, which starts at - // bit 52 (bit 20 of the upper half) and is 11 bits in length - ext u32 $r2 $r1 0xb14 - add b32 $r3 $r2 0xffffffff - joinat #rcp_rejoin - // We want to check whether the exponent is 0 or 0x7ff (i.e. NaN, inf, - // denorm, or 0). Do this by subtracting 1 from the exponent, which will - // mean that it's > 0x7fd in those cases when doing unsigned comparison - set b32 $p0 0x1 gt u32 $r3 0x7fd - // $r3: 0 for norms, 0x36 for denorms, -1 for others - mov b32 $r3 0x0 - sched 0x2f 0x04 0x2d 0x2b 0x2f 0x28 0x28 - join (not $p0) nop - // Process all special values: NaN, inf, denorm, 0 - mov b32 $r3 0xffffffff - // A number is NaN if its abs value is greater than or unordered with inf - set $p0 0x1 gtu f64 abs $r0d 0x7ff0000000000000 - (not $p0) bra #rcp_inf_or_denorm_or_zero - // NaN -> NaN, the next line sets the "quiet" bit of the result. This - // behavior is both seen on the CPU and the blob - join or b32 $r1 $r1 0x80000 -rcp_inf_or_denorm_or_zero: - and b32 $r4 $r1 0x7ff00000 - // Other values with nonzero in exponent field should be inf - set b32 $p0 0x1 eq s32 $r4 0x0 - sched 0x2b 0x04 0x2f 0x2d 0x2b 0x2f 0x20 - $p0 bra #rcp_denorm_or_zero - // +/-Inf -> +/-0 - xor b32 $r1 $r1 0x7ff00000 - join mov b32 $r0 0x0 -rcp_denorm_or_zero: - set $p0 0x1 gtu f64 abs $r0d 0x0 - $p0 bra #rcp_denorm - // +/-0 -> +/-Inf - join or b32 $r1 $r1 0x7ff00000 -rcp_denorm: - // non-0 denorms: multiply with 2^54 (the 0x36 in $r3), join with norms - mul rn f64 $r0d $r0d 0x4350000000000000 - sched 0x2f 0x28 0x2b 0x28 0x28 0x04 0x28 - join mov b32 $r3 0x36 -rcp_rejoin: - // All numbers with -1 in $r3 have their result ready in $r0d, return them - // others need further calculation - set b32 $p0 0x1 lt s32 $r3 0x0 - $p0 bra #rcp_end - // Step 2: Before the real calculation goes on, renormalize the values to - // range [1, 2) by setting exponent field to 0x3ff (the exponent of 1) - // result in $r6d. The exponent will be recovered later. - ext u32 $r2 $r1 0xb14 - and b32 $r7 $r1 0x800fffff - add b32 $r7 $r7 0x3ff00000 - mov b32 $r6 $r0 - sched 0x2b 0x04 0x28 0x28 0x2a 0x2b 0x2e - // Step 3: Convert new value to float (no overflow will occur due to step - // 2), calculate rcp and do newton-raphson step once - cvt rz f32 $r5 f64 $r6d - rcp f32 $r4 $r5 - mov b32 $r0 0xbf800000 - fma rn f32 $r5 $r4 $r5 $r0 - fma rn f32 $r0 neg $r4 $r5 $r4 - // Step 4: convert result $r0 back to double, do newton-raphson steps - cvt f64 $r0d f32 $r0 - cvt f64 $r6d f64 neg $r6d - sched 0x2e 0x29 0x29 0x29 0x29 0x29 0x29 - cvt f64 $r8d f32 0x3f800000 - // 4 Newton-Raphson Steps, tmp in $r4d, result in $r0d - // The formula used here (and above) is: - // RCP_{n + 1} = 2 * RCP_{n} - x * RCP_{n} * RCP_{n} - // The following code uses 2 FMAs for each step, and it will basically - // looks like: - // tmp = -src * RCP_{n} + 1 - // RCP_{n + 1} = RCP_{n} * tmp + RCP_{n} - fma rn f64 $r4d $r6d $r0d $r8d - fma rn f64 $r0d $r0d $r4d $r0d - fma rn f64 $r4d $r6d $r0d $r8d - fma rn f64 $r0d $r0d $r4d $r0d - fma rn f64 $r4d $r6d $r0d $r8d - fma rn f64 $r0d $r0d $r4d $r0d - sched 0x29 0x20 0x28 0x28 0x28 0x28 0x28 - fma rn f64 $r4d $r6d $r0d $r8d - fma rn f64 $r0d $r0d $r4d $r0d - // Step 5: Exponent recovery and final processing - // The exponent is recovered by adding what we added to the exponent. - // Suppose we want to calculate rcp(x), but we have rcp(cx), then - // rcp(x) = c * rcp(cx) - // The delta in exponent comes from two sources: - // 1) The renormalization in step 2. The delta is: - // 0x3ff - $r2 - // 2) (For the denorm input) The 2^54 we multiplied at rcp_denorm, stored - // in $r3 - // These 2 sources are calculated in the first two lines below, and then - // added to the exponent extracted from the result above. - // Note that after processing, the new exponent may >= 0x7ff (inf) - // or <= 0 (denorm). Those cases will be handled respectively below - subr b32 $r2 $r2 0x3ff - add b32 $r4 $r2 $r3 - ext u32 $r3 $r1 0xb14 - // New exponent in $r3 - add b32 $r3 $r3 $r4 - add b32 $r2 $r3 0xffffffff - sched 0x28 0x2b 0x28 0x2b 0x28 0x28 0x2b - // (exponent-1) < 0x7fe (unsigned) means the result is in norm range - // (same logic as in step 1) - set b32 $p0 0x1 lt u32 $r2 0x7fe - (not $p0) bra #rcp_result_inf_or_denorm - // Norms: convert exponents back and return - shl b32 $r4 $r4 clamp 0x14 - add b32 $r1 $r4 $r1 - bra #rcp_end -rcp_result_inf_or_denorm: - // New exponent >= 0x7ff means that result is inf - set b32 $p0 0x1 ge s32 $r3 0x7ff - (not $p0) bra #rcp_result_denorm - sched 0x20 0x25 0x28 0x2b 0x23 0x25 0x2f - // Infinity - and b32 $r1 $r1 0x80000000 - mov b32 $r0 0x0 - add b32 $r1 $r1 0x7ff00000 - bra #rcp_end -rcp_result_denorm: - // Denorm result comes from huge input. The greatest possible fp64, i.e. - // 0x7fefffffffffffff's rcp is 0x0004000000000000, 1/4 of the smallest - // normal value. Other rcp result should be greater than that. If we - // set the exponent field to 1, we can recover the result by multiplying - // it with 1/2 or 1/4. 1/2 is used if the "exponent" $r3 is 0, otherwise - // 1/4 ($r3 should be -1 then). This is quite tricky but greatly simplifies - // the logic here. - set b32 $p0 0x1 ne u32 $r3 0x0 - and b32 $r1 $r1 0x800fffff - // 0x3e800000: 1/4 - $p0 cvt f64 $r6d f32 0x3e800000 - sched 0x2f 0x28 0x2c 0x2e 0x2a 0x20 0x27 - // 0x3f000000: 1/2 - (not $p0) cvt f64 $r6d f32 0x3f000000 - add b32 $r1 $r1 0x00100000 - mul rn f64 $r0d $r0d $r6d -rcp_end: - ret - -// RSQ F64 -// -// INPUT: $r0d -// OUTPUT: $r0d -// CLOBBER: $r2 - $r9, $p0 - $p1 -// gk110_rsq_f64: - // Before getting initial result rsqrt64h, two special cases should be - // handled first. - // 1. NaN: set the highest bit in mantissa so it'll be surely recognized - // as NaN in rsqrt64h - set $p0 0x1 gtu f64 abs $r0d 0x7ff0000000000000 - $p0 or b32 $r1 $r1 0x00080000 - and b32 $r2 $r1 0x7fffffff - sched 0x27 0x20 0x28 0x2c 0x25 0x28 0x28 - // 2. denorms and small normal values: using their original value will - // lose precision either at rsqrt64h or the first step in newton-raphson - // steps below. Take 2 as a threshold in exponent field, and multiply - // with 2^54 if the exponent is smaller or equal. (will multiply 2^27 - // to recover in the end) - ext u32 $r3 $r1 0xb14 - set b32 $p1 0x1 le u32 $r3 0x2 - or b32 $r2 $r0 $r2 - $p1 mul rn f64 $r0d $r0d 0x4350000000000000 - rsqrt64h f32 $r5 $r1 - // rsqrt64h will give correct result for 0/inf/nan, the following logic - // checks whether the input is one of those (exponent is 0x7ff or all 0 - // except for the sign bit) - set b32 $r6 ne u32 $r3 0x7ff - and b32 $r2 $r2 $r6 - sched 0x28 0x2b 0x20 0x27 0x28 0x2e 0x28 - set b32 $p0 0x1 ne u32 $r2 0x0 - $p0 bra #rsq_norm - // For 0/inf/nan, make sure the sign bit agrees with input and return - and b32 $r1 $r1 0x80000000 - mov b32 $r0 0x0 - or b32 $r1 $r1 $r5 - ret -rsq_norm: - // For others, do 4 Newton-Raphson steps with the formula: - // RSQ_{n + 1} = RSQ_{n} * (1.5 - 0.5 * x * RSQ_{n} * RSQ_{n}) - // In the code below, each step is written as: - // tmp1 = 0.5 * x * RSQ_{n} - // tmp2 = -RSQ_{n} * tmp1 + 0.5 - // RSQ_{n + 1} = RSQ_{n} * tmp2 + RSQ_{n} - mov b32 $r4 0x0 - sched 0x2f 0x29 0x29 0x29 0x29 0x29 0x29 - // 0x3f000000: 1/2 - cvt f64 $r8d f32 0x3f000000 - mul rn f64 $r2d $r0d $r8d - mul rn f64 $r0d $r2d $r4d - fma rn f64 $r6d neg $r4d $r0d $r8d - fma rn f64 $r4d $r4d $r6d $r4d - mul rn f64 $r0d $r2d $r4d - fma rn f64 $r6d neg $r4d $r0d $r8d - sched 0x29 0x29 0x29 0x29 0x29 0x29 0x29 - fma rn f64 $r4d $r4d $r6d $r4d - mul rn f64 $r0d $r2d $r4d - fma rn f64 $r6d neg $r4d $r0d $r8d - fma rn f64 $r4d $r4d $r6d $r4d - mul rn f64 $r0d $r2d $r4d - fma rn f64 $r6d neg $r4d $r0d $r8d - fma rn f64 $r4d $r4d $r6d $r4d - sched 0x29 0x20 0x28 0x2e 0x00 0x00 0x00 - // Multiply 2^27 to result for small inputs to recover - $p1 mul rn f64 $r4d $r4d 0x41a0000000000000 - mov b32 $r1 $r5 - mov b32 $r0 $r4 ret .section #gk110_builtin_offsets diff --git a/lib/mesa/src/gallium/drivers/nouveau/codegen/lib/gk110.asm.h b/lib/mesa/src/gallium/drivers/nouveau/codegen/lib/gk110.asm.h index 3d1523f2f..8d00e2a22 100644 --- a/lib/mesa/src/gallium/drivers/nouveau/codegen/lib/gk110.asm.h +++ b/lib/mesa/src/gallium/drivers/nouveau/codegen/lib/gk110.asm.h @@ -65,132 +65,11 @@ uint64_t gk110_builtin_code[] = { 0xe088000001000406, 0x4000000000800001, 0xe6010000000ce802, - 0x08a0a0a010a0b810, + 0x08b08010a010b810, 0xe60100000088e806, 0x19000000001c003c, /* 0x0218: gk110_rcp_f64 */ - 0xc00000058a1c0409, - 0x407fffffff9c080d, - 0x1480000050000000, - 0xb3401c03fe9c0c1d, - 0xe4c03c007f9c000e, - 0x08a0a0bcacb410bc, - 0x8580000000603c02, - 0x747fffffff9fc00e, - 0xb4601fff801c021d, - 0x120000000420003c, - 0x21000400005c0404, -/* 0x0270: rcp_inf_or_denorm_or_zero */ - 0x203ff800001c0410, - 0xb3281c00001c101d, - 0x0880bcacb4bc10ac, - 0x120000000800003c, - 0x223ff800001c0404, - 0xe4c03c007fdc0002, -/* 0x02a0: rcp_denorm_or_zero */ - 0xb4601c00001c021d, - 0x120000000400003c, - 0x213ff800005c0404, -/* 0x02b8: rcp_denorm */ - 0xc400021a801c0001, - 0x08a010a0a0aca0bc, - 0x740000001b5fc00e, -/* 0x02d0: rcp_rejoin */ - 0xb3181c00001c0c1d, - 0x12000000c000003c, - 0xc00000058a1c0409, - 0x204007ffff9c041c, - 0x401ff800001c1c1d, - 0xe4c03c00001c001a, - 0x08b8aca8a0a010ac, - 0xe5400c00031c3816, - 0x84000000021c1412, - 0x745fc000001fc002, - 0xcc000000029c1016, - 0xcc081000029c1002, - 0xe5400000001c2c02, - 0xe5410000031c3c1a, - 0x08a4a4a4a4a4a4b8, - 0xc54001fc001c2c21, - 0xdb802000001c1812, - 0xdb800000021c0002, - 0xdb802000001c1812, - 0xdb800000021c0002, - 0xdb802000001c1812, - 0xdb800000021c0002, - 0x08a0a0a0a0a080a4, - 0xdb802000001c1812, - 0xdb800000021c0002, - 0x48000001ff9c0809, - 0xe0800000019c0812, - 0xc00000058a1c040d, - 0xe0800000021c0c0e, - 0x407fffffff9c0c09, - 0x08aca0a0aca0aca0, - 0xb3101c03ff1c081d, - 0x120000000c20003c, - 0xc24000000a1c1011, - 0xe0800000009c1006, - 0x12000000381c003c, -/* 0x03f0: rcp_result_inf_or_denorm */ - 0xb3681c03ff9c0c1d, - 0x120000001420003c, - 0x08bc948caca09480, - 0x20400000001c0404, - 0xe4c03c007f9c0002, - 0x403ff800001c0405, - 0x120000001c1c003c, -/* 0x0428: rcp_result_denorm */ - 0xb3501c00001c0c1d, - 0x204007ffff9c0404, - 0xc54001f400002c19, - 0x089c80a8b8b0a0bc, - 0xc54001f800202c19, - 0x40000800001c0405, - 0xe4000000031c0002, -/* 0x0460: rcp_end */ - 0x19000000001c003c, -/* 0x0468: gk110_rsq_f64 */ - 0xb4601fff801c021d, - 0x2100040000000404, - 0x203fffffff9c0408, - 0x08a0a094b0a0809c, - 0xc00000058a1c040d, - 0xb3301c00011c0c3d, - 0xe2001000011c000a, - 0xc400021a80040001, - 0x84000000039c0416, - 0xb2d01c03ff9c0c19, - 0xe2000000031c080a, - 0x08a0b8a09c80aca0, - 0xb3501c00001c081d, - 0x120000001000003c, - 0x20400000001c0404, - 0xe4c03c007f9c0002, - 0xe2001000029c0406, - 0x19000000001c003c, -/* 0x04f8: rsq_norm */ - 0xe4c03c007f9c0012, - 0x08a4a4a4a4a4a4bc, - 0xc54001f8001c2c21, - 0xe4000000041c000a, - 0xe4000000021c0802, - 0xdb882000001c101a, - 0xdb801000031c1012, - 0xe4000000021c0802, - 0xdb882000001c101a, - 0x08a4a4a4a4a4a4a4, - 0xdb801000031c1012, - 0xe4000000021c0802, - 0xdb882000001c101a, - 0xdb801000031c1012, - 0xe4000000021c0802, - 0xdb882000001c101a, - 0xdb801000031c1012, - 0x08000000b8a080a4, - 0xc400020d00041011, - 0xe4c03c00029c0006, - 0xe4c03c00021c0002, +/* 0x0218: gk110_rsq_f64 */ 0x19000000001c003c, }; @@ -198,5 +77,5 @@ uint64_t gk110_builtin_offsets[] = { 0x0000000000000000, 0x00000000000000f0, 0x0000000000000218, - 0x0000000000000468, + 0x0000000000000218, }; diff --git a/lib/mesa/src/gallium/drivers/nouveau/codegen/nv50_ir_bb.cpp b/lib/mesa/src/gallium/drivers/nouveau/codegen/nv50_ir_bb.cpp index 76fee8c79..fa8ee072a 100644 --- a/lib/mesa/src/gallium/drivers/nouveau/codegen/nv50_ir_bb.cpp +++ b/lib/mesa/src/gallium/drivers/nouveau/codegen/nv50_ir_bb.cpp @@ -291,7 +291,7 @@ void BasicBlock::permuteAdjacent(Instruction *a, Instruction *b) if (b->prev) b->prev->next = b; - if (a->next) + if (a->prev) a->next->prev = a; } @@ -536,6 +536,9 @@ Function::printCFGraph(const char *filePath) case Graph::Edge::BACK: fprintf(out, "\t%i -> %i;\n", idA, idB); break; + case Graph::Edge::DUMMY: + fprintf(out, "\t%i -> %i [style=dotted];\n", idA, idB); + break; default: assert(0); break; diff --git a/lib/mesa/src/gallium/drivers/nouveau/codegen/nv50_ir_build_util.cpp b/lib/mesa/src/gallium/drivers/nouveau/codegen/nv50_ir_build_util.cpp index 20ed5cd52..19418c0e0 100644 --- a/lib/mesa/src/gallium/drivers/nouveau/codegen/nv50_ir_build_util.cpp +++ b/lib/mesa/src/gallium/drivers/nouveau/codegen/nv50_ir_build_util.cpp @@ -44,8 +44,6 @@ BuildUtil::init(Program *prog) bb = NULL; pos = NULL; - tail = false; - memset(imms, 0, sizeof(imms)); immCount = 0; } @@ -340,7 +338,7 @@ BuildUtil::mkClobber(DataFile f, uint32_t rMask, int unit) int base2 = (baseSize2[mask] >> 8) & 0xf; int size2 = (baseSize2[mask] >> 12) & 0xf; Instruction *insn = mkOp(OP_NOP, TYPE_NONE, NULL); - if (true) { // size1 can't be 0 + if (1) { // size1 can't be 0 LValue *reg = new_LValue(func, f); reg->reg.size = size1 << unit; reg->reg.data.id = base + base1; @@ -356,18 +354,6 @@ BuildUtil::mkClobber(DataFile f, uint32_t rMask, int unit) } ImmediateValue * -BuildUtil::mkImm(uint16_t u) -{ - ImmediateValue *imm = new_ImmediateValue(prog, (uint32_t)0); - - imm->reg.size = 2; - imm->reg.type = TYPE_U16; - imm->reg.data.u32 = u; - - return imm; -} - -ImmediateValue * BuildUtil::mkImm(uint32_t u) { unsigned int pos = u32Hash(u); @@ -406,12 +392,6 @@ BuildUtil::mkImm(float f) return mkImm(u.u32); } -ImmediateValue * -BuildUtil::mkImm(double d) -{ - return new_ImmediateValue(prog, d); -} - Value * BuildUtil::loadImm(Value *dst, float f) { @@ -419,18 +399,6 @@ BuildUtil::loadImm(Value *dst, float f) } Value * -BuildUtil::loadImm(Value *dst, double d) -{ - return mkOp1v(OP_MOV, TYPE_F64, dst ? dst : getScratch(8), mkImm(d)); -} - -Value * -BuildUtil::loadImm(Value *dst, uint16_t u) -{ - return mkOp1v(OP_MOV, TYPE_U16, dst ? dst : getScratch(2), mkImm(u)); -} - -Value * BuildUtil::loadImm(Value *dst, uint32_t u) { return mkOp1v(OP_MOV, TYPE_U32, dst ? dst : getScratch(), mkImm(u)); @@ -486,16 +454,6 @@ BuildUtil::mkSysVal(SVSemantic svName, uint32_t svIndex) return sym; } -Symbol * -BuildUtil::mkTSVal(TSSemantic tsName) -{ - Symbol *sym = new_Symbol(prog, FILE_THREAD_STATE, 0); - sym->reg.type = TYPE_U32; - sym->reg.size = typeSizeof(sym->reg.type); - sym->reg.data.ts = tsName; - return sym; -} - void BuildUtil::DataArray::setup(unsigned array, unsigned arrayIdx, uint32_t base, int len, int vecDim, int eltSize, @@ -529,7 +487,7 @@ BuildUtil::DataArray::acquire(ValueMap &m, int i, int c) return v; } else { - return up->getScratch(eltSize); + return up->getScratch(); } } @@ -597,12 +555,6 @@ BuildUtil::split64BitOpPostRA(Function *fn, Instruction *i, switch (i->dType) { case TYPE_U64: hTy = TYPE_U32; break; case TYPE_S64: hTy = TYPE_S32; break; - case TYPE_F64: - if (i->op == OP_MOV) { - hTy = TYPE_U32; - break; - } - FALLTHROUGH; default: return NULL; } @@ -615,7 +567,6 @@ BuildUtil::split64BitOpPostRA(Function *fn, Instruction *i, return NULL; srcNr = 2; break; - case OP_SELP: srcNr = 3; break; default: // TODO when needed return NULL; @@ -632,10 +583,7 @@ BuildUtil::split64BitOpPostRA(Function *fn, Instruction *i, for (int s = 0; s < srcNr; ++s) { if (lo->getSrc(s)->reg.size < 8) { - if (s == 2) - hi->setSrc(s, lo->getSrc(s)); - else - hi->setSrc(s, zero); + hi->setSrc(s, zero); } else { if (lo->getSrc(s)->refCount() > 1) lo->setSrc(s, cloneShallow(fn, lo->getSrc(s))); @@ -649,7 +597,6 @@ BuildUtil::split64BitOpPostRA(Function *fn, Instruction *i, case FILE_MEMORY_CONST: case FILE_MEMORY_SHARED: case FILE_SHADER_INPUT: - case FILE_SHADER_OUTPUT: hi->getSrc(s)->reg.data.offset += 4; break; default: @@ -660,7 +607,7 @@ BuildUtil::split64BitOpPostRA(Function *fn, Instruction *i, } } if (srcNr == 2) { - lo->setFlagsDef(1, carry); + lo->setDef(1, carry); hi->setFlagsSrc(hi->srcCount(), carry); } return hi; diff --git a/lib/mesa/src/gallium/drivers/nouveau/codegen/nv50_ir_build_util.h b/lib/mesa/src/gallium/drivers/nouveau/codegen/nv50_ir_build_util.h index 5c3a01df9..a610c773f 100644 --- a/lib/mesa/src/gallium/drivers/nouveau/codegen/nv50_ir_build_util.h +++ b/lib/mesa/src/gallium/drivers/nouveau/codegen/nv50_ir_build_util.h @@ -66,7 +66,6 @@ public: Instruction *mkMov(Value *, Value *, DataType = TYPE_U32); Instruction *mkMovToReg(int id, Value *); Instruction *mkMovFromReg(Value *, int id); - inline Instruction *mkBMov(Value *, Value *); Instruction *mkInterp(unsigned mode, Value *, int32_t offset, Value *rel); Instruction *mkFetch(Value *, DataType, DataFile, int32_t offset, @@ -74,8 +73,8 @@ public: Instruction *mkCvt(operation, DataType, Value *, DataType, Value *); CmpInstruction *mkCmp(operation, CondCode, DataType, - Value *, - DataType, Value *, Value *, Value * = NULL); + Value *, + DataType, Value *, Value *, Value * = NULL); TexInstruction *mkTex(operation, TexTarget, uint16_t tic, uint16_t tsc, const std::vector<Value *> &def, @@ -91,16 +90,12 @@ public: void mkClobber(DataFile file, uint32_t regMask, int regUnitLog2); ImmediateValue *mkImm(float); - ImmediateValue *mkImm(double); - ImmediateValue *mkImm(uint16_t); ImmediateValue *mkImm(uint32_t); ImmediateValue *mkImm(uint64_t); ImmediateValue *mkImm(int i) { return mkImm((uint32_t)i); } Value *loadImm(Value *dst, float); - Value *loadImm(Value *dst, double); - Value *loadImm(Value *dst, uint16_t); Value *loadImm(Value *dst, uint32_t); Value *loadImm(Value *dst, uint64_t); @@ -140,9 +135,7 @@ public: class DataArray { public: - DataArray(BuildUtil *bld) : up(bld), array(0), arrayIdx(0), baseAddr(0), - arrayLen(0), baseSym(NULL), vecDim(0), eltSize(0), file(FILE_NULL), - regOnly(false) { } + DataArray(BuildUtil *bld) : up(bld) { } void setup(unsigned array, unsigned arrayIdx, uint32_t base, int len, int vecDim, int eltSize, @@ -179,7 +172,6 @@ public: DataType ty, uint32_t baseAddress); Symbol *mkSysVal(SVSemantic svName, uint32_t svIndex); - Symbol *mkTSVal(TSSemantic tsName); private: void init(Program *); @@ -301,17 +293,11 @@ BuildUtil::mkOp3v(operation op, DataType ty, Value *dst, inline LValue * BuildUtil::mkLoadv(DataType ty, Symbol *mem, Value *ptr) { - LValue *dst = getScratch(typeSizeof(ty)); + LValue *dst = getScratch(); mkLoad(ty, dst, mem, ptr); return dst; } -inline Instruction * -BuildUtil::mkBMov(Value *dst, Value *src) -{ - return mkCvt(OP_CVT, TYPE_U32, dst, TYPE_U32, src); -} - bool BuildUtil::DataArray::exists(ValueMap &m, unsigned int i, unsigned int c) { diff --git a/lib/mesa/src/gallium/drivers/nouveau/codegen/nv50_ir_emit_nv50.cpp b/lib/mesa/src/gallium/drivers/nouveau/codegen/nv50_ir_emit_nv50.cpp index 1a0c63b70..90147668c 100644 --- a/lib/mesa/src/gallium/drivers/nouveau/codegen/nv50_ir_emit_nv50.cpp +++ b/lib/mesa/src/gallium/drivers/nouveau/codegen/nv50_ir_emit_nv50.cpp @@ -33,12 +33,14 @@ namespace nv50_ir { class CodeEmitterNV50 : public CodeEmitter { public: - CodeEmitterNV50(Program::Type, const TargetNV50 *); + CodeEmitterNV50(const TargetNV50 *); virtual bool emitInstruction(Instruction *); virtual uint32_t getMinEncodingSize(const Instruction *) const; + inline void setProgramType(Program::Type pType) { progType = pType; } + virtual void prepareEmission(Function *); private: @@ -94,12 +96,9 @@ private: void emitUADD(const Instruction *); void emitAADD(const Instruction *); void emitFADD(const Instruction *); - void emitDADD(const Instruction *); void emitIMUL(const Instruction *); void emitFMUL(const Instruction *); - void emitDMUL(const Instruction *); void emitFMAD(const Instruction *); - void emitDMAD(const Instruction *); void emitIMAD(const Instruction *); void emitISAD(const Instruction *); @@ -270,7 +269,7 @@ CodeEmitterNV50::emitFlagsWr(const Instruction *i) for (int d = 0; i->defExists(d); ++d) if (i->def(d).getFile() == FILE_FLAGS) flagsDef = d; - if (flagsDef >= 0 && false) // TODO: enforce use of flagsDef at some point + if (flagsDef >= 0 && 0) // TODO: enforce use of flagsDef at some point WARN("Instruction::flagsDef was not set properly\n"); } if (flagsDef == 0 && i->defExists(1)) @@ -373,7 +372,7 @@ CodeEmitterNV50::setSrcFileBits(const Instruction *i, int enc) mode |= 3 << (s * 2); break; default: - ERROR("invalid file on source %i: %u\n", s, i->src(s).getFile()); + ERROR("invalid file on source %i: %u\n", s, i->src(s).getFile()); assert(0); break; } @@ -439,9 +438,9 @@ CodeEmitterNV50::setSrcFileBits(const Instruction *i, int enc) return; if ((mode & 3) == 1) { - const int pos = ((mode >> 2) & 3) == 3 ? 13 : 14; + const int pos = i->src(1).getFile() == FILE_IMMEDIATE ? 13 : 14; - switch (i->sType) { + switch (i->getSrc(0)->reg.type) { case TYPE_U8: break; case TYPE_U16: @@ -525,8 +524,7 @@ CodeEmitterNV50::emitForm_ADD(const Instruction *i) setSrcFileBits(i, NV50_OP_ENC_LONG_ALT); setSrc(i, 0, 0); - if (i->predSrc != 1) - setSrc(i, 1, 2); + setSrc(i, 1, 2); if (i->getIndirect(0, 0)) { assert(!i->getIndirect(1, 0)); @@ -619,7 +617,7 @@ void CodeEmitterNV50::emitLOAD(const Instruction *i) { DataFile sf = i->src(0).getFile(); - ASSERTED int32_t offset = i->getSrc(0)->reg.data.offset; + int32_t offset = i->getSrc(0)->reg.data.offset; switch (sf) { case FILE_SHADER_INPUT: @@ -642,9 +640,6 @@ CodeEmitterNV50::emitLOAD(const Instruction *i) code[1] |= 0x04000000; emitLoadStoreSizeCS(i->sType); - - if (i->subOp == NV50_IR_SUBOP_LOAD_LOCKED) - code[1] |= 0x00800000; } else { assert(offset <= (int32_t)(0x1f * typeSizeof(i->sType))); code[0] = 0x10000001; @@ -715,8 +710,6 @@ CodeEmitterNV50::emitSTORE(const Instruction *i) case FILE_MEMORY_SHARED: code[0] = 0x00000001; code[1] = 0xe0000000; - if (i->subOp == NV50_IR_SUBOP_STORE_UNLOCKED) - code[1] |= 0x00800000; switch (typeSizeof(i->dType)) { case 1: code[0] |= offset << 9; @@ -760,10 +753,10 @@ CodeEmitterNV50::emitMOV(const Instruction *i) assert(sf == FILE_GPR || df == FILE_GPR); if (sf == FILE_FLAGS) { - assert(i->flagsSrc >= 0); code[0] = 0x00000001; code[1] = 0x20000000; defId(i->def(0), 2); + srcId(i->src(0), 12); emitFlagsRd(i); } else if (sf == FILE_ADDRESS) { @@ -774,31 +767,26 @@ CodeEmitterNV50::emitMOV(const Instruction *i) emitFlagsRd(i); } else if (df == FILE_FLAGS) { - assert(i->flagsDef >= 0); code[0] = 0x00000001; code[1] = 0xa0000000; + defId(i->def(0), 4); srcId(i->src(0), 9); emitFlagsRd(i); - emitFlagsWr(i); } else if (sf == FILE_IMMEDIATE) { - code[0] = 0x10000001; + code[0] = 0x10008001; code[1] = 0x00000003; emitForm_IMM(i); - - code[0] |= (typeSizeof(i->dType) == 2) ? 0 : 0x00008000; } else { if (i->encSize == 4) { - code[0] = 0x10000000; - code[0] |= (typeSizeof(i->dType) == 2) ? 0 : 0x00008000; - defId(i->def(0), 2); + code[0] = 0x10008000; } else { code[0] = 0x10000001; code[1] = (typeSizeof(i->dType) == 2) ? 0 : 0x04000000; code[1] |= (i->lanes << 14); - setDst(i, 0); emitFlagsRd(i); } + defId(i->def(0), 2); srcId(i->src(0), 9); } if (df == FILE_SHADER_OUTPUT) { @@ -848,7 +836,7 @@ CodeEmitterNV50::emitQUADOP(const Instruction *i, uint8_t lane, uint8_t quOp) emitForm_ADD(i); - if (!i->srcExists(1) || i->predSrc == 1) + if (!i->srcExists(1)) srcId(i->src(0), 32 + 14); } @@ -889,36 +877,12 @@ CodeEmitterNV50::emitPFETCH(const Instruction *i) } void -nv50_interpApply(const FixupEntry *entry, uint32_t *code, const FixupData& data) -{ - int ipa = entry->ipa; - int encSize = entry->reg; - int loc = entry->loc; - - if ((ipa & NV50_IR_INTERP_SAMPLE_MASK) == NV50_IR_INTERP_DEFAULT && - (ipa & NV50_IR_INTERP_MODE_MASK) != NV50_IR_INTERP_FLAT) { - if (data.force_persample_interp) { - if (encSize == 8) - code[loc + 1] |= 1 << 16; - else - code[loc + 0] |= 1 << 24; - } else { - if (encSize == 8) - code[loc + 1] &= ~(1 << 16); - else - code[loc + 0] &= ~(1 << 24); - } - } -} - -void CodeEmitterNV50::emitINTERP(const Instruction *i) { code[0] = 0x80000000; defId(i->def(0), 2); srcAddr8(i->src(0), 16); - setAReg16(i, 0); if (i->encSize != 8 && i->getInterpMode() == NV50_IR_INTERP_FLAT) { code[0] |= 1 << 8; @@ -940,8 +904,6 @@ CodeEmitterNV50::emitINTERP(const Instruction *i) code[0] |= 1; emitFlagsRd(i); } - - addInterp(i->ipa, i->encSize, nv50_interpApply); } void @@ -966,13 +928,11 @@ CodeEmitterNV50::emitMINMAX(const Instruction *i) assert(0); break; } + code[1] |= i->src(0).mod.abs() << 20; + code[1] |= i->src(0).mod.neg() << 26; + code[1] |= i->src(1).mod.abs() << 19; + code[1] |= i->src(1).mod.neg() << 27; } - - code[1] |= i->src(0).mod.abs() << 20; - code[1] |= i->src(0).mod.neg() << 26; - code[1] |= i->src(1).mod.abs() << 19; - code[1] |= i->src(1).mod.neg() << 27; - emitForm_MAD(i); } @@ -1008,26 +968,6 @@ CodeEmitterNV50::emitFMAD(const Instruction *i) } void -CodeEmitterNV50::emitDMAD(const Instruction *i) -{ - const int neg_mul = i->src(0).mod.neg() ^ i->src(1).mod.neg(); - const int neg_add = i->src(2).mod.neg(); - - assert(i->encSize == 8); - assert(!i->saturate); - - code[1] = 0x40000000; - code[0] = 0xe0000000; - - code[1] |= neg_mul << 26; - code[1] |= neg_add << 27; - - roundMode_MAD(i); - - emitForm_MAD(i); -} - -void CodeEmitterNV50::emitFADD(const Instruction *i) { const int neg0 = i->src(0).mod.neg(); @@ -1062,42 +1002,22 @@ CodeEmitterNV50::emitFADD(const Instruction *i) } void -CodeEmitterNV50::emitDADD(const Instruction *i) -{ - const int neg0 = i->src(0).mod.neg(); - const int neg1 = i->src(1).mod.neg() ^ ((i->op == OP_SUB) ? 1 : 0); - - assert(!(i->src(0).mod | i->src(1).mod).abs()); - assert(!i->saturate); - assert(i->encSize == 8); - - code[1] = 0x60000000; - code[0] = 0xe0000000; - - emitForm_ADD(i); - - code[1] |= neg0 << 26; - code[1] |= neg1 << 27; -} - -void CodeEmitterNV50::emitUADD(const Instruction *i) { const int neg0 = i->src(0).mod.neg(); const int neg1 = i->src(1).mod.neg() ^ ((i->op == OP_SUB) ? 1 : 0); - code[0] = 0x20000000; + code[0] = 0x20008000; if (i->src(1).getFile() == FILE_IMMEDIATE) { - code[0] |= (typeSizeof(i->dType) == 2) ? 0 : 0x00008000; code[1] = 0; emitForm_IMM(i); } else if (i->encSize == 8) { + code[0] = 0x20000000; code[1] = (typeSizeof(i->dType) == 2) ? 0 : 0x04000000; emitForm_ADD(i); } else { - code[0] |= (typeSizeof(i->dType) == 2) ? 0 : 0x00008000; emitForm_MUL(i); } assert(!(neg0 && neg1)); @@ -1133,12 +1053,6 @@ CodeEmitterNV50::emitIMUL(const Instruction *i) { code[0] = 0x40000000; - if (i->src(1).getFile() == FILE_IMMEDIATE) { - if (i->sType == TYPE_S16) - code[0] |= 0x8100; - code[1] = 0; - emitForm_IMM(i); - } else if (i->encSize == 8) { code[1] = (i->sType == TYPE_S16) ? (0x8000 | 0x4000) : 0x0000; emitForm_MAD(i); @@ -1181,66 +1095,28 @@ CodeEmitterNV50::emitFMUL(const Instruction *i) } void -CodeEmitterNV50::emitDMUL(const Instruction *i) -{ - const int neg = (i->src(0).mod ^ i->src(1).mod).neg(); - - assert(!i->saturate); - assert(i->encSize == 8); - - code[1] = 0x80000000; - code[0] = 0xe0000000; - - if (neg) - code[1] |= 0x08000000; - - roundMode_CVT(i->rnd); - - emitForm_MAD(i); -} - -void CodeEmitterNV50::emitIMAD(const Instruction *i) { - int mode; code[0] = 0x60000000; - - assert(!i->src(0).mod && !i->src(1).mod && !i->src(2).mod); - if (!isSignedType(i->sType)) - mode = 0; - else if (i->saturate) - mode = 2; + if (isSignedType(i->sType)) + code[1] = i->saturate ? 0x40000000 : 0x20000000; else - mode = 1; + code[1] = 0x00000000; - if (i->src(1).getFile() == FILE_IMMEDIATE) { - code[1] = 0; - emitForm_IMM(i); - code[0] |= (mode & 1) << 8 | (mode & 2) << 14; - if (i->flagsSrc >= 0) { - assert(!(code[0] & 0x10400000)); - assert(SDATA(i->src(i->flagsSrc)).id == 0); - code[0] |= 0x10400000; - } - } else - if (i->encSize == 4) { - emitForm_MUL(i); - code[0] |= (mode & 1) << 8 | (mode & 2) << 14; - if (i->flagsSrc >= 0) { - assert(!(code[0] & 0x10400000)); - assert(SDATA(i->src(i->flagsSrc)).id == 0); - code[0] |= 0x10400000; - } - } else { - code[1] = mode << 29; - emitForm_MAD(i); + int neg1 = i->src(0).mod.neg() ^ i->src(1).mod.neg(); + int neg2 = i->src(2).mod.neg(); - if (i->flagsSrc >= 0) { - // add with carry from $cX - assert(!(code[1] & 0x0c000000) && !i->getPredicate()); - code[1] |= 0xc << 24; - srcId(i->src(i->flagsSrc), 32 + 12); - } + assert(!(neg1 & neg2)); + code[1] |= neg1 << 27; + code[1] |= neg2 << 26; + + emitForm_MAD(i); + + if (i->flagsSrc >= 0) { + // add with carry from $cX + assert(!(code[1] & 0x0c000000) && !i->getPredicate()); + code[1] |= 0xc << 24; + srcId(i->src(i->flagsSrc), 32 + 12); } } @@ -1273,39 +1149,15 @@ CodeEmitterNV50::emitISAD(const Instruction *i) } } -static void -alphatestSet(const FixupEntry *entry, uint32_t *code, const FixupData& data) -{ - int loc = entry->loc; - int enc; - - switch (data.alphatest) { - case PIPE_FUNC_NEVER: enc = 0x0; break; - case PIPE_FUNC_LESS: enc = 0x1; break; - case PIPE_FUNC_EQUAL: enc = 0x2; break; - case PIPE_FUNC_LEQUAL: enc = 0x3; break; - case PIPE_FUNC_GREATER: enc = 0x4; break; - case PIPE_FUNC_NOTEQUAL: enc = 0x5; break; - case PIPE_FUNC_GEQUAL: enc = 0x6; break; - default: - case PIPE_FUNC_ALWAYS: enc = 0xf; break; - } - - code[loc + 1] &= ~(0x1f << 14); - code[loc + 1] |= enc << 14; -} - void CodeEmitterNV50::emitSET(const Instruction *i) { code[0] = 0x30000000; code[1] = 0x60000000; + emitCondCode(i->asCmp()->setCond, i->sType, 32 + 14); + switch (i->sType) { - case TYPE_F64: - code[0] = 0xe0000000; - code[1] = 0xe0000000; - break; case TYPE_F32: code[0] |= 0x80000000; break; case TYPE_S32: code[1] |= 0x0c000000; break; case TYPE_U32: code[1] |= 0x04000000; break; @@ -1315,19 +1167,12 @@ CodeEmitterNV50::emitSET(const Instruction *i) assert(0); break; } - - emitCondCode(i->asCmp()->setCond, i->sType, 32 + 14); - if (i->src(0).mod.neg()) code[1] |= 0x04000000; if (i->src(1).mod.neg()) code[1] |= 0x08000000; if (i->src(0).mod.abs()) code[1] |= 0x00100000; if (i->src(1).mod.abs()) code[1] |= 0x00080000; emitForm_MAD(i); - - if (i->subOp == 1) { - addInterp(0, 0, alphatestSet); - } } void @@ -1412,9 +1257,6 @@ CodeEmitterNV50::emitCVT(const Instruction *i) case TYPE_U32: code[1] = 0x44004000; break; case TYPE_F16: code[1] = 0xc4000000; break; case TYPE_U16: code[1] = 0x44000000; break; - case TYPE_S16: code[1] = 0x44010000; break; - case TYPE_S8: code[1] = 0x44018000; break; - case TYPE_U8: code[1] = 0x44008000; break; default: assert(0); break; @@ -1452,73 +1294,10 @@ CodeEmitterNV50::emitCVT(const Instruction *i) break; } break; - case TYPE_F16: - switch (i->sType) { - case TYPE_F16: code[1] = 0xc0000000; break; - case TYPE_F32: code[1] = 0xc0004000; break; - default: - assert(0); - break; - } - break; case TYPE_S16: - switch (i->sType) { - case TYPE_F32: code[1] = 0x88004000; break; - case TYPE_S32: code[1] = 0x08014000; break; - case TYPE_U32: code[1] = 0x08004000; break; - case TYPE_F16: code[1] = 0x88000000; break; - case TYPE_S16: code[1] = 0x08010000; break; - case TYPE_U16: code[1] = 0x08000000; break; - case TYPE_S8: code[1] = 0x08018000; break; - case TYPE_U8: code[1] = 0x08008000; break; - default: - assert(0); - break; - } - break; case TYPE_U16: - switch (i->sType) { - case TYPE_F32: code[1] = 0x80004000; break; - case TYPE_S32: code[1] = 0x00014000; break; - case TYPE_U32: code[1] = 0x00004000; break; - case TYPE_F16: code[1] = 0x80000000; break; - case TYPE_S16: code[1] = 0x00010000; break; - case TYPE_U16: code[1] = 0x00000000; break; - case TYPE_S8: code[1] = 0x00018000; break; - case TYPE_U8: code[1] = 0x00008000; break; - default: - assert(0); - break; - } - break; case TYPE_S8: - switch (i->sType) { - case TYPE_S32: code[1] = 0x08094000; break; - case TYPE_U32: code[1] = 0x08084000; break; - case TYPE_F16: code[1] = 0x88080000; break; - case TYPE_S16: code[1] = 0x08090000; break; - case TYPE_U16: code[1] = 0x08080000; break; - case TYPE_S8: code[1] = 0x08098000; break; - case TYPE_U8: code[1] = 0x08088000; break; - default: - assert(0); - break; - } - break; case TYPE_U8: - switch (i->sType) { - case TYPE_S32: code[1] = 0x00094000; break; - case TYPE_U32: code[1] = 0x00084000; break; - case TYPE_F16: code[1] = 0x80080000; break; - case TYPE_S16: code[1] = 0x00090000; break; - case TYPE_U16: code[1] = 0x00080000; break; - case TYPE_S8: code[1] = 0x00098000; break; - case TYPE_U8: code[1] = 0x00088000; break; - default: - assert(0); - break; - } - break; default: assert(0); break; @@ -1564,7 +1343,6 @@ CodeEmitterNV50::emitSFnOp(const Instruction *i, uint8_t subOp) if (i->encSize == 4) { assert(i->op == OP_RCP); - assert(!i->saturate); code[0] |= i->src(0).mod.abs() << 15; code[0] |= i->src(0).mod.neg() << 22; emitForm_MUL(i); @@ -1572,10 +1350,6 @@ CodeEmitterNV50::emitSFnOp(const Instruction *i, uint8_t subOp) code[1] = subOp << 29; code[1] |= i->src(0).mod.abs() << 20; code[1] |= i->src(0).mod.neg() << 26; - if (i->saturate) { - assert(subOp == 6 && i->op == OP_EX2); - code[1] |= 1 << 27; - } emitForm_MAD(i); } } @@ -1618,15 +1392,13 @@ CodeEmitterNV50::emitLogicOp(const Instruction *i) emitForm_IMM(i); } else { switch (i->op) { - case OP_AND: code[1] = 0x00000000; break; - case OP_OR: code[1] = 0x00004000; break; - case OP_XOR: code[1] = 0x00008000; break; + case OP_AND: code[1] = 0x04000000; break; + case OP_OR: code[1] = 0x04004000; break; + case OP_XOR: code[1] = 0x04008000; break; default: assert(0); break; } - if (typeSizeof(i->dType) == 4) - code[1] |= 0x04000000; if (i->src(0).mod & Modifier(NV50_IR_MOD_NOT)) code[1] |= 1 << 16; if (i->src(1).mod & Modifier(NV50_IR_MOD_NOT)) @@ -1657,9 +1429,7 @@ CodeEmitterNV50::emitShift(const Instruction *i) emitARL(i, i->getSrc(1)->reg.data.u32 & 0x3f); } else { code[0] = 0x30000001; - code[1] = (i->op == OP_SHR) ? 0xe0000000 : 0xc0000000; - if (typeSizeof(i->dType) == 4) - code[1] |= 0x04000000; + code[1] = (i->op == OP_SHR) ? 0xe4000000 : 0xc4000000; if (i->op == OP_SHR && isSignedType(i->sType)) code[1] |= 1 << 27; @@ -1738,9 +1508,7 @@ CodeEmitterNV50::emitTEX(const TexInstruction *i) code[1] |= (i->tex.mask & 0xc) << 12; if (i->tex.liveOnly) - code[1] |= 1 << 2; - if (i->tex.derivAll) - code[1] |= 1 << 3; + code[1] |= 4; defId(i->def(0), 2); @@ -1901,28 +1669,19 @@ CodeEmitterNV50::emitATOM(const Instruction *i) return; } code[0] = 0xd0000001; - code[1] = 0xc0c00000 | (subOp << 2); + code[1] = 0xe0c00000 | (subOp << 2); if (isSignedType(i->dType)) code[1] |= 1 << 21; // args emitFlagsRd(i); - if (i->subOp == NV50_IR_SUBOP_ATOM_EXCH || - i->subOp == NV50_IR_SUBOP_ATOM_CAS || - i->defExists(0)) { - code[1] |= 0x20000000; - setDst(i, 0); - setSrc(i, 1, 1); - // g[] pointer - code[0] |= i->getSrc(0)->reg.fileIndex << 23; - } else { - srcId(i->src(1), 2); - // g[] pointer - code[0] |= i->getSrc(0)->reg.fileIndex << 16; - } + setDst(i, 0); + setSrc(i, 1, 1); if (i->subOp == NV50_IR_SUBOP_ATOM_CAS) setSrc(i, 2, 2); + // g[] pointer + code[0] |= i->getSrc(0)->reg.fileIndex << 23; srcId(i->getIndirect(0, 0), 9); } @@ -1971,9 +1730,7 @@ CodeEmitterNV50::emitInstruction(Instruction *insn) break; case OP_ADD: case OP_SUB: - if (insn->dType == TYPE_F64) - emitDADD(insn); - else if (isFloatType(insn->dType)) + if (isFloatType(insn->dType)) emitFADD(insn); else if (insn->getDef(0)->reg.file == FILE_ADDRESS) emitAADD(insn); @@ -1981,18 +1738,14 @@ CodeEmitterNV50::emitInstruction(Instruction *insn) emitUADD(insn); break; case OP_MUL: - if (insn->dType == TYPE_F64) - emitDMUL(insn); - else if (isFloatType(insn->dType)) + if (isFloatType(insn->dType)) emitFMUL(insn); else emitIMUL(insn); break; case OP_MAD: case OP_FMA: - if (insn->dType == TYPE_F64) - emitDMAD(insn); - else if (isFloatType(insn->dType)) + if (isFloatType(insn->dType)) emitFMAD(insn); else emitIMAD(insn); @@ -2164,7 +1917,7 @@ CodeEmitterNV50::getMinEncodingSize(const Instruction *i) const { const Target::OpInfo &info = targ->getOpInfo(i); - if (info.minEncSize > 4 || i->dType == TYPE_F64) + if (info.minEncSize > 4) return 8; // check constraints on dst and src operands @@ -2194,9 +1947,8 @@ CodeEmitterNV50::getMinEncodingSize(const Instruction *i) const // check constraints on short MAD if (info.srcNr >= 2 && i->srcExists(2)) { - if (!i->defExists(0) || - (i->flagsSrc >= 0 && SDATA(i->src(i->flagsSrc)).id > 0) || - DDATA(i->def(0)).id != SDATA(i->src(2)).id) + if (!i->defExists(0) || !isFloatType(i->dType) || + i->def(0).rep()->reg.data.id != i->src(2).rep()->reg.data.id) return 8; } @@ -2226,7 +1978,7 @@ makeInstructionLong(Instruction *insn) insn->encSize = 8; for (int i = fn->bbCount - 1; i >= 0 && fn->bbArray[i] != insn->bb; --i) { - fn->bbArray[i]->binPos += adj; + fn->bbArray[i]->binPos += 4; } fn->binSize += adj; insn->bb->binSize += adj; @@ -2278,16 +2030,9 @@ replaceExitWithModifier(Function *func) return; } } - - int adj = epilogue->getExit()->encSize; - epilogue->binSize -= adj; - func->binSize -= adj; + epilogue->binSize -= 8; + func->binSize -= 8; delete_Instruction(func->getProgram(), epilogue->getExit()); - - // There may be BB's that are laid out after the exit block - for (int i = func->bbCount - 1; i >= 0 && func->bbArray[i] != epilogue; --i) { - func->bbArray[i]->binPos -= adj; - } } void @@ -2298,8 +2043,8 @@ CodeEmitterNV50::prepareEmission(Function *func) replaceExitWithModifier(func); } -CodeEmitterNV50::CodeEmitterNV50(Program::Type type, const TargetNV50 *target) : - CodeEmitter(target), progType(type), targNV50(target) +CodeEmitterNV50::CodeEmitterNV50(const TargetNV50 *target) : + CodeEmitter(target), targNV50(target) { targ = target; // specialized code = NULL; @@ -2310,7 +2055,8 @@ CodeEmitterNV50::CodeEmitterNV50(Program::Type type, const TargetNV50 *target) : CodeEmitter * TargetNV50::getCodeEmitter(Program::Type type) { - CodeEmitterNV50 *emit = new CodeEmitterNV50(type, this); + CodeEmitterNV50 *emit = new CodeEmitterNV50(this); + emit->setProgramType(type); return emit; } diff --git a/lib/mesa/src/gallium/drivers/nouveau/codegen/nv50_ir_graph.cpp b/lib/mesa/src/gallium/drivers/nouveau/codegen/nv50_ir_graph.cpp index 3f9967a7b..23414d54a 100644 --- a/lib/mesa/src/gallium/drivers/nouveau/codegen/nv50_ir_graph.cpp +++ b/lib/mesa/src/gallium/drivers/nouveau/codegen/nv50_ir_graph.cpp @@ -77,6 +77,7 @@ const char *Graph::Edge::typeStr() const case FORWARD: return "forward"; case BACK: return "back"; case CROSS: return "cross"; + case DUMMY: return "dummy"; case UNKNOWN: default: return "unk"; @@ -86,8 +87,7 @@ const char *Graph::Edge::typeStr() const Graph::Node::Node(void *priv) : data(priv), in(0), out(0), graph(0), visited(0), - inCount(0), outCount(0), - tag(0) + inCount(0), outCount(0) { // nothing to do } @@ -184,7 +184,7 @@ Graph::Node::reachableBy(const Node *node, const Node *term) const continue; for (EdgeIterator ei = pos->outgoing(); !ei.end(); ei.next()) { - if (ei.getType() == Edge::BACK) + if (ei.getType() == Edge::BACK || ei.getType() == Edge::DUMMY) continue; if (ei.getNode()->visit(seq)) stack.push(ei.getNode()); @@ -287,10 +287,7 @@ private: bb.push(node); - while (bb.getSize() || cross.getSize()) { - if (bb.getSize() == 0) - cross.moveTo(bb); - + while (bb.getSize()) { node = reinterpret_cast<Graph::Node *>(bb.pop().u.p); assert(node); if (!node->visit(sequence)) @@ -301,6 +298,7 @@ private: switch (ei.getType()) { case Graph::Edge::TREE: case Graph::Edge::FORWARD: + case Graph::Edge::DUMMY: if (++(ei.getNode()->tag) == ei.getNode()->incidentCountFwd()) bb.push(ei.getNode()); break; @@ -316,6 +314,9 @@ private: } } nodes[count++] = node; + + if (bb.getSize() == 0) + cross.moveTo(bb); } } @@ -370,6 +371,8 @@ void Graph::classifyDFS(Node *curr, int& seq) for (edge = curr->out; edge; edge = edge->next[0]) { node = edge->target; + if (edge->type == Edge::DUMMY) + continue; if (node->getSequence() == 0) { edge->type = Edge::TREE; @@ -384,6 +387,8 @@ void Graph::classifyDFS(Node *curr, int& seq) for (edge = curr->in; edge; edge = edge->next[1]) { node = edge->origin; + if (edge->type == Edge::DUMMY) + continue; if (node->getSequence() == 0) { edge->type = Edge::TREE; diff --git a/lib/mesa/src/gallium/drivers/nouveau/codegen/nv50_ir_graph.h b/lib/mesa/src/gallium/drivers/nouveau/codegen/nv50_ir_graph.h index fc85e78a5..b0981ff69 100644 --- a/lib/mesa/src/gallium/drivers/nouveau/codegen/nv50_ir_graph.h +++ b/lib/mesa/src/gallium/drivers/nouveau/codegen/nv50_ir_graph.h @@ -47,6 +47,7 @@ public: FORWARD, BACK, CROSS, // e.g. loop break + DUMMY }; Edge(Node *dst, Node *src, Type kind); @@ -146,7 +147,7 @@ public: public: Graph(); - virtual ~Graph(); // does *not* free the nodes (make it an option ?) + ~Graph(); // does *not* free the nodes (make it an option ?) inline Node *getRoot() const { return root; } diff --git a/lib/mesa/src/gallium/drivers/nouveau/codegen/nv50_ir_inlines.h b/lib/mesa/src/gallium/drivers/nouveau/codegen/nv50_ir_inlines.h index 749e6b40b..e465f2484 100644 --- a/lib/mesa/src/gallium/drivers/nouveau/codegen/nv50_ir_inlines.h +++ b/lib/mesa/src/gallium/drivers/nouveau/codegen/nv50_ir_inlines.h @@ -48,7 +48,7 @@ static inline bool isTextureOp(operation op) static inline bool isSurfaceOp(operation op) { - return (op >= OP_SULDB && op <= OP_SULEA) || (op == OP_SUQ); + return (op >= OP_SULDB && op <= OP_SULEA); } static inline unsigned int typeSizeof(DataType ty) @@ -126,7 +126,7 @@ static inline bool isFloatType(DataType ty) static inline bool isSignedIntType(DataType ty) { - return (ty == TYPE_S8 || ty == TYPE_S16 || ty == TYPE_S32 || ty == TYPE_S64); + return (ty == TYPE_S8 || ty == TYPE_S16 || ty == TYPE_S32); } static inline bool isSignedType(DataType ty) @@ -136,7 +136,6 @@ static inline bool isSignedType(DataType ty) case TYPE_U8: case TYPE_U16: case TYPE_U32: - case TYPE_U64: case TYPE_B96: case TYPE_B128: return false; @@ -148,7 +147,6 @@ static inline bool isSignedType(DataType ty) static inline DataType intTypeToSigned(DataType ty) { switch (ty) { - case TYPE_U64: return TYPE_S64; case TYPE_U32: return TYPE_S32; case TYPE_U16: return TYPE_S16; case TYPE_U8: return TYPE_S8; @@ -222,7 +220,7 @@ Instruction *Value::getUniqueInsn() const return (*it)->getInsn(); // should be unreachable and trigger assertion at the end } -#ifndef NDEBUG +#ifdef DEBUG if (reg.data.id < 0) { int n = 0; for (DefCIterator it = defs.begin(); n < 2 && it != defs.end(); ++it) @@ -311,14 +309,14 @@ const FlowInstruction *Instruction::asFlow() const TexInstruction *Instruction::asTex() { - if ((op >= OP_TEX && op <= OP_SULEA) || op == OP_SUQ) + if (op >= OP_TEX && op <= OP_SULEA) return static_cast<TexInstruction *>(this); return NULL; } const TexInstruction *Instruction::asTex() const { - if ((op >= OP_TEX && op <= OP_SULEA) || op == OP_SUQ) + if (op >= OP_TEX && op <= OP_SULEA) return static_cast<const TexInstruction *>(this); return NULL; } @@ -336,7 +334,7 @@ static inline Instruction *cloneForward(Function *ctx, Instruction *obj) // XXX: use a virtual function so we're really really safe ? LValue *Value::asLValue() { - if (reg.file >= FILE_GPR && reg.file <= LAST_REGISTER_FILE) + if (reg.file >= FILE_GPR && reg.file <= FILE_ADDRESS) return static_cast<LValue *>(this); return NULL; } diff --git a/lib/mesa/src/gallium/drivers/nouveau/codegen/nv50_ir_lowering_nv50.cpp b/lib/mesa/src/gallium/drivers/nouveau/codegen/nv50_ir_lowering_nv50.cpp index 2b09855b1..d87cdfff8 100644 --- a/lib/mesa/src/gallium/drivers/nouveau/codegen/nv50_ir_lowering_nv50.cpp +++ b/lib/mesa/src/gallium/drivers/nouveau/codegen/nv50_ir_lowering_nv50.cpp @@ -25,24 +25,6 @@ #include "codegen/nv50_ir_target_nv50.h" -#define NV50_SU_INFO_SIZE_X 0x00 -#define NV50_SU_INFO_SIZE_Y 0x04 -#define NV50_SU_INFO_SIZE_Z 0x08 -#define NV50_SU_INFO_BSIZE 0x0c -#define NV50_SU_INFO_STRIDE_Y 0x10 -#define NV50_SU_INFO_MS_X 0x18 -#define NV50_SU_INFO_MS_Y 0x1c -#define NV50_SU_INFO_TILE_SHIFT_X 0x20 -#define NV50_SU_INFO_TILE_SHIFT_Y 0x24 -#define NV50_SU_INFO_TILE_SHIFT_Z 0x28 -#define NV50_SU_INFO_OFFSET_Z 0x2c - -#define NV50_SU_INFO__STRIDE 0x30 - -#define NV50_SU_INFO_SIZE(i) (0x00 + (i) * 4) -#define NV50_SU_INFO_MS(i) (0x18 + (i) * 4) -#define NV50_SU_INFO_TILE_SHIFT(i) (0x20 + (i) * 4) - namespace nv50_ir { // nv50 doesn't support 32 bit integer multiplication @@ -62,8 +44,6 @@ static bool expandIntegerMUL(BuildUtil *bld, Instruction *mul) { const bool highResult = mul->subOp == NV50_IR_SUBOP_MUL_HIGH; - ImmediateValue src1; - bool src1imm = mul->src(1).getImmediate(src1); DataType fTy; // full type switch (mul->sType) { @@ -92,41 +72,24 @@ expandIntegerMUL(BuildUtil *bld, Instruction *mul) for (int j = 0; j < 4; ++j) t[j] = bld->getSSA(fullSize); - if (isSignedType(mul->sType) && highResult) { + s[0] = mul->getSrc(0); + s[1] = mul->getSrc(1); + + if (isSignedType(mul->sType)) { s[0] = bld->getSSA(fullSize); s[1] = bld->getSSA(fullSize); bld->mkOp1(OP_ABS, mul->sType, s[0], mul->getSrc(0)); bld->mkOp1(OP_ABS, mul->sType, s[1], mul->getSrc(1)); - src1.reg.data.s32 = abs(src1.reg.data.s32); - } else { - s[0] = mul->getSrc(0); - s[1] = mul->getSrc(1); } // split sources into halves i[0] = bld->mkSplit(a, halfSize, s[0]); i[1] = bld->mkSplit(b, halfSize, s[1]); - if (src1imm && (src1.reg.data.u32 & 0xffff0000) == 0) { - i[2] = i[3] = bld->mkOp2(OP_MUL, fTy, t[1], a[1], - bld->mkImm(src1.reg.data.u32 & 0xffff)); - } else { - i[2] = bld->mkOp2(OP_MUL, fTy, t[0], a[0], - src1imm ? bld->mkImm(src1.reg.data.u32 >> 16) : b[1]); - if (src1imm && (src1.reg.data.u32 & 0x0000ffff) == 0) { - i[3] = i[2]; - t[1] = t[0]; - } else { - i[3] = bld->mkOp3(OP_MAD, fTy, t[1], a[1], b[0], t[0]); - } - } + i[2] = bld->mkOp2(OP_MUL, fTy, t[0], a[0], b[1]); + i[3] = bld->mkOp3(OP_MAD, fTy, t[1], a[1], b[0], t[0]); i[7] = bld->mkOp2(OP_SHL, fTy, t[2], t[1], bld->mkImm(halfSize * 8)); - if (src1imm && (src1.reg.data.u32 & 0x0000ffff) == 0) { - i[4] = i[3]; - t[3] = t[2]; - } else { - i[4] = bld->mkOp3(OP_MAD, fTy, t[3], a[0], b[0], t[2]); - } + i[4] = bld->mkOp3(OP_MAD, fTy, t[3], a[0], b[0], t[2]); if (highResult) { Value *c[2]; @@ -223,9 +186,6 @@ expandIntegerMUL(BuildUtil *bld, Instruction *mul) class NV50LegalizePostRA : public Pass { -public: - NV50LegalizePostRA() : r63(NULL) { } - private: virtual bool visit(Function *); virtual bool visit(BasicBlock *); @@ -233,8 +193,6 @@ private: void handlePRERET(FlowInstruction *); void replaceZero(Instruction *); - BuildUtil bld; - LValue *r63; }; @@ -244,8 +202,7 @@ NV50LegalizePostRA::visit(Function *fn) Program *prog = fn->getProgram(); r63 = new_LValue(fn, FILE_GPR); - // GPR units on nv50 are in half-regs - if (prog->maxGPR < 126) + if (prog->maxGPR < 63) r63->reg.data.id = 63; else r63->reg.data.id = 127; @@ -336,7 +293,8 @@ NV50LegalizePostRA::visit(BasicBlock *bb) next = hi; } - if (i->op != OP_PFETCH && i->op != OP_BAR && + if (i->op != OP_MOV && i->op != OP_PFETCH && + i->op != OP_BAR && (!i->defExists(0) || i->def(0).getFile() != FILE_ADDRESS)) replaceZero(i); } @@ -395,8 +353,7 @@ NV50LegalizeSSA::propagateWriteToOutput(Instruction *st) return; for (int s = 0; di->srcExists(s); ++s) - if (di->src(s).getFile() == FILE_IMMEDIATE || - di->src(s).getFile() == FILE_MEMORY_LOCAL) + if (di->src(s).getFile() == FILE_IMMEDIATE) return; if (prog->getType() == Program::TYPE_GEOMETRY) { @@ -646,13 +603,6 @@ private: bool handlePFETCH(Instruction *); bool handleEXPORT(Instruction *); bool handleLOAD(Instruction *); - bool handleLDST(Instruction *); - bool handleMEMBAR(Instruction *); - bool handleSharedATOM(Instruction *); - bool handleSULDP(TexInstruction *); - bool handleSUREDP(TexInstruction *); - bool handleSUSTP(TexInstruction *); - Value *processSurfaceCoords(TexInstruction *); bool handleDIV(Instruction *); bool handleSQRT(Instruction *); @@ -667,9 +617,6 @@ private: bool handleTXL(TexInstruction *); // hate bool handleTXD(TexInstruction *); // these 3 bool handleTXLQ(TexInstruction *); - bool handleTXQ(TexInstruction *); - bool handleSUQ(TexInstruction *); - bool handleBUFQ(Instruction *); bool handleCALL(Instruction *); bool handlePRECONT(Instruction *); @@ -678,8 +625,6 @@ private: void checkPredicate(Instruction *); void loadTexMsInfo(uint32_t off, Value **ms, Value **ms_x, Value **ms_y); void loadMsInfo(Value *ms, Value *s, Value **dx, Value **dy); - Value *loadSuInfo(int slot, uint32_t off); - Value *loadSuInfo16(int slot, uint32_t off); private: const Target *const targ; @@ -717,14 +662,12 @@ void NV50LoweringPreSSA::loadTexMsInfo(uint32_t off, Value **ms, Value **ms_x, Value **ms_y) { // This loads the texture-indexed ms setting from the constant buffer Value *tmp = new_LValue(func, FILE_GPR); - uint8_t b = prog->driver->io.auxCBSlot; + uint8_t b = prog->driver->io.resInfoCBSlot; off += prog->driver->io.suInfoBase; if (prog->getType() > Program::TYPE_VERTEX) off += 16 * 2 * 4; if (prog->getType() > Program::TYPE_GEOMETRY) off += 16 * 2 * 4; - if (prog->getType() > Program::TYPE_FRAGMENT) - off += 16 * 2 * 4; *ms_x = bld.mkLoadv(TYPE_U32, bld.mkSymbol( FILE_MEMORY_CONST, b, TYPE_U32, off + 0), NULL); *ms_y = bld.mkLoadv(TYPE_U32, bld.mkSymbol( @@ -754,24 +697,6 @@ void NV50LoweringPreSSA::loadMsInfo(Value *ms, Value *s, Value **dx, Value **dy) prog->driver->io.msInfoBase + 4), off); } -Value * -NV50LoweringPreSSA::loadSuInfo(int slot, uint32_t off) -{ - uint8_t b = prog->driver->io.auxCBSlot; - off += prog->driver->io.bufInfoBase + slot * NV50_SU_INFO__STRIDE; - return bld.mkLoadv(TYPE_U32, bld.mkSymbol( - FILE_MEMORY_CONST, b, TYPE_U32, off), NULL); -} - -Value * -NV50LoweringPreSSA::loadSuInfo16(int slot, uint32_t off) -{ - uint8_t b = prog->driver->io.auxCBSlot; - off += prog->driver->io.bufInfoBase + slot * NV50_SU_INFO__STRIDE; - return bld.mkLoadv(TYPE_U16, bld.mkSymbol( - FILE_MEMORY_CONST, b, TYPE_U16, off), NULL); -} - bool NV50LoweringPreSSA::handleTEX(TexInstruction *i) { @@ -779,23 +704,6 @@ NV50LoweringPreSSA::handleTEX(TexInstruction *i) const int dref = arg; const int lod = i->tex.target.isShadow() ? (arg + 1) : arg; - /* Only normalize in the non-explicit derivatives case. - */ - if (i->tex.target.isCube() && i->op != OP_TXD) { - Value *src[3], *val; - int c; - for (c = 0; c < 3; ++c) - src[c] = bld.mkOp1v(OP_ABS, TYPE_F32, bld.getSSA(), i->getSrc(c)); - val = bld.getScratch(); - bld.mkOp2(OP_MAX, TYPE_F32, val, src[0], src[1]); - bld.mkOp2(OP_MAX, TYPE_F32, val, src[2], val); - bld.mkOp1(OP_RCP, TYPE_F32, val, val); - for (c = 0; c < 3; ++c) { - i->setSrc(c, bld.mkOp2v(OP_MUL, TYPE_F32, bld.getSSA(), - i->getSrc(c), val)); - } - } - // handle MS, which means looking up the MS params for this texture, and // adjusting the input coordinates to point at the right sample. if (i->tex.target.isMS()) { @@ -923,7 +831,7 @@ NV50LoweringPreSSA::handleTXB(TexInstruction *i) } Value *flags = bld.getScratch(1, FILE_FLAGS); bld.setPosition(cond, true); - bld.mkCvt(OP_CVT, TYPE_U8, flags, TYPE_U32, cond->getDef(0))->flagsDef = 0; + bld.mkCvt(OP_CVT, TYPE_U8, flags, TYPE_U32, cond->getDef(0)); Instruction *tex[4]; for (l = 0; l < 4; ++l) { @@ -1002,18 +910,16 @@ NV50LoweringPreSSA::handleTXD(TexInstruction *i) Instruction *tex; Value *zero = bld.loadImm(bld.getSSA(), 0); int l, c; - const int dim = i->tex.target.getDim() + i->tex.target.isCube(); + const int dim = i->tex.target.getDim(); handleTEX(i); i->op = OP_TEX; // no need to clone dPdx/dPdy later - i->tex.derivAll = true; for (c = 0; c < dim; ++c) crd[c] = bld.getScratch(); bld.mkOp(OP_QUADON, TYPE_NONE, NULL); for (l = 0; l < 4; ++l) { - Value *src[3], *val; // mov coordinates from lane l to all lanes for (c = 0; c < dim; ++c) bld.mkQuadop(0x00, crd[c], l, i->getSrc(c), zero); @@ -1023,24 +929,10 @@ NV50LoweringPreSSA::handleTXD(TexInstruction *i) // add dPdy from lane l to lanes dy for (c = 0; c < dim; ++c) bld.mkQuadop(qOps[l][1], crd[c], l, i->dPdy[c].get(), crd[c]); - // normalize cube coordinates if necessary - if (i->tex.target.isCube()) { - for (c = 0; c < 3; ++c) - src[c] = bld.mkOp1v(OP_ABS, TYPE_F32, bld.getSSA(), crd[c]); - val = bld.getScratch(); - bld.mkOp2(OP_MAX, TYPE_F32, val, src[0], src[1]); - bld.mkOp2(OP_MAX, TYPE_F32, val, src[2], val); - bld.mkOp1(OP_RCP, TYPE_F32, val, val); - for (c = 0; c < 3; ++c) - src[c] = bld.mkOp2v(OP_MUL, TYPE_F32, bld.getSSA(), crd[c], val); - } else { - for (c = 0; c < dim; ++c) - src[c] = crd[c]; - } // texture bld.insert(tex = cloneForward(func, i)); for (c = 0; c < dim; ++c) - tex->setSrc(c, src[c]); + tex->setSrc(c, crd[c]); // save results for (c = 0; i->defExists(c); ++c) { Instruction *mov; @@ -1083,87 +975,6 @@ NV50LoweringPreSSA::handleTXLQ(TexInstruction *i) } bool -NV50LoweringPreSSA::handleTXQ(TexInstruction *i) -{ - Value *ms, *ms_x, *ms_y; - if (i->tex.query == TXQ_DIMS) { - if (i->tex.target.isMS()) { - bld.setPosition(i, true); - loadTexMsInfo(i->tex.r * 4 * 2, &ms, &ms_x, &ms_y); - int d = 0; - if (i->tex.mask & 1) { - bld.mkOp2(OP_SHR, TYPE_U32, i->getDef(d), i->getDef(d), ms_x); - d++; - } - if (i->tex.mask & 2) { - bld.mkOp2(OP_SHR, TYPE_U32, i->getDef(d), i->getDef(d), ms_y); - d++; - } - } - return true; - } - assert(i->tex.query == TXQ_TYPE); - assert(i->tex.mask == 4); - - loadTexMsInfo(i->tex.r * 4 * 2, &ms, &ms_x, &ms_y); - bld.mkOp2(OP_SHL, TYPE_U32, i->getDef(0), bld.loadImm(NULL, 1), ms); - i->bb->remove(i); - - return true; -} - -bool -NV50LoweringPreSSA::handleSUQ(TexInstruction *suq) -{ - const int dim = suq->tex.target.getDim(); - const int arg = dim + (suq->tex.target.isArray() || suq->tex.target.isCube()); - int mask = suq->tex.mask; - int slot = suq->tex.r; - int c, d; - - for (c = 0, d = 0; c < 3; ++c, mask >>= 1) { - if (c >= arg || !(mask & 1)) - continue; - - int offset; - - if (c == 1 && suq->tex.target == TEX_TARGET_1D_ARRAY) { - offset = NV50_SU_INFO_SIZE(2); - } else { - offset = NV50_SU_INFO_SIZE(c); - } - bld.mkMov(suq->getDef(d++), loadSuInfo(slot, offset)); - if (c == 2 && suq->tex.target.isCube()) - bld.mkOp2(OP_DIV, TYPE_U32, suq->getDef(d - 1), suq->getDef(d - 1), - bld.loadImm(NULL, 6)); - } - - if (mask & 1) { - if (suq->tex.target.isMS()) { - Value *ms_x = loadSuInfo(slot, NV50_SU_INFO_MS(0)); - Value *ms_y = loadSuInfo(slot, NV50_SU_INFO_MS(1)); - Value *ms = bld.mkOp2v(OP_ADD, TYPE_U32, bld.getScratch(), ms_x, ms_y); - bld.mkOp2(OP_SHL, TYPE_U32, suq->getDef(d++), bld.loadImm(NULL, 1), ms); - } else { - bld.mkMov(suq->getDef(d++), bld.loadImm(NULL, 1)); - } - } - - bld.remove(suq); - return true; -} - -bool -NV50LoweringPreSSA::handleBUFQ(Instruction *bufq) -{ - bufq->op = OP_MOV; - bufq->setSrc(0, loadSuInfo(bufq->getSrc(0)->reg.fileIndex, NV50_SU_INFO_SIZE_X)); - bufq->setIndirect(0, 0, NULL); - bufq->setIndirect(0, 1, NULL); - return true; -} - -bool NV50LoweringPreSSA::handleSET(Instruction *i) { if (i->dType == TYPE_F32) { @@ -1294,13 +1105,19 @@ NV50LoweringPreSSA::handleRDSV(Instruction *i) break; case SV_NCTAID: case SV_CTAID: - case SV_NTID: { - Value *x = bld.getSSA(2); - bld.mkOp1(OP_LOAD, TYPE_U16, x, - bld.mkSymbol(FILE_MEMORY_SHARED, 0, TYPE_U16, addr)); - bld.mkCvt(OP_CVT, TYPE_U32, def, TYPE_U16, x); + case SV_NTID: + if ((sv == SV_NCTAID && idx >= 2) || + (sv == SV_NTID && idx >= 3)) { + bld.mkMov(def, bld.mkImm(1)); + } else if (sv == SV_CTAID && idx >= 2) { + bld.mkMov(def, bld.mkImm(0)); + } else { + Value *x = bld.getSSA(2); + bld.mkOp1(OP_LOAD, TYPE_U16, x, + bld.mkSymbol(FILE_MEMORY_SHARED, 0, TYPE_U16, addr)); + bld.mkCvt(OP_CVT, TYPE_U32, def, TYPE_U16, x); + } break; - } case SV_TID: if (idx == 0) { bld.mkOp2(OP_AND, TYPE_U32, def, tid, bld.mkImm(0x0000ffff)); @@ -1313,9 +1130,6 @@ NV50LoweringPreSSA::handleRDSV(Instruction *i) bld.mkMov(def, bld.mkImm(0)); } break; - case SV_COMBINED_TID: - bld.mkMov(def, tid); - break; case SV_SAMPLE_POS: { Value *off = new_LValue(func, FILE_ADDRESS); bld.mkOp1(OP_RDSV, TYPE_U32, def, bld.mkSysVal(SV_SAMPLE_INDEX, 0)); @@ -1323,16 +1137,11 @@ NV50LoweringPreSSA::handleRDSV(Instruction *i) bld.mkLoad(TYPE_F32, def, bld.mkSymbol( - FILE_MEMORY_CONST, prog->driver->io.auxCBSlot, + FILE_MEMORY_CONST, prog->driver->io.resInfoCBSlot, TYPE_U32, prog->driver->io.sampleInfoBase + 4 * idx), off); break; } - case SV_THREAD_KILL: - // Not actually supported. But it's implementation-dependent, so we can - // always just say it's not a helper. - bld.mkMov(def, bld.loadImm(NULL, 0)); - break; default: bld.mkFetch(i->getDef(0), i->dType, FILE_SHADER_INPUT, addr, i->getIndirect(0, 0), NULL); @@ -1357,9 +1166,10 @@ NV50LoweringPreSSA::handleDIV(Instruction *i) bool NV50LoweringPreSSA::handleSQRT(Instruction *i) { - bld.setPosition(i, true); - i->op = OP_RSQ; - bld.mkOp1(OP_RCP, i->dType, i->getDef(0), i->getDef(0)); + Instruction *rsq = bld.mkOp1(OP_RSQ, TYPE_F32, + bld.getSSA(), i->getSrc(0)); + i->op = OP_MUL; + i->setSrc(1, rsq->getDef(0)); return true; } @@ -1397,7 +1207,7 @@ NV50LoweringPreSSA::handleEXPORT(Instruction *i) i->setDef(0, new_LValue(func, FILE_GPR)); i->getDef(0)->reg.data.id = id; - prog->maxGPR = MAX2(prog->maxGPR, id * 2); + prog->maxGPR = MAX2(prog->maxGPR, id); } } return true; @@ -1412,15 +1222,6 @@ bool NV50LoweringPreSSA::handleLOAD(Instruction *i) { ValueRef src = i->src(0); - Symbol *sym = i->getSrc(0)->asSym(); - - if (prog->getType() == Program::TYPE_COMPUTE) { - if (sym->inFile(FILE_MEMORY_SHARED) || - sym->inFile(FILE_MEMORY_BUFFER) || - sym->inFile(FILE_MEMORY_GLOBAL)) { - return handleLDST(i); - } - } if (src.isIndirect(1)) { assert(prog->getType() == Program::TYPE_GEOMETRY); @@ -1458,677 +1259,6 @@ NV50LoweringPreSSA::handleLOAD(Instruction *i) } bool -NV50LoweringPreSSA::handleSharedATOM(Instruction *atom) -{ - assert(atom->src(0).getFile() == FILE_MEMORY_SHARED); - - BasicBlock *currBB = atom->bb; - BasicBlock *tryLockBB = atom->bb->splitBefore(atom, false); - BasicBlock *joinBB = atom->bb->splitAfter(atom); - BasicBlock *setAndUnlockBB = new BasicBlock(func); - BasicBlock *failLockBB = new BasicBlock(func); - - bld.setPosition(currBB, true); - assert(!currBB->joinAt); - currBB->joinAt = bld.mkFlow(OP_JOINAT, joinBB, CC_ALWAYS, NULL); - - bld.mkFlow(OP_BRA, tryLockBB, CC_ALWAYS, NULL); - currBB->cfg.attach(&tryLockBB->cfg, Graph::Edge::TREE); - - bld.setPosition(tryLockBB, true); - - Instruction *ld = - bld.mkLoad(TYPE_U32, atom->getDef(0), atom->getSrc(0)->asSym(), - atom->getIndirect(0, 0)); - Value *locked = bld.getSSA(1, FILE_FLAGS); - if (prog->getTarget()->getChipset() >= 0xa0) { - ld->setFlagsDef(1, locked); - ld->subOp = NV50_IR_SUBOP_LOAD_LOCKED; - } else { - bld.mkMov(locked, bld.loadImm(NULL, 2)) - ->flagsDef = 0; - } - - bld.mkFlow(OP_BRA, setAndUnlockBB, CC_LT, locked); - bld.mkFlow(OP_BRA, failLockBB, CC_ALWAYS, NULL); - tryLockBB->cfg.attach(&failLockBB->cfg, Graph::Edge::CROSS); - tryLockBB->cfg.attach(&setAndUnlockBB->cfg, Graph::Edge::TREE); - - tryLockBB->cfg.detach(&joinBB->cfg); - bld.remove(atom); - - bld.setPosition(setAndUnlockBB, true); - Value *stVal; - if (atom->subOp == NV50_IR_SUBOP_ATOM_EXCH) { - // Read the old value, and write the new one. - stVal = atom->getSrc(1); - } else if (atom->subOp == NV50_IR_SUBOP_ATOM_CAS) { - CmpInstruction *set = - bld.mkCmp(OP_SET, CC_EQ, TYPE_U32, bld.getSSA(1, FILE_FLAGS), - TYPE_U32, ld->getDef(0), atom->getSrc(1)); - - Instruction *selp = - bld.mkOp3(OP_SELP, TYPE_U32, bld.getSSA(), atom->getSrc(2), - ld->getDef(0), set->getDef(0)); - stVal = selp->getDef(0); - - handleSELP(selp); - } else { - operation op; - - switch (atom->subOp) { - case NV50_IR_SUBOP_ATOM_ADD: - op = OP_ADD; - break; - case NV50_IR_SUBOP_ATOM_AND: - op = OP_AND; - break; - case NV50_IR_SUBOP_ATOM_OR: - op = OP_OR; - break; - case NV50_IR_SUBOP_ATOM_XOR: - op = OP_XOR; - break; - case NV50_IR_SUBOP_ATOM_MIN: - op = OP_MIN; - break; - case NV50_IR_SUBOP_ATOM_MAX: - op = OP_MAX; - break; - default: - assert(0); - return false; - } - - Instruction *i = - bld.mkOp2(op, atom->dType, bld.getSSA(), ld->getDef(0), - atom->getSrc(1)); - - stVal = i->getDef(0); - } - - Instruction *store = bld.mkStore(OP_STORE, TYPE_U32, atom->getSrc(0)->asSym(), - atom->getIndirect(0, 0), stVal); - if (prog->getTarget()->getChipset() >= 0xa0) { - store->subOp = NV50_IR_SUBOP_STORE_UNLOCKED; - } - - bld.mkFlow(OP_BRA, failLockBB, CC_ALWAYS, NULL); - setAndUnlockBB->cfg.attach(&failLockBB->cfg, Graph::Edge::TREE); - - // Loop until the lock is acquired. - bld.setPosition(failLockBB, true); - bld.mkFlow(OP_BRA, tryLockBB, CC_GEU, locked); - bld.mkFlow(OP_BRA, joinBB, CC_ALWAYS, NULL); - failLockBB->cfg.attach(&tryLockBB->cfg, Graph::Edge::BACK); - failLockBB->cfg.attach(&joinBB->cfg, Graph::Edge::TREE); - - bld.setPosition(joinBB, false); - bld.mkFlow(OP_JOIN, NULL, CC_ALWAYS, NULL)->fixed = 1; - - return true; -} - -bool -NV50LoweringPreSSA::handleLDST(Instruction *i) -{ - ValueRef src = i->src(0); - Symbol *sym = i->getSrc(0)->asSym(); - - if (prog->getType() != Program::TYPE_COMPUTE) { - return true; - } - - // Buffers just map directly to the different global memory spaces - if (sym->inFile(FILE_MEMORY_BUFFER)) { - sym->reg.file = FILE_MEMORY_GLOBAL; - } - - if (sym->inFile(FILE_MEMORY_SHARED)) { - - if (src.isIndirect(0)) { - Value *addr = i->getIndirect(0, 0); - - if (!addr->inFile(FILE_ADDRESS)) { - // Move address from GPR into an address register - Value *new_addr = bld.getSSA(2, FILE_ADDRESS); - bld.mkMov(new_addr, addr); - - i->setIndirect(0, 0, new_addr); - } - } - - if (i->op == OP_ATOM) - handleSharedATOM(i); - } else if (sym->inFile(FILE_MEMORY_GLOBAL)) { - // All global access must be indirect. There are no instruction forms - // with direct access. - Value *addr = i->getIndirect(0, 0); - - Value *offset = bld.loadImm(bld.getSSA(), sym->reg.data.offset); - Value *sum; - if (addr != NULL) - sum = bld.mkOp2v(OP_ADD, TYPE_U32, bld.getSSA(), addr, - offset); - else - sum = offset; - - i->setIndirect(0, 0, sum); - sym->reg.data.offset = 0; - } - - return true; -} - -bool -NV50LoweringPreSSA::handleMEMBAR(Instruction *i) -{ - // For global memory, apparently doing a bunch of reads at different - // addresses forces things to get sufficiently flushed. - if (i->subOp & NV50_IR_SUBOP_MEMBAR_GL) { - uint8_t b = prog->driver->io.auxCBSlot; - Value *base = - bld.mkLoadv(TYPE_U32, bld.mkSymbol(FILE_MEMORY_CONST, b, TYPE_U32, - prog->driver->io.membarOffset), NULL); - Value *physid = bld.mkOp1v(OP_RDSV, TYPE_U32, bld.getSSA(), bld.mkSysVal(SV_PHYSID, 0)); - Value *off = bld.mkOp2v(OP_SHL, TYPE_U32, bld.getSSA(), - bld.mkOp2v(OP_AND, TYPE_U32, bld.getSSA(), - physid, bld.loadImm(NULL, 0x1f)), - bld.loadImm(NULL, 2)); - base = bld.mkOp2v(OP_ADD, TYPE_U32, bld.getSSA(), base, off); - Symbol *gmemMembar = bld.mkSymbol(FILE_MEMORY_GLOBAL, prog->driver->io.gmemMembar, TYPE_U32, 0); - for (int i = 0; i < 8; i++) { - if (i != 0) { - base = bld.mkOp2v(OP_ADD, TYPE_U32, bld.getSSA(), base, bld.loadImm(NULL, 0x100)); - } - bld.mkLoad(TYPE_U32, bld.getSSA(), gmemMembar, base) - ->fixed = 1; - } - } - - // Both global and shared memory barriers also need a regular control bar - // TODO: double-check this is the case - i->op = OP_BAR; - i->subOp = NV50_IR_SUBOP_BAR_SYNC; - i->setSrc(0, bld.mkImm(0u)); - i->setSrc(1, bld.mkImm(0u)); - - return true; -} - -// The type that bests represents how each component can be stored when packed. -static DataType -getPackedType(const TexInstruction::ImgFormatDesc *t, int c) -{ - switch (t->type) { - case FLOAT: return t->bits[c] == 16 ? TYPE_F16 : TYPE_F32; - case UNORM: return t->bits[c] == 8 ? TYPE_U8 : TYPE_U16; - case SNORM: return t->bits[c] == 8 ? TYPE_S8 : TYPE_S16; - case UINT: - return (t->bits[c] == 8 ? TYPE_U8 : - (t->bits[c] <= 16 ? TYPE_U16 : TYPE_U32)); - case SINT: - return (t->bits[c] == 8 ? TYPE_S8 : - (t->bits[c] <= 16 ? TYPE_S16 : TYPE_S32)); - } - return TYPE_NONE; -} - -// The type that the rest of the shader expects to process this image type in. -static DataType -getShaderType(const ImgType type) { - switch (type) { - case FLOAT: - case UNORM: - case SNORM: - return TYPE_F32; - case UINT: - return TYPE_U32; - case SINT: - return TYPE_S32; - default: - assert(!"Impossible type"); - return TYPE_NONE; - } -} - -// Reads the raw coordinates out of the input instruction, and returns a -// single-value coordinate which is what the hardware expects to receive in a -// ld/st op. -Value * -NV50LoweringPreSSA::processSurfaceCoords(TexInstruction *su) -{ - const int slot = su->tex.r; - const int dim = su->tex.target.getDim(); - const int arg = dim + (su->tex.target.isArray() || su->tex.target.isCube()); - - const TexInstruction::ImgFormatDesc *format = su->tex.format; - const uint16_t bytes = (format->bits[0] + format->bits[1] + - format->bits[2] + format->bits[3]) / 8; - uint16_t shift = ffs(bytes) - 1; - - // Buffer sizes don't necessarily fit in 16-bit values - if (su->tex.target == TEX_TARGET_BUFFER) { - return bld.mkOp2v(OP_SHL, TYPE_U32, bld.getSSA(), - su->getSrc(0), bld.loadImm(NULL, (uint32_t)shift)); - } - - // For buffers, we just need the byte offset. And for 2d buffers we want - // the x coordinate in bytes as well. - Value *coords[3] = {}; - for (int i = 0; i < arg; i++) { - Value *src[2]; - bld.mkSplit(src, 2, su->getSrc(i)); - coords[i] = src[0]; - // For 1d-images, we want the y coord to be 0, which it will be here. - if (i == 0) - coords[1] = src[1]; - } - - coords[0] = bld.mkOp2v(OP_SHL, TYPE_U16, bld.getSSA(2), - coords[0], bld.loadImm(NULL, shift)); - - if (su->tex.target.isMS()) { - Value *ms_x = loadSuInfo16(slot, NV50_SU_INFO_MS(0)); - Value *ms_y = loadSuInfo16(slot, NV50_SU_INFO_MS(1)); - coords[0] = bld.mkOp2v(OP_SHL, TYPE_U16, bld.getSSA(2), coords[0], ms_x); - coords[1] = bld.mkOp2v(OP_SHL, TYPE_U16, bld.getSSA(2), coords[1], ms_y); - } - - // If there are more dimensions, we just want the y-offset. But that needs - // to be adjusted up by the y-stride for array images. - if (su->tex.target.isArray() || su->tex.target.isCube()) { - Value *index = coords[dim]; - Value *height = loadSuInfo16(slot, NV50_SU_INFO_STRIDE_Y); - Instruction *mul = bld.mkOp2(OP_MUL, TYPE_U32, bld.getSSA(4), index, height); - mul->sType = TYPE_U16; - Value *muls[2]; - bld.mkSplit(muls, 2, mul->getDef(0)); - if (dim > 1) - coords[1] = bld.mkOp2v(OP_ADD, TYPE_U16, bld.getSSA(2), coords[1], muls[0]); - else - coords[1] = muls[0]; - } - - // 3d is special-cased. Note that a single "slice" of a 3d image may - // also be attached as 2d, so we have to do the same 3d processing for - // 2d as well, just in case. In order to remap a 3d image onto a 2d - // image, we have to retile it "by hand". - if (su->tex.target == TEX_TARGET_3D || su->tex.target == TEX_TARGET_2D) { - Value *z = loadSuInfo16(slot, NV50_SU_INFO_OFFSET_Z); - Value *y_size_aligned = loadSuInfo16(slot, NV50_SU_INFO_STRIDE_Y); - // Add the z coordinate for actual 3d-images - if (dim > 2) - coords[2] = bld.mkOp2v(OP_ADD, TYPE_U16, bld.getSSA(2), z, coords[2]); - else - coords[2] = z; - - // Compute the surface parameters from tile shifts - Value *tile_shift[3]; - Value *tile_size[3]; - Value *tile_mask[3]; - // We only ever use one kind of X-tiling. - tile_shift[0] = bld.loadImm(NULL, (uint16_t)6); - tile_size[0] = bld.loadImm(NULL, (uint16_t)64); - tile_mask[0] = bld.loadImm(NULL, (uint16_t)63); - // Fetch the "real" tiling parameters of the underlying surface - for (int i = 1; i < 3; i++) { - tile_shift[i] = loadSuInfo16(slot, NV50_SU_INFO_TILE_SHIFT(i)); - tile_size[i] = bld.mkOp2v(OP_SHL, TYPE_U16, bld.getSSA(2), bld.loadImm(NULL, (uint16_t)1), tile_shift[i]); - tile_mask[i] = bld.mkOp2v(OP_ADD, TYPE_U16, bld.getSSA(2), tile_size[i], bld.loadImm(NULL, (uint16_t)-1)); - } - - // Compute the location of given coordinate, both inside the tile as - // well as which (linearly-laid out) tile it's in. - Value *coord_in_tile[3]; - Value *tile[3]; - for (int i = 0; i < 3; i++) { - coord_in_tile[i] = bld.mkOp2v(OP_AND, TYPE_U16, bld.getSSA(2), coords[i], tile_mask[i]); - tile[i] = bld.mkOp2v(OP_SHR, TYPE_U16, bld.getSSA(2), coords[i], tile_shift[i]); - } - - // Based on the "real" tiling parameters, compute x/y coordinates in the - // larger surface with 2d tiling that was supplied to the hardware. This - // was determined and verified with the help of the tiling pseudocode in - // the envytools docs. - // - // adj_x = x_coord_in_tile + x_tile * x_tile_size * z_tile_size + - // z_coord_in_tile * x_tile_size - // adj_y = y_coord_in_tile + y_tile * y_tile_size + - // z_tile * y_tile_size * y_tiles - // - // Note: STRIDE_Y = y_tile_size * y_tiles - - coords[0] = bld.mkOp2v( - OP_ADD, TYPE_U16, bld.getSSA(2), - bld.mkOp2v(OP_ADD, TYPE_U16, bld.getSSA(2), - coord_in_tile[0], - bld.mkOp2v(OP_SHL, TYPE_U16, bld.getSSA(2), - tile[0], - bld.mkOp2v(OP_ADD, TYPE_U16, bld.getSSA(2), - tile_shift[2], tile_shift[0]))), - bld.mkOp2v(OP_SHL, TYPE_U16, bld.getSSA(2), - coord_in_tile[2], tile_shift[0])); - - Instruction *mul = bld.mkOp2(OP_MUL, TYPE_U32, bld.getSSA(4), - tile[2], y_size_aligned); - mul->sType = TYPE_U16; - Value *muls[2]; - bld.mkSplit(muls, 2, mul->getDef(0)); - - coords[1] = bld.mkOp2v( - OP_ADD, TYPE_U16, bld.getSSA(2), - muls[0], - bld.mkOp2v(OP_ADD, TYPE_U16, bld.getSSA(2), - coord_in_tile[1], - bld.mkOp2v(OP_SHL, TYPE_U16, bld.getSSA(2), - tile[1], tile_shift[1]))); - } - - return bld.mkOp2v(OP_MERGE, TYPE_U32, bld.getSSA(), coords[0], coords[1]); -} - -// This is largely a copy of NVC0LoweringPass::convertSurfaceFormat, but -// adjusted to make use of 16-bit math where possible. -bool -NV50LoweringPreSSA::handleSULDP(TexInstruction *su) -{ - const int slot = su->tex.r; - assert(!su->getIndirectR()); - - bld.setPosition(su, false); - - const TexInstruction::ImgFormatDesc *format = su->tex.format; - const int bytes = (su->tex.format->bits[0] + - su->tex.format->bits[1] + - su->tex.format->bits[2] + - su->tex.format->bits[3]) / 8; - DataType ty = typeOfSize(bytes); - - Value *coord = processSurfaceCoords(su); - - Value *untypedDst[4] = {}; - Value *typedDst[4] = {}; - int i; - for (i = 0; i < bytes / 4; i++) - untypedDst[i] = bld.getSSA(); - if (bytes < 4) - untypedDst[0] = bld.getSSA(); - - for (i = 0; i < 4; i++) - typedDst[i] = su->getDef(i); - - Instruction *load = bld.mkLoad(ty, NULL, bld.mkSymbol(FILE_MEMORY_GLOBAL, slot, ty, 0), coord); - for (i = 0; i < 4 && untypedDst[i]; i++) - load->setDef(i, untypedDst[i]); - - // Unpack each component into the typed dsts - int bits = 0; - for (int i = 0; i < 4; bits += format->bits[i], i++) { - if (!typedDst[i]) - continue; - - if (i >= format->components) { - if (format->type == FLOAT || - format->type == UNORM || - format->type == SNORM) - bld.loadImm(typedDst[i], i == 3 ? 1.0f : 0.0f); - else - bld.loadImm(typedDst[i], i == 3 ? 1 : 0); - continue; - } - - // Get just that component's data into the relevant place - if (format->bits[i] == 32) - bld.mkMov(typedDst[i], untypedDst[i]); - else if (format->bits[i] == 16) { - // We can always convert directly from the appropriate half of the - // loaded value into the typed result. - Value *src[2]; - bld.mkSplit(src, 2, untypedDst[i / 2]); - bld.mkCvt(OP_CVT, getShaderType(format->type), typedDst[i], - getPackedType(format, i), src[i & 1]); - } - else if (format->bits[i] == 8) { - // Same approach as for 16 bits, but we have to massage the value a - // bit more, since we have to get the appropriate 8 bits from the - // half-register. In all cases, we can CVT from a 8-bit source, so we - // only have to shift when we want the upper 8 bits. - Value *src[2], *shifted; - bld.mkSplit(src, 2, untypedDst[0]); - DataType packedType = getPackedType(format, i); - if (i & 1) - shifted = bld.mkOp2v(OP_SHR, TYPE_U16, bld.getSSA(2), src[!!(i & 2)], bld.loadImm(NULL, (uint16_t)8)); - else - shifted = src[!!(i & 2)]; - - bld.mkCvt(OP_CVT, getShaderType(format->type), typedDst[i], - packedType, shifted); - } - else { - // The options are 10, 11, and 2. Get it into a 32-bit reg, then - // shift/mask. That's where it'll have to end up anyways. For signed, - // we have to make sure to get sign-extension, so we actually have to - // shift *up* first, and then shift down. There's no advantage to - // AND'ing, so we don't. - DataType ty = TYPE_U32; - if (format->type == SNORM || format->type == SINT) { - ty = TYPE_S32; - } - - // Poor man's EXTBF - bld.mkOp2( - OP_SHR, ty, typedDst[i], - bld.mkOp2v(OP_SHL, TYPE_U32, bld.getSSA(), untypedDst[0], bld.loadImm(NULL, 32 - bits - format->bits[i])), - bld.loadImm(NULL, 32 - format->bits[i])); - - // If the stored data is already in the appropriate type, we don't - // have to do anything. Convert to float for the *NORM formats. - if (format->type == UNORM || format->type == SNORM) - bld.mkCvt(OP_CVT, TYPE_F32, typedDst[i], TYPE_U32, typedDst[i]); - } - - // Normalize / convert as necessary - if (format->type == UNORM) - bld.mkOp2(OP_MUL, TYPE_F32, typedDst[i], typedDst[i], bld.loadImm(NULL, 1.0f / ((1 << format->bits[i]) - 1))); - else if (format->type == SNORM) - bld.mkOp2(OP_MUL, TYPE_F32, typedDst[i], typedDst[i], bld.loadImm(NULL, 1.0f / ((1 << (format->bits[i] - 1)) - 1))); - else if (format->type == FLOAT && format->bits[i] < 16) { - // We expect the value to be in the low bits of the register, so we - // have to shift back up. - bld.mkOp2(OP_SHL, TYPE_U32, typedDst[i], typedDst[i], bld.loadImm(NULL, 15 - format->bits[i])); - Value *src[2]; - bld.mkSplit(src, 2, typedDst[i]); - bld.mkCvt(OP_CVT, TYPE_F32, typedDst[i], TYPE_F16, src[0]); - } - } - - if (format->bgra) { - std::swap(typedDst[0], typedDst[2]); - } - - bld.getBB()->remove(su); - return true; -} - -bool -NV50LoweringPreSSA::handleSUREDP(TexInstruction *su) -{ - const int slot = su->tex.r; - const int dim = su->tex.target.getDim(); - const int arg = dim + (su->tex.target.isArray() || su->tex.target.isCube()); - assert(!su->getIndirectR()); - - bld.setPosition(su, false); - - Value *coord = processSurfaceCoords(su); - - // This is guaranteed to be a 32-bit format. So there's nothing to - // pack/unpack. - Instruction *atom = bld.mkOp2( - OP_ATOM, su->dType, su->getDef(0), - bld.mkSymbol(FILE_MEMORY_GLOBAL, slot, TYPE_U32, 0), su->getSrc(arg)); - if (su->subOp == NV50_IR_SUBOP_ATOM_CAS) - atom->setSrc(2, su->getSrc(arg + 1)); - atom->setIndirect(0, 0, coord); - atom->subOp = su->subOp; - - bld.getBB()->remove(su); - return true; -} - -bool -NV50LoweringPreSSA::handleSUSTP(TexInstruction *su) -{ - const int slot = su->tex.r; - const int dim = su->tex.target.getDim(); - const int arg = dim + (su->tex.target.isArray() || su->tex.target.isCube()); - assert(!su->getIndirectR()); - - bld.setPosition(su, false); - - const TexInstruction::ImgFormatDesc *format = su->tex.format; - const int bytes = (su->tex.format->bits[0] + - su->tex.format->bits[1] + - su->tex.format->bits[2] + - su->tex.format->bits[3]) / 8; - DataType ty = typeOfSize(bytes); - - Value *coord = processSurfaceCoords(su); - - // The packed values we will eventually store into memory - Value *untypedDst[4] = {}; - // Each component's packed representation, in 16-bit registers (only used - // where appropriate) - Value *untypedDst16[4] = {}; - // The original values that are being packed - Value *typedDst[4] = {}; - int i; - - for (i = 0; i < bytes / 4; i++) - untypedDst[i] = bld.getSSA(); - for (i = 0; i < format->components; i++) - untypedDst16[i] = bld.getSSA(2); - // Make sure we get at least one of each value allocated for the - // super-narrow formats. - if (bytes < 4) - untypedDst[0] = bld.getSSA(); - if (bytes < 2) - untypedDst16[0] = bld.getSSA(2); - - for (i = 0; i < 4; i++) { - typedDst[i] = bld.getSSA(); - bld.mkMov(typedDst[i], su->getSrc(arg + i)); - } - - if (format->bgra) { - std::swap(typedDst[0], typedDst[2]); - } - - // Pack each component into the untyped dsts. - int bits = 0; - for (int i = 0; i < format->components; bits += format->bits[i], i++) { - // Un-normalize / convert as necessary - if (format->type == UNORM) - bld.mkOp2(OP_MUL, TYPE_F32, typedDst[i], typedDst[i], bld.loadImm(NULL, 1.0f * ((1 << format->bits[i]) - 1))); - else if (format->type == SNORM) - bld.mkOp2(OP_MUL, TYPE_F32, typedDst[i], typedDst[i], bld.loadImm(NULL, 1.0f * ((1 << (format->bits[i] - 1)) - 1))); - - // There is nothing to convert/pack for 32-bit values - if (format->bits[i] == 32) { - bld.mkMov(untypedDst[i], typedDst[i]); - continue; - } - - // The remainder of the cases will naturally want to deal in 16-bit - // registers. We will put these into untypedDst16 and then merge them - // together later. - if (format->type == FLOAT && format->bits[i] < 16) { - bld.mkCvt(OP_CVT, TYPE_F16, untypedDst16[i], TYPE_F32, typedDst[i]); - bld.mkOp2(OP_SHR, TYPE_U16, untypedDst16[i], untypedDst16[i], bld.loadImm(NULL, (uint16_t)(15 - format->bits[i]))); - - // For odd bit sizes, it's easier to pack it into the final - // destination directly. - Value *tmp = bld.getSSA(); - bld.mkCvt(OP_CVT, TYPE_U32, tmp, TYPE_U16, untypedDst16[i]); - if (i == 0) { - untypedDst[0] = tmp; - } else { - bld.mkOp2(OP_SHL, TYPE_U32, tmp, tmp, bld.loadImm(NULL, bits)); - bld.mkOp2(OP_OR, TYPE_U32, untypedDst[0], untypedDst[0], tmp); - } - } else if (format->bits[i] == 16) { - // We can always convert the shader value into the packed value - // directly here - bld.mkCvt(OP_CVT, getPackedType(format, i), untypedDst16[i], - getShaderType(format->type), typedDst[i]); - } else if (format->bits[i] < 16) { - DataType packedType = getPackedType(format, i); - DataType shaderType = getShaderType(format->type); - // We can't convert F32 to U8/S8 directly, so go to U16/S16 first. - if (shaderType == TYPE_F32 && typeSizeof(packedType) == 1) { - packedType = format->type == SNORM ? TYPE_S16 : TYPE_U16; - } - bld.mkCvt(OP_CVT, packedType, untypedDst16[i], shaderType, typedDst[i]); - // TODO: clamp for 10- and 2-bit sizes. Also, due to the oddness of - // the size, it's easier to dump them into a 32-bit value and OR - // everything later. - if (format->bits[i] != 8) { - // Restrict value to the appropriate bits (although maybe supposed - // to clamp instead?) - bld.mkOp2(OP_AND, TYPE_U16, untypedDst16[i], untypedDst16[i], bld.loadImm(NULL, (uint16_t)((1 << format->bits[i]) - 1))); - // And merge into final packed value - Value *tmp = bld.getSSA(); - bld.mkCvt(OP_CVT, TYPE_U32, tmp, TYPE_U16, untypedDst16[i]); - if (i == 0) { - untypedDst[0] = tmp; - } else { - bld.mkOp2(OP_SHL, TYPE_U32, tmp, tmp, bld.loadImm(NULL, bits)); - bld.mkOp2(OP_OR, TYPE_U32, untypedDst[0], untypedDst[0], tmp); - } - } else if (i & 1) { - // Shift the 8-bit value up (so that it can be OR'd later) - bld.mkOp2(OP_SHL, TYPE_U16, untypedDst16[i], untypedDst16[i], bld.loadImm(NULL, (uint16_t)(bits % 16))); - } else if (packedType != TYPE_U8) { - // S8 (or the *16 if converted from float) will all have high bits - // set, so AND them out. - bld.mkOp2(OP_AND, TYPE_U16, untypedDst16[i], untypedDst16[i], bld.loadImm(NULL, (uint16_t)0xff)); - } - } - } - - // OR pairs of 8-bit values together (into the even value) - if (format->bits[0] == 8) { - for (i = 0; i < 2 && untypedDst16[2 * i] && untypedDst16[2 * i + 1]; i++) - bld.mkOp2(OP_OR, TYPE_U16, untypedDst16[2 * i], untypedDst16[2 * i], untypedDst16[2 * i + 1]); - } - - // We'll always want to have at least a 32-bit source register for the store - Instruction *merge = bld.mkOp(OP_MERGE, bytes < 4 ? TYPE_U32 : ty, bld.getSSA(bytes < 4 ? 4 : bytes)); - if (format->bits[0] == 32) { - for (i = 0; i < 4 && untypedDst[i]; i++) - merge->setSrc(i, untypedDst[i]); - } else if (format->bits[0] == 16) { - for (i = 0; i < 4 && untypedDst16[i]; i++) - merge->setSrc(i, untypedDst16[i]); - if (i == 1) - merge->setSrc(i, bld.getSSA(2)); - } else if (format->bits[0] == 8) { - for (i = 0; i < 2 && untypedDst16[2 * i]; i++) - merge->setSrc(i, untypedDst16[2 * i]); - if (i == 1) - merge->setSrc(i, bld.getSSA(2)); - } else { - merge->setSrc(0, untypedDst[0]); - } - - bld.mkStore(OP_STORE, ty, bld.mkSymbol(FILE_MEMORY_GLOBAL, slot, TYPE_U32, 0), coord, merge->getDef(0)); - - bld.getBB()->remove(su); - return true; -} - -bool NV50LoweringPreSSA::handlePFETCH(Instruction *i) { assert(prog->getType() == Program::TYPE_GEOMETRY); @@ -2203,8 +1333,6 @@ NV50LoweringPreSSA::visit(Instruction *i) return handleTXD(i->asTex()); case OP_TXLQ: return handleTXLQ(i->asTex()); - case OP_TXQ: - return handleTXQ(i->asTex()); case OP_EX2: bld.mkOp1(OP_PREEX2, TYPE_F32, i->getDef(0), i->getSrc(0)); i->setSrc(0, i->getDef(0)); @@ -2225,21 +1353,6 @@ NV50LoweringPreSSA::visit(Instruction *i) return handleEXPORT(i); case OP_LOAD: return handleLOAD(i); - case OP_MEMBAR: - return handleMEMBAR(i); - case OP_ATOM: - case OP_STORE: - return handleLDST(i); - case OP_SULDP: - return handleSULDP(i->asTex()); - case OP_SUSTP: - return handleSUSTP(i->asTex()); - case OP_SUREDP: - return handleSUREDP(i->asTex()); - case OP_SUQ: - return handleSUQ(i->asTex()); - case OP_BUFQ: - return handleBUFQ(i); case OP_RDSV: return handleRDSV(i); case OP_WRSV: diff --git a/lib/mesa/src/gallium/drivers/nouveau/codegen/nv50_ir_ssa.cpp b/lib/mesa/src/gallium/drivers/nouveau/codegen/nv50_ir_ssa.cpp index 3d25ad928..2e432349f 100644 --- a/lib/mesa/src/gallium/drivers/nouveau/codegen/nv50_ir_ssa.cpp +++ b/lib/mesa/src/gallium/drivers/nouveau/codegen/nv50_ir_ssa.cpp @@ -87,7 +87,6 @@ DominatorTree::DominatorTree(Graph *cfgraph) : cfg(cfgraph), LABEL(i) = i; SEMI(i) = ANCESTOR(i) = -1; } - assert(i == count); build(); @@ -169,7 +168,7 @@ void DominatorTree::build() do { p = 0; for (v = 1; v < count; ++v) { - nw = &BasicBlock::get(vert[DOM(v)])->dom; + nw = &BasicBlock::get(vert[DOM(v)])->dom;; nv = &BasicBlock::get(vert[v])->dom; if (nw->getGraph() && !nv->getGraph()) { ++p; diff --git a/lib/mesa/src/gallium/drivers/nouveau/codegen/nv50_ir_target_nv50.cpp b/lib/mesa/src/gallium/drivers/nouveau/codegen/nv50_ir_target_nv50.cpp index b9c3746ad..f3ddcaa51 100644 --- a/lib/mesa/src/gallium/drivers/nouveau/codegen/nv50_ir_target_nv50.cpp +++ b/lib/mesa/src/gallium/drivers/nouveau/codegen/nv50_ir_target_nv50.cpp @@ -66,7 +66,7 @@ TargetNV50::getBuiltinOffset(int builtin) const return 0; } -struct nv50_opProperties +struct opProperties { operation op; unsigned int mNeg : 4; @@ -79,7 +79,7 @@ struct nv50_opProperties unsigned int fImm : 3; }; -static const struct nv50_opProperties _initProps[] = +static const struct opProperties _initProps[] = { // neg abs not sat c[] s[], a[], imm { OP_ADD, 0x3, 0x0, 0x0, 0x8, 0x2, 0x1, 0x1, 0x2 }, @@ -99,7 +99,6 @@ static const struct nv50_opProperties _initProps[] = { OP_SET, 0x3, 0x3, 0x0, 0x0, 0x2, 0x1, 0x1, 0x0 }, { OP_PREEX2, 0x1, 0x1, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0 }, { OP_PRESIN, 0x1, 0x1, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0 }, - { OP_EX2, 0x0, 0x0, 0x0, 0x8, 0x0, 0x0, 0x0, 0x0 }, { OP_LG2, 0x1, 0x1, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0 }, { OP_RCP, 0x1, 0x1, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0 }, { OP_RSQ, 0x1, 0x1, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0 }, @@ -111,15 +110,15 @@ void TargetNV50::initOpInfo() { unsigned int i, j; - static const operation commutativeList[] = + static const uint32_t commutative[(OP_LAST + 31) / 32] = { - OP_ADD, OP_MUL, OP_MAD, OP_FMA, OP_AND, OP_OR, OP_XOR, OP_MAX, OP_MIN, - OP_SET_AND, OP_SET_OR, OP_SET_XOR, OP_SET, OP_SELP, OP_SLCT + // ADD,MAD,MUL,AND,OR,XOR,MAX,MIN + 0x0670ca00, 0x0000003f, 0x00000000, 0x00000000 }; - static const operation shortFormList[] = + static const uint32_t shortForm[(OP_LAST + 31) / 32] = { - OP_MOV, OP_ADD, OP_SUB, OP_MUL, OP_MAD, OP_SAD, OP_RCP, OP_LINTERP, - OP_PINTERP, OP_TEX, OP_TXF + // MOV,ADD,SUB,MUL,MAD,SAD,L/PINTERP,RCP,TEX,TXF + 0x00014e40, 0x00000040, 0x00000930, 0x00000000 }; static const operation noDestList[] = { @@ -156,23 +155,19 @@ void TargetNV50::initOpInfo() opInfo[i].hasDest = 1; opInfo[i].vector = (i >= OP_TEX && i <= OP_TEXCSAA); - opInfo[i].commutative = false; /* set below */ + opInfo[i].commutative = (commutative[i / 32] >> (i % 32)) & 1; opInfo[i].pseudo = (i < OP_MOV); opInfo[i].predicate = !opInfo[i].pseudo; opInfo[i].flow = (i >= OP_BRA && i <= OP_JOIN); - opInfo[i].minEncSize = 8; /* set below */ + opInfo[i].minEncSize = (shortForm[i / 32] & (1 << (i % 32))) ? 4 : 8; } - for (i = 0; i < ARRAY_SIZE(commutativeList); ++i) - opInfo[commutativeList[i]].commutative = true; - for (i = 0; i < ARRAY_SIZE(shortFormList); ++i) - opInfo[shortFormList[i]].minEncSize = 4; - for (i = 0; i < ARRAY_SIZE(noDestList); ++i) + for (i = 0; i < sizeof(noDestList) / sizeof(noDestList[0]); ++i) opInfo[noDestList[i]].hasDest = 0; - for (i = 0; i < ARRAY_SIZE(noPredList); ++i) + for (i = 0; i < sizeof(noPredList) / sizeof(noPredList[0]); ++i) opInfo[noPredList[i]].predicate = 0; - for (i = 0; i < ARRAY_SIZE(_initProps); ++i) { - const struct nv50_opProperties *prop = &_initProps[i]; + for (i = 0; i < sizeof(_initProps) / sizeof(_initProps[0]); ++i) { + const struct opProperties *prop = &_initProps[i]; for (int s = 0; s < 3; ++s) { if (prop->mNeg & (1 << s)) @@ -203,16 +198,14 @@ TargetNV50::getFileSize(DataFile file) const { switch (file) { case FILE_NULL: return 0; - case FILE_GPR: return 254; // in 16-bit units ** + case FILE_GPR: return 256; // in 16-bit units ** case FILE_PREDICATE: return 0; case FILE_FLAGS: return 4; case FILE_ADDRESS: return 4; - case FILE_BARRIER: return 0; case FILE_IMMEDIATE: return 0; case FILE_MEMORY_CONST: return 65536; case FILE_SHADER_INPUT: return 0x200; case FILE_SHADER_OUTPUT: return 0x200; - case FILE_MEMORY_BUFFER: return 0xffffffff; case FILE_MEMORY_GLOBAL: return 0xffffffff; case FILE_MEMORY_SHARED: return 16 << 10; case FILE_MEMORY_LOCAL: return 48 << 10; @@ -252,18 +245,15 @@ TargetNV50::getSVAddress(DataFile shaderFile, const Symbol *sym) const return shaderFile == FILE_SHADER_INPUT ? 0x18 : sysvalLocation[sym->reg.data.sv.sv]; case SV_NCTAID: - return sym->reg.data.sv.index >= 2 ? 0x10 : 0x8 + 2 * sym->reg.data.sv.index; + return 0x8 + 2 * sym->reg.data.sv.index; case SV_CTAID: - return sym->reg.data.sv.index >= 2 ? 0x12 : 0xc + 2 * sym->reg.data.sv.index; + return 0xc + 2 * sym->reg.data.sv.index; case SV_NTID: return 0x2 + 2 * sym->reg.data.sv.index; case SV_TID: - case SV_COMBINED_TID: return 0; case SV_SAMPLE_POS: return 0; /* sample position is handled differently */ - case SV_THREAD_KILL: - return 0; default: return sysvalLocation[sym->reg.data.sv.sv]; } @@ -278,16 +268,6 @@ TargetNV50::insnCanLoad(const Instruction *i, int s, { DataFile sf = ld->src(0).getFile(); - // immediate 0 can be represented by GPR $r63/$r127 - // this does not work with global memory ld/st/atom - if (sf == FILE_IMMEDIATE && ld->getSrc(0)->reg.data.u64 == 0) - return (!i->isPseudo() && - !i->asTex() && - i->op != OP_EXPORT && - i->op != OP_STORE && - ((i->op != OP_ATOM && i->op != OP_LOAD) || - i->src(0).getFile() != FILE_MEMORY_GLOBAL)); - if (sf == FILE_IMMEDIATE && (i->predSrc >= 0 || i->flagsDef >= 0)) return false; if (s >= opInfo[i->op].srcNr) @@ -362,11 +342,8 @@ TargetNV50::insnCanLoad(const Instruction *i, int s, ldSize = typeSizeof(ld->dType); } - if (sf == FILE_IMMEDIATE) { - if (ldSize == 2 && (i->op == OP_AND || i->op == OP_OR || i->op == OP_XOR)) - return false; - return ldSize <= 4; - } + if (sf == FILE_IMMEDIATE) + return true; // Check if memory access is encodable: @@ -402,29 +379,12 @@ TargetNV50::insnCanLoad(const Instruction *i, int s, } bool -TargetNV50::insnCanLoadOffset(const Instruction *i, int s, int offset) const -{ - if (!i->src(s).isIndirect(0)) - return true; - offset += i->src(s).get()->reg.data.offset; - if (i->op == OP_LOAD || i->op == OP_STORE || i->op == OP_ATOM) { - // There are some restrictions in theory, but in practice they're never - // going to be hit. However offsets on global/shared memory are just - // plain not supported. - return i->src(s).getFile() != FILE_MEMORY_GLOBAL && - i->src(s).getFile() != FILE_MEMORY_SHARED; - } - return offset >= 0 && offset <= (int32_t)(127 * i->src(s).get()->reg.size); -} - -bool TargetNV50::isAccessSupported(DataFile file, DataType ty) const { if (ty == TYPE_B96 || ty == TYPE_NONE) return false; if (typeSizeof(ty) > 4) - return (file == FILE_MEMORY_LOCAL) || (file == FILE_MEMORY_GLOBAL) || - (file == FILE_MEMORY_BUFFER); + return (file == FILE_MEMORY_LOCAL) || (file == FILE_MEMORY_GLOBAL); return true; } @@ -453,8 +413,6 @@ TargetNV50::isOpSupported(operation op, DataType ty) const case OP_EXTBF: case OP_EXIT: // want exit modifier instead (on NOP if required) case OP_MEMBAR: - case OP_SHLADD: - case OP_XMAD: return false; case OP_SAD: return ty == TYPE_S32; @@ -496,7 +454,7 @@ TargetNV50::isModSupported(const Instruction *insn, int s, Modifier mod) const return false; } } - if (s >= opInfo[insn->op].srcNr || s >= 3) + if (s >= 3) return false; return (mod & Modifier(opInfo[insn->op].srcMods[s])) == mod; } @@ -529,7 +487,6 @@ int TargetNV50::getLatency(const Instruction *i) const switch (i->src(0).getFile()) { case FILE_MEMORY_LOCAL: case FILE_MEMORY_GLOBAL: - case FILE_MEMORY_BUFFER: return 100; // really 400 to 800 default: return 22; @@ -595,24 +552,21 @@ recordLocation(uint16_t *locs, uint8_t *masks, } void -TargetNV50::parseDriverInfo(const struct nv50_ir_prog_info *info, - const struct nv50_ir_prog_info_out *info_out) +TargetNV50::parseDriverInfo(const struct nv50_ir_prog_info *info) { unsigned int i; - for (i = 0; i < info_out->numOutputs; ++i) - recordLocation(sysvalLocation, NULL, &info_out->out[i]); - for (i = 0; i < info_out->numInputs; ++i) - recordLocation(sysvalLocation, &wposMask, &info_out->in[i]); - for (i = 0; i < info_out->numSysVals; ++i) - recordLocation(sysvalLocation, NULL, &info_out->sv[i]); + for (i = 0; i < info->numOutputs; ++i) + recordLocation(sysvalLocation, NULL, &info->out[i]); + for (i = 0; i < info->numInputs; ++i) + recordLocation(sysvalLocation, &wposMask, &info->in[i]); + for (i = 0; i < info->numSysVals; ++i) + recordLocation(sysvalLocation, NULL, &info->sv[i]); if (sysvalLocation[SV_POSITION] >= 0x200) { // not assigned by driver, but we need it internally wposMask = 0x8; sysvalLocation[SV_POSITION] = 0; } - - Target::parseDriverInfo(info, info_out); } } // namespace nv50_ir diff --git a/lib/mesa/src/gallium/drivers/nouveau/codegen/nv50_ir_target_nv50.h b/lib/mesa/src/gallium/drivers/nouveau/codegen/nv50_ir_target_nv50.h index caf66b269..0cbf180d0 100644 --- a/lib/mesa/src/gallium/drivers/nouveau/codegen/nv50_ir_target_nv50.h +++ b/lib/mesa/src/gallium/drivers/nouveau/codegen/nv50_ir_target_nv50.h @@ -42,13 +42,10 @@ public: virtual void getBuiltinCode(const uint32_t **code, uint32_t *size) const; - virtual void parseDriverInfo(const struct nv50_ir_prog_info *, - const struct nv50_ir_prog_info_out *); + virtual void parseDriverInfo(const struct nv50_ir_prog_info *); virtual bool insnCanLoad(const Instruction *insn, int s, const Instruction *ld) const; - virtual bool insnCanLoadOffset(const Instruction *insn, int s, - int offset) const; virtual bool isOpSupported(operation, DataType) const; virtual bool isAccessSupported(DataFile, DataType) const; virtual bool isModSupported(const Instruction *, int s, Modifier) const; diff --git a/lib/mesa/src/gallium/drivers/nouveau/codegen/nv50_ir_target_nvc0.h b/lib/mesa/src/gallium/drivers/nouveau/codegen/nv50_ir_target_nvc0.h index 7808164f4..3c5c74804 100644 --- a/lib/mesa/src/gallium/drivers/nouveau/codegen/nv50_ir_target_nvc0.h +++ b/lib/mesa/src/gallium/drivers/nouveau/codegen/nv50_ir_target_nvc0.h @@ -31,15 +31,11 @@ namespace nv50_ir { #define NVC0_BUILTIN_COUNT 4 -struct nvc0_opProperties; - class TargetNVC0 : public Target { public: TargetNVC0(unsigned int chipset); - void initProps(const struct nvc0_opProperties *props, int size); - virtual CodeEmitter *getCodeEmitter(Program::Type); CodeEmitter *createCodeEmitterNVC0(Program::Type); @@ -52,8 +48,6 @@ public: virtual bool insnCanLoad(const Instruction *insn, int s, const Instruction *ld) const; - virtual bool insnCanLoadOffset(const Instruction *insn, int s, - int offset) const; virtual bool isOpSupported(operation, DataType) const; virtual bool isAccessSupported(DataFile, DataType) const; virtual bool isModSupported(const Instruction *, int s, Modifier) const; diff --git a/lib/mesa/src/gallium/drivers/nouveau/codegen/nv50_ir_util.cpp b/lib/mesa/src/gallium/drivers/nouveau/codegen/nv50_ir_util.cpp index dc4ebd51a..d26acb304 100644 --- a/lib/mesa/src/gallium/drivers/nouveau/codegen/nv50_ir_util.cpp +++ b/lib/mesa/src/gallium/drivers/nouveau/codegen/nv50_ir_util.cpp @@ -297,8 +297,8 @@ void BitSet::fill(uint32_t val) unsigned int i; for (i = 0; i < (size + 31) / 32; ++i) data[i] = val; - if (val && i) - data[i - 1] &= (1 << (size % 32)) - 1; + if (val) + data[i] &= ~(0xffffffff << (size % 32)); // BE ? } void BitSet::setOr(BitSet *pA, BitSet *pB) @@ -311,12 +311,12 @@ void BitSet::setOr(BitSet *pA, BitSet *pB) } } -int BitSet::findFreeRange(unsigned int count, unsigned int max) const +int BitSet::findFreeRange(unsigned int count) const { const uint32_t m = (1 << count) - 1; - int pos = max; + int pos = size; unsigned int i; - const unsigned int end = (max + 31) / 32; + const unsigned int end = (size + 31) / 32; if (count == 1) { for (i = 0; i < end; ++i) { @@ -365,15 +365,9 @@ int BitSet::findFreeRange(unsigned int count, unsigned int max) const } } } - - // If we couldn't find a position, we can have a left-over -1 in pos. Make - // sure to abort in such a case. - if (pos < 0) - return -1; - pos += i * 32; - return ((pos + count) <= max) ? pos : -1; + return ((pos + count) <= size) ? pos : -1; } void BitSet::print() const diff --git a/lib/mesa/src/gallium/drivers/nouveau/codegen/nv50_ir_util.h b/lib/mesa/src/gallium/drivers/nouveau/codegen/nv50_ir_util.h index b1766f482..fa2c4804a 100644 --- a/lib/mesa/src/gallium/drivers/nouveau/codegen/nv50_ir_util.h +++ b/lib/mesa/src/gallium/drivers/nouveau/codegen/nv50_ir_util.h @@ -36,14 +36,14 @@ #include "util/u_inlines.h" #include "util/u_memory.h" -#define ERROR(args...) _debug_printf("ERROR: " args) -#define WARN(args...) _debug_printf("WARNING: " args) -#define INFO(args...) _debug_printf(args) +#define ERROR(args...) debug_printf("ERROR: " args) +#define WARN(args...) debug_printf("WARNING: " args) +#define INFO(args...) debug_printf(args) #define INFO_DBG(m, f, args...) \ do { \ if (m & NV50_IR_DEBUG_##f) \ - _debug_printf(args); \ + debug_printf(args); \ } while(0) #define FATAL(args...) \ @@ -94,11 +94,7 @@ public: virtual void reset() { assert(0); } // only for graph iterators }; -#if __cplusplus >= 201103L -typedef std::unique_ptr<Iterator> IteratorRef; -#else typedef std::auto_ptr<Iterator> IteratorRef; -#endif class ManipIterator : public Iterator { @@ -145,7 +141,7 @@ public: #define DLLIST_EMPTY(__list) ((__list)->next == (__list)) #define DLLIST_FOR_EACH(list, it) \ - for (DLList::Iterator it = (list)->iterator(); !(it).end(); (it).next()) + for (DLList::Iterator (it) = (list)->iterator(); !(it).end(); (it).next()) class DLList { @@ -203,7 +199,7 @@ public: virtual void erase(); virtual bool insert(void *data); - // move item to another list, no consistency with its iterators though + // move item to a another list, no consistency with its iterators though void moveToList(DLList&); private: @@ -539,11 +535,8 @@ public: return data[i / 32] & (((1 << n) - 1) << (i % 32)); } - // Find a range of count (<= 32) clear bits aligned to roundup_pow2(count). - int findFreeRange(unsigned int count, unsigned int max) const; - inline int findFreeRange(unsigned int count) const { - return findFreeRange(count, size); - } + // Find a range of size (<= 32) clear bits aligned to roundup_pow2(size). + int findFreeRange(unsigned int size) const; BitSet& operator|=(const BitSet&); |