diff options
-rw-r--r-- | sys/arch/sparc64/sparc64/locore.s | 1303 |
1 files changed, 1 insertions, 1302 deletions
diff --git a/sys/arch/sparc64/sparc64/locore.s b/sys/arch/sparc64/sparc64/locore.s index 57d646f3774..820ca6311ac 100644 --- a/sys/arch/sparc64/sparc64/locore.s +++ b/sys/arch/sparc64/sparc64/locore.s @@ -1,4 +1,4 @@ -/* $OpenBSD: locore.s,v 1.172 2013/06/13 19:11:13 kettenis Exp $ */ +/* $OpenBSD: locore.s,v 1.173 2013/06/13 19:33:04 kettenis Exp $ */ /* $NetBSD: locore.s,v 1.137 2001/08/13 06:10:10 jdolecek Exp $ */ /* @@ -300,69 +300,6 @@ _C_LABEL(sun4u_mtp_patch_end): movz %icc, %o0, %sp .endm -/* - * The following routines allow fpu use in the kernel. - * - * They allocate a stack frame and use all local regs. Extra - * local storage can be requested by setting the siz parameter, - * and can be accessed at %sp+CC64FSZ. - */ - - .macro ENABLE_FPU siz - save %sp, -(CC64FSZ), %sp; ! Allocate a stack frame - GET_CPUINFO_VA(%l1); - add %fp, BIAS-FS_SIZE, %l0; ! Allocate a fpstate - ldx [%l1 + CI_FPPROC], %l2; ! Load fpproc - andn %l0, BLOCK_SIZE, %l0; ! Align it - clr %l3; ! NULL fpstate - brz,pt %l2, 1f; ! fpproc == NULL? - add %l0, -BIAS-CC64FSZ-(\siz), %sp; ! Set proper %sp - ldx [%l2 + P_FPSTATE], %l3; - brz,pn %l3, 1f; ! Make sure we have an fpstate - mov %l3, %o0; - call _C_LABEL(savefpstate); ! Save the old fpstate -1: - set EINTSTACK-BIAS, %l4; ! Are we on intr stack? - cmp %sp, %l4; - bgu,pt %xcc, 1f; - set INTSTACK-BIAS, %l4; - cmp %sp, %l4; - blu %xcc, 1f; -0: - sethi %hi(_C_LABEL(proc0)), %l4; ! Yes, use proc0 - ba,pt %xcc, 2f; ! XXXX needs to change to CPUs idle proc - or %l4, %lo(_C_LABEL(proc0)), %l5; -1: - GET_CURPROC(%l5); ! Use curproc - brz,pn %l5, 0b; nop; ! If curproc is NULL need to use proc0 -2: - ldx [%l5 + P_FPSTATE], %l6; ! Save old fpstate - stx %l0, [%l5 + P_FPSTATE]; ! Insert new fpstate - stx %l5, [%l1 + CI_FPPROC]; ! Set new fpproc - wr %g0, FPRS_FEF, %fprs ! Enable FPU - .endm - -/* - * We've saved our possible fpstate, now disable the fpu - * and continue with life. - */ - - .macro RESTORE_FPU -#ifdef DEBUG - ldx [%l5 + P_FPSTATE], %l7 - cmp %l7, %l0 - tnz 1 -#endif /* DEBUG */ - stx %l2, [%l1 + CI_FPPROC] ! Restore old fproc - wr %g0, 0, %fprs ! Disable fpu - brz,pt %l3, 1f ! Skip if no fpstate - stx %l6, [%l5 + P_FPSTATE] ! Restore old fpstate - - call _C_LABEL(loadfpstate) ! Reload orig fpstate - mov %l3, %o0 -1: - .endm - .data .globl _C_LABEL(data_start) @@ -6481,22 +6418,6 @@ ENTRY(pseg_set) mov 1, %o0 -/* - * Use block_disable to turn off block instructions for - * bcopy/memset - */ - .data - .align 8 - .globl block_disable -block_disable: .xword 1 - .text - -#if 0 -#define ASI_STORE ASI_BLK_COMMIT_P -#else /* 0 */ -#define ASI_STORE ASI_BLK_P -#endif /* 0 */ - #if 1 /* * kernel bcopy/memcpy @@ -6582,11 +6503,6 @@ Lovbcopy: * Plenty of data to copy, so try to do it optimally. */ 2: -#if 0 - ! If it is big enough, use VIS instructions - bge Lbcopy_block - nop -#endif /* 0 */ Lbcopy_fancy: !! @@ -6910,1134 +6826,10 @@ Lbcopy_finish: Lbcopy_complete: ret restore %i1, %g0, %o0 - -#if 1 - -/* - * Block copy. Useful for >256 byte copies. - * - * Benchmarking has shown this always seems to be slower than - * the integer version, so this is disabled. Maybe someone will - * figure out why sometime. - */ - -Lbcopy_block: - sethi %hi(block_disable), %o3 - ldx [ %o3 + %lo(block_disable) ], %o3 - brnz,pn %o3, Lbcopy_fancy - !! Make sure our trap table is installed - set _C_LABEL(trapbase), %o5 - rdpr %tba, %o3 - sub %o3, %o5, %o3 - brnz,pn %o3, Lbcopy_fancy ! No, then don't use block load/store - nop -#ifdef _KERNEL -/* - * Kernel: - * - * Here we use VIS instructions to do a block clear of a page. - * But before we can do that we need to save and enable the FPU. - * The last owner of the FPU registers is fpproc, and - * fpproc->p_md.md_fpstate is the current fpstate. If that's not - * null, call savefpstate() with it to store our current fp state. - * - * Next, allocate an aligned fpstate on the stack. We will properly - * nest calls on a particular stack so this should not be a problem. - * - * Now we grab either curproc (or if we're on the interrupt stack - * proc0). We stash its existing fpstate in a local register and - * put our new fpstate in curproc->p_md.md_fpstate. We point - * fpproc at curproc (or proc0) and enable the FPU. - * - * If we are ever preempted, our FPU state will be saved in our - * fpstate. Then, when we're resumed and we take an FPDISABLED - * trap, the trap handler will be able to fish our FPU state out - * of curproc (or proc0). - * - * On exiting this routine we undo the damage: restore the original - * pointer to curproc->p_md.md_fpstate, clear our fpproc, and disable - * the MMU. - * - * - * Register usage, Kernel only (after save): - * - * %i0 src - * %i1 dest - * %i2 size - * - * %l0 XXXX DEBUG old fpstate - * %l1 fpproc (hi bits only) - * %l2 orig fpproc - * %l3 orig fpstate - * %l5 curproc - * %l6 old fpstate - * - * Register ussage, Kernel and user: - * - * %g1 src (retval for memcpy) - * - * %o0 src - * %o1 dest - * %o2 end dest - * %o5 last safe fetchable address - */ - - ENABLE_FPU 0 - mov %i0, %o0 ! Src addr. - mov %i1, %o1 ! Store our dest ptr here. - mov %i2, %o2 ! Len counter -#endif /* _KERNEL */ - - !! - !! First align the output to a 64-bit entity - !! - - mov %o1, %g1 ! memcpy retval - add %o0, %o2, %o5 ! End of source block - - andn %o0, 7, %o3 ! Start of block - dec %o5 - fzero %f0 - - andn %o5, BLOCK_ALIGN, %o5 ! Last safe addr. - ldd [%o3], %f2 ! Load 1st word - - dec 8, %o3 ! Move %o3 1 word back - btst 1, %o1 - bz 4f - - mov -7, %o4 ! Lowest src addr possible - alignaddr %o0, %o4, %o4 ! Base addr for load. - - cmp %o3, %o4 - be,pt %xcc, 1f ! Already loaded? - mov %o4, %o3 - fmovd %f2, %f0 ! No. Shift - ldd [%o3+8], %f2 ! And load -1: - - faligndata %f0, %f2, %f4 ! Isolate 1st byte - - stda %f4, [%o1] ASI_FL8_P ! Store 1st byte - inc 1, %o1 ! Update address - inc 1, %o0 - dec 1, %o2 -4: - btst 2, %o1 - bz 4f - - mov -6, %o4 ! Calculate src - 6 - alignaddr %o0, %o4, %o4 ! calculate shift mask and dest. - - cmp %o3, %o4 ! Addresses same? - be,pt %xcc, 1f - mov %o4, %o3 - fmovd %f2, %f0 ! Shuffle data - ldd [%o3+8], %f2 ! Load word 0 -1: - faligndata %f0, %f2, %f4 ! Move 1st short low part of f8 - - stda %f4, [%o1] ASI_FL16_P ! Store 1st short - dec 2, %o2 - inc 2, %o1 - inc 2, %o0 -4: - brz,pn %o2, Lbcopy_blockfinish ! XXXX - - btst 4, %o1 - bz 4f - - mov -4, %o4 - alignaddr %o0, %o4, %o4 ! calculate shift mask and dest. - - cmp %o3, %o4 ! Addresses same? - beq,pt %xcc, 1f - mov %o4, %o3 - fmovd %f2, %f0 ! Shuffle data - ldd [%o3+8], %f2 ! Load word 0 -1: - faligndata %f0, %f2, %f4 ! Move 1st short low part of f8 - - st %f5, [%o1] ! Store word - dec 4, %o2 - inc 4, %o1 - inc 4, %o0 -4: - brz,pn %o2, Lbcopy_blockfinish ! XXXX - !! - !! We are now 32-bit aligned in the dest. - !! -Lbcopy_block_common: - - mov -0, %o4 - alignaddr %o0, %o4, %o4 ! base - shift - - cmp %o3, %o4 ! Addresses same? - beq,pt %xcc, 1f - mov %o4, %o3 - fmovd %f2, %f0 ! Shuffle data - ldd [%o3+8], %f2 ! Load word 0 -1: - add %o3, 8, %o0 ! now use %o0 for src - - !! - !! Continue until our dest is block aligned - !! -Lbcopy_block_aligned8: -1: - brz %o2, Lbcopy_blockfinish - btst BLOCK_ALIGN, %o1 ! Block aligned? - bz 1f - - faligndata %f0, %f2, %f4 ! Generate result - deccc 8, %o2 - ble,pn %icc, Lbcopy_blockfinish ! Should never happen - fmovd %f4, %f48 - - std %f4, [%o1] ! Store result - inc 8, %o1 - - fmovd %f2, %f0 - inc 8, %o0 - ba,pt %xcc, 1b ! Not yet. - ldd [%o0], %f2 ! Load next part -Lbcopy_block_aligned64: -1: - -/* - * 64-byte aligned -- ready for block operations. - * - * Here we have the destination block aligned, but the - * source pointer may not be. Sub-word alignment will - * be handled by faligndata instructions. But the source - * can still be potentially aligned to 8 different words - * in our 64-bit block, so we have 8 different copy routines. - * - * Once we figure out our source alignment, we branch - * to the appropriate copy routine, which sets up the - * alignment for faligndata and loads (sets) the values - * into the source registers and does the copy loop. - * - * When were down to less than 1 block to store, we - * exit the copy loop and execute cleanup code. - * - * Block loads and stores are not properly interlocked. - * Stores save one reg/cycle, so you can start overwriting - * registers the cycle after the store is issued. - * - * Block loads require a block load to a different register - * block or a membar #Sync before accessing the loaded - * data. - * - * Since the faligndata instructions may be offset as far - * as 7 registers into a block (if you are shifting source - * 7 -> dest 0), you need 3 source register blocks for full - * performance: one you are copying, one you are loading, - * and one for interlocking. Otherwise, we would need to - * sprinkle the code with membar #Sync and lose the advantage - * of running faligndata in parallel with block stores. This - * means we are fetching a full 128 bytes ahead of the stores. - * We need to make sure the prefetch does not inadvertently - * cross a page boundary and fault on data that we will never - * store. - * - */ -#if 1 - and %o0, BLOCK_ALIGN, %o3 - srax %o3, 3, %o3 ! Isolate the offset - - brz %o3, L100 ! 0->0 - btst 4, %o3 - bnz %xcc, 4f - btst 2, %o3 - bnz %xcc, 2f - btst 1, %o3 - ba,pt %xcc, L101 ! 0->1 - nop /* XXX spitfire bug */ -2: - bz %xcc, L102 ! 0->2 - nop - ba,pt %xcc, L103 ! 0->3 - nop /* XXX spitfire bug */ -4: - bnz %xcc, 2f - btst 1, %o3 - bz %xcc, L104 ! 0->4 - nop - ba,pt %xcc, L105 ! 0->5 - nop /* XXX spitfire bug */ -2: - bz %xcc, L106 ! 0->6 - nop - ba,pt %xcc, L107 ! 0->7 - nop /* XXX spitfire bug */ -#else /* 1 */ - - !! - !! Isolate the word offset, which just happens to be - !! the slot in our jump table. - !! - !! This is 6 instructions, most of which cannot be paired, - !! which is about the same as the above version. - !! - rd %pc, %o4 -1: - and %o0, 0x31, %o3 - add %o3, (Lbcopy_block_jmp - 1b), %o3 - jmpl %o4 + %o3, %g0 - nop - - !! - !! Jump table - !! - -Lbcopy_block_jmp: - ba,a,pt %xcc, L100 - nop - ba,a,pt %xcc, L101 - nop - ba,a,pt %xcc, L102 - nop - ba,a,pt %xcc, L103 - nop - ba,a,pt %xcc, L104 - nop - ba,a,pt %xcc, L105 - nop - ba,a,pt %xcc, L106 - nop - ba,a,pt %xcc, L107 - nop -#endif /* 1 */ - - !! - !! Source is block aligned. - !! - !! Just load a block and go. - !! -L100: -#ifdef RETURN_NAME - sethi %hi(1f), %g1 - ba,pt %icc, 2f - or %g1, %lo(1f), %g1 -1: - .asciz "L100" - .align 8 -2: -#endif /* RETURN_NAME */ - fmovd %f0 , %f62 - ldda [%o0] ASI_BLK_P, %f0 - inc BLOCK_SIZE, %o0 - cmp %o0, %o5 - bleu,a,pn %icc, 3f - ldda [%o0] ASI_BLK_P, %f16 - ba,pt %icc, 3f - membar #Sync - - .align 32 ! ICache align. -3: - faligndata %f62, %f0, %f32 - inc BLOCK_SIZE, %o0 - faligndata %f0, %f2, %f34 - dec BLOCK_SIZE, %o2 - faligndata %f2, %f4, %f36 - cmp %o0, %o5 - faligndata %f4, %f6, %f38 - faligndata %f6, %f8, %f40 - faligndata %f8, %f10, %f42 - faligndata %f10, %f12, %f44 - brlez,pn %o2, Lbcopy_blockdone - faligndata %f12, %f14, %f46 - - bleu,a,pn %icc, 2f - ldda [%o0] ASI_BLK_P, %f48 - membar #Sync -2: - stda %f32, [%o1] ASI_STORE - faligndata %f14, %f16, %f32 - inc BLOCK_SIZE, %o0 - faligndata %f16, %f18, %f34 - inc BLOCK_SIZE, %o1 - faligndata %f18, %f20, %f36 - dec BLOCK_SIZE, %o2 - faligndata %f20, %f22, %f38 - cmp %o0, %o5 - faligndata %f22, %f24, %f40 - faligndata %f24, %f26, %f42 - faligndata %f26, %f28, %f44 - brlez,pn %o2, Lbcopy_blockdone - faligndata %f28, %f30, %f46 - - bleu,a,pn %icc, 2f - ldda [%o0] ASI_BLK_P, %f0 - membar #Sync -2: - stda %f32, [%o1] ASI_STORE - faligndata %f30, %f48, %f32 - inc BLOCK_SIZE, %o0 - faligndata %f48, %f50, %f34 - inc BLOCK_SIZE, %o1 - faligndata %f50, %f52, %f36 - dec BLOCK_SIZE, %o2 - faligndata %f52, %f54, %f38 - cmp %o0, %o5 - faligndata %f54, %f56, %f40 - faligndata %f56, %f58, %f42 - faligndata %f58, %f60, %f44 - brlez,pn %o2, Lbcopy_blockdone - faligndata %f60, %f62, %f46 - bleu,a,pn %icc, 2f - ldda [%o0] ASI_BLK_P, %f16 ! Increment is at top - membar #Sync -2: - stda %f32, [%o1] ASI_STORE - ba 3b - inc BLOCK_SIZE, %o1 - - !! - !! Source at BLOCK_ALIGN+8 - !! - !! We need to load almost 1 complete block by hand. - !! -L101: -#ifdef RETURN_NAME - sethi %hi(1f), %g1 - ba,pt %icc, 2f - or %g1, %lo(1f), %g1 -1: - .asciz "L101" - .align 8 -2: -#endif /* RETURN_NAME */ -! fmovd %f0, %f0 ! Hoist fmovd - ldd [%o0], %f2 - inc 8, %o0 - ldd [%o0], %f4 - inc 8, %o0 - ldd [%o0], %f6 - inc 8, %o0 - ldd [%o0], %f8 - inc 8, %o0 - ldd [%o0], %f10 - inc 8, %o0 - ldd [%o0], %f12 - inc 8, %o0 - ldd [%o0], %f14 - inc 8, %o0 - - cmp %o0, %o5 - bleu,a,pn %icc, 3f - ldda [%o0] ASI_BLK_P, %f16 - membar #Sync -3: - faligndata %f0, %f2, %f32 - inc BLOCK_SIZE, %o0 - faligndata %f2, %f4, %f34 - cmp %o0, %o5 - faligndata %f4, %f6, %f36 - dec BLOCK_SIZE, %o2 - faligndata %f6, %f8, %f38 - faligndata %f8, %f10, %f40 - faligndata %f10, %f12, %f42 - faligndata %f12, %f14, %f44 - bleu,a,pn %icc, 2f - ldda [%o0] ASI_BLK_P, %f48 - membar #Sync -2: - brlez,pn %o2, Lbcopy_blockdone - faligndata %f14, %f16, %f46 - - stda %f32, [%o1] ASI_STORE - - faligndata %f16, %f18, %f32 - inc BLOCK_SIZE, %o0 - faligndata %f18, %f20, %f34 - inc BLOCK_SIZE, %o1 - faligndata %f20, %f22, %f36 - cmp %o0, %o5 - faligndata %f22, %f24, %f38 - dec BLOCK_SIZE, %o2 - faligndata %f24, %f26, %f40 - faligndata %f26, %f28, %f42 - faligndata %f28, %f30, %f44 - bleu,a,pn %icc, 2f - ldda [%o0] ASI_BLK_P, %f0 - membar #Sync -2: - brlez,pn %o2, Lbcopy_blockdone - faligndata %f30, %f48, %f46 - - stda %f32, [%o1] ASI_STORE - - faligndata %f48, %f50, %f32 - inc BLOCK_SIZE, %o0 - faligndata %f50, %f52, %f34 - inc BLOCK_SIZE, %o1 - faligndata %f52, %f54, %f36 - cmp %o0, %o5 - faligndata %f54, %f56, %f38 - dec BLOCK_SIZE, %o2 - faligndata %f56, %f58, %f40 - faligndata %f58, %f60, %f42 - faligndata %f60, %f62, %f44 - bleu,a,pn %icc, 2f - ldda [%o0] ASI_BLK_P, %f16 - membar #Sync -2: - brlez,pn %o2, Lbcopy_blockdone - faligndata %f62, %f0, %f46 - - stda %f32, [%o1] ASI_STORE - ba 3b - inc BLOCK_SIZE, %o1 - - !! - !! Source at BLOCK_ALIGN+16 - !! - !! We need to load 6 doubles by hand. - !! -L102: -#ifdef RETURN_NAME - sethi %hi(1f), %g1 - ba,pt %icc, 2f - or %g1, %lo(1f), %g1 -1: - .asciz "L102" - .align 8 -2: -#endif /* RETURN_NAME */ - ldd [%o0], %f4 - inc 8, %o0 - fmovd %f0, %f2 ! Hoist fmovd - ldd [%o0], %f6 - inc 8, %o0 - - ldd [%o0], %f8 - inc 8, %o0 - ldd [%o0], %f10 - inc 8, %o0 - ldd [%o0], %f12 - inc 8, %o0 - ldd [%o0], %f14 - inc 8, %o0 - - cmp %o0, %o5 - bleu,a,pn %icc, 3f - ldda [%o0] ASI_BLK_P, %f16 - membar #Sync -3: - faligndata %f2, %f4, %f32 - inc BLOCK_SIZE, %o0 - faligndata %f4, %f6, %f34 - cmp %o0, %o5 - faligndata %f6, %f8, %f36 - dec BLOCK_SIZE, %o2 - faligndata %f8, %f10, %f38 - faligndata %f10, %f12, %f40 - faligndata %f12, %f14, %f42 - bleu,a,pn %icc, 2f - ldda [%o0] ASI_BLK_P, %f48 - membar #Sync -2: - faligndata %f14, %f16, %f44 - - brlez,pn %o2, Lbcopy_blockdone - faligndata %f16, %f18, %f46 - - stda %f32, [%o1] ASI_STORE - - faligndata %f18, %f20, %f32 - inc BLOCK_SIZE, %o0 - faligndata %f20, %f22, %f34 - inc BLOCK_SIZE, %o1 - faligndata %f22, %f24, %f36 - cmp %o0, %o5 - faligndata %f24, %f26, %f38 - dec BLOCK_SIZE, %o2 - faligndata %f26, %f28, %f40 - faligndata %f28, %f30, %f42 - bleu,a,pn %icc, 2f - ldda [%o0] ASI_BLK_P, %f0 - membar #Sync -2: - faligndata %f30, %f48, %f44 - brlez,pn %o2, Lbcopy_blockdone - faligndata %f48, %f50, %f46 - - stda %f32, [%o1] ASI_STORE - - faligndata %f50, %f52, %f32 - inc BLOCK_SIZE, %o0 - faligndata %f52, %f54, %f34 - inc BLOCK_SIZE, %o1 - faligndata %f54, %f56, %f36 - cmp %o0, %o5 - faligndata %f56, %f58, %f38 - dec BLOCK_SIZE, %o2 - faligndata %f58, %f60, %f40 - faligndata %f60, %f62, %f42 - bleu,a,pn %icc, 2f - ldda [%o0] ASI_BLK_P, %f16 - membar #Sync -2: - faligndata %f62, %f0, %f44 - brlez,pn %o2, Lbcopy_blockdone - faligndata %f0, %f2, %f46 - - stda %f32, [%o1] ASI_STORE - ba 3b - inc BLOCK_SIZE, %o1 - !! - !! Source at BLOCK_ALIGN+24 - !! - !! We need to load 5 doubles by hand. - !! -L103: -#ifdef RETURN_NAME - sethi %hi(1f), %g1 - ba,pt %icc, 2f - or %g1, %lo(1f), %g1 -1: - .asciz "L103" - .align 8 -2: -#endif /* RETURN_NAME */ - fmovd %f0, %f4 - ldd [%o0], %f6 - inc 8, %o0 - ldd [%o0], %f8 - inc 8, %o0 - ldd [%o0], %f10 - inc 8, %o0 - ldd [%o0], %f12 - inc 8, %o0 - ldd [%o0], %f14 - inc 8, %o0 - - cmp %o0, %o5 - bleu,a,pn %icc, 2f - ldda [%o0] ASI_BLK_P, %f16 - membar #Sync -2: - inc BLOCK_SIZE, %o0 -3: - faligndata %f4, %f6, %f32 - cmp %o0, %o5 - faligndata %f6, %f8, %f34 - dec BLOCK_SIZE, %o2 - faligndata %f8, %f10, %f36 - faligndata %f10, %f12, %f38 - faligndata %f12, %f14, %f40 - bleu,a,pn %icc, 2f - ldda [%o0] ASI_BLK_P, %f48 - membar #Sync -2: - faligndata %f14, %f16, %f42 - inc BLOCK_SIZE, %o0 - faligndata %f16, %f18, %f44 - brlez,pn %o2, Lbcopy_blockdone - faligndata %f18, %f20, %f46 - - stda %f32, [%o1] ASI_STORE - - faligndata %f20, %f22, %f32 - cmp %o0, %o5 - faligndata %f22, %f24, %f34 - dec BLOCK_SIZE, %o2 - faligndata %f24, %f26, %f36 - inc BLOCK_SIZE, %o1 - faligndata %f26, %f28, %f38 - faligndata %f28, %f30, %f40 - ble,a,pn %icc, 2f - ldda [%o0] ASI_BLK_P, %f0 - membar #Sync -2: - faligndata %f30, %f48, %f42 - inc BLOCK_SIZE, %o0 - faligndata %f48, %f50, %f44 - brlez,pn %o2, Lbcopy_blockdone - faligndata %f50, %f52, %f46 - - stda %f32, [%o1] ASI_STORE - - faligndata %f52, %f54, %f32 - cmp %o0, %o5 - faligndata %f54, %f56, %f34 - dec BLOCK_SIZE, %o2 - faligndata %f56, %f58, %f36 - faligndata %f58, %f60, %f38 - inc BLOCK_SIZE, %o1 - faligndata %f60, %f62, %f40 - bleu,a,pn %icc, 2f - ldda [%o0] ASI_BLK_P, %f16 - membar #Sync -2: - faligndata %f62, %f0, %f42 - inc BLOCK_SIZE, %o0 - faligndata %f0, %f2, %f44 - brlez,pn %o2, Lbcopy_blockdone - faligndata %f2, %f4, %f46 - - stda %f32, [%o1] ASI_STORE - ba 3b - inc BLOCK_SIZE, %o1 - - !! - !! Source at BLOCK_ALIGN+32 - !! - !! We need to load 4 doubles by hand. - !! -L104: -#ifdef RETURN_NAME - sethi %hi(1f), %g1 - ba,pt %icc, 2f - or %g1, %lo(1f), %g1 -1: - .asciz "L104" - .align 8 -2: -#endif /* RETURN_NAME */ - fmovd %f0, %f6 - ldd [%o0], %f8 - inc 8, %o0 - ldd [%o0], %f10 - inc 8, %o0 - ldd [%o0], %f12 - inc 8, %o0 - ldd [%o0], %f14 - inc 8, %o0 - - cmp %o0, %o5 - bleu,a,pn %icc, 2f - ldda [%o0] ASI_BLK_P, %f16 - membar #Sync -2: - inc BLOCK_SIZE, %o0 -3: - faligndata %f6, %f8, %f32 - cmp %o0, %o5 - faligndata %f8, %f10, %f34 - dec BLOCK_SIZE, %o2 - faligndata %f10, %f12, %f36 - faligndata %f12, %f14, %f38 - bleu,a,pn %icc, 2f - ldda [%o0] ASI_BLK_P, %f48 - membar #Sync -2: - faligndata %f14, %f16, %f40 - faligndata %f16, %f18, %f42 - inc BLOCK_SIZE, %o0 - faligndata %f18, %f20, %f44 - brlez,pn %o2, Lbcopy_blockdone - faligndata %f20, %f22, %f46 - - stda %f32, [%o1] ASI_STORE - - faligndata %f22, %f24, %f32 - cmp %o0, %o5 - faligndata %f24, %f26, %f34 - faligndata %f26, %f28, %f36 - inc BLOCK_SIZE, %o1 - faligndata %f28, %f30, %f38 - bleu,a,pn %icc, 2f - ldda [%o0] ASI_BLK_P, %f0 - membar #Sync -2: - faligndata %f30, %f48, %f40 - dec BLOCK_SIZE, %o2 - faligndata %f48, %f50, %f42 - inc BLOCK_SIZE, %o0 - faligndata %f50, %f52, %f44 - brlez,pn %o2, Lbcopy_blockdone - faligndata %f52, %f54, %f46 - - stda %f32, [%o1] ASI_STORE - - faligndata %f54, %f56, %f32 - cmp %o0, %o5 - faligndata %f56, %f58, %f34 - faligndata %f58, %f60, %f36 - inc BLOCK_SIZE, %o1 - faligndata %f60, %f62, %f38 - bleu,a,pn %icc, 2f - ldda [%o0] ASI_BLK_P, %f16 - membar #Sync -2: - faligndata %f62, %f0, %f40 - dec BLOCK_SIZE, %o2 - faligndata %f0, %f2, %f42 - inc BLOCK_SIZE, %o0 - faligndata %f2, %f4, %f44 - brlez,pn %o2, Lbcopy_blockdone - faligndata %f4, %f6, %f46 - - stda %f32, [%o1] ASI_STORE - ba 3b - inc BLOCK_SIZE, %o1 - - !! - !! Source at BLOCK_ALIGN+40 - !! - !! We need to load 3 doubles by hand. - !! -L105: -#ifdef RETURN_NAME - sethi %hi(1f), %g1 - ba,pt %icc, 2f - or %g1, %lo(1f), %g1 -1: - .asciz "L105" - .align 8 -2: -#endif /* RETURN_NAME */ - fmovd %f0, %f8 - ldd [%o0], %f10 - inc 8, %o0 - ldd [%o0], %f12 - inc 8, %o0 - ldd [%o0], %f14 - inc 8, %o0 - - cmp %o0, %o5 - bleu,a,pn %icc, 2f - ldda [%o0] ASI_BLK_P, %f16 - membar #Sync -2: - inc BLOCK_SIZE, %o0 -3: - faligndata %f8, %f10, %f32 - cmp %o0, %o5 - faligndata %f10, %f12, %f34 - faligndata %f12, %f14, %f36 - bleu,a,pn %icc, 2f - ldda [%o0] ASI_BLK_P, %f48 - membar #Sync -2: - faligndata %f14, %f16, %f38 - dec BLOCK_SIZE, %o2 - faligndata %f16, %f18, %f40 - inc BLOCK_SIZE, %o0 - faligndata %f18, %f20, %f42 - faligndata %f20, %f22, %f44 - brlez,pn %o2, Lbcopy_blockdone - faligndata %f22, %f24, %f46 - - stda %f32, [%o1] ASI_STORE - - faligndata %f24, %f26, %f32 - cmp %o0, %o5 - faligndata %f26, %f28, %f34 - dec BLOCK_SIZE, %o2 - faligndata %f28, %f30, %f36 - bleu,a,pn %icc, 2f - ldda [%o0] ASI_BLK_P, %f0 - membar #Sync -2: - faligndata %f30, %f48, %f38 - inc BLOCK_SIZE, %o1 - faligndata %f48, %f50, %f40 - inc BLOCK_SIZE, %o0 - faligndata %f50, %f52, %f42 - faligndata %f52, %f54, %f44 - brlez,pn %o2, Lbcopy_blockdone - faligndata %f54, %f56, %f46 - - stda %f32, [%o1] ASI_STORE - - faligndata %f56, %f58, %f32 - cmp %o0, %o5 - faligndata %f58, %f60, %f34 - dec BLOCK_SIZE, %o2 - faligndata %f60, %f62, %f36 - bleu,a,pn %icc, 2f - ldda [%o0] ASI_BLK_P, %f16 - membar #Sync -2: - faligndata %f62, %f0, %f38 - inc BLOCK_SIZE, %o1 - faligndata %f0, %f2, %f40 - inc BLOCK_SIZE, %o0 - faligndata %f2, %f4, %f42 - faligndata %f4, %f6, %f44 - brlez,pn %o2, Lbcopy_blockdone - faligndata %f6, %f8, %f46 - - stda %f32, [%o1] ASI_STORE - ba 3b - inc BLOCK_SIZE, %o1 - - - !! - !! Source at BLOCK_ALIGN+48 - !! - !! We need to load 2 doubles by hand. - !! -L106: -#ifdef RETURN_NAME - sethi %hi(1f), %g1 - ba,pt %icc, 2f - or %g1, %lo(1f), %g1 -1: - .asciz "L106" - .align 8 -2: -#endif /* RETURN_NAME */ - fmovd %f0, %f10 - ldd [%o0], %f12 - inc 8, %o0 - ldd [%o0], %f14 - inc 8, %o0 - - cmp %o0, %o5 - bleu,a,pn %icc, 2f - ldda [%o0] ASI_BLK_P, %f16 - membar #Sync -2: - inc BLOCK_SIZE, %o0 -3: - faligndata %f10, %f12, %f32 - cmp %o0, %o5 - faligndata %f12, %f14, %f34 - bleu,a,pn %icc, 2f - ldda [%o0] ASI_BLK_P, %f48 - membar #Sync -2: - faligndata %f14, %f16, %f36 - dec BLOCK_SIZE, %o2 - faligndata %f16, %f18, %f38 - inc BLOCK_SIZE, %o0 - faligndata %f18, %f20, %f40 - faligndata %f20, %f22, %f42 - faligndata %f22, %f24, %f44 - brlez,pn %o2, Lbcopy_blockdone - faligndata %f24, %f26, %f46 - - stda %f32, [%o1] ASI_STORE - - faligndata %f26, %f28, %f32 - cmp %o0, %o5 - faligndata %f28, %f30, %f34 - bleu,a,pn %icc, 2f - ldda [%o0] ASI_BLK_P, %f0 - membar #Sync -2: - faligndata %f30, %f48, %f36 - dec BLOCK_SIZE, %o2 - faligndata %f48, %f50, %f38 - inc BLOCK_SIZE, %o1 - faligndata %f50, %f52, %f40 - faligndata %f52, %f54, %f42 - inc BLOCK_SIZE, %o0 - faligndata %f54, %f56, %f44 - brlez,pn %o2, Lbcopy_blockdone - faligndata %f56, %f58, %f46 - - stda %f32, [%o1] ASI_STORE - - faligndata %f58, %f60, %f32 - cmp %o0, %o5 - faligndata %f60, %f62, %f34 - bleu,a,pn %icc, 2f - ldda [%o0] ASI_BLK_P, %f16 - membar #Sync -2: - faligndata %f62, %f0, %f36 - dec BLOCK_SIZE, %o2 - faligndata %f0, %f2, %f38 - inc BLOCK_SIZE, %o1 - faligndata %f2, %f4, %f40 - faligndata %f4, %f6, %f42 - inc BLOCK_SIZE, %o0 - faligndata %f6, %f8, %f44 - brlez,pn %o2, Lbcopy_blockdone - faligndata %f8, %f10, %f46 - - stda %f32, [%o1] ASI_STORE - ba 3b - inc BLOCK_SIZE, %o1 - - - !! - !! Source at BLOCK_ALIGN+56 - !! - !! We need to load 1 double by hand. - !! -L107: -#ifdef RETURN_NAME - sethi %hi(1f), %g1 - ba,pt %icc, 2f - or %g1, %lo(1f), %g1 -1: - .asciz "L107" - .align 8 -2: -#endif /* RETURN_NAME */ - fmovd %f0, %f12 - ldd [%o0], %f14 - inc 8, %o0 - - cmp %o0, %o5 - bleu,a,pn %icc, 2f - ldda [%o0] ASI_BLK_P, %f16 - membar #Sync -2: - inc BLOCK_SIZE, %o0 -3: - faligndata %f12, %f14, %f32 - cmp %o0, %o5 - bleu,a,pn %icc, 2f - ldda [%o0] ASI_BLK_P, %f48 - membar #Sync -2: - faligndata %f14, %f16, %f34 - dec BLOCK_SIZE, %o2 - faligndata %f16, %f18, %f36 - inc BLOCK_SIZE, %o0 - faligndata %f18, %f20, %f38 - faligndata %f20, %f22, %f40 - faligndata %f22, %f24, %f42 - faligndata %f24, %f26, %f44 - brlez,pn %o2, Lbcopy_blockdone - faligndata %f26, %f28, %f46 - - stda %f32, [%o1] ASI_STORE - - faligndata %f28, %f30, %f32 - cmp %o0, %o5 - bleu,a,pn %icc, 2f - ldda [%o0] ASI_BLK_P, %f0 - membar #Sync -2: - faligndata %f30, %f48, %f34 - dec BLOCK_SIZE, %o2 - faligndata %f48, %f50, %f36 - inc BLOCK_SIZE, %o1 - faligndata %f50, %f52, %f38 - faligndata %f52, %f54, %f40 - inc BLOCK_SIZE, %o0 - faligndata %f54, %f56, %f42 - faligndata %f56, %f58, %f44 - brlez,pn %o2, Lbcopy_blockdone - faligndata %f58, %f60, %f46 - - stda %f32, [%o1] ASI_STORE - - faligndata %f60, %f62, %f32 - cmp %o0, %o5 - bleu,a,pn %icc, 2f - ldda [%o0] ASI_BLK_P, %f16 - membar #Sync -2: - faligndata %f62, %f0, %f34 - dec BLOCK_SIZE, %o2 - faligndata %f0, %f2, %f36 - inc BLOCK_SIZE, %o1 - faligndata %f2, %f4, %f38 - faligndata %f4, %f6, %f40 - inc BLOCK_SIZE, %o0 - faligndata %f6, %f8, %f42 - faligndata %f8, %f10, %f44 - - brlez,pn %o2, Lbcopy_blockdone - faligndata %f10, %f12, %f46 - - stda %f32, [%o1] ASI_STORE - ba 3b - inc BLOCK_SIZE, %o1 - -Lbcopy_blockdone: - inc BLOCK_SIZE, %o2 ! Fixup our overcommit - membar #Sync ! Finish any pending loads -#define FINISH_REG(f) \ - deccc 8, %o2; \ - bl,a Lbcopy_blockfinish; \ - fmovd f, %f48; \ - std f, [%o1]; \ - inc 8, %o1 - - FINISH_REG(%f32) - FINISH_REG(%f34) - FINISH_REG(%f36) - FINISH_REG(%f38) - FINISH_REG(%f40) - FINISH_REG(%f42) - FINISH_REG(%f44) - FINISH_REG(%f46) - FINISH_REG(%f48) -#undef FINISH_REG - !! - !! The low 3 bits have the sub-word bits needed to be - !! stored [because (x-8)&0x7 == x]. - !! -Lbcopy_blockfinish: - brz,pn %o2, 2f ! 100% complete? - fmovd %f48, %f4 - cmp %o2, 8 ! Exactly 8 bytes? - bz,a,pn %xcc, 2f - std %f4, [%o1] - - btst 4, %o2 ! Word store? - bz %xcc, 1f - nop - st %f4, [%o1] - inc 4, %o1 -1: - btst 2, %o2 - fzero %f0 - bz 1f - - mov -6, %o4 - alignaddr %o1, %o4, %g0 - - faligndata %f0, %f4, %f8 - - stda %f8, [%o1] ASI_FL16_P ! Store short - inc 2, %o1 -1: - btst 1, %o2 ! Byte aligned? - bz 2f - - mov -7, %o0 ! Calculate dest - 7 - alignaddr %o1, %o0, %g0 ! Calculate shift mask and dest. - - faligndata %f0, %f4, %f8 ! Move 1st byte to low part of f8 - - stda %f8, [%o1] ASI_FL8_P ! Store 1st byte - inc 1, %o1 ! Update address -2: - membar #Sync -#ifdef _KERNEL - -/* - * Weve saved our possible fpstate, now disable the fpu - * and continue with life. - */ - RESTORE_FPU - ret - restore %g1, 0, %o0 ! Return DEST for memcpy -#endif /* _KERNEL */ - retl - mov %g1, %o0 -#endif /* 1 */ - - -#if 1 -/* - * XXXXXXXXXXXXXXXXXXXX - * We need to make sure that this doesn't use floating point - * before our trap handlers are installed or we could panic - * XXXXXXXXXXXXXXXXXXXX - */ /* * bzero(addr, len) * - * We want to use VIS instructions if we're clearing out more than - * 256 bytes, but to do that we need to properly save and restore the - * FP registers. Unfortunately the code to do that in the kernel needs - * to keep track of the current owner of the FPU, hence the different - * code. - * * XXXXX To produce more efficient code, we do not allow lengths * greater than 0x80000000000000000, which are negative numbers. * This should not really be an issue since the VA hole should @@ -8079,11 +6871,6 @@ Lbzero_internal: sllx %o1, 32, %o3 or %o1, %o3, %o1 1: -#if 0 - !! Now we are 64-bit aligned - cmp %o2, 256 ! Use block clear if len > 256 - bge,pt %xcc, Lbzero_block ! use block store instructions -#endif /* 0 */ deccc 8, %o2 Lbzero_longs: bl,pn %xcc, Lbzero_cleanup ! Less than 8 bytes left @@ -8118,94 +6905,6 @@ Lbzero_done: retl mov %o4, %o0 ! Restore pointer for memset (ugh) -#if 1 -Lbzero_block: - sethi %hi(block_disable), %o3 - ldx [ %o3 + %lo(block_disable) ], %o3 - brnz,pn %o3, Lbzero_longs - !! Make sure our trap table is installed - set _C_LABEL(trapbase), %o5 - rdpr %tba, %o3 - sub %o3, %o5, %o3 - brnz,pn %o3, Lbzero_longs ! No, then don't use block load/store - nop -/* - * Kernel: - * - * Here we use VIS instructions to do a block clear of a page. - * But before we can do that we need to save and enable the FPU. - * The last owner of the FPU registers is fpproc, and - * fpproc->p_md.md_fpstate is the current fpstate. If that's not - * null, call savefpstate() with it to store our current fp state. - * - * Next, allocate an aligned fpstate on the stack. We will properly - * nest calls on a particular stack so this should not be a problem. - * - * Now we grab either curproc (or if we're on the interrupt stack - * proc0). We stash its existing fpstate in a local register and - * put our new fpstate in curproc->p_md.md_fpstate. We point - * fpproc at curproc (or proc0) and enable the FPU. - * - * If we are ever preempted, our FPU state will be saved in our - * fpstate. Then, when we're resumed and we take an FPDISABLED - * trap, the trap handler will be able to fish our FPU state out - * of curproc (or proc0). - * - * On exiting this routine we undo the damage: restore the original - * pointer to curproc->p_md.md_fpstate, clear our fpproc, and disable - * the MMU. - * - */ - - ENABLE_FPU 0 - !! We are now 8-byte aligned. We need to become 64-byte aligned. - btst 63, %i0 - bz,pt %xcc, 2f - nop -1: - stx %i1, [%i0] - inc 8, %i0 - btst 63, %i0 - bnz,pt %xcc, 1b - dec 8, %i2 - -2: - brz %i1, 3f ! Skip the memory op - fzero %f0 ! for bzero - - stx %i1, [%i0] ! Flush this puppy to RAM - membar #StoreLoad - ldd [%i0], %f0 - -3: - fmovd %f0, %f2 ! Duplicate the pattern - fmovd %f0, %f4 - fmovd %f0, %f6 - fmovd %f0, %f8 - fmovd %f0, %f10 - fmovd %f0, %f12 - fmovd %f0, %f14 - - !! Remember: we were 8 bytes too far - dec 56, %i2 ! Go one iteration too far -5: - stda %f0, [%i0] ASI_BLK_P ! Store 64 bytes - deccc BLOCK_SIZE, %i2 - bg,pt %icc, 5b - inc BLOCK_SIZE, %i0 - - membar #Sync -/* - * We've saved our possible fpstate, now disable the fpu - * and continue with life. - */ - RESTORE_FPU - addcc %i2, 56, %i2 ! Restore the count - ba,pt %xcc, Lbzero_longs ! Finish up the remainder - restore -#endif /* 1 */ -#endif /* 1 */ - /* * kcopy() is exactly like bcopy except that it set pcb_onfault such that * when a fault occurs, it is able to return EFAULT to indicate this to the |