author    Miod Vallat <miod@cvs.openbsd.org>  2008-05-21 19:45:39 +0000
committer Miod Vallat <miod@cvs.openbsd.org>  2008-05-21 19:45:39 +0000
commit    42a97c1411943b3eb9382b0861a11c68a8a96aba (patch)
tree      6fbbfa8778719d5a851bf47d04cd9994b3fb81e6
parent    5785052132ea981fa7a37a802ee141a07453a991 (diff)
Not all cache operations need to be run from P2, so don't do this unless
necessary. Also, let the P2 functions return to P1 addresses, instead of jumping to their own P1 image before returning. This gives a ~15% speedup. From NetBSD, thanks uwe@netbsd for spotting this in the sh4 docs!
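
For background on the trick the patch relies on, here is a minimal sketch of the SH-4 address aliasing, assuming the standard memory map (P1 at 0x80000000 is cached, P2 at 0xa0000000 is uncached, and bit 29 selects between the two aliases of the same physical memory). The macro names below are illustrative only; the kernel's own SH3_P1SEG_TO_P2SEG()/SH3_P2SEG_TO_P1SEG() definitions in the sh headers are authoritative.

#include <stdint.h>

/*
 * P1 and P2 map the same physical memory; only the caching policy differs.
 * Setting bit 29 yields the uncached (P2) alias of a cached (P1) address,
 * and clearing it goes back, which is what the 0x20000000 constants in the
 * reworked RUN_P2/RUN_P1 assembly below do.
 */
#define EXAMPLE_P1_TO_P2(va)	((uint32_t)(va) | 0x20000000U)
#define EXAMPLE_P2_TO_P1(va)	((uint32_t)(va) & ~0x20000000U)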
-rw-r--r--  sys/arch/sh/include/cpu.h   | 71
-rw-r--r--  sys/arch/sh/sh/cache_sh4.c  | 32
2 files changed, 73 insertions, 30 deletions
diff --git a/sys/arch/sh/include/cpu.h b/sys/arch/sh/include/cpu.h
index d0d0a7ce0ce..2b8a0d5d579 100644
--- a/sys/arch/sh/include/cpu.h
+++ b/sys/arch/sh/include/cpu.h
@@ -1,4 +1,4 @@
-/* $OpenBSD: cpu.h,v 1.12 2008/02/11 20:44:43 miod Exp $ */
+/* $OpenBSD: cpu.h,v 1.13 2008/05/21 19:45:37 miod Exp $ */
/* $NetBSD: cpu.h,v 1.41 2006/01/21 04:24:12 uwe Exp $ */
/*-
@@ -157,26 +157,65 @@ extern int want_resched; /* need_resched() was called */
#ifdef _KERNEL
#ifndef __lint__
-/* switch from P1 to P2 */
-#define RUN_P2 do { \
- void *p; \
- p = &&P2; \
- goto *(void *)SH3_P1SEG_TO_P2SEG(p); \
- P2: (void)0; \
+/*
+ * Switch from P1 (cached) to P2 (uncached). This used to be written
+ * using gcc's computed goto extension, but gcc4's aggressive optimizations
+ * tend to optimize that away under certain circumstances.
+ */
+#define RUN_P2 \
+ do { \
+ register uint32_t r0 asm("r0"); \
+ uint32_t pc; \
+ __asm volatile( \
+ " mov.l 1f, %1 ;" \
+ " mova 2f, %0 ;" \
+ " or %0, %1 ;" \
+ " jmp @%1 ;" \
+ " nop ;" \
+ " .align 2 ;" \
+ "1: .long 0x20000000;" \
+ "2:;" \
+ : "=r"(r0), "=r"(pc)); \
} while (0)
-/* switch from P2 to P1 */
-#define RUN_P1 do { \
- void *p; \
- p = &&P1; \
- __asm volatile("nop;nop;nop;nop;nop;nop;nop;nop"); \
- goto *(void *)SH3_P2SEG_TO_P1SEG(p); \
- P1: (void)0; \
+/*
+ * Switch from P2 (uncached) back to P1 (cached). We need to be
+ * running on P2 to access cache control, memory-mapped cache and TLB
+ * arrays, etc., and after touching them at least 8 instructions are
+ * necessary before jumping to P1, so provide that padding here.
+ */
+#define RUN_P1 \
+ do { \
+ register uint32_t r0 asm("r0"); \
+ uint32_t pc; \
+ __asm volatile( \
+ /*1*/ " mov.l 1f, %1 ;" \
+ /*2*/ " mova 2f, %0 ;" \
+ /*3*/ " nop ;" \
+ /*4*/ " and %0, %1 ;" \
+ /*5*/ " nop ;" \
+ /*6*/ " nop ;" \
+ /*7*/ " nop ;" \
+ /*8*/ " nop ;" \
+ " jmp @%1 ;" \
+ " nop ;" \
+ " .align 2 ;" \
+ "1: .long ~0x20000000;" \
+ "2:;" \
+ : "=r"(r0), "=r"(pc)); \
} while (0)
+/*
+ * If RUN_P1 is the last thing we do in a function we can omit it, because
+ * we are going to return to a P1 caller anyway, but we still need to
+ * ensure there are at least 8 instructions before the jump back to P1.
+ */
+#define PAD_P1_SWITCH __asm volatile ("nop;nop;nop;nop;nop;nop;nop;nop;")
+
#else /* __lint__ */
-#define RUN_P2 do {} while (/* CONSTCOND */ 0)
-#define RUN_P1 do {} while (/* CONSTCOND */ 0)
+#define RUN_P2 do {} while (/* CONSTCOND */ 0)
+#define RUN_P1 do {} while (/* CONSTCOND */ 0)
+#define PAD_P1_SWITCH do {} while (/* CONSTCOND */ 0)
#endif
#endif
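
To make the cpu.h changes concrete, here is a hypothetical user of the reworked macros (it is not part of the diff): RUN_P2 transfers execution to the current code's uncached P2 alias, and when switching back to P1 would be the last thing the function does, PAD_P1_SWITCH only supplies the 8-instruction delay required after touching the cache arrays, letting the ordinary return carry execution back to the caller's P1 address.

/* Hypothetical illustration, not taken from the diff. */
void
example_cache_op(void)
{
	RUN_P2;			/* continue from the uncached P2 alias of this code */

	/* ... operate on the memory-mapped cache arrays here ... */

	PAD_P1_SWITCH;		/* 8 nops of delay; rts then returns to the P1 caller */
}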
diff --git a/sys/arch/sh/sh/cache_sh4.c b/sys/arch/sh/sh/cache_sh4.c
index 5d4e37c3dba..03962a67cec 100644
--- a/sys/arch/sh/sh/cache_sh4.c
+++ b/sys/arch/sh/sh/cache_sh4.c
@@ -1,4 +1,4 @@
-/* $OpenBSD: cache_sh4.c,v 1.3 2007/03/19 20:12:43 miod Exp $ */
+/* $OpenBSD: cache_sh4.c,v 1.4 2008/05/21 19:45:38 miod Exp $ */
/* $NetBSD: cache_sh4.c,v 1.15 2005/12/24 23:24:02 perry Exp $ */
/*-
@@ -161,6 +161,14 @@ sh4_cache_config(void)
sh_cache_ops._dcache_wb_range = sh4_dcache_wb_range;
switch (cpu_product) {
+ case CPU_PRODUCT_7750:
+ case CPU_PRODUCT_7750S:
+ /* memory-mapped D$ can only be accessed from P2 */
+ sh_cache_ops._dcache_wbinv_all =
+ (void *)SH3_P1SEG_TO_P2SEG(sh4_dcache_wbinv_all);
+ sh_cache_ops._dcache_wbinv_range_index =
+ (void *)SH3_P1SEG_TO_P2SEG(sh4_dcache_wbinv_range_index);
+ break;
case CPU_PRODUCT_7750R:
case CPU_PRODUCT_7751R:
if (!(r & SH4_CCR_EMODE)) {
@@ -222,7 +230,7 @@ sh4_icache_sync_all(void)
cache_sh4_op_8lines_32(va, SH4_CCIA, CCIA_ENTRY_MASK, CCIA_V);
va += 32 * 8;
}
- RUN_P1;
+ PAD_P1_SWITCH;
}
void
@@ -241,7 +249,7 @@ sh4_icache_sync_range(vaddr_t va, vsize_t sz)
_reg_write_4(ccia, va & CCIA_TAGADDR_MASK); /* V = 0 */
va += 32;
}
- RUN_P1;
+ PAD_P1_SWITCH;
}
void
@@ -262,7 +270,7 @@ sh4_icache_sync_range_index(vaddr_t va, vsize_t sz)
cache_sh4_op_line_32(va, SH4_CCIA, CCIA_ENTRY_MASK, CCIA_V);
va += 32;
}
- RUN_P1;
+ PAD_P1_SWITCH;
}
void
@@ -271,13 +279,13 @@ sh4_dcache_wbinv_all(void)
vaddr_t va = 0;
vaddr_t eva = SH4_DCACHE_SIZE;
- RUN_P2;
+ /* RUN_P2; */ /* called via P2 address if necessary */
while (va < eva) {
cache_sh4_op_8lines_32(va, SH4_CCDA, CCDA_ENTRY_MASK,
(CCDA_U | CCDA_V));
va += 32 * 8;
}
- RUN_P1;
+ PAD_P1_SWITCH;
}
void
@@ -298,7 +306,7 @@ sh4_dcache_wbinv_range_index(vaddr_t va, vsize_t sz)
vaddr_t eva = round_line(va + sz);
va = trunc_line(va);
- RUN_P2;
+ /* RUN_P2; */ /* called via P2 address if necessary */
while ((eva - va) >= (8 * 32)) {
cache_sh4_op_8lines_32(va, SH4_CCDA, CCDA_ENTRY_MASK,
(CCDA_U | CCDA_V));
@@ -310,7 +318,7 @@ sh4_dcache_wbinv_range_index(vaddr_t va, vsize_t sz)
(CCDA_U | CCDA_V));
va += 32;
}
- RUN_P1;
+ PAD_P1_SWITCH;
}
void
@@ -412,7 +420,7 @@ sh4_emode_icache_sync_all(void)
CCIA_V, 13);
va += 32 * 8;
}
- RUN_P1;
+ PAD_P1_SWITCH;
}
void
@@ -435,7 +443,7 @@ sh4_emode_icache_sync_range_index(vaddr_t va, vsize_t sz)
CCIA_V, 13);
va += 32;
}
- RUN_P1;
+ PAD_P1_SWITCH;
}
void
@@ -444,13 +452,11 @@ sh4_emode_dcache_wbinv_all(void)
vaddr_t va = 0;
vaddr_t eva = SH4_EMODE_DCACHE_SIZE;
- RUN_P2;
while (va < eva) {
cache_sh4_emode_op_8lines_32(va, SH4_CCDA, CCDA_ENTRY_MASK,
(CCDA_U | CCDA_V), 14);
va += 32 * 8;
}
- RUN_P1;
}
void
@@ -459,7 +465,6 @@ sh4_emode_dcache_wbinv_range_index(vaddr_t va, vsize_t sz)
vaddr_t eva = round_line(va + sz);
va = trunc_line(va);
- RUN_P2;
while ((eva - va) >= (8 * 32)) {
cache_sh4_emode_op_8lines_32(va, SH4_CCDA, CCDA_ENTRY_MASK,
(CCDA_U | CCDA_V), 14);
@@ -471,5 +476,4 @@ sh4_emode_dcache_wbinv_range_index(vaddr_t va, vsize_t sz)
(CCDA_U | CCDA_V), 14);
va += 32;
}
- RUN_P1;
}
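
The new 7750/7750S case in sh4_cache_config() is what lets the write-back routines drop their explicit segment switches: the function pointer stored in sh_cache_ops is translated to its P2 alias, so instruction fetches come from P2, while the return address recorded by the call (in the pr register) is still the caller's P1 address, so simply returning resumes cached execution. A sketch of that idea follows, with illustrative names (the real hooks are sh_cache_ops._dcache_wbinv_all and ._dcache_wbinv_range_index, set above).

#include <stdint.h>

typedef void (*cache_fn_t)(void);

static cache_fn_t dcache_wbinv_hook;	/* illustrative stand-in for sh_cache_ops */

static void
install_p2_hook(cache_fn_t fn)
{
	/* Store the uncached (P2) alias of the routine's address. */
	dcache_wbinv_hook = (cache_fn_t)((uint32_t)fn | 0x20000000U);
}

static void
flush_dcache(void)
{
	/*
	 * Instructions are fetched from P2 during the call, but the return
	 * address is this caller's P1 address, so the callee needs no RUN_P1,
	 * only the 8-instruction PAD_P1_SWITCH delay before it returns.
	 */
	(*dcache_wbinv_hook)();
}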