From e916ce54b942cbe7e945c96363d09a8d0d8abd4a Mon Sep 17 00:00:00 2001
From: Miod Vallat
Date: Sat, 17 Jan 2009 18:30:09 +0000
Subject: Recognize environmental change machine checks on ES40, and report
 what happens instead of panicking; while there, start providing more details
 for ev6 processor machine checks as well.

This allows power supplies to be unplugged and exchanged while the system is
running, without causing the kernel to crash.
---
 sys/arch/alpha/include/alpha_cpu.h |   3 +-
 sys/arch/alpha/include/logout.h    | 217 ++++++++++++++++++++++++++++++++++++-
 2 files changed, 215 insertions(+), 5 deletions(-)

(limited to 'sys/arch/alpha/include')

diff --git a/sys/arch/alpha/include/alpha_cpu.h b/sys/arch/alpha/include/alpha_cpu.h
index bb3a6fa3f54..2d33fcf9636 100644
--- a/sys/arch/alpha/include/alpha_cpu.h
+++ b/sys/arch/alpha/include/alpha_cpu.h
@@ -1,4 +1,4 @@
-/* $OpenBSD: alpha_cpu.h,v 1.8 2002/11/26 01:35:23 art Exp $ */
+/* $OpenBSD: alpha_cpu.h,v 1.9 2009/01/17 18:30:08 miod Exp $ */
 /* $NetBSD: alpha_cpu.h,v 1.43 2001/12/18 04:18:22 thorpej Exp $ */
 
 /*
@@ -184,6 +184,7 @@ struct alpha_logout_area {
 #define ALPHA_PROC_ERROR 0x630 /* Processor correctable error */
 #define ALPHA_SYS_MCHECK 0x660 /* System machine check */
 #define ALPHA_PROC_MCHECK 0x670 /* Processor machine check */
+#define ALPHA_ENV_MCHECK 0x680 /* Environmental machine check */
 
 /*
  * Virtual Memory Management definitions [OSF/1 PALcode Specific]
diff --git a/sys/arch/alpha/include/logout.h b/sys/arch/alpha/include/logout.h
index e3c0a17cca2..0baa6e42e85 100644
--- a/sys/arch/alpha/include/logout.h
+++ b/sys/arch/alpha/include/logout.h
@@ -1,6 +1,21 @@
-/* $OpenBSD: logout.h,v 1.1 2008/07/24 16:34:25 miod Exp $ */
+/* $OpenBSD: logout.h,v 1.2 2009/01/17 18:30:08 miod Exp $ */
 /* $NetBSD: logout.h,v 1.6 2005/12/11 12:16:16 christos Exp $ */
 
+/*
+ * Copyright (c) 2009 Miodrag Vallat.
+ *
+ * Permission to use, copy, modify, and distribute this software for any
+ * purpose with or without fee is hereby granted, provided that the above
+ * copyright notice and this permission notice appear in all copies.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES
+ * WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF
+ * MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR
+ * ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
+ * WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN
+ * ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF
+ * OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.
+ */
 /*
  * Copyright (c) 1998 by Matthew Jacob
  * NASA AMES Research Center.
@@ -260,7 +275,201 @@ typedef struct {
 	u_int64_t isr; /* Interrupt Status Reg. */
 } mc_cc_ev5;
 
+/*
+ * Information gathered from: AlphaServer ES40 Service Guide
+ */
+
+/*
+ * EV6 Specific OSF/1 Pal Code Exception Logout Area Definitions
+ */
+
+/*
+ * EV6 Specific common logout frame header.
+ * *Almost* identical to the generic logout header listed in alpha_cpu.h.
+ */
+
+typedef struct {
+	unsigned int la_frame_size; /* frame size */
+	unsigned int la_flags; /* flags; see alpha_cpu.h */
+	unsigned int la_cpu_offset; /* offset to CPU area */
+	unsigned int la_system_offset; /* offset to system area */
+	unsigned int mcheck_code; /* machine check code */
+	unsigned int mcheck_rev; /* frame revision */
+#define MC_EV6_FRAME_REVISION 1 /* presumably the mcheck_rev value these layouts describe -- confirm */
+} mc_hdr_ev6;
+
+/*
+ * EV6 Specific Machine Check processor area.
+ */
+
+typedef struct {
+	uint64_t i_stat;
+	uint64_t dc_stat;
+	uint64_t c_addr;
+	uint64_t c_syndrome_0;
+	uint64_t c_syndrome_1;
+	uint64_t c_stat;
+	uint64_t c_sts;
+	uint64_t mm_stat;
+	/* the following fields only exist for uncorrectable errors */
+	uint64_t exc_addr;
+	uint64_t ier_cm;
+	uint64_t isum;
+	uint64_t reserved0;
+	uint64_t pal_base;
+	uint64_t i_ctl;
+	uint64_t pctx;
+	uint64_t reserved1;
+	uint64_t reserved2;
+} mc_cpu_ev6;
+
+/* C_STAT values: enumerated codes under EV6_C_STAT_MASK, not independent bit flags */
+#define EV6_C_STAT_MASK 0x1f
+#define EV6_C_STAT_NO_ERROR 0x00
+#define EV6_C_STAT_SNGL_BC_TAG_PERR 0x01
+#define EV6_C_STAT_SNGL_DC_DUPLICATE_TAG_PERR 0x02
+#define EV6_C_STAT_SNGL_DSTREAM_MEM_ECC_ERR 0x03
+#define EV6_C_STAT_SNGL_DSTREAM_BC_ECC_ERR 0x04
+#define EV6_C_STAT_SNGL_DSTREAM_DC_ECC_ERR 0x05
+#define EV6_C_STAT_SNGL_BC_PROBE_HIT_ERR 0x06
+#define EV6_C_STAT_SNGL_BC_PROBE_HIT_ERR2 0x07
+#define EV6_C_STAT_SNGL_ISTREAM_MEM_ECC_ERR 0x0b
+#define EV6_C_STAT_SNGL_ISTREAM_BC_ECC_ERR 0x0c
+#define EV6_C_STAT_DBL_DSTREAM_MEM_ECC_ERR 0x13
+#define EV6_C_STAT_DBL_DSTREAM_BC_ECC_ERR 0x14
+#define EV6_C_STAT_DBL_ISTREAM_MEM_ECC_ERR 0x1b
+#define EV6_C_STAT_DBL_ISTREAM_BC_ECC_ERR 0x1c
+
+/* C_STS bits */
+#define EV6_C_STS_MASK 0x0f
+#define EV6_C_STS_PARITY 0x08
+#define EV6_C_STS_VALID 0x04
+#define EV6_C_STS_DIRTY 0x02
+#define EV6_C_STS_SHARED 0x01
+
+/* DC_STAT bits */
+#define EV6_DC_STAT_MASK 0x1f
+#define EV6_DC_STAT_PIPELINE_0_ERROR 0x01
+#define EV6_DC_STAT_PIPELINE_1_ERROR 0x02
+#define EV6_DC_STAT_STORE_DATA_ECC_ERROR 0x04
+#define EV6_DC_STAT_LOAD_DATA_ECC_ERROR 0x08
+#define EV6_DC_STAT_STORE_DATA_ECC_ERROR_REPEATED 0x10
+
+/* MM_STAT bits and fields */
+#define EV6_MM_STAT_MASK 0x03ff
+#define EV6_MM_STAT_WRITE 0x0001
+#define EV6_MM_STAT_ACCESS_VIOLATION 0x0002
+#define EV6_MM_STAT_FOR_SET 0x0004
+#define EV6_MM_STAT_FOW_SET 0x0008
+#define EV6_MM_STAT_OPCODE_MASK 0x02f0 /* NOTE(review): selects bits 4-7 and 9, not a contiguous field -- verify */
+#define EV6_MM_STAT_DCACHE_CORRECTABLE_ERROR 0x0300 /* NOTE(review): bits 8-9; shares bit 9 with OPCODE_MASK -- verify */
+
+/*
+ * EV6 Specific Machine Check system area.
+ */
+
+typedef struct {
+	uint64_t flags;
+	uint64_t c_dir;
+	uint64_t c_misc;
+	uint64_t p0_perror;
+	uint64_t p1_perror;
+} mc_sys_ev6;
+
+/*
+ * EV6 Environmental Error logout frame.
+ */
+
+typedef struct {
+	uint64_t flags;
+	uint64_t c_dir;
+	uint64_t smir;
+	uint64_t cpuir;
+	uint64_t psir;
+	uint64_t lm78_isr;
+	uint64_t doors;
+	uint64_t temp_warning;
+	uint64_t fan_control;
+	uint64_t fatal_power_down;
+	uint64_t reserved;
+} mc_env_ev6;
+
+/* SMIR bits */
+#define EV6_ENV_SMIR_RESET 0x80
+#define EV6_ENV_SMIR_PCI1_RESET 0x40
+#define EV6_ENV_SMIR_PCI0_RESET 0x20
+#define EV6_ENV_SMIR_OVERTEMP 0x10
+#define EV6_ENV_SMIR_DC_FAILURE 0x04
+#define EV6_ENV_SMIR_RMC_HALT 0x02
+#define EV6_ENV_SMIR_PSU_FAILURE 0x01
+
+/* CPUIR: one bit per CPU; CPU_FAIL(n) is bit 4+n, CPU_ENABLE(n) is bit n (n counted from 0) */
+#define EV6_ENV_CPUIR_CPU_FAIL(cpuno) (1 << (4 + (cpuno)))
+#define EV6_ENV_CPUIR_CPU_ENABLE(cpuno) (1 << (cpuno))
+
+/* PSIR: one bit per power supply; PSU_FAIL(n) is bit 4+n, PSU_ENABLE(n) is bit n (n counted from 0) */
+#define EV6_ENV_PSIR_PSU_FAIL(psuno) (1 << (4 + (psuno)))
+#define EV6_ENV_PSIR_PSU_ENABLE(psuno) (1 << (psuno))
+
+/* LM78_ISR bits; the PSU number is a two-bit field (mask, then shift right by 40) */
+#define EV6_ENV_LM78_PSU_AC_HIGH_LIMIT 0x0000800000000000
+#define EV6_ENV_LM78_PSU_AC_LOW_LIMIT 0x0000400000000000
+#define EV6_ENV_LM78_PSU_OVERTEMP 0x0000200000000000
+#define EV6_ENV_LM78_PSU_12V_OVERAMP 0x0000100000000000
+#define EV6_ENV_LM78_PSU_5V_OVERAMP 0x0000080000000000
+#define EV6_ENV_LM78_PSU_3_3V_OVERAMP 0x0000040000000000
+#define EV6_ENV_LM78_PSU_NUMBER_MASK 0x0000030000000000
+#define EV6_ENV_LM78_PSU_NUMBER_SHIFT 40
+#define EV6_ENV_LM78_FAN6_FAILURE 0x0000008000000000
+#define EV6_ENV_LM78_FAN3_FAILURE 0x0000004000000000
+#define EV6_ENV_LM78_ZONE2_OVERTEMP 0x0000001000000000
+#define EV6_ENV_LM78_CPU3_VIO_OOT 0x0000000800000000
+#define EV6_ENV_LM78_CPU3_VCORE_OOT 0x0000000400000000
+#define EV6_ENV_LM78_CPU2_VIO_OOT 0x0000000200000000
+#define EV6_ENV_LM78_CPU2_VCORE_OOT 0x0000000100000000
+#define EV6_ENV_LM78_FAN5_FAILURE 0x0000000000800000
+#define EV6_ENV_LM78_FAN4_FAILURE 0x0000000000400000
+#define EV6_ENV_LM78_ZONE1_OVERTEMP 0x0000000000100000
+#define EV6_ENV_LM78_CPU1_VIO_OOT 0x0000000000080000
+#define EV6_ENV_LM78_CPU1_VCORE_OOT 0x0000000000040000
+#define EV6_ENV_LM78_CPU0_VIO_OOT 0x0000000000020000
+#define EV6_ENV_LM78_CPU0_VCORE_OOT 0x0000000000010000
+#define EV6_ENV_LM78_PSU_MINUS12V_OOT 0x0000000000000800
+#define EV6_ENV_LM78_CTERM_OOT 0x0000000000000100
+#define EV6_ENV_LM78_FAN2_FAILURE 0x0000000000000080
+#define EV6_ENV_LM78_FAN1_FAILURE 0x0000000000000040
+#define EV6_ENV_LM78_CPU_OVERTEMP 0x0000000000000020
+#define EV6_ENV_LM78_ZONA0_OVERTEMP 0x0000000000000010 /* sic: presumably "ZONE0"; name kept for existing users */
+#define EV6_ENV_LM78_VTERM_OOT 0x0000000000000008
+#define EV6_ENV_LM78_PSU_12V_OOT 0x0000000000000004
+#define EV6_ENV_LM78_PSU_5V_OOT 0x0000000000000002
+#define EV6_ENV_LM78_PSU_3_3V_OOT 0x0000000000000001
+
+/* Doors */
+#define EV6_ENV_DOORS_PCI_CLOSED 0x80
+#define EV6_ENV_DOORS_FAN_CLOSED 0x40
+#define EV6_ENV_DOORS_CPU_CLOSED 0x20
+#define EV6_ENV_DOORS_PCI_OPEN 0x08
+#define EV6_ENV_DOORS_FAN_OPEN 0x04
+#define EV6_ENV_DOORS_CPU_OPEN 0x02
+
+/* System Temperature Warning (sticky?) */
+#define EV6_ENV_STW_ZONE2 0x40
+#define EV6_ENV_STW_ZONE1 0x20
+#define EV6_ENV_STW_ZONE0 0x10
+#define EV6_ENV_STW_CPU3 0x08
+#define EV6_ENV_STW_CPU2 0x04
+#define EV6_ENV_STW_CPU1 0x02
+#define EV6_ENV_STW_CPU0 0x01
 
-#ifdef _KERNEL
-extern void ev5_logout_print(mc_hdr_ev5 *, mc_uc_ev5 *);
-#endif
+/* System Fan Control Fault */
+#define EV6_ENV_SFCF_FAN1234_LOW_SPEED 0x0800
+#define EV6_ENV_SFCF_FAN1234_HIGH_SPEED 0x0400
+#define EV6_ENV_SFCF_FAN56_LOW_SPEED 0x0200
+#define EV6_ENV_SFCF_FAN56_HIGH_SPEED 0x0100
+#define EV6_ENV_SFCF_FAN6_NONRESPONSIVE 0x0020
+#define EV6_ENV_SFCF_FAN5_NONRESPONSIVE 0x0010
+#define EV6_ENV_SFCF_FAN4_NONRESPONSIVE 0x0008
+#define EV6_ENV_SFCF_FAN3_NONRESPONSIVE 0x0004
+#define EV6_ENV_SFCF_FAN2_NONRESPONSIVE 0x0002
+#define EV6_ENV_SFCF_FAN1_NONRESPONSIVE 0x0001
--
cgit v1.2.3