diff --git a/Documentation/x86_64/boot-options.txt b/Documentation/x86_64/boot-options.txt index a4595b22e092..945311840a10 100644 --- a/Documentation/x86_64/boot-options.txt +++ b/Documentation/x86_64/boot-options.txt @@ -14,9 +14,11 @@ Machine check mce=nobootlog Disable boot machine check logging. mce=tolerancelevel (number) - 0: always panic, 1: panic if deadlock possible, - 2: try to avoid panic, 3: never panic or exit (for testing) - default is 1 + 0: always panic on uncorrected errors, log corrected errors + 1: panic or SIGBUS on uncorrected errors, log corrected errors + 2: SIGBUS or log uncorrected errors, log corrected errors + 3: never panic or SIGBUS, log all errors (for testing only) + Default is 1 Can be also set using sysfs which is preferable. nomce (for compatibility with i386): same as mce=off diff --git a/Documentation/x86_64/machinecheck b/Documentation/x86_64/machinecheck index feaeaf6f6e4d..a05e58e7b159 100644 --- a/Documentation/x86_64/machinecheck +++ b/Documentation/x86_64/machinecheck @@ -49,12 +49,14 @@ tolerant Since machine check exceptions can happen any time it is sometimes risky for the kernel to kill a process because it defies normal kernel locking rules. The tolerance level configures - how hard the kernel tries to recover even at some risk of deadlock. + how hard the kernel tries to recover even at some risk of + deadlock. Higher tolerant values trade potentially better uptime + with the risk of a crash or even corruption (for tolerant >= 3). - 0: always panic, - 1: panic if deadlock possible, - 2: try to avoid panic, - 3: never panic or exit (for testing only) + 0: always panic on uncorrected errors, log corrected errors + 1: panic or SIGBUS on uncorrected errors, log corrected errors + 2: SIGBUS or log uncorrected errors, log corrected errors + 3: never panic or SIGBUS, log all errors (for testing only) Default: 1 diff --git a/arch/x86_64/kernel/mce.c b/arch/x86_64/kernel/mce.c index 968613572b9a..7c8ab423abe3 100644 --- a/arch/x86_64/kernel/mce.c +++ b/arch/x86_64/kernel/mce.c @@ -37,8 +37,13 @@ atomic_t mce_entry; static int mce_dont_init; -/* 0: always panic, 1: panic if deadlock possible, 2: try to avoid panic, - 3: never panic or exit (for testing only) */ +/* + * Tolerant levels: + * 0: always panic on uncorrected errors, log corrected errors + * 1: panic or SIGBUS on uncorrected errors, log corrected errors + * 2: SIGBUS or log uncorrected errors (if possible), log corrected errors + * 3: never panic or SIGBUS, log all errors (for testing only) + */ static int tolerant = 1; static int banks; static unsigned long bank[NR_BANKS] = { [0 ... NR_BANKS-1] = ~0UL }; @@ -132,9 +137,6 @@ static void mce_panic(char *msg, struct mce *backup, unsigned long start) { int i; - if (tolerant >= 3) - return; - oops_begin(); for (i = 0; i < MCE_LOG_LEN; i++) { unsigned long tsc = mcelog.entry[i].tsc; @@ -178,11 +180,19 @@ static inline void mce_get_rip(struct mce *m, struct pt_regs *regs) void do_machine_check(struct pt_regs * regs, long error_code) { struct mce m, panicm; - int nowayout = (tolerant < 1); - int kill_it = 0; u64 mcestart = 0; int i; int panicm_found = 0; + /* + * If no_way_out gets set, there is no safe way to recover from this + * MCE. If tolerant is cranked up, we'll try anyway. + */ + int no_way_out = 0; + /* + * If kill_it gets set, there might be a way to recover from this + * error. + */ + int kill_it = 0; atomic_inc(&mce_entry); @@ -194,8 +204,9 @@ void do_machine_check(struct pt_regs * regs, long error_code) memset(&m, 0, sizeof(struct mce)); m.cpu = smp_processor_id(); rdmsrl(MSR_IA32_MCG_STATUS, m.mcgstatus); + /* if the restart IP is not valid, we're done for */ if (!(m.mcgstatus & MCG_STATUS_RIPV)) - kill_it = 1; + no_way_out = 1; rdtscll(mcestart); barrier(); @@ -214,10 +225,18 @@ void do_machine_check(struct pt_regs * regs, long error_code) continue; if (m.status & MCI_STATUS_EN) { - /* In theory _OVER could be a nowayout too, but - assume any overflowed errors were no fatal. */ - nowayout |= !!(m.status & MCI_STATUS_PCC); - kill_it |= !!(m.status & MCI_STATUS_UC); + /* if PCC was set, there's no way out */ + no_way_out |= !!(m.status & MCI_STATUS_PCC); + /* + * If this error was uncorrectable and there was + * an overflow, we're in trouble. If no overflow, + * we might get away with just killing a task. + */ + if (m.status & MCI_STATUS_UC) { + if (tolerant < 1 || m.status & MCI_STATUS_OVER) + no_way_out = 1; + kill_it = 1; + } } if (m.status & MCI_STATUS_MISCV) @@ -228,7 +247,6 @@ void do_machine_check(struct pt_regs * regs, long error_code) mce_get_rip(&m, regs); if (error_code >= 0) rdtscll(m.tsc); - wrmsrl(MSR_IA32_MC0_STATUS + i*4, 0); if (error_code != -2) mce_log(&m); @@ -251,37 +269,52 @@ void do_machine_check(struct pt_regs * regs, long error_code) the last one (shouldn't happen, just being safe). */ if (!panicm_found) panicm = m; - if (nowayout) + + /* + * If we have decided that we just CAN'T continue, and the user + * has not set tolerant to an insane level, give up and die. + */ + if (no_way_out && tolerant < 3) mce_panic("Machine check", &panicm, mcestart); - if (kill_it) { + + /* + * If the error seems to be unrecoverable, something should be + * done. Try to kill as little as possible. If we can kill just + * one task, do that. If the user has set the tolerance very + * high, don't try to do anything at all. + */ + if (kill_it && tolerant < 3) { int user_space = 0; - if (m.mcgstatus & MCG_STATUS_RIPV) + /* + * If the EIPV bit is set, it means the saved IP is the + * instruction which caused the MCE. + */ + if (m.mcgstatus & MCG_STATUS_EIPV) user_space = panicm.rip && (panicm.cs & 3); - - /* When the machine was in user space and the CPU didn't get - confused it's normally not necessary to panic, unless you - are paranoid (tolerant == 0) - RED-PEN could be more tolerant for MCEs in idle, - but most likely they occur at boot anyways, where - it is best to just halt the machine. */ - if ((!user_space && (panic_on_oops || tolerant < 2)) || - (unsigned)current->pid <= 1) - mce_panic("Uncorrected machine check", &panicm, mcestart); - - /* do_exit takes an awful lot of locks and has as - slight risk of deadlocking. If you don't want that - don't set tolerant >= 2 */ - if (tolerant < 3) + /* + * If we know that the error was in user space, send a + * SIGBUS. Otherwise, panic if tolerance is low. + * + * do_exit() takes an awful lot of locks and has a slight + * risk of deadlocking. + */ + if (user_space) { do_exit(SIGBUS); + } else if (panic_on_oops || tolerant < 2) { + mce_panic("Uncorrected machine check", + &panicm, mcestart); + } } /* notify userspace ASAP */ set_thread_flag(TIF_MCE_NOTIFY); out: - /* Last thing done in the machine check exception to clear state. */ + /* the last thing we do is clear state */ + for (i = 0; i < banks; i++) + wrmsrl(MSR_IA32_MC0_STATUS+4*i, 0); wrmsrl(MSR_IA32_MCG_STATUS, 0); out2: atomic_dec(&mce_entry); @@ -506,7 +539,7 @@ static int mce_open(struct inode *inode, struct file *file) spin_unlock(&mce_state_lock); - return 0; + return nonseekable_open(inode, file); } static int mce_release(struct inode *inode, struct file *file)