From bfbe0eeb769e2aff2cb1fc6845c4e4b7eac40bb3 Mon Sep 17 00:00:00 2001 From: Aravind Gopalakrishnan Date: Mon, 25 Jan 2016 20:41:48 +0100 Subject: [PATCH 01/15] x86/mce: Fix order of AMD MCE init function call In mce_amd_feature_init() we take decisions based on mce_flags being set or not. So the feature detection using CPUID should naturally be ordered before we call mce_amd_feature_init(). Fix that here. Signed-off-by: Aravind Gopalakrishnan Signed-off-by: Borislav Petkov Cc: Borislav Petkov Cc: Linus Torvalds Cc: Peter Zijlstra Cc: Thomas Gleixner Cc: Tony Luck Cc: linux-edac Link: http://lkml.kernel.org/r/1453750913-4781-4-git-send-email-bp@alien8.de Signed-off-by: Ingo Molnar --- arch/x86/kernel/cpu/mcheck/mce.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/arch/x86/kernel/cpu/mcheck/mce.c b/arch/x86/kernel/cpu/mcheck/mce.c index a006f4cd792b..b7180801ea33 100644 --- a/arch/x86/kernel/cpu/mcheck/mce.c +++ b/arch/x86/kernel/cpu/mcheck/mce.c @@ -1617,10 +1617,10 @@ static void __mcheck_cpu_init_vendor(struct cpuinfo_x86 *c) case X86_VENDOR_AMD: { u32 ebx = cpuid_ebx(0x80000007); - mce_amd_feature_init(c); mce_flags.overflow_recov = !!(ebx & BIT(0)); mce_flags.succor = !!(ebx & BIT(1)); mce_flags.smca = !!(ebx & BIT(3)); + mce_amd_feature_init(c); break; } From 284b965c146f482b4a411133f62288d52b7e3a72 Mon Sep 17 00:00:00 2001 From: Aravind Gopalakrishnan Date: Mon, 25 Jan 2016 20:41:49 +0100 Subject: [PATCH 02/15] x86/mce/AMD: Do not perform shared bank check for future processors Fam17h and above should not require a check to see if a bank is shared or not. For shared banks, there will always be only one core that has visibility over the MSRs and only that particular core will be allowed to write to the MSRs. Fix the code to return early if we have Scalable MCA support. No change in functionality for earlier processors. Signed-off-by: Aravind Gopalakrishnan Signed-off-by: Fengguang Wu [ Massaged the changelog text, fixed kbuild test robot build warning. ] Signed-off-by: Borislav Petkov Cc: Borislav Petkov Cc: Linus Torvalds Cc: Peter Zijlstra Cc: Thomas Gleixner Cc: Tony Luck Cc: linux-edac Link: http://lkml.kernel.org/r/1453750913-4781-5-git-send-email-bp@alien8.de Signed-off-by: Ingo Molnar --- arch/x86/kernel/cpu/mcheck/mce_amd.c | 7 +++++++ 1 file changed, 7 insertions(+) diff --git a/arch/x86/kernel/cpu/mcheck/mce_amd.c b/arch/x86/kernel/cpu/mcheck/mce_amd.c index e99b15077e94..3068ce25dfa1 100644 --- a/arch/x86/kernel/cpu/mcheck/mce_amd.c +++ b/arch/x86/kernel/cpu/mcheck/mce_amd.c @@ -84,6 +84,13 @@ struct thresh_restart { static inline bool is_shared_bank(int bank) { + /* + * Scalable MCA provides for only one core to have access to the MSRs of + * a shared bank. + */ + if (mce_flags.smca) + return false; + /* Bank 4 is for northbridge reporting and is thus shared */ return (bank == 4); } From 60f116fca162d9488f783f5014779463243ab7a2 Mon Sep 17 00:00:00 2001 From: Aravind Gopalakrishnan Date: Mon, 25 Jan 2016 20:41:50 +0100 Subject: [PATCH 03/15] x86/mce/AMD: Reduce number of blocks scanned per bank From Fam17h onwards, the number of extended MCx_MISC register blocks is reduced to 4. It is an architectural change from what we had on earlier processors. Although theoritically the total number of extended MCx_MISC registers was 8 in earlier processor families, in practice we only had to use the extra registers for MC4. And only 2 of those were used. So this change does not affect older processors. Tested on Fam10h and Fam15h systems. 
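For reference, the per-bank block walk this bound applies to looks roughly like the following, condensed from the existing mce_amd_feature_init() loop (the same logic a later patch in this series moves into get_block_address()); the NR_BLOCKS change only lowers the upper bound of this loop:

for (block = 0; block < NR_BLOCKS; ++block) {	/* now at most 5 blocks per bank */
	if (block == 0)
		address = MSR_IA32_MCx_MISC(bank);
	else if (block == 1) {
		address = (low & MASK_BLKPTR_LO) >> 21;
		if (!address)
			break;			/* no extended MISC blocks */
		address += MCG_XBLK_ADDR;
	} else
		++address;			/* remaining blocks are consecutive MSRs */

	if (rdmsr_safe(address, &low, &high))
		break;
	/* ... threshold setup for this block ... */
}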
Signed-off-by: Aravind Gopalakrishnan Signed-off-by: Borislav Petkov Cc: Borislav Petkov Cc: Linus Torvalds Cc: Peter Zijlstra Cc: Thomas Gleixner Cc: Tony Luck Cc: linux-edac Link: http://lkml.kernel.org/r/1453750913-4781-6-git-send-email-bp@alien8.de Signed-off-by: Ingo Molnar --- arch/x86/kernel/cpu/mcheck/mce_amd.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/arch/x86/kernel/cpu/mcheck/mce_amd.c b/arch/x86/kernel/cpu/mcheck/mce_amd.c index 3068ce25dfa1..5982227990c9 100644 --- a/arch/x86/kernel/cpu/mcheck/mce_amd.c +++ b/arch/x86/kernel/cpu/mcheck/mce_amd.c @@ -28,7 +28,7 @@ #include #include -#define NR_BLOCKS 9 +#define NR_BLOCKS 5 #define THRESHOLD_MAX 0xFFF #define INT_TYPE_APIC 0x00020000 #define MASK_VALID_HI 0x80000000 From f57a1f3c14b9182f1fea667f5a38a1094699db7c Mon Sep 17 00:00:00 2001 From: Aravind Gopalakrishnan Date: Mon, 25 Jan 2016 20:41:51 +0100 Subject: [PATCH 04/15] x86/mce/AMD: Fix LVT offset configuration for thresholding For processor families with the Scalable MCA feature, the LVT offset for threshold interrupts is configured only in MSR 0xC0000410 and not in each per bank MISC register as was done in earlier families. Obtain the LVT offset from the correct MSR for those families. Signed-off-by: Aravind Gopalakrishnan Signed-off-by: Borislav Petkov Cc: Borislav Petkov Cc: Linus Torvalds Cc: Peter Zijlstra Cc: Thomas Gleixner Cc: Tony Luck Cc: linux-edac Link: http://lkml.kernel.org/r/1453750913-4781-7-git-send-email-bp@alien8.de Signed-off-by: Ingo Molnar --- arch/x86/kernel/cpu/mcheck/mce_amd.c | 27 ++++++++++++++++++++++++++- 1 file changed, 26 insertions(+), 1 deletion(-) diff --git a/arch/x86/kernel/cpu/mcheck/mce_amd.c b/arch/x86/kernel/cpu/mcheck/mce_amd.c index 5982227990c9..a77a4521976a 100644 --- a/arch/x86/kernel/cpu/mcheck/mce_amd.c +++ b/arch/x86/kernel/cpu/mcheck/mce_amd.c @@ -49,6 +49,11 @@ #define DEF_LVT_OFF 0x2 #define DEF_INT_TYPE_APIC 0x2 +/* Scalable MCA: */ + +/* Threshold LVT offset is at MSR0xC0000410[15:12] */ +#define SMCA_THR_LVT_OFF 0xF000 + static const char * const th_names[] = { "load_store", "insn_fetch", @@ -142,6 +147,14 @@ static int lvt_off_valid(struct threshold_block *b, int apic, u32 lo, u32 hi) } if (apic != msr) { + /* + * On SMCA CPUs, LVT offset is programmed at a different MSR, and + * the BIOS provides the value. The original field where LVT offset + * was set is reserved. Return early here: + */ + if (mce_flags.smca) + return 0; + pr_err(FW_BUG "cpu %d, invalid threshold interrupt offset %d " "for bank %d, block %d (MSR%08X=0x%x%08x)\n", b->cpu, apic, b->bank, b->block, b->address, hi, lo); @@ -300,7 +313,19 @@ void mce_amd_feature_init(struct cpuinfo_x86 *c) goto init; b.interrupt_enable = 1; - new = (high & MASK_LVTOFF_HI) >> 20; + + if (mce_flags.smca) { + u32 smca_low, smca_high; + + /* Gather LVT offset for thresholding: */ + if (rdmsr_safe(MSR_CU_DEF_ERR, &smca_low, &smca_high)) + break; + + new = (smca_low & SMCA_THR_LVT_OFF) >> 12; + } else { + new = (high & MASK_LVTOFF_HI) >> 20; + } + offset = setup_APIC_mce_threshold(offset, new); if ((offset == new) && From 429893b16d35d309ed6b35136aad5f908a08d9b9 Mon Sep 17 00:00:00 2001 From: Borislav Petkov Date: Mon, 25 Jan 2016 20:41:52 +0100 Subject: [PATCH 05/15] x86/mce/AMD: Carve out threshold block preparation mce_amd_feature_init() was getting pretty fat, carve out the threshold_block setup into a separate function in order to simplify flow and make it more understandable. No functionality change. 
Signed-off-by: Borislav Petkov Cc: Aravind Gopalakrishnan Cc: Borislav Petkov Cc: Linus Torvalds Cc: Peter Zijlstra Cc: Thomas Gleixner Cc: Tony Luck Link: http://lkml.kernel.org/r/1453750913-4781-8-git-send-email-bp@alien8.de Signed-off-by: Ingo Molnar --- arch/x86/kernel/cpu/mcheck/mce_amd.c | 87 ++++++++++++++++------------ 1 file changed, 49 insertions(+), 38 deletions(-) diff --git a/arch/x86/kernel/cpu/mcheck/mce_amd.c b/arch/x86/kernel/cpu/mcheck/mce_amd.c index a77a4521976a..f2860a118b71 100644 --- a/arch/x86/kernel/cpu/mcheck/mce_amd.c +++ b/arch/x86/kernel/cpu/mcheck/mce_amd.c @@ -267,14 +267,59 @@ static void deferred_error_interrupt_enable(struct cpuinfo_x86 *c) wrmsr(MSR_CU_DEF_ERR, low, high); } +static int +prepare_threshold_block(unsigned int bank, unsigned int block, u32 addr, + int offset, u32 misc_high) +{ + unsigned int cpu = smp_processor_id(); + struct threshold_block b; + int new; + + if (!block) + per_cpu(bank_map, cpu) |= (1 << bank); + + memset(&b, 0, sizeof(b)); + b.cpu = cpu; + b.bank = bank; + b.block = block; + b.address = addr; + b.interrupt_capable = lvt_interrupt_supported(bank, misc_high); + + if (!b.interrupt_capable) + goto done; + + b.interrupt_enable = 1; + + if (mce_flags.smca) { + u32 smca_low, smca_high; + + /* Gather LVT offset for thresholding: */ + if (rdmsr_safe(MSR_CU_DEF_ERR, &smca_low, &smca_high)) + goto out; + + new = (smca_low & SMCA_THR_LVT_OFF) >> 12; + } else { + new = (misc_high & MASK_LVTOFF_HI) >> 20; + } + + offset = setup_APIC_mce_threshold(offset, new); + + if ((offset == new) && (mce_threshold_vector != amd_threshold_interrupt)) + mce_threshold_vector = amd_threshold_interrupt; + +done: + mce_threshold_block_init(&b, offset); + +out: + return offset; +} + /* cpu init entry point, called from mce.c with preempt off */ void mce_amd_feature_init(struct cpuinfo_x86 *c) { - struct threshold_block b; - unsigned int cpu = smp_processor_id(); u32 low = 0, high = 0, address = 0; unsigned int bank, block; - int offset = -1, new; + int offset = -1; for (bank = 0; bank < mca_cfg.banks; ++bank) { for (block = 0; block < NR_BLOCKS; ++block) { @@ -299,41 +344,7 @@ void mce_amd_feature_init(struct cpuinfo_x86 *c) (high & MASK_LOCKED_HI)) continue; - if (!block) - per_cpu(bank_map, cpu) |= (1 << bank); - - memset(&b, 0, sizeof(b)); - b.cpu = cpu; - b.bank = bank; - b.block = block; - b.address = address; - b.interrupt_capable = lvt_interrupt_supported(bank, high); - - if (!b.interrupt_capable) - goto init; - - b.interrupt_enable = 1; - - if (mce_flags.smca) { - u32 smca_low, smca_high; - - /* Gather LVT offset for thresholding: */ - if (rdmsr_safe(MSR_CU_DEF_ERR, &smca_low, &smca_high)) - break; - - new = (smca_low & SMCA_THR_LVT_OFF) >> 12; - } else { - new = (high & MASK_LVTOFF_HI) >> 20; - } - - offset = setup_APIC_mce_threshold(offset, new); - - if ((offset == new) && - (mce_threshold_vector != amd_threshold_interrupt)) - mce_threshold_vector = amd_threshold_interrupt; - -init: - mce_threshold_block_init(&b, offset); + offset = prepare_threshold_block(bank, block, address, offset, high); } } From e6c8f1873be8a14c7e44202df1f7e6ea61bf3352 Mon Sep 17 00:00:00 2001 From: Aravind Gopalakrishnan Date: Mon, 25 Jan 2016 20:41:53 +0100 Subject: [PATCH 06/15] x86/mce/AMD: Set MCAX Enable bit It is required for the OS to acknowledge that it is using the MCAX register set and its associated fields by setting the 'McaXEnable' bit in each bank's MCi_CONFIG register. If it is not set, then all UC errors will cause a system panic. 
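For illustration, the acknowledgement boils down to a read-modify-write of each bank's MCx_CONFIG register; this sketch mirrors the hunk added below. Note that SMCA_MCAX_EN_OFF is bit 0 of the high half of the MSR, i.e. MSR bit 32 (McaXEnable), distinct from the McaX bit (MCI_CONFIG_MCAX) that later patches in this series test in the low half:

	u32 smca_addr = MSR_AMD64_SMCA_MCx_CONFIG(bank);
	u32 smca_low, smca_high;

	if (!rdmsr_safe(smca_addr, &smca_low, &smca_high)) {
		smca_high |= SMCA_MCAX_EN_OFF;	/* set McaXEnable (MSR bit 32) */
		wrmsr(smca_addr, smca_low, smca_high);
	}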
Signed-off-by: Aravind Gopalakrishnan Signed-off-by: Borislav Petkov Cc: Borislav Petkov Cc: Linus Torvalds Cc: Peter Zijlstra Cc: Thomas Gleixner Cc: Tony Luck Cc: linux-edac Link: http://lkml.kernel.org/r/1453750913-4781-9-git-send-email-bp@alien8.de Signed-off-by: Ingo Molnar --- arch/x86/include/asm/msr-index.h | 4 ++++ arch/x86/kernel/cpu/mcheck/mce_amd.c | 14 ++++++++++++++ 2 files changed, 18 insertions(+) diff --git a/arch/x86/include/asm/msr-index.h b/arch/x86/include/asm/msr-index.h index b05402ef3b84..552346598dab 100644 --- a/arch/x86/include/asm/msr-index.h +++ b/arch/x86/include/asm/msr-index.h @@ -264,6 +264,10 @@ #define MSR_IA32_MC0_CTL2 0x00000280 #define MSR_IA32_MCx_CTL2(x) (MSR_IA32_MC0_CTL2 + (x)) +/* 'SMCA': AMD64 Scalable MCA */ +#define MSR_AMD64_SMCA_MC0_CONFIG 0xc0002004 +#define MSR_AMD64_SMCA_MCx_CONFIG(x) (MSR_AMD64_SMCA_MC0_CONFIG + 0x10*(x)) + #define MSR_P6_PERFCTR0 0x000000c1 #define MSR_P6_PERFCTR1 0x000000c2 #define MSR_P6_EVNTSEL0 0x00000186 diff --git a/arch/x86/kernel/cpu/mcheck/mce_amd.c b/arch/x86/kernel/cpu/mcheck/mce_amd.c index f2860a118b71..88de27bd5797 100644 --- a/arch/x86/kernel/cpu/mcheck/mce_amd.c +++ b/arch/x86/kernel/cpu/mcheck/mce_amd.c @@ -54,6 +54,14 @@ /* Threshold LVT offset is at MSR0xC0000410[15:12] */ #define SMCA_THR_LVT_OFF 0xF000 +/* + * OS is required to set the MCAX bit to acknowledge that it is now using the + * new MSR ranges and new registers under each bank. It also means that the OS + * will configure deferred errors in the new MCx_CONFIG register. If the bit is + * not set, uncorrectable errors will cause a system panic. + */ +#define SMCA_MCAX_EN_OFF 0x1 + static const char * const th_names[] = { "load_store", "insn_fetch", @@ -292,6 +300,12 @@ prepare_threshold_block(unsigned int bank, unsigned int block, u32 addr, if (mce_flags.smca) { u32 smca_low, smca_high; + u32 smca_addr = MSR_AMD64_SMCA_MCx_CONFIG(bank); + + if (!rdmsr_safe(smca_addr, &smca_low, &smca_high)) { + smca_high |= SMCA_MCAX_EN_OFF; + wrmsr(smca_addr, smca_low, smca_high); + } /* Gather LVT offset for thresholding: */ if (rdmsr_safe(MSR_CU_DEF_ERR, &smca_low, &smca_high)) From 548acf19234dbda5a52d5a8e7e205af46e9da840 Mon Sep 17 00:00:00 2001 From: Tony Luck Date: Wed, 17 Feb 2016 10:20:12 -0800 Subject: [PATCH 07/15] x86/mm: Expand the exception table logic to allow new handling options Huge amounts of help from Andy Lutomirski and Borislav Petkov to produce this. Andy provided the inspiration to add classes to the exception table with a clever bit-squeezing trick, Boris pointed out how much cleaner it would all be if we just had a new field. Linus Torvalds blessed the expansion with: ' I'd rather not be clever in order to save just a tiny amount of space in the exception table, which isn't really criticial for anybody. ' The third field is another relative function pointer, this one to a handler that executes the actions. 
We start out with three handlers: 1: Legacy - just jumps the to fixup IP 2: Fault - provide the trap number in %ax to the fixup code 3: Cleaned up legacy for the uaccess error hack Signed-off-by: Tony Luck Reviewed-by: Borislav Petkov Cc: Linus Torvalds Cc: Peter Zijlstra Cc: Thomas Gleixner Link: http://lkml.kernel.org/r/f6af78fcbd348cf4939875cfda9c19689b5e50b8.1455732970.git.tony.luck@intel.com Signed-off-by: Ingo Molnar --- Documentation/x86/exception-tables.txt | 35 +++++++++ arch/x86/include/asm/asm.h | 40 ++++++---- arch/x86/include/asm/uaccess.h | 16 ++-- arch/x86/kernel/kprobes/core.c | 2 +- arch/x86/kernel/traps.c | 6 +- arch/x86/mm/extable.c | 102 ++++++++++++++++++------- arch/x86/mm/fault.c | 2 +- scripts/sortextable.c | 32 ++++++++ 8 files changed, 177 insertions(+), 58 deletions(-) diff --git a/Documentation/x86/exception-tables.txt b/Documentation/x86/exception-tables.txt index 32901aa36f0a..e396bcd8d830 100644 --- a/Documentation/x86/exception-tables.txt +++ b/Documentation/x86/exception-tables.txt @@ -290,3 +290,38 @@ Due to the way that the exception table is built and needs to be ordered, only use exceptions for code in the .text section. Any other section will cause the exception table to not be sorted correctly, and the exceptions will fail. + +Things changed when 64-bit support was added to x86 Linux. Rather than +double the size of the exception table by expanding the two entries +from 32-bits to 64 bits, a clever trick was used to store addresses +as relative offsets from the table itself. The assembly code changed +from: + .long 1b,3b +to: + .long (from) - . + .long (to) - . + +and the C-code that uses these values converts back to absolute addresses +like this: + + ex_insn_addr(const struct exception_table_entry *x) + { + return (unsigned long)&x->insn + x->insn; + } + +In v4.6 the exception table entry was expanded with a new field "handler". +This is also 32-bits wide and contains a third relative function +pointer which points to one of: + +1) int ex_handler_default(const struct exception_table_entry *fixup) + This is legacy case that just jumps to the fixup code +2) int ex_handler_fault(const struct exception_table_entry *fixup) + This case provides the fault number of the trap that occurred at + entry->insn. It is used to distinguish page faults from machine + check. +3) int ex_handler_ext(const struct exception_table_entry *fixup) + This case is used for uaccess_err ... we need to set a flag + in the task structure. Before the handler functions existed this + case was handled by adding a large offset to the fixup to tag + it as special. +More functions can easily be added. diff --git a/arch/x86/include/asm/asm.h b/arch/x86/include/asm/asm.h index 189679aba703..f5063b6659eb 100644 --- a/arch/x86/include/asm/asm.h +++ b/arch/x86/include/asm/asm.h @@ -44,19 +44,22 @@ /* Exception table entry */ #ifdef __ASSEMBLY__ -# define _ASM_EXTABLE(from,to) \ +# define _ASM_EXTABLE_HANDLE(from, to, handler) \ .pushsection "__ex_table","a" ; \ - .balign 8 ; \ + .balign 4 ; \ .long (from) - . ; \ .long (to) - . ; \ + .long (handler) - . ; \ .popsection -# define _ASM_EXTABLE_EX(from,to) \ - .pushsection "__ex_table","a" ; \ - .balign 8 ; \ - .long (from) - . ; \ - .long (to) - . 
+ 0x7ffffff0 ; \ - .popsection +# define _ASM_EXTABLE(from, to) \ + _ASM_EXTABLE_HANDLE(from, to, ex_handler_default) + +# define _ASM_EXTABLE_FAULT(from, to) \ + _ASM_EXTABLE_HANDLE(from, to, ex_handler_fault) + +# define _ASM_EXTABLE_EX(from, to) \ + _ASM_EXTABLE_HANDLE(from, to, ex_handler_ext) # define _ASM_NOKPROBE(entry) \ .pushsection "_kprobe_blacklist","aw" ; \ @@ -89,19 +92,24 @@ .endm #else -# define _ASM_EXTABLE(from,to) \ +# define _EXPAND_EXTABLE_HANDLE(x) #x +# define _ASM_EXTABLE_HANDLE(from, to, handler) \ " .pushsection \"__ex_table\",\"a\"\n" \ - " .balign 8\n" \ + " .balign 4\n" \ " .long (" #from ") - .\n" \ " .long (" #to ") - .\n" \ + " .long (" _EXPAND_EXTABLE_HANDLE(handler) ") - .\n" \ " .popsection\n" -# define _ASM_EXTABLE_EX(from,to) \ - " .pushsection \"__ex_table\",\"a\"\n" \ - " .balign 8\n" \ - " .long (" #from ") - .\n" \ - " .long (" #to ") - . + 0x7ffffff0\n" \ - " .popsection\n" +# define _ASM_EXTABLE(from, to) \ + _ASM_EXTABLE_HANDLE(from, to, ex_handler_default) + +# define _ASM_EXTABLE_FAULT(from, to) \ + _ASM_EXTABLE_HANDLE(from, to, ex_handler_fault) + +# define _ASM_EXTABLE_EX(from, to) \ + _ASM_EXTABLE_HANDLE(from, to, ex_handler_ext) + /* For C file, we already have NOKPROBE_SYMBOL macro */ #endif diff --git a/arch/x86/include/asm/uaccess.h b/arch/x86/include/asm/uaccess.h index a4a30e4b2d34..c0f27d7ea7ff 100644 --- a/arch/x86/include/asm/uaccess.h +++ b/arch/x86/include/asm/uaccess.h @@ -90,12 +90,11 @@ static inline bool __chk_range_not_ok(unsigned long addr, unsigned long size, un likely(!__range_not_ok(addr, size, user_addr_max())) /* - * The exception table consists of pairs of addresses relative to the - * exception table enty itself: the first is the address of an - * instruction that is allowed to fault, and the second is the address - * at which the program should continue. No registers are modified, - * so it is entirely up to the continuation code to figure out what to - * do. + * The exception table consists of triples of addresses relative to the + * exception table entry itself. The first address is of an instruction + * that is allowed to fault, the second is the target at which the program + * should continue. The third is a handler function to deal with the fault + * caused by the instruction in the first field. * * All the routines below use bits of fixup code that are out of line * with the main instruction path. This means when everything is well, @@ -104,13 +103,14 @@ static inline bool __chk_range_not_ok(unsigned long addr, unsigned long size, un */ struct exception_table_entry { - int insn, fixup; + int insn, fixup, handler; }; /* This is not the generic standard exception_table_entry format */ #define ARCH_HAS_SORT_EXTABLE #define ARCH_HAS_SEARCH_EXTABLE -extern int fixup_exception(struct pt_regs *regs); +extern int fixup_exception(struct pt_regs *regs, int trapnr); +extern bool ex_has_fault_handler(unsigned long ip); extern int early_fixup_exception(unsigned long *ip); /* diff --git a/arch/x86/kernel/kprobes/core.c b/arch/x86/kernel/kprobes/core.c index 1deffe6cc873..0f05deeff5ce 100644 --- a/arch/x86/kernel/kprobes/core.c +++ b/arch/x86/kernel/kprobes/core.c @@ -988,7 +988,7 @@ int kprobe_fault_handler(struct pt_regs *regs, int trapnr) * In case the user-specified fault handler returned * zero, try to fix up. 
*/ - if (fixup_exception(regs)) + if (fixup_exception(regs, trapnr)) return 1; /* diff --git a/arch/x86/kernel/traps.c b/arch/x86/kernel/traps.c index ade185a46b1d..211c11c7bba4 100644 --- a/arch/x86/kernel/traps.c +++ b/arch/x86/kernel/traps.c @@ -199,7 +199,7 @@ do_trap_no_signal(struct task_struct *tsk, int trapnr, char *str, } if (!user_mode(regs)) { - if (!fixup_exception(regs)) { + if (!fixup_exception(regs, trapnr)) { tsk->thread.error_code = error_code; tsk->thread.trap_nr = trapnr; die(str, regs, error_code); @@ -453,7 +453,7 @@ do_general_protection(struct pt_regs *regs, long error_code) tsk = current; if (!user_mode(regs)) { - if (fixup_exception(regs)) + if (fixup_exception(regs, X86_TRAP_GP)) return; tsk->thread.error_code = error_code; @@ -699,7 +699,7 @@ static void math_error(struct pt_regs *regs, int error_code, int trapnr) conditional_sti(regs); if (!user_mode(regs)) { - if (!fixup_exception(regs)) { + if (!fixup_exception(regs, trapnr)) { task->thread.error_code = error_code; task->thread.trap_nr = trapnr; die(str, regs, error_code); diff --git a/arch/x86/mm/extable.c b/arch/x86/mm/extable.c index 903ec1e9c326..9dd7e4b7fcde 100644 --- a/arch/x86/mm/extable.c +++ b/arch/x86/mm/extable.c @@ -3,6 +3,9 @@ #include #include +typedef bool (*ex_handler_t)(const struct exception_table_entry *, + struct pt_regs *, int); + static inline unsigned long ex_insn_addr(const struct exception_table_entry *x) { @@ -13,11 +16,56 @@ ex_fixup_addr(const struct exception_table_entry *x) { return (unsigned long)&x->fixup + x->fixup; } - -int fixup_exception(struct pt_regs *regs) +static inline ex_handler_t +ex_fixup_handler(const struct exception_table_entry *x) { - const struct exception_table_entry *fixup; - unsigned long new_ip; + return (ex_handler_t)((unsigned long)&x->handler + x->handler); +} + +bool ex_handler_default(const struct exception_table_entry *fixup, + struct pt_regs *regs, int trapnr) +{ + regs->ip = ex_fixup_addr(fixup); + return true; +} +EXPORT_SYMBOL(ex_handler_default); + +bool ex_handler_fault(const struct exception_table_entry *fixup, + struct pt_regs *regs, int trapnr) +{ + regs->ip = ex_fixup_addr(fixup); + regs->ax = trapnr; + return true; +} +EXPORT_SYMBOL_GPL(ex_handler_fault); + +bool ex_handler_ext(const struct exception_table_entry *fixup, + struct pt_regs *regs, int trapnr) +{ + /* Special hack for uaccess_err */ + current_thread_info()->uaccess_err = 1; + regs->ip = ex_fixup_addr(fixup); + return true; +} +EXPORT_SYMBOL(ex_handler_ext); + +bool ex_has_fault_handler(unsigned long ip) +{ + const struct exception_table_entry *e; + ex_handler_t handler; + + e = search_exception_tables(ip); + if (!e) + return false; + handler = ex_fixup_handler(e); + + return handler == ex_handler_fault; +} + +int fixup_exception(struct pt_regs *regs, int trapnr) +{ + const struct exception_table_entry *e; + ex_handler_t handler; #ifdef CONFIG_PNPBIOS if (unlikely(SEGMENT_IS_PNP_CODE(regs->cs))) { @@ -33,42 +81,34 @@ int fixup_exception(struct pt_regs *regs) } #endif - fixup = search_exception_tables(regs->ip); - if (fixup) { - new_ip = ex_fixup_addr(fixup); + e = search_exception_tables(regs->ip); + if (!e) + return 0; - if (fixup->fixup - fixup->insn >= 0x7ffffff0 - 4) { - /* Special hack for uaccess_err */ - current_thread_info()->uaccess_err = 1; - new_ip -= 0x7ffffff0; - } - regs->ip = new_ip; - return 1; - } - - return 0; + handler = ex_fixup_handler(e); + return handler(e, regs, trapnr); } /* Restricted version used during very early boot */ int __init 
early_fixup_exception(unsigned long *ip) { - const struct exception_table_entry *fixup; + const struct exception_table_entry *e; unsigned long new_ip; + ex_handler_t handler; - fixup = search_exception_tables(*ip); - if (fixup) { - new_ip = ex_fixup_addr(fixup); + e = search_exception_tables(*ip); + if (!e) + return 0; - if (fixup->fixup - fixup->insn >= 0x7ffffff0 - 4) { - /* uaccess handling not supported during early boot */ - return 0; - } + new_ip = ex_fixup_addr(e); + handler = ex_fixup_handler(e); - *ip = new_ip; - return 1; - } + /* special handling not supported during early boot */ + if (handler != ex_handler_default) + return 0; - return 0; + *ip = new_ip; + return 1; } /* @@ -133,6 +173,8 @@ void sort_extable(struct exception_table_entry *start, i += 4; p->fixup += i; i += 4; + p->handler += i; + i += 4; } sort(start, finish - start, sizeof(struct exception_table_entry), @@ -145,6 +187,8 @@ void sort_extable(struct exception_table_entry *start, i += 4; p->fixup -= i; i += 4; + p->handler -= i; + i += 4; } } diff --git a/arch/x86/mm/fault.c b/arch/x86/mm/fault.c index eef44d9a3f77..495946c3f9dd 100644 --- a/arch/x86/mm/fault.c +++ b/arch/x86/mm/fault.c @@ -656,7 +656,7 @@ no_context(struct pt_regs *regs, unsigned long error_code, int sig; /* Are we prepared to handle this kernel fault? */ - if (fixup_exception(regs)) { + if (fixup_exception(regs, X86_TRAP_PF)) { /* * Any interrupt that takes a fault gets the fixup. This makes * the below recursive fault logic only apply to a faults from diff --git a/scripts/sortextable.c b/scripts/sortextable.c index c2423d913b46..7b29fb14f870 100644 --- a/scripts/sortextable.c +++ b/scripts/sortextable.c @@ -209,6 +209,35 @@ static int compare_relative_table(const void *a, const void *b) return 0; } +static void x86_sort_relative_table(char *extab_image, int image_size) +{ + int i; + + i = 0; + while (i < image_size) { + uint32_t *loc = (uint32_t *)(extab_image + i); + + w(r(loc) + i, loc); + w(r(loc + 1) + i + 4, loc + 1); + w(r(loc + 2) + i + 8, loc + 2); + + i += sizeof(uint32_t) * 3; + } + + qsort(extab_image, image_size / 12, 12, compare_relative_table); + + i = 0; + while (i < image_size) { + uint32_t *loc = (uint32_t *)(extab_image + i); + + w(r(loc) - i, loc); + w(r(loc + 1) - (i + 4), loc + 1); + w(r(loc + 2) - (i + 8), loc + 2); + + i += sizeof(uint32_t) * 3; + } +} + static void sort_relative_table(char *extab_image, int image_size) { int i; @@ -281,6 +310,9 @@ do_file(char const *const fname) break; case EM_386: case EM_X86_64: + custom_sort = x86_sort_relative_table; + break; + case EM_S390: custom_sort = sort_relative_table; break; From b2f9d678e28ca71ce650eac82f26dd287b47e89a Mon Sep 17 00:00:00 2001 From: Tony Luck Date: Wed, 17 Feb 2016 10:20:13 -0800 Subject: [PATCH 08/15] x86/mce: Check for faults tagged in EXTABLE_CLASS_FAULT exception table entries Extend the severity checking code to add a new context IN_KERN_RECOV which is used to indicate that the machine check was triggered by code in the kernel tagged with _ASM_EXTABLE_FAULT() so that the ex_handler_fault() handler will provide the fixup code with the trap number. Major re-work to the tail code in do_machine_check() to make all this readable/maintainable. One functional change is that tolerant=3 no longer stops recovery actions. Revert to only skipping sending SIGBUS to the current process. 
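As an illustration of what "tagged with _ASM_EXTABLE_FAULT()" means in practice, here is a minimal, hypothetical sketch (not part of this series; the function and operand names are made up) of a kernel load that becomes recoverable. Per ex_handler_fault(), the fixup code is entered with the trap number in %eax:

static int read_u64_mcsafe(const u64 *src, u64 *val)
{
	int trapnr = 0;

	asm volatile(
		"1:	movq (%[src]), %[out]\n"
		"2:\n"
		"	.section .fixup, \"ax\"\n"
		"3:	movl %%eax, %[trap]\n"	/* trap number written by ex_handler_fault() */
		"	jmp 2b\n"
		"	.previous\n"
		_ASM_EXTABLE_FAULT(1b, 3b)
		: [out] "=r" (*val), [trap] "+r" (trapnr)
		: [src] "r" (src)
		: "rax", "memory");

	/* 0 on success; X86_TRAP_MC if a machine check hit the load at 1:
	 * (*val is undefined in that case)
	 */
	return trapnr;
}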
Signed-off-by: Tony Luck Reviewed-by: Borislav Petkov Cc: Linus Torvalds Cc: Peter Zijlstra Cc: Thomas Gleixner Link: http://lkml.kernel.org/r/89d243d05a7943bb187d1074bb30d9c4f482d5f5.1455732970.git.tony.luck@intel.com Signed-off-by: Ingo Molnar --- arch/x86/kernel/cpu/mcheck/mce-severity.c | 22 ++++++- arch/x86/kernel/cpu/mcheck/mce.c | 70 ++++++++++++----------- 2 files changed, 56 insertions(+), 36 deletions(-) diff --git a/arch/x86/kernel/cpu/mcheck/mce-severity.c b/arch/x86/kernel/cpu/mcheck/mce-severity.c index 9c682c222071..5119766d9889 100644 --- a/arch/x86/kernel/cpu/mcheck/mce-severity.c +++ b/arch/x86/kernel/cpu/mcheck/mce-severity.c @@ -14,6 +14,7 @@ #include #include #include +#include #include "mce-internal.h" @@ -29,7 +30,7 @@ * panic situations) */ -enum context { IN_KERNEL = 1, IN_USER = 2 }; +enum context { IN_KERNEL = 1, IN_USER = 2, IN_KERNEL_RECOV = 3 }; enum ser { SER_REQUIRED = 1, NO_SER = 2 }; enum exception { EXCP_CONTEXT = 1, NO_EXCP = 2 }; @@ -48,6 +49,7 @@ static struct severity { #define MCESEV(s, m, c...) { .sev = MCE_ ## s ## _SEVERITY, .msg = m, ## c } #define KERNEL .context = IN_KERNEL #define USER .context = IN_USER +#define KERNEL_RECOV .context = IN_KERNEL_RECOV #define SER .ser = SER_REQUIRED #define NOSER .ser = NO_SER #define EXCP .excp = EXCP_CONTEXT @@ -86,6 +88,10 @@ static struct severity { PANIC, "In kernel and no restart IP", EXCP, KERNEL, MCGMASK(MCG_STATUS_RIPV, 0) ), + MCESEV( + PANIC, "In kernel and no restart IP", + EXCP, KERNEL_RECOV, MCGMASK(MCG_STATUS_RIPV, 0) + ), MCESEV( DEFERRED, "Deferred error", NOSER, MASK(MCI_STATUS_UC|MCI_STATUS_DEFERRED|MCI_STATUS_POISON, MCI_STATUS_DEFERRED) @@ -122,6 +128,11 @@ static struct severity { SER, MASK(MCI_STATUS_OVER|MCI_UC_SAR|MCI_ADDR, MCI_UC_SAR|MCI_ADDR), MCGMASK(MCG_STATUS_RIPV|MCG_STATUS_EIPV, MCG_STATUS_RIPV) ), + MCESEV( + AR, "Action required: data load in error recoverable area of kernel", + SER, MASK(MCI_STATUS_OVER|MCI_UC_SAR|MCI_ADDR|MCACOD, MCI_UC_SAR|MCI_ADDR|MCACOD_DATA), + KERNEL_RECOV + ), MCESEV( AR, "Action required: data load error in a user process", SER, MASK(MCI_STATUS_OVER|MCI_UC_SAR|MCI_ADDR|MCACOD, MCI_UC_SAR|MCI_ADDR|MCACOD_DATA), @@ -170,6 +181,9 @@ static struct severity { ) /* always matches. keep at end */ }; +#define mc_recoverable(mcg) (((mcg) & (MCG_STATUS_RIPV|MCG_STATUS_EIPV)) == \ + (MCG_STATUS_RIPV|MCG_STATUS_EIPV)) + /* * If mcgstatus indicated that ip/cs on the stack were * no good, then "m->cs" will be zero and we will have @@ -183,7 +197,11 @@ static struct severity { */ static int error_context(struct mce *m) { - return ((m->cs & 3) == 3) ? IN_USER : IN_KERNEL; + if ((m->cs & 3) == 3) + return IN_USER; + if (mc_recoverable(m->mcgstatus) && ex_has_fault_handler(m->ip)) + return IN_KERNEL_RECOV; + return IN_KERNEL; } /* diff --git a/arch/x86/kernel/cpu/mcheck/mce.c b/arch/x86/kernel/cpu/mcheck/mce.c index b7180801ea33..524f2a8492d7 100644 --- a/arch/x86/kernel/cpu/mcheck/mce.c +++ b/arch/x86/kernel/cpu/mcheck/mce.c @@ -961,6 +961,20 @@ static void mce_clear_state(unsigned long *toclear) } } +static int do_memory_failure(struct mce *m) +{ + int flags = MF_ACTION_REQUIRED; + int ret; + + pr_err("Uncorrected hardware memory error in user-access at %llx", m->addr); + if (!(m->mcgstatus & MCG_STATUS_RIPV)) + flags |= MF_MUST_KILL; + ret = memory_failure(m->addr >> PAGE_SHIFT, MCE_VECTOR, flags); + if (ret) + pr_err("Memory error not recovered"); + return ret; +} + /* * The actual machine check handler. 
This only handles real * exceptions when something got corrupted coming in through int 18. @@ -998,8 +1012,6 @@ void do_machine_check(struct pt_regs *regs, long error_code) DECLARE_BITMAP(toclear, MAX_NR_BANKS); DECLARE_BITMAP(valid_banks, MAX_NR_BANKS); char *msg = "Unknown"; - u64 recover_paddr = ~0ull; - int flags = MF_ACTION_REQUIRED; int lmce = 0; /* If this CPU is offline, just bail out. */ @@ -1136,22 +1148,13 @@ void do_machine_check(struct pt_regs *regs, long error_code) } /* - * At insane "tolerant" levels we take no action. Otherwise - * we only die if we have no other choice. For less serious - * issues we try to recover, or limit damage to the current - * process. + * If tolerant is at an insane level we drop requests to kill + * processes and continue even when there is no way out. */ - if (cfg->tolerant < 3) { - if (no_way_out) - mce_panic("Fatal machine check on current CPU", &m, msg); - if (worst == MCE_AR_SEVERITY) { - recover_paddr = m.addr; - if (!(m.mcgstatus & MCG_STATUS_RIPV)) - flags |= MF_MUST_KILL; - } else if (kill_it) { - force_sig(SIGBUS, current); - } - } + if (cfg->tolerant == 3) + kill_it = 0; + else if (no_way_out) + mce_panic("Fatal machine check on current CPU", &m, msg); if (worst > 0) mce_report_event(regs); @@ -1159,25 +1162,24 @@ void do_machine_check(struct pt_regs *regs, long error_code) out: sync_core(); - if (recover_paddr == ~0ull) - goto done; + if (worst != MCE_AR_SEVERITY && !kill_it) + goto out_ist; - pr_err("Uncorrected hardware memory error in user-access at %llx", - recover_paddr); - /* - * We must call memory_failure() here even if the current process is - * doomed. We still need to mark the page as poisoned and alert any - * other users of the page. - */ - ist_begin_non_atomic(regs); - local_irq_enable(); - if (memory_failure(recover_paddr >> PAGE_SHIFT, MCE_VECTOR, flags) < 0) { - pr_err("Memory error not recovered"); - force_sig(SIGBUS, current); + /* Fault was in user mode and we need to take some action */ + if ((m.cs & 3) == 3) { + ist_begin_non_atomic(regs); + local_irq_enable(); + + if (kill_it || do_memory_failure(&m)) + force_sig(SIGBUS, current); + local_irq_disable(); + ist_end_non_atomic(); + } else { + if (!fixup_exception(regs, X86_TRAP_MC)) + mce_panic("Failed kernel mode recovery", &m, NULL); } - local_irq_disable(); - ist_end_non_atomic(); -done: + +out_ist: ist_exit(regs); } EXPORT_SYMBOL_GPL(do_machine_check); From adc53f2e0ae2fcff10a4b981df14729ffb1482fc Mon Sep 17 00:00:00 2001 From: Aravind Gopalakrishnan Date: Mon, 7 Mar 2016 14:02:17 +0100 Subject: [PATCH 09/15] x86/mce: Move MCx_CONFIG MSR definitions Those MSRs are used only by the MCE code so move them there. 
Signed-off-by: Aravind Gopalakrishnan Signed-off-by: Borislav Petkov Cc: Borislav Petkov Cc: Linus Torvalds Cc: Peter Zijlstra Cc: Thomas Gleixner Cc: Tony Luck Cc: linux-edac Link: http://lkml.kernel.org/r/1456785179-14378-2-git-send-email-Aravind.Gopalakrishnan@amd.com Signed-off-by: Ingo Molnar --- arch/x86/include/asm/mce.h | 4 ++++ arch/x86/include/asm/msr-index.h | 4 ---- 2 files changed, 4 insertions(+), 4 deletions(-) diff --git a/arch/x86/include/asm/mce.h b/arch/x86/include/asm/mce.h index 2ea4527e462f..80ba0a8d6d06 100644 --- a/arch/x86/include/asm/mce.h +++ b/arch/x86/include/asm/mce.h @@ -91,6 +91,10 @@ #define MCE_LOG_LEN 32 #define MCE_LOG_SIGNATURE "MACHINECHECK" +/* AMD Scalable MCA */ +#define MSR_AMD64_SMCA_MC0_CONFIG 0xc0002004 +#define MSR_AMD64_SMCA_MCx_CONFIG(x) (MSR_AMD64_SMCA_MC0_CONFIG + 0x10*(x)) + /* * This structure contains all data related to the MCE log. Also * carries a signature to make it easier to find from external diff --git a/arch/x86/include/asm/msr-index.h b/arch/x86/include/asm/msr-index.h index 552346598dab..b05402ef3b84 100644 --- a/arch/x86/include/asm/msr-index.h +++ b/arch/x86/include/asm/msr-index.h @@ -264,10 +264,6 @@ #define MSR_IA32_MC0_CTL2 0x00000280 #define MSR_IA32_MCx_CTL2(x) (MSR_IA32_MC0_CTL2 + (x)) -/* 'SMCA': AMD64 Scalable MCA */ -#define MSR_AMD64_SMCA_MC0_CONFIG 0xc0002004 -#define MSR_AMD64_SMCA_MCx_CONFIG(x) (MSR_AMD64_SMCA_MC0_CONFIG + 0x10*(x)) - #define MSR_P6_PERFCTR0 0x000000c1 #define MSR_P6_PERFCTR1 0x000000c2 #define MSR_P6_EVNTSEL0 0x00000186 From be0aec23bf4624fd55650629fe8df20483487049 Mon Sep 17 00:00:00 2001 From: Aravind Gopalakrishnan Date: Mon, 7 Mar 2016 14:02:18 +0100 Subject: [PATCH 10/15] x86/mce/AMD, EDAC: Enable error decoding of Scalable MCA errors For Scalable MCA enabled processors, errors are listed per IP block. And since it is not required for an IP to map to a particular bank, we need to use HWID and McaType values from the MCx_IPID register to figure out which IP a given bank represents. We also have a new bit (TCC) in the MCx_STATUS register to indicate Task context is corrupt. Add logic here to decode errors from all known IP blocks for Fam17h Model 00-0fh and to print TCC errors. [ Minor fixups. ] Signed-off-by: Aravind Gopalakrishnan Signed-off-by: Borislav Petkov Cc: Borislav Petkov Cc: Linus Torvalds Cc: Peter Zijlstra Cc: Thomas Gleixner Cc: Tony Luck Cc: linux-edac Link: http://lkml.kernel.org/r/1457021458-2522-3-git-send-email-Aravind.Gopalakrishnan@amd.com Signed-off-by: Ingo Molnar --- arch/x86/include/asm/mce.h | 59 +++++ arch/x86/kernel/cpu/mcheck/mce_amd.c | 29 +++ drivers/edac/mce_amd.c | 335 ++++++++++++++++++++++++++- 3 files changed, 420 insertions(+), 3 deletions(-) diff --git a/arch/x86/include/asm/mce.h b/arch/x86/include/asm/mce.h index 80ba0a8d6d06..9c467fe00551 100644 --- a/arch/x86/include/asm/mce.h +++ b/arch/x86/include/asm/mce.h @@ -42,6 +42,18 @@ /* AMD-specific bits */ #define MCI_STATUS_DEFERRED (1ULL<<44) /* declare an uncorrected error */ #define MCI_STATUS_POISON (1ULL<<43) /* access poisonous data */ +#define MCI_STATUS_TCC (1ULL<<55) /* Task context corrupt */ + +/* + * McaX field if set indicates a given bank supports MCA extensions: + * - Deferred error interrupt type is specifiable by bank. + * - MCx_MISC0[BlkPtr] field indicates presence of extended MISC registers, + * But should not be used to determine MSR numbers. + * - TCC bit is present in MCx_STATUS. 
+ */ +#define MCI_CONFIG_MCAX 0x1 +#define MCI_IPID_MCATYPE 0xFFFF0000 +#define MCI_IPID_HWID 0xFFF /* * Note that the full MCACOD field of IA32_MCi_STATUS MSR is @@ -93,7 +105,9 @@ /* AMD Scalable MCA */ #define MSR_AMD64_SMCA_MC0_CONFIG 0xc0002004 +#define MSR_AMD64_SMCA_MC0_IPID 0xc0002005 #define MSR_AMD64_SMCA_MCx_CONFIG(x) (MSR_AMD64_SMCA_MC0_CONFIG + 0x10*(x)) +#define MSR_AMD64_SMCA_MCx_IPID(x) (MSR_AMD64_SMCA_MC0_IPID + 0x10*(x)) /* * This structure contains all data related to the MCE log. Also @@ -291,4 +305,49 @@ struct cper_sec_mem_err; extern void apei_mce_report_mem_error(int corrected, struct cper_sec_mem_err *mem_err); +/* + * Enumerate new IP types and HWID values in AMD processors which support + * Scalable MCA. + */ +#ifdef CONFIG_X86_MCE_AMD +enum amd_ip_types { + SMCA_F17H_CORE = 0, /* Core errors */ + SMCA_DF, /* Data Fabric */ + SMCA_UMC, /* Unified Memory Controller */ + SMCA_PB, /* Parameter Block */ + SMCA_PSP, /* Platform Security Processor */ + SMCA_SMU, /* System Management Unit */ + N_AMD_IP_TYPES +}; + +struct amd_hwid { + const char *name; + unsigned int hwid; +}; + +extern struct amd_hwid amd_hwids[N_AMD_IP_TYPES]; + +enum amd_core_mca_blocks { + SMCA_LS = 0, /* Load Store */ + SMCA_IF, /* Instruction Fetch */ + SMCA_L2_CACHE, /* L2 cache */ + SMCA_DE, /* Decoder unit */ + RES, /* Reserved */ + SMCA_EX, /* Execution unit */ + SMCA_FP, /* Floating Point */ + SMCA_L3_CACHE, /* L3 cache */ + N_CORE_MCA_BLOCKS +}; + +extern const char * const amd_core_mcablock_names[N_CORE_MCA_BLOCKS]; + +enum amd_df_mca_blocks { + SMCA_CS = 0, /* Coherent Slave */ + SMCA_PIE, /* Power management, Interrupts, etc */ + N_DF_BLOCKS +}; + +extern const char * const amd_df_mcablock_names[N_DF_BLOCKS]; +#endif + #endif /* _ASM_X86_MCE_H */ diff --git a/arch/x86/kernel/cpu/mcheck/mce_amd.c b/arch/x86/kernel/cpu/mcheck/mce_amd.c index 88de27bd5797..ee487a93ebe7 100644 --- a/arch/x86/kernel/cpu/mcheck/mce_amd.c +++ b/arch/x86/kernel/cpu/mcheck/mce_amd.c @@ -71,6 +71,35 @@ static const char * const th_names[] = { "execution_unit", }; +/* Define HWID to IP type mappings for Scalable MCA */ +struct amd_hwid amd_hwids[] = { + [SMCA_F17H_CORE] = { "f17h_core", 0xB0 }, + [SMCA_DF] = { "data_fabric", 0x2E }, + [SMCA_UMC] = { "umc", 0x96 }, + [SMCA_PB] = { "param_block", 0x5 }, + [SMCA_PSP] = { "psp", 0xFF }, + [SMCA_SMU] = { "smu", 0x1 }, +}; +EXPORT_SYMBOL_GPL(amd_hwids); + +const char * const amd_core_mcablock_names[] = { + [SMCA_LS] = "load_store", + [SMCA_IF] = "insn_fetch", + [SMCA_L2_CACHE] = "l2_cache", + [SMCA_DE] = "decode_unit", + [RES] = "", + [SMCA_EX] = "execution_unit", + [SMCA_FP] = "floating_point", + [SMCA_L3_CACHE] = "l3_cache", +}; +EXPORT_SYMBOL_GPL(amd_core_mcablock_names); + +const char * const amd_df_mcablock_names[] = { + [SMCA_CS] = "coherent_slave", + [SMCA_PIE] = "pie", +}; +EXPORT_SYMBOL_GPL(amd_df_mcablock_names); + static DEFINE_PER_CPU(struct threshold_bank **, threshold_banks); static DEFINE_PER_CPU(unsigned char, bank_map); /* see which banks are on */ diff --git a/drivers/edac/mce_amd.c b/drivers/edac/mce_amd.c index e3a945ce374b..49768c08ac07 100644 --- a/drivers/edac/mce_amd.c +++ b/drivers/edac/mce_amd.c @@ -147,6 +147,135 @@ static const char * const mc6_mce_desc[] = { "Status Register File", }; +/* Scalable MCA error strings */ +static const char * const f17h_ls_mce_desc[] = { + "Load queue parity", + "Store queue parity", + "Miss address buffer payload parity", + "L1 TLB parity", + "", /* reserved */ + "DC tag error type 6", + "DC tag error type 1", + 
"Internal error type 1", + "Internal error type 2", + "Sys Read data error thread 0", + "Sys read data error thread 1", + "DC tag error type 2", + "DC data error type 1 (poison comsumption)", + "DC data error type 2", + "DC data error type 3", + "DC tag error type 4", + "L2 TLB parity", + "PDC parity error", + "DC tag error type 3", + "DC tag error type 5", + "L2 fill data error", +}; + +static const char * const f17h_if_mce_desc[] = { + "microtag probe port parity error", + "IC microtag or full tag multi-hit error", + "IC full tag parity", + "IC data array parity", + "Decoupling queue phys addr parity error", + "L0 ITLB parity error", + "L1 ITLB parity error", + "L2 ITLB parity error", + "BPQ snoop parity on Thread 0", + "BPQ snoop parity on Thread 1", + "L1 BTB multi-match error", + "L2 BTB multi-match error", +}; + +static const char * const f17h_l2_mce_desc[] = { + "L2M tag multi-way-hit error", + "L2M tag ECC error", + "L2M data ECC error", + "HW assert", +}; + +static const char * const f17h_de_mce_desc[] = { + "uop cache tag parity error", + "uop cache data parity error", + "Insn buffer parity error", + "Insn dispatch queue parity error", + "Fetch address FIFO parity", + "Patch RAM data parity", + "Patch RAM sequencer parity", + "uop buffer parity" +}; + +static const char * const f17h_ex_mce_desc[] = { + "Watchdog timeout error", + "Phy register file parity", + "Flag register file parity", + "Immediate displacement register file parity", + "Address generator payload parity", + "EX payload parity", + "Checkpoint queue parity", + "Retire dispatch queue parity", +}; + +static const char * const f17h_fp_mce_desc[] = { + "Physical register file parity", + "Freelist parity error", + "Schedule queue parity", + "NSQ parity error", + "Retire queue parity", + "Status register file parity", +}; + +static const char * const f17h_l3_mce_desc[] = { + "Shadow tag macro ECC error", + "Shadow tag macro multi-way-hit error", + "L3M tag ECC error", + "L3M tag multi-way-hit error", + "L3M data ECC error", + "XI parity, L3 fill done channel error", + "L3 victim queue parity", + "L3 HW assert", +}; + +static const char * const f17h_cs_mce_desc[] = { + "Illegal request from transport layer", + "Address violation", + "Security violation", + "Illegal response from transport layer", + "Unexpected response", + "Parity error on incoming request or probe response data", + "Parity error on incoming read response data", + "Atomic request parity", + "ECC error on probe filter access", +}; + +static const char * const f17h_pie_mce_desc[] = { + "HW assert", + "Internal PIE register security violation", + "Error on GMI link", + "Poison data written to internal PIE register", +}; + +static const char * const f17h_umc_mce_desc[] = { + "DRAM ECC error", + "Data poison error on DRAM", + "SDP parity error", + "Advanced peripheral bus error", + "Command/address parity error", + "Write data CRC error", +}; + +static const char * const f17h_pb_mce_desc[] = { + "Parameter Block RAM ECC error", +}; + +static const char * const f17h_psp_mce_desc[] = { + "PSP RAM ECC or parity error", +}; + +static const char * const f17h_smu_mce_desc[] = { + "SMU RAM ECC or parity error", +}; + static bool f12h_mc0_mce(u16 ec, u8 xec) { bool ret = false; @@ -691,6 +820,177 @@ static void decode_mc6_mce(struct mce *m) pr_emerg(HW_ERR "Corrupted MC6 MCE info?\n"); } +static void decode_f17h_core_errors(const char *ip_name, u8 xec, + unsigned int mca_type) +{ + const char * const *error_desc_array; + size_t len; + + pr_emerg(HW_ERR "%s Error: ", 
ip_name); + + switch (mca_type) { + case SMCA_LS: + error_desc_array = f17h_ls_mce_desc; + len = ARRAY_SIZE(f17h_ls_mce_desc) - 1; + + if (xec == 0x4) { + pr_cont("Unrecognized LS MCA error code.\n"); + return; + } + break; + + case SMCA_IF: + error_desc_array = f17h_if_mce_desc; + len = ARRAY_SIZE(f17h_if_mce_desc) - 1; + break; + + case SMCA_L2_CACHE: + error_desc_array = f17h_l2_mce_desc; + len = ARRAY_SIZE(f17h_l2_mce_desc) - 1; + break; + + case SMCA_DE: + error_desc_array = f17h_de_mce_desc; + len = ARRAY_SIZE(f17h_de_mce_desc) - 1; + break; + + case SMCA_EX: + error_desc_array = f17h_ex_mce_desc; + len = ARRAY_SIZE(f17h_ex_mce_desc) - 1; + break; + + case SMCA_FP: + error_desc_array = f17h_fp_mce_desc; + len = ARRAY_SIZE(f17h_fp_mce_desc) - 1; + break; + + case SMCA_L3_CACHE: + error_desc_array = f17h_l3_mce_desc; + len = ARRAY_SIZE(f17h_l3_mce_desc) - 1; + break; + + default: + pr_cont("Corrupted MCA core error info.\n"); + return; + } + + if (xec > len) { + pr_cont("Unrecognized %s MCA bank error code.\n", + amd_core_mcablock_names[mca_type]); + return; + } + + pr_cont("%s.\n", error_desc_array[xec]); +} + +static void decode_df_errors(u8 xec, unsigned int mca_type) +{ + const char * const *error_desc_array; + size_t len; + + pr_emerg(HW_ERR "Data Fabric Error: "); + + switch (mca_type) { + case SMCA_CS: + error_desc_array = f17h_cs_mce_desc; + len = ARRAY_SIZE(f17h_cs_mce_desc) - 1; + break; + + case SMCA_PIE: + error_desc_array = f17h_pie_mce_desc; + len = ARRAY_SIZE(f17h_pie_mce_desc) - 1; + break; + + default: + pr_cont("Corrupted MCA Data Fabric info.\n"); + return; + } + + if (xec > len) { + pr_cont("Unrecognized %s MCA bank error code.\n", + amd_df_mcablock_names[mca_type]); + return; + } + + pr_cont("%s.\n", error_desc_array[xec]); +} + +/* Decode errors according to Scalable MCA specification */ +static void decode_smca_errors(struct mce *m) +{ + u32 addr = MSR_AMD64_SMCA_MCx_IPID(m->bank); + unsigned int hwid, mca_type, i; + u8 xec = XEC(m->status, xec_mask); + const char * const *error_desc_array; + const char *ip_name; + u32 low, high; + size_t len; + + if (rdmsr_safe(addr, &low, &high)) { + pr_emerg("Invalid IP block specified, error information is unreliable.\n"); + return; + } + + hwid = high & MCI_IPID_HWID; + mca_type = (high & MCI_IPID_MCATYPE) >> 16; + + pr_emerg(HW_ERR "MC%d IPID value: 0x%08x%08x\n", m->bank, high, low); + + /* + * Based on hwid and mca_type values, decode errors from respective IPs. + * Note: mca_type values make sense only in the context of an hwid. + */ + for (i = 0; i < ARRAY_SIZE(amd_hwids); i++) + if (amd_hwids[i].hwid == hwid) + break; + + switch (i) { + case SMCA_F17H_CORE: + ip_name = (mca_type == SMCA_L3_CACHE) ? 
+ "L3 Cache" : "F17h Core"; + return decode_f17h_core_errors(ip_name, xec, mca_type); + break; + + case SMCA_DF: + return decode_df_errors(xec, mca_type); + break; + + case SMCA_UMC: + error_desc_array = f17h_umc_mce_desc; + len = ARRAY_SIZE(f17h_umc_mce_desc) - 1; + break; + + case SMCA_PB: + error_desc_array = f17h_pb_mce_desc; + len = ARRAY_SIZE(f17h_pb_mce_desc) - 1; + break; + + case SMCA_PSP: + error_desc_array = f17h_psp_mce_desc; + len = ARRAY_SIZE(f17h_psp_mce_desc) - 1; + break; + + case SMCA_SMU: + error_desc_array = f17h_smu_mce_desc; + len = ARRAY_SIZE(f17h_smu_mce_desc) - 1; + break; + + default: + pr_emerg(HW_ERR "HWID:%d does not match any existing IPs.\n", hwid); + return; + } + + ip_name = amd_hwids[i].name; + pr_emerg(HW_ERR "%s Error: ", ip_name); + + if (xec > len) { + pr_cont("Unrecognized %s MCA bank error code.\n", ip_name); + return; + } + + pr_cont("%s.\n", error_desc_array[xec]); +} + static inline void amd_decode_err_code(u16 ec) { if (INT_ERROR(ec)) { @@ -752,6 +1052,7 @@ int amd_decode_mce(struct notifier_block *nb, unsigned long val, void *data) struct mce *m = (struct mce *)data; struct cpuinfo_x86 *c = &cpu_data(m->extcpu); int ecc; + u32 ebx = cpuid_ebx(0x80000007); if (amd_filter_mce(m)) return NOTIFY_STOP; @@ -769,11 +1070,20 @@ int amd_decode_mce(struct notifier_block *nb, unsigned long val, void *data) ((m->status & MCI_STATUS_PCC) ? "PCC" : "-"), ((m->status & MCI_STATUS_ADDRV) ? "AddrV" : "-")); - if (c->x86 == 0x15 || c->x86 == 0x16) + if (c->x86 >= 0x15) pr_cont("|%s|%s", ((m->status & MCI_STATUS_DEFERRED) ? "Deferred" : "-"), ((m->status & MCI_STATUS_POISON) ? "Poison" : "-")); + if (!!(ebx & BIT(3))) { + u32 low, high; + u32 addr = MSR_AMD64_SMCA_MCx_CONFIG(m->bank); + + if (!rdmsr_safe(addr, &low, &high) && + (low & MCI_CONFIG_MCAX)) + pr_cont("|%s", ((m->status & MCI_STATUS_TCC) ? "TCC" : "-")); + } + /* do the two bits[14:13] together */ ecc = (m->status >> 45) & 0x3; if (ecc) @@ -784,6 +1094,11 @@ int amd_decode_mce(struct notifier_block *nb, unsigned long val, void *data) if (m->status & MCI_STATUS_ADDRV) pr_emerg(HW_ERR "MC%d Error Address: 0x%016llx\n", m->bank, m->addr); + if (!!(ebx & BIT(3))) { + decode_smca_errors(m); + goto err_code; + } + if (!fam_ops) goto err_code; @@ -834,6 +1149,7 @@ static struct notifier_block amd_mce_dec_nb = { static int __init mce_amd_init(void) { struct cpuinfo_x86 *c = &boot_cpu_data; + u32 ebx; if (c->x86_vendor != X86_VENDOR_AMD) return -ENODEV; @@ -888,10 +1204,18 @@ static int __init mce_amd_init(void) fam_ops->mc2_mce = f16h_mc2_mce; break; + case 0x17: + ebx = cpuid_ebx(0x80000007); + xec_mask = 0x3f; + if (!(ebx & BIT(3))) { + printk(KERN_WARNING "Decoding supported only on Scalable MCA processors.\n"); + goto err_out; + } + break; + default: printk(KERN_WARNING "Huh? What family is it: 0x%x?!\n", c->x86); - kfree(fam_ops); - fam_ops = NULL; + goto err_out; } pr_info("MCE: In-kernel MCE decoding enabled.\n"); @@ -899,6 +1223,11 @@ static int __init mce_amd_init(void) mce_register_decode_chain(&amd_mce_dec_nb); return 0; + +err_out: + kfree(fam_ops); + fam_ops = NULL; + return -EINVAL; } early_initcall(mce_amd_init); From 8dd1e17a55b0bb1206c71c7a4344c5e3037cdf65 Mon Sep 17 00:00:00 2001 From: Aravind Gopalakrishnan Date: Mon, 7 Mar 2016 14:02:19 +0100 Subject: [PATCH 11/15] x86/mce/AMD: Fix logic to obtain block address In upcoming processors, the BLKPTR field is no longer used to indicate the MSR number of the additional register. Insted, it simply indicates the prescence of additional MSRs. 
Fix the logic here to gather MSR address from MSR_AMD64_SMCA_MCx_MISC() for newer processors and fall back to existing logic for older processors. [ Drop nextaddr_out label; style cleanups. ] Signed-off-by: Aravind Gopalakrishnan Signed-off-by: Borislav Petkov Cc: Borislav Petkov Cc: Linus Torvalds Cc: Peter Zijlstra Cc: Thomas Gleixner Cc: Tony Luck Cc: linux-edac Link: http://lkml.kernel.org/r/1457021458-2522-4-git-send-email-Aravind.Gopalakrishnan@amd.com Signed-off-by: Ingo Molnar --- arch/x86/include/asm/mce.h | 4 ++ arch/x86/kernel/cpu/mcheck/mce_amd.c | 84 ++++++++++++++++++---------- 2 files changed, 59 insertions(+), 29 deletions(-) diff --git a/arch/x86/include/asm/mce.h b/arch/x86/include/asm/mce.h index 9c467fe00551..72f86882d62a 100644 --- a/arch/x86/include/asm/mce.h +++ b/arch/x86/include/asm/mce.h @@ -104,10 +104,14 @@ #define MCE_LOG_SIGNATURE "MACHINECHECK" /* AMD Scalable MCA */ +#define MSR_AMD64_SMCA_MC0_MISC0 0xc0002003 #define MSR_AMD64_SMCA_MC0_CONFIG 0xc0002004 #define MSR_AMD64_SMCA_MC0_IPID 0xc0002005 +#define MSR_AMD64_SMCA_MC0_MISC1 0xc000200a +#define MSR_AMD64_SMCA_MCx_MISC(x) (MSR_AMD64_SMCA_MC0_MISC0 + 0x10*(x)) #define MSR_AMD64_SMCA_MCx_CONFIG(x) (MSR_AMD64_SMCA_MC0_CONFIG + 0x10*(x)) #define MSR_AMD64_SMCA_MCx_IPID(x) (MSR_AMD64_SMCA_MC0_IPID + 0x10*(x)) +#define MSR_AMD64_SMCA_MCx_MISCy(x, y) ((MSR_AMD64_SMCA_MC0_MISC1 + y) + (0x10*(x))) /* * This structure contains all data related to the MCE log. Also diff --git a/arch/x86/kernel/cpu/mcheck/mce_amd.c b/arch/x86/kernel/cpu/mcheck/mce_amd.c index ee487a93ebe7..a53eb1ba587e 100644 --- a/arch/x86/kernel/cpu/mcheck/mce_amd.c +++ b/arch/x86/kernel/cpu/mcheck/mce_amd.c @@ -304,6 +304,51 @@ static void deferred_error_interrupt_enable(struct cpuinfo_x86 *c) wrmsr(MSR_CU_DEF_ERR, low, high); } +static u32 get_block_address(u32 current_addr, u32 low, u32 high, + unsigned int bank, unsigned int block) +{ + u32 addr = 0, offset = 0; + + if (mce_flags.smca) { + if (!block) { + addr = MSR_AMD64_SMCA_MCx_MISC(bank); + } else { + /* + * For SMCA enabled processors, BLKPTR field of the + * first MISC register (MCx_MISC0) indicates presence of + * additional MISC register set (MISC1-4). 
+ */ + u32 low, high; + + if (rdmsr_safe(MSR_AMD64_SMCA_MCx_CONFIG(bank), &low, &high)) + return addr; + + if (!(low & MCI_CONFIG_MCAX)) + return addr; + + if (!rdmsr_safe(MSR_AMD64_SMCA_MCx_MISC(bank), &low, &high) && + (low & MASK_BLKPTR_LO)) + addr = MSR_AMD64_SMCA_MCx_MISCy(bank, block - 1); + } + return addr; + } + + /* Fall back to method we used for older processors: */ + switch (block) { + case 0: + addr = MSR_IA32_MCx_MISC(bank); + break; + case 1: + offset = ((low & MASK_BLKPTR_LO) >> 21); + if (offset) + addr = MCG_XBLK_ADDR + offset; + break; + default: + addr = ++current_addr; + } + return addr; +} + static int prepare_threshold_block(unsigned int bank, unsigned int block, u32 addr, int offset, u32 misc_high) @@ -366,16 +411,9 @@ void mce_amd_feature_init(struct cpuinfo_x86 *c) for (bank = 0; bank < mca_cfg.banks; ++bank) { for (block = 0; block < NR_BLOCKS; ++block) { - if (block == 0) - address = MSR_IA32_MCx_MISC(bank); - else if (block == 1) { - address = (low & MASK_BLKPTR_LO) >> 21; - if (!address) - break; - - address += MCG_XBLK_ADDR; - } else - ++address; + address = get_block_address(address, low, high, bank, block); + if (!address) + break; if (rdmsr_safe(address, &low, &high)) break; @@ -480,16 +518,9 @@ static void amd_threshold_interrupt(void) if (!(per_cpu(bank_map, cpu) & (1 << bank))) continue; for (block = 0; block < NR_BLOCKS; ++block) { - if (block == 0) { - address = MSR_IA32_MCx_MISC(bank); - } else if (block == 1) { - address = (low & MASK_BLKPTR_LO) >> 21; - if (!address) - break; - address += MCG_XBLK_ADDR; - } else { - ++address; - } + address = get_block_address(address, low, high, bank, block); + if (!address) + break; if (rdmsr_safe(address, &low, &high)) break; @@ -709,16 +740,11 @@ static int allocate_threshold_blocks(unsigned int cpu, unsigned int bank, if (err) goto out_free; recurse: - if (!block) { - address = (low & MASK_BLKPTR_LO) >> 21; - if (!address) - return 0; - address += MCG_XBLK_ADDR; - } else { - ++address; - } + address = get_block_address(address, low, high, bank, ++block); + if (!address) + return 0; - err = allocate_threshold_blocks(cpu, bank, ++block, address); + err = allocate_threshold_blocks(cpu, bank, block, address); if (err) goto out_free; From 2cd3b5f9033f0b051842a279dac5a54271cbd3c8 Mon Sep 17 00:00:00 2001 From: Aravind Gopalakrishnan Date: Mon, 7 Mar 2016 14:02:20 +0100 Subject: [PATCH 12/15] x86/mce: Clarify comments regarding deferred error Deferred errors indicate errors that hardware could not fix. But it still does not cause any interruption to program flow. So it does not generate any #MC and UC bit in MCx_STATUS is not set. Correct comment. 
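For reference, an illustrative helper (not part of this series) showing the bit pattern the severity table uses for this case - Deferred set while UC and Poison are clear - which is why such an error raises no #MC:

static bool mce_status_is_deferred(u64 status)
{
	return (status & (MCI_STATUS_UC | MCI_STATUS_DEFERRED | MCI_STATUS_POISON))
		== MCI_STATUS_DEFERRED;
}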
Signed-off-by: Aravind Gopalakrishnan Signed-off-by: Borislav Petkov Cc: Borislav Petkov Cc: Linus Torvalds Cc: Peter Zijlstra Cc: Thomas Gleixner Cc: Tony Luck Cc: linux-edac Link: http://lkml.kernel.org/r/1457021458-2522-5-git-send-email-Aravind.Gopalakrishnan@amd.com Signed-off-by: Ingo Molnar --- arch/x86/include/asm/mce.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/arch/x86/include/asm/mce.h b/arch/x86/include/asm/mce.h index 72f86882d62a..cfff34172be0 100644 --- a/arch/x86/include/asm/mce.h +++ b/arch/x86/include/asm/mce.h @@ -40,7 +40,7 @@ #define MCI_STATUS_AR (1ULL<<55) /* Action required */ /* AMD-specific bits */ -#define MCI_STATUS_DEFERRED (1ULL<<44) /* declare an uncorrected error */ +#define MCI_STATUS_DEFERRED (1ULL<<44) /* uncorrected error, deferred exception */ #define MCI_STATUS_POISON (1ULL<<43) /* access poisonous data */ #define MCI_STATUS_TCC (1ULL<<55) /* Task context corrupt */ From ea2ca36b658cfc6081ee454e97593c81f646806e Mon Sep 17 00:00:00 2001 From: Aravind Gopalakrishnan Date: Mon, 7 Mar 2016 14:02:21 +0100 Subject: [PATCH 13/15] x86/mce/AMD: Document some functionality In an attempt to aid in understanding of what the threshold_block structure holds, provide comments to describe the members here. Also, trim comments around threshold_restart_bank() and update copyright info. No functional change is introduced. Signed-off-by: Aravind Gopalakrishnan [ Shorten comments. ] Signed-off-by: Borislav Petkov Cc: Borislav Petkov Cc: Linus Torvalds Cc: Peter Zijlstra Cc: Thomas Gleixner Cc: Tony Luck Cc: linux-edac Link: http://lkml.kernel.org/r/1457021458-2522-6-git-send-email-Aravind.Gopalakrishnan@amd.com Signed-off-by: Ingo Molnar --- arch/x86/include/asm/amd_nb.h | 26 +++++++++++++++++--------- arch/x86/kernel/cpu/mcheck/mce_amd.c | 7 ++----- 2 files changed, 19 insertions(+), 14 deletions(-) diff --git a/arch/x86/include/asm/amd_nb.h b/arch/x86/include/asm/amd_nb.h index 3c56ef1ae068..5e828da2e18f 100644 --- a/arch/x86/include/asm/amd_nb.h +++ b/arch/x86/include/asm/amd_nb.h @@ -27,15 +27,23 @@ struct amd_l3_cache { }; struct threshold_block { - unsigned int block; - unsigned int bank; - unsigned int cpu; - u32 address; - u16 interrupt_enable; - bool interrupt_capable; - u16 threshold_limit; - struct kobject kobj; - struct list_head miscj; + unsigned int block; /* Number within bank */ + unsigned int bank; /* MCA bank the block belongs to */ + unsigned int cpu; /* CPU which controls MCA bank */ + u32 address; /* MSR address for the block */ + u16 interrupt_enable; /* Enable/Disable APIC interrupt */ + bool interrupt_capable; /* Bank can generate an interrupt. */ + + u16 threshold_limit; /* + * Value upon which threshold + * interrupt is generated. + */ + + struct kobject kobj; /* sysfs object */ + struct list_head miscj; /* + * List of threshold blocks + * within a bank. + */ }; struct threshold_bank { diff --git a/arch/x86/kernel/cpu/mcheck/mce_amd.c b/arch/x86/kernel/cpu/mcheck/mce_amd.c index a53eb1ba587e..9d656fd436ef 100644 --- a/arch/x86/kernel/cpu/mcheck/mce_amd.c +++ b/arch/x86/kernel/cpu/mcheck/mce_amd.c @@ -1,5 +1,5 @@ /* - * (c) 2005-2015 Advanced Micro Devices, Inc. + * (c) 2005-2016 Advanced Micro Devices, Inc. * Your use of this code is subject to the terms and conditions of the * GNU general public license version 2. 
See "COPYING" or * http://www.gnu.org/licenses/gpl.html @@ -201,10 +201,7 @@ static int lvt_off_valid(struct threshold_block *b, int apic, u32 lo, u32 hi) return 1; }; -/* - * Called via smp_call_function_single(), must be called with correct - * cpu affinity. - */ +/* Reprogram MCx_MISC MSR behind this threshold bank. */ static void threshold_restart_bank(void *_tr) { struct thresh_restart *tr = _tr; From 92b0729c34cab1f46d89aace3e66015f0bb4a682 Mon Sep 17 00:00:00 2001 From: Tony Luck Date: Thu, 18 Feb 2016 11:47:26 -0800 Subject: [PATCH 14/15] x86/mm, x86/mce: Add memcpy_mcsafe() Make use of the EXTABLE_FAULT exception table entries to write a kernel copy routine that doesn't crash the system if it encounters a machine check. Prime use case for this is to copy from large arrays of non-volatile memory used as storage. We have to use an unrolled copy loop for now because current hardware implementations treat a machine check in "rep mov" as fatal. When that is fixed we can simplify. Return type is a "bool". True means that we copied OK, false means that it didn't. Signed-off-by: Tony Luck Cc: Andrew Morton Cc: Andy Lutomirski Cc: Borislav Petkov Cc: Linus Torvalds Cc: Peter Zijlstra Cc: Peter Zijlstra Cc: Thomas Gleixner Cc: Tony Luck Link: http://lkml.kernel.org/r/a44e1055efc2d2a9473307b22c91caa437aa3f8b.1456439214.git.tony.luck@intel.com Signed-off-by: Ingo Molnar --- arch/x86/include/asm/string_64.h | 13 ++++ arch/x86/kernel/x8664_ksyms_64.c | 2 + arch/x86/lib/memcpy_64.S | 117 +++++++++++++++++++++++++++++++ 3 files changed, 132 insertions(+) diff --git a/arch/x86/include/asm/string_64.h b/arch/x86/include/asm/string_64.h index ff8b9a17dc4b..ca6ba3607705 100644 --- a/arch/x86/include/asm/string_64.h +++ b/arch/x86/include/asm/string_64.h @@ -78,6 +78,19 @@ int strcmp(const char *cs, const char *ct); #define memset(s, c, n) __memset(s, c, n) #endif +/** + * memcpy_mcsafe - copy memory with indication if a machine check happened + * + * @dst: destination address + * @src: source address + * @cnt: number of bytes to copy + * + * Low level memory copy function that catches machine checks + * + * Return true for success, false for fail + */ +bool memcpy_mcsafe(void *dst, const void *src, size_t cnt); + #endif /* __KERNEL__ */ #endif /* _ASM_X86_STRING_64_H */ diff --git a/arch/x86/kernel/x8664_ksyms_64.c b/arch/x86/kernel/x8664_ksyms_64.c index a0695be19864..cd05942bc918 100644 --- a/arch/x86/kernel/x8664_ksyms_64.c +++ b/arch/x86/kernel/x8664_ksyms_64.c @@ -37,6 +37,8 @@ EXPORT_SYMBOL(__copy_user_nocache); EXPORT_SYMBOL(_copy_from_user); EXPORT_SYMBOL(_copy_to_user); +EXPORT_SYMBOL_GPL(memcpy_mcsafe); + EXPORT_SYMBOL(copy_page); EXPORT_SYMBOL(clear_page); diff --git a/arch/x86/lib/memcpy_64.S b/arch/x86/lib/memcpy_64.S index 16698bba87de..7d37641ada5b 100644 --- a/arch/x86/lib/memcpy_64.S +++ b/arch/x86/lib/memcpy_64.S @@ -177,3 +177,120 @@ ENTRY(memcpy_orig) .Lend: retq ENDPROC(memcpy_orig) + +#ifndef CONFIG_UML +/* + * memcpy_mcsafe - memory copy with machine check exception handling + * Note that we only catch machine checks when reading the source addresses. + * Writes to target are posted and don't generate machine checks. + */ +ENTRY(memcpy_mcsafe) + cmpl $8, %edx + /* Less than 8 bytes? 
Go to byte copy loop */ + jb .L_no_whole_words + + /* Check for bad alignment of source */ + testl $7, %esi + /* Already aligned */ + jz .L_8byte_aligned + + /* Copy one byte at a time until source is 8-byte aligned */ + movl %esi, %ecx + andl $7, %ecx + subl $8, %ecx + negl %ecx + subl %ecx, %edx +.L_copy_leading_bytes: + movb (%rsi), %al + movb %al, (%rdi) + incq %rsi + incq %rdi + decl %ecx + jnz .L_copy_leading_bytes + +.L_8byte_aligned: + /* Figure out how many whole cache lines (64-bytes) to copy */ + movl %edx, %ecx + andl $63, %edx + shrl $6, %ecx + jz .L_no_whole_cache_lines + + /* Loop copying whole cache lines */ +.L_cache_w0: movq (%rsi), %r8 +.L_cache_w1: movq 1*8(%rsi), %r9 +.L_cache_w2: movq 2*8(%rsi), %r10 +.L_cache_w3: movq 3*8(%rsi), %r11 + movq %r8, (%rdi) + movq %r9, 1*8(%rdi) + movq %r10, 2*8(%rdi) + movq %r11, 3*8(%rdi) +.L_cache_w4: movq 4*8(%rsi), %r8 +.L_cache_w5: movq 5*8(%rsi), %r9 +.L_cache_w6: movq 6*8(%rsi), %r10 +.L_cache_w7: movq 7*8(%rsi), %r11 + movq %r8, 4*8(%rdi) + movq %r9, 5*8(%rdi) + movq %r10, 6*8(%rdi) + movq %r11, 7*8(%rdi) + leaq 64(%rsi), %rsi + leaq 64(%rdi), %rdi + decl %ecx + jnz .L_cache_w0 + + /* Are there any trailing 8-byte words? */ +.L_no_whole_cache_lines: + movl %edx, %ecx + andl $7, %edx + shrl $3, %ecx + jz .L_no_whole_words + + /* Copy trailing words */ +.L_copy_trailing_words: + movq (%rsi), %r8 + mov %r8, (%rdi) + leaq 8(%rsi), %rsi + leaq 8(%rdi), %rdi + decl %ecx + jnz .L_copy_trailing_words + + /* Any trailing bytes? */ +.L_no_whole_words: + andl %edx, %edx + jz .L_done_memcpy_trap + + /* Copy trailing bytes */ + movl %edx, %ecx +.L_copy_trailing_bytes: + movb (%rsi), %al + movb %al, (%rdi) + incq %rsi + incq %rdi + decl %ecx + jnz .L_copy_trailing_bytes + + /* Copy successful. Return true */ +.L_done_memcpy_trap: + xorq %rax, %rax + ret +ENDPROC(memcpy_mcsafe) + + .section .fixup, "ax" + /* Return false for any failure */ +.L_memcpy_mcsafe_fail: + mov $1, %rax + ret + + .previous + + _ASM_EXTABLE_FAULT(.L_copy_leading_bytes, .L_memcpy_mcsafe_fail) + _ASM_EXTABLE_FAULT(.L_cache_w0, .L_memcpy_mcsafe_fail) + _ASM_EXTABLE_FAULT(.L_cache_w1, .L_memcpy_mcsafe_fail) + _ASM_EXTABLE_FAULT(.L_cache_w3, .L_memcpy_mcsafe_fail) + _ASM_EXTABLE_FAULT(.L_cache_w3, .L_memcpy_mcsafe_fail) + _ASM_EXTABLE_FAULT(.L_cache_w4, .L_memcpy_mcsafe_fail) + _ASM_EXTABLE_FAULT(.L_cache_w5, .L_memcpy_mcsafe_fail) + _ASM_EXTABLE_FAULT(.L_cache_w6, .L_memcpy_mcsafe_fail) + _ASM_EXTABLE_FAULT(.L_cache_w7, .L_memcpy_mcsafe_fail) + _ASM_EXTABLE_FAULT(.L_copy_trailing_words, .L_memcpy_mcsafe_fail) + _ASM_EXTABLE_FAULT(.L_copy_trailing_bytes, .L_memcpy_mcsafe_fail) +#endif From eb1af3b71f9d83e45f2fd2fd649356e98e1c582c Mon Sep 17 00:00:00 2001 From: "Luck, Tony" Date: Wed, 9 Mar 2016 16:40:48 -0800 Subject: [PATCH 15/15] EDAC/sb_edac: Fix computation of channel address Large memory Haswell-EX systems with multiple DIMMs per channel were sometimes reporting the wrong DIMM. Found three problems: 1) Debug printouts for socket and channel interleave were not interpreting the register fields correctly. The socket interleave field is a 2^X value (0=1, 1=2, 2=4, 3=8). The channel interleave is X+1 (0=1, 1=2, 2=3. 3=4). 2) Actual use of the socket interleave value didn't interpret as 2^X 3) Conversion of address to channel address was complicated, and wrong. 
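The corrected decode can be summarized in a standalone sketch: the socket interleave field selects 2^X ways, the channel interleave field selects X+1 ways, and the channel address is obtained by removing the TAD offset, dividing out both interleaves above the low 6(+shiftup) bits, and restoring those low bits. This mirrors the change in the diff below; the helper and parameter names are illustrative and not part of the driver:

#include <stdint.h>

static uint64_t sys_to_channel_addr(uint64_t addr, uint64_t tad_offset,
				    unsigned int sock_field, /* raw TAD_SOCK() field */
				    unsigned int ch_field,   /* raw TAD_CH() field   */
				    unsigned int shiftup)
{
	unsigned int sck_way = 1u << sock_field; /* 0->1, 1->2, 2->4, 3->8 ways */
	unsigned int ch_way  = ch_field + 1;     /* 0->1, 1->2, 2->3, 3->4 ways */
	uint64_t ch_addr;

	ch_addr = addr - tad_offset;             /* strip the TAD region offset     */
	ch_addr >>= (6 + shiftup);               /* drop bits below the interleave  */
	ch_addr /= ch_way * sck_way;             /* divide out both interleaves     */
	ch_addr <<= (6 + shiftup);
	ch_addr |= addr & ((1ULL << (6 + shiftup)) - 1); /* restore untouched low bits */

	return ch_addr;
}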
Signed-off-by: Tony Luck Acked-by: Aristeu Rozanski Cc: Borislav Petkov Cc: Linus Torvalds Cc: Mauro Carvalho Chehab Cc: Peter Zijlstra Cc: Thomas Gleixner Cc: linux-edac@vger.kernel.org Cc: stable@vger.kernel.org Signed-off-by: Ingo Molnar --- drivers/edac/sb_edac.c | 26 ++++++++++---------------- 1 file changed, 10 insertions(+), 16 deletions(-) diff --git a/drivers/edac/sb_edac.c b/drivers/edac/sb_edac.c index e438ee5b433f..34e54a313bf4 100644 --- a/drivers/edac/sb_edac.c +++ b/drivers/edac/sb_edac.c @@ -1839,8 +1839,8 @@ static void get_memory_layout(const struct mem_ctl_info *mci) edac_dbg(0, "TAD#%d: up to %u.%03u GB (0x%016Lx), socket interleave %d, memory interleave %d, TGT: %d, %d, %d, %d, reg=0x%08x\n", n_tads, gb, (mb*1000)/1024, ((u64)tmp_mb) << 20L, - (u32)TAD_SOCK(reg), - (u32)TAD_CH(reg), + (u32)(1 << TAD_SOCK(reg)), + (u32)TAD_CH(reg) + 1, (u32)TAD_TGT0(reg), (u32)TAD_TGT1(reg), (u32)TAD_TGT2(reg), @@ -2118,7 +2118,7 @@ static int get_memory_error_data(struct mem_ctl_info *mci, } ch_way = TAD_CH(reg) + 1; - sck_way = TAD_SOCK(reg) + 1; + sck_way = 1 << TAD_SOCK(reg); if (ch_way == 3) idx = addr >> 6; @@ -2175,7 +2175,7 @@ static int get_memory_error_data(struct mem_ctl_info *mci, n_tads, addr, limit, - (u32)TAD_SOCK(reg), + sck_way, ch_way, offset, idx, @@ -2190,18 +2190,12 @@ static int get_memory_error_data(struct mem_ctl_info *mci, offset, addr); return -EINVAL; } - addr -= offset; - /* Store the low bits [0:6] of the addr */ - ch_addr = addr & 0x7f; - /* Remove socket wayness and remove 6 bits */ - addr >>= 6; - addr = div_u64(addr, sck_xch); -#if 0 - /* Divide by channel way */ - addr = addr / ch_way; -#endif - /* Recover the last 6 bits */ - ch_addr |= addr << 6; + + ch_addr = addr - offset; + ch_addr >>= (6 + shiftup); + ch_addr /= ch_way * sck_way; + ch_addr <<= (6 + shiftup); + ch_addr |= addr & ((1 << (6 + shiftup)) - 1); /* * Step 3) Decode rank