powerpc fixes for 4.17 #4

A bunch of fixes, mostly for existing code and going to stable.
 
 Our memory hot-unplug path wasn't flushing the cache before removing memory.
 That is a problem now that we are doing memory hotplug on bare metal.
 
 Three fixes for the NPU code that supports devices connected via NVLink (ie.
 GPUs). The main one tweaks the TLB flush algorithm to avoid soft lockups for
 large flushes.
 
 A fix for our memory error handling where we would loop infinitely, returning
 back to the bad access and hard lockup the CPU.
 
 Fixes for the OPAL RTC driver, which wasn't handling some error cases correctly.
 
 A fix for a hardlockup in the powernv cpufreq driver.
 
 And finally two fixes to our smp_send_stop(), required due to a recent change to
 use it on shutdown.
 
 Thanks to:
   Alistair Popple, Balbir Singh, Laurentiu Tudor, Mahesh Salgaonkar, Mark
   Hairgrove, Nicholas Piggin, Rashmica Gupta, Shilpasri G Bhat.
 -----BEGIN PGP SIGNATURE-----
 
 iQIwBAABCAAaBQJa5FRaExxtcGVAZWxsZXJtYW4uaWQuYXUACgkQUevqPMjhpYA3
 LQ//es8gvVVYxXOP5m+jl+LP//nQ8Z9l4ezW/0QmtAwuzAnt31F3eYcBwtIa5EaZ
 Fm7iQ5eu+o4JJSj7y/a1gXZOgZaG1uprc6psUdI+FZ6rQ3AAF9BlD7J5ZvkJ/Nuz
 Wo37+oxr8T8dpGYurS2nrOyP1654ZNvtkHzr1rovhNZ/Yx6GuDppyou1cBrcHgoQ
 f/SILBDpwPQ6sEzMOPptN3SNajq2716kgoTT9yU2lEHGReeMPc1RL1gVw91O7jdA
 RJGZl/GTPDDuT2hg0yms4eWhmMDbfQU6kRbPwBtYM5BsCvvBGuISL3RKSceNSo/C
 LO3IqnirNff0zzx5dSuy+cmzoPxMbDhWV91to29HJH5cyvWCqH8V5uJsKeHnDbmr
 YscSvgi6iEbiMtuckYL8Bqe/jcE/4RCRixH+j7mkJc+XUrvjligUFG9VVq8tERXF
 lA/M0Zh+AI0doFjiPbkWHlbcfPu0jhwnZ7aivpf5FKdcfF6aeBr5tX+j0bRqAXEZ
 FVUd2gst7s73q4B8b8QicfMpJkYfWia9PnrifrHe10EYi9kL2z5GjDOz8s6Suzed
 KD+XGuLWb9zm2Fuga/Guzx2YM0DWTEk/or5qbBRh+44WTprEZxDTotVl5tTYfgsU
 ErEnGqlBevCrzknbe7ZaWKlkzSNXxoF9OpETf8kVOocEuWs=
 =JJLB
 -----END PGP SIGNATURE-----

Merge tag 'powerpc-4.17-4' of git://git.kernel.org/pub/scm/linux/kernel/git/powerpc/linux

Pull powerpc fixes from Michael Ellerman:
 "A bunch of fixes, mostly for existing code and going to stable.

  Our memory hot-unplug path wasn't flushing the cache before removing
  memory. That is a problem now that we are doing memory hotplug on bare
  metal.

  Three fixes for the NPU code that supports devices connected via
  NVLink (ie. GPUs). The main one tweaks the TLB flush algorithm to
  avoid soft lockups for large flushes.

  A fix for our memory error handling where we would loop infinitely,
  returning back to the bad access and hard lockup the CPU.

  Fixes for the OPAL RTC driver, which wasn't handling some error cases
  correctly.

  A fix for a hardlockup in the powernv cpufreq driver.

  And finally two fixes to our smp_send_stop(), required due to a recent
  change to use it on shutdown.

  Thanks to: Alistair Popple, Balbir Singh, Laurentiu Tudor, Mahesh
  Salgaonkar, Mark Hairgrove, Nicholas Piggin, Rashmica Gupta, Shilpasri
  G Bhat"

* tag 'powerpc-4.17-4' of git://git.kernel.org/pub/scm/linux/kernel/git/powerpc/linux:
  powerpc/kvm/booke: Fix altivec related build break
  powerpc: Fix deadlock with multiple calls to smp_send_stop
  cpufreq: powernv: Fix hardlockup due to synchronous smp_call in timer interrupt
  powerpc: Fix smp_send_stop NMI IPI handling
  rtc: opal: Fix OPAL RTC driver OPAL_BUSY loops
  powerpc/mce: Fix a bug where mce loops on memory UE.
  powerpc/powernv/npu: Do a PID GPU TLB flush when invalidating a large address range
  powerpc/powernv/npu: Prevent overwriting of pnv_npu2_init_contex() callback parameters
  powerpc/powernv/npu: Add lock to prevent race in concurrent context init/destroy
  powerpc/powernv/memtrace: Let the arch hotunplug code flush cache
  powerpc/mm: Flush cache on memory hot(un)plug
This commit is contained in:
Linus Torvalds 2018-04-28 09:45:34 -07:00
commit 0d95cfa922
10 changed files with 166 additions and 65 deletions

View File

@ -15,7 +15,7 @@
extern void powernv_set_nmmu_ptcr(unsigned long ptcr);
extern struct npu_context *pnv_npu2_init_context(struct pci_dev *gpdev,
unsigned long flags,
struct npu_context *(*cb)(struct npu_context *, void *),
void (*cb)(struct npu_context *, void *),
void *priv);
extern void pnv_npu2_destroy_context(struct npu_context *context,
struct pci_dev *gpdev);

View File

@ -441,7 +441,6 @@ static int mce_handle_ierror(struct pt_regs *regs,
if (pfn != ULONG_MAX) {
*phys_addr =
(pfn << PAGE_SHIFT);
handled = 1;
}
}
}
@ -532,9 +531,7 @@ static int mce_handle_derror(struct pt_regs *regs,
* kernel/exception-64s.h
*/
if (get_paca()->in_mce < MAX_MCE_DEPTH)
if (!mce_find_instr_ea_and_pfn(regs, addr,
phys_addr))
handled = 1;
mce_find_instr_ea_and_pfn(regs, addr, phys_addr);
}
found = 1;
}
@ -572,7 +569,7 @@ static long mce_handle_error(struct pt_regs *regs,
const struct mce_ierror_table itable[])
{
struct mce_error_info mce_err = { 0 };
uint64_t addr, phys_addr;
uint64_t addr, phys_addr = ULONG_MAX;
uint64_t srr1 = regs->msr;
long handled;

View File

@ -566,10 +566,35 @@ void crash_send_ipi(void (*crash_ipi_callback)(struct pt_regs *))
#endif
#ifdef CONFIG_NMI_IPI
static void stop_this_cpu(struct pt_regs *regs)
#else
static void nmi_stop_this_cpu(struct pt_regs *regs)
{
/*
* This is a special case because it never returns, so the NMI IPI
* handling would never mark it as done, which makes any later
* smp_send_nmi_ipi() call spin forever. Mark it done now.
*
* IRQs are already hard disabled by the smp_handle_nmi_ipi.
*/
nmi_ipi_lock();
nmi_ipi_busy_count--;
nmi_ipi_unlock();
/* Remove this CPU */
set_cpu_online(smp_processor_id(), false);
spin_begin();
while (1)
spin_cpu_relax();
}
void smp_send_stop(void)
{
smp_send_nmi_ipi(NMI_IPI_ALL_OTHERS, nmi_stop_this_cpu, 1000000);
}
#else /* CONFIG_NMI_IPI */
static void stop_this_cpu(void *dummy)
#endif
{
/* Remove this CPU */
set_cpu_online(smp_processor_id(), false);
@ -582,12 +607,22 @@ static void stop_this_cpu(void *dummy)
void smp_send_stop(void)
{
#ifdef CONFIG_NMI_IPI
smp_send_nmi_ipi(NMI_IPI_ALL_OTHERS, stop_this_cpu, 1000000);
#else
static bool stopped = false;
/*
* Prevent waiting on csd lock from a previous smp_send_stop.
* This is racy, but in general callers try to do the right
* thing and only fire off one smp_send_stop (e.g., see
* kernel/panic.c)
*/
if (stopped)
return;
stopped = true;
smp_call_function(stop_this_cpu, NULL, 0);
#endif
}
#endif /* CONFIG_NMI_IPI */
struct thread_info *current_set[NR_CPUS];

View File

@ -305,6 +305,13 @@ void kvmppc_core_queue_fpunavail(struct kvm_vcpu *vcpu)
kvmppc_booke_queue_irqprio(vcpu, BOOKE_IRQPRIO_FP_UNAVAIL);
}
#ifdef CONFIG_ALTIVEC
void kvmppc_core_queue_vec_unavail(struct kvm_vcpu *vcpu)
{
kvmppc_booke_queue_irqprio(vcpu, BOOKE_IRQPRIO_ALTIVEC_UNAVAIL);
}
#endif
void kvmppc_core_queue_dec(struct kvm_vcpu *vcpu)
{
kvmppc_booke_queue_irqprio(vcpu, BOOKE_IRQPRIO_DECREMENTER);

View File

@ -133,6 +133,7 @@ int __meminit arch_add_memory(int nid, u64 start, u64 size, struct vmem_altmap *
start, start + size, rc);
return -EFAULT;
}
flush_inval_dcache_range(start, start + size);
return __add_pages(nid, start_pfn, nr_pages, altmap, want_memblock);
}
@ -159,6 +160,7 @@ int __meminit arch_remove_memory(u64 start, u64 size, struct vmem_altmap *altmap
/* Remove htab bolted mappings for this section of memory */
start = (unsigned long)__va(start);
flush_inval_dcache_range(start, start + size);
ret = remove_section_mapping(start, start + size);
/* Ensure all vmalloc mappings are flushed in case they also

View File

@ -82,19 +82,6 @@ static const struct file_operations memtrace_fops = {
.open = simple_open,
};
static void flush_memory_region(u64 base, u64 size)
{
unsigned long line_size = ppc64_caches.l1d.size;
u64 end = base + size;
u64 addr;
base = round_down(base, line_size);
end = round_up(end, line_size);
for (addr = base; addr < end; addr += line_size)
asm volatile("dcbf 0,%0" : "=r" (addr) :: "memory");
}
static int check_memblock_online(struct memory_block *mem, void *arg)
{
if (mem->state != MEM_ONLINE)
@ -132,10 +119,6 @@ static bool memtrace_offline_pages(u32 nid, u64 start_pfn, u64 nr_pages)
walk_memory_range(start_pfn, end_pfn, (void *)MEM_OFFLINE,
change_memblock_state);
/* RCU grace period? */
flush_memory_region((u64)__va(start_pfn << PAGE_SHIFT),
nr_pages << PAGE_SHIFT);
lock_device_hotplug();
remove_memory(nid, start_pfn << PAGE_SHIFT, nr_pages << PAGE_SHIFT);
unlock_device_hotplug();

View File

@ -33,6 +33,19 @@
#define npu_to_phb(x) container_of(x, struct pnv_phb, npu)
/*
* spinlock to protect initialisation of an npu_context for a particular
* mm_struct.
*/
static DEFINE_SPINLOCK(npu_context_lock);
/*
* When an address shootdown range exceeds this threshold we invalidate the
* entire TLB on the GPU for the given PID rather than each specific address in
* the range.
*/
#define ATSD_THRESHOLD (2*1024*1024)
/*
* Other types of TCE cache invalidation are not functional in the
* hardware.
@ -401,7 +414,7 @@ struct npu_context {
bool nmmu_flush;
/* Callback to stop translation requests on a given GPU */
struct npu_context *(*release_cb)(struct npu_context *, void *);
void (*release_cb)(struct npu_context *context, void *priv);
/*
* Private pointer passed to the above callback for usage by
@ -671,11 +684,19 @@ static void pnv_npu2_mn_invalidate_range(struct mmu_notifier *mn,
struct npu_context *npu_context = mn_to_npu_context(mn);
unsigned long address;
for (address = start; address < end; address += PAGE_SIZE)
mmio_invalidate(npu_context, 1, address, false);
if (end - start > ATSD_THRESHOLD) {
/*
* Just invalidate the entire PID if the address range is too
* large.
*/
mmio_invalidate(npu_context, 0, 0, true);
} else {
for (address = start; address < end; address += PAGE_SIZE)
mmio_invalidate(npu_context, 1, address, false);
/* Do the flush only on the final addess == end */
mmio_invalidate(npu_context, 1, address, true);
/* Do the flush only on the final addess == end */
mmio_invalidate(npu_context, 1, address, true);
}
}
static const struct mmu_notifier_ops nv_nmmu_notifier_ops = {
@ -696,11 +717,12 @@ static const struct mmu_notifier_ops nv_nmmu_notifier_ops = {
* Returns an error if there no contexts are currently available or a
* npu_context which should be passed to pnv_npu2_handle_fault().
*
* mmap_sem must be held in write mode.
* mmap_sem must be held in write mode and must not be called from interrupt
* context.
*/
struct npu_context *pnv_npu2_init_context(struct pci_dev *gpdev,
unsigned long flags,
struct npu_context *(*cb)(struct npu_context *, void *),
void (*cb)(struct npu_context *, void *),
void *priv)
{
int rc;
@ -743,7 +765,9 @@ struct npu_context *pnv_npu2_init_context(struct pci_dev *gpdev,
/*
* Setup the NPU context table for a particular GPU. These need to be
* per-GPU as we need the tables to filter ATSDs when there are no
* active contexts on a particular GPU.
* active contexts on a particular GPU. It is safe for these to be
* called concurrently with destroy as the OPAL call takes appropriate
* locks and refcounts on init/destroy.
*/
rc = opal_npu_init_context(nphb->opal_id, mm->context.id, flags,
PCI_DEVID(gpdev->bus->number, gpdev->devfn));
@ -754,8 +778,29 @@ struct npu_context *pnv_npu2_init_context(struct pci_dev *gpdev,
* We store the npu pci device so we can more easily get at the
* associated npus.
*/
spin_lock(&npu_context_lock);
npu_context = mm->context.npu_context;
if (npu_context) {
if (npu_context->release_cb != cb ||
npu_context->priv != priv) {
spin_unlock(&npu_context_lock);
opal_npu_destroy_context(nphb->opal_id, mm->context.id,
PCI_DEVID(gpdev->bus->number,
gpdev->devfn));
return ERR_PTR(-EINVAL);
}
WARN_ON(!kref_get_unless_zero(&npu_context->kref));
}
spin_unlock(&npu_context_lock);
if (!npu_context) {
/*
* We can set up these fields without holding the
* npu_context_lock as the npu_context hasn't been returned to
* the caller meaning it can't be destroyed. Parallel allocation
* is protected against by mmap_sem.
*/
rc = -ENOMEM;
npu_context = kzalloc(sizeof(struct npu_context), GFP_KERNEL);
if (npu_context) {
@ -774,8 +819,6 @@ struct npu_context *pnv_npu2_init_context(struct pci_dev *gpdev,
}
mm->context.npu_context = npu_context;
} else {
WARN_ON(!kref_get_unless_zero(&npu_context->kref));
}
npu_context->release_cb = cb;
@ -814,15 +857,16 @@ static void pnv_npu2_release_context(struct kref *kref)
mm_context_remove_copro(npu_context->mm);
npu_context->mm->context.npu_context = NULL;
mmu_notifier_unregister(&npu_context->mn,
npu_context->mm);
kfree(npu_context);
}
/*
* Destroy a context on the given GPU. May free the npu_context if it is no
* longer active on any GPUs. Must not be called from interrupt context.
*/
void pnv_npu2_destroy_context(struct npu_context *npu_context,
struct pci_dev *gpdev)
{
int removed;
struct pnv_phb *nphb;
struct npu *npu;
struct pci_dev *npdev = pnv_pci_get_npu_dev(gpdev, 0);
@ -844,7 +888,21 @@ void pnv_npu2_destroy_context(struct npu_context *npu_context,
WRITE_ONCE(npu_context->npdev[npu->index][nvlink_index], NULL);
opal_npu_destroy_context(nphb->opal_id, npu_context->mm->context.id,
PCI_DEVID(gpdev->bus->number, gpdev->devfn));
kref_put(&npu_context->kref, pnv_npu2_release_context);
spin_lock(&npu_context_lock);
removed = kref_put(&npu_context->kref, pnv_npu2_release_context);
spin_unlock(&npu_context_lock);
/*
* We need to do this outside of pnv_npu2_release_context so that it is
* outside the spinlock as mmu_notifier_destroy uses SRCU.
*/
if (removed) {
mmu_notifier_unregister(&npu_context->mn,
npu_context->mm);
kfree(npu_context);
}
}
EXPORT_SYMBOL(pnv_npu2_destroy_context);

View File

@ -48,10 +48,12 @@ unsigned long __init opal_get_boot_time(void)
while (rc == OPAL_BUSY || rc == OPAL_BUSY_EVENT) {
rc = opal_rtc_read(&__y_m_d, &__h_m_s_ms);
if (rc == OPAL_BUSY_EVENT)
if (rc == OPAL_BUSY_EVENT) {
mdelay(OPAL_BUSY_DELAY_MS);
opal_poll_events(NULL);
else if (rc == OPAL_BUSY)
mdelay(10);
} else if (rc == OPAL_BUSY) {
mdelay(OPAL_BUSY_DELAY_MS);
}
}
if (rc != OPAL_SUCCESS)
return 0;

View File

@ -679,6 +679,16 @@ void gpstate_timer_handler(struct timer_list *t)
if (!spin_trylock(&gpstates->gpstate_lock))
return;
/*
* If the timer has migrated to the different cpu then bring
* it back to one of the policy->cpus
*/
if (!cpumask_test_cpu(raw_smp_processor_id(), policy->cpus)) {
gpstates->timer.expires = jiffies + msecs_to_jiffies(1);
add_timer_on(&gpstates->timer, cpumask_first(policy->cpus));
spin_unlock(&gpstates->gpstate_lock);
return;
}
/*
* If PMCR was last updated was using fast_swtich then
@ -718,10 +728,8 @@ void gpstate_timer_handler(struct timer_list *t)
if (gpstate_idx != gpstates->last_lpstate_idx)
queue_gpstate_timer(gpstates);
set_pstate(&freq_data);
spin_unlock(&gpstates->gpstate_lock);
/* Timer may get migrated to a different cpu on cpu hot unplug */
smp_call_function_any(policy->cpus, set_pstate, &freq_data, 1);
}
/*

View File

@ -57,7 +57,7 @@ static void tm_to_opal(struct rtc_time *tm, u32 *y_m_d, u64 *h_m_s_ms)
static int opal_get_rtc_time(struct device *dev, struct rtc_time *tm)
{
long rc = OPAL_BUSY;
s64 rc = OPAL_BUSY;
int retries = 10;
u32 y_m_d;
u64 h_m_s_ms;
@ -66,13 +66,17 @@ static int opal_get_rtc_time(struct device *dev, struct rtc_time *tm)
while (rc == OPAL_BUSY || rc == OPAL_BUSY_EVENT) {
rc = opal_rtc_read(&__y_m_d, &__h_m_s_ms);
if (rc == OPAL_BUSY_EVENT)
if (rc == OPAL_BUSY_EVENT) {
msleep(OPAL_BUSY_DELAY_MS);
opal_poll_events(NULL);
else if (retries-- && (rc == OPAL_HARDWARE
|| rc == OPAL_INTERNAL_ERROR))
msleep(10);
else if (rc != OPAL_BUSY && rc != OPAL_BUSY_EVENT)
break;
} else if (rc == OPAL_BUSY) {
msleep(OPAL_BUSY_DELAY_MS);
} else if (rc == OPAL_HARDWARE || rc == OPAL_INTERNAL_ERROR) {
if (retries--) {
msleep(10); /* Wait 10ms before retry */
rc = OPAL_BUSY; /* go around again */
}
}
}
if (rc != OPAL_SUCCESS)
@ -87,21 +91,26 @@ static int opal_get_rtc_time(struct device *dev, struct rtc_time *tm)
static int opal_set_rtc_time(struct device *dev, struct rtc_time *tm)
{
long rc = OPAL_BUSY;
s64 rc = OPAL_BUSY;
int retries = 10;
u32 y_m_d = 0;
u64 h_m_s_ms = 0;
tm_to_opal(tm, &y_m_d, &h_m_s_ms);
while (rc == OPAL_BUSY || rc == OPAL_BUSY_EVENT) {
rc = opal_rtc_write(y_m_d, h_m_s_ms);
if (rc == OPAL_BUSY_EVENT)
if (rc == OPAL_BUSY_EVENT) {
msleep(OPAL_BUSY_DELAY_MS);
opal_poll_events(NULL);
else if (retries-- && (rc == OPAL_HARDWARE
|| rc == OPAL_INTERNAL_ERROR))
msleep(10);
else if (rc != OPAL_BUSY && rc != OPAL_BUSY_EVENT)
break;
} else if (rc == OPAL_BUSY) {
msleep(OPAL_BUSY_DELAY_MS);
} else if (rc == OPAL_HARDWARE || rc == OPAL_INTERNAL_ERROR) {
if (retries--) {
msleep(10); /* Wait 10ms before retry */
rc = OPAL_BUSY; /* go around again */
}
}
}
return rc == OPAL_SUCCESS ? 0 : -EIO;