linux/mm/kasan/quarantine.c
Dmitry Vyukov ce5bec54bb kasan: fix races in quarantine_remove_cache()
quarantine_remove_cache() frees all pending objects that belong to the
cache, before we destroy the cache itself.  However there are currently
two possibilities how it can fail to do so.

First, another thread can hold some of the objects from the cache in
temp list in quarantine_put().  quarantine_put() has a windows of
enabled interrupts, and on_each_cpu() in quarantine_remove_cache() can
finish right in that window.  These objects will be later freed into the
destroyed cache.

Then, quarantine_reduce() has the same problem.  It grabs a batch of
objects from the global quarantine, then unlocks quarantine_lock and
then frees the batch.  quarantine_remove_cache() can finish while some
objects from the cache are still in the local to_free list in
quarantine_reduce().

Fix the race with quarantine_put() by disabling interrupts for the whole
duration of quarantine_put().  In combination with on_each_cpu() in
quarantine_remove_cache() it ensures that quarantine_remove_cache()
either sees the objects in the per-cpu list or in the global list.

Fix the race with quarantine_reduce() by protecting quarantine_reduce()
with srcu critical section and then doing synchronize_srcu() at the end
of quarantine_remove_cache().

I've done some assessment of how good synchronize_srcu() works in this
case.  And on a 4 CPU VM I see that it blocks waiting for pending read
critical sections in about 2-3% of cases.  Which looks good to me.

I suspect that these races are the root cause of some GPFs that I
episodically hit.  Previously I did not have any explanation for them.

  BUG: unable to handle kernel NULL pointer dereference at 00000000000000c8
  IP: qlist_free_all+0x2e/0xc0 mm/kasan/quarantine.c:155
  PGD 6aeea067
  PUD 60ed7067
  PMD 0
  Oops: 0000 [#1] SMP KASAN
  Dumping ftrace buffer:
     (ftrace buffer empty)
  Modules linked in:
  CPU: 0 PID: 13667 Comm: syz-executor2 Not tainted 4.10.0+ #60
  Hardware name: QEMU Standard PC (i440FX + PIIX, 1996), BIOS Bochs 01/01/2011
  task: ffff88005f948040 task.stack: ffff880069818000
  RIP: 0010:qlist_free_all+0x2e/0xc0 mm/kasan/quarantine.c:155
  RSP: 0018:ffff88006981f298 EFLAGS: 00010246
  RAX: ffffea0000ffff00 RBX: 0000000000000000 RCX: ffffea0000ffff1f
  RDX: 0000000000000000 RSI: ffff88003fffc3e0 RDI: 0000000000000000
  RBP: ffff88006981f2c0 R08: ffff88002fed7bd8 R09: 00000001001f000d
  R10: 00000000001f000d R11: ffff88006981f000 R12: ffff88003fffc3e0
  R13: ffff88006981f2d0 R14: ffffffff81877fae R15: 0000000080000000
  FS:  00007fb911a2d700(0000) GS:ffff88003ec00000(0000) knlGS:0000000000000000
  CS:  0010 DS: 0000 ES: 0000 CR0: 0000000080050033
  CR2: 00000000000000c8 CR3: 0000000060ed6000 CR4: 00000000000006f0
  Call Trace:
   quarantine_reduce+0x10e/0x120 mm/kasan/quarantine.c:239
   kasan_kmalloc+0xca/0xe0 mm/kasan/kasan.c:590
   kasan_slab_alloc+0x12/0x20 mm/kasan/kasan.c:544
   slab_post_alloc_hook mm/slab.h:456 [inline]
   slab_alloc_node mm/slub.c:2718 [inline]
   kmem_cache_alloc_node+0x1d3/0x280 mm/slub.c:2754
   __alloc_skb+0x10f/0x770 net/core/skbuff.c:219
   alloc_skb include/linux/skbuff.h:932 [inline]
   _sctp_make_chunk+0x3b/0x260 net/sctp/sm_make_chunk.c:1388
   sctp_make_data net/sctp/sm_make_chunk.c:1420 [inline]
   sctp_make_datafrag_empty+0x208/0x360 net/sctp/sm_make_chunk.c:746
   sctp_datamsg_from_user+0x7e8/0x11d0 net/sctp/chunk.c:266
   sctp_sendmsg+0x2611/0x3970 net/sctp/socket.c:1962
   inet_sendmsg+0x164/0x5b0 net/ipv4/af_inet.c:761
   sock_sendmsg_nosec net/socket.c:633 [inline]
   sock_sendmsg+0xca/0x110 net/socket.c:643
   SYSC_sendto+0x660/0x810 net/socket.c:1685
   SyS_sendto+0x40/0x50 net/socket.c:1653

I am not sure about backporting.  The bug is quite hard to trigger, I've
seen it few times during our massive continuous testing (however, it
could be cause of some other episodic stray crashes as it leads to
memory corruption...).  If it is triggered, the consequences are very
bad -- almost definite bad memory corruption.  The fix is non trivial
and has chances of introducing new bugs.  I am also not sure how
actively people use KASAN on older releases.

[dvyukov@google.com: - sorted includes[
  Link: http://lkml.kernel.org/r/20170309094028.51088-1-dvyukov@google.com
Link: http://lkml.kernel.org/r/20170308151532.5070-1-dvyukov@google.com
Signed-off-by: Dmitry Vyukov <dvyukov@google.com>
Acked-by: Andrey Ryabinin <aryabinin@virtuozzo.com>
Cc: Greg Thelen <gthelen@google.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
2017-03-09 17:01:10 -08:00

329 lines
8.7 KiB
C

/*
* KASAN quarantine.
*
* Author: Alexander Potapenko <glider@google.com>
* Copyright (C) 2016 Google, Inc.
*
* Based on code by Dmitry Chernenkov.
*
* This program is free software; you can redistribute it and/or
* modify it under the terms of the GNU General Public License
* version 2 as published by the Free Software Foundation.
*
* This program is distributed in the hope that it will be useful, but
* WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
* General Public License for more details.
*
*/
#include <linux/gfp.h>
#include <linux/hash.h>
#include <linux/kernel.h>
#include <linux/mm.h>
#include <linux/percpu.h>
#include <linux/printk.h>
#include <linux/shrinker.h>
#include <linux/slab.h>
#include <linux/srcu.h>
#include <linux/string.h>
#include <linux/types.h>
#include "../slab.h"
#include "kasan.h"
/* Data structure and operations for quarantine queues. */
/*
* Each queue is a signle-linked list, which also stores the total size of
* objects inside of it.
*/
struct qlist_head {
struct qlist_node *head;
struct qlist_node *tail;
size_t bytes;
};
#define QLIST_INIT { NULL, NULL, 0 }
static bool qlist_empty(struct qlist_head *q)
{
return !q->head;
}
static void qlist_init(struct qlist_head *q)
{
q->head = q->tail = NULL;
q->bytes = 0;
}
static void qlist_put(struct qlist_head *q, struct qlist_node *qlink,
size_t size)
{
if (unlikely(qlist_empty(q)))
q->head = qlink;
else
q->tail->next = qlink;
q->tail = qlink;
qlink->next = NULL;
q->bytes += size;
}
static void qlist_move_all(struct qlist_head *from, struct qlist_head *to)
{
if (unlikely(qlist_empty(from)))
return;
if (qlist_empty(to)) {
*to = *from;
qlist_init(from);
return;
}
to->tail->next = from->head;
to->tail = from->tail;
to->bytes += from->bytes;
qlist_init(from);
}
#define QUARANTINE_PERCPU_SIZE (1 << 20)
#define QUARANTINE_BATCHES \
(1024 > 4 * CONFIG_NR_CPUS ? 1024 : 4 * CONFIG_NR_CPUS)
/*
* The object quarantine consists of per-cpu queues and a global queue,
* guarded by quarantine_lock.
*/
static DEFINE_PER_CPU(struct qlist_head, cpu_quarantine);
/* Round-robin FIFO array of batches. */
static struct qlist_head global_quarantine[QUARANTINE_BATCHES];
static int quarantine_head;
static int quarantine_tail;
/* Total size of all objects in global_quarantine across all batches. */
static unsigned long quarantine_size;
static DEFINE_SPINLOCK(quarantine_lock);
DEFINE_STATIC_SRCU(remove_cache_srcu);
/* Maximum size of the global queue. */
static unsigned long quarantine_max_size;
/*
* Target size of a batch in global_quarantine.
* Usually equal to QUARANTINE_PERCPU_SIZE unless we have too much RAM.
*/
static unsigned long quarantine_batch_size;
/*
* The fraction of physical memory the quarantine is allowed to occupy.
* Quarantine doesn't support memory shrinker with SLAB allocator, so we keep
* the ratio low to avoid OOM.
*/
#define QUARANTINE_FRACTION 32
static struct kmem_cache *qlink_to_cache(struct qlist_node *qlink)
{
return virt_to_head_page(qlink)->slab_cache;
}
static void *qlink_to_object(struct qlist_node *qlink, struct kmem_cache *cache)
{
struct kasan_free_meta *free_info =
container_of(qlink, struct kasan_free_meta,
quarantine_link);
return ((void *)free_info) - cache->kasan_info.free_meta_offset;
}
static void qlink_free(struct qlist_node *qlink, struct kmem_cache *cache)
{
void *object = qlink_to_object(qlink, cache);
unsigned long flags;
if (IS_ENABLED(CONFIG_SLAB))
local_irq_save(flags);
___cache_free(cache, object, _THIS_IP_);
if (IS_ENABLED(CONFIG_SLAB))
local_irq_restore(flags);
}
static void qlist_free_all(struct qlist_head *q, struct kmem_cache *cache)
{
struct qlist_node *qlink;
if (unlikely(qlist_empty(q)))
return;
qlink = q->head;
while (qlink) {
struct kmem_cache *obj_cache =
cache ? cache : qlink_to_cache(qlink);
struct qlist_node *next = qlink->next;
qlink_free(qlink, obj_cache);
qlink = next;
}
qlist_init(q);
}
void quarantine_put(struct kasan_free_meta *info, struct kmem_cache *cache)
{
unsigned long flags;
struct qlist_head *q;
struct qlist_head temp = QLIST_INIT;
/*
* Note: irq must be disabled until after we move the batch to the
* global quarantine. Otherwise quarantine_remove_cache() can miss
* some objects belonging to the cache if they are in our local temp
* list. quarantine_remove_cache() executes on_each_cpu() at the
* beginning which ensures that it either sees the objects in per-cpu
* lists or in the global quarantine.
*/
local_irq_save(flags);
q = this_cpu_ptr(&cpu_quarantine);
qlist_put(q, &info->quarantine_link, cache->size);
if (unlikely(q->bytes > QUARANTINE_PERCPU_SIZE)) {
qlist_move_all(q, &temp);
spin_lock(&quarantine_lock);
WRITE_ONCE(quarantine_size, quarantine_size + temp.bytes);
qlist_move_all(&temp, &global_quarantine[quarantine_tail]);
if (global_quarantine[quarantine_tail].bytes >=
READ_ONCE(quarantine_batch_size)) {
int new_tail;
new_tail = quarantine_tail + 1;
if (new_tail == QUARANTINE_BATCHES)
new_tail = 0;
if (new_tail != quarantine_head)
quarantine_tail = new_tail;
}
spin_unlock(&quarantine_lock);
}
local_irq_restore(flags);
}
void quarantine_reduce(void)
{
size_t total_size, new_quarantine_size, percpu_quarantines;
unsigned long flags;
int srcu_idx;
struct qlist_head to_free = QLIST_INIT;
if (likely(READ_ONCE(quarantine_size) <=
READ_ONCE(quarantine_max_size)))
return;
/*
* srcu critical section ensures that quarantine_remove_cache()
* will not miss objects belonging to the cache while they are in our
* local to_free list. srcu is chosen because (1) it gives us private
* grace period domain that does not interfere with anything else,
* and (2) it allows synchronize_srcu() to return without waiting
* if there are no pending read critical sections (which is the
* expected case).
*/
srcu_idx = srcu_read_lock(&remove_cache_srcu);
spin_lock_irqsave(&quarantine_lock, flags);
/*
* Update quarantine size in case of hotplug. Allocate a fraction of
* the installed memory to quarantine minus per-cpu queue limits.
*/
total_size = (READ_ONCE(totalram_pages) << PAGE_SHIFT) /
QUARANTINE_FRACTION;
percpu_quarantines = QUARANTINE_PERCPU_SIZE * num_online_cpus();
new_quarantine_size = (total_size < percpu_quarantines) ?
0 : total_size - percpu_quarantines;
WRITE_ONCE(quarantine_max_size, new_quarantine_size);
/* Aim at consuming at most 1/2 of slots in quarantine. */
WRITE_ONCE(quarantine_batch_size, max((size_t)QUARANTINE_PERCPU_SIZE,
2 * total_size / QUARANTINE_BATCHES));
if (likely(quarantine_size > quarantine_max_size)) {
qlist_move_all(&global_quarantine[quarantine_head], &to_free);
WRITE_ONCE(quarantine_size, quarantine_size - to_free.bytes);
quarantine_head++;
if (quarantine_head == QUARANTINE_BATCHES)
quarantine_head = 0;
}
spin_unlock_irqrestore(&quarantine_lock, flags);
qlist_free_all(&to_free, NULL);
srcu_read_unlock(&remove_cache_srcu, srcu_idx);
}
static void qlist_move_cache(struct qlist_head *from,
struct qlist_head *to,
struct kmem_cache *cache)
{
struct qlist_node *curr;
if (unlikely(qlist_empty(from)))
return;
curr = from->head;
qlist_init(from);
while (curr) {
struct qlist_node *next = curr->next;
struct kmem_cache *obj_cache = qlink_to_cache(curr);
if (obj_cache == cache)
qlist_put(to, curr, obj_cache->size);
else
qlist_put(from, curr, obj_cache->size);
curr = next;
}
}
static void per_cpu_remove_cache(void *arg)
{
struct kmem_cache *cache = arg;
struct qlist_head to_free = QLIST_INIT;
struct qlist_head *q;
q = this_cpu_ptr(&cpu_quarantine);
qlist_move_cache(q, &to_free, cache);
qlist_free_all(&to_free, cache);
}
/* Free all quarantined objects belonging to cache. */
void quarantine_remove_cache(struct kmem_cache *cache)
{
unsigned long flags, i;
struct qlist_head to_free = QLIST_INIT;
/*
* Must be careful to not miss any objects that are being moved from
* per-cpu list to the global quarantine in quarantine_put(),
* nor objects being freed in quarantine_reduce(). on_each_cpu()
* achieves the first goal, while synchronize_srcu() achieves the
* second.
*/
on_each_cpu(per_cpu_remove_cache, cache, 1);
spin_lock_irqsave(&quarantine_lock, flags);
for (i = 0; i < QUARANTINE_BATCHES; i++) {
if (qlist_empty(&global_quarantine[i]))
continue;
qlist_move_cache(&global_quarantine[i], &to_free, cache);
/* Scanning whole quarantine can take a while. */
spin_unlock_irqrestore(&quarantine_lock, flags);
cond_resched();
spin_lock_irqsave(&quarantine_lock, flags);
}
spin_unlock_irqrestore(&quarantine_lock, flags);
qlist_free_all(&to_free, cache);
synchronize_srcu(&remove_cache_srcu);
}