diff --git a/fs/xfs/linux-2.6/xfs_buf.c b/fs/xfs/linux-2.6/xfs_buf.c index 251bcdc6352e..749d7d39d657 100644 --- a/fs/xfs/linux-2.6/xfs_buf.c +++ b/fs/xfs/linux-2.6/xfs_buf.c @@ -188,7 +188,7 @@ _xfs_buf_initialize( atomic_set(&bp->b_hold, 1); init_completion(&bp->b_iowait); INIT_LIST_HEAD(&bp->b_list); - INIT_LIST_HEAD(&bp->b_hash_list); + RB_CLEAR_NODE(&bp->b_rbnode); init_MUTEX_LOCKED(&bp->b_sema); /* held, no waiters */ XB_SET_OWNER(bp); bp->b_target = target; @@ -262,8 +262,6 @@ xfs_buf_free( { trace_xfs_buf_free(bp, _RET_IP_); - ASSERT(list_empty(&bp->b_hash_list)); - if (bp->b_flags & (_XBF_PAGE_CACHE|_XBF_PAGES)) { uint i; @@ -422,8 +420,10 @@ _xfs_buf_find( { xfs_off_t range_base; size_t range_length; - xfs_bufhash_t *hash; - xfs_buf_t *bp, *n; + struct xfs_perag *pag; + struct rb_node **rbp; + struct rb_node *parent; + xfs_buf_t *bp; range_base = (ioff << BBSHIFT); range_length = (isize << BBSHIFT); @@ -432,14 +432,37 @@ _xfs_buf_find( ASSERT(!(range_length < (1 << btp->bt_sshift))); ASSERT(!(range_base & (xfs_off_t)btp->bt_smask)); - hash = &btp->bt_hash[hash_long((unsigned long)ioff, btp->bt_hashshift)]; + /* get tree root */ + pag = xfs_perag_get(btp->bt_mount, + xfs_daddr_to_agno(btp->bt_mount, ioff)); - spin_lock(&hash->bh_lock); + /* walk tree */ + spin_lock(&pag->pag_buf_lock); + rbp = &pag->pag_buf_tree.rb_node; + parent = NULL; + bp = NULL; + while (*rbp) { + parent = *rbp; + bp = rb_entry(parent, struct xfs_buf, b_rbnode); - list_for_each_entry_safe(bp, n, &hash->bh_list, b_hash_list) { - ASSERT(btp == bp->b_target); - if (bp->b_file_offset == range_base && - bp->b_buffer_length == range_length) { + if (range_base < bp->b_file_offset) + rbp = &(*rbp)->rb_left; + else if (range_base > bp->b_file_offset) + rbp = &(*rbp)->rb_right; + else { + /* + * found a block offset match. If the range doesn't + * match, the only way this is allowed is if the buffer + * in the cache is stale and the transaction that made + * it stale has not yet committed. i.e. we are + * reallocating a busy extent. Skip this buffer and + * continue searching to the right for an exact match. + */ + if (bp->b_buffer_length != range_length) { + ASSERT(bp->b_flags & XBF_STALE); + rbp = &(*rbp)->rb_right; + continue; + } atomic_inc(&bp->b_hold); goto found; } @@ -449,17 +472,21 @@ _xfs_buf_find( if (new_bp) { _xfs_buf_initialize(new_bp, btp, range_base, range_length, flags); - new_bp->b_hash = hash; - list_add(&new_bp->b_hash_list, &hash->bh_list); + rb_link_node(&new_bp->b_rbnode, parent, rbp); + rb_insert_color(&new_bp->b_rbnode, &pag->pag_buf_tree); + /* the buffer keeps the perag reference until it is freed */ + new_bp->b_pag = pag; + spin_unlock(&pag->pag_buf_lock); } else { XFS_STATS_INC(xb_miss_locked); + spin_unlock(&pag->pag_buf_lock); + xfs_perag_put(pag); } - - spin_unlock(&hash->bh_lock); return new_bp; found: - spin_unlock(&hash->bh_lock); + spin_unlock(&pag->pag_buf_lock); + xfs_perag_put(pag); /* Attempt to get the semaphore without sleeping, * if this does not work then we need to drop the @@ -809,27 +836,30 @@ void xfs_buf_rele( xfs_buf_t *bp) { - xfs_bufhash_t *hash = bp->b_hash; + struct xfs_perag *pag = bp->b_pag; trace_xfs_buf_rele(bp, _RET_IP_); - if (unlikely(!hash)) { + if (!pag) { ASSERT(!bp->b_relse); + ASSERT(RB_EMPTY_NODE(&bp->b_rbnode)); if (atomic_dec_and_test(&bp->b_hold)) xfs_buf_free(bp); return; } + ASSERT(!RB_EMPTY_NODE(&bp->b_rbnode)); ASSERT(atomic_read(&bp->b_hold) > 0); - if (atomic_dec_and_lock(&bp->b_hold, &hash->bh_lock)) { + if (atomic_dec_and_lock(&bp->b_hold, &pag->pag_buf_lock)) { if (bp->b_relse) { atomic_inc(&bp->b_hold); - spin_unlock(&hash->bh_lock); - (*(bp->b_relse)) (bp); + spin_unlock(&pag->pag_buf_lock); + bp->b_relse(bp); } else { ASSERT(!(bp->b_flags & (XBF_DELWRI|_XBF_DELWRI_Q))); - list_del_init(&bp->b_hash_list); - spin_unlock(&hash->bh_lock); + rb_erase(&bp->b_rbnode, &pag->pag_buf_tree); + spin_unlock(&pag->pag_buf_lock); + xfs_perag_put(pag); xfs_buf_free(bp); } } @@ -1429,56 +1459,24 @@ xfs_buf_iomove( */ void xfs_wait_buftarg( - xfs_buftarg_t *btp) + struct xfs_buftarg *btp) { - xfs_bufhash_t *hash; - uint i; + struct xfs_perag *pag; + uint i; - for (i = 0; i < (1 << btp->bt_hashshift); i++) { - hash = &btp->bt_hash[i]; - spin_lock(&hash->bh_lock); - while (!list_empty(&hash->bh_list)) { - spin_unlock(&hash->bh_lock); + for (i = 0; i < btp->bt_mount->m_sb.sb_agcount; i++) { + pag = xfs_perag_get(btp->bt_mount, i); + spin_lock(&pag->pag_buf_lock); + while (rb_first(&pag->pag_buf_tree)) { + spin_unlock(&pag->pag_buf_lock); delay(100); - spin_lock(&hash->bh_lock); + spin_lock(&pag->pag_buf_lock); } - spin_unlock(&hash->bh_lock); + spin_unlock(&pag->pag_buf_lock); + xfs_perag_put(pag); } } -/* - * Allocate buffer hash table for a given target. - * For devices containing metadata (i.e. not the log/realtime devices) - * we need to allocate a much larger hash table. - */ -STATIC void -xfs_alloc_bufhash( - xfs_buftarg_t *btp, - int external) -{ - unsigned int i; - - if (external) { - btp->bt_hash = NULL; - return; - } - btp->bt_hashshift = 12; /* 4096 buckets */ - btp->bt_hash = kmem_zalloc_large((1 << btp->bt_hashshift) * - sizeof(xfs_bufhash_t)); - for (i = 0; i < (1 << btp->bt_hashshift); i++) { - spin_lock_init(&btp->bt_hash[i].bh_lock); - INIT_LIST_HEAD(&btp->bt_hash[i].bh_list); - } -} - -STATIC void -xfs_free_bufhash( - xfs_buftarg_t *btp) -{ - kmem_free_large(btp->bt_hash); - btp->bt_hash = NULL; -} - /* * buftarg list for delwrite queue processing */ @@ -1511,7 +1509,6 @@ xfs_free_buftarg( xfs_flush_buftarg(btp, 1); if (mp->m_flags & XFS_MOUNT_BARRIER) xfs_blkdev_issue_flush(btp); - xfs_free_bufhash(btp); iput(btp->bt_mapping->host); /* Unregister the buftarg first so that we don't get a @@ -1651,7 +1648,6 @@ xfs_alloc_buftarg( goto error; if (xfs_alloc_delwrite_queue(btp, fsname)) goto error; - xfs_alloc_bufhash(btp, external); return btp; error: diff --git a/fs/xfs/linux-2.6/xfs_buf.h b/fs/xfs/linux-2.6/xfs_buf.h index 1f109cee136c..c79882ecb513 100644 --- a/fs/xfs/linux-2.6/xfs_buf.h +++ b/fs/xfs/linux-2.6/xfs_buf.h @@ -135,10 +135,6 @@ typedef struct xfs_buftarg { unsigned int bt_sshift; size_t bt_smask; - /* per device buffer hash table */ - uint bt_hashshift; - xfs_bufhash_t *bt_hash; - /* per device delwri queue */ struct task_struct *bt_task; struct list_head bt_list; @@ -172,8 +168,8 @@ typedef struct xfs_buf { wait_queue_head_t b_waiters; /* unpin waiters */ struct list_head b_list; xfs_buf_flags_t b_flags; /* status flags */ - struct list_head b_hash_list; /* hash table list */ - xfs_bufhash_t *b_hash; /* hash table list start */ + struct rb_node b_rbnode; /* rbtree node */ + struct xfs_perag *b_pag; /* contains rbtree root */ xfs_buftarg_t *b_target; /* buffer target (device) */ atomic_t b_hold; /* reference count */ xfs_daddr_t b_bn; /* block number for I/O */ diff --git a/fs/xfs/xfs_ag.h b/fs/xfs/xfs_ag.h index baeec83d01f9..63c7a1a6c022 100644 --- a/fs/xfs/xfs_ag.h +++ b/fs/xfs/xfs_ag.h @@ -233,6 +233,10 @@ typedef struct xfs_perag { struct mutex pag_ici_reclaim_lock; /* serialisation point */ unsigned long pag_ici_reclaim_cursor; /* reclaim restart point */ + /* buffer cache index */ + spinlock_t pag_buf_lock; /* lock for pag_buf_tree */ + struct rb_root pag_buf_tree; /* ordered tree of active buffers */ + /* for rcu-safe freeing */ struct rcu_head rcu_head; #endif diff --git a/fs/xfs/xfs_mount.c b/fs/xfs/xfs_mount.c index 59859c343e04..cfa2fb4e7f97 100644 --- a/fs/xfs/xfs_mount.c +++ b/fs/xfs/xfs_mount.c @@ -479,6 +479,8 @@ xfs_initialize_perag( rwlock_init(&pag->pag_ici_lock); mutex_init(&pag->pag_ici_reclaim_lock); INIT_RADIX_TREE(&pag->pag_ici_root, GFP_ATOMIC); + spin_lock_init(&pag->pag_buf_lock); + pag->pag_buf_tree = RB_ROOT; if (radix_tree_preload(GFP_NOFS)) goto out_unwind;