From 5ece6c52ea52f9e94298e950a837ccff415c7687 Mon Sep 17 00:00:00 2001 From: Adrian Bunk Date: Mon, 18 Feb 2008 13:45:51 +0100 Subject: [PATCH 01/15] make blk-core.c:request_cachep static again request_cachep needlessly became global. Signed-off-by: Adrian Bunk Signed-off-by: Jens Axboe --- block/blk-core.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/block/blk-core.c b/block/blk-core.c index e9754dc98ec4..c013ca22eb67 100644 --- a/block/blk-core.c +++ b/block/blk-core.c @@ -38,7 +38,7 @@ static int __make_request(struct request_queue *q, struct bio *bio); /* * For the allocated request tables */ -struct kmem_cache *request_cachep; +static struct kmem_cache *request_cachep; /* * For queue allocation From 13341598263011e079386b22ea35e482f97714c0 Mon Sep 17 00:00:00 2001 From: Adrian Bunk Date: Mon, 18 Feb 2008 13:45:53 +0100 Subject: [PATCH 02/15] make blk_ioc_init() static blk_ioc_init() can become static. Signed-off-by: Adrian Bunk Signed-off-by: Jens Axboe --- block/blk-ioc.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/block/blk-ioc.c b/block/blk-ioc.c index 80245dc30c75..4ae0929c6e38 100644 --- a/block/blk-ioc.c +++ b/block/blk-ioc.c @@ -176,7 +176,7 @@ void copy_io_context(struct io_context **pdst, struct io_context **psrc) } EXPORT_SYMBOL(copy_io_context); -int __init blk_ioc_init(void) +static int __init blk_ioc_init(void) { iocontext_cachep = kmem_cache_create("blkdev_ioc", sizeof(struct io_context), 0, SLAB_PANIC, NULL); From 52ff4cae65b45dcdfa23de09619754d6f380f31e Mon Sep 17 00:00:00 2001 From: Adrian Bunk Date: Mon, 18 Feb 2008 13:45:55 +0100 Subject: [PATCH 03/15] make blk_settings_init() static blk_settings_init() can become static. Signed-off-by: Adrian Bunk Signed-off-by: Jens Axboe --- block/blk-settings.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/block/blk-settings.c b/block/blk-settings.c index c8d0c5724098..13536a388d27 100644 --- a/block/blk-settings.c +++ b/block/blk-settings.c @@ -386,7 +386,7 @@ void blk_queue_update_dma_alignment(struct request_queue *q, int mask) } EXPORT_SYMBOL(blk_queue_update_dma_alignment); -int __init blk_settings_init(void) +static int __init blk_settings_init(void) { blk_max_low_pfn = max_low_pfn - 1; blk_max_pfn = max_pfn - 1; From 4c54ac62dceecedd82d4a865017bba0b738e2897 Mon Sep 17 00:00:00 2001 From: Adrian Bunk Date: Mon, 18 Feb 2008 13:48:31 +0100 Subject: [PATCH 04/15] make struct def_blk_aops static This patch makes the needlessly global struct def_blk_aops static. 
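For illustration, a minimal user-space sketch of the pattern (a reduced stand-in, not the kernel code): because def_blk_aops is referenced above its initializer in fs/block_dev.c, giving it internal linkage requires the file-scope forward declaration that the patch adds near the top of the file. The stub type and blkdev_readpage_stub() below are invented for the example.

    #include <stdio.h>

    struct aops { int (*readpage)(int); };

    /* forward declaration, mirroring the one added before BDEV_I() */
    static const struct aops def_blk_aops;

    /* hypothetical stand-in for blkdev_readpage() */
    static int blkdev_readpage_stub(int page)
    {
            return page;
    }

    /* the definition, now with internal linkage */
    static const struct aops def_blk_aops = {
            .readpage = blkdev_readpage_stub,
    };

    int main(void)
    {
            printf("%d\n", def_blk_aops.readpage(42));
            return 0;
    }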
Signed-off-by: Adrian Bunk Signed-off-by: Jens Axboe --- fs/block_dev.c | 4 +++- include/linux/fs.h | 1 - 2 files changed, 3 insertions(+), 2 deletions(-) diff --git a/fs/block_dev.c b/fs/block_dev.c index 67fe72ce6ac7..8335f0e1b0fb 100644 --- a/fs/block_dev.c +++ b/fs/block_dev.c @@ -31,6 +31,8 @@ struct bdev_inode { struct inode vfs_inode; }; +static const struct address_space_operations def_blk_aops; + static inline struct bdev_inode *BDEV_I(struct inode *inode) { return container_of(inode, struct bdev_inode, vfs_inode); @@ -1334,7 +1336,7 @@ static long block_ioctl(struct file *file, unsigned cmd, unsigned long arg) return blkdev_ioctl(file->f_mapping->host, file, cmd, arg); } -const struct address_space_operations def_blk_aops = { +static const struct address_space_operations def_blk_aops = { .readpage = blkdev_readpage, .writepage = blkdev_writepage, .sync_page = block_sync_page, diff --git a/include/linux/fs.h b/include/linux/fs.h index 98ffb6ead434..b84b848431f2 100644 --- a/include/linux/fs.h +++ b/include/linux/fs.h @@ -1590,7 +1590,6 @@ extern void bd_set_size(struct block_device *, loff_t size); extern void bd_forget(struct inode *inode); extern void bdput(struct block_device *); extern struct block_device *open_by_devnum(dev_t, unsigned); -extern const struct address_space_operations def_blk_aops; #else static inline void bd_forget(struct inode *inode) {} #endif From 86b6c7a7f78feca58d2d8615e53aee4d59ab9dc6 Mon Sep 17 00:00:00 2001 From: Adrian Bunk Date: Mon, 18 Feb 2008 13:48:32 +0100 Subject: [PATCH 05/15] fs/block_dev.c: remove #if 0'ed code Commit b2e895dbd80c420bfc0937c3729b4afe073b3848 #if 0'ed this code stating: <-- snip --> [PATCH] revert blockdev direct io back to 2.6.19 version Andrew Vasquez is reporting as-iosched oopses and a 65% throughput slowdown due to the recent special-casing of direct-io against blockdevs. We don't know why either of these things are occurring. The patch minimally reverts us back to the 2.6.19 code for a 2.6.20 release. <-- snip --> It has since been dead code, and unless someone wants to revive it now it's time to remove it. This patch also makes bio_release_pages() static again and removes the ki_bio_count member from struct kiocb, reverting changes that had been done for this dead code. 
Signed-off-by: Adrian Bunk Signed-off-by: Jens Axboe --- fs/bio.c | 2 +- fs/block_dev.c | 197 -------------------------------------------- include/linux/aio.h | 1 - include/linux/bio.h | 1 - 4 files changed, 1 insertion(+), 200 deletions(-) diff --git a/fs/bio.c b/fs/bio.c index 242e409dab4b..3312fcc3c098 100644 --- a/fs/bio.c +++ b/fs/bio.c @@ -903,7 +903,7 @@ void bio_set_pages_dirty(struct bio *bio) } } -void bio_release_pages(struct bio *bio) +static void bio_release_pages(struct bio *bio) { struct bio_vec *bvec = bio->bi_io_vec; int i; diff --git a/fs/block_dev.c b/fs/block_dev.c index 8335f0e1b0fb..7d822fae7765 100644 --- a/fs/block_dev.c +++ b/fs/block_dev.c @@ -173,203 +173,6 @@ blkdev_direct_IO(int rw, struct kiocb *iocb, const struct iovec *iov, iov, offset, nr_segs, blkdev_get_blocks, NULL); } -#if 0 -static void blk_end_aio(struct bio *bio, int error) -{ - struct kiocb *iocb = bio->bi_private; - atomic_t *bio_count = &iocb->ki_bio_count; - - if (bio_data_dir(bio) == READ) - bio_check_pages_dirty(bio); - else { - bio_release_pages(bio); - bio_put(bio); - } - - /* iocb->ki_nbytes stores error code from LLDD */ - if (error) - iocb->ki_nbytes = -EIO; - - if (atomic_dec_and_test(bio_count)) { - if ((long)iocb->ki_nbytes < 0) - aio_complete(iocb, iocb->ki_nbytes, 0); - else - aio_complete(iocb, iocb->ki_left, 0); - } - - return 0; -} - -#define VEC_SIZE 16 -struct pvec { - unsigned short nr; - unsigned short idx; - struct page *page[VEC_SIZE]; -}; - -#define PAGES_SPANNED(addr, len) \ - (DIV_ROUND_UP((addr) + (len), PAGE_SIZE) - (addr) / PAGE_SIZE); - -/* - * get page pointer for user addr, we internally cache struct page array for - * (addr, count) range in pvec to avoid frequent call to get_user_pages. If - * internal page list is exhausted, a batch count of up to VEC_SIZE is used - * to get next set of page struct. 
- */ -static struct page *blk_get_page(unsigned long addr, size_t count, int rw, - struct pvec *pvec) -{ - int ret, nr_pages; - if (pvec->idx == pvec->nr) { - nr_pages = PAGES_SPANNED(addr, count); - nr_pages = min(nr_pages, VEC_SIZE); - down_read(&current->mm->mmap_sem); - ret = get_user_pages(current, current->mm, addr, nr_pages, - rw == READ, 0, pvec->page, NULL); - up_read(&current->mm->mmap_sem); - if (ret < 0) - return ERR_PTR(ret); - pvec->nr = ret; - pvec->idx = 0; - } - return pvec->page[pvec->idx++]; -} - -/* return a page back to pvec array */ -static void blk_unget_page(struct page *page, struct pvec *pvec) -{ - pvec->page[--pvec->idx] = page; -} - -static ssize_t -blkdev_direct_IO(int rw, struct kiocb *iocb, const struct iovec *iov, - loff_t pos, unsigned long nr_segs) -{ - struct inode *inode = iocb->ki_filp->f_mapping->host; - unsigned blkbits = blksize_bits(bdev_hardsect_size(I_BDEV(inode))); - unsigned blocksize_mask = (1 << blkbits) - 1; - unsigned long seg = 0; /* iov segment iterator */ - unsigned long nvec; /* number of bio vec needed */ - unsigned long cur_off; /* offset into current page */ - unsigned long cur_len; /* I/O len of current page, up to PAGE_SIZE */ - - unsigned long addr; /* user iovec address */ - size_t count; /* user iovec len */ - size_t nbytes = iocb->ki_nbytes = iocb->ki_left; /* total xfer size */ - loff_t size; /* size of block device */ - struct bio *bio; - atomic_t *bio_count = &iocb->ki_bio_count; - struct page *page; - struct pvec pvec; - - pvec.nr = 0; - pvec.idx = 0; - - if (pos & blocksize_mask) - return -EINVAL; - - size = i_size_read(inode); - if (pos + nbytes > size) { - nbytes = size - pos; - iocb->ki_left = nbytes; - } - - /* - * check first non-zero iov alignment, the remaining - * iov alignment is checked inside bio loop below.
- */ - do { - addr = (unsigned long) iov[seg].iov_base; - count = min(iov[seg].iov_len, nbytes); - if (addr & blocksize_mask || count & blocksize_mask) - return -EINVAL; - } while (!count && ++seg < nr_segs); - atomic_set(bio_count, 1); - - while (nbytes) { - /* roughly estimate number of bio vec needed */ - nvec = (nbytes + PAGE_SIZE - 1) / PAGE_SIZE; - nvec = max(nvec, nr_segs - seg); - nvec = min(nvec, (unsigned long) BIO_MAX_PAGES); - - /* bio_alloc should not fail with GFP_KERNEL flag */ - bio = bio_alloc(GFP_KERNEL, nvec); - bio->bi_bdev = I_BDEV(inode); - bio->bi_end_io = blk_end_aio; - bio->bi_private = iocb; - bio->bi_sector = pos >> blkbits; -same_bio: - cur_off = addr & ~PAGE_MASK; - cur_len = PAGE_SIZE - cur_off; - if (count < cur_len) - cur_len = count; - - page = blk_get_page(addr, count, rw, &pvec); - if (unlikely(IS_ERR(page))) - goto backout; - - if (bio_add_page(bio, page, cur_len, cur_off)) { - pos += cur_len; - addr += cur_len; - count -= cur_len; - nbytes -= cur_len; - - if (count) - goto same_bio; - while (++seg < nr_segs) { - addr = (unsigned long) iov[seg].iov_base; - count = iov[seg].iov_len; - if (!count) - continue; - if (unlikely(addr & blocksize_mask || - count & blocksize_mask)) { - page = ERR_PTR(-EINVAL); - goto backout; - } - count = min(count, nbytes); - goto same_bio; - } - } else { - blk_unget_page(page, &pvec); - } - - /* bio is ready, submit it */ - if (rw == READ) - bio_set_pages_dirty(bio); - atomic_inc(bio_count); - submit_bio(rw, bio); - } - -completion: - iocb->ki_left -= nbytes; - nbytes = iocb->ki_left; - iocb->ki_pos += nbytes; - - blk_run_address_space(inode->i_mapping); - if (atomic_dec_and_test(bio_count)) - aio_complete(iocb, nbytes, 0); - - return -EIOCBQUEUED; - -backout: - /* - * back out nbytes count constructed so far for this bio, - * we will throw away current bio. - */ - nbytes += bio->bi_size; - bio_release_pages(bio); - bio_put(bio); - - /* - * if no bio was submmitted, return the error code. - * otherwise, proceed with pending I/O completion. - */ - if (atomic_read(bio_count) == 1) - return PTR_ERR(page); - goto completion; -} -#endif - static int blkdev_writepage(struct page *page, struct writeback_control *wbc) { return block_write_full_page(page, blkdev_get_block, wbc); diff --git a/include/linux/aio.h b/include/linux/aio.h index a9931e2e5624..0d0b7f629bd3 100644 --- a/include/linux/aio.h +++ b/include/linux/aio.h @@ -105,7 +105,6 @@ struct kiocb { wait_queue_t ki_wait; loff_t ki_pos; - atomic_t ki_bio_count; /* num bio used for this iocb */ void *private; /* State that we remember to be able to restart/retry */ unsigned short ki_opcode; diff --git a/include/linux/bio.h b/include/linux/bio.h index 4da441337d6e..4c59bdccd3ee 100644 --- a/include/linux/bio.h +++ b/include/linux/bio.h @@ -326,7 +326,6 @@ extern struct bio *bio_map_kern(struct request_queue *, void *, unsigned int, gfp_t); extern void bio_set_pages_dirty(struct bio *bio); extern void bio_check_pages_dirty(struct bio *bio); -extern void bio_release_pages(struct bio *bio); extern struct bio *bio_copy_user(struct request_queue *, unsigned long, unsigned int, int); extern int bio_uncopy_user(struct bio *); void zero_fill_bio(struct bio *bio); From 84e9e03c55c2456799ab19f1d577e72f721fdd39 Mon Sep 17 00:00:00 2001 From: Jens Axboe Date: Mon, 18 Feb 2008 13:51:56 +0100 Subject: [PATCH 06/15] block: make blk_rq_map_user() clear ->bio if it unmaps it That way the interface is symmetric, and calling blk_rq_unmap_user() on the request won't oops.
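As a hedged illustration of why the fix matters (a user-space mock, not the block layer itself), the failure path must clear the request's bio pointer so that a caller's unconditional cleanup cannot dereference a stale pointer; all names below are invented:

    #include <stddef.h>
    #include <stdio.h>

    struct bio_mock { int dummy; };
    struct request_mock { struct bio_mock *bio; };

    static void unmap_mock(struct bio_mock *bio) { (void)bio; }

    /* mirrors blk_rq_map_user()'s unmap_rq error path after the patch */
    static int map_mock(struct request_mock *rq, int fail)
    {
            static struct bio_mock b;

            rq->bio = &b;
            if (fail) {
                    unmap_mock(rq->bio);
                    rq->bio = NULL; /* the one-line fix: no stale ->bio */
                    return -1;
            }
            return 0;
    }

    int main(void)
    {
            struct request_mock rq = { NULL };

            if (map_mock(&rq, 1) < 0 && rq.bio == NULL)
                    printf("safe: no dangling bio after failed map\n");
            return 0;
    }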
Signed-off-by: Jens Axboe --- block/blk-map.c | 1 + 1 file changed, 1 insertion(+) diff --git a/block/blk-map.c b/block/blk-map.c index 955d75c1a58f..bc5ce60691c3 100644 --- a/block/blk-map.c +++ b/block/blk-map.c @@ -143,6 +143,7 @@ int blk_rq_map_user(struct request_queue *q, struct request *rq, return 0; unmap_rq: blk_rq_unmap_user(bio); + rq->bio = NULL; return ret; } EXPORT_SYMBOL(blk_rq_map_user); From ffc4e7595734cf768fa60cea8a4d545dfef8231a Mon Sep 17 00:00:00 2001 From: Jens Axboe Date: Tue, 19 Feb 2008 10:02:29 +0100 Subject: [PATCH 07/15] cfq-iosched: add hlist for browsing parallel to the radix tree It's cumbersome to browse a radix tree from start to finish, especially since we modify keys when a process exits. So add a hlist for the single purpose of browsing over all known cfq_io_contexts, used for exit, io prio change, etc. This fixes http://bugzilla.kernel.org/show_bug.cgi?id=9948 Signed-off-by: Jens Axboe --- block/blk-ioc.c | 35 +++++++++++++++-------------------- block/cfq-iosched.c | 38 ++++++++++++-------------------------- include/linux/iocontext.h | 2 ++ 3 files changed, 29 insertions(+), 46 deletions(-) diff --git a/block/blk-ioc.c b/block/blk-ioc.c index 4ae0929c6e38..e34df7c9fc36 100644 --- a/block/blk-ioc.c +++ b/block/blk-ioc.c @@ -17,17 +17,13 @@ static struct kmem_cache *iocontext_cachep; static void cfq_dtor(struct io_context *ioc) { - struct cfq_io_context *cic[1]; - int r; + if (!hlist_empty(&ioc->cic_list)) { + struct cfq_io_context *cic; - /* - * We don't have a specific key to lookup with, so use the gang - * lookup to just retrieve the first item stored. The cfq exit - * function will iterate the full tree, so any member will do. - */ - r = radix_tree_gang_lookup(&ioc->radix_root, (void **) cic, 0, 1); - if (r > 0) - cic[0]->dtor(ioc); + cic = list_entry(ioc->cic_list.first, struct cfq_io_context, + cic_list); + cic->dtor(ioc); + } } /* @@ -57,18 +53,16 @@ EXPORT_SYMBOL(put_io_context); static void cfq_exit(struct io_context *ioc) { - struct cfq_io_context *cic[1]; - int r; - rcu_read_lock(); - /* - * See comment for cfq_dtor() - */ - r = radix_tree_gang_lookup(&ioc->radix_root, (void **) cic, 0, 1); - rcu_read_unlock(); - if (r > 0) - cic[0]->exit(ioc); + if (!hlist_empty(&ioc->cic_list)) { + struct cfq_io_context *cic; + + cic = list_entry(ioc->cic_list.first, struct cfq_io_context, + cic_list); + cic->exit(ioc); + } + rcu_read_unlock(); } /* Called by the exitting task */ @@ -105,6 +99,7 @@ struct io_context *alloc_io_context(gfp_t gfp_flags, int node) ret->nr_batch_requests = 0; /* because this is 0 */ ret->aic = NULL; INIT_RADIX_TREE(&ret->radix_root, GFP_ATOMIC | __GFP_HIGH); + INIT_HLIST_HEAD(&ret->cic_list); ret->ioc_data = NULL; } diff --git a/block/cfq-iosched.c b/block/cfq-iosched.c index ca198e61fa65..0f962ecae91f 100644 --- a/block/cfq-iosched.c +++ b/block/cfq-iosched.c @@ -1145,38 +1145,19 @@ static void cfq_put_queue(struct cfq_queue *cfqq) /* * Call func for each cic attached to this ioc. Returns number of cic's seen. */ -#define CIC_GANG_NR 16 static unsigned int call_for_each_cic(struct io_context *ioc, void (*func)(struct io_context *, struct cfq_io_context *)) { - struct cfq_io_context *cics[CIC_GANG_NR]; - unsigned long index = 0; - unsigned int called = 0; - int nr; + struct cfq_io_context *cic; + struct hlist_node *n; + int called = 0; rcu_read_lock(); - - do { - int i; - - /* - * Perhaps there's a better way - this just gang lookups from - * 0 to the end, restarting after each CIC_GANG_NR from the - * last key + 1. 
- */ - nr = radix_tree_gang_lookup(&ioc->radix_root, (void **) cics, - index, CIC_GANG_NR); - if (!nr) - break; - - called += nr; - index = 1 + (unsigned long) cics[nr - 1]->key; - - for (i = 0; i < nr; i++) - func(ioc, cics[i]); - } while (nr == CIC_GANG_NR); - + hlist_for_each_entry_rcu(cic, n, &ioc->cic_list, cic_list) { + func(ioc, cic); + called++; + } rcu_read_unlock(); return called; @@ -1190,6 +1171,7 @@ static void cic_free_func(struct io_context *ioc, struct cfq_io_context *cic) spin_lock_irqsave(&ioc->lock, flags); radix_tree_delete(&ioc->radix_root, cic->dead_key); + hlist_del_rcu(&cic->cic_list); spin_unlock_irqrestore(&ioc->lock, flags); kmem_cache_free(cfq_ioc_pool, cic); @@ -1280,6 +1262,7 @@ cfq_alloc_io_context(struct cfq_data *cfqd, gfp_t gfp_mask) if (cic) { cic->last_end_request = jiffies; INIT_LIST_HEAD(&cic->queue_list); + INIT_HLIST_NODE(&cic->cic_list); cic->dtor = cfq_free_io_context; cic->exit = cfq_exit_io_context; elv_ioc_count_inc(ioc_count); @@ -1501,6 +1484,7 @@ cfq_drop_dead_cic(struct cfq_data *cfqd, struct io_context *ioc, rcu_assign_pointer(ioc->ioc_data, NULL); radix_tree_delete(&ioc->radix_root, (unsigned long) cfqd); + hlist_del_rcu(&cic->cic_list); spin_unlock_irqrestore(&ioc->lock, flags); cfq_cic_free(cic); @@ -1561,6 +1545,8 @@ static int cfq_cic_link(struct cfq_data *cfqd, struct io_context *ioc, spin_lock_irqsave(&ioc->lock, flags); ret = radix_tree_insert(&ioc->radix_root, (unsigned long) cfqd, cic); + if (!ret) + hlist_add_head_rcu(&cic->cic_list, &ioc->cic_list); spin_unlock_irqrestore(&ioc->lock, flags); radix_tree_preload_end(); diff --git a/include/linux/iocontext.h b/include/linux/iocontext.h index 593b222d9dcc..1b4ccf25b4d2 100644 --- a/include/linux/iocontext.h +++ b/include/linux/iocontext.h @@ -50,6 +50,7 @@ struct cfq_io_context { sector_t seek_mean; struct list_head queue_list; + struct hlist_node cic_list; void (*dtor)(struct io_context *); /* destructor */ void (*exit)(struct io_context *); /* called on task exit */ @@ -77,6 +78,7 @@ struct io_context { struct as_io_context *aic; struct radix_tree_root radix_root; + struct hlist_head cic_list; void *ioc_data; }; From e164094964e6e20fe7fce418e06a9dce952bb7a4 Mon Sep 17 00:00:00 2001 From: Jens Axboe Date: Tue, 19 Feb 2008 10:20:37 +0100 Subject: [PATCH 08/15] elevator: make elevator_get() attempt to load the appropriate module Currently we fail if someone requests a valid io scheduler, but it's modular and not currently loaded. That can happen from a driver init asking for a different scheduler, or online switching through sysfs as requested by a user. This patch makes elevator_get() call request_module() to attempt to load the appropriate module, instead of requiring that this be done manually.
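The subtle part is mapping a scheduler name to its module name; below is a small user-space sketch of just that mapping (request_module() and the locking shown in the diff are omitted), with "anticipatory" being the one scheduler whose module does not follow the "<name>-iosched" convention:

    #include <stdio.h>
    #include <string.h>

    #define ELV_NAME_MAX 16

    static void elv_module_name(const char *name, char *buf, size_t len)
    {
            if (!strcmp(name, "anticipatory"))
                    snprintf(buf, len, "as-iosched");
            else
                    snprintf(buf, len, "%s-iosched", name);
    }

    int main(void)
    {
            char elv[ELV_NAME_MAX + sizeof("-iosched")];

            elv_module_name("cfq", elv, sizeof(elv));
            printf("%s\n", elv); /* cfq-iosched */
            elv_module_name("anticipatory", elv, sizeof(elv));
            printf("%s\n", elv); /* as-iosched */
            return 0;
    }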
Signed-off-by: Jens Axboe --- block/elevator.c | 15 +++++++++++++++ 1 file changed, 15 insertions(+) diff --git a/block/elevator.c b/block/elevator.c index bafbae0344d3..88318c383608 100644 --- a/block/elevator.c +++ b/block/elevator.c @@ -134,6 +134,21 @@ static struct elevator_type *elevator_get(const char *name) spin_lock(&elv_list_lock); e = elevator_find(name); + if (!e) { + char elv[ELV_NAME_MAX + strlen("-iosched")]; + + spin_unlock(&elv_list_lock); + + if (!strcmp(name, "anticipatory")) + sprintf(elv, "as-iosched"); + else + sprintf(elv, "%s-iosched", name); + + request_module(elv); + spin_lock(&elv_list_lock); + e = elevator_find(name); + } + if (e && !try_module_get(e->elevator_owner)) e = NULL; From 56c819df77f96c3fc0c2a979e12b478403728790 Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Tue, 19 Feb 2008 11:35:37 +0100 Subject: [PATCH 09/15] libata: update ATAPI overflow draining For misc ATAPI commands which transfer variable length data to the host, overflow can occur due to application or hardware bug. Such overflows can be ignored safely as long as overflow data is properly drained. libata HSM implementation has this implemented in __atapi_pio_bytes() and recently updated for 2.6.24-rc but it requires further improvements. Improve drain logic such that... * Report overflow errors using ehi desc mechanism instead of printing directly. * Properly calculate the number of bytes to be drained considering actual number of consumed bytes for partial draining. Signed-off-by: Tejun Heo Acked-by: Albert Lee Signed-off-by: Jens Axboe --- drivers/ata/libata-core.c | 76 +++++++++++++++++---------------------- 1 file changed, 33 insertions(+), 43 deletions(-) diff --git a/drivers/ata/libata-core.c b/drivers/ata/libata-core.c index f46eb6f6dc9f..a109ccbda9ca 100644 --- a/drivers/ata/libata-core.c +++ b/drivers/ata/libata-core.c @@ -4675,24 +4675,9 @@ int ata_check_atapi_dma(struct ata_queued_cmd *qc) */ static int atapi_qc_may_overflow(struct ata_queued_cmd *qc) { - if (qc->tf.protocol != ATAPI_PROT_PIO && - qc->tf.protocol != ATAPI_PROT_DMA) - return 0; - - if (qc->tf.flags & ATA_TFLAG_WRITE) - return 0; - - switch (qc->cdb[0]) { - case READ_10: - case READ_12: - case WRITE_10: - case WRITE_12: - case GPCMD_READ_CD: - case GPCMD_READ_CD_MSF: - return 0; - } - - return 1; + return ata_is_atapi(qc->tf.protocol) && ata_is_data(qc->tf.protocol) && + atapi_cmd_type(qc->cdb[0]) == ATAPI_MISC && + !(qc->tf.flags & ATA_TFLAG_WRITE); } /** @@ -5146,13 +5131,14 @@ static void atapi_send_cdb(struct ata_port *ap, struct ata_queued_cmd *qc) */ static int __atapi_pio_bytes(struct ata_queued_cmd *qc, unsigned int bytes) { - int do_write = (qc->tf.flags & ATA_TFLAG_WRITE); + int rw = (qc->tf.flags & ATA_TFLAG_WRITE) ? 
WRITE : READ; struct ata_port *ap = qc->ap; - struct ata_eh_info *ehi = &qc->dev->link->eh_info; + struct ata_device *dev = qc->dev; + struct ata_eh_info *ehi = &dev->link->eh_info; struct scatterlist *sg; struct page *page; unsigned char *buf; - unsigned int offset, count; + unsigned int offset, count, consumed; next_sg: sg = qc->cursg; @@ -5165,26 +5151,27 @@ next_sg: * - for write case, padding zero data to the device */ u16 pad_buf[1] = { 0 }; - unsigned int i; - if (bytes > qc->curbytes - qc->nbytes + ATAPI_MAX_DRAIN) { + if (qc->curbytes + bytes > qc->nbytes + ATAPI_MAX_DRAIN) { ata_ehi_push_desc(ehi, "too much trailing data " "buf=%u cur=%u bytes=%u", qc->nbytes, qc->curbytes, bytes); return -1; } - /* overflow is exptected for misc ATAPI commands */ - if (bytes && !atapi_qc_may_overflow(qc)) - ata_dev_printk(qc->dev, KERN_WARNING, "ATAPI %u bytes " - "trailing data (cdb=%02x nbytes=%u)\n", - bytes, qc->cdb[0], qc->nbytes); + /* allow overflow only for misc ATAPI commands */ + if (!atapi_qc_may_overflow(qc)) { + ata_ehi_push_desc(ehi, "unexpected trailing data " + "%u bytes", bytes); + return -1; + } - for (i = 0; i < (bytes + 1) / 2; i++) - ap->ops->data_xfer(qc->dev, (unsigned char *)pad_buf, 2, do_write); + consumed = 0; + while (consumed < bytes) + consumed += ap->ops->data_xfer(dev, + (unsigned char *)pad_buf, 2, rw); qc->curbytes += bytes; - return 0; } @@ -5211,18 +5198,16 @@ next_sg: buf = kmap_atomic(page, KM_IRQ0); /* do the actual data transfer */ - ap->ops->data_xfer(qc->dev, buf + offset, count, do_write); + consumed = ap->ops->data_xfer(dev, buf + offset, count, rw); kunmap_atomic(buf, KM_IRQ0); local_irq_restore(flags); } else { buf = page_address(page); - ap->ops->data_xfer(qc->dev, buf + offset, count, do_write); + consumed = ap->ops->data_xfer(dev, buf + offset, count, rw); } - bytes -= count; - if ((count & 1) && bytes) - bytes--; + bytes -= min(bytes, consumed); qc->curbytes += count; qc->cursg_ofs += count; @@ -5231,9 +5216,11 @@ next_sg: qc->cursg_ofs = 0; } + /* consumed can be larger than count only for the last transfer */ + WARN_ON(qc->cursg && count != consumed); + if (bytes) goto next_sg; - return 0; } @@ -5251,6 +5238,7 @@ static void atapi_pio_bytes(struct ata_queued_cmd *qc) { struct ata_port *ap = qc->ap; struct ata_device *dev = qc->dev; + struct ata_eh_info *ehi = &dev->link->eh_info; unsigned int ireason, bc_lo, bc_hi, bytes; int i_write, do_write = (qc->tf.flags & ATA_TFLAG_WRITE) ? 1 : 0; @@ -5268,26 +5256,28 @@ static void atapi_pio_bytes(struct ata_queued_cmd *qc) /* shall be cleared to zero, indicating xfer of data */ if (unlikely(ireason & (1 << 0))) - goto err_out; + goto atapi_check; /* make sure transfer direction matches expected */ i_write = ((ireason & (1 << 1)) == 0) ? 
1 : 0; if (unlikely(do_write != i_write)) - goto err_out; + goto atapi_check; if (unlikely(!bytes)) - goto err_out; + goto atapi_check; VPRINTK("ata%u: xfering %d bytes\n", ap->print_id, bytes); - if (__atapi_pio_bytes(qc, bytes)) + if (unlikely(__atapi_pio_bytes(qc, bytes))) goto err_out; ata_altstatus(ap); /* flush */ return; -err_out: - ata_dev_printk(dev, KERN_INFO, "ATAPI check failed\n"); + atapi_check: + ata_ehi_push_desc(ehi, "ATAPI check failed (ireason=0x%x bytes=%u)", + ireason, bytes); + err_out: qc->err_mask |= AC_ERR_HSM; ap->hsm_task_state = HSM_ST_ERR; } From 40b01b9bbdf51ae543a04744283bf2d56c4a6afa Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Tue, 19 Feb 2008 11:35:38 +0100 Subject: [PATCH 10/15] block: update bio according to DMA alignment padding DMA start address and transfer size alignment for PC requests are achieved using bio_copy_user() instead of bio_map_user(). This works because bio_copy_user() always uses full pages and block DMA alignment isn't allowed to go over PAGE_SIZE. However, the implementation didn't update the last bio of the request to make this padding visible to lower layers. This patch makes blk_rq_map_user() extend the last bio such that it includes the padding area and the size of the area pointed to by the request is properly aligned. Signed-off-by: Tejun Heo Cc: James Bottomley Signed-off-by: Jens Axboe --- block/blk-map.c | 17 +++++++++++++++ 1 file changed, 17 insertions(+) diff --git a/block/blk-map.c b/block/blk-map.c index bc5ce60691c3..a7cf63ccb5cf 100644 --- a/block/blk-map.c +++ b/block/blk-map.c @@ -139,6 +139,23 @@ int blk_rq_map_user(struct request_queue *q, struct request *rq, ubuf += ret; } + /* + * __blk_rq_map_user() copies the buffers if starting address + * or length isn't aligned. As the copied buffer is always + * page aligned, we know that there's enough room for padding. + * Extend the last bio and update rq->data_len accordingly. + * + * On unmap, bio_uncopy_user() will use unmodified + * bio_map_data pointed to by bio->bi_private. + */ + if (len & queue_dma_alignment(q)) { + unsigned int pad_len = (queue_dma_alignment(q) & ~len) + 1; + struct bio *bio = rq->biotail; + + bio->bi_io_vec[bio->bi_vcnt - 1].bv_len += pad_len; + bio->bi_size += pad_len; + } + rq->buffer = rq->data = NULL; return 0; unmap_rq: From 6b00769fe1502b4ad97bb327ef7ac971b208bfb5 Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Tue, 19 Feb 2008 11:36:35 +0100 Subject: [PATCH 11/15] block: add request->raw_data_len With padding and draining moved into it, the block layer may now extend requests as directed by queue parameters, so a request now has two sizes - the original request size and the extended size, which matches the size of the area pointed to by bios and later by sgs. The latter size is what lower layers are primarily interested in when allocating, filling up DMA tables and setting up the controller. Both padding and draining extend the data area to accommodate controller characteristics. As any controller which speaks SCSI can handle underflows, feeding a larger data area is safe. So, this patch makes the primary data length field, request->data_len, indicate the size of the full data area and adds a separate length field, request->raw_data_len, for the unmodified request size. The latter is used for reporting to the higher layer (userland) and in places where the original request size should be fed to the controller or device.
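A hedged user-space sketch of the two sizes (mock structures, not the kernel's): data_len grows when the block layer pads up to the queue's DMA alignment, while raw_data_len keeps the size userland asked for, which is what residual-count reporting should use. The padding arithmetic mirrors the blk_rq_map_user() hunk above:

    #include <stdio.h>

    struct request_mock {
            unsigned int raw_data_len; /* unmodified request size */
            unsigned int data_len;     /* padded size seen by the LLD */
    };

    static void map_with_padding(struct request_mock *rq, unsigned int len,
                                 unsigned int dma_align_mask)
    {
            rq->raw_data_len = len;
            rq->data_len = len;
            if (len & dma_align_mask) {
                    unsigned int pad_len = (dma_align_mask & ~len) + 1;

                    rq->data_len += pad_len;
            }
    }

    int main(void)
    {
            struct request_mock rq;

            map_with_padding(&rq, 510, 511); /* 512-byte DMA alignment */
            printf("raw=%u padded=%u\n", rq.raw_data_len, rq.data_len);
            return 0;
    }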
Signed-off-by: Tejun Heo Cc: James Bottomley Signed-off-by: Jens Axboe --- block/blk-core.c | 2 ++ block/blk-map.c | 2 ++ block/blk-merge.c | 1 + block/bsg.c | 8 ++++---- block/scsi_ioctl.c | 3 ++- drivers/scsi/scsi_lib.c | 8 ++++---- include/linux/blkdev.h | 1 + 7 files changed, 16 insertions(+), 9 deletions(-) diff --git a/block/blk-core.c b/block/blk-core.c index c013ca22eb67..775c8516abf5 100644 --- a/block/blk-core.c +++ b/block/blk-core.c @@ -127,6 +127,7 @@ void rq_init(struct request_queue *q, struct request *rq) rq->nr_hw_segments = 0; rq->ioprio = 0; rq->special = NULL; + rq->raw_data_len = 0; rq->buffer = NULL; rq->tag = -1; rq->errors = 0; @@ -2015,6 +2016,7 @@ void blk_rq_bio_prep(struct request_queue *q, struct request *rq, rq->hard_cur_sectors = rq->current_nr_sectors; rq->hard_nr_sectors = rq->nr_sectors = bio_sectors(bio); rq->buffer = bio_data(bio); + rq->raw_data_len = bio->bi_size; rq->data_len = bio->bi_size; rq->bio = rq->biotail = bio; diff --git a/block/blk-map.c b/block/blk-map.c index a7cf63ccb5cf..09f7fd0bcb73 100644 --- a/block/blk-map.c +++ b/block/blk-map.c @@ -19,6 +19,7 @@ int blk_rq_append_bio(struct request_queue *q, struct request *rq, rq->biotail->bi_next = bio; rq->biotail = bio; + rq->raw_data_len += bio->bi_size; rq->data_len += bio->bi_size; } return 0; @@ -154,6 +155,7 @@ int blk_rq_map_user(struct request_queue *q, struct request *rq, bio->bi_io_vec[bio->bi_vcnt - 1].bv_len += pad_len; bio->bi_size += pad_len; + rq->data_len += pad_len; } rq->buffer = rq->data = NULL; diff --git a/block/blk-merge.c b/block/blk-merge.c index d3b84bbb776a..39f2e077a014 100644 --- a/block/blk-merge.c +++ b/block/blk-merge.c @@ -228,6 +228,7 @@ new_segment: ((unsigned long)q->dma_drain_buffer) & (PAGE_SIZE - 1)); nsegs++; + rq->data_len += q->dma_drain_size; } if (sg) diff --git a/block/bsg.c b/block/bsg.c index 8917c5174dc2..7f3c09549e4b 100644 --- a/block/bsg.c +++ b/block/bsg.c @@ -437,14 +437,14 @@ static int blk_complete_sgv4_hdr_rq(struct request *rq, struct sg_io_v4 *hdr, } if (rq->next_rq) { - hdr->dout_resid = rq->data_len; - hdr->din_resid = rq->next_rq->data_len; + hdr->dout_resid = rq->raw_data_len; + hdr->din_resid = rq->next_rq->raw_data_len; blk_rq_unmap_user(bidi_bio); blk_put_request(rq->next_rq); } else if (rq_data_dir(rq) == READ) - hdr->din_resid = rq->data_len; + hdr->din_resid = rq->raw_data_len; else - hdr->dout_resid = rq->data_len; + hdr->dout_resid = rq->raw_data_len; /* * If the request generated a negative error number, return it diff --git a/block/scsi_ioctl.c b/block/scsi_ioctl.c index 9675b34638d4..e993cac4911d 100644 --- a/block/scsi_ioctl.c +++ b/block/scsi_ioctl.c @@ -266,7 +266,7 @@ static int blk_complete_sghdr_rq(struct request *rq, struct sg_io_hdr *hdr, hdr->info = 0; if (hdr->masked_status || hdr->host_status || hdr->driver_status) hdr->info |= SG_INFO_CHECK; - hdr->resid = rq->data_len; + hdr->resid = rq->raw_data_len; hdr->sb_len_wr = 0; if (rq->sense_len && hdr->sbp) { @@ -528,6 +528,7 @@ static int __blk_send_generic(struct request_queue *q, struct gendisk *bd_disk, rq = blk_get_request(q, WRITE, __GFP_WAIT); rq->cmd_type = REQ_TYPE_BLOCK_PC; rq->data = NULL; + rq->raw_data_len = 0; rq->data_len = 0; rq->timeout = BLK_DEFAULT_SG_TIMEOUT; memset(rq->cmd, 0, sizeof(rq->cmd)); diff --git a/drivers/scsi/scsi_lib.c b/drivers/scsi/scsi_lib.c index 135c1d054701..ba21d97d1855 100644 --- a/drivers/scsi/scsi_lib.c +++ b/drivers/scsi/scsi_lib.c @@ -1014,10 +1014,6 @@ static int scsi_init_sgtable(struct request *req, struct 
scsi_data_buffer *sdb, } req->buffer = NULL; - if (blk_pc_request(req)) - sdb->length = req->data_len; - else - sdb->length = req->nr_sectors << 9; /* * Next, walk the list, and fill in the addresses and sizes of @@ -1026,6 +1022,10 @@ static int scsi_init_sgtable(struct request *req, struct scsi_data_buffer *sdb, count = blk_rq_map_sg(req->q, req, sdb->table.sgl); BUG_ON(count > sdb->table.nents); sdb->table.nents = count; + if (blk_pc_request(req)) + sdb->length = req->data_len; + else + sdb->length = req->nr_sectors << 9; return BLKPREP_OK; } diff --git a/include/linux/blkdev.h b/include/linux/blkdev.h index e1888cc5b8ae..f1fe9fbf1c0e 100644 --- a/include/linux/blkdev.h +++ b/include/linux/blkdev.h @@ -216,6 +216,7 @@ struct request { unsigned int cmd_len; unsigned char cmd[BLK_MAX_CDB]; + unsigned int raw_data_len; unsigned int data_len; unsigned int sense_len; void *data; From 2fb98e8414c42cb14698833aac640b143b9ade4f Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Tue, 19 Feb 2008 11:36:53 +0100 Subject: [PATCH 12/15] block: implement request_queue->dma_drain_needed Draining shouldn't be done for commands where overflow may indicate data integrity issues. Add dma_drain_needed callback to request_queue. Drain buffer is appened iff this function returns non-zero. Signed-off-by: Tejun Heo Cc: James Bottomley Signed-off-by: Jens Axboe --- block/blk-merge.c | 2 +- block/blk-settings.c | 7 +++++-- include/linux/blkdev.h | 7 +++++-- 3 files changed, 11 insertions(+), 5 deletions(-) diff --git a/block/blk-merge.c b/block/blk-merge.c index 39f2e077a014..bef1b4d0fc02 100644 --- a/block/blk-merge.c +++ b/block/blk-merge.c @@ -220,7 +220,7 @@ new_segment: bvprv = bvec; } /* segments in rq */ - if (q->dma_drain_size) { + if (q->dma_drain_size && q->dma_drain_needed(rq)) { sg->page_link &= ~0x02; sg = sg_next(sg); sg_set_page(sg, virt_to_page(q->dma_drain_buffer), diff --git a/block/blk-settings.c b/block/blk-settings.c index 13536a388d27..9a8ffdd0ce3d 100644 --- a/block/blk-settings.c +++ b/block/blk-settings.c @@ -296,6 +296,7 @@ EXPORT_SYMBOL(blk_queue_stack_limits); * blk_queue_dma_drain - Set up a drain buffer for excess dma. * * @q: the request queue for the device + * @dma_drain_needed: fn which returns non-zero if drain is necessary * @buf: physically contiguous buffer * @size: size of the buffer in bytes * @@ -315,14 +316,16 @@ EXPORT_SYMBOL(blk_queue_stack_limits); * device can support otherwise there won't be room for the drain * buffer. 
*/ -int blk_queue_dma_drain(struct request_queue *q, void *buf, - unsigned int size) +int blk_queue_dma_drain(struct request_queue *q, + dma_drain_needed_fn *dma_drain_needed, + void *buf, unsigned int size) { if (q->max_hw_segments < 2 || q->max_phys_segments < 2) return -EINVAL; /* make room for appending the drain */ --q->max_hw_segments; --q->max_phys_segments; + q->dma_drain_needed = dma_drain_needed; q->dma_drain_buffer = buf; q->dma_drain_size = size; diff --git a/include/linux/blkdev.h b/include/linux/blkdev.h index f1fe9fbf1c0e..6fe67d1939c2 100644 --- a/include/linux/blkdev.h +++ b/include/linux/blkdev.h @@ -259,6 +259,7 @@ struct bio_vec; typedef int (merge_bvec_fn) (struct request_queue *, struct bio *, struct bio_vec *); typedef void (prepare_flush_fn) (struct request_queue *, struct request *); typedef void (softirq_done_fn)(struct request *); +typedef int (dma_drain_needed_fn)(struct request *); enum blk_queue_state { Queue_down, @@ -295,6 +296,7 @@ struct request_queue merge_bvec_fn *merge_bvec_fn; prepare_flush_fn *prepare_flush_fn; softirq_done_fn *softirq_done_fn; + dma_drain_needed_fn *dma_drain_needed; /* * Dispatch queue sorting @@ -699,8 +701,9 @@ extern void blk_queue_max_hw_segments(struct request_queue *, unsigned short); extern void blk_queue_max_segment_size(struct request_queue *, unsigned int); extern void blk_queue_hardsect_size(struct request_queue *, unsigned short); extern void blk_queue_stack_limits(struct request_queue *t, struct request_queue *b); -extern int blk_queue_dma_drain(struct request_queue *q, void *buf, - unsigned int size); +extern int blk_queue_dma_drain(struct request_queue *q, + dma_drain_needed_fn *dma_drain_needed, + void *buf, unsigned int size); extern void blk_queue_segment_boundary(struct request_queue *, unsigned long); extern void blk_queue_prep_rq(struct request_queue *, prep_rq_fn *pfn); extern void blk_queue_merge_bvec(struct request_queue *, merge_bvec_fn *); From db0a2e0099be3a1cff55879225881465f16c67d3 Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Tue, 19 Feb 2008 11:36:55 +0100 Subject: [PATCH 13/15] block: clear drain buffer if draining for write command Clear drain buffer before chaining if the command in question is a write. Signed-off-by: Tejun Heo Signed-off-by: Jens Axboe --- block/blk-merge.c | 3 +++ 1 file changed, 3 insertions(+) diff --git a/block/blk-merge.c b/block/blk-merge.c index bef1b4d0fc02..7506c4fe0264 100644 --- a/block/blk-merge.c +++ b/block/blk-merge.c @@ -221,6 +221,9 @@ new_segment: } /* segments in rq */ if (q->dma_drain_size && q->dma_drain_needed(rq)) { + if (rq->cmd_flags & REQ_RW) + memset(q->dma_drain_buffer, 0, q->dma_drain_size); + sg->page_link &= ~0x02; sg = sg_next(sg); sg_set_page(sg, virt_to_page(q->dma_drain_buffer), From dde2020754aeb14e17052d61784dcb37f252aac2 Mon Sep 17 00:00:00 2001 From: James Bottomley Date: Tue, 19 Feb 2008 11:36:56 +0100 Subject: [PATCH 14/15] libata: eliminate the home grown dma padding in favour of that provided by the block layer ATA requires that all DMA transfers begin and end on word boundaries. Because of this, a large amount of machinery grew up in ide to adjust scatterlists on this basis. However, as of 2.5, the block layer has a dma_alignment variable which ensures both the beginning and length of a DMA transfer are aligned on the dma_alignment boundary.
Although the block layer does adjust the beginning of the transfer to ensure this happens, it doesn't actually adjust the length; it merely makes sure that space is allocated for transfers beyond the declared length. The upshot of this is that scatterlists may be padded to any size between the actual length and the length adjusted to the dma_alignment, safely knowing that memory is allocated in this region. Right at the moment, SCSI takes the default dma_alignment, which is on a 512 byte boundary. Note that this alignment only applies to transfers coming in from user space. However, since all kernel allocations are automatically aligned on a minimum of 32 byte boundaries, it is safe to adjust them in this manner as well. tj: * Adjusting sg after padding is done in block layer. Make libata set queue alignment correctly for ATAPI devices and drop broken sg mangling from ata_sg_setup(). * Use request->raw_data_len for ATAPI transfer chunk size. * Killed qc->raw_nbytes. * Separated out killing qc->n_iter. Signed-off-by: James Bottomley Signed-off-by: Tejun Heo Signed-off-by: Jens Axboe --- drivers/ata/ahci.c | 5 -- drivers/ata/libata-core.c | 145 +++------------------------------- drivers/ata/libata-scsi.c | 23 ++---- drivers/ata/pata_icside.c | 8 -- drivers/ata/sata_fsl.c | 13 --- drivers/ata/sata_mv.c | 6 +- drivers/ata/sata_sil24.c | 5 -- drivers/scsi/ipr.c | 4 +- drivers/scsi/libsas/sas_ata.c | 4 +- include/linux/libata.h | 28 +------ 10 files changed, 21 insertions(+), 220 deletions(-) diff --git a/drivers/ata/ahci.c b/drivers/ata/ahci.c index 29e71bddd6ff..3c06e457b4dc 100644 --- a/drivers/ata/ahci.c +++ b/drivers/ata/ahci.c @@ -1975,16 +1975,11 @@ static int ahci_port_start(struct ata_port *ap) struct ahci_port_priv *pp; void *mem; dma_addr_t mem_dma; - int rc; pp = devm_kzalloc(dev, sizeof(*pp), GFP_KERNEL); if (!pp) return -ENOMEM; - rc = ata_pad_alloc(ap, dev); - if (rc) - return rc; - mem = dmam_alloc_coherent(dev, AHCI_PORT_PRIV_DMA_SZ, &mem_dma, GFP_KERNEL); if (!mem) diff --git a/drivers/ata/libata-core.c b/drivers/ata/libata-core.c index a109ccbda9ca..3587ac3fe3f3 100644 --- a/drivers/ata/libata-core.c +++ b/drivers/ata/libata-core.c @@ -4493,30 +4493,13 @@ void ata_sg_clean(struct ata_queued_cmd *qc) struct ata_port *ap = qc->ap; struct scatterlist *sg = qc->sg; int dir = qc->dma_dir; - void *pad_buf = NULL; WARN_ON(sg == NULL); - VPRINTK("unmapping %u sg elements\n", qc->mapped_n_elem); + VPRINTK("unmapping %u sg elements\n", qc->n_elem); - /* if we padded the buffer out to 32-bit bound, and data - * xfer direction is from-device, we must copy from the - * pad buffer back into the supplied buffer - */ - if (qc->pad_len && !(qc->tf.flags & ATA_TFLAG_WRITE)) - pad_buf = ap->pad + (qc->tag * ATA_DMA_PAD_SZ); - - if (qc->mapped_n_elem) - dma_unmap_sg(ap->dev, sg, qc->mapped_n_elem, dir); - /* restore last sg */ - if (qc->last_sg) - *qc->last_sg = qc->saved_last_sg; - if (pad_buf) { - struct scatterlist *psg = &qc->extra_sg[1]; - void *addr = kmap_atomic(sg_page(psg), KM_IRQ0); - memcpy(addr + psg->offset, pad_buf, qc->pad_len); - kunmap_atomic(addr, KM_IRQ0); - } + if (qc->n_elem) + dma_unmap_sg(ap->dev, sg, qc->n_elem, dir); qc->flags &= ~ATA_QCFLAG_DMAMAP; qc->sg = NULL; @@ -4767,97 +4750,6 @@ void ata_sg_init(struct ata_queued_cmd *qc, struct scatterlist *sg, qc->cursg = qc->sg; } -static unsigned int ata_sg_setup_extra(struct ata_queued_cmd *qc, - unsigned int *n_elem_extra, - unsigned int *nbytes_extra) -{ - struct ata_port *ap = qc->ap; - unsigned int n_elem = qc->n_elem; -
struct scatterlist *lsg, *copy_lsg = NULL, *tsg = NULL, *esg = NULL; - - *n_elem_extra = 0; - *nbytes_extra = 0; - - /* needs padding? */ - qc->pad_len = qc->nbytes & 3; - - if (likely(!qc->pad_len)) - return n_elem; - - /* locate last sg and save it */ - lsg = sg_last(qc->sg, n_elem); - qc->last_sg = lsg; - qc->saved_last_sg = *lsg; - - sg_init_table(qc->extra_sg, ARRAY_SIZE(qc->extra_sg)); - - if (qc->pad_len) { - struct scatterlist *psg = &qc->extra_sg[1]; - void *pad_buf = ap->pad + (qc->tag * ATA_DMA_PAD_SZ); - unsigned int offset; - - WARN_ON(qc->dev->class != ATA_DEV_ATAPI); - - memset(pad_buf, 0, ATA_DMA_PAD_SZ); - - /* psg->page/offset are used to copy to-be-written - * data in this function or read data in ata_sg_clean. - */ - offset = lsg->offset + lsg->length - qc->pad_len; - sg_set_page(psg, nth_page(sg_page(lsg), offset >> PAGE_SHIFT), - qc->pad_len, offset_in_page(offset)); - - if (qc->tf.flags & ATA_TFLAG_WRITE) { - void *addr = kmap_atomic(sg_page(psg), KM_IRQ0); - memcpy(pad_buf, addr + psg->offset, qc->pad_len); - kunmap_atomic(addr, KM_IRQ0); - } - - sg_dma_address(psg) = ap->pad_dma + (qc->tag * ATA_DMA_PAD_SZ); - sg_dma_len(psg) = ATA_DMA_PAD_SZ; - - /* Trim the last sg entry and chain the original and - * padding sg lists. - * - * Because chaining consumes one sg entry, one extra - * sg entry is allocated and the last sg entry is - * copied to it if the length isn't zero after padded - * amount is removed. - * - * If the last sg entry is completely replaced by - * padding sg entry, the first sg entry is skipped - * while chaining. - */ - lsg->length -= qc->pad_len; - if (lsg->length) { - copy_lsg = &qc->extra_sg[0]; - tsg = &qc->extra_sg[0]; - } else { - n_elem--; - tsg = &qc->extra_sg[1]; - } - - esg = &qc->extra_sg[1]; - - (*n_elem_extra)++; - (*nbytes_extra) += 4 - qc->pad_len; - } - - if (copy_lsg) - sg_set_page(copy_lsg, sg_page(lsg), lsg->length, lsg->offset); - - sg_chain(lsg, 1, tsg); - sg_mark_end(esg); - - /* sglist can't start with chaining sg entry, fast forward */ - if (qc->sg == lsg) { - qc->sg = tsg; - qc->cursg = tsg; - } - - return n_elem; -} - /** * ata_sg_setup - DMA-map the scatter-gather table associated with a command. * @qc: Command with scatter-gather table to be mapped. 
@@ -4874,26 +4766,17 @@ static unsigned int ata_sg_setup_extra(struct ata_queued_cmd *qc, static int ata_sg_setup(struct ata_queued_cmd *qc) { struct ata_port *ap = qc->ap; - unsigned int n_elem, n_elem_extra, nbytes_extra; + unsigned int n_elem; VPRINTK("ENTER, ata%u\n", ap->print_id); - n_elem = ata_sg_setup_extra(qc, &n_elem_extra, &nbytes_extra); + n_elem = dma_map_sg(ap->dev, qc->sg, qc->n_elem, qc->dma_dir); + if (n_elem < 1) + return -1; - if (n_elem) { - n_elem = dma_map_sg(ap->dev, qc->sg, n_elem, qc->dma_dir); - if (n_elem < 1) { - /* restore last sg */ - if (qc->last_sg) - *qc->last_sg = qc->saved_last_sg; - return -1; - } - DPRINTK("%d sg elements mapped\n", n_elem); - } + DPRINTK("%d sg elements mapped\n", n_elem); - qc->n_elem = qc->mapped_n_elem = n_elem; - qc->n_elem += n_elem_extra; - qc->nbytes += nbytes_extra; + qc->n_elem = n_elem; qc->flags |= ATA_QCFLAG_DMAMAP; return 0; @@ -5962,9 +5845,6 @@ void ata_qc_issue(struct ata_queued_cmd *qc) */ BUG_ON(ata_is_data(prot) && (!qc->sg || !qc->n_elem || !qc->nbytes)); - /* ata_sg_setup() may update nbytes */ - qc->raw_nbytes = qc->nbytes; - if (ata_is_dma(prot) || (ata_is_pio(prot) && (ap->flags & ATA_FLAG_PIO_DMA))) if (ata_sg_setup(qc)) @@ -6573,19 +6453,12 @@ void ata_host_resume(struct ata_host *host) int ata_port_start(struct ata_port *ap) { struct device *dev = ap->dev; - int rc; ap->prd = dmam_alloc_coherent(dev, ATA_PRD_TBL_SZ, &ap->prd_dma, GFP_KERNEL); if (!ap->prd) return -ENOMEM; - rc = ata_pad_alloc(ap, dev); - if (rc) - return rc; - - DPRINTK("prd alloc, virt %p, dma %llx\n", ap->prd, - (unsigned long long)ap->prd_dma); return 0; } diff --git a/drivers/ata/libata-scsi.c b/drivers/ata/libata-scsi.c index 1cea18f62abc..6e15c5ddae6d 100644 --- a/drivers/ata/libata-scsi.c +++ b/drivers/ata/libata-scsi.c @@ -832,24 +832,16 @@ static void ata_scsi_dev_config(struct scsi_device *sdev, /* configure max sectors */ blk_queue_max_sectors(sdev->request_queue, dev->max_sectors); - /* SATA DMA transfers must be multiples of 4 byte, so - * we need to pad ATAPI transfers using an extra sg. - * Decrement max hw segments accordingly. - */ - if (dev->class == ATA_DEV_ATAPI) { - struct request_queue *q = sdev->request_queue; - blk_queue_max_hw_segments(q, q->max_hw_segments - 1); - + if (dev->class == ATA_DEV_ATAPI) /* set the min alignment */ blk_queue_update_dma_alignment(sdev->request_queue, ATA_DMA_PAD_SZ - 1); - } else + else { /* ATA devices must be sector aligned */ blk_queue_update_dma_alignment(sdev->request_queue, ATA_SECT_SIZE - 1); - - if (dev->class == ATA_DEV_ATA) sdev->manage_start_stop = 1; + } if (dev->flags & ATA_DFLAG_AN) set_bit(SDEV_EVT_MEDIA_CHANGE, sdev->supported_events); @@ -2500,7 +2492,7 @@ static unsigned int atapi_xlat(struct ata_queued_cmd *qc) * want to set it properly, and for DMA where it is * effectively meaningless. */ - nbytes = min(qc->nbytes, (unsigned int)63 * 1024); + nbytes = min(scmd->request->raw_data_len, (unsigned int)63 * 1024); /* Most ATAPI devices which honor transfer chunk size don't * behave according to the spec when odd chunk size which @@ -3555,7 +3547,7 @@ EXPORT_SYMBOL_GPL(ata_sas_port_alloc); * @ap: Port to initialize * * Called just after data structures for each port are - * initialized. Allocates DMA pad. + * initialized. * * May be used as the port_start() entry in ata_port_operations. 
* @@ -3564,7 +3556,7 @@ EXPORT_SYMBOL_GPL(ata_sas_port_alloc); */ int ata_sas_port_start(struct ata_port *ap) { - return ata_pad_alloc(ap, ap->dev); + return 0; } EXPORT_SYMBOL_GPL(ata_sas_port_start); @@ -3572,8 +3564,6 @@ EXPORT_SYMBOL_GPL(ata_sas_port_start); * ata_port_stop - Undo ata_sas_port_start() * @ap: Port to shut down * - * Frees the DMA pad. - * * May be used as the port_stop() entry in ata_port_operations. * * LOCKING: @@ -3582,7 +3572,6 @@ EXPORT_SYMBOL_GPL(ata_sas_port_start); void ata_sas_port_stop(struct ata_port *ap) { - ata_pad_free(ap, ap->dev); } EXPORT_SYMBOL_GPL(ata_sas_port_stop); diff --git a/drivers/ata/pata_icside.c b/drivers/ata/pata_icside.c index 5b8586dac63b..f97068be2d79 100644 --- a/drivers/ata/pata_icside.c +++ b/drivers/ata/pata_icside.c @@ -304,12 +304,6 @@ static int icside_dma_init(struct pata_icside_info *info) } -static int pata_icside_port_start(struct ata_port *ap) -{ - /* No PRD to alloc */ - return ata_pad_alloc(ap, ap->dev); -} - static struct scsi_host_template pata_icside_sht = { .module = THIS_MODULE, .name = DRV_NAME, @@ -389,8 +383,6 @@ static struct ata_port_operations pata_icside_port_ops = { .irq_clear = ata_dummy_noret, .irq_on = ata_irq_on, - .port_start = pata_icside_port_start, - .bmdma_stop = pata_icside_bmdma_stop, .bmdma_status = pata_icside_bmdma_status, }; diff --git a/drivers/ata/sata_fsl.c b/drivers/ata/sata_fsl.c index efcb66b6ccef..9323dd0c7d8d 100644 --- a/drivers/ata/sata_fsl.c +++ b/drivers/ata/sata_fsl.c @@ -601,21 +601,9 @@ static int sata_fsl_port_start(struct ata_port *ap) if (!pp) return -ENOMEM; - /* - * allocate per command dma alignment pad buffer, which is used - * internally by libATA to ensure that all transfers ending on - * unaligned boundaries are padded, to align on Dword boundaries - */ - retval = ata_pad_alloc(ap, dev); - if (retval) { - kfree(pp); - return retval; - } - mem = dma_alloc_coherent(dev, SATA_FSL_PORT_PRIV_DMA_SZ, &mem_dma, GFP_KERNEL); if (!mem) { - ata_pad_free(ap, dev); kfree(pp); return -ENOMEM; } @@ -694,7 +682,6 @@ static void sata_fsl_port_stop(struct ata_port *ap) dma_free_coherent(dev, SATA_FSL_PORT_PRIV_DMA_SZ, pp->cmdslot, pp->cmdslot_paddr); - ata_pad_free(ap, dev); kfree(pp); } diff --git a/drivers/ata/sata_mv.c b/drivers/ata/sata_mv.c index 2ecd44db4142..1c1fbf375d9a 100644 --- a/drivers/ata/sata_mv.c +++ b/drivers/ata/sata_mv.c @@ -1158,17 +1158,13 @@ static int mv_port_start(struct ata_port *ap) struct mv_port_priv *pp; void __iomem *port_mmio = mv_ap_base(ap); unsigned long flags; - int tag, rc; + int tag; pp = devm_kzalloc(dev, sizeof(*pp), GFP_KERNEL); if (!pp) return -ENOMEM; ap->private_data = pp; - rc = ata_pad_alloc(ap, dev); - if (rc) - return rc; - pp->crqb = dma_pool_alloc(hpriv->crqb_pool, GFP_KERNEL, &pp->crqb_dma); if (!pp->crqb) return -ENOMEM; diff --git a/drivers/ata/sata_sil24.c b/drivers/ata/sata_sil24.c index b4b1f91ea693..df7988df7908 100644 --- a/drivers/ata/sata_sil24.c +++ b/drivers/ata/sata_sil24.c @@ -1234,7 +1234,6 @@ static int sil24_port_start(struct ata_port *ap) union sil24_cmd_block *cb; size_t cb_size = sizeof(*cb) * SIL24_MAX_CMDS; dma_addr_t cb_dma; - int rc; pp = devm_kzalloc(dev, sizeof(*pp), GFP_KERNEL); if (!pp) @@ -1247,10 +1246,6 @@ static int sil24_port_start(struct ata_port *ap) return -ENOMEM; memset(cb, 0, cb_size); - rc = ata_pad_alloc(ap, dev); - if (rc) - return rc; - pp->cmd_block = cb; pp->cmd_block_dma = cb_dma; diff --git a/drivers/scsi/ipr.c b/drivers/scsi/ipr.c index 2074701f7e76..c72014a3e7d4 100644 --- 
a/drivers/scsi/ipr.c +++ b/drivers/scsi/ipr.c @@ -5140,7 +5140,7 @@ static void ipr_build_ata_ioadl(struct ipr_cmnd *ipr_cmd, struct ipr_ioarcb *ioarcb = &ipr_cmd->ioarcb; struct ipr_ioadl_desc *ioadl = ipr_cmd->ioadl; struct ipr_ioadl_desc *last_ioadl = NULL; - int len = qc->nbytes + qc->pad_len; + int len = qc->nbytes; struct scatterlist *sg; unsigned int si; @@ -5206,7 +5206,7 @@ static unsigned int ipr_qc_issue(struct ata_queued_cmd *qc) ioarcb->cmd_pkt.request_type = IPR_RQTYPE_ATA_PASSTHRU; ioarcb->cmd_pkt.flags_hi |= IPR_FLAGS_HI_NO_LINK_DESC; ioarcb->cmd_pkt.flags_hi |= IPR_FLAGS_HI_NO_ULEN_CHK; - ipr_cmd->dma_use_sg = qc->pad_len ? qc->n_elem + 1 : qc->n_elem; + ipr_cmd->dma_use_sg = qc->n_elem; ipr_build_ata_ioadl(ipr_cmd, qc); regs->flags |= IPR_ATA_FLAG_STATUS_ON_GOOD_COMPLETION; diff --git a/drivers/scsi/libsas/sas_ata.c b/drivers/scsi/libsas/sas_ata.c index 0996f866f14c..7cd05b599a12 100644 --- a/drivers/scsi/libsas/sas_ata.c +++ b/drivers/scsi/libsas/sas_ata.c @@ -178,8 +178,8 @@ static unsigned int sas_ata_qc_issue(struct ata_queued_cmd *qc) task->uldd_task = qc; if (ata_is_atapi(qc->tf.protocol)) { memcpy(task->ata_task.atapi_packet, qc->cdb, qc->dev->cdb_len); - task->total_xfer_len = qc->nbytes + qc->pad_len; - task->num_scatter = qc->pad_len ? qc->n_elem + 1 : qc->n_elem; + task->total_xfer_len = qc->nbytes; + task->num_scatter = qc->n_elem; } else { for_each_sg(qc->sg, sg, qc->n_elem, si) xfer += sg->length; diff --git a/include/linux/libata.h b/include/linux/libata.h index bc5a8d0c7090..2e098f940cec 100644 --- a/include/linux/libata.h +++ b/include/linux/libata.h @@ -278,7 +278,6 @@ enum { /* size of buffer to pad xfers ending on unaligned boundaries */ ATA_DMA_PAD_SZ = 4, - ATA_DMA_PAD_BUF_SZ = ATA_DMA_PAD_SZ * ATA_MAX_QUEUE, /* ering size */ ATA_ERING_SIZE = 32, @@ -457,24 +456,18 @@ struct ata_queued_cmd { unsigned long flags; /* ATA_QCFLAG_xxx */ unsigned int tag; unsigned int n_elem; - unsigned int mapped_n_elem; int dma_dir; - unsigned int pad_len; unsigned int sect_size; unsigned int nbytes; - unsigned int raw_nbytes; unsigned int curbytes; struct scatterlist *cursg; unsigned int cursg_ofs; - struct scatterlist *last_sg; - struct scatterlist saved_last_sg; struct scatterlist sgent; - struct scatterlist extra_sg[2]; struct scatterlist *sg; @@ -619,9 +612,6 @@ struct ata_port { struct ata_prd *prd; /* our SG list */ dma_addr_t prd_dma; /* and its DMA mapping */ - void *pad; /* array of DMA pad buffers */ - dma_addr_t pad_dma; - struct ata_ioports ioaddr; /* ATA cmd/ctl/dma register blocks */ u8 ctl; /* cache of ATA control register */ @@ -1363,12 +1353,9 @@ static inline void ata_qc_reinit(struct ata_queued_cmd *qc) qc->flags = 0; qc->cursg = NULL; qc->cursg_ofs = 0; - qc->nbytes = qc->raw_nbytes = qc->curbytes = 0; + qc->nbytes = qc->curbytes = 0; qc->n_elem = 0; - qc->mapped_n_elem = 0; qc->err_mask = 0; - qc->pad_len = 0; - qc->last_sg = NULL; qc->sect_size = ATA_SECT_SIZE; ata_tf_init(qc->dev, &qc->tf); @@ -1423,19 +1410,6 @@ static inline unsigned int __ac_err_mask(u8 status) return mask; } -static inline int ata_pad_alloc(struct ata_port *ap, struct device *dev) -{ - ap->pad_dma = 0; - ap->pad = dmam_alloc_coherent(dev, ATA_DMA_PAD_BUF_SZ, - &ap->pad_dma, GFP_KERNEL); - return (ap->pad == NULL) ? 
-ENOMEM : 0; -} - -static inline void ata_pad_free(struct ata_port *ap, struct device *dev) -{ - dmam_free_coherent(dev, ATA_DMA_PAD_BUF_SZ, ap->pad, ap->pad_dma); -} - static inline struct ata_port *ata_shost_to_port(struct Scsi_Host *host) { return *(struct ata_port **)&host->hostdata[0]; From fa2fc7f4813bfec1ae3232d49e3befbd601e8a6f Mon Sep 17 00:00:00 2001 From: James Bottomley Date: Tue, 19 Feb 2008 11:36:57 +0100 Subject: [PATCH 15/15] libata: implement drain buffers This just updates the libata slave configure routine to take advantage of the block layer drain buffers. It also adjusts the size lengths in the atapi code to add the drain buffer to the DMA length so the driver knows it can rely on it. I suspect I should also be checking for AHCI as well as ATA_DEV_ATAPI, but I couldn't see how to do that easily. tj: * atapi_drain_needed() added such that draining is applied to only misc ATAPI commands. * q->bounce_gfp used when allocating drain buffer. * Now duplicate ATAPI PIO drain logic dropped. * ata_dev_printk() used instead of sdev_printk(). Signed-off-by: James Bottomley Signed-off-by: Tejun Heo Signed-off-by: Jens Axboe --- drivers/ata/libata-core.c | 56 +++---------------------------------- drivers/ata/libata-scsi.c | 59 +++++++++++++++++++++++++++++++++++---- 2 files changed, 57 insertions(+), 58 deletions(-) diff --git a/drivers/ata/libata-core.c b/drivers/ata/libata-core.c index 3587ac3fe3f3..def3682f416a 100644 --- a/drivers/ata/libata-core.c +++ b/drivers/ata/libata-core.c @@ -4641,28 +4641,6 @@ int ata_check_atapi_dma(struct ata_queued_cmd *qc) return 0; } -/** - * atapi_qc_may_overflow - Check whether data transfer may overflow - * @qc: ATA command in question - * - * ATAPI commands which transfer variable length data to host - * might overflow due to application error or hardare bug. This - * function checks whether overflow should be drained and ignored - * for @qc. - * - * LOCKING: - * None. - * - * RETURNS: - * 1 if @qc may overflow; otherwise, 0. - */ -static int atapi_qc_may_overflow(struct ata_queued_cmd *qc) -{ - return ata_is_atapi(qc->tf.protocol) && ata_is_data(qc->tf.protocol) && - atapi_cmd_type(qc->cdb[0]) == ATAPI_MISC && - !(qc->tf.flags & ATA_TFLAG_WRITE); -} - /** * ata_std_qc_defer - Check whether a qc needs to be deferred * @qc: ATA command in question @@ -5026,36 +5004,10 @@ static int __atapi_pio_bytes(struct ata_queued_cmd *qc, unsigned int bytes) next_sg: sg = qc->cursg; if (unlikely(!sg)) { - /* - * The end of qc->sg is reached and the device expects - * more data to transfer. 
In order not to overrun qc->sg - * and fulfill length specified in the byte count register, - * - for read case, discard trailing data from the device - * - for write case, padding zero data to the device - */ - u16 pad_buf[1] = { 0 }; - - if (qc->curbytes + bytes > qc->nbytes + ATAPI_MAX_DRAIN) { - ata_ehi_push_desc(ehi, "too much trailing data " - "buf=%u cur=%u bytes=%u", - qc->nbytes, qc->curbytes, bytes); - return -1; - } - - /* allow overflow only for misc ATAPI commands */ - if (!atapi_qc_may_overflow(qc)) { - ata_ehi_push_desc(ehi, "unexpected trailing data " - "%u bytes", bytes); - return -1; - } - - consumed = 0; - while (consumed < bytes) - consumed += ap->ops->data_xfer(dev, - (unsigned char *)pad_buf, 2, rw); - - qc->curbytes += bytes; - return 0; + ata_ehi_push_desc(ehi, "unexpected or too much trailing data " + "buf=%u cur=%u bytes=%u", + qc->nbytes, qc->curbytes, bytes); + return -1; } page = sg_page(sg); diff --git a/drivers/ata/libata-scsi.c b/drivers/ata/libata-scsi.c index 6e15c5ddae6d..dd41b1a1b304 100644 --- a/drivers/ata/libata-scsi.c +++ b/drivers/ata/libata-scsi.c @@ -826,17 +826,56 @@ static void ata_scsi_sdev_config(struct scsi_device *sdev) sdev->max_device_blocked = 1; } -static void ata_scsi_dev_config(struct scsi_device *sdev, - struct ata_device *dev) +/** + * atapi_drain_needed - Check whether data transfer may overflow + * @rq: request to be checked + * + * ATAPI commands which transfer variable length data to the host + * might overflow due to application error or hardware bug. This + * function checks whether overflow should be drained and ignored + * for @rq. + * + * LOCKING: + * None. + * + * RETURNS: + * 1 if the request needs draining; otherwise, 0. + */ +static int atapi_drain_needed(struct request *rq) +{ + if (likely(!blk_pc_request(rq))) + return 0; + + if (!rq->data_len || (rq->cmd_flags & REQ_RW)) + return 0; + + return atapi_cmd_type(rq->cmd[0]) == ATAPI_MISC; +} + +static int ata_scsi_dev_config(struct scsi_device *sdev, + struct ata_device *dev) { /* configure max sectors */ blk_queue_max_sectors(sdev->request_queue, dev->max_sectors); - if (dev->class == ATA_DEV_ATAPI) + if (dev->class == ATA_DEV_ATAPI) { + struct request_queue *q = sdev->request_queue; + void *buf; + /* set the min alignment */ blk_queue_update_dma_alignment(sdev->request_queue, ATA_DMA_PAD_SZ - 1); + + /* configure draining */ + buf = kmalloc(ATAPI_MAX_DRAIN, q->bounce_gfp | GFP_KERNEL); + if (!buf) { + ata_dev_printk(dev, KERN_ERR, + "drain buffer allocation failed\n"); + return -ENOMEM; + } + + blk_queue_dma_drain(q, atapi_drain_needed, buf, ATAPI_MAX_DRAIN); + } else { /* ATA devices must be sector aligned */ blk_queue_update_dma_alignment(sdev->request_queue, ATA_SECT_SIZE - 1); @@ -853,6 +892,8 @@ static void ata_scsi_dev_config(struct scsi_device *sdev, depth = min(ATA_MAX_QUEUE - 1, depth); scsi_adjust_queue_depth(sdev, MSG_SIMPLE_TAG, depth); } + + return 0; } /** @@ -871,13 +912,14 @@ int ata_scsi_slave_config(struct scsi_device *sdev) { struct ata_port *ap = ata_shost_to_port(sdev->host); struct ata_device *dev = __ata_scsi_find_dev(ap, sdev); + int rc = 0; ata_scsi_sdev_config(sdev); if (dev) - ata_scsi_dev_config(sdev, dev); + rc = ata_scsi_dev_config(sdev, dev); - return 0; + return rc; } /** @@ -897,6 +939,7 @@ int ata_scsi_slave_config(struct scsi_device *sdev) void ata_scsi_slave_destroy(struct scsi_device *sdev) { struct ata_port *ap = ata_shost_to_port(sdev->host); + struct request_queue *q = sdev->request_queue; unsigned long flags; struct ata_device *dev; @@
-912,6 +955,10 @@ void ata_scsi_slave_destroy(struct scsi_device *sdev) ata_port_schedule_eh(ap); } spin_unlock_irqrestore(ap->lock, flags); + + kfree(q->dma_drain_buffer); + q->dma_drain_buffer = NULL; + q->dma_drain_size = 0; } /**