From 4d05a28db56225bbab5e1321d818f318e92a4657 Mon Sep 17 00:00:00 2001 From: Konrad Rzeszutek Wilk Date: Thu, 14 Apr 2011 18:25:47 -0400 Subject: [PATCH 001/110] xen: add blkback support Signed-off-by: Ian Campbell Signed-off-by: Jeremy Fitzhardinge Signed-off-by: Konrad Rzeszutek Wilk Conflicts: drivers/xen/Makefile --- drivers/xen/Kconfig | 8 + drivers/xen/Makefile | 1 + drivers/xen/blkback/Makefile | 3 + drivers/xen/blkback/blkback.c | 656 ++++++++++++++++++++++++++++++++ drivers/xen/blkback/common.h | 139 +++++++ drivers/xen/blkback/interface.c | 181 +++++++++ drivers/xen/blkback/vbd.c | 118 ++++++ drivers/xen/blkback/xenbus.c | 541 ++++++++++++++++++++++++++ include/xen/blkif.h | 123 ++++++ 9 files changed, 1770 insertions(+) create mode 100644 drivers/xen/blkback/Makefile create mode 100644 drivers/xen/blkback/blkback.c create mode 100644 drivers/xen/blkback/common.h create mode 100644 drivers/xen/blkback/interface.c create mode 100644 drivers/xen/blkback/vbd.c create mode 100644 drivers/xen/blkback/xenbus.c create mode 100644 include/xen/blkif.h diff --git a/drivers/xen/Kconfig b/drivers/xen/Kconfig index a59638b37c1a..fb1af628cbfc 100644 --- a/drivers/xen/Kconfig +++ b/drivers/xen/Kconfig @@ -37,6 +37,14 @@ config XEN_BACKEND Support for backend device drivers that provide I/O services to other virtual machines. +config XEN_BLKDEV_BACKEND + tristate "Block-device backend driver" + depends on XEN_BACKEND && BLOCK + help + The block-device backend driver allows the kernel to export its + block devices to other guests via a high-performance shared-memory + interface. + config XENFS tristate "Xen filesystem" default y diff --git a/drivers/xen/Makefile b/drivers/xen/Makefile index f420f1ff7f13..29c0a416f082 100644 --- a/drivers/xen/Makefile +++ b/drivers/xen/Makefile @@ -11,6 +11,7 @@ obj-$(CONFIG_XEN_BALLOON) += xen-balloon.o obj-$(CONFIG_XEN_DEV_EVTCHN) += xen-evtchn.o obj-$(CONFIG_XEN_GNTDEV) += xen-gntdev.o obj-$(CONFIG_XEN_GRANT_DEV_ALLOC) += xen-gntalloc.o +obj-$(CONFIG_XEN_BLKDEV_BACKEND) += blkback/ obj-$(CONFIG_XENFS) += xenfs/ obj-$(CONFIG_XEN_SYS_HYPERVISOR) += sys-hypervisor.o obj-$(CONFIG_XEN_PLATFORM_PCI) += xen-platform-pci.o diff --git a/drivers/xen/blkback/Makefile b/drivers/xen/blkback/Makefile new file mode 100644 index 000000000000..8bab63da3b3e --- /dev/null +++ b/drivers/xen/blkback/Makefile @@ -0,0 +1,3 @@ +obj-$(CONFIG_XEN_BLKDEV_BACKEND) := blkbk.o + +blkbk-y := blkback.o xenbus.o interface.o vbd.o diff --git a/drivers/xen/blkback/blkback.c b/drivers/xen/blkback/blkback.c new file mode 100644 index 000000000000..5b8d50e344b4 --- /dev/null +++ b/drivers/xen/blkback/blkback.c @@ -0,0 +1,656 @@ +/****************************************************************************** + * arch/xen/drivers/blkif/backend/main.c + * + * Back-end of the driver for virtual block devices. This portion of the + * driver exports a 'unified' block-device interface that can be accessed + * by any operating system that implements a compatible front end. 
A + * reference front-end implementation can be found in: + * arch/xen/drivers/blkif/frontend + * + * Copyright (c) 2003-2004, Keir Fraser & Steve Hand + * Copyright (c) 2005, Christopher Clark + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License version 2 + * as published by the Free Software Foundation; or, when distributed + * separately from the Linux kernel or incorporated into other + * software packages, subject to the following license: + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this source file (the "Software"), to deal in the Software without + * restriction, including without limitation the rights to use, copy, modify, + * merge, publish, distribute, sublicense, and/or sell copies of the Software, + * and to permit persons to whom the Software is furnished to do so, subject to + * the following conditions: + * + * The above copyright notice and this permission notice shall be included in + * all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING + * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS + * IN THE SOFTWARE. + */ + +#include +#include +#include +#include +#include +#include +#include "common.h" + +/* + * These are rather arbitrary. They are fairly large because adjacent requests + * pulled from a communication ring are quite likely to end up being part of + * the same scatter/gather request at the disc. + * + * ** TRY INCREASING 'blkif_reqs' IF WRITE SPEEDS SEEM TOO LOW ** + * + * This will increase the chances of being able to write whole tracks. + * 64 should be enough to keep us competitive with Linux. + */ +static int blkif_reqs = 64; +module_param_named(reqs, blkif_reqs, int, 0); +MODULE_PARM_DESC(reqs, "Number of blkback requests to allocate"); + +/* Run-time switchable: /sys/module/blkback/parameters/ */ +static unsigned int log_stats = 0; +static unsigned int debug_lvl = 0; +module_param(log_stats, int, 0644); +module_param(debug_lvl, int, 0644); + +/* + * Each outstanding request that we've passed to the lower device layers has a + * 'pending_req' allocated to it. Each buffer_head that completes decrements + * the pendcnt towards zero. When it hits zero, the specified domain has a + * response queued for it, with the saved 'id' passed back. 
+ */ +typedef struct { + blkif_t *blkif; + u64 id; + int nr_pages; + atomic_t pendcnt; + unsigned short operation; + int status; + struct list_head free_list; +} pending_req_t; + +static pending_req_t *pending_reqs; +static struct list_head pending_free; +static DEFINE_SPINLOCK(pending_free_lock); +static DECLARE_WAIT_QUEUE_HEAD(pending_free_wq); + +#define BLKBACK_INVALID_HANDLE (~0) + +static struct page **pending_pages; +static grant_handle_t *pending_grant_handles; + +static inline int vaddr_pagenr(pending_req_t *req, int seg) +{ + return (req - pending_reqs) * BLKIF_MAX_SEGMENTS_PER_REQUEST + seg; +} + +static inline unsigned long vaddr(pending_req_t *req, int seg) +{ + unsigned long pfn = page_to_pfn(pending_pages[vaddr_pagenr(req, seg)]); + return (unsigned long)pfn_to_kaddr(pfn); +} + +#define pending_handle(_req, _seg) \ + (pending_grant_handles[vaddr_pagenr(_req, _seg)]) + + +static int do_block_io_op(blkif_t *blkif); +static void dispatch_rw_block_io(blkif_t *blkif, + blkif_request_t *req, + pending_req_t *pending_req); +static void make_response(blkif_t *blkif, u64 id, + unsigned short op, int st); + +/****************************************************************** + * misc small helpers + */ +static pending_req_t* alloc_req(void) +{ + pending_req_t *req = NULL; + unsigned long flags; + + spin_lock_irqsave(&pending_free_lock, flags); + if (!list_empty(&pending_free)) { + req = list_entry(pending_free.next, pending_req_t, free_list); + list_del(&req->free_list); + } + spin_unlock_irqrestore(&pending_free_lock, flags); + return req; +} + +static void free_req(pending_req_t *req) +{ + unsigned long flags; + int was_empty; + + spin_lock_irqsave(&pending_free_lock, flags); + was_empty = list_empty(&pending_free); + list_add(&req->free_list, &pending_free); + spin_unlock_irqrestore(&pending_free_lock, flags); + if (was_empty) + wake_up(&pending_free_wq); +} + +static void unplug_queue(blkif_t *blkif) +{ + if (blkif->plug == NULL) + return; + if (blkif->plug->unplug_fn) + blkif->plug->unplug_fn(blkif->plug); + blk_put_queue(blkif->plug); + blkif->plug = NULL; +} + +static void plug_queue(blkif_t *blkif, struct block_device *bdev) +{ + request_queue_t *q = bdev_get_queue(bdev); + + if (q == blkif->plug) + return; + unplug_queue(blkif); + blk_get_queue(q); + blkif->plug = q; +} + +static void fast_flush_area(pending_req_t *req) +{ + struct gnttab_unmap_grant_ref unmap[BLKIF_MAX_SEGMENTS_PER_REQUEST]; + unsigned int i, invcount = 0; + grant_handle_t handle; + int ret; + + for (i = 0; i < req->nr_pages; i++) { + handle = pending_handle(req, i); + if (handle == BLKBACK_INVALID_HANDLE) + continue; + gnttab_set_unmap_op(&unmap[invcount], vaddr(req, i), + GNTMAP_host_map, handle); + pending_handle(req, i) = BLKBACK_INVALID_HANDLE; + invcount++; + } + + ret = HYPERVISOR_grant_table_op( + GNTTABOP_unmap_grant_ref, unmap, invcount); + BUG_ON(ret); +} + +/****************************************************************** + * SCHEDULER FUNCTIONS + */ + +static void print_stats(blkif_t *blkif) +{ + printk(KERN_DEBUG "%s: oo %3d | rd %4d | wr %4d | br %4d\n", + current->comm, blkif->st_oo_req, + blkif->st_rd_req, blkif->st_wr_req, blkif->st_br_req); + blkif->st_print = jiffies + msecs_to_jiffies(10 * 1000); + blkif->st_rd_req = 0; + blkif->st_wr_req = 0; + blkif->st_oo_req = 0; +} + +int blkif_schedule(void *arg) +{ + blkif_t *blkif = arg; + + blkif_get(blkif); + + if (debug_lvl) + printk(KERN_DEBUG "%s: started\n", current->comm); + + while (!kthread_should_stop()) { + if (try_to_freeze()) + 
continue; + + wait_event_interruptible( + blkif->wq, + blkif->waiting_reqs || kthread_should_stop()); + wait_event_interruptible( + pending_free_wq, + !list_empty(&pending_free) || kthread_should_stop()); + + blkif->waiting_reqs = 0; + smp_mb(); /* clear flag *before* checking for work */ + + if (do_block_io_op(blkif)) + blkif->waiting_reqs = 1; + unplug_queue(blkif); + + if (log_stats && time_after(jiffies, blkif->st_print)) + print_stats(blkif); + } + + if (log_stats) + print_stats(blkif); + if (debug_lvl) + printk(KERN_DEBUG "%s: exiting\n", current->comm); + + blkif->xenblkd = NULL; + blkif_put(blkif); + + return 0; +} + +/****************************************************************** + * COMPLETION CALLBACK -- Called as bh->b_end_io() + */ + +static void __end_block_io_op(pending_req_t *pending_req, int error) +{ + /* An error fails the entire request. */ + if ((pending_req->operation == BLKIF_OP_WRITE_BARRIER) && + (error == -EOPNOTSUPP)) { + DPRINTK("blkback: write barrier op failed, not supported\n"); + blkback_barrier(XBT_NIL, pending_req->blkif->be, 0); + pending_req->status = BLKIF_RSP_EOPNOTSUPP; + } else if (error) { + DPRINTK("Buffer not up-to-date at end of operation, " + "error=%d\n", error); + pending_req->status = BLKIF_RSP_ERROR; + } + + if (atomic_dec_and_test(&pending_req->pendcnt)) { + fast_flush_area(pending_req); + make_response(pending_req->blkif, pending_req->id, + pending_req->operation, pending_req->status); + blkif_put(pending_req->blkif); + free_req(pending_req); + } +} + +static int end_block_io_op(struct bio *bio, unsigned int done, int error) +{ + if (bio->bi_size != 0) + return 1; + __end_block_io_op(bio->bi_private, error); + bio_put(bio); + return error; +} + + +/****************************************************************************** + * NOTIFICATION FROM GUEST OS. + */ + +static void blkif_notify_work(blkif_t *blkif) +{ + blkif->waiting_reqs = 1; + wake_up(&blkif->wq); +} + +irqreturn_t blkif_be_int(int irq, void *dev_id, struct pt_regs *regs) +{ + blkif_notify_work(dev_id); + return IRQ_HANDLED; +} + + + +/****************************************************************** + * DOWNWARD CALLS -- These interface with the block-device layer proper. + */ + +static int do_block_io_op(blkif_t *blkif) +{ + blkif_back_rings_t *blk_rings = &blkif->blk_rings; + blkif_request_t req; + pending_req_t *pending_req; + RING_IDX rc, rp; + int more_to_do = 0; + + rc = blk_rings->common.req_cons; + rp = blk_rings->common.sring->req_prod; + rmb(); /* Ensure we see queued requests up to 'rp'. */ + + while (rc != rp) { + + if (RING_REQUEST_CONS_OVERFLOW(&blk_rings->common, rc)) + break; + + pending_req = alloc_req(); + if (NULL == pending_req) { + blkif->st_oo_req++; + more_to_do = 1; + break; + } + + if (kthread_should_stop()) { + more_to_do = 1; + break; + } + + switch (blkif->blk_protocol) { + case BLKIF_PROTOCOL_NATIVE: + memcpy(&req, RING_GET_REQUEST(&blk_rings->native, rc), sizeof(req)); + break; + case BLKIF_PROTOCOL_X86_32: + blkif_get_x86_32_req(&req, RING_GET_REQUEST(&blk_rings->x86_32, rc)); + break; + case BLKIF_PROTOCOL_X86_64: + blkif_get_x86_64_req(&req, RING_GET_REQUEST(&blk_rings->x86_64, rc)); + break; + default: + BUG(); + } + blk_rings->common.req_cons = ++rc; /* before make_response() */ + + /* Apply all sanity checks to /private copy/ of request. 
*/ + barrier(); + + switch (req.operation) { + case BLKIF_OP_READ: + blkif->st_rd_req++; + dispatch_rw_block_io(blkif, &req, pending_req); + break; + case BLKIF_OP_WRITE_BARRIER: + blkif->st_br_req++; + /* fall through */ + case BLKIF_OP_WRITE: + blkif->st_wr_req++; + dispatch_rw_block_io(blkif, &req, pending_req); + break; + default: + /* A good sign something is wrong: sleep for a while to + * avoid excessive CPU consumption by a bad guest. */ + msleep(1); + DPRINTK("error: unknown block io operation [%d]\n", + req.operation); + make_response(blkif, req.id, req.operation, + BLKIF_RSP_ERROR); + free_req(pending_req); + break; + } + + /* Yield point for this unbounded loop. */ + cond_resched(); + } + + return more_to_do; +} + +static void dispatch_rw_block_io(blkif_t *blkif, + blkif_request_t *req, + pending_req_t *pending_req) +{ + extern void ll_rw_block(int rw, int nr, struct buffer_head * bhs[]); + struct gnttab_map_grant_ref map[BLKIF_MAX_SEGMENTS_PER_REQUEST]; + struct phys_req preq; + struct { + unsigned long buf; unsigned int nsec; + } seg[BLKIF_MAX_SEGMENTS_PER_REQUEST]; + unsigned int nseg; + struct bio *bio = NULL; + int ret, i; + int operation; + + switch (req->operation) { + case BLKIF_OP_READ: + operation = READ; + break; + case BLKIF_OP_WRITE: + operation = WRITE; + break; + case BLKIF_OP_WRITE_BARRIER: + operation = WRITE_BARRIER; + break; + default: + operation = 0; /* make gcc happy */ + BUG(); + } + + /* Check that number of segments is sane. */ + nseg = req->nr_segments; + if (unlikely(nseg == 0 && operation != WRITE_BARRIER) || + unlikely(nseg > BLKIF_MAX_SEGMENTS_PER_REQUEST)) { + DPRINTK("Bad number of segments in request (%d)\n", nseg); + goto fail_response; + } + + preq.dev = req->handle; + preq.sector_number = req->sector_number; + preq.nr_sects = 0; + + pending_req->blkif = blkif; + pending_req->id = req->id; + pending_req->operation = req->operation; + pending_req->status = BLKIF_RSP_OKAY; + pending_req->nr_pages = nseg; + + for (i = 0; i < nseg; i++) { + uint32_t flags; + + seg[i].nsec = req->seg[i].last_sect - + req->seg[i].first_sect + 1; + + if ((req->seg[i].last_sect >= (PAGE_SIZE >> 9)) || + (req->seg[i].last_sect < req->seg[i].first_sect)) + goto fail_response; + preq.nr_sects += seg[i].nsec; + + flags = GNTMAP_host_map; + if (operation != READ) + flags |= GNTMAP_readonly; + gnttab_set_map_op(&map[i], vaddr(pending_req, i), flags, + req->seg[i].gref, blkif->domid); + } + + ret = HYPERVISOR_grant_table_op(GNTTABOP_map_grant_ref, map, nseg); + BUG_ON(ret); + + for (i = 0; i < nseg; i++) { + if (unlikely(map[i].status != 0)) { + DPRINTK("invalid buffer -- could not remap it\n"); + map[i].handle = BLKBACK_INVALID_HANDLE; + ret |= 1; + } + + pending_handle(pending_req, i) = map[i].handle; + + if (ret) + continue; + + set_phys_to_machine(__pa(vaddr( + pending_req, i)) >> PAGE_SHIFT, + FOREIGN_FRAME(map[i].dev_bus_addr >> PAGE_SHIFT)); + seg[i].buf = map[i].dev_bus_addr | + (req->seg[i].first_sect << 9); + } + + if (ret) + goto fail_flush; + + if (vbd_translate(&preq, blkif, operation) != 0) { + DPRINTK("access denied: %s of [%llu,%llu] on dev=%04x\n", + operation == READ ? 
"read" : "write", + preq.sector_number, + preq.sector_number + preq.nr_sects, preq.dev); + goto fail_flush; + } + + plug_queue(blkif, preq.bdev); + atomic_set(&pending_req->pendcnt, 1); + blkif_get(blkif); + + for (i = 0; i < nseg; i++) { + if (((int)preq.sector_number|(int)seg[i].nsec) & + ((bdev_hardsect_size(preq.bdev) >> 9) - 1)) { + DPRINTK("Misaligned I/O request from domain %d", + blkif->domid); + goto fail_put_bio; + } + + while ((bio == NULL) || + (bio_add_page(bio, + virt_to_page(vaddr(pending_req, i)), + seg[i].nsec << 9, + seg[i].buf & ~PAGE_MASK) == 0)) { + if (bio) { + atomic_inc(&pending_req->pendcnt); + submit_bio(operation, bio); + } + + bio = bio_alloc(GFP_KERNEL, nseg-i); + if (unlikely(bio == NULL)) + goto fail_put_bio; + + bio->bi_bdev = preq.bdev; + bio->bi_private = pending_req; + bio->bi_end_io = end_block_io_op; + bio->bi_sector = preq.sector_number; + } + + preq.sector_number += seg[i].nsec; + } + + if (!bio) { + BUG_ON(operation != WRITE_BARRIER); + bio = bio_alloc(GFP_KERNEL, 0); + if (unlikely(bio == NULL)) + goto fail_put_bio; + + bio->bi_bdev = preq.bdev; + bio->bi_private = pending_req; + bio->bi_end_io = end_block_io_op; + bio->bi_sector = -1; + } + + submit_bio(operation, bio); + + if (operation == READ) + blkif->st_rd_sect += preq.nr_sects; + else if (operation == WRITE || operation == WRITE_BARRIER) + blkif->st_wr_sect += preq.nr_sects; + + return; + + fail_flush: + fast_flush_area(pending_req); + fail_response: + make_response(blkif, req->id, req->operation, BLKIF_RSP_ERROR); + free_req(pending_req); + msleep(1); /* back off a bit */ + return; + + fail_put_bio: + __end_block_io_op(pending_req, -EINVAL); + if (bio) + bio_put(bio); + unplug_queue(blkif); + msleep(1); /* back off a bit */ + return; +} + + + +/****************************************************************** + * MISCELLANEOUS SETUP / TEARDOWN / DEBUGGING + */ + + +static void make_response(blkif_t *blkif, u64 id, + unsigned short op, int st) +{ + blkif_response_t resp; + unsigned long flags; + blkif_back_rings_t *blk_rings = &blkif->blk_rings; + int more_to_do = 0; + int notify; + + resp.id = id; + resp.operation = op; + resp.status = st; + + spin_lock_irqsave(&blkif->blk_ring_lock, flags); + /* Place on the response ring for the relevant domain. */ + switch (blkif->blk_protocol) { + case BLKIF_PROTOCOL_NATIVE: + memcpy(RING_GET_RESPONSE(&blk_rings->native, blk_rings->native.rsp_prod_pvt), + &resp, sizeof(resp)); + break; + case BLKIF_PROTOCOL_X86_32: + memcpy(RING_GET_RESPONSE(&blk_rings->x86_32, blk_rings->x86_32.rsp_prod_pvt), + &resp, sizeof(resp)); + break; + case BLKIF_PROTOCOL_X86_64: + memcpy(RING_GET_RESPONSE(&blk_rings->x86_64, blk_rings->x86_64.rsp_prod_pvt), + &resp, sizeof(resp)); + break; + default: + BUG(); + } + blk_rings->common.rsp_prod_pvt++; + RING_PUSH_RESPONSES_AND_CHECK_NOTIFY(&blk_rings->common, notify); + if (blk_rings->common.rsp_prod_pvt == blk_rings->common.req_cons) { + /* + * Tail check for pending requests. Allows frontend to avoid + * notifications if requests are already in flight (lower + * overheads and promotes batching). 
+ */ + RING_FINAL_CHECK_FOR_REQUESTS(&blk_rings->common, more_to_do); + + } else if (RING_HAS_UNCONSUMED_REQUESTS(&blk_rings->common)) { + more_to_do = 1; + } + + spin_unlock_irqrestore(&blkif->blk_ring_lock, flags); + + if (more_to_do) + blkif_notify_work(blkif); + if (notify) + notify_remote_via_irq(blkif->irq); +} + +static int __init blkif_init(void) +{ + int i, mmap_pages; + + if (!is_running_on_xen()) + return -ENODEV; + + mmap_pages = blkif_reqs * BLKIF_MAX_SEGMENTS_PER_REQUEST; + + pending_reqs = kmalloc(sizeof(pending_reqs[0]) * + blkif_reqs, GFP_KERNEL); + pending_grant_handles = kmalloc(sizeof(pending_grant_handles[0]) * + mmap_pages, GFP_KERNEL); + pending_pages = alloc_empty_pages_and_pagevec(mmap_pages); + + if (!pending_reqs || !pending_grant_handles || !pending_pages) + goto out_of_memory; + + for (i = 0; i < mmap_pages; i++) + pending_grant_handles[i] = BLKBACK_INVALID_HANDLE; + + blkif_interface_init(); + + memset(pending_reqs, 0, sizeof(pending_reqs)); + INIT_LIST_HEAD(&pending_free); + + for (i = 0; i < blkif_reqs; i++) + list_add_tail(&pending_reqs[i].free_list, &pending_free); + + blkif_xenbus_init(); + + return 0; + + out_of_memory: + kfree(pending_reqs); + kfree(pending_grant_handles); + free_empty_pages_and_pagevec(pending_pages, mmap_pages); + printk("%s: out of memory\n", __FUNCTION__); + return -ENOMEM; +} + +module_init(blkif_init); + +MODULE_LICENSE("Dual BSD/GPL"); diff --git a/drivers/xen/blkback/common.h b/drivers/xen/blkback/common.h new file mode 100644 index 000000000000..422e935528be --- /dev/null +++ b/drivers/xen/blkback/common.h @@ -0,0 +1,139 @@ +/* + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License version 2 + * as published by the Free Software Foundation; or, when distributed + * separately from the Linux kernel or incorporated into other + * software packages, subject to the following license: + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this source file (the "Software"), to deal in the Software without + * restriction, including without limitation the rights to use, copy, modify, + * merge, publish, distribute, sublicense, and/or sell copies of the Software, + * and to permit persons to whom the Software is furnished to do so, subject to + * the following conditions: + * + * The above copyright notice and this permission notice shall be included in + * all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING + * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS + * IN THE SOFTWARE. + */ + +#ifndef __BLKIF__BACKEND__COMMON_H__ +#define __BLKIF__BACKEND__COMMON_H__ + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#define DPRINTK(_f, _a...) 
\ + pr_debug("(file=%s, line=%d) " _f, \ + __FILE__ , __LINE__ , ## _a ) + +struct vbd { + blkif_vdev_t handle; /* what the domain refers to this vbd as */ + unsigned char readonly; /* Non-zero -> read-only */ + unsigned char type; /* VDISK_xxx */ + u32 pdevice; /* phys device that this vbd maps to */ + struct block_device *bdev; +}; + +struct backend_info; + +typedef struct blkif_st { + /* Unique identifier for this interface. */ + domid_t domid; + unsigned int handle; + /* Physical parameters of the comms window. */ + unsigned int irq; + /* Comms information. */ + enum blkif_protocol blk_protocol; + blkif_back_rings_t blk_rings; + struct vm_struct *blk_ring_area; + /* The VBD attached to this interface. */ + struct vbd vbd; + /* Back pointer to the backend_info. */ + struct backend_info *be; + /* Private fields. */ + spinlock_t blk_ring_lock; + atomic_t refcnt; + + wait_queue_head_t wq; + struct task_struct *xenblkd; + unsigned int waiting_reqs; + request_queue_t *plug; + + /* statistics */ + unsigned long st_print; + int st_rd_req; + int st_wr_req; + int st_oo_req; + int st_br_req; + int st_rd_sect; + int st_wr_sect; + + wait_queue_head_t waiting_to_free; + + grant_handle_t shmem_handle; + grant_ref_t shmem_ref; +} blkif_t; + +blkif_t *blkif_alloc(domid_t domid); +void blkif_disconnect(blkif_t *blkif); +void blkif_free(blkif_t *blkif); +int blkif_map(blkif_t *blkif, unsigned long shared_page, unsigned int evtchn); + +#define blkif_get(_b) (atomic_inc(&(_b)->refcnt)) +#define blkif_put(_b) \ + do { \ + if (atomic_dec_and_test(&(_b)->refcnt)) \ + wake_up(&(_b)->waiting_to_free);\ + } while (0) + +/* Create a vbd. */ +int vbd_create(blkif_t *blkif, blkif_vdev_t vdevice, unsigned major, + unsigned minor, int readonly, int cdrom); +void vbd_free(struct vbd *vbd); + +unsigned long long vbd_size(struct vbd *vbd); +unsigned int vbd_info(struct vbd *vbd); +unsigned long vbd_secsize(struct vbd *vbd); + +struct phys_req { + unsigned short dev; + unsigned short nr_sects; + struct block_device *bdev; + blkif_sector_t sector_number; +}; + +int vbd_translate(struct phys_req *req, blkif_t *blkif, int operation); + +void blkif_interface_init(void); + +void blkif_xenbus_init(void); + +irqreturn_t blkif_be_int(int irq, void *dev_id, struct pt_regs *regs); +int blkif_schedule(void *arg); + +int blkback_barrier(struct xenbus_transaction xbt, + struct backend_info *be, int state); + +#endif /* __BLKIF__BACKEND__COMMON_H__ */ diff --git a/drivers/xen/blkback/interface.c b/drivers/xen/blkback/interface.c new file mode 100644 index 000000000000..81821bdc7ef1 --- /dev/null +++ b/drivers/xen/blkback/interface.c @@ -0,0 +1,181 @@ +/****************************************************************************** + * arch/xen/drivers/blkif/backend/interface.c + * + * Block-device interface management. 
+ * + * Copyright (c) 2004, Keir Fraser + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License version 2 + * as published by the Free Software Foundation; or, when distributed + * separately from the Linux kernel or incorporated into other + * software packages, subject to the following license: + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this source file (the "Software"), to deal in the Software without + * restriction, including without limitation the rights to use, copy, modify, + * merge, publish, distribute, sublicense, and/or sell copies of the Software, + * and to permit persons to whom the Software is furnished to do so, subject to + * the following conditions: + * + * The above copyright notice and this permission notice shall be included in + * all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING + * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS + * IN THE SOFTWARE. + */ + +#include "common.h" +#include +#include + +static kmem_cache_t *blkif_cachep; + +blkif_t *blkif_alloc(domid_t domid) +{ + blkif_t *blkif; + + blkif = kmem_cache_alloc(blkif_cachep, GFP_KERNEL); + if (!blkif) + return ERR_PTR(-ENOMEM); + + memset(blkif, 0, sizeof(*blkif)); + blkif->domid = domid; + spin_lock_init(&blkif->blk_ring_lock); + atomic_set(&blkif->refcnt, 1); + init_waitqueue_head(&blkif->wq); + blkif->st_print = jiffies; + init_waitqueue_head(&blkif->waiting_to_free); + + return blkif; +} + +static int map_frontend_page(blkif_t *blkif, unsigned long shared_page) +{ + struct gnttab_map_grant_ref op; + + gnttab_set_map_op(&op, (unsigned long)blkif->blk_ring_area->addr, + GNTMAP_host_map, shared_page, blkif->domid); + + if (HYPERVISOR_grant_table_op(GNTTABOP_map_grant_ref, &op, 1)) + BUG(); + + if (op.status) { + DPRINTK(" Grant table operation failure !\n"); + return op.status; + } + + blkif->shmem_ref = shared_page; + blkif->shmem_handle = op.handle; + + return 0; +} + +static void unmap_frontend_page(blkif_t *blkif) +{ + struct gnttab_unmap_grant_ref op; + + gnttab_set_unmap_op(&op, (unsigned long)blkif->blk_ring_area->addr, + GNTMAP_host_map, blkif->shmem_handle); + + if (HYPERVISOR_grant_table_op(GNTTABOP_unmap_grant_ref, &op, 1)) + BUG(); +} + +int blkif_map(blkif_t *blkif, unsigned long shared_page, unsigned int evtchn) +{ + int err; + + /* Already connected through? 
*/ + if (blkif->irq) + return 0; + + if ( (blkif->blk_ring_area = alloc_vm_area(PAGE_SIZE)) == NULL ) + return -ENOMEM; + + err = map_frontend_page(blkif, shared_page); + if (err) { + free_vm_area(blkif->blk_ring_area); + return err; + } + + switch (blkif->blk_protocol) { + case BLKIF_PROTOCOL_NATIVE: + { + blkif_sring_t *sring; + sring = (blkif_sring_t *)blkif->blk_ring_area->addr; + BACK_RING_INIT(&blkif->blk_rings.native, sring, PAGE_SIZE); + break; + } + case BLKIF_PROTOCOL_X86_32: + { + blkif_x86_32_sring_t *sring_x86_32; + sring_x86_32 = (blkif_x86_32_sring_t *)blkif->blk_ring_area->addr; + BACK_RING_INIT(&blkif->blk_rings.x86_32, sring_x86_32, PAGE_SIZE); + break; + } + case BLKIF_PROTOCOL_X86_64: + { + blkif_x86_64_sring_t *sring_x86_64; + sring_x86_64 = (blkif_x86_64_sring_t *)blkif->blk_ring_area->addr; + BACK_RING_INIT(&blkif->blk_rings.x86_64, sring_x86_64, PAGE_SIZE); + break; + } + default: + BUG(); + } + + err = bind_interdomain_evtchn_to_irqhandler( + blkif->domid, evtchn, blkif_be_int, 0, "blkif-backend", blkif); + if (err < 0) + { + unmap_frontend_page(blkif); + free_vm_area(blkif->blk_ring_area); + blkif->blk_rings.common.sring = NULL; + return err; + } + blkif->irq = err; + + return 0; +} + +void blkif_disconnect(blkif_t *blkif) +{ + if (blkif->xenblkd) { + kthread_stop(blkif->xenblkd); + blkif->xenblkd = NULL; + } + + atomic_dec(&blkif->refcnt); + wait_event(blkif->waiting_to_free, atomic_read(&blkif->refcnt) == 0); + atomic_inc(&blkif->refcnt); + + if (blkif->irq) { + unbind_from_irqhandler(blkif->irq, blkif); + blkif->irq = 0; + } + + if (blkif->blk_rings.common.sring) { + unmap_frontend_page(blkif); + free_vm_area(blkif->blk_ring_area); + blkif->blk_rings.common.sring = NULL; + } +} + +void blkif_free(blkif_t *blkif) +{ + if (!atomic_dec_and_test(&blkif->refcnt)) + BUG(); + kmem_cache_free(blkif_cachep, blkif); +} + +void __init blkif_interface_init(void) +{ + blkif_cachep = kmem_cache_create("blkif_cache", sizeof(blkif_t), + 0, 0, NULL, NULL); +} diff --git a/drivers/xen/blkback/vbd.c b/drivers/xen/blkback/vbd.c new file mode 100644 index 000000000000..1fb31d0236b4 --- /dev/null +++ b/drivers/xen/blkback/vbd.c @@ -0,0 +1,118 @@ +/****************************************************************************** + * blkback/vbd.c + * + * Routines for managing virtual block devices (VBDs). + * + * Copyright (c) 2003-2005, Keir Fraser & Steve Hand + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License version 2 + * as published by the Free Software Foundation; or, when distributed + * separately from the Linux kernel or incorporated into other + * software packages, subject to the following license: + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this source file (the "Software"), to deal in the Software without + * restriction, including without limitation the rights to use, copy, modify, + * merge, publish, distribute, sublicense, and/or sell copies of the Software, + * and to permit persons to whom the Software is furnished to do so, subject to + * the following conditions: + * + * The above copyright notice and this permission notice shall be included in + * all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. 
IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING + * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS + * IN THE SOFTWARE. + */ + +#include "common.h" + +#define vbd_sz(_v) ((_v)->bdev->bd_part ? \ + (_v)->bdev->bd_part->nr_sects : (_v)->bdev->bd_disk->capacity) + +unsigned long long vbd_size(struct vbd *vbd) +{ + return vbd_sz(vbd); +} + +unsigned int vbd_info(struct vbd *vbd) +{ + return vbd->type | (vbd->readonly?VDISK_READONLY:0); +} + +unsigned long vbd_secsize(struct vbd *vbd) +{ + return bdev_hardsect_size(vbd->bdev); +} + +int vbd_create(blkif_t *blkif, blkif_vdev_t handle, unsigned major, + unsigned minor, int readonly, int cdrom) +{ + struct vbd *vbd; + struct block_device *bdev; + + vbd = &blkif->vbd; + vbd->handle = handle; + vbd->readonly = readonly; + vbd->type = 0; + + vbd->pdevice = MKDEV(major, minor); + + bdev = open_by_devnum(vbd->pdevice, + vbd->readonly ? FMODE_READ : FMODE_WRITE); + + if (IS_ERR(bdev)) { + DPRINTK("vbd_creat: device %08x could not be opened.\n", + vbd->pdevice); + return -ENOENT; + } + + vbd->bdev = bdev; + + if (vbd->bdev->bd_disk == NULL) { + DPRINTK("vbd_creat: device %08x doesn't exist.\n", + vbd->pdevice); + vbd_free(vbd); + return -ENOENT; + } + + if (vbd->bdev->bd_disk->flags & GENHD_FL_CD || cdrom) + vbd->type |= VDISK_CDROM; + if (vbd->bdev->bd_disk->flags & GENHD_FL_REMOVABLE) + vbd->type |= VDISK_REMOVABLE; + + DPRINTK("Successful creation of handle=%04x (dom=%u)\n", + handle, blkif->domid); + return 0; +} + +void vbd_free(struct vbd *vbd) +{ + if (vbd->bdev) + blkdev_put(vbd->bdev); + vbd->bdev = NULL; +} + +int vbd_translate(struct phys_req *req, blkif_t *blkif, int operation) +{ + struct vbd *vbd = &blkif->vbd; + int rc = -EACCES; + + if ((operation != READ) && vbd->readonly) + goto out; + + if (unlikely((req->sector_number + req->nr_sects) > vbd_sz(vbd))) + goto out; + + req->dev = vbd->pdevice; + req->bdev = vbd->bdev; + rc = 0; + + out: + return rc; +} diff --git a/drivers/xen/blkback/xenbus.c b/drivers/xen/blkback/xenbus.c new file mode 100644 index 000000000000..80d9aa6e6ba3 --- /dev/null +++ b/drivers/xen/blkback/xenbus.c @@ -0,0 +1,541 @@ +/* Xenbus code for blkif backend + Copyright (C) 2005 Rusty Russell + Copyright (C) 2005 XenSource Ltd + + This program is free software; you can redistribute it and/or modify + it under the terms of the GNU General Public License as published by + the Free Software Foundation; either version 2 of the License, or + (at your option) any later version. + + This program is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + GNU General Public License for more details. + + You should have received a copy of the GNU General Public License + along with this program; if not, write to the Free Software + Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA +*/ + +#include +#include +#include +#include "common.h" + +#undef DPRINTK +#define DPRINTK(fmt, args...) 
\ + pr_debug("blkback/xenbus (%s:%d) " fmt ".\n", \ + __FUNCTION__, __LINE__, ##args) + +struct backend_info +{ + struct xenbus_device *dev; + blkif_t *blkif; + struct xenbus_watch backend_watch; + unsigned major; + unsigned minor; + char *mode; +}; + +static void connect(struct backend_info *); +static int connect_ring(struct backend_info *); +static void backend_changed(struct xenbus_watch *, const char **, + unsigned int); + +static int blkback_name(blkif_t *blkif, char *buf) +{ + char *devpath, *devname; + struct xenbus_device *dev = blkif->be->dev; + + devpath = xenbus_read(XBT_NIL, dev->nodename, "dev", NULL); + if (IS_ERR(devpath)) + return PTR_ERR(devpath); + + if ((devname = strstr(devpath, "/dev/")) != NULL) + devname += strlen("/dev/"); + else + devname = devpath; + + snprintf(buf, TASK_COMM_LEN, "blkback.%d.%s", blkif->domid, devname); + kfree(devpath); + + return 0; +} + +static void update_blkif_status(blkif_t *blkif) +{ + int err; + char name[TASK_COMM_LEN]; + + /* Not ready to connect? */ + if (!blkif->irq || !blkif->vbd.bdev) + return; + + /* Already connected? */ + if (blkif->be->dev->state == XenbusStateConnected) + return; + + /* Attempt to connect: exit if we fail to. */ + connect(blkif->be); + if (blkif->be->dev->state != XenbusStateConnected) + return; + + err = blkback_name(blkif, name); + if (err) { + xenbus_dev_error(blkif->be->dev, err, "get blkback dev name"); + return; + } + + blkif->xenblkd = kthread_run(blkif_schedule, blkif, name); + if (IS_ERR(blkif->xenblkd)) { + err = PTR_ERR(blkif->xenblkd); + blkif->xenblkd = NULL; + xenbus_dev_error(blkif->be->dev, err, "start xenblkd"); + } +} + + +/**************************************************************** + * sysfs interface for VBD I/O requests + */ + +#define VBD_SHOW(name, format, args...) 
\ + static ssize_t show_##name(struct device *_dev, \ + struct device_attribute *attr, \ + char *buf) \ + { \ + struct xenbus_device *dev = to_xenbus_device(_dev); \ + struct backend_info *be = dev->dev.driver_data; \ + \ + return sprintf(buf, format, ##args); \ + } \ + static DEVICE_ATTR(name, S_IRUGO, show_##name, NULL) + +VBD_SHOW(oo_req, "%d\n", be->blkif->st_oo_req); +VBD_SHOW(rd_req, "%d\n", be->blkif->st_rd_req); +VBD_SHOW(wr_req, "%d\n", be->blkif->st_wr_req); +VBD_SHOW(br_req, "%d\n", be->blkif->st_br_req); +VBD_SHOW(rd_sect, "%d\n", be->blkif->st_rd_sect); +VBD_SHOW(wr_sect, "%d\n", be->blkif->st_wr_sect); + +static struct attribute *vbdstat_attrs[] = { + &dev_attr_oo_req.attr, + &dev_attr_rd_req.attr, + &dev_attr_wr_req.attr, + &dev_attr_br_req.attr, + &dev_attr_rd_sect.attr, + &dev_attr_wr_sect.attr, + NULL +}; + +static struct attribute_group vbdstat_group = { + .name = "statistics", + .attrs = vbdstat_attrs, +}; + +VBD_SHOW(physical_device, "%x:%x\n", be->major, be->minor); +VBD_SHOW(mode, "%s\n", be->mode); + +int xenvbd_sysfs_addif(struct xenbus_device *dev) +{ + int error; + + error = device_create_file(&dev->dev, &dev_attr_physical_device); + if (error) + goto fail1; + + error = device_create_file(&dev->dev, &dev_attr_mode); + if (error) + goto fail2; + + error = sysfs_create_group(&dev->dev.kobj, &vbdstat_group); + if (error) + goto fail3; + + return 0; + +fail3: sysfs_remove_group(&dev->dev.kobj, &vbdstat_group); +fail2: device_remove_file(&dev->dev, &dev_attr_mode); +fail1: device_remove_file(&dev->dev, &dev_attr_physical_device); + return error; +} + +void xenvbd_sysfs_delif(struct xenbus_device *dev) +{ + sysfs_remove_group(&dev->dev.kobj, &vbdstat_group); + device_remove_file(&dev->dev, &dev_attr_mode); + device_remove_file(&dev->dev, &dev_attr_physical_device); +} + +static int blkback_remove(struct xenbus_device *dev) +{ + struct backend_info *be = dev->dev.driver_data; + + DPRINTK(""); + + if (be->major || be->minor) + xenvbd_sysfs_delif(dev); + + if (be->backend_watch.node) { + unregister_xenbus_watch(&be->backend_watch); + kfree(be->backend_watch.node); + be->backend_watch.node = NULL; + } + + if (be->blkif) { + blkif_disconnect(be->blkif); + vbd_free(&be->blkif->vbd); + blkif_free(be->blkif); + be->blkif = NULL; + } + + kfree(be); + dev->dev.driver_data = NULL; + return 0; +} + +int blkback_barrier(struct xenbus_transaction xbt, + struct backend_info *be, int state) +{ + struct xenbus_device *dev = be->dev; + int err; + + err = xenbus_printf(xbt, dev->nodename, "feature-barrier", + "%d", state); + if (err) + xenbus_dev_fatal(dev, err, "writing feature-barrier"); + + return err; +} + +/** + * Entry point to this code when a new device is created. Allocate the basic + * structures, and watch the store waiting for the hotplug scripts to tell us + * the device's physical major and minor numbers. Switch to InitWait. 
+ */ +static int blkback_probe(struct xenbus_device *dev, + const struct xenbus_device_id *id) +{ + int err; + struct backend_info *be = kzalloc(sizeof(struct backend_info), + GFP_KERNEL); + if (!be) { + xenbus_dev_fatal(dev, -ENOMEM, + "allocating backend structure"); + return -ENOMEM; + } + be->dev = dev; + dev->dev.driver_data = be; + + be->blkif = blkif_alloc(dev->otherend_id); + if (IS_ERR(be->blkif)) { + err = PTR_ERR(be->blkif); + be->blkif = NULL; + xenbus_dev_fatal(dev, err, "creating block interface"); + goto fail; + } + + /* setup back pointer */ + be->blkif->be = be; + + err = xenbus_watch_path2(dev, dev->nodename, "physical-device", + &be->backend_watch, backend_changed); + if (err) + goto fail; + + err = xenbus_switch_state(dev, XenbusStateInitWait); + if (err) + goto fail; + + return 0; + +fail: + DPRINTK("failed"); + blkback_remove(dev); + return err; +} + + +/** + * Callback received when the hotplug scripts have placed the physical-device + * node. Read it and the mode node, and create a vbd. If the frontend is + * ready, connect. + */ +static void backend_changed(struct xenbus_watch *watch, + const char **vec, unsigned int len) +{ + int err; + unsigned major; + unsigned minor; + struct backend_info *be + = container_of(watch, struct backend_info, backend_watch); + struct xenbus_device *dev = be->dev; + int cdrom = 0; + char *device_type; + + DPRINTK(""); + + err = xenbus_scanf(XBT_NIL, dev->nodename, "physical-device", "%x:%x", + &major, &minor); + if (XENBUS_EXIST_ERR(err)) { + /* Since this watch will fire once immediately after it is + registered, we expect this. Ignore it, and wait for the + hotplug scripts. */ + return; + } + if (err != 2) { + xenbus_dev_fatal(dev, err, "reading physical-device"); + return; + } + + if ((be->major || be->minor) && + ((be->major != major) || (be->minor != minor))) { + printk(KERN_WARNING + "blkback: changing physical device (from %x:%x to " + "%x:%x) not supported.\n", be->major, be->minor, + major, minor); + return; + } + + be->mode = xenbus_read(XBT_NIL, dev->nodename, "mode", NULL); + if (IS_ERR(be->mode)) { + err = PTR_ERR(be->mode); + be->mode = NULL; + xenbus_dev_fatal(dev, err, "reading mode"); + return; + } + + device_type = xenbus_read(XBT_NIL, dev->otherend, "device-type", NULL); + if (!IS_ERR(device_type)) { + cdrom = strcmp(device_type, "cdrom") == 0; + kfree(device_type); + } + + if (be->major == 0 && be->minor == 0) { + /* Front end dir is a number, which is used as the handle. */ + + char *p = strrchr(dev->otherend, '/') + 1; + long handle = simple_strtoul(p, NULL, 0); + + be->major = major; + be->minor = minor; + + err = vbd_create(be->blkif, handle, major, minor, + (NULL == strchr(be->mode, 'w')), cdrom); + if (err) { + be->major = be->minor = 0; + xenbus_dev_fatal(dev, err, "creating vbd structure"); + return; + } + + err = xenvbd_sysfs_addif(dev); + if (err) { + vbd_free(&be->blkif->vbd); + be->major = be->minor = 0; + xenbus_dev_fatal(dev, err, "creating sysfs entries"); + return; + } + + /* We're potentially connected now */ + update_blkif_status(be->blkif); + } +} + + +/** + * Callback received when the frontend's state changes. 
+ */ +static void frontend_changed(struct xenbus_device *dev, + enum xenbus_state frontend_state) +{ + struct backend_info *be = dev->dev.driver_data; + int err; + + DPRINTK("%s", xenbus_strstate(frontend_state)); + + switch (frontend_state) { + case XenbusStateInitialising: + if (dev->state == XenbusStateClosed) { + printk(KERN_INFO "%s: %s: prepare for reconnect\n", + __FUNCTION__, dev->nodename); + xenbus_switch_state(dev, XenbusStateInitWait); + } + break; + + case XenbusStateInitialised: + case XenbusStateConnected: + /* Ensure we connect even when two watches fire in + close successsion and we miss the intermediate value + of frontend_state. */ + if (dev->state == XenbusStateConnected) + break; + + err = connect_ring(be); + if (err) + break; + update_blkif_status(be->blkif); + break; + + case XenbusStateClosing: + blkif_disconnect(be->blkif); + xenbus_switch_state(dev, XenbusStateClosing); + break; + + case XenbusStateClosed: + xenbus_switch_state(dev, XenbusStateClosed); + if (xenbus_dev_is_online(dev)) + break; + /* fall through if not online */ + case XenbusStateUnknown: + device_unregister(&dev->dev); + break; + + default: + xenbus_dev_fatal(dev, -EINVAL, "saw state %d at frontend", + frontend_state); + break; + } +} + + +/* ** Connection ** */ + + +/** + * Write the physical details regarding the block device to the store, and + * switch to Connected state. + */ +static void connect(struct backend_info *be) +{ + struct xenbus_transaction xbt; + int err; + struct xenbus_device *dev = be->dev; + + DPRINTK("%s", dev->otherend); + + /* Supply the information about the device the frontend needs */ +again: + err = xenbus_transaction_start(&xbt); + if (err) { + xenbus_dev_fatal(dev, err, "starting transaction"); + return; + } + + err = blkback_barrier(xbt, be, 1); + if (err) + goto abort; + + err = xenbus_printf(xbt, dev->nodename, "sectors", "%llu", + vbd_size(&be->blkif->vbd)); + if (err) { + xenbus_dev_fatal(dev, err, "writing %s/sectors", + dev->nodename); + goto abort; + } + + /* FIXME: use a typename instead */ + err = xenbus_printf(xbt, dev->nodename, "info", "%u", + vbd_info(&be->blkif->vbd)); + if (err) { + xenbus_dev_fatal(dev, err, "writing %s/info", + dev->nodename); + goto abort; + } + err = xenbus_printf(xbt, dev->nodename, "sector-size", "%lu", + vbd_secsize(&be->blkif->vbd)); + if (err) { + xenbus_dev_fatal(dev, err, "writing %s/sector-size", + dev->nodename); + goto abort; + } + + err = xenbus_transaction_end(xbt, 0); + if (err == -EAGAIN) + goto again; + if (err) + xenbus_dev_fatal(dev, err, "ending transaction"); + + err = xenbus_switch_state(dev, XenbusStateConnected); + if (err) + xenbus_dev_fatal(dev, err, "switching to Connected state", + dev->nodename); + + return; + abort: + xenbus_transaction_end(xbt, 1); +} + + +static int connect_ring(struct backend_info *be) +{ + struct xenbus_device *dev = be->dev; + unsigned long ring_ref; + unsigned int evtchn; + char protocol[64] = ""; + int err; + + DPRINTK("%s", dev->otherend); + + err = xenbus_gather(XBT_NIL, dev->otherend, "ring-ref", "%lu", &ring_ref, + "event-channel", "%u", &evtchn, NULL); + if (err) { + xenbus_dev_fatal(dev, err, + "reading %s/ring-ref and event-channel", + dev->otherend); + return err; + } + + be->blkif->blk_protocol = BLKIF_PROTOCOL_NATIVE; + err = xenbus_gather(XBT_NIL, dev->otherend, "protocol", + "%63s", protocol, NULL); + if (err) + strcpy(protocol, "unspecified, assuming native"); + else if (0 == strcmp(protocol, XEN_IO_PROTO_ABI_NATIVE)) + be->blkif->blk_protocol = BLKIF_PROTOCOL_NATIVE; 
+ else if (0 == strcmp(protocol, XEN_IO_PROTO_ABI_X86_32)) + be->blkif->blk_protocol = BLKIF_PROTOCOL_X86_32; + else if (0 == strcmp(protocol, XEN_IO_PROTO_ABI_X86_64)) + be->blkif->blk_protocol = BLKIF_PROTOCOL_X86_64; + else { + xenbus_dev_fatal(dev, err, "unknown fe protocol %s", protocol); + return -1; + } + printk(KERN_INFO + "blkback: ring-ref %ld, event-channel %d, protocol %d (%s)\n", + ring_ref, evtchn, be->blkif->blk_protocol, protocol); + + /* Map the shared frame, irq etc. */ + err = blkif_map(be->blkif, ring_ref, evtchn); + if (err) { + xenbus_dev_fatal(dev, err, "mapping ring-ref %lu port %u", + ring_ref, evtchn); + return err; + } + + return 0; +} + + +/* ** Driver Registration ** */ + + +static const struct xenbus_device_id blkback_ids[] = { + { "vbd" }, + { "" } +}; + + +static struct xenbus_driver blkback = { + .name = "vbd", + .owner = THIS_MODULE, + .ids = blkback_ids, + .probe = blkback_probe, + .remove = blkback_remove, + .otherend_changed = frontend_changed +}; + + +void blkif_xenbus_init(void) +{ + xenbus_register_backend(&blkback); +} diff --git a/include/xen/blkif.h b/include/xen/blkif.h new file mode 100644 index 000000000000..3d56b75de909 --- /dev/null +++ b/include/xen/blkif.h @@ -0,0 +1,123 @@ +/* + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in + * all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING + * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER + * DEALINGS IN THE SOFTWARE. + */ + +#ifndef __XEN_BLKIF_H__ +#define __XEN_BLKIF_H__ + +#include +#include +#include + +/* Not a real protocol. Used to generate ring structs which contain + * the elements common to all protocols only. This way we get a + * compiler-checkable way to use common struct elements, so we can + * avoid using switch(protocol) in a number of places. */ +struct blkif_common_request { + char dummy; +}; +struct blkif_common_response { + char dummy; +}; + +/* i386 protocol version */ +#pragma pack(push, 4) +struct blkif_x86_32_request { + uint8_t operation; /* BLKIF_OP_??? */ + uint8_t nr_segments; /* number of segments */ + blkif_vdev_t handle; /* only for read/write requests */ + uint64_t id; /* private guest value, echoed in resp */ + blkif_sector_t sector_number;/* start sector idx on disk (r/w only) */ + struct blkif_request_segment seg[BLKIF_MAX_SEGMENTS_PER_REQUEST]; +}; +struct blkif_x86_32_response { + uint64_t id; /* copied from request */ + uint8_t operation; /* copied from request */ + int16_t status; /* BLKIF_RSP_??? 
*/ +}; +typedef struct blkif_x86_32_request blkif_x86_32_request_t; +typedef struct blkif_x86_32_response blkif_x86_32_response_t; +#pragma pack(pop) + +/* x86_64 protocol version */ +struct blkif_x86_64_request { + uint8_t operation; /* BLKIF_OP_??? */ + uint8_t nr_segments; /* number of segments */ + blkif_vdev_t handle; /* only for read/write requests */ + uint64_t __attribute__((__aligned__(8))) id; + blkif_sector_t sector_number;/* start sector idx on disk (r/w only) */ + struct blkif_request_segment seg[BLKIF_MAX_SEGMENTS_PER_REQUEST]; +}; +struct blkif_x86_64_response { + uint64_t __attribute__((__aligned__(8))) id; + uint8_t operation; /* copied from request */ + int16_t status; /* BLKIF_RSP_??? */ +}; +typedef struct blkif_x86_64_request blkif_x86_64_request_t; +typedef struct blkif_x86_64_response blkif_x86_64_response_t; + +DEFINE_RING_TYPES(blkif_common, struct blkif_common_request, struct blkif_common_response); +DEFINE_RING_TYPES(blkif_x86_32, struct blkif_x86_32_request, struct blkif_x86_32_response); +DEFINE_RING_TYPES(blkif_x86_64, struct blkif_x86_64_request, struct blkif_x86_64_response); + +union blkif_back_rings { + blkif_back_ring_t native; + blkif_common_back_ring_t common; + blkif_x86_32_back_ring_t x86_32; + blkif_x86_64_back_ring_t x86_64; +}; +typedef union blkif_back_rings blkif_back_rings_t; + +enum blkif_protocol { + BLKIF_PROTOCOL_NATIVE = 1, + BLKIF_PROTOCOL_X86_32 = 2, + BLKIF_PROTOCOL_X86_64 = 3, +}; + +static void inline blkif_get_x86_32_req(blkif_request_t *dst, blkif_x86_32_request_t *src) +{ + int i, n = BLKIF_MAX_SEGMENTS_PER_REQUEST; + dst->operation = src->operation; + dst->nr_segments = src->nr_segments; + dst->handle = src->handle; + dst->id = src->id; + dst->sector_number = src->sector_number; + barrier(); + if (n > dst->nr_segments) + n = dst->nr_segments; + for (i = 0; i < n; i++) + dst->seg[i] = src->seg[i]; +} + +static void inline blkif_get_x86_64_req(blkif_request_t *dst, blkif_x86_64_request_t *src) +{ + int i, n = BLKIF_MAX_SEGMENTS_PER_REQUEST; + dst->operation = src->operation; + dst->nr_segments = src->nr_segments; + dst->handle = src->handle; + dst->id = src->id; + dst->sector_number = src->sector_number; + barrier(); + if (n > dst->nr_segments) + n = dst->nr_segments; + for (i = 0; i < n; i++) + dst->seg[i] = src->seg[i]; +} + +#endif /* __XEN_BLKIF_H__ */ From 8812293323a79134e06c3bf82eba1e217d23382e Mon Sep 17 00:00:00 2001 From: Jeremy Fitzhardinge Date: Mon, 9 Feb 2009 12:05:51 -0800 Subject: [PATCH 002/110] xen-blkback-porting --- drivers/xen/blkback/blkback.c | 30 ++++++++++++++++-------------- drivers/xen/blkback/common.h | 9 ++++----- drivers/xen/blkback/interface.c | 19 ++++++++++--------- drivers/xen/blkback/vbd.c | 4 ++-- drivers/xen/blkback/xenbus.c | 7 ++++--- include/xen/blkif.h | 13 ++++++------- 6 files changed, 42 insertions(+), 40 deletions(-) diff --git a/drivers/xen/blkback/blkback.c b/drivers/xen/blkback/blkback.c index 5b8d50e344b4..43fd07091d4d 100644 --- a/drivers/xen/blkback/blkback.c +++ b/drivers/xen/blkback/blkback.c @@ -39,8 +39,12 @@ #include #include #include +#include #include -#include +#include +#include +#include +#include #include "common.h" /* @@ -106,7 +110,7 @@ static inline unsigned long vaddr(pending_req_t *req, int seg) static int do_block_io_op(blkif_t *blkif); static void dispatch_rw_block_io(blkif_t *blkif, - blkif_request_t *req, + struct blkif_request *req, pending_req_t *pending_req); static void make_response(blkif_t *blkif, u64 id, unsigned short op, int st); @@ -153,7 +157,7 @@ 
static void unplug_queue(blkif_t *blkif) static void plug_queue(blkif_t *blkif, struct block_device *bdev) { - request_queue_t *q = bdev_get_queue(bdev); + struct request_queue *q = bdev_get_queue(bdev); if (q == blkif->plug) return; @@ -268,13 +272,10 @@ static void __end_block_io_op(pending_req_t *pending_req, int error) } } -static int end_block_io_op(struct bio *bio, unsigned int done, int error) +static void end_block_io_op(struct bio *bio, int error) { - if (bio->bi_size != 0) - return 1; __end_block_io_op(bio->bi_private, error); bio_put(bio); - return error; } @@ -288,7 +289,7 @@ static void blkif_notify_work(blkif_t *blkif) wake_up(&blkif->wq); } -irqreturn_t blkif_be_int(int irq, void *dev_id, struct pt_regs *regs) +irqreturn_t blkif_be_int(int irq, void *dev_id) { blkif_notify_work(dev_id); return IRQ_HANDLED; @@ -302,8 +303,8 @@ irqreturn_t blkif_be_int(int irq, void *dev_id, struct pt_regs *regs) static int do_block_io_op(blkif_t *blkif) { - blkif_back_rings_t *blk_rings = &blkif->blk_rings; - blkif_request_t req; + union blkif_back_rings *blk_rings = &blkif->blk_rings; + struct blkif_request req; pending_req_t *pending_req; RING_IDX rc, rp; int more_to_do = 0; @@ -379,7 +380,7 @@ static int do_block_io_op(blkif_t *blkif) } static void dispatch_rw_block_io(blkif_t *blkif, - blkif_request_t *req, + struct blkif_request *req, pending_req_t *pending_req) { extern void ll_rw_block(int rw, int nr, struct buffer_head * bhs[]); @@ -560,9 +561,9 @@ static void dispatch_rw_block_io(blkif_t *blkif, static void make_response(blkif_t *blkif, u64 id, unsigned short op, int st) { - blkif_response_t resp; + struct blkif_response resp; unsigned long flags; - blkif_back_rings_t *blk_rings = &blkif->blk_rings; + union blkif_back_rings *blk_rings = &blkif->blk_rings; int more_to_do = 0; int notify; @@ -614,7 +615,8 @@ static int __init blkif_init(void) { int i, mmap_pages; - if (!is_running_on_xen()) + printk(KERN_CRIT "***blkif_init\n"); + if (!xen_pv_domain()) return -ENODEV; mmap_pages = blkif_reqs * BLKIF_MAX_SEGMENTS_PER_REQUEST; diff --git a/drivers/xen/blkback/common.h b/drivers/xen/blkback/common.h index 422e935528be..1c422b00974e 100644 --- a/drivers/xen/blkback/common.h +++ b/drivers/xen/blkback/common.h @@ -40,8 +40,7 @@ #include #include #include -#include -#include +#include #include #define DPRINTK(_f, _a...) \ @@ -66,7 +65,7 @@ typedef struct blkif_st { unsigned int irq; /* Comms information. */ enum blkif_protocol blk_protocol; - blkif_back_rings_t blk_rings; + union blkif_back_rings blk_rings; struct vm_struct *blk_ring_area; /* The VBD attached to this interface. 
*/ struct vbd vbd; @@ -79,7 +78,7 @@ typedef struct blkif_st { wait_queue_head_t wq; struct task_struct *xenblkd; unsigned int waiting_reqs; - request_queue_t *plug; + struct request_queue *plug; /* statistics */ unsigned long st_print; @@ -130,7 +129,7 @@ void blkif_interface_init(void); void blkif_xenbus_init(void); -irqreturn_t blkif_be_int(int irq, void *dev_id, struct pt_regs *regs); +irqreturn_t blkif_be_int(int irq, void *dev_id); int blkif_schedule(void *arg); int blkback_barrier(struct xenbus_transaction xbt, diff --git a/drivers/xen/blkback/interface.c b/drivers/xen/blkback/interface.c index 81821bdc7ef1..c6c3e14776b9 100644 --- a/drivers/xen/blkback/interface.c +++ b/drivers/xen/blkback/interface.c @@ -31,10 +31,11 @@ */ #include "common.h" -#include +#include +#include #include -static kmem_cache_t *blkif_cachep; +static struct kmem_cache *blkif_cachep; blkif_t *blkif_alloc(domid_t domid) { @@ -107,22 +108,22 @@ int blkif_map(blkif_t *blkif, unsigned long shared_page, unsigned int evtchn) switch (blkif->blk_protocol) { case BLKIF_PROTOCOL_NATIVE: { - blkif_sring_t *sring; - sring = (blkif_sring_t *)blkif->blk_ring_area->addr; + struct blkif_sring *sring; + sring = (struct blkif_sring *)blkif->blk_ring_area->addr; BACK_RING_INIT(&blkif->blk_rings.native, sring, PAGE_SIZE); break; } case BLKIF_PROTOCOL_X86_32: { - blkif_x86_32_sring_t *sring_x86_32; - sring_x86_32 = (blkif_x86_32_sring_t *)blkif->blk_ring_area->addr; + struct blkif_x86_32_sring *sring_x86_32; + sring_x86_32 = (struct blkif_x86_32_sring *)blkif->blk_ring_area->addr; BACK_RING_INIT(&blkif->blk_rings.x86_32, sring_x86_32, PAGE_SIZE); break; } case BLKIF_PROTOCOL_X86_64: { - blkif_x86_64_sring_t *sring_x86_64; - sring_x86_64 = (blkif_x86_64_sring_t *)blkif->blk_ring_area->addr; + struct blkif_x86_64_sring *sring_x86_64; + sring_x86_64 = (struct blkif_x86_64_sring *)blkif->blk_ring_area->addr; BACK_RING_INIT(&blkif->blk_rings.x86_64, sring_x86_64, PAGE_SIZE); break; } @@ -177,5 +178,5 @@ void blkif_free(blkif_t *blkif) void __init blkif_interface_init(void) { blkif_cachep = kmem_cache_create("blkif_cache", sizeof(blkif_t), - 0, 0, NULL, NULL); + 0, 0, NULL); } diff --git a/drivers/xen/blkback/vbd.c b/drivers/xen/blkback/vbd.c index 1fb31d0236b4..7e9a1cd35ade 100644 --- a/drivers/xen/blkback/vbd.c +++ b/drivers/xen/blkback/vbd.c @@ -33,7 +33,7 @@ #include "common.h" #define vbd_sz(_v) ((_v)->bdev->bd_part ? \ - (_v)->bdev->bd_part->nr_sects : (_v)->bdev->bd_disk->capacity) + (_v)->bdev->bd_part->nr_sects : get_capacity((_v)->bdev->bd_disk)) unsigned long long vbd_size(struct vbd *vbd) { @@ -94,7 +94,7 @@ int vbd_create(blkif_t *blkif, blkif_vdev_t handle, unsigned major, void vbd_free(struct vbd *vbd) { if (vbd->bdev) - blkdev_put(vbd->bdev); + blkdev_put(vbd->bdev, vbd->readonly ? 
FMODE_READ : FMODE_WRITE); vbd->bdev = NULL; } diff --git a/drivers/xen/blkback/xenbus.c b/drivers/xen/blkback/xenbus.c index 80d9aa6e6ba3..650f4b3e9b3c 100644 --- a/drivers/xen/blkback/xenbus.c +++ b/drivers/xen/blkback/xenbus.c @@ -238,8 +238,8 @@ static int blkback_probe(struct xenbus_device *dev, /* setup back pointer */ be->blkif->be = be; - err = xenbus_watch_path2(dev, dev->nodename, "physical-device", - &be->backend_watch, backend_changed); + err = xenbus_watch_pathfmt(dev, &be->backend_watch, backend_changed, + "%s/%s", dev->nodename, "physical-device"); if (err) goto fail; @@ -537,5 +537,6 @@ static struct xenbus_driver blkback = { void blkif_xenbus_init(void) { - xenbus_register_backend(&blkback); + /* XXX must_check */ + (void)xenbus_register_backend(&blkback); } diff --git a/include/xen/blkif.h b/include/xen/blkif.h index 3d56b75de909..d27428046918 100644 --- a/include/xen/blkif.h +++ b/include/xen/blkif.h @@ -77,12 +77,11 @@ DEFINE_RING_TYPES(blkif_x86_32, struct blkif_x86_32_request, struct blkif_x86_32 DEFINE_RING_TYPES(blkif_x86_64, struct blkif_x86_64_request, struct blkif_x86_64_response); union blkif_back_rings { - blkif_back_ring_t native; - blkif_common_back_ring_t common; - blkif_x86_32_back_ring_t x86_32; - blkif_x86_64_back_ring_t x86_64; + struct blkif_back_ring native; + struct blkif_common_back_ring common; + struct blkif_x86_32_back_ring x86_32; + struct blkif_x86_64_back_ring x86_64; }; -typedef union blkif_back_rings blkif_back_rings_t; enum blkif_protocol { BLKIF_PROTOCOL_NATIVE = 1, @@ -90,7 +89,7 @@ enum blkif_protocol { BLKIF_PROTOCOL_X86_64 = 3, }; -static void inline blkif_get_x86_32_req(blkif_request_t *dst, blkif_x86_32_request_t *src) +static void inline blkif_get_x86_32_req(struct blkif_request *dst, struct blkif_x86_32_request *src) { int i, n = BLKIF_MAX_SEGMENTS_PER_REQUEST; dst->operation = src->operation; @@ -105,7 +104,7 @@ static void inline blkif_get_x86_32_req(blkif_request_t *dst, blkif_x86_32_reque dst->seg[i] = src->seg[i]; } -static void inline blkif_get_x86_64_req(blkif_request_t *dst, blkif_x86_64_request_t *src) +static void inline blkif_get_x86_64_req(struct blkif_request *dst, struct blkif_x86_64_request *src) { int i, n = BLKIF_MAX_SEGMENTS_PER_REQUEST; dst->operation = src->operation; From dd3672424caa7b302433635831afbb6787476b96 Mon Sep 17 00:00:00 2001 From: Jeremy Fitzhardinge Date: Mon, 9 Feb 2009 16:39:58 -0800 Subject: [PATCH 003/110] xen/blkback: don't include xen/evtchn.h It's a user-mode header for users of /dev/evtchn Signed-off-by: Jeremy Fitzhardinge --- drivers/xen/blkback/common.h | 1 - 1 file changed, 1 deletion(-) diff --git a/drivers/xen/blkback/common.h b/drivers/xen/blkback/common.h index 1c422b00974e..57b78250cfb7 100644 --- a/drivers/xen/blkback/common.h +++ b/drivers/xen/blkback/common.h @@ -37,7 +37,6 @@ #include #include #include -#include #include #include #include From 8270b45bc8a45eef4a224bd256bd0997d4fd857e Mon Sep 17 00:00:00 2001 From: Keir Fraser Date: Fri, 6 Mar 2009 08:29:15 +0000 Subject: [PATCH 004/110] blkback: Fix potential resource leak. 
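The leak here is one of ordering: in the loop patched below, a pending_req used to be pulled off the free list before kthread_should_stop() was consulted, so a stop request arriving at that moment abandoned the freshly allocated descriptor. The fix simply tests the stop condition first. As a rough, free-standing illustration of that check-before-allocate pattern (plain user-space C with made-up names, not blkback code):

    /* check-before-allocate: hypothetical user-space model, not blkback code */
    #include <stdio.h>
    #include <stdlib.h>
    #include <stdbool.h>

    struct req { int id; };

    static int remaining = 3;                 /* pretend three ring entries are pending */
    static bool should_stop(void) { return remaining == 0; }

    static struct req *alloc_req(void) { return malloc(sizeof(struct req)); }
    static void free_req(struct req *r) { free(r); }

    int main(void)
    {
        while (1) {
            /* Test the stop condition first: nothing has been taken from
             * the pool yet, so breaking out here leaks nothing. */
            if (should_stop())
                break;

            struct req *r = alloc_req();
            if (!r)
                break;

            printf("processing request %d\n", remaining--);
            free_req(r);
        }
        return 0;
    }

The point of the sketch is only that the early exit happens while nothing is held; the real loop keeps the pending_req until the I/O completes.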
--- drivers/xen/blkback/blkback.c | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/drivers/xen/blkback/blkback.c b/drivers/xen/blkback/blkback.c index 43fd07091d4d..8d988f4513aa 100644 --- a/drivers/xen/blkback/blkback.c +++ b/drivers/xen/blkback/blkback.c @@ -318,14 +318,14 @@ static int do_block_io_op(blkif_t *blkif) if (RING_REQUEST_CONS_OVERFLOW(&blk_rings->common, rc)) break; - pending_req = alloc_req(); - if (NULL == pending_req) { - blkif->st_oo_req++; + if (kthread_should_stop()) { more_to_do = 1; break; } - if (kthread_should_stop()) { + pending_req = alloc_req(); + if (NULL == pending_req) { + blkif->st_oo_req++; more_to_do = 1; break; } From 690f1b63b2db88330834d8482f3b125990c8e609 Mon Sep 17 00:00:00 2001 From: Jeremy Fitzhardinge Date: Sat, 21 Mar 2009 23:34:19 -0700 Subject: [PATCH 005/110] block: export blk_get/put_queue for blkback Impact: build fix I'm not sure if blkback should be using these functions, but in the meantime export them to allow blkback to be a module. Signed-off-by: Jeremy Fitzhardinge --- block/blk-core.c | 2 ++ 1 file changed, 2 insertions(+) diff --git a/block/blk-core.c b/block/blk-core.c index 90f22cc30799..9b60e69a5400 100644 --- a/block/blk-core.c +++ b/block/blk-core.c @@ -351,6 +351,7 @@ void blk_put_queue(struct request_queue *q) { kobject_put(&q->kobj); } +EXPORT_SYMBOL_GPL(blk_put_queue); /* * Note: If a driver supplied the queue lock, it should not zap that lock @@ -572,6 +573,7 @@ int blk_get_queue(struct request_queue *q) return 1; } +EXPORT_SYMBOL_GPL(blk_get_queue); static inline void blk_free_request(struct request_queue *q, struct request *rq) { From 05d43865ddc00bdb33d12c8e9d9f176ed5d3797b Mon Sep 17 00:00:00 2001 From: Jeremy Fitzhardinge Date: Mon, 29 Jun 2009 14:58:45 -0700 Subject: [PATCH 006/110] xen/blkback: deal with hardsect_size to logical_block_size rename Signed-off-by: Jeremy Fitzhardinge --- drivers/xen/blkback/blkback.c | 2 +- drivers/xen/blkback/vbd.c | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/drivers/xen/blkback/blkback.c b/drivers/xen/blkback/blkback.c index 8d988f4513aa..ac5af91c393f 100644 --- a/drivers/xen/blkback/blkback.c +++ b/drivers/xen/blkback/blkback.c @@ -484,7 +484,7 @@ static void dispatch_rw_block_io(blkif_t *blkif, for (i = 0; i < nseg; i++) { if (((int)preq.sector_number|(int)seg[i].nsec) & - ((bdev_hardsect_size(preq.bdev) >> 9) - 1)) { + ((bdev_logical_block_size(preq.bdev) >> 9) - 1)) { DPRINTK("Misaligned I/O request from domain %d", blkif->domid); goto fail_put_bio; diff --git a/drivers/xen/blkback/vbd.c b/drivers/xen/blkback/vbd.c index 7e9a1cd35ade..410c2eac5ad7 100644 --- a/drivers/xen/blkback/vbd.c +++ b/drivers/xen/blkback/vbd.c @@ -47,7 +47,7 @@ unsigned int vbd_info(struct vbd *vbd) unsigned long vbd_secsize(struct vbd *vbd) { - return bdev_hardsect_size(vbd->bdev); + return bdev_logical_block_size(vbd->bdev); } int vbd_create(blkif_t *blkif, blkif_vdev_t handle, unsigned major, From 0660c7dbf228a06345392a64ebb43734875a3b91 Mon Sep 17 00:00:00 2001 From: Jeremy Fitzhardinge Date: Wed, 9 Sep 2009 15:15:16 -0700 Subject: [PATCH 007/110] xen/blkback: remove spurious debug output noise Signed-off-by: Jeremy Fitzhardinge --- drivers/xen/blkback/blkback.c | 1 - 1 file changed, 1 deletion(-) diff --git a/drivers/xen/blkback/blkback.c b/drivers/xen/blkback/blkback.c index ac5af91c393f..31458bd07252 100644 --- a/drivers/xen/blkback/blkback.c +++ b/drivers/xen/blkback/blkback.c @@ -615,7 +615,6 @@ static int __init blkif_init(void) { int i, mmap_pages; 
- printk(KERN_CRIT "***blkif_init\n"); if (!xen_pv_domain()) return -ENODEV; From afd91d07ff72919071e37086c0664384b3875688 Mon Sep 17 00:00:00 2001 From: Jeremy Fitzhardinge Date: Tue, 15 Sep 2009 14:12:37 -0700 Subject: [PATCH 008/110] xen/blkback: little cleanups Remove unused local prototype; group headers. Signed-off-by: Jeremy Fitzhardinge --- drivers/xen/blkback/blkback.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/xen/blkback/blkback.c b/drivers/xen/blkback/blkback.c index 31458bd07252..e9e3de119a73 100644 --- a/drivers/xen/blkback/blkback.c +++ b/drivers/xen/blkback/blkback.c @@ -40,6 +40,7 @@ #include #include #include + #include #include #include @@ -383,7 +384,6 @@ static void dispatch_rw_block_io(blkif_t *blkif, struct blkif_request *req, pending_req_t *pending_req) { - extern void ll_rw_block(int rw, int nr, struct buffer_head * bhs[]); struct gnttab_map_grant_ref map[BLKIF_MAX_SEGMENTS_PER_REQUEST]; struct phys_req preq; struct { From 8770b2683f9f98d4c1d6caf2e28f625592bba4f3 Mon Sep 17 00:00:00 2001 From: Konrad Rzeszutek Wilk Date: Thu, 8 Oct 2009 13:23:09 -0400 Subject: [PATCH 009/110] Fix compile warnings: ignoring return value of 'xenbus_register_backend' .. We neglect to check the return value of xenbus_register_backend and take actions when that fails. This patch fixes that and adds code to deal with those type of failures. Signed-off-by: Konrad Rzeszutek Wilk Signed-off-by: Jeremy Fitzhardinge --- drivers/xen/blkback/blkback.c | 18 +++++++++++++----- drivers/xen/blkback/common.h | 4 ++-- drivers/xen/blkback/interface.c | 6 +++++- drivers/xen/blkback/xenbus.c | 5 ++--- 4 files changed, 22 insertions(+), 11 deletions(-) diff --git a/drivers/xen/blkback/blkback.c b/drivers/xen/blkback/blkback.c index e9e3de119a73..a2ac7189cc0a 100644 --- a/drivers/xen/blkback/blkback.c +++ b/drivers/xen/blkback/blkback.c @@ -614,6 +614,7 @@ static void make_response(blkif_t *blkif, u64 id, static int __init blkif_init(void) { int i, mmap_pages; + int rc = 0; if (!xen_pv_domain()) return -ENODEV; @@ -626,13 +627,17 @@ static int __init blkif_init(void) mmap_pages, GFP_KERNEL); pending_pages = alloc_empty_pages_and_pagevec(mmap_pages); - if (!pending_reqs || !pending_grant_handles || !pending_pages) + if (!pending_reqs || !pending_grant_handles || !pending_pages) { + rc = -ENOMEM; goto out_of_memory; + } for (i = 0; i < mmap_pages; i++) pending_grant_handles[i] = BLKBACK_INVALID_HANDLE; - blkif_interface_init(); + rc = blkif_interface_init(); + if (rc) + goto failed_init; memset(pending_reqs, 0, sizeof(pending_reqs)); INIT_LIST_HEAD(&pending_free); @@ -640,16 +645,19 @@ static int __init blkif_init(void) for (i = 0; i < blkif_reqs; i++) list_add_tail(&pending_reqs[i].free_list, &pending_free); - blkif_xenbus_init(); + rc = blkif_xenbus_init(); + if (rc) + goto failed_init; return 0; out_of_memory: + printk(KERN_ERR "%s: out of memory\n", __func__); + failed_init: kfree(pending_reqs); kfree(pending_grant_handles); free_empty_pages_and_pagevec(pending_pages, mmap_pages); - printk("%s: out of memory\n", __FUNCTION__); - return -ENOMEM; + return rc; } module_init(blkif_init); diff --git a/drivers/xen/blkback/common.h b/drivers/xen/blkback/common.h index 57b78250cfb7..aaf36485bc01 100644 --- a/drivers/xen/blkback/common.h +++ b/drivers/xen/blkback/common.h @@ -124,9 +124,9 @@ struct phys_req { int vbd_translate(struct phys_req *req, blkif_t *blkif, int operation); -void blkif_interface_init(void); +int blkif_interface_init(void); -void blkif_xenbus_init(void); +int 
blkif_xenbus_init(void); irqreturn_t blkif_be_int(int irq, void *dev_id); int blkif_schedule(void *arg); diff --git a/drivers/xen/blkback/interface.c b/drivers/xen/blkback/interface.c index c6c3e14776b9..e397a4134f1b 100644 --- a/drivers/xen/blkback/interface.c +++ b/drivers/xen/blkback/interface.c @@ -175,8 +175,12 @@ void blkif_free(blkif_t *blkif) kmem_cache_free(blkif_cachep, blkif); } -void __init blkif_interface_init(void) +int __init blkif_interface_init(void) { blkif_cachep = kmem_cache_create("blkif_cache", sizeof(blkif_t), 0, 0, NULL); + if (!blkif_cachep) + return -ENOMEM; + + return 0; } diff --git a/drivers/xen/blkback/xenbus.c b/drivers/xen/blkback/xenbus.c index 650f4b3e9b3c..04c0a12aff36 100644 --- a/drivers/xen/blkback/xenbus.c +++ b/drivers/xen/blkback/xenbus.c @@ -535,8 +535,7 @@ static struct xenbus_driver blkback = { }; -void blkif_xenbus_init(void) +int blkif_xenbus_init(void) { - /* XXX must_check */ - (void)xenbus_register_backend(&blkback); + return xenbus_register_backend(&blkback); } From e7579a99b598f8e4a2b4df4854fbda2cc961bb02 Mon Sep 17 00:00:00 2001 From: Ian Campbell Date: Thu, 3 Dec 2009 21:56:18 +0000 Subject: [PATCH 010/110] xen: rename blkbk module xen-blkback. blkbk is rather generic for a modular distro style kernel. Signed-off-by: Ian Campbell Cc: Jeremy Fitzhardinge Signed-off-by: Jeremy Fitzhardinge --- drivers/xen/blkback/Makefile | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/drivers/xen/blkback/Makefile b/drivers/xen/blkback/Makefile index 8bab63da3b3e..f1ae1ff07a4d 100644 --- a/drivers/xen/blkback/Makefile +++ b/drivers/xen/blkback/Makefile @@ -1,3 +1,3 @@ -obj-$(CONFIG_XEN_BLKDEV_BACKEND) := blkbk.o +obj-$(CONFIG_XEN_BLKDEV_BACKEND) := xen-blkback.o -blkbk-y := blkback.o xenbus.o interface.o vbd.o +xen-blkback-y := blkback.o xenbus.o interface.o vbd.o From 5cf6e4f6f6d5549904db6ecb3ffd5b8f71f41250 Mon Sep 17 00:00:00 2001 From: Jeremy Fitzhardinge Date: Thu, 11 Feb 2010 16:07:31 -0800 Subject: [PATCH 011/110] xen/blkback: use drv_get/set_drvdata rather than directly accessing driver_data. Direct driver_data access is obsolete and will disappear. 
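dev_get_drvdata() and dev_set_drvdata() put the per-device private pointer behind accessors so drivers stop depending on where struct device keeps it. A minimal model of that accessor pattern, with stand-in types rather than the real <linux/device.h> definitions:

    /* Accessor pattern for per-device private data -- hypothetical model. */
    #include <stdio.h>

    struct device {
        void *driver_data;          /* treated as private to the core */
    };

    static void *dev_get_drvdata(const struct device *dev)
    {
        return dev->driver_data;
    }

    static void dev_set_drvdata(struct device *dev, void *data)
    {
        dev->driver_data = data;
    }

    struct backend_info { int id; };

    int main(void)
    {
        struct device dev = { 0 };
        struct backend_info be = { .id = 42 };

        dev_set_drvdata(&dev, &be);                      /* probe: stash the backend */
        struct backend_info *p = dev_get_drvdata(&dev);  /* later: fetch it back */
        printf("backend id %d\n", p->id);
        dev_set_drvdata(&dev, NULL);                     /* remove: clear the pointer */
        return 0;
    }

In the hunks below, blkback_probe(), blkback_remove() and frontend_changed() only ever go through the helpers, so a later relocation of the field inside the driver core cannot break them.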
Signed-off-by: Jeremy Fitzhardinge --- drivers/xen/blkback/xenbus.c | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/drivers/xen/blkback/xenbus.c b/drivers/xen/blkback/xenbus.c index 04c0a12aff36..34f8e4046578 100644 --- a/drivers/xen/blkback/xenbus.c +++ b/drivers/xen/blkback/xenbus.c @@ -105,7 +105,7 @@ static void update_blkif_status(blkif_t *blkif) char *buf) \ { \ struct xenbus_device *dev = to_xenbus_device(_dev); \ - struct backend_info *be = dev->dev.driver_data; \ + struct backend_info *be = dev_get_drvdata(&dev->dev); \ \ return sprintf(buf, format, ##args); \ } \ @@ -169,7 +169,7 @@ void xenvbd_sysfs_delif(struct xenbus_device *dev) static int blkback_remove(struct xenbus_device *dev) { - struct backend_info *be = dev->dev.driver_data; + struct backend_info *be = dev_get_drvdata(&dev->dev); DPRINTK(""); @@ -190,7 +190,7 @@ static int blkback_remove(struct xenbus_device *dev) } kfree(be); - dev->dev.driver_data = NULL; + dev_set_drvdata(&dev->dev, NULL); return 0; } @@ -225,7 +225,7 @@ static int blkback_probe(struct xenbus_device *dev, return -ENOMEM; } be->dev = dev; - dev->dev.driver_data = be; + dev_set_drvdata(&dev->dev, be); be->blkif = blkif_alloc(dev->otherend_id); if (IS_ERR(be->blkif)) { @@ -348,7 +348,7 @@ static void backend_changed(struct xenbus_watch *watch, static void frontend_changed(struct xenbus_device *dev, enum xenbus_state frontend_state) { - struct backend_info *be = dev->dev.driver_data; + struct backend_info *be = dev_get_drvdata(&dev->dev); int err; DPRINTK("%s", xenbus_strstate(frontend_state)); From 2ccbfe26c106a1a93a402567b7853c1484c4a0b0 Mon Sep 17 00:00:00 2001 From: "K. Y. Srinivasan" Date: Thu, 11 Mar 2010 13:39:50 -0800 Subject: [PATCH 012/110] xen/blkback: Propagate changed size of VBDs Support dynamic resizing of virtual block devices. This patch supports both file backed block devices as well as physical devices that can be dynamically resized on the host side. Signed-off-by: K. Y. 
Srinivasan Signed-off-by: Jeremy Fitzhardinge --- drivers/xen/blkback/blkback.c | 3 +++ drivers/xen/blkback/common.h | 2 ++ drivers/xen/blkback/vbd.c | 43 +++++++++++++++++++++++++++++++++++ 3 files changed, 48 insertions(+) diff --git a/drivers/xen/blkback/blkback.c b/drivers/xen/blkback/blkback.c index a2ac7189cc0a..6d897664802d 100644 --- a/drivers/xen/blkback/blkback.c +++ b/drivers/xen/blkback/blkback.c @@ -207,6 +207,7 @@ static void print_stats(blkif_t *blkif) int blkif_schedule(void *arg) { blkif_t *blkif = arg; + struct vbd *vbd = &blkif->vbd; blkif_get(blkif); @@ -216,6 +217,8 @@ int blkif_schedule(void *arg) while (!kthread_should_stop()) { if (try_to_freeze()) continue; + if (unlikely(vbd->size != vbd_size(vbd))) + vbd_resize(blkif); wait_event_interruptible( blkif->wq, diff --git a/drivers/xen/blkback/common.h b/drivers/xen/blkback/common.h index aaf36485bc01..cebcc2b7e9f6 100644 --- a/drivers/xen/blkback/common.h +++ b/drivers/xen/blkback/common.h @@ -52,6 +52,7 @@ struct vbd { unsigned char type; /* VDISK_xxx */ u32 pdevice; /* phys device that this vbd maps to */ struct block_device *bdev; + sector_t size; /* Cached size parameter */ }; struct backend_info; @@ -98,6 +99,7 @@ blkif_t *blkif_alloc(domid_t domid); void blkif_disconnect(blkif_t *blkif); void blkif_free(blkif_t *blkif); int blkif_map(blkif_t *blkif, unsigned long shared_page, unsigned int evtchn); +void vbd_resize(blkif_t *blkif); #define blkif_get(_b) (atomic_inc(&(_b)->refcnt)) #define blkif_put(_b) \ diff --git a/drivers/xen/blkback/vbd.c b/drivers/xen/blkback/vbd.c index 410c2eac5ad7..0635c54079f8 100644 --- a/drivers/xen/blkback/vbd.c +++ b/drivers/xen/blkback/vbd.c @@ -73,6 +73,7 @@ int vbd_create(blkif_t *blkif, blkif_vdev_t handle, unsigned major, } vbd->bdev = bdev; + vbd->size = vbd_size(vbd); if (vbd->bdev->bd_disk == NULL) { DPRINTK("vbd_creat: device %08x doesn't exist.\n", @@ -116,3 +117,45 @@ int vbd_translate(struct phys_req *req, blkif_t *blkif, int operation) out: return rc; } + +void vbd_resize(blkif_t *blkif) +{ + struct vbd *vbd = &blkif->vbd; + struct xenbus_transaction xbt; + int err; + struct xenbus_device *dev = blkif->be->dev; + unsigned long long new_size = vbd_size(vbd); + + printk(KERN_INFO "VBD Resize: new size %Lu\n", new_size); + vbd->size = new_size; +again: + err = xenbus_transaction_start(&xbt); + if (err) { + printk(KERN_WARNING "Error starting transaction"); + return; + } + err = xenbus_printf(xbt, dev->nodename, "sectors", "%Lu", + vbd_size(vbd)); + if (err) { + printk(KERN_WARNING "Error writing new size"); + goto abort; + } + /* + * Write the current state; we will use this to synchronize + * the front-end. If the current state is "connected" the + * front-end will get the new size information online. + */ + err = xenbus_printf(xbt, dev->nodename, "state", "%d", dev->state); + if (err) { + printk(KERN_WARNING "Error writing the state"); + goto abort; + } + + err = xenbus_transaction_end(xbt, 0); + if (err == -EAGAIN) + goto again; + if (err) + printk(KERN_WARNING "Error ending transaction"); +abort: + xenbus_transaction_end(xbt, 1); +} From 98e036a356747cfaa225478b1e4875e190257b09 Mon Sep 17 00:00:00 2001 From: Jeremy Fitzhardinge Date: Thu, 18 Mar 2010 15:35:05 -0700 Subject: [PATCH 013/110] xen/blkback: add accessor for xenbus backend device Since backend_info is hidden away now. 
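The vbd_resize() helper added just above pushes the new size into xenstore with the usual transaction retry idiom: start a transaction, write the nodes, try to commit, and redo the whole batch if the commit reports -EAGAIN because another writer got there first. A compressed user-space model of that idiom, using stand-in xs_*() stubs rather than the xenbus API:

    /* xenstore-style transaction retry -- hypothetical user-space model.
     * xs_begin()/xs_write()/xs_commit() are stand-ins, not the xenbus API. */
    #include <stdio.h>
    #include <errno.h>

    static int attempts;

    static int xs_begin(void)  { return 0; }
    static int xs_write(const char *node, long long val)
    {
        printf("write %s = %lld\n", node, val);
        return 0;
    }
    /* Pretend the first commit loses a race and must be retried. */
    static int xs_commit(int abort_txn)
    {
        if (abort_txn)
            return 0;
        return (attempts++ == 0) ? -EAGAIN : 0;
    }

    static void publish_new_size(long long sectors)
    {
        int err;
    again:
        err = xs_begin();
        if (err)
            return;
        err = xs_write("sectors", sectors);
        if (err)
            goto fail;
        err = xs_commit(0);
        if (err == -EAGAIN)
            goto again;          /* somebody else wrote first: redo the whole batch */
        if (err)
            fprintf(stderr, "commit failed: %d\n", err);
        return;
    fail:
        xs_commit(1);            /* abandon the partial transaction */
    }

    int main(void)
    {
        publish_new_size(2097152);   /* e.g. 1 GiB worth of 512-byte sectors */
        return 0;
    }

The early return on success keeps the abort branch, which ends the transaction with the abort flag set, for the error paths only.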
Signed-off-by: Jeremy Fitzhardinge --- drivers/xen/blkback/common.h | 2 ++ drivers/xen/blkback/vbd.c | 2 +- drivers/xen/blkback/xenbus.c | 5 +++++ 3 files changed, 8 insertions(+), 1 deletion(-) diff --git a/drivers/xen/blkback/common.h b/drivers/xen/blkback/common.h index cebcc2b7e9f6..0f91830f18c8 100644 --- a/drivers/xen/blkback/common.h +++ b/drivers/xen/blkback/common.h @@ -136,4 +136,6 @@ int blkif_schedule(void *arg); int blkback_barrier(struct xenbus_transaction xbt, struct backend_info *be, int state); +struct xenbus_device *blkback_xenbus(struct backend_info *be); + #endif /* __BLKIF__BACKEND__COMMON_H__ */ diff --git a/drivers/xen/blkback/vbd.c b/drivers/xen/blkback/vbd.c index 0635c54079f8..943ec2313522 100644 --- a/drivers/xen/blkback/vbd.c +++ b/drivers/xen/blkback/vbd.c @@ -123,7 +123,7 @@ void vbd_resize(blkif_t *blkif) struct vbd *vbd = &blkif->vbd; struct xenbus_transaction xbt; int err; - struct xenbus_device *dev = blkif->be->dev; + struct xenbus_device *dev = blkback_xenbus(blkif->be); unsigned long long new_size = vbd_size(vbd); printk(KERN_INFO "VBD Resize: new size %Lu\n", new_size); diff --git a/drivers/xen/blkback/xenbus.c b/drivers/xen/blkback/xenbus.c index 34f8e4046578..c31e5c40b45c 100644 --- a/drivers/xen/blkback/xenbus.c +++ b/drivers/xen/blkback/xenbus.c @@ -42,6 +42,11 @@ static int connect_ring(struct backend_info *); static void backend_changed(struct xenbus_watch *, const char **, unsigned int); +struct xenbus_device *blkback_xenbus(struct backend_info *be) +{ + return be->dev; +} + static int blkback_name(blkif_t *blkif, char *buf) { char *devpath, *devname; From cbf462908c8080f47c2a3300072877589dd1275f Mon Sep 17 00:00:00 2001 From: Chris Lalancette Date: Wed, 21 Jul 2010 12:41:45 -0700 Subject: [PATCH 014/110] xen/blkback: Flush blkback data when connecting. First cut at flushing blkback data when first connecting blkback. This should avoid the pygrub issues we are experiencing in (RedHat bugzilla) 466681. [ 2.6.18-xen.hg commit 63b4d7f56688 ] Signed-off-by: Chris Lalancette Signed-off-by: Jeremy Fitzhardinge --- drivers/xen/blkback/xenbus.c | 7 +++++++ 1 file changed, 7 insertions(+) diff --git a/drivers/xen/blkback/xenbus.c b/drivers/xen/blkback/xenbus.c index c31e5c40b45c..a0534fc6a428 100644 --- a/drivers/xen/blkback/xenbus.c +++ b/drivers/xen/blkback/xenbus.c @@ -91,6 +91,13 @@ static void update_blkif_status(blkif_t *blkif) return; } + err = filemap_write_and_wait(blkif->vbd.bdev->bd_inode->i_mapping); + if (err) { + xenbus_dev_error(blkif->be->dev, err, "block flush"); + return; + } + invalidate_inode_pages2(blkif->vbd.bdev->bd_inode->i_mapping); + blkif->xenblkd = kthread_run(blkif_schedule, blkif, name); if (IS_ERR(blkif->xenblkd)) { err = PTR_ERR(blkif->xenblkd); From a81135d90bf176e6139c352c7b96c03d00131836 Mon Sep 17 00:00:00 2001 From: "K. Y. Srinivasan" Date: Mon, 16 Aug 2010 13:43:06 -0700 Subject: [PATCH 015/110] xen/blkback: Print additional information when a vbd is resized. Signed-off-by: K. Y. 
Srinivasan Signed-off-by: Jeremy Fitzhardinge --- drivers/xen/blkback/vbd.c | 2 ++ 1 file changed, 2 insertions(+) diff --git a/drivers/xen/blkback/vbd.c b/drivers/xen/blkback/vbd.c index 943ec2313522..dc2572338567 100644 --- a/drivers/xen/blkback/vbd.c +++ b/drivers/xen/blkback/vbd.c @@ -126,6 +126,8 @@ void vbd_resize(blkif_t *blkif) struct xenbus_device *dev = blkback_xenbus(blkif->be); unsigned long long new_size = vbd_size(vbd); + printk(KERN_INFO "VBD Resize: Domid: %d, Device: (%d, %d)\n", + blkif->domid, MAJOR(vbd->pdevice), MINOR(vbd->pdevice)); printk(KERN_INFO "VBD Resize: new size %Lu\n", new_size); vbd->size = new_size; again: From 313d7b003ceceb797e8c0d18ab085ed0638b4aff Mon Sep 17 00:00:00 2001 From: Keir Fraser Date: Wed, 24 Nov 2010 22:08:20 -0800 Subject: [PATCH 016/110] blkback: Fix CVE-2010-3699 A guest can cause the backend driver to leak a kernel thread. Such leaked threads hold references to the device, whichmakes the device impossible to tear down. If shut down, the guest remains a zombie domain, the xenwatch process hangs, and most xm commands will stop working. This patch tries to do the following for blkback: - identify/extract idempotent teardown operations, - add/move the invocation of said teardown operation right before we're about to allocate new resources in the Connected states. [ linux-2.6.18-xen.hg 59f097ef181b ] Signed-off-by: Laszlo Ersek Signed-off-by: Keir Fraser Signed-off-by: Jeremy Fitzhardinge --- drivers/xen/blkback/xenbus.c | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/drivers/xen/blkback/xenbus.c b/drivers/xen/blkback/xenbus.c index a0534fc6a428..031bc3d7eec3 100644 --- a/drivers/xen/blkback/xenbus.c +++ b/drivers/xen/blkback/xenbus.c @@ -382,6 +382,11 @@ static void frontend_changed(struct xenbus_device *dev, if (dev->state == XenbusStateConnected) break; + /* Enforce precondition before potential leak point. + * blkif_disconnect() is idempotent. + */ + blkif_disconnect(be->blkif); + err = connect_ring(be); if (err) break; @@ -399,6 +404,7 @@ static void frontend_changed(struct xenbus_device *dev, break; /* fall through if not online */ case XenbusStateUnknown: + /* implies blkif_disconnect() via blkback_remove() */ device_unregister(&dev->dev); break; From 248e9f7539f8351cd857d12a74bd52133a3a900f Mon Sep 17 00:00:00 2001 From: Konrad Rzeszutek Wilk Date: Thu, 24 Feb 2011 17:22:41 -0500 Subject: [PATCH 017/110] xen/blkback: Replace WRITE_BARRIER with (REQ_FLUSH | REQ_FUA) TODO: Double check xen-blkfront.c --- drivers/xen/blkback/blkback.c | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/drivers/xen/blkback/blkback.c b/drivers/xen/blkback/blkback.c index 6d897664802d..cb844f734d91 100644 --- a/drivers/xen/blkback/blkback.c +++ b/drivers/xen/blkback/blkback.c @@ -405,7 +405,7 @@ static void dispatch_rw_block_io(blkif_t *blkif, operation = WRITE; break; case BLKIF_OP_WRITE_BARRIER: - operation = WRITE_BARRIER; + operation = REQ_FLUSH | REQ_FUA; break; default: operation = 0; /* make gcc happy */ @@ -414,7 +414,7 @@ static void dispatch_rw_block_io(blkif_t *blkif, /* Check that number of segments is sane. 
*/ nseg = req->nr_segments; - if (unlikely(nseg == 0 && operation != WRITE_BARRIER) || + if (unlikely(nseg == 0 && operation != (REQ_FLUSH | REQ_FUA)) || unlikely(nseg > BLKIF_MAX_SEGMENTS_PER_REQUEST)) { DPRINTK("Bad number of segments in request (%d)\n", nseg); goto fail_response; @@ -517,7 +517,7 @@ static void dispatch_rw_block_io(blkif_t *blkif, } if (!bio) { - BUG_ON(operation != WRITE_BARRIER); + BUG_ON(operation != (REQ_FLUSH | REQ_FUA)); bio = bio_alloc(GFP_KERNEL, 0); if (unlikely(bio == NULL)) goto fail_put_bio; @@ -532,7 +532,7 @@ static void dispatch_rw_block_io(blkif_t *blkif, if (operation == READ) blkif->st_rd_sect += preq.nr_sects; - else if (operation == WRITE || operation == WRITE_BARRIER) + else if (operation == WRITE || operation == (REQ_FLUSH | REQ_FUA)) blkif->st_wr_sect += preq.nr_sects; return; From bc0c081b0e7a4afc4d2c7bc0666f5cd169e96814 Mon Sep 17 00:00:00 2001 From: Konrad Rzeszutek Wilk Date: Fri, 25 Feb 2011 10:02:39 -0500 Subject: [PATCH 018/110] xen/blkback: Update to use blkdev_get_by_dev instead of open_by_devnum. The API for opening a block device has changed since 2.6.32. The correct function to open a device is blkdev_get_by_dev. --- drivers/xen/blkback/vbd.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/drivers/xen/blkback/vbd.c b/drivers/xen/blkback/vbd.c index dc2572338567..8c91a2fb0019 100644 --- a/drivers/xen/blkback/vbd.c +++ b/drivers/xen/blkback/vbd.c @@ -63,8 +63,8 @@ int vbd_create(blkif_t *blkif, blkif_vdev_t handle, unsigned major, vbd->pdevice = MKDEV(major, minor); - bdev = open_by_devnum(vbd->pdevice, - vbd->readonly ? FMODE_READ : FMODE_WRITE); + bdev = blkdev_get_by_dev(vbd->pdevice, vbd->readonly ? + FMODE_READ : FMODE_WRITE, NULL); if (IS_ERR(bdev)) { DPRINTK("vbd_creat: device %08x could not be opened.\n", From efe08a3eecf15ab022afba48c691d02c7de2fbbb Mon Sep 17 00:00:00 2001 From: Jan Beulich Date: Fri, 5 Feb 2010 14:19:33 -0500 Subject: [PATCH 019/110] xen/blkback: simplify address translations Cherry-pick and modified from 69d64727c42eecd47fdf82c15a54474d21a4012a ("blkback/blktap2: simplify address translations"): "There are quite a number of places where e.g. page->va->page translations happen. Besides yielding smaller code (source and binary), a second goal is to make it easier to determine where virtual addresses of pages allocated through alloc_empty_pages_and_pagevec() are really used (in turn in order to determine whether using highmem pages would be possible there)." The second goal is not the purpose of this patch - it is just to make it easier to read the code. 
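All of the page->va->page juggling being simplified here rests on one piece of arithmetic: each pending_req owns a fixed run of BLKIF_MAX_SEGMENTS_PER_REQUEST slots in the flat pending_pages[] array, so the slot for segment seg of a request is (req - pending_reqs) * BLKIF_MAX_SEGMENTS_PER_REQUEST + seg, which is what vaddr_pagenr() and the pending_page() macro in the diff below encode. A toy, compilable version of the same indexing (arbitrary sizes, no Xen types):

    /* Flat segment-page indexing -- toy model of vaddr_pagenr()/pending_page(). */
    #include <stdio.h>

    #define MAX_SEG   11     /* BLKIF_MAX_SEGMENTS_PER_REQUEST in the driver */
    #define NR_REQS    4     /* arbitrary toy value */

    struct pending_req { int dummy; };

    static struct pending_req pending_reqs[NR_REQS];
    static int pending_pages[NR_REQS * MAX_SEG];   /* stands in for struct page *[] */

    static int vaddr_pagenr(struct pending_req *req, int seg)
    {
        return (int)(req - pending_reqs) * MAX_SEG + seg;
    }

    int main(void)
    {
        /* Request 2, segment 5 lands at slot 2*11+5 = 27. */
        struct pending_req *req = &pending_reqs[2];
        int idx = vaddr_pagenr(req, 5);

        pending_pages[idx] = 1;
        printf("request 2, segment 5 -> page slot %d\n", idx);
        return 0;
    }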
linux-2.6-pvops: * Stripped drivers/xen/gntdev/* * Stripped drivers/xen/netback/* [v2: Stripped blktap off] Signed-off-by: Jan Beulich Signed-off-by: Daniel Stodden Signed-off-by: Jeremy Fitzhardinge Signed-off-by: Konrad Rzeszutek Wilk --- drivers/xen/blkback/blkback.c | 10 ++++++---- 1 file changed, 6 insertions(+), 4 deletions(-) diff --git a/drivers/xen/blkback/blkback.c b/drivers/xen/blkback/blkback.c index cb844f734d91..7c9421cc5991 100644 --- a/drivers/xen/blkback/blkback.c +++ b/drivers/xen/blkback/blkback.c @@ -99,9 +99,11 @@ static inline int vaddr_pagenr(pending_req_t *req, int seg) return (req - pending_reqs) * BLKIF_MAX_SEGMENTS_PER_REQUEST + seg; } +#define pending_page(req, seg) pending_pages[vaddr_pagenr(req, seg)] + static inline unsigned long vaddr(pending_req_t *req, int seg) { - unsigned long pfn = page_to_pfn(pending_pages[vaddr_pagenr(req, seg)]); + unsigned long pfn = page_to_pfn(pending_page(req, seg)); return (unsigned long)pfn_to_kaddr(pfn); } @@ -463,8 +465,8 @@ static void dispatch_rw_block_io(blkif_t *blkif, if (ret) continue; - set_phys_to_machine(__pa(vaddr( - pending_req, i)) >> PAGE_SHIFT, + set_phys_to_machine( + page_to_pfn(pending_page(pending_req, i)), FOREIGN_FRAME(map[i].dev_bus_addr >> PAGE_SHIFT)); seg[i].buf = map[i].dev_bus_addr | (req->seg[i].first_sect << 9); @@ -495,7 +497,7 @@ static void dispatch_rw_block_io(blkif_t *blkif, while ((bio == NULL) || (bio_add_page(bio, - virt_to_page(vaddr(pending_req, i)), + pending_page(pending_req, i), seg[i].nsec << 9, seg[i].buf & ~PAGE_MASK) == 0)) { if (bio) { From e8e28871edf0d0adb0bd7e597c044cbaf7a7f137 Mon Sep 17 00:00:00 2001 From: Konrad Rzeszutek Wilk Date: Fri, 25 Feb 2011 10:51:29 -0500 Subject: [PATCH 020/110] xen/blkback: Move global/static variables into struct xen_blkbk. Bundle the lot of discrete variables into a single structure. This is based on what was done in the xen-netback driver: xen: netback: Move global/static variables into struct xen_netbk. 
(094944631cc5a9d6e623302c987f78117c0bf7ac) Signed-off-by: Konrad Rzeszutek Wilk --- drivers/xen/blkback/blkback.c | 82 ++++++++++++++++++++--------------- 1 file changed, 48 insertions(+), 34 deletions(-) diff --git a/drivers/xen/blkback/blkback.c b/drivers/xen/blkback/blkback.c index 7c9421cc5991..c08875b0ad64 100644 --- a/drivers/xen/blkback/blkback.c +++ b/drivers/xen/blkback/blkback.c @@ -84,31 +84,34 @@ typedef struct { struct list_head free_list; } pending_req_t; -static pending_req_t *pending_reqs; -static struct list_head pending_free; -static DEFINE_SPINLOCK(pending_free_lock); -static DECLARE_WAIT_QUEUE_HEAD(pending_free_wq); - #define BLKBACK_INVALID_HANDLE (~0) -static struct page **pending_pages; -static grant_handle_t *pending_grant_handles; +struct xen_blkbk { + pending_req_t *pending_reqs; + struct list_head pending_free; + spinlock_t pending_free_lock; + wait_queue_head_t pending_free_wq; + struct page **pending_pages; + grant_handle_t *pending_grant_handles; +}; + +static struct xen_blkbk *blkbk; static inline int vaddr_pagenr(pending_req_t *req, int seg) { - return (req - pending_reqs) * BLKIF_MAX_SEGMENTS_PER_REQUEST + seg; + return (req - blkbk->pending_reqs) * BLKIF_MAX_SEGMENTS_PER_REQUEST + seg; } #define pending_page(req, seg) pending_pages[vaddr_pagenr(req, seg)] static inline unsigned long vaddr(pending_req_t *req, int seg) { - unsigned long pfn = page_to_pfn(pending_page(req, seg)); + unsigned long pfn = page_to_pfn(blkbk->pending_page(req, seg)); return (unsigned long)pfn_to_kaddr(pfn); } #define pending_handle(_req, _seg) \ - (pending_grant_handles[vaddr_pagenr(_req, _seg)]) + (blkbk->pending_grant_handles[vaddr_pagenr(_req, _seg)]) static int do_block_io_op(blkif_t *blkif); @@ -126,12 +129,12 @@ static pending_req_t* alloc_req(void) pending_req_t *req = NULL; unsigned long flags; - spin_lock_irqsave(&pending_free_lock, flags); - if (!list_empty(&pending_free)) { - req = list_entry(pending_free.next, pending_req_t, free_list); + spin_lock_irqsave(&blkbk->pending_free_lock, flags); + if (!list_empty(&blkbk->pending_free)) { + req = list_entry(blkbk->pending_free.next, pending_req_t, free_list); list_del(&req->free_list); } - spin_unlock_irqrestore(&pending_free_lock, flags); + spin_unlock_irqrestore(&blkbk->pending_free_lock, flags); return req; } @@ -140,12 +143,12 @@ static void free_req(pending_req_t *req) unsigned long flags; int was_empty; - spin_lock_irqsave(&pending_free_lock, flags); - was_empty = list_empty(&pending_free); - list_add(&req->free_list, &pending_free); - spin_unlock_irqrestore(&pending_free_lock, flags); + spin_lock_irqsave(&blkbk->pending_free_lock, flags); + was_empty = list_empty(&blkbk->pending_free); + list_add(&req->free_list, &blkbk->pending_free); + spin_unlock_irqrestore(&blkbk->pending_free_lock, flags); if (was_empty) - wake_up(&pending_free_wq); + wake_up(&blkbk->pending_free_wq); } static void unplug_queue(blkif_t *blkif) @@ -226,8 +229,8 @@ int blkif_schedule(void *arg) blkif->wq, blkif->waiting_reqs || kthread_should_stop()); wait_event_interruptible( - pending_free_wq, - !list_empty(&pending_free) || kthread_should_stop()); + blkbk->pending_free_wq, + !list_empty(&blkbk->pending_free) || kthread_should_stop()); blkif->waiting_reqs = 0; smp_mb(); /* clear flag *before* checking for work */ @@ -466,7 +469,7 @@ static void dispatch_rw_block_io(blkif_t *blkif, continue; set_phys_to_machine( - page_to_pfn(pending_page(pending_req, i)), + page_to_pfn(blkbk->pending_page(pending_req, i)), FOREIGN_FRAME(map[i].dev_bus_addr >> 
PAGE_SHIFT)); seg[i].buf = map[i].dev_bus_addr | (req->seg[i].first_sect << 9); @@ -497,7 +500,7 @@ static void dispatch_rw_block_io(blkif_t *blkif, while ((bio == NULL) || (bio_add_page(bio, - pending_page(pending_req, i), + blkbk->pending_page(pending_req, i), seg[i].nsec << 9, seg[i].buf & ~PAGE_MASK) == 0)) { if (bio) { @@ -624,31 +627,40 @@ static int __init blkif_init(void) if (!xen_pv_domain()) return -ENODEV; + blkbk = (struct xen_blkbk *)vmalloc(sizeof(struct xen_blkbk)); + if (!blkbk) { + printk(KERN_ALERT "%s: out of memory!\n", __func__); + return -ENOMEM; + } + mmap_pages = blkif_reqs * BLKIF_MAX_SEGMENTS_PER_REQUEST; - pending_reqs = kmalloc(sizeof(pending_reqs[0]) * + blkbk->pending_reqs = kmalloc(sizeof(blkbk->pending_reqs[0]) * blkif_reqs, GFP_KERNEL); - pending_grant_handles = kmalloc(sizeof(pending_grant_handles[0]) * + blkbk->pending_grant_handles = kmalloc(sizeof(blkbk->pending_grant_handles[0]) * mmap_pages, GFP_KERNEL); - pending_pages = alloc_empty_pages_and_pagevec(mmap_pages); + blkbk->pending_pages = alloc_empty_pages_and_pagevec(mmap_pages); - if (!pending_reqs || !pending_grant_handles || !pending_pages) { + if (!blkbk->pending_reqs || !blkbk->pending_grant_handles || !blkbk->pending_pages) { rc = -ENOMEM; goto out_of_memory; } for (i = 0; i < mmap_pages; i++) - pending_grant_handles[i] = BLKBACK_INVALID_HANDLE; + blkbk->pending_grant_handles[i] = BLKBACK_INVALID_HANDLE; rc = blkif_interface_init(); if (rc) goto failed_init; - memset(pending_reqs, 0, sizeof(pending_reqs)); - INIT_LIST_HEAD(&pending_free); + memset(blkbk->pending_reqs, 0, sizeof(blkbk->pending_reqs)); + + INIT_LIST_HEAD(&blkbk->pending_free); + spin_lock_init(&blkbk->pending_free_lock); + init_waitqueue_head(&blkbk->pending_free_wq); for (i = 0; i < blkif_reqs; i++) - list_add_tail(&pending_reqs[i].free_list, &pending_free); + list_add_tail(&blkbk->pending_reqs[i].free_list, &blkbk->pending_free); rc = blkif_xenbus_init(); if (rc) @@ -659,9 +671,11 @@ static int __init blkif_init(void) out_of_memory: printk(KERN_ERR "%s: out of memory\n", __func__); failed_init: - kfree(pending_reqs); - kfree(pending_grant_handles); - free_empty_pages_and_pagevec(pending_pages, mmap_pages); + kfree(blkbk->pending_reqs); + kfree(blkbk->pending_grant_handles); + free_empty_pages_and_pagevec(blkbk->pending_pages, mmap_pages); + vfree(blkbk); + blkbk = NULL; return rc; } From c35950bfa9abaaf16548a287a8d5d782a361414f Mon Sep 17 00:00:00 2001 From: Konrad Rzeszutek Wilk Date: Tue, 1 Mar 2011 16:22:28 -0500 Subject: [PATCH 021/110] xen/blkback: Union the blkif_request request specific fields Following in the steps of patch: "xen: Union the blkif_request request specific fields" this patch changes the blkback. Per the original patch: "Prepare for extending the block device ring to allow request specific fields, by moving the request specific fields for reads, writes and barrier requests to a union member." 
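The union keeps every operation inside the same fixed-size ring slot while giving reads and writes their own named sub-structure, which is why the hunks below switch from dst->sector_number to dst->u.rw.sector_number. A self-contained sketch of that layout follows; the field names only loosely mirror struct blkif_request, so treat it as an illustration rather than the ring ABI:

    /* Union of per-operation request fields -- illustrative layout only. */
    #include <stdio.h>
    #include <stdint.h>

    #define MAX_SEG 11

    struct rw_segment {
        uint32_t gref;
        uint8_t  first_sect, last_sect;
    };

    struct request {
        uint8_t  operation;          /* READ, WRITE, WRITE_BARRIER, ... */
        uint8_t  nr_segments;
        uint64_t id;
        union {
            struct {
                uint64_t sector_number;
                struct rw_segment seg[MAX_SEG];
            } rw;
            /* other operation types can add their own members here without
             * growing the slot beyond the largest variant */
        } u;
    };

    int main(void)
    {
        struct request req = { .operation = 1 /* write */, .nr_segments = 1 };

        req.u.rw.sector_number = 4096;
        req.u.rw.seg[0].gref = 123;
        req.u.rw.seg[0].first_sect = 0;
        req.u.rw.seg[0].last_sect = 7;   /* 8 sectors = one 4 KiB page */

        printf("slot size %zu bytes, sector %llu\n",
               sizeof(req), (unsigned long long)req.u.rw.sector_number);
        return 0;
    }

Adding another operation later means adding another union member, and the slot only grows if that member is larger than the largest existing one.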
Cc: Owen Smith Signed-off-by: Konrad Rzeszutek Wilk --- drivers/xen/blkback/blkback.c | 14 +++++++------- include/xen/blkif.h | 8 ++++---- 2 files changed, 11 insertions(+), 11 deletions(-) diff --git a/drivers/xen/blkback/blkback.c b/drivers/xen/blkback/blkback.c index c08875b0ad64..eda50646775d 100644 --- a/drivers/xen/blkback/blkback.c +++ b/drivers/xen/blkback/blkback.c @@ -426,7 +426,7 @@ static void dispatch_rw_block_io(blkif_t *blkif, } preq.dev = req->handle; - preq.sector_number = req->sector_number; + preq.sector_number = req->u.rw.sector_number; preq.nr_sects = 0; pending_req->blkif = blkif; @@ -438,11 +438,11 @@ static void dispatch_rw_block_io(blkif_t *blkif, for (i = 0; i < nseg; i++) { uint32_t flags; - seg[i].nsec = req->seg[i].last_sect - - req->seg[i].first_sect + 1; + seg[i].nsec = req->u.rw.seg[i].last_sect - + req->u.rw.seg[i].first_sect + 1; - if ((req->seg[i].last_sect >= (PAGE_SIZE >> 9)) || - (req->seg[i].last_sect < req->seg[i].first_sect)) + if ((req->u.rw.seg[i].last_sect >= (PAGE_SIZE >> 9)) || + (req->u.rw.seg[i].last_sect < req->u.rw.seg[i].first_sect)) goto fail_response; preq.nr_sects += seg[i].nsec; @@ -450,7 +450,7 @@ static void dispatch_rw_block_io(blkif_t *blkif, if (operation != READ) flags |= GNTMAP_readonly; gnttab_set_map_op(&map[i], vaddr(pending_req, i), flags, - req->seg[i].gref, blkif->domid); + req->u.rw.seg[i].gref, blkif->domid); } ret = HYPERVISOR_grant_table_op(GNTTABOP_map_grant_ref, map, nseg); @@ -472,7 +472,7 @@ static void dispatch_rw_block_io(blkif_t *blkif, page_to_pfn(blkbk->pending_page(pending_req, i)), FOREIGN_FRAME(map[i].dev_bus_addr >> PAGE_SHIFT)); seg[i].buf = map[i].dev_bus_addr | - (req->seg[i].first_sect << 9); + (req->u.rw.seg[i].first_sect << 9); } if (ret) diff --git a/include/xen/blkif.h b/include/xen/blkif.h index d27428046918..ab794269fc53 100644 --- a/include/xen/blkif.h +++ b/include/xen/blkif.h @@ -96,12 +96,12 @@ static void inline blkif_get_x86_32_req(struct blkif_request *dst, struct blkif_ dst->nr_segments = src->nr_segments; dst->handle = src->handle; dst->id = src->id; - dst->sector_number = src->sector_number; + dst->u.rw.sector_number = src->sector_number; barrier(); if (n > dst->nr_segments) n = dst->nr_segments; for (i = 0; i < n; i++) - dst->seg[i] = src->seg[i]; + dst->u.rw.seg[i] = src->seg[i]; } static void inline blkif_get_x86_64_req(struct blkif_request *dst, struct blkif_x86_64_request *src) @@ -111,12 +111,12 @@ static void inline blkif_get_x86_64_req(struct blkif_request *dst, struct blkif_ dst->nr_segments = src->nr_segments; dst->handle = src->handle; dst->id = src->id; - dst->sector_number = src->sector_number; + dst->u.rw.sector_number = src->sector_number; barrier(); if (n > dst->nr_segments) n = dst->nr_segments; for (i = 0; i < n; i++) - dst->seg[i] = src->seg[i]; + dst->u.rw.seg[i] = src->seg[i]; } #endif /* __XEN_BLKIF_H__ */ From 464fb419e17083a18b636c9f4714fc49ef6857d2 Mon Sep 17 00:00:00 2001 From: Konrad Rzeszutek Wilk Date: Tue, 1 Mar 2011 16:26:10 -0500 Subject: [PATCH 022/110] xen/blkback: Use 'vzalloc' for page arrays and pre-allocate pages. Previously we would allocate the array for page using 'kmalloc' which we can as easily do with 'vzalloc'. The pre-allocation of pages was done a bit differently in the past - it used to be that the balloon driver would export "alloc_empty_pages_and_pagevec" which would have in one function created an array, allocated the pages, balloned the pages out (so the memory behind those pages would be non-present), and provide us those pages. 
This was OK as those pages were shared between other guest and the only thing we needed was to "swizzel" the MFN of those pages to point to the other guest MFN. We can still "swizzel" the MFNs using the M2P (and P2M) override API calls, but for the sake of simplicity we are dropping the balloon API calls. We can return to those later on. Signed-off-by: Konrad Rzeszutek Wilk --- drivers/xen/blkback/blkback.c | 23 ++++++++++++++++------- 1 file changed, 16 insertions(+), 7 deletions(-) diff --git a/drivers/xen/blkback/blkback.c b/drivers/xen/blkback/blkback.c index eda50646775d..d32198d1be04 100644 --- a/drivers/xen/blkback/blkback.c +++ b/drivers/xen/blkback/blkback.c @@ -637,18 +637,23 @@ static int __init blkif_init(void) blkbk->pending_reqs = kmalloc(sizeof(blkbk->pending_reqs[0]) * blkif_reqs, GFP_KERNEL); - blkbk->pending_grant_handles = kmalloc(sizeof(blkbk->pending_grant_handles[0]) * - mmap_pages, GFP_KERNEL); - blkbk->pending_pages = alloc_empty_pages_and_pagevec(mmap_pages); + blkbk->pending_grant_handles = vzalloc(sizeof(blkbk->pending_grant_handles[0]) * + mmap_pages); + blkbk->pending_pages = vzalloc(sizeof(blkbk->pending_pages[0]) * mmap_pages); if (!blkbk->pending_reqs || !blkbk->pending_grant_handles || !blkbk->pending_pages) { rc = -ENOMEM; goto out_of_memory; } - for (i = 0; i < mmap_pages; i++) + for (i = 0; i < mmap_pages; i++) { blkbk->pending_grant_handles[i] = BLKBACK_INVALID_HANDLE; - + blkbk->pending_pages[i] = alloc_page(GFP_KERNEL | __GFP_HIGHMEM); + if (blkbk->pending_pages[i] == NULL) { + rc = -ENOMEM; + goto out_of_memory; + } + } rc = blkif_interface_init(); if (rc) goto failed_init; @@ -672,8 +677,12 @@ static int __init blkif_init(void) printk(KERN_ERR "%s: out of memory\n", __func__); failed_init: kfree(blkbk->pending_reqs); - kfree(blkbk->pending_grant_handles); - free_empty_pages_and_pagevec(blkbk->pending_pages, mmap_pages); + vfree(blkbk->pending_grant_handles); + for (i = 0; i < mmap_pages; i++) { + if (blkbk->pending_pages[i]) + __free_page(blkbk->pending_pages[i]); + } + vfree(blkbk->pending_pages); vfree(blkbk); blkbk = NULL; return rc; From 5dc03639cc903f887931831d69895facb5260f4b Mon Sep 17 00:00:00 2001 From: Konrad Rzeszutek Wilk Date: Tue, 1 Mar 2011 16:46:45 -0500 Subject: [PATCH 023/110] xen/blkback: Utilize the M2P override mechanism for GNTMAP_host_map Instead of doing copy grants lets do mapping grants using the M2P(and P2M) override mechanism. Signed-off-by: Konrad Rzeszutek Wilk Conflicts: drivers/xen/blkback/blkback.c --- drivers/xen/blkback/blkback.c | 23 +++++++++++++++++++---- 1 file changed, 19 insertions(+), 4 deletions(-) diff --git a/drivers/xen/blkback/blkback.c b/drivers/xen/blkback/blkback.c index d32198d1be04..15790ae96f33 100644 --- a/drivers/xen/blkback/blkback.c +++ b/drivers/xen/blkback/blkback.c @@ -41,7 +41,6 @@ #include #include -#include #include #include #include @@ -192,6 +191,17 @@ static void fast_flush_area(pending_req_t *req) ret = HYPERVISOR_grant_table_op( GNTTABOP_unmap_grant_ref, unmap, invcount); BUG_ON(ret); + /* Note, we use invcount, so nr->pages, so we can't index + * using vaddr(req, i). 
*/ + for (i = 0; i < invcount; i++) { + ret = m2p_remove_override( + virt_to_page(unmap[i].host_addr), false); + if (ret) { + printk(KERN_ALERT "Failed to remove M2P override for " \ + "%lx\n", (unsigned long)unmap[i].host_addr); + continue; + } + } } /****************************************************************** @@ -467,10 +477,15 @@ static void dispatch_rw_block_io(blkif_t *blkif, if (ret) continue; + + ret = m2p_add_override(PFN_DOWN(map[i].dev_bus_addr), + blkbk->pending_page(pending_req, i), false); + if (ret) { + printk(KERN_ALERT "Failed to install M2P override for"\ + " %lx (ret: %d)\n", (unsigned long)map[i].dev_bus_addr, ret); + continue; + } - set_phys_to_machine( - page_to_pfn(blkbk->pending_page(pending_req, i)), - FOREIGN_FRAME(map[i].dev_bus_addr >> PAGE_SHIFT)); seg[i].buf = map[i].dev_bus_addr | (req->u.rw.seg[i].first_sect << 9); } From a742b02c75e6e76bd0833f9b6e702f1be7d7e008 Mon Sep 17 00:00:00 2001 From: Konrad Rzeszutek Wilk Date: Mon, 14 Mar 2011 12:41:26 -0400 Subject: [PATCH 024/110] xen/blkback: Use kzalloc's, and GFP_KERNEL for data structures. The patch titled:"xen/blkback: Use 'vzalloc' for page arrays and pre-allocate pages." allocates the structures and its member variables using the 'vzalloc'. Daniel Stodden pointed out that vzalloc is good when we use big number of pages - while these are at the max two pages. We can do this using kzalloc. Also the GFP_HIGHMEM does not work properly with Xen, so take that out. We will have to revisit this when a "get_empty_pages_and_pagevec" type API shows up to leverage that. BugLink: http://mid.gmane.org/1299898639.11681.227.camel@agari.van.xensource.com CC: Daniel Stodden Signed-off-by: Konrad Rzeszutek Wilk --- drivers/xen/blkback/blkback.c | 17 +++++++++-------- 1 file changed, 9 insertions(+), 8 deletions(-) diff --git a/drivers/xen/blkback/blkback.c b/drivers/xen/blkback/blkback.c index 15790ae96f33..a6f8f1338118 100644 --- a/drivers/xen/blkback/blkback.c +++ b/drivers/xen/blkback/blkback.c @@ -642,7 +642,7 @@ static int __init blkif_init(void) if (!xen_pv_domain()) return -ENODEV; - blkbk = (struct xen_blkbk *)vmalloc(sizeof(struct xen_blkbk)); + blkbk = (struct xen_blkbk *)kzalloc(sizeof(struct xen_blkbk), GFP_KERNEL); if (!blkbk) { printk(KERN_ALERT "%s: out of memory!\n", __func__); return -ENOMEM; @@ -652,9 +652,10 @@ static int __init blkif_init(void) blkbk->pending_reqs = kmalloc(sizeof(blkbk->pending_reqs[0]) * blkif_reqs, GFP_KERNEL); - blkbk->pending_grant_handles = vzalloc(sizeof(blkbk->pending_grant_handles[0]) * - mmap_pages); - blkbk->pending_pages = vzalloc(sizeof(blkbk->pending_pages[0]) * mmap_pages); + blkbk->pending_grant_handles = kzalloc(sizeof(blkbk->pending_grant_handles[0]) * + mmap_pages, GFP_KERNEL); + blkbk->pending_pages = kzalloc(sizeof(blkbk->pending_pages[0]) * + mmap_pages, GFP_KERNEL); if (!blkbk->pending_reqs || !blkbk->pending_grant_handles || !blkbk->pending_pages) { rc = -ENOMEM; @@ -663,7 +664,7 @@ static int __init blkif_init(void) for (i = 0; i < mmap_pages; i++) { blkbk->pending_grant_handles[i] = BLKBACK_INVALID_HANDLE; - blkbk->pending_pages[i] = alloc_page(GFP_KERNEL | __GFP_HIGHMEM); + blkbk->pending_pages[i] = alloc_page(GFP_KERNEL); if (blkbk->pending_pages[i] == NULL) { rc = -ENOMEM; goto out_of_memory; @@ -692,13 +693,13 @@ static int __init blkif_init(void) printk(KERN_ERR "%s: out of memory\n", __func__); failed_init: kfree(blkbk->pending_reqs); - vfree(blkbk->pending_grant_handles); + kfree(blkbk->pending_grant_handles); for (i = 0; i < mmap_pages; i++) { if 
(blkbk->pending_pages[i]) __free_page(blkbk->pending_pages[i]); } - vfree(blkbk->pending_pages); - vfree(blkbk); + kfree(blkbk->pending_pages); + kfree(blkbk); blkbk = NULL; return rc; } From 314146e515710f8a7d7eaf7a58b7ed590c9c14c3 Mon Sep 17 00:00:00 2001 From: Tom Goetz Date: Thu, 17 Mar 2011 12:14:29 -0400 Subject: [PATCH 025/110] xen/blkback: Fix the WRITE_BARRIER The WRITE_BARRIER was missing the REQ_WRITE option. This was causing the blktap to die. Signed-off-by: Tom Goetz Signed-off-by: Konrad Rzeszutek Wilk --- drivers/xen/blkback/blkback.c | 10 ++++++---- 1 file changed, 6 insertions(+), 4 deletions(-) diff --git a/drivers/xen/blkback/blkback.c b/drivers/xen/blkback/blkback.c index a6f8f1338118..4cd5b49de0c1 100644 --- a/drivers/xen/blkback/blkback.c +++ b/drivers/xen/blkback/blkback.c @@ -47,6 +47,8 @@ #include #include "common.h" +#define WRITE_BARRIER (REQ_WRITE | REQ_FLUSH | REQ_FUA) + /* * These are rather arbitrary. They are fairly large because adjacent requests * pulled from a communication ring are quite likely to end up being part of @@ -420,7 +422,7 @@ static void dispatch_rw_block_io(blkif_t *blkif, operation = WRITE; break; case BLKIF_OP_WRITE_BARRIER: - operation = REQ_FLUSH | REQ_FUA; + operation = WRITE_BARRIER; break; default: operation = 0; /* make gcc happy */ @@ -429,7 +431,7 @@ static void dispatch_rw_block_io(blkif_t *blkif, /* Check that number of segments is sane. */ nseg = req->nr_segments; - if (unlikely(nseg == 0 && operation != (REQ_FLUSH | REQ_FUA)) || + if (unlikely(nseg == 0 && operation != WRITE_BARRIER) || unlikely(nseg > BLKIF_MAX_SEGMENTS_PER_REQUEST)) { DPRINTK("Bad number of segments in request (%d)\n", nseg); goto fail_response; @@ -537,7 +539,7 @@ static void dispatch_rw_block_io(blkif_t *blkif, } if (!bio) { - BUG_ON(operation != (REQ_FLUSH | REQ_FUA)); + BUG_ON(operation != WRITE_BARRIER); bio = bio_alloc(GFP_KERNEL, 0); if (unlikely(bio == NULL)) goto fail_put_bio; @@ -552,7 +554,7 @@ static void dispatch_rw_block_io(blkif_t *blkif, if (operation == READ) blkif->st_rd_sect += preq.nr_sects; - else if (operation == WRITE || operation == (REQ_FLUSH | REQ_FUA)) + else if (operation == WRITE || operation == WRITE_BARRIER) blkif->st_wr_sect += preq.nr_sects; return; From a1397fa3090c25c6c51c04b4101f2786d16b615f Mon Sep 17 00:00:00 2001 From: Konrad Rzeszutek Wilk Date: Thu, 14 Apr 2011 17:05:23 -0400 Subject: [PATCH 026/110] xen/blkback: Add some comments. Signed-off-by: Konrad Rzeszutek Wilk --- drivers/xen/blkback/blkback.c | 88 ++++++++++++++++++++++++++------- drivers/xen/blkback/common.h | 1 + drivers/xen/blkback/interface.c | 2 - drivers/xen/blkback/vbd.c | 2 - drivers/xen/blkback/xenbus.c | 2 +- 5 files changed, 71 insertions(+), 24 deletions(-) diff --git a/drivers/xen/blkback/blkback.c b/drivers/xen/blkback/blkback.c index 4cd5b49de0c1..8a4b1e8eeb62 100644 --- a/drivers/xen/blkback/blkback.c +++ b/drivers/xen/blkback/blkback.c @@ -1,11 +1,10 @@ /****************************************************************************** - * arch/xen/drivers/blkif/backend/main.c * * Back-end of the driver for virtual block devices. This portion of the * driver exports a 'unified' block-device interface that can be accessed * by any operating system that implements a compatible front end. 
A * reference front-end implementation can be found in: - * arch/xen/drivers/blkif/frontend + * drivers/block/xen-blkfront.c * * Copyright (c) 2003-2004, Keir Fraser & Steve Hand * Copyright (c) 2005, Christopher Clark @@ -88,16 +87,25 @@ typedef struct { #define BLKBACK_INVALID_HANDLE (~0) struct xen_blkbk { - pending_req_t *pending_reqs; + pending_req_t *pending_reqs; + /* List of all 'pending_req' available */ struct list_head pending_free; + /* And its spinlock. */ spinlock_t pending_free_lock; wait_queue_head_t pending_free_wq; + /* The list of all pages that are available. */ struct page **pending_pages; + /* And the grant handles that are available. */ grant_handle_t *pending_grant_handles; }; static struct xen_blkbk *blkbk; +/* + * Little helpful macro to figure out the index and virtual address of the + * pending_pages[..]. For each 'pending_req' we have have up to + * BLKIF_MAX_SEGMENTS_PER_REQUEST (11) pages. The seg would be from 0 through + * 10 and would index in the pending_pages[..]. */ static inline int vaddr_pagenr(pending_req_t *req, int seg) { return (req - blkbk->pending_reqs) * BLKIF_MAX_SEGMENTS_PER_REQUEST + seg; @@ -122,8 +130,8 @@ static void dispatch_rw_block_io(blkif_t *blkif, static void make_response(blkif_t *blkif, u64 id, unsigned short op, int st); -/****************************************************************** - * misc small helpers +/* + * Retrieve from the 'pending_reqs' a free pending_req structure to be used. */ static pending_req_t* alloc_req(void) { @@ -139,6 +147,10 @@ static pending_req_t* alloc_req(void) return req; } +/* + * Return the 'pending_req' structure back to the freepool. We also + * wake up the thread if it was waiting for a free page. + */ static void free_req(pending_req_t *req) { unsigned long flags; @@ -152,6 +164,11 @@ static void free_req(pending_req_t *req) wake_up(&blkbk->pending_free_wq); } +/* + * Give back a reference count on the underlaying storage. + * It is OK to make multiple calls in this function as it + * resets the plug to NULL when it is done on the first call. + */ static void unplug_queue(blkif_t *blkif) { if (blkif->plug == NULL) @@ -162,6 +179,12 @@ static void unplug_queue(blkif_t *blkif) blkif->plug = NULL; } +/* + * Take a reference count on the underlaying storage. + * It is OK to call this multiple times as we check to make sure + * not to double reference. We also give back a reference count + * if it corresponds to another queue. + */ static void plug_queue(blkif_t *blkif, struct block_device *bdev) { struct request_queue *q = bdev_get_queue(bdev); @@ -173,6 +196,10 @@ static void plug_queue(blkif_t *blkif, struct block_device *bdev) blkif->plug = q; } +/* + * Unmap the grant references, and also remove the M2P over-rides + * used in the 'pending_req'. +*/ static void fast_flush_area(pending_req_t *req) { struct gnttab_unmap_grant_ref unmap[BLKIF_MAX_SEGMENTS_PER_REQUEST]; @@ -266,8 +293,8 @@ int blkif_schedule(void *arg) return 0; } -/****************************************************************** - * COMPLETION CALLBACK -- Called as bh->b_end_io() +/* + * Completion callback on the bio's. Called as bh->b_end_io() */ static void __end_block_io_op(pending_req_t *pending_req, int error) @@ -284,6 +311,9 @@ static void __end_block_io_op(pending_req_t *pending_req, int error) pending_req->status = BLKIF_RSP_ERROR; } + /* If all of the bio's have completed it is time to unmap + * the grant references associated with 'request' and provide + * the proper response on the ring. 
*/ if (atomic_dec_and_test(&pending_req->pendcnt)) { fast_flush_area(pending_req); make_response(pending_req->blkif, pending_req->id, @@ -293,6 +323,9 @@ static void __end_block_io_op(pending_req_t *pending_req, int error) } } +/* + * bio callback. + */ static void end_block_io_op(struct bio *bio, int error) { __end_block_io_op(bio->bi_private, error); @@ -300,8 +333,8 @@ static void end_block_io_op(struct bio *bio, int error) } -/****************************************************************************** - * NOTIFICATION FROM GUEST OS. +/* + * Notification from the guest OS. */ static void blkif_notify_work(blkif_t *blkif) @@ -318,10 +351,11 @@ irqreturn_t blkif_be_int(int irq, void *dev_id) -/****************************************************************** - * DOWNWARD CALLS -- These interface with the block-device layer proper. +/* + * Function to copy the from the ring buffer the 'struct blkif_request' + * (which has the sectors we want, number of them, grant references, etc), + * and transmute it to the block API to hand it over to the proper block disk. */ - static int do_block_io_op(blkif_t *blkif) { union blkif_back_rings *blk_rings = &blkif->blk_rings; @@ -400,6 +434,10 @@ static int do_block_io_op(blkif_t *blkif) return more_to_do; } +/* + * Transumation of the 'struct blkif_request' to a proper 'struct bio' + * and call the 'submit_bio' to pass it to the underlaying storage. + */ static void dispatch_rw_block_io(blkif_t *blkif, struct blkif_request *req, pending_req_t *pending_req) @@ -429,7 +467,7 @@ static void dispatch_rw_block_io(blkif_t *blkif, BUG(); } - /* Check that number of segments is sane. */ + /* Check that the number of segments is sane. */ nseg = req->nr_segments; if (unlikely(nseg == 0 && operation != WRITE_BARRIER) || unlikely(nseg > BLKIF_MAX_SEGMENTS_PER_REQUEST)) { @@ -447,12 +485,14 @@ static void dispatch_rw_block_io(blkif_t *blkif, pending_req->status = BLKIF_RSP_OKAY; pending_req->nr_pages = nseg; + /* Fill out preq.nr_sects with proper amount of sectors, and setup + * assign map[..] with the PFN of the page in our domain with the + * corresponding grant reference for each page.*/ for (i = 0; i < nseg; i++) { uint32_t flags; seg[i].nsec = req->u.rw.seg[i].last_sect - req->u.rw.seg[i].first_sect + 1; - if ((req->u.rw.seg[i].last_sect >= (PAGE_SIZE >> 9)) || (req->u.rw.seg[i].last_sect < req->u.rw.seg[i].first_sect)) goto fail_response; @@ -468,6 +508,9 @@ static void dispatch_rw_block_io(blkif_t *blkif, ret = HYPERVISOR_grant_table_op(GNTTABOP_map_grant_ref, map, nseg); BUG_ON(ret); + /* Now swizzel the MFN in our domain with the MFN from the other domain + * so that when we access vaddr(pending_req,i) it has the contents of the + * page from the other domain. */ for (i = 0; i < nseg; i++) { if (unlikely(map[i].status != 0)) { DPRINTK("invalid buffer -- could not remap it\n"); @@ -485,6 +528,7 @@ static void dispatch_rw_block_io(blkif_t *blkif, if (ret) { printk(KERN_ALERT "Failed to install M2P override for"\ " %lx (ret: %d)\n", (unsigned long)map[i].dev_bus_addr, ret); + /* We could switch over to GNTTABOP_copy */ continue; } @@ -492,6 +536,9 @@ static void dispatch_rw_block_io(blkif_t *blkif, (req->u.rw.seg[i].first_sect << 9); } + /* If we have failed at this point, we need to undo the M2P override, set + * gnttab_set_unmap_op on all of the grant references and perform the + * hypercall to unmap the grants - that is all done in fast_flush_area. 
*/ if (ret) goto fail_flush; @@ -503,7 +550,11 @@ static void dispatch_rw_block_io(blkif_t *blkif, goto fail_flush; } + /* Get a reference count for the disk queue and start sending I/O */ plug_queue(blkif, preq.bdev); + + /* We set it one so that the last submit_bio does not have to call + * atomic_inc. */ atomic_set(&pending_req->pendcnt, 1); blkif_get(blkif); @@ -524,7 +575,7 @@ static void dispatch_rw_block_io(blkif_t *blkif, atomic_inc(&pending_req->pendcnt); submit_bio(operation, bio); } - + bio = bio_alloc(GFP_KERNEL, nseg-i); if (unlikely(bio == NULL)) goto fail_put_bio; @@ -538,6 +589,7 @@ static void dispatch_rw_block_io(blkif_t *blkif, preq.sector_number += seg[i].nsec; } + /* This will be hit if the operation was a barrier. */ if (!bio) { BUG_ON(operation != WRITE_BARRIER); bio = bio_alloc(GFP_KERNEL, 0); @@ -578,11 +630,9 @@ static void dispatch_rw_block_io(blkif_t *blkif, -/****************************************************************** - * MISCELLANEOUS SETUP / TEARDOWN / DEBUGGING +/* + * Put a response on the ring on how the operation fared. */ - - static void make_response(blkif_t *blkif, u64 id, unsigned short op, int st) { diff --git a/drivers/xen/blkback/common.h b/drivers/xen/blkback/common.h index 0f91830f18c8..4c140c8e75bd 100644 --- a/drivers/xen/blkback/common.h +++ b/drivers/xen/blkback/common.h @@ -76,6 +76,7 @@ typedef struct blkif_st { atomic_t refcnt; wait_queue_head_t wq; + /* One thread per one blkif. */ struct task_struct *xenblkd; unsigned int waiting_reqs; struct request_queue *plug; diff --git a/drivers/xen/blkback/interface.c b/drivers/xen/blkback/interface.c index e397a4134f1b..a4a15350737f 100644 --- a/drivers/xen/blkback/interface.c +++ b/drivers/xen/blkback/interface.c @@ -1,6 +1,4 @@ /****************************************************************************** - * arch/xen/drivers/blkif/backend/interface.c - * * Block-device interface management. * * Copyright (c) 2004, Keir Fraser diff --git a/drivers/xen/blkback/vbd.c b/drivers/xen/blkback/vbd.c index 8c91a2fb0019..95156c95ab2f 100644 --- a/drivers/xen/blkback/vbd.c +++ b/drivers/xen/blkback/vbd.c @@ -1,6 +1,4 @@ /****************************************************************************** - * blkback/vbd.c - * * Routines for managing virtual block devices (VBDs). * * Copyright (c) 2003-2005, Keir Fraser & Steve Hand diff --git a/drivers/xen/blkback/xenbus.c b/drivers/xen/blkback/xenbus.c index 031bc3d7eec3..e9c4f80ef1c8 100644 --- a/drivers/xen/blkback/xenbus.c +++ b/drivers/xen/blkback/xenbus.c @@ -107,7 +107,7 @@ static void update_blkif_status(blkif_t *blkif) } -/**************************************************************** +/* * sysfs interface for VBD I/O requests */ From 5489377ce40d52fb722dcd811617114cebad7bba Mon Sep 17 00:00:00 2001 From: Konrad Rzeszutek Wilk Date: Thu, 14 Apr 2011 17:21:50 -0400 Subject: [PATCH 027/110] xen/blkback: blkif->struct blkif_st checkpatch.pl suggested that we don't use the typdef in common.h and this triggered this avalanche of patches. 
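checkpatch.pl flags new typedefs of plain structs because the bare struct tag already says what is being passed around; the rename is mechanical, blkif_t becoming struct blkif_st at every use site in the hunks that follow. In miniature, with a made-up type rather than the driver's:

    /* Typedef'd struct vs. bare struct tag -- toy example, not the driver's types. */
    #include <stdio.h>

    /* Before: the typedef hides what 'example_t' really is. */
    typedef struct example_st {
        int handle;
    } example_t;

    static int old_style(example_t *ex)          { return ex->handle; }

    /* After: every use site says it is a struct. */
    static int new_style(struct example_st *ex)  { return ex->handle; }

    int main(void)
    {
        struct example_st ex = { .handle = 7 };

        printf("%d %d\n", old_style(&ex), new_style(&ex));
        return 0;
    }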
Signed-off-by: Konrad Rzeszutek Wilk --- drivers/xen/blkback/blkback.c | 24 ++++++++++++------------ drivers/xen/blkback/common.h | 23 ++++++++++++----------- drivers/xen/blkback/interface.c | 16 ++++++++-------- drivers/xen/blkback/vbd.c | 6 +++--- drivers/xen/blkback/xenbus.c | 6 +++--- 5 files changed, 38 insertions(+), 37 deletions(-) diff --git a/drivers/xen/blkback/blkback.c b/drivers/xen/blkback/blkback.c index 8a4b1e8eeb62..d07ad5318a85 100644 --- a/drivers/xen/blkback/blkback.c +++ b/drivers/xen/blkback/blkback.c @@ -75,7 +75,7 @@ module_param(debug_lvl, int, 0644); * response queued for it, with the saved 'id' passed back. */ typedef struct { - blkif_t *blkif; + struct blkif_st *blkif; u64 id; int nr_pages; atomic_t pendcnt; @@ -123,11 +123,11 @@ static inline unsigned long vaddr(pending_req_t *req, int seg) (blkbk->pending_grant_handles[vaddr_pagenr(_req, _seg)]) -static int do_block_io_op(blkif_t *blkif); -static void dispatch_rw_block_io(blkif_t *blkif, +static int do_block_io_op(struct blkif_st *blkif); +static void dispatch_rw_block_io(struct blkif_st *blkif, struct blkif_request *req, pending_req_t *pending_req); -static void make_response(blkif_t *blkif, u64 id, +static void make_response(struct blkif_st *blkif, u64 id, unsigned short op, int st); /* @@ -169,7 +169,7 @@ static void free_req(pending_req_t *req) * It is OK to make multiple calls in this function as it * resets the plug to NULL when it is done on the first call. */ -static void unplug_queue(blkif_t *blkif) +static void unplug_queue(struct blkif_st *blkif) { if (blkif->plug == NULL) return; @@ -185,7 +185,7 @@ static void unplug_queue(blkif_t *blkif) * not to double reference. We also give back a reference count * if it corresponds to another queue. */ -static void plug_queue(blkif_t *blkif, struct block_device *bdev) +static void plug_queue(struct blkif_st *blkif, struct block_device *bdev) { struct request_queue *q = bdev_get_queue(bdev); @@ -237,7 +237,7 @@ static void fast_flush_area(pending_req_t *req) * SCHEDULER FUNCTIONS */ -static void print_stats(blkif_t *blkif) +static void print_stats(struct blkif_st *blkif) { printk(KERN_DEBUG "%s: oo %3d | rd %4d | wr %4d | br %4d\n", current->comm, blkif->st_oo_req, @@ -250,7 +250,7 @@ static void print_stats(blkif_t *blkif) int blkif_schedule(void *arg) { - blkif_t *blkif = arg; + struct blkif_st *blkif = arg; struct vbd *vbd = &blkif->vbd; blkif_get(blkif); @@ -337,7 +337,7 @@ static void end_block_io_op(struct bio *bio, int error) * Notification from the guest OS. */ -static void blkif_notify_work(blkif_t *blkif) +static void blkif_notify_work(struct blkif_st *blkif) { blkif->waiting_reqs = 1; wake_up(&blkif->wq); @@ -356,7 +356,7 @@ irqreturn_t blkif_be_int(int irq, void *dev_id) * (which has the sectors we want, number of them, grant references, etc), * and transmute it to the block API to hand it over to the proper block disk. */ -static int do_block_io_op(blkif_t *blkif) +static int do_block_io_op(struct blkif_st *blkif) { union blkif_back_rings *blk_rings = &blkif->blk_rings; struct blkif_request req; @@ -438,7 +438,7 @@ static int do_block_io_op(blkif_t *blkif) * Transumation of the 'struct blkif_request' to a proper 'struct bio' * and call the 'submit_bio' to pass it to the underlaying storage. 
*/ -static void dispatch_rw_block_io(blkif_t *blkif, +static void dispatch_rw_block_io(struct blkif_st *blkif, struct blkif_request *req, pending_req_t *pending_req) { @@ -633,7 +633,7 @@ static void dispatch_rw_block_io(blkif_t *blkif, /* * Put a response on the ring on how the operation fared. */ -static void make_response(blkif_t *blkif, u64 id, +static void make_response(struct blkif_st *blkif, u64 id, unsigned short op, int st) { struct blkif_response resp; diff --git a/drivers/xen/blkback/common.h b/drivers/xen/blkback/common.h index 4c140c8e75bd..be3fc93d8a31 100644 --- a/drivers/xen/blkback/common.h +++ b/drivers/xen/blkback/common.h @@ -34,7 +34,7 @@ #include #include #include -#include +#include #include #include #include @@ -44,7 +44,7 @@ #define DPRINTK(_f, _a...) \ pr_debug("(file=%s, line=%d) " _f, \ - __FILE__ , __LINE__ , ## _a ) + __FILE__ , __LINE__ , ## _a) struct vbd { blkif_vdev_t handle; /* what the domain refers to this vbd as */ @@ -57,7 +57,7 @@ struct vbd { struct backend_info; -typedef struct blkif_st { +struct blkif_st { /* Unique identifier for this interface. */ domid_t domid; unsigned int handle; @@ -94,13 +94,14 @@ typedef struct blkif_st { grant_handle_t shmem_handle; grant_ref_t shmem_ref; -} blkif_t; +}; -blkif_t *blkif_alloc(domid_t domid); -void blkif_disconnect(blkif_t *blkif); -void blkif_free(blkif_t *blkif); -int blkif_map(blkif_t *blkif, unsigned long shared_page, unsigned int evtchn); -void vbd_resize(blkif_t *blkif); +struct blkif_st *blkif_alloc(domid_t domid); +void blkif_disconnect(struct blkif_st *blkif); +void blkif_free(struct blkif_st *blkif); +int blkif_map(struct blkif_st *blkif, unsigned long shared_page, + unsigned int evtchn); +void vbd_resize(struct blkif_st *blkif); #define blkif_get(_b) (atomic_inc(&(_b)->refcnt)) #define blkif_put(_b) \ @@ -110,7 +111,7 @@ void vbd_resize(blkif_t *blkif); } while (0) /* Create a vbd. 
*/ -int vbd_create(blkif_t *blkif, blkif_vdev_t vdevice, unsigned major, +int vbd_create(struct blkif_st *blkif, blkif_vdev_t vdevice, unsigned major, unsigned minor, int readonly, int cdrom); void vbd_free(struct vbd *vbd); @@ -125,7 +126,7 @@ struct phys_req { blkif_sector_t sector_number; }; -int vbd_translate(struct phys_req *req, blkif_t *blkif, int operation); +int vbd_translate(struct phys_req *req, struct blkif_st *blkif, int operation); int blkif_interface_init(void); diff --git a/drivers/xen/blkback/interface.c b/drivers/xen/blkback/interface.c index a4a15350737f..7d59f13115cf 100644 --- a/drivers/xen/blkback/interface.c +++ b/drivers/xen/blkback/interface.c @@ -35,9 +35,9 @@ static struct kmem_cache *blkif_cachep; -blkif_t *blkif_alloc(domid_t domid) +struct blkif_st *blkif_alloc(domid_t domid) { - blkif_t *blkif; + struct blkif_st *blkif; blkif = kmem_cache_alloc(blkif_cachep, GFP_KERNEL); if (!blkif) @@ -54,7 +54,7 @@ blkif_t *blkif_alloc(domid_t domid) return blkif; } -static int map_frontend_page(blkif_t *blkif, unsigned long shared_page) +static int map_frontend_page(struct blkif_st *blkif, unsigned long shared_page) { struct gnttab_map_grant_ref op; @@ -75,7 +75,7 @@ static int map_frontend_page(blkif_t *blkif, unsigned long shared_page) return 0; } -static void unmap_frontend_page(blkif_t *blkif) +static void unmap_frontend_page(struct blkif_st *blkif) { struct gnttab_unmap_grant_ref op; @@ -86,7 +86,7 @@ static void unmap_frontend_page(blkif_t *blkif) BUG(); } -int blkif_map(blkif_t *blkif, unsigned long shared_page, unsigned int evtchn) +int blkif_map(struct blkif_st *blkif, unsigned long shared_page, unsigned int evtchn) { int err; @@ -143,7 +143,7 @@ int blkif_map(blkif_t *blkif, unsigned long shared_page, unsigned int evtchn) return 0; } -void blkif_disconnect(blkif_t *blkif) +void blkif_disconnect(struct blkif_st *blkif) { if (blkif->xenblkd) { kthread_stop(blkif->xenblkd); @@ -166,7 +166,7 @@ void blkif_disconnect(blkif_t *blkif) } } -void blkif_free(blkif_t *blkif) +void blkif_free(struct blkif_st *blkif) { if (!atomic_dec_and_test(&blkif->refcnt)) BUG(); @@ -175,7 +175,7 @@ void blkif_free(blkif_t *blkif) int __init blkif_interface_init(void) { - blkif_cachep = kmem_cache_create("blkif_cache", sizeof(blkif_t), + blkif_cachep = kmem_cache_create("blkif_cache", sizeof(struct blkif_st), 0, 0, NULL); if (!blkif_cachep) return -ENOMEM; diff --git a/drivers/xen/blkback/vbd.c b/drivers/xen/blkback/vbd.c index 95156c95ab2f..26a37df8173a 100644 --- a/drivers/xen/blkback/vbd.c +++ b/drivers/xen/blkback/vbd.c @@ -48,7 +48,7 @@ unsigned long vbd_secsize(struct vbd *vbd) return bdev_logical_block_size(vbd->bdev); } -int vbd_create(blkif_t *blkif, blkif_vdev_t handle, unsigned major, +int vbd_create(struct blkif_st *blkif, blkif_vdev_t handle, unsigned major, unsigned minor, int readonly, int cdrom) { struct vbd *vbd; @@ -97,7 +97,7 @@ void vbd_free(struct vbd *vbd) vbd->bdev = NULL; } -int vbd_translate(struct phys_req *req, blkif_t *blkif, int operation) +int vbd_translate(struct phys_req *req, struct blkif_st *blkif, int operation) { struct vbd *vbd = &blkif->vbd; int rc = -EACCES; @@ -116,7 +116,7 @@ int vbd_translate(struct phys_req *req, blkif_t *blkif, int operation) return rc; } -void vbd_resize(blkif_t *blkif) +void vbd_resize(struct blkif_st *blkif) { struct vbd *vbd = &blkif->vbd; struct xenbus_transaction xbt; diff --git a/drivers/xen/blkback/xenbus.c b/drivers/xen/blkback/xenbus.c index e9c4f80ef1c8..67462c4e9ab4 100644 --- a/drivers/xen/blkback/xenbus.c +++ 
b/drivers/xen/blkback/xenbus.c @@ -30,7 +30,7 @@ struct backend_info { struct xenbus_device *dev; - blkif_t *blkif; + struct blkif_st *blkif; struct xenbus_watch backend_watch; unsigned major; unsigned minor; @@ -47,7 +47,7 @@ struct xenbus_device *blkback_xenbus(struct backend_info *be) return be->dev; } -static int blkback_name(blkif_t *blkif, char *buf) +static int blkback_name(struct blkif_st *blkif, char *buf) { char *devpath, *devname; struct xenbus_device *dev = blkif->be->dev; @@ -67,7 +67,7 @@ static int blkback_name(blkif_t *blkif, char *buf) return 0; } -static void update_blkif_status(blkif_t *blkif) +static void update_blkif_status(struct blkif_st *blkif) { int err; char name[TASK_COMM_LEN]; From 3c64b58cd614c976dcb19e16fa59ab620b3fe130 Mon Sep 17 00:00:00 2001 From: Konrad Rzeszutek Wilk Date: Thu, 14 Apr 2011 17:24:45 -0400 Subject: [PATCH 028/110] xen/blkback: Fix checkpatch warnings in vbd.c Signed-off-by: Konrad Rzeszutek Wilk --- drivers/xen/blkback/vbd.c | 11 ++++++----- 1 file changed, 6 insertions(+), 5 deletions(-) diff --git a/drivers/xen/blkback/vbd.c b/drivers/xen/blkback/vbd.c index 26a37df8173a..d0ff4cf91a34 100644 --- a/drivers/xen/blkback/vbd.c +++ b/drivers/xen/blkback/vbd.c @@ -30,8 +30,9 @@ #include "common.h" -#define vbd_sz(_v) ((_v)->bdev->bd_part ? \ - (_v)->bdev->bd_part->nr_sects : get_capacity((_v)->bdev->bd_disk)) +#define vbd_sz(_v) ((_v)->bdev->bd_part ? \ + (_v)->bdev->bd_part->nr_sects : \ + get_capacity((_v)->bdev->bd_disk)) unsigned long long vbd_size(struct vbd *vbd) { @@ -40,7 +41,7 @@ unsigned long long vbd_size(struct vbd *vbd) unsigned int vbd_info(struct vbd *vbd) { - return vbd->type | (vbd->readonly?VDISK_READONLY:0); + return vbd->type | (vbd->readonly ? VDISK_READONLY : 0); } unsigned long vbd_secsize(struct vbd *vbd) @@ -126,7 +127,7 @@ void vbd_resize(struct blkif_st *blkif) printk(KERN_INFO "VBD Resize: Domid: %d, Device: (%d, %d)\n", blkif->domid, MAJOR(vbd->pdevice), MINOR(vbd->pdevice)); - printk(KERN_INFO "VBD Resize: new size %Lu\n", new_size); + printk(KERN_INFO "VBD Resize: new size %llu\n", new_size); vbd->size = new_size; again: err = xenbus_transaction_start(&xbt); @@ -134,7 +135,7 @@ again: printk(KERN_WARNING "Error starting transaction"); return; } - err = xenbus_printf(xbt, dev->nodename, "sectors", "%Lu", + err = xenbus_printf(xbt, dev->nodename, "sectors", "%llu", vbd_size(vbd)); if (err) { printk(KERN_WARNING "Error writing new size"); From e5f4b3c498623fc3d83f6d92e00a2b2dbf500cd0 Mon Sep 17 00:00:00 2001 From: Konrad Rzeszutek Wilk Date: Thu, 14 Apr 2011 17:27:29 -0400 Subject: [PATCH 029/110] xen/blkback: Fix interface.c checkpatch warnings .. except + sring_x86_64 = (struct blkif_x86_64_sring *)blkif->blk_ring_area->addr; WARNING: line over 80 characters + BACK_RING_INIT(&blkif->blk_rings.x86_64, sring_x86_64, PAGE_SIZE); as breaking them up really does not help that much. 
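The warnings fixed below are the usual style ones: an assignment buried inside an if () condition, an opening brace on its own line, and over-long lines from unbroken function signatures. A small self-contained illustration of the preferred shape (generic names, not the driver's code):

    #include <stdlib.h>

    /* was: if ((buf = malloc(len)) == NULL) return -1; */
    static int grab_buffer(void **out, size_t len)
    {
            void *buf;

            buf = malloc(len);
            if (!buf)
                    return -1;

            *out = buf;
            return 0;
    }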
Signed-off-by: Konrad Rzeszutek Wilk --- drivers/xen/blkback/interface.c | 9 +++++---- 1 file changed, 5 insertions(+), 4 deletions(-) diff --git a/drivers/xen/blkback/interface.c b/drivers/xen/blkback/interface.c index 7d59f13115cf..163aed41e825 100644 --- a/drivers/xen/blkback/interface.c +++ b/drivers/xen/blkback/interface.c @@ -86,7 +86,8 @@ static void unmap_frontend_page(struct blkif_st *blkif) BUG(); } -int blkif_map(struct blkif_st *blkif, unsigned long shared_page, unsigned int evtchn) +int blkif_map(struct blkif_st *blkif, unsigned long shared_page, + unsigned int evtchn) { int err; @@ -94,7 +95,8 @@ int blkif_map(struct blkif_st *blkif, unsigned long shared_page, unsigned int ev if (blkif->irq) return 0; - if ( (blkif->blk_ring_area = alloc_vm_area(PAGE_SIZE)) == NULL ) + blkif->blk_ring_area = alloc_vm_area(PAGE_SIZE); + if (!blkif->blk_ring_area) return -ENOMEM; err = map_frontend_page(blkif, shared_page); @@ -131,8 +133,7 @@ int blkif_map(struct blkif_st *blkif, unsigned long shared_page, unsigned int ev err = bind_interdomain_evtchn_to_irqhandler( blkif->domid, evtchn, blkif_be_int, 0, "blkif-backend", blkif); - if (err < 0) - { + if (err < 0) { unmap_frontend_page(blkif); free_vm_area(blkif->blk_ring_area); blkif->blk_rings.common.sring = NULL; From d6091b217dd4fdabc4a8cd6fa61775f1e3eb6efe Mon Sep 17 00:00:00 2001 From: Konrad Rzeszutek Wilk Date: Thu, 14 Apr 2011 17:33:30 -0400 Subject: [PATCH 030/110] xen/blkback: Fix checkpatch warnings of xenbus.c Signed-off-by: Konrad Rzeszutek Wilk --- drivers/xen/blkback/xenbus.c | 21 ++++++++++++--------- 1 file changed, 12 insertions(+), 9 deletions(-) diff --git a/drivers/xen/blkback/xenbus.c b/drivers/xen/blkback/xenbus.c index 67462c4e9ab4..b41ed65db2d3 100644 --- a/drivers/xen/blkback/xenbus.c +++ b/drivers/xen/blkback/xenbus.c @@ -25,10 +25,9 @@ #undef DPRINTK #define DPRINTK(fmt, args...) \ pr_debug("blkback/xenbus (%s:%d) " fmt ".\n", \ - __FUNCTION__, __LINE__, ##args) + __func__, __LINE__, ##args) -struct backend_info -{ +struct backend_info { struct xenbus_device *dev; struct blkif_st *blkif; struct xenbus_watch backend_watch; @@ -56,7 +55,8 @@ static int blkback_name(struct blkif_st *blkif, char *buf) if (IS_ERR(devpath)) return PTR_ERR(devpath); - if ((devname = strstr(devpath, "/dev/")) != NULL) + devname = strstr(devpath, "/dev/"); + if (devname != NULL) devname += strlen("/dev/"); else devname = devpath; @@ -153,7 +153,7 @@ int xenvbd_sysfs_addif(struct xenbus_device *dev) int error; error = device_create_file(&dev->dev, &dev_attr_physical_device); - if (error) + if (error) goto fail1; error = device_create_file(&dev->dev, &dev_attr_mode); @@ -327,7 +327,10 @@ static void backend_changed(struct xenbus_watch *watch, /* Front end dir is a number, which is used as the handle. 
*/ char *p = strrchr(dev->otherend, '/') + 1; - long handle = simple_strtoul(p, NULL, 0); + long handle; + err = strict_strtoul(p, 0, &handle); + if (err) + return; be->major = major; be->minor = minor; @@ -369,7 +372,7 @@ static void frontend_changed(struct xenbus_device *dev, case XenbusStateInitialising: if (dev->state == XenbusStateClosed) { printk(KERN_INFO "%s: %s: prepare for reconnect\n", - __FUNCTION__, dev->nodename); + __func__, dev->nodename); xenbus_switch_state(dev, XenbusStateInitWait); } break; @@ -494,8 +497,8 @@ static int connect_ring(struct backend_info *be) DPRINTK("%s", dev->otherend); - err = xenbus_gather(XBT_NIL, dev->otherend, "ring-ref", "%lu", &ring_ref, - "event-channel", "%u", &evtchn, NULL); + err = xenbus_gather(XBT_NIL, dev->otherend, "ring-ref", "%lu", + &ring_ref, "event-channel", "%u", &evtchn, NULL); if (err) { xenbus_dev_fatal(dev, err, "reading %s/ring-ref and event-channel", From 2e9977c21f7679d5f616132ae1f7857e932ccd19 Mon Sep 17 00:00:00 2001 From: Konrad Rzeszutek Wilk Date: Thu, 14 Apr 2011 17:42:07 -0400 Subject: [PATCH 031/110] xen/blkback: Fix checkpatch warnings in blkback.c Signed-off-by: Konrad Rzeszutek Wilk --- drivers/xen/blkback/blkback.c | 81 ++++++++++++++++++++--------------- 1 file changed, 47 insertions(+), 34 deletions(-) diff --git a/drivers/xen/blkback/blkback.c b/drivers/xen/blkback/blkback.c index d07ad5318a85..2d413930f235 100644 --- a/drivers/xen/blkback/blkback.c +++ b/drivers/xen/blkback/blkback.c @@ -63,8 +63,8 @@ module_param_named(reqs, blkif_reqs, int, 0); MODULE_PARM_DESC(reqs, "Number of blkback requests to allocate"); /* Run-time switchable: /sys/module/blkback/parameters/ */ -static unsigned int log_stats = 0; -static unsigned int debug_lvl = 0; +static unsigned int log_stats; +static unsigned int debug_lvl; module_param(log_stats, int, 0644); module_param(debug_lvl, int, 0644); @@ -74,7 +74,7 @@ module_param(debug_lvl, int, 0644); * the pendcnt towards zero. When it hits zero, the specified domain has a * response queued for it, with the saved 'id' passed back. */ -typedef struct { +struct pending_req { struct blkif_st *blkif; u64 id; int nr_pages; @@ -82,12 +82,12 @@ typedef struct { unsigned short operation; int status; struct list_head free_list; -} pending_req_t; +}; #define BLKBACK_INVALID_HANDLE (~0) struct xen_blkbk { - pending_req_t *pending_reqs; + struct pending_req *pending_reqs; /* List of all 'pending_req' available */ struct list_head pending_free; /* And its spinlock. */ @@ -106,14 +106,15 @@ static struct xen_blkbk *blkbk; * pending_pages[..]. For each 'pending_req' we have have up to * BLKIF_MAX_SEGMENTS_PER_REQUEST (11) pages. The seg would be from 0 through * 10 and would index in the pending_pages[..]. 
*/ -static inline int vaddr_pagenr(pending_req_t *req, int seg) +static inline int vaddr_pagenr(struct pending_req *req, int seg) { - return (req - blkbk->pending_reqs) * BLKIF_MAX_SEGMENTS_PER_REQUEST + seg; + return (req - blkbk->pending_reqs) * + BLKIF_MAX_SEGMENTS_PER_REQUEST + seg; } #define pending_page(req, seg) pending_pages[vaddr_pagenr(req, seg)] -static inline unsigned long vaddr(pending_req_t *req, int seg) +static inline unsigned long vaddr(struct pending_req *req, int seg) { unsigned long pfn = page_to_pfn(blkbk->pending_page(req, seg)); return (unsigned long)pfn_to_kaddr(pfn); @@ -126,21 +127,22 @@ static inline unsigned long vaddr(pending_req_t *req, int seg) static int do_block_io_op(struct blkif_st *blkif); static void dispatch_rw_block_io(struct blkif_st *blkif, struct blkif_request *req, - pending_req_t *pending_req); + struct pending_req *pending_req); static void make_response(struct blkif_st *blkif, u64 id, unsigned short op, int st); /* * Retrieve from the 'pending_reqs' a free pending_req structure to be used. */ -static pending_req_t* alloc_req(void) +static struct pending_req *alloc_req(void) { - pending_req_t *req = NULL; + struct pending_req *req = NULL; unsigned long flags; spin_lock_irqsave(&blkbk->pending_free_lock, flags); if (!list_empty(&blkbk->pending_free)) { - req = list_entry(blkbk->pending_free.next, pending_req_t, free_list); + req = list_entry(blkbk->pending_free.next, struct pending_req, + free_list); list_del(&req->free_list); } spin_unlock_irqrestore(&blkbk->pending_free_lock, flags); @@ -151,7 +153,7 @@ static pending_req_t* alloc_req(void) * Return the 'pending_req' structure back to the freepool. We also * wake up the thread if it was waiting for a free page. */ -static void free_req(pending_req_t *req) +static void free_req(struct pending_req *req) { unsigned long flags; int was_empty; @@ -200,7 +202,7 @@ static void plug_queue(struct blkif_st *blkif, struct block_device *bdev) * Unmap the grant references, and also remove the M2P over-rides * used in the 'pending_req'. */ -static void fast_flush_area(pending_req_t *req) +static void fast_flush_area(struct pending_req *req) { struct gnttab_unmap_grant_ref unmap[BLKIF_MAX_SEGMENTS_PER_REQUEST]; unsigned int i, invcount = 0; @@ -221,7 +223,8 @@ static void fast_flush_area(pending_req_t *req) GNTTABOP_unmap_grant_ref, unmap, invcount); BUG_ON(ret); /* Note, we use invcount, so nr->pages, so we can't index - * using vaddr(req, i). */ + * using vaddr(req, i). + */ for (i = 0; i < invcount; i++) { ret = m2p_remove_override( virt_to_page(unmap[i].host_addr), false); @@ -233,7 +236,7 @@ static void fast_flush_area(pending_req_t *req) } } -/****************************************************************** +/* * SCHEDULER FUNCTIONS */ @@ -269,7 +272,8 @@ int blkif_schedule(void *arg) blkif->waiting_reqs || kthread_should_stop()); wait_event_interruptible( blkbk->pending_free_wq, - !list_empty(&blkbk->pending_free) || kthread_should_stop()); + !list_empty(&blkbk->pending_free) || + kthread_should_stop()); blkif->waiting_reqs = 0; smp_mb(); /* clear flag *before* checking for work */ @@ -297,7 +301,7 @@ int blkif_schedule(void *arg) * Completion callback on the bio's. Called as bh->b_end_io() */ -static void __end_block_io_op(pending_req_t *pending_req, int error) +static void __end_block_io_op(struct pending_req *pending_req, int error) { /* An error fails the entire request. 
*/ if ((pending_req->operation == BLKIF_OP_WRITE_BARRIER) && @@ -313,7 +317,8 @@ static void __end_block_io_op(pending_req_t *pending_req, int error) /* If all of the bio's have completed it is time to unmap * the grant references associated with 'request' and provide - * the proper response on the ring. */ + * the proper response on the ring. + */ if (atomic_dec_and_test(&pending_req->pendcnt)) { fast_flush_area(pending_req); make_response(pending_req->blkif, pending_req->id, @@ -360,7 +365,7 @@ static int do_block_io_op(struct blkif_st *blkif) { union blkif_back_rings *blk_rings = &blkif->blk_rings; struct blkif_request req; - pending_req_t *pending_req; + struct pending_req *pending_req; RING_IDX rc, rp; int more_to_do = 0; @@ -440,7 +445,7 @@ static int do_block_io_op(struct blkif_st *blkif) */ static void dispatch_rw_block_io(struct blkif_st *blkif, struct blkif_request *req, - pending_req_t *pending_req) + struct pending_req *pending_req) { struct gnttab_map_grant_ref map[BLKIF_MAX_SEGMENTS_PER_REQUEST]; struct phys_req preq; @@ -487,7 +492,8 @@ static void dispatch_rw_block_io(struct blkif_st *blkif, /* Fill out preq.nr_sects with proper amount of sectors, and setup * assign map[..] with the PFN of the page in our domain with the - * corresponding grant reference for each page.*/ + * corresponding grant reference for each page. + */ for (i = 0; i < nseg; i++) { uint32_t flags; @@ -509,8 +515,9 @@ static void dispatch_rw_block_io(struct blkif_st *blkif, BUG_ON(ret); /* Now swizzel the MFN in our domain with the MFN from the other domain - * so that when we access vaddr(pending_req,i) it has the contents of the - * page from the other domain. */ + * so that when we access vaddr(pending_req,i) it has the contents of + * the page from the other domain. + */ for (i = 0; i < nseg; i++) { if (unlikely(map[i].status != 0)) { DPRINTK("invalid buffer -- could not remap it\n"); @@ -522,12 +529,13 @@ static void dispatch_rw_block_io(struct blkif_st *blkif, if (ret) continue; - + ret = m2p_add_override(PFN_DOWN(map[i].dev_bus_addr), blkbk->pending_page(pending_req, i), false); if (ret) { printk(KERN_ALERT "Failed to install M2P override for"\ - " %lx (ret: %d)\n", (unsigned long)map[i].dev_bus_addr, ret); + " %lx (ret: %d)\n", (unsigned long) + map[i].dev_bus_addr, ret); /* We could switch over to GNTTABOP_copy */ continue; } @@ -536,9 +544,11 @@ static void dispatch_rw_block_io(struct blkif_st *blkif, (req->u.rw.seg[i].first_sect << 9); } - /* If we have failed at this point, we need to undo the M2P override, set - * gnttab_set_unmap_op on all of the grant references and perform the - * hypercall to unmap the grants - that is all done in fast_flush_area. */ + /* If we have failed at this point, we need to undo the M2P override, + * set gnttab_set_unmap_op on all of the grant references and perform + * the hypercall to unmap the grants - that is all done in + * fast_flush_area. + */ if (ret) goto fail_flush; @@ -554,7 +564,8 @@ static void dispatch_rw_block_io(struct blkif_st *blkif, plug_queue(blkif, preq.bdev); /* We set it one so that the last submit_bio does not have to call - * atomic_inc. */ + * atomic_inc. 
+ */ atomic_set(&pending_req->pendcnt, 1); blkif_get(blkif); @@ -575,7 +586,7 @@ static void dispatch_rw_block_io(struct blkif_st *blkif, atomic_inc(&pending_req->pendcnt); submit_bio(operation, bio); } - + bio = bio_alloc(GFP_KERNEL, nseg-i); if (unlikely(bio == NULL)) goto fail_put_bio; @@ -694,7 +705,7 @@ static int __init blkif_init(void) if (!xen_pv_domain()) return -ENODEV; - blkbk = (struct xen_blkbk *)kzalloc(sizeof(struct xen_blkbk), GFP_KERNEL); + blkbk = kzalloc(sizeof(struct xen_blkbk), GFP_KERNEL); if (!blkbk) { printk(KERN_ALERT "%s: out of memory!\n", __func__); return -ENOMEM; @@ -709,7 +720,8 @@ static int __init blkif_init(void) blkbk->pending_pages = kzalloc(sizeof(blkbk->pending_pages[0]) * mmap_pages, GFP_KERNEL); - if (!blkbk->pending_reqs || !blkbk->pending_grant_handles || !blkbk->pending_pages) { + if (!blkbk->pending_reqs || !blkbk->pending_grant_handles || + !blkbk->pending_pages) { rc = -ENOMEM; goto out_of_memory; } @@ -733,7 +745,8 @@ static int __init blkif_init(void) init_waitqueue_head(&blkbk->pending_free_wq); for (i = 0; i < blkif_reqs; i++) - list_add_tail(&blkbk->pending_reqs[i].free_list, &blkbk->pending_free); + list_add_tail(&blkbk->pending_reqs[i].free_list, + &blkbk->pending_free); rc = blkif_xenbus_init(); if (rc) From 0faa8cca883bbc6a0919e3c89128672659b75820 Mon Sep 17 00:00:00 2001 From: Konrad Rzeszutek Wilk Date: Thu, 14 Apr 2011 17:58:19 -0400 Subject: [PATCH 032/110] xen/blkback: remove per-queue plugging commit 7eaceaccab5f40bbfda044629a6298616aeaed50 ("block: remove per-queue plugging") added two new interfaces to plug and unplug: blk_start_plug and blk_finish_plug. Lets use those. Signed-off-by: Konrad Rzeszutek Wilk --- drivers/xen/blkback/blkback.c | 44 +++++++---------------------------- drivers/xen/blkback/common.h | 1 - 2 files changed, 9 insertions(+), 36 deletions(-) diff --git a/drivers/xen/blkback/blkback.c b/drivers/xen/blkback/blkback.c index 2d413930f235..464f2e0b5a61 100644 --- a/drivers/xen/blkback/blkback.c +++ b/drivers/xen/blkback/blkback.c @@ -166,38 +166,6 @@ static void free_req(struct pending_req *req) wake_up(&blkbk->pending_free_wq); } -/* - * Give back a reference count on the underlaying storage. - * It is OK to make multiple calls in this function as it - * resets the plug to NULL when it is done on the first call. - */ -static void unplug_queue(struct blkif_st *blkif) -{ - if (blkif->plug == NULL) - return; - if (blkif->plug->unplug_fn) - blkif->plug->unplug_fn(blkif->plug); - blk_put_queue(blkif->plug); - blkif->plug = NULL; -} - -/* - * Take a reference count on the underlaying storage. - * It is OK to call this multiple times as we check to make sure - * not to double reference. We also give back a reference count - * if it corresponds to another queue. - */ -static void plug_queue(struct blkif_st *blkif, struct block_device *bdev) -{ - struct request_queue *q = bdev_get_queue(bdev); - - if (q == blkif->plug) - return; - unplug_queue(blkif); - blk_get_queue(q); - blkif->plug = q; -} - /* * Unmap the grant references, and also remove the M2P over-rides * used in the 'pending_req'. 
@@ -280,7 +248,6 @@ int blkif_schedule(void *arg) if (do_block_io_op(blkif)) blkif->waiting_reqs = 1; - unplug_queue(blkif); if (log_stats && time_after(jiffies, blkif->st_print)) print_stats(blkif); @@ -456,6 +423,8 @@ static void dispatch_rw_block_io(struct blkif_st *blkif, struct bio *bio = NULL; int ret, i; int operation; + struct blk_plug plug; + struct request_queue *q; switch (req->operation) { case BLKIF_OP_READ: @@ -561,7 +530,8 @@ static void dispatch_rw_block_io(struct blkif_st *blkif, } /* Get a reference count for the disk queue and start sending I/O */ - plug_queue(blkif, preq.bdev); + blk_get_queue(q); + blk_start_plug(&plug); /* We set it one so that the last submit_bio does not have to call * atomic_inc. @@ -620,11 +590,14 @@ static void dispatch_rw_block_io(struct blkif_st *blkif, else if (operation == WRITE || operation == WRITE_BARRIER) blkif->st_wr_sect += preq.nr_sects; + blk_finish_plug(&plug); + blk_put_queue(q); return; fail_flush: fast_flush_area(pending_req); fail_response: + /* Haven't submitted any bio's yet. */ make_response(blkif, req->id, req->operation, BLKIF_RSP_ERROR); free_req(pending_req); msleep(1); /* back off a bit */ @@ -634,7 +607,8 @@ static void dispatch_rw_block_io(struct blkif_st *blkif, __end_block_io_op(pending_req, -EINVAL); if (bio) bio_put(bio); - unplug_queue(blkif); + blk_finish_plug(&plug); + blk_put_queue(q); msleep(1); /* back off a bit */ return; } diff --git a/drivers/xen/blkback/common.h b/drivers/xen/blkback/common.h index be3fc93d8a31..6257c1106591 100644 --- a/drivers/xen/blkback/common.h +++ b/drivers/xen/blkback/common.h @@ -79,7 +79,6 @@ struct blkif_st { /* One thread per one blkif. */ struct task_struct *xenblkd; unsigned int waiting_reqs; - struct request_queue *plug; /* statistics */ unsigned long st_print; From 7708992616487c00d5ca8ed7612111180d8e1b68 Mon Sep 17 00:00:00 2001 From: Konrad Rzeszutek Wilk Date: Fri, 15 Apr 2011 10:51:27 -0400 Subject: [PATCH 033/110] xen/blkback: Seperate the bio allocation and the bio submission. We seperate the bio allocation (bio_alloc) from the bio submission so that the error paths are much easier, and also so that the bio submission can be done in one tight loop. It also makes the plug/unplug calls much much easier. Signed-off-by: Konrad Rzeszutek Wilk --- drivers/xen/blkback/blkback.c | 45 ++++++++++++++++++----------------- 1 file changed, 23 insertions(+), 22 deletions(-) diff --git a/drivers/xen/blkback/blkback.c b/drivers/xen/blkback/blkback.c index 464f2e0b5a61..3c10499d61a7 100644 --- a/drivers/xen/blkback/blkback.c +++ b/drivers/xen/blkback/blkback.c @@ -421,7 +421,8 @@ static void dispatch_rw_block_io(struct blkif_st *blkif, } seg[BLKIF_MAX_SEGMENTS_PER_REQUEST]; unsigned int nseg; struct bio *bio = NULL; - int ret, i; + struct bio *biolist[BLKIF_MAX_SEGMENTS_PER_REQUEST]; + int ret, i, nbio = 0; int operation; struct blk_plug plug; struct request_queue *q; @@ -529,14 +530,7 @@ static void dispatch_rw_block_io(struct blkif_st *blkif, goto fail_flush; } - /* Get a reference count for the disk queue and start sending I/O */ - blk_get_queue(q); - blk_start_plug(&plug); - - /* We set it one so that the last submit_bio does not have to call - * atomic_inc. 
- */ - atomic_set(&pending_req->pendcnt, 1); + /* This corresponding blkif_put is done in __end_block_io_op */ blkif_get(blkif); for (i = 0; i < nseg; i++) { @@ -552,12 +546,8 @@ static void dispatch_rw_block_io(struct blkif_st *blkif, blkbk->pending_page(pending_req, i), seg[i].nsec << 9, seg[i].buf & ~PAGE_MASK) == 0)) { - if (bio) { - atomic_inc(&pending_req->pendcnt); - submit_bio(operation, bio); - } - bio = bio_alloc(GFP_KERNEL, nseg-i); + bio = biolist[nbio++] = bio_alloc(GFP_KERNEL, nseg-i); if (unlikely(bio == NULL)) goto fail_put_bio; @@ -573,7 +563,7 @@ static void dispatch_rw_block_io(struct blkif_st *blkif, /* This will be hit if the operation was a barrier. */ if (!bio) { BUG_ON(operation != WRITE_BARRIER); - bio = bio_alloc(GFP_KERNEL, 0); + bio = biolist[nbio++] = bio_alloc(GFP_KERNEL, 0); if (unlikely(bio == NULL)) goto fail_put_bio; @@ -583,15 +573,28 @@ static void dispatch_rw_block_io(struct blkif_st *blkif, bio->bi_sector = -1; } - submit_bio(operation, bio); + + /* We set it one so that the last submit_bio does not have to call + * atomic_inc. + */ + atomic_set(&pending_req->pendcnt, nbio); + + /* Get a reference count for the disk queue and start sending I/O */ + blk_get_queue(q); + blk_start_plug(&plug); + + for (i = 0; i < nbio; i++) + submit_bio(operation, biolist[i]); + + blk_finish_plug(&plug); + /* Let the I/Os go.. */ + blk_put_queue(q); if (operation == READ) blkif->st_rd_sect += preq.nr_sects; else if (operation == WRITE || operation == WRITE_BARRIER) blkif->st_wr_sect += preq.nr_sects; - blk_finish_plug(&plug); - blk_put_queue(q); return; fail_flush: @@ -604,11 +607,9 @@ static void dispatch_rw_block_io(struct blkif_st *blkif, return; fail_put_bio: + for (i = 0; i < (nbio-1); i++) + bio_put(biolist[i]); __end_block_io_op(pending_req, -EINVAL); - if (bio) - bio_put(bio); - blk_finish_plug(&plug); - blk_put_queue(q); msleep(1); /* back off a bit */ return; } From b0aef17924a06646403cae8eecf6c73219a63c19 Mon Sep 17 00:00:00 2001 From: Konrad Rzeszutek Wilk Date: Fri, 15 Apr 2011 10:58:05 -0400 Subject: [PATCH 034/110] xen/blkback: Cleanup move the code a bit around. Moving it so that the code that 'fast_flush_area' code is close to the code that deals with it so that the reader won't lose focus. Signed-off-by: Konrad Rzeszutek Wilk --- drivers/xen/blkback/blkback.c | 97 +++++++++++++++++------------------ 1 file changed, 47 insertions(+), 50 deletions(-) diff --git a/drivers/xen/blkback/blkback.c b/drivers/xen/blkback/blkback.c index 3c10499d61a7..f282463d7b5c 100644 --- a/drivers/xen/blkback/blkback.c +++ b/drivers/xen/blkback/blkback.c @@ -167,41 +167,18 @@ static void free_req(struct pending_req *req) } /* - * Unmap the grant references, and also remove the M2P over-rides - * used in the 'pending_req'. -*/ -static void fast_flush_area(struct pending_req *req) + * Notification from the guest OS. 
+ */ +static void blkif_notify_work(struct blkif_st *blkif) { - struct gnttab_unmap_grant_ref unmap[BLKIF_MAX_SEGMENTS_PER_REQUEST]; - unsigned int i, invcount = 0; - grant_handle_t handle; - int ret; + blkif->waiting_reqs = 1; + wake_up(&blkif->wq); +} - for (i = 0; i < req->nr_pages; i++) { - handle = pending_handle(req, i); - if (handle == BLKBACK_INVALID_HANDLE) - continue; - gnttab_set_unmap_op(&unmap[invcount], vaddr(req, i), - GNTMAP_host_map, handle); - pending_handle(req, i) = BLKBACK_INVALID_HANDLE; - invcount++; - } - - ret = HYPERVISOR_grant_table_op( - GNTTABOP_unmap_grant_ref, unmap, invcount); - BUG_ON(ret); - /* Note, we use invcount, so nr->pages, so we can't index - * using vaddr(req, i). - */ - for (i = 0; i < invcount; i++) { - ret = m2p_remove_override( - virt_to_page(unmap[i].host_addr), false); - if (ret) { - printk(KERN_ALERT "Failed to remove M2P override for " \ - "%lx\n", (unsigned long)unmap[i].host_addr); - continue; - } - } +irqreturn_t blkif_be_int(int irq, void *dev_id) +{ + blkif_notify_work(dev_id); + return IRQ_HANDLED; } /* @@ -264,6 +241,43 @@ int blkif_schedule(void *arg) return 0; } +/* + * Unmap the grant references, and also remove the M2P over-rides + * used in the 'pending_req'. +*/ +static void fast_flush_area(struct pending_req *req) +{ + struct gnttab_unmap_grant_ref unmap[BLKIF_MAX_SEGMENTS_PER_REQUEST]; + unsigned int i, invcount = 0; + grant_handle_t handle; + int ret; + + for (i = 0; i < req->nr_pages; i++) { + handle = pending_handle(req, i); + if (handle == BLKBACK_INVALID_HANDLE) + continue; + gnttab_set_unmap_op(&unmap[invcount], vaddr(req, i), + GNTMAP_host_map, handle); + pending_handle(req, i) = BLKBACK_INVALID_HANDLE; + invcount++; + } + + ret = HYPERVISOR_grant_table_op( + GNTTABOP_unmap_grant_ref, unmap, invcount); + BUG_ON(ret); + /* Note, we use invcount, so nr->pages, so we can't index + * using vaddr(req, i). + */ + for (i = 0; i < invcount; i++) { + ret = m2p_remove_override( + virt_to_page(unmap[i].host_addr), false); + if (ret) { + printk(KERN_ALERT "Failed to remove M2P override for " \ + "%lx\n", (unsigned long)unmap[i].host_addr); + continue; + } + } +} /* * Completion callback on the bio's. Called as bh->b_end_io() */ @@ -305,23 +319,6 @@ static void end_block_io_op(struct bio *bio, int error) } -/* - * Notification from the guest OS. - */ - -static void blkif_notify_work(struct blkif_st *blkif) -{ - blkif->waiting_reqs = 1; - wake_up(&blkif->wq); -} - -irqreturn_t blkif_be_int(int irq, void *dev_id) -{ - blkif_notify_work(dev_id); - return IRQ_HANDLED; -} - - /* * Function to copy the from the ring buffer the 'struct blkif_request' From 1a95fe6e42cefc52c62c471ad87d7fe8643231df Mon Sep 17 00:00:00 2001 From: Konrad Rzeszutek Wilk Date: Fri, 15 Apr 2011 11:35:13 -0400 Subject: [PATCH 035/110] xen/blkback: Shuffle code around (vbd_translate moved higher). We take out the chunk of code dealing with mapping to the guest of pages into the xen_blk_map_buf code. And we also move the vbd_translate to be done much earlier. 
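The point of the reordering is that every check that can fail cheaply (segment counts, sector bounds, the vbd translation) now runs before any grant pages are mapped, so the early failure paths can simply put an error response on the ring without ever having to unmap anything. A condensed stand-alone model of the resulting control flow; the helper names are invented stand-ins, not the driver's symbols:

    struct io_ctx { int nseg; };

    /* Invented stubs standing in for the real checks and grant mapping. */
    static int segments_ok(struct io_ctx *c)  { return c->nseg > 0; }
    static int translate_ok(struct io_ctx *c) { (void)c; return 1; }
    static int map_pages(struct io_ctx *c)    { (void)c; return 0; }
    static void unmap_pages(struct io_ctx *c) { (void)c; }
    static int submit_bios(struct io_ctx *c)  { (void)c; return 0; }

    static int dispatch(struct io_ctx *c)
    {
            /* 1. Validate while failure is still cheap: nothing mapped yet. */
            if (!segments_ok(c) || !translate_ok(c))
                    return -1;

            /* 2. Only now touch the grant table. */
            if (map_pages(c))
                    goto out_unmap;

            /* 3. Build and submit the bios. */
            if (submit_bios(c))
                    goto out_unmap;
            return 0;

    out_unmap:
            unmap_pages(c);
            return -1;
    }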
Signed-off-by: Konrad Rzeszutek Wilk --- drivers/xen/blkback/blkback.c | 131 ++++++++++++++++++---------------- 1 file changed, 71 insertions(+), 60 deletions(-) diff --git a/drivers/xen/blkback/blkback.c b/drivers/xen/blkback/blkback.c index f282463d7b5c..211b2005f963 100644 --- a/drivers/xen/blkback/blkback.c +++ b/drivers/xen/blkback/blkback.c @@ -241,6 +241,10 @@ int blkif_schedule(void *arg) return 0; } +struct seg_buf { + unsigned long buf; + unsigned int nsec; +}; /* * Unmap the grant references, and also remove the M2P over-rides * used in the 'pending_req'. @@ -278,6 +282,62 @@ static void fast_flush_area(struct pending_req *req) } } } +static int xen_blk_map_buf(struct blkif_request *req, struct pending_req *pending_req, + struct seg_buf seg[]) +{ + struct gnttab_map_grant_ref map[BLKIF_MAX_SEGMENTS_PER_REQUEST]; + int i; + int nseg = req->nr_segments; + int ret = 0; + /* Fill out preq.nr_sects with proper amount of sectors, and setup + * assign map[..] with the PFN of the page in our domain with the + * corresponding grant reference for each page. + */ + for (i = 0; i < nseg; i++) { + uint32_t flags; + + flags = GNTMAP_host_map; + if (pending_req->operation != BLKIF_OP_READ) + flags |= GNTMAP_readonly; + gnttab_set_map_op(&map[i], vaddr(pending_req, i), flags, + req->u.rw.seg[i].gref, pending_req->blkif->domid); + } + + ret = HYPERVISOR_grant_table_op(GNTTABOP_map_grant_ref, map, nseg); + BUG_ON(ret); + + /* Now swizzel the MFN in our domain with the MFN from the other domain + * so that when we access vaddr(pending_req,i) it has the contents of + * the page from the other domain. + */ + for (i = 0; i < nseg; i++) { + if (unlikely(map[i].status != 0)) { + DPRINTK("invalid buffer -- could not remap it\n"); + map[i].handle = BLKBACK_INVALID_HANDLE; + ret |= 1; + } + + pending_handle(pending_req, i) = map[i].handle; + + if (ret) + continue; + + ret = m2p_add_override(PFN_DOWN(map[i].dev_bus_addr), + blkbk->pending_page(pending_req, i), false); + if (ret) { + printk(KERN_ALERT "Failed to install M2P override for"\ + " %lx (ret: %d)\n", (unsigned long) + map[i].dev_bus_addr, ret); + /* We could switch over to GNTTABOP_copy */ + continue; + } + + seg[i].buf = map[i].dev_bus_addr | + (req->u.rw.seg[i].first_sect << 9); + } + return ret; +} + /* * Completion callback on the bio's. Called as bh->b_end_io() */ @@ -411,15 +471,12 @@ static void dispatch_rw_block_io(struct blkif_st *blkif, struct blkif_request *req, struct pending_req *pending_req) { - struct gnttab_map_grant_ref map[BLKIF_MAX_SEGMENTS_PER_REQUEST]; struct phys_req preq; - struct { - unsigned long buf; unsigned int nsec; - } seg[BLKIF_MAX_SEGMENTS_PER_REQUEST]; + struct seg_buf seg[BLKIF_MAX_SEGMENTS_PER_REQUEST]; unsigned int nseg; struct bio *bio = NULL; struct bio *biolist[BLKIF_MAX_SEGMENTS_PER_REQUEST]; - int ret, i, nbio = 0; + int i, nbio = 0; int operation; struct blk_plug plug; struct request_queue *q; @@ -444,6 +501,7 @@ static void dispatch_rw_block_io(struct blkif_st *blkif, if (unlikely(nseg == 0 && operation != WRITE_BARRIER) || unlikely(nseg > BLKIF_MAX_SEGMENTS_PER_REQUEST)) { DPRINTK("Bad number of segments in request (%d)\n", nseg); + /* Haven't submitted any bio's yet. */ goto fail_response; } @@ -456,76 +514,29 @@ static void dispatch_rw_block_io(struct blkif_st *blkif, pending_req->operation = req->operation; pending_req->status = BLKIF_RSP_OKAY; pending_req->nr_pages = nseg; - - /* Fill out preq.nr_sects with proper amount of sectors, and setup - * assign map[..] 
with the PFN of the page in our domain with the - * corresponding grant reference for each page. - */ for (i = 0; i < nseg; i++) { - uint32_t flags; - seg[i].nsec = req->u.rw.seg[i].last_sect - req->u.rw.seg[i].first_sect + 1; if ((req->u.rw.seg[i].last_sect >= (PAGE_SIZE >> 9)) || (req->u.rw.seg[i].last_sect < req->u.rw.seg[i].first_sect)) goto fail_response; preq.nr_sects += seg[i].nsec; - - flags = GNTMAP_host_map; - if (operation != READ) - flags |= GNTMAP_readonly; - gnttab_set_map_op(&map[i], vaddr(pending_req, i), flags, - req->u.rw.seg[i].gref, blkif->domid); } - ret = HYPERVISOR_grant_table_op(GNTTABOP_map_grant_ref, map, nseg); - BUG_ON(ret); - - /* Now swizzel the MFN in our domain with the MFN from the other domain - * so that when we access vaddr(pending_req,i) it has the contents of - * the page from the other domain. - */ - for (i = 0; i < nseg; i++) { - if (unlikely(map[i].status != 0)) { - DPRINTK("invalid buffer -- could not remap it\n"); - map[i].handle = BLKBACK_INVALID_HANDLE; - ret |= 1; - } - - pending_handle(pending_req, i) = map[i].handle; - - if (ret) - continue; - - ret = m2p_add_override(PFN_DOWN(map[i].dev_bus_addr), - blkbk->pending_page(pending_req, i), false); - if (ret) { - printk(KERN_ALERT "Failed to install M2P override for"\ - " %lx (ret: %d)\n", (unsigned long) - map[i].dev_bus_addr, ret); - /* We could switch over to GNTTABOP_copy */ - continue; - } - - seg[i].buf = map[i].dev_bus_addr | - (req->u.rw.seg[i].first_sect << 9); - } - - /* If we have failed at this point, we need to undo the M2P override, - * set gnttab_set_unmap_op on all of the grant references and perform - * the hypercall to unmap the grants - that is all done in - * fast_flush_area. - */ - if (ret) - goto fail_flush; - if (vbd_translate(&preq, blkif, operation) != 0) { DPRINTK("access denied: %s of [%llu,%llu] on dev=%04x\n", operation == READ ? "read" : "write", preq.sector_number, preq.sector_number + preq.nr_sects, preq.dev); - goto fail_flush; + goto fail_response; } + /* If we have failed at this point, we need to undo the M2P override, + * set gnttab_set_unmap_op on all of the grant references and perform + * the hypercall to unmap the grants - that is all done in + * fast_flush_area. + */ + if (xen_blk_map_buf(req, pending_req, seg)) + goto fail_flush; /* This corresponding blkif_put is done in __end_block_io_op */ blkif_get(blkif); From 976222e05ea5a9959ccf880d7a24efbf79b3c6cf Mon Sep 17 00:00:00 2001 From: Konrad Rzeszutek Wilk Date: Fri, 15 Apr 2011 11:38:29 -0400 Subject: [PATCH 036/110] xen/blkback: Move the check for misaligned I/O higher. We move it up higher to be in same loop that actually computes the sector number. This way, all of the code that deals with verifying that the request is correct is all done before we do any of the page mapping, I/O submission, etc. 
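The check being moved rejects any segment whose start sector or sector count is not a multiple of the backing device's logical block size, expressed in 512-byte sectors. A stand-alone sketch of the arithmetic, with illustrative values rather than anything taken from the driver:

    #include <stdio.h>

    /* Nonzero when sector/nsec are not multiples of the logical block
     * size; mirrors ((sector | nsec) & ((block_size >> 9) - 1)). */
    static int misaligned(unsigned int sector, unsigned int nsec,
                          unsigned int logical_block_size)
    {
            unsigned int mask = (logical_block_size >> 9) - 1;

            return (sector | nsec) & mask;
    }

    int main(void)
    {
            /* 4096-byte blocks: sector 8 is aligned, sector 3 is not. */
            printf("%d %d\n", misaligned(8, 8, 4096), misaligned(3, 8, 4096));
            return 0;
    }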
Signed-off-by: Konrad Rzeszutek Wilk --- drivers/xen/blkback/blkback.c | 14 +++++++------- 1 file changed, 7 insertions(+), 7 deletions(-) diff --git a/drivers/xen/blkback/blkback.c b/drivers/xen/blkback/blkback.c index 211b2005f963..9598e0fd0f9e 100644 --- a/drivers/xen/blkback/blkback.c +++ b/drivers/xen/blkback/blkback.c @@ -521,6 +521,13 @@ static void dispatch_rw_block_io(struct blkif_st *blkif, (req->u.rw.seg[i].last_sect < req->u.rw.seg[i].first_sect)) goto fail_response; preq.nr_sects += seg[i].nsec; + + if (((int)preq.sector_number|(int)seg[i].nsec) & + ((bdev_logical_block_size(preq.bdev) >> 9) - 1)) { + DPRINTK("Misaligned I/O request from domain %d", + blkif->domid); + goto fail_response; + } } if (vbd_translate(&preq, blkif, operation) != 0) { @@ -542,13 +549,6 @@ static void dispatch_rw_block_io(struct blkif_st *blkif, blkif_get(blkif); for (i = 0; i < nseg; i++) { - if (((int)preq.sector_number|(int)seg[i].nsec) & - ((bdev_logical_block_size(preq.bdev) >> 9) - 1)) { - DPRINTK("Misaligned I/O request from domain %d", - blkif->domid); - goto fail_put_bio; - } - while ((bio == NULL) || (bio_add_page(bio, blkbk->pending_page(pending_req, i), From 9f3aedf573dd034d59e7eb6c4ee97648d5be8fc6 Mon Sep 17 00:00:00 2001 From: Konrad Rzeszutek Wilk Date: Fri, 15 Apr 2011 11:50:34 -0400 Subject: [PATCH 037/110] xen/blkback: Change fast_flush_area to xen_blkbk_unmap, and tweak xen_blk_map_seg. The previous name ('fast_flush_area') had nothing to do with what it does right now. Changing the names so that the code dealing with mapping pages in and out of the guest is called xen_blkbk_[map|unmap]. Signed-off-by: Konrad Rzeszutek Wilk --- drivers/xen/blkback/blkback.c | 14 +++++++------- 1 file changed, 7 insertions(+), 7 deletions(-) diff --git a/drivers/xen/blkback/blkback.c b/drivers/xen/blkback/blkback.c index 9598e0fd0f9e..c645c83f900b 100644 --- a/drivers/xen/blkback/blkback.c +++ b/drivers/xen/blkback/blkback.c @@ -249,7 +249,7 @@ struct seg_buf { * Unmap the grant references, and also remove the M2P over-rides * used in the 'pending_req'. */ -static void fast_flush_area(struct pending_req *req) +static void xen_blkbk_unmap(struct pending_req *req) { struct gnttab_unmap_grant_ref unmap[BLKIF_MAX_SEGMENTS_PER_REQUEST]; unsigned int i, invcount = 0; @@ -282,8 +282,8 @@ static void fast_flush_area(struct pending_req *req) } } } -static int xen_blk_map_buf(struct blkif_request *req, struct pending_req *pending_req, - struct seg_buf seg[]) +static int xen_blkbk_map(struct blkif_request *req, struct pending_req *pending_req, + struct seg_buf seg[]) { struct gnttab_map_grant_ref map[BLKIF_MAX_SEGMENTS_PER_REQUEST]; int i; @@ -361,7 +361,7 @@ static void __end_block_io_op(struct pending_req *pending_req, int error) * the proper response on the ring. */ if (atomic_dec_and_test(&pending_req->pendcnt)) { - fast_flush_area(pending_req); + xen_blkbk_unmap(pending_req); make_response(pending_req->blkif, pending_req->id, pending_req->operation, pending_req->status); blkif_put(pending_req->blkif); @@ -540,9 +540,9 @@ static void dispatch_rw_block_io(struct blkif_st *blkif, /* If we have failed at this point, we need to undo the M2P override, * set gnttab_set_unmap_op on all of the grant references and perform * the hypercall to unmap the grants - that is all done in - * fast_flush_area. + * xen_blkbk_unmap. 
*/ - if (xen_blk_map_buf(req, pending_req, seg)) + if (xen_blkbk_map(req, pending_req, seg)) goto fail_flush; /* This corresponding blkif_put is done in __end_block_io_op */ @@ -606,7 +606,7 @@ static void dispatch_rw_block_io(struct blkif_st *blkif, return; fail_flush: - fast_flush_area(pending_req); + xen_blkbk_unmap(pending_req); fail_response: /* Haven't submitted any bio's yet. */ make_response(blkif, req->id, req->operation, BLKIF_RSP_ERROR); From e93504933ee6982bdc005fa5c24e1ea330faaf8b Mon Sep 17 00:00:00 2001 From: Konrad Rzeszutek Wilk Date: Mon, 18 Apr 2011 11:34:55 -0400 Subject: [PATCH 038/110] xen/blkback: Move the check for misaligned I/O once more. The commit 976222e05ea5a9959ccf880d7a24efbf79b3c6cf xen/blkback: Move the check for misaligned I/O higher. moved it a bit to high. The preq->vbdev was not set, so the check for misaligned I/O would cause a NULL pointer derefence. Signed-off-by: Konrad Rzeszutek Wilk --- drivers/xen/blkback/blkback.c | 17 +++++++++++------ 1 file changed, 11 insertions(+), 6 deletions(-) diff --git a/drivers/xen/blkback/blkback.c b/drivers/xen/blkback/blkback.c index c645c83f900b..a0d3227955c9 100644 --- a/drivers/xen/blkback/blkback.c +++ b/drivers/xen/blkback/blkback.c @@ -514,6 +514,7 @@ static void dispatch_rw_block_io(struct blkif_st *blkif, pending_req->operation = req->operation; pending_req->status = BLKIF_RSP_OKAY; pending_req->nr_pages = nseg; + for (i = 0; i < nseg; i++) { seg[i].nsec = req->u.rw.seg[i].last_sect - req->u.rw.seg[i].first_sect + 1; @@ -522,12 +523,6 @@ static void dispatch_rw_block_io(struct blkif_st *blkif, goto fail_response; preq.nr_sects += seg[i].nsec; - if (((int)preq.sector_number|(int)seg[i].nsec) & - ((bdev_logical_block_size(preq.bdev) >> 9) - 1)) { - DPRINTK("Misaligned I/O request from domain %d", - blkif->domid); - goto fail_response; - } } if (vbd_translate(&preq, blkif, operation) != 0) { @@ -537,6 +532,16 @@ static void dispatch_rw_block_io(struct blkif_st *blkif, preq.sector_number + preq.nr_sects, preq.dev); goto fail_response; } + /* This check _MUST_ be done after vbd_translate as the preq.bdev + * is set there. */ + for (i = 0; i < nseg; i++) { + if (((int)preq.sector_number|(int)seg[i].nsec) & + ((bdev_logical_block_size(preq.bdev) >> 9) - 1)) { + DPRINTK("Misaligned I/O request from domain %d", + blkif->domid); + goto fail_response; + } + } /* If we have failed at this point, we need to undo the M2P override, * set gnttab_set_unmap_op on all of the grant references and perform * the hypercall to unmap the grants - that is all done in From 6fd17b5643bf05c29fc226a5aee96328056fca10 Mon Sep 17 00:00:00 2001 From: Konrad Rzeszutek Wilk Date: Mon, 18 Apr 2011 12:04:17 -0400 Subject: [PATCH 039/110] xen/blkback: Get the 'requeust_queue' properly. After the commit 0faa8cca883bbc6a0919e3c89128672659b75820 (" xen/blkback: remove per-queue plugging") we forgot to retrieve the 'struct request_queue' from the block device. This puts the functionality back in and fixes a NULL pointer bug. 
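For context, the plugging rework in the preceding patches converges on batching all bio submissions inside an on-stack plug. A condensed fragment of that pattern as it appears in the driver after the conversion, assuming the 2.6.39-era blk_start_plug()/blk_finish_plug() API and the old submit_bio(rw, bio) form; 'nbio', 'biolist' and 'operation' are the function's own locals:

    struct blk_plug plug;
    int i;

    /* One pending reference per bio; dropped in the completion handler. */
    atomic_set(&pending_req->pendcnt, nbio);

    blk_start_plug(&plug);
    for (i = 0; i < nbio; i++)
            submit_bio(operation, biolist[i]);
    blk_finish_plug(&plug);         /* let the I/Os go */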
Signed-off-by: Konrad Rzeszutek Wilk --- drivers/xen/blkback/blkback.c | 3 +++ 1 file changed, 3 insertions(+) diff --git a/drivers/xen/blkback/blkback.c b/drivers/xen/blkback/blkback.c index a0d3227955c9..3751325bfc32 100644 --- a/drivers/xen/blkback/blkback.c +++ b/drivers/xen/blkback/blkback.c @@ -542,6 +542,9 @@ static void dispatch_rw_block_io(struct blkif_st *blkif, goto fail_response; } } + q = bdev_get_queue(preq.bdev); + if (!q) + goto fail_response; /* If we have failed at this point, we need to undo the M2P override, * set gnttab_set_unmap_op on all of the grant references and perform * the hypercall to unmap the grants - that is all done in From d2436eda2e81f1993bfe6349f17f52503bffeff5 Mon Sep 17 00:00:00 2001 From: Konrad Rzeszutek Wilk Date: Mon, 18 Apr 2011 14:17:49 -0400 Subject: [PATCH 040/110] block, xen/blkback: remove blk_[get|put]_queue calls. They were used to check if the queue does not have QUEUE_FLAG_DEAD set. That is not necessary anymore as the 'submit_io' call ends up doing that for us. Signed-off-by: Konrad Rzeszutek Wilk --- block/blk-core.c | 2 -- drivers/xen/blkback/blkback.c | 6 ------ 2 files changed, 8 deletions(-) diff --git a/block/blk-core.c b/block/blk-core.c index 9b60e69a5400..90f22cc30799 100644 --- a/block/blk-core.c +++ b/block/blk-core.c @@ -351,7 +351,6 @@ void blk_put_queue(struct request_queue *q) { kobject_put(&q->kobj); } -EXPORT_SYMBOL_GPL(blk_put_queue); /* * Note: If a driver supplied the queue lock, it should not zap that lock @@ -573,7 +572,6 @@ int blk_get_queue(struct request_queue *q) return 1; } -EXPORT_SYMBOL_GPL(blk_get_queue); static inline void blk_free_request(struct request_queue *q, struct request *rq) { diff --git a/drivers/xen/blkback/blkback.c b/drivers/xen/blkback/blkback.c index 3751325bfc32..59a2bae0f35e 100644 --- a/drivers/xen/blkback/blkback.c +++ b/drivers/xen/blkback/blkback.c @@ -479,7 +479,6 @@ static void dispatch_rw_block_io(struct blkif_st *blkif, int i, nbio = 0; int operation; struct blk_plug plug; - struct request_queue *q; switch (req->operation) { case BLKIF_OP_READ: @@ -542,9 +541,6 @@ static void dispatch_rw_block_io(struct blkif_st *blkif, goto fail_response; } } - q = bdev_get_queue(preq.bdev); - if (!q) - goto fail_response; /* If we have failed at this point, we need to undo the M2P override, * set gnttab_set_unmap_op on all of the grant references and perform * the hypercall to unmap the grants - that is all done in @@ -596,7 +592,6 @@ static void dispatch_rw_block_io(struct blkif_st *blkif, atomic_set(&pending_req->pendcnt, nbio); /* Get a reference count for the disk queue and start sending I/O */ - blk_get_queue(q); blk_start_plug(&plug); for (i = 0; i < nbio; i++) @@ -604,7 +599,6 @@ static void dispatch_rw_block_io(struct blkif_st *blkif, blk_finish_plug(&plug); /* Let the I/Os go.. */ - blk_put_queue(q); if (operation == READ) blkif->st_rd_sect += preq.nr_sects; From dfc07b13dcacefda6ebdea14584ed8724dc980ef Mon Sep 17 00:00:00 2001 From: Konrad Rzeszutek Wilk Date: Mon, 18 Apr 2011 14:24:23 -0400 Subject: [PATCH 041/110] xen/blkback: Move it from drivers/xen to drivers/block .. and modify the Makefile and Kconfig files appropriately. 
Signed-off-by: Konrad Rzeszutek Wilk --- drivers/block/Kconfig | 8 ++++++++ drivers/block/Makefile | 1 + drivers/{xen/blkback => block/xen-blkback}/Makefile | 0 drivers/{xen/blkback => block/xen-blkback}/blkback.c | 0 drivers/{xen/blkback => block/xen-blkback}/common.h | 0 drivers/{xen/blkback => block/xen-blkback}/interface.c | 0 drivers/{xen/blkback => block/xen-blkback}/vbd.c | 0 drivers/{xen/blkback => block/xen-blkback}/xenbus.c | 0 drivers/xen/Kconfig | 8 -------- drivers/xen/Makefile | 1 - 10 files changed, 9 insertions(+), 9 deletions(-) rename drivers/{xen/blkback => block/xen-blkback}/Makefile (100%) rename drivers/{xen/blkback => block/xen-blkback}/blkback.c (100%) rename drivers/{xen/blkback => block/xen-blkback}/common.h (100%) rename drivers/{xen/blkback => block/xen-blkback}/interface.c (100%) rename drivers/{xen/blkback => block/xen-blkback}/vbd.c (100%) rename drivers/{xen/blkback => block/xen-blkback}/xenbus.c (100%) diff --git a/drivers/block/Kconfig b/drivers/block/Kconfig index 83c32cb72582..9abb64689712 100644 --- a/drivers/block/Kconfig +++ b/drivers/block/Kconfig @@ -470,6 +470,14 @@ config XEN_BLKDEV_FRONTEND block device driver. It communicates with a back-end driver in another domain which drives the actual block device. +config XEN_BLKDEV_BACKEND + tristate "Block-device backend driver" + depends on XEN_BACKEND + help + The block-device backend driver allows the kernel to export its + block devices to other guests via a high-performance shared-memory + interface. + config VIRTIO_BLK tristate "Virtio block driver (EXPERIMENTAL)" depends on EXPERIMENTAL && VIRTIO diff --git a/drivers/block/Makefile b/drivers/block/Makefile index 40528ba56d1b..76646e9a1c91 100644 --- a/drivers/block/Makefile +++ b/drivers/block/Makefile @@ -36,6 +36,7 @@ obj-$(CONFIG_BLK_DEV_UB) += ub.o obj-$(CONFIG_BLK_DEV_HD) += hd.o obj-$(CONFIG_XEN_BLKDEV_FRONTEND) += xen-blkfront.o +obj-$(CONFIG_XEN_BLKDEV_BACKEND) += xen-blkback/ obj-$(CONFIG_BLK_DEV_DRBD) += drbd/ obj-$(CONFIG_BLK_DEV_RBD) += rbd.o diff --git a/drivers/xen/blkback/Makefile b/drivers/block/xen-blkback/Makefile similarity index 100% rename from drivers/xen/blkback/Makefile rename to drivers/block/xen-blkback/Makefile diff --git a/drivers/xen/blkback/blkback.c b/drivers/block/xen-blkback/blkback.c similarity index 100% rename from drivers/xen/blkback/blkback.c rename to drivers/block/xen-blkback/blkback.c diff --git a/drivers/xen/blkback/common.h b/drivers/block/xen-blkback/common.h similarity index 100% rename from drivers/xen/blkback/common.h rename to drivers/block/xen-blkback/common.h diff --git a/drivers/xen/blkback/interface.c b/drivers/block/xen-blkback/interface.c similarity index 100% rename from drivers/xen/blkback/interface.c rename to drivers/block/xen-blkback/interface.c diff --git a/drivers/xen/blkback/vbd.c b/drivers/block/xen-blkback/vbd.c similarity index 100% rename from drivers/xen/blkback/vbd.c rename to drivers/block/xen-blkback/vbd.c diff --git a/drivers/xen/blkback/xenbus.c b/drivers/block/xen-blkback/xenbus.c similarity index 100% rename from drivers/xen/blkback/xenbus.c rename to drivers/block/xen-blkback/xenbus.c diff --git a/drivers/xen/Kconfig b/drivers/xen/Kconfig index fb1af628cbfc..a59638b37c1a 100644 --- a/drivers/xen/Kconfig +++ b/drivers/xen/Kconfig @@ -37,14 +37,6 @@ config XEN_BACKEND Support for backend device drivers that provide I/O services to other virtual machines. 
-config XEN_BLKDEV_BACKEND - tristate "Block-device backend driver" - depends on XEN_BACKEND && BLOCK - help - The block-device backend driver allows the kernel to export its - block devices to other guests via a high-performance shared-memory - interface. - config XENFS tristate "Xen filesystem" default y diff --git a/drivers/xen/Makefile b/drivers/xen/Makefile index 29c0a416f082..f420f1ff7f13 100644 --- a/drivers/xen/Makefile +++ b/drivers/xen/Makefile @@ -11,7 +11,6 @@ obj-$(CONFIG_XEN_BALLOON) += xen-balloon.o obj-$(CONFIG_XEN_DEV_EVTCHN) += xen-evtchn.o obj-$(CONFIG_XEN_GNTDEV) += xen-gntdev.o obj-$(CONFIG_XEN_GRANT_DEV_ALLOC) += xen-gntalloc.o -obj-$(CONFIG_XEN_BLKDEV_BACKEND) += blkback/ obj-$(CONFIG_XENFS) += xenfs/ obj-$(CONFIG_XEN_SYS_HYPERVISOR) += sys-hypervisor.o obj-$(CONFIG_XEN_PLATFORM_PCI) += xen-platform-pci.o From ee9ff8537eacb4383bf9146df6c21b9301c9baa2 Mon Sep 17 00:00:00 2001 From: Konrad Rzeszutek Wilk Date: Wed, 20 Apr 2011 10:57:29 -0400 Subject: [PATCH 042/110] xen/blkback: Squash vbd.c,interface.c in blkback.c and xenbus.c respectivly. Daniel Stodden suggested to eliminate vbd.c and interface.c, inlining the critical bits where they belong, respectively. Leaving only blkback.c for the data- and xenbus.c for the control path. Suggested-by: Daniel Stodden Signed-off-by: Konrad Rzeszutek Wilk --- drivers/block/xen-blkback/Makefile | 2 +- drivers/block/xen-blkback/blkback.c | 135 +++++++++++++++++++ drivers/block/xen-blkback/interface.c | 185 -------------------------- drivers/block/xen-blkback/vbd.c | 162 ---------------------- drivers/block/xen-blkback/xenbus.c | 151 +++++++++++++++++++++ 5 files changed, 287 insertions(+), 348 deletions(-) delete mode 100644 drivers/block/xen-blkback/interface.c delete mode 100644 drivers/block/xen-blkback/vbd.c diff --git a/drivers/block/xen-blkback/Makefile b/drivers/block/xen-blkback/Makefile index f1ae1ff07a4d..e491c1b76878 100644 --- a/drivers/block/xen-blkback/Makefile +++ b/drivers/block/xen-blkback/Makefile @@ -1,3 +1,3 @@ obj-$(CONFIG_XEN_BLKDEV_BACKEND) := xen-blkback.o -xen-blkback-y := blkback.o xenbus.o interface.o vbd.o +xen-blkback-y := blkback.o xenbus.o diff --git a/drivers/block/xen-blkback/blkback.c b/drivers/block/xen-blkback/blkback.c index 59a2bae0f35e..63001fac9af2 100644 --- a/drivers/block/xen-blkback/blkback.c +++ b/drivers/block/xen-blkback/blkback.c @@ -166,6 +166,141 @@ static void free_req(struct pending_req *req) wake_up(&blkbk->pending_free_wq); } +/* + * Routines for managing virtual block devices (vbds). + */ + +#define vbd_sz(_v) ((_v)->bdev->bd_part ? \ + (_v)->bdev->bd_part->nr_sects : \ + get_capacity((_v)->bdev->bd_disk)) + +unsigned long long vbd_size(struct vbd *vbd) +{ + return vbd_sz(vbd); +} + +unsigned int vbd_info(struct vbd *vbd) +{ + return vbd->type | (vbd->readonly ? VDISK_READONLY : 0); +} + +unsigned long vbd_secsize(struct vbd *vbd) +{ + return bdev_logical_block_size(vbd->bdev); +} + +int vbd_create(struct blkif_st *blkif, blkif_vdev_t handle, unsigned major, + unsigned minor, int readonly, int cdrom) +{ + struct vbd *vbd; + struct block_device *bdev; + + vbd = &blkif->vbd; + vbd->handle = handle; + vbd->readonly = readonly; + vbd->type = 0; + + vbd->pdevice = MKDEV(major, minor); + + bdev = blkdev_get_by_dev(vbd->pdevice, vbd->readonly ? 
+ FMODE_READ : FMODE_WRITE, NULL); + + if (IS_ERR(bdev)) { + DPRINTK("vbd_creat: device %08x could not be opened.\n", + vbd->pdevice); + return -ENOENT; + } + + vbd->bdev = bdev; + vbd->size = vbd_size(vbd); + + if (vbd->bdev->bd_disk == NULL) { + DPRINTK("vbd_creat: device %08x doesn't exist.\n", + vbd->pdevice); + vbd_free(vbd); + return -ENOENT; + } + + if (vbd->bdev->bd_disk->flags & GENHD_FL_CD || cdrom) + vbd->type |= VDISK_CDROM; + if (vbd->bdev->bd_disk->flags & GENHD_FL_REMOVABLE) + vbd->type |= VDISK_REMOVABLE; + + DPRINTK("Successful creation of handle=%04x (dom=%u)\n", + handle, blkif->domid); + return 0; +} + +void vbd_free(struct vbd *vbd) +{ + if (vbd->bdev) + blkdev_put(vbd->bdev, vbd->readonly ? FMODE_READ : FMODE_WRITE); + vbd->bdev = NULL; +} + +int vbd_translate(struct phys_req *req, struct blkif_st *blkif, int operation) +{ + struct vbd *vbd = &blkif->vbd; + int rc = -EACCES; + + if ((operation != READ) && vbd->readonly) + goto out; + + if (unlikely((req->sector_number + req->nr_sects) > vbd_sz(vbd))) + goto out; + + req->dev = vbd->pdevice; + req->bdev = vbd->bdev; + rc = 0; + + out: + return rc; +} + +void vbd_resize(struct blkif_st *blkif) +{ + struct vbd *vbd = &blkif->vbd; + struct xenbus_transaction xbt; + int err; + struct xenbus_device *dev = blkback_xenbus(blkif->be); + unsigned long long new_size = vbd_size(vbd); + + printk(KERN_INFO "VBD Resize: Domid: %d, Device: (%d, %d)\n", + blkif->domid, MAJOR(vbd->pdevice), MINOR(vbd->pdevice)); + printk(KERN_INFO "VBD Resize: new size %llu\n", new_size); + vbd->size = new_size; +again: + err = xenbus_transaction_start(&xbt); + if (err) { + printk(KERN_WARNING "Error starting transaction"); + return; + } + err = xenbus_printf(xbt, dev->nodename, "sectors", "%llu", + vbd_size(vbd)); + if (err) { + printk(KERN_WARNING "Error writing new size"); + goto abort; + } + /* + * Write the current state; we will use this to synchronize + * the front-end. If the current state is "connected" the + * front-end will get the new size information online. + */ + err = xenbus_printf(xbt, dev->nodename, "state", "%d", dev->state); + if (err) { + printk(KERN_WARNING "Error writing the state"); + goto abort; + } + + err = xenbus_transaction_end(xbt, 0); + if (err == -EAGAIN) + goto again; + if (err) + printk(KERN_WARNING "Error ending transaction"); +abort: + xenbus_transaction_end(xbt, 1); +} + /* * Notification from the guest OS. */ diff --git a/drivers/block/xen-blkback/interface.c b/drivers/block/xen-blkback/interface.c deleted file mode 100644 index 163aed41e825..000000000000 --- a/drivers/block/xen-blkback/interface.c +++ /dev/null @@ -1,185 +0,0 @@ -/****************************************************************************** - * Block-device interface management. 
- * - * Copyright (c) 2004, Keir Fraser - * - * This program is free software; you can redistribute it and/or - * modify it under the terms of the GNU General Public License version 2 - * as published by the Free Software Foundation; or, when distributed - * separately from the Linux kernel or incorporated into other - * software packages, subject to the following license: - * - * Permission is hereby granted, free of charge, to any person obtaining a copy - * of this source file (the "Software"), to deal in the Software without - * restriction, including without limitation the rights to use, copy, modify, - * merge, publish, distribute, sublicense, and/or sell copies of the Software, - * and to permit persons to whom the Software is furnished to do so, subject to - * the following conditions: - * - * The above copyright notice and this permission notice shall be included in - * all copies or substantial portions of the Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR - * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE - * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER - * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING - * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS - * IN THE SOFTWARE. - */ - -#include "common.h" -#include -#include -#include - -static struct kmem_cache *blkif_cachep; - -struct blkif_st *blkif_alloc(domid_t domid) -{ - struct blkif_st *blkif; - - blkif = kmem_cache_alloc(blkif_cachep, GFP_KERNEL); - if (!blkif) - return ERR_PTR(-ENOMEM); - - memset(blkif, 0, sizeof(*blkif)); - blkif->domid = domid; - spin_lock_init(&blkif->blk_ring_lock); - atomic_set(&blkif->refcnt, 1); - init_waitqueue_head(&blkif->wq); - blkif->st_print = jiffies; - init_waitqueue_head(&blkif->waiting_to_free); - - return blkif; -} - -static int map_frontend_page(struct blkif_st *blkif, unsigned long shared_page) -{ - struct gnttab_map_grant_ref op; - - gnttab_set_map_op(&op, (unsigned long)blkif->blk_ring_area->addr, - GNTMAP_host_map, shared_page, blkif->domid); - - if (HYPERVISOR_grant_table_op(GNTTABOP_map_grant_ref, &op, 1)) - BUG(); - - if (op.status) { - DPRINTK(" Grant table operation failure !\n"); - return op.status; - } - - blkif->shmem_ref = shared_page; - blkif->shmem_handle = op.handle; - - return 0; -} - -static void unmap_frontend_page(struct blkif_st *blkif) -{ - struct gnttab_unmap_grant_ref op; - - gnttab_set_unmap_op(&op, (unsigned long)blkif->blk_ring_area->addr, - GNTMAP_host_map, blkif->shmem_handle); - - if (HYPERVISOR_grant_table_op(GNTTABOP_unmap_grant_ref, &op, 1)) - BUG(); -} - -int blkif_map(struct blkif_st *blkif, unsigned long shared_page, - unsigned int evtchn) -{ - int err; - - /* Already connected through? 
*/ - if (blkif->irq) - return 0; - - blkif->blk_ring_area = alloc_vm_area(PAGE_SIZE); - if (!blkif->blk_ring_area) - return -ENOMEM; - - err = map_frontend_page(blkif, shared_page); - if (err) { - free_vm_area(blkif->blk_ring_area); - return err; - } - - switch (blkif->blk_protocol) { - case BLKIF_PROTOCOL_NATIVE: - { - struct blkif_sring *sring; - sring = (struct blkif_sring *)blkif->blk_ring_area->addr; - BACK_RING_INIT(&blkif->blk_rings.native, sring, PAGE_SIZE); - break; - } - case BLKIF_PROTOCOL_X86_32: - { - struct blkif_x86_32_sring *sring_x86_32; - sring_x86_32 = (struct blkif_x86_32_sring *)blkif->blk_ring_area->addr; - BACK_RING_INIT(&blkif->blk_rings.x86_32, sring_x86_32, PAGE_SIZE); - break; - } - case BLKIF_PROTOCOL_X86_64: - { - struct blkif_x86_64_sring *sring_x86_64; - sring_x86_64 = (struct blkif_x86_64_sring *)blkif->blk_ring_area->addr; - BACK_RING_INIT(&blkif->blk_rings.x86_64, sring_x86_64, PAGE_SIZE); - break; - } - default: - BUG(); - } - - err = bind_interdomain_evtchn_to_irqhandler( - blkif->domid, evtchn, blkif_be_int, 0, "blkif-backend", blkif); - if (err < 0) { - unmap_frontend_page(blkif); - free_vm_area(blkif->blk_ring_area); - blkif->blk_rings.common.sring = NULL; - return err; - } - blkif->irq = err; - - return 0; -} - -void blkif_disconnect(struct blkif_st *blkif) -{ - if (blkif->xenblkd) { - kthread_stop(blkif->xenblkd); - blkif->xenblkd = NULL; - } - - atomic_dec(&blkif->refcnt); - wait_event(blkif->waiting_to_free, atomic_read(&blkif->refcnt) == 0); - atomic_inc(&blkif->refcnt); - - if (blkif->irq) { - unbind_from_irqhandler(blkif->irq, blkif); - blkif->irq = 0; - } - - if (blkif->blk_rings.common.sring) { - unmap_frontend_page(blkif); - free_vm_area(blkif->blk_ring_area); - blkif->blk_rings.common.sring = NULL; - } -} - -void blkif_free(struct blkif_st *blkif) -{ - if (!atomic_dec_and_test(&blkif->refcnt)) - BUG(); - kmem_cache_free(blkif_cachep, blkif); -} - -int __init blkif_interface_init(void) -{ - blkif_cachep = kmem_cache_create("blkif_cache", sizeof(struct blkif_st), - 0, 0, NULL); - if (!blkif_cachep) - return -ENOMEM; - - return 0; -} diff --git a/drivers/block/xen-blkback/vbd.c b/drivers/block/xen-blkback/vbd.c deleted file mode 100644 index d0ff4cf91a34..000000000000 --- a/drivers/block/xen-blkback/vbd.c +++ /dev/null @@ -1,162 +0,0 @@ -/****************************************************************************** - * Routines for managing virtual block devices (VBDs). - * - * Copyright (c) 2003-2005, Keir Fraser & Steve Hand - * - * This program is free software; you can redistribute it and/or - * modify it under the terms of the GNU General Public License version 2 - * as published by the Free Software Foundation; or, when distributed - * separately from the Linux kernel or incorporated into other - * software packages, subject to the following license: - * - * Permission is hereby granted, free of charge, to any person obtaining a copy - * of this source file (the "Software"), to deal in the Software without - * restriction, including without limitation the rights to use, copy, modify, - * merge, publish, distribute, sublicense, and/or sell copies of the Software, - * and to permit persons to whom the Software is furnished to do so, subject to - * the following conditions: - * - * The above copyright notice and this permission notice shall be included in - * all copies or substantial portions of the Software. 
- * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR - * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE - * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER - * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING - * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS - * IN THE SOFTWARE. - */ - -#include "common.h" - -#define vbd_sz(_v) ((_v)->bdev->bd_part ? \ - (_v)->bdev->bd_part->nr_sects : \ - get_capacity((_v)->bdev->bd_disk)) - -unsigned long long vbd_size(struct vbd *vbd) -{ - return vbd_sz(vbd); -} - -unsigned int vbd_info(struct vbd *vbd) -{ - return vbd->type | (vbd->readonly ? VDISK_READONLY : 0); -} - -unsigned long vbd_secsize(struct vbd *vbd) -{ - return bdev_logical_block_size(vbd->bdev); -} - -int vbd_create(struct blkif_st *blkif, blkif_vdev_t handle, unsigned major, - unsigned minor, int readonly, int cdrom) -{ - struct vbd *vbd; - struct block_device *bdev; - - vbd = &blkif->vbd; - vbd->handle = handle; - vbd->readonly = readonly; - vbd->type = 0; - - vbd->pdevice = MKDEV(major, minor); - - bdev = blkdev_get_by_dev(vbd->pdevice, vbd->readonly ? - FMODE_READ : FMODE_WRITE, NULL); - - if (IS_ERR(bdev)) { - DPRINTK("vbd_creat: device %08x could not be opened.\n", - vbd->pdevice); - return -ENOENT; - } - - vbd->bdev = bdev; - vbd->size = vbd_size(vbd); - - if (vbd->bdev->bd_disk == NULL) { - DPRINTK("vbd_creat: device %08x doesn't exist.\n", - vbd->pdevice); - vbd_free(vbd); - return -ENOENT; - } - - if (vbd->bdev->bd_disk->flags & GENHD_FL_CD || cdrom) - vbd->type |= VDISK_CDROM; - if (vbd->bdev->bd_disk->flags & GENHD_FL_REMOVABLE) - vbd->type |= VDISK_REMOVABLE; - - DPRINTK("Successful creation of handle=%04x (dom=%u)\n", - handle, blkif->domid); - return 0; -} - -void vbd_free(struct vbd *vbd) -{ - if (vbd->bdev) - blkdev_put(vbd->bdev, vbd->readonly ? FMODE_READ : FMODE_WRITE); - vbd->bdev = NULL; -} - -int vbd_translate(struct phys_req *req, struct blkif_st *blkif, int operation) -{ - struct vbd *vbd = &blkif->vbd; - int rc = -EACCES; - - if ((operation != READ) && vbd->readonly) - goto out; - - if (unlikely((req->sector_number + req->nr_sects) > vbd_sz(vbd))) - goto out; - - req->dev = vbd->pdevice; - req->bdev = vbd->bdev; - rc = 0; - - out: - return rc; -} - -void vbd_resize(struct blkif_st *blkif) -{ - struct vbd *vbd = &blkif->vbd; - struct xenbus_transaction xbt; - int err; - struct xenbus_device *dev = blkback_xenbus(blkif->be); - unsigned long long new_size = vbd_size(vbd); - - printk(KERN_INFO "VBD Resize: Domid: %d, Device: (%d, %d)\n", - blkif->domid, MAJOR(vbd->pdevice), MINOR(vbd->pdevice)); - printk(KERN_INFO "VBD Resize: new size %llu\n", new_size); - vbd->size = new_size; -again: - err = xenbus_transaction_start(&xbt); - if (err) { - printk(KERN_WARNING "Error starting transaction"); - return; - } - err = xenbus_printf(xbt, dev->nodename, "sectors", "%llu", - vbd_size(vbd)); - if (err) { - printk(KERN_WARNING "Error writing new size"); - goto abort; - } - /* - * Write the current state; we will use this to synchronize - * the front-end. If the current state is "connected" the - * front-end will get the new size information online. 
- */ - err = xenbus_printf(xbt, dev->nodename, "state", "%d", dev->state); - if (err) { - printk(KERN_WARNING "Error writing the state"); - goto abort; - } - - err = xenbus_transaction_end(xbt, 0); - if (err == -EAGAIN) - goto again; - if (err) - printk(KERN_WARNING "Error ending transaction"); -abort: - xenbus_transaction_end(xbt, 1); -} diff --git a/drivers/block/xen-blkback/xenbus.c b/drivers/block/xen-blkback/xenbus.c index b41ed65db2d3..0c263a248007 100644 --- a/drivers/block/xen-blkback/xenbus.c +++ b/drivers/block/xen-blkback/xenbus.c @@ -20,6 +20,8 @@ #include #include #include +#include +#include #include "common.h" #undef DPRINTK @@ -36,6 +38,7 @@ struct backend_info { char *mode; }; +static struct kmem_cache *blkif_cachep; static void connect(struct backend_info *); static int connect_ring(struct backend_info *); static void backend_changed(struct xenbus_watch *, const char **, @@ -106,6 +109,154 @@ static void update_blkif_status(struct blkif_st *blkif) } } +struct blkif_st *blkif_alloc(domid_t domid) +{ + struct blkif_st *blkif; + + blkif = kmem_cache_alloc(blkif_cachep, GFP_KERNEL); + if (!blkif) + return ERR_PTR(-ENOMEM); + + memset(blkif, 0, sizeof(*blkif)); + blkif->domid = domid; + spin_lock_init(&blkif->blk_ring_lock); + atomic_set(&blkif->refcnt, 1); + init_waitqueue_head(&blkif->wq); + blkif->st_print = jiffies; + init_waitqueue_head(&blkif->waiting_to_free); + + return blkif; +} + +static int map_frontend_page(struct blkif_st *blkif, unsigned long shared_page) +{ + struct gnttab_map_grant_ref op; + + gnttab_set_map_op(&op, (unsigned long)blkif->blk_ring_area->addr, + GNTMAP_host_map, shared_page, blkif->domid); + + if (HYPERVISOR_grant_table_op(GNTTABOP_map_grant_ref, &op, 1)) + BUG(); + + if (op.status) { + DPRINTK(" Grant table operation failure !\n"); + return op.status; + } + + blkif->shmem_ref = shared_page; + blkif->shmem_handle = op.handle; + + return 0; +} + +static void unmap_frontend_page(struct blkif_st *blkif) +{ + struct gnttab_unmap_grant_ref op; + + gnttab_set_unmap_op(&op, (unsigned long)blkif->blk_ring_area->addr, + GNTMAP_host_map, blkif->shmem_handle); + + if (HYPERVISOR_grant_table_op(GNTTABOP_unmap_grant_ref, &op, 1)) + BUG(); +} + +int blkif_map(struct blkif_st *blkif, unsigned long shared_page, + unsigned int evtchn) +{ + int err; + + /* Already connected through? 
*/ + if (blkif->irq) + return 0; + + blkif->blk_ring_area = alloc_vm_area(PAGE_SIZE); + if (!blkif->blk_ring_area) + return -ENOMEM; + + err = map_frontend_page(blkif, shared_page); + if (err) { + free_vm_area(blkif->blk_ring_area); + return err; + } + + switch (blkif->blk_protocol) { + case BLKIF_PROTOCOL_NATIVE: + { + struct blkif_sring *sring; + sring = (struct blkif_sring *)blkif->blk_ring_area->addr; + BACK_RING_INIT(&blkif->blk_rings.native, sring, PAGE_SIZE); + break; + } + case BLKIF_PROTOCOL_X86_32: + { + struct blkif_x86_32_sring *sring_x86_32; + sring_x86_32 = (struct blkif_x86_32_sring *)blkif->blk_ring_area->addr; + BACK_RING_INIT(&blkif->blk_rings.x86_32, sring_x86_32, PAGE_SIZE); + break; + } + case BLKIF_PROTOCOL_X86_64: + { + struct blkif_x86_64_sring *sring_x86_64; + sring_x86_64 = (struct blkif_x86_64_sring *)blkif->blk_ring_area->addr; + BACK_RING_INIT(&blkif->blk_rings.x86_64, sring_x86_64, PAGE_SIZE); + break; + } + default: + BUG(); + } + + err = bind_interdomain_evtchn_to_irqhandler( + blkif->domid, evtchn, blkif_be_int, 0, "blkif-backend", blkif); + if (err < 0) { + unmap_frontend_page(blkif); + free_vm_area(blkif->blk_ring_area); + blkif->blk_rings.common.sring = NULL; + return err; + } + blkif->irq = err; + + return 0; +} + +void blkif_disconnect(struct blkif_st *blkif) +{ + if (blkif->xenblkd) { + kthread_stop(blkif->xenblkd); + blkif->xenblkd = NULL; + } + + atomic_dec(&blkif->refcnt); + wait_event(blkif->waiting_to_free, atomic_read(&blkif->refcnt) == 0); + atomic_inc(&blkif->refcnt); + + if (blkif->irq) { + unbind_from_irqhandler(blkif->irq, blkif); + blkif->irq = 0; + } + + if (blkif->blk_rings.common.sring) { + unmap_frontend_page(blkif); + free_vm_area(blkif->blk_ring_area); + blkif->blk_rings.common.sring = NULL; + } +} + +void blkif_free(struct blkif_st *blkif) +{ + if (!atomic_dec_and_test(&blkif->refcnt)) + BUG(); + kmem_cache_free(blkif_cachep, blkif); +} + +int __init blkif_interface_init(void) +{ + blkif_cachep = kmem_cache_create("blkif_cache", sizeof(struct blkif_st), + 0, 0, NULL); + if (!blkif_cachep) + return -ENOMEM; + + return 0; +} /* * sysfs interface for VBD I/O requests From 6cd0388cd600a51a8824dc5b34f1107b367b0cac Mon Sep 17 00:00:00 2001 From: Konrad Rzeszutek Wilk Date: Wed, 20 Apr 2011 11:01:47 -0400 Subject: [PATCH 043/110] xen-blkback: Remove the address from the copyright notice. There is no need for it, as the address is updated constantly in the root of the Linux kernel. Signed-off-by: Konrad Rzeszutek Wilk --- drivers/block/xen-blkback/xenbus.c | 3 --- 1 file changed, 3 deletions(-) diff --git a/drivers/block/xen-blkback/xenbus.c b/drivers/block/xen-blkback/xenbus.c index 0c263a248007..c6c5286aa813 100644 --- a/drivers/block/xen-blkback/xenbus.c +++ b/drivers/block/xen-blkback/xenbus.c @@ -12,9 +12,6 @@ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details. - You should have received a copy of the GNU General Public License - along with this program; if not, write to the Free Software - Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA */ #include From 42c7841d171a2fe32005738dfebd724a90921496 Mon Sep 17 00:00:00 2001 From: Konrad Rzeszutek Wilk Date: Wed, 20 Apr 2011 11:21:43 -0400 Subject: [PATCH 044/110] xen-blkback: Inline some of the functions that were moved from vbd/interface.c Shuffling code around.
Signed-off-by: Konrad Rzeszutek Wilk --- drivers/block/xen-blkback/blkback.c | 78 +++-------------------------- drivers/block/xen-blkback/common.h | 22 ++------ drivers/block/xen-blkback/xenbus.c | 58 +++++++++++++++++++-- 3 files changed, 65 insertions(+), 93 deletions(-) diff --git a/drivers/block/xen-blkback/blkback.c b/drivers/block/xen-blkback/blkback.c index 63001fac9af2..806c2c947c63 100644 --- a/drivers/block/xen-blkback/blkback.c +++ b/drivers/block/xen-blkback/blkback.c @@ -170,75 +170,9 @@ static void free_req(struct pending_req *req) * Routines for managing virtual block devices (vbds). */ -#define vbd_sz(_v) ((_v)->bdev->bd_part ? \ - (_v)->bdev->bd_part->nr_sects : \ - get_capacity((_v)->bdev->bd_disk)) -unsigned long long vbd_size(struct vbd *vbd) -{ - return vbd_sz(vbd); -} - -unsigned int vbd_info(struct vbd *vbd) -{ - return vbd->type | (vbd->readonly ? VDISK_READONLY : 0); -} - -unsigned long vbd_secsize(struct vbd *vbd) -{ - return bdev_logical_block_size(vbd->bdev); -} - -int vbd_create(struct blkif_st *blkif, blkif_vdev_t handle, unsigned major, - unsigned minor, int readonly, int cdrom) -{ - struct vbd *vbd; - struct block_device *bdev; - - vbd = &blkif->vbd; - vbd->handle = handle; - vbd->readonly = readonly; - vbd->type = 0; - - vbd->pdevice = MKDEV(major, minor); - - bdev = blkdev_get_by_dev(vbd->pdevice, vbd->readonly ? - FMODE_READ : FMODE_WRITE, NULL); - - if (IS_ERR(bdev)) { - DPRINTK("vbd_creat: device %08x could not be opened.\n", - vbd->pdevice); - return -ENOENT; - } - - vbd->bdev = bdev; - vbd->size = vbd_size(vbd); - - if (vbd->bdev->bd_disk == NULL) { - DPRINTK("vbd_creat: device %08x doesn't exist.\n", - vbd->pdevice); - vbd_free(vbd); - return -ENOENT; - } - - if (vbd->bdev->bd_disk->flags & GENHD_FL_CD || cdrom) - vbd->type |= VDISK_CDROM; - if (vbd->bdev->bd_disk->flags & GENHD_FL_REMOVABLE) - vbd->type |= VDISK_REMOVABLE; - - DPRINTK("Successful creation of handle=%04x (dom=%u)\n", - handle, blkif->domid); - return 0; -} - -void vbd_free(struct vbd *vbd) -{ - if (vbd->bdev) - blkdev_put(vbd->bdev, vbd->readonly ? 
FMODE_READ : FMODE_WRITE); - vbd->bdev = NULL; -} - -int vbd_translate(struct phys_req *req, struct blkif_st *blkif, int operation) +static int vbd_translate(struct phys_req *req, struct blkif_st *blkif, + int operation) { struct vbd *vbd = &blkif->vbd; int rc = -EACCES; @@ -257,13 +191,13 @@ int vbd_translate(struct phys_req *req, struct blkif_st *blkif, int operation) return rc; } -void vbd_resize(struct blkif_st *blkif) +static void vbd_resize(struct blkif_st *blkif) { struct vbd *vbd = &blkif->vbd; struct xenbus_transaction xbt; int err; struct xenbus_device *dev = blkback_xenbus(blkif->be); - unsigned long long new_size = vbd_size(vbd); + unsigned long long new_size = vbd_sz(vbd); printk(KERN_INFO "VBD Resize: Domid: %d, Device: (%d, %d)\n", blkif->domid, MAJOR(vbd->pdevice), MINOR(vbd->pdevice)); @@ -276,7 +210,7 @@ again: return; } err = xenbus_printf(xbt, dev->nodename, "sectors", "%llu", - vbd_size(vbd)); + (unsigned long long)vbd_sz(vbd)); if (err) { printk(KERN_WARNING "Error writing new size"); goto abort; @@ -344,7 +278,7 @@ int blkif_schedule(void *arg) while (!kthread_should_stop()) { if (try_to_freeze()) continue; - if (unlikely(vbd->size != vbd_size(vbd))) + if (unlikely(vbd->size != vbd_sz(vbd))) vbd_resize(blkif); wait_event_interruptible( diff --git a/drivers/block/xen-blkback/common.h b/drivers/block/xen-blkback/common.h index 6257c1106591..4b5acb3e8b24 100644 --- a/drivers/block/xen-blkback/common.h +++ b/drivers/block/xen-blkback/common.h @@ -95,12 +95,10 @@ struct blkif_st { grant_ref_t shmem_ref; }; -struct blkif_st *blkif_alloc(domid_t domid); -void blkif_disconnect(struct blkif_st *blkif); -void blkif_free(struct blkif_st *blkif); -int blkif_map(struct blkif_st *blkif, unsigned long shared_page, - unsigned int evtchn); -void vbd_resize(struct blkif_st *blkif); + +#define vbd_sz(_v) ((_v)->bdev->bd_part ? \ + (_v)->bdev->bd_part->nr_sects : \ + get_capacity((_v)->bdev->bd_disk)) #define blkif_get(_b) (atomic_inc(&(_b)->refcnt)) #define blkif_put(_b) \ @@ -109,24 +107,12 @@ void vbd_resize(struct blkif_st *blkif); wake_up(&(_b)->waiting_to_free);\ } while (0) -/* Create a vbd. */ -int vbd_create(struct blkif_st *blkif, blkif_vdev_t vdevice, unsigned major, - unsigned minor, int readonly, int cdrom); -void vbd_free(struct vbd *vbd); - -unsigned long long vbd_size(struct vbd *vbd); -unsigned int vbd_info(struct vbd *vbd); -unsigned long vbd_secsize(struct vbd *vbd); - struct phys_req { unsigned short dev; unsigned short nr_sects; struct block_device *bdev; blkif_sector_t sector_number; }; - -int vbd_translate(struct phys_req *req, struct blkif_st *blkif, int operation); - int blkif_interface_init(void); int blkif_xenbus_init(void); diff --git a/drivers/block/xen-blkback/xenbus.c b/drivers/block/xen-blkback/xenbus.c index c6c5286aa813..75bf49bd365c 100644 --- a/drivers/block/xen-blkback/xenbus.c +++ b/drivers/block/xen-blkback/xenbus.c @@ -327,6 +327,56 @@ void xenvbd_sysfs_delif(struct xenbus_device *dev) device_remove_file(&dev->dev, &dev_attr_physical_device); } + +static void vbd_free(struct vbd *vbd) +{ + if (vbd->bdev) + blkdev_put(vbd->bdev, vbd->readonly ? 
FMODE_READ : FMODE_WRITE); + vbd->bdev = NULL; +} + +static int vbd_create(struct blkif_st *blkif, blkif_vdev_t handle, + unsigned major, unsigned minor, int readonly, + int cdrom) +{ + struct vbd *vbd; + struct block_device *bdev; + + vbd = &blkif->vbd; + vbd->handle = handle; + vbd->readonly = readonly; + vbd->type = 0; + + vbd->pdevice = MKDEV(major, minor); + + bdev = blkdev_get_by_dev(vbd->pdevice, vbd->readonly ? + FMODE_READ : FMODE_WRITE, NULL); + + if (IS_ERR(bdev)) { + DPRINTK("vbd_creat: device %08x could not be opened.\n", + vbd->pdevice); + return -ENOENT; + } + + vbd->bdev = bdev; + vbd->size = vbd_sz(vbd); + + if (vbd->bdev->bd_disk == NULL) { + DPRINTK("vbd_creat: device %08x doesn't exist.\n", + vbd->pdevice); + vbd_free(vbd); + return -ENOENT; + } + + if (vbd->bdev->bd_disk->flags & GENHD_FL_CD || cdrom) + vbd->type |= VDISK_CDROM; + if (vbd->bdev->bd_disk->flags & GENHD_FL_REMOVABLE) + vbd->type |= VDISK_REMOVABLE; + + DPRINTK("Successful creation of handle=%04x (dom=%u)\n", + handle, blkif->domid); + return 0; +} static int blkback_remove(struct xenbus_device *dev) { struct backend_info *be = dev_get_drvdata(&dev->dev); @@ -595,7 +645,7 @@ again: goto abort; err = xenbus_printf(xbt, dev->nodename, "sectors", "%llu", - vbd_size(&be->blkif->vbd)); + (unsigned long long)vbd_sz(&be->blkif->vbd)); if (err) { xenbus_dev_fatal(dev, err, "writing %s/sectors", dev->nodename); @@ -604,14 +654,16 @@ again: /* FIXME: use a typename instead */ err = xenbus_printf(xbt, dev->nodename, "info", "%u", - vbd_info(&be->blkif->vbd)); + be->blkif->vbd.type | + (be->blkif->vbd.readonly ? VDISK_READONLY : 0)); if (err) { xenbus_dev_fatal(dev, err, "writing %s/info", dev->nodename); goto abort; } err = xenbus_printf(xbt, dev->nodename, "sector-size", "%lu", - vbd_secsize(&be->blkif->vbd)); + (unsigned long) + bdev_logical_block_size(be->blkif->vbd.bdev)); if (err) { xenbus_dev_fatal(dev, err, "writing %s/sector-size", dev->nodename); From 8b6bf747d70e5bac1a34c8fd773230e1cfdd7546 Mon Sep 17 00:00:00 2001 From: Konrad Rzeszutek Wilk Date: Wed, 20 Apr 2011 11:50:43 -0400 Subject: [PATCH 045/110] xen/blkback: Prefix exposed functions with xen_ And also shorten names containing blkback to blkbk. This results in the symbol table (if compiled into the kernel) being much shorter, prettier, and also easier to search. Signed-off-by: Konrad Rzeszutek Wilk --- drivers/block/xen-blkback/blkback.c | 36 ++++++------- drivers/block/xen-blkback/common.h | 18 +++---- drivers/block/xen-blkback/xenbus.c | 80 +++++++++++++++-------------- 3 files changed, 68 insertions(+), 66 deletions(-) diff --git a/drivers/block/xen-blkback/blkback.c b/drivers/block/xen-blkback/blkback.c index 806c2c947c63..c4bc85e69d33 100644 --- a/drivers/block/xen-blkback/blkback.c +++ b/drivers/block/xen-blkback/blkback.c @@ -53,13 +53,13 @@ * pulled from a communication ring are quite likely to end up being part of * the same scatter/gather request at the disc. * - * ** TRY INCREASING 'blkif_reqs' IF WRITE SPEEDS SEEM TOO LOW ** + * ** TRY INCREASING 'xen_blkif_reqs' IF WRITE SPEEDS SEEM TOO LOW ** * * This will increase the chances of being able to write whole tracks. * 64 should be enough to keep us competitive with Linux.
*/ -static int blkif_reqs = 64; -module_param_named(reqs, blkif_reqs, int, 0); +static int xen_blkif_reqs = 64; +module_param_named(reqs, xen_blkif_reqs, int, 0); MODULE_PARM_DESC(reqs, "Number of blkback requests to allocate"); /* Run-time switchable: /sys/module/blkback/parameters/ */ @@ -196,7 +196,7 @@ static void vbd_resize(struct blkif_st *blkif) struct vbd *vbd = &blkif->vbd; struct xenbus_transaction xbt; int err; - struct xenbus_device *dev = blkback_xenbus(blkif->be); + struct xenbus_device *dev = xen_blkbk_xenbus(blkif->be); unsigned long long new_size = vbd_sz(vbd); printk(KERN_INFO "VBD Resize: Domid: %d, Device: (%d, %d)\n", @@ -244,7 +244,7 @@ static void blkif_notify_work(struct blkif_st *blkif) wake_up(&blkif->wq); } -irqreturn_t blkif_be_int(int irq, void *dev_id) +irqreturn_t xen_blkif_be_int(int irq, void *dev_id) { blkif_notify_work(dev_id); return IRQ_HANDLED; @@ -265,12 +265,12 @@ static void print_stats(struct blkif_st *blkif) blkif->st_oo_req = 0; } -int blkif_schedule(void *arg) +int xen_blkif_schedule(void *arg) { struct blkif_st *blkif = arg; struct vbd *vbd = &blkif->vbd; - blkif_get(blkif); + xen_blkif_get(blkif); if (debug_lvl) printk(KERN_DEBUG "%s: started\n", current->comm); @@ -305,7 +305,7 @@ int blkif_schedule(void *arg) printk(KERN_DEBUG "%s: exiting\n", current->comm); blkif->xenblkd = NULL; - blkif_put(blkif); + xen_blkif_put(blkif); return 0; } @@ -417,7 +417,7 @@ static void __end_block_io_op(struct pending_req *pending_req, int error) if ((pending_req->operation == BLKIF_OP_WRITE_BARRIER) && (error == -EOPNOTSUPP)) { DPRINTK("blkback: write barrier op failed, not supported\n"); - blkback_barrier(XBT_NIL, pending_req->blkif->be, 0); + xen_blkbk_barrier(XBT_NIL, pending_req->blkif->be, 0); pending_req->status = BLKIF_RSP_EOPNOTSUPP; } else if (error) { DPRINTK("Buffer not up-to-date at end of operation, " @@ -433,7 +433,7 @@ static void __end_block_io_op(struct pending_req *pending_req, int error) xen_blkbk_unmap(pending_req); make_response(pending_req->blkif, pending_req->id, pending_req->operation, pending_req->status); - blkif_put(pending_req->blkif); + xen_blkif_put(pending_req->blkif); free_req(pending_req); } } @@ -619,7 +619,7 @@ static void dispatch_rw_block_io(struct blkif_st *blkif, goto fail_flush; /* This corresponding blkif_put is done in __end_block_io_op */ - blkif_get(blkif); + xen_blkif_get(blkif); for (i = 0; i < nseg; i++) { while ((bio == NULL) || @@ -751,7 +751,7 @@ static void make_response(struct blkif_st *blkif, u64 id, notify_remote_via_irq(blkif->irq); } -static int __init blkif_init(void) +static int __init xen_blkif_init(void) { int i, mmap_pages; int rc = 0; @@ -765,10 +765,10 @@ static int __init blkif_init(void) return -ENOMEM; } - mmap_pages = blkif_reqs * BLKIF_MAX_SEGMENTS_PER_REQUEST; + mmap_pages = xen_blkif_reqs * BLKIF_MAX_SEGMENTS_PER_REQUEST; blkbk->pending_reqs = kmalloc(sizeof(blkbk->pending_reqs[0]) * - blkif_reqs, GFP_KERNEL); + xen_blkif_reqs, GFP_KERNEL); blkbk->pending_grant_handles = kzalloc(sizeof(blkbk->pending_grant_handles[0]) * mmap_pages, GFP_KERNEL); blkbk->pending_pages = kzalloc(sizeof(blkbk->pending_pages[0]) * @@ -788,7 +788,7 @@ static int __init blkif_init(void) goto out_of_memory; } } - rc = blkif_interface_init(); + rc = xen_blkif_interface_init(); if (rc) goto failed_init; @@ -798,11 +798,11 @@ static int __init blkif_init(void) spin_lock_init(&blkbk->pending_free_lock); init_waitqueue_head(&blkbk->pending_free_wq); - for (i = 0; i < blkif_reqs; i++) + for (i = 0; i < xen_blkif_reqs; 
i++) list_add_tail(&blkbk->pending_reqs[i].free_list, &blkbk->pending_free); - rc = blkif_xenbus_init(); + rc = xen_blkif_xenbus_init(); if (rc) goto failed_init; @@ -823,6 +823,6 @@ static int __init blkif_init(void) return rc; } -module_init(blkif_init); +module_init(xen_blkif_init); MODULE_LICENSE("Dual BSD/GPL"); diff --git a/drivers/block/xen-blkback/common.h b/drivers/block/xen-blkback/common.h index 4b5acb3e8b24..16af388268e7 100644 --- a/drivers/block/xen-blkback/common.h +++ b/drivers/block/xen-blkback/common.h @@ -100,8 +100,8 @@ struct blkif_st { (_v)->bdev->bd_part->nr_sects : \ get_capacity((_v)->bdev->bd_disk)) -#define blkif_get(_b) (atomic_inc(&(_b)->refcnt)) -#define blkif_put(_b) \ +#define xen_blkif_get(_b) (atomic_inc(&(_b)->refcnt)) +#define xen_blkif_put(_b) \ do { \ if (atomic_dec_and_test(&(_b)->refcnt)) \ wake_up(&(_b)->waiting_to_free);\ @@ -113,16 +113,16 @@ struct phys_req { struct block_device *bdev; blkif_sector_t sector_number; }; -int blkif_interface_init(void); +int xen_blkif_interface_init(void); -int blkif_xenbus_init(void); +int xen_blkif_xenbus_init(void); -irqreturn_t blkif_be_int(int irq, void *dev_id); -int blkif_schedule(void *arg); +irqreturn_t xen_blkif_be_int(int irq, void *dev_id); +int xen_blkif_schedule(void *arg); -int blkback_barrier(struct xenbus_transaction xbt, - struct backend_info *be, int state); +int xen_blkbk_barrier(struct xenbus_transaction xbt, + struct backend_info *be, int state); -struct xenbus_device *blkback_xenbus(struct backend_info *be); +struct xenbus_device *xen_blkbk_xenbus(struct backend_info *be); #endif /* __BLKIF__BACKEND__COMMON_H__ */ diff --git a/drivers/block/xen-blkback/xenbus.c b/drivers/block/xen-blkback/xenbus.c index 75bf49bd365c..64b0a1c760fb 100644 --- a/drivers/block/xen-blkback/xenbus.c +++ b/drivers/block/xen-blkback/xenbus.c @@ -35,13 +35,13 @@ struct backend_info { char *mode; }; -static struct kmem_cache *blkif_cachep; +static struct kmem_cache *xen_blkif_cachep; static void connect(struct backend_info *); static int connect_ring(struct backend_info *); static void backend_changed(struct xenbus_watch *, const char **, unsigned int); -struct xenbus_device *blkback_xenbus(struct backend_info *be) +struct xenbus_device *xen_blkbk_xenbus(struct backend_info *be) { return be->dev; } @@ -67,7 +67,7 @@ static int blkback_name(struct blkif_st *blkif, char *buf) return 0; } -static void update_blkif_status(struct blkif_st *blkif) +static void xen_update_blkif_status(struct blkif_st *blkif) { int err; char name[TASK_COMM_LEN]; @@ -98,7 +98,7 @@ static void update_blkif_status(struct blkif_st *blkif) } invalidate_inode_pages2(blkif->vbd.bdev->bd_inode->i_mapping); - blkif->xenblkd = kthread_run(blkif_schedule, blkif, name); + blkif->xenblkd = kthread_run(xen_blkif_schedule, blkif, name); if (IS_ERR(blkif->xenblkd)) { err = PTR_ERR(blkif->xenblkd); blkif->xenblkd = NULL; @@ -106,11 +106,11 @@ static void update_blkif_status(struct blkif_st *blkif) } } -struct blkif_st *blkif_alloc(domid_t domid) +static struct blkif_st *xen_blkif_alloc(domid_t domid) { struct blkif_st *blkif; - blkif = kmem_cache_alloc(blkif_cachep, GFP_KERNEL); + blkif = kmem_cache_alloc(xen_blkif_cachep, GFP_KERNEL); if (!blkif) return ERR_PTR(-ENOMEM); @@ -157,8 +157,8 @@ static void unmap_frontend_page(struct blkif_st *blkif) BUG(); } -int blkif_map(struct blkif_st *blkif, unsigned long shared_page, - unsigned int evtchn) +static int xen_blkif_map(struct blkif_st *blkif, unsigned long shared_page, + unsigned int evtchn) { int err; @@ -202,8 
+202,9 @@ int blkif_map(struct blkif_st *blkif, unsigned long shared_page, BUG(); } - err = bind_interdomain_evtchn_to_irqhandler( - blkif->domid, evtchn, blkif_be_int, 0, "blkif-backend", blkif); + err = bind_interdomain_evtchn_to_irqhandler(blkif->domid, evtchn, + xen_blkif_be_int, 0, + "blkif-backend", blkif); if (err < 0) { unmap_frontend_page(blkif); free_vm_area(blkif->blk_ring_area); @@ -215,7 +216,7 @@ int blkif_map(struct blkif_st *blkif, unsigned long shared_page, return 0; } -void blkif_disconnect(struct blkif_st *blkif) +static void xen_blkif_disconnect(struct blkif_st *blkif) { if (blkif->xenblkd) { kthread_stop(blkif->xenblkd); @@ -238,18 +239,19 @@ void blkif_disconnect(struct blkif_st *blkif) } } -void blkif_free(struct blkif_st *blkif) +void xen_blkif_free(struct blkif_st *blkif) { if (!atomic_dec_and_test(&blkif->refcnt)) BUG(); - kmem_cache_free(blkif_cachep, blkif); + kmem_cache_free(xen_blkif_cachep, blkif); } -int __init blkif_interface_init(void) +int __init xen_blkif_interface_init(void) { - blkif_cachep = kmem_cache_create("blkif_cache", sizeof(struct blkif_st), - 0, 0, NULL); - if (!blkif_cachep) + xen_blkif_cachep = kmem_cache_create("blkif_cache", + sizeof(struct blkif_st), + 0, 0, NULL); + if (!xen_blkif_cachep) return -ENOMEM; return 0; @@ -377,7 +379,7 @@ static int vbd_create(struct blkif_st *blkif, blkif_vdev_t handle, handle, blkif->domid); return 0; } -static int blkback_remove(struct xenbus_device *dev) +static int xen_blkbk_remove(struct xenbus_device *dev) { struct backend_info *be = dev_get_drvdata(&dev->dev); @@ -393,9 +395,9 @@ static int blkback_remove(struct xenbus_device *dev) } if (be->blkif) { - blkif_disconnect(be->blkif); + xen_blkif_disconnect(be->blkif); vbd_free(&be->blkif->vbd); - blkif_free(be->blkif); + xen_blkif_free(be->blkif); be->blkif = NULL; } @@ -404,8 +406,8 @@ static int blkback_remove(struct xenbus_device *dev) return 0; } -int blkback_barrier(struct xenbus_transaction xbt, - struct backend_info *be, int state) +int xen_blkbk_barrier(struct xenbus_transaction xbt, + struct backend_info *be, int state) { struct xenbus_device *dev = be->dev; int err; @@ -423,8 +425,8 @@ int blkback_barrier(struct xenbus_transaction xbt, * structures, and watch the store waiting for the hotplug scripts to tell us * the device's physical major and minor numbers. Switch to InitWait. */ -static int blkback_probe(struct xenbus_device *dev, - const struct xenbus_device_id *id) +static int xen_blkbk_probe(struct xenbus_device *dev, + const struct xenbus_device_id *id) { int err; struct backend_info *be = kzalloc(sizeof(struct backend_info), @@ -437,7 +439,7 @@ static int blkback_probe(struct xenbus_device *dev, be->dev = dev; dev_set_drvdata(&dev->dev, be); - be->blkif = blkif_alloc(dev->otherend_id); + be->blkif = xen_blkif_alloc(dev->otherend_id); if (IS_ERR(be->blkif)) { err = PTR_ERR(be->blkif); be->blkif = NULL; @@ -461,7 +463,7 @@ static int blkback_probe(struct xenbus_device *dev, fail: DPRINTK("failed"); - blkback_remove(dev); + xen_blkbk_remove(dev); return err; } @@ -550,7 +552,7 @@ static void backend_changed(struct xenbus_watch *watch, } /* We're potentially connected now */ - update_blkif_status(be->blkif); + xen_update_blkif_status(be->blkif); } } @@ -586,16 +588,16 @@ static void frontend_changed(struct xenbus_device *dev, /* Enforce precondition before potential leak point. * blkif_disconnect() is idempotent. 
*/ - blkif_disconnect(be->blkif); + xen_blkif_disconnect(be->blkif); err = connect_ring(be); if (err) break; - update_blkif_status(be->blkif); + xen_update_blkif_status(be->blkif); break; case XenbusStateClosing: - blkif_disconnect(be->blkif); + xen_blkif_disconnect(be->blkif); xenbus_switch_state(dev, XenbusStateClosing); break; @@ -640,7 +642,7 @@ again: return; } - err = blkback_barrier(xbt, be, 1); + err = xen_blkbk_barrier(xbt, be, 1); if (err) goto abort; @@ -726,7 +728,7 @@ static int connect_ring(struct backend_info *be) ring_ref, evtchn, be->blkif->blk_protocol, protocol); /* Map the shared frame, irq etc. */ - err = blkif_map(be->blkif, ring_ref, evtchn); + err = xen_blkif_map(be->blkif, ring_ref, evtchn); if (err) { xenbus_dev_fatal(dev, err, "mapping ring-ref %lu port %u", ring_ref, evtchn); @@ -740,23 +742,23 @@ static int connect_ring(struct backend_info *be) /* ** Driver Registration ** */ -static const struct xenbus_device_id blkback_ids[] = { +static const struct xenbus_device_id xen_blkbk_ids[] = { { "vbd" }, { "" } }; -static struct xenbus_driver blkback = { +static struct xenbus_driver xen_blkbk = { .name = "vbd", .owner = THIS_MODULE, - .ids = blkback_ids, - .probe = blkback_probe, - .remove = blkback_remove, + .ids = xen_blkbk_ids, + .probe = xen_blkbk_probe, + .remove = xen_blkbk_remove, .otherend_changed = frontend_changed }; -int blkif_xenbus_init(void) +int xen_blkif_xenbus_init(void) { - return xenbus_register_backend(&blkback); + return xenbus_register_backend(&xen_blkbk); } From 97961ef46b9b5a6a7c918a38b898a7b3e49869f4 Mon Sep 17 00:00:00 2001 From: Konrad Rzeszutek Wilk Date: Tue, 26 Apr 2011 12:57:59 -0400 Subject: [PATCH 046/110] xen/blkback: Move the plugging/unplugging to a higher level. We used to do the plug/unplug on the submit_bio. But that means that if within a stream of WRITE, WRITE, WRITE,...,WRITE we have one READ, it could stall the pipeline (as the 'submit_bio' could trigger the unplug_fnc to be called and stall/sync when doing the READ). Instead we want to move the unplugging to the point where the whole ring buffer (or as much of it as possible) has been processed. This also eliminates doing a plug/unplug for each request.
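In code, the pattern this change adopts looks roughly like the following minimal sketch; wait_for_requests() and process_ring() are placeholder stubs standing in for blkback's own wait and ring-processing code, not functions from this driver. The on-stack plug is held across the whole batch pulled from the ring, so every bio queued by the ring walk is released to the driver in one go instead of being plugged/unplugged per request.

#include <linux/blkdev.h>
#include <linux/kthread.h>

/* Placeholder stubs (illustration only). */
static void wait_for_requests(void) { }
static void process_ring(void) { /* would call submit_bio() per request */ }

static int batch_submit_thread(void *arg)
{
	while (!kthread_should_stop()) {
		struct blk_plug plug;

		wait_for_requests();
		blk_start_plug(&plug);	/* hold back dispatch while batching */
		process_ring();		/* queue every bio pulled from the ring */
		blk_finish_plug(&plug);	/* release the whole batch at once */
	}
	return 0;
}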
Signed-off-by: Konrad Rzeszutek Wilk --- drivers/block/xen-blkback/blkback.c | 13 ++++++------- 1 file changed, 6 insertions(+), 7 deletions(-) diff --git a/drivers/block/xen-blkback/blkback.c b/drivers/block/xen-blkback/blkback.c index c4bc85e69d33..ed85ba94b2e0 100644 --- a/drivers/block/xen-blkback/blkback.c +++ b/drivers/block/xen-blkback/blkback.c @@ -276,6 +276,8 @@ int xen_blkif_schedule(void *arg) printk(KERN_DEBUG "%s: started\n", current->comm); while (!kthread_should_stop()) { + struct blk_plug plug; + if (try_to_freeze()) continue; if (unlikely(vbd->size != vbd_sz(vbd))) @@ -292,9 +294,13 @@ blkif->waiting_reqs = 0; smp_mb(); /* clear flag *before* checking for work */ + blk_start_plug(&plug); + if (do_block_io_op(blkif)) blkif->waiting_reqs = 1; + blk_finish_plug(&plug); + if (log_stats && time_after(jiffies, blkif->st_print)) print_stats(blkif); } @@ -547,7 +553,6 @@ static void dispatch_rw_block_io(struct blkif_st *blkif, struct bio *biolist[BLKIF_MAX_SEGMENTS_PER_REQUEST]; int i, nbio = 0; int operation; - struct blk_plug plug; switch (req->operation) { case BLKIF_OP_READ: @@ -660,15 +665,9 @@ static void dispatch_rw_block_io(struct blkif_st *blkif, */ atomic_set(&pending_req->pendcnt, nbio); - /* Get a reference count for the disk queue and start sending I/O */ - blk_start_plug(&plug); - for (i = 0; i < nbio; i++) submit_bio(operation, biolist[i]); - blk_finish_plug(&plug); - /* Let the I/Os go.. */ - if (operation == READ) blkif->st_rd_sect += preq.nr_sects; else if (operation == WRITE || operation == WRITE_BARRIER) From 013c3ca184851078b9c04744efd4d47e52c6ecf8 Mon Sep 17 00:00:00 2001 From: Konrad Rzeszutek Wilk Date: Tue, 26 Apr 2011 16:24:18 -0400 Subject: [PATCH 047/110] xen/blkback: Stick REQ_SYNC on WRITEs to deal with CFQ I/O scheduler. If one runs a simple fio request with random read/write with a 20%/80% ratio, the numbers are incredibly bad when using the CFQ scheduler.

IOmeter       |       |        |          |
64K, randrw   | NOOP  | CFQ    | deadline |
randrwmix=80  |       |        |          |
--------------+-------+--------+----------+
blkback       |103/27 | 32/10  | 102/27   |
--------------+-------+--------+----------+
QEMU qdisk    |103/27 | 102/27 | 102/27   |

The problem as explained by Vivek Goyal was: ".. that difference is that sync vs async requests. In the case of a kernel thread submitting IO, [..] all the WRITES might be being considered as async and will go in a different queue. If you mix those with some READS, they are always sync and will go in different queue. In presence of sync queue, CFQ will idle and choke up WRITES in an attempt to improve latencies of READs. In case of AIO [note: this is what QEMU qdisk is doing], [..] it is direct IO and both READS and WRITES will be considered SYNC and will go in a single queue and no choking of WRITES will take place." The solution is quite simple: tack on REQ_SYNC (which is what the WRITE_ODIRECT macro points to) and the numbers go back up.
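As a rough illustration of the fix, the change boils down to picking a sync write flag when translating the ring operation. In this sketch, op_to_rw_flags() is a hypothetical helper (not a function in this driver), and WRITE_ODIRECT is assumed to expand to WRITE | REQ_SYNC on the kernels this series targets, as the message above states.

#include <linux/errno.h>
#include <linux/fs.h>
#include <xen/interface/io/blkif.h>

/* Hypothetical helper: map a blkif ring operation to bio rw flags. */
static int op_to_rw_flags(unsigned int op)
{
	switch (op) {
	case BLKIF_OP_READ:
		return READ;		/* reads are treated as sync by CFQ anyway */
	case BLKIF_OP_WRITE:
		return WRITE_ODIRECT;	/* sync write: shares the sync queue with reads */
	default:
		return -EINVAL;
	}
}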
Suggested-by: Vivek Goyal --- drivers/block/xen-blkback/blkback.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/block/xen-blkback/blkback.c b/drivers/block/xen-blkback/blkback.c index ed85ba94b2e0..8583b130499a 100644 --- a/drivers/block/xen-blkback/blkback.c +++ b/drivers/block/xen-blkback/blkback.c @@ -559,7 +559,7 @@ static void dispatch_rw_block_io(struct blkif_st *blkif, operation = READ; break; case BLKIF_OP_WRITE: - operation = WRITE; + operation = WRITE_ODIRECT; break; case BLKIF_OP_WRITE_BARRIER: operation = WRITE_BARRIER; From a19be5f0f073525306f6a4b000d90dc84065ed93 Mon Sep 17 00:00:00 2001 From: Konrad Rzeszutek Wilk Date: Wed, 27 Apr 2011 12:40:11 -0400 Subject: [PATCH 048/110] Revert "xen/blkback: Move the plugging/unplugging to a higher level." This reverts commit 97961ef46b9b5a6a7c918a38b898a7b3e49869f4 because we lose about 15% performance if we do the unplugging at the end of reading the ring buffer. --- drivers/block/xen-blkback/blkback.c | 13 +++++++------ 1 file changed, 7 insertions(+), 6 deletions(-) diff --git a/drivers/block/xen-blkback/blkback.c b/drivers/block/xen-blkback/blkback.c index 8583b130499a..eb068d0b47ea 100644 --- a/drivers/block/xen-blkback/blkback.c +++ b/drivers/block/xen-blkback/blkback.c @@ -276,8 +276,6 @@ int xen_blkif_schedule(void *arg) printk(KERN_DEBUG "%s: started\n", current->comm); while (!kthread_should_stop()) { - struct blk_plug plug; - if (try_to_freeze()) continue; if (unlikely(vbd->size != vbd_sz(vbd))) @@ -294,13 +292,9 @@ blkif->waiting_reqs = 0; smp_mb(); /* clear flag *before* checking for work */ - blk_start_plug(&plug); - if (do_block_io_op(blkif)) blkif->waiting_reqs = 1; - blk_finish_plug(&plug); - if (log_stats && time_after(jiffies, blkif->st_print)) print_stats(blkif); } @@ -547,7 +553,6 @@ static void dispatch_rw_block_io(struct blkif_st *blkif, struct bio *biolist[BLKIF_MAX_SEGMENTS_PER_REQUEST]; int i, nbio = 0; int operation; + struct blk_plug plug; switch (req->operation) { case BLKIF_OP_READ: @@ -660,15 +665,9 @@ static void dispatch_rw_block_io(struct blkif_st *blkif, */ atomic_set(&pending_req->pendcnt, nbio); + /* Get a reference count for the disk queue and start sending I/O */ + blk_start_plug(&plug); + for (i = 0; i < nbio; i++) submit_bio(operation, biolist[i]); + blk_finish_plug(&plug); + /* Let the I/Os go.. */ + if (operation == READ) blkif->st_rd_sect += preq.nr_sects; else if (operation == WRITE || operation == WRITE_BARRIER) From 73d842af27b863cbc816e75003edbc287bf57130 Mon Sep 17 00:00:00 2001 From: Konrad Rzeszutek Wilk Date: Thu, 5 May 2011 12:41:03 -0400 Subject: [PATCH 049/110] xen-blkfront: Provide for 'feature-flush-cache' the BLKIF_OP_WRITE_FLUSH_CACHE operation. The operation BLKIF_OP_WRITE_FLUSH_CACHE has existed in the Xen tree header file for years but it was never present in the Linux tree because neither the frontend nor the backend supported this interface. Signed-off-by: Konrad Rzeszutek Wilk --- include/xen/interface/io/blkif.h | 13 +++++++++++++ 1 file changed, 13 insertions(+) diff --git a/include/xen/interface/io/blkif.h b/include/xen/interface/io/blkif.h index 61e523af3c46..3d5d6db864fe 100644 --- a/include/xen/interface/io/blkif.h +++ b/include/xen/interface/io/blkif.h @@ -44,6 +44,19 @@ typedef uint64_t blkif_sector_t; */ #define BLKIF_OP_WRITE_BARRIER 2 +/* + * Recognised if "feature-flush-cache" is present in backend xenbus + * info.
A flush will ask the underlying storage hardware to flush its + * non-volatile caches as appropriate. The "feature-flush-cache" node + * contains a boolean indicating whether flush requests are likely to + * succeed or fail. Either way, a flush request may fail at any time + * with BLKIF_RSP_EOPNOTSUPP if it is unsupported by the underlying + * block-device hardware. The boolean simply indicates whether or not it + * is worthwhile for the frontend to attempt flushes. If a backend does + * not recognise BLKIF_OP_WRITE_FLUSH_CACHE, it should *not* create the + * "feature-flush-cache" node! + */ +#define BLKIF_OP_FLUSH_DISKCACHE 3 /* * Maximum scatter/gather segments per request. * This is carefully chosen so that sizeof(struct blkif_ring) <= PAGE_SIZE. From 24f567f952aa308c3352f3340b9d296fc72bd066 Mon Sep 17 00:00:00 2001 From: Konrad Rzeszutek Wilk Date: Wed, 4 May 2011 17:07:27 -0400 Subject: [PATCH 050/110] xen/blkback: Add support for BLKIF_OP_FLUSH_DISKCACHE and drop BLKIF_OP_WRITE_BARRIER. We drop the support for 'feature-barrier' and add in the support for the 'feature-flush-cache' if the real backend storage supports flushing. Signed-off-by: Konrad Rzeszutek Wilk --- drivers/block/xen-blkback/blkback.c | 33 ++++++++++++++++------------- drivers/block/xen-blkback/common.h | 7 +++--- drivers/block/xen-blkback/xenbus.c | 19 +++++++++++------ 3 files changed, 34 insertions(+), 25 deletions(-) diff --git a/drivers/block/xen-blkback/blkback.c b/drivers/block/xen-blkback/blkback.c index eb068d0b47ea..72ede0bf2697 100644 --- a/drivers/block/xen-blkback/blkback.c +++ b/drivers/block/xen-blkback/blkback.c @@ -46,8 +46,6 @@ #include #include "common.h" -#define WRITE_BARRIER (REQ_WRITE | REQ_FLUSH | REQ_FUA) - /* * These are rather arbitrary. They are fairly large because adjacent requests * pulled from a communication ring are quite likely to end up being part of @@ -256,9 +254,9 @@ irqreturn_t xen_blkif_be_int(int irq, void *dev_id) static void print_stats(struct blkif_st *blkif) { - printk(KERN_DEBUG "%s: oo %3d | rd %4d | wr %4d | br %4d\n", + printk(KERN_DEBUG "%s: oo %3d | rd %4d | wr %4d | f %4d\n", current->comm, blkif->st_oo_req, - blkif->st_rd_req, blkif->st_wr_req, blkif->st_br_req); + blkif->st_rd_req, blkif->st_wr_req, blkif->st_f_req); blkif->st_print = jiffies + msecs_to_jiffies(10 * 1000); blkif->st_rd_req = 0; blkif->st_wr_req = 0; @@ -414,10 +412,10 @@ static int xen_blkbk_map(struct blkif_request *req, struct pending_req *pending_ static void __end_block_io_op(struct pending_req *pending_req, int error) { /* An error fails the entire request. 
*/ - if ((pending_req->operation == BLKIF_OP_WRITE_BARRIER) && + if ((pending_req->operation == BLKIF_OP_FLUSH_DISKCACHE) && (error == -EOPNOTSUPP)) { - DPRINTK("blkback: write barrier op failed, not supported\n"); - xen_blkbk_barrier(XBT_NIL, pending_req->blkif->be, 0); + DPRINTK("blkback: flush diskcache op failed, not supported\n"); + xen_blkbk_flush_diskcache(XBT_NIL, pending_req->blkif->be, 0); pending_req->status = BLKIF_RSP_EOPNOTSUPP; } else if (error) { DPRINTK("Buffer not up-to-date at end of operation, " @@ -506,13 +504,14 @@ static int do_block_io_op(struct blkif_st *blkif) blkif->st_rd_req++; dispatch_rw_block_io(blkif, &req, pending_req); break; - case BLKIF_OP_WRITE_BARRIER: - blkif->st_br_req++; + case BLKIF_OP_FLUSH_DISKCACHE: + blkif->st_f_req++; /* fall through */ case BLKIF_OP_WRITE: blkif->st_wr_req++; dispatch_rw_block_io(blkif, &req, pending_req); break; + case BLKIF_OP_WRITE_BARRIER: default: /* A good sign something is wrong: sleep for a while to * avoid excessive CPU consumption by a bad guest. */ @@ -556,9 +555,14 @@ static void dispatch_rw_block_io(struct blkif_st *blkif, case BLKIF_OP_WRITE: operation = WRITE_ODIRECT; break; - case BLKIF_OP_WRITE_BARRIER: - operation = WRITE_BARRIER; + case BLKIF_OP_FLUSH_DISKCACHE: + operation = WRITE_FLUSH; + /* The frontend likes to set this to -1, which vbd_translate + * is alergic too. */ + req->u.rw.sector_number = 0; break; + case BLKIF_OP_WRITE_BARRIER: + /* Should never get here. */ default: operation = 0; /* make gcc happy */ BUG(); @@ -566,7 +570,7 @@ static void dispatch_rw_block_io(struct blkif_st *blkif, /* Check that the number of segments is sane. */ nseg = req->nr_segments; - if (unlikely(nseg == 0 && operation != WRITE_BARRIER) || + if (unlikely(nseg == 0 && operation != WRITE_FLUSH) || unlikely(nseg > BLKIF_MAX_SEGMENTS_PER_REQUEST)) { DPRINTK("Bad number of segments in request (%d)\n", nseg); /* Haven't submitted any bio's yet. */ @@ -643,7 +647,7 @@ static void dispatch_rw_block_io(struct blkif_st *blkif, /* This will be hit if the operation was a barrier. 
*/ if (!bio) { - BUG_ON(operation != WRITE_BARRIER); + BUG_ON(operation != WRITE_FLUSH); bio = biolist[nbio++] = bio_alloc(GFP_KERNEL, 0); if (unlikely(bio == NULL)) goto fail_put_bio; @@ -651,7 +655,6 @@ static void dispatch_rw_block_io(struct blkif_st *blkif, bio->bi_bdev = preq.bdev; bio->bi_private = pending_req; bio->bi_end_io = end_block_io_op; - bio->bi_sector = -1; } @@ -671,7 +674,7 @@ static void dispatch_rw_block_io(struct blkif_st *blkif, if (operation == READ) blkif->st_rd_sect += preq.nr_sects; - else if (operation == WRITE || operation == WRITE_BARRIER) + else if (operation == WRITE || operation == WRITE_FLUSH) blkif->st_wr_sect += preq.nr_sects; return; diff --git a/drivers/block/xen-blkback/common.h b/drivers/block/xen-blkback/common.h index 16af388268e7..af93837e1295 100644 --- a/drivers/block/xen-blkback/common.h +++ b/drivers/block/xen-blkback/common.h @@ -53,6 +53,7 @@ struct vbd { u32 pdevice; /* phys device that this vbd maps to */ struct block_device *bdev; sector_t size; /* Cached size parameter */ + bool flush_support; }; struct backend_info; @@ -85,7 +86,7 @@ struct blkif_st { int st_rd_req; int st_wr_req; int st_oo_req; - int st_br_req; + int st_f_req; int st_rd_sect; int st_wr_sect; @@ -120,8 +121,8 @@ int xen_blkif_xenbus_init(void); irqreturn_t xen_blkif_be_int(int irq, void *dev_id); int xen_blkif_schedule(void *arg); -int xen_blkbk_barrier(struct xenbus_transaction xbt, - struct backend_info *be, int state); +int xen_blkbk_flush_diskcache(struct xenbus_transaction xbt, + struct backend_info *be, int state); struct xenbus_device *xen_blkbk_xenbus(struct backend_info *be); diff --git a/drivers/block/xen-blkback/xenbus.c b/drivers/block/xen-blkback/xenbus.c index 64b0a1c760fb..9adcf806f83f 100644 --- a/drivers/block/xen-blkback/xenbus.c +++ b/drivers/block/xen-blkback/xenbus.c @@ -276,7 +276,7 @@ int __init xen_blkif_interface_init(void) VBD_SHOW(oo_req, "%d\n", be->blkif->st_oo_req); VBD_SHOW(rd_req, "%d\n", be->blkif->st_rd_req); VBD_SHOW(wr_req, "%d\n", be->blkif->st_wr_req); -VBD_SHOW(br_req, "%d\n", be->blkif->st_br_req); +VBD_SHOW(f_req, "%d\n", be->blkif->st_f_req); VBD_SHOW(rd_sect, "%d\n", be->blkif->st_rd_sect); VBD_SHOW(wr_sect, "%d\n", be->blkif->st_wr_sect); @@ -284,7 +284,7 @@ static struct attribute *vbdstat_attrs[] = { &dev_attr_oo_req.attr, &dev_attr_rd_req.attr, &dev_attr_wr_req.attr, - &dev_attr_br_req.attr, + &dev_attr_f_req.attr, &dev_attr_rd_sect.attr, &dev_attr_wr_sect.attr, NULL @@ -343,6 +343,7 @@ static int vbd_create(struct blkif_st *blkif, blkif_vdev_t handle, { struct vbd *vbd; struct block_device *bdev; + struct request_queue *q; vbd = &blkif->vbd; vbd->handle = handle; @@ -375,6 +376,10 @@ static int vbd_create(struct blkif_st *blkif, blkif_vdev_t handle, if (vbd->bdev->bd_disk->flags & GENHD_FL_REMOVABLE) vbd->type |= VDISK_REMOVABLE; + q = bdev_get_queue(bdev); + if (q && q->flush_flags) + vbd->flush_support = true; + DPRINTK("Successful creation of handle=%04x (dom=%u)\n", handle, blkif->domid); return 0; @@ -406,16 +411,16 @@ static int xen_blkbk_remove(struct xenbus_device *dev) return 0; } -int xen_blkbk_barrier(struct xenbus_transaction xbt, - struct backend_info *be, int state) +int xen_blkbk_flush_diskcache(struct xenbus_transaction xbt, + struct backend_info *be, int state) { struct xenbus_device *dev = be->dev; int err; - err = xenbus_printf(xbt, dev->nodename, "feature-barrier", + err = xenbus_printf(xbt, dev->nodename, "feature-flush-cache", "%d", state); if (err) - xenbus_dev_fatal(dev, err, "writing 
feature-barrier"); + xenbus_dev_fatal(dev, err, "writing feature-flush-cache"); return err; } @@ -642,7 +647,7 @@ again: return; } - err = xen_blkbk_barrier(xbt, be, 1); + err = xen_blkbk_flush_diskcache(xbt, be, be->blkif->vbd.flush_support); if (err) goto abort; From fc53bf757ede292312eee10d64f4e691c8c8cebf Mon Sep 17 00:00:00 2001 From: Konrad Rzeszutek Wilk Date: Thu, 5 May 2011 13:37:23 -0400 Subject: [PATCH 051/110] xen/blkback: Squash the checking for operation into dispatch_rw_block_io We do a check for the operations right before calling dispatch_rw_block_io. And then we do the same check in dispatch_rw_block_io. This patch squashes those checks into the 'dispatch_rw_block_io' function. Signed-off-by: Konrad Rzeszutek Wilk --- drivers/block/xen-blkback/blkback.c | 45 +++++++++-------------------- 1 file changed, 13 insertions(+), 32 deletions(-) diff --git a/drivers/block/xen-blkback/blkback.c b/drivers/block/xen-blkback/blkback.c index 72ede0bf2697..5f4284729a3a 100644 --- a/drivers/block/xen-blkback/blkback.c +++ b/drivers/block/xen-blkback/blkback.c @@ -123,9 +123,9 @@ static inline unsigned long vaddr(struct pending_req *req, int seg) static int do_block_io_op(struct blkif_st *blkif); -static void dispatch_rw_block_io(struct blkif_st *blkif, - struct blkif_request *req, - struct pending_req *pending_req); +static int dispatch_rw_block_io(struct blkif_st *blkif, + struct blkif_request *req, + struct pending_req *pending_req); static void make_response(struct blkif_st *blkif, u64 id, unsigned short op, int st); @@ -499,30 +499,8 @@ static int do_block_io_op(struct blkif_st *blkif) /* Apply all sanity checks to /private copy/ of request. */ barrier(); - switch (req.operation) { - case BLKIF_OP_READ: - blkif->st_rd_req++; - dispatch_rw_block_io(blkif, &req, pending_req); + if (dispatch_rw_block_io(blkif, &req, pending_req)) break; - case BLKIF_OP_FLUSH_DISKCACHE: - blkif->st_f_req++; - /* fall through */ - case BLKIF_OP_WRITE: - blkif->st_wr_req++; - dispatch_rw_block_io(blkif, &req, pending_req); - break; - case BLKIF_OP_WRITE_BARRIER: - default: - /* A good sign something is wrong: sleep for a while to - * avoid excessive CPU consumption by a bad guest. */ - msleep(1); - DPRINTK("error: unknown block io operation [%d]\n", - req.operation); - make_response(blkif, req.id, req.operation, - BLKIF_RSP_ERROR); - free_req(pending_req); - break; - } /* Yield point for this unbounded loop. */ cond_resched(); @@ -535,7 +513,7 @@ static int do_block_io_op(struct blkif_st *blkif) * Transumation of the 'struct blkif_request' to a proper 'struct bio' * and call the 'submit_bio' to pass it to the underlaying storage. */ -static void dispatch_rw_block_io(struct blkif_st *blkif, +static int dispatch_rw_block_io(struct blkif_st *blkif, struct blkif_request *req, struct pending_req *pending_req) { @@ -550,22 +528,25 @@ static void dispatch_rw_block_io(struct blkif_st *blkif, switch (req->operation) { case BLKIF_OP_READ: + blkif->st_rd_req++; operation = READ; break; case BLKIF_OP_WRITE: + blkif->st_wr_req++; operation = WRITE_ODIRECT; break; case BLKIF_OP_FLUSH_DISKCACHE: + blkif->st_f_req++; operation = WRITE_FLUSH; /* The frontend likes to set this to -1, which vbd_translate * is alergic too. */ req->u.rw.sector_number = 0; break; case BLKIF_OP_WRITE_BARRIER: - /* Should never get here. */ default: operation = 0; /* make gcc happy */ - BUG(); + goto fail_response; + break; } /* Check that the number of segments is sane. 
*/ @@ -677,7 +658,7 @@ static void dispatch_rw_block_io(struct blkif_st *blkif, else if (operation == WRITE || operation == WRITE_FLUSH) blkif->st_wr_sect += preq.nr_sects; - return; + return 0; fail_flush: xen_blkbk_unmap(pending_req); @@ -686,14 +667,14 @@ static void dispatch_rw_block_io(struct blkif_st *blkif, make_response(blkif, req->id, req->operation, BLKIF_RSP_ERROR); free_req(pending_req); msleep(1); /* back off a bit */ - return; + return -EIO; fail_put_bio: for (i = 0; i < (nbio-1); i++) bio_put(biolist[i]); __end_block_io_op(pending_req, -EINVAL); msleep(1); /* back off a bit */ - return; + return -EIO; } From 3d68b39926b3b247d76cc4da0256e979b2b730e3 Mon Sep 17 00:00:00 2001 From: Konrad Rzeszutek Wilk Date: Thu, 5 May 2011 13:42:10 -0400 Subject: [PATCH 052/110] xen/blkback: Fix up some of the comments. They had the wrong data or were in the wrong spot. Signed-off-by: Konrad Rzeszutek Wilk --- drivers/block/xen-blkback/blkback.c | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/drivers/block/xen-blkback/blkback.c b/drivers/block/xen-blkback/blkback.c index 5f4284729a3a..b9bdd9e43ab9 100644 --- a/drivers/block/xen-blkback/blkback.c +++ b/drivers/block/xen-blkback/blkback.c @@ -603,7 +603,7 @@ static int dispatch_rw_block_io(struct blkif_st *blkif, if (xen_blkbk_map(req, pending_req, seg)) goto fail_flush; - /* This corresponding blkif_put is done in __end_block_io_op */ + /* This corresponding xen_blkif_put is done in __end_block_io_op */ xen_blkif_get(blkif); for (i = 0; i < nseg; i++) { @@ -626,7 +626,7 @@ static int dispatch_rw_block_io(struct blkif_st *blkif, preq.sector_number += seg[i].nsec; } - /* This will be hit if the operation was a barrier. */ + /* This will be hit if the operation was a flush. */ if (!bio) { BUG_ON(operation != WRITE_FLUSH); bio = biolist[nbio++] = bio_alloc(GFP_KERNEL, 0); @@ -650,8 +650,8 @@ static int dispatch_rw_block_io(struct blkif_st *blkif, for (i = 0; i < nbio; i++) submit_bio(operation, biolist[i]); - blk_finish_plug(&plug); /* Let the I/Os go.. */ + blk_finish_plug(&plug); if (operation == READ) blkif->st_rd_sect += preq.nr_sects; From 9bd3c20487b7c13d397dc11dd51e30256bf4c9b3 Mon Sep 17 00:00:00 2001 From: "Stephen M. Cameron" Date: Tue, 3 May 2011 14:52:54 -0500 Subject: [PATCH 053/110] cciss: add readl after writel in interrupt mask setting code This is to ensure the board interrupts are really off when these functions return. Signed-off-by: Stephen M. 
Cameron Signed-off-by: Jens Axboe --- drivers/block/cciss.h | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/drivers/block/cciss.h b/drivers/block/cciss.h index 554bbd907d14..9b494392e5d5 100644 --- a/drivers/block/cciss.h +++ b/drivers/block/cciss.h @@ -239,11 +239,13 @@ static void SA5_intr_mask(ctlr_info_t *h, unsigned long val) { /* Turn interrupts on */ h->interrupts_enabled = 1; writel(0, h->vaddr + SA5_REPLY_INTR_MASK_OFFSET); + (void) readl(h->vaddr + SA5_REPLY_INTR_MASK_OFFSET); } else /* Turn them off */ { h->interrupts_enabled = 0; writel( SA5_INTR_OFF, h->vaddr + SA5_REPLY_INTR_MASK_OFFSET); + (void) readl(h->vaddr + SA5_REPLY_INTR_MASK_OFFSET); } } /* @@ -257,11 +259,13 @@ static void SA5B_intr_mask(ctlr_info_t *h, unsigned long val) { /* Turn interrupts on */ h->interrupts_enabled = 1; writel(0, h->vaddr + SA5_REPLY_INTR_MASK_OFFSET); + (void) readl(h->vaddr + SA5_REPLY_INTR_MASK_OFFSET); } else /* Turn them off */ { h->interrupts_enabled = 0; writel( SA5B_INTR_OFF, h->vaddr + SA5_REPLY_INTR_MASK_OFFSET); + (void) readl(h->vaddr + SA5_REPLY_INTR_MASK_OFFSET); } } @@ -271,10 +275,12 @@ static void SA5_performant_intr_mask(ctlr_info_t *h, unsigned long val) if (val) { /* turn on interrupts */ h->interrupts_enabled = 1; writel(0, h->vaddr + SA5_REPLY_INTR_MASK_OFFSET); + (void) readl(h->vaddr + SA5_REPLY_INTR_MASK_OFFSET); } else { h->interrupts_enabled = 0; writel(SA5_PERF_INTR_OFF, h->vaddr + SA5_REPLY_INTR_MASK_OFFSET); + (void) readl(h->vaddr + SA5_REPLY_INTR_MASK_OFFSET); } } From 62710ae1ceb839de1eebb5b4492ec8a7fbcf8d02 Mon Sep 17 00:00:00 2001 From: "Stephen M. Cameron" Date: Tue, 3 May 2011 14:53:00 -0500 Subject: [PATCH 054/110] cciss: do a better job of detecting controller reset failure Detect failure of controller reset by noticing if the 32 bytes of "driver version" we store on the hardware in the config table fail to get zeroed out. Previously we noticed if the controller did not transition to "simple mode", but this did not detect reset failure if the controller was already in simple mode prior to the reset attempt (e.g. due to module parameter hpsa_simple_mode=1). Signed-off-by: Stephen M. Cameron Signed-off-by: Jens Axboe --- drivers/block/cciss.c | 82 ++++++++++++++++++++++++++++++++++----- drivers/block/cciss_cmd.h | 1 + 2 files changed, 73 insertions(+), 10 deletions(-) diff --git a/drivers/block/cciss.c b/drivers/block/cciss.c index 9bf13988f1a2..ed6aa83d86dd 100644 --- a/drivers/block/cciss.c +++ b/drivers/block/cciss.c @@ -194,6 +194,8 @@ static int __devinit cciss_find_cfg_addrs(struct pci_dev *pdev, static int __devinit cciss_pci_find_memory_BAR(struct pci_dev *pdev, unsigned long *memory_bar); static inline u32 cciss_tag_discard_error_bits(ctlr_info_t *h, u32 tag); +static __devinit int write_driver_ver_to_cfgtable( + CfgTable_struct __iomem *cfgtable); /* performant mode helper functions */ static void calc_bucket_map(int *bucket, int num_buckets, int nsgs, @@ -4078,6 +4080,9 @@ static int __devinit cciss_find_cfgtables(ctlr_info_t *h) cfg_base_addr_index) + cfg_offset, sizeof(h->cfgtable)); if (!h->cfgtable) return -ENOMEM; + rc = write_driver_ver_to_cfgtable(h->cfgtable); + if (rc) + return rc; /* Find performant mode table. 
*/ trans_offset = readl(&h->cfgtable->TransMethodOffset); h->transtable = remap_pci_mem(pci_resource_start(h->pdev, @@ -4428,6 +4433,60 @@ static int cciss_controller_hard_reset(struct pci_dev *pdev, return 0; } +static __devinit void init_driver_version(char *driver_version, int len) +{ + memset(driver_version, 0, len); + strncpy(driver_version, "cciss " DRIVER_NAME, len - 1); +} + +static __devinit int write_driver_ver_to_cfgtable( + CfgTable_struct __iomem *cfgtable) +{ + char *driver_version; + int i, size = sizeof(cfgtable->driver_version); + + driver_version = kmalloc(size, GFP_KERNEL); + if (!driver_version) + return -ENOMEM; + + init_driver_version(driver_version, size); + for (i = 0; i < size; i++) + writeb(driver_version[i], &cfgtable->driver_version[i]); + kfree(driver_version); + return 0; +} + +static __devinit void read_driver_ver_from_cfgtable( + CfgTable_struct __iomem *cfgtable, unsigned char *driver_ver) +{ + int i; + + for (i = 0; i < sizeof(cfgtable->driver_version); i++) + driver_ver[i] = readb(&cfgtable->driver_version[i]); +} + +static __devinit int controller_reset_failed( + CfgTable_struct __iomem *cfgtable) +{ + + char *driver_ver, *old_driver_ver; + int rc, size = sizeof(cfgtable->driver_version); + + old_driver_ver = kmalloc(2 * size, GFP_KERNEL); + if (!old_driver_ver) + return -ENOMEM; + driver_ver = old_driver_ver + size; + + /* After a reset, the 32 bytes of "driver version" in the cfgtable + * should have been changed, otherwise we know the reset failed. + */ + init_driver_version(old_driver_ver, size); + read_driver_ver_from_cfgtable(cfgtable, driver_ver); + rc = !memcmp(driver_ver, old_driver_ver, size); + kfree(old_driver_ver); + return rc; +} + /* This does a hard reset of the controller using PCI power management * states or using the doorbell register. */ static __devinit int cciss_kdump_hard_reset_controller(struct pci_dev *pdev) @@ -4437,7 +4496,7 @@ static __devinit int cciss_kdump_hard_reset_controller(struct pci_dev *pdev) u64 cfg_base_addr_index; void __iomem *vaddr; unsigned long paddr; - u32 misc_fw_support, active_transport; + u32 misc_fw_support; int rc; CfgTable_struct __iomem *cfgtable; bool use_doorbell; @@ -4497,6 +4556,9 @@ static __devinit int cciss_kdump_hard_reset_controller(struct pci_dev *pdev) rc = -ENOMEM; goto unmap_vaddr; } + rc = write_driver_ver_to_cfgtable(cfgtable); + if (rc) + goto unmap_vaddr; /* If reset via doorbell register is supported, use that. */ misc_fw_support = readl(&cfgtable->misc_fw_support); @@ -4535,21 +4597,21 @@ static __devinit int cciss_kdump_hard_reset_controller(struct pci_dev *pdev) "failed waiting for board to become ready\n"); goto unmap_cfgtable; } - dev_info(&pdev->dev, "board ready.\n"); - /* Controller should be in simple mode at this point. If it's not, - * It means we're on one of those controllers which doesn't support - * the doorbell reset method and on which the PCI power management reset - * method doesn't work (P800, for example.) - * In those cases, don't try to proceed, as it generally doesn't work. 
- */ - active_transport = readl(&cfgtable->TransportActive); - if (active_transport & PERFORMANT_MODE) { + rc = controller_reset_failed(vaddr); + if (rc < 0) + goto unmap_cfgtable; + if (rc) { dev_warn(&pdev->dev, "Unable to successfully reset controller," " Ignoring controller.\n"); rc = -ENODEV; + goto unmap_cfgtable; + } else { + dev_info(&pdev->dev, "board ready.\n"); } + dev_info(&pdev->dev, "board ready.\n"); + unmap_cfgtable: iounmap(cfgtable); diff --git a/drivers/block/cciss_cmd.h b/drivers/block/cciss_cmd.h index cd441bef031f..a2e68d21efe3 100644 --- a/drivers/block/cciss_cmd.h +++ b/drivers/block/cciss_cmd.h @@ -235,6 +235,7 @@ typedef struct _CfgTable_struct { u8 reserved[0x78 - 0x58]; u32 misc_fw_support; /* offset 0x78 */ #define MISC_FW_DOORBELL_RESET (0x02) + u8 driver_version[32]; } CfgTable_struct; struct TransTable_struct { From 54dae3432021f38cf20542ccd152dddb91c7c2d7 Mon Sep 17 00:00:00 2001 From: "Stephen M. Cameron" Date: Tue, 3 May 2011 14:53:05 -0500 Subject: [PATCH 055/110] cciss: factor out command pool allocation functions Signed-off-by: Stephen M. Cameron Signed-off-by: Jens Axboe --- drivers/block/cciss.c | 67 +++++++++++++++++++++++-------------------- 1 file changed, 36 insertions(+), 31 deletions(-) diff --git a/drivers/block/cciss.c b/drivers/block/cciss.c index ed6aa83d86dd..152cb40e2e28 100644 --- a/drivers/block/cciss.c +++ b/drivers/block/cciss.c @@ -4653,6 +4653,39 @@ static __devinit int cciss_init_reset_devices(struct pci_dev *pdev) return 0; } +static __devinit int cciss_allocate_cmd_pool(ctlr_info_t *h) +{ + h->cmd_pool_bits = kmalloc( + DIV_ROUND_UP(h->nr_cmds, BITS_PER_LONG) * + sizeof(unsigned long), GFP_KERNEL); + h->cmd_pool = pci_alloc_consistent(h->pdev, + h->nr_cmds * sizeof(CommandList_struct), + &(h->cmd_pool_dhandle)); + h->errinfo_pool = pci_alloc_consistent(h->pdev, + h->nr_cmds * sizeof(ErrorInfo_struct), + &(h->errinfo_pool_dhandle)); + if ((h->cmd_pool_bits == NULL) + || (h->cmd_pool == NULL) + || (h->errinfo_pool == NULL)) { + dev_err(&h->pdev->dev, "out of memory"); + return -ENOMEM; + } + return 0; +} + +static void cciss_free_cmd_pool(ctlr_info_t *h) +{ + kfree(h->cmd_pool_bits); + if (h->cmd_pool) + pci_free_consistent(h->pdev, + h->nr_cmds * sizeof(CommandList_struct), + h->cmd_pool, h->cmd_pool_dhandle); + if (h->errinfo_pool) + pci_free_consistent(h->pdev, + h->nr_cmds * sizeof(ErrorInfo_struct), + h->errinfo_pool, h->errinfo_pool_dhandle); +} + /* * This is it. Find all the controllers and register them. I really hate * stealing all these major device numbers. @@ -4745,23 +4778,8 @@ static int __devinit cciss_init_one(struct pci_dev *pdev, h->devname, pdev->device, pci_name(pdev), h->intr[PERF_MODE_INT], dac ? 
"" : " not"); - h->cmd_pool_bits = - kmalloc(DIV_ROUND_UP(h->nr_cmds, BITS_PER_LONG) - * sizeof(unsigned long), GFP_KERNEL); - h->cmd_pool = (CommandList_struct *) - pci_alloc_consistent(h->pdev, - h->nr_cmds * sizeof(CommandList_struct), - &(h->cmd_pool_dhandle)); - h->errinfo_pool = (ErrorInfo_struct *) - pci_alloc_consistent(h->pdev, - h->nr_cmds * sizeof(ErrorInfo_struct), - &(h->errinfo_pool_dhandle)); - if ((h->cmd_pool_bits == NULL) - || (h->cmd_pool == NULL) - || (h->errinfo_pool == NULL)) { - dev_err(&h->pdev->dev, "out of memory"); + if (cciss_allocate_cmd_pool(h)) goto clean4; - } /* Need space for temp scatter list */ h->scatter_list = kmalloc(h->max_commands * @@ -4837,21 +4855,12 @@ static int __devinit cciss_init_one(struct pci_dev *pdev, return 1; clean4: - kfree(h->cmd_pool_bits); + cciss_free_cmd_pool(h); /* Free up sg elements */ for (k-- ; k >= 0; k--) kfree(h->scatter_list[k]); kfree(h->scatter_list); cciss_free_sg_chain_blocks(h->cmd_sg_list, h->nr_cmds); - if (h->cmd_pool) - pci_free_consistent(h->pdev, - h->nr_cmds * sizeof(CommandList_struct), - h->cmd_pool, h->cmd_pool_dhandle); - if (h->errinfo_pool) - pci_free_consistent(h->pdev, - h->nr_cmds * sizeof(ErrorInfo_struct), - h->errinfo_pool, - h->errinfo_pool_dhandle); free_irq(h->intr[PERF_MODE_INT], h); clean2: unregister_blkdev(h->major, h->devname); @@ -4949,11 +4958,7 @@ static void __devexit cciss_remove_one(struct pci_dev *pdev) iounmap(h->cfgtable); iounmap(h->vaddr); - pci_free_consistent(h->pdev, h->nr_cmds * sizeof(CommandList_struct), - h->cmd_pool, h->cmd_pool_dhandle); - pci_free_consistent(h->pdev, h->nr_cmds * sizeof(ErrorInfo_struct), - h->errinfo_pool, h->errinfo_pool_dhandle); - kfree(h->cmd_pool_bits); + cciss_free_cmd_pool(h); /* Free up sg elements */ for (j = 0; j < h->nr_cmds; j++) kfree(h->scatter_list[j]); From abf7966e616ef6e04393ef3678227f77d6179a8a Mon Sep 17 00:00:00 2001 From: "Stephen M. Cameron" Date: Tue, 3 May 2011 14:53:10 -0500 Subject: [PATCH 056/110] cciss: factor out scatterlist allocation functions Signed-off-by: Stephen M. 
Cameron Signed-off-by: Jens Axboe --- drivers/block/cciss.c | 55 +++++++++++++++++++++++++++---------------- 1 file changed, 35 insertions(+), 20 deletions(-) diff --git a/drivers/block/cciss.c b/drivers/block/cciss.c index 152cb40e2e28..55ca1f45c0c7 100644 --- a/drivers/block/cciss.c +++ b/drivers/block/cciss.c @@ -4673,6 +4673,39 @@ static __devinit int cciss_allocate_cmd_pool(ctlr_info_t *h) return 0; } +static __devinit int cciss_allocate_scatterlists(ctlr_info_t *h) +{ + int i; + + /* zero it, so that on free we need not know how many were alloc'ed */ + h->scatter_list = kzalloc(h->max_commands * + sizeof(struct scatterlist *), GFP_KERNEL); + if (!h->scatter_list) + return -ENOMEM; + + for (i = 0; i < h->nr_cmds; i++) { + h->scatter_list[i] = kmalloc(sizeof(struct scatterlist) * + h->maxsgentries, GFP_KERNEL); + if (h->scatter_list[i] == NULL) { + dev_err(&h->pdev->dev, "could not allocate " + "s/g lists\n"); + return -ENOMEM; + } + } + return 0; +} + +static void cciss_free_scatterlists(ctlr_info_t *h) +{ + int i; + + if (h->scatter_list) { + for (i = 0; i < h->nr_cmds; i++) + kfree(h->scatter_list[i]); + kfree(h->scatter_list); + } +} + static void cciss_free_cmd_pool(ctlr_info_t *h) { kfree(h->cmd_pool_bits); @@ -4696,7 +4729,6 @@ static int __devinit cciss_init_one(struct pci_dev *pdev, { int i; int j = 0; - int k = 0; int rc; int dac, return_code; InquiryData_struct *inq_buff; @@ -4781,23 +4813,9 @@ static int __devinit cciss_init_one(struct pci_dev *pdev, if (cciss_allocate_cmd_pool(h)) goto clean4; - /* Need space for temp scatter list */ - h->scatter_list = kmalloc(h->max_commands * - sizeof(struct scatterlist *), - GFP_KERNEL); - if (!h->scatter_list) + if (cciss_allocate_scatterlists(h)) goto clean4; - for (k = 0; k < h->nr_cmds; k++) { - h->scatter_list[k] = kmalloc(sizeof(struct scatterlist) * - h->maxsgentries, - GFP_KERNEL); - if (h->scatter_list[k] == NULL) { - dev_err(&h->pdev->dev, - "could not allocate s/g lists\n"); - goto clean4; - } - } h->cmd_sg_list = cciss_allocate_sg_chain_blocks(h, h->chainsize, h->nr_cmds); if (!h->cmd_sg_list && h->chainsize > 0) @@ -4856,10 +4874,7 @@ static int __devinit cciss_init_one(struct pci_dev *pdev, clean4: cciss_free_cmd_pool(h); - /* Free up sg elements */ - for (k-- ; k >= 0; k--) - kfree(h->scatter_list[k]); - kfree(h->scatter_list); + cciss_free_scatterlists(h); cciss_free_sg_chain_blocks(h->cmd_sg_list, h->nr_cmds); free_irq(h->intr[PERF_MODE_INT], h); clean2: From 2b48085f972a761b38cf60c626031a7fdd9e6d55 Mon Sep 17 00:00:00 2001 From: "Stephen M. Cameron" Date: Tue, 3 May 2011 14:53:16 -0500 Subject: [PATCH 057/110] cciss: factor out irq request code Signed-off-by: Stephen M. 
Cameron Signed-off-by: Jens Axboe --- drivers/block/cciss.c | 41 +++++++++++++++++++++++++---------------- 1 file changed, 25 insertions(+), 16 deletions(-) diff --git a/drivers/block/cciss.c b/drivers/block/cciss.c index 55ca1f45c0c7..63fe05af2c5d 100644 --- a/drivers/block/cciss.c +++ b/drivers/block/cciss.c @@ -4719,6 +4719,28 @@ static void cciss_free_cmd_pool(ctlr_info_t *h) h->errinfo_pool, h->errinfo_pool_dhandle); } +static int cciss_request_irq(ctlr_info_t *h, + irqreturn_t (*msixhandler)(int, void *), + irqreturn_t (*intxhandler)(int, void *)) +{ + if (h->msix_vector || h->msi_vector) { + if (!request_irq(h->intr[PERF_MODE_INT], msixhandler, + IRQF_DISABLED, h->devname, h)) + return 0; + dev_err(&h->pdev->dev, "Unable to get msi irq %d" + " for %s\n", h->intr[PERF_MODE_INT], + h->devname); + return -1; + } + + if (!request_irq(h->intr[PERF_MODE_INT], intxhandler, + IRQF_DISABLED, h->devname, h)) + return 0; + dev_err(&h->pdev->dev, "Unable to get irq %d for %s\n", + h->intr[PERF_MODE_INT], h->devname); + return -1; +} + /* * This is it. Find all the controllers and register them. I really hate * stealing all these major device numbers. @@ -4789,22 +4811,9 @@ static int __devinit cciss_init_one(struct pci_dev *pdev, /* make sure the board interrupts are off */ h->access.set_intr_mask(h, CCISS_INTR_OFF); - if (h->msi_vector || h->msix_vector) { - if (request_irq(h->intr[PERF_MODE_INT], - do_cciss_msix_intr, - IRQF_DISABLED, h->devname, h)) { - dev_err(&h->pdev->dev, "Unable to get irq %d for %s\n", - h->intr[PERF_MODE_INT], h->devname); - goto clean2; - } - } else { - if (request_irq(h->intr[PERF_MODE_INT], do_cciss_intx, - IRQF_DISABLED, h->devname, h)) { - dev_err(&h->pdev->dev, "Unable to get irq %d for %s\n", - h->intr[PERF_MODE_INT], h->devname); - goto clean2; - } - } + rc = cciss_request_irq(h, do_cciss_msix_intr, do_cciss_intx); + if (rc) + goto clean2; dev_info(&h->pdev->dev, "%s: <0x%x> at PCI %s IRQ %d%s using DAC\n", h->devname, pdev->device, pci_name(pdev), From e363e0143615a67f19d56e6b223b55df3bd9f580 Mon Sep 17 00:00:00 2001 From: "Stephen M. Cameron" Date: Tue, 3 May 2011 14:53:21 -0500 Subject: [PATCH 058/110] cciss: fix reply pool and block fetch table memory leaks Signed-off-by: Jens Axboe --- drivers/block/cciss.c | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/drivers/block/cciss.c b/drivers/block/cciss.c index 63fe05af2c5d..d60448941d53 100644 --- a/drivers/block/cciss.c +++ b/drivers/block/cciss.c @@ -4988,6 +4988,10 @@ static void __devexit cciss_remove_one(struct pci_dev *pdev) kfree(h->scatter_list[j]); kfree(h->scatter_list); cciss_free_sg_chain_blocks(h->cmd_sg_list, h->nr_cmds); + kfree(h->blockFetchTable); + if (h->reply_pool) + pci_free_consistent(h->pdev, h->max_commands * sizeof(__u64), + h->reply_pool, h->reply_pool_dhandle); /* * Deliberately omit pci_disable_device(): it does something nasty to * Smart Array controllers that pci_enable_device does not undo From 8f71bb829a964ef4deead86b60fda09452fb5c2f Mon Sep 17 00:00:00 2001 From: "Stephen M. Cameron" Date: Tue, 3 May 2011 14:53:26 -0500 Subject: [PATCH 059/110] cciss: get rid of message related magic numbers Signed-off-by: Stephen M. 
Cameron Signed-off-by: Jens Axboe --- drivers/block/cciss.c | 8 ++++---- drivers/block/cciss_cmd.h | 8 ++++++++ 2 files changed, 12 insertions(+), 4 deletions(-) diff --git a/drivers/block/cciss.c b/drivers/block/cciss.c index d60448941d53..1a4dff09c541 100644 --- a/drivers/block/cciss.c +++ b/drivers/block/cciss.c @@ -2569,7 +2569,7 @@ static int fill_cmd(ctlr_info_t *h, CommandList_struct *c, __u8 cmd, void *buff, } } else if (cmd_type == TYPE_MSG) { switch (cmd) { - case 0: /* ABORT message */ + case CCISS_ABORT_MSG: c->Request.CDBLen = 12; c->Request.Type.Attribute = ATTR_SIMPLE; c->Request.Type.Direction = XFER_WRITE; @@ -2579,16 +2579,16 @@ static int fill_cmd(ctlr_info_t *h, CommandList_struct *c, __u8 cmd, void *buff, /* buff contains the tag of the command to abort */ memcpy(&c->Request.CDB[4], buff, 8); break; - case 1: /* RESET message */ + case CCISS_RESET_MSG: c->Request.CDBLen = 16; c->Request.Type.Attribute = ATTR_SIMPLE; c->Request.Type.Direction = XFER_NONE; c->Request.Timeout = 0; memset(&c->Request.CDB[0], 0, sizeof(c->Request.CDB)); c->Request.CDB[0] = cmd; /* reset */ - c->Request.CDB[1] = 0x03; /* reset a target */ + c->Request.CDB[1] = CCISS_RESET_TYPE_TARGET; break; - case 3: /* No-Op message */ + case CCISS_NOOP_MSG: c->Request.CDBLen = 1; c->Request.Type.Attribute = ATTR_SIMPLE; c->Request.Type.Direction = XFER_WRITE; diff --git a/drivers/block/cciss_cmd.h b/drivers/block/cciss_cmd.h index a2e68d21efe3..3b20c31d746d 100644 --- a/drivers/block/cciss_cmd.h +++ b/drivers/block/cciss_cmd.h @@ -142,6 +142,14 @@ typedef struct _ReadCapdata_struct_16 #define BMIC_CACHE_FLUSH 0xc2 #define CCISS_CACHE_FLUSH 0x01 /* C2 was already being used by CCISS */ +#define CCISS_ABORT_MSG 0x00 +#define CCISS_RESET_MSG 0x01 +#define CCISS_RESET_TYPE_CONTROLLER 0x00 +#define CCISS_RESET_TYPE_BUS 0x01 +#define CCISS_RESET_TYPE_TARGET 0x03 +#define CCISS_RESET_TYPE_LUN 0x04 +#define CCISS_NOOP_MSG 0x03 + /* Command List Structure */ #define CTLR_LUNID "\0\0\0\0\0\0\0\0" From 19adbb9254cbd46224376fcb3a773a2272e0845e Mon Sep 17 00:00:00 2001 From: "Stephen M. Cameron" Date: Tue, 3 May 2011 14:53:31 -0500 Subject: [PATCH 060/110] cciss: increase time to wait for board reset to start Signed-off-by: Stephen M. Cameron Signed-off-by: Jens Axboe --- drivers/block/cciss.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/block/cciss.h b/drivers/block/cciss.h index 9b494392e5d5..21ec628ef700 100644 --- a/drivers/block/cciss.h +++ b/drivers/block/cciss.h @@ -200,7 +200,7 @@ struct ctlr_info * the above. */ #define CCISS_BOARD_READY_WAIT_SECS (120) -#define CCISS_BOARD_NOT_READY_WAIT_SECS (10) +#define CCISS_BOARD_NOT_READY_WAIT_SECS (100) #define CCISS_BOARD_READY_POLL_INTERVAL_MSECS (100) #define CCISS_BOARD_READY_ITERATIONS \ ((CCISS_BOARD_READY_WAIT_SECS * 1000) / \ From 59ec86bb9872fbf9fd8572a936423f5e3ad615e7 Mon Sep 17 00:00:00 2001 From: "Stephen M. Cameron" Date: Tue, 3 May 2011 14:53:36 -0500 Subject: [PATCH 061/110] cciss: clarify messages around reset behavior When waiting for the board to become "not ready" don't print a message saying "waiting for board to become ready" (possibly followed by a message saying "failed waiting for board to become not ready". Instead, it should be "waiting for board to reset" and "failed waiting for board to reset." Signed-off-by: Stephen M. 
Cameron " Signed-off-by: Jens Axboe --- drivers/block/cciss.c | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/drivers/block/cciss.c b/drivers/block/cciss.c index 1a4dff09c541..3b557231e1d3 100644 --- a/drivers/block/cciss.c +++ b/drivers/block/cciss.c @@ -4586,11 +4586,11 @@ static __devinit int cciss_kdump_hard_reset_controller(struct pci_dev *pdev) msleep(CCISS_POST_RESET_PAUSE_MSECS); /* Wait for board to become not ready, then ready. */ - dev_info(&pdev->dev, "Waiting for board to become ready.\n"); + dev_info(&pdev->dev, "Waiting for board to reset.\n"); rc = cciss_wait_for_board_state(pdev, vaddr, BOARD_NOT_READY); if (rc) /* Don't bail, might be E500, etc. which can't be reset */ dev_warn(&pdev->dev, - "failed waiting for board to become not ready\n"); + "failed waiting for board to reset\n"); rc = cciss_wait_for_board_state(pdev, vaddr, BOARD_READY); if (rc) { dev_warn(&pdev->dev, @@ -4641,6 +4641,7 @@ static __devinit int cciss_init_reset_devices(struct pci_dev *pdev) return -ENODEV; /* Now try to get the controller to respond to a no-op */ + dev_warn(&pdev->dev, "Waiting for controller to respond to no-op\n"); for (i = 0; i < CCISS_POST_RESET_NOOP_RETRIES; i++) { if (cciss_noop(pdev) == 0) break; From 3e28601fdfdec75ce8f6aaaf58540fdd0883fb58 Mon Sep 17 00:00:00 2001 From: "Stephen M. Cameron" Date: Tue, 3 May 2011 14:53:41 -0500 Subject: [PATCH 062/110] cciss: increase timeouts for post-reset no-ops Just to reduce the messages about timeouts that appear. Signed-off-by: Stephen M. Cameron Signed-off-by: Jens Axboe --- drivers/block/cciss.c | 2 +- drivers/block/cciss.h | 3 ++- 2 files changed, 3 insertions(+), 2 deletions(-) diff --git a/drivers/block/cciss.c b/drivers/block/cciss.c index 3b557231e1d3..b9f8658341cc 100644 --- a/drivers/block/cciss.c +++ b/drivers/block/cciss.c @@ -4353,7 +4353,7 @@ static __devinit int cciss_message(struct pci_dev *pdev, unsigned char opcode, u tag = readl(vaddr + SA5_REPLY_PORT_OFFSET); if ((tag & ~3) == paddr32) break; - schedule_timeout_uninterruptible(HZ); + msleep(CCISS_POST_RESET_NOOP_TIMEOUT_MSECS); } iounmap(vaddr); diff --git a/drivers/block/cciss.h b/drivers/block/cciss.h index 21ec628ef700..16b4d58d84dd 100644 --- a/drivers/block/cciss.h +++ b/drivers/block/cciss.h @@ -209,8 +209,9 @@ struct ctlr_info ((CCISS_BOARD_NOT_READY_WAIT_SECS * 1000) / \ CCISS_BOARD_READY_POLL_INTERVAL_MSECS) #define CCISS_POST_RESET_PAUSE_MSECS (3000) -#define CCISS_POST_RESET_NOOP_INTERVAL_MSECS (1000) +#define CCISS_POST_RESET_NOOP_INTERVAL_MSECS (4000) #define CCISS_POST_RESET_NOOP_RETRIES (12) +#define CCISS_POST_RESET_NOOP_TIMEOUT_MSECS (10000) /* Send the command to the hardware From bf2e2e6b87ae38fab460a36abfe272d99ae8be49 Mon Sep 17 00:00:00 2001 From: "Stephen M. Cameron" Date: Tue, 3 May 2011 14:53:46 -0500 Subject: [PATCH 063/110] cciss: use new doorbell-bit-5 reset method The bit-2-doorbell reset method seemed to cause (survivable) NMIs on some systems and (unsurvivable) IOCK NMIs on some G7 servers. Firmware guys implemented a new doorbell method to alleviate these problems triggered by bit 5 of the doorbell register. We want to use it if it's available. Signed-off-by: Stephen M. 
Cameron Signed-off-by: Jens Axboe --- drivers/block/cciss.c | 25 ++++++++++++++----------- drivers/block/cciss_cmd.h | 2 ++ 2 files changed, 16 insertions(+), 11 deletions(-) diff --git a/drivers/block/cciss.c b/drivers/block/cciss.c index b9f8658341cc..df58c59d9031 100644 --- a/drivers/block/cciss.c +++ b/drivers/block/cciss.c @@ -4384,7 +4384,7 @@ static __devinit int cciss_message(struct pci_dev *pdev, unsigned char opcode, u #define cciss_noop(p) cciss_message(p, 3, 0) static int cciss_controller_hard_reset(struct pci_dev *pdev, - void * __iomem vaddr, bool use_doorbell) + void * __iomem vaddr, u32 use_doorbell) { u16 pmcsr; int pos; @@ -4395,7 +4395,7 @@ static int cciss_controller_hard_reset(struct pci_dev *pdev, * other way using the doorbell register. */ dev_info(&pdev->dev, "using doorbell to reset controller\n"); - writel(DOORBELL_CTLR_RESET, vaddr + SA5_DOORBELL); + writel(use_doorbell, vaddr + SA5_DOORBELL); msleep(1000); } else { /* Try to do it the PCI power state way */ @@ -4499,7 +4499,7 @@ static __devinit int cciss_kdump_hard_reset_controller(struct pci_dev *pdev) u32 misc_fw_support; int rc; CfgTable_struct __iomem *cfgtable; - bool use_doorbell; + u32 use_doorbell; u32 board_id; u16 command_register; @@ -4560,15 +4560,18 @@ static __devinit int cciss_kdump_hard_reset_controller(struct pci_dev *pdev) if (rc) goto unmap_vaddr; - /* If reset via doorbell register is supported, use that. */ - misc_fw_support = readl(&cfgtable->misc_fw_support); - use_doorbell = misc_fw_support & MISC_FW_DOORBELL_RESET; - - /* The doorbell reset seems to cause lockups on some Smart - * Arrays (e.g. P410, P410i, maybe others). Until this is - * fixed or at least isolated, avoid the doorbell reset. + /* If reset via doorbell register is supported, use that. + * There are two such methods. Favor the newest method. */ - use_doorbell = 0; + misc_fw_support = readl(&cfgtable->misc_fw_support); + use_doorbell = misc_fw_support & MISC_FW_DOORBELL_RESET2; + if (use_doorbell) { + use_doorbell = DOORBELL_CTLR_RESET2; + } else { + use_doorbell = misc_fw_support & MISC_FW_DOORBELL_RESET; + if (use_doorbell) + use_doorbell = DOORBELL_CTLR_RESET; + } rc = cciss_controller_hard_reset(pdev, vaddr, use_doorbell); if (rc) diff --git a/drivers/block/cciss_cmd.h b/drivers/block/cciss_cmd.h index 3b20c31d746d..d9be6b4d49a6 100644 --- a/drivers/block/cciss_cmd.h +++ b/drivers/block/cciss_cmd.h @@ -53,6 +53,7 @@ #define CFGTBL_ChangeReq 0x00000001l #define CFGTBL_AccCmds 0x00000001l #define DOORBELL_CTLR_RESET 0x00000004l +#define DOORBELL_CTLR_RESET2 0x00000020l #define CFGTBL_Trans_Simple 0x00000002l #define CFGTBL_Trans_Performant 0x00000004l @@ -243,6 +244,7 @@ typedef struct _CfgTable_struct { u8 reserved[0x78 - 0x58]; u32 misc_fw_support; /* offset 0x78 */ #define MISC_FW_DOORBELL_RESET (0x02) +#define MISC_FW_DOORBELL_RESET2 (0x10) u8 driver_version[32]; } CfgTable_struct; From 5afe278114a8dd9480813377c75b5e40a42c5066 Mon Sep 17 00:00:00 2001 From: "Stephen M. Cameron" Date: Tue, 3 May 2011 14:53:52 -0500 Subject: [PATCH 064/110] cciss: do soft reset if hard reset is broken on driver load, if reset_devices is set, and the hard reset attempts fail, try to bring up the controller to the point that a command can be sent, and send it a soft reset command, then after the reset undo whatever driver initialization was done to get it to the point to take a command, and re-do it after the reset. 
This is to get kdump to work on all the "non-resettable" controllers (except 64xx controllers which can't be reset due to the potentially shared cache module.) Signed-off-by: Stephen M. Cameron Signed-off-by: Jens Axboe --- drivers/block/cciss.c | 236 +++++++++++++++++++++++++++++++++++++++--- 1 file changed, 221 insertions(+), 15 deletions(-) diff --git a/drivers/block/cciss.c b/drivers/block/cciss.c index df58c59d9031..23b0ba49300a 100644 --- a/drivers/block/cciss.c +++ b/drivers/block/cciss.c @@ -2477,6 +2477,31 @@ static int deregister_disk(ctlr_info_t *h, int drv_index, return 0; } +static int __devinit cciss_send_reset(ctlr_info_t *h, unsigned char *scsi3addr, + u8 reset_type) +{ + CommandList_struct *c; + int return_status; + + c = cmd_alloc(h); + if (!c) + return -ENOMEM; + return_status = fill_cmd(h, c, CCISS_RESET_MSG, NULL, 0, 0, + CTLR_LUNID, TYPE_MSG); + c->Request.CDB[1] = reset_type; /* fill_cmd defaults to target reset */ + if (return_status != IO_OK) { + cmd_special_free(h, c); + return return_status; + } + c->waiting = NULL; + enqueue_cmd_and_start_io(h, c); + /* Don't wait for completion, the reset won't complete. Don't free + * the command either. This is the last command we will send before + * re-initializing everything, so it doesn't matter and won't leak. + */ + return 0; +} + static int fill_cmd(ctlr_info_t *h, CommandList_struct *c, __u8 cmd, void *buff, size_t size, __u8 page_code, unsigned char *scsi3addr, int cmd_type) @@ -3463,6 +3488,63 @@ static inline u32 process_nonindexed_cmd(ctlr_info_t *h, u32 raw_tag) return next_command(h); } +/* Some controllers, like p400, will give us one interrupt + * after a soft reset, even if we turned interrupts off. + * Only need to check for this in the cciss_xxx_discard_completions + * functions. + */ +static int ignore_bogus_interrupt(ctlr_info_t *h) +{ + if (likely(!reset_devices)) + return 0; + + if (likely(h->interrupts_enabled)) + return 0; + + dev_info(&h->pdev->dev, "Received interrupt while interrupts disabled " + "(known firmware bug.) Ignoring.\n"); + + return 1; +} + +static irqreturn_t cciss_intx_discard_completions(int irq, void *dev_id) +{ + ctlr_info_t *h = dev_id; + unsigned long flags; + u32 raw_tag; + + if (ignore_bogus_interrupt(h)) + return IRQ_NONE; + + if (interrupt_not_for_us(h)) + return IRQ_NONE; + spin_lock_irqsave(&h->lock, flags); + while (interrupt_pending(h)) { + raw_tag = get_next_completion(h); + while (raw_tag != FIFO_EMPTY) + raw_tag = next_command(h); + } + spin_unlock_irqrestore(&h->lock, flags); + return IRQ_HANDLED; +} + +static irqreturn_t cciss_msix_discard_completions(int irq, void *dev_id) +{ + ctlr_info_t *h = dev_id; + unsigned long flags; + u32 raw_tag; + + if (ignore_bogus_interrupt(h)) + return IRQ_NONE; + + spin_lock_irqsave(&h->lock, flags); + raw_tag = get_next_completion(h); + while (raw_tag != FIFO_EMPTY) + raw_tag = next_command(h); + spin_unlock_irqrestore(&h->lock, flags); + return IRQ_HANDLED; +} + static irqreturn_t do_cciss_intx(int irq, void *dev_id) { ctlr_info_t *h = dev_id; @@ -4380,7 +4462,6 @@ static __devinit int cciss_message(struct pci_dev *pdev, unsigned char opcode, u return 0; } -#define cciss_soft_reset_controller(p) cciss_message(p, 1, 0) #define cciss_noop(p) cciss_message(p, 3, 0) static int cciss_controller_hard_reset(struct pci_dev *pdev, @@ -4591,13 +4672,17 @@ static __devinit int cciss_kdump_hard_reset_controller(struct pci_dev *pdev) /* Wait for board to become not ready, then ready. 
*/ dev_info(&pdev->dev, "Waiting for board to reset.\n"); rc = cciss_wait_for_board_state(pdev, vaddr, BOARD_NOT_READY); - if (rc) /* Don't bail, might be E500, etc. which can't be reset */ - dev_warn(&pdev->dev, - "failed waiting for board to reset\n"); + if (rc) { + dev_warn(&pdev->dev, "Failed waiting for board to hard reset." + " Will try soft reset.\n"); + rc = -ENOTSUPP; /* Not expected, but try soft reset later */ + goto unmap_cfgtable; + } rc = cciss_wait_for_board_state(pdev, vaddr, BOARD_READY); if (rc) { dev_warn(&pdev->dev, - "failed waiting for board to become ready\n"); + "failed waiting for board to become ready " + "after hard reset\n"); goto unmap_cfgtable; } @@ -4605,16 +4690,13 @@ static __devinit int cciss_kdump_hard_reset_controller(struct pci_dev *pdev) if (rc < 0) goto unmap_cfgtable; if (rc) { - dev_warn(&pdev->dev, "Unable to successfully reset controller," - " Ignoring controller.\n"); - rc = -ENODEV; - goto unmap_cfgtable; + dev_warn(&pdev->dev, "Unable to successfully hard reset " + "controller. Will try soft reset.\n"); + rc = -ENOTSUPP; /* Not expected, but try soft reset later */ } else { - dev_info(&pdev->dev, "board ready.\n"); + dev_info(&pdev->dev, "Board ready after hard reset.\n"); } - dev_info(&pdev->dev, "board ready.\n"); - unmap_cfgtable: iounmap(cfgtable); @@ -4639,7 +4721,7 @@ static __devinit int cciss_init_reset_devices(struct pci_dev *pdev) * due to concerns about shared bbwc between 6402/6404 pair. */ if (rc == -ENOTSUPP) - return 0; /* just try to do the kdump anyhow. */ + return rc; /* just try to do the kdump anyhow. */ if (rc) return -ENODEV; @@ -4745,6 +4827,60 @@ static int cciss_request_irq(ctlr_info_t *h, return -1; } +static int __devinit cciss_kdump_soft_reset(ctlr_info_t *h) +{ + if (cciss_send_reset(h, CTLR_LUNID, CCISS_RESET_TYPE_CONTROLLER)) { + dev_warn(&h->pdev->dev, "Resetting array controller failed.\n"); + return -EIO; + } + + dev_info(&h->pdev->dev, "Waiting for board to soft reset.\n"); + if (cciss_wait_for_board_state(h->pdev, h->vaddr, BOARD_NOT_READY)) { + dev_warn(&h->pdev->dev, "Soft reset had no effect.\n"); + return -1; + } + + dev_info(&h->pdev->dev, "Board reset, awaiting READY status.\n"); + if (cciss_wait_for_board_state(h->pdev, h->vaddr, BOARD_READY)) { + dev_warn(&h->pdev->dev, "Board failed to become ready " + "after soft reset.\n"); + return -1; + } + + return 0; +} + +static void cciss_undo_allocations_after_kdump_soft_reset(ctlr_info_t *h) +{ + int ctlr = h->ctlr; + + free_irq(h->intr[PERF_MODE_INT], h); +#ifdef CONFIG_PCI_MSI + if (h->msix_vector) + pci_disable_msix(h->pdev); + else if (h->msi_vector) + pci_disable_msi(h->pdev); +#endif /* CONFIG_PCI_MSI */ + cciss_free_sg_chain_blocks(h->cmd_sg_list, h->nr_cmds); + cciss_free_scatterlists(h); + cciss_free_cmd_pool(h); + kfree(h->blockFetchTable); + if (h->reply_pool) + pci_free_consistent(h->pdev, h->max_commands * sizeof(__u64), + h->reply_pool, h->reply_pool_dhandle); + if (h->transtable) + iounmap(h->transtable); + if (h->cfgtable) + iounmap(h->cfgtable); + if (h->vaddr) + iounmap(h->vaddr); + unregister_blkdev(h->major, h->devname); + cciss_destroy_hba_sysfs_entry(h); + pci_release_regions(h->pdev); + kfree(h); + hba[ctlr] = NULL; +} + /* * This is it. Find all the controllers and register them. I really hate * stealing all these major device numbers. 
@@ -4756,13 +4892,27 @@ static int __devinit cciss_init_one(struct pci_dev *pdev, int i; int j = 0; int rc; + int try_soft_reset = 0; int dac, return_code; InquiryData_struct *inq_buff; ctlr_info_t *h; + unsigned long flags; rc = cciss_init_reset_devices(pdev); - if (rc) - return rc; + if (rc) { + if (rc != -ENOTSUPP) + return rc; + /* If the reset fails in a particular way (it has no way to do + * a proper hard reset, so returns -ENOTSUPP) we can try to do + * a soft reset once we get the controller configured up to the + * point that it can accept a command. + */ + try_soft_reset = 1; + rc = 0; + } + +reinit_after_soft_reset: + i = alloc_cciss_hba(pdev); if (i < 0) return -1; @@ -4852,6 +5002,62 @@ static int __devinit cciss_init_one(struct pci_dev *pdev, h->gendisk[j] = NULL; } + /* At this point, the controller is ready to take commands. + * Now, if reset_devices and the hard reset didn't work, try + * the soft reset and see if that works. + */ + if (try_soft_reset) { + + /* This is kind of gross. We may or may not get a completion + * from the soft reset command, and if we do, then the value + * from the fifo may or may not be valid. So, we wait 10 secs + * after the reset throwing away any completions we get during + * that time. Unregister the interrupt handler and register + * fake ones to scoop up any residual completions. + */ + spin_lock_irqsave(&h->lock, flags); + h->access.set_intr_mask(h, CCISS_INTR_OFF); + spin_unlock_irqrestore(&h->lock, flags); + free_irq(h->intr[PERF_MODE_INT], h); + rc = cciss_request_irq(h, cciss_msix_discard_completions, + cciss_intx_discard_completions); + if (rc) { + dev_warn(&h->pdev->dev, "Failed to request_irq after " + "soft reset.\n"); + goto clean4; + } + + rc = cciss_kdump_soft_reset(h); + if (rc) { + dev_warn(&h->pdev->dev, "Soft reset failed.\n"); + goto clean4; + } + + dev_info(&h->pdev->dev, "Board READY.\n"); + dev_info(&h->pdev->dev, + "Waiting for stale completions to drain.\n"); + h->access.set_intr_mask(h, CCISS_INTR_ON); + msleep(10000); + h->access.set_intr_mask(h, CCISS_INTR_OFF); + + rc = controller_reset_failed(h->cfgtable); + if (rc) + dev_info(&h->pdev->dev, + "Soft reset appears to have failed.\n"); + + /* since the controller's reset, we have to go back and re-init + * everything. Easiest to just forget what we've done and do it + * all over again. + */ + cciss_undo_allocations_after_kdump_soft_reset(h); + try_soft_reset = 0; + if (rc) + /* don't go to clean4, we already unallocated */ + return -ENODEV; + + goto reinit_after_soft_reset; + } + cciss_scsi_setup(h); /* Turn the interrupts on so we can service requests */ From 93c46c2fa7cfb272c3014327830d6cb30d8486a4 Mon Sep 17 00:00:00 2001 From: "Stephen M. Cameron" Date: Tue, 3 May 2011 14:53:57 -0500 Subject: [PATCH 065/110] cciss: remove superfluous sleeps around reset code Signed-off-by: Stephen M. 
Cameron Signed-off-by: Jens Axboe --- drivers/block/cciss.c | 3 --- 1 file changed, 3 deletions(-) diff --git a/drivers/block/cciss.c b/drivers/block/cciss.c index 23b0ba49300a..7990b98d4f73 100644 --- a/drivers/block/cciss.c +++ b/drivers/block/cciss.c @@ -4477,7 +4477,6 @@ static int cciss_controller_hard_reset(struct pci_dev *pdev, */ dev_info(&pdev->dev, "using doorbell to reset controller\n"); writel(use_doorbell, vaddr + SA5_DOORBELL); - msleep(1000); } else { /* Try to do it the PCI power state way */ /* Quoting from the Open CISS Specification: "The Power @@ -4508,8 +4507,6 @@ static int cciss_controller_hard_reset(struct pci_dev *pdev, pmcsr &= ~PCI_PM_CTRL_STATE_MASK; pmcsr |= PCI_D0; pci_write_config_word(pdev, pos + PCI_PM_CTRL, pmcsr); - - msleep(500); } return 0; } From ec52d5f1cb9a1a0db02143fdcc6004749ea19e0b Mon Sep 17 00:00:00 2001 From: "Stephen M. Cameron" Date: Tue, 3 May 2011 14:54:02 -0500 Subject: [PATCH 066/110] cciss: do not attempt PCI power management reset method if we know it won't work. Just go straight to the soft-reset method instead. Signed-off-by: Stephen M. Cameron Signed-off-by: Jens Axboe --- drivers/block/cciss.c | 36 +++++++++++++++++++++++++++++++----- 1 file changed, 31 insertions(+), 5 deletions(-) diff --git a/drivers/block/cciss.c b/drivers/block/cciss.c index 7990b98d4f73..865484e40841 100644 --- a/drivers/block/cciss.c +++ b/drivers/block/cciss.c @@ -558,7 +558,7 @@ static void __devinit cciss_procinit(ctlr_info_t *h) #define to_hba(n) container_of(n, struct ctlr_info, dev) #define to_drv(n) container_of(n, drive_info_struct, dev) -/* List of controllers which cannot be reset on kexec with reset_devices */ +/* List of controllers which cannot be hard reset on kexec with reset_devices */ static u32 unresettable_controller[] = { 0x324a103C, /* Smart Array P712m */ 0x324b103C, /* SmartArray P711m */ @@ -576,23 +576,45 @@ static u32 unresettable_controller[] = { 0x409D0E11, /* Smart Array 6400 EM */ }; -static int ctlr_is_resettable(struct ctlr_info *h) +/* List of controllers which cannot even be soft reset */ +static u32 soft_unresettable_controller[] = { + 0x409C0E11, /* Smart Array 6400 */ + 0x409D0E11, /* Smart Array 6400 EM */ +}; + +static int ctlr_is_hard_resettable(u32 board_id) { int i; for (i = 0; i < ARRAY_SIZE(unresettable_controller); i++) - if (unresettable_controller[i] == h->board_id) + if (unresettable_controller[i] == board_id) return 0; return 1; } +static int ctlr_is_soft_resettable(u32 board_id) +{ + int i; + + for (i = 0; i < ARRAY_SIZE(soft_unresettable_controller); i++) + if (soft_unresettable_controller[i] == board_id) + return 0; + return 1; +} + +static int ctlr_is_resettable(u32 board_id) +{ + return ctlr_is_hard_resettable(board_id) || + ctlr_is_soft_resettable(board_id); +} + static ssize_t host_show_resettable(struct device *dev, struct device_attribute *attr, char *buf) { struct ctlr_info *h = to_hba(dev); - return snprintf(buf, 20, "%d\n", ctlr_is_resettable(h)); + return snprintf(buf, 20, "%d\n", ctlr_is_resettable(h->board_id)); } static DEVICE_ATTR(resettable, S_IRUGO, host_show_resettable, NULL); @@ -4601,12 +4623,16 @@ static __devinit int cciss_kdump_hard_reset_controller(struct pci_dev *pdev) * likely not be happy. Just forbid resetting this conjoined mess. 
*/ cciss_lookup_board_id(pdev, &board_id); - if (board_id == 0x409C0E11 || board_id == 0x409D0E11) { + if (!ctlr_is_resettable(board_id)) { dev_warn(&pdev->dev, "Cannot reset Smart Array 640x " "due to shared cache module."); return -ENODEV; } + /* if controller is soft- but not hard resettable... */ + if (!ctlr_is_hard_resettable(board_id)) + return -ENOTSUPP; /* try soft reset later. */ + /* Save the PCI command register */ pci_read_config_word(pdev, 4, &command_register); /* Turn the board off. This is so that later pci_restore_state() From 063d2cf72ab6101d2dd69bd6fb503b229be70325 Mon Sep 17 00:00:00 2001 From: "Stephen M. Cameron" Date: Tue, 3 May 2011 14:54:07 -0500 Subject: [PATCH 067/110] cciss: do not use bit 2 doorbell reset It causes NMIs which are undesirable at best, unsurvivable at worst. Prefer the soft reset instead. Signed-off-by: Stephen M. Cameron Signed-off-by: Jens Axboe --- drivers/block/cciss.c | 10 ++++++++-- 1 file changed, 8 insertions(+), 2 deletions(-) diff --git a/drivers/block/cciss.c b/drivers/block/cciss.c index 865484e40841..9528e94dc8b2 100644 --- a/drivers/block/cciss.c +++ b/drivers/block/cciss.c @@ -4673,8 +4673,14 @@ static __devinit int cciss_kdump_hard_reset_controller(struct pci_dev *pdev) use_doorbell = DOORBELL_CTLR_RESET2; } else { use_doorbell = misc_fw_support & MISC_FW_DOORBELL_RESET; - if (use_doorbell) - use_doorbell = DOORBELL_CTLR_RESET; + if (use_doorbell) { + dev_warn(&pdev->dev, "Controller claims that " + "'Bit 2 doorbell reset' is " + "supported, but not 'bit 5 doorbell reset'. " + "Firmware update is recommended.\n"); + rc = -ENOTSUPP; /* use the soft reset */ + goto unmap_cfgtable; + } } rc = cciss_controller_hard_reset(pdev, vaddr, use_doorbell); From 8a4ec67bd5648beb09d7db988a75835b740e950d Mon Sep 17 00:00:00 2001 From: "Stephen M. Cameron" Date: Tue, 3 May 2011 14:54:12 -0500 Subject: [PATCH 068/110] cciss: add cciss_tape_cmds module paramter This is to allow number of commands reserved for use by SCSI tape drives and medium changers to be adjusted at driver load time via the kernel parameter cciss_tape_cmds, with a default value of 6, and a range of 2 - 16 inclusive. Previously, the driver limited the number of commands which could be queued to the SCSI half of the the driver to only 2. This is to fix the problem that if you had more than two tape drives, you couldn't, for example, erase or rewind them all at the same time. Signed-off-by: Stephen M. Cameron Signed-off-by: Jens Axboe --- Documentation/blockdev/cciss.txt | 15 ++++++++++++ drivers/block/cciss.c | 11 ++++++++- drivers/block/cciss_scsi.c | 41 ++++++++++++++++++-------------- drivers/block/cciss_scsi.h | 4 ---- 4 files changed, 48 insertions(+), 23 deletions(-) diff --git a/Documentation/blockdev/cciss.txt b/Documentation/blockdev/cciss.txt index 89698e8df7d4..c00c6a5ab21f 100644 --- a/Documentation/blockdev/cciss.txt +++ b/Documentation/blockdev/cciss.txt @@ -169,3 +169,18 @@ is issued which positions the tape to a known position. Typically you must rewind the tape (by issuing "mt -f /dev/st0 rewind" for example) before i/o can proceed again to a tape drive which was reset. +There is a cciss_tape_cmds module parameter which can be used to make cciss +allocate more commands for use by tape drives. 
Ordinarily only a few commands +(6) are allocated for tape drives because tape drives are slow and +infrequently used and the primary purpose of Smart Array controllers is to +act as a RAID controller for disk drives, so the vast majority of commands +are allocated for disk devices. However, if you have more than a few tape +drives attached to a smart array, the default number of commands may not be +enought (for example, if you have 8 tape drives, you could only rewind 6 +at one time with the default number of commands.) The cciss_tape_cmds module +parameter allows more commands (up to 16 more) to be allocated for use by +tape drives. For example: + + insmod cciss.ko cciss_tape_cmds=16 + +Or, as a kernel boot parameter passed in via grub: cciss.cciss_tape_cmds=8 diff --git a/drivers/block/cciss.c b/drivers/block/cciss.c index 9528e94dc8b2..abe90c973540 100644 --- a/drivers/block/cciss.c +++ b/drivers/block/cciss.c @@ -64,6 +64,10 @@ MODULE_DESCRIPTION("Driver for HP Smart Array Controllers"); MODULE_SUPPORTED_DEVICE("HP Smart Array Controllers"); MODULE_VERSION("3.6.26"); MODULE_LICENSE("GPL"); +static int cciss_tape_cmds = 6; +module_param(cciss_tape_cmds, int, 0644); +MODULE_PARM_DESC(cciss_tape_cmds, + "number of commands to allocate for tape devices (default: 6)"); static DEFINE_MUTEX(cciss_mutex); static struct proc_dir_entry *proc_cciss; @@ -4221,7 +4225,7 @@ static void __devinit cciss_get_max_perf_mode_cmds(struct ctlr_info *h) static void __devinit cciss_find_board_params(ctlr_info_t *h) { cciss_get_max_perf_mode_cmds(h); - h->nr_cmds = h->max_commands - 4; /* Allow room for some ioctls */ + h->nr_cmds = h->max_commands - 4 - cciss_tape_cmds; h->maxsgentries = readl(&(h->cfgtable->MaxSGElements)); /* * Limit in-command s/g elements to 32 save dma'able memory. 
@@ -4959,6 +4963,11 @@ reinit_after_soft_reset: sprintf(h->devname, "cciss%d", i); h->ctlr = i; + if (cciss_tape_cmds < 2) + cciss_tape_cmds = 2; + if (cciss_tape_cmds > 16) + cciss_tape_cmds = 16; + init_completion(&h->scan_wait); if (cciss_create_hba_sysfs_entry(h)) diff --git a/drivers/block/cciss_scsi.c b/drivers/block/cciss_scsi.c index df793803f5ae..696100241a6f 100644 --- a/drivers/block/cciss_scsi.c +++ b/drivers/block/cciss_scsi.c @@ -84,7 +84,6 @@ static struct scsi_host_template cciss_driver_template = { .proc_name = "cciss", .proc_info = cciss_scsi_proc_info, .queuecommand = cciss_scsi_queue_command, - .can_queue = SCSI_CCISS_CAN_QUEUE, .this_id = 7, .cmd_per_lun = 1, .use_clustering = DISABLE_CLUSTERING, @@ -108,16 +107,13 @@ struct cciss_scsi_cmd_stack_elem_t { #pragma pack() -#define CMD_STACK_SIZE (SCSI_CCISS_CAN_QUEUE * \ - CCISS_MAX_SCSI_DEVS_PER_HBA + 2) - // plus two for init time usage - #pragma pack(1) struct cciss_scsi_cmd_stack_t { struct cciss_scsi_cmd_stack_elem_t *pool; - struct cciss_scsi_cmd_stack_elem_t *elem[CMD_STACK_SIZE]; + struct cciss_scsi_cmd_stack_elem_t **elem; dma_addr_t cmd_pool_handle; int top; + int nelems; }; #pragma pack() @@ -191,7 +187,7 @@ scsi_cmd_free(ctlr_info_t *h, CommandList_struct *c) sa = h->scsi_ctlr; stk = &sa->cmd_stack; stk->top++; - if (stk->top >= CMD_STACK_SIZE) { + if (stk->top >= stk->nelems) { dev_err(&h->pdev->dev, "scsi_cmd_free called too many times.\n"); BUG(); @@ -206,13 +202,14 @@ scsi_cmd_stack_setup(ctlr_info_t *h, struct cciss_scsi_adapter_data_t *sa) struct cciss_scsi_cmd_stack_t *stk; size_t size; + stk = &sa->cmd_stack; + stk->nelems = cciss_tape_cmds + 2; sa->cmd_sg_list = cciss_allocate_sg_chain_blocks(h, - h->chainsize, CMD_STACK_SIZE); + h->chainsize, stk->nelems); if (!sa->cmd_sg_list && h->chainsize > 0) return -ENOMEM; - stk = &sa->cmd_stack; - size = sizeof(struct cciss_scsi_cmd_stack_elem_t) * CMD_STACK_SIZE; + size = sizeof(struct cciss_scsi_cmd_stack_elem_t) * stk->nelems; /* Check alignment, see cciss_cmd.h near CommandList_struct def. 
*/ BUILD_BUG_ON((sizeof(*stk->pool) % COMMANDLIST_ALIGNMENT) != 0); @@ -221,18 +218,23 @@ scsi_cmd_stack_setup(ctlr_info_t *h, struct cciss_scsi_adapter_data_t *sa) pci_alloc_consistent(h->pdev, size, &stk->cmd_pool_handle); if (stk->pool == NULL) { - cciss_free_sg_chain_blocks(sa->cmd_sg_list, CMD_STACK_SIZE); + cciss_free_sg_chain_blocks(sa->cmd_sg_list, stk->nelems); sa->cmd_sg_list = NULL; return -ENOMEM; } - - for (i=0; i<CMD_STACK_SIZE; i++) { + stk->elem = kmalloc(sizeof(stk->elem[0]) * stk->nelems, GFP_KERNEL); + if (!stk->elem) { + pci_free_consistent(h->pdev, size, stk->pool, + stk->cmd_pool_handle); + return -1; + } + for (i = 0; i < stk->nelems; i++) { stk->elem[i] = &stk->pool[i]; stk->elem[i]->busaddr = (__u32) (stk->cmd_pool_handle + (sizeof(struct cciss_scsi_cmd_stack_elem_t) * i)); stk->elem[i]->cmdindex = i; } - stk->top = CMD_STACK_SIZE-1; + stk->top = stk->nelems-1; return 0; } @@ -245,16 +247,18 @@ scsi_cmd_stack_free(ctlr_info_t *h) sa = h->scsi_ctlr; stk = &sa->cmd_stack; - if (stk->top != CMD_STACK_SIZE-1) { + if (stk->top != stk->nelems-1) { dev_warn(&h->pdev->dev, "bug: %d scsi commands are still outstanding.\n", - CMD_STACK_SIZE - stk->top); + stk->nelems - stk->top); } - size = sizeof(struct cciss_scsi_cmd_stack_elem_t) * CMD_STACK_SIZE; + size = sizeof(struct cciss_scsi_cmd_stack_elem_t) * stk->nelems; pci_free_consistent(h->pdev, size, stk->pool, stk->cmd_pool_handle); stk->pool = NULL; - cciss_free_sg_chain_blocks(sa->cmd_sg_list, CMD_STACK_SIZE); + cciss_free_sg_chain_blocks(sa->cmd_sg_list, stk->nelems); + kfree(stk->elem); + stk->elem = NULL; } #if 0 @@ -859,6 +863,7 @@ cciss_scsi_detect(ctlr_info_t *h) sh->io_port = 0; // good enough? FIXME, sh->n_io_port = 0; // I don't think we use these two... sh->this_id = SELF_SCSI_ID; + sh->can_queue = cciss_tape_cmds; sh->sg_tablesize = h->maxsgentries; sh->max_cmd_len = MAX_COMMAND_SIZE; diff --git a/drivers/block/cciss_scsi.h b/drivers/block/cciss_scsi.h index 6d5822fe851a..e71d986727ca 100644 --- a/drivers/block/cciss_scsi.h +++ b/drivers/block/cciss_scsi.h @@ -36,13 +36,9 @@ addressible natively, and may in fact turn out to be not scsi at all. */ -#define SCSI_CCISS_CAN_QUEUE 2 /* -Note, cmd_per_lun could give us some trouble, so I'm setting it very low. -Likewise, SCSI_CCISS_CAN_QUEUE is set very conservatively. - If the upper scsi layer tries to track how many commands we have outstanding, it will be operating under the misapprehension that it is the only one sending us requests. We also have the block interface, From edc83d47a9928281ecf6fb709753edb3c58ae7f1 Mon Sep 17 00:00:00 2001 From: Jens Axboe Date: Fri, 6 May 2011 08:27:00 -0600 Subject: [PATCH 069/110] cciss: fix compile issue MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit drivers/block/cciss.c: In function ‘cciss_send_reset’: drivers/block/cciss.c:2515:2: error: implicit declaration of function ‘fill_cmd’ drivers/block/cciss.c: At top level: drivers/block/cciss.c:2531:12: error: conflicting types for ‘fill_cmd’ drivers/block/cciss.c:2534:1: note: an argument type that has a default promotion can’t match an empty parameter name list declaration drivers/block/cciss.c:2515:18: note: previous implicit declaration of ‘fill_cmd’ was here make[1]: *** [drivers/block/cciss.o] Error 1 make: *** [drivers/block/cciss.o] Error 2 Move fill_cmd() to above where it is first used.
Signed-off-by: Jens Axboe --- drivers/block/cciss.c | 50 +++++++++++++++++++++---------------------- 1 file changed, 25 insertions(+), 25 deletions(-) diff --git a/drivers/block/cciss.c b/drivers/block/cciss.c index abe90c973540..8f4ef656a1af 100644 --- a/drivers/block/cciss.c +++ b/drivers/block/cciss.c @@ -2503,31 +2503,6 @@ static int deregister_disk(ctlr_info_t *h, int drv_index, return 0; } -static int __devinit cciss_send_reset(ctlr_info_t *h, unsigned char *scsi3addr, - u8 reset_type) -{ - CommandList_struct *c; - int return_status; - - c = cmd_alloc(h); - if (!c) - return -ENOMEM; - return_status = fill_cmd(h, c, CCISS_RESET_MSG, NULL, 0, 0, - CTLR_LUNID, TYPE_MSG); - c->Request.CDB[1] = reset_type; /* fill_cmd defaults to target reset */ - if (return_status != IO_OK) { - cmd_special_free(h, c); - return return_status; - } - c->waiting = NULL; - enqueue_cmd_and_start_io(h, c); - /* Don't wait for completion, the reset won't complete. Don't free - * the command either. This is the last command we will send before - * re-initializing everything, so it doesn't matter and won't leak. - */ - return 0; -} - static int fill_cmd(ctlr_info_t *h, CommandList_struct *c, __u8 cmd, void *buff, size_t size, __u8 page_code, unsigned char *scsi3addr, int cmd_type) @@ -2668,6 +2643,31 @@ static int fill_cmd(ctlr_info_t *h, CommandList_struct *c, __u8 cmd, void *buff, return status; } +static int __devinit cciss_send_reset(ctlr_info_t *h, unsigned char *scsi3addr, + u8 reset_type) +{ + CommandList_struct *c; + int return_status; + + c = cmd_alloc(h); + if (!c) + return -ENOMEM; + return_status = fill_cmd(h, c, CCISS_RESET_MSG, NULL, 0, 0, + CTLR_LUNID, TYPE_MSG); + c->Request.CDB[1] = reset_type; /* fill_cmd defaults to target reset */ + if (return_status != IO_OK) { + cmd_special_free(h, c); + return return_status; + } + c->waiting = NULL; + enqueue_cmd_and_start_io(h, c); + /* Don't wait for completion, the reset won't complete. Don't free + * the command either. This is the last command we will send before + * re-initializing everything, so it doesn't matter and won't leak. + */ + return 0; +} + static int check_target_status(ctlr_info_t *h, CommandList_struct *c) { switch (c->err_info->ScsiStatus) { From 01f37f2d53e14a05b7fc3601d182f31ac3b35847 Mon Sep 17 00:00:00 2001 From: Konrad Rzeszutek Wilk Date: Wed, 11 May 2011 15:57:09 -0400 Subject: [PATCH 070/110] xen/blkback: Fixed up comments and converted spaces to tabs. Suggested-by: Ian Campbell Signed-off-by: Konrad Rzeszutek Wilk --- drivers/block/xen-blkback/blkback.c | 70 ++++++++++++++++----------- drivers/block/xen-blkback/common.h | 73 +++++++++++++++-------------- drivers/block/xen-blkback/xenbus.c | 39 ++++++++------- 3 files changed, 103 insertions(+), 79 deletions(-) diff --git a/drivers/block/xen-blkback/blkback.c b/drivers/block/xen-blkback/blkback.c index b9bdd9e43ab9..6808cc7d9c73 100644 --- a/drivers/block/xen-blkback/blkback.c +++ b/drivers/block/xen-blkback/blkback.c @@ -73,13 +73,13 @@ module_param(debug_lvl, int, 0644); * response queued for it, with the saved 'id' passed back. 
*/ struct pending_req { - struct blkif_st *blkif; - u64 id; - int nr_pages; - atomic_t pendcnt; - unsigned short operation; - int status; - struct list_head free_list; + struct blkif_st *blkif; + u64 id; + int nr_pages; + atomic_t pendcnt; + unsigned short operation; + int status; + struct list_head free_list; }; #define BLKBACK_INVALID_HANDLE (~0) @@ -103,7 +103,8 @@ static struct xen_blkbk *blkbk; * Little helpful macro to figure out the index and virtual address of the * pending_pages[..]. For each 'pending_req' we have have up to * BLKIF_MAX_SEGMENTS_PER_REQUEST (11) pages. The seg would be from 0 through - * 10 and would index in the pending_pages[..]. */ + * 10 and would index in the pending_pages[..]. + */ static inline int vaddr_pagenr(struct pending_req *req, int seg) { return (req - blkbk->pending_reqs) * @@ -167,8 +168,6 @@ static void free_req(struct pending_req *req) /* * Routines for managing virtual block devices (vbds). */ - - static int vbd_translate(struct phys_req *req, struct blkif_st *blkif, int operation) { @@ -315,7 +314,7 @@ struct seg_buf { /* * Unmap the grant references, and also remove the M2P over-rides * used in the 'pending_req'. -*/ + */ static void xen_blkbk_unmap(struct pending_req *req) { struct gnttab_unmap_grant_ref unmap[BLKIF_MAX_SEGMENTS_PER_REQUEST]; @@ -336,27 +335,32 @@ static void xen_blkbk_unmap(struct pending_req *req) ret = HYPERVISOR_grant_table_op( GNTTABOP_unmap_grant_ref, unmap, invcount); BUG_ON(ret); - /* Note, we use invcount, so nr->pages, so we can't index + /* + * Note, we use invcount, so nr->pages, so we can't index * using vaddr(req, i). */ for (i = 0; i < invcount; i++) { ret = m2p_remove_override( virt_to_page(unmap[i].host_addr), false); if (ret) { - printk(KERN_ALERT "Failed to remove M2P override for " \ - "%lx\n", (unsigned long)unmap[i].host_addr); + printk(KERN_ALERT "Failed to remove M2P override for %lx\n", + (unsigned long)unmap[i].host_addr); continue; } } } -static int xen_blkbk_map(struct blkif_request *req, struct pending_req *pending_req, + +static int xen_blkbk_map(struct blkif_request *req, + struct pending_req *pending_req, struct seg_buf seg[]) { struct gnttab_map_grant_ref map[BLKIF_MAX_SEGMENTS_PER_REQUEST]; int i; int nseg = req->nr_segments; int ret = 0; - /* Fill out preq.nr_sects with proper amount of sectors, and setup + + /* + * Fill out preq.nr_sects with proper amount of sectors, and setup * assign map[..] with the PFN of the page in our domain with the * corresponding grant reference for each page. */ @@ -367,13 +371,15 @@ static int xen_blkbk_map(struct blkif_request *req, struct pending_req *pending_ if (pending_req->operation != BLKIF_OP_READ) flags |= GNTMAP_readonly; gnttab_set_map_op(&map[i], vaddr(pending_req, i), flags, - req->u.rw.seg[i].gref, pending_req->blkif->domid); + req->u.rw.seg[i].gref, + pending_req->blkif->domid); } ret = HYPERVISOR_grant_table_op(GNTTABOP_map_grant_ref, map, nseg); BUG_ON(ret); - /* Now swizzel the MFN in our domain with the MFN from the other domain + /* + * Now swizzle the MFN in our domain with the MFN from the other domain * so that when we access vaddr(pending_req,i) it has the contents of * the page from the other domain. 
*/ @@ -423,7 +429,8 @@ static void __end_block_io_op(struct pending_req *pending_req, int error) pending_req->status = BLKIF_RSP_ERROR; } - /* If all of the bio's have completed it is time to unmap + /* + * If all of the bio's have completed it is time to unmap * the grant references associated with 'request' and provide * the proper response on the ring. */ @@ -510,8 +517,8 @@ static int do_block_io_op(struct blkif_st *blkif) } /* - * Transumation of the 'struct blkif_request' to a proper 'struct bio' - * and call the 'submit_bio' to pass it to the underlaying storage. + * Transmutation of the 'struct blkif_request' to a proper 'struct bio' + * and call the 'submit_bio' to pass it to the underlying storage. */ static int dispatch_rw_block_io(struct blkif_st *blkif, struct blkif_request *req, @@ -538,8 +545,10 @@ static int dispatch_rw_block_io(struct blkif_st *blkif, case BLKIF_OP_FLUSH_DISKCACHE: blkif->st_f_req++; operation = WRITE_FLUSH; - /* The frontend likes to set this to -1, which vbd_translate - * is alergic too. */ + /* + * The frontend likes to set this to -1, which vbd_translate + * is alergic too. + */ req->u.rw.sector_number = 0; break; case BLKIF_OP_WRITE_BARRIER: @@ -585,8 +594,11 @@ static int dispatch_rw_block_io(struct blkif_st *blkif, preq.sector_number + preq.nr_sects, preq.dev); goto fail_response; } - /* This check _MUST_ be done after vbd_translate as the preq.bdev - * is set there. */ + + /* + * This check _MUST_ be done after vbd_translate as the preq.bdev + * is set there. + */ for (i = 0; i < nseg; i++) { if (((int)preq.sector_number|(int)seg[i].nsec) & ((bdev_logical_block_size(preq.bdev) >> 9) - 1)) { @@ -595,7 +607,9 @@ static int dispatch_rw_block_io(struct blkif_st *blkif, goto fail_response; } } - /* If we have failed at this point, we need to undo the M2P override, + + /* + * If we have failed at this point, we need to undo the M2P override, * set gnttab_set_unmap_op on all of the grant references and perform * the hypercall to unmap the grants - that is all done in * xen_blkbk_unmap. @@ -638,8 +652,8 @@ static int dispatch_rw_block_io(struct blkif_st *blkif, bio->bi_end_io = end_block_io_op; } - - /* We set it one so that the last submit_bio does not have to call + /* + * We set it one so that the last submit_bio does not have to call * atomic_inc. */ atomic_set(&pending_req->pendcnt, nbio); diff --git a/drivers/block/xen-blkback/common.h b/drivers/block/xen-blkback/common.h index af93837e1295..e37dcf7f6b8e 100644 --- a/drivers/block/xen-blkback/common.h +++ b/drivers/block/xen-blkback/common.h @@ -47,53 +47,58 @@ __FILE__ , __LINE__ , ## _a) struct vbd { - blkif_vdev_t handle; /* what the domain refers to this vbd as */ - unsigned char readonly; /* Non-zero -> read-only */ - unsigned char type; /* VDISK_xxx */ - u32 pdevice; /* phys device that this vbd maps to */ - struct block_device *bdev; - sector_t size; /* Cached size parameter */ - bool flush_support; + /* What the domain refers to this vbd as. */ + blkif_vdev_t handle; + /* Non-zero -> read-only */ + unsigned char readonly; + /* VDISK_xxx */ + unsigned char type; + /* phys device that this vbd maps to. */ + u32 pdevice; + struct block_device *bdev; + /* Cached size parameter. */ + sector_t size; + bool flush_support; }; struct backend_info; struct blkif_st { /* Unique identifier for this interface. */ - domid_t domid; - unsigned int handle; + domid_t domid; + unsigned int handle; /* Physical parameters of the comms window. */ - unsigned int irq; + unsigned int irq; /* Comms information. 
*/ - enum blkif_protocol blk_protocol; - union blkif_back_rings blk_rings; - struct vm_struct *blk_ring_area; + enum blkif_protocol blk_protocol; + union blkif_back_rings blk_rings; + struct vm_struct *blk_ring_area; /* The VBD attached to this interface. */ - struct vbd vbd; + struct vbd vbd; /* Back pointer to the backend_info. */ - struct backend_info *be; + struct backend_info *be; /* Private fields. */ - spinlock_t blk_ring_lock; - atomic_t refcnt; + spinlock_t blk_ring_lock; + atomic_t refcnt; - wait_queue_head_t wq; + wait_queue_head_t wq; /* One thread per one blkif. */ - struct task_struct *xenblkd; - unsigned int waiting_reqs; + struct task_struct *xenblkd; + unsigned int waiting_reqs; /* statistics */ - unsigned long st_print; - int st_rd_req; - int st_wr_req; - int st_oo_req; - int st_f_req; - int st_rd_sect; - int st_wr_sect; + unsigned long st_print; + int st_rd_req; + int st_wr_req; + int st_oo_req; + int st_f_req; + int st_rd_sect; + int st_wr_sect; - wait_queue_head_t waiting_to_free; + wait_queue_head_t waiting_to_free; - grant_handle_t shmem_handle; - grant_ref_t shmem_ref; + grant_handle_t shmem_handle; + grant_ref_t shmem_ref; }; @@ -109,10 +114,10 @@ struct blkif_st { } while (0) struct phys_req { - unsigned short dev; - unsigned short nr_sects; - struct block_device *bdev; - blkif_sector_t sector_number; + unsigned short dev; + unsigned short nr_sects; + struct block_device *bdev; + blkif_sector_t sector_number; }; int xen_blkif_interface_init(void); diff --git a/drivers/block/xen-blkback/xenbus.c b/drivers/block/xen-blkback/xenbus.c index 9adcf806f83f..0cda406b4edb 100644 --- a/drivers/block/xen-blkback/xenbus.c +++ b/drivers/block/xen-blkback/xenbus.c @@ -27,12 +27,12 @@ __func__, __LINE__, ##args) struct backend_info { - struct xenbus_device *dev; - struct blkif_st *blkif; - struct xenbus_watch backend_watch; - unsigned major; - unsigned minor; - char *mode; + struct xenbus_device *dev; + struct blkif_st *blkif; + struct xenbus_watch backend_watch; + unsigned major; + unsigned minor; + char *mode; }; static struct kmem_cache *xen_blkif_cachep; @@ -425,7 +425,7 @@ int xen_blkbk_flush_diskcache(struct xenbus_transaction xbt, return err; } -/** +/* * Entry point to this code when a new device is created. Allocate the basic * structures, and watch the store waiting for the hotplug scripts to tell us * the device's physical major and minor numbers. Switch to InitWait. @@ -473,7 +473,7 @@ fail: } -/** +/* * Callback received when the hotplug scripts have placed the physical-device * node. Read it and the mode node, and create a vbd. If the frontend is * ready, connect. @@ -495,9 +495,11 @@ static void backend_changed(struct xenbus_watch *watch, err = xenbus_scanf(XBT_NIL, dev->nodename, "physical-device", "%x:%x", &major, &minor); if (XENBUS_EXIST_ERR(err)) { - /* Since this watch will fire once immediately after it is - registered, we expect this. Ignore it, and wait for the - hotplug scripts. */ + /* + * Since this watch will fire once immediately after it is + * registered, we expect this. Ignore it, and wait for the + * hotplug scripts. + */ return; } if (err != 2) { @@ -562,7 +564,7 @@ static void backend_changed(struct xenbus_watch *watch, } -/** +/* * Callback received when the frontend's state changes. 
*/ static void frontend_changed(struct xenbus_device *dev, @@ -584,13 +586,16 @@ static void frontend_changed(struct xenbus_device *dev, case XenbusStateInitialised: case XenbusStateConnected: - /* Ensure we connect even when two watches fire in - close successsion and we miss the intermediate value - of frontend_state. */ + /* + * Ensure we connect even when two watches fire in + * close successsion and we miss the intermediate value + * of frontend_state. + */ if (dev->state == XenbusStateConnected) break; - /* Enforce precondition before potential leak point. + /* + * Enforce precondition before potential leak point. * blkif_disconnect() is idempotent. */ xen_blkif_disconnect(be->blkif); @@ -627,7 +632,7 @@ static void frontend_changed(struct xenbus_device *dev, /* ** Connection ** */ -/** +/* * Write the physical details regarding the block device to the store, and * switch to Connected state. */ From 4352b47ab7918108b389a48d2163c9a4c2aaf139 Mon Sep 17 00:00:00 2001 From: Marek Marczykowski Date: Tue, 3 May 2011 12:04:52 -0400 Subject: [PATCH 071/110] xen-blkfront: fix data size for xenbus_gather in blkfront_connect The barrier variable is an int, not a long, so gathering it with a "%lu" format overwrote adjacent variables: "err" (in the PV code) and "binfo" (in the xenlinux code - drivers/xen/blkfront/blkfront.c). The latter caused incorrect device flags (RO/removable etc). Signed-off-by: Marek Marczykowski Acked-by: Ian Campbell [v1: Changed title] Signed-off-by: Konrad Rzeszutek Wilk --- drivers/block/xen-blkfront.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/block/xen-blkfront.c b/drivers/block/xen-blkfront.c index 9cb8668ff5f4..20759817ad57 100644 --- a/drivers/block/xen-blkfront.c +++ b/drivers/block/xen-blkfront.c @@ -1141,7 +1141,7 @@ static void blkfront_connect(struct blkfront_info *info) } err = xenbus_gather(XBT_NIL, info->xbdev->otherend, - "feature-barrier", "%lu", &barrier, + "feature-barrier", "%d", &barrier, NULL); /* From 6dcfb751c927879399e404b3885cbdef7d8d368b Mon Sep 17 00:00:00 2001 From: Konrad Rzeszutek Wilk Date: Thu, 5 May 2011 12:41:03 -0400 Subject: [PATCH 072/110] xen-blkfront: Provide for 'feature-flush-cache' the BLKIF_OP_WRITE_FLUSH_CACHE operation. The operation BLKIF_OP_WRITE_FLUSH_CACHE has existed in the Xen tree header file for years, but it was never present in the Linux tree because neither the frontend nor the backend supported this interface. Signed-off-by: Konrad Rzeszutek Wilk --- include/xen/interface/io/blkif.h | 13 +++++++++++++ 1 file changed, 13 insertions(+) diff --git a/include/xen/interface/io/blkif.h b/include/xen/interface/io/blkif.h index 61e523af3c46..3d5d6db864fe 100644 --- a/include/xen/interface/io/blkif.h +++ b/include/xen/interface/io/blkif.h @@ -44,6 +44,19 @@ typedef uint64_t blkif_sector_t; */ #define BLKIF_OP_WRITE_BARRIER 2 +/* + * Recognised if "feature-flush-cache" is present in backend xenbus + * info. A flush will ask the underlying storage hardware to flush its + * non-volatile caches as appropriate. The "feature-flush-cache" node + * contains a boolean indicating whether flush requests are likely to + * succeed or fail. Either way, a flush request may fail at any time + * with BLKIF_RSP_EOPNOTSUPP if it is unsupported by the underlying + * block-device hardware. The boolean simply indicates whether or not it + * is worthwhile for the frontend to attempt flushes. If a backend does + * not recognise BLKIF_OP_WRITE_FLUSH_CACHE, it should *not* create the + * "feature-flush-cache" node!
+ */ +#define BLKIF_OP_FLUSH_DISKCACHE 3 /* * Maximum scatter/gather segments per request. * This is carefully chosen so that sizeof(struct blkif_ring) <= PAGE_SIZE. From edf6ef59ec7ee82cadc93528229c4097b6c92a7e Mon Sep 17 00:00:00 2001 From: Konrad Rzeszutek Wilk Date: Tue, 3 May 2011 12:01:11 -0400 Subject: [PATCH 073/110] xen-blkfront: Introduce BLKIF_OP_FLUSH_DISKCACHE support. If the backend supports the 'feature-flush-cache' mode, use that instead of the 'feature-barrier' support. Currently there are three backends that support the 'feature-flush-cache' mode: NetBSD, Solaris and the Linux kernel. The 'flush' option is a much lighter-weight version of the 'barrier' support, so let's try to use it, as there are no filesystems in the kernel that use full barriers anymore. Signed-off-by: Konrad Rzeszutek Wilk --- drivers/block/xen-blkfront.c | 49 ++++++++++++++++++++++---------------- 1 file changed, 36 insertions(+), 13 deletions(-) diff --git a/drivers/block/xen-blkfront.c b/drivers/block/xen-blkfront.c index 20759817ad57..b536a9cef917 100644 --- a/drivers/block/xen-blkfront.c +++ b/drivers/block/xen-blkfront.c @@ -97,6 +97,7 @@ struct blkfront_info struct blk_shadow shadow[BLK_RING_SIZE]; unsigned long shadow_free; unsigned int feature_flush; + unsigned int flush_op; int is_ready; }; @@ -250,8 +251,7 @@ static int blkif_ioctl(struct block_device *bdev, fmode_t mode, /* * Generate a Xen blkfront IO request from a blk layer request. Reads - * and writes are handled as expected. Since we lack a loose flush - * request, we map flushes into a full ordered barrier. + * and writes are handled as expected. * * @req: a request struct */ @@ -293,14 +293,13 @@ static int blkif_queue_request(struct request *req) if (req->cmd_flags & (REQ_FLUSH | REQ_FUA)) { /* - * Ideally we could just do an unordered - * flush-to-disk, but all we have is a full write - * barrier at the moment. However, a barrier write is + * Ideally we can do an unordered flush-to-disk. In case the + * backend onlysupports barriers, use that. A barrier request * a superset of FUA, so we can implement it the same * way. (It's also a FLUSH+FUA, since it is * guaranteed ordered WRT previous writes.) */ - ring_req->operation = BLKIF_OP_WRITE_BARRIER; + ring_req->operation = info->flush_op; } ring_req->nr_segments = blk_rq_map_sg(req->q, req, info->sg); @@ -433,8 +432,11 @@ static int xlvbd_init_blk_queue(struct gendisk *gd, u16 sector_size) static void xlvbd_flush(struct blkfront_info *info) { blk_queue_flush(info->rq, info->feature_flush); - printk(KERN_INFO "blkfront: %s: barriers %s\n", + printk(KERN_INFO "blkfront: %s: %s: %s\n", info->gd->disk_name, + info->flush_op == BLKIF_OP_WRITE_BARRIER ? + "barrier" : (info->flush_op == BLKIF_OP_FLUSH_DISKCACHE ? + "flush diskcache" : "barrier or flush"), info->feature_flush ? "enabled" : "disabled"); } @@ -720,15 +722,20 @@ static irqreturn_t blkif_interrupt(int irq, void *dev_id) error = (bret->status == BLKIF_RSP_OKAY) ? 0 : -EIO; switch (bret->operation) { + case BLKIF_OP_FLUSH_DISKCACHE: case BLKIF_OP_WRITE_BARRIER: if (unlikely(bret->status == BLKIF_RSP_EOPNOTSUPP)) { - printk(KERN_WARNING "blkfront: %s: write barrier op failed\n", + printk(KERN_WARNING "blkfront: %s: write %s op failed\n", + info->flush_op == BLKIF_OP_WRITE_BARRIER ?
+ "barrier" : "flush disk cache", info->gd->disk_name); error = -EOPNOTSUPP; } if (unlikely(bret->status == BLKIF_RSP_ERROR && info->shadow[id].req.nr_segments == 0)) { - printk(KERN_WARNING "blkfront: %s: empty write barrier op failed\n", + printk(KERN_WARNING "blkfront: %s: empty write %s op failed\n", + info->flush_op == BLKIF_OP_WRITE_BARRIER ? + "barrier" : "flush disk cache", info->gd->disk_name); error = -EOPNOTSUPP; } @@ -736,6 +743,7 @@ static irqreturn_t blkif_interrupt(int irq, void *dev_id) if (error == -EOPNOTSUPP) error = 0; info->feature_flush = 0; + info->flush_op = 0; xlvbd_flush(info); } /* fall through */ @@ -1100,7 +1108,7 @@ static void blkfront_connect(struct blkfront_info *info) unsigned long sector_size; unsigned int binfo; int err; - int barrier; + int barrier, flush; switch (info->connected) { case BLKIF_STATE_CONNECTED: @@ -1140,6 +1148,9 @@ static void blkfront_connect(struct blkfront_info *info) return; } + info->feature_flush = 0; + info->flush_op = 0; + err = xenbus_gather(XBT_NIL, info->xbdev->otherend, "feature-barrier", "%d", &barrier, NULL); @@ -1151,11 +1162,23 @@ static void blkfront_connect(struct blkfront_info *info) * * If there are barriers, then we use flush. */ - info->feature_flush = 0; - - if (!err && barrier) + if (!err && barrier) { info->feature_flush = REQ_FLUSH | REQ_FUA; + info->flush_op = BLKIF_OP_WRITE_BARRIER; + } + /* + * And if there is "feature-flush-cache" use that above + * barriers. + */ + err = xenbus_gather(XBT_NIL, info->xbdev->otherend, + "feature-flush-cache", "%d", &flush, + NULL); + if (!err && flush) { + info->feature_flush = REQ_FLUSH; + info->flush_op = BLKIF_OP_FLUSH_DISKCACHE; + } + err = xlvbd_alloc_gendisk(sectors, info, binfo, sector_size); if (err) { xenbus_dev_fatal(info->xbdev, err, "xlvbd_add at %s", From ebe8190659244ec21b5f16950cf7b156f5b7eb01 Mon Sep 17 00:00:00 2001 From: Konrad Rzeszutek Wilk Date: Thu, 12 May 2011 16:42:31 -0400 Subject: [PATCH 074/110] xen/blkback: Change printk/DPRINTK to pr_.. type variant. And also make them uniform and prefix the message with 'xen-blkback'. 
Signed-off-by: Konrad Rzeszutek Wilk --- drivers/block/xen-blkback/blkback.c | 58 ++++++++++++++--------------- drivers/block/xen-blkback/xenbus.c | 19 ++++------ 2 files changed, 37 insertions(+), 40 deletions(-) diff --git a/drivers/block/xen-blkback/blkback.c b/drivers/block/xen-blkback/blkback.c index 6808cc7d9c73..5c9568e39eab 100644 --- a/drivers/block/xen-blkback/blkback.c +++ b/drivers/block/xen-blkback/blkback.c @@ -196,20 +196,20 @@ static void vbd_resize(struct blkif_st *blkif) struct xenbus_device *dev = xen_blkbk_xenbus(blkif->be); unsigned long long new_size = vbd_sz(vbd); - printk(KERN_INFO "VBD Resize: Domid: %d, Device: (%d, %d)\n", + pr_info("xen-blkback: VBD Resize: Domid: %d, Device: (%d, %d)\n", blkif->domid, MAJOR(vbd->pdevice), MINOR(vbd->pdevice)); - printk(KERN_INFO "VBD Resize: new size %llu\n", new_size); + pr_info("xen-blkback: VBD Resize: new size %llu\n", new_size); vbd->size = new_size; again: err = xenbus_transaction_start(&xbt); if (err) { - printk(KERN_WARNING "Error starting transaction"); + pr_warn("xen-blkback: Error starting transaction"); return; } err = xenbus_printf(xbt, dev->nodename, "sectors", "%llu", (unsigned long long)vbd_sz(vbd)); if (err) { - printk(KERN_WARNING "Error writing new size"); + pr_warn("xen-blkback: Error writing new size"); goto abort; } /* @@ -219,7 +219,7 @@ again: */ err = xenbus_printf(xbt, dev->nodename, "state", "%d", dev->state); if (err) { - printk(KERN_WARNING "Error writing the state"); + pr_warn("xen-blkback: Error writing the state"); goto abort; } @@ -227,7 +227,7 @@ again: if (err == -EAGAIN) goto again; if (err) - printk(KERN_WARNING "Error ending transaction"); + pr_warn("xen-blkback: Error ending transaction"); abort: xenbus_transaction_end(xbt, 1); } @@ -253,9 +253,9 @@ irqreturn_t xen_blkif_be_int(int irq, void *dev_id) static void print_stats(struct blkif_st *blkif) { - printk(KERN_DEBUG "%s: oo %3d | rd %4d | wr %4d | f %4d\n", - current->comm, blkif->st_oo_req, - blkif->st_rd_req, blkif->st_wr_req, blkif->st_f_req); + pr_debug("xen-blkback (%s): oo %3d | rd %4d | wr %4d | f %4d\n", + current->comm, blkif->st_oo_req, + blkif->st_rd_req, blkif->st_wr_req, blkif->st_f_req); blkif->st_print = jiffies + msecs_to_jiffies(10 * 1000); blkif->st_rd_req = 0; blkif->st_wr_req = 0; @@ -270,7 +270,7 @@ int xen_blkif_schedule(void *arg) xen_blkif_get(blkif); if (debug_lvl) - printk(KERN_DEBUG "%s: started\n", current->comm); + pr_debug("xen-blkback: %s: started\n", current->comm); while (!kthread_should_stop()) { if (try_to_freeze()) @@ -299,7 +299,7 @@ int xen_blkif_schedule(void *arg) if (log_stats) print_stats(blkif); if (debug_lvl) - printk(KERN_DEBUG "%s: exiting\n", current->comm); + pr_debug("xen-blkback: %s: exiting\n", current->comm); blkif->xenblkd = NULL; xen_blkif_put(blkif); @@ -343,8 +343,8 @@ static void xen_blkbk_unmap(struct pending_req *req) ret = m2p_remove_override( virt_to_page(unmap[i].host_addr), false); if (ret) { - printk(KERN_ALERT "Failed to remove M2P override for %lx\n", - (unsigned long)unmap[i].host_addr); + pr_alert("xen-blkback: Failed to remove M2P override for %lx\n", + (unsigned long)unmap[i].host_addr); continue; } } @@ -385,7 +385,7 @@ static int xen_blkbk_map(struct blkif_request *req, */ for (i = 0; i < nseg; i++) { if (unlikely(map[i].status != 0)) { - DPRINTK("invalid buffer -- could not remap it\n"); + pr_debug("xen-blkback: invalid buffer -- could not remap it\n"); map[i].handle = BLKBACK_INVALID_HANDLE; ret |= 1; } @@ -398,9 +398,8 @@ static int xen_blkbk_map(struct 
blkif_request *req, ret = m2p_add_override(PFN_DOWN(map[i].dev_bus_addr), blkbk->pending_page(pending_req, i), false); if (ret) { - printk(KERN_ALERT "Failed to install M2P override for"\ - " %lx (ret: %d)\n", (unsigned long) - map[i].dev_bus_addr, ret); + pr_alert("xen-blkback: Failed to install M2P override for %lx (ret: %d)\n", + (unsigned long)map[i].dev_bus_addr, ret); /* We could switch over to GNTTABOP_copy */ continue; } @@ -420,12 +419,12 @@ static void __end_block_io_op(struct pending_req *pending_req, int error) /* An error fails the entire request. */ if ((pending_req->operation == BLKIF_OP_FLUSH_DISKCACHE) && (error == -EOPNOTSUPP)) { - DPRINTK("blkback: flush diskcache op failed, not supported\n"); + pr_debug("xen-blkback: flush diskcache op failed, not supported\n"); xen_blkbk_flush_diskcache(XBT_NIL, pending_req->blkif->be, 0); pending_req->status = BLKIF_RSP_EOPNOTSUPP; } else if (error) { - DPRINTK("Buffer not up-to-date at end of operation, " - "error=%d\n", error); + pr_debug("xen-blkback: Buffer not up-to-date at end of operation," + " error=%d\n", error); pending_req->status = BLKIF_RSP_ERROR; } @@ -562,7 +561,8 @@ static int dispatch_rw_block_io(struct blkif_st *blkif, nseg = req->nr_segments; if (unlikely(nseg == 0 && operation != WRITE_FLUSH) || unlikely(nseg > BLKIF_MAX_SEGMENTS_PER_REQUEST)) { - DPRINTK("Bad number of segments in request (%d)\n", nseg); + pr_debug("xen-blkback: Bad number of segments in request (%d)\n", + nseg); /* Haven't submitted any bio's yet. */ goto fail_response; } @@ -588,10 +588,10 @@ static int dispatch_rw_block_io(struct blkif_st *blkif, } if (vbd_translate(&preq, blkif, operation) != 0) { - DPRINTK("access denied: %s of [%llu,%llu] on dev=%04x\n", - operation == READ ? "read" : "write", - preq.sector_number, - preq.sector_number + preq.nr_sects, preq.dev); + pr_debug("xen-blkback: access denied: %s of [%llu,%llu] on dev=%04x\n", + operation == READ ? "read" : "write", + preq.sector_number, + preq.sector_number + preq.nr_sects, preq.dev); goto fail_response; } @@ -602,8 +602,8 @@ static int dispatch_rw_block_io(struct blkif_st *blkif, for (i = 0; i < nseg; i++) { if (((int)preq.sector_number|(int)seg[i].nsec) & ((bdev_logical_block_size(preq.bdev) >> 9) - 1)) { - DPRINTK("Misaligned I/O request from domain %d", - blkif->domid); + pr_debug("xen-blkback: Misaligned I/O request from domain %d", + blkif->domid); goto fail_response; } } @@ -759,7 +759,7 @@ static int __init xen_blkif_init(void) blkbk = kzalloc(sizeof(struct xen_blkbk), GFP_KERNEL); if (!blkbk) { - printk(KERN_ALERT "%s: out of memory!\n", __func__); + pr_alert("xen-blkback: %s: out of memory!\n", __func__); return -ENOMEM; } @@ -807,7 +807,7 @@ static int __init xen_blkif_init(void) return 0; out_of_memory: - printk(KERN_ERR "%s: out of memory\n", __func__); + pr_alert("xen-blkback: %s: out of memory\n", __func__); failed_init: kfree(blkbk->pending_reqs); kfree(blkbk->pending_grant_handles); diff --git a/drivers/block/xen-blkback/xenbus.c b/drivers/block/xen-blkback/xenbus.c index 0cda406b4edb..c86519c477f3 100644 --- a/drivers/block/xen-blkback/xenbus.c +++ b/drivers/block/xen-blkback/xenbus.c @@ -23,7 +23,7 @@ #undef DPRINTK #define DPRINTK(fmt, args...) 
\ - pr_debug("blkback/xenbus (%s:%d) " fmt ".\n", \ + pr_debug("xen-blkback: (%s:%d) " fmt ".\n", \ __func__, __LINE__, ##args) struct backend_info { @@ -136,7 +136,7 @@ static int map_frontend_page(struct blkif_st *blkif, unsigned long shared_page) BUG(); if (op.status) { - DPRINTK(" Grant table operation failure !\n"); + DPRINTK("Grant table operation failure !\n"); return op.status; } @@ -509,10 +509,8 @@ static void backend_changed(struct xenbus_watch *watch, if ((be->major || be->minor) && ((be->major != major) || (be->minor != minor))) { - printk(KERN_WARNING - "blkback: changing physical device (from %x:%x to " - "%x:%x) not supported.\n", be->major, be->minor, - major, minor); + pr_warn("xen-blkback: changing physical device (from %x:%x to %x:%x) not supported.\n", + be->major, be->minor, major, minor); return; } @@ -578,8 +576,8 @@ static void frontend_changed(struct xenbus_device *dev, switch (frontend_state) { case XenbusStateInitialising: if (dev->state == XenbusStateClosed) { - printk(KERN_INFO "%s: %s: prepare for reconnect\n", - __func__, dev->nodename); + pr_info("xen-blkback: %s: prepare for reconnect\n", + dev->nodename); xenbus_switch_state(dev, XenbusStateInitWait); } break; @@ -733,9 +731,8 @@ static int connect_ring(struct backend_info *be) xenbus_dev_fatal(dev, err, "unknown fe protocol %s", protocol); return -1; } - printk(KERN_INFO - "blkback: ring-ref %ld, event-channel %d, protocol %d (%s)\n", - ring_ref, evtchn, be->blkif->blk_protocol, protocol); + pr_info("xen-blkback: ring-ref %ld, event-channel %d, protocol %d (%s)\n", + ring_ref, evtchn, be->blkif->blk_protocol, protocol); /* Map the shared frame, irq etc. */ err = xen_blkif_map(be->blkif, ring_ref, evtchn); From 1afbd730a33c6e4ca780a70351e8929dd4c40636 Mon Sep 17 00:00:00 2001 From: Konrad Rzeszutek Wilk Date: Wed, 11 May 2011 16:15:24 -0400 Subject: [PATCH 075/110] xen/blkback: Make the DPRINTK uniform. Signed-off-by: Konrad Rzeszutek Wilk --- drivers/block/xen-blkback/common.h | 6 +++--- drivers/block/xen-blkback/xenbus.c | 5 ----- 2 files changed, 3 insertions(+), 8 deletions(-) diff --git a/drivers/block/xen-blkback/common.h b/drivers/block/xen-blkback/common.h index e37dcf7f6b8e..46e5d0630440 100644 --- a/drivers/block/xen-blkback/common.h +++ b/drivers/block/xen-blkback/common.h @@ -42,9 +42,9 @@ #include #include -#define DPRINTK(_f, _a...) \ - pr_debug("(file=%s, line=%d) " _f, \ - __FILE__ , __LINE__ , ## _a) +#define DPRINTK(fmt, args...) \ + pr_debug("xen-blkback: (%s:%d) " fmt ".\n", \ + __func__, __LINE__, ##args) struct vbd { /* What the domain refers to this vbd as. */ diff --git a/drivers/block/xen-blkback/xenbus.c b/drivers/block/xen-blkback/xenbus.c index c86519c477f3..fa01dbbee0ad 100644 --- a/drivers/block/xen-blkback/xenbus.c +++ b/drivers/block/xen-blkback/xenbus.c @@ -21,11 +21,6 @@ #include #include "common.h" -#undef DPRINTK -#define DPRINTK(fmt, args...) \ - pr_debug("xen-blkback: (%s:%d) " fmt ".\n", \ - __func__, __LINE__, ##args) - struct backend_info { struct xenbus_device *dev; struct blkif_st *blkif; From 22b20f2dffd09edd66127f2022c26d0039bad88e Mon Sep 17 00:00:00 2001 From: Konrad Rzeszutek Wilk Date: Thu, 12 May 2011 16:43:12 -0400 Subject: [PATCH 076/110] xen/blkback: Use the DRV_PFX in the pr_.. macros. To make it easier to read. 
Signed-off-by: Konrad Rzeszutek Wilk --- drivers/block/xen-blkback/blkback.c | 36 ++++++++++++++--------------- drivers/block/xen-blkback/common.h | 3 ++- drivers/block/xen-blkback/xenbus.c | 6 ++--- 3 files changed, 23 insertions(+), 22 deletions(-) diff --git a/drivers/block/xen-blkback/blkback.c b/drivers/block/xen-blkback/blkback.c index 5c9568e39eab..09fe528dd088 100644 --- a/drivers/block/xen-blkback/blkback.c +++ b/drivers/block/xen-blkback/blkback.c @@ -196,20 +196,20 @@ static void vbd_resize(struct blkif_st *blkif) struct xenbus_device *dev = xen_blkbk_xenbus(blkif->be); unsigned long long new_size = vbd_sz(vbd); - pr_info("xen-blkback: VBD Resize: Domid: %d, Device: (%d, %d)\n", + pr_info(DRV_PFX "VBD Resize: Domid: %d, Device: (%d, %d)\n", blkif->domid, MAJOR(vbd->pdevice), MINOR(vbd->pdevice)); - pr_info("xen-blkback: VBD Resize: new size %llu\n", new_size); + pr_info(DRV_PFX "VBD Resize: new size %llu\n", new_size); vbd->size = new_size; again: err = xenbus_transaction_start(&xbt); if (err) { - pr_warn("xen-blkback: Error starting transaction"); + pr_warn(DRV_PFX "Error starting transaction"); return; } err = xenbus_printf(xbt, dev->nodename, "sectors", "%llu", (unsigned long long)vbd_sz(vbd)); if (err) { - pr_warn("xen-blkback: Error writing new size"); + pr_warn(DRV_PFX "Error writing new size"); goto abort; } /* @@ -219,7 +219,7 @@ again: */ err = xenbus_printf(xbt, dev->nodename, "state", "%d", dev->state); if (err) { - pr_warn("xen-blkback: Error writing the state"); + pr_warn(DRV_PFX "Error writing the state"); goto abort; } @@ -227,7 +227,7 @@ again: if (err == -EAGAIN) goto again; if (err) - pr_warn("xen-blkback: Error ending transaction"); + pr_warn(DRV_PFX "Error ending transaction"); abort: xenbus_transaction_end(xbt, 1); } @@ -270,7 +270,7 @@ int xen_blkif_schedule(void *arg) xen_blkif_get(blkif); if (debug_lvl) - pr_debug("xen-blkback: %s: started\n", current->comm); + pr_debug(DRV_PFX "%s: started\n", current->comm); while (!kthread_should_stop()) { if (try_to_freeze()) @@ -299,7 +299,7 @@ int xen_blkif_schedule(void *arg) if (log_stats) print_stats(blkif); if (debug_lvl) - pr_debug("xen-blkback: %s: exiting\n", current->comm); + pr_debug(DRV_PFX "%s: exiting\n", current->comm); blkif->xenblkd = NULL; xen_blkif_put(blkif); @@ -343,7 +343,7 @@ static void xen_blkbk_unmap(struct pending_req *req) ret = m2p_remove_override( virt_to_page(unmap[i].host_addr), false); if (ret) { - pr_alert("xen-blkback: Failed to remove M2P override for %lx\n", + pr_alert(DRV_PFX "Failed to remove M2P override for %lx\n", (unsigned long)unmap[i].host_addr); continue; } @@ -385,7 +385,7 @@ static int xen_blkbk_map(struct blkif_request *req, */ for (i = 0; i < nseg; i++) { if (unlikely(map[i].status != 0)) { - pr_debug("xen-blkback: invalid buffer -- could not remap it\n"); + pr_debug(DRV_PFX "invalid buffer -- could not remap it\n"); map[i].handle = BLKBACK_INVALID_HANDLE; ret |= 1; } @@ -398,7 +398,7 @@ static int xen_blkbk_map(struct blkif_request *req, ret = m2p_add_override(PFN_DOWN(map[i].dev_bus_addr), blkbk->pending_page(pending_req, i), false); if (ret) { - pr_alert("xen-blkback: Failed to install M2P override for %lx (ret: %d)\n", + pr_alert(DRV_PFX "Failed to install M2P override for %lx (ret: %d)\n", (unsigned long)map[i].dev_bus_addr, ret); /* We could switch over to GNTTABOP_copy */ continue; @@ -419,11 +419,11 @@ static void __end_block_io_op(struct pending_req *pending_req, int error) /* An error fails the entire request. 
*/ if ((pending_req->operation == BLKIF_OP_FLUSH_DISKCACHE) && (error == -EOPNOTSUPP)) { - pr_debug("xen-blkback: flush diskcache op failed, not supported\n"); + pr_debug(DRV_PFX "flush diskcache op failed, not supported\n"); xen_blkbk_flush_diskcache(XBT_NIL, pending_req->blkif->be, 0); pending_req->status = BLKIF_RSP_EOPNOTSUPP; } else if (error) { - pr_debug("xen-blkback: Buffer not up-to-date at end of operation," + pr_debug(DRV_PFX "Buffer not up-to-date at end of operation," " error=%d\n", error); pending_req->status = BLKIF_RSP_ERROR; } @@ -561,7 +561,7 @@ static int dispatch_rw_block_io(struct blkif_st *blkif, nseg = req->nr_segments; if (unlikely(nseg == 0 && operation != WRITE_FLUSH) || unlikely(nseg > BLKIF_MAX_SEGMENTS_PER_REQUEST)) { - pr_debug("xen-blkback: Bad number of segments in request (%d)\n", + pr_debug(DRV_PFX "Bad number of segments in request (%d)\n", nseg); /* Haven't submitted any bio's yet. */ goto fail_response; @@ -588,7 +588,7 @@ static int dispatch_rw_block_io(struct blkif_st *blkif, } if (vbd_translate(&preq, blkif, operation) != 0) { - pr_debug("xen-blkback: access denied: %s of [%llu,%llu] on dev=%04x\n", + pr_debug(DRV_PFX "access denied: %s of [%llu,%llu] on dev=%04x\n", operation == READ ? "read" : "write", preq.sector_number, preq.sector_number + preq.nr_sects, preq.dev); @@ -602,7 +602,7 @@ static int dispatch_rw_block_io(struct blkif_st *blkif, for (i = 0; i < nseg; i++) { if (((int)preq.sector_number|(int)seg[i].nsec) & ((bdev_logical_block_size(preq.bdev) >> 9) - 1)) { - pr_debug("xen-blkback: Misaligned I/O request from domain %d", + pr_debug(DRV_PFX "Misaligned I/O request from domain %d", blkif->domid); goto fail_response; } @@ -759,7 +759,7 @@ static int __init xen_blkif_init(void) blkbk = kzalloc(sizeof(struct xen_blkbk), GFP_KERNEL); if (!blkbk) { - pr_alert("xen-blkback: %s: out of memory!\n", __func__); + pr_alert(DRV_PFX "%s: out of memory!\n", __func__); return -ENOMEM; } @@ -807,7 +807,7 @@ static int __init xen_blkif_init(void) return 0; out_of_memory: - pr_alert("xen-blkback: %s: out of memory\n", __func__); + pr_alert(DRV_PFX "%s: out of memory\n", __func__); failed_init: kfree(blkbk->pending_reqs); kfree(blkbk->pending_grant_handles); diff --git a/drivers/block/xen-blkback/common.h b/drivers/block/xen-blkback/common.h index 46e5d0630440..da96e3eaa641 100644 --- a/drivers/block/xen-blkback/common.h +++ b/drivers/block/xen-blkback/common.h @@ -42,8 +42,9 @@ #include #include +#define DRV_PFX "xen-blkback:" #define DPRINTK(fmt, args...) 
\ - pr_debug("xen-blkback: (%s:%d) " fmt ".\n", \ + pr_debug(DRV_PFX "(%s:%d) " fmt ".\n", \ __func__, __LINE__, ##args) struct vbd { diff --git a/drivers/block/xen-blkback/xenbus.c b/drivers/block/xen-blkback/xenbus.c index fa01dbbee0ad..1c3fa6507e6d 100644 --- a/drivers/block/xen-blkback/xenbus.c +++ b/drivers/block/xen-blkback/xenbus.c @@ -504,7 +504,7 @@ static void backend_changed(struct xenbus_watch *watch, if ((be->major || be->minor) && ((be->major != major) || (be->minor != minor))) { - pr_warn("xen-blkback: changing physical device (from %x:%x to %x:%x) not supported.\n", + pr_warn(DRV_PFX "changing physical device (from %x:%x to %x:%x) not supported.\n", be->major, be->minor, major, minor); return; } @@ -571,7 +571,7 @@ static void frontend_changed(struct xenbus_device *dev, switch (frontend_state) { case XenbusStateInitialising: if (dev->state == XenbusStateClosed) { - pr_info("xen-blkback: %s: prepare for reconnect\n", + pr_info(DRV_PFX "%s: prepare for reconnect\n", dev->nodename); xenbus_switch_state(dev, XenbusStateInitWait); } @@ -726,7 +726,7 @@ static int connect_ring(struct backend_info *be) xenbus_dev_fatal(dev, err, "unknown fe protocol %s", protocol); return -1; } - pr_info("xen-blkback: ring-ref %ld, event-channel %d, protocol %d (%s)\n", + pr_info(DRV_PFX "ring-ref %ld, event-channel %d, protocol %d (%s)\n", ring_ref, evtchn, be->blkif->blk_protocol, protocol); /* Map the shared frame, irq etc. */ From 72468bfcb815bc9875a870973469f68e20c78717 Mon Sep 17 00:00:00 2001 From: Konrad Rzeszutek Wilk Date: Wed, 11 May 2011 16:21:08 -0400 Subject: [PATCH 077/110] xen/blkback: Removing the debug_lvl option. It is not really used for anything. Signed-off-by: Konrad Rzeszutek Wilk --- drivers/block/xen-blkback/blkback.c | 7 ------- 1 file changed, 7 deletions(-) diff --git a/drivers/block/xen-blkback/blkback.c b/drivers/block/xen-blkback/blkback.c index 09fe528dd088..453b51ac737f 100644 --- a/drivers/block/xen-blkback/blkback.c +++ b/drivers/block/xen-blkback/blkback.c @@ -62,9 +62,7 @@ MODULE_PARM_DESC(reqs, "Number of blkback requests to allocate"); /* Run-time switchable: /sys/module/blkback/parameters/ */ static unsigned int log_stats; -static unsigned int debug_lvl; module_param(log_stats, int, 0644); -module_param(debug_lvl, int, 0644); /* * Each outstanding request that we've passed to the lower device layers has a @@ -269,9 +267,6 @@ int xen_blkif_schedule(void *arg) xen_blkif_get(blkif); - if (debug_lvl) - pr_debug(DRV_PFX "%s: started\n", current->comm); - while (!kthread_should_stop()) { if (try_to_freeze()) continue; @@ -298,8 +293,6 @@ int xen_blkif_schedule(void *arg) if (log_stats) print_stats(blkif); - if (debug_lvl) - pr_debug(DRV_PFX "%s: exiting\n", current->comm); blkif->xenblkd = NULL; xen_blkif_put(blkif); From 68c88dd7d3caf1737112238fbe91cccd8e7a69fc Mon Sep 17 00:00:00 2001 From: Konrad Rzeszutek Wilk Date: Wed, 11 May 2011 16:23:39 -0400 Subject: [PATCH 078/110] xen/blkback: Move blkif_get_x86_[32|64]_req to common.h in block/xen-blkback dir. From the blkif.h header, which was exposed to the frontend. 
Signed-off-by: Konrad Rzeszutek Wilk --- drivers/block/xen-blkback/common.h | 32 ++++++++++++++++++++++++++++++ include/xen/blkif.h | 30 ---------------------------- 2 files changed, 32 insertions(+), 30 deletions(-) diff --git a/drivers/block/xen-blkback/common.h b/drivers/block/xen-blkback/common.h index da96e3eaa641..647d974da392 100644 --- a/drivers/block/xen-blkback/common.h +++ b/drivers/block/xen-blkback/common.h @@ -132,4 +132,36 @@ int xen_blkbk_flush_diskcache(struct xenbus_transaction xbt, struct xenbus_device *xen_blkbk_xenbus(struct backend_info *be); +static void inline blkif_get_x86_32_req(struct blkif_request *dst, + struct blkif_x86_32_request *src) +{ + int i, n = BLKIF_MAX_SEGMENTS_PER_REQUEST; + dst->operation = src->operation; + dst->nr_segments = src->nr_segments; + dst->handle = src->handle; + dst->id = src->id; + dst->u.rw.sector_number = src->sector_number; + barrier(); + if (n > dst->nr_segments) + n = dst->nr_segments; + for (i = 0; i < n; i++) + dst->u.rw.seg[i] = src->seg[i]; +} + +static void inline blkif_get_x86_64_req(struct blkif_request *dst, + struct blkif_x86_64_request *src) +{ + int i, n = BLKIF_MAX_SEGMENTS_PER_REQUEST; + dst->operation = src->operation; + dst->nr_segments = src->nr_segments; + dst->handle = src->handle; + dst->id = src->id; + dst->u.rw.sector_number = src->sector_number; + barrier(); + if (n > dst->nr_segments) + n = dst->nr_segments; + for (i = 0; i < n; i++) + dst->u.rw.seg[i] = src->seg[i]; +} + #endif /* __BLKIF__BACKEND__COMMON_H__ */ diff --git a/include/xen/blkif.h b/include/xen/blkif.h index ab794269fc53..6ed7c01253b2 100644 --- a/include/xen/blkif.h +++ b/include/xen/blkif.h @@ -89,34 +89,4 @@ enum blkif_protocol { BLKIF_PROTOCOL_X86_64 = 3, }; -static void inline blkif_get_x86_32_req(struct blkif_request *dst, struct blkif_x86_32_request *src) -{ - int i, n = BLKIF_MAX_SEGMENTS_PER_REQUEST; - dst->operation = src->operation; - dst->nr_segments = src->nr_segments; - dst->handle = src->handle; - dst->id = src->id; - dst->u.rw.sector_number = src->sector_number; - barrier(); - if (n > dst->nr_segments) - n = dst->nr_segments; - for (i = 0; i < n; i++) - dst->u.rw.seg[i] = src->seg[i]; -} - -static void inline blkif_get_x86_64_req(struct blkif_request *dst, struct blkif_x86_64_request *src) -{ - int i, n = BLKIF_MAX_SEGMENTS_PER_REQUEST; - dst->operation = src->operation; - dst->nr_segments = src->nr_segments; - dst->handle = src->handle; - dst->id = src->id; - dst->u.rw.sector_number = src->sector_number; - barrier(); - if (n > dst->nr_segments) - n = dst->nr_segments; - for (i = 0; i < n; i++) - dst->u.rw.seg[i] = src->seg[i]; -} - #endif /* __XEN_BLKIF_H__ */ From b9fc02968c5dd3c0461b4bb126499a17b13fb86e Mon Sep 17 00:00:00 2001 From: Konrad Rzeszutek Wilk Date: Wed, 11 May 2011 16:26:59 -0400 Subject: [PATCH 079/110] xen/blkback: Fix spelling mistakes. 
Signed-off-by: Konrad Rzeszutek Wilk --- drivers/block/xen-blkback/xenbus.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/drivers/block/xen-blkback/xenbus.c b/drivers/block/xen-blkback/xenbus.c index 1c3fa6507e6d..5d2bbf6240c8 100644 --- a/drivers/block/xen-blkback/xenbus.c +++ b/drivers/block/xen-blkback/xenbus.c @@ -351,7 +351,7 @@ static int vbd_create(struct blkif_st *blkif, blkif_vdev_t handle, FMODE_READ : FMODE_WRITE, NULL); if (IS_ERR(bdev)) { - DPRINTK("vbd_creat: device %08x could not be opened.\n", + DPRINTK("vbd_create: device %08x could not be opened.\n", vbd->pdevice); return -ENOENT; } @@ -360,7 +360,7 @@ static int vbd_create(struct blkif_st *blkif, blkif_vdev_t handle, vbd->size = vbd_sz(vbd); if (vbd->bdev->bd_disk == NULL) { - DPRINTK("vbd_creat: device %08x doesn't exist.\n", + DPRINTK("vbd_create: device %08x doesn't exist.\n", vbd->pdevice); vbd_free(vbd); return -ENOENT; From a4c348580e65c95d4b278bb6f154f622df12b893 Mon Sep 17 00:00:00 2001 From: Konrad Rzeszutek Wilk Date: Thu, 12 May 2011 16:10:55 -0400 Subject: [PATCH 080/110] xen/blkback: Flesh out the description in the Kconfig. with more details. Signed-off-by: Konrad Rzeszutek Wilk --- drivers/block/Kconfig | 13 +++++++++++++ 1 file changed, 13 insertions(+) diff --git a/drivers/block/Kconfig b/drivers/block/Kconfig index 9abb64689712..717d6e4e18d3 100644 --- a/drivers/block/Kconfig +++ b/drivers/block/Kconfig @@ -478,6 +478,19 @@ config XEN_BLKDEV_BACKEND block devices to other guests via a high-performance shared-memory interface. + The corresponding Linux frontend driver is enabled by the + CONFIG_XEN_BLKDEV_FRONTEND configuration option. + + The backend driver attaches itself to a any block device specified + in the XenBus configuration. There are no limits to what the block + device as long as it has a major and minor. + + If you are compiling a kernel to run in a Xen block backend driver + domain (often this is domain 0) you should say Y here. To + compile this driver as a module, chose M here: the module + will be called xen-blkback. + + config VIRTIO_BLK tristate "Virtio block driver (EXPERIMENTAL)" depends on EXPERIMENTAL && VIRTIO From 41ca4d388560d2048c7b64ff5ca7dc3bac0d0812 Mon Sep 17 00:00:00 2001 From: Konrad Rzeszutek Wilk Date: Thu, 12 May 2011 16:14:15 -0400 Subject: [PATCH 081/110] xen/blkback: Fix checkpatch.pl warnings about more than 80 lines. Break up the macro usage. 
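For readers who have not looked at include/xen/interface/io/ring.h: DEFINE_RING_TYPES(tag, req_t, rsp_t) generates the shared-page, frontend and backend ring structures for one request/response pair, which is why a single invocation per guest ABI is enough. A rough sketch of the names one invocation provides (simplified; the real macro also emits the producer/consumer indexes and sizing helpers):

    DEFINE_RING_TYPES(blkif_x86_32, struct blkif_x86_32_request,
                      struct blkif_x86_32_response);

    /*
     * ... declares, roughly:
     *   struct blkif_x86_32_sring;       shared page seen by both ends
     *   struct blkif_x86_32_front_ring;  frontend-private ring state
     *   struct blkif_x86_32_back_ring;   backend-private ring state,
     *                                    used in union blkif_back_rings
     */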
Signed-off-by: Konrad Rzeszutek Wilk --- include/xen/blkif.h | 9 ++++++--- 1 file changed, 6 insertions(+), 3 deletions(-) diff --git a/include/xen/blkif.h b/include/xen/blkif.h index 6ed7c01253b2..3c9ca7a93ba4 100644 --- a/include/xen/blkif.h +++ b/include/xen/blkif.h @@ -72,9 +72,12 @@ struct blkif_x86_64_response { typedef struct blkif_x86_64_request blkif_x86_64_request_t; typedef struct blkif_x86_64_response blkif_x86_64_response_t; -DEFINE_RING_TYPES(blkif_common, struct blkif_common_request, struct blkif_common_response); -DEFINE_RING_TYPES(blkif_x86_32, struct blkif_x86_32_request, struct blkif_x86_32_response); -DEFINE_RING_TYPES(blkif_x86_64, struct blkif_x86_64_request, struct blkif_x86_64_response); +DEFINE_RING_TYPES(blkif_common, struct blkif_common_request, + struct blkif_common_response); +DEFINE_RING_TYPES(blkif_x86_32, struct blkif_x86_32_request, + struct blkif_x86_32_response); +DEFINE_RING_TYPES(blkif_x86_64, struct blkif_x86_64_request, + struct blkif_x86_64_response); union blkif_back_rings { struct blkif_back_ring native; From 03e0edf946a08f498788bb6e8ab58453d98f25b9 Mon Sep 17 00:00:00 2001 From: Konrad Rzeszutek Wilk Date: Thu, 12 May 2011 16:19:23 -0400 Subject: [PATCH 082/110] xen/blkback: Checkpatch.pl recommend against multiple assigments. CHECK: multiple assignments should be avoided Signed-off-by: Konrad Rzeszutek Wilk --- drivers/block/xen-blkback/blkback.c | 9 ++++++--- drivers/block/xen-blkback/xenbus.c | 6 ++++-- 2 files changed, 10 insertions(+), 5 deletions(-) diff --git a/drivers/block/xen-blkback/blkback.c b/drivers/block/xen-blkback/blkback.c index 453b51ac737f..362fbf6f656d 100644 --- a/drivers/block/xen-blkback/blkback.c +++ b/drivers/block/xen-blkback/blkback.c @@ -620,10 +620,11 @@ static int dispatch_rw_block_io(struct blkif_st *blkif, seg[i].nsec << 9, seg[i].buf & ~PAGE_MASK) == 0)) { - bio = biolist[nbio++] = bio_alloc(GFP_KERNEL, nseg-i); + bio = bio_alloc(GFP_KERNEL, nseg-i); if (unlikely(bio == NULL)) goto fail_put_bio; + biolist[nbio++] = bio; bio->bi_bdev = preq.bdev; bio->bi_private = pending_req; bio->bi_end_io = end_block_io_op; @@ -636,10 +637,12 @@ static int dispatch_rw_block_io(struct blkif_st *blkif, /* This will be hit if the operation was a flush. 
*/ if (!bio) { BUG_ON(operation != WRITE_FLUSH); - bio = biolist[nbio++] = bio_alloc(GFP_KERNEL, 0); + + bio = bio_alloc(GFP_KERNEL, 0); if (unlikely(bio == NULL)) goto fail_put_bio; + biolist[nbio++] = bio; bio->bi_bdev = preq.bdev; bio->bi_private = pending_req; bio->bi_end_io = end_block_io_op; @@ -677,7 +680,7 @@ static int dispatch_rw_block_io(struct blkif_st *blkif, return -EIO; fail_put_bio: - for (i = 0; i < (nbio-1); i++) + for (i = 0; i < nbio; i++) bio_put(biolist[i]); __end_block_io_op(pending_req, -EINVAL); msleep(1); /* back off a bit */ diff --git a/drivers/block/xen-blkback/xenbus.c b/drivers/block/xen-blkback/xenbus.c index 5d2bbf6240c8..ba8d30662d19 100644 --- a/drivers/block/xen-blkback/xenbus.c +++ b/drivers/block/xen-blkback/xenbus.c @@ -538,7 +538,8 @@ static void backend_changed(struct xenbus_watch *watch, err = vbd_create(be->blkif, handle, major, minor, (NULL == strchr(be->mode, 'w')), cdrom); if (err) { - be->major = be->minor = 0; + be->major = 0; + be->minor = 0; xenbus_dev_fatal(dev, err, "creating vbd structure"); return; } @@ -546,7 +547,8 @@ static void backend_changed(struct xenbus_watch *watch, err = xenvbd_sysfs_addif(dev); if (err) { vbd_free(&be->blkif->vbd); - be->major = be->minor = 0; + be->major = 0; + be->minor = 0; xenbus_dev_fatal(dev, err, "creating sysfs entries"); return; } From b0f801273f7359a7d91fc94f5c6bf216bc17aaa1 Mon Sep 17 00:00:00 2001 From: Konrad Rzeszutek Wilk Date: Thu, 12 May 2011 16:23:06 -0400 Subject: [PATCH 083/110] xen/blkback: Fixing some more of the cleanpatch.pl warnings. Signed-off-by: Konrad Rzeszutek Wilk --- drivers/block/xen-blkback/blkback.c | 2 +- drivers/block/xen-blkback/common.h | 4 ++-- 2 files changed, 3 insertions(+), 3 deletions(-) diff --git a/drivers/block/xen-blkback/blkback.c b/drivers/block/xen-blkback/blkback.c index 362fbf6f656d..d06eb6a50d57 100644 --- a/drivers/block/xen-blkback/blkback.c +++ b/drivers/block/xen-blkback/blkback.c @@ -637,7 +637,7 @@ static int dispatch_rw_block_io(struct blkif_st *blkif, /* This will be hit if the operation was a flush. */ if (!bio) { BUG_ON(operation != WRITE_FLUSH); - + bio = bio_alloc(GFP_KERNEL, 0); if (unlikely(bio == NULL)) goto fail_put_bio; diff --git a/drivers/block/xen-blkback/common.h b/drivers/block/xen-blkback/common.h index 647d974da392..629546558a47 100644 --- a/drivers/block/xen-blkback/common.h +++ b/drivers/block/xen-blkback/common.h @@ -132,7 +132,7 @@ int xen_blkbk_flush_diskcache(struct xenbus_transaction xbt, struct xenbus_device *xen_blkbk_xenbus(struct backend_info *be); -static void inline blkif_get_x86_32_req(struct blkif_request *dst, +static inline void blkif_get_x86_32_req(struct blkif_request *dst, struct blkif_x86_32_request *src) { int i, n = BLKIF_MAX_SEGMENTS_PER_REQUEST; @@ -148,7 +148,7 @@ static void inline blkif_get_x86_32_req(struct blkif_request *dst, dst->u.rw.seg[i] = src->seg[i]; } -static void inline blkif_get_x86_64_req(struct blkif_request *dst, +static inline void blkif_get_x86_64_req(struct blkif_request *dst, struct blkif_x86_64_request *src) { int i, n = BLKIF_MAX_SEGMENTS_PER_REQUEST; From 452a6b2bb6de677acdd2ccb8b39cf6e8fe06f306 Mon Sep 17 00:00:00 2001 From: Konrad Rzeszutek Wilk Date: Thu, 12 May 2011 16:31:51 -0400 Subject: [PATCH 084/110] xen/blkback: Move include/xen/blkif.h into drivers/block/xen-blkback/common.h There is no point in keeping the blkif.h header under include/xen; it is not used by the frontend.
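The reason three request layouts are carried around at all is alignment: the i386 ABI packs the 64-bit 'id' field on a 4-byte boundary, while the x86_64 layout aligns it to 8 bytes, which shifts every later field. A self-contained userspace sketch that makes the difference visible (hypothetical demo code, not part of this patch; the struct bodies are trimmed copies of the definitions being moved here, with the segment array omitted):

    #include <stdint.h>
    #include <stddef.h>
    #include <stdio.h>

    typedef uint16_t blkif_vdev_t;
    typedef uint64_t blkif_sector_t;

    #pragma pack(push, 4)
    struct x86_32_req {                     /* i386 guest view */
            uint8_t        operation;
            uint8_t        nr_segments;
            blkif_vdev_t   handle;
            uint64_t       id;
            blkif_sector_t sector_number;
    };
    #pragma pack(pop)

    struct x86_64_req {                     /* x86_64 guest view */
            uint8_t        operation;
            uint8_t        nr_segments;
            blkif_vdev_t   handle;
            uint64_t __attribute__((__aligned__(8))) id;
            blkif_sector_t sector_number;
    };

    int main(void)
    {
            /* Prints id at offset 4 vs 8 and sector_number at 12 vs 16. */
            printf("x86_32: id %zu, sector_number %zu\n",
                   offsetof(struct x86_32_req, id),
                   offsetof(struct x86_32_req, sector_number));
            printf("x86_64: id %zu, sector_number %zu\n",
                   offsetof(struct x86_64_req, id),
                   offsetof(struct x86_64_req, sector_number));
            return 0;
    }

This is why the backend cannot read a 32-bit guest's requests in place on a 64-bit host and instead converts them with blkif_get_x86_32_req()/blkif_get_x86_64_req().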
Signed-off-by: Konrad Rzeszutek Wilk --- drivers/block/xen-blkback/common.h | 72 +++++++++++++++++++++- include/xen/blkif.h | 95 ------------------------------ 2 files changed, 71 insertions(+), 96 deletions(-) delete mode 100644 include/xen/blkif.h diff --git a/drivers/block/xen-blkback/common.h b/drivers/block/xen-blkback/common.h index 629546558a47..b8856fe2568f 100644 --- a/drivers/block/xen-blkback/common.h +++ b/drivers/block/xen-blkback/common.h @@ -38,15 +38,85 @@ #include #include #include -#include #include #include +#include +#include +#include #define DRV_PFX "xen-blkback:" #define DPRINTK(fmt, args...) \ pr_debug(DRV_PFX "(%s:%d) " fmt ".\n", \ __func__, __LINE__, ##args) + +/* Not a real protocol. Used to generate ring structs which contain + * the elements common to all protocols only. This way we get a + * compiler-checkable way to use common struct elements, so we can + * avoid using switch(protocol) in a number of places. */ +struct blkif_common_request { + char dummy; +}; +struct blkif_common_response { + char dummy; +}; + +/* i386 protocol version */ +#pragma pack(push, 4) +struct blkif_x86_32_request { + uint8_t operation; /* BLKIF_OP_??? */ + uint8_t nr_segments; /* number of segments */ + blkif_vdev_t handle; /* only for read/write requests */ + uint64_t id; /* private guest value, echoed in resp */ + blkif_sector_t sector_number;/* start sector idx on disk (r/w only) */ + struct blkif_request_segment seg[BLKIF_MAX_SEGMENTS_PER_REQUEST]; +}; +struct blkif_x86_32_response { + uint64_t id; /* copied from request */ + uint8_t operation; /* copied from request */ + int16_t status; /* BLKIF_RSP_??? */ +}; +typedef struct blkif_x86_32_request blkif_x86_32_request_t; +typedef struct blkif_x86_32_response blkif_x86_32_response_t; +#pragma pack(pop) + +/* x86_64 protocol version */ +struct blkif_x86_64_request { + uint8_t operation; /* BLKIF_OP_??? */ + uint8_t nr_segments; /* number of segments */ + blkif_vdev_t handle; /* only for read/write requests */ + uint64_t __attribute__((__aligned__(8))) id; + blkif_sector_t sector_number;/* start sector idx on disk (r/w only) */ + struct blkif_request_segment seg[BLKIF_MAX_SEGMENTS_PER_REQUEST]; +}; +struct blkif_x86_64_response { + uint64_t __attribute__((__aligned__(8))) id; + uint8_t operation; /* copied from request */ + int16_t status; /* BLKIF_RSP_??? */ +}; +typedef struct blkif_x86_64_request blkif_x86_64_request_t; +typedef struct blkif_x86_64_response blkif_x86_64_response_t; + +DEFINE_RING_TYPES(blkif_common, struct blkif_common_request, + struct blkif_common_response); +DEFINE_RING_TYPES(blkif_x86_32, struct blkif_x86_32_request, + struct blkif_x86_32_response); +DEFINE_RING_TYPES(blkif_x86_64, struct blkif_x86_64_request, + struct blkif_x86_64_response); + +union blkif_back_rings { + struct blkif_back_ring native; + struct blkif_common_back_ring common; + struct blkif_x86_32_back_ring x86_32; + struct blkif_x86_64_back_ring x86_64; +}; + +enum blkif_protocol { + BLKIF_PROTOCOL_NATIVE = 1, + BLKIF_PROTOCOL_X86_32 = 2, + BLKIF_PROTOCOL_X86_64 = 3, +}; + struct vbd { /* What the domain refers to this vbd as. 
*/ blkif_vdev_t handle; diff --git a/include/xen/blkif.h b/include/xen/blkif.h deleted file mode 100644 index 3c9ca7a93ba4..000000000000 --- a/include/xen/blkif.h +++ /dev/null @@ -1,95 +0,0 @@ -/* - * Permission is hereby granted, free of charge, to any person obtaining a copy - * of this software and associated documentation files (the "Software"), to - * deal in the Software without restriction, including without limitation the - * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or - * sell copies of the Software, and to permit persons to whom the Software is - * furnished to do so, subject to the following conditions: - * - * The above copyright notice and this permission notice shall be included in - * all copies or substantial portions of the Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR - * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE - * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER - * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING - * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER - * DEALINGS IN THE SOFTWARE. - */ - -#ifndef __XEN_BLKIF_H__ -#define __XEN_BLKIF_H__ - -#include -#include -#include - -/* Not a real protocol. Used to generate ring structs which contain - * the elements common to all protocols only. This way we get a - * compiler-checkable way to use common struct elements, so we can - * avoid using switch(protocol) in a number of places. */ -struct blkif_common_request { - char dummy; -}; -struct blkif_common_response { - char dummy; -}; - -/* i386 protocol version */ -#pragma pack(push, 4) -struct blkif_x86_32_request { - uint8_t operation; /* BLKIF_OP_??? */ - uint8_t nr_segments; /* number of segments */ - blkif_vdev_t handle; /* only for read/write requests */ - uint64_t id; /* private guest value, echoed in resp */ - blkif_sector_t sector_number;/* start sector idx on disk (r/w only) */ - struct blkif_request_segment seg[BLKIF_MAX_SEGMENTS_PER_REQUEST]; -}; -struct blkif_x86_32_response { - uint64_t id; /* copied from request */ - uint8_t operation; /* copied from request */ - int16_t status; /* BLKIF_RSP_??? */ -}; -typedef struct blkif_x86_32_request blkif_x86_32_request_t; -typedef struct blkif_x86_32_response blkif_x86_32_response_t; -#pragma pack(pop) - -/* x86_64 protocol version */ -struct blkif_x86_64_request { - uint8_t operation; /* BLKIF_OP_??? */ - uint8_t nr_segments; /* number of segments */ - blkif_vdev_t handle; /* only for read/write requests */ - uint64_t __attribute__((__aligned__(8))) id; - blkif_sector_t sector_number;/* start sector idx on disk (r/w only) */ - struct blkif_request_segment seg[BLKIF_MAX_SEGMENTS_PER_REQUEST]; -}; -struct blkif_x86_64_response { - uint64_t __attribute__((__aligned__(8))) id; - uint8_t operation; /* copied from request */ - int16_t status; /* BLKIF_RSP_??? 
*/ -}; -typedef struct blkif_x86_64_request blkif_x86_64_request_t; -typedef struct blkif_x86_64_response blkif_x86_64_response_t; - -DEFINE_RING_TYPES(blkif_common, struct blkif_common_request, - struct blkif_common_response); -DEFINE_RING_TYPES(blkif_x86_32, struct blkif_x86_32_request, - struct blkif_x86_32_response); -DEFINE_RING_TYPES(blkif_x86_64, struct blkif_x86_64_request, - struct blkif_x86_64_response); - -union blkif_back_rings { - struct blkif_back_ring native; - struct blkif_common_back_ring common; - struct blkif_x86_32_back_ring x86_32; - struct blkif_x86_64_back_ring x86_64; -}; - -enum blkif_protocol { - BLKIF_PROTOCOL_NATIVE = 1, - BLKIF_PROTOCOL_X86_32 = 2, - BLKIF_PROTOCOL_X86_64 = 3, -}; - -#endif /* __XEN_BLKIF_H__ */ From 325a64860472765ecaeaa0081e9ddd67671183d4 Mon Sep 17 00:00:00 2001 From: Konrad Rzeszutek Wilk Date: Thu, 12 May 2011 16:37:04 -0400 Subject: [PATCH 085/110] xen/blkback: Remove the unused typedefs. Signed-off-by: Konrad Rzeszutek Wilk --- drivers/block/xen-blkback/common.h | 4 ---- 1 file changed, 4 deletions(-) diff --git a/drivers/block/xen-blkback/common.h b/drivers/block/xen-blkback/common.h index b8856fe2568f..b0db6aabb5b8 100644 --- a/drivers/block/xen-blkback/common.h +++ b/drivers/block/xen-blkback/common.h @@ -76,8 +76,6 @@ struct blkif_x86_32_response { uint8_t operation; /* copied from request */ int16_t status; /* BLKIF_RSP_??? */ }; -typedef struct blkif_x86_32_request blkif_x86_32_request_t; -typedef struct blkif_x86_32_response blkif_x86_32_response_t; #pragma pack(pop) /* x86_64 protocol version */ @@ -94,8 +92,6 @@ struct blkif_x86_64_response { uint8_t operation; /* copied from request */ int16_t status; /* BLKIF_RSP_??? */ }; -typedef struct blkif_x86_64_request blkif_x86_64_request_t; -typedef struct blkif_x86_64_response blkif_x86_64_response_t; DEFINE_RING_TYPES(blkif_common, struct blkif_common_request, struct blkif_common_response); From 30fd150202fb2d08a62f9c2966a4b1fcf2e861e7 Mon Sep 17 00:00:00 2001 From: Konrad Rzeszutek Wilk Date: Thu, 12 May 2011 16:47:48 -0400 Subject: [PATCH 086/110] xen/blkback: Change structure name blkif_st to xen_blkif. No need for that '_st' and xen_blkif is more apt. Signed-off-by: Konrad Rzeszutek Wilk --- drivers/block/xen-blkback/blkback.c | 28 ++++++++++++++-------------- drivers/block/xen-blkback/common.h | 2 +- drivers/block/xen-blkback/xenbus.c | 24 ++++++++++++------------ 3 files changed, 27 insertions(+), 27 deletions(-) diff --git a/drivers/block/xen-blkback/blkback.c b/drivers/block/xen-blkback/blkback.c index d06eb6a50d57..d438781ecc7c 100644 --- a/drivers/block/xen-blkback/blkback.c +++ b/drivers/block/xen-blkback/blkback.c @@ -71,7 +71,7 @@ module_param(log_stats, int, 0644); * response queued for it, with the saved 'id' passed back. 
*/ struct pending_req { - struct blkif_st *blkif; + struct xen_blkif *blkif; u64 id; int nr_pages; atomic_t pendcnt; @@ -121,11 +121,11 @@ static inline unsigned long vaddr(struct pending_req *req, int seg) (blkbk->pending_grant_handles[vaddr_pagenr(_req, _seg)]) -static int do_block_io_op(struct blkif_st *blkif); -static int dispatch_rw_block_io(struct blkif_st *blkif, +static int do_block_io_op(struct xen_blkif *blkif); +static int dispatch_rw_block_io(struct xen_blkif *blkif, struct blkif_request *req, struct pending_req *pending_req); -static void make_response(struct blkif_st *blkif, u64 id, +static void make_response(struct xen_blkif *blkif, u64 id, unsigned short op, int st); /* @@ -166,7 +166,7 @@ static void free_req(struct pending_req *req) /* * Routines for managing virtual block devices (vbds). */ -static int vbd_translate(struct phys_req *req, struct blkif_st *blkif, +static int vbd_translate(struct phys_req *req, struct xen_blkif *blkif, int operation) { struct vbd *vbd = &blkif->vbd; @@ -186,7 +186,7 @@ static int vbd_translate(struct phys_req *req, struct blkif_st *blkif, return rc; } -static void vbd_resize(struct blkif_st *blkif) +static void vbd_resize(struct xen_blkif *blkif) { struct vbd *vbd = &blkif->vbd; struct xenbus_transaction xbt; @@ -233,7 +233,7 @@ abort: /* * Notification from the guest OS. */ -static void blkif_notify_work(struct blkif_st *blkif) +static void blkif_notify_work(struct xen_blkif *blkif) { blkif->waiting_reqs = 1; wake_up(&blkif->wq); @@ -249,7 +249,7 @@ irqreturn_t xen_blkif_be_int(int irq, void *dev_id) * SCHEDULER FUNCTIONS */ -static void print_stats(struct blkif_st *blkif) +static void print_stats(struct xen_blkif *blkif) { pr_debug("xen-blkback (%s): oo %3d | rd %4d | wr %4d | f %4d\n", current->comm, blkif->st_oo_req, @@ -262,7 +262,7 @@ static void print_stats(struct blkif_st *blkif) int xen_blkif_schedule(void *arg) { - struct blkif_st *blkif = arg; + struct xen_blkif *blkif = arg; struct vbd *vbd = &blkif->vbd; xen_blkif_get(blkif); @@ -451,7 +451,7 @@ static void end_block_io_op(struct bio *bio, int error) * (which has the sectors we want, number of them, grant references, etc), * and transmute it to the block API to hand it over to the proper block disk. */ -static int do_block_io_op(struct blkif_st *blkif) +static int do_block_io_op(struct xen_blkif *blkif) { union blkif_back_rings *blk_rings = &blkif->blk_rings; struct blkif_request req; @@ -512,9 +512,9 @@ static int do_block_io_op(struct blkif_st *blkif) * Transmutation of the 'struct blkif_request' to a proper 'struct bio' * and call the 'submit_bio' to pass it to the underlying storage. */ -static int dispatch_rw_block_io(struct blkif_st *blkif, - struct blkif_request *req, - struct pending_req *pending_req) +static int dispatch_rw_block_io(struct xen_blkif *blkif, + struct blkif_request *req, + struct pending_req *pending_req) { struct phys_req preq; struct seg_buf seg[BLKIF_MAX_SEGMENTS_PER_REQUEST]; @@ -692,7 +692,7 @@ static int dispatch_rw_block_io(struct blkif_st *blkif, /* * Put a response on the ring on how the operation fared. 
*/ -static void make_response(struct blkif_st *blkif, u64 id, +static void make_response(struct xen_blkif *blkif, u64 id, unsigned short op, int st) { struct blkif_response resp; diff --git a/drivers/block/xen-blkback/common.h b/drivers/block/xen-blkback/common.h index b0db6aabb5b8..722c048663d6 100644 --- a/drivers/block/xen-blkback/common.h +++ b/drivers/block/xen-blkback/common.h @@ -130,7 +130,7 @@ struct vbd { struct backend_info; -struct blkif_st { +struct xen_blkif { /* Unique identifier for this interface. */ domid_t domid; unsigned int handle; diff --git a/drivers/block/xen-blkback/xenbus.c b/drivers/block/xen-blkback/xenbus.c index ba8d30662d19..f355f7a5d52d 100644 --- a/drivers/block/xen-blkback/xenbus.c +++ b/drivers/block/xen-blkback/xenbus.c @@ -23,7 +23,7 @@ struct backend_info { struct xenbus_device *dev; - struct blkif_st *blkif; + struct xen_blkif *blkif; struct xenbus_watch backend_watch; unsigned major; unsigned minor; @@ -41,7 +41,7 @@ struct xenbus_device *xen_blkbk_xenbus(struct backend_info *be) return be->dev; } -static int blkback_name(struct blkif_st *blkif, char *buf) +static int blkback_name(struct xen_blkif *blkif, char *buf) { char *devpath, *devname; struct xenbus_device *dev = blkif->be->dev; @@ -62,7 +62,7 @@ static int blkback_name(struct blkif_st *blkif, char *buf) return 0; } -static void xen_update_blkif_status(struct blkif_st *blkif) +static void xen_update_blkif_status(struct xen_blkif *blkif) { int err; char name[TASK_COMM_LEN]; @@ -101,9 +101,9 @@ static void xen_update_blkif_status(struct blkif_st *blkif) } } -static struct blkif_st *xen_blkif_alloc(domid_t domid) +static struct xen_blkif *xen_blkif_alloc(domid_t domid) { - struct blkif_st *blkif; + struct xen_blkif *blkif; blkif = kmem_cache_alloc(xen_blkif_cachep, GFP_KERNEL); if (!blkif) @@ -120,7 +120,7 @@ static struct blkif_st *xen_blkif_alloc(domid_t domid) return blkif; } -static int map_frontend_page(struct blkif_st *blkif, unsigned long shared_page) +static int map_frontend_page(struct xen_blkif *blkif, unsigned long shared_page) { struct gnttab_map_grant_ref op; @@ -141,7 +141,7 @@ static int map_frontend_page(struct blkif_st *blkif, unsigned long shared_page) return 0; } -static void unmap_frontend_page(struct blkif_st *blkif) +static void unmap_frontend_page(struct xen_blkif *blkif) { struct gnttab_unmap_grant_ref op; @@ -152,7 +152,7 @@ static void unmap_frontend_page(struct blkif_st *blkif) BUG(); } -static int xen_blkif_map(struct blkif_st *blkif, unsigned long shared_page, +static int xen_blkif_map(struct xen_blkif *blkif, unsigned long shared_page, unsigned int evtchn) { int err; @@ -211,7 +211,7 @@ static int xen_blkif_map(struct blkif_st *blkif, unsigned long shared_page, return 0; } -static void xen_blkif_disconnect(struct blkif_st *blkif) +static void xen_blkif_disconnect(struct xen_blkif *blkif) { if (blkif->xenblkd) { kthread_stop(blkif->xenblkd); @@ -234,7 +234,7 @@ static void xen_blkif_disconnect(struct blkif_st *blkif) } } -void xen_blkif_free(struct blkif_st *blkif) +void xen_blkif_free(struct xen_blkif *blkif) { if (!atomic_dec_and_test(&blkif->refcnt)) BUG(); @@ -244,7 +244,7 @@ void xen_blkif_free(struct blkif_st *blkif) int __init xen_blkif_interface_init(void) { xen_blkif_cachep = kmem_cache_create("blkif_cache", - sizeof(struct blkif_st), + sizeof(struct xen_blkif), 0, 0, NULL); if (!xen_blkif_cachep) return -ENOMEM; @@ -332,7 +332,7 @@ static void vbd_free(struct vbd *vbd) vbd->bdev = NULL; } -static int vbd_create(struct blkif_st *blkif, blkif_vdev_t handle, 
+static int vbd_create(struct xen_blkif *blkif, blkif_vdev_t handle, unsigned major, unsigned minor, int readonly, int cdrom) { From 3d814731ba67f9514bdf380c1b95dd852ac82a2f Mon Sep 17 00:00:00 2001 From: Konrad Rzeszutek Wilk Date: Thu, 12 May 2011 16:53:56 -0400 Subject: [PATCH 087/110] xen/blkback: Prefix 'vbd' with 'xen' in structs and functions. Signed-off-by: Konrad Rzeszutek Wilk --- drivers/block/xen-blkback/blkback.c | 20 ++++++++--------- drivers/block/xen-blkback/common.h | 4 ++-- drivers/block/xen-blkback/xenbus.c | 34 ++++++++++++++--------------- 3 files changed, 29 insertions(+), 29 deletions(-) diff --git a/drivers/block/xen-blkback/blkback.c b/drivers/block/xen-blkback/blkback.c index d438781ecc7c..1e454a30e4e1 100644 --- a/drivers/block/xen-blkback/blkback.c +++ b/drivers/block/xen-blkback/blkback.c @@ -166,10 +166,10 @@ static void free_req(struct pending_req *req) /* * Routines for managing virtual block devices (vbds). */ -static int vbd_translate(struct phys_req *req, struct xen_blkif *blkif, - int operation) +static int xen_vbd_translate(struct phys_req *req, struct xen_blkif *blkif, + int operation) { - struct vbd *vbd = &blkif->vbd; + struct xen_vbd *vbd = &blkif->vbd; int rc = -EACCES; if ((operation != READ) && vbd->readonly) @@ -186,9 +186,9 @@ static int vbd_translate(struct phys_req *req, struct xen_blkif *blkif, return rc; } -static void vbd_resize(struct xen_blkif *blkif) +static void xen_vbd_resize(struct xen_blkif *blkif) { - struct vbd *vbd = &blkif->vbd; + struct xen_vbd *vbd = &blkif->vbd; struct xenbus_transaction xbt; int err; struct xenbus_device *dev = xen_blkbk_xenbus(blkif->be); @@ -263,7 +263,7 @@ static void print_stats(struct xen_blkif *blkif) int xen_blkif_schedule(void *arg) { struct xen_blkif *blkif = arg; - struct vbd *vbd = &blkif->vbd; + struct xen_vbd *vbd = &blkif->vbd; xen_blkif_get(blkif); @@ -271,7 +271,7 @@ int xen_blkif_schedule(void *arg) if (try_to_freeze()) continue; if (unlikely(vbd->size != vbd_sz(vbd))) - vbd_resize(blkif); + xen_vbd_resize(blkif); wait_event_interruptible( blkif->wq, @@ -538,7 +538,7 @@ static int dispatch_rw_block_io(struct xen_blkif *blkif, blkif->st_f_req++; operation = WRITE_FLUSH; /* - * The frontend likes to set this to -1, which vbd_translate + * The frontend likes to set this to -1, which xen_vbd_translate * is alergic too. */ req->u.rw.sector_number = 0; @@ -580,7 +580,7 @@ static int dispatch_rw_block_io(struct xen_blkif *blkif, } - if (vbd_translate(&preq, blkif, operation) != 0) { + if (xen_vbd_translate(&preq, blkif, operation) != 0) { pr_debug(DRV_PFX "access denied: %s of [%llu,%llu] on dev=%04x\n", operation == READ ? "read" : "write", preq.sector_number, @@ -589,7 +589,7 @@ static int dispatch_rw_block_io(struct xen_blkif *blkif, } /* - * This check _MUST_ be done after vbd_translate as the preq.bdev + * This check _MUST_ be done after xen_vbd_translate as the preq.bdev * is set there. */ for (i = 0; i < nseg; i++) { diff --git a/drivers/block/xen-blkback/common.h b/drivers/block/xen-blkback/common.h index 722c048663d6..1e4ccdeadef3 100644 --- a/drivers/block/xen-blkback/common.h +++ b/drivers/block/xen-blkback/common.h @@ -113,7 +113,7 @@ enum blkif_protocol { BLKIF_PROTOCOL_X86_64 = 3, }; -struct vbd { +struct xen_vbd { /* What the domain refers to this vbd as. */ blkif_vdev_t handle; /* Non-zero -> read-only */ @@ -141,7 +141,7 @@ struct xen_blkif { union blkif_back_rings blk_rings; struct vm_struct *blk_ring_area; /* The VBD attached to this interface. 
*/ - struct vbd vbd; + struct xen_vbd vbd; /* Back pointer to the backend_info. */ struct backend_info *be; /* Private fields. */ diff --git a/drivers/block/xen-blkback/xenbus.c b/drivers/block/xen-blkback/xenbus.c index f355f7a5d52d..e470d8869053 100644 --- a/drivers/block/xen-blkback/xenbus.c +++ b/drivers/block/xen-blkback/xenbus.c @@ -275,7 +275,7 @@ VBD_SHOW(f_req, "%d\n", be->blkif->st_f_req); VBD_SHOW(rd_sect, "%d\n", be->blkif->st_rd_sect); VBD_SHOW(wr_sect, "%d\n", be->blkif->st_wr_sect); -static struct attribute *vbdstat_attrs[] = { +static struct attribute *xen_vbdstat_attrs[] = { &dev_attr_oo_req.attr, &dev_attr_rd_req.attr, &dev_attr_wr_req.attr, @@ -285,9 +285,9 @@ static struct attribute *vbdstat_attrs[] = { NULL }; -static struct attribute_group vbdstat_group = { +static struct attribute_group xen_vbdstat_group = { .name = "statistics", - .attrs = vbdstat_attrs, + .attrs = xen_vbdstat_attrs, }; VBD_SHOW(physical_device, "%x:%x\n", be->major, be->minor); @@ -305,13 +305,13 @@ int xenvbd_sysfs_addif(struct xenbus_device *dev) if (error) goto fail2; - error = sysfs_create_group(&dev->dev.kobj, &vbdstat_group); + error = sysfs_create_group(&dev->dev.kobj, &xen_vbdstat_group); if (error) goto fail3; return 0; -fail3: sysfs_remove_group(&dev->dev.kobj, &vbdstat_group); +fail3: sysfs_remove_group(&dev->dev.kobj, &xen_vbdstat_group); fail2: device_remove_file(&dev->dev, &dev_attr_mode); fail1: device_remove_file(&dev->dev, &dev_attr_physical_device); return error; @@ -319,24 +319,24 @@ fail1: device_remove_file(&dev->dev, &dev_attr_physical_device); void xenvbd_sysfs_delif(struct xenbus_device *dev) { - sysfs_remove_group(&dev->dev.kobj, &vbdstat_group); + sysfs_remove_group(&dev->dev.kobj, &xen_vbdstat_group); device_remove_file(&dev->dev, &dev_attr_mode); device_remove_file(&dev->dev, &dev_attr_physical_device); } -static void vbd_free(struct vbd *vbd) +static void xen_vbd_free(struct xen_vbd *vbd) { if (vbd->bdev) blkdev_put(vbd->bdev, vbd->readonly ? 
FMODE_READ : FMODE_WRITE); vbd->bdev = NULL; } -static int vbd_create(struct xen_blkif *blkif, blkif_vdev_t handle, - unsigned major, unsigned minor, int readonly, - int cdrom) +static int xen_vbd_create(struct xen_blkif *blkif, blkif_vdev_t handle, + unsigned major, unsigned minor, int readonly, + int cdrom) { - struct vbd *vbd; + struct xen_vbd *vbd; struct block_device *bdev; struct request_queue *q; @@ -351,7 +351,7 @@ static int vbd_create(struct xen_blkif *blkif, blkif_vdev_t handle, FMODE_READ : FMODE_WRITE, NULL); if (IS_ERR(bdev)) { - DPRINTK("vbd_create: device %08x could not be opened.\n", + DPRINTK("xen_vbd_create: device %08x could not be opened.\n", vbd->pdevice); return -ENOENT; } @@ -360,9 +360,9 @@ static int vbd_create(struct xen_blkif *blkif, blkif_vdev_t handle, vbd->size = vbd_sz(vbd); if (vbd->bdev->bd_disk == NULL) { - DPRINTK("vbd_create: device %08x doesn't exist.\n", + DPRINTK("xen_vbd_create: device %08x doesn't exist.\n", vbd->pdevice); - vbd_free(vbd); + xen_vbd_free(vbd); return -ENOENT; } @@ -396,7 +396,7 @@ static int xen_blkbk_remove(struct xenbus_device *dev) if (be->blkif) { xen_blkif_disconnect(be->blkif); - vbd_free(&be->blkif->vbd); + xen_vbd_free(&be->blkif->vbd); xen_blkif_free(be->blkif); be->blkif = NULL; } @@ -535,7 +535,7 @@ static void backend_changed(struct xenbus_watch *watch, be->major = major; be->minor = minor; - err = vbd_create(be->blkif, handle, major, minor, + err = xen_vbd_create(be->blkif, handle, major, minor, (NULL == strchr(be->mode, 'w')), cdrom); if (err) { be->major = 0; @@ -546,7 +546,7 @@ static void backend_changed(struct xenbus_watch *watch, err = xenvbd_sysfs_addif(dev); if (err) { - vbd_free(&be->blkif->vbd); + xen_vbd_free(&be->blkif->vbd); be->major = 0; be->minor = 0; xenbus_dev_fatal(dev, err, "creating sysfs entries"); From 5a577e38724226e06337bc8361f492b6bb76b9a5 Mon Sep 17 00:00:00 2001 From: Konrad Rzeszutek Wilk Date: Thu, 12 May 2011 16:58:21 -0400 Subject: [PATCH 088/110] xen/blkback: Add the prefix XEN in the common.h. Signed-off-by: Konrad Rzeszutek Wilk --- drivers/block/xen-blkback/common.h | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/drivers/block/xen-blkback/common.h b/drivers/block/xen-blkback/common.h index 1e4ccdeadef3..9e40b283a468 100644 --- a/drivers/block/xen-blkback/common.h +++ b/drivers/block/xen-blkback/common.h @@ -24,8 +24,8 @@ * IN THE SOFTWARE. */ -#ifndef __BLKIF__BACKEND__COMMON_H__ -#define __BLKIF__BACKEND__COMMON_H__ +#ifndef __XEN_BLKIF__BACKEND__COMMON_H__ +#define __XEN_BLKIF__BACKEND__COMMON_H__ #include #include @@ -230,4 +230,4 @@ static inline void blkif_get_x86_64_req(struct blkif_request *dst, dst->u.rw.seg[i] = src->seg[i]; } -#endif /* __BLKIF__BACKEND__COMMON_H__ */ +#endif /* __XEN_BLKIF__BACKEND__COMMON_H__ */ From cca537af7d6defe8001c2229da738f8a3c934fc0 Mon Sep 17 00:00:00 2001 From: Konrad Rzeszutek Wilk Date: Thu, 12 May 2011 17:23:30 -0400 Subject: [PATCH 089/110] xen/blkback: if log_stats is enabled print out the data. And not depend on the driver being built with -DDEBUG flag. 
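As a rough standalone illustration of the change above (not part of the patch itself), the point is that the statistics printout stays compiled in and is gated only by a run-time flag, rather than by building the driver with -DDEBUG. All names and values below are illustrative.

#include <stdio.h>

static int log_stats = 1;	/* stands in for a run-time tunable flag */

struct stats { int oo, rd, wr, f; };

static void print_stats(const char *name, const struct stats *st)
{
	/* pr_info()-style: always built, emitted only when enabled */
	printf("xen-blkback (%s): oo %3d | rd %4d | wr %4d | f %4d\n",
	       name, st->oo, st->rd, st->wr, st->f);
}

int main(void)
{
	struct stats st = { .oo = 0, .rd = 42, .wr = 17, .f = 3 };

	if (log_stats)		/* checked on every pass, no rebuild needed */
		print_stats("blkback.1.xvda", &st);
	return 0;
}
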
Signed-off-by: Konrad Rzeszutek Wilk --- drivers/block/xen-blkback/blkback.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/block/xen-blkback/blkback.c b/drivers/block/xen-blkback/blkback.c index 1e454a30e4e1..9dee5454740f 100644 --- a/drivers/block/xen-blkback/blkback.c +++ b/drivers/block/xen-blkback/blkback.c @@ -251,7 +251,7 @@ irqreturn_t xen_blkif_be_int(int irq, void *dev_id) static void print_stats(struct xen_blkif *blkif) { - pr_debug("xen-blkback (%s): oo %3d | rd %4d | wr %4d | f %4d\n", + pr_info("xen-blkback (%s): oo %3d | rd %4d | wr %4d | f %4d\n", current->comm, blkif->st_oo_req, blkif->st_rd_req, blkif->st_wr_req, blkif->st_f_req); blkif->st_print = jiffies + msecs_to_jiffies(10 * 1000); From 5185432277ddf5bd91ad5af29cd1945f25ed10fc Mon Sep 17 00:00:00 2001 From: Konrad Rzeszutek Wilk Date: Thu, 12 May 2011 18:02:28 -0400 Subject: [PATCH 090/110] xen/blkback: Align the tabs on the structure. The recent changes caused this field of the structure to be offset a bit. Signed-off-by: Konrad Rzeszutek Wilk --- drivers/block/xen-blkback/xenbus.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/block/xen-blkback/xenbus.c b/drivers/block/xen-blkback/xenbus.c index e470d8869053..34570823355b 100644 --- a/drivers/block/xen-blkback/xenbus.c +++ b/drivers/block/xen-blkback/xenbus.c @@ -23,7 +23,7 @@ struct backend_info { struct xenbus_device *dev; - struct xen_blkif *blkif; + struct xen_blkif *blkif; struct xenbus_watch backend_watch; unsigned major; unsigned minor; From 496b318eb65558c1a3a4fe882cb9da6d1dc6493a Mon Sep 17 00:00:00 2001 From: Laszlo Ersek Date: Fri, 13 May 2011 09:45:40 -0400 Subject: [PATCH 091/110] xen/blkback: fix xenbus_transaction_start() hang caused by double xenbus_transaction_end() vbd_resize() up_read()'s xs_state.suspend_mutex twice in a row via double xenbus_transaction_end() calls. The next down_read() in xenbus_transaction_start() (at eg. the next resize attempt) hangs. Bugzilla: https://bugzilla.redhat.com/show_bug.cgi?id=618317 Acked-by: Jan Beulich Acked-by: Ian Campbell Signed-off-by: Laszlo Ersek Signed-off-by: Konrad Rzeszutek Wilk --- drivers/block/xen-blkback/blkback.c | 1 + 1 file changed, 1 insertion(+) diff --git a/drivers/block/xen-blkback/blkback.c b/drivers/block/xen-blkback/blkback.c index 9dee5454740f..dba55e3a4a86 100644 --- a/drivers/block/xen-blkback/blkback.c +++ b/drivers/block/xen-blkback/blkback.c @@ -226,6 +226,7 @@ again: goto again; if (err) pr_warn(DRV_PFX "Error ending transaction"); + return; abort: xenbus_transaction_end(xbt, 1); } From 8ab521506c4dbb144f0c04c55e3d8bec42c1b2b9 Mon Sep 17 00:00:00 2001 From: Jan Beulich Date: Tue, 17 May 2011 11:07:05 +0100 Subject: [PATCH 092/110] xen/blkback: don't fail empty barrier requests The sector number on empty barrier requests may (will?) be -1, which, given that it's being treated as unsigned 64-bit quantity, will almost always exceed the actual (virtual) disk's size. Inspired by Konrad's "When writting barriers set the sector number to zero...". While at it also add overflow checking to the math in vbd_translate(). 
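A minimal standalone sketch of the validation logic described above, assuming nothing beyond the commit message: zero-length (empty barrier) requests are accepted regardless of their sector number, and the end-of-request arithmetic is checked for 64-bit wrap-around before being compared against the device size. The helper and its names are illustrative, not the kernel code.

#include <stdbool.h>
#include <stdint.h>
#include <stdio.h>

typedef uint64_t blkif_sector_t;

static bool request_in_range(blkif_sector_t sector_number,
			     blkif_sector_t nr_sects,
			     blkif_sector_t vbd_size)
{
	if (nr_sects) {
		blkif_sector_t end = sector_number + nr_sects;

		if (end < sector_number)	/* 64-bit overflow */
			return false;
		if (end > vbd_size)
			return false;
	}
	return true;			/* empty barriers always pass */
}

int main(void)
{
	/* sector number of -1 with zero sectors: the empty-barrier case */
	printf("%d\n", request_in_range((blkif_sector_t)-1, 0, 1 << 20));
	/* wrap-around is rejected instead of sneaking past the size check */
	printf("%d\n", request_in_range((blkif_sector_t)-1, 2, 1 << 20));
	return 0;
}
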
Signed-off-by: Jan Beulich Signed-off-by: Konrad Rzeszutek Wilk --- drivers/block/xen-blkback/blkback.c | 15 ++++++++------- 1 file changed, 8 insertions(+), 7 deletions(-) diff --git a/drivers/block/xen-blkback/blkback.c b/drivers/block/xen-blkback/blkback.c index dba55e3a4a86..c73910cc28c9 100644 --- a/drivers/block/xen-blkback/blkback.c +++ b/drivers/block/xen-blkback/blkback.c @@ -175,8 +175,14 @@ static int xen_vbd_translate(struct phys_req *req, struct xen_blkif *blkif, if ((operation != READ) && vbd->readonly) goto out; - if (unlikely((req->sector_number + req->nr_sects) > vbd_sz(vbd))) - goto out; + if (likely(req->nr_sects)) { + blkif_sector_t end = req->sector_number + req->nr_sects; + + if (unlikely(end < req->sector_number)) + goto out; + if (unlikely(end > vbd_sz(vbd))) + goto out; + } req->dev = vbd->pdevice; req->bdev = vbd->bdev; @@ -538,11 +544,6 @@ static int dispatch_rw_block_io(struct xen_blkif *blkif, case BLKIF_OP_FLUSH_DISKCACHE: blkif->st_f_req++; operation = WRITE_FLUSH; - /* - * The frontend likes to set this to -1, which xen_vbd_translate - * is alergic too. - */ - req->u.rw.sector_number = 0; break; case BLKIF_OP_WRITE_BARRIER: default: From d5431d5273392c6509e9567a9c30ca8b8362d6b2 Mon Sep 17 00:00:00 2001 From: Konrad Rzeszutek Wilk Date: Mon, 28 Feb 2011 17:58:48 -0500 Subject: [PATCH 093/110] xen/p2m/m2p/gnttab: Support GNTMAP_host_map in the M2P override. We only supported the M2P (and P2M) override only for the GNTMAP_contains_pte type mappings. Meaning that we grants operations would "contain the machine address of the PTE to update" If the flag is unset, then the grant operation is "contains a host virtual address". The latter case means that the Hypervisor takes care of updating our page table (specifically the PTE entry) with the guest's MFN. As such we should not try to do anything with the PTE. Previous to this patch we would try to clear the PTE which resulted in Xen hypervisor being upset with us: (XEN) mm.c:1066:d0 Attempt to implicitly unmap a granted PTE c0100000ccc59067 (XEN) domain_crash called from mm.c:1067 (XEN) Domain 0 (vcpu#0) crashed on cpu#3: (XEN) ----[ Xen-4.0-110228 x86_64 debug=y Not tainted ]---- and crashing us. This patch allows us to inhibit the PTE clearing in the PV guest if the GNTMAP_contains_pte is not set. On the m2p_remove_override path we provide the same parameter. Sadly in the grant-table driver we do not have a mechanism to tell m2p_remove_override whether to clear the PTE or not. Since the grant-table driver is used by user-space, we can safely assume that it operates only on PTE's. Hence the implementation for it to work on !GNTMAP_contains_pte returns -EOPNOTSUPP. In the future we can implement the support for this. It will require some extra accounting structure to keep track of the page[i], and the flag. 
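The unmap path for !GNTMAP_contains_pte is left unimplemented above because it would need extra bookkeeping. Purely as an illustration of what such bookkeeping could look like (every name here is hypothetical and not part of the patch), a per-mapping record of the page and its clear_pte flag might be kept on a list at map time and consumed at unmap time:

#include <stdbool.h>
#include <stdio.h>
#include <stdlib.h>

struct page { int id; };		/* stands in for the kernel's struct page */

struct m2p_override_entry {
	struct page *page;		/* the granted page */
	bool clear_pte;			/* was GNTMAP_contains_pte set? */
	struct m2p_override_entry *next;/* kernel code would use a list_head */
};

static struct m2p_override_entry *override_list;

static int record_override(struct page *page, bool clear_pte)
{
	struct m2p_override_entry *e = malloc(sizeof(*e));

	if (!e)
		return -1;
	e->page = page;
	e->clear_pte = clear_pte;
	e->next = override_list;
	override_list = e;
	return 0;
}

static bool lookup_and_forget(struct page *page, bool *clear_pte)
{
	struct m2p_override_entry **pp, *e;

	for (pp = &override_list; (e = *pp) != NULL; pp = &e->next) {
		if (e->page != page)
			continue;
		*clear_pte = e->clear_pte;
		*pp = e->next;		/* unlink and hand the flag back */
		free(e);
		return true;
	}
	return false;
}

int main(void)
{
	struct page pg = { .id = 1 };
	bool clear_pte;

	record_override(&pg, true);
	if (lookup_and_forget(&pg, &clear_pte))
		printf("clear_pte=%d\n", clear_pte);
	return 0;
}

A kernel version would of course need locking and would more likely hang such entries off a hash keyed by mfn or page rather than a single linked list.
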
[v1: Added documentation details, made it return -EOPNOTSUPP instead of trying to do a half-way implementation] Signed-off-by: Konrad Rzeszutek Wilk --- arch/x86/include/asm/xen/page.h | 5 +++-- arch/x86/xen/p2m.c | 10 ++++------ drivers/xen/grant-table.c | 31 ++++++++++++++++++++++++------- 3 files changed, 31 insertions(+), 15 deletions(-) diff --git a/arch/x86/include/asm/xen/page.h b/arch/x86/include/asm/xen/page.h index c61934fbf22a..64a619d47d34 100644 --- a/arch/x86/include/asm/xen/page.h +++ b/arch/x86/include/asm/xen/page.h @@ -47,8 +47,9 @@ extern bool __set_phys_to_machine(unsigned long pfn, unsigned long mfn); extern unsigned long set_phys_range_identity(unsigned long pfn_s, unsigned long pfn_e); -extern int m2p_add_override(unsigned long mfn, struct page *page); -extern int m2p_remove_override(struct page *page); +extern int m2p_add_override(unsigned long mfn, struct page *page, + bool clear_pte); +extern int m2p_remove_override(struct page *page, bool clear_pte); extern struct page *m2p_find_override(unsigned long mfn); extern unsigned long m2p_find_override_pfn(unsigned long mfn, unsigned long pfn); diff --git a/arch/x86/xen/p2m.c b/arch/x86/xen/p2m.c index 141eb0de8b06..2d2b32af3a1d 100644 --- a/arch/x86/xen/p2m.c +++ b/arch/x86/xen/p2m.c @@ -650,7 +650,7 @@ static unsigned long mfn_hash(unsigned long mfn) } /* Add an MFN override for a particular page */ -int m2p_add_override(unsigned long mfn, struct page *page) +int m2p_add_override(unsigned long mfn, struct page *page, bool clear_pte) { unsigned long flags; unsigned long pfn; @@ -662,7 +662,6 @@ int m2p_add_override(unsigned long mfn, struct page *page) if (!PageHighMem(page)) { address = (unsigned long)__va(pfn << PAGE_SHIFT); ptep = lookup_address(address, &level); - if (WARN(ptep == NULL || level != PG_LEVEL_4K, "m2p_add_override: pfn %lx not mapped", pfn)) return -EINVAL; @@ -674,10 +673,9 @@ int m2p_add_override(unsigned long mfn, struct page *page) if (unlikely(!set_phys_to_machine(pfn, FOREIGN_FRAME(mfn)))) return -ENOMEM; - if (!PageHighMem(page)) + if (clear_pte && !PageHighMem(page)) /* Just zap old mapping for now */ pte_clear(&init_mm, address, ptep); - spin_lock_irqsave(&m2p_override_lock, flags); list_add(&page->lru, &m2p_overrides[mfn_hash(mfn)]); spin_unlock_irqrestore(&m2p_override_lock, flags); @@ -685,7 +683,7 @@ int m2p_add_override(unsigned long mfn, struct page *page) return 0; } -int m2p_remove_override(struct page *page) +int m2p_remove_override(struct page *page, bool clear_pte) { unsigned long flags; unsigned long mfn; @@ -713,7 +711,7 @@ int m2p_remove_override(struct page *page) spin_unlock_irqrestore(&m2p_override_lock, flags); set_phys_to_machine(pfn, page->index); - if (!PageHighMem(page)) + if (clear_pte && !PageHighMem(page)) set_pte_at(&init_mm, address, ptep, pfn_pte(pfn, PAGE_KERNEL)); /* No tlb flush necessary because the caller already diff --git a/drivers/xen/grant-table.c b/drivers/xen/grant-table.c index 3745a318defc..fd725cde6ad1 100644 --- a/drivers/xen/grant-table.c +++ b/drivers/xen/grant-table.c @@ -466,13 +466,30 @@ int gnttab_map_refs(struct gnttab_map_grant_ref *map_ops, if (map_ops[i].status) continue; - /* m2p override only supported for GNTMAP_contains_pte mappings */ - if (!(map_ops[i].flags & GNTMAP_contains_pte)) - continue; - pte = (pte_t *) (mfn_to_virt(PFN_DOWN(map_ops[i].host_addr)) + + if (map_ops[i].flags & GNTMAP_contains_pte) { + pte = (pte_t *) (mfn_to_virt(PFN_DOWN(map_ops[i].host_addr)) + (map_ops[i].host_addr & ~PAGE_MASK)); - mfn = pte_mfn(*pte); - ret 
= m2p_add_override(mfn, pages[i]); + mfn = pte_mfn(*pte); + } else { + /* If you really wanted to do this: + * mfn = PFN_DOWN(map_ops[i].dev_bus_addr); + * + * The reason we do not implement it is b/c on the + * unmap path (gnttab_unmap_refs) we have no means of + * checking whether the page is !GNTMAP_contains_pte. + * + * That is without some extra data-structure to carry + * the struct page, bool clear_pte, and list_head next + * tuples and deal with allocation/delallocation, etc. + * + * The users of this API set the GNTMAP_contains_pte + * flag so lets just return not supported until it + * becomes neccessary to implement. + */ + return -EOPNOTSUPP; + } + ret = m2p_add_override(mfn, pages[i], + map_ops[i].flags & GNTMAP_contains_pte); if (ret) return ret; } @@ -494,7 +511,7 @@ int gnttab_unmap_refs(struct gnttab_unmap_grant_ref *unmap_ops, return ret; for (i = 0; i < count; i++) { - ret = m2p_remove_override(pages[i]); + ret = m2p_remove_override(pages[i], true /* clear the PTE */); if (ret) return ret; } From c9ce9e438b2a9faba74a05a71b3dbe169cde783b Mon Sep 17 00:00:00 2001 From: Konrad Rzeszutek Wilk Date: Wed, 20 Apr 2011 11:54:10 -0400 Subject: [PATCH 094/110] xen/p2m: Add EXPORT_SYMBOL_GPL to the M2P override functions. If the backends, which use these two functions, are compiled as a module we need these two functions to be exported. Signed-off-by: Konrad Rzeszutek Wilk --- arch/x86/xen/p2m.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/arch/x86/xen/p2m.c b/arch/x86/xen/p2m.c index 2d2b32af3a1d..c851397e657c 100644 --- a/arch/x86/xen/p2m.c +++ b/arch/x86/xen/p2m.c @@ -682,7 +682,7 @@ int m2p_add_override(unsigned long mfn, struct page *page, bool clear_pte) return 0; } - +EXPORT_SYMBOL_GPL(m2p_add_override); int m2p_remove_override(struct page *page, bool clear_pte) { unsigned long flags; @@ -719,6 +719,7 @@ int m2p_remove_override(struct page *page, bool clear_pte) return 0; } +EXPORT_SYMBOL_GPL(m2p_remove_override); struct page *m2p_find_override(unsigned long mfn) { From 738a84b25cac5af94936e5a1b15cd9909212383c Mon Sep 17 00:00:00 2001 From: Philipp Reisner Date: Thu, 3 Mar 2011 00:21:30 +0100 Subject: [PATCH 095/110] drbd: Fix for application IO with the on-io-error=pass-on policy In case a write failes on the local disk, go into D_INCONSISTENT disk state. That causes future reads of that block to be shipped to the peer. Read retry remote was already in place. Actually the documentation needs to get fixed now. Since the application is still shielded from the error. (as long as we have only a single disk failing) The difference to detach is that we keep the disk. And therefore might keep all the other, still working sectors up to date. 
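A standalone sketch of the pass-on policy as described above, with loosely mirrored names and states (this is not the DRBD code): a failed local write downgrades the disk state instead of detaching, and that state then decides whether reads are served locally or shipped to the peer.

#include <stdio.h>

enum disk_state { D_DISKLESS, D_INCONSISTENT, D_UP_TO_DATE };

struct device {
	enum disk_state disk;
};

static void on_local_io_error(struct device *dev, int forcedetach)
{
	if (!forcedetach) {
		/* pass-on: keep the disk, but stop trusting it for reads */
		dev->disk = D_INCONSISTENT;
		return;
	}
	dev->disk = D_DISKLESS;		/* detach-style policies drop the disk */
}

static const char *read_target(const struct device *dev)
{
	return dev->disk == D_UP_TO_DATE ? "local disk" : "peer";
}

int main(void)
{
	struct device dev = { .disk = D_UP_TO_DATE };

	on_local_io_error(&dev, 0);
	printf("read from: %s\n", read_target(&dev));
	return 0;
}
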
Signed-off-by: Philipp Reisner Signed-off-by: Lars Ellenberg --- drivers/block/drbd/drbd_int.h | 1 + drivers/block/drbd/drbd_main.c | 4 ++++ 2 files changed, 5 insertions(+) diff --git a/drivers/block/drbd/drbd_int.h b/drivers/block/drbd/drbd_int.h index b2699bb2e530..2c38752ca8d6 100644 --- a/drivers/block/drbd/drbd_int.h +++ b/drivers/block/drbd/drbd_int.h @@ -1827,6 +1827,7 @@ static inline void __drbd_chk_io_error_(struct drbd_conf *mdev, int forcedetach, if (!forcedetach) { if (__ratelimit(&drbd_ratelimit_state)) dev_err(DEV, "Local IO failed in %s.\n", where); + _drbd_set_state(_NS(mdev, disk, D_INCONSISTENT), CS_HARD, NULL); break; } /* NOTE fall through to detach case if forcedetach set */ diff --git a/drivers/block/drbd/drbd_main.c b/drivers/block/drbd/drbd_main.c index 5b525c179f39..fd308864833f 100644 --- a/drivers/block/drbd/drbd_main.c +++ b/drivers/block/drbd/drbd_main.c @@ -1565,6 +1565,10 @@ static void after_state_ch(struct drbd_conf *mdev, union drbd_state os, put_ldev(mdev); } + /* Notify peer that I had a local IO error, and did not detached.. */ + if (os.disk == D_UP_TO_DATE && ns.disk == D_INCONSISTENT) + drbd_send_state(mdev); + /* Disks got bigger while they were detached */ if (ns.disk > D_NEGOTIATING && ns.pdsk > D_NEGOTIATING && test_and_clear_bit(RESYNC_AFTER_NEG, &mdev->flags)) { From 600942e0fdb7ed1565d056d7305c46c7c0544a3e Mon Sep 17 00:00:00 2001 From: Lars Ellenberg Date: Thu, 27 Jan 2011 15:24:58 +0100 Subject: [PATCH 096/110] lru_cache.h: fix comments referring to ts_ instead of lc_ For some time we contemplated calling the "struct lru_cache" a "struct tracked_set", and some comments kept the ts_ prefix. Fix those to match the member field names. Signed-off-by: Philipp Reisner Signed-off-by: Lars Ellenberg --- include/linux/lru_cache.h | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) diff --git a/include/linux/lru_cache.h b/include/linux/lru_cache.h index 6a4fab7c6e09..7a71ffad037c 100644 --- a/include/linux/lru_cache.h +++ b/include/linux/lru_cache.h @@ -139,9 +139,9 @@ write intent log information, three of which are mentioned here. * .list is on one of three lists: * in_use: currently in use (refcnt > 0, lc_number != LC_FREE) * lru: unused but ready to be reused or recycled - * (ts_refcnt == 0, lc_number != LC_FREE), + * (lc_refcnt == 0, lc_number != LC_FREE), * free: unused but ready to be recycled - * (ts_refcnt == 0, lc_number == LC_FREE), + * (lc_refcnt == 0, lc_number == LC_FREE), * * an element is said to be "in the active set", * if either on "in_use" or "lru", i.e. lc_number != LC_FREE. @@ -160,8 +160,8 @@ struct lc_element { struct hlist_node colision; struct list_head list; /* LRU list or free list */ unsigned refcnt; - /* back "pointer" into ts_cache->element[index], - * for paranoia, and for "ts_element_to_index" */ + /* back "pointer" into lc_cache->element[index], + * for paranoia, and for "lc_element_to_index" */ unsigned lc_index; /* if we want to track a larger set of objects, * it needs to become arch independend u64 */ @@ -190,8 +190,8 @@ struct lru_cache { /* Arbitrary limit on maximum tracked objects. Practical limit is much * lower due to allocation failures, probably. For typical use cases, * nr_elements should be a few thousand at most. - * This also limits the maximum value of ts_element.ts_index, allowing the - * 8 high bits of .ts_index to be overloaded with flags in the future. 
*/ + * This also limits the maximum value of lc_element.lc_index, allowing the + * 8 high bits of .lc_index to be overloaded with flags in the future. */ #define LC_MAX_ACTIVE (1<<24) /* statistics */ From 53ea433145d9a56c7ad5e69f21f5662053e00e84 Mon Sep 17 00:00:00 2001 From: Lars Ellenberg Date: Tue, 8 Mar 2011 17:11:40 +0100 Subject: [PATCH 097/110] drbd: fix potential distributed deadlock We limit ourselves to a configurable maximum number of pages used as temporary bio pages. If the configured "max_buffers" is not big enough to match the bandwidth of the respective deployment, a distributed deadlock could be triggered by e.g. fast online verify and heavy application IO. TCP connections would block on congestion, because both receivers would wait on pages to become available. Fortunately the respective senders in this case would be able to give back some pages already. So do that. Signed-off-by: Philipp Reisner Signed-off-by: Lars Ellenberg --- drivers/block/drbd/drbd_worker.c | 94 ++++++++++++++++++++------------ 1 file changed, 59 insertions(+), 35 deletions(-) diff --git a/drivers/block/drbd/drbd_worker.c b/drivers/block/drbd/drbd_worker.c index f7e6c92f8d03..b5e53695fd7e 100644 --- a/drivers/block/drbd/drbd_worker.c +++ b/drivers/block/drbd/drbd_worker.c @@ -297,42 +297,48 @@ void drbd_csum_bio(struct drbd_conf *mdev, struct crypto_hash *tfm, struct bio * crypto_hash_final(&desc, digest); } -static int w_e_send_csum(struct drbd_conf *mdev, struct drbd_work *w, int cancel) +/* TODO merge common code with w_e_end_ov_req */ +int w_e_send_csum(struct drbd_conf *mdev, struct drbd_work *w, int cancel) { struct drbd_epoch_entry *e = container_of(w, struct drbd_epoch_entry, w); int digest_size; void *digest; - int ok; + int ok = 1; D_ASSERT(e->block_id == DRBD_MAGIC + 0xbeef); - if (unlikely(cancel)) { + if (unlikely(cancel)) + goto out; + + if (likely((e->flags & EE_WAS_ERROR) != 0)) + goto out; + + digest_size = crypto_hash_digestsize(mdev->csums_tfm); + digest = kmalloc(digest_size, GFP_NOIO); + if (digest) { + sector_t sector = e->sector; + unsigned int size = e->size; + drbd_csum_ee(mdev, mdev->csums_tfm, e, digest); + /* Free e and pages before send. + * In case we block on congestion, we could otherwise run into + * some distributed deadlock, if the other side blocks on + * congestion as well, because our receiver blocks in + * drbd_pp_alloc due to pp_in_use > max_buffers. 
*/ drbd_free_ee(mdev, e); - return 1; + e = NULL; + inc_rs_pending(mdev); + ok = drbd_send_drequest_csum(mdev, sector, size, + digest, digest_size, + P_CSUM_RS_REQUEST); + kfree(digest); + } else { + dev_err(DEV, "kmalloc() of digest failed.\n"); + ok = 0; } - if (likely((e->flags & EE_WAS_ERROR) == 0)) { - digest_size = crypto_hash_digestsize(mdev->csums_tfm); - digest = kmalloc(digest_size, GFP_NOIO); - if (digest) { - drbd_csum_ee(mdev, mdev->csums_tfm, e, digest); - - inc_rs_pending(mdev); - ok = drbd_send_drequest_csum(mdev, - e->sector, - e->size, - digest, - digest_size, - P_CSUM_RS_REQUEST); - kfree(digest); - } else { - dev_err(DEV, "kmalloc() of digest failed.\n"); - ok = 0; - } - } else - ok = 1; - - drbd_free_ee(mdev, e); +out: + if (e) + drbd_free_ee(mdev, e); if (unlikely(!ok)) dev_err(DEV, "drbd_send_drequest(..., csum) failed\n"); @@ -1071,9 +1077,12 @@ int w_e_end_csum_rs_req(struct drbd_conf *mdev, struct drbd_work *w, int cancel) return ok; } +/* TODO merge common code with w_e_send_csum */ int w_e_end_ov_req(struct drbd_conf *mdev, struct drbd_work *w, int cancel) { struct drbd_epoch_entry *e = container_of(w, struct drbd_epoch_entry, w); + sector_t sector = e->sector; + unsigned int size = e->size; int digest_size; void *digest; int ok = 1; @@ -1093,17 +1102,25 @@ int w_e_end_ov_req(struct drbd_conf *mdev, struct drbd_work *w, int cancel) else memset(digest, 0, digest_size); + /* Free e and pages before send. + * In case we block on congestion, we could otherwise run into + * some distributed deadlock, if the other side blocks on + * congestion as well, because our receiver blocks in + * drbd_pp_alloc due to pp_in_use > max_buffers. */ + drbd_free_ee(mdev, e); + e = NULL; inc_rs_pending(mdev); - ok = drbd_send_drequest_csum(mdev, e->sector, e->size, - digest, digest_size, P_OV_REPLY); + ok = drbd_send_drequest_csum(mdev, sector, size, + digest, digest_size, + P_OV_REPLY); if (!ok) dec_rs_pending(mdev); kfree(digest); out: - drbd_free_ee(mdev, e); + if (e) + drbd_free_ee(mdev, e); dec_unacked(mdev); - return ok; } @@ -1122,8 +1139,10 @@ int w_e_end_ov_reply(struct drbd_conf *mdev, struct drbd_work *w, int cancel) { struct drbd_epoch_entry *e = container_of(w, struct drbd_epoch_entry, w); struct digest_info *di; - int digest_size; void *digest; + sector_t sector = e->sector; + unsigned int size = e->size; + int digest_size; int ok, eq = 0; if (unlikely(cancel)) { @@ -1153,16 +1172,21 @@ int w_e_end_ov_reply(struct drbd_conf *mdev, struct drbd_work *w, int cancel) } } - dec_unacked(mdev); + /* Free e and pages before send. + * In case we block on congestion, we could otherwise run into + * some distributed deadlock, if the other side blocks on + * congestion as well, because our receiver blocks in + * drbd_pp_alloc due to pp_in_use > max_buffers. */ + drbd_free_ee(mdev, e); if (!eq) - drbd_ov_oos_found(mdev, e->sector, e->size); + drbd_ov_oos_found(mdev, sector, size); else ov_oos_print(mdev); - ok = drbd_send_ack_ex(mdev, P_OV_RESULT, e->sector, e->size, + ok = drbd_send_ack_ex(mdev, P_OV_RESULT, sector, size, eq ? ID_IN_SYNC : ID_OUT_OF_SYNC); - drbd_free_ee(mdev, e); + dec_unacked(mdev); --mdev->ov_left; From f36af18c7b4ea1ba333c09b606bb4a7e5af66b4d Mon Sep 17 00:00:00 2001 From: Lars Ellenberg Date: Wed, 9 Mar 2011 22:44:55 +0100 Subject: [PATCH 098/110] drbd: fix disconnect/reconnect loop, if ping-timeout == ping-int If there is no replication traffic within the idle timeout (ping-int seconds), DRBD will send a P_PING, and adjust the timeout to ping-timeout. 
If there is no P_PING_ACK received within this ping-timeout, DRBD finally drops the connection, and tries to re-establish it. To decide which timeout was active, we compared the current timeout with the ping-timeout, and dropped the connection, if that was the case. By default, ping-int is 10 seconds, ping-timeout is 500 ms. Unfortunately, if you configure ping-timeout to be the same as ping-int, expiry of the idle-timeout had been mistaken for a missing ping ack, and caused an immediate reconnection attempt. Fix: Allow both timeouts to be equal, use a local variable to store which timeout is active. Signed-off-by: Philipp Reisner Signed-off-by: Lars Ellenberg --- drivers/block/drbd/drbd_receiver.c | 10 ++++++++-- 1 file changed, 8 insertions(+), 2 deletions(-) diff --git a/drivers/block/drbd/drbd_receiver.c b/drivers/block/drbd/drbd_receiver.c index fd26666c0b08..0b17d426c32b 100644 --- a/drivers/block/drbd/drbd_receiver.c +++ b/drivers/block/drbd/drbd_receiver.c @@ -4554,6 +4554,7 @@ int drbd_asender(struct drbd_thread *thi) int received = 0; int expect = sizeof(struct p_header80); int empty; + int ping_timeout_active = 0; sprintf(current->comm, "drbd%d_asender", mdev_to_minor(mdev)); @@ -4566,6 +4567,7 @@ int drbd_asender(struct drbd_thread *thi) ERR_IF(!drbd_send_ping(mdev)) goto reconnect; mdev->meta.socket->sk->sk_rcvtimeo = mdev->net_conf->ping_timeo*HZ/10; + ping_timeout_active = 1; } /* conditionally cork; @@ -4620,8 +4622,7 @@ int drbd_asender(struct drbd_thread *thi) dev_err(DEV, "meta connection shut down by peer.\n"); goto reconnect; } else if (rv == -EAGAIN) { - if (mdev->meta.socket->sk->sk_rcvtimeo == - mdev->net_conf->ping_timeo*HZ/10) { + if (ping_timeout_active) { dev_err(DEV, "PingAck did not arrive in time.\n"); goto reconnect; } @@ -4660,6 +4661,11 @@ int drbd_asender(struct drbd_thread *thi) if (!cmd->process(mdev, h)) goto reconnect; + /* the idle_timeout (ping-int) + * has been restored in got_PingAck() */ + if (cmd == get_asender_cmd(P_PING_ACK)) + ping_timeout_active = 0; + buf = h; received = 0; expect = sizeof(struct p_header80); From d2e17807e3799bae24664a92f4d2d3dade021e00 Mon Sep 17 00:00:00 2001 From: Philipp Reisner Date: Mon, 14 Mar 2011 11:54:47 +0100 Subject: [PATCH 099/110] drbd: Only downgrade the disk state in case of disk failures Signed-off-by: Philipp Reisner Signed-off-by: Lars Ellenberg --- drivers/block/drbd/drbd_int.h | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/drivers/block/drbd/drbd_int.h b/drivers/block/drbd/drbd_int.h index 2c38752ca8d6..5c994739d11e 100644 --- a/drivers/block/drbd/drbd_int.h +++ b/drivers/block/drbd/drbd_int.h @@ -1827,7 +1827,8 @@ static inline void __drbd_chk_io_error_(struct drbd_conf *mdev, int forcedetach, if (!forcedetach) { if (__ratelimit(&drbd_ratelimit_state)) dev_err(DEV, "Local IO failed in %s.\n", where); - _drbd_set_state(_NS(mdev, disk, D_INCONSISTENT), CS_HARD, NULL); + if (mdev->state.disk > D_INCONSISTENT) + _drbd_set_state(_NS(mdev, disk, D_INCONSISTENT), CS_HARD, NULL); break; } /* NOTE fall through to detach case if forcedetach set */ From 76727f684aa2d6a2dc59a7e5cf77e092a1bf4fb6 Mon Sep 17 00:00:00 2001 From: Lars Ellenberg Date: Mon, 16 May 2011 15:31:45 +0200 Subject: [PATCH 100/110] drbd: fix potential activity log refcount imbalance in error path It is no longer sufficient to trigger on local WRITE, we need to check on (rq_state & RQ_IN_ACT_LOG) before calling drbd_al_complete_io also in the error path. 
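A standalone sketch of the refcounting rule this fix enforces, with hypothetical names: the error path releases the activity-log reference only when a flag recorded at acquisition time says one was actually taken, instead of re-deriving that from "local WRITE".

#include <stdio.h>

#define RQ_IN_ACT_LOG	(1 << 0)	/* set only when al_begin_io() ran */

struct request {
	unsigned int rq_state;
};

static int al_refcount;

static void al_begin_io(struct request *req)
{
	al_refcount++;
	req->rq_state |= RQ_IN_ACT_LOG;
}

static void al_complete_io(void)
{
	al_refcount--;
}

static void fail_request(struct request *req)
{
	/* key the release on the flag, not on how the request was submitted */
	if (req->rq_state & RQ_IN_ACT_LOG)
		al_complete_io();
}

int main(void)
{
	struct request with_al = { 0 }, without_al = { 0 };

	al_begin_io(&with_al);
	fail_request(&with_al);
	fail_request(&without_al);	/* must not drop a reference it never took */
	printf("al_refcount=%d\n", al_refcount);	/* balanced: 0 */
	return 0;
}
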
Signed-off-by: Philipp Reisner Signed-off-by: Lars Ellenberg --- drivers/block/drbd/drbd_req.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/block/drbd/drbd_req.c b/drivers/block/drbd/drbd_req.c index 5c0c8be1bb0a..14645bd40092 100644 --- a/drivers/block/drbd/drbd_req.c +++ b/drivers/block/drbd/drbd_req.c @@ -1033,7 +1033,7 @@ fail_conflicting: err = 0; fail_free_complete: - if (rw == WRITE && local) + if (req->rq_state & RQ_IN_ACT_LOG) drbd_al_complete_io(mdev, sector); fail_and_free_req: if (local) { From a8e407925d49c521151dd24b6376c1f9a04a093f Mon Sep 17 00:00:00 2001 From: Philipp Reisner Date: Fri, 13 May 2011 12:03:55 +0200 Subject: [PATCH 101/110] drbd: Fix for the connection problems on high latency links It seems that the real cause of all the issues where that we did not noticed in drbd_try_connect() when the other guy closes one socket if the round trip time gets higher than 100ms. There were that 100ms hard coded! Signed-off-by: Philipp Reisner Signed-off-by: Lars Ellenberg --- drivers/block/drbd/drbd_receiver.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/block/drbd/drbd_receiver.c b/drivers/block/drbd/drbd_receiver.c index 0b17d426c32b..b0b0ba345e83 100644 --- a/drivers/block/drbd/drbd_receiver.c +++ b/drivers/block/drbd/drbd_receiver.c @@ -787,7 +787,7 @@ static int drbd_connect(struct drbd_conf *mdev) } if (sock && msock) { - schedule_timeout_interruptible(HZ / 10); + schedule_timeout_interruptible(mdev->net_conf->ping_timeo*HZ/10); ok = drbd_socket_okay(mdev, &sock); ok = drbd_socket_okay(mdev, &msock) && ok; if (ok) From fa7d939663b61f5c2bd3436d3aa126d4c0f47aa8 Mon Sep 17 00:00:00 2001 From: Philipp Reisner Date: Tue, 17 May 2011 14:48:55 +0200 Subject: [PATCH 102/110] drbd: Disallow the peer_disk_state to be D_OUTDATED while connected Signed-off-by: Philipp Reisner Signed-off-by: Lars Ellenberg --- drivers/block/drbd/drbd_main.c | 3 +++ 1 file changed, 3 insertions(+) diff --git a/drivers/block/drbd/drbd_main.c b/drivers/block/drbd/drbd_main.c index fd308864833f..ce6a764e905b 100644 --- a/drivers/block/drbd/drbd_main.c +++ b/drivers/block/drbd/drbd_main.c @@ -745,6 +745,9 @@ is_valid_state(struct drbd_conf *mdev, union drbd_state ns) mdev->agreed_pro_version < 88) rv = SS_NOT_SUPPORTED; + else if (ns.conn >= C_CONNECTED && ns.pdsk == D_UNKNOWN) + rv = SS_CONNECTED_OUTDATES; + return rv; } From 21423fa79119a80e335de0c82ec29f67ed59f1bc Mon Sep 17 00:00:00 2001 From: Philipp Reisner Date: Tue, 17 May 2011 14:19:41 +0200 Subject: [PATCH 103/110] drbd: Fixed state transitions after async outdate-peer-handler returned Signed-off-by: Philipp Reisner Signed-off-by: Lars Ellenberg --- drivers/block/drbd/drbd_nl.c | 15 ++++++++++++++- 1 file changed, 14 insertions(+), 1 deletion(-) diff --git a/drivers/block/drbd/drbd_nl.c b/drivers/block/drbd/drbd_nl.c index 03b29f78a37d..9dfe58a09625 100644 --- a/drivers/block/drbd/drbd_nl.c +++ b/drivers/block/drbd/drbd_nl.c @@ -272,9 +272,22 @@ static int _try_outdate_peer_async(void *data) { struct drbd_conf *mdev = (struct drbd_conf *)data; enum drbd_disk_state nps; + union drbd_state ns; nps = drbd_try_outdate_peer(mdev); - drbd_request_state(mdev, NS(pdsk, nps)); + + /* Not using + drbd_request_state(mdev, NS(pdsk, nps)); + here, because we might were able to re-establish the connection in the + meantime. 
+ */ + spin_lock_irq(&mdev->req_lock); + ns = mdev->state; + if (ns.conn < C_WF_REPORT_PARAMS) { + ns.pdsk = nps; + _drbd_set_state(mdev, ns, CS_VERBOSE, NULL); + } + spin_unlock_irq(&mdev->req_lock); return 0; } From 99432fcc528d7a5ac8494a4c07ad4726670c96e2 Mon Sep 17 00:00:00 2001 From: Philipp Reisner Date: Fri, 20 May 2011 16:39:13 +0200 Subject: [PATCH 104/110] drbd: Take a more conservative approach when deciding max_bio_size The old (optimistic) implementation could shrink the bio size on an primary device. Shrinking the bio size on a primary device is bad. Since there we might get BIOs with the old (bigger) size shortly after we published the new size. The new implementation is more conservative, and eventually increases the max_bio_size on a primary device (which is valid). It does so, when it knows the local limit AND the remote limit. We cache the last seen max_bio_size of the peer in the meta data, and rely on that, to make the operation of single nodes more efficient. Signed-off-by: Philipp Reisner Signed-off-by: Lars Ellenberg --- drivers/block/drbd/drbd_int.h | 5 +- drivers/block/drbd/drbd_main.c | 26 ++++++-- drivers/block/drbd/drbd_nl.c | 96 +++++++++++++++++++++--------- drivers/block/drbd/drbd_receiver.c | 20 +------ 4 files changed, 97 insertions(+), 50 deletions(-) diff --git a/drivers/block/drbd/drbd_int.h b/drivers/block/drbd/drbd_int.h index 5c994739d11e..8aa10391115b 100644 --- a/drivers/block/drbd/drbd_int.h +++ b/drivers/block/drbd/drbd_int.h @@ -1128,6 +1128,8 @@ struct drbd_conf { int rs_in_flight; /* resync sectors in flight (to proxy, in proxy and from proxy) */ int rs_planed; /* resync sectors already planned */ atomic_t ap_in_flight; /* App sectors in flight (waiting for ack) */ + int peer_max_bio_size; + int local_max_bio_size; }; static inline struct drbd_conf *minor_to_mdev(unsigned int minor) @@ -1433,6 +1435,7 @@ struct bm_extent { * hash table. 
*/ #define HT_SHIFT 8 #define DRBD_MAX_BIO_SIZE (1U<<(9+HT_SHIFT)) +#define DRBD_MAX_BIO_SIZE_SAFE (1 << 12) /* Works always = 4k */ #define DRBD_MAX_SIZE_H80_PACKET (1 << 15) /* The old header only allows packets up to 32Kib data */ @@ -1519,7 +1522,7 @@ extern sector_t drbd_new_dev_size(struct drbd_conf *, struct drbd_backing_dev *, enum determine_dev_size { dev_size_error = -1, unchanged = 0, shrunk = 1, grew = 2 }; extern enum determine_dev_size drbd_determin_dev_size(struct drbd_conf *, enum dds_flags) __must_hold(local); extern void resync_after_online_grow(struct drbd_conf *); -extern void drbd_setup_queue_param(struct drbd_conf *mdev, unsigned int) __must_hold(local); +extern void drbd_reconsider_max_bio_size(struct drbd_conf *mdev); extern enum drbd_state_rv drbd_set_role(struct drbd_conf *mdev, enum drbd_role new_role, int force); diff --git a/drivers/block/drbd/drbd_main.c b/drivers/block/drbd/drbd_main.c index ce6a764e905b..cfeb13b5a216 100644 --- a/drivers/block/drbd/drbd_main.c +++ b/drivers/block/drbd/drbd_main.c @@ -2071,7 +2071,7 @@ int drbd_send_sizes(struct drbd_conf *mdev, int trigger_reply, enum dds_flags fl { struct p_sizes p; sector_t d_size, u_size; - int q_order_type; + int q_order_type, max_bio_size; int ok; if (get_ldev_if_state(mdev, D_NEGOTIATING)) { @@ -2079,17 +2079,20 @@ int drbd_send_sizes(struct drbd_conf *mdev, int trigger_reply, enum dds_flags fl d_size = drbd_get_max_capacity(mdev->ldev); u_size = mdev->ldev->dc.disk_size; q_order_type = drbd_queue_order_type(mdev); + max_bio_size = queue_max_hw_sectors(mdev->ldev->backing_bdev->bd_disk->queue) << 9; + max_bio_size = min_t(int, max_bio_size, DRBD_MAX_BIO_SIZE); put_ldev(mdev); } else { d_size = 0; u_size = 0; q_order_type = QUEUE_ORDERED_NONE; + max_bio_size = DRBD_MAX_BIO_SIZE; /* ... multiple BIOs per peer_request */ } p.d_size = cpu_to_be64(d_size); p.u_size = cpu_to_be64(u_size); p.c_size = cpu_to_be64(trigger_reply ? 
0 : drbd_get_capacity(mdev->this_bdev)); - p.max_bio_size = cpu_to_be32(queue_max_hw_sectors(mdev->rq_queue) << 9); + p.max_bio_size = cpu_to_be32(max_bio_size); p.queue_order_type = cpu_to_be16(q_order_type); p.dds_flags = cpu_to_be16(flags); @@ -3048,6 +3051,8 @@ void drbd_init_set_defaults(struct drbd_conf *mdev) mdev->agreed_pro_version = PRO_VERSION_MAX; mdev->write_ordering = WO_bdev_flush; mdev->resync_wenr = LC_FREE; + mdev->peer_max_bio_size = DRBD_MAX_BIO_SIZE_SAFE; + mdev->local_max_bio_size = DRBD_MAX_BIO_SIZE_SAFE; } void drbd_mdev_cleanup(struct drbd_conf *mdev) @@ -3422,7 +3427,9 @@ struct drbd_conf *drbd_new_device(unsigned int minor) q->backing_dev_info.congested_data = mdev; blk_queue_make_request(q, drbd_make_request); - blk_queue_max_hw_sectors(q, DRBD_MAX_BIO_SIZE >> 9); + /* Setting the max_hw_sectors to an odd value of 8kibyte here + This triggers a max_bio_size message upon first attach or connect */ + blk_queue_max_hw_sectors(q, DRBD_MAX_BIO_SIZE_SAFE >> 8); blk_queue_bounce_limit(q, BLK_BOUNCE_ANY); blk_queue_merge_bvec(q, drbd_merge_bvec); q->queue_lock = &mdev->req_lock; @@ -3634,7 +3641,8 @@ struct meta_data_on_disk { /* `-- act_log->nr_elements <-- sync_conf.al_extents */ u32 bm_offset; /* offset to the bitmap, from here */ u32 bm_bytes_per_bit; /* BM_BLOCK_SIZE */ - u32 reserved_u32[4]; + u32 la_peer_max_bio_size; /* last peer max_bio_size */ + u32 reserved_u32[3]; } __packed; @@ -3675,6 +3683,7 @@ void drbd_md_sync(struct drbd_conf *mdev) buffer->device_uuid = cpu_to_be64(mdev->ldev->md.device_uuid); buffer->bm_offset = cpu_to_be32(mdev->ldev->md.bm_offset); + buffer->la_peer_max_bio_size = cpu_to_be32(mdev->peer_max_bio_size); D_ASSERT(drbd_md_ss__(mdev, mdev->ldev) == mdev->ldev->md.md_offset); sector = mdev->ldev->md.md_offset; @@ -3758,6 +3767,15 @@ int drbd_md_read(struct drbd_conf *mdev, struct drbd_backing_dev *bdev) mdev->sync_conf.al_extents = be32_to_cpu(buffer->al_nr_extents); bdev->md.device_uuid = be64_to_cpu(buffer->device_uuid); + spin_lock_irq(&mdev->req_lock); + if (mdev->state.conn < C_CONNECTED) { + int peer; + peer = be32_to_cpu(buffer->la_peer_max_bio_size); + peer = max_t(int, peer, DRBD_MAX_BIO_SIZE_SAFE); + mdev->peer_max_bio_size = peer; + } + spin_unlock_irq(&mdev->req_lock); + if (mdev->sync_conf.al_extents < 7) mdev->sync_conf.al_extents = 127; diff --git a/drivers/block/drbd/drbd_nl.c b/drivers/block/drbd/drbd_nl.c index 9dfe58a09625..7c64ec042124 100644 --- a/drivers/block/drbd/drbd_nl.c +++ b/drivers/block/drbd/drbd_nl.c @@ -278,8 +278,14 @@ static int _try_outdate_peer_async(void *data) /* Not using drbd_request_state(mdev, NS(pdsk, nps)); - here, because we might were able to re-establish the connection in the - meantime. + here, because we might were able to re-establish the connection + in the meantime. This can only partially be solved in the state's + engine is_valid_state() and is_valid_state_transition() + functions. + + nps can be D_INCONSISTENT, D_OUTDATED or D_UNKNOWN. + pdsk == D_INCONSISTENT while conn >= C_CONNECTED is valid, + therefore we have to have the pre state change check here. 
*/ spin_lock_irq(&mdev->req_lock); ns = mdev->state; @@ -786,30 +792,78 @@ static int drbd_check_al_size(struct drbd_conf *mdev) return 0; } -void drbd_setup_queue_param(struct drbd_conf *mdev, unsigned int max_bio_size) __must_hold(local) +static void drbd_setup_queue_param(struct drbd_conf *mdev, unsigned int max_bio_size) { struct request_queue * const q = mdev->rq_queue; - struct request_queue * const b = mdev->ldev->backing_bdev->bd_disk->queue; - int max_segments = mdev->ldev->dc.max_bio_bvecs; - int max_hw_sectors = min(queue_max_hw_sectors(b), max_bio_size >> 9); + int max_hw_sectors = max_bio_size >> 9; + int max_segments = 0; + + if (get_ldev_if_state(mdev, D_ATTACHING)) { + struct request_queue * const b = mdev->ldev->backing_bdev->bd_disk->queue; + + max_hw_sectors = min(queue_max_hw_sectors(b), max_bio_size >> 9); + max_segments = mdev->ldev->dc.max_bio_bvecs; + put_ldev(mdev); + } blk_queue_logical_block_size(q, 512); blk_queue_max_hw_sectors(q, max_hw_sectors); /* This is the workaround for "bio would need to, but cannot, be split" */ blk_queue_max_segments(q, max_segments ? max_segments : BLK_MAX_SEGMENTS); blk_queue_segment_boundary(q, PAGE_CACHE_SIZE-1); - blk_queue_stack_limits(q, b); - dev_info(DEV, "max BIO size = %u\n", queue_max_hw_sectors(q) << 9); + if (get_ldev_if_state(mdev, D_ATTACHING)) { + struct request_queue * const b = mdev->ldev->backing_bdev->bd_disk->queue; - if (q->backing_dev_info.ra_pages != b->backing_dev_info.ra_pages) { - dev_info(DEV, "Adjusting my ra_pages to backing device's (%lu -> %lu)\n", - q->backing_dev_info.ra_pages, - b->backing_dev_info.ra_pages); - q->backing_dev_info.ra_pages = b->backing_dev_info.ra_pages; + blk_queue_stack_limits(q, b); + + if (q->backing_dev_info.ra_pages != b->backing_dev_info.ra_pages) { + dev_info(DEV, "Adjusting my ra_pages to backing device's (%lu -> %lu)\n", + q->backing_dev_info.ra_pages, + b->backing_dev_info.ra_pages); + q->backing_dev_info.ra_pages = b->backing_dev_info.ra_pages; + } + put_ldev(mdev); } } +void drbd_reconsider_max_bio_size(struct drbd_conf *mdev) +{ + int now, new, local, peer; + + now = queue_max_hw_sectors(mdev->rq_queue) << 9; + local = mdev->local_max_bio_size; /* Eventually last known value, from volatile memory */ + peer = mdev->peer_max_bio_size; /* Eventually last known value, from meta data */ + + if (get_ldev_if_state(mdev, D_ATTACHING)) { + local = queue_max_hw_sectors(mdev->ldev->backing_bdev->bd_disk->queue) << 9; + mdev->local_max_bio_size = local; + put_ldev(mdev); + } + + /* We may ignore peer limits if the peer is modern enough. 
+ Because new from 8.3.8 onwards the peer can use multiple + BIOs for a single peer_request */ + if (mdev->state.conn >= C_CONNECTED) { + if (mdev->agreed_pro_version < 94) + peer = mdev->peer_max_bio_size; + else if (mdev->agreed_pro_version == 94) + peer = DRBD_MAX_SIZE_H80_PACKET; + else /* drbd 8.3.8 onwards */ + peer = DRBD_MAX_BIO_SIZE; + } + + new = min_t(int, local, peer); + + if (mdev->state.role == R_PRIMARY && new < now) + dev_err(DEV, "ASSERT FAILED new < now; (%d < %d)\n", new, now); + + if (new != now) + dev_info(DEV, "max BIO size = %u\n", new); + + drbd_setup_queue_param(mdev, new); +} + /* serialize deconfig (worker exiting, doing cleanup) * and reconfig (drbdsetup disk, drbdsetup net) * @@ -878,7 +932,6 @@ static int drbd_nl_disk_conf(struct drbd_conf *mdev, struct drbd_nl_cfg_req *nlp struct block_device *bdev; struct lru_cache *resync_lru = NULL; union drbd_state ns, os; - unsigned int max_bio_size; enum drbd_state_rv rv; int cp_discovered = 0; int logical_block_size; @@ -1130,20 +1183,7 @@ static int drbd_nl_disk_conf(struct drbd_conf *mdev, struct drbd_nl_cfg_req *nlp mdev->read_cnt = 0; mdev->writ_cnt = 0; - max_bio_size = DRBD_MAX_BIO_SIZE; - if (mdev->state.conn == C_CONNECTED) { - /* We are Primary, Connected, and now attach a new local - * backing store. We must not increase the user visible maximum - * bio size on this device to something the peer may not be - * able to handle. */ - if (mdev->agreed_pro_version < 94) - max_bio_size = queue_max_hw_sectors(mdev->rq_queue) << 9; - else if (mdev->agreed_pro_version == 94) - max_bio_size = DRBD_MAX_SIZE_H80_PACKET; - /* else: drbd 8.3.9 and later, stay with default */ - } - - drbd_setup_queue_param(mdev, max_bio_size); + drbd_reconsider_max_bio_size(mdev); /* If I am currently not R_PRIMARY, * but meta data primary indicator is set, diff --git a/drivers/block/drbd/drbd_receiver.c b/drivers/block/drbd/drbd_receiver.c index b0b0ba345e83..6ea0a4b51ece 100644 --- a/drivers/block/drbd/drbd_receiver.c +++ b/drivers/block/drbd/drbd_receiver.c @@ -899,11 +899,6 @@ retry: drbd_thread_start(&mdev->asender); - if (mdev->agreed_pro_version < 95 && get_ldev(mdev)) { - drbd_setup_queue_param(mdev, DRBD_MAX_SIZE_H80_PACKET); - put_ldev(mdev); - } - if (drbd_send_protocol(mdev) == -1) return -1; drbd_send_sync_param(mdev, &mdev->sync_conf); @@ -2939,7 +2934,6 @@ static int receive_sizes(struct drbd_conf *mdev, enum drbd_packets cmd, unsigned { struct p_sizes *p = &mdev->data.rbuf.sizes; enum determine_dev_size dd = unchanged; - unsigned int max_bio_size; sector_t p_size, p_usize, my_usize; int ldsc = 0; /* local disk size changed */ enum dds_flags ddsf; @@ -3004,23 +2998,15 @@ static int receive_sizes(struct drbd_conf *mdev, enum drbd_packets cmd, unsigned drbd_set_my_capacity(mdev, p_size); } + mdev->peer_max_bio_size = be32_to_cpu(p->max_bio_size); + drbd_reconsider_max_bio_size(mdev); + if (get_ldev(mdev)) { if (mdev->ldev->known_size != drbd_get_capacity(mdev->ldev->backing_bdev)) { mdev->ldev->known_size = drbd_get_capacity(mdev->ldev->backing_bdev); ldsc = 1; } - if (mdev->agreed_pro_version < 94) - max_bio_size = be32_to_cpu(p->max_bio_size); - else if (mdev->agreed_pro_version == 94) - max_bio_size = DRBD_MAX_SIZE_H80_PACKET; - else /* drbd 8.3.8 onwards */ - max_bio_size = DRBD_MAX_BIO_SIZE; - - if (max_bio_size != queue_max_hw_sectors(mdev->rq_queue) << 9) - drbd_setup_queue_param(mdev, max_bio_size); - - drbd_setup_order_type(mdev, be16_to_cpu(p->queue_order_type)); put_ldev(mdev); } From 
9a0d9d0389ef769e4b01abf50fcc11407706270b Mon Sep 17 00:00:00 2001 From: Lars Ellenberg Date: Mon, 2 May 2011 11:51:31 +0200 Subject: [PATCH 105/110] drbd: fix schedule in atomic An administrative detach used to request a state change directly to D_DISKLESS, first suspending IO to avoid the last put_ldev() occuring from an endio handler, potentially in irq context. This is not enough on the receiving side (typically secondary), we may miss some peer_req on the way to local disk, which then may do the last put_ldev() from their drbd_peer_request_endio(). This patch makes the detach always go through the intermediate D_FAILED state. We may consider to rename it D_DETACHING. Alternative approach would be to create yet an other work item to be scheduled on the worker, do the destructor work from there, and get the timing right. manually picked commit 564040f from the drbd 8.4 branch. Signed-off-by: Philipp Reisner Signed-off-by: Lars Ellenberg --- drivers/block/drbd/drbd_int.h | 4 ++++ drivers/block/drbd/drbd_nl.c | 14 +++++++++++--- include/linux/drbd.h | 2 +- 3 files changed, 16 insertions(+), 4 deletions(-) diff --git a/drivers/block/drbd/drbd_int.h b/drivers/block/drbd/drbd_int.h index 8aa10391115b..a74d3ee04ba8 100644 --- a/drivers/block/drbd/drbd_int.h +++ b/drivers/block/drbd/drbd_int.h @@ -2157,6 +2157,10 @@ static inline int get_net_conf(struct drbd_conf *mdev) static inline void put_ldev(struct drbd_conf *mdev) { int i = atomic_dec_return(&mdev->local_cnt); + + /* This may be called from some endio handler, + * so we must not sleep here. */ + __release(local); D_ASSERT(i >= 0); if (i == 0) { diff --git a/drivers/block/drbd/drbd_nl.c b/drivers/block/drbd/drbd_nl.c index 7c64ec042124..13569635b922 100644 --- a/drivers/block/drbd/drbd_nl.c +++ b/drivers/block/drbd/drbd_nl.c @@ -1334,11 +1334,19 @@ static int drbd_nl_disk_conf(struct drbd_conf *mdev, struct drbd_nl_cfg_req *nlp static int drbd_nl_detach(struct drbd_conf *mdev, struct drbd_nl_cfg_req *nlp, struct drbd_nl_cfg_reply *reply) { + enum drbd_ret_code retcode; + int ret; drbd_suspend_io(mdev); /* so no-one is stuck in drbd_al_begin_io */ - reply->ret_code = drbd_request_state(mdev, NS(disk, D_DISKLESS)); - if (mdev->state.disk == D_DISKLESS) - wait_event(mdev->misc_wait, !atomic_read(&mdev->local_cnt)); + retcode = drbd_request_state(mdev, NS(disk, D_FAILED)); + /* D_FAILED will transition to DISKLESS. */ + ret = wait_event_interruptible(mdev->misc_wait, + mdev->state.disk != D_FAILED); drbd_resume_io(mdev); + if (retcode == SS_IS_DISKLESS) + retcode = SS_NOTHING_TO_DO; + if (ret) + retcode = ERR_INTR; + reply->ret_code = retcode; return 0; } diff --git a/include/linux/drbd.h b/include/linux/drbd.h index cec467f5d676..c86283b4fbab 100644 --- a/include/linux/drbd.h +++ b/include/linux/drbd.h @@ -53,7 +53,7 @@ extern const char *drbd_buildtag(void); -#define REL_VERSION "8.3.10" +#define REL_VERSION "8.3.11" #define API_VERSION 88 #define PRO_VERSION_MIN 86 #define PRO_VERSION_MAX 96 From 24c4830c8ec3cbc904d84c213126a35f41a4e455 Mon Sep 17 00:00:00 2001 From: Bart Van Assche Date: Sat, 21 May 2011 18:32:29 +0200 Subject: [PATCH 106/110] drbd: Fix spelling Found these with the help of ispell -l. 
Signed-off-by: Bart Van Assche Signed-off-by: Lars Ellenberg Signed-off-by: Philipp Reisner --- drivers/block/drbd/drbd_actlog.c | 2 +- drivers/block/drbd/drbd_bitmap.c | 6 +++--- drivers/block/drbd/drbd_int.h | 6 +++--- drivers/block/drbd/drbd_main.c | 4 ++-- drivers/block/drbd/drbd_nl.c | 6 +++--- drivers/block/drbd/drbd_receiver.c | 30 +++++++++++++++--------------- drivers/block/drbd/drbd_req.c | 18 +++++++++--------- drivers/block/drbd/drbd_req.h | 4 ++-- drivers/block/drbd/drbd_worker.c | 4 ++-- include/linux/drbd.h | 8 ++++---- include/linux/drbd_tag_magic.h | 2 +- 11 files changed, 45 insertions(+), 45 deletions(-) diff --git a/drivers/block/drbd/drbd_actlog.c b/drivers/block/drbd/drbd_actlog.c index c6828b68d77b..09ef9a878ef0 100644 --- a/drivers/block/drbd/drbd_actlog.c +++ b/drivers/block/drbd/drbd_actlog.c @@ -28,7 +28,7 @@ #include "drbd_int.h" #include "drbd_wrappers.h" -/* We maintain a trivial check sum in our on disk activity log. +/* We maintain a trivial checksum in our on disk activity log. * With that we can ensure correct operation even when the storage * device might do a partial (last) sector write while losing power. */ diff --git a/drivers/block/drbd/drbd_bitmap.c b/drivers/block/drbd/drbd_bitmap.c index 76210ba401ac..f440a02dfdb1 100644 --- a/drivers/block/drbd/drbd_bitmap.c +++ b/drivers/block/drbd/drbd_bitmap.c @@ -74,7 +74,7 @@ * as we are "attached" to a local disk, which at 32 GiB for 1PiB storage * seems excessive. * - * We plan to reduce the amount of in-core bitmap pages by pageing them in + * We plan to reduce the amount of in-core bitmap pages by paging them in * and out against their on-disk location as necessary, but need to make * sure we don't cause too much meta data IO, and must not deadlock in * tight memory situations. This needs some more work. @@ -200,7 +200,7 @@ void drbd_bm_unlock(struct drbd_conf *mdev) * we if bits have been cleared since last IO. */ #define BM_PAGE_LAZY_WRITEOUT 28 -/* store_page_idx uses non-atomic assingment. It is only used directly after +/* store_page_idx uses non-atomic assignment. It is only used directly after * allocating the page. All other bm_set_page_* and bm_clear_page_* need to * use atomic bit manipulation, as set_out_of_sync (and therefore bitmap * changes) may happen from various contexts, and wait_on_bit/wake_up_bit @@ -318,7 +318,7 @@ static void bm_unmap(unsigned long *p_addr) /* word offset from start of bitmap to word number _in_page_ * modulo longs per page #define MLPP(X) ((X) % (PAGE_SIZE/sizeof(long)) - hm, well, Philipp thinks gcc might not optimze the % into & (... - 1) + hm, well, Philipp thinks gcc might not optimize the % into & (... - 1) so do it explicitly: */ #define MLPP(X) ((X) & ((PAGE_SIZE/sizeof(long))-1)) diff --git a/drivers/block/drbd/drbd_int.h b/drivers/block/drbd/drbd_int.h index a74d3ee04ba8..7952eb90d17f 100644 --- a/drivers/block/drbd/drbd_int.h +++ b/drivers/block/drbd/drbd_int.h @@ -699,7 +699,7 @@ struct drbd_request { * see drbd_endio_pri(). 
*/ struct bio *private_bio; - struct hlist_node colision; + struct hlist_node collision; sector_t sector; unsigned int size; unsigned int epoch; /* barrier_nr */ @@ -765,7 +765,7 @@ struct digest_info { struct drbd_epoch_entry { struct drbd_work w; - struct hlist_node colision; + struct hlist_node collision; struct drbd_epoch *epoch; /* for writes */ struct drbd_conf *mdev; struct page *pages; @@ -1520,7 +1520,7 @@ extern void drbd_resume_io(struct drbd_conf *mdev); extern char *ppsize(char *buf, unsigned long long size); extern sector_t drbd_new_dev_size(struct drbd_conf *, struct drbd_backing_dev *, int); enum determine_dev_size { dev_size_error = -1, unchanged = 0, shrunk = 1, grew = 2 }; -extern enum determine_dev_size drbd_determin_dev_size(struct drbd_conf *, enum dds_flags) __must_hold(local); +extern enum determine_dev_size drbd_determine_dev_size(struct drbd_conf *, enum dds_flags) __must_hold(local); extern void resync_after_online_grow(struct drbd_conf *); extern void drbd_reconsider_max_bio_size(struct drbd_conf *mdev); extern enum drbd_state_rv drbd_set_role(struct drbd_conf *mdev, diff --git a/drivers/block/drbd/drbd_main.c b/drivers/block/drbd/drbd_main.c index cfeb13b5a216..0358e55356c8 100644 --- a/drivers/block/drbd/drbd_main.c +++ b/drivers/block/drbd/drbd_main.c @@ -2732,7 +2732,7 @@ int drbd_send_dblock(struct drbd_conf *mdev, struct drbd_request *req) /* double check digest, sometimes buffers have been modified in flight. */ if (dgs > 0 && dgs <= 64) { - /* 64 byte, 512 bit, is the larges digest size + /* 64 byte, 512 bit, is the largest digest size * currently supported in kernel crypto. */ unsigned char digest[64]; drbd_csum_bio(mdev, mdev->integrity_w_tfm, req->master_bio, digest); @@ -3287,7 +3287,7 @@ static void drbd_delete_device(unsigned int minor) drbd_release_ee_lists(mdev); - /* should be free'd on disconnect? */ + /* should be freed on disconnect? */ kfree(mdev->ee_hash); /* mdev->ee_hash_s = 0; diff --git a/drivers/block/drbd/drbd_nl.c b/drivers/block/drbd/drbd_nl.c index 13569635b922..259ca0b20961 100644 --- a/drivers/block/drbd/drbd_nl.c +++ b/drivers/block/drbd/drbd_nl.c @@ -596,7 +596,7 @@ void drbd_resume_io(struct drbd_conf *mdev) * Returns 0 on success, negative return values indicate errors. * You should call drbd_md_sync() after calling this function. */ -enum determine_dev_size drbd_determin_dev_size(struct drbd_conf *mdev, enum dds_flags flags) __must_hold(local) +enum determine_dev_size drbd_determine_dev_size(struct drbd_conf *mdev, enum dds_flags flags) __must_hold(local) { sector_t prev_first_sect, prev_size; /* previous meta location */ sector_t la_size; @@ -1205,7 +1205,7 @@ static int drbd_nl_disk_conf(struct drbd_conf *mdev, struct drbd_nl_cfg_req *nlp !drbd_md_test_flag(mdev->ldev, MDF_CONNECTED_IND)) set_bit(USE_DEGR_WFC_T, &mdev->flags); - dd = drbd_determin_dev_size(mdev, 0); + dd = drbd_determine_dev_size(mdev, 0); if (dd == dev_size_error) { retcode = ERR_NOMEM_BITMAP; goto force_diskless_dec; @@ -1719,7 +1719,7 @@ static int drbd_nl_resize(struct drbd_conf *mdev, struct drbd_nl_cfg_req *nlp, mdev->ldev->dc.disk_size = (sector_t)rs.resize_size; ddsf = (rs.resize_force ? DDSF_FORCED : 0) | (rs.no_resync ? 
DDSF_NO_RESYNC : 0); - dd = drbd_determin_dev_size(mdev, ddsf); + dd = drbd_determine_dev_size(mdev, ddsf); drbd_md_sync(mdev); put_ldev(mdev); if (dd == dev_size_error) { diff --git a/drivers/block/drbd/drbd_receiver.c b/drivers/block/drbd/drbd_receiver.c index 6ea0a4b51ece..d9f2109fd9e6 100644 --- a/drivers/block/drbd/drbd_receiver.c +++ b/drivers/block/drbd/drbd_receiver.c @@ -333,7 +333,7 @@ struct drbd_epoch_entry *drbd_alloc_ee(struct drbd_conf *mdev, if (!page) goto fail; - INIT_HLIST_NODE(&e->colision); + INIT_HLIST_NODE(&e->collision); e->epoch = NULL; e->mdev = mdev; e->pages = page; @@ -356,7 +356,7 @@ void drbd_free_some_ee(struct drbd_conf *mdev, struct drbd_epoch_entry *e, int i kfree(e->digest); drbd_pp_free(mdev, e->pages, is_net); D_ASSERT(atomic_read(&e->pending_bios) == 0); - D_ASSERT(hlist_unhashed(&e->colision)); + D_ASSERT(hlist_unhashed(&e->collision)); mempool_free(e, drbd_ee_mempool); } @@ -1413,7 +1413,7 @@ static int e_end_resync_block(struct drbd_conf *mdev, struct drbd_work *w, int u sector_t sector = e->sector; int ok; - D_ASSERT(hlist_unhashed(&e->colision)); + D_ASSERT(hlist_unhashed(&e->collision)); if (likely((e->flags & EE_WAS_ERROR) == 0)) { drbd_set_in_sync(mdev, sector, e->size); @@ -1482,7 +1482,7 @@ static int receive_DataReply(struct drbd_conf *mdev, enum drbd_packets cmd, unsi return false; } - /* hlist_del(&req->colision) is done in _req_may_be_done, to avoid + /* hlist_del(&req->collision) is done in _req_may_be_done, to avoid * special casing it there for the various failure cases. * still no race with drbd_fail_pending_reads */ ok = recv_dless_read(mdev, req, sector, data_size); @@ -1553,11 +1553,11 @@ static int e_end_block(struct drbd_conf *mdev, struct drbd_work *w, int cancel) * P_WRITE_ACK / P_NEG_ACK, to get the sequence number right. */ if (mdev->net_conf->two_primaries) { spin_lock_irq(&mdev->req_lock); - D_ASSERT(!hlist_unhashed(&e->colision)); - hlist_del_init(&e->colision); + D_ASSERT(!hlist_unhashed(&e->collision)); + hlist_del_init(&e->collision); spin_unlock_irq(&mdev->req_lock); } else { - D_ASSERT(hlist_unhashed(&e->colision)); + D_ASSERT(hlist_unhashed(&e->collision)); } drbd_may_finish_epoch(mdev, e->epoch, EV_PUT + (cancel ? EV_CLEANUP : 0)); @@ -1574,8 +1574,8 @@ static int e_send_discard_ack(struct drbd_conf *mdev, struct drbd_work *w, int u ok = drbd_send_ack(mdev, P_DISCARD_ACK, e); spin_lock_irq(&mdev->req_lock); - D_ASSERT(!hlist_unhashed(&e->colision)); - hlist_del_init(&e->colision); + D_ASSERT(!hlist_unhashed(&e->collision)); + hlist_del_init(&e->collision); spin_unlock_irq(&mdev->req_lock); dec_unacked(mdev); @@ -1750,7 +1750,7 @@ static int receive_Data(struct drbd_conf *mdev, enum drbd_packets cmd, unsigned spin_lock_irq(&mdev->req_lock); - hlist_add_head(&e->colision, ee_hash_slot(mdev, sector)); + hlist_add_head(&e->collision, ee_hash_slot(mdev, sector)); #define OVERLAPS overlaps(i->sector, i->size, sector, size) slot = tl_hash_slot(mdev, sector); @@ -1760,7 +1760,7 @@ static int receive_Data(struct drbd_conf *mdev, enum drbd_packets cmd, unsigned int have_conflict = 0; prepare_to_wait(&mdev->misc_wait, &wait, TASK_INTERRUPTIBLE); - hlist_for_each_entry(i, n, slot, colision) { + hlist_for_each_entry(i, n, slot, collision) { if (OVERLAPS) { /* only ALERT on first iteration, * we may be woken up early... 
*/ @@ -1799,7 +1799,7 @@ static int receive_Data(struct drbd_conf *mdev, enum drbd_packets cmd, unsigned } if (signal_pending(current)) { - hlist_del_init(&e->colision); + hlist_del_init(&e->collision); spin_unlock_irq(&mdev->req_lock); @@ -1857,7 +1857,7 @@ static int receive_Data(struct drbd_conf *mdev, enum drbd_packets cmd, unsigned dev_err(DEV, "submit failed, triggering re-connect\n"); spin_lock_irq(&mdev->req_lock); list_del(&e->w.list); - hlist_del_init(&e->colision); + hlist_del_init(&e->collision); spin_unlock_irq(&mdev->req_lock); if (e->flags & EE_CALL_AL_COMPLETE_IO) drbd_al_complete_io(mdev, e->sector); @@ -2988,7 +2988,7 @@ static int receive_sizes(struct drbd_conf *mdev, enum drbd_packets cmd, unsigned ddsf = be16_to_cpu(p->dds_flags); if (get_ldev(mdev)) { - dd = drbd_determin_dev_size(mdev, ddsf); + dd = drbd_determine_dev_size(mdev, ddsf); put_ldev(mdev); if (dd == dev_size_error) return false; @@ -4261,7 +4261,7 @@ static struct drbd_request *_ack_id_to_req(struct drbd_conf *mdev, struct hlist_node *n; struct drbd_request *req; - hlist_for_each_entry(req, n, slot, colision) { + hlist_for_each_entry(req, n, slot, collision) { if ((unsigned long)req == (unsigned long)id) { if (req->sector != sector) { dev_err(DEV, "_ack_id_to_req: found req %p but it has " diff --git a/drivers/block/drbd/drbd_req.c b/drivers/block/drbd/drbd_req.c index 14645bd40092..3424d675b769 100644 --- a/drivers/block/drbd/drbd_req.c +++ b/drivers/block/drbd/drbd_req.c @@ -163,7 +163,7 @@ static void _about_to_complete_local_write(struct drbd_conf *mdev, * they must have been failed on the spot */ #define OVERLAPS overlaps(sector, size, i->sector, i->size) slot = tl_hash_slot(mdev, sector); - hlist_for_each_entry(i, n, slot, colision) { + hlist_for_each_entry(i, n, slot, collision) { if (OVERLAPS) { dev_alert(DEV, "LOGIC BUG: completed: %p %llus +%u; " "other: %p %llus +%u\n", @@ -187,7 +187,7 @@ static void _about_to_complete_local_write(struct drbd_conf *mdev, #undef OVERLAPS #define OVERLAPS overlaps(sector, size, e->sector, e->size) slot = ee_hash_slot(mdev, req->sector); - hlist_for_each_entry(e, n, slot, colision) { + hlist_for_each_entry(e, n, slot, collision) { if (OVERLAPS) { wake_up(&mdev->misc_wait); break; @@ -260,8 +260,8 @@ void _req_may_be_done(struct drbd_request *req, struct bio_and_error *m) /* remove the request from the conflict detection * respective block_id verification hash */ - if (!hlist_unhashed(&req->colision)) - hlist_del(&req->colision); + if (!hlist_unhashed(&req->collision)) + hlist_del(&req->collision); else D_ASSERT((s & (RQ_NET_MASK & ~RQ_NET_DONE)) == 0); @@ -329,7 +329,7 @@ static int _req_conflicts(struct drbd_request *req) struct hlist_node *n; struct hlist_head *slot; - D_ASSERT(hlist_unhashed(&req->colision)); + D_ASSERT(hlist_unhashed(&req->collision)); if (!get_net_conf(mdev)) return 0; @@ -341,7 +341,7 @@ static int _req_conflicts(struct drbd_request *req) #define OVERLAPS overlaps(i->sector, i->size, sector, size) slot = tl_hash_slot(mdev, sector); - hlist_for_each_entry(i, n, slot, colision) { + hlist_for_each_entry(i, n, slot, collision) { if (OVERLAPS) { dev_alert(DEV, "%s[%u] Concurrent local write detected! 
" "[DISCARD L] new: %llus +%u; " @@ -359,7 +359,7 @@ static int _req_conflicts(struct drbd_request *req) #undef OVERLAPS #define OVERLAPS overlaps(e->sector, e->size, sector, size) slot = ee_hash_slot(mdev, sector); - hlist_for_each_entry(e, n, slot, colision) { + hlist_for_each_entry(e, n, slot, collision) { if (OVERLAPS) { dev_alert(DEV, "%s[%u] Concurrent remote write detected!" " [DISCARD L] new: %llus +%u; " @@ -491,7 +491,7 @@ int __req_mod(struct drbd_request *req, enum drbd_req_event what, /* so we can verify the handle in the answer packet * corresponding hlist_del is in _req_may_be_done() */ - hlist_add_head(&req->colision, ar_hash_slot(mdev, req->sector)); + hlist_add_head(&req->collision, ar_hash_slot(mdev, req->sector)); set_bit(UNPLUG_REMOTE, &mdev->flags); @@ -507,7 +507,7 @@ int __req_mod(struct drbd_request *req, enum drbd_req_event what, /* assert something? */ /* from drbd_make_request_common only */ - hlist_add_head(&req->colision, tl_hash_slot(mdev, req->sector)); + hlist_add_head(&req->collision, tl_hash_slot(mdev, req->sector)); /* corresponding hlist_del is in _req_may_be_done() */ /* NOTE diff --git a/drivers/block/drbd/drbd_req.h b/drivers/block/drbd/drbd_req.h index 32e2c3e6a813..281342dca2c8 100644 --- a/drivers/block/drbd/drbd_req.h +++ b/drivers/block/drbd/drbd_req.h @@ -256,7 +256,7 @@ static inline struct drbd_request *_ar_id_to_req(struct drbd_conf *mdev, struct hlist_node *n; struct drbd_request *req; - hlist_for_each_entry(req, n, slot, colision) { + hlist_for_each_entry(req, n, slot, collision) { if ((unsigned long)req == (unsigned long)id) { D_ASSERT(req->sector == sector); return req; @@ -291,7 +291,7 @@ static inline struct drbd_request *drbd_req_new(struct drbd_conf *mdev, req->epoch = 0; req->sector = bio_src->bi_sector; req->size = bio_src->bi_size; - INIT_HLIST_NODE(&req->colision); + INIT_HLIST_NODE(&req->collision); INIT_LIST_HEAD(&req->tl_requests); INIT_LIST_HEAD(&req->w.list); } diff --git a/drivers/block/drbd/drbd_worker.c b/drivers/block/drbd/drbd_worker.c index b5e53695fd7e..4d76b06b6b20 100644 --- a/drivers/block/drbd/drbd_worker.c +++ b/drivers/block/drbd/drbd_worker.c @@ -126,7 +126,7 @@ static void drbd_endio_write_sec_final(struct drbd_epoch_entry *e) __releases(lo list_del(&e->w.list); /* has been on active_ee or sync_ee */ list_add_tail(&e->w.list, &mdev->done_ee); - /* No hlist_del_init(&e->colision) here, we did not send the Ack yet, + /* No hlist_del_init(&e->collision) here, we did not send the Ack yet, * neither did we wake possibly waiting conflicting requests. * done from "drbd_process_done_ee" within the appropriate w.cb * (e_end_block/e_end_resync_block) or from _drbd_clear_done_ee */ @@ -840,7 +840,7 @@ int drbd_resync_finished(struct drbd_conf *mdev) const int ratio = (t == 0) ? 0 : (t < 100000) ? ((s*100)/t) : (s/(t/100)); - dev_info(DEV, "%u %% had equal check sums, eliminated: %luK; " + dev_info(DEV, "%u %% had equal checksums, eliminated: %luK; " "transferred %luK total %luK\n", ratio, Bit2KB(mdev->rs_same_csum), diff --git a/include/linux/drbd.h b/include/linux/drbd.h index c86283b4fbab..9e5f5607eba3 100644 --- a/include/linux/drbd.h +++ b/include/linux/drbd.h @@ -38,7 +38,7 @@ /* Although the Linux source code makes a difference between generic endianness and the bitfields' endianness, there is no - architecture as of Linux-2.6.24-rc4 where the bitfileds' endianness + architecture as of Linux-2.6.24-rc4 where the bitfields' endianness does not match the generic endianness. 
*/ #if __BYTE_ORDER == __LITTLE_ENDIAN @@ -195,7 +195,7 @@ enum drbd_conns { C_WF_REPORT_PARAMS, /* we have a socket */ C_CONNECTED, /* we have introduced each other */ C_STARTING_SYNC_S, /* starting full sync by admin request. */ - C_STARTING_SYNC_T, /* stariing full sync by admin request. */ + C_STARTING_SYNC_T, /* starting full sync by admin request. */ C_WF_BITMAP_S, C_WF_BITMAP_T, C_WF_SYNC_UUID, @@ -236,7 +236,7 @@ union drbd_state { * pointed out by Maxim Uvarov q * even though we transmit as "cpu_to_be32(state)", * the offsets of the bitfields still need to be swapped - * on different endianess. + * on different endianness. */ struct { #if defined(__LITTLE_ENDIAN_BITFIELD) @@ -266,7 +266,7 @@ union drbd_state { unsigned peer:2 ; /* 3/4 primary/secondary/unknown */ unsigned role:2 ; /* 3/4 primary/secondary/unknown */ #else -# error "this endianess is not supported" +# error "this endianness is not supported" #endif }; unsigned int i; diff --git a/include/linux/drbd_tag_magic.h b/include/linux/drbd_tag_magic.h index f14a165e82dc..069543190516 100644 --- a/include/linux/drbd_tag_magic.h +++ b/include/linux/drbd_tag_magic.h @@ -30,7 +30,7 @@ enum packet_types { int tag_and_len ## member; #include "linux/drbd_nl.h" -/* declate tag-list-sizes */ +/* declare tag-list-sizes */ static const int tag_list_sizes[] = { #define NL_PACKET(name, number, fields) 2 fields , #define NL_INTEGER(pn, pr, member) + 4 + 4 From 9b2f61aec73dc9e735e247fd720c673b30999e7c Mon Sep 17 00:00:00 2001 From: Philipp Reisner Date: Tue, 24 May 2011 10:27:38 +0200 Subject: [PATCH 107/110] drbd: fix warning Signed-off-by: Philipp Reisner --- drivers/block/drbd/drbd_nl.c | 2 +- drivers/block/drbd/drbd_receiver.c | 6 ------ 2 files changed, 1 insertion(+), 7 deletions(-) diff --git a/drivers/block/drbd/drbd_nl.c b/drivers/block/drbd/drbd_nl.c index 259ca0b20961..515bcd948a43 100644 --- a/drivers/block/drbd/drbd_nl.c +++ b/drivers/block/drbd/drbd_nl.c @@ -1342,7 +1342,7 @@ static int drbd_nl_detach(struct drbd_conf *mdev, struct drbd_nl_cfg_req *nlp, ret = wait_event_interruptible(mdev->misc_wait, mdev->state.disk != D_FAILED); drbd_resume_io(mdev); - if (retcode == SS_IS_DISKLESS) + if ((int)retcode == (int)SS_IS_DISKLESS) retcode = SS_NOTHING_TO_DO; if (ret) retcode = ERR_INTR; diff --git a/drivers/block/drbd/drbd_receiver.c b/drivers/block/drbd/drbd_receiver.c index d9f2109fd9e6..25d32c5aa50a 100644 --- a/drivers/block/drbd/drbd_receiver.c +++ b/drivers/block/drbd/drbd_receiver.c @@ -2911,12 +2911,6 @@ disconnect: return false; } -static void drbd_setup_order_type(struct drbd_conf *mdev, int peer) -{ - /* sorry, we currently have no working implementation - * of distributed TCQ */ -} - /* warn if the arguments differ by more than 12.5% */ static void warn_if_differ_considerably(struct drbd_conf *mdev, const char *s, sector_t a, sector_t b) From 0ddf72be4edbd7640b57c13161f71416df16ec11 Mon Sep 17 00:00:00 2001 From: Andrew Morton Date: Mon, 23 May 2011 15:29:32 -0700 Subject: [PATCH 108/110] drbd: fix warning In file included from drivers/block/drbd/drbd_main.c:54: drivers/block/drbd/drbd_int.h:1190: warning: parameter has incomplete type Forward declarations of enums do not work. Fix it unpleasantly by moving the prototype. 
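
The warning is easy to reproduce outside the driver. A minimal sketch with made-up names (nothing below is DRBD code):

  enum my_event;                                   /* forward declaration only: the type
                                                      is still incomplete at this point   */
  extern void demo_restart(enum my_event what);    /* gcc: parameter has incomplete type  */

  enum my_event { EV_START, EV_STOP };             /* full definition                     */
  extern void demo_restart_ok(enum my_event what); /* fine: the enum is complete here     */

Moving the tl_restart() prototype into drbd_req.h puts it after the full definition of enum drbd_req_event, which is the same cure as the second prototype in the sketch.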
Cc: Jens Axboe Signed-off-by: Lars Ellenberg Signed-off-by: Philipp Reisner Signed-off-by: Andrew Morton --- drivers/block/drbd/drbd_int.h | 2 -- drivers/block/drbd/drbd_req.h | 1 + 2 files changed, 1 insertion(+), 2 deletions(-) diff --git a/drivers/block/drbd/drbd_int.h b/drivers/block/drbd/drbd_int.h index 7952eb90d17f..b127f8d25b0b 100644 --- a/drivers/block/drbd/drbd_int.h +++ b/drivers/block/drbd/drbd_int.h @@ -1219,8 +1219,6 @@ extern void drbd_free_resources(struct drbd_conf *mdev); extern void tl_release(struct drbd_conf *mdev, unsigned int barrier_nr, unsigned int set_size); extern void tl_clear(struct drbd_conf *mdev); -enum drbd_req_event; -extern void tl_restart(struct drbd_conf *mdev, enum drbd_req_event what); extern void _tl_add_barrier(struct drbd_conf *, struct drbd_tl_epoch *); extern void drbd_free_sock(struct drbd_conf *mdev); extern int drbd_send(struct drbd_conf *mdev, struct socket *sock, diff --git a/drivers/block/drbd/drbd_req.h b/drivers/block/drbd/drbd_req.h index 281342dca2c8..68a234a5fdc5 100644 --- a/drivers/block/drbd/drbd_req.h +++ b/drivers/block/drbd/drbd_req.h @@ -323,6 +323,7 @@ extern int __req_mod(struct drbd_request *req, enum drbd_req_event what, extern void complete_master_bio(struct drbd_conf *mdev, struct bio_and_error *m); extern void request_timer_fn(unsigned long data); +extern void tl_restart(struct drbd_conf *mdev, enum drbd_req_event what); /* use this if you don't want to deal with calling complete_master_bio() * outside the spinlock, e.g. when walking some list on cleanup. */ From 78f4bb367fd147a0e7e3998ba6e47109999d8814 Mon Sep 17 00:00:00 2001 From: Namhyung Kim Date: Tue, 24 May 2011 16:48:54 +0200 Subject: [PATCH 109/110] loop: limit 'max_part' module param to DISK_MAX_PARTS The 'max_part' parameter controls the number of maximum partition a loop block device can have. However if a user specifies very large value it would exceed the limitation of device minor number and can cause a kernel panic (or, at least, produce invalid device nodes in some cases). On my desktop system, following command kills the kernel. On qemu, it triggers similar oops but the kernel was alive: $ sudo modprobe loop max_part0000 ------------[ cut here ]------------ kernel BUG at /media/Linux_Data/project/linux/fs/sysfs/group.c:65! invalid opcode: 0000 [#1] SMP last sysfs file: CPU 0 Modules linked in: loop(+) Pid: 43, comm: insmod Tainted: G W 2.6.39-qemu+ #155 Bochs Bochs RIP: 0010:[] [] internal_create_group= +0x2a/0x170 RSP: 0018:ffff880007b3fde8 EFLAGS: 00000246 RAX: 00000000ffffffef RBX: ffff880007b3d878 RCX: 00000000000007b4 RDX: ffffffff8152da50 RSI: 0000000000000000 RDI: ffff880007b3d878 RBP: ffff880007b3fe38 R08: ffff880007b3fde8 R09: 0000000000000000 R10: ffff88000783b4a8 R11: ffff880007b3d878 R12: ffffffff8152da50 R13: ffff880007b3d868 R14: 0000000000000000 R15: ffff880007b3d800 FS: 0000000002137880(0063) GS:ffff880007c00000(0000) knlGS:00000000000000= 00 CS: 0010 DS: 0000 ES: 0000 CR0: 0000000080050033 CR2: 0000000000422680 CR3: 0000000007b50000 CR4: 00000000000006b0 DR0: 0000000000000000 DR1: 0000000000000000 DR2: 0000000000000000 DR3: 0000000000000000 DR6: 0000000000000000 DR7: 0000000000000000 Process insmod (pid: 43, threadinfo ffff880007b3e000, task ffff880007afb9c= 0) Stack: ffff880007b3fe58 ffffffff811e66dd ffff880007b3fe58 ffffffff811e570b 0000000000000010 ffff880007b3d800 ffff880007a7b390 ffff880007b3d868 0000000000400920 ffff880007b3d800 ffff880007b3fe48 ffffffff8113cfc8 Call Trace: [] ? device_add+0x4bc/0x5af [] ? 
dev_set_name+0x3c/0x3e [] sysfs_create_group+0xe/0x12 [] blk_trace_init_sysfs+0x14/0x16 [] blk_register_queue+0x47/0xf7 [] add_disk+0xdf/0x290 [] loop_init+0xeb/0x1b8 [loop] [] ? 0xffffffffa0005fff [] do_one_initcall+0x7a/0x12e [] sys_init_module+0x9c/0x1e0 [] system_call_fastpath+0x16/0x1b Code: c3 55 48 89 e5 41 57 41 56 41 89 f6 41 55 41 54 49 89 d4 53 48 89 fb= 48 83 ec 28 48 85 ff 74 0b 85 f6 75 0b 48 83 7f 30 00 75 14 <0f> 0b eb fe = 48 83 7f 30 00 b9 ea ff ff ff 0f 84 18 01 00 00 49 RIP [] internal_create_group+0x2a/0x170 RSP ---[ end trace a123eb592043acad ]--- Signed-off-by: Namhyung Kim Cc: Laurent Vivier Cc: stable@kernel.org Signed-off-by: Jens Axboe --- drivers/block/loop.c | 3 +++ 1 file changed, 3 insertions(+) diff --git a/drivers/block/loop.c b/drivers/block/loop.c index a076a14ca72d..cbf7052d1dd5 100644 --- a/drivers/block/loop.c +++ b/drivers/block/loop.c @@ -1691,6 +1691,9 @@ static int __init loop_init(void) if (max_part > 0) part_shift = fls(max_part); + if ((1UL << part_shift) > DISK_MAX_PARTS) + return -EINVAL; + if (max_loop > 1UL << (MINORBITS - part_shift)) return -EINVAL; From a1c15c59feee36267c43142a41152fbf7402afb6 Mon Sep 17 00:00:00 2001 From: Namhyung Kim Date: Tue, 24 May 2011 16:48:55 +0200 Subject: [PATCH 110/110] loop: handle on-demand devices correctly When finding or allocating a loop device, loop_probe() did not take partition numbers into account so that it can result to a different device. Consider following example: $ sudo modprobe loop max_part=15 $ ls -l /dev/loop* brw-rw---- 1 root disk 7, 0 2011-05-24 22:16 /dev/loop0 brw-rw---- 1 root disk 7, 16 2011-05-24 22:16 /dev/loop1 brw-rw---- 1 root disk 7, 32 2011-05-24 22:16 /dev/loop2 brw-rw---- 1 root disk 7, 48 2011-05-24 22:16 /dev/loop3 brw-rw---- 1 root disk 7, 64 2011-05-24 22:16 /dev/loop4 brw-rw---- 1 root disk 7, 80 2011-05-24 22:16 /dev/loop5 brw-rw---- 1 root disk 7, 96 2011-05-24 22:16 /dev/loop6 brw-rw---- 1 root disk 7, 112 2011-05-24 22:16 /dev/loop7 $ sudo mknod /dev/loop8 b 7 128 $ sudo losetup /dev/loop8 ~/temp/disk-with-3-parts.img $ sudo losetup -a /dev/loop128: [0805]:278201 (/home/namhyung/temp/disk-with-3-parts.img) $ ls -l /dev/loop* brw-rw---- 1 root disk 7, 0 2011-05-24 22:16 /dev/loop0 brw-rw---- 1 root disk 7, 16 2011-05-24 22:16 /dev/loop1 brw-rw---- 1 root disk 7, 2048 2011-05-24 22:18 /dev/loop128 brw-rw---- 1 root disk 7, 2049 2011-05-24 22:18 /dev/loop128p1 brw-rw---- 1 root disk 7, 2050 2011-05-24 22:18 /dev/loop128p2 brw-rw---- 1 root disk 7, 2051 2011-05-24 22:18 /dev/loop128p3 brw-rw---- 1 root disk 7, 32 2011-05-24 22:16 /dev/loop2 brw-rw---- 1 root disk 7, 48 2011-05-24 22:16 /dev/loop3 brw-rw---- 1 root disk 7, 64 2011-05-24 22:16 /dev/loop4 brw-rw---- 1 root disk 7, 80 2011-05-24 22:16 /dev/loop5 brw-rw---- 1 root disk 7, 96 2011-05-24 22:16 /dev/loop6 brw-rw---- 1 root disk 7, 112 2011-05-24 22:16 /dev/loop7 brw-r--r-- 1 root root 7, 128 2011-05-24 22:17 /dev/loop8 After this patch, /dev/loop8 - instead of /dev/loop128 - was accessed correctly. In addition, 'range' passed to blk_register_region() should include all range of dev_t that LOOP_MAJOR can address. It does not need to be limited by partition numbers unless 'max_loop' param was specified. 
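
For reference, the minor-number arithmetic described above (together with the DISK_MAX_PARTS limit from the previous patch) can be checked with a small user-space sketch. The constants are copied from the kernel headers, and fls_u32() stands in for the kernel's fls(); everything else is illustrative only.

  #include <stdio.h>

  #define MINORBITS      20
  #define MINORMASK      ((1U << MINORBITS) - 1)
  #define DISK_MAX_PARTS 256

  static int fls_u32(unsigned int x)              /* like the kernel's fls() */
  {
          int r = 0;
          while (x) {
                  r++;
                  x >>= 1;
          }
          return r;
  }

  int main(void)
  {
          unsigned int max_part = 15;             /* modprobe loop max_part=15 */
          int part_shift = fls_u32(max_part);     /* 4                         */
          unsigned int minor = 128;               /* mknod /dev/loop8 b 7 128  */

          /* PATCH 109: refuse shifts that exceed the per-disk partition limit. */
          if ((1UL << part_shift) > DISK_MAX_PARTS)
                  return 1;

          /* PATCH 110: the old lookup used the raw minor, the new one scales it. */
          printf("old loop index: %u\n", minor & MINORMASK);   /* 128 -> loop128 */
          printf("new loop index: %u\n", minor >> part_shift); /*   8 -> loop8   */
          return 0;
  }

The same shift explains the changed 'range' computation: with partitions enabled each loop device owns 1 << part_shift consecutive minors, so the registered region has to cover max_loop << part_shift of them, or the whole minor space when max_loop is 0.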
Signed-off-by: Namhyung Kim Cc: Laurent Vivier Cc: stable@kernel.org Signed-off-by: Jens Axboe --- drivers/block/loop.c | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/drivers/block/loop.c b/drivers/block/loop.c index cbf7052d1dd5..c59a672a3de0 100644 --- a/drivers/block/loop.c +++ b/drivers/block/loop.c @@ -1658,7 +1658,7 @@ static struct kobject *loop_probe(dev_t dev, int *part, void *data) struct kobject *kobj; mutex_lock(&loop_devices_mutex); - lo = loop_init_one(dev & MINORMASK); + lo = loop_init_one(MINOR(dev) >> part_shift); kobj = lo ? get_disk(lo->lo_disk) : ERR_PTR(-ENOMEM); mutex_unlock(&loop_devices_mutex); @@ -1699,10 +1699,10 @@ static int __init loop_init(void) if (max_loop) { nr = max_loop; - range = max_loop; + range = max_loop << part_shift; } else { nr = 8; - range = 1UL << (MINORBITS - part_shift); + range = 1UL << MINORBITS; } if (register_blkdev(LOOP_MAJOR, "loop")) @@ -1741,7 +1741,7 @@ static void __exit loop_exit(void) unsigned long range; struct loop_device *lo, *next; - range = max_loop ? max_loop : 1UL << (MINORBITS - part_shift); + range = max_loop ? max_loop << part_shift : 1UL << MINORBITS; list_for_each_entry_safe(lo, next, &loop_devices, lo_list) loop_del_one(lo);