qemu-e2k/hw/virtio-blk.c

518 lines
14 KiB
C
Raw Normal View History

/*
* Virtio Block Device
*
* Copyright IBM, Corp. 2007
*
* Authors:
* Anthony Liguori <aliguori@us.ibm.com>
*
* This work is licensed under the terms of the GNU GPL, version 2. See
* the COPYING file in the top-level directory.
*
*/
#include <qemu-common.h>
#include "virtio-blk.h"
#ifdef __linux__
# include <scsi/sg.h>
#endif
typedef struct VirtIOBlock
{
VirtIODevice vdev;
BlockDriverState *bs;
VirtQueue *vq;
void *rq;
Fix VM state change handlers running out of order When a VM state change handler changes VM state, other VM state change handlers can see the state transitions out of order. bmdma_map(), scsi_disk_init() and virtio_blk_init() install VM state change handlers to restart DMA. These handlers can vm_stop() by running into a write error on a drive with werror=stop. This throws the VM state change handler callback into disarray. Here's an example case I observed: 0. The virtual IDE drive goes south. All future writes return errors. 1. Something encounters a write error, and duly stops the VM with vm_stop(). 2. vm_stop() calls vm_state_notify(0). 3. vm_state_notify() runs the callbacks in list vm_change_state_head. It contains ide_dma_restart_cb() installed by bmdma_map(). It also contains audio_vm_change_state_handler() installed by audio_init(). 4. audio_vm_change_state_handler() stops audio stuff. 5. User continues VM with monitor command "c". This runs vm_start(). 6. vm_start() calls vm_state_notify(1). 7. vm_state_notify() runs the callbacks in vm_change_state_head. 8. ide_dma_restart_cb() happens to come first. It does its work, runs into a write error, and duly stops the VM with vm_stop(). 9. vm_stop() runs vm_state_notify(0). 10. vm_state_notify() runs the callbacks in vm_change_state_head. 11. audio_vm_change_state_handler() stops audio stuff. Which isn't running. 12. vm_stop() finishes, ide_dma_restart_cb() finishes, step 7's vm_state_notify() resumes running handlers. 13. audio_vm_change_state_handler() starts audio stuff. Oopsie. Fix this by moving the actual write from each VM state change handler into a new bottom half (suggested by Gleb Natapov). Signed-off-by: Markus Armbruster <armbru@redhat.com> Signed-off-by: Anthony Liguori <aliguori@us.ibm.com>
2009-07-28 20:33:41 +02:00
QEMUBH *bh;
BlockConf *conf;
unsigned short sector_mask;
} VirtIOBlock;
static VirtIOBlock *to_virtio_blk(VirtIODevice *vdev)
{
return (VirtIOBlock *)vdev;
}
typedef struct VirtIOBlockReq
{
VirtIOBlock *dev;
VirtQueueElement elem;
struct virtio_blk_inhdr *in;
struct virtio_blk_outhdr *out;
struct virtio_scsi_inhdr *scsi;
QEMUIOVector qiov;
struct VirtIOBlockReq *next;
} VirtIOBlockReq;
static void virtio_blk_req_complete(VirtIOBlockReq *req, int status)
{
VirtIOBlock *s = req->dev;
req->in->status = status;
virtqueue_push(s->vq, &req->elem, req->qiov.size + sizeof(*req->in));
virtio_notify(&s->vdev, s->vq);
qemu_free(req);
}
static int virtio_blk_handle_rw_error(VirtIOBlockReq *req, int error,
int is_read)
{
BlockInterfaceErrorAction action =
drive_get_on_error(req->dev->bs, is_read);
VirtIOBlock *s = req->dev;
if (action == BLOCK_ERR_IGNORE) {
bdrv_mon_event(s->bs, BDRV_ACTION_IGNORE, is_read);
return 0;
}
if ((error == ENOSPC && action == BLOCK_ERR_STOP_ENOSPC)
|| action == BLOCK_ERR_STOP_ANY) {
req->next = s->rq;
s->rq = req;
bdrv_mon_event(s->bs, BDRV_ACTION_STOP, is_read);
vm_stop(0);
} else {
virtio_blk_req_complete(req, VIRTIO_BLK_S_IOERR);
bdrv_mon_event(s->bs, BDRV_ACTION_REPORT, is_read);
}
return 1;
}
static void virtio_blk_rw_complete(void *opaque, int ret)
{
VirtIOBlockReq *req = opaque;
if (ret) {
int is_read = !(req->out->type & VIRTIO_BLK_T_OUT);
if (virtio_blk_handle_rw_error(req, -ret, is_read))
return;
}
virtio_blk_req_complete(req, VIRTIO_BLK_S_OK);
}
static void virtio_blk_flush_complete(void *opaque, int ret)
{
VirtIOBlockReq *req = opaque;
virtio_blk_req_complete(req, ret ? VIRTIO_BLK_S_IOERR : VIRTIO_BLK_S_OK);
}
static VirtIOBlockReq *virtio_blk_alloc_request(VirtIOBlock *s)
{
VirtIOBlockReq *req = qemu_malloc(sizeof(*req));
req->dev = s;
req->qiov.size = 0;
req->next = NULL;
return req;
}
static VirtIOBlockReq *virtio_blk_get_request(VirtIOBlock *s)
{
VirtIOBlockReq *req = virtio_blk_alloc_request(s);
if (req != NULL) {
if (!virtqueue_pop(s->vq, &req->elem)) {
qemu_free(req);
return NULL;
}
}
return req;
}
#ifdef __linux__
static void virtio_blk_handle_scsi(VirtIOBlockReq *req)
{
struct sg_io_hdr hdr;
int ret;
int status;
int i;
/*
* We require at least one output segment each for the virtio_blk_outhdr
* and the SCSI command block.
*
* We also at least require the virtio_blk_inhdr, the virtio_scsi_inhdr
* and the sense buffer pointer in the input segments.
*/
if (req->elem.out_num < 2 || req->elem.in_num < 3) {
virtio_blk_req_complete(req, VIRTIO_BLK_S_IOERR);
return;
}
/*
* No support for bidirection commands yet.
*/
if (req->elem.out_num > 2 && req->elem.in_num > 3) {
virtio_blk_req_complete(req, VIRTIO_BLK_S_UNSUPP);
return;
}
/*
* The scsi inhdr is placed in the second-to-last input segment, just
* before the regular inhdr.
*/
req->scsi = (void *)req->elem.in_sg[req->elem.in_num - 2].iov_base;
memset(&hdr, 0, sizeof(struct sg_io_hdr));
hdr.interface_id = 'S';
hdr.cmd_len = req->elem.out_sg[1].iov_len;
hdr.cmdp = req->elem.out_sg[1].iov_base;
hdr.dxfer_len = 0;
if (req->elem.out_num > 2) {
/*
* If there are more than the minimally required 2 output segments
* there is write payload starting from the third iovec.
*/
hdr.dxfer_direction = SG_DXFER_TO_DEV;
hdr.iovec_count = req->elem.out_num - 2;
for (i = 0; i < hdr.iovec_count; i++)
hdr.dxfer_len += req->elem.out_sg[i + 2].iov_len;
hdr.dxferp = req->elem.out_sg + 2;
} else if (req->elem.in_num > 3) {
/*
* If we have more than 3 input segments the guest wants to actually
* read data.
*/
hdr.dxfer_direction = SG_DXFER_FROM_DEV;
hdr.iovec_count = req->elem.in_num - 3;
for (i = 0; i < hdr.iovec_count; i++)
hdr.dxfer_len += req->elem.in_sg[i].iov_len;
hdr.dxferp = req->elem.in_sg;
} else {
/*
* Some SCSI commands don't actually transfer any data.
*/
hdr.dxfer_direction = SG_DXFER_NONE;
}
hdr.sbp = req->elem.in_sg[req->elem.in_num - 3].iov_base;
hdr.mx_sb_len = req->elem.in_sg[req->elem.in_num - 3].iov_len;
ret = bdrv_ioctl(req->dev->bs, SG_IO, &hdr);
if (ret) {
status = VIRTIO_BLK_S_UNSUPP;
hdr.status = ret;
hdr.resid = hdr.dxfer_len;
} else if (hdr.status) {
status = VIRTIO_BLK_S_IOERR;
} else {
status = VIRTIO_BLK_S_OK;
}
req->scsi->errors = hdr.status;
req->scsi->residual = hdr.resid;
req->scsi->sense_len = hdr.sb_len_wr;
req->scsi->data_len = hdr.dxfer_len;
virtio_blk_req_complete(req, status);
}
#else
static void virtio_blk_handle_scsi(VirtIOBlockReq *req)
{
virtio_blk_req_complete(req, VIRTIO_BLK_S_UNSUPP);
}
#endif /* __linux__ */
static void do_multiwrite(BlockDriverState *bs, BlockRequest *blkreq,
int num_writes)
{
int i, ret;
ret = bdrv_aio_multiwrite(bs, blkreq, num_writes);
if (ret != 0) {
for (i = 0; i < num_writes; i++) {
if (blkreq[i].error) {
virtio_blk_rw_complete(blkreq[i].opaque, -EIO);
}
}
}
}
static void virtio_blk_handle_flush(BlockRequest *blkreq, int *num_writes,
VirtIOBlockReq *req, BlockDriverState **old_bs)
{
BlockDriverAIOCB *acb;
/*
* Make sure all outstanding writes are posted to the backing device.
*/
if (*old_bs != NULL) {
do_multiwrite(*old_bs, blkreq, *num_writes);
}
*num_writes = 0;
*old_bs = req->dev->bs;
acb = bdrv_aio_flush(req->dev->bs, virtio_blk_flush_complete, req);
if (!acb) {
virtio_blk_req_complete(req, VIRTIO_BLK_S_IOERR);
}
}
static void virtio_blk_handle_write(BlockRequest *blkreq, int *num_writes,
VirtIOBlockReq *req, BlockDriverState **old_bs)
{
if (req->out->sector & req->dev->sector_mask) {
virtio_blk_rw_complete(req, -EIO);
return;
}
if (req->dev->bs != *old_bs || *num_writes == 32) {
if (*old_bs != NULL) {
do_multiwrite(*old_bs, blkreq, *num_writes);
}
*num_writes = 0;
*old_bs = req->dev->bs;
}
blkreq[*num_writes].sector = req->out->sector;
blkreq[*num_writes].nb_sectors = req->qiov.size / BDRV_SECTOR_SIZE;
blkreq[*num_writes].qiov = &req->qiov;
blkreq[*num_writes].cb = virtio_blk_rw_complete;
blkreq[*num_writes].opaque = req;
blkreq[*num_writes].error = 0;
(*num_writes)++;
}
static void virtio_blk_handle_read(VirtIOBlockReq *req)
{
BlockDriverAIOCB *acb;
if (req->out->sector & req->dev->sector_mask) {
virtio_blk_rw_complete(req, -EIO);
return;
}
acb = bdrv_aio_readv(req->dev->bs, req->out->sector, &req->qiov,
req->qiov.size / BDRV_SECTOR_SIZE,
virtio_blk_rw_complete, req);
if (!acb) {
virtio_blk_rw_complete(req, -EIO);
}
}
typedef struct MultiReqBuffer {
BlockRequest blkreq[32];
int num_writes;
BlockDriverState *old_bs;
} MultiReqBuffer;
static void virtio_blk_handle_request(VirtIOBlockReq *req,
MultiReqBuffer *mrb)
{
if (req->elem.out_num < 1 || req->elem.in_num < 1) {
fprintf(stderr, "virtio-blk missing headers\n");
exit(1);
}
if (req->elem.out_sg[0].iov_len < sizeof(*req->out) ||
req->elem.in_sg[req->elem.in_num - 1].iov_len < sizeof(*req->in)) {
fprintf(stderr, "virtio-blk header not in correct element\n");
exit(1);
}
req->out = (void *)req->elem.out_sg[0].iov_base;
req->in = (void *)req->elem.in_sg[req->elem.in_num - 1].iov_base;
if (req->out->type & VIRTIO_BLK_T_FLUSH) {
virtio_blk_handle_flush(mrb->blkreq, &mrb->num_writes,
req, &mrb->old_bs);
} else if (req->out->type & VIRTIO_BLK_T_SCSI_CMD) {
virtio_blk_handle_scsi(req);
} else if (req->out->type & VIRTIO_BLK_T_OUT) {
qemu_iovec_init_external(&req->qiov, &req->elem.out_sg[1],
req->elem.out_num - 1);
virtio_blk_handle_write(mrb->blkreq, &mrb->num_writes,
req, &mrb->old_bs);
} else {
qemu_iovec_init_external(&req->qiov, &req->elem.in_sg[0],
req->elem.in_num - 1);
virtio_blk_handle_read(req);
}
}
static void virtio_blk_handle_output(VirtIODevice *vdev, VirtQueue *vq)
{
VirtIOBlock *s = to_virtio_blk(vdev);
VirtIOBlockReq *req;
MultiReqBuffer mrb = {
.num_writes = 0,
.old_bs = NULL,
};
while ((req = virtio_blk_get_request(s))) {
virtio_blk_handle_request(req, &mrb);
}
if (mrb.num_writes > 0) {
do_multiwrite(mrb.old_bs, mrb.blkreq, mrb.num_writes);
}
/*
* FIXME: Want to check for completions before returning to guest mode,
* so cached reads and writes are reported as quickly as possible. But
* that should be done in the generic block layer.
*/
}
Fix VM state change handlers running out of order When a VM state change handler changes VM state, other VM state change handlers can see the state transitions out of order. bmdma_map(), scsi_disk_init() and virtio_blk_init() install VM state change handlers to restart DMA. These handlers can vm_stop() by running into a write error on a drive with werror=stop. This throws the VM state change handler callback into disarray. Here's an example case I observed: 0. The virtual IDE drive goes south. All future writes return errors. 1. Something encounters a write error, and duly stops the VM with vm_stop(). 2. vm_stop() calls vm_state_notify(0). 3. vm_state_notify() runs the callbacks in list vm_change_state_head. It contains ide_dma_restart_cb() installed by bmdma_map(). It also contains audio_vm_change_state_handler() installed by audio_init(). 4. audio_vm_change_state_handler() stops audio stuff. 5. User continues VM with monitor command "c". This runs vm_start(). 6. vm_start() calls vm_state_notify(1). 7. vm_state_notify() runs the callbacks in vm_change_state_head. 8. ide_dma_restart_cb() happens to come first. It does its work, runs into a write error, and duly stops the VM with vm_stop(). 9. vm_stop() runs vm_state_notify(0). 10. vm_state_notify() runs the callbacks in vm_change_state_head. 11. audio_vm_change_state_handler() stops audio stuff. Which isn't running. 12. vm_stop() finishes, ide_dma_restart_cb() finishes, step 7's vm_state_notify() resumes running handlers. 13. audio_vm_change_state_handler() starts audio stuff. Oopsie. Fix this by moving the actual write from each VM state change handler into a new bottom half (suggested by Gleb Natapov). Signed-off-by: Markus Armbruster <armbru@redhat.com> Signed-off-by: Anthony Liguori <aliguori@us.ibm.com>
2009-07-28 20:33:41 +02:00
static void virtio_blk_dma_restart_bh(void *opaque)
{
VirtIOBlock *s = opaque;
VirtIOBlockReq *req = s->rq;
MultiReqBuffer mrb = {
.num_writes = 0,
.old_bs = NULL,
};
Fix VM state change handlers running out of order When a VM state change handler changes VM state, other VM state change handlers can see the state transitions out of order. bmdma_map(), scsi_disk_init() and virtio_blk_init() install VM state change handlers to restart DMA. These handlers can vm_stop() by running into a write error on a drive with werror=stop. This throws the VM state change handler callback into disarray. Here's an example case I observed: 0. The virtual IDE drive goes south. All future writes return errors. 1. Something encounters a write error, and duly stops the VM with vm_stop(). 2. vm_stop() calls vm_state_notify(0). 3. vm_state_notify() runs the callbacks in list vm_change_state_head. It contains ide_dma_restart_cb() installed by bmdma_map(). It also contains audio_vm_change_state_handler() installed by audio_init(). 4. audio_vm_change_state_handler() stops audio stuff. 5. User continues VM with monitor command "c". This runs vm_start(). 6. vm_start() calls vm_state_notify(1). 7. vm_state_notify() runs the callbacks in vm_change_state_head. 8. ide_dma_restart_cb() happens to come first. It does its work, runs into a write error, and duly stops the VM with vm_stop(). 9. vm_stop() runs vm_state_notify(0). 10. vm_state_notify() runs the callbacks in vm_change_state_head. 11. audio_vm_change_state_handler() stops audio stuff. Which isn't running. 12. vm_stop() finishes, ide_dma_restart_cb() finishes, step 7's vm_state_notify() resumes running handlers. 13. audio_vm_change_state_handler() starts audio stuff. Oopsie. Fix this by moving the actual write from each VM state change handler into a new bottom half (suggested by Gleb Natapov). Signed-off-by: Markus Armbruster <armbru@redhat.com> Signed-off-by: Anthony Liguori <aliguori@us.ibm.com>
2009-07-28 20:33:41 +02:00
qemu_bh_delete(s->bh);
s->bh = NULL;
s->rq = NULL;
while (req) {
virtio_blk_handle_request(req, &mrb);
req = req->next;
}
if (mrb.num_writes > 0) {
do_multiwrite(mrb.old_bs, mrb.blkreq, mrb.num_writes);
}
}
Fix VM state change handlers running out of order When a VM state change handler changes VM state, other VM state change handlers can see the state transitions out of order. bmdma_map(), scsi_disk_init() and virtio_blk_init() install VM state change handlers to restart DMA. These handlers can vm_stop() by running into a write error on a drive with werror=stop. This throws the VM state change handler callback into disarray. Here's an example case I observed: 0. The virtual IDE drive goes south. All future writes return errors. 1. Something encounters a write error, and duly stops the VM with vm_stop(). 2. vm_stop() calls vm_state_notify(0). 3. vm_state_notify() runs the callbacks in list vm_change_state_head. It contains ide_dma_restart_cb() installed by bmdma_map(). It also contains audio_vm_change_state_handler() installed by audio_init(). 4. audio_vm_change_state_handler() stops audio stuff. 5. User continues VM with monitor command "c". This runs vm_start(). 6. vm_start() calls vm_state_notify(1). 7. vm_state_notify() runs the callbacks in vm_change_state_head. 8. ide_dma_restart_cb() happens to come first. It does its work, runs into a write error, and duly stops the VM with vm_stop(). 9. vm_stop() runs vm_state_notify(0). 10. vm_state_notify() runs the callbacks in vm_change_state_head. 11. audio_vm_change_state_handler() stops audio stuff. Which isn't running. 12. vm_stop() finishes, ide_dma_restart_cb() finishes, step 7's vm_state_notify() resumes running handlers. 13. audio_vm_change_state_handler() starts audio stuff. Oopsie. Fix this by moving the actual write from each VM state change handler into a new bottom half (suggested by Gleb Natapov). Signed-off-by: Markus Armbruster <armbru@redhat.com> Signed-off-by: Anthony Liguori <aliguori@us.ibm.com>
2009-07-28 20:33:41 +02:00
static void virtio_blk_dma_restart_cb(void *opaque, int running, int reason)
{
VirtIOBlock *s = opaque;
if (!running)
return;
if (!s->bh) {
s->bh = qemu_bh_new(virtio_blk_dma_restart_bh, s);
qemu_bh_schedule(s->bh);
}
}
static void virtio_blk_reset(VirtIODevice *vdev)
{
/*
* This should cancel pending requests, but can't do nicely until there
* are per-device request lists.
*/
qemu_aio_flush();
}
/* coalesce internal state, copy to pci i/o region 0
*/
static void virtio_blk_update_config(VirtIODevice *vdev, uint8_t *config)
{
VirtIOBlock *s = to_virtio_blk(vdev);
struct virtio_blk_config blkcfg;
uint64_t capacity;
int cylinders, heads, secs;
bdrv_get_geometry(s->bs, &capacity);
bdrv_get_geometry_hint(s->bs, &cylinders, &heads, &secs);
memset(&blkcfg, 0, sizeof(blkcfg));
stq_raw(&blkcfg.capacity, capacity);
stl_raw(&blkcfg.seg_max, 128 - 2);
stw_raw(&blkcfg.cylinders, cylinders);
blkcfg.heads = heads;
blkcfg.sectors = secs & ~s->sector_mask;
blkcfg.blk_size = s->conf->logical_block_size;
blkcfg.size_max = 0;
blkcfg.physical_block_exp = get_physical_block_exp(s->conf);
blkcfg.alignment_offset = 0;
blkcfg.min_io_size = s->conf->min_io_size / blkcfg.blk_size;
blkcfg.opt_io_size = s->conf->opt_io_size / blkcfg.blk_size;
memcpy(config, &blkcfg, sizeof(struct virtio_blk_config));
}
static uint32_t virtio_blk_get_features(VirtIODevice *vdev, uint32_t features)
{
VirtIOBlock *s = to_virtio_blk(vdev);
features |= (1 << VIRTIO_BLK_F_SEG_MAX);
features |= (1 << VIRTIO_BLK_F_GEOMETRY);
features |= (1 << VIRTIO_BLK_F_TOPOLOGY);
features |= (1 << VIRTIO_BLK_F_BLK_SIZE);
if (bdrv_enable_write_cache(s->bs))
features |= (1 << VIRTIO_BLK_F_WCACHE);
if (bdrv_is_read_only(s->bs))
features |= 1 << VIRTIO_BLK_F_RO;
return features;
}
static void virtio_blk_save(QEMUFile *f, void *opaque)
{
VirtIOBlock *s = opaque;
VirtIOBlockReq *req = s->rq;
virtio_save(&s->vdev, f);
while (req) {
qemu_put_sbyte(f, 1);
qemu_put_buffer(f, (unsigned char*)&req->elem, sizeof(req->elem));
req = req->next;
}
qemu_put_sbyte(f, 0);
}
static int virtio_blk_load(QEMUFile *f, void *opaque, int version_id)
{
VirtIOBlock *s = opaque;
if (version_id != 2)
return -EINVAL;
virtio_load(&s->vdev, f);
while (qemu_get_sbyte(f)) {
VirtIOBlockReq *req = virtio_blk_alloc_request(s);
qemu_get_buffer(f, (unsigned char*)&req->elem, sizeof(req->elem));
req->next = s->rq;
s->rq = req->next;
}
return 0;
}
block: add topology qdev properties Add three new qdev properties to export block topology information to the guest. This is needed to get optimal I/O alignment for RAID arrays or SSDs. The options are: - physical_block_size to specify the physical block size of the device, this is going to increase from 512 bytes to 4096 kilobytes for many modern storage devices - min_io_size to specify the minimal I/O size without performance impact, this is typically set to the RAID chunk size for arrays. - opt_io_size to specify the optimal sustained I/O size, this is typically the RAID stripe width for arrays. I decided to not auto-probe these values from blkid which might easily be possible as I don't know how to deal with these issues on migration. Note that we specificly only set the physical_block_size, and not the logial one which is the unit all I/O is described in. The reason for that is that IDE does not support increasing the logical block size and at last for now I want to stick to one meachnisms in queue and allow for easy switching of transports for a given backing image which would not be possible if scsi and virtio use real 4k sectors, while ide only uses the physical block exponent. To make this more common for the different block drivers introduce a new BlockConf structure holding all common block properties and a DEFINE_BLOCK_PROPERTIES macro to add them all together, mirroring what is done for network drivers. Also switch over all block drivers to use it, except for the floppy driver which has weird driveA/driveB properties and probably won't require any advanced block options ever. Example usage for a virtio device with 4k physical block size and 8k optimal I/O size: -drive file=scratch.img,media=disk,cache=none,id=scratch \ -device virtio-blk-pci,drive=scratch,physical_block_size=4096,opt_io_size=8192 aliguori: updated patch to take into account BLOCK events Signed-off-by: Christoph Hellwig <hch@lst.de> Signed-off-by: Anthony Liguori <aliguori@us.ibm.com>
2010-02-10 23:37:09 +01:00
VirtIODevice *virtio_blk_init(DeviceState *dev, BlockConf *conf)
{
VirtIOBlock *s;
int cylinders, heads, secs;
static int virtio_blk_id;
s = (VirtIOBlock *)virtio_common_init("virtio-blk", VIRTIO_ID_BLOCK,
sizeof(struct virtio_blk_config),
sizeof(VirtIOBlock));
s->vdev.get_config = virtio_blk_update_config;
s->vdev.get_features = virtio_blk_get_features;
s->vdev.reset = virtio_blk_reset;
block: add topology qdev properties Add three new qdev properties to export block topology information to the guest. This is needed to get optimal I/O alignment for RAID arrays or SSDs. The options are: - physical_block_size to specify the physical block size of the device, this is going to increase from 512 bytes to 4096 kilobytes for many modern storage devices - min_io_size to specify the minimal I/O size without performance impact, this is typically set to the RAID chunk size for arrays. - opt_io_size to specify the optimal sustained I/O size, this is typically the RAID stripe width for arrays. I decided to not auto-probe these values from blkid which might easily be possible as I don't know how to deal with these issues on migration. Note that we specificly only set the physical_block_size, and not the logial one which is the unit all I/O is described in. The reason for that is that IDE does not support increasing the logical block size and at last for now I want to stick to one meachnisms in queue and allow for easy switching of transports for a given backing image which would not be possible if scsi and virtio use real 4k sectors, while ide only uses the physical block exponent. To make this more common for the different block drivers introduce a new BlockConf structure holding all common block properties and a DEFINE_BLOCK_PROPERTIES macro to add them all together, mirroring what is done for network drivers. Also switch over all block drivers to use it, except for the floppy driver which has weird driveA/driveB properties and probably won't require any advanced block options ever. Example usage for a virtio device with 4k physical block size and 8k optimal I/O size: -drive file=scratch.img,media=disk,cache=none,id=scratch \ -device virtio-blk-pci,drive=scratch,physical_block_size=4096,opt_io_size=8192 aliguori: updated patch to take into account BLOCK events Signed-off-by: Christoph Hellwig <hch@lst.de> Signed-off-by: Anthony Liguori <aliguori@us.ibm.com>
2010-02-10 23:37:09 +01:00
s->bs = conf->dinfo->bdrv;
s->conf = conf;
s->rq = NULL;
s->sector_mask = (s->conf->logical_block_size / BDRV_SECTOR_SIZE) - 1;
bdrv_guess_geometry(s->bs, &cylinders, &heads, &secs);
s->vq = virtio_add_queue(&s->vdev, 128, virtio_blk_handle_output);
qemu_add_vm_change_state_handler(virtio_blk_dma_restart_cb, s);
register_savevm("virtio-blk", virtio_blk_id++, 2,
virtio_blk_save, virtio_blk_load, s);
return &s->vdev;
}