From f35d9d8aae08940b7fdd1bb8110619da2ece6b28 Mon Sep 17 00:00:00 2001 From: Rusty Russell Date: Mon, 4 Feb 2008 23:49:54 -0500 Subject: [PATCH 01/25] virtio: Implement skb_partial_csum_set, for setting partial csums on untrusted packets. Use it in virtio_net (replacing buggy version there), it's also going to be used by TAP for partial csum support. Signed-off-by: Rusty Russell Acked-by: David S. Miller --- drivers/net/virtio_net.c | 11 +---------- include/linux/skbuff.h | 1 + net/core/skbuff.c | 29 +++++++++++++++++++++++++++++ 3 files changed, 31 insertions(+), 10 deletions(-) diff --git a/drivers/net/virtio_net.c b/drivers/net/virtio_net.c index 5413dbf3d4ac..a60505c8f82a 100644 --- a/drivers/net/virtio_net.c +++ b/drivers/net/virtio_net.c @@ -83,17 +83,8 @@ static void receive_skb(struct net_device *dev, struct sk_buff *skb, if (hdr->flags & VIRTIO_NET_HDR_F_NEEDS_CSUM) { pr_debug("Needs csum!\n"); - skb->ip_summed = CHECKSUM_PARTIAL; - skb->csum_start = hdr->csum_start; - skb->csum_offset = hdr->csum_offset; - if (skb->csum_start > skb->len - 2 - || skb->csum_offset > skb->len - 2) { - if (net_ratelimit()) - printk(KERN_WARNING "%s: csum=%u/%u len=%u\n", - dev->name, skb->csum_start, - skb->csum_offset, skb->len); + if (!skb_partial_csum_set(skb,hdr->csum_start,hdr->csum_offset)) goto frame_err; - } } if (hdr->gso_type != VIRTIO_NET_HDR_GSO_NONE) { diff --git a/include/linux/skbuff.h b/include/linux/skbuff.h index dfe975a9967e..412672a79e8a 100644 --- a/include/linux/skbuff.h +++ b/include/linux/skbuff.h @@ -1810,5 +1810,6 @@ static inline void skb_forward_csum(struct sk_buff *skb) skb->ip_summed = CHECKSUM_NONE; } +bool skb_partial_csum_set(struct sk_buff *skb, u16 start, u16 off); #endif /* __KERNEL__ */ #endif /* _LINUX_SKBUFF_H */ diff --git a/net/core/skbuff.c b/net/core/skbuff.c index 98420f9c4b6d..4e354221ec23 100644 --- a/net/core/skbuff.c +++ b/net/core/skbuff.c @@ -2461,6 +2461,34 @@ int skb_cow_data(struct sk_buff *skb, int tailbits, struct sk_buff **trailer) return elt; } +/** + * skb_partial_csum_set - set up and verify partial csum values for packet + * @skb: the skb to set + * @start: the number of bytes after skb->data to start checksumming. + * @off: the offset from start to place the checksum. + * + * For untrusted partially-checksummed packets, we need to make sure the values + * for skb->csum_start and skb->csum_offset are valid so we don't oops. + * + * This function checks and sets those values and skb->ip_summed: if this + * returns false you should drop the packet. + */ +bool skb_partial_csum_set(struct sk_buff *skb, u16 start, u16 off) +{ + if (unlikely(start > skb->len - 2) || + unlikely((int)start + off > skb->len - 2)) { + if (net_ratelimit()) + printk(KERN_WARNING + "bad partial csum: csum=%u/%u len=%u\n", + start, off, skb->len); + return false; + } + skb->ip_summed = CHECKSUM_PARTIAL; + skb->csum_start = skb_headroom(skb) + start; + skb->csum_offset = off; + return true; +} + EXPORT_SYMBOL(___pskb_trim); EXPORT_SYMBOL(__kfree_skb); EXPORT_SYMBOL(kfree_skb); @@ -2497,3 +2525,4 @@ EXPORT_SYMBOL(skb_append_datato_frags); EXPORT_SYMBOL_GPL(skb_to_sgvec); EXPORT_SYMBOL_GPL(skb_cow_data); +EXPORT_SYMBOL_GPL(skb_partial_csum_set); From a586d4f6016f7139d8c26df0e6927131168d3b5b Mon Sep 17 00:00:00 2001 From: Rusty Russell Date: Mon, 4 Feb 2008 23:49:56 -0500 Subject: [PATCH 02/25] virtio: simplify config mechanism. Previously we used a type/len pair within the config space, but this seems overkill. We now simply define a structure which represents the layout in the config space: the config space can now only be extended at the end. The main driver-visible changes: 1) We indicate what fields are present with an explicit feature bit. 2) Virtqueues are explicitly numbered, and not in the config space. Signed-off-by: Rusty Russell --- Documentation/lguest/lguest.c | 178 +++++++++++++++++++------------- drivers/block/virtio_blk.c | 35 +++---- drivers/char/virtio_console.c | 4 +- drivers/lguest/lguest_device.c | 128 +++++++++++++---------- drivers/net/virtio_net.c | 25 ++--- drivers/virtio/virtio.c | 45 -------- include/linux/lguest_launcher.h | 9 +- include/linux/virtio_blk.h | 20 ++-- include/linux/virtio_config.h | 98 +++++++++--------- include/linux/virtio_net.h | 11 +- net/9p/trans_virtio.c | 4 +- 11 files changed, 278 insertions(+), 279 deletions(-) diff --git a/Documentation/lguest/lguest.c b/Documentation/lguest/lguest.c index 6c8a2386cd50..4df1804169dc 100644 --- a/Documentation/lguest/lguest.c +++ b/Documentation/lguest/lguest.c @@ -34,6 +34,8 @@ #include #include #include +#include +#include #include "linux/lguest_launcher.h" #include "linux/virtio_config.h" #include "linux/virtio_net.h" @@ -99,13 +101,11 @@ struct device_list /* The descriptor page for the devices. */ u8 *descpage; - /* The tail of the last descriptor. */ - unsigned int desc_used; - /* A single linked list of devices. */ struct device *dev; - /* ... And an end pointer so we can easily append new devices */ - struct device **lastdev; + /* And a pointer to the last device for easy append and also for + * configuration appending. */ + struct device *lastdev; }; /* The list of Guest devices, based on command line arguments. */ @@ -191,7 +191,7 @@ static void *_convert(struct iovec *iov, size_t size, size_t align, #define cpu_to_le64(v64) (v64) #define le16_to_cpu(v16) (v16) #define le32_to_cpu(v32) (v32) -#define le64_to_cpu(v32) (v64) +#define le64_to_cpu(v64) (v64) /*L:100 The Launcher code itself takes us out into userspace, that scary place * where pointers run wild and free! Unfortunately, like most userspace @@ -986,54 +986,44 @@ static void handle_input(int fd) * * All devices need a descriptor so the Guest knows it exists, and a "struct * device" so the Launcher can keep track of it. We have common helper - * routines to allocate them. - * - * This routine allocates a new "struct lguest_device_desc" from descriptor - * table just above the Guest's normal memory. It returns a pointer to that - * descriptor. */ + * routines to allocate and manage them. */ + +/* The layout of the device page is a "struct lguest_device_desc" followed by a + * number of virtqueue descriptors, then two sets of feature bits, then an + * array of configuration bytes. This routine returns the configuration + * pointer. */ +static u8 *device_config(const struct device *dev) +{ + return (void *)(dev->desc + 1) + + dev->desc->num_vq * sizeof(struct lguest_vqconfig) + + dev->desc->feature_len * 2; +} + +/* This routine allocates a new "struct lguest_device_desc" from descriptor + * table page just above the Guest's normal memory. It returns a pointer to + * that descriptor. */ static struct lguest_device_desc *new_dev_desc(u16 type) { - struct lguest_device_desc *d; + struct lguest_device_desc d = { .type = type }; + void *p; + + /* Figure out where the next device config is, based on the last one. */ + if (devices.lastdev) + p = device_config(devices.lastdev) + + devices.lastdev->desc->config_len; + else + p = devices.descpage; /* We only have one page for all the descriptors. */ - if (devices.desc_used + sizeof(*d) > getpagesize()) + if (p + sizeof(d) > (void *)devices.descpage + getpagesize()) errx(1, "Too many devices"); - /* We don't need to set config_len or status: page is 0 already. */ - d = (void *)devices.descpage + devices.desc_used; - d->type = type; - devices.desc_used += sizeof(*d); - - return d; + /* p might not be aligned, so we memcpy in. */ + return memcpy(p, &d, sizeof(d)); } -/* Each device descriptor is followed by some configuration information. - * Each configuration field looks like: u8 type, u8 len, [... len bytes...]. - * - * This routine adds a new field to an existing device's descriptor. It only - * works for the last device, but that's OK because that's how we use it. */ -static void add_desc_field(struct device *dev, u8 type, u8 len, const void *c) -{ - /* This is the last descriptor, right? */ - assert(devices.descpage + devices.desc_used - == (u8 *)(dev->desc + 1) + dev->desc->config_len); - - /* We only have one page of device descriptions. */ - if (devices.desc_used + 2 + len > getpagesize()) - errx(1, "Too many devices"); - - /* Copy in the new config header: type then length. */ - devices.descpage[devices.desc_used++] = type; - devices.descpage[devices.desc_used++] = len; - memcpy(devices.descpage + devices.desc_used, c, len); - devices.desc_used += len; - - /* Update the device descriptor length: two byte head then data. */ - dev->desc->config_len += 2 + len; -} - -/* This routine adds a virtqueue to a device. We specify how many descriptors - * the virtqueue is to have. */ +/* Each device descriptor is followed by the description of its virtqueues. We + * specify how many descriptors the virtqueue is to have. */ static void add_virtqueue(struct device *dev, unsigned int num_descs, void (*handle_output)(int fd, struct virtqueue *me)) { @@ -1059,9 +1049,15 @@ static void add_virtqueue(struct device *dev, unsigned int num_descs, /* Initialize the vring. */ vring_init(&vq->vring, num_descs, p, getpagesize()); - /* Add the configuration information to this device's descriptor. */ - add_desc_field(dev, VIRTIO_CONFIG_F_VIRTQUEUE, - sizeof(vq->config), &vq->config); + /* Append virtqueue to this device's descriptor. We use + * device_config() to get the end of the device's current virtqueues; + * we check that we haven't added any config or feature information + * yet, otherwise we'd be overwriting them. */ + assert(dev->desc->config_len == 0 && dev->desc->feature_len == 0); + memcpy(device_config(dev), &vq->config, sizeof(vq->config)); + dev->desc->num_vq++; + + verbose("Virtqueue page %#lx\n", to_guest_phys(p)); /* Add to tail of list, so dev->vq is first vq, dev->vq->next is * second. */ @@ -1077,6 +1073,37 @@ static void add_virtqueue(struct device *dev, unsigned int num_descs, vq->vring.used->flags = VRING_USED_F_NO_NOTIFY; } +/* The virtqueue descriptors are followed by feature bytes. */ +static void add_feature(struct device *dev, unsigned bit) +{ + u8 *features; + + /* We can't extend the feature bits once we've added config bytes */ + if (dev->desc->feature_len <= bit / CHAR_BIT) { + assert(dev->desc->config_len == 0); + dev->desc->feature_len = (bit / CHAR_BIT) + 1; + } + + features = (u8 *)(dev->desc + 1) + + dev->desc->num_vq * sizeof(struct lguest_vqconfig); + + features[bit / CHAR_BIT] |= (1 << (bit % CHAR_BIT)); +} + +/* This routine sets the configuration fields for an existing device's + * descriptor. It only works for the last device, but that's OK because that's + * how we use it. */ +static void set_config(struct device *dev, unsigned len, const void *conf) +{ + /* Check we haven't overflowed our single page. */ + if (device_config(dev) + len > devices.descpage + getpagesize()) + errx(1, "Too many devices"); + + /* Copy in the config information, and store the length. */ + memcpy(device_config(dev), conf, len); + dev->desc->config_len = len; +} + /* This routine does all the creation and setup of a new device, including * calling new_dev_desc() to allocate the descriptor and device memory. */ static struct device *new_device(const char *name, u16 type, int fd, @@ -1084,14 +1111,6 @@ static struct device *new_device(const char *name, u16 type, int fd, { struct device *dev = malloc(sizeof(*dev)); - /* Append to device list. Prepending to a single-linked list is - * easier, but the user expects the devices to be arranged on the bus - * in command-line order. The first network device on the command line - * is eth0, the first block device /dev/vda, etc. */ - *devices.lastdev = dev; - dev->next = NULL; - devices.lastdev = &dev->next; - /* Now we populate the fields one at a time. */ dev->fd = fd; /* If we have an input handler for this file descriptor, then we add it @@ -1102,6 +1121,17 @@ static struct device *new_device(const char *name, u16 type, int fd, dev->handle_input = handle_input; dev->name = name; dev->vq = NULL; + + /* Append to device list. Prepending to a single-linked list is + * easier, but the user expects the devices to be arranged on the bus + * in command-line order. The first network device on the command line + * is eth0, the first block device /dev/vda, etc. */ + if (devices.lastdev) + devices.lastdev->next = dev; + else + devices.dev = dev; + devices.lastdev = dev; + return dev; } @@ -1226,7 +1256,7 @@ static void setup_tun_net(const char *arg) int netfd, ipfd; u32 ip; const char *br_name = NULL; - u8 hwaddr[6]; + struct virtio_net_config conf; /* We open the /dev/net/tun device and tell it we want a tap device. A * tap device is like a tun device, only somehow different. To tell @@ -1265,12 +1295,13 @@ static void setup_tun_net(const char *arg) ip = str2ip(arg); /* Set up the tun device, and get the mac address for the interface. */ - configure_device(ipfd, ifr.ifr_name, ip, hwaddr); + configure_device(ipfd, ifr.ifr_name, ip, conf.mac); /* Tell Guest what MAC address to use. */ - add_desc_field(dev, VIRTIO_CONFIG_NET_MAC_F, sizeof(hwaddr), hwaddr); + add_feature(dev, VIRTIO_NET_F_MAC); + set_config(dev, sizeof(conf), &conf); - /* We don't seed the socket any more; setup is done. */ + /* We don't need the socket any more; setup is done. */ close(ipfd); verbose("device %u: tun net %u.%u.%u.%u\n", @@ -1458,8 +1489,7 @@ static void setup_block_file(const char *filename) struct device *dev; struct vblk_info *vblk; void *stack; - u64 cap; - unsigned int val; + struct virtio_blk_config conf; /* This is the pipe the I/O thread will use to tell us I/O is done. */ pipe(p); @@ -1477,14 +1507,18 @@ static void setup_block_file(const char *filename) vblk->fd = open_or_die(filename, O_RDWR|O_LARGEFILE); vblk->len = lseek64(vblk->fd, 0, SEEK_END); + /* We support barriers. */ + add_feature(dev, VIRTIO_BLK_F_BARRIER); + /* Tell Guest how many sectors this device has. */ - cap = cpu_to_le64(vblk->len / 512); - add_desc_field(dev, VIRTIO_CONFIG_BLK_F_CAPACITY, sizeof(cap), &cap); + conf.capacity = cpu_to_le64(vblk->len / 512); /* Tell Guest not to put in too many descriptors at once: two are used * for the in and out elements. */ - val = cpu_to_le32(VIRTQUEUE_NUM - 2); - add_desc_field(dev, VIRTIO_CONFIG_BLK_F_SEG_MAX, sizeof(val), &val); + add_feature(dev, VIRTIO_BLK_F_SEG_MAX); + conf.seg_max = cpu_to_le32(VIRTQUEUE_NUM - 2); + + set_config(dev, sizeof(conf), &conf); /* The I/O thread writes to this end of the pipe when done. */ vblk->done_fd = p[1]; @@ -1505,7 +1539,7 @@ static void setup_block_file(const char *filename) close(vblk->workpipe[0]); verbose("device %u: virtblock %llu sectors\n", - devices.device_num, cap); + devices.device_num, le64_to_cpu(conf.capacity)); } /* That's the end of device setup. :*/ @@ -1610,12 +1644,12 @@ int main(int argc, char *argv[]) /* First we initialize the device list. Since console and network * device receive input from a file descriptor, we keep an fdset * (infds) and the maximum fd number (max_infd) with the head of the - * list. We also keep a pointer to the last device, for easy appending - * to the list. Finally, we keep the next interrupt number to hand out - * (1: remember that 0 is used by the timer). */ + * list. We also keep a pointer to the last device. Finally, we keep + * the next interrupt number to hand out (1: remember that 0 is used by + * the timer). */ FD_ZERO(&devices.infds); devices.max_infd = -1; - devices.lastdev = &devices.dev; + devices.lastdev = NULL; devices.next_irq = 1; cpu_id = 0; diff --git a/drivers/block/virtio_blk.c b/drivers/block/virtio_blk.c index 924ddd8bccd2..1c63d5b64c20 100644 --- a/drivers/block/virtio_blk.c +++ b/drivers/block/virtio_blk.c @@ -162,8 +162,6 @@ static int virtblk_probe(struct virtio_device *vdev) { struct virtio_blk *vblk; int err, major; - void *token; - unsigned int len; u64 cap; u32 v; @@ -178,7 +176,7 @@ static int virtblk_probe(struct virtio_device *vdev) vblk->vdev = vdev; /* We expect one virtqueue, for output. */ - vblk->vq = vdev->config->find_vq(vdev, blk_done); + vblk->vq = vdev->config->find_vq(vdev, 0, blk_done); if (IS_ERR(vblk->vq)) { err = PTR_ERR(vblk->vq); goto out_free_vblk; @@ -216,15 +214,12 @@ static int virtblk_probe(struct virtio_device *vdev) vblk->disk->fops = &virtblk_fops; /* If barriers are supported, tell block layer that queue is ordered */ - token = vdev->config->find(vdev, VIRTIO_CONFIG_BLK_F, &len); - if (virtio_use_bit(vdev, token, len, VIRTIO_BLK_F_BARRIER)) + if (vdev->config->feature(vdev, VIRTIO_BLK_F_BARRIER)) blk_queue_ordered(vblk->disk->queue, QUEUE_ORDERED_TAG, NULL); - err = virtio_config_val(vdev, VIRTIO_CONFIG_BLK_F_CAPACITY, &cap); - if (err) { - dev_err(&vdev->dev, "Bad/missing capacity in config\n"); - goto out_cleanup_queue; - } + /* Host must always specify the capacity. */ + __virtio_config_val(vdev, offsetof(struct virtio_blk_config, capacity), + &cap); /* If capacity is too big, truncate with warning. */ if ((sector_t)cap != cap) { @@ -234,27 +229,23 @@ static int virtblk_probe(struct virtio_device *vdev) } set_capacity(vblk->disk, cap); - err = virtio_config_val(vdev, VIRTIO_CONFIG_BLK_F_SIZE_MAX, &v); + /* Host can optionally specify maximum segment size and number of + * segments. */ + err = virtio_config_val(vdev, VIRTIO_BLK_F_SIZE_MAX, + offsetof(struct virtio_blk_config, size_max), + &v); if (!err) blk_queue_max_segment_size(vblk->disk->queue, v); - else if (err != -ENOENT) { - dev_err(&vdev->dev, "Bad SIZE_MAX in config\n"); - goto out_cleanup_queue; - } - err = virtio_config_val(vdev, VIRTIO_CONFIG_BLK_F_SEG_MAX, &v); + err = virtio_config_val(vdev, VIRTIO_BLK_F_SEG_MAX, + offsetof(struct virtio_blk_config, seg_max), + &v); if (!err) blk_queue_max_hw_segments(vblk->disk->queue, v); - else if (err != -ENOENT) { - dev_err(&vdev->dev, "Bad SEG_MAX in config\n"); - goto out_cleanup_queue; - } add_disk(vblk->disk); return 0; -out_cleanup_queue: - blk_cleanup_queue(vblk->disk->queue); out_put_disk: put_disk(vblk->disk); out_unregister_blkdev: diff --git a/drivers/char/virtio_console.c b/drivers/char/virtio_console.c index e34da5c97196..dc17fe3a88bc 100644 --- a/drivers/char/virtio_console.c +++ b/drivers/char/virtio_console.c @@ -158,13 +158,13 @@ static int __devinit virtcons_probe(struct virtio_device *dev) /* Find the input queue. */ /* FIXME: This is why we want to wean off hvc: we do nothing * when input comes in. */ - in_vq = vdev->config->find_vq(vdev, NULL); + in_vq = vdev->config->find_vq(vdev, 0, NULL); if (IS_ERR(in_vq)) { err = PTR_ERR(in_vq); goto free; } - out_vq = vdev->config->find_vq(vdev, NULL); + out_vq = vdev->config->find_vq(vdev, 1, NULL); if (IS_ERR(out_vq)) { err = PTR_ERR(out_vq); goto free_in_vq; diff --git a/drivers/lguest/lguest_device.c b/drivers/lguest/lguest_device.c index e2eec38c83c2..07f57a53658b 100644 --- a/drivers/lguest/lguest_device.c +++ b/drivers/lguest/lguest_device.c @@ -52,57 +52,82 @@ struct lguest_device { /*D:130 * Device configurations * - * The configuration information for a device consists of a series of fields. - * We don't really care what they are: the Launcher set them up, and the driver - * will look at them during setup. + * The configuration information for a device consists of one or more + * virtqueues, a feature bitmaks, and some configuration bytes. The + * configuration bytes don't really matter to us: the Launcher set them up, and + * the driver will look at them during setup. * - * For us these fields come immediately after that device's descriptor in the - * lguest_devices page. - * - * Each field starts with a "type" byte, a "length" byte, then that number of - * bytes of configuration information. The device descriptor tells us the - * total configuration length so we know when we've reached the last field. */ + * A convenient routine to return the device's virtqueue config array: + * immediately after the descriptor. */ +static struct lguest_vqconfig *lg_vq(const struct lguest_device_desc *desc) +{ + return (void *)(desc + 1); +} -/* type + length bytes */ -#define FHDR_LEN 2 +/* The features come immediately after the virtqueues. */ +static u8 *lg_features(const struct lguest_device_desc *desc) +{ + return (void *)(lg_vq(desc) + desc->num_vq); +} -/* This finds the first field of a given type for a device's configuration. */ -static void *lg_find(struct virtio_device *vdev, u8 type, unsigned int *len) +/* The config space comes after the two feature bitmasks. */ +static u8 *lg_config(const struct lguest_device_desc *desc) +{ + return lg_features(desc) + desc->feature_len * 2; +} + +/* The total size of the config page used by this device (incl. desc) */ +static unsigned desc_size(const struct lguest_device_desc *desc) +{ + return sizeof(*desc) + + desc->num_vq * sizeof(struct lguest_vqconfig) + + desc->feature_len * 2 + + desc->config_len; +} + +/* This tests (and acknowleges) a feature bit. */ +static bool lg_feature(struct virtio_device *vdev, unsigned fbit) { struct lguest_device_desc *desc = to_lgdev(vdev)->desc; - int i; + u8 *features; - for (i = 0; i < desc->config_len; i += FHDR_LEN + desc->config[i+1]) { - if (desc->config[i] == type) { - /* Mark it used, so Host can know we looked at it, and - * also so we won't find the same one twice. */ - desc->config[i] |= 0x80; - /* Remember, the second byte is the length. */ - *len = desc->config[i+1]; - /* We return a pointer to the field header. */ - return desc->config + i; - } - } + /* Obviously if they ask for a feature off the end of our feature + * bitmap, it's not set. */ + if (fbit / 8 > desc->feature_len) + return false; - /* Not found: return NULL for failure. */ - return NULL; + /* The feature bitmap comes after the virtqueues. */ + features = lg_features(desc); + if (!(features[fbit / 8] & (1 << (fbit % 8)))) + return false; + + /* We set the matching bit in the other half of the bitmap to tell the + * Host we want to use this feature. We don't use this yet, but we + * could in future. */ + features[desc->feature_len + fbit / 8] |= (1 << (fbit % 8)); + return true; } /* Once they've found a field, getting a copy of it is easy. */ -static void lg_get(struct virtio_device *vdev, void *token, +static void lg_get(struct virtio_device *vdev, unsigned int offset, void *buf, unsigned len) { - /* Check they didn't ask for more than the length of the field! */ - BUG_ON(len > ((u8 *)token)[1]); - memcpy(buf, token + FHDR_LEN, len); + struct lguest_device_desc *desc = to_lgdev(vdev)->desc; + + /* Check they didn't ask for more than the length of the config! */ + BUG_ON(offset + len > desc->config_len); + memcpy(buf, lg_config(desc) + offset, len); } /* Setting the contents is also trivial. */ -static void lg_set(struct virtio_device *vdev, void *token, +static void lg_set(struct virtio_device *vdev, unsigned int offset, const void *buf, unsigned len) { - BUG_ON(len > ((u8 *)token)[1]); - memcpy(token + FHDR_LEN, buf, len); + struct lguest_device_desc *desc = to_lgdev(vdev)->desc; + + /* Check they didn't ask for more than the length of the config! */ + BUG_ON(offset + len > desc->config_len); + memcpy(lg_config(desc) + offset, buf, len); } /* The operations to get and set the status word just access the status field @@ -165,39 +190,29 @@ static void lg_notify(struct virtqueue *vq) * * So we provide devices with a "find virtqueue and set it up" function. */ static struct virtqueue *lg_find_vq(struct virtio_device *vdev, + unsigned index, bool (*callback)(struct virtqueue *vq)) { + struct lguest_device *ldev = to_lgdev(vdev); struct lguest_vq_info *lvq; struct virtqueue *vq; - unsigned int len; - void *token; int err; - /* Look for a field of the correct type to mark a virtqueue. Note that - * if this succeeds, then the type will be changed so it won't be found - * again, and future lg_find_vq() calls will find the next - * virtqueue (if any). */ - token = vdev->config->find(vdev, VIRTIO_CONFIG_F_VIRTQUEUE, &len); - if (!token) + /* We must have this many virtqueues. */ + if (index >= ldev->desc->num_vq) return ERR_PTR(-ENOENT); lvq = kmalloc(sizeof(*lvq), GFP_KERNEL); if (!lvq) return ERR_PTR(-ENOMEM); - /* Note: we could use a configuration space inside here, just like we - * do for the device. This would allow expansion in future, because - * our configuration system is designed to be expansible. But this is - * way easier. */ - if (len != sizeof(lvq->config)) { - dev_err(&vdev->dev, "Unexpected virtio config len %u\n", len); - err = -EIO; - goto free_lvq; - } - /* Make a copy of the "struct lguest_vqconfig" field. We need a copy - * because the config space might not be aligned correctly. */ - vdev->config->get(vdev, token, &lvq->config, sizeof(lvq->config)); + /* Make a copy of the "struct lguest_vqconfig" entry, which sits after + * the descriptor. We need a copy because the config space might not + * be aligned correctly. */ + memcpy(&lvq->config, lg_vq(ldev->desc)+index, sizeof(lvq->config)); + printk("Mapping virtqueue %i addr %lx\n", index, + (unsigned long)lvq->config.pfn << PAGE_SHIFT); /* Figure out how many pages the ring will take, and map that memory */ lvq->pages = lguest_map((unsigned long)lvq->config.pfn << PAGE_SHIFT, DIV_ROUND_UP(vring_size(lvq->config.num, @@ -259,7 +274,7 @@ static void lg_del_vq(struct virtqueue *vq) /* The ops structure which hooks everything together. */ static struct virtio_config_ops lguest_config_ops = { - .find = lg_find, + .feature = lg_feature, .get = lg_get, .set = lg_set, .get_status = lg_get_status, @@ -329,13 +344,14 @@ static void scan_devices(void) struct lguest_device_desc *d; /* We start at the page beginning, and skip over each entry. */ - for (i = 0; i < PAGE_SIZE; i += sizeof(*d) + d->config_len) { + for (i = 0; i < PAGE_SIZE; i += desc_size(d)) { d = lguest_devices + i; /* Once we hit a zero, stop. */ if (d->type == 0) break; + printk("Device at %i has size %u\n", i, desc_size(d)); add_lguest_device(d); } } diff --git a/drivers/net/virtio_net.c b/drivers/net/virtio_net.c index a60505c8f82a..4b8138312750 100644 --- a/drivers/net/virtio_net.c +++ b/drivers/net/virtio_net.c @@ -311,10 +311,8 @@ static int virtnet_close(struct net_device *dev) static int virtnet_probe(struct virtio_device *vdev) { int err; - unsigned int len; struct net_device *dev; struct virtnet_info *vi; - void *token; /* Allocate ourselves a network device with room for our info */ dev = alloc_etherdev(sizeof(struct virtnet_info)); @@ -330,25 +328,24 @@ static int virtnet_probe(struct virtio_device *vdev) SET_NETDEV_DEV(dev, &vdev->dev); /* Do we support "hardware" checksums? */ - token = vdev->config->find(vdev, VIRTIO_CONFIG_NET_F, &len); - if (virtio_use_bit(vdev, token, len, VIRTIO_NET_F_NO_CSUM)) { + if (vdev->config->feature(vdev, VIRTIO_NET_F_NO_CSUM)) { /* This opens up the world of extra features. */ dev->features |= NETIF_F_HW_CSUM|NETIF_F_SG|NETIF_F_FRAGLIST; - if (virtio_use_bit(vdev, token, len, VIRTIO_NET_F_TSO4)) + if (vdev->config->feature(vdev, VIRTIO_NET_F_TSO4)) dev->features |= NETIF_F_TSO; - if (virtio_use_bit(vdev, token, len, VIRTIO_NET_F_UFO)) + if (vdev->config->feature(vdev, VIRTIO_NET_F_UFO)) dev->features |= NETIF_F_UFO; - if (virtio_use_bit(vdev, token, len, VIRTIO_NET_F_TSO4_ECN)) + if (vdev->config->feature(vdev, VIRTIO_NET_F_TSO4_ECN)) dev->features |= NETIF_F_TSO_ECN; - if (virtio_use_bit(vdev, token, len, VIRTIO_NET_F_TSO6)) + if (vdev->config->feature(vdev, VIRTIO_NET_F_TSO6)) dev->features |= NETIF_F_TSO6; } /* Configuration may specify what MAC to use. Otherwise random. */ - token = vdev->config->find(vdev, VIRTIO_CONFIG_NET_MAC_F, &len); - if (token) { - dev->addr_len = len; - vdev->config->get(vdev, token, dev->dev_addr, len); + if (vdev->config->feature(vdev, VIRTIO_NET_F_MAC)) { + vdev->config->get(vdev, + offsetof(struct virtio_net_config, mac), + dev->dev_addr, dev->addr_len); } else random_ether_addr(dev->dev_addr); @@ -359,13 +356,13 @@ static int virtnet_probe(struct virtio_device *vdev) vi->vdev = vdev; /* We expect two virtqueues, receive then send. */ - vi->rvq = vdev->config->find_vq(vdev, skb_recv_done); + vi->rvq = vdev->config->find_vq(vdev, 0, skb_recv_done); if (IS_ERR(vi->rvq)) { err = PTR_ERR(vi->rvq); goto free; } - vi->svq = vdev->config->find_vq(vdev, skb_xmit_done); + vi->svq = vdev->config->find_vq(vdev, 1, skb_xmit_done); if (IS_ERR(vi->svq)) { err = PTR_ERR(vi->svq); goto free_recv; diff --git a/drivers/virtio/virtio.c b/drivers/virtio/virtio.c index 69d7ea02cd48..303cb6f90108 100644 --- a/drivers/virtio/virtio.c +++ b/drivers/virtio/virtio.c @@ -148,51 +148,6 @@ void unregister_virtio_device(struct virtio_device *dev) } EXPORT_SYMBOL_GPL(unregister_virtio_device); -int __virtio_config_val(struct virtio_device *vdev, - u8 type, void *val, size_t size) -{ - void *token; - unsigned int len; - - token = vdev->config->find(vdev, type, &len); - if (!token) - return -ENOENT; - - if (len != size) - return -EIO; - - vdev->config->get(vdev, token, val, size); - return 0; -} -EXPORT_SYMBOL_GPL(__virtio_config_val); - -int virtio_use_bit(struct virtio_device *vdev, - void *token, unsigned int len, unsigned int bitnum) -{ - unsigned long bits[16]; - - /* This makes it convenient to pass-through find() results. */ - if (!token) - return 0; - - /* bit not in range of this bitfield? */ - if (bitnum * 8 >= len / 2) - return 0; - - /* Giant feature bitfields are silly. */ - BUG_ON(len > sizeof(bits)); - vdev->config->get(vdev, token, bits, len); - - if (!test_bit(bitnum, bits)) - return 0; - - /* Set acknowledge bit, and write it back. */ - set_bit(bitnum + len * 8 / 2, bits); - vdev->config->set(vdev, token, bits, len); - return 1; -} -EXPORT_SYMBOL_GPL(virtio_use_bit); - static int virtio_init(void) { if (bus_register(&virtio_bus) != 0) diff --git a/include/linux/lguest_launcher.h b/include/linux/lguest_launcher.h index 697104da91f1..589be3e1f3ac 100644 --- a/include/linux/lguest_launcher.h +++ b/include/linux/lguest_launcher.h @@ -23,7 +23,12 @@ struct lguest_device_desc { /* The device type: console, network, disk etc. Type 0 terminates. */ __u8 type; - /* The number of bytes of the config array. */ + /* The number of virtqueues (first in config array) */ + __u8 num_vq; + /* The number of bytes of feature bits. Multiply by 2: one for host + * features and one for guest acknowledgements. */ + __u8 feature_len; + /* The number of bytes of the config array after virtqueues. */ __u8 config_len; /* A status byte, written by the Guest. */ __u8 status; @@ -31,7 +36,7 @@ struct lguest_device_desc { }; /*D:135 This is how we expect the device configuration field for a virtqueue - * (type VIRTIO_CONFIG_F_VIRTQUEUE) to be laid out: */ + * to be laid out in config space. */ struct lguest_vqconfig { /* The number of entries in the virtio_ring */ __u16 num; diff --git a/include/linux/virtio_blk.h b/include/linux/virtio_blk.h index 7bd2bce0cfd9..e54635666f2c 100644 --- a/include/linux/virtio_blk.h +++ b/include/linux/virtio_blk.h @@ -6,15 +6,19 @@ #define VIRTIO_ID_BLOCK 2 /* Feature bits */ -#define VIRTIO_CONFIG_BLK_F 0x40 -#define VIRTIO_BLK_F_BARRIER 1 /* Does host support barriers? */ +#define VIRTIO_BLK_F_BARRIER 0 /* Does host support barriers? */ +#define VIRTIO_BLK_F_SIZE_MAX 1 /* Indicates maximum segment size */ +#define VIRTIO_BLK_F_SEG_MAX 2 /* Indicates maximum # of segments */ -/* The capacity (in 512-byte sectors). */ -#define VIRTIO_CONFIG_BLK_F_CAPACITY 0x41 -/* The maximum segment size. */ -#define VIRTIO_CONFIG_BLK_F_SIZE_MAX 0x42 -/* The maximum number of segments. */ -#define VIRTIO_CONFIG_BLK_F_SEG_MAX 0x43 +struct virtio_blk_config +{ + /* The capacity (in 512-byte sectors). */ + __le64 capacity; + /* The maximum segment size (if VIRTIO_BLK_F_SIZE_MAX) */ + __le32 size_max; + /* The maximum number of segments (if VIRTIO_BLK_F_SEG_MAX) */ + __le32 seg_max; +} __attribute__((packed)); /* These two define direction. */ #define VIRTIO_BLK_T_IN 0 diff --git a/include/linux/virtio_config.h b/include/linux/virtio_config.h index bcc01888df78..70bb26062d76 100644 --- a/include/linux/virtio_config.h +++ b/include/linux/virtio_config.h @@ -5,7 +5,7 @@ * store and access that space differently. */ #include -/* Status byte for guest to report progress, and synchronize config. */ +/* Status byte for guest to report progress, and synchronize features. */ /* We have seen device and processed generic fields (VIRTIO_CONFIG_F_VIRTIO) */ #define VIRTIO_CONFIG_S_ACKNOWLEDGE 1 /* We have found a driver for the device. */ @@ -15,34 +15,27 @@ /* We've given up on this device. */ #define VIRTIO_CONFIG_S_FAILED 0x80 -/* Feature byte (actually 7 bits availabe): */ -/* Requirements/features of the virtio implementation. */ -#define VIRTIO_CONFIG_F_VIRTIO 1 -/* Requirements/features of the virtqueue (may have more than one). */ -#define VIRTIO_CONFIG_F_VIRTQUEUE 2 - #ifdef __KERNEL__ struct virtio_device; /** * virtio_config_ops - operations for configuring a virtio device - * @find: search for the next configuration field of the given type. + * @feature: search for a feature in this config * vdev: the virtio_device - * type: the feature type - * len: the (returned) length of the field if found. - * Returns a token if found, or NULL. Never returnes the same field twice - * (ie. it's used up). - * @get: read the value of a configuration field after find(). + * bit: the feature bit + * Returns true if the feature is supported. Acknowledges the feature + * so the host can see it. + * @get: read the value of a configuration field * vdev: the virtio_device - * token: the token returned from find(). + * offset: the offset of the configuration field * buf: the buffer to write the field value into. - * len: the length of the buffer (given by find()). + * len: the length of the buffer * Note that contents are conventionally little-endian. - * @set: write the value of a configuration field after find(). + * @set: write the value of a configuration field * vdev: the virtio_device - * token: the token returned from find(). + * offset: the offset of the configuration field * buf: the buffer to read the field value from. - * len: the length of the buffer (given by find()). + * len: the length of the buffer * Note that contents are conventionally little-endian. * @get_status: read the status byte * vdev: the virtio_device @@ -50,62 +43,63 @@ struct virtio_device; * @set_status: write the status byte * vdev: the virtio_device * status: the new status byte - * @find_vq: find the first VIRTIO_CONFIG_F_VIRTQUEUE and create a virtqueue. + * @find_vq: find a virtqueue and instantiate it. * vdev: the virtio_device + * index: the 0-based virtqueue number in case there's more than one. * callback: the virqtueue callback - * Returns the new virtqueue or ERR_PTR(). + * Returns the new virtqueue or ERR_PTR() (eg. -ENOENT). * @del_vq: free a virtqueue found by find_vq(). */ struct virtio_config_ops { - void *(*find)(struct virtio_device *vdev, u8 type, unsigned *len); - void (*get)(struct virtio_device *vdev, void *token, + bool (*feature)(struct virtio_device *vdev, unsigned bit); + void (*get)(struct virtio_device *vdev, unsigned offset, void *buf, unsigned len); - void (*set)(struct virtio_device *vdev, void *token, + void (*set)(struct virtio_device *vdev, unsigned offset, const void *buf, unsigned len); u8 (*get_status)(struct virtio_device *vdev); void (*set_status)(struct virtio_device *vdev, u8 status); struct virtqueue *(*find_vq)(struct virtio_device *vdev, + unsigned index, bool (*callback)(struct virtqueue *)); void (*del_vq)(struct virtqueue *vq); }; /** - * virtio_config_val - get a single virtio config and mark it used. - * @config: the virtio config space - * @type: the type to search for. + * virtio_config_val - look for a feature and get a single virtio config. + * @vdev: the virtio device + * @fbit: the feature bit + * @offset: the type to search for. * @val: a pointer to the value to fill in. * - * Once used, the config type is marked with VIRTIO_CONFIG_F_USED so it can't - * be found again. This version does endian conversion. */ -#define virtio_config_val(vdev, type, v) ({ \ - int _err = __virtio_config_val((vdev),(type),(v),sizeof(*(v))); \ - \ - BUILD_BUG_ON(sizeof(*(v)) != 1 && sizeof(*(v)) != 2 \ - && sizeof(*(v)) != 4 && sizeof(*(v)) != 8); \ - if (!_err) { \ - switch (sizeof(*(v))) { \ - case 2: le16_to_cpus((__u16 *) v); break; \ - case 4: le32_to_cpus((__u32 *) v); break; \ - case 8: le64_to_cpus((__u64 *) v); break; \ - } \ - } \ + * The return value is -ENOENT if the feature doesn't exist. Otherwise + * the value is endian-corrected and returned in v. */ +#define virtio_config_val(vdev, fbit, offset, v) ({ \ + int _err; \ + if ((vdev)->config->feature((vdev), (fbit))) { \ + __virtio_config_val((vdev), (offset), (v)); \ + _err = 0; \ + } else \ + _err = -ENOENT; \ _err; \ }) -int __virtio_config_val(struct virtio_device *dev, - u8 type, void *val, size_t size); - /** - * virtio_use_bit - helper to use a feature bit in a bitfield value. - * @dev: the virtio device - * @token: the token as returned from vdev->config->find(). - * @len: the length of the field. - * @bitnum: the bit to test. + * __virtio_config_val - get a single virtio config without feature check. + * @vdev: the virtio device + * @offset: the type to search for. + * @val: a pointer to the value to fill in. * - * If handed a NULL token, it returns false, otherwise returns bit status. - * If it's one, it sets the mirroring acknowledgement bit. */ -int virtio_use_bit(struct virtio_device *vdev, - void *token, unsigned int len, unsigned int bitnum); + * The value is endian-corrected and returned in v. */ +#define __virtio_config_val(vdev, offset, v) do { \ + BUILD_BUG_ON(sizeof(*(v)) != 1 && sizeof(*(v)) != 2 \ + && sizeof(*(v)) != 4 && sizeof(*(v)) != 8); \ + (vdev)->config->get((vdev), (offset), (v), sizeof(*(v))); \ + switch (sizeof(*(v))) { \ + case 2: le16_to_cpus((__u16 *) v); break; \ + case 4: le32_to_cpus((__u32 *) v); break; \ + case 8: le64_to_cpus((__u64 *) v); break; \ + } \ +} while(0) #endif /* __KERNEL__ */ #endif /* _LINUX_VIRTIO_CONFIG_H */ diff --git a/include/linux/virtio_net.h b/include/linux/virtio_net.h index ae469ae55d36..6e8fdfea8cd0 100644 --- a/include/linux/virtio_net.h +++ b/include/linux/virtio_net.h @@ -5,16 +5,19 @@ /* The ID for virtio_net */ #define VIRTIO_ID_NET 1 -/* The bitmap of config for virtio net */ -#define VIRTIO_CONFIG_NET_F 0x40 +/* The feature bitmap for virtio net */ #define VIRTIO_NET_F_NO_CSUM 0 #define VIRTIO_NET_F_TSO4 1 #define VIRTIO_NET_F_UFO 2 #define VIRTIO_NET_F_TSO4_ECN 3 #define VIRTIO_NET_F_TSO6 4 +#define VIRTIO_NET_F_MAC 5 -/* The config defining mac address. */ -#define VIRTIO_CONFIG_NET_MAC_F 0x41 +struct virtio_net_config +{ + /* The config defining mac address (if VIRTIO_NET_F_MAC) */ + __u8 mac[6]; +} __attribute__((packed)); /* This is the first element of the scatter-gather list. If you don't * specify GSO or CSUM features, you can simply ignore the header. */ diff --git a/net/9p/trans_virtio.c b/net/9p/trans_virtio.c index 40b71a29fc3f..78d7946f81fe 100644 --- a/net/9p/trans_virtio.c +++ b/net/9p/trans_virtio.c @@ -236,13 +236,13 @@ static int p9_virtio_probe(struct virtio_device *dev) /* Find the input queue. */ dev->priv = chan; - chan->in_vq = dev->config->find_vq(dev, p9_virtio_intr); + chan->in_vq = dev->config->find_vq(dev, 0, p9_virtio_intr); if (IS_ERR(chan->in_vq)) { err = PTR_ERR(chan->in_vq); goto free; } - chan->out_vq = dev->config->find_vq(dev, NULL); + chan->out_vq = dev->config->find_vq(dev, 1, NULL); if (IS_ERR(chan->out_vq)) { err = PTR_ERR(chan->out_vq); goto free_in_vq; From 18445c4d501b9ab4336f66ef46b092661ddaf336 Mon Sep 17 00:00:00 2001 From: Rusty Russell Date: Mon, 4 Feb 2008 23:49:57 -0500 Subject: [PATCH 03/25] virtio: explicit enable_cb/disable_cb rather than callback return. It seems that virtio_net wants to disable callbacks (interrupts) before calling netif_rx_schedule(), so we can't use the return value to do so. Rename "restart" to "cb_enable" and introduce "cb_disable" hook: callback now returns void, rather than a boolean. Signed-off-by: Rusty Russell --- drivers/block/virtio_blk.c | 3 +-- drivers/lguest/lguest_device.c | 2 +- drivers/net/virtio_net.c | 15 ++++++++------- drivers/virtio/virtio_ring.c | 21 ++++++++++++++++----- include/linux/virtio.h | 11 ++++++----- include/linux/virtio_config.h | 2 +- include/linux/virtio_ring.h | 2 +- net/9p/trans_virtio.c | 4 +--- 8 files changed, 35 insertions(+), 25 deletions(-) diff --git a/drivers/block/virtio_blk.c b/drivers/block/virtio_blk.c index 1c63d5b64c20..54a8017ad487 100644 --- a/drivers/block/virtio_blk.c +++ b/drivers/block/virtio_blk.c @@ -36,7 +36,7 @@ struct virtblk_req struct virtio_blk_inhdr in_hdr; }; -static bool blk_done(struct virtqueue *vq) +static void blk_done(struct virtqueue *vq) { struct virtio_blk *vblk = vq->vdev->priv; struct virtblk_req *vbr; @@ -65,7 +65,6 @@ static bool blk_done(struct virtqueue *vq) /* In case queue is stopped waiting for more buffers. */ blk_start_queue(vblk->disk->queue); spin_unlock_irqrestore(&vblk->lock, flags); - return true; } static bool do_req(struct request_queue *q, struct virtio_blk *vblk, diff --git a/drivers/lguest/lguest_device.c b/drivers/lguest/lguest_device.c index 07f57a53658b..ced5b44cebce 100644 --- a/drivers/lguest/lguest_device.c +++ b/drivers/lguest/lguest_device.c @@ -191,7 +191,7 @@ static void lg_notify(struct virtqueue *vq) * So we provide devices with a "find virtqueue and set it up" function. */ static struct virtqueue *lg_find_vq(struct virtio_device *vdev, unsigned index, - bool (*callback)(struct virtqueue *vq)) + void (*callback)(struct virtqueue *vq)) { struct lguest_device *ldev = to_lgdev(vdev); struct lguest_vq_info *lvq; diff --git a/drivers/net/virtio_net.c b/drivers/net/virtio_net.c index 4b8138312750..7b0059f0f5d4 100644 --- a/drivers/net/virtio_net.c +++ b/drivers/net/virtio_net.c @@ -52,13 +52,12 @@ static inline void vnet_hdr_to_sg(struct scatterlist *sg, struct sk_buff *skb) sg_init_one(sg, skb_vnet_hdr(skb), sizeof(struct virtio_net_hdr)); } -static bool skb_xmit_done(struct virtqueue *rvq) +static void skb_xmit_done(struct virtqueue *rvq) { struct virtnet_info *vi = rvq->vdev->priv; /* In case we were waiting for output buffers. */ netif_wake_queue(vi->dev); - return true; } static void receive_skb(struct net_device *dev, struct sk_buff *skb, @@ -161,12 +160,14 @@ static void try_fill_recv(struct virtnet_info *vi) vi->rvq->vq_ops->kick(vi->rvq); } -static bool skb_recv_done(struct virtqueue *rvq) +static void skb_recv_done(struct virtqueue *rvq) { struct virtnet_info *vi = rvq->vdev->priv; - netif_rx_schedule(vi->dev, &vi->napi); - /* Suppress further interrupts. */ - return false; + /* Schedule NAPI, Suppress further interrupts if successful. */ + if (netif_rx_schedule_prep(vi->dev, &vi->napi)) { + rvq->vq_ops->disable_cb(rvq); + __netif_rx_schedule(vi->dev, &vi->napi); + } } static int virtnet_poll(struct napi_struct *napi, int budget) @@ -192,7 +193,7 @@ again: /* Out of packets? */ if (received < budget) { netif_rx_complete(vi->dev, napi); - if (unlikely(!vi->rvq->vq_ops->restart(vi->rvq)) + if (unlikely(!vi->rvq->vq_ops->enable_cb(vi->rvq)) && netif_rx_reschedule(vi->dev, napi)) goto again; } diff --git a/drivers/virtio/virtio_ring.c b/drivers/virtio/virtio_ring.c index 1dc04b6684e6..342bb0363fbe 100644 --- a/drivers/virtio/virtio_ring.c +++ b/drivers/virtio/virtio_ring.c @@ -220,7 +220,17 @@ static void *vring_get_buf(struct virtqueue *_vq, unsigned int *len) return ret; } -static bool vring_restart(struct virtqueue *_vq) +static void vring_disable_cb(struct virtqueue *_vq) +{ + struct vring_virtqueue *vq = to_vvq(_vq); + + START_USE(vq); + BUG_ON(vq->vring.avail->flags & VRING_AVAIL_F_NO_INTERRUPT); + vq->vring.avail->flags |= VRING_AVAIL_F_NO_INTERRUPT; + END_USE(vq); +} + +static bool vring_enable_cb(struct virtqueue *_vq) { struct vring_virtqueue *vq = to_vvq(_vq); @@ -254,8 +264,8 @@ irqreturn_t vring_interrupt(int irq, void *_vq) return IRQ_HANDLED; pr_debug("virtqueue callback for %p (%p)\n", vq, vq->vq.callback); - if (vq->vq.callback && !vq->vq.callback(&vq->vq)) - vq->vring.avail->flags |= VRING_AVAIL_F_NO_INTERRUPT; + if (vq->vq.callback) + vq->vq.callback(&vq->vq); return IRQ_HANDLED; } @@ -264,7 +274,8 @@ static struct virtqueue_ops vring_vq_ops = { .add_buf = vring_add_buf, .get_buf = vring_get_buf, .kick = vring_kick, - .restart = vring_restart, + .disable_cb = vring_disable_cb, + .enable_cb = vring_enable_cb, .shutdown = vring_shutdown, }; @@ -272,7 +283,7 @@ struct virtqueue *vring_new_virtqueue(unsigned int num, struct virtio_device *vdev, void *pages, void (*notify)(struct virtqueue *), - bool (*callback)(struct virtqueue *)) + void (*callback)(struct virtqueue *)) { struct vring_virtqueue *vq; unsigned int i; diff --git a/include/linux/virtio.h b/include/linux/virtio.h index 14e1379876d3..951d81747b42 100644 --- a/include/linux/virtio.h +++ b/include/linux/virtio.h @@ -11,15 +11,13 @@ /** * virtqueue - a queue to register buffers for sending or receiving. * @callback: the function to call when buffers are consumed (can be NULL). - * If this returns false, callbacks are suppressed until vq_ops->restart - * is called. * @vdev: the virtio device this queue was created for. * @vq_ops: the operations for this virtqueue (see below). * @priv: a pointer for the virtqueue implementation to use. */ struct virtqueue { - bool (*callback)(struct virtqueue *vq); + void (*callback)(struct virtqueue *vq); struct virtio_device *vdev; struct virtqueue_ops *vq_ops; void *priv; @@ -41,7 +39,9 @@ struct virtqueue * vq: the struct virtqueue we're talking about. * len: the length written into the buffer * Returns NULL or the "data" token handed to add_buf. - * @restart: restart callbacks after callback returned false. + * @disable_cb: disable callbacks + * vq: the struct virtqueue we're talking about. + * @enable_cb: restart callbacks after disable_cb. * vq: the struct virtqueue we're talking about. * This returns "false" (and doesn't re-enable) if there are pending * buffers in the queue, to avoid a race. @@ -65,7 +65,8 @@ struct virtqueue_ops { void *(*get_buf)(struct virtqueue *vq, unsigned int *len); - bool (*restart)(struct virtqueue *vq); + void (*disable_cb)(struct virtqueue *vq); + bool (*enable_cb)(struct virtqueue *vq); void (*shutdown)(struct virtqueue *vq); }; diff --git a/include/linux/virtio_config.h b/include/linux/virtio_config.h index 70bb26062d76..81f828ac8f47 100644 --- a/include/linux/virtio_config.h +++ b/include/linux/virtio_config.h @@ -61,7 +61,7 @@ struct virtio_config_ops void (*set_status)(struct virtio_device *vdev, u8 status); struct virtqueue *(*find_vq)(struct virtio_device *vdev, unsigned index, - bool (*callback)(struct virtqueue *)); + void (*callback)(struct virtqueue *)); void (*del_vq)(struct virtqueue *vq); }; diff --git a/include/linux/virtio_ring.h b/include/linux/virtio_ring.h index 1a4ed49f6478..8cde10e792c4 100644 --- a/include/linux/virtio_ring.h +++ b/include/linux/virtio_ring.h @@ -114,7 +114,7 @@ struct virtqueue *vring_new_virtqueue(unsigned int num, struct virtio_device *vdev, void *pages, void (*notify)(struct virtqueue *vq), - bool (*callback)(struct virtqueue *vq)); + void (*callback)(struct virtqueue *vq)); void vring_del_virtqueue(struct virtqueue *vq); irqreturn_t vring_interrupt(int irq, void *_vq); diff --git a/net/9p/trans_virtio.c b/net/9p/trans_virtio.c index 78d7946f81fe..42eea5fe2628 100644 --- a/net/9p/trans_virtio.c +++ b/net/9p/trans_virtio.c @@ -199,14 +199,12 @@ static void p9_virtio_close(struct p9_trans *trans) kfree(trans); } -static bool p9_virtio_intr(struct virtqueue *q) +static void p9_virtio_intr(struct virtqueue *q) { struct virtio_chan *chan = q->vdev->priv; P9_DPRINTK(P9_DEBUG_TRANS, "9p poll_wakeup: %p\n", &chan->wq); wake_up_interruptible(&chan->wq); - - return true; } static int p9_virtio_probe(struct virtio_device *dev) From f957d1f05a1a20bc3b954877c6562a4d53d58bde Mon Sep 17 00:00:00 2001 From: Rusty Russell Date: Mon, 4 Feb 2008 23:49:58 -0500 Subject: [PATCH 04/25] virtio: configuration change callback Various drivers want to know when their configuration information changes: the balloon driver is the immediate user, but the network driver may one day have a "carrier" status as well. This introduces that callback (lguest doesn't use it yet). Signed-off-by: Rusty Russell --- include/linux/virtio.h | 3 +++ 1 file changed, 3 insertions(+) diff --git a/include/linux/virtio.h b/include/linux/virtio.h index 951d81747b42..78408d5237c1 100644 --- a/include/linux/virtio.h +++ b/include/linux/virtio.h @@ -98,12 +98,15 @@ void unregister_virtio_device(struct virtio_device *dev); * @probe: the function to call when a device is found. Returns a token for * remove, or PTR_ERR(). * @remove: the function when a device is removed. + * @config_changed: optional function to call when the device configuration + * changes; may be called in interrupt context. */ struct virtio_driver { struct device_driver driver; const struct virtio_device_id *id_table; int (*probe)(struct virtio_device *dev); void (*remove)(struct virtio_device *dev); + void (*config_changed)(struct virtio_device *dev); }; int register_virtio_driver(struct virtio_driver *drv); From 3309daaad724dd08eb598bf9c12b7bb9daddd706 Mon Sep 17 00:00:00 2001 From: Anthony Liguori Date: Fri, 21 Dec 2007 02:17:47 +0200 Subject: [PATCH 05/25] virtio: Fix vring_init/vring_size to take unsigned long Using unsigned int resulted in silent truncation of the upper 32-bit on x86_64 resulting in an OOPS since the ring was being initialized wrong. Please reconsider my previous patch to just use PAGE_ALIGN(). Open coding this sort of stuff, no matter how simple it seems, is just asking for this sort of trouble. Signed-off-by: Anthony Liguori Signed-off-by: Rusty Russell --- include/linux/virtio_ring.h | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/include/linux/virtio_ring.h b/include/linux/virtio_ring.h index 8cde10e792c4..ea3be89a0e83 100644 --- a/include/linux/virtio_ring.h +++ b/include/linux/virtio_ring.h @@ -89,7 +89,7 @@ struct vring { * }; */ static inline void vring_init(struct vring *vr, unsigned int num, void *p, - unsigned int pagesize) + unsigned long pagesize) { vr->num = num; vr->desc = p; @@ -98,7 +98,7 @@ static inline void vring_init(struct vring *vr, unsigned int num, void *p, & ~(pagesize - 1)); } -static inline unsigned vring_size(unsigned int num, unsigned int pagesize) +static inline unsigned vring_size(unsigned int num, unsigned long pagesize) { return ((sizeof(struct vring_desc) * num + sizeof(__u16) * (2 + num) + pagesize - 1) & ~(pagesize - 1)) From 426e3e0af5d2473e67d4256fc1340b7faebd1cc7 Mon Sep 17 00:00:00 2001 From: Rusty Russell Date: Mon, 4 Feb 2008 23:49:59 -0500 Subject: [PATCH 06/25] virtio: clarify NO_NOTIFY flag usage The other side (host) can set the NO_NOTIFY flag as an optimization, to say "no need to kick me when you add things". Make it clear that this is advisory only; especially that we should always notify when the ring is full. Signed-off-by: Rusty Russell --- Documentation/lguest/lguest.c | 9 +++++---- drivers/virtio/virtio_ring.c | 2 ++ include/linux/virtio_ring.h | 8 ++++++-- 3 files changed, 13 insertions(+), 6 deletions(-) diff --git a/Documentation/lguest/lguest.c b/Documentation/lguest/lguest.c index 4df1804169dc..8ff2d8bc690a 100644 --- a/Documentation/lguest/lguest.c +++ b/Documentation/lguest/lguest.c @@ -923,10 +923,10 @@ static void handle_output(int fd, unsigned long addr) /* Check each virtqueue. */ for (i = devices.dev; i; i = i->next) { for (vq = i->vq; vq; vq = vq->next) { - if (vq->config.pfn == addr/getpagesize() - && vq->handle_output) { + if (vq->config.pfn == addr/getpagesize()) { verbose("Output to %s\n", vq->dev->name); - vq->handle_output(fd, vq); + if (vq->handle_output) + vq->handle_output(fd, vq); return; } } @@ -1068,7 +1068,8 @@ static void add_virtqueue(struct device *dev, unsigned int num_descs, * virtqueue. */ vq->handle_output = handle_output; - /* Set the "Don't Notify Me" flag if we don't have a handler */ + /* As an optimization, set the advisory "Don't Notify Me" flag if we + * don't have a handler */ if (!handle_output) vq->vring.used->flags = VRING_USED_F_NO_NOTIFY; } diff --git a/drivers/virtio/virtio_ring.c b/drivers/virtio/virtio_ring.c index 342bb0363fbe..dbe1d35db32a 100644 --- a/drivers/virtio/virtio_ring.c +++ b/drivers/virtio/virtio_ring.c @@ -87,6 +87,8 @@ static int vring_add_buf(struct virtqueue *_vq, if (vq->num_free < out + in) { pr_debug("Can't add buf len %i - avail = %i\n", out + in, vq->num_free); + /* We notify *even if* VRING_USED_F_NO_NOTIFY is set here. */ + vq->notify(&vq->vq); END_USE(vq); return -ENOSPC; } diff --git a/include/linux/virtio_ring.h b/include/linux/virtio_ring.h index ea3be89a0e83..abe481ed990e 100644 --- a/include/linux/virtio_ring.h +++ b/include/linux/virtio_ring.h @@ -15,9 +15,13 @@ /* This marks a buffer as write-only (otherwise read-only). */ #define VRING_DESC_F_WRITE 2 -/* This means don't notify other side when buffer added. */ +/* The Host uses this in used->flags to advise the Guest: don't kick me when + * you add a buffer. It's unreliable, so it's simply an optimization. Guest + * will still kick if it's out of buffers. */ #define VRING_USED_F_NO_NOTIFY 1 -/* This means don't interrupt guest when buffer consumed. */ +/* The Guest uses this in avail->flags to advise the Host: don't interrupt me + * when you consume a buffer. It's unreliable, so it's simply an + * optimization. */ #define VRING_AVAIL_F_NO_INTERRUPT 1 /* Virtio ring descriptors: 16 bytes. These can chain together via "next". */ From 24a5ae5d0340d5a45df840b24a10d62aa9516116 Mon Sep 17 00:00:00 2001 From: Rusty Russell Date: Mon, 4 Feb 2008 23:50:00 -0500 Subject: [PATCH 07/25] virtio: remove unused id field from struct virtio_blk_outhdr This field has been unused since an older version of virtio. Remove it now before we freeze the ABI. Signed-off-by: Rusty Russell Date: Mon, 4 Feb 2008 23:50:01 -0500 Subject: [PATCH 08/25] virtio: Net header needs hdr_len It's far easier to deal with packets if we don't have to parse the packet to figure out the header length to know how much to pull into the skb data. Add the field to the virtio_net_hdr struct (and fix the spaces that somehow crept in there). Signed-off-by: Rusty Russell --- drivers/net/virtio_net.c | 3 ++- include/linux/virtio_net.h | 11 ++++++----- 2 files changed, 8 insertions(+), 6 deletions(-) diff --git a/drivers/net/virtio_net.c b/drivers/net/virtio_net.c index 7b0059f0f5d4..3492ae0951de 100644 --- a/drivers/net/virtio_net.c +++ b/drivers/net/virtio_net.c @@ -242,6 +242,7 @@ static int start_xmit(struct sk_buff *skb, struct net_device *dev) } if (skb_is_gso(skb)) { + hdr->hdr_len = skb_transport_header(skb) - skb->data; hdr->gso_size = skb_shinfo(skb)->gso_size; if (skb_shinfo(skb)->gso_type & SKB_GSO_TCP_ECN) hdr->gso_type = VIRTIO_NET_HDR_GSO_TCPV4_ECN; @@ -255,7 +256,7 @@ static int start_xmit(struct sk_buff *skb, struct net_device *dev) BUG(); } else { hdr->gso_type = VIRTIO_NET_HDR_GSO_NONE; - hdr->gso_size = 0; + hdr->gso_size = hdr->hdr_len = 0; } vnet_hdr_to_sg(sg, skb); diff --git a/include/linux/virtio_net.h b/include/linux/virtio_net.h index 6e8fdfea8cd0..1456f7b936d0 100644 --- a/include/linux/virtio_net.h +++ b/include/linux/virtio_net.h @@ -24,16 +24,17 @@ struct virtio_net_config struct virtio_net_hdr { #define VIRTIO_NET_HDR_F_NEEDS_CSUM 1 // Use csum_start, csum_offset - __u8 flags; + __u8 flags; #define VIRTIO_NET_HDR_GSO_NONE 0 // Not a GSO frame #define VIRTIO_NET_HDR_GSO_TCPV4 1 // GSO frame, IPv4 TCP (TSO) /* FIXME: Do we need this? If they said they can handle ECN, do they care? */ #define VIRTIO_NET_HDR_GSO_TCPV4_ECN 2 // GSO frame, IPv4 TCP w/ ECN #define VIRTIO_NET_HDR_GSO_UDP 3 // GSO frame, IPv4 UDP (UFO) #define VIRTIO_NET_HDR_GSO_TCPV6 4 // GSO frame, IPv6 TCP - __u8 gso_type; - __u16 gso_size; - __u16 csum_start; - __u16 csum_offset; + __u8 gso_type; + __u16 hdr_len; /* Ethernet + IP + tcp/udp hdrs */ + __u16 gso_size; /* Bytes to append to gso_hdr_len per frame */ + __u16 csum_start; /* Position to start checksumming from */ + __u16 csum_offset; /* Offset after that to place checksum */ }; #endif /* _LINUX_VIRTIO_NET_H */ From 34a48579e4fb380604d06f0409db3851bd22d785 Mon Sep 17 00:00:00 2001 From: Rusty Russell Date: Mon, 4 Feb 2008 23:50:02 -0500 Subject: [PATCH 09/25] virtio: Tweak virtio_net defines 1) Turn GSO on virtio net into an all-or-nothing (keep checksumming separate). Having multiple bits is a pain: if you can't support something you should handle it in software, which is still a performance win. 2) Make VIRTIO_NET_HDR_GSO_ECN a flag in the header, so it can apply to IPv6 or v4. 3) Rename VIRTIO_NET_F_NO_CSUM to VIRTIO_NET_F_CSUM (ie. means we do checksumming). 4) Add csum and gso params to virtio_net to allow more testing. Signed-off-by: Rusty Russell --- drivers/net/virtio_net.c | 32 ++++++++++++++++---------------- include/linux/virtio_net.h | 12 ++++-------- 2 files changed, 20 insertions(+), 24 deletions(-) diff --git a/drivers/net/virtio_net.c b/drivers/net/virtio_net.c index 3492ae0951de..73f01db59ab9 100644 --- a/drivers/net/virtio_net.c +++ b/drivers/net/virtio_net.c @@ -24,6 +24,10 @@ #include #include +static int csum = 1, gso = 1; +module_param(csum, bool, 0444); +module_param(gso, bool, 0444); + /* FIXME: MTU in config. */ #define MAX_PACKET_LEN (ETH_HLEN+ETH_DATA_LEN) @@ -88,13 +92,10 @@ static void receive_skb(struct net_device *dev, struct sk_buff *skb, if (hdr->gso_type != VIRTIO_NET_HDR_GSO_NONE) { pr_debug("GSO!\n"); - switch (hdr->gso_type) { + switch (hdr->gso_type & ~VIRTIO_NET_HDR_GSO_ECN) { case VIRTIO_NET_HDR_GSO_TCPV4: skb_shinfo(skb)->gso_type = SKB_GSO_TCPV4; break; - case VIRTIO_NET_HDR_GSO_TCPV4_ECN: - skb_shinfo(skb)->gso_type = SKB_GSO_TCP_ECN; - break; case VIRTIO_NET_HDR_GSO_UDP: skb_shinfo(skb)->gso_type = SKB_GSO_UDP; break; @@ -108,6 +109,9 @@ static void receive_skb(struct net_device *dev, struct sk_buff *skb, goto frame_err; } + if (hdr->gso_type & VIRTIO_NET_HDR_GSO_ECN) + skb_shinfo(skb)->gso_type |= SKB_GSO_TCP_ECN; + skb_shinfo(skb)->gso_size = hdr->gso_size; if (skb_shinfo(skb)->gso_size == 0) { if (net_ratelimit()) @@ -244,9 +248,7 @@ static int start_xmit(struct sk_buff *skb, struct net_device *dev) if (skb_is_gso(skb)) { hdr->hdr_len = skb_transport_header(skb) - skb->data; hdr->gso_size = skb_shinfo(skb)->gso_size; - if (skb_shinfo(skb)->gso_type & SKB_GSO_TCP_ECN) - hdr->gso_type = VIRTIO_NET_HDR_GSO_TCPV4_ECN; - else if (skb_shinfo(skb)->gso_type & SKB_GSO_TCPV4) + if (skb_shinfo(skb)->gso_type & SKB_GSO_TCPV4) hdr->gso_type = VIRTIO_NET_HDR_GSO_TCPV4; else if (skb_shinfo(skb)->gso_type & SKB_GSO_TCPV6) hdr->gso_type = VIRTIO_NET_HDR_GSO_TCPV6; @@ -254,6 +256,8 @@ static int start_xmit(struct sk_buff *skb, struct net_device *dev) hdr->gso_type = VIRTIO_NET_HDR_GSO_UDP; else BUG(); + if (skb_shinfo(skb)->gso_type & SKB_GSO_TCP_ECN) + hdr->gso_type |= VIRTIO_NET_HDR_GSO_ECN; } else { hdr->gso_type = VIRTIO_NET_HDR_GSO_NONE; hdr->gso_size = hdr->hdr_len = 0; @@ -330,17 +334,13 @@ static int virtnet_probe(struct virtio_device *vdev) SET_NETDEV_DEV(dev, &vdev->dev); /* Do we support "hardware" checksums? */ - if (vdev->config->feature(vdev, VIRTIO_NET_F_NO_CSUM)) { + if (csum && vdev->config->feature(vdev, VIRTIO_NET_F_CSUM)) { /* This opens up the world of extra features. */ dev->features |= NETIF_F_HW_CSUM|NETIF_F_SG|NETIF_F_FRAGLIST; - if (vdev->config->feature(vdev, VIRTIO_NET_F_TSO4)) - dev->features |= NETIF_F_TSO; - if (vdev->config->feature(vdev, VIRTIO_NET_F_UFO)) - dev->features |= NETIF_F_UFO; - if (vdev->config->feature(vdev, VIRTIO_NET_F_TSO4_ECN)) - dev->features |= NETIF_F_TSO_ECN; - if (vdev->config->feature(vdev, VIRTIO_NET_F_TSO6)) - dev->features |= NETIF_F_TSO6; + if (gso && vdev->config->feature(vdev, VIRTIO_NET_F_GSO)) { + dev->features |= NETIF_F_TSO | NETIF_F_UFO + | NETIF_F_TSO_ECN | NETIF_F_TSO6; + } } /* Configuration may specify what MAC to use. Otherwise random. */ diff --git a/include/linux/virtio_net.h b/include/linux/virtio_net.h index 1456f7b936d0..1ea3351df609 100644 --- a/include/linux/virtio_net.h +++ b/include/linux/virtio_net.h @@ -6,12 +6,9 @@ #define VIRTIO_ID_NET 1 /* The feature bitmap for virtio net */ -#define VIRTIO_NET_F_NO_CSUM 0 -#define VIRTIO_NET_F_TSO4 1 -#define VIRTIO_NET_F_UFO 2 -#define VIRTIO_NET_F_TSO4_ECN 3 -#define VIRTIO_NET_F_TSO6 4 -#define VIRTIO_NET_F_MAC 5 +#define VIRTIO_NET_F_CSUM 0 /* Can handle pkts w/ partial csum */ +#define VIRTIO_NET_F_MAC 5 /* Host has given MAC address. */ +#define VIRTIO_NET_F_GSO 6 /* Can handle pkts w/ any GSO type */ struct virtio_net_config { @@ -27,10 +24,9 @@ struct virtio_net_hdr __u8 flags; #define VIRTIO_NET_HDR_GSO_NONE 0 // Not a GSO frame #define VIRTIO_NET_HDR_GSO_TCPV4 1 // GSO frame, IPv4 TCP (TSO) -/* FIXME: Do we need this? If they said they can handle ECN, do they care? */ -#define VIRTIO_NET_HDR_GSO_TCPV4_ECN 2 // GSO frame, IPv4 TCP w/ ECN #define VIRTIO_NET_HDR_GSO_UDP 3 // GSO frame, IPv4 UDP (UFO) #define VIRTIO_NET_HDR_GSO_TCPV6 4 // GSO frame, IPv6 TCP +#define VIRTIO_NET_HDR_GSO_ECN 0x80 // TCP has ECN set __u8 gso_type; __u16 hdr_len; /* Ethernet + IP + tcp/udp hdrs */ __u16 gso_size; /* Bytes to append to gso_hdr_len per frame */ From b3369c1fb410fddeb38a404316c861395f6d6ae8 Mon Sep 17 00:00:00 2001 From: Rusty Russell Date: Mon, 4 Feb 2008 23:50:02 -0500 Subject: [PATCH 10/25] virtio: populate network rings in the probe routine, not open Since we want to reset the device to remove them, this is simpler (device is reset for us on driver remove). Signed-off-by: Rusty Russell --- drivers/net/virtio_net.c | 44 +++++++++++++++++++++++----------------- 1 file changed, 25 insertions(+), 19 deletions(-) diff --git a/drivers/net/virtio_net.c b/drivers/net/virtio_net.c index 73f01db59ab9..ec43284ffd13 100644 --- a/drivers/net/virtio_net.c +++ b/drivers/net/virtio_net.c @@ -282,12 +282,6 @@ static int virtnet_open(struct net_device *dev) { struct virtnet_info *vi = netdev_priv(dev); - try_fill_recv(vi); - - /* If we didn't even get one input buffer, we're useless. */ - if (vi->num == 0) - return -ENOMEM; - napi_enable(&vi->napi); return 0; } @@ -295,22 +289,9 @@ static int virtnet_open(struct net_device *dev) static int virtnet_close(struct net_device *dev) { struct virtnet_info *vi = netdev_priv(dev); - struct sk_buff *skb; napi_disable(&vi->napi); - /* networking core has neutered skb_xmit_done/skb_recv_done, so don't - * worry about races vs. get(). */ - vi->rvq->vq_ops->shutdown(vi->rvq); - while ((skb = __skb_dequeue(&vi->recv)) != NULL) { - kfree_skb(skb); - vi->num--; - } - vi->svq->vq_ops->shutdown(vi->svq); - while ((skb = __skb_dequeue(&vi->send)) != NULL) - kfree_skb(skb); - - BUG_ON(vi->num != 0); return 0; } @@ -379,10 +360,22 @@ static int virtnet_probe(struct virtio_device *vdev) pr_debug("virtio_net: registering device failed\n"); goto free_send; } + + /* Last of all, set up some receive buffers. */ + try_fill_recv(vi); + + /* If we didn't even get one input buffer, we're useless. */ + if (vi->num == 0) { + err = -ENOMEM; + goto unregister; + } + pr_debug("virtnet: registered device %s\n", dev->name); vdev->priv = vi; return 0; +unregister: + unregister_netdev(dev); free_send: vdev->config->del_vq(vi->svq); free_recv: @@ -395,6 +388,19 @@ free: static void virtnet_remove(struct virtio_device *vdev) { struct virtnet_info *vi = vdev->priv; + struct sk_buff *skb; + + /* Free our skbs in send and recv queues, if any. */ + vi->rvq->vq_ops->shutdown(vi->rvq); + while ((skb = __skb_dequeue(&vi->recv)) != NULL) { + kfree_skb(skb); + vi->num--; + } + vi->svq->vq_ops->shutdown(vi->svq); + while ((skb = __skb_dequeue(&vi->send)) != NULL) + kfree_skb(skb); + + BUG_ON(vi->num != 0); vdev->config->del_vq(vi->svq); vdev->config->del_vq(vi->rvq); From 6e5aa7efb27aec7e55b6463fa2c8db594c4226fa Mon Sep 17 00:00:00 2001 From: Rusty Russell Date: Mon, 4 Feb 2008 23:50:03 -0500 Subject: [PATCH 11/25] virtio: reset function A reset function solves three problems: 1) It allows us to renegotiate features, eg. if we want to upgrade a guest driver without rebooting the guest. 2) It gives us a clean way of shutting down virtqueues: after a reset, we know that the buffers won't be used by the host, and 3) It helps the guest recover from messed-up drivers. So we remove the ->shutdown hook, and the only way we now remove feature bits is via reset. We leave it to the driver to do the reset before it deletes queues: the balloon driver, for example, needs to chat to the host in its remove function. Signed-off-by: Rusty Russell --- Documentation/lguest/lguest.c | 62 ++++++++++++++++++++++++++++------ drivers/block/virtio_blk.c | 6 +++- drivers/lguest/lguest_device.c | 14 +++++++- drivers/net/virtio_net.c | 5 +-- drivers/virtio/virtio.c | 12 +++++-- drivers/virtio/virtio_ring.c | 11 ------ include/linux/virtio.h | 5 --- include/linux/virtio_config.h | 4 +++ 8 files changed, 87 insertions(+), 32 deletions(-) diff --git a/Documentation/lguest/lguest.c b/Documentation/lguest/lguest.c index 8ff2d8bc690a..0f23d67f958f 100644 --- a/Documentation/lguest/lguest.c +++ b/Documentation/lguest/lguest.c @@ -193,6 +193,13 @@ static void *_convert(struct iovec *iov, size_t size, size_t align, #define le32_to_cpu(v32) (v32) #define le64_to_cpu(v64) (v64) +/* The device virtqueue descriptors are followed by feature bitmasks. */ +static u8 *get_feature_bits(struct device *dev) +{ + return (u8 *)(dev->desc + 1) + + dev->desc->num_vq * sizeof(struct lguest_vqconfig); +} + /*L:100 The Launcher code itself takes us out into userspace, that scary place * where pointers run wild and free! Unfortunately, like most userspace * programs, it's quite boring (which is why everyone likes to hack on the @@ -914,21 +921,58 @@ static void enable_fd(int fd, struct virtqueue *vq) write(waker_fd, &vq->dev->fd, sizeof(vq->dev->fd)); } +/* Resetting a device is fairly easy. */ +static void reset_device(struct device *dev) +{ + struct virtqueue *vq; + + verbose("Resetting device %s\n", dev->name); + /* Clear the status. */ + dev->desc->status = 0; + + /* Clear any features they've acked. */ + memset(get_feature_bits(dev) + dev->desc->feature_len, 0, + dev->desc->feature_len); + + /* Zero out the virtqueues. */ + for (vq = dev->vq; vq; vq = vq->next) { + memset(vq->vring.desc, 0, + vring_size(vq->config.num, getpagesize())); + vq->last_avail_idx = 0; + } +} + /* This is the generic routine we call when the Guest uses LHCALL_NOTIFY. */ static void handle_output(int fd, unsigned long addr) { struct device *i; struct virtqueue *vq; - /* Check each virtqueue. */ + /* Check each device and virtqueue. */ for (i = devices.dev; i; i = i->next) { + /* Notifications to device descriptors reset the device. */ + if (from_guest_phys(addr) == i->desc) { + reset_device(i); + return; + } + + /* Notifications to virtqueues mean output has occurred. */ for (vq = i->vq; vq; vq = vq->next) { - if (vq->config.pfn == addr/getpagesize()) { - verbose("Output to %s\n", vq->dev->name); - if (vq->handle_output) - vq->handle_output(fd, vq); + if (vq->config.pfn != addr/getpagesize()) + continue; + + /* Guest should acknowledge (and set features!) before + * using the device. */ + if (i->desc->status == 0) { + warnx("%s gave early output", i->name); return; } + + if (strcmp(vq->dev->name, "console") != 0) + verbose("Output to %s\n", vq->dev->name); + if (vq->handle_output) + vq->handle_output(fd, vq); + return; } } @@ -1074,10 +1118,11 @@ static void add_virtqueue(struct device *dev, unsigned int num_descs, vq->vring.used->flags = VRING_USED_F_NO_NOTIFY; } -/* The virtqueue descriptors are followed by feature bytes. */ +/* The first half of the feature bitmask is for us to advertise features. The + * second half if for the Guest to accept features. */ static void add_feature(struct device *dev, unsigned bit) { - u8 *features; + u8 *features = get_feature_bits(dev); /* We can't extend the feature bits once we've added config bytes */ if (dev->desc->feature_len <= bit / CHAR_BIT) { @@ -1085,9 +1130,6 @@ static void add_feature(struct device *dev, unsigned bit) dev->desc->feature_len = (bit / CHAR_BIT) + 1; } - features = (u8 *)(dev->desc + 1) - + dev->desc->num_vq * sizeof(struct lguest_vqconfig); - features[bit / CHAR_BIT] |= (1 << (bit % CHAR_BIT)); } diff --git a/drivers/block/virtio_blk.c b/drivers/block/virtio_blk.c index 54a8017ad487..6143337527e7 100644 --- a/drivers/block/virtio_blk.c +++ b/drivers/block/virtio_blk.c @@ -264,12 +264,16 @@ static void virtblk_remove(struct virtio_device *vdev) struct virtio_blk *vblk = vdev->priv; int major = vblk->disk->major; + /* Nothing should be pending. */ BUG_ON(!list_empty(&vblk->reqs)); + + /* Stop all the virtqueues. */ + vdev->config->reset(vdev); + blk_cleanup_queue(vblk->disk->queue); put_disk(vblk->disk); unregister_blkdev(major, "virtblk"); mempool_destroy(vblk->pool); - /* There should be nothing in the queue now, so no need to shutdown */ vdev->config->del_vq(vblk->vq); kfree(vblk); } diff --git a/drivers/lguest/lguest_device.c b/drivers/lguest/lguest_device.c index ced5b44cebce..84f85e23cca7 100644 --- a/drivers/lguest/lguest_device.c +++ b/drivers/lguest/lguest_device.c @@ -54,7 +54,7 @@ struct lguest_device { * * The configuration information for a device consists of one or more * virtqueues, a feature bitmaks, and some configuration bytes. The - * configuration bytes don't really matter to us: the Launcher set them up, and + * configuration bytes don't really matter to us: the Launcher sets them up, and * the driver will look at them during setup. * * A convenient routine to return the device's virtqueue config array: @@ -139,9 +139,20 @@ static u8 lg_get_status(struct virtio_device *vdev) static void lg_set_status(struct virtio_device *vdev, u8 status) { + BUG_ON(!status); to_lgdev(vdev)->desc->status = status; } +/* To reset the device, we (ab)use the NOTIFY hypercall, with the descriptor + * address of the device. The Host will zero the status and all the + * features. */ +static void lg_reset(struct virtio_device *vdev) +{ + unsigned long offset = (void *)to_lgdev(vdev)->desc - lguest_devices; + + hcall(LHCALL_NOTIFY, (max_pfn<priv; struct sk_buff *skb; + /* Stop all the virtqueues. */ + vdev->config->reset(vdev); + /* Free our skbs in send and recv queues, if any. */ - vi->rvq->vq_ops->shutdown(vi->rvq); while ((skb = __skb_dequeue(&vi->recv)) != NULL) { kfree_skb(skb); vi->num--; } - vi->svq->vq_ops->shutdown(vi->svq); while ((skb = __skb_dequeue(&vi->send)) != NULL) kfree_skb(skb); diff --git a/drivers/virtio/virtio.c b/drivers/virtio/virtio.c index 303cb6f90108..7dddb1860936 100644 --- a/drivers/virtio/virtio.c +++ b/drivers/virtio/virtio.c @@ -102,9 +102,13 @@ static int virtio_dev_remove(struct device *_d) struct virtio_driver *drv = container_of(dev->dev.driver, struct virtio_driver, driver); - dev->config->set_status(dev, dev->config->get_status(dev) - & ~VIRTIO_CONFIG_S_DRIVER); drv->remove(dev); + + /* Driver should have reset device. */ + BUG_ON(dev->config->get_status(dev)); + + /* Acknowledge the device's existence again. */ + add_status(dev, VIRTIO_CONFIG_S_ACKNOWLEDGE); return 0; } @@ -130,6 +134,10 @@ int register_virtio_device(struct virtio_device *dev) dev->dev.bus = &virtio_bus; sprintf(dev->dev.bus_id, "%u", dev->index); + /* We always start by resetting the device, in case a previous + * driver messed it up. This also tests that code path a little. */ + dev->config->reset(dev); + /* Acknowledge that we've seen the device. */ add_status(dev, VIRTIO_CONFIG_S_ACKNOWLEDGE); diff --git a/drivers/virtio/virtio_ring.c b/drivers/virtio/virtio_ring.c index dbe1d35db32a..9849babd6b37 100644 --- a/drivers/virtio/virtio_ring.c +++ b/drivers/virtio/virtio_ring.c @@ -173,16 +173,6 @@ static void detach_buf(struct vring_virtqueue *vq, unsigned int head) vq->num_free++; } -/* FIXME: We need to tell other side about removal, to synchronize. */ -static void vring_shutdown(struct virtqueue *_vq) -{ - struct vring_virtqueue *vq = to_vvq(_vq); - unsigned int i; - - for (i = 0; i < vq->vring.num; i++) - detach_buf(vq, i); -} - static inline bool more_used(const struct vring_virtqueue *vq) { return vq->last_used_idx != vq->vring.used->idx; @@ -278,7 +268,6 @@ static struct virtqueue_ops vring_vq_ops = { .kick = vring_kick, .disable_cb = vring_disable_cb, .enable_cb = vring_enable_cb, - .shutdown = vring_shutdown, }; struct virtqueue *vring_new_virtqueue(unsigned int num, diff --git a/include/linux/virtio.h b/include/linux/virtio.h index 78408d5237c1..260d1fcf29a4 100644 --- a/include/linux/virtio.h +++ b/include/linux/virtio.h @@ -45,9 +45,6 @@ struct virtqueue * vq: the struct virtqueue we're talking about. * This returns "false" (and doesn't re-enable) if there are pending * buffers in the queue, to avoid a race. - * @shutdown: "unadd" all buffers. - * vq: the struct virtqueue we're talking about. - * Remove everything from the queue. * * Locking rules are straightforward: the driver is responsible for * locking. No two operations may be invoked simultaneously. @@ -67,8 +64,6 @@ struct virtqueue_ops { void (*disable_cb)(struct virtqueue *vq); bool (*enable_cb)(struct virtqueue *vq); - - void (*shutdown)(struct virtqueue *vq); }; /** diff --git a/include/linux/virtio_config.h b/include/linux/virtio_config.h index 81f828ac8f47..d581b2914b34 100644 --- a/include/linux/virtio_config.h +++ b/include/linux/virtio_config.h @@ -43,6 +43,9 @@ struct virtio_device; * @set_status: write the status byte * vdev: the virtio_device * status: the new status byte + * @reset: reset the device + * vdev: the virtio device + * After this, status and feature negotiation must be done again * @find_vq: find a virtqueue and instantiate it. * vdev: the virtio_device * index: the 0-based virtqueue number in case there's more than one. @@ -59,6 +62,7 @@ struct virtio_config_ops const void *buf, unsigned len); u8 (*get_status)(struct virtio_device *vdev); void (*set_status)(struct virtio_device *vdev, u8 status); + void (*reset)(struct virtio_device *vdev); struct virtqueue *(*find_vq)(struct virtio_device *vdev, unsigned index, void (*callback)(struct virtqueue *)); From 81a8deab1ce3816c6a89e3429e234e7d3686da94 Mon Sep 17 00:00:00 2001 From: Rusty Russell Date: Mon, 4 Feb 2008 23:50:04 -0500 Subject: [PATCH 12/25] virtio: handle interrupts after callbacks turned off Anthony Liguori found double interrupt suppression in the virtio_net driver, triggered by two skb_recv_done's in a row. This is because virtio_ring's interrupt suppression is a best-effort optimization: it contains no synchronization so the host can miss it and still send interrupts. But it's certainly nicer for virtio users if calling disable_cb actually disables callbacks, so we check for the race in the interrupt routine. Note: SMP guests might require syncronization here, but since disable_cb is actually called from interrupt context, there has to be some form of synchronization before the next same interrupt handler is called (Linux guarantees that the same device's irq handler will never run simultanously on multiple CPUs). Signed-off-by: Rusty Russell --- drivers/virtio/virtio_ring.c | 7 +++++++ 1 file changed, 7 insertions(+) diff --git a/drivers/virtio/virtio_ring.c b/drivers/virtio/virtio_ring.c index 9849babd6b37..9859213aa658 100644 --- a/drivers/virtio/virtio_ring.c +++ b/drivers/virtio/virtio_ring.c @@ -255,6 +255,13 @@ irqreturn_t vring_interrupt(int irq, void *_vq) if (unlikely(vq->broken)) return IRQ_HANDLED; + /* Other side may have missed us turning off the interrupt, + * but we should preserve disable semantic for virtio users. */ + if (unlikely(vq->vring.avail->flags & VRING_AVAIL_F_NO_INTERRUPT)) { + pr_debug("virtqueue interrupt after disable for %p\n", vq); + return IRQ_HANDLED; + } + pr_debug("virtqueue callback for %p (%p)\n", vq, vq->vq.callback); if (vq->vq.callback) vq->vq.callback(&vq->vq); From 0ad07ec1fd2f3e90dfb086b04c07a83db40e9172 Mon Sep 17 00:00:00 2001 From: Anthony Liguori Date: Wed, 7 Nov 2007 20:46:31 -0600 Subject: [PATCH 13/25] virtio: Put the virtio under the virtualization menu This patch moves virtio under the virtualization menu and changes virtio devices to not claim to only be for lguest. Signed-off-by: Anthony Liguori Signed-off-by: Rusty Russell --- arch/x86/kvm/Kconfig | 1 + drivers/Kconfig | 2 -- drivers/block/Kconfig | 3 ++- drivers/net/Kconfig | 3 ++- 4 files changed, 5 insertions(+), 4 deletions(-) diff --git a/arch/x86/kvm/Kconfig b/arch/x86/kvm/Kconfig index c83e1c9b5129..41962e793c0f 100644 --- a/arch/x86/kvm/Kconfig +++ b/arch/x86/kvm/Kconfig @@ -53,5 +53,6 @@ config KVM_AMD # OK, it's a little counter-intuitive to do this, but it puts it neatly under # the virtualization menu. source drivers/lguest/Kconfig +source drivers/virtio/Kconfig endif # VIRTUALIZATION diff --git a/drivers/Kconfig b/drivers/Kconfig index 08d4ae201597..3f8a231fe754 100644 --- a/drivers/Kconfig +++ b/drivers/Kconfig @@ -91,6 +91,4 @@ source "drivers/dca/Kconfig" source "drivers/auxdisplay/Kconfig" source "drivers/uio/Kconfig" - -source "drivers/virtio/Kconfig" endmenu diff --git a/drivers/block/Kconfig b/drivers/block/Kconfig index f2122855d4ec..64e5148d82bc 100644 --- a/drivers/block/Kconfig +++ b/drivers/block/Kconfig @@ -440,6 +440,7 @@ config VIRTIO_BLK tristate "Virtio block driver (EXPERIMENTAL)" depends on EXPERIMENTAL && VIRTIO ---help--- - This is the virtual block driver for lguest. Say Y or M. + This is the virtual block driver for virtio. It can be used with + lguest or QEMU based VMMs (like KVM or Xen). Say Y or M. endif # BLK_DEV diff --git a/drivers/net/Kconfig b/drivers/net/Kconfig index 389980f0e59e..e9d761cbde93 100644 --- a/drivers/net/Kconfig +++ b/drivers/net/Kconfig @@ -3113,6 +3113,7 @@ config VIRTIO_NET tristate "Virtio network driver (EXPERIMENTAL)" depends on EXPERIMENTAL && VIRTIO ---help--- - This is the virtual network driver for lguest. Say Y or M. + This is the virtual network driver for virtio. It can be used with + lguest or QEMU based VMMs (like KVM or Xen). Say Y or M. endif # NETDEVICES From 15f9c8903cbdb02aee0f1bcf86a97c2e238b9a3d Mon Sep 17 00:00:00 2001 From: Rusty Russell Date: Mon, 4 Feb 2008 23:50:05 -0500 Subject: [PATCH 14/25] virtio: Use the sg_phys convenience function. Simple cleanup. Signed-off-by: Rusty Russell --- drivers/virtio/virtio_ring.c | 6 ++---- 1 file changed, 2 insertions(+), 4 deletions(-) diff --git a/drivers/virtio/virtio_ring.c b/drivers/virtio/virtio_ring.c index 9859213aa658..74c245092b5c 100644 --- a/drivers/virtio/virtio_ring.c +++ b/drivers/virtio/virtio_ring.c @@ -99,16 +99,14 @@ static int vring_add_buf(struct virtqueue *_vq, head = vq->free_head; for (i = vq->free_head; out; i = vq->vring.desc[i].next, out--) { vq->vring.desc[i].flags = VRING_DESC_F_NEXT; - vq->vring.desc[i].addr = (page_to_pfn(sg_page(sg))<offset; + vq->vring.desc[i].addr = sg_phys(sg); vq->vring.desc[i].len = sg->length; prev = i; sg++; } for (; in; i = vq->vring.desc[i].next, in--) { vq->vring.desc[i].flags = VRING_DESC_F_NEXT|VRING_DESC_F_WRITE; - vq->vring.desc[i].addr = (page_to_pfn(sg_page(sg))<offset; + vq->vring.desc[i].addr = sg_phys(sg); vq->vring.desc[i].len = sg->length; prev = i; sg++; From c6fd47011b4bdebad3f1513bac75fe4895e332ee Mon Sep 17 00:00:00 2001 From: Rusty Russell Date: Mon, 4 Feb 2008 23:50:05 -0500 Subject: [PATCH 15/25] virtio: Allow virtio to be modular and used by modules This is needed for the virtio PCI device to be compiled as a module. Signed-off-by: Anthony Liguori Signed-off-by: Rusty Russell --- drivers/virtio/Kconfig | 4 ++-- drivers/virtio/virtio.c | 8 ++++++++ drivers/virtio/virtio_ring.c | 4 ++++ 3 files changed, 14 insertions(+), 2 deletions(-) diff --git a/drivers/virtio/Kconfig b/drivers/virtio/Kconfig index 9e33fc4da875..de0c8c2654ef 100644 --- a/drivers/virtio/Kconfig +++ b/drivers/virtio/Kconfig @@ -1,8 +1,8 @@ # Virtio always gets selected by whoever wants it. config VIRTIO - bool + tristate # Similarly the virtio ring implementation. config VIRTIO_RING - bool + tristate depends on VIRTIO diff --git a/drivers/virtio/virtio.c b/drivers/virtio/virtio.c index 7dddb1860936..b535483bc556 100644 --- a/drivers/virtio/virtio.c +++ b/drivers/virtio/virtio.c @@ -162,4 +162,12 @@ static int virtio_init(void) panic("virtio bus registration failed"); return 0; } + +static void __exit virtio_exit(void) +{ + bus_unregister(&virtio_bus); +} core_initcall(virtio_init); +module_exit(virtio_exit); + +MODULE_LICENSE("GPL"); diff --git a/drivers/virtio/virtio_ring.c b/drivers/virtio/virtio_ring.c index 74c245092b5c..3a28c1382131 100644 --- a/drivers/virtio/virtio_ring.c +++ b/drivers/virtio/virtio_ring.c @@ -266,6 +266,7 @@ irqreturn_t vring_interrupt(int irq, void *_vq) return IRQ_HANDLED; } +EXPORT_SYMBOL_GPL(vring_interrupt); static struct virtqueue_ops vring_vq_ops = { .add_buf = vring_add_buf, @@ -318,9 +319,12 @@ struct virtqueue *vring_new_virtqueue(unsigned int num, return &vq->vq; } +EXPORT_SYMBOL_GPL(vring_new_virtqueue); void vring_del_virtqueue(struct virtqueue *vq) { kfree(to_vvq(vq)); } +EXPORT_SYMBOL_GPL(vring_del_virtqueue); +MODULE_LICENSE("GPL"); From e70f2f1bb8c8cf08ba0d836466f77eea27dd3474 Mon Sep 17 00:00:00 2001 From: Christian Borntraeger Date: Thu, 6 Dec 2007 15:21:46 +0100 Subject: [PATCH 16/25] virtnet: remove double ether_setup Hello Rusty, virtnet_probe already calls alloc_etherdev, which calls ether_setup. There is no need to do that again. Signed-off-by: Christian Borntraeger Signed-off-by: Rusty Russell --- drivers/net/virtio_net.c | 1 - 1 file changed, 1 deletion(-) diff --git a/drivers/net/virtio_net.c b/drivers/net/virtio_net.c index 6e0a9fefe6cb..be5688f5e931 100644 --- a/drivers/net/virtio_net.c +++ b/drivers/net/virtio_net.c @@ -307,7 +307,6 @@ static int virtnet_probe(struct virtio_device *vdev) return -ENOMEM; /* Set up network device as normal. */ - ether_setup(dev); dev->open = virtnet_open; dev->stop = virtnet_close; dev->hard_start_xmit = start_xmit; From a48bd8f67003c342e147309a331f656a5e75a5e4 Mon Sep 17 00:00:00 2001 From: Rusty Russell Date: Mon, 4 Feb 2008 23:50:07 -0500 Subject: [PATCH 17/25] virtio: flush buffers on open Fix bug found by Christian Borntraeger: if the other side fills all the registered network buffers before we enable NAPI, we will never get an interrupt. The simplest fix is to process the input queue once on open. Signed-off-by: Rusty Russell --- drivers/net/virtio_net.c | 7 +++++++ 1 file changed, 7 insertions(+) diff --git a/drivers/net/virtio_net.c b/drivers/net/virtio_net.c index be5688f5e931..bd4d26a36ead 100644 --- a/drivers/net/virtio_net.c +++ b/drivers/net/virtio_net.c @@ -283,6 +283,13 @@ static int virtnet_open(struct net_device *dev) struct virtnet_info *vi = netdev_priv(dev); napi_enable(&vi->napi); + + /* If all buffers were filled by other side before we napi_enabled, we + * won't get another interrupt, so process any outstanding packets + * now. virtnet_poll wants re-enable the queue, so we disable here. */ + vi->rvq->vq_ops->disable_cb(vi->rvq); + netif_rx_schedule(vi->dev, &vi->napi); + return 0; } From 2cb9c6bafc58cf4066cb15f0ac6989a1015a02cc Mon Sep 17 00:00:00 2001 From: Rusty Russell Date: Mon, 4 Feb 2008 23:50:07 -0500 Subject: [PATCH 18/25] virtio: free transmit skbs when notified, not on next xmit. This fixes a potential dangling xmit problem. We also suppress refill interrupts until we need them. Signed-off-by: Rusty Russell --- drivers/net/virtio_net.c | 25 +++++++++++++++++++------ 1 file changed, 19 insertions(+), 6 deletions(-) diff --git a/drivers/net/virtio_net.c b/drivers/net/virtio_net.c index bd4d26a36ead..a61c176607f4 100644 --- a/drivers/net/virtio_net.c +++ b/drivers/net/virtio_net.c @@ -56,11 +56,13 @@ static inline void vnet_hdr_to_sg(struct scatterlist *sg, struct sk_buff *skb) sg_init_one(sg, skb_vnet_hdr(skb), sizeof(struct virtio_net_hdr)); } -static void skb_xmit_done(struct virtqueue *rvq) +static void skb_xmit_done(struct virtqueue *svq) { - struct virtnet_info *vi = rvq->vdev->priv; + struct virtnet_info *vi = svq->vdev->priv; - /* In case we were waiting for output buffers. */ + /* Suppress further interrupts. */ + svq->vq_ops->disable_cb(svq); + /* We were waiting for more output buffers. */ netif_wake_queue(vi->dev); } @@ -232,8 +234,6 @@ static int start_xmit(struct sk_buff *skb, struct net_device *dev) pr_debug("%s: xmit %p %s\n", dev->name, skb, print_mac(mac, dest)); - free_old_xmit_skbs(vi); - /* Encode metadata header at front. */ hdr = skb_vnet_hdr(skb); if (skb->ip_summed == CHECKSUM_PARTIAL) { @@ -266,11 +266,24 @@ static int start_xmit(struct sk_buff *skb, struct net_device *dev) vnet_hdr_to_sg(sg, skb); num = skb_to_sgvec(skb, sg+1, 0, skb->len) + 1; __skb_queue_head(&vi->send, skb); + +again: + /* Free up any pending old buffers before queueing new ones. */ + free_old_xmit_skbs(vi); err = vi->svq->vq_ops->add_buf(vi->svq, sg, num, 0, skb); if (err) { pr_debug("%s: virtio not prepared to send\n", dev->name); - skb_unlink(skb, &vi->send); netif_stop_queue(dev); + + /* Activate callback for using skbs: if this fails it + * means some were used in the meantime. */ + if (unlikely(!vi->svq->vq_ops->enable_cb(vi->svq))) { + printk("Unlikely: restart svq failed\n"); + netif_start_queue(dev); + goto again; + } + __skb_unlink(skb, &vi->send); + return NETDEV_TX_BUSY; } vi->svq->vq_ops->kick(vi->svq); From 6c0cd7c000dc0851035c5003bf9d47733d0b257b Mon Sep 17 00:00:00 2001 From: Dor Laor Date: Sun, 16 Dec 2007 15:19:43 +0200 Subject: [PATCH 19/25] virtio_net: parametrize the napi_weight for virtio receive queue. It is done in order to improve performance. Signed-off-by: Dor Laor Signed-off-by: Rusty Russell --- drivers/net/virtio_net.c | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/drivers/net/virtio_net.c b/drivers/net/virtio_net.c index a61c176607f4..e66de0c12fc1 100644 --- a/drivers/net/virtio_net.c +++ b/drivers/net/virtio_net.c @@ -24,6 +24,9 @@ #include #include +static int napi_weight = 128; +module_param(napi_weight, int, 0444); + static int csum = 1, gso = 1; module_param(csum, bool, 0444); module_param(gso, bool, 0444); @@ -353,7 +356,7 @@ static int virtnet_probe(struct virtio_device *vdev) /* Set up our device-specific information */ vi = netdev_priv(dev); - netif_napi_add(dev, &vi->napi, virtnet_poll, 16); + netif_napi_add(dev, &vi->napi, virtnet_poll, napi_weight); vi->dev = dev; vi->vdev = vdev; From 135da0b037984c0284acdf40aaf4f7f31eb5cbd0 Mon Sep 17 00:00:00 2001 From: Christian Borntraeger Date: Wed, 23 Jan 2008 17:56:50 +0100 Subject: [PATCH 20/25] virtio_blk: provide getgeo Rusty, I currently try to make my guest boot from an virtio root device without having an external kernel. Some of the tools that I tried expect HDIO_GETGEO to work. The most interesting value is likely the geo.start value to get the offset of a partition. This value is filled by block/ioctl.c if fops->getgeo is set. This patch also fills in some standard values for heads, sectors and cylinders. Makes sense? Signed-off-by: Christian Borntraeger Signed-off-by: Rusty Russell --- drivers/block/virtio_blk.c | 15 +++++++++++++-- 1 file changed, 13 insertions(+), 2 deletions(-) diff --git a/drivers/block/virtio_blk.c b/drivers/block/virtio_blk.c index 6143337527e7..d2fe679519e4 100644 --- a/drivers/block/virtio_blk.c +++ b/drivers/block/virtio_blk.c @@ -152,9 +152,20 @@ static int virtblk_ioctl(struct inode *inode, struct file *filp, (void __user *)data); } +/* We provide getgeo only to please some old bootloader/partitioning tools */ +static int virtblk_getgeo(struct block_device *bd, struct hd_geometry *geo) +{ + /* some standard values, similar to sd */ + geo->heads = 1 << 6; + geo->sectors = 1 << 5; + geo->cylinders = get_capacity(bd->bd_disk) >> 11; + return 0; +} + static struct block_device_operations virtblk_fops = { - .ioctl = virtblk_ioctl, - .owner = THIS_MODULE, + .ioctl = virtblk_ioctl, + .owner = THIS_MODULE, + .getgeo = virtblk_getgeo, }; static int virtblk_probe(struct virtio_device *vdev) From 4f3bf19c6e8164b441faaee476e734b4f612a78d Mon Sep 17 00:00:00 2001 From: Christian Borntraeger Date: Thu, 31 Jan 2008 15:53:53 +0100 Subject: [PATCH 21/25] virtio_blk: Dont waste major numbers Rusty, currently virtio_blk uses one major number per device. While this works quite well on most systems it is wasteful and will exhaust major numbers on larger installations. This patch allocates a major number on init and will use 16 minor numbers for each disk. That will allow ~64k virtio_blk disks. Signed-off-by: Christian Borntraeger Signed-off-by: Rusty Russell --- drivers/block/virtio_blk.c | 28 ++++++++++++++++------------ 1 file changed, 16 insertions(+), 12 deletions(-) diff --git a/drivers/block/virtio_blk.c b/drivers/block/virtio_blk.c index d2fe679519e4..559c322c4ffa 100644 --- a/drivers/block/virtio_blk.c +++ b/drivers/block/virtio_blk.c @@ -7,8 +7,11 @@ #include #define VIRTIO_MAX_SG (3+MAX_PHYS_SEGMENTS) +#define PART_BITS 4 static unsigned char virtblk_index = 'a'; +static int major, minor; + struct virtio_blk { spinlock_t lock; @@ -171,10 +174,13 @@ static struct block_device_operations virtblk_fops = { static int virtblk_probe(struct virtio_device *vdev) { struct virtio_blk *vblk; - int err, major; + int err; u64 cap; u32 v; + if (minor >= 1 << MINORBITS) + return -ENOSPC; + vdev->priv = vblk = kmalloc(sizeof(*vblk), GFP_KERNEL); if (!vblk) { err = -ENOMEM; @@ -198,17 +204,11 @@ static int virtblk_probe(struct virtio_device *vdev) goto out_free_vq; } - major = register_blkdev(0, "virtblk"); - if (major < 0) { - err = major; - goto out_mempool; - } - /* FIXME: How many partitions? How long is a piece of string? */ - vblk->disk = alloc_disk(1 << 4); + vblk->disk = alloc_disk(1 << PART_BITS); if (!vblk->disk) { err = -ENOMEM; - goto out_unregister_blkdev; + goto out_mempool; } vblk->disk->queue = blk_init_queue(do_virtblk_request, &vblk->lock); @@ -219,10 +219,12 @@ static int virtblk_probe(struct virtio_device *vdev) sprintf(vblk->disk->disk_name, "vd%c", virtblk_index++); vblk->disk->major = major; - vblk->disk->first_minor = 0; + vblk->disk->first_minor = minor; vblk->disk->private_data = vblk; vblk->disk->fops = &virtblk_fops; + minor += 1 << PART_BITS; + /* If barriers are supported, tell block layer that queue is ordered */ if (vdev->config->feature(vdev, VIRTIO_BLK_F_BARRIER)) blk_queue_ordered(vblk->disk->queue, QUEUE_ORDERED_TAG, NULL); @@ -258,8 +260,6 @@ static int virtblk_probe(struct virtio_device *vdev) out_put_disk: put_disk(vblk->disk); -out_unregister_blkdev: - unregister_blkdev(major, "virtblk"); out_mempool: mempool_destroy(vblk->pool); out_free_vq: @@ -304,11 +304,15 @@ static struct virtio_driver virtio_blk = { static int __init init(void) { + major = register_blkdev(0, "virtblk"); + if (major < 0) + return major; return register_virtio_driver(&virtio_blk); } static void __exit fini(void) { + unregister_blkdev(major, "virtblk"); unregister_virtio_driver(&virtio_blk); } module_init(init); From d50ed907dc3db5bf2dd0a05b4e199a65793a3788 Mon Sep 17 00:00:00 2001 From: Christian Borntraeger Date: Fri, 1 Feb 2008 09:05:00 +0100 Subject: [PATCH 22/25] virtio_blk: implement naming for vda-vdz,vdaa-vdzz,vdaaa-vdzzz Am Freitag, 1. Februar 2008 schrieb Christian Borntraeger: > Right. I will fix that with an additional patch. This patch goes on top of the minor number patch. Please let me know if you want a merged patch: Currently virtio_blk creates the disk name combinging "vd" with 'a'++. This will give strange names after vdz. I have implemented names up to vdzzz - inspired by the sd.c code. That should be sufficient for now. There is one driver in the kernel (driver/s390/block/dasd_genhd.c) that implements names from dasda-dasdzzzz allowing even more disks. Maybe a janitor can come up with a common implementation usable for all kind of block device drivers. I have tested this patch with 100 disks - seems to work. Signed-off-by: Christian Borntraeger Signed-off-by: Rusty Russell --- drivers/block/virtio_blk.c | 29 ++++++++++++++++++++++------- 1 file changed, 22 insertions(+), 7 deletions(-) diff --git a/drivers/block/virtio_blk.c b/drivers/block/virtio_blk.c index 559c322c4ffa..3b1a68d6eddb 100644 --- a/drivers/block/virtio_blk.c +++ b/drivers/block/virtio_blk.c @@ -9,8 +9,7 @@ #define VIRTIO_MAX_SG (3+MAX_PHYS_SEGMENTS) #define PART_BITS 4 -static unsigned char virtblk_index = 'a'; -static int major, minor; +static int major, index; struct virtio_blk { @@ -171,6 +170,11 @@ static struct block_device_operations virtblk_fops = { .getgeo = virtblk_getgeo, }; +static int index_to_minor(int index) +{ + return index << PART_BITS; +} + static int virtblk_probe(struct virtio_device *vdev) { struct virtio_blk *vblk; @@ -178,7 +182,7 @@ static int virtblk_probe(struct virtio_device *vdev) u64 cap; u32 v; - if (minor >= 1 << MINORBITS) + if (index_to_minor(index) >= 1 << MINORBITS) return -ENOSPC; vdev->priv = vblk = kmalloc(sizeof(*vblk), GFP_KERNEL); @@ -217,13 +221,24 @@ static int virtblk_probe(struct virtio_device *vdev) goto out_put_disk; } - sprintf(vblk->disk->disk_name, "vd%c", virtblk_index++); + if (index < 26) { + sprintf(vblk->disk->disk_name, "vd%c", 'a' + index % 26); + } else if (index < (26 + 1) * 26) { + sprintf(vblk->disk->disk_name, "vd%c%c", + 'a' + index / 26 - 1, 'a' + index % 26); + } else { + const unsigned int m1 = (index / 26 - 1) / 26 - 1; + const unsigned int m2 = (index / 26 - 1) % 26; + const unsigned int m3 = index % 26; + sprintf(vblk->disk->disk_name, "vd%c%c%c", + 'a' + m1, 'a' + m2, 'a' + m3); + } + vblk->disk->major = major; - vblk->disk->first_minor = minor; + vblk->disk->first_minor = index_to_minor(index); vblk->disk->private_data = vblk; vblk->disk->fops = &virtblk_fops; - - minor += 1 << PART_BITS; + index++; /* If barriers are supported, tell block layer that queue is ordered */ if (vdev->config->feature(vdev, VIRTIO_BLK_F_BARRIER)) From 3343660d8c62c6b00b2f15324ef3fcb6be207bfa Mon Sep 17 00:00:00 2001 From: Anthony Liguori Date: Mon, 12 Nov 2007 21:30:26 -0600 Subject: [PATCH 23/25] virtio: PCI device This is a PCI device that implements a transport for virtio. It allows virtio devices to be used by QEMU based VMMs like KVM or Xen. Signed-off-by: Anthony Liguori Signed-off-by: Rusty Russell --- drivers/virtio/Kconfig | 17 ++ drivers/virtio/Makefile | 1 + drivers/virtio/virtio_pci.c | 440 ++++++++++++++++++++++++++++++++++++ include/linux/virtio_pci.h | 55 +++++ 4 files changed, 513 insertions(+) create mode 100644 drivers/virtio/virtio_pci.c create mode 100644 include/linux/virtio_pci.h diff --git a/drivers/virtio/Kconfig b/drivers/virtio/Kconfig index de0c8c2654ef..833db2f36e9b 100644 --- a/drivers/virtio/Kconfig +++ b/drivers/virtio/Kconfig @@ -6,3 +6,20 @@ config VIRTIO config VIRTIO_RING tristate depends on VIRTIO + +config VIRTIO_PCI + tristate "PCI driver for virtio devices (EXPERIMENTAL)" + depends on PCI && EXPERIMENTAL + select VIRTIO + select VIRTIO_RING + ---help--- + This drivers provides support for virtio based paravirtual device + drivers over PCI. This requires that your VMM has appropriate PCI + virtio backends. Most QEMU based VMMs should support these devices + (like KVM or Xen). + + Currently, the ABI is not considered stable so there is no guarantee + that this version of the driver will work with your VMM. + + If unsure, say M. + diff --git a/drivers/virtio/Makefile b/drivers/virtio/Makefile index f70e40971dd9..cc84999f3057 100644 --- a/drivers/virtio/Makefile +++ b/drivers/virtio/Makefile @@ -1,2 +1,3 @@ obj-$(CONFIG_VIRTIO) += virtio.o obj-$(CONFIG_VIRTIO_RING) += virtio_ring.o +obj-$(CONFIG_VIRTIO_PCI) += virtio_pci.o diff --git a/drivers/virtio/virtio_pci.c b/drivers/virtio/virtio_pci.c new file mode 100644 index 000000000000..192687e3a56a --- /dev/null +++ b/drivers/virtio/virtio_pci.c @@ -0,0 +1,440 @@ +/* + * Virtio PCI driver + * + * This module allows virtio devices to be used over a virtual PCI device. + * This can be used with QEMU based VMMs like KVM or Xen. + * + * Copyright IBM Corp. 2007 + * + * Authors: + * Anthony Liguori + * + * This work is licensed under the terms of the GNU GPL, version 2 or later. + * See the COPYING file in the top-level directory. + * + */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +MODULE_AUTHOR("Anthony Liguori "); +MODULE_DESCRIPTION("virtio-pci"); +MODULE_LICENSE("GPL"); +MODULE_VERSION("1"); + +/* Our device structure */ +struct virtio_pci_device +{ + struct virtio_device vdev; + struct pci_dev *pci_dev; + + /* the IO mapping for the PCI config space */ + void *ioaddr; + + /* a list of queues so we can dispatch IRQs */ + spinlock_t lock; + struct list_head virtqueues; +}; + +struct virtio_pci_vq_info +{ + /* the actual virtqueue */ + struct virtqueue *vq; + + /* the number of entries in the queue */ + int num; + + /* the index of the queue */ + int queue_index; + + /* the virtual address of the ring queue */ + void *queue; + + /* the list node for the virtqueues list */ + struct list_head node; +}; + +/* Qumranet donated their vendor ID for devices 0x1000 thru 0x10FF. */ +static struct pci_device_id virtio_pci_id_table[] = { + { 0x1af4, PCI_ANY_ID, PCI_ANY_ID, PCI_ANY_ID, 0, 0, 0 }, + { 0 }, +}; + +MODULE_DEVICE_TABLE(pci, virtio_pci_id_table); + +/* A PCI device has it's own struct device and so does a virtio device so + * we create a place for the virtio devices to show up in sysfs. I think it + * would make more sense for virtio to not insist on having it's own device. */ +static struct device virtio_pci_root = { + .parent = NULL, + .bus_id = "virtio-pci", +}; + +/* Unique numbering for devices under the kvm root */ +static unsigned int dev_index; + +/* Convert a generic virtio device to our structure */ +static struct virtio_pci_device *to_vp_device(struct virtio_device *vdev) +{ + return container_of(vdev, struct virtio_pci_device, vdev); +} + +/* virtio config->feature() implementation */ +static bool vp_feature(struct virtio_device *vdev, unsigned bit) +{ + struct virtio_pci_device *vp_dev = to_vp_device(vdev); + u32 mask; + + /* Since this function is supposed to have the side effect of + * enabling a queried feature, we simulate that by doing a read + * from the host feature bitmask and then writing to the guest + * feature bitmask */ + mask = ioread32(vp_dev->ioaddr + VIRTIO_PCI_HOST_FEATURES); + if (mask & (1 << bit)) { + mask |= (1 << bit); + iowrite32(mask, vp_dev->ioaddr + VIRTIO_PCI_GUEST_FEATURES); + } + + return !!(mask & (1 << bit)); +} + +/* virtio config->get() implementation */ +static void vp_get(struct virtio_device *vdev, unsigned offset, + void *buf, unsigned len) +{ + struct virtio_pci_device *vp_dev = to_vp_device(vdev); + void *ioaddr = vp_dev->ioaddr + VIRTIO_PCI_CONFIG + offset; + u8 *ptr = buf; + int i; + + for (i = 0; i < len; i++) + ptr[i] = ioread8(ioaddr + i); +} + +/* the config->set() implementation. it's symmetric to the config->get() + * implementation */ +static void vp_set(struct virtio_device *vdev, unsigned offset, + const void *buf, unsigned len) +{ + struct virtio_pci_device *vp_dev = to_vp_device(vdev); + void *ioaddr = vp_dev->ioaddr + VIRTIO_PCI_CONFIG + offset; + const u8 *ptr = buf; + int i; + + for (i = 0; i < len; i++) + iowrite8(ptr[i], ioaddr + i); +} + +/* config->{get,set}_status() implementations */ +static u8 vp_get_status(struct virtio_device *vdev) +{ + struct virtio_pci_device *vp_dev = to_vp_device(vdev); + return ioread8(vp_dev->ioaddr + VIRTIO_PCI_STATUS); +} + +static void vp_set_status(struct virtio_device *vdev, u8 status) +{ + struct virtio_pci_device *vp_dev = to_vp_device(vdev); + /* We should never be setting status to 0. */ + BUG_ON(status == 0); + return iowrite8(status, vp_dev->ioaddr + VIRTIO_PCI_STATUS); +} + +static void vp_reset(struct virtio_device *vdev) +{ + struct virtio_pci_device *vp_dev = to_vp_device(vdev); + /* 0 status means a reset. */ + return iowrite8(0, vp_dev->ioaddr + VIRTIO_PCI_STATUS); +} + +/* the notify function used when creating a virt queue */ +static void vp_notify(struct virtqueue *vq) +{ + struct virtio_pci_device *vp_dev = to_vp_device(vq->vdev); + struct virtio_pci_vq_info *info = vq->priv; + + /* we write the queue's selector into the notification register to + * signal the other end */ + iowrite16(info->queue_index, vp_dev->ioaddr + VIRTIO_PCI_QUEUE_NOTIFY); +} + +/* A small wrapper to also acknowledge the interrupt when it's handled. + * I really need an EIO hook for the vring so I can ack the interrupt once we + * know that we'll be handling the IRQ but before we invoke the callback since + * the callback may notify the host which results in the host attempting to + * raise an interrupt that we would then mask once we acknowledged the + * interrupt. */ +static irqreturn_t vp_interrupt(int irq, void *opaque) +{ + struct virtio_pci_device *vp_dev = opaque; + struct virtio_pci_vq_info *info; + irqreturn_t ret = IRQ_NONE; + u8 isr; + + /* reading the ISR has the effect of also clearing it so it's very + * important to save off the value. */ + isr = ioread8(vp_dev->ioaddr + VIRTIO_PCI_ISR); + + /* It's definitely not us if the ISR was not high */ + if (!isr) + return IRQ_NONE; + + /* Configuration change? Tell driver if it wants to know. */ + if (isr & VIRTIO_PCI_ISR_CONFIG) { + struct virtio_driver *drv; + drv = container_of(vp_dev->vdev.dev.driver, + struct virtio_driver, driver); + + if (drv->config_changed) + drv->config_changed(&vp_dev->vdev); + } + + spin_lock(&vp_dev->lock); + list_for_each_entry(info, &vp_dev->virtqueues, node) { + if (vring_interrupt(irq, info->vq) == IRQ_HANDLED) + ret = IRQ_HANDLED; + } + spin_unlock(&vp_dev->lock); + + return ret; +} + +/* the config->find_vq() implementation */ +static struct virtqueue *vp_find_vq(struct virtio_device *vdev, unsigned index, + void (*callback)(struct virtqueue *vq)) +{ + struct virtio_pci_device *vp_dev = to_vp_device(vdev); + struct virtio_pci_vq_info *info; + struct virtqueue *vq; + u16 num; + int err; + + /* Select the queue we're interested in */ + iowrite16(index, vp_dev->ioaddr + VIRTIO_PCI_QUEUE_SEL); + + /* Check if queue is either not available or already active. */ + num = ioread16(vp_dev->ioaddr + VIRTIO_PCI_QUEUE_NUM); + if (!num || ioread32(vp_dev->ioaddr + VIRTIO_PCI_QUEUE_PFN)) + return ERR_PTR(-ENOENT); + + /* allocate and fill out our structure the represents an active + * queue */ + info = kmalloc(sizeof(struct virtio_pci_vq_info), GFP_KERNEL); + if (!info) + return ERR_PTR(-ENOMEM); + + info->queue_index = index; + info->num = num; + + info->queue = kzalloc(PAGE_ALIGN(vring_size(num,PAGE_SIZE)), GFP_KERNEL); + if (info->queue == NULL) { + err = -ENOMEM; + goto out_info; + } + + /* activate the queue */ + iowrite32(virt_to_phys(info->queue) >> PAGE_SHIFT, + vp_dev->ioaddr + VIRTIO_PCI_QUEUE_PFN); + + /* create the vring */ + vq = vring_new_virtqueue(info->num, vdev, info->queue, + vp_notify, callback); + if (!vq) { + err = -ENOMEM; + goto out_activate_queue; + } + + vq->priv = info; + info->vq = vq; + + spin_lock(&vp_dev->lock); + list_add(&info->node, &vp_dev->virtqueues); + spin_unlock(&vp_dev->lock); + + return vq; + +out_activate_queue: + iowrite32(0, vp_dev->ioaddr + VIRTIO_PCI_QUEUE_PFN); + kfree(info->queue); +out_info: + kfree(info); + return ERR_PTR(err); +} + +/* the config->del_vq() implementation */ +static void vp_del_vq(struct virtqueue *vq) +{ + struct virtio_pci_device *vp_dev = to_vp_device(vq->vdev); + struct virtio_pci_vq_info *info = vq->priv; + + spin_lock(&vp_dev->lock); + list_del(&info->node); + spin_unlock(&vp_dev->lock); + + vring_del_virtqueue(vq); + + /* Select and deactivate the queue */ + iowrite16(info->queue_index, vp_dev->ioaddr + VIRTIO_PCI_QUEUE_SEL); + iowrite32(0, vp_dev->ioaddr + VIRTIO_PCI_QUEUE_PFN); + + kfree(info->queue); + kfree(info); +} + +static struct virtio_config_ops virtio_pci_config_ops = { + .feature = vp_feature, + .get = vp_get, + .set = vp_set, + .get_status = vp_get_status, + .set_status = vp_set_status, + .reset = vp_reset, + .find_vq = vp_find_vq, + .del_vq = vp_del_vq, +}; + +/* the PCI probing function */ +static int __devinit virtio_pci_probe(struct pci_dev *pci_dev, + const struct pci_device_id *id) +{ + struct virtio_pci_device *vp_dev; + int err; + + /* We only own devices >= 0x1000 and <= 0x103f: leave the rest. */ + if (pci_dev->device < 0x1000 || pci_dev->device > 0x103f) + return -ENODEV; + + /* allocate our structure and fill it out */ + vp_dev = kzalloc(sizeof(struct virtio_pci_device), GFP_KERNEL); + if (vp_dev == NULL) + return -ENOMEM; + + snprintf(vp_dev->vdev.dev.bus_id, BUS_ID_SIZE, "virtio%d", dev_index); + vp_dev->vdev.index = dev_index; + dev_index++; + + vp_dev->vdev.dev.parent = &virtio_pci_root; + vp_dev->vdev.config = &virtio_pci_config_ops; + vp_dev->pci_dev = pci_dev; + INIT_LIST_HEAD(&vp_dev->virtqueues); + spin_lock_init(&vp_dev->lock); + + /* enable the device */ + err = pci_enable_device(pci_dev); + if (err) + goto out; + + err = pci_request_regions(pci_dev, "virtio-pci"); + if (err) + goto out_enable_device; + + vp_dev->ioaddr = pci_iomap(pci_dev, 0, 0); + if (vp_dev->ioaddr == NULL) + goto out_req_regions; + + pci_set_drvdata(pci_dev, vp_dev); + + /* we use the subsystem vendor/device id as the virtio vendor/device + * id. this allows us to use the same PCI vendor/device id for all + * virtio devices and to identify the particular virtio driver by + * the subsytem ids */ + vp_dev->vdev.id.vendor = pci_dev->subsystem_vendor; + vp_dev->vdev.id.device = pci_dev->subsystem_device; + + /* register a handler for the queue with the PCI device's interrupt */ + err = request_irq(vp_dev->pci_dev->irq, vp_interrupt, IRQF_SHARED, + vp_dev->vdev.dev.bus_id, vp_dev); + if (err) + goto out_set_drvdata; + + /* finally register the virtio device */ + err = register_virtio_device(&vp_dev->vdev); + if (err) + goto out_req_irq; + + return 0; + +out_req_irq: + free_irq(pci_dev->irq, vp_dev); +out_set_drvdata: + pci_set_drvdata(pci_dev, NULL); + pci_iounmap(pci_dev, vp_dev->ioaddr); +out_req_regions: + pci_release_regions(pci_dev); +out_enable_device: + pci_disable_device(pci_dev); +out: + kfree(vp_dev); + return err; +} + +static void __devexit virtio_pci_remove(struct pci_dev *pci_dev) +{ + struct virtio_pci_device *vp_dev = pci_get_drvdata(pci_dev); + + free_irq(pci_dev->irq, vp_dev); + pci_set_drvdata(pci_dev, NULL); + pci_iounmap(pci_dev, vp_dev->ioaddr); + pci_release_regions(pci_dev); + pci_disable_device(pci_dev); + kfree(vp_dev); +} + +#ifdef CONFIG_PM +static int virtio_pci_suspend(struct pci_dev *pci_dev, pm_message_t state) +{ + pci_save_state(pci_dev); + pci_set_power_state(pci_dev, PCI_D3hot); + return 0; +} + +static int virtio_pci_resume(struct pci_dev *pci_dev) +{ + pci_restore_state(pci_dev); + pci_set_power_state(pci_dev, PCI_D0); + return 0; +} +#endif + +static struct pci_driver virtio_pci_driver = { + .name = "virtio-pci", + .id_table = virtio_pci_id_table, + .probe = virtio_pci_probe, + .remove = virtio_pci_remove, +#ifdef CONFIG_PM + .suspend = virtio_pci_suspend, + .resume = virtio_pci_resume, +#endif +}; + +static int __init virtio_pci_init(void) +{ + int err; + + err = device_register(&virtio_pci_root); + if (err) + return err; + + err = pci_register_driver(&virtio_pci_driver); + if (err) + device_unregister(&virtio_pci_root); + + return err; +} + +module_init(virtio_pci_init); + +static void __exit virtio_pci_exit(void) +{ + device_unregister(&virtio_pci_root); + pci_unregister_driver(&virtio_pci_driver); +} + +module_exit(virtio_pci_exit); diff --git a/include/linux/virtio_pci.h b/include/linux/virtio_pci.h new file mode 100644 index 000000000000..860eb37bfa07 --- /dev/null +++ b/include/linux/virtio_pci.h @@ -0,0 +1,55 @@ +/* + * Virtio PCI driver + * + * This module allows virtio devices to be used over a virtual PCI device. + * This can be used with QEMU based VMMs like KVM or Xen. + * + * Copyright IBM Corp. 2007 + * + * Authors: + * Anthony Liguori + * + * This work is licensed under the terms of the GNU GPL, version 2 or later. + * See the COPYING file in the top-level directory. + * + */ + +#ifndef _LINUX_VIRTIO_PCI_H +#define _LINUX_VIRTIO_PCI_H + +#include + +/* A 32-bit r/o bitmask of the features supported by the host */ +#define VIRTIO_PCI_HOST_FEATURES 0 + +/* A 32-bit r/w bitmask of features activated by the guest */ +#define VIRTIO_PCI_GUEST_FEATURES 4 + +/* A 32-bit r/w PFN for the currently selected queue */ +#define VIRTIO_PCI_QUEUE_PFN 8 + +/* A 16-bit r/o queue size for the currently selected queue */ +#define VIRTIO_PCI_QUEUE_NUM 12 + +/* A 16-bit r/w queue selector */ +#define VIRTIO_PCI_QUEUE_SEL 14 + +/* A 16-bit r/w queue notifier */ +#define VIRTIO_PCI_QUEUE_NOTIFY 16 + +/* An 8-bit device status register. */ +#define VIRTIO_PCI_STATUS 18 + +/* An 8-bit r/o interrupt status register. Reading the value will return the + * current contents of the ISR and will also clear it. This is effectively + * a read-and-acknowledge. */ +#define VIRTIO_PCI_ISR 19 + +/* The bit of the ISR which indicates a device configuration change. */ +#define VIRTIO_PCI_ISR_CONFIG 0x2 + +/* The remaining space is defined by each driver as the per-driver + * configuration space */ +#define VIRTIO_PCI_CONFIG 20 + +#endif From 55a7c066041e7850948d29ed813f62821a9ec046 Mon Sep 17 00:00:00 2001 From: Anthony Liguori Date: Mon, 28 Jan 2008 09:59:59 -0600 Subject: [PATCH 24/25] virtio: Use PCI revision field to indicate virtio PCI ABI version As Avi pointed out, as we continue to massage the virtio PCI ABI, we can make things a little more friendly to users by utilizing the PCI revision field to indicate which version of the ABI we're using. This is a hard ABI version and incrementing it will cause the guest driver to break. This is the necessary changes to virtio_pci to support this. Signed-off-by: Anthony Liguori Signed-off-by: Rusty Russell --- drivers/virtio/virtio_pci.c | 6 ++++++ include/linux/virtio_pci.h | 2 ++ 2 files changed, 8 insertions(+) diff --git a/drivers/virtio/virtio_pci.c b/drivers/virtio/virtio_pci.c index 192687e3a56a..26f787ddd5ff 100644 --- a/drivers/virtio/virtio_pci.c +++ b/drivers/virtio/virtio_pci.c @@ -311,6 +311,12 @@ static int __devinit virtio_pci_probe(struct pci_dev *pci_dev, if (pci_dev->device < 0x1000 || pci_dev->device > 0x103f) return -ENODEV; + if (pci_dev->revision != VIRTIO_PCI_ABI_VERSION) { + printk(KERN_ERR "virtio_pci: expected ABI version %d, got %d\n", + VIRTIO_PCI_ABI_VERSION, pci_dev->revision); + return -ENODEV; + } + /* allocate our structure and fill it out */ vp_dev = kzalloc(sizeof(struct virtio_pci_device), GFP_KERNEL); if (vp_dev == NULL) diff --git a/include/linux/virtio_pci.h b/include/linux/virtio_pci.h index 860eb37bfa07..b3151659cf49 100644 --- a/include/linux/virtio_pci.h +++ b/include/linux/virtio_pci.h @@ -52,4 +52,6 @@ * configuration space */ #define VIRTIO_PCI_CONFIG 20 +/* Virtio ABI version, this must match exactly */ +#define VIRTIO_PCI_ABI_VERSION 0 #endif From 6b35e40767c6c1ac783330109ae8e0c09ea6bc82 Mon Sep 17 00:00:00 2001 From: Rusty Russell Date: Mon, 4 Feb 2008 23:50:12 -0500 Subject: [PATCH 25/25] virtio: balloon driver After discussions with Anthony Liguori, it seems that the virtio balloon can be made even simpler. Here's my attempt. The device configuration tells the driver how much memory it should take from the guest (ie. balloon size). The guest feeds the page numbers it has taken via one virtqueue. A second virtqueue feeds the page numbers the driver wants back: if the device has the VIRTIO_BALLOON_F_MUST_TELL_HOST bit, then this queue is compulsory, otherwise it's advisory (and the guest can simply fault the pages back in). This driver can be enhanced later to deflate the balloon via a shrinker, oom callback or we could even go for a complete set of in-guest regulators. Signed-off-by: Rusty Russell --- drivers/virtio/Kconfig | 10 ++ drivers/virtio/Makefile | 1 + drivers/virtio/virtio_balloon.c | 284 ++++++++++++++++++++++++++++++++ include/linux/virtio_balloon.h | 18 ++ 4 files changed, 313 insertions(+) create mode 100644 drivers/virtio/virtio_balloon.c create mode 100644 include/linux/virtio_balloon.h diff --git a/drivers/virtio/Kconfig b/drivers/virtio/Kconfig index 833db2f36e9b..3dd6294d10b6 100644 --- a/drivers/virtio/Kconfig +++ b/drivers/virtio/Kconfig @@ -23,3 +23,13 @@ config VIRTIO_PCI If unsure, say M. +config VIRTIO_BALLOON + tristate "Virtio balloon driver (EXPERIMENTAL)" + select VIRTIO + select VIRTIO_RING + ---help--- + This driver supports increasing and decreasing the amount + of memory within a KVM guest. + + If unsure, say M. + diff --git a/drivers/virtio/Makefile b/drivers/virtio/Makefile index cc84999f3057..6738c446c199 100644 --- a/drivers/virtio/Makefile +++ b/drivers/virtio/Makefile @@ -1,3 +1,4 @@ obj-$(CONFIG_VIRTIO) += virtio.o obj-$(CONFIG_VIRTIO_RING) += virtio_ring.o obj-$(CONFIG_VIRTIO_PCI) += virtio_pci.o +obj-$(CONFIG_VIRTIO_BALLOON) += virtio_balloon.o diff --git a/drivers/virtio/virtio_balloon.c b/drivers/virtio/virtio_balloon.c new file mode 100644 index 000000000000..622aece1acce --- /dev/null +++ b/drivers/virtio/virtio_balloon.c @@ -0,0 +1,284 @@ +/* Virtio balloon implementation, inspired by Dor Loar and Marcelo + * Tosatti's implementations. + * + * Copyright 2008 Rusty Russell IBM Corporation + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 2 of the License, or + * (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA + */ +//#define DEBUG +#include +#include +#include +#include +#include + +struct virtio_balloon +{ + struct virtio_device *vdev; + struct virtqueue *inflate_vq, *deflate_vq; + + /* Where the ballooning thread waits for config to change. */ + wait_queue_head_t config_change; + + /* The thread servicing the balloon. */ + struct task_struct *thread; + + /* Waiting for host to ack the pages we released. */ + struct completion acked; + + /* Do we have to tell Host *before* we reuse pages? */ + bool tell_host_first; + + /* The pages we've told the Host we're not using. */ + unsigned int num_pages; + struct list_head pages; + + /* The array of pfns we tell the Host about. */ + unsigned int num_pfns; + u32 pfns[256]; +}; + +static struct virtio_device_id id_table[] = { + { VIRTIO_ID_BALLOON, VIRTIO_DEV_ANY_ID }, + { 0 }, +}; + +static void balloon_ack(struct virtqueue *vq) +{ + struct virtio_balloon *vb; + unsigned int len; + + vb = vq->vq_ops->get_buf(vq, &len); + if (vb) + complete(&vb->acked); +} + +static void tell_host(struct virtio_balloon *vb, struct virtqueue *vq) +{ + struct scatterlist sg; + + sg_init_one(&sg, vb->pfns, sizeof(vb->pfns[0]) * vb->num_pfns); + + init_completion(&vb->acked); + + /* We should always be able to add one buffer to an empty queue. */ + if (vq->vq_ops->add_buf(vq, &sg, 1, 0, vb) != 0) + BUG(); + vq->vq_ops->kick(vq); + + /* When host has read buffer, this completes via balloon_ack */ + wait_for_completion(&vb->acked); +} + +static void fill_balloon(struct virtio_balloon *vb, size_t num) +{ + /* We can only do one array worth at a time. */ + num = min(num, ARRAY_SIZE(vb->pfns)); + + for (vb->num_pfns = 0; vb->num_pfns < num; vb->num_pfns++) { + struct page *page = alloc_page(GFP_HIGHUSER | __GFP_NORETRY); + if (!page) { + if (printk_ratelimit()) + dev_printk(KERN_INFO, &vb->vdev->dev, + "Out of puff! Can't get %zu pages\n", + num); + /* Sleep for at least 1/5 of a second before retry. */ + msleep(200); + break; + } + vb->pfns[vb->num_pfns] = page_to_pfn(page); + totalram_pages--; + vb->num_pages++; + list_add(&page->lru, &vb->pages); + } + + /* Didn't get any? Oh well. */ + if (vb->num_pfns == 0) + return; + + tell_host(vb, vb->inflate_vq); +} + +static void release_pages_by_pfn(const u32 pfns[], unsigned int num) +{ + unsigned int i; + + for (i = 0; i < num; i++) { + __free_page(pfn_to_page(pfns[i])); + totalram_pages++; + } +} + +static void leak_balloon(struct virtio_balloon *vb, size_t num) +{ + struct page *page; + + /* We can only do one array worth at a time. */ + num = min(num, ARRAY_SIZE(vb->pfns)); + + for (vb->num_pfns = 0; vb->num_pfns < num; vb->num_pfns++) { + page = list_first_entry(&vb->pages, struct page, lru); + list_del(&page->lru); + vb->pfns[vb->num_pfns] = page_to_pfn(page); + vb->num_pages--; + } + + if (vb->tell_host_first) { + tell_host(vb, vb->deflate_vq); + release_pages_by_pfn(vb->pfns, vb->num_pfns); + } else { + release_pages_by_pfn(vb->pfns, vb->num_pfns); + tell_host(vb, vb->deflate_vq); + } +} + +static void virtballoon_changed(struct virtio_device *vdev) +{ + struct virtio_balloon *vb = vdev->priv; + + wake_up(&vb->config_change); +} + +static inline int towards_target(struct virtio_balloon *vb) +{ + u32 v; + __virtio_config_val(vb->vdev, + offsetof(struct virtio_balloon_config, num_pages), + &v); + return v - vb->num_pages; +} + +static void update_balloon_size(struct virtio_balloon *vb) +{ + __le32 actual = cpu_to_le32(vb->num_pages); + + vb->vdev->config->set(vb->vdev, + offsetof(struct virtio_balloon_config, actual), + &actual, sizeof(actual)); +} + +static int balloon(void *_vballoon) +{ + struct virtio_balloon *vb = _vballoon; + + set_freezable(); + while (!kthread_should_stop()) { + int diff; + + try_to_freeze(); + wait_event_interruptible(vb->config_change, + (diff = towards_target(vb)) != 0 + || kthread_should_stop()); + if (diff > 0) + fill_balloon(vb, diff); + else if (diff < 0) + leak_balloon(vb, -diff); + update_balloon_size(vb); + } + return 0; +} + +static int virtballoon_probe(struct virtio_device *vdev) +{ + struct virtio_balloon *vb; + int err; + + vdev->priv = vb = kmalloc(sizeof(*vb), GFP_KERNEL); + if (!vb) { + err = -ENOMEM; + goto out; + } + + INIT_LIST_HEAD(&vb->pages); + vb->num_pages = 0; + init_waitqueue_head(&vb->config_change); + vb->vdev = vdev; + + /* We expect two virtqueues. */ + vb->inflate_vq = vdev->config->find_vq(vdev, 0, balloon_ack); + if (IS_ERR(vb->inflate_vq)) { + err = PTR_ERR(vb->inflate_vq); + goto out_free_vb; + } + + vb->deflate_vq = vdev->config->find_vq(vdev, 1, balloon_ack); + if (IS_ERR(vb->deflate_vq)) { + err = PTR_ERR(vb->deflate_vq); + goto out_del_inflate_vq; + } + + vb->thread = kthread_run(balloon, vb, "vballoon"); + if (IS_ERR(vb->thread)) { + err = PTR_ERR(vb->thread); + goto out_del_deflate_vq; + } + + vb->tell_host_first + = vdev->config->feature(vdev, VIRTIO_BALLOON_F_MUST_TELL_HOST); + + return 0; + +out_del_deflate_vq: + vdev->config->del_vq(vb->deflate_vq); +out_del_inflate_vq: + vdev->config->del_vq(vb->inflate_vq); +out_free_vb: + kfree(vb); +out: + return err; +} + +static void virtballoon_remove(struct virtio_device *vdev) +{ + struct virtio_balloon *vb = vdev->priv; + + kthread_stop(vb->thread); + + /* There might be pages left in the balloon: free them. */ + while (vb->num_pages) + leak_balloon(vb, vb->num_pages); + + /* Now we reset the device so we can clean up the queues. */ + vdev->config->reset(vdev); + + vdev->config->del_vq(vb->deflate_vq); + vdev->config->del_vq(vb->inflate_vq); + kfree(vb); +} + +static struct virtio_driver virtio_balloon = { + .driver.name = KBUILD_MODNAME, + .driver.owner = THIS_MODULE, + .id_table = id_table, + .probe = virtballoon_probe, + .remove = __devexit_p(virtballoon_remove), + .config_changed = virtballoon_changed, +}; + +static int __init init(void) +{ + return register_virtio_driver(&virtio_balloon); +} + +static void __exit fini(void) +{ + unregister_virtio_driver(&virtio_balloon); +} +module_init(init); +module_exit(fini); + +MODULE_DEVICE_TABLE(virtio, id_table); +MODULE_DESCRIPTION("Virtio balloon driver"); +MODULE_LICENSE("GPL"); diff --git a/include/linux/virtio_balloon.h b/include/linux/virtio_balloon.h new file mode 100644 index 000000000000..979524ee75b7 --- /dev/null +++ b/include/linux/virtio_balloon.h @@ -0,0 +1,18 @@ +#ifndef _LINUX_VIRTIO_BALLOON_H +#define _LINUX_VIRTIO_BALLOON_H +#include + +/* The ID for virtio_balloon */ +#define VIRTIO_ID_BALLOON 5 + +/* The feature bitmap for virtio balloon */ +#define VIRTIO_BALLOON_F_MUST_TELL_HOST 0 /* Tell before reclaiming pages */ + +struct virtio_balloon_config +{ + /* Number of pages host wants Guest to give up. */ + __le32 num_pages; + /* Number of pages we've actually got in balloon. */ + __le32 actual; +}; +#endif /* _LINUX_VIRTIO_BALLOON_H */