From a05e8a6e90c82cec67d16fed24da0fd04ec00f32 Mon Sep 17 00:00:00 2001 From: "Michael S. Tsirkin" Date: Thu, 2 Sep 2010 17:47:43 +0300 Subject: [PATCH 1/8] qemu: e1000 fix TOR math Patch b0b900070c7cb29bbefb732ec00397abe5de6d73 made TOR valuer incorrect: the spec says it should always include the CRC field. No one seems to use this field, but better to stick to spec. Signed-off-by: Michael S. Tsirkin --- hw/e1000.c | 11 ++++++++--- 1 file changed, 8 insertions(+), 3 deletions(-) diff --git a/hw/e1000.c b/hw/e1000.c index 80b78bc618..7d7d14002f 100644 --- a/hw/e1000.c +++ b/hw/e1000.c @@ -345,7 +345,7 @@ is_vlan_txd(uint32_t txd_lower) /* FCS aka Ethernet CRC-32. We don't get it from backends and can't * fill it in, just pad descriptor length by 4 bytes unless guest - * told us to trip it off the packet. */ + * told us to strip it off the packet. */ static inline int fcs_len(E1000State *s) { @@ -690,9 +690,14 @@ e1000_receive(VLANClientState *nc, const uint8_t *buf, size_t size) s->mac_reg[GPRC]++; s->mac_reg[TPR]++; - n = s->mac_reg[TORL]; - if ((s->mac_reg[TORL] += size) < n) + /* TOR - Total Octets Received: + * This register includes bytes received in a packet from the field through the field, inclusively. + */ + n = s->mac_reg[TORL] + size + /* Always include FCS length. */ 4; + if (n < s->mac_reg[TORL]) s->mac_reg[TORH]++; + s->mac_reg[TORL] = n; n = E1000_ICS_RXT0; if ((rdt = s->mac_reg[RDT]) < s->mac_reg[RDH]) From ef4252b149cf238480c45c06dcbd567d41ee7d76 Mon Sep 17 00:00:00 2001 From: "Michael S. Tsirkin" Date: Tue, 13 Jul 2010 17:55:31 +0300 Subject: [PATCH 2/8] tap: generalize code for different vnet header len Make host vnet header length a structure field in preparation for using this support in linux kernel. Signed-off-by: Michael S. Tsirkin --- net/tap-linux.h | 6 ++++++ net/tap.c | 28 ++++++++++++++-------------- 2 files changed, 20 insertions(+), 14 deletions(-) diff --git a/net/tap-linux.h b/net/tap-linux.h index 9f943589bf..727fcf5010 100644 --- a/net/tap-linux.h +++ b/net/tap-linux.h @@ -52,4 +52,10 @@ struct virtio_net_hdr uint16_t csum_offset; }; +struct virtio_net_hdr_mrg_rxbuf +{ + struct virtio_net_hdr hdr; + uint16_t num_buffers; /* Number of merged rx buffers */ +}; + #endif /* QEMU_TAP_H */ diff --git a/net/tap.c b/net/tap.c index 0147dabc15..2866ff40e8 100644 --- a/net/tap.c +++ b/net/tap.c @@ -57,10 +57,10 @@ typedef struct TAPState { uint8_t buf[TAP_BUFSIZE]; unsigned int read_poll : 1; unsigned int write_poll : 1; - unsigned int has_vnet_hdr : 1; unsigned int using_vnet_hdr : 1; unsigned int has_ufo: 1; VHostNetState *vhost_net; + unsigned host_vnet_hdr_len; } TAPState; static int launch_script(const char *setup_script, const char *ifname, int fd); @@ -121,11 +121,11 @@ static ssize_t tap_receive_iov(VLANClientState *nc, const struct iovec *iov, TAPState *s = DO_UPCAST(TAPState, nc, nc); const struct iovec *iovp = iov; struct iovec iov_copy[iovcnt + 1]; - struct virtio_net_hdr hdr = { 0, }; + struct virtio_net_hdr_mrg_rxbuf hdr = { }; - if (s->has_vnet_hdr && !s->using_vnet_hdr) { + if (s->host_vnet_hdr_len && !s->using_vnet_hdr) { iov_copy[0].iov_base = &hdr; - iov_copy[0].iov_len = sizeof(hdr); + iov_copy[0].iov_len = s->host_vnet_hdr_len; memcpy(&iov_copy[1], iov, iovcnt * sizeof(*iov)); iovp = iov_copy; iovcnt++; @@ -139,11 +139,11 @@ static ssize_t tap_receive_raw(VLANClientState *nc, const uint8_t *buf, size_t s TAPState *s = DO_UPCAST(TAPState, nc, nc); struct iovec iov[2]; int iovcnt = 0; - struct virtio_net_hdr hdr = { 0, }; + struct virtio_net_hdr_mrg_rxbuf hdr = { }; - if (s->has_vnet_hdr) { + if (s->host_vnet_hdr_len) { iov[iovcnt].iov_base = &hdr; - iov[iovcnt].iov_len = sizeof(hdr); + iov[iovcnt].iov_len = s->host_vnet_hdr_len; iovcnt++; } @@ -159,7 +159,7 @@ static ssize_t tap_receive(VLANClientState *nc, const uint8_t *buf, size_t size) TAPState *s = DO_UPCAST(TAPState, nc, nc); struct iovec iov[1]; - if (s->has_vnet_hdr && !s->using_vnet_hdr) { + if (s->host_vnet_hdr_len && !s->using_vnet_hdr) { return tap_receive_raw(nc, buf, size); } @@ -202,9 +202,9 @@ static void tap_send(void *opaque) break; } - if (s->has_vnet_hdr && !s->using_vnet_hdr) { - buf += sizeof(struct virtio_net_hdr); - size -= sizeof(struct virtio_net_hdr); + if (s->host_vnet_hdr_len && !s->using_vnet_hdr) { + buf += s->host_vnet_hdr_len; + size -= s->host_vnet_hdr_len; } size = qemu_send_packet_async(&s->nc, buf, size, tap_send_completed); @@ -229,7 +229,7 @@ int tap_has_vnet_hdr(VLANClientState *nc) assert(nc->info->type == NET_CLIENT_TYPE_TAP); - return s->has_vnet_hdr; + return !!s->host_vnet_hdr_len; } void tap_using_vnet_hdr(VLANClientState *nc, int using_vnet_hdr) @@ -239,7 +239,7 @@ void tap_using_vnet_hdr(VLANClientState *nc, int using_vnet_hdr) using_vnet_hdr = using_vnet_hdr != 0; assert(nc->info->type == NET_CLIENT_TYPE_TAP); - assert(s->has_vnet_hdr == using_vnet_hdr); + assert(!!s->host_vnet_hdr_len == using_vnet_hdr); s->using_vnet_hdr = using_vnet_hdr; } @@ -310,7 +310,7 @@ static TAPState *net_tap_fd_init(VLANState *vlan, s = DO_UPCAST(TAPState, nc, nc); s->fd = fd; - s->has_vnet_hdr = vnet_hdr != 0; + s->host_vnet_hdr_len = vnet_hdr ? sizeof(struct virtio_net_hdr) : 0; s->using_vnet_hdr = 0; s->has_ufo = tap_probe_has_ufo(s->fd); tap_set_offload(&s->nc, 0, 0, 0, 0, 0); From 445d892f43e6a6031cd1aac292df433f821aa830 Mon Sep 17 00:00:00 2001 From: "Michael S. Tsirkin" Date: Fri, 16 Jul 2010 11:16:06 +0300 Subject: [PATCH 3/8] tap: add APIs for vnet header length Add APIs to control host header length. First user will be vhost-net. Signed-off-by: Michael S. Tsirkin --- net/tap-aix.c | 9 +++++++++ net/tap-bsd.c | 9 +++++++++ net/tap-linux.c | 29 +++++++++++++++++++++++++++++ net/tap-linux.h | 2 ++ net/tap-solaris.c | 9 +++++++++ net/tap-win32.c | 9 +++++++++ net/tap.c | 21 +++++++++++++++++++++ net/tap.h | 4 ++++ 8 files changed, 92 insertions(+) diff --git a/net/tap-aix.c b/net/tap-aix.c index 4bc9f2f6d2..e19aaba110 100644 --- a/net/tap-aix.c +++ b/net/tap-aix.c @@ -46,6 +46,15 @@ int tap_probe_has_ufo(int fd) return 0; } +int tap_probe_vnet_hdr_len(int fd, int len) +{ + return 0; +} + +void tap_fd_set_vnet_hdr_len(int fd, int len) +{ +} + void tap_fd_set_offload(int fd, int csum, int tso4, int tso6, int ecn, int ufo) { diff --git a/net/tap-bsd.c b/net/tap-bsd.c index 3773d5da5e..3513075337 100644 --- a/net/tap-bsd.c +++ b/net/tap-bsd.c @@ -116,6 +116,15 @@ int tap_probe_has_ufo(int fd) return 0; } +int tap_probe_vnet_hdr_len(int fd, int len) +{ + return 0; +} + +void tap_fd_set_vnet_hdr_len(int fd, int len) +{ +} + void tap_fd_set_offload(int fd, int csum, int tso4, int tso6, int ecn, int ufo) { diff --git a/net/tap-linux.c b/net/tap-linux.c index c92983cb53..f7aa9041b8 100644 --- a/net/tap-linux.c +++ b/net/tap-linux.c @@ -129,6 +129,35 @@ int tap_probe_has_ufo(int fd) return 1; } +/* Verify that we can assign given length */ +int tap_probe_vnet_hdr_len(int fd, int len) +{ + int orig; + if (ioctl(fd, TUNGETVNETHDRSZ, &orig) == -1) { + return 0; + } + if (ioctl(fd, TUNSETVNETHDRSZ, &len) == -1) { + return 0; + } + /* Restore original length: we can't handle failure. */ + if (ioctl(fd, TUNSETVNETHDRSZ, &orig) == -1) { + fprintf(stderr, "TUNGETVNETHDRSZ ioctl() failed: %s. Exiting.\n", + strerror(errno)); + assert(0); + return -errno; + } + return 1; +} + +void tap_fd_set_vnet_hdr_len(int fd, int len) +{ + if (ioctl(fd, TUNSETVNETHDRSZ, &len) == -1) { + fprintf(stderr, "TUNSETVNETHDRSZ ioctl() failed: %s. Exiting.\n", + strerror(errno)); + assert(0); + } +} + void tap_fd_set_offload(int fd, int csum, int tso4, int tso6, int ecn, int ufo) { diff --git a/net/tap-linux.h b/net/tap-linux.h index 727fcf5010..659e98122b 100644 --- a/net/tap-linux.h +++ b/net/tap-linux.h @@ -27,6 +27,8 @@ #define TUNSETOFFLOAD _IOW('T', 208, unsigned int) #define TUNGETIFF _IOR('T', 210, unsigned int) #define TUNSETSNDBUF _IOW('T', 212, int) +#define TUNGETVNETHDRSZ _IOR('T', 215, int) +#define TUNSETVNETHDRSZ _IOW('T', 216, int) #endif diff --git a/net/tap-solaris.c b/net/tap-solaris.c index 4ac61d2ad6..c216d28267 100644 --- a/net/tap-solaris.c +++ b/net/tap-solaris.c @@ -212,6 +212,15 @@ int tap_probe_has_ufo(int fd) return 0; } +int tap_probe_vnet_hdr_len(int fd, int len) +{ + return 0; +} + +void tap_fd_set_vnet_hdr_len(int fd, int len) +{ +} + void tap_fd_set_offload(int fd, int csum, int tso4, int tso6, int ecn, int ufo) { diff --git a/net/tap-win32.c b/net/tap-win32.c index 74348daf5a..9fe4fcd5f4 100644 --- a/net/tap-win32.c +++ b/net/tap-win32.c @@ -728,6 +728,15 @@ int tap_has_vnet_hdr(VLANClientState *vc) return 0; } +int tap_probe_vnet_hdr_len(int fd, int len) +{ + return 0; +} + +void tap_fd_set_vnet_hdr_len(int fd, int len) +{ +} + void tap_using_vnet_hdr(VLANClientState *vc, int using_vnet_hdr) { } diff --git a/net/tap.c b/net/tap.c index 2866ff40e8..4afb314fde 100644 --- a/net/tap.c +++ b/net/tap.c @@ -232,6 +232,27 @@ int tap_has_vnet_hdr(VLANClientState *nc) return !!s->host_vnet_hdr_len; } +int tap_has_vnet_hdr_len(VLANClientState *nc, int len) +{ + TAPState *s = DO_UPCAST(TAPState, nc, nc); + + assert(nc->info->type == NET_CLIENT_TYPE_TAP); + + return tap_probe_vnet_hdr_len(s->fd, len); +} + +void tap_set_vnet_hdr_len(VLANClientState *nc, int len) +{ + TAPState *s = DO_UPCAST(TAPState, nc, nc); + + assert(nc->info->type == NET_CLIENT_TYPE_TAP); + assert(len == sizeof(struct virtio_net_hdr_mrg_rxbuf) || + len == sizeof(struct virtio_net_hdr)); + + tap_fd_set_vnet_hdr_len(s->fd, len); + s->host_vnet_hdr_len = len; +} + void tap_using_vnet_hdr(VLANClientState *nc, int using_vnet_hdr) { TAPState *s = DO_UPCAST(TAPState, nc, nc); diff --git a/net/tap.h b/net/tap.h index b8cec8320c..e44bd2bc5f 100644 --- a/net/tap.h +++ b/net/tap.h @@ -40,13 +40,17 @@ ssize_t tap_read_packet(int tapfd, uint8_t *buf, int maxlen); int tap_has_ufo(VLANClientState *vc); int tap_has_vnet_hdr(VLANClientState *vc); +int tap_has_vnet_hdr_len(VLANClientState *vc, int len); void tap_using_vnet_hdr(VLANClientState *vc, int using_vnet_hdr); void tap_set_offload(VLANClientState *vc, int csum, int tso4, int tso6, int ecn, int ufo); +void tap_set_vnet_hdr_len(VLANClientState *vc, int len); int tap_set_sndbuf(int fd, QemuOpts *opts); int tap_probe_vnet_hdr(int fd); +int tap_probe_vnet_hdr_len(int fd, int len); int tap_probe_has_ufo(int fd); void tap_fd_set_offload(int fd, int csum, int tso4, int tso6, int ecn, int ufo); +void tap_fd_set_vnet_hdr_len(int fd, int len); int tap_get_fd(VLANClientState *vc); From ca736c8e748e3bf77abe401231a412b9cdccb9d3 Mon Sep 17 00:00:00 2001 From: "Michael S. Tsirkin" Date: Fri, 16 Jul 2010 11:43:43 +0300 Subject: [PATCH 4/8] vhost_net: mergeable buffers support use the new tap APIs to set header length Signed-off-by: Michael S. Tsirkin --- hw/vhost_net.c | 24 +++++++++++++++++++++++- 1 file changed, 23 insertions(+), 1 deletion(-) diff --git a/hw/vhost_net.c b/hw/vhost_net.c index 0c00de272a..4a7b8194f2 100644 --- a/hw/vhost_net.c +++ b/hw/vhost_net.c @@ -50,7 +50,9 @@ unsigned vhost_net_get_features(struct vhost_net *net, unsigned features) if (!(net->dev.features & (1 << VIRTIO_RING_F_INDIRECT_DESC))) { features &= ~(1 << VIRTIO_RING_F_INDIRECT_DESC); } - features &= ~(1 << VIRTIO_NET_F_MRG_RXBUF); + if (!(net->dev.features & (1 << VIRTIO_NET_F_MRG_RXBUF))) { + features &= ~(1 << VIRTIO_NET_F_MRG_RXBUF); + } return features; } @@ -63,6 +65,9 @@ void vhost_net_ack_features(struct vhost_net *net, unsigned features) if (features & (1 << VIRTIO_RING_F_INDIRECT_DESC)) { net->dev.acked_features |= (1 << VIRTIO_RING_F_INDIRECT_DESC); } + if (features & (1 << VIRTIO_NET_F_MRG_RXBUF)) { + net->dev.acked_features |= (1 << VIRTIO_NET_F_MRG_RXBUF); + } } static int vhost_net_get_fd(VLANClientState *backend) @@ -97,6 +102,10 @@ struct vhost_net *vhost_net_init(VLANClientState *backend, int devfd) if (r < 0) { goto fail; } + if (!tap_has_vnet_hdr_len(backend, + sizeof(struct virtio_net_hdr_mrg_rxbuf))) { + net->dev.features &= ~(1 << VIRTIO_NET_F_MRG_RXBUF); + } if (~net->dev.features & net->dev.backend_features) { fprintf(stderr, "vhost lacks feature mask %" PRIu64 " for backend\n", (uint64_t)(~net->dev.features & net->dev.backend_features)); @@ -117,6 +126,10 @@ int vhost_net_start(struct vhost_net *net, { struct vhost_vring_file file = { }; int r; + if (net->dev.acked_features & (1 << VIRTIO_NET_F_MRG_RXBUF)) { + tap_set_vnet_hdr_len(net->vc, + sizeof(struct virtio_net_hdr_mrg_rxbuf)); + } net->dev.nvqs = 2; net->dev.vqs = net->vqs; @@ -144,6 +157,9 @@ fail: } net->vc->info->poll(net->vc, true); vhost_dev_stop(&net->dev, dev); + if (net->dev.acked_features & (1 << VIRTIO_NET_F_MRG_RXBUF)) { + tap_set_vnet_hdr_len(net->vc, sizeof(struct virtio_net_hdr)); + } return r; } @@ -158,11 +174,17 @@ void vhost_net_stop(struct vhost_net *net, } net->vc->info->poll(net->vc, true); vhost_dev_stop(&net->dev, dev); + if (net->dev.acked_features & (1 << VIRTIO_NET_F_MRG_RXBUF)) { + tap_set_vnet_hdr_len(net->vc, sizeof(struct virtio_net_hdr)); + } } void vhost_net_cleanup(struct vhost_net *net) { vhost_dev_cleanup(&net->dev); + if (net->dev.acked_features & (1 << VIRTIO_NET_F_MRG_RXBUF)) { + tap_set_vnet_hdr_len(net->vc, sizeof(struct virtio_net_hdr)); + } qemu_free(net); } #else From f0c07c7c7b4fe4f9b63c88341fd32707def5a058 Mon Sep 17 00:00:00 2001 From: Alex Williamson Date: Thu, 2 Sep 2010 09:00:50 -0600 Subject: [PATCH 5/8] virtio-net: Make tx_timer timeout configurable Add an option to make the TX mitigation timer adjustable as a device option. The 150us hard coded default used currently is reasonable, but may not be suitable for all workloads, this gives us a way to adjust it using a single binary. We can't support any random option though, so use the "x-" prefix to indicate this is a developer option. Usage: -device virtio-net-pci,x-txtimer=500000,... # .5ms timeout Signed-off-by: Alex Williamson Signed-off-by: Michael S. Tsirkin --- hw/s390-virtio-bus.c | 5 ++++- hw/s390-virtio-bus.h | 1 + hw/syborg_virtio.c | 5 ++++- hw/virtio-net.c | 9 ++++++--- hw/virtio-net.h | 5 +++++ hw/virtio-pci.c | 5 ++++- hw/virtio.h | 4 +++- 7 files changed, 27 insertions(+), 7 deletions(-) diff --git a/hw/s390-virtio-bus.c b/hw/s390-virtio-bus.c index fe6884d47d..d5cb24ec77 100644 --- a/hw/s390-virtio-bus.c +++ b/hw/s390-virtio-bus.c @@ -27,6 +27,7 @@ #include "elf.h" #include "hw/virtio.h" #include "hw/virtio-serial.h" +#include "hw/virtio-net.h" #include "hw/sysbus.h" #include "kvm.h" @@ -110,7 +111,7 @@ static int s390_virtio_net_init(VirtIOS390Device *dev) { VirtIODevice *vdev; - vdev = virtio_net_init((DeviceState *)dev, &dev->nic); + vdev = virtio_net_init((DeviceState *)dev, &dev->nic, &dev->net); if (!vdev) { return -1; } @@ -327,6 +328,8 @@ static VirtIOS390DeviceInfo s390_virtio_net = { .qdev.size = sizeof(VirtIOS390Device), .qdev.props = (Property[]) { DEFINE_NIC_PROPERTIES(VirtIOS390Device, nic), + DEFINE_PROP_UINT32("x-txtimer", VirtIOS390Device, + net.txtimer, TX_TIMER_INTERVAL), DEFINE_PROP_END_OF_LIST(), }, }; diff --git a/hw/s390-virtio-bus.h b/hw/s390-virtio-bus.h index 333fea8963..41558c9c67 100644 --- a/hw/s390-virtio-bus.h +++ b/hw/s390-virtio-bus.h @@ -43,6 +43,7 @@ typedef struct VirtIOS390Device { uint32_t host_features; /* Max. number of ports we can have for a the virtio-serial device */ uint32_t max_virtserial_ports; + virtio_net_conf net; } VirtIOS390Device; typedef struct VirtIOS390Bus { diff --git a/hw/syborg_virtio.c b/hw/syborg_virtio.c index abf0370107..5665189aad 100644 --- a/hw/syborg_virtio.c +++ b/hw/syborg_virtio.c @@ -68,6 +68,7 @@ typedef struct { uint32_t id; NICConf nic; uint32_t host_features; + virtio_net_conf net; } SyborgVirtIOProxy; static uint32_t syborg_virtio_readl(void *opaque, target_phys_addr_t offset) @@ -284,7 +285,7 @@ static int syborg_virtio_net_init(SysBusDevice *dev) VirtIODevice *vdev; SyborgVirtIOProxy *proxy = FROM_SYSBUS(SyborgVirtIOProxy, dev); - vdev = virtio_net_init(&dev->qdev, &proxy->nic); + vdev = virtio_net_init(&dev->qdev, &proxy->nic, &proxy->net); return syborg_virtio_init(proxy, vdev); } @@ -295,6 +296,8 @@ static SysBusDeviceInfo syborg_virtio_net_info = { .qdev.props = (Property[]) { DEFINE_NIC_PROPERTIES(SyborgVirtIOProxy, nic), DEFINE_VIRTIO_NET_FEATURES(SyborgVirtIOProxy, host_features), + DEFINE_PROP_UINT32("x-txtimer", SyborgVirtIOProxy, + net.txtimer, TX_TIMER_INTERVAL), DEFINE_PROP_END_OF_LIST(), } }; diff --git a/hw/virtio-net.c b/hw/virtio-net.c index 075f72df2d..d5b03ab368 100644 --- a/hw/virtio-net.c +++ b/hw/virtio-net.c @@ -36,6 +36,7 @@ typedef struct VirtIONet VirtQueue *ctrl_vq; NICState *nic; QEMUTimer *tx_timer; + uint32_t tx_timeout; int tx_timer_active; uint32_t has_vnet_hdr; uint8_t has_ufo; @@ -702,7 +703,7 @@ static void virtio_net_handle_tx(VirtIODevice *vdev, VirtQueue *vq) virtio_net_flush_tx(n, vq); } else { qemu_mod_timer(n->tx_timer, - qemu_get_clock(vm_clock) + TX_TIMER_INTERVAL); + qemu_get_clock(vm_clock) + n->tx_timeout); n->tx_timer_active = 1; virtio_queue_set_notification(vq, 0); } @@ -842,7 +843,7 @@ static int virtio_net_load(QEMUFile *f, void *opaque, int version_id) if (n->tx_timer_active) { qemu_mod_timer(n->tx_timer, - qemu_get_clock(vm_clock) + TX_TIMER_INTERVAL); + qemu_get_clock(vm_clock) + n->tx_timeout); } return 0; } @@ -903,7 +904,8 @@ static void virtio_net_vmstate_change(void *opaque, int running, int reason) virtio_net_set_status(&n->vdev, n->vdev.status & status); } -VirtIODevice *virtio_net_init(DeviceState *dev, NICConf *conf) +VirtIODevice *virtio_net_init(DeviceState *dev, NICConf *conf, + virtio_net_conf *net) { VirtIONet *n; @@ -931,6 +933,7 @@ VirtIODevice *virtio_net_init(DeviceState *dev, NICConf *conf) n->tx_timer = qemu_new_timer(vm_clock, virtio_net_tx_timer, n); n->tx_timer_active = 0; + n->tx_timeout = net->txtimer; n->mergeable_rx_bufs = 0; n->promisc = 1; /* for compatibility */ diff --git a/hw/virtio-net.h b/hw/virtio-net.h index 235f1a9fa8..46a2e1c57d 100644 --- a/hw/virtio-net.h +++ b/hw/virtio-net.h @@ -49,6 +49,11 @@ #define TX_TIMER_INTERVAL 150000 /* 150 us */ +typedef struct virtio_net_conf +{ + uint32_t txtimer; +} virtio_net_conf; + /* Maximum packet size we can receive from tap device: header + 64k */ #define VIRTIO_NET_MAX_BUFSIZE (sizeof(struct virtio_net_hdr) + (64 << 10)) diff --git a/hw/virtio-pci.c b/hw/virtio-pci.c index 6e8f88a141..1af48e295f 100644 --- a/hw/virtio-pci.c +++ b/hw/virtio-pci.c @@ -107,6 +107,7 @@ typedef struct { #endif /* Max. number of ports we can have for a the virtio-serial device */ uint32_t max_virtserial_ports; + virtio_net_conf net; } VirtIOPCIProxy; /* virtio device */ @@ -613,7 +614,7 @@ static int virtio_net_init_pci(PCIDevice *pci_dev) VirtIOPCIProxy *proxy = DO_UPCAST(VirtIOPCIProxy, pci_dev, pci_dev); VirtIODevice *vdev; - vdev = virtio_net_init(&pci_dev->qdev, &proxy->nic); + vdev = virtio_net_init(&pci_dev->qdev, &proxy->nic, &proxy->net); vdev->nvectors = proxy->nvectors; virtio_init_pci(proxy, vdev, @@ -690,6 +691,8 @@ static PCIDeviceInfo virtio_info[] = { DEFINE_PROP_UINT32("vectors", VirtIOPCIProxy, nvectors, 3), DEFINE_VIRTIO_NET_FEATURES(VirtIOPCIProxy, host_features), DEFINE_NIC_PROPERTIES(VirtIOPCIProxy, nic), + DEFINE_PROP_UINT32("x-txtimer", VirtIOPCIProxy, + net.txtimer, TX_TIMER_INTERVAL), DEFINE_PROP_END_OF_LIST(), }, .qdev.reset = virtio_pci_reset, diff --git a/hw/virtio.h b/hw/virtio.h index 5836ab61e7..a60d7355de 100644 --- a/hw/virtio.h +++ b/hw/virtio.h @@ -185,7 +185,9 @@ void virtio_bind_device(VirtIODevice *vdev, const VirtIOBindings *binding, /* Base devices. */ VirtIODevice *virtio_blk_init(DeviceState *dev, BlockConf *conf); -VirtIODevice *virtio_net_init(DeviceState *dev, NICConf *conf); +struct virtio_net_conf; +VirtIODevice *virtio_net_init(DeviceState *dev, NICConf *conf, + struct virtio_net_conf *net); VirtIODevice *virtio_serial_init(DeviceState *dev, uint32_t max_nr_ports); VirtIODevice *virtio_balloon_init(DeviceState *dev); #ifdef CONFIG_LINUX From e3f30488e5f802547b3a60e40cebaef3b4ec16a3 Mon Sep 17 00:00:00 2001 From: Alex Williamson Date: Thu, 2 Sep 2010 09:00:57 -0600 Subject: [PATCH 6/8] virtio-net: Limit number of packets sent per TX flush If virtio_net_flush_tx() is called with notification disabled, we can race with the guest, processing packets at the same rate as they get produced. The trouble is that this means we have no guaranteed exit condition from the function and can spend minutes in there. Currently flush_tx is only called with notification on, which seems to limit us to one pass through the queue per call. An upcoming patch changes this. Also add an option to set this value on the command line as different workloads may wish to use different values. We can't necessarily support any random value, so this is a developer option: x-txburst= Usage: -device virtio-net-pci,x-txburst=64 # 64 packets per tx flush One pass through the queue (256) seems to be a good default value for this, balancing latency with throughput. We use a signed int for x-txburst because 2^31 packets in a burst would take many, many minutes to process and it allows us to easily return a negative value value from virtio_net_flush_tx() to indicate a back-off or error condition. Signed-off-by: Alex Williamson Signed-off-by: Michael S. Tsirkin --- hw/s390-virtio-bus.c | 2 ++ hw/syborg_virtio.c | 2 ++ hw/virtio-net.c | 21 +++++++++++++++------ hw/virtio-net.h | 8 ++++++++ hw/virtio-pci.c | 2 ++ 5 files changed, 29 insertions(+), 6 deletions(-) diff --git a/hw/s390-virtio-bus.c b/hw/s390-virtio-bus.c index d5cb24ec77..092e65f0ca 100644 --- a/hw/s390-virtio-bus.c +++ b/hw/s390-virtio-bus.c @@ -330,6 +330,8 @@ static VirtIOS390DeviceInfo s390_virtio_net = { DEFINE_NIC_PROPERTIES(VirtIOS390Device, nic), DEFINE_PROP_UINT32("x-txtimer", VirtIOS390Device, net.txtimer, TX_TIMER_INTERVAL), + DEFINE_PROP_INT32("x-txburst", VirtIOS390Device, + net.txburst, TX_BURST), DEFINE_PROP_END_OF_LIST(), }, }; diff --git a/hw/syborg_virtio.c b/hw/syborg_virtio.c index 5665189aad..3c3f3b025b 100644 --- a/hw/syborg_virtio.c +++ b/hw/syborg_virtio.c @@ -298,6 +298,8 @@ static SysBusDeviceInfo syborg_virtio_net_info = { DEFINE_VIRTIO_NET_FEATURES(SyborgVirtIOProxy, host_features), DEFINE_PROP_UINT32("x-txtimer", SyborgVirtIOProxy, net.txtimer, TX_TIMER_INTERVAL), + DEFINE_PROP_INT32("x-txburst", SyborgVirtIOProxy, + net.txburst, TX_BURST), DEFINE_PROP_END_OF_LIST(), } }; diff --git a/hw/virtio-net.c b/hw/virtio-net.c index d5b03ab368..55f3d945f1 100644 --- a/hw/virtio-net.c +++ b/hw/virtio-net.c @@ -37,6 +37,7 @@ typedef struct VirtIONet NICState *nic; QEMUTimer *tx_timer; uint32_t tx_timeout; + int32_t tx_burst; int tx_timer_active; uint32_t has_vnet_hdr; uint8_t has_ufo; @@ -620,7 +621,7 @@ static ssize_t virtio_net_receive(VLANClientState *nc, const uint8_t *buf, size_ return size; } -static void virtio_net_flush_tx(VirtIONet *n, VirtQueue *vq); +static int32_t virtio_net_flush_tx(VirtIONet *n, VirtQueue *vq); static void virtio_net_tx_complete(VLANClientState *nc, ssize_t len) { @@ -636,16 +637,18 @@ static void virtio_net_tx_complete(VLANClientState *nc, ssize_t len) } /* TX */ -static void virtio_net_flush_tx(VirtIONet *n, VirtQueue *vq) +static int32_t virtio_net_flush_tx(VirtIONet *n, VirtQueue *vq) { VirtQueueElement elem; + int32_t num_packets = 0; - if (!(n->vdev.status & VIRTIO_CONFIG_S_DRIVER_OK)) - return; + if (!(n->vdev.status & VIRTIO_CONFIG_S_DRIVER_OK)) { + return num_packets; + } if (n->async_tx.elem.out_num) { virtio_queue_set_notification(n->tx_vq, 0); - return; + return num_packets; } while (virtqueue_pop(vq, &elem)) { @@ -682,14 +685,19 @@ static void virtio_net_flush_tx(VirtIONet *n, VirtQueue *vq) virtio_queue_set_notification(n->tx_vq, 0); n->async_tx.elem = elem; n->async_tx.len = len; - return; + return -EBUSY; } len += ret; virtqueue_push(vq, &elem, len); virtio_notify(&n->vdev, vq); + + if (++num_packets >= n->tx_burst) { + break; + } } + return num_packets; } static void virtio_net_handle_tx(VirtIODevice *vdev, VirtQueue *vq) @@ -934,6 +942,7 @@ VirtIODevice *virtio_net_init(DeviceState *dev, NICConf *conf, n->tx_timer = qemu_new_timer(vm_clock, virtio_net_tx_timer, n); n->tx_timer_active = 0; n->tx_timeout = net->txtimer; + n->tx_burst = net->txburst; n->mergeable_rx_bufs = 0; n->promisc = 1; /* for compatibility */ diff --git a/hw/virtio-net.h b/hw/virtio-net.h index 46a2e1c57d..a2d1545e4c 100644 --- a/hw/virtio-net.h +++ b/hw/virtio-net.h @@ -49,9 +49,17 @@ #define TX_TIMER_INTERVAL 150000 /* 150 us */ +/* Limit the number of packets that can be sent via a single flush + * of the TX queue. This gives us a guaranteed exit condition and + * ensures fairness in the io path. 256 conveniently matches the + * length of the TX queue and shows a good balance of performance + * and latency. */ +#define TX_BURST 256 + typedef struct virtio_net_conf { uint32_t txtimer; + int32_t txburst; } virtio_net_conf; /* Maximum packet size we can receive from tap device: header + 64k */ diff --git a/hw/virtio-pci.c b/hw/virtio-pci.c index 1af48e295f..3a5b3e6b23 100644 --- a/hw/virtio-pci.c +++ b/hw/virtio-pci.c @@ -693,6 +693,8 @@ static PCIDeviceInfo virtio_info[] = { DEFINE_NIC_PROPERTIES(VirtIOPCIProxy, nic), DEFINE_PROP_UINT32("x-txtimer", VirtIOPCIProxy, net.txtimer, TX_TIMER_INTERVAL), + DEFINE_PROP_INT32("x-txburst", VirtIOPCIProxy, + net.txburst, TX_BURST), DEFINE_PROP_END_OF_LIST(), }, .qdev.reset = virtio_pci_reset, From 4b4b8d361c21209bd6414c369fe93eb70c5ff50d Mon Sep 17 00:00:00 2001 From: Alex Williamson Date: Thu, 2 Sep 2010 09:01:04 -0600 Subject: [PATCH 7/8] virtio-net: Rename tx_timer_active to tx_waiting De-couple this from the timer since we might want to use different backends to send the packet. Signed-off-by: Alex Williamson Signed-off-by: Michael S. Tsirkin --- hw/virtio-net.c | 18 +++++++++--------- 1 file changed, 9 insertions(+), 9 deletions(-) diff --git a/hw/virtio-net.c b/hw/virtio-net.c index 55f3d945f1..1a4ba21787 100644 --- a/hw/virtio-net.c +++ b/hw/virtio-net.c @@ -38,7 +38,7 @@ typedef struct VirtIONet QEMUTimer *tx_timer; uint32_t tx_timeout; int32_t tx_burst; - int tx_timer_active; + int tx_waiting; uint32_t has_vnet_hdr; uint8_t has_ufo; struct { @@ -704,15 +704,15 @@ static void virtio_net_handle_tx(VirtIODevice *vdev, VirtQueue *vq) { VirtIONet *n = to_virtio_net(vdev); - if (n->tx_timer_active) { + if (n->tx_waiting) { virtio_queue_set_notification(vq, 1); qemu_del_timer(n->tx_timer); - n->tx_timer_active = 0; + n->tx_waiting = 0; virtio_net_flush_tx(n, vq); } else { qemu_mod_timer(n->tx_timer, qemu_get_clock(vm_clock) + n->tx_timeout); - n->tx_timer_active = 1; + n->tx_waiting = 1; virtio_queue_set_notification(vq, 0); } } @@ -721,7 +721,7 @@ static void virtio_net_tx_timer(void *opaque) { VirtIONet *n = opaque; - n->tx_timer_active = 0; + n->tx_waiting = 0; /* Just in case the driver is not ready on more */ if (!(n->vdev.status & VIRTIO_CONFIG_S_DRIVER_OK)) @@ -744,7 +744,7 @@ static void virtio_net_save(QEMUFile *f, void *opaque) virtio_save(&n->vdev, f); qemu_put_buffer(f, n->mac, ETH_ALEN); - qemu_put_be32(f, n->tx_timer_active); + qemu_put_be32(f, n->tx_waiting); qemu_put_be32(f, n->mergeable_rx_bufs); qemu_put_be16(f, n->status); qemu_put_byte(f, n->promisc); @@ -773,7 +773,7 @@ static int virtio_net_load(QEMUFile *f, void *opaque, int version_id) virtio_load(&n->vdev, f); qemu_get_buffer(f, n->mac, ETH_ALEN); - n->tx_timer_active = qemu_get_be32(f); + n->tx_waiting = qemu_get_be32(f); n->mergeable_rx_bufs = qemu_get_be32(f); if (version_id >= 3) @@ -849,7 +849,7 @@ static int virtio_net_load(QEMUFile *f, void *opaque, int version_id) } n->mac_table.first_multi = i; - if (n->tx_timer_active) { + if (n->tx_waiting) { qemu_mod_timer(n->tx_timer, qemu_get_clock(vm_clock) + n->tx_timeout); } @@ -940,7 +940,7 @@ VirtIODevice *virtio_net_init(DeviceState *dev, NICConf *conf, qemu_format_nic_info_str(&n->nic->nc, conf->macaddr.a); n->tx_timer = qemu_new_timer(vm_clock, virtio_net_tx_timer, n); - n->tx_timer_active = 0; + n->tx_waiting = 0; n->tx_timeout = net->txtimer; n->tx_burst = net->txburst; n->mergeable_rx_bufs = 0; From a697a334b3c4d3250e6420f5d38550ea10eb5319 Mon Sep 17 00:00:00 2001 From: Alex Williamson Date: Thu, 2 Sep 2010 09:01:10 -0600 Subject: [PATCH 8/8] virtio-net: Introduce a new bottom half packet TX Based on a patch from Mark McLoughlin, this patch introduces a new bottom half packet transmitter that avoids the latency imposed by the tx_timer approach. Rather than scheduling a timer when a TX packet comes in, schedule a bottom half to be run from the iothread. The bottom half handler first attempts to flush the queue with notification disabled (this is where we could race with a guest without txburst). If we flush a full burst, reschedule immediately. If we send short of a full burst, try to re-enable notification. To avoid a race with TXs that may have occurred, we must then flush again. If we find some packets to send, the guest it probably active, so we can reschedule again. tx_timer and tx_bh are mutually exclusive, so we can re-use the tx_waiting flag to indicate one or the other needs to be setup. This allows us to seamlessly migrate between timer and bh TX handling. The bottom half handler becomes the new default and we add a new tx= option to virtio-net-pci. Usage: -device virtio-net-pci,tx=timer # select timer mitigation vs "bh" Signed-off-by: Alex Williamson Signed-off-by: Michael S. Tsirkin --- hw/s390-virtio-bus.c | 1 + hw/syborg_virtio.c | 1 + hw/virtio-net.c | 85 +++++++++++++++++++++++++++++++++++++++----- hw/virtio-net.h | 1 + hw/virtio-pci.c | 1 + 5 files changed, 81 insertions(+), 8 deletions(-) diff --git a/hw/s390-virtio-bus.c b/hw/s390-virtio-bus.c index 092e65f0ca..784dc01b97 100644 --- a/hw/s390-virtio-bus.c +++ b/hw/s390-virtio-bus.c @@ -332,6 +332,7 @@ static VirtIOS390DeviceInfo s390_virtio_net = { net.txtimer, TX_TIMER_INTERVAL), DEFINE_PROP_INT32("x-txburst", VirtIOS390Device, net.txburst, TX_BURST), + DEFINE_PROP_STRING("tx", VirtIOS390Device, net.tx), DEFINE_PROP_END_OF_LIST(), }, }; diff --git a/hw/syborg_virtio.c b/hw/syborg_virtio.c index 3c3f3b025b..4dfd1a87b9 100644 --- a/hw/syborg_virtio.c +++ b/hw/syborg_virtio.c @@ -300,6 +300,7 @@ static SysBusDeviceInfo syborg_virtio_net_info = { net.txtimer, TX_TIMER_INTERVAL), DEFINE_PROP_INT32("x-txburst", SyborgVirtIOProxy, net.txburst, TX_BURST), + DEFINE_PROP_STRING("tx", SyborgVirtIOProxy, net.tx), DEFINE_PROP_END_OF_LIST(), } }; diff --git a/hw/virtio-net.c b/hw/virtio-net.c index 1a4ba21787..0a9cae22c4 100644 --- a/hw/virtio-net.c +++ b/hw/virtio-net.c @@ -36,6 +36,7 @@ typedef struct VirtIONet VirtQueue *ctrl_vq; NICState *nic; QEMUTimer *tx_timer; + QEMUBH *tx_bh; uint32_t tx_timeout; int32_t tx_burst; int tx_waiting; @@ -700,7 +701,7 @@ static int32_t virtio_net_flush_tx(VirtIONet *n, VirtQueue *vq) return num_packets; } -static void virtio_net_handle_tx(VirtIODevice *vdev, VirtQueue *vq) +static void virtio_net_handle_tx_timer(VirtIODevice *vdev, VirtQueue *vq) { VirtIONet *n = to_virtio_net(vdev); @@ -717,6 +718,18 @@ static void virtio_net_handle_tx(VirtIODevice *vdev, VirtQueue *vq) } } +static void virtio_net_handle_tx_bh(VirtIODevice *vdev, VirtQueue *vq) +{ + VirtIONet *n = to_virtio_net(vdev); + + if (unlikely(n->tx_waiting)) { + return; + } + virtio_queue_set_notification(vq, 0); + qemu_bh_schedule(n->tx_bh); + n->tx_waiting = 1; +} + static void virtio_net_tx_timer(void *opaque) { VirtIONet *n = opaque; @@ -731,6 +744,41 @@ static void virtio_net_tx_timer(void *opaque) virtio_net_flush_tx(n, n->tx_vq); } +static void virtio_net_tx_bh(void *opaque) +{ + VirtIONet *n = opaque; + int32_t ret; + + n->tx_waiting = 0; + + /* Just in case the driver is not ready on more */ + if (unlikely(!(n->vdev.status & VIRTIO_CONFIG_S_DRIVER_OK))) + return; + + ret = virtio_net_flush_tx(n, n->tx_vq); + if (ret == -EBUSY) { + return; /* Notification re-enable handled by tx_complete */ + } + + /* If we flush a full burst of packets, assume there are + * more coming and immediately reschedule */ + if (ret >= n->tx_burst) { + qemu_bh_schedule(n->tx_bh); + n->tx_waiting = 1; + return; + } + + /* If less than a full burst, re-enable notification and flush + * anything that may have come in while we weren't looking. If + * we find something, assume the guest is still active and reschedule */ + virtio_queue_set_notification(n->tx_vq, 1); + if (virtio_net_flush_tx(n, n->tx_vq) > 0) { + virtio_queue_set_notification(n->tx_vq, 0); + qemu_bh_schedule(n->tx_bh); + n->tx_waiting = 1; + } +} + static void virtio_net_save(QEMUFile *f, void *opaque) { VirtIONet *n = opaque; @@ -850,8 +898,12 @@ static int virtio_net_load(QEMUFile *f, void *opaque, int version_id) n->mac_table.first_multi = i; if (n->tx_waiting) { - qemu_mod_timer(n->tx_timer, - qemu_get_clock(vm_clock) + n->tx_timeout); + if (n->tx_timer) { + qemu_mod_timer(n->tx_timer, + qemu_get_clock(vm_clock) + n->tx_timeout); + } else { + qemu_bh_schedule(n->tx_bh); + } } return 0; } @@ -929,7 +981,22 @@ VirtIODevice *virtio_net_init(DeviceState *dev, NICConf *conf, n->vdev.reset = virtio_net_reset; n->vdev.set_status = virtio_net_set_status; n->rx_vq = virtio_add_queue(&n->vdev, 256, virtio_net_handle_rx); - n->tx_vq = virtio_add_queue(&n->vdev, 256, virtio_net_handle_tx); + + if (net->tx && strcmp(net->tx, "timer") && strcmp(net->tx, "bh")) { + fprintf(stderr, "virtio-net: " + "Unknown option tx=%s, valid options: \"timer\" \"bh\"\n", + net->tx); + fprintf(stderr, "Defaulting to \"bh\"\n"); + } + + if (net->tx && !strcmp(net->tx, "timer")) { + n->tx_vq = virtio_add_queue(&n->vdev, 256, virtio_net_handle_tx_timer); + n->tx_timer = qemu_new_timer(vm_clock, virtio_net_tx_timer, n); + n->tx_timeout = net->txtimer; + } else { + n->tx_vq = virtio_add_queue(&n->vdev, 256, virtio_net_handle_tx_bh); + n->tx_bh = qemu_bh_new(virtio_net_tx_bh, n); + } n->ctrl_vq = virtio_add_queue(&n->vdev, 64, virtio_net_handle_ctrl); qemu_macaddr_default_if_unset(&conf->macaddr); memcpy(&n->mac[0], &conf->macaddr, sizeof(n->mac)); @@ -939,9 +1006,7 @@ VirtIODevice *virtio_net_init(DeviceState *dev, NICConf *conf, qemu_format_nic_info_str(&n->nic->nc, conf->macaddr.a); - n->tx_timer = qemu_new_timer(vm_clock, virtio_net_tx_timer, n); n->tx_waiting = 0; - n->tx_timeout = net->txtimer; n->tx_burst = net->txburst; n->mergeable_rx_bufs = 0; n->promisc = 1; /* for compatibility */ @@ -974,8 +1039,12 @@ void virtio_net_exit(VirtIODevice *vdev) qemu_free(n->mac_table.macs); qemu_free(n->vlans); - qemu_del_timer(n->tx_timer); - qemu_free_timer(n->tx_timer); + if (n->tx_timer) { + qemu_del_timer(n->tx_timer); + qemu_free_timer(n->tx_timer); + } else { + qemu_bh_delete(n->tx_bh); + } virtio_cleanup(&n->vdev); qemu_del_vlan_client(&n->nic->nc); diff --git a/hw/virtio-net.h b/hw/virtio-net.h index a2d1545e4c..8af9a1ce55 100644 --- a/hw/virtio-net.h +++ b/hw/virtio-net.h @@ -60,6 +60,7 @@ typedef struct virtio_net_conf { uint32_t txtimer; int32_t txburst; + char *tx; } virtio_net_conf; /* Maximum packet size we can receive from tap device: header + 64k */ diff --git a/hw/virtio-pci.c b/hw/virtio-pci.c index 3a5b3e6b23..86e6b0a561 100644 --- a/hw/virtio-pci.c +++ b/hw/virtio-pci.c @@ -695,6 +695,7 @@ static PCIDeviceInfo virtio_info[] = { net.txtimer, TX_TIMER_INTERVAL), DEFINE_PROP_INT32("x-txburst", VirtIOPCIProxy, net.txburst, TX_BURST), + DEFINE_PROP_STRING("tx", VirtIOPCIProxy, net.tx), DEFINE_PROP_END_OF_LIST(), }, .qdev.reset = virtio_pci_reset,