2008-12-17 20:13:11 +01:00
|
|
|
/*
|
|
|
|
* Virtio Network Device
|
|
|
|
*
|
|
|
|
* Copyright IBM, Corp. 2007
|
|
|
|
*
|
|
|
|
* Authors:
|
|
|
|
* Anthony Liguori <aliguori@us.ibm.com>
|
|
|
|
*
|
|
|
|
* This work is licensed under the terms of the GNU GPL, version 2. See
|
|
|
|
* the COPYING file in the top-level directory.
|
|
|
|
*
|
|
|
|
*/
|
|
|
|
|
2016-06-29 13:47:03 +02:00
|
|
|
#ifndef QEMU_VIRTIO_NET_H
|
|
|
|
#define QEMU_VIRTIO_NET_H
|
2008-12-17 20:13:11 +01:00
|
|
|
|
2018-06-25 14:42:30 +02:00
|
|
|
#include "qemu/units.h"
|
2015-02-16 22:36:09 +01:00
|
|
|
#include "standard-headers/linux/virtio_net.h"
|
2013-02-05 17:06:20 +01:00
|
|
|
#include "hw/virtio/virtio.h"
|
2019-02-27 14:24:07 +01:00
|
|
|
#include "net/announce.h"
|
2019-10-29 12:49:04 +01:00
|
|
|
#include "qemu/option_int.h"
|
2020-09-03 22:43:22 +02:00
|
|
|
#include "qom/object.h"
|
2008-12-17 20:13:11 +01:00
|
|
|
|
2021-05-14 13:48:33 +02:00
|
|
|
#include "ebpf/ebpf_rss.h"
|
|
|
|
|
2013-04-11 16:29:57 +02:00
|
|
|
#define TYPE_VIRTIO_NET "virtio-net-device"
|
2020-09-16 20:25:19 +02:00
|
|
|
OBJECT_DECLARE_SIMPLE_TYPE(VirtIONet, VIRTIO_NET)
|
2013-04-11 16:29:57 +02:00
|
|
|
|
2008-12-17 20:13:11 +01:00
|
|
|
/* TX timer interval: 150000 ns, i.e. 150 us. */
#define TX_TIMER_INTERVAL 150000 /* 150 us */

/*
 * Limit the number of packets that can be sent via a single flush
 * of the TX queue.  This gives us a guaranteed exit condition and
 * ensures fairness in the io path.  256 conveniently matches the
 * length of the TX queue and shows a good balance of performance
 * and latency.
 */
#define TX_BURST 256

/* Maximum VIRTIO_NET_CTRL_MAC_TABLE_SET unicast + multicast entries. */
#define MAC_TABLE_ENTRIES 64

/*
 * The maximum number of VLANs in the VLAN filter table
 * added by VIRTIO_NET_CTRL_VLAN_ADD
 */
#define MAX_VLAN (1 << 12) /* Per 802.1Q definition */
|
|
|
|
|
2010-09-02 17:00:50 +02:00
|
|
|
/*
 * Configuration values of a virtio-net device; embedded in VirtIONet as
 * the 'net_conf' member.
 *
 * NOTE(review): the per-field notes below are inferred from the field
 * names only - confirm against the property definitions in virtio-net.c.
 */
typedef struct virtio_net_conf {
    uint32_t txtimer;        /* presumably the TX timer interval */
    int32_t txburst;         /* presumably the TX burst limit (see TX_BURST) */
    char *tx;                /* TX mode selector string - TODO confirm values */
    uint16_t rx_queue_size;
    uint16_t tx_queue_size;
    uint16_t mtu;
    int32_t speed;
    char *duplex_str;        /* presumably the textual form of 'duplex' */
    uint8_t duplex;
    char *primary_id_str;    /* presumably the id of the failover primary */
} virtio_net_conf;
|
|
|
|
|
virtio-net: support RSC v4/v6 tcp traffic for Windows HCK
This commit adds implementation of RX packets
coalescing, compatible with requirements of Windows
Hardware compatibility kit.
The device enables feature VIRTIO_NET_F_RSC_EXT in
host features if it supports extended RSC functionality
as defined in the specification.
This feature requires at least one of VIRTIO_NET_F_GUEST_TSO4,
VIRTIO_NET_F_GUEST_TSO6. Windows guest driver acks
this feature only if VIRTIO_NET_F_CTRL_GUEST_OFFLOADS
is also present.
If the guest driver acks VIRTIO_NET_F_RSC_EXT feature,
the device coalesces TCPv4 and TCPv6 packets (if
respective VIRTIO_NET_F_GUEST_TSO feature is on),
populates extended RSC information in virtio header
and sets VIRTIO_NET_HDR_F_RSC_INFO bit in header flags.
The device does not recalculate checksums in the coalesced
packet, so they are not valid.
In this case:
All the data packets in a tcp connection are cached
to a single buffer in every receive interval, and will
be sent out via a timer, the 'virtio_net_rsc_timeout'
controls the interval, this value may impact the
performance and response time of tcp connection,
50000(50us) is an experience value to gain a performance
improvement, since the whql test sends packets every 100us,
so '300000(300us)' passes the test case, it is the default
value as well, tune it via the command line parameter
'rsc_interval' within 'virtio-net-pci' device, for example,
to launch a guest with interval set as '500000':
'virtio-net-pci,netdev=hostnet1,bus=pci.0,id=net1,mac=00,
guest_rsc_ext=on,rsc_interval=500000'
The timer will only be triggered if the packets pool is not empty,
and it'll drain off all the cached packets.
'NetRscChain' is used to save the segments of IPv4/6 in a
VirtIONet device.
A new segment becomes a 'Candidate' as soon as it has passed the sanity check;
the main TCP handler then covers TCP window update, duplicated
ACK check and the real data coalescing.
A 'Candidate' segment means:
1. Segment is within current window and the sequence is the expected one.
2. 'ACK' of the segment is in the valid window.
Sanity check includes:
1. Incorrect version in IP header
2. An IP options or IP fragment
3. Not a TCP packet
4. Sanity size check to prevent buffer overflow attack.
5. An ECN packet
Even though, there might more cases should be considered such as
ip identification other flags, while it breaks the test because
windows set it to the same even it's not a fragment.
Normally it includes 2 typical ways to handle a TCP control flag,
'bypass' and 'finalize', 'bypass' means should be sent out directly,
while 'finalize' means the packets should also be bypassed, but this
should be done after search for the same connection packets in the
pool and drain all of them out, this is to avoid out of order fragment.
All the 'SYN' packets will be bypassed since they always begin a new
connection; other flags such as 'URG/FIN/RST/CWR/ECE' will trigger a
finalization, because they normally appear when a connection is going
to be closed; an 'URG' packet also finalizes the current coalescing unit.
Statistics can be used to monitor the basic coalescing status, the
'out of order' and 'out of window' means how many retransmitting packets,
thus describe the performance intuitively.
Difference between ip v4 and v6 processing:
Fragment length in ipv4 header includes itself, while it's not
included for ipv6, thus means ipv6 can carry a real 65535 payload.
Note that main goal of implementing this feature in software
is to create reference setup for certification tests. In such
setups guest migration is not required, so the coalesced packets
not yet delivered to the guest will be lost in case of migration.
Signed-off-by: Wei Xu <wexu@redhat.com>
Signed-off-by: Yuri Benditovich <yuri.benditovich@daynix.com>
Reviewed-by: Michael S. Tsirkin <mst@redhat.com>
Signed-off-by: Michael S. Tsirkin <mst@redhat.com>
2019-01-03 14:12:29 +01:00
|
|
|
/*
 * Coalesced packets type & status.
 *
 * The enumerators carry no explicit values, so they number 0..4 in
 * declaration order; keep the order stable.
 */
typedef enum {
    RSC_COALESCE,   /* Data has been coalesced */
    RSC_FINAL,      /* Will terminate current connection */
    RSC_NO_MATCH,   /* No match in the buffer pool */
    RSC_BYPASS,     /* Packet to be bypassed: not tcp, tcp ctrl, etc */
    RSC_CANDIDATE   /* Data is a candidate to be coalesced */
} CoalesceStatus;
|
|
|
|
|
|
|
|
/*
 * RSC (receive segment coalescing) statistics; one instance is embedded
 * in each VirtioNetRscChain as its 'stat' member.
 *
 * The counters can be used to monitor the basic coalescing status; the
 * "out of order" and "out of window" counters reflect how many packets
 * were retransmissions.
 *
 * NOTE(review): the exact event behind each counter is defined by the
 * code that increments it (not visible from this header) - confirm
 * there before relying on a name.
 */
typedef struct VirtioNetRscStat {
    uint32_t received;
    uint32_t coalesced;
    uint32_t over_size;
    uint32_t cache;
    uint32_t empty_cache;
    uint32_t no_match_cache;
    uint32_t win_update;
    uint32_t no_match;
    uint32_t tcp_syn;
    uint32_t tcp_ctrl_drain;
    uint32_t dup_ack;
    uint32_t dup_ack1;
    uint32_t dup_ack2;
    uint32_t pure_ack;
    uint32_t ack_out_of_win;
    uint32_t data_out_of_win;
    uint32_t data_out_of_order;
    uint32_t data_after_pure_ack;
    uint32_t bypass_not_tcp;
    uint32_t tcp_option;
    uint32_t tcp_all_opt;
    uint32_t ip_frag;
    uint32_t ip_ecn;
    uint32_t ip_hacked;
    uint32_t ip_option;
    uint32_t purge_failed;
    uint32_t drain_failed;
    uint32_t final_failed;
    int64_t timer;
} VirtioNetRscStat;
|
|
|
|
|
|
|
|
/*
 * General information about an RSC unit, used to check whether it can
 * be coalesced.
 */
typedef struct VirtioNetRscUnit {
    void *ip;                   /* ip header */
    uint16_t *ip_plen;          /* data len pointer in ip header field */
    struct tcp_header *tcp;     /* tcp header */
    uint16_t tcp_hdrlen;        /* tcp header len */
    uint16_t payload;           /* pure payload without virtio/eth/ip/tcp */
} VirtioNetRscUnit;
|
|
|
|
|
2019-01-21 19:13:35 +01:00
|
|
|
/* Coalesced segment */
|
virtio-net: support RSC v4/v6 tcp traffic for Windows HCK
This commit adds implementation of RX packets
coalescing, compatible with requirements of Windows
Hardware compatibility kit.
The device enables feature VIRTIO_NET_F_RSC_EXT in
host features if it supports extended RSC functionality
as defined in the specification.
This feature requires at least one of VIRTIO_NET_F_GUEST_TSO4,
VIRTIO_NET_F_GUEST_TSO6. Windows guest driver acks
this feature only if VIRTIO_NET_F_CTRL_GUEST_OFFLOADS
is also present.
If the guest driver acks VIRTIO_NET_F_RSC_EXT feature,
the device coalesces TCPv4 and TCPv6 packets (if
respective VIRTIO_NET_F_GUEST_TSO feature is on,
populates extended RSC information in virtio header
and sets VIRTIO_NET_HDR_F_RSC_INFO bit in header flags.
The device does not recalculate checksums in the coalesced
packet, so they are not valid.
In this case:
All the data packets in a tcp connection are cached
to a single buffer in every receive interval, and will
be sent out via a timer, the 'virtio_net_rsc_timeout'
controls the interval, this value may impact the
performance and response time of tcp connection,
50000(50us) is an experience value to gain a performance
improvement, since the whql test sends packets every 100us,
so '300000(300us)' passes the test case, it is the default
value as well, tune it via the command line parameter
'rsc_interval' within 'virtio-net-pci' device, for example,
to launch a guest with interval set as '500000':
'virtio-net-pci,netdev=hostnet1,bus=pci.0,id=net1,mac=00,
guest_rsc_ext=on,rsc_interval=500000'
The timer will only be triggered if the packets pool is not empty,
and it'll drain off all the cached packets.
'NetRscChain' is used to save the segments of IPv4/6 in a
VirtIONet device.
A new segment becomes a 'Candidate' as well as it passed sanity check,
the main handler of TCP includes TCP window update, duplicated
ACK check and the real data coalescing.
An 'Candidate' segment means:
1. Segment is within current window and the sequence is the expected one.
2. 'ACK' of the segment is in the valid window.
Sanity check includes:
1. Incorrect version in IP header
2. An IP options or IP fragment
3. Not a TCP packet
4. Sanity size check to prevent buffer overflow attack.
5. An ECN packet
Even though, there might more cases should be considered such as
ip identification other flags, while it breaks the test because
windows set it to the same even it's not a fragment.
Normally it includes 2 typical ways to handle a TCP control flag,
'bypass' and 'finalize', 'bypass' means should be sent out directly,
while 'finalize' means the packets should also be bypassed, but this
should be done after search for the same connection packets in the
pool and drain all of them out, this is to avoid out of order fragment.
All the 'SYN' packets will be bypassed since this always begin a new'
connection, other flags such 'URG/FIN/RST/CWR/ECE' will trigger a
finalization, because this normally happens upon a connection is going
to be closed, an 'URG' packet also finalize current coalescing unit.
Statistics can be used to monitor the basic coalescing status, the
'out of order' and 'out of window' means how many retransmitting packets,
thus describe the performance intuitively.
Difference between ip v4 and v6 processing:
Fragment length in ipv4 header includes itself, while it's not
included for ipv6, thus means ipv6 can carry a real 65535 payload.
Note that main goal of implementing this feature in software
is to create reference setup for certification tests. In such
setups guest migration is not required, so the coalesced packets
not yet delivered to the guest will be lost in case of migration.
Signed-off-by: Wei Xu <wexu@redhat.com>
Signed-off-by: Yuri Benditovich <yuri.benditovich@daynix.com>
Reviewed-by: Michael S. Tsirkin <mst@redhat.com>
Signed-off-by: Michael S. Tsirkin <mst@redhat.com>
2019-01-03 14:12:29 +01:00
|
|
|
typedef struct VirtioNetRscSeg {
|
|
|
|
QTAILQ_ENTRY(VirtioNetRscSeg) next;
|
|
|
|
void *buf;
|
|
|
|
size_t size;
|
|
|
|
uint16_t packets;
|
|
|
|
uint16_t dup_ack;
|
2023-07-14 13:33:02 +02:00
|
|
|
bool is_coalesced; /* need recall ipv4 header checksum, mark here */
|
virtio-net: support RSC v4/v6 tcp traffic for Windows HCK
This commit adds implementation of RX packets
coalescing, compatible with requirements of Windows
Hardware compatibility kit.
The device enables feature VIRTIO_NET_F_RSC_EXT in
host features if it supports extended RSC functionality
as defined in the specification.
This feature requires at least one of VIRTIO_NET_F_GUEST_TSO4,
VIRTIO_NET_F_GUEST_TSO6. Windows guest driver acks
this feature only if VIRTIO_NET_F_CTRL_GUEST_OFFLOADS
is also present.
If the guest driver acks VIRTIO_NET_F_RSC_EXT feature,
the device coalesces TCPv4 and TCPv6 packets (if
respective VIRTIO_NET_F_GUEST_TSO feature is on,
populates extended RSC information in virtio header
and sets VIRTIO_NET_HDR_F_RSC_INFO bit in header flags.
The device does not recalculate checksums in the coalesced
packet, so they are not valid.
In this case:
All the data packets in a tcp connection are cached
to a single buffer in every receive interval, and will
be sent out via a timer, the 'virtio_net_rsc_timeout'
controls the interval, this value may impact the
performance and response time of tcp connection,
50000(50us) is an experience value to gain a performance
improvement, since the whql test sends packets every 100us,
so '300000(300us)' passes the test case, it is the default
value as well, tune it via the command line parameter
'rsc_interval' within 'virtio-net-pci' device, for example,
to launch a guest with interval set as '500000':
'virtio-net-pci,netdev=hostnet1,bus=pci.0,id=net1,mac=00,
guest_rsc_ext=on,rsc_interval=500000'
The timer will only be triggered if the packets pool is not empty,
and it'll drain off all the cached packets.
'NetRscChain' is used to save the segments of IPv4/6 in a
VirtIONet device.
A new segment becomes a 'Candidate' as well as it passed sanity check,
the main handler of TCP includes TCP window update, duplicated
ACK check and the real data coalescing.
An 'Candidate' segment means:
1. Segment is within current window and the sequence is the expected one.
2. 'ACK' of the segment is in the valid window.
Sanity check includes:
1. Incorrect version in IP header
2. An IP options or IP fragment
3. Not a TCP packet
4. Sanity size check to prevent buffer overflow attack.
5. An ECN packet
Even though, there might more cases should be considered such as
ip identification other flags, while it breaks the test because
windows set it to the same even it's not a fragment.
Normally it includes 2 typical ways to handle a TCP control flag,
'bypass' and 'finalize', 'bypass' means should be sent out directly,
while 'finalize' means the packets should also be bypassed, but this
should be done after search for the same connection packets in the
pool and drain all of them out, this is to avoid out of order fragment.
All the 'SYN' packets will be bypassed since this always begin a new'
connection, other flags such 'URG/FIN/RST/CWR/ECE' will trigger a
finalization, because this normally happens upon a connection is going
to be closed, an 'URG' packet also finalize current coalescing unit.
Statistics can be used to monitor the basic coalescing status, the
'out of order' and 'out of window' means how many retransmitting packets,
thus describe the performance intuitively.
Difference between ip v4 and v6 processing:
Fragment length in ipv4 header includes itself, while it's not
included for ipv6, thus means ipv6 can carry a real 65535 payload.
Note that main goal of implementing this feature in software
is to create reference setup for certification tests. In such
setups guest migration is not required, so the coalesced packets
not yet delivered to the guest will be lost in case of migration.
Signed-off-by: Wei Xu <wexu@redhat.com>
Signed-off-by: Yuri Benditovich <yuri.benditovich@daynix.com>
Reviewed-by: Michael S. Tsirkin <mst@redhat.com>
Signed-off-by: Michael S. Tsirkin <mst@redhat.com>
2019-01-03 14:12:29 +01:00
|
|
|
VirtioNetRscUnit unit;
|
|
|
|
NetClientState *nc;
|
|
|
|
} VirtioNetRscSeg;
|
|
|
|
|
|
|
|
|
|
|
|
/* Chain is divided by protocol(ipv4/v6) and NetClientInfo */
|
|
|
|
typedef struct VirtioNetRscChain {
|
|
|
|
QTAILQ_ENTRY(VirtioNetRscChain) next;
|
|
|
|
VirtIONet *n; /* VirtIONet */
|
|
|
|
uint16_t proto;
|
|
|
|
uint8_t gso_type;
|
|
|
|
uint16_t max_payload;
|
|
|
|
QEMUTimer *drain_timer;
|
|
|
|
QTAILQ_HEAD(, VirtioNetRscSeg) buffers;
|
|
|
|
VirtioNetRscStat stat;
|
|
|
|
} VirtioNetRscChain;
|
|
|
|
|
2008-12-17 20:13:11 +01:00
|
|
|
/* Maximum packet size we can receive from tap device: header + 64k */
#define VIRTIO_NET_MAX_BUFSIZE (sizeof(struct virtio_net_hdr) + (64 * KiB))
|
2008-12-17 20:13:11 +01:00
|
|
|
|
2020-05-08 14:59:28 +02:00
|
|
|
/* Capacity of the 'key' array in VirtioNetRssData, in bytes. */
#define VIRTIO_NET_RSS_MAX_KEY_SIZE     40
/*
 * NOTE(review): presumably the upper bound on the number of entries in
 * the indirection table - confirm against its users in virtio-net.c.
 */
#define VIRTIO_NET_RSS_MAX_TABLE_LEN    128

/*
 * RSS state of a virtio-net device; embedded in VirtIONet as 'rss_data'.
 *
 * NOTE(review): the meaning of the individual flags is inferred from
 * their names only - confirm against virtio-net.c.
 */
typedef struct VirtioNetRssData {
    bool enabled;
    bool enabled_software_rss;
    bool redirect;
    bool populate_hash;
    uint32_t hash_types;
    uint8_t key[VIRTIO_NET_RSS_MAX_KEY_SIZE];
    uint16_t indirections_len;
    /* presumably 'indirections_len' entries - TODO confirm */
    uint16_t *indirections_table;
    uint16_t default_queue;
} VirtioNetRssData;
|
|
|
|
|
2013-03-18 17:37:18 +01:00
|
|
|
typedef struct VirtIONetQueue {
|
|
|
|
VirtQueue *rx_vq;
|
|
|
|
VirtQueue *tx_vq;
|
|
|
|
QEMUTimer *tx_timer;
|
|
|
|
QEMUBH *tx_bh;
|
2017-02-03 17:06:51 +01:00
|
|
|
uint32_t tx_waiting;
|
2013-03-18 17:37:18 +01:00
|
|
|
struct {
|
2016-02-04 15:26:51 +01:00
|
|
|
VirtQueueElement *elem;
|
2013-03-18 17:37:18 +01:00
|
|
|
} async_tx;
|
|
|
|
struct VirtIONet *n;
|
|
|
|
} VirtIONetQueue;
|
|
|
|
|
2019-01-21 19:10:30 +01:00
|
|
|
struct VirtIONet {
|
2013-04-11 16:30:01 +02:00
|
|
|
VirtIODevice parent_obj;
|
2013-03-18 17:37:18 +01:00
|
|
|
uint8_t mac[ETH_ALEN];
|
|
|
|
uint16_t status;
|
|
|
|
VirtIONetQueue *vqs;
|
|
|
|
VirtQueue *ctrl_vq;
|
|
|
|
NICState *nic;
|
virtio-net: support RSC v4/v6 tcp traffic for Windows HCK
This commit adds implementation of RX packets
coalescing, compatible with requirements of Windows
Hardware compatibility kit.
The device enables feature VIRTIO_NET_F_RSC_EXT in
host features if it supports extended RSC functionality
as defined in the specification.
This feature requires at least one of VIRTIO_NET_F_GUEST_TSO4,
VIRTIO_NET_F_GUEST_TSO6. Windows guest driver acks
this feature only if VIRTIO_NET_F_CTRL_GUEST_OFFLOADS
is also present.
If the guest driver acks VIRTIO_NET_F_RSC_EXT feature,
the device coalesces TCPv4 and TCPv6 packets (if
respective VIRTIO_NET_F_GUEST_TSO feature is on,
populates extended RSC information in virtio header
and sets VIRTIO_NET_HDR_F_RSC_INFO bit in header flags.
The device does not recalculate checksums in the coalesced
packet, so they are not valid.
In this case:
All the data packets in a tcp connection are cached
to a single buffer in every receive interval, and will
be sent out via a timer, the 'virtio_net_rsc_timeout'
controls the interval, this value may impact the
performance and response time of tcp connection,
50000(50us) is an experience value to gain a performance
improvement, since the whql test sends packets every 100us,
so '300000(300us)' passes the test case, it is the default
value as well, tune it via the command line parameter
'rsc_interval' within 'virtio-net-pci' device, for example,
to launch a guest with interval set as '500000':
'virtio-net-pci,netdev=hostnet1,bus=pci.0,id=net1,mac=00,
guest_rsc_ext=on,rsc_interval=500000'
The timer will only be triggered if the packets pool is not empty,
and it'll drain off all the cached packets.
'NetRscChain' is used to save the segments of IPv4/6 in a
VirtIONet device.
A new segment becomes a 'Candidate' as well as it passed sanity check,
the main handler of TCP includes TCP window update, duplicated
ACK check and the real data coalescing.
An 'Candidate' segment means:
1. Segment is within current window and the sequence is the expected one.
2. 'ACK' of the segment is in the valid window.
Sanity check includes:
1. Incorrect version in IP header
2. An IP options or IP fragment
3. Not a TCP packet
4. Sanity size check to prevent buffer overflow attack.
5. An ECN packet
Even though, there might more cases should be considered such as
ip identification other flags, while it breaks the test because
windows set it to the same even it's not a fragment.
Normally it includes 2 typical ways to handle a TCP control flag,
'bypass' and 'finalize', 'bypass' means should be sent out directly,
while 'finalize' means the packets should also be bypassed, but this
should be done after search for the same connection packets in the
pool and drain all of them out, this is to avoid out of order fragment.
All the 'SYN' packets will be bypassed since this always begin a new'
connection, other flags such 'URG/FIN/RST/CWR/ECE' will trigger a
finalization, because this normally happens upon a connection is going
to be closed, an 'URG' packet also finalize current coalescing unit.
Statistics can be used to monitor the basic coalescing status, the
'out of order' and 'out of window' means how many retransmitting packets,
thus describe the performance intuitively.
Difference between ip v4 and v6 processing:
Fragment length in ipv4 header includes itself, while it's not
included for ipv6, thus means ipv6 can carry a real 65535 payload.
Note that main goal of implementing this feature in software
is to create reference setup for certification tests. In such
setups guest migration is not required, so the coalesced packets
not yet delivered to the guest will be lost in case of migration.
Signed-off-by: Wei Xu <wexu@redhat.com>
Signed-off-by: Yuri Benditovich <yuri.benditovich@daynix.com>
Reviewed-by: Michael S. Tsirkin <mst@redhat.com>
Signed-off-by: Michael S. Tsirkin <mst@redhat.com>
2019-01-03 14:12:29 +01:00
|
|
|
/* RSC Chains - temporary storage of coalesced data,
|
|
|
|
all these data are lost in case of migration */
|
|
|
|
QTAILQ_HEAD(, VirtioNetRscChain) rsc_chains;
|
2013-03-18 17:37:18 +01:00
|
|
|
uint32_t tx_timeout;
|
|
|
|
int32_t tx_burst;
|
|
|
|
uint32_t has_vnet_hdr;
|
|
|
|
size_t host_hdr_len;
|
|
|
|
size_t guest_hdr_len;
|
2018-03-08 04:25:40 +01:00
|
|
|
uint64_t host_features;
|
virtio-net: support RSC v4/v6 tcp traffic for Windows HCK
This commit adds implementation of RX packets
coalescing, compatible with requirements of Windows
Hardware compatibility kit.
The device enables feature VIRTIO_NET_F_RSC_EXT in
host features if it supports extended RSC functionality
as defined in the specification.
This feature requires at least one of VIRTIO_NET_F_GUEST_TSO4,
VIRTIO_NET_F_GUEST_TSO6. Windows guest driver acks
this feature only if VIRTIO_NET_F_CTRL_GUEST_OFFLOADS
is also present.
If the guest driver acks VIRTIO_NET_F_RSC_EXT feature,
the device coalesces TCPv4 and TCPv6 packets (if
respective VIRTIO_NET_F_GUEST_TSO feature is on,
populates extended RSC information in virtio header
and sets VIRTIO_NET_HDR_F_RSC_INFO bit in header flags.
The device does not recalculate checksums in the coalesced
packet, so they are not valid.
In this case:
All the data packets in a tcp connection are cached
to a single buffer in every receive interval, and will
be sent out via a timer, the 'virtio_net_rsc_timeout'
controls the interval, this value may impact the
performance and response time of tcp connection,
50000(50us) is an experience value to gain a performance
improvement, since the whql test sends packets every 100us,
so '300000(300us)' passes the test case, it is the default
value as well, tune it via the command line parameter
'rsc_interval' within 'virtio-net-pci' device, for example,
to launch a guest with interval set as '500000':
'virtio-net-pci,netdev=hostnet1,bus=pci.0,id=net1,mac=00,
guest_rsc_ext=on,rsc_interval=500000'
The timer will only be triggered if the packets pool is not empty,
and it'll drain off all the cached packets.
'NetRscChain' is used to save the segments of IPv4/6 in a
VirtIONet device.
A new segment becomes a 'Candidate' as well as it passed sanity check,
the main handler of TCP includes TCP window update, duplicated
ACK check and the real data coalescing.
An 'Candidate' segment means:
1. Segment is within current window and the sequence is the expected one.
2. 'ACK' of the segment is in the valid window.
Sanity check includes:
1. Incorrect version in IP header
2. An IP options or IP fragment
3. Not a TCP packet
4. Sanity size check to prevent buffer overflow attack.
5. An ECN packet
Even though, there might more cases should be considered such as
ip identification other flags, while it breaks the test because
windows set it to the same even it's not a fragment.
Normally it includes 2 typical ways to handle a TCP control flag,
'bypass' and 'finalize', 'bypass' means should be sent out directly,
while 'finalize' means the packets should also be bypassed, but this
should be done after search for the same connection packets in the
pool and drain all of them out, this is to avoid out of order fragment.
All the 'SYN' packets will be bypassed since this always begin a new'
connection, other flags such 'URG/FIN/RST/CWR/ECE' will trigger a
finalization, because this normally happens upon a connection is going
to be closed, an 'URG' packet also finalize current coalescing unit.
Statistics can be used to monitor the basic coalescing status, the
'out of order' and 'out of window' means how many retransmitting packets,
thus describe the performance intuitively.
Difference between ip v4 and v6 processing:
Fragment length in ipv4 header includes itself, while it's not
included for ipv6, thus means ipv6 can carry a real 65535 payload.
Note that main goal of implementing this feature in software
is to create reference setup for certification tests. In such
setups guest migration is not required, so the coalesced packets
not yet delivered to the guest will be lost in case of migration.
Signed-off-by: Wei Xu <wexu@redhat.com>
Signed-off-by: Yuri Benditovich <yuri.benditovich@daynix.com>
Reviewed-by: Michael S. Tsirkin <mst@redhat.com>
Signed-off-by: Michael S. Tsirkin <mst@redhat.com>
2019-01-03 14:12:29 +01:00
|
|
|
uint32_t rsc_timeout;
|
|
|
|
uint8_t rsc4_enabled;
|
|
|
|
uint8_t rsc6_enabled;
|
2013-03-18 17:37:18 +01:00
|
|
|
uint8_t has_ufo;
|
2017-02-03 17:06:51 +01:00
|
|
|
uint32_t mergeable_rx_bufs;
|
2013-03-18 17:37:18 +01:00
|
|
|
uint8_t promisc;
|
|
|
|
uint8_t allmulti;
|
|
|
|
uint8_t alluni;
|
|
|
|
uint8_t nomulti;
|
|
|
|
uint8_t nouni;
|
|
|
|
uint8_t nobcast;
|
|
|
|
uint8_t vhost_started;
|
|
|
|
struct {
|
2014-04-03 18:50:39 +02:00
|
|
|
uint32_t in_use;
|
|
|
|
uint32_t first_multi;
|
2013-03-18 17:37:18 +01:00
|
|
|
uint8_t multi_overflow;
|
|
|
|
uint8_t uni_overflow;
|
|
|
|
uint8_t *macs;
|
|
|
|
} mac_table;
|
|
|
|
uint32_t *vlans;
|
2013-04-11 16:29:57 +02:00
|
|
|
virtio_net_conf net_conf;
|
|
|
|
NICConf nic_conf;
|
2013-03-18 17:37:18 +01:00
|
|
|
DeviceState *qdev;
|
|
|
|
int multiqueue;
|
2021-10-20 06:55:57 +02:00
|
|
|
uint16_t max_queue_pairs;
|
|
|
|
uint16_t curr_queue_pairs;
|
2021-10-20 06:55:59 +02:00
|
|
|
uint16_t max_ncs;
|
2013-03-18 17:37:18 +01:00
|
|
|
size_t config_size;
|
2013-05-15 14:12:49 +02:00
|
|
|
char *netclient_name;
|
|
|
|
char *netclient_type;
|
2013-05-20 10:18:14 +02:00
|
|
|
uint64_t curr_guest_offloads;
|
virtio-net: prevent offloads reset on migration
Currently offloads disabled by guest via the VIRTIO_NET_CTRL_GUEST_OFFLOADS_SET
command are not preserved on VM migration.
Instead all offloads reported by guest features (via VIRTIO_PCI_GUEST_FEATURES)
get enabled.
What happens is: first the VirtIONet::curr_guest_offloads gets restored and offloads
are getting set correctly:
#0 qemu_set_offload (nc=0x555556a11400, csum=1, tso4=0, tso6=0, ecn=0, ufo=0) at net/net.c:474
#1 virtio_net_apply_guest_offloads (n=0x555557701ca0) at hw/net/virtio-net.c:720
#2 virtio_net_post_load_device (opaque=0x555557701ca0, version_id=11) at hw/net/virtio-net.c:2334
#3 vmstate_load_state (f=0x5555569dc010, vmsd=0x555556577c80 <vmstate_virtio_net_device>, opaque=0x555557701ca0, version_id=11)
at migration/vmstate.c:168
#4 virtio_load (vdev=0x555557701ca0, f=0x5555569dc010, version_id=11) at hw/virtio/virtio.c:2197
#5 virtio_device_get (f=0x5555569dc010, opaque=0x555557701ca0, size=0, field=0x55555668cd00 <__compound_literal.5>) at hw/virtio/virtio.c:2036
#6 vmstate_load_state (f=0x5555569dc010, vmsd=0x555556577ce0 <vmstate_virtio_net>, opaque=0x555557701ca0, version_id=11) at migration/vmstate.c:143
#7 vmstate_load (f=0x5555569dc010, se=0x5555578189e0) at migration/savevm.c:829
#8 qemu_loadvm_section_start_full (f=0x5555569dc010, mis=0x5555569eee20) at migration/savevm.c:2211
#9 qemu_loadvm_state_main (f=0x5555569dc010, mis=0x5555569eee20) at migration/savevm.c:2395
#10 qemu_loadvm_state (f=0x5555569dc010) at migration/savevm.c:2467
#11 process_incoming_migration_co (opaque=0x0) at migration/migration.c:449
However later on the features are getting restored, and offloads get reset to
everything supported by features:
#0 qemu_set_offload (nc=0x555556a11400, csum=1, tso4=1, tso6=1, ecn=0, ufo=0) at net/net.c:474
#1 virtio_net_apply_guest_offloads (n=0x555557701ca0) at hw/net/virtio-net.c:720
#2 virtio_net_set_features (vdev=0x555557701ca0, features=5104441767) at hw/net/virtio-net.c:773
#3 virtio_set_features_nocheck (vdev=0x555557701ca0, val=5104441767) at hw/virtio/virtio.c:2052
#4 virtio_load (vdev=0x555557701ca0, f=0x5555569dc010, version_id=11) at hw/virtio/virtio.c:2220
#5 virtio_device_get (f=0x5555569dc010, opaque=0x555557701ca0, size=0, field=0x55555668cd00 <__compound_literal.5>) at hw/virtio/virtio.c:2036
#6 vmstate_load_state (f=0x5555569dc010, vmsd=0x555556577ce0 <vmstate_virtio_net>, opaque=0x555557701ca0, version_id=11) at migration/vmstate.c:143
#7 vmstate_load (f=0x5555569dc010, se=0x5555578189e0) at migration/savevm.c:829
#8 qemu_loadvm_section_start_full (f=0x5555569dc010, mis=0x5555569eee20) at migration/savevm.c:2211
#9 qemu_loadvm_state_main (f=0x5555569dc010, mis=0x5555569eee20) at migration/savevm.c:2395
#10 qemu_loadvm_state (f=0x5555569dc010) at migration/savevm.c:2467
#11 process_incoming_migration_co (opaque=0x0) at migration/migration.c:449
Fix this by preserving the state in saved_guest_offloads field and
pushing out offload initialization to the new post load hook.
Cc: qemu-stable@nongnu.org
Signed-off-by: Mikhail Sennikovsky <mikhail.sennikovskii@cloud.ionos.com>
Signed-off-by: Jason Wang <jasowang@redhat.com>
2019-10-11 15:58:04 +02:00
|
|
|
/* used on saved state restore phase to preserve the curr_guest_offloads */
|
|
|
|
uint64_t saved_guest_offloads;
|
2019-02-27 14:24:07 +01:00
|
|
|
AnnounceTimer announce_timer;
|
2016-02-05 11:43:11 +01:00
|
|
|
bool needs_vnet_hdr_swap;
|
2017-05-23 14:31:19 +02:00
|
|
|
bool mtu_bypass_backend;
|
2020-11-18 09:37:29 +01:00
|
|
|
/* primary failover device is hidden*/
|
|
|
|
bool failover_primary_hidden;
|
2019-10-29 12:49:04 +01:00
|
|
|
bool failover;
|
|
|
|
DeviceListener primary_listener;
|
2021-10-08 15:34:41 +02:00
|
|
|
QDict *primary_opts;
|
|
|
|
bool primary_opts_from_json;
|
2024-02-22 18:28:29 +01:00
|
|
|
NotifierWithReturn migration_state;
|
2020-05-08 14:59:28 +02:00
|
|
|
VirtioNetRssData rss_data;
|
2020-05-08 14:59:29 +02:00
|
|
|
struct NetRxPkt *rx_pkt;
|
2021-05-14 13:48:33 +02:00
|
|
|
struct EBPFRSSContext ebpf_rss;
|
2024-02-05 17:54:33 +01:00
|
|
|
uint32_t nr_ebpf_rss_fds;
|
|
|
|
char **ebpf_rss_fds;
|
2019-01-21 19:10:30 +01:00
|
|
|
};
|
2013-03-18 17:37:18 +01:00
|
|
|
|
2022-07-20 08:59:28 +02:00
|
|
|
size_t virtio_net_handle_ctrl_iov(VirtIODevice *vdev,
|
|
|
|
const struct iovec *in_sg, unsigned in_num,
|
|
|
|
const struct iovec *out_sg,
|
|
|
|
unsigned out_num);
|
2013-05-15 14:12:49 +02:00
|
|
|
void virtio_net_set_netclient_name(VirtIONet *n, const char *name,
|
|
|
|
const char *type);
|
2023-06-02 13:52:16 +02:00
|
|
|
uint64_t virtio_net_supported_guest_offloads(const VirtIONet *n);
|
2013-04-11 16:29:57 +02:00
|
|
|
|
2008-12-17 20:13:11 +01:00
|
|
|
#endif
|