qemu-e2k/hw/net/e1000.c

1748 lines
57 KiB
C
Raw Normal View History

/*
* QEMU e1000 emulation
*
* Software developer's manual:
* http://download.intel.com/design/network/manuals/8254x_GBe_SDM.pdf
*
* Nir Peleg, Tutis Systems Ltd. for Qumranet Inc.
* Copyright (c) 2008 Qumranet
* Based on work done by:
* Copyright (c) 2007 Dan Aloni
* Copyright (c) 2004 Antony T Curtis
*
* This library is free software; you can redistribute it and/or
* modify it under the terms of the GNU Lesser General Public
* License as published by the Free Software Foundation; either
* version 2 of the License, or (at your option) any later version.
*
* This library is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
* Lesser General Public License for more details.
*
* You should have received a copy of the GNU Lesser General Public
* License along with this library; if not, see <http://www.gnu.org/licenses/>.
*/
#include "qemu/osdep.h"
#include "hw/hw.h"
#include "hw/pci/pci.h"
#include "net/net.h"
#include "net/checksum.h"
#include "sysemu/sysemu.h"
#include "sysemu/dma.h"
#include "qemu/iov.h"
#include "qemu/range.h"
#include "e1000x_common.h"
static const uint8_t bcast[] = {0xff, 0xff, 0xff, 0xff, 0xff, 0xff};
/* #define E1000_DEBUG */
#ifdef E1000_DEBUG
enum {
DEBUG_GENERAL, DEBUG_IO, DEBUG_MMIO, DEBUG_INTERRUPT,
DEBUG_RX, DEBUG_TX, DEBUG_MDIC, DEBUG_EEPROM,
DEBUG_UNKNOWN, DEBUG_TXSUM, DEBUG_TXERR, DEBUG_RXERR,
DEBUG_RXFILTER, DEBUG_PHY, DEBUG_NOTYET,
};
#define DBGBIT(x) (1<<DEBUG_##x)
static int debugflags = DBGBIT(TXERR) | DBGBIT(GENERAL);
#define DBGOUT(what, fmt, ...) do { \
if (debugflags & DBGBIT(what)) \
fprintf(stderr, "e1000: " fmt, ## __VA_ARGS__); \
} while (0)
#else
#define DBGOUT(what, fmt, ...) do {} while (0)
#endif
#define IOPORT_SIZE 0x40
#define PNPMMIO_SIZE 0x20000
#define MIN_BUF_SIZE 60 /* Min. octets in an ethernet frame sans FCS */
#define MAXIMUM_ETHERNET_HDR_LEN (14+4)
/*
* HW models:
* E1000_DEV_ID_82540EM works with Windows, Linux, and OS X <= 10.8
* E1000_DEV_ID_82544GC_COPPER appears to work; not well tested
* E1000_DEV_ID_82545EM_COPPER works with Linux and OS X >= 10.6
* Others never tested
*/
typedef struct E1000State_st {
/*< private >*/
PCIDevice parent_obj;
/*< public >*/
NICState *nic;
NICConf conf;
MemoryRegion mmio;
MemoryRegion io;
uint32_t mac_reg[0x8000];
uint16_t phy_reg[0x20];
uint16_t eeprom_data[64];
uint32_t rxbuf_size;
uint32_t rxbuf_min_shift;
struct e1000_tx {
unsigned char header[256];
unsigned char vlan_header[4];
/* Fields vlan and data must not be reordered or separated. */
unsigned char vlan[4];
unsigned char data[0x10000];
uint16_t size;
unsigned char vlan_needed;
unsigned char sum_needed;
bool cptse;
e1000x_txd_props props;
e1000: Separate TSO and non-TSO contexts, fixing UDP TX corruption The device is supposed to maintain two distinct contexts for transmit offloads: one has parameters for both segmentation and checksum offload, the other only for checksum offload. The guest driver can send two context descriptors, one for each context (the TSE flag specifies which). Then the guest can refer to one or the other context in subsequent transmit data descriptors, depending on what offloads it wants applied to each packet. Currently the e1000 device stores just one context, and misinterprets the TSE flags in the context and data descriptors. This is often okay: Linux happens to send a fresh context descriptor before every data descriptor, so forgetting the other context doesn't matter. Windows does rely on separate contexts for TSO vs. non-TSO packets, but for mostly-TCP traffic the two contexts have identical TCP-specific offload parameters so confusing them doesn't matter. One case where this confusion matters is when a Windows guest sets up a TSO context for TCP and a non-TSO context for UDP, and then transmits both TCP and UDP traffic in parallel. The e1000 device sometimes ends up using TCP-specific parameters while doing checksum offload on a UDP datagram: it writes the checksum to offset 16 (the correct location for a TCP checksum), stomping on two bytes of UDP data, and leaving the wrong value in the actual UDP checksum field at offset 6. (Even worse, the host network stack may then recompute the UDP checksum, "correcting" it to match the corrupt data before sending it out a physical interface.) Correct this by tracking the TSO context independently of the non-TSO context, and selecting the appropriate context based on the TSE flag in each transmit data descriptor. Signed-off-by: Ed Swierk <eswierk@skyportsystems.com> Signed-off-by: Jason Wang <jasowang@redhat.com>
2017-11-15 00:23:34 +01:00
e1000x_txd_props tso_props;
uint16_t tso_frames;
} tx;
struct {
uint32_t val_in; /* shifted in from guest driver */
uint16_t bitnum_in;
uint16_t bitnum_out;
uint16_t reading;
uint32_t old_eecd;
} eecd_state;
QEMUTimer *autoneg_timer;
QEMUTimer *mit_timer; /* Mitigation timer. */
bool mit_timer_on; /* Mitigation timer is running. */
bool mit_irq_level; /* Tracks interrupt pin level. */
uint32_t mit_ide; /* Tracks E1000_TXD_CMD_IDE bit. */
/* Compatibility flags for migration to/from qemu 1.3.0 and older */
#define E1000_FLAG_AUTONEG_BIT 0
#define E1000_FLAG_MIT_BIT 1
#define E1000_FLAG_MAC_BIT 2
#define E1000_FLAG_AUTONEG (1 << E1000_FLAG_AUTONEG_BIT)
#define E1000_FLAG_MIT (1 << E1000_FLAG_MIT_BIT)
#define E1000_FLAG_MAC (1 << E1000_FLAG_MAC_BIT)
uint32_t compat_flags;
} E1000State;
#define chkflag(x) (s->compat_flags & E1000_FLAG_##x)
typedef struct E1000BaseClass {
PCIDeviceClass parent_class;
uint16_t phy_id2;
} E1000BaseClass;
#define TYPE_E1000_BASE "e1000-base"
#define E1000(obj) \
OBJECT_CHECK(E1000State, (obj), TYPE_E1000_BASE)
#define E1000_DEVICE_CLASS(klass) \
OBJECT_CLASS_CHECK(E1000BaseClass, (klass), TYPE_E1000_BASE)
#define E1000_DEVICE_GET_CLASS(obj) \
OBJECT_GET_CLASS(E1000BaseClass, (obj), TYPE_E1000_BASE)
static void
e1000_link_up(E1000State *s)
{
e1000x_update_regs_on_link_up(s->mac_reg, s->phy_reg);
/* E1000_STATUS_LU is tested by e1000_can_receive() */
qemu_flush_queued_packets(qemu_get_queue(s->nic));
}
static void
e1000_autoneg_done(E1000State *s)
{
e1000x_update_regs_on_autoneg_done(s->mac_reg, s->phy_reg);
/* E1000_STATUS_LU is tested by e1000_can_receive() */
qemu_flush_queued_packets(qemu_get_queue(s->nic));
}
static bool
have_autoneg(E1000State *s)
{
return chkflag(AUTONEG) && (s->phy_reg[PHY_CTRL] & MII_CR_AUTO_NEG_EN);
}
static void
set_phy_ctrl(E1000State *s, int index, uint16_t val)
{
/* bits 0-5 reserved; MII_CR_[RESTART_AUTO_NEG,RESET] are self clearing */
s->phy_reg[PHY_CTRL] = val & ~(0x3f |
MII_CR_RESET |
MII_CR_RESTART_AUTO_NEG);
/*
* QEMU 1.3 does not support link auto-negotiation emulation, so if we
* migrate during auto negotiation, after migration the link will be
* down.
*/
if (have_autoneg(s) && (val & MII_CR_RESTART_AUTO_NEG)) {
e1000x_restart_autoneg(s->mac_reg, s->phy_reg, s->autoneg_timer);
}
}
static void (*phyreg_writeops[])(E1000State *, int, uint16_t) = {
[PHY_CTRL] = set_phy_ctrl,
};
enum { NPHYWRITEOPS = ARRAY_SIZE(phyreg_writeops) };
enum { PHY_R = 1, PHY_W = 2, PHY_RW = PHY_R | PHY_W };
static const char phy_regcap[0x20] = {
[PHY_STATUS] = PHY_R, [M88E1000_EXT_PHY_SPEC_CTRL] = PHY_RW,
[PHY_ID1] = PHY_R, [M88E1000_PHY_SPEC_CTRL] = PHY_RW,
[PHY_CTRL] = PHY_RW, [PHY_1000T_CTRL] = PHY_RW,
[PHY_LP_ABILITY] = PHY_R, [PHY_1000T_STATUS] = PHY_R,
[PHY_AUTONEG_ADV] = PHY_RW, [M88E1000_RX_ERR_CNTR] = PHY_R,
[PHY_ID2] = PHY_R, [M88E1000_PHY_SPEC_STATUS] = PHY_R,
[PHY_AUTONEG_EXP] = PHY_R,
};
/* PHY_ID2 documented in 8254x_GBe_SDM.pdf, pp. 250 */
static const uint16_t phy_reg_init[] = {
[PHY_CTRL] = MII_CR_SPEED_SELECT_MSB |
MII_CR_FULL_DUPLEX |
MII_CR_AUTO_NEG_EN,
[PHY_STATUS] = MII_SR_EXTENDED_CAPS |
MII_SR_LINK_STATUS | /* link initially up */
MII_SR_AUTONEG_CAPS |
/* MII_SR_AUTONEG_COMPLETE: initially NOT completed */
MII_SR_PREAMBLE_SUPPRESS |
MII_SR_EXTENDED_STATUS |
MII_SR_10T_HD_CAPS |
MII_SR_10T_FD_CAPS |
MII_SR_100X_HD_CAPS |
MII_SR_100X_FD_CAPS,
[PHY_ID1] = 0x141,
/* [PHY_ID2] configured per DevId, from e1000_reset() */
[PHY_AUTONEG_ADV] = 0xde1,
[PHY_LP_ABILITY] = 0x1e0,
[PHY_1000T_CTRL] = 0x0e00,
[PHY_1000T_STATUS] = 0x3c00,
[M88E1000_PHY_SPEC_CTRL] = 0x360,
[M88E1000_PHY_SPEC_STATUS] = 0xac00,
[M88E1000_EXT_PHY_SPEC_CTRL] = 0x0d60,
};
static const uint32_t mac_reg_init[] = {
[PBA] = 0x00100030,
[LEDCTL] = 0x602,
[CTRL] = E1000_CTRL_SWDPIN2 | E1000_CTRL_SWDPIN0 |
E1000_CTRL_SPD_1000 | E1000_CTRL_SLU,
[STATUS] = 0x80000000 | E1000_STATUS_GIO_MASTER_ENABLE |
E1000_STATUS_ASDV | E1000_STATUS_MTXCKOK |
E1000_STATUS_SPEED_1000 | E1000_STATUS_FD |
E1000_STATUS_LU,
[MANC] = E1000_MANC_EN_MNG2HOST | E1000_MANC_RCV_TCO_EN |
E1000_MANC_ARP_EN | E1000_MANC_0298_EN |
E1000_MANC_RMCP_EN,
};
/* Helper function, *curr == 0 means the value is not set */
static inline void
mit_update_delay(uint32_t *curr, uint32_t value)
{
if (value && (*curr == 0 || value < *curr)) {
*curr = value;
}
}
static void
set_interrupt_cause(E1000State *s, int index, uint32_t val)
{
PCIDevice *d = PCI_DEVICE(s);
uint32_t pending_ints;
uint32_t mit_delay;
s->mac_reg[ICR] = val;
/*
* Make sure ICR and ICS registers have the same value.
* The spec says that the ICS register is write-only. However in practice,
* on real hardware ICS is readable, and for reads it has the same value as
* ICR (except that ICS does not have the clear on read behaviour of ICR).
*
* The VxWorks PRO/1000 driver uses this behaviour.
*/
s->mac_reg[ICS] = val;
pending_ints = (s->mac_reg[IMS] & s->mac_reg[ICR]);
if (!s->mit_irq_level && pending_ints) {
/*
* Here we detect a potential raising edge. We postpone raising the
* interrupt line if we are inside the mitigation delay window
* (s->mit_timer_on == 1).
* We provide a partial implementation of interrupt mitigation,
* emulating only RADV, TADV and ITR (lower 16 bits, 1024ns units for
* RADV and TADV, 256ns units for ITR). RDTR is only used to enable
* RADV; relative timers based on TIDV and RDTR are not implemented.
*/
if (s->mit_timer_on) {
return;
}
if (chkflag(MIT)) {
/* Compute the next mitigation delay according to pending
* interrupts and the current values of RADV (provided
* RDTR!=0), TADV and ITR.
* Then rearm the timer.
*/
mit_delay = 0;
if (s->mit_ide &&
(pending_ints & (E1000_ICR_TXQE | E1000_ICR_TXDW))) {
mit_update_delay(&mit_delay, s->mac_reg[TADV] * 4);
}
if (s->mac_reg[RDTR] && (pending_ints & E1000_ICS_RXT0)) {
mit_update_delay(&mit_delay, s->mac_reg[RADV] * 4);
}
mit_update_delay(&mit_delay, s->mac_reg[ITR]);
/*
* According to e1000 SPEC, the Ethernet controller guarantees
* a maximum observable interrupt rate of 7813 interrupts/sec.
* Thus if mit_delay < 500 then the delay should be set to the
* minimum delay possible which is 500.
*/
mit_delay = (mit_delay < 500) ? 500 : mit_delay;
s->mit_timer_on = 1;
timer_mod(s->mit_timer, qemu_clock_get_ns(QEMU_CLOCK_VIRTUAL) +
mit_delay * 256);
s->mit_ide = 0;
}
}
s->mit_irq_level = (pending_ints != 0);
pci_set_irq(d, s->mit_irq_level);
}
static void
e1000_mit_timer(void *opaque)
{
E1000State *s = opaque;
s->mit_timer_on = 0;
/* Call set_interrupt_cause to update the irq level (if necessary). */
set_interrupt_cause(s, 0, s->mac_reg[ICR]);
}
static void
set_ics(E1000State *s, int index, uint32_t val)
{
DBGOUT(INTERRUPT, "set_ics %x, ICR %x, IMR %x\n", val, s->mac_reg[ICR],
s->mac_reg[IMS]);
set_interrupt_cause(s, 0, val | s->mac_reg[ICR]);
}
static void
e1000_autoneg_timer(void *opaque)
{
E1000State *s = opaque;
if (!qemu_get_queue(s->nic)->link_down) {
e1000_autoneg_done(s);
set_ics(s, 0, E1000_ICS_LSC); /* signal link status change to guest */
}
}
static void e1000_reset(void *opaque)
{
E1000State *d = opaque;
E1000BaseClass *edc = E1000_DEVICE_GET_CLASS(d);
uint8_t *macaddr = d->conf.macaddr.a;
timer_del(d->autoneg_timer);
timer_del(d->mit_timer);
d->mit_timer_on = 0;
d->mit_irq_level = 0;
d->mit_ide = 0;
memset(d->phy_reg, 0, sizeof d->phy_reg);
memmove(d->phy_reg, phy_reg_init, sizeof phy_reg_init);
d->phy_reg[PHY_ID2] = edc->phy_id2;
memset(d->mac_reg, 0, sizeof d->mac_reg);
memmove(d->mac_reg, mac_reg_init, sizeof mac_reg_init);
d->rxbuf_min_shift = 1;
memset(&d->tx, 0, sizeof d->tx);
if (qemu_get_queue(d->nic)->link_down) {
e1000x_update_regs_on_link_down(d->mac_reg, d->phy_reg);
}
e1000x_reset_mac_addr(d->nic, d->mac_reg, macaddr);
}
static void
set_ctrl(E1000State *s, int index, uint32_t val)
{
/* RST is self clearing */
s->mac_reg[CTRL] = val & ~E1000_CTRL_RST;
}
static void
set_rx_control(E1000State *s, int index, uint32_t val)
{
s->mac_reg[RCTL] = val;
s->rxbuf_size = e1000x_rxbufsize(val);
s->rxbuf_min_shift = ((val / E1000_RCTL_RDMTS_QUAT) & 3) + 1;
DBGOUT(RX, "RCTL: %d, mac_reg[RCTL] = 0x%x\n", s->mac_reg[RDT],
s->mac_reg[RCTL]);
qemu_flush_queued_packets(qemu_get_queue(s->nic));
}
static void
set_mdic(E1000State *s, int index, uint32_t val)
{
uint32_t data = val & E1000_MDIC_DATA_MASK;
uint32_t addr = ((val & E1000_MDIC_REG_MASK) >> E1000_MDIC_REG_SHIFT);
if ((val & E1000_MDIC_PHY_MASK) >> E1000_MDIC_PHY_SHIFT != 1) // phy #
val = s->mac_reg[MDIC] | E1000_MDIC_ERROR;
else if (val & E1000_MDIC_OP_READ) {
DBGOUT(MDIC, "MDIC read reg 0x%x\n", addr);
if (!(phy_regcap[addr] & PHY_R)) {
DBGOUT(MDIC, "MDIC read reg %x unhandled\n", addr);
val |= E1000_MDIC_ERROR;
} else
val = (val ^ data) | s->phy_reg[addr];
} else if (val & E1000_MDIC_OP_WRITE) {
DBGOUT(MDIC, "MDIC write reg 0x%x, value 0x%x\n", addr, data);
if (!(phy_regcap[addr] & PHY_W)) {
DBGOUT(MDIC, "MDIC write reg %x unhandled\n", addr);
val |= E1000_MDIC_ERROR;
} else {
if (addr < NPHYWRITEOPS && phyreg_writeops[addr]) {
phyreg_writeops[addr](s, index, data);
} else {
s->phy_reg[addr] = data;
}
}
}
s->mac_reg[MDIC] = val | E1000_MDIC_READY;
if (val & E1000_MDIC_INT_EN) {
set_ics(s, 0, E1000_ICR_MDAC);
}
}
static uint32_t
get_eecd(E1000State *s, int index)
{
uint32_t ret = E1000_EECD_PRES|E1000_EECD_GNT | s->eecd_state.old_eecd;
DBGOUT(EEPROM, "reading eeprom bit %d (reading %d)\n",
s->eecd_state.bitnum_out, s->eecd_state.reading);
if (!s->eecd_state.reading ||
((s->eeprom_data[(s->eecd_state.bitnum_out >> 4) & 0x3f] >>
((s->eecd_state.bitnum_out & 0xf) ^ 0xf))) & 1)
ret |= E1000_EECD_DO;
return ret;
}
static void
set_eecd(E1000State *s, int index, uint32_t val)
{
uint32_t oldval = s->eecd_state.old_eecd;
s->eecd_state.old_eecd = val & (E1000_EECD_SK | E1000_EECD_CS |
E1000_EECD_DI|E1000_EECD_FWE_MASK|E1000_EECD_REQ);
if (!(E1000_EECD_CS & val)) { /* CS inactive; nothing to do */
return;
}
if (E1000_EECD_CS & (val ^ oldval)) { /* CS rise edge; reset state */
s->eecd_state.val_in = 0;
s->eecd_state.bitnum_in = 0;
s->eecd_state.bitnum_out = 0;
s->eecd_state.reading = 0;
e1000: Fix wrong microwire EEPROM state initialization This change fixes initialization of e1000's microwire EEPROM internal state values so that qemu's e1000 emulation works on NetBSD, which doesn't use Intel's em driver but has its own wm driver for the Intel i8254x Gigabit Ethernet. Previously set_eecd() function in e1000.c clears EEPROM internal state values on SK rising edge during CS==L, but according to FM93C06 EEPROM (which is MicroWire compatible) data sheet, EEPROM internal status should be cleared on CS rise edge regardless of SK input: "... a rising edge on this (CS) signal is required to reset the internal state-machine to accept a new cycle .." and nothing should be changed during CS (chip select) is inactive. Intel's em driver seems to explicitly raise SK output after CS is negated in em_standby_eeprom() so many other OSes that use Intel's driver don't have this problem even on the previous e1000.c implementation, but I can't find any articles that say the MICROWIRE or EEPROM spec requires such sequence, and actually hardware works fine without it (i.e. real i82540EM has been working on NetBSD). This fix also changes initialization to clear each state value in struct eecd_state individually rather than using memset() against the whole structre. The old_eecd member stores the last SK and CS signal levels and it should be preserved even after reset of internal EEPROM state to detect next signal edges for proper EEPROM emulation. Signed-off-by: Izumi Tsutsui <tsutsui@ceres.dti.ne.jp> Signed-off-by: Aurelien Jarno <aurelien@aurel32.net>
2010-07-10 16:03:45 +02:00
}
if (!(E1000_EECD_SK & (val ^ oldval))) { /* no clock edge */
return;
}
if (!(E1000_EECD_SK & val)) { /* falling edge */
s->eecd_state.bitnum_out++;
return;
}
s->eecd_state.val_in <<= 1;
if (val & E1000_EECD_DI)
s->eecd_state.val_in |= 1;
if (++s->eecd_state.bitnum_in == 9 && !s->eecd_state.reading) {
s->eecd_state.bitnum_out = ((s->eecd_state.val_in & 0x3f)<<4)-1;
s->eecd_state.reading = (((s->eecd_state.val_in >> 6) & 7) ==
EEPROM_READ_OPCODE_MICROWIRE);
}
DBGOUT(EEPROM, "eeprom bitnum in %d out %d, reading %d\n",
s->eecd_state.bitnum_in, s->eecd_state.bitnum_out,
s->eecd_state.reading);
}
static uint32_t
flash_eerd_read(E1000State *s, int x)
{
unsigned int index, r = s->mac_reg[EERD] & ~E1000_EEPROM_RW_REG_START;
if ((s->mac_reg[EERD] & E1000_EEPROM_RW_REG_START) == 0)
return (s->mac_reg[EERD]);
if ((index = r >> E1000_EEPROM_RW_ADDR_SHIFT) > EEPROM_CHECKSUM_REG)
return (E1000_EEPROM_RW_REG_DONE | r);
return ((s->eeprom_data[index] << E1000_EEPROM_RW_REG_DATA) |
E1000_EEPROM_RW_REG_DONE | r);
}
static void
putsum(uint8_t *data, uint32_t n, uint32_t sloc, uint32_t css, uint32_t cse)
{
uint32_t sum;
if (cse && cse < n)
n = cse + 1;
if (sloc < n-1) {
sum = net_checksum_add(n-css, data+css);
stw_be_p(data + sloc, net_checksum_finish_nozero(sum));
}
}
static inline void
inc_tx_bcast_or_mcast_count(E1000State *s, const unsigned char *arr)
{
if (!memcmp(arr, bcast, sizeof bcast)) {
e1000x_inc_reg_if_not_full(s->mac_reg, BPTC);
} else if (arr[0] & 1) {
e1000x_inc_reg_if_not_full(s->mac_reg, MPTC);
}
}
static void
e1000_send_packet(E1000State *s, const uint8_t *buf, int size)
{
static const int PTCregs[6] = { PTC64, PTC127, PTC255, PTC511,
PTC1023, PTC1522 };
NetClientState *nc = qemu_get_queue(s->nic);
if (s->phy_reg[PHY_CTRL] & MII_CR_LOOPBACK) {
nc->info->receive(nc, buf, size);
} else {
qemu_send_packet(nc, buf, size);
}
inc_tx_bcast_or_mcast_count(s, buf);
e1000x_increase_size_stats(s->mac_reg, PTCregs, size);
}
static void
xmit_seg(E1000State *s)
{
uint16_t len;
unsigned int frames = s->tx.tso_frames, css, sofar;
struct e1000_tx *tp = &s->tx;
e1000: Separate TSO and non-TSO contexts, fixing UDP TX corruption The device is supposed to maintain two distinct contexts for transmit offloads: one has parameters for both segmentation and checksum offload, the other only for checksum offload. The guest driver can send two context descriptors, one for each context (the TSE flag specifies which). Then the guest can refer to one or the other context in subsequent transmit data descriptors, depending on what offloads it wants applied to each packet. Currently the e1000 device stores just one context, and misinterprets the TSE flags in the context and data descriptors. This is often okay: Linux happens to send a fresh context descriptor before every data descriptor, so forgetting the other context doesn't matter. Windows does rely on separate contexts for TSO vs. non-TSO packets, but for mostly-TCP traffic the two contexts have identical TCP-specific offload parameters so confusing them doesn't matter. One case where this confusion matters is when a Windows guest sets up a TSO context for TCP and a non-TSO context for UDP, and then transmits both TCP and UDP traffic in parallel. The e1000 device sometimes ends up using TCP-specific parameters while doing checksum offload on a UDP datagram: it writes the checksum to offset 16 (the correct location for a TCP checksum), stomping on two bytes of UDP data, and leaving the wrong value in the actual UDP checksum field at offset 6. (Even worse, the host network stack may then recompute the UDP checksum, "correcting" it to match the corrupt data before sending it out a physical interface.) Correct this by tracking the TSO context independently of the non-TSO context, and selecting the appropriate context based on the TSE flag in each transmit data descriptor. Signed-off-by: Ed Swierk <eswierk@skyportsystems.com> Signed-off-by: Jason Wang <jasowang@redhat.com>
2017-11-15 00:23:34 +01:00
struct e1000x_txd_props *props = tp->cptse ? &tp->tso_props : &tp->props;
e1000: Separate TSO and non-TSO contexts, fixing UDP TX corruption The device is supposed to maintain two distinct contexts for transmit offloads: one has parameters for both segmentation and checksum offload, the other only for checksum offload. The guest driver can send two context descriptors, one for each context (the TSE flag specifies which). Then the guest can refer to one or the other context in subsequent transmit data descriptors, depending on what offloads it wants applied to each packet. Currently the e1000 device stores just one context, and misinterprets the TSE flags in the context and data descriptors. This is often okay: Linux happens to send a fresh context descriptor before every data descriptor, so forgetting the other context doesn't matter. Windows does rely on separate contexts for TSO vs. non-TSO packets, but for mostly-TCP traffic the two contexts have identical TCP-specific offload parameters so confusing them doesn't matter. One case where this confusion matters is when a Windows guest sets up a TSO context for TCP and a non-TSO context for UDP, and then transmits both TCP and UDP traffic in parallel. The e1000 device sometimes ends up using TCP-specific parameters while doing checksum offload on a UDP datagram: it writes the checksum to offset 16 (the correct location for a TCP checksum), stomping on two bytes of UDP data, and leaving the wrong value in the actual UDP checksum field at offset 6. (Even worse, the host network stack may then recompute the UDP checksum, "correcting" it to match the corrupt data before sending it out a physical interface.) Correct this by tracking the TSO context independently of the non-TSO context, and selecting the appropriate context based on the TSE flag in each transmit data descriptor. Signed-off-by: Ed Swierk <eswierk@skyportsystems.com> Signed-off-by: Jason Wang <jasowang@redhat.com>
2017-11-15 00:23:34 +01:00
if (tp->cptse) {
css = props->ipcss;
DBGOUT(TXSUM, "frames %d size %d ipcss %d\n",
frames, tp->size, css);
e1000: Separate TSO and non-TSO contexts, fixing UDP TX corruption The device is supposed to maintain two distinct contexts for transmit offloads: one has parameters for both segmentation and checksum offload, the other only for checksum offload. The guest driver can send two context descriptors, one for each context (the TSE flag specifies which). Then the guest can refer to one or the other context in subsequent transmit data descriptors, depending on what offloads it wants applied to each packet. Currently the e1000 device stores just one context, and misinterprets the TSE flags in the context and data descriptors. This is often okay: Linux happens to send a fresh context descriptor before every data descriptor, so forgetting the other context doesn't matter. Windows does rely on separate contexts for TSO vs. non-TSO packets, but for mostly-TCP traffic the two contexts have identical TCP-specific offload parameters so confusing them doesn't matter. One case where this confusion matters is when a Windows guest sets up a TSO context for TCP and a non-TSO context for UDP, and then transmits both TCP and UDP traffic in parallel. The e1000 device sometimes ends up using TCP-specific parameters while doing checksum offload on a UDP datagram: it writes the checksum to offset 16 (the correct location for a TCP checksum), stomping on two bytes of UDP data, and leaving the wrong value in the actual UDP checksum field at offset 6. (Even worse, the host network stack may then recompute the UDP checksum, "correcting" it to match the corrupt data before sending it out a physical interface.) Correct this by tracking the TSO context independently of the non-TSO context, and selecting the appropriate context based on the TSE flag in each transmit data descriptor. Signed-off-by: Ed Swierk <eswierk@skyportsystems.com> Signed-off-by: Jason Wang <jasowang@redhat.com>
2017-11-15 00:23:34 +01:00
if (props->ip) { /* IPv4 */
stw_be_p(tp->data+css+2, tp->size - css);
stw_be_p(tp->data+css+4,
lduw_be_p(tp->data + css + 4) + frames);
} else { /* IPv6 */
stw_be_p(tp->data+css+4, tp->size - css);
}
e1000: Separate TSO and non-TSO contexts, fixing UDP TX corruption The device is supposed to maintain two distinct contexts for transmit offloads: one has parameters for both segmentation and checksum offload, the other only for checksum offload. The guest driver can send two context descriptors, one for each context (the TSE flag specifies which). Then the guest can refer to one or the other context in subsequent transmit data descriptors, depending on what offloads it wants applied to each packet. Currently the e1000 device stores just one context, and misinterprets the TSE flags in the context and data descriptors. This is often okay: Linux happens to send a fresh context descriptor before every data descriptor, so forgetting the other context doesn't matter. Windows does rely on separate contexts for TSO vs. non-TSO packets, but for mostly-TCP traffic the two contexts have identical TCP-specific offload parameters so confusing them doesn't matter. One case where this confusion matters is when a Windows guest sets up a TSO context for TCP and a non-TSO context for UDP, and then transmits both TCP and UDP traffic in parallel. The e1000 device sometimes ends up using TCP-specific parameters while doing checksum offload on a UDP datagram: it writes the checksum to offset 16 (the correct location for a TCP checksum), stomping on two bytes of UDP data, and leaving the wrong value in the actual UDP checksum field at offset 6. (Even worse, the host network stack may then recompute the UDP checksum, "correcting" it to match the corrupt data before sending it out a physical interface.) Correct this by tracking the TSO context independently of the non-TSO context, and selecting the appropriate context based on the TSE flag in each transmit data descriptor. Signed-off-by: Ed Swierk <eswierk@skyportsystems.com> Signed-off-by: Jason Wang <jasowang@redhat.com>
2017-11-15 00:23:34 +01:00
css = props->tucss;
len = tp->size - css;
e1000: Separate TSO and non-TSO contexts, fixing UDP TX corruption The device is supposed to maintain two distinct contexts for transmit offloads: one has parameters for both segmentation and checksum offload, the other only for checksum offload. The guest driver can send two context descriptors, one for each context (the TSE flag specifies which). Then the guest can refer to one or the other context in subsequent transmit data descriptors, depending on what offloads it wants applied to each packet. Currently the e1000 device stores just one context, and misinterprets the TSE flags in the context and data descriptors. This is often okay: Linux happens to send a fresh context descriptor before every data descriptor, so forgetting the other context doesn't matter. Windows does rely on separate contexts for TSO vs. non-TSO packets, but for mostly-TCP traffic the two contexts have identical TCP-specific offload parameters so confusing them doesn't matter. One case where this confusion matters is when a Windows guest sets up a TSO context for TCP and a non-TSO context for UDP, and then transmits both TCP and UDP traffic in parallel. The e1000 device sometimes ends up using TCP-specific parameters while doing checksum offload on a UDP datagram: it writes the checksum to offset 16 (the correct location for a TCP checksum), stomping on two bytes of UDP data, and leaving the wrong value in the actual UDP checksum field at offset 6. (Even worse, the host network stack may then recompute the UDP checksum, "correcting" it to match the corrupt data before sending it out a physical interface.) Correct this by tracking the TSO context independently of the non-TSO context, and selecting the appropriate context based on the TSE flag in each transmit data descriptor. Signed-off-by: Ed Swierk <eswierk@skyportsystems.com> Signed-off-by: Jason Wang <jasowang@redhat.com>
2017-11-15 00:23:34 +01:00
DBGOUT(TXSUM, "tcp %d tucss %d len %d\n", props->tcp, css, len);
if (props->tcp) {
sofar = frames * props->mss;
stl_be_p(tp->data+css+4, ldl_be_p(tp->data+css+4)+sofar); /* seq */
e1000: Separate TSO and non-TSO contexts, fixing UDP TX corruption The device is supposed to maintain two distinct contexts for transmit offloads: one has parameters for both segmentation and checksum offload, the other only for checksum offload. The guest driver can send two context descriptors, one for each context (the TSE flag specifies which). Then the guest can refer to one or the other context in subsequent transmit data descriptors, depending on what offloads it wants applied to each packet. Currently the e1000 device stores just one context, and misinterprets the TSE flags in the context and data descriptors. This is often okay: Linux happens to send a fresh context descriptor before every data descriptor, so forgetting the other context doesn't matter. Windows does rely on separate contexts for TSO vs. non-TSO packets, but for mostly-TCP traffic the two contexts have identical TCP-specific offload parameters so confusing them doesn't matter. One case where this confusion matters is when a Windows guest sets up a TSO context for TCP and a non-TSO context for UDP, and then transmits both TCP and UDP traffic in parallel. The e1000 device sometimes ends up using TCP-specific parameters while doing checksum offload on a UDP datagram: it writes the checksum to offset 16 (the correct location for a TCP checksum), stomping on two bytes of UDP data, and leaving the wrong value in the actual UDP checksum field at offset 6. (Even worse, the host network stack may then recompute the UDP checksum, "correcting" it to match the corrupt data before sending it out a physical interface.) Correct this by tracking the TSO context independently of the non-TSO context, and selecting the appropriate context based on the TSE flag in each transmit data descriptor. Signed-off-by: Ed Swierk <eswierk@skyportsystems.com> Signed-off-by: Jason Wang <jasowang@redhat.com>
2017-11-15 00:23:34 +01:00
if (props->paylen - sofar > props->mss) {
tp->data[css + 13] &= ~9; /* PSH, FIN */
} else if (frames) {
e1000x_inc_reg_if_not_full(s->mac_reg, TSCTC);
}
e1000: Separate TSO and non-TSO contexts, fixing UDP TX corruption The device is supposed to maintain two distinct contexts for transmit offloads: one has parameters for both segmentation and checksum offload, the other only for checksum offload. The guest driver can send two context descriptors, one for each context (the TSE flag specifies which). Then the guest can refer to one or the other context in subsequent transmit data descriptors, depending on what offloads it wants applied to each packet. Currently the e1000 device stores just one context, and misinterprets the TSE flags in the context and data descriptors. This is often okay: Linux happens to send a fresh context descriptor before every data descriptor, so forgetting the other context doesn't matter. Windows does rely on separate contexts for TSO vs. non-TSO packets, but for mostly-TCP traffic the two contexts have identical TCP-specific offload parameters so confusing them doesn't matter. One case where this confusion matters is when a Windows guest sets up a TSO context for TCP and a non-TSO context for UDP, and then transmits both TCP and UDP traffic in parallel. The e1000 device sometimes ends up using TCP-specific parameters while doing checksum offload on a UDP datagram: it writes the checksum to offset 16 (the correct location for a TCP checksum), stomping on two bytes of UDP data, and leaving the wrong value in the actual UDP checksum field at offset 6. (Even worse, the host network stack may then recompute the UDP checksum, "correcting" it to match the corrupt data before sending it out a physical interface.) Correct this by tracking the TSO context independently of the non-TSO context, and selecting the appropriate context based on the TSE flag in each transmit data descriptor. Signed-off-by: Ed Swierk <eswierk@skyportsystems.com> Signed-off-by: Jason Wang <jasowang@redhat.com>
2017-11-15 00:23:34 +01:00
} else { /* UDP */
stw_be_p(tp->data+css+4, len);
e1000: Separate TSO and non-TSO contexts, fixing UDP TX corruption The device is supposed to maintain two distinct contexts for transmit offloads: one has parameters for both segmentation and checksum offload, the other only for checksum offload. The guest driver can send two context descriptors, one for each context (the TSE flag specifies which). Then the guest can refer to one or the other context in subsequent transmit data descriptors, depending on what offloads it wants applied to each packet. Currently the e1000 device stores just one context, and misinterprets the TSE flags in the context and data descriptors. This is often okay: Linux happens to send a fresh context descriptor before every data descriptor, so forgetting the other context doesn't matter. Windows does rely on separate contexts for TSO vs. non-TSO packets, but for mostly-TCP traffic the two contexts have identical TCP-specific offload parameters so confusing them doesn't matter. One case where this confusion matters is when a Windows guest sets up a TSO context for TCP and a non-TSO context for UDP, and then transmits both TCP and UDP traffic in parallel. The e1000 device sometimes ends up using TCP-specific parameters while doing checksum offload on a UDP datagram: it writes the checksum to offset 16 (the correct location for a TCP checksum), stomping on two bytes of UDP data, and leaving the wrong value in the actual UDP checksum field at offset 6. (Even worse, the host network stack may then recompute the UDP checksum, "correcting" it to match the corrupt data before sending it out a physical interface.) Correct this by tracking the TSO context independently of the non-TSO context, and selecting the appropriate context based on the TSE flag in each transmit data descriptor. Signed-off-by: Ed Swierk <eswierk@skyportsystems.com> Signed-off-by: Jason Wang <jasowang@redhat.com>
2017-11-15 00:23:34 +01:00
}
if (tp->sum_needed & E1000_TXD_POPTS_TXSM) {
unsigned int phsum;
// add pseudo-header length before checksum calculation
e1000: Separate TSO and non-TSO contexts, fixing UDP TX corruption The device is supposed to maintain two distinct contexts for transmit offloads: one has parameters for both segmentation and checksum offload, the other only for checksum offload. The guest driver can send two context descriptors, one for each context (the TSE flag specifies which). Then the guest can refer to one or the other context in subsequent transmit data descriptors, depending on what offloads it wants applied to each packet. Currently the e1000 device stores just one context, and misinterprets the TSE flags in the context and data descriptors. This is often okay: Linux happens to send a fresh context descriptor before every data descriptor, so forgetting the other context doesn't matter. Windows does rely on separate contexts for TSO vs. non-TSO packets, but for mostly-TCP traffic the two contexts have identical TCP-specific offload parameters so confusing them doesn't matter. One case where this confusion matters is when a Windows guest sets up a TSO context for TCP and a non-TSO context for UDP, and then transmits both TCP and UDP traffic in parallel. The e1000 device sometimes ends up using TCP-specific parameters while doing checksum offload on a UDP datagram: it writes the checksum to offset 16 (the correct location for a TCP checksum), stomping on two bytes of UDP data, and leaving the wrong value in the actual UDP checksum field at offset 6. (Even worse, the host network stack may then recompute the UDP checksum, "correcting" it to match the corrupt data before sending it out a physical interface.) Correct this by tracking the TSO context independently of the non-TSO context, and selecting the appropriate context based on the TSE flag in each transmit data descriptor. Signed-off-by: Ed Swierk <eswierk@skyportsystems.com> Signed-off-by: Jason Wang <jasowang@redhat.com>
2017-11-15 00:23:34 +01:00
void *sp = tp->data + props->tucso;
phsum = lduw_be_p(sp) + len;
phsum = (phsum >> 16) + (phsum & 0xffff);
stw_be_p(sp, phsum);
}
tp->tso_frames++;
}
if (tp->sum_needed & E1000_TXD_POPTS_TXSM) {
e1000: Separate TSO and non-TSO contexts, fixing UDP TX corruption The device is supposed to maintain two distinct contexts for transmit offloads: one has parameters for both segmentation and checksum offload, the other only for checksum offload. The guest driver can send two context descriptors, one for each context (the TSE flag specifies which). Then the guest can refer to one or the other context in subsequent transmit data descriptors, depending on what offloads it wants applied to each packet. Currently the e1000 device stores just one context, and misinterprets the TSE flags in the context and data descriptors. This is often okay: Linux happens to send a fresh context descriptor before every data descriptor, so forgetting the other context doesn't matter. Windows does rely on separate contexts for TSO vs. non-TSO packets, but for mostly-TCP traffic the two contexts have identical TCP-specific offload parameters so confusing them doesn't matter. One case where this confusion matters is when a Windows guest sets up a TSO context for TCP and a non-TSO context for UDP, and then transmits both TCP and UDP traffic in parallel. The e1000 device sometimes ends up using TCP-specific parameters while doing checksum offload on a UDP datagram: it writes the checksum to offset 16 (the correct location for a TCP checksum), stomping on two bytes of UDP data, and leaving the wrong value in the actual UDP checksum field at offset 6. (Even worse, the host network stack may then recompute the UDP checksum, "correcting" it to match the corrupt data before sending it out a physical interface.) Correct this by tracking the TSO context independently of the non-TSO context, and selecting the appropriate context based on the TSE flag in each transmit data descriptor. Signed-off-by: Ed Swierk <eswierk@skyportsystems.com> Signed-off-by: Jason Wang <jasowang@redhat.com>
2017-11-15 00:23:34 +01:00
putsum(tp->data, tp->size, props->tucso, props->tucss, props->tucse);
}
if (tp->sum_needed & E1000_TXD_POPTS_IXSM) {
e1000: Separate TSO and non-TSO contexts, fixing UDP TX corruption The device is supposed to maintain two distinct contexts for transmit offloads: one has parameters for both segmentation and checksum offload, the other only for checksum offload. The guest driver can send two context descriptors, one for each context (the TSE flag specifies which). Then the guest can refer to one or the other context in subsequent transmit data descriptors, depending on what offloads it wants applied to each packet. Currently the e1000 device stores just one context, and misinterprets the TSE flags in the context and data descriptors. This is often okay: Linux happens to send a fresh context descriptor before every data descriptor, so forgetting the other context doesn't matter. Windows does rely on separate contexts for TSO vs. non-TSO packets, but for mostly-TCP traffic the two contexts have identical TCP-specific offload parameters so confusing them doesn't matter. One case where this confusion matters is when a Windows guest sets up a TSO context for TCP and a non-TSO context for UDP, and then transmits both TCP and UDP traffic in parallel. The e1000 device sometimes ends up using TCP-specific parameters while doing checksum offload on a UDP datagram: it writes the checksum to offset 16 (the correct location for a TCP checksum), stomping on two bytes of UDP data, and leaving the wrong value in the actual UDP checksum field at offset 6. (Even worse, the host network stack may then recompute the UDP checksum, "correcting" it to match the corrupt data before sending it out a physical interface.) Correct this by tracking the TSO context independently of the non-TSO context, and selecting the appropriate context based on the TSE flag in each transmit data descriptor. Signed-off-by: Ed Swierk <eswierk@skyportsystems.com> Signed-off-by: Jason Wang <jasowang@redhat.com>
2017-11-15 00:23:34 +01:00
putsum(tp->data, tp->size, props->ipcso, props->ipcss, props->ipcse);
}
if (tp->vlan_needed) {
memmove(tp->vlan, tp->data, 4);
memmove(tp->data, tp->data + 4, 8);
memcpy(tp->data + 8, tp->vlan_header, 4);
e1000_send_packet(s, tp->vlan, tp->size + 4);
} else {
e1000_send_packet(s, tp->data, tp->size);
}
e1000x_inc_reg_if_not_full(s->mac_reg, TPT);
e1000x_grow_8reg_if_not_full(s->mac_reg, TOTL, s->tx.size);
s->mac_reg[GPTC] = s->mac_reg[TPT];
s->mac_reg[GOTCL] = s->mac_reg[TOTL];
s->mac_reg[GOTCH] = s->mac_reg[TOTH];
}
static void
process_tx_desc(E1000State *s, struct e1000_tx_desc *dp)
{
PCIDevice *d = PCI_DEVICE(s);
uint32_t txd_lower = le32_to_cpu(dp->lower.data);
uint32_t dtype = txd_lower & (E1000_TXD_CMD_DEXT | E1000_TXD_DTYP_D);
unsigned int split_size = txd_lower & 0xffff, bytes, sz;
unsigned int msh = 0xfffff;
uint64_t addr;
struct e1000_context_desc *xp = (struct e1000_context_desc *)dp;
struct e1000_tx *tp = &s->tx;
s->mit_ide |= (txd_lower & E1000_TXD_CMD_IDE);
if (dtype == E1000_TXD_CMD_DEXT) { /* context descriptor */
e1000: Separate TSO and non-TSO contexts, fixing UDP TX corruption The device is supposed to maintain two distinct contexts for transmit offloads: one has parameters for both segmentation and checksum offload, the other only for checksum offload. The guest driver can send two context descriptors, one for each context (the TSE flag specifies which). Then the guest can refer to one or the other context in subsequent transmit data descriptors, depending on what offloads it wants applied to each packet. Currently the e1000 device stores just one context, and misinterprets the TSE flags in the context and data descriptors. This is often okay: Linux happens to send a fresh context descriptor before every data descriptor, so forgetting the other context doesn't matter. Windows does rely on separate contexts for TSO vs. non-TSO packets, but for mostly-TCP traffic the two contexts have identical TCP-specific offload parameters so confusing them doesn't matter. One case where this confusion matters is when a Windows guest sets up a TSO context for TCP and a non-TSO context for UDP, and then transmits both TCP and UDP traffic in parallel. The e1000 device sometimes ends up using TCP-specific parameters while doing checksum offload on a UDP datagram: it writes the checksum to offset 16 (the correct location for a TCP checksum), stomping on two bytes of UDP data, and leaving the wrong value in the actual UDP checksum field at offset 6. (Even worse, the host network stack may then recompute the UDP checksum, "correcting" it to match the corrupt data before sending it out a physical interface.) Correct this by tracking the TSO context independently of the non-TSO context, and selecting the appropriate context based on the TSE flag in each transmit data descriptor. Signed-off-by: Ed Swierk <eswierk@skyportsystems.com> Signed-off-by: Jason Wang <jasowang@redhat.com>
2017-11-15 00:23:34 +01:00
if (le32_to_cpu(xp->cmd_and_length) & E1000_TXD_CMD_TSE) {
e1000x_read_tx_ctx_descr(xp, &tp->tso_props);
tp->tso_frames = 0;
} else {
e1000x_read_tx_ctx_descr(xp, &tp->props);
}
return;
} else if (dtype == (E1000_TXD_CMD_DEXT | E1000_TXD_DTYP_D)) {
// data descriptor
if (tp->size == 0) {
tp->sum_needed = le32_to_cpu(dp->upper.data) >> 8;
}
tp->cptse = (txd_lower & E1000_TXD_CMD_TSE) ? 1 : 0;
} else {
// legacy descriptor
tp->cptse = 0;
}
if (e1000x_vlan_enabled(s->mac_reg) &&
e1000x_is_vlan_txd(txd_lower) &&
(tp->cptse || txd_lower & E1000_TXD_CMD_EOP)) {
tp->vlan_needed = 1;
stw_be_p(tp->vlan_header,
le16_to_cpu(s->mac_reg[VET]));
stw_be_p(tp->vlan_header + 2,
le16_to_cpu(dp->upper.fields.special));
}
addr = le64_to_cpu(dp->buffer_addr);
e1000: Separate TSO and non-TSO contexts, fixing UDP TX corruption The device is supposed to maintain two distinct contexts for transmit offloads: one has parameters for both segmentation and checksum offload, the other only for checksum offload. The guest driver can send two context descriptors, one for each context (the TSE flag specifies which). Then the guest can refer to one or the other context in subsequent transmit data descriptors, depending on what offloads it wants applied to each packet. Currently the e1000 device stores just one context, and misinterprets the TSE flags in the context and data descriptors. This is often okay: Linux happens to send a fresh context descriptor before every data descriptor, so forgetting the other context doesn't matter. Windows does rely on separate contexts for TSO vs. non-TSO packets, but for mostly-TCP traffic the two contexts have identical TCP-specific offload parameters so confusing them doesn't matter. One case where this confusion matters is when a Windows guest sets up a TSO context for TCP and a non-TSO context for UDP, and then transmits both TCP and UDP traffic in parallel. The e1000 device sometimes ends up using TCP-specific parameters while doing checksum offload on a UDP datagram: it writes the checksum to offset 16 (the correct location for a TCP checksum), stomping on two bytes of UDP data, and leaving the wrong value in the actual UDP checksum field at offset 6. (Even worse, the host network stack may then recompute the UDP checksum, "correcting" it to match the corrupt data before sending it out a physical interface.) Correct this by tracking the TSO context independently of the non-TSO context, and selecting the appropriate context based on the TSE flag in each transmit data descriptor. Signed-off-by: Ed Swierk <eswierk@skyportsystems.com> Signed-off-by: Jason Wang <jasowang@redhat.com>
2017-11-15 00:23:34 +01:00
if (tp->cptse) {
msh = tp->tso_props.hdr_len + tp->tso_props.mss;
do {
bytes = split_size;
if (tp->size + bytes > msh)
bytes = msh - tp->size;
bytes = MIN(sizeof(tp->data) - tp->size, bytes);
pci_dma_read(d, addr, tp->data + tp->size, bytes);
sz = tp->size + bytes;
e1000: Separate TSO and non-TSO contexts, fixing UDP TX corruption The device is supposed to maintain two distinct contexts for transmit offloads: one has parameters for both segmentation and checksum offload, the other only for checksum offload. The guest driver can send two context descriptors, one for each context (the TSE flag specifies which). Then the guest can refer to one or the other context in subsequent transmit data descriptors, depending on what offloads it wants applied to each packet. Currently the e1000 device stores just one context, and misinterprets the TSE flags in the context and data descriptors. This is often okay: Linux happens to send a fresh context descriptor before every data descriptor, so forgetting the other context doesn't matter. Windows does rely on separate contexts for TSO vs. non-TSO packets, but for mostly-TCP traffic the two contexts have identical TCP-specific offload parameters so confusing them doesn't matter. One case where this confusion matters is when a Windows guest sets up a TSO context for TCP and a non-TSO context for UDP, and then transmits both TCP and UDP traffic in parallel. The e1000 device sometimes ends up using TCP-specific parameters while doing checksum offload on a UDP datagram: it writes the checksum to offset 16 (the correct location for a TCP checksum), stomping on two bytes of UDP data, and leaving the wrong value in the actual UDP checksum field at offset 6. (Even worse, the host network stack may then recompute the UDP checksum, "correcting" it to match the corrupt data before sending it out a physical interface.) Correct this by tracking the TSO context independently of the non-TSO context, and selecting the appropriate context based on the TSE flag in each transmit data descriptor. Signed-off-by: Ed Swierk <eswierk@skyportsystems.com> Signed-off-by: Jason Wang <jasowang@redhat.com>
2017-11-15 00:23:34 +01:00
if (sz >= tp->tso_props.hdr_len
&& tp->size < tp->tso_props.hdr_len) {
memmove(tp->header, tp->data, tp->tso_props.hdr_len);
}
tp->size = sz;
addr += bytes;
if (sz == msh) {
xmit_seg(s);
e1000: Separate TSO and non-TSO contexts, fixing UDP TX corruption The device is supposed to maintain two distinct contexts for transmit offloads: one has parameters for both segmentation and checksum offload, the other only for checksum offload. The guest driver can send two context descriptors, one for each context (the TSE flag specifies which). Then the guest can refer to one or the other context in subsequent transmit data descriptors, depending on what offloads it wants applied to each packet. Currently the e1000 device stores just one context, and misinterprets the TSE flags in the context and data descriptors. This is often okay: Linux happens to send a fresh context descriptor before every data descriptor, so forgetting the other context doesn't matter. Windows does rely on separate contexts for TSO vs. non-TSO packets, but for mostly-TCP traffic the two contexts have identical TCP-specific offload parameters so confusing them doesn't matter. One case where this confusion matters is when a Windows guest sets up a TSO context for TCP and a non-TSO context for UDP, and then transmits both TCP and UDP traffic in parallel. The e1000 device sometimes ends up using TCP-specific parameters while doing checksum offload on a UDP datagram: it writes the checksum to offset 16 (the correct location for a TCP checksum), stomping on two bytes of UDP data, and leaving the wrong value in the actual UDP checksum field at offset 6. (Even worse, the host network stack may then recompute the UDP checksum, "correcting" it to match the corrupt data before sending it out a physical interface.) Correct this by tracking the TSO context independently of the non-TSO context, and selecting the appropriate context based on the TSE flag in each transmit data descriptor. Signed-off-by: Ed Swierk <eswierk@skyportsystems.com> Signed-off-by: Jason Wang <jasowang@redhat.com>
2017-11-15 00:23:34 +01:00
memmove(tp->data, tp->header, tp->tso_props.hdr_len);
tp->size = tp->tso_props.hdr_len;
}
split_size -= bytes;
} while (bytes && split_size);
} else {
split_size = MIN(sizeof(tp->data) - tp->size, split_size);
pci_dma_read(d, addr, tp->data + tp->size, split_size);
tp->size += split_size;
}
if (!(txd_lower & E1000_TXD_CMD_EOP))
return;
e1000: Separate TSO and non-TSO contexts, fixing UDP TX corruption The device is supposed to maintain two distinct contexts for transmit offloads: one has parameters for both segmentation and checksum offload, the other only for checksum offload. The guest driver can send two context descriptors, one for each context (the TSE flag specifies which). Then the guest can refer to one or the other context in subsequent transmit data descriptors, depending on what offloads it wants applied to each packet. Currently the e1000 device stores just one context, and misinterprets the TSE flags in the context and data descriptors. This is often okay: Linux happens to send a fresh context descriptor before every data descriptor, so forgetting the other context doesn't matter. Windows does rely on separate contexts for TSO vs. non-TSO packets, but for mostly-TCP traffic the two contexts have identical TCP-specific offload parameters so confusing them doesn't matter. One case where this confusion matters is when a Windows guest sets up a TSO context for TCP and a non-TSO context for UDP, and then transmits both TCP and UDP traffic in parallel. The e1000 device sometimes ends up using TCP-specific parameters while doing checksum offload on a UDP datagram: it writes the checksum to offset 16 (the correct location for a TCP checksum), stomping on two bytes of UDP data, and leaving the wrong value in the actual UDP checksum field at offset 6. (Even worse, the host network stack may then recompute the UDP checksum, "correcting" it to match the corrupt data before sending it out a physical interface.) Correct this by tracking the TSO context independently of the non-TSO context, and selecting the appropriate context based on the TSE flag in each transmit data descriptor. Signed-off-by: Ed Swierk <eswierk@skyportsystems.com> Signed-off-by: Jason Wang <jasowang@redhat.com>
2017-11-15 00:23:34 +01:00
if (!(tp->cptse && tp->size < tp->tso_props.hdr_len)) {
xmit_seg(s);
}
tp->tso_frames = 0;
tp->sum_needed = 0;
tp->vlan_needed = 0;
tp->size = 0;
tp->cptse = 0;
}
static uint32_t
txdesc_writeback(E1000State *s, dma_addr_t base, struct e1000_tx_desc *dp)
{
PCIDevice *d = PCI_DEVICE(s);
uint32_t txd_upper, txd_lower = le32_to_cpu(dp->lower.data);
if (!(txd_lower & (E1000_TXD_CMD_RS|E1000_TXD_CMD_RPS)))
return 0;
txd_upper = (le32_to_cpu(dp->upper.data) | E1000_TXD_STAT_DD) &
~(E1000_TXD_STAT_EC | E1000_TXD_STAT_LC | E1000_TXD_STAT_TU);
dp->upper.data = cpu_to_le32(txd_upper);
pci_dma_write(d, base + ((char *)&dp->upper - (char *)dp),
&dp->upper, sizeof(dp->upper));
return E1000_ICR_TXDW;
}
static uint64_t tx_desc_base(E1000State *s)
{
uint64_t bah = s->mac_reg[TDBAH];
uint64_t bal = s->mac_reg[TDBAL] & ~0xf;
return (bah << 32) + bal;
}
static void
start_xmit(E1000State *s)
{
PCIDevice *d = PCI_DEVICE(s);
dma_addr_t base;
struct e1000_tx_desc desc;
uint32_t tdh_start = s->mac_reg[TDH], cause = E1000_ICS_TXQE;
if (!(s->mac_reg[TCTL] & E1000_TCTL_EN)) {
DBGOUT(TX, "tx disabled\n");
return;
}
while (s->mac_reg[TDH] != s->mac_reg[TDT]) {
base = tx_desc_base(s) +
sizeof(struct e1000_tx_desc) * s->mac_reg[TDH];
pci_dma_read(d, base, &desc, sizeof(desc));
DBGOUT(TX, "index %d: %p : %x %x\n", s->mac_reg[TDH],
(void *)(intptr_t)desc.buffer_addr, desc.lower.data,
desc.upper.data);
process_tx_desc(s, &desc);
cause |= txdesc_writeback(s, base, &desc);
if (++s->mac_reg[TDH] * sizeof(desc) >= s->mac_reg[TDLEN])
s->mac_reg[TDH] = 0;
/*
* the following could happen only if guest sw assigns
* bogus values to TDT/TDLEN.
* there's nothing too intelligent we could do about this.
*/
e1000: eliminate infinite loops on out-of-bounds transfer start The start_xmit() and e1000_receive_iov() functions implement DMA transfers iterating over a set of descriptors that the guest's e1000 driver prepares: - the TDLEN and RDLEN registers store the total size of the descriptor area, - while the TDH and RDH registers store the offset (in whole tx / rx descriptors) into the area where the transfer is supposed to start. Each time a descriptor is processed, the TDH and RDH register is bumped (as appropriate for the transfer direction). QEMU already contains logic to deal with bogus transfers submitted by the guest: - Normally, the transmit case wants to increase TDH from its initial value to TDT. (TDT is allowed to be numerically smaller than the initial TDH value; wrapping at or above TDLEN bytes to zero is normal.) The failsafe that QEMU currently has here is a check against reaching the original TDH value again -- a complete wraparound, which should never happen. - In the receive case RDH is increased from its initial value until "total_size" bytes have been received; preferably in a single step, or in "s->rxbuf_size" byte steps, if the latter is smaller. However, null RX descriptors are skipped without receiving data, while RDH is incremented just the same. QEMU tries to prevent an infinite loop (processing only null RX descriptors) by detecting whether RDH assumes its original value during the loop. (Again, wrapping from RDLEN to 0 is normal.) What both directions miss is that the guest could program TDLEN and RDLEN so low, and the initial TDH and RDH so high, that these registers will immediately be truncated to zero, and then never reassume their initial values in the loop -- a full wraparound will never occur. The condition that expresses this is: xdh_start >= s->mac_reg[XDLEN] / sizeof(desc) i.e., TDH or RDH start out after the last whole rx or tx descriptor that fits into the TDLEN or RDLEN sized area. This condition could be checked before we enter the loops, but pci_dma_read() / pci_dma_write() knows how to fill in buffers safely for bogus DMA addresses, so we just extend the existing failsafes with the above condition. This is CVE-2016-1981. Cc: "Michael S. Tsirkin" <mst@redhat.com> Cc: Petr Matousek <pmatouse@redhat.com> Cc: Stefano Stabellini <stefano.stabellini@eu.citrix.com> Cc: Prasad Pandit <ppandit@redhat.com> Cc: Michael Roth <mdroth@linux.vnet.ibm.com> Cc: Jason Wang <jasowang@redhat.com> Cc: qemu-stable@nongnu.org RHBZ: https://bugzilla.redhat.com/show_bug.cgi?id=1296044 Signed-off-by: Laszlo Ersek <lersek@redhat.com> Reviewed-by: Jason Wang <jasowang@redhat.com> Signed-off-by: Jason Wang <jasowang@redhat.com>
2016-01-19 14:17:20 +01:00
if (s->mac_reg[TDH] == tdh_start ||
tdh_start >= s->mac_reg[TDLEN] / sizeof(desc)) {
DBGOUT(TXERR, "TDH wraparound @%x, TDT %x, TDLEN %x\n",
tdh_start, s->mac_reg[TDT], s->mac_reg[TDLEN]);
break;
}
}
set_ics(s, 0, cause);
}
static int
receive_filter(E1000State *s, const uint8_t *buf, int size)
{
uint32_t rctl = s->mac_reg[RCTL];
int isbcast = !memcmp(buf, bcast, sizeof bcast), ismcast = (buf[0] & 1);
if (e1000x_is_vlan_packet(buf, le16_to_cpu(s->mac_reg[VET])) &&
e1000x_vlan_rx_filter_enabled(s->mac_reg)) {
uint16_t vid = lduw_be_p(buf + 14);
uint32_t vfta = ldl_le_p((uint32_t*)(s->mac_reg + VFTA) +
((vid >> 5) & 0x7f));
if ((vfta & (1 << (vid & 0x1f))) == 0)
return 0;
}
if (!isbcast && !ismcast && (rctl & E1000_RCTL_UPE)) { /* promiscuous ucast */
return 1;
}
if (ismcast && (rctl & E1000_RCTL_MPE)) { /* promiscuous mcast */
e1000x_inc_reg_if_not_full(s->mac_reg, MPRC);
return 1;
}
if (isbcast && (rctl & E1000_RCTL_BAM)) { /* broadcast enabled */
e1000x_inc_reg_if_not_full(s->mac_reg, BPRC);
return 1;
}
return e1000x_rx_group_filter(s->mac_reg, buf);
}
static void
e1000_set_link_status(NetClientState *nc)
{
E1000State *s = qemu_get_nic_opaque(nc);
uint32_t old_status = s->mac_reg[STATUS];
e1000: use MII status register for link up/down Some guests will use the standard MII status register to verify link state. They will not notice link changes unless this register is updated. Verified with Linux 3.0 and Windows XP guests. Without this patch, ethtool will report speed and duplex as unknown when the link is down, but still report the link as up. This is because the Linux e1000 driver checks the mac_reg[STATUS] register link state before it checks speed and duplex, but uses the phy_reg[PHY_STATUS] register for the actual link state check. Fix by updating both registers on link state changes. Linux guest before: (qemu) set_link e1000.0 off kvm-sid:~# ethtool eth0 Settings for eth0: Supported ports: [ TP ] Supported link modes: 10baseT/Half 10baseT/Full 100baseT/Half 100baseT/Full 1000baseT/Full Supports auto-negotiation: Yes Advertised link modes: 10baseT/Half 10baseT/Full 100baseT/Half 100baseT/Full 1000baseT/Full Advertised pause frame use: No Advertised auto-negotiation: Yes Speed: Unknown! Duplex: Unknown! (255) Port: Twisted Pair PHYAD: 0 Transceiver: internal Auto-negotiation: on MDI-X: Unknown Supports Wake-on: umbg Wake-on: d Current message level: 0x00000007 (7) drv probe link Link detected: yes (qemu) set_link e1000.0 on Linux guest after: (qemu) set_link e1000.0 off [ 63.384221] e1000: eth0 NIC Link is Down kvm-sid:~# ethtool eth0 Settings for eth0: Supported ports: [ TP ] Supported link modes: 10baseT/Half 10baseT/Full 100baseT/Half 100baseT/Full 1000baseT/Full Supports auto-negotiation: Yes Advertised link modes: 10baseT/Half 10baseT/Full 100baseT/Half 100baseT/Full 1000baseT/Full Advertised pause frame use: No Advertised auto-negotiation: Yes Speed: Unknown! Duplex: Unknown! (255) Port: Twisted Pair PHYAD: 0 Transceiver: internal Auto-negotiation: on MDI-X: Unknown Supports Wake-on: umbg Wake-on: d Current message level: 0x00000007 (7) drv probe link Link detected: no (qemu) set_link e1000.0 on [ 84.304582] e1000: eth0 NIC Link is Up 1000 Mbps Full Duplex, Flow Control: RX Signed-off-by: Bjørn Mork <bjorn@mork.no> Signed-off-by: Anthony Liguori <aliguori@us.ibm.com>
2011-08-17 11:03:14 +02:00
if (nc->link_down) {
e1000x_update_regs_on_link_down(s->mac_reg, s->phy_reg);
e1000: use MII status register for link up/down Some guests will use the standard MII status register to verify link state. They will not notice link changes unless this register is updated. Verified with Linux 3.0 and Windows XP guests. Without this patch, ethtool will report speed and duplex as unknown when the link is down, but still report the link as up. This is because the Linux e1000 driver checks the mac_reg[STATUS] register link state before it checks speed and duplex, but uses the phy_reg[PHY_STATUS] register for the actual link state check. Fix by updating both registers on link state changes. Linux guest before: (qemu) set_link e1000.0 off kvm-sid:~# ethtool eth0 Settings for eth0: Supported ports: [ TP ] Supported link modes: 10baseT/Half 10baseT/Full 100baseT/Half 100baseT/Full 1000baseT/Full Supports auto-negotiation: Yes Advertised link modes: 10baseT/Half 10baseT/Full 100baseT/Half 100baseT/Full 1000baseT/Full Advertised pause frame use: No Advertised auto-negotiation: Yes Speed: Unknown! Duplex: Unknown! (255) Port: Twisted Pair PHYAD: 0 Transceiver: internal Auto-negotiation: on MDI-X: Unknown Supports Wake-on: umbg Wake-on: d Current message level: 0x00000007 (7) drv probe link Link detected: yes (qemu) set_link e1000.0 on Linux guest after: (qemu) set_link e1000.0 off [ 63.384221] e1000: eth0 NIC Link is Down kvm-sid:~# ethtool eth0 Settings for eth0: Supported ports: [ TP ] Supported link modes: 10baseT/Half 10baseT/Full 100baseT/Half 100baseT/Full 1000baseT/Full Supports auto-negotiation: Yes Advertised link modes: 10baseT/Half 10baseT/Full 100baseT/Half 100baseT/Full 1000baseT/Full Advertised pause frame use: No Advertised auto-negotiation: Yes Speed: Unknown! Duplex: Unknown! (255) Port: Twisted Pair PHYAD: 0 Transceiver: internal Auto-negotiation: on MDI-X: Unknown Supports Wake-on: umbg Wake-on: d Current message level: 0x00000007 (7) drv probe link Link detected: no (qemu) set_link e1000.0 on [ 84.304582] e1000: eth0 NIC Link is Up 1000 Mbps Full Duplex, Flow Control: RX Signed-off-by: Bjørn Mork <bjorn@mork.no> Signed-off-by: Anthony Liguori <aliguori@us.ibm.com>
2011-08-17 11:03:14 +02:00
} else {
if (have_autoneg(s) &&
!(s->phy_reg[PHY_STATUS] & MII_SR_AUTONEG_COMPLETE)) {
e1000x_restart_autoneg(s->mac_reg, s->phy_reg, s->autoneg_timer);
} else {
e1000_link_up(s);
}
e1000: use MII status register for link up/down Some guests will use the standard MII status register to verify link state. They will not notice link changes unless this register is updated. Verified with Linux 3.0 and Windows XP guests. Without this patch, ethtool will report speed and duplex as unknown when the link is down, but still report the link as up. This is because the Linux e1000 driver checks the mac_reg[STATUS] register link state before it checks speed and duplex, but uses the phy_reg[PHY_STATUS] register for the actual link state check. Fix by updating both registers on link state changes. Linux guest before: (qemu) set_link e1000.0 off kvm-sid:~# ethtool eth0 Settings for eth0: Supported ports: [ TP ] Supported link modes: 10baseT/Half 10baseT/Full 100baseT/Half 100baseT/Full 1000baseT/Full Supports auto-negotiation: Yes Advertised link modes: 10baseT/Half 10baseT/Full 100baseT/Half 100baseT/Full 1000baseT/Full Advertised pause frame use: No Advertised auto-negotiation: Yes Speed: Unknown! Duplex: Unknown! (255) Port: Twisted Pair PHYAD: 0 Transceiver: internal Auto-negotiation: on MDI-X: Unknown Supports Wake-on: umbg Wake-on: d Current message level: 0x00000007 (7) drv probe link Link detected: yes (qemu) set_link e1000.0 on Linux guest after: (qemu) set_link e1000.0 off [ 63.384221] e1000: eth0 NIC Link is Down kvm-sid:~# ethtool eth0 Settings for eth0: Supported ports: [ TP ] Supported link modes: 10baseT/Half 10baseT/Full 100baseT/Half 100baseT/Full 1000baseT/Full Supports auto-negotiation: Yes Advertised link modes: 10baseT/Half 10baseT/Full 100baseT/Half 100baseT/Full 1000baseT/Full Advertised pause frame use: No Advertised auto-negotiation: Yes Speed: Unknown! Duplex: Unknown! (255) Port: Twisted Pair PHYAD: 0 Transceiver: internal Auto-negotiation: on MDI-X: Unknown Supports Wake-on: umbg Wake-on: d Current message level: 0x00000007 (7) drv probe link Link detected: no (qemu) set_link e1000.0 on [ 84.304582] e1000: eth0 NIC Link is Up 1000 Mbps Full Duplex, Flow Control: RX Signed-off-by: Bjørn Mork <bjorn@mork.no> Signed-off-by: Anthony Liguori <aliguori@us.ibm.com>
2011-08-17 11:03:14 +02:00
}
if (s->mac_reg[STATUS] != old_status)
set_ics(s, 0, E1000_ICR_LSC);
}
static bool e1000_has_rxbufs(E1000State *s, size_t total_size)
{
int bufs;
/* Fast-path short packets */
if (total_size <= s->rxbuf_size) {
return s->mac_reg[RDH] != s->mac_reg[RDT];
}
if (s->mac_reg[RDH] < s->mac_reg[RDT]) {
bufs = s->mac_reg[RDT] - s->mac_reg[RDH];
} else if (s->mac_reg[RDH] > s->mac_reg[RDT]) {
bufs = s->mac_reg[RDLEN] / sizeof(struct e1000_rx_desc) +
s->mac_reg[RDT] - s->mac_reg[RDH];
} else {
return false;
}
return total_size <= bufs * s->rxbuf_size;
}
static int
e1000_can_receive(NetClientState *nc)
{
E1000State *s = qemu_get_nic_opaque(nc);
return e1000x_rx_ready(&s->parent_obj, s->mac_reg) &&
e1000_has_rxbufs(s, 1);
}
static uint64_t rx_desc_base(E1000State *s)
{
uint64_t bah = s->mac_reg[RDBAH];
uint64_t bal = s->mac_reg[RDBAL] & ~0xf;
return (bah << 32) + bal;
}
static ssize_t
e1000_receive_iov(NetClientState *nc, const struct iovec *iov, int iovcnt)
{
E1000State *s = qemu_get_nic_opaque(nc);
PCIDevice *d = PCI_DEVICE(s);
struct e1000_rx_desc desc;
dma_addr_t base;
unsigned int n, rdt;
uint32_t rdh_start;
uint16_t vlan_special = 0;
uint8_t vlan_status = 0;
uint8_t min_buf[MIN_BUF_SIZE];
struct iovec min_iov;
uint8_t *filter_buf = iov->iov_base;
size_t size = iov_size(iov, iovcnt);
size_t iov_ofs = 0;
size_t desc_offset;
size_t desc_size;
size_t total_size;
if (!e1000x_hw_rx_enabled(s->mac_reg)) {
return -1;
}
/* Pad to minimum Ethernet frame length */
if (size < sizeof(min_buf)) {
iov_to_buf(iov, iovcnt, 0, min_buf, size);
memset(&min_buf[size], 0, sizeof(min_buf) - size);
e1000x_inc_reg_if_not_full(s->mac_reg, RUC);
min_iov.iov_base = filter_buf = min_buf;
min_iov.iov_len = size = sizeof(min_buf);
iovcnt = 1;
iov = &min_iov;
} else if (iov->iov_len < MAXIMUM_ETHERNET_HDR_LEN) {
/* This is very unlikely, but may happen. */
iov_to_buf(iov, iovcnt, 0, min_buf, MAXIMUM_ETHERNET_HDR_LEN);
filter_buf = min_buf;
}
/* Discard oversized packets if !LPE and !SBP. */
if (e1000x_is_oversized(s->mac_reg, size)) {
return size;
}
if (!receive_filter(s, filter_buf, size)) {
return size;
}
if (e1000x_vlan_enabled(s->mac_reg) &&
e1000x_is_vlan_packet(filter_buf, le16_to_cpu(s->mac_reg[VET]))) {
vlan_special = cpu_to_le16(lduw_be_p(filter_buf + 14));
iov_ofs = 4;
if (filter_buf == iov->iov_base) {
memmove(filter_buf + 4, filter_buf, 12);
} else {
iov_from_buf(iov, iovcnt, 4, filter_buf, 12);
while (iov->iov_len <= iov_ofs) {
iov_ofs -= iov->iov_len;
iov++;
}
}
vlan_status = E1000_RXD_STAT_VP;
size -= 4;
}
rdh_start = s->mac_reg[RDH];
desc_offset = 0;
total_size = size + e1000x_fcs_len(s->mac_reg);
if (!e1000_has_rxbufs(s, total_size)) {
set_ics(s, 0, E1000_ICS_RXO);
return -1;
}
do {
desc_size = total_size - desc_offset;
if (desc_size > s->rxbuf_size) {
desc_size = s->rxbuf_size;
}
base = rx_desc_base(s) + sizeof(desc) * s->mac_reg[RDH];
pci_dma_read(d, base, &desc, sizeof(desc));
desc.special = vlan_special;
desc.status |= (vlan_status | E1000_RXD_STAT_DD);
if (desc.buffer_addr) {
if (desc_offset < size) {
size_t iov_copy;
hwaddr ba = le64_to_cpu(desc.buffer_addr);
size_t copy_size = size - desc_offset;
if (copy_size > s->rxbuf_size) {
copy_size = s->rxbuf_size;
}
do {
iov_copy = MIN(copy_size, iov->iov_len - iov_ofs);
pci_dma_write(d, ba, iov->iov_base + iov_ofs, iov_copy);
copy_size -= iov_copy;
ba += iov_copy;
iov_ofs += iov_copy;
if (iov_ofs == iov->iov_len) {
iov++;
iov_ofs = 0;
}
} while (copy_size);
}
desc_offset += desc_size;
desc.length = cpu_to_le16(desc_size);
if (desc_offset >= total_size) {
desc.status |= E1000_RXD_STAT_EOP | E1000_RXD_STAT_IXSM;
} else {
/* Guest zeroing out status is not a hardware requirement.
Clear EOP in case guest didn't do it. */
desc.status &= ~E1000_RXD_STAT_EOP;
}
} else { // as per intel docs; skip descriptors with null buf addr
DBGOUT(RX, "Null RX descriptor!!\n");
}
pci_dma_write(d, base, &desc, sizeof(desc));
if (++s->mac_reg[RDH] * sizeof(desc) >= s->mac_reg[RDLEN])
s->mac_reg[RDH] = 0;
/* see comment in start_xmit; same here */
e1000: eliminate infinite loops on out-of-bounds transfer start The start_xmit() and e1000_receive_iov() functions implement DMA transfers iterating over a set of descriptors that the guest's e1000 driver prepares: - the TDLEN and RDLEN registers store the total size of the descriptor area, - while the TDH and RDH registers store the offset (in whole tx / rx descriptors) into the area where the transfer is supposed to start. Each time a descriptor is processed, the TDH and RDH register is bumped (as appropriate for the transfer direction). QEMU already contains logic to deal with bogus transfers submitted by the guest: - Normally, the transmit case wants to increase TDH from its initial value to TDT. (TDT is allowed to be numerically smaller than the initial TDH value; wrapping at or above TDLEN bytes to zero is normal.) The failsafe that QEMU currently has here is a check against reaching the original TDH value again -- a complete wraparound, which should never happen. - In the receive case RDH is increased from its initial value until "total_size" bytes have been received; preferably in a single step, or in "s->rxbuf_size" byte steps, if the latter is smaller. However, null RX descriptors are skipped without receiving data, while RDH is incremented just the same. QEMU tries to prevent an infinite loop (processing only null RX descriptors) by detecting whether RDH assumes its original value during the loop. (Again, wrapping from RDLEN to 0 is normal.) What both directions miss is that the guest could program TDLEN and RDLEN so low, and the initial TDH and RDH so high, that these registers will immediately be truncated to zero, and then never reassume their initial values in the loop -- a full wraparound will never occur. The condition that expresses this is: xdh_start >= s->mac_reg[XDLEN] / sizeof(desc) i.e., TDH or RDH start out after the last whole rx or tx descriptor that fits into the TDLEN or RDLEN sized area. This condition could be checked before we enter the loops, but pci_dma_read() / pci_dma_write() knows how to fill in buffers safely for bogus DMA addresses, so we just extend the existing failsafes with the above condition. This is CVE-2016-1981. Cc: "Michael S. Tsirkin" <mst@redhat.com> Cc: Petr Matousek <pmatouse@redhat.com> Cc: Stefano Stabellini <stefano.stabellini@eu.citrix.com> Cc: Prasad Pandit <ppandit@redhat.com> Cc: Michael Roth <mdroth@linux.vnet.ibm.com> Cc: Jason Wang <jasowang@redhat.com> Cc: qemu-stable@nongnu.org RHBZ: https://bugzilla.redhat.com/show_bug.cgi?id=1296044 Signed-off-by: Laszlo Ersek <lersek@redhat.com> Reviewed-by: Jason Wang <jasowang@redhat.com> Signed-off-by: Jason Wang <jasowang@redhat.com>
2016-01-19 14:17:20 +01:00
if (s->mac_reg[RDH] == rdh_start ||
rdh_start >= s->mac_reg[RDLEN] / sizeof(desc)) {
DBGOUT(RXERR, "RDH wraparound @%x, RDT %x, RDLEN %x\n",
rdh_start, s->mac_reg[RDT], s->mac_reg[RDLEN]);
set_ics(s, 0, E1000_ICS_RXO);
return -1;
}
} while (desc_offset < total_size);
e1000x_update_rx_total_stats(s->mac_reg, size, total_size);
n = E1000_ICS_RXT0;
if ((rdt = s->mac_reg[RDT]) < s->mac_reg[RDH])
rdt += s->mac_reg[RDLEN] / sizeof(desc);
if (((rdt - s->mac_reg[RDH]) * sizeof(desc)) <= s->mac_reg[RDLEN] >>
s->rxbuf_min_shift)
n |= E1000_ICS_RXDMT0;
set_ics(s, 0, n);
return size;
}
static ssize_t
e1000_receive(NetClientState *nc, const uint8_t *buf, size_t size)
{
const struct iovec iov = {
.iov_base = (uint8_t *)buf,
.iov_len = size
};
return e1000_receive_iov(nc, &iov, 1);
}
static uint32_t
mac_readreg(E1000State *s, int index)
{
return s->mac_reg[index];
}
static uint32_t
mac_low4_read(E1000State *s, int index)
{
return s->mac_reg[index] & 0xf;
}
static uint32_t
mac_low11_read(E1000State *s, int index)
{
return s->mac_reg[index] & 0x7ff;
}
static uint32_t
mac_low13_read(E1000State *s, int index)
{
return s->mac_reg[index] & 0x1fff;
}
static uint32_t
mac_low16_read(E1000State *s, int index)
{
return s->mac_reg[index] & 0xffff;
}
static uint32_t
mac_icr_read(E1000State *s, int index)
{
uint32_t ret = s->mac_reg[ICR];
DBGOUT(INTERRUPT, "ICR read: %x\n", ret);
set_interrupt_cause(s, 0, 0);
return ret;
}
static uint32_t
mac_read_clr4(E1000State *s, int index)
{
uint32_t ret = s->mac_reg[index];
s->mac_reg[index] = 0;
return ret;
}
static uint32_t
mac_read_clr8(E1000State *s, int index)
{
uint32_t ret = s->mac_reg[index];
s->mac_reg[index] = 0;
s->mac_reg[index-1] = 0;
return ret;
}
static void
mac_writereg(E1000State *s, int index, uint32_t val)
{
uint32_t macaddr[2];
s->mac_reg[index] = val;
if (index == RA + 1) {
macaddr[0] = cpu_to_le32(s->mac_reg[RA]);
macaddr[1] = cpu_to_le32(s->mac_reg[RA + 1]);
qemu_format_nic_info_str(qemu_get_queue(s->nic), (uint8_t *)macaddr);
}
}
static void
set_rdt(E1000State *s, int index, uint32_t val)
{
s->mac_reg[index] = val & 0xffff;
if (e1000_has_rxbufs(s, 1)) {
qemu_flush_queued_packets(qemu_get_queue(s->nic));
}
}
static void
set_16bit(E1000State *s, int index, uint32_t val)
{
s->mac_reg[index] = val & 0xffff;
}
static void
set_dlen(E1000State *s, int index, uint32_t val)
{
s->mac_reg[index] = val & 0xfff80;
}
static void
set_tctl(E1000State *s, int index, uint32_t val)
{
s->mac_reg[index] = val;
s->mac_reg[TDT] &= 0xffff;
start_xmit(s);
}
static void
set_icr(E1000State *s, int index, uint32_t val)
{
DBGOUT(INTERRUPT, "set_icr %x\n", val);
set_interrupt_cause(s, 0, s->mac_reg[ICR] & ~val);
}
static void
set_imc(E1000State *s, int index, uint32_t val)
{
s->mac_reg[IMS] &= ~val;
set_ics(s, 0, 0);
}
static void
set_ims(E1000State *s, int index, uint32_t val)
{
s->mac_reg[IMS] |= val;
set_ics(s, 0, 0);
}
#define getreg(x) [x] = mac_readreg
static uint32_t (*macreg_readops[])(E1000State *, int) = {
getreg(PBA), getreg(RCTL), getreg(TDH), getreg(TXDCTL),
getreg(WUFC), getreg(TDT), getreg(CTRL), getreg(LEDCTL),
getreg(MANC), getreg(MDIC), getreg(SWSM), getreg(STATUS),
getreg(TORL), getreg(TOTL), getreg(IMS), getreg(TCTL),
getreg(RDH), getreg(RDT), getreg(VET), getreg(ICS),
getreg(TDBAL), getreg(TDBAH), getreg(RDBAH), getreg(RDBAL),
getreg(TDLEN), getreg(RDLEN), getreg(RDTR), getreg(RADV),
getreg(TADV), getreg(ITR), getreg(FCRUC), getreg(IPAV),
getreg(WUC), getreg(WUS), getreg(SCC), getreg(ECOL),
getreg(MCC), getreg(LATECOL), getreg(COLC), getreg(DC),
getreg(TNCRS), getreg(SEQEC), getreg(CEXTERR), getreg(RLEC),
getreg(XONRXC), getreg(XONTXC), getreg(XOFFRXC), getreg(XOFFTXC),
getreg(RFC), getreg(RJC), getreg(RNBC), getreg(TSCTFC),
getreg(MGTPRC), getreg(MGTPDC), getreg(MGTPTC), getreg(GORCL),
getreg(GOTCL),
[TOTH] = mac_read_clr8, [TORH] = mac_read_clr8,
[GOTCH] = mac_read_clr8, [GORCH] = mac_read_clr8,
[PRC64] = mac_read_clr4, [PRC127] = mac_read_clr4,
[PRC255] = mac_read_clr4, [PRC511] = mac_read_clr4,
[PRC1023] = mac_read_clr4, [PRC1522] = mac_read_clr4,
[PTC64] = mac_read_clr4, [PTC127] = mac_read_clr4,
[PTC255] = mac_read_clr4, [PTC511] = mac_read_clr4,
[PTC1023] = mac_read_clr4, [PTC1522] = mac_read_clr4,
[GPRC] = mac_read_clr4, [GPTC] = mac_read_clr4,
[TPT] = mac_read_clr4, [TPR] = mac_read_clr4,
[RUC] = mac_read_clr4, [ROC] = mac_read_clr4,
[BPRC] = mac_read_clr4, [MPRC] = mac_read_clr4,
[TSCTC] = mac_read_clr4, [BPTC] = mac_read_clr4,
[MPTC] = mac_read_clr4,
[ICR] = mac_icr_read, [EECD] = get_eecd,
[EERD] = flash_eerd_read,
[RDFH] = mac_low13_read, [RDFT] = mac_low13_read,
[RDFHS] = mac_low13_read, [RDFTS] = mac_low13_read,
[RDFPC] = mac_low13_read,
[TDFH] = mac_low11_read, [TDFT] = mac_low11_read,
[TDFHS] = mac_low13_read, [TDFTS] = mac_low13_read,
[TDFPC] = mac_low13_read,
[AIT] = mac_low16_read,
[CRCERRS ... MPC] = &mac_readreg,
[IP6AT ... IP6AT+3] = &mac_readreg, [IP4AT ... IP4AT+6] = &mac_readreg,
[FFLT ... FFLT+6] = &mac_low11_read,
[RA ... RA+31] = &mac_readreg,
[WUPM ... WUPM+31] = &mac_readreg,
[MTA ... MTA+127] = &mac_readreg,
[VFTA ... VFTA+127] = &mac_readreg,
[FFMT ... FFMT+254] = &mac_low4_read,
[FFVT ... FFVT+254] = &mac_readreg,
[PBM ... PBM+16383] = &mac_readreg,
};
enum { NREADOPS = ARRAY_SIZE(macreg_readops) };
#define putreg(x) [x] = mac_writereg
static void (*macreg_writeops[])(E1000State *, int, uint32_t) = {
putreg(PBA), putreg(EERD), putreg(SWSM), putreg(WUFC),
putreg(TDBAL), putreg(TDBAH), putreg(TXDCTL), putreg(RDBAH),
putreg(RDBAL), putreg(LEDCTL), putreg(VET), putreg(FCRUC),
putreg(TDFH), putreg(TDFT), putreg(TDFHS), putreg(TDFTS),
putreg(TDFPC), putreg(RDFH), putreg(RDFT), putreg(RDFHS),
putreg(RDFTS), putreg(RDFPC), putreg(IPAV), putreg(WUC),
putreg(WUS), putreg(AIT),
[TDLEN] = set_dlen, [RDLEN] = set_dlen, [TCTL] = set_tctl,
[TDT] = set_tctl, [MDIC] = set_mdic, [ICS] = set_ics,
[TDH] = set_16bit, [RDH] = set_16bit, [RDT] = set_rdt,
[IMC] = set_imc, [IMS] = set_ims, [ICR] = set_icr,
[EECD] = set_eecd, [RCTL] = set_rx_control, [CTRL] = set_ctrl,
[RDTR] = set_16bit, [RADV] = set_16bit, [TADV] = set_16bit,
[ITR] = set_16bit,
[IP6AT ... IP6AT+3] = &mac_writereg, [IP4AT ... IP4AT+6] = &mac_writereg,
[FFLT ... FFLT+6] = &mac_writereg,
[RA ... RA+31] = &mac_writereg,
[WUPM ... WUPM+31] = &mac_writereg,
[MTA ... MTA+127] = &mac_writereg,
[VFTA ... VFTA+127] = &mac_writereg,
[FFMT ... FFMT+254] = &mac_writereg, [FFVT ... FFVT+254] = &mac_writereg,
[PBM ... PBM+16383] = &mac_writereg,
};
enum { NWRITEOPS = ARRAY_SIZE(macreg_writeops) };
enum { MAC_ACCESS_PARTIAL = 1, MAC_ACCESS_FLAG_NEEDED = 2 };
#define markflag(x) ((E1000_FLAG_##x << 2) | MAC_ACCESS_FLAG_NEEDED)
/* In the array below the meaning of the bits is: [f|f|f|f|f|f|n|p]
* f - flag bits (up to 6 possible flags)
* n - flag needed
* p - partially implenented */
static const uint8_t mac_reg_access[0x8000] = {
[RDTR] = markflag(MIT), [TADV] = markflag(MIT),
[RADV] = markflag(MIT), [ITR] = markflag(MIT),
[IPAV] = markflag(MAC), [WUC] = markflag(MAC),
[IP6AT] = markflag(MAC), [IP4AT] = markflag(MAC),
[FFVT] = markflag(MAC), [WUPM] = markflag(MAC),
[ECOL] = markflag(MAC), [MCC] = markflag(MAC),
[DC] = markflag(MAC), [TNCRS] = markflag(MAC),
[RLEC] = markflag(MAC), [XONRXC] = markflag(MAC),
[XOFFTXC] = markflag(MAC), [RFC] = markflag(MAC),
[TSCTFC] = markflag(MAC), [MGTPRC] = markflag(MAC),
[WUS] = markflag(MAC), [AIT] = markflag(MAC),
[FFLT] = markflag(MAC), [FFMT] = markflag(MAC),
[SCC] = markflag(MAC), [FCRUC] = markflag(MAC),
[LATECOL] = markflag(MAC), [COLC] = markflag(MAC),
[SEQEC] = markflag(MAC), [CEXTERR] = markflag(MAC),
[XONTXC] = markflag(MAC), [XOFFRXC] = markflag(MAC),
[RJC] = markflag(MAC), [RNBC] = markflag(MAC),
[MGTPDC] = markflag(MAC), [MGTPTC] = markflag(MAC),
[RUC] = markflag(MAC), [ROC] = markflag(MAC),
[GORCL] = markflag(MAC), [GORCH] = markflag(MAC),
[GOTCL] = markflag(MAC), [GOTCH] = markflag(MAC),
[BPRC] = markflag(MAC), [MPRC] = markflag(MAC),
[TSCTC] = markflag(MAC), [PRC64] = markflag(MAC),
[PRC127] = markflag(MAC), [PRC255] = markflag(MAC),
[PRC511] = markflag(MAC), [PRC1023] = markflag(MAC),
[PRC1522] = markflag(MAC), [PTC64] = markflag(MAC),
[PTC127] = markflag(MAC), [PTC255] = markflag(MAC),
[PTC511] = markflag(MAC), [PTC1023] = markflag(MAC),
[PTC1522] = markflag(MAC), [MPTC] = markflag(MAC),
[BPTC] = markflag(MAC),
[TDFH] = markflag(MAC) | MAC_ACCESS_PARTIAL,
[TDFT] = markflag(MAC) | MAC_ACCESS_PARTIAL,
[TDFHS] = markflag(MAC) | MAC_ACCESS_PARTIAL,
[TDFTS] = markflag(MAC) | MAC_ACCESS_PARTIAL,
[TDFPC] = markflag(MAC) | MAC_ACCESS_PARTIAL,
[RDFH] = markflag(MAC) | MAC_ACCESS_PARTIAL,
[RDFT] = markflag(MAC) | MAC_ACCESS_PARTIAL,
[RDFHS] = markflag(MAC) | MAC_ACCESS_PARTIAL,
[RDFTS] = markflag(MAC) | MAC_ACCESS_PARTIAL,
[RDFPC] = markflag(MAC) | MAC_ACCESS_PARTIAL,
[PBM] = markflag(MAC) | MAC_ACCESS_PARTIAL,
};
static void
e1000_mmio_write(void *opaque, hwaddr addr, uint64_t val,
unsigned size)
{
E1000State *s = opaque;
unsigned int index = (addr & 0x1ffff) >> 2;
if (index < NWRITEOPS && macreg_writeops[index]) {
if (!(mac_reg_access[index] & MAC_ACCESS_FLAG_NEEDED)
|| (s->compat_flags & (mac_reg_access[index] >> 2))) {
if (mac_reg_access[index] & MAC_ACCESS_PARTIAL) {
DBGOUT(GENERAL, "Writing to register at offset: 0x%08x. "
"It is not fully implemented.\n", index<<2);
}
macreg_writeops[index](s, index, val);
} else { /* "flag needed" bit is set, but the flag is not active */
DBGOUT(MMIO, "MMIO write attempt to disabled reg. addr=0x%08x\n",
index<<2);
}
} else if (index < NREADOPS && macreg_readops[index]) {
DBGOUT(MMIO, "e1000_mmio_writel RO %x: 0x%04"PRIx64"\n",
index<<2, val);
} else {
DBGOUT(UNKNOWN, "MMIO unknown write addr=0x%08x,val=0x%08"PRIx64"\n",
index<<2, val);
}
}
static uint64_t
e1000_mmio_read(void *opaque, hwaddr addr, unsigned size)
{
E1000State *s = opaque;
unsigned int index = (addr & 0x1ffff) >> 2;
if (index < NREADOPS && macreg_readops[index]) {
if (!(mac_reg_access[index] & MAC_ACCESS_FLAG_NEEDED)
|| (s->compat_flags & (mac_reg_access[index] >> 2))) {
if (mac_reg_access[index] & MAC_ACCESS_PARTIAL) {
DBGOUT(GENERAL, "Reading register at offset: 0x%08x. "
"It is not fully implemented.\n", index<<2);
}
return macreg_readops[index](s, index);
} else { /* "flag needed" bit is set, but the flag is not active */
DBGOUT(MMIO, "MMIO read attempt of disabled reg. addr=0x%08x\n",
index<<2);
}
} else {
DBGOUT(UNKNOWN, "MMIO unknown read addr=0x%08x\n", index<<2);
}
return 0;
}
static const MemoryRegionOps e1000_mmio_ops = {
.read = e1000_mmio_read,
.write = e1000_mmio_write,
.endianness = DEVICE_LITTLE_ENDIAN,
.impl = {
.min_access_size = 4,
.max_access_size = 4,
},
};
static uint64_t e1000_io_read(void *opaque, hwaddr addr,
unsigned size)
{
E1000State *s = opaque;
(void)s;
return 0;
}
static void e1000_io_write(void *opaque, hwaddr addr,
uint64_t val, unsigned size)
{
E1000State *s = opaque;
(void)s;
}
static const MemoryRegionOps e1000_io_ops = {
.read = e1000_io_read,
.write = e1000_io_write,
.endianness = DEVICE_LITTLE_ENDIAN,
};
static bool is_version_1(void *opaque, int version_id)
{
return version_id == 1;
}
static int e1000_pre_save(void *opaque)
{
E1000State *s = opaque;
NetClientState *nc = qemu_get_queue(s->nic);
/* If the mitigation timer is active, emulate a timeout now. */
if (s->mit_timer_on) {
e1000_mit_timer(s);
}
/*
* If link is down and auto-negotiation is supported and ongoing,
* complete auto-negotiation immediately. This allows us to look
* at MII_SR_AUTONEG_COMPLETE to infer link status on load.
*/
if (nc->link_down && have_autoneg(s)) {
s->phy_reg[PHY_STATUS] |= MII_SR_AUTONEG_COMPLETE;
}
return 0;
}
static int e1000_post_load(void *opaque, int version_id)
{
E1000State *s = opaque;
NetClientState *nc = qemu_get_queue(s->nic);
if (!chkflag(MIT)) {
s->mac_reg[ITR] = s->mac_reg[RDTR] = s->mac_reg[RADV] =
s->mac_reg[TADV] = 0;
s->mit_irq_level = false;
}
s->mit_ide = 0;
s->mit_timer_on = false;
/* nc.link_down can't be migrated, so infer link_down according
* to link status bit in mac_reg[STATUS].
* Alternatively, restart link negotiation if it was in progress. */
nc->link_down = (s->mac_reg[STATUS] & E1000_STATUS_LU) == 0;
if (have_autoneg(s) &&
!(s->phy_reg[PHY_STATUS] & MII_SR_AUTONEG_COMPLETE)) {
nc->link_down = false;
timer_mod(s->autoneg_timer,
qemu_clock_get_ms(QEMU_CLOCK_VIRTUAL) + 500);
}
return 0;
}
static bool e1000_mit_state_needed(void *opaque)
{
E1000State *s = opaque;
return chkflag(MIT);
}
static bool e1000_full_mac_needed(void *opaque)
{
E1000State *s = opaque;
return chkflag(MAC);
}
static const VMStateDescription vmstate_e1000_mit_state = {
.name = "e1000/mit_state",
.version_id = 1,
.minimum_version_id = 1,
.needed = e1000_mit_state_needed,
.fields = (VMStateField[]) {
VMSTATE_UINT32(mac_reg[RDTR], E1000State),
VMSTATE_UINT32(mac_reg[RADV], E1000State),
VMSTATE_UINT32(mac_reg[TADV], E1000State),
VMSTATE_UINT32(mac_reg[ITR], E1000State),
VMSTATE_BOOL(mit_irq_level, E1000State),
VMSTATE_END_OF_LIST()
}
};
static const VMStateDescription vmstate_e1000_full_mac_state = {
.name = "e1000/full_mac_state",
.version_id = 1,
.minimum_version_id = 1,
.needed = e1000_full_mac_needed,
.fields = (VMStateField[]) {
VMSTATE_UINT32_ARRAY(mac_reg, E1000State, 0x8000),
VMSTATE_END_OF_LIST()
}
};
static const VMStateDescription vmstate_e1000 = {
.name = "e1000",
e1000: Separate TSO and non-TSO contexts, fixing UDP TX corruption The device is supposed to maintain two distinct contexts for transmit offloads: one has parameters for both segmentation and checksum offload, the other only for checksum offload. The guest driver can send two context descriptors, one for each context (the TSE flag specifies which). Then the guest can refer to one or the other context in subsequent transmit data descriptors, depending on what offloads it wants applied to each packet. Currently the e1000 device stores just one context, and misinterprets the TSE flags in the context and data descriptors. This is often okay: Linux happens to send a fresh context descriptor before every data descriptor, so forgetting the other context doesn't matter. Windows does rely on separate contexts for TSO vs. non-TSO packets, but for mostly-TCP traffic the two contexts have identical TCP-specific offload parameters so confusing them doesn't matter. One case where this confusion matters is when a Windows guest sets up a TSO context for TCP and a non-TSO context for UDP, and then transmits both TCP and UDP traffic in parallel. The e1000 device sometimes ends up using TCP-specific parameters while doing checksum offload on a UDP datagram: it writes the checksum to offset 16 (the correct location for a TCP checksum), stomping on two bytes of UDP data, and leaving the wrong value in the actual UDP checksum field at offset 6. (Even worse, the host network stack may then recompute the UDP checksum, "correcting" it to match the corrupt data before sending it out a physical interface.) Correct this by tracking the TSO context independently of the non-TSO context, and selecting the appropriate context based on the TSE flag in each transmit data descriptor. Signed-off-by: Ed Swierk <eswierk@skyportsystems.com> Signed-off-by: Jason Wang <jasowang@redhat.com>
2017-11-15 00:23:34 +01:00
.version_id = 3,
.minimum_version_id = 1,
.pre_save = e1000_pre_save,
.post_load = e1000_post_load,
.fields = (VMStateField[]) {
VMSTATE_PCI_DEVICE(parent_obj, E1000State),
VMSTATE_UNUSED_TEST(is_version_1, 4), /* was instance id */
VMSTATE_UNUSED(4), /* Was mmio_base. */
VMSTATE_UINT32(rxbuf_size, E1000State),
VMSTATE_UINT32(rxbuf_min_shift, E1000State),
VMSTATE_UINT32(eecd_state.val_in, E1000State),
VMSTATE_UINT16(eecd_state.bitnum_in, E1000State),
VMSTATE_UINT16(eecd_state.bitnum_out, E1000State),
VMSTATE_UINT16(eecd_state.reading, E1000State),
VMSTATE_UINT32(eecd_state.old_eecd, E1000State),
VMSTATE_UINT8(tx.props.ipcss, E1000State),
VMSTATE_UINT8(tx.props.ipcso, E1000State),
VMSTATE_UINT16(tx.props.ipcse, E1000State),
VMSTATE_UINT8(tx.props.tucss, E1000State),
VMSTATE_UINT8(tx.props.tucso, E1000State),
VMSTATE_UINT16(tx.props.tucse, E1000State),
VMSTATE_UINT32(tx.props.paylen, E1000State),
VMSTATE_UINT8(tx.props.hdr_len, E1000State),
VMSTATE_UINT16(tx.props.mss, E1000State),
VMSTATE_UINT16(tx.size, E1000State),
VMSTATE_UINT16(tx.tso_frames, E1000State),
VMSTATE_UINT8(tx.sum_needed, E1000State),
VMSTATE_INT8(tx.props.ip, E1000State),
VMSTATE_INT8(tx.props.tcp, E1000State),
VMSTATE_BUFFER(tx.header, E1000State),
VMSTATE_BUFFER(tx.data, E1000State),
VMSTATE_UINT16_ARRAY(eeprom_data, E1000State, 64),
VMSTATE_UINT16_ARRAY(phy_reg, E1000State, 0x20),
VMSTATE_UINT32(mac_reg[CTRL], E1000State),
VMSTATE_UINT32(mac_reg[EECD], E1000State),
VMSTATE_UINT32(mac_reg[EERD], E1000State),
VMSTATE_UINT32(mac_reg[GPRC], E1000State),
VMSTATE_UINT32(mac_reg[GPTC], E1000State),
VMSTATE_UINT32(mac_reg[ICR], E1000State),
VMSTATE_UINT32(mac_reg[ICS], E1000State),
VMSTATE_UINT32(mac_reg[IMC], E1000State),
VMSTATE_UINT32(mac_reg[IMS], E1000State),
VMSTATE_UINT32(mac_reg[LEDCTL], E1000State),
VMSTATE_UINT32(mac_reg[MANC], E1000State),
VMSTATE_UINT32(mac_reg[MDIC], E1000State),
VMSTATE_UINT32(mac_reg[MPC], E1000State),
VMSTATE_UINT32(mac_reg[PBA], E1000State),
VMSTATE_UINT32(mac_reg[RCTL], E1000State),
VMSTATE_UINT32(mac_reg[RDBAH], E1000State),
VMSTATE_UINT32(mac_reg[RDBAL], E1000State),
VMSTATE_UINT32(mac_reg[RDH], E1000State),
VMSTATE_UINT32(mac_reg[RDLEN], E1000State),
VMSTATE_UINT32(mac_reg[RDT], E1000State),
VMSTATE_UINT32(mac_reg[STATUS], E1000State),
VMSTATE_UINT32(mac_reg[SWSM], E1000State),
VMSTATE_UINT32(mac_reg[TCTL], E1000State),
VMSTATE_UINT32(mac_reg[TDBAH], E1000State),
VMSTATE_UINT32(mac_reg[TDBAL], E1000State),
VMSTATE_UINT32(mac_reg[TDH], E1000State),
VMSTATE_UINT32(mac_reg[TDLEN], E1000State),
VMSTATE_UINT32(mac_reg[TDT], E1000State),
VMSTATE_UINT32(mac_reg[TORH], E1000State),
VMSTATE_UINT32(mac_reg[TORL], E1000State),
VMSTATE_UINT32(mac_reg[TOTH], E1000State),
VMSTATE_UINT32(mac_reg[TOTL], E1000State),
VMSTATE_UINT32(mac_reg[TPR], E1000State),
VMSTATE_UINT32(mac_reg[TPT], E1000State),
VMSTATE_UINT32(mac_reg[TXDCTL], E1000State),
VMSTATE_UINT32(mac_reg[WUFC], E1000State),
VMSTATE_UINT32(mac_reg[VET], E1000State),
VMSTATE_UINT32_SUB_ARRAY(mac_reg, E1000State, RA, 32),
VMSTATE_UINT32_SUB_ARRAY(mac_reg, E1000State, MTA, 128),
VMSTATE_UINT32_SUB_ARRAY(mac_reg, E1000State, VFTA, 128),
e1000: Separate TSO and non-TSO contexts, fixing UDP TX corruption The device is supposed to maintain two distinct contexts for transmit offloads: one has parameters for both segmentation and checksum offload, the other only for checksum offload. The guest driver can send two context descriptors, one for each context (the TSE flag specifies which). Then the guest can refer to one or the other context in subsequent transmit data descriptors, depending on what offloads it wants applied to each packet. Currently the e1000 device stores just one context, and misinterprets the TSE flags in the context and data descriptors. This is often okay: Linux happens to send a fresh context descriptor before every data descriptor, so forgetting the other context doesn't matter. Windows does rely on separate contexts for TSO vs. non-TSO packets, but for mostly-TCP traffic the two contexts have identical TCP-specific offload parameters so confusing them doesn't matter. One case where this confusion matters is when a Windows guest sets up a TSO context for TCP and a non-TSO context for UDP, and then transmits both TCP and UDP traffic in parallel. The e1000 device sometimes ends up using TCP-specific parameters while doing checksum offload on a UDP datagram: it writes the checksum to offset 16 (the correct location for a TCP checksum), stomping on two bytes of UDP data, and leaving the wrong value in the actual UDP checksum field at offset 6. (Even worse, the host network stack may then recompute the UDP checksum, "correcting" it to match the corrupt data before sending it out a physical interface.) Correct this by tracking the TSO context independently of the non-TSO context, and selecting the appropriate context based on the TSE flag in each transmit data descriptor. Signed-off-by: Ed Swierk <eswierk@skyportsystems.com> Signed-off-by: Jason Wang <jasowang@redhat.com>
2017-11-15 00:23:34 +01:00
VMSTATE_UINT8_V(tx.tso_props.ipcss, E1000State, 3),
VMSTATE_UINT8_V(tx.tso_props.ipcso, E1000State, 3),
VMSTATE_UINT16_V(tx.tso_props.ipcse, E1000State, 3),
VMSTATE_UINT8_V(tx.tso_props.tucss, E1000State, 3),
VMSTATE_UINT8_V(tx.tso_props.tucso, E1000State, 3),
VMSTATE_UINT16_V(tx.tso_props.tucse, E1000State, 3),
VMSTATE_UINT32_V(tx.tso_props.paylen, E1000State, 3),
VMSTATE_UINT8_V(tx.tso_props.hdr_len, E1000State, 3),
VMSTATE_UINT16_V(tx.tso_props.mss, E1000State, 3),
VMSTATE_INT8_V(tx.tso_props.ip, E1000State, 3),
VMSTATE_INT8_V(tx.tso_props.tcp, E1000State, 3),
VMSTATE_END_OF_LIST()
},
.subsections = (const VMStateDescription*[]) {
&vmstate_e1000_mit_state,
&vmstate_e1000_full_mac_state,
NULL
}
};
/*
* EEPROM contents documented in Tables 5-2 and 5-3, pp. 98-102.
* Note: A valid DevId will be inserted during pci_e1000_init().
*/
static const uint16_t e1000_eeprom_template[64] = {
0x0000, 0x0000, 0x0000, 0x0000, 0xffff, 0x0000, 0x0000, 0x0000,
0x3000, 0x1000, 0x6403, 0 /*DevId*/, 0x8086, 0 /*DevId*/, 0x8086, 0x3040,
0x0008, 0x2000, 0x7e14, 0x0048, 0x1000, 0x00d8, 0x0000, 0x2700,
0x6cc9, 0x3150, 0x0722, 0x040b, 0x0984, 0x0000, 0xc000, 0x0706,
0x1008, 0x0000, 0x0f04, 0x7fff, 0x4d01, 0xffff, 0xffff, 0xffff,
0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff,
0x0100, 0x4000, 0x121c, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff,
0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0x0000,
};
/* PCI interface */
static void
e1000_mmio_setup(E1000State *d)
{
int i;
const uint32_t excluded_regs[] = {
E1000_MDIC, E1000_ICR, E1000_ICS, E1000_IMS,
E1000_IMC, E1000_TCTL, E1000_TDT, PNPMMIO_SIZE
};
memory_region_init_io(&d->mmio, OBJECT(d), &e1000_mmio_ops, d,
"e1000-mmio", PNPMMIO_SIZE);
memory_region_add_coalescing(&d->mmio, 0, excluded_regs[0]);
for (i = 0; excluded_regs[i] != PNPMMIO_SIZE; i++)
memory_region_add_coalescing(&d->mmio, excluded_regs[i] + 4,
excluded_regs[i+1] - excluded_regs[i] - 4);
memory_region_init_io(&d->io, OBJECT(d), &e1000_io_ops, d, "e1000-io", IOPORT_SIZE);
}
static void
pci_e1000_uninit(PCIDevice *dev)
{
E1000State *d = E1000(dev);
timer_del(d->autoneg_timer);
timer_free(d->autoneg_timer);
timer_del(d->mit_timer);
timer_free(d->mit_timer);
qemu_del_nic(d->nic);
}
static NetClientInfo net_e1000_info = {
qapi: Change Netdev into a flat union This is a mostly-mechanical conversion that creates a new flat union 'Netdev' QAPI type that covers all the branches of the former 'NetClientOptions' simple union, where the branches are now listed in a new 'NetClientDriver' enum rather than generated from the simple union. The existence of a flat union has no change to the command line syntax accepted for new code, and will make it possible for a future patch to switch the QMP command to parse a boxed union for no change to valid QMP; but it does have some ripple effect on the C code when dealing with the new types. While making the conversion, note that the 'NetLegacy' type remains unchanged: it applies only to legacy command line options, and will not be ported to QMP, so it should remain a wrapper around a simple union; to avoid confusion, the type named 'NetClientOptions' is now gone, and we introduce 'NetLegacyOptions' in its place. Then, in the C code, we convert from NetLegacy to Netdev as soon as possible, so that the bulk of the net stack only has to deal with one QAPI type, not two. Note that since the old legacy code always rejected 'hubport', we can just omit that branch from the new 'NetLegacyOptions' simple union. Based on an idea originally by Zoltán Kővágó <DirtY.iCE.hu@gmail.com>: Message-Id: <01a527fbf1a5de880091f98cf011616a78adeeee.1441627176.git.DirtY.iCE.hu@gmail.com> although the sed script in that patch no longer applies due to other changes in the tree since then, and I also did some manual cleanups (such as fixing whitespace to keep checkpatch happy). Signed-off-by: Eric Blake <eblake@redhat.com> Message-Id: <1468468228-27827-13-git-send-email-eblake@redhat.com> Reviewed-by: Markus Armbruster <armbru@redhat.com> [Fixup from Eric squashed in] Signed-off-by: Markus Armbruster <armbru@redhat.com>
2016-07-14 05:50:23 +02:00
.type = NET_CLIENT_DRIVER_NIC,
.size = sizeof(NICState),
.can_receive = e1000_can_receive,
.receive = e1000_receive,
.receive_iov = e1000_receive_iov,
.link_status_changed = e1000_set_link_status,
};
static void e1000_write_config(PCIDevice *pci_dev, uint32_t address,
uint32_t val, int len)
{
E1000State *s = E1000(pci_dev);
pci_default_write_config(pci_dev, address, val, len);
if (range_covers_byte(address, len, PCI_COMMAND) &&
(pci_dev->config[PCI_COMMAND] & PCI_COMMAND_MASTER)) {
qemu_flush_queued_packets(qemu_get_queue(s->nic));
}
}
static void pci_e1000_realize(PCIDevice *pci_dev, Error **errp)
{
DeviceState *dev = DEVICE(pci_dev);
E1000State *d = E1000(pci_dev);
uint8_t *pci_conf;
uint8_t *macaddr;
pci_dev->config_write = e1000_write_config;
pci_conf = pci_dev->config;
/* TODO: RST# value should be 0, PCI spec 6.2.4 */
pci_conf[PCI_CACHE_LINE_SIZE] = 0x10;
pci_conf[PCI_INTERRUPT_PIN] = 1; /* interrupt pin A */
e1000_mmio_setup(d);
pci_register_bar(pci_dev, 0, PCI_BASE_ADDRESS_SPACE_MEMORY, &d->mmio);
pci_register_bar(pci_dev, 1, PCI_BASE_ADDRESS_SPACE_IO, &d->io);
qemu_macaddr_default_if_unset(&d->conf.macaddr);
macaddr = d->conf.macaddr.a;
e1000x_core_prepare_eeprom(d->eeprom_data,
e1000_eeprom_template,
sizeof(e1000_eeprom_template),
PCI_DEVICE_GET_CLASS(pci_dev)->device_id,
macaddr);
d->nic = qemu_new_nic(&net_e1000_info, &d->conf,
object_get_typename(OBJECT(d)), dev->id, d);
qemu_format_nic_info_str(qemu_get_queue(d->nic), macaddr);
d->autoneg_timer = timer_new_ms(QEMU_CLOCK_VIRTUAL, e1000_autoneg_timer, d);
d->mit_timer = timer_new_ns(QEMU_CLOCK_VIRTUAL, e1000_mit_timer, d);
}
static void qdev_e1000_reset(DeviceState *dev)
{
E1000State *d = E1000(dev);
e1000_reset(d);
}
static Property e1000_properties[] = {
DEFINE_NIC_PROPERTIES(E1000State, conf),
DEFINE_PROP_BIT("autonegotiation", E1000State,
compat_flags, E1000_FLAG_AUTONEG_BIT, true),
DEFINE_PROP_BIT("mitigation", E1000State,
compat_flags, E1000_FLAG_MIT_BIT, true),
DEFINE_PROP_BIT("extra_mac_registers", E1000State,
compat_flags, E1000_FLAG_MAC_BIT, true),
DEFINE_PROP_END_OF_LIST(),
};
typedef struct E1000Info {
const char *name;
uint16_t device_id;
uint8_t revision;
uint16_t phy_id2;
} E1000Info;
static void e1000_class_init(ObjectClass *klass, void *data)
{
DeviceClass *dc = DEVICE_CLASS(klass);
PCIDeviceClass *k = PCI_DEVICE_CLASS(klass);
E1000BaseClass *e = E1000_DEVICE_CLASS(klass);
const E1000Info *info = data;
k->realize = pci_e1000_realize;
k->exit = pci_e1000_uninit;
k->romfile = "efi-e1000.rom";
k->vendor_id = PCI_VENDOR_ID_INTEL;
k->device_id = info->device_id;
k->revision = info->revision;
e->phy_id2 = info->phy_id2;
k->class_id = PCI_CLASS_NETWORK_ETHERNET;
set_bit(DEVICE_CATEGORY_NETWORK, dc->categories);
dc->desc = "Intel Gigabit Ethernet";
dc->reset = qdev_e1000_reset;
dc->vmsd = &vmstate_e1000;
dc->props = e1000_properties;
}
static void e1000_instance_init(Object *obj)
{
E1000State *n = E1000(obj);
device_add_bootindex_property(obj, &n->conf.bootindex,
"bootindex", "/ethernet-phy@0",
DEVICE(n), NULL);
}
static const TypeInfo e1000_base_info = {
.name = TYPE_E1000_BASE,
.parent = TYPE_PCI_DEVICE,
.instance_size = sizeof(E1000State),
.instance_init = e1000_instance_init,
.class_size = sizeof(E1000BaseClass),
.abstract = true,
pci: Add INTERFACE_CONVENTIONAL_PCI_DEVICE to Conventional PCI devices Add INTERFACE_CONVENTIONAL_PCI_DEVICE to all direct subtypes of TYPE_PCI_DEVICE, except: 1) The ones that already have INTERFACE_PCIE_DEVICE set: * base-xhci * e1000e * nvme * pvscsi * vfio-pci * virtio-pci * vmxnet3 2) base-pci-bridge Not all PCI bridges are Conventional PCI devices, so INTERFACE_CONVENTIONAL_PCI_DEVICE is added only to the subtypes that are actually Conventional PCI: * dec-21154-p2p-bridge * i82801b11-bridge * pbm-bridge * pci-bridge The direct subtypes of base-pci-bridge not touched by this patch are: * xilinx-pcie-root: Already marked as PCIe-only. * pcie-pci-bridge: Already marked as PCIe-only. * pcie-port: all non-abstract subtypes of pcie-port are already marked as PCIe-only devices. 3) megasas-base Not all megasas devices are Conventional PCI devices, so the interface names are added to the subclasses registered by megasas_register_types(), according to information in the megasas_devices[] array. "megasas-gen2" already implements INTERFACE_PCIE_DEVICE, so add INTERFACE_CONVENTIONAL_PCI_DEVICE only to "megasas". Acked-by: Alberto Garcia <berto@igalia.com> Acked-by: John Snow <jsnow@redhat.com> Acked-by: Anthony PERARD <anthony.perard@citrix.com> Signed-off-by: Eduardo Habkost <ehabkost@redhat.com> Reviewed-by: David Gibson <david@gibson.dropbear.id.au> Acked-by: David Gibson <david@gibson.dropbear.id.au> Reviewed-by: Marcel Apfelbaum <marcel@redhat.com> Reviewed-by: Michael S. Tsirkin <mst@redhat.com> Signed-off-by: Michael S. Tsirkin <mst@redhat.com>
2017-09-27 21:56:34 +02:00
.interfaces = (InterfaceInfo[]) {
{ INTERFACE_CONVENTIONAL_PCI_DEVICE },
{ },
},
};
static const E1000Info e1000_devices[] = {
{
.name = "e1000",
.device_id = E1000_DEV_ID_82540EM,
.revision = 0x03,
.phy_id2 = E1000_PHY_ID2_8254xx_DEFAULT,
},
{
.name = "e1000-82544gc",
.device_id = E1000_DEV_ID_82544GC_COPPER,
.revision = 0x03,
.phy_id2 = E1000_PHY_ID2_82544x,
},
{
.name = "e1000-82545em",
.device_id = E1000_DEV_ID_82545EM_COPPER,
.revision = 0x03,
.phy_id2 = E1000_PHY_ID2_8254xx_DEFAULT,
},
};
static void e1000_register_types(void)
{
int i;
type_register_static(&e1000_base_info);
for (i = 0; i < ARRAY_SIZE(e1000_devices); i++) {
const E1000Info *info = &e1000_devices[i];
TypeInfo type_info = {};
type_info.name = info->name;
type_info.parent = TYPE_E1000_BASE;
type_info.class_data = (void *)info;
type_info.class_init = e1000_class_init;
type_info.instance_init = e1000_instance_init;
type_register(&type_info);
}
}
type_init(e1000_register_types)