Message ID | 20121227100658.GB48891@onelab2.iet.unipi.it |
---|---|
State | New |
Headers | show |
On Thu, Dec 27, 2012 at 10:06 AM, Luigi Rizzo <rizzo@iet.unipi.it> wrote: > Before submitting a proper patch, I'd like to hear feedback on the > following proposed change to hw/e1000.c to properly implement > interrupt mitigation. > This is joint work with Vincenzo Maffione and Giuseppe Lettieri (in Cc), > and is a followup to a similar patch i posted in july > > https://lists.gnu.org/archive/html/qemu-devel/2012-07/msg03195.html > > > The patch models three of the five (sic!) e1000 register that control > moderation, and uses qemu timers for that (the minimum delay for > these timers affects the fidelity of the emulation). > > Right now there is a static variable that controls whether the > feature is enabled. I would probably like to make it a parameter > accessible from the command line in qemu, possibly extending it to > override the mitigation delay set by the device driver. > > Right now we reached transmit speeds in the order of 2-300Kpps > (if matched with a proper guest device driver optimization, see > https://groups.google.com/forum/?fromgroups=#!topic/mailing.freebsd.emulator/xQHR_hleCuc ) > > Some performance data using a FreeBSD guest, for udp transmissions: > > KVM QEMU > standard KVM, standard driver 20 Kpps 6.3 Kpps > modified KVM, standard driver 37 Kpps 28 Kpps > modified KVM, modified driver 200 Kpps 34 Kpps > > As you can see, on kvm this change gives a 5x speedup to the tx path, > which combines nicely with the 2x speedup that comes from supporting > interrupt mitigation alone in the hypervisor. Without kvm (or kqemu ?) > the benefits are much lower, as the guest becomes too slow. > > cheers > luigi > > diff -urp qemu-1.3.0-orig/hw/e1000.c qemu-1.3.0/hw/e1000.c > --- qemu-1.3.0-orig/hw/e1000.c 2012-12-03 20:37:05.000000000 +0100 > +++ qemu-1.3.0/hw/e1000.c 2012-12-27 09:47:16.000000000 +0100 > @@ -35,6 +35,8 @@ > > #include "e1000_hw.h" > > +static int mit_on = 1; /* interrupt mitigation enable */ > + > #define E1000_DEBUG > > #ifdef E1000_DEBUG > @@ -129,6 +131,9 @@ typedef struct E1000State_st { > } eecd_state; > > QEMUTimer *autoneg_timer; > + QEMUTimer *mit_timer; // handle for the timer > + uint32_t mit_timer_on; // mitigation timer active > + uint32_t mit_cause; // pending interrupt cause > } E1000State; > > #define defreg(x) x = (E1000_##x>>2) > @@ -144,6 +149,7 @@ enum { > defreg(TPR), defreg(TPT), defreg(TXDCTL), defreg(WUFC), > defreg(RA), defreg(MTA), defreg(CRCERRS),defreg(VFTA), > defreg(VET), > + defreg(RDTR), defreg(RADV), defreg(TADV), defreg(ITR), > }; > > static void > @@ -639,6 +645,68 @@ static uint64_t tx_desc_base(E1000State > return (bah << 32) + bal; > } > > +/* helper function, 0 means the value is not set */ > +static inline void > +mit_update_delay(uint32_t *curr, uint32_t value) > +{ > + if (value && (*curr == 0 || value < *curr)) Missing braces, please read CODING_STYLE and check the patches with scripts/checkpatch.pl. Also in several places below. > + *curr = value; > +} > + > +/* > + * If necessary, rearm the timer and post an interrupt. > + * Called at the end of tx/rx routines (mit_timer_on == 0), > + * and when the timer fires (mit_timer_on == 1). > + * We provide a partial implementation of interrupt mitigation, > + * emulating only RADV, TADV and ITR (lower 16 bits, 1024ns units for > + * RADV and TADV, 256ns units for ITR). RDTR is only used to enable RADV; > + * relative timers based on TIDV and RDTR are not implemented. > + */ > +static void > +mit_rearm_and_int(void *opaque) > +{ > + E1000State *s = opaque; > + uint32_t mit_delay = 0; > + > + /* > + * Clear the flag. It is only set when the callback fires, > + * and we need to clear it anyways. > + */ > + s->mit_timer_on = 0; > + if (s->mit_cause == 0) /* no events pending, we are done */ > + return; > + /* > + * Compute the next mitigation delay according to pending interrupts > + * and the current values of RADV (provided RDTR!=0), TADV and ITR. > + * Then rearm the timer. > + */ > + if (s->mit_cause & (E1000_ICR_TXQE | E1000_ICR_TXDW)) > + mit_update_delay(&mit_delay, s->mac_reg[TADV] * 4); > + if (s->mac_reg[RDTR] && (s->mit_cause & E1000_ICS_RXT0)) > + mit_update_delay(&mit_delay, s->mac_reg[RADV] * 4); > + mit_update_delay(&mit_delay, s->mac_reg[ITR]); > + > + if (mit_delay) { > + s->mit_timer_on = 1; > + qemu_mod_timer(s->mit_timer, > + qemu_get_clock_ns(vm_clock) + mit_delay * 256); > + } > + set_ics(s, 0, s->mit_cause); > + s->mit_cause = 0; > +} > + > +static void > +mit_set_ics(E1000State *s, uint32_t cause) > +{ > + if (mit_on == 0) { > + set_ics(s, 0, cause); > + return; > + } > + s->mit_cause |= cause; > + if (!s->mit_timer_on) > + mit_rearm_and_int(s); > +} > + > static void > start_xmit(E1000State *s) > { > @@ -676,7 +744,7 @@ start_xmit(E1000State *s) > break; > } > } > - set_ics(s, 0, cause); > + mit_set_ics(s, cause); > } > > static int > @@ -894,7 +962,7 @@ e1000_receive(NetClientState *nc, const > s->rxbuf_min_shift) > n |= E1000_ICS_RXDMT0; > > - set_ics(s, 0, n); > + mit_set_ics(s, n); > > return size; > } > @@ -999,6 +1067,7 @@ static uint32_t (*macreg_readops[])(E100 > getreg(RDH), getreg(RDT), getreg(VET), getreg(ICS), > getreg(TDBAL), getreg(TDBAH), getreg(RDBAH), getreg(RDBAL), > getreg(TDLEN), getreg(RDLEN), > + getreg(RDTR), getreg(RADV), getreg(TADV), getreg(ITR), > > [TOTH] = mac_read_clr8, [TORH] = mac_read_clr8, [GPRC] = mac_read_clr4, > [GPTC] = mac_read_clr4, [TPR] = mac_read_clr4, [TPT] = mac_read_clr4, > @@ -1015,6 +1084,8 @@ static void (*macreg_writeops[])(E1000St > putreg(PBA), putreg(EERD), putreg(SWSM), putreg(WUFC), > putreg(TDBAL), putreg(TDBAH), putreg(TXDCTL), putreg(RDBAH), > putreg(RDBAL), putreg(LEDCTL), putreg(VET), > + [RDTR] = set_16bit, [RADV] = set_16bit, [TADV] = set_16bit, > + [ITR] = set_16bit, > [TDLEN] = set_dlen, [RDLEN] = set_dlen, [TCTL] = set_tctl, > [TDT] = set_tctl, [MDIC] = set_mdic, [ICS] = set_ics, > [TDH] = set_16bit, [RDH] = set_16bit, [RDT] = set_rdt, > @@ -1286,6 +1357,9 @@ static int pci_e1000_init(PCIDevice *pci > add_boot_device_path(d->conf.bootindex, &pci_dev->qdev, "/ethernet-phy@0"); > > d->autoneg_timer = qemu_new_timer_ms(vm_clock, e1000_autoneg_timer, d); > + d->mit_cause = 0; > + d->mit_timer_on = 0; > + d->mit_timer = qemu_new_timer_ns(vm_clock, mit_rearm_and_int, d); > > return 0; > } >
On Thu, Dec 27, 2012 at 11:06:58AM +0100, Luigi Rizzo wrote: > diff -urp qemu-1.3.0-orig/hw/e1000.c qemu-1.3.0/hw/e1000.c > --- qemu-1.3.0-orig/hw/e1000.c 2012-12-03 20:37:05.000000000 +0100 > +++ qemu-1.3.0/hw/e1000.c 2012-12-27 09:47:16.000000000 +0100 > @@ -35,6 +35,8 @@ > > #include "e1000_hw.h" > > +static int mit_on = 1; /* interrupt mitigation enable */ If you want to make this optional then please put it inside E1000State so it can be toggled per NIC. If you don't want to make it optional then please post results for recent Linux and Windows guests to show that out-of-the-box performance is not degraded. > #define E1000_DEBUG > > #ifdef E1000_DEBUG > @@ -129,6 +131,9 @@ typedef struct E1000State_st { > } eecd_state; > > QEMUTimer *autoneg_timer; > + QEMUTimer *mit_timer; // handle for the timer > + uint32_t mit_timer_on; // mitigation timer active > + uint32_t mit_cause; // pending interrupt cause Live migration support is required so that pending interrupts are migrated. We cannot lose interrupts across live migration. I would migrate mit_cause but not mit_timer_on. When loading (resuming from live migration), look at mit_cause and raise any interrupts. Stefan
On Thu, Jan 10, 2013 at 01:25:48PM +0100, Stefan Hajnoczi wrote: > On Thu, Dec 27, 2012 at 11:06:58AM +0100, Luigi Rizzo wrote: > > diff -urp qemu-1.3.0-orig/hw/e1000.c qemu-1.3.0/hw/e1000.c > > --- qemu-1.3.0-orig/hw/e1000.c 2012-12-03 20:37:05.000000000 +0100 > > +++ qemu-1.3.0/hw/e1000.c 2012-12-27 09:47:16.000000000 +0100 > > @@ -35,6 +35,8 @@ > > > > #include "e1000_hw.h" > > > > +static int mit_on = 1; /* interrupt mitigation enable */ > > If you want to make this optional then please put it inside E1000State > so it can be toggled per NIC. what is the simplest way to add NIC-specific options ? I have added one line to e1000_properties, as below static Property e1000_properties[] = { DEFINE_NIC_PROPERTIES(E1000State, conf), + DEFINE_PROP_UINT32("mit_on, E1000State, mit_on, 0), DEFINE_PROP_END_OF_LIST(), }; and this way i can do recognise this on the command line qemu ... -device e1000,mit_on=1 ... but i do not know how to set the property for the NIC i am using. Specifically, i normally run qemu with "-net nic,model=1000" (leaving the nic unconnected to the host network, so i can test the tx path without being constrained by the backend's speed) Any suggestion ? thanks luigi
On Fri, Jan 11, 2013 at 03:53:50PM +0100, Luigi Rizzo wrote: > On Thu, Jan 10, 2013 at 01:25:48PM +0100, Stefan Hajnoczi wrote: > > On Thu, Dec 27, 2012 at 11:06:58AM +0100, Luigi Rizzo wrote: > > > diff -urp qemu-1.3.0-orig/hw/e1000.c qemu-1.3.0/hw/e1000.c > > > --- qemu-1.3.0-orig/hw/e1000.c 2012-12-03 20:37:05.000000000 +0100 > > > +++ qemu-1.3.0/hw/e1000.c 2012-12-27 09:47:16.000000000 +0100 > > > @@ -35,6 +35,8 @@ > > > > > > #include "e1000_hw.h" > > > > > > +static int mit_on = 1; /* interrupt mitigation enable */ > > > > If you want to make this optional then please put it inside E1000State > > so it can be toggled per NIC. > > what is the simplest way to add NIC-specific options ? > I have added one line to e1000_properties, as below > > static Property e1000_properties[] = { > DEFINE_NIC_PROPERTIES(E1000State, conf), > + DEFINE_PROP_UINT32("mit_on, E1000State, mit_on, 0), > DEFINE_PROP_END_OF_LIST(), > }; > > and this way i can do recognise this on the command line > qemu ... -device e1000,mit_on=1 ... > > but i do not know how to set the property for the NIC i am using. > Specifically, i normally run qemu with "-net nic,model=1000" > (leaving the nic unconnected to the host network, so i can > test the tx path without being constrained by the backend's speed) Short answer: -device e1000,mit_on=1,vlan=2 That puts the e1000 on its own hub (QEMU "VLAN") and not connected to the outside world. This is equivalent what you get with "-net nic,model=e1000" except you can specify device properties like "mit_on". Long answer: "-net nic" is handled in net_init_nic(). Unfortunately this goes via the intermediate NICInfo struct instead of directly to the e1000 properties. You'd need to plumb the option through NICInfo, but that's ugly since NICInfo is not e1000-specific and other NICs don't have the option. I'm not sure if it's possible to get rid of NICInfo and the nd_table[] global but I hope so. For the time being I'd avoid using device-specific options with "-net nic". Stefan
diff -urp qemu-1.3.0-orig/hw/e1000.c qemu-1.3.0/hw/e1000.c --- qemu-1.3.0-orig/hw/e1000.c 2012-12-03 20:37:05.000000000 +0100 +++ qemu-1.3.0/hw/e1000.c 2012-12-27 09:47:16.000000000 +0100 @@ -35,6 +35,8 @@ #include "e1000_hw.h" +static int mit_on = 1; /* interrupt mitigation enable */ + #define E1000_DEBUG #ifdef E1000_DEBUG @@ -129,6 +131,9 @@ typedef struct E1000State_st { } eecd_state; QEMUTimer *autoneg_timer; + QEMUTimer *mit_timer; // handle for the timer + uint32_t mit_timer_on; // mitigation timer active + uint32_t mit_cause; // pending interrupt cause } E1000State; #define defreg(x) x = (E1000_##x>>2) @@ -144,6 +149,7 @@ enum { defreg(TPR), defreg(TPT), defreg(TXDCTL), defreg(WUFC), defreg(RA), defreg(MTA), defreg(CRCERRS),defreg(VFTA), defreg(VET), + defreg(RDTR), defreg(RADV), defreg(TADV), defreg(ITR), }; static void @@ -639,6 +645,68 @@ static uint64_t tx_desc_base(E1000State return (bah << 32) + bal; } +/* helper function, 0 means the value is not set */ +static inline void +mit_update_delay(uint32_t *curr, uint32_t value) +{ + if (value && (*curr == 0 || value < *curr)) + *curr = value; +} + +/* + * If necessary, rearm the timer and post an interrupt. + * Called at the end of tx/rx routines (mit_timer_on == 0), + * and when the timer fires (mit_timer_on == 1). + * We provide a partial implementation of interrupt mitigation, + * emulating only RADV, TADV and ITR (lower 16 bits, 1024ns units for + * RADV and TADV, 256ns units for ITR). RDTR is only used to enable RADV; + * relative timers based on TIDV and RDTR are not implemented. + */ +static void +mit_rearm_and_int(void *opaque) +{ + E1000State *s = opaque; + uint32_t mit_delay = 0; + + /* + * Clear the flag. It is only set when the callback fires, + * and we need to clear it anyways. + */ + s->mit_timer_on = 0; + if (s->mit_cause == 0) /* no events pending, we are done */ + return; + /* + * Compute the next mitigation delay according to pending interrupts + * and the current values of RADV (provided RDTR!=0), TADV and ITR. + * Then rearm the timer. + */ + if (s->mit_cause & (E1000_ICR_TXQE | E1000_ICR_TXDW)) + mit_update_delay(&mit_delay, s->mac_reg[TADV] * 4); + if (s->mac_reg[RDTR] && (s->mit_cause & E1000_ICS_RXT0)) + mit_update_delay(&mit_delay, s->mac_reg[RADV] * 4); + mit_update_delay(&mit_delay, s->mac_reg[ITR]); + + if (mit_delay) { + s->mit_timer_on = 1; + qemu_mod_timer(s->mit_timer, + qemu_get_clock_ns(vm_clock) + mit_delay * 256); + } + set_ics(s, 0, s->mit_cause); + s->mit_cause = 0; +} + +static void +mit_set_ics(E1000State *s, uint32_t cause) +{ + if (mit_on == 0) { + set_ics(s, 0, cause); + return; + } + s->mit_cause |= cause; + if (!s->mit_timer_on) + mit_rearm_and_int(s); +} + static void start_xmit(E1000State *s) { @@ -676,7 +744,7 @@ start_xmit(E1000State *s) break; } } - set_ics(s, 0, cause); + mit_set_ics(s, cause); } static int @@ -894,7 +962,7 @@ e1000_receive(NetClientState *nc, const s->rxbuf_min_shift) n |= E1000_ICS_RXDMT0; - set_ics(s, 0, n); + mit_set_ics(s, n); return size; } @@ -999,6 +1067,7 @@ static uint32_t (*macreg_readops[])(E100 getreg(RDH), getreg(RDT), getreg(VET), getreg(ICS), getreg(TDBAL), getreg(TDBAH), getreg(RDBAH), getreg(RDBAL), getreg(TDLEN), getreg(RDLEN), + getreg(RDTR), getreg(RADV), getreg(TADV), getreg(ITR), [TOTH] = mac_read_clr8, [TORH] = mac_read_clr8, [GPRC] = mac_read_clr4, [GPTC] = mac_read_clr4, [TPR] = mac_read_clr4, [TPT] = mac_read_clr4, @@ -1015,6 +1084,8 @@ static void (*macreg_writeops[])(E1000St putreg(PBA), putreg(EERD), putreg(SWSM), putreg(WUFC), putreg(TDBAL), putreg(TDBAH), putreg(TXDCTL), putreg(RDBAH), putreg(RDBAL), putreg(LEDCTL), putreg(VET), + [RDTR] = set_16bit, [RADV] = set_16bit, [TADV] = set_16bit, + [ITR] = set_16bit, [TDLEN] = set_dlen, [RDLEN] = set_dlen, [TCTL] = set_tctl, [TDT] = set_tctl, [MDIC] = set_mdic, [ICS] = set_ics, [TDH] = set_16bit, [RDH] = set_16bit, [RDT] = set_rdt, @@ -1286,6 +1357,9 @@ static int pci_e1000_init(PCIDevice *pci add_boot_device_path(d->conf.bootindex, &pci_dev->qdev, "/ethernet-phy@0"); d->autoneg_timer = qemu_new_timer_ms(vm_clock, e1000_autoneg_timer, d); + d->mit_cause = 0; + d->mit_timer_on = 0; + d->mit_timer = qemu_new_timer_ns(vm_clock, mit_rearm_and_int, d); return 0; }