Message ID | 20190918140225.imqchybuf3cnknob@pengutronix.de |
---|---|
State | RFC |
Delegated to: | David Miller |
Headers | show |
Series | dsa traffic priorization | expand |
Hi Sascha, On Wed, 18 Sep 2019 at 17:03, Sascha Hauer <s.hauer@pengutronix.de> wrote: > > Hi All, > > We have a customer using a Marvell 88e6240 switch with Ethercat on one port and > regular network traffic on another port. The customer wants to configure two things > on the switch: First Ethercat traffic shall be priorized over other network traffic > (effectively prioritizing traffic based on port). Second the ethernet controller > in the CPU is not able to handle full bandwidth traffic, so the traffic to the CPU > port shall be rate limited. > You probably already know this, but egress shaping will not drop frames, just let them accumulate in the egress queue until something else happens (e.g. queue occupancy threshold triggers pause frames, or tail dropping is enabled, etc). Is this what you want? It sounds a bit strange to me to configure egress shaping on the CPU port of a DSA switch. That literally means you are buffering frames inside the system. What about ingress policing? > For reference the patch below configures the switch to their needs. Now the question > is how this can be implemented in a way suitable for mainline. It looks like the per > port priority mapping for VLAN tagged packets could be done via ip link add link ... > ingress-qos-map QOS-MAP. How the default priority would be set is unclear to me. > Technically, configuring a match-all rxnfc rule with ethtool would count as 'default priority' - I have proposed that before. Now I'm not entirely sure how intuitive it is, but I'm also interested in being able to configure this. > The other part of the problem seems to be that the CPU port has no network device > representation in Linux, so there's no interface to configure the egress limits via tc. > This has been discussed before, but it seems there hasn't been any consensous regarding how > we want to proceed? > > Sascha > > -----------------------------8<----------------------------------- > > drivers/net/dsa/mv88e6xxx/chip.c | 54 +++++++++++++++++++- > drivers/net/dsa/mv88e6xxx/port.c | 87 ++++++++++++++++++++++++++++++++ > drivers/net/dsa/mv88e6xxx/port.h | 19 +++++++ > 3 files changed, 159 insertions(+), 1 deletion(-) > > diff --git a/drivers/net/dsa/mv88e6xxx/chip.c b/drivers/net/dsa/mv88e6xxx/chip.c > index d0a97eb73a37..2a15cf259d04 100644 > --- a/drivers/net/dsa/mv88e6xxx/chip.c > +++ b/drivers/net/dsa/mv88e6xxx/chip.c > @@ -2090,7 +2090,9 @@ static int mv88e6xxx_setup_port(struct mv88e6xxx_chip *chip, int port) > { > struct dsa_switch *ds = chip->ds; > int err; > + u16 addr; > u16 reg; > + u16 val; > > chip->ports[port].chip = chip; > chip->ports[port].port = port; > @@ -2246,7 +2248,57 @@ static int mv88e6xxx_setup_port(struct mv88e6xxx_chip *chip, int port) > /* Default VLAN ID and priority: don't set a default VLAN > * ID, and set the default packet priority to zero. > */ > - return mv88e6xxx_port_write(chip, port, MV88E6XXX_PORT_DEFAULT_VLAN, 0); > + err = mv88e6xxx_port_write(chip, port, MV88E6XXX_PORT_DEFAULT_VLAN, 0); > + if (err) > + return err; > + > +#define SWITCH_CPU_PORT 5 > +#define SWITCH_ETHERCAT_PORT 3 > + > + /* set the egress rate */ > + switch (port) { > + case SWITCH_CPU_PORT: > + err = mv88e6xxx_port_set_egress_rate(chip, port, > + MV88E6XXX_PORT_EGRESS_RATE_CTL2_COUNT_MODE_FRAME, 30000); > + break; > + default: > + err = mv88e6xxx_port_set_egress_rate(chip, port, > + MV88E6XXX_PORT_EGRESS_RATE_CTL2_COUNT_MODE_FRAME, 0); > + break; > + } > + > + if (err) > + return err; > + > + /* set the output queue usage */ > + switch (port) { > + case SWITCH_CPU_PORT: > + err = mv88e6xxx_port_set_output_queue_schedule(chip, port, > + MV88E6XXX_PORT_EGRESS_RATE_CTL2_SCHEDULE_Q3_STRICT); > + break; > + default: > + err = mv88e6xxx_port_set_output_queue_schedule(chip, port, > + MV88E6XXX_PORT_EGRESS_RATE_CTL2_SCHEDULE_NONE_STRICT); > + break; > + } > + > + if (err) > + return err; > + > + /* set the default QPri */ > + switch (port) { > + case SWITCH_ETHERCAT_PORT: > + err = mv88e6xxx_port_set_default_qpri(chip, port, 3); > + break; > + default: > + err = mv88e6xxx_port_set_default_qpri(chip, port, 2); > + break; > + } > + > + if (err) > + return err; > + > + return 0; > } > > static int mv88e6xxx_port_enable(struct dsa_switch *ds, int port, > diff --git a/drivers/net/dsa/mv88e6xxx/port.c b/drivers/net/dsa/mv88e6xxx/port.c > index 04309ef0a1cc..e03f24308f15 100644 > --- a/drivers/net/dsa/mv88e6xxx/port.c > +++ b/drivers/net/dsa/mv88e6xxx/port.c > @@ -1147,6 +1147,22 @@ int mv88e6165_port_set_jumbo_size(struct mv88e6xxx_chip *chip, int port, > return mv88e6xxx_port_write(chip, port, MV88E6XXX_PORT_CTL2, reg); > } > > +int mv88e6xxx_port_set_default_qpri(struct mv88e6xxx_chip *chip, int port, int qpri) > +{ > + u16 reg; > + int err; > + > + err = mv88e6xxx_port_read(chip, port, MV88E6XXX_PORT_CTL2, ®); > + if (err) > + return err; > + > + reg &= ~MV88E6XXX_PORT_CTL2_DEF_QPRI_MASK; > + reg |= (qpri << 1) & MV88E6XXX_PORT_CTL2_DEF_QPRI_MASK; > + reg |= MV88E6XXX_PORT_CTL2_USE_DEF_QPRI; > + > + return mv88e6xxx_port_write(chip, port, MV88E6XXX_PORT_CTL2, reg); > +} > + > /* Offset 0x09: Port Rate Control */ > > int mv88e6095_port_egress_rate_limiting(struct mv88e6xxx_chip *chip, int port) > @@ -1161,6 +1177,77 @@ int mv88e6097_port_egress_rate_limiting(struct mv88e6xxx_chip *chip, int port) > 0x0001); > } > > +int mv88e6xxx_port_set_output_queue_schedule(struct mv88e6xxx_chip *chip, int port, > + u16 schedule) > +{ > + u16 reg; > + int err; > + > + err = mv88e6xxx_port_read(chip, port, MV88E6XXX_PORT_EGRESS_RATE_CTL2, ®); > + if (err) > + return err; > + > + reg &= ~MV88E6XXX_PORT_EGRESS_RATE_CTL2_SCHEDULE_MASK; > + reg |= schedule; > + > + return mv88e6xxx_port_write(chip, port, MV88E6XXX_PORT_EGRESS_RATE_CTL2, reg); > +} > + > +static int _mv88e6xxx_egress_rate_calc_frames(u32 rate, u16 *egress_rate_val) > +{ > + const volatile u32 scale_factor = (1000 * 1000 * 1000); > + volatile u32 u; > + > + if (rate > 1488000) > + return EINVAL; > + > + if (rate < 7600) > + return EINVAL; > + > + u = 32 * rate; > + u = scale_factor / u; /* scale_factor used to convert 32s into 32ns */ > + > + *egress_rate_val = (u16)u; > + > + return 0; > +} > + > +int mv88e6xxx_port_set_egress_rate(struct mv88e6xxx_chip *chip, int port, u16 type, > + u32 rate) > +{ > + u16 reg; > + int err; > + u16 egress_rate_val; > + > + err = mv88e6xxx_port_read(chip, port, MV88E6XXX_PORT_EGRESS_RATE_CTL2, ®); > + if (err) > + return err; > + > + reg &= ~MV88E6XXX_PORT_EGRESS_RATE_CTL2_RATE_MASK; > + > + if (rate) { > + reg &= ~MV88E6XXX_PORT_EGRESS_RATE_CTL2_COUNT_MODE_MASK; > + reg |= type; > + > + switch (type) { > + case MV88E6XXX_PORT_EGRESS_RATE_CTL2_COUNT_MODE_FRAME: > + err = _mv88e6xxx_egress_rate_calc_frames(rate, &egress_rate_val); > + if (err) > + return err; > + reg |= egress_rate_val & 0x0FFF; > + break; > + case MV88E6XXX_PORT_EGRESS_RATE_CTL2_COUNT_MODE_L1: > + case MV88E6XXX_PORT_EGRESS_RATE_CTL2_COUNT_MODE_L2: > + case MV88E6XXX_PORT_EGRESS_RATE_CTL2_COUNT_MODE_L3: > + return EINVAL; /* ToDo */ > + default: > + return EINVAL; > + } > + } > + > + return mv88e6xxx_port_write(chip, port, MV88E6XXX_PORT_EGRESS_RATE_CTL2, reg); > +} > + > /* Offset 0x0C: Port ATU Control */ > > int mv88e6xxx_port_disable_learn_limit(struct mv88e6xxx_chip *chip, int port) > diff --git a/drivers/net/dsa/mv88e6xxx/port.h b/drivers/net/dsa/mv88e6xxx/port.h > index 8d5a6cd6fb19..cdd057c52ab8 100644 > --- a/drivers/net/dsa/mv88e6xxx/port.h > +++ b/drivers/net/dsa/mv88e6xxx/port.h > @@ -197,6 +197,8 @@ > #define MV88E6XXX_PORT_CTL2_DEFAULT_FORWARD 0x0040 > #define MV88E6XXX_PORT_CTL2_EGRESS_MONITOR 0x0020 > #define MV88E6XXX_PORT_CTL2_INGRESS_MONITOR 0x0010 > +#define MV88E6XXX_PORT_CTL2_USE_DEF_QPRI 0x0008 > +#define MV88E6XXX_PORT_CTL2_DEF_QPRI_MASK 0x0006 > #define MV88E6095_PORT_CTL2_CPU_PORT_MASK 0x000f > > /* Offset 0x09: Egress Rate Control */ > @@ -204,6 +206,17 @@ > > /* Offset 0x0A: Egress Rate Control 2 */ > #define MV88E6XXX_PORT_EGRESS_RATE_CTL2 0x0a > +#define MV88E6XXX_PORT_EGRESS_RATE_CTL2_COUNT_MODE_MASK 0xC000 > +#define MV88E6XXX_PORT_EGRESS_RATE_CTL2_COUNT_MODE_FRAME 0x0000 > +#define MV88E6XXX_PORT_EGRESS_RATE_CTL2_COUNT_MODE_L1 0x4000 > +#define MV88E6XXX_PORT_EGRESS_RATE_CTL2_COUNT_MODE_L2 0x8000 > +#define MV88E6XXX_PORT_EGRESS_RATE_CTL2_COUNT_MODE_L3 0xC000 > +#define MV88E6XXX_PORT_EGRESS_RATE_CTL2_SCHEDULE_MASK 0x3000 > +#define MV88E6XXX_PORT_EGRESS_RATE_CTL2_SCHEDULE_NONE_STRICT 0x0000 > +#define MV88E6XXX_PORT_EGRESS_RATE_CTL2_SCHEDULE_Q3_STRICT 0x1000 > +#define MV88E6XXX_PORT_EGRESS_RATE_CTL2_SCHEDULE_Q3_Q2_STRICT 0x2000 > +#define MV88E6XXX_PORT_EGRESS_RATE_CTL2_SCHEDULE_ALL_STRICT 0x3000 > +#define MV88E6XXX_PORT_EGRESS_RATE_CTL2_RATE_MASK 0x0FFF > > /* Offset 0x0B: Port Association Vector */ > #define MV88E6XXX_PORT_ASSOC_VECTOR 0x0b > @@ -326,8 +339,14 @@ int mv88e6xxx_port_set_message_port(struct mv88e6xxx_chip *chip, int port, > bool message_port); > int mv88e6165_port_set_jumbo_size(struct mv88e6xxx_chip *chip, int port, > size_t size); > +int mv88e6xxx_port_set_default_qpri(struct mv88e6xxx_chip *chip, int port, > + int qpri); > int mv88e6095_port_egress_rate_limiting(struct mv88e6xxx_chip *chip, int port); > int mv88e6097_port_egress_rate_limiting(struct mv88e6xxx_chip *chip, int port); > +int mv88e6xxx_port_set_output_queue_schedule(struct mv88e6xxx_chip *chip, int port, > + u16 schedule); > +int mv88e6xxx_port_set_egress_rate(struct mv88e6xxx_chip *chip, int port, > + u16 type, u32 rate); > int mv88e6097_port_pause_limit(struct mv88e6xxx_chip *chip, int port, u8 in, > u8 out); > int mv88e6390_port_pause_limit(struct mv88e6xxx_chip *chip, int port, u8 in, > -- > 2.23.0 > > -- > Pengutronix e.K. | | > Industrial Linux Solutions | http://www.pengutronix.de/ | > Peiner Str. 6-8, 31137 Hildesheim, Germany | Phone: +49-5121-206917-0 | > Amtsgericht Hildesheim, HRA 2686 | Fax: +49-5121-206917-5555 | Regards, -Vladimir
On Wed, Sep 18, 2019 at 7:37 AM Vladimir Oltean <olteanv@gmail.com> wrote: > > Hi Sascha, > > On Wed, 18 Sep 2019 at 17:03, Sascha Hauer <s.hauer@pengutronix.de> wrote: > > > > Hi All, > > > > We have a customer using a Marvell 88e6240 switch with Ethercat on one port and > > regular network traffic on another port. The customer wants to configure two things > > on the switch: First Ethercat traffic shall be priorized over other network traffic > > (effectively prioritizing traffic based on port). Second the ethernet controller > > in the CPU is not able to handle full bandwidth traffic, so the traffic to the CPU > > port shall be rate limited. > > > > You probably already know this, but egress shaping will not drop > frames, just let them accumulate in the egress queue until something > else happens (e.g. queue occupancy threshold triggers pause frames, or > tail dropping is enabled, etc). Is this what you want? It sounds a bit Dropping in general is a basic attribute of the fq_codel algorithm which is enabled by default on many boxes. It's latency sensitive, so it responds well to pause frame (over) use. Usually the cpu to switch port is exposed via vlan (e.g eth0:2), and while you can inbound and outbound shape on that - using htb/hfsc + fq_codel, or cake But, also, most usually what happens when the cpu cannot keep up with the switch is we drop packets on the rx ring for receive, and in fq-codel on send. > strange to me to configure egress shaping on the CPU port of a DSA > switch. That literally means you are buffering frames inside the > system. What about ingress policing? > > > For reference the patch below configures the switch to their needs. Now the question > > is how this can be implemented in a way suitable for mainline. It looks like the per > > port priority mapping for VLAN tagged packets could be done via ip link add link ... > > ingress-qos-map QOS-MAP. How the default priority would be set is unclear to me. > > > > Technically, configuring a match-all rxnfc rule with ethtool would > count as 'default priority' - I have proposed that before. Now I'm not > entirely sure how intuitive it is, but I'm also interested in being > able to configure this. > > > The other part of the problem seems to be that the CPU port has no network device > > representation in Linux, so there's no interface to configure the egress limits via tc. > > This has been discussed before, but it seems there hasn't been any consensous regarding how > > we want to proceed? > > > > Sascha > > > > -----------------------------8<----------------------------------- > > > > drivers/net/dsa/mv88e6xxx/chip.c | 54 +++++++++++++++++++- > > drivers/net/dsa/mv88e6xxx/port.c | 87 ++++++++++++++++++++++++++++++++ > > drivers/net/dsa/mv88e6xxx/port.h | 19 +++++++ > > 3 files changed, 159 insertions(+), 1 deletion(-) > > > > diff --git a/drivers/net/dsa/mv88e6xxx/chip.c b/drivers/net/dsa/mv88e6xxx/chip.c > > index d0a97eb73a37..2a15cf259d04 100644 > > --- a/drivers/net/dsa/mv88e6xxx/chip.c > > +++ b/drivers/net/dsa/mv88e6xxx/chip.c > > @@ -2090,7 +2090,9 @@ static int mv88e6xxx_setup_port(struct mv88e6xxx_chip *chip, int port) > > { > > struct dsa_switch *ds = chip->ds; > > int err; > > + u16 addr; > > u16 reg; > > + u16 val; > > > > chip->ports[port].chip = chip; > > chip->ports[port].port = port; > > @@ -2246,7 +2248,57 @@ static int mv88e6xxx_setup_port(struct mv88e6xxx_chip *chip, int port) > > /* Default VLAN ID and priority: don't set a default VLAN > > * ID, and set the default packet priority to zero. > > */ > > - return mv88e6xxx_port_write(chip, port, MV88E6XXX_PORT_DEFAULT_VLAN, 0); > > + err = mv88e6xxx_port_write(chip, port, MV88E6XXX_PORT_DEFAULT_VLAN, 0); > > + if (err) > > + return err; > > + > > +#define SWITCH_CPU_PORT 5 > > +#define SWITCH_ETHERCAT_PORT 3 > > + > > + /* set the egress rate */ > > + switch (port) { > > + case SWITCH_CPU_PORT: > > + err = mv88e6xxx_port_set_egress_rate(chip, port, > > + MV88E6XXX_PORT_EGRESS_RATE_CTL2_COUNT_MODE_FRAME, 30000); > > + break; > > + default: > > + err = mv88e6xxx_port_set_egress_rate(chip, port, > > + MV88E6XXX_PORT_EGRESS_RATE_CTL2_COUNT_MODE_FRAME, 0); > > + break; > > + } > > + > > + if (err) > > + return err; > > + > > + /* set the output queue usage */ > > + switch (port) { > > + case SWITCH_CPU_PORT: > > + err = mv88e6xxx_port_set_output_queue_schedule(chip, port, > > + MV88E6XXX_PORT_EGRESS_RATE_CTL2_SCHEDULE_Q3_STRICT); > > + break; > > + default: > > + err = mv88e6xxx_port_set_output_queue_schedule(chip, port, > > + MV88E6XXX_PORT_EGRESS_RATE_CTL2_SCHEDULE_NONE_STRICT); > > + break; > > + } > > + > > + if (err) > > + return err; > > + > > + /* set the default QPri */ > > + switch (port) { > > + case SWITCH_ETHERCAT_PORT: > > + err = mv88e6xxx_port_set_default_qpri(chip, port, 3); > > + break; > > + default: > > + err = mv88e6xxx_port_set_default_qpri(chip, port, 2); > > + break; > > + } > > + > > + if (err) > > + return err; > > + > > + return 0; > > } > > > > static int mv88e6xxx_port_enable(struct dsa_switch *ds, int port, > > diff --git a/drivers/net/dsa/mv88e6xxx/port.c b/drivers/net/dsa/mv88e6xxx/port.c > > index 04309ef0a1cc..e03f24308f15 100644 > > --- a/drivers/net/dsa/mv88e6xxx/port.c > > +++ b/drivers/net/dsa/mv88e6xxx/port.c > > @@ -1147,6 +1147,22 @@ int mv88e6165_port_set_jumbo_size(struct mv88e6xxx_chip *chip, int port, > > return mv88e6xxx_port_write(chip, port, MV88E6XXX_PORT_CTL2, reg); > > } > > > > +int mv88e6xxx_port_set_default_qpri(struct mv88e6xxx_chip *chip, int port, int qpri) > > +{ > > + u16 reg; > > + int err; > > + > > + err = mv88e6xxx_port_read(chip, port, MV88E6XXX_PORT_CTL2, ®); > > + if (err) > > + return err; > > + > > + reg &= ~MV88E6XXX_PORT_CTL2_DEF_QPRI_MASK; > > + reg |= (qpri << 1) & MV88E6XXX_PORT_CTL2_DEF_QPRI_MASK; > > + reg |= MV88E6XXX_PORT_CTL2_USE_DEF_QPRI; > > + > > + return mv88e6xxx_port_write(chip, port, MV88E6XXX_PORT_CTL2, reg); > > +} > > + > > /* Offset 0x09: Port Rate Control */ > > > > int mv88e6095_port_egress_rate_limiting(struct mv88e6xxx_chip *chip, int port) > > @@ -1161,6 +1177,77 @@ int mv88e6097_port_egress_rate_limiting(struct mv88e6xxx_chip *chip, int port) > > 0x0001); > > } > > > > +int mv88e6xxx_port_set_output_queue_schedule(struct mv88e6xxx_chip *chip, int port, > > + u16 schedule) > > +{ > > + u16 reg; > > + int err; > > + > > + err = mv88e6xxx_port_read(chip, port, MV88E6XXX_PORT_EGRESS_RATE_CTL2, ®); > > + if (err) > > + return err; > > + > > + reg &= ~MV88E6XXX_PORT_EGRESS_RATE_CTL2_SCHEDULE_MASK; > > + reg |= schedule; > > + > > + return mv88e6xxx_port_write(chip, port, MV88E6XXX_PORT_EGRESS_RATE_CTL2, reg); > > +} > > + > > +static int _mv88e6xxx_egress_rate_calc_frames(u32 rate, u16 *egress_rate_val) > > +{ > > + const volatile u32 scale_factor = (1000 * 1000 * 1000); > > + volatile u32 u; > > + > > + if (rate > 1488000) > > + return EINVAL; > > + > > + if (rate < 7600) > > + return EINVAL; > > + > > + u = 32 * rate; > > + u = scale_factor / u; /* scale_factor used to convert 32s into 32ns */ > > + > > + *egress_rate_val = (u16)u; > > + > > + return 0; > > +} > > + > > +int mv88e6xxx_port_set_egress_rate(struct mv88e6xxx_chip *chip, int port, u16 type, > > + u32 rate) > > +{ > > + u16 reg; > > + int err; > > + u16 egress_rate_val; > > + > > + err = mv88e6xxx_port_read(chip, port, MV88E6XXX_PORT_EGRESS_RATE_CTL2, ®); > > + if (err) > > + return err; > > + > > + reg &= ~MV88E6XXX_PORT_EGRESS_RATE_CTL2_RATE_MASK; > > + > > + if (rate) { > > + reg &= ~MV88E6XXX_PORT_EGRESS_RATE_CTL2_COUNT_MODE_MASK; > > + reg |= type; > > + > > + switch (type) { > > + case MV88E6XXX_PORT_EGRESS_RATE_CTL2_COUNT_MODE_FRAME: > > + err = _mv88e6xxx_egress_rate_calc_frames(rate, &egress_rate_val); > > + if (err) > > + return err; > > + reg |= egress_rate_val & 0x0FFF; > > + break; > > + case MV88E6XXX_PORT_EGRESS_RATE_CTL2_COUNT_MODE_L1: > > + case MV88E6XXX_PORT_EGRESS_RATE_CTL2_COUNT_MODE_L2: > > + case MV88E6XXX_PORT_EGRESS_RATE_CTL2_COUNT_MODE_L3: > > + return EINVAL; /* ToDo */ > > + default: > > + return EINVAL; > > + } > > + } > > + > > + return mv88e6xxx_port_write(chip, port, MV88E6XXX_PORT_EGRESS_RATE_CTL2, reg); > > +} > > + > > /* Offset 0x0C: Port ATU Control */ > > > > int mv88e6xxx_port_disable_learn_limit(struct mv88e6xxx_chip *chip, int port) > > diff --git a/drivers/net/dsa/mv88e6xxx/port.h b/drivers/net/dsa/mv88e6xxx/port.h > > index 8d5a6cd6fb19..cdd057c52ab8 100644 > > --- a/drivers/net/dsa/mv88e6xxx/port.h > > +++ b/drivers/net/dsa/mv88e6xxx/port.h > > @@ -197,6 +197,8 @@ > > #define MV88E6XXX_PORT_CTL2_DEFAULT_FORWARD 0x0040 > > #define MV88E6XXX_PORT_CTL2_EGRESS_MONITOR 0x0020 > > #define MV88E6XXX_PORT_CTL2_INGRESS_MONITOR 0x0010 > > +#define MV88E6XXX_PORT_CTL2_USE_DEF_QPRI 0x0008 > > +#define MV88E6XXX_PORT_CTL2_DEF_QPRI_MASK 0x0006 > > #define MV88E6095_PORT_CTL2_CPU_PORT_MASK 0x000f > > > > /* Offset 0x09: Egress Rate Control */ > > @@ -204,6 +206,17 @@ > > > > /* Offset 0x0A: Egress Rate Control 2 */ > > #define MV88E6XXX_PORT_EGRESS_RATE_CTL2 0x0a > > +#define MV88E6XXX_PORT_EGRESS_RATE_CTL2_COUNT_MODE_MASK 0xC000 > > +#define MV88E6XXX_PORT_EGRESS_RATE_CTL2_COUNT_MODE_FRAME 0x0000 > > +#define MV88E6XXX_PORT_EGRESS_RATE_CTL2_COUNT_MODE_L1 0x4000 > > +#define MV88E6XXX_PORT_EGRESS_RATE_CTL2_COUNT_MODE_L2 0x8000 > > +#define MV88E6XXX_PORT_EGRESS_RATE_CTL2_COUNT_MODE_L3 0xC000 > > +#define MV88E6XXX_PORT_EGRESS_RATE_CTL2_SCHEDULE_MASK 0x3000 > > +#define MV88E6XXX_PORT_EGRESS_RATE_CTL2_SCHEDULE_NONE_STRICT 0x0000 > > +#define MV88E6XXX_PORT_EGRESS_RATE_CTL2_SCHEDULE_Q3_STRICT 0x1000 > > +#define MV88E6XXX_PORT_EGRESS_RATE_CTL2_SCHEDULE_Q3_Q2_STRICT 0x2000 > > +#define MV88E6XXX_PORT_EGRESS_RATE_CTL2_SCHEDULE_ALL_STRICT 0x3000 > > +#define MV88E6XXX_PORT_EGRESS_RATE_CTL2_RATE_MASK 0x0FFF > > > > /* Offset 0x0B: Port Association Vector */ > > #define MV88E6XXX_PORT_ASSOC_VECTOR 0x0b > > @@ -326,8 +339,14 @@ int mv88e6xxx_port_set_message_port(struct mv88e6xxx_chip *chip, int port, > > bool message_port); > > int mv88e6165_port_set_jumbo_size(struct mv88e6xxx_chip *chip, int port, > > size_t size); > > +int mv88e6xxx_port_set_default_qpri(struct mv88e6xxx_chip *chip, int port, > > + int qpri); > > int mv88e6095_port_egress_rate_limiting(struct mv88e6xxx_chip *chip, int port); > > int mv88e6097_port_egress_rate_limiting(struct mv88e6xxx_chip *chip, int port); > > +int mv88e6xxx_port_set_output_queue_schedule(struct mv88e6xxx_chip *chip, int port, > > + u16 schedule); > > +int mv88e6xxx_port_set_egress_rate(struct mv88e6xxx_chip *chip, int port, > > + u16 type, u32 rate); > > int mv88e6097_port_pause_limit(struct mv88e6xxx_chip *chip, int port, u8 in, > > u8 out); > > int mv88e6390_port_pause_limit(struct mv88e6xxx_chip *chip, int port, u8 in, > > -- > > 2.23.0 > > > > -- > > Pengutronix e.K. | | > > Industrial Linux Solutions | http://www.pengutronix.de/ | > > Peiner Str. 6-8, 31137 Hildesheim, Germany | Phone: +49-5121-206917-0 | > > Amtsgericht Hildesheim, HRA 2686 | Fax: +49-5121-206917-5555 | > > Regards, > -Vladimir
On 9/18/19 8:03 AM, Dave Taht wrote: > On Wed, Sep 18, 2019 at 7:37 AM Vladimir Oltean <olteanv@gmail.com> wrote: >> >> Hi Sascha, >> >> On Wed, 18 Sep 2019 at 17:03, Sascha Hauer <s.hauer@pengutronix.de> wrote: >>> >>> Hi All, >>> >>> We have a customer using a Marvell 88e6240 switch with Ethercat on one port and >>> regular network traffic on another port. The customer wants to configure two things >>> on the switch: First Ethercat traffic shall be priorized over other network traffic >>> (effectively prioritizing traffic based on port). Second the ethernet controller >>> in the CPU is not able to handle full bandwidth traffic, so the traffic to the CPU >>> port shall be rate limited. >>> >> >> You probably already know this, but egress shaping will not drop >> frames, just let them accumulate in the egress queue until something >> else happens (e.g. queue occupancy threshold triggers pause frames, or >> tail dropping is enabled, etc). Is this what you want? It sounds a bit > > Dropping in general is a basic attribute of the fq_codel algorithm which is > enabled by default on many boxes. It's latency sensitive, so it responds well > to pause frame (over) use. > > Usually the cpu to switch port is exposed via vlan (e.g eth0:2), and > while you can inbound and > outbound shape on that - using htb/hfsc + fq_codel, or cake That may be true with swconfig in OpenWrt, but this is not true with DSA unless DSA_TAG_PROTO_8021Q is used which happens to be on just one driver at the moment. With other switches that support a proprietary switch tag format, there is not a particular VLAN or even a network interface that describes the CPU port, other than the DSA master network device which is the side facing the host system (not the switch itself). > > But, also, most usually what happens when the cpu cannot keep up with > the switch is we drop packets on the rx ring for receive, and in > fq-codel on send. Dave, you seem to have a tendency to just pattern match on specific QoS- related topics appearing on netdev and throwing the wonderful tool that fq_codel without necessarily considering whether this is applicable or not to the people raising the questions. Since we are talking about hardware switches here and not simply stations on a network (although the Ethernet MAC behind the CPU port ends up being one), there is the possibility of using the HW to do ingress and/or egress policing. The question raised by Sascha is how to avoid statically configuring and instead using possibly existing tools to achieve the same configuration, from user-space, that is, not encode policy in the driver, but just the mechanism.
On 9/18/19 7:36 AM, Vladimir Oltean wrote: > Hi Sascha, > > On Wed, 18 Sep 2019 at 17:03, Sascha Hauer <s.hauer@pengutronix.de> wrote: >> >> Hi All, >> >> We have a customer using a Marvell 88e6240 switch with Ethercat on one port and >> regular network traffic on another port. The customer wants to configure two things >> on the switch: First Ethercat traffic shall be priorized over other network traffic >> (effectively prioritizing traffic based on port). Second the ethernet controller >> in the CPU is not able to handle full bandwidth traffic, so the traffic to the CPU >> port shall be rate limited. >> > > You probably already know this, but egress shaping will not drop > frames, just let them accumulate in the egress queue until something > else happens (e.g. queue occupancy threshold triggers pause frames, or > tail dropping is enabled, etc). Is this what you want? It sounds a bit > strange to me to configure egress shaping on the CPU port of a DSA > switch. That literally means you are buffering frames inside the > system. What about ingress policing? Indeed, but I suppose that depending on the switch architecture and/or nomenclature, configuring egress shaping amounts to determining ingress for the ports where the frame is going to be forwarded to. For instance Broadcom switches rarely if at all mention ingress because the frames have to originate from somewhere and be forwarded to other port(s), therefore, they will egress their original port (which for all practical purposes is the direct continuation of the ingress stage), where shaping happens, which immediately influences the ingress shaping of the destination port, which will egress the frame eventually because packets have to be delivered to the final port's egress queue anyway. > >> For reference the patch below configures the switch to their needs. Now the question >> is how this can be implemented in a way suitable for mainline. It looks like the per >> port priority mapping for VLAN tagged packets could be done via ip link add link ... >> ingress-qos-map QOS-MAP. How the default priority would be set is unclear to me. >> > > Technically, configuring a match-all rxnfc rule with ethtool would > count as 'default priority' - I have proposed that before. Now I'm not > entirely sure how intuitive it is, but I'm also interested in being > able to configure this. That does not sound too crazy from my perspective. > >> The other part of the problem seems to be that the CPU port has no network device >> representation in Linux, so there's no interface to configure the egress limits via tc. >> This has been discussed before, but it seems there hasn't been any consensous regarding how >> we want to proceed? You have the DSA master network device which is on the other side of the switch, -- Florian
Hi Florian, On Wed, 18 Sep 2019 at 20:42, Florian Fainelli <f.fainelli@gmail.com> wrote: > > On 9/18/19 7:36 AM, Vladimir Oltean wrote: > > Hi Sascha, > > > > On Wed, 18 Sep 2019 at 17:03, Sascha Hauer <s.hauer@pengutronix.de> wrote: > >> > >> Hi All, > >> > >> We have a customer using a Marvell 88e6240 switch with Ethercat on one port and > >> regular network traffic on another port. The customer wants to configure two things > >> on the switch: First Ethercat traffic shall be priorized over other network traffic > >> (effectively prioritizing traffic based on port). Second the ethernet controller > >> in the CPU is not able to handle full bandwidth traffic, so the traffic to the CPU > >> port shall be rate limited. > >> > > > > You probably already know this, but egress shaping will not drop > > frames, just let them accumulate in the egress queue until something > > else happens (e.g. queue occupancy threshold triggers pause frames, or > > tail dropping is enabled, etc). Is this what you want? It sounds a bit > > strange to me to configure egress shaping on the CPU port of a DSA > > switch. That literally means you are buffering frames inside the > > system. What about ingress policing? > > Indeed, but I suppose that depending on the switch architecture and/or > nomenclature, configuring egress shaping amounts to determining ingress > for the ports where the frame is going to be forwarded to. Egress shaping in the switches I've played with has nothing to do with the ingress port, unless that is used as a key for some sort for QoS classification (aka selector for a traffic class). Furthermore, shaping means queuing (which furthermore means delaying, but not dropping except in extreme cases which are outside the scope of shaping itself), while policing by definition means early dropping (admission control). Like Dave Taht pointed out too, dropping might be better for the system's overall latency. > > For instance Broadcom switches rarely if at all mention ingress because > the frames have to originate from somewhere and be forwarded to other > port(s), therefore, they will egress their original port (which for all > practical purposes is the direct continuation of the ingress stage), > where shaping happens, which immediately influences the ingress shaping > of the destination port, which will egress the frame eventually because > packets have to be delivered to the final port's egress queue anyway. > You lost me. I have never heard of any shaping done inside the guts of a switch, so 'egress of an ingress port' and 'ingress of an egress port' makes no sense to me. I was talking about ingress policing at the front panel ports, for their best-effort traffic. I think that is actually preferable to egress shaping at the CPU port, since I don't think they would want the EtherCAT traffic getting delayed. Alternatively, maybe the DSA master port supports per-stream hardware policing, although that is more exotic. > > > >> For reference the patch below configures the switch to their needs. Now the question > >> is how this can be implemented in a way suitable for mainline. It looks like the per > >> port priority mapping for VLAN tagged packets could be done via ip link add link ... > >> ingress-qos-map QOS-MAP. How the default priority would be set is unclear to me. > >> > > > > Technically, configuring a match-all rxnfc rule with ethtool would > > count as 'default priority' - I have proposed that before. Now I'm not > > entirely sure how intuitive it is, but I'm also interested in being > > able to configure this. > > That does not sound too crazy from my perspective. > Ok, well at least that requires no user space modification, then. > > > >> The other part of the problem seems to be that the CPU port has no network device > >> representation in Linux, so there's no interface to configure the egress limits via tc. > >> This has been discussed before, but it seems there hasn't been any consensous regarding how > >> we want to proceed? > > You have the DSA master network device which is on the other side of the > switch, > -- > Florian Thanks, -Vladimir
On 9/18/19 12:39 PM, Vladimir Oltean wrote: > Hi Florian, > > On Wed, 18 Sep 2019 at 20:42, Florian Fainelli <f.fainelli@gmail.com> wrote: >> >> On 9/18/19 7:36 AM, Vladimir Oltean wrote: >>> Hi Sascha, >>> >>> On Wed, 18 Sep 2019 at 17:03, Sascha Hauer <s.hauer@pengutronix.de> wrote: >>>> >>>> Hi All, >>>> >>>> We have a customer using a Marvell 88e6240 switch with Ethercat on one port and >>>> regular network traffic on another port. The customer wants to configure two things >>>> on the switch: First Ethercat traffic shall be priorized over other network traffic >>>> (effectively prioritizing traffic based on port). Second the ethernet controller >>>> in the CPU is not able to handle full bandwidth traffic, so the traffic to the CPU >>>> port shall be rate limited. >>>> >>> >>> You probably already know this, but egress shaping will not drop >>> frames, just let them accumulate in the egress queue until something >>> else happens (e.g. queue occupancy threshold triggers pause frames, or >>> tail dropping is enabled, etc). Is this what you want? It sounds a bit >>> strange to me to configure egress shaping on the CPU port of a DSA >>> switch. That literally means you are buffering frames inside the >>> system. What about ingress policing? >> >> Indeed, but I suppose that depending on the switch architecture and/or >> nomenclature, configuring egress shaping amounts to determining ingress >> for the ports where the frame is going to be forwarded to. > > Egress shaping in the switches I've played with has nothing to do with > the ingress port, unless that is used as a key for some sort for QoS > classification (aka selector for a traffic class). > Furthermore, shaping means queuing (which furthermore means delaying, > but not dropping except in extreme cases which are outside the scope > of shaping itself), while policing by definition means early dropping > (admission control). Like Dave Taht pointed out too, dropping might be > better for the system's overall latency. > >> >> For instance Broadcom switches rarely if at all mention ingress because >> the frames have to originate from somewhere and be forwarded to other >> port(s), therefore, they will egress their original port (which for all >> practical purposes is the direct continuation of the ingress stage), >> where shaping happens, which immediately influences the ingress shaping >> of the destination port, which will egress the frame eventually because >> packets have to be delivered to the final port's egress queue anyway. >> > > You lost me. > I have never heard of any shaping done inside the guts of a switch, so > 'egress of an ingress port' and 'ingress of an egress port' makes no > sense to me. What I meant to explain is that when a packet enters the switch, the forwarding logic will determine the destination port(s) and queue of that packet. As soon as the egress port and queue is known the capacity of that port and queue can be looked up, and that has the immediate effect of changing how the packets are ingress the switch. I suspect that the Marvell switches (like Broadcom) use the ability to perform any form of packet mangling from the perspective of the port's egress (as opposed to ingress) because ultimately packets need to go somewhere and the switch logic tries as much as possible to "connect" the ingress port to the egress port logic to limit on-chip buffering. > I was talking about ingress policing at the front panel ports, for > their best-effort traffic. I think that is actually preferable to > egress shaping at the CPU port, since I don't think they would want > the EtherCAT traffic getting delayed. I would view this as classifying and putting a higher priority on specific types of traffic and making sure that the switch is configured not to drop certain traffic landing in specific queues. Where that is applied (ingress, or egress) amounts to the same thing IMHO. > Alternatively, maybe the DSA master port supports per-stream hardware > policing, although that is more exotic. Even if the DSA master network device does not support it, the switch probably does, so you could do it on traffic that ingresses or egresses the CPU port. > >>> >>>> For reference the patch below configures the switch to their needs. Now the question >>>> is how this can be implemented in a way suitable for mainline. It looks like the per >>>> port priority mapping for VLAN tagged packets could be done via ip link add link ... >>>> ingress-qos-map QOS-MAP. How the default priority would be set is unclear to me. >>>> >>> >>> Technically, configuring a match-all rxnfc rule with ethtool would >>> count as 'default priority' - I have proposed that before. Now I'm not >>> entirely sure how intuitive it is, but I'm also interested in being >>> able to configure this. >> >> That does not sound too crazy from my perspective. >> > > Ok, well at least that requires no user space modification, then. > >>> >>>> The other part of the problem seems to be that the CPU port has no network device >>>> representation in Linux, so there's no interface to configure the egress limits via tc. >>>> This has been discussed before, but it seems there hasn't been any consensous regarding how >>>> we want to proceed? >> >> You have the DSA master network device which is on the other side of the >> switch, >> -- >> Florian > > Thanks, > -Vladimir >
Hi Vladimir, On Wed, Sep 18, 2019 at 05:36:08PM +0300, Vladimir Oltean wrote: > Hi Sascha, > > On Wed, 18 Sep 2019 at 17:03, Sascha Hauer <s.hauer@pengutronix.de> wrote: > > > > Hi All, > > > > We have a customer using a Marvell 88e6240 switch with Ethercat on one port and > > regular network traffic on another port. The customer wants to configure two things > > on the switch: First Ethercat traffic shall be priorized over other network traffic > > (effectively prioritizing traffic based on port). Second the ethernet controller > > in the CPU is not able to handle full bandwidth traffic, so the traffic to the CPU > > port shall be rate limited. > > > > You probably already know this, but egress shaping will not drop > frames, just let them accumulate in the egress queue until something > else happens (e.g. queue occupancy threshold triggers pause frames, or > tail dropping is enabled, etc). Is this what you want? If I understand correctly then the switch has multiple output queues per port. The Ethercat traffic will go to a higher priority queue and on congestion on other queues, frames designated for that queue will be dropped. I just talked to our customer and he verified that their Ethercat traffic still goes through even when the ports with the general traffic are jammed with packets. So yes, I think this is what I want. > It sounds a bit > strange to me to configure egress shaping on the CPU port of a DSA > switch. That literally means you are buffering frames inside the > system. What about ingress policing? The bottleneck here is in the CPU interface. The SoC simply can't handle all frames coming into a fully occupied link, so we indeed have to limit the number of packets coming into the SoC which speaks for egress rate limiting. We could of course limit the ingress packets on the other ports, but that would mean we have to rate limit each port to the total desired rate divided by the number of ports to be safe, not very optimal. Sascha
On Thu, 19 Sep 2019 at 11:00, Sascha Hauer <s.hauer@pengutronix.de> wrote: > > Hi Vladimir, > > On Wed, Sep 18, 2019 at 05:36:08PM +0300, Vladimir Oltean wrote: > > Hi Sascha, > > > > On Wed, 18 Sep 2019 at 17:03, Sascha Hauer <s.hauer@pengutronix.de> wrote: > > > > > > Hi All, > > > > > > We have a customer using a Marvell 88e6240 switch with Ethercat on one port and > > > regular network traffic on another port. The customer wants to configure two things > > > on the switch: First Ethercat traffic shall be priorized over other network traffic > > > (effectively prioritizing traffic based on port). Second the ethernet controller > > > in the CPU is not able to handle full bandwidth traffic, so the traffic to the CPU > > > port shall be rate limited. > > > > > > > You probably already know this, but egress shaping will not drop > > frames, just let them accumulate in the egress queue until something > > else happens (e.g. queue occupancy threshold triggers pause frames, or > > tail dropping is enabled, etc). Is this what you want? > > If I understand correctly then the switch has multiple output queues per > port. The Ethercat traffic will go to a higher priority queue and on > congestion on other queues, frames designated for that queue will be > dropped. I just talked to our customer and he verified that their > Ethercat traffic still goes through even when the ports with the general > traffic are jammed with packets. So yes, I think this is what I want. > Yes, but I mean the egress shaper is per port, so when it goes out of credits it goes out of credits, right? Meaning that even if EtherCAT has higher strict priority, it will still experience latency caused by the best-effort traffic consuming the port's global token bucket credits. Sure, it may not be so bad as to actually cause tail drop, but did you measure this? > > It sounds a bit > > strange to me to configure egress shaping on the CPU port of a DSA > > switch. That literally means you are buffering frames inside the > > system. What about ingress policing? > > The bottleneck here is in the CPU interface. The SoC simply can't handle > all frames coming into a fully occupied link, so we indeed have to limit > the number of packets coming into the SoC which speaks for egress rate > limiting. We could of course limit the ingress packets on the other > ports, but that would mean we have to rate limit each port to the total > desired rate divided by the number of ports to be safe, not very > optimal. > Not very optimal, but may offer better guarantees for the high-priority traffic, and there is already a model for doing that, unlike for egress shaping on the CPU port. What about a software tc-police action on the DSA net device's ingress qdisc? Is that still too high-pressure for the CPU? Is there any flow steering rule on the CPU for processing EtherCAT with higher priority (or affining it to a separate core)? I'm trying to understand where the bottleneck really is. > Sascha > > -- > Pengutronix e.K. | | > Industrial Linux Solutions | http://www.pengutronix.de/ | > Peiner Str. 6-8, 31137 Hildesheim, Germany | Phone: +49-5121-206917-0 | > Amtsgericht Hildesheim, HRA 2686 | Fax: +49-5121-206917-5555 | Thanks, -Vladimir
On Thu, Sep 19, 2019 at 10:00:51AM +0200, Sascha Hauer wrote: > Hi Vladimir, > > On Wed, Sep 18, 2019 at 05:36:08PM +0300, Vladimir Oltean wrote: > > Hi Sascha, > > > > On Wed, 18 Sep 2019 at 17:03, Sascha Hauer <s.hauer@pengutronix.de> wrote: > > > > > > Hi All, > > > > > > We have a customer using a Marvell 88e6240 switch with Ethercat on one port and > > > regular network traffic on another port. The customer wants to configure two things > > > on the switch: First Ethercat traffic shall be priorized over other network traffic > > > (effectively prioritizing traffic based on port). Second the ethernet controller > > > in the CPU is not able to handle full bandwidth traffic, so the traffic to the CPU > > > port shall be rate limited. > > > > > > > You probably already know this, but egress shaping will not drop > > frames, just let them accumulate in the egress queue until something > > else happens (e.g. queue occupancy threshold triggers pause frames, or > > tail dropping is enabled, etc). Is this what you want? > > If I understand correctly then the switch has multiple output queues per > port. The Ethercat traffic will go to a higher priority queue and on > congestion on other queues, frames designated for that queue will be > dropped. I just talked to our customer and he verified that their > Ethercat traffic still goes through even when the ports with the general > traffic are jammed with packets. So yes, I think this is what I want. Moreover egressing the cpu port has the advantage that network participants on other ports that might be able to process packet quicker are not limited. Best regards Uwe
On Thu, Sep 19, 2019 at 11:18:24AM +0300, Vladimir Oltean wrote: > On Thu, 19 Sep 2019 at 11:00, Sascha Hauer <s.hauer@pengutronix.de> wrote: > > > > Hi Vladimir, > > > > On Wed, Sep 18, 2019 at 05:36:08PM +0300, Vladimir Oltean wrote: > > > Hi Sascha, > > > > > > On Wed, 18 Sep 2019 at 17:03, Sascha Hauer <s.hauer@pengutronix.de> wrote: > > > > > > > > Hi All, > > > > > > > > We have a customer using a Marvell 88e6240 switch with Ethercat on one port and > > > > regular network traffic on another port. The customer wants to configure two things > > > > on the switch: First Ethercat traffic shall be priorized over other network traffic > > > > (effectively prioritizing traffic based on port). Second the ethernet controller > > > > in the CPU is not able to handle full bandwidth traffic, so the traffic to the CPU > > > > port shall be rate limited. > > > > > > > > > > You probably already know this, but egress shaping will not drop > > > frames, just let them accumulate in the egress queue until something > > > else happens (e.g. queue occupancy threshold triggers pause frames, or > > > tail dropping is enabled, etc). Is this what you want? > > > > If I understand correctly then the switch has multiple output queues per > > port. The Ethercat traffic will go to a higher priority queue and on > > congestion on other queues, frames designated for that queue will be > > dropped. I just talked to our customer and he verified that their > > Ethercat traffic still goes through even when the ports with the general > > traffic are jammed with packets. So yes, I think this is what I want. > > > > Yes, but I mean the egress shaper is per port, so when it goes out of > credits it goes out of credits, right? Meaning that even if EtherCAT > has higher strict priority, it will still experience latency caused by > the best-effort traffic consuming the port's global token bucket > credits. Sure, it may not be so bad as to actually cause tail drop, > but did you measure this? I don't think this has been measured. I understand what you mean and there might be latencies introduced, but it seems the latencies are acceptable. Sascha
Hi Uwe, On Thu, 19 Sep 2019 at 11:36, Uwe Kleine-König <u.kleine-koenig@pengutronix.de> wrote: > > On Thu, Sep 19, 2019 at 10:00:51AM +0200, Sascha Hauer wrote: > > Hi Vladimir, > > > > On Wed, Sep 18, 2019 at 05:36:08PM +0300, Vladimir Oltean wrote: > > > Hi Sascha, > > > > > > On Wed, 18 Sep 2019 at 17:03, Sascha Hauer <s.hauer@pengutronix.de> wrote: > > > > > > > > Hi All, > > > > > > > > We have a customer using a Marvell 88e6240 switch with Ethercat on one port and > > > > regular network traffic on another port. The customer wants to configure two things > > > > on the switch: First Ethercat traffic shall be priorized over other network traffic > > > > (effectively prioritizing traffic based on port). Second the ethernet controller > > > > in the CPU is not able to handle full bandwidth traffic, so the traffic to the CPU > > > > port shall be rate limited. > > > > > > > > > > You probably already know this, but egress shaping will not drop > > > frames, just let them accumulate in the egress queue until something > > > else happens (e.g. queue occupancy threshold triggers pause frames, or > > > tail dropping is enabled, etc). Is this what you want? > > > > If I understand correctly then the switch has multiple output queues per > > port. The Ethercat traffic will go to a higher priority queue and on > > congestion on other queues, frames designated for that queue will be > > dropped. I just talked to our customer and he verified that their > > Ethercat traffic still goes through even when the ports with the general > > traffic are jammed with packets. So yes, I think this is what I want. > > Moreover egressing the cpu port has the advantage that network > participants on other ports that might be able to process packet quicker > are not limited. > Yes, that is true if you apply port-based policers (matchall), but not necessarily so if your switch is able to do ingress classification (a common key for switches being the {DMAC, VLAN} pair). Again, there is a model for configuring even that, if the hardware is capable. Just not for egress shaping on the CPU port. > Best regards > Uwe > > -- > Pengutronix e.K. | Uwe Kleine-König | > Industrial Linux Solutions | http://www.pengutronix.de/ |
Hi Florian, On Wed, Sep 18, 2019 at 10:41:58AM -0700, Florian Fainelli wrote: > On 9/18/19 7:36 AM, Vladimir Oltean wrote: > > Hi Sascha, > > > > On Wed, 18 Sep 2019 at 17:03, Sascha Hauer <s.hauer@pengutronix.de> wrote: > >> > >> Hi All, > >> > >> We have a customer using a Marvell 88e6240 switch with Ethercat on one port and > >> regular network traffic on another port. The customer wants to configure two things > >> on the switch: First Ethercat traffic shall be priorized over other network traffic > >> (effectively prioritizing traffic based on port). Second the ethernet controller > >> in the CPU is not able to handle full bandwidth traffic, so the traffic to the CPU > >> port shall be rate limited. > >> > > > > You probably already know this, but egress shaping will not drop > > frames, just let them accumulate in the egress queue until something > > else happens (e.g. queue occupancy threshold triggers pause frames, or > > tail dropping is enabled, etc). Is this what you want? It sounds a bit > > strange to me to configure egress shaping on the CPU port of a DSA > > switch. That literally means you are buffering frames inside the > > system. What about ingress policing? > > Indeed, but I suppose that depending on the switch architecture and/or > nomenclature, configuring egress shaping amounts to determining ingress > for the ports where the frame is going to be forwarded to. > > For instance Broadcom switches rarely if at all mention ingress because > the frames have to originate from somewhere and be forwarded to other > port(s), therefore, they will egress their original port (which for all > practical purposes is the direct continuation of the ingress stage), > where shaping happens, which immediately influences the ingress shaping > of the destination port, which will egress the frame eventually because > packets have to be delivered to the final port's egress queue anyway. > > > > >> For reference the patch below configures the switch to their needs. Now the question > >> is how this can be implemented in a way suitable for mainline. It looks like the per > >> port priority mapping for VLAN tagged packets could be done via ip link add link ... > >> ingress-qos-map QOS-MAP. How the default priority would be set is unclear to me. > >> > > > > Technically, configuring a match-all rxnfc rule with ethtool would > > count as 'default priority' - I have proposed that before. Now I'm not > > entirely sure how intuitive it is, but I'm also interested in being > > able to configure this. > > That does not sound too crazy from my perspective. > > > > >> The other part of the problem seems to be that the CPU port has no network device > >> representation in Linux, so there's no interface to configure the egress limits via tc. > >> This has been discussed before, but it seems there hasn't been any consensous regarding how > >> we want to proceed? > > You have the DSA master network device which is on the other side of the > switch, You mean the (in my case) i.MX FEC? Everything I do on this device ends up in the FEC itself and not on the switch, right? Sascha
Hi, On Wed, 2019-09-18 at 10:41 -0700, Florian Fainelli wrote: > > Technically, configuring a match-all rxnfc rule with ethtool would > > count as 'default priority' - I have proposed that before. Now I'm not > > entirely sure how intuitive it is, but I'm also interested in being > > able to configure this. > > That does not sound too crazy from my perspective. Sascha and myself aren't that familiar with that part of ethtool. You're talking about using ethtool --config-nfc/--config-ntuple on the (external) sw1p1, sw1p2 ports? Something like this (completely untested from the manpage): ethtool --config-nfc sw1p1 flow-type ether queue 2 # high prio queue for ethercat ethtool --config-nfc sw1p2 flow-type ether queue 1 # normal for rest Currently, there seems to be no "match-all" option. Alternatives to "queue X" might be "action" or "context", but I don't know enough about the details to prefer one above the other. Regards, Jan
Hi Jan, On Thu, 19 Sep 2019 at 16:21, Jan Lübbe <jlu@pengutronix.de> wrote: > > Hi, > > On Wed, 2019-09-18 at 10:41 -0700, Florian Fainelli wrote: > > > Technically, configuring a match-all rxnfc rule with ethtool would > > > count as 'default priority' - I have proposed that before. Now I'm not > > > entirely sure how intuitive it is, but I'm also interested in being > > > able to configure this. > > > > That does not sound too crazy from my perspective. > > Sascha and myself aren't that familiar with that part of ethtool. > You're talking about using ethtool --config-nfc/--config-ntuple on the > (external) sw1p1, sw1p2 ports? Something like this (completely untested > from the manpage): > ethtool --config-nfc sw1p1 flow-type ether queue 2 # high prio queue for ethercat > ethtool --config-nfc sw1p2 flow-type ether queue 1 # normal for rest > Yes, something like that. > Currently, there seems to be no "match-all" option. > Well, some keys for flow steering can be masked. See: src xx:yy:zz:aa:bb:cc [m xx:yy:zz:aa:bb:cc] Includes the source MAC address, specified as 6 bytes in hexadecimal separated by colons, along with an optional mask. Valid only for flow-type ether. dst xx:yy:zz:aa:bb:cc [m xx:yy:zz:aa:bb:cc] Includes the destination MAC address, specified as 6 bytes in hexadecimal separated by colons, along with an optional mask. Valid only for flow-type ether. proto N [m N] Includes the Ethernet protocol number (ethertype) and an optional mask. Valid only for flow-type ether. The idea is that any rule with e.g. src 00:00:00:00:00:00 and m 00:00:00:00:00:00 is an implicit match-all, because any (SMAC & m) == src. The issue I see (and why I said it's not intuitive) is that there is more than 1 way to express the same thing, and that it raises sanity questions about rule ordering (if the rule is first, should all subsequent flow steering rules be ignored?). Also, the driver would have to open-code the "matchall" condition in order to detect it and configure the default qpri. It appears that there is a way to do this with tc-flower (or any other classifier) as well, by specifying any null key with a mask of zero. I don't know enough either to understand what is preferable. > Alternatives to "queue X" might be "action" or "context", but I don't > know enough about the details to prefer one above the other. > > Regards, > Jan > Thanks, -Vladimir
On Thu, Sep 19, 2019 at 10:00:51AM +0200, Sascha Hauer wrote: > Hi Vladimir, > > On Wed, Sep 18, 2019 at 05:36:08PM +0300, Vladimir Oltean wrote: > > Hi Sascha, > > > > On Wed, 18 Sep 2019 at 17:03, Sascha Hauer <s.hauer@pengutronix.de> wrote: > > > > > > Hi All, > > > > > > We have a customer using a Marvell 88e6240 switch with Ethercat on one port and > > > regular network traffic on another port. The customer wants to configure two things > > > on the switch: First Ethercat traffic shall be priorized over other network traffic > > > (effectively prioritizing traffic based on port). Second the ethernet controller > > > in the CPU is not able to handle full bandwidth traffic, so the traffic to the CPU > > > port shall be rate limited. > > > > > > > You probably already know this, but egress shaping will not drop > > frames, just let them accumulate in the egress queue until something > > else happens (e.g. queue occupancy threshold triggers pause frames, or > > tail dropping is enabled, etc). Is this what you want? > > If I understand correctly then the switch has multiple output queues per > port. There are 4 egress queues per port. > The Ethercat traffic will go to a higher priority queue and on > congestion on other queues, frames designated for that queue will be > dropped. On ingress, there are various ways to add a priority to a frame, which then determines which egress queue it lands in. This can be static, all frames which ingress port X have priority Y. It can look at the 802.1p header and map the l2 priority to an queue priority. It can look at the IPv4 TOS/DSCP value and map it to a queue priority. And you can also use the TCAM, when it matches on a frame, it can specify the frame priority. If the egress queue, as selected by the queue priority is full, the frame will be dropped. > I just talked to our customer and he verified that their > Ethercat traffic still goes through even when the ports with the general > traffic are jammed with packets. So yes, I think this is what I want. Taking frames out of the port egress queues and passing them out the interface is then determined by the scheduler. There are two different configurations which can be used 1) Strict priority. Frames are taken from the highest priority queue, until it is empty. If the highest priority queue is empty, frames are taken from the second highest priority queue. And if the second priority is empty, the third priority queue is used, etc. This can lead to starvation, when the lower priority queue never get serviced because the higher priority queue have sufficient frames to keep the line busy. 2) Weighted round robin. It takes up to 8 frames from the highest priority queue, then up to 4 frames from the second highest priority queue, then up to 2 frames from the 3rd highest priority queue, and then 1 frame from the lower priority frame. And then it starts the cycle again. This scheduler ensure there is no starvation of the queues, but you do loose more high priority frames when congested. > The bottleneck here is in the CPU interface. The SoC simply can't handle > all frames coming into a fully occupied link, so we indeed have to limit > the number of packets coming into the SoC which speaks for egress rate > limiting. We could of course limit the ingress packets on the other > ports, but that would mean we have to rate limit each port to the total > desired rate divided by the number of ports to be safe, not very > optimal. Pause frames then start playing a role, maybe instead of, or in combination with, egress traffic shaping. If the SoC is overloaded, it can send a pause frame. The Switch will then pause the scheduler. No packets are sent to the SoC, until it unpauses. What i don't know is if the switch itself will send out pause frames downstream, when egress queues are getting full. That could have bad effects on overall network bandwidth, for frames which are not destined for the SoC. If you use egress shaping, you have to make a guess at how many bits per second the SoC can handle, and configure the egress shaping to that. The scheduler will then take frames from the queues at that rate. At least for the older generation of switches, egress shaping was rather course grained at the higher speed. Something like 500Mbps, 256Mbps, 128Mbps, 64Mbps, etc. So it might not do what you want. And older generation switches, ingress shaping is very course grained, mostly designed to limit broadcast storms. There are lots of options here, how you solve your problem. But you cannot look at just the SoC and switch combination. You need to look at your whole network design. If you are using 802.1p, who is setting the priority bits? If you are using TOS/DSCP, who is setting those bits? Is there policing going on in the network edge to ensure other traffic in the network is using the lower priority settings? Do you really want Ethercat to take the highest priority? I would put it at second priority, and make my SSH and SNMP client/server use the highest priority, so when it all goes wrong, i can login and take a look around to see what happened. Once you have a network design, and know what you want the SoC/switch combination to do, we can then figure out the correct API to configure this. It might be TC with offloads, it might be ethtool with offloads, etc. Andrew
On Wed, 2019-09-18 at 10:41 -0700, Florian Fainelli wrote: > > > The other part of the problem seems to be that the CPU port has no network device > > > representation in Linux, so there's no interface to configure the egress limits via tc. > > > This has been discussed before, but it seems there hasn't been any consensous regarding how > > > we want to proceed? > > You have the DSA master network device which is on the other side of the > switch, We thought it might be intutive to allow configuration of this via tc on the DSA master device (eth0, the i.MX FEC in our case). You'd specify ingress policing via tc, and the kernel would try to hw-offload that to the switch's CPU egress side first, maybe fall back to offload on the SoC's network controler and finally use the normal SW implementation. If that sounds reasonable, the next question would be how to express matching on switchdev information (such as the ingress port or vlan priority). Regards, Jan
Hi Andrew, thanks for your detail explanation! On Thu, 2019-09-19 at 15:34 +0200, Andrew Lunn wrote: > On Thu, Sep 19, 2019 at 10:00:51AM +0200, Sascha Hauer wrote: > > Hi Vladimir, > > > > On Wed, Sep 18, 2019 at 05:36:08PM +0300, Vladimir Oltean wrote: > > > Hi Sascha, > > > > > > On Wed, 18 Sep 2019 at 17:03, Sascha Hauer <s.hauer@pengutronix.de> wrote: > > > > Hi All, > > > > > > > > We have a customer using a Marvell 88e6240 switch with Ethercat on one port and > > > > regular network traffic on another port. The customer wants to configure two things > > > > on the switch: First Ethercat traffic shall be priorized over other network traffic > > > > (effectively prioritizing traffic based on port). Second the ethernet controller > > > > in the CPU is not able to handle full bandwidth traffic, so the traffic to the CPU > > > > port shall be rate limited. > > > > > > > > > > You probably already know this, but egress shaping will not drop > > > frames, just let them accumulate in the egress queue until something > > > else happens (e.g. queue occupancy threshold triggers pause frames, or > > > tail dropping is enabled, etc). Is this what you want? > > > > If I understand correctly then the switch has multiple output queues per > > port. > > There are 4 egress queues per port. > > > The Ethercat traffic will go to a higher priority queue and on > > congestion on other queues, frames designated for that queue will be > > dropped. > > On ingress, there are various ways to add a priority to a frame, which > then determines which egress queue it lands in. This can be static, > all frames which ingress port X have priority Y. It can look at the > 802.1p header and map the l2 priority to an queue priority. It can > look at the IPv4 TOS/DSCP value and map it to a queue priority. And > you can also use the TCAM, when it matches on a frame, it can specify > the frame priority. > > If the egress queue, as selected by the queue priority is full, the > frame will be dropped. In our case, it's a static configuration (ethercat prio 3, others 2). > > I just talked to our customer and he verified that their > > Ethercat traffic still goes through even when the ports with the general > > traffic are jammed with packets. So yes, I think this is what I want. > > Taking frames out of the port egress queues and passing them out the > interface is then determined by the scheduler. There are two different > configurations which can be used > > 1) Strict priority. Frames are taken from the highest priority queue, > until it is empty. If the highest priority queue is empty, frames are > taken from the second highest priority queue. And if the second > priority is empty, the third priority queue is used, etc. This can > lead to starvation, when the lower priority queue never get serviced > because the higher priority queue have sufficient frames to keep the > line busy. This is configured in our case, and also limits the queuing latency for the ethercat traffic (to at most on MTU sized packet when other traffic is already being transmitted). > 2) Weighted round robin. It takes up to 8 frames from the highest > priority queue, then up to 4 frames from the second highest priority > queue, then up to 2 frames from the 3rd highest priority queue, and > then 1 frame from the lower priority frame. And then it starts the > cycle again. This scheduler ensure there is no starvation of the > queues, but you do loose more high priority frames when congested. > > > The bottleneck here is in the CPU interface. The SoC simply can't handle > > all frames coming into a fully occupied link, so we indeed have to limit > > the number of packets coming into the SoC which speaks for egress rate > > limiting. We could of course limit the ingress packets on the other > > ports, but that would mean we have to rate limit each port to the total > > desired rate divided by the number of ports to be safe, not very > > optimal. > > Pause frames then start playing a role, maybe instead of, or in > combination with, egress traffic shaping. > > If the SoC is overloaded, it can send a pause frame. The Switch will > then pause the scheduler. No packets are sent to the SoC, until it > unpauses. What i don't know is if the switch itself will send out > pause frames downstream, when egress queues are getting full. That > could have bad effects on overall network bandwidth, for frames which > are not destined for the SoC. I'm not confident that relying on pause frames on the i.MX6 would be enough to avoid dropping frames later. The system should also do other things besides running the network stack. ;) > If you use egress shaping, you have to make a guess at how many bits > per second the SoC can handle, and configure the egress shaping to > that. The scheduler will then take frames from the queues at that > rate. At least for the older generation of switches, egress shaping > was rather course grained at the higher speed. Something like 500Mbps, > 256Mbps, 128Mbps, 64Mbps, etc. So it might not do what you want. And > older generation switches, ingress shaping is very course grained, > mostly designed to limit broadcast storms. I think it's configured for a frame limit in this case, at benchmarks were done do ensure that this limit is low enough to avoid overload while also being high enough for the expected ethercat traffic. > There are lots of options here, how you solve your problem. But you > cannot look at just the SoC and switch combination. You need to look > at your whole network design. If you are using 802.1p, who is setting > the priority bits? If you are using TOS/DSCP, who is setting those > bits? Is there policing going on in the network edge to ensure other > traffic in the network is using the lower priority settings? Do you > really want Ethercat to take the highest priority? I would put it at > second priority, and make my SSH and SNMP client/server use the > highest priority, so when it all goes wrong, i can login and take a > look around to see what happened. We're currently trying to keep it as simple as possible. :) The system (consisting of switch and SoC) sets the priority and limits statically only based on external ports. The ethercat port is trusted and should have priority over all other ports (where untrusted might send any amount of traffic). If that could be configured, I'd of course appreciate a way to pass a limited amount of SSH/SNMP traffic in a even higher priority. That seems to be the next step though... > Once you have a network design, and know what you want the SoC/switch > combination to do, we can then figure out the correct API to configure > this. It might be TC with offloads, it might be ethtool with offloads, > etc. The system already works with the static configuration in the driver, but we'd like to find out how to handle this better (in mainline) in the future (also for updates for this device). Regards, Jan
On 9/19/19 1:44 AM, Sascha Hauer wrote: > Hi Florian, > > On Wed, Sep 18, 2019 at 10:41:58AM -0700, Florian Fainelli wrote: >> On 9/18/19 7:36 AM, Vladimir Oltean wrote: >>> Hi Sascha, >>> >>> On Wed, 18 Sep 2019 at 17:03, Sascha Hauer <s.hauer@pengutronix.de> wrote: >>>> >>>> Hi All, >>>> >>>> We have a customer using a Marvell 88e6240 switch with Ethercat on one port and >>>> regular network traffic on another port. The customer wants to configure two things >>>> on the switch: First Ethercat traffic shall be priorized over other network traffic >>>> (effectively prioritizing traffic based on port). Second the ethernet controller >>>> in the CPU is not able to handle full bandwidth traffic, so the traffic to the CPU >>>> port shall be rate limited. >>>> >>> >>> You probably already know this, but egress shaping will not drop >>> frames, just let them accumulate in the egress queue until something >>> else happens (e.g. queue occupancy threshold triggers pause frames, or >>> tail dropping is enabled, etc). Is this what you want? It sounds a bit >>> strange to me to configure egress shaping on the CPU port of a DSA >>> switch. That literally means you are buffering frames inside the >>> system. What about ingress policing? >> >> Indeed, but I suppose that depending on the switch architecture and/or >> nomenclature, configuring egress shaping amounts to determining ingress >> for the ports where the frame is going to be forwarded to. >> >> For instance Broadcom switches rarely if at all mention ingress because >> the frames have to originate from somewhere and be forwarded to other >> port(s), therefore, they will egress their original port (which for all >> practical purposes is the direct continuation of the ingress stage), >> where shaping happens, which immediately influences the ingress shaping >> of the destination port, which will egress the frame eventually because >> packets have to be delivered to the final port's egress queue anyway. >> >>> >>>> For reference the patch below configures the switch to their needs. Now the question >>>> is how this can be implemented in a way suitable for mainline. It looks like the per >>>> port priority mapping for VLAN tagged packets could be done via ip link add link ... >>>> ingress-qos-map QOS-MAP. How the default priority would be set is unclear to me. >>>> >>> >>> Technically, configuring a match-all rxnfc rule with ethtool would >>> count as 'default priority' - I have proposed that before. Now I'm not >>> entirely sure how intuitive it is, but I'm also interested in being >>> able to configure this. >> >> That does not sound too crazy from my perspective. >> >>> >>>> The other part of the problem seems to be that the CPU port has no network device >>>> representation in Linux, so there's no interface to configure the egress limits via tc. >>>> This has been discussed before, but it seems there hasn't been any consensous regarding how >>>> we want to proceed? >> >> You have the DSA master network device which is on the other side of the >> switch, > > You mean the (in my case) i.MX FEC? Everything I do on this device ends > up in the FEC itself and not on the switch, right? Yes, we have a way to overload specific netdevice_ops and ethtool_ops operations in order to use the i.MX FEC network device as configuration entry point, say eth0, but have it operate on the switch, because when the DSA switch got attached to the DSA master, we replaced some of that network device's operations with ones that piggy back into the switch. See net/dsa/master.c for details.
Adding TC maintainers... (we're trying to find a mainline-acceptable way to configure and offload strict port-based priorities in the context of DSA/switchdev). On Thu, 2019-09-19 at 10:12 -0700, Florian Fainelli wrote: > On 9/19/19 1:44 AM, Sascha Hauer wrote: > > On Wed, Sep 18, 2019 at 10:41:58AM -0700, Florian Fainelli wrote: > > > On 9/18/19 7:36 AM, Vladimir Oltean wrote: > > > > On Wed, 18 Sep 2019 at 17:03, Sascha Hauer <s.hauer@pengutronix.de> wrote: > > > > > The other part of the problem seems to be that the CPU port > > > > > has no network device representation in Linux, so there's no > > > > > interface to configure the egress limits via tc. > > > > > This has been discussed before, but it seems there hasn't > > > > > been any consensous regarding how we want to proceed? > > > > > > You have the DSA master network device which is on the other side of the > > > switch, > > > > You mean the (in my case) i.MX FEC? Everything I do on this device ends > > up in the FEC itself and not on the switch, right? > > Yes, we have a way to overload specific netdevice_ops and ethtool_ops > operations in order to use the i.MX FEC network device as configuration > entry point, say eth0, but have it operate on the switch, because when > the DSA switch got attached to the DSA master, we replaced some of that > network device's operations with ones that piggy back into the switch. > See net/dsa/master.c for details. Currently, it overrides for ethtool: ops->get_sset_count = dsa_master_get_sset_count; ops->get_ethtool_stats = dsa_master_get_ethtool_stats; ops->get_strings = dsa_master_get_strings; ops->get_ethtool_phy_stats = dsa_master_get_ethtool_phy_stats; for ndo: ops->ndo_get_phys_port_name = dsa_master_get_phys_port_name; In dsa/slave.c we have for ndo: .ndo_setup_tc = dsa_slave_setup_tc, So we would override ndo_setup_tc from dsa as well, maybe falling back to the original ndo_setup_tc provided by the ethernet driver if we the switch HW cannot handle a TC configuration? That would allow us to configure and offload a CPU-port-specific TC policy under the control of DSA. Is this interface reasonable? Im not sure if the existing TC filters and qdiscs can match on bridge- specific information (like the ingress port) yet, so this might require extensions to TC filters as well... Regards, Jan
On 9/23/2019 5:56 AM, Jan Lübbe wrote: > Adding TC maintainers... (we're trying to find a mainline-acceptable > way to configure and offload strict port-based priorities in the > context of DSA/switchdev). > > On Thu, 2019-09-19 at 10:12 -0700, Florian Fainelli wrote: >> On 9/19/19 1:44 AM, Sascha Hauer wrote: >>> On Wed, Sep 18, 2019 at 10:41:58AM -0700, Florian Fainelli wrote: >>>> On 9/18/19 7:36 AM, Vladimir Oltean wrote: >>>>> On Wed, 18 Sep 2019 at 17:03, Sascha Hauer <s.hauer@pengutronix.de> wrote: >>>>>> The other part of the problem seems to be that the CPU port >>>>>> has no network device representation in Linux, so there's no >>>>>> interface to configure the egress limits via tc. >>>>>> This has been discussed before, but it seems there hasn't >>>>>> been any consensous regarding how we want to proceed? >>>> >>>> You have the DSA master network device which is on the other side of the >>>> switch, >>> >>> You mean the (in my case) i.MX FEC? Everything I do on this device ends >>> up in the FEC itself and not on the switch, right? >> >> Yes, we have a way to overload specific netdevice_ops and ethtool_ops >> operations in order to use the i.MX FEC network device as configuration >> entry point, say eth0, but have it operate on the switch, because when >> the DSA switch got attached to the DSA master, we replaced some of that >> network device's operations with ones that piggy back into the switch. >> See net/dsa/master.c for details. > > Currently, it overrides > for ethtool: > ops->get_sset_count = dsa_master_get_sset_count; > ops->get_ethtool_stats = dsa_master_get_ethtool_stats; > ops->get_strings = dsa_master_get_strings; > ops->get_ethtool_phy_stats = dsa_master_get_ethtool_phy_stats; > for ndo: > ops->ndo_get_phys_port_name = dsa_master_get_phys_port_name; > > In dsa/slave.c we have for ndo: > .ndo_setup_tc = dsa_slave_setup_tc, > > So we would override ndo_setup_tc from dsa as well, maybe falling back > to the original ndo_setup_tc provided by the ethernet driver if we the > switch HW cannot handle a TC configuration? There are two simple cases: - the switch provides a ndo_setup_tc() implementation (not just dsa_slave_setup_tc, the callbacks behind are also provided), but the DSA master does not, you can use the DSA switch implementation - the DSA master provides a ndo_setup_tc() implementation, but the DSA switch does not, then you can use the DSA master implementation If both are provided then you must make a choice of either using one, or the other, typically the using the most capable for the specific use case, or using both. If your interest is in doing egress configuration on the CPU port, then maybe using the bridge master device and somehow linking it to the DSA switch's CPU port might be a better approach. See [1] for how this is done for instance. [1]: https://lists.linuxfoundation.org/pipermail/bridge/2016-November/010112.html > > That would allow us to configure and offload a CPU-port-specific TC > policy under the control of DSA. Is this interface reasonable? > > Im not sure if the existing TC filters and qdiscs can match on bridge- > specific information (like the ingress port) yet, so this might require > extensions to TC filters as well... bridge ports are net_device instances, so as long as this paradigm is maintained, it should work.
diff --git a/drivers/net/dsa/mv88e6xxx/chip.c b/drivers/net/dsa/mv88e6xxx/chip.c index d0a97eb73a37..2a15cf259d04 100644 --- a/drivers/net/dsa/mv88e6xxx/chip.c +++ b/drivers/net/dsa/mv88e6xxx/chip.c @@ -2090,7 +2090,9 @@ static int mv88e6xxx_setup_port(struct mv88e6xxx_chip *chip, int port) { struct dsa_switch *ds = chip->ds; int err; + u16 addr; u16 reg; + u16 val; chip->ports[port].chip = chip; chip->ports[port].port = port; @@ -2246,7 +2248,57 @@ static int mv88e6xxx_setup_port(struct mv88e6xxx_chip *chip, int port) /* Default VLAN ID and priority: don't set a default VLAN * ID, and set the default packet priority to zero. */ - return mv88e6xxx_port_write(chip, port, MV88E6XXX_PORT_DEFAULT_VLAN, 0); + err = mv88e6xxx_port_write(chip, port, MV88E6XXX_PORT_DEFAULT_VLAN, 0); + if (err) + return err; + +#define SWITCH_CPU_PORT 5 +#define SWITCH_ETHERCAT_PORT 3 + + /* set the egress rate */ + switch (port) { + case SWITCH_CPU_PORT: + err = mv88e6xxx_port_set_egress_rate(chip, port, + MV88E6XXX_PORT_EGRESS_RATE_CTL2_COUNT_MODE_FRAME, 30000); + break; + default: + err = mv88e6xxx_port_set_egress_rate(chip, port, + MV88E6XXX_PORT_EGRESS_RATE_CTL2_COUNT_MODE_FRAME, 0); + break; + } + + if (err) + return err; + + /* set the output queue usage */ + switch (port) { + case SWITCH_CPU_PORT: + err = mv88e6xxx_port_set_output_queue_schedule(chip, port, + MV88E6XXX_PORT_EGRESS_RATE_CTL2_SCHEDULE_Q3_STRICT); + break; + default: + err = mv88e6xxx_port_set_output_queue_schedule(chip, port, + MV88E6XXX_PORT_EGRESS_RATE_CTL2_SCHEDULE_NONE_STRICT); + break; + } + + if (err) + return err; + + /* set the default QPri */ + switch (port) { + case SWITCH_ETHERCAT_PORT: + err = mv88e6xxx_port_set_default_qpri(chip, port, 3); + break; + default: + err = mv88e6xxx_port_set_default_qpri(chip, port, 2); + break; + } + + if (err) + return err; + + return 0; } static int mv88e6xxx_port_enable(struct dsa_switch *ds, int port, diff --git a/drivers/net/dsa/mv88e6xxx/port.c b/drivers/net/dsa/mv88e6xxx/port.c index 04309ef0a1cc..e03f24308f15 100644 --- a/drivers/net/dsa/mv88e6xxx/port.c +++ b/drivers/net/dsa/mv88e6xxx/port.c @@ -1147,6 +1147,22 @@ int mv88e6165_port_set_jumbo_size(struct mv88e6xxx_chip *chip, int port, return mv88e6xxx_port_write(chip, port, MV88E6XXX_PORT_CTL2, reg); } +int mv88e6xxx_port_set_default_qpri(struct mv88e6xxx_chip *chip, int port, int qpri) +{ + u16 reg; + int err; + + err = mv88e6xxx_port_read(chip, port, MV88E6XXX_PORT_CTL2, ®); + if (err) + return err; + + reg &= ~MV88E6XXX_PORT_CTL2_DEF_QPRI_MASK; + reg |= (qpri << 1) & MV88E6XXX_PORT_CTL2_DEF_QPRI_MASK; + reg |= MV88E6XXX_PORT_CTL2_USE_DEF_QPRI; + + return mv88e6xxx_port_write(chip, port, MV88E6XXX_PORT_CTL2, reg); +} + /* Offset 0x09: Port Rate Control */ int mv88e6095_port_egress_rate_limiting(struct mv88e6xxx_chip *chip, int port) @@ -1161,6 +1177,77 @@ int mv88e6097_port_egress_rate_limiting(struct mv88e6xxx_chip *chip, int port) 0x0001); } +int mv88e6xxx_port_set_output_queue_schedule(struct mv88e6xxx_chip *chip, int port, + u16 schedule) +{ + u16 reg; + int err; + + err = mv88e6xxx_port_read(chip, port, MV88E6XXX_PORT_EGRESS_RATE_CTL2, ®); + if (err) + return err; + + reg &= ~MV88E6XXX_PORT_EGRESS_RATE_CTL2_SCHEDULE_MASK; + reg |= schedule; + + return mv88e6xxx_port_write(chip, port, MV88E6XXX_PORT_EGRESS_RATE_CTL2, reg); +} + +static int _mv88e6xxx_egress_rate_calc_frames(u32 rate, u16 *egress_rate_val) +{ + const volatile u32 scale_factor = (1000 * 1000 * 1000); + volatile u32 u; + + if (rate > 1488000) + return EINVAL; + + if (rate < 7600) + return EINVAL; + + u = 32 * rate; + u = scale_factor / u; /* scale_factor used to convert 32s into 32ns */ + + *egress_rate_val = (u16)u; + + return 0; +} + +int mv88e6xxx_port_set_egress_rate(struct mv88e6xxx_chip *chip, int port, u16 type, + u32 rate) +{ + u16 reg; + int err; + u16 egress_rate_val; + + err = mv88e6xxx_port_read(chip, port, MV88E6XXX_PORT_EGRESS_RATE_CTL2, ®); + if (err) + return err; + + reg &= ~MV88E6XXX_PORT_EGRESS_RATE_CTL2_RATE_MASK; + + if (rate) { + reg &= ~MV88E6XXX_PORT_EGRESS_RATE_CTL2_COUNT_MODE_MASK; + reg |= type; + + switch (type) { + case MV88E6XXX_PORT_EGRESS_RATE_CTL2_COUNT_MODE_FRAME: + err = _mv88e6xxx_egress_rate_calc_frames(rate, &egress_rate_val); + if (err) + return err; + reg |= egress_rate_val & 0x0FFF; + break; + case MV88E6XXX_PORT_EGRESS_RATE_CTL2_COUNT_MODE_L1: + case MV88E6XXX_PORT_EGRESS_RATE_CTL2_COUNT_MODE_L2: + case MV88E6XXX_PORT_EGRESS_RATE_CTL2_COUNT_MODE_L3: + return EINVAL; /* ToDo */ + default: + return EINVAL; + } + } + + return mv88e6xxx_port_write(chip, port, MV88E6XXX_PORT_EGRESS_RATE_CTL2, reg); +} + /* Offset 0x0C: Port ATU Control */ int mv88e6xxx_port_disable_learn_limit(struct mv88e6xxx_chip *chip, int port) diff --git a/drivers/net/dsa/mv88e6xxx/port.h b/drivers/net/dsa/mv88e6xxx/port.h index 8d5a6cd6fb19..cdd057c52ab8 100644 --- a/drivers/net/dsa/mv88e6xxx/port.h +++ b/drivers/net/dsa/mv88e6xxx/port.h @@ -197,6 +197,8 @@ #define MV88E6XXX_PORT_CTL2_DEFAULT_FORWARD 0x0040 #define MV88E6XXX_PORT_CTL2_EGRESS_MONITOR 0x0020 #define MV88E6XXX_PORT_CTL2_INGRESS_MONITOR 0x0010 +#define MV88E6XXX_PORT_CTL2_USE_DEF_QPRI 0x0008 +#define MV88E6XXX_PORT_CTL2_DEF_QPRI_MASK 0x0006 #define MV88E6095_PORT_CTL2_CPU_PORT_MASK 0x000f /* Offset 0x09: Egress Rate Control */ @@ -204,6 +206,17 @@ /* Offset 0x0A: Egress Rate Control 2 */ #define MV88E6XXX_PORT_EGRESS_RATE_CTL2 0x0a +#define MV88E6XXX_PORT_EGRESS_RATE_CTL2_COUNT_MODE_MASK 0xC000 +#define MV88E6XXX_PORT_EGRESS_RATE_CTL2_COUNT_MODE_FRAME 0x0000 +#define MV88E6XXX_PORT_EGRESS_RATE_CTL2_COUNT_MODE_L1 0x4000 +#define MV88E6XXX_PORT_EGRESS_RATE_CTL2_COUNT_MODE_L2 0x8000 +#define MV88E6XXX_PORT_EGRESS_RATE_CTL2_COUNT_MODE_L3 0xC000 +#define MV88E6XXX_PORT_EGRESS_RATE_CTL2_SCHEDULE_MASK 0x3000 +#define MV88E6XXX_PORT_EGRESS_RATE_CTL2_SCHEDULE_NONE_STRICT 0x0000 +#define MV88E6XXX_PORT_EGRESS_RATE_CTL2_SCHEDULE_Q3_STRICT 0x1000 +#define MV88E6XXX_PORT_EGRESS_RATE_CTL2_SCHEDULE_Q3_Q2_STRICT 0x2000 +#define MV88E6XXX_PORT_EGRESS_RATE_CTL2_SCHEDULE_ALL_STRICT 0x3000 +#define MV88E6XXX_PORT_EGRESS_RATE_CTL2_RATE_MASK 0x0FFF /* Offset 0x0B: Port Association Vector */ #define MV88E6XXX_PORT_ASSOC_VECTOR 0x0b @@ -326,8 +339,14 @@ int mv88e6xxx_port_set_message_port(struct mv88e6xxx_chip *chip, int port, bool message_port); int mv88e6165_port_set_jumbo_size(struct mv88e6xxx_chip *chip, int port, size_t size); +int mv88e6xxx_port_set_default_qpri(struct mv88e6xxx_chip *chip, int port, + int qpri); int mv88e6095_port_egress_rate_limiting(struct mv88e6xxx_chip *chip, int port); int mv88e6097_port_egress_rate_limiting(struct mv88e6xxx_chip *chip, int port); +int mv88e6xxx_port_set_output_queue_schedule(struct mv88e6xxx_chip *chip, int port, + u16 schedule); +int mv88e6xxx_port_set_egress_rate(struct mv88e6xxx_chip *chip, int port, + u16 type, u32 rate); int mv88e6097_port_pause_limit(struct mv88e6xxx_chip *chip, int port, u8 in, u8 out); int mv88e6390_port_pause_limit(struct mv88e6xxx_chip *chip, int port, u8 in,