Message ID | 1470329387-25138-1-git-send-email-sridhar.samudrala@intel.com |
---|---|
State | Changes Requested |
Headers | show |
On 8/4/2016 9:49 AM, Sridhar Samudrala wrote: > Add initial devlink support to set/get the mode of SRIOV switch. > This patch allows the mode to be set to either 'legacy' or 'switchdev', but > doesn't implement any functionality to create vf representors in switchdev > mode. > > With smode support in iproute2 'devlink' utility, switch mode can be set > and get via following commands. > > # devlink dev smode pci/0000:05:00.0 > mode: legacy > # devlink dev set pci/0000:05:00.0 smode switchdev > # devlink dev smode pci/0000:05:00.0 > mode: switchdev > > Signed-off-by: Sridhar Samudrala <sridhar.samudrala@intel.com> Acked-by: Amritha Nambiar <amritha.nambiar@intel.com> > --- > drivers/net/ethernet/intel/Kconfig | 1 + > drivers/net/ethernet/intel/i40e/i40e.h | 3 ++ > drivers/net/ethernet/intel/i40e/i40e_main.c | 84 ++++++++++++++++++++++++++--- > 3 files changed, 80 insertions(+), 8 deletions(-) > > diff --git a/drivers/net/ethernet/intel/Kconfig b/drivers/net/ethernet/intel/Kconfig > index c0e1743..2ede229 100644 > --- a/drivers/net/ethernet/intel/Kconfig > +++ b/drivers/net/ethernet/intel/Kconfig > @@ -215,6 +215,7 @@ config I40E > tristate "Intel(R) Ethernet Controller XL710 Family support" > select PTP_1588_CLOCK > depends on PCI > + depends on MAY_USE_DEVLINK > ---help--- > This driver supports Intel(R) Ethernet Controller XL710 Family of > devices. For more information on how to identify your adapter, go > diff --git a/drivers/net/ethernet/intel/i40e/i40e.h b/drivers/net/ethernet/intel/i40e/i40e.h > index 2a88291..724bd82 100644 > --- a/drivers/net/ethernet/intel/i40e/i40e.h > +++ b/drivers/net/ethernet/intel/i40e/i40e.h > @@ -53,6 +53,8 @@ > #include <linux/clocksource.h> > #include <linux/net_tstamp.h> > #include <linux/ptp_clock_kernel.h> > +#include <net/devlink.h> > + > #include "i40e_type.h" > #include "i40e_prototype.h" > #ifdef I40E_FCOE > @@ -446,6 +448,7 @@ struct i40e_pf { > u32 ioremap_len; > u32 fd_inv; > u16 phy_led_val; > + enum devlink_eswitch_mode eswitch_mode; > }; > > enum i40e_filter_state { > diff --git a/drivers/net/ethernet/intel/i40e/i40e_main.c b/drivers/net/ethernet/intel/i40e/i40e_main.c > index 339d99b..bb44f09 100644 > --- a/drivers/net/ethernet/intel/i40e/i40e_main.c > +++ b/drivers/net/ethernet/intel/i40e/i40e_main.c > @@ -27,6 +27,7 @@ > #include <linux/etherdevice.h> > #include <linux/of_net.h> > #include <linux/pci.h> > +#include <net/devlink.h> > > /* Local includes */ > #include "i40e.h" > @@ -10671,6 +10672,58 @@ static void i40e_get_platform_mac_addr(struct pci_dev *pdev, struct i40e_pf *pf) > } > > /** > + * i40e_devlink_eswitch_mode_get > + * > + * @devlink: pointer to devlink struct > + * @mode: sr-iov switch mode pointer > + * > + * Returns the switch mode of the associated PF in the @mode pointer. > + */ > +static int i40e_devlink_eswitch_mode_get(struct devlink *devlink, u16 *mode) > +{ > + struct i40e_pf *pf = devlink_priv(devlink); > + > + *mode = pf->eswitch_mode; > + > + return 0; > +} > + > +/** > + * i40e_devlink_eswitch_mode_set > + * > + * @devlink: pointer to devlink struct > + * @mode: sr-iov switch mode > + * > + * Set the switch mode of the associated PF. > + * Returns 0 on success and -EOPNOTSUPP on error. > + */ > +static int i40e_devlink_eswitch_mode_set(struct devlink *devlink, u16 mode) > +{ > + struct i40e_pf *pf = devlink_priv(devlink); > + int err = 0; > + > + if (mode == pf->eswitch_mode) > + goto done; > + > + switch (mode) { > + case DEVLINK_ESWITCH_MODE_LEGACY: > + case DEVLINK_ESWITCH_MODE_SWITCHDEV: > + pf->eswitch_mode = mode; > + break; > + default: > + err = -EOPNOTSUPP; > + } > + > +done: > + return err; > +} > + > +static const struct devlink_ops i40e_devlink_ops = { > + .eswitch_mode_get = i40e_devlink_eswitch_mode_get, > + .eswitch_mode_set = i40e_devlink_eswitch_mode_set, > +}; > + > +/** > * i40e_probe - Device initialization routine > * @pdev: PCI device information struct > * @ent: entry in i40e_pci_tbl > @@ -10687,6 +10740,7 @@ static int i40e_probe(struct pci_dev *pdev, const struct pci_device_id *ent) > struct i40e_pf *pf; > struct i40e_hw *hw; > static u16 pfs_found; > + struct devlink *devlink; > u16 wol_nvm_bits; > u16 link_status; > int err; > @@ -10721,16 +10775,21 @@ static int i40e_probe(struct pci_dev *pdev, const struct pci_device_id *ent) > pci_enable_pcie_error_reporting(pdev); > pci_set_master(pdev); > > + devlink = devlink_alloc(&i40e_devlink_ops, sizeof(*pf)); > + if (!devlink) { > + dev_err(&pdev->dev, > + "devlink_alloc failed\n"); > + err = -ENOMEM; > + goto err_devlink_alloc; > + } > + > /* Now that we have a PCI connection, we need to do the > * low level device setup. This is primarily setting up > * the Admin Queue structures and then querying for the > * device's current profile information. > */ > - pf = kzalloc(sizeof(*pf), GFP_KERNEL); > - if (!pf) { > - err = -ENOMEM; > - goto err_pf_alloc; > - } > + pf = devlink_priv(devlink); > + > pf->next_vsi = 0; > pf->pdev = pdev; > set_bit(__I40E_DOWN, &pf->state); > @@ -11047,6 +11106,11 @@ static int i40e_probe(struct pci_dev *pdev, const struct pci_device_id *ent) > } > } > > + pf->eswitch_mode = DEVLINK_ESWITCH_MODE_LEGACY; > + err = devlink_register(devlink, &pdev->dev); > + if (err) > + goto err_devlink_register; > + > #ifdef CONFIG_PCI_IOV > /* prep for VF support */ > if ((pf->flags & I40E_FLAG_SRIOV_ENABLED) && > @@ -11188,6 +11252,8 @@ static int i40e_probe(struct pci_dev *pdev, const struct pci_device_id *ent) > return 0; > > /* Unwind what we've done if something failed in the setup */ > +err_devlink_register: > + i40e_stop_misc_vector(pf); > err_vsis: > set_bit(__I40E_DOWN, &pf->state); > i40e_clear_interrupt_scheme(pf); > @@ -11205,8 +11271,8 @@ err_adminq_setup: > err_pf_reset: > iounmap(hw->hw_addr); > err_ioremap: > - kfree(pf); > -err_pf_alloc: > + devlink_free(devlink); > +err_devlink_alloc: > pci_disable_pcie_error_reporting(pdev); > pci_release_selected_regions(pdev, > pci_select_bars(pdev, IORESOURCE_MEM)); > @@ -11229,9 +11295,11 @@ static void i40e_remove(struct pci_dev *pdev) > { > struct i40e_pf *pf = pci_get_drvdata(pdev); > struct i40e_hw *hw = &pf->hw; > + struct devlink *devlink = priv_to_devlink(pf); > i40e_status ret_code; > int i; > > + devlink_unregister(devlink); > i40e_dbg_pf_exit(pf); > > i40e_ptp_stop(pf); > @@ -11319,7 +11387,7 @@ static void i40e_remove(struct pci_dev *pdev) > kfree(pf->vsi); > > iounmap(hw->hw_addr); > - kfree(pf); > + devlink_free(devlink); > pci_release_selected_regions(pdev, > pci_select_bars(pdev, IORESOURCE_MEM)); >
On Thu, Aug 4, 2016 at 9:49 AM, Sridhar Samudrala <sridhar.samudrala@intel.com> wrote: > Add initial devlink support to set/get the mode of SRIOV switch. > This patch allows the mode to be set to either 'legacy' or 'switchdev', but > doesn't implement any functionality to create vf representors in switchdev > mode. > > With smode support in iproute2 'devlink' utility, switch mode can be set > and get via following commands. > > # devlink dev smode pci/0000:05:00.0 > mode: legacy > # devlink dev set pci/0000:05:00.0 smode switchdev > # devlink dev smode pci/0000:05:00.0 > mode: switchdev > > Signed-off-by: Sridhar Samudrala <sridhar.samudrala@intel.com> I really don't see much value in this patch. If you are going to support SwitchDev then just do it. Otherwise you are adding extra overhead for maintaining two different modes. I would recommend putting this series out to netdev as an RFC. Submitting it to intel-wired-lan is kind of pointless as the audience it to small to get any valuable review. - Alex
On 16-08-10 09:01 AM, Alexander Duyck wrote: > On Thu, Aug 4, 2016 at 9:49 AM, Sridhar Samudrala > <sridhar.samudrala@intel.com> wrote: >> Add initial devlink support to set/get the mode of SRIOV switch. >> This patch allows the mode to be set to either 'legacy' or 'switchdev', but >> doesn't implement any functionality to create vf representors in switchdev >> mode. >> >> With smode support in iproute2 'devlink' utility, switch mode can be set >> and get via following commands. >> >> # devlink dev smode pci/0000:05:00.0 >> mode: legacy >> # devlink dev set pci/0000:05:00.0 smode switchdev >> # devlink dev smode pci/0000:05:00.0 >> mode: switchdev >> >> Signed-off-by: Sridhar Samudrala <sridhar.samudrala@intel.com> > > I really don't see much value in this patch. If you are going to > support SwitchDev then just do it. Otherwise you are adding extra > overhead for maintaining two different modes. > > I would recommend putting this series out to netdev as an RFC. > Submitting it to intel-wired-lan is kind of pointless as the audience > it to small to get any valuable review. > > - Alex I argued at length about this already. Jiri and company wanted this flag to push device in and out of this mode. Here we are just following the already upstreamed and debated decision. This is less about switchdev and more about generating VF netdevs to use with ip tools and friends. Another option would be to just always enable VF netdevs and have no legacy mode at all. I think that would be fine it just depends on if you think having extra netdevs around will confuse the stack at all. It might create a few corner cases but one reasonable thing to do would be to just fix those cases as they appear. Thanks, John
On Wed, Aug 10, 2016 at 9:50 AM, John Fastabend <john.fastabend@gmail.com> wrote: > On 16-08-10 09:01 AM, Alexander Duyck wrote: >> On Thu, Aug 4, 2016 at 9:49 AM, Sridhar Samudrala >> <sridhar.samudrala@intel.com> wrote: >>> Add initial devlink support to set/get the mode of SRIOV switch. >>> This patch allows the mode to be set to either 'legacy' or 'switchdev', but >>> doesn't implement any functionality to create vf representors in switchdev >>> mode. >>> >>> With smode support in iproute2 'devlink' utility, switch mode can be set >>> and get via following commands. >>> >>> # devlink dev smode pci/0000:05:00.0 >>> mode: legacy >>> # devlink dev set pci/0000:05:00.0 smode switchdev >>> # devlink dev smode pci/0000:05:00.0 >>> mode: switchdev >>> >>> Signed-off-by: Sridhar Samudrala <sridhar.samudrala@intel.com> >> >> I really don't see much value in this patch. If you are going to >> support SwitchDev then just do it. Otherwise you are adding extra >> overhead for maintaining two different modes. >> >> I would recommend putting this series out to netdev as an RFC. >> Submitting it to intel-wired-lan is kind of pointless as the audience >> it to small to get any valuable review. >> >> - Alex > > I argued at length about this already. Jiri and company wanted this flag > to push device in and out of this mode. Here we are just following the > already upstreamed and debated decision. Yeah, I started doing more research after reviewing this patch. I see the idea behind it. I think the issue if anything is that it seems like things are a bit backwards. We probably should be enabling the SwitchDev bits first and then working on adding devlink knobs to disable things later. > This is less about switchdev and more about generating VF netdevs to > use with ip tools and friends. Right. One of the issues I have with this patch set is that it seems to get things backwards. They are making VFs appear that don't do much of anything and then trying to bolt on features after the fact. We probably need to focus on enabling the VF representation, and then providing the ability to switch them on and off. Also I would argue that we should actually be enabling switch features such as FDB entries instead of trying to bolt on stuff like flow director which doesn't really apply to very many switches and isn't as likely to be used on a switch port. > Another option would be to just always enable VF netdevs and have no > legacy mode at all. I think that would be fine it just depends on if > you think having extra netdevs around will confuse the stack at all. > It might create a few corner cases but one reasonable thing to do > would be to just fix those cases as they appear. I'd say we are better off starting out with them just enabled and then enabling the option to disable them after the fact. If we are going to have this extra code floating around we should be defaulting it to enabled so that it is more likely to be used. The legacy option should only really be there so we can turn this off if we don't want it. - Alex
On 16-08-10 11:18 AM, Alexander Duyck wrote: > On Wed, Aug 10, 2016 at 9:50 AM, John Fastabend > <john.fastabend@gmail.com> wrote: >> On 16-08-10 09:01 AM, Alexander Duyck wrote: >>> On Thu, Aug 4, 2016 at 9:49 AM, Sridhar Samudrala >>> <sridhar.samudrala@intel.com> wrote: >>>> Add initial devlink support to set/get the mode of SRIOV switch. >>>> This patch allows the mode to be set to either 'legacy' or 'switchdev', but >>>> doesn't implement any functionality to create vf representors in switchdev >>>> mode. >>>> >>>> With smode support in iproute2 'devlink' utility, switch mode can be set >>>> and get via following commands. >>>> >>>> # devlink dev smode pci/0000:05:00.0 >>>> mode: legacy >>>> # devlink dev set pci/0000:05:00.0 smode switchdev >>>> # devlink dev smode pci/0000:05:00.0 >>>> mode: switchdev >>>> >>>> Signed-off-by: Sridhar Samudrala <sridhar.samudrala@intel.com> >>> >>> I really don't see much value in this patch. If you are going to >>> support SwitchDev then just do it. Otherwise you are adding extra >>> overhead for maintaining two different modes. >>> >>> I would recommend putting this series out to netdev as an RFC. >>> Submitting it to intel-wired-lan is kind of pointless as the audience >>> it to small to get any valuable review. >>> >>> - Alex >> >> I argued at length about this already. Jiri and company wanted this flag >> to push device in and out of this mode. Here we are just following the >> already upstreamed and debated decision. > > Yeah, I started doing more research after reviewing this patch. I see > the idea behind it. I think the issue if anything is that it seems > like things are a bit backwards. We probably should be enabling the > SwitchDev bits first and then working on adding devlink knobs to > disable things later. > Sure, although its not clear to me exactly which switchdev bits are useful for an edge NIC like this. Getting switch ids is one thing that will become useful when we enable multiple bridges. Otherwise I don't see what l2 switchdev blocks are useful vs just using the standard ndo op interfaces already in place when working on a device without learning/aging/etc. The one thing that I've never bothered to add is pushing "learned" rules down into the hardware but I'm not convinced for most use cases this is particularly interesting because you _should_ know in a managed system what MAC addresses a VM/container/etc is allowed to use ahead of time via libvirt or other mgmt stack. I haven't tested the VLAN handling though so that needs to be looked at. And l3 switchdev routing may be interesting but its fairly low on my priority list unless someone is really excited about it. >> This is less about switchdev and more about generating VF netdevs to >> use with ip tools and friends. > > Right. One of the issues I have with this patch set is that it seems > to get things backwards. They are making VFs appear that don't do > much of anything and then trying to bolt on features after the fact. > We probably need to focus on enabling the VF representation, and then > providing the ability to switch them on and off. Also I would argue > that we should actually be enabling switch features such as FDB > entries instead of trying to bolt on stuff like flow director which > doesn't really apply to very many switches and isn't as likely to be > used on a switch port. Fair enough. Organizing the patches better seems OK to me. I plan to use the 'tc' offloaded mechanisms not the ethtool flow director interface for virtual switch offloads. > >> Another option would be to just always enable VF netdevs and have no >> legacy mode at all. I think that would be fine it just depends on if >> you think having extra netdevs around will confuse the stack at all. >> It might create a few corner cases but one reasonable thing to do >> would be to just fix those cases as they appear. > > I'd say we are better off starting out with them just enabled and then > enabling the option to disable them after the fact. If we are going > to have this extra code floating around we should be defaulting it to > enabled so that it is more likely to be used. The legacy option > should only really be there so we can turn this off if we don't want > it. > Works for me. > - Alex >
On Thu, 2016-08-04 at 18:49 +0200, Sridhar Samudrala wrote: > > With smode support in iproute2 'devlink' utility, switch mode can be set > and get via following commands. > > # devlink dev smode pci/0000:05:00.0 > mode: legacy > # devlink dev set pci/0000:05:00.0 smode switchdev > # devlink dev smode pci/0000:05:00.0 > mode: switchdev > > Signed-off-by: Sridhar Samudrala <sridhar.samudrala@intel.com> > --- > drivers/net/ethernet/intel/Kconfig | 1 + > drivers/net/ethernet/intel/i40e/i40e.h | 3 ++ > drivers/net/ethernet/intel/i40e/i40e_main.c | 84 > ++++++++++++++++++++++++++--- > 3 files changed, 80 insertions(+), 8 deletions(-) Since there was no title patch to this series, I am using patch 1 of the series to address the whole series. Since Alex has brought up some good concerns/comments that should be addressed (and the fact that this series does not apply cleanly to my current tree), I am dropping this series. I will await a v2 where Alex's comments are addressed and the series is rebased against my current dev- queue branch.
On 8/10/2016 11:41 AM, John Fastabend wrote: > On 16-08-10 11:18 AM, Alexander Duyck wrote: >> On Wed, Aug 10, 2016 at 9:50 AM, John Fastabend >> <john.fastabend@gmail.com> wrote: >>> On 16-08-10 09:01 AM, Alexander Duyck wrote: >>>> On Thu, Aug 4, 2016 at 9:49 AM, Sridhar Samudrala >>>> <sridhar.samudrala@intel.com> wrote: >>>>> Add initial devlink support to set/get the mode of SRIOV switch. >>>>> This patch allows the mode to be set to either 'legacy' or 'switchdev', but >>>>> doesn't implement any functionality to create vf representors in switchdev >>>>> mode. >>>>> >>>>> With smode support in iproute2 'devlink' utility, switch mode can be set >>>>> and get via following commands. >>>>> >>>>> # devlink dev smode pci/0000:05:00.0 >>>>> mode: legacy >>>>> # devlink dev set pci/0000:05:00.0 smode switchdev >>>>> # devlink dev smode pci/0000:05:00.0 >>>>> mode: switchdev >>>>> >>>>> Signed-off-by: Sridhar Samudrala <sridhar.samudrala@intel.com> >>>> I really don't see much value in this patch. If you are going to >>>> support SwitchDev then just do it. Otherwise you are adding extra >>>> overhead for maintaining two different modes. >>>> >>>> I would recommend putting this series out to netdev as an RFC. >>>> Submitting it to intel-wired-lan is kind of pointless as the audience >>>> it to small to get any valuable review. As these patches have only changes to i40e driver, i didn't include netdev. Will do so when i submit v2. >>>> >>>> - Alex >>> I argued at length about this already. Jiri and company wanted this flag >>> to push device in and out of this mode. Here we are just following the >>> already upstreamed and debated decision. >> Yeah, I started doing more research after reviewing this patch. I see >> the idea behind it. I think the issue if anything is that it seems >> like things are a bit backwards. We probably should be enabling the >> SwitchDev bits first and then working on adding devlink knobs to >> disable things later. >> > Sure, although its not clear to me exactly which switchdev bits are > useful for an edge NIC like this. Getting switch ids is one thing > that will become useful when we enable multiple bridges. This patchset is not really adding any switchdev ops. We are calling the mode in which the PF driver creates VF representors as SWITCHDEV mode (this is based on the netdev discussion of mellanox patches). Amritha has a patch to add switchdev ops to VF representor netdevs that enables returning switch id via switchdev_port_attr_get() op. > > Otherwise I don't see what l2 switchdev blocks are useful vs > just using the standard ndo op interfaces already in place when > working on a device without learning/aging/etc. The one thing > that I've never bothered to add is pushing "learned" rules down > into the hardware but I'm not convinced for most use cases this > is particularly interesting because you _should_ know in a managed > system what MAC addresses a VM/container/etc is allowed to use > ahead of time via libvirt or other mgmt stack. I haven't tested > the VLAN handling though so that needs to be looked at. > > And l3 switchdev routing may be interesting but its fairly > low on my priority list unless someone is really excited about it. > >>> This is less about switchdev and more about generating VF netdevs to >>> use with ip tools and friends. >> Right. One of the issues I have with this patch set is that it seems >> to get things backwards. They are making VFs appear that don't do >> much of anything and then trying to bolt on features after the fact. >> We probably need to focus on enabling the VF representation, and then >> providing the ability to switch them on and off. Also I would argue >> that we should actually be enabling switch features such as FDB >> entries instead of trying to bolt on stuff like flow director which >> doesn't really apply to very many switches and isn't as likely to be >> used on a switch port. > Fair enough. Organizing the patches better seems OK to me. I plan to > use the 'tc' offloaded mechanisms not the ethtool flow director > interface for virtual switch offloads. > >>> Another option would be to just always enable VF netdevs and have no >>> legacy mode at all. I think that would be fine it just depends on if >>> you think having extra netdevs around will confuse the stack at all. >>> It might create a few corner cases but one reasonable thing to do >>> would be to just fix those cases as they appear. >> I'd say we are better off starting out with them just enabled and then >> enabling the option to disable them after the fact. If we are going >> to have this extra code floating around we should be defaulting it to >> enabled so that it is more likely to be used. The legacy option >> should only really be there so we can turn this off if we don't want >> it. >> > Works for me. > OK. If we all agree that 'creating VF netdevs' by default is the right way to go, I will rearrange the patchset in the following order. - Introduce VF representor netdevs (create VF netdevs by default when sr-iov VFs are enabled) - enable ethtool stats on VF representors - enable ntuple filters on VF representors - introduce devlink to enable the switch mode to be changed to 'legacy' (no VF netdevs) Thanks Sridhar
On Tue, Aug 16, 2016 at 11:54 AM, Samudrala, Sridhar <sridhar.samudrala@intel.com> wrote: > On 8/10/2016 11:41 AM, John Fastabend wrote: >> >> On 16-08-10 11:18 AM, Alexander Duyck wrote: >>> >>> On Wed, Aug 10, 2016 at 9:50 AM, John Fastabend >>> <john.fastabend@gmail.com> wrote: >>>> >>>> On 16-08-10 09:01 AM, Alexander Duyck wrote: >>>>> >>>>> On Thu, Aug 4, 2016 at 9:49 AM, Sridhar Samudrala >>>>> <sridhar.samudrala@intel.com> wrote: >>>>>> >>>>>> Add initial devlink support to set/get the mode of SRIOV switch. >>>>>> This patch allows the mode to be set to either 'legacy' or >>>>>> 'switchdev', but >>>>>> doesn't implement any functionality to create vf representors in >>>>>> switchdev >>>>>> mode. >>>>>> >>>>>> With smode support in iproute2 'devlink' utility, switch mode can be >>>>>> set >>>>>> and get via following commands. >>>>>> >>>>>> # devlink dev smode pci/0000:05:00.0 >>>>>> mode: legacy >>>>>> # devlink dev set pci/0000:05:00.0 smode switchdev >>>>>> # devlink dev smode pci/0000:05:00.0 >>>>>> mode: switchdev >>>>>> >>>>>> Signed-off-by: Sridhar Samudrala <sridhar.samudrala@intel.com> >>>>> >>>>> I really don't see much value in this patch. If you are going to >>>>> support SwitchDev then just do it. Otherwise you are adding extra >>>>> overhead for maintaining two different modes. >>>>> >>>>> I would recommend putting this series out to netdev as an RFC. >>>>> Submitting it to intel-wired-lan is kind of pointless as the audience >>>>> it to small to get any valuable review. > > > As these patches have only changes to i40e driver, i didn't include netdev. > Will do so when > i submit v2. > >>>>> >>>>> - Alex >>>> >>>> I argued at length about this already. Jiri and company wanted this flag >>>> to push device in and out of this mode. Here we are just following the >>>> already upstreamed and debated decision. >>> >>> Yeah, I started doing more research after reviewing this patch. I see >>> the idea behind it. I think the issue if anything is that it seems >>> like things are a bit backwards. We probably should be enabling the >>> SwitchDev bits first and then working on adding devlink knobs to >>> disable things later. >>> >> Sure, although its not clear to me exactly which switchdev bits are >> useful for an edge NIC like this. Getting switch ids is one thing >> that will become useful when we enable multiple bridges. > > > This patchset is not really adding any switchdev ops. We are calling the > mode in which the > PF driver creates VF representors as SWITCHDEV mode (this is based on the > netdev > discussion of mellanox patches). > > Amritha has a patch to add switchdev ops to VF representor netdevs that > enables returning > switch id via switchdev_port_attr_get() op. > >> >> Otherwise I don't see what l2 switchdev blocks are useful vs >> just using the standard ndo op interfaces already in place when >> working on a device without learning/aging/etc. The one thing >> that I've never bothered to add is pushing "learned" rules down >> into the hardware but I'm not convinced for most use cases this >> is particularly interesting because you _should_ know in a managed >> system what MAC addresses a VM/container/etc is allowed to use >> ahead of time via libvirt or other mgmt stack. I haven't tested >> the VLAN handling though so that needs to be looked at. >> >> And l3 switchdev routing may be interesting but its fairly >> low on my priority list unless someone is really excited about it. >> >>>> This is less about switchdev and more about generating VF netdevs to >>>> use with ip tools and friends. >>> >>> Right. One of the issues I have with this patch set is that it seems >>> to get things backwards. They are making VFs appear that don't do >>> much of anything and then trying to bolt on features after the fact. >>> We probably need to focus on enabling the VF representation, and then >>> providing the ability to switch them on and off. Also I would argue >>> that we should actually be enabling switch features such as FDB >>> entries instead of trying to bolt on stuff like flow director which >>> doesn't really apply to very many switches and isn't as likely to be >>> used on a switch port. >> >> Fair enough. Organizing the patches better seems OK to me. I plan to >> use the 'tc' offloaded mechanisms not the ethtool flow director >> interface for virtual switch offloads. >> >>>> Another option would be to just always enable VF netdevs and have no >>>> legacy mode at all. I think that would be fine it just depends on if >>>> you think having extra netdevs around will confuse the stack at all. >>>> It might create a few corner cases but one reasonable thing to do >>>> would be to just fix those cases as they appear. >>> >>> I'd say we are better off starting out with them just enabled and then >>> enabling the option to disable them after the fact. If we are going >>> to have this extra code floating around we should be defaulting it to >>> enabled so that it is more likely to be used. The legacy option >>> should only really be there so we can turn this off if we don't want >>> it. >>> >> Works for me. >> > > OK. If we all agree that 'creating VF netdevs' by default is the right way > to go, I will rearrange the > patchset in the following order. > - Introduce VF representor netdevs (create VF netdevs by default when > sr-iov VFs are enabled) > - enable ethtool stats on VF representors > - enable ntuple filters on VF representors No NTUPLE filters. This is not the way the community will want to do this, and this is not how I want to do this. The easy way to think of this is that NTUPLE is just an alias for Rx network flow classification. Now where are the VF representors supposed to receive traffic from? It is the Tx path of the VF. So by putting NTUPLE filters on the VF representors you are implying you can get in the Tx path for the VF for all traffic, and last I knew that was not the case. As such it makes no sense to do it this way. > - introduce devlink to enable the switch mode to be changed to 'legacy' > (no VF netdevs) > > Thanks > Sridhar You have to think of the VF representors as switch ports. They don't actually represent the VFs. So if you think about it a switch port will only have a few real features. You can control the VLAN membership, tagged/untagged, what FDB entries are assigned to the port, the link speed, if the link is up/down, etc. So some of your first steps might be to focus on the L1 and L2 type items. So for example you might see if we can do things like control the upper limit of the MTU for the VF by controlling the MTU for the VF representor. See if you can at least make it so that the MAC and VLAN values for the VF show up as FDB entries for the VF representor and so forth. - Alex
On 8/16/2016 1:39 PM, Alexander Duyck wrote: > On Tue, Aug 16, 2016 at 11:54 AM, Samudrala, Sridhar > <sridhar.samudrala@intel.com> wrote: >> On 8/10/2016 11:41 AM, John Fastabend wrote: >>> On 16-08-10 11:18 AM, Alexander Duyck wrote: >>>> On Wed, Aug 10, 2016 at 9:50 AM, John Fastabend >>>> <john.fastabend@gmail.com> wrote: >>>>> On 16-08-10 09:01 AM, Alexander Duyck wrote: >>>>>> On Thu, Aug 4, 2016 at 9:49 AM, Sridhar Samudrala >>>>>> <sridhar.samudrala@intel.com> wrote: >>>>>>> Add initial devlink support to set/get the mode of SRIOV switch. >>>>>>> This patch allows the mode to be set to either 'legacy' or >>>>>>> 'switchdev', but >>>>>>> doesn't implement any functionality to create vf representors in >>>>>>> switchdev >>>>>>> mode. >>>>>>> >>>>>>> With smode support in iproute2 'devlink' utility, switch mode can be >>>>>>> set >>>>>>> and get via following commands. >>>>>>> >>>>>>> # devlink dev smode pci/0000:05:00.0 >>>>>>> mode: legacy >>>>>>> # devlink dev set pci/0000:05:00.0 smode switchdev >>>>>>> # devlink dev smode pci/0000:05:00.0 >>>>>>> mode: switchdev >>>>>>> >>>>>>> Signed-off-by: Sridhar Samudrala <sridhar.samudrala@intel.com> >>>>>> I really don't see much value in this patch. If you are going to >>>>>> support SwitchDev then just do it. Otherwise you are adding extra >>>>>> overhead for maintaining two different modes. >>>>>> >>>>>> I would recommend putting this series out to netdev as an RFC. >>>>>> Submitting it to intel-wired-lan is kind of pointless as the audience >>>>>> it to small to get any valuable review. >> >> As these patches have only changes to i40e driver, i didn't include netdev. >> Will do so when >> i submit v2. >> >>>>>> - Alex >>>>> I argued at length about this already. Jiri and company wanted this flag >>>>> to push device in and out of this mode. Here we are just following the >>>>> already upstreamed and debated decision. >>>> Yeah, I started doing more research after reviewing this patch. I see >>>> the idea behind it. I think the issue if anything is that it seems >>>> like things are a bit backwards. We probably should be enabling the >>>> SwitchDev bits first and then working on adding devlink knobs to >>>> disable things later. >>>> >>> Sure, although its not clear to me exactly which switchdev bits are >>> useful for an edge NIC like this. Getting switch ids is one thing >>> that will become useful when we enable multiple bridges. >> >> This patchset is not really adding any switchdev ops. We are calling the >> mode in which the >> PF driver creates VF representors as SWITCHDEV mode (this is based on the >> netdev >> discussion of mellanox patches). >> >> Amritha has a patch to add switchdev ops to VF representor netdevs that >> enables returning >> switch id via switchdev_port_attr_get() op. >> >>> Otherwise I don't see what l2 switchdev blocks are useful vs >>> just using the standard ndo op interfaces already in place when >>> working on a device without learning/aging/etc. The one thing >>> that I've never bothered to add is pushing "learned" rules down >>> into the hardware but I'm not convinced for most use cases this >>> is particularly interesting because you _should_ know in a managed >>> system what MAC addresses a VM/container/etc is allowed to use >>> ahead of time via libvirt or other mgmt stack. I haven't tested >>> the VLAN handling though so that needs to be looked at. >>> >>> And l3 switchdev routing may be interesting but its fairly >>> low on my priority list unless someone is really excited about it. >>> >>>>> This is less about switchdev and more about generating VF netdevs to >>>>> use with ip tools and friends. >>>> Right. One of the issues I have with this patch set is that it seems >>>> to get things backwards. They are making VFs appear that don't do >>>> much of anything and then trying to bolt on features after the fact. >>>> We probably need to focus on enabling the VF representation, and then >>>> providing the ability to switch them on and off. Also I would argue >>>> that we should actually be enabling switch features such as FDB >>>> entries instead of trying to bolt on stuff like flow director which >>>> doesn't really apply to very many switches and isn't as likely to be >>>> used on a switch port. >>> Fair enough. Organizing the patches better seems OK to me. I plan to >>> use the 'tc' offloaded mechanisms not the ethtool flow director >>> interface for virtual switch offloads. >>> >>>>> Another option would be to just always enable VF netdevs and have no >>>>> legacy mode at all. I think that would be fine it just depends on if >>>>> you think having extra netdevs around will confuse the stack at all. >>>>> It might create a few corner cases but one reasonable thing to do >>>>> would be to just fix those cases as they appear. >>>> I'd say we are better off starting out with them just enabled and then >>>> enabling the option to disable them after the fact. If we are going >>>> to have this extra code floating around we should be defaulting it to >>>> enabled so that it is more likely to be used. The legacy option >>>> should only really be there so we can turn this off if we don't want >>>> it. >>>> >>> Works for me. >>> >> OK. If we all agree that 'creating VF netdevs' by default is the right way >> to go, I will rearrange the >> patchset in the following order. >> - Introduce VF representor netdevs (create VF netdevs by default when >> sr-iov VFs are enabled) >> - enable ethtool stats on VF representors >> - enable ntuple filters on VF representors > No NTUPLE filters. This is not the way the community will want to do > this, and this is not how I want to do this. > > The easy way to think of this is that NTUPLE is just an alias for Rx > network flow classification. Now where are the VF representors > supposed to receive traffic from? It is the Tx path of the VF. So by > putting NTUPLE filters on the VF representors you are implying you > can get in the Tx path for the VF for all traffic, and last I knew > that was not the case. As such it makes no sense to do it this way. The idea behind NTUPLE filters on VF representors is to match RX traffic that is destined for the corresponding VF. Basically when adding the flow director filter, we are setting the dest vsi as the VSI of the corresponding VF. Without VF representors, this is currently done by overloading user-def value with VF id. > >> - introduce devlink to enable the switch mode to be changed to 'legacy' >> (no VF netdevs) >> >> Thanks >> Sridhar > You have to think of the VF representors as switch ports. They don't > actually represent the VFs. So if you think about it a switch port > will only have a few real features. You can control the VLAN > membership, tagged/untagged, what FDB entries are assigned to the > port, the link speed, if the link is up/down, etc. So some of your > first steps might be to focus on the L1 and L2 type items. So for > example you might see if we can do things like control the upper limit > of the MTU for the VF by controlling the MTU for the VF representor. > See if you can at least make it so that the MAC and VLAN values for > the VF show up as FDB entries for the VF representor and so forth. Sure. These are additional items that we can look into that can be enabled via VF representors. > > - Alex
On Tue, Aug 16, 2016 at 2:51 PM, Samudrala, Sridhar <sridhar.samudrala@intel.com> wrote: > The idea behind NTUPLE filters on VF representors is to match RX traffic > that is destined > for the corresponding VF. Basically when adding the flow director filter, we > are setting the > dest vsi as the VSI of the corresponding VF. > Without VF representors, this is currently done by overloading user-def > value with VF id. > Yes, and for now that is preferable to what you are doing since it makes no sense. I will try to clarify my point. First, instead of calling these VF representors lets call them VF port representor, or VFPR for short (trying to avoid VFR because it sounds too much like VRF which is completely unrelated). So the whole point of a VFPR is to present the VF as a port on your logical switch. You should mostly only be capable of doing on it what you would do with an actual switch. So for example if I have a switch connected to NIC. Now most switches have certain options for controlling things. For example I can force the switch port onto a given VLAN so that the port is isolated from other traffic. I can control the max frame size allowed to be sent or received over the port. I can block what traffic the port is allowed to send by restricting MAC addresses via ACLs, and I can manually program the forwarding table via FDB entries. The problem is Flow Director also know as NTUPLE or Rx NFC only operates on the Rx side of NICs. In your VFPR the Rx side represents packets coming from your VF, not packets going to it. If you transmit on a VFPR the packet should be received only by the VF, if you receive from it the packet should only be from the VF. So really any Flow Director rule you add to the VFPR should not impact the VF, but instead should impact the VFPR and the PF since the PF will add the rule and it should only apply on traffic received on the VFPR from the VF. At some point in the future we can look at adding flow Tx offloads via either TC or SwitchDev. For now adding NTUPLE is a strong no-go. The VFPR are not the place to do it unless you are just wanting to add custom rules to the PF itself that won't impact the VF. - Alex
On 16-08-16 05:17 PM, Alexander Duyck wrote: > On Tue, Aug 16, 2016 at 2:51 PM, Samudrala, Sridhar > <sridhar.samudrala@intel.com> wrote: >> The idea behind NTUPLE filters on VF representors is to match RX traffic >> that is destined >> for the corresponding VF. Basically when adding the flow director filter, we >> are setting the >> dest vsi as the VSI of the corresponding VF. >> Without VF representors, this is currently done by overloading user-def >> value with VF id. >> > > Yes, and for now that is preferable to what you are doing since it > makes no sense. > > I will try to clarify my point. First, instead of calling these VF > representors lets call them VF port representor, or VFPR for short > (trying to avoid VFR because it sounds too much like VRF which is > completely unrelated). So the whole point of a VFPR is to present the > VF as a port on your logical switch. You should mostly only be > capable of doing on it what you would do with an actual switch. So > for example if I have a switch connected to NIC. Now most switches > have certain options for controlling things. For example I can force > the switch port onto a given VLAN so that the port is isolated from > other traffic. I can control the max frame size allowed to be sent or > received over the port. I can block what traffic the port is allowed > to send by restricting MAC addresses via ACLs, and I can manually > program the forwarding table via FDB entries. > > The problem is Flow Director also know as NTUPLE or Rx NFC only > operates on the Rx side of NICs. In your VFPR the Rx side represents > packets coming from your VF, not packets going to it. If you > transmit on a VFPR the packet should be received only by the VF, if > you receive from it the packet should only be from the VF. So really > any Flow Director rule you add to the VFPR should not impact the VF, > but instead should impact the VFPR and the PF since the PF will add > the rule and it should only apply on traffic received on the VFPR from > the VF. > > At some point in the future we can look at adding flow Tx offloads via > either TC or SwitchDev. For now adding NTUPLE is a strong no-go. The > VFPR are not the place to do it unless you are just wanting to add > custom rules to the PF itself that won't impact the VF. > > - Alex > FWIW the features I want from the VFPR (nice acronym :) are the following, - xmit (just send it over the PF queues is fine) - recv : ideally the driver does this with descriptor magic but the next best thing would be to do this in the driver with a lookup on mac when anti-spoofing is on. Or worst case if the driver logic is too ugly I'll hack it in userspace :( - up/down link state - statistics - not essential but mtu would also be nice I think thats it to get my basic stuff working in userspace. Another nice piece would be the fdb_add offloads so we can get a basic l2 bridge working and bonus points for including the vlan parts of the fdb_add msg. Thanks, John
diff --git a/drivers/net/ethernet/intel/Kconfig b/drivers/net/ethernet/intel/Kconfig index c0e1743..2ede229 100644 --- a/drivers/net/ethernet/intel/Kconfig +++ b/drivers/net/ethernet/intel/Kconfig @@ -215,6 +215,7 @@ config I40E tristate "Intel(R) Ethernet Controller XL710 Family support" select PTP_1588_CLOCK depends on PCI + depends on MAY_USE_DEVLINK ---help--- This driver supports Intel(R) Ethernet Controller XL710 Family of devices. For more information on how to identify your adapter, go diff --git a/drivers/net/ethernet/intel/i40e/i40e.h b/drivers/net/ethernet/intel/i40e/i40e.h index 2a88291..724bd82 100644 --- a/drivers/net/ethernet/intel/i40e/i40e.h +++ b/drivers/net/ethernet/intel/i40e/i40e.h @@ -53,6 +53,8 @@ #include <linux/clocksource.h> #include <linux/net_tstamp.h> #include <linux/ptp_clock_kernel.h> +#include <net/devlink.h> + #include "i40e_type.h" #include "i40e_prototype.h" #ifdef I40E_FCOE @@ -446,6 +448,7 @@ struct i40e_pf { u32 ioremap_len; u32 fd_inv; u16 phy_led_val; + enum devlink_eswitch_mode eswitch_mode; }; enum i40e_filter_state { diff --git a/drivers/net/ethernet/intel/i40e/i40e_main.c b/drivers/net/ethernet/intel/i40e/i40e_main.c index 339d99b..bb44f09 100644 --- a/drivers/net/ethernet/intel/i40e/i40e_main.c +++ b/drivers/net/ethernet/intel/i40e/i40e_main.c @@ -27,6 +27,7 @@ #include <linux/etherdevice.h> #include <linux/of_net.h> #include <linux/pci.h> +#include <net/devlink.h> /* Local includes */ #include "i40e.h" @@ -10671,6 +10672,58 @@ static void i40e_get_platform_mac_addr(struct pci_dev *pdev, struct i40e_pf *pf) } /** + * i40e_devlink_eswitch_mode_get + * + * @devlink: pointer to devlink struct + * @mode: sr-iov switch mode pointer + * + * Returns the switch mode of the associated PF in the @mode pointer. + */ +static int i40e_devlink_eswitch_mode_get(struct devlink *devlink, u16 *mode) +{ + struct i40e_pf *pf = devlink_priv(devlink); + + *mode = pf->eswitch_mode; + + return 0; +} + +/** + * i40e_devlink_eswitch_mode_set + * + * @devlink: pointer to devlink struct + * @mode: sr-iov switch mode + * + * Set the switch mode of the associated PF. + * Returns 0 on success and -EOPNOTSUPP on error. + */ +static int i40e_devlink_eswitch_mode_set(struct devlink *devlink, u16 mode) +{ + struct i40e_pf *pf = devlink_priv(devlink); + int err = 0; + + if (mode == pf->eswitch_mode) + goto done; + + switch (mode) { + case DEVLINK_ESWITCH_MODE_LEGACY: + case DEVLINK_ESWITCH_MODE_SWITCHDEV: + pf->eswitch_mode = mode; + break; + default: + err = -EOPNOTSUPP; + } + +done: + return err; +} + +static const struct devlink_ops i40e_devlink_ops = { + .eswitch_mode_get = i40e_devlink_eswitch_mode_get, + .eswitch_mode_set = i40e_devlink_eswitch_mode_set, +}; + +/** * i40e_probe - Device initialization routine * @pdev: PCI device information struct * @ent: entry in i40e_pci_tbl @@ -10687,6 +10740,7 @@ static int i40e_probe(struct pci_dev *pdev, const struct pci_device_id *ent) struct i40e_pf *pf; struct i40e_hw *hw; static u16 pfs_found; + struct devlink *devlink; u16 wol_nvm_bits; u16 link_status; int err; @@ -10721,16 +10775,21 @@ static int i40e_probe(struct pci_dev *pdev, const struct pci_device_id *ent) pci_enable_pcie_error_reporting(pdev); pci_set_master(pdev); + devlink = devlink_alloc(&i40e_devlink_ops, sizeof(*pf)); + if (!devlink) { + dev_err(&pdev->dev, + "devlink_alloc failed\n"); + err = -ENOMEM; + goto err_devlink_alloc; + } + /* Now that we have a PCI connection, we need to do the * low level device setup. This is primarily setting up * the Admin Queue structures and then querying for the * device's current profile information. */ - pf = kzalloc(sizeof(*pf), GFP_KERNEL); - if (!pf) { - err = -ENOMEM; - goto err_pf_alloc; - } + pf = devlink_priv(devlink); + pf->next_vsi = 0; pf->pdev = pdev; set_bit(__I40E_DOWN, &pf->state); @@ -11047,6 +11106,11 @@ static int i40e_probe(struct pci_dev *pdev, const struct pci_device_id *ent) } } + pf->eswitch_mode = DEVLINK_ESWITCH_MODE_LEGACY; + err = devlink_register(devlink, &pdev->dev); + if (err) + goto err_devlink_register; + #ifdef CONFIG_PCI_IOV /* prep for VF support */ if ((pf->flags & I40E_FLAG_SRIOV_ENABLED) && @@ -11188,6 +11252,8 @@ static int i40e_probe(struct pci_dev *pdev, const struct pci_device_id *ent) return 0; /* Unwind what we've done if something failed in the setup */ +err_devlink_register: + i40e_stop_misc_vector(pf); err_vsis: set_bit(__I40E_DOWN, &pf->state); i40e_clear_interrupt_scheme(pf); @@ -11205,8 +11271,8 @@ err_adminq_setup: err_pf_reset: iounmap(hw->hw_addr); err_ioremap: - kfree(pf); -err_pf_alloc: + devlink_free(devlink); +err_devlink_alloc: pci_disable_pcie_error_reporting(pdev); pci_release_selected_regions(pdev, pci_select_bars(pdev, IORESOURCE_MEM)); @@ -11229,9 +11295,11 @@ static void i40e_remove(struct pci_dev *pdev) { struct i40e_pf *pf = pci_get_drvdata(pdev); struct i40e_hw *hw = &pf->hw; + struct devlink *devlink = priv_to_devlink(pf); i40e_status ret_code; int i; + devlink_unregister(devlink); i40e_dbg_pf_exit(pf); i40e_ptp_stop(pf); @@ -11319,7 +11387,7 @@ static void i40e_remove(struct pci_dev *pdev) kfree(pf->vsi); iounmap(hw->hw_addr); - kfree(pf); + devlink_free(devlink); pci_release_selected_regions(pdev, pci_select_bars(pdev, IORESOURCE_MEM));
Add initial devlink support to set/get the mode of SRIOV switch. This patch allows the mode to be set to either 'legacy' or 'switchdev', but doesn't implement any functionality to create vf representors in switchdev mode. With smode support in iproute2 'devlink' utility, switch mode can be set and get via following commands. # devlink dev smode pci/0000:05:00.0 mode: legacy # devlink dev set pci/0000:05:00.0 smode switchdev # devlink dev smode pci/0000:05:00.0 mode: switchdev Signed-off-by: Sridhar Samudrala <sridhar.samudrala@intel.com> --- drivers/net/ethernet/intel/Kconfig | 1 + drivers/net/ethernet/intel/i40e/i40e.h | 3 ++ drivers/net/ethernet/intel/i40e/i40e_main.c | 84 ++++++++++++++++++++++++++--- 3 files changed, 80 insertions(+), 8 deletions(-)